/* $NetBSD: fpu.c,v 1.87 2023/07/18 12:34:25 riastradh Exp $ */ /* * Copyright (c) 2008, 2019 The NetBSD Foundation, Inc. All * rights reserved. * * This code is derived from software developed for The NetBSD Foundation * by Andrew Doran and Maxime Villard. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)npx.c 7.2 (Berkeley) 5/12/91 */ /* * Copyright (c) 1994, 1995, 1998 Charles M. Hannum. All rights reserved. * Copyright (c) 1990 William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)npx.c 7.2 (Berkeley) 5/12/91 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: fpu.c,v 1.87 2023/07/18 12:34:25 riastradh Exp $"); #include "opt_ddb.h" #include "opt_multiprocessor.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/conf.h> #include <sys/cpu.h> #include <sys/file.h> #include <sys/proc.h> #include <sys/kernel.h> #include <sys/sysctl.h> #include <sys/xcall.h> #include <machine/cpu.h> #include <machine/cpuvar.h> #include <machine/cputypes.h> #include <machine/intr.h> #include <machine/cpufunc.h> #include <machine/pcb.h> #include <machine/trap.h> #include <machine/specialreg.h> #include <x86/cpu.h> #include <x86/fpu.h> #ifdef DDB #include <ddb/ddb.h> #endif #ifdef XENPV #define clts() HYPERVISOR_fpu_taskswitch(0) #define stts() HYPERVISOR_fpu_taskswitch(1) #endif void fpu_handle_deferred(void); void fpu_switch(struct lwp *, struct lwp *); uint32_t x86_fpu_mxcsr_mask __read_mostly = 0; static inline union savefpu * fpu_lwp_area(struct lwp *l) { struct pcb *pcb = lwp_getpcb(l); union savefpu *area = &pcb->pcb_savefpu; KASSERT((l->l_flag & LW_SYSTEM) == 0); if (l == curlwp) { fpu_save(); } KASSERT(!(l->l_md.md_flags & MDL_FPU_IN_CPU)); return area; } static inline void fpu_save_lwp(struct lwp *l) { struct pcb *pcb = lwp_getpcb(l); union savefpu *area = &pcb->pcb_savefpu; int s; s = splvm(); if (l->l_md.md_flags & MDL_FPU_IN_CPU) { KASSERT((l->l_flag & LW_SYSTEM) == 0); fpu_area_save(area, x86_xsave_features, !(l->l_proc->p_flag & PK_32)); l->l_md.md_flags &= ~MDL_FPU_IN_CPU; } splx(s); } /* * Bring curlwp's FPU state in memory. It will get installed back in the CPU * when returning to userland. */ void fpu_save(void) { fpu_save_lwp(curlwp); } void fpuinit(struct cpu_info *ci) { /* * This might not be strictly necessary since it will be initialized * for each process. However it does no harm. 
*/ clts(); fninit(); stts(); } void fpuinit_mxcsr_mask(void) { #ifndef XENPV union savefpu fpusave __aligned(64); u_long psl; memset(&fpusave, 0, sizeof(fpusave)); /* Disable interrupts, and enable FPU */ psl = x86_read_psl(); x86_disable_intr(); clts(); /* Fill in the FPU area */ fxsave(&fpusave); /* Restore previous state */ stts(); x86_write_psl(psl); if (fpusave.sv_xmm.fx_mxcsr_mask == 0) { x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__; } else { x86_fpu_mxcsr_mask = fpusave.sv_xmm.fx_mxcsr_mask; } #else /* * XXX XXX XXX: On Xen the FXSAVE above faults. That's because * &fpusave is not 16-byte aligned. Stack alignment problem * somewhere, it seems. */ x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__; #endif } static inline void fpu_errata_amd(void) { uint16_t sw; /* * AMD FPUs do not restore FIP, FDP, and FOP on fxrstor and xrstor * when FSW.ES=0, leaking other threads' execution history. * * Clear them manually by loading a zero (fldummy). We do this * unconditionally, regardless of FSW.ES. * * Before that, clear the ES bit in the x87 status word if it is * currently set, in order to avoid causing a fault in the * upcoming load. * * Newer generations of AMD CPUs have CPUID_Fn80000008_EBX[2], * which indicates that FIP/FDP/FOP are restored (same behavior * as Intel). We're not using it though. */ fnstsw(&sw); if (sw & 0x80) fnclex(); fldummy(); } #ifdef __x86_64__ #define XS64(x) (is_64bit ? x##64 : x) #else #define XS64(x) x #endif void fpu_area_save(void *area, uint64_t xsave_features, bool is_64bit) { switch (x86_fpu_save) { case FPU_SAVE_FSAVE: fnsave(area); break; case FPU_SAVE_FXSAVE: XS64(fxsave)(area); break; case FPU_SAVE_XSAVE: XS64(xsave)(area, xsave_features); break; case FPU_SAVE_XSAVEOPT: XS64(xsaveopt)(area, xsave_features); break; } stts(); } void fpu_area_restore(const void *area, uint64_t xsave_features, bool is_64bit) { clts(); switch (x86_fpu_save) { case FPU_SAVE_FSAVE: frstor(area); break; case FPU_SAVE_FXSAVE: if (cpu_vendor == CPUVENDOR_AMD) fpu_errata_amd(); XS64(fxrstor)(area); break; case FPU_SAVE_XSAVE: case FPU_SAVE_XSAVEOPT: if (cpu_vendor == CPUVENDOR_AMD) fpu_errata_amd(); XS64(xrstor)(area, xsave_features); break; } } void fpu_handle_deferred(void) { struct pcb *pcb = lwp_getpcb(curlwp); fpu_area_restore(&pcb->pcb_savefpu, x86_xsave_features, !(curlwp->l_proc->p_flag & PK_32)); } void fpu_switch(struct lwp *oldlwp, struct lwp *newlwp) { struct cpu_info *ci __diagused = curcpu(); struct pcb *pcb; KASSERTMSG(ci->ci_ilevel >= IPL_SCHED, "cpu%d ilevel=%d", cpu_index(ci), ci->ci_ilevel); if (oldlwp->l_md.md_flags & MDL_FPU_IN_CPU) { KASSERT(!(oldlwp->l_flag & LW_SYSTEM)); pcb = lwp_getpcb(oldlwp); fpu_area_save(&pcb->pcb_savefpu, x86_xsave_features, !(oldlwp->l_proc->p_flag & PK_32)); oldlwp->l_md.md_flags &= ~MDL_FPU_IN_CPU; } KASSERT(!(newlwp->l_md.md_flags & MDL_FPU_IN_CPU)); } void fpu_lwp_fork(struct lwp *l1, struct lwp *l2) { struct pcb *pcb2 = lwp_getpcb(l2); union savefpu *fpu_save; /* Kernel threads have no FPU. */ if (__predict_false(l2->l_flag & LW_SYSTEM)) { return; } /* For init(8). 
*/ if (__predict_false(l1->l_flag & LW_SYSTEM)) { memset(&pcb2->pcb_savefpu, 0, x86_fpu_save_size); return; } fpu_save = fpu_lwp_area(l1); memcpy(&pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size); l2->l_md.md_flags &= ~MDL_FPU_IN_CPU; } void fpu_lwp_abandon(struct lwp *l) { int s; KASSERT(l == curlwp); s = splvm(); l->l_md.md_flags &= ~MDL_FPU_IN_CPU; stts(); splx(s); } /* -------------------------------------------------------------------------- */ /* * fpu_kern_enter() * * Begin using the FPU. Raises to splvm, disabling most * interrupts and rendering the thread non-preemptible; caller * should not use this for long periods of time, and must call * fpu_kern_leave() afterward. Non-recursive -- you cannot call * fpu_kern_enter() again without calling fpu_kern_leave() first. * * Must be used only at IPL_VM or below -- never in IPL_SCHED or * IPL_HIGH interrupt handlers. */ void fpu_kern_enter(void) { static const union savefpu safe_fpu __aligned(64) = { .sv_xmm = { .fx_mxcsr = __SAFE_MXCSR__, }, }; struct lwp *l = curlwp; struct cpu_info *ci; int s; s = splvm(); ci = curcpu(); #if 0 /* * Can't assert this because if the caller holds a spin lock at * IPL_VM, and previously held and released a spin lock at * higher IPL, the IPL remains raised above IPL_VM. */ KASSERTMSG(ci->ci_ilevel <= IPL_VM || cold, "ilevel=%d", ci->ci_ilevel); #endif KASSERT(ci->ci_kfpu_spl == -1); ci->ci_kfpu_spl = s; /* * If we are in a softint and have a pinned lwp, the fpu state is that * of the pinned lwp, so save it there. */ while ((l->l_pflag & LP_INTR) && (l->l_switchto != NULL)) l = l->l_switchto; fpu_save_lwp(l); /* * Clear CR0_TS, which fpu_save_lwp set if it saved anything -- * otherwise the CPU will trap if we try to use the FPU under * the false impression that there has been a task switch since * the last FPU usage requiring that we save the FPU state. */ clts(); /* * Zero the FPU registers and install safe control words. */ fpu_area_restore(&safe_fpu, x86_xsave_features, /*is_64bit*/false); } /* * fpu_kern_leave() * * End using the FPU after fpu_kern_enter(). */ void fpu_kern_leave(void) { static const union savefpu zero_fpu __aligned(64); struct cpu_info *ci = curcpu(); int s; #if 0 /* * Can't assert this because if the caller holds a spin lock at * IPL_VM, and previously held and released a spin lock at * higher IPL, the IPL remains raised above IPL_VM. */ KASSERT(ci->ci_ilevel == IPL_VM || cold); #endif KASSERT(ci->ci_kfpu_spl != -1); /* * Zero the fpu registers; otherwise we might leak secrets * through Spectre-class attacks to userland, even if there are * no bugs in fpu state management. */ fpu_area_restore(&zero_fpu, x86_xsave_features, /*is_64bit*/false); /* * Set CR0_TS again so that the kernel can't accidentally use * the FPU. */ stts(); s = ci->ci_kfpu_spl; ci->ci_kfpu_spl = -1; splx(s); } /* -------------------------------------------------------------------------- */ /* * The following table is used to ensure that the FPE_... value * that is passed as a trapcode to the signal handler of the user * process does not have more than one bit set. * * Multiple bits may be set if SSE simd instructions generate errors * on more than one value or if the user process modifies the control * word while a status word bit is already set (which this is a sign * of bad coding). * We have no choice than to narrow them down to one bit, since we must * not send a trapcode that is not exactly one of the FPE_ macros. * * The mechanism has a static table with 127 entries. 
Each combination * of the 7 FPU status word exception bits directly translates to a * position in this table, where a single FPE_... value is stored. * This FPE_... value stored there is considered the "most important" * of the exception bits and will be sent as the signal code. The * precedence of the bits is based upon Intel Document "Numerical * Applications", Chapter "Special Computational Situations". * * The code to choose one of these values does these steps: * 1) Throw away status word bits that cannot be masked. * 2) Throw away the bits currently masked in the control word, * assuming the user isn't interested in them anymore. * 3) Reinsert status word bit 7 (stack fault) if it is set, which * cannot be masked but must be preserved. * 'Stack fault' is a sub-class of 'invalid operation'. * 4) Use the remaining bits to point into the trapcode table. * * The 6 maskable bits in order of their preference, as stated in the * above referenced Intel manual: * 1 Invalid operation (FP_X_INV) * 1a Stack underflow * 1b Stack overflow * 1c Operand of unsupported format * 1d SNaN operand. * 2 QNaN operand (not an exception, irrelevant here) * 3 Any other invalid-operation not mentioned above or zero divide * (FP_X_INV, FP_X_DZ) * 4 Denormal operand (FP_X_DNML) * 5 Numeric over/underflow (FP_X_OFL, FP_X_UFL) * 6 Inexact result (FP_X_IMP) * * NB: the above seems to mix up the mxscr error bits and the x87 ones. * They are in the same order, but there is no EN_SW_STACK_FAULT in the mmx * status. * * The table is nearly, but not quite, in bit order (ZERODIV and DENORM * are swapped). * * This table assumes that any stack fault is cleared - so that an INVOP * fault will only be reported as FLTSUB once. * This might not happen if the mask is being changed. */ #define FPE_xxx1(f) (f & EN_SW_INVOP \ ? (f & EN_SW_STACK_FAULT ? FPE_FLTSUB : FPE_FLTINV) \ : f & EN_SW_ZERODIV ? FPE_FLTDIV \ : f & EN_SW_DENORM ? FPE_FLTUND \ : f & EN_SW_OVERFLOW ? FPE_FLTOVF \ : f & EN_SW_UNDERFLOW ? FPE_FLTUND \ : f & EN_SW_PRECLOSS ? FPE_FLTRES \ : f & EN_SW_STACK_FAULT ? FPE_FLTSUB : 0) #define FPE_xxx2(f) FPE_xxx1(f), FPE_xxx1((f + 1)) #define FPE_xxx4(f) FPE_xxx2(f), FPE_xxx2((f + 2)) #define FPE_xxx8(f) FPE_xxx4(f), FPE_xxx4((f + 4)) #define FPE_xxx16(f) FPE_xxx8(f), FPE_xxx8((f + 8)) #define FPE_xxx32(f) FPE_xxx16(f), FPE_xxx16((f + 16)) static const uint8_t fpetable[128] = { FPE_xxx32(0), FPE_xxx32(32), FPE_xxx32(64), FPE_xxx32(96) }; #undef FPE_xxx1 #undef FPE_xxx2 #undef FPE_xxx4 #undef FPE_xxx8 #undef FPE_xxx16 #undef FPE_xxx32 /* * This is a synchronous trap on either an x87 instruction (due to an unmasked * error on the previous x87 instruction) or on an SSE/SSE2/etc instruction due * to an error on the instruction itself. * * If trap actually generates a signal, then the fpu state is saved and then * copied onto the lwp's user-stack, and then recovered from there when the * signal returns. * * All this code needs to do is save the reason for the trap. For x87 traps the * status word bits need clearing to stop the trap re-occurring. For SSE traps * the mxcsr bits are 'sticky' and need clearing to not confuse a later trap. * * We come here with interrupts disabled. 
*/ void fputrap(struct trapframe *frame) { uint32_t statbits; ksiginfo_t ksi; if (__predict_false(!USERMODE(frame->tf_cs))) { register_t ip = X86_TF_RIP(frame); char where[128]; #ifdef DDB db_symstr(where, sizeof(where), (db_expr_t)ip, DB_STGY_PROC); #else snprintf(where, sizeof(where), "%p", (void *)ip); #endif panic("fpu trap from kernel at %s, trapframe %p\n", where, frame); } KASSERT(curlwp->l_md.md_flags & MDL_FPU_IN_CPU); if (frame->tf_trapno == T_XMM) { uint32_t mxcsr; x86_stmxcsr(&mxcsr); statbits = mxcsr; /* Clear the sticky status bits */ mxcsr &= ~0x3f; x86_ldmxcsr(&mxcsr); /* Remove masked interrupts and non-status bits */ statbits &= ~(statbits >> 7) & 0x3f; /* Mark this is an XMM status */ statbits |= 0x10000; } else { uint16_t cw, sw; /* Get current control and status words */ fnstcw(&cw); fnstsw(&sw); /* Clear any pending exceptions from status word */ fnclex(); /* Remove masked interrupts */ statbits = sw & ~(cw & 0x3f); } /* Doesn't matter now if we get pre-empted */ x86_enable_intr(); KSI_INIT_TRAP(&ksi); ksi.ksi_signo = SIGFPE; ksi.ksi_addr = (void *)X86_TF_RIP(frame); ksi.ksi_code = fpetable[statbits & 0x7f]; ksi.ksi_trap = statbits; (*curlwp->l_proc->p_emul->e_trapsignal)(curlwp, &ksi); } void fpudna(struct trapframe *frame) { panic("fpudna from %s, ip %p, trapframe %p", USERMODE(frame->tf_cs) ? "userland" : "kernel", (void *)X86_TF_RIP(frame), frame); } /* -------------------------------------------------------------------------- */ static inline void fpu_xstate_reload(union savefpu *fpu_save, uint64_t xstate) { /* * Force a reload of the given xstate during the next XRSTOR. */ if (x86_fpu_save >= FPU_SAVE_XSAVE) { fpu_save->sv_xsave_hdr.xsh_xstate_bv |= xstate; } } void fpu_set_default_cw(struct lwp *l, unsigned int x87_cw) { union savefpu *fpu_save = fpu_lwp_area(l); struct pcb *pcb = lwp_getpcb(l); if (i386_use_fxsave) { fpu_save->sv_xmm.fx_cw = x87_cw; if (x87_cw != __INITIAL_NPXCW__) { fpu_xstate_reload(fpu_save, XCR0_X87); } } else { fpu_save->sv_87.s87_cw = x87_cw; } pcb->pcb_fpu_dflt_cw = x87_cw; } void fpu_clear(struct lwp *l, unsigned int x87_cw) { union savefpu *fpu_save; struct pcb *pcb; KASSERT(l == curlwp); fpu_save = fpu_lwp_area(l); switch (x86_fpu_save) { case FPU_SAVE_FSAVE: memset(&fpu_save->sv_87, 0, x86_fpu_save_size); fpu_save->sv_87.s87_tw = 0xffff; fpu_save->sv_87.s87_cw = x87_cw; break; case FPU_SAVE_FXSAVE: memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size); fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__; fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask; fpu_save->sv_xmm.fx_cw = x87_cw; break; case FPU_SAVE_XSAVE: case FPU_SAVE_XSAVEOPT: memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size); fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__; fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask; fpu_save->sv_xmm.fx_cw = x87_cw; if (__predict_false(x87_cw != __INITIAL_NPXCW__)) { fpu_xstate_reload(fpu_save, XCR0_X87); } break; } pcb = lwp_getpcb(l); pcb->pcb_fpu_dflt_cw = x87_cw; } void fpu_sigreset(struct lwp *l) { union savefpu *fpu_save = fpu_lwp_area(l); struct pcb *pcb = lwp_getpcb(l); /* * For signal handlers the register values don't matter. Just reset * a few fields. 
*/ if (i386_use_fxsave) { fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__; fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask; fpu_save->sv_xmm.fx_tw = 0; fpu_save->sv_xmm.fx_cw = pcb->pcb_fpu_dflt_cw; } else { fpu_save->sv_87.s87_tw = 0xffff; fpu_save->sv_87.s87_cw = pcb->pcb_fpu_dflt_cw; } } void process_write_fpregs_xmm(struct lwp *l, const struct fxsave *fpregs) { union savefpu *fpu_save = fpu_lwp_area(l); if (i386_use_fxsave) { memcpy(&fpu_save->sv_xmm, fpregs, sizeof(fpu_save->sv_xmm)); /* * Invalid bits in mxcsr or mxcsr_mask will cause faults. */ fpu_save->sv_xmm.fx_mxcsr_mask &= x86_fpu_mxcsr_mask; fpu_save->sv_xmm.fx_mxcsr &= fpu_save->sv_xmm.fx_mxcsr_mask; fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE); } else { process_xmm_to_s87(fpregs, &fpu_save->sv_87); } } void process_write_fpregs_s87(struct lwp *l, const struct save87 *fpregs) { union savefpu *fpu_save = fpu_lwp_area(l); if (i386_use_fxsave) { process_s87_to_xmm(fpregs, &fpu_save->sv_xmm); fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE); } else { memcpy(&fpu_save->sv_87, fpregs, sizeof(fpu_save->sv_87)); } } void process_read_fpregs_xmm(struct lwp *l, struct fxsave *fpregs) { union savefpu *fpu_save = fpu_lwp_area(l); if (i386_use_fxsave) { memcpy(fpregs, &fpu_save->sv_xmm, sizeof(fpu_save->sv_xmm)); } else { memset(fpregs, 0, sizeof(*fpregs)); process_s87_to_xmm(&fpu_save->sv_87, fpregs); } } void process_read_fpregs_s87(struct lwp *l, struct save87 *fpregs) { union savefpu *fpu_save = fpu_lwp_area(l); if (i386_use_fxsave) { memset(fpregs, 0, sizeof(*fpregs)); process_xmm_to_s87(&fpu_save->sv_xmm, fpregs); } else { memcpy(fpregs, &fpu_save->sv_87, sizeof(fpu_save->sv_87)); } } int process_read_xstate(struct lwp *l, struct xstate *xstate) { union savefpu *fpu_save = fpu_lwp_area(l); if (x86_fpu_save == FPU_SAVE_FSAVE) { /* Convert from legacy FSAVE format. */ memset(&xstate->xs_fxsave, 0, sizeof(xstate->xs_fxsave)); process_s87_to_xmm(&fpu_save->sv_87, &xstate->xs_fxsave); /* We only got x87 data. */ xstate->xs_rfbm = XCR0_X87; xstate->xs_xstate_bv = XCR0_X87; return 0; } /* Copy the legacy area. */ memcpy(&xstate->xs_fxsave, fpu_save->sv_xsave_hdr.xsh_fxsave, sizeof(xstate->xs_fxsave)); if (x86_fpu_save == FPU_SAVE_FXSAVE) { /* FXSAVE means we've got x87 + SSE data. */ xstate->xs_rfbm = XCR0_X87 | XCR0_SSE; xstate->xs_xstate_bv = XCR0_X87 | XCR0_SSE; return 0; } /* Copy the bitmap indicating which states are available. 
*/ xstate->xs_rfbm = x86_xsave_features & XCR0_FPU; xstate->xs_xstate_bv = fpu_save->sv_xsave_hdr.xsh_xstate_bv; KASSERT(!(xstate->xs_xstate_bv & ~xstate->xs_rfbm)); #define COPY_COMPONENT(xcr0_val, xsave_val, field) \ if (xstate->xs_xstate_bv & xcr0_val) { \ KASSERT(x86_xsave_offsets[xsave_val] \ >= sizeof(struct xsave_header)); \ KASSERT(x86_xsave_sizes[xsave_val] \ >= sizeof(xstate->field)); \ memcpy(&xstate->field, \ (char*)fpu_save + x86_xsave_offsets[xsave_val], \ sizeof(xstate->field)); \ } COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128); COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask); COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256); COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm); #undef COPY_COMPONENT return 0; } int process_verify_xstate(const struct xstate *xstate) { /* xstate_bv must be a subset of RFBM */ if (xstate->xs_xstate_bv & ~xstate->xs_rfbm) return EINVAL; switch (x86_fpu_save) { case FPU_SAVE_FSAVE: if ((xstate->xs_rfbm & ~XCR0_X87)) return EINVAL; break; case FPU_SAVE_FXSAVE: if ((xstate->xs_rfbm & ~(XCR0_X87 | XCR0_SSE))) return EINVAL; break; default: /* Verify whether no unsupported features are enabled */ if ((xstate->xs_rfbm & ~(x86_xsave_features & XCR0_FPU)) != 0) return EINVAL; } return 0; } int process_write_xstate(struct lwp *l, const struct xstate *xstate) { union savefpu *fpu_save = fpu_lwp_area(l); /* Convert data into legacy FSAVE format. */ if (x86_fpu_save == FPU_SAVE_FSAVE) { if (xstate->xs_xstate_bv & XCR0_X87) process_xmm_to_s87(&xstate->xs_fxsave, &fpu_save->sv_87); return 0; } /* If XSAVE is supported, make sure that xstate_bv is set correctly. */ if (x86_fpu_save >= FPU_SAVE_XSAVE) { /* * Bit-wise "xstate->xs_rfbm ? xstate->xs_xstate_bv : * fpu_save->sv_xsave_hdr.xsh_xstate_bv" */ fpu_save->sv_xsave_hdr.xsh_xstate_bv = (fpu_save->sv_xsave_hdr.xsh_xstate_bv & ~xstate->xs_rfbm) | xstate->xs_xstate_bv; } if (xstate->xs_xstate_bv & XCR0_X87) { /* * X87 state is split into two areas, interspersed with SSE * data. */ memcpy(&fpu_save->sv_xmm, &xstate->xs_fxsave, 24); memcpy(fpu_save->sv_xmm.fx_87_ac, xstate->xs_fxsave.fx_87_ac, sizeof(xstate->xs_fxsave.fx_87_ac)); } /* * Copy MXCSR if either SSE or AVX state is requested, to match the * XSAVE behavior for those flags. */ if (xstate->xs_xstate_bv & (XCR0_SSE|XCR0_YMM_Hi128)) { /* * Invalid bits in mxcsr or mxcsr_mask will cause faults. */ fpu_save->sv_xmm.fx_mxcsr_mask = xstate->xs_fxsave.fx_mxcsr_mask & x86_fpu_mxcsr_mask; fpu_save->sv_xmm.fx_mxcsr = xstate->xs_fxsave.fx_mxcsr & fpu_save->sv_xmm.fx_mxcsr_mask; } if (xstate->xs_xstate_bv & XCR0_SSE) { memcpy(&fpu_save->sv_xsave_hdr.xsh_fxsave[160], xstate->xs_fxsave.fx_xmm, sizeof(xstate->xs_fxsave.fx_xmm)); } #define COPY_COMPONENT(xcr0_val, xsave_val, field) \ if (xstate->xs_xstate_bv & xcr0_val) { \ KASSERT(x86_xsave_offsets[xsave_val] \ >= sizeof(struct xsave_header)); \ KASSERT(x86_xsave_sizes[xsave_val] \ >= sizeof(xstate->field)); \ memcpy((char *)fpu_save + x86_xsave_offsets[xsave_val], \ &xstate->field, sizeof(xstate->field)); \ } COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128); COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask); COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256); COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm); #undef COPY_COMPONENT return 0; }
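/*
 * Illustrative sketch (not part of the NetBSD source above).  The FPE_xxx1()
 * macro chain that builds fpetable[] in fpu.c encodes a fixed precedence
 * among the seven x87 status-word exception bits, so that fputrap() can hand
 * the signal handler exactly one FPE_* code.  The stand-alone function below
 * restates that precedence in plain C for a single status value, using the
 * standard x87 status-word bit positions and the FPE_* codes from
 * <signal.h>.  All names prefixed "demo_" are hypothetical and exist only
 * for this example.
 */
#include <signal.h>

#define DEMO_SW_IE	0x01	/* invalid operation */
#define DEMO_SW_DE	0x02	/* denormal operand */
#define DEMO_SW_ZE	0x04	/* divide by zero */
#define DEMO_SW_OE	0x08	/* overflow */
#define DEMO_SW_UE	0x10	/* underflow */
#define DEMO_SW_PE	0x20	/* precision (inexact) */
#define DEMO_SW_SF	0x40	/* stack fault */

static int
demo_fpe_code(unsigned sw)
{
	/* Invalid operation wins; a stack fault refines it to FLTSUB. */
	if (sw & DEMO_SW_IE)
		return (sw & DEMO_SW_SF) ? FPE_FLTSUB : FPE_FLTINV;
	if (sw & DEMO_SW_ZE)
		return FPE_FLTDIV;
	if (sw & DEMO_SW_DE)
		return FPE_FLTUND;
	if (sw & DEMO_SW_OE)
		return FPE_FLTOVF;
	if (sw & DEMO_SW_UE)
		return FPE_FLTUND;
	if (sw & DEMO_SW_PE)
		return FPE_FLTRES;
	if (sw & DEMO_SW_SF)
		return FPE_FLTSUB;
	return 0;	/* no unmasked exception pending */
}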
/* $NetBSD: uvm_readahead.c,v 1.16 2023/09/23 18:21:12 ad Exp $ */ /*- * Copyright (c)2003, 2005, 2009 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * uvm_object read-ahead * * TODO: * - tune. * - handle multiple streams. * - find a better way to deal with PGO_LOCKED pager requests. * (currently just ignored) * - consider the amount of memory in the system. * - consider the speed of the underlying device. * - consider filesystem block size / block layout. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_readahead.c,v 1.16 2023/09/23 18:21:12 ad Exp $"); #include <sys/param.h> #include <sys/kmem.h> #include <uvm/uvm.h> #include <uvm/uvm_readahead.h> #if defined(READAHEAD_DEBUG) #define DPRINTF(a) printf a #else /* defined(READAHEAD_DEBUG) */ #define DPRINTF(a) /* nothing */ #endif /* defined(READAHEAD_DEBUG) */ /* * uvm_ractx: read-ahead context.
*/ struct uvm_ractx { int ra_flags; #define RA_VALID 1 off_t ra_winstart; /* window start offset */ size_t ra_winsize; /* window size */ off_t ra_next; /* next offset to read-ahead */ }; #if defined(sun2) || defined(sun3) /* XXX: on sun2 and sun3 MAXPHYS is 0xe000 */ #undef MAXPHYS #define MAXPHYS 0x8000 /* XXX */ #endif #define RA_WINSIZE_INIT MAXPHYS /* initial window size */ #define RA_WINSIZE_MAX (MAXPHYS * 16) /* max window size */ #define RA_WINSIZE_SEQENTIAL RA_WINSIZE_MAX /* fixed window size used for SEQUENTIAL hint */ #define RA_MINSIZE (MAXPHYS * 2) /* min size to start i/o */ #define RA_IOCHUNK MAXPHYS /* read-ahead i/o chunk size */ static off_t ra_startio(struct uvm_object *, off_t, size_t); static struct uvm_ractx *ra_allocctx(void); static void ra_freectx(struct uvm_ractx *); /* * uvm_ra_init: initialize readahead module. */ void uvm_ra_init(void) { } static struct uvm_ractx * ra_allocctx(void) { return kmem_alloc(sizeof(struct uvm_ractx), KM_NOSLEEP); } static void ra_freectx(struct uvm_ractx *ra) { kmem_free(ra, sizeof(struct uvm_ractx)); } /* * ra_startio: start i/o for read-ahead. * * => start i/o for each RA_IOCHUNK sized chunk. * => return offset to which we started i/o. */ static off_t ra_startio(struct uvm_object *uobj, off_t off, size_t sz) { const off_t endoff = off + sz; DPRINTF(("%s: uobj=%p, off=%" PRIu64 ", endoff=%" PRIu64 "\n", __func__, uobj, off, endoff)); KASSERT(rw_write_held(uobj->vmobjlock)); /* * Don't issue read-ahead if the last page of the range is already cached. * The assumption is that since the access is sequential, the intermediate * pages would have similar LRU stats, and hence likely to be still in cache * too. This speeds up I/O using cache, since it avoids lookups and temporary * allocations done by full pgo_get. */ struct vm_page *pg = uvm_pagelookup(uobj, trunc_page(endoff - 1)); if (pg != NULL) { DPRINTF(("%s: off=%" PRIu64 ", sz=%zu already cached\n", __func__, off, sz)); return endoff; } off = trunc_page(off); while (off < endoff) { const size_t chunksize = RA_IOCHUNK; int error; size_t donebytes; int npages; int orignpages; size_t bytelen; KASSERT((chunksize & (chunksize - 1)) == 0); KASSERT((off & PAGE_MASK) == 0); bytelen = ((off + chunksize) & -(off_t)chunksize) - off; KASSERT((bytelen & PAGE_MASK) == 0); npages = orignpages = bytelen >> PAGE_SHIFT; KASSERT(npages != 0); /* * use UVM_ADV_RANDOM to avoid recursion. */ error = (*uobj->pgops->pgo_get)(uobj, off, NULL, &npages, 0, VM_PROT_READ, UVM_ADV_RANDOM, PGO_NOTIMESTAMP); rw_enter(uobj->vmobjlock, RW_WRITER); DPRINTF(("%s: off=%" PRIu64 ", bytelen=%zu -> %d\n", __func__, off, bytelen, error)); if (error != 0 && error != EBUSY) { if (error != EINVAL) { /* maybe past EOF */ DPRINTF(("%s: error=%d\n", __func__, error)); } break; } KASSERT(orignpages == npages); donebytes = orignpages << PAGE_SHIFT; off += donebytes; } return off; } /* ------------------------------------------------------------ */ /* * uvm_ra_allocctx: allocate a context. */ struct uvm_ractx * uvm_ra_allocctx(void) { struct uvm_ractx *ra; ra = ra_allocctx(); if (ra != NULL) { ra->ra_flags = 0; } return ra; } /* * uvm_ra_freectx: free a context. */ void uvm_ra_freectx(struct uvm_ractx *ra) { KASSERT(ra != NULL); ra_freectx(ra); } /* * uvm_ra_request: update a read-ahead context and start i/o if appropriate. * * => called when [reqoff, reqoff+reqsize) is requested. * => object must be locked by caller, will return locked. 
*/ void uvm_ra_request(struct uvm_ractx *ra, int advice, struct uvm_object *uobj, off_t reqoff, size_t reqsize) { KASSERT(rw_write_held(uobj->vmobjlock)); if (ra == NULL || advice == UVM_ADV_RANDOM) { return; } if (advice == UVM_ADV_SEQUENTIAL) { /* * always do read-ahead with a large window. */ if ((ra->ra_flags & RA_VALID) == 0) { ra->ra_winstart = ra->ra_next = 0; ra->ra_flags |= RA_VALID; } if (reqoff < ra->ra_winstart) { ra->ra_next = reqoff; } ra->ra_winsize = RA_WINSIZE_SEQENTIAL; goto do_readahead; } /* * a request with UVM_ADV_NORMAL hint. (ie. no hint) * * we keep a sliding window in order to determine: * - if the previous read-ahead was successful or not. * - how many bytes to read-ahead. */ /* * if it's the first request for this context, * initialize context and return. */ if ((ra->ra_flags & RA_VALID) == 0) { initialize: ra->ra_winstart = ra->ra_next = reqoff + reqsize; ra->ra_winsize = RA_WINSIZE_INIT; ra->ra_flags |= RA_VALID; goto done; } /* * if it isn't in our window, * initialize context and return. * (read-ahead miss) */ if (reqoff < ra->ra_winstart || ra->ra_winstart + ra->ra_winsize < reqoff) { /* * ... unless we seem to be reading the same chunk repeatedly. * * XXX should have some margin? */ if (reqoff + reqsize == ra->ra_winstart) { DPRINTF(("%s: %p: same block: off=%" PRIu64 ", size=%zd, winstart=%" PRIu64 "\n", __func__, ra, reqoff, reqsize, ra->ra_winstart)); goto done; } goto initialize; } /* * it's in our window. (read-ahead hit) * - start read-ahead i/o if appropriate. * - advance and enlarge window. */ do_readahead: /* * don't bother to read-ahead behind current request. */ if (reqoff > ra->ra_next) { ra->ra_next = reqoff; } /* * try to make [reqoff, reqoff+ra_winsize) in-core. * note that [reqoff, ra_next) is considered already done. */ if (reqoff + ra->ra_winsize > ra->ra_next) { off_t raoff = MAX(reqoff, ra->ra_next); size_t rasize = reqoff + ra->ra_winsize - ra->ra_next; #if defined(DIAGNOSTIC) if (rasize > RA_WINSIZE_MAX) { printf("%s: corrupted context", __func__); rasize = RA_WINSIZE_MAX; } #endif /* defined(DIAGNOSTIC) */ /* * issue read-ahead only if we can start big enough i/o. * otherwise we end up with a stream of small i/o. */ if (rasize >= RA_MINSIZE) { off_t next; next = ra_startio(uobj, raoff, rasize); ra->ra_next = next; } } /* * update window. * * enlarge window by reqsize, so that it grows in a predictable manner * regardless of the size of each read(2). */ ra->ra_winstart = reqoff + reqsize; ra->ra_winsize = MIN(RA_WINSIZE_MAX, ra->ra_winsize + reqsize); done:; } int uvm_readahead(struct uvm_object *uobj, off_t off, off_t size) { /* * don't allow too much read-ahead. */ if (size > RA_WINSIZE_MAX) { size = RA_WINSIZE_MAX; } rw_enter(uobj->vmobjlock, RW_WRITER); ra_startio(uobj, off, size); rw_exit(uobj->vmobjlock); return 0; }
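/*
 * Illustrative sketch (not part of the NetBSD source above).  This is a
 * user-space model of the UVM_ADV_NORMAL window arithmetic in
 * uvm_ra_request(): the window advances to the end of each request and
 * grows by the request size up to the maximum, and read-ahead i/o is issued
 * only once the outstanding span reaches the minimum size.  The
 * UVM_ADV_SEQUENTIAL path and the repeated-same-block special case are
 * omitted, ra_startio() is replaced by a printf, and MAXPHYS is assumed to
 * be 64 KiB.  Everything prefixed "demo_"/"DEMO_" is hypothetical.
 */
#include <sys/types.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_MAXPHYS		(64 * 1024)
#define DEMO_WINSIZE_INIT	DEMO_MAXPHYS
#define DEMO_WINSIZE_MAX	(DEMO_MAXPHYS * 16)
#define DEMO_MINSIZE		(DEMO_MAXPHYS * 2)

struct demo_ractx {
	int	valid;
	off_t	winstart;	/* window start offset */
	size_t	winsize;	/* window size */
	off_t	next;		/* next offset to read ahead */
};

static void
demo_request(struct demo_ractx *ra, off_t reqoff, size_t reqsize)
{
	if (!ra->valid) {
		/* first request: open a window just past it, no i/o yet */
		ra->winstart = ra->next = reqoff + reqsize;
		ra->winsize = DEMO_WINSIZE_INIT;
		ra->valid = 1;
	} else if (reqoff >= ra->winstart &&
	    reqoff <= ra->winstart + (off_t)ra->winsize) {
		/* window hit: don't read ahead behind the request */
		if (reqoff > ra->next)
			ra->next = reqoff;
		if (reqoff + (off_t)ra->winsize > ra->next) {
			size_t rasize = reqoff + ra->winsize - ra->next;

			/* issue i/o only once it is big enough */
			if (rasize >= DEMO_MINSIZE) {
				printf("  start i/o at %jd, %zu bytes\n",
				    (intmax_t)ra->next, rasize);
				ra->next += rasize;
			}
		}
		/* advance and grow the window by the request size */
		ra->winstart = reqoff + reqsize;
		ra->winsize += reqsize;
		if (ra->winsize > DEMO_WINSIZE_MAX)
			ra->winsize = DEMO_WINSIZE_MAX;
	} else {
		/* miss: restart the window */
		ra->valid = 0;
		demo_request(ra, reqoff, reqsize);
	}
}

int
main(void)
{
	struct demo_ractx ra = { 0 };
	off_t off;

	/* a strictly sequential reader, one MAXPHYS-sized read at a time */
	for (off = 0; off < 8 * DEMO_MAXPHYS; off += DEMO_MAXPHYS) {
		printf("read at %jd\n", (intmax_t)off);
		demo_request(&ra, off, DEMO_MAXPHYS);
	}
	return 0;
}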
/* $NetBSD: statvfs.h,v 1.5 2024/01/19 18:39:15 christos Exp $ */ /*- * Copyright (c) 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christos Zoulas. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #ifndef _COMPAT_SYS_STATVFS_H_ #define _COMPAT_SYS_STATVFS_H_ #include <sys/statvfs.h> struct statvfs90 { unsigned long f_flag; /* copy of mount exported flags */ unsigned long f_bsize; /* file system block size */ unsigned long f_frsize; /* fundamental file system block size */ unsigned long f_iosize; /* optimal file system block size */ /* The following are in units of f_frsize */ fsblkcnt_t f_blocks; /* number of blocks in file system, */ fsblkcnt_t f_bfree; /* free blocks avail in file system */ fsblkcnt_t f_bavail; /* free blocks avail to non-root */ fsblkcnt_t f_bresvd; /* blocks reserved for root */ fsfilcnt_t f_files; /* total file nodes in file system */ fsfilcnt_t f_ffree; /* free file nodes in file system */ fsfilcnt_t f_favail; /* free file nodes avail to non-root */ fsfilcnt_t f_fresvd; /* file nodes reserved for root */ uint64_t f_syncreads; /* count of sync reads since mount */ uint64_t f_syncwrites; /* count of sync writes since mount */ uint64_t f_asyncreads; /* count of async reads since mount */ uint64_t f_asyncwrites; /* count of async writes since mount */ fsid_t f_fsidx; /* NetBSD compatible fsid */ unsigned long f_fsid; /* Posix compatible fsid */ unsigned long f_namemax; /* maximum filename length */ uid_t f_owner; /* user that mounted the file system */ uint32_t f_spare[4]; /* spare space */ char f_fstypename[_VFS_NAMELEN]; /* fs type name */ char f_mntonname[_VFS_MNAMELEN]; /* directory on which mounted */ char f_mntfromname[_VFS_MNAMELEN]; /* mounted file system */ }; __BEGIN_DECLS #ifndef _KERNEL #include <string.h> #endif static __inline void statvfs_to_statvfs90(const struct statvfs *s, struct statvfs90 *s90) { memset(s90, 0, sizeof(*s90)); s90->f_flag = s->f_flag; s90->f_bsize = s->f_bsize; s90->f_frsize = s->f_frsize; s90->f_iosize = s->f_iosize; s90->f_blocks = s->f_blocks; s90->f_bfree = s->f_bfree; s90->f_bavail = s->f_bavail; s90->f_bresvd = s->f_bresvd; s90->f_files = s->f_files; s90->f_ffree = s->f_ffree; s90->f_favail = s->f_favail; s90->f_fresvd = s->f_fresvd; s90->f_syncreads = s->f_syncreads; s90->f_syncwrites = s->f_syncwrites; s90->f_asyncreads = s->f_asyncreads; s90->f_asyncwrites = s->f_asyncwrites; s90->f_fsidx = s->f_fsidx; s90->f_fsid = s->f_fsid; s90->f_namemax = s->f_namemax; s90->f_owner = s->f_owner; memcpy(s90->f_fstypename, s->f_fstypename, sizeof(s90->f_fstypename)); memcpy(s90->f_mntonname, s->f_mntonname, sizeof(s90->f_mntonname)); memcpy(s90->f_mntfromname, s->f_mntfromname, sizeof(s90->f_mntfromname)); } #ifdef _KERNEL static __inline int statvfs_to_statvfs90_copy(const void *vs, void *vs90, size_t l) { struct statvfs90 *s90 = kmem_zalloc(sizeof(*s90), KM_SLEEP); int error; statvfs_to_statvfs90(vs, s90); error = copyout(s90, vs90, sizeof(*s90)); kmem_free(s90, sizeof(*s90)); return error; } #else #ifdef __LIBC12_SOURCE__ int __compat_statvfs(const char *__restrict, struct statvfs90 *__restrict); int __compat_statvfs1(const char *__restrict, struct statvfs90 *__restrict, int); int __compat_fstatvfs(int, struct statvfs90 *); int __compat_fstatvfs1(int, struct statvfs90 *, int); int __compat___getmntinfo13(struct statvfs90 **, int); int __compat___fhstatvfs40(const void *, size_t, struct statvfs90 *); int __compat___fhstatvfs140(const void *, size_t, struct statvfs90 *, int); int __compat_getvfsstat(struct statvfs90 *, size_t, int); int __statvfs90(const char *__restrict, struct statvfs *__restrict); int __statvfs190(const char *__restrict, struct statvfs *__restrict, int); int __fstatvfs90(int, struct statvfs *); int 
__fstatvfs190(int, struct statvfs *, int); int __fhstatvfs90(const void *, size_t, struct statvfs *); int __fhstatvfs190(const void *, size_t, struct statvfs *, int); int __getvfsstat90(struct statvfs *, size_t, int); int __getmntinfo90(struct statvfs **, int); struct compat_30_fhandle; int fhstatvfs(const struct compat_30_fhandle *, struct statvfs90 *); int fhstatvfs1(const struct compat_30_fhandle *, struct statvfs90 *, int); #endif /* __LIBC12_SOURCE__ */ #endif /* _KERNEL */ __END_DECLS #endif /* !_COMPAT_SYS_STATVFS_H_ */
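/*
 * Illustrative sketch (not part of the header above).  One plausible shape
 * of a userland compat wrapper built on the declarations in this header: an
 * old binary linked against the pre-9.0 ABI calls statvfs() expecting the
 * struct statvfs90 layout, so the wrapper fetches the current structure via
 * __statvfs90() and narrows it with statvfs_to_statvfs90().  The actual
 * symbol renaming done by libc and any extra error handling are omitted;
 * this is only a demonstration of how the conversion helper is meant to be
 * used.
 */
#ifdef __LIBC12_SOURCE__
int
__compat_statvfs(const char *__restrict path, struct statvfs90 *__restrict s90)
{
	struct statvfs s;

	/* fetch the current (wide) structure from the kernel */
	if (__statvfs90(path, &s) == -1)
		return -1;

	/* narrow it to the old layout expected by the caller */
	statvfs_to_statvfs90(&s, s90);
	return 0;
}
#endif /* __LIBC12_SOURCE__ */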
/* $NetBSD: udp_usrreq.c,v 1.264 2022/11/04 09:00:58 ozaki-r Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ /* * UDP protocol implementation. * Per RFC 768, August, 1980. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: udp_usrreq.c,v 1.264 2022/11/04 09:00:58 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_inet_csum.h" #include "opt_mbuftrace.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/mbuf.h> #include <sys/once.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/domain.h> #include <sys/sysctl.h> #include <net/if.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/ip_var.h> #include <netinet/ip_icmp.h> #include <netinet/udp.h> #include <netinet/udp_var.h> #include <netinet/udp_private.h> #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/ip6_private.h> #include <netinet6/in6_pcb.h> #include <netinet6/udp6_var.h> #include <netinet6/udp6_private.h> #endif #ifndef INET6 #include <netinet/ip6.h> #endif #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/esp.h> #endif int udpcksum = 1; int udp_do_loopback_cksum = 0; struct inpcbtable udbtable; percpu_t *udpstat_percpu; #ifdef INET #ifdef IPSEC static int udp4_espinudp(struct mbuf **, int); #endif static void udp4_sendup(struct mbuf *, int, struct sockaddr *, struct socket *); static int udp4_realinput(struct sockaddr_in *, struct sockaddr_in *, struct mbuf **, int); static int udp4_input_checksum(struct mbuf *, const struct udphdr *, int, int); #endif #ifdef INET static void udp_notify (struct inpcb *, int); #endif #ifndef UDBHASHSIZE #define UDBHASHSIZE 128 #endif int udbhashsize = UDBHASHSIZE; /* * For send - really max datagram size; for receive - 40 1K datagrams. */ static int udp_sendspace = 9216; static int udp_recvspace = 40 * (1024 + sizeof(struct sockaddr_in)); #ifdef MBUFTRACE struct mowner udp_mowner = MOWNER_INIT("udp", ""); struct mowner udp_rx_mowner = MOWNER_INIT("udp", "rx"); struct mowner udp_tx_mowner = MOWNER_INIT("udp", "tx"); #endif #ifdef UDP_CSUM_COUNTERS #include <sys/device.h> #if defined(INET) struct evcnt udp_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "udp", "hwcsum bad"); struct evcnt udp_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "udp", "hwcsum ok"); struct evcnt udp_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "udp", "hwcsum data"); struct evcnt udp_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "udp", "swcsum"); EVCNT_ATTACH_STATIC(udp_hwcsum_bad); EVCNT_ATTACH_STATIC(udp_hwcsum_ok); EVCNT_ATTACH_STATIC(udp_hwcsum_data); EVCNT_ATTACH_STATIC(udp_swcsum); #endif /* defined(INET) */ #define UDP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++ #else #define UDP_CSUM_COUNTER_INCR(ev) /* nothing */ #endif /* UDP_CSUM_COUNTERS */ static void sysctl_net_inet_udp_setup(struct sysctllog **); static int do_udpinit(void) { inpcb_init(&udbtable, udbhashsize, udbhashsize); udpstat_percpu = percpu_alloc(sizeof(uint64_t) * UDP_NSTATS); MOWNER_ATTACH(&udp_tx_mowner); MOWNER_ATTACH(&udp_rx_mowner); MOWNER_ATTACH(&udp_mowner); return 0; } void udp_init_common(void) { static ONCE_DECL(doudpinit); RUN_ONCE(&doudpinit, do_udpinit); } void udp_init(void) { sysctl_net_inet_udp_setup(NULL); udp_init_common(); } /* * Checksum extended UDP header and data. 
*/ int udp_input_checksum(int af, struct mbuf *m, const struct udphdr *uh, int iphlen, int len) { switch (af) { #ifdef INET case AF_INET: return udp4_input_checksum(m, uh, iphlen, len); #endif #ifdef INET6 case AF_INET6: return udp6_input_checksum(m, uh, iphlen, len); #endif } #ifdef DIAGNOSTIC panic("udp_input_checksum: unknown af %d", af); #endif /* NOTREACHED */ return -1; } #ifdef INET /* * Checksum extended UDP header and data. */ static int udp4_input_checksum(struct mbuf *m, const struct udphdr *uh, int iphlen, int len) { /* * XXX it's better to record and check if this mbuf is * already checked. */ if (uh->uh_sum == 0) return 0; switch (m->m_pkthdr.csum_flags & ((m_get_rcvif_NOMPSAFE(m)->if_csum_flags_rx & M_CSUM_UDPv4) | M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) { case M_CSUM_UDPv4|M_CSUM_TCP_UDP_BAD: UDP_CSUM_COUNTER_INCR(&udp_hwcsum_bad); goto badcsum; case M_CSUM_UDPv4|M_CSUM_DATA: { u_int32_t hw_csum = m->m_pkthdr.csum_data; UDP_CSUM_COUNTER_INCR(&udp_hwcsum_data); if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) { const struct ip *ip = mtod(m, const struct ip *); hw_csum = in_cksum_phdr(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(hw_csum + len + IPPROTO_UDP)); } if ((hw_csum ^ 0xffff) != 0) goto badcsum; break; } case M_CSUM_UDPv4: /* Checksum was okay. */ UDP_CSUM_COUNTER_INCR(&udp_hwcsum_ok); break; default: /* * Need to compute it ourselves. Maybe skip checksum * on loopback interfaces. */ if (__predict_true(!(m_get_rcvif_NOMPSAFE(m)->if_flags & IFF_LOOPBACK) || udp_do_loopback_cksum)) { UDP_CSUM_COUNTER_INCR(&udp_swcsum); if (in4_cksum(m, IPPROTO_UDP, iphlen, len) != 0) goto badcsum; } break; } return 0; badcsum: UDP_STATINC(UDP_STAT_BADSUM); return -1; } void udp_input(struct mbuf *m, int off, int proto) { struct sockaddr_in src, dst; struct ip *ip; struct udphdr *uh; int iphlen = off; int len; int n; u_int16_t ip_len; MCLAIM(m, &udp_rx_mowner); UDP_STATINC(UDP_STAT_IPACKETS); /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); M_REGION_GET(uh, struct udphdr *, m, iphlen, sizeof(struct udphdr)); if (uh == NULL) { UDP_STATINC(UDP_STAT_HDROPS); return; } /* * Enforce alignment requirements that are violated in * some cases, see kern/50766 for details. */ if (ACCESSIBLE_POINTER(uh, struct udphdr) == 0) { m = m_copyup(m, iphlen + sizeof(struct udphdr), 0); if (m == NULL) { UDP_STATINC(UDP_STAT_HDROPS); return; } ip = mtod(m, struct ip *); uh = (struct udphdr *)(mtod(m, char *) + iphlen); } KASSERT(ACCESSIBLE_POINTER(uh, struct udphdr)); /* destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto bad; /* * Make mbuf data length reflect UDP length. * If not enough data to reflect UDP length, drop. */ ip_len = ntohs(ip->ip_len); len = ntohs((u_int16_t)uh->uh_ulen); if (len < sizeof(struct udphdr)) { UDP_STATINC(UDP_STAT_BADLEN); goto bad; } if (ip_len != iphlen + len) { if (ip_len < iphlen + len) { UDP_STATINC(UDP_STAT_BADLEN); goto bad; } m_adj(m, iphlen + len - ip_len); } /* * Checksum extended UDP header and data. */ if (udp4_input_checksum(m, uh, iphlen, len)) goto badcsum; /* construct source and dst sockaddrs. */ sockaddr_in_init(&src, &ip->ip_src, uh->uh_sport); sockaddr_in_init(&dst, &ip->ip_dst, uh->uh_dport); if ((n = udp4_realinput(&src, &dst, &m, iphlen)) == -1) { UDP_STATINC(UDP_STAT_HDROPS); return; } if (m == NULL) { /* * packet has been processed by ESP stuff - * e.g. dropped NAT-T-keep-alive-packet ... 
*/ return; } ip = mtod(m, struct ip *); M_REGION_GET(uh, struct udphdr *, m, iphlen, sizeof(struct udphdr)); if (uh == NULL) { UDP_STATINC(UDP_STAT_HDROPS); return; } /* XXX Re-enforce alignment? */ #ifdef INET6 if (IN_MULTICAST(ip->ip_dst.s_addr) || n == 0) { struct sockaddr_in6 src6, dst6; memset(&src6, 0, sizeof(src6)); src6.sin6_family = AF_INET6; src6.sin6_len = sizeof(struct sockaddr_in6); in6_in_2_v4mapin6(&ip->ip_src, &src6.sin6_addr); src6.sin6_port = uh->uh_sport; memset(&dst6, 0, sizeof(dst6)); dst6.sin6_family = AF_INET6; dst6.sin6_len = sizeof(struct sockaddr_in6); in6_in_2_v4mapin6(&ip->ip_dst, &dst6.sin6_addr); dst6.sin6_port = uh->uh_dport; n += udp6_realinput(AF_INET, &src6, &dst6, &m, iphlen); } #endif if (n == 0) { if (m->m_flags & (M_BCAST | M_MCAST)) { UDP_STATINC(UDP_STAT_NOPORTBCAST); goto bad; } UDP_STATINC(UDP_STAT_NOPORT); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); m = NULL; } bad: if (m) m_freem(m); return; badcsum: m_freem(m); } #endif #ifdef INET static void udp4_sendup(struct mbuf *m, int off /* offset of data portion */, struct sockaddr *src, struct socket *so) { struct mbuf *opts = NULL; struct mbuf *n; struct inpcb *inp; KASSERT(so != NULL); KASSERT(so->so_proto->pr_domain->dom_family == AF_INET); inp = sotoinpcb(so); KASSERT(inp != NULL); #if defined(IPSEC) if (ipsec_used && ipsec_in_reject(m, inp)) { if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) icmp_error(n, ICMP_UNREACH, ICMP_UNREACH_ADMIN_PROHIBIT, 0, 0); return; } #endif if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) { if (inp->inp_flags & INP_CONTROLOPTS || SOOPT_TIMESTAMP(so->so_options)) { struct ip *ip = mtod(n, struct ip *); ip_savecontrol(inp, &opts, ip, n); } m_adj(n, off); if (sbappendaddr(&so->so_rcv, src, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); UDP_STATINC(UDP_STAT_FULLSOCK); soroverflow(so); } else sorwakeup(so); } } #endif #ifdef INET static int udp4_realinput(struct sockaddr_in *src, struct sockaddr_in *dst, struct mbuf **mp, int off /* offset of udphdr */) { u_int16_t *sport, *dport; int rcvcnt; struct in_addr *src4, *dst4; struct inpcb *inp; struct mbuf *m = *mp; rcvcnt = 0; off += sizeof(struct udphdr); /* now, offset of payload */ if (src->sin_family != AF_INET || dst->sin_family != AF_INET) goto bad; src4 = &src->sin_addr; sport = &src->sin_port; dst4 = &dst->sin_addr; dport = &dst->sin_port; if (IN_MULTICAST(dst4->s_addr) || in_broadcast(*dst4, m_get_rcvif_NOMPSAFE(m))) { /* * Deliver a multicast or broadcast datagram to *all* sockets * for which the local and remote addresses and ports match * those of the incoming datagram. This allows more than * one process to receive multi/broadcasts on the same port. * (This really ought to be done for unicast datagrams as * well, but that would cause problems with existing * applications that open both address-specific sockets and * a wildcard socket listening to the same port -- they would * end up receiving duplicates of every unicast datagram. * Those applications open the multiple sockets to overcome an * inadequacy of the UDP socket interface, but for backwards * compatibility we avoid the problem here rather than * fixing the interface. Maybe 4.5BSD will remedy this?) */ /* * KAME note: traditionally we dropped udpiphdr from mbuf here. * we need udpiphdr for IPsec processing so we do that later. */ /* * Locate pcb(s) for datagram. 
*/ TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) { if (inp->inp_af != AF_INET) continue; if (inp->inp_lport != *dport) continue; if (!in_nullhost(in4p_laddr(inp))) { if (!in_hosteq(in4p_laddr(inp), *dst4)) continue; } if (!in_nullhost(in4p_faddr(inp))) { if (!in_hosteq(in4p_faddr(inp), *src4) || inp->inp_fport != *sport) continue; } udp4_sendup(m, off, (struct sockaddr *)src, inp->inp_socket); rcvcnt++; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids searching * through all pcbs in the common case of a non-shared * port. It assumes that an application will never * clear these options after setting them. */ if ((inp->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } } else { /* * Locate pcb for datagram. */ inp = inpcb_lookup(&udbtable, *src4, *sport, *dst4, *dport, 0); if (inp == 0) { UDP_STATINC(UDP_STAT_PCBHASHMISS); inp = inpcb_lookup_bound(&udbtable, *dst4, *dport); if (inp == 0) return rcvcnt; } #ifdef IPSEC /* Handle ESP over UDP */ if (inp->inp_flags & INP_ESPINUDP) { switch (udp4_espinudp(mp, off)) { case -1: /* Error, m was freed */ rcvcnt = -1; goto bad; case 1: /* ESP over UDP */ rcvcnt++; goto bad; case 0: /* plain UDP */ default: /* Unexpected */ /* * Normal UDP processing will take place, * m may have changed. */ m = *mp; break; } } #endif if (inp->inp_overudp_cb != NULL) { int ret; ret = inp->inp_overudp_cb(mp, off, inp->inp_socket, sintosa(src), inp->inp_overudp_arg); switch (ret) { case -1: /* Error, m was freed */ rcvcnt = -1; goto bad; case 1: /* Foo over UDP */ KASSERT(*mp == NULL); rcvcnt++; goto bad; case 0: /* plain UDP */ default: /* Unexpected */ /* * Normal UDP processing will take place, * m may have changed. */ m = *mp; break; } } /* * Check the minimum TTL for socket. */ if (mtod(m, struct ip *)->ip_ttl < in4p_ip_minttl(inp)) goto bad; udp4_sendup(m, off, (struct sockaddr *)src, inp->inp_socket); rcvcnt++; } bad: return rcvcnt; } #endif #ifdef INET /* * Notify a udp user of an asynchronous error; * just wake up so that he can collect error status. 
*/ static void udp_notify(struct inpcb *inp, int errno) { inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); } void * udp_ctlinput(int cmd, const struct sockaddr *sa, void *v) { struct ip *ip = v; struct udphdr *uh; void (*notify)(struct inpcb *, int) = udp_notify; int errno; if (sa->sa_family != AF_INET || sa->sa_len != sizeof(struct sockaddr_in)) return NULL; if ((unsigned)cmd >= PRC_NCMDS) return NULL; errno = inetctlerrmap[cmd]; if (PRC_IS_REDIRECT(cmd)) { notify = inpcb_rtchange; ip = NULL; } else if (cmd == PRC_HOSTDEAD) { ip = NULL; } else if (errno == 0) { return NULL; } if (ip) { uh = (struct udphdr *)((char *)ip + (ip->ip_hl << 2)); inpcb_notify(&udbtable, satocsin(sa)->sin_addr, uh->uh_dport, ip->ip_src, uh->uh_sport, errno, notify); /* XXX mapped address case */ } else { inpcb_notifyall(&udbtable, satocsin(sa)->sin_addr, errno, notify); } return NULL; } int udp_ctloutput(int op, struct socket *so, struct sockopt *sopt) { int s; int error = 0; struct inpcb *inp; int family; int optval; family = so->so_proto->pr_domain->dom_family; s = splsoftnet(); switch (family) { #ifdef INET case PF_INET: if (sopt->sopt_level != IPPROTO_UDP) { error = ip_ctloutput(op, so, sopt); goto end; } break; #endif #ifdef INET6 case PF_INET6: if (sopt->sopt_level != IPPROTO_UDP) { error = ip6_ctloutput(op, so, sopt); goto end; } break; #endif default: error = EAFNOSUPPORT; goto end; } switch (op) { case PRCO_SETOPT: inp = sotoinpcb(so); switch (sopt->sopt_name) { case UDP_ENCAP: error = sockopt_getint(sopt, &optval); if (error) break; switch(optval) { case 0: inp->inp_flags &= ~INP_ESPINUDP; break; case UDP_ENCAP_ESPINUDP: inp->inp_flags |= INP_ESPINUDP; break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } break; default: error = EINVAL; break; } end: splx(s); return error; } int udp_output(struct mbuf *m, struct inpcb *inp, struct mbuf *control, struct lwp *l) { struct udpiphdr *ui; struct route *ro; struct ip_pktopts pktopts; kauth_cred_t cred; int len = m->m_pkthdr.len; int error, flags = 0; MCLAIM(m, &udp_tx_mowner); /* * Calculate data length and get a mbuf * for UDP and IP headers. */ M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT); if (m == NULL) { error = ENOBUFS; goto release; } /* * Compute the packet length of the IP header, and * punt if the length looks bogus. */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { error = EMSGSIZE; goto release; } if (l == NULL) cred = NULL; else cred = l->l_cred; /* Setup IP outgoing packet options */ memset(&pktopts, 0, sizeof(pktopts)); error = ip_setpktopts(control, &pktopts, &flags, inp, cred); if (error != 0) goto release; if (control != NULL) { m_freem(control); control = NULL; } /* * Fill in mbuf with extended UDP header * and addresses and length put into network format. */ ui = mtod(m, struct udpiphdr *); ui->ui_pr = IPPROTO_UDP; ui->ui_src = pktopts.ippo_laddr.sin_addr; ui->ui_dst = in4p_faddr(inp); ui->ui_sport = inp->inp_lport; ui->ui_dport = inp->inp_fport; ui->ui_ulen = htons((u_int16_t)len + sizeof(struct udphdr)); ro = &inp->inp_route; /* * Set up checksum and output datagram. */ if (udpcksum) { /* * XXX Cache pseudo-header checksum part for * XXX "connected" UDP sockets. 
*/ ui->ui_sum = in_cksum_phdr(ui->ui_src.s_addr, ui->ui_dst.s_addr, htons((u_int16_t)len + sizeof(struct udphdr) + IPPROTO_UDP)); m->m_pkthdr.csum_flags = M_CSUM_UDPv4; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } else ui->ui_sum = 0; ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len); ((struct ip *)ui)->ip_ttl = in4p_ip(inp).ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = in4p_ip(inp).ip_tos; /* XXX */ UDP_STATINC(UDP_STAT_OPACKETS); flags |= inp->inp_socket->so_options & (SO_DONTROUTE|SO_BROADCAST); return ip_output(m, inp->inp_options, ro, flags, pktopts.ippo_imo, inp); release: if (control != NULL) m_freem(control); m_freem(m); return error; } static int udp_attach(struct socket *so, int proto) { struct inpcb *inp; int error; KASSERT(sotoinpcb(so) == NULL); /* Assign the lock (must happen even if we will error out). */ sosetlock(so); #ifdef MBUFTRACE so->so_mowner = &udp_mowner; so->so_rcv.sb_mowner = &udp_rx_mowner; so->so_snd.sb_mowner = &udp_tx_mowner; #endif if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, udp_sendspace, udp_recvspace); if (error) { return error; } } error = inpcb_create(so, &udbtable); if (error) { return error; } inp = sotoinpcb(so); in4p_ip(inp).ip_ttl = ip_defttl; KASSERT(solocked(so)); return error; } static void udp_detach(struct socket *so) { struct inpcb *inp; KASSERT(solocked(so)); inp = sotoinpcb(so); KASSERT(inp != NULL); inpcb_destroy(inp); } static int udp_accept(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); panic("udp_accept"); return EOPNOTSUPP; } static int udp_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in *sin = (struct sockaddr_in *)nam; int error = 0; int s; KASSERT(solocked(so)); KASSERT(inp != NULL); KASSERT(nam != NULL); s = splsoftnet(); error = inpcb_bind(inp, sin, l); splx(s); return error; } static int udp_listen(struct socket *so, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int udp_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct inpcb *inp = sotoinpcb(so); int error = 0; int s; KASSERT(solocked(so)); KASSERT(inp != NULL); KASSERT(nam != NULL); s = splsoftnet(); error = inpcb_connect(inp, (struct sockaddr_in *)nam, l); if (! error) soisconnected(so); splx(s); return error; } static int udp_connect2(struct socket *so, struct socket *so2) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int udp_disconnect(struct socket *so) { struct inpcb *inp = sotoinpcb(so); int s; KASSERT(solocked(so)); KASSERT(inp != NULL); s = splsoftnet(); /*soisdisconnected(so);*/ so->so_state &= ~SS_ISCONNECTED; /* XXX */ inpcb_disconnect(inp); in4p_laddr(inp) = zeroin_addr; /* XXX */ inpcb_set_state(inp, INP_BOUND); /* XXX */ splx(s); return 0; } static int udp_shutdown(struct socket *so) { int s; KASSERT(solocked(so)); s = splsoftnet(); socantsendmore(so); splx(s); return 0; } static int udp_abort(struct socket *so) { KASSERT(solocked(so)); panic("udp_abort"); return EOPNOTSUPP; } static int udp_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp) { return in_control(so, cmd, nam, ifp); } static int udp_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); /* stat: don't bother with a blocksize. 
*/ return 0; } static int udp_peeraddr(struct socket *so, struct sockaddr *nam) { int s; KASSERT(solocked(so)); KASSERT(sotoinpcb(so) != NULL); KASSERT(nam != NULL); s = splsoftnet(); inpcb_fetch_peeraddr(sotoinpcb(so), (struct sockaddr_in *)nam); splx(s); return 0; } static int udp_sockaddr(struct socket *so, struct sockaddr *nam) { int s; KASSERT(solocked(so)); KASSERT(sotoinpcb(so) != NULL); KASSERT(nam != NULL); s = splsoftnet(); inpcb_fetch_sockaddr(sotoinpcb(so), (struct sockaddr_in *)nam); splx(s); return 0; } static int udp_rcvd(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int udp_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } int udp_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct inpcb *inp = sotoinpcb(so); int error = 0; struct in_addr laddr; /* XXX */ int s; KASSERT(solocked(so)); KASSERT(inp != NULL); KASSERT(m != NULL); memset(&laddr, 0, sizeof laddr); s = splsoftnet(); if (nam) { laddr = in4p_laddr(inp); /* XXX */ if ((so->so_state & SS_ISCONNECTED) != 0) { error = EISCONN; goto die; } error = inpcb_connect(inp, (struct sockaddr_in *)nam, l); if (error) goto die; } else { if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto die; } } error = udp_output(m, inp, control, l); m = NULL; control = NULL; if (nam) { inpcb_disconnect(inp); in4p_laddr(inp) = laddr; /* XXX */ inpcb_set_state(inp, INP_BOUND); /* XXX */ } die: if (m != NULL) m_freem(m); if (control != NULL) m_freem(control); splx(s); return error; } static int udp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int udp_purgeif(struct socket *so, struct ifnet *ifp) { int s; s = splsoftnet(); mutex_enter(softnet_lock); inpcb_purgeif0(&udbtable, ifp); #ifdef NET_MPSAFE mutex_exit(softnet_lock); #endif in_purgeif(ifp); #ifdef NET_MPSAFE mutex_enter(softnet_lock); #endif inpcb_purgeif(&udbtable, ifp); mutex_exit(softnet_lock); splx(s); return 0; } static int sysctl_net_inet_udp_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(udpstat_percpu, UDP_NSTATS)); } /* * Sysctl for udp variables. 
*/ static void sysctl_net_inet_udp_setup(struct sysctllog **clog) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "udp", SYSCTL_DESCR("UDPv4 related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_UDP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "checksum", SYSCTL_DESCR("Compute UDP checksums"), NULL, 0, &udpcksum, 0, CTL_NET, PF_INET, IPPROTO_UDP, UDPCTL_CHECKSUM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "sendspace", SYSCTL_DESCR("Default UDP send buffer size"), NULL, 0, &udp_sendspace, 0, CTL_NET, PF_INET, IPPROTO_UDP, UDPCTL_SENDSPACE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "recvspace", SYSCTL_DESCR("Default UDP receive buffer size"), NULL, 0, &udp_recvspace, 0, CTL_NET, PF_INET, IPPROTO_UDP, UDPCTL_RECVSPACE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "do_loopback_cksum", SYSCTL_DESCR("Perform UDP checksum on loopback"), NULL, 0, &udp_do_loopback_cksum, 0, CTL_NET, PF_INET, IPPROTO_UDP, UDPCTL_LOOPBACKCKSUM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "pcblist", SYSCTL_DESCR("UDP protocol control block list"), sysctl_inpcblist, 0, &udbtable, 0, CTL_NET, PF_INET, IPPROTO_UDP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("UDP statistics"), sysctl_net_inet_udp_stats, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_UDP, UDPCTL_STATS, CTL_EOL); } #endif void udp_statinc(u_int stat) { KASSERT(stat < UDP_NSTATS); UDP_STATINC(stat); } #if defined(INET) && defined(IPSEC) /* * Handle ESP-in-UDP packets (RFC3948). * * We need to distinguish between ESP packets and IKE packets. We do so by * looking at the Non-ESP marker. If IKE, we process the UDP packet as usual. * Otherwise, ESP, we invoke IPsec. * * Returns: * 1 if the packet was processed * 0 if normal UDP processing should take place * -1 if an error occurred and m was freed */ static int udp4_espinudp(struct mbuf **mp, int off) { const size_t skip = sizeof(struct udphdr); size_t len; uint8_t *data; size_t minlen; size_t iphdrlen; struct ip *ip; struct m_tag *tag; struct udphdr *udphdr; u_int16_t sport, dport; struct mbuf *m = *mp; uint32_t *marker; minlen = off + sizeof(struct esp); if (minlen > m->m_pkthdr.len) minlen = m->m_pkthdr.len; if (m->m_len < minlen) { if ((*mp = m_pullup(m, minlen)) == NULL) { return -1; } m = *mp; } len = m->m_len - off; data = mtod(m, uint8_t *) + off; /* Ignore keepalive packets. */ if ((len == 1) && (*data == 0xff)) { m_freem(m); *mp = NULL; /* avoid any further processing by caller */ return 1; } /* Handle Non-ESP marker (32bit). If zero, then IKE. */ marker = (uint32_t *)data; if (len <= sizeof(uint32_t)) return 0; if (marker[0] == 0) return 0; /* * Get the UDP ports. They are handled in network order * everywhere in the IPSEC_NAT_T code. */ udphdr = (struct udphdr *)((char *)data - skip); sport = udphdr->uh_sport; dport = udphdr->uh_dport; /* * Remove the UDP header, plus a possible marker. IP header * length is iphdrlen. 
* * Before: * <--- off ---> * +----+------+-----+ * | IP | UDP | ESP | * +----+------+-----+ * <-skip-> * After: * +----+-----+ * | IP | ESP | * +----+-----+ * <-skip-> */ iphdrlen = off - sizeof(struct udphdr); memmove(mtod(m, char *) + skip, mtod(m, void *), iphdrlen); m_adj(m, skip); ip = mtod(m, struct ip *); ip->ip_len = htons(ntohs(ip->ip_len) - skip); ip->ip_p = IPPROTO_ESP; /* * We have modified the packet - it is now ESP, so we should not * return to UDP processing. * * Add a PACKET_TAG_IPSEC_NAT_T_PORTS tag to remember the source * UDP port. This is required if we want to select the right SPD * for multiple hosts behind same NAT. */ if ((tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS, sizeof(sport) + sizeof(dport), M_DONTWAIT)) == NULL) { m_freem(m); return -1; } ((u_int16_t *)(tag + 1))[0] = sport; ((u_int16_t *)(tag + 1))[1] = dport; m_tag_prepend(m, tag); if (ipsec_used) ipsec4_common_input(m, iphdrlen, IPPROTO_ESP); else m_freem(m); /* We handled it, it shouldn't be handled by UDP */ *mp = NULL; /* avoid free by caller ... */ return 1; } #endif PR_WRAP_USRREQS(udp) #define udp_attach udp_attach_wrapper #define udp_detach udp_detach_wrapper #define udp_accept udp_accept_wrapper #define udp_bind udp_bind_wrapper #define udp_listen udp_listen_wrapper #define udp_connect udp_connect_wrapper #define udp_connect2 udp_connect2_wrapper #define udp_disconnect udp_disconnect_wrapper #define udp_shutdown udp_shutdown_wrapper #define udp_abort udp_abort_wrapper #define udp_ioctl udp_ioctl_wrapper #define udp_stat udp_stat_wrapper #define udp_peeraddr udp_peeraddr_wrapper #define udp_sockaddr udp_sockaddr_wrapper #define udp_rcvd udp_rcvd_wrapper #define udp_recvoob udp_recvoob_wrapper #define udp_send udp_send_wrapper #define udp_sendoob udp_sendoob_wrapper #define udp_purgeif udp_purgeif_wrapper const struct pr_usrreqs udp_usrreqs = { .pr_attach = udp_attach, .pr_detach = udp_detach, .pr_accept = udp_accept, .pr_bind = udp_bind, .pr_listen = udp_listen, .pr_connect = udp_connect, .pr_connect2 = udp_connect2, .pr_disconnect = udp_disconnect, .pr_shutdown = udp_shutdown, .pr_abort = udp_abort, .pr_ioctl = udp_ioctl, .pr_stat = udp_stat, .pr_peeraddr = udp_peeraddr, .pr_sockaddr = udp_sockaddr, .pr_rcvd = udp_rcvd, .pr_recvoob = udp_recvoob, .pr_send = udp_send, .pr_sendoob = udp_sendoob, .pr_purgeif = udp_purgeif, };
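/*
 * Illustration, not part of the NetBSD sources above: a minimal userland
 * sketch of how the UDP_ENCAP handling in udp_ctloutput() and
 * udp4_espinudp() is exercised.  An IKE/NAT-T daemon binds an ordinary UDP
 * socket (port 4500 is the conventional NAT-T port, an assumption of this
 * example) and then enables ESP-in-UDP (RFC 3948) decapsulation with
 * setsockopt().  After that, incoming ESP frames are diverted to IPsec by
 * udp4_espinudp(), NAT-T keepalives (a single 0xff byte) are consumed by
 * the kernel, and only IKE packets (Non-ESP marker of zero) arrive on the
 * socket as ordinary UDP data.  Error handling is abbreviated.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>
#include <err.h>
#include <string.h>

static int
open_natt_socket(void)
{
	struct sockaddr_in sin;
	int s, opt;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(sin);
	sin.sin_port = htons(4500);		/* IKE/NAT-T port */
	sin.sin_addr.s_addr = INADDR_ANY;
	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1)
		err(1, "bind");

	/* Sets INP_ESPINUDP on the pcb; see udp_ctloutput() above. */
	opt = UDP_ENCAP_ESPINUDP;
	if (setsockopt(s, IPPROTO_UDP, UDP_ENCAP, &opt, sizeof(opt)) == -1)
		err(1, "setsockopt(UDP_ENCAP)");

	return s;
}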
/*- * Copyright (c) 2009-2013 The NetBSD Foundation, Inc. * All rights reserved. * * This material is based upon work partially supported by The * NetBSD Foundation under a contract with Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * NPF main: dynamic load/initialisation and unload routines.
*/ #ifdef _KERNEL #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: npf.c,v 1.44 2020/08/27 18:50:25 riastradh Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/conf.h> #include <sys/kmem.h> #include <sys/percpu.h> #include <sys/xcall.h> #endif #include "npf_impl.h" #include "npf_conn.h" static __read_mostly npf_t * npf_kernel_ctx = NULL; __dso_public int npfk_sysinit(unsigned nworkers) { npf_bpf_sysinit(); npf_tableset_sysinit(); npf_nat_sysinit(); npf_portmap_sysinit(); return npf_worker_sysinit(nworkers); } __dso_public void npfk_sysfini(void) { npf_worker_sysfini(); npf_portmap_sysfini(); npf_nat_sysfini(); npf_tableset_sysfini(); npf_bpf_sysfini(); } __dso_public npf_t * npfk_create(int flags, const npf_mbufops_t *mbufops, const npf_ifops_t *ifops, void *arg) { npf_t *npf; npf = kmem_zalloc(sizeof(npf_t), KM_SLEEP); npf->ebr = npf_ebr_create(); npf->stats_percpu = percpu_alloc(NPF_STATS_SIZE); npf->mbufops = mbufops; npf->arg = arg; npf_param_init(npf); npf_state_sysinit(npf); npf_ifmap_init(npf, ifops); npf_conn_init(npf); npf_portmap_init(npf); npf_alg_init(npf); npf_ext_init(npf); /* Load an empty configuration. */ npf_config_init(npf); if ((flags & NPF_NO_GC) == 0) { npf_worker_enlist(npf); } return npf; } __dso_public void npfk_destroy(npf_t *npf) { npf_worker_discharge(npf); /* * Destroy the current configuration. Note: at this point all * handlers must be deactivated; we will drain any processing. */ npf_config_fini(npf); /* Finally, safe to destroy the subsystems. */ npf_ext_fini(npf); npf_alg_fini(npf); npf_portmap_fini(npf); npf_conn_fini(npf); npf_ifmap_fini(npf); npf_state_sysfini(npf); npf_param_fini(npf); npf_ebr_destroy(npf->ebr); percpu_free(npf->stats_percpu, NPF_STATS_SIZE); kmem_free(npf, sizeof(npf_t)); } /* * npfk_load: (re)load the configuration. * * => Will not modify the configuration reference. */ __dso_public int npfk_load(npf_t *npf, const void *config_ref, npf_error_t *err) { const nvlist_t *req = (const nvlist_t *)config_ref; nvlist_t *resp; int error; resp = nvlist_create(0); error = npfctl_run_op(npf, IOC_NPF_LOAD, req, resp); nvlist_destroy(resp); return error; } __dso_public void npfk_gc(npf_t *npf) { npf_conn_worker(npf); } __dso_public void npfk_thread_register(npf_t *npf) { npf_ebr_register(npf->ebr); } __dso_public void npfk_thread_unregister(npf_t *npf) { npf_ebr_full_sync(npf->ebr); npf_ebr_unregister(npf->ebr); } __dso_public void * npfk_getarg(npf_t *npf) { return npf->arg; } void npf_setkernctx(npf_t *npf) { npf_kernel_ctx = npf; } npf_t * npf_getkernctx(void) { return npf_kernel_ctx; } /* * NPF statistics interface. */ void npf_stats_inc(npf_t *npf, npf_stats_t st) { uint64_t *stats = percpu_getref(npf->stats_percpu); stats[st]++; percpu_putref(npf->stats_percpu); } void npf_stats_dec(npf_t *npf, npf_stats_t st) { uint64_t *stats = percpu_getref(npf->stats_percpu); stats[st]--; percpu_putref(npf->stats_percpu); } static void npf_stats_collect(void *mem, void *arg, struct cpu_info *ci) { uint64_t *percpu_stats = mem, *full_stats = arg; for (unsigned i = 0; i < NPF_STATS_COUNT; i++) { full_stats[i] += percpu_stats[i]; } } static void npf_stats_clear_cb(void *mem, void *arg, struct cpu_info *ci) { uint64_t *percpu_stats = mem; for (unsigned i = 0; i < NPF_STATS_COUNT; i++) { percpu_stats[i] = 0; } } /* * npf_stats: export collected statistics. 
*/ __dso_public void npfk_stats(npf_t *npf, uint64_t *buf) { memset(buf, 0, NPF_STATS_SIZE); percpu_foreach_xcall(npf->stats_percpu, XC_HIGHPRI_IPL(IPL_SOFTNET), npf_stats_collect, buf); } __dso_public void npfk_stats_clear(npf_t *npf) { percpu_foreach_xcall(npf->stats_percpu, XC_HIGHPRI_IPL(IPL_SOFTNET), npf_stats_clear_cb, NULL); }
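/*
 * Illustration, not part of npf.c above: the call order a standalone
 * embedder of the npfk_*() interface would typically follow.  The
 * my_mbufops/my_ifops structures and the nvlist configuration reference
 * are hypothetical placeholders, and the sketch assumes the stats buffer
 * passed to npfk_stats() holds NPF_STATS_COUNT 64-bit counters; only the
 * call sequence is taken from the code above.  Wrapped in #if 0 so it
 * stays out of any build.
 */
#if 0
static int
npf_embed_example(const npf_mbufops_t *my_mbufops, const npf_ifops_t *my_ifops,
    const nvlist_t *my_config)
{
	uint64_t stats[NPF_STATS_COUNT];
	npf_error_t errinfo;
	npf_t *npf;
	int error;

	npfk_sysinit(1);			/* one worker thread */
	npf = npfk_create(0, my_mbufops, my_ifops, NULL);

	/* Load a configuration; the reference is an nvlist. */
	error = npfk_load(npf, my_config, &errinfo);
	if (error == 0) {
		/* Threads that will inspect packets register with EBR. */
		npfk_thread_register(npf);

		/*
		 * ... packet processing; call npfk_gc(npf) periodically
		 * if the instance was created with NPF_NO_GC ...
		 */

		npfk_stats(npf, stats);		/* sum of per-CPU counters */
		npfk_stats_clear(npf);

		npfk_thread_unregister(npf);
	}
	npfk_destroy(npf);
	npfk_sysfini();
	return error;
}
#endif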
/* $NetBSD: vnd.c,v 1.289 2023/05/19 15:42:43 mlelstv Exp $ */ /*- * Copyright (c) 1996, 1997, 1998, 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vn.c 1.13 94/04/02$ * * @(#)vn.c 8.9 (Berkeley) 5/14/95 */ /* * Vnode disk driver. * * Block/character interface to a vnode. Allows one to treat a file * as a disk (e.g. build a filesystem in it, mount it, etc.). * * NOTE 1: If the vnode supports the VOP_BMAP and VOP_STRATEGY operations, * this uses them to avoid distorting the local buffer cache. If those * block-level operations are not available, this falls back to the regular * read and write calls. Using these may distort the cache in some cases * but better have the driver working than preventing it to work on file * systems where the block-level operations are not implemented for * whatever reason. * * NOTE 2: There is a security issue involved with this driver. 
* Once mounted all access to the contents of the "mapped" file via * the special file is controlled by the permissions on the special * file, the protection of the mapped file is ignored (effectively, * by using root credentials in all transactions). * * NOTE 3: Doesn't interact with leases, should it? */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vnd.c,v 1.289 2023/05/19 15:42:43 mlelstv Exp $"); #if defined(_KERNEL_OPT) #include "opt_vnd.h" #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/proc.h> #include <sys/kthread.h> #include <sys/errno.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/malloc.h> #include <sys/ioctl.h> #include <sys/disklabel.h> #include <sys/device.h> #include <sys/disk.h> #include <sys/stat.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/fstrans.h> #include <sys/file.h> #include <sys/uio.h> #include <sys/conf.h> #include <sys/kauth.h> #include <sys/module.h> #include <sys/compat_stub.h> #include <sys/atomic.h> #include <uvm/uvm.h> #include <net/zlib.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <dev/dkvar.h> #include <dev/vndvar.h> #include "ioconf.h" #if defined(VNDDEBUG) && !defined(DEBUG) #define DEBUG #endif #ifdef DEBUG int dovndcluster = 1; #define VDB_FOLLOW 0x01 #define VDB_INIT 0x02 #define VDB_IO 0x04 #define VDB_LABEL 0x08 int vnddebug = 0; #endif #define vndunit(x) DISKUNIT(x) struct vndxfer { struct buf vx_buf; struct vnd_softc *vx_vnd; }; #define VND_BUFTOXFER(bp) ((struct vndxfer *)(void *)bp) #define VND_GETXFER(vnd) pool_get(&(vnd)->sc_vxpool, PR_WAITOK) #define VND_PUTXFER(vnd, vx) pool_put(&(vnd)->sc_vxpool, (vx)) #define VNDLABELDEV(dev) \ (MAKEDISKDEV(major((dev)), vndunit((dev)), RAW_PART)) #define VND_MAXPENDING(vnd) ((vnd)->sc_maxactive * 4) #define VND_MAXPAGES(vnd) (1024 * 1024 / PAGE_SIZE) static void vndclear(struct vnd_softc *, int); static int vnddoclear(struct vnd_softc *, int, int, bool); static int vndsetcred(struct vnd_softc *, kauth_cred_t); static void vndthrottle(struct vnd_softc *, struct vnode *); static void vndiodone(struct buf *); #if 0 static void vndshutdown(void); #endif static void vndgetdefaultlabel(struct vnd_softc *, struct disklabel *); static void vndgetdisklabel(dev_t, struct vnd_softc *); static int vndlock(struct vnd_softc *); static void vndunlock(struct vnd_softc *); #ifdef VND_COMPRESSION static void compstrategy(struct buf *, off_t); static void *vnd_alloc(void *, u_int, u_int); static void vnd_free(void *, void *); #endif /* VND_COMPRESSION */ static void vndthread(void *); static bool vnode_has_op(const struct vnode *, int); static void handle_with_rdwr(struct vnd_softc *, const struct buf *, struct buf *); static void handle_with_strategy(struct vnd_softc *, const struct buf *, struct buf *); static void vnd_set_geometry(struct vnd_softc *); static dev_type_open(vndopen); static dev_type_close(vndclose); static dev_type_read(vndread); static dev_type_write(vndwrite); static dev_type_ioctl(vndioctl); static dev_type_strategy(vndstrategy); static dev_type_dump(vnddump); static dev_type_size(vndsize); const struct bdevsw vnd_bdevsw = { .d_open = vndopen, .d_close = vndclose, .d_strategy = vndstrategy, .d_ioctl = vndioctl, .d_dump = vnddump, .d_psize = vndsize, .d_discard = nodiscard, .d_flag = D_DISK }; const struct cdevsw vnd_cdevsw = { .d_open = vndopen, .d_close = vndclose, .d_read = vndread, .d_write = vndwrite, .d_ioctl = vndioctl, .d_stop = nostop, .d_tty = notty, 
.d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_DISK }; static int vnd_match(device_t, cfdata_t, void *); static void vnd_attach(device_t, device_t, void *); static int vnd_detach(device_t, int); CFATTACH_DECL3_NEW(vnd, sizeof(struct vnd_softc), vnd_match, vnd_attach, vnd_detach, NULL, NULL, NULL, DVF_DETACH_SHUTDOWN); static struct vnd_softc *vnd_spawn(int); static int vnd_destroy(device_t); static const struct dkdriver vnddkdriver = { .d_strategy = vndstrategy, .d_minphys = minphys }; void vndattach(int num) { int error; error = config_cfattach_attach(vnd_cd.cd_name, &vnd_ca); if (error) aprint_error("%s: unable to register cfattach, error = %d\n", vnd_cd.cd_name, error); } static int vnd_match(device_t self, cfdata_t cfdata, void *aux) { return 1; } static void vnd_attach(device_t parent, device_t self, void *aux) { struct vnd_softc *sc = device_private(self); sc->sc_dev = self; sc->sc_comp_offsets = NULL; sc->sc_comp_buff = NULL; sc->sc_comp_decombuf = NULL; bufq_alloc(&sc->sc_tab, "disksort", BUFQ_SORT_RAWBLOCK); disk_init(&sc->sc_dkdev, device_xname(self), &vnddkdriver); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); } static int vnd_detach(device_t self, int flags) { int error; struct vnd_softc *sc = device_private(self); if (sc->sc_flags & VNF_INITED) { error = vnddoclear(sc, 0, -1, (flags & DETACH_FORCE) != 0); if (error != 0) return error; } pmf_device_deregister(self); bufq_free(sc->sc_tab); disk_destroy(&sc->sc_dkdev); return 0; } static struct vnd_softc * vnd_spawn(int unit) { cfdata_t cf; cf = malloc(sizeof(*cf), M_DEVBUF, M_WAITOK); cf->cf_name = vnd_cd.cd_name; cf->cf_atname = vnd_cd.cd_name; cf->cf_unit = unit; cf->cf_fstate = FSTATE_STAR; return device_private(config_attach_pseudo(cf)); } static int vnd_destroy(device_t dev) { int error; cfdata_t cf; cf = device_cfdata(dev); error = config_detach(dev, DETACH_QUIET); if (error) return error; free(cf, M_DEVBUF); return 0; } static int vndopen(dev_t dev, int flags, int mode, struct lwp *l) { int unit = vndunit(dev); struct vnd_softc *sc; int error = 0, part, pmask; struct disklabel *lp; #ifdef DEBUG if (vnddebug & VDB_FOLLOW) printf("vndopen(0x%"PRIx64", 0x%x, 0x%x, %p)\n", dev, flags, mode, l); #endif sc = device_lookup_private(&vnd_cd, unit); if (sc == NULL) { sc = vnd_spawn(unit); if (sc == NULL) return ENOMEM; /* compatibility, keep disklabel after close */ sc->sc_flags = VNF_KLABEL; } if ((error = vndlock(sc)) != 0) return error; mutex_enter(&sc->sc_dkdev.dk_openlock); if ((sc->sc_flags & VNF_CLEARING) != 0) { error = ENXIO; goto done; } lp = sc->sc_dkdev.dk_label; part = DISKPART(dev); pmask = (1 << part); if (sc->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) { error = EBUSY; goto done; } if (sc->sc_flags & VNF_INITED) { if ((sc->sc_dkdev.dk_openmask & ~(1<<RAW_PART)) != 0) { /* * If any non-raw partition is open, but the disk * has been invalidated, disallow further opens. */ if ((sc->sc_flags & VNF_VLABEL) == 0) { error = EIO; goto done; } } else { /* * Load the partition info if not already loaded. */ if ((sc->sc_flags & VNF_VLABEL) == 0) { sc->sc_flags |= VNF_VLABEL; vndgetdisklabel(dev, sc); } } } /* Check that the partitions exists. */ if (part != RAW_PART) { if (((sc->sc_flags & VNF_INITED) == 0) || ((part >= lp->d_npartitions) || (lp->d_partitions[part].p_fstype == FS_UNUSED))) { error = ENXIO; goto done; } } /* Prevent our unit from being unconfigured while open. 
*/ switch (mode) { case S_IFCHR: sc->sc_dkdev.dk_copenmask |= pmask; break; case S_IFBLK: sc->sc_dkdev.dk_bopenmask |= pmask; break; } sc->sc_dkdev.dk_openmask = sc->sc_dkdev.dk_copenmask | sc->sc_dkdev.dk_bopenmask; done: mutex_exit(&sc->sc_dkdev.dk_openlock); vndunlock(sc); return error; } static int vndclose(dev_t dev, int flags, int mode, struct lwp *l) { int unit = vndunit(dev); struct vnd_softc *sc; int error = 0, part; #ifdef DEBUG if (vnddebug & VDB_FOLLOW) printf("vndclose(0x%"PRIx64", 0x%x, 0x%x, %p)\n", dev, flags, mode, l); #endif sc = device_lookup_private(&vnd_cd, unit); if (sc == NULL) return ENXIO; if ((error = vndlock(sc)) != 0) return error; mutex_enter(&sc->sc_dkdev.dk_openlock); part = DISKPART(dev); /* ...that much closer to allowing unconfiguration... */ switch (mode) { case S_IFCHR: sc->sc_dkdev.dk_copenmask &= ~(1 << part); break; case S_IFBLK: sc->sc_dkdev.dk_bopenmask &= ~(1 << part); break; } sc->sc_dkdev.dk_openmask = sc->sc_dkdev.dk_copenmask | sc->sc_dkdev.dk_bopenmask; /* are we last opener ? */ if (sc->sc_dkdev.dk_openmask == 0) { if ((sc->sc_flags & VNF_KLABEL) == 0) sc->sc_flags &= ~VNF_VLABEL; } mutex_exit(&sc->sc_dkdev.dk_openlock); vndunlock(sc); if ((sc->sc_flags & VNF_INITED) == 0) { if ((error = vnd_destroy(sc->sc_dev)) != 0) { aprint_error_dev(sc->sc_dev, "unable to detach instance\n"); return error; } } return 0; } /* * Queue the request, and wakeup the kernel thread to handle it. */ static void vndstrategy(struct buf *bp) { int unit = vndunit(bp->b_dev); struct vnd_softc *vnd = device_lookup_private(&vnd_cd, unit); struct disklabel *lp; daddr_t blkno; int s = splbio(); if (vnd == NULL) { bp->b_error = ENXIO; goto done; } lp = vnd->sc_dkdev.dk_label; if ((vnd->sc_flags & VNF_INITED) == 0) { bp->b_error = ENXIO; goto done; } /* * The transfer must be a whole number of blocks. */ if ((bp->b_bcount % lp->d_secsize) != 0) { bp->b_error = EINVAL; goto done; } /* * check if we're read-only. */ if ((vnd->sc_flags & VNF_READONLY) && !(bp->b_flags & B_READ)) { bp->b_error = EACCES; goto done; } /* If it's a nil transfer, wake up the top half now. */ if (bp->b_bcount == 0) { goto done; } /* * Do bounds checking and adjust transfer. If there's an error, * the bounds check will flag that for us. */ if (DISKPART(bp->b_dev) == RAW_PART) { if (bounds_check_with_mediasize(bp, DEV_BSIZE, vnd->sc_size) <= 0) goto done; } else { if (bounds_check_with_label(&vnd->sc_dkdev, bp, vnd->sc_flags & (VNF_WLABEL|VNF_LABELLING)) <= 0) goto done; } /* * Put the block number in terms of the logical blocksize * of the "device". */ blkno = bp->b_blkno / (lp->d_secsize / DEV_BSIZE); /* * Translate the partition-relative block number to an absolute. */ if (DISKPART(bp->b_dev) != RAW_PART) { struct partition *pp; pp = &vnd->sc_dkdev.dk_label->d_partitions[ DISKPART(bp->b_dev)]; blkno += pp->p_offset; } bp->b_rawblkno = blkno; #ifdef DEBUG if (vnddebug & VDB_FOLLOW) printf("vndstrategy(%p): unit %d\n", bp, unit); #endif if ((vnd->sc_flags & VNF_USE_VN_RDWR)) { /* * Limit the number of pending requests to not exhaust * resources needed for I/O but always allow the worker * thread to add requests, as a wedge on vnd queues * requests with biodone() -> dkstart() -> vndstrategy(). 
*/ if (curlwp != vnd->sc_kthread && curlwp != uvm.pagedaemon_lwp) { while (vnd->sc_pending >= VND_MAXPENDING(vnd)) tsleep(&vnd->sc_pending, PRIBIO, "vndpc", 0); } vnd->sc_pending++; KASSERT(vnd->sc_pending > 0); } bufq_put(vnd->sc_tab, bp); wakeup(&vnd->sc_tab); splx(s); return; done: bp->b_resid = bp->b_bcount; biodone(bp); splx(s); } static bool vnode_has_strategy(struct vnd_softc *vnd) { return vnode_has_op(vnd->sc_vp, VOFFSET(vop_bmap)) && vnode_has_op(vnd->sc_vp, VOFFSET(vop_strategy)); } /* Verify that I/O requests cannot be smaller than the * smallest I/O size supported by the backend. */ static bool vnode_has_large_blocks(struct vnd_softc *vnd) { u_int32_t vnd_secsize, iosize; iosize = vnd->sc_iosize; vnd_secsize = vnd->sc_geom.vng_secsize; return vnd_secsize % iosize != 0; } /* XXX this function needs a reliable check to detect * sparse files. Otherwise, bmap/strategy may be used * and fail on non-allocated blocks. VOP_READ/VOP_WRITE * works on sparse files. */ #if notyet static bool vnode_strategy_probe(struct vnd_softc *vnd) { int error; daddr_t nbn; if (!vnode_has_strategy(vnd)) return false; if (vnode_has_large_blocks(vnd)) return false; /* Convert the first logical block number to its * physical block number. */ error = 0; vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_BMAP(vnd->sc_vp, 0, NULL, &nbn, NULL); VOP_UNLOCK(vnd->sc_vp); /* Test if that worked. */ if (error == 0 && (long)nbn == -1) return false; return true; } #endif static void vndthread(void *arg) { struct vnd_softc *vnd = arg; int s; /* Determine whether we can *use* VOP_BMAP and VOP_STRATEGY to * directly access the backing vnode. If we can, use these two * operations to avoid messing with the local buffer cache. * Otherwise fall back to regular VOP_READ/VOP_WRITE operations * which are guaranteed to work with any file system. */ if ((vnd->sc_flags & VNF_USE_VN_RDWR) == 0 && ! vnode_has_strategy(vnd)) vnd->sc_flags |= VNF_USE_VN_RDWR; /* VOP_STRATEGY can only be used if the backing vnode allows * to access blocks as small as defined by the vnd geometry. */ if ((vnd->sc_flags & VNF_USE_VN_RDWR) == 0 && vnode_has_large_blocks(vnd)) vnd->sc_flags |= VNF_USE_VN_RDWR; #ifdef DEBUG if (vnddebug & VDB_INIT) printf("vndthread: vp %p, %s\n", vnd->sc_vp, (vnd->sc_flags & VNF_USE_VN_RDWR) == 0 ? "using bmap/strategy operations" : "using read/write operations"); #endif s = splbio(); vnd->sc_flags |= VNF_KTHREAD; wakeup(&vnd->sc_kthread); /* * Dequeue requests and serve them depending on the available * vnode operations. */ while ((vnd->sc_flags & VNF_VUNCONF) == 0) { struct vndxfer *vnx; struct buf *obp; struct buf *bp; obp = bufq_get(vnd->sc_tab); if (obp == NULL) { tsleep(&vnd->sc_tab, PRIBIO, "vndbp", 0); continue; }; if ((vnd->sc_flags & VNF_USE_VN_RDWR)) { KASSERT(vnd->sc_pending > 0); if (vnd->sc_pending-- == VND_MAXPENDING(vnd)) wakeup(&vnd->sc_pending); } splx(s); #ifdef DEBUG if (vnddebug & VDB_FOLLOW) printf("vndthread(%p)\n", obp); #endif if (vnd->sc_vp->v_mount == NULL) { obp->b_error = ENXIO; goto done; } #ifdef VND_COMPRESSION /* handle a compressed read */ if ((obp->b_flags & B_READ) != 0 && (vnd->sc_flags & VNF_COMP)) { off_t bn; /* Convert to a byte offset within the file. 
*/ bn = obp->b_rawblkno * vnd->sc_dkdev.dk_label->d_secsize; compstrategy(obp, bn); goto done; } #endif /* VND_COMPRESSION */ /* * Allocate a header for this transfer and link it to the * buffer */ s = splbio(); vnx = VND_GETXFER(vnd); splx(s); vnx->vx_vnd = vnd; s = splbio(); while (vnd->sc_active >= vnd->sc_maxactive) { tsleep(&vnd->sc_tab, PRIBIO, "vndac", 0); } vnd->sc_active++; splx(s); /* Instrumentation. */ disk_busy(&vnd->sc_dkdev); bp = &vnx->vx_buf; buf_init(bp); bp->b_flags = (obp->b_flags & (B_READ | B_PHYS | B_RAW)); bp->b_oflags = obp->b_oflags; bp->b_cflags = obp->b_cflags; bp->b_iodone = vndiodone; bp->b_private = obp; bp->b_vp = vnd->sc_vp; bp->b_objlock = bp->b_vp->v_interlock; bp->b_data = obp->b_data; bp->b_bcount = obp->b_bcount; BIO_COPYPRIO(bp, obp); /* Make sure the request succeeds while suspending this fs. */ fstrans_start_lazy(vnd->sc_vp->v_mount); /* Handle the request using the appropriate operations. */ if ((vnd->sc_flags & VNF_USE_VN_RDWR) == 0) handle_with_strategy(vnd, obp, bp); else handle_with_rdwr(vnd, obp, bp); fstrans_done(vnd->sc_vp->v_mount); s = splbio(); continue; done: biodone(obp); s = splbio(); } vnd->sc_flags &= (~VNF_KTHREAD | VNF_VUNCONF); wakeup(&vnd->sc_kthread); splx(s); kthread_exit(0); } /* * Checks if the given vnode supports the requested operation. * The operation is specified the offset returned by VOFFSET. * * XXX The test below used to determine this is quite fragile * because it relies on the file system to use genfs to specify * unimplemented operations. There might be another way to do * it more cleanly. */ static bool vnode_has_op(const struct vnode *vp, int opoffset) { int (*defaultp)(void *); int (*opp)(void *); defaultp = vp->v_op[VOFFSET(vop_default)]; opp = vp->v_op[opoffset]; return opp != defaultp && opp != genfs_eopnotsupp && opp != genfs_badop && opp != genfs_nullop; } /* * Handles the read/write request given in 'bp' using the vnode's VOP_READ * and VOP_WRITE operations. * * 'obp' is a pointer to the original request fed to the vnd device. */ static void handle_with_rdwr(struct vnd_softc *vnd, const struct buf *obp, struct buf *bp) { bool doread; off_t offset; size_t len, resid; struct vnode *vp; int npages; doread = bp->b_flags & B_READ; offset = obp->b_rawblkno * vnd->sc_dkdev.dk_label->d_secsize; len = bp->b_bcount; vp = vnd->sc_vp; #if defined(DEBUG) if (vnddebug & VDB_IO) printf("vnd (rdwr): vp %p, %s, rawblkno 0x%" PRIx64 ", secsize %d, offset %" PRIu64 ", bcount %d\n", vp, doread ? "read" : "write", obp->b_rawblkno, vnd->sc_dkdev.dk_label->d_secsize, offset, bp->b_bcount); #endif /* Issue the read or write operation. */ bp->b_error = vn_rdwr(doread ? UIO_READ : UIO_WRITE, vp, bp->b_data, len, offset, UIO_SYSSPACE, IO_ADV_ENCODE(POSIX_FADV_NOREUSE) | IO_DIRECT, vnd->sc_cred, &resid, NULL); bp->b_resid = resid; /* * Avoid caching too many pages, the vnd user * is usually a filesystem and caches itself. * We need some amount of caching to not hinder * read-ahead and write-behind operations. */ npages = atomic_load_relaxed(&vp->v_uobj.uo_npages); if (npages > VND_MAXPAGES(vnd)) { rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); (void) VOP_PUTPAGES(vp, 0, 0, PGO_ALLPAGES | PGO_CLEANIT | PGO_FREE); } /* We need to increase the number of outputs on the vnode if * there was any write to it. */ if (!doread) { mutex_enter(vp->v_interlock); vp->v_numoutput++; mutex_exit(vp->v_interlock); } biodone(bp); } /* * Handes the read/write request given in 'bp' using the vnode's VOP_BMAP * and VOP_STRATEGY operations. 
* * 'obp' is a pointer to the original request fed to the vnd device. */ static void handle_with_strategy(struct vnd_softc *vnd, const struct buf *obp, struct buf *bp) { int bsize, error, flags, skipped; size_t resid, sz; off_t bn, offset; struct vnode *vp; struct buf *nbp = NULL; flags = obp->b_flags; /* convert to a byte offset within the file. */ bn = obp->b_rawblkno * vnd->sc_dkdev.dk_label->d_secsize; bsize = vnd->sc_vp->v_mount->mnt_stat.f_iosize; skipped = 0; /* * Break the request into bsize pieces and feed them * sequentially using VOP_BMAP/VOP_STRATEGY. * We do it this way to keep from flooding NFS servers if we * are connected to an NFS file. This places the burden on * the client rather than the server. */ error = 0; bp->b_resid = bp->b_bcount; for (offset = 0, resid = bp->b_resid; /* true */; resid -= sz, offset += sz) { daddr_t nbn; int off, nra; nra = 0; vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_BMAP(vnd->sc_vp, bn / bsize, &vp, &nbn, &nra); VOP_UNLOCK(vnd->sc_vp); if (error == 0 && (long)nbn == -1) error = EIO; /* * If there was an error or a hole in the file...punt. * Note that we may have to wait for any operations * that we have already fired off before releasing * the buffer. * * XXX we could deal with holes here but it would be * a hassle (in the write case). */ if (error) { skipped += resid; break; } #ifdef DEBUG if (!dovndcluster) nra = 0; #endif off = bn % bsize; sz = MIN(((off_t)1 + nra) * bsize - off, resid); #ifdef DEBUG if (vnddebug & VDB_IO) printf("vndstrategy: vp %p/%p bn 0x%qx/0x%" PRIx64 " sz 0x%zx\n", vnd->sc_vp, vp, (long long)bn, nbn, sz); #endif nbp = getiobuf(vp, true); nestiobuf_setup(bp, nbp, offset, sz); nbp->b_blkno = nbn + btodb(off); #if 0 /* XXX #ifdef DEBUG */ if (vnddebug & VDB_IO) printf("vndstart(%ld): bp %p vp %p blkno " "0x%" PRIx64 " flags %x addr %p cnt 0x%x\n", (long) (vnd-vnd_softc), &nbp->vb_buf, nbp->vb_buf.b_vp, nbp->vb_buf.b_blkno, nbp->vb_buf.b_flags, nbp->vb_buf.b_data, nbp->vb_buf.b_bcount); #endif if (resid == sz) { break; } VOP_STRATEGY(vp, nbp); bn += sz; } if (!(flags & B_READ)) { struct vnode *w_vp; /* * this is the last nested buf, account for * the parent buf write too. * This has to be done last, so that * fsync won't wait for this write which * has no chance to complete before all nested bufs * have been queued. But it has to be done * before the last VOP_STRATEGY() * or the call to nestiobuf_done(). 
*/ w_vp = bp->b_vp; mutex_enter(w_vp->v_interlock); w_vp->v_numoutput++; mutex_exit(w_vp->v_interlock); } KASSERT(skipped != 0 || nbp != NULL); if (skipped) nestiobuf_done(bp, skipped, error); else VOP_STRATEGY(vp, nbp); } static void vndiodone(struct buf *bp) { struct vndxfer *vnx = VND_BUFTOXFER(bp); struct vnd_softc *vnd = vnx->vx_vnd; struct buf *obp = bp->b_private; int s = splbio(); KERNEL_LOCK(1, NULL); /* XXXSMP */ KASSERT(&vnx->vx_buf == bp); KASSERT(vnd->sc_active > 0); #ifdef DEBUG if (vnddebug & VDB_IO) { printf("vndiodone1: bp %p iodone: error %d\n", bp, bp->b_error); } #endif disk_unbusy(&vnd->sc_dkdev, bp->b_bcount - bp->b_resid, (bp->b_flags & B_READ)); vnd->sc_active--; if (vnd->sc_active == 0) { wakeup(&vnd->sc_tab); } KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */ splx(s); obp->b_error = bp->b_error; obp->b_resid = bp->b_resid; buf_destroy(bp); VND_PUTXFER(vnd, vnx); biodone(obp); } /* ARGSUSED */ static int vndread(dev_t dev, struct uio *uio, int flags) { int unit = vndunit(dev); struct vnd_softc *sc; #ifdef DEBUG if (vnddebug & VDB_FOLLOW) printf("vndread(0x%"PRIx64", %p)\n", dev, uio); #endif sc = device_lookup_private(&vnd_cd, unit); if (sc == NULL) return ENXIO; if ((sc->sc_flags & VNF_INITED) == 0) return ENXIO; return physio(vndstrategy, NULL, dev, B_READ, minphys, uio); } /* ARGSUSED */ static int vndwrite(dev_t dev, struct uio *uio, int flags) { int unit = vndunit(dev); struct vnd_softc *sc; #ifdef DEBUG if (vnddebug & VDB_FOLLOW) printf("vndwrite(0x%"PRIx64", %p)\n", dev, uio); #endif sc = device_lookup_private(&vnd_cd, unit); if (sc == NULL) return ENXIO; if ((sc->sc_flags & VNF_INITED) == 0) return ENXIO; return physio(vndstrategy, NULL, dev, B_WRITE, minphys, uio); } static int vnd_cget(struct lwp *l, int unit, int *un, struct vattr *va) { int error; struct vnd_softc *vnd; if (*un == -1) *un = unit; if (*un < 0) return EINVAL; vnd = device_lookup_private(&vnd_cd, *un); if (vnd == NULL) return -1; if ((vnd->sc_flags & VNF_INITED) == 0) return -1; vn_lock(vnd->sc_vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vnd->sc_vp, va, l->l_cred); VOP_UNLOCK(vnd->sc_vp); return error; } static int vnddoclear(struct vnd_softc *vnd, int pmask, int minor, bool force) { int error; if ((error = vndlock(vnd)) != 0) return error; /* * Don't unconfigure if any other partitions are open * or if both the character and block flavors of this * partition are open. */ if (DK_BUSY(vnd, pmask) && !force) { vndunlock(vnd); return EBUSY; } /* Delete all of our wedges */ dkwedge_delall(&vnd->sc_dkdev); /* * XXX vndclear() might call vndclose() implicitly; * release lock to avoid recursion * * Set VNF_CLEARING to prevent vndopen() from * sneaking in after we vndunlock(). */ vnd->sc_flags |= VNF_CLEARING; vndunlock(vnd); vndclear(vnd, minor); #ifdef DEBUG if (vnddebug & VDB_INIT) printf("%s: CLRed\n", __func__); #endif /* Destroy the xfer and buffer pools. */ pool_destroy(&vnd->sc_vxpool); /* Detach the disk. 
*/ disk_detach(&vnd->sc_dkdev); return 0; } static int vndioctl_get(struct lwp *l, void *data, int unit, struct vattr *va) { int error; KASSERT(l); /* the first member is always int vnd_unit in all the versions */ if (*(int *)data >= vnd_cd.cd_ndevs) return ENXIO; switch (error = vnd_cget(l, unit, (int *)data, va)) { case -1: /* unused is not an error */ memset(va, 0, sizeof(*va)); /*FALLTHROUGH*/ case 0: return 0; default: return error; } } /* ARGSUSED */ static int vndioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { bool force; int unit = vndunit(dev); struct vnd_softc *vnd; struct vnd_ioctl *vio; struct vattr vattr; struct pathbuf *pb; struct vnode *vp; int error, part, pmask; uint64_t geomsize; int fflags; #ifdef __HAVE_OLD_DISKLABEL struct disklabel newlabel; #endif #ifdef DEBUG if (vnddebug & VDB_FOLLOW) printf("vndioctl(0x%"PRIx64", 0x%lx, %p, 0x%x, %p): unit %d\n", dev, cmd, data, flag, l->l_proc, unit); #endif /* Do the get's first; they don't need initialization or verification */ switch (cmd) { case VNDIOCGET: if ((error = vndioctl_get(l, data, unit, &vattr)) != 0) return error; struct vnd_user *vnu = data; vnu->vnu_dev = vattr.va_fsid; vnu->vnu_ino = vattr.va_fileid; return 0; default: /* First check for COMPAT_50 hook */ MODULE_HOOK_CALL(compat_vndioctl_50_hook, (cmd, l, data, unit, &vattr, vndioctl_get), enosys(), error); /* * If not present, then COMPAT_30 hook also not * present, so just continue with checks for the * "write" commands */ if (error == ENOSYS) { error = 0; break; } /* If not already handled, try the COMPAT_30 hook */ if (error == EPASSTHROUGH) MODULE_HOOK_CALL(compat_vndioctl_30_hook, (cmd, l, data, unit, &vattr, vndioctl_get), enosys(), error); /* If no COMPAT_30 module, or not handled, check writes */ if (error == ENOSYS || error == EPASSTHROUGH) { error = 0; break; } return error; } vnd = device_lookup_private(&vnd_cd, unit); if (vnd == NULL) return ENXIO; vio = (struct vnd_ioctl *)data; /* Must be open for writes for these commands... 
*/ switch (cmd) { case VNDIOCSET50: case VNDIOCCLR50: if (!compat_vndioctl_50_hook.hooked) return EINVAL; /* FALLTHROUGH */ case VNDIOCSET: case VNDIOCCLR: case DIOCSDINFO: case DIOCWDINFO: #ifdef __HAVE_OLD_DISKLABEL case ODIOCSDINFO: case ODIOCWDINFO: #endif case DIOCKLABEL: case DIOCWLABEL: case DIOCCACHESYNC: if ((flag & FWRITE) == 0) return EBADF; } switch (cmd) { case VNDIOCSET50: case VNDIOCSET: /* Must not be initialized */ if (vnd->sc_flags & VNF_INITED) return EBUSY; break; default: /* Must be initialized */ if ((vnd->sc_flags & VNF_INITED) == 0) return ENXIO; break; } error = disk_ioctl(&vnd->sc_dkdev, dev, cmd, data, flag, l); if (error != EPASSTHROUGH) return error; switch (cmd) { case VNDIOCSET50: case VNDIOCSET: if ((error = vndlock(vnd)) != 0) return error; fflags = FREAD; if ((vio->vnd_flags & VNDIOF_READONLY) == 0) fflags |= FWRITE; if ((vio->vnd_flags & VNDIOF_FILEIO) != 0) vnd->sc_flags |= VNF_USE_VN_RDWR; error = pathbuf_copyin(vio->vnd_file, &pb); if (error) { goto unlock_and_exit; } error = vn_open(NULL, pb, 0, fflags, 0, &vp, NULL, NULL); if (error != 0) { pathbuf_destroy(pb); goto unlock_and_exit; } KASSERT(l); error = VOP_GETATTR(vp, &vattr, l->l_cred); if (!error && vp->v_type != VREG) error = EOPNOTSUPP; if (!error && vattr.va_bytes < vattr.va_size) /* File is definitely sparse, use vn_rdwr() */ vnd->sc_flags |= VNF_USE_VN_RDWR; if (error) { VOP_UNLOCK(vp); goto close_and_exit; } /* If using a compressed file, initialize its info */ /* (or abort with an error if kernel has no compression) */ if (vio->vnd_flags & VNDIOF_COMP) { #ifdef VND_COMPRESSION struct vnd_comp_header *ch; int i; uint32_t comp_size; uint32_t comp_maxsize; /* allocate space for compressed file header */ ch = malloc(sizeof(struct vnd_comp_header), M_TEMP, M_WAITOK); /* read compressed file header */ error = vn_rdwr(UIO_READ, vp, (void *)ch, sizeof(struct vnd_comp_header), 0, UIO_SYSSPACE, IO_UNIT|IO_NODELOCKED, l->l_cred, NULL, NULL); if (error) { free(ch, M_TEMP); VOP_UNLOCK(vp); goto close_and_exit; } if (be32toh(ch->block_size) == 0 || be32toh(ch->num_blocks) > UINT32_MAX - 1) { free(ch, M_TEMP); VOP_UNLOCK(vp); goto close_and_exit; } /* save some header info */ vnd->sc_comp_blksz = be32toh(ch->block_size); /* note last offset is the file byte size */ vnd->sc_comp_numoffs = be32toh(ch->num_blocks) + 1; free(ch, M_TEMP); if (!DK_DEV_BSIZE_OK(vnd->sc_comp_blksz)) { VOP_UNLOCK(vp); error = EINVAL; goto close_and_exit; } KASSERT(0 < vnd->sc_comp_blksz); KASSERT(0 < vnd->sc_comp_numoffs); /* * @#^@!$& gcc -Wtype-limits refuses to let me * write SIZE_MAX/sizeof(uint64_t) < numoffs, * because the range of the type on amd64 makes * the comparisons always false. 
*/ #if SIZE_MAX <= UINT32_MAX*(64/CHAR_BIT) if (SIZE_MAX/sizeof(uint64_t) < vnd->sc_comp_numoffs) { VOP_UNLOCK(vp); error = EINVAL; goto close_and_exit; } #endif if ((vattr.va_size < sizeof(struct vnd_comp_header)) || (vattr.va_size - sizeof(struct vnd_comp_header) < sizeof(uint64_t)*vnd->sc_comp_numoffs) || (UQUAD_MAX/vnd->sc_comp_blksz < vnd->sc_comp_numoffs - 1)) { VOP_UNLOCK(vp); error = EINVAL; goto close_and_exit; } /* set decompressed file size */ KASSERT(vnd->sc_comp_numoffs - 1 <= UQUAD_MAX/vnd->sc_comp_blksz); vattr.va_size = ((u_quad_t)vnd->sc_comp_numoffs - 1) * (u_quad_t)vnd->sc_comp_blksz; /* allocate space for all the compressed offsets */ __CTASSERT(UINT32_MAX <= UQUAD_MAX/sizeof(uint64_t)); vnd->sc_comp_offsets = malloc(sizeof(uint64_t) * vnd->sc_comp_numoffs, M_DEVBUF, M_WAITOK); /* read in the offsets */ error = vn_rdwr(UIO_READ, vp, (void *)vnd->sc_comp_offsets, sizeof(uint64_t) * vnd->sc_comp_numoffs, sizeof(struct vnd_comp_header), UIO_SYSSPACE, IO_UNIT|IO_NODELOCKED, l->l_cred, NULL, NULL); if (error) { VOP_UNLOCK(vp); goto close_and_exit; } /* * find largest block size (used for allocation limit). * Also convert offset to native byte order. */ comp_maxsize = 0; for (i = 0; i < vnd->sc_comp_numoffs - 1; i++) { vnd->sc_comp_offsets[i] = be64toh(vnd->sc_comp_offsets[i]); comp_size = be64toh(vnd->sc_comp_offsets[i + 1]) - vnd->sc_comp_offsets[i]; if (comp_size > comp_maxsize) comp_maxsize = comp_size; } vnd->sc_comp_offsets[vnd->sc_comp_numoffs - 1] = be64toh(vnd->sc_comp_offsets[vnd->sc_comp_numoffs - 1]); /* create compressed data buffer */ vnd->sc_comp_buff = malloc(comp_maxsize, M_DEVBUF, M_WAITOK); /* create decompressed buffer */ vnd->sc_comp_decombuf = malloc(vnd->sc_comp_blksz, M_DEVBUF, M_WAITOK); vnd->sc_comp_buffblk = -1; /* Initialize decompress stream */ memset(&vnd->sc_comp_stream, 0, sizeof(z_stream)); vnd->sc_comp_stream.zalloc = vnd_alloc; vnd->sc_comp_stream.zfree = vnd_free; error = inflateInit2(&vnd->sc_comp_stream, MAX_WBITS); if (error) { if (vnd->sc_comp_stream.msg) printf("vnd%d: compressed file, %s\n", unit, vnd->sc_comp_stream.msg); VOP_UNLOCK(vp); error = EINVAL; goto close_and_exit; } vnd->sc_flags |= VNF_COMP | VNF_READONLY; #else /* !VND_COMPRESSION */ VOP_UNLOCK(vp); error = EOPNOTSUPP; goto close_and_exit; #endif /* VND_COMPRESSION */ } VOP_UNLOCK(vp); vnd->sc_vp = vp; vnd->sc_size = btodb(vattr.va_size); /* note truncation */ /* get smallest I/O size for underlying device, fall back to * fundamental I/O size of underlying filesystem */ error = bdev_ioctl(vattr.va_fsid, DIOCGSECTORSIZE, &vnd->sc_iosize, FKIOCTL, l); if (error) vnd->sc_iosize = vnd->sc_vp->v_mount->mnt_stat.f_frsize; /* Default I/O size to DEV_BSIZE */ if (vnd->sc_iosize == 0) vnd->sc_iosize = DEV_BSIZE; /* * Use pseudo-geometry specified. If none was provided, * use "standard" Adaptec fictitious geometry. */ if (vio->vnd_flags & VNDIOF_HASGEOM) { memcpy(&vnd->sc_geom, &vio->vnd_geom, sizeof(vio->vnd_geom)); /* * Sanity-check the sector size. */ if (!DK_DEV_BSIZE_OK(vnd->sc_geom.vng_secsize) || vnd->sc_geom.vng_ntracks == 0 || vnd->sc_geom.vng_nsectors == 0) { error = EINVAL; goto close_and_exit; } /* * Compute missing cylinder count from size */ if (vnd->sc_geom.vng_ncylinders == 0) vnd->sc_geom.vng_ncylinders = vnd->sc_size / ( (vnd->sc_geom.vng_secsize / DEV_BSIZE) * vnd->sc_geom.vng_ntracks * vnd->sc_geom.vng_nsectors); /* * Compute the size (in DEV_BSIZE blocks) specified * by the geometry. 
*/ geomsize = (int64_t)vnd->sc_geom.vng_nsectors * vnd->sc_geom.vng_ntracks * vnd->sc_geom.vng_ncylinders * (vnd->sc_geom.vng_secsize / DEV_BSIZE); /* * Sanity-check the size against the specified * geometry. */ if (vnd->sc_size < geomsize) { error = EINVAL; goto close_and_exit; } } else if (vnd->sc_size >= (32 * 64)) { /* * Size must be at least 2048 DEV_BSIZE blocks * (1M) in order to use this geometry. */ vnd->sc_geom.vng_secsize = DEV_BSIZE; vnd->sc_geom.vng_nsectors = 32; vnd->sc_geom.vng_ntracks = 64; vnd->sc_geom.vng_ncylinders = vnd->sc_size / (64 * 32); } else { vnd->sc_geom.vng_secsize = DEV_BSIZE; vnd->sc_geom.vng_nsectors = 1; vnd->sc_geom.vng_ntracks = 1; vnd->sc_geom.vng_ncylinders = vnd->sc_size; } vnd_set_geometry(vnd); if (vio->vnd_flags & VNDIOF_READONLY) { vnd->sc_flags |= VNF_READONLY; } if ((error = vndsetcred(vnd, l->l_cred)) != 0) goto close_and_exit; vndthrottle(vnd, vnd->sc_vp); vio->vnd_osize = dbtob(vnd->sc_size); if (cmd != VNDIOCSET50) vio->vnd_size = dbtob(vnd->sc_size); vnd->sc_flags |= VNF_INITED; /* create the kernel thread, wait for it to be up */ error = kthread_create(PRI_NONE, 0, NULL, vndthread, vnd, &vnd->sc_kthread, "%s", device_xname(vnd->sc_dev)); if (error) goto close_and_exit; while ((vnd->sc_flags & VNF_KTHREAD) == 0) { tsleep(&vnd->sc_kthread, PRIBIO, "vndthr", 0); } #ifdef DEBUG if (vnddebug & VDB_INIT) printf("vndioctl: SET vp %p size 0x%lx %d/%d/%d/%d\n", vnd->sc_vp, (unsigned long) vnd->sc_size, vnd->sc_geom.vng_secsize, vnd->sc_geom.vng_nsectors, vnd->sc_geom.vng_ntracks, vnd->sc_geom.vng_ncylinders); #endif /* Attach the disk. */ disk_attach(&vnd->sc_dkdev); /* Initialize the xfer and buffer pools. */ pool_init(&vnd->sc_vxpool, sizeof(struct vndxfer), 0, 0, 0, "vndxpl", NULL, IPL_BIO); vndunlock(vnd); pathbuf_destroy(pb); /* Discover wedges on this disk */ dkwedge_discover(&vnd->sc_dkdev); break; close_and_exit: (void) vn_close(vp, fflags, l->l_cred); pathbuf_destroy(pb); unlock_and_exit: #ifdef VND_COMPRESSION /* free any allocated memory (for compressed file) */ if (vnd->sc_comp_offsets) { free(vnd->sc_comp_offsets, M_DEVBUF); vnd->sc_comp_offsets = NULL; } if (vnd->sc_comp_buff) { free(vnd->sc_comp_buff, M_DEVBUF); vnd->sc_comp_buff = NULL; } if (vnd->sc_comp_decombuf) { free(vnd->sc_comp_decombuf, M_DEVBUF); vnd->sc_comp_decombuf = NULL; } #endif /* VND_COMPRESSION */ vndunlock(vnd); return error; case VNDIOCCLR50: case VNDIOCCLR: part = DISKPART(dev); pmask = (1 << part); force = (vio->vnd_flags & VNDIOF_FORCE) != 0; if ((error = vnddoclear(vnd, pmask, minor(dev), force)) != 0) return error; break; case DIOCWDINFO: case DIOCSDINFO: #ifdef __HAVE_OLD_DISKLABEL case ODIOCWDINFO: case ODIOCSDINFO: #endif { struct disklabel *lp; if ((error = vndlock(vnd)) != 0) return error; vnd->sc_flags |= VNF_LABELLING; #ifdef __HAVE_OLD_DISKLABEL if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) { memset(&newlabel, 0, sizeof newlabel); memcpy(&newlabel, data, sizeof (struct olddisklabel)); lp = &newlabel; } else #endif lp = (struct disklabel *)data; error = setdisklabel(vnd->sc_dkdev.dk_label, lp, 0, vnd->sc_dkdev.dk_cpulabel); if (error == 0) { if (cmd == DIOCWDINFO #ifdef __HAVE_OLD_DISKLABEL || cmd == ODIOCWDINFO #endif ) error = writedisklabel(VNDLABELDEV(dev), vndstrategy, vnd->sc_dkdev.dk_label, vnd->sc_dkdev.dk_cpulabel); } vnd->sc_flags &= ~VNF_LABELLING; vndunlock(vnd); if (error) return error; break; } case DIOCKLABEL: if (*(int *)data != 0) vnd->sc_flags |= VNF_KLABEL; else vnd->sc_flags &= ~VNF_KLABEL; break; case DIOCWLABEL: if (*(int *)data 
!= 0) vnd->sc_flags |= VNF_WLABEL; else vnd->sc_flags &= ~VNF_WLABEL; break; case DIOCGDEFLABEL: vndgetdefaultlabel(vnd, (struct disklabel *)data); break; #ifdef __HAVE_OLD_DISKLABEL case ODIOCGDEFLABEL: vndgetdefaultlabel(vnd, &newlabel); if (newlabel.d_npartitions > OLDMAXPARTITIONS) return ENOTTY; memcpy(data, &newlabel, sizeof (struct olddisklabel)); break; #endif case DIOCGSTRATEGY: { struct disk_strategy *dks = (void *)data; /* No lock needed, never changed */ strlcpy(dks->dks_name, bufq_getstrategyname(vnd->sc_tab), sizeof(dks->dks_name)); dks->dks_paramlen = 0; break; } case DIOCGCACHE: { int *bits = (int *)data; *bits |= DKCACHE_READ | DKCACHE_WRITE; break; } case DIOCCACHESYNC: vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vnd->sc_vp, vnd->sc_cred, FSYNC_WAIT | FSYNC_DATAONLY | FSYNC_CACHE, 0, 0); VOP_UNLOCK(vnd->sc_vp); return error; default: return ENOTTY; } return 0; } /* * Duplicate the current processes' credentials. Since we are called only * as the result of a SET ioctl and only root can do that, any future access * to this "disk" is essentially as root. Note that credentials may change * if some other uid can write directly to the mapped file (NFS). */ static int vndsetcred(struct vnd_softc *vnd, kauth_cred_t cred) { struct uio auio; struct iovec aiov; char *tmpbuf; int error; vnd->sc_cred = kauth_cred_dup(cred); tmpbuf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK); /* XXX: Horrible kludge to establish credentials for NFS */ aiov.iov_base = tmpbuf; aiov.iov_len = uimin(DEV_BSIZE, dbtob(vnd->sc_size)); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_resid = aiov.iov_len; UIO_SETUP_SYSSPACE(&auio); vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(vnd->sc_vp, &auio, 0, vnd->sc_cred); if (error == 0) { /* * Because vnd does all IO directly through the vnode * we need to flush (at least) the buffer from the above * VOP_READ from the buffer cache to prevent cache * incoherencies. Also, be careful to write dirty * buffers back to stable storage. 
*/ error = vinvalbuf(vnd->sc_vp, V_SAVE, vnd->sc_cred, curlwp, 0, 0); } VOP_UNLOCK(vnd->sc_vp); free(tmpbuf, M_TEMP); return error; } /* * Set maxactive based on FS type */ static void vndthrottle(struct vnd_softc *vnd, struct vnode *vp) { if (vp->v_tag == VT_NFS) vnd->sc_maxactive = 2; else vnd->sc_maxactive = 8; if (vnd->sc_maxactive < 1) vnd->sc_maxactive = 1; } #if 0 static void vndshutdown(void) { struct vnd_softc *vnd; for (vnd = &vnd_softc[0]; vnd < &vnd_softc[numvnd]; vnd++) if (vnd->sc_flags & VNF_INITED) vndclear(vnd); } #endif static void vndclear(struct vnd_softc *vnd, int myminor) { struct vnode *vp = vnd->sc_vp; int fflags = FREAD; int bmaj, cmaj, i, mn; int s; #ifdef DEBUG if (vnddebug & VDB_FOLLOW) printf("vndclear(%p): vp %p\n", vnd, vp); #endif /* locate the major number */ bmaj = bdevsw_lookup_major(&vnd_bdevsw); cmaj = cdevsw_lookup_major(&vnd_cdevsw); /* Nuke the vnodes for any open instances */ for (i = 0; i < MAXPARTITIONS; i++) { mn = DISKMINOR(device_unit(vnd->sc_dev), i); if (mn != myminor) { /* XXX avoid to kill own vnode */ vdevgone(bmaj, mn, mn, VBLK); vdevgone(cmaj, mn, mn, VCHR); } } if ((vnd->sc_flags & VNF_READONLY) == 0) fflags |= FWRITE; s = splbio(); bufq_drain(vnd->sc_tab); splx(s); vnd->sc_flags |= VNF_VUNCONF; wakeup(&vnd->sc_tab); while (vnd->sc_flags & VNF_KTHREAD) tsleep(&vnd->sc_kthread, PRIBIO, "vnthr", 0); #ifdef VND_COMPRESSION /* free the compressed file buffers */ if (vnd->sc_flags & VNF_COMP) { if (vnd->sc_comp_offsets) { free(vnd->sc_comp_offsets, M_DEVBUF); vnd->sc_comp_offsets = NULL; } if (vnd->sc_comp_buff) { free(vnd->sc_comp_buff, M_DEVBUF); vnd->sc_comp_buff = NULL; } if (vnd->sc_comp_decombuf) { free(vnd->sc_comp_decombuf, M_DEVBUF); vnd->sc_comp_decombuf = NULL; } } #endif /* VND_COMPRESSION */ vnd->sc_flags &= ~(VNF_INITED | VNF_READONLY | VNF_KLABEL | VNF_VLABEL | VNF_VUNCONF | VNF_COMP | VNF_CLEARING); if (vp == NULL) panic("vndclear: null vp"); (void) vn_close(vp, fflags, vnd->sc_cred); kauth_cred_free(vnd->sc_cred); vnd->sc_vp = NULL; vnd->sc_cred = NULL; vnd->sc_size = 0; } static int vndsize(dev_t dev) { struct vnd_softc *sc; struct disklabel *lp; int part, unit, omask; int size; unit = vndunit(dev); sc = device_lookup_private(&vnd_cd, unit); if (sc == NULL) return -1; if ((sc->sc_flags & VNF_INITED) == 0) return -1; part = DISKPART(dev); omask = sc->sc_dkdev.dk_openmask & (1 << part); lp = sc->sc_dkdev.dk_label; if (omask == 0 && vndopen(dev, 0, S_IFBLK, curlwp)) /* XXX */ return -1; if (lp->d_partitions[part].p_fstype != FS_SWAP) size = -1; else size = lp->d_partitions[part].p_size * (lp->d_secsize / DEV_BSIZE); if (omask == 0 && vndclose(dev, 0, S_IFBLK, curlwp)) /* XXX */ return -1; return size; } static int vnddump(dev_t dev, daddr_t blkno, void *va, size_t size) { /* Not implemented. 
*/ return ENXIO; } static void vndgetdefaultlabel(struct vnd_softc *sc, struct disklabel *lp) { struct vndgeom *vng = &sc->sc_geom; struct partition *pp; unsigned spb; memset(lp, 0, sizeof(*lp)); spb = vng->vng_secsize / DEV_BSIZE; if (sc->sc_size / spb > UINT32_MAX) lp->d_secperunit = UINT32_MAX; else lp->d_secperunit = sc->sc_size / spb; lp->d_secsize = vng->vng_secsize; lp->d_nsectors = vng->vng_nsectors; lp->d_ntracks = vng->vng_ntracks; lp->d_ncylinders = vng->vng_ncylinders; lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors; strncpy(lp->d_typename, "vnd", sizeof(lp->d_typename)); lp->d_type = DKTYPE_VND; strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); lp->d_rpm = 3600; lp->d_interleave = 1; lp->d_flags = 0; pp = &lp->d_partitions[RAW_PART]; pp->p_offset = 0; pp->p_size = lp->d_secperunit; pp->p_fstype = FS_UNUSED; lp->d_npartitions = RAW_PART + 1; lp->d_magic = DISKMAGIC; lp->d_magic2 = DISKMAGIC; lp->d_checksum = dkcksum(lp); } /* * Read the disklabel from a vnd. If one is not present, create a fake one. */ static void vndgetdisklabel(dev_t dev, struct vnd_softc *sc) { const char *errstring; struct disklabel *lp = sc->sc_dkdev.dk_label; struct cpu_disklabel *clp = sc->sc_dkdev.dk_cpulabel; int i; memset(clp, 0, sizeof(*clp)); vndgetdefaultlabel(sc, lp); /* * Call the generic disklabel extraction routine. */ errstring = readdisklabel(VNDLABELDEV(dev), vndstrategy, lp, clp); if (errstring) { /* * Lack of disklabel is common, but we print the warning * anyway, since it might contain other useful information. */ aprint_normal_dev(sc->sc_dev, "%s\n", errstring); /* * For historical reasons, if there's no disklabel * present, all partitions must be FS_BSDFFS and * occupy the entire disk. */ for (i = 0; i < MAXPARTITIONS; i++) { /* * Don't wipe out port specific hack (such as * dos partition hack of i386 port). */ if (lp->d_partitions[i].p_size != 0) continue; lp->d_partitions[i].p_size = lp->d_secperunit; lp->d_partitions[i].p_offset = 0; lp->d_partitions[i].p_fstype = FS_BSDFFS; } strncpy(lp->d_packname, "default label", sizeof(lp->d_packname)); lp->d_npartitions = MAXPARTITIONS; lp->d_checksum = dkcksum(lp); } } /* * Wait interruptibly for an exclusive lock. * * XXX * Several drivers do this; it should be abstracted and made MP-safe. */ static int vndlock(struct vnd_softc *sc) { int error; while ((sc->sc_flags & VNF_LOCKED) != 0) { sc->sc_flags |= VNF_WANTED; if ((error = tsleep(sc, PRIBIO | PCATCH, "vndlck", 0)) != 0) return error; } sc->sc_flags |= VNF_LOCKED; return 0; } /* * Unlock and wake up any waiters. 
*/ static void vndunlock(struct vnd_softc *sc) { sc->sc_flags &= ~VNF_LOCKED; if ((sc->sc_flags & VNF_WANTED) != 0) { sc->sc_flags &= ~VNF_WANTED; wakeup(sc); } } #ifdef VND_COMPRESSION /* compressed file read */ static void compstrategy(struct buf *bp, off_t bn) { int error; int unit = vndunit(bp->b_dev); struct vnd_softc *vnd = device_lookup_private(&vnd_cd, unit); u_int32_t comp_block; struct uio auio; char *addr; int s; /* set up constants for data move */ auio.uio_rw = UIO_READ; UIO_SETUP_SYSSPACE(&auio); /* read, and transfer the data */ addr = bp->b_data; bp->b_resid = bp->b_bcount; s = splbio(); while (bp->b_resid > 0) { unsigned length; size_t length_in_buffer; u_int32_t offset_in_buffer; struct iovec aiov; /* calculate the compressed block number */ comp_block = bn / (off_t)vnd->sc_comp_blksz; /* check for good block number */ if (comp_block >= vnd->sc_comp_numoffs) { bp->b_error = EINVAL; splx(s); return; } /* read in the compressed block, if not in buffer */ if (comp_block != vnd->sc_comp_buffblk) { length = vnd->sc_comp_offsets[comp_block + 1] - vnd->sc_comp_offsets[comp_block]; vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY); error = vn_rdwr(UIO_READ, vnd->sc_vp, vnd->sc_comp_buff, length, vnd->sc_comp_offsets[comp_block], UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vnd->sc_cred, NULL, NULL); if (error) { bp->b_error = error; VOP_UNLOCK(vnd->sc_vp); splx(s); return; } /* uncompress the buffer */ vnd->sc_comp_stream.next_in = vnd->sc_comp_buff; vnd->sc_comp_stream.avail_in = length; vnd->sc_comp_stream.next_out = vnd->sc_comp_decombuf; vnd->sc_comp_stream.avail_out = vnd->sc_comp_blksz; inflateReset(&vnd->sc_comp_stream); error = inflate(&vnd->sc_comp_stream, Z_FINISH); if (error != Z_STREAM_END) { if (vnd->sc_comp_stream.msg) aprint_normal_dev(vnd->sc_dev, "compressed file, %s\n", vnd->sc_comp_stream.msg); bp->b_error = EBADMSG; VOP_UNLOCK(vnd->sc_vp); splx(s); return; } vnd->sc_comp_buffblk = comp_block; VOP_UNLOCK(vnd->sc_vp); } /* transfer the usable uncompressed data */ offset_in_buffer = bn % (off_t)vnd->sc_comp_blksz; length_in_buffer = vnd->sc_comp_blksz - offset_in_buffer; if (length_in_buffer > bp->b_resid) length_in_buffer = bp->b_resid; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = addr; aiov.iov_len = length_in_buffer; auio.uio_resid = aiov.iov_len; auio.uio_offset = 0; error = uiomove(vnd->sc_comp_decombuf + offset_in_buffer, length_in_buffer, &auio); if (error) { bp->b_error = error; splx(s); return; } bn += length_in_buffer; addr += length_in_buffer; bp->b_resid -= length_in_buffer; } splx(s); } /* compression memory allocation routines */ static void * vnd_alloc(void *aux, u_int items, u_int siz) { return malloc(items * siz, M_TEMP, M_NOWAIT); } static void vnd_free(void *aux, void *ptr) { free(ptr, M_TEMP); } #endif /* VND_COMPRESSION */ static void vnd_set_geometry(struct vnd_softc *vnd) { struct disk_geom *dg = &vnd->sc_dkdev.dk_geom; unsigned spb; memset(dg, 0, sizeof(*dg)); spb = vnd->sc_geom.vng_secsize / DEV_BSIZE; dg->dg_secperunit = vnd->sc_size / spb; dg->dg_secsize = vnd->sc_geom.vng_secsize; dg->dg_nsectors = vnd->sc_geom.vng_nsectors; dg->dg_ntracks = vnd->sc_geom.vng_ntracks; dg->dg_ncylinders = vnd->sc_geom.vng_ncylinders; #ifdef DEBUG if (vnddebug & VDB_LABEL) { printf("dg->dg_secperunit: %" PRId64 "\n", dg->dg_secperunit); printf("dg->dg_ncylinders: %u\n", dg->dg_ncylinders); } #endif disk_set_info(vnd->sc_dev, &vnd->sc_dkdev, NULL); } #ifdef VND_COMPRESSION #define VND_DEPENDS "zlib" #else #define VND_DEPENDS NULL #endif 
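/*
 * Illustrative sketch (not part of the driver, kept under #if 0): the
 * arithmetic compstrategy() above uses to locate a block in a compressed
 * image.  The image is addressed in sc_comp_blksz-sized chunks, and
 * sc_comp_offsets[] holds the byte offset of each compressed chunk in the
 * backing file, with the last entry marking the end of the file.  All
 * names local to this example are made up.
 */
#if 0
static void
vnd_comp_locate_example(off_t bn, uint32_t blksz,
    const uint64_t *offsets, uint32_t numoffs)
{
	uint32_t chunk = bn / (off_t)blksz;	/* which compressed chunk */
	uint32_t skip = bn % (off_t)blksz;	/* offset within that chunk */

	if (chunk + 1 >= numoffs) {
		/* past the end of the image; the driver rejects this with EINVAL */
		return;
	}

	/* byte range of the compressed data within the backing file */
	uint64_t start = offsets[chunk];
	uint64_t clen = offsets[chunk + 1] - offsets[chunk];

	/*
	 * compstrategy() reads clen bytes at offset start, inflates them
	 * into a blksz-sized buffer, and copies out starting skip bytes in.
	 */
	printf("chunk %u: %llu bytes at offset %llu, data starts %u into it\n",
	    chunk, (unsigned long long)clen, (unsigned long long)start, skip);
}
#endif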
MODULE(MODULE_CLASS_DRIVER, vnd, VND_DEPENDS); #ifdef _MODULE int vnd_bmajor = -1, vnd_cmajor = -1; CFDRIVER_DECL(vnd, DV_DISK, NULL); #endif static int vnd_modcmd(modcmd_t cmd, void *arg) { int error = 0; switch (cmd) { case MODULE_CMD_INIT: #ifdef _MODULE /* * Attach the {b,c}devsw's */ error = devsw_attach("vnd", &vnd_bdevsw, &vnd_bmajor, &vnd_cdevsw, &vnd_cmajor); if (error) { #ifdef DIAGNOSTIC aprint_error("%s: unable to attach %s devsw, " "error %d", __func__, vnd_cd.cd_name, error); #endif break; } error = config_cfdriver_attach(&vnd_cd); if (error) { devsw_detach(&vnd_bdevsw, &vnd_cdevsw); break; } error = config_cfattach_attach(vnd_cd.cd_name, &vnd_ca); if (error) { config_cfdriver_detach(&vnd_cd); devsw_detach(&vnd_bdevsw, &vnd_cdevsw); #ifdef DIAGNOSTIC aprint_error("%s: unable to register cfattach for \n" "%s, error %d", __func__, vnd_cd.cd_name, error); #endif break; } #endif break; case MODULE_CMD_FINI: #ifdef _MODULE /* * Remove device from autoconf database */ error = config_cfattach_detach(vnd_cd.cd_name, &vnd_ca); if (error) { #ifdef DIAGNOSTIC aprint_error("%s: failed to detach %s cfattach, " "error %d\n", __func__, vnd_cd.cd_name, error); #endif break; } error = config_cfdriver_detach(&vnd_cd); if (error) { (void)config_cfattach_attach(vnd_cd.cd_name, &vnd_ca); #ifdef DIAGNOSTIC aprint_error("%s: failed to detach %s cfdriver, " "error %d\n", __func__, vnd_cd.cd_name, error); break; #endif } /* * Remove {b,c}devsw's */ devsw_detach(&vnd_bdevsw, &vnd_cdevsw); #endif break; case MODULE_CMD_STAT: return ENOTTY; default: return ENOTTY; } return error; }
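/*
 * Hedged userland sketch of the configuration path that vndioctl() above
 * implements for VNDIOCSET; this is roughly what vnconfig(8) does.  The
 * device path, the backing-file name and the <dev/vndvar.h> include path
 * are illustrative assumptions, not taken from this file.
 */
#if 0
#include <sys/types.h>
#include <sys/ioctl.h>
#include <dev/vndvar.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct vnd_ioctl vio;
	char path[] = "/var/tmp/disk.img";	/* backing file (example) */
	int fd;

	/* the raw partition of the vnd unit; the partition letter is port-specific */
	fd = open("/dev/rvnd0d", O_RDWR);
	if (fd == -1)
		err(1, "open");

	memset(&vio, 0, sizeof(vio));
	vio.vnd_file = path;
	vio.vnd_flags = 0;	/* or VNDIOF_READONLY, VNDIOF_FILEIO, ... */

	/*
	 * The kernel side (vndioctl above) opens the file, picks a
	 * geometry, creates the vndthread worker and marks the unit
	 * VNF_INITED.
	 */
	if (ioctl(fd, VNDIOCSET, &vio) == -1)
		err(1, "VNDIOCSET");

	printf("configured, %llu bytes\n", (unsigned long long)vio.vnd_size);

	close(fd);
	return 0;
}
#endif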
/* $NetBSD: kern_info_43.c,v 1.40 2021/09/07 11:43:02 riastradh Exp $ */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_info_43.c,v 1.40 2021/09/07 11:43:02 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/syslog.h> #include <sys/unistd.h> #include <sys/resourcevar.h> #include <sys/kauth.h> #include <sys/cpu.h> #include <uvm/uvm_extern.h> #include <sys/sysctl.h> #include <sys/mount.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <compat/sys/time.h> #include <compat/common/compat_mod.h> static struct syscall_package kern_info_43_syscalls[] = { { SYS_compat_43_ogetdtablesize, 0, (sy_call_t *)compat_43_sys_getdtablesize }, { SYS_compat_43_ogethostid, 0, (sy_call_t *)compat_43_sys_gethostid }, { SYS_compat_43_ogethostname, 0, (sy_call_t *)compat_43_sys_gethostname }, { SYS_compat_43_ogetkerninfo, 0, (sy_call_t *)compat_43_sys_getkerninfo }, { SYS_compat_43_osethostid, 0, (sy_call_t *)compat_43_sys_sethostid }, { SYS_compat_43_osethostname, 0, (sy_call_t *)compat_43_sys_sethostname }, { 0, 0, NULL } }; int compat_43_sys_getdtablesize(struct lwp *l, const void *v, register_t *retval) { struct proc *p = l->l_proc; mutex_enter(p->p_lock); *retval = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); mutex_exit(p->p_lock); return (0); } /* ARGSUSED */ int compat_43_sys_gethostid(struct lwp *l, const void *v, register_t *retval) { *(int32_t *)retval = hostid; return (0); } /*ARGSUSED*/ int compat_43_sys_gethostname(struct lwp *l, const struct compat_43_sys_gethostname_args *uap, register_t *retval) { /* { syscallarg(char *) hostname; syscallarg(u_int) len; } */ int name[2]; size_t sz; name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; sz = SCARG(uap, len); return (old_sysctl(&name[0], 2, SCARG(uap, hostname), &sz, 0, 0, l)); } #define KINFO_PROC (0<<8) #define KINFO_RT (1<<8) #define KINFO_VNODE (2<<8) #define KINFO_FILE (3<<8) #define KINFO_METER (4<<8) #define KINFO_LOADAVG (5<<8) #define KINFO_CLOCKRATE (6<<8) #define KINFO_BSDI_SYSINFO (101<<8) /* * The string data is appended to the end of the bsdi_si structure during * copyout. The "char *" offsets in the bsdi_si struct are relative to the * base of the bsdi_si struct. 
*/ struct bsdi_si { char *machine; char *cpu_model; long ncpu; long cpuspeed; long hwflags; u_long physmem; u_long usermem; u_long pagesize; char *ostype; char *osrelease; long os_revision; long posix1_version; char *version; long hz; long profhz; int ngroups_max; long arg_max; long open_max; long child_max; struct timeval50 boottime; char *hostname; }; int compat_43_sys_getkerninfo(struct lwp *l, const struct compat_43_sys_getkerninfo_args *uap, register_t *retval) { /* { syscallarg(int) op; syscallarg(char *) where; syscallarg(int *) size; syscallarg(int) arg; } */ int error, name[6]; int isize; size_t size; if (!SCARG(uap, size)) return EINVAL; if ((error = copyin(SCARG(uap, size), &isize, sizeof(isize))) != 0) return error; if (isize < 0 || isize > 4096) return EINVAL; size = isize; switch (SCARG(uap, op) & 0xff00) { case KINFO_RT: name[0] = CTL_NET; name[1] = PF_ROUTE; name[2] = 0; name[3] = (SCARG(uap, op) & 0xff0000) >> 16; name[4] = SCARG(uap, op) & 0xff; name[5] = SCARG(uap, arg); error = old_sysctl(&name[0], 6, SCARG(uap, where), &size, NULL, 0, l); break; case KINFO_VNODE: name[0] = CTL_KERN; name[1] = KERN_VNODE; error = old_sysctl(&name[0], 2, SCARG(uap, where), &size, NULL, 0, l); break; case KINFO_PROC: name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = SCARG(uap, op) & 0xff; name[3] = SCARG(uap, arg); error = old_sysctl(&name[0], 4, SCARG(uap, where), &size, NULL, 0, l); break; case KINFO_FILE: name[0] = CTL_KERN; name[1] = KERN_FILE; error = old_sysctl(&name[0], 2, SCARG(uap, where), &size, NULL, 0, l); break; case KINFO_METER: name[0] = CTL_VM; name[1] = VM_METER; error = old_sysctl(&name[0], 2, SCARG(uap, where), &size, NULL, 0, l); break; case KINFO_LOADAVG: name[0] = CTL_VM; name[1] = VM_LOADAVG; error = old_sysctl(&name[0], 2, SCARG(uap, where), &size, NULL, 0, l); break; case KINFO_CLOCKRATE: name[0] = CTL_KERN; name[1] = KERN_CLOCKRATE; error = old_sysctl(&name[0], 2, SCARG(uap, where), &size, NULL, 0, l); break; case KINFO_BSDI_SYSINFO: { size_t len; struct bsdi_si *usi = (struct bsdi_si *) SCARG(uap, where); struct bsdi_si ksi; struct timeval tv; const char *cpu_model = cpu_getmodel(); char *us = (char *) &usi[1]; if (usi == NULL) { size = sizeof(ksi) + strlen(ostype) + strlen(cpu_model) + strlen(osrelease) + strlen(machine) + strlen(version) + strlen(hostname) + 6; error = 0; break; } memset(&ksi, 0, sizeof(ksi)); #define COPY(fld) \ ksi.fld = us - (u_long) usi; \ if ((error = copyoutstr(fld, us, 1024, &len)) != 0)\ return error; \ us += len COPY(machine); COPY(cpu_model); ksi.ncpu = ncpu; /* XXX */ ksi.cpuspeed = 40; /* XXX */ ksi.hwflags = 0; /* XXX */ ksi.physmem = ctob(physmem); ksi.usermem = ctob(physmem); /* XXX */ ksi.pagesize = PAGE_SIZE; COPY(ostype); COPY(osrelease); ksi.os_revision = NetBSD; /* XXX */ ksi.posix1_version = _POSIX_VERSION; COPY(version); /* XXX */ ksi.hz = hz; ksi.profhz = profhz; ksi.ngroups_max = NGROUPS_MAX; ksi.arg_max = ARG_MAX; ksi.open_max = OPEN_MAX; ksi.child_max = CHILD_MAX; getmicroboottime(&tv); timeval_to_timeval50(&tv, &ksi.boottime); COPY(hostname); size = (us - (char *) &usi[1]) + sizeof(ksi); if ((error = copyout(&ksi, usi, sizeof(ksi))) != 0) return error; } break; default: return (EOPNOTSUPP); } if (error) return (error); *retval = size; if (SCARG(uap, size)) error = copyout((void *)&size, (void *)SCARG(uap, size), sizeof(size)); return (error); } /* ARGSUSED */ int compat_43_sys_sethostid(struct lwp *l, const struct compat_43_sys_sethostid_args *uap, register_t *retval) { long uhostid; int name[2]; uhostid = 
SCARG(uap, hostid); name[0] = CTL_KERN; name[1] = KERN_HOSTID; return (old_sysctl(&name[0], 2, 0, 0, &uhostid, sizeof(long), l)); } /* ARGSUSED */ int compat_43_sys_sethostname(struct lwp *l, const struct compat_43_sys_sethostname_args *uap, register_t *retval) { int name[2]; name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; return (old_sysctl(&name[0], 2, 0, 0, SCARG(uap, hostname), SCARG(uap, len), l)); } int kern_info_43_init(void) { return syscall_establish(NULL, kern_info_43_syscalls); } int kern_info_43_fini(void) { return syscall_disestablish(NULL, kern_info_43_syscalls); }
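/*
 * Hedged sketch of the op decoding done by compat_43_sys_getkerninfo()
 * above: the request class lives in bits 8-15 of the op word and, for
 * KINFO_PROC, the KERN_PROC sub-op sits in the low byte while the
 * argument travels separately.  The helper name is illustrative only;
 * the MIB layout is copied from the KINFO_PROC case in the switch.
 */
#if 0
#include <sys/sysctl.h>

static void
kinfo_proc_to_mib_example(int op, int arg, int name[4])
{
	/* old getkerninfo(KINFO_PROC | sub-op, ..., arg) becomes: */
	name[0] = CTL_KERN;
	name[1] = KERN_PROC;
	name[2] = op & 0xff;	/* sub-op, e.g. a KERN_PROC_* selector */
	name[3] = arg;		/* e.g. the pid for KERN_PROC_PID */
}
#endif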
/*
$NetBSD: ffs_alloc.c,v 1.172 2023/01/07 19:41:30 chs Exp $ */ /*- * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.172 2023/01/07 19:41:30 chs Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" #include "opt_quota.h" #include "opt_uvm_page_trkown.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/cprng.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/syslog.h> #include <sys/vnode.h> #include <sys/wapbl.h> #include <sys/cprng.h> #include <miscfs/specfs/specdev.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_bswap.h> #include <ufs/ufs/ufs_wapbl.h> #include <ufs/ffs/fs.h> #include <ufs/ffs/ffs_extern.h> #ifdef UVM_PAGE_TRKOWN #include <uvm/uvm_object.h> #include <uvm/uvm_page.h> #endif static daddr_t ffs_alloccg(struct inode *, u_int, daddr_t, int, int, int); static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int, int); static ino_t ffs_dirpref(struct inode *); static daddr_t ffs_fragextend(struct inode *, u_int, daddr_t, int, int); static void ffs_fserr(struct fs *, kauth_cred_t, const char *); static daddr_t ffs_hashalloc(struct inode *, u_int, daddr_t, int, int, int, daddr_t (*)(struct inode *, u_int, daddr_t, int, int, int)); static daddr_t ffs_nodealloccg(struct inode *, u_int, daddr_t, int, int, int); static int32_t ffs_mapsearch(struct fs *, struct cg *, daddr_t, int); static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *, daddr_t, long, bool); static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t, int, bool); /* if 1, changes in optimalization strategy are logged */ int ffs_log_changeopt = 0; /* in ffs_tables.c */ extern const int inside[], around[]; extern const u_char * const fragtbl[]; /* Basic consistency check for block allocations */ static int ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno, long size, dev_t dev, ino_t inum) { if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0 || ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) { panic("%s: bad size: dev = 0x%llx, bno = %" PRId64 " bsize = %d, size = %ld, fs = %s", func, (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt); } if (bno >= fs->fs_size) { printf("%s: bad block %" PRId64 ", ino %llu\n", func, bno, (unsigned long long)inum); ffs_fserr(fs, NOCRED, "bad block"); return EINVAL; } return 0; } /* * Allocate a block in the file system. * * The size of the requested block is given, which must be some * multiple of fs_fsize and <= fs_bsize. * A preference may be optionally specified. If a preference is given * the following hierarchy is used to allocate a block: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate a block in the same cylinder group. * 4) quadradically rehash into other cylinder groups, until an * available block is located. * If no block preference is given the following hierarchy is used * to allocate a block: * 1) allocate a block in the cylinder group that contains the * inode for the file. * 2) quadradically rehash into other cylinder groups, until an * available block is located. 
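 *
 * Illustrative example (hypothetical numbers, not from the original
 * comment): when a write extends a file whose previous block sits in
 * cylinder group 3, the caller passes that block as the preference, so
 * step 1 tries the exact block, the fallback steps stay within cg 3,
 * and only if cg 3 is exhausted does the quadratic rehash probe other
 * cylinder groups.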
* * => called with um_lock held * => releases um_lock before returning */ int ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, int flags, kauth_cred_t cred, daddr_t *bnp) { struct ufsmount *ump; struct fs *fs; daddr_t bno; u_int cg; #if defined(QUOTA) || defined(QUOTA2) int error; #endif fs = ip->i_fs; ump = ip->i_ump; KASSERT(mutex_owned(&ump->um_lock)); #ifdef UVM_PAGE_TRKOWN /* * Sanity-check that allocations within the file size * do not allow other threads to read the stale contents * of newly allocated blocks. * Usually pages will exist to cover the new allocation. * There is an optimization in ffs_write() where we skip * creating pages if several conditions are met: * - the file must not be mapped (in any user address space). * - the write must cover whole pages and whole blocks. * If those conditions are not met then pages must exist and * be locked by the current thread. */ struct vnode *vp = ITOV(ip); if (vp->v_type == VREG && (flags & IO_EXT) == 0 && ffs_lblktosize(fs, (voff_t)lbn) < round_page(vp->v_size) && ((vp->v_vflag & VV_MAPPED) != 0 || (size & PAGE_MASK) != 0 || ffs_blkoff(fs, size) != 0)) { struct vm_page *pg __diagused; struct uvm_object *uobj = &vp->v_uobj; voff_t off = trunc_page(ffs_lblktosize(fs, lbn)); voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size); rw_enter(uobj->vmobjlock, RW_WRITER); while (off < endoff) { pg = uvm_pagelookup(uobj, off); KASSERT((pg != NULL && pg->owner_tag != NULL && pg->owner == curproc->p_pid && pg->lowner == curlwp->l_lid)); off += PAGE_SIZE; } rw_exit(uobj->vmobjlock); } #endif *bnp = 0; KASSERTMSG((cred != NOCRED), "missing credential"); KASSERTMSG(((u_int)size <= fs->fs_bsize), "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s", (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); KASSERTMSG((ffs_fragoff(fs, size) == 0), "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s", (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; if (freespace(fs, fs->fs_minfree) <= 0 && kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, NULL, NULL) != 0) goto nospace; #if defined(QUOTA) || defined(QUOTA2) mutex_exit(&ump->um_lock); if ((error = chkdq(ip, btodb(size), cred, 0)) != 0) return (error); mutex_enter(&ump->um_lock); #endif if (bpref >= fs->fs_size) bpref = 0; if (bpref == 0) cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); bno = ffs_hashalloc(ip, cg, bpref, size, 0, flags, ffs_alloccg); if (bno > 0) { DIP_ADD(ip, blocks, btodb(size)); if (flags & IO_EXT) ip->i_flag |= IN_CHANGE; else ip->i_flag |= IN_CHANGE | IN_UPDATE; *bnp = bno; return (0); } #if defined(QUOTA) || defined(QUOTA2) /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(size), cred, FORCE); #endif if (flags & B_CONTIG) { /* * XXX ump->um_lock handling is "suspect" at best. * For the case where ffs_hashalloc() fails early * in the B_CONTIG case we reach here with um_lock * already unlocked, so we can't release it again * like in the normal error path. See kern/39206. * * * Fail silently - it's up to our caller to report * errors. */ return (ENOSPC); } nospace: mutex_exit(&ump->um_lock); ffs_fserr(fs, cred, "file system full"); uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); return (ENOSPC); } /* * Reallocate a fragment to a bigger size * * The number and size of the old block is given, and a preference * and new size is also specified. 
The allocator attempts to extend * the original block. Failing that, the regular block allocator is * invoked to get an appropriate block. * * => called with um_lock held * => return with um_lock released */ int ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bprev, daddr_t bpref, int osize, int nsize, int flags, kauth_cred_t cred, struct buf **bpp, daddr_t *blknop) { struct ufsmount *ump; struct fs *fs; struct buf *bp; u_int cg, request; int error; daddr_t bno; fs = ip->i_fs; ump = ip->i_ump; KASSERT(mutex_owned(&ump->um_lock)); #ifdef UVM_PAGE_TRKOWN /* * Sanity-check that allocations within the file size * do not allow other threads to read the stale contents * of newly allocated blocks. * Unlike in ffs_alloc(), here pages must always exist * for such allocations, because only the last block of a file * can be a fragment and ffs_write() will reallocate the * fragment to the new size using ufs_balloc_range(), * which always creates pages to cover blocks it allocates. */ if (ITOV(ip)->v_type == VREG) { struct vm_page *pg __diagused; struct uvm_object *uobj = &ITOV(ip)->v_uobj; voff_t off = trunc_page(ffs_lblktosize(fs, lbprev)); voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize); rw_enter(uobj->vmobjlock, RW_WRITER); while (off < endoff) { pg = uvm_pagelookup(uobj, off); KASSERT(pg->owner == curproc->p_pid && pg->lowner == curlwp->l_lid); off += PAGE_SIZE; } rw_exit(uobj->vmobjlock); } #endif KASSERTMSG((cred != NOCRED), "missing credential"); KASSERTMSG(((u_int)osize <= fs->fs_bsize), "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); KASSERTMSG((ffs_fragoff(fs, osize) == 0), "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); KASSERTMSG(((u_int)nsize <= fs->fs_bsize), "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); KASSERTMSG((ffs_fragoff(fs, nsize) == 0), "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s", (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); if (freespace(fs, fs->fs_minfree) <= 0 && kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, NULL, NULL) != 0) { mutex_exit(&ump->um_lock); goto nospace; } if (bprev == 0) { panic("%s: bad bprev: dev = 0x%llx, bsize = %d, bprev = %" PRId64 ", fs = %s", __func__, (unsigned long long)ip->i_dev, fs->fs_bsize, bprev, fs->fs_fsmnt); } mutex_exit(&ump->um_lock); /* * Allocate the extra space in the buffer. */ if (bpp != NULL && (error = bread(ITOV(ip), lbprev, osize, 0, &bp)) != 0) { return (error); } #if defined(QUOTA) || defined(QUOTA2) if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) { if (bpp != NULL) { brelse(bp, 0); } return (error); } #endif /* * Check for extension in the existing location. 
*/ cg = dtog(fs, bprev); mutex_enter(&ump->um_lock); if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) { DIP_ADD(ip, blocks, btodb(nsize - osize)); if (flags & IO_EXT) ip->i_flag |= IN_CHANGE; else ip->i_flag |= IN_CHANGE | IN_UPDATE; if (bpp != NULL) { if (bp->b_blkno != FFS_FSBTODB(fs, bno)) { panic("%s: bad blockno %#llx != %#llx", __func__, (unsigned long long) bp->b_blkno, (unsigned long long)FFS_FSBTODB(fs, bno)); } allocbuf(bp, nsize, 1); memset((char *)bp->b_data + osize, 0, nsize - osize); mutex_enter(bp->b_objlock); KASSERT(!cv_has_waiters(&bp->b_done)); bp->b_oflags |= BO_DONE; mutex_exit(bp->b_objlock); *bpp = bp; } if (blknop != NULL) { *blknop = bno; } return (0); } /* * Allocate a new disk location. */ if (bpref >= fs->fs_size) bpref = 0; switch ((int)fs->fs_optim) { case FS_OPTSPACE: /* * Allocate an exact sized fragment. Although this makes * best use of space, we will waste time relocating it if * the file continues to grow. If the fragmentation is * less than half of the minimum free reserve, we choose * to begin optimizing for time. */ request = nsize; if (fs->fs_minfree < 5 || fs->fs_cstotal.cs_nffree > fs->fs_dsize * fs->fs_minfree / (2 * 100)) break; if (ffs_log_changeopt) { log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", fs->fs_fsmnt); } fs->fs_optim = FS_OPTTIME; break; case FS_OPTTIME: /* * At this point we have discovered a file that is trying to * grow a small fragment to a larger fragment. To save time, * we allocate a full sized block, then free the unused portion. * If the file continues to grow, the `ffs_fragextend' call * above will be able to grow it in place without further * copying. If aberrant programs cause disk fragmentation to * grow within 2% of the free reserve, we choose to begin * optimizing for space. */ request = fs->fs_bsize; if (fs->fs_cstotal.cs_nffree < fs->fs_dsize * (fs->fs_minfree - 2) / 100) break; if (ffs_log_changeopt) { log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", fs->fs_fsmnt); } fs->fs_optim = FS_OPTSPACE; break; default: panic("%s: bad optim: dev = 0x%llx, optim = %d, fs = %s", __func__, (unsigned long long)ip->i_dev, fs->fs_optim, fs->fs_fsmnt); /* NOTREACHED */ } bno = ffs_hashalloc(ip, cg, bpref, request, nsize, 0, ffs_alloccg); if (bno > 0) { /* * Use forced deallocation registration, we can't handle * failure here. This is safe, as this place is ever hit * maximum once per write operation, when fragment is extended * to longer fragment, or a full block. */ if ((ip->i_ump->um_mountp->mnt_wapbl) && (ITOV(ip)->v_type != VREG)) { /* this should never fail */ error = UFS_WAPBL_REGISTER_DEALLOCATION_FORCE( ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev), osize); if (error) panic("ffs_realloccg: dealloc registration failed"); } else { ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize, ip->i_number); } DIP_ADD(ip, blocks, btodb(nsize - osize)); if (flags & IO_EXT) ip->i_flag |= IN_CHANGE; else ip->i_flag |= IN_CHANGE | IN_UPDATE; if (bpp != NULL) { bp->b_blkno = FFS_FSBTODB(fs, bno); allocbuf(bp, nsize, 1); memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize); mutex_enter(bp->b_objlock); KASSERT(!cv_has_waiters(&bp->b_done)); bp->b_oflags |= BO_DONE; mutex_exit(bp->b_objlock); *bpp = bp; } if (blknop != NULL) { *blknop = bno; } return (0); } mutex_exit(&ump->um_lock); #if defined(QUOTA) || defined(QUOTA2) /* * Restore user's disk quota because allocation failed. 
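 * For example (illustrative sizes): if the fragment was being grown
 * from 2 KB to 8 KB, the 6 KB charged by chkdq() above is credited
 * back here by passing the negative delta.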
*/ (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); #endif if (bpp != NULL) { brelse(bp, 0); } nospace: /* * no space available */ ffs_fserr(fs, cred, "file system full"); uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); return (ENOSPC); } /* * Allocate an inode in the file system. * * If allocating a directory, use ffs_dirpref to select the inode. * If allocating in a directory, the following hierarchy is followed: * 1) allocate the preferred inode. * 2) allocate an inode in the same cylinder group. * 3) quadradically rehash into other cylinder groups, until an * available inode is located. * If no inode preference is given the following hierarchy is used * to allocate an inode: * 1) allocate an inode in cylinder group 0. * 2) quadradically rehash into other cylinder groups, until an * available inode is located. * * => um_lock not held upon entry or return */ int ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, ino_t *inop) { struct ufsmount *ump; struct inode *pip; struct fs *fs; ino_t ino, ipref; u_int cg; int error; UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount); pip = VTOI(pvp); fs = pip->i_fs; ump = pip->i_ump; error = UFS_WAPBL_BEGIN(pvp->v_mount); if (error) { return error; } mutex_enter(&ump->um_lock); if (fs->fs_cstotal.cs_nifree == 0) goto noinodes; if ((mode & IFMT) == IFDIR) ipref = ffs_dirpref(pip); else ipref = pip->i_number; if (ipref >= fs->fs_ncg * fs->fs_ipg) ipref = 0; cg = ino_to_cg(fs, ipref); /* * Track number of dirs created one after another * in a same cg without intervening by files. */ if ((mode & IFMT) == IFDIR) { if (fs->fs_contigdirs[cg] < 255) fs->fs_contigdirs[cg]++; } else { if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 0, ffs_nodealloccg); if (ino == 0) goto noinodes; UFS_WAPBL_END(pvp->v_mount); *inop = ino; return 0; noinodes: mutex_exit(&ump->um_lock); UFS_WAPBL_END(pvp->v_mount); ffs_fserr(fs, cred, "out of inodes"); uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); return ENOSPC; } /* * Find a cylinder group in which to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ static ino_t ffs_dirpref(struct inode *pip) { register struct fs *fs; u_int cg, prefcg; uint64_t dirsize, cgsize, curdsz; u_int avgifree, avgbfree, avgndir; u_int minifree, minbfree, maxndir; u_int mincg, minndir; u_int maxcontigdirs; KASSERT(mutex_owned(&pip->i_ump->um_lock)); fs = pip->i_fs; avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; /* * Force allocation in another cg if creating a first level dir. 
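 * In other words, directories created directly under the root start
 * from a randomly chosen cylinder group, and among the groups with at
 * least an average number of free inodes and blocks the one holding
 * the fewest directories wins, so first-level trees spread out.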
*/ if (ITOV(pip)->v_vflag & VV_ROOT) { prefcg = cprng_fast32() % fs->fs_ncg; mincg = prefcg; minndir = fs->fs_ipg; for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } return ((ino_t)(fs->fs_ipg * mincg)); } /* * Count various limits which used for * optimal allocation of a directory inode. * Try cylinder groups with >75% avgifree and avgbfree. * Avoid cylinder groups with no free blocks or inodes as that * triggers an I/O-expensive cylinder group scan. */ maxndir = uimin(avgndir + fs->fs_ipg / 16, fs->fs_ipg); minifree = avgifree - avgifree / 4; if (minifree < 1) minifree = 1; minbfree = avgbfree - avgbfree / 4; if (minbfree < 1) minbfree = 1; cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg; dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir; if (avgndir != 0) { curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir; if (dirsize < curdsz) dirsize = curdsz; } if (cgsize < dirsize * 255) maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize; else maxcontigdirs = 255; if (fs->fs_avgfpdir > 0) maxcontigdirs = uimin(maxcontigdirs, fs->fs_ipg / fs->fs_avgfpdir); if (maxcontigdirs == 0) maxcontigdirs = 1; /* * Limit number of dirs in one cg and reserve space for * regular files, but only if we have no deficit in * inodes or space. */ prefcg = ino_to_cg(fs, pip->i_number); for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } /* * This is a backstop when we are deficient in space. */ for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) return ((ino_t)(fs->fs_ipg * cg)); for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) break; return ((ino_t)(fs->fs_ipg * cg)); } /* * Select the desired position for the next block in a file. The file is * logically divided into sections. The first section is composed of the * direct blocks. Each additional section contains fs_maxbpg blocks. * * If no blocks have been allocated in the first section, the policy is to * request a block in the same cylinder group as the inode that describes * the file. If no blocks have been allocated in any other section, the * policy is to place the section in a cylinder group with a greater than * average number of free blocks. An appropriate cylinder group is found * by using a rotor that sweeps the cylinder groups. When a new group of * blocks is needed, the sweep begins in the cylinder group following the * cylinder group from which the previous allocation was made. The sweep * continues until a cylinder group with greater than the average number * of free blocks is found. 
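 * (Illustrative example: if the last block of the previous section was
 * allocated in cg 5, the sweep looks at cg 6, 7, ... and wraps back to
 * cg 0, stopping at the first group whose free-block count is at least
 * the file-system-wide average.)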
If the allocation is for the first block in an * indirect block, the information on the previous allocation is unavailable; * here a best guess is made based upon the logical block number being * allocated. * * If a section is already partially allocated, the policy is to * contiguously allocate fs_maxcontig blocks. The end of one of these * contiguous blocks and the beginning of the next is laid out * contigously if possible. * * => um_lock held on entry and exit */ daddr_t ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags, int32_t *bap /* XXX ondisk32 */) { struct fs *fs; u_int cg; u_int avgbfree, startcg; KASSERT(mutex_owned(&ip->i_ump->um_lock)); fs = ip->i_fs; /* * If allocating a contiguous file with B_CONTIG, use the hints * in the inode extensions to return the desired block. * * For metadata (indirect blocks) return the address of where * the first indirect block resides - we'll scan for the next * available slot if we need to allocate more than one indirect * block. For data, return the address of the actual block * relative to the address of the first data block. */ if (flags & B_CONTIG) { KASSERT(ip->i_ffs_first_data_blk != 0); KASSERT(ip->i_ffs_first_indir_blk != 0); if (flags & B_METAONLY) return ip->i_ffs_first_indir_blk; else return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn); } if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) { cg = ino_to_cg(fs, ip->i_number); return (cgbase(fs, cg) + fs->fs_frag); } /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || bap[indx - 1] == 0) startcg = ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; else startcg = dtog(fs, ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { return (cgbase(fs, cg) + fs->fs_frag); } for (cg = 0; cg < startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { return (cgbase(fs, cg) + fs->fs_frag); } return (0); } /* * We just always try to lay things out contiguously. */ return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; } daddr_t ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags, int64_t *bap) { struct fs *fs; u_int cg; u_int avgbfree, startcg; KASSERT(mutex_owned(&ip->i_ump->um_lock)); fs = ip->i_fs; /* * If allocating a contiguous file with B_CONTIG, use the hints * in the inode extensions to return the desired block. * * For metadata (indirect blocks) return the address of where * the first indirect block resides - we'll scan for the next * available slot if we need to allocate more than one indirect * block. For data, return the address of the actual block * relative to the address of the first data block. */ if (flags & B_CONTIG) { KASSERT(ip->i_ffs_first_data_blk != 0); KASSERT(ip->i_ffs_first_indir_blk != 0); if (flags & B_METAONLY) return ip->i_ffs_first_indir_blk; else return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn); } if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) { cg = ino_to_cg(fs, ip->i_number); return (cgbase(fs, cg) + fs->fs_frag); } /* * Find a cylinder with greater than average number of * unused data blocks. 
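 * (E.g., with a hypothetical 4 cylinder groups and 1000 free blocks in
 * total, avgbfree is 250 and the scan below returns the first group
 * holding at least 250 free blocks.)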
*/ if (indx == 0 || bap[indx - 1] == 0) startcg = ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; else startcg = dtog(fs, ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { return (cgbase(fs, cg) + fs->fs_frag); } for (cg = 0; cg < startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { return (cgbase(fs, cg) + fs->fs_frag); } return (0); } /* * We just always try to lay things out contiguously. */ return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; } /* * Implement the cylinder overflow algorithm. * * The policy implemented by this algorithm is: * 1) allocate the block in its requested cylinder group. * 2) quadradically rehash on the cylinder group number. * 3) brute force search for a free block. * * => called with um_lock held * => returns with um_lock released on success, held on failure * (*allocator releases lock on success, retains lock on failure) */ /*VARARGS5*/ static daddr_t ffs_hashalloc(struct inode *ip, u_int cg, daddr_t pref, int size /* size for data blocks, mode for inodes */, int realsize, int flags, daddr_t (*allocator)(struct inode *, u_int, daddr_t, int, int, int)) { struct fs *fs; daddr_t result; u_int i, icg = cg; fs = ip->i_fs; /* * 1: preferred cylinder group */ result = (*allocator)(ip, cg, pref, size, realsize, flags); if (result) return (result); if (flags & B_CONTIG) return (result); /* * 2: quadratic rehash */ for (i = 1; i < fs->fs_ncg; i *= 2) { cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; result = (*allocator)(ip, cg, 0, size, realsize, flags); if (result) return (result); } /* * 3: brute force search * Note that we start at i == 2, since 0 was checked initially, * and 1 is always checked in the quadratic rehash. */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { result = (*allocator)(ip, cg, 0, size, realsize, flags); if (result) return (result); cg++; if (cg == fs->fs_ncg) cg = 0; } return (0); } /* * Determine whether a fragment can be extended. * * Check to see if the necessary fragments are available, and * if they are, allocate them. 
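 *
 * Illustrative case (hypothetical sizes): a file whose last block is a
 * 2 KB fragment being rewritten as 3 KB can be grown in place when the
 * next 1 KB worth of fragments in the same block is still free;
 * otherwise 0 is returned and a new location must be allocated.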
* * => called with um_lock held * => returns with um_lock released on success, held on failure */ static daddr_t ffs_fragextend(struct inode *ip, u_int cg, daddr_t bprev, int osize, int nsize) { struct ufsmount *ump; struct fs *fs; struct cg *cgp; struct buf *bp; daddr_t bno; int frags, bbase; int i, error; u_int8_t *blksfree; fs = ip->i_fs; ump = ip->i_ump; KASSERT(mutex_owned(&ump->um_lock)); if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize)) return (0); frags = ffs_numfrags(fs, nsize); bbase = ffs_fragnum(fs, bprev); if (bbase > ffs_fragnum(fs, (bprev + frags - 1))) { /* cannot extend across a block boundary */ return (0); } mutex_exit(&ump->um_lock); error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, B_MODIFY, &bp); if (error) goto fail; cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) goto fail; cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs)); if ((fs->fs_magic != FS_UFS1_MAGIC) || (fs->fs_old_flags & FS_FLAGS_UPDATED)) cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs)); bno = dtogd(fs, bprev); blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)); for (i = ffs_numfrags(fs, osize); i < frags; i++) if (isclr(blksfree, bno + i)) goto fail; /* * the current fragment can be extended * deduct the count on fragment being extended into * increase the count on the remaining fragment (if any) * allocate the extended piece */ for (i = frags; i < fs->fs_frag - bbase; i++) if (isclr(blksfree, bno + i)) break; ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs)); if (i != frags) ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs)); mutex_enter(&ump->um_lock); for (i = ffs_numfrags(fs, osize); i < frags; i++) { clrbit(blksfree, bno + i); ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs)); fs->fs_cstotal.cs_nffree--; fs->fs_cs(fs, cg).cs_nffree--; } fs->fs_fmod = 1; ACTIVECG_CLR(fs, cg); mutex_exit(&ump->um_lock); bdwrite(bp); return (bprev); fail: if (bp != NULL) brelse(bp, 0); mutex_enter(&ump->um_lock); return (0); } /* * Determine whether a block can be allocated. * * Check to see if a block of the appropriate size is available, * and if it is, allocate it. */ static daddr_t ffs_alloccg(struct inode *ip, u_int cg, daddr_t bpref, int size, int realsize, int flags) { struct ufsmount *ump; struct fs *fs = ip->i_fs; struct cg *cgp; struct buf *bp; int32_t bno; daddr_t blkno; int error, frags, allocsiz, i; u_int8_t *blksfree; const int needswap = UFS_FSNEEDSWAP(fs); ump = ip->i_ump; KASSERT(mutex_owned(&ump->um_lock)); if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (0); mutex_exit(&ump->um_lock); error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, B_MODIFY, &bp); if (error) goto fail; cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp, needswap) || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) goto fail; cgp->cg_old_time = ufs_rw32(time_second, needswap); if ((fs->fs_magic != FS_UFS1_MAGIC) || (fs->fs_old_flags & FS_FLAGS_UPDATED)) cgp->cg_time = ufs_rw64(time_second, needswap); if (size == fs->fs_bsize) { mutex_enter(&ump->um_lock); blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags); ACTIVECG_CLR(fs, cg); mutex_exit(&ump->um_lock); /* * If actually needed size is lower, free the extra blocks now. * This is safe to call here, there is no outside reference * to this block yet. It is not necessary to keep um_lock * locked. 
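 * For example (illustrative sizes): if a full 8 KB block was taken but
 * the caller only needs 6 KB (realsize), the trailing 2 KB worth of
 * fragments is handed straight back to the free map below.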
*/ if (realsize != 0 && realsize < size) { ffs_blkfree_common(ip->i_ump, ip->i_fs, ip->i_devvp->v_rdev, bp, blkno + ffs_numfrags(fs, realsize), (long)(size - realsize), false); } bdwrite(bp); return (blkno); } /* * check to see if any fragments are already available * allocsiz is the size which will be allocated, hacking * it down to a smaller size if necessary */ blksfree = cg_blksfree(cgp, needswap); frags = ffs_numfrags(fs, size); for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) if (cgp->cg_frsum[allocsiz] != 0) break; if (allocsiz == fs->fs_frag) { /* * no fragments were available, so a block will be * allocated, and hacked up */ if (cgp->cg_cs.cs_nbfree == 0) goto fail; mutex_enter(&ump->um_lock); blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags); bno = dtogd(fs, blkno); for (i = frags; i < fs->fs_frag; i++) setbit(blksfree, bno + i); i = fs->fs_frag - frags; ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; fs->fs_fmod = 1; ufs_add32(cgp->cg_frsum[i], 1, needswap); ACTIVECG_CLR(fs, cg); mutex_exit(&ump->um_lock); bdwrite(bp); return (blkno); } bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); #if 0 /* * XXX fvdl mapsearch will panic, and never return -1 * also: returning NULL as daddr_t ? */ if (bno < 0) goto fail; #endif for (i = 0; i < frags; i++) clrbit(blksfree, bno + i); mutex_enter(&ump->um_lock); ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap); fs->fs_cstotal.cs_nffree -= frags; fs->fs_cs(fs, cg).cs_nffree -= frags; fs->fs_fmod = 1; ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap); if (frags != allocsiz) ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap); blkno = cgbase(fs, cg) + bno; ACTIVECG_CLR(fs, cg); mutex_exit(&ump->um_lock); bdwrite(bp); return blkno; fail: if (bp != NULL) brelse(bp, 0); mutex_enter(&ump->um_lock); return (0); } /* * Allocate a block in a cylinder group. * * This algorithm implements the following policy: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate the next available block on the block rotor for the * specified cylinder group. * Note that this routine only allocates fs_bsize blocks; these * blocks may be fragmented by the routine that allocates them. */ static daddr_t ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int realsize, int flags) { struct fs *fs = ip->i_fs; struct cg *cgp; int cg; daddr_t blkno; int32_t bno; u_int8_t *blksfree; const int needswap = UFS_FSNEEDSWAP(fs); KASSERT(mutex_owned(&ip->i_ump->um_lock)); cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp, needswap); if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) { bpref = ufs_rw32(cgp->cg_rotor, needswap); } else { bpref = ffs_blknum(fs, bpref); bno = dtogd(fs, bpref); /* * if the requested block is available, use it */ if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno))) goto gotit; /* * if the requested data block isn't available and we are * trying to allocate a contiguous file, return an error. */ if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG) return (0); } /* * Take the next available block in this cylinder group. 
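 * The search starts at the group's rotor (cg_rotor was used as bpref
 * above when no usable preference was given) and the rotor is advanced
 * afterwards, so successive allocations sweep forward through the map.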
*/ bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); #if 0 /* * XXX jdolecek ffs_mapsearch() succeeds or panics */ if (bno < 0) return (0); #endif cgp->cg_rotor = ufs_rw32(bno, needswap); gotit: blkno = ffs_fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, blkno); ffs_clusteracct(fs, cgp, blkno, -1); ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--; if ((fs->fs_magic == FS_UFS1_MAGIC) && ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { int cylno; cylno = old_cbtocylno(fs, bno); KASSERT(cylno >= 0); KASSERT(cylno < fs->fs_old_ncyl); KASSERT(old_cbtorpos(fs, bno) >= 0); KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos); ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1, needswap); ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap); } fs->fs_fmod = 1; cg = ufs_rw32(cgp->cg_cgx, needswap); blkno = cgbase(fs, cg) + bno; return (blkno); } /* * Determine whether an inode can be allocated. * * Check to see if an inode is available, and if it is, * allocate it using the following policy: * 1) allocate the requested inode. * 2) allocate the next available inode after the requested * inode in the specified cylinder group. */ static daddr_t ffs_nodealloccg(struct inode *ip, u_int cg, daddr_t ipref, int mode, int realsize, int flags) { struct ufsmount *ump = ip->i_ump; struct fs *fs = ip->i_fs; struct cg *cgp; struct buf *bp, *ibp; u_int8_t *inosused; int error, start, len, loc, map, i; int32_t initediblk, maxiblk, irotor; daddr_t nalloc; struct ufs2_dinode *dp2; const int needswap = UFS_FSNEEDSWAP(fs); KASSERT(mutex_owned(&ump->um_lock)); UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp); if (fs->fs_cs(fs, cg).cs_nifree == 0) return (0); mutex_exit(&ump->um_lock); ibp = NULL; if (fs->fs_magic == FS_UFS2_MAGIC) { initediblk = -1; } else { initediblk = fs->fs_ipg; } maxiblk = initediblk; retry: error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, B_MODIFY, &bp); if (error) goto fail; cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0) goto fail; if (ibp != NULL && initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) { /* Another thread allocated more inodes so we retry the test. */ brelse(ibp, 0); ibp = NULL; } /* * Check to see if we need to initialize more inodes. */ if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) { initediblk = ufs_rw32(cgp->cg_initediblk, needswap); maxiblk = initediblk; nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap); if (nalloc + FFS_INOPB(fs) > initediblk && initediblk < ufs_rw32(cgp->cg_niblk, needswap)) { /* * We have to release the cg buffer here to prevent * a deadlock when reading the inode block will * run a copy-on-write that might use this cg. 
*/ brelse(bp, 0); bp = NULL; error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs, ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)), FFS_NOBLK, fs->fs_bsize, false, &ibp); if (error) goto fail; maxiblk += FFS_INOPB(fs); goto retry; } } cgp->cg_old_time = ufs_rw32(time_second, needswap); if ((fs->fs_magic != FS_UFS1_MAGIC) || (fs->fs_old_flags & FS_FLAGS_UPDATED)) cgp->cg_time = ufs_rw64(time_second, needswap); inosused = cg_inosused(cgp, needswap); if (ipref) { ipref %= fs->fs_ipg; /* safeguard to stay in (to be) allocated range */ if (ipref < maxiblk && isclr(inosused, ipref)) goto gotit; } irotor = ufs_rw32(cgp->cg_irotor, needswap); KASSERTMSG(irotor < initediblk, "%s: allocation botch: cg=%d, irotor %d" " out of bounds, initediblk=%d", __func__, cg, irotor, initediblk); start = irotor / NBBY; len = howmany(maxiblk - irotor, NBBY); loc = skpc(0xff, len, &inosused[start]); if (loc == 0) { len = start + 1; start = 0; loc = skpc(0xff, len, &inosused[0]); if (loc == 0) { panic("%s: map corrupted: cg=%d, irotor=%d, fs=%s", __func__, cg, ufs_rw32(cgp->cg_irotor, needswap), fs->fs_fsmnt); /* NOTREACHED */ } } i = start + len - loc; map = inosused[i] ^ 0xff; if (map == 0) { panic("%s: block not in map: fs=%s", __func__, fs->fs_fsmnt); } ipref = i * NBBY + ffs(map) - 1; cgp->cg_irotor = ufs_rw32(ipref, needswap); gotit: KASSERTMSG(ipref < maxiblk, "%s: allocation botch: cg=%d attempt to " "allocate inode index %d beyond max allocated index %d" " of %d inodes/cg", __func__, cg, (int)ipref, maxiblk, cgp->cg_niblk); UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref, mode); /* * Check to see if we need to initialize more inodes. */ if (ibp != NULL) { KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap)); memset(ibp->b_data, 0, fs->fs_bsize); dp2 = (struct ufs2_dinode *)(ibp->b_data); for (i = 0; i < FFS_INOPB(fs); i++) { /* * Don't bother to swap, it's supposed to be * random, after all. */ dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1; dp2++; } initediblk += FFS_INOPB(fs); cgp->cg_initediblk = ufs_rw32(initediblk, needswap); } mutex_enter(&ump->um_lock); ACTIVECG_CLR(fs, cg); setbit(inosused, ipref); ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap); fs->fs_cstotal.cs_nifree--; fs->fs_cs(fs, cg).cs_nifree--; fs->fs_fmod = 1; if ((mode & IFMT) == IFDIR) { ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap); fs->fs_cstotal.cs_ndir++; fs->fs_cs(fs, cg).cs_ndir++; } mutex_exit(&ump->um_lock); if (ibp != NULL) { bwrite(ibp); bwrite(bp); } else bdwrite(bp); return ((ino_t)(cg * fs->fs_ipg + ipref)); fail: if (bp != NULL) brelse(bp, 0); if (ibp != NULL) brelse(ibp, 0); mutex_enter(&ump->um_lock); return (0); } /* * Allocate a block or fragment. * * The specified block or fragment is removed from the * free map, possibly fragmenting a block in the process. 
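 * (Illustrative example: allocating a single 1 KB fragment out of a
 * completely free 8 KB block splits that block, so the free-block
 * count drops by one while the free-fragment count grows by the
 * remaining, still-free fragments.)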
* * This implementation should mirror fs_blkfree * * => um_lock not held on entry or exit */ int ffs_blkalloc(struct inode *ip, daddr_t bno, long size) { int error; error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size, ip->i_dev, ip->i_uid); if (error) return error; return ffs_blkalloc_ump(ip->i_ump, bno, size); } int ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size) { struct fs *fs = ump->um_fs; struct cg *cgp; struct buf *bp; int32_t fragno, cgbno; int i, error, blk, frags, bbase; u_int cg; u_int8_t *blksfree; const int needswap = UFS_FSNEEDSWAP(fs); KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == 0 && ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag); KASSERT(bno < fs->fs_size); cg = dtog(fs, bno); error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, B_MODIFY, &bp); if (error) { return error; } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp, needswap)) { brelse(bp, 0); return EIO; } cgp->cg_old_time = ufs_rw32(time_second, needswap); cgp->cg_time = ufs_rw64(time_second, needswap); cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp, needswap); mutex_enter(&ump->um_lock); if (size == fs->fs_bsize) { fragno = ffs_fragstoblks(fs, cgbno); if (!ffs_isblock(fs, blksfree, fragno)) { mutex_exit(&ump->um_lock); brelse(bp, 0); return EBUSY; } ffs_clrblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, -1); ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cg).cs_nbfree--; } else { bbase = cgbno - ffs_fragnum(fs, cgbno); frags = ffs_numfrags(fs, size); for (i = 0; i < frags; i++) { if (isclr(blksfree, cgbno + i)) { mutex_exit(&ump->um_lock); brelse(bp, 0); return EBUSY; } } /* * if a complete block is being split, account for it */ fragno = ffs_fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap); fs->fs_cstotal.cs_nffree += fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, -1); ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cg).cs_nbfree--; } /* * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap); /* * allocate the fragment */ for (i = 0; i < frags; i++) { clrbit(blksfree, cgbno + i); } ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap); fs->fs_cstotal.cs_nffree -= i; fs->fs_cs(fs, cg).cs_nffree -= i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap); } fs->fs_fmod = 1; ACTIVECG_CLR(fs, cg); mutex_exit(&ump->um_lock); bdwrite(bp); return 0; } /* * Free a block or fragment. * * The specified block or fragment is placed back in the * free map. If a fragment is deallocated, a possible * block reassembly is checked. 
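 *
 * Reassembly example (illustrative): when the fragment being freed is
 * the last in-use piece of its block and all sibling fragments are
 * already free, the whole block is turned back into a free block and
 * the per-cg cluster accounting is updated accordingly.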
* * => um_lock not held on entry or exit */ static void ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size) { struct cg *cgp; struct buf *bp; struct ufsmount *ump; daddr_t cgblkno; int error; u_int cg; dev_t dev; const bool devvp_is_snapshot = (devvp->v_type != VBLK); const int needswap = UFS_FSNEEDSWAP(fs); KASSERT(!devvp_is_snapshot); cg = dtog(fs, bno); dev = devvp->v_rdev; ump = VFSTOUFS(spec_node_getmountedfs(devvp)); KASSERT(fs == ump->um_fs); cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg)); error = bread(devvp, cgblkno, (int)fs->fs_cgsize, B_MODIFY, &bp); if (error) { return; } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp, needswap)) { brelse(bp, 0); return; } ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot); bdwrite(bp); } struct discardopdata { struct work wk; /* must be first */ struct vnode *devvp; daddr_t bno; long size; }; struct discarddata { struct fs *fs; struct discardopdata *entry; long maxsize; kmutex_t entrylk; struct workqueue *wq; int wqcnt, wqdraining; kmutex_t wqlk; kcondvar_t wqcv; /* timer for flush? */ }; static void ffs_blkfree_td(struct fs *fs, struct discardopdata *td) { struct mount *mp = spec_node_getmountedfs(td->devvp); long todo; int error; while (td->size) { todo = uimin(td->size, ffs_lfragtosize(fs, (fs->fs_frag - ffs_fragnum(fs, td->bno)))); error = UFS_WAPBL_BEGIN(mp); if (error) { printf("ffs: failed to begin wapbl transaction" " for discard: %d\n", error); break; } ffs_blkfree_cg(fs, td->devvp, td->bno, todo); UFS_WAPBL_END(mp); td->bno += ffs_numfrags(fs, todo); td->size -= todo; } } static void ffs_discardcb(struct work *wk, void *arg) { struct discardopdata *td = (void *)wk; struct discarddata *ts = arg; struct fs *fs = ts->fs; off_t start, len; #ifdef TRIMDEBUG int error; #endif /* like FSBTODB but emits bytes; XXX move to fs.h */ #ifndef FFS_FSBTOBYTES #define FFS_FSBTOBYTES(fs, b) ((b) << (fs)->fs_fshift) #endif start = FFS_FSBTOBYTES(fs, td->bno); len = td->size; vn_lock(td->devvp, LK_EXCLUSIVE | LK_RETRY); #ifdef TRIMDEBUG error = #endif VOP_FDISCARD(td->devvp, start, len); VOP_UNLOCK(td->devvp); #ifdef TRIMDEBUG printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error); #endif ffs_blkfree_td(fs, td); kmem_free(td, sizeof(*td)); mutex_enter(&ts->wqlk); ts->wqcnt--; if (ts->wqdraining && !ts->wqcnt) cv_signal(&ts->wqcv); mutex_exit(&ts->wqlk); } void * ffs_discard_init(struct vnode *devvp, struct fs *fs) { struct discarddata *ts; int error; ts = kmem_zalloc(sizeof (*ts), KM_SLEEP); error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts, PRI_USER, IPL_NONE, 0); if (error) { kmem_free(ts, sizeof (*ts)); return NULL; } mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE); mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE); cv_init(&ts->wqcv, "trimwqcv"); ts->maxsize = 100*1024; /* XXX */ ts->fs = fs; return ts; } void ffs_discard_finish(void *vts, int flags) { struct discarddata *ts = vts; struct discardopdata *td = NULL; /* wait for workqueue to drain */ mutex_enter(&ts->wqlk); if (ts->wqcnt) { ts->wqdraining = 1; cv_wait(&ts->wqcv, &ts->wqlk); } mutex_exit(&ts->wqlk); mutex_enter(&ts->entrylk); if (ts->entry) { td = ts->entry; ts->entry = NULL; } mutex_exit(&ts->entrylk); if (td) { /* XXX don't tell disk, its optional */ ffs_blkfree_td(ts->fs, td); #ifdef TRIMDEBUG printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size); #endif kmem_free(td, sizeof(*td)); } cv_destroy(&ts->wqcv); mutex_destroy(&ts->entrylk); mutex_destroy(&ts->wqlk); workqueue_destroy(ts->wq); kmem_free(ts, sizeof(*ts)); } void 
ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size, ino_t inum) { struct ufsmount *ump; int error; dev_t dev; struct discarddata *ts; struct discardopdata *td; dev = devvp->v_rdev; ump = VFSTOUFS(spec_node_getmountedfs(devvp)); if (ffs_snapblkfree(fs, devvp, bno, size, inum)) return; error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum); if (error) return; if (!ump->um_discarddata) { ffs_blkfree_cg(fs, devvp, bno, size); return; } #ifdef TRIMDEBUG printf("blkfree(%" PRId64 ",%ld)\n", bno, size); #endif ts = ump->um_discarddata; td = NULL; mutex_enter(&ts->entrylk); if (ts->entry) { td = ts->entry; /* ffs deallocs backwards, check for prepend only */ if (td->bno == bno + ffs_numfrags(fs, size) && td->size + size <= ts->maxsize) { td->bno = bno; td->size += size; if (td->size < ts->maxsize) { #ifdef TRIMDEBUG printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size); #endif mutex_exit(&ts->entrylk); return; } size = 0; /* mark done */ } ts->entry = NULL; } mutex_exit(&ts->entrylk); if (td) { #ifdef TRIMDEBUG printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size); #endif mutex_enter(&ts->wqlk); ts->wqcnt++; mutex_exit(&ts->wqlk); workqueue_enqueue(ts->wq, &td->wk, NULL); } if (!size) return; td = kmem_alloc(sizeof(*td), KM_SLEEP); td->devvp = devvp; td->bno = bno; td->size = size; if (td->size < ts->maxsize) { /* XXX always the case */ mutex_enter(&ts->entrylk); if (!ts->entry) { /* possible race? */ #ifdef TRIMDEBUG printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size); #endif ts->entry = td; td = NULL; } mutex_exit(&ts->entrylk); } if (td) { #ifdef TRIMDEBUG printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size); #endif mutex_enter(&ts->wqlk); ts->wqcnt++; mutex_exit(&ts->wqlk); workqueue_enqueue(ts->wq, &td->wk, NULL); } } /* * Free a block or fragment from a snapshot cg copy. * * The specified block or fragment is placed back in the * free map. If a fragment is deallocated, a possible * block reassembly is checked. 
* * => um_lock not held on entry or exit */ void ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size, ino_t inum) { struct cg *cgp; struct buf *bp; struct ufsmount *ump; daddr_t cgblkno; int error, cg; dev_t dev; const bool devvp_is_snapshot = (devvp->v_type != VBLK); const int needswap = UFS_FSNEEDSWAP(fs); KASSERT(devvp_is_snapshot); cg = dtog(fs, bno); dev = VTOI(devvp)->i_devvp->v_rdev; ump = VFSTOUFS(devvp->v_mount); cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg)); error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum); if (error) return; error = bread(devvp, cgblkno, (int)fs->fs_cgsize, B_MODIFY, &bp); if (error) { return; } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp, needswap)) { brelse(bp, 0); return; } ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot); bdwrite(bp); } static void ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev, struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot) { struct cg *cgp; int32_t fragno, cgbno; int i, blk, frags, bbase; u_int cg; u_int8_t *blksfree; const int needswap = UFS_FSNEEDSWAP(fs); cg = dtog(fs, bno); cgp = (struct cg *)bp->b_data; cgp->cg_old_time = ufs_rw32(time_second, needswap); if ((fs->fs_magic != FS_UFS1_MAGIC) || (fs->fs_old_flags & FS_FLAGS_UPDATED)) cgp->cg_time = ufs_rw64(time_second, needswap); cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp, needswap); mutex_enter(&ump->um_lock); if (size == fs->fs_bsize) { fragno = ffs_fragstoblks(fs, cgbno); if (!ffs_isfreeblock(fs, blksfree, fragno)) { if (devvp_is_snapshot) { mutex_exit(&ump->um_lock); return; } panic("%s: freeing free block: dev = 0x%llx, block = %" PRId64 ", fs = %s", __func__, (unsigned long long)dev, bno, fs->fs_fsmnt); } ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap); fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; if ((fs->fs_magic == FS_UFS1_MAGIC) && ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { i = old_cbtocylno(fs, cgbno); KASSERT(i >= 0); KASSERT(i < fs->fs_old_ncyl); KASSERT(old_cbtorpos(fs, cgbno) >= 0); KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos); ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1, needswap); ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap); } } else { bbase = cgbno - ffs_fragnum(fs, cgbno); /* * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap); /* * deallocate the fragment */ frags = ffs_numfrags(fs, size); for (i = 0; i < frags; i++) { if (isset(blksfree, cgbno + i)) { panic("%s: freeing free frag: " "dev = 0x%llx, block = %" PRId64 ", fs = %s", __func__, (unsigned long long)dev, bno + i, fs->fs_fsmnt); } setbit(blksfree, cgbno + i); } ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap); /* * if a complete block has been reassembled, account for it */ fragno = ffs_fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap); fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap); fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; if ((fs->fs_magic == 
FS_UFS1_MAGIC) && ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { i = old_cbtocylno(fs, bbase); KASSERT(i >= 0); KASSERT(i < fs->fs_old_ncyl); KASSERT(old_cbtorpos(fs, bbase) >= 0); KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bbase) < fs->fs_old_nrpos); ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, bbase)], 1, needswap); ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap); } } } fs->fs_fmod = 1; ACTIVECG_CLR(fs, cg); mutex_exit(&ump->um_lock); } /* * Free an inode. */ int ffs_vfree(struct vnode *vp, ino_t ino, int mode) { return ffs_freefile(vp->v_mount, ino, mode); } /* * Do the actual free operation. * The specified inode is placed back in the free map. * * => um_lock not held on entry or exit */ int ffs_freefile(struct mount *mp, ino_t ino, int mode) { struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs = ump->um_fs; struct vnode *devvp; struct cg *cgp; struct buf *bp; int error; u_int cg; daddr_t cgbno; dev_t dev; const int needswap = UFS_FSNEEDSWAP(fs); cg = ino_to_cg(fs, ino); devvp = ump->um_devvp; dev = devvp->v_rdev; cgbno = FFS_FSBTODB(fs, cgtod(fs, cg)); if (ino >= fs->fs_ipg * fs->fs_ncg) panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__, (long long)dev, (unsigned long long)ino, fs->fs_fsmnt); error = bread(devvp, cgbno, (int)fs->fs_cgsize, B_MODIFY, &bp); if (error) { return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp, needswap)) { brelse(bp, 0); return (0); } ffs_freefile_common(ump, fs, dev, bp, ino, mode, false); bdwrite(bp); return 0; } int ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode) { struct ufsmount *ump; struct cg *cgp; struct buf *bp; int error, cg; daddr_t cgbno; dev_t dev; const int needswap = UFS_FSNEEDSWAP(fs); KASSERT(devvp->v_type != VBLK); cg = ino_to_cg(fs, ino); dev = VTOI(devvp)->i_devvp->v_rdev; ump = VFSTOUFS(devvp->v_mount); cgbno = ffs_fragstoblks(fs, cgtod(fs, cg)); if (ino >= fs->fs_ipg * fs->fs_ncg) panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__, (unsigned long long)dev, (unsigned long long)ino, fs->fs_fsmnt); error = bread(devvp, cgbno, (int)fs->fs_cgsize, B_MODIFY, &bp); if (error) { return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp, needswap)) { brelse(bp, 0); return (0); } ffs_freefile_common(ump, fs, dev, bp, ino, mode, true); bdwrite(bp); return 0; } static void ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev, struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot) { u_int cg; struct cg *cgp; u_int8_t *inosused; const int needswap = UFS_FSNEEDSWAP(fs); ino_t cgino; cg = ino_to_cg(fs, ino); cgp = (struct cg *)bp->b_data; cgp->cg_old_time = ufs_rw32(time_second, needswap); if ((fs->fs_magic != FS_UFS1_MAGIC) || (fs->fs_old_flags & FS_FLAGS_UPDATED)) cgp->cg_time = ufs_rw64(time_second, needswap); inosused = cg_inosused(cgp, needswap); cgino = ino % fs->fs_ipg; if (isclr(inosused, cgino)) { printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n", (unsigned long long)dev, (unsigned long long)ino, fs->fs_fsmnt); if (fs->fs_ronly == 0) panic("%s: freeing free inode", __func__); } clrbit(inosused, cgino); if (!devvp_is_snapshot) UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp, ino, mode); if (cgino < ufs_rw32(cgp->cg_irotor, needswap)) cgp->cg_irotor = ufs_rw32(cgino, needswap); ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap); mutex_enter(&ump->um_lock); fs->fs_cstotal.cs_nifree++; fs->fs_cs(fs, cg).cs_nifree++; if ((mode & IFMT) == IFDIR) { ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap); fs->fs_cstotal.cs_ndir--; 
fs->fs_cs(fs, cg).cs_ndir--; } fs->fs_fmod = 1; ACTIVECG_CLR(fs, cg); mutex_exit(&ump->um_lock); } /* * Check to see if a file is free. */ int ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino) { struct cg *cgp; struct buf *bp; daddr_t cgbno; int ret; u_int cg; u_int8_t *inosused; const bool devvp_is_snapshot = (devvp->v_type != VBLK); KASSERT(devvp_is_snapshot); cg = ino_to_cg(fs, ino); if (devvp_is_snapshot) cgbno = ffs_fragstoblks(fs, cgtod(fs, cg)); else cgbno = FFS_FSBTODB(fs, cgtod(fs, cg)); if (ino >= fs->fs_ipg * fs->fs_ncg) return 1; if (bread(devvp, cgbno, (int)fs->fs_cgsize, 0, &bp)) { return 1; } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) { brelse(bp, 0); return 1; } inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs)); ino %= fs->fs_ipg; ret = isclr(inosused, ino); brelse(bp, 0); return ret; } /* * Find a block of the specified size in the specified cylinder group. * * It is a panic if a request is made to find a block if none are * available. */ static int32_t ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz) { int32_t bno; int start, len, loc, i; int blk, field, subfield, pos; int ostart, olen; u_int8_t *blksfree; const int needswap = UFS_FSNEEDSWAP(fs); /* KASSERT(mutex_owned(&ump->um_lock)); */ /* * find the fragment by searching through the free block * map for an appropriate bit pattern */ if (bpref) start = dtogd(fs, bpref) / NBBY; else start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY; blksfree = cg_blksfree(cgp, needswap); len = howmany(fs->fs_fpg, NBBY) - start; ostart = start; olen = len; loc = scanc((u_int)len, (const u_char *)&blksfree[start], (const u_char *)fragtbl[fs->fs_frag], (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1))))); if (loc == 0) { len = start + 1; start = 0; loc = scanc((u_int)len, (const u_char *)&blksfree[0], (const u_char *)fragtbl[fs->fs_frag], (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1))))); if (loc == 0) { panic("%s: map corrupted: start=%d, len=%d, " "fs = %s, offset=%d/%ld, cg %d", __func__, ostart, olen, fs->fs_fsmnt, ufs_rw32(cgp->cg_freeoff, needswap), (long)blksfree - (long)cgp, cgp->cg_cgx); /* NOTREACHED */ } } bno = (start + len - loc) * NBBY; cgp->cg_frotor = ufs_rw32(bno, needswap); /* * found the byte in the map * sift through the bits to find the selected frag */ for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { blk = blkmap(fs, blksfree, bno); blk <<= 1; field = around[allocsiz]; subfield = inside[allocsiz]; for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { if ((blk & field) == subfield) return (bno + pos); field <<= 1; subfield <<= 1; } } panic("%s: block not in map: bno=%d, fs=%s", __func__, bno, fs->fs_fsmnt); /* return (-1); */ } /* * Fserr prints the name of a file system with an error diagnostic. * * The form of the error message is: * fs: error message */ static void ffs_fserr(struct fs *fs, kauth_cred_t cred, const char *cp) { KASSERT(cred != NULL); if (cred == NOCRED || cred == FSCRED) { log(LOG_ERR, "pid %d, command %s, on %s: %s\n", curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp); } else { log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n", kauth_cred_getuid(cred), curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp); } }
/* $NetBSD: ipi.c,v 1.30 2019/12/01 15:34:46 ad Exp $ */ /*- * Copyright (c) 2000, 2008, 2009, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by RedBack Networks Inc. * * Author: Bill Sommerfeld * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ipi.c,v 1.30 2019/12/01 15:34:46 ad Exp $"); #include "opt_mtrr.h" #include <sys/param.h> #include <sys/device.h> #include <sys/systm.h> #include <sys/atomic.h> #include <sys/intr.h> #include <sys/ipi.h> #include <sys/cpu.h> #include <sys/xcall.h> #ifdef MULTIPROCESSOR #include <machine/cpufunc.h> #include <machine/cpuvar.h> #include <machine/i82093var.h> #include <machine/i82489reg.h> #include <machine/i82489var.h> #include <machine/mtrr.h> #include <machine/gdt.h> #include "acpica.h" #include <x86/fpu.h> static void x86_ipi_ast(struct cpu_info *); static void x86_ipi_halt(struct cpu_info *); static void x86_ipi_kpreempt(struct cpu_info *); static void x86_ipi_xcall(struct cpu_info *); static void x86_ipi_generic(struct cpu_info *); #ifdef MTRR static void x86_ipi_reload_mtrr(struct cpu_info *); #else #define x86_ipi_reload_mtrr NULL #endif #if NACPICA > 0 void acpi_cpu_sleep(struct cpu_info *); #else #define acpi_cpu_sleep NULL #endif static void x86_ipi_synch_fpu(struct cpu_info *); void (* const ipifunc[X86_NIPI])(struct cpu_info *) = { x86_ipi_halt, /* X86_IPI_HALT */ x86_ipi_ast, /* X86_IPI_AST */ x86_ipi_generic, /* X86_IPI_GENERIC */ x86_ipi_synch_fpu, /* X86_IPI_SYNCH_FPU */ x86_ipi_reload_mtrr, /* X86_IPI_MTRR */ NULL, /* X86_IPI_GDT */ x86_ipi_xcall, /* X86_IPI_XCALL */ acpi_cpu_sleep, /* X86_IPI_ACPI_CPU_SLEEP */ x86_ipi_kpreempt /* X86_IPI_KPREEMPT */ }; /* * x86 IPI interface. */ int x86_send_ipi(struct cpu_info *ci, int ipimask) { uint32_t o, n; int ret = 0; /* Don't send IPI to CPU which isn't (yet) running. */ if (__predict_false((ci->ci_flags & CPUF_RUNNING) == 0)) return ENOENT; /* Set in new IPI bit, and capture previous state. */ for (o = 0;; o = n) { n = atomic_cas_32(&ci->ci_ipis, o, o | ipimask); if (__predict_true(o == n)) { break; } } /* If no IPI already pending, send one. */ if (o == 0) { ret = x86_ipi(LAPIC_IPI_VECTOR, ci->ci_cpuid, LAPIC_DLMODE_FIXED); if (ret != 0) { printf("ipi of %x from %s to %s failed\n", ipimask, device_xname(curcpu()->ci_dev), device_xname(ci->ci_dev)); } } return ret; } void x86_broadcast_ipi(int ipimask) { struct cpu_info *ci, *self = curcpu(); int count = 0; CPU_INFO_ITERATOR cii; for (CPU_INFO_FOREACH(cii, ci)) { if (ci == self) continue; if ((ci->ci_flags & CPUF_RUNNING) == 0) continue; atomic_or_32(&ci->ci_ipis, ipimask); count++; } if (!count) return; x86_ipi(LAPIC_IPI_VECTOR, LAPIC_DEST_ALLEXCL, LAPIC_DLMODE_FIXED); } void x86_ipi_handler(void) { struct cpu_info *ci = curcpu(); uint32_t pending; int bit; pending = atomic_swap_32(&ci->ci_ipis, 0); KDASSERT((pending >> X86_NIPI) == 0); while ((bit = ffs(pending)) != 0) { bit--; pending &= ~(1 << bit); ci->ci_ipi_events[bit].ev_count++; (*ipifunc[bit])(ci); } } /* * Common x86 IPI handlers. */ static void x86_ipi_halt(struct cpu_info *ci) { x86_disable_intr(); atomic_and_32(&ci->ci_flags, ~CPUF_RUNNING); for (;;) { x86_hlt(); } } static void x86_ipi_synch_fpu(struct cpu_info *ci) { panic("%s: impossible", __func__); } #ifdef MTRR static void x86_ipi_reload_mtrr(struct cpu_info *ci) { if (mtrr_funcs != NULL) { /* * mtrr_reload_cpu() is a macro in mtrr.h which picks * the appropriate function to use. */ mtrr_reload_cpu(ci); } } #endif static void x86_ipi_kpreempt(struct cpu_info *ci) { softint_trigger(1 << SIR_PREEMPT); } static void x86_ipi_ast(struct cpu_info *ci) { aston(ci->ci_onproc); } /* * MD support for xcall(9) interface. 
*/ static void x86_ipi_xcall(struct cpu_info *ci) { xc_ipi_handler(); } static void x86_ipi_generic(struct cpu_info *ci) { ipi_cpu_handler(); } void xc_send_ipi(struct cpu_info *ci) { KASSERT(kpreempt_disabled()); KASSERT(curcpu() != ci); if (ci) { /* Unicast: remote CPU. */ x86_send_ipi(ci, X86_IPI_XCALL); } else { /* Broadcast: all, but local CPU (caller will handle it). */ x86_broadcast_ipi(X86_IPI_XCALL); } } void cpu_ipi(struct cpu_info *ci) { KASSERT(kpreempt_disabled()); KASSERT(curcpu() != ci); if (ci) { /* Unicast: remote CPU. */ x86_send_ipi(ci, X86_IPI_GENERIC); } else { /* Broadcast: all, but local CPU (caller will handle it). */ x86_broadcast_ipi(X86_IPI_GENERIC); } } #else int x86_send_ipi(struct cpu_info *ci, int ipimask) { return 0; } void x86_broadcast_ipi(int ipimask) { } void cpu_ipi(struct cpu_info *ci) { } #endif
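x86_send_ipi() above records the request in ci_ipis with a compare-and-swap loop and only raises the hardware IPI when the mask was previously empty; x86_ipi_handler() later drains the mask with an atomic swap and dispatches one handler per bit. The sketch below shows that pattern with C11 atomics; pending, send_soft_ipi() and handle_ipis() are illustrative stand-ins, not the kernel interfaces.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int pending;	/* stand-in for ci->ci_ipis */

static void
send_soft_ipi(unsigned int mask)
{
	unsigned int o = atomic_load(&pending);

	/* Set the new bits, capturing the previous state. */
	while (!atomic_compare_exchange_weak(&pending, &o, o | mask))
		continue;

	/* Only raise the (simulated) hardware IPI on a 0 -> nonzero edge. */
	if (o == 0)
		printf("hardware IPI sent (mask now %#x)\n", o | mask);
}

static void
handle_ipis(void)
{
	unsigned int bits = atomic_exchange(&pending, 0);

	for (int bit = 0; bits != 0; bit++, bits >>= 1)
		if (bits & 1)
			printf("dispatch IPI handler %d\n", bit);
}

int
main(void)
{
	send_soft_ipi(0x01);	/* raises the hardware IPI */
	send_soft_ipi(0x04);	/* already pending: just sets the bit */
	handle_ipis();		/* drains and runs handlers 0 and 2 */
	return 0;
}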
/* $NetBSD: at_control.c,v 1.44 2023/03/30 15:58:10 riastradh Exp $ */ /* * Copyright (c) 1990,1994 Regents of The University of Michigan. * All Rights Reserved.
* * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby granted, * provided that the above copyright notice appears in all copies and * that both that copyright notice and this permission notice appear * in supporting documentation, and that the name of The University * of Michigan not be used in advertising or publicity pertaining to * distribution of the software without specific, written prior * permission. This software is supplied as is without expressed or * implied warranties of any kind. * * This product includes software developed by the University of * California, Berkeley and its contributors. * * Research Systems Unix Group * The University of Michigan * c/o Wesley Craig * 535 W. William Street * Ann Arbor, Michigan * +1-313-764-2278 * netatalk@umich.edu */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: at_control.c,v 1.44 2023/03/30 15:58:10 riastradh Exp $"); #include "opt_atalk.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/mbuf.h> #include <sys/kernel.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/kauth.h> #include <net/if.h> #include <net/route.h> #include <net/if_ether.h> #include <netinet/in.h> #undef s_net #include <netatalk/at.h> #include <netatalk/at_var.h> #include <netatalk/aarp.h> #include <netatalk/phase2.h> #include <netatalk/at_extern.h> static int aa_dorangeroute(struct ifaddr * ifa, u_int first, u_int last, int cmd); static int aa_addsingleroute(struct ifaddr * ifa, struct at_addr * addr, struct at_addr * mask); static int aa_delsingleroute(struct ifaddr * ifa, struct at_addr * addr, struct at_addr * mask); static int aa_dosingleroute(struct ifaddr * ifa, struct at_addr * addr, struct at_addr * mask, int cmd, int flags); static int at_scrub(struct ifnet * ifp, struct at_ifaddr * aa); static int at_ifinit(struct ifnet *, struct at_ifaddr *, const struct sockaddr_at *); #if 0 static void aa_clean(void); #endif #define sateqaddr(a,b) ((a)->sat_len == (b)->sat_len && \ (a)->sat_family == (b)->sat_family && \ (a)->sat_addr.s_net == (b)->sat_addr.s_net && \ (a)->sat_addr.s_node == (b)->sat_addr.s_node ) int at_control(u_long cmd, void *data, struct ifnet *ifp) { struct ifreq *ifr = (struct ifreq *) data; const struct sockaddr_at *csat; struct netrange *nr; const struct netrange *cnr; struct at_aliasreq *ifra = (struct at_aliasreq *) data; struct at_ifaddr *aa0; struct at_ifaddr *aa = 0; /* * If we have an ifp, then find the matching at_ifaddr if it exists */ if (ifp) TAILQ_FOREACH(aa, &at_ifaddr, aa_list) if (aa->aa_ifp == ifp) break; /* * In this first switch table we are basically getting ready for * the second one, by getting the atalk-specific things set up * so that they start to look more similar to other protocols etc. */ switch (cmd) { case SIOCAIFADDR: case SIOCDIFADDR: /* * If we have an appletalk sockaddr, scan forward of where * we are now on the at_ifaddr list to find one with a matching * address on this interface. * This may leave aa pointing to the first address on the * NEXT interface! 
*/ if (ifra->ifra_addr.sat_family == AF_APPLETALK) { for (; aa; aa = TAILQ_NEXT(aa, aa_list)) if (aa->aa_ifp == ifp && sateqaddr(&aa->aa_addr, &ifra->ifra_addr)) break; } /* * If we a retrying to delete an address but didn't find such, * then return with an error */ if (cmd == SIOCDIFADDR && aa == 0) return (EADDRNOTAVAIL); /* FALLTHROUGH */ case SIOCSIFADDR: /* * If we are not superuser, then we don't get to do these * ops. */ if (kauth_authorize_network(kauth_cred_get(), KAUTH_NETWORK_INTERFACE, KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, (void *)cmd, NULL) != 0) return (EPERM); csat = satocsat(ifreq_getaddr(cmd, ifr)); cnr = (const struct netrange *)csat->sat_zero; if (cnr->nr_phase == 1) { /* * Look for a phase 1 address on this interface. * This may leave aa pointing to the first address on * the NEXT interface! */ for (; aa; aa = TAILQ_NEXT(aa, aa_list)) { if (aa->aa_ifp == ifp && (aa->aa_flags & AFA_PHASE2) == 0) break; } } else { /* default to phase 2 */ /* * Look for a phase 2 address on this interface. * This may leave aa pointing to the first address on * the NEXT interface! */ for (; aa; aa = TAILQ_NEXT(aa, aa_list)) { if (aa->aa_ifp == ifp && (aa->aa_flags & AFA_PHASE2)) break; } } if (ifp == 0) panic("at_control"); /* * If we failed to find an existing at_ifaddr entry, then we * allocate a fresh one. * XXX change this to use malloc */ if (aa == (struct at_ifaddr *) 0) { aa = (struct at_ifaddr *) malloc(sizeof(struct at_ifaddr), M_IFADDR, M_WAITOK|M_ZERO); if (aa == NULL) return (ENOBUFS); callout_init(&aa->aa_probe_ch, 0); if ((aa0 = TAILQ_FIRST(&at_ifaddr)) != NULL) { /* * Don't let the loopback be first, since the * first address is the machine's default * address for binding. * If it is, stick ourself in front, otherwise * go to the back of the list. */ if (aa0->aa_ifp->if_flags & IFF_LOOPBACK) { TAILQ_INSERT_HEAD(&at_ifaddr, aa, aa_list); } else { TAILQ_INSERT_TAIL(&at_ifaddr, aa, aa_list); } } else { TAILQ_INSERT_TAIL(&at_ifaddr, aa, aa_list); } ifaref(&aa->aa_ifa); ifa_psref_init(&aa->aa_ifa); /* * Find the end of the interface's addresses * and link our new one on the end */ ifa_insert(ifp, &aa->aa_ifa); /* * As the at_ifaddr contains the actual sockaddrs, * and the ifaddr itself, link them al together * correctly. */ aa->aa_ifa.ifa_addr = (struct sockaddr *) &aa->aa_addr; aa->aa_ifa.ifa_dstaddr = (struct sockaddr *) &aa->aa_addr; aa->aa_ifa.ifa_netmask = (struct sockaddr *) &aa->aa_netmask; /* * Set/clear the phase 2 bit. */ if (cnr->nr_phase == 1) aa->aa_flags &= ~AFA_PHASE2; else aa->aa_flags |= AFA_PHASE2; /* * and link it all together */ aa->aa_ifp = ifp; } else { /* * If we DID find one then we clobber any routes * dependent on it.. 
*/ at_scrub(ifp, aa); } break; case SIOCGIFADDR: csat = satocsat(ifreq_getaddr(cmd, ifr)); cnr = (const struct netrange *)csat->sat_zero; if (cnr->nr_phase == 1) { /* * If the request is specifying phase 1, then * only look at a phase one address */ for (; aa; aa = TAILQ_NEXT(aa, aa_list)) { if (aa->aa_ifp == ifp && (aa->aa_flags & AFA_PHASE2) == 0) break; } } else if (cnr->nr_phase == 2) { /* * If the request is specifying phase 2, then * only look at a phase two address */ for (; aa; aa = TAILQ_NEXT(aa, aa_list)) { if (aa->aa_ifp == ifp && (aa->aa_flags & AFA_PHASE2)) break; } } else { /* * default to everything */ for (; aa; aa = TAILQ_NEXT(aa, aa_list)) { if (aa->aa_ifp == ifp) break; } } if (aa == (struct at_ifaddr *) 0) return (EADDRNOTAVAIL); break; } /* * By the time this switch is run we should be able to assume that * the "aa" pointer is valid when needed. */ switch (cmd) { case SIOCGIFADDR: { union { struct sockaddr sa; struct sockaddr_at sat; } u; /* * copy the contents of the sockaddr blindly. */ sockaddr_copy(&u.sa, sizeof(u), (const struct sockaddr *)&aa->aa_addr); /* * and do some cleanups */ nr = (struct netrange *)&u.sat.sat_zero; nr->nr_phase = (aa->aa_flags & AFA_PHASE2) ? 2 : 1; nr->nr_firstnet = aa->aa_firstnet; nr->nr_lastnet = aa->aa_lastnet; ifreq_setaddr(cmd, ifr, &u.sa); break; } case SIOCSIFADDR: return at_ifinit(ifp, aa, (const struct sockaddr_at *)ifreq_getaddr(cmd, ifr)); case SIOCAIFADDR: if (sateqaddr(&ifra->ifra_addr, &aa->aa_addr)) return 0; return at_ifinit(ifp, aa, (const struct sockaddr_at *)ifreq_getaddr(cmd, ifr)); case SIOCDIFADDR: at_purgeaddr(&aa->aa_ifa); break; default: return ENOTTY; } return (0); } void at_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct at_ifaddr *aa = (void *) ifa; /* * scrub all routes.. didn't we just DO this? XXX yes, del it * XXX above XXX not necessarily true anymore */ at_scrub(ifp, aa); /* * remove the ifaddr from the interface */ ifa_remove(ifp, &aa->aa_ifa); TAILQ_REMOVE(&at_ifaddr, aa, aa_list); ifafree(&aa->aa_ifa); } void at_purgeif(struct ifnet *ifp) { if_purgeaddrs(ifp, AF_APPLETALK, at_purgeaddr); } /* * Given an interface and an at_ifaddr (supposedly on that interface) remove * any routes that depend on this. Why ifp is needed I'm not sure, as * aa->at_ifaddr.ifa_ifp should be the same. */ static int at_scrub(struct ifnet *ifp, struct at_ifaddr *aa) { int error = 0; if (aa->aa_flags & AFA_ROUTE) { if (ifp->if_flags & IFF_LOOPBACK) error = aa_delsingleroute(&aa->aa_ifa, &aa->aa_addr.sat_addr, &aa->aa_netmask.sat_addr); else if (ifp->if_flags & IFF_POINTOPOINT) error = rtinit(&aa->aa_ifa, RTM_DELETE, RTF_HOST); else if (ifp->if_flags & IFF_BROADCAST) error = aa_dorangeroute(&aa->aa_ifa, ntohs(aa->aa_firstnet), ntohs(aa->aa_lastnet), RTM_DELETE); aa->aa_ifa.ifa_flags &= ~IFA_ROUTE; aa->aa_flags &= ~AFA_ROUTE; } return error; } /* * given an at_ifaddr,a sockaddr_at and an ifp, * bang them all together at high speed and see what happens */ static int at_ifinit(struct ifnet *ifp, struct at_ifaddr *aa, const struct sockaddr_at *sat) { struct netrange nr, onr; struct sockaddr_at oldaddr; int s = splnet(), error = 0, i, j; int netinc, nodeinc, nnets; u_short net; /* * save the old addresses in the at_ifaddr just in case we need them. */ oldaddr = aa->aa_addr; onr.nr_firstnet = aa->aa_firstnet; onr.nr_lastnet = aa->aa_lastnet; /* * take the address supplied as an argument, and add it to the * at_ifnet (also given). 
Remember ing to update * those parts of the at_ifaddr that need special processing */ memset(AA_SAT(aa), 0, sizeof(struct sockaddr_at)); memcpy(&nr, sat->sat_zero, sizeof(struct netrange)); memcpy(AA_SAT(aa)->sat_zero, sat->sat_zero, sizeof(struct netrange)); nnets = ntohs(nr.nr_lastnet) - ntohs(nr.nr_firstnet) + 1; aa->aa_firstnet = nr.nr_firstnet; aa->aa_lastnet = nr.nr_lastnet; #ifdef NETATALKDEBUG printf("at_ifinit: %s: %u.%u range %u-%u phase %d\n", ifp->if_xname, ntohs(sat->sat_addr.s_net), sat->sat_addr.s_node, ntohs(aa->aa_firstnet), ntohs(aa->aa_lastnet), (aa->aa_flags & AFA_PHASE2) ? 2 : 1); #endif /* * We could eliminate the need for a second phase 1 probe (post * autoconf) if we check whether we're resetting the node. Note * that phase 1 probes use only nodes, not net.node pairs. Under * phase 2, both the net and node must be the same. */ AA_SAT(aa)->sat_len = sizeof(struct sockaddr_at); AA_SAT(aa)->sat_family = AF_APPLETALK; if (ifp->if_flags & IFF_LOOPBACK) { AA_SAT(aa)->sat_addr.s_net = sat->sat_addr.s_net; AA_SAT(aa)->sat_addr.s_node = sat->sat_addr.s_node; #if 0 } else if (fp->if_flags & IFF_POINTOPOINT) { /* unimplemented */ /* * we'd have to copy the dstaddr field over from the sat * but it's not clear that it would contain the right info.. */ #endif } else { /* * We are a normal (probably ethernet) interface. * apply the new address to the interface structures etc. * We will probe this address on the net first, before * applying it to ensure that it is free.. If it is not, then * we will try a number of other randomly generated addresses * in this net and then increment the net. etc.etc. until * we find an unused address. */ aa->aa_flags |= AFA_PROBING; /* if not loopback we Must * probe? */ if (aa->aa_flags & AFA_PHASE2) { if (sat->sat_addr.s_net == ATADDR_ANYNET) { /* * If we are phase 2, and the net was not * specified * then we select a random net * within the supplied netrange. * XXX use /dev/random? */ if (nnets != 1) { net = ntohs(nr.nr_firstnet) + time_second % (nnets - 1); } else { net = ntohs(nr.nr_firstnet); } } else { /* * if a net was supplied, then check that it * is within the netrange. If it is not then * replace the old values and return an error */ if (ntohs(sat->sat_addr.s_net) < ntohs(nr.nr_firstnet) || ntohs(sat->sat_addr.s_net) > ntohs(nr.nr_lastnet)) { aa->aa_addr = oldaddr; aa->aa_firstnet = onr.nr_firstnet; aa->aa_lastnet = onr.nr_lastnet; splx(s); return (EINVAL); } /* * otherwise just use the new net number.. */ net = ntohs(sat->sat_addr.s_net); } } else { /* * we must be phase one, so just use whatever we were * given. I guess it really isn't going to be used... * RIGHT? */ net = ntohs(sat->sat_addr.s_net); } /* * set the node part of the address into the ifaddr. If it's * not specified, be random about it... XXX use /dev/random? */ if (sat->sat_addr.s_node == ATADDR_ANYNODE) { AA_SAT(aa)->sat_addr.s_node = time_second; } else { AA_SAT(aa)->sat_addr.s_node = sat->sat_addr.s_node; } /* * step through the nets in the range starting at the * (possibly random) start point. */ for (i = nnets, netinc = 1; i > 0; net = ntohs(nr.nr_firstnet) + ((net - ntohs(nr.nr_firstnet) + netinc) % nnets), i--) { AA_SAT(aa)->sat_addr.s_net = htons(net); /* * using a rather strange stepping method, * stagger through the possible node addresses * Once again, starting at the (possibly random) * initial node address. 
*/ for (j = 0, nodeinc = time_second | 1; j < 256; j++, AA_SAT(aa)->sat_addr.s_node += nodeinc) { if (AA_SAT(aa)->sat_addr.s_node > 253 || AA_SAT(aa)->sat_addr.s_node < 1) { continue; } aa->aa_probcnt = 10; /* * start off the probes as an asynchronous * activity. though why wait 200mSec? */ callout_reset(&aa->aa_probe_ch, hz / 5, aarpprobe, ifp); if (tsleep(aa, PPAUSE | PCATCH, "at_ifinit", 0)) { /* * theoretically we shouldn't time out * here so if we returned with an error. */ printf("at_ifinit: timeout?!\n"); aa->aa_addr = oldaddr; aa->aa_firstnet = onr.nr_firstnet; aa->aa_lastnet = onr.nr_lastnet; splx(s); return (EINTR); } /* * The async activity should have woken us * up. We need to see if it was successful in * finding a free spot, or if we need to * iterate to the next address to try. */ if ((aa->aa_flags & AFA_PROBING) == 0) break; } /* * of course we need to break out through two loops... */ if ((aa->aa_flags & AFA_PROBING) == 0) break; /* reset node for next network */ AA_SAT(aa)->sat_addr.s_node = time_second; } /* * if we are still trying to probe, then we have finished all * the possible addresses, so we need to give up */ if (aa->aa_flags & AFA_PROBING) { aa->aa_addr = oldaddr; aa->aa_firstnet = onr.nr_firstnet; aa->aa_lastnet = onr.nr_lastnet; splx(s); return (EADDRINUSE); } } /* * Now that we have selected an address, we need to tell the * interface about it, just in case it needs to adjust something. */ if ((error = if_addr_init(ifp, &aa->aa_ifa, true)) != 0) { /* * of course this could mean that it objects violently * so if it does, we back out again.. */ aa->aa_addr = oldaddr; aa->aa_firstnet = onr.nr_firstnet; aa->aa_lastnet = onr.nr_lastnet; splx(s); return (error); } /* * set up the netmask part of the at_ifaddr and point the appropriate * pointer in the ifaddr to it. probably pointless, but what the * heck.. 
XXX */ memset(&aa->aa_netmask, 0, sizeof(aa->aa_netmask)); aa->aa_netmask.sat_len = sizeof(struct sockaddr_at); aa->aa_netmask.sat_family = AF_APPLETALK; aa->aa_netmask.sat_addr.s_net = 0xffff; aa->aa_netmask.sat_addr.s_node = 0; #if 0 aa->aa_ifa.ifa_netmask = (struct sockaddr *) &(aa->aa_netmask);/* XXX */ #endif /* * Initialize broadcast (or remote p2p) address */ memset(&aa->aa_broadaddr, 0, sizeof(aa->aa_broadaddr)); aa->aa_broadaddr.sat_len = sizeof(struct sockaddr_at); aa->aa_broadaddr.sat_family = AF_APPLETALK; aa->aa_ifa.ifa_metric = ifp->if_metric; if (ifp->if_flags & IFF_BROADCAST) { aa->aa_broadaddr.sat_addr.s_net = htons(ATADDR_ANYNET); aa->aa_broadaddr.sat_addr.s_node = ATADDR_BCAST; aa->aa_ifa.ifa_broadaddr = (struct sockaddr *) &aa->aa_broadaddr; /* add the range of routes needed */ error = aa_dorangeroute(&aa->aa_ifa, ntohs(aa->aa_firstnet), ntohs(aa->aa_lastnet), RTM_ADD); } else if (ifp->if_flags & IFF_POINTOPOINT) { struct at_addr rtaddr, rtmask; memset(&rtaddr, 0, sizeof(rtaddr)); memset(&rtmask, 0, sizeof(rtmask)); /* fill in the far end if we know it here XXX */ aa->aa_ifa.ifa_dstaddr = (struct sockaddr *) & aa->aa_dstaddr; error = aa_addsingleroute(&aa->aa_ifa, &rtaddr, &rtmask); } else if (ifp->if_flags & IFF_LOOPBACK) { struct at_addr rtaddr, rtmask; memset(&rtaddr, 0, sizeof(rtaddr)); memset(&rtmask, 0, sizeof(rtmask)); rtaddr.s_net = AA_SAT(aa)->sat_addr.s_net; rtaddr.s_node = AA_SAT(aa)->sat_addr.s_node; rtmask.s_net = 0xffff; rtmask.s_node = 0x0; error = aa_addsingleroute(&aa->aa_ifa, &rtaddr, &rtmask); } /* * of course if we can't add these routes we back out, but it's getting * risky by now XXX */ if (error) { at_scrub(ifp, aa); aa->aa_addr = oldaddr; aa->aa_firstnet = onr.nr_firstnet; aa->aa_lastnet = onr.nr_lastnet; splx(s); return (error); } /* * note that the address has a route associated with it.... */ aa->aa_ifa.ifa_flags |= IFA_ROUTE; aa->aa_flags |= AFA_ROUTE; splx(s); return (0); } /* * check whether a given address is a broadcast address for us.. */ int at_broadcast(const struct sockaddr_at *sat) { struct at_ifaddr *aa; /* * If the node is not right, it can't be a broadcast */ if (sat->sat_addr.s_node != ATADDR_BCAST) return 0; /* * If the node was right then if the net is right, it's a broadcast */ if (sat->sat_addr.s_net == ATADDR_ANYNET) return 1; /* * failing that, if the net is one we have, it's a broadcast as well. */ TAILQ_FOREACH(aa, &at_ifaddr, aa_list) { if ((aa->aa_ifp->if_flags & IFF_BROADCAST) && (ntohs(sat->sat_addr.s_net) >= ntohs(aa->aa_firstnet) && ntohs(sat->sat_addr.s_net) <= ntohs(aa->aa_lastnet))) return 1; } return 0; } /* * aa_dorangeroute() * * Add a route for a range of networks from bot to top - 1. * Algorithm: * * Split the range into two subranges such that the middle * of the two ranges is the point where the highest bit of difference * between the two addresses, makes its transition * Each of the upper and lower ranges might not exist, or might be * representable by 1 or more netmasks. In addition, if both * ranges can be represented by the same netmask, then teh can be merged * by using the next higher netmask.. */ static int aa_dorangeroute(struct ifaddr *ifa, u_int bot, u_int top, int cmd) { u_int mask1; struct at_addr addr; struct at_addr mask; int error; /* * slight sanity check */ if (bot > top) return (EINVAL); addr.s_node = 0; mask.s_node = 0; /* * just start out with the lowest boundary * and keep extending the mask till it's too big. 
*/ while (bot <= top) { mask1 = 1; while (((bot & ~mask1) >= bot) && ((bot | mask1) <= top)) { mask1 <<= 1; mask1 |= 1; } mask1 >>= 1; mask.s_net = htons(~mask1); addr.s_net = htons(bot); if (cmd == RTM_ADD) { error = aa_addsingleroute(ifa, &addr, &mask); if (error) { /* XXX clean up? */ return (error); } } else { error = aa_delsingleroute(ifa, &addr, &mask); } bot = (bot | mask1) + 1; } return 0; } static int aa_addsingleroute(struct ifaddr *ifa, struct at_addr *addr, struct at_addr *mask) { int error; #ifdef NETATALKDEBUG printf("aa_addsingleroute: %x.%x mask %x.%x ...", ntohs(addr->s_net), addr->s_node, ntohs(mask->s_net), mask->s_node); #endif error = aa_dosingleroute(ifa, addr, mask, RTM_ADD, RTF_UP); #ifdef NETATALKDEBUG if (error) printf("aa_addsingleroute: error %d\n", error); #endif return (error); } static int aa_delsingleroute(struct ifaddr *ifa, struct at_addr *addr, struct at_addr *mask) { int error; #ifdef NETATALKDEBUG printf("aa_delsingleroute: %x.%x mask %x.%x ...", ntohs(addr->s_net), addr->s_node, ntohs(mask->s_net), mask->s_node); #endif error = aa_dosingleroute(ifa, addr, mask, RTM_DELETE, 0); #ifdef NETATALKDEBUG if (error) printf("aa_delsingleroute: error %d\n", error); #endif return (error); } static int aa_dosingleroute(struct ifaddr *ifa, struct at_addr *at_addr, struct at_addr *at_mask, int cmd, int flags) { struct sockaddr_at addr, mask, *gate; memset(&addr, 0, sizeof(addr)); memset(&mask, 0, sizeof(mask)); addr.sat_family = AF_APPLETALK; addr.sat_len = sizeof(struct sockaddr_at); addr.sat_addr.s_net = at_addr->s_net; addr.sat_addr.s_node = at_addr->s_node; mask.sat_family = AF_APPLETALK; mask.sat_len = sizeof(struct sockaddr_at); mask.sat_addr.s_net = at_mask->s_net; mask.sat_addr.s_node = at_mask->s_node; if (at_mask->s_node) { gate = satosat(ifa->ifa_dstaddr); flags |= RTF_HOST; } else { gate = satosat(ifa->ifa_addr); } #ifdef NETATALKDEBUG printf("on %s %x.%x\n", (flags & RTF_HOST) ? "host" : "net", ntohs(gate->sat_addr.s_net), gate->sat_addr.s_node); #endif return (rtrequest(cmd, (struct sockaddr *) &addr, (struct sockaddr *) gate, (struct sockaddr *) &mask, flags, NULL)); } #if 0 static void aa_clean(void) { struct at_ifaddr *aa; struct ifaddr *ifa; struct ifnet *ifp; while ((aa = TAILQ_FIRST(&at_ifaddr)) != NULL) { TAILQ_REMOVE(&at_ifaddr, aa, aa_list); ifp = aa->aa_ifp; at_scrub(ifp, aa); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa == &aa->aa_ifa) break; } if (ifa == NULL) panic("aa not present"); ifa_remove(ifp, ifa); } } #endif
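aa_dorangeroute() above covers a network range with as few (net, mask) routes as possible by repeatedly growing a power-of-two block while the lower bound stays aligned and the block still fits under the upper bound. The standalone sketch below reproduces that loop and just prints what aa_addsingleroute()/aa_delsingleroute() would be handed; byte-order conversion and the actual route calls are omitted, and the function name is made up for the example.

#include <stdio.h>

static void
print_range_routes(unsigned int bot, unsigned int top)
{
	while (bot <= top) {
		unsigned int mask1 = 1;

		/* Grow the block while bot stays aligned and it still fits. */
		while (((bot & ~mask1) >= bot) && ((bot | mask1) <= top)) {
			mask1 <<= 1;
			mask1 |= 1;
		}
		mask1 >>= 1;

		printf("route net %#x mask %#x (covers %#x-%#x)\n",
		    bot, ~mask1 & 0xffff, bot, bot | mask1);
		bot = (bot | mask1) + 1;
	}
}

int
main(void)
{
	/* netrange 0x0005-0x000b splits into 5, 6-7 and 8-b */
	print_range_routes(0x0005, 0x000b);
	return 0;
}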
/* $NetBSD: trap.c,v 1.129 2023/10/05 19:41:03 ad Exp $ */ /* * Copyright (c) 1998, 2000, 2017 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum, and by Maxime Villard. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.129 2023/10/05 19:41:03 ad Exp $"); #include "opt_ddb.h" #include "opt_kgdb.h" #include "opt_xen.h" #include "opt_dtrace.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/acct.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/ras.h> #include <sys/signal.h> #include <sys/syscall.h> #include <sys/cpu.h> #include <sys/ucontext.h> #include <sys/module_hook.h> #include <sys/compat_stub.h> #include <uvm/uvm_extern.h> #include <machine/cpufunc.h> #include <x86/fpu.h> #include <x86/dbregs.h> #include <machine/psl.h> #include <machine/reg.h> #include <machine/trap.h> #include <machine/userret.h> #include <machine/db_machdep.h> #include <x86/nmi.h> #ifndef XENPV #include "isa.h" #endif #include <sys/kgdb.h> #ifdef KDTRACE_HOOKS #include <sys/dtrace_bsd.h> /* * This is a hook which is initialized by the dtrace module to handle traps * which might occur during DTrace probe execution. */ dtrace_trap_func_t dtrace_trap_func = NULL; dtrace_doubletrap_func_t dtrace_doubletrap_func = NULL; #endif /* * Module hook for amd64_oosyscall */ struct amd64_oosyscall_hook_t amd64_oosyscall_hook; void nmitrap(struct trapframe *); void doubletrap(struct trapframe *); void trap(struct trapframe *); const char * const trap_type[] = { "privileged instruction fault", /* 0 T_PRIVINFLT */ "breakpoint trap", /* 1 T_BPTFLT */ "arithmetic trap", /* 2 T_ARITHTRAP */ "asynchronous system trap", /* 3 T_ASTFLT */ "protection fault", /* 4 T_PROTFLT */ "trace trap", /* 5 T_TRCTRAP */ "page fault", /* 6 T_PAGEFLT */ "alignment fault", /* 7 T_ALIGNFLT */ "integer divide fault", /* 8 T_DIVIDE */ "non-maskable interrupt", /* 9 T_NMI */ "overflow trap", /* 10 T_OFLOW */ "bounds check fault", /* 11 T_BOUND */ "FPU not available fault", /* 12 T_DNA */ "double fault", /* 13 T_DOUBLEFLT */ "FPU operand fetch fault", /* 14 T_FPOPFLT */ "invalid TSS fault", /* 15 T_TSSFLT */ "segment not present fault", /* 16 T_SEGNPFLT */ "stack fault", /* 17 T_STKFLT */ "machine check fault", /* 18 T_MCA */ "SSE FP exception", /* 19 T_XMM */ "reserved trap", /* 20 T_RESERVED */ }; int trap_types = __arraycount(trap_type); #ifdef TRAP_SIGDEBUG static void sigdebug(const struct trapframe *, const ksiginfo_t *, int); #define SIGDEBUG(a, b, c) sigdebug(a, b, c) #else #define SIGDEBUG(a, b, c) #endif static void onfault_restore(struct trapframe *frame, void *onfault, int error) { frame->tf_rip = (uintptr_t)onfault; frame->tf_rax = error; } static void * onfault_handler(const struct pcb *pcb, const struct trapframe *tf) { struct onfault_table { uintptr_t start; uintptr_t end; void *handler; }; extern const struct onfault_table onfault_table[]; const struct onfault_table *p; uintptr_t pc; if (pcb->pcb_onfault != NULL) { return pcb->pcb_onfault; } pc = tf->tf_rip; for (p = onfault_table; p->start; p++) { if (p->start <= pc && pc < p->end) { return p->handler; } } return NULL; } static void trap_print(const struct trapframe *frame, const lwp_t *l) { const int type = frame->tf_trapno; if (frame->tf_trapno < trap_types) { printf("fatal %s", trap_type[type]); } else { printf("unknown trap %d", type); } printf(" in %s mode\n", (type & T_USER) ? 
"user" : "supervisor"); printf("trap type %d code %#lx rip %#lx cs %#lx rflags %#lx cr2 %#lx " "ilevel %#x rsp %#lx\n", type, frame->tf_err, (u_long)frame->tf_rip, frame->tf_cs, frame->tf_rflags, rcr2(), curcpu()->ci_ilevel, frame->tf_rsp); printf("curlwp %p pid %d.%d lowest kstack %p\n", l, l->l_proc->p_pid, l->l_lid, KSTACK_LOWEST_ADDR(l)); } void nmitrap(struct trapframe *frame) { const int type = T_NMI; if (nmi_dispatch(frame)) return; /* NMI can be hooked up to a pushbutton for debugging */ if (kgdb_trap(type, frame)) return; if (kdb_trap(type, 0, frame)) return; /* machine/parity/power fail/"kitchen sink" faults */ x86_nmi(); } void doubletrap(struct trapframe *frame) { const int type = T_DOUBLEFLT; struct lwp *l = curlwp; trap_print(frame, l); if (kdb_trap(type, 0, frame)) return; if (kgdb_trap(type, frame)) return; panic("double fault"); } /* * trap(frame): exception, fault, and trap interface to BSD kernel. * * This common code is called from assembly language IDT gate entry routines * that prepare a suitable stack frame, and restore this frame after the * exception has been processed. Note that the effect is as if the arguments * were passed call by reference. * * Note that the fpu traps (07 T_DNA, 10 T_ARITHTRAP and 13 T_XMM) * jump directly into the code in x86/fpu.c so they get processed * without interrupts being enabled. */ void trap(struct trapframe *frame) { struct lwp *l = curlwp; struct proc *p; struct pcb *pcb; extern char kcopy_fault[]; ksiginfo_t ksi; void *onfault; int type, error; uint64_t cr2; bool pfail; if (__predict_true(l != NULL)) { pcb = lwp_getpcb(l); p = l->l_proc; } else { /* * This can happen eg on break points in early on boot. */ pcb = NULL; p = NULL; } type = frame->tf_trapno; if (!KERNELMODE(frame->tf_cs)) { type |= T_USER; l->l_md.md_regs = frame; } #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * If the DTrace kernel module has registered a trap handler, * call it and if it returns non-zero, assume that it has * handled the trap and modified the trap frame so that this * function can return normally. */ if ((type == T_PROTFLT || type == T_PAGEFLT) && dtrace_trap_func != NULL) { if ((*dtrace_trap_func)(frame, type)) { return; } } #endif switch (type) { default: we_re_toast: trap_print(frame, l); if (kdb_trap(type, 0, frame)) return; if (kgdb_trap(type, frame)) return; /* * If this is a breakpoint, don't panic if we're not connected. */ if (type == T_BPTFLT && kgdb_disconnected()) { printf("kgdb: ignored %s\n", trap_type[type]); return; } panic("trap"); /*NOTREACHED*/ case T_PROTFLT: case T_SEGNPFLT: case T_ALIGNFLT: case T_STKFLT: case T_TSSFLT: if (p == NULL) goto we_re_toast; /* Check for copyin/copyout fault. 
*/ onfault = onfault_handler(pcb, frame); if (onfault != NULL) { onfault_restore(frame, onfault, EFAULT); return; } goto we_re_toast; case T_PROTFLT|T_USER: /* protection fault */ { int hook_ret; MODULE_HOOK_CALL(amd64_oosyscall_hook, (p, frame), ENOSYS, hook_ret); if (hook_ret == 0) { /* Do the syscall */ p->p_md.md_syscall(frame); goto out; } } /* FALLTHROUGH */ case T_TSSFLT|T_USER: case T_SEGNPFLT|T_USER: case T_STKFLT|T_USER: case T_ALIGNFLT|T_USER: KSI_INIT_TRAP(&ksi); ksi.ksi_trap = type & ~T_USER; ksi.ksi_addr = (void *)frame->tf_rip; switch (type) { case T_SEGNPFLT|T_USER: case T_STKFLT|T_USER: ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_ADRERR; break; case T_TSSFLT|T_USER: ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; break; case T_ALIGNFLT|T_USER: ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_ADRALN; break; case T_PROTFLT|T_USER: ksi.ksi_signo = SIGSEGV; ksi.ksi_code = SEGV_ACCERR; break; default: KASSERT(0); break; } goto trapsignal; case T_PRIVINFLT|T_USER: /* privileged instruction fault */ case T_FPOPFLT|T_USER: /* coprocessor operand fault */ KSI_INIT_TRAP(&ksi); ksi.ksi_signo = SIGILL; ksi.ksi_trap = type & ~T_USER; ksi.ksi_addr = (void *) frame->tf_rip; switch (type) { case T_PRIVINFLT|T_USER: ksi.ksi_code = ILL_PRVOPC; break; case T_FPOPFLT|T_USER: ksi.ksi_code = ILL_COPROC; break; default: KASSERT(0); break; } goto trapsignal; case T_ASTFLT|T_USER: /* Allow process switch. */ //curcpu()->ci_data.cpu_nast++; if (l->l_pflag & LP_OWEUPC) { l->l_pflag &= ~LP_OWEUPC; ADDUPROF(l); } goto out; case T_BOUND|T_USER: case T_OFLOW|T_USER: case T_DIVIDE|T_USER: KSI_INIT_TRAP(&ksi); ksi.ksi_signo = SIGFPE; ksi.ksi_trap = type & ~T_USER; ksi.ksi_addr = (void *)frame->tf_rip; switch (type) { case T_BOUND|T_USER: ksi.ksi_code = FPE_FLTSUB; break; case T_OFLOW|T_USER: ksi.ksi_code = FPE_INTOVF; break; case T_DIVIDE|T_USER: ksi.ksi_code = FPE_INTDIV; break; default: KASSERT(0); break; } goto trapsignal; case T_PAGEFLT: /* Allow page faults in kernel mode. */ if (__predict_false(l == NULL)) goto we_re_toast; onfault = pcb->pcb_onfault; if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) { goto we_re_toast; } cr2 = rcr2(); if (frame->tf_err & PGEX_I) { /* SMEP might have brought us here */ if (cr2 < VM_MAXUSER_ADDRESS) { printf("prevented execution of %p (SMEP)\n", (void *)cr2); goto we_re_toast; } } if ((frame->tf_err & PGEX_P) && cr2 < VM_MAXUSER_ADDRESS) { /* SMAP might have brought us here */ if (onfault_handler(pcb, frame) == NULL) { printf("prevented access to %p (SMAP)\n", (void *)cr2); goto we_re_toast; } } goto pagefltcommon; case T_PAGEFLT|T_USER: { register vaddr_t va; register struct vmspace *vm; register struct vm_map *map; vm_prot_t ftype; extern struct vm_map *kernel_map; cr2 = rcr2(); if (p->p_emul->e_usertrap != NULL && (*p->p_emul->e_usertrap)(l, cr2, frame) != 0) return; pagefltcommon: vm = p->p_vmspace; if (__predict_false(vm == NULL)) { goto we_re_toast; } pcb->pcb_cr2 = cr2; va = trunc_page((vaddr_t)cr2); /* * It is only a kernel address space fault iff: * 1. (type & T_USER) == 0 and * 2. pcb_onfault not set or * 3. pcb_onfault set but supervisor space fault * The last can occur during an exec() copyin where the * argument space is lazy-allocated. 
*/ if (type == T_PAGEFLT && va >= VM_MIN_KERNEL_ADDRESS) map = kernel_map; else map = &vm->vm_map; if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else if (frame->tf_err & PGEX_I) ftype = VM_PROT_EXECUTE; else ftype = VM_PROT_READ; #ifdef DIAGNOSTIC if (map == kernel_map && va == 0) { printf("trap: bad kernel access at %lx\n", va); goto we_re_toast; } #endif /* Fault the original page in. */ onfault = pcb->pcb_onfault; pcb->pcb_onfault = NULL; error = uvm_fault(map, va, ftype); pcb->pcb_onfault = onfault; if (error == 0) { if (map != kernel_map && (void *)va >= vm->vm_maxsaddr) uvm_grow(p, va); pfail = false; while (type == T_PAGEFLT) { /* * we need to switch pmap now if we're in * the middle of copyin/out. * * but we don't need to do so for kcopy as * it never touch userspace. */ kpreempt_disable(); if (curcpu()->ci_want_pmapload) { onfault = onfault_handler(pcb, frame); if (onfault != kcopy_fault) { pmap_load(); } } /* * We need to keep the pmap loaded and * so avoid being preempted until back * into the copy functions. Disable * interrupts at the hardware level before * re-enabling preemption. Interrupts * will be re-enabled by 'iret' when * returning back out of the trap stub. * They'll only be re-enabled when the * program counter is once again in * the copy functions, and so visible * to cpu_kpreempt_exit(). */ #ifndef XENPV x86_disable_intr(); #endif l->l_nopreempt--; if (l->l_nopreempt > 0 || !l->l_dopreempt || pfail) { return; } #ifndef XENPV x86_enable_intr(); #endif /* * If preemption fails for some reason, * don't retry it. The conditions won't * change under our nose. */ pfail = kpreempt(0); } goto out; } if (type == T_PAGEFLT) { onfault = onfault_handler(pcb, frame); if (onfault != NULL) { onfault_restore(frame, onfault, error); return; } printf("uvm_fault(%p, 0x%lx, %d) -> %x\n", map, va, ftype, error); goto we_re_toast; } KSI_INIT_TRAP(&ksi); ksi.ksi_trap = type & ~T_USER; ksi.ksi_addr = (void *)cr2; switch (error) { case EINVAL: ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_ADRERR; break; case EACCES: ksi.ksi_signo = SIGSEGV; ksi.ksi_code = SEGV_ACCERR; error = EFAULT; break; case ENOMEM: ksi.ksi_signo = SIGKILL; printf("UVM: pid %d.%d (%s), uid %d killed: " "out of swap\n", p->p_pid, l->l_lid, p->p_comm, l->l_cred ? kauth_cred_geteuid(l->l_cred) : -1); break; default: ksi.ksi_signo = SIGSEGV; ksi.ksi_code = SEGV_MAPERR; break; } SIGDEBUG(frame, &ksi, error); (*p->p_emul->e_trapsignal)(l, &ksi); break; } case T_TRCTRAP: /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (x86_dbregs_user_trap()) break; goto we_re_toast; case T_BPTFLT|T_USER: /* bpt instruction fault */ case T_TRCTRAP|T_USER: /* trace trap */ /* * Don't go single-stepping into a RAS. 
*/ if (p->p_raslist == NULL || (ras_lookup(p, (void *)frame->tf_rip) == (void *)-1)) { KSI_INIT_TRAP(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_trap = type & ~T_USER; if (x86_dbregs_user_trap()) { x86_dbregs_store_dr6(l); ksi.ksi_code = TRAP_DBREG; } else if (type == (T_BPTFLT|T_USER)) ksi.ksi_code = TRAP_BRKPT; else ksi.ksi_code = TRAP_TRACE; (*p->p_emul->e_trapsignal)(l, &ksi); } break; } if ((type & T_USER) == 0) return; out: userret(l); return; trapsignal: SIGDEBUG(frame, &ksi, 0); (*p->p_emul->e_trapsignal)(l, &ksi); userret(l); } /* * startlwp: start of a new LWP. */ void startlwp(void *arg) { ucontext_t *uc = arg; lwp_t *l = curlwp; int error __diagused; error = cpu_setmcontext(l, &uc->uc_mcontext, uc->uc_flags); KASSERT(error == 0); kmem_free(uc, sizeof(ucontext_t)); userret(l); } #ifdef TRAP_SIGDEBUG static void frame_dump(const struct trapframe *tf, struct pcb *pcb) { printf("trapframe %p\n", tf); printf("rip %#018lx rsp %#018lx rfl %#018lx\n", tf->tf_rip, tf->tf_rsp, tf->tf_rflags); printf("rdi %#018lx rsi %#018lx rdx %#018lx\n", tf->tf_rdi, tf->tf_rsi, tf->tf_rdx); printf("rcx %#018lx r8 %#018lx r9 %#018lx\n", tf->tf_rcx, tf->tf_r8, tf->tf_r9); printf("r10 %#018lx r11 %#018lx r12 %#018lx\n", tf->tf_r10, tf->tf_r11, tf->tf_r12); printf("r13 %#018lx r14 %#018lx r15 %#018lx\n", tf->tf_r13, tf->tf_r14, tf->tf_r15); printf("rbp %#018lx rbx %#018lx rax %#018lx\n", tf->tf_rbp, tf->tf_rbx, tf->tf_rax); printf("cs %#04lx ds %#04lx es %#04lx " "fs %#04lx gs %#04lx ss %#04lx\n", tf->tf_cs & 0xffff, tf->tf_ds & 0xffff, tf->tf_es & 0xffff, tf->tf_fs & 0xffff, tf->tf_gs & 0xffff, tf->tf_ss & 0xffff); printf("fsbase %#018lx gsbase %#018lx\n", pcb->pcb_fs, pcb->pcb_gs); printf("\n"); hexdump(printf, "Stack dump", tf, 256); } static void sigdebug(const struct trapframe *tf, const ksiginfo_t *ksi, int e) { struct lwp *l = curlwp; struct proc *p = l->l_proc; printf("pid %d.%d (%s): signal %d code=%d (trap %#lx) " "@rip %#lx addr %#lx error=%d\n", p->p_pid, l->l_lid, p->p_comm, ksi->ksi_signo, ksi->ksi_code, tf->tf_trapno, tf->tf_rip, rcr2(), e); frame_dump(tf, lwp_getpcb(l)); } #endif
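onfault_handler() above first honours pcb_onfault and otherwise searches a (start, end, handler) table for the faulting program counter, so copy routines can recover from a kernel page fault instead of panicking. Below is a minimal sketch of that range lookup; the addresses are made up and a string stands in for the recovery code pointer.

#include <stdio.h>
#include <stdint.h>

struct onfault_entry {
	uintptr_t start;	/* first instruction covered */
	uintptr_t end;		/* one past the last instruction */
	const char *handler;	/* recovery label (a code pointer in the kernel) */
};

static const struct onfault_entry table[] = {
	{ 0x1000, 0x1080, "copyin_fault"  },
	{ 0x1080, 0x1100, "copyout_fault" },
	{ 0, 0, NULL }		/* terminator, as in the kernel table */
};

static const char *
lookup_onfault(uintptr_t pc)
{
	for (const struct onfault_entry *p = table; p->start != 0; p++)
		if (p->start <= pc && pc < p->end)
			return p->handler;
	return NULL;		/* genuine kernel fault: panic path */
}

int
main(void)
{
	const char *h;

	printf("pc 0x1044 -> %s\n", lookup_onfault(0x1044));	/* copyin_fault */
	h = lookup_onfault(0x2000);				/* not covered */
	printf("pc 0x2000 -> %s\n", h != NULL ? h : "(none)");
	return 0;
}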
/* $NetBSD: tty_43.c,v 1.40 2022/07/10 13:57:14 riastradh Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California.
All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tty_compat.c 8.2 (Berkeley) 1/9/95 */ /* * mapping routines for old line discipline (yuck) */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tty_43.c,v 1.40 2022/07/10 13:57:14 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/ioctl.h> #include <sys/proc.h> #include <sys/conf.h> #include <sys/tty.h> #include <sys/termios.h> #include <sys/file.h> #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/compat_stub.h> #include <sys/module_hook.h> #include <sys/ioctl_compat.h> #include <compat/common/compat_mod.h> #include <compat/sys/ttycom.h> int ttydebug = 0; static const struct speedtab compatspeeds[] = { #define MAX_SPEED 17 { 115200, 17 }, { 57600, 16 }, { 38400, 15 }, { 19200, 14 }, { 9600, 13 }, { 4800, 12 }, { 2400, 11 }, { 1800, 10 }, { 1200, 9 }, { 600, 8 }, { 300, 7 }, { 200, 6 }, { 150, 5 }, { 134, 4 }, { 110, 3 }, { 75, 2 }, { 50, 1 }, { 0, 0 }, { -1, -1 }, }; static const int compatspcodes[] = { 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200 }; static int ttcompatgetflags(struct tty *); static void ttcompatsetflags(struct tty *, struct termios *); static void ttcompatsetlflags(struct tty *, struct termios *); /*ARGSUSED*/ int compat_43_ttioctl(struct tty *tp, u_long com, void *data, int flag, struct lwp *l) { switch (com) { case TIOCGETP: { struct sgttyb *sg = (struct sgttyb *)data; int speed; mutex_spin_enter(&tty_lock); speed = ttspeedtab(tp->t_ospeed, compatspeeds); sg->sg_ospeed = (speed == -1) ? MAX_SPEED : speed; if (tp->t_ispeed == 0) sg->sg_ispeed = sg->sg_ospeed; else { speed = ttspeedtab(tp->t_ispeed, compatspeeds); sg->sg_ispeed = (speed == -1) ? 
MAX_SPEED : speed; } sg->sg_erase = tty_getctrlchar(tp, VERASE); sg->sg_kill = tty_getctrlchar(tp, VKILL); sg->sg_flags = ttcompatgetflags(tp); mutex_spin_exit(&tty_lock); break; } case TIOCSETP: case TIOCSETN: { struct sgttyb *sg = (struct sgttyb *)data; struct termios term; int speed; mutex_spin_enter(&tty_lock); term = tp->t_termios; if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0) term.c_ispeed = speed; else term.c_ispeed = compatspcodes[speed]; if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0) term.c_ospeed = speed; else term.c_ospeed = compatspcodes[speed]; term.c_cc[VERASE] = sg->sg_erase; term.c_cc[VKILL] = sg->sg_kill; tp->t_flags = (ttcompatgetflags(tp)&0xffff0000) | (sg->sg_flags&0xffff); ttcompatsetflags(tp, &term); mutex_spin_exit(&tty_lock); return (ttioctl(tp, com == TIOCSETP ? TIOCSETAF : TIOCSETA, (void *)&term, flag, l)); } case TIOCGETC: { struct tchars *tc = (struct tchars *)data; tc->t_intrc = tty_getctrlchar(tp, VINTR); tc->t_quitc = tty_getctrlchar(tp, VQUIT); tc->t_startc = tty_getctrlchar(tp, VSTART); tc->t_stopc = tty_getctrlchar(tp, VSTOP); tc->t_eofc = tty_getctrlchar(tp, VEOF); tc->t_brkc = tty_getctrlchar(tp, VEOL); break; } case TIOCSETC: { struct tchars *tc = (struct tchars *)data; tty_setctrlchar(tp, VINTR, tc->t_intrc); tty_setctrlchar(tp, VQUIT, tc->t_quitc); tty_setctrlchar(tp, VSTART, tc->t_startc); tty_setctrlchar(tp, VSTOP, tc->t_stopc); tty_setctrlchar(tp, VEOF, tc->t_eofc); tty_setctrlchar(tp, VEOL, tc->t_brkc); if (tc->t_brkc == (char)-1) tty_setctrlchar(tp, VEOL2, _POSIX_VDISABLE); break; } case TIOCSLTC: { struct ltchars *ltc = (struct ltchars *)data; tty_setctrlchar(tp, VSUSP, ltc->t_suspc); tty_setctrlchar(tp, VDSUSP, ltc->t_dsuspc); tty_setctrlchar(tp, VREPRINT, ltc->t_rprntc); tty_setctrlchar(tp, VDISCARD, ltc->t_flushc); tty_setctrlchar(tp, VWERASE, ltc->t_werasc); tty_setctrlchar(tp, VLNEXT, ltc->t_lnextc); break; } case TIOCGLTC: { struct ltchars *ltc = (struct ltchars *)data; ltc->t_suspc = tty_getctrlchar(tp, VSUSP); ltc->t_dsuspc = tty_getctrlchar(tp, VDSUSP); ltc->t_rprntc = tty_getctrlchar(tp, VREPRINT); ltc->t_flushc = tty_getctrlchar(tp, VDISCARD); ltc->t_werasc = tty_getctrlchar(tp, VWERASE); ltc->t_lnextc = tty_getctrlchar(tp, VLNEXT); break; } case TIOCLBIS: case TIOCLBIC: case TIOCLSET: { struct termios term; unsigned argbits, flags; argbits = *(int *)data; mutex_spin_enter(&tty_lock); term = tp->t_termios; flags = ttcompatgetflags(tp); switch (com) { case TIOCLSET: tp->t_flags = (flags & 0xffff) | (argbits << 16); break; case TIOCLBIS: tp->t_flags = flags | (argbits << 16); break; case TIOCLBIC: tp->t_flags = flags & ~(argbits << 16); break; } ttcompatsetlflags(tp, &term); mutex_spin_exit(&tty_lock); return (ttioctl(tp, TIOCSETA, (void *)&term, flag, l)); } case TIOCLGET: mutex_spin_enter(&tty_lock); *(int *)data = ttcompatgetflags(tp)>>16; mutex_spin_exit(&tty_lock); if (ttydebug) printf("CLGET: returning %x\n", *(int *)data); break; case OTIOCGETD: mutex_spin_enter(&tty_lock); *(int *)data = (tp->t_linesw == NULL || tp->t_linesw->l_no == 0) ? 2 /* XXX old NTTYDISC */ : tp->t_linesw->l_no; mutex_spin_exit(&tty_lock); break; case OTIOCSETD: { int ldisczero = 0; return (ttioctl(tp, TIOCSETD, *(int *)data == 2 ? 
(void *)&ldisczero : data, flag, l)); } case OTIOCCONS: *(int *)data = 1; return (ttioctl(tp, TIOCCONS, data, flag, l)); case TIOCHPCL: mutex_spin_enter(&tty_lock); SET(tp->t_cflag, HUPCL); mutex_spin_exit(&tty_lock); break; default: return (EPASSTHROUGH); } return (0); } static int ttcompatgetflags(struct tty *tp) { tcflag_t iflag = tp->t_iflag; tcflag_t lflag = tp->t_lflag; tcflag_t oflag = tp->t_oflag; tcflag_t cflag = tp->t_cflag; int flags = 0; KASSERT(mutex_owned(&tty_lock)); if (ISSET(iflag, IXOFF)) SET(flags, TANDEM); if (ISSET(iflag, ICRNL) || ISSET(oflag, ONLCR)) SET(flags, CRMOD); if (ISSET(cflag, PARENB)) { if (ISSET(iflag, INPCK)) { if (ISSET(cflag, PARODD)) SET(flags, ODDP); else SET(flags, EVENP); } else SET(flags, ANYP); } if (!ISSET(lflag, ICANON)) { /* fudge */ if (ISSET(iflag, IXON) || ISSET(lflag, ISIG|IEXTEN) || ISSET(cflag, PARENB)) SET(flags, CBREAK); else SET(flags, RAW); } if (ISSET(flags, RAW)) SET(flags, ISSET(tp->t_flags, LITOUT|PASS8)); else if (ISSET(cflag, CSIZE) == CS8) { if (!ISSET(oflag, OPOST)) SET(flags, LITOUT); if (!ISSET(iflag, ISTRIP)) SET(flags, PASS8); } if (ISSET(cflag, MDMBUF)) SET(flags, MDMBUF); if (!ISSET(cflag, HUPCL)) SET(flags, NOHANG); if (ISSET(oflag, OXTABS)) SET(flags, XTABS); if (ISSET(lflag, ECHOE)) SET(flags, CRTERA|CRTBS); if (ISSET(lflag, ECHOKE)) SET(flags, CRTKIL|CRTBS); if (ISSET(lflag, ECHOPRT)) SET(flags, PRTERA); if (ISSET(lflag, ECHOCTL)) SET(flags, CTLECH); if (!ISSET(iflag, IXANY)) SET(flags, DECCTQ); SET(flags, ISSET(lflag, ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH)); if (ttydebug) printf("getflags: %x\n", flags); return (flags); } static void ttcompatsetflags(struct tty *tp, struct termios *t) { int flags = tp->t_flags; KASSERT(mutex_owned(&tty_lock)); tcflag_t iflag = t->c_iflag; tcflag_t oflag = t->c_oflag; tcflag_t lflag = t->c_lflag; tcflag_t cflag = t->c_cflag; if (ISSET(flags, TANDEM)) SET(iflag, IXOFF); else CLR(iflag, IXOFF); if (ISSET(flags, ECHO)) SET(lflag, ECHO); else CLR(lflag, ECHO); if (ISSET(flags, CRMOD)) { SET(iflag, ICRNL); SET(oflag, ONLCR); } else { CLR(iflag, ICRNL); CLR(oflag, ONLCR); } if (ISSET(flags, XTABS)) SET(oflag, OXTABS); else CLR(oflag, OXTABS); if (ISSET(flags, RAW)) { iflag &= IXOFF; CLR(lflag, ISIG|ICANON|IEXTEN); CLR(cflag, PARENB); } else { SET(iflag, BRKINT|IXON|IMAXBEL); SET(lflag, ISIG|IEXTEN); if (ISSET(flags, CBREAK)) CLR(lflag, ICANON); else SET(lflag, ICANON); switch (ISSET(flags, ANYP)) { case 0: CLR(cflag, PARENB); break; case ANYP: SET(cflag, PARENB); CLR(iflag, INPCK); break; case EVENP: SET(cflag, PARENB); SET(iflag, INPCK); CLR(cflag, PARODD); break; case ODDP: SET(cflag, PARENB); SET(iflag, INPCK); SET(cflag, PARODD); break; } } if (ISSET(flags, RAW|LITOUT|PASS8)) { CLR(cflag, CSIZE); SET(cflag, CS8); if (!ISSET(flags, RAW|PASS8)) SET(iflag, ISTRIP); else CLR(iflag, ISTRIP); if (!ISSET(flags, RAW|LITOUT)) SET(oflag, OPOST); else CLR(oflag, OPOST); } else { CLR(cflag, CSIZE); SET(cflag, CS7); SET(iflag, ISTRIP); SET(oflag, OPOST); } t->c_iflag = iflag; t->c_oflag = oflag; t->c_lflag = lflag; t->c_cflag = cflag; } static void ttcompatsetlflags(struct tty *tp, struct termios *t) { int flags = tp->t_flags; tcflag_t iflag = t->c_iflag; tcflag_t oflag = t->c_oflag; tcflag_t lflag = t->c_lflag; tcflag_t cflag = t->c_cflag; KASSERT(mutex_owned(&tty_lock)); /* Nothing we can do with CRTBS. */ if (ISSET(flags, PRTERA)) SET(lflag, ECHOPRT); else CLR(lflag, ECHOPRT); if (ISSET(flags, CRTERA)) SET(lflag, ECHOE); else CLR(lflag, ECHOE); /* Nothing we can do with TILDE. 
*/ if (ISSET(flags, MDMBUF)) SET(cflag, MDMBUF); else CLR(cflag, MDMBUF); if (ISSET(flags, NOHANG)) CLR(cflag, HUPCL); else SET(cflag, HUPCL); if (ISSET(flags, CRTKIL)) SET(lflag, ECHOKE); else CLR(lflag, ECHOKE); if (ISSET(flags, CTLECH)) SET(lflag, ECHOCTL); else CLR(lflag, ECHOCTL); if (!ISSET(flags, DECCTQ)) SET(iflag, IXANY); else CLR(iflag, IXANY); CLR(lflag, TOSTOP|FLUSHO|PENDIN|NOFLSH); SET(lflag, ISSET(flags, TOSTOP|FLUSHO|PENDIN|NOFLSH)); if (ISSET(flags, RAW|LITOUT|PASS8)) { CLR(cflag, CSIZE); SET(cflag, CS8); if (!ISSET(flags, RAW|PASS8)) SET(iflag, ISTRIP); else CLR(iflag, ISTRIP); if (!ISSET(flags, RAW|LITOUT)) SET(oflag, OPOST); else CLR(oflag, OPOST); } else { CLR(cflag, CSIZE); SET(cflag, CS7); SET(iflag, ISTRIP); SET(oflag, OPOST); } t->c_iflag = iflag; t->c_oflag = oflag; t->c_lflag = lflag; t->c_cflag = cflag; } int kern_tty_43_init(void) { MODULE_HOOK_SET(tty_ttioctl_43_hook, compat_43_ttioctl); return 0; } int kern_tty_43_fini(void) { MODULE_HOOK_UNSET(tty_ttioctl_43_hook); return 0; }
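/*
 * Illustrative stand-alone sketch (not part of the kernel sources): a small
 * user-space model of the speed-code translation performed by the
 * TIOCGETP/TIOCSETP handlers above.  The table mirrors compatspcodes[];
 * all names prefixed "ex_" are invented for this example.
 */
#include <stdio.h>

#define EX_MAX_SPEED 17

static const int ex_spcodes[] = {
	0, 50, 75, 110, 134, 150, 200, 300, 600, 1200,
	1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200
};

/* Old sgttyb speed code -> baud rate, as TIOCSETP interprets it. */
static int
ex_code_to_baud(int code)
{
	if (code < 0 || code > EX_MAX_SPEED)
		return code;	/* out of range: passed through as a raw speed */
	return ex_spcodes[code];
}

/* Baud rate -> old speed code, clamped to EX_MAX_SPEED like TIOCGETP. */
static int
ex_baud_to_code(int baud)
{
	int i;

	for (i = 0; i <= EX_MAX_SPEED; i++)
		if (ex_spcodes[i] == baud)
			return i;
	return EX_MAX_SPEED;
}

int
main(void)
{
	printf("code 13 -> %d baud\n", ex_code_to_baud(13));	/* 9600 */
	printf("9600 baud -> code %d\n", ex_baud_to_code(9600));	/* 13 */
	return 0;
}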
/* $NetBSD: subr_cpu.c,v 1.22 2024/03/05 20:59:41 thorpej Exp $ */ /*- * Copyright (c) 2007, 2008, 2009, 2010, 2012, 2019, 2020 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c)2007 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * CPU related routines shared with rump. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_cpu.c,v 1.22 2024/03/05 20:59:41 thorpej Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/systm.h> #include <sys/sched.h> #include <sys/conf.h> #include <sys/cpu.h> #include <sys/proc.h> #include <sys/kernel.h> #include <sys/kmem.h> static void cpu_topology_fake1(struct cpu_info *); kmutex_t cpu_lock __cacheline_aligned; int ncpu __read_mostly; int ncpuonline __read_mostly; bool mp_online __read_mostly; static bool cpu_topology_present __read_mostly; static bool cpu_topology_haveslow __read_mostly; int64_t cpu_counts[CPU_COUNT_MAX]; /* An array of CPUs. There are ncpu entries. */ struct cpu_info **cpu_infos __read_mostly; /* Note: set on mi_cpu_attach() and idle_loop(). */ kcpuset_t * kcpuset_attached __read_mostly = NULL; kcpuset_t * kcpuset_running __read_mostly = NULL; static char cpu_model[128]; /* * mi_cpu_init: early initialisation of MI CPU related structures. * * Note: may not block and memory allocator is not yet available. */ void mi_cpu_init(void) { struct cpu_info *ci; mutex_init(&cpu_lock, MUTEX_DEFAULT, IPL_NONE); kcpuset_create(&kcpuset_attached, true); kcpuset_create(&kcpuset_running, true); kcpuset_set(kcpuset_running, 0); ci = curcpu(); cpu_topology_fake1(ci); } int cpu_setmodel(const char *fmt, ...) 
{ int len; va_list ap; va_start(ap, fmt); len = vsnprintf(cpu_model, sizeof(cpu_model), fmt, ap); va_end(ap); return len; } const char * cpu_getmodel(void) { return cpu_model; } bool cpu_softintr_p(void) { return (curlwp->l_pflag & LP_INTR) != 0; } bool curcpu_stable(void) { struct lwp *const l = curlwp; const int pflag = l->l_pflag; const int nopreempt = l->l_nopreempt; /* * - Softints (LP_INTR) never migrate between CPUs. * - Bound lwps (LP_BOUND), either kthreads created bound to * a CPU or any lwps bound with curlwp_bind, never migrate. * - If kpreemption is disabled, the lwp can't migrate. * - If we're in interrupt context, preemption is blocked. * * We combine the LP_INTR, LP_BOUND, and l_nopreempt test into * a single predicted-true branch so this is cheap to assert in * most contexts where it will be used, then fall back to * calling the full kpreempt_disabled() and cpu_intr_p() as * subroutines. * * XXX Is cpu_intr_p redundant with kpreempt_disabled? */ return __predict_true(((pflag & (LP_INTR|LP_BOUND)) | nopreempt) != 0) || kpreempt_disabled() || cpu_intr_p(); } /* * Collect CPU topology information as each CPU is attached. This can be * called early during boot, so we need to be careful what we do. */ void cpu_topology_set(struct cpu_info *ci, u_int package_id, u_int core_id, u_int smt_id, u_int numa_id) { enum cpu_rel rel; cpu_topology_present = true; ci->ci_package_id = package_id; ci->ci_core_id = core_id; ci->ci_smt_id = smt_id; ci->ci_numa_id = numa_id; for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) { ci->ci_sibling[rel] = ci; ci->ci_nsibling[rel] = 1; } } /* * Collect CPU relative speed */ void cpu_topology_setspeed(struct cpu_info *ci, bool slow) { cpu_topology_haveslow |= slow; ci->ci_is_slow = slow; } /* * Link a CPU into the given circular list. */ static void cpu_topology_link(struct cpu_info *ci, struct cpu_info *ci2, enum cpu_rel rel) { struct cpu_info *ci3; /* Walk to the end of the existing circular list and append. */ for (ci3 = ci2;; ci3 = ci3->ci_sibling[rel]) { ci3->ci_nsibling[rel]++; if (ci3->ci_sibling[rel] == ci2) { break; } } ci->ci_sibling[rel] = ci2; ci3->ci_sibling[rel] = ci; ci->ci_nsibling[rel] = ci3->ci_nsibling[rel]; } /* * Print out the topology lists. */ static void cpu_topology_dump(void) { CPU_INFO_ITERATOR cii; struct cpu_info *ci, *ci2; const char *names[] = { "core", "pkg", "1st" }; enum cpu_rel rel; int i; CTASSERT(__arraycount(names) >= __arraycount(ci->ci_sibling)); if (ncpu == 1) { return; } for (CPU_INFO_FOREACH(cii, ci)) { if (cpu_topology_haveslow) aprint_debug("%s ", ci->ci_is_slow ? "slow" : "fast"); for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) { aprint_debug("%s has %d %s siblings:", cpu_name(ci), ci->ci_nsibling[rel], names[rel]); ci2 = ci->ci_sibling[rel]; i = 0; do { aprint_debug(" %s", cpu_name(ci2)); ci2 = ci2->ci_sibling[rel]; } while (++i < 64 && ci2 != ci->ci_sibling[rel]); if (i == 64) { aprint_debug(" GAVE UP"); } aprint_debug("\n"); } aprint_debug("%s first in package: %s\n", cpu_name(ci), cpu_name(ci->ci_package1st)); } } /* * Fake up topology info if we have none, or if what we got was bogus. * Used early in boot, and by cpu_topology_fake(). 
*/ static void cpu_topology_fake1(struct cpu_info *ci) { enum cpu_rel rel; for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) { ci->ci_sibling[rel] = ci; ci->ci_nsibling[rel] = 1; } if (!cpu_topology_present) { ci->ci_package_id = cpu_index(ci); } ci->ci_schedstate.spc_flags |= (SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS); ci->ci_package1st = ci; if (!cpu_topology_haveslow) { ci->ci_is_slow = false; } } /* * Fake up topology info if we have none, or if what we got was bogus. * Don't override ci_package_id, etc, if cpu_topology_present is set. * MD code also uses these. */ static void cpu_topology_fake(void) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; for (CPU_INFO_FOREACH(cii, ci)) { cpu_topology_fake1(ci); /* Undo (early boot) flag set so everything links OK. */ ci->ci_schedstate.spc_flags &= ~(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS); } } /* * Fix up basic CPU topology info. Right now that means attach each CPU to * circular lists of its siblings in the same core, and in the same package. */ void cpu_topology_init(void) { CPU_INFO_ITERATOR cii, cii2; struct cpu_info *ci, *ci2, *ci3; u_int minsmt, mincore; if (!cpu_topology_present) { cpu_topology_fake(); goto linkit; } /* Find siblings in same core and package. */ for (CPU_INFO_FOREACH(cii, ci)) { ci->ci_schedstate.spc_flags &= ~(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS); for (CPU_INFO_FOREACH(cii2, ci2)) { /* Avoid bad things happening. */ if (ci2->ci_package_id == ci->ci_package_id && ci2->ci_core_id == ci->ci_core_id && ci2->ci_smt_id == ci->ci_smt_id && ci2 != ci) { #ifdef DEBUG printf("cpu%u %p pkg %u core %u smt %u same as " "cpu%u %p pkg %u core %u smt %u\n", cpu_index(ci), ci, ci->ci_package_id, ci->ci_core_id, ci->ci_smt_id, cpu_index(ci2), ci2, ci2->ci_package_id, ci2->ci_core_id, ci2->ci_smt_id); #endif printf("cpu_topology_init: info bogus, " "faking it\n"); cpu_topology_fake(); goto linkit; } if (ci2 == ci || ci2->ci_package_id != ci->ci_package_id) { continue; } /* Find CPUs in the same core. */ if (ci->ci_nsibling[CPUREL_CORE] == 1 && ci->ci_core_id == ci2->ci_core_id) { cpu_topology_link(ci, ci2, CPUREL_CORE); } /* Find CPUs in the same package. */ if (ci->ci_nsibling[CPUREL_PACKAGE] == 1) { cpu_topology_link(ci, ci2, CPUREL_PACKAGE); } if (ci->ci_nsibling[CPUREL_CORE] > 1 && ci->ci_nsibling[CPUREL_PACKAGE] > 1) { break; } } } linkit: /* Identify lowest numbered SMT in each core. */ for (CPU_INFO_FOREACH(cii, ci)) { ci2 = ci3 = ci; minsmt = ci->ci_smt_id; do { if (ci2->ci_smt_id < minsmt) { ci3 = ci2; minsmt = ci2->ci_smt_id; } ci2 = ci2->ci_sibling[CPUREL_CORE]; } while (ci2 != ci); ci3->ci_schedstate.spc_flags |= SPCF_CORE1ST; } /* Identify lowest numbered SMT in each package. */ ci3 = NULL; for (CPU_INFO_FOREACH(cii, ci)) { if ((ci->ci_schedstate.spc_flags & SPCF_CORE1ST) == 0) { continue; } ci2 = ci3 = ci; mincore = ci->ci_core_id; do { if ((ci2->ci_schedstate.spc_flags & SPCF_CORE1ST) != 0 && ci2->ci_core_id < mincore) { ci3 = ci2; mincore = ci2->ci_core_id; } ci2 = ci2->ci_sibling[CPUREL_PACKAGE]; } while (ci2 != ci); if ((ci3->ci_schedstate.spc_flags & SPCF_PACKAGE1ST) != 0) { /* Already identified - nothing more to do. */ continue; } ci3->ci_schedstate.spc_flags |= SPCF_PACKAGE1ST; /* Walk through all CPUs in package and point to first. */ ci2 = ci3; do { ci2->ci_package1st = ci3; ci2->ci_sibling[CPUREL_PACKAGE1ST] = ci3; ci2 = ci2->ci_sibling[CPUREL_PACKAGE]; } while (ci2 != ci3); /* Now look for somebody else to link to. 
*/ for (CPU_INFO_FOREACH(cii2, ci2)) { if ((ci2->ci_schedstate.spc_flags & SPCF_PACKAGE1ST) != 0 && ci2 != ci3) { cpu_topology_link(ci3, ci2, CPUREL_PACKAGE1ST); break; } } } /* Walk through all packages, starting with value of ci3 from above. */ KASSERT(ci3 != NULL); ci = ci3; do { /* Walk through CPUs in the package and copy in PACKAGE1ST. */ ci2 = ci; do { ci2->ci_sibling[CPUREL_PACKAGE1ST] = ci->ci_sibling[CPUREL_PACKAGE1ST]; ci2->ci_nsibling[CPUREL_PACKAGE1ST] = ci->ci_nsibling[CPUREL_PACKAGE1ST]; ci2 = ci2->ci_sibling[CPUREL_PACKAGE]; } while (ci2 != ci); ci = ci->ci_sibling[CPUREL_PACKAGE1ST]; } while (ci != ci3); if (cpu_topology_haveslow) { /* * For asymmetric systems where some CPUs are slower than * others, mark first class CPUs for the scheduler. This * conflicts with SMT right now so whinge if observed. */ if (curcpu()->ci_nsibling[CPUREL_CORE] > 1) { printf("cpu_topology_init: asymmetric & SMT??\n"); } for (CPU_INFO_FOREACH(cii, ci)) { if (!ci->ci_is_slow) { ci->ci_schedstate.spc_flags |= SPCF_1STCLASS; } } } else { /* * For any other configuration mark the 1st CPU in each * core as a first class CPU. */ for (CPU_INFO_FOREACH(cii, ci)) { if ((ci->ci_schedstate.spc_flags & SPCF_CORE1ST) != 0) { ci->ci_schedstate.spc_flags |= SPCF_1STCLASS; } } } cpu_topology_dump(); } /* * Adjust one count, for a counter that's NOT updated from interrupt * context. Hardly worth making an inline due to preemption stuff. */ void cpu_count(enum cpu_count idx, int64_t delta) { lwp_t *l = curlwp; KPREEMPT_DISABLE(l); l->l_cpu->ci_counts[idx] += delta; KPREEMPT_ENABLE(l); } /* * Fetch fresh sum total for all counts. Expensive - don't call often. * * If poll is true, the caller is okay with less recent values (but * no more than 1/hz seconds old). Where this is called very often that * should be the case. * * This should be reasonably quick so that any value collected get isn't * totally out of whack, and it can also be called from interrupt context, * so go to splvm() while summing the counters. It's tempting to use a spin * mutex here but this routine is called from DDB. */ void cpu_count_sync(bool poll) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; int64_t sum[CPU_COUNT_MAX], *ptr; static int lasttick; int curtick, s; enum cpu_count i; KASSERT(sizeof(ci->ci_counts) == sizeof(cpu_counts)); if (__predict_false(!mp_online)) { memcpy(cpu_counts, curcpu()->ci_counts, sizeof(cpu_counts)); return; } s = splvm(); curtick = getticks(); if (poll && atomic_load_acquire(&lasttick) == curtick) { splx(s); return; } memset(sum, 0, sizeof(sum)); curcpu()->ci_counts[CPU_COUNT_SYNC]++; for (CPU_INFO_FOREACH(cii, ci)) { ptr = ci->ci_counts; for (i = 0; i < CPU_COUNT_MAX; i += 8) { sum[i+0] += ptr[i+0]; sum[i+1] += ptr[i+1]; sum[i+2] += ptr[i+2]; sum[i+3] += ptr[i+3]; sum[i+4] += ptr[i+4]; sum[i+5] += ptr[i+5]; sum[i+6] += ptr[i+6]; sum[i+7] += ptr[i+7]; } KASSERT(i == CPU_COUNT_MAX); } memcpy(cpu_counts, sum, sizeof(cpu_counts)); atomic_store_release(&lasttick, curtick); splx(s); }
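/*
 * Illustrative stand-alone sketch (not part of the kernel sources): a
 * minimal model of the circular sibling ring maintained by
 * cpu_topology_link() above.  Appending walks the ring once, bumps every
 * member's sibling count, and splices the new node in.  The "toy_cpu" type
 * and all "toy_" names are invented for this example.
 */
#include <stdio.h>

struct toy_cpu {
	int id;
	struct toy_cpu *sibling;	/* next member of the ring */
	int nsibling;			/* ring size, cached on every member */
};

static void
toy_link(struct toy_cpu *ci, struct toy_cpu *ci2)
{
	struct toy_cpu *ci3;

	/* Walk to the member that points back at ci2, counting as we go. */
	for (ci3 = ci2;; ci3 = ci3->sibling) {
		ci3->nsibling++;
		if (ci3->sibling == ci2)
			break;
	}
	/* Splice ci into the ring between ci3 and ci2. */
	ci->sibling = ci2;
	ci3->sibling = ci;
	ci->nsibling = ci3->nsibling;
}

int
main(void)
{
	struct toy_cpu a = { 0, &a, 1 }, b = { 1, &b, 1 }, c = { 2, &c, 1 };
	struct toy_cpu *p = &a;

	toy_link(&b, &a);	/* two-member ring */
	toy_link(&c, &a);	/* three-member ring */
	do {
		printf("cpu%d nsibling=%d\n", p->id, p->nsibling);
		p = p->sibling;
	} while (p != &a);
	return 0;
}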
/* $NetBSD: sysv_msg.c,v 1.76 2019/10/04 23:20:22 kamil Exp $ */ /*- * Copyright (c) 1999, 2006, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Implementation of SVID messages * * Author: Daniel Boulet * * Copyright 1993 Daniel Boulet and RTMX Inc. * * This system call was implemented by Daniel Boulet under contract from RTMX. * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sysv_msg.c,v 1.76 2019/10/04 23:20:22 kamil Exp $"); #ifdef _KERNEL_OPT #include "opt_sysv.h" #endif #include <sys/param.h> #include <sys/kernel.h> #include <sys/msg.h> #include <sys/sysctl.h> #include <sys/mount.h> /* XXX for <sys/syscallargs.h> */ #include <sys/syscallargs.h> #include <sys/kauth.h> #define MSG_DEBUG #undef MSG_DEBUG_OK #ifdef MSG_DEBUG_OK #define MSG_PRINTF(a) printf a #else #define MSG_PRINTF(a) #endif static int nfree_msgmaps; /* # of free map entries */ static short free_msgmaps; /* head of linked list of free map entries */ static struct __msg *free_msghdrs; /* list of free msg headers */ static char *msgpool; /* MSGMAX byte long msg buffer pool */ static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ static struct __msg *msghdrs; /* MSGTQL msg headers */ kmsq_t *msqs; /* MSGMNI msqid_ds struct's */ kmutex_t msgmutex; /* subsystem lock */ static u_int msg_waiters = 0; /* total number of msgrcv waiters */ static bool msg_realloc_state; static kcondvar_t msg_realloc_cv; static void msg_freehdr(struct __msg *); extern int kern_has_sysvmsg; SYSCTL_SETUP_PROTO(sysctl_ipc_msg_setup); int msginit(void) { int i, sz; vaddr_t v; /* * msginfo.msgssz should be a power of two for efficiency reasons. * It is also pretty silly if msginfo.msgssz is less than 8 * or greater than about 256 so ... 
*/ i = 8; while (i < 1024 && i != msginfo.msgssz) i <<= 1; if (i != msginfo.msgssz) { printf("msginfo.msgssz = %d, not a small power of 2", msginfo.msgssz); return EINVAL; } if (msginfo.msgseg > 32767) { printf("msginfo.msgseg = %d > 32767", msginfo.msgseg); return EINVAL; } /* Allocate the wired memory for our structures */ sz = ALIGN(msginfo.msgmax) + ALIGN(msginfo.msgseg * sizeof(struct msgmap)) + ALIGN(msginfo.msgtql * sizeof(struct __msg)) + ALIGN(msginfo.msgmni * sizeof(kmsq_t)); sz = round_page(sz); v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); if (v == 0) { printf("sysv_msg: cannot allocate memory"); return ENOMEM; } msgpool = (void *)v; msgmaps = (void *)((uintptr_t)msgpool + ALIGN(msginfo.msgmax)); msghdrs = (void *)((uintptr_t)msgmaps + ALIGN(msginfo.msgseg * sizeof(struct msgmap))); msqs = (void *)((uintptr_t)msghdrs + ALIGN(msginfo.msgtql * sizeof(struct __msg))); for (i = 0; i < (msginfo.msgseg - 1); i++) msgmaps[i].next = i + 1; msgmaps[msginfo.msgseg - 1].next = -1; free_msgmaps = 0; nfree_msgmaps = msginfo.msgseg; for (i = 0; i < (msginfo.msgtql - 1); i++) { msghdrs[i].msg_type = 0; msghdrs[i].msg_next = &msghdrs[i + 1]; } i = msginfo.msgtql - 1; msghdrs[i].msg_type = 0; msghdrs[i].msg_next = NULL; free_msghdrs = &msghdrs[0]; for (i = 0; i < msginfo.msgmni; i++) { cv_init(&msqs[i].msq_cv, "msgwait"); /* Implies entry is available */ msqs[i].msq_u.msg_qbytes = 0; /* Reset to a known value */ msqs[i].msq_u.msg_perm._seq = 0; } mutex_init(&msgmutex, MUTEX_DEFAULT, IPL_NONE); cv_init(&msg_realloc_cv, "msgrealc"); msg_realloc_state = false; kern_has_sysvmsg = 1; return 0; } int msgfini(void) { int i, sz; vaddr_t v = (vaddr_t)msgpool; mutex_enter(&msgmutex); for (i = 0; i < msginfo.msgmni; i++) { if (msqs[i].msq_u.msg_qbytes != 0) { mutex_exit(&msgmutex); return 1; /* queue not available, prevent unload! */ } } /* * Destroy all condvars and free the memory we're using */ for (i = 0; i < msginfo.msgmni; i++) { cv_destroy(&msqs[i].msq_cv); } sz = ALIGN(msginfo.msgmax) + ALIGN(msginfo.msgseg * sizeof(struct msgmap)) + ALIGN(msginfo.msgtql * sizeof(struct __msg)) + ALIGN(msginfo.msgmni * sizeof(kmsq_t)); sz = round_page(sz); uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED); cv_destroy(&msg_realloc_cv); mutex_exit(&msgmutex); mutex_destroy(&msgmutex); kern_has_sysvmsg = 0; return 0; } static int msgrealloc(int newmsgmni, int newmsgseg) { struct msgmap *new_msgmaps; struct __msg *new_msghdrs, *new_free_msghdrs; char *old_msgpool, *new_msgpool; kmsq_t *new_msqs; vaddr_t v; int i, sz, msqid, newmsgmax, new_nfree_msgmaps; short new_free_msgmaps; if (newmsgmni < 1 || newmsgseg < 1) return EINVAL; /* Allocate the wired memory for our structures */ newmsgmax = msginfo.msgssz * newmsgseg; sz = ALIGN(newmsgmax) + ALIGN(newmsgseg * sizeof(struct msgmap)) + ALIGN(msginfo.msgtql * sizeof(struct __msg)) + ALIGN(newmsgmni * sizeof(kmsq_t)); sz = round_page(sz); v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); if (v == 0) return ENOMEM; mutex_enter(&msgmutex); if (msg_realloc_state) { mutex_exit(&msgmutex); uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED); return EBUSY; } msg_realloc_state = true; if (msg_waiters) { /* * Mark reallocation state, wake-up all waiters, * and wait while they will all exit. 
*/ for (i = 0; i < msginfo.msgmni; i++) cv_broadcast(&msqs[i].msq_cv); while (msg_waiters) cv_wait(&msg_realloc_cv, &msgmutex); } old_msgpool = msgpool; /* We cannot reallocate less memory than we use */ i = 0; for (msqid = 0; msqid < msginfo.msgmni; msqid++) { struct msqid_ds *mptr; kmsq_t *msq; msq = &msqs[msqid]; mptr = &msq->msq_u; if (mptr->msg_qbytes || (mptr->msg_perm.mode & MSG_LOCKED)) i = msqid; } if (i >= newmsgmni || (msginfo.msgseg - nfree_msgmaps) > newmsgseg) { mutex_exit(&msgmutex); uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED); return EBUSY; } new_msgpool = (void *)v; new_msgmaps = (void *)((uintptr_t)new_msgpool + ALIGN(newmsgmax)); new_msghdrs = (void *)((uintptr_t)new_msgmaps + ALIGN(newmsgseg * sizeof(struct msgmap))); new_msqs = (void *)((uintptr_t)new_msghdrs + ALIGN(msginfo.msgtql * sizeof(struct __msg))); /* Initialize the structures */ for (i = 0; i < (newmsgseg - 1); i++) new_msgmaps[i].next = i + 1; new_msgmaps[newmsgseg - 1].next = -1; new_free_msgmaps = 0; new_nfree_msgmaps = newmsgseg; for (i = 0; i < (msginfo.msgtql - 1); i++) { new_msghdrs[i].msg_type = 0; new_msghdrs[i].msg_next = &new_msghdrs[i + 1]; } i = msginfo.msgtql - 1; new_msghdrs[i].msg_type = 0; new_msghdrs[i].msg_next = NULL; new_free_msghdrs = &new_msghdrs[0]; for (i = 0; i < newmsgmni; i++) { new_msqs[i].msq_u.msg_qbytes = 0; new_msqs[i].msq_u.msg_perm._seq = 0; cv_init(&new_msqs[i].msq_cv, "msgwait"); } /* * Copy all message queue identifiers, message headers and buffer * pools to the new memory location. */ for (msqid = 0; msqid < msginfo.msgmni; msqid++) { struct __msg *nmsghdr, *msghdr, *pmsghdr; struct msqid_ds *nmptr, *mptr; kmsq_t *nmsq, *msq; msq = &msqs[msqid]; mptr = &msq->msq_u; if (mptr->msg_qbytes == 0 && (mptr->msg_perm.mode & MSG_LOCKED) == 0) continue; nmsq = &new_msqs[msqid]; nmptr = &nmsq->msq_u; memcpy(nmptr, mptr, sizeof(struct msqid_ds)); /* * Go through the message headers, and copy each one * by taking the new ones, and thus defragmenting. 
*/ nmsghdr = pmsghdr = NULL; msghdr = mptr->_msg_first; while (msghdr) { short nnext = 0, next; u_short msgsz, segcnt; /* Take an entry from the new list of free msghdrs */ nmsghdr = new_free_msghdrs; KASSERT(nmsghdr != NULL); new_free_msghdrs = nmsghdr->msg_next; nmsghdr->msg_next = NULL; if (pmsghdr) { pmsghdr->msg_next = nmsghdr; } else { nmptr->_msg_first = nmsghdr; pmsghdr = nmsghdr; } nmsghdr->msg_ts = msghdr->msg_ts; nmsghdr->msg_spot = -1; /* Compute the amount of segments and reserve them */ msgsz = msghdr->msg_ts; segcnt = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; if (segcnt == 0) continue; while (segcnt--) { nnext = new_free_msgmaps; new_free_msgmaps = new_msgmaps[nnext].next; new_nfree_msgmaps--; new_msgmaps[nnext].next = nmsghdr->msg_spot; nmsghdr->msg_spot = nnext; } /* Copy all segments */ KASSERT(nnext == nmsghdr->msg_spot); next = msghdr->msg_spot; while (msgsz > 0) { size_t tlen; if (msgsz >= msginfo.msgssz) { tlen = msginfo.msgssz; msgsz -= msginfo.msgssz; } else { tlen = msgsz; msgsz = 0; } /* Copy the message buffer */ memcpy(&new_msgpool[nnext * msginfo.msgssz], &msgpool[next * msginfo.msgssz], tlen); /* Next entry of the map */ nnext = msgmaps[nnext].next; next = msgmaps[next].next; } /* Next message header */ msghdr = msghdr->msg_next; } nmptr->_msg_last = nmsghdr; } KASSERT((msginfo.msgseg - nfree_msgmaps) == (newmsgseg - new_nfree_msgmaps)); sz = ALIGN(msginfo.msgmax) + ALIGN(msginfo.msgseg * sizeof(struct msgmap)) + ALIGN(msginfo.msgtql * sizeof(struct __msg)) + ALIGN(msginfo.msgmni * sizeof(kmsq_t)); sz = round_page(sz); for (i = 0; i < msginfo.msgmni; i++) cv_destroy(&msqs[i].msq_cv); /* Set the pointers and update the new values */ msgpool = new_msgpool; msgmaps = new_msgmaps; msghdrs = new_msghdrs; msqs = new_msqs; free_msghdrs = new_free_msghdrs; free_msgmaps = new_free_msgmaps; nfree_msgmaps = new_nfree_msgmaps; msginfo.msgmni = newmsgmni; msginfo.msgseg = newmsgseg; msginfo.msgmax = newmsgmax; /* Reallocation completed - notify all waiters, if any */ msg_realloc_state = false; cv_broadcast(&msg_realloc_cv); mutex_exit(&msgmutex); uvm_km_free(kernel_map, (vaddr_t)old_msgpool, sz, UVM_KMF_WIRED); return 0; } static void msg_freehdr(struct __msg *msghdr) { KASSERT(mutex_owned(&msgmutex)); while (msghdr->msg_ts > 0) { short next; KASSERT(msghdr->msg_spot >= 0); KASSERT(msghdr->msg_spot < msginfo.msgseg); next = msgmaps[msghdr->msg_spot].next; msgmaps[msghdr->msg_spot].next = free_msgmaps; free_msgmaps = msghdr->msg_spot; nfree_msgmaps++; msghdr->msg_spot = next; if (msghdr->msg_ts >= msginfo.msgssz) msghdr->msg_ts -= msginfo.msgssz; else msghdr->msg_ts = 0; } KASSERT(msghdr->msg_spot == -1); msghdr->msg_next = free_msghdrs; free_msghdrs = msghdr; } int sys___msgctl50(struct lwp *l, const struct sys___msgctl50_args *uap, register_t *retval) { /* { syscallarg(int) msqid; syscallarg(int) cmd; syscallarg(struct msqid_ds *) buf; } */ struct msqid_ds msqbuf; int cmd, error; cmd = SCARG(uap, cmd); if (cmd == IPC_SET) { error = copyin(SCARG(uap, buf), &msqbuf, sizeof(msqbuf)); if (error) return (error); } error = msgctl1(l, SCARG(uap, msqid), cmd, (cmd == IPC_SET || cmd == IPC_STAT) ? 
&msqbuf : NULL); if (error == 0 && cmd == IPC_STAT) error = copyout(&msqbuf, SCARG(uap, buf), sizeof(msqbuf)); return (error); } int msgctl1(struct lwp *l, int msqid, int cmd, struct msqid_ds *msqbuf) { kauth_cred_t cred = l->l_cred; struct msqid_ds *msqptr; kmsq_t *msq; int error = 0, ix; MSG_PRINTF(("call to msgctl1(%d, %d)\n", msqid, cmd)); ix = IPCID_TO_IX(msqid); mutex_enter(&msgmutex); if (ix < 0 || ix >= msginfo.msgmni) { MSG_PRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", ix, msginfo.msgmni)); error = EINVAL; goto unlock; } msq = &msqs[ix]; msqptr = &msq->msq_u; if (msqptr->msg_qbytes == 0) { MSG_PRINTF(("no such msqid\n")); error = EINVAL; goto unlock; } if (msqptr->msg_perm._seq != IPCID_TO_SEQ(msqid)) { MSG_PRINTF(("wrong sequence number\n")); error = EINVAL; goto unlock; } switch (cmd) { case IPC_RMID: { struct __msg *msghdr; if ((error = ipcperm(cred, &msqptr->msg_perm, IPC_M)) != 0) break; /* Free the message headers */ msghdr = msqptr->_msg_first; while (msghdr != NULL) { struct __msg *msghdr_tmp; /* Free the segments of each message */ msqptr->_msg_cbytes -= msghdr->msg_ts; msqptr->msg_qnum--; msghdr_tmp = msghdr; msghdr = msghdr->msg_next; msg_freehdr(msghdr_tmp); } KASSERT(msqptr->_msg_cbytes == 0); KASSERT(msqptr->msg_qnum == 0); /* Mark it as free */ msqptr->msg_qbytes = 0; cv_broadcast(&msq->msq_cv); } break; case IPC_SET: if ((error = ipcperm(cred, &msqptr->msg_perm, IPC_M))) break; if (msqbuf->msg_qbytes > msqptr->msg_qbytes && kauth_authorize_system(cred, KAUTH_SYSTEM_SYSVIPC, KAUTH_REQ_SYSTEM_SYSVIPC_MSGQ_OVERSIZE, KAUTH_ARG(msqbuf->msg_qbytes), KAUTH_ARG(msqptr->msg_qbytes), NULL) != 0) { error = EPERM; break; } if (msqbuf->msg_qbytes > msginfo.msgmnb) { MSG_PRINTF(("can't increase msg_qbytes beyond %d " "(truncating)\n", msginfo.msgmnb)); /* silently restrict qbytes to system limit */ msqbuf->msg_qbytes = msginfo.msgmnb; } if (msqbuf->msg_qbytes == 0) { MSG_PRINTF(("can't reduce msg_qbytes to 0\n")); error = EINVAL; /* XXX non-standard errno! 
*/ break; } msqptr->msg_perm.uid = msqbuf->msg_perm.uid; msqptr->msg_perm.gid = msqbuf->msg_perm.gid; msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) | (msqbuf->msg_perm.mode & 0777); msqptr->msg_qbytes = msqbuf->msg_qbytes; msqptr->msg_ctime = time_second; break; case IPC_STAT: if ((error = ipcperm(cred, &msqptr->msg_perm, IPC_R))) { MSG_PRINTF(("requester doesn't have read access\n")); break; } memset(msqbuf, 0, sizeof *msqbuf); msqbuf->msg_perm = msqptr->msg_perm; msqbuf->msg_perm.mode &= 0777; msqbuf->msg_qnum = msqptr->msg_qnum; msqbuf->msg_qbytes = msqptr->msg_qbytes; msqbuf->msg_lspid = msqptr->msg_lspid; msqbuf->msg_lrpid = msqptr->msg_lrpid; msqbuf->msg_stime = msqptr->msg_stime; msqbuf->msg_rtime = msqptr->msg_rtime; msqbuf->msg_ctime = msqptr->msg_ctime; break; default: MSG_PRINTF(("invalid command %d\n", cmd)); error = EINVAL; break; } unlock: mutex_exit(&msgmutex); return (error); } int sys_msgget(struct lwp *l, const struct sys_msgget_args *uap, register_t *retval) { /* { syscallarg(key_t) key; syscallarg(int) msgflg; } */ int msqid, error = 0; int key = SCARG(uap, key); int msgflg = SCARG(uap, msgflg); kauth_cred_t cred = l->l_cred; struct msqid_ds *msqptr = NULL; kmsq_t *msq; mutex_enter(&msgmutex); MSG_PRINTF(("msgget(0x%x, 0%o)\n", key, msgflg)); if (key != IPC_PRIVATE) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msq = &msqs[msqid]; msqptr = &msq->msq_u; if (msqptr->msg_qbytes != 0 && msqptr->msg_perm._key == key) break; } if (msqid < msginfo.msgmni) { MSG_PRINTF(("found public key\n")); if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { MSG_PRINTF(("not exclusive\n")); error = EEXIST; goto unlock; } if ((error = ipcperm(cred, &msqptr->msg_perm, msgflg & 0700 ))) { MSG_PRINTF(("requester doesn't have 0%o access\n", msgflg & 0700)); goto unlock; } goto found; } } MSG_PRINTF(("need to allocate the msqid_ds\n")); if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. 
*/ msq = &msqs[msqid]; msqptr = &msq->msq_u; if (msqptr->msg_qbytes == 0 && (msqptr->msg_perm.mode & MSG_LOCKED) == 0) break; } if (msqid == msginfo.msgmni) { MSG_PRINTF(("no more msqid_ds's available\n")); error = ENOSPC; goto unlock; } MSG_PRINTF(("msqid %d is available\n", msqid)); msqptr->msg_perm._key = key; msqptr->msg_perm.cuid = kauth_cred_geteuid(cred); msqptr->msg_perm.uid = kauth_cred_geteuid(cred); msqptr->msg_perm.cgid = kauth_cred_getegid(cred); msqptr->msg_perm.gid = kauth_cred_getegid(cred); msqptr->msg_perm.mode = (msgflg & 0777); /* Make sure that the returned msqid is unique */ msqptr->msg_perm._seq++; msqptr->_msg_first = NULL; msqptr->_msg_last = NULL; msqptr->_msg_cbytes = 0; msqptr->msg_qnum = 0; msqptr->msg_qbytes = msginfo.msgmnb; msqptr->msg_lspid = 0; msqptr->msg_lrpid = 0; msqptr->msg_stime = 0; msqptr->msg_rtime = 0; msqptr->msg_ctime = time_second; } else { MSG_PRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto unlock; } found: /* Construct the unique msqid */ *retval = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm); unlock: mutex_exit(&msgmutex); return (error); } int sys_msgsnd(struct lwp *l, const struct sys_msgsnd_args *uap, register_t *retval) { /* { syscallarg(int) msqid; syscallarg(const void *) msgp; syscallarg(size_t) msgsz; syscallarg(int) msgflg; } */ return msgsnd1(l, SCARG(uap, msqid), SCARG(uap, msgp), SCARG(uap, msgsz), SCARG(uap, msgflg), sizeof(long), copyin); } int msgsnd1(struct lwp *l, int msqidr, const char *user_msgp, size_t msgsz, int msgflg, size_t typesz, copyin_t fetch_type) { int segs_needed, error = 0, msqid; kauth_cred_t cred = l->l_cred; struct msqid_ds *msqptr; struct __msg *msghdr; kmsq_t *msq; short next; MSG_PRINTF(("call to msgsnd(%d, %p, %lld, %d)\n", msqidr, user_msgp, (long long)msgsz, msgflg)); if ((ssize_t)msgsz < 0) return EINVAL; restart: msqid = IPCID_TO_IX(msqidr); mutex_enter(&msgmutex); /* In case of reallocation, we will wait for completion */ while (__predict_false(msg_realloc_state)) cv_wait(&msg_realloc_cv, &msgmutex); if (msqid < 0 || msqid >= msginfo.msgmni) { MSG_PRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqid, msginfo.msgmni)); error = EINVAL; goto unlock; } msq = &msqs[msqid]; msqptr = &msq->msq_u; if (msqptr->msg_qbytes == 0) { MSG_PRINTF(("no such message queue id\n")); error = EINVAL; goto unlock; } if (msqptr->msg_perm._seq != IPCID_TO_SEQ(msqidr)) { MSG_PRINTF(("wrong sequence number\n")); error = EINVAL; goto unlock; } if ((error = ipcperm(cred, &msqptr->msg_perm, IPC_W))) { MSG_PRINTF(("requester doesn't have write access\n")); goto unlock; } segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; MSG_PRINTF(("msgsz=%lld, msgssz=%d, segs_needed=%d\n", (long long)msgsz, msginfo.msgssz, segs_needed)); for (;;) { int need_more_resources = 0; /* * check msgsz [cannot be negative since it is unsigned] * (inside this loop in case msg_qbytes changes while we sleep) */ if (msgsz > msqptr->msg_qbytes) { MSG_PRINTF(("msgsz > msqptr->msg_qbytes\n")); error = EINVAL; goto unlock; } if (msqptr->msg_perm.mode & MSG_LOCKED) { MSG_PRINTF(("msqid is locked\n")); need_more_resources = 1; } if (msgsz + msqptr->_msg_cbytes > msqptr->msg_qbytes) { MSG_PRINTF(("msgsz + msg_cbytes > msg_qbytes\n")); need_more_resources = 1; } if (segs_needed > nfree_msgmaps) { MSG_PRINTF(("segs_needed > nfree_msgmaps\n")); need_more_resources = 1; } if (free_msghdrs == NULL) { MSG_PRINTF(("no more msghdrs\n")); need_more_resources = 1; } if (need_more_resources) { int we_own_it; if ((msgflg & 
IPC_NOWAIT) != 0) { MSG_PRINTF(("need more resources but caller " "doesn't want to wait\n")); error = EAGAIN; goto unlock; } if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) { MSG_PRINTF(("we don't own the msqid_ds\n")); we_own_it = 0; } else { /* Force later arrivals to wait for our request */ MSG_PRINTF(("we own the msqid_ds\n")); msqptr->msg_perm.mode |= MSG_LOCKED; we_own_it = 1; } msg_waiters++; MSG_PRINTF(("goodnight\n")); error = cv_wait_sig(&msq->msq_cv, &msgmutex); MSG_PRINTF(("good morning, error=%d\n", error)); msg_waiters--; if (we_own_it) msqptr->msg_perm.mode &= ~MSG_LOCKED; /* * In case of such state, notify reallocator and * restart the call. */ if (msg_realloc_state) { cv_broadcast(&msg_realloc_cv); mutex_exit(&msgmutex); goto restart; } if (error != 0) { MSG_PRINTF(("msgsnd: interrupted system " "call\n")); error = EINTR; goto unlock; } /* * Make sure that the msq queue still exists */ if (msqptr->msg_qbytes == 0) { MSG_PRINTF(("msqid deleted\n")); error = EIDRM; goto unlock; } } else { MSG_PRINTF(("got all the resources that we need\n")); break; } } /* * We have the resources that we need. * Make sure! */ KASSERT((msqptr->msg_perm.mode & MSG_LOCKED) == 0); KASSERT(segs_needed <= nfree_msgmaps); KASSERT(msgsz + msqptr->_msg_cbytes <= msqptr->msg_qbytes); KASSERT(free_msghdrs != NULL); /* * Re-lock the msqid_ds in case we page-fault when copying in the * message */ KASSERT((msqptr->msg_perm.mode & MSG_LOCKED) == 0); msqptr->msg_perm.mode |= MSG_LOCKED; /* * Allocate a message header */ msghdr = free_msghdrs; free_msghdrs = msghdr->msg_next; msghdr->msg_spot = -1; msghdr->msg_ts = msgsz; /* * Allocate space for the message */ while (segs_needed > 0) { KASSERT(nfree_msgmaps > 0); KASSERT(free_msgmaps != -1); KASSERT(free_msgmaps < msginfo.msgseg); next = free_msgmaps; MSG_PRINTF(("allocating segment %d to message\n", next)); free_msgmaps = msgmaps[next].next; nfree_msgmaps--; msgmaps[next].next = msghdr->msg_spot; msghdr->msg_spot = next; segs_needed--; } /* * Copy in the message type */ mutex_exit(&msgmutex); error = (*fetch_type)(user_msgp, &msghdr->msg_type, typesz); mutex_enter(&msgmutex); if (error != 0) { MSG_PRINTF(("error %d copying the message type\n", error)); msg_freehdr(msghdr); msqptr->msg_perm.mode &= ~MSG_LOCKED; cv_broadcast(&msq->msq_cv); goto unlock; } user_msgp += typesz; /* * Validate the message type */ if (msghdr->msg_type < 1) { msg_freehdr(msghdr); msqptr->msg_perm.mode &= ~MSG_LOCKED; cv_broadcast(&msq->msq_cv); MSG_PRINTF(("mtype (%ld) < 1\n", msghdr->msg_type)); error = EINVAL; goto unlock; } /* * Copy in the message body */ next = msghdr->msg_spot; while (msgsz > 0) { size_t tlen; KASSERT(next > -1); KASSERT(next < msginfo.msgseg); if (msgsz > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz; mutex_exit(&msgmutex); error = copyin(user_msgp, &msgpool[next * msginfo.msgssz], tlen); mutex_enter(&msgmutex); if (error != 0) { MSG_PRINTF(("error %d copying in message segment\n", error)); msg_freehdr(msghdr); msqptr->msg_perm.mode &= ~MSG_LOCKED; cv_broadcast(&msq->msq_cv); goto unlock; } msgsz -= tlen; user_msgp += tlen; next = msgmaps[next].next; } KASSERT(next == -1); /* * We've got the message. Unlock the msqid_ds. */ msqptr->msg_perm.mode &= ~MSG_LOCKED; /* * Make sure that the msqid_ds is still allocated. 
*/ if (msqptr->msg_qbytes == 0) { msg_freehdr(msghdr); cv_broadcast(&msq->msq_cv); error = EIDRM; goto unlock; } /* * Put the message into the queue */ if (msqptr->_msg_first == NULL) { msqptr->_msg_first = msghdr; msqptr->_msg_last = msghdr; } else { msqptr->_msg_last->msg_next = msghdr; msqptr->_msg_last = msghdr; } msqptr->_msg_last->msg_next = NULL; msqptr->_msg_cbytes += msghdr->msg_ts; msqptr->msg_qnum++; msqptr->msg_lspid = l->l_proc->p_pid; msqptr->msg_stime = time_second; cv_broadcast(&msq->msq_cv); unlock: mutex_exit(&msgmutex); return error; } int sys_msgrcv(struct lwp *l, const struct sys_msgrcv_args *uap, register_t *retval) { /* { syscallarg(int) msqid; syscallarg(void *) msgp; syscallarg(size_t) msgsz; syscallarg(long) msgtyp; syscallarg(int) msgflg; } */ return msgrcv1(l, SCARG(uap, msqid), SCARG(uap, msgp), SCARG(uap, msgsz), SCARG(uap, msgtyp), SCARG(uap, msgflg), sizeof(long), copyout, retval); } int msgrcv1(struct lwp *l, int msqidr, char *user_msgp, size_t msgsz, long msgtyp, int msgflg, size_t typesz, copyout_t put_type, register_t *retval) { size_t len; kauth_cred_t cred = l->l_cred; struct msqid_ds *msqptr; struct __msg *msghdr; int error = 0, msqid; kmsq_t *msq; short next; MSG_PRINTF(("call to msgrcv(%d, %p, %lld, %ld, %d)\n", msqidr, user_msgp, (long long)msgsz, msgtyp, msgflg)); if ((ssize_t)msgsz < 0) return EINVAL; restart: msqid = IPCID_TO_IX(msqidr); mutex_enter(&msgmutex); /* In case of reallocation, we will wait for completion */ while (__predict_false(msg_realloc_state)) cv_wait(&msg_realloc_cv, &msgmutex); if (msqid < 0 || msqid >= msginfo.msgmni) { MSG_PRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqid, msginfo.msgmni)); error = EINVAL; goto unlock; } msq = &msqs[msqid]; msqptr = &msq->msq_u; if (msqptr->msg_qbytes == 0) { MSG_PRINTF(("no such message queue id\n")); error = EINVAL; goto unlock; } if (msqptr->msg_perm._seq != IPCID_TO_SEQ(msqidr)) { MSG_PRINTF(("wrong sequence number\n")); error = EINVAL; goto unlock; } if ((error = ipcperm(cred, &msqptr->msg_perm, IPC_R))) { MSG_PRINTF(("requester doesn't have read access\n")); goto unlock; } msghdr = NULL; while (msghdr == NULL) { if (msgtyp == 0) { msghdr = msqptr->_msg_first; if (msghdr != NULL) { if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { MSG_PRINTF(("first msg on the queue " "is too big (want %lld, got %d)\n", (long long)msgsz, msghdr->msg_ts)); error = E2BIG; goto unlock; } if (msqptr->_msg_first == msqptr->_msg_last) { msqptr->_msg_first = NULL; msqptr->_msg_last = NULL; } else { msqptr->_msg_first = msghdr->msg_next; KASSERT(msqptr->_msg_first != NULL); } } } else { struct __msg *previous; struct __msg **prev; for (previous = NULL, prev = &msqptr->_msg_first; (msghdr = *prev) != NULL; previous = msghdr, prev = &msghdr->msg_next) { /* * Is this message's type an exact match or is * this message's type less than or equal to * the absolute value of a negative msgtyp? * Note that the second half of this test can * NEVER be true if msgtyp is positive since * msg_type is always positive! 
*/ if (msgtyp != msghdr->msg_type && msgtyp != LONG_MIN && msghdr->msg_type > -msgtyp) continue; MSG_PRINTF(("found message type %ld, requested %ld\n", msghdr->msg_type, msgtyp)); if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { MSG_PRINTF(("requested message on the queue " "is too big (want %lld, got %d)\n", (long long)msgsz, msghdr->msg_ts)); error = E2BIG; goto unlock; } *prev = msghdr->msg_next; if (msghdr != msqptr->_msg_last) break; if (previous == NULL) { KASSERT(prev == &msqptr->_msg_first); msqptr->_msg_first = NULL; msqptr->_msg_last = NULL; } else { KASSERT(prev != &msqptr->_msg_first); msqptr->_msg_last = previous; } break; } } /* * We've either extracted the msghdr for the appropriate * message or there isn't one. * If there is one then bail out of this loop. */ if (msghdr != NULL) break; /* * Hmph! No message found. Does the user want to wait? */ if ((msgflg & IPC_NOWAIT) != 0) { MSG_PRINTF(("no appropriate message found (msgtyp=%ld)\n", msgtyp)); error = ENOMSG; goto unlock; } /* * Wait for something to happen */ msg_waiters++; MSG_PRINTF(("msgrcv: goodnight\n")); error = cv_wait_sig(&msq->msq_cv, &msgmutex); MSG_PRINTF(("msgrcv: good morning (error=%d)\n", error)); msg_waiters--; /* * In case of such state, notify reallocator and * restart the call. */ if (msg_realloc_state) { cv_broadcast(&msg_realloc_cv); mutex_exit(&msgmutex); goto restart; } if (error != 0) { MSG_PRINTF(("msgsnd: interrupted system call\n")); error = EINTR; goto unlock; } /* * Make sure that the msq queue still exists */ if (msqptr->msg_qbytes == 0 || msqptr->msg_perm._seq != IPCID_TO_SEQ(msqidr)) { MSG_PRINTF(("msqid deleted\n")); error = EIDRM; goto unlock; } } /* * Return the message to the user. * * First, do the bookkeeping (before we risk being interrupted). */ msqptr->_msg_cbytes -= msghdr->msg_ts; msqptr->msg_qnum--; msqptr->msg_lrpid = l->l_proc->p_pid; msqptr->msg_rtime = time_second; /* * Make msgsz the actual amount that we'll be returning. * Note that this effectively truncates the message if it is too long * (since msgsz is never increased). */ MSG_PRINTF(("found a message, msgsz=%lld, msg_ts=%d\n", (long long)msgsz, msghdr->msg_ts)); if (msgsz > msghdr->msg_ts) msgsz = msghdr->msg_ts; /* * Return the type to the user. */ mutex_exit(&msgmutex); error = (*put_type)(&msghdr->msg_type, user_msgp, typesz); mutex_enter(&msgmutex); if (error != 0) { MSG_PRINTF(("error (%d) copying out message type\n", error)); msg_freehdr(msghdr); cv_broadcast(&msq->msq_cv); goto unlock; } user_msgp += typesz; /* * Return the segments to the user */ next = msghdr->msg_spot; for (len = 0; len < msgsz; len += msginfo.msgssz) { size_t tlen; KASSERT(next > -1); KASSERT(next < msginfo.msgseg); if (msgsz - len > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz - len; mutex_exit(&msgmutex); error = copyout(&msgpool[next * msginfo.msgssz], user_msgp, tlen); mutex_enter(&msgmutex); if (error != 0) { MSG_PRINTF(("error (%d) copying out message segment\n", error)); msg_freehdr(msghdr); cv_broadcast(&msq->msq_cv); goto unlock; } user_msgp += tlen; next = msgmaps[next].next; } /* * Done, return the actual number of bytes copied out. */ msg_freehdr(msghdr); cv_broadcast(&msq->msq_cv); *retval = msgsz; unlock: mutex_exit(&msgmutex); return error; } /* * Sysctl initialization and nodes. 
*/ static int sysctl_ipc_msgmni(SYSCTLFN_ARGS) { int newsize, error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = msginfo.msgmni; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; sysctl_unlock(); error = msgrealloc(newsize, msginfo.msgseg); sysctl_relock(); return error; } static int sysctl_ipc_msgseg(SYSCTLFN_ARGS) { int newsize, error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = msginfo.msgseg; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; sysctl_unlock(); error = msgrealloc(msginfo.msgmni, newsize); sysctl_relock(); return error; } SYSCTL_SETUP(sysctl_ipc_msg_setup, "sysctl kern.ipc subtree setup") { const struct sysctlnode *node = NULL; sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ipc", SYSCTL_DESCR("SysV IPC options"), NULL, 0, NULL, 0, CTL_KERN, KERN_SYSVIPC, CTL_EOL); if (node == NULL) return; sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "msgmni", SYSCTL_DESCR("Max number of message queue identifiers"), sysctl_ipc_msgmni, 0, &msginfo.msgmni, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "msgseg", SYSCTL_DESCR("Max number of message segments"), sysctl_ipc_msgseg, 0, &msginfo.msgseg, 0, CTL_CREATE, CTL_EOL); }
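/*
 * Illustrative user-space sketch (not part of the kernel sources): basic use
 * of the SVID message queue calls implemented above.  Error handling is
 * minimal; the struct layout and flags follow the standard msgget(2),
 * msgsnd(2), msgrcv(2) and msgctl(2) interfaces.
 */
#include <sys/ipc.h>
#include <sys/msg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct mymsg {
	long mtype;		/* must be > 0; see the check in msgsnd1() */
	char mtext[64];
};

int
main(void)
{
	struct mymsg m;
	int id;

	/* Create a private queue (allocates one msqid_ds in the kernel). */
	id = msgget(IPC_PRIVATE, IPC_CREAT | 0600);
	if (id == -1) {
		perror("msgget");
		return EXIT_FAILURE;
	}

	m.mtype = 1;
	strlcpy(m.mtext, "hello", sizeof(m.mtext));
	/* msgsz counts mtext only, not the leading mtype word. */
	if (msgsnd(id, &m, strlen(m.mtext) + 1, 0) == -1)
		perror("msgsnd");

	memset(&m, 0, sizeof(m));
	if (msgrcv(id, &m, sizeof(m.mtext), 0, 0) == -1)
		perror("msgrcv");
	else
		printf("received type %ld: %s\n", m.mtype, m.mtext);

	/* Remove the queue (the IPC_RMID path in msgctl1()). */
	if (msgctl(id, IPC_RMID, NULL) == -1)
		perror("msgctl");
	return EXIT_SUCCESS;
}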
/* $NetBSD: uipc_socket.c,v 1.309 2024/02/11 13:01:29 jdolecek Exp $ */ /* * Copyright (c) 2002, 2007, 2008, 2009, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004 Robert Watson * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95 */ /* * Socket operation routines. * * These routines are called by the routines in sys_socket.c or from a * system process, and implement the semantics of socket operations by * switching out to the protocol specific routines. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.309 2024/02/11 13:01:29 jdolecek Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_sock_counters.h" #include "opt_sosend_loan.h" #include "opt_mbuftrace.h" #include "opt_somaxkva.h" #include "opt_multiprocessor.h" /* XXX */ #include "opt_sctp.h" #include "opt_pipe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/kmem.h> #include <sys/mbuf.h> #include <sys/domain.h> #include <sys/kernel.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/signalvar.h> #include <sys/resourcevar.h> #include <sys/uidinfo.h> #include <sys/event.h> #include <sys/poll.h> #include <sys/kauth.h> #include <sys/mutex.h> #include <sys/condvar.h> #include <sys/kthread.h> #include <sys/compat_stub.h> #include <compat/sys/time.h> #include <compat/sys/socket.h> #include <uvm/uvm_extern.h> #include <uvm/uvm_loan.h> #include <uvm/uvm_page.h> #ifdef SCTP #include <netinet/sctp_route.h> #endif MALLOC_DEFINE(M_SONAME, "soname", "socket name"); extern const struct fileops socketops; static int sooptions; extern int somaxconn; /* patchable (XXX sysctl) */ int somaxconn = SOMAXCONN; kmutex_t *softnet_lock; #ifdef SOSEND_COUNTERS #include <sys/device.h> static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "loan big"); static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "copy big"); static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "copy small"); static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "kva limit"); #define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++ EVCNT_ATTACH_STATIC(sosend_loan_big); EVCNT_ATTACH_STATIC(sosend_copy_big); EVCNT_ATTACH_STATIC(sosend_copy_small); EVCNT_ATTACH_STATIC(sosend_kvalimit); #else #define SOSEND_COUNTER_INCR(ev) /* nothing */ #endif /* SOSEND_COUNTERS */ #if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR) int sock_loan_thresh = -1; #else int sock_loan_thresh = 4096; #endif static kmutex_t so_pendfree_lock; static struct mbuf *so_pendfree = NULL; #ifndef SOMAXKVA #define SOMAXKVA (16 * 1024 * 1024) #endif int somaxkva = SOMAXKVA; static int socurkva; static kcondvar_t socurkva_cv; #ifndef SOFIXEDBUF #define SOFIXEDBUF true #endif bool sofixedbuf = SOFIXEDBUF; static kauth_listener_t socket_listener; #define SOCK_LOAN_CHUNK 65536 static void sopendfree_thread(void *); static kcondvar_t pendfree_thread_cv; static lwp_t *sopendfree_lwp; static void sysctl_kern_socket_setup(void); static struct sysctllog *socket_sysctllog; static vsize_t sokvareserve(struct socket *so, vsize_t len) { int error; mutex_enter(&so_pendfree_lock); while (socurkva + len > somaxkva) { SOSEND_COUNTER_INCR(&sosend_kvalimit); error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock); if (error) { len = 0; break; } } socurkva += len; mutex_exit(&so_pendfree_lock); return len; } static void sokvaunreserve(vsize_t len) { mutex_enter(&so_pendfree_lock); socurkva -= len; cv_broadcast(&socurkva_cv); mutex_exit(&so_pendfree_lock); } /* * sokvaalloc: allocate kva for loan. 
*/ vaddr_t sokvaalloc(vaddr_t sva, vsize_t len, struct socket *so) { vaddr_t lva; if (sokvareserve(so, len) == 0) return 0; lva = uvm_km_alloc(kernel_map, len, atop(sva) & uvmexp.colormask, UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA); if (lva == 0) { sokvaunreserve(len); return 0; } return lva; } /* * sokvafree: free kva for loan. */ void sokvafree(vaddr_t sva, vsize_t len) { uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY); sokvaunreserve(len); } static void sodoloanfree(struct vm_page **pgs, void *buf, size_t size) { vaddr_t sva, eva; vsize_t len; int npgs; KASSERT(pgs != NULL); eva = round_page((vaddr_t) buf + size); sva = trunc_page((vaddr_t) buf); len = eva - sva; npgs = len >> PAGE_SHIFT; pmap_kremove(sva, len); pmap_update(pmap_kernel()); uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); sokvafree(sva, len); } /* * sopendfree_thread: free mbufs on "pendfree" list. Unlock and relock * so_pendfree_lock when freeing mbufs. */ static void sopendfree_thread(void *v) { struct mbuf *m, *next; size_t rv; mutex_enter(&so_pendfree_lock); for (;;) { rv = 0; while (so_pendfree != NULL) { m = so_pendfree; so_pendfree = NULL; mutex_exit(&so_pendfree_lock); for (; m != NULL; m = next) { next = m->m_next; KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); KASSERT(m->m_ext.ext_refcnt == 0); rv += m->m_ext.ext_size; sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, m->m_ext.ext_size); pool_cache_put(mb_cache, m); } mutex_enter(&so_pendfree_lock); } if (rv) cv_broadcast(&socurkva_cv); cv_wait(&pendfree_thread_cv, &so_pendfree_lock); } panic("sopendfree_thread"); /* NOTREACHED */ } void soloanfree(struct mbuf *m, void *buf, size_t size, void *arg) { KASSERT(m != NULL); /* * postpone freeing mbuf. * * we can't do it in interrupt context * because we need to put kva back to kernel_map. 
*/ mutex_enter(&so_pendfree_lock); m->m_next = so_pendfree; so_pendfree = m; cv_signal(&pendfree_thread_cv); mutex_exit(&so_pendfree_lock); } static long sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space) { struct iovec *iov = uio->uio_iov; vaddr_t sva, eva; vsize_t len; vaddr_t lva; int npgs, error; vaddr_t va; int i; if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) return 0; if (iov->iov_len < (size_t) space) space = iov->iov_len; if (space > SOCK_LOAN_CHUNK) space = SOCK_LOAN_CHUNK; eva = round_page((vaddr_t) iov->iov_base + space); sva = trunc_page((vaddr_t) iov->iov_base); len = eva - sva; npgs = len >> PAGE_SHIFT; KASSERT(npgs <= M_EXT_MAXPAGES); lva = sokvaalloc(sva, len, so); if (lva == 0) return 0; error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len, m->m_ext.ext_pgs, UVM_LOAN_TOPAGE); if (error) { sokvafree(lva, len); return 0; } for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]), VM_PROT_READ, 0); pmap_update(pmap_kernel()); lva += (vaddr_t) iov->iov_base & PAGE_MASK; MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so); m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP; uio->uio_resid -= space; /* uio_offset not updated, not set/used for write(2) */ uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space; uio->uio_iov->iov_len -= space; if (uio->uio_iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; } return space; } static int socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; enum kauth_network_req req; result = KAUTH_RESULT_DEFER; req = (enum kauth_network_req)(uintptr_t)arg0; if ((action != KAUTH_NETWORK_SOCKET) && (action != KAUTH_NETWORK_BIND)) return result; switch (req) { case KAUTH_REQ_NETWORK_BIND_PORT: result = KAUTH_RESULT_ALLOW; break; case KAUTH_REQ_NETWORK_SOCKET_DROP: { /* Normal users can only drop their own connections. */ struct socket *so = (struct socket *)arg1; if (so->so_cred && proc_uidmatch(cred, so->so_cred) == 0) result = KAUTH_RESULT_ALLOW; break; } case KAUTH_REQ_NETWORK_SOCKET_OPEN: /* We allow "raw" routing/bluetooth sockets to anyone. */ switch ((u_long)arg1) { case PF_ROUTE: case PF_OROUTE: case PF_BLUETOOTH: case PF_CAN: result = KAUTH_RESULT_ALLOW; break; default: /* Privileged, let secmodel handle this. */ if ((u_long)arg2 == SOCK_RAW) break; result = KAUTH_RESULT_ALLOW; break; } break; case KAUTH_REQ_NETWORK_SOCKET_CANSEE: result = KAUTH_RESULT_ALLOW; break; default: break; } return result; } void soinit(void) { sysctl_kern_socket_setup(); #ifdef SCTP /* Update the SCTP function hooks if necessary*/ vec_sctp_add_ip_address = sctp_add_ip_address; vec_sctp_delete_ip_address = sctp_delete_ip_address; #endif mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM); softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); cv_init(&socurkva_cv, "sokva"); cv_init(&pendfree_thread_cv, "sopendfr"); soinit2(); /* Set the initial adjusted socket buffer size. */ if (sb_max_set(sb_max)) panic("bad initial sb_max value: %lu", sb_max); socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK, socket_listener_cb, NULL); } void soinit1(void) { int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree"); if (error) panic("soinit1 %d", error); } /* * socreate: create a new socket of the specified type and the protocol. * * => Caller may specify another socket for lock sharing (must not be held). * => Returns the new socket without lock held. 
*/ int socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l, struct socket *lockso) { const struct protosw *prp; struct socket *so; uid_t uid; int error; kmutex_t *lock; error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type), KAUTH_ARG(proto)); if (error != 0) return error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL) { /* no support for domain */ if (pffinddomain(dom) == 0) return EAFNOSUPPORT; /* no support for socket type */ if (proto == 0 && type != 0) return EPROTOTYPE; return EPROTONOSUPPORT; } if (prp->pr_usrreqs == NULL) return EPROTONOSUPPORT; if (prp->pr_type != type) return EPROTOTYPE; so = soget(true); so->so_type = type; so->so_proto = prp; so->so_send = sosend; so->so_receive = soreceive; so->so_options = sooptions; #ifdef MBUFTRACE so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner; so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; so->so_mowner = &prp->pr_domain->dom_mowner; #endif uid = kauth_cred_geteuid(l->l_cred); so->so_uidinfo = uid_find(uid); so->so_egid = kauth_cred_getegid(l->l_cred); so->so_cpid = l->l_proc->p_pid; /* * Lock assigned and taken during PCB attach, unless we share * the lock with another socket, e.g. socketpair(2) case. */ if (lockso) { /* * lockso->so_lock should be stable at this point, so * no need for atomic_load_*. */ lock = lockso->so_lock; so->so_lock = lock; mutex_obj_hold(lock); mutex_enter(lock); } /* Attach the PCB (returns with the socket lock held). */ error = (*prp->pr_usrreqs->pr_attach)(so, proto); KASSERT(solocked(so)); if (error) { KASSERT(so->so_pcb == NULL); so->so_state |= SS_NOFDREF; sofree(so); return error; } so->so_cred = kauth_cred_hold(l->l_cred); sounlock(so); *aso = so; return 0; } /* * fsocreate: create a socket and a file descriptor associated with it. * Returns the allocated file structure in *fpp, but the descriptor * is not visible yet for the process. * Caller is responsible for calling fd_affix() for the returned *fpp once * it's socket initialization is finished successfully, or fd_abort() if it's * initialization fails. * * * => On success, write file descriptor to *fdout and *fpp and return zero. * => On failure, return non-zero; *fdout and *fpp will be undefined. */ int fsocreate(int domain, struct socket **sop, int type, int proto, int *fdout, file_t **fpp, struct socket *lockso) { lwp_t *l = curlwp; int error, fd, flags; struct socket *so; file_t *fp; flags = type & SOCK_FLAGS_MASK; type &= ~SOCK_FLAGS_MASK; error = socreate(domain, &so, type, proto, l, lockso); if (error) { return error; } if ((error = fd_allocfile(&fp, &fd)) != 0) { soclose(so); return error; } fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0); fp->f_flag = FREAD|FWRITE|((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0)| ((flags & SOCK_NOSIGPIPE) ? 
FNOSIGPIPE : 0); fp->f_type = DTYPE_SOCKET; fp->f_ops = &socketops; if (flags & SOCK_NONBLOCK) { so->so_state |= SS_NBIO; } fp->f_socket = so; if (sop != NULL) { *sop = so; } *fdout = fd; *fpp = fp; return error; } int sofamily(const struct socket *so) { const struct protosw *pr; const struct domain *dom; if ((pr = so->so_proto) == NULL) return AF_UNSPEC; if ((dom = pr->pr_domain) == NULL) return AF_UNSPEC; return dom->dom_family; } int sobind(struct socket *so, struct sockaddr *nam, struct lwp *l) { int error; solock(so); if (nam->sa_family != so->so_proto->pr_domain->dom_family) { sounlock(so); return EAFNOSUPPORT; } error = (*so->so_proto->pr_usrreqs->pr_bind)(so, nam, l); sounlock(so); return error; } int solisten(struct socket *so, int backlog, struct lwp *l) { int error; short oldopt, oldqlimit; solock(so); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) != 0) { sounlock(so); return EINVAL; } oldopt = so->so_options; oldqlimit = so->so_qlimit; if (TAILQ_EMPTY(&so->so_q)) so->so_options |= SO_ACCEPTCONN; if (backlog < 0) backlog = 0; so->so_qlimit = uimin(backlog, somaxconn); error = (*so->so_proto->pr_usrreqs->pr_listen)(so, l); if (error != 0) { so->so_options = oldopt; so->so_qlimit = oldqlimit; sounlock(so); return error; } sounlock(so); return 0; } void sofree(struct socket *so) { u_int refs; KASSERT(solocked(so)); if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { sounlock(so); return; } if (so->so_head) { /* * We must not decommission a socket that's on the accept(2) * queue. If we do, then accept(2) may hang after select(2) * indicated that the listening socket was ready. */ if (!soqremque(so, 0)) { sounlock(so); return; } } if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sbrelease(&so->so_snd, so); KASSERT(!cv_has_waiters(&so->so_cv)); KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv)); KASSERT(!cv_has_waiters(&so->so_snd.sb_cv)); sorflush(so); refs = so->so_aborting; /* XXX */ /* Remove accept filter if one is present. */ if (so->so_accf != NULL) (void)accept_filt_clear(so); sounlock(so); if (refs == 0) /* XXX */ soput(so); } /* * soclose: close a socket on last file table reference removal. * Initiate disconnect if connected. Free socket when disconnect complete. */ int soclose(struct socket *so) { struct socket *so2; int error = 0; solock(so); if (so->so_options & SO_ACCEPTCONN) { for (;;) { if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { KASSERT(solocked2(so, so2)); (void) soqremque(so2, 0); /* soabort drops the lock. */ (void) soabort(so2); solock(so); continue; } if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { KASSERT(solocked2(so, so2)); (void) soqremque(so2, 1); /* soabort drops the lock. 
*/ (void) soabort(so2); solock(so); continue; } break; } } if (so->so_pcb == NULL) goto discard; if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) goto drop; } if (so->so_options & SO_LINGER) { if ((so->so_state & (SS_ISDISCONNECTING|SS_NBIO)) == (SS_ISDISCONNECTING|SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = sowait(so, true, so->so_linger * hz); if (error) break; } } } drop: if (so->so_pcb) { KASSERT(solocked(so)); (*so->so_proto->pr_usrreqs->pr_detach)(so); } discard: KASSERT((so->so_state & SS_NOFDREF) == 0); kauth_cred_free(so->so_cred); so->so_cred = NULL; so->so_state |= SS_NOFDREF; sofree(so); return error; } /* * Must be called with the socket locked.. Will return with it unlocked. */ int soabort(struct socket *so) { u_int refs; int error; KASSERT(solocked(so)); KASSERT(so->so_head == NULL); so->so_aborting++; /* XXX */ error = (*so->so_proto->pr_usrreqs->pr_abort)(so); refs = --so->so_aborting; /* XXX */ if (error || (refs == 0)) { sofree(so); } else { sounlock(so); } return error; } int soaccept(struct socket *so, struct sockaddr *nam) { int error; KASSERT(solocked(so)); KASSERT((so->so_state & SS_NOFDREF) != 0); so->so_state &= ~SS_NOFDREF; if ((so->so_state & SS_ISDISCONNECTED) == 0 || (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) error = (*so->so_proto->pr_usrreqs->pr_accept)(so, nam); else error = ECONNABORTED; return error; } int soconnect(struct socket *so, struct sockaddr *nam, struct lwp *l) { int error; KASSERT(solocked(so)); if (so->so_options & SO_ACCEPTCONN) return EOPNOTSUPP; /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. * This allows user to disconnect by connecting to, e.g., * a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { if (nam->sa_family != so->so_proto->pr_domain->dom_family) { return EAFNOSUPPORT; } error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l); } return error; } int soconnect2(struct socket *so1, struct socket *so2) { KASSERT(solocked2(so1, so2)); return (*so1->so_proto->pr_usrreqs->pr_connect2)(so1, so2); } int sodisconnect(struct socket *so) { int error; KASSERT(solocked(so)); if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; } else if (so->so_state & SS_ISDISCONNECTING) { error = EALREADY; } else { error = (*so->so_proto->pr_usrreqs->pr_disconnect)(so); } return error; } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) /* * Send on a socket. * If send must go all at once and message is larger than * send buffering, then hard error. * Lock against other senders. * If must go all at once and not enough room now, then * inform user that this would block and do nothing. * Otherwise, if nonblocking, send as much as possible. * The data to be sent is described by "uio" if nonzero, * otherwise by the mbuf chain "top" (which must be null * if uio is not). Data provided in mbuf chain must be small * enough to send all at once. * * Returns nonzero on error, timeout or signal; callers * must check for short counts if EINTR/ERESTART are returned. * Data and control buffers are freed on return. 
*/ int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct lwp *l) { struct mbuf **mp, *m; long space, len, resid, clen, mlen; int error, s, dontroute, atomic; short wakeup_state = 0; clen = 0; /* * solock() provides atomicity of access. splsoftnet() prevents * protocol processing soft interrupts from interrupting us and * blocking (expensive). */ s = splsoftnet(); solock(so); atomic = sosendallatonce(so) || top; if (uio) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. * However, space must be signed, as it might be less than 0 * if we over-committed, and we must use a signed comparison * of space and resid. On the other hand, a negative resid * causes us to loop sending 0-length segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); l->l_ru.ru_msgsnd++; if (control) clen = control->m_len; restart: if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) goto out; do { if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; if ((flags & MSG_PEEK) == 0) so->so_error = 0; goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if (resid || clen == 0) { error = ENOTCONN; goto release; } } else if (addr == NULL) { error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { error = EWOULDBLOCK; goto release; } sbunlock(&so->so_snd); if (wakeup_state & SS_RESTARTSYS) { error = ERESTART; goto out; } error = sbwait(&so->so_snd); if (error) goto out; wakeup_state = so->so_state; goto restart; } wakeup_state = 0; mp = &top; space -= clen; do { if (uio == NULL) { /* * Data is prepackaged in "top". */ resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else do { sounlock(so); splx(s); if (top == NULL) { m = m_gethdr(M_WAIT, MT_DATA); mlen = MHLEN; m->m_pkthdr.len = 0; m_reset_rcvif(m); } else { m = m_get(M_WAIT, MT_DATA); mlen = MLEN; } MCLAIM(m, so->so_snd.sb_mowner); if (sock_loan_thresh >= 0 && uio->uio_iov->iov_len >= sock_loan_thresh && space >= sock_loan_thresh && (len = sosend_loan(so, uio, m, space)) != 0) { SOSEND_COUNTER_INCR(&sosend_loan_big); space -= len; goto have_data; } if (resid >= MINCLSIZE && space >= MCLBYTES) { SOSEND_COUNTER_INCR(&sosend_copy_big); m_clget(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) goto nopages; mlen = MCLBYTES; if (atomic && top == 0) { len = lmin(MCLBYTES - max_hdr, resid); m->m_data += max_hdr; } else len = lmin(MCLBYTES, resid); space -= len; } else { nopages: SOSEND_COUNTER_INCR(&sosend_copy_small); len = lmin(lmin(mlen, resid), space); space -= len; /* * For datagram protocols, leave room * for protocol headers in first mbuf. 
*/ if (atomic && top == 0 && len < mlen) m_align(m, len); } error = uiomove(mtod(m, void *), (int)len, uio); have_data: resid = uio->uio_resid; m->m_len = len; *mp = m; top->m_pkthdr.len += len; s = splsoftnet(); solock(so); if (error != 0) goto release; mp = &m->m_next; if (resid <= 0) { if (flags & MSG_EOR) top->m_flags |= M_EOR; break; } } while (space > 0 && atomic); if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; goto release; } if (dontroute) so->so_options |= SO_DONTROUTE; if (resid > 0) so->so_state |= SS_MORETOCOME; if (flags & MSG_OOB) { error = (*so->so_proto->pr_usrreqs->pr_sendoob)( so, top, control); } else { error = (*so->so_proto->pr_usrreqs->pr_send)(so, top, addr, control, l); } if (dontroute) so->so_options &= ~SO_DONTROUTE; if (resid > 0) so->so_state &= ~SS_MORETOCOME; clen = 0; control = NULL; top = NULL; mp = &top; if (error != 0) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: sounlock(so); splx(s); if (top) m_freem(top); if (control) m_freem(control); return error; } /* * Following replacement or removal of the first mbuf on the first * mbuf chain of a socket buffer, push necessary state changes back * into the socket buffer so that other consumers see the values * consistently. 'nextrecord' is the caller's locally stored value of * the original value of sb->sb_mb->m_nextpkt which must be restored * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. */ static void sbsync(struct sockbuf *sb, struct mbuf *nextrecord) { KASSERT(solocked(sb->sb_so)); /* * First, update for the new value of nextrecord. If necessary, * make it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect * the new state. This is an inline of SB_EMPTY_FIXUP, with * the addition of a second clause that takes care of the * case where sb_mb has been updated, but remains the last * record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. * * We depend on the way that records are added to the sockbuf by sbappend*. In * particular, each record (mbufs linked through m_next) must begin with an * address if the protocol so specifies, followed by an optional mbuf or mbufs * containing ancillary data, and then zero or more mbufs of data. * * In order to avoid blocking network interrupts for the entire time here, we * splx() while doing the actual copy to user space. Although the sockbuf is * locked, new data may still be appended, and thus we must maintain * consistency of the sockbuf during that time. * * The caller may receive the data as a single mbuf chain by supplying an mbuf * **mp0 for use in returning the chain. The uio is then used only for the * count in uio_resid. 
*/ int soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct lwp *l = curlwp; struct mbuf *m, **mp, *mt; size_t len, offset, moff, orig_resid; int atomic, flags, error, s, type; const struct protosw *pr; struct mbuf *nextrecord; int mbuf_removed = 0; const struct domain *dom; short wakeup_state = 0; pr = so->so_proto; atomic = pr->pr_flags & PR_ATOMIC; dom = pr->pr_domain; mp = mp0; type = 0; orig_resid = uio->uio_resid; if (paddr != NULL) *paddr = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); solock(so); error = (*pr->pr_usrreqs->pr_recvoob)(so, m, flags & MSG_PEEK); sounlock(so); if (error) goto bad; do { error = uiomove(mtod(m, void *), MIN(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid > 0 && error == 0 && m); bad: if (m != NULL) m_freem(m); return error; } if (mp != NULL) *mp = NULL; /* * solock() provides atomicity of access. splsoftnet() prevents * protocol processing soft interrupts from interrupting us and * blocking (expensive). */ s = splsoftnet(); solock(so); restart: if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) { sounlock(so); splx(s); return error; } m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more * (subject to any timeout) if: * 1. the current count is less than the low water mark, * 2. MSG_WAITALL is set, and it is possible to do the entire * receive operation at once if we block (resid <= hiwat), or * 3. MSG_DONTWAIT is not set. * If MSG_WAITALL is set but resid is larger than the receive buffer, * we have to do the receive in sections, and thus risk returning * a short count if a timeout or signal occurs after we start. */ if (m == NULL || ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio->uio_resid && (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && m->m_nextpkt == NULL && !atomic)) { #ifdef DIAGNOSTIC if (m == NULL && so->so_rcv.sb_cc) panic("receive 1"); #endif if (so->so_error || so->so_rerror) { u_short *e; if (m != NULL) goto dontblock; e = so->so_error ? &so->so_error : &so->so_rerror; error = *e; if ((flags & MSG_PEEK) == 0) *e = 0; goto release; } if (so->so_state & SS_CANTRCVMORE) { if (m != NULL) goto dontblock; else goto release; } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { error = ENOTCONN; goto release; } if (uio->uio_resid == 0) goto release; if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(&so->so_rcv); if (wakeup_state & SS_RESTARTSYS) error = ERESTART; else error = sbwait(&so->so_rcv); if (error != 0) { sounlock(so); splx(s); return error; } wakeup_state = so->so_state; goto restart; } dontblock: /* * On entry here, m points to the first record of the socket buffer. * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. 
We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket lock, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ if (l != NULL) l->l_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb); SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME); orig_resid = 0; if (flags & MSG_PEEK) { if (paddr) *paddr = m_copym(m, 0, m->m_len, M_DONTWAIT); m = m->m_next; } else { sbfree(&so->so_rcv, m); mbuf_removed = 1; if (paddr != NULL) { *paddr = m; so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; m = so->so_rcv.sb_mb; } else { m = so->so_rcv.sb_mb = m_free(m); } sbsync(&so->so_rcv, nextrecord); } } if (pr->pr_flags & PR_ADDR_OPT) { /* * For SCTP we may be getting a whole message OR a partial * delivery. */ if (m->m_type == MT_SONAME) { orig_resid = 0; if (flags & MSG_PEEK) { if (paddr) *paddr = m_copym(m, 0, m->m_len, M_DONTWAIT); m = m->m_next; } else { sbfree(&so->so_rcv, m); mbuf_removed = 1; if (paddr) { *paddr = m; so->so_rcv.sb_mb = m->m_next; m->m_next = 0; m = so->so_rcv.sb_mb; } else { m = so->so_rcv.sb_mb = m_free(m); } sbsync(&so->so_rcv, nextrecord); } } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_DONTWAIT); controlp = (*controlp == NULL ? NULL : &(*controlp)->m_next); } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sbsync(&so->so_rcv, nextrecord); for (; cm != NULL; cm = cmn) { cmn = cm->m_next; cm->m_next = NULL; type = mtod(cm, struct cmsghdr *)->cmsg_type; if (controlp != NULL) { if (dom->dom_externalize != NULL && type == SCM_RIGHTS) { sounlock(so); splx(s); error = (*dom->dom_externalize)(cm, l, (flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0); s = splsoftnet(); solock(so); } *controlp = cm; while (*controlp != NULL) controlp = &(*controlp)->m_next; } else { /* * Dispose of any SCM_RIGHTS message that went * through the read path rather than recv. */ if (dom->dom_dispose != NULL && type == SCM_RIGHTS) { sounlock(so); (*dom->dom_dispose)(cm); solock(so); } m_freem(cm); } } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } /* If m is non-NULL, we have some data to read. 
*/ if (__predict_true(m != NULL)) { type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); moff = 0; offset = 0; while (m != NULL && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed, end the receive * operation and do a short read. */ if (m->m_type == MT_OOBDATA) { if (type != MT_OOBDATA) break; } else if (type == MT_OOBDATA) { break; } else if (m->m_type == MT_CONTROL) { break; } #ifdef DIAGNOSTIC else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) { panic("%s: m_type=%d", __func__, m->m_type); } #endif so->so_state &= ~SS_RCVATMARK; wakeup_state = 0; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. * Otherwise copy them out via the uio, then free. * Sockbuf must be consistent here (points to current mbuf, * it points to next record) when we drop priority; * we must note any additions to the sockbuf when we * block interrupts again. */ if (mp == NULL) { SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); sounlock(so); splx(s); error = uiomove(mtod(m, char *) + moff, len, uio); s = splsoftnet(); solock(so); if (error != 0) { /* * If any part of the record has been removed * (such as the MT_SONAME mbuf, which will * happen when PR_ADDR, and thus also * PR_ATOMIC, is set), then drop the entire * record to maintain the atomicity of the * receive operation. * * This avoids a later panic("receive 1a") * when compiled with DIAGNOSTIC. */ if (m && mbuf_removed && atomic) (void) sbdroprecord(&so->so_rcv); goto release; } } else { uio->uio_resid -= len; } if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; #ifdef SCTP if (m->m_flags & M_NOTIFICATION) flags |= MSG_NOTIFICATION; #endif if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp) { *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { m = so->so_rcv.sb_mb = m_free(m); } /* * If m != NULL, we also know that * so->so_rcv.sb_mb != NULL. */ KASSERT(so->so_rcv.sb_mb == m); if (m) { m->m_nextpkt = nextrecord; if (nextrecord == NULL) so->so_rcv.sb_lastrecord = m; } else { so->so_rcv.sb_mb = nextrecord; SB_EMPTY_FIXUP(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); } } else if (flags & MSG_PEEK) { moff += len; } else { if (mp != NULL) { mt = m_copym(m, 0, len, M_NOWAIT); if (__predict_false(mt == NULL)) { sounlock(so); mt = m_copym(m, 0, len, M_WAIT); solock(so); } *mp = mt; } m->m_data += len; m->m_len -= len; so->so_rcv.sb_cc -= len; } if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_state |= SS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } else { so->so_state &= ~SS_POLLRDBAND; } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), * we must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return * with a short count but without error. * Keep sockbuf locked against other readers. 
*/ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && !nextrecord) { if (so->so_error || so->so_rerror || so->so_state & SS_CANTRCVMORE) break; /* * If we are peeking and the socket receive buffer is * full, stop since we can't get more data to peek at. */ if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0) break; /* * If we've drained the socket buffer, tell the * protocol in case it needs to do something to * get it filled again. */ if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb) (*pr->pr_usrreqs->pr_rcvd)(so, flags, l); SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); if (wakeup_state & SS_RESTARTSYS) error = ERESTART; else error = sbwait(&so->so_rcv); if (error != 0) { sbunlock(&so->so_rcv); sounlock(so); splx(s); return 0; } if ((m = so->so_rcv.sb_mb) != NULL) nextrecord = m->m_nextpkt; wakeup_state = so->so_state; } } if (m && atomic) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) (*pr->pr_usrreqs->pr_rcvd)(so, flags, l); } if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { sbunlock(&so->so_rcv); goto restart; } if (flagsp != NULL) *flagsp |= flags; release: sbunlock(&so->so_rcv); sounlock(so); splx(s); return error; } int soshutdown(struct socket *so, int how) { const struct protosw *pr; int error; KASSERT(solocked(so)); pr = so->so_proto; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return EINVAL; if (how == SHUT_RD || how == SHUT_RDWR) { sorflush(so); error = 0; } if (how == SHUT_WR || how == SHUT_RDWR) error = (*pr->pr_usrreqs->pr_shutdown)(so); return error; } void sorestart(struct socket *so) { /* * An application has called close() on an fd on which another * of its threads has called a socket system call. * Mark this and wake everyone up, and code that would block again * instead returns ERESTART. * On system call re-entry the fd is validated and EBADF returned. * Any other fd will block again on the 2nd syscall. */ solock(so); so->so_state |= SS_RESTARTSYS; cv_broadcast(&so->so_cv); cv_broadcast(&so->so_snd.sb_cv); cv_broadcast(&so->so_rcv.sb_cv); sounlock(so); } void sorflush(struct socket *so) { struct sockbuf *sb, asb; const struct protosw *pr; KASSERT(solocked(so)); sb = &so->so_rcv; pr = so->so_proto; socantrcvmore(so); sb->sb_flags |= SB_NOINTR; (void )sblock(sb, M_WAITOK); sbunlock(sb); asb = *sb; /* * Clear most of the sockbuf structure, but leave some of the * fields valid. 
*/ memset(&sb->sb_startzero, 0, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) { sounlock(so); (*pr->pr_domain->dom_dispose)(asb.sb_mb); solock(so); } sbrelease(&asb, so); } /* * internal set SOL_SOCKET options */ static int sosetopt1(struct socket *so, const struct sockopt *sopt) { int error, opt; int optval = 0; /* XXX: gcc */ struct linger l; struct timeval tv; opt = sopt->sopt_name; switch (opt) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); KASSERT(solocked(so)); break; case SO_LINGER: error = sockopt_get(sopt, &l, sizeof(l)); solock(so); if (error) break; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; break; } so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_NOSIGPIPE: case SO_RERROR: error = sockopt_getint(sopt, &optval); solock(so); if (error) break; if (optval) so->so_options |= opt; else so->so_options &= ~opt; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sockopt_getint(sopt, &optval); solock(so); if (error) break; /* * Values < 1 make no sense for any of these * options, so disallow them. */ if (optval < 1) { error = EINVAL; break; } switch (opt) { case SO_SNDBUF: if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) { error = ENOBUFS; break; } if (sofixedbuf) so->so_snd.sb_flags &= ~SB_AUTOSIZE; break; case SO_RCVBUF: if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) { error = ENOBUFS; break; } if (sofixedbuf) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; break; /* * Make sure the low-water is never greater than * the high-water. 
*/ case SO_SNDLOWAT: if (optval > so->so_snd.sb_hiwat) optval = so->so_snd.sb_hiwat; so->so_snd.sb_lowat = optval; break; case SO_RCVLOWAT: if (optval > so->so_rcv.sb_hiwat) optval = so->so_rcv.sb_hiwat; so->so_rcv.sb_lowat = optval; break; } break; case SO_SNDTIMEO: case SO_RCVTIMEO: solock(so); error = sockopt_get(sopt, &tv, sizeof(tv)); if (error) break; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; break; } if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) { error = EDOM; break; } optval = tv.tv_sec * hz + tv.tv_usec / tick; if (optval == 0 && tv.tv_usec != 0) optval = 1; switch (opt) { case SO_SNDTIMEO: so->so_snd.sb_timeo = optval; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = optval; break; } break; default: MODULE_HOOK_CALL(uipc_socket_50_setopt1_hook, (opt, so, sopt), enosys(), error); if (error == ENOSYS || error == EPASSTHROUGH) { solock(so); error = ENOPROTOOPT; } break; } KASSERT(solocked(so)); return error; } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, prerr; if (sopt->sopt_level == SOL_SOCKET) { error = sosetopt1(so, sopt); KASSERT(solocked(so)); } else { error = ENOPROTOOPT; solock(so); } if ((error == 0 || error == ENOPROTOOPT) && so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { /* give the protocol stack a shot */ prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt); if (prerr == 0) error = 0; else if (prerr != ENOPROTOOPT) error = prerr; } sounlock(so); return error; } /* * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt() */ int so_setsockopt(struct lwp *l, struct socket *so, int level, int name, const void *val, size_t valsize) { struct sockopt sopt; int error; KASSERT(valsize == 0 || val != NULL); sockopt_init(&sopt, level, name, valsize); sockopt_set(&sopt, val, valsize); error = sosetopt(so, &sopt); sockopt_destroy(&sopt); return error; } /* * internal get SOL_SOCKET options */ static int sogetopt1(struct socket *so, struct sockopt *sopt) { int error, optval, opt; struct linger l; struct timeval tv; switch ((opt = sopt->sopt_name)) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0; l.l_linger = so->so_linger; error = sockopt_set(sopt, &l, sizeof(l)); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_BROADCAST: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_NOSIGPIPE: case SO_RERROR: case SO_ACCEPTCONN: error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0); break; case SO_TYPE: error = sockopt_setint(sopt, so->so_type); break; case SO_ERROR: if (so->so_error == 0) { so->so_error = so->so_rerror; so->so_rerror = 0; } error = sockopt_setint(sopt, so->so_error); so->so_error = 0; break; case SO_SNDBUF: error = sockopt_setint(sopt, so->so_snd.sb_hiwat); break; case SO_RCVBUF: error = sockopt_setint(sopt, so->so_rcv.sb_hiwat); break; case SO_SNDLOWAT: error = sockopt_setint(sopt, so->so_snd.sb_lowat); break; case SO_RCVLOWAT: error = sockopt_setint(sopt, so->so_rcv.sb_lowat); break; case SO_SNDTIMEO: case SO_RCVTIMEO: optval = (opt == SO_SNDTIMEO ? 
so->so_snd.sb_timeo : so->so_rcv.sb_timeo); memset(&tv, 0, sizeof(tv)); tv.tv_sec = optval / hz; tv.tv_usec = (optval % hz) * tick; error = sockopt_set(sopt, &tv, sizeof(tv)); break; case SO_OVERFLOWED: error = sockopt_setint(sopt, so->so_rcv.sb_overflowed); break; default: MODULE_HOOK_CALL(uipc_socket_50_getopt1_hook, (opt, so, sopt), enosys(), error); if (error) error = ENOPROTOOPT; break; } return error; } int sogetopt(struct socket *so, struct sockopt *sopt) { int error; solock(so); if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { error = ((*so->so_proto->pr_ctloutput) (PRCO_GETOPT, so, sopt)); } else error = (ENOPROTOOPT); } else { error = sogetopt1(so, sopt); } sounlock(so); return error; } /* * alloc sockopt data buffer buffer * - will be released at destroy */ static int sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag) { void *data; KASSERT(sopt->sopt_size == 0); if (len > sizeof(sopt->sopt_buf)) { data = kmem_zalloc(len, kmflag); if (data == NULL) return ENOMEM; sopt->sopt_data = data; } else sopt->sopt_data = sopt->sopt_buf; sopt->sopt_size = len; return 0; } /* * initialise sockopt storage * - MAY sleep during allocation */ void sockopt_init(struct sockopt *sopt, int level, int name, size_t size) { memset(sopt, 0, sizeof(*sopt)); sopt->sopt_level = level; sopt->sopt_name = name; (void)sockopt_alloc(sopt, size, KM_SLEEP); } /* * destroy sockopt storage * - will release any held memory references */ void sockopt_destroy(struct sockopt *sopt) { if (sopt->sopt_data != sopt->sopt_buf) kmem_free(sopt->sopt_data, sopt->sopt_size); memset(sopt, 0, sizeof(*sopt)); } /* * set sockopt value * - value is copied into sockopt * - memory is allocated when necessary, will not sleep */ int sockopt_set(struct sockopt *sopt, const void *buf, size_t len) { int error; if (sopt->sopt_size == 0) { error = sockopt_alloc(sopt, len, KM_NOSLEEP); if (error) return error; } sopt->sopt_retsize = MIN(sopt->sopt_size, len); if (sopt->sopt_retsize > 0) { memcpy(sopt->sopt_data, buf, sopt->sopt_retsize); } return 0; } /* * common case of set sockopt integer value */ int sockopt_setint(struct sockopt *sopt, int val) { return sockopt_set(sopt, &val, sizeof(int)); } /* * get sockopt value * - correct size must be given */ int sockopt_get(const struct sockopt *sopt, void *buf, size_t len) { if (sopt->sopt_size != len) return EINVAL; memcpy(buf, sopt->sopt_data, len); return 0; } /* * common case of get sockopt integer value */ int sockopt_getint(const struct sockopt *sopt, int *valp) { return sockopt_get(sopt, valp, sizeof(int)); } /* * set sockopt value from mbuf * - ONLY for legacy code * - mbuf is released by sockopt * - will not sleep */ int sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m) { size_t len; int error; len = m_length(m); if (sopt->sopt_size == 0) { error = sockopt_alloc(sopt, len, KM_NOSLEEP); if (error) return error; } sopt->sopt_retsize = MIN(sopt->sopt_size, len); m_copydata(m, 0, sopt->sopt_retsize, sopt->sopt_data); m_freem(m); return 0; } /* * get sockopt value into mbuf * - ONLY for legacy code * - mbuf to be released by the caller * - will not sleep */ struct mbuf * sockopt_getmbuf(const struct sockopt *sopt) { struct mbuf *m; if (sopt->sopt_size > MCLBYTES) return NULL; m = m_get(M_DONTWAIT, MT_SOOPTS); if (m == NULL) return NULL; if (sopt->sopt_size > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return NULL; } } memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size); m->m_len = sopt->sopt_size; 
return m; } void sohasoutofband(struct socket *so) { so->so_state |= SS_POLLRDBAND; fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so); selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT); } static void filt_sordetach(struct knote *kn) { struct socket *so; so = ((file_t *)kn->kn_obj)->f_socket; solock(so); if (selremove_knote(&so->so_rcv.sb_sel, kn)) so->so_rcv.sb_flags &= ~SB_KNOTE; sounlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; int rv; so = ((file_t *)kn->kn_obj)->f_socket; if (hint != NOTE_SUBMIT) solock(so); kn->kn_data = so->so_rcv.sb_cc; if (so->so_state & SS_CANTRCVMORE) { knote_set_eof(kn, 0); kn->kn_fflags = so->so_error; rv = 1; } else if (so->so_error || so->so_rerror) rv = 1; else if (kn->kn_sfflags & NOTE_LOWAT) rv = (kn->kn_data >= kn->kn_sdata); else rv = (kn->kn_data >= so->so_rcv.sb_lowat); if (hint != NOTE_SUBMIT) sounlock(so); return rv; } static void filt_sowdetach(struct knote *kn) { struct socket *so; so = ((file_t *)kn->kn_obj)->f_socket; solock(so); if (selremove_knote(&so->so_snd.sb_sel, kn)) so->so_snd.sb_flags &= ~SB_KNOTE; sounlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; int rv; so = ((file_t *)kn->kn_obj)->f_socket; if (hint != NOTE_SUBMIT) solock(so); kn->kn_data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { knote_set_eof(kn, 0); kn->kn_fflags = so->so_error; rv = 1; } else if (so->so_error) rv = 1; else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) rv = 0; else if (kn->kn_sfflags & NOTE_LOWAT) rv = (kn->kn_data >= kn->kn_sdata); else rv = (kn->kn_data >= so->so_snd.sb_lowat); if (hint != NOTE_SUBMIT) sounlock(so); return rv; } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; int rv; so = ((file_t *)kn->kn_obj)->f_socket; if (hint != NOTE_SUBMIT) solock(so); rv = (kn->kn_data = sbused(&so->so_snd)) == 0 || (so->so_options & SO_ACCEPTCONN) != 0; if (hint != NOTE_SUBMIT) sounlock(so); return rv; } /*ARGSUSED*/ static int filt_solisten(struct knote *kn, long hint) { struct socket *so; int rv; so = ((file_t *)kn->kn_obj)->f_socket; /* * Set kn_data to number of incoming connections, not * counting partial (incomplete) connections. 
*/ if (hint != NOTE_SUBMIT) solock(so); kn->kn_data = so->so_qlen; rv = (kn->kn_data > 0); if (hint != NOTE_SUBMIT) sounlock(so); return rv; } static const struct filterops solisten_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_sordetach, .f_event = filt_solisten, }; static const struct filterops soread_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_sordetach, .f_event = filt_soread, }; static const struct filterops sowrite_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static const struct filterops soempty_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so; struct sockbuf *sb; so = ((file_t *)kn->kn_obj)->f_socket; solock(so); switch (kn->kn_filter) { case EVFILT_READ: if (so->so_options & SO_ACCEPTCONN) kn->kn_fop = &solisten_filtops; else kn->kn_fop = &soread_filtops; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; sb = &so->so_snd; #ifdef PIPE_SOCKETPAIR if (so->so_state & SS_ISAPIPE) { /* Other end of pipe has been closed. */ if (so->so_state & SS_ISDISCONNECTED) { sounlock(so); return EBADF; } } #endif break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; sb = &so->so_snd; break; default: sounlock(so); return EINVAL; } selrecord_knote(&sb->sb_sel, kn); sb->sb_flags |= SB_KNOTE; sounlock(so); return 0; } static int sodopoll(struct socket *so, int events) { int revents; revents = 0; if (events & (POLLIN | POLLRDNORM)) if (soreadable(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowritable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_state & SS_POLLRDBAND) revents |= events & (POLLPRI | POLLRDBAND); return revents; } int sopoll(struct socket *so, int events) { int revents = 0; #ifndef DIAGNOSTIC /* * Do a quick, unlocked check in expectation that the socket * will be ready for I/O. Don't do this check if DIAGNOSTIC, * as the solocked() assertions will fail. */ if ((revents = sodopoll(so, events)) != 0) return revents; #endif solock(so); if ((revents = sodopoll(so, events)) == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { selrecord(curlwp, &so->so_rcv.sb_sel); so->so_rcv.sb_flags |= SB_NOTIFY; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(curlwp, &so->so_snd.sb_sel); so->so_snd.sb_flags |= SB_NOTIFY; } } sounlock(so); return revents; } struct mbuf ** sbsavetimestamp(int opt, struct mbuf **mp) { struct timeval tv; int error; memset(&tv, 0, sizeof(tv)); microtime(&tv); MODULE_HOOK_CALL(uipc_socket_50_sbts_hook, (opt, &mp), enosys(), error); if (error == 0) return mp; if (opt & SO_TIMESTAMP) { *mp = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } return mp; } #include <sys/sysctl.h> static int sysctl_kern_somaxkva(SYSCTLFN_PROTO); static int sysctl_kern_sbmax(SYSCTLFN_PROTO); /* * sysctl helper routine for kern.somaxkva. ensures that the given * value is not too small. * (XXX should we maybe make sure it's not too large as well?) 
*/ static int sysctl_kern_somaxkva(SYSCTLFN_ARGS) { int error, new_somaxkva; struct sysctlnode node; new_somaxkva = somaxkva; node = *rnode; node.sysctl_data = &new_somaxkva; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */ return EINVAL; mutex_enter(&so_pendfree_lock); somaxkva = new_somaxkva; cv_broadcast(&socurkva_cv); mutex_exit(&so_pendfree_lock); return error; } /* * sysctl helper routine for kern.sbmax. Basically just ensures that * any new value is not too small. */ static int sysctl_kern_sbmax(SYSCTLFN_ARGS) { int error, new_sbmax; struct sysctlnode node; new_sbmax = sb_max; node = *rnode; node.sysctl_data = &new_sbmax; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; KERNEL_LOCK(1, NULL); error = sb_max_set(new_sbmax); KERNEL_UNLOCK_ONE(NULL); return error; } /* * sysctl helper routine for kern.sooptions. Ensures that only allowed * options can be set. */ static int sysctl_kern_sooptions(SYSCTLFN_ARGS) { int error, new_options; struct sysctlnode node; new_options = sooptions; node = *rnode; node.sysctl_data = &new_options; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (new_options & ~SO_DEFOPTS) return EINVAL; sooptions = new_options; return 0; } static void sysctl_kern_socket_setup(void) { KASSERT(socket_sysctllog == NULL); sysctl_createv(&socket_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "somaxkva", SYSCTL_DESCR("Maximum amount of kernel memory to be " "used for socket buffers"), sysctl_kern_somaxkva, 0, NULL, 0, CTL_KERN, KERN_SOMAXKVA, CTL_EOL); sysctl_createv(&socket_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "sofixedbuf", SYSCTL_DESCR("Prevent scaling of fixed socket buffers"), NULL, 0, &sofixedbuf, 0, CTL_KERN, KERN_SOFIXEDBUF, CTL_EOL); sysctl_createv(&socket_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "sbmax", SYSCTL_DESCR("Maximum socket buffer size"), sysctl_kern_sbmax, 0, NULL, 0, CTL_KERN, KERN_SBMAX, CTL_EOL); sysctl_createv(&socket_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "sooptions", SYSCTL_DESCR("Default socket options"), sysctl_kern_sooptions, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); }
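The following is a small user-space sketch, not part of the kernel source above, showing how the SOL_SOCKET options handled by sosetopt1()/sogetopt1() are reached through the standard setsockopt(2)/getsockopt(2) calls; the socket type, buffer size and timeout values are arbitrary illustrations.

/*
 * User-space sketch (assumed example, not kernel code): exercises a few of
 * the SOL_SOCKET options that sosetopt1()/sogetopt1() above implement.
 */
#include <sys/socket.h>
#include <sys/time.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int s, rcvbuf;
	socklen_t len;
	struct linger l;
	struct timeval tv;

	s = socket(AF_INET, SOCK_STREAM, 0);
	if (s == -1) {
		perror("socket");
		return 1;
	}

	/* SO_LINGER: sosetopt1() copies in a struct linger (EDOM if out of range). */
	memset(&l, 0, sizeof(l));
	l.l_onoff = 1;
	l.l_linger = 5;			/* seconds, arbitrary */
	if (setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) == -1)
		perror("SO_LINGER");

	/* SO_RCVBUF: values < 1 are rejected with EINVAL by sosetopt1(). */
	rcvbuf = 64 * 1024;
	if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) == -1)
		perror("SO_RCVBUF");

	/* SO_RCVTIMEO: converted to ticks in the kernel; EDOM if negative or too large. */
	tv.tv_sec = 2;
	tv.tv_usec = 0;
	if (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1)
		perror("SO_RCVTIMEO");

	/* Read back the (possibly adjusted) receive buffer size via sogetopt1(). */
	len = sizeof(rcvbuf);
	if (getsockopt(s, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &len) == 0)
		printf("SO_RCVBUF is now %d bytes\n", rcvbuf);

	close(s);
	return 0;
}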
/* $NetBSD: ptyfs_subr.c,v 1.34 2020/11/27 14:43:57 christos Exp $ */ /* * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ptyfs_subr.c 8.6 (Berkeley) 5/14/95 */ /* * Copyright (c) 1994 Christopher G. Demetriou. All rights reserved. * Copyright (c) 1993 Jan-Simon Pendry * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ptyfs_subr.c,v 1.34 2020/11/27 14:43:57 christos Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/stat.h> #include <sys/malloc.h> #include <sys/file.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/select.h> #include <sys/tty.h> #include <sys/pty.h> #include <sys/kauth.h> #include <sys/lwp.h> #include <fs/ptyfs/ptyfs.h> static kmutex_t ptyfs_hashlock; static SLIST_HEAD(ptyfs_hashhead, ptyfsnode) *ptyfs_node_tbl; static u_long ptyfs_node_mask; /* size of hash table - 1 */ /* * allocate a ptyfsnode/vnode pair. the vnode is referenced. * * the pty, ptyfs_type, and mount point uniquely * identify a ptyfsnode. the mount point is needed * because someone might mount this filesystem * twice. */ int ptyfs_allocvp(struct mount *mp, struct vnode **vpp, ptyfstype type, int pty) { struct ptyfskey key; memset(&key, 0, sizeof(key)); key.ptk_pty = pty; key.ptk_type = type; return vcache_get(mp, &key, sizeof(key), vpp); } /* * Initialize ptyfsnode hash table. */ void ptyfs_hashinit(void) { ptyfs_node_tbl = hashinit(16, HASH_SLIST, true, &ptyfs_node_mask); mutex_init(&ptyfs_hashlock, MUTEX_DEFAULT, IPL_NONE); } /* * Free ptyfsnode hash table. */ void ptyfs_hashdone(void) { mutex_destroy(&ptyfs_hashlock); hashdone(ptyfs_node_tbl, HASH_SLIST, ptyfs_node_mask); } /* * Get a ptyfsnode from the hash table, or allocate one. 
*/ struct ptyfsnode * ptyfs_get_node(ptyfstype type, int pty) { struct ptyfs_hashhead *ppp; struct ptyfsnode *pp; ppp = &ptyfs_node_tbl[PTYFS_FILENO(type, pty) & ptyfs_node_mask]; mutex_enter(&ptyfs_hashlock); SLIST_FOREACH(pp, ppp, ptyfs_hash) { if (pty == pp->ptyfs_pty && pp->ptyfs_type == type) { mutex_exit(&ptyfs_hashlock); return pp; } } mutex_exit(&ptyfs_hashlock); pp = malloc(sizeof(struct ptyfsnode), M_TEMP, M_WAITOK); pp->ptyfs_pty = pty; pp->ptyfs_type = type; pp->ptyfs_fileno = PTYFS_FILENO(type, pty); if (pp->ptyfs_type == PTYFSroot) pp->ptyfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP| S_IROTH|S_IXOTH; else pp->ptyfs_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP| S_IROTH|S_IWOTH; pp->ptyfs_uid = pp->ptyfs_gid = 0; pp->ptyfs_status = PTYFS_CHANGE; PTYFS_ITIMES(pp, NULL, NULL, NULL); pp->ptyfs_birthtime = pp->ptyfs_mtime = pp->ptyfs_atime = pp->ptyfs_ctime; pp->ptyfs_flags = 0; mutex_enter(&ptyfs_hashlock); /* * XXX We have minimum race condition when opening master side * first time, if other threads through other mount points, trying * opening the same device. As follow we have little chance have * unused list entries. */ SLIST_INSERT_HEAD(ppp, pp, ptyfs_hash); mutex_exit(&ptyfs_hashlock); return pp; } /* * Mark this controlling pty as active. */ void ptyfs_set_active(struct mount *mp, int pty) { struct ptyfsmount *pmnt = VFSTOPTY(mp); KASSERT(pty >= 0); /* Reallocate map if needed. */ if (pty >= pmnt->pmnt_bitmap_size * NBBY) { int osize, nsize; uint8_t *obitmap, *nbitmap; nsize = roundup(howmany(pty + 1, NBBY), 64); nbitmap = kmem_alloc(nsize, KM_SLEEP); mutex_enter(&pmnt->pmnt_lock); if (pty < pmnt->pmnt_bitmap_size * NBBY) { mutex_exit(&pmnt->pmnt_lock); kmem_free(nbitmap, nsize); } else { osize = pmnt->pmnt_bitmap_size; obitmap = pmnt->pmnt_bitmap; pmnt->pmnt_bitmap_size = nsize; pmnt->pmnt_bitmap = nbitmap; if (osize > 0) memcpy(pmnt->pmnt_bitmap, obitmap, osize); memset(pmnt->pmnt_bitmap + osize, 0, nsize - osize); mutex_exit(&pmnt->pmnt_lock); if (osize > 0) kmem_free(obitmap, osize); } } mutex_enter(&pmnt->pmnt_lock); setbit(pmnt->pmnt_bitmap, pty); mutex_exit(&pmnt->pmnt_lock); } /* * Mark this controlling pty as inactive. */ void ptyfs_clr_active(struct mount *mp, int pty) { struct ptyfsmount *pmnt = VFSTOPTY(mp); KASSERT(pty >= 0); mutex_enter(&pmnt->pmnt_lock); if (pty >= 0 && pty < pmnt->pmnt_bitmap_size * NBBY) clrbit(pmnt->pmnt_bitmap, pty); mutex_exit(&pmnt->pmnt_lock); } /* * Lookup the next active controlling pty greater or equal "pty". * Return -1 if not found. */ int ptyfs_next_active(struct mount *mp, int pty) { struct ptyfsmount *pmnt = VFSTOPTY(mp); KASSERT(pty >= 0); mutex_enter(&pmnt->pmnt_lock); while (pty < pmnt->pmnt_bitmap_size * NBBY) { if (isset(pmnt->pmnt_bitmap, pty)) { mutex_exit(&pmnt->pmnt_lock); return pty; } pty++; } mutex_exit(&pmnt->pmnt_lock); return -1; }
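As a user-space illustration (not part of ptyfs), the sketch below reimplements the bitmap technique that ptyfs_set_active(), ptyfs_clr_active() and ptyfs_next_active() use to track active pty numbers; the setbit/clrbit/isset macros are written out locally so the example stands alone, and next_active() is a hypothetical helper mirroring ptyfs_next_active() without the mount lock.

/*
 * Stand-alone sketch of the active-pty bitmap (assumed example).
 */
#include <stdio.h>
#include <string.h>

#define NBBY		8				/* bits per byte */
#define setbit(a, i)	((a)[(i) / NBBY] |= 1 << ((i) % NBBY))
#define clrbit(a, i)	((a)[(i) / NBBY] &= ~(1 << ((i) % NBBY)))
#define isset(a, i)	((a)[(i) / NBBY] & (1 << ((i) % NBBY)))

/* Find the first active pty >= pty, or -1 if none (cf. ptyfs_next_active()). */
static int
next_active(const unsigned char *bitmap, int nbits, int pty)
{
	while (pty < nbits) {
		if (isset(bitmap, pty))
			return pty;
		pty++;
	}
	return -1;
}

int
main(void)
{
	unsigned char bitmap[64 / NBBY];	/* room for ptys 0..63 */
	int pty;

	memset(bitmap, 0, sizeof(bitmap));
	setbit(bitmap, 3);			/* cf. ptyfs_set_active() */
	setbit(bitmap, 17);
	setbit(bitmap, 42);
	clrbit(bitmap, 17);			/* cf. ptyfs_clr_active() */

	for (pty = 0; (pty = next_active(bitmap, 64, pty)) != -1; pty++)
		printf("pty %d is active\n", pty);	/* prints 3 and 42 */

	return 0;
}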
/* $NetBSD: cpufunc.h,v 1.42 2020/10/24 07:14:29 mgorny Exp $ */ /* * Copyright (c) 1998, 2007, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _X86_CPUFUNC_H_ #define _X86_CPUFUNC_H_ /* * Functions to provide access to x86-specific instructions. */ #include <sys/cdefs.h> #include <sys/types.h> #include <machine/segments.h> #include <machine/specialreg.h> #ifdef _KERNEL #if defined(_KERNEL_OPT) #include "opt_xen.h" #endif static inline void x86_pause(void) { __asm volatile ("pause"); } void x86_lfence(void); void x86_sfence(void); void x86_mfence(void); void x86_flush(void); void x86_hlt(void); void x86_stihlt(void); void tlbflush(void); void tlbflushg(void); void invlpg(vaddr_t); void wbinvd(void); void breakpoint(void); #define INVPCID_ADDRESS 0 #define INVPCID_CONTEXT 1 #define INVPCID_ALL 2 #define INVPCID_ALL_NONGLOBAL 3 static inline void invpcid(register_t op, uint64_t pcid, vaddr_t va) { struct { uint64_t pcid; uint64_t addr; } desc = { .pcid = pcid, .addr = va }; __asm volatile ( "invpcid %[desc],%[op]" : : [desc] "m" (desc), [op] "r" (op) : "memory" ); } extern uint64_t (*rdtsc)(void); #define _SERIALIZE_lfence __asm volatile ("lfence") #define _SERIALIZE_mfence __asm volatile ("mfence") #define _SERIALIZE_cpuid __asm volatile ("xor %%eax, %%eax;cpuid" ::: \ "eax", "ebx", "ecx", "edx"); #define RDTSCFUNC(fence) \ static inline uint64_t \ rdtsc_##fence(void) \ { \ uint32_t low, high; \ \ _SERIALIZE_##fence; \ __asm volatile ( \ "rdtsc" \ : "=a" (low), "=d" (high) \ : \ ); \ \ return (low | ((uint64_t)high << 32)); \ } RDTSCFUNC(lfence) RDTSCFUNC(mfence) RDTSCFUNC(cpuid) #undef _SERIALIZE_LFENCE #undef _SERIALIZE_MFENCE #undef _SERIALIZE_CPUID #ifndef XENPV struct x86_hotpatch_source { uint8_t *saddr; uint8_t *eaddr; }; struct x86_hotpatch_descriptor { uint8_t name; uint8_t nsrc; const struct x86_hotpatch_source *srcs[]; }; void x86_hotpatch(uint8_t, uint8_t); void x86_patch(bool); #endif void x86_monitor(const void *, uint32_t, uint32_t); void x86_mwait(uint32_t, uint32_t); static inline void x86_cpuid2(uint32_t eax, uint32_t ecx, uint32_t *regs) { uint32_t ebx, edx; __asm volatile ( "cpuid" : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a" (eax), "c" (ecx) ); regs[0] = eax; regs[1] = ebx; regs[2] = ecx; regs[3] = edx; } #define x86_cpuid(a,b) x86_cpuid2((a), 0, (b)) /* -------------------------------------------------------------------------- */ void lidt(struct region_descriptor *); void lldt(u_short); void ltr(u_short); static inline uint16_t x86_getss(void) { uint16_t val; __asm volatile ( "mov %%ss,%[val]" : [val] "=r" (val) : ); return val; } static inline void setds(uint16_t val) { __asm volatile ( "mov %[val],%%ds" : : [val] "r" (val) ); } static inline void setes(uint16_t val) { __asm volatile ( "mov %[val],%%es" : : [val] "r" (val) ); } static inline void setfs(uint16_t val) { __asm volatile ( "mov %[val],%%fs" : : [val] "r" (val) ); } void setusergs(int); /* 
-------------------------------------------------------------------------- */ #define FUNC_CR(crnum) \ static inline void lcr##crnum(register_t val) \ { \ __asm volatile ( \ "mov %[val],%%cr" #crnum \ : \ : [val] "r" (val) \ : "memory" \ ); \ } \ static inline register_t rcr##crnum(void) \ { \ register_t val; \ __asm volatile ( \ "mov %%cr" #crnum ",%[val]" \ : [val] "=r" (val) \ : \ ); \ return val; \ } #define PROTO_CR(crnum) \ void lcr##crnum(register_t); \ register_t rcr##crnum(void); #ifndef XENPV FUNC_CR(0) FUNC_CR(2) FUNC_CR(3) #else PROTO_CR(0) PROTO_CR(2) PROTO_CR(3) #endif FUNC_CR(4) FUNC_CR(8) /* -------------------------------------------------------------------------- */ #define FUNC_DR(drnum) \ static inline void ldr##drnum(register_t val) \ { \ __asm volatile ( \ "mov %[val],%%dr" #drnum \ : \ : [val] "r" (val) \ ); \ } \ static inline register_t rdr##drnum(void) \ { \ register_t val; \ __asm volatile ( \ "mov %%dr" #drnum ",%[val]" \ : [val] "=r" (val) \ : \ ); \ return val; \ } #define PROTO_DR(drnum) \ register_t rdr##drnum(void); \ void ldr##drnum(register_t); #ifndef XENPV FUNC_DR(0) FUNC_DR(1) FUNC_DR(2) FUNC_DR(3) FUNC_DR(6) FUNC_DR(7) #else PROTO_DR(0) PROTO_DR(1) PROTO_DR(2) PROTO_DR(3) PROTO_DR(6) PROTO_DR(7) #endif /* -------------------------------------------------------------------------- */ union savefpu; static inline void fninit(void) { __asm volatile ("fninit" ::: "memory"); } static inline void fnclex(void) { __asm volatile ("fnclex"); } static inline void fnstcw(uint16_t *val) { __asm volatile ( "fnstcw %[val]" : [val] "=m" (*val) : ); } static inline void fnstsw(uint16_t *val) { __asm volatile ( "fnstsw %[val]" : [val] "=m" (*val) : ); } static inline void clts(void) { __asm volatile ("clts" ::: "memory"); } void stts(void); static inline void x86_stmxcsr(uint32_t *val) { __asm volatile ( "stmxcsr %[val]" : [val] "=m" (*val) : ); } static inline void x86_ldmxcsr(uint32_t *val) { __asm volatile ( "ldmxcsr %[val]" : : [val] "m" (*val) ); } void fldummy(void); static inline uint64_t rdxcr(uint32_t xcr) { uint32_t low, high; __asm volatile ( "xgetbv" : "=a" (low), "=d" (high) : "c" (xcr) ); return (low | ((uint64_t)high << 32)); } static inline void wrxcr(uint32_t xcr, uint64_t val) { uint32_t low, high; low = val; high = val >> 32; __asm volatile ( "xsetbv" : : "a" (low), "d" (high), "c" (xcr) ); } static inline void fnsave(void *addr) { uint8_t *area = addr; __asm volatile ( "fnsave %[area]" : [area] "=m" (*area) : : "memory" ); } static inline void frstor(const void *addr) { const uint8_t *area = addr; __asm volatile ( "frstor %[area]" : : [area] "m" (*area) : "memory" ); } static inline void fxsave(void *addr) { uint8_t *area = addr; __asm volatile ( "fxsave %[area]" : [area] "=m" (*area) : : "memory" ); } static inline void fxrstor(const void *addr) { const uint8_t *area = addr; __asm volatile ( "fxrstor %[area]" : : [area] "m" (*area) : "memory" ); } static inline void xsave(void *addr, uint64_t mask) { uint8_t *area = addr; uint32_t low, high; low = mask; high = mask >> 32; __asm volatile ( "xsave %[area]" : [area] "=m" (*area) : "a" (low), "d" (high) : "memory" ); } static inline void xsaveopt(void *addr, uint64_t mask) { uint8_t *area = addr; uint32_t low, high; low = mask; high = mask >> 32; __asm volatile ( "xsaveopt %[area]" : [area] "=m" (*area) : "a" (low), "d" (high) : "memory" ); } static inline void xrstor(const void *addr, uint64_t mask) { const uint8_t *area = addr; uint32_t low, high; low = mask; high = mask >> 32; __asm volatile ( 
"xrstor %[area]" : : [area] "m" (*area), "a" (low), "d" (high) : "memory" ); } #ifdef __x86_64__ static inline void fxsave64(void *addr) { uint8_t *area = addr; __asm volatile ( "fxsave64 %[area]" : [area] "=m" (*area) : : "memory" ); } static inline void fxrstor64(const void *addr) { const uint8_t *area = addr; __asm volatile ( "fxrstor64 %[area]" : : [area] "m" (*area) : "memory" ); } static inline void xsave64(void *addr, uint64_t mask) { uint8_t *area = addr; uint32_t low, high; low = mask; high = mask >> 32; __asm volatile ( "xsave64 %[area]" : [area] "=m" (*area) : "a" (low), "d" (high) : "memory" ); } static inline void xsaveopt64(void *addr, uint64_t mask) { uint8_t *area = addr; uint32_t low, high; low = mask; high = mask >> 32; __asm volatile ( "xsaveopt64 %[area]" : [area] "=m" (*area) : "a" (low), "d" (high) : "memory" ); } static inline void xrstor64(const void *addr, uint64_t mask) { const uint8_t *area = addr; uint32_t low, high; low = mask; high = mask >> 32; __asm volatile ( "xrstor64 %[area]" : : [area] "m" (*area), "a" (low), "d" (high) : "memory" ); } #endif /* -------------------------------------------------------------------------- */ #ifdef XENPV void x86_disable_intr(void); void x86_enable_intr(void); #else static inline void x86_disable_intr(void) { __asm volatile ("cli" ::: "memory"); } static inline void x86_enable_intr(void) { __asm volatile ("sti" ::: "memory"); } #endif /* XENPV */ /* Use read_psl, write_psl when saving and restoring interrupt state. */ u_long x86_read_psl(void); void x86_write_psl(u_long); /* Use read_flags, write_flags to adjust other members of %eflags. */ u_long x86_read_flags(void); void x86_write_flags(u_long); void x86_reset(void); /* -------------------------------------------------------------------------- */ /* * Some of the undocumented AMD64 MSRs need a 'passcode' to access. * See LinuxBIOSv2: src/cpu/amd/model_fxx/model_fxx_init.c */ #define OPTERON_MSR_PASSCODE 0x9c5a203aU static inline uint64_t rdmsr(u_int msr) { uint32_t low, high; __asm volatile ( "rdmsr" : "=a" (low), "=d" (high) : "c" (msr) ); return (low | ((uint64_t)high << 32)); } static inline uint64_t rdmsr_locked(u_int msr) { uint32_t low, high, pass = OPTERON_MSR_PASSCODE; __asm volatile ( "rdmsr" : "=a" (low), "=d" (high) : "c" (msr), "D" (pass) ); return (low | ((uint64_t)high << 32)); } int rdmsr_safe(u_int, uint64_t *); static inline void wrmsr(u_int msr, uint64_t val) { uint32_t low, high; low = val; high = val >> 32; __asm volatile ( "wrmsr" : : "a" (low), "d" (high), "c" (msr) : "memory" ); } static inline void wrmsr_locked(u_int msr, uint64_t val) { uint32_t low, high, pass = OPTERON_MSR_PASSCODE; low = val; high = val >> 32; __asm volatile ( "wrmsr" : : "a" (low), "d" (high), "c" (msr), "D" (pass) : "memory" ); } #endif /* _KERNEL */ #endif /* !_X86_CPUFUNC_H_ */
/* $NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $ */ /*- * Copyright (c) 2001 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Matt Thomas <matt@3am-software.com>. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #if HAVE_NBTOOL_CONFIG_H #include "nbtool_config.h" #endif #if !defined(_KERNEL) && !defined(_STANDALONE) #include <sys/types.h> #include <stddef.h> #include <assert.h> #include <stdbool.h> #ifdef RBDEBUG #define KASSERT(s) assert(s) #define __rbt_unused #else #define KASSERT(s) do { } while (/*CONSTCOND*/ 0) #define __rbt_unused __unused #endif __RCSID("$NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $"); #else #include <lib/libkern/libkern.h> __KERNEL_RCSID(0, "$NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $"); #ifndef DIAGNOSTIC #define __rbt_unused __unused #else #define __rbt_unused #endif #endif #ifdef _LIBC __weak_alias(rb_tree_init, _rb_tree_init) __weak_alias(rb_tree_find_node, _rb_tree_find_node) __weak_alias(rb_tree_find_node_geq, _rb_tree_find_node_geq) __weak_alias(rb_tree_find_node_leq, _rb_tree_find_node_leq) __weak_alias(rb_tree_insert_node, _rb_tree_insert_node) __weak_alias(rb_tree_remove_node, _rb_tree_remove_node) __weak_alias(rb_tree_iterate, _rb_tree_iterate) #ifdef RBDEBUG __weak_alias(rb_tree_check, _rb_tree_check) __weak_alias(rb_tree_depths, _rb_tree_depths) #endif #include "namespace.h" #endif #ifdef RBTEST #include "rbtree.h" #else #include <sys/rbtree.h> #endif static void rb_tree_insert_rebalance(struct rb_tree *, struct rb_node *); static void rb_tree_removal_rebalance(struct rb_tree *, struct rb_node *, unsigned int); #ifdef RBDEBUG static const struct rb_node *rb_tree_iterate_const(const struct rb_tree *, const struct rb_node *, const unsigned int); static bool rb_tree_check_node(const struct rb_tree *, const struct rb_node *, const struct rb_node *, bool); #else #define rb_tree_check_node(a, b, c, d) true #endif #define RB_NODETOITEM(rbto, rbn) \ ((void *)((uintptr_t)(rbn) - (rbto)->rbto_node_offset)) #define RB_ITEMTONODE(rbto, rbn) \ ((rb_node_t *)((uintptr_t)(rbn) + (rbto)->rbto_node_offset)) #define RB_SENTINEL_NODE NULL void rb_tree_init(struct rb_tree *rbt, const rb_tree_ops_t *ops) { rbt->rbt_ops = ops; rbt->rbt_root = RB_SENTINEL_NODE; RB_TAILQ_INIT(&rbt->rbt_nodes); #ifndef RBSMALL rbt->rbt_minmax[RB_DIR_LEFT] = rbt->rbt_root; /* minimum node */ rbt->rbt_minmax[RB_DIR_RIGHT] = rbt->rbt_root; /* maximum node */ #endif #ifdef RBSTATS rbt->rbt_count = 0; rbt->rbt_insertions = 0; rbt->rbt_removals = 0; rbt->rbt_insertion_rebalance_calls = 0; rbt->rbt_insertion_rebalance_passes = 0; rbt->rbt_removal_rebalance_calls = 0; rbt->rbt_removal_rebalance_passes = 0; #endif } void * rb_tree_find_node(struct rb_tree *rbt, const void *key) { const rb_tree_ops_t *rbto = rbt->rbt_ops; rbto_compare_key_fn compare_key = rbto->rbto_compare_key; struct rb_node *parent = rbt->rbt_root; while (!RB_SENTINEL_P(parent)) { void *pobj = RB_NODETOITEM(rbto, parent); const signed int diff = (*compare_key)(rbto->rbto_context, pobj, key); if (diff == 0) return 
pobj; parent = parent->rb_nodes[diff < 0]; } return NULL; } void * rb_tree_find_node_geq(struct rb_tree *rbt, const void *key) { const rb_tree_ops_t *rbto = rbt->rbt_ops; rbto_compare_key_fn compare_key = rbto->rbto_compare_key; struct rb_node *parent = rbt->rbt_root, *last = NULL; while (!RB_SENTINEL_P(parent)) { void *pobj = RB_NODETOITEM(rbto, parent); const signed int diff = (*compare_key)(rbto->rbto_context, pobj, key); if (diff == 0) return pobj; if (diff > 0) last = parent; parent = parent->rb_nodes[diff < 0]; } return last == NULL ? NULL : RB_NODETOITEM(rbto, last); } void * rb_tree_find_node_leq(struct rb_tree *rbt, const void *key) { const rb_tree_ops_t *rbto = rbt->rbt_ops; rbto_compare_key_fn compare_key = rbto->rbto_compare_key; struct rb_node *parent = rbt->rbt_root, *last = NULL; while (!RB_SENTINEL_P(parent)) { void *pobj = RB_NODETOITEM(rbto, parent); const signed int diff = (*compare_key)(rbto->rbto_context, pobj, key); if (diff == 0) return pobj; if (diff < 0) last = parent; parent = parent->rb_nodes[diff < 0]; } return last == NULL ? NULL : RB_NODETOITEM(rbto, last); } void * rb_tree_insert_node(struct rb_tree *rbt, void *object) { const rb_tree_ops_t *rbto = rbt->rbt_ops; rbto_compare_nodes_fn compare_nodes = rbto->rbto_compare_nodes; struct rb_node *parent, *tmp, *self = RB_ITEMTONODE(rbto, object); unsigned int position; bool rebalance; RBSTAT_INC(rbt->rbt_insertions); tmp = rbt->rbt_root; /* * This is a hack. Because rbt->rbt_root is just a struct rb_node *, * just like rb_node->rb_nodes[RB_DIR_LEFT], we can use this fact to * avoid a lot of tests for root and know that even at root, * updating RB_FATHER(rb_node)->rb_nodes[RB_POSITION(rb_node)] will * update rbt->rbt_root. */ parent = (struct rb_node *)(void *)&rbt->rbt_root; position = RB_DIR_LEFT; /* * Find out where to place this new leaf. */ while (!RB_SENTINEL_P(tmp)) { void *tobj = RB_NODETOITEM(rbto, tmp); const signed int diff = (*compare_nodes)(rbto->rbto_context, tobj, object); if (__predict_false(diff == 0)) { /* * Node already exists; return it. */ return tobj; } parent = tmp; position = (diff < 0); tmp = parent->rb_nodes[position]; } #ifdef RBDEBUG { struct rb_node *prev = NULL, *next = NULL; if (position == RB_DIR_RIGHT) prev = parent; else if (tmp != rbt->rbt_root) next = parent; /* * Verify our sequential position */ KASSERT(prev == NULL || !RB_SENTINEL_P(prev)); KASSERT(next == NULL || !RB_SENTINEL_P(next)); if (prev != NULL && next == NULL) next = TAILQ_NEXT(prev, rb_link); if (prev == NULL && next != NULL) prev = TAILQ_PREV(next, rb_node_qh, rb_link); KASSERT(prev == NULL || !RB_SENTINEL_P(prev)); KASSERT(next == NULL || !RB_SENTINEL_P(next)); KASSERT(prev == NULL || (*compare_nodes)(rbto->rbto_context, RB_NODETOITEM(rbto, prev), RB_NODETOITEM(rbto, self)) < 0); KASSERT(next == NULL || (*compare_nodes)(rbto->rbto_context, RB_NODETOITEM(rbto, self), RB_NODETOITEM(rbto, next)) < 0); } #endif /* * Initialize the node and insert as a leaf into the tree. */ RB_SET_FATHER(self, parent); RB_SET_POSITION(self, position); if (__predict_false(parent == (struct rb_node *)(void *)&rbt->rbt_root)) { RB_MARK_BLACK(self); /* root is always black */ #ifndef RBSMALL rbt->rbt_minmax[RB_DIR_LEFT] = self; rbt->rbt_minmax[RB_DIR_RIGHT] = self; #endif rebalance = false; } else { KASSERT(position == RB_DIR_LEFT || position == RB_DIR_RIGHT); #ifndef RBSMALL /* * Keep track of the minimum and maximum nodes. If our * parent is a minmax node and we on their min/max side, * we must be the new min/max node. 
*/ if (parent == rbt->rbt_minmax[position]) rbt->rbt_minmax[position] = self; #endif /* !RBSMALL */ /* * All new nodes are colored red. We only need to rebalance * if our parent is also red. */ RB_MARK_RED(self); rebalance = RB_RED_P(parent); } KASSERT(RB_SENTINEL_P(parent->rb_nodes[position])); self->rb_left = parent->rb_nodes[position]; self->rb_right = parent->rb_nodes[position]; parent->rb_nodes[position] = self; KASSERT(RB_CHILDLESS_P(self)); /* * Insert the new node into a sorted list for easy sequential access */ RBSTAT_INC(rbt->rbt_count); #ifdef RBDEBUG if (RB_ROOT_P(rbt, self)) { RB_TAILQ_INSERT_HEAD(&rbt->rbt_nodes, self, rb_link); } else if (position == RB_DIR_LEFT) { KASSERT((*compare_nodes)(rbto->rbto_context, RB_NODETOITEM(rbto, self), RB_NODETOITEM(rbto, RB_FATHER(self))) < 0); RB_TAILQ_INSERT_BEFORE(RB_FATHER(self), self, rb_link); } else { KASSERT((*compare_nodes)(rbto->rbto_context, RB_NODETOITEM(rbto, RB_FATHER(self)), RB_NODETOITEM(rbto, self)) < 0); RB_TAILQ_INSERT_AFTER(&rbt->rbt_nodes, RB_FATHER(self), self, rb_link); } #endif KASSERT(rb_tree_check_node(rbt, self, NULL, !rebalance)); /* * Rebalance tree after insertion */ if (rebalance) { rb_tree_insert_rebalance(rbt, self); KASSERT(rb_tree_check_node(rbt, self, NULL, true)); } /* Successfully inserted, return our node pointer. */ return object; } /* * Swap the location and colors of 'self' and its child @ which. The child * can not be a sentinel node. This is our rotation function. However, * since it preserves coloring, it great simplifies both insertion and * removal since rotation almost always involves the exchanging of colors * as a separate step. */ static void rb_tree_reparent_nodes(__rbt_unused struct rb_tree *rbt, struct rb_node *old_father, const unsigned int which) { const unsigned int other = which ^ RB_DIR_OTHER; struct rb_node * const grandpa = RB_FATHER(old_father); struct rb_node * const old_child = old_father->rb_nodes[which]; struct rb_node * const new_father = old_child; struct rb_node * const new_child = old_father; KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT); KASSERT(!RB_SENTINEL_P(old_child)); KASSERT(RB_FATHER(old_child) == old_father); KASSERT(rb_tree_check_node(rbt, old_father, NULL, false)); KASSERT(rb_tree_check_node(rbt, old_child, NULL, false)); KASSERT(RB_ROOT_P(rbt, old_father) || rb_tree_check_node(rbt, grandpa, NULL, false)); /* * Exchange descendant linkages. */ grandpa->rb_nodes[RB_POSITION(old_father)] = new_father; new_child->rb_nodes[which] = old_child->rb_nodes[other]; new_father->rb_nodes[other] = new_child; /* * Update ancestor linkages */ RB_SET_FATHER(new_father, grandpa); RB_SET_FATHER(new_child, new_father); /* * Exchange properties between new_father and new_child. The only * change is that new_child's position is now on the other side. */ #if 0 { struct rb_node tmp; tmp.rb_info = 0; RB_COPY_PROPERTIES(&tmp, old_child); RB_COPY_PROPERTIES(new_father, old_father); RB_COPY_PROPERTIES(new_child, &tmp); } #else RB_SWAP_PROPERTIES(new_father, new_child); #endif RB_SET_POSITION(new_child, other); /* * Make sure to reparent the new child to ourself. 
*/ if (!RB_SENTINEL_P(new_child->rb_nodes[which])) { RB_SET_FATHER(new_child->rb_nodes[which], new_child); RB_SET_POSITION(new_child->rb_nodes[which], which); } KASSERT(rb_tree_check_node(rbt, new_father, NULL, false)); KASSERT(rb_tree_check_node(rbt, new_child, NULL, false)); KASSERT(RB_ROOT_P(rbt, new_father) || rb_tree_check_node(rbt, grandpa, NULL, false)); } static void rb_tree_insert_rebalance(struct rb_tree *rbt, struct rb_node *self) { struct rb_node * father = RB_FATHER(self); struct rb_node * grandpa = RB_FATHER(father); struct rb_node * uncle; unsigned int which; unsigned int other; KASSERT(!RB_ROOT_P(rbt, self)); KASSERT(RB_RED_P(self)); KASSERT(RB_RED_P(father)); RBSTAT_INC(rbt->rbt_insertion_rebalance_calls); for (;;) { KASSERT(!RB_SENTINEL_P(self)); KASSERT(RB_RED_P(self)); KASSERT(RB_RED_P(father)); /* * We are red and our parent is red, therefore we must have a * grandfather and he must be black. */ grandpa = RB_FATHER(father); KASSERT(RB_BLACK_P(grandpa)); KASSERT(RB_DIR_RIGHT == 1 && RB_DIR_LEFT == 0); which = (father == grandpa->rb_right); other = which ^ RB_DIR_OTHER; uncle = grandpa->rb_nodes[other]; if (RB_BLACK_P(uncle)) break; RBSTAT_INC(rbt->rbt_insertion_rebalance_passes); /* * Case 1: our uncle is red * Simply invert the colors of our parent and * uncle and make our grandparent red. And * then solve the problem up at his level. */ RB_MARK_BLACK(uncle); RB_MARK_BLACK(father); if (__predict_false(RB_ROOT_P(rbt, grandpa))) { /* * If our grandpa is root, don't bother * setting him to red, just return. */ KASSERT(RB_BLACK_P(grandpa)); return; } RB_MARK_RED(grandpa); self = grandpa; father = RB_FATHER(self); KASSERT(RB_RED_P(self)); if (RB_BLACK_P(father)) { /* * If our greatgrandpa is black, we're done. */ KASSERT(RB_BLACK_P(rbt->rbt_root)); return; } } KASSERT(!RB_ROOT_P(rbt, self)); KASSERT(RB_RED_P(self)); KASSERT(RB_RED_P(father)); KASSERT(RB_BLACK_P(uncle)); KASSERT(RB_BLACK_P(grandpa)); /* * Case 2&3: our uncle is black. */ if (self == father->rb_nodes[other]) { /* * Case 2: we are on the same side as our uncle * Swap ourselves with our parent so this case * becomes case 3. Basically our parent becomes our * child. */ rb_tree_reparent_nodes(rbt, father, other); KASSERT(RB_FATHER(father) == self); KASSERT(self->rb_nodes[which] == father); KASSERT(RB_FATHER(self) == grandpa); self = father; father = RB_FATHER(self); } KASSERT(RB_RED_P(self) && RB_RED_P(father)); KASSERT(grandpa->rb_nodes[which] == father); /* * Case 3: we are opposite a child of a black uncle. * Swap our parent and grandparent. Since our grandfather * is black, our father will become black and our new sibling * (former grandparent) will become red. */ rb_tree_reparent_nodes(rbt, grandpa, which); KASSERT(RB_FATHER(self) == father); KASSERT(RB_FATHER(self)->rb_nodes[RB_POSITION(self) ^ RB_DIR_OTHER] == grandpa); KASSERT(RB_RED_P(self)); KASSERT(RB_BLACK_P(father)); KASSERT(RB_RED_P(grandpa)); /* * Final step: Set the root to black. */ RB_MARK_BLACK(rbt->rbt_root); } static void rb_tree_prune_node(struct rb_tree *rbt, struct rb_node *self, bool rebalance) { const unsigned int which = RB_POSITION(self); struct rb_node *father = RB_FATHER(self); #ifndef RBSMALL const bool was_root = RB_ROOT_P(rbt, self); #endif KASSERT(rebalance || (RB_ROOT_P(rbt, self) || RB_RED_P(self))); KASSERT(!rebalance || RB_BLACK_P(self)); KASSERT(RB_CHILDLESS_P(self)); KASSERT(rb_tree_check_node(rbt, self, NULL, false)); /* * Since we are childless, we know that self->rb_left is pointing * to the sentinel node. 
*/ father->rb_nodes[which] = self->rb_left; /* * Remove ourselves from the node list, decrement the count, * and update min/max. */ RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link); RBSTAT_DEC(rbt->rbt_count); #ifndef RBSMALL if (__predict_false(rbt->rbt_minmax[RB_POSITION(self)] == self)) { rbt->rbt_minmax[RB_POSITION(self)] = father; /* * When removing the root, rbt->rbt_minmax[RB_DIR_LEFT] is * updated automatically, but we also need to update * rbt->rbt_minmax[RB_DIR_RIGHT]; */ if (__predict_false(was_root)) { rbt->rbt_minmax[RB_DIR_RIGHT] = father; } } RB_SET_FATHER(self, NULL); #endif /* * Rebalance if requested. */ if (rebalance) rb_tree_removal_rebalance(rbt, father, which); KASSERT(was_root || rb_tree_check_node(rbt, father, NULL, true)); } /* * When deleting an interior node */ static void rb_tree_swap_prune_and_rebalance(struct rb_tree *rbt, struct rb_node *self, struct rb_node *standin) { const unsigned int standin_which = RB_POSITION(standin); unsigned int standin_other = standin_which ^ RB_DIR_OTHER; struct rb_node *standin_son; struct rb_node *standin_father = RB_FATHER(standin); bool rebalance = RB_BLACK_P(standin); if (standin_father == self) { /* * As a child of self, any childen would be opposite of * our parent. */ KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_other])); standin_son = standin->rb_nodes[standin_which]; } else { /* * Since we aren't a child of self, any childen would be * on the same side as our parent. */ KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_which])); standin_son = standin->rb_nodes[standin_other]; } /* * the node we are removing must have two children. */ KASSERT(RB_TWOCHILDREN_P(self)); /* * If standin has a child, it must be red. */ KASSERT(RB_SENTINEL_P(standin_son) || RB_RED_P(standin_son)); /* * Verify things are sane. */ KASSERT(rb_tree_check_node(rbt, self, NULL, false)); KASSERT(rb_tree_check_node(rbt, standin, NULL, false)); if (__predict_false(RB_RED_P(standin_son))) { /* * We know we have a red child so if we flip it to black * we don't have to rebalance. */ KASSERT(rb_tree_check_node(rbt, standin_son, NULL, true)); RB_MARK_BLACK(standin_son); rebalance = false; if (standin_father == self) { KASSERT(RB_POSITION(standin_son) == standin_which); } else { KASSERT(RB_POSITION(standin_son) == standin_other); /* * Change the son's parentage to point to his grandpa. */ RB_SET_FATHER(standin_son, standin_father); RB_SET_POSITION(standin_son, standin_which); } } if (standin_father == self) { /* * If we are about to delete the standin's father, then when * we call rebalance, we need to use ourselves as our father. * Otherwise remember our original father. Also, sincef we are * our standin's father we only need to reparent the standin's * brother. * * | R --> S | * | Q S --> Q T | * | t --> | */ KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_other])); KASSERT(!RB_SENTINEL_P(self->rb_nodes[standin_other])); KASSERT(self->rb_nodes[standin_which] == standin); /* * Have our son/standin adopt his brother as his new son. */ standin_father = standin; } else { /* * | R --> S . | * | / \ | T --> / \ | / | * | ..... | S --> ..... | T | * * Sever standin's connection to his father. */ standin_father->rb_nodes[standin_which] = standin_son; /* * Adopt the far son. */ standin->rb_nodes[standin_other] = self->rb_nodes[standin_other]; RB_SET_FATHER(standin->rb_nodes[standin_other], standin); KASSERT(RB_POSITION(self->rb_nodes[standin_other]) == standin_other); /* * Use standin_other because we need to preserve standin_which * for the removal_rebalance. 
*/ standin_other = standin_which; } /* * Move the only remaining son to our standin. If our standin is our * son, this will be the only son needed to be moved. */ KASSERT(standin->rb_nodes[standin_other] != self->rb_nodes[standin_other]); standin->rb_nodes[standin_other] = self->rb_nodes[standin_other]; RB_SET_FATHER(standin->rb_nodes[standin_other], standin); /* * Now copy the result of self to standin and then replace * self with standin in the tree. */ RB_COPY_PROPERTIES(standin, self); RB_SET_FATHER(standin, RB_FATHER(self)); RB_FATHER(standin)->rb_nodes[RB_POSITION(standin)] = standin; /* * Remove ourselves from the node list, decrement the count, * and update min/max. */ RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link); RBSTAT_DEC(rbt->rbt_count); #ifndef RBSMALL if (__predict_false(rbt->rbt_minmax[RB_POSITION(self)] == self)) rbt->rbt_minmax[RB_POSITION(self)] = RB_FATHER(self); RB_SET_FATHER(self, NULL); #endif KASSERT(rb_tree_check_node(rbt, standin, NULL, false)); KASSERT(RB_FATHER_SENTINEL_P(standin) || rb_tree_check_node(rbt, standin_father, NULL, false)); KASSERT(RB_LEFT_SENTINEL_P(standin) || rb_tree_check_node(rbt, standin->rb_left, NULL, false)); KASSERT(RB_RIGHT_SENTINEL_P(standin) || rb_tree_check_node(rbt, standin->rb_right, NULL, false)); if (!rebalance) return; rb_tree_removal_rebalance(rbt, standin_father, standin_which); KASSERT(rb_tree_check_node(rbt, standin, NULL, true)); } /* * We could do this by doing * rb_tree_node_swap(rbt, self, which); * rb_tree_prune_node(rbt, self, false); * * But it's more efficient to just evalate and recolor the child. */ static void rb_tree_prune_blackred_branch(struct rb_tree *rbt, struct rb_node *self, unsigned int which) { struct rb_node *father = RB_FATHER(self); struct rb_node *son = self->rb_nodes[which]; #ifndef RBSMALL const bool was_root = RB_ROOT_P(rbt, self); #endif KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT); KASSERT(RB_BLACK_P(self) && RB_RED_P(son)); KASSERT(!RB_TWOCHILDREN_P(son)); KASSERT(RB_CHILDLESS_P(son)); KASSERT(rb_tree_check_node(rbt, self, NULL, false)); KASSERT(rb_tree_check_node(rbt, son, NULL, false)); /* * Remove ourselves from the tree and give our former child our * properties (position, color, root). */ RB_COPY_PROPERTIES(son, self); father->rb_nodes[RB_POSITION(son)] = son; RB_SET_FATHER(son, father); /* * Remove ourselves from the node list, decrement the count, * and update minmax. */ RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link); RBSTAT_DEC(rbt->rbt_count); #ifndef RBSMALL if (__predict_false(was_root)) { KASSERT(rbt->rbt_minmax[which] == son); rbt->rbt_minmax[which ^ RB_DIR_OTHER] = son; } else if (rbt->rbt_minmax[RB_POSITION(self)] == self) { rbt->rbt_minmax[RB_POSITION(self)] = son; } RB_SET_FATHER(self, NULL); #endif KASSERT(was_root || rb_tree_check_node(rbt, father, NULL, true)); KASSERT(rb_tree_check_node(rbt, son, NULL, true)); } void rb_tree_remove_node(struct rb_tree *rbt, void *object) { const rb_tree_ops_t *rbto = rbt->rbt_ops; struct rb_node *standin, *self = RB_ITEMTONODE(rbto, object); unsigned int which; KASSERT(!RB_SENTINEL_P(self)); RBSTAT_INC(rbt->rbt_removals); /* * In the following diagrams, we (the node to be removed) are S. Red * nodes are lowercase. T could be either red or black. * * Remember the major axiom of the red-black tree: the number of * black nodes from the root to each leaf is constant across all * leaves, only the number of red nodes varies. * * Thus removing a red leaf doesn't require any other changes to a * red-black tree. 
So if we must remove a node, attempt to rearrange * the tree so we can remove a red node. * * The simplest case is a childless red node or a childless root node: * * | T --> T | or | R --> * | * | s --> * | */ if (RB_CHILDLESS_P(self)) { const bool rebalance = RB_BLACK_P(self) && !RB_ROOT_P(rbt, self); rb_tree_prune_node(rbt, self, rebalance); return; } KASSERT(!RB_CHILDLESS_P(self)); if (!RB_TWOCHILDREN_P(self)) { /* * The next simplest case is when the node we are deleting is * black and has one red child. * * | T --> T --> T | * | S --> R --> R | * | r --> s --> * | */ which = RB_LEFT_SENTINEL_P(self) ? RB_DIR_RIGHT : RB_DIR_LEFT; KASSERT(RB_BLACK_P(self)); KASSERT(RB_RED_P(self->rb_nodes[which])); KASSERT(RB_CHILDLESS_P(self->rb_nodes[which])); rb_tree_prune_blackred_branch(rbt, self, which); return; } KASSERT(RB_TWOCHILDREN_P(self)); /* * We invert these because we prefer to remove from the inside of * the tree. */ which = RB_POSITION(self) ^ RB_DIR_OTHER; /* * Let's find the node closest to us opposite of our parent. * Now swap it with ourself, "prune" it, and rebalance, if needed. */ standin = RB_ITEMTONODE(rbto, rb_tree_iterate(rbt, object, which)); rb_tree_swap_prune_and_rebalance(rbt, self, standin); } static void rb_tree_removal_rebalance(struct rb_tree *rbt, struct rb_node *parent, unsigned int which) { KASSERT(!RB_SENTINEL_P(parent)); KASSERT(RB_SENTINEL_P(parent->rb_nodes[which])); KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT); RBSTAT_INC(rbt->rbt_removal_rebalance_calls); while (RB_BLACK_P(parent->rb_nodes[which])) { unsigned int other = which ^ RB_DIR_OTHER; struct rb_node *brother = parent->rb_nodes[other]; RBSTAT_INC(rbt->rbt_removal_rebalance_passes); KASSERT(!RB_SENTINEL_P(brother)); /* * For cases 1, 2a, and 2b, our brother's children must * be black and our father must be black */ if (RB_BLACK_P(parent) && RB_BLACK_P(brother->rb_left) && RB_BLACK_P(brother->rb_right)) { if (RB_RED_P(brother)) { /* * Case 1: Our brother is red, swap its * position (and colors) with our parent. * This should now be case 2b (unless C or E * has a red child which is case 3; thus no * explicit branch to case 2b). * * B -> D * A d -> b E * C E -> A C */ KASSERT(RB_BLACK_P(parent)); rb_tree_reparent_nodes(rbt, parent, other); brother = parent->rb_nodes[other]; KASSERT(!RB_SENTINEL_P(brother)); KASSERT(RB_RED_P(parent)); KASSERT(RB_BLACK_P(brother)); KASSERT(rb_tree_check_node(rbt, brother, NULL, false)); KASSERT(rb_tree_check_node(rbt, parent, NULL, false)); } else { /* * Both our parent and brother are black. * Change our brother to red, advance up rank * and go through the loop again. * * B -> *B * *A D -> A d * C E -> C E */ RB_MARK_RED(brother); KASSERT(RB_BLACK_P(brother->rb_left)); KASSERT(RB_BLACK_P(brother->rb_right)); if (RB_ROOT_P(rbt, parent)) return; /* root == parent == black */ KASSERT(rb_tree_check_node(rbt, brother, NULL, false)); KASSERT(rb_tree_check_node(rbt, parent, NULL, false)); which = RB_POSITION(parent); parent = RB_FATHER(parent); continue; } } /* * Avoid an else here so that case 2a above can hit either * case 2b, 3, or 4. */ if (RB_RED_P(parent) && RB_BLACK_P(brother) && RB_BLACK_P(brother->rb_left) && RB_BLACK_P(brother->rb_right)) { KASSERT(RB_RED_P(parent)); KASSERT(RB_BLACK_P(brother)); KASSERT(RB_BLACK_P(brother->rb_left)); KASSERT(RB_BLACK_P(brother->rb_right)); /* * We are black, our father is red, our brother and * both nephews are black. Simply invert/exchange the * colors of our father and brother (to black and red * respectively).
* * | f --> F | * | * B --> * b | * | N N --> N N | */ RB_MARK_BLACK(parent); RB_MARK_RED(brother); KASSERT(rb_tree_check_node(rbt, brother, NULL, true)); break; /* We're done! */ } else { /* * Our brother must be black and have at least one * red child (it may have two). */ KASSERT(RB_BLACK_P(brother)); KASSERT(RB_RED_P(brother->rb_nodes[which]) || RB_RED_P(brother->rb_nodes[other])); if (RB_BLACK_P(brother->rb_nodes[other])) { /* * Case 3: our brother is black, our near * nephew is red, and our far nephew is black. * Swap our brother with our near nephew. * This result in a tree that matches case 4. * (Our father could be red or black). * * | F --> F | * | x B --> x B | * | n --> n | */ KASSERT(RB_RED_P(brother->rb_nodes[which])); rb_tree_reparent_nodes(rbt, brother, which); KASSERT(RB_FATHER(brother) == parent->rb_nodes[other]); brother = parent->rb_nodes[other]; KASSERT(RB_RED_P(brother->rb_nodes[other])); } /* * Case 4: our brother is black and our far nephew * is red. Swap our father and brother locations and * change our far nephew to black. (these can be * done in either order so we change the color first). * The result is a valid red-black tree and is a * terminal case. (again we don't care about the * father's color) * * If the father is red, we will get a red-black-black * tree: * | f -> f --> b | * | B -> B --> F N | * | n -> N --> | * * If the father is black, we will get an all black * tree: * | F -> F --> B | * | B -> B --> F N | * | n -> N --> | * * If we had two red nephews, then after the swap, * our former father would have a red grandson. */ KASSERT(RB_BLACK_P(brother)); KASSERT(RB_RED_P(brother->rb_nodes[other])); RB_MARK_BLACK(brother->rb_nodes[other]); rb_tree_reparent_nodes(rbt, parent, other); break; /* We're done! */ } } KASSERT(rb_tree_check_node(rbt, parent, NULL, true)); } void * rb_tree_iterate(struct rb_tree *rbt, void *object, const unsigned int direction) { const rb_tree_ops_t *rbto = rbt->rbt_ops; const unsigned int other = direction ^ RB_DIR_OTHER; struct rb_node *self; KASSERT(direction == RB_DIR_LEFT || direction == RB_DIR_RIGHT); if (object == NULL) { #ifndef RBSMALL if (RB_SENTINEL_P(rbt->rbt_root)) return NULL; return RB_NODETOITEM(rbto, rbt->rbt_minmax[direction]); #else self = rbt->rbt_root; if (RB_SENTINEL_P(self)) return NULL; while (!RB_SENTINEL_P(self->rb_nodes[direction])) self = self->rb_nodes[direction]; return RB_NODETOITEM(rbto, self); #endif /* !RBSMALL */ } self = RB_ITEMTONODE(rbto, object); KASSERT(!RB_SENTINEL_P(self)); /* * We can't go any further in this direction. We proceed up in the * opposite direction until our parent is in direction we want to go. */ if (RB_SENTINEL_P(self->rb_nodes[direction])) { while (!RB_ROOT_P(rbt, self)) { if (other == RB_POSITION(self)) return RB_NODETOITEM(rbto, RB_FATHER(self)); self = RB_FATHER(self); } return NULL; } /* * Advance down one in current direction and go down as far as possible * in the opposite direction. 
*/ self = self->rb_nodes[direction]; KASSERT(!RB_SENTINEL_P(self)); while (!RB_SENTINEL_P(self->rb_nodes[other])) self = self->rb_nodes[other]; return RB_NODETOITEM(rbto, self); } #ifdef RBDEBUG static const struct rb_node * rb_tree_iterate_const(const struct rb_tree *rbt, const struct rb_node *self, const unsigned int direction) { const unsigned int other = direction ^ RB_DIR_OTHER; KASSERT(direction == RB_DIR_LEFT || direction == RB_DIR_RIGHT); if (self == NULL) { #ifndef RBSMALL if (RB_SENTINEL_P(rbt->rbt_root)) return NULL; return rbt->rbt_minmax[direction]; #else self = rbt->rbt_root; if (RB_SENTINEL_P(self)) return NULL; while (!RB_SENTINEL_P(self->rb_nodes[direction])) self = self->rb_nodes[direction]; return self; #endif /* !RBSMALL */ } KASSERT(!RB_SENTINEL_P(self)); /* * We can't go any further in this direction. We proceed up in the * opposite direction until our parent is in direction we want to go. */ if (RB_SENTINEL_P(self->rb_nodes[direction])) { while (!RB_ROOT_P(rbt, self)) { if (other == RB_POSITION(self)) return RB_FATHER(self); self = RB_FATHER(self); } return NULL; } /* * Advance down one in current direction and go down as far as possible * in the opposite direction. */ self = self->rb_nodes[direction]; KASSERT(!RB_SENTINEL_P(self)); while (!RB_SENTINEL_P(self->rb_nodes[other])) self = self->rb_nodes[other]; return self; } static unsigned int rb_tree_count_black(const struct rb_node *self) { unsigned int left, right; if (RB_SENTINEL_P(self)) return 0; left = rb_tree_count_black(self->rb_left); right = rb_tree_count_black(self->rb_right); KASSERT(left == right); return left + RB_BLACK_P(self); } static bool rb_tree_check_node(const struct rb_tree *rbt, const struct rb_node *self, const struct rb_node *prev, bool red_check) { const rb_tree_ops_t *rbto = rbt->rbt_ops; rbto_compare_nodes_fn compare_nodes = rbto->rbto_compare_nodes; KASSERT(!RB_SENTINEL_P(self)); KASSERT(prev == NULL || (*compare_nodes)(rbto->rbto_context, RB_NODETOITEM(rbto, prev), RB_NODETOITEM(rbto, self)) < 0); /* * Verify our relationship to our parent. */ if (RB_ROOT_P(rbt, self)) { KASSERT(self == rbt->rbt_root); KASSERT(RB_POSITION(self) == RB_DIR_LEFT); KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_LEFT] == self); KASSERT(RB_FATHER(self) == (const struct rb_node *) &rbt->rbt_root); } else { int diff = (*compare_nodes)(rbto->rbto_context, RB_NODETOITEM(rbto, self), RB_NODETOITEM(rbto, RB_FATHER(self))); KASSERT(self != rbt->rbt_root); KASSERT(!RB_FATHER_SENTINEL_P(self)); if (RB_POSITION(self) == RB_DIR_LEFT) { KASSERT(diff < 0); KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_LEFT] == self); } else { KASSERT(diff > 0); KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_RIGHT] == self); } } /* * Verify our position in the linked list against the tree itself. */ { const struct rb_node *prev0 = rb_tree_iterate_const(rbt, self, RB_DIR_LEFT); const struct rb_node *next0 = rb_tree_iterate_const(rbt, self, RB_DIR_RIGHT); KASSERT(prev0 == TAILQ_PREV(self, rb_node_qh, rb_link)); KASSERT(next0 == TAILQ_NEXT(self, rb_link)); #ifndef RBSMALL KASSERT(prev0 != NULL || self == rbt->rbt_minmax[RB_DIR_LEFT]); KASSERT(next0 != NULL || self == rbt->rbt_minmax[RB_DIR_RIGHT]); #endif } /* * The root must be black. * There can never be two adjacent red nodes. 
*/ if (red_check) { KASSERT(!RB_ROOT_P(rbt, self) || RB_BLACK_P(self)); (void) rb_tree_count_black(self); if (RB_RED_P(self)) { const struct rb_node *brother; KASSERT(!RB_ROOT_P(rbt, self)); brother = RB_FATHER(self)->rb_nodes[RB_POSITION(self) ^ RB_DIR_OTHER]; KASSERT(RB_BLACK_P(RB_FATHER(self))); /* * If I'm red and have no children, then I must either * have no brother or my brother must also be red and * also have no children. (black count == 0) */ KASSERT(!RB_CHILDLESS_P(self) || RB_SENTINEL_P(brother) || RB_RED_P(brother) || RB_CHILDLESS_P(brother)); /* * If I'm not childless, I must have two children * and they must both be black. */ KASSERT(RB_CHILDLESS_P(self) || (RB_TWOCHILDREN_P(self) && RB_BLACK_P(self->rb_left) && RB_BLACK_P(self->rb_right))); /* * If I'm not childless, and thus have black children, * then my brother must either be black or have two * black children. */ KASSERT(RB_CHILDLESS_P(self) || RB_BLACK_P(brother) || (RB_TWOCHILDREN_P(brother) && RB_BLACK_P(brother->rb_left) && RB_BLACK_P(brother->rb_right))); } else { /* * If I'm black and have one child, that child must * be red and childless. */ KASSERT(RB_CHILDLESS_P(self) || RB_TWOCHILDREN_P(self) || (!RB_LEFT_SENTINEL_P(self) && RB_RIGHT_SENTINEL_P(self) && RB_RED_P(self->rb_left) && RB_CHILDLESS_P(self->rb_left)) || (!RB_RIGHT_SENTINEL_P(self) && RB_LEFT_SENTINEL_P(self) && RB_RED_P(self->rb_right) && RB_CHILDLESS_P(self->rb_right))); /* * If I'm a childless black node and my parent is * black, my 2nd closest relative away from my parent * is either red or has a red parent or red children. */ if (!RB_ROOT_P(rbt, self) && RB_CHILDLESS_P(self) && RB_BLACK_P(RB_FATHER(self))) { const unsigned int which = RB_POSITION(self); const unsigned int other = which ^ RB_DIR_OTHER; const struct rb_node *relative0, *relative; relative0 = rb_tree_iterate_const(rbt, self, other); KASSERT(relative0 != NULL); relative = rb_tree_iterate_const(rbt, relative0, other); KASSERT(relative != NULL); KASSERT(RB_SENTINEL_P(relative->rb_nodes[which])); #if 0 KASSERT(RB_RED_P(relative) || RB_RED_P(relative->rb_left) || RB_RED_P(relative->rb_right) || RB_RED_P(RB_FATHER(relative))); #endif } } /* * A grandparent's children must be real nodes and not * sentinels. First check our grandparent. */ KASSERT(RB_ROOT_P(rbt, self) || RB_ROOT_P(rbt, RB_FATHER(self)) || RB_TWOCHILDREN_P(RB_FATHER(RB_FATHER(self)))); /* * If we have grandchildren on our left, then * we must have a child on our right. */ KASSERT(RB_LEFT_SENTINEL_P(self) || RB_CHILDLESS_P(self->rb_left) || !RB_RIGHT_SENTINEL_P(self)); /* * If we have grandchildren on our right, then * we must have a child on our left. */ KASSERT(RB_RIGHT_SENTINEL_P(self) || RB_CHILDLESS_P(self->rb_right) || !RB_LEFT_SENTINEL_P(self)); /* * If we have a child on the left and it doesn't have two * children, make sure we don't have great-great-grandchildren on * the right. */ KASSERT(RB_TWOCHILDREN_P(self->rb_left) || RB_CHILDLESS_P(self->rb_right) || RB_CHILDLESS_P(self->rb_right->rb_left) || RB_CHILDLESS_P(self->rb_right->rb_left->rb_left) || RB_CHILDLESS_P(self->rb_right->rb_left->rb_right) || RB_CHILDLESS_P(self->rb_right->rb_right) || RB_CHILDLESS_P(self->rb_right->rb_right->rb_left) || RB_CHILDLESS_P(self->rb_right->rb_right->rb_right)); /* * If we have a child on the right and it doesn't have two * children, make sure we don't have great-great-grandchildren on * the left.
*/ KASSERT(RB_TWOCHILDREN_P(self->rb_right) || RB_CHILDLESS_P(self->rb_left) || RB_CHILDLESS_P(self->rb_left->rb_left) || RB_CHILDLESS_P(self->rb_left->rb_left->rb_left) || RB_CHILDLESS_P(self->rb_left->rb_left->rb_right) || RB_CHILDLESS_P(self->rb_left->rb_right) || RB_CHILDLESS_P(self->rb_left->rb_right->rb_left) || RB_CHILDLESS_P(self->rb_left->rb_right->rb_right)); /* * If we are fully interior node, then our predecessors and * successors must have no children in our direction. */ if (RB_TWOCHILDREN_P(self)) { const struct rb_node *prev0; const struct rb_node *next0; prev0 = rb_tree_iterate_const(rbt, self, RB_DIR_LEFT); KASSERT(prev0 != NULL); KASSERT(RB_RIGHT_SENTINEL_P(prev0)); next0 = rb_tree_iterate_const(rbt, self, RB_DIR_RIGHT); KASSERT(next0 != NULL); KASSERT(RB_LEFT_SENTINEL_P(next0)); } } return true; } void rb_tree_check(const struct rb_tree *rbt, bool red_check) { const struct rb_node *self; const struct rb_node *prev; #ifdef RBSTATS unsigned int count = 0; #endif KASSERT(rbt->rbt_root != NULL); KASSERT(RB_LEFT_P(rbt->rbt_root)); #if defined(RBSTATS) && !defined(RBSMALL) KASSERT(rbt->rbt_count > 1 || rbt->rbt_minmax[RB_DIR_LEFT] == rbt->rbt_minmax[RB_DIR_RIGHT]); #endif prev = NULL; TAILQ_FOREACH(self, &rbt->rbt_nodes, rb_link) { rb_tree_check_node(rbt, self, prev, false); #ifdef RBSTATS count++; #endif } #ifdef RBSTATS KASSERT(rbt->rbt_count == count); #endif if (red_check) { KASSERT(RB_BLACK_P(rbt->rbt_root)); KASSERT(RB_SENTINEL_P(rbt->rbt_root) || rb_tree_count_black(rbt->rbt_root)); /* * The root must be black. * There can never be two adjacent red nodes. */ TAILQ_FOREACH(self, &rbt->rbt_nodes, rb_link) { rb_tree_check_node(rbt, self, NULL, true); } } } #endif /* RBDEBUG */ #ifdef RBSTATS static void rb_tree_mark_depth(const struct rb_tree *rbt, const struct rb_node *self, size_t *depths, size_t depth) { if (RB_SENTINEL_P(self)) return; if (RB_TWOCHILDREN_P(self)) { rb_tree_mark_depth(rbt, self->rb_left, depths, depth + 1); rb_tree_mark_depth(rbt, self->rb_right, depths, depth + 1); return; } depths[depth]++; if (!RB_LEFT_SENTINEL_P(self)) { rb_tree_mark_depth(rbt, self->rb_left, depths, depth + 1); } if (!RB_RIGHT_SENTINEL_P(self)) { rb_tree_mark_depth(rbt, self->rb_right, depths, depth + 1); } } void rb_tree_depths(const struct rb_tree *rbt, size_t *depths) { rb_tree_mark_depth(rbt, rbt->rbt_root, depths, 1); } #endif /* RBSTATS */
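The tree above is driven entirely through the rb_tree_ops_t table and the public entry points (rb_tree_init, rb_tree_insert_node, rb_tree_find_node, rb_tree_remove_node, rb_tree_iterate). The following is a minimal, illustrative consumer following the rbtree(3) conventions this file implements; the example_node type, its fields, and the example() driver are hypothetical and not part of the original source.

#include <stddef.h>
#include <sys/rbtree.h>

struct example_node {
	rb_node_t en_rbnode;		/* embedded red-black tree linkage */
	int en_key;
};

static signed int
example_compare_nodes(void *ctx, const void *a, const void *b)
{
	const struct example_node *ea = a, *eb = b;

	/* Return <0, 0, or >0 ordering, as the rb_tree code expects. */
	return (ea->en_key < eb->en_key) ? -1 : (ea->en_key > eb->en_key);
}

static signed int
example_compare_key(void *ctx, const void *n, const void *key)
{
	const struct example_node *en = n;
	const int k = *(const int *)key;

	return (en->en_key < k) ? -1 : (en->en_key > k);
}

static const rb_tree_ops_t example_ops = {
	.rbto_compare_nodes = example_compare_nodes,
	.rbto_compare_key = example_compare_key,
	.rbto_node_offset = offsetof(struct example_node, en_rbnode),
	.rbto_context = NULL,
};

static void
example(void)
{
	rb_tree_t tree;
	struct example_node a = { .en_key = 1 }, b = { .en_key = 2 };
	struct example_node *found;
	int key = 2;

	rb_tree_init(&tree, &example_ops);
	(void)rb_tree_insert_node(&tree, &a);	/* returns the node now in the tree */
	(void)rb_tree_insert_node(&tree, &b);
	found = rb_tree_find_node(&tree, &key);	/* finds &b */
	if (found != NULL)
		rb_tree_remove_node(&tree, found);
}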
/*- * Copyright (c) 2009-2016 The NetBSD Foundation, Inc. * All rights reserved. * * This material is based upon work partially supported by The * NetBSD Foundation under a contract with Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * NPF main: dynamic load/initialisation and unload routines. */ #ifdef _KERNEL #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: npf_os.c,v 1.21 2021/01/27 17:39:13 christos Exp $"); #ifdef _KERNEL_OPT #include "pf.h" #if NPF > 0 #error "NPF and PF are mutually exclusive; please select one" #endif #endif #include <sys/param.h> #include <sys/types.h> #include <sys/conf.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/lwp.h> #include <sys/module.h> #include <sys/pserialize.h> #include <sys/socketvar.h> #include <sys/uio.h> #include <netinet/in.h> #include <netinet6/in6_var.h> #endif #include "npf_impl.h" #include "npfkern.h" #ifdef _KERNEL #ifndef _MODULE #include "opt_modular.h" #include "opt_net_mpsafe.h" #endif #include "ioconf.h" #endif /* * Module and device structures. */ #ifndef _MODULE /* * Modular kernels load drivers too early, and we need percpu to be inited * So we make this misc; a better way would be to have early boot and late * boot drivers. */ MODULE(MODULE_CLASS_MISC, npf, "bpf"); #else /* This module autoloads via /dev/npf so it needs to be a driver */ MODULE(MODULE_CLASS_DRIVER, npf, "bpf"); #endif #define NPF_IOCTL_DATA_LIMIT (4 * 1024 * 1024) static int npf_pfil_register(bool); static void npf_pfil_unregister(bool); static int npf_dev_open(dev_t, int, int, lwp_t *); static int npf_dev_close(dev_t, int, int, lwp_t *); static int npf_dev_ioctl(dev_t, u_long, void *, int, lwp_t *); static int npf_dev_poll(dev_t, int, lwp_t *); static int npf_dev_read(dev_t, struct uio *, int); const struct cdevsw npf_cdevsw = { .d_open = npf_dev_open, .d_close = npf_dev_close, .d_read = npf_dev_read, .d_write = nowrite, .d_ioctl = npf_dev_ioctl, .d_stop = nostop, .d_tty = notty, .d_poll = npf_dev_poll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE }; static const char * npf_ifop_getname(npf_t *, ifnet_t *); static ifnet_t * npf_ifop_lookup(npf_t *, const char *); static void npf_ifop_flush(npf_t *, void *); static void * npf_ifop_getmeta(npf_t *, const ifnet_t *); static void npf_ifop_setmeta(npf_t *, ifnet_t *, void *); static const unsigned nworkers = 1; static bool pfil_registered = false; static pfil_head_t * npf_ph_if = NULL; static pfil_head_t * npf_ph_inet = NULL; static pfil_head_t * npf_ph_inet6 = NULL; static const npf_ifops_t kern_ifops = { .getname = npf_ifop_getname, .lookup = npf_ifop_lookup, .flush = npf_ifop_flush, .getmeta = npf_ifop_getmeta, .setmeta = npf_ifop_setmeta, }; static int npf_fini(void) { npf_t *npf = npf_getkernctx(); /* At first, detach device and remove pfil hooks. 
*/ #ifdef _MODULE devsw_detach(NULL, &npf_cdevsw); #endif npf_pfil_unregister(true); npfk_destroy(npf); npfk_sysfini(); return 0; } static int npf_init(void) { npf_t *npf; int error = 0; error = npfk_sysinit(nworkers); if (error) return error; npf = npfk_create(0, NULL, &kern_ifops, NULL); npf_setkernctx(npf); npf_pfil_register(true); #ifdef _MODULE devmajor_t bmajor = NODEVMAJOR, cmajor = NODEVMAJOR; /* Attach /dev/npf device. */ error = devsw_attach("npf", NULL, &bmajor, &npf_cdevsw, &cmajor); if (error) { /* It will call devsw_detach(), which is safe. */ (void)npf_fini(); } #endif return error; } /* * Module interface. */ static int npf_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return npf_init(); case MODULE_CMD_FINI: return npf_fini(); case MODULE_CMD_AUTOUNLOAD: if (npf_autounload_p()) { return EBUSY; } break; default: return ENOTTY; } return 0; } void npfattach(int nunits) { /* Nothing */ } static int npf_dev_open(dev_t dev, int flag, int mode, lwp_t *l) { /* Available only for super-user. */ if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_FIREWALL, KAUTH_REQ_NETWORK_FIREWALL_FW, NULL, NULL, NULL)) { return EPERM; } return 0; } static int npf_dev_close(dev_t dev, int flag, int mode, lwp_t *l) { return 0; } static int npf_stats_export(npf_t *npf, void *data) { uint64_t *fullst, *uptr = *(uint64_t **)data; int error; fullst = kmem_alloc(NPF_STATS_SIZE, KM_SLEEP); npfk_stats(npf, fullst); /* will zero the buffer */ error = copyout(fullst, uptr, NPF_STATS_SIZE); kmem_free(fullst, NPF_STATS_SIZE); return error; } /* * npfctl_switch: enable or disable packet inspection. */ static int npfctl_switch(void *data) { const bool onoff = *(int *)data ? true : false; int error; if (onoff) { /* Enable: add pfil hooks. */ error = npf_pfil_register(false); } else { /* Disable: remove pfil hooks. */ npf_pfil_unregister(false); error = 0; } return error; } static int npf_dev_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l) { npf_t *npf = npf_getkernctx(); nvlist_t *req, *resp; int error; /* Available only for super-user. */ if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_FIREWALL, KAUTH_REQ_NETWORK_FIREWALL_FW, NULL, NULL, NULL)) { return EPERM; } switch (cmd) { case IOC_NPF_VERSION: *(int *)data = NPF_VERSION; return 0; case IOC_NPF_SWITCH: return npfctl_switch(data); case IOC_NPF_TABLE: return npfctl_table(npf, data); case IOC_NPF_STATS: return npf_stats_export(npf, data); case IOC_NPF_LOAD: case IOC_NPF_SAVE: case IOC_NPF_RULE: case IOC_NPF_CONN_LOOKUP: case IOC_NPF_TABLE_REPLACE: /* nvlist_ref_t argument, handled below */ break; default: return EINVAL; } error = nvlist_copyin(data, &req, NPF_IOCTL_DATA_LIMIT); if (__predict_false(error)) { #ifdef __NetBSD__ /* Until the version bump. */ if (cmd != IOC_NPF_SAVE) { return error; } req = nvlist_create(0); #else return error; #endif } resp = nvlist_create(0); if ((error = npfctl_run_op(npf, cmd, req, resp)) == 0) { error = nvlist_copyout(data, resp); } nvlist_destroy(resp); nvlist_destroy(req); return error; } static int npf_dev_poll(dev_t dev, int events, lwp_t *l) { return ENOTSUP; } static int npf_dev_read(dev_t dev, struct uio *uio, int flag) { return ENOTSUP; } bool npf_autounload_p(void) { if (npf_active_p()) return false; npf_t *npf = npf_getkernctx(); npf_config_enter(npf); bool pass = npf_default_pass(npf); npf_config_exit(npf); return pass; } /* * Interface operations. 
*/ static const char * npf_ifop_getname(npf_t *npf __unused, ifnet_t *ifp) { return ifp->if_xname; } static ifnet_t * npf_ifop_lookup(npf_t *npf __unused, const char *name) { return ifunit(name); } static void npf_ifop_flush(npf_t *npf __unused, void *arg) { ifnet_t *ifp; KERNEL_LOCK(1, NULL); IFNET_GLOBAL_LOCK(); IFNET_WRITER_FOREACH(ifp) { ifp->if_npf_private = arg; } IFNET_GLOBAL_UNLOCK(); KERNEL_UNLOCK_ONE(NULL); } static void * npf_ifop_getmeta(npf_t *npf __unused, const ifnet_t *ifp) { return ifp->if_npf_private; } static void npf_ifop_setmeta(npf_t *npf __unused, ifnet_t *ifp, void *arg) { ifp->if_npf_private = arg; } #ifdef _KERNEL /* * Wrapper of the main packet handler to pass the kernel NPF context. */ static int npfos_packet_handler(void *arg, struct mbuf **mp, ifnet_t *ifp, int di) { npf_t *npf = npf_getkernctx(); return npfk_packet_handler(npf, mp, ifp, di); } /* * npf_ifhook: hook handling interface changes. */ static void npf_ifhook(void *arg, unsigned long cmd, void *arg2) { npf_t *npf = npf_getkernctx(); ifnet_t *ifp = arg2; switch (cmd) { case PFIL_IFNET_ATTACH: npfk_ifmap_attach(npf, ifp); npf_ifaddr_sync(npf, ifp); break; case PFIL_IFNET_DETACH: npfk_ifmap_detach(npf, ifp); npf_ifaddr_flush(npf, ifp); break; } } static void npf_ifaddrhook(void *arg, u_long cmd, void *arg2) { npf_t *npf = npf_getkernctx(); struct ifaddr *ifa = arg2; switch (cmd) { case SIOCSIFADDR: case SIOCAIFADDR: case SIOCDIFADDR: #ifdef INET6 case SIOCSIFADDR_IN6: case SIOCAIFADDR_IN6: case SIOCDIFADDR_IN6: #endif KASSERT(ifa != NULL); break; default: return; } npf_ifaddr_sync(npf, ifa->ifa_ifp); } /* * npf_pfil_register: register pfil(9) hooks. */ static int npf_pfil_register(bool init) { npf_t *npf = npf_getkernctx(); int error = 0; SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE(); /* Init: interface re-config and attach/detach hook. */ if (!npf_ph_if) { npf_ph_if = pfil_head_get(PFIL_TYPE_IFNET, 0); if (!npf_ph_if) { error = ENOENT; goto out; } error = pfil_add_ihook(npf_ifhook, NULL, PFIL_IFNET, npf_ph_if); KASSERT(error == 0); error = pfil_add_ihook(npf_ifaddrhook, NULL, PFIL_IFADDR, npf_ph_if); KASSERT(error == 0); } if (init) { goto out; } /* Check if pfil hooks are not already registered. */ if (pfil_registered) { error = EEXIST; goto out; } /* Capture points of the activity in the IP layer. */ npf_ph_inet = pfil_head_get(PFIL_TYPE_AF, (void *)AF_INET); npf_ph_inet6 = pfil_head_get(PFIL_TYPE_AF, (void *)AF_INET6); if (!npf_ph_inet && !npf_ph_inet6) { error = ENOENT; goto out; } /* Packet IN/OUT handlers for IP layer. */ if (npf_ph_inet) { error = pfil_add_hook(npfos_packet_handler, npf, PFIL_ALL, npf_ph_inet); KASSERT(error == 0); } if (npf_ph_inet6) { error = pfil_add_hook(npfos_packet_handler, npf, PFIL_ALL, npf_ph_inet6); KASSERT(error == 0); } /* * It is necessary to re-sync all/any interface address tables, * since we did not listen for any changes. */ npf_ifaddr_syncall(npf); pfil_registered = true; out: SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); return error; } /* * npf_pfil_unregister: unregister pfil(9) hooks. 
*/ static void npf_pfil_unregister(bool fini) { npf_t *npf = npf_getkernctx(); SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE(); if (fini && npf_ph_if) { (void)pfil_remove_ihook(npf_ifhook, NULL, PFIL_IFNET, npf_ph_if); (void)pfil_remove_ihook(npf_ifaddrhook, NULL, PFIL_IFADDR, npf_ph_if); } if (npf_ph_inet) { (void)pfil_remove_hook(npfos_packet_handler, npf, PFIL_ALL, npf_ph_inet); } if (npf_ph_inet6) { (void)pfil_remove_hook(npfos_packet_handler, npf, PFIL_ALL, npf_ph_inet6); } pfil_registered = false; SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } bool npf_active_p(void) { return pfil_registered; } #endif #ifdef __NetBSD__ /* * Epoch-Based Reclamation (EBR) wrappers: in NetBSD, we rely on the * passive serialization mechanism (see pserialize(9) manual page), * which provides sufficient guarantees for NPF. */ ebr_t * npf_ebr_create(void) { return pserialize_create(); } void npf_ebr_destroy(ebr_t *ebr) { pserialize_destroy(ebr); } void npf_ebr_register(ebr_t *ebr) { KASSERT(ebr != NULL); (void)ebr; } void npf_ebr_unregister(ebr_t *ebr) { KASSERT(ebr != NULL); (void)ebr; } int npf_ebr_enter(ebr_t *ebr) { KASSERT(ebr != NULL); (void)ebr; return pserialize_read_enter(); } void npf_ebr_exit(ebr_t *ebr, int s) { KASSERT(ebr != NULL); (void)ebr; pserialize_read_exit(s); } void npf_ebr_full_sync(ebr_t *ebr) { pserialize_perform(ebr); } bool npf_ebr_incrit_p(ebr_t *ebr) { KASSERT(ebr != NULL); (void)ebr; return pserialize_in_read_section(); } #endif
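The EBR wrappers above reduce to pserialize(9), so the usual pattern for NPF's lock-free readers is: readers bracket lookups with npf_ebr_enter()/npf_ebr_exit(), while a writer unpublishes an object and calls npf_ebr_full_sync() before freeing it. The sketch below illustrates that pattern under the assumption that the npf_ebr_* declarations from npf_impl.h are in scope; the example_entry structure, the published pointer, and both functions are hypothetical, and only the npf_ebr_* calls and kmem_free() correspond to interfaces used in this file.

#include <sys/atomic.h>
#include <sys/kmem.h>

struct example_entry {
	int value;
};

static struct example_entry *published;	/* pointer visible to readers */
static ebr_t *example_ebr;		/* obtained from npf_ebr_create() */

static int
example_reader(void)
{
	struct example_entry *e;
	int s, v = -1;

	s = npf_ebr_enter(example_ebr);		/* pserialize_read_enter() */
	e = atomic_load_consume(&published);
	if (e != NULL)
		v = e->value;
	npf_ebr_exit(example_ebr, s);		/* pserialize_read_exit() */
	return v;
}

static void
example_writer_retire(struct example_entry *olde)
{
	/* Writers are assumed to be serialized externally (e.g. by a mutex). */
	atomic_store_relaxed(&published, NULL);	/* unpublish */
	npf_ebr_full_sync(example_ebr);		/* wait for readers to drain */
	kmem_free(olde, sizeof(*olde));
}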
/* $NetBSD: subr_blist.c,v 1.15 2022/05/31 08:43:16 andvar Exp $ */ /*- * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting * * This module implements a general bitmap allocator/deallocator. The * allocator eats around 2 bits per 'block'. The module does not * try to interpret the meaning of a 'block' other than to return * BLIST_NONE on an allocation failure. * * A radix tree is used to maintain the bitmap. Two radix constants are * involved: One for the bitmaps contained in the leaf nodes (typically * 32), and one for the meta nodes (typically 16). Both meta and leaf * nodes have a hint field. This field gives us a hint as to the largest * free contiguous range of blocks under the node. It may contain a * value that is too high, but will never contain a value that is too * low. When the radix tree is searched, allocation failures in subtrees * update the hint.
* * The radix tree also implements two collapsed states for meta nodes: * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is * in either of these two states, all information contained underneath * the node is considered stale. These states are used to optimize * allocation and freeing operations. * * The hinting greatly increases code efficiency for allocations while * the general radix structure optimizes both allocations and frees. The * radix tree should be able to operate well no matter how much * fragmentation there is and no matter how large a bitmap is used. * * Unlike the rlist code, the blist code wires all necessary memory at * creation time. Neither allocations nor frees require interaction with * the memory subsystem. In contrast, the rlist code may allocate memory * on an rlist_free() call. The non-blocking features of the blist code * are used to great advantage in the swap code (vm/nswap_pager.c). The * rlist code uses a little less overall memory than the blist code (but * due to swap interleaving not all that much less), but the blist code * scales much, much better. * * LAYOUT: The radix tree is laid out recursively using a * linear array. Each meta node is immediately followed (laid out * sequentially in memory) by BLIST_META_RADIX lower level nodes. This * is a recursive structure but one that can be easily scanned through * a very simple 'skip' calculation. In order to support large radixes, * portions of the tree may reside outside our memory allocation. We * handle this with an early-termination optimization (when bighint is * set to -1) on the scan. The memory allocation is only large enough * to cover the number of blocks requested at creation time even if it * must be encompassed in larger root-node radix. * * NOTE: the allocator cannot currently allocate more than * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too * large' if you try. This is an area that could use improvement. The * radix is large enough that this restriction does not effect the swap * system, though. Currently only the allocation code is effected by * this algorithmic unfeature. The freeing code can handle arbitrary * ranges. * * This code can be compiled stand-alone for debugging. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_blist.c,v 1.15 2022/05/31 08:43:16 andvar Exp $"); #if 0 __FBSDID("$FreeBSD: src/sys/kern/subr_blist.c,v 1.17 2004/06/04 04:03:25 alc Exp $"); #endif #ifdef _KERNEL #include <sys/param.h> #include <sys/systm.h> #include <sys/blist.h> #include <sys/kmem.h> #else #ifndef BLIST_NO_DEBUG #define BLIST_DEBUG #endif #include <sys/types.h> #include <stdio.h> #include <string.h> #include <stdlib.h> #include <stdarg.h> #include <inttypes.h> #define KM_SLEEP 1 #define kmem_zalloc(a,b) calloc(1, (a)) #define kmem_alloc(a,b) malloc(a) #define kmem_free(a,b) free(a) #include "../sys/blist.h" void panic(const char *ctl, ...) __printflike(1, 2); #endif /* * blmeta and bl_bitmap_t MUST be a power of 2 in size. 
*/ typedef struct blmeta { union { blist_blkno_t bmu_avail; /* space available under us */ blist_bitmap_t bmu_bitmap; /* bitmap if we are a leaf */ } u; blist_blkno_t bm_bighint; /* biggest contiguous block hint*/ } blmeta_t; struct blist { blist_blkno_t bl_blocks; /* area of coverage */ blist_blkno_t bl_radix; /* coverage radix */ blist_blkno_t bl_skip; /* starting skip */ blist_blkno_t bl_free; /* number of free blocks */ blmeta_t *bl_root; /* root of radix tree */ blist_blkno_t bl_rootblks; /* blks allocated for tree */ }; #define BLIST_META_RADIX 16 /* * static support functions */ static blist_blkno_t blst_leaf_alloc(blmeta_t *scan, blist_blkno_t blk, int count); static blist_blkno_t blst_meta_alloc(blmeta_t *scan, blist_blkno_t blk, blist_blkno_t count, blist_blkno_t radix, blist_blkno_t skip); static void blst_leaf_free(blmeta_t *scan, blist_blkno_t relblk, int count); static void blst_meta_free(blmeta_t *scan, blist_blkno_t freeBlk, blist_blkno_t count, blist_blkno_t radix, blist_blkno_t skip, blist_blkno_t blk); static void blst_copy(blmeta_t *scan, blist_blkno_t blk, blist_blkno_t radix, blist_blkno_t skip, blist_t dest, blist_blkno_t count); static int blst_leaf_fill(blmeta_t *scan, blist_blkno_t blk, int count); static blist_blkno_t blst_meta_fill(blmeta_t *scan, blist_blkno_t allocBlk, blist_blkno_t count, blist_blkno_t radix, blist_blkno_t skip, blist_blkno_t blk); static blist_blkno_t blst_radix_init(blmeta_t *scan, blist_blkno_t radix, blist_blkno_t skip, blist_blkno_t count); #ifndef _KERNEL static void blst_radix_print(blmeta_t *scan, blist_blkno_t blk, blist_blkno_t radix, blist_blkno_t skip, int tab); #endif /* * blist_create() - create a blist capable of handling up to the specified * number of blocks * * blocks must be greater than 0 * * The smallest blist consists of a single leaf node capable of * managing BLIST_BMAP_RADIX blocks. */ blist_t blist_create(blist_blkno_t blocks) { blist_t bl; blist_blkno_t radix; blist_blkno_t skip = 0; /* * Calculate radix and skip field used for scanning. * * XXX check overflow */ radix = BLIST_BMAP_RADIX; while (radix < blocks) { radix *= BLIST_META_RADIX; skip = (skip + 1) * BLIST_META_RADIX; } bl = kmem_zalloc(sizeof(struct blist), KM_SLEEP); bl->bl_blocks = blocks; bl->bl_radix = radix; bl->bl_skip = skip; bl->bl_rootblks = 1 + blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks); bl->bl_root = kmem_alloc(sizeof(blmeta_t) * bl->bl_rootblks, KM_SLEEP); #if defined(BLIST_DEBUG) printf( "BLIST representing %" PRIu64 " blocks (%" PRIu64 " MB of swap)" ", requiring %" PRIu64 "K of ram\n", (uint64_t)bl->bl_blocks, (uint64_t)bl->bl_blocks * 4 / 1024, ((uint64_t)bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024 ); printf("BLIST raw radix tree contains %" PRIu64 " records\n", (uint64_t)bl->bl_rootblks); #endif blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks); return(bl); } void blist_destroy(blist_t bl) { kmem_free(bl->bl_root, sizeof(blmeta_t) * bl->bl_rootblks); kmem_free(bl, sizeof(struct blist)); } /* * blist_alloc() - reserve space in the block bitmap. Return the base * of a contiguous region or BLIST_NONE if space could * not be allocated. 
*/ blist_blkno_t blist_alloc(blist_t bl, blist_blkno_t count) { blist_blkno_t blk = BLIST_NONE; if (bl) { if (bl->bl_radix == BLIST_BMAP_RADIX) blk = blst_leaf_alloc(bl->bl_root, 0, count); else blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip); if (blk != BLIST_NONE) bl->bl_free -= count; } return(blk); } /* * blist_free() - free up space in the block bitmap. Return the base * of a contiguous region. Panic if an inconsistency is * found. */ void blist_free(blist_t bl, blist_blkno_t blkno, blist_blkno_t count) { if (bl) { if (bl->bl_radix == BLIST_BMAP_RADIX) blst_leaf_free(bl->bl_root, blkno, count); else blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0); bl->bl_free += count; } } /* * blist_fill() - mark a region in the block bitmap as off-limits * to the allocator (i.e. allocate it), ignoring any * existing allocations. Return the number of blocks * actually filled that were free before the call. */ blist_blkno_t blist_fill(blist_t bl, blist_blkno_t blkno, blist_blkno_t count) { blist_blkno_t filled; if (bl) { if (bl->bl_radix == BLIST_BMAP_RADIX) filled = blst_leaf_fill(bl->bl_root, blkno, count); else filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0); bl->bl_free -= filled; return filled; } else return 0; } /* * blist_resize() - resize an existing radix tree to handle the * specified number of blocks. This will reallocate * the tree and transfer the previous bitmap to the new * one. When extending the tree you can specify whether * the new blocks are to left allocated or freed. */ void blist_resize(blist_t *pbl, blist_blkno_t count, int freenew) { blist_t newbl = blist_create(count); blist_t save = *pbl; *pbl = newbl; if (count > save->bl_blocks) count = save->bl_blocks; blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count); /* * If resizing upwards, should we free the new space or not? */ if (freenew && count < newbl->bl_blocks) { blist_free(newbl, count, newbl->bl_blocks - count); } blist_destroy(save); } #ifdef BLIST_DEBUG /* * blist_print() - dump radix tree */ void blist_print(blist_t bl) { printf("BLIST {\n"); blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4); printf("}\n"); } #endif /************************************************************************ * ALLOCATION SUPPORT FUNCTIONS * ************************************************************************ * * These support functions do all the actual work. They may seem * rather longish, but that's because I've commented them up. The * actual code is straight forward. * */ /* * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap). * * This is the core of the allocator and is optimized for the 1 block * and the BLIST_BMAP_RADIX block allocation cases. Other cases are * somewhat slower. The 1 block allocation case is log2 and extremely * quick. */ static blist_blkno_t blst_leaf_alloc( blmeta_t *scan, blist_blkno_t blk, int count ) { blist_bitmap_t orig = scan->u.bmu_bitmap; if (orig == 0) { /* * Optimize bitmap all-allocated case. Also, count = 1 * case assumes at least 1 bit is free in the bitmap, so * we have to take care of this case here. 
*/ scan->bm_bighint = 0; return(BLIST_NONE); } if (count == 1) { /* * Optimized code to allocate one bit out of the bitmap */ blist_bitmap_t mask; int j = BLIST_BMAP_RADIX/2; int r = 0; mask = (blist_bitmap_t)-1 >> (BLIST_BMAP_RADIX/2); while (j) { if ((orig & mask) == 0) { r += j; orig >>= j; } j >>= 1; mask >>= j; } scan->u.bmu_bitmap &= ~((blist_bitmap_t)1 << r); return(blk + r); } if (count <= BLIST_BMAP_RADIX) { /* * non-optimized code to allocate N bits out of the bitmap. * The more bits, the faster the code runs. It will run * the slowest allocating 2 bits, but since there aren't any * memory ops in the core loop (or shouldn't be, anyway), * you probably won't notice the difference. */ int j; int n = BLIST_BMAP_RADIX - count; blist_bitmap_t mask; mask = (blist_bitmap_t)-1 >> n; for (j = 0; j <= n; ++j) { if ((orig & mask) == mask) { scan->u.bmu_bitmap &= ~mask; return(blk + j); } mask = (mask << 1); } } /* * We couldn't allocate count in this subtree, update bighint. */ scan->bm_bighint = count - 1; return(BLIST_NONE); } /* * blist_meta_alloc() - allocate at a meta in the radix tree. * * Attempt to allocate at a meta node. If we can't, we update * bighint and return a failure. Updating bighint optimize future * calls that hit this node. We have to check for our collapse cases * and we have a few optimizations strewn in as well. */ static blist_blkno_t blst_meta_alloc( blmeta_t *scan, blist_blkno_t blk, blist_blkno_t count, blist_blkno_t radix, blist_blkno_t skip ) { blist_blkno_t i; blist_blkno_t next_skip = (skip / BLIST_META_RADIX); if (scan->u.bmu_avail == 0) { /* * ALL-ALLOCATED special case */ scan->bm_bighint = count; return(BLIST_NONE); } if (scan->u.bmu_avail == radix) { radix /= BLIST_META_RADIX; /* * ALL-FREE special case, initialize uninitialize * sublevel. */ for (i = 1; i <= skip; i += next_skip) { if (scan[i].bm_bighint == (blist_blkno_t)-1) break; if (next_skip == 1) { scan[i].u.bmu_bitmap = (blist_bitmap_t)-1; scan[i].bm_bighint = BLIST_BMAP_RADIX; } else { scan[i].bm_bighint = radix; scan[i].u.bmu_avail = radix; } } } else { radix /= BLIST_META_RADIX; } for (i = 1; i <= skip; i += next_skip) { if (scan[i].bm_bighint == (blist_blkno_t)-1) { /* * Terminator */ break; } else if (count <= scan[i].bm_bighint) { /* * count fits in object */ blist_blkno_t r; if (next_skip == 1) { r = blst_leaf_alloc(&scan[i], blk, count); } else { r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1); } if (r != BLIST_NONE) { scan->u.bmu_avail -= count; if (scan->bm_bighint > scan->u.bmu_avail) scan->bm_bighint = scan->u.bmu_avail; return(r); } } else if (count > radix) { /* * count does not fit in object even if it were * complete free. */ panic("blist_meta_alloc: allocation too large"); } blk += radix; } /* * We couldn't allocate count in this subtree, update bighint. */ if (scan->bm_bighint >= count) scan->bm_bighint = count - 1; return(BLIST_NONE); } /* * BLST_LEAF_FREE() - free allocated block from leaf bitmap * */ static void blst_leaf_free( blmeta_t *scan, blist_blkno_t blk, int count ) { /* * free some data in this bitmap * * e.g. * 0000111111111110000 * \_________/\__/ * v n */ int n = blk & (BLIST_BMAP_RADIX - 1); blist_bitmap_t mask; mask = ((blist_bitmap_t)-1 << n) & ((blist_bitmap_t)-1 >> (BLIST_BMAP_RADIX - count - n)); if (scan->u.bmu_bitmap & mask) panic("blst_radix_free: freeing free block"); scan->u.bmu_bitmap |= mask; /* * We could probably do a better job here. We are required to make * bighint at least as large as the biggest contiguous block of * data. 
If we just shoehorn it, a little extra overhead will * be incured on the next allocation (but only that one typically). */ scan->bm_bighint = BLIST_BMAP_RADIX; } /* * BLST_META_FREE() - free allocated blocks from radix tree meta info * * This support routine frees a range of blocks from the bitmap. * The range must be entirely enclosed by this radix node. If a * meta node, we break the range down recursively to free blocks * in subnodes (which means that this code can free an arbitrary * range whereas the allocation code cannot allocate an arbitrary * range). */ static void blst_meta_free( blmeta_t *scan, blist_blkno_t freeBlk, blist_blkno_t count, blist_blkno_t radix, blist_blkno_t skip, blist_blkno_t blk ) { blist_blkno_t i; blist_blkno_t next_skip = (skip / BLIST_META_RADIX); #if 0 printf("FREE (%" PRIx64 ",%" PRIu64 ") FROM (%" PRIx64 ",%" PRIu64 ")\n", (uint64_t)freeBlk, (uint64_t)count, (uint64_t)blk, (uint64_t)radix ); #endif if (scan->u.bmu_avail == 0) { /* * ALL-ALLOCATED special case, with possible * shortcut to ALL-FREE special case. */ scan->u.bmu_avail = count; scan->bm_bighint = count; if (count != radix) { for (i = 1; i <= skip; i += next_skip) { if (scan[i].bm_bighint == (blist_blkno_t)-1) break; scan[i].bm_bighint = 0; if (next_skip == 1) { scan[i].u.bmu_bitmap = 0; } else { scan[i].u.bmu_avail = 0; } } /* fall through */ } } else { scan->u.bmu_avail += count; /* scan->bm_bighint = radix; */ } /* * ALL-FREE special case. */ if (scan->u.bmu_avail == radix) return; if (scan->u.bmu_avail > radix) panic("blst_meta_free: freeing already free blocks (%" PRIu64 ") %" PRIu64 "/%" PRIu64, (uint64_t)count, (uint64_t)scan->u.bmu_avail, (uint64_t)radix); /* * Break the free down into its components */ radix /= BLIST_META_RADIX; i = (freeBlk - blk) / radix; blk += i * radix; i = i * next_skip + 1; while (i <= skip && blk < freeBlk + count) { blist_blkno_t v; v = blk + radix - freeBlk; if (v > count) v = count; if (scan->bm_bighint == (blist_blkno_t)-1) panic("blst_meta_free: freeing unexpected range"); if (next_skip == 1) { blst_leaf_free(&scan[i], freeBlk, v); } else { blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk); } if (scan->bm_bighint < scan[i].bm_bighint) scan->bm_bighint = scan[i].bm_bighint; count -= v; freeBlk += v; blk += radix; i += next_skip; } } /* * BLIST_RADIX_COPY() - copy one radix tree to another * * Locates free space in the source tree and frees it in the destination * tree. The space may not already be free in the destination. 
*/ static void blst_copy( blmeta_t *scan, blist_blkno_t blk, blist_blkno_t radix, blist_blkno_t skip, blist_t dest, blist_blkno_t count ) { blist_blkno_t next_skip; blist_blkno_t i; /* * Leaf node */ if (radix == BLIST_BMAP_RADIX) { blist_bitmap_t v = scan->u.bmu_bitmap; if (v == (blist_bitmap_t)-1) { blist_free(dest, blk, count); } else if (v != 0) { int j; for (j = 0; j < BLIST_BMAP_RADIX && j < count; ++j) { if (v & (1 << j)) blist_free(dest, blk + j, 1); } } return; } /* * Meta node */ if (scan->u.bmu_avail == 0) { /* * Source all allocated, leave dest allocated */ return; } if (scan->u.bmu_avail == radix) { /* * Source all free, free entire dest */ if (count < radix) blist_free(dest, blk, count); else blist_free(dest, blk, radix); return; } radix /= BLIST_META_RADIX; next_skip = (skip / BLIST_META_RADIX); for (i = 1; count && i <= skip; i += next_skip) { if (scan[i].bm_bighint == (blist_blkno_t)-1) break; if (count >= radix) { blst_copy( &scan[i], blk, radix, next_skip - 1, dest, radix ); count -= radix; } else { if (count) { blst_copy( &scan[i], blk, radix, next_skip - 1, dest, count ); } count = 0; } blk += radix; } } /* * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap * * This routine allocates all blocks in the specified range * regardless of any existing allocations in that range. Returns * the number of blocks allocated by the call. */ static int blst_leaf_fill(blmeta_t *scan, blist_blkno_t blk, int count) { int n = blk & (BLIST_BMAP_RADIX - 1); int nblks; blist_bitmap_t mask, bitmap; mask = ((blist_bitmap_t)-1 << n) & ((blist_bitmap_t)-1 >> (BLIST_BMAP_RADIX - count - n)); /* Count the number of blocks we're about to allocate */ bitmap = scan->u.bmu_bitmap & mask; for (nblks = 0; bitmap != 0; nblks++) bitmap &= bitmap - 1; scan->u.bmu_bitmap &= ~mask; return nblks; } /* * BLIST_META_FILL() - allocate specific blocks at a meta node * * This routine allocates the specified range of blocks, * regardless of any existing allocations in the range. The * range must be within the extent of this node. Returns the * number of blocks allocated by the call. 
*/ static blist_blkno_t blst_meta_fill( blmeta_t *scan, blist_blkno_t allocBlk, blist_blkno_t count, blist_blkno_t radix, blist_blkno_t skip, blist_blkno_t blk ) { blist_blkno_t i; blist_blkno_t next_skip = (skip / BLIST_META_RADIX); blist_blkno_t nblks = 0; if (count == radix || scan->u.bmu_avail == 0) { /* * ALL-ALLOCATED special case */ nblks = scan->u.bmu_avail; scan->u.bmu_avail = 0; scan->bm_bighint = count; return nblks; } if (count > radix) panic("blist_meta_fill: allocation too large"); if (scan->u.bmu_avail == radix) { radix /= BLIST_META_RADIX; /* * ALL-FREE special case, initialize sublevel */ for (i = 1; i <= skip; i += next_skip) { if (scan[i].bm_bighint == (blist_blkno_t)-1) break; if (next_skip == 1) { scan[i].u.bmu_bitmap = (blist_bitmap_t)-1; scan[i].bm_bighint = BLIST_BMAP_RADIX; } else { scan[i].bm_bighint = radix; scan[i].u.bmu_avail = radix; } } } else { radix /= BLIST_META_RADIX; } i = (allocBlk - blk) / radix; blk += i * radix; i = i * next_skip + 1; while (i <= skip && blk < allocBlk + count) { blist_blkno_t v; v = blk + radix - allocBlk; if (v > count) v = count; if (scan->bm_bighint == (blist_blkno_t)-1) panic("blst_meta_fill: filling unexpected range"); if (next_skip == 1) { nblks += blst_leaf_fill(&scan[i], allocBlk, v); } else { nblks += blst_meta_fill(&scan[i], allocBlk, v, radix, next_skip - 1, blk); } count -= v; allocBlk += v; blk += radix; i += next_skip; } scan->u.bmu_avail -= nblks; return nblks; } /* * BLST_RADIX_INIT() - initialize radix tree * * Initialize our meta structures and bitmaps and calculate the exact * amount of space required to manage 'count' blocks - this space may * be considerably less than the calculated radix due to the large * RADIX values we use. */ static blist_blkno_t blst_radix_init(blmeta_t *scan, blist_blkno_t radix, blist_blkno_t skip, blist_blkno_t count) { blist_blkno_t i; blist_blkno_t next_skip; blist_blkno_t memindex = 0; /* * Leaf node */ if (radix == BLIST_BMAP_RADIX) { if (scan) { scan->bm_bighint = 0; scan->u.bmu_bitmap = 0; } return(memindex); } /* * Meta node. If allocating the entire object we can special * case it. However, we need to figure out how much memory * is required to manage 'count' blocks, so we continue on anyway. */ if (scan) { scan->bm_bighint = 0; scan->u.bmu_avail = 0; } radix /= BLIST_META_RADIX; next_skip = (skip / BLIST_META_RADIX); for (i = 1; i <= skip; i += next_skip) { if (count >= radix) { /* * Allocate the entire object */ memindex = i + blst_radix_init( ((scan) ? &scan[i] : NULL), radix, next_skip - 1, radix ); count -= radix; } else if (count > 0) { /* * Allocate a partial object */ memindex = i + blst_radix_init( ((scan) ? 
&scan[i] : NULL), radix, next_skip - 1, count ); count = 0; } else { /* * Add terminator and break out */ if (scan) scan[i].bm_bighint = (blist_blkno_t)-1; break; } } if (memindex < i) memindex = i; return(memindex); } #ifdef BLIST_DEBUG static void blst_radix_print(blmeta_t *scan, blist_blkno_t blk, blist_blkno_t radix, blist_blkno_t skip, int tab) { blist_blkno_t i; blist_blkno_t next_skip; int lastState = 0; if (radix == BLIST_BMAP_RADIX) { printf( "%*.*s(%0*" PRIx64 ",%" PRIu64 "): bitmap %0*" PRIx64 " big=%" PRIu64 "\n", tab, tab, "", sizeof(blk) * 2, (uint64_t)blk, (uint64_t)radix, sizeof(scan->u.bmu_bitmap) * 2, (uint64_t)scan->u.bmu_bitmap, (uint64_t)scan->bm_bighint ); return; } if (scan->u.bmu_avail == 0) { printf( "%*.*s(%0*" PRIx64 ",%" PRIu64") ALL ALLOCATED\n", tab, tab, "", sizeof(blk) * 2, (uint64_t)blk, (uint64_t)radix ); return; } if (scan->u.bmu_avail == radix) { printf( "%*.*s(%0*" PRIx64 ",%" PRIu64 ") ALL FREE\n", tab, tab, "", sizeof(blk) * 2, (uint64_t)blk, (uint64_t)radix ); return; } printf( "%*.*s(%0*" PRIx64 ",%" PRIu64 "): subtree (%" PRIu64 "/%" PRIu64 ") big=%" PRIu64 " {\n", tab, tab, "", sizeof(blk) * 2, (uint64_t)blk, (uint64_t)radix, (uint64_t)scan->u.bmu_avail, (uint64_t)radix, (uint64_t)scan->bm_bighint ); radix /= BLIST_META_RADIX; next_skip = (skip / BLIST_META_RADIX); tab += 4; for (i = 1; i <= skip; i += next_skip) { if (scan[i].bm_bighint == (blist_blkno_t)-1) { printf( "%*.*s(%0*" PRIx64 ",%" PRIu64 "): Terminator\n", tab, tab, "", sizeof(blk) * 2, (uint64_t)blk, (uint64_t)radix ); lastState = 0; break; } blst_radix_print( &scan[i], blk, radix, next_skip - 1, tab ); blk += radix; } tab -= 4; printf( "%*.*s}\n", tab, tab, "" ); } #endif #ifdef BLIST_DEBUG int main(int ac, char **av) { blist_blkno_t size = 1024; int i; blist_t bl; for (i = 1; i < ac; ++i) { const char *ptr = av[i]; if (*ptr != '-') { size = strtol(ptr, NULL, 0); continue; } ptr += 2; fprintf(stderr, "Bad option: %s\n", ptr - 2); exit(1); } bl = blist_create(size); blist_free(bl, 0, size); for (;;) { char buf[1024]; uint64_t da = 0; uint64_t count = 0; printf("%" PRIu64 "/%" PRIu64 "/%" PRIu64 "> ", (uint64_t)bl->bl_free, (uint64_t)size, (uint64_t)bl->bl_radix); fflush(stdout); if (fgets(buf, sizeof(buf), stdin) == NULL) break; switch(buf[0]) { case 'r': if (sscanf(buf + 1, "%" SCNu64, &count) == 1) { blist_resize(&bl, count, 1); } else { printf("?\n"); } case 'p': blist_print(bl); break; case 'a': if (sscanf(buf + 1, "%" SCNu64, &count) == 1) { blist_blkno_t blk = blist_alloc(bl, count); printf(" R=%0*" PRIx64 "\n", sizeof(blk) * 2, (uint64_t)blk); } else { printf("?\n"); } break; case 'f': if (sscanf(buf + 1, "%" SCNx64 " %" SCNu64, &da, &count) == 2) { blist_free(bl, da, count); } else { printf("?\n"); } break; case 'l': if (sscanf(buf + 1, "%" SCNx64 " %" SCNu64, &da, &count) == 2) { printf(" n=%" PRIu64 "\n", (uint64_t)blist_fill(bl, da, count)); } else { printf("?\n"); } break; case '?': case 'h': puts( "p -print\n" "a %d -allocate\n" "f %x %d -free\n" "l %x %d -fill\n" "r %d -resize\n" "h/? -help" ); break; default: printf("?\n"); break; } } return(0); } void panic(const char *ctl, ...) { va_list va; va_start(va, ctl); vfprintf(stderr, ctl, va); fprintf(stderr, "\n"); va_end(va); exit(1); } #endif
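/*
 * Illustrative sketch (not part of the implementation): minimal
 * programmatic use of the blist allocator, following the same call
 * sequence that the BLIST_DEBUG main() above exercises interactively.
 * Only routines that appear in this file are used; the block count is
 * an arbitrary example and error handling of a failed allocation is
 * omitted for brevity.
 */
static void
blist_usage_sketch(void)
{
	blist_t bl;
	blist_blkno_t blk;

	bl = blist_create(1024);	/* radix tree managing 1024 blocks */
	blist_free(bl, 0, 1024);	/* a new blist starts fully allocated */
	blk = blist_alloc(bl, 16);	/* allocate 16 contiguous blocks */
	blist_fill(bl, 64, 8);		/* force-allocate blocks 64..71 */
	blist_free(bl, blk, 16);	/* release the first allocation */
}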
/*	$NetBSD: strnlen.c,v 1.2 2014/01/09 11:25:11 apb Exp $	*/

/*-
 * Copyright (c) 2009 David Schultz <das@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif

#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
__RCSID("$NetBSD: strnlen.c,v 1.2 2014/01/09 11:25:11 apb Exp $");
#endif /* LIBC_SCCS and not lint */

/* FreeBSD: src/lib/libc/string/strnlen.c,v 1.1 2009/02/28 06:00:58 das Exp */

#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <string.h>
#else
#include <lib/libkern/libkern.h>
#endif

#if !HAVE_STRNLEN
size_t
strnlen(const char *s, size_t maxlen)
{
	size_t len;

	for (len = 0; len < maxlen; len++, s++) {
		if (!*s)
			break;
	}
	return (len);
}
#endif /* !HAVE_STRNLEN */
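/*
 * Illustrative sketch (not part of libc): strnlen() as implemented above
 * inspects at most 'maxlen' bytes, so it is safe to use on buffers that
 * are not guaranteed to be NUL-terminated.  The buffer contents and the
 * function name below are example values only.
 */
#include <stdio.h>
#include <string.h>

int
strnlen_example(void)
{
	char buf[8] = { 'N', 'e', 't', 'B', 'S', 'D', '!', '!' };	/* no NUL */

	/* Bounded scan: prints 8 (the cap) instead of reading past the end. */
	printf("%zu\n", strnlen(buf, sizeof(buf)));
	return 0;
}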
/*	$NetBSD: pci_machdep.c,v 1.98 2023/11/21 23:22:23 gutteridge Exp $	*/

/*-
 * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1996 Christopher G. Demetriou. All rights reserved. * Copyright (c) 1994 Charles M. Hannum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Charles M. Hannum. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Machine-specific functions for PCI autoconfiguration. * * On PCs, there are two methods of generating PCI configuration cycles. * We try to detect the appropriate mechanism for this machine and set * up a few function pointers to access the correct method directly. * * The configuration method can be hard-coded in the config file by * using `options PCI_CONF_MODE=N', where `N' is the configuration mode * as defined in section 3.6.4.1, `Generating Configuration Cycles'. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: pci_machdep.c,v 1.98 2023/11/21 23:22:23 gutteridge Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> #include <sys/systm.h> #include <sys/errno.h> #include <sys/device.h> #include <sys/bus.h> #include <sys/cpu.h> #include <sys/kmem.h> #include <uvm/uvm_extern.h> #include <machine/bus_private.h> #include <machine/pio.h> #include <machine/lock.h> #include <dev/isa/isareg.h> #include <dev/isa/isavar.h> #include <dev/pci/pcivar.h> #include <dev/pci/pcireg.h> #include <dev/pci/pccbbreg.h> #include <dev/pci/pcidevs.h> #include <dev/pci/ppbvar.h> #include <dev/pci/genfb_pcivar.h> #include <dev/wsfb/genfbvar.h> #include <arch/x86/include/genfb_machdep.h> #include <arch/xen/include/hypervisor.h> #include <arch/xen/include/xen.h> #include <dev/ic/vgareg.h> #include "acpica.h" #include "genfb.h" #include "isa.h" #include "opt_acpi.h" #include "opt_ddb.h" #include "opt_mpbios.h" #include "opt_puc.h" #include "opt_vga.h" #include "pci.h" #include "wsdisplay.h" #include "com.h" #include "opt_xen.h" #ifdef DDB #include <machine/db_machdep.h> #include <ddb/db_sym.h> #include <ddb/db_extern.h> #endif #ifdef VGA_POST #include <x86/vga_post.h> #endif #include <x86/cpuvar.h> #include <machine/autoconf.h> #include <machine/bootinfo.h> #ifdef MPBIOS #include <machine/mpbiosvar.h> #endif #if NACPICA > 0 #include <machine/mpacpi.h> #if !defined(NO_PCI_EXTENDED_CONFIG) #include <dev/acpi/acpivar.h> #include <dev/acpi/acpi_mcfg.h> #endif #endif #include <machine/mpconfig.h> #if NCOM > 0 #include <dev/pci/puccn.h> #endif #ifndef XENPV #include <x86/efi.h> #endif #include "opt_pci_conf_mode.h" #ifdef PCI_CONF_MODE #if (PCI_CONF_MODE == 1) || (PCI_CONF_MODE == 2) static int pci_mode = PCI_CONF_MODE; #else #error Invalid PCI configuration mode. #endif #else static int pci_mode = -1; #endif struct pci_conf_lock { uint32_t cl_cpuno; /* 0: unlocked * 1 + n: locked by CPU n (0 <= n) */ uint32_t cl_sel; /* the address that's being read. 
*/ }; static void pci_conf_unlock(struct pci_conf_lock *); static uint32_t pci_conf_selector(pcitag_t, int); static unsigned int pci_conf_port(pcitag_t, int); static void pci_conf_select(uint32_t); static void pci_conf_lock(struct pci_conf_lock *, uint32_t); static void pci_bridge_hook(pci_chipset_tag_t, pcitag_t, void *); struct pci_bridge_hook_arg { void (*func)(pci_chipset_tag_t, pcitag_t, void *); void *arg; }; #define PCI_MODE1_ENABLE 0x80000000UL #define PCI_MODE1_ADDRESS_REG 0x0cf8 #define PCI_MODE1_DATA_REG 0x0cfc #define PCI_MODE2_ENABLE_REG 0x0cf8 #define PCI_MODE2_FORWARD_REG 0x0cfa #define _tag(b, d, f) \ {.mode1 = PCI_MODE1_ENABLE | ((b) << 16) | ((d) << 11) | ((f) << 8)} #define _qe(bus, dev, fcn, vend, prod) \ {_tag(bus, dev, fcn), PCI_ID_CODE(vend, prod)} const struct { pcitag_t tag; pcireg_t id; } pcim1_quirk_tbl[] = { _qe(0, 0, 0, PCI_VENDOR_INVALID, 0x0000), /* patchable */ _qe(0, 0, 0, PCI_VENDOR_COMPAQ, PCI_PRODUCT_COMPAQ_TRIFLEX1), /* XXX Triflex2 not tested */ _qe(0, 0, 0, PCI_VENDOR_COMPAQ, PCI_PRODUCT_COMPAQ_TRIFLEX2), _qe(0, 0, 0, PCI_VENDOR_COMPAQ, PCI_PRODUCT_COMPAQ_TRIFLEX4), #if 0 /* Triton needed for Connectix Virtual PC */ _qe(0, 0, 0, PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82437FX), /* Connectix Virtual PC 5 has a 440BX */ _qe(0, 0, 0, PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82443BX_NOAGP), /* Parallels Desktop for Mac */ _qe(0, 2, 0, PCI_VENDOR_PARALLELS, PCI_PRODUCT_PARALLELS_VIDEO), _qe(0, 3, 0, PCI_VENDOR_PARALLELS, PCI_PRODUCT_PARALLELS_TOOLS), /* SIS 740 */ _qe(0, 0, 0, PCI_VENDOR_SIS, PCI_PRODUCT_SIS_740), /* SIS 741 */ _qe(0, 0, 0, PCI_VENDOR_SIS, PCI_PRODUCT_SIS_741), /* VIA Technologies VX900 */ _qe(0, 0, 0, PCI_VENDOR_VIATECH, PCI_PRODUCT_VIATECH_VX900_HB) #endif }; #undef _tag #undef _qe /* arch/xen does not support MSI/MSI-X yet. 
*/ #ifdef __HAVE_PCI_MSI_MSIX #define PCI_QUIRK_DISABLE_MSI 1 /* Neither MSI nor MSI-X work */ #define PCI_QUIRK_DISABLE_MSIX 2 /* MSI-X does not work */ #define PCI_QUIRK_ENABLE_MSI_VM 3 /* Older chipset in VM where MSI and MSI-X works */ #define _dme(vend, prod) \ { PCI_QUIRK_DISABLE_MSI, PCI_ID_CODE(vend, prod) } #define _dmxe(vend, prod) \ { PCI_QUIRK_DISABLE_MSIX, PCI_ID_CODE(vend, prod) } #define _emve(vend, prod) \ { PCI_QUIRK_ENABLE_MSI_VM, PCI_ID_CODE(vend, prod) } const struct { int type; pcireg_t id; } pci_msi_quirk_tbl[] = { _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_PCMC), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82437FX), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82437MX), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82437VX), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82439HX), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82439TX), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82443GX), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82443GX_AGP), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82440MX), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82441FX), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82443BX), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82443BX_AGP), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82443BX_NOAGP), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82443GX_NOAGP), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82443LX), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82443LX_AGP), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82810_MCH), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82810E_MCH), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82815_FULL_HUB), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82820_MCH), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82830MP_IO_1), _dme(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82840_HB), _dme(PCI_VENDOR_NVIDIA, PCI_PRODUCT_NVIDIA_NFORCE_PCHB), _dme(PCI_VENDOR_NVIDIA, PCI_PRODUCT_NVIDIA_NFORCE2_PCHB), _dme(PCI_VENDOR_AMD, PCI_PRODUCT_AMD_SC751_SC), _dme(PCI_VENDOR_AMD, PCI_PRODUCT_AMD_SC761_SC), _dme(PCI_VENDOR_AMD, PCI_PRODUCT_AMD_SC762_NB), _emve(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82441FX), /* QEMU */ _emve(PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82443BX), /* VMWare */ }; #undef _dme #undef _dmxe #undef _emve #endif /* __HAVE_PCI_MSI_MSIX */ /* * PCI doesn't have any special needs; just use the generic versions * of these functions. */ struct x86_bus_dma_tag pci_bus_dma_tag = { ._tag_needs_free = 0, #if defined(_LP64) || defined(PAE) ._bounce_thresh = PCI32_DMA_BOUNCE_THRESHOLD, ._bounce_alloc_lo = ISA_DMA_BOUNCE_THRESHOLD, ._bounce_alloc_hi = PCI32_DMA_BOUNCE_THRESHOLD, #else ._bounce_thresh = 0, ._bounce_alloc_lo = 0, ._bounce_alloc_hi = 0, #endif ._may_bounce = NULL, }; #ifdef _LP64 struct x86_bus_dma_tag pci_bus_dma64_tag = { ._tag_needs_free = 0, ._bounce_thresh = 0, ._bounce_alloc_lo = 0, ._bounce_alloc_hi = 0, ._may_bounce = NULL, }; #endif static struct pci_conf_lock cl0 = { .cl_cpuno = 0UL , .cl_sel = 0UL }; static struct pci_conf_lock * const cl = &cl0; static struct genfb_colormap_callback gfb_cb; static struct genfb_pmf_callback pmf_cb; static struct genfb_mode_callback mode_cb; #ifdef VGA_POST static struct vga_post *vga_posth = NULL; #endif static void pci_conf_lock(struct pci_conf_lock *ocl, uint32_t sel) { uint32_t cpuno; KASSERT(sel != 0); kpreempt_disable(); cpuno = cpu_number() + 1; /* If the kernel enters pci_conf_lock() through an interrupt * handler, then the CPU may already hold the lock. * * If the CPU does not already hold the lock, spin until * we can acquire it. 
*/ if (cpuno == cl->cl_cpuno) { ocl->cl_cpuno = cpuno; } else { #ifdef LOCKDEBUG u_int spins = 0; #endif u_int count; count = SPINLOCK_BACKOFF_MIN; ocl->cl_cpuno = 0; while (atomic_cas_32(&cl->cl_cpuno, 0, cpuno) != 0) { SPINLOCK_BACKOFF(count); #ifdef LOCKDEBUG if (SPINLOCK_SPINOUT(spins)) { panic("%s: cpu %" PRId32 " spun out waiting for cpu %" PRId32, __func__, cpuno, cl->cl_cpuno); } #endif } } /* Only one CPU can be here, so an interlocked atomic_swap(3) * is not necessary. * * Evaluating atomic_cas_32_ni()'s argument, cl->cl_sel, * and applying atomic_cas_32_ni() is not an atomic operation, * however, any interrupt that, in the middle of the * operation, modifies cl->cl_sel, will also restore * cl->cl_sel. So cl->cl_sel will have the same value when * we apply atomic_cas_32_ni() as when we evaluated it, * before. */ ocl->cl_sel = atomic_cas_32_ni(&cl->cl_sel, cl->cl_sel, sel); pci_conf_select(sel); } static void pci_conf_unlock(struct pci_conf_lock *ocl) { atomic_cas_32_ni(&cl->cl_sel, cl->cl_sel, ocl->cl_sel); pci_conf_select(ocl->cl_sel); if (ocl->cl_cpuno != cl->cl_cpuno) atomic_cas_32(&cl->cl_cpuno, cl->cl_cpuno, ocl->cl_cpuno); kpreempt_enable(); } static uint32_t pci_conf_selector(pcitag_t tag, int reg) { static const pcitag_t mode2_mask = { .mode2 = { .enable = 0xff , .forward = 0xff } }; switch (pci_mode) { case 1: return tag.mode1 | reg; case 2: return tag.mode1 & mode2_mask.mode1; default: panic("%s: mode %d not configured", __func__, pci_mode); } } static unsigned int pci_conf_port(pcitag_t tag, int reg) { switch (pci_mode) { case 1: return PCI_MODE1_DATA_REG; case 2: return tag.mode2.port | reg; default: panic("%s: mode %d not configured", __func__, pci_mode); } } static void pci_conf_select(uint32_t sel) { pcitag_t tag; switch (pci_mode) { case 1: outl(PCI_MODE1_ADDRESS_REG, sel); return; case 2: tag.mode1 = sel; outb(PCI_MODE2_ENABLE_REG, tag.mode2.enable); if (tag.mode2.enable != 0) outb(PCI_MODE2_FORWARD_REG, tag.mode2.forward); return; default: panic("%s: mode %d not configured", __func__, pci_mode); } } static int pci_mode_check(void) { pcireg_t x; pcitag_t t; int device; const int maxdev = pci_bus_maxdevs(NULL, 0); for (device = 0; device < maxdev; device++) { t = pci_make_tag(NULL, 0, device, 0); x = pci_conf_read(NULL, t, PCI_CLASS_REG); if (PCI_CLASS(x) == PCI_CLASS_BRIDGE && PCI_SUBCLASS(x) == PCI_SUBCLASS_BRIDGE_HOST) return 0; x = pci_conf_read(NULL, t, PCI_ID_REG); switch (PCI_VENDOR(x)) { case PCI_VENDOR_COMPAQ: case PCI_VENDOR_INTEL: case PCI_VENDOR_VIATECH: return 0; } } return -1; } #ifdef __HAVE_PCI_MSI_MSIX static int pci_has_msi_quirk(pcireg_t id, int type) { int i; for (i = 0; i < __arraycount(pci_msi_quirk_tbl); i++) { if (id == pci_msi_quirk_tbl[i].id && type == pci_msi_quirk_tbl[i].type) return 1; } return 0; } #endif void pci_attach_hook(device_t parent, device_t self, struct pcibus_attach_args *pba) { #ifdef __HAVE_PCI_MSI_MSIX pci_chipset_tag_t pc = pba->pba_pc; pcitag_t tag; pcireg_t id, class; int i; bool havehb = false; #endif if (pba->pba_bus == 0) aprint_normal(": configuration mode %d", pci_mode); #ifdef MPBIOS mpbios_pci_attach_hook(parent, self, pba); #endif #if NACPICA > 0 mpacpi_pci_attach_hook(parent, self, pba); #endif #if NACPICA > 0 && !defined(NO_PCI_EXTENDED_CONFIG) acpimcfg_map_bus(self, pba->pba_pc, pba->pba_bus); #endif #ifdef __HAVE_PCI_MSI_MSIX /* * In order to decide whether the system supports MSI we look * at the host bridge, which should be device 0 on bus 0. 
* It is better to not enable MSI on systems that * support it than the other way around, so be conservative * here. So we don't enable MSI if we don't find a host * bridge there. We also deliberately don't enable MSI on * chipsets from low-end manufacturers like VIA and SiS. */ for (i = 0; i <= 7; i++) { tag = pci_make_tag(pc, 0, 0, i); id = pci_conf_read(pc, tag, PCI_ID_REG); class = pci_conf_read(pc, tag, PCI_CLASS_REG); if (PCI_CLASS(class) == PCI_CLASS_BRIDGE && PCI_SUBCLASS(class) == PCI_SUBCLASS_BRIDGE_HOST) { havehb = true; break; } } if (havehb == false) return; /* VMware and KVM use old chipset, but they can use MSI/MSI-X */ if ((cpu_feature[1] & CPUID2_RAZ) && (pci_has_msi_quirk(id, PCI_QUIRK_ENABLE_MSI_VM))) { pba->pba_flags |= PCI_FLAGS_MSI_OKAY; pba->pba_flags |= PCI_FLAGS_MSIX_OKAY; } else if (pci_has_msi_quirk(id, PCI_QUIRK_DISABLE_MSI)) { pba->pba_flags &= ~PCI_FLAGS_MSI_OKAY; pba->pba_flags &= ~PCI_FLAGS_MSIX_OKAY; aprint_verbose("\n"); aprint_verbose_dev(self, "This pci host supports neither MSI nor MSI-X."); } else if (pci_has_msi_quirk(id, PCI_QUIRK_DISABLE_MSIX)) { pba->pba_flags |= PCI_FLAGS_MSI_OKAY; pba->pba_flags &= ~PCI_FLAGS_MSIX_OKAY; aprint_verbose("\n"); aprint_verbose_dev(self, "This pci host does not support MSI-X."); #if NACPICA > 0 } else if (acpi_active && AcpiGbl_FADT.Header.Revision >= 4 && (AcpiGbl_FADT.BootFlags & ACPI_FADT_NO_MSI) != 0) { pba->pba_flags &= ~PCI_FLAGS_MSI_OKAY; pba->pba_flags &= ~PCI_FLAGS_MSIX_OKAY; aprint_verbose("\n"); aprint_verbose_dev(self, "MSI support disabled via ACPI IAPC_BOOT_ARCH flag.\n"); #endif } else { pba->pba_flags |= PCI_FLAGS_MSI_OKAY; pba->pba_flags |= PCI_FLAGS_MSIX_OKAY; } /* * Don't enable MSI on a HyperTransport bus. In order to * determine that bus 0 is a HyperTransport bus, we look at * device 24 function 0, which is the HyperTransport * host/primary interface integrated on most 64-bit AMD CPUs. * If that device has a HyperTransport capability, bus 0 must * be a HyperTransport bus and we disable MSI. */ if (24 < pci_bus_maxdevs(pc, 0)) { tag = pci_make_tag(pc, 0, 24, 0); if (pci_get_capability(pc, tag, PCI_CAP_LDT, NULL, NULL)) { pba->pba_flags &= ~PCI_FLAGS_MSI_OKAY; pba->pba_flags &= ~PCI_FLAGS_MSIX_OKAY; } } #endif /* __HAVE_PCI_MSI_MSIX */ } int pci_bus_maxdevs(pci_chipset_tag_t pc, int busno) { /* * Bus number is irrelevant. If Configuration Mechanism 2 is in * use, can only have devices 0-15 on any bus. If Configuration * Mechanism 1 is in use, can have devices 0-32 (i.e. the `normal' * range). 
*/ if (pci_mode == 2) return (16); else return (32); } pcitag_t pci_make_tag(pci_chipset_tag_t pc, int bus, int device, int function) { pci_chipset_tag_t ipc; pcitag_t tag; for (ipc = pc; ipc != NULL; ipc = ipc->pc_super) { if ((ipc->pc_present & PCI_OVERRIDE_MAKE_TAG) == 0) continue; return (*ipc->pc_ov->ov_make_tag)(ipc->pc_ctx, pc, bus, device, function); } switch (pci_mode) { case 1: if (bus >= 256 || device >= 32 || function >= 8) panic("%s: bad request(%d, %d, %d)", __func__, bus, device, function); tag.mode1 = PCI_MODE1_ENABLE | (bus << 16) | (device << 11) | (function << 8); return tag; case 2: if (bus >= 256 || device >= 16 || function >= 8) panic("%s: bad request(%d, %d, %d)", __func__, bus, device, function); tag.mode2.port = 0xc000 | (device << 8); tag.mode2.enable = 0xf0 | (function << 1); tag.mode2.forward = bus; return tag; default: panic("%s: mode %d not configured", __func__, pci_mode); } } void pci_decompose_tag(pci_chipset_tag_t pc, pcitag_t tag, int *bp, int *dp, int *fp) { pci_chipset_tag_t ipc; for (ipc = pc; ipc != NULL; ipc = ipc->pc_super) { if ((ipc->pc_present & PCI_OVERRIDE_DECOMPOSE_TAG) == 0) continue; (*ipc->pc_ov->ov_decompose_tag)(ipc->pc_ctx, pc, tag, bp, dp, fp); return; } switch (pci_mode) { case 1: if (bp != NULL) *bp = (tag.mode1 >> 16) & 0xff; if (dp != NULL) *dp = (tag.mode1 >> 11) & 0x1f; if (fp != NULL) *fp = (tag.mode1 >> 8) & 0x7; return; case 2: if (bp != NULL) *bp = tag.mode2.forward & 0xff; if (dp != NULL) *dp = (tag.mode2.port >> 8) & 0xf; if (fp != NULL) *fp = (tag.mode2.enable >> 1) & 0x7; return; default: panic("%s: mode %d not configured", __func__, pci_mode); } } pcireg_t pci_conf_read(pci_chipset_tag_t pc, pcitag_t tag, int reg) { pci_chipset_tag_t ipc; pcireg_t data; struct pci_conf_lock ocl; int dev; KASSERT((reg & 0x3) == 0); for (ipc = pc; ipc != NULL; ipc = ipc->pc_super) { if ((ipc->pc_present & PCI_OVERRIDE_CONF_READ) == 0) continue; return (*ipc->pc_ov->ov_conf_read)(ipc->pc_ctx, pc, tag, reg); } pci_decompose_tag(pc, tag, NULL, &dev, NULL); if (__predict_false(pci_mode == 2 && dev >= 16)) return (pcireg_t) -1; if (reg < 0) return (pcireg_t) -1; if (reg >= PCI_CONF_SIZE) { #if NACPICA > 0 && !defined(NO_PCI_EXTENDED_CONFIG) if (reg >= PCI_EXTCONF_SIZE) return (pcireg_t) -1; acpimcfg_conf_read(pc, tag, reg, &data); return data; #else return (pcireg_t) -1; #endif } pci_conf_lock(&ocl, pci_conf_selector(tag, reg)); data = inl(pci_conf_port(tag, reg)); pci_conf_unlock(&ocl); return data; } void pci_conf_write(pci_chipset_tag_t pc, pcitag_t tag, int reg, pcireg_t data) { pci_chipset_tag_t ipc; struct pci_conf_lock ocl; int dev; KASSERT((reg & 0x3) == 0); for (ipc = pc; ipc != NULL; ipc = ipc->pc_super) { if ((ipc->pc_present & PCI_OVERRIDE_CONF_WRITE) == 0) continue; (*ipc->pc_ov->ov_conf_write)(ipc->pc_ctx, pc, tag, reg, data); return; } pci_decompose_tag(pc, tag, NULL, &dev, NULL); if (__predict_false(pci_mode == 2 && dev >= 16)) { return; } if (reg < 0) return; if (reg >= PCI_CONF_SIZE) { #if NACPICA > 0 && !defined(NO_PCI_EXTENDED_CONFIG) if (reg >= PCI_EXTCONF_SIZE) return; acpimcfg_conf_write(pc, tag, reg, data); #endif return; } pci_conf_lock(&ocl, pci_conf_selector(tag, reg)); outl(pci_conf_port(tag, reg), data); pci_conf_unlock(&ocl); } #ifdef XENPV void pci_conf_write16(pci_chipset_tag_t pc, pcitag_t tag, int reg, uint16_t data) { pci_chipset_tag_t ipc; struct pci_conf_lock ocl; int dev; KASSERT((reg & 0x1) == 0); for (ipc = pc; ipc != NULL; ipc = ipc->pc_super) { if ((ipc->pc_present & PCI_OVERRIDE_CONF_WRITE) == 0) 
continue; panic("pci_conf_write16 and override"); } pci_decompose_tag(pc, tag, NULL, &dev, NULL); if (__predict_false(pci_mode == 2 && dev >= 16)) { return; } if (reg < 0) return; if (reg >= PCI_CONF_SIZE) { #if NACPICA > 0 && !defined(NO_PCI_EXTENDED_CONFIG) if (reg >= PCI_EXTCONF_SIZE) return; panic("pci_conf_write16 and reg >= PCI_CONF_SIZE"); #endif return; } pci_conf_lock(&ocl, pci_conf_selector(tag, reg & ~0x3)); outl(pci_conf_port(tag, reg & ~0x3) + (reg & 0x3), data); pci_conf_unlock(&ocl); } #endif /* XENPV */ void pci_mode_set(int mode) { KASSERT(pci_mode == -1 || pci_mode == mode); pci_mode = mode; } int pci_mode_detect(void) { uint32_t sav, val; int i; pcireg_t idreg; if (pci_mode != -1) return pci_mode; /* * We try to divine which configuration mode the host bridge wants. */ sav = inl(PCI_MODE1_ADDRESS_REG); pci_mode = 1; /* assume this for now */ /* * catch some known buggy implementations of mode 1 */ for (i = 0; i < __arraycount(pcim1_quirk_tbl); i++) { pcitag_t t; if (PCI_VENDOR(pcim1_quirk_tbl[i].id) == PCI_VENDOR_INVALID) continue; t.mode1 = pcim1_quirk_tbl[i].tag.mode1; idreg = pci_conf_read(NULL, t, PCI_ID_REG); /* needs "pci_mode" */ if (idreg == pcim1_quirk_tbl[i].id) { #ifdef DEBUG printf("%s: known mode 1 PCI chipset (%08x)\n", __func__, idreg); #endif return (pci_mode); } } #if 0 extern char cpu_brand_string[]; const char *reason, *system_vendor, *system_product; if (memcmp(cpu_brand_string, "QEMU", 4) == 0) /* PR 45671, https://bugs.launchpad.net/qemu/+bug/897771 */ reason = "QEMU"; else if ((system_vendor = pmf_get_platform("system-vendor")) != NULL && strcmp(system_vendor, "Xen") == 0 && (system_product = pmf_get_platform("system-product")) != NULL && strcmp(system_product, "HVM domU") == 0) reason = "Xen"; else reason = NULL; if (reason) { #ifdef DEBUG printf("%s: forcing PCI mode 1 for %s\n", __func__, reason); #endif return (pci_mode); } #endif /* * Strong check for standard compliant mode 1: * 1. bit 31 ("enable") can be set * 2. byte/word access does not affect register */ outl(PCI_MODE1_ADDRESS_REG, PCI_MODE1_ENABLE); outb(PCI_MODE1_ADDRESS_REG + 3, 0); outw(PCI_MODE1_ADDRESS_REG + 2, 0); val = inl(PCI_MODE1_ADDRESS_REG); if ((val & 0x80fffffc) != PCI_MODE1_ENABLE) { #ifdef DEBUG printf("%s: mode 1 enable failed (%x)\n", __func__, val); #endif /* Try out mode 1 to see if we can find a host bridge. */ if (pci_mode_check() == 0) { #ifdef DEBUG printf("%s: mode 1 functional, using\n", __func__); #endif return (pci_mode); } goto not1; } outl(PCI_MODE1_ADDRESS_REG, 0); val = inl(PCI_MODE1_ADDRESS_REG); if ((val & 0x80fffffc) != 0) goto not1; return (pci_mode); not1: outl(PCI_MODE1_ADDRESS_REG, sav); /* * This mode 2 check is quite weak (and known to give false * positives on some Compaq machines). * However, this doesn't matter, because this is the * last test, and simply no PCI devices will be found if * this happens. 
*/ outb(PCI_MODE2_ENABLE_REG, 0); outb(PCI_MODE2_FORWARD_REG, 0); if (inb(PCI_MODE2_ENABLE_REG) != 0 || inb(PCI_MODE2_FORWARD_REG) != 0) goto not2; return (pci_mode = 2); not2: return (pci_mode = 0); } void pci_device_foreach(pci_chipset_tag_t pc, int maxbus, void (*func)(pci_chipset_tag_t, pcitag_t, void *), void *context) { pci_device_foreach_min(pc, 0, maxbus, func, context); } void pci_device_foreach_min(pci_chipset_tag_t pc, int minbus, int maxbus, void (*func)(pci_chipset_tag_t, pcitag_t, void *), void *context) { const struct pci_quirkdata *qd; int bus, device, function, maxdevs, nfuncs; pcireg_t id, bhlcr; pcitag_t tag; for (bus = minbus; bus <= maxbus; bus++) { maxdevs = pci_bus_maxdevs(pc, bus); for (device = 0; device < maxdevs; device++) { tag = pci_make_tag(pc, bus, device, 0); id = pci_conf_read(pc, tag, PCI_ID_REG); /* Invalid vendor ID value? */ if (PCI_VENDOR(id) == PCI_VENDOR_INVALID) continue; /* XXX Not invalid, but we've done this ~forever. */ if (PCI_VENDOR(id) == 0) continue; qd = pci_lookup_quirkdata(PCI_VENDOR(id), PCI_PRODUCT(id)); bhlcr = pci_conf_read(pc, tag, PCI_BHLC_REG); if (PCI_HDRTYPE_MULTIFN(bhlcr) || (qd != NULL && (qd->quirks & PCI_QUIRK_MULTIFUNCTION) != 0)) nfuncs = 8; else nfuncs = 1; for (function = 0; function < nfuncs; function++) { tag = pci_make_tag(pc, bus, device, function); id = pci_conf_read(pc, tag, PCI_ID_REG); /* Invalid vendor ID value? */ if (PCI_VENDOR(id) == PCI_VENDOR_INVALID) continue; /* * XXX Not invalid, but we've done this * ~forever. */ if (PCI_VENDOR(id) == 0) continue; (*func)(pc, tag, context); } } } } void pci_bridge_foreach(pci_chipset_tag_t pc, int minbus, int maxbus, void (*func)(pci_chipset_tag_t, pcitag_t, void *), void *ctx) { struct pci_bridge_hook_arg bridge_hook; bridge_hook.func = func; bridge_hook.arg = ctx; pci_device_foreach_min(pc, minbus, maxbus, pci_bridge_hook, &bridge_hook); } static void pci_bridge_hook(pci_chipset_tag_t pc, pcitag_t tag, void *ctx) { struct pci_bridge_hook_arg *bridge_hook = (void *)ctx; pcireg_t reg; reg = pci_conf_read(pc, tag, PCI_CLASS_REG); if (PCI_CLASS(reg) == PCI_CLASS_BRIDGE && (PCI_SUBCLASS(reg) == PCI_SUBCLASS_BRIDGE_PCI || PCI_SUBCLASS(reg) == PCI_SUBCLASS_BRIDGE_CARDBUS)) { (*bridge_hook->func)(pc, tag, bridge_hook->arg); } } static const void * bit_to_function_pointer(const struct pci_overrides *ov, uint64_t bit) { switch (bit) { case PCI_OVERRIDE_CONF_READ: return ov->ov_conf_read; case PCI_OVERRIDE_CONF_WRITE: return ov->ov_conf_write; case PCI_OVERRIDE_INTR_MAP: return ov->ov_intr_map; case PCI_OVERRIDE_INTR_STRING: return ov->ov_intr_string; case PCI_OVERRIDE_INTR_EVCNT: return ov->ov_intr_evcnt; case PCI_OVERRIDE_INTR_ESTABLISH: return ov->ov_intr_establish; case PCI_OVERRIDE_INTR_DISESTABLISH: return ov->ov_intr_disestablish; case PCI_OVERRIDE_MAKE_TAG: return ov->ov_make_tag; case PCI_OVERRIDE_DECOMPOSE_TAG: return ov->ov_decompose_tag; default: return NULL; } } void pci_chipset_tag_destroy(pci_chipset_tag_t pc) { kmem_free(pc, sizeof(struct pci_chipset_tag)); } int pci_chipset_tag_create(pci_chipset_tag_t opc, const uint64_t present, const struct pci_overrides *ov, void *ctx, pci_chipset_tag_t *pcp) { uint64_t bit, bits, nbits; pci_chipset_tag_t pc; const void *fp; if (ov == NULL || present == 0) return EINVAL; pc = kmem_alloc(sizeof(struct pci_chipset_tag), KM_SLEEP); pc->pc_super = opc; for (bits = present; bits != 0; bits = nbits) { nbits = bits & (bits - 1); bit = nbits ^ bits; if ((fp = bit_to_function_pointer(ov, bit)) == NULL) { #ifdef DEBUG printf("%s: 
missing bit %" PRIx64 "\n", __func__, bit); #endif goto einval; } } pc->pc_ov = ov; pc->pc_present = present; pc->pc_ctx = ctx; *pcp = pc; return 0; einval: kmem_free(pc, sizeof(struct pci_chipset_tag)); return EINVAL; } static void x86_genfb_set_mapreg(void *opaque, int index, int r, int g, int b) { outb(IO_VGA + VGA_DAC_ADDRW, index); outb(IO_VGA + VGA_DAC_PALETTE, (uint8_t)r >> 2); outb(IO_VGA + VGA_DAC_PALETTE, (uint8_t)g >> 2); outb(IO_VGA + VGA_DAC_PALETTE, (uint8_t)b >> 2); } static bool x86_genfb_setmode(struct genfb_softc *sc, int newmode) { #if NGENFB > 0 # if NACPICA > 0 && defined(VGA_POST) && !defined(XENPV) static int curmode = WSDISPLAYIO_MODE_EMUL; # endif switch (newmode) { case WSDISPLAYIO_MODE_EMUL: # if NACPICA > 0 && defined(VGA_POST) && !defined(XENPV) if (curmode != newmode) { if (vga_posth != NULL && acpi_md_vesa_modenum != 0) { vga_post_set_vbe(vga_posth, acpi_md_vesa_modenum); } } # endif break; } # if NACPICA > 0 && defined(VGA_POST) && !defined(XENPV) curmode = newmode; # endif #endif return true; } static bool x86_genfb_suspend(device_t dev, const pmf_qual_t *qual) { return true; } static bool x86_genfb_resume(device_t dev, const pmf_qual_t *qual) { #if NGENFB > 0 struct pci_genfb_softc *psc = device_private(dev); #if NACPICA > 0 && defined(VGA_POST) && !defined(XENPV) if (vga_posth != NULL && acpi_md_vbios_reset == 2) { vga_post_call(vga_posth); if (acpi_md_vesa_modenum != 0) vga_post_set_vbe(vga_posth, acpi_md_vesa_modenum); } #endif genfb_restore_palette(&psc->sc_gen); #endif return true; } static void populate_fbinfo(device_t dev, prop_dictionary_t dict) { #if NWSDISPLAY > 0 && NGENFB > 0 struct rasops_info *ri = &x86_genfb_console_screen.scr_ri; #endif const void *fbptr = NULL; struct btinfo_framebuffer fbinfo; #if NWSDISPLAY > 0 && NGENFB > 0 && defined(XEN) && defined(DOM0OPS) if ((vm_guest == VM_GUEST_XENPVH || vm_guest == VM_GUEST_XENPV) && xendomain_is_dom0()) fbptr = xen_genfb_getbtinfo(); #endif if (fbptr == NULL) fbptr = lookup_bootinfo(BTINFO_FRAMEBUFFER); if (fbptr == NULL) return; memcpy(&fbinfo, fbptr, sizeof(fbinfo)); if (fbinfo.physaddr != 0) { prop_dictionary_set_uint32(dict, "width", fbinfo.width); prop_dictionary_set_uint32(dict, "height", fbinfo.height); prop_dictionary_set_uint8(dict, "depth", fbinfo.depth); prop_dictionary_set_uint16(dict, "linebytes", fbinfo.stride); prop_dictionary_set_uint64(dict, "address", fbinfo.physaddr); #if NWSDISPLAY > 0 && NGENFB > 0 if (ri->ri_bits != NULL) { prop_dictionary_set_uint64(dict, "virtual_address", ri->ri_hwbits != NULL ? (vaddr_t)ri->ri_hworigbits : (vaddr_t)ri->ri_origbits); } #endif } #if notyet prop_dictionary_set_bool(dict, "splash", (fbinfo.flags & BI_FB_SPLASH) != 0); #endif if (fbinfo.depth == 8) { gfb_cb.gcc_cookie = NULL; gfb_cb.gcc_set_mapreg = x86_genfb_set_mapreg; prop_dictionary_set_uint64(dict, "cmap_callback", (uint64_t)(uintptr_t)&gfb_cb); } if (fbinfo.physaddr != 0) { mode_cb.gmc_setmode = x86_genfb_setmode; prop_dictionary_set_uint64(dict, "mode_callback", (uint64_t)(uintptr_t)&mode_cb); } #if NWSDISPLAY > 0 && NGENFB > 0 if (device_is_a(dev, "genfb")) { prop_dictionary_set_bool(dict, "enable_shadowfb", ri->ri_hwbits != NULL); x86_genfb_set_console_dev(dev); #ifdef DDB db_trap_callback = x86_genfb_ddb_trap_callback; #endif } #endif } device_t device_pci_register(device_t dev, void *aux) { device_t parent = device_parent(dev); device_pci_props_register(dev, aux); /* * Handle network interfaces here, the attachment information is * not available driver-independently later. 
* * For disks, there is nothing useful available at attach time. */ if (device_class(dev) == DV_IFNET) { struct btinfo_netif *bin = lookup_bootinfo(BTINFO_NETIF); if (bin == NULL) return NULL; /* * We don't check the driver name against the device name * passed by the boot ROM. The ROM should stay usable if * the driver becomes obsolete. The physical attachment * information (checked below) must be sufficient to * identify the device. */ if (bin->bus == BI_BUS_PCI && device_is_a(parent, "pci")) { struct pci_attach_args *paa = aux; int b, d, f; /* * Calculate BIOS representation of: * * <bus,device,function> * * and compare. */ pci_decompose_tag(paa->pa_pc, paa->pa_tag, &b, &d, &f); if (bin->addr.tag == ((b << 8) | (d << 3) | f)) return dev; #ifndef XENPV /* * efiboot reports parent ppb bus/device/function. */ device_t grand = device_parent(parent); if (efi_probe() && grand && device_is_a(grand, "ppb")) { struct ppb_softc *ppb_sc = device_private(grand); pci_decompose_tag(ppb_sc->sc_pc, ppb_sc->sc_tag, &b, &d, &f); if (bin->addr.tag == ((b << 8) | (d << 3) | f)) return dev; } #endif } } if (parent && device_is_a(parent, "pci") && x86_found_console == false) { struct pci_attach_args *pa = aux; if (PCI_CLASS(pa->pa_class) == PCI_CLASS_DISPLAY) { prop_dictionary_t dict = device_properties(dev); /* * framebuffer drivers other than genfb can work * without the address property */ populate_fbinfo(dev, dict); /* * If the bootloader requested console=pc and * specified a framebuffer, and if * x86_genfb_cnattach succeeded in setting it * up during consinit, then consinit will call * genfb_cnattach which makes genfb_is_console * return true. In this case, if it's the * first genfb we've seen, we will instruct the * genfb driver via the is_console property * that it has been selected as the console. * * If not all of that happened, then consinit * can't have selected a genfb console, so this * device is definitely not the console. * * XXX What happens if there's more than one * PCI display device, and the bootloader picks * the second one's framebuffer as the console * framebuffer address? Tough...but this has * probably never worked. */ #if NGENFB > 0 prop_dictionary_set_bool(dict, "is_console", genfb_is_console()); #else prop_dictionary_set_bool(dict, "is_console", true); #endif prop_dictionary_set_bool(dict, "clear-screen", false); #if NWSDISPLAY > 0 && NGENFB > 0 prop_dictionary_set_uint16(dict, "cursor-row", x86_genfb_console_screen.scr_ri.ri_crow); #endif #if notyet prop_dictionary_set_bool(dict, "splash", (fbinfo->flags & BI_FB_SPLASH) != 0); #endif pmf_cb.gpc_suspend = x86_genfb_suspend; pmf_cb.gpc_resume = x86_genfb_resume; prop_dictionary_set_uint64(dict, "pmf_callback", (uint64_t)(uintptr_t)&pmf_cb); #ifdef VGA_POST vga_posth = vga_post_init(pa->pa_bus, pa->pa_device, pa->pa_function); #endif x86_found_console = true; return NULL; } } return NULL; } #ifndef PUC_CNBUS #define PUC_CNBUS 0 #endif #if NCOM > 0 int cpu_puc_cnprobe(struct consdev *cn, struct pci_attach_args *pa) { pci_mode_detect(); pa->pa_iot = x86_bus_space_io; pa->pa_memt = x86_bus_space_mem; pa->pa_pc = 0; pa->pa_tag = pci_make_tag(0, PUC_CNBUS, pci_bus_maxdevs(NULL, 0) - 1, 0); return 0; } #endif
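/*
 * Illustrative sketch (not part of the driver): how a Configuration
 * Mechanism #1 address is composed, mirroring pci_make_tag() and
 * pci_conf_selector() above.  The value written to PCI_MODE1_ADDRESS_REG
 * (0x0cf8) is the enable bit ORed with the bus/device/function/register
 * fields; e.g. bus 0, device 3, function 1, register 0x10 yields
 * 0x80001910.  The function name and example values are hypothetical.
 */
#include <stdint.h>

static uint32_t
mode1_config_address(unsigned bus, unsigned dev, unsigned func, unsigned reg)
{
	/* 0x80000000 is PCI_MODE1_ENABLE; reg is kept 4-byte aligned. */
	return 0x80000000u | (bus << 16) | (dev << 11) | (func << 8) |
	    (reg & 0xfcu);
}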
/*	$NetBSD: strlcpy.c,v 1.3 2007/06/04 18:19:27 christos Exp $	*/
/*	$OpenBSD: strlcpy.c,v 1.7 2003/04/12 21:56:39 millert Exp $	*/

/*
 * Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND TODD C. MILLER DISCLAIMS ALL
 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL TODD C. MILLER BE LIABLE
 * FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#if !defined(_KERNEL) && !defined(_STANDALONE)
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif

#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
__RCSID("$NetBSD: strlcpy.c,v 1.3 2007/06/04 18:19:27 christos Exp $");
#endif /* LIBC_SCCS and not lint */

#ifdef _LIBC
#include "namespace.h"
#endif
#include <sys/types.h>
#include <assert.h>
#include <string.h>

#ifdef _LIBC
# ifdef __weak_alias
__weak_alias(strlcpy, _strlcpy)
# endif
#endif

#else
#include <lib/libkern/libkern.h>
#endif /* !_KERNEL && !_STANDALONE */

#if !HAVE_STRLCPY

/*
 * Copy src to string dst of size siz.  At most siz-1 characters
 * will be copied.  Always NUL terminates (unless siz == 0).
 * Returns strlen(src); if retval >= siz, truncation occurred.
 */
size_t
strlcpy(char *dst, const char *src, size_t siz)
{
	char *d = dst;
	const char *s = src;
	size_t n = siz;

	_DIAGASSERT(dst != NULL);
	_DIAGASSERT(src != NULL);

	/* Copy as many bytes as will fit */
	if (n != 0 && --n != 0) {
		do {
			if ((*d++ = *s++) == 0)
				break;
		} while (--n != 0);
	}

	/* Not enough room in dst, add NUL and traverse rest of src */
	if (n == 0) {
		if (siz != 0)
			*d = '\0';	/* NUL-terminate dst */
		while (*s++)
			;
	}

	return(s - src - 1);	/* count does not include NUL */
}
#endif
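/*
 * Illustrative sketch (not part of libc): strlcpy() returns strlen(src),
 * so truncation is detected by comparing the return value against the
 * destination size, exactly as the comment above describes.  The strings
 * and the function name here are example values only.
 */
#include <stdio.h>
#include <string.h>

int
strlcpy_example(void)
{
	char dst[8];
	const char *src = "a string longer than eight bytes";

	if (strlcpy(dst, src, sizeof(dst)) >= sizeof(dst))
		printf("truncated: kept \"%s\"\n", dst);	/* 7 chars + NUL */
	return 0;
}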
/*	$NetBSD: ufs_vfsops.c,v 1.61 2023/02/22 21:49:45 riastradh Exp $	*/

/*
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * @(#)ufs_vfsops.c 8.8 (Berkeley) 5/20/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ufs_vfsops.c,v 1.61 2023/02/22 21:49:45 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" #include "opt_quota.h" #include "opt_wapbl.h" #endif #include <sys/param.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/buf.h> #include <sys/module.h> #include <sys/vnode.h> #include <sys/kmem.h> #include <sys/kauth.h> #include <miscfs/specfs/specdev.h> #include <sys/quotactl.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> #ifdef UFS_DIRHASH #include <ufs/ufs/dirhash.h> #endif /* how many times ufs_init() was called */ static int ufs_initcount = 0; pool_cache_t ufs_direct_cache; /* * Make a filesystem operational. * Nothing to do at the moment. */ /* ARGSUSED */ int ufs_start(struct mount *mp, int flags) { return (0); } /* * Return the root of a filesystem. */ int ufs_root(struct mount *mp, int lktype, struct vnode **vpp) { struct vnode *nvp; int error; if ((error = VFS_VGET(mp, (ino_t)UFS_ROOTINO, lktype, &nvp)) != 0) return (error); *vpp = nvp; return (0); } /* * Look up and return a vnode/inode pair by inode number. */ int ufs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { int error; error = vcache_get(mp, &ino, sizeof(ino), vpp); if (error) return error; error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULL; return error; } return 0; } /* * Do operations associated with quotas */ int ufs_quotactl(struct mount *mp, struct quotactl_args *args) { #if !defined(QUOTA) && !defined(QUOTA2) (void) mp; (void) args; return (EOPNOTSUPP); #else struct lwp *l = curlwp; int error; /* Mark the mount busy, as we're passing it to kauth(9). */ error = vfs_busy(mp); if (error) { return (error); } mutex_enter(mp->mnt_updating); error = quota_handle_cmd(mp, l, args); mutex_exit(mp->mnt_updating); vfs_unbusy(mp); return (error); #endif } #if 0 switch (cmd) { case Q_SYNC: break; case Q_GETQUOTA: /* The user can always query about his own quota. */ if (uid == kauth_cred_getuid(l->l_cred)) break; error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, KAUTH_ARG(uid), NULL); break; case Q_QUOTAON: case Q_QUOTAOFF: error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL); break; case Q_SETQUOTA: case Q_SETUSE: error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(uid), NULL); break; default: error = EINVAL; break; } type = cmds & SUBCMDMASK; if (!error) { /* Only check if there was no error above. */ if ((u_int)type >= MAXQUOTAS) error = EINVAL; } if (error) { vfs_unbusy(mp); return (error); } mutex_enter(mp->mnt_updating); switch (cmd) { case Q_QUOTAON: error = quotaon(l, mp, type, arg); break; case Q_QUOTAOFF: error = quotaoff(l, mp, type); break; case Q_SETQUOTA: error = setquota(mp, uid, type, arg); break; case Q_SETUSE: error = setuse(mp, uid, type, arg); break; case Q_GETQUOTA: error = getquota(mp, uid, type, arg); break; case Q_SYNC: error = qsync(mp); break; default: error = EINVAL; } mutex_exit(mp->mnt_updating); vfs_unbusy(mp); return (error); #endif /* * This is the generic part of fhtovp called after the underlying * filesystem has validated the file handle. 
*/ int ufs_fhtovp(struct mount *mp, struct ufid *ufhp, int lktype, struct vnode **vpp) { struct vnode *nvp; struct inode *ip; int error; if ((error = VFS_VGET(mp, ufhp->ufid_ino, lktype, &nvp)) != 0) { if (error == ENOENT) error = ESTALE; *vpp = NULLVP; return (error); } ip = VTOI(nvp); KASSERT(ip != NULL); if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen || ((ip->i_mode & IFMT) == IFDIR && ip->i_size == 0)) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; return (0); } /* * Initialize UFS filesystems, done only once. */ void ufs_init(void) { if (ufs_initcount++ > 0) return; ufs_direct_cache = pool_cache_init(sizeof(struct direct), 0, 0, 0, "ufsdir", NULL, IPL_NONE, NULL, NULL, NULL); #if defined(QUOTA) || defined(QUOTA2) dqinit(); #endif #ifdef UFS_DIRHASH ufsdirhash_init(); #endif #ifdef UFS_EXTATTR ufs_extattr_init(); #endif } void ufs_reinit(void) { #if defined(QUOTA) || defined(QUOTA2) dqreinit(); #endif } /* * Free UFS filesystem resources, done only once. */ void ufs_done(void) { if (--ufs_initcount > 0) return; #if defined(QUOTA) || defined(QUOTA2) dqdone(); #endif pool_cache_destroy(ufs_direct_cache); #ifdef UFS_DIRHASH ufsdirhash_done(); #endif #ifdef UFS_EXTATTR ufs_extattr_done(); #endif } /* * module interface */ #ifdef WAPBL MODULE(MODULE_CLASS_MISC, ufs, "wapbl"); #else MODULE(MODULE_CLASS_MISC, ufs, NULL); #endif static int ufs_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: ufs_init(); error = 0; break; case MODULE_CMD_FINI: ufs_done(); error = 0; break; default: error = ENOTTY; break; } return error; }
/* $NetBSD: raw_ip6.c,v 1.184 2024/02/24 21:41:13 mlelstv Exp $ */ /* $KAME: raw_ip6.c,v 1.82 2001/07/23 18:57:56 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.2 (Berkeley) 1/4/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: raw_ip6.c,v 1.184 2024/02/24 21:41:13 mlelstv Exp $"); #ifdef _KERNEL_OPT #include "opt_ipsec.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/sysctl.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/protosw.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <net/if.h> #include <net/if_types.h> #include <net/net_stats.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/ip6_private.h> #include <netinet6/ip6_mroute.h> #include <netinet/icmp6.h> #include <netinet6/icmp6_private.h> #include <netinet6/in6_pcb.h> #include <netinet6/ip6protosw.h> #include <netinet6/scope6_var.h> #include <netinet6/raw_ip6.h> #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/ipsec6.h> #endif #include "faith.h" #if defined(NFAITH) && 0 < NFAITH #include <net/if_faith.h> #endif extern struct inpcbtable rawcbtable; struct inpcbtable raw6cbtable; #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) /* * Raw interface to IP6 protocol. */ static percpu_t *rip6stat_percpu; #define RIP6_STATINC(x) _NET_STATINC(rip6stat_percpu, x) static void sysctl_net_inet6_raw6_setup(struct sysctllog **); /* * Initialize raw connection block queue. */ void rip6_init(void) { sysctl_net_inet6_raw6_setup(NULL); in6pcb_init(&raw6cbtable, 1, 1); rip6stat_percpu = percpu_alloc(sizeof(uint64_t) * RIP6_NSTATS); } static void rip6_sbappendaddr(struct inpcb *last, struct ip6_hdr *ip6, const struct sockaddr *sa, int hlen, struct mbuf *n) { struct mbuf *opts = NULL; if (last->inp_flags & IN6P_CONTROLOPTS || SOOPT_TIMESTAMP(last->inp_socket->so_options)) ip6_savecontrol(last, &opts, ip6, n); m_adj(n, hlen); if (sbappendaddr(&last->inp_socket->so_rcv, sa, n, opts) == 0) { soroverflow(last->inp_socket); m_freem(n); if (opts) m_freem(opts); RIP6_STATINC(RIP6_STAT_FULLSOCK); } else { sorwakeup(last->inp_socket); } } /* * Setup generic address and protocol structures * for raw_input routine, then pass them along with * mbuf chain. */ int rip6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *inp; struct inpcb *last = NULL; struct sockaddr_in6 rip6src; struct mbuf *n; RIP6_STATINC(RIP6_STAT_IPACKETS); #if defined(NFAITH) && 0 < NFAITH if (faithprefix(&ip6->ip6_dst)) { /* send icmp6 host unreach? */ m_freem(m); return IPPROTO_DONE; } #endif sockaddr_in6_init(&rip6src, &ip6->ip6_src, 0, 0, 0); if (sa6_recoverscope(&rip6src) != 0) { /* XXX: should be impossible. 
*/ m_freem(m); return IPPROTO_DONE; } TAILQ_FOREACH(inp, &raw6cbtable.inpt_queue, inp_queue) { if (inp->inp_af != AF_INET6) continue; if (in6p_ip6(inp).ip6_nxt && in6p_ip6(inp).ip6_nxt != proto) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) && !IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)) && !IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &ip6->ip6_src)) continue; if (in6p_cksum(inp) != -1) { RIP6_STATINC(RIP6_STAT_ISUM); /* * Although in6_cksum() does not need the position of * the checksum field for verification, enforce that it * is located within the packet. Userland has given * a checksum offset, a packet too short for that is * invalid. Avoid overflow with user supplied offset. */ if (m->m_pkthdr.len < *offp + 2 || m->m_pkthdr.len - *offp - 2 < in6p_cksum(inp) || in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { RIP6_STATINC(RIP6_STAT_BADSUM); continue; } } if (last == NULL) { ; } #ifdef IPSEC else if (ipsec_used && ipsec_in_reject(m, last)) { /* do not inject data into pcb */ } #endif else if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) { rip6_sbappendaddr(last, ip6, sin6tosa(&rip6src), *offp, n); } last = inp; } #ifdef IPSEC if (ipsec_used && last && ipsec_in_reject(m, last)) { m_freem(m); IP6_STATDEC(IP6_STAT_DELIVERED); /* do not inject data into pcb */ } else #endif if (last != NULL) { rip6_sbappendaddr(last, ip6, sin6tosa(&rip6src), *offp, m); } else { RIP6_STATINC(RIP6_STAT_NOSOCK); if (m->m_flags & M_MCAST) RIP6_STATINC(RIP6_STAT_NOSOCKMCAST); if (proto == IPPROTO_NONE) m_freem(m); else { int s; struct ifnet *rcvif = m_get_rcvif(m, &s); const int prvnxt = ip6_get_prevhdr(m, *offp); in6_ifstat_inc(rcvif, ifs6_in_protounknown); m_put_rcvif(rcvif, &s); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, prvnxt); } IP6_STATDEC(IP6_STAT_DELIVERED); } return IPPROTO_DONE; } void * rip6_ctlinput(int cmd, const struct sockaddr *sa, void *d) { struct ip6_hdr *ip6; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; void (*notify)(struct inpcb *, int) = in6pcb_rtchange; int nxt; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return NULL; if ((unsigned)cmd >= PRC_NCMDS) return NULL; if (PRC_IS_REDIRECT(cmd)) notify = in6pcb_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (cmd == PRC_MSGSIZE) ; /* special code is present, see below */ else if (inet6ctlerrmap[cmd] == 0) return NULL; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; ip6 = ip6cp->ip6c_ip6; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; nxt = ip6cp->ip6c_nxt; } else { ip6 = NULL; cmdarg = NULL; sa6_src = &sa6_any; nxt = -1; } if (ip6 && cmd == PRC_MSGSIZE) { const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa; int valid = 0; struct inpcb *inp; /* * Check to see if we have a valid raw IPv6 socket * corresponding to the address in the ICMPv6 message * payload, and the protocol (ip6_nxt) meets the socket. * XXX chase extension headers, or pass final nxt value * from icmp6_notify_error() */ inp = NULL; inp = in6pcb_lookup(&raw6cbtable, &sa6->sin6_addr, 0, (const struct in6_addr *)&sa6_src->sin6_addr, 0, 0, 0); #if 0 if (!inp) { /* * As the use of sendto(2) is fairly popular, * we may want to allow non-connected pcb too. * But it could be too weak against attacks... * We should at least check if the local * address (= s) is really ours. 
*/ inp = in6pcb_lookup_bound(&raw6cbtable, &sa6->sin6_addr, 0, 0); } #endif if (inp && in6p_ip6(inp).ip6_nxt && in6p_ip6(inp).ip6_nxt == nxt) valid++; /* * Depending on the value of "valid" and routing table * size (mtudisc_{hi,lo}wat), we will: * - recalculate the new MTU and create the * corresponding routing entry, or * - ignore the MTU change notification. */ icmp6_mtudisc_update((struct ip6ctlparam *)d, valid); /* * regardless of if we called icmp6_mtudisc_update(), * we need to call in6pcb_notify(), to notify path MTU * change to the userland (RFC3542), because some * unconnected sockets may share the same destination * and want to know the path MTU. */ } (void) in6pcb_notify(&raw6cbtable, sa, 0, sin6tocsa(sa6_src), 0, cmd, cmdarg, notify); return NULL; } /* * Generate IPv6 header and pass packet to ip6_output. * Tack on options user may have setup with control call. */ int rip6_output(struct mbuf *m, struct socket * const so, struct sockaddr_in6 * const dstsock, struct mbuf * const control) { struct in6_addr *dst; struct ip6_hdr *ip6; struct inpcb *inp; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp = NULL; struct ifnet *oifp = NULL; int type, code; /* for ICMPv6 output statistics only */ int scope_ambiguous = 0; int bound = curlwp_bind(); struct psref psref; inp = sotoinpcb(so); dst = &dstsock->sin6_addr; if (control) { if ((error = ip6_setpktopts(control, &opt, in6p_outputopts(inp), kauth_cred_get(), so->so_proto->pr_protocol)) != 0) { goto bad; } optp = &opt; } else optp = in6p_outputopts(inp); /* * Check and convert scope zone ID into internal form. * XXX: we may still need to determine the zone later. */ if (!(so->so_state & SS_ISCONNECTED)) { if (dstsock->sin6_scope_id == 0 && !ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(dstsock, ip6_use_defzone)) != 0) goto bad; } /* * For an ICMPv6 packet, we should know its type and code * to update statistics. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { struct icmp6_hdr *icmp6; if (m->m_len < sizeof(struct icmp6_hdr) && (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) { error = ENOBUFS; goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); type = icmp6->icmp6_type; code = icmp6->icmp6_code; } else { type = 0; code = 0; } M_PREPEND(m, sizeof(*ip6), M_DONTWAIT); if (!m) { error = ENOBUFS; goto bad; } ip6 = mtod(m, struct ip6_hdr *); /* * Next header might not be ICMP6 but use its pseudo header anyway. */ ip6->ip6_dst = *dst; /* * Source address selection. */ error = in6_selectsrc(dstsock, optp, in6p_moptions(inp), &inp->inp_route, &in6p_laddr(inp), &oifp, &psref, &ip6->ip6_src); if (error != 0) goto bad; if (oifp && scope_ambiguous) { /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. Even if an appropriate ID is not determined * (when it's required), if we can determine the outgoing * interface. determine the zone ID based on the interface. */ error = in6_setscope(&dstsock->sin6_addr, oifp, NULL); if (error != 0) goto bad; } ip6->ip6_dst = dstsock->sin6_addr; /* fill in the rest of the IPv6 header fields */ ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6_plen will be filled in ip6_output, so not fill it here. 
*/ ip6->ip6_nxt = in6p_ip6(inp).ip6_nxt; ip6->ip6_hlim = in6pcb_selecthlim(inp, oifp); if_put(oifp, &psref); oifp = NULL; if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || in6p_cksum(inp) != -1) { const uint8_t nxt = ip6->ip6_nxt; int off; u_int16_t sum; /* compute checksum */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = in6p_cksum(inp); if (plen < 2 || plen - 2 < off) { error = EINVAL; goto bad; } off += sizeof(struct ip6_hdr); sum = 0; m = m_copyback_cow(m, off, sizeof(sum), (void *)&sum, M_DONTWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } sum = in6_cksum(m, nxt, sizeof(*ip6), plen); m = m_copyback_cow(m, off, sizeof(sum), (void *)&sum, M_DONTWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } } { struct ifnet *ret_oifp = NULL; error = ip6_output(m, optp, &inp->inp_route, 0, in6p_moptions(inp), inp, &ret_oifp); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (ret_oifp) icmp6_ifoutstat_inc(ret_oifp, type, code); ICMP6_STATINC(ICMP6_STAT_OUTHIST + type); } else RIP6_STATINC(RIP6_STAT_OPACKETS); } goto freectl; bad: if (m) m_freem(m); freectl: if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } if_put(oifp, &psref); curlwp_bindx(bound); return error; } /* * Raw IPv6 socket option processing. */ int rip6_ctloutput(int op, struct socket *so, struct sockopt *sopt) { int error = 0; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) { int optval; /* need to fiddle w/ opt(IPPROTO_IPV6, IPV6_CHECKSUM)? */ if (op == PRCO_GETOPT) { optval = 1; error = sockopt_set(sopt, &optval, sizeof(optval)); } else if (op == PRCO_SETOPT) { error = sockopt_getint(sopt, &optval); if (error) goto out; if (optval == 0) error = EINVAL; } goto out; } else if (sopt->sopt_level != IPPROTO_IPV6) return ip6_ctloutput(op, so, sopt); switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: if (op == PRCO_SETOPT) error = ip6_mrouter_set(so, sopt); else if (op == PRCO_GETOPT) error = ip6_mrouter_get(so, sopt); else error = EINVAL; break; case IPV6_CHECKSUM: return ip6_raw_ctloutput(op, so, sopt); default: return ip6_ctloutput(op, so, sopt); } out: return error; } extern u_long rip6_sendspace; extern u_long rip6_recvspace; int rip6_attach(struct socket *so, int proto) { struct inpcb *inp; int s, error; KASSERT(sotoinpcb(so) == NULL); sosetlock(so); error = kauth_authorize_network(kauth_cred_get(), KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_RAWSOCK, KAUTH_ARG(AF_INET6), KAUTH_ARG(SOCK_RAW), KAUTH_ARG(so->so_proto->pr_protocol)); if (error) { return error; } s = splsoftnet(); error = soreserve(so, rip6_sendspace, rip6_recvspace); if (error) { splx(s); return error; } if ((error = inpcb_create(so, &raw6cbtable)) != 0) { splx(s); return error; } splx(s); inp = sotoinpcb(so); in6p_ip6(inp).ip6_nxt = proto; in6p_cksum(inp) = -1; in6p_icmp6filt(inp) = kmem_alloc(sizeof(struct icmp6_filter), KM_SLEEP); ICMP6_FILTER_SETPASSALL(in6p_icmp6filt(inp)); KASSERT(solocked(so)); return error; } static void rip6_detach(struct socket *so) { struct inpcb *inp = sotoinpcb(so); KASSERT(solocked(so)); KASSERT(inp != NULL); if (so == ip6_mrouter) { ip6_mrouter_done(); } /* xxx: RSVP */ if (in6p_icmp6filt(inp) != NULL) { kmem_free(in6p_icmp6filt(inp), sizeof(struct icmp6_filter)); in6p_icmp6filt(inp) = NULL; } inpcb_destroy(inp); } static int rip6_accept(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int 
rip6_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ifa = NULL; int error = 0; int s; KASSERT(solocked(so)); KASSERT(inp != NULL); KASSERT(nam != NULL); if (addr->sin6_len != sizeof(*addr)) return EINVAL; if (IFNET_READER_EMPTY() || addr->sin6_family != AF_INET6) return EADDRNOTAVAIL; if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0) return error; /* * we don't support mapped address here, it would confuse * users so reject it */ if (IN6_IS_ADDR_V4MAPPED(&addr->sin6_addr)) return EADDRNOTAVAIL; s = pserialize_read_enter(); if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && (ifa = ifa_ifwithaddr(sin6tosa(addr))) == NULL) { error = EADDRNOTAVAIL; goto out; } if (ifa && (ifatoia6(ifa))->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED)) { error = EADDRNOTAVAIL; goto out; } in6p_laddr(inp) = addr->sin6_addr; error = 0; out: pserialize_read_exit(s); return error; } static int rip6_listen(struct socket *so, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr in6a; struct ifnet *ifp = NULL; int scope_ambiguous = 0; int error = 0; struct psref psref; int bound; KASSERT(solocked(so)); KASSERT(inp != NULL); KASSERT(nam != NULL); if (IFNET_READER_EMPTY()) return EADDRNOTAVAIL; if (addr->sin6_family != AF_INET6) return EAFNOSUPPORT; if (addr->sin6_len != sizeof(*addr)) return EINVAL; /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. Even if an appropriate ID is not determined, * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. */ if (addr->sin6_scope_id == 0 && !ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0) return error; bound = curlwp_bind(); /* Source address selection. XXX: need pcblookup? */ error = in6_selectsrc(addr, in6p_outputopts(inp), in6p_moptions(inp), &inp->inp_route, &in6p_laddr(inp), &ifp, &psref, &in6a); if (error != 0) goto out; /* XXX: see above */ if (ifp && scope_ambiguous && (error = in6_setscope(&addr->sin6_addr, ifp, NULL)) != 0) { goto out; } in6p_laddr(inp) = in6a; in6p_faddr(inp) = addr->sin6_addr; soisconnected(so); out: if_put(ifp, &psref); curlwp_bindx(bound); return error; } static int rip6_connect2(struct socket *so, struct socket *so2) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int rip6_disconnect(struct socket *so) { struct inpcb *inp = sotoinpcb(so); KASSERT(solocked(so)); KASSERT(inp != NULL); if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; in6p_faddr(inp) = in6addr_any; so->so_state &= ~SS_ISCONNECTED; /* XXX */ return 0; } static int rip6_shutdown(struct socket *so) { KASSERT(solocked(so)); /* * Mark the connection as being incapable of further input. 
*/ socantsendmore(so); return 0; } static int rip6_abort(struct socket *so) { KASSERT(solocked(so)); soisdisconnected(so); rip6_detach(so); return 0; } static int rip6_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp) { return in6_control(so, cmd, nam, ifp); } static int rip6_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); /* stat: don't bother with a blocksize */ return 0; } static int rip6_peeraddr(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); KASSERT(sotoinpcb(so) != NULL); KASSERT(nam != NULL); in6pcb_fetch_peeraddr(sotoinpcb(so), (struct sockaddr_in6 *)nam); return 0; } static int rip6_sockaddr(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); KASSERT(sotoinpcb(so) != NULL); KASSERT(nam != NULL); in6pcb_fetch_sockaddr(sotoinpcb(so), (struct sockaddr_in6 *)nam); return 0; } static int rip6_rcvd(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int rip6_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int rip6_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; int error = 0; KASSERT(solocked(so)); KASSERT(inp != NULL); KASSERT(m != NULL); /* * Ship a packet out. The appropriate raw output * routine handles any messaging necessary. */ /* always copy sockaddr to avoid overwrites */ if (so->so_state & SS_ISCONNECTED) { if (nam) { error = EISCONN; goto release; } /* XXX */ sockaddr_in6_init(&tmp, &in6p_faddr(inp), 0, 0, 0); dst = &tmp; } else { if (nam == NULL) { error = ENOTCONN; goto release; } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; if (dst->sin6_family != AF_INET6) { error = EAFNOSUPPORT; goto release; } if (dst->sin6_len != sizeof(*dst)) { error = EINVAL; goto release; } } error = rip6_output(m, so, dst, control); m = NULL; release: if (m) m_freem(m); return error; } static int rip6_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int rip6_purgeif(struct socket *so, struct ifnet *ifp) { mutex_enter(softnet_lock); in6pcb_purgeif0(&raw6cbtable, ifp); #ifdef NET_MPSAFE mutex_exit(softnet_lock); #endif in6_purgeif(ifp); #ifdef NET_MPSAFE mutex_enter(softnet_lock); #endif in6pcb_purgeif(&raw6cbtable, ifp); mutex_exit(softnet_lock); return 0; } static int sysctl_net_inet6_raw6_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(rip6stat_percpu, RIP6_NSTATS)); } static void sysctl_net_inet6_raw6_setup(struct sysctllog **clog) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet6", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET6, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "raw6", SYSCTL_DESCR("Raw IPv6 settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_RAW, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "pcblist", SYSCTL_DESCR("Raw IPv6 control block list"), sysctl_inpcblist, 0, &raw6cbtable, 0, CTL_NET, PF_INET6, IPPROTO_RAW, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("Raw IPv6 statistics"), sysctl_net_inet6_raw6_stats, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_RAW, RAW6CTL_STATS, CTL_EOL); } PR_WRAP_USRREQS(rip6) #define rip6_attach rip6_attach_wrapper #define rip6_detach rip6_detach_wrapper #define rip6_accept 
rip6_accept_wrapper #define rip6_bind rip6_bind_wrapper #define rip6_listen rip6_listen_wrapper #define rip6_connect rip6_connect_wrapper #define rip6_connect2 rip6_connect2_wrapper #define rip6_disconnect rip6_disconnect_wrapper #define rip6_shutdown rip6_shutdown_wrapper #define rip6_abort rip6_abort_wrapper #define rip6_ioctl rip6_ioctl_wrapper #define rip6_stat rip6_stat_wrapper #define rip6_peeraddr rip6_peeraddr_wrapper #define rip6_sockaddr rip6_sockaddr_wrapper #define rip6_rcvd rip6_rcvd_wrapper #define rip6_recvoob rip6_recvoob_wrapper #define rip6_send rip6_send_wrapper #define rip6_sendoob rip6_sendoob_wrapper #define rip6_purgeif rip6_purgeif_wrapper const struct pr_usrreqs rip6_usrreqs = { .pr_attach = rip6_attach, .pr_detach = rip6_detach, .pr_accept = rip6_accept, .pr_bind = rip6_bind, .pr_listen = rip6_listen, .pr_connect = rip6_connect, .pr_connect2 = rip6_connect2, .pr_disconnect = rip6_disconnect, .pr_shutdown = rip6_shutdown, .pr_abort = rip6_abort, .pr_ioctl = rip6_ioctl, .pr_stat = rip6_stat, .pr_peeraddr = rip6_peeraddr, .pr_sockaddr = rip6_sockaddr, .pr_rcvd = rip6_rcvd, .pr_recvoob = rip6_recvoob, .pr_send = rip6_send, .pr_sendoob = rip6_sendoob, .pr_purgeif = rip6_purgeif, };
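/*
 * Illustrative userland sketch, not part of this file: a minimal
 * program exercising the raw IPv6 path above.  rip6_attach() performs a
 * KAUTH_REQ_NETWORK_SOCKET_RAWSOCK check, so this typically requires
 * privileges.  Because the protocol is IPPROTO_ICMPV6, rip6_output()
 * computes the ICMPv6 checksum for us (see the IPPROTO_ICMPV6 case
 * there); the checksum field is left zero here.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_in6 dst;
	struct icmp6_hdr echo;
	int s;

	s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
	if (s == -1) {
		perror("socket");
		return 1;
	}

	memset(&dst, 0, sizeof(dst));
	dst.sin6_family = AF_INET6;
	dst.sin6_len = sizeof(dst);
	dst.sin6_addr = in6addr_loopback;	/* ::1 */

	memset(&echo, 0, sizeof(echo));
	echo.icmp6_type = ICMP6_ECHO_REQUEST;
	echo.icmp6_code = 0;
	/* icmp6_cksum stays 0; the kernel fills it in rip6_output(). */

	if (sendto(s, &echo, sizeof(echo), 0,
	    (struct sockaddr *)&dst, sizeof(dst)) == -1)
		perror("sendto");

	close(s);
	return 0;
}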
/* $NetBSD: vfs_syscalls_90.c,v 1.1 2019/09/22 22:59:38 christos Exp $ */ /*- * Copyright (c) 2005, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christos Zoulas. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_90.c,v 1.1 2019/09/22 22:59:38 christos Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/socketvar.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/uio.h> #include <sys/dirent.h> #include <sys/malloc.h> #include <sys/kauth.h> #include <sys/vfs_syscalls.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <compat/common/compat_mod.h> #include <compat/common/compat_util.h> #include <compat/sys/statvfs.h> static const struct syscall_package vfs_syscalls_90_syscalls[] = { { SYS_compat_90_getvfsstat, 0, (sy_call_t *)compat_90_sys_getvfsstat }, { SYS_compat_90_statvfs1, 0, (sy_call_t *)compat_90_sys_statvfs1 }, { SYS_compat_90_fstatvfs1, 0, (sy_call_t *)compat_90_sys_fstatvfs1 }, { SYS_compat_90_fhstatvfs1, 0, (sy_call_t *)compat_90_sys_fhstatvfs1 }, { 0,0, NULL } }; int compat_90_sys_getvfsstat(struct lwp *l, const struct compat_90_sys_getvfsstat_args *uap, register_t *retval) { /* { syscallarg(struct statvfs90 *) buf; syscallarg(size_t) bufsize; syscallarg(int) flags; } */ return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize), SCARG(uap, flags), statvfs_to_statvfs90_copy, sizeof(struct statvfs90), retval); } int compat_90_sys_statvfs1(struct lwp *l, const struct compat_90_sys_statvfs1_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(struct statvfs90 *) buf; syscallarg(int) flags; } */ struct statvfs *sb = STATVFSBUF_GET(); int error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb); if (!error) error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf), sizeof(struct statvfs90)); STATVFSBUF_PUT(sb); return error; } int compat_90_sys_fstatvfs1(struct lwp *l, const struct compat_90_sys_fstatvfs1_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(struct statvfs90 *) buf; syscallarg(int) flags; } */ struct statvfs *sb = STATVFSBUF_GET(); int error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb); if (!error) error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf), sizeof(struct statvfs90)); STATVFSBUF_PUT(sb); return error; } int compat_90_sys_fhstatvfs1(struct lwp *l, const struct compat_90_sys_fhstatvfs1_args *uap, register_t *retval) { /* { syscallarg(const void *) fhp; syscallarg(size_t) fh_size; syscallarg(struct statvfs90 *) buf; syscallarg(int) flags; } */ struct statvfs *sb = STATVFSBUF_GET(); int error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb, SCARG(uap, flags)); if (!error) error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf), sizeof(struct statvfs90)); STATVFSBUF_PUT(sb); return error; } int vfs_syscalls_90_init(void) { return syscall_establish(NULL, vfs_syscalls_90_syscalls); } int vfs_syscalls_90_fini(void) { return syscall_disestablish(NULL, vfs_syscalls_90_syscalls); }
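/*
 * Illustrative sketch, not part of this file: how a module command
 * handler would typically wire the syscall package above in and out,
 * following the same modcmd pattern shown earlier for the ufs module.
 * In the real tree this glue lives in the compat module code; the
 * handler name below is hypothetical and assumes <sys/module.h>.
 */
static int
compat_90_example_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		/* Register the compat_90 entries in the syscall table. */
		return vfs_syscalls_90_init();
	case MODULE_CMD_FINI:
		/* Remove them again; fails if the syscalls are in use. */
		return vfs_syscalls_90_fini();
	default:
		return ENOTTY;
	}
}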
/* $NetBSD: uvm_page_status.c,v 1.6 2020/08/14 09:06:15 chs Exp $ */ /*- * Copyright (c)2011 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_page_status.c,v 1.6 2020/08/14 09:06:15 chs Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <uvm/uvm.h> /* * page dirtiness status tracking * * separated from uvm_page.c mainly for rump */ /* * these constants are chosen to match so that we can convert between * them quickly. */ __CTASSERT(UVM_PAGE_STATUS_UNKNOWN == 0); __CTASSERT(UVM_PAGE_STATUS_DIRTY == PG_DIRTY); __CTASSERT(UVM_PAGE_STATUS_CLEAN == PG_CLEAN); /* * uvm_pagegetdirty: return the dirtiness status (one of UVM_PAGE_STATUS_ * values) of the page. * * called with the owner locked. */ unsigned int uvm_pagegetdirty(struct vm_page *pg) { struct uvm_object * const uobj __diagused = pg->uobject; KASSERT((~pg->flags & (PG_CLEAN|PG_DIRTY)) != 0); KASSERT(uvm_page_owner_locked_p(pg, false)); KASSERT(uobj == NULL || ((pg->flags & PG_CLEAN) == 0) == uvm_obj_page_dirty_p(pg)); return pg->flags & (PG_CLEAN|PG_DIRTY); } /* * uvm_pagemarkdirty: set the dirtiness status (one of UVM_PAGE_STATUS_ values) * of the page. * * called with the owner locked. * * update the radix tree tag for object-owned page. * * if new status is UVM_PAGE_STATUS_UNKNOWN, clear pmap-level dirty bit * so that later uvm_pagecheckdirty() can notice modifications on the page.
*/ void uvm_pagemarkdirty(struct vm_page *pg, unsigned int newstatus) { struct uvm_object * const uobj = pg->uobject; const unsigned int oldstatus = uvm_pagegetdirty(pg); enum cpu_count base; KASSERT((~newstatus & (PG_CLEAN|PG_DIRTY)) != 0); KASSERT((newstatus & ~(PG_CLEAN|PG_DIRTY)) == 0); KASSERT(uvm_page_owner_locked_p(pg, true)); KASSERT(uobj == NULL || ((pg->flags & PG_CLEAN) == 0) == uvm_obj_page_dirty_p(pg)); if (oldstatus == newstatus) { return; } /* * set UVM_PAGE_DIRTY_TAG tag unless known CLEAN so that putpages can * find possibly-dirty pages quickly. */ if (uobj != NULL) { if (newstatus == UVM_PAGE_STATUS_CLEAN) { uvm_obj_page_clear_dirty(pg); } else if (oldstatus == UVM_PAGE_STATUS_CLEAN) { /* * on first dirty page, mark the object dirty. * for vnodes this inserts to the syncer worklist. */ if (uvm_obj_clean_p(uobj) && uobj->pgops->pgo_markdirty != NULL) { (*uobj->pgops->pgo_markdirty)(uobj); } uvm_obj_page_set_dirty(pg); } } if (newstatus == UVM_PAGE_STATUS_UNKNOWN) { /* * start relying on pmap-level dirtiness tracking. */ pmap_clear_modify(pg); } pg->flags &= ~(PG_CLEAN|PG_DIRTY); pg->flags |= newstatus; KASSERT(uobj == NULL || ((pg->flags & PG_CLEAN) == 0) == uvm_obj_page_dirty_p(pg)); if ((pg->flags & PG_STAT) != 0) { if ((pg->flags & PG_SWAPBACKED) != 0) { base = CPU_COUNT_ANONUNKNOWN; } else { base = CPU_COUNT_FILEUNKNOWN; } kpreempt_disable(); CPU_COUNT(base + oldstatus, -1); CPU_COUNT(base + newstatus, +1); kpreempt_enable(); } } /* * uvm_pagecheckdirty: check if page is dirty, and remove its dirty bit. * * called with the owner locked. * * returns if the page was dirty. * * if protected is true, mark the page CLEAN. otherwise, mark the page UNKNOWN. * ("mark" in the sense of uvm_pagemarkdirty().) */ bool uvm_pagecheckdirty(struct vm_page *pg, bool pgprotected) { const unsigned int oldstatus = uvm_pagegetdirty(pg); bool modified; KASSERT(uvm_page_owner_locked_p(pg, true)); /* * if pgprotected is true, mark the page CLEAN. * otherwise mark the page UNKNOWN unless it's CLEAN. * * possible transitions: * * CLEAN -> CLEAN , modified = false * UNKNOWN -> UNKNOWN, modified = true * UNKNOWN -> UNKNOWN, modified = false * UNKNOWN -> CLEAN , modified = true * UNKNOWN -> CLEAN , modified = false * DIRTY -> UNKNOWN, modified = true * DIRTY -> CLEAN , modified = true * * pmap_clear_modify is necessary if either of * oldstatus or newstatus is UVM_PAGE_STATUS_UNKNOWN. */ if (oldstatus == UVM_PAGE_STATUS_CLEAN) { modified = false; } else { const unsigned int newstatus = pgprotected ? UVM_PAGE_STATUS_CLEAN : UVM_PAGE_STATUS_UNKNOWN; if (oldstatus == UVM_PAGE_STATUS_DIRTY) { modified = true; if (newstatus == UVM_PAGE_STATUS_UNKNOWN) { pmap_clear_modify(pg); } } else { KASSERT(oldstatus == UVM_PAGE_STATUS_UNKNOWN); modified = pmap_clear_modify(pg); } uvm_pagemarkdirty(pg, newstatus); } return modified; }
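/*
 * Illustrative sketch, not part of the original source: how a
 * hypothetical flush path might use uvm_pagecheckdirty() above to
 * decide whether a page needs I/O.  "uvm_example_flush_page" and its
 * "write_protected" argument are made-up names; the page owner is
 * assumed to be locked exclusively, as uvm_pagecheckdirty() requires.
 */
static bool
uvm_example_flush_page(struct vm_page *pg, bool write_protected)
{

	/*
	 * If the caller has already write-protected the page in the
	 * pmap, the page can be marked CLEAN outright; otherwise it is
	 * downgraded to UNKNOWN and pmap-level modify tracking is
	 * re-armed (both handled inside uvm_pagecheckdirty()).
	 */
	if (!uvm_pagecheckdirty(pg, write_protected)) {
		/* Already clean: nothing to write back. */
		return false;
	}

	/* ... queue the page for write-back here ... */
	return true;
}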
/* $NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $ */ /*- * Copyright (c)2007,2008 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * per-cpu storage. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $"); #include <sys/param.h> #include <sys/cpu.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/mutex.h> #include <sys/percpu.h> #include <sys/rwlock.h> #include <sys/vmem.h> #include <sys/xcall.h> #define PERCPU_QUANTUM_SIZE (ALIGNBYTES + 1) #define PERCPU_QCACHE_MAX 0 #define PERCPU_IMPORT_SIZE 2048 struct percpu { unsigned pc_offset; size_t pc_size; percpu_callback_t pc_ctor; percpu_callback_t pc_dtor; void *pc_cookie; LIST_ENTRY(percpu) pc_list; }; static krwlock_t percpu_swap_lock __cacheline_aligned; static vmem_t * percpu_offset_arena __read_mostly; static struct { kmutex_t lock; unsigned int nextoff; LIST_HEAD(, percpu) ctor_list; struct lwp *busy; kcondvar_t cv; } percpu_allocation __cacheline_aligned; static percpu_cpu_t * cpu_percpu(struct cpu_info *ci) { return &ci->ci_data.cpu_percpu; } static unsigned int percpu_offset(percpu_t *pc) { const unsigned int off = pc->pc_offset; KASSERT(off < percpu_allocation.nextoff); return off; } /* * percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge */ __noubsan static void percpu_cpu_swap(void *p1, void *p2) { struct cpu_info * const ci = p1; percpu_cpu_t * const newpcc = p2; percpu_cpu_t * const pcc = cpu_percpu(ci); KASSERT(ci == curcpu() || !mp_online); /* * swap *pcc and *newpcc unless anyone has beaten us. */ rw_enter(&percpu_swap_lock, RW_WRITER); if (newpcc->pcc_size > pcc->pcc_size) { percpu_cpu_t tmp; int s; tmp = *pcc; /* * block interrupts so that we don't lose their modifications. */ s = splhigh(); /* * copy data to new storage. */ memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size); /* * this assignment needs to be atomic for percpu_getptr_remote. 
*/ pcc->pcc_data = newpcc->pcc_data; splx(s); pcc->pcc_size = newpcc->pcc_size; *newpcc = tmp; } rw_exit(&percpu_swap_lock); } /* * percpu_cpu_enlarge: ensure that percpu_cpu_t of each cpus have enough space */ static void percpu_cpu_enlarge(size_t size) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; for (CPU_INFO_FOREACH(cii, ci)) { percpu_cpu_t pcc; pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */ pcc.pcc_size = size; if (!mp_online) { percpu_cpu_swap(ci, &pcc); } else { uint64_t where; where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci); xc_wait(where); } KASSERT(pcc.pcc_size <= size); if (pcc.pcc_data != NULL) { kmem_free(pcc.pcc_data, pcc.pcc_size); } } } /* * percpu_backend_alloc: vmem import callback for percpu_offset_arena */ static int percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize, vm_flag_t vmflags, vmem_addr_t *addrp) { unsigned int offset; unsigned int nextoff; ASSERT_SLEEPABLE(); KASSERT(dummy == NULL); if ((vmflags & VM_NOSLEEP) != 0) return ENOMEM; size = roundup(size, PERCPU_IMPORT_SIZE); mutex_enter(&percpu_allocation.lock); offset = percpu_allocation.nextoff; percpu_allocation.nextoff = nextoff = percpu_allocation.nextoff + size; mutex_exit(&percpu_allocation.lock); percpu_cpu_enlarge(nextoff); *resultsize = size; *addrp = (vmem_addr_t)offset; return 0; } static void percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci) { size_t sz = (uintptr_t)vp2; memset(vp, 0, sz); } /* * percpu_zero: initialize percpu storage with zero. */ static void percpu_zero(percpu_t *pc, size_t sz) { percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz); } /* * percpu_init: subsystem initialization */ void percpu_init(void) { ASSERT_SLEEPABLE(); rw_init(&percpu_swap_lock); mutex_init(&percpu_allocation.lock, MUTEX_DEFAULT, IPL_NONE); percpu_allocation.nextoff = PERCPU_QUANTUM_SIZE; LIST_INIT(&percpu_allocation.ctor_list); percpu_allocation.busy = NULL; cv_init(&percpu_allocation.cv, "percpu"); percpu_offset_arena = vmem_xcreate("percpu", 0, 0, PERCPU_QUANTUM_SIZE, percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP, IPL_NONE); } /* * percpu_init_cpu: cpu initialization * * => should be called before the cpu appears on the list for CPU_INFO_FOREACH. * => may be called for static CPUs afterward (typically just primary CPU) */ void percpu_init_cpu(struct cpu_info *ci) { percpu_cpu_t * const pcc = cpu_percpu(ci); struct percpu *pc; size_t size = percpu_allocation.nextoff; /* XXX racy */ ASSERT_SLEEPABLE(); /* * For the primary CPU, prior percpu_create may have already * triggered allocation, so there's nothing more for us to do * here. */ if (pcc->pcc_size) return; KASSERT(pcc->pcc_data == NULL); /* * Otherwise, allocate storage and, while the constructor list * is locked, run constructors for all percpus on this CPU. */ pcc->pcc_size = size; if (size) { pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP); mutex_enter(&percpu_allocation.lock); while (percpu_allocation.busy) cv_wait(&percpu_allocation.cv, &percpu_allocation.lock); percpu_allocation.busy = curlwp; LIST_FOREACH(pc, &percpu_allocation.ctor_list, pc_list) { KASSERT(pc->pc_ctor); mutex_exit(&percpu_allocation.lock); (*pc->pc_ctor)((char *)pcc->pcc_data + pc->pc_offset, pc->pc_cookie, ci); mutex_enter(&percpu_allocation.lock); } KASSERT(percpu_allocation.busy == curlwp); percpu_allocation.busy = NULL; cv_broadcast(&percpu_allocation.cv); mutex_exit(&percpu_allocation.lock); } } /* * percpu_alloc: allocate percpu storage * * => called in thread context. 
* => considered as an expensive and rare operation. * => allocated storage is initialized with zeros. */ percpu_t * percpu_alloc(size_t size) { return percpu_create(size, NULL, NULL, NULL); } /* * percpu_create: allocate percpu storage and associate ctor/dtor with it * * => called in thread context. * => considered as an expensive and rare operation. * => allocated storage is initialized by ctor, or zeros if ctor is null * => percpu_free will call dtor first, if dtor is nonnull * => ctor or dtor may sleep, even on allocation */ percpu_t * percpu_create(size_t size, percpu_callback_t ctor, percpu_callback_t dtor, void *cookie) { vmem_addr_t offset; percpu_t *pc; ASSERT_SLEEPABLE(); (void)vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT, &offset); pc = kmem_alloc(sizeof(*pc), KM_SLEEP); pc->pc_offset = offset; pc->pc_size = size; pc->pc_ctor = ctor; pc->pc_dtor = dtor; pc->pc_cookie = cookie; if (ctor) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; void *buf; /* * Wait until nobody is using the list of percpus with * constructors. */ mutex_enter(&percpu_allocation.lock); while (percpu_allocation.busy) cv_wait(&percpu_allocation.cv, &percpu_allocation.lock); percpu_allocation.busy = curlwp; mutex_exit(&percpu_allocation.lock); /* * Run the constructor for all CPUs. We use a * temporary buffer wo that we need not hold the * percpu_swap_lock while running the constructor. */ buf = kmem_alloc(size, KM_SLEEP); for (CPU_INFO_FOREACH(cii, ci)) { memset(buf, 0, size); (*ctor)(buf, cookie, ci); percpu_traverse_enter(); memcpy(percpu_getptr_remote(pc, ci), buf, size); percpu_traverse_exit(); } explicit_memset(buf, 0, size); kmem_free(buf, size); /* * Insert the percpu into the list of percpus with * constructors. We are now done using the list, so it * is safe for concurrent percpu_create or concurrent * percpu_init_cpu to run. */ mutex_enter(&percpu_allocation.lock); KASSERT(percpu_allocation.busy == curlwp); percpu_allocation.busy = NULL; cv_broadcast(&percpu_allocation.cv); LIST_INSERT_HEAD(&percpu_allocation.ctor_list, pc, pc_list); mutex_exit(&percpu_allocation.lock); } else { percpu_zero(pc, size); } return pc; } /* * percpu_free: free percpu storage * * => called in thread context. * => considered as an expensive and rare operation. */ void percpu_free(percpu_t *pc, size_t size) { ASSERT_SLEEPABLE(); KASSERT(size == pc->pc_size); /* * If there's a constructor, take the percpu off the list of * percpus with constructors, but first wait until nobody is * using the list. */ if (pc->pc_ctor) { mutex_enter(&percpu_allocation.lock); while (percpu_allocation.busy) cv_wait(&percpu_allocation.cv, &percpu_allocation.lock); LIST_REMOVE(pc, pc_list); mutex_exit(&percpu_allocation.lock); } /* If there's a destructor, run it now for all CPUs. 
*/ if (pc->pc_dtor) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; void *buf; buf = kmem_alloc(size, KM_SLEEP); for (CPU_INFO_FOREACH(cii, ci)) { percpu_traverse_enter(); memcpy(buf, percpu_getptr_remote(pc, ci), size); explicit_memset(percpu_getptr_remote(pc, ci), 0, size); percpu_traverse_exit(); (*pc->pc_dtor)(buf, pc->pc_cookie, ci); } explicit_memset(buf, 0, size); kmem_free(buf, size); } vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size); kmem_free(pc, sizeof(*pc)); } /* * percpu_getref: * * => safe to be used in either thread or interrupt context * => disables preemption; must be bracketed with a percpu_putref() */ void * percpu_getref(percpu_t *pc) { kpreempt_disable(); return percpu_getptr_remote(pc, curcpu()); } /* * percpu_putref: * * => drops the preemption-disabled count after caller is done with per-cpu * data */ void percpu_putref(percpu_t *pc) { kpreempt_enable(); } /* * percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote: * helpers to access remote cpu's percpu data. * * => called in thread context. * => percpu_traverse_enter can block low-priority xcalls. * => typical usage would be: * * sum = 0; * percpu_traverse_enter(); * for (CPU_INFO_FOREACH(cii, ci)) { * unsigned int *p = percpu_getptr_remote(pc, ci); * sum += *p; * } * percpu_traverse_exit(); */ void percpu_traverse_enter(void) { ASSERT_SLEEPABLE(); rw_enter(&percpu_swap_lock, RW_READER); } void percpu_traverse_exit(void) { rw_exit(&percpu_swap_lock); } void * percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci) { return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)]; } /* * percpu_foreach: call the specified callback function for each cpus. * * => must be called from thread context. * => callback executes on **current** CPU (or, really, arbitrary CPU, * in case of preemption) * => caller should not rely on the cpu iteration order. * => the callback function should be minimum because it is executed with * holding a global lock, which can block low-priority xcalls. * eg. it's illegal for a callback function to sleep for memory allocation. */ void percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; percpu_traverse_enter(); for (CPU_INFO_FOREACH(cii, ci)) { (*cb)(percpu_getptr_remote(pc, ci), arg, ci); } percpu_traverse_exit(); } struct percpu_xcall_ctx { percpu_callback_t ctx_cb; void *ctx_arg; }; static void percpu_xcfunc(void * const v1, void * const v2) { percpu_t * const pc = v1; struct percpu_xcall_ctx * const ctx = v2; (*ctx->ctx_cb)(percpu_getref(pc), ctx->ctx_arg, curcpu()); percpu_putref(pc); } /* * percpu_foreach_xcall: call the specified callback function for each * cpu. This version uses an xcall to run the callback on each cpu. * * => must be called from thread context. * => callback executes on **remote** CPU in soft-interrupt context * (at the specified soft interrupt priority). * => caller should not rely on the cpu iteration order. * => the callback function should be minimum because it may be * executed in soft-interrupt context. eg. it's illegal for * a callback function to sleep for memory allocation. */ void percpu_foreach_xcall(percpu_t *pc, u_int xcflags, percpu_callback_t cb, void *arg) { struct percpu_xcall_ctx ctx = { .ctx_cb = cb, .ctx_arg = arg, }; CPU_INFO_ITERATOR cii; struct cpu_info *ci; for (CPU_INFO_FOREACH(cii, ci)) { xc_wait(xc_unicast(xcflags, percpu_xcfunc, pc, &ctx, ci)); } }
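/*
 * Illustrative sketch, not part of this file: typical use of the API
 * above to keep a per-CPU event counter, along the lines of the usage
 * shown in the percpu_traverse_enter() comment.  The names "evcnt_pc",
 * "count_event", "sum_cb", "sum_events" and "evcnt_init" are
 * hypothetical.
 */
static percpu_t *evcnt_pc;

static void
evcnt_init(void)
{

	/* Zero-initialized allocation; may sleep (thread context only). */
	evcnt_pc = percpu_alloc(sizeof(uint64_t));
}

static void
count_event(void)
{
	uint64_t *p;

	/* percpu_getref() disables preemption while we touch our slot. */
	p = percpu_getref(evcnt_pc);
	(*p)++;
	percpu_putref(evcnt_pc);
}

static void
sum_cb(void *p, void *arg, struct cpu_info *ci)
{
	uint64_t *sum = arg;

	*sum += *(uint64_t *)p;
}

static uint64_t
sum_events(void)
{
	uint64_t sum = 0;

	/* Visits every CPU's slot under percpu_traverse_enter/exit. */
	percpu_foreach(evcnt_pc, sum_cb, &sum);
	return sum;
}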
/* $NetBSD: union_vfsops.c,v 1.87 2023/02/13 08:39:40 hannken Exp $ */ /* * Copyright (c) 1994 The Regents of the University of California. * All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95 */ /* * Copyright (c) 1994 Jan-Simon Pendry. * All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95 */ /* * Union Layer */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: union_vfsops.c,v 1.87 2023/02/13 08:39:40 hannken Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/time.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/malloc.h> #include <sys/filedesc.h> #include <sys/queue.h> #include <sys/stat.h> #include <sys/kauth.h> #include <sys/module.h> #include <miscfs/genfs/genfs.h> #include <fs/union/union.h> MODULE(MODULE_CLASS_VFS, union, NULL); /* * Mount union filesystem */ int union_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; int error = 0; struct union_args *args = data; struct vnode *lowerrootvp = NULLVP; struct vnode *upperrootvp = NULLVP; struct union_mount *um = 0; const char *cp; char *xp; int len; size_t size; if (args == NULL) return EINVAL; if (*data_len < sizeof *args) return EINVAL; #ifdef UNION_DIAGNOSTIC printf("%s(mp = %p)\n", __func__, mp); #endif if (mp->mnt_flag & MNT_GETARGS) { um = MOUNTTOUNIONMOUNT(mp); if (um == NULL) return EIO; args->target = NULL; args->mntflags = um->um_op; *data_len = sizeof *args; return 0; } /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) { /* * Need to provide. * 1. a way to convert between rdonly and rdwr mounts. * 2. support for nfs exports. */ error = EOPNOTSUPP; goto bad; } lowerrootvp = mp->mnt_vnodecovered; vref(lowerrootvp); /* * Find upper node. */ error = namei_simple_user(args->target, NSM_FOLLOW_NOEMULROOT, &upperrootvp); if (error != 0) goto bad; if (upperrootvp->v_type != VDIR) { error = EINVAL; goto bad; } um = kmem_zalloc(sizeof(*um), KM_SLEEP); /* * Keep a held reference to the target vnodes. * They are vrele'd in union_unmount. * * Depending on the _BELOW flag, the filesystems are * viewed in a different order. In effect, this is the * same as providing a mount under option to the mount syscall. */ um->um_op = args->mntflags & UNMNT_OPMASK; switch (um->um_op) { case UNMNT_ABOVE: um->um_lowervp = lowerrootvp; um->um_uppervp = upperrootvp; break; case UNMNT_BELOW: um->um_lowervp = upperrootvp; um->um_uppervp = lowerrootvp; break; case UNMNT_REPLACE: vrele(lowerrootvp); lowerrootvp = NULLVP; um->um_uppervp = upperrootvp; um->um_lowervp = lowerrootvp; break; default: error = EINVAL; goto bad; } /* * This mount is mp-safe if both lower mounts are mp-safe. */ if (((um->um_lowervp == NULLVP) || (um->um_lowervp->v_mount->mnt_iflag & IMNT_MPSAFE)) && (um->um_uppervp->v_mount->mnt_iflag & IMNT_MPSAFE)) mp->mnt_iflag |= IMNT_MPSAFE; /* * Unless the mount is readonly, ensure that the top layer * supports whiteout operations */ if ((mp->mnt_flag & MNT_RDONLY) == 0) { static struct componentname nullcn = { .cn_nameiop = LOOKUP, .cn_cred = NOCRED }; vn_lock(um->um_uppervp, LK_EXCLUSIVE | LK_RETRY); error = VOP_WHITEOUT(um->um_uppervp, &nullcn, LOOKUP); VOP_UNLOCK(um->um_uppervp); if (error) goto bad; } um->um_cred = l->l_cred; kauth_cred_hold(um->um_cred); um->um_cmode = UN_DIRMODE &~ l->l_proc->p_cwdi->cwdi_cmask; /* * Depending on what you think the MNT_LOCAL flag might mean, * you may want the && to be || on the conditional below. * At the moment it has been defined that the filesystem is * only local if it is all local, ie the MNT_LOCAL flag implies * that the entire namespace is local. If you think the MNT_LOCAL * flag implies that some of the files might be stored locally * then you will want to change the conditional. 
*/ if (um->um_op == UNMNT_ABOVE) { if (((um->um_lowervp == NULLVP) || (um->um_lowervp->v_mount->mnt_flag & MNT_LOCAL)) && (um->um_uppervp->v_mount->mnt_flag & MNT_LOCAL)) mp->mnt_flag |= MNT_LOCAL; } /* * Copy in the upper layer's RDONLY flag. This is for the benefit * of lookup() which explicitly checks the flag, rather than asking * the filesystem for its own opinion. This means, that an update * mount of the underlying filesystem to go from rdonly to rdwr * will leave the unioned view as read-only. */ mp->mnt_flag |= (um->um_uppervp->v_mount->mnt_flag & MNT_RDONLY); mp->mnt_data = um; vfs_getnewfsid(mp); error = set_statvfs_info(path, UIO_USERSPACE, NULL, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); if (error) goto bad; error = vfs_set_lowermount(mp, um->um_uppervp->v_mount); if (error) goto bad; switch (um->um_op) { case UNMNT_ABOVE: cp = "<above>:"; break; case UNMNT_BELOW: cp = "<below>:"; break; case UNMNT_REPLACE: cp = ""; break; default: cp = "<invalid>:"; #ifdef DIAGNOSTIC panic("%s: bad um_op", __func__); #endif break; } len = strlen(cp); memcpy(mp->mnt_stat.f_mntfromname, cp, len); xp = mp->mnt_stat.f_mntfromname + len; len = MNAMELEN - len; (void) copyinstr(args->target, xp, len - 1, &size); memset(xp + size, 0, len - size); #ifdef UNION_DIAGNOSTIC printf("%s: from %s, on %s\n", __func__, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); #endif /* Setup the readdir hook if it's not set already */ if (!vn_union_readdir_hook) vn_union_readdir_hook = union_readdirhook; return 0; bad: if (um) { if (um->um_cred) kauth_cred_free(um->um_cred); kmem_free(um, sizeof(*um)); } if (upperrootvp) vrele(upperrootvp); if (lowerrootvp) vrele(lowerrootvp); return error; } /* * VFS start. Nothing needed here - the start routine * on the underlying filesystem(s) will have been called * when that filesystem was mounted. */ /*ARGSUSED*/ int union_start(struct mount *mp, int flags) { return 0; } /* * Free reference to union layer */ static bool union_unmount_selector(void *cl, struct vnode *vp) { int *count = cl; KASSERT(mutex_owned(vp->v_interlock)); *count += 1; return false; } int union_unmount(struct mount *mp, int mntflags) { struct union_mount *um = MOUNTTOUNIONMOUNT(mp); int freeing; int error; #ifdef UNION_DIAGNOSTIC printf("%s(mp = %p)\n", __func__, mp); #endif /* * Keep flushing vnodes from the mount list. * This is needed because of the un_pvp held * reference to the parent vnode. * If more vnodes have been freed on a given pass, * the try again. The loop will iterate at most * (d) times, where (d) is the maximum tree depth * in the filesystem. */ for (freeing = 0; (error = vflush(mp, NULL, 0)) != 0;) { struct vnode_iterator *marker; int n; /* count #vnodes held on mount list */ n = 0; vfs_vnode_iterator_init(mp, &marker); vfs_vnode_iterator_next(marker, union_unmount_selector, &n); vfs_vnode_iterator_destroy(marker); /* if this is unchanged then stop */ if (n == freeing) break; /* otherwise try once more time */ freeing = n; } /* * Ok, now that we've tried doing it gently, get out the hammer. */ if (mntflags & MNT_FORCE) error = vflush(mp, NULL, FORCECLOSE); if (error) return error; /* * Discard references to upper and lower target vnodes. 
*/ if (um->um_lowervp) vrele(um->um_lowervp); vrele(um->um_uppervp); kauth_cred_free(um->um_cred); /* * Finally, throw away the union_mount structure */ kmem_free(um, sizeof(*um)); mp->mnt_data = NULL; return 0; } int union_root(struct mount *mp, int lktype, struct vnode **vpp) { struct union_mount *um = MOUNTTOUNIONMOUNT(mp); int error; /* * Return locked reference to root. */ vref(um->um_uppervp); if (um->um_lowervp) vref(um->um_lowervp); error = union_allocvp(vpp, mp, NULL, NULL, NULL, um->um_uppervp, um->um_lowervp, 1); if (error) { vrele(um->um_uppervp); if (um->um_lowervp) vrele(um->um_lowervp); return error; } vn_lock(*vpp, lktype | LK_RETRY); return 0; } int union_statvfs(struct mount *mp, struct statvfs *sbp) { int error; struct union_mount *um = MOUNTTOUNIONMOUNT(mp); struct statvfs *sbuf = kmem_zalloc(sizeof(*sbuf), KM_SLEEP); unsigned long lbsize; #ifdef UNION_DIAGNOSTIC printf("%s(mp = %p, lvp = %p, uvp = %p)\n", __func__, mp, um->um_lowervp, um->um_uppervp); #endif if (um->um_lowervp) { error = VFS_STATVFS(um->um_lowervp->v_mount, sbuf); if (error) goto done; } /* now copy across the "interesting" information and fake the rest */ lbsize = sbuf->f_bsize; sbp->f_blocks = sbuf->f_blocks - sbuf->f_bfree; sbp->f_files = sbuf->f_files - sbuf->f_ffree; error = VFS_STATVFS(um->um_uppervp->v_mount, sbuf); if (error) goto done; sbp->f_flag = sbuf->f_flag; sbp->f_bsize = sbuf->f_bsize; sbp->f_frsize = sbuf->f_frsize; sbp->f_iosize = sbuf->f_iosize; /* * The "total" fields count total resources in all layers, * the "free" fields count only those resources which are * free in the upper layer (since only the upper layer * is writable). */ if (sbuf->f_bsize != lbsize) sbp->f_blocks = sbp->f_blocks * lbsize / sbuf->f_bsize; sbp->f_blocks += sbuf->f_blocks; sbp->f_bfree = sbuf->f_bfree; sbp->f_bavail = sbuf->f_bavail; sbp->f_bresvd = sbuf->f_bresvd; sbp->f_files += sbuf->f_files; sbp->f_ffree = sbuf->f_ffree; sbp->f_favail = sbuf->f_favail; sbp->f_fresvd = sbuf->f_fresvd; copy_statvfs_info(sbp, mp); done: kmem_free(sbuf, sizeof(*sbuf)); return error; } /*ARGSUSED*/ int union_sync(struct mount *mp, int waitfor, kauth_cred_t cred) { /* * XXX - Assumes no data cached at union layer. */ return 0; } /*ARGSUSED*/ int union_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { return EOPNOTSUPP; } static int union_renamelock_enter(struct mount *mp) { struct union_mount *um = MOUNTTOUNIONMOUNT(mp); /* Lock just the upper fs, where the action happens. 
*/ return VFS_RENAMELOCK_ENTER(um->um_uppervp->v_mount); } static void union_renamelock_exit(struct mount *mp) { struct union_mount *um = MOUNTTOUNIONMOUNT(mp); VFS_RENAMELOCK_EXIT(um->um_uppervp->v_mount); } extern const struct vnodeopv_desc union_vnodeop_opv_desc; const struct vnodeopv_desc * const union_vnodeopv_descs[] = { &union_vnodeop_opv_desc, NULL, }; struct vfsops union_vfsops = { .vfs_name = MOUNT_UNION, .vfs_min_mount_data = sizeof (struct union_args), .vfs_mount = union_mount, .vfs_start = union_start, .vfs_unmount = union_unmount, .vfs_root = union_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = union_statvfs, .vfs_sync = union_sync, .vfs_vget = union_vget, .vfs_loadvnode = union_loadvnode, .vfs_fhtovp = (void *)eopnotsupp, .vfs_vptofh = (void *)eopnotsupp, .vfs_init = union_init, .vfs_reinit = union_reinit, .vfs_done = union_done, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = union_renamelock_enter, .vfs_renamelock_exit = union_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = union_vnodeopv_descs }; SYSCTL_SETUP(unionfs_sysctl_setup, "unionfs sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "union", SYSCTL_DESCR("Union file system"), NULL, 0, NULL, 0, CTL_VFS, 15, CTL_EOL); /* * XXX the "15" above could be dynamic, thereby eliminating * one more instance of the "number to vfs" mapping problem, * but "15" is the order as taken from sys/mount.h */ } static int union_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return vfs_attach(&union_vfsops); case MODULE_CMD_FINI: return vfs_detach(&union_vfsops); default: return ENOTTY; } }
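union_mount() above consumes a struct union_args handed in through mount(2), normally by mount_union(8). Below is a hedged userland sketch of that call; it assumes the kernel header <fs/union/union.h> (which defines struct union_args and UNMNT_ABOVE as used above) is visible to userland, the paths are invented for illustration, and error handling is minimal.

/*
 * Illustration only: mount /upper as the upper layer of a union
 * above /mnt, roughly what mount_union(8) does.
 */
#include <sys/param.h>
#include <sys/mount.h>
#include <fs/union/union.h>	/* struct union_args, UNMNT_ABOVE (assumed visible) */

#include <err.h>
#include <stdlib.h>

int
main(void)
{
	struct union_args args = {
		.target = __UNCONST("/upper"),	/* upper layer directory (made up) */
		.mntflags = UNMNT_ABOVE,	/* view the upper layer over the lower */
	};

	/* The covered directory "/mnt" becomes the lower layer. */
	if (mount(MOUNT_UNION, "/mnt", 0, &args, sizeof(args)) == -1)
		err(EXIT_FAILURE, "mount");
	return EXIT_SUCCESS;
}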
/* $NetBSD: fifo_vnops.c,v 1.91 2021/10/11 01:07:36 thorpej Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fifo_vnops.c 8.10 (Berkeley) 5/27/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: fifo_vnops.c,v 1.91 2021/10/11 01:07:36 thorpej Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/time.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/socket.h> #include <sys/protosw.h> #include <sys/socketvar.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/file.h> #include <sys/errno.h> #include <sys/kmem.h> #include <sys/un.h> #include <sys/poll.h> #include <sys/event.h> #include <sys/condvar.h> #include <miscfs/fifofs/fifo.h> #include <miscfs/genfs/genfs.h> /* * This structure is associated with the FIFO vnode and stores * the state associated with the FIFO. */ struct fifoinfo { struct socket *fi_readsock; struct socket *fi_writesock; kcondvar_t fi_rcv; int fi_readers; kcondvar_t fi_wcv; int fi_writers; }; /* * Trivial lookup routine that always fails. 
*/ /* ARGSUSED */ static int fifo_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap = v; *ap->a_vpp = NULL; return (ENOTDIR); } /* * Open called to set up a new instance of a fifo or * to find an active instance of a fifo. */ static int fifo_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; struct lwp *l = curlwp; struct vnode *vp; struct fifoinfo *fip; struct socket *rso, *wso; int error; vp = ap->a_vp; KASSERT(VOP_ISLOCKED(vp)); if ((fip = vp->v_fifoinfo) == NULL) { fip = kmem_alloc(sizeof(*fip), KM_SLEEP); error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0, l, NULL); if (error != 0) { kmem_free(fip, sizeof(*fip)); return (error); } fip->fi_readsock = rso; error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0, l, rso); if (error != 0) { (void)soclose(rso); kmem_free(fip, sizeof(*fip)); return (error); } fip->fi_writesock = wso; solock(wso); if ((error = unp_connect2(wso, rso)) != 0) { sounlock(wso); (void)soclose(wso); (void)soclose(rso); kmem_free(fip, sizeof(*fip)); return (error); } /* * FIFOs must be readable when there is at least 1 * byte of data available in the receive buffer. * * FIFOs must be writable when there is space for * at least PIPE_BUF bytes in the send buffer. * If we're increasing the low water mark for the * send buffer, then mimic how soreserve() would * have set the high water mark. */ rso->so_rcv.sb_lowat = 1; if (wso->so_snd.sb_lowat < PIPE_BUF) { wso->so_snd.sb_hiwat = PIPE_BUF * 2; } wso->so_snd.sb_lowat = PIPE_BUF; fip->fi_readers = 0; fip->fi_writers = 0; wso->so_state |= SS_CANTRCVMORE; rso->so_state |= SS_CANTSENDMORE; cv_init(&fip->fi_rcv, "fiford"); cv_init(&fip->fi_wcv, "fifowr"); vp->v_fifoinfo = fip; } else { wso = fip->fi_writesock; rso = fip->fi_readsock; solock(wso); } if (ap->a_mode & FREAD) { if (fip->fi_readers++ == 0) { wso->so_state &= ~SS_CANTSENDMORE; cv_broadcast(&fip->fi_wcv); } } if (ap->a_mode & FWRITE) { if (fip->fi_writers++ == 0) { rso->so_state &= ~SS_CANTRCVMORE; cv_broadcast(&fip->fi_rcv); } } if (ap->a_mode & FREAD) { if (ap->a_mode & O_NONBLOCK) { } else { while (!soreadable(rso) && fip->fi_writers == 0) { VOP_UNLOCK(vp); error = cv_wait_sig(&fip->fi_rcv, wso->so_lock); sounlock(wso); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error) goto bad; solock(wso); } } } if (ap->a_mode & FWRITE) { if (ap->a_mode & O_NONBLOCK) { if (fip->fi_readers == 0) { error = ENXIO; sounlock(wso); goto bad; } } else { while (fip->fi_readers == 0) { VOP_UNLOCK(vp); error = cv_wait_sig(&fip->fi_wcv, wso->so_lock); sounlock(wso); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error) goto bad; solock(wso); } } } sounlock(wso); return (0); bad: VOP_CLOSE(vp, ap->a_mode, ap->a_cred); return (error); } /* * Vnode op for read */ /* ARGSUSED */ static int fifo_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; struct uio *uio; struct socket *rso; int error, sflags; size_t startresid; uio = ap->a_uio; rso = ap->a_vp->v_fifoinfo->fi_readsock; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("fifo_read mode"); #endif if (uio->uio_resid == 0) return (0); startresid = uio->uio_resid; VOP_UNLOCK(ap->a_vp); sflags = (ap->a_ioflag & IO_NDELAY) ? MSG_NBIO : 0; error = (*rso->so_receive)(rso, NULL, uio, NULL, NULL, &sflags); /* * Clear EOF indication after first such return. 
*/ if (error == 0 && uio->uio_resid == startresid) rso->so_state &= ~SS_CANTRCVMORE; if (ap->a_ioflag & IO_NDELAY) { if (error == EWOULDBLOCK && ap->a_vp->v_fifoinfo->fi_writers == 0) error = 0; } vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* * Vnode op for write */ /* ARGSUSED */ static int fifo_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; struct socket *wso; int error, sflags; wso = ap->a_vp->v_fifoinfo->fi_writesock; #ifdef DIAGNOSTIC if (ap->a_uio->uio_rw != UIO_WRITE) panic("fifo_write mode"); #endif VOP_UNLOCK(ap->a_vp); sflags = (ap->a_ioflag & IO_NDELAY) ? MSG_NBIO : 0; error = (*wso->so_send)(wso, NULL, ap->a_uio, 0, NULL, sflags, curlwp); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* * Device ioctl operation. */ /* ARGSUSED */ static int fifo_ioctl(void *v) { struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; void *a_data; int a_fflag; kauth_cred_t a_cred; struct lwp *a_l; } */ *ap = v; struct file filetmp; int error; if (ap->a_command == FIONBIO) return (0); if (ap->a_fflag & FREAD) { filetmp.f_data = ap->a_vp->v_fifoinfo->fi_readsock; error = soo_ioctl(&filetmp, ap->a_command, ap->a_data); if (error) return (error); } if (ap->a_fflag & FWRITE) { filetmp.f_data = ap->a_vp->v_fifoinfo->fi_writesock; error = soo_ioctl(&filetmp, ap->a_command, ap->a_data); if (error) return (error); } return (0); } /* ARGSUSED */ static int fifo_poll(void *v) { struct vop_poll_args /* { struct vnode *a_vp; int a_events; } */ *ap = v; struct socket *rso = ap->a_vp->v_fifoinfo->fi_readsock; struct socket *wso = ap->a_vp->v_fifoinfo->fi_writesock; struct socket *lso = NULL; int events; /* * N.B. We're using a slightly different naming convention * for these variables that most poll handlers. */ int revents = 0; int wevents = 0; if (rso != NULL) { lso = rso; } else if (wso != NULL) { lso = wso; } if (lso == NULL) { /* No associated sockets -> no events to report. */ return 0; } KASSERT(rso == NULL || lso->so_lock == rso->so_lock); KASSERT(wso == NULL || lso->so_lock == wso->so_lock); solock(lso); if (rso != NULL) { events = ap->a_events & (POLLIN | POLLRDNORM); if (events != 0 && soreadable(rso)) { revents |= events; } if (rso->so_state & SS_CANTRCVMORE) { revents |= POLLHUP; } /* * We always selrecord the read side here regardless * of the caller's read interest because we need to * action POLLHUP. */ if (revents == 0) { selrecord(curlwp, &rso->so_rcv.sb_sel); rso->so_rcv.sb_flags |= SB_NOTIFY; } } /* POSIX sez: POLLHUP and POLLOUT are mutually-exclusive. */ if (wso != NULL && (revents & POLLHUP) == 0) { events = ap->a_events & (POLLOUT | POLLWRNORM); if (events != 0 && sowritable(wso)) { wevents |= events; } if (wevents == 0 && events != 0) { selrecord(curlwp, &wso->so_snd.sb_sel); wso->so_snd.sb_flags |= SB_NOTIFY; } } sounlock(lso); return (revents | wevents); } static int fifo_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; struct lwp *a_l; } */ *ap __unused = v; return (0); } /* * This is a noop, simply returning what one has been given. */ static int fifo_bmap(void *v) { struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; } */ *ap = v; if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; return (0); } /* * This is like socantrcvmore(), but we send the POLL_HUP code. 
*/ static void fifo_socantrcvmore(struct socket *so) { KASSERT(solocked(so)); so->so_state |= SS_CANTRCVMORE; if (sb_notify(&so->so_rcv)) { sowakeup(so, &so->so_rcv, POLL_HUP); } } /* * Device close routine */ /* ARGSUSED */ static int fifo_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; struct lwp *a_l; } */ *ap = v; struct vnode *vp; struct fifoinfo *fip; struct socket *wso, *rso; int isrevoke; vp = ap->a_vp; fip = vp->v_fifoinfo; isrevoke = (ap->a_fflag & (FREAD | FWRITE | FNONBLOCK)) == FNONBLOCK; wso = fip->fi_writesock; rso = fip->fi_readsock; solock(wso); if (isrevoke) { if (fip->fi_readers != 0) { fip->fi_readers = 0; socantsendmore(wso); } if (fip->fi_writers != 0) { fip->fi_writers = 0; fifo_socantrcvmore(rso); } } else { if ((ap->a_fflag & FREAD) && --fip->fi_readers == 0) socantsendmore(wso); if ((ap->a_fflag & FWRITE) && --fip->fi_writers == 0) fifo_socantrcvmore(rso); } if ((fip->fi_readers + fip->fi_writers) == 0) { sounlock(wso); (void) soclose(rso); (void) soclose(wso); cv_destroy(&fip->fi_rcv); cv_destroy(&fip->fi_wcv); kmem_free(fip, sizeof(*fip)); vp->v_fifoinfo = NULL; } else sounlock(wso); return (0); } /* * Print out internal contents of a fifo vnode. */ static void fifo_printinfo(struct vnode *vp) { struct fifoinfo *fip; fip = vp->v_fifoinfo; printf(", fifo with %d readers and %d writers", fip->fi_readers, fip->fi_writers); } /* * Print out the contents of a fifo vnode. */ static int fifo_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; /* * We are most likely being called with the vnode belonging * to some file system and this is not printed. */ if (ap->a_vp->v_tag == VT_NON) printf("tag VT_NON"); fifo_printinfo(ap->a_vp); printf("\n"); return 0; } /* * Return POSIX pathconf information applicable to fifo's. 
*/ static int fifo_pathconf(void *v) { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_SYNC_IO: *ap->a_retval = 1; return (0); default: return genfs_pathconf(ap); } /* NOTREACHED */ } static void filt_fifordetach(struct knote *kn) { struct socket *so; so = (struct socket *)kn->kn_hook; solock(so); if (selremove_knote(&so->so_rcv.sb_sel, kn)) so->so_rcv.sb_flags &= ~SB_KNOTE; sounlock(so); } static int filt_fiforead(struct knote *kn, long hint) { struct socket *so; int rv; so = (struct socket *)kn->kn_hook; if (hint != NOTE_SUBMIT) solock(so); kn->kn_data = so->so_rcv.sb_cc; if (so->so_state & SS_CANTRCVMORE) { knote_set_eof(kn, 0); rv = 1; } else { knote_clear_eof(kn); rv = (kn->kn_data >= so->so_rcv.sb_lowat); } if (hint != NOTE_SUBMIT) sounlock(so); return rv; } static void filt_fifowdetach(struct knote *kn) { struct socket *so; so = (struct socket *)kn->kn_hook; solock(so); if (selremove_knote(&so->so_snd.sb_sel, kn)) so->so_snd.sb_flags &= ~SB_KNOTE; sounlock(so); } static int filt_fifowrite(struct knote *kn, long hint) { struct socket *so; int rv; so = (struct socket *)kn->kn_hook; if (hint != NOTE_SUBMIT) solock(so); kn->kn_data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { knote_set_eof(kn, 0); rv = 1; } else { knote_clear_eof(kn); rv = (kn->kn_data >= so->so_snd.sb_lowat); } if (hint != NOTE_SUBMIT) sounlock(so); return rv; } static const struct filterops fiforead_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_fifordetach, .f_event = filt_fiforead, }; static const struct filterops fifowrite_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_fifowdetach, .f_event = filt_fifowrite, }; /* ARGSUSED */ static int fifo_kqfilter(void *v) { struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap = v; struct socket *so; struct sockbuf *sb; switch (ap->a_kn->kn_filter) { case EVFILT_READ: so = (struct socket *)ap->a_vp->v_fifoinfo->fi_readsock; ap->a_kn->kn_fop = &fiforead_filtops; sb = &so->so_rcv; break; case EVFILT_WRITE: so = (struct socket *)ap->a_vp->v_fifoinfo->fi_writesock; ap->a_kn->kn_fop = &fifowrite_filtops; sb = &so->so_snd; break; default: return (EINVAL); } ap->a_kn->kn_hook = so; solock(so); selrecord_knote(&sb->sb_sel, ap->a_kn); sb->sb_flags |= SB_KNOTE; sounlock(so); return (0); } int (**fifo_vnodeop_p)(void *); const struct vnodeopv_entry_desc fifo_vnodeop_entries[] = { { &vop_default_desc, vn_default_error }, { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ { &vop_lookup_desc, fifo_lookup }, /* lookup */ { &vop_create_desc, genfs_badop }, /* create */ { &vop_mknod_desc, genfs_badop }, /* mknod */ { &vop_open_desc, fifo_open }, /* open */ { &vop_close_desc, fifo_close }, /* close */ { &vop_access_desc, genfs_ebadf }, /* access */ { &vop_accessx_desc, genfs_accessx }, /* accessx */ { &vop_getattr_desc, genfs_ebadf }, /* getattr */ { &vop_setattr_desc, genfs_ebadf }, /* setattr */ { &vop_read_desc, fifo_read }, /* read */ { &vop_write_desc, fifo_write }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */ { &vop_ioctl_desc, fifo_ioctl }, /* ioctl */ { &vop_poll_desc, fifo_poll }, /* poll */ { &vop_kqfilter_desc, 
fifo_kqfilter }, /* kqfilter */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_mmap_desc, genfs_badop }, /* mmap */ { &vop_fsync_desc, genfs_nullop }, /* fsync */ { &vop_seek_desc, genfs_badop }, /* seek */ { &vop_remove_desc, genfs_badop }, /* remove */ { &vop_link_desc, genfs_badop }, /* link */ { &vop_rename_desc, genfs_badop }, /* rename */ { &vop_mkdir_desc, genfs_badop }, /* mkdir */ { &vop_rmdir_desc, genfs_badop }, /* rmdir */ { &vop_symlink_desc, genfs_badop }, /* symlink */ { &vop_readdir_desc, genfs_badop }, /* readdir */ { &vop_readlink_desc, genfs_badop }, /* readlink */ { &vop_abortop_desc, genfs_badop }, /* abortop */ { &vop_inactive_desc, fifo_inactive }, /* inactive */ { &vop_reclaim_desc, genfs_nullop }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, fifo_bmap }, /* bmap */ { &vop_strategy_desc, genfs_badop }, /* strategy */ { &vop_print_desc, fifo_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, fifo_pathconf }, /* pathconf */ { &vop_advlock_desc, genfs_einval }, /* advlock */ { &vop_bwrite_desc, genfs_nullop }, /* bwrite */ { &vop_putpages_desc, genfs_null_putpages }, /* putpages */ { NULL, NULL } }; const struct vnodeopv_desc fifo_vnodeop_opv_desc = { &fifo_vnodeop_p, fifo_vnodeop_entries };
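fifo_open() above is what gives FIFOs their POSIX open() semantics: a blocking open for read waits until a writer exists, a non-blocking open for read returns immediately, and a non-blocking open for write fails with ENXIO when there is no reader. The short userland sketch below (the path is invented for illustration) exercises those last two cases.

/*
 * Illustration only: observe fifo_open() behaviour from userland.
 */
#include <sys/stat.h>

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/tmp/demo.fifo";	/* made-up path */
	int fd;

	if (mkfifo(path, 0600) == -1 && errno != EEXIST)
		err(EXIT_FAILURE, "mkfifo");

	/* No reader yet: fifo_open() fails this case with ENXIO. */
	fd = open(path, O_WRONLY | O_NONBLOCK);
	if (fd == -1)
		printf("write open: %s\n", strerror(errno));

	/* A non-blocking read open does not wait for a writer. */
	fd = open(path, O_RDONLY | O_NONBLOCK);
	if (fd == -1)
		err(EXIT_FAILURE, "read open");
	close(fd);

	(void)unlink(path);
	return EXIT_SUCCESS;
}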
/* $NetBSD: ufs_readwrite.c,v 1.128 2022/02/21 17:07:45 hannken Exp $ */ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.128 2022/02/21 17:07:45 hannken Exp $"); #define FS struct fs #define I_FS i_fs #define READ ffs_read #define READ_S "ffs_read" #define WRITE ffs_write #define WRITE_S "ffs_write" #define BUFRD ffs_bufrd #define BUFWR ffs_bufwr #define ufs_blkoff ffs_blkoff #define ufs_blksize ffs_blksize #define ufs_lblkno ffs_lblkno #define ufs_lblktosize ffs_lblktosize #define ufs_blkroundup ffs_blkroundup static int ufs_post_read_update(struct vnode *, int, int); static int ufs_post_write_update(struct vnode *, struct uio *, int, kauth_cred_t, off_t, int, int); /* * Vnode op for reading. */ /* ARGSUSED */ int READ(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp; struct inode *ip; struct uio *uio; struct ufsmount *ump; vsize_t bytelen; int error, ioflag, advice; vp = ap->a_vp; ip = VTOI(vp); ump = ip->i_ump; uio = ap->a_uio; ioflag = ap->a_ioflag; error = 0; KASSERT(uio->uio_rw == UIO_READ); KASSERT(vp->v_type == VREG || vp->v_type == VDIR); /* XXX Eliminate me by refusing directory reads from userland. 
*/ if (vp->v_type == VDIR) return BUFRD(vp, uio, ioflag, ap->a_cred); if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize) return (EFBIG); if (uio->uio_resid == 0) return (0); if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT) return ffs_snapshot_read(vp, uio, ioflag); if (uio->uio_offset >= ip->i_size) goto out; KASSERT(vp->v_type == VREG); advice = IO_ADV_DECODE(ap->a_ioflag); while (uio->uio_resid > 0) { if (ioflag & IO_DIRECT) { genfs_directio(vp, uio, ioflag); } bytelen = MIN(ip->i_size - uio->uio_offset, uio->uio_resid); if (bytelen == 0) break; error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice, UBC_READ | UBC_PARTIALOK | UBC_VNODE_FLAGS(vp)); if (error) break; } out: error = ufs_post_read_update(vp, ap->a_ioflag, error); return (error); } /* * UFS op for reading via the buffer cache */ int BUFRD(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred) { struct inode *ip; struct ufsmount *ump; FS *fs; struct buf *bp; daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error; KASSERT(VOP_ISLOCKED(vp)); KASSERT(vp->v_type == VDIR || vp->v_type == VLNK); KASSERT(uio->uio_rw == UIO_READ); ip = VTOI(vp); ump = ip->i_ump; fs = ip->I_FS; error = 0; KASSERT(vp->v_type != VLNK || ip->i_size >= ump->um_maxsymlinklen); KASSERT(vp->v_type != VLNK || ump->um_maxsymlinklen != 0 || DIP(ip, blocks) != 0); if (uio->uio_offset > ump->um_maxfilesize) return EFBIG; if (uio->uio_resid == 0) return 0; KASSERT(!ISSET(ip->i_flags, (SF_SNAPSHOT | SF_SNAPINVAL))); if (uio->uio_offset >= ip->i_size) goto out; for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { bytesinfile = ip->i_size - uio->uio_offset; if (bytesinfile <= 0) break; lbn = ufs_lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = ufs_blksize(fs, ip, lbn); blkoffset = ufs_blkoff(fs, uio->uio_offset); xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), bytesinfile); if (ufs_lblktosize(fs, nextlbn) >= ip->i_size) error = bread(vp, lbn, size, 0, &bp); else { int nextsize = ufs_blksize(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, 0, &bp); } if (error) break; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); if (error) break; brelse(bp, 0); } if (bp != NULL) brelse(bp, 0); out: error = ufs_post_read_update(vp, ioflag, error); return (error); } static int ufs_post_read_update(struct vnode *vp, int ioflag, int oerror) { struct inode *ip = VTOI(vp); int error = oerror; if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { ip->i_flag |= IN_ACCESS; if ((ioflag & IO_SYNC) == IO_SYNC) { error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) goto out; error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); UFS_WAPBL_END(vp->v_mount); } } out: /* Read error overrides any inode update error. */ if (oerror) error = oerror; return error; } /* * Vnode op for writing. 
*/ int WRITE(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp; struct uio *uio; struct inode *ip; FS *fs; kauth_cred_t cred; off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize; int blkoffset, error, flags, ioflag, resid; int aflag; vsize_t bytelen; bool async; struct ufsmount *ump; cred = ap->a_cred; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; ip = VTOI(vp); ump = ip->i_ump; KASSERT(vp->v_size == ip->i_size); KASSERT(uio->uio_rw == UIO_WRITE); KASSERT(vp->v_type == VREG); KASSERT(!ISSET(ioflag, IO_JOURNALLOCKED)); UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount); if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); fs = ip->I_FS; if (uio->uio_offset < 0 || (u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize) return (EFBIG); if (uio->uio_resid == 0) return (0); flags = ioflag & IO_SYNC ? B_SYNC : 0; async = vp->v_mount->mnt_flag & MNT_ASYNC; origoff = uio->uio_offset; resid = uio->uio_resid; osize = ip->i_size; error = 0; KASSERT(vp->v_type == VREG); /* * XXX The entire write operation must occur in a single WAPBL * transaction because it may allocate disk blocks, if * appending or filling holes, which is allowed to happen only * if the write fully succeeds. * * If ubc_uiomove fails in the middle with EFAULT, we can clean * up at the end with UFS_TRUNCATE. But if the power fails in * the middle, there would be nobody to deallocate the blocks, * without an fsck to globally analyze the file system. * * If the increasingly inaccurately named WAPBL were augmented * with rollback records for block allocations, then we could * split this into multiple transactions and commit the * allocations in the last one. * * But WAPBL doesn't have that notion now, so we'll have to * live with gigantic transactions and WAPBL tentacles in * genfs_getpages/putpages to cope with the possibility that * the transaction may or may not be locked on entry to the * page cache. * * And even if we added that notion to WAPBL, it wouldn't help * us get rid of the tentacles in genfs_getpages/putpages * because we'd have to interoperate with old implementations * that assume they can replay the log without fsck. */ error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) { return error; } preallocoff = round_page(ufs_blkroundup(fs, MAX(osize, uio->uio_offset))); aflag = ioflag & IO_SYNC ? B_SYNC : 0; nsize = MAX(osize, uio->uio_offset + uio->uio_resid); endallocoff = nsize - ufs_blkoff(fs, nsize); /* * if we're increasing the file size, deal with expanding * the fragment if there is one. 
*/ if (nsize > osize && ufs_lblkno(fs, osize) < UFS_NDADDR && ufs_lblkno(fs, osize) != ufs_lblkno(fs, nsize) && ufs_blkroundup(fs, osize) != osize) { off_t eob; eob = ufs_blkroundup(fs, osize); uvm_vnp_setwritesize(vp, eob); error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag); if (error) goto out; if (flags & B_SYNC) { rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask), round_page(eob), PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); } } while (uio->uio_resid > 0) { int ubc_flags = UBC_WRITE; bool overwrite; /* if we're overwrite a whole block */ off_t newoff; if (ioflag & IO_DIRECT) { genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED); } oldoff = uio->uio_offset; blkoffset = ufs_blkoff(fs, uio->uio_offset); bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid); if (bytelen == 0) { break; } /* * if we're filling in a hole, allocate the blocks now and * initialize the pages first. if we're extending the file, * we can safely allocate blocks without initializing pages * since the new blocks will be inaccessible until the write * is complete. */ overwrite = uio->uio_offset >= preallocoff && uio->uio_offset < endallocoff; if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 && ufs_blkoff(fs, uio->uio_offset) == 0 && (uio->uio_offset & PAGE_MASK) == 0) { vsize_t len; len = trunc_page(bytelen); len -= ufs_blkoff(fs, len); if (len > 0) { overwrite = true; bytelen = len; } } newoff = oldoff + bytelen; if (vp->v_size < newoff) { uvm_vnp_setwritesize(vp, newoff); } if (!overwrite) { error = ufs_balloc_range(vp, uio->uio_offset, bytelen, cred, aflag); if (error) break; } else { genfs_node_wrlock(vp); error = GOP_ALLOC(vp, uio->uio_offset, bytelen, aflag, cred); genfs_node_unlock(vp); if (error) break; ubc_flags |= UBC_FAULTBUSY; } /* * copy the data. */ error = ubc_uiomove(&vp->v_uobj, uio, bytelen, IO_ADV_DECODE(ioflag), ubc_flags | UBC_VNODE_FLAGS(vp)); /* * update UVM's notion of the size now that we've * copied the data into the vnode's pages. * * we should update the size even when uiomove failed. */ if (vp->v_size < newoff) { uvm_vnp_setsize(vp, newoff); } if (error) break; /* * flush what we just wrote if necessary. * XXXUBC simplistic async flushing. 
*/ if (!async && oldoff >> 16 != uio->uio_offset >> 16) { rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16, (uio->uio_offset >> 16) << 16, PGO_CLEANIT | PGO_JOURNALLOCKED | PGO_LAZY); if (error) break; } } if (error == 0 && ioflag & IO_SYNC) { rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask), round_page(ufs_blkroundup(fs, uio->uio_offset)), PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); } out: error = ufs_post_write_update(vp, uio, ioflag, cred, osize, resid, error); UFS_WAPBL_END(vp->v_mount); return (error); } /* * UFS op for writing via the buffer cache */ int BUFWR(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred) { struct inode *ip; struct ufsmount *ump; FS *fs; int flags; struct buf *bp; off_t osize; int resid, xfersize, size, blkoffset; daddr_t lbn; int error; KASSERT(ISSET(ioflag, IO_NODELOCKED)); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(vp->v_type == VDIR || vp->v_type == VLNK); KASSERT(vp->v_type != VDIR || ISSET(ioflag, IO_SYNC)); KASSERT(uio->uio_rw == UIO_WRITE); KASSERT(ISSET(ioflag, IO_JOURNALLOCKED)); UFS_WAPBL_JLOCK_ASSERT(vp->v_mount); ip = VTOI(vp); ump = ip->i_ump; fs = ip->I_FS; KASSERT(vp->v_size == ip->i_size); if (uio->uio_offset < 0 || uio->uio_resid > ump->um_maxfilesize || uio->uio_offset > (ump->um_maxfilesize - uio->uio_resid)) return EFBIG; if (uio->uio_resid == 0) return 0; flags = ioflag & IO_SYNC ? B_SYNC : 0; resid = uio->uio_resid; osize = ip->i_size; error = 0; KASSERT(vp->v_type != VREG); /* XXX Should never have pages cached here. */ KASSERT(vp->v_uobj.uo_npages == 0); while (uio->uio_resid > 0) { lbn = ufs_lblkno(fs, uio->uio_offset); blkoffset = ufs_blkoff(fs, uio->uio_offset); xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid); if (fs->fs_bsize > xfersize) flags |= B_CLRBUF; else flags &= ~B_CLRBUF; error = UFS_BALLOC(vp, uio->uio_offset, xfersize, cred, flags, &bp); if (error) break; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; DIP_ASSIGN(ip, size, ip->i_size); uvm_vnp_setsize(vp, ip->i_size); } size = ufs_blksize(fs, ip, lbn) - bp->b_resid; if (xfersize > size) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); /* * if we didn't clear the block and the uiomove failed, * the buf will now contain part of some other file, * so we need to invalidate it. */ if (error && (flags & B_CLRBUF) == 0) { brelse(bp, BC_INVAL); break; } if (ioflag & IO_SYNC) (void)bwrite(bp); else if (xfersize + blkoffset == fs->fs_bsize) bawrite(bp); else bdwrite(bp); if (error || xfersize == 0) break; } error = ufs_post_write_update(vp, uio, ioflag, cred, osize, resid, error); return (error); } static int ufs_post_write_update(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred, off_t osize, int resid, int oerror) { struct inode *ip = VTOI(vp); int error = oerror; /* Trigger ctime and mtime updates, and atime if MNT_RELATIME. */ ip->i_flag |= IN_CHANGE | IN_UPDATE; if (vp->v_mount->mnt_flag & MNT_RELATIME) ip->i_flag |= IN_ACCESS; /* * If we successfully wrote any data and we are not the superuser, * we clear the setuid and setgid bits as a precaution against * tampering. 
*/ if (resid > uio->uio_resid && cred) { if (ip->i_mode & ISUID) { if (kauth_authorize_vnode(cred, KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0) { ip->i_mode &= ~ISUID; DIP_ASSIGN(ip, mode, ip->i_mode); } } if (ip->i_mode & ISGID) { if (kauth_authorize_vnode(cred, KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0) { ip->i_mode &= ~ISGID; DIP_ASSIGN(ip, mode, ip->i_mode); } } } /* * Update the size on disk: truncate back to original size on * error, or reflect the new size on success. */ if (error) { (void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, cred); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); else UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); /* Make sure the vnode uvm size matches the inode file size. */ KASSERT(vp->v_size == ip->i_size); /* Write error overrides any inode update error. */ if (oerror) error = oerror; return error; }
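The read and write loops above all slice a transfer into pieces that never cross a filesystem block: ufs_lblkno() gives the logical block number, ufs_blkoff() the offset within that block, and each chunk is MIN(fs_bsize - blkoffset, resid) bytes. The standalone sketch below reproduces that arithmetic with an assumed 8 KB block size purely for illustration; the DEMO_* names are not part of the code above.

/*
 * Illustration only: how an I/O at an arbitrary offset is chunked
 * along block boundaries, mirroring the loops in READ/WRITE/BUFWR.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_BSIZE	8192		/* stands in for fs->fs_bsize */
#define DEMO_MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t offset = 12000;	/* starting file offset */
	uint64_t resid = 20000;		/* bytes left to transfer */

	while (resid > 0) {
		uint64_t lbn = offset / DEMO_BSIZE;	/* like ufs_lblkno() */
		uint64_t blkoff = offset % DEMO_BSIZE;	/* like ufs_blkoff() */
		uint64_t xfer = DEMO_MIN(DEMO_BSIZE - blkoff, resid);

		printf("block %llu: copy %llu bytes at in-block offset %llu\n",
		    (unsigned long long)lbn, (unsigned long long)xfer,
		    (unsigned long long)blkoff);
		offset += xfer;
		resid -= xfer;
	}
	return 0;
}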
/* $NetBSD: kern_lwp.c,v 1.269 2023/12/20 21:03:50 andvar Exp $ */ /*- * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2019, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved.
* * This code is derived from software contributed to The NetBSD Foundation * by Nathan J. Williams, and Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Overview * * Lightweight processes (LWPs) are the basic unit or thread of * execution within the kernel. The core state of an LWP is described * by "struct lwp", also known as lwp_t. * * Each LWP is contained within a process (described by "struct proc"). * Every process contains at least one LWP, but may contain more. The * process describes attributes shared among all of its LWPs such as a * private address space, global execution state (stopped, active, * zombie, ...), signal disposition and so on. On a multiprocessor * machine, multiple LWPs may be executing concurrently in the kernel. * * Execution states * * At any given time, an LWP has overall state that is described by * lwp::l_stat. The states are broken into two sets below. The first * set is guaranteed to represent the absolute, current state of the * LWP: * * LSONPROC * * On processor: the LWP is executing on a CPU, either in the * kernel or in user space. * * LSRUN * * Runnable: the LWP is parked on a run queue, and may soon be * chosen to run by an idle processor, or by a processor that * has been asked to preempt a currently running but lower * priority LWP. * * LSIDL * * Idle: the LWP has been created but has not yet executed, or * it has ceased executing a unit of work and is waiting to be * started again. This state exists so that the LWP can occupy * a slot in the process & PID table, but without having to * worry about being touched; lookups of the LWP by ID will * fail while in this state. The LWP will become visible for * lookup once its state transitions further. Some special * kernel threads also (ab)use this state to indicate that they * are idle (soft interrupts and idle LWPs). * * LSSUSPENDED: * * Suspended: the LWP has had its execution suspended by * another LWP in the same process using the _lwp_suspend() * system call. User-level LWPs also enter the suspended * state when the system is shutting down. * * The second set represents a "statement of intent" on behalf of the * LWP. The LWP may in fact be executing on a processor, may be * sleeping or idle.
It is expected to take the necessary action to * stop executing or become "running" again within a short timeframe. * The LP_RUNNING flag in lwp::l_pflag indicates that an LWP is running. * Importantly, it indicates that its state is tied to a CPU. * * LSZOMB: * * Dead or dying: the LWP has released most of its resources * and is about to switch away into oblivion, or has already * switched away. When it switches away, its few remaining * resources can be collected. * * LSSLEEP: * * Sleeping: the LWP has entered itself onto a sleep queue, and * has switched away or will switch away shortly to allow other * LWPs to run on the CPU. * * LSSTOP: * * Stopped: the LWP has been stopped as a result of a job * control signal, or as a result of the ptrace() interface. * * Stopped LWPs may run briefly within the kernel to handle * signals that they receive, but will not return to user space * until their process' state is changed away from stopped. * * Single LWPs within a process cannot be set stopped * selectively: all actions that can stop or continue LWPs * occur at the process level. * * State transitions * * Note that the LSSTOP state may only be set when returning to * user space in userret(), or when sleeping interruptibly. The * LSSUSPENDED state may only be set in userret(). Before setting * those states, we try to ensure that the LWPs will release all * locks that they hold, and at a minimum try to ensure that the * LWP can be set runnable again by a signal. * * LWPs may transition states in the following ways: * * RUN -------> ONPROC ONPROC -----> RUN * > SLEEP * > STOPPED * > SUSPENDED * > ZOMB * > IDL (special cases) * * STOPPED ---> RUN SUSPENDED --> RUN * > SLEEP * * SLEEP -----> ONPROC IDL --------> RUN * > RUN > SUSPENDED * > STOPPED > STOPPED * > ONPROC (special cases) * * Some state transitions are only possible with kernel threads (e.g. * ONPROC -> IDL) and happen under tightly controlled circumstances * free of unwanted side effects. * * Migration * * Migration of threads from one CPU to another could be performed * internally by the scheduler via the sched_takecpu() or sched_catchlwp() * functions. The universal lwp_migrate() function should be used for * any other cases. Subsystems in the kernel must be aware that the CPU * of an LWP may change while it is not locked. * * Locking * * The majority of fields in 'struct lwp' are covered by a single, * general spin lock pointed to by lwp::l_mutex. The locks covering * each field are documented in sys/lwp.h. * * State transitions must be made with the LWP's general lock held, * and may cause the LWP's lock pointer to change. Manipulation of * the general lock is not performed directly, but through calls to * lwp_lock(), lwp_unlock() and others. It should be noted that the * adaptive locks are not allowed to be released while the LWP's lock * is being held (unlike for other spin-locks). * * States and their associated locks: * * LSIDL, LSONPROC, LSZOMB, LSSUSPENDED: * * Always covered by spc_lwplock, which protects LWPs not * associated with any other sync object. This is a per-CPU * lock and matches lwp::l_cpu. * * LSRUN: * * Always covered by spc_mutex, which protects the run queues. * This is a per-CPU lock and matches lwp::l_cpu. * * LSSLEEP: * * Covered by a lock associated with the sleep queue (sometimes * a turnstile sleep queue) that the LWP resides on. This can * be spc_lwplock for SOBJ_SLEEPQ_NULL (an "untracked" sleep).
* * LSSTOP: * * If the LWP was previously sleeping (l_wchan != NULL), then * l_mutex references the sleep queue lock. If the LWP was * runnable or on the CPU when halted, or has been removed from * the sleep queue since halted, then the lock is spc_lwplock. * * The lock order is as follows: * * sleepq -> turnstile -> spc_lwplock -> spc_mutex * * Each process has a scheduler state lock (proc::p_lock), and a * number of counters on LWPs and their states: p_nzlwps, p_nrlwps, and * so on. When an LWP is to be entered into or removed from one of the * following states, p_lock must be held and the process wide counters * adjusted: * * LSIDL, LSZOMB, LSSTOP, LSSUSPENDED * * (But not always for kernel threads. There are some special cases * as mentioned above: soft interrupts, and the idle loops.) * * Note that an LWP is considered running or likely to run soon if in * one of the following states. This affects the value of p_nrlwps: * * LSRUN, LSONPROC, LSSLEEP * * p_lock does not need to be held when transitioning among these * three states, hence p_lock is rarely taken for state transitions. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_lwp.c,v 1.269 2023/12/20 21:03:50 andvar Exp $"); #include "opt_ddb.h" #include "opt_lockdebug.h" #include "opt_dtrace.h" #define _LWP_API_PRIVATE #include <sys/param.h> #include <sys/atomic.h> #include <sys/cprng.h> #include <sys/cpu.h> #include <sys/dtrace_bsd.h> #include <sys/filedesc.h> #include <sys/fstrans.h> #include <sys/futex.h> #include <sys/intr.h> #include <sys/kauth.h> #include <sys/kcov.h> #include <sys/kmem.h> #include <sys/lockdebug.h> #include <sys/lwpctl.h> #include <sys/msan.h> #include <sys/pool.h> #include <sys/proc.h> #include <sys/pset.h> #include <sys/psref.h> #include <sys/ptrace.h> #include <sys/sdt.h> #include <sys/sleepq.h> #include <sys/syncobj.h> #include <sys/syscall_stats.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/uidinfo.h> #include <sys/xcall.h> #include <uvm/uvm_extern.h> #include <uvm/uvm_object.h> static pool_cache_t lwp_cache __read_mostly; struct lwplist alllwp __cacheline_aligned; static int lwp_ctor(void *, void *, int); static void lwp_dtor(void *, void *); /* DTrace proc provider probes */ SDT_PROVIDER_DEFINE(proc); SDT_PROBE_DEFINE1(proc, kernel, , lwp__create, "struct lwp *"); SDT_PROBE_DEFINE1(proc, kernel, , lwp__start, "struct lwp *"); SDT_PROBE_DEFINE1(proc, kernel, , lwp__exit, "struct lwp *"); struct turnstile turnstile0 __cacheline_aligned; struct lwp lwp0 __aligned(MIN_LWP_ALIGNMENT) = { #ifdef LWP0_CPU_INFO .l_cpu = LWP0_CPU_INFO, #endif #ifdef LWP0_MD_INITIALIZER .l_md = LWP0_MD_INITIALIZER, #endif .l_proc = &proc0, .l_lid = 0, /* we own proc0's slot in the pid table */ .l_flag = LW_SYSTEM, .l_stat = LSONPROC, .l_ts = &turnstile0, .l_syncobj = &sched_syncobj, .l_refcnt = 0, .l_priority = PRI_USER + NPRI_USER - 1, .l_inheritedprio = -1, .l_class = SCHED_OTHER, .l_psid = PS_NONE, .l_pi_lenders = SLIST_HEAD_INITIALIZER(&lwp0.l_pi_lenders), .l_name = __UNCONST("swapper"), .l_fd = &filedesc0, }; static int lwp_maxlwp(void) { /* Assume 1 LWP per 1MiB. */ uint64_t lwps_per = ctob(physmem) / (1024 * 1024); return MAX(MIN(MAXMAXLWP, lwps_per), MAXLWP); } static int sysctl_kern_maxlwp(SYSCTLFN_PROTO); /* * sysctl helper routine for kern.maxlwp. Ensures that the new * values are not too low or too high. 
*/ static int sysctl_kern_maxlwp(SYSCTLFN_ARGS) { int error, nmaxlwp; struct sysctlnode node; nmaxlwp = maxlwp; node = *rnode; node.sysctl_data = &nmaxlwp; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (nmaxlwp < 0 || nmaxlwp >= MAXMAXLWP) return EINVAL; if (nmaxlwp > lwp_maxlwp()) return EINVAL; maxlwp = nmaxlwp; return 0; } static void sysctl_kern_lwp_setup(void) { sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxlwp", SYSCTL_DESCR("Maximum number of simultaneous threads"), sysctl_kern_maxlwp, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); } void lwpinit(void) { LIST_INIT(&alllwp); lwpinit_specificdata(); /* * Provide a barrier to ensure that all mutex_oncpu() and rw_oncpu() * calls will exit before memory of LWPs is returned to the pool, where * KVA of LWP structure might be freed and re-used for other purposes. * Kernel preemption is disabled around mutex_oncpu() and rw_oncpu() * callers, therefore a regular passive serialization barrier will * do the job. */ lwp_cache = pool_cache_init(sizeof(lwp_t), MIN_LWP_ALIGNMENT, 0, PR_PSERIALIZE, "lwppl", NULL, IPL_NONE, lwp_ctor, lwp_dtor, NULL); maxlwp = lwp_maxlwp(); sysctl_kern_lwp_setup(); } void lwp0_init(void) { struct lwp *l = &lwp0; KASSERT((void *)uvm_lwp_getuarea(l) != NULL); LIST_INSERT_HEAD(&alllwp, l, l_list); callout_init(&l->l_timeout_ch, CALLOUT_MPSAFE); callout_setfunc(&l->l_timeout_ch, sleepq_timeout, l); cv_init(&l->l_sigcv, "sigwait"); cv_init(&l->l_waitcv, "vfork"); l->l_cred = kauth_cred_hold(proc0.p_cred); kdtrace_thread_ctor(NULL, l); lwp_initspecific(l); SYSCALL_TIME_LWP_INIT(l); } /* * Initialize the non-zeroed portion of an lwp_t. */ static int lwp_ctor(void *arg, void *obj, int flags) { lwp_t *l = obj; l->l_stat = LSIDL; l->l_cpu = curcpu(); l->l_mutex = l->l_cpu->ci_schedstate.spc_lwplock; l->l_ts = kmem_alloc(sizeof(*l->l_ts), flags == PR_WAITOK ? KM_SLEEP : KM_NOSLEEP); if (l->l_ts == NULL) { return ENOMEM; } else { turnstile_ctor(l->l_ts); return 0; } } static void lwp_dtor(void *arg, void *obj) { lwp_t *l = obj; /* * The value of l->l_cpu must still be valid at this point. */ KASSERT(l->l_cpu != NULL); /* * We can't return turnstile0 to the pool (it didn't come from it), * so if it comes up just drop it quietly and move on. */ if (l->l_ts != &turnstile0) kmem_free(l->l_ts, sizeof(*l->l_ts)); } /* * Set an LWP suspended. * * Must be called with p_lock held, and the LWP locked. Will unlock the * LWP before return. */ int lwp_suspend(struct lwp *curl, struct lwp *t) { int error; KASSERT(mutex_owned(t->l_proc->p_lock)); KASSERT(lwp_locked(t, NULL)); KASSERT(curl != t || curl->l_stat == LSONPROC); /* * If the current LWP has been told to exit, we must not suspend anyone * else or deadlock could occur. We won't return to userspace. */ if ((curl->l_flag & (LW_WEXIT | LW_WCORE)) != 0) { lwp_unlock(t); return (EDEADLK); } if ((t->l_flag & LW_DBGSUSPEND) != 0) { lwp_unlock(t); return 0; } error = 0; switch (t->l_stat) { case LSRUN: case LSONPROC: t->l_flag |= LW_WSUSPEND; lwp_need_userret(t); lwp_unlock(t); break; case LSSLEEP: t->l_flag |= LW_WSUSPEND; lwp_need_userret(t); /* * Kick the LWP and try to get it to the kernel boundary * so that it will release any locks that it holds. * setrunnable() will release the lock. 
*/ if ((t->l_flag & LW_SINTR) != 0) setrunnable(t); else lwp_unlock(t); break; case LSSUSPENDED: lwp_unlock(t); break; case LSSTOP: t->l_flag |= LW_WSUSPEND; lwp_need_userret(t); setrunnable(t); break; case LSIDL: case LSZOMB: error = EINTR; /* It's what Solaris does..... */ lwp_unlock(t); break; } return (error); } /* * Restart a suspended LWP. * * Must be called with p_lock held, and the LWP locked. Will unlock the * LWP before return. */ void lwp_continue(struct lwp *l) { KASSERT(mutex_owned(l->l_proc->p_lock)); KASSERT(lwp_locked(l, NULL)); /* If rebooting or not suspended, then just bail out. */ if ((l->l_flag & LW_WREBOOT) != 0) { lwp_unlock(l); return; } l->l_flag &= ~LW_WSUSPEND; if (l->l_stat != LSSUSPENDED || (l->l_flag & LW_DBGSUSPEND) != 0) { lwp_unlock(l); return; } /* setrunnable() will release the lock. */ setrunnable(l); } /* * Restart a stopped LWP. * * Must be called with p_lock held, and the LWP NOT locked. Will unlock the * LWP before return. */ void lwp_unstop(struct lwp *l) { struct proc *p = l->l_proc; KASSERT(mutex_owned(&proc_lock)); KASSERT(mutex_owned(p->p_lock)); lwp_lock(l); KASSERT((l->l_flag & LW_DBGSUSPEND) == 0); /* If not stopped, then just bail out. */ if (l->l_stat != LSSTOP) { lwp_unlock(l); return; } p->p_stat = SACTIVE; p->p_sflag &= ~PS_STOPPING; if (!p->p_waited) p->p_pptr->p_nstopchild--; if (l->l_wchan == NULL) { /* setrunnable() will release the lock. */ setrunnable(l); } else if (p->p_xsig && (l->l_flag & LW_SINTR) != 0) { /* setrunnable() so we can receive the signal */ setrunnable(l); } else { l->l_stat = LSSLEEP; p->p_nrlwps++; lwp_unlock(l); } } /* * Wait for an LWP within the current process to exit. If 'lid' is * non-zero, we are waiting for a specific LWP. * * Must be called with p->p_lock held. */ int lwp_wait(struct lwp *l, lwpid_t lid, lwpid_t *departed, bool exiting) { const lwpid_t curlid = l->l_lid; proc_t *p = l->l_proc; lwp_t *l2, *next; int error; KASSERT(mutex_owned(p->p_lock)); p->p_nlwpwait++; l->l_waitingfor = lid; for (;;) { int nfound; /* * Avoid a race between exit1() and sigexit(): if the * process is dumping core, then we need to bail out: call * into lwp_userret() where we will be suspended until the * deed is done. */ if ((p->p_sflag & PS_WCORE) != 0) { mutex_exit(p->p_lock); lwp_userret(l); KASSERT(false); } /* * First off, drain any detached LWP that is waiting to be * reaped. */ if ((l2 = p->p_zomblwp) != NULL) { p->p_zomblwp = NULL; lwp_free(l2, false, false);/* releases proc mutex */ mutex_enter(p->p_lock); continue; } /* * Now look for an LWP to collect. If the whole process is * exiting, count detached LWPs as eligible to be collected, * but don't drain them here. */ nfound = 0; error = 0; /* * If given a specific LID, go via pid_table and make sure * it's not detached. */ if (lid != 0) { l2 = proc_find_lwp(p, lid); if (l2 == NULL) { error = ESRCH; break; } KASSERT(l2->l_lid == lid); if ((l2->l_prflag & LPR_DETACHED) != 0) { error = EINVAL; break; } } else { l2 = LIST_FIRST(&p->p_lwps); } for (; l2 != NULL; l2 = next) { next = (lid != 0 ? NULL : LIST_NEXT(l2, l_sibling)); /* * If a specific wait and the target is waiting on * us, then avoid deadlock. This also traps LWPs * that try to wait on themselves. * * Note that this does not handle more complicated * cycles, like: t1 -> t2 -> t3 -> t1. The process * can still be killed so it is not a major problem. 
*/ if (l2->l_lid == lid && l2->l_waitingfor == curlid) { error = EDEADLK; break; } if (l2 == l) continue; if ((l2->l_prflag & LPR_DETACHED) != 0) { nfound += exiting; continue; } if (lid != 0) { /* * Mark this LWP as the first waiter, if there * is no other. */ if (l2->l_waiter == 0) l2->l_waiter = curlid; } else if (l2->l_waiter != 0) { /* * It already has a waiter - so don't * collect it. If the waiter doesn't * grab it we'll get another chance * later. */ nfound++; continue; } nfound++; /* No need to lock the LWP in order to see LSZOMB. */ if (l2->l_stat != LSZOMB) continue; /* * We're no longer waiting. Reset the "first waiter" * pointer on the target, in case it was us. */ l->l_waitingfor = 0; l2->l_waiter = 0; p->p_nlwpwait--; if (departed) *departed = l2->l_lid; sched_lwp_collect(l2); /* lwp_free() releases the proc lock. */ lwp_free(l2, false, false); mutex_enter(p->p_lock); return 0; } if (error != 0) break; if (nfound == 0) { error = ESRCH; break; } /* * Note: since the lock will be dropped, need to restart on * wakeup to run all LWPs again, e.g. there may be new LWPs. */ if (exiting) { KASSERT(p->p_nlwps > 1); error = cv_timedwait(&p->p_lwpcv, p->p_lock, 1); break; } /* * Break out if all LWPs are in _lwp_wait(). There are * other ways to hang the process with _lwp_wait(), but the * sleep is interruptable so little point checking for them. */ if (p->p_nlwpwait == p->p_nlwps) { error = EDEADLK; break; } /* * Sit around and wait for something to happen. We'll be * awoken if any of the conditions examined change: if an * LWP exits, is collected, or is detached. */ if ((error = cv_wait_sig(&p->p_lwpcv, p->p_lock)) != 0) break; } /* * We didn't find any LWPs to collect, we may have received a * signal, or some other condition has caused us to bail out. * * If waiting on a specific LWP, clear the waiters marker: some * other LWP may want it. Then, kick all the remaining waiters * so that they can re-check for zombies and for deadlock. */ if (lid != 0) { l2 = proc_find_lwp(p, lid); KASSERT(l2 == NULL || l2->l_lid == lid); if (l2 != NULL && l2->l_waiter == curlid) l2->l_waiter = 0; } p->p_nlwpwait--; l->l_waitingfor = 0; cv_broadcast(&p->p_lwpcv); return error; } /* * Create a new LWP within process 'p2', using LWP 'l1' as a template. * The new LWP is created in state LSIDL and must be set running, * suspended, or stopped by the caller. */ int lwp_create(lwp_t *l1, proc_t *p2, vaddr_t uaddr, int flags, void *stack, size_t stacksize, void (*func)(void *), void *arg, lwp_t **rnewlwpp, int sclass, const sigset_t *sigmask, const stack_t *sigstk) { struct lwp *l2; KASSERT(l1 == curlwp || l1->l_proc == &proc0); /* * Enforce limits, excluding the first lwp and kthreads. We must * use the process credentials here when adjusting the limit, as * they are what's tied to the accounting entity. However for * authorizing the action, we'll use the LWP's credentials. */ mutex_enter(p2->p_lock); if (p2->p_nlwps != 0 && p2 != &proc0) { uid_t uid = kauth_cred_getuid(p2->p_cred); int count = chglwpcnt(uid, 1); if (__predict_false(count > p2->p_rlimit[RLIMIT_NTHR].rlim_cur)) { if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT, p2, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS), &p2->p_rlimit[RLIMIT_NTHR], KAUTH_ARG(RLIMIT_NTHR)) != 0) { (void)chglwpcnt(uid, -1); mutex_exit(p2->p_lock); return EAGAIN; } } } /* * First off, reap any detached LWP waiting to be collected. * We can re-use its LWP structure and turnstile. 
*/ if ((l2 = p2->p_zomblwp) != NULL) { p2->p_zomblwp = NULL; lwp_free(l2, true, false); /* p2 now unlocked by lwp_free() */ KASSERT(l2->l_ts != NULL); KASSERT(l2->l_inheritedprio == -1); KASSERT(SLIST_EMPTY(&l2->l_pi_lenders)); memset(&l2->l_startzero, 0, sizeof(*l2) - offsetof(lwp_t, l_startzero)); } else { mutex_exit(p2->p_lock); l2 = pool_cache_get(lwp_cache, PR_WAITOK); memset(&l2->l_startzero, 0, sizeof(*l2) - offsetof(lwp_t, l_startzero)); SLIST_INIT(&l2->l_pi_lenders); } /* * Because of lockless lookup via pid_table, the LWP can be locked * and inspected briefly even after it's freed, so a few fields are * kept stable. */ KASSERT(l2->l_stat == LSIDL); KASSERT(l2->l_cpu != NULL); KASSERT(l2->l_ts != NULL); KASSERT(l2->l_mutex == l2->l_cpu->ci_schedstate.spc_lwplock); l2->l_proc = p2; l2->l_refcnt = 0; l2->l_class = sclass; /* * Allocate a process ID for this LWP. We need to do this now * while we can still unwind if it fails. Because we're marked * as LSIDL, no lookups by the ID will succeed. * * N.B. this will always succeed for the first LWP in a process, * because proc_alloc_lwpid() will usurp the slot. Also note * that l2->l_proc MUST be valid so that lookups of the proc * will succeed, even if the LWP itself is not visible. */ if (__predict_false(proc_alloc_lwpid(p2, l2) == -1)) { pool_cache_put(lwp_cache, l2); return EAGAIN; } /* * If vfork(), we want the LWP to run fast and on the same CPU * as its parent, so that it can reuse the VM context and cache * footprint on the local CPU. */ l2->l_boostpri = ((flags & LWP_VFORK) ? PRI_KERNEL : PRI_USER); l2->l_priority = l1->l_priority; l2->l_inheritedprio = -1; l2->l_protectprio = -1; l2->l_auxprio = -1; l2->l_flag = 0; l2->l_pflag = LP_MPSAFE; TAILQ_INIT(&l2->l_ld_locks); l2->l_psrefs = 0; kmsan_lwp_alloc(l2); /* * For vfork, borrow parent's lwpctl context if it exists. * This also causes us to return via lwp_userret. */ if (flags & LWP_VFORK && l1->l_lwpctl) { l2->l_lwpctl = l1->l_lwpctl; l2->l_flag |= LW_LWPCTL; } /* * If not the first LWP in the process, grab a reference to the * descriptor table. */ l2->l_fd = p2->p_fd; if (p2->p_nlwps != 0) { KASSERT(l1->l_proc == p2); fd_hold(l2); } else { KASSERT(l1->l_proc != p2); } if (p2->p_flag & PK_SYSTEM) { /* Mark it as a system LWP. */ l2->l_flag |= LW_SYSTEM; } kdtrace_thread_ctor(NULL, l2); lwp_initspecific(l2); sched_lwp_fork(l1, l2); callout_init(&l2->l_timeout_ch, CALLOUT_MPSAFE); callout_setfunc(&l2->l_timeout_ch, sleepq_timeout, l2); cv_init(&l2->l_sigcv, "sigwait"); cv_init(&l2->l_waitcv, "vfork"); l2->l_syncobj = &sched_syncobj; PSREF_DEBUG_INIT_LWP(l2); if (rnewlwpp != NULL) *rnewlwpp = l2; /* * PCU state needs to be saved before calling uvm_lwp_fork() so that * the MD cpu_lwp_fork() can copy the saved state to the new LWP. */ pcu_save_all(l1); #if PCU_UNIT_COUNT > 0 l2->l_pcu_valid = l1->l_pcu_valid; #endif uvm_lwp_setuarea(l2, uaddr); uvm_lwp_fork(l1, l2, stack, stacksize, func, (arg != NULL) ? arg : l2); mutex_enter(p2->p_lock); l2->l_cred = kauth_cred_hold(p2->p_cred); if ((flags & LWP_DETACHED) != 0) { l2->l_prflag = LPR_DETACHED; p2->p_ndlwps++; } else l2->l_prflag = 0; if (l1->l_proc == p2) { /* * These flags are set while p_lock is held. Copy with * p_lock held too, so the LWP doesn't sneak into the * process without them being set. */ l2->l_flag |= (l1->l_flag & (LW_WEXIT | LW_WREBOOT | LW_WCORE)); } else { /* fork(): pending core/exit doesn't apply to child. 
*/ l2->l_flag |= (l1->l_flag & LW_WREBOOT); } l2->l_sigstk = *sigstk; l2->l_sigmask = *sigmask; TAILQ_INIT(&l2->l_sigpend.sp_info); sigemptyset(&l2->l_sigpend.sp_set); LIST_INSERT_HEAD(&p2->p_lwps, l2, l_sibling); p2->p_nlwps++; p2->p_nrlwps++; KASSERT(l2->l_affinity == NULL); /* Inherit the affinity mask. */ if (l1->l_affinity) { /* * Note that we hold the state lock while inheriting * the affinity to avoid race with sched_setaffinity(). */ lwp_lock(l1); if (l1->l_affinity) { kcpuset_use(l1->l_affinity); l2->l_affinity = l1->l_affinity; } lwp_unlock(l1); } /* Ensure a trip through lwp_userret() if needed. */ if ((l2->l_flag & LW_USERRET) != 0) { lwp_need_userret(l2); } /* This marks the end of the "must be atomic" section. */ mutex_exit(p2->p_lock); SDT_PROBE(proc, kernel, , lwp__create, l2, 0, 0, 0, 0); mutex_enter(&proc_lock); LIST_INSERT_HEAD(&alllwp, l2, l_list); /* Inherit a processor-set */ l2->l_psid = l1->l_psid; mutex_exit(&proc_lock); SYSCALL_TIME_LWP_INIT(l2); if (p2->p_emul->e_lwp_fork) (*p2->p_emul->e_lwp_fork)(l1, l2); return (0); } /* * Set a new LWP running. If the process is stopping, then the LWP is * created stopped. */ void lwp_start(lwp_t *l, int flags) { proc_t *p = l->l_proc; mutex_enter(p->p_lock); lwp_lock(l); KASSERT(l->l_stat == LSIDL); if ((flags & LWP_SUSPENDED) != 0) { /* It'll suspend itself in lwp_userret(). */ l->l_flag |= LW_WSUSPEND; lwp_need_userret(l); } if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) { KASSERT(l->l_wchan == NULL); l->l_stat = LSSTOP; p->p_nrlwps--; lwp_unlock(l); } else { setrunnable(l); /* LWP now unlocked */ } mutex_exit(p->p_lock); } /* * Called by MD code when a new LWP begins execution. Must be called * with the previous LWP locked (so at splsched), or if there is no * previous LWP, at splsched. */ void lwp_startup(struct lwp *prev, struct lwp *new_lwp) { kmutex_t *lock; KASSERTMSG(new_lwp == curlwp, "l %p curlwp %p prevlwp %p", new_lwp, curlwp, prev); KASSERT(kpreempt_disabled()); KASSERT(prev != NULL); KASSERT((prev->l_pflag & LP_RUNNING) != 0); KASSERT(curcpu()->ci_mtx_count == -2); /* * Immediately mark the previous LWP as no longer running and * unlock (to keep lock wait times short as possible). If a * zombie, don't touch after clearing LP_RUNNING as it could be * reaped by another CPU. Use atomic_store_release to ensure * this -- matches atomic_load_acquire in lwp_free. */ lock = prev->l_mutex; if (__predict_false(prev->l_stat == LSZOMB)) { atomic_store_release(&prev->l_pflag, prev->l_pflag & ~LP_RUNNING); } else { prev->l_pflag &= ~LP_RUNNING; } mutex_spin_exit(lock); /* Correct spin mutex count after mi_switch(). */ curcpu()->ci_mtx_count = 0; /* Install new VM context. */ if (__predict_true(new_lwp->l_proc->p_vmspace)) { pmap_activate(new_lwp); } /* We remain at IPL_SCHED from mi_switch() - reset it. */ spl0(); LOCKDEBUG_BARRIER(NULL, 0); SDT_PROBE(proc, kernel, , lwp__start, new_lwp, 0, 0, 0, 0); /* For kthreads, acquire kernel lock if not MPSAFE. */ if (__predict_false((new_lwp->l_pflag & LP_MPSAFE) == 0)) { KERNEL_LOCK(1, new_lwp); } } /* * Exit an LWP. * * *** WARNING *** This can be called with (l != curlwp) in error paths. */ void lwp_exit(struct lwp *l) { struct proc *p = l->l_proc; struct lwp *l2; bool current; current = (l == curlwp); KASSERT(current || l->l_stat == LSIDL); KASSERT(current || l->l_target_cpu == NULL); KASSERT(p == curproc); SDT_PROBE(proc, kernel, , lwp__exit, l, 0, 0, 0, 0); /* Verify that we hold no locks; for DIAGNOSTIC check kernel_lock. 
*/ LOCKDEBUG_BARRIER(NULL, 0); KASSERTMSG(curcpu()->ci_biglock_count == 0, "kernel_lock leaked"); /* * If we are the last live LWP in a process, we need to exit the * entire process. We do so with an exit status of zero, because * it's a "controlled" exit, and because that's what Solaris does. * * We are not quite a zombie yet, but for accounting purposes we * must increment the count of zombies here. * * Note: the last LWP's specificdata will be deleted here. */ mutex_enter(p->p_lock); if (p->p_nlwps - p->p_nzlwps == 1) { KASSERT(current == true); KASSERT(p != &proc0); exit1(l, 0, 0); /* NOTREACHED */ } p->p_nzlwps++; /* * Perform any required thread cleanup. Do this early so * anyone wanting to look us up with lwp_getref_lwpid() will * fail to find us before we become a zombie. * * N.B. this will unlock p->p_lock on our behalf. */ lwp_thread_cleanup(l); if (p->p_emul->e_lwp_exit) (*p->p_emul->e_lwp_exit)(l); /* Drop filedesc reference. */ fd_free(); /* Release fstrans private data. */ fstrans_lwp_dtor(l); /* Delete the specificdata while it's still safe to sleep. */ lwp_finispecific(l); /* * Release our cached credentials. */ kauth_cred_free(l->l_cred); callout_destroy(&l->l_timeout_ch); /* * If traced, report LWP exit event to the debugger. * * Remove the LWP from the global list. * Free its LID from the PID namespace if needed. */ mutex_enter(&proc_lock); if ((p->p_slflag & (PSL_TRACED|PSL_TRACELWP_EXIT)) == (PSL_TRACED|PSL_TRACELWP_EXIT)) { mutex_enter(p->p_lock); if (ISSET(p->p_sflag, PS_WEXIT)) { mutex_exit(p->p_lock); /* * We are exiting, bail out without informing parent * about a terminating LWP as it would deadlock. */ } else { eventswitch(TRAP_LWP, PTRACE_LWP_EXIT, l->l_lid); mutex_enter(&proc_lock); } } LIST_REMOVE(l, l_list); mutex_exit(&proc_lock); /* * Get rid of all references to the LWP that others (e.g. procfs) * may have, and mark the LWP as a zombie. If the LWP is detached, * mark it waiting for collection in the proc structure. Note that * before we can do that, we need to free any other dead, detached * LWP waiting to meet its maker. * * All conditions need to be observed upon under the same hold of * p_lock, because if the lock is dropped any of them can change. */ mutex_enter(p->p_lock); for (;;) { if (lwp_drainrefs(l)) continue; if ((l->l_prflag & LPR_DETACHED) != 0) { if ((l2 = p->p_zomblwp) != NULL) { p->p_zomblwp = NULL; lwp_free(l2, false, false); /* proc now unlocked */ mutex_enter(p->p_lock); continue; } p->p_zomblwp = l; } break; } /* * If we find a pending signal for the process and we have been * asked to check for signals, then we lose: arrange to have * all other LWPs in the process check for signals. */ if ((l->l_flag & LW_PENDSIG) != 0 && firstsig(&p->p_sigpend.sp_set) != 0) { LIST_FOREACH(l2, &p->p_lwps, l_sibling) { lwp_lock(l2); signotify(l2); lwp_unlock(l2); } } /* * Release any PCU resources before becoming a zombie. */ pcu_discard_all(l); lwp_lock(l); l->l_stat = LSZOMB; if (l->l_name != NULL) { strcpy(l->l_name, "(zombie)"); } lwp_unlock(l); p->p_nrlwps--; if (l->l_lwpctl != NULL) l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED; mutex_exit(p->p_lock); cv_broadcast(&p->p_lwpcv); /* * We can no longer block. At this point, lwp_free() may already * be gunning for us. On a multi-CPU system, we may be off p_lwps. * * Free MD LWP resources. */ cpu_lwp_free(l, 0); if (current) { /* Switch away into oblivion. */ lwp_lock(l); spc_lock(l->l_cpu); mi_switch(l); panic("lwp_exit"); } } /* * Free a dead LWP's remaining resources. * * XXXLWP limits. 
*/ void lwp_free(struct lwp *l, bool recycle, bool last) { struct proc *p = l->l_proc; struct rusage *ru; ksiginfoq_t kq; KASSERT(l != curlwp); KASSERT(last || mutex_owned(p->p_lock)); /* * We use the process credentials instead of the lwp credentials here * because the lwp credentials may be cached (just after a setuid call) * and we don't want to pay for syncing, since the lwp is going away * anyway. */ if (p != &proc0 && p->p_nlwps != 1) (void)chglwpcnt(kauth_cred_getuid(p->p_cred), -1); /* * In the unlikely event that the LWP is still on the CPU, * then spin until it has switched away. * * atomic_load_acquire matches atomic_store_release in * lwp_startup and mi_switch. */ while (__predict_false((atomic_load_acquire(&l->l_pflag) & LP_RUNNING) != 0)) { SPINLOCK_BACKOFF_HOOK; } /* * Now that the LWP's known off the CPU, reset its state back to * LSIDL, which defeats anything that might have gotten a hold on * the LWP via pid_table before the ID was freed. It's important * to do this with both the LWP locked and p_lock held. * * Also reset the CPU and lock pointer back to curcpu(), since the * LWP will in all likelihood be cached with the current CPU in * lwp_cache when we free it and later allocated from there again * (avoid incidental lock contention). */ lwp_lock(l); l->l_stat = LSIDL; l->l_cpu = curcpu(); lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_lwplock); /* * If this was not the last LWP in the process, then adjust counters * and unlock. This is done differently for the last LWP in exit1(). */ if (!last) { /* * Add the LWP's run time to the process' base value. * This needs to coincide with coming off p_lwps. */ bintime_add(&p->p_rtime, &l->l_rtime); p->p_pctcpu += l->l_pctcpu; ru = &p->p_stats->p_ru; ruadd(ru, &l->l_ru); LIST_REMOVE(l, l_sibling); p->p_nlwps--; p->p_nzlwps--; if ((l->l_prflag & LPR_DETACHED) != 0) p->p_ndlwps--; mutex_exit(p->p_lock); /* * Have any LWPs sleeping in lwp_wait() recheck for * deadlock. */ cv_broadcast(&p->p_lwpcv); /* Free the LWP ID. */ mutex_enter(&proc_lock); proc_free_lwpid(p, l->l_lid); mutex_exit(&proc_lock); } /* * Destroy the LWP's remaining signal information. */ ksiginfo_queue_init(&kq); sigclear(&l->l_sigpend, NULL, &kq); ksiginfo_queue_drain(&kq); cv_destroy(&l->l_sigcv); cv_destroy(&l->l_waitcv); /* * Free lwpctl structure and affinity. */ if (l->l_lwpctl) { lwp_ctl_free(l); } if (l->l_affinity) { kcpuset_unuse(l->l_affinity, NULL); l->l_affinity = NULL; } /* * Free remaining data structures and the LWP itself unless the * caller wants to recycle. */ if (l->l_name != NULL) kmem_free(l->l_name, MAXCOMLEN); kmsan_lwp_free(l); kcov_lwp_free(l); cpu_lwp_free2(l); uvm_lwp_exit(l); KASSERT(SLIST_EMPTY(&l->l_pi_lenders)); KASSERT(l->l_inheritedprio == -1); KASSERT(l->l_blcnt == 0); kdtrace_thread_dtor(NULL, l); if (!recycle) pool_cache_put(lwp_cache, l); } /* * Migrate the LWP to another CPU. Unlocks the LWP. */ void lwp_migrate(lwp_t *l, struct cpu_info *tci) { struct schedstate_percpu *tspc; int lstat = l->l_stat; KASSERT(lwp_locked(l, NULL)); KASSERT(tci != NULL); /* If LWP is still on the CPU, it must be handled like LSONPROC */ if ((l->l_pflag & LP_RUNNING) != 0) { lstat = LSONPROC; } /* * The destination CPU could be changed while the previous migration * was not yet finished.
*/ if (l->l_target_cpu != NULL) { l->l_target_cpu = tci; lwp_unlock(l); return; } /* Nothing to do if trying to migrate to the same CPU */ if (l->l_cpu == tci) { lwp_unlock(l); return; } KASSERT(l->l_target_cpu == NULL); tspc = &tci->ci_schedstate; switch (lstat) { case LSRUN: l->l_target_cpu = tci; break; case LSSLEEP: l->l_cpu = tci; break; case LSIDL: case LSSTOP: case LSSUSPENDED: l->l_cpu = tci; if (l->l_wchan == NULL) { lwp_unlock_to(l, tspc->spc_lwplock); return; } break; case LSONPROC: l->l_target_cpu = tci; spc_lock(l->l_cpu); sched_resched_cpu(l->l_cpu, PRI_USER_RT, true); /* spc now unlocked */ break; } lwp_unlock(l); } #define lwp_find_exclude(l) \ ((l)->l_stat == LSIDL || (l)->l_stat == LSZOMB) /* * Find the LWP in the process. Arguments may be zero, in such case, * the calling process and first LWP in the list will be used. * On success - returns proc locked. * * => pid == 0 -> look in curproc. * => pid == -1 -> match any proc. * => otherwise look up the proc. * * => lid == 0 -> first LWP in the proc * => otherwise specific LWP */ struct lwp * lwp_find2(pid_t pid, lwpid_t lid) { proc_t *p; lwp_t *l; /* First LWP of specified proc. */ if (lid == 0) { switch (pid) { case -1: /* No lookup keys. */ return NULL; case 0: p = curproc; mutex_enter(p->p_lock); break; default: mutex_enter(&proc_lock); p = proc_find(pid); if (__predict_false(p == NULL)) { mutex_exit(&proc_lock); return NULL; } mutex_enter(p->p_lock); mutex_exit(&proc_lock); break; } LIST_FOREACH(l, &p->p_lwps, l_sibling) { if (__predict_true(!lwp_find_exclude(l))) break; } goto out; } l = proc_find_lwp_acquire_proc(lid, &p); if (l == NULL) return NULL; KASSERT(p != NULL); KASSERT(mutex_owned(p->p_lock)); if (__predict_false(lwp_find_exclude(l))) { l = NULL; goto out; } /* Apply proc filter, if applicable. */ switch (pid) { case -1: /* Match anything. */ break; case 0: if (p != curproc) l = NULL; break; default: if (p->p_pid != pid) l = NULL; break; } out: if (__predict_false(l == NULL)) { mutex_exit(p->p_lock); } return l; } /* * Look up a live LWP within the specified process. * * Must be called with p->p_lock held (as it looks at the radix tree, * and also wants to exclude idle and zombie LWPs). */ struct lwp * lwp_find(struct proc *p, lwpid_t id) { struct lwp *l; KASSERT(mutex_owned(p->p_lock)); l = proc_find_lwp(p, id); KASSERT(l == NULL || l->l_lid == id); /* * No need to lock - all of these conditions will * be visible with the process level mutex held. */ if (__predict_false(l != NULL && lwp_find_exclude(l))) l = NULL; return l; } /* * Verify that an LWP is locked, and optionally verify that the lock matches * one we specify. */ int lwp_locked(struct lwp *l, kmutex_t *mtx) { kmutex_t *cur = l->l_mutex; return mutex_owned(cur) && (mtx == cur || mtx == NULL); } /* * Lend a new mutex to an LWP. The old mutex must be held. */ kmutex_t * lwp_setlock(struct lwp *l, kmutex_t *mtx) { kmutex_t *oldmtx = l->l_mutex; KASSERT(mutex_owned(oldmtx)); atomic_store_release(&l->l_mutex, mtx); return oldmtx; } /* * Lend a new mutex to an LWP, and release the old mutex. The old mutex * must be held. 
*/ void lwp_unlock_to(struct lwp *l, kmutex_t *mtx) { kmutex_t *old; KASSERT(lwp_locked(l, NULL)); old = l->l_mutex; atomic_store_release(&l->l_mutex, mtx); mutex_spin_exit(old); } int lwp_trylock(struct lwp *l) { kmutex_t *old; for (;;) { if (!mutex_tryenter(old = atomic_load_consume(&l->l_mutex))) return 0; if (__predict_true(atomic_load_relaxed(&l->l_mutex) == old)) return 1; mutex_spin_exit(old); } } void lwp_unsleep(lwp_t *l, bool unlock) { KASSERT(mutex_owned(l->l_mutex)); (*l->l_syncobj->sobj_unsleep)(l, unlock); } /* * Lock an LWP. */ void lwp_lock(lwp_t *l) { kmutex_t *old = atomic_load_consume(&l->l_mutex); /* * Note: mutex_spin_enter() will have posted a read barrier. * Re-test l->l_mutex. If it has changed, we need to try again. */ mutex_spin_enter(old); while (__predict_false(atomic_load_relaxed(&l->l_mutex) != old)) { mutex_spin_exit(old); old = atomic_load_consume(&l->l_mutex); mutex_spin_enter(old); } } /* * Unlock an LWP. */ void lwp_unlock(lwp_t *l) { mutex_spin_exit(l->l_mutex); } void lwp_changepri(lwp_t *l, pri_t pri) { KASSERT(mutex_owned(l->l_mutex)); if (l->l_priority == pri) return; (*l->l_syncobj->sobj_changepri)(l, pri); KASSERT(l->l_priority == pri); } void lwp_lendpri(lwp_t *l, pri_t pri) { KASSERT(mutex_owned(l->l_mutex)); (*l->l_syncobj->sobj_lendpri)(l, pri); KASSERT(l->l_inheritedprio == pri); } pri_t lwp_eprio(lwp_t *l) { pri_t pri = l->l_priority; KASSERT(mutex_owned(l->l_mutex)); /* * Timeshared/user LWPs get a temporary priority boost for blocking * in kernel. This is key to good interactive response on a loaded * system: without it, things will seem very sluggish to the user. * * The function of the boost is to get the LWP onto a CPU and * running quickly. Once that happens the LWP loses the priority * boost and could be preempted very quickly by another LWP but that * won't happen often enough to be an annoyance. */ if (pri <= MAXPRI_USER && l->l_boostpri > MAXPRI_USER) pri = (pri >> 1) + l->l_boostpri; return MAX(l->l_auxprio, pri); } /* * Handle exceptions for mi_userret(). Called if a member of LW_USERRET is * set or a preemption is required. */ void lwp_userret(struct lwp *l) { struct proc *p; int sig, f; KASSERT(l == curlwp); KASSERT(l->l_stat == LSONPROC); p = l->l_proc; for (;;) { /* * This is the main location that user preemptions are * processed. */ preempt_point(); /* * It is safe to do this unlocked and without raised SPL, * since whenever a flag of interest is added to l_flag the * LWP will take an AST and come down this path again. If a * remote CPU posts the AST, it will be done with an IPI * (strongly synchronising). */ if ((f = atomic_load_relaxed(&l->l_flag) & LW_USERRET) == 0) { return; } /* * Start out with the correct credentials. */ if ((f & LW_CACHECRED) != 0) { kauth_cred_t oc = l->l_cred; mutex_enter(p->p_lock); l->l_cred = kauth_cred_hold(p->p_cred); lwp_lock(l); l->l_flag &= ~LW_CACHECRED; lwp_unlock(l); mutex_exit(p->p_lock); kauth_cred_free(oc); } /* * Process pending signals first, unless the process * is dumping core or exiting, where we will instead * enter the LW_WSUSPEND case below. */ if ((f & (LW_PENDSIG | LW_WCORE | LW_WEXIT)) == LW_PENDSIG) { mutex_enter(p->p_lock); while ((sig = issignal(l)) != 0) postsig(sig); mutex_exit(p->p_lock); continue; } /* * Core-dump or suspend pending. * * In case of core dump, suspend ourselves, so that the kernel * stack and therefore the userland registers saved in the * trapframe are around for coredump() to write them out. 
* We also need to save any PCU resources that we have so that * they are accessible for coredump(). We issue a wakeup on * p->p_lwpcv so that sigexit() will write the core file out * once all other LWPs are suspended. */ if ((f & LW_WSUSPEND) != 0) { pcu_save_all(l); mutex_enter(p->p_lock); p->p_nrlwps--; lwp_lock(l); l->l_stat = LSSUSPENDED; lwp_unlock(l); mutex_exit(p->p_lock); cv_broadcast(&p->p_lwpcv); lwp_lock(l); spc_lock(l->l_cpu); mi_switch(l); continue; } /* * Process is exiting. The core dump and signal cases must * be handled first. */ if ((f & LW_WEXIT) != 0) { lwp_exit(l); KASSERT(0); /* NOTREACHED */ } /* * Update lwpctl processor (for vfork child_return). */ if ((f & LW_LWPCTL) != 0) { lwp_lock(l); KASSERT(kpreempt_disabled()); l->l_lwpctl->lc_curcpu = (int)cpu_index(l->l_cpu); l->l_lwpctl->lc_pctr++; l->l_flag &= ~LW_LWPCTL; lwp_unlock(l); continue; } } } /* * Force an LWP to enter the kernel, to take a trip through lwp_userret(). */ void lwp_need_userret(struct lwp *l) { KASSERT(!cpu_intr_p()); KASSERT(lwp_locked(l, NULL) || l->l_stat == LSIDL); /* * If the LWP is in any state other than LSONPROC, we know that it * is executing in-kernel and will hit userret() on the way out. * * If the LWP is curlwp, then we know we'll be back out to userspace * soon (can't be called from a hardware interrupt here). * * Otherwise, we can't be sure what the LWP is doing, so first make * sure the update to l_flag will be globally visible, and then * force the LWP to take a trip through trap() where it will do * userret(). */ if (l->l_stat == LSONPROC && l != curlwp) { membar_producer(); cpu_signotify(l); } } /* * Add one reference to an LWP. This will prevent the LWP from * exiting, thus keeping the lwp structure and PCB around to inspect. */ void lwp_addref(struct lwp *l) { KASSERT(mutex_owned(l->l_proc->p_lock)); KASSERT(l->l_stat != LSZOMB); l->l_refcnt++; } /* * Remove one reference to an LWP. If this is the last reference, * then we must finalize the LWP's death. */ void lwp_delref(struct lwp *l) { struct proc *p = l->l_proc; mutex_enter(p->p_lock); lwp_delref2(l); mutex_exit(p->p_lock); } /* * Remove one reference to an LWP. If this is the last reference, * then we must finalize the LWP's death. The proc mutex is held * on entry. */ void lwp_delref2(struct lwp *l) { struct proc *p = l->l_proc; KASSERT(mutex_owned(p->p_lock)); KASSERT(l->l_stat != LSZOMB); KASSERT(l->l_refcnt > 0); if (--l->l_refcnt == 0) cv_broadcast(&p->p_lwpcv); } /* * Drain all references to the current LWP. Returns true if * we blocked. */ bool lwp_drainrefs(struct lwp *l) { struct proc *p = l->l_proc; bool rv = false; KASSERT(mutex_owned(p->p_lock)); l->l_prflag |= LPR_DRAINING; while (l->l_refcnt > 0) { rv = true; cv_wait(&p->p_lwpcv, p->p_lock); } return rv; } /* * Return true if the specified LWP is 'alive'. Only p->p_lock need * be held. */ bool lwp_alive(lwp_t *l) { KASSERT(mutex_owned(l->l_proc->p_lock)); switch (l->l_stat) { case LSSLEEP: case LSRUN: case LSONPROC: case LSSTOP: case LSSUSPENDED: return true; default: return false; } } /* * Return first live LWP in the process. */ lwp_t * lwp_find_first(proc_t *p) { lwp_t *l; KASSERT(mutex_owned(p->p_lock)); LIST_FOREACH(l, &p->p_lwps, l_sibling) { if (lwp_alive(l)) { return l; } } return NULL; } /* * Allocate a new lwpctl structure for a user LWP.
*/ int lwp_ctl_alloc(vaddr_t *uaddr) { lcproc_t *lp; u_int bit, i, offset; struct uvm_object *uao; int error; lcpage_t *lcp; proc_t *p; lwp_t *l; l = curlwp; p = l->l_proc; /* don't allow a vforked process to create lwp ctls */ if (p->p_lflag & PL_PPWAIT) return EBUSY; if (l->l_lcpage != NULL) { lcp = l->l_lcpage; *uaddr = lcp->lcp_uaddr + (vaddr_t)l->l_lwpctl - lcp->lcp_kaddr; return 0; } /* First time around, allocate header structure for the process. */ if ((lp = p->p_lwpctl) == NULL) { lp = kmem_alloc(sizeof(*lp), KM_SLEEP); mutex_init(&lp->lp_lock, MUTEX_DEFAULT, IPL_NONE); lp->lp_uao = NULL; TAILQ_INIT(&lp->lp_pages); mutex_enter(p->p_lock); if (p->p_lwpctl == NULL) { p->p_lwpctl = lp; mutex_exit(p->p_lock); } else { mutex_exit(p->p_lock); mutex_destroy(&lp->lp_lock); kmem_free(lp, sizeof(*lp)); lp = p->p_lwpctl; } } /* * Set up an anonymous memory region to hold the shared pages. * Map them into the process' address space. The user vmspace * gets the first reference on the UAO. */ mutex_enter(&lp->lp_lock); if (lp->lp_uao == NULL) { lp->lp_uao = uao_create(LWPCTL_UAREA_SZ, 0); lp->lp_cur = 0; lp->lp_max = LWPCTL_UAREA_SZ; lp->lp_uva = p->p_emul->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr, LWPCTL_UAREA_SZ, p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN); error = uvm_map(&p->p_vmspace->vm_map, &lp->lp_uva, LWPCTL_UAREA_SZ, lp->lp_uao, 0, 0, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_NONE, UVM_ADV_NORMAL, 0)); if (error != 0) { uao_detach(lp->lp_uao); lp->lp_uao = NULL; mutex_exit(&lp->lp_lock); return error; } } /* Get a free block and allocate for this LWP. */ TAILQ_FOREACH(lcp, &lp->lp_pages, lcp_chain) { if (lcp->lcp_nfree != 0) break; } if (lcp == NULL) { /* Nothing available - try to set up a free page. */ if (lp->lp_cur == lp->lp_max) { mutex_exit(&lp->lp_lock); return ENOMEM; } lcp = kmem_alloc(LWPCTL_LCPAGE_SZ, KM_SLEEP); /* * Wire the next page down in kernel space. Since this * is a new mapping, we must add a reference. */ uao = lp->lp_uao; (*uao->pgops->pgo_reference)(uao); lcp->lcp_kaddr = vm_map_min(kernel_map); error = uvm_map(kernel_map, &lcp->lcp_kaddr, PAGE_SIZE, uao, lp->lp_cur, PAGE_SIZE, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_NONE, UVM_ADV_RANDOM, 0)); if (error != 0) { mutex_exit(&lp->lp_lock); kmem_free(lcp, LWPCTL_LCPAGE_SZ); (*uao->pgops->pgo_detach)(uao); return error; } error = uvm_map_pageable(kernel_map, lcp->lcp_kaddr, lcp->lcp_kaddr + PAGE_SIZE, FALSE, 0); if (error != 0) { mutex_exit(&lp->lp_lock); uvm_unmap(kernel_map, lcp->lcp_kaddr, lcp->lcp_kaddr + PAGE_SIZE); kmem_free(lcp, LWPCTL_LCPAGE_SZ); return error; } /* Prepare the page descriptor and link into the list. */ lcp->lcp_uaddr = lp->lp_uva + lp->lp_cur; lp->lp_cur += PAGE_SIZE; lcp->lcp_nfree = LWPCTL_PER_PAGE; lcp->lcp_rotor = 0; memset(lcp->lcp_bitmap, 0xff, LWPCTL_BITMAP_SZ); TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain); } for (i = lcp->lcp_rotor; lcp->lcp_bitmap[i] == 0;) { if (++i >= LWPCTL_BITMAP_ENTRIES) i = 0; } bit = ffs(lcp->lcp_bitmap[i]) - 1; lcp->lcp_bitmap[i] ^= (1U << bit); lcp->lcp_rotor = i; lcp->lcp_nfree--; l->l_lcpage = lcp; offset = (i << 5) + bit; l->l_lwpctl = (lwpctl_t *)lcp->lcp_kaddr + offset; *uaddr = lcp->lcp_uaddr + offset * sizeof(lwpctl_t); mutex_exit(&lp->lp_lock); KPREEMPT_DISABLE(l); l->l_lwpctl->lc_curcpu = (int)cpu_index(curcpu()); KPREEMPT_ENABLE(l); return 0; } /* * Free an lwpctl structure back to the per-process list. 
*/ void lwp_ctl_free(lwp_t *l) { struct proc *p = l->l_proc; lcproc_t *lp; lcpage_t *lcp; u_int map, offset; /* don't free a lwp context we borrowed for vfork */ if (p->p_lflag & PL_PPWAIT) { l->l_lwpctl = NULL; return; } lp = p->p_lwpctl; KASSERT(lp != NULL); lcp = l->l_lcpage; offset = (u_int)((lwpctl_t *)l->l_lwpctl - (lwpctl_t *)lcp->lcp_kaddr); KASSERT(offset < LWPCTL_PER_PAGE); mutex_enter(&lp->lp_lock); lcp->lcp_nfree++; map = offset >> 5; lcp->lcp_bitmap[map] |= (1U << (offset & 31)); if (lcp->lcp_bitmap[lcp->lcp_rotor] == 0) lcp->lcp_rotor = map; if (TAILQ_FIRST(&lp->lp_pages)->lcp_nfree == 0) { TAILQ_REMOVE(&lp->lp_pages, lcp, lcp_chain); TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain); } mutex_exit(&lp->lp_lock); } /* * Process is exiting; tear down lwpctl state. This can only be safely * called by the last LWP in the process. */ void lwp_ctl_exit(void) { lcpage_t *lcp, *next; lcproc_t *lp; proc_t *p; lwp_t *l; l = curlwp; l->l_lwpctl = NULL; l->l_lcpage = NULL; p = l->l_proc; lp = p->p_lwpctl; KASSERT(lp != NULL); KASSERT(p->p_nlwps == 1); for (lcp = TAILQ_FIRST(&lp->lp_pages); lcp != NULL; lcp = next) { next = TAILQ_NEXT(lcp, lcp_chain); uvm_unmap(kernel_map, lcp->lcp_kaddr, lcp->lcp_kaddr + PAGE_SIZE); kmem_free(lcp, LWPCTL_LCPAGE_SZ); } if (lp->lp_uao != NULL) { uvm_unmap(&p->p_vmspace->vm_map, lp->lp_uva, lp->lp_uva + LWPCTL_UAREA_SZ); } mutex_destroy(&lp->lp_lock); kmem_free(lp, sizeof(*lp)); p->p_lwpctl = NULL; } /* * Return the current LWP's "preemption counter". Used to detect * preemption across operations that can tolerate preemption without * crashing, but which may generate incorrect results if preempted. * * We do arithmetic in unsigned long to avoid undefined behaviour in * the event of arithmetic overflow on LP32, and issue __insn_barrier() * on both sides so this can safely be used to detect changes to the * preemption counter in loops around other memory accesses even in the * event of whole-program optimization (e.g., gcc -flto). */ long lwp_pctr(void) { unsigned long pctr; __insn_barrier(); pctr = curlwp->l_ru.ru_nvcsw; pctr += curlwp->l_ru.ru_nivcsw; __insn_barrier(); return pctr; } /* * Set an LWP's private data pointer. */ int lwp_setprivate(struct lwp *l, void *ptr) { int error = 0; l->l_private = ptr; #ifdef __HAVE_CPU_LWP_SETPRIVATE error = cpu_lwp_setprivate(l, ptr); #endif return error; } /* * Perform any thread-related cleanup on LWP exit. * N.B. l->l_proc->p_lock must be HELD on entry but will * be released before returning! */ void lwp_thread_cleanup(struct lwp *l) { KASSERT(mutex_owned(l->l_proc->p_lock)); mutex_exit(l->l_proc->p_lock); /* * If the LWP has robust futexes, release them all * now. */ if (__predict_false(l->l_robust_head != 0)) { futex_release_all_lwp(l); } } #if defined(DDB) #include <machine/pcb.h> void lwp_whatis(uintptr_t addr, void (*pr)(const char *, ...)) { lwp_t *l; LIST_FOREACH(l, &alllwp, l_list) { uintptr_t stack = (uintptr_t)KSTACK_LOWEST_ADDR(l); if (addr < stack || stack + KSTACK_SIZE <= addr) { continue; } (*pr)("%p is %p+%zu, LWP %p's stack\n", (void *)addr, (void *)stack, (size_t)(addr - stack), l); } } #endif /* defined(DDB) */
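A minimal sketch of the retry pattern that the lwp_pctr() comment above describes, assuming a hypothetical per-CPU counter array; example_read_my_stat(), example_percpu_stat and EXAMPLE_NCPU are illustrative only and not part of kern_lwp.c:

#include <sys/types.h>
#include <sys/cpu.h>	/* curcpu(), cpu_index() */
#include <sys/lwp.h>	/* lwp_pctr() */

#define EXAMPLE_NCPU	64				/* illustrative bound only */

static uint64_t example_percpu_stat[EXAMPLE_NCPU];	/* hypothetical counters */

/*
 * Illustrative sketch (not from the NetBSD tree): sample lwp_pctr()
 * around a lockless per-CPU read and retry if the LWP was preempted
 * (and possibly migrated) in between.
 */
static uint64_t
example_read_my_stat(void)
{
	uint64_t v;
	long pctr;

	do {
		pctr = lwp_pctr();
		/*
		 * This read tolerates preemption, but the result is only
		 * trusted when the preemption counter did not change.
		 */
		v = example_percpu_stat[cpu_index(curcpu())];
	} while (pctr != lwp_pctr());

	return v;
}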
/* $NetBSD: hash.h,v 1.8 2014/09/05 05:46:15 matt Exp $ */ /*- * Copyright (c) 2001 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Luke Mewburn. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _SYS_HASH_H_ #define _SYS_HASH_H_ #include <sys/types.h> #ifdef __HAVE_MACHINE_HASH_H #include <machine/hash.h> #endif #ifndef __HAVE_HASH32_BUF /* not overridden by MD hash */ #define HASH32_BUF_INIT 5381 /* * uint32_t * hash32_buf(const void *bf, size_t len, uint32_t hash) * return a 32 bit hash of the binary buffer buf (size len), * seeded with an initial hash value of hash (usually HASH32_BUF_INIT). */ static __inline uint32_t hash32_buf(const void *bf, size_t len, uint32_t hash) { const uint8_t *s = (const uint8_t *)bf; while (len-- != 0) /* "nemesi": k=257, r=r*257 */ hash = hash * 257 + *s++; return (hash * 257); } #endif /* __HAVE_HASH32_BUF */ #ifndef __HAVE_HASH32_STR /* not overridden by MD hash */ #define HASH32_STR_INIT 5381 /* * uint32_t * hash32_str(const void *bf, uint32_t hash) * return a 32 bit hash of NUL terminated ASCII string buf, * seeded with an initial hash value of hash (usually HASH32_STR_INIT). */ static __inline uint32_t hash32_str(const void *bf, uint32_t hash) { const uint8_t *s = (const uint8_t *)bf; uint8_t c; while ((c = *s++) != 0) hash = hash * 33 + c; /* "perl": k=33, r=r+r/32 */ return (hash + (hash >> 5)); } /* * uint32_t * hash32_strn(const void *bf, size_t len, uint32_t hash) * return a 32 bit hash of NUL terminated ASCII string buf up to * a maximum of len bytes, * seeded with an initial hash value of hash (usually HASH32_STR_INIT).
*/ static __inline uint32_t hash32_strn(const void *bf, size_t len, uint32_t hash) { const uint8_t *s = (const uint8_t *)bf; uint8_t c; while ((c = *s++) != 0 && len-- != 0) hash = hash * 33 + c; /* "perl": k=33, r=r+r/32 */ return (hash + (hash >> 5)); } #endif /* __HAVE_HASH32_STR */ __BEGIN_DECLS uint32_t murmurhash2(const void *, size_t, uint32_t); __END_DECLS #endif /* !_SYS_HASH_H_ */
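/*
 * Illustrative sketch, not part of the header above: the usual way to
 * consume hash32_str() is to reduce a key to a bucket index in a
 * power-of-two sized table.  EXAMPLE_HASH_SIZE and example_bucket()
 * are hypothetical names.
 */
#define EXAMPLE_HASH_SIZE	64	/* must be a power of two */

static __inline uint32_t
example_bucket(const char *name)
{
	return hash32_str(name, HASH32_STR_INIT) & (EXAMPLE_HASH_SIZE - 1);
}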
/* $NetBSD: rfcomm_upper.c,v 1.23 2018/09/03 16:29:36 riastradh Exp $ */ /*- * Copyright (c) 2006 Itronix Inc. * All rights reserved. * * Written by Iain Hibbert for Itronix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of Itronix Inc. may not be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC.
BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: rfcomm_upper.c,v 1.23 2018/09/03 16:29:36 riastradh Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/kmem.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <netbt/bluetooth.h> #include <netbt/hci.h> #include <netbt/l2cap.h> #include <netbt/rfcomm.h> /**************************************************************************** * * RFCOMM DLC - Upper Protocol API * * Currently the only 'Port Emulation Entity' is the RFCOMM socket code * but it is should be possible to provide a pseudo-device for a direct * tty interface. */ /* * rfcomm_attach_pcb(handle, proto, upper) * * attach a new RFCOMM DLC to handle, populate with reasonable defaults */ int rfcomm_attach_pcb(struct rfcomm_dlc **handle, const struct btproto *proto, void *upper) { struct rfcomm_dlc *dlc; KASSERT(handle != NULL); KASSERT(proto != NULL); KASSERT(upper != NULL); dlc = kmem_intr_zalloc(sizeof(struct rfcomm_dlc), KM_NOSLEEP); if (dlc == NULL) return ENOMEM; dlc->rd_state = RFCOMM_DLC_CLOSED; dlc->rd_mtu = rfcomm_mtu_default; dlc->rd_proto = proto; dlc->rd_upper = upper; dlc->rd_laddr.bt_len = sizeof(struct sockaddr_bt); dlc->rd_laddr.bt_family = AF_BLUETOOTH; dlc->rd_laddr.bt_psm = L2CAP_PSM_RFCOMM; dlc->rd_raddr.bt_len = sizeof(struct sockaddr_bt); dlc->rd_raddr.bt_family = AF_BLUETOOTH; dlc->rd_raddr.bt_psm = L2CAP_PSM_RFCOMM; dlc->rd_lmodem = RFCOMM_MSC_RTC | RFCOMM_MSC_RTR | RFCOMM_MSC_DV; callout_init(&dlc->rd_timeout, 0); callout_setfunc(&dlc->rd_timeout, rfcomm_dlc_timeout, dlc); *handle = dlc; return 0; } /* * rfcomm_bind_pcb(dlc, sockaddr) * * bind DLC to local address */ int rfcomm_bind_pcb(struct rfcomm_dlc *dlc, struct sockaddr_bt *addr) { if (dlc->rd_state != RFCOMM_DLC_CLOSED) return EINVAL; memcpy(&dlc->rd_laddr, addr, sizeof(struct sockaddr_bt)); return 0; } /* * rfcomm_sockaddr_pcb(dlc, sockaddr) * * return local address */ int rfcomm_sockaddr_pcb(struct rfcomm_dlc *dlc, struct sockaddr_bt *addr) { memcpy(addr, &dlc->rd_laddr, sizeof(struct sockaddr_bt)); return 0; } /* * rfcomm_connect_pcb(dlc, sockaddr) * * Initiate connection of RFCOMM DLC to remote address. */ int rfcomm_connect_pcb(struct rfcomm_dlc *dlc, struct sockaddr_bt *dest) { struct rfcomm_session *rs; int err = 0; if (dlc->rd_state != RFCOMM_DLC_CLOSED) return EISCONN; memcpy(&dlc->rd_raddr, dest, sizeof(struct sockaddr_bt)); if (dlc->rd_raddr.bt_channel < RFCOMM_CHANNEL_MIN || dlc->rd_raddr.bt_channel > RFCOMM_CHANNEL_MAX || bdaddr_any(&dlc->rd_raddr.bt_bdaddr)) return EDESTADDRREQ; if (dlc->rd_raddr.bt_psm == L2CAP_PSM_ANY) dlc->rd_raddr.bt_psm = L2CAP_PSM_RFCOMM; else if (dlc->rd_raddr.bt_psm != L2CAP_PSM_RFCOMM && (dlc->rd_raddr.bt_psm < 0x1001 || L2CAP_PSM_INVALID(dlc->rd_raddr.bt_psm))) return EINVAL; /* * We are allowed only one RFCOMM session between any 2 Bluetooth * devices, so see if there is a session already otherwise create * one and set it connecting. 
*/ rs = rfcomm_session_lookup(&dlc->rd_laddr, &dlc->rd_raddr); if (rs == NULL) { rs = rfcomm_session_alloc(&rfcomm_session_active, &dlc->rd_laddr); if (rs == NULL) return ENOMEM; rs->rs_flags |= RFCOMM_SESSION_INITIATOR; rs->rs_state = RFCOMM_SESSION_WAIT_CONNECT; err = l2cap_connect_pcb(rs->rs_l2cap, &dlc->rd_raddr); if (err) { rfcomm_session_free(rs); return err; } /* * This session will start up automatically when its * L2CAP channel is connected. */ } /* construct DLC */ dlc->rd_dlci = RFCOMM_MKDLCI(IS_INITIATOR(rs) ? 0:1, dest->bt_channel); if (rfcomm_dlc_lookup(rs, dlc->rd_dlci)) return EBUSY; l2cap_sockaddr_pcb(rs->rs_l2cap, &dlc->rd_laddr); /* * attach the DLC to the session and start it off */ dlc->rd_session = rs; dlc->rd_state = RFCOMM_DLC_WAIT_SESSION; LIST_INSERT_HEAD(&rs->rs_dlcs, dlc, rd_next); if (rs->rs_state == RFCOMM_SESSION_OPEN) err = rfcomm_dlc_connect(dlc); return err; } /* * rfcomm_peeraddr_pcb(dlc, sockaddr) * * return remote address */ int rfcomm_peeraddr_pcb(struct rfcomm_dlc *dlc, struct sockaddr_bt *addr) { memcpy(addr, &dlc->rd_raddr, sizeof(struct sockaddr_bt)); return 0; } /* * rfcomm_disconnect_pcb(dlc, linger) * * disconnect RFCOMM DLC */ int rfcomm_disconnect_pcb(struct rfcomm_dlc *dlc, int linger) { struct rfcomm_session *rs = dlc->rd_session; int err = 0; KASSERT(dlc != NULL); switch (dlc->rd_state) { case RFCOMM_DLC_CLOSED: case RFCOMM_DLC_LISTEN: return EINVAL; case RFCOMM_DLC_WAIT_SEND_UA: err = rfcomm_session_send_frame(rs, RFCOMM_FRAME_DM, dlc->rd_dlci); /* fall through */ case RFCOMM_DLC_WAIT_SESSION: case RFCOMM_DLC_WAIT_CONNECT: case RFCOMM_DLC_WAIT_SEND_SABM: rfcomm_dlc_close(dlc, 0); break; case RFCOMM_DLC_OPEN: if (dlc->rd_txbuf != NULL && linger != 0) { dlc->rd_flags |= RFCOMM_DLC_SHUTDOWN; break; } /* else fall through */ case RFCOMM_DLC_WAIT_RECV_UA: dlc->rd_state = RFCOMM_DLC_WAIT_DISCONNECT; err = rfcomm_session_send_frame(rs, RFCOMM_FRAME_DISC, dlc->rd_dlci); callout_schedule(&dlc->rd_timeout, rfcomm_ack_timeout * hz); break; case RFCOMM_DLC_WAIT_DISCONNECT: err = EALREADY; break; default: UNKNOWN(dlc->rd_state); break; } return err; } /* * rfcomm_detach_pcb(handle) * * detach RFCOMM DLC from handle */ void rfcomm_detach_pcb(struct rfcomm_dlc **handle) { struct rfcomm_dlc *dlc = *handle; if (dlc->rd_state != RFCOMM_DLC_CLOSED) rfcomm_dlc_close(dlc, 0); if (dlc->rd_txbuf != NULL) { m_freem(dlc->rd_txbuf); dlc->rd_txbuf = NULL; } dlc->rd_upper = NULL; *handle = NULL; /* * If callout is invoking we can't free the DLC so * mark it and let the callout release it. */ if (callout_invoking(&dlc->rd_timeout)) dlc->rd_flags |= RFCOMM_DLC_DETACH; else { callout_destroy(&dlc->rd_timeout); kmem_intr_free(dlc, sizeof(*dlc)); } } /* * rfcomm_listen_pcb(dlc) * * This DLC is a listener. We look for an existing listening session * with a matching address to attach to or else create a new one on * the listeners list. If the ANY channel is given, allocate the first * available for the session. 
*/ int rfcomm_listen_pcb(struct rfcomm_dlc *dlc) { struct rfcomm_session *rs; struct rfcomm_dlc *used; struct sockaddr_bt addr; int err, channel; if (dlc->rd_state != RFCOMM_DLC_CLOSED) return EISCONN; if (dlc->rd_laddr.bt_channel != RFCOMM_CHANNEL_ANY && (dlc->rd_laddr.bt_channel < RFCOMM_CHANNEL_MIN || dlc->rd_laddr.bt_channel > RFCOMM_CHANNEL_MAX)) return EADDRNOTAVAIL; if (dlc->rd_laddr.bt_psm == L2CAP_PSM_ANY) dlc->rd_laddr.bt_psm = L2CAP_PSM_RFCOMM; else if (dlc->rd_laddr.bt_psm != L2CAP_PSM_RFCOMM && (dlc->rd_laddr.bt_psm < 0x1001 || L2CAP_PSM_INVALID(dlc->rd_laddr.bt_psm))) return EADDRNOTAVAIL; LIST_FOREACH(rs, &rfcomm_session_listen, rs_next) { l2cap_sockaddr_pcb(rs->rs_l2cap, &addr); if (addr.bt_psm != dlc->rd_laddr.bt_psm) continue; if (bdaddr_same(&dlc->rd_laddr.bt_bdaddr, &addr.bt_bdaddr)) break; } if (rs == NULL) { rs = rfcomm_session_alloc(&rfcomm_session_listen, &dlc->rd_laddr); if (rs == NULL) return ENOMEM; rs->rs_state = RFCOMM_SESSION_LISTEN; err = l2cap_listen_pcb(rs->rs_l2cap); if (err) { rfcomm_session_free(rs); return err; } } if (dlc->rd_laddr.bt_channel == RFCOMM_CHANNEL_ANY) { channel = RFCOMM_CHANNEL_MIN; used = LIST_FIRST(&rs->rs_dlcs); while (used != NULL) { if (used->rd_laddr.bt_channel == channel) { if (channel++ == RFCOMM_CHANNEL_MAX) return EADDRNOTAVAIL; used = LIST_FIRST(&rs->rs_dlcs); } else { used = LIST_NEXT(used, rd_next); } } dlc->rd_laddr.bt_channel = channel; } dlc->rd_session = rs; dlc->rd_state = RFCOMM_DLC_LISTEN; LIST_INSERT_HEAD(&rs->rs_dlcs, dlc, rd_next); return 0; } /* * rfcomm_send_pcb(dlc, mbuf) * * Output data on DLC. This is streamed data, so we add it * to our buffer and start the DLC, which will assemble * packets and send them if it can. */ int rfcomm_send_pcb(struct rfcomm_dlc *dlc, struct mbuf *m) { if (dlc->rd_txbuf != NULL) { dlc->rd_txbuf->m_pkthdr.len += m->m_pkthdr.len; m_cat(dlc->rd_txbuf, m); } else { dlc->rd_txbuf = m; } if (dlc->rd_state == RFCOMM_DLC_OPEN) rfcomm_dlc_start(dlc); return 0; } /* * rfcomm_rcvd_pcb(dlc, space) * * Indicate space now available in receive buffer * * This should be used to give an initial value of the receive buffer * size when the DLC is attached and anytime data is cleared from the * buffer after that. */ int rfcomm_rcvd_pcb(struct rfcomm_dlc *dlc, size_t space) { KASSERT(dlc != NULL); dlc->rd_rxsize = space; /* * if we are using credit based flow control, we may * want to send some credits.. 
*/ if (dlc->rd_state == RFCOMM_DLC_OPEN && (dlc->rd_session->rs_flags & RFCOMM_SESSION_CFC)) rfcomm_dlc_start(dlc); return 0; } /* * rfcomm_setopt(dlc, sopt) * * set DLC options */ int rfcomm_setopt(struct rfcomm_dlc *dlc, const struct sockopt *sopt) { int mode, err = 0; uint16_t mtu; switch (sopt->sopt_name) { case SO_RFCOMM_MTU: err = sockopt_get(sopt, &mtu, sizeof(mtu)); if (err) break; if (mtu < RFCOMM_MTU_MIN || mtu > RFCOMM_MTU_MAX) err = EINVAL; else if (dlc->rd_state == RFCOMM_DLC_CLOSED) dlc->rd_mtu = mtu; else err = EBUSY; break; case SO_RFCOMM_LM: err = sockopt_getint(sopt, &mode); if (err) break; mode &= (RFCOMM_LM_SECURE | RFCOMM_LM_ENCRYPT | RFCOMM_LM_AUTH); if (mode & RFCOMM_LM_SECURE) mode |= RFCOMM_LM_ENCRYPT; if (mode & RFCOMM_LM_ENCRYPT) mode |= RFCOMM_LM_AUTH; dlc->rd_mode = mode; if (dlc->rd_state == RFCOMM_DLC_OPEN) err = rfcomm_dlc_setmode(dlc); break; default: err = ENOPROTOOPT; break; } return err; } /* * rfcomm_getopt(dlc, sopt) * * get DLC options */ int rfcomm_getopt(struct rfcomm_dlc *dlc, struct sockopt *sopt) { struct rfcomm_fc_info fc; switch (sopt->sopt_name) { case SO_RFCOMM_MTU: return sockopt_set(sopt, &dlc->rd_mtu, sizeof(uint16_t)); case SO_RFCOMM_FC_INFO: memset(&fc, 0, sizeof(fc)); fc.lmodem = dlc->rd_lmodem; fc.rmodem = dlc->rd_rmodem; fc.tx_cred = uimax(dlc->rd_txcred, 0xff); fc.rx_cred = uimax(dlc->rd_rxcred, 0xff); if (dlc->rd_session && (dlc->rd_session->rs_flags & RFCOMM_SESSION_CFC)) fc.cfc = 1; return sockopt_set(sopt, &fc, sizeof(fc)); case SO_RFCOMM_LM: return sockopt_setint(sopt, dlc->rd_mode); default: break; } return ENOPROTOOPT; }
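/*
 * Illustrative sketch, not part of rfcomm_upper.c: the calling
 * sequence a 'Port Emulation Entity' would use to open an outgoing
 * DLC with the API above.  The btproto ops table, upper-layer cookie
 * and addresses are assumed to be supplied by the caller, and error
 * handling is reduced to the minimum.
 */
static int
example_rfcomm_open(const struct btproto *proto, void *upper,
    struct sockaddr_bt *laddr, struct sockaddr_bt *raddr,
    struct rfcomm_dlc **dlcp)
{
	struct rfcomm_dlc *dlc;
	int err;

	err = rfcomm_attach_pcb(&dlc, proto, upper);
	if (err)
		return err;

	/* optionally select the local address before connecting */
	err = rfcomm_bind_pcb(dlc, laddr);
	if (err == 0)
		err = rfcomm_connect_pcb(dlc, raddr);

	if (err) {
		rfcomm_detach_pcb(&dlc);
		return err;
	}

	*dlcp = dlc;
	return 0;
}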
/* $NetBSD: ip6_output.c,v 1.235 2024/04/19 00:55:35 riastradh Exp $ */ /* $KAME: ip6_output.c,v 1.172 2001/03/25 09:55:56 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ip6_output.c,v 1.235 2024/04/19 00:55:35 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #endif #include <sys/param.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/errno.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/syslog.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/kauth.h> #include <net/if.h> #include <net/route.h> #include <net/pfil.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/ip6.h> #include <netinet/ip_var.h> #include <netinet/icmp6.h> #include <netinet/in_offload.h> #include <netinet/portalgo.h> #include <netinet6/in6_offload.h> #include <netinet6/ip6_var.h> #include <netinet6/ip6_private.h> #include <netinet6/in6_pcb.h> #include <netinet6/nd6.h> #include <netinet6/ip6protosw.h> #include <netinet6/scope6_var.h> #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/ipsec6.h> #include <netipsec/key.h> #endif extern pfil_head_t *inet6_pfil_hook; /* XXX */ struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, kauth_cred_t, int); static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *); static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, kauth_cred_t, int, int, int); static int ip6_setmoptions(const struct sockopt *, struct inpcb *); static int ip6_getmoptions(struct sockopt *, struct inpcb *); static int ip6_copyexthdr(struct mbuf **, void *, int); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); static int ip6_getpmtu(struct rtentry *, struct ifnet *, u_long *, int *); static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int); static int ip6_ifaddrvalid(const struct in6_addr *, const struct in6_addr *); static int ip6_handle_rthdr(struct ip6_rthdr *, struct ip6_hdr *); #ifdef RFC2292 static int ip6_pcbopts(struct ip6_pktopts **, struct socket *, struct sockopt *); #endif static int ip6_handle_rthdr(struct ip6_rthdr *rh, struct ip6_hdr *ip6) { int error = 0; switch (rh->ip6r_type) { case IPV6_RTHDR_TYPE_0: /* Dropped, RFC5095. */ default: /* is it possible? */ error = EINVAL; } return error; } /* * Send an IP packet to a host. */ int ip6_if_output(struct ifnet * const ifp, struct ifnet * const origifp, struct mbuf * const m, const struct sockaddr_in6 * const dst, const struct rtentry *rt) { int error = 0; if (rt != NULL) { error = rt_check_reject_route(rt, ifp); if (error != 0) { IP6_STATINC(IP6_STAT_RTREJECT); m_freem(m); return error; } } /* discard the packet if IPv6 operation is disabled on the interface */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return ENETDOWN; /* better error? */ } if ((ifp->if_flags & IFF_LOOPBACK) != 0) error = if_output_lock(ifp, origifp, m, sin6tocsa(dst), rt); else error = if_output_lock(ifp, ifp, m, sin6tocsa(dst), rt); return error; } /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). * * This function may modify ver and hlim only. The mbuf chain containing the * packet will be freed. The mbuf opt, if present, will not be freed. 
* * Type of "mtu": rt_rmx.rmx_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. So we use u_long to hold largest one, * which is rt_rmx.rmx_mtu. */ int ip6_output( struct mbuf *m0, struct ip6_pktopts *opt, struct route *ro, int flags, struct ip6_moptions *im6o, struct inpcb *inp, struct ifnet **ifpp /* XXX: just for statistics */ ) { struct ip6_hdr *ip6, *mhip6; struct ifnet *ifp = NULL, *origifp = NULL; struct mbuf *m = m0; int tlen, len, off; bool tso; struct route ip6route; struct rtentry *rt = NULL, *rt_pmtu; const struct sockaddr_in6 *dst; struct sockaddr_in6 src_sa, dst_sa; int error = 0; struct in6_ifaddr *ia = NULL; u_long mtu; int alwaysfrag, dontfrag; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr finaldst, src0, dst0; u_int32_t zone; struct route *ro_pmtu = NULL; int hdrsplit = 0; int needipsec = 0; #ifdef IPSEC struct secpolicy *sp = NULL; #endif struct psref psref, psref_ia; int bound = curlwp_bind(); bool release_psref_ia = false; #ifdef DIAGNOSTIC if ((m->m_flags & M_PKTHDR) == 0) panic("ip6_output: no HDR"); if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_TSOv4)) != 0) { panic("ip6_output: IPv4 checksum offload flags: %d", m->m_pkthdr.csum_flags); } if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) == (M_CSUM_TCPv6|M_CSUM_UDPv6)) { panic("ip6_output: conflicting checksum offload flags: %d", m->m_pkthdr.csum_flags); } #endif M_CSUM_DATA_IPv6_SET(m->m_pkthdr.csum_data, sizeof(struct ip6_hdr)); #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (void *)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ } \ } while (/*CONSTCOND*/ 0) memset(&exthdrs, 0, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header (1st part) */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header (2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here. do that later. */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; #ifdef IPSEC if (ipsec_used) { /* Check the security policy (SP) for the packet */ sp = ipsec6_check_policy(m, inp, flags, &needipsec, &error); if (error != 0) { /* * Hack: -EINVAL is used to signal that a packet * should be silently discarded. This is typically * because we asked key management for an SA and * it was delayed (e.g. kicked up to IKE). */ if (error == -EINVAL) error = 0; IP6_STATINC(IP6_STAT_IPSECDROP_OUT); goto freehdrs; } } #endif if (needipsec && (m->m_pkthdr.csum_flags & (M_CSUM_UDPv6|M_CSUM_TCPv6)) != 0) { in6_undefer_cksum_tcpudp(m); m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6); } /* * If we need IPsec, or there is at least one extension header, * separate IP6 header from the payload. 
*/ if ((needipsec || optlen) && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { IP6_STATINC(IP6_STAT_ODROPPED); m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { IP6_STATINC(IP6_STAT_ODROPPED); m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) { IP6_STATINC(IP6_STAT_ODROPPED); goto freehdrs; } optlen += 8; /* XXX JUMBOOPTLEN */ ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ { u_char *nexthdrp = &ip6->ip6_nxt; struct mbuf *mprev = m; /* * we treat dest2 specially. this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. */ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("assumption failed: hdr not split"); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("assumption failed: hdr not split"); \ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (/*CONSTCOND*/ 0) /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); M_CSUM_DATA_IPv6_SET(m->m_pkthdr.csum_data, sizeof(struct ip6_hdr) + optlen); } /* Need to save for pmtu */ finaldst = ip6->ip6_dst; /* * If there is a routing header, replace destination address field * with the first hop of the routing header. */ if (exthdrs.ip6e_rthdr) { struct ip6_rthdr *rh; rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *); error = ip6_handle_rthdr(rh, ip6); if (error != 0) { IP6_STATINC(IP6_STAT_ODROPPED); goto bad; } } /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; IP6_STATINC(IP6_STAT_BADSCOPE); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; IP6_STATINC(IP6_STAT_BADSCOPE); goto bad; } IP6_STATINC(IP6_STAT_LOCALOUT); /* * Route packet. */ /* initialize cached route */ if (ro == NULL) { memset(&ip6route, 0, sizeof(ip6route)); ro = &ip6route; } ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; /* * if specified, try to fill in the traffic class field. * do not override if a non-zero value is already set. * we check the diffserv field and the ecn field separately. 
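 * (In the ip6_flow word the traffic class occupies bits 20-27: the
 * 0xfc mask shifted by 20 covers the six DSCP bits and the 0x03 mask
 * the two ECN bits, hence the two separate tests below.)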
*/ if (opt && opt->ip6po_tclass >= 0) { int mask = 0; if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) mask |= 0xfc; if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* fill in or override the hop limit field, if necessary. */ if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else ip6->ip6_hlim = ip6_defmcasthlim; } #ifdef IPSEC if (needipsec) { error = ipsec6_process_packet(m, sp->req, flags); /* * Preserve KAME behaviour: ENOENT can be returned * when an SA acquire is in progress. Don't propagate * this to user-level; it confuses applications. * XXX this will go away when the SADB is redone. */ if (error == ENOENT) error = 0; goto done; } #endif /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); sockaddr_in6_init(&dst_sa, &ip6->ip6_dst, 0, 0, 0); /* We do not need a route for multicast */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_pktinfo *pi = NULL; /* * If the outgoing interface for the address is specified by * the caller, use it. */ if (opt && (pi = opt->ip6po_pktinfo) != NULL) { /* XXX boundary check is assumed to be already done. */ ifp = if_get_byindex(pi->ipi6_ifindex, &psref); } else if (im6o != NULL) { ifp = if_get_byindex(im6o->im6o_multicast_if_index, &psref); } } if (ifp == NULL) { error = in6_selectroute(&dst_sa, opt, &ro, &rt, true); if (error != 0) goto bad; ifp = if_get_byindex(rt->rt_ifp->if_index, &psref); } if (rt == NULL) { /* * If in6_selectroute() does not return a route entry, * dst may not have been updated. */ error = rtcache_setdst(ro, sin6tosa(&dst_sa)); if (error) { IP6_STATINC(IP6_STAT_ODROPPED); goto bad; } } /* * then rt (for unicast) and ifp must be non-NULL valid values. */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); } if (rt != NULL) { ia = (struct in6_ifaddr *)(rt->rt_ifa); rt->rt_use++; } /* * The outgoing interface must be in the zone of source and * destination addresses. We should use ia_ifp to support the * case of sending packets to an address of our own. */ if (ia != NULL) { origifp = ia->ia_ifp; if (if_is_deactivated(origifp)) { IP6_STATINC(IP6_STAT_ODROPPED); goto bad; } if_acquire(origifp, &psref_ia); release_psref_ia = true; } else origifp = ifp; src0 = ip6->ip6_src; if (in6_setscope(&src0, origifp, &zone)) goto badscope; sockaddr_in6_init(&src_sa, &ip6->ip6_src, 0, 0, 0); if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id) goto badscope; dst0 = ip6->ip6_dst; if (in6_setscope(&dst0, origifp, &zone)) goto badscope; /* re-initialize to be sure */ sockaddr_in6_init(&dst_sa, &ip6->ip6_dst, 0, 0, 0); if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) goto badscope; /* scope check is done. */ /* Ensure we only send from a valid address. */ if ((ifp->if_flags & IFF_LOOPBACK) == 0 && (flags & IPV6_FORWARDING) == 0 && (error = ip6_ifaddrvalid(&src0, &dst0)) != 0) { char ip6buf[INET6_ADDRSTRLEN]; nd6log(LOG_ERR, "refusing to send from invalid address %s (pid %d)\n", IN6_PRINT(ip6buf, &src0), curproc->p_pid); IP6_STATINC(IP6_STAT_ODROPPED); in6_ifstat_inc(origifp, ifs6_out_discard); if (error == 1) /* * Address exists, but is tentative or detached. * We can't send from it because it's invalid, * so we drop the packet. 
*/ error = 0; else error = EADDRNOTAVAIL; goto bad; } if (rt != NULL && (rt->rt_flags & RTF_GATEWAY) && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) dst = satocsin6(rt->rt_gateway); else dst = satocsin6(rtcache_getdst(ro)); /* * XXXXXX: original code follows: */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ else { bool ingroup; m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; in6_ifstat_inc(ifp, ifs6_out_mcast); /* * Confirm that the outgoing interface supports multicast. */ if (!(ifp->if_flags & IFF_MULTICAST)) { IP6_STATINC(IP6_STAT_NOROUTE); in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } ingroup = in6_multi_group(&ip6->ip6_dst, ifp); if (ingroup && (im6o == NULL || im6o->im6o_multicast_loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ KASSERT(dst != NULL); ip6_mloopback(ifp, m, dst); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * Fill the outgoing interface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* Determine path MTU. */ /* * ro_pmtu represent final destination while * ro might represent immediate destination. * Use ro_pmtu destination since MTU might differ. */ if (ro_pmtu != ro) { union { struct sockaddr dst; struct sockaddr_in6 dst6; } u; /* ro_pmtu may not have a cache */ sockaddr_in6_init(&u.dst6, &finaldst, 0, 0, 0); rt_pmtu = rtcache_lookup(ro_pmtu, &u.dst); } else rt_pmtu = rt; error = ip6_getpmtu(rt_pmtu, ifp, &mtu, &alwaysfrag); if (rt_pmtu != NULL && rt_pmtu != rt) rtcache_unref(rt_pmtu, ro_pmtu); KASSERT(error == 0); /* ip6_getpmtu never fail if ifp is passed */ /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details. 
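 * (IPV6_MMTU is the IPv6 minimum link MTU of 1280 bytes, so by default
 * a multicast packet is clamped to 1280 even on a 1500-byte Ethernet
 * path unless IP6PO_MINMTU_DISABLE is set.)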
*/ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) * * XXX Is this really necessary? */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { u_int32_t dummy1 = 0; /* XXX unused */ u_int32_t dummy2; /* XXX unused */ int hoff = sizeof(struct ip6_hdr); if (ip6_hopopts_input(&dummy1, &dummy2, &m, &hoff)) { /* m was already freed at this point */ error = EINVAL; goto done; } ip6 = mtod(m, struct ip6_hdr *); } /* * Run through list of hooks for output packets. */ error = pfil_run_hooks(inet6_pfil_hook, &m, ifp, PFIL_OUT); if (error != 0 || m == NULL) { IP6_STATINC(IP6_STAT_PFILDROP_OUT); goto done; } ip6 = mtod(m, struct ip6_hdr *); /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. * * the logic here is rather complex: * 1: normal case (dontfrag == 0, alwaysfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu * * 3: if we always need to attach fragment header (alwaysfrag == 1) * always fragment * * 4: if dontfrag == 1 && alwaysfrag == 1 * error, as we cannot handle this conflicting request */ tlen = m->m_pkthdr.len; tso = (m->m_pkthdr.csum_flags & M_CSUM_TSOv6) != 0; if (opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) dontfrag = 1; else dontfrag = 0; if (dontfrag && alwaysfrag) { /* case 4 */ /* conflicting request - can't transmit */ IP6_STATINC(IP6_STAT_CANTFRAG); error = EMSGSIZE; goto bad; } if (dontfrag && (!tso && tlen > ifp->if_mtu)) { /* case 2-b */ /* * Even if the DONTFRAG option is specified, we cannot send the * packet when the data length is larger than the MTU of the * outgoing interface. * Notify the error by sending IPV6_PATHMTU ancillary data as * well as returning an error code (the latter is not described * in the API spec.) */ u_int32_t mtu32; struct ip6ctlparam ip6cp; mtu32 = (u_int32_t)mtu; memset(&ip6cp, 0, sizeof(ip6cp)); ip6cp.ip6c_cmdarg = (void *)&mtu32; pfctlinput2(PRC_MSGSIZE, rtcache_getdst(ro_pmtu), &ip6cp); IP6_STATINC(IP6_STAT_CANTFRAG); error = EMSGSIZE; goto bad; } /* * transmit packet without fragmentation */ if (dontfrag || (!alwaysfrag && (tlen <= mtu || tso))) { /* case 1-a and 2-a */ struct in6_ifaddr *ia6; int sw_csum; int s; ip6 = mtod(m, struct ip6_hdr *); s = pserialize_read_enter(); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* Record statistics for this interface address. */ ia6->ia_ifa.ifa_data.ifad_outbytes += m->m_pkthdr.len; } pserialize_read_exit(s); sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx; if ((sw_csum & (M_CSUM_UDPv6|M_CSUM_TCPv6)) != 0) { if (IN6_NEED_CHECKSUM(ifp, sw_csum & (M_CSUM_UDPv6|M_CSUM_TCPv6))) { in6_undefer_cksum_tcpudp(m); } m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6); } KASSERT(dst != NULL); if (__predict_false(sw_csum & M_CSUM_TSOv6)) { /* * TSO6 is required by a packet, but disabled for * the interface. 
*/ error = ip6_tso_output(ifp, origifp, m, dst, rt); } else error = ip6_if_output(ifp, origifp, m, dst, rt); goto done; } if (tso) { IP6_STATINC(IP6_STAT_CANTFRAG); /* XXX */ error = EINVAL; /* XXX */ goto bad; } /* * try to fragment the packet. case 1-b and 3 */ if (mtu < IPV6_MMTU) { /* path MTU cannot be less than IPV6_MMTU */ IP6_STATINC(IP6_STAT_CANTFRAG); error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ IP6_STATINC(IP6_STAT_CANTFRAG); error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { const uint32_t id = ip6_randomid(); struct mbuf **mnext, *m_frgpart; const int hlen = unfragpartlen; struct ip6_frag *ip6f; u_char nextproto; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; /* * Must be able to put at least 8 bytes per fragment. */ len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { IP6_STATINC(IP6_STAT_CANTFRAG); error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } mnext = &m->m_nextpkt; /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } if ((m->m_pkthdr.csum_flags & (M_CSUM_UDPv6|M_CSUM_TCPv6)) != 0) { if (IN6_NEED_CHECKSUM(ifp, m->m_pkthdr.csum_flags & (M_CSUM_UDPv6|M_CSUM_TCPv6))) { in6_undefer_cksum_tcpudp(m); } m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6); } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. 
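 * (Worked example: with a 1500-byte path MTU and no unfragmentable
 * extension headers, hlen is 40 (the bare IPv6 header), so each
 * fragment is the 40-byte IPv6 header, the 8-byte fragment header and
 * up to len = (1500 - 40 - 8) & ~7 = 1448 bytes of payload.)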
*/ m0 = m; for (off = hlen; off < tlen; off += len) { struct mbuf *mlast; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (!m) { error = ENOBUFS; IP6_STATINC(IP6_STAT_ODROPPED); goto sendorfree; } m_reset_rcvif(m); m->m_flags = m0->m_flags & M_COPYFLAGS; *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); ip6f = NULL; error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { IP6_STATINC(IP6_STAT_ODROPPED); goto sendorfree; } /* Fill in the Frag6 Header */ ip6f->ip6f_offlg = htons((u_int16_t)((off - hlen) & ~7)); if (off + len >= tlen) len = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; mhip6->ip6_plen = htons((u_int16_t)(len + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copym(m0, off, len, M_DONTWAIT)) == NULL) { error = ENOBUFS; IP6_STATINC(IP6_STAT_ODROPPED); goto sendorfree; } for (mlast = m; mlast->m_next; mlast = mlast->m_next) ; mlast->m_next = m_frgpart; m->m_pkthdr.len = len + hlen + sizeof(*ip6f); m_reset_rcvif(m); IP6_STATINC(IP6_STAT_OFRAGMENTS); in6_ifstat_inc(ifp, ifs6_out_fragcreat); } in6_ifstat_inc(ifp, ifs6_out_fragok); } sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (m0 = m; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { struct in6_ifaddr *ia6; int s; ip6 = mtod(m, struct ip6_hdr *); s = pserialize_read_enter(); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* * Record statistics for this interface * address. */ ia6->ia_ifa.ifa_data.ifad_outbytes += m->m_pkthdr.len; } pserialize_read_exit(s); KASSERT(dst != NULL); error = ip6_if_output(ifp, origifp, m, dst, rt); } else m_freem(m); } if (error == 0) IP6_STATINC(IP6_STAT_FRAGMENTED); done: rtcache_unref(rt, ro); if (ro == &ip6route) rtcache_free(&ip6route); #ifdef IPSEC if (sp != NULL) KEY_SP_UNREF(&sp); #endif if_put(ifp, &psref); if (release_psref_ia) if_put(origifp, &psref_ia); curlwp_bindx(bound); return error; freehdrs: m_freem(exthdrs.ip6e_hbh); m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: m_freem(m); goto done; badscope: IP6_STATINC(IP6_STAT_BADSCOPE); in6_ifstat_inc(origifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ goto bad; } static int ip6_copyexthdr(struct mbuf **mp, void *hdr, int hlen) { struct mbuf *m; if (hlen > MCLBYTES) return ENOBUFS; /* XXX */ MGET(m, M_DONTWAIT, MT_DATA); if (!m) return ENOBUFS; if (hlen > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } } m->m_len = hlen; if (hdr) memcpy(mtod(m, void *), hdr, hlen); *mp = m; return 0; } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen) { struct mbuf *mopt; u_int8_t *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. 
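 * (JUMBOOPTLEN is 8 bytes: the 6-byte jumbo payload option (type
 * IP6OPT_JUMBO, length 4, and a 4-byte payload length) plus 2 bytes of
 * hop-by-hop header or PadN padding, which keeps the extension header
 * a multiple of 8 bytes.)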
*/ if (exthdrs->ip6e_hbh == NULL) { MGET(mopt, M_DONTWAIT, MT_DATA); if (mopt == 0) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_int8_t *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { const int oldoptlen = mopt->m_len; struct mbuf *n; /* * Assumptions: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ KASSERT(mopt->m_next == NULL); /* * Give up if the whole (new) hbh header does not fit * even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return ENOBUFS; /* * At this point, we must always prepare a cluster. */ MGET(n, M_DONTWAIT, MT_DATA); if (n) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return ENOBUFS; n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, void *), mtod(n, void *), oldoptlen); optbuf = mtod(n, u_int8_t *) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_int8_t *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 0; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); memcpy(&optbuf[4], &v, sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return 0; #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. * * *frghdrp will not be read, and it is guaranteed that either an * error is returned or that *frghdrp will point to space allocated * for the fragment header. * * On entry, m contains: * IPv6 Header * On exit, it contains: * IPv6 Header -> Unfragmentable Part -> Frag6 Header */ static int ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, struct ip6_frag **frghdrp) { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_DONTWAIT); if (n == NULL) return ENOBUFS; m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if ((mlast->m_flags & M_EXT) == 0 && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, char *) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; MGET(mfrg, M_DONTWAIT, MT_DATA); if (mfrg == NULL) return ENOBUFS; mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return 0; } static int ip6_getpmtu(struct rtentry *rt, struct ifnet *ifp, u_long *mtup, int *alwaysfragp) { u_int32_t mtu = 0; int alwaysfrag = 0; int error = 0; if (rt != NULL) { if (ifp == NULL) ifp = rt->rt_ifp; mtu = rt->rt_rmx.rmx_mtu; if (mtu == 0) mtu = ifp->if_mtu; else if (mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: * if we record ICMPv6 too big message with * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU * or smaller, with fragment header attached. 
* (fragment header is needed regardless from the * packet size, for translators to identify packets) */ alwaysfrag = 1; mtu = IPV6_MMTU; } else if (mtu > ifp->if_mtu) { /* * The MTU on the route is larger than the MTU on * the interface! This shouldn't happen, unless the * MTU of the interface has been changed after the * interface was brought up. Change the MTU in the * route to match the interface MTU (as long as the * field isn't locked). */ mtu = ifp->if_mtu; if (!(rt->rt_rmx.rmx_locks & RTV_MTU)) rt->rt_rmx.rmx_mtu = mtu; } } else if (ifp) { mtu = ifp->if_mtu; } else error = EHOSTUNREACH; /* XXX */ *mtup = mtu; if (alwaysfragp) *alwaysfragp = alwaysfrag; return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(int op, struct socket *so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; struct inpcb *inp = sotoinpcb(so); struct ip_moptions **mopts; int error, optval; int level, optname; KASSERT(solocked(so)); KASSERT(sopt != NULL); level = sopt->sopt_level; optname = sopt->sopt_name; error = optval = 0; uproto = (int)so->so_proto->pr_protocol; switch (level) { case IPPROTO_IP: switch (optname) { case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_MULTICAST_IF: case IP_MULTICAST_LOOP: case IP_MULTICAST_TTL: mopts = &inp->inp_moptions; switch (op) { case PRCO_GETOPT: return ip_getmoptions(*mopts, sopt); case PRCO_SETOPT: return ip_setmoptions(mopts, sopt); default: return EINVAL; } default: return ENOPROTOOPT; } case IPPROTO_IPV6: break; default: return ENOPROTOOPT; } switch (op) { case PRCO_SETOPT: switch (optname) { #ifdef RFC2292 case IPV6_2292PKTOPTIONS: error = ip6_pcbopts(&in6p_outputopts(inp), so, sopt); break; #endif /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: error = kauth_authorize_network( kauth_cred_get(), KAUTH_NETWORK_IPV6, KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL); if (error) break; /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: case IPV6_FAITH: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_V6ONLY: case IPV6_BINDANY: error = sockopt_getint(sopt, &optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ in6p_hops6(inp) = optval; } break; #define OPTSET(bit) \ do { \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ } while (/*CONSTCOND*/ 0) #ifdef RFC2292 #define OPTSET2292(bit) \ do { \ inp->inp_flags |= IN6P_RFC2292; \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ } while (/*CONSTCOND*/ 0) #endif #define OPTBIT(bit) (inp->inp_flags & (bit) ? 
1 : 0) case IPV6_RECVPKTINFO: #ifdef RFC2292 /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } #endif OPTSET(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; #ifdef RFC2292 /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } #endif optp = &in6p_outputopts(inp); error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, kauth_cred_get(), uproto); break; } case IPV6_RECVHOPLIMIT: #ifdef RFC2292 /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } #endif OPTSET(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: #ifdef RFC2292 /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } #endif OPTSET(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: #ifdef RFC2292 /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } #endif OPTSET(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: #ifdef RFC2292 /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } #endif OPTSET(IN6P_RTHDRDSTOPTS); break; case IPV6_RECVRTHDR: #ifdef RFC2292 /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } #endif OPTSET(IN6P_RTHDR); break; case IPV6_FAITH: OPTSET(IN6P_FAITH); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) */ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) { error = EINVAL; break; } #ifdef INET6_BINDV6ONLY if (!optval) error = EINVAL; #else OPTSET(IN6P_IPV6_V6ONLY); #endif break; case IPV6_RECVTCLASS: #ifdef RFC2292 /* cannot mix with RFC2292 XXX */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } #endif OPTSET(IN6P_TCLASS); break; case IPV6_BINDANY: error = kauth_authorize_network( kauth_cred_get(), KAUTH_NETWORK_BIND, KAUTH_REQ_NETWORK_BIND_ANYADDR, so, NULL, NULL); if (error) break; OPTSET(IN6P_BINDANY); break; } break; case IPV6_OTCLASS: { struct ip6_pktopts **optp; u_int8_t tclass; error = sockopt_get(sopt, &tclass, sizeof(tclass)); if (error) break; optp = &in6p_outputopts(inp); error = ip6_pcbopt(optname, (u_char *)&tclass, sizeof(tclass), optp, kauth_cred_get(), uproto); break; } case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = sockopt_getint(sopt, &optval); if (error) break; { struct ip6_pktopts **optp; optp = &in6p_outputopts(inp); error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, kauth_cred_get(), uproto); break; } #ifdef RFC2292 case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: /* RFC 2292 */ error = sockopt_getint(sopt, &optval); if (error) break; switch (optname) { case IPV6_2292PKTINFO: OPTSET2292(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: OPTSET2292(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. 
*/ error = kauth_authorize_network( kauth_cred_get(), KAUTH_NETWORK_IPV6, KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL); if (error) return (error); OPTSET2292(IN6P_HOPOPTS); break; case IPV6_2292DSTOPTS: error = kauth_authorize_network( kauth_cred_get(), KAUTH_NETWORK_IPV6, KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL); if (error) return (error); OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_2292RTHDR: OPTSET2292(IN6P_RTHDR); break; } break; #endif case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: { /* new advanced API (RFC3542) */ void *optbuf; int optbuflen; struct ip6_pktopts **optp; #ifdef RFC2292 /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } #endif optbuflen = sopt->sopt_size; optbuf = malloc(optbuflen, M_IP6OPT, M_NOWAIT); if (optbuf == NULL) { error = ENOBUFS; break; } error = sockopt_get(sopt, optbuf, optbuflen); if (error) { free(optbuf, M_IP6OPT); break; } optp = &in6p_outputopts(inp); error = ip6_pcbopt(optname, optbuf, optbuflen, optp, kauth_cred_get(), uproto); free(optbuf, M_IP6OPT); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: error = ip6_setmoptions(sopt, inp); break; case IPV6_PORTRANGE: error = sockopt_getint(sopt, &optval); if (error) break; switch (optval) { case IPV6_PORTRANGE_DEFAULT: inp->inp_flags &= ~(IN6P_LOWPORT); inp->inp_flags &= ~(IN6P_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: inp->inp_flags &= ~(IN6P_LOWPORT); inp->inp_flags |= IN6P_HIGHPORT; break; case IPV6_PORTRANGE_LOW: inp->inp_flags &= ~(IN6P_HIGHPORT); inp->inp_flags |= IN6P_LOWPORT; break; default: error = EINVAL; break; } break; case IPV6_PORTALGO: error = sockopt_getint(sopt, &optval); if (error) break; error = portalgo_algo_index_select(inp, optval); break; #if defined(IPSEC) case IPV6_IPSEC_POLICY: if (ipsec_enabled) { error = ipsec_set_policy(inp, sopt->sopt_data, sopt->sopt_size, kauth_cred_get()); } else error = ENOPROTOOPT; break; #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { #ifdef RFC2292 case IPV6_2292PKTOPTIONS: /* * RFC3542 (effectively) deprecated the * semantics of the 2292-style pktoptions. * Since it was not reliable in nature (i.e., * applications had to expect the lack of some * information after all), it would make sense * to simplify this part by always returning * empty data. 
*/ break; #endif case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_FAITH: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_BINDANY: switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: optval = OPTBIT(IN6P_RTHDRDSTOPTS); break; case IPV6_UNICAST_HOPS: optval = in6p_hops6(inp); break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_FAITH: optval = OPTBIT(IN6P_FAITH); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = inp->inp_flags; if (flags & IN6P_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & IN6P_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_BINDANY: optval = OPTBIT(IN6P_BINDANY); break; } if (error) break; error = sockopt_setint(sopt, optval); break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; struct route *ro = &inp->inp_route; struct rtentry *rt; union { struct sockaddr dst; struct sockaddr_in6 dst6; } u; if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* * XXX: we dot not consider the case of source * routing, or optional information to specify * the outgoing interface. */ sockaddr_in6_init(&u.dst6, &in6p_faddr(inp), 0, 0, 0); rt = rtcache_lookup(ro, &u.dst); error = ip6_getpmtu(rt, NULL, &pmtu, NULL); rtcache_unref(rt, ro); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; memset(&mtuinfo, 0, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); if (optdatalen > MCLBYTES) return (EMSGSIZE); /* XXX */ error = sockopt_set(sopt, optdata, optdatalen); break; } #ifdef RFC2292 case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292RTHDR: case IPV6_2292DSTOPTS: switch (optname) { case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sockopt_setint(sopt, optval); break; #endif case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: case IPV6_OTCLASS: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = ip6_getpcbopt(in6p_outputopts(inp), optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: error = ip6_getmoptions(sopt, inp); break; case IPV6_PORTALGO: optval = inp->inp_portalgo; error = sockopt_setint(sopt, optval); break; #if defined(IPSEC) case IPV6_IPSEC_POLICY: if (ipsec_used) { struct mbuf *m = NULL; /* * XXX: this will return EINVAL as sopt is * empty */ error = ipsec_get_policy(inp, sopt->sopt_data, sopt->sopt_size, &m); if (!error) error = sockopt_setmbuf(sopt, m); } else error = ENOPROTOOPT; break; #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } 
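/*
 * Illustrative userland use of two of the options handled in this
 * function; this is a sketch for readers of the switch above, not code
 * from the tree.  The descriptor "s" and the peer address setup are
 * assumptions made for the example:
 *
 *	int on = 1;
 *	struct ip6_mtuinfo mi;
 *	socklen_t milen = sizeof(mi);
 *
 *	setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO, &on, sizeof(on));
 *	connect(s, (struct sockaddr *)&peer, sizeof(peer));
 *	if (getsockopt(s, IPPROTO_IPV6, IPV6_PATHMTU, &mi, &milen) == 0)
 *		printf("path MTU %u\n", (unsigned)mi.ip6m_mtu);
 *
 * IPV6_PATHMTU is only answered on a connected socket (ENOTCONN above),
 * and the reported value is clamped to IPV6_MAXPACKET.
 */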
break; } return (error); } int ip6_raw_ctloutput(int op, struct socket *so, struct sockopt *sopt) { int error = 0, optval; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct inpcb *inp = sotoinpcb(so); int level, optname; KASSERT(sopt != NULL); level = sopt->sopt_level; optname = sopt->sopt_name; if (level != IPPROTO_IPV6) { return ENOPROTOOPT; } switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6 sockets, no modification allowed for checksum * offset, permit "no change" values to help existing apps. * * XXX RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." The current * behavior does not meet RFC3542. */ switch (op) { case PRCO_SETOPT: error = sockopt_getint(sopt, &optval); if (error) break; if (optval < -1 || (optval > 0 && (optval % 2) != 0)) { /* * The API assumes non-negative even offset * values or -1 as a special value. */ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else in6p_cksum(inp) = optval; break; case PRCO_GETOPT: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else optval = in6p_cksum(inp); error = sockopt_setint(sopt, optval); break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } #ifdef RFC2292 /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(struct ip6_pktopts **pktopt, struct socket *so, struct sockopt *sopt) { struct ip6_pktopts *opt = *pktopt; struct mbuf *m; int error = 0; KASSERT(solocked(so)); /* turn off any old options. */ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, -1); } else { opt = malloc(sizeof(*opt), M_IP6OPT, M_NOWAIT); if (opt == NULL) return (ENOBUFS); } *pktopt = NULL; if (sopt == NULL || sopt->sopt_size == 0) { /* * Only turning off any previous options, regardless of * whether the opt is just created or given. */ free(opt, M_IP6OPT); return (0); } /* set options specified by user. */ m = sockopt_getmbuf(sopt); if (m == NULL) { free(opt, M_IP6OPT); return (ENOBUFS); } error = ip6_setpktopts(m, opt, NULL, kauth_cred_get(), so->so_proto->pr_protocol); m_freem(m); if (error != 0) { ip6_clearpktopts(opt, -1); /* XXX: discard all options */ free(opt, M_IP6OPT); return (error); } *pktopt = opt; return (0); } #endif /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. 
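 * For example (a sketch of correct initialization, mirroring the function
 * below):
 *
 *	struct ip6_pktopts opt;
 *
 *	ip6_initpktopts(&opt);		hlim and tclass become -1, "use default"
 *
 * A bare memset() would instead leave ip6po_hlim and ip6po_tclass at 0,
 * i.e. an explicit hop limit and traffic class of 0 rather than the
 * system defaults.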
*/ void ip6_initpktopts(struct ip6_pktopts *opt) { memset(opt, 0, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; } #define sin6tosa(sin6) ((struct sockaddr *)(sin6)) /* XXX */ static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, kauth_cred_t cred, int uproto) { struct ip6_pktopts *opt; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_NOWAIT); if (*pktopt == NULL) return (ENOBUFS); ip6_initpktopts(*pktopt); } opt = *pktopt; return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto)); } static int ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) { void *optdata = NULL; int optdatalen = 0; struct ip6_ext *ip6e; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; switch (optname) { case IPV6_PKTINFO: if (pktopt && pktopt->ip6po_pktinfo) optdata = (void *)pktopt->ip6po_pktinfo; else { /* XXX: we don't have to do this every time... */ memset(&null_pktinfo, 0, sizeof(null_pktinfo)); optdata = (void *)&null_pktinfo; } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_OTCLASS: /* XXX */ return (EINVAL); case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) optdata = (void *)&pktopt->ip6po_tclass; else optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: if (pktopt && pktopt->ip6po_hbh) { optdata = (void *)pktopt->ip6po_hbh; ip6e = (struct ip6_ext *)pktopt->ip6po_hbh; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDR: if (pktopt && pktopt->ip6po_rthdr) { optdata = (void *)pktopt->ip6po_rthdr; ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDRDSTOPTS: if (pktopt && pktopt->ip6po_dest1) { optdata = (void *)pktopt->ip6po_dest1; ip6e = (struct ip6_ext *)pktopt->ip6po_dest1; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_DSTOPTS: if (pktopt && pktopt->ip6po_dest2) { optdata = (void *)pktopt->ip6po_dest2; ip6e = (struct ip6_ext *)pktopt->ip6po_dest2; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_NEXTHOP: if (pktopt && pktopt->ip6po_nexthop) { optdata = (void *)pktopt->ip6po_nexthop; optdatalen = pktopt->ip6po_nexthop->sa_len; } break; case IPV6_USE_MIN_MTU: if (pktopt) optdata = (void *)&pktopt->ip6po_minmtu; else optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; case IPV6_PREFER_TEMPADDR: if (pktopt) optdata = (void *)&pktopt->ip6po_prefer_tempaddr; else optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif return (ENOPROTOOPT); } error = sockopt_set(sopt, optdata, optdatalen); return (error); } void ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (optname == -1 || optname == IPV6_PKTINFO) { if (pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_NEXTHOP) { 
rtcache_free(&pktopt->ip6po_nextroute); if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; rtcache_free(&pktopt->ip6po_route); } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do { \ if (src->type) { \ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait); \ if (dst->type == NULL) \ goto bad; \ memcpy(dst->type, src->type, hlen); \ } \ } while (/*CONSTCOND*/ 0) static int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait) { dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; dst->ip6po_minmtu = src->ip6po_minmtu; dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL) goto bad; memcpy(dst->ip6po_nexthop, src->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */ return (0); bad: if (dst->ip6po_pktinfo) free(dst->ip6po_pktinfo, M_IP6OPT); if (dst->ip6po_nexthop) free(dst->ip6po_nexthop, M_IP6OPT); if (dst->ip6po_hbh) free(dst->ip6po_hbh, M_IP6OPT); if (dst->ip6po_dest1) free(dst->ip6po_dest1, M_IP6OPT); if (dst->ip6po_dest2) free(dst->ip6po_dest2, M_IP6OPT); if (dst->ip6po_rthdr) free(dst->ip6po_rthdr, M_IP6OPT); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY struct ip6_pktopts * ip6_copypktopts(struct ip6_pktopts *src, int canwait) { int error; struct ip6_pktopts *dst; dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL) return (NULL); ip6_initpktopts(dst); if ((error = copypktopts(dst, src, canwait)) != 0) { free(dst, M_IP6OPT); return (NULL); } return (dst); } void ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT); } int ip6_get_membership(const struct sockopt *sopt, struct ifnet **ifp, struct psref *psref, void *v, size_t l) { struct ipv6_mreq mreq; int error; struct in6_addr *ia = &mreq.ipv6mr_multiaddr; struct in_addr *ia4 = (void *)&ia->s6_addr32[3]; error = sockopt_get(sopt, &mreq, sizeof(mreq)); if (error != 0) return error; if (IN6_IS_ADDR_UNSPECIFIED(ia)) { /* * We use the unspecified address to specify to accept * all multicast addresses. Only super user is allowed * to do this. */ if (kauth_authorize_network(kauth_cred_get(), KAUTH_NETWORK_IPV6, KAUTH_REQ_NETWORK_IPV6_JOIN_MULTICAST, NULL, NULL, NULL)) return EACCES; } else if (IN6_IS_ADDR_V4MAPPED(ia)) { // Don't bother if we are not going to use ifp. 
if (l == sizeof(*ia)) { memcpy(v, ia, l); return 0; } } else if (!IN6_IS_ADDR_MULTICAST(ia)) { return EINVAL; } /* * If no interface was explicitly specified, choose an * appropriate one according to the given multicast address. */ if (mreq.ipv6mr_interface == 0) { struct rtentry *rt; union { struct sockaddr dst; struct sockaddr_in dst4; struct sockaddr_in6 dst6; } u; struct route ro; /* * Look up the routing table for the * address, and choose the outgoing interface. * XXX: is it a good approach? */ memset(&ro, 0, sizeof(ro)); if (IN6_IS_ADDR_V4MAPPED(ia)) sockaddr_in_init(&u.dst4, ia4, 0); else sockaddr_in6_init(&u.dst6, ia, 0, 0, 0); error = rtcache_setdst(&ro, &u.dst); if (error != 0) return error; rt = rtcache_init(&ro); *ifp = rt != NULL ? if_get_byindex(rt->rt_ifp->if_index, psref) : NULL; rtcache_unref(rt, &ro); rtcache_free(&ro); } else { /* * If the interface is specified, validate it. */ *ifp = if_get_byindex(mreq.ipv6mr_interface, psref); if (*ifp == NULL) return ENXIO; /* XXX EINVAL? */ } if (sizeof(*ia) == l) memcpy(v, ia, l); else memcpy(v, ia4, l); return 0; } /* * Set the IP6 multicast options in response to user setsockopt(). */ static int ip6_setmoptions(const struct sockopt *sopt, struct inpcb *inp) { int error = 0; u_int loop, ifindex; struct ipv6_mreq mreq; struct in6_addr ia; struct ifnet *ifp; struct ip6_moptions *im6o = in6p_moptions(inp); struct in6_multi_mship *imm; KASSERT(inp_locked(inp)); if (im6o == NULL) { /* * No multicast option buffer attached to the pcb; * allocate one and initialize to default values. */ im6o = malloc(sizeof(*im6o), M_IPMOPTS, M_NOWAIT); if (im6o == NULL) return (ENOBUFS); in6p_moptions(inp) = im6o; im6o->im6o_multicast_if_index = 0; im6o->im6o_multicast_hlim = ip6_defmcasthlim; im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP; LIST_INIT(&im6o->im6o_memberships); } switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: { int s; /* * Select the interface for outgoing multicast packets. */ error = sockopt_get(sopt, &ifindex, sizeof(ifindex)); if (error != 0) break; s = pserialize_read_enter(); if (ifindex != 0) { if ((ifp = if_byindex(ifindex)) == NULL) { pserialize_read_exit(s); error = ENXIO; /* XXX EINVAL? */ break; } if ((ifp->if_flags & IFF_MULTICAST) == 0) { pserialize_read_exit(s); error = EADDRNOTAVAIL; break; } } else ifp = NULL; im6o->im6o_multicast_if_index = if_get_index(ifp); pserialize_read_exit(s); break; } case IPV6_MULTICAST_HOPS: { /* * Set the IP6 hoplimit for outgoing multicast packets. */ int optval; error = sockopt_getint(sopt, &optval); if (error != 0) break; if (optval < -1 || optval >= 256) error = EINVAL; else if (optval == -1) im6o->im6o_multicast_hlim = ip6_defmcasthlim; else im6o->im6o_multicast_hlim = optval; break; } case IPV6_MULTICAST_LOOP: /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ error = sockopt_get(sopt, &loop, sizeof(loop)); if (error != 0) break; if (loop > 1) { error = EINVAL; break; } im6o->im6o_multicast_loop = loop; break; case IPV6_JOIN_GROUP: { int bound; struct psref psref; /* * Add a multicast group membership. * Group must be a valid IP6 multicast address. 
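 *
 * From userland this corresponds to something like the following sketch
 * (the interface name "em0" and the group address are assumptions made
 * for illustration):
 *
 *	struct ipv6_mreq mreq;
 *
 *	memset(&mreq, 0, sizeof(mreq));
 *	inet_pton(AF_INET6, "ff02::1:2", &mreq.ipv6mr_multiaddr);
 *	mreq.ipv6mr_interface = if_nametoindex("em0");	0 = kernel chooses
 *	setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP, &mreq, sizeof(mreq));
 *
 * With ipv6mr_interface == 0 the code below picks the outgoing interface
 * from the routing table entry for the group address.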
*/ bound = curlwp_bind(); ifp = NULL; error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia)); if (error != 0) { KASSERT(ifp == NULL); curlwp_bindx(bound); return error; } if (IN6_IS_ADDR_V4MAPPED(&ia)) { error = ip_setmoptions(&inp->inp_moptions, sopt); goto put_break; } /* * See if we found an interface, and confirm that it * supports multicast */ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; goto put_break; } if (in6_setscope(&ia, ifp, NULL)) { error = EADDRNOTAVAIL; /* XXX: should not happen */ goto put_break; } /* * See if the membership already exists. */ LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain) { if (imm->i6mm_maddr->in6m_ifp == ifp && IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, &ia)) goto put_break; } if (imm != NULL) { error = EADDRINUSE; goto put_break; } /* * Everything looks good; add a new record to the multicast * address list for the given interface. */ imm = in6_joingroup(ifp, &ia, &error, 0); if (imm == NULL) goto put_break; LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); put_break: if_put(ifp, &psref); curlwp_bindx(bound); break; } case IPV6_LEAVE_GROUP: { /* * Drop a multicast group membership. * Group must be a valid IP6 multicast address. */ error = sockopt_get(sopt, &mreq, sizeof(mreq)); if (error != 0) break; if (IN6_IS_ADDR_V4MAPPED(&mreq.ipv6mr_multiaddr)) { error = ip_setmoptions(&inp->inp_moptions, sopt); break; } /* * If an interface address was specified, get a pointer * to its ifnet structure. */ if (mreq.ipv6mr_interface != 0) { if ((ifp = if_byindex(mreq.ipv6mr_interface)) == NULL) { error = ENXIO; /* XXX EINVAL? */ break; } } else ifp = NULL; /* Fill in the scope zone ID */ if (ifp) { if (in6_setscope(&mreq.ipv6mr_multiaddr, ifp, NULL)) { /* XXX: should not happen */ error = EADDRNOTAVAIL; break; } } else if (mreq.ipv6mr_interface != 0) { /* * XXX: This case would happens when the (positive) * index is in the valid range, but the corresponding * interface has been detached dynamically. The above * check probably avoids such case to happen here, but * we check it explicitly for safety. */ error = EADDRNOTAVAIL; break; } else { /* ipv6mr_interface == 0 */ struct sockaddr_in6 sa6_mc; /* * The API spec says as follows: * If the interface index is specified as 0, the * system may choose a multicast group membership to * drop by matching the multicast address only. * On the other hand, we cannot disambiguate the scope * zone unless an interface is provided. Thus, we * check if there's ambiguity with the default scope * zone as the last resort. */ sockaddr_in6_init(&sa6_mc, &mreq.ipv6mr_multiaddr, 0, 0, 0); error = sa6_embedscope(&sa6_mc, ip6_use_defzone); if (error != 0) break; mreq.ipv6mr_multiaddr = sa6_mc.sin6_addr; } /* * Find the membership in the membership list. */ LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain) { if ((ifp == NULL || imm->i6mm_maddr->in6m_ifp == ifp) && IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, &mreq.ipv6mr_multiaddr)) break; } if (imm == NULL) { /* Unable to resolve interface */ error = EADDRNOTAVAIL; break; } /* * Give up the multicast address record to which the * membership points. */ LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); /* in6m_ifp should not leave thanks to inp_lock */ break; } default: error = EOPNOTSUPP; break; } /* * If all options have default values, no need to keep the mbuf. 
*/ if (im6o->im6o_multicast_if_index == 0 && im6o->im6o_multicast_hlim == ip6_defmcasthlim && im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP && LIST_EMPTY(&im6o->im6o_memberships)) { free(in6p_moptions(inp), M_IPMOPTS); in6p_moptions(inp) = NULL; } return (error); } /* * Return the IP6 multicast options in response to user getsockopt(). */ static int ip6_getmoptions(struct sockopt *sopt, struct inpcb *inp) { u_int optval; int error; struct ip6_moptions *im6o = in6p_moptions(inp); switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: if (im6o == NULL || im6o->im6o_multicast_if_index == 0) optval = 0; else optval = im6o->im6o_multicast_if_index; error = sockopt_set(sopt, &optval, sizeof(optval)); break; case IPV6_MULTICAST_HOPS: if (im6o == NULL) optval = ip6_defmcasthlim; else optval = im6o->im6o_multicast_hlim; error = sockopt_set(sopt, &optval, sizeof(optval)); break; case IPV6_MULTICAST_LOOP: if (im6o == NULL) optval = IPV6_DEFAULT_MULTICAST_LOOP; else optval = im6o->im6o_multicast_loop; error = sockopt_set(sopt, &optval, sizeof(optval)); break; default: error = EOPNOTSUPP; } return (error); } /* * Discard the IP6 multicast options. */ void ip6_freemoptions(struct ip6_moptions *im6o) { struct in6_multi_mship *imm, *nimm; if (im6o == NULL) return; /* The owner of im6o (inp) should be protected by solock */ LIST_FOREACH_SAFE(imm, &im6o->im6o_memberships, i6mm_chain, nimm) { LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } free(im6o, M_IPMOPTS); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, kauth_cred_t cred, int uproto) { struct cmsghdr *cm = 0; if (control == NULL || opt == NULL) return (EINVAL); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return (EINVAL); /* XXX if cm->cmsg_len is not aligned, control->m_len can become <0 */ for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { int error; if (control->m_len < CMSG_LEN(0)) return (EINVAL); cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len < CMSG_LEN(0) || cm->cmsg_len > control->m_len) return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto); if (error) return (error); } return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. 
* We have 4 cases of combination of "sticky" and "cmsg": * "sticky=0, cmsg=0": impossible * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data * "sticky=1, cmsg=0": RFC3542 socket option * "sticky=1, cmsg=1": RFC2292 socket option */ static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, kauth_cred_t cred, int sticky, int cmsg, int uproto) { int minmtupolicy; int error; if (!sticky && !cmsg) { #ifdef DIAGNOSTIC printf("ip6_setpktopt: impossible case\n"); #endif return (EINVAL); } /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542. Conversely, * RFC3542 types should not be specified in the context of RFC2292. */ if (!cmsg) { switch (optname) { case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292NEXTHOP: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: case IPV6_2292PKTOPTIONS: return (ENOPROTOOPT); } } if (sticky && cmsg) { switch (optname) { case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_NEXTHOP: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_USE_MIN_MTU: case IPV6_DONTFRAG: case IPV6_OTCLASS: case IPV6_TCLASS: case IPV6_PREFER_TEMPADDR: /* XXX not an RFC3542 option */ return (ENOPROTOOPT); } } switch (optname) { #ifdef RFC2292 case IPV6_2292PKTINFO: #endif case IPV6_PKTINFO: { struct in6_pktinfo *pktinfo; if (len != sizeof(struct in6_pktinfo)) return (EINVAL); pktinfo = (struct in6_pktinfo *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by * doing a "regular" setsockopt with ipi6_addr being * in6addr_any and ipi6_ifindex being zero. * [RFC 3542, Section 6] */ if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 && IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ip6_clearpktopts(opt, optname); break; } if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { return (EINVAL); } /* Validate the interface index if specified. */ if (pktinfo->ipi6_ifindex) { struct ifnet *ifp; int s = pserialize_read_enter(); ifp = if_byindex(pktinfo->ipi6_ifindex); if (ifp == NULL) { pserialize_read_exit(s); return ENXIO; } pserialize_read_exit(s); } /* * We store the address anyway, and let in6_selectsrc() * validate the specified address. This is because ipi6_addr * may not have enough information about its scope zone, and * we may need additional information (such as outgoing * interface or the scope zone of a destination address) to * disambiguate the scope. * XXX: the delay of the validation may confuse the * application when it is used as a sticky option. */ if (opt->ip6po_pktinfo == NULL) { opt->ip6po_pktinfo = malloc(sizeof(*pktinfo), M_IP6OPT, M_NOWAIT); if (opt->ip6po_pktinfo == NULL) return (ENOBUFS); } memcpy(opt->ip6po_pktinfo, pktinfo, sizeof(*pktinfo)); break; } #ifdef RFC2292 case IPV6_2292HOPLIMIT: #endif case IPV6_HOPLIMIT: { int *hlimp; /* * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT * to simplify the ordering among hoplimit options. 
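 *
 * The RFC3542 way to set a per-packet hop limit is ancillary data on
 * sendmsg(2); a minimal userland sketch (socket "s" is assumed, and the
 * msg_name/msg_iov setup of "msg" is omitted):
 *
 *	struct msghdr msg;
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct cmsghdr *cm;
 *	int hlim = 1;
 *
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	cm = CMSG_FIRSTHDR(&msg);
 *	cm->cmsg_level = IPPROTO_IPV6;
 *	cm->cmsg_type = IPV6_HOPLIMIT;
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &hlim, sizeof(hlim));
 *	sendmsg(s, &msg, 0);
 *
 * A per-socket default remains available through IPV6_UNICAST_HOPS.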
*/ if (optname == IPV6_HOPLIMIT && sticky) return (ENOPROTOOPT); if (len != sizeof(int)) return (EINVAL); hlimp = (int *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); opt->ip6po_hlim = *hlimp; break; } case IPV6_OTCLASS: if (len != sizeof(u_int8_t)) return (EINVAL); opt->ip6po_tclass = *(u_int8_t *)buf; break; case IPV6_TCLASS: { int tclass; if (len != sizeof(int)) return (EINVAL); tclass = *(int *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); opt->ip6po_tclass = tclass; break; } #ifdef RFC2292 case IPV6_2292NEXTHOP: #endif case IPV6_NEXTHOP: error = kauth_authorize_network(cred, KAUTH_NETWORK_IPV6, KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL); if (error) return (error); if (len == 0) { /* just remove the option */ ip6_clearpktopts(opt, IPV6_NEXTHOP); break; } /* check if cmsg_len is large enough for sa_len */ if (len < sizeof(struct sockaddr) || len < *buf) return (EINVAL); switch (((struct sockaddr *)buf)->sa_family) { case AF_INET6: { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; if (sa6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { return (EINVAL); } if ((error = sa6_embedscope(sa6, ip6_use_defzone)) != 0) { return (error); } break; } case AF_LINK: /* eventually be supported? */ default: return (EAFNOSUPPORT); } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_NEXTHOP); opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT); if (opt->ip6po_nexthop == NULL) return (ENOBUFS); memcpy(opt->ip6po_nexthop, buf, *buf); break; #ifdef RFC2292 case IPV6_2292HOPOPTS: #endif case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; /* * XXX: We don't allow a non-privileged user to set ANY HbH * options, since per-option restriction has too much * overhead. */ error = kauth_authorize_network(cred, KAUTH_NETWORK_IPV6, KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL); if (error) return (error); if (len == 0) { ip6_clearpktopts(opt, IPV6_HOPOPTS); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); hbh = (struct ip6_hbh *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_HOPOPTS); opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_hbh == NULL) return (ENOBUFS); memcpy(opt->ip6po_hbh, hbh, hbhlen); break; } #ifdef RFC2292 case IPV6_2292DSTOPTS: #endif case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { struct ip6_dest *dest, **newdest = NULL; int destlen; /* XXX: see the comment for IPV6_HOPOPTS */ error = kauth_authorize_network(cred, KAUTH_NETWORK_IPV6, KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL); if (error) return (error); if (len == 0) { ip6_clearpktopts(opt, optname); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); dest = (struct ip6_dest *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); /* * Determine the position that the destination options header * should be inserted; before or after the routing header. */ switch (optname) { case IPV6_2292DSTOPTS: /* * The old advanced API is ambiguous on this point. * Our approach is to determine the position based * according to the existence of a routing header. 
* Note, however, that this depends on the order of the * extension headers in the ancillary data; the 1st * part of the destination options header must appear * before the routing header in the ancillary data, * too. * RFC3542 solved the ambiguity by introducing * separate ancillary data or option types. */ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; break; case IPV6_RTHDRDSTOPTS: newdest = &opt->ip6po_dest1; break; case IPV6_DSTOPTS: newdest = &opt->ip6po_dest2; break; } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, optname); *newdest = malloc(destlen, M_IP6OPT, M_NOWAIT); if (*newdest == NULL) return (ENOBUFS); memcpy(*newdest, dest, destlen); break; } #ifdef RFC2292 case IPV6_2292RTHDR: #endif case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (len == 0) { ip6_clearpktopts(opt, IPV6_RTHDR); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); rth = (struct ip6_rthdr *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: /* Dropped, RFC5095. */ default: return (EINVAL); /* not supported */ } /* turn off the previous option */ ip6_clearpktopts(opt, IPV6_RTHDR); opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_rthdr == NULL) return (ENOBUFS); memcpy(opt->ip6po_rthdr, rth, rthlen); break; } case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); minmtupolicy = *(int *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { return (EINVAL); } opt->ip6po_minmtu = minmtupolicy; break; case IPV6_DONTFRAG: if (len != sizeof(int)) return (EINVAL); if (uproto == IPPROTO_TCP || *(int *)buf == 0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) */ opt->ip6po_flags &= ~IP6PO_DONTFRAG; } else opt->ip6po_flags |= IP6PO_DONTFRAG; break; case IPV6_PREFER_TEMPADDR: { int preftemp; if (len != sizeof(int)) return (EINVAL); preftemp = *(int *)buf; switch (preftemp) { case IP6PO_TEMPADDR_SYSTEM: case IP6PO_TEMPADDR_NOTPREFER: case IP6PO_TEMPADDR_PREFER: break; default: return (EINVAL); } opt->ip6po_prefer_tempaddr = preftemp; break; } default: return (ENOPROTOOPT); } /* end of switch */ return (0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be lo0ifp -- easier than replicating that code here. */ void ip6_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in6 *dst) { struct mbuf *copym; struct ip6_hdr *ip6; copym = m_copypacket(m, M_DONTWAIT); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if ((copym->m_flags & M_EXT) != 0 || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } #ifdef DIAGNOSTIC if (copym->m_len < sizeof(*ip6)) { m_freem(copym); return; } #endif ip6 = mtod(copym, struct ip6_hdr *); /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. 
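 *
 * (In the KAME-derived code the zone is embedded in the address itself
 * while inside the kernel; e.g. a link-local fe80::1 on zone 2 is held
 * as fe80:2::1, and in6_clearscope() below zeroes that embedded word
 * again before the copy is handed to looutput().)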
*/ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); (void)looutput(ifp, copym, (const struct sockaddr *)dst, NULL); } /* * Chop IPv6 header off from the payload. */ static int ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { MGETHDR(mh, M_DONTWAIT, MT_HEADER); if (mh == NULL) { m_freem(m); return ENOBUFS; } m_move_pkthdr(mh, m); m_align(mh, sizeof(*ip6)); m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; mh->m_len = sizeof(*ip6); memcpy(mtod(mh, void *), (void *)ip6, sizeof(*ip6)); m = mh; } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(struct inpcb *inp) { int len; if (!in6p_outputopts(inp)) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p_outputopts(inp)->ip6po_hbh); len += elen(in6p_outputopts(inp)->ip6po_dest1); len += elen(in6p_outputopts(inp)->ip6po_rthdr); len += elen(in6p_outputopts(inp)->ip6po_dest2); return len; #undef elen } /* * Ensure sending address is valid. * Returns 0 on success, -1 if an error should be sent back or 1 * if the packet could be dropped without error (protocol dependent). */ static int ip6_ifaddrvalid(const struct in6_addr *src, const struct in6_addr *dst) { struct sockaddr_in6 sin6; int s, error; struct ifaddr *ifa; struct in6_ifaddr *ia6; if (IN6_IS_ADDR_UNSPECIFIED(src)) return 0; memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = *src; s = pserialize_read_enter(); ifa = ifa_ifwithaddr(sin6tosa(&sin6)); if ((ia6 = ifatoia6(ifa)) == NULL || ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED)) error = -1; else if (ia6->ia6_flags & IN6_IFF_TENTATIVE) error = 1; else if (ia6->ia6_flags & IN6_IFF_DETACHED && (sin6.sin6_addr = *dst, ifa_ifwithaddr(sin6tosa(&sin6)) == NULL)) /* Allow internal traffic to DETACHED addresses */ error = 1; else error = 0; pserialize_read_exit(s); return error; }
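/*
 * A note on the length arithmetic used by elen() above (and by
 * ip6_getpcbopt() earlier in this file): ip6e_len counts the extension
 * header's size in 8-byte units, excluding the first 8 bytes, so
 *
 *	(ip6e_len + 1) << 3
 *
 * is the size in bytes; e.g. ip6e_len == 0 means an 8-byte header and
 * ip6e_len == 1 means 16 bytes.
 */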
/* $NetBSD: tcp_syncache.c,v 1.6 2022/11/04 09:01:53 ozaki-r Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). */ /*- * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006, * 2011 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Coyote Point Systems, Inc. * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation * Facility, NASA Ames Research Center. * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * This code is derived from software contributed to The NetBSD Foundation * by Rui Paulo. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ /* * TODO list for SYN cache stuff: * * Find room for a "state" field, which is needed to keep a * compressed state for TIME_WAIT TCBs. It's been noted already * that this is fairly important for very high-volume web and * mail servers, which use a large number of short-lived * connections. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.6 2022/11/04 09:01:53 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_ipsec.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/errno.h> #include <sys/syslog.h> #include <sys/pool.h> #include <sys/domain.h> #include <sys/kernel.h> #include <sys/lwp.h> /* for lwp0 */ #include <sys/cprng.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/in_var.h> #include <netinet/ip_var.h> #include <netinet/ip6.h> #ifdef INET6 #include <netinet6/ip6_var.h> #include <netinet6/in6_pcb.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_var.h> #endif #include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_private.h> #include <netinet/tcp_syncache.h> #ifdef TCP_SIGNATURE #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/key.h> #ifdef INET6 #include <netipsec/ipsec6.h> #endif #endif /* IPSEC*/ #endif static void syn_cache_timer(void *); static struct syn_cache * syn_cache_lookup(const struct sockaddr *, const struct sockaddr *, struct syn_cache_head **); static int syn_cache_respond(struct syn_cache *); /* syn hash parameters */ #define TCP_SYN_HASH_SIZE 293 #define TCP_SYN_BUCKET_SIZE 35 static int tcp_syn_cache_size = TCP_SYN_HASH_SIZE; int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; static struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE]; /* * TCP compressed state engine. Currently used to hold compressed * state for SYN_RECEIVED. */ u_long syn_cache_count; static u_int32_t syn_hash1, syn_hash2; #define SYN_HASH(sa, sp, dp) \ ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ ((u_int32_t)(sp)))^syn_hash2))) #ifndef INET6 #define SYN_HASHALL(hash, src, dst) \ do { \ hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ ((const struct sockaddr_in *)(src))->sin_port, \ ((const struct sockaddr_in *)(dst))->sin_port); \ } while (/*CONSTCOND*/ 0) #else #define SYN_HASH6(sa, sp, dp) \ ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ & 0x7fffffff) #define SYN_HASHALL(hash, src, dst) \ do { \ switch ((src)->sa_family) { \ case AF_INET: \ hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ ((const struct sockaddr_in *)(src))->sin_port, \ ((const struct sockaddr_in *)(dst))->sin_port); \ break; \ case AF_INET6: \ hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \ ((const struct sockaddr_in6 *)(src))->sin6_port, \ ((const struct sockaddr_in6 *)(dst))->sin6_port); \ break; \ default: \ hash = 0; \ } \ } while (/*CONSTCOND*/0) #endif /* INET6 */ static struct pool syn_cache_pool; /* * We don't estimate RTT with SYNs, so each packet starts with the default * RTT and each timer step has a fixed timeout value. 
*/ static inline void syn_cache_timer_arm(struct syn_cache *sc) { TCPT_RANGESET(sc->sc_rxtcur, TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, TCPTV_REXMTMAX); callout_reset(&sc->sc_timer, sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc); } #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase) static inline void syn_cache_rm(struct syn_cache *sc) { TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, sc, sc_bucketq); sc->sc_tp = NULL; LIST_REMOVE(sc, sc_tpq); tcp_syn_cache[sc->sc_bucketidx].sch_length--; callout_stop(&sc->sc_timer); syn_cache_count--; } static inline void syn_cache_put(struct syn_cache *sc) { if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); rtcache_free(&sc->sc_route); sc->sc_flags |= SCF_DEAD; if (!callout_invoking(&sc->sc_timer)) callout_schedule(&(sc)->sc_timer, 1); } void syn_cache_init(void) { int i; pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, "synpl", NULL, IPL_SOFTNET); /* Initialize the hash buckets. */ for (i = 0; i < tcp_syn_cache_size; i++) TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); } void syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) { struct syn_cache_head *scp; struct syn_cache *sc2; int s; /* * If there are no entries in the hash table, reinitialize * the hash secrets. */ if (syn_cache_count == 0) { syn_hash1 = cprng_fast32(); syn_hash2 = cprng_fast32(); } SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; scp = &tcp_syn_cache[sc->sc_bucketidx]; /* * Make sure that we don't overflow the per-bucket * limit or the total cache size limit. */ s = splsoftnet(); if (scp->sch_length >= tcp_syn_bucket_limit) { TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW); /* * The bucket is full. Toss the oldest element in the * bucket. This will be the first entry in the bucket. */ sc2 = TAILQ_FIRST(&scp->sch_bucket); #ifdef DIAGNOSTIC /* * This should never happen; we should always find an * entry in our bucket. */ if (sc2 == NULL) panic("syn_cache_insert: bucketoverflow: impossible"); #endif syn_cache_rm(sc2); syn_cache_put(sc2); /* calls pool_put but see spl above */ } else if (syn_cache_count >= tcp_syn_cache_limit) { struct syn_cache_head *scp2, *sce; TCP_STATINC(TCP_STAT_SC_OVERFLOWED); /* * The cache is full. Toss the oldest entry in the * first non-empty bucket we can find. * * XXX We would really like to toss the oldest * entry in the cache, but we hope that this * condition doesn't happen very often. */ scp2 = scp; if (TAILQ_EMPTY(&scp2->sch_bucket)) { sce = &tcp_syn_cache[tcp_syn_cache_size]; for (++scp2; scp2 != scp; scp2++) { if (scp2 >= sce) scp2 = &tcp_syn_cache[0]; if (! TAILQ_EMPTY(&scp2->sch_bucket)) break; } #ifdef DIAGNOSTIC /* * This should never happen; we should always find a * non-empty bucket. */ if (scp2 == scp) panic("syn_cache_insert: cacheoverflow: " "impossible"); #endif } sc2 = TAILQ_FIRST(&scp2->sch_bucket); syn_cache_rm(sc2); syn_cache_put(sc2); /* calls pool_put but see spl above */ } /* * Initialize the entry's timer. */ sc->sc_rxttot = 0; sc->sc_rxtshift = 0; syn_cache_timer_arm(sc); /* Link it from tcpcb entry */ LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); /* Put it into the bucket. */ TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); scp->sch_length++; syn_cache_count++; TCP_STATINC(TCP_STAT_SC_ADDED); splx(s); } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire * that entry. 
*/ static void syn_cache_timer(void *arg) { struct syn_cache *sc = arg; mutex_enter(softnet_lock); KERNEL_LOCK(1, NULL); callout_ack(&sc->sc_timer); if (__predict_false(sc->sc_flags & SCF_DEAD)) { TCP_STATINC(TCP_STAT_SC_DELAYED_FREE); goto free; } if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { /* Drop it -- too many retransmissions. */ goto dropit; } /* * Compute the total amount of time this entry has * been on a queue. If this entry has been on longer * than the keep alive timer would allow, expire it. */ sc->sc_rxttot += sc->sc_rxtcur; if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS)) goto dropit; TCP_STATINC(TCP_STAT_SC_RETRANSMITTED); (void)syn_cache_respond(sc); /* Advance the timer back-off. */ sc->sc_rxtshift++; syn_cache_timer_arm(sc); goto out; dropit: TCP_STATINC(TCP_STAT_SC_TIMED_OUT); syn_cache_rm(sc); if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); rtcache_free(&sc->sc_route); free: callout_destroy(&sc->sc_timer); pool_put(&syn_cache_pool, sc); out: KERNEL_UNLOCK_ONE(NULL); mutex_exit(softnet_lock); } /* * Remove syn cache created by the specified tcb entry, * because this does not make sense to keep them * (if there's no tcb entry, syn cache entry will never be used) */ void syn_cache_cleanup(struct tcpcb *tp) { struct syn_cache *sc, *nsc; int s; s = splsoftnet(); for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { nsc = LIST_NEXT(sc, sc_tpq); #ifdef DIAGNOSTIC if (sc->sc_tp != tp) panic("invalid sc_tp in syn_cache_cleanup"); #endif syn_cache_rm(sc); syn_cache_put(sc); /* calls pool_put but see spl above */ } /* just for safety */ LIST_INIT(&tp->t_sc); splx(s); } /* * Find an entry in the syn cache. */ static struct syn_cache * syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst, struct syn_cache_head **headp) { struct syn_cache *sc; struct syn_cache_head *scp; u_int32_t hash; int s; SYN_HASHALL(hash, src, dst); scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; *headp = scp; s = splsoftnet(); for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL; sc = TAILQ_NEXT(sc, sc_bucketq)) { if (sc->sc_hash != hash) continue; if (!memcmp(&sc->sc_src, src, src->sa_len) && !memcmp(&sc->sc_dst, dst, dst->sa_len)) { splx(s); return (sc); } } splx(s); return (NULL); } /* * This function gets called when we receive an ACK for a socket in the * LISTEN state. We look up the connection in the syn cache, and if it's * there, we pull it out of the cache and turn it into a full-blown * connection in the SYN-RECEIVED state. * * The return values may not be immediately obvious, and their effects * can be subtle, so here they are: * * NULL SYN was not found in cache; caller should drop the * packet and send an RST. * * -1 We were unable to create the new connection, and are * aborting it. An ACK,RST is being sent to the peer * (unless we got screwey sequence numbers; see below), * because the 3-way handshake has been completed. Caller * should not free the mbuf, since we may be using it. If * we are not, we will free it. * * Otherwise, the return value is a pointer to the new socket * associated with the connection. */ struct socket * syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, struct socket *so, struct mbuf *m) { struct syn_cache *sc; struct syn_cache_head *scp; struct inpcb *inp = NULL; struct tcpcb *tp; int s; struct socket *oso; s = splsoftnet(); if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { splx(s); return NULL; } /* * Verify the sequence and ack numbers. Try getting the correct * response again. 
*/ if ((th->th_ack != sc->sc_iss + 1) || SEQ_LEQ(th->th_seq, sc->sc_irs) || SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { m_freem(m); (void)syn_cache_respond(sc); splx(s); return ((struct socket *)(-1)); } /* Remove this cache entry */ syn_cache_rm(sc); splx(s); /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. If we can't create * the connection, abort it. */ /* * inp still has the OLD in_pcb stuff, set the * v6-related flags on the new guy, too. This is * done particularly for the case where an AF_INET6 * socket is bound only to a port, and a v4 connection * comes in on that port. * we also copy the flowinfo from the original pcb * to the new one. */ oso = so; so = sonewconn(so, true); if (so == NULL) goto resetandabort; inp = sotoinpcb(so); switch (src->sa_family) { case AF_INET: if (inp->inp_af == AF_INET) { in4p_laddr(inp) = ((struct sockaddr_in *)dst)->sin_addr; inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port; inp->inp_options = ip_srcroute(m); inpcb_set_state(inp, INP_BOUND); if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } } #ifdef INET6 else if (inp->inp_af == AF_INET6) { /* IPv4 packet to AF_INET6 socket */ memset(&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp))); in6p_laddr(inp).s6_addr16[5] = htons(0xffff); bcopy(&((struct sockaddr_in *)dst)->sin_addr, &in6p_laddr(inp).s6_addr32[3], sizeof(((struct sockaddr_in *)dst)->sin_addr)); inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port; intotcpcb(inp)->t_family = AF_INET; if (sotoinpcb(oso)->inp_flags & IN6P_IPV6_V6ONLY) inp->inp_flags |= IN6P_IPV6_V6ONLY; else inp->inp_flags &= ~IN6P_IPV6_V6ONLY; inpcb_set_state(inp, INP_BOUND); } #endif break; #ifdef INET6 case AF_INET6: if (inp->inp_af == AF_INET6) { in6p_laddr(inp) = ((struct sockaddr_in6 *)dst)->sin6_addr; inp->inp_lport = ((struct sockaddr_in6 *)dst)->sin6_port; inpcb_set_state(inp, INP_BOUND); } break; #endif } #ifdef INET6 if (inp && intotcpcb(inp)->t_family == AF_INET6 && sotoinpcb(oso)) { struct inpcb *oinp = sotoinpcb(oso); /* inherit socket options from the listening socket */ inp->inp_flags |= (oinp->inp_flags & IN6P_CONTROLOPTS); if (inp->inp_flags & IN6P_CONTROLOPTS) { m_freem(inp->inp_options); inp->inp_options = NULL; } ip6_savecontrol(inp, &inp->inp_options, mtod(m, struct ip6_hdr *), m); } #endif /* * Give the new socket our cached route reference. 
*/ rtcache_copy(&inp->inp_route, &sc->sc_route); rtcache_free(&sc->sc_route); if (inp->inp_af == AF_INET) { struct sockaddr_in sin; memcpy(&sin, src, src->sa_len); if (inpcb_connect(inp, &sin, &lwp0)) { goto resetandabort; } } #ifdef INET6 else if (inp->inp_af == AF_INET6) { struct sockaddr_in6 sin6; memcpy(&sin6, src, src->sa_len); if (src->sa_family == AF_INET) { /* IPv4 packet to AF_INET6 socket */ in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6); } if (in6pcb_connect(inp, &sin6, NULL)) { goto resetandabort; } } #endif else { goto resetandabort; } tp = intotcpcb(inp); tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY; if (sc->sc_request_r_scale != 15) { tp->requested_s_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_request_r_scale; tp->snd_scale = sc->sc_requested_s_scale; tp->rcv_scale = sc->sc_request_r_scale; tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; } if (sc->sc_flags & SCF_TIMESTAMP) tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_timebase = sc->sc_timebase; tp->t_template = tcp_template(tp); if (tp->t_template == 0) { tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ so = NULL; m_freem(m); goto abort; } tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_sendseqinit(tp); tcp_rcvseqinit(tp); tp->t_state = TCPS_SYN_RECEIVED; TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit); TCP_STATINC(TCP_STAT_ACCEPTS); if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack) tp->t_flags |= TF_WILL_SACK; if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn) tp->t_flags |= TF_ECN_PERMIT; #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif /* Initialize tp->t_ourmss before we deal with the peer's! */ tp->t_ourmss = sc->sc_ourmaxseg; tcp_mss_from_peer(tp, sc->sc_peermaxseg); /* * Initialize the initial congestion window. If we * had to retransmit the SYN,ACK, we must initialize cwnd * to 1 segment (i.e. the Loss Window). */ if (sc->sc_rxtshift) tp->snd_cwnd = tp->t_peermss; else { int ss = tcp_init_win; if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp))) ss = tcp_init_win_local; #ifdef INET6 else if (inp->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(inp))) ss = tcp_init_win_local; #endif tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss); } tcp_rmx_rtt(tp); tp->snd_wl1 = sc->sc_irs; tp->rcv_up = sc->sc_irs + 1; /* * This is what would have happened in tcp_output() when * the SYN,ACK was sent. */ tp->snd_up = tp->snd_una; tp->snd_max = tp->snd_nxt = tp->iss+1; TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + sc->sc_win; tp->last_ack_sent = tp->rcv_nxt; tp->t_partialacks = -1; tp->t_dupacks = 0; TCP_STATINC(TCP_STAT_SC_COMPLETED); s = splsoftnet(); syn_cache_put(sc); splx(s); return so; resetandabort: (void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); abort: if (so != NULL) { (void) soqremque(so, 1); (void) soabort(so); mutex_enter(softnet_lock); } s = splsoftnet(); syn_cache_put(sc); splx(s); TCP_STATINC(TCP_STAT_SC_ABORTED); return ((struct socket *)(-1)); } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. 
*/ void syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th) { struct syn_cache *sc; struct syn_cache_head *scp; int s = splsoftnet(); if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { splx(s); return; } if (SEQ_LT(th->th_seq, sc->sc_irs) || SEQ_GT(th->th_seq, sc->sc_irs+1)) { splx(s); return; } syn_cache_rm(sc); TCP_STATINC(TCP_STAT_SC_RESET); syn_cache_put(sc); /* calls pool_put but see spl above */ splx(s); } void syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst, struct tcphdr *th) { struct syn_cache *sc; struct syn_cache_head *scp; int s; s = splsoftnet(); if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { splx(s); return; } /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th->th_seq) != sc->sc_iss) { splx(s); return; } /* * If we've retransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { sc->sc_flags |= SCF_UNREACH; splx(s); return; } syn_cache_rm(sc); TCP_STATINC(TCP_STAT_SC_UNREACH); syn_cache_put(sc); /* calls pool_put but see spl above */ splx(s); } /* * Given a LISTEN socket and an inbound SYN request, add this to the syn * cache, and send back a segment: * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. */ int syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp, int optlen, struct tcp_opt_info *oi) { struct tcpcb tb, *tp; long win; struct syn_cache *sc; struct syn_cache_head *scp; struct mbuf *ipopts; int s; tp = sototcpcb(so); /* * Initialize some local state. */ win = sbspace(&so->so_rcv); if (win > TCP_MAXWIN) win = TCP_MAXWIN; #ifdef TCP_SIGNATURE if (optp || (tp->t_flags & TF_SIGNATURE)) #else if (optp) #endif { tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; #ifdef TCP_SIGNATURE tb.t_flags |= (tp->t_flags & TF_SIGNATURE); #endif tb.t_state = TCPS_LISTEN; if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0) return 0; } else tb.t_flags = 0; switch (src->sa_family) { case AF_INET: /* Remember the IP options, if any. */ ipopts = ip_srcroute(m); break; default: ipopts = NULL; } /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK. We do not count this * as a retransmission (XXX though maybe we should). */ if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { TCP_STATINC(TCP_STAT_SC_DUPESYN); if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. 
*/ if (sc->sc_ipopts) (void)m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } sc->sc_timestamp = tb.ts_recent; m_freem(m); if (syn_cache_respond(sc) == 0) { uint64_t *tcps = TCP_STAT_GETREF(); tcps[TCP_STAT_SNDACKS]++; tcps[TCP_STAT_SNDTOTAL]++; TCP_STAT_PUTREF(); } return 1; } s = splsoftnet(); sc = pool_get(&syn_cache_pool, PR_NOWAIT); splx(s); if (sc == NULL) { if (ipopts) (void)m_free(ipopts); return 0; } /* * Fill in the cache, and put the necessary IP and TCP * options into the reply. */ memset(sc, 0, sizeof(struct syn_cache)); callout_init(&sc->sc_timer, CALLOUT_MPSAFE); memcpy(&sc->sc_src, src, src->sa_len); memcpy(&sc->sc_dst, dst, dst->sa_len); sc->sc_flags = 0; sc->sc_ipopts = ipopts; sc->sc_irs = th->th_seq; switch (src->sa_family) { case AF_INET: { struct sockaddr_in *srcin = (void *)src; struct sockaddr_in *dstin = (void *)dst; sc->sc_iss = tcp_new_iss1(&dstin->sin_addr, &srcin->sin_addr, dstin->sin_port, srcin->sin_port, sizeof(dstin->sin_addr)); break; } #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *srcin6 = (void *)src; struct sockaddr_in6 *dstin6 = (void *)dst; sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr, &srcin6->sin6_addr, dstin6->sin6_port, srcin6->sin6_port, sizeof(dstin6->sin6_addr)); break; } #endif } sc->sc_peermaxseg = oi->maxseg; sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ? m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family); sc->sc_win = win; sc->sc_timebase = tcp_now - 1; /* see tcp_newtcpcb() */ sc->sc_timestamp = tb.ts_recent; if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == (TF_REQ_TSTMP|TF_RCVD_TSTMP)) sc->sc_flags |= SCF_TIMESTAMP; if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { sc->sc_requested_s_scale = tb.requested_s_scale; sc->sc_request_r_scale = 0; /* * Pick the smallest possible scaling factor that * will still allow us to scale up to sb_max. * * We do this because there are broken firewalls that * will corrupt the window scale option, leading to * the other endpoint believing that our advertised * window is unscaled. At scale factors larger than * 5 the unscaled window will drop below 1500 bytes, * leading to serious problems when traversing these * broken firewalls. * * With the default sbmax of 256K, a scale factor * of 3 will be chosen by this algorithm. Those who * choose a larger sbmax should watch out * for the compatibility problems mentioned above. * * RFC1323: The Window field in a SYN (i.e., a <SYN> * or <SYN,ACK>) segment itself is never scaled. */ while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) sc->sc_request_r_scale++; } else { sc->sc_requested_s_scale = 15; sc->sc_request_r_scale = 15; } if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack) sc->sc_flags |= SCF_SACK_PERMIT; /* * ECN setup packet received. */ if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn) sc->sc_flags |= SCF_ECN_PERMIT; #ifdef TCP_SIGNATURE if (tb.t_flags & TF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif sc->sc_tp = tp; m_freem(m); if (syn_cache_respond(sc) == 0) { uint64_t *tcps = TCP_STAT_GETREF(); tcps[TCP_STAT_SNDACKS]++; tcps[TCP_STAT_SNDTOTAL]++; TCP_STAT_PUTREF(); syn_cache_insert(sc, tp); } else { s = splsoftnet(); /* * syn_cache_put() will try to schedule the timer, so * we need to initialize it */ syn_cache_timer_arm(sc); syn_cache_put(sc); splx(s); TCP_STATINC(TCP_STAT_SC_DROPPED); } return 1; } /* * syn_cache_respond: (re)send SYN+ACK. * * Returns 0 on success. 
*/ static int syn_cache_respond(struct syn_cache *sc) { #ifdef INET6 struct rtentry *rt = NULL; #endif struct route *ro; u_int8_t *optp; int optlen, error; u_int16_t tlen; struct ip *ip = NULL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif struct tcpcb *tp; struct tcphdr *th; struct mbuf *m; u_int hlen; #ifdef TCP_SIGNATURE struct secasvar *sav = NULL; u_int8_t *sigp = NULL; #endif ro = &sc->sc_route; switch (sc->sc_src.sa.sa_family) { case AF_INET: hlen = sizeof(struct ip); break; #ifdef INET6 case AF_INET6: hlen = sizeof(struct ip6_hdr); break; #endif default: return EAFNOSUPPORT; } /* Worst case scenario, since we don't know the option size yet. */ tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN; KASSERT(max_linkhdr + tlen <= MCLBYTES); /* * Create the IP+TCP header from scratch. */ MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && (max_linkhdr + tlen) > MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m = NULL; } } if (m == NULL) return ENOBUFS; MCLAIM(m, &tcp_tx_mowner); tp = sc->sc_tp; /* Fixup the mbuf. */ m->m_data += max_linkhdr; m_reset_rcvif(m); memset(mtod(m, void *), 0, tlen); switch (sc->sc_src.sa.sa_family) { case AF_INET: ip = mtod(m, struct ip *); ip->ip_v = 4; ip->ip_dst = sc->sc_src.sin.sin_addr; ip->ip_src = sc->sc_dst.sin.sin_addr; ip->ip_p = IPPROTO_TCP; th = (struct tcphdr *)(ip + 1); th->th_dport = sc->sc_src.sin.sin_port; th->th_sport = sc->sc_dst.sin.sin_port; break; #ifdef INET6 case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; ip6->ip6_nxt = IPPROTO_TCP; /* ip6_plen will be updated in ip6_output() */ th = (struct tcphdr *)(ip6 + 1); th->th_dport = sc->sc_src.sin6.sin6_port; th->th_sport = sc->sc_dst.sin6.sin6_port; break; #endif default: panic("%s: impossible (1)", __func__); } th->th_seq = htonl(sc->sc_iss); th->th_ack = htonl(sc->sc_irs + 1); th->th_flags = TH_SYN|TH_ACK; th->th_win = htons(sc->sc_win); /* th_x2, th_sum, th_urp already 0 from memset */ /* Tack on the TCP options. */ optp = (u_int8_t *)(th + 1); optlen = 0; *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; *optp++ = sc->sc_ourmaxseg & 0xff; optlen += TCPOLEN_MAXSEG; if (sc->sc_request_r_scale != 15) { *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | sc->sc_request_r_scale); optp += TCPOLEN_WINDOW + TCPOLEN_NOP; optlen += TCPOLEN_WINDOW + TCPOLEN_NOP; } if (sc->sc_flags & SCF_SACK_PERMIT) { /* Let the peer know that we will SACK. */ *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; optlen += TCPOLEN_SACK_PERMITTED; } if (sc->sc_flags & SCF_TIMESTAMP) { while (optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; u_int32_t *lp = (u_int32_t *)(optp); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc)); *lp = htonl(sc->sc_timestamp); optp += TCPOLEN_TIMESTAMP - 2; optlen += TCPOLEN_TIMESTAMP; } #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) { sav = tcp_signature_getsav(m); if (sav == NULL) { m_freem(m); return EPERM; } *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; sigp = optp; memset(optp, 0, TCP_SIGLEN); optp += TCP_SIGLEN; optlen += TCPOLEN_SIGNATURE; } #endif /* * Terminate and pad TCP options to a 4 byte boundary. 
* * According to RFC793: "The content of the header beyond the * End-of-Option option must be header padding (i.e., zero)." * And later: "The padding is composed of zeros." */ if (optlen % 4) { optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } while (optlen % 4) { optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } /* Compute the actual values now that we've added the options. */ tlen = hlen + sizeof(struct tcphdr) + optlen; m->m_len = m->m_pkthdr.len = tlen; th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; #ifdef TCP_SIGNATURE if (sav) { (void)tcp_signature(m, th, hlen, sav, sigp); key_sa_recordxfer(sav, m); KEY_SA_UNREF(&sav); } #endif /* * Send ECN SYN-ACK setup packet. * Routes can be asymmetric, so, even if we receive a packet * with ECE and CWR set, we must not assume no one will block * the ECE packet we are about to send. */ if ((sc->sc_flags & SCF_ECN_PERMIT) && tp && SEQ_GEQ(tp->snd_nxt, tp->snd_max)) { th->th_flags |= TH_ECE; TCP_STATINC(TCP_STAT_ECN_SHS); /* * draft-ietf-tcpm-ecnsyn-00.txt * * "[...] a TCP node MAY respond to an ECN-setup * SYN packet by setting ECT in the responding * ECN-setup SYN/ACK packet, indicating to routers * that the SYN/ACK packet is ECN-Capable. * This allows a congested router along the path * to mark the packet instead of dropping the * packet as an indication of congestion." * * "[...] There can be a great benefit in setting * an ECN-capable codepoint in SYN/ACK packets [...] * Congestion is most likely to occur in * the server-to-client direction. As a result, * setting an ECN-capable codepoint in SYN/ACK * packets can reduce the occurrence of three-second * retransmit timeouts resulting from the drop * of SYN/ACK packets." * * Page 4 and 6, January 2006. */ switch (sc->sc_src.sa.sa_family) { case AF_INET: ip->ip_tos |= IPTOS_ECN_ECT0; break; #ifdef INET6 case AF_INET6: ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); break; #endif } TCP_STATINC(TCP_STAT_ECN_ECT); } /* * Compute the packet's checksum. * * Fill in some straggling IP bits. Note the stack expects * ip_len to be in host order, for convenience. */ switch (sc->sc_src.sa.sa_family) { case AF_INET: ip->ip_len = htons(tlen - hlen); th->th_sum = 0; th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); ip->ip_len = htons(tlen); ip->ip_ttl = ip_defttl; /* XXX tos? */ break; #ifdef INET6 case AF_INET6: ip6->ip6_plen = htons(tlen - hlen); th->th_sum = 0; th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim will be initialized afterwards */ /* XXX flowlabel? */ break; #endif } /* XXX use IPsec policy on listening socket, on SYN ACK */ tp = sc->sc_tp; switch (sc->sc_src.sa.sa_family) { case AF_INET: error = ip_output(m, sc->sc_ipopts, ro, (ip_mtudisc ? IP_MTUDISC : 0), NULL, tp ? tp->t_inpcb : NULL); break; #ifdef INET6 case AF_INET6: ip6->ip6_hlim = in6pcb_selecthlim(NULL, (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL); rtcache_unref(rt, ro); error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL, tp ? tp->t_inpcb : NULL, NULL); break; #endif default: panic("%s: impossible (2)", __func__); } return error; }
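/*
 * Illustrative userland sketch (not part of tcp_syncache.c): the window-scale
 * selection loop in syn_cache_add() above picks the smallest shift that lets
 * the advertised window cover sb_max.  This standalone sketch reproduces just
 * that loop, assuming TCP_MAXWIN (65535) and TCP_MAX_WINSHIFT (14) as defined
 * in <netinet/tcp.h> and a hypothetical sb_max of 256 KiB; with those inputs
 * it prints a scale factor of 3, matching the comment in the code above.
 */
#include <stdio.h>

#define TCP_MAXWIN		65535	/* largest unscaled TCP window */
#define TCP_MAX_WINSHIFT	14	/* maximum window shift (RFC 1323) */

int
main(void)
{
	unsigned long sb_max = 256UL * 1024;	/* assumed socket buffer limit */
	unsigned int scale = 0;

	/* Smallest scale such that TCP_MAXWIN << scale reaches sb_max. */
	while (scale < TCP_MAX_WINSHIFT &&
	    ((unsigned long)TCP_MAXWIN << scale) < sb_max)
		scale++;

	printf("sb_max=%lu -> request_r_scale=%u\n", sb_max, scale);
	return 0;
}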
/* $NetBSD: sys_process.c,v 1.180 2020/05/26 00:50:53 kamil Exp $ */ /*- * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93 */ /*- * Copyright (c) 1993 Jan-Simon Pendry. * Copyright (c) 1994 Christopher G. Demetriou. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93 */ /* * References: * (1) Bach's "The Design of the UNIX Operating System", * (2) sys/miscfs/procfs from UCB's 4.4BSD-Lite distribution, * (3) the "4.4BSD Programmer's Reference Manual" published * by USENIX and O'Reilly & Associates. * The 4.4BSD PRM does a reasonably good job of documenting what the various * ptrace() requests should actually do, and its text is quoted several times * in this file. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_process.c,v 1.180 2020/05/26 00:50:53 kamil Exp $"); #ifdef _KERNEL_OPT #include "opt_ptrace.h" #include "opt_ktrace.h" #include "opt_pax.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/errno.h> #include <sys/exec.h> #include <sys/pax.h> #include <sys/ptrace.h> #include <sys/uio.h> #include <sys/ras.h> #include <sys/kmem.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <uvm/uvm_extern.h> #include <machine/reg.h> #if defined(KTRACE) || defined(PTRACE_HOOKS) int process_domem(struct lwp *curl /*tracer*/, struct lwp *l /*traced*/, struct uio *uio) { struct proc *p = l->l_proc; /* traced */ struct vmspace *vm; int error; size_t len; error = 0; len = uio->uio_resid; if (len == 0) return 0; #ifdef PMAP_NEED_PROCWR vaddr_t addr = uio->uio_offset; #endif vm = p->p_vmspace; if ((l->l_flag & LW_WEXIT) || vm->vm_refcnt < 1) error = EFAULT; if (error == 0) uvmspace_addref(p->p_vmspace); if (error != 0) return error; error = uvm_io(&vm->vm_map, uio, pax_mprotect_prot(l)); #ifdef PMAP_NEED_PROCWR if (error == 0 && uio->uio_rw == UIO_WRITE) pmap_procwr(p, addr, len); #endif uvmspace_free(vm); return error; } #endif /* KTRACE || PTRACE_HOOKS */ /* * Dummy routine so that ptrace_common module will fail to load if this * routine is not defined. */ #if defined(PTRACE_HOOKS) void ptrace_hooks(void) { } #endif
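/*
 * Illustrative userland sketch (not part of sys_process.c): process_domem()
 * above is the kernel-side path that moves bytes between a tracer and the
 * traced process's address space via uvm_io(), and it is what a PT_READ_D
 * request ultimately exercises.  This is a minimal, hedged example of the
 * user-visible interface on NetBSD; the name traced_value is made up and
 * error handling is omitted for brevity.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static int traced_value = 0x1234;	/* same address in parent and child after fork */

int
main(void)
{
	int status, word;
	pid_t pid = fork();

	if (pid == 0) {
		/* Child: request tracing and stop so the parent can inspect us. */
		ptrace(PT_TRACE_ME, 0, NULL, 0);
		raise(SIGSTOP);
		_exit(0);
	}

	waitpid(pid, &status, 0);		/* child is now stopped */

	/* Read one word of the child's memory through the ptrace path. */
	word = ptrace(PT_READ_D, pid, &traced_value, 0);
	printf("read 0x%x from pid %d\n", word, (int)pid);

	ptrace(PT_KILL, pid, NULL, 0);		/* tear the child down */
	waitpid(pid, &status, 0);
	return 0;
}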
/* $NetBSD: uvm_anon.c,v 1.80 2020/10/25 00:05:26 chs Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* * uvm_anon.c: uvm anon ops */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_anon.c,v 1.80 2020/10/25 00:05:26 chs Exp $"); #include "opt_uvmhist.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/pool.h> #include <sys/kernel.h> #include <sys/atomic.h> #include <uvm/uvm.h> #include <uvm/uvm_swap.h> #include <uvm/uvm_pdpolicy.h> static struct pool_cache uvm_anon_cache; static int uvm_anon_ctor(void *, void *, int); void uvm_anon_init(void) { pool_cache_bootstrap(&uvm_anon_cache, sizeof(struct vm_anon), 0, 0, PR_LARGECACHE, "anonpl", NULL, IPL_NONE, uvm_anon_ctor, NULL, NULL); } static int uvm_anon_ctor(void *arg, void *object, int flags) { struct vm_anon *anon = object; anon->an_ref = 0; anon->an_lock = NULL; anon->an_page = NULL; #if defined(VMSWAP) anon->an_swslot = 0; #endif return 0; } /* * uvm_analloc: allocate a new anon. * * => anon will have no lock associated. */ struct vm_anon * uvm_analloc(void) { struct vm_anon *anon; anon = pool_cache_get(&uvm_anon_cache, PR_NOWAIT); if (anon) { KASSERT(anon->an_ref == 0); KASSERT(anon->an_lock == NULL); KASSERT(anon->an_page == NULL); #if defined(VMSWAP) KASSERT(anon->an_swslot == 0); #endif anon->an_ref = 1; } return anon; } /* * uvm_anfree: free a single anon structure * * => anon must be removed from the amap (if anon was in an amap). * => amap must be locked, if anon was owned by amap. * => we may drop and re-acquire the lock here (to break loans). */ void uvm_anfree(struct vm_anon *anon) { struct vm_page *pg = anon->an_page, *pg2 __diagused; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(anon=%#jx)", (uintptr_t)anon, 0,0,0); KASSERT(anon->an_lock == NULL || rw_write_held(anon->an_lock)); KASSERT(anon->an_ref == 0); /* * Dispose of the page, if it is resident. */ if (__predict_true(pg != NULL)) { KASSERT(anon->an_lock != NULL); /* * If there is a resident page and it is loaned, then anon * may not own it. Call out to uvm_anon_lockloanpg() to * identify and lock the real owner of the page. */ if (__predict_false(pg->loan_count != 0)) { pg2 = uvm_anon_lockloanpg(anon); KASSERT(pg2 == pg); } /* * If the page is owned by a UVM object (now locked), * then kill the loan on the page rather than free it, * and release the object lock. */ if (__predict_false(pg->uobject != NULL)) { mutex_enter(&pg->interlock); KASSERT(pg->loan_count > 0); pg->loan_count--; pg->uanon = NULL; mutex_exit(&pg->interlock); rw_exit(pg->uobject->vmobjlock); } else { /* * If page has no UVM object, then anon is the owner, * and it is already locked. */ KASSERT((pg->flags & PG_RELEASED) == 0); pmap_page_protect(pg, VM_PROT_NONE); /* * If the page is busy, mark it as PG_RELEASED, so * that uvm_anon_release(9) would release it later. */ if (__predict_false((pg->flags & PG_BUSY) != 0)) { pg->flags |= PG_RELEASED; rw_obj_hold(anon->an_lock); return; } uvm_pagefree(pg); UVMHIST_LOG(maphist, "anon %#jx, page %#jx: " "freed now!", (uintptr_t)anon, (uintptr_t)pg, 0, 0); } } else { #if defined(VMSWAP) if (anon->an_swslot > 0) { /* This page is no longer only in swap. */ KASSERT(uvmexp.swpgonly > 0); atomic_dec_uint(&uvmexp.swpgonly); } #endif } anon->an_lock = NULL; /* * Free any swap resources, leave a page replacement hint. */ uvm_anon_dropswap(anon); uvmpdpol_anfree(anon); UVMHIST_LOG(maphist,"<- done!",0,0,0,0); pool_cache_put(&uvm_anon_cache, anon); } /* * uvm_anon_lockloanpg: given a locked anon, lock its resident page owner. 
* * => anon is locked by caller * => on return: anon is locked * if there is a resident page: * if it has a uobject, it is locked by us * if it is ownerless, we take over as owner * we return the resident page (it can change during * this function) * => note that the only time an anon has an ownerless resident page * is if the page was loaned from a uvm_object and the uvm_object * disowned it * => this only needs to be called when you want to do an operation * on an anon's resident page and that page has a non-zero loan * count. */ struct vm_page * uvm_anon_lockloanpg(struct vm_anon *anon) { struct vm_page *pg; krw_t op; KASSERT(rw_lock_held(anon->an_lock)); /* * loop while we have a resident page that has a non-zero loan count. * if we successfully get our lock, we will "break" the loop. * note that the test for pg->loan_count is not protected -- this * may produce false positive results. note that a false positive * result may cause us to do more work than we need to, but it will * not produce an incorrect result. */ while (((pg = anon->an_page) != NULL) && pg->loan_count != 0) { mutex_enter(&pg->interlock); if (pg->uobject) { /* * if we didn't get a lock (try lock failed), then we * toggle our anon lock and try again */ if (!rw_tryenter(pg->uobject->vmobjlock, RW_WRITER)) { /* * someone locking the object has a chance to * lock us right now * * XXX Better than yielding but inadequate. */ mutex_exit(&pg->interlock); op = rw_lock_op(anon->an_lock); rw_exit(anon->an_lock); kpause("lkloanpg", false, 1, NULL); rw_enter(anon->an_lock, op); continue; } } /* * If page is un-owned i.e. the object dropped its ownership, * then we have to take the ownership. */ if (pg->uobject == NULL && (pg->flags & PG_ANON) == 0) { pg->flags |= PG_ANON; pg->loan_count--; } mutex_exit(&pg->interlock); break; } return pg; } #if defined(VMSWAP) /* * uvm_anon_pagein: fetch an anon's page. * * => anon must be locked, and is unlocked upon return. * => returns true if pagein was aborted due to lack of memory. */ bool uvm_anon_pagein(struct vm_amap *amap, struct vm_anon *anon) { struct vm_page *pg; struct uvm_object *uobj; KASSERT(rw_write_held(anon->an_lock)); KASSERT(anon->an_lock == amap->am_lock); /* * Get the page of the anon. */ switch (uvmfault_anonget(NULL, amap, anon)) { case 0: /* Success - we have the page. */ KASSERT(rw_write_held(anon->an_lock)); break; case EIO: case ERESTART: /* * Nothing more to do on errors. ERESTART means that the * anon was freed. */ return false; case ENOLCK: panic("uvm_anon_pagein"); default: return true; } /* * Mark the page as dirty and clear its swslot. */ pg = anon->an_page; uobj = pg->uobject; if (anon->an_swslot > 0) { uvm_swap_free(anon->an_swslot, 1); } anon->an_swslot = 0; uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); /* * Deactivate the page (to put it on a page queue). */ uvm_pagelock(pg); uvm_pagedeactivate(pg); uvm_pageunlock(pg); rw_exit(anon->an_lock); if (uobj) { rw_exit(uobj->vmobjlock); } return false; } /* * uvm_anon_dropswap: release any swap resources from this anon. * * => anon must be locked or have a reference count of 0. */ void uvm_anon_dropswap(struct vm_anon *anon) { UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); if (anon->an_swslot == 0) return; UVMHIST_LOG(maphist,"freeing swap for anon %#jx, paged to swslot %#jx", (uintptr_t)anon, anon->an_swslot, 0, 0); uvm_swap_free(anon->an_swslot, 1); anon->an_swslot = 0; } #endif /* * uvm_anon_release: release an anon and its page. * * => anon should not have any references. * => anon must be locked. 
*/ void uvm_anon_release(struct vm_anon *anon) { struct vm_page *pg = anon->an_page; krwlock_t *lock; KASSERT(rw_write_held(anon->an_lock)); KASSERT(pg != NULL); KASSERT((pg->flags & PG_RELEASED) != 0); KASSERT((pg->flags & PG_BUSY) != 0); KASSERT(pg->uobject == NULL); KASSERT(pg->uanon == anon); KASSERT(pg->loan_count == 0); KASSERT(anon->an_ref == 0); if ((pg->flags & PG_PAGEOUT) != 0) { pg->flags &= ~PG_PAGEOUT; uvm_pageout_done(1); } uvm_pagefree(pg); KASSERT(anon->an_page == NULL); lock = anon->an_lock; uvm_anfree(anon); rw_exit(lock); /* Note: extra reference is held for PG_RELEASED case. */ rw_obj_free(lock); }
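/*
 * Illustrative userland sketch (not part of uvm_anon.c): it models the
 * invariant that uvm_anon_ctor() establishes and uvm_analloc() re-checks
 * above -- a freshly constructed anon has no reference, no lock and no page,
 * and the allocator hands it out holding exactly one reference.  Here
 * calloc() stands in for pool_cache_get() with a zeroing constructor, and
 * struct toy_anon is a trimmed-down stand-in, not the real struct vm_anon.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_anon {
	int	an_ref;
	void	*an_lock;
	void	*an_page;
};

static struct toy_anon *
toy_analloc(void)
{
	/* Zeroed allocation gives the same starting state uvm_anon_ctor() sets up. */
	struct toy_anon *anon = calloc(1, sizeof(*anon));

	if (anon != NULL) {
		assert(anon->an_ref == 0);
		assert(anon->an_lock == NULL);
		assert(anon->an_page == NULL);
		anon->an_ref = 1;	/* caller owns the single reference */
	}
	return anon;
}

int
main(void)
{
	struct toy_anon *anon = toy_analloc();

	printf("new anon: ref=%d lock=%p page=%p\n",
	    anon->an_ref, anon->an_lock, anon->an_page);
	free(anon);
	return 0;
}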
/* $NetBSD: radixtree.c,v 1.34 2024/05/04 17:58:24 chs Exp $ */ /*- * Copyright (c)2011,2012,2013 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * radixtree.c * * Overview: * * This is an implementation of radix tree, whose keys are uint64_t and leafs * are user provided pointers. * * Leaf nodes are just void * and this implementation doesn't care about * what they actually point to. However, this implementation has an assumption * about their alignment. Specifically, this implementation assumes that their * 2 LSBs are always zero and uses them for internal accounting. * * Intermediate nodes and memory allocation: * * Intermediate nodes are automatically allocated and freed internally and * basically users don't need to care about them. The allocation is done via * kmem_zalloc(9) for _KERNEL, malloc(3) for userland, and alloc() for * _STANDALONE environment. Only radix_tree_insert_node function can allocate * memory for intermediate nodes and thus can fail for ENOMEM. * * Memory Efficiency: * * It's designed to work efficiently with dense index distribution. * The memory consumption (number of necessary intermediate nodes) heavily * depends on the index distribution. Basically, more dense index distribution * consumes less nodes per item. 
Approximately, * * - the best case: about RADIX_TREE_PTR_PER_NODE items per intermediate node. * it would look like the following. * * root (t_height=1) * | * v * [ | | | ] (intermediate node. RADIX_TREE_PTR_PER_NODE=4 in this fig) * | | | | * v v v v * p p p p (items) * * - the worst case: RADIX_TREE_MAX_HEIGHT intermediate nodes per item. * it would look like the following if RADIX_TREE_MAX_HEIGHT=3. * * root (t_height=3) * | * v * [ | | | ] * | * v * [ | | | ] * | * v * [ | | | ] * | * v * p * * The height of tree (t_height) is dynamic. It's smaller if only small * index values are used. As an extreme case, if only index 0 is used, * the corresponding value is directly stored in the root of the tree * (struct radix_tree) without allocating any intermediate nodes. In that * case, t_height=0. * * Gang lookup: * * This implementation provides a way to scan many nodes quickly via * radix_tree_gang_lookup_node function and its varients. * * Tags: * * This implementation provides tagging functionality, which allows quick * scanning of a subset of leaf nodes. Leaf nodes are untagged when inserted * into the tree and can be tagged by radix_tree_set_tag function. * radix_tree_gang_lookup_tagged_node function and its variants returns only * leaf nodes with the given tag. To reduce amount of nodes to visit for * these functions, this implementation keeps tagging information in internal * intermediate nodes and quickly skips uninterested parts of a tree. * * A tree has RADIX_TREE_TAG_ID_MAX independent tag spaces, each of which are * identified by a zero-origin numbers, tagid. For the current implementation, * RADIX_TREE_TAG_ID_MAX is 2. A set of tags is described as a bitmask tagmask, * which is a bitwise OR of (1 << tagid). */ #include <sys/cdefs.h> #if defined(_KERNEL) || defined(_STANDALONE) __KERNEL_RCSID(0, "$NetBSD: radixtree.c,v 1.34 2024/05/04 17:58:24 chs Exp $"); #include <sys/param.h> #include <sys/errno.h> #include <sys/kmem.h> #include <sys/radixtree.h> #include <lib/libkern/libkern.h> #if defined(_STANDALONE) #include <lib/libsa/stand.h> #endif /* defined(_STANDALONE) */ #else /* defined(_KERNEL) || defined(_STANDALONE) */ __RCSID("$NetBSD: radixtree.c,v 1.34 2024/05/04 17:58:24 chs Exp $"); #include <assert.h> #include <errno.h> #include <stdbool.h> #include <stdlib.h> #include <string.h> #if 1 #define KASSERT assert #else #define KASSERT(a) /* nothing */ #endif #endif /* defined(_KERNEL) || defined(_STANDALONE) */ #include <sys/radixtree.h> #define RADIX_TREE_BITS_PER_HEIGHT 4 /* XXX tune */ #define RADIX_TREE_PTR_PER_NODE (1 << RADIX_TREE_BITS_PER_HEIGHT) #define RADIX_TREE_MAX_HEIGHT (64 / RADIX_TREE_BITS_PER_HEIGHT) #define RADIX_TREE_INVALID_HEIGHT (RADIX_TREE_MAX_HEIGHT + 1) __CTASSERT((64 % RADIX_TREE_BITS_PER_HEIGHT) == 0); __CTASSERT(((1 << RADIX_TREE_TAG_ID_MAX) & (sizeof(int) - 1)) == 0); #define RADIX_TREE_TAG_MASK ((1 << RADIX_TREE_TAG_ID_MAX) - 1) static inline void * entry_ptr(void *p) { return (void *)((uintptr_t)p & ~RADIX_TREE_TAG_MASK); } static inline unsigned int entry_tagmask(void *p) { return (uintptr_t)p & RADIX_TREE_TAG_MASK; } static inline void * entry_compose(void *p, unsigned int tagmask) { return (void *)((uintptr_t)p | tagmask); } static inline bool entry_match_p(void *p, unsigned int tagmask) { KASSERT(entry_ptr(p) != NULL || entry_tagmask(p) == 0); if (p == NULL) { return false; } if (tagmask == 0) { return true; } return (entry_tagmask(p) & tagmask) != 0; } /* * radix_tree_node: an intermediate node * * we don't care the type of leaf nodes. 
they are just void *. * * we used to maintain a count of non-NULL nodes in this structure, but it * prevented it from being aligned to a cache line boundary; the performance * benefit from being cache friendly is greater than the benefit of having * a dedicated count value, especially in multi-processor situations where * we need to avoid intra-pool-page false sharing. */ struct radix_tree_node { void *n_ptrs[RADIX_TREE_PTR_PER_NODE]; }; /* * p_refs[0].pptr == &t->t_root * : * p_refs[n].pptr == &(*p_refs[n-1])->n_ptrs[x] * : * : * p_refs[t->t_height].pptr == &leaf_pointer */ struct radix_tree_path { struct radix_tree_node_ref { void **pptr; } p_refs[RADIX_TREE_MAX_HEIGHT + 1]; /* +1 for the root ptr */ /* * p_lastidx is either the index of the last valid element of p_refs[] * or RADIX_TREE_INVALID_HEIGHT. * RADIX_TREE_INVALID_HEIGHT means that radix_tree_lookup_ptr found * that the height of the tree is not enough to cover the given index. */ unsigned int p_lastidx; }; static inline void ** path_pptr(const struct radix_tree *t, const struct radix_tree_path *p, unsigned int height) { KASSERT(height <= t->t_height); return p->p_refs[height].pptr; } static inline struct radix_tree_node * path_node(const struct radix_tree * t, const struct radix_tree_path *p, unsigned int height) { KASSERT(height <= t->t_height); return entry_ptr(*path_pptr(t, p, height)); } /* * radix_tree_init_tree: * * Initialize a tree. */ void radix_tree_init_tree(struct radix_tree *t) { t->t_height = 0; t->t_root = NULL; } /* * radix_tree_fini_tree: * * Finish using a tree. */ void radix_tree_fini_tree(struct radix_tree *t) { KASSERT(t->t_root == NULL); KASSERT(t->t_height == 0); } /* * radix_tree_empty_tree_p: * * Return if the tree is empty. */ bool radix_tree_empty_tree_p(struct radix_tree *t) { return t->t_root == NULL; } /* * radix_tree_empty_tree_p: * * Return true if the tree has any nodes with the given tag. Otherwise * return false. * * It's illegal to call this function with tagmask 0. */ bool radix_tree_empty_tagged_tree_p(struct radix_tree *t, unsigned int tagmask) { KASSERT(tagmask != 0); return (entry_tagmask(t->t_root) & tagmask) == 0; } static void radix_tree_node_init(struct radix_tree_node *n) { memset(n, 0, sizeof(*n)); } #if defined(_KERNEL) /* * radix_tree_init: * * initialize the subsystem. */ void radix_tree_init(void) { /* nothing right now */ } /* * radix_tree_await_memory: * * after an insert has failed with ENOMEM, wait for memory to become * available, so the caller can retry. this needs to ensure that the * maximum possible required number of nodes is available. */ void radix_tree_await_memory(void) { struct radix_tree_node *nodes[RADIX_TREE_MAX_HEIGHT]; int i; for (i = 0; i < __arraycount(nodes); i++) { nodes[i] = kmem_intr_alloc(sizeof(struct radix_tree_node), KM_SLEEP); } while (--i >= 0) { kmem_intr_free(nodes[i], sizeof(struct radix_tree_node)); } } #endif /* defined(_KERNEL) */ /* * radix_tree_sum_node: * * return the logical sum of all entries in the given node. used to quickly * check for tag masks or empty nodes. */ static uintptr_t radix_tree_sum_node(const struct radix_tree_node *n) { #if RADIX_TREE_PTR_PER_NODE > 16 unsigned int i; uintptr_t sum; for (i = 0, sum = 0; i < RADIX_TREE_PTR_PER_NODE; i++) { sum |= (uintptr_t)n->n_ptrs[i]; } return sum; #else /* RADIX_TREE_PTR_PER_NODE > 16 */ uintptr_t sum; /* * Unrolling the above is much better than a tight loop with two * test+branch pairs. 
On x86 with gcc 5.5.0 this compiles into 19 * deterministic instructions including the "return" and prologue & * epilogue. */ sum = (uintptr_t)n->n_ptrs[0]; sum |= (uintptr_t)n->n_ptrs[1]; sum |= (uintptr_t)n->n_ptrs[2]; sum |= (uintptr_t)n->n_ptrs[3]; #if RADIX_TREE_PTR_PER_NODE > 4 sum |= (uintptr_t)n->n_ptrs[4]; sum |= (uintptr_t)n->n_ptrs[5]; sum |= (uintptr_t)n->n_ptrs[6]; sum |= (uintptr_t)n->n_ptrs[7]; #endif #if RADIX_TREE_PTR_PER_NODE > 8 sum |= (uintptr_t)n->n_ptrs[8]; sum |= (uintptr_t)n->n_ptrs[9]; sum |= (uintptr_t)n->n_ptrs[10]; sum |= (uintptr_t)n->n_ptrs[11]; sum |= (uintptr_t)n->n_ptrs[12]; sum |= (uintptr_t)n->n_ptrs[13]; sum |= (uintptr_t)n->n_ptrs[14]; sum |= (uintptr_t)n->n_ptrs[15]; #endif return sum; #endif /* RADIX_TREE_PTR_PER_NODE > 16 */ } static int __unused radix_tree_node_count_ptrs(const struct radix_tree_node *n) { unsigned int i, c; for (i = c = 0; i < RADIX_TREE_PTR_PER_NODE; i++) { c += (n->n_ptrs[i] != NULL); } return c; } static struct radix_tree_node * radix_tree_alloc_node(void) { struct radix_tree_node *n; #if defined(_KERNEL) /* * We must not block waiting for memory because this function * can be called in contexts where waiting for memory is illegal. */ n = kmem_intr_alloc(sizeof(struct radix_tree_node), KM_NOSLEEP); #elif defined(_STANDALONE) n = alloc(sizeof(*n)); #else /* defined(_STANDALONE) */ n = malloc(sizeof(*n)); #endif /* defined(_STANDALONE) */ if (n != NULL) { radix_tree_node_init(n); } KASSERT(n == NULL || radix_tree_sum_node(n) == 0); return n; } static void radix_tree_free_node(struct radix_tree_node *n) { KASSERT(radix_tree_sum_node(n) == 0); #if defined(_KERNEL) kmem_intr_free(n, sizeof(struct radix_tree_node)); #elif defined(_STANDALONE) dealloc(n, sizeof(*n)); #else free(n); #endif } /* * radix_tree_grow: * * increase the height of the tree. */ static __noinline int radix_tree_grow(struct radix_tree *t, unsigned int newheight) { const unsigned int tagmask = entry_tagmask(t->t_root); struct radix_tree_node *newnodes[RADIX_TREE_MAX_HEIGHT]; void *root; int h; KASSERT(newheight <= RADIX_TREE_MAX_HEIGHT); if ((root = t->t_root) == NULL) { t->t_height = newheight; return 0; } for (h = t->t_height; h < newheight; h++) { newnodes[h] = radix_tree_alloc_node(); if (__predict_false(newnodes[h] == NULL)) { while (--h >= (int)t->t_height) { newnodes[h]->n_ptrs[0] = NULL; radix_tree_free_node(newnodes[h]); } return ENOMEM; } newnodes[h]->n_ptrs[0] = root; root = entry_compose(newnodes[h], tagmask); } t->t_root = root; t->t_height = h; return 0; } /* * radix_tree_lookup_ptr: * * an internal helper function used for various exported functions. * * return the pointer to store the node for the given index. * * if alloc is true, try to allocate the storage. (note for _KERNEL: * in that case, this function can block.) if the allocation failed or * alloc is false, return NULL. * * if path is not NULL, fill it for the caller's investigation. * * if tagmask is not zero, search only for nodes with the tag set. * note that, however, this function doesn't check the tagmask for the leaf * pointer. it's a caller's responsibility to investigate the value which * is pointed by the returned pointer if necessary. * * while this function is a bit large, as it's called with some constant * arguments, inlining might have benefits. anyway, a compiler will decide. 
*/ static inline void ** radix_tree_lookup_ptr(struct radix_tree *t, uint64_t idx, struct radix_tree_path *path, bool alloc, const unsigned int tagmask) { struct radix_tree_node *n; int hshift = RADIX_TREE_BITS_PER_HEIGHT * t->t_height; int shift; void **vpp; const uint64_t mask = (UINT64_C(1) << RADIX_TREE_BITS_PER_HEIGHT) - 1; struct radix_tree_node_ref *refs = NULL; /* * check unsupported combinations */ KASSERT(tagmask == 0 || !alloc); KASSERT(path == NULL || !alloc); vpp = &t->t_root; if (path != NULL) { refs = path->p_refs; refs->pptr = vpp; } n = NULL; for (shift = 64 - RADIX_TREE_BITS_PER_HEIGHT; shift >= 0;) { struct radix_tree_node *c; void *entry; const uint64_t i = (idx >> shift) & mask; if (shift >= hshift) { unsigned int newheight; KASSERT(vpp == &t->t_root); if (i == 0) { shift -= RADIX_TREE_BITS_PER_HEIGHT; continue; } if (!alloc) { if (path != NULL) { KASSERT((refs - path->p_refs) == 0); path->p_lastidx = RADIX_TREE_INVALID_HEIGHT; } return NULL; } newheight = shift / RADIX_TREE_BITS_PER_HEIGHT + 1; if (radix_tree_grow(t, newheight)) { return NULL; } hshift = RADIX_TREE_BITS_PER_HEIGHT * t->t_height; } entry = *vpp; c = entry_ptr(entry); if (c == NULL || (tagmask != 0 && (entry_tagmask(entry) & tagmask) == 0)) { if (!alloc) { if (path != NULL) { path->p_lastidx = refs - path->p_refs; } return NULL; } c = radix_tree_alloc_node(); if (c == NULL) { return NULL; } *vpp = c; } n = c; vpp = &n->n_ptrs[i]; if (path != NULL) { refs++; refs->pptr = vpp; } shift -= RADIX_TREE_BITS_PER_HEIGHT; } if (alloc) { KASSERT(*vpp == NULL); } if (path != NULL) { path->p_lastidx = refs - path->p_refs; } return vpp; } /* * radix_tree_undo_insert_node: * * Undo the effects of a failed insert. The conditions that led to the * insert may change and it may not be retried. If the insert is not * retried, there will be no corresponding radix_tree_remove_node() for * this index in the future. Therefore any adjustments made to the tree * before memory was exhausted must be reverted. */ static __noinline void radix_tree_undo_insert_node(struct radix_tree *t, uint64_t idx) { struct radix_tree_path path; int i; (void)radix_tree_lookup_ptr(t, idx, &path, false, 0); if (path.p_lastidx == RADIX_TREE_INVALID_HEIGHT) { /* * no nodes were inserted. */ return; } for (i = path.p_lastidx - 1; i >= 0; i--) { struct radix_tree_node ** const pptr = (struct radix_tree_node **)path_pptr(t, &path, i); struct radix_tree_node *n; KASSERT(pptr != NULL); n = entry_ptr(*pptr); KASSERT(n != NULL); if (radix_tree_sum_node(n) != 0) { break; } radix_tree_free_node(n); *pptr = NULL; } /* * fix up height */ if (i < 0) { KASSERT(t->t_root == NULL); t->t_height = 0; } } /* * radix_tree_insert_node: * * Insert the node at the given index. * * It's illegal to insert NULL. It's illegal to insert a non-aligned pointer. * * This function returns ENOMEM if necessary memory allocation failed. * Otherwise, this function returns 0. * * Note that inserting a node can involves memory allocation for intermediate * nodes. If _KERNEL, it's done with no-sleep IPL_NONE memory allocation. * * For the newly inserted node, all tags are cleared. 
*/ int radix_tree_insert_node(struct radix_tree *t, uint64_t idx, void *p) { void **vpp; KASSERT(p != NULL); KASSERT(entry_tagmask(entry_compose(p, 0)) == 0); vpp = radix_tree_lookup_ptr(t, idx, NULL, true, 0); if (__predict_false(vpp == NULL)) { radix_tree_undo_insert_node(t, idx); return ENOMEM; } KASSERT(*vpp == NULL); *vpp = p; return 0; } /* * radix_tree_replace_node: * * Replace a node at the given index with the given node and return the * replaced one. * * It's illegal to try to replace a node which has not been inserted. * * This function keeps tags intact. */ void * radix_tree_replace_node(struct radix_tree *t, uint64_t idx, void *p) { void **vpp; void *oldp; KASSERT(p != NULL); KASSERT(entry_tagmask(entry_compose(p, 0)) == 0); vpp = radix_tree_lookup_ptr(t, idx, NULL, false, 0); KASSERT(vpp != NULL); oldp = *vpp; KASSERT(oldp != NULL); *vpp = entry_compose(p, entry_tagmask(*vpp)); return entry_ptr(oldp); } /* * radix_tree_remove_node: * * Remove the node at the given index. * * It's illegal to try to remove a node which has not been inserted. */ void * radix_tree_remove_node(struct radix_tree *t, uint64_t idx) { struct radix_tree_path path; void **vpp; void *oldp; int i; vpp = radix_tree_lookup_ptr(t, idx, &path, false, 0); KASSERT(vpp != NULL); oldp = *vpp; KASSERT(oldp != NULL); KASSERT(path.p_lastidx == t->t_height); KASSERT(vpp == path_pptr(t, &path, path.p_lastidx)); *vpp = NULL; for (i = t->t_height - 1; i >= 0; i--) { void *entry; struct radix_tree_node ** const pptr = (struct radix_tree_node **)path_pptr(t, &path, i); struct radix_tree_node *n; KASSERT(pptr != NULL); entry = *pptr; n = entry_ptr(entry); KASSERT(n != NULL); if (radix_tree_sum_node(n) != 0) { break; } radix_tree_free_node(n); *pptr = NULL; } /* * fix up height */ if (i < 0) { KASSERT(t->t_root == NULL); t->t_height = 0; } /* * update tags */ for (; i >= 0; i--) { void *entry; struct radix_tree_node ** const pptr = (struct radix_tree_node **)path_pptr(t, &path, i); struct radix_tree_node *n; unsigned int newmask; KASSERT(pptr != NULL); entry = *pptr; n = entry_ptr(entry); KASSERT(n != NULL); KASSERT(radix_tree_sum_node(n) != 0); newmask = radix_tree_sum_node(n) & RADIX_TREE_TAG_MASK; if (newmask == entry_tagmask(entry)) { break; } *pptr = entry_compose(n, newmask); } /* * XXX is it worth to try to reduce height? * if we do that, make radix_tree_grow rollback its change as well. */ return entry_ptr(oldp); } /* * radix_tree_lookup_node: * * Returns the node at the given index. * Returns NULL if nothing is found at the given index. */ void * radix_tree_lookup_node(struct radix_tree *t, uint64_t idx) { void **vpp; vpp = radix_tree_lookup_ptr(t, idx, NULL, false, 0); if (vpp == NULL) { return NULL; } return entry_ptr(*vpp); } static inline void gang_lookup_init(struct radix_tree *t, uint64_t idx, struct radix_tree_path *path, const unsigned int tagmask) { void **vpp __unused; vpp = radix_tree_lookup_ptr(t, idx, path, false, tagmask); KASSERT(vpp == NULL || vpp == path_pptr(t, path, path->p_lastidx)); KASSERT(&t->t_root == path_pptr(t, path, 0)); KASSERT(path->p_lastidx == RADIX_TREE_INVALID_HEIGHT || path->p_lastidx == t->t_height || !entry_match_p(*path_pptr(t, path, path->p_lastidx), tagmask)); } /* * gang_lookup_scan: * * a helper routine for radix_tree_gang_lookup_node and its variants. 
*/ static inline unsigned int __attribute__((__always_inline__)) gang_lookup_scan(struct radix_tree *t, struct radix_tree_path *path, void **results, const unsigned int maxresults, const unsigned int tagmask, const bool reverse, const bool dense) { /* * we keep the path updated only for lastidx-1. * vpp is what path_pptr(t, path, lastidx) would be. */ void **vpp; unsigned int nfound; unsigned int lastidx; /* * set up scan direction dependant constants so that we can iterate * n_ptrs as the following. * * for (i = first; i != guard; i += step) * visit n->n_ptrs[i]; */ const int step = reverse ? -1 : 1; const unsigned int first = reverse ? RADIX_TREE_PTR_PER_NODE - 1 : 0; const unsigned int last = reverse ? 0 : RADIX_TREE_PTR_PER_NODE - 1; const unsigned int guard = last + step; KASSERT(maxresults > 0); KASSERT(&t->t_root == path_pptr(t, path, 0)); lastidx = path->p_lastidx; KASSERT(lastidx == RADIX_TREE_INVALID_HEIGHT || lastidx == t->t_height || !entry_match_p(*path_pptr(t, path, lastidx), tagmask)); nfound = 0; if (lastidx == RADIX_TREE_INVALID_HEIGHT) { /* * requested idx is beyond the right-most node. */ if (reverse && !dense) { lastidx = 0; vpp = path_pptr(t, path, lastidx); goto descend; } return 0; } vpp = path_pptr(t, path, lastidx); while (/*CONSTCOND*/true) { struct radix_tree_node *n; unsigned int i; if (entry_match_p(*vpp, tagmask)) { KASSERT(lastidx == t->t_height); /* * record the matching non-NULL leaf. */ results[nfound] = entry_ptr(*vpp); nfound++; if (nfound == maxresults) { return nfound; } } else if (dense) { return nfound; } scan_siblings: /* * try to find the next matching non-NULL sibling. */ if (lastidx == 0) { /* * the root has no siblings. * we've done. */ KASSERT(vpp == &t->t_root); break; } n = path_node(t, path, lastidx - 1); for (i = vpp - n->n_ptrs + step; i != guard; i += step) { KASSERT(i < RADIX_TREE_PTR_PER_NODE); if (entry_match_p(n->n_ptrs[i], tagmask)) { vpp = &n->n_ptrs[i]; break; } else if (dense) { return nfound; } } if (i == guard) { /* * not found. go to parent. */ lastidx--; vpp = path_pptr(t, path, lastidx); goto scan_siblings; } descend: /* * following the left-most (or right-most in the case of * reverse scan) child node, descend until reaching the leaf or * a non-matching entry. */ while (entry_match_p(*vpp, tagmask) && lastidx < t->t_height) { /* * save vpp in the path so that we can come back to this * node after finishing visiting children. */ path->p_refs[lastidx].pptr = vpp; n = entry_ptr(*vpp); vpp = &n->n_ptrs[first]; lastidx++; } } return nfound; } /* * radix_tree_gang_lookup_node: * * Scan the tree starting from the given index in the ascending order and * return found nodes. * * results should be an array large enough to hold maxresults pointers. * This function returns the number of nodes found, up to maxresults. * Returning less than maxresults means there are no more nodes in the tree. * * If dense == true, this function stops scanning when it founds a hole of * indexes. I.e. an index for which radix_tree_lookup_node would returns NULL. * If dense == false, this function skips holes and continue scanning until * maxresults nodes are found or it reaches the limit of the index range. * * The result of this function is semantically equivalent to what could be * obtained by repeated calls of radix_tree_lookup_node with increasing index. * but this function is expected to be computationally cheaper when looking up * multiple nodes at once. Especially, it's expected to be much cheaper when * node indexes are distributed sparsely. 
* * Note that this function doesn't return index values of found nodes. * Thus, in the case of dense == false, if index values are important for * a caller, it's the caller's responsibility to check them, typically * by examining the returned nodes using some caller-specific knowledge * about them. * In the case of dense == true, a node returned via results[N] is always for * the index (idx + N). */ unsigned int radix_tree_gang_lookup_node(struct radix_tree *t, uint64_t idx, void **results, unsigned int maxresults, bool dense) { struct radix_tree_path path; gang_lookup_init(t, idx, &path, 0); return gang_lookup_scan(t, &path, results, maxresults, 0, false, dense); } /* * radix_tree_gang_lookup_node_reverse: * * Same as radix_tree_gang_lookup_node except that this one scans the * tree in the reverse order. I.e. descending index values. */ unsigned int radix_tree_gang_lookup_node_reverse(struct radix_tree *t, uint64_t idx, void **results, unsigned int maxresults, bool dense) { struct radix_tree_path path; gang_lookup_init(t, idx, &path, 0); return gang_lookup_scan(t, &path, results, maxresults, 0, true, dense); } /* * radix_tree_gang_lookup_tagged_node: * * Same as radix_tree_gang_lookup_node except that this one only returns * nodes tagged with tagid. * * It's illegal to call this function with tagmask 0. */ unsigned int radix_tree_gang_lookup_tagged_node(struct radix_tree *t, uint64_t idx, void **results, unsigned int maxresults, bool dense, unsigned int tagmask) { struct radix_tree_path path; KASSERT(tagmask != 0); gang_lookup_init(t, idx, &path, tagmask); return gang_lookup_scan(t, &path, results, maxresults, tagmask, false, dense); } /* * radix_tree_gang_lookup_tagged_node_reverse: * * Same as radix_tree_gang_lookup_tagged_node except that this one scans the * tree in the reverse order. I.e. descending index values. */ unsigned int radix_tree_gang_lookup_tagged_node_reverse(struct radix_tree *t, uint64_t idx, void **results, unsigned int maxresults, bool dense, unsigned int tagmask) { struct radix_tree_path path; KASSERT(tagmask != 0); gang_lookup_init(t, idx, &path, tagmask); return gang_lookup_scan(t, &path, results, maxresults, tagmask, true, dense); } /* * radix_tree_get_tag: * * Return the tagmask for the node at the given index. * * It's illegal to call this function for a node which has not been inserted. */ unsigned int radix_tree_get_tag(struct radix_tree *t, uint64_t idx, unsigned int tagmask) { /* * the following two implementations should behave same. * the former one was chosen because it seems faster. */ #if 1 void **vpp; vpp = radix_tree_lookup_ptr(t, idx, NULL, false, tagmask); if (vpp == NULL) { return false; } KASSERT(*vpp != NULL); return (entry_tagmask(*vpp) & tagmask); #else void **vpp; vpp = radix_tree_lookup_ptr(t, idx, NULL, false, 0); KASSERT(vpp != NULL); return (entry_tagmask(*vpp) & tagmask); #endif } /* * radix_tree_set_tag: * * Set the tag for the node at the given index. * * It's illegal to call this function for a node which has not been inserted. * It's illegal to call this function with tagmask 0. 
*/ void radix_tree_set_tag(struct radix_tree *t, uint64_t idx, unsigned int tagmask) { struct radix_tree_path path; void **vpp __unused; int i; KASSERT(tagmask != 0); vpp = radix_tree_lookup_ptr(t, idx, &path, false, 0); KASSERT(vpp != NULL); KASSERT(*vpp != NULL); KASSERT(path.p_lastidx == t->t_height); KASSERT(vpp == path_pptr(t, &path, path.p_lastidx)); for (i = t->t_height; i >= 0; i--) { void ** const pptr = (void **)path_pptr(t, &path, i); void *entry; KASSERT(pptr != NULL); entry = *pptr; if ((entry_tagmask(entry) & tagmask) != 0) { break; } *pptr = (void *)((uintptr_t)entry | tagmask); } } /* * radix_tree_clear_tag: * * Clear the tag for the node at the given index. * * It's illegal to call this function for a node which has not been inserted. * It's illegal to call this function with tagmask 0. */ void radix_tree_clear_tag(struct radix_tree *t, uint64_t idx, unsigned int tagmask) { struct radix_tree_path path; void **vpp; int i; KASSERT(tagmask != 0); vpp = radix_tree_lookup_ptr(t, idx, &path, false, 0); KASSERT(vpp != NULL); KASSERT(*vpp != NULL); KASSERT(path.p_lastidx == t->t_height); KASSERT(vpp == path_pptr(t, &path, path.p_lastidx)); /* * if already cleared, nothing to do */ if ((entry_tagmask(*vpp) & tagmask) == 0) { return; } /* * clear the tag only if no children have the tag. */ for (i = t->t_height; i >= 0; i--) { void ** const pptr = (void **)path_pptr(t, &path, i); void *entry; KASSERT(pptr != NULL); entry = *pptr; KASSERT((entry_tagmask(entry) & tagmask) != 0); *pptr = entry_compose(entry_ptr(entry), entry_tagmask(entry) & ~tagmask); /* * check if we should proceed to process the next level. */ if (0 < i) { struct radix_tree_node *n = path_node(t, &path, i - 1); if ((radix_tree_sum_node(n) & tagmask) != 0) { break; } } } } #if defined(UNITTEST) #include <inttypes.h> #include <stdio.h> static void radix_tree_dump_node(const struct radix_tree *t, void *vp, uint64_t offset, unsigned int height) { struct radix_tree_node *n; unsigned int i; for (i = 0; i < t->t_height - height; i++) { printf(" "); } if (entry_tagmask(vp) == 0) { printf("[%" PRIu64 "] %p", offset, entry_ptr(vp)); } else { printf("[%" PRIu64 "] %p (tagmask=0x%x)", offset, entry_ptr(vp), entry_tagmask(vp)); } if (height == 0) { printf(" (leaf)\n"); return; } n = entry_ptr(vp); assert((radix_tree_sum_node(n) & RADIX_TREE_TAG_MASK) == entry_tagmask(vp)); printf(" (%u children)\n", radix_tree_node_count_ptrs(n)); for (i = 0; i < __arraycount(n->n_ptrs); i++) { void *c; c = n->n_ptrs[i]; if (c == NULL) { continue; } radix_tree_dump_node(t, c, offset + i * (UINT64_C(1) << (RADIX_TREE_BITS_PER_HEIGHT * (height - 1))), height - 1); } } void radix_tree_dump(const struct radix_tree *); void radix_tree_dump(const struct radix_tree *t) { printf("tree %p height=%u\n", t, t->t_height); radix_tree_dump_node(t, t->t_root, 0, t->t_height); } static void test1(void) { struct radix_tree s; struct radix_tree *t = &s; void *results[3]; radix_tree_init_tree(t); radix_tree_dump(t); assert(radix_tree_lookup_node(t, 0) == NULL); assert(radix_tree_lookup_node(t, 1000) == NULL); assert(radix_tree_gang_lookup_node(t, 0, results, 3, false) == 0); assert(radix_tree_gang_lookup_node(t, 0, results, 3, true) == 0); assert(radix_tree_gang_lookup_node(t, 1000, results, 3, false) == 0); assert(radix_tree_gang_lookup_node(t, 1000, results, 3, true) == 0); assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, false) == 0); assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, true) == 0); 
assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, false) == 0); assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, true) == 0); assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, false, 1) == 0); assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, true, 1) == 0); assert(radix_tree_gang_lookup_tagged_node(t, 1000, results, 3, false, 1) == 0); assert(radix_tree_gang_lookup_tagged_node(t, 1000, results, 3, true, 1) == 0); assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3, false, 1) == 0); assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3, true, 1) == 0); assert(radix_tree_gang_lookup_tagged_node_reverse(t, 1000, results, 3, false, 1) == 0); assert(radix_tree_gang_lookup_tagged_node_reverse(t, 1000, results, 3, true, 1) == 0); assert(radix_tree_empty_tree_p(t)); assert(radix_tree_empty_tagged_tree_p(t, 1)); assert(radix_tree_empty_tagged_tree_p(t, 2)); assert(radix_tree_insert_node(t, 0, (void *)0xdeadbea0) == 0); assert(!radix_tree_empty_tree_p(t)); assert(radix_tree_empty_tagged_tree_p(t, 1)); assert(radix_tree_empty_tagged_tree_p(t, 2)); assert(radix_tree_lookup_node(t, 0) == (void *)0xdeadbea0); assert(radix_tree_lookup_node(t, 1000) == NULL); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node(t, 0, results, 3, false) == 1); assert(results[0] == (void *)0xdeadbea0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node(t, 0, results, 3, true) == 1); assert(results[0] == (void *)0xdeadbea0); assert(radix_tree_gang_lookup_node(t, 1000, results, 3, false) == 0); assert(radix_tree_gang_lookup_node(t, 1000, results, 3, true) == 0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, false) == 1); assert(results[0] == (void *)0xdeadbea0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, true) == 1); assert(results[0] == (void *)0xdeadbea0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, false) == 1); assert(results[0] == (void *)0xdeadbea0); assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, true) == 0); assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, false, 1) == 0); assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, true, 1) == 0); assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3, false, 1) == 0); assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3, true, 1) == 0); assert(radix_tree_insert_node(t, 1000, (void *)0xdeadbea0) == 0); assert(radix_tree_remove_node(t, 0) == (void *)0xdeadbea0); assert(!radix_tree_empty_tree_p(t)); radix_tree_dump(t); assert(radix_tree_lookup_node(t, 0) == NULL); assert(radix_tree_lookup_node(t, 1000) == (void *)0xdeadbea0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node(t, 0, results, 3, false) == 1); assert(results[0] == (void *)0xdeadbea0); assert(radix_tree_gang_lookup_node(t, 0, results, 3, true) == 0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node(t, 1000, results, 3, false) == 1); assert(results[0] == (void *)0xdeadbea0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node(t, 1000, results, 3, true) == 1); assert(results[0] == (void *)0xdeadbea0); assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, false) == 0); assert(radix_tree_gang_lookup_node_reverse(t, 0, results, 3, true) == 0); memset(results, 0, sizeof(results)); 
assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, false) == 1); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node_reverse(t, 1000, results, 3, true) == 1); assert(results[0] == (void *)0xdeadbea0); assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, false, 1) == 0); assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 3, true, 1) == 0); assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3, false, 1) == 0); assert(radix_tree_gang_lookup_tagged_node_reverse(t, 0, results, 3, true, 1) == 0); assert(!radix_tree_get_tag(t, 1000, 1)); assert(!radix_tree_get_tag(t, 1000, 2)); assert(radix_tree_get_tag(t, 1000, 2 | 1) == 0); assert(radix_tree_empty_tagged_tree_p(t, 1)); assert(radix_tree_empty_tagged_tree_p(t, 2)); radix_tree_set_tag(t, 1000, 2); assert(!radix_tree_get_tag(t, 1000, 1)); assert(radix_tree_get_tag(t, 1000, 2)); assert(radix_tree_get_tag(t, 1000, 2 | 1) == 2); assert(radix_tree_empty_tagged_tree_p(t, 1)); assert(!radix_tree_empty_tagged_tree_p(t, 2)); radix_tree_dump(t); assert(radix_tree_lookup_node(t, 1000) == (void *)0xdeadbea0); assert(radix_tree_insert_node(t, 0, (void *)0xbea0) == 0); radix_tree_dump(t); assert(radix_tree_lookup_node(t, 0) == (void *)0xbea0); assert(radix_tree_lookup_node(t, 1000) == (void *)0xdeadbea0); assert(radix_tree_insert_node(t, UINT64_C(10000000000), (void *)0xdea0) == 0); radix_tree_dump(t); assert(radix_tree_lookup_node(t, 0) == (void *)0xbea0); assert(radix_tree_lookup_node(t, 1000) == (void *)0xdeadbea0); assert(radix_tree_lookup_node(t, UINT64_C(10000000000)) == (void *)0xdea0); radix_tree_dump(t); assert(!radix_tree_get_tag(t, 0, 2)); assert(radix_tree_get_tag(t, 1000, 2)); assert(!radix_tree_get_tag(t, UINT64_C(10000000000), 1)); radix_tree_set_tag(t, 0, 2); radix_tree_set_tag(t, UINT64_C(10000000000), 2); radix_tree_dump(t); assert(radix_tree_get_tag(t, 0, 2)); assert(radix_tree_get_tag(t, 1000, 2)); assert(radix_tree_get_tag(t, UINT64_C(10000000000), 2)); radix_tree_clear_tag(t, 0, 2); radix_tree_clear_tag(t, UINT64_C(10000000000), 2); radix_tree_dump(t); assert(!radix_tree_get_tag(t, 0, 2)); assert(radix_tree_get_tag(t, 1000, 2)); assert(!radix_tree_get_tag(t, UINT64_C(10000000000), 2)); radix_tree_dump(t); assert(radix_tree_replace_node(t, 1000, (void *)0x12345678) == (void *)0xdeadbea0); assert(!radix_tree_get_tag(t, 1000, 1)); assert(radix_tree_get_tag(t, 1000, 2)); assert(radix_tree_get_tag(t, 1000, 2 | 1) == 2); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node(t, 0, results, 3, false) == 3); assert(results[0] == (void *)0xbea0); assert(results[1] == (void *)0x12345678); assert(results[2] == (void *)0xdea0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node(t, 0, results, 3, true) == 1); assert(results[0] == (void *)0xbea0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node(t, 1, results, 3, false) == 2); assert(results[0] == (void *)0x12345678); assert(results[1] == (void *)0xdea0); assert(radix_tree_gang_lookup_node(t, 1, results, 3, true) == 0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node(t, 1001, results, 3, false) == 1); assert(results[0] == (void *)0xdea0); assert(radix_tree_gang_lookup_node(t, 1001, results, 3, true) == 0); assert(radix_tree_gang_lookup_node(t, UINT64_C(10000000001), results, 3, false) == 0); assert(radix_tree_gang_lookup_node(t, UINT64_C(10000000001), results, 3, true) == 0); assert(radix_tree_gang_lookup_node(t, UINT64_C(1000000000000), results, 3, false) == 
0); assert(radix_tree_gang_lookup_node(t, UINT64_C(1000000000000), results, 3, true) == 0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 100, false, 2) == 1); assert(results[0] == (void *)0x12345678); assert(radix_tree_gang_lookup_tagged_node(t, 0, results, 100, true, 2) == 0); assert(entry_tagmask(t->t_root) != 0); assert(radix_tree_remove_node(t, 1000) == (void *)0x12345678); assert(entry_tagmask(t->t_root) == 0); radix_tree_dump(t); assert(radix_tree_insert_node(t, UINT64_C(10000000001), (void *)0xfff0) == 0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node(t, UINT64_C(10000000000), results, 3, false) == 2); assert(results[0] == (void *)0xdea0); assert(results[1] == (void *)0xfff0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node(t, UINT64_C(10000000000), results, 3, true) == 2); assert(results[0] == (void *)0xdea0); assert(results[1] == (void *)0xfff0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node_reverse(t, UINT64_C(10000000001), results, 3, false) == 3); assert(results[0] == (void *)0xfff0); assert(results[1] == (void *)0xdea0); assert(results[2] == (void *)0xbea0); memset(results, 0, sizeof(results)); assert(radix_tree_gang_lookup_node_reverse(t, UINT64_C(10000000001), results, 3, true) == 2); assert(results[0] == (void *)0xfff0); assert(results[1] == (void *)0xdea0); assert(radix_tree_remove_node(t, UINT64_C(10000000000)) == (void *)0xdea0); assert(radix_tree_remove_node(t, UINT64_C(10000000001)) == (void *)0xfff0); radix_tree_dump(t); assert(radix_tree_remove_node(t, 0) == (void *)0xbea0); radix_tree_dump(t); radix_tree_fini_tree(t); } #include <sys/time.h> struct testnode { uint64_t idx; bool tagged[RADIX_TREE_TAG_ID_MAX]; }; static void printops(const char *title, const char *name, int tag, unsigned int n, const struct timeval *stv, const struct timeval *etv) { uint64_t s = stv->tv_sec * 1000000 + stv->tv_usec; uint64_t e = etv->tv_sec * 1000000 + etv->tv_usec; printf("RESULT %s %s %d %lf op/s\n", title, name, tag, (double)n / (e - s) * 1000000); } #define TEST2_GANG_LOOKUP_NODES 16 static bool test2_should_tag(unsigned int i, unsigned int tagid) { if (tagid == 0) { return (i % 4) == 0; /* 25% */ } else { return (i % 7) == 0; /* 14% */ } return 1; } static void check_tag_count(const unsigned int *ntagged, unsigned int tagmask, unsigned int count) { unsigned int tag; for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) { if ((tagmask & (1 << tag)) == 0) { continue; } if (((tagmask - 1) & tagmask) == 0) { assert(count == ntagged[tag]); } else { assert(count >= ntagged[tag]); } } } static void test2(const char *title, bool dense) { struct radix_tree s; struct radix_tree *t = &s; struct testnode *n; unsigned int i; unsigned int nnodes = 100000; unsigned int removed; unsigned int tag; unsigned int tagmask; unsigned int ntagged[RADIX_TREE_TAG_ID_MAX]; struct testnode *nodes; struct timeval stv; struct timeval etv; nodes = malloc(nnodes * sizeof(*nodes)); for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) { ntagged[tag] = 0; } radix_tree_init_tree(t); for (i = 0; i < nnodes; i++) { n = &nodes[i]; n->idx = random(); if (sizeof(long) == 4) { n->idx <<= 32; n->idx |= (uint32_t)random(); } if (dense) { n->idx %= nnodes * 2; } while (radix_tree_lookup_node(t, n->idx) != NULL) { n->idx++; } radix_tree_insert_node(t, n->idx, n); for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) { tagmask = 1 << tag; n->tagged[tag] = test2_should_tag(i, tag); if (n->tagged[tag]) { 
radix_tree_set_tag(t, n->idx, tagmask); ntagged[tag]++; } assert((n->tagged[tag] ? tagmask : 0) == radix_tree_get_tag(t, n->idx, tagmask)); } } gettimeofday(&stv, NULL); for (i = 0; i < nnodes; i++) { n = &nodes[i]; assert(radix_tree_lookup_node(t, n->idx) == n); } gettimeofday(&etv, NULL); printops(title, "lookup", 0, nnodes, &stv, &etv); for (tagmask = 1; tagmask <= RADIX_TREE_TAG_MASK; tagmask ++) { unsigned int count = 0; gettimeofday(&stv, NULL); for (i = 0; i < nnodes; i++) { unsigned int tagged; n = &nodes[i]; tagged = radix_tree_get_tag(t, n->idx, tagmask); assert((tagged & ~tagmask) == 0); for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) { assert((tagmask & (1 << tag)) == 0 || n->tagged[tag] == !!(tagged & (1 << tag))); } if (tagged) { count++; } } gettimeofday(&etv, NULL); check_tag_count(ntagged, tagmask, count); printops(title, "get_tag", tagmask, nnodes, &stv, &etv); } gettimeofday(&stv, NULL); for (i = 0; i < nnodes; i++) { n = &nodes[i]; radix_tree_remove_node(t, n->idx); } gettimeofday(&etv, NULL); printops(title, "remove", 0, nnodes, &stv, &etv); gettimeofday(&stv, NULL); for (i = 0; i < nnodes; i++) { n = &nodes[i]; radix_tree_insert_node(t, n->idx, n); } gettimeofday(&etv, NULL); printops(title, "insert", 0, nnodes, &stv, &etv); for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) { tagmask = 1 << tag; ntagged[tag] = 0; gettimeofday(&stv, NULL); for (i = 0; i < nnodes; i++) { n = &nodes[i]; if (n->tagged[tag]) { radix_tree_set_tag(t, n->idx, tagmask); ntagged[tag]++; } } gettimeofday(&etv, NULL); printops(title, "set_tag", tag, ntagged[tag], &stv, &etv); } gettimeofday(&stv, NULL); { struct testnode *results[TEST2_GANG_LOOKUP_NODES]; uint64_t nextidx; unsigned int nfound; unsigned int total; nextidx = 0; total = 0; while ((nfound = radix_tree_gang_lookup_node(t, nextidx, (void *)results, __arraycount(results), false)) > 0) { nextidx = results[nfound - 1]->idx + 1; total += nfound; if (nextidx == 0) { break; } } assert(total == nnodes); } gettimeofday(&etv, NULL); printops(title, "ganglookup", 0, nnodes, &stv, &etv); gettimeofday(&stv, NULL); { struct testnode *results[TEST2_GANG_LOOKUP_NODES]; uint64_t nextidx; unsigned int nfound; unsigned int total; nextidx = UINT64_MAX; total = 0; while ((nfound = radix_tree_gang_lookup_node_reverse(t, nextidx, (void *)results, __arraycount(results), false)) > 0) { nextidx = results[nfound - 1]->idx - 1; total += nfound; if (nextidx == UINT64_MAX) { break; } } assert(total == nnodes); } gettimeofday(&etv, NULL); printops(title, "ganglookup_reverse", 0, nnodes, &stv, &etv); for (tagmask = 1; tagmask <= RADIX_TREE_TAG_MASK; tagmask ++) { unsigned int total = 0; gettimeofday(&stv, NULL); { struct testnode *results[TEST2_GANG_LOOKUP_NODES]; uint64_t nextidx; unsigned int nfound; nextidx = 0; while ((nfound = radix_tree_gang_lookup_tagged_node(t, nextidx, (void *)results, __arraycount(results), false, tagmask)) > 0) { nextidx = results[nfound - 1]->idx + 1; total += nfound; } } gettimeofday(&etv, NULL); check_tag_count(ntagged, tagmask, total); assert(tagmask != 0 || total == 0); printops(title, "ganglookup_tag", tagmask, total, &stv, &etv); } for (tagmask = 1; tagmask <= RADIX_TREE_TAG_MASK; tagmask ++) { unsigned int total = 0; gettimeofday(&stv, NULL); { struct testnode *results[TEST2_GANG_LOOKUP_NODES]; uint64_t nextidx; unsigned int nfound; nextidx = UINT64_MAX; while ((nfound = radix_tree_gang_lookup_tagged_node_reverse(t, nextidx, (void *)results, __arraycount(results), false, tagmask)) > 0) { nextidx = results[nfound - 1]->idx - 1; 
total += nfound; if (nextidx == UINT64_MAX) { break; } } } gettimeofday(&etv, NULL); check_tag_count(ntagged, tagmask, total); assert(tagmask != 0 || total == 0); printops(title, "ganglookup_tag_reverse", tagmask, total, &stv, &etv); } removed = 0; for (tag = 0; tag < RADIX_TREE_TAG_ID_MAX; tag++) { unsigned int total; total = 0; tagmask = 1 << tag; gettimeofday(&stv, NULL); { struct testnode *results[TEST2_GANG_LOOKUP_NODES]; uint64_t nextidx; unsigned int nfound; nextidx = 0; while ((nfound = radix_tree_gang_lookup_tagged_node(t, nextidx, (void *)results, __arraycount(results), false, tagmask)) > 0) { for (i = 0; i < nfound; i++) { radix_tree_remove_node(t, results[i]->idx); } nextidx = results[nfound - 1]->idx + 1; total += nfound; if (nextidx == 0) { break; } } } gettimeofday(&etv, NULL); if (tag == 0) { check_tag_count(ntagged, tagmask, total); } else { assert(total <= ntagged[tag]); } printops(title, "ganglookup_tag+remove", tagmask, total, &stv, &etv); removed += total; } gettimeofday(&stv, NULL); { struct testnode *results[TEST2_GANG_LOOKUP_NODES]; uint64_t nextidx; unsigned int nfound; unsigned int total; nextidx = 0; total = 0; while ((nfound = radix_tree_gang_lookup_node(t, nextidx, (void *)results, __arraycount(results), false)) > 0) { for (i = 0; i < nfound; i++) { assert(results[i] == radix_tree_remove_node(t, results[i]->idx)); } nextidx = results[nfound - 1]->idx + 1; total += nfound; if (nextidx == 0) { break; } } assert(total == nnodes - removed); } gettimeofday(&etv, NULL); printops(title, "ganglookup+remove", 0, nnodes - removed, &stv, &etv); assert(radix_tree_empty_tree_p(t)); for (tagmask = 1; tagmask <= RADIX_TREE_TAG_MASK; tagmask ++) { assert(radix_tree_empty_tagged_tree_p(t, tagmask)); } radix_tree_fini_tree(t); free(nodes); } int main(int argc, char *argv[]) { test1(); test2("dense", true); test2("sparse", false); return 0; } #endif /* defined(UNITTEST) */
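/*
 * Illustrative usage sketch (not part of the original file): a minimal
 * example of how a userland caller might drive the API above -- insert a
 * couple of leaves, tag one of them, and scan with the gang-lookup and
 * tagged gang-lookup entry points.  The guard macro RADIXTREE_USAGE_EXAMPLE,
 * the struct name "example_item", and the index values are made up for
 * illustration; only the radix_tree_* calls and the requirement that leaf
 * pointers have their 2 LSBs clear come from the implementation above.
 * Assumes a userland build, where malloc/free and assert(3) are used.
 */
#if defined(RADIXTREE_USAGE_EXAMPLE)
#include <stdio.h>

struct example_item {
	uint64_t value;	/* payload; the struct is aligned, so the 2 LSBs of its address are 0 */
};

static void
example_usage(void)
{
	struct radix_tree t;
	struct example_item a = { .value = 1 }, b = { .value = 2 };
	void *results[4];
	unsigned int n;

	radix_tree_init_tree(&t);

	/* insertion allocates intermediate nodes and can fail with ENOMEM */
	if (radix_tree_insert_node(&t, 3, &a) != 0 ||
	    radix_tree_insert_node(&t, 1000, &b) != 0) {
		printf("out of memory\n");
		return;
	}

	/* plain point lookups */
	assert(radix_tree_lookup_node(&t, 3) == &a);
	assert(radix_tree_lookup_node(&t, 4) == NULL);

	/* tag the leaf at index 1000 with tag id 0 (tagmask 1 << 0) */
	radix_tree_set_tag(&t, 1000, 1);
	assert(radix_tree_get_tag(&t, 1000, 1) != 0);

	/* sparse ascending scan from index 0 finds both leaves */
	n = radix_tree_gang_lookup_node(&t, 0, results, __arraycount(results),
	    false);
	assert(n == 2 && results[0] == &a && results[1] == &b);

	/* a tagged scan only visits leaves carrying the requested tag */
	n = radix_tree_gang_lookup_tagged_node(&t, 0, results,
	    __arraycount(results), false, 1);
	assert(n == 1 && results[0] == &b);

	/* the tree must be emptied before radix_tree_fini_tree() */
	assert(radix_tree_remove_node(&t, 3) == &a);
	assert(radix_tree_remove_node(&t, 1000) == &b);
	radix_tree_fini_tree(&t);
}
#endif /* defined(RADIXTREE_USAGE_EXAMPLE) */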
2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 
/* $NetBSD: if.c,v 1.529 2023/02/24 11:02:45 riastradh Exp $ */ /*- * Copyright (c) 1999, 2000, 2001, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by William Studenmund and Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: if.c,v 1.529 2023/02/24 11:02:45 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_atalk.h" #include "opt_wlan.h" #include "opt_net_mpsafe.h" #include "opt_mrouting.h" #endif #include <sys/param.h> #include <sys/mbuf.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/proc.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/kernel.h> #include <sys/ioctl.h> #include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/xcall.h> #include <sys/cpu.h> #include <sys/intr.h> #include <sys/module_hook.h> #include <sys/compat_stub.h> #include <sys/msan.h> #include <sys/hook.h> #include <net/if.h> #include <net/if_dl.h> #include <net/if_ether.h> #include <net/if_media.h> #include <net80211/ieee80211.h> #include <net80211/ieee80211_ioctl.h> #include <net/if_types.h> #include <net/route.h> #include <sys/module.h> #ifdef NETATALK #include <netatalk/at_extern.h> #include <netatalk/at.h> #endif #include <net/pfil.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/ip_encap.h> #include <net/bpf.h> #ifdef INET6 #include <netinet6/in6_var.h> #include <netinet6/nd6.h> #endif #include "ether.h" #include "bridge.h" #if NBRIDGE > 0 #include <net/if_bridgevar.h> #endif #include "carp.h" #if NCARP > 0 #include <netinet/ip_carp.h> #endif #include <compat/sys/sockio.h> MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); /* * XXX reusing (ifp)->if_snd->ifq_lock rather than having another spin mutex * for each ifnet. It doesn't matter because: * - if IFEF_MPSAFE is enabled, if_snd isn't used and lock contentions on * ifq_lock don't happen * - if IFEF_MPSAFE is disabled, there is no lock contention on ifq_lock * because if_snd, if_link_state_change and if_link_state_change_process * are all called with KERNEL_LOCK */ #define IF_LINK_STATE_CHANGE_LOCK(ifp) \ mutex_enter((ifp)->if_snd.ifq_lock) #define IF_LINK_STATE_CHANGE_UNLOCK(ifp) \ mutex_exit((ifp)->if_snd.ifq_lock) /* * Global list of interfaces. */ /* DEPRECATED. Remove it once kvm(3) users disappeared */ struct ifnet_head ifnet_list; struct pslist_head ifnet_pslist; static ifnet_t ** ifindex2ifnet = NULL; static u_int if_index = 1; static size_t if_indexlim = 0; static uint64_t index_gen; /* Mutex to protect the above objects. 
*/ kmutex_t ifnet_mtx __cacheline_aligned; static struct psref_class *ifnet_psref_class __read_mostly; static pserialize_t ifnet_psz; static struct workqueue *ifnet_link_state_wq __read_mostly; static struct workqueue *if_slowtimo_wq __read_mostly; static kmutex_t if_clone_mtx; struct ifnet *lo0ifp; int ifqmaxlen = IFQ_MAXLEN; struct psref_class *ifa_psref_class __read_mostly; static int if_delroute_matcher(struct rtentry *, void *); static bool if_is_unit(const char *); static struct if_clone *if_clone_lookup(const char *, int *); static LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners); static int if_cloners_count; /* Packet filtering hook for interfaces. */ pfil_head_t * if_pfil __read_mostly; static kauth_listener_t if_listener; static int doifioctl(struct socket *, u_long, void *, struct lwp *); static void sysctl_sndq_setup(struct sysctllog **, const char *, struct ifaltq *); static void if_slowtimo_intr(void *); static void if_slowtimo_work(struct work *, void *); static int sysctl_if_watchdog(SYSCTLFN_PROTO); static void sysctl_watchdog_setup(struct ifnet *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, void *); static int if_transmit(struct ifnet *, struct mbuf *); static int if_clone_create(const char *); static int if_clone_destroy(const char *); static void if_link_state_change_work(struct work *, void *); static void if_up_locked(struct ifnet *); static void _if_down(struct ifnet *); static void if_down_deactivated(struct ifnet *); struct if_percpuq { struct ifnet *ipq_ifp; void *ipq_si; struct percpu *ipq_ifqs; /* struct ifqueue */ }; static struct mbuf *if_percpuq_dequeue(struct if_percpuq *); static void if_percpuq_drops(void *, void *, struct cpu_info *); static int sysctl_percpuq_drops_handler(SYSCTLFN_PROTO); static void sysctl_percpuq_setup(struct sysctllog **, const char *, struct if_percpuq *); struct if_deferred_start { struct ifnet *ids_ifp; void (*ids_if_start)(struct ifnet *); void *ids_si; }; static void if_deferred_start_softint(void *); static void if_deferred_start_common(struct ifnet *); static void if_deferred_start_destroy(struct ifnet *); struct if_slowtimo_data { kmutex_t isd_lock; struct callout isd_ch; struct work isd_work; struct ifnet *isd_ifp; bool isd_queued; bool isd_dying; bool isd_trigger; }; /* * Hook for if_vlan - needed by if_agr */ struct if_vlan_vlan_input_hook_t if_vlan_vlan_input_hook; static void if_sysctl_setup(struct sysctllog **); static int if_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; enum kauth_network_req req; result = KAUTH_RESULT_DEFER; req = (enum kauth_network_req)(uintptr_t)arg1; if (action != KAUTH_NETWORK_INTERFACE) return result; if ((req == KAUTH_REQ_NETWORK_INTERFACE_GET) || (req == KAUTH_REQ_NETWORK_INTERFACE_SET)) result = KAUTH_RESULT_ALLOW; return result; } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ void ifinit(void) { #if (defined(INET) || defined(INET6)) encapinit(); #endif if_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK, if_listener_cb, NULL); /* interfaces are available, inform socket code */ ifioctl = doifioctl; } /* * XXX Initialization before configure(). * XXX hack to get pfil_add_hook working in autoconf. 
*/ void ifinit1(void) { int error __diagused; #ifdef NET_MPSAFE printf("NET_MPSAFE enabled\n"); #endif mutex_init(&if_clone_mtx, MUTEX_DEFAULT, IPL_NONE); TAILQ_INIT(&ifnet_list); mutex_init(&ifnet_mtx, MUTEX_DEFAULT, IPL_NONE); ifnet_psz = pserialize_create(); ifnet_psref_class = psref_class_create("ifnet", IPL_SOFTNET); ifa_psref_class = psref_class_create("ifa", IPL_SOFTNET); error = workqueue_create(&ifnet_link_state_wq, "iflnkst", if_link_state_change_work, NULL, PRI_SOFTNET, IPL_NET, WQ_MPSAFE); KASSERT(error == 0); PSLIST_INIT(&ifnet_pslist); error = workqueue_create(&if_slowtimo_wq, "ifwdog", if_slowtimo_work, NULL, PRI_SOFTNET, IPL_SOFTCLOCK, WQ_MPSAFE); KASSERTMSG(error == 0, "error=%d", error); if_indexlim = 8; if_pfil = pfil_head_create(PFIL_TYPE_IFNET, NULL); KASSERT(if_pfil != NULL); #if NETHER > 0 || defined(NETATALK) || defined(WLAN) etherinit(); #endif } /* XXX must be after domaininit() */ void ifinit_post(void) { if_sysctl_setup(NULL); } ifnet_t * if_alloc(u_char type) { return kmem_zalloc(sizeof(ifnet_t), KM_SLEEP); } void if_free(ifnet_t *ifp) { kmem_free(ifp, sizeof(ifnet_t)); } void if_initname(struct ifnet *ifp, const char *name, int unit) { (void)snprintf(ifp->if_xname, sizeof(ifp->if_xname), "%s%d", name, unit); } /* * Null routines used while an interface is going away. These routines * just return an error. */ int if_nulloutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *so, const struct rtentry *rt) { return ENXIO; } void if_nullinput(struct ifnet *ifp, struct mbuf *m) { /* Nothing. */ } void if_nullstart(struct ifnet *ifp) { /* Nothing. */ } int if_nulltransmit(struct ifnet *ifp, struct mbuf *m) { m_freem(m); return ENXIO; } int if_nullioctl(struct ifnet *ifp, u_long cmd, void *data) { return ENXIO; } int if_nullinit(struct ifnet *ifp) { return ENXIO; } void if_nullstop(struct ifnet *ifp, int disable) { /* Nothing. */ } void if_nullslowtimo(struct ifnet *ifp) { /* Nothing. */ } void if_nulldrain(struct ifnet *ifp) { /* Nothing. 
*/ } void if_set_sadl(struct ifnet *ifp, const void *lla, u_char addrlen, bool factory) { struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_addrlen = addrlen; if_alloc_sadl(ifp); ifa = ifp->if_dl; sdl = satosdl(ifa->ifa_addr); (void)sockaddr_dl_setaddr(sdl, sdl->sdl_len, lla, ifp->if_addrlen); if (factory) { KASSERT(ifp->if_hwdl == NULL); ifp->if_hwdl = ifp->if_dl; ifaref(ifp->if_hwdl); } /* TBD routing socket */ } struct ifaddr * if_dl_create(const struct ifnet *ifp, const struct sockaddr_dl **sdlp) { unsigned socksize, ifasize; int addrlen, namelen; struct sockaddr_dl *mask, *sdl; struct ifaddr *ifa; namelen = strlen(ifp->if_xname); addrlen = ifp->if_addrlen; socksize = roundup(sockaddr_dl_measure(namelen, addrlen), sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = malloc(ifasize, M_IFADDR, M_WAITOK | M_ZERO); sdl = (struct sockaddr_dl *)(ifa + 1); mask = (struct sockaddr_dl *)(socksize + (char *)sdl); sockaddr_dl_init(sdl, socksize, ifp->if_index, ifp->if_type, ifp->if_xname, namelen, NULL, addrlen); mask->sdl_family = AF_LINK; mask->sdl_len = sockaddr_dl_measure(namelen, 0); memset(&mask->sdl_data[0], 0xff, namelen); ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; ifa->ifa_netmask = (struct sockaddr *)mask; ifa_psref_init(ifa); *sdlp = sdl; return ifa; } static void if_sadl_setrefs(struct ifnet *ifp, struct ifaddr *ifa) { const struct sockaddr_dl *sdl; ifp->if_dl = ifa; ifaref(ifa); sdl = satosdl(ifa->ifa_addr); ifp->if_sadl = sdl; } /* * Allocate the link level name for the specified interface. This * is an attachment helper. It must be called after ifp->if_addrlen * is initialized, which may not be the case when if_attach() is * called. */ void if_alloc_sadl(struct ifnet *ifp) { struct ifaddr *ifa; const struct sockaddr_dl *sdl; /* * If the interface already has a link name, release it * now. This is useful for interfaces that can change * link types, and thus switch link names often. */ if (ifp->if_sadl != NULL) if_free_sadl(ifp, 0); ifa = if_dl_create(ifp, &sdl); ifa_insert(ifp, ifa); if_sadl_setrefs(ifp, ifa); } static void if_deactivate_sadl(struct ifnet *ifp) { struct ifaddr *ifa; KASSERT(ifp->if_dl != NULL); ifa = ifp->if_dl; ifp->if_sadl = NULL; ifp->if_dl = NULL; ifafree(ifa); } static void if_replace_sadl(struct ifnet *ifp, struct ifaddr *ifa) { struct ifaddr *old; KASSERT(ifp->if_dl != NULL); old = ifp->if_dl; ifaref(ifa); /* XXX Update if_dl and if_sadl atomically */ ifp->if_dl = ifa; ifp->if_sadl = satosdl(ifa->ifa_addr); ifafree(old); } void if_activate_sadl(struct ifnet *ifp, struct ifaddr *ifa0, const struct sockaddr_dl *sdl) { struct ifaddr *ifa; const int bound = curlwp_bind(); KASSERT(ifa_held(ifa0)); const int s = splsoftnet(); if_replace_sadl(ifp, ifa0); int ss = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { struct psref psref; ifa_acquire(ifa, &psref); pserialize_read_exit(ss); rtinit(ifa, RTM_LLINFO_UPD, 0); ss = pserialize_read_enter(); ifa_release(ifa, &psref); } pserialize_read_exit(ss); splx(s); curlwp_bindx(bound); } /* * Free the link level name for the specified interface. This is * a detach helper. This is called from if_detach(). 
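* When the factory argument is nonzero, the factory link-layer address recorded in ifp->if_hwdl by if_set_sadl() is released as well.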
*/ void if_free_sadl(struct ifnet *ifp, int factory) { struct ifaddr *ifa; if (factory && ifp->if_hwdl != NULL) { ifa = ifp->if_hwdl; ifp->if_hwdl = NULL; ifafree(ifa); } ifa = ifp->if_dl; if (ifa == NULL) { KASSERT(ifp->if_sadl == NULL); return; } KASSERT(ifp->if_sadl != NULL); const int s = splsoftnet(); KASSERT(ifa->ifa_addr->sa_family == AF_LINK); ifa_remove(ifp, ifa); if_deactivate_sadl(ifp); splx(s); } static void if_getindex(ifnet_t *ifp) { bool hitlimit = false; char xnamebuf[HOOKNAMSIZ]; ifp->if_index_gen = index_gen++; snprintf(xnamebuf, sizeof(xnamebuf), "%s-lshk", ifp->if_xname); ifp->if_linkstate_hooks = simplehook_create(IPL_NET, xnamebuf); ifp->if_index = if_index; if (ifindex2ifnet == NULL) { if_index++; goto skip; } while (if_byindex(ifp->if_index)) { /* * If we hit USHRT_MAX, we skip back to 0 since * there are a number of places where the value * of if_index or if_index itself is compared * to or stored in an unsigned short. By * jumping back, we won't botch those assignments * or comparisons. */ if (++if_index == 0) { if_index = 1; } else if (if_index == USHRT_MAX) { /* * However, if we have to jump back to * zero *twice* without finding an empty * slot in ifindex2ifnet[], then there * there are too many (>65535) interfaces. */ if (hitlimit) panic("too many interfaces"); hitlimit = true; if_index = 1; } ifp->if_index = if_index; } skip: /* * ifindex2ifnet is indexed by if_index. Since if_index will * grow dynamically, it should grow too. */ if (ifindex2ifnet == NULL || ifp->if_index >= if_indexlim) { size_t m, n, oldlim; void *q; oldlim = if_indexlim; while (ifp->if_index >= if_indexlim) if_indexlim <<= 1; /* grow ifindex2ifnet */ m = oldlim * sizeof(struct ifnet *); n = if_indexlim * sizeof(struct ifnet *); q = malloc(n, M_IFADDR, M_WAITOK | M_ZERO); if (ifindex2ifnet != NULL) { memcpy(q, ifindex2ifnet, m); free(ifindex2ifnet, M_IFADDR); } ifindex2ifnet = (struct ifnet **)q; } ifindex2ifnet[ifp->if_index] = ifp; } /* * Initialize an interface and assign an index for it. * * It must be called prior to a device specific attach routine * (e.g., ether_ifattach and ieee80211_ifattach) or if_alloc_sadl, * and be followed by if_register: * * if_initialize(ifp); * ether_ifattach(ifp, enaddr); * if_register(ifp); */ void if_initialize(ifnet_t *ifp) { KASSERT(if_indexlim > 0); TAILQ_INIT(&ifp->if_addrlist); /* * Link level name is allocated later by a separate call to * if_alloc_sadl(). */ if (ifp->if_snd.ifq_maxlen == 0) ifp->if_snd.ifq_maxlen = ifqmaxlen; ifp->if_broadcastaddr = 0; /* reliably crash if used uninitialized */ ifp->if_link_state = LINK_STATE_UNKNOWN; ifp->if_link_queue = -1; /* all bits set, see link_state_change() */ ifp->if_link_scheduled = false; ifp->if_capenable = 0; ifp->if_csum_flags_tx = 0; ifp->if_csum_flags_rx = 0; #ifdef ALTQ ifp->if_snd.altq_type = 0; ifp->if_snd.altq_disc = NULL; ifp->if_snd.altq_flags &= ALTQF_CANTCHANGE; ifp->if_snd.altq_tbr = NULL; ifp->if_snd.altq_ifp = ifp; #endif IFQ_LOCK_INIT(&ifp->if_snd); ifp->if_pfil = pfil_head_create(PFIL_TYPE_IFNET, ifp); pfil_run_ifhooks(if_pfil, PFIL_IFNET_ATTACH, ifp); IF_AFDATA_LOCK_INIT(ifp); PSLIST_ENTRY_INIT(ifp, if_pslist_entry); PSLIST_INIT(&ifp->if_addr_pslist); psref_target_init(&ifp->if_psref, ifnet_psref_class); ifp->if_ioctl_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&ifp->if_multiaddrs); if_stats_init(ifp); IFNET_GLOBAL_LOCK(); if_getindex(ifp); IFNET_GLOBAL_UNLOCK(); } /* * Register an interface to the list of "active" interfaces. 
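* It must be preceded by if_initialize() and any device specific attach routine, as described in the comment above if_initialize().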
*/ void if_register(ifnet_t *ifp) { /* * If the driver has not supplied its own if_ioctl or if_stop, * then supply the default. */ if (ifp->if_ioctl == NULL) ifp->if_ioctl = ifioctl_common; if (ifp->if_stop == NULL) ifp->if_stop = if_nullstop; sysctl_sndq_setup(&ifp->if_sysctl_log, ifp->if_xname, &ifp->if_snd); if (!STAILQ_EMPTY(&domains)) if_attachdomain1(ifp); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); if (ifp->if_slowtimo != NULL) { struct if_slowtimo_data *isd; isd = kmem_zalloc(sizeof(*isd), KM_SLEEP); mutex_init(&isd->isd_lock, MUTEX_DEFAULT, IPL_SOFTCLOCK); callout_init(&isd->isd_ch, CALLOUT_MPSAFE); callout_setfunc(&isd->isd_ch, if_slowtimo_intr, ifp); isd->isd_ifp = ifp; ifp->if_slowtimo_data = isd; if_slowtimo_intr(ifp); sysctl_watchdog_setup(ifp); } if (ifp->if_transmit == NULL || ifp->if_transmit == if_nulltransmit) ifp->if_transmit = if_transmit; IFNET_GLOBAL_LOCK(); TAILQ_INSERT_TAIL(&ifnet_list, ifp, if_list); IFNET_WRITER_INSERT_TAIL(ifp); IFNET_GLOBAL_UNLOCK(); } /* * The if_percpuq framework * * It allows network device drivers to execute the network stack * in softint (so called softint-based if_input). It utilizes * softint and percpu ifqueue. It doesn't distribute any packets * between CPUs, unlike pktqueue(9). * * Currently we support two options for device drivers to apply the framework: * - Use it implicitly with less changes * - If you use if_attach in driver's _attach function and if_input in * driver's Rx interrupt handler, a packet is queued and a softint handles * the packet implicitly * - Use it explicitly in each driver (recommended) * - You can use if_percpuq_* directly in your driver * - In this case, you need to allocate struct if_percpuq in driver's softc * - See wm(4) as a reference implementation */ static void if_percpuq_softint(void *arg) { struct if_percpuq *ipq = arg; struct ifnet *ifp = ipq->ipq_ifp; struct mbuf *m; while ((m = if_percpuq_dequeue(ipq)) != NULL) { if_statinc(ifp, if_ipackets); bpf_mtap(ifp, m, BPF_D_IN); ifp->_if_input(ifp, m); } } static void if_percpuq_init_ifq(void *p, void *arg __unused, struct cpu_info *ci __unused) { struct ifqueue *const ifq = p; memset(ifq, 0, sizeof(*ifq)); ifq->ifq_maxlen = IFQ_MAXLEN; } struct if_percpuq * if_percpuq_create(struct ifnet *ifp) { struct if_percpuq *ipq; u_int flags = SOFTINT_NET; flags |= if_is_mpsafe(ifp) ? 
SOFTINT_MPSAFE : 0; ipq = kmem_zalloc(sizeof(*ipq), KM_SLEEP); ipq->ipq_ifp = ifp; ipq->ipq_si = softint_establish(flags, if_percpuq_softint, ipq); ipq->ipq_ifqs = percpu_alloc(sizeof(struct ifqueue)); percpu_foreach(ipq->ipq_ifqs, &if_percpuq_init_ifq, NULL); sysctl_percpuq_setup(&ifp->if_sysctl_log, ifp->if_xname, ipq); return ipq; } static struct mbuf * if_percpuq_dequeue(struct if_percpuq *ipq) { struct mbuf *m; struct ifqueue *ifq; const int s = splnet(); ifq = percpu_getref(ipq->ipq_ifqs); IF_DEQUEUE(ifq, m); percpu_putref(ipq->ipq_ifqs); splx(s); return m; } static void if_percpuq_purge_ifq(void *p, void *arg __unused, struct cpu_info *ci __unused) { struct ifqueue *const ifq = p; IF_PURGE(ifq); } void if_percpuq_destroy(struct if_percpuq *ipq) { /* if_detach may already destroy it */ if (ipq == NULL) return; softint_disestablish(ipq->ipq_si); percpu_foreach(ipq->ipq_ifqs, &if_percpuq_purge_ifq, NULL); percpu_free(ipq->ipq_ifqs, sizeof(struct ifqueue)); kmem_free(ipq, sizeof(*ipq)); } void if_percpuq_enqueue(struct if_percpuq *ipq, struct mbuf *m) { struct ifqueue *ifq; KASSERT(ipq != NULL); const int s = splnet(); ifq = percpu_getref(ipq->ipq_ifqs); if (IF_QFULL(ifq)) { IF_DROP(ifq); percpu_putref(ipq->ipq_ifqs); m_freem(m); goto out; } IF_ENQUEUE(ifq, m); percpu_putref(ipq->ipq_ifqs); softint_schedule(ipq->ipq_si); out: splx(s); } static void if_percpuq_drops(void *p, void *arg, struct cpu_info *ci __unused) { struct ifqueue *const ifq = p; uint64_t *sum = arg; *sum += ifq->ifq_drops; } static int sysctl_percpuq_drops_handler(SYSCTLFN_ARGS) { struct sysctlnode node; struct if_percpuq *ipq; uint64_t sum = 0; int error; node = *rnode; ipq = node.sysctl_data; percpu_foreach(ipq->ipq_ifqs, if_percpuq_drops, &sum); node.sysctl_data = &sum; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error != 0 || newp == NULL) return error; return 0; } static void sysctl_percpuq_setup(struct sysctllog **clog, const char* ifname, struct if_percpuq *ipq) { const struct sysctlnode *cnode, *rnode; if (sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "interfaces", SYSCTL_DESCR("Per-interface controls"), NULL, 0, NULL, 0, CTL_NET, CTL_CREATE, CTL_EOL) != 0) goto bad; if (sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, ifname, SYSCTL_DESCR("Interface controls"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; if (sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "rcvq", SYSCTL_DESCR("Interface input queue controls"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; #ifdef NOTYET /* XXX Should show each per-CPU queue length? 
*/ if (sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_INT, "len", SYSCTL_DESCR("Current input queue length"), sysctl_percpuq_len, 0, NULL, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; if (sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "maxlen", SYSCTL_DESCR("Maximum allowed input queue length"), sysctl_percpuq_maxlen_handler, 0, (void *)ipq, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; #endif if (sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "drops", SYSCTL_DESCR("Total packets dropped due to full input queue"), sysctl_percpuq_drops_handler, 0, (void *)ipq, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; return; bad: printf("%s: could not attach sysctl nodes\n", ifname); return; } /* * The deferred if_start framework * * The common APIs to defer if_start to softint when if_start is requested * from a device driver running in hardware interrupt context. */ /* * Call ifp->if_start (or equivalent) in a dedicated softint for * deferred if_start. */ static void if_deferred_start_softint(void *arg) { struct if_deferred_start *ids = arg; struct ifnet *ifp = ids->ids_ifp; ids->ids_if_start(ifp); } /* * The default callback function for deferred if_start. */ static void if_deferred_start_common(struct ifnet *ifp) { const int s = splnet(); if_start_lock(ifp); splx(s); } static inline bool if_snd_is_used(struct ifnet *ifp) { return ALTQ_IS_ENABLED(&ifp->if_snd) || ifp->if_transmit == if_transmit || ifp->if_transmit == NULL || ifp->if_transmit == if_nulltransmit; } /* * Schedule deferred if_start. */ void if_schedule_deferred_start(struct ifnet *ifp) { KASSERT(ifp->if_deferred_start != NULL); if (if_snd_is_used(ifp) && IFQ_IS_EMPTY(&ifp->if_snd)) return; softint_schedule(ifp->if_deferred_start->ids_si); } /* * Create an instance of deferred if_start. A driver should call the function * only if the driver needs deferred if_start. Drivers can setup their own * deferred if_start function via 2nd argument. */ void if_deferred_start_init(struct ifnet *ifp, void (*func)(struct ifnet *)) { struct if_deferred_start *ids; u_int flags = SOFTINT_NET; flags |= if_is_mpsafe(ifp) ? SOFTINT_MPSAFE : 0; ids = kmem_zalloc(sizeof(*ids), KM_SLEEP); ids->ids_ifp = ifp; ids->ids_si = softint_establish(flags, if_deferred_start_softint, ids); if (func != NULL) ids->ids_if_start = func; else ids->ids_if_start = if_deferred_start_common; ifp->if_deferred_start = ids; } static void if_deferred_start_destroy(struct ifnet *ifp) { if (ifp->if_deferred_start == NULL) return; softint_disestablish(ifp->if_deferred_start->ids_si); kmem_free(ifp->if_deferred_start, sizeof(*ifp->if_deferred_start)); ifp->if_deferred_start = NULL; } /* * The common interface input routine that is called by device drivers, * which should be used only when the driver's rx handler already runs * in softint. */ void if_input(struct ifnet *ifp, struct mbuf *m) { KASSERT(ifp->if_percpuq == NULL); KASSERT(!cpu_intr_p()); if_statinc(ifp, if_ipackets); bpf_mtap(ifp, m, BPF_D_IN); ifp->_if_input(ifp, m); } /* * DEPRECATED. Use if_initialize and if_register instead. * See the above comment of if_initialize. * * Note that it implicitly enables if_percpuq to make drivers easy to * migrate softint-based if_input without much changes. If you don't * want to enable it, use if_initialize instead. 
*/ void if_attach(ifnet_t *ifp) { if_initialize(ifp); ifp->if_percpuq = if_percpuq_create(ifp); if_register(ifp); } void if_attachdomain(void) { struct ifnet *ifp; const int bound = curlwp_bind(); int s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { struct psref psref; psref_acquire(&psref, &ifp->if_psref, ifnet_psref_class); pserialize_read_exit(s); if_attachdomain1(ifp); s = pserialize_read_enter(); psref_release(&psref, &ifp->if_psref, ifnet_psref_class); } pserialize_read_exit(s); curlwp_bindx(bound); } static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; const int s = splsoftnet(); /* address family dependent data region */ memset(ifp->if_afdata, 0, sizeof(ifp->if_afdata)); DOMAIN_FOREACH(dp) { if (dp->dom_ifattach != NULL) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } splx(s); } /* * Deactivate an interface. This points all of the procedure * handles at error stubs. May be called from interrupt context. */ void if_deactivate(struct ifnet *ifp) { const int s = splsoftnet(); ifp->if_output = if_nulloutput; ifp->_if_input = if_nullinput; ifp->if_start = if_nullstart; ifp->if_transmit = if_nulltransmit; ifp->if_ioctl = if_nullioctl; ifp->if_init = if_nullinit; ifp->if_stop = if_nullstop; if (ifp->if_slowtimo) ifp->if_slowtimo = if_nullslowtimo; ifp->if_drain = if_nulldrain; /* No more packets may be enqueued. */ ifp->if_snd.ifq_maxlen = 0; splx(s); } bool if_is_deactivated(const struct ifnet *ifp) { return ifp->if_output == if_nulloutput; } void if_purgeaddrs(struct ifnet *ifp, int family, void (*purgeaddr)(struct ifaddr *)) { struct ifaddr *ifa, *nifa; int s; s = pserialize_read_enter(); for (ifa = IFADDR_READER_FIRST(ifp); ifa; ifa = nifa) { nifa = IFADDR_READER_NEXT(ifa); if (ifa->ifa_addr->sa_family != family) continue; pserialize_read_exit(s); (*purgeaddr)(ifa); s = pserialize_read_enter(); } pserialize_read_exit(s); } #ifdef IFAREF_DEBUG static struct ifaddr **ifa_list; static int ifa_list_size; /* Depends on only one if_attach runs at once */ static void if_build_ifa_list(struct ifnet *ifp) { struct ifaddr *ifa; int i; KASSERT(ifa_list == NULL); KASSERT(ifa_list_size == 0); IFADDR_READER_FOREACH(ifa, ifp) ifa_list_size++; ifa_list = kmem_alloc(sizeof(*ifa) * ifa_list_size, KM_SLEEP); i = 0; IFADDR_READER_FOREACH(ifa, ifp) { ifa_list[i++] = ifa; ifaref(ifa); } } static void if_check_and_free_ifa_list(struct ifnet *ifp) { int i; struct ifaddr *ifa; if (ifa_list == NULL) return; for (i = 0; i < ifa_list_size; i++) { char buf[64]; ifa = ifa_list[i]; sockaddr_format(ifa->ifa_addr, buf, sizeof(buf)); if (ifa->ifa_refcnt > 1) { log(LOG_WARNING, "ifa(%s) still referenced (refcnt=%d)\n", buf, ifa->ifa_refcnt - 1); } else log(LOG_DEBUG, "ifa(%s) not referenced (refcnt=%d)\n", buf, ifa->ifa_refcnt - 1); ifafree(ifa); } kmem_free(ifa_list, sizeof(*ifa) * ifa_list_size); ifa_list = NULL; ifa_list_size = 0; } #endif /* * Detach an interface from the list of "active" interfaces, * freeing any resources as we go along. * * NOTE: This routine must be called with a valid thread context, * as it may block. */ void if_detach(struct ifnet *ifp) { struct socket so; struct ifaddr *ifa; #ifdef IFAREF_DEBUG struct ifaddr *last_ifa = NULL; #endif struct domain *dp; const struct protosw *pr; int i, family, purged; #ifdef IFAREF_DEBUG if_build_ifa_list(ifp); #endif /* * XXX It's kind of lame that we have to have the * XXX socket structure... 
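* XXX (the pr_purgeif calls below take a struct socket, so a zeroed one is faked up on the stack).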
*/ memset(&so, 0, sizeof(so)); const int s = splnet(); sysctl_teardown(&ifp->if_sysctl_log); IFNET_LOCK(ifp); /* * Unset all queued link states and pretend a * link state change is scheduled. * This stops any more link state changes occurring for this * interface while it's being detached so it's safe * to drain the workqueue. */ IF_LINK_STATE_CHANGE_LOCK(ifp); ifp->if_link_queue = -1; /* all bits set, see link_state_change() */ ifp->if_link_scheduled = true; IF_LINK_STATE_CHANGE_UNLOCK(ifp); workqueue_wait(ifnet_link_state_wq, &ifp->if_link_work); if_deactivate(ifp); IFNET_UNLOCK(ifp); /* * Unlink from the list and wait for all readers to leave * from pserialize read sections. Note that we can't do * psref_target_destroy here. See below. */ IFNET_GLOBAL_LOCK(); ifindex2ifnet[ifp->if_index] = NULL; TAILQ_REMOVE(&ifnet_list, ifp, if_list); IFNET_WRITER_REMOVE(ifp); pserialize_perform(ifnet_psz); IFNET_GLOBAL_UNLOCK(); if (ifp->if_slowtimo != NULL) { struct if_slowtimo_data *isd = ifp->if_slowtimo_data; mutex_enter(&isd->isd_lock); isd->isd_dying = true; mutex_exit(&isd->isd_lock); callout_halt(&isd->isd_ch, NULL); workqueue_wait(if_slowtimo_wq, &isd->isd_work); callout_destroy(&isd->isd_ch); mutex_destroy(&isd->isd_lock); kmem_free(isd, sizeof(*isd)); ifp->if_slowtimo_data = NULL; /* paraonia */ ifp->if_slowtimo = NULL; /* paranoia */ } if_deferred_start_destroy(ifp); /* * Do an if_down() to give protocols a chance to do something. */ if_down_deactivated(ifp); #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif #if NCARP > 0 /* Remove the interface from any carp group it is a part of. */ if (ifp->if_carp != NULL && ifp->if_type != IFT_CARP) carp_ifdetach(ifp); #endif /* * Ensure that all packets on protocol input pktqueues have been * processed, or, at least, removed from the queues. * * A cross-call will ensure that the interrupts have completed. * FIXME: not quite.. */ pktq_ifdetach(); xc_barrier(0); /* * Rip all the addresses off the interface. This should make * all of the routes go away. * * pr_usrreq calls can remove an arbitrary number of ifaddrs * from the list, including our "cursor", ifa. For safety, * and to honor the TAILQ abstraction, I just restart the * loop after each removal. Note that the loop will exit * when all of the remaining ifaddrs belong to the AF_LINK * family. I am counting on the historical fact that at * least one pr_usrreq in each address domain removes at * least one ifaddr. */ again: /* * At this point, no other one tries to remove ifa in the list, * so we don't need to take a lock or psref. Avoid using * IFADDR_READER_FOREACH to pass over an inspection of contract * violations of pserialize. */ IFADDR_WRITER_FOREACH(ifa, ifp) { family = ifa->ifa_addr->sa_family; #ifdef IFAREF_DEBUG printf("if_detach: ifaddr %p, family %d, refcnt %d\n", ifa, family, ifa->ifa_refcnt); if (last_ifa != NULL && ifa == last_ifa) panic("if_detach: loop detected"); last_ifa = ifa; #endif if (family == AF_LINK) continue; dp = pffinddomain(family); KASSERTMSG(dp != NULL, "no domain for AF %d", family); /* * XXX These PURGEIF calls are redundant with the * purge-all-families calls below, but are left in for * now both to make a smaller change, and to avoid * unplanned interactions with clearing of * ifp->if_addrlist. 
*/ purged = 0; for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { so.so_proto = pr; if (pr->pr_usrreqs) { (void) (*pr->pr_usrreqs->pr_purgeif)(&so, ifp); purged = 1; } } if (purged == 0) { /* * XXX What's really the best thing to do * XXX here? --thorpej@NetBSD.org */ printf("if_detach: WARNING: AF %d not purged\n", family); ifa_remove(ifp, ifa); } goto again; } if_free_sadl(ifp, 1); restart: IFADDR_WRITER_FOREACH(ifa, ifp) { family = ifa->ifa_addr->sa_family; KASSERT(family == AF_LINK); ifa_remove(ifp, ifa); goto restart; } /* Delete stray routes from the routing table. */ for (i = 0; i <= AF_MAX; i++) rt_delete_matched_entries(i, if_delroute_matcher, ifp, false); DOMAIN_FOREACH(dp) { if (dp->dom_ifdetach != NULL && ifp->if_afdata[dp->dom_family]) { void *p = ifp->if_afdata[dp->dom_family]; if (p) { ifp->if_afdata[dp->dom_family] = NULL; (*dp->dom_ifdetach)(ifp, p); } } /* * One would expect multicast memberships (INET and * INET6) on UDP sockets to be purged by the PURGEIF * calls above, but if all addresses were removed from * the interface prior to destruction, the calls will * not be made (e.g. ppp, for which pppd(8) generally * removes addresses before destroying the interface). * Because there is no invariant that multicast * memberships only exist for interfaces with IPv4 * addresses, we must call PURGEIF regardless of * addresses. (Protocols which might store ifnet * pointers are marked with PR_PURGEIF.) */ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { so.so_proto = pr; if (pr->pr_usrreqs && pr->pr_flags & PR_PURGEIF) (void)(*pr->pr_usrreqs->pr_purgeif)(&so, ifp); } } /* * Must be done after the above pr_purgeif because if_psref may be * still used in pr_purgeif. */ psref_target_destroy(&ifp->if_psref, ifnet_psref_class); PSLIST_ENTRY_DESTROY(ifp, if_pslist_entry); pfil_run_ifhooks(if_pfil, PFIL_IFNET_DETACH, ifp); (void)pfil_head_destroy(ifp->if_pfil); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); IF_AFDATA_LOCK_DESTROY(ifp); if (ifp->if_percpuq != NULL) { if_percpuq_destroy(ifp->if_percpuq); ifp->if_percpuq = NULL; } mutex_obj_free(ifp->if_ioctl_lock); ifp->if_ioctl_lock = NULL; mutex_obj_free(ifp->if_snd.ifq_lock); if_stats_fini(ifp); KASSERT(!simplehook_has_hooks(ifp->if_linkstate_hooks)); simplehook_destroy(ifp->if_linkstate_hooks); splx(s); #ifdef IFAREF_DEBUG if_check_and_free_ifa_list(ifp); #endif } /* * Callback for a radix tree walk to delete all references to an * ifnet. */ static int if_delroute_matcher(struct rtentry *rt, void *v) { struct ifnet *ifp = (struct ifnet *)v; if (rt->rt_ifp == ifp) return 1; else return 0; } /* * Create a clone network interface. */ static int if_clone_create(const char *name) { struct if_clone *ifc; struct ifnet *ifp; struct psref psref; int unit; KASSERT(mutex_owned(&if_clone_mtx)); ifc = if_clone_lookup(name, &unit); if (ifc == NULL) return EINVAL; ifp = if_get(name, &psref); if (ifp != NULL) { if_put(ifp, &psref); return EEXIST; } return (*ifc->ifc_create)(ifc, unit); } /* * Destroy a clone network interface. 
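* The interface's ioctl handler is replaced with if_nullioctl while the clone is destroyed, so configuration requests cannot race with teardown; it is restored if the destroy callback fails.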
*/ static int if_clone_destroy(const char *name) { struct if_clone *ifc; struct ifnet *ifp; struct psref psref; int error; int (*if_ioctlfn)(struct ifnet *, u_long, void *); KASSERT(mutex_owned(&if_clone_mtx)); ifc = if_clone_lookup(name, NULL); if (ifc == NULL) return EINVAL; if (ifc->ifc_destroy == NULL) return EOPNOTSUPP; ifp = if_get(name, &psref); if (ifp == NULL) return ENXIO; /* We have to disable ioctls here */ IFNET_LOCK(ifp); if_ioctlfn = ifp->if_ioctl; ifp->if_ioctl = if_nullioctl; IFNET_UNLOCK(ifp); /* * We cannot call ifc_destroy with holding ifp. * Releasing ifp here is safe thanks to if_clone_mtx. */ if_put(ifp, &psref); error = (*ifc->ifc_destroy)(ifp); if (error != 0) { /* We have to restore if_ioctl on error */ IFNET_LOCK(ifp); ifp->if_ioctl = if_ioctlfn; IFNET_UNLOCK(ifp); } return error; } static bool if_is_unit(const char *name) { while (*name != '\0') { if (*name < '0' || *name > '9') return false; name++; } return true; } /* * Look up a network interface cloner. */ static struct if_clone * if_clone_lookup(const char *name, int *unitp) { struct if_clone *ifc; const char *cp; char *dp, ifname[IFNAMSIZ + 3]; int unit; KASSERT(mutex_owned(&if_clone_mtx)); strcpy(ifname, "if_"); /* separate interface name from unit */ /* TODO: search unit number from backward */ for (dp = ifname + 3, cp = name; cp - name < IFNAMSIZ && *cp && !if_is_unit(cp);) *dp++ = *cp++; if (cp == name || cp - name == IFNAMSIZ || !*cp) return NULL; /* No name or unit number */ *dp++ = '\0'; again: LIST_FOREACH(ifc, &if_cloners, ifc_list) { if (strcmp(ifname + 3, ifc->ifc_name) == 0) break; } if (ifc == NULL) { int error; if (*ifname == '\0') return NULL; mutex_exit(&if_clone_mtx); error = module_autoload(ifname, MODULE_CLASS_DRIVER); mutex_enter(&if_clone_mtx); if (error) return NULL; *ifname = '\0'; goto again; } unit = 0; while (cp - name < IFNAMSIZ && *cp) { if (*cp < '0' || *cp > '9' || unit >= INT_MAX / 10) { /* Bogus unit number. */ return NULL; } unit = (unit * 10) + (*cp++ - '0'); } if (unitp != NULL) *unitp = unit; return ifc; } /* * Register a network interface cloner. */ void if_clone_attach(struct if_clone *ifc) { mutex_enter(&if_clone_mtx); LIST_INSERT_HEAD(&if_cloners, ifc, ifc_list); if_cloners_count++; mutex_exit(&if_clone_mtx); } /* * Unregister a network interface cloner. */ void if_clone_detach(struct if_clone *ifc) { mutex_enter(&if_clone_mtx); LIST_REMOVE(ifc, ifc_list); if_cloners_count--; mutex_exit(&if_clone_mtx); } /* * Provide list of interface cloners to userspace. */ int if_clone_list(int buf_count, char *buffer, int *total) { char outbuf[IFNAMSIZ], *dst; struct if_clone *ifc; int count, error = 0; mutex_enter(&if_clone_mtx); *total = if_cloners_count; if ((dst = buffer) == NULL) { /* Just asking how many there are. */ goto out; } if (buf_count < 0) { error = EINVAL; goto out; } count = (if_cloners_count < buf_count) ? 
if_cloners_count : buf_count; for (ifc = LIST_FIRST(&if_cloners); ifc != NULL && count != 0; ifc = LIST_NEXT(ifc, ifc_list), count--, dst += IFNAMSIZ) { (void)strncpy(outbuf, ifc->ifc_name, sizeof(outbuf)); if (outbuf[sizeof(outbuf) - 1] != '\0') { error = ENAMETOOLONG; goto out; } error = copyout(outbuf, dst, sizeof(outbuf)); if (error != 0) break; } out: mutex_exit(&if_clone_mtx); return error; } void ifa_psref_init(struct ifaddr *ifa) { psref_target_init(&ifa->ifa_psref, ifa_psref_class); } void ifaref(struct ifaddr *ifa) { atomic_inc_uint(&ifa->ifa_refcnt); } void ifafree(struct ifaddr *ifa) { KASSERT(ifa != NULL); KASSERTMSG(ifa->ifa_refcnt > 0, "ifa_refcnt=%d", ifa->ifa_refcnt); membar_release(); if (atomic_dec_uint_nv(&ifa->ifa_refcnt) != 0) return; membar_acquire(); free(ifa, M_IFADDR); } bool ifa_is_destroying(struct ifaddr *ifa) { return ISSET(ifa->ifa_flags, IFA_DESTROYING); } void ifa_insert(struct ifnet *ifp, struct ifaddr *ifa) { ifa->ifa_ifp = ifp; /* * Check MP-safety for IFEF_MPSAFE drivers. * Check !IFF_RUNNING for initialization routines that normally don't * take IFNET_LOCK but it's safe because there is no competitor. * XXX there are false positive cases because IFF_RUNNING can be off on * if_stop. */ KASSERT(!if_is_mpsafe(ifp) || !ISSET(ifp->if_flags, IFF_RUNNING) || IFNET_LOCKED(ifp)); TAILQ_INSERT_TAIL(&ifp->if_addrlist, ifa, ifa_list); IFADDR_ENTRY_INIT(ifa); IFADDR_WRITER_INSERT_TAIL(ifp, ifa); ifaref(ifa); } void ifa_remove(struct ifnet *ifp, struct ifaddr *ifa) { KASSERT(ifa->ifa_ifp == ifp); /* * Check MP-safety for IFEF_MPSAFE drivers. * if_is_deactivated indicates ifa_remove is called from if_detach * where it is safe even if IFNET_LOCK isn't held. */ KASSERT(!if_is_mpsafe(ifp) || if_is_deactivated(ifp) || IFNET_LOCKED(ifp)); TAILQ_REMOVE(&ifp->if_addrlist, ifa, ifa_list); IFADDR_WRITER_REMOVE(ifa); #ifdef NET_MPSAFE IFNET_GLOBAL_LOCK(); pserialize_perform(ifnet_psz); IFNET_GLOBAL_UNLOCK(); #endif #ifdef NET_MPSAFE psref_target_destroy(&ifa->ifa_psref, ifa_psref_class); #endif IFADDR_ENTRY_DESTROY(ifa); ifafree(ifa); } void ifa_acquire(struct ifaddr *ifa, struct psref *psref) { PSREF_DEBUG_FILL_RETURN_ADDRESS(psref); psref_acquire(psref, &ifa->ifa_psref, ifa_psref_class); } void ifa_release(struct ifaddr *ifa, struct psref *psref) { if (ifa == NULL) return; psref_release(psref, &ifa->ifa_psref, ifa_psref_class); } bool ifa_held(struct ifaddr *ifa) { return psref_held(&ifa->ifa_psref, ifa_psref_class); } static inline int equal(const struct sockaddr *sa1, const struct sockaddr *sa2) { return sockaddr_cmp(sa1, sa2) == 0; } /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_READER_FOREACH(ifp) { if (if_is_deactivated(ifp)) continue; IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (equal(addr, ifa->ifa_addr)) return ifa; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && /* IP6 doesn't have broadcast */ ifa->ifa_broadaddr->sa_len != 0 && equal(ifa->ifa_broadaddr, addr)) return ifa; } } return NULL; } struct ifaddr * ifa_ifwithaddr_psref(const struct sockaddr *addr, struct psref *psref) { struct ifaddr *ifa; int s = pserialize_read_enter(); ifa = ifa_ifwithaddr(addr); if (ifa != NULL) ifa_acquire(ifa, psref); pserialize_read_exit(s); return ifa; } /* * Locate the point to point interface with a given destination address. 
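* Only interfaces with IFF_POINTOPOINT set are considered.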
*/ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_READER_FOREACH(ifp) { if (if_is_deactivated(ifp)) continue; if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != addr->sa_family || ifa->ifa_dstaddr == NULL) continue; if (equal(addr, ifa->ifa_dstaddr)) return ifa; } } return NULL; } struct ifaddr * ifa_ifwithdstaddr_psref(const struct sockaddr *addr, struct psref *psref) { struct ifaddr *ifa; int s; s = pserialize_read_enter(); ifa = ifa_ifwithdstaddr(addr); if (ifa != NULL) ifa_acquire(ifa, psref); pserialize_read_exit(s); return ifa; } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa, *ifa_maybe = NULL; const struct sockaddr_dl *sdl; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; if (af == AF_LINK) { sdl = satocsdl(addr); if (sdl->sdl_index && sdl->sdl_index < if_indexlim && ifindex2ifnet[sdl->sdl_index] && !if_is_deactivated(ifindex2ifnet[sdl->sdl_index])) { return ifindex2ifnet[sdl->sdl_index]->if_dl; } } #ifdef NETATALK if (af == AF_APPLETALK) { const struct sockaddr_at *sat, *sat2; sat = (const struct sockaddr_at *)addr; IFNET_READER_FOREACH(ifp) { if (if_is_deactivated(ifp)) continue; ifa = at_ifawithnet((const struct sockaddr_at *)addr, ifp); if (ifa == NULL) continue; sat2 = (struct sockaddr_at *)ifa->ifa_addr; if (sat2->sat_addr.s_net == sat->sat_addr.s_net) return ifa; /* exact match */ if (ifa_maybe == NULL) { /* else keep the if with the right range */ ifa_maybe = ifa; } } return ifa_maybe; } #endif IFNET_READER_FOREACH(ifp) { if (if_is_deactivated(ifp)) continue; IFADDR_READER_FOREACH(ifa, ifp) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af || ifa->ifa_netmask == NULL) next: continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = (const char *)ifa->ifa_netmask + ifa->ifa_netmask->sa_len; while (cp3 < cplim) { if ((*cp++ ^ *cp2++) & *cp3++) { /* want to continue for() loop */ goto next; } } if (ifa_maybe == NULL || rt_refines(ifa->ifa_netmask, ifa_maybe->ifa_netmask)) ifa_maybe = ifa; } } return ifa_maybe; } struct ifaddr * ifa_ifwithnet_psref(const struct sockaddr *addr, struct psref *psref) { struct ifaddr *ifa; int s; s = pserialize_read_enter(); ifa = ifa_ifwithnet(addr); if (ifa != NULL) ifa_acquire(ifa, psref); pserialize_read_exit(s); return ifa; } /* * Find the interface of the address. */ struct ifaddr * ifa_ifwithladdr(const struct sockaddr *addr) { struct ifaddr *ia; if ((ia = ifa_ifwithaddr(addr)) || (ia = ifa_ifwithdstaddr(addr)) || (ia = ifa_ifwithnet(addr))) return ia; return NULL; } struct ifaddr * ifa_ifwithladdr_psref(const struct sockaddr *addr, struct psref *psref) { struct ifaddr *ifa; int s; s = pserialize_read_enter(); ifa = ifa_ifwithladdr(addr); if (ifa != NULL) ifa_acquire(ifa, psref); pserialize_read_exit(s); return ifa; } /* * Find an interface using a specific address family */ struct ifaddr * ifa_ifwithaf(int af) { struct ifnet *ifp; struct ifaddr *ifa = NULL; int s; s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { if (if_is_deactivated(ifp)) continue; IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family == af) goto out; } } out: pserialize_read_exit(s); return ifa; } /* * Find an interface address specific to an interface best matching * a given address. 
*/ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; const char *cplim; struct ifaddr *ifa_maybe = 0; u_int af = addr->sa_family; if (if_is_deactivated(ifp)) return NULL; if (af >= AF_MAX) return NULL; IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != af) continue; ifa_maybe = ifa; if (ifa->ifa_netmask == NULL) { if (equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr))) return ifa; continue; } cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) { if ((*cp++ ^ *cp2++) & *cp3) break; } if (cp3 == cplim) return ifa; } return ifa_maybe; } struct ifaddr * ifaof_ifpforaddr_psref(const struct sockaddr *addr, struct ifnet *ifp, struct psref *psref) { struct ifaddr *ifa; int s; s = pserialize_read_enter(); ifa = ifaof_ifpforaddr(addr, ifp); if (ifa != NULL) ifa_acquire(ifa, psref); pserialize_read_exit(s); return ifa; } /* * Default action when installing a route with a Link Level gateway. * Lookup an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually. */ void link_rtrequest(int cmd, struct rtentry *rt, const struct rt_addrinfo *info) { struct ifaddr *ifa; const struct sockaddr *dst; struct ifnet *ifp; struct psref psref; if (cmd != RTM_ADD || ISSET(info->rti_flags, RTF_DONTCHANGEIFA)) return; ifp = rt->rt_ifa->ifa_ifp; dst = rt_getkey(rt); if ((ifa = ifaof_ifpforaddr_psref(dst, ifp, &psref)) != NULL) { rt_replace_ifa(rt, ifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); ifa_release(ifa, &psref); } } /* * bitmask macros to manage a densely packed link_state change queue. * Because we need to store LINK_STATE_UNKNOWN(0), LINK_STATE_DOWN(1) and * LINK_STATE_UP(2) we need 2 bits for each state change. * As a state change to store is 0, treat all bits set as an unset item. */ #define LQ_ITEM_BITS 2 #define LQ_ITEM_MASK ((1 << LQ_ITEM_BITS) - 1) #define LQ_MASK(i) (LQ_ITEM_MASK << (i) * LQ_ITEM_BITS) #define LINK_STATE_UNSET LQ_ITEM_MASK #define LQ_ITEM(q, i) (((q) & LQ_MASK((i))) >> (i) * LQ_ITEM_BITS) #define LQ_STORE(q, i, v) \ do { \ (q) &= ~LQ_MASK((i)); \ (q) |= (v) << (i) * LQ_ITEM_BITS; \ } while (0 /* CONSTCOND */) #define LQ_MAX(q) ((sizeof((q)) * NBBY) / LQ_ITEM_BITS) #define LQ_POP(q, v) \ do { \ (v) = LQ_ITEM((q), 0); \ (q) >>= LQ_ITEM_BITS; \ (q) |= LINK_STATE_UNSET << (LQ_MAX((q)) - 1) * LQ_ITEM_BITS; \ } while (0 /* CONSTCOND */) #define LQ_PUSH(q, v) \ do { \ (q) >>= LQ_ITEM_BITS; \ (q) |= (v) << (LQ_MAX((q)) - 1) * LQ_ITEM_BITS; \ } while (0 /* CONSTCOND */) #define LQ_FIND_UNSET(q, i) \ for ((i) = 0; i < LQ_MAX((q)); (i)++) { \ if (LQ_ITEM((q), (i)) == LINK_STATE_UNSET) \ break; \ } /* * Handle a change in the interface link state and * queue notifications. */ void if_link_state_change(struct ifnet *ifp, int link_state) { int idx; /* Ensure change is to a valid state */ switch (link_state) { case LINK_STATE_UNKNOWN: /* FALLTHROUGH */ case LINK_STATE_DOWN: /* FALLTHROUGH */ case LINK_STATE_UP: break; default: #ifdef DEBUG printf("%s: invalid link state %d\n", ifp->if_xname, link_state); #endif return; } IF_LINK_STATE_CHANGE_LOCK(ifp); /* Find the last unset event in the queue. */ LQ_FIND_UNSET(ifp->if_link_queue, idx); if (idx == 0) { /* * There is no queue of link state changes. 
* As we have the lock we can safely compare against the * current link state and return if the same. * Otherwise, if scheduled is true then the interface is being * detached and the queue is being drained so we need * to avoid queuing more work. */ if (ifp->if_link_state == link_state || ifp->if_link_scheduled) goto out; } else { /* Ensure link_state doesn't match the last queued state. */ if (LQ_ITEM(ifp->if_link_queue, idx - 1) == (uint8_t)link_state) goto out; } /* Handle queue overflow. */ if (idx == LQ_MAX(ifp->if_link_queue)) { uint8_t lost; /* * The DOWN state must be protected from being pushed off * the queue to ensure that userland will always be * in a sane state. * Because DOWN is protected, there is no need to protect * UNKNOWN. * It should be invalid to change from any other state to * UNKNOWN anyway ... */ lost = LQ_ITEM(ifp->if_link_queue, 0); LQ_PUSH(ifp->if_link_queue, (uint8_t)link_state); if (lost == LINK_STATE_DOWN) { lost = LQ_ITEM(ifp->if_link_queue, 0); LQ_STORE(ifp->if_link_queue, 0, LINK_STATE_DOWN); } printf("%s: lost link state change %s\n", ifp->if_xname, lost == LINK_STATE_UP ? "UP" : lost == LINK_STATE_DOWN ? "DOWN" : "UNKNOWN"); } else LQ_STORE(ifp->if_link_queue, idx, (uint8_t)link_state); if (ifp->if_link_scheduled) goto out; ifp->if_link_scheduled = true; workqueue_enqueue(ifnet_link_state_wq, &ifp->if_link_work, NULL); out: IF_LINK_STATE_CHANGE_UNLOCK(ifp); } /* * Handle interface link state change notifications. */ static void if_link_state_change_process(struct ifnet *ifp, int link_state) { struct domain *dp; const int s = splnet(); bool notify; KASSERT(!cpu_intr_p()); IF_LINK_STATE_CHANGE_LOCK(ifp); /* Ensure the change is still valid. */ if (ifp->if_link_state == link_state) { IF_LINK_STATE_CHANGE_UNLOCK(ifp); splx(s); return; } #ifdef DEBUG log(LOG_DEBUG, "%s: link state %s (was %s)\n", ifp->if_xname, link_state == LINK_STATE_UP ? "UP" : link_state == LINK_STATE_DOWN ? "DOWN" : "UNKNOWN", ifp->if_link_state == LINK_STATE_UP ? "UP" : ifp->if_link_state == LINK_STATE_DOWN ? "DOWN" : "UNKNOWN"); #endif /* * When going from UNKNOWN to UP, we need to mark existing * addresses as tentative and restart DAD as we may have * erroneously not found a duplicate. * * This needs to happen before rt_ifmsg to avoid a race where * listeners would have an address and expect it to work right * away. */ notify = (link_state == LINK_STATE_UP && ifp->if_link_state == LINK_STATE_UNKNOWN); ifp->if_link_state = link_state; /* The following routines may sleep so release the spin mutex */ IF_LINK_STATE_CHANGE_UNLOCK(ifp); KERNEL_LOCK_UNLESS_NET_MPSAFE(); if (notify) { DOMAIN_FOREACH(dp) { if (dp->dom_if_link_state_change != NULL) dp->dom_if_link_state_change(ifp, LINK_STATE_DOWN); } } /* Notify that the link state has changed. */ rt_ifmsg(ifp); simplehook_dohooks(ifp->if_linkstate_hooks); DOMAIN_FOREACH(dp) { if (dp->dom_if_link_state_change != NULL) dp->dom_if_link_state_change(ifp, link_state); } KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); splx(s); } /* * Process the interface link state change queue. */ static void if_link_state_change_work(struct work *work, void *arg) { struct ifnet *ifp = container_of(work, struct ifnet, if_link_work); uint8_t state; KERNEL_LOCK_UNLESS_NET_MPSAFE(); const int s = splnet(); /* * Pop a link state change from the queue and process it. * If there is nothing to process then if_detach() has been called. * We keep if_link_scheduled = true so the queue can safely drain * without more work being queued. 
*/ IF_LINK_STATE_CHANGE_LOCK(ifp); LQ_POP(ifp->if_link_queue, state); IF_LINK_STATE_CHANGE_UNLOCK(ifp); if (state == LINK_STATE_UNSET) goto out; if_link_state_change_process(ifp, state); /* If there is a link state change to come, schedule it. */ IF_LINK_STATE_CHANGE_LOCK(ifp); if (LQ_ITEM(ifp->if_link_queue, 0) != LINK_STATE_UNSET) { ifp->if_link_scheduled = true; workqueue_enqueue(ifnet_link_state_wq, &ifp->if_link_work, NULL); } else ifp->if_link_scheduled = false; IF_LINK_STATE_CHANGE_UNLOCK(ifp); out: splx(s); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } void * if_linkstate_change_establish(struct ifnet *ifp, void (*fn)(void *), void *arg) { khook_t *hk; hk = simplehook_establish(ifp->if_linkstate_hooks, fn, arg); return (void *)hk; } void if_linkstate_change_disestablish(struct ifnet *ifp, void *vhook, kmutex_t *lock) { simplehook_disestablish(ifp->if_linkstate_hooks, vhook, lock); } /* * Used to mark addresses on an interface as DETATCHED or TENTATIVE * and thus start Duplicate Address Detection without changing the * real link state. */ void if_domain_link_state_change(struct ifnet *ifp, int link_state) { struct domain *dp; const int s = splnet(); KERNEL_LOCK_UNLESS_NET_MPSAFE(); DOMAIN_FOREACH(dp) { if (dp->dom_if_link_state_change != NULL) dp->dom_if_link_state_change(ifp, link_state); } splx(s); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } /* * Default action when installing a local route on a point-to-point * interface. */ void p2p_rtrequest(int req, struct rtentry *rt, __unused const struct rt_addrinfo *info) { struct ifnet *ifp = rt->rt_ifp; struct ifaddr *ifa, *lo0ifa; int s = pserialize_read_enter(); switch (req) { case RTM_ADD: if ((rt->rt_flags & RTF_LOCAL) == 0) break; rt->rt_ifp = lo0ifp; if (ISSET(info->rti_flags, RTF_DONTCHANGEIFA)) break; IFADDR_READER_FOREACH(ifa, ifp) { if (equal(rt_getkey(rt), ifa->ifa_addr)) break; } if (ifa == NULL) break; /* * Ensure lo0 has an address of the same family. */ IFADDR_READER_FOREACH(lo0ifa, lo0ifp) { if (lo0ifa->ifa_addr->sa_family == ifa->ifa_addr->sa_family) break; } if (lo0ifa == NULL) break; /* * Make sure to set rt->rt_ifa to the interface * address we are using, otherwise we will have trouble * with source address selection. */ if (ifa != rt->rt_ifa) rt_replace_ifa(rt, ifa); break; case RTM_DELETE: default: break; } pserialize_read_exit(s); } static void _if_down(struct ifnet *ifp) { struct ifaddr *ifa; struct domain *dp; struct psref psref; ifp->if_flags &= ~IFF_UP; nanotime(&ifp->if_lastchange); const int bound = curlwp_bind(); int s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { ifa_acquire(ifa, &psref); pserialize_read_exit(s); pfctlinput(PRC_IFDOWN, ifa->ifa_addr); s = pserialize_read_enter(); ifa_release(ifa, &psref); } pserialize_read_exit(s); curlwp_bindx(bound); IFQ_PURGE(&ifp->if_snd); #if NCARP > 0 if (ifp->if_carp) carp_carpdev_state(ifp); #endif rt_ifmsg(ifp); DOMAIN_FOREACH(dp) { if (dp->dom_if_down) dp->dom_if_down(ifp); } } static void if_down_deactivated(struct ifnet *ifp) { KASSERT(if_is_deactivated(ifp)); _if_down(ifp); } void if_down_locked(struct ifnet *ifp) { KASSERT(IFNET_LOCKED(ifp)); _if_down(ifp); } /* * Mark an interface down and notify protocols of * the transition. * NOTE: must be called at splsoftnet or equivalent. */ void if_down(struct ifnet *ifp) { IFNET_LOCK(ifp); if_down_locked(ifp); IFNET_UNLOCK(ifp); } /* * Must be called with holding if_ioctl_lock. 
*/ static void if_up_locked(struct ifnet *ifp) { #ifdef notyet struct ifaddr *ifa; #endif struct domain *dp; KASSERT(IFNET_LOCKED(ifp)); KASSERT(!if_is_deactivated(ifp)); ifp->if_flags |= IFF_UP; nanotime(&ifp->if_lastchange); #ifdef notyet /* this has no effect on IP, and will kill all ISO connections XXX */ IFADDR_READER_FOREACH(ifa, ifp) pfctlinput(PRC_IFUP, ifa->ifa_addr); #endif #if NCARP > 0 if (ifp->if_carp) carp_carpdev_state(ifp); #endif rt_ifmsg(ifp); DOMAIN_FOREACH(dp) { if (dp->dom_if_up) dp->dom_if_up(ifp); } } /* * Handle interface slowtimo timer routine. Called * from softclock, we decrement timer (if set) and * call the appropriate interface routine on expiration. */ static bool if_slowtimo_countdown(struct ifnet *ifp) { bool fire = false; const int s = splnet(); KERNEL_LOCK(1, NULL); if (ifp->if_timer != 0 && --ifp->if_timer == 0) fire = true; KERNEL_UNLOCK_ONE(NULL); splx(s); return fire; } static void if_slowtimo_intr(void *arg) { struct ifnet *ifp = arg; struct if_slowtimo_data *isd = ifp->if_slowtimo_data; mutex_enter(&isd->isd_lock); if (!isd->isd_dying) { if (isd->isd_trigger || if_slowtimo_countdown(ifp)) { if (!isd->isd_queued) { isd->isd_queued = true; workqueue_enqueue(if_slowtimo_wq, &isd->isd_work, NULL); } } else callout_schedule(&isd->isd_ch, hz / IFNET_SLOWHZ); } mutex_exit(&isd->isd_lock); } static void if_slowtimo_work(struct work *work, void *arg) { struct if_slowtimo_data *isd = container_of(work, struct if_slowtimo_data, isd_work); struct ifnet *ifp = isd->isd_ifp; const int s = splnet(); KERNEL_LOCK(1, NULL); (*ifp->if_slowtimo)(ifp); KERNEL_UNLOCK_ONE(NULL); splx(s); mutex_enter(&isd->isd_lock); if (isd->isd_trigger) { isd->isd_trigger = false; printf("%s: watchdog triggered\n", ifp->if_xname); } isd->isd_queued = false; if (!isd->isd_dying) callout_schedule(&isd->isd_ch, hz / IFNET_SLOWHZ); mutex_exit(&isd->isd_lock); } static int sysctl_if_watchdog(SYSCTLFN_ARGS) { struct sysctlnode node = *rnode; struct ifnet *ifp = node.sysctl_data; struct if_slowtimo_data *isd = ifp->if_slowtimo_data; int arg = 0; int error; node.sysctl_data = &arg; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (arg) { mutex_enter(&isd->isd_lock); KASSERT(!isd->isd_dying); isd->isd_trigger = true; callout_schedule(&isd->isd_ch, 0); mutex_exit(&isd->isd_lock); } return 0; } static void sysctl_watchdog_setup(struct ifnet *ifp) { struct sysctllog **clog = &ifp->if_sysctl_log; const struct sysctlnode *rnode; if (sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "interfaces", SYSCTL_DESCR("Per-interface controls"), NULL, 0, NULL, 0, CTL_NET, CTL_CREATE, CTL_EOL) != 0) goto bad; if (sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, ifp->if_xname, SYSCTL_DESCR("Interface controls"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; if (sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "watchdog", SYSCTL_DESCR("Interface watchdog controls"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; if (sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "trigger", SYSCTL_DESCR("Trigger watchdog timeout"), sysctl_if_watchdog, 0, (int *)ifp, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; return; bad: printf("%s: could not attach sysctl watchdog nodes\n", ifp->if_xname); } /* * Mark an interface up and notify protocols of * the transition. * NOTE: must be called at splsoftnet or equivalent. 
*/ void if_up(struct ifnet *ifp) { IFNET_LOCK(ifp); if_up_locked(ifp); IFNET_UNLOCK(ifp); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc_locked(struct ifnet *ifp, int pswitch) { int pcount, ret = 0; u_short nflags; KASSERT(IFNET_LOCKED(ifp)); pcount = ifp->if_pcount; if (pswitch) { /* * Allow the device to be "placed" into promiscuous * mode even if it is not configured up. It will * consult IFF_PROMISC when it is brought up. */ if (ifp->if_pcount++ != 0) goto out; nflags = ifp->if_flags | IFF_PROMISC; } else { if (--ifp->if_pcount > 0) goto out; nflags = ifp->if_flags & ~IFF_PROMISC; } ret = if_flags_set(ifp, nflags); /* Restore interface state if not successful. */ if (ret != 0) ifp->if_pcount = pcount; out: return ret; } int ifpromisc(struct ifnet *ifp, int pswitch) { int e; IFNET_LOCK(ifp); e = ifpromisc_locked(ifp, pswitch); IFNET_UNLOCK(ifp); return e; } /* * if_ioctl(ifp, cmd, data) * * Apply an ioctl command to the interface. Returns 0 on success, * nonzero errno(3) number on failure. * * For SIOCADDMULTI/SIOCDELMULTI, caller need not hold locks -- it * is the driver's responsibility to take any internal locks. * (Kernel logic should generally invoke these only through * if_mcast_op.) * * For all other ioctls, caller must hold ifp->if_ioctl_lock, * a.k.a. IFNET_LOCK. May sleep. */ int if_ioctl(struct ifnet *ifp, u_long cmd, void *data) { switch (cmd) { case SIOCADDMULTI: case SIOCDELMULTI: break; default: KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname); } return (*ifp->if_ioctl)(ifp, cmd, data); } /* * if_init(ifp) * * Prepare the hardware underlying ifp to process packets * according to its current configuration. Returns 0 on success, * nonzero errno(3) number on failure. * * May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a * IFNET_LOCK. */ int if_init(struct ifnet *ifp) { KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname); return (*ifp->if_init)(ifp); } /* * if_stop(ifp, disable) * * Stop the hardware underlying ifp from processing packets. * * If disable is true, ... XXX(?) * * May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a * IFNET_LOCK. */ void if_stop(struct ifnet *ifp, int disable) { KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname); (*ifp->if_stop)(ifp, disable); } /* * Map interface name to * interface structure pointer. */ struct ifnet * ifunit(const char *name) { struct ifnet *ifp; const char *cp = name; u_int unit = 0; u_int i; /* * If the entire name is a number, treat it as an ifindex. */ for (i = 0; i < IFNAMSIZ && *cp >= '0' && *cp <= '9'; i++, cp++) unit = unit * 10 + (*cp - '0'); /* * If the number took all of the name, then it's a valid ifindex. */ if (i == IFNAMSIZ || (cp != name && *cp == '\0')) return if_byindex(unit); ifp = NULL; const int s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { if (if_is_deactivated(ifp)) continue; if (strcmp(ifp->if_xname, name) == 0) goto out; } out: pserialize_read_exit(s); return ifp; } /* * Get a reference of an ifnet object by an interface name. * The returned reference is protected by psref(9). The caller * must release a returned reference by if_put after use. 
*/ struct ifnet * if_get(const char *name, struct psref *psref) { struct ifnet *ifp; const char *cp = name; u_int unit = 0; u_int i; /* * If the entire name is a number, treat it as an ifindex. */ for (i = 0; i < IFNAMSIZ && *cp >= '0' && *cp <= '9'; i++, cp++) unit = unit * 10 + (*cp - '0'); /* * If the number took all of the name, then it's a valid ifindex. */ if (i == IFNAMSIZ || (cp != name && *cp == '\0')) return if_get_byindex(unit, psref); ifp = NULL; const int s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { if (if_is_deactivated(ifp)) continue; if (strcmp(ifp->if_xname, name) == 0) { PSREF_DEBUG_FILL_RETURN_ADDRESS(psref); psref_acquire(psref, &ifp->if_psref, ifnet_psref_class); goto out; } } out: pserialize_read_exit(s); return ifp; } /* * Release a reference of an ifnet object given by if_get, if_get_byindex * or if_get_bylla. */ void if_put(const struct ifnet *ifp, struct psref *psref) { if (ifp == NULL) return; psref_release(psref, &ifp->if_psref, ifnet_psref_class); } /* * Return ifp having idx. Return NULL if not found. Normally if_byindex * should be used. */ ifnet_t * _if_byindex(u_int idx) { return (__predict_true(idx < if_indexlim)) ? ifindex2ifnet[idx] : NULL; } /* * Return ifp having idx. Return NULL if not found or the found ifp is * already deactivated. */ ifnet_t * if_byindex(u_int idx) { ifnet_t *ifp; ifp = _if_byindex(idx); if (ifp != NULL && if_is_deactivated(ifp)) ifp = NULL; return ifp; } /* * Get a reference of an ifnet object by an interface index. * The returned reference is protected by psref(9). The caller * must release a returned reference by if_put after use. */ ifnet_t * if_get_byindex(u_int idx, struct psref *psref) { ifnet_t *ifp; const int s = pserialize_read_enter(); ifp = if_byindex(idx); if (__predict_true(ifp != NULL)) { PSREF_DEBUG_FILL_RETURN_ADDRESS(psref); psref_acquire(psref, &ifp->if_psref, ifnet_psref_class); } pserialize_read_exit(s); return ifp; } ifnet_t * if_get_bylla(const void *lla, unsigned char lla_len, struct psref *psref) { ifnet_t *ifp; const int s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { if (if_is_deactivated(ifp)) continue; if (ifp->if_addrlen != lla_len) continue; if (memcmp(lla, CLLADDR(ifp->if_sadl), lla_len) == 0) { psref_acquire(psref, &ifp->if_psref, ifnet_psref_class); break; } } pserialize_read_exit(s); return ifp; } /* * Note that it's safe only if the passed ifp is guaranteed to not be freed, * for example using pserialize or the ifp is already held or some other * object is held which guarantes the ifp to not be freed indirectly. */ void if_acquire(struct ifnet *ifp, struct psref *psref) { KASSERT(ifp->if_index != 0); psref_acquire(psref, &ifp->if_psref, ifnet_psref_class); } bool if_held(struct ifnet *ifp) { return psref_held(&ifp->if_psref, ifnet_psref_class); } /* * Some tunnel interfaces can nest, e.g. IPv4 over IPv4 gif(4) tunnel over * IPv4. Check the tunnel nesting count. * Return > 0, if tunnel nesting count is more than limit. * Return 0, if tunnel nesting count is equal or less than limit. 
*/ int if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, int limit) { struct m_tag *mtag; int *count; mtag = m_tag_find(m, PACKET_TAG_TUNNEL_INFO); if (mtag != NULL) { count = (int *)(mtag + 1); if (++(*count) > limit) { log(LOG_NOTICE, "%s: recursively called too many times(%d)\n", ifp->if_xname, *count); return EIO; } } else { mtag = m_tag_get(PACKET_TAG_TUNNEL_INFO, sizeof(*count), M_NOWAIT); if (mtag != NULL) { m_tag_prepend(m, mtag); count = (int *)(mtag + 1); *count = 0; } else { log(LOG_DEBUG, "%s: m_tag_get() failed, " "recursion calls are not prevented.\n", ifp->if_xname); } } return 0; } static void if_tunnel_ro_init_pc(void *p, void *arg __unused, struct cpu_info *ci __unused) { struct tunnel_ro *tro = p; tro->tr_ro = kmem_zalloc(sizeof(*tro->tr_ro), KM_SLEEP); tro->tr_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); } static void if_tunnel_ro_fini_pc(void *p, void *arg __unused, struct cpu_info *ci __unused) { struct tunnel_ro *tro = p; rtcache_free(tro->tr_ro); kmem_free(tro->tr_ro, sizeof(*tro->tr_ro)); mutex_obj_free(tro->tr_lock); } percpu_t * if_tunnel_alloc_ro_percpu(void) { return percpu_create(sizeof(struct tunnel_ro), if_tunnel_ro_init_pc, if_tunnel_ro_fini_pc, NULL); } void if_tunnel_free_ro_percpu(percpu_t *ro_percpu) { percpu_free(ro_percpu, sizeof(struct tunnel_ro)); } static void if_tunnel_rtcache_free_pc(void *p, void *arg __unused, struct cpu_info *ci __unused) { struct tunnel_ro *tro = p; mutex_enter(tro->tr_lock); rtcache_free(tro->tr_ro); mutex_exit(tro->tr_lock); } void if_tunnel_ro_percpu_rtcache_free(percpu_t *ro_percpu) { percpu_foreach(ro_percpu, if_tunnel_rtcache_free_pc, NULL); } void if_export_if_data(ifnet_t * const ifp, struct if_data *ifi, bool zero_stats) { /* Collect the volatile stats first; this zeros *ifi. */ if_stats_to_if_data(ifp, ifi, zero_stats); ifi->ifi_type = ifp->if_type; ifi->ifi_addrlen = ifp->if_addrlen; ifi->ifi_hdrlen = ifp->if_hdrlen; ifi->ifi_link_state = ifp->if_link_state; ifi->ifi_mtu = ifp->if_mtu; ifi->ifi_metric = ifp->if_metric; ifi->ifi_baudrate = ifp->if_baudrate; ifi->ifi_lastchange = ifp->if_lastchange; } /* common */ int ifioctl_common(struct ifnet *ifp, u_long cmd, void *data) { struct ifreq *ifr; struct ifcapreq *ifcr; struct ifdatareq *ifdr; unsigned short flags; char *descr; int error; switch (cmd) { case SIOCSIFCAP: ifcr = data; if ((ifcr->ifcr_capenable & ~ifp->if_capabilities) != 0) return EINVAL; if (ifcr->ifcr_capenable == ifp->if_capenable) return 0; ifp->if_capenable = ifcr->ifcr_capenable; /* Pre-compute the checksum flags mask. 
*/ ifp->if_csum_flags_tx = 0; ifp->if_csum_flags_rx = 0; if (ifp->if_capenable & IFCAP_CSUM_IPv4_Tx) ifp->if_csum_flags_tx |= M_CSUM_IPv4; if (ifp->if_capenable & IFCAP_CSUM_IPv4_Rx) ifp->if_csum_flags_rx |= M_CSUM_IPv4; if (ifp->if_capenable & IFCAP_CSUM_TCPv4_Tx) ifp->if_csum_flags_tx |= M_CSUM_TCPv4; if (ifp->if_capenable & IFCAP_CSUM_TCPv4_Rx) ifp->if_csum_flags_rx |= M_CSUM_TCPv4; if (ifp->if_capenable & IFCAP_CSUM_UDPv4_Tx) ifp->if_csum_flags_tx |= M_CSUM_UDPv4; if (ifp->if_capenable & IFCAP_CSUM_UDPv4_Rx) ifp->if_csum_flags_rx |= M_CSUM_UDPv4; if (ifp->if_capenable & IFCAP_CSUM_TCPv6_Tx) ifp->if_csum_flags_tx |= M_CSUM_TCPv6; if (ifp->if_capenable & IFCAP_CSUM_TCPv6_Rx) ifp->if_csum_flags_rx |= M_CSUM_TCPv6; if (ifp->if_capenable & IFCAP_CSUM_UDPv6_Tx) ifp->if_csum_flags_tx |= M_CSUM_UDPv6; if (ifp->if_capenable & IFCAP_CSUM_UDPv6_Rx) ifp->if_csum_flags_rx |= M_CSUM_UDPv6; if (ifp->if_capenable & IFCAP_TSOv4) ifp->if_csum_flags_tx |= M_CSUM_TSOv4; if (ifp->if_capenable & IFCAP_TSOv6) ifp->if_csum_flags_tx |= M_CSUM_TSOv6; #if NBRIDGE > 0 if (ifp->if_bridge != NULL) bridge_calc_csum_flags(ifp->if_bridge); #endif if (ifp->if_flags & IFF_UP) return ENETRESET; return 0; case SIOCSIFFLAGS: ifr = data; /* * If if_is_mpsafe(ifp), KERNEL_LOCK isn't held here, but if_up * and if_down aren't MP-safe yet, so we must hold the lock. */ KERNEL_LOCK_IF_IFP_MPSAFE(ifp); if (ifp->if_flags & IFF_UP && (ifr->ifr_flags & IFF_UP) == 0) { const int s = splsoftnet(); if_down_locked(ifp); splx(s); } if (ifr->ifr_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { const int s = splsoftnet(); if_up_locked(ifp); splx(s); } KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp); flags = (ifp->if_flags & IFF_CANTCHANGE) | (ifr->ifr_flags &~ IFF_CANTCHANGE); if (ifp->if_flags != flags) { ifp->if_flags = flags; /* Notify that the flags have changed. 
*/ rt_ifmsg(ifp); } break; case SIOCGIFFLAGS: ifr = data; ifr->ifr_flags = ifp->if_flags; break; case SIOCGIFMETRIC: ifr = data; ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr = data; ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFDLT: ifr = data; ifr->ifr_dlt = ifp->if_dlt; break; case SIOCGIFCAP: ifcr = data; ifcr->ifcr_capabilities = ifp->if_capabilities; ifcr->ifcr_capenable = ifp->if_capenable; break; case SIOCSIFMETRIC: ifr = data; ifp->if_metric = ifr->ifr_metric; break; case SIOCGIFDATA: ifdr = data; if_export_if_data(ifp, &ifdr->ifdr_data, false); break; case SIOCGIFINDEX: ifr = data; ifr->ifr_index = ifp->if_index; break; case SIOCZIFDATA: ifdr = data; if_export_if_data(ifp, &ifdr->ifdr_data, true); getnanotime(&ifp->if_lastchange); break; case SIOCSIFMTU: ifr = data; if (ifp->if_mtu == ifr->ifr_mtu) break; ifp->if_mtu = ifr->ifr_mtu; return ENETRESET; case SIOCSIFDESCR: error = kauth_authorize_network(kauth_cred_get(), KAUTH_NETWORK_INTERFACE, KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, KAUTH_ARG(cmd), NULL); if (error) return error; ifr = data; if (ifr->ifr_buflen > IFDESCRSIZE) return ENAMETOOLONG; if (ifr->ifr_buf == NULL || ifr->ifr_buflen == 0) { /* unset description */ descr = NULL; } else { descr = kmem_zalloc(IFDESCRSIZE, KM_SLEEP); /* * copy (IFDESCRSIZE - 1) bytes to ensure * terminating nul */ error = copyin(ifr->ifr_buf, descr, IFDESCRSIZE - 1); if (error) { kmem_free(descr, IFDESCRSIZE); return error; } } if (ifp->if_description != NULL) kmem_free(ifp->if_description, IFDESCRSIZE); ifp->if_description = descr; break; case SIOCGIFDESCR: ifr = data; descr = ifp->if_description; if (descr == NULL) return ENOMSG; if (ifr->ifr_buflen < IFDESCRSIZE) return EINVAL; error = copyout(descr, ifr->ifr_buf, IFDESCRSIZE); if (error) return error; break; default: return ENOTTY; } return 0; } int ifaddrpref_ioctl(struct socket *so, u_long cmd, void *data, struct ifnet *ifp) { struct if_addrprefreq *ifap = (struct if_addrprefreq *)data; struct ifaddr *ifa; const struct sockaddr *any, *sa; union { struct sockaddr sa; struct sockaddr_storage ss; } u, v; int s, error = 0; switch (cmd) { case SIOCSIFADDRPREF: error = kauth_authorize_network(kauth_cred_get(), KAUTH_NETWORK_INTERFACE, KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, KAUTH_ARG(cmd), NULL); if (error) return error; break; case SIOCGIFADDRPREF: break; default: return EOPNOTSUPP; } /* sanity checks */ if (data == NULL || ifp == NULL) { panic("invalid argument to %s", __func__); /*NOTREACHED*/ } /* address must be specified on ADD and DELETE */ sa = sstocsa(&ifap->ifap_addr); if (sa->sa_family != sofamily(so)) return EINVAL; if ((any = sockaddr_any(sa)) == NULL || sa->sa_len != any->sa_len) return EINVAL; sockaddr_externalize(&v.sa, sizeof(v.ss), sa); s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != sa->sa_family) continue; sockaddr_externalize(&u.sa, sizeof(u.ss), ifa->ifa_addr); if (sockaddr_cmp(&u.sa, &v.sa) == 0) break; } if (ifa == NULL) { error = EADDRNOTAVAIL; goto out; } switch (cmd) { case SIOCSIFADDRPREF: ifa->ifa_preference = ifap->ifap_preference; goto out; case SIOCGIFADDRPREF: /* fill in the if_laddrreq structure */ (void)sockaddr_copy(sstosa(&ifap->ifap_addr), sizeof(ifap->ifap_addr), ifa->ifa_addr); ifap->ifap_preference = ifa->ifa_preference; goto out; default: error = EOPNOTSUPP; } out: pserialize_read_exit(s); return error; } /* * Interface ioctls. 
*/ static int doifioctl(struct socket *so, u_long cmd, void *data, struct lwp *l) { struct ifnet *ifp; struct ifreq *ifr; int error = 0; u_long ocmd = cmd; u_short oif_flags; struct ifreq ifrb; struct oifreq *oifr = NULL; int r; struct psref psref; bool do_if43_post = false; bool do_ifm80_post = false; switch (cmd) { case SIOCGIFCONF: return ifconf(cmd, data); case SIOCINITIFADDR: return EPERM; default: MODULE_HOOK_CALL(uipc_syscalls_40_hook, (cmd, data), enosys(), error); if (error != ENOSYS) return error; MODULE_HOOK_CALL(uipc_syscalls_50_hook, (l, cmd, data), enosys(), error); if (error != ENOSYS) return error; error = 0; break; } ifr = data; /* Pre-conversion */ MODULE_HOOK_CALL(if_cvtcmd_43_hook, (&cmd, ocmd), enosys(), error); if (cmd != ocmd) { oifr = data; data = ifr = &ifrb; IFREQO2N_43(oifr, ifr); do_if43_post = true; } MODULE_HOOK_CALL(ifmedia_80_pre_hook, (ifr, &cmd, &do_ifm80_post), enosys(), error); switch (cmd) { case SIOCIFCREATE: case SIOCIFDESTROY: { const int bound = curlwp_bind(); if (l != NULL) { ifp = if_get(ifr->ifr_name, &psref); error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_INTERFACE, KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, KAUTH_ARG(cmd), NULL); if (ifp != NULL) if_put(ifp, &psref); if (error != 0) { curlwp_bindx(bound); return error; } } KERNEL_LOCK_UNLESS_NET_MPSAFE(); mutex_enter(&if_clone_mtx); r = (cmd == SIOCIFCREATE) ? if_clone_create(ifr->ifr_name) : if_clone_destroy(ifr->ifr_name); mutex_exit(&if_clone_mtx); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); curlwp_bindx(bound); return r; } case SIOCIFGCLONERS: { struct if_clonereq *req = (struct if_clonereq *)data; return if_clone_list(req->ifcr_count, req->ifcr_buffer, &req->ifcr_total); } } if ((cmd & IOC_IN) == 0 || IOCPARM_LEN(cmd) < sizeof(ifr->ifr_name)) return EINVAL; const int bound = curlwp_bind(); ifp = if_get(ifr->ifr_name, &psref); if (ifp == NULL) { curlwp_bindx(bound); return ENXIO; } switch (cmd) { case SIOCALIFADDR: case SIOCDLIFADDR: case SIOCSIFADDRPREF: case SIOCSIFFLAGS: case SIOCSIFCAP: case SIOCSIFMETRIC: case SIOCZIFDATA: case SIOCSIFMTU: case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSLIFPHYADDR: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCSETHERCAP: case SIOCSIFMEDIA: case SIOCSDRVSPEC: case SIOCG80211: case SIOCS80211: case SIOCS80211NWID: case SIOCS80211NWKEY: case SIOCS80211POWER: case SIOCS80211BSSID: case SIOCS80211CHANNEL: case SIOCSLINKSTR: if (l != NULL) { error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_INTERFACE, KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, KAUTH_ARG(cmd), NULL); if (error != 0) goto out; } } oif_flags = ifp->if_flags; KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp); IFNET_LOCK(ifp); error = if_ioctl(ifp, cmd, data); if (error != ENOTTY) ; else if (so->so_proto == NULL) error = EOPNOTSUPP; else { KERNEL_LOCK_IF_IFP_MPSAFE(ifp); MODULE_HOOK_CALL(if_ifioctl_43_hook, (so, ocmd, cmd, data, l), enosys(), error); if (error == ENOSYS) error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so, cmd, data, ifp); KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp); } if (((oif_flags ^ ifp->if_flags) & IFF_UP) != 0) { if ((ifp->if_flags & IFF_UP) != 0) { const int s = splsoftnet(); if_up_locked(ifp); splx(s); } } /* Post-conversion */ if (do_ifm80_post && (error == 0)) MODULE_HOOK_CALL(ifmedia_80_post_hook, (ifr, cmd), enosys(), error); if (do_if43_post) IFREQN2O_43(oifr, ifr); IFNET_UNLOCK(ifp); KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp); out: if_put(ifp, &psref); curlwp_bindx(bound); return error; } /* * Return interface configuration * of system. 
List may be used * in later ioctl's (above) to get * other information. * * Each record is a struct ifreq. Before the addition of * sockaddr_storage, the API rule was that sockaddr flavors that did * not fit would extend beyond the struct ifreq, with the next struct * ifreq starting sa_len beyond the struct sockaddr. Because the * union in struct ifreq includes struct sockaddr_storage, every kind * of sockaddr must fit. Thus, there are no longer any overlength * records. * * Records are added to the user buffer if they fit, and ifc_len is * adjusted to the length that was written. Thus, the user is only * assured of getting the complete list if ifc_len on return is at * least sizeof(struct ifreq) less than it was on entry. * * If the user buffer pointer is NULL, this routine copies no data and * returns the amount of space that would be needed. * * Invariants: * ifrp points to the next part of the user's buffer to be used. If * ifrp != NULL, space holds the number of bytes remaining that we may * write at ifrp. Otherwise, space holds the number of bytes that * would have been written had there been adequate space. */ /*ARGSUSED*/ static int ifconf(u_long cmd, void *data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr, *ifrp = NULL; int space = 0, error = 0; const int sz = (int)sizeof(struct ifreq); const bool docopy = ifc->ifc_req != NULL; struct psref psref; if (docopy) { if (ifc->ifc_len < 0) return EINVAL; space = ifc->ifc_len; ifrp = ifc->ifc_req; } memset(&ifr, 0, sizeof(ifr)); const int bound = curlwp_bind(); int s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { psref_acquire(&psref, &ifp->if_psref, ifnet_psref_class); pserialize_read_exit(s); (void)strncpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)); if (ifr.ifr_name[sizeof(ifr.ifr_name) - 1] != '\0') { error = ENAMETOOLONG; goto release_exit; } if (IFADDR_READER_EMPTY(ifp)) { /* Interface with no addresses - send zero sockaddr. 
*/ memset(&ifr.ifr_addr, 0, sizeof(ifr.ifr_addr)); if (!docopy) { space += sz; goto next; } if (space >= sz) { error = copyout(&ifr, ifrp, sz); if (error != 0) goto release_exit; ifrp++; space -= sz; } } s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { struct sockaddr *sa = ifa->ifa_addr; /* all sockaddrs must fit in sockaddr_storage */ KASSERT(sa->sa_len <= sizeof(ifr.ifr_ifru)); if (!docopy) { space += sz; continue; } memcpy(&ifr.ifr_space, sa, sa->sa_len); pserialize_read_exit(s); if (space >= sz) { error = copyout(&ifr, ifrp, sz); if (error != 0) goto release_exit; ifrp++; space -= sz; } s = pserialize_read_enter(); } pserialize_read_exit(s); next: s = pserialize_read_enter(); psref_release(&psref, &ifp->if_psref, ifnet_psref_class); } pserialize_read_exit(s); curlwp_bindx(bound); if (docopy) { KASSERT(0 <= space && space <= ifc->ifc_len); ifc->ifc_len -= space; } else { KASSERT(space >= 0); ifc->ifc_len = space; } return 0; release_exit: psref_release(&psref, &ifp->if_psref, ifnet_psref_class); curlwp_bindx(bound); return error; } int ifreq_setaddr(u_long cmd, struct ifreq *ifr, const struct sockaddr *sa) { uint8_t len = sizeof(ifr->ifr_ifru.ifru_space); struct ifreq ifrb; struct oifreq *oifr = NULL; u_long ocmd = cmd; int hook; MODULE_HOOK_CALL(if_cvtcmd_43_hook, (&cmd, ocmd), enosys(), hook); if (hook != ENOSYS) { if (cmd != ocmd) { oifr = (struct oifreq *)(void *)ifr; ifr = &ifrb; IFREQO2N_43(oifr, ifr); len = sizeof(oifr->ifr_addr); } } if (len < sa->sa_len) return EFBIG; memset(&ifr->ifr_addr, 0, len); sockaddr_copy(&ifr->ifr_addr, len, sa); if (cmd != ocmd) IFREQN2O_43(oifr, ifr); return 0; } /* * wrapper function for the drivers which doesn't have if_transmit(). */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; size_t pktlen = m->m_pkthdr.len; bool mcast = (m->m_flags & M_MCAST) != 0; const int s = splnet(); IFQ_ENQUEUE(&ifp->if_snd, m, error); if (error != 0) { /* mbuf is already freed */ goto out; } net_stat_ref_t nsr = IF_STAT_GETREF(ifp); if_statadd_ref(nsr, if_obytes, pktlen); if (mcast) if_statinc_ref(nsr, if_omcasts); IF_STAT_PUTREF(ifp); if ((ifp->if_flags & IFF_OACTIVE) == 0) if_start_lock(ifp); out: splx(s); return error; } int if_transmit_lock(struct ifnet *ifp, struct mbuf *m) { int error; kmsan_check_mbuf(m); #ifdef ALTQ KERNEL_LOCK(1, NULL); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { error = if_transmit(ifp, m); KERNEL_UNLOCK_ONE(NULL); } else { KERNEL_UNLOCK_ONE(NULL); error = (*ifp->if_transmit)(ifp, m); /* mbuf is already freed */ } #else /* !ALTQ */ error = (*ifp->if_transmit)(ifp, m); /* mbuf is already freed */ #endif /* !ALTQ */ return error; } /* * Queue message on interface, and start output if interface * not yet active. 
*/ int ifq_enqueue(struct ifnet *ifp, struct mbuf *m) { return if_transmit_lock(ifp, m); } /* * Queue message on interface, possibly using a second fast queue */ int ifq_enqueue2(struct ifnet *ifp, struct ifqueue *ifq, struct mbuf *m) { int error = 0; if (ifq != NULL #ifdef ALTQ && ALTQ_IS_ENABLED(&ifp->if_snd) == 0 #endif ) { if (IF_QFULL(ifq)) { IF_DROP(&ifp->if_snd); m_freem(m); if (error == 0) error = ENOBUFS; } else IF_ENQUEUE(ifq, m); } else IFQ_ENQUEUE(&ifp->if_snd, m, error); if (error != 0) { if_statinc(ifp, if_oerrors); return error; } return 0; } int if_addr_init(ifnet_t *ifp, struct ifaddr *ifa, const bool src) { int rc; KASSERT(IFNET_LOCKED(ifp)); if (ifp->if_initaddr != NULL) rc = (*ifp->if_initaddr)(ifp, ifa, src); else if (src || (rc = if_ioctl(ifp, SIOCSIFDSTADDR, ifa)) == ENOTTY) rc = if_ioctl(ifp, SIOCINITIFADDR, ifa); return rc; } int if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return 0; switch (ifp->if_type) { case IFT_FAITH: /* * These interfaces do not have the IFF_LOOPBACK flag, * but loop packets back. We do not have to do DAD on such * interfaces. We should even omit it, because loop-backed * responses would confuse the DAD procedure. */ return 0; default: /* * Our DAD routine requires the interface up and running. * However, some interfaces can be up before the RUNNING * status. Additionally, users may try to assign addresses * before the interface becomes up (or running). * We simply skip DAD in such a case as a work around. * XXX: we should rather mark "tentative" on such addresses, * and do DAD after the interface becomes ready. */ if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) != (IFF_UP | IFF_RUNNING)) return 0; return 1; } } /* * if_flags_set(ifp, flags) * * Ask ifp to change ifp->if_flags to flags, as if with the * SIOCSIFFLAGS ioctl command. * * May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a * IFNET_LOCK. */ int if_flags_set(ifnet_t *ifp, const u_short flags) { int rc; KASSERT(IFNET_LOCKED(ifp)); if (ifp->if_setflags != NULL) rc = (*ifp->if_setflags)(ifp, flags); else { u_short cantflags, chgdflags; struct ifreq ifr; chgdflags = ifp->if_flags ^ flags; cantflags = chgdflags & IFF_CANTCHANGE; if (cantflags != 0) ifp->if_flags ^= cantflags; /* * Traditionally, we do not call if_ioctl after * setting/clearing only IFF_PROMISC if the interface * isn't IFF_UP. Uphold that tradition. */ if (chgdflags == IFF_PROMISC && (ifp->if_flags & IFF_UP) == 0) return 0; memset(&ifr, 0, sizeof(ifr)); ifr.ifr_flags = flags & ~IFF_CANTCHANGE; rc = if_ioctl(ifp, SIOCSIFFLAGS, &ifr); if (rc != 0 && cantflags != 0) ifp->if_flags ^= cantflags; } return rc; } /* * if_mcast_op(ifp, cmd, sa) * * Apply a multicast command, SIOCADDMULTI/SIOCDELMULTI, to the * interface. Returns 0 on success, nonzero errno(3) number on * failure. * * May sleep. * * Use this, not if_ioctl, for the multicast commands. 
*/ int if_mcast_op(ifnet_t *ifp, const unsigned long cmd, const struct sockaddr *sa) { int rc; struct ifreq ifr; switch (cmd) { case SIOCADDMULTI: case SIOCDELMULTI: break; default: panic("invalid ifnet multicast command: 0x%lx", cmd); } ifreq_setaddr(cmd, &ifr, sa); rc = if_ioctl(ifp, cmd, &ifr); return rc; } static void sysctl_sndq_setup(struct sysctllog **clog, const char *ifname, struct ifaltq *ifq) { const struct sysctlnode *cnode, *rnode; if (sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "interfaces", SYSCTL_DESCR("Per-interface controls"), NULL, 0, NULL, 0, CTL_NET, CTL_CREATE, CTL_EOL) != 0) goto bad; if (sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, ifname, SYSCTL_DESCR("Interface controls"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; if (sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "sndq", SYSCTL_DESCR("Interface output queue controls"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; if (sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT, CTLTYPE_INT, "len", SYSCTL_DESCR("Current output queue length"), NULL, 0, &ifq->ifq_len, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; if (sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "maxlen", SYSCTL_DESCR("Maximum allowed output queue length"), NULL, 0, &ifq->ifq_maxlen, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; if (sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "drops", SYSCTL_DESCR("Packets dropped due to full output queue"), NULL, 0, &ifq->ifq_drops, 0, CTL_CREATE, CTL_EOL) != 0) goto bad; return; bad: printf("%s: could not attach sysctl nodes\n", ifname); return; } static int if_sdl_sysctl(SYSCTLFN_ARGS) { struct ifnet *ifp; const struct sockaddr_dl *sdl; struct psref psref; int error = 0; if (namelen != 1) return EINVAL; const int bound = curlwp_bind(); ifp = if_get_byindex(name[0], &psref); if (ifp == NULL) { error = ENODEV; goto out0; } sdl = ifp->if_sadl; if (sdl == NULL) { *oldlenp = 0; goto out1; } if (oldp == NULL) { *oldlenp = sdl->sdl_alen; goto out1; } if (*oldlenp >= sdl->sdl_alen) *oldlenp = sdl->sdl_alen; error = sysctl_copyout(l, &sdl->sdl_data[sdl->sdl_nlen], oldp, *oldlenp); out1: if_put(ifp, &psref); out0: curlwp_bindx(bound); return error; } static void if_sysctl_setup(struct sysctllog **clog) { const struct sysctlnode *rnode = NULL; sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "sdl", SYSCTL_DESCR("Get active link-layer address"), if_sdl_sysctl, 0, NULL, 0, CTL_NET, CTL_CREATE, CTL_EOL); }
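/*
 * Illustrative sketch, not part of the original if.c: the psref(9)
 * lookup discipline that the comments above if_get()/if_put() describe
 * and that doifioctl() follows.  The helper name example_get_mtu() is
 * hypothetical; everything else uses interfaces defined in this file.
 */
#ifdef notyet	/* example only */
static int
example_get_mtu(const char *name, uint64_t *mtup)
{
	struct ifnet *ifp;
	struct psref psref;
	int error = 0;

	/* Bind the LWP while the psref is held. */
	const int bound = curlwp_bind();
	ifp = if_get(name, &psref);
	if (ifp == NULL) {
		error = ENXIO;
	} else {
		*mtup = ifp->if_mtu;
		/* Always release the reference taken by if_get(). */
		if_put(ifp, &psref);
	}
	curlwp_bindx(bound);
	return error;
}
#endif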
/* $NetBSD: cpu.h,v 1.72 2023/09/04 20:58:52 mrg Exp $ */ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)cpu.h 5.4 (Berkeley) 5/9/91 */ #ifndef _AMD64_CPU_H_ #define _AMD64_CPU_H_ #ifdef __x86_64__ #include <x86/cpu.h> #ifdef _KERNEL #if defined(__GNUC__) && !defined(_MODULE) static struct cpu_info *x86_curcpu(void); static lwp_t *x86_curlwp(void); /* * XXXGCC12 has: * ./machine/cpu.h:57:9: error: array subscript 0 is outside array bounds of 'struct cpu_info * const[0]' [-Werror=array-bounds] * 56 | __asm("movq %%gs:%1, %0" : */ #pragma GCC push_options #pragma GCC diagnostic ignored "-Warray-bounds" __inline __always_inline static struct cpu_info * __unused __nomsan x86_curcpu(void) { struct cpu_info *ci; __asm("movq %%gs:%1, %0" : "=r" (ci) : "m" (*(struct cpu_info * const *)offsetof(struct cpu_info, ci_self))); return ci; } __inline static lwp_t * __unused __nomsan __attribute__ ((const)) x86_curlwp(void) { lwp_t *l; __asm("movq %%gs:%1, %0" : "=r" (l) : "m" (*(struct cpu_info * const *)offsetof(struct cpu_info, ci_curlwp))); return l; } #pragma GCC pop_options #endif /* __GNUC__ && !_MODULE */ #ifdef XENPV #define CLKF_USERMODE(frame) (curcpu()->ci_xen_clockf_usermode) #define CLKF_PC(frame) (curcpu()->ci_xen_clockf_pc) #else /* XENPV */ #define CLKF_USERMODE(frame) USERMODE((frame)->cf_if.if_tf.tf_cs) #define CLKF_PC(frame) ((frame)->cf_if.if_tf.tf_rip) #endif /* XENPV */ #define CLKF_INTR(frame) (curcpu()->ci_idepth > 0) #define LWP_PC(l) ((l)->l_md.md_regs->tf_rip) void *cpu_uarea_alloc(bool); bool cpu_uarea_free(void *); #endif /* _KERNEL */ #else /* __x86_64__ */ #include <i386/cpu.h> #endif /* __x86_64__ */ #endif /* !_AMD64_CPU_H_ */
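/*
 * Illustrative note, not part of the original cpu.h: x86_curcpu() works
 * because %gs is based at the current CPU's struct cpu_info, and that
 * structure stores a pointer to itself in ci_self, so a single
 * "movq %gs:offsetof(struct cpu_info, ci_self), %reg" load yields the
 * curcpu() pointer without a function call.  x86_curlwp() loads
 * ci_curlwp through the same %gs-relative mechanism.
 */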
/* $NetBSD: raw_ip.c,v 1.184 2022/11/04 09:00:58 ozaki-r Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 */ /* * Raw interface to IP protocol. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: raw_ip.c,v 1.184 2022/11/04 09:00:58 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_mrouting.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/sysctl.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/protosw.h> #include <sys/socketvar.h> #include <sys/errno.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/kauth.h> #include <net/if.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/ip_private.h> #include <netinet/ip_mroute.h> #include <netinet/ip_icmp.h> #include <netinet/in_pcb.h> #include <netinet/in_proto.h> #include <netinet/in_var.h> #ifdef IPSEC #include <netipsec/ipsec.h> #endif struct inpcbtable rawcbtable; int rip_pcbnotify(struct inpcbtable *, struct in_addr, struct in_addr, int, int, void (*)(struct inpcb *, int)); static int rip_connect_pcb(struct inpcb *, struct sockaddr_in *); static void rip_disconnect1(struct inpcb *); static void sysctl_net_inet_raw_setup(struct sysctllog **); /* * Nominal space allocated to a raw ip socket. */ #define RIPSNDQ 8192 #define RIPRCVQ 8192 static u_long rip_sendspace = RIPSNDQ; static u_long rip_recvspace = RIPRCVQ; /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. */ void rip_init(void) { sysctl_net_inet_raw_setup(NULL); inpcb_init(&rawcbtable, 1, 1); } static void rip_sbappendaddr(struct inpcb *last, struct ip *ip, const struct sockaddr *sa, int hlen, struct mbuf *n) { struct mbuf *opts = NULL; if (last->inp_flags & INP_NOHEADER) m_adj(n, hlen); if (last->inp_flags & INP_CONTROLOPTS || SOOPT_TIMESTAMP(last->inp_socket->so_options)) ip_savecontrol(last, &opts, ip, n); if (sbappendaddr(&last->inp_socket->so_rcv, sa, n, opts) == 0) { soroverflow(last->inp_socket); m_freem(n); if (opts) m_freem(opts); } else { sorwakeup(last->inp_socket); } } /* * Setup generic address and protocol structures * for raw_input routine, then pass them along with * mbuf chain. */ void rip_input(struct mbuf *m, int off, int proto) { struct ip *ip = mtod(m, struct ip *); struct inpcb *inp; struct inpcb *last = NULL; struct mbuf *n; struct sockaddr_in ripsrc; int hlen; sockaddr_in_init(&ripsrc, &ip->ip_src, 0); /* * XXX Compatibility: programs using raw IP expect ip_len * XXX to have the header length subtracted, and in host order. * XXX ip_off is also expected to be host order. 
*/ hlen = ip->ip_hl << 2; ip->ip_len = ntohs(ip->ip_len) - hlen; NTOHS(ip->ip_off); TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) { if (inp->inp_af != AF_INET) continue; if (in4p_ip(inp).ip_p && in4p_ip(inp).ip_p != proto) continue; if (!in_nullhost(in4p_laddr(inp)) && !in_hosteq(in4p_laddr(inp), ip->ip_dst)) continue; if (!in_nullhost(in4p_faddr(inp)) && !in_hosteq(in4p_faddr(inp), ip->ip_src)) continue; if (last == NULL) { ; } #if defined(IPSEC) else if (ipsec_used && ipsec_in_reject(m, last)) { /* do not inject data into pcb */ } #endif else if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) { rip_sbappendaddr(last, ip, sintosa(&ripsrc), hlen, n); } last = inp; } #if defined(IPSEC) if (ipsec_used && last != NULL && ipsec_in_reject(m, last)) { m_freem(m); IP_STATDEC(IP_STAT_DELIVERED); /* do not inject data into pcb */ } else #endif if (last != NULL) { rip_sbappendaddr(last, ip, sintosa(&ripsrc), hlen, m); } else if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) { uint64_t *ips; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0); ips = IP_STAT_GETREF(); ips[IP_STAT_NOPROTO]++; ips[IP_STAT_DELIVERED]--; IP_STAT_PUTREF(); } else { m_freem(m); } return; } int rip_pcbnotify(struct inpcbtable *table, struct in_addr faddr, struct in_addr laddr, int proto, int errno, void (*notify)(struct inpcb *, int)) { struct inpcb *inp; int nmatch; nmatch = 0; TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { if (inp->inp_af != AF_INET) continue; if (in4p_ip(inp).ip_p && in4p_ip(inp).ip_p != proto) continue; if (in_hosteq(in4p_faddr(inp), faddr) && in_hosteq(in4p_laddr(inp), laddr)) { (*notify)(inp, errno); nmatch++; } } return nmatch; } void * rip_ctlinput(int cmd, const struct sockaddr *sa, void *v) { struct ip *ip = v; void (*notify)(struct inpcb *, int) = inpcb_rtchange; int errno; if (sa->sa_family != AF_INET || sa->sa_len != sizeof(struct sockaddr_in)) return NULL; if ((unsigned)cmd >= PRC_NCMDS) return NULL; errno = inetctlerrmap[cmd]; if (PRC_IS_REDIRECT(cmd)) notify = inpcb_rtchange, ip = 0; else if (cmd == PRC_HOSTDEAD) ip = 0; else if (errno == 0) return NULL; if (ip) { rip_pcbnotify(&rawcbtable, satocsin(sa)->sin_addr, ip->ip_src, ip->ip_p, errno, notify); /* XXX mapped address case */ } else inpcb_notifyall(&rawcbtable, satocsin(sa)->sin_addr, errno, notify); return NULL; } /* * Generate IP header and pass packet to ip_output. * Tack on options user may have setup with control call. */ int rip_output(struct mbuf *m, struct inpcb *inp, struct mbuf *control, struct lwp *l) { struct ip *ip; struct mbuf *opts; struct ip_pktopts pktopts; kauth_cred_t cred; int error, flags; flags = (inp->inp_socket->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST | IP_RETURNMTU; if (l == NULL) cred = NULL; else cred = l->l_cred; /* Setup IP outgoing packet options */ memset(&pktopts, 0, sizeof(pktopts)); error = ip_setpktopts(control, &pktopts, &flags, inp, cred); if (control != NULL) m_freem(control); if (error != 0) goto release; /* * If the user handed us a complete IP packet, use it. * Otherwise, allocate an mbuf for a header and fill it in. 
*/ if ((inp->inp_flags & INP_HDRINCL) == 0) { if ((m->m_pkthdr.len + sizeof(struct ip)) > IP_MAXPACKET) { error = EMSGSIZE; goto release; } M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (!m) { error = ENOBUFS; goto release; } ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_off = htons(0); ip->ip_p = in4p_ip(inp).ip_p; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_src = pktopts.ippo_laddr.sin_addr; ip->ip_dst = in4p_faddr(inp); ip->ip_ttl = MAXTTL; opts = inp->inp_options; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { error = EMSGSIZE; goto release; } if (m->m_pkthdr.len < sizeof(struct ip)) { error = EINVAL; goto release; } ip = mtod(m, struct ip *); /* * If the mbuf is read-only, we need to allocate * a new mbuf for the header, since we need to * modify the header. */ if (M_READONLY(m)) { int hlen = ip->ip_hl << 2; m = m_copyup(m, hlen, (max_linkhdr + 3) & ~3); if (m == NULL) { error = ENOMEM; goto release; } ip = mtod(m, struct ip *); } /* XXX userland passes ip_len and ip_off in host order */ if (m->m_pkthdr.len != ip->ip_len) { error = EINVAL; goto release; } HTONS(ip->ip_len); HTONS(ip->ip_off); if (ip->ip_id != 0 || m->m_pkthdr.len < IP_MINFRAGSIZE) flags |= IP_NOIPNEWID; opts = NULL; /* Prevent ip_output from overwriting header fields. */ flags |= IP_RAWOUTPUT; IP_STATINC(IP_STAT_RAWOUT); } /* * IP output. Note: if IP_RETURNMTU flag is set, the MTU size * will be stored in inp_errormtu. */ return ip_output(m, opts, &inp->inp_route, flags, pktopts.ippo_imo, inp); release: if (m != NULL) m_freem(m); return error; } /* * Raw IP socket option processing. */ int rip_ctloutput(int op, struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error = 0; int optval; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) { if (op == PRCO_GETOPT) { optval = (inp->inp_flags & INP_NOHEADER) ? 
1 : 0; error = sockopt_set(sopt, &optval, sizeof(optval)); } else if (op == PRCO_SETOPT) { error = sockopt_getint(sopt, &optval); if (error) goto out; if (optval) { inp->inp_flags &= ~INP_HDRINCL; inp->inp_flags |= INP_NOHEADER; } else inp->inp_flags &= ~INP_NOHEADER; } goto out; } else if (sopt->sopt_level != IPPROTO_IP) return ip_ctloutput(op, so, sopt); switch (op) { case PRCO_SETOPT: switch (sopt->sopt_name) { case IP_HDRINCL: error = sockopt_getint(sopt, &optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; #ifdef MROUTING case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_ASSERT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = ip_mrouter_set(so, sopt); break; #endif default: error = ip_ctloutput(op, so, sopt); break; } break; case PRCO_GETOPT: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sockopt_set(sopt, &optval, sizeof(optval)); break; #ifdef MROUTING case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: error = ip_mrouter_get(so, sopt); break; #endif default: error = ip_ctloutput(op, so, sopt); break; } break; } out: return error; } int rip_connect_pcb(struct inpcb *inp, struct sockaddr_in *addr) { if (IFNET_READER_EMPTY()) return (EADDRNOTAVAIL); if (addr->sin_family != AF_INET) return (EAFNOSUPPORT); if (addr->sin_len != sizeof(*addr)) return EINVAL; in4p_faddr(inp) = addr->sin_addr; return (0); } static void rip_disconnect1(struct inpcb *inp) { in4p_faddr(inp) = zeroin_addr; } static int rip_attach(struct socket *so, int proto) { struct inpcb *inp; int error; KASSERT(sotoinpcb(so) == NULL); sosetlock(so); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, rip_sendspace, rip_recvspace); if (error) { return error; } } error = inpcb_create(so, &rawcbtable); if (error) { return error; } inp = sotoinpcb(so); in4p_ip(inp).ip_p = proto; KASSERT(solocked(so)); return 0; } static void rip_detach(struct socket *so) { struct inpcb *inp; KASSERT(solocked(so)); inp = sotoinpcb(so); KASSERT(inp != NULL); #ifdef MROUTING extern struct socket *ip_mrouter; if (so == ip_mrouter) { ip_mrouter_done(); } #endif inpcb_destroy(inp); } static int rip_accept(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); panic("rip_accept"); return EOPNOTSUPP; } static int rip_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in *addr = (struct sockaddr_in *)nam; int error = 0; int s, ss; struct ifaddr *ifa; KASSERT(solocked(so)); KASSERT(inp != NULL); KASSERT(nam != NULL); if (addr->sin_len != sizeof(*addr)) return EINVAL; s = splsoftnet(); if (IFNET_READER_EMPTY()) { error = EADDRNOTAVAIL; goto release; } if (addr->sin_family != AF_INET) { error = EAFNOSUPPORT; goto release; } ss = pserialize_read_enter(); if ((ifa = ifa_ifwithaddr(sintosa(addr))) == NULL && (inp->inp_flags & INP_BINDANY) == 0 && !in_nullhost(addr->sin_addr)) { pserialize_read_exit(ss); error = EADDRNOTAVAIL; goto release; } if (ifa && (ifatoia(ifa))->ia4_flags & IN6_IFF_DUPLICATED) { pserialize_read_exit(ss); error = EADDRNOTAVAIL; goto release; } pserialize_read_exit(ss); in4p_laddr(inp) = addr->sin_addr; release: splx(s); return error; } static int rip_listen(struct socket *so, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int rip_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { 
struct inpcb *inp = sotoinpcb(so); int error = 0; int s; KASSERT(solocked(so)); KASSERT(inp != NULL); KASSERT(nam != NULL); s = splsoftnet(); error = rip_connect_pcb(inp, (struct sockaddr_in *)nam); if (! error) soisconnected(so); splx(s); return error; } static int rip_connect2(struct socket *so, struct socket *so2) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int rip_disconnect(struct socket *so) { struct inpcb *inp = sotoinpcb(so); int s; KASSERT(solocked(so)); KASSERT(inp != NULL); s = splsoftnet(); soisdisconnected(so); rip_disconnect1(inp); splx(s); return 0; } static int rip_shutdown(struct socket *so) { int s; KASSERT(solocked(so)); /* * Mark the connection as being incapable of further input. */ s = splsoftnet(); socantsendmore(so); splx(s); return 0; } static int rip_abort(struct socket *so) { KASSERT(solocked(so)); panic("rip_abort"); return EOPNOTSUPP; } static int rip_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp) { return in_control(so, cmd, nam, ifp); } static int rip_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); /* stat: don't bother with a blocksize. */ return 0; } static int rip_peeraddr(struct socket *so, struct sockaddr *nam) { int s; KASSERT(solocked(so)); KASSERT(sotoinpcb(so) != NULL); KASSERT(nam != NULL); s = splsoftnet(); inpcb_fetch_peeraddr(sotoinpcb(so), (struct sockaddr_in *)nam); splx(s); return 0; } static int rip_sockaddr(struct socket *so, struct sockaddr *nam) { int s; KASSERT(solocked(so)); KASSERT(sotoinpcb(so) != NULL); KASSERT(nam != NULL); s = splsoftnet(); inpcb_fetch_sockaddr(sotoinpcb(so), (struct sockaddr_in *)nam); splx(s); return 0; } static int rip_rcvd(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int rip_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int rip_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct inpcb *inp = sotoinpcb(so); int error = 0; int s; KASSERT(solocked(so)); KASSERT(inp != NULL); KASSERT(m != NULL); /* * Ship a packet out. The appropriate raw output * routine handles any massaging necessary. 
*/ s = splsoftnet(); if (nam) { if ((so->so_state & SS_ISCONNECTED) != 0) { error = EISCONN; goto die; } error = rip_connect_pcb(inp, (struct sockaddr_in *)nam); if (error) goto die; } else { if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto die; } } error = rip_output(m, inp, control, l); m = NULL; control = NULL; if (nam) rip_disconnect1(inp); die: if (m != NULL) m_freem(m); if (control != NULL) m_freem(control); splx(s); return error; } static int rip_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int rip_purgeif(struct socket *so, struct ifnet *ifp) { int s; s = splsoftnet(); mutex_enter(softnet_lock); inpcb_purgeif0(&rawcbtable, ifp); #ifdef NET_MPSAFE mutex_exit(softnet_lock); #endif in_purgeif(ifp); #ifdef NET_MPSAFE mutex_enter(softnet_lock); #endif inpcb_purgeif(&rawcbtable, ifp); mutex_exit(softnet_lock); splx(s); return 0; } PR_WRAP_USRREQS(rip) #define rip_attach rip_attach_wrapper #define rip_detach rip_detach_wrapper #define rip_accept rip_accept_wrapper #define rip_bind rip_bind_wrapper #define rip_listen rip_listen_wrapper #define rip_connect rip_connect_wrapper #define rip_connect2 rip_connect2_wrapper #define rip_disconnect rip_disconnect_wrapper #define rip_shutdown rip_shutdown_wrapper #define rip_abort rip_abort_wrapper #define rip_ioctl rip_ioctl_wrapper #define rip_stat rip_stat_wrapper #define rip_peeraddr rip_peeraddr_wrapper #define rip_sockaddr rip_sockaddr_wrapper #define rip_rcvd rip_rcvd_wrapper #define rip_recvoob rip_recvoob_wrapper #define rip_send rip_send_wrapper #define rip_sendoob rip_sendoob_wrapper #define rip_purgeif rip_purgeif_wrapper const struct pr_usrreqs rip_usrreqs = { .pr_attach = rip_attach, .pr_detach = rip_detach, .pr_accept = rip_accept, .pr_bind = rip_bind, .pr_listen = rip_listen, .pr_connect = rip_connect, .pr_connect2 = rip_connect2, .pr_disconnect = rip_disconnect, .pr_shutdown = rip_shutdown, .pr_abort = rip_abort, .pr_ioctl = rip_ioctl, .pr_stat = rip_stat, .pr_peeraddr = rip_peeraddr, .pr_sockaddr = rip_sockaddr, .pr_rcvd = rip_rcvd, .pr_recvoob = rip_recvoob, .pr_send = rip_send, .pr_sendoob = rip_sendoob, .pr_purgeif = rip_purgeif, }; static void sysctl_net_inet_raw_setup(struct sysctllog **clog) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "raw", SYSCTL_DESCR("Raw IPv4 settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_RAW, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "pcblist", SYSCTL_DESCR("Raw IPv4 control block list"), sysctl_inpcblist, 0, &rawcbtable, 0, CTL_NET, PF_INET, IPPROTO_RAW, CTL_CREATE, CTL_EOL); }
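/*
 * Editor's illustrative sketch, not part of raw_ip.c: how the INP_HDRINCL
 * path above is typically exercised from userland.  A sufficiently
 * privileged process opens a raw IPv4 socket, enables IP_HDRINCL, and
 * supplies its own header; as the XXX comment in rip_output() notes,
 * ip_len and ip_off are given in host byte order on this path, and a zero
 * ip_id lets the kernel assign one.  Standard sockets API only; names and
 * error handling below are illustrative assumptions.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <string.h>
#include <unistd.h>

static int
send_raw_example(struct in_addr src, struct in_addr dst)
{
	int s, on = 1;
	char pkt[sizeof(struct ip)];
	struct ip *ip = (struct ip *)pkt;
	struct sockaddr_in sin;

	s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
	if (s == -1)
		return -1;
	/* Ask the kernel to honor our header (sets INP_HDRINCL above). */
	if (setsockopt(s, IPPROTO_IP, IP_HDRINCL, &on, sizeof(on)) == -1) {
		close(s);
		return -1;
	}
	memset(pkt, 0, sizeof(pkt));
	ip->ip_v = IPVERSION;
	ip->ip_hl = sizeof(struct ip) >> 2;
	ip->ip_len = sizeof(pkt);	/* host order, matching rip_output() */
	ip->ip_off = 0;			/* host order as well */
	ip->ip_ttl = 64;
	ip->ip_p = IPPROTO_RAW;
	ip->ip_src = src;
	ip->ip_dst = dst;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(sin);
	sin.sin_addr = dst;
	if (sendto(s, pkt, sizeof(pkt), 0,
	    (struct sockaddr *)&sin, sizeof(sin)) == -1) {
		close(s);
		return -1;
	}
	close(s);
	return 0;
}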
/* $NetBSD: rtsock.c,v 1.256 2022/08/27 08:36:41 skrll Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: rtsock.c,v 1.256 2022/08/27 08:36:41 skrll Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/sysctl.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/intr.h> #include <sys/condvar.h> #include <sys/compat_stub.h> #include <net/if.h> #include <net/if_llatbl.h> #include <net/if_types.h> #include <net/route.h> #include <net/raw_cb.h> #include <netinet/in_var.h> #include <netinet/if_inarp.h> #include <netmpls/mpls.h> #include <compat/net/if.h> #include <compat/net/route.h> #ifdef COMPAT_RTSOCK #undef COMPAT_RTSOCK #endif static int if_addrflags(struct ifaddr *); #include <net/rtsock_shared.c> /* * XXX avoid using void * once msghdr compat disappears. 
*/ void rt_setmetrics(void *in, struct rtentry *out) { const struct rt_xmsghdr *rtm = in; _rt_setmetrics(rtm->rtm_inits, rtm, out); } int rt_msg3(int type, struct rt_addrinfo *rtinfo, void *cpv, struct rt_walkarg *w, int *lenp) { return rt_msg2(type, rtinfo, cpv, w, lenp); } static int if_addrflags(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: return ifatoia(ifa)->ia4_flags; #endif #ifdef INET6 case AF_INET6: return ifatoia6(ifa)->ia6_flags; #endif default: return 0; } } /* * Send a routing message as mimicing that a cloned route is added. */ void rt_clonedmsg(int type, const struct sockaddr *src, const struct sockaddr *dst, const uint8_t *lladdr, const struct ifnet *ifp) { struct rt_addrinfo info; /* Mimic flags exactly */ #define RTF_LLINFO 0x400 #define RTF_CLONED 0x2000 int flags = RTF_DONE; union { struct sockaddr sa; struct sockaddr_storage ss; struct sockaddr_dl sdl; } u; if (type != RTM_MISS) flags |= RTF_HOST | RTF_CLONED | RTF_LLINFO; if (type == RTM_ADD || type == RTM_CHANGE) flags |= RTF_UP; memset(&info, 0, sizeof(info)); info.rti_info[RTAX_AUTHOR] = src; info.rti_info[RTAX_DST] = dst; sockaddr_dl_init(&u.sdl, sizeof(u.ss), ifp->if_index, ifp->if_type, NULL, 0, lladdr, ifp->if_addrlen); info.rti_info[RTAX_GATEWAY] = &u.sa; rt_missmsg(type, &info, flags, 0); #undef RTF_LLINFO #undef RTF_CLONED } /* * The remaining code implements the routing-table sysctl node. It is * compiled only for the non-COMPAT case. */ /* * This is used in dumping the kernel table via sysctl(). */ static int sysctl_dumpentry(struct rtentry *rt, void *v) { struct rt_walkarg *w = v; int error = 0, size; struct rt_addrinfo info; if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) return 0; memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = rt_getkey(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); info.rti_info[RTAX_TAG] = rt_gettag(rt); if (rt->rt_ifp) { const struct ifaddr *rtifa; info.rti_info[RTAX_IFP] = rt->rt_ifp->if_dl->ifa_addr; /* rtifa used to be simply rt->rt_ifa. If rt->rt_ifa != NULL, * then rt_get_ifa() != NULL. So this ought to still be safe. 
* --dyoung */ rtifa = rt_get_ifa(rt); info.rti_info[RTAX_IFA] = rtifa->ifa_addr; if (rt->rt_ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rtifa->ifa_dstaddr; } if ((error = rt_msg2(RTM_GET, &info, 0, w, &size))) return error; if (w->w_where && w->w_tmem && w->w_needed <= 0) { struct rt_xmsghdr *rtm = (struct rt_xmsghdr *)w->w_tmem; rtm->rtm_flags = rt->rt_flags; rtm->rtm_use = rt->rt_use; rtm_setmetrics(rt, rtm); KASSERT(rt->rt_ifp != NULL); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; rtm->rtm_addrs = info.rti_addrs; if ((error = copyout(rtm, w->w_where, size)) != 0) w->w_where = NULL; else w->w_where = (char *)w->w_where + size; } return error; } static int sysctl_iflist_if(struct ifnet *ifp, struct rt_walkarg *w, struct rt_addrinfo *info, size_t len) { struct if_xmsghdr *ifm; int error; ifm = (struct if_xmsghdr *)w->w_tmem; ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags; if_export_if_data(ifp, &ifm->ifm_data, false); ifm->ifm_addrs = info->rti_addrs; if ((error = copyout(ifm, w->w_where, len)) == 0) w->w_where = (char *)w->w_where + len; return error; } static int sysctl_iflist_addr(struct rt_walkarg *w, struct ifaddr *ifa, struct rt_addrinfo *info) { int len, error; if ((error = rt_msg2(RTM_XNEWADDR, info, 0, w, &len))) return error; if (w->w_where && w->w_tmem && w->w_needed <= 0) { struct ifa_xmsghdr *ifam; ifam = (struct ifa_xmsghdr *)w->w_tmem; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_metric = ifa->ifa_metric; ifam->ifam_addrs = info->rti_addrs; ifam->ifam_pid = 0; ifam->ifam_addrflags = if_addrflags(ifa); if ((error = copyout(w->w_tmem, w->w_where, len)) == 0) w->w_where = (char *)w->w_where + len; } return error; } static int sysctl_iflist(int af, struct rt_walkarg *w, int type) { struct ifnet *ifp; struct ifaddr *ifa; struct rt_addrinfo info; int cmd, len, error = 0; int s; struct psref psref; int bound; switch (type) { case NET_RT_IFLIST: cmd = RTM_IFINFO; break; case NET_RT_OOOIFLIST: cmd = RTM_OOIFINFO; break; case NET_RT_OOIFLIST: cmd = RTM_OIFINFO; break; case NET_RT_OIFLIST: cmd = RTM_IFINFO; break; default: #ifdef RTSOCK_DEBUG printf("%s: unsupported IFLIST type %d\n", __func__, type); #endif return EINVAL; } memset(&info, 0, sizeof(info)); bound = curlwp_bind(); s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { int _s; if (w->w_arg && w->w_arg != ifp->if_index) continue; if (IFADDR_READER_EMPTY(ifp)) continue; if_acquire(ifp, &psref); pserialize_read_exit(s); info.rti_info[RTAX_IFP] = ifp->if_dl->ifa_addr; if ((error = rt_msg2(cmd, &info, NULL, w, &len)) != 0) goto release_exit; info.rti_info[RTAX_IFP] = NULL; if (w->w_where && w->w_tmem && w->w_needed <= 0) { switch (type) { case NET_RT_OIFLIST: /* old _70 */ if (!rtsock_iflist_70_hook.hooked) { error = EINVAL; break; } /* FALLTHROUGH */ case NET_RT_IFLIST: /* current */ error = sysctl_iflist_if(ifp, w, &info, len); break; case NET_RT_OOIFLIST: /* old _50 */ MODULE_HOOK_CALL(rtsock_iflist_50_hook, (ifp, w, &info, len), enosys(), error); break; case NET_RT_OOOIFLIST: /* old _14 */ MODULE_HOOK_CALL(rtsock_iflist_14_hook, (ifp, w, &info, len), enosys(), error); break; default: error = EINVAL; } if (error != 0) { if (error == ENOSYS) error = EINVAL; goto release_exit; } } _s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { struct psref _psref; if (af && af != ifa->ifa_addr->sa_family) continue; ifa_acquire(ifa, &_psref); pserialize_read_exit(_s); info.rti_info[RTAX_IFA] = ifa->ifa_addr; 
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; switch (type) { case NET_RT_IFLIST: error = sysctl_iflist_addr(w, ifa, &info); break; case NET_RT_OIFLIST: case NET_RT_OOIFLIST: case NET_RT_OOOIFLIST: MODULE_HOOK_CALL(rtsock_iflist_70_hook, (w, ifa, &info), enosys(), error); break; default: error = EINVAL; } _s = pserialize_read_enter(); ifa_release(ifa, &_psref); if (error != 0) { pserialize_read_exit(_s); goto release_exit; } } pserialize_read_exit(_s); info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] = info.rti_info[RTAX_BRD] = NULL; s = pserialize_read_enter(); if_release(ifp, &psref); } pserialize_read_exit(s); curlwp_bindx(bound); return 0; release_exit: if_release(ifp, &psref); curlwp_bindx(bound); return error; } static int sysctl_rtable(SYSCTLFN_ARGS) { void *where = oldp; size_t *given = oldlenp; int i, error = EINVAL; u_char af; struct rt_walkarg w; if (namelen == 1 && name[0] == CTL_QUERY) return sysctl_query(SYSCTLFN_CALL(rnode)); if (newp) return EPERM; if (namelen != 3) return EINVAL; af = name[0]; w.w_tmemneeded = 0; w.w_tmemsize = 0; w.w_tmem = NULL; again: /* we may return here if a later [re]alloc of the t_mem buffer fails */ if (w.w_tmemneeded) { w.w_tmem = kmem_zalloc(w.w_tmemneeded, KM_SLEEP); w.w_tmemsize = w.w_tmemneeded; w.w_tmemneeded = 0; } w.w_op = name[1]; w.w_arg = name[2]; w.w_given = *given; w.w_needed = 0 - w.w_given; w.w_where = where; KERNEL_LOCK_UNLESS_NET_MPSAFE(); const int s = splsoftnet(); switch (w.w_op) { case NET_RT_DUMP: case NET_RT_FLAGS: #if defined(INET) || defined(INET6) /* * take care of llinfo entries, the caller must * specify an AF */ if (w.w_op == NET_RT_FLAGS && (w.w_arg == 0 || w.w_arg & RTF_LLDATA)) { if (af != 0) error = lltable_sysctl_dump(af, &w); else error = EINVAL; break; } #endif for (i = 1; i <= AF_MAX; i++) { if (af == 0 || af == i) { error = rt_walktree(i, sysctl_dumpentry, &w); if (error != 0) break; #if defined(INET) || defined(INET6) /* * Return ARP/NDP entries too for * backward compatibility. */ error = lltable_sysctl_dump(i, &w); if (error != 0) break; #endif } } break; case NET_RT_OOOIFLIST: /* compat_14 */ case NET_RT_OOIFLIST: /* compat_50 */ case NET_RT_OIFLIST: /* compat_70 */ case NET_RT_IFLIST: /* current */ error = sysctl_iflist(af, &w, w.w_op); break; } splx(s); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); /* check to see if we couldn't allocate memory with NOWAIT */ if (error == ENOBUFS && w.w_tmem == 0 && w.w_tmemneeded) goto again; if (w.w_tmem) kmem_free(w.w_tmem, w.w_tmemsize); w.w_needed += w.w_given; if (where) { *given = (char *)w.w_where - (char *)where; if (*given < w.w_needed) return ENOMEM; } else { *given = (11 * w.w_needed) / 10; } return error; } void sysctl_net_route_setup(struct sysctllog **clog, int pf, const char *name) { const struct sysctlnode *rnode = NULL; sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, name, SYSCTL_DESCR("PF_ROUTE information"), NULL, 0, NULL, 0, CTL_NET, pf, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "rtable", SYSCTL_DESCR("Routing table information"), sysctl_rtable, 0, NULL, 0, CTL_NET, pf, 0 /* any protocol */, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("Routing statistics"), NULL, 0, &rtstat, sizeof(rtstat), CTL_CREATE, CTL_EOL); }
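/*
 * Editor's illustrative sketch, not part of rtsock.c: fetching the routing
 * table that sysctl_rtable() above serves.  The first three MIB components
 * are consumed before sysctl_rtable() runs, so it sees namelen == 3 with
 * name[0] = address family, name[1] = op and name[2] = arg.  The two-call
 * size/buffer dance is the usual pattern; the kernel pads its estimate
 * (the "11 * w.w_needed / 10" above), but callers still retry on growth.
 * The helper name is an assumption for illustration.
 */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <net/route.h>
#include <stdlib.h>

static char *
fetch_rtable(size_t *lenp)
{
	int mib[6] = { CTL_NET, PF_ROUTE, 0, AF_INET, NET_RT_DUMP, 0 };
	size_t len;
	char *buf;

	/* First call: ask how much space the dump needs. */
	if (sysctl(mib, 6, NULL, &len, NULL, 0) == -1)
		return NULL;
	if ((buf = malloc(len)) == NULL)
		return NULL;
	/* Second call: fetch the stream of rt_msghdr records. */
	if (sysctl(mib, 6, buf, &len, NULL, 0) == -1) {
		free(buf);
		return NULL;
	}
	*lenp = len;
	return buf;	/* caller walks the rt_msghdr records and free()s */
}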
/* $NetBSD: cprng_fast.c,v 1.19 2023/08/05 11:39:18 riastradh Exp $ */ /*- * Copyright (c) 2014 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.19 2023/08/05 11:39:18 riastradh Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/bitops.h> #include <sys/cprng.h> #include <sys/cpu.h> #include <sys/entropy.h> #include <sys/evcnt.h> #include <sys/kmem.h> #include <sys/percpu.h> #include <sys/pserialize.h> #include <crypto/chacha/chacha.h> #define CPRNG_FAST_SEED_BYTES CHACHA_STREAM_KEYBYTES struct cprng_fast { /* 128-bit vector unit generates 256 bytes at once */ uint8_t buf[256]; uint8_t key[CPRNG_FAST_SEED_BYTES]; uint8_t nonce[CHACHA_STREAM_NONCEBYTES]; unsigned i; struct evcnt *reseed_evcnt; unsigned epoch; }; static void cprng_fast_init_cpu(void *, void *, struct cpu_info *); static void cprng_fast_reseed(struct cprng_fast **, unsigned); static void cprng_fast_seed(struct cprng_fast *, const void *); static void cprng_fast_buf(struct cprng_fast *, void *, unsigned); static void cprng_fast_buf_short(void *, size_t); static void cprng_fast_buf_long(void *, size_t); static percpu_t *cprng_fast_percpu __read_mostly; void cprng_fast_init(void) { cprng_fast_percpu = percpu_create(sizeof(struct cprng_fast), cprng_fast_init_cpu, NULL, NULL); } static void cprng_fast_init_cpu(void *p, void *arg __unused, struct cpu_info *ci) { struct cprng_fast *const cprng = p; cprng->epoch = 0; cprng->reseed_evcnt = kmem_alloc(sizeof(*cprng->reseed_evcnt), KM_SLEEP); evcnt_attach_dynamic(cprng->reseed_evcnt, EVCNT_TYPE_MISC, NULL, ci->ci_cpuname, "cprng_fast reseed"); } static int cprng_fast_get(struct cprng_fast **cprngp) { struct cprng_fast *cprng; unsigned epoch; int s; KASSERT(!cpu_intr_p()); KASSERT(pserialize_not_in_read_section()); *cprngp = cprng = percpu_getref(cprng_fast_percpu); s = splsoftserial(); epoch = entropy_epoch(); if (__predict_false(cprng->epoch != epoch)) { splx(s); cprng_fast_reseed(cprngp, epoch); s = splsoftserial(); } return s; } static void cprng_fast_put(struct cprng_fast *cprng, int s) { KASSERT((cprng == percpu_getref(cprng_fast_percpu)) && (percpu_putref(cprng_fast_percpu), true)); splx(s); percpu_putref(cprng_fast_percpu); } static void cprng_fast_reseed(struct cprng_fast **cprngp, unsigned epoch) { struct cprng_fast *cprng; uint8_t seed[CPRNG_FAST_SEED_BYTES]; int s; /* * Drop the percpu(9) reference to extract a fresh seed from * the entropy pool. cprng_strong may sleep on an adaptive * lock, which invalidates our percpu(9) reference. * * This may race with reseeding in another thread, which is no * big deal -- worst case, we rewind the entropy epoch here and * cause the next caller to reseed again, and in the end we * just reseed a couple more times than necessary. 
*/ percpu_putref(cprng_fast_percpu); cprng_strong(kern_cprng, seed, sizeof(seed), 0); *cprngp = cprng = percpu_getref(cprng_fast_percpu); s = splsoftserial(); cprng_fast_seed(cprng, seed); cprng->epoch = epoch; cprng->reseed_evcnt->ev_count++; splx(s); explicit_memset(seed, 0, sizeof(seed)); } /* CPRNG algorithm */ static void cprng_fast_seed(struct cprng_fast *cprng, const void *seed) { (void)memset(cprng->buf, 0, sizeof cprng->buf); (void)memcpy(cprng->key, seed, sizeof cprng->key); (void)memset(cprng->nonce, 0, sizeof cprng->nonce); cprng->i = sizeof cprng->buf; } static void cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned len) { uint8_t *p = buf; unsigned n = len, n0; KASSERT(cprng->i <= sizeof(cprng->buf)); KASSERT(len <= sizeof(cprng->buf)); n0 = MIN(n, sizeof(cprng->buf) - cprng->i); memcpy(p, &cprng->buf[cprng->i], n0); if ((n -= n0) == 0) { cprng->i += n0; KASSERT(cprng->i <= sizeof(cprng->buf)); return; } p += n0; le64enc(cprng->nonce, 1 + le64dec(cprng->nonce)); chacha_stream(cprng->buf, sizeof(cprng->buf), 0, cprng->nonce, cprng->key, 8); memcpy(p, cprng->buf, n); cprng->i = n; } /* Public API */ static void cprng_fast_buf_short(void *buf, size_t len) { struct cprng_fast *cprng; int s; KASSERT(len <= sizeof(cprng->buf)); s = cprng_fast_get(&cprng); cprng_fast_buf(cprng, buf, len); cprng_fast_put(cprng, s); } static void cprng_fast_buf_long(void *buf, size_t len) { uint8_t seed[CHACHA_STREAM_KEYBYTES]; uint8_t nonce[CHACHA_STREAM_NONCEBYTES] = {0}; CTASSERT(sizeof(seed) <= sizeof(((struct cprng_fast *)0)->buf)); #if SIZE_MAX >= 0x3fffffffff /* >=256 GB is not reasonable */ KASSERT(len <= 0x3fffffffff); #endif cprng_fast_buf_short(seed, sizeof seed); chacha_stream(buf, len, 0, nonce, seed, 8); (void)explicit_memset(seed, 0, sizeof seed); } uint32_t cprng_fast32(void) { uint32_t v; cprng_fast_buf_short(&v, sizeof v); return v; } uint64_t cprng_fast64(void) { uint64_t v; cprng_fast_buf_short(&v, sizeof v); return v; } size_t cprng_fast(void *buf, size_t len) { /* * We don't want to hog the CPU, so we use the short version, * to generate output without preemption, only if we can do it * with at most one ChaCha call. */ if (len <= sizeof(((struct cprng_fast *)0)->buf)) cprng_fast_buf_short(buf, len); else cprng_fast_buf_long(buf, len); return len; /* hysterical raisins */ }
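/*
 * Editor's illustrative sketch, not part of cprng_fast.c: typical in-kernel
 * consumers of the public API above.  cprng_fast32()/cprng_fast64() cover
 * small requests cheaply from the per-CPU buffer, while cprng_fast()
 * handles arbitrary lengths and switches to the "long" path (a one-shot
 * ChaCha key) for requests larger than that buffer.  The helper names
 * below are hypothetical, for illustration only.
 */
#include <sys/cprng.h>

/* Pick a random delay in [0, maxticks) to spread out periodic work. */
static unsigned
example_random_jitter(unsigned maxticks)
{
	return cprng_fast32() % maxticks;
}

/* Fill an arbitrary-size buffer, e.g. to randomize a probe payload. */
static void
example_fill(void *buf, size_t len)
{
	(void)cprng_fast(buf, len);	/* returns len ("hysterical raisins") */
}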
/* $NetBSD: in6_offload.c,v 1.12 2018/12/12 01:40:20 rin Exp $ */ /* * Copyright (c)2006 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in6_offload.c,v 1.12 2018/12/12 01:40:20 rin Exp $"); #include <sys/param.h> #include <sys/mbuf.h> #include <net/if.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip6.h> #include <netinet/tcp.h> #include <netinet6/in6_var.h> #include <netinet6/ip6_var.h> #include <netinet6/nd6.h> #include <netinet6/in6_offload.h> /* * Handle M_CSUM_TSOv6 in software. Split the TCP payload in chunks of * size MSS, and return an mbuf chain consisting of them.
*/ struct mbuf * tcp6_segment(struct mbuf *m, int off) { int mss; int iphlen; int thlen; int hlen; int len; struct ip6_hdr *iph; struct tcphdr *th; uint32_t tcpseq; uint16_t phsum; struct mbuf *hdr = NULL; struct mbuf *m0 = NULL; struct mbuf *prev = NULL; struct mbuf *n, *t; int nsegs; KASSERT((m->m_flags & M_PKTHDR) != 0); KASSERT((m->m_pkthdr.csum_flags & M_CSUM_TSOv6) != 0); m->m_pkthdr.csum_flags = 0; len = m->m_pkthdr.len; KASSERT(len >= off + sizeof(*iph) + sizeof(*th)); hlen = off + sizeof(*iph); if (m->m_len < hlen) { m = m_pullup(m, hlen); if (m == NULL) goto quit; } iph = (void *)(mtod(m, char *) + off); iphlen = sizeof(*iph); KASSERT((iph->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION); KASSERT(iph->ip6_nxt == IPPROTO_TCP); hlen = off + iphlen + sizeof(*th); if (m->m_len < hlen) { m = m_pullup(m, hlen); if (m == NULL) goto quit; } th = (void *)(mtod(m, char *) + off + iphlen); tcpseq = ntohl(th->th_seq); thlen = th->th_off * 4; hlen = off + iphlen + thlen; mss = m->m_pkthdr.segsz; KASSERT(mss != 0); KASSERT(len > hlen); t = m_split(m, hlen, M_NOWAIT); if (t == NULL) goto quit; hdr = m; m = t; len -= hlen; KASSERT(len % mss == 0); iph = (void *)(mtod(hdr, char *) + off); iph->ip6_plen = htons(thlen + mss); phsum = in6_cksum_phdr(&iph->ip6_src, &iph->ip6_dst, htonl(thlen + mss), htonl(IPPROTO_TCP)); for (nsegs = len / mss; nsegs > 0; nsegs--) { if (nsegs > 1) { n = m_dup(hdr, 0, hlen, M_NOWAIT); if (n == NULL) goto quit; } else n = hdr; KASSERT(n->m_len == hlen); /* XXX */ if (nsegs > 1) { t = m_split(m, mss, M_NOWAIT); if (t == NULL) { m_freem(n); goto quit; } } else t = m; m_cat(n, m); m = t; KASSERT(n->m_len >= hlen); /* XXX */ if (m0 == NULL) m0 = n; if (prev != NULL) prev->m_nextpkt = n; n->m_pkthdr.len = hlen + mss; n->m_nextpkt = NULL; /* XXX */ th = (void *)(mtod(n, char *) + off + iphlen); th->th_seq = htonl(tcpseq); th->th_sum = phsum; th->th_sum = in6_cksum(n, 0, off + iphlen, thlen + mss); tcpseq += mss; prev = n; } return m0; quit: if (hdr != NULL) m_freem(hdr); if (m != NULL) m_freem(m); for (m = m0; m != NULL; m = n) { n = m->m_nextpkt; m_freem(m); } return NULL; } int ip6_tso_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, const struct sockaddr_in6 *dst, struct rtentry *rt) { struct mbuf *n; int error = 0; m = tcp6_segment(m, 0); if (m == NULL) return ENOMEM; do { n = m->m_nextpkt; if (error == 0) error = ip6_if_output(ifp, origifp, m, dst, rt); else m_freem(m); m = n; } while (m != NULL); return error; } /* * Compute now in software the IP and TCP/UDP checksums. Cancel the * hardware offloading. 
*/ void in6_undefer_cksum(struct mbuf *m, size_t hdrlen, int csum_flags) { const size_t ip6_plen_offset = hdrlen + offsetof(struct ip6_hdr, ip6_plen); size_t l4hdroff; size_t l4offset; uint16_t plen; uint16_t csum; KASSERT(m->m_flags & M_PKTHDR); KASSERT((m->m_pkthdr.csum_flags & csum_flags) == csum_flags); KASSERT(csum_flags == M_CSUM_UDPv6 || csum_flags == M_CSUM_TCPv6); if (__predict_true(hdrlen + sizeof(struct ip6_hdr) <= m->m_len)) { plen = *(uint16_t *)(mtod(m, char *) + ip6_plen_offset); } else { m_copydata(m, ip6_plen_offset, sizeof(plen), &plen); } plen = ntohs(plen); l4hdroff = M_CSUM_DATA_IPv6_IPHL(m->m_pkthdr.csum_data); l4offset = hdrlen + l4hdroff; csum = in6_cksum(m, 0, l4offset, plen - (l4hdroff - sizeof(struct ip6_hdr))); if (csum == 0 && (csum_flags & M_CSUM_UDPv6) != 0) csum = 0xffff; l4offset += M_CSUM_DATA_IPv6_OFFSET(m->m_pkthdr.csum_data); if (__predict_true((l4offset + sizeof(uint16_t)) <= m->m_len)) { *(uint16_t *)(mtod(m, char *) + l4offset) = csum; } else { m_copyback(m, l4offset, sizeof(csum), (void *) &csum); } m->m_pkthdr.csum_flags ^= csum_flags; } /* * Compute now in software the TCP/UDP checksum. Cancel the hardware * offloading. */ void in6_undefer_cksum_tcpudp(struct mbuf *m) { uint16_t csum, offset; KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_UDPv6|M_CSUM_TCPv6)) != 0); KASSERT((~m->m_pkthdr.csum_flags & (M_CSUM_UDPv6|M_CSUM_TCPv6)) != 0); KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_UDPv4|M_CSUM_TCPv4|M_CSUM_TSOv4)) == 0); offset = M_CSUM_DATA_IPv6_IPHL(m->m_pkthdr.csum_data); csum = in6_cksum(m, 0, offset, m->m_pkthdr.len - offset); if (csum == 0 && (m->m_pkthdr.csum_flags & M_CSUM_UDPv6) != 0) { csum = 0xffff; } offset += M_CSUM_DATA_IPv6_OFFSET(m->m_pkthdr.csum_data); if ((offset + sizeof(csum)) > m->m_len) { m_copyback(m, offset, sizeof(csum), &csum); } else { *(uint16_t *)(mtod(m, char *) + offset) = csum; } }
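/*
 * Editor's illustrative sketch, not part of in6_offload.c: what the
 * precomputed "phsum" in tcp6_segment() corresponds to.  Every full-size
 * segment carries the same upper-layer length (thlen + mss), so the IPv6
 * pseudo-header sum can be folded once and reused; only the in6_cksum()
 * over each segment's actual header and data differs.  This standalone
 * helper mirrors the pseudo-header layout (src, dst, 32-bit length, three
 * zero bytes, next header); it is an illustration with assumed names, not
 * the kernel's in6_cksum_phdr() implementation.
 */
#include <stddef.h>
#include <stdint.h>

/* Ones-complement sum of big-endian 16-bit words. */
static uint32_t
sum16_be(const uint8_t *p, size_t n, uint32_t sum)
{
	size_t i;

	for (i = 0; i + 1 < n; i += 2)
		sum += (uint32_t)p[i] << 8 | p[i + 1];
	if (n & 1)
		sum += (uint32_t)p[n - 1] << 8;
	return sum;
}

static uint16_t
pseudo_hdr_cksum6(const uint8_t src[16], const uint8_t dst[16],
    uint32_t len, uint8_t nxt)
{
	uint32_t sum = 0;
	uint8_t tail[8];

	sum = sum16_be(src, 16, sum);
	sum = sum16_be(dst, 16, sum);
	tail[0] = len >> 24; tail[1] = len >> 16;
	tail[2] = len >> 8;  tail[3] = len;
	tail[4] = tail[5] = tail[6] = 0;
	tail[7] = nxt;
	sum = sum16_be(tail, 8, sum);

	/* Fold the 32-bit accumulator into 16 bits. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;	/* partial sum; TCP header+data still to add */
}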
/* $NetBSD: ffs_subr.c,v 1.54 2023/01/07 19:41:30 chs Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)ffs_subr.c 8.5 (Berkeley) 3/21/95 */ #if HAVE_NBTOOL_CONFIG_H #include "nbtool_config.h" #endif #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ffs_subr.c,v 1.54 2023/01/07 19:41:30 chs Exp $"); #include <sys/param.h> /* in ffs_tables.c */ extern const int inside[], around[]; extern const u_char * const fragtbl[]; #ifndef _KERNEL #define FFS_EI /* always include byteswapped filesystems support */ #endif #include <ufs/ffs/fs.h> #include <ufs/ffs/ffs_extern.h> #include <ufs/ufs/ufs_bswap.h> #ifndef _KERNEL #include <ufs/ufs/dinode.h> void panic(const char *, ...) __attribute__((__noreturn__,__format__(__printf__,1,2))); #else /* _KERNEL */ #include <sys/systm.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/buf.h> #include <sys/inttypes.h> #include <sys/pool.h> #include <sys/fstrans.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> /* * Load up the contents of an inode and copy the appropriate pieces * to the incore copy. */ void ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino) { struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; if (ip->i_ump->um_fstype == UFS1) { dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino); #ifdef FFS_EI if (UFS_FSNEEDSWAP(fs)) ffs_dinode1_swap(dp1, ip->i_din.ffs1_din); else #endif *ip->i_din.ffs1_din = *dp1; ip->i_mode = ip->i_ffs1_mode; ip->i_nlink = ip->i_ffs1_nlink; ip->i_size = ip->i_ffs1_size; ip->i_flags = ip->i_ffs1_flags; ip->i_gen = ip->i_ffs1_gen; ip->i_uid = ip->i_ffs1_uid; ip->i_gid = ip->i_ffs1_gid; } else { dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino); #ifdef FFS_EI if (UFS_FSNEEDSWAP(fs)) ffs_dinode2_swap(dp2, ip->i_din.ffs2_din); else #endif *ip->i_din.ffs2_din = *dp2; ip->i_mode = ip->i_ffs2_mode; ip->i_nlink = ip->i_ffs2_nlink; ip->i_size = ip->i_ffs2_size; ip->i_flags = ip->i_ffs2_flags; ip->i_gen = ip->i_ffs2_gen; ip->i_uid = ip->i_ffs2_uid; ip->i_gid = ip->i_ffs2_gid; } } int ffs_getblk(struct vnode *vp, daddr_t lblkno, daddr_t blkno, int size, bool clearbuf, buf_t **bpp) { int error = 0; KASSERT(blkno >= 0 || blkno == FFS_NOBLK); if ((*bpp = getblk(vp, lblkno, size, 0, 0)) == NULL) return ENOMEM; if (blkno != FFS_NOBLK) (*bpp)->b_blkno = blkno; if (clearbuf) clrbuf(*bpp); if ((*bpp)->b_blkno >= 0 && (error = fscow_run(*bpp, false)) != 0) { brelse(*bpp, BC_INVAL); *bpp = NULL; } return error; } #endif /* _KERNEL */ /* * Update the frsum fields to reflect addition or deletion * of some frags. 
*/ void ffs_fragacct(struct fs *fs, int fragmap, uint32_t fraglist[], int cnt, int needswap) { int inblk; int field, subfield; int siz, pos; inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; fragmap <<= 1; for (siz = 1; siz < fs->fs_frag; siz++) { if ((inblk & (1 << (siz + (fs->fs_frag & (NBBY - 1))))) == 0) continue; field = around[siz]; subfield = inside[siz]; for (pos = siz; pos <= fs->fs_frag; pos++) { if ((fragmap & field) == subfield) { fraglist[siz] = ufs_rw32( ufs_rw32(fraglist[siz], needswap) + cnt, needswap); pos += siz; field <<= siz; subfield <<= siz; } field <<= 1; subfield <<= 1; } } } /* * block operations * * check if a block is available * returns true if all the corresponding bits in the free map are 1 * returns false if any corresponding bit in the free map is 0 */ int ffs_isblock(struct fs *fs, u_char *cp, int32_t h) { u_char mask; switch ((int)fs->fs_fragshift) { case 3: return (cp[h] == 0xff); case 2: mask = 0x0f << ((h & 0x1) << 2); return ((cp[h >> 1] & mask) == mask); case 1: mask = 0x03 << ((h & 0x3) << 1); return ((cp[h >> 2] & mask) == mask); case 0: mask = 0x01 << (h & 0x7); return ((cp[h >> 3] & mask) == mask); default: panic("%s: unknown fs_fragshift %d", __func__, (int)fs->fs_fragshift); } } /* * check if a block is completely allocated * returns true if all the corresponding bits in the free map are 0 * returns false if any corresponding bit in the free map is 1 */ int ffs_isfreeblock(struct fs *fs, u_char *cp, int32_t h) { switch ((int)fs->fs_fragshift) { case 3: return (cp[h] == 0); case 2: return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); case 1: return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); case 0: return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); default: panic("%s: unknown fs_fragshift %d", __func__, (int)fs->fs_fragshift); } } /* * take a block out of the map */ void ffs_clrblock(struct fs *fs, u_char *cp, int32_t h) { switch ((int)fs->fs_fragshift) { case 3: cp[h] = 0; return; case 2: cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); return; case 1: cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); return; case 0: cp[h >> 3] &= ~(0x01 << (h & 0x7)); return; default: panic("%s: unknown fs_fragshift %d", __func__, (int)fs->fs_fragshift); } } /* * put a block into the map */ void ffs_setblock(struct fs *fs, u_char *cp, int32_t h) { switch ((int)fs->fs_fragshift) { case 3: cp[h] = 0xff; return; case 2: cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); return; case 1: cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); return; case 0: cp[h >> 3] |= (0x01 << (h & 0x7)); return; default: panic("%s: unknown fs_fragshift %d", __func__, (int)fs->fs_fragshift); } } /* * Update the cluster map because of an allocation or free. * * Cnt == 1 means free; cnt == -1 means allocating. */ void ffs_clusteracct(struct fs *fs, struct cg *cgp, int32_t blkno, int cnt) { int32_t *sump; int32_t *lp; u_char *freemapp, *mapp; int i, start, end, forw, back, map; unsigned int bit; const int needswap = UFS_FSNEEDSWAP(fs); /* KASSERT(mutex_owned(&ump->um_lock)); */ if (fs->fs_contigsumsize <= 0) return; freemapp = cg_clustersfree(cgp, needswap); sump = cg_clustersum(cgp, needswap); /* * Allocate or clear the actual block. */ if (cnt > 0) setbit(freemapp, blkno); else clrbit(freemapp, blkno); /* * Find the size of the cluster going forward. 
*/ start = blkno + 1; end = start + fs->fs_contigsumsize; if ((uint32_t)end >= ufs_rw32(cgp->cg_nclusterblks, needswap)) end = ufs_rw32(cgp->cg_nclusterblks, needswap); mapp = &freemapp[start / NBBY]; map = *mapp++; bit = 1U << ((unsigned int)start % NBBY); for (i = start; i < end; i++) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } forw = i - start; /* * Find the size of the cluster going backward. */ start = blkno - 1; end = start - fs->fs_contigsumsize; if (end < 0) end = -1; mapp = &freemapp[start / NBBY]; map = *mapp--; bit = 1U << ((unsigned int)start % NBBY); for (i = start; i > end; i--) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != 0) { bit >>= 1; } else { map = *mapp--; bit = 1U << (NBBY - 1); } } back = start - i; /* * Account for old cluster and the possibly new forward and * back clusters. */ i = back + forw + 1; if (i > fs->fs_contigsumsize) i = fs->fs_contigsumsize; ufs_add32(sump[i], cnt, needswap); if (back > 0) ufs_add32(sump[back], -cnt, needswap); if (forw > 0) ufs_add32(sump[forw], -cnt, needswap); /* * Update cluster summary information. */ lp = &sump[fs->fs_contigsumsize]; for (i = fs->fs_contigsumsize; i > 0; i--) if (ufs_rw32(*lp--, needswap) > 0) break; #if defined(_KERNEL) fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i; #endif }
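/*
 * Editor's illustrative sketch, not part of ffs_subr.c: the bit layout the
 * ffs_isblock()/ffs_setblock() family above relies on.  With
 * fs_frag = 1 << fs_fragshift fragments per block, each block owns fs_frag
 * consecutive bits of the fragment free map: a whole byte when
 * fs_fragshift == 3, a nibble when 2, two bits when 1, one bit when 0.
 * A standalone check for the fs_fragshift == 2 case looks like this
 * (hypothetical helper, for illustration only).
 */
#include <stdbool.h>
#include <stdint.h>

static bool
block_is_free_frag4(const uint8_t *freemap, int32_t h)
{
	/* 4 fragments per block: block h maps to one nibble of byte h >> 1. */
	uint8_t mask = 0x0f << ((h & 0x1) << 2);

	return (freemap[h >> 1] & mask) == mask;
}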
/* $NetBSD: lfs_vfsops.c,v 1.382 2022/03/19 13:53:33 hannken Exp $ */ /*- * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Konrad E. Schroder <perseant@hhhh.org>. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)lfs_vfsops.c 8.20 (Berkeley) 6/10/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.382 2022/03/19 13:53:33 hannken Exp $"); #if defined(_KERNEL_OPT) #include "opt_lfs.h" #include "opt_quota.h" #include "opt_uvmhist.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/proc.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/kthread.h> #include <sys/buf.h> #include <sys/device.h> #include <sys/file.h> #include <sys/disklabel.h> #include <sys/ioctl.h> #include <sys/errno.h> #include <sys/malloc.h> #include <sys/pool.h> #include <sys/socket.h> #include <sys/syslog.h> #include <sys/sysctl.h> #include <sys/conf.h> #include <sys/kauth.h> #include <sys/module.h> #include <sys/syscallvar.h> #include <sys/syscall.h> #include <sys/syscallargs.h> #include <miscfs/specfs/specdev.h> #include <ufs/lfs/ulfs_quotacommon.h> #include <ufs/lfs/ulfs_inode.h> #include <ufs/lfs/ulfsmount.h> #include <ufs/lfs/ulfs_bswap.h> #include <ufs/lfs/ulfs_extern.h> #ifdef UVMHIST #include <uvm/uvm.h> #endif #include <uvm/uvm_extern.h> #include <uvm/uvm_object.h> #include <uvm/uvm_page.h> #include <uvm/uvm_stat.h> #include <ufs/lfs/lfs.h> #include <ufs/lfs/lfs_accessors.h> #include <ufs/lfs/lfs_kernel.h> #include <ufs/lfs/lfs_extern.h> #include <miscfs/genfs/genfs.h> #include <miscfs/genfs/genfs_node.h> MODULE(MODULE_CLASS_VFS, lfs, NULL); static int lfs_gop_write(struct vnode *, struct vm_page **, int, int); static int lfs_mountfs(struct vnode *, struct mount *, struct lwp *); static int lfs_flushfiles(struct mount *, int); extern const struct vnodeopv_desc lfs_vnodeop_opv_desc; extern const struct vnodeopv_desc lfs_specop_opv_desc; extern const struct vnodeopv_desc lfs_fifoop_opv_desc; struct lwp * lfs_writer_daemon = NULL; kcondvar_t lfs_writerd_cv; int lfs_do_flush = 0; #ifdef LFS_KERNEL_RFW int lfs_do_rfw = 0; #endif const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = { &lfs_vnodeop_opv_desc, &lfs_specop_opv_desc, &lfs_fifoop_opv_desc, NULL, }; struct vfsops lfs_vfsops = { .vfs_name = MOUNT_LFS, .vfs_min_mount_data = sizeof (struct ulfs_args), .vfs_mount = lfs_mount, .vfs_start = ulfs_start, .vfs_unmount = lfs_unmount, .vfs_root = ulfs_root, .vfs_quotactl = ulfs_quotactl, .vfs_statvfs = lfs_statvfs, .vfs_sync = lfs_sync, .vfs_vget = lfs_vget, .vfs_loadvnode = lfs_loadvnode, .vfs_newvnode = lfs_newvnode, .vfs_fhtovp = lfs_fhtovp, .vfs_vptofh = lfs_vptofh, .vfs_init = lfs_init, .vfs_reinit = lfs_reinit, .vfs_done = lfs_done, .vfs_mountroot = lfs_mountroot, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = lfs_extattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, 
.vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = lfs_vnodeopv_descs }; const struct genfs_ops lfs_genfsops = { .gop_size = lfs_gop_size, .gop_alloc = ulfs_gop_alloc, .gop_write = lfs_gop_write, .gop_markupdate = ulfs_gop_markupdate, .gop_putrange = genfs_gop_putrange, }; struct shortlong { const char *sname; const char *lname; }; static int sysctl_lfs_dostats(SYSCTLFN_ARGS) { extern struct lfs_stats lfs_stats; extern int lfs_dostats; int error; error = sysctl_lookup(SYSCTLFN_CALL(rnode)); if (error || newp == NULL) return (error); if (lfs_dostats == 0) memset(&lfs_stats, 0, sizeof(lfs_stats)); return (0); } SYSCTL_SETUP(lfs_sysctl_setup, "lfs sysctl") { int i; extern int lfs_writeindir, lfs_dostats, lfs_clean_vnhead, lfs_fs_pagetrip, lfs_ignore_lazy_sync; #ifdef DEBUG extern int lfs_debug_log_subsys[DLOG_MAX]; struct shortlong dlog_names[DLOG_MAX] = { /* Must match lfs.h ! */ { "rollforward", "Debug roll-forward code" }, { "alloc", "Debug inode allocation and free list" }, { "avail", "Debug space-available-now accounting" }, { "flush", "Debug flush triggers" }, { "lockedlist", "Debug locked list accounting" }, { "vnode_verbose", "Verbose per-vnode-written debugging" }, { "vnode", "Debug vnode use during segment write" }, { "segment", "Debug segment writing" }, { "seguse", "Debug segment used-bytes accounting" }, { "cleaner", "Debug cleaning routines" }, { "mount", "Debug mount/unmount routines" }, { "pagecache", "Debug UBC interactions" }, { "dirop", "Debug directory-operation accounting" }, { "malloc", "Debug private malloc accounting" }, }; #endif /* DEBUG */ struct shortlong stat_names[] = { /* Must match lfs.h! */ { "segsused", "Number of new segments allocated" }, { "psegwrites", "Number of partial-segment writes" }, { "psyncwrites", "Number of synchronous partial-segment" " writes" }, { "pcleanwrites", "Number of partial-segment writes by the" " cleaner" }, { "blocktot", "Number of blocks written" }, { "cleanblocks", "Number of blocks written by the cleaner" }, { "ncheckpoints", "Number of checkpoints made" }, { "nwrites", "Number of whole writes" }, { "nsync_writes", "Number of synchronous writes" }, { "wait_exceeded", "Number of times writer waited for" " cleaner" }, { "write_exceeded", "Number of times writer invoked flush" }, { "flush_invoked", "Number of times flush was invoked" }, { "vflush_invoked", "Number of time vflush was called" }, { "clean_inlocked", "Number of vnodes skipped for being dead" }, { "clean_vnlocked", "Number of vnodes skipped for vget failure" }, { "segs_reclaimed", "Number of segments reclaimed" }, }; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "lfs", SYSCTL_DESCR("Log-structured file system"), NULL, 0, NULL, 0, CTL_VFS, 5, CTL_EOL); /* * XXX the "5" above could be dynamic, thereby eliminating one * more instance of the "number to vfs" mapping problem, but * "5" is the order as taken from sys/mount.h */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "flushindir", NULL, NULL, 0, &lfs_writeindir, 0, CTL_VFS, 5, LFS_WRITEINDIR, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "clean_vnhead", NULL, NULL, 0, &lfs_clean_vnhead, 0, CTL_VFS, 5, LFS_CLEAN_VNHEAD, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "dostats", SYSCTL_DESCR("Maintain statistics on LFS operations"), sysctl_lfs_dostats, 0, &lfs_dostats, 0, CTL_VFS, 5, LFS_DOSTATS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, 
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "pagetrip", SYSCTL_DESCR("How many dirty pages in fs triggers" " a flush"), NULL, 0, &lfs_fs_pagetrip, 0, CTL_VFS, 5, LFS_FS_PAGETRIP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ignore_lazy_sync", SYSCTL_DESCR("Lazy Sync is ignored entirely"), NULL, 0, &lfs_ignore_lazy_sync, 0, CTL_VFS, 5, LFS_IGNORE_LAZY_SYNC, CTL_EOL); #ifdef LFS_KERNEL_RFW sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "rfw", SYSCTL_DESCR("Use in-kernel roll-forward on mount"), NULL, 0, &lfs_do_rfw, 0, CTL_VFS, 5, LFS_DO_RFW, CTL_EOL); #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "stats", SYSCTL_DESCR("Debugging options"), NULL, 0, NULL, 0, CTL_VFS, 5, LFS_STATS, CTL_EOL); for (i = 0; i < sizeof(struct lfs_stats) / sizeof(u_int); i++) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, stat_names[i].sname, SYSCTL_DESCR(stat_names[i].lname), NULL, 0, &(((u_int *)&lfs_stats.segsused)[i]), 0, CTL_VFS, 5, LFS_STATS, i, CTL_EOL); } #ifdef DEBUG sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "debug", SYSCTL_DESCR("Debugging options"), NULL, 0, NULL, 0, CTL_VFS, 5, LFS_DEBUGLOG, CTL_EOL); for (i = 0; i < DLOG_MAX; i++) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, dlog_names[i].sname, SYSCTL_DESCR(dlog_names[i].lname), NULL, 0, &(lfs_debug_log_subsys[i]), 0, CTL_VFS, 5, LFS_DEBUGLOG, i, CTL_EOL); } #endif } /* old cleaner syscall interface. see VOP_FCNTL() */ static const struct syscall_package lfs_syscalls[] = { { SYS_lfs_bmapv, 0, (sy_call_t *)sys_lfs_bmapv }, { SYS_lfs_markv, 0, (sy_call_t *)sys_lfs_markv }, { SYS___lfs_segwait50, 0, (sy_call_t *)sys___lfs_segwait50 }, { SYS_lfs_segclean, 0, (sy_call_t *)sys_lfs_segclean }, { 0, 0, NULL }, }; static int lfs_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = syscall_establish(NULL, lfs_syscalls); if (error) return error; error = vfs_attach(&lfs_vfsops); if (error != 0) { syscall_disestablish(NULL, lfs_syscalls); break; } cv_init(&lfs_allclean_wakeup, "segment"); break; case MODULE_CMD_FINI: error = vfs_detach(&lfs_vfsops); if (error != 0) break; syscall_disestablish(NULL, lfs_syscalls); cv_destroy(&lfs_allclean_wakeup); break; default: error = ENOTTY; break; } return (error); } /* * XXX Same structure as FFS inodes? Should we share a common pool? */ struct pool lfs_inode_pool; struct pool lfs_dinode_pool; struct pool lfs_inoext_pool; struct pool lfs_lbnentry_pool; /* * The writer daemon. UVM keeps track of how many dirty pages we are holding * in lfs_subsys_pages; the daemon flushes the filesystem when this value * crosses the (user-defined) threshold LFS_MAX_PAGES. */ static void lfs_writerd(void *arg) { mount_iterator_t *iter; struct mount *mp; struct lfs *fs; struct vfsops *vfs = NULL; int fsflags; int lfsc; int wrote_something = 0; mutex_enter(&lfs_lock); KASSERTMSG(lfs_writer_daemon == NULL, "more than one LFS writer daemon"); lfs_writer_daemon = curlwp; mutex_exit(&lfs_lock); /* Take an extra reference to the LFS vfsops. */ vfs = vfs_getopsbyname(MOUNT_LFS); mutex_enter(&lfs_lock); for (;;) { KASSERT(mutex_owned(&lfs_lock)); if (wrote_something == 0) cv_timedwait(&lfs_writerd_cv, &lfs_lock, hz/10 + 1); KASSERT(mutex_owned(&lfs_lock)); wrote_something = 0; /* * If global state wants a flush, flush everything. 
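		 * The global triggers checked just below are: an explicit
		 * lfs_do_flush request, too many locked buffers
		 * (locked_queue_count > LFS_MAX_BUFS), too many locked bytes
		 * (locked_queue_bytes > LFS_MAX_BYTES), or too many dirty
		 * pages held by the LFS subsystem (lfs_subsys_pages >
		 * LFS_MAX_PAGES).  Each case logs a DLOG(DLOG_FLUSH, ...)
		 * message before the whole-filesystem lfs_flush() call.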
*/ if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS || locked_queue_bytes > LFS_MAX_BYTES || lfs_subsys_pages > LFS_MAX_PAGES) { if (lfs_do_flush) { DLOG((DLOG_FLUSH, "lfs_writerd: lfs_do_flush\n")); } if (locked_queue_count > LFS_MAX_BUFS) { DLOG((DLOG_FLUSH, "lfs_writerd: lqc = %d, max %d\n", locked_queue_count, LFS_MAX_BUFS)); } if (locked_queue_bytes > LFS_MAX_BYTES) { DLOG((DLOG_FLUSH, "lfs_writerd: lqb = %ld, max %ld\n", locked_queue_bytes, LFS_MAX_BYTES)); } if (lfs_subsys_pages > LFS_MAX_PAGES) { DLOG((DLOG_FLUSH, "lfs_writerd: lssp = %d, max %d\n", lfs_subsys_pages, LFS_MAX_PAGES)); } lfs_flush(NULL, SEGM_WRITERD, 0); lfs_do_flush = 0; KASSERT(mutex_owned(&lfs_lock)); continue; } KASSERT(mutex_owned(&lfs_lock)); mutex_exit(&lfs_lock); /* * Look through the list of LFSs to see if any of them * have requested pageouts. */ mountlist_iterator_init(&iter); lfsc = 0; while ((mp = mountlist_iterator_next(iter)) != NULL) { KASSERT(!mutex_owned(&lfs_lock)); if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS, sizeof(mp->mnt_stat.f_fstypename)) == 0) { ++lfsc; fs = VFSTOULFS(mp)->um_lfs; daddr_t ooffset = 0; fsflags = SEGM_SINGLE; mutex_enter(&lfs_lock); ooffset = lfs_sb_getoffset(fs); if (lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs) && fs->lfs_nowrap) { /* Don't try to write if we're suspended */ mutex_exit(&lfs_lock); continue; } if (LFS_STARVED_FOR_SEGS(fs)) { mutex_exit(&lfs_lock); DLOG((DLOG_FLUSH, "lfs_writerd: need cleaning before writing possible\n")); lfs_wakeup_cleaner(fs); continue; } if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || lfs_dirvcount > LFS_MAX_DIROP) && fs->lfs_dirops == 0) { fsflags &= ~SEGM_SINGLE; fsflags |= SEGM_CKP; DLOG((DLOG_FLUSH, "lfs_writerd: checkpoint\n")); lfs_flush_fs(fs, fsflags); } else if (fs->lfs_pdflush) { DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n")); lfs_flush_fs(fs, fsflags); } else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) { DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n")); mutex_exit(&lfs_lock); lfs_writer_enter(fs, "wrdirop"); lfs_flush_pchain(fs); lfs_writer_leave(fs); mutex_enter(&lfs_lock); } if (lfs_sb_getoffset(fs) != ooffset) ++wrote_something; mutex_exit(&lfs_lock); } KASSERT(!mutex_owned(&lfs_lock)); } if (lfsc == 0) { mutex_enter(&lfs_lock); lfs_writer_daemon = NULL; mutex_exit(&lfs_lock); mountlist_iterator_destroy(iter); break; } mountlist_iterator_destroy(iter); mutex_enter(&lfs_lock); } KASSERT(!mutex_owned(&lfs_lock)); /* Give up our extra reference so the module can be unloaded. */ mutex_enter(&vfs_list_lock); if (vfs != NULL) vfs->vfs_refcount--; mutex_exit(&vfs_list_lock); /* Done! */ kthread_exit(0); } /* * Initialize the filesystem, most work done by ulfs_init. */ void lfs_init(void) { /* * XXX: should we use separate pools for 32-bit and 64-bit * dinodes? 
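	 * For now a single lfs_dinode_pool sized for union lfs_dinode
	 * (large enough to hold either format) serves both, which wastes
	 * a little memory on 32-bit-dinode filesystems but keeps the
	 * allocation paths simple.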
*/ malloc_type_attach(M_SEGMENT); pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0, "lfsinopl", &pool_allocator_nointr, IPL_NONE); pool_init(&lfs_dinode_pool, sizeof(union lfs_dinode), 0, 0, 0, "lfsdinopl", &pool_allocator_nointr, IPL_NONE); pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0, "lfsinoextpl", &pool_allocator_nointr, IPL_NONE); pool_init(&lfs_lbnentry_pool, sizeof(struct lbnentry), 0, 0, 0, "lfslbnpool", &pool_allocator_nointr, IPL_NONE); ulfs_init(); #ifdef DEBUG memset(lfs_log, 0, sizeof(lfs_log)); #endif mutex_init(&lfs_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&lfs_writerd_cv, "lfswrite"); cv_init(&locked_queue_cv, "lfsbuf"); cv_init(&lfs_writing_cv, "lfsflush"); } void lfs_reinit(void) { ulfs_reinit(); } void lfs_done(void) { ulfs_done(); mutex_destroy(&lfs_lock); cv_destroy(&lfs_writerd_cv); cv_destroy(&locked_queue_cv); cv_destroy(&lfs_writing_cv); pool_destroy(&lfs_inode_pool); pool_destroy(&lfs_dinode_pool); pool_destroy(&lfs_inoext_pool); pool_destroy(&lfs_lbnentry_pool); malloc_type_detach(M_SEGMENT); } /* * Called by main() when ulfs is going to be mounted as root. */ int lfs_mountroot(void) { extern struct vnode *rootvp; struct lfs *fs = NULL; /* LFS */ struct mount *mp; struct lwp *l = curlwp; struct ulfsmount *ump; int error; if (device_class(root_device) != DV_DISK) return (ENODEV); if (rootdev == NODEV) return (ENODEV); if ((error = vfs_rootmountalloc(MOUNT_LFS, "root_device", &mp))) { vrele(rootvp); return (error); } if ((error = lfs_mountfs(rootvp, mp, l))) { vfs_unbusy(mp); vfs_rele(mp); return (error); } mountlist_append(mp); ump = VFSTOULFS(mp); fs = ump->um_lfs; lfs_sb_setfsmnt(fs, mp->mnt_stat.f_mntonname); (void)lfs_statvfs(mp, &mp->mnt_stat); vfs_unbusy(mp); setrootfstime((time_t)lfs_sb_gettstamp(VFSTOULFS(mp)->um_lfs)); return (0); } /* * VFS Operations. * * mount system call */ int lfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; struct vnode *devvp; struct ulfs_args *args = data; struct ulfsmount *ump = NULL; struct lfs *fs = NULL; /* LFS */ int error = 0, update; mode_t accessmode; if (args == NULL) return EINVAL; if (*data_len < sizeof *args) return EINVAL; if (mp->mnt_flag & MNT_GETARGS) { ump = VFSTOULFS(mp); if (ump == NULL) return EIO; args->fspec = NULL; *data_len = sizeof *args; return 0; } update = mp->mnt_flag & MNT_UPDATE; /* Check arguments */ if (args->fspec != NULL) { /* * Look up the name and verify that it's sane. */ error = namei_simple_user(args->fspec, NSM_FOLLOW_NOEMULROOT, &devvp); if (error != 0) return (error); if (!update) { /* * Be sure this is a valid block device */ if (devvp->v_type != VBLK) error = ENOTBLK; else if (bdevsw_lookup(devvp->v_rdev) == NULL) error = ENXIO; } else { /* * Be sure we're still naming the same device * used for our initial mount * * XXX dholland 20151010: if namei gives us a * different vnode for the same device, * wouldn't it be better to use it going * forward rather than ignore it in favor of * the old one? */ ump = VFSTOULFS(mp); fs = ump->um_lfs; if (devvp != fs->lfs_devvp) { if (devvp->v_rdev != fs->lfs_devvp->v_rdev) error = EINVAL; else { vrele(devvp); devvp = fs->lfs_devvp; vref(devvp); } } } } else { if (!update) { /* New mounts must have a filename for the device */ return (EINVAL); } else { /* Use the extant mount */ ump = VFSTOULFS(mp); fs = ump->um_lfs; devvp = fs->lfs_devvp; vref(devvp); } } /* * If mount by non-root, then verify that user has necessary * permissions on the device. 
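	 * VREAD is always required; VWRITE is added when this is (or is
	 * being upgraded to) a read/write mount.  The decision itself is
	 * delegated to kauth_authorize_system(KAUTH_SYSTEM_MOUNT,
	 * KAUTH_REQ_SYSTEM_MOUNT_DEVICE, ...), so it is ultimately up to
	 * the active security model.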
*/ if (error == 0) { accessmode = VREAD; if (update ? (mp->mnt_iflag & IMNT_WANTRDWR) != 0 : (mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_DEVICE, mp, devvp, KAUTH_ARG(accessmode)); VOP_UNLOCK(devvp); } if (error) { vrele(devvp); return (error); } if (!update) { int flags; if (mp->mnt_flag & MNT_RDONLY) flags = FREAD; else flags = FREAD|FWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_OPEN(devvp, flags, FSCRED); VOP_UNLOCK(devvp); if (error) goto fail; error = lfs_mountfs(devvp, mp, l); /* LFS */ if (error) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); (void)VOP_CLOSE(devvp, flags, NOCRED); VOP_UNLOCK(devvp); goto fail; } ump = VFSTOULFS(mp); fs = ump->um_lfs; } else { /* * Update the mount. */ /* * The initial mount got a reference on this * device, so drop the one obtained via * namei(), above. */ vrele(devvp); ump = VFSTOULFS(mp); fs = ump->um_lfs; if (!fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDONLY)) { /* * Changing from read/write to read-only. */ int flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = lfs_flushfiles(mp, flags); if (error) return error; fs->lfs_ronly = 1; } else if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { /* * Changing from read-only to read/write. * Note in the superblocks that we're writing. */ /* XXX: quotas should have been on even if readonly */ if (fs->lfs_use_quota2) { #ifdef LFS_QUOTA2 error = lfs_quota2_mount(mp); #else uprintf("%s: no kernel support for this " "filesystem's quotas\n", mp->mnt_stat.f_mntonname); if (mp->mnt_flag & MNT_FORCE) { uprintf("%s: mounting anyway; " "fsck afterwards\n", mp->mnt_stat.f_mntonname); } else { error = EINVAL; } #endif if (error) { return error; } } fs->lfs_ronly = 0; if (lfs_sb_getpflags(fs) & LFS_PF_CLEAN) { lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) & ~LFS_PF_CLEAN); lfs_writesuper(fs, lfs_sb_getsboff(fs, 0)); lfs_writesuper(fs, lfs_sb_getsboff(fs, 1)); } } if (args->fspec == NULL) return 0; } error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); if (error == 0) lfs_sb_setfsmnt(fs, mp->mnt_stat.f_mntonname); return error; fail: vrele(devvp); return (error); } /* * Helper for mountfs. Note that the fs pointer may be a dummy one * pointing into a superblock buffer. (Which is gross; see below.) */ static int lfs_checkmagic(struct lfs *fs) { switch (fs->lfs_dlfs_u.u_32.dlfs_magic) { case LFS_MAGIC: fs->lfs_is64 = false; fs->lfs_dobyteswap = false; break; case LFS64_MAGIC: fs->lfs_is64 = true; fs->lfs_dobyteswap = false; break; #ifdef LFS_EI case LFS_MAGIC_SWAPPED: fs->lfs_is64 = false; fs->lfs_dobyteswap = true; break; case LFS64_MAGIC_SWAPPED: fs->lfs_is64 = true; fs->lfs_dobyteswap = true; break; #endif default: /* XXX needs translation */ return EINVAL; } return 0; } /* * Common code for mount and mountroot * LFS specific */ int lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) { struct lfs *primarysb, *altsb, *thesb; struct buf *primarybuf, *altbuf; struct lfs *fs; struct ulfsmount *ump; struct vnode *vp; dev_t dev; int error, i, ronly, fsbsize; kauth_cred_t cred; CLEANERINFO *cip; SEGUSE *sup; daddr_t sb_addr; ino_t *orphan; size_t norphan; cred = l ? l->l_cred : NOCRED; /* The superblock is supposed to be 512 bytes. */ __CTASSERT(sizeof(struct dlfs) == DEV_BSIZE); /* * Flush out any old buffers remaining from a previous use. 
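	 * vinvalbuf() is called with V_SAVE, so any dirty buffers still
	 * associated with the device vnode are written back before being
	 * invalidated; the device vnode is locked around the call.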
*/ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0); VOP_UNLOCK(devvp); if (error) return (error); ronly = (mp->mnt_flag & MNT_RDONLY) != 0; /* Don't free random space on error. */ primarybuf = NULL; altbuf = NULL; ump = NULL; sb_addr = LFS_LABELPAD / DEV_BSIZE; while (1) { /* * Read in the superblock. * * Note that because LFS_SBPAD is substantially larger * (8K) than the actual on-disk superblock (512 bytes) * the buffer contains enough space to be used as a * whole struct lfs (in-memory superblock) - we do this * only so we can set and use the is64 and dobyteswap * members. XXX this is gross and the logic here should * be reworked. */ error = bread(devvp, sb_addr, LFS_SBPAD, 0, &primarybuf); if (error) goto out; primarysb = (struct lfs *)primarybuf->b_data; /* Check the basics. */ error = lfs_checkmagic(primarysb); if (error) { DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock wrong magic\n")); goto out; } if (lfs_sb_getbsize(primarysb) > MAXBSIZE || lfs_sb_getversion(primarysb) > LFS_VERSION || lfs_sb_getbsize(primarysb) < sizeof(struct dlfs)) { DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock sanity failed\n")); /* XXX needs translation */ error = EINVAL; goto out; } if (lfs_sb_getinodefmt(primarysb) > LFS_MAXINODEFMT) { DLOG((DLOG_MOUNT, "lfs_mountfs: unknown inode format %d\n", lfs_sb_getinodefmt(primarysb))); error = EINVAL; goto out; } if (lfs_sb_getversion(primarysb) == 1) fsbsize = DEV_BSIZE; else { fsbsize = 1 << lfs_sb_getffshift(primarysb); /* * Could be, if the frag size is large enough, that we * don't have the "real" primary superblock. If that's * the case, get the real one, and try again. */ if (sb_addr != (lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT))) { DLOG((DLOG_MOUNT, "lfs_mountfs: sb daddr" " 0x%llx is not right, trying 0x%llx\n", (long long)sb_addr, (long long)(lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT)))); sb_addr = lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT); brelse(primarybuf, BC_INVAL); continue; } } break; } /* * Check the second superblock to see which is newer; then mount * using the older of the two. This is necessary to ensure that * the filesystem is valid if it was not unmounted cleanly. */ if (lfs_sb_getsboff(primarysb, 1) && lfs_sb_getsboff(primarysb, 1) - LFS_LABELPAD / fsbsize > LFS_SBPAD / fsbsize) { error = bread(devvp, lfs_sb_getsboff(primarysb, 1) * (fsbsize / DEV_BSIZE), LFS_SBPAD, 0, &altbuf); if (error) goto out; altsb = (struct lfs *)altbuf->b_data; /* * Note: this used to do the sanity check only if the * timestamp/serial comparison required use of altsb; * this way is less tolerant, but if altsb is corrupted * enough that the magic number, version, and blocksize * are bogus, why would the timestamp or serial fields * mean anything either? If this kind of thing happens, * you need to fsck anyway. */ error = lfs_checkmagic(altsb); if (error) goto out; /* Check the basics. 
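	 * The alternate superblock gets the same sanity checks as the
	 * primary: magic and byte order (lfs_checkmagic above), a block
	 * size no larger than MAXBSIZE and no smaller than a struct dlfs,
	 * and a version we know about.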
*/ if (lfs_sb_getbsize(altsb) > MAXBSIZE || lfs_sb_getversion(altsb) > LFS_VERSION || lfs_sb_getbsize(altsb) < sizeof(struct dlfs)) { DLOG((DLOG_MOUNT, "lfs_mountfs: alt superblock" " sanity failed\n")); error = EINVAL; /* XXX needs translation */ goto out; } if (lfs_sb_getversion(primarysb) == 1) { /* 1s resolution comparison */ if (lfs_sb_gettstamp(altsb) < lfs_sb_gettstamp(primarysb)) thesb = altsb; else thesb = primarysb; } else { /* monotonic infinite-resolution comparison */ if (lfs_sb_getserial(altsb) < lfs_sb_getserial(primarysb)) thesb = altsb; else thesb = primarysb; } } else { DLOG((DLOG_MOUNT, "lfs_mountfs: invalid alt superblock location" " daddr=0x%x\n", lfs_sb_getsboff(primarysb, 1))); error = EINVAL; goto out; } /* * Allocate the mount structure, copy the superblock into it. * Note that the 32-bit and 64-bit superblocks are the same size. */ fs = kmem_zalloc(sizeof(struct lfs), KM_SLEEP); memcpy(&fs->lfs_dlfs_u.u_32, &thesb->lfs_dlfs_u.u_32, sizeof(struct dlfs)); fs->lfs_is64 = thesb->lfs_is64; fs->lfs_dobyteswap = thesb->lfs_dobyteswap; fs->lfs_hasolddirfmt = false; /* set for real below */ /* Compatibility */ if (lfs_sb_getversion(fs) < 2) { lfs_sb_setsumsize(fs, LFS_V1_SUMMARY_SIZE); lfs_sb_setibsize(fs, lfs_sb_getbsize(fs)); lfs_sb_sets0addr(fs, lfs_sb_getsboff(fs, 0)); lfs_sb_settstamp(fs, lfs_sb_getotstamp(fs)); lfs_sb_setfsbtodb(fs, 0); } if (lfs_sb_getresvseg(fs) == 0) lfs_sb_setresvseg(fs, MIN(lfs_sb_getminfreeseg(fs) - 1, \ MAX(MIN_RESV_SEGS, lfs_sb_getminfreeseg(fs) / 2 + 1))); /* * If we aren't going to be able to write meaningfully to this * filesystem, and were not mounted readonly, bomb out now. */ if (lfs_fsbtob(fs, LFS_NRESERVE(fs)) > LFS_MAX_BYTES && !ronly) { DLOG((DLOG_MOUNT, "lfs_mount: to mount this filesystem read/write," " we need BUFPAGES >= %lld\n", (long long)((bufmem_hiwater / bufmem_lowater) * LFS_INVERSE_MAX_BYTES( lfs_fsbtob(fs, LFS_NRESERVE(fs))) >> PAGE_SHIFT))); kmem_free(fs, sizeof(struct lfs)); error = EFBIG; /* XXX needs translation */ goto out; } /* Before rolling forward, lock so vget will sleep for other procs */ if (l != NULL) { fs->lfs_flags = LFS_NOTYET; fs->lfs_rfpid = l->l_proc->p_pid; } ump = kmem_zalloc(sizeof(*ump), KM_SLEEP); ump->um_lfs = fs; ump->um_fstype = fs->lfs_is64 ? ULFS2 : ULFS1; /* ump->um_cleaner_thread = NULL; */ brelse(primarybuf, BC_INVAL); brelse(altbuf, BC_INVAL); primarybuf = NULL; altbuf = NULL; /* Set up the I/O information */ fs->lfs_devbsize = DEV_BSIZE; fs->lfs_iocount = 0; fs->lfs_diropwait = 0; fs->lfs_activesb = 0; lfs_sb_setuinodes(fs, 0); fs->lfs_ravail = 0; fs->lfs_favail = 0; fs->lfs_sbactive = 0; /* Set up the ifile and lock aflags */ fs->lfs_doifile = 0; fs->lfs_writer = 0; fs->lfs_dirops = 0; fs->lfs_nadirop = 0; fs->lfs_seglock = 0; fs->lfs_pdflush = 0; fs->lfs_sleepers = 0; fs->lfs_pages = 0; rw_init(&fs->lfs_fraglock); rw_init(&fs->lfs_iflock); cv_init(&fs->lfs_sleeperscv, "lfs_slp"); cv_init(&fs->lfs_diropscv, "lfs_dirop"); cv_init(&fs->lfs_stopcv, "lfsstop"); cv_init(&fs->lfs_nextsegsleep, "segment"); /* Set the file system readonly/modify bits. 
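	 * (lfs_fmod below records that the in-core superblock has been
	 * modified; it is only set when the mount is writable.)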
*/ fs->lfs_ronly = ronly; if (ronly == 0) fs->lfs_fmod = 1; /* Device we're using */ dev = devvp->v_rdev; fs->lfs_dev = dev; fs->lfs_devvp = devvp; /* ulfs-level information */ fs->um_flags = 0; fs->um_bptrtodb = lfs_sb_getffshift(fs) - DEV_BSHIFT; fs->um_seqinc = lfs_sb_getfrag(fs); fs->um_nindir = lfs_sb_getnindir(fs); fs->um_lognindir = ffs(lfs_sb_getnindir(fs)) - 1; fs->um_maxsymlinklen = lfs_sb_getmaxsymlinklen(fs); fs->um_dirblksiz = LFS_DIRBLKSIZ; fs->um_maxfilesize = lfs_sb_getmaxfilesize(fs); /* quota stuff */ /* XXX: these need to come from the on-disk superblock to be used */ fs->lfs_use_quota2 = 0; fs->lfs_quota_magic = 0; fs->lfs_quota_flags = 0; fs->lfs_quotaino[0] = 0; fs->lfs_quotaino[1] = 0; /* Initialize the mount structure. */ mp->mnt_data = ump; mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_LFS); mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; mp->mnt_stat.f_namemax = LFS_MAXNAMLEN; mp->mnt_stat.f_iosize = lfs_sb_getbsize(fs); mp->mnt_flag |= MNT_LOCAL; mp->mnt_iflag |= IMNT_SHRLOOKUP; mp->mnt_fs_bshift = lfs_sb_getbshift(fs); mp->mnt_iflag |= IMNT_CAN_RWTORO; if (fs->um_maxsymlinklen > 0) mp->mnt_iflag |= IMNT_DTYPE; else fs->lfs_hasolddirfmt = true; ump->um_mountp = mp; for (i = 0; i < ULFS_MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; spec_node_setmountedfs(devvp, mp); /* Set up reserved memory for pageout */ lfs_setup_resblks(fs); /* Set up vdirop tailq */ TAILQ_INIT(&fs->lfs_dchainhd); /* and paging tailq */ TAILQ_INIT(&fs->lfs_pchainhd); /* and delayed segment accounting for truncation list */ LIST_INIT(&fs->lfs_segdhd); /* * We use the ifile vnode for almost every operation. Instead of * retrieving it from the hash table each time we retrieve it here, * artificially increment the reference count and keep a pointer * to it in the incore copy of the superblock. */ if ((error = VFS_VGET(mp, LFS_IFILE_INUM, LK_EXCLUSIVE, &vp)) != 0) { DLOG((DLOG_MOUNT, "lfs_mountfs: ifile vget failed, error=%d\n", error)); goto out; } fs->lfs_ivnode = vp; vref(vp); /* Set up inode bitmap, order free list, and gather orphans. */ lfs_order_freelist(fs, &orphan, &norphan); /* Set up segment usage flags for the autocleaner. */ fs->lfs_nactive = 0; fs->lfs_suflags = malloc(2 * sizeof(u_int32_t *), M_SEGMENT, M_WAITOK); fs->lfs_suflags[0] = malloc(lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK); fs->lfs_suflags[1] = malloc(lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK); memset(fs->lfs_suflags[1], 0, lfs_sb_getnseg(fs) * sizeof(u_int32_t)); for (i = 0; i < lfs_sb_getnseg(fs); i++) { int changed; struct buf *bp; LFS_SEGENTRY(sup, fs, i, bp); changed = 0; if (!ronly) { if (sup->su_nbytes == 0 && !(sup->su_flags & SEGUSE_EMPTY)) { sup->su_flags |= SEGUSE_EMPTY; ++changed; } else if (!(sup->su_nbytes == 0) && (sup->su_flags & SEGUSE_EMPTY)) { sup->su_flags &= ~SEGUSE_EMPTY; ++changed; } if (sup->su_flags & (SEGUSE_ACTIVE|SEGUSE_INVAL)) { sup->su_flags &= ~(SEGUSE_ACTIVE|SEGUSE_INVAL); ++changed; } } fs->lfs_suflags[0][i] = sup->su_flags; if (changed) LFS_WRITESEGENTRY(sup, fs, i, bp); else brelse(bp, 0); } /* Free the orphans we discovered while ordering the freelist. */ lfs_free_orphans(fs, orphan, norphan); /* * XXX: if the fs has quotas, quotas should be on even if * readonly. Otherwise you can't query the quota info! * However, that's not how the quota2 code got written and I * don't know if it'll behave itself if enabled while * readonly, so for now use the same enable logic as ffs. 
* * XXX: also, if you use the -f behavior allowed here (and * equivalently above for remount) it will corrupt the fs. It * ought not to allow that. It should allow mounting readonly * if there are quotas and the kernel doesn't have the quota * code, but only readonly. * * XXX: and if you use the -f behavior allowed here it will * likely crash at unmount time (or remount time) because we * think quotas are active. * * Although none of this applies until there's a way to set * lfs_use_quota2 and have quotas in the fs at all. */ if (!ronly && fs->lfs_use_quota2) { #ifdef LFS_QUOTA2 error = lfs_quota2_mount(mp); #else uprintf("%s: no kernel support for this filesystem's quotas\n", mp->mnt_stat.f_mntonname); if (mp->mnt_flag & MNT_FORCE) { uprintf("%s: mounting anyway; fsck afterwards\n", mp->mnt_stat.f_mntonname); } else { error = EINVAL; } #endif if (error) { /* XXX XXX must clean up the stuff immediately above */ printf("lfs_mountfs: sorry, leaking some memory\n"); goto out; } } #ifdef LFS_KERNEL_RFW lfs_roll_forward(fs, mp, l); #endif /* If writing, sb is not clean; record in case of immediate crash */ if (!fs->lfs_ronly) { lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) & ~LFS_PF_CLEAN); lfs_writesuper(fs, lfs_sb_getsboff(fs, 0)); lfs_writesuper(fs, lfs_sb_getsboff(fs, 1)); } /* Allow vget now that roll-forward is complete */ fs->lfs_flags &= ~(LFS_NOTYET); wakeup(&fs->lfs_flags); /* * Initialize the ifile cleaner info with information from * the superblock. */ { struct buf *bp; LFS_CLEANERINFO(cip, fs, bp); lfs_ci_setclean(fs, cip, lfs_sb_getnclean(fs)); lfs_ci_setdirty(fs, cip, lfs_sb_getnseg(fs) - lfs_sb_getnclean(fs)); lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs)); lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs)); (void) LFS_BWRITE_LOG(bp); /* Ifile */ } /* * Mark the current segment as ACTIVE, since we're going to * be writing to it. */ { struct buf *bp; LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getoffset(fs)), bp); sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; fs->lfs_nactive++; LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getoffset(fs)), bp); /* Ifile */ } /* Now that roll-forward is done, unlock the Ifile */ vput(vp); /* Start the pagedaemon-anticipating daemon */ mutex_enter(&lfs_lock); if (lfs_writer_daemon == NULL && kthread_create(PRI_BIO, 0, NULL, lfs_writerd, NULL, NULL, "lfs_writer") != 0) panic("fork lfs_writer"); mutex_exit(&lfs_lock); printf("WARNING: the log-structured file system is experimental\n" "WARNING: it may cause system crashes and/or corrupt data\n"); return (0); out: if (primarybuf) brelse(primarybuf, BC_INVAL); if (altbuf) brelse(altbuf, BC_INVAL); if (ump) { kmem_free(ump->um_lfs, sizeof(struct lfs)); kmem_free(ump, sizeof(*ump)); mp->mnt_data = NULL; } return (error); } /* * unmount system call */ int lfs_unmount(struct mount *mp, int mntflags) { struct ulfsmount *ump; struct lfs *fs; int error, ronly; ump = VFSTOULFS(mp); fs = ump->um_lfs; error = lfs_flushfiles(mp, mntflags & MNT_FORCE ? FORCECLOSE : 0); if (error) return error; /* Finish with the Ifile, now that we're done with it */ vgone(fs->lfs_ivnode); ronly = !fs->lfs_ronly; if (fs->lfs_devvp->v_type != VBAD) spec_node_setmountedfs(fs->lfs_devvp, NULL); vn_lock(fs->lfs_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_CLOSE(fs->lfs_devvp, ronly ? 
FREAD : FREAD|FWRITE, NOCRED); vput(fs->lfs_devvp); /* Complain about page leakage */ if (fs->lfs_pages > 0) printf("lfs_unmount: still claim %d pages (%d in subsystem)\n", fs->lfs_pages, lfs_subsys_pages); /* Free per-mount data structures */ free(fs->lfs_ino_bitmap, M_SEGMENT); free(fs->lfs_suflags[0], M_SEGMENT); free(fs->lfs_suflags[1], M_SEGMENT); free(fs->lfs_suflags, M_SEGMENT); lfs_free_resblks(fs); cv_destroy(&fs->lfs_sleeperscv); cv_destroy(&fs->lfs_diropscv); cv_destroy(&fs->lfs_stopcv); cv_destroy(&fs->lfs_nextsegsleep); rw_destroy(&fs->lfs_fraglock); rw_destroy(&fs->lfs_iflock); kmem_free(fs, sizeof(struct lfs)); kmem_free(ump, sizeof(*ump)); mp->mnt_data = NULL; mp->mnt_flag &= ~MNT_LOCAL; return (error); } static int lfs_flushfiles(struct mount *mp, int flags) { struct lwp *l = curlwp; struct ulfsmount *ump; struct lfs *fs; struct vnode *vp; int error; ump = VFSTOULFS(mp); fs = ump->um_lfs; /* Two checkpoints */ if (!fs->lfs_ronly) { lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); } /* wake up the cleaner so it can die */ /* XXX: shouldn't this be *after* the error cases below? */ lfs_wakeup_cleaner(fs); mutex_enter(&lfs_lock); while (fs->lfs_sleepers) cv_wait(&fs->lfs_sleeperscv, &lfs_lock); mutex_exit(&lfs_lock); #ifdef LFS_EXTATTR if (ump->um_fstype == ULFS1) { if (ump->um_extattr.uepm_flags & ULFS_EXTATTR_UEPM_STARTED) { ulfs_extattr_stop(mp, curlwp); } if (ump->um_extattr.uepm_flags & ULFS_EXTATTR_UEPM_INITIALIZED) { ulfs_extattr_uepm_destroy(&ump->um_extattr); mp->mnt_flag &= ~MNT_EXTATTR; } } #endif #ifdef LFS_QUOTA if ((error = lfsquota1_umount(mp, flags)) != 0) return (error); #endif #ifdef LFS_QUOTA2 if ((error = lfsquota2_umount(mp, flags)) != 0) return (error); #endif if ((error = vflush(mp, fs->lfs_ivnode, flags)) != 0) return (error); if ((error = VFS_SYNC(mp, 1, l->l_cred)) != 0) return (error); vp = fs->lfs_ivnode; mutex_enter(vp->v_interlock); if (LIST_FIRST(&vp->v_dirtyblkhd)) panic("lfs_unmount: still dirty blocks on ifile vnode"); mutex_exit(vp->v_interlock); /* Explicitly write the superblock, to update serial and pflags */ if (!fs->lfs_ronly) { lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) | LFS_PF_CLEAN); lfs_writesuper(fs, lfs_sb_getsboff(fs, 0)); lfs_writesuper(fs, lfs_sb_getsboff(fs, 1)); } mutex_enter(&lfs_lock); while (fs->lfs_iocount) mtsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs_umount", 0, &lfs_lock); mutex_exit(&lfs_lock); return 0; } /* * Get file system statistics. * * NB: We don't lock to access the superblock here, because it's not * really that important if we get it wrong. */ int lfs_statvfs(struct mount *mp, struct statvfs *sbp) { struct lfs *fs; struct ulfsmount *ump; ump = VFSTOULFS(mp); fs = ump->um_lfs; sbp->f_bsize = lfs_sb_getbsize(fs); sbp->f_frsize = lfs_sb_getfsize(fs); sbp->f_iosize = lfs_sb_getbsize(fs); sbp->f_blocks = LFS_EST_NONMETA(fs) - VTOI(fs->lfs_ivnode)->i_lfs_effnblks; sbp->f_bfree = LFS_EST_BFREE(fs); /* * XXX this should be lfs_sb_getsize (measured in frags) * rather than dsize (measured in diskblocks). However, * getsize needs a format version check (for version 1 it * needs to be blockstofrags'd) so for the moment I'm going to * leave this... it won't fire wrongly as frags are at least * as big as diskblocks. */ KASSERT(sbp->f_bfree <= lfs_sb_getdsize(fs)); #if 0 if (sbp->f_bfree < 0) sbp->f_bfree = 0; #endif sbp->f_bresvd = LFS_EST_RSVD(fs); if (sbp->f_bfree > sbp->f_bresvd) sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd; else sbp->f_bavail = 0; /* XXX: huh? 
- dholland 20150728 */ sbp->f_files = lfs_sb_getbfree(fs) / lfs_btofsb(fs, lfs_sb_getibsize(fs)) * LFS_INOPB(fs); sbp->f_ffree = sbp->f_files - lfs_sb_getnfiles(fs); sbp->f_favail = sbp->f_ffree; sbp->f_fresvd = 0; copy_statvfs_info(sbp, mp); return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ int lfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) { int error; struct lfs *fs; fs = VFSTOULFS(mp)->um_lfs; if (fs->lfs_ronly) return 0; /* Snapshots should not hose the syncer */ /* * XXX Sync can block here anyway, since we don't have a very * XXX good idea of how much data is pending. If it's more * XXX than a segment and lfs_nextseg is close to the end of * XXX the log, we'll likely block. */ mutex_enter(&lfs_lock); if (fs->lfs_nowrap && lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs)) { mutex_exit(&lfs_lock); return 0; } mutex_exit(&lfs_lock); lfs_writer_enter(fs, "lfs_dirops"); /* All syncs must be checkpoints until roll-forward is implemented. */ DLOG((DLOG_FLUSH, "lfs_sync at 0x%jx\n", (uintmax_t)lfs_sb_getoffset(fs))); error = lfs_segwrite(mp, SEGM_CKP | (waitfor ? SEGM_SYNC : 0)); lfs_writer_leave(fs); #ifdef LFS_QUOTA lfs_qsync(mp); #endif return (error); } /* * Look up an LFS dinode number to find its incore vnode. If not already * in core, read it in from the specified device. Return the inode locked. * Detection and handling of mount points must be done by the calling routine. */ int lfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { int error; error = vcache_get(mp, &ino, sizeof(ino), vpp); if (error) return error; error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULL; return error; } return 0; } /* * Create a new vnode/inode pair and initialize what fields we can. */ static void lfs_init_vnode(struct ulfsmount *ump, ino_t ino, struct vnode *vp) { struct lfs *fs = ump->um_lfs; struct inode *ip; union lfs_dinode *dp; ASSERT_NO_SEGLOCK(fs); /* Initialize the inode. */ ip = pool_get(&lfs_inode_pool, PR_WAITOK); memset(ip, 0, sizeof(*ip)); dp = pool_get(&lfs_dinode_pool, PR_WAITOK); memset(dp, 0, sizeof(*dp)); ip->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK); memset(ip->inode_ext.lfs, 0, sizeof(*ip->inode_ext.lfs)); ip->i_din = dp; ip->i_ump = ump; ip->i_vnode = vp; ip->i_dev = fs->lfs_dev; lfs_dino_setinumber(fs, dp, ino); ip->i_number = ino; ip->i_lfs = fs; ip->i_lfs_effnblks = 0; SPLAY_INIT(&ip->i_lfs_lbtree); ip->i_lfs_nbtree = 0; LIST_INIT(&ip->i_lfs_segdhd); vp->v_tag = VT_LFS; vp->v_op = lfs_vnodeop_p; vp->v_data = ip; } /* * Undo lfs_init_vnode(). */ static void lfs_deinit_vnode(struct ulfsmount *ump, struct vnode *vp) { struct inode *ip = VTOI(vp); pool_put(&lfs_inoext_pool, ip->inode_ext.lfs); pool_put(&lfs_dinode_pool, ip->i_din); pool_put(&lfs_inode_pool, ip); vp->v_data = NULL; } /* * Read an inode from disk and initialize this vnode / inode pair. * Caller assures no other thread will try to load this inode. 
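 * The inode number is mapped to a disk address through the Ifile
 * (LFS_IENTRY); the inode block is then read with bread() and the wanted
 * dinode is located inside it with lfs_ifind().  For version 2 and later
 * filesystems the Ifile entry also supplies the access time.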
*/ int lfs_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { struct lfs *fs; union lfs_dinode *dip; struct inode *ip; struct buf *bp; IFILE *ifp; struct ulfsmount *ump; ino_t ino; daddr_t daddr; int error, retries; struct timespec ts; KASSERT(key_len == sizeof(ino)); memcpy(&ino, key, key_len); memset(&ts, 0, sizeof ts); /* XXX gcc */ ump = VFSTOULFS(mp); fs = ump->um_lfs; /* * If the filesystem is not completely mounted yet, suspend * any access requests (wait for roll-forward to complete). */ mutex_enter(&lfs_lock); while ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid != fs->lfs_rfpid) mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_notyet", 0, &lfs_lock); mutex_exit(&lfs_lock); /* Translate the inode number to a disk address. */ if (ino == LFS_IFILE_INUM) daddr = lfs_sb_getidaddr(fs); else { /* XXX bounds-check this too */ LFS_IENTRY(ifp, fs, ino, bp); daddr = lfs_if_getdaddr(fs, ifp); if (lfs_sb_getversion(fs) > 1) { ts.tv_sec = lfs_if_getatime_sec(fs, ifp); ts.tv_nsec = lfs_if_getatime_nsec(fs, ifp); } brelse(bp, 0); if (daddr == LFS_UNUSED_DADDR) return (ENOENT); } /* Allocate/init new vnode/inode. */ lfs_init_vnode(ump, ino, vp); ip = VTOI(vp); /* If the cleaner supplied the inode, use it. */ if (curlwp == fs->lfs_cleaner_thread && fs->lfs_cleaner_hint != NULL && fs->lfs_cleaner_hint->bi_lbn == LFS_UNUSED_LBN) { dip = fs->lfs_cleaner_hint->bi_bp; if (fs->lfs_is64) { error = copyin(dip, &ip->i_din->u_64, sizeof(struct lfs64_dinode)); } else { error = copyin(dip, &ip->i_din->u_32, sizeof(struct lfs32_dinode)); } if (error) { lfs_deinit_vnode(ump, vp); return error; } KASSERT(ip->i_number == ino); goto out; } /* Read in the disk contents for the inode, copy into the inode. */ retries = 0; again: error = bread(fs->lfs_devvp, LFS_FSBTODB(fs, daddr), (lfs_sb_getversion(fs) == 1 ? lfs_sb_getbsize(fs) : lfs_sb_getibsize(fs)), 0, &bp); if (error) { lfs_deinit_vnode(ump, vp); return error; } dip = lfs_ifind(fs, ino, bp); if (dip == NULL) { /* Assume write has not completed yet; try again */ brelse(bp, BC_INVAL); ++retries; if (retries <= LFS_IFIND_RETRIES) { mutex_enter(&lfs_lock); if (fs->lfs_iocount) { DLOG((DLOG_VNODE, "%s: dinode %d not found, retrying...\n", __func__, ino)); (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs ifind", 1, &lfs_lock); } else retries = LFS_IFIND_RETRIES; mutex_exit(&lfs_lock); goto again; } #ifdef DEBUG /* If the seglock is held look at the bpp to see what is there anyway */ mutex_enter(&lfs_lock); if (fs->lfs_seglock > 0) { struct buf **bpp; union lfs_dinode *dp; int i; for (bpp = fs->lfs_sp->bpp; bpp != fs->lfs_sp->cbpp; ++bpp) { if ((*bpp)->b_vp == fs->lfs_ivnode && bpp != fs->lfs_sp->bpp) { /* Inode block */ printf("%s: block 0x%" PRIx64 ": ", __func__, (*bpp)->b_blkno); for (i = 0; i < LFS_INOPB(fs); i++) { dp = DINO_IN_BLOCK(fs, (*bpp)->b_data, i); if (lfs_dino_getinumber(fs, dp)) printf("%ju ", (uintmax_t)lfs_dino_getinumber(fs, dp)); } printf("\n"); } } } mutex_exit(&lfs_lock); #endif /* DEBUG */ panic("lfs_loadvnode: dinode not found"); } lfs_copy_dinode(fs, ip->i_din, dip); brelse(bp, 0); out: if (lfs_sb_getversion(fs) > 1) { lfs_dino_setatime(fs, ip->i_din, ts.tv_sec); lfs_dino_setatimensec(fs, ip->i_din, ts.tv_nsec); } lfs_vinit(mp, &vp); *new_key = &ip->i_number; return 0; } /* * Create a new inode and initialize this vnode / inode pair. 
*/ int lfs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp, struct vattr *vap, kauth_cred_t cred, void *extra, size_t *key_len, const void **new_key) { ino_t ino; struct inode *ip; struct ulfsmount *ump; struct lfs *fs; int error, mode, gen; KASSERT(dvp != NULL || vap->va_fileid > 0); KASSERT(dvp != NULL && dvp->v_mount == mp); KASSERT(vap->va_type != VNON); *key_len = sizeof(ino); ump = VFSTOULFS(mp); fs = ump->um_lfs; mode = MAKEIMODE(vap->va_type, vap->va_mode); /* * Allocate fresh inode. With "dvp == NULL" take the inode number * and version from "vap". */ if (dvp == NULL) { ino = vap->va_fileid; gen = vap->va_gen; error = lfs_valloc_fixed(fs, ino, gen); } else { error = lfs_valloc(dvp, mode, cred, &ino, &gen); } if (error) return error; /* Attach inode to vnode. */ lfs_init_vnode(ump, ino, vp); ip = VTOI(vp); mutex_enter(&lfs_lock); LFS_SET_UINO(ip, IN_CHANGE); mutex_exit(&lfs_lock); /* Note no blocks yet */ ip->i_lfs_hiblk = -1; /* Set a new generation number for this inode. */ ip->i_gen = gen; lfs_dino_setgen(fs, ip->i_din, gen); memset(ip->i_lfs_fragsize, 0, ULFS_NDADDR * sizeof(*ip->i_lfs_fragsize)); /* Set uid / gid. */ if (cred == NOCRED || cred == FSCRED) { ip->i_gid = 0; ip->i_uid = 0; } else { ip->i_gid = VTOI(dvp)->i_gid; ip->i_uid = kauth_cred_geteuid(cred); } DIP_ASSIGN(ip, gid, ip->i_gid); DIP_ASSIGN(ip, uid, ip->i_uid); #if defined(LFS_QUOTA) || defined(LFS_QUOTA2) error = lfs_chkiq(ip, 1, cred, 0); if (error) { lfs_vfree(dvp, ino, mode); lfs_deinit_vnode(ump, vp); return error; } #endif /* Set type and finalize. */ ip->i_flags = 0; DIP_ASSIGN(ip, flags, 0); ip->i_mode = mode; DIP_ASSIGN(ip, mode, mode); if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ // XXX clean this up if (ump->um_fstype == ULFS1) ip->i_din->u_32.di_rdev = ulfs_rw32(vap->va_rdev, ULFS_MPNEEDSWAP(fs)); else ip->i_din->u_64.di_rdev = ulfs_rw64(vap->va_rdev, ULFS_MPNEEDSWAP(fs)); } lfs_vinit(mp, &vp); *new_key = &ip->i_number; return 0; } /* * File handle to vnode */ int lfs_fhtovp(struct mount *mp, struct fid *fhp, int lktype, struct vnode **vpp) { struct lfid lfh; struct lfs *fs; if (fhp->fid_len != sizeof(struct lfid)) return EINVAL; memcpy(&lfh, fhp, sizeof(lfh)); if (lfh.lfid_ino < LFS_IFILE_INUM) return ESTALE; fs = VFSTOULFS(mp)->um_lfs; if (lfh.lfid_ident != lfs_sb_getident(fs)) return ESTALE; if (lfh.lfid_ino > ((lfs_dino_getsize(fs, VTOI(fs->lfs_ivnode)->i_din) >> lfs_sb_getbshift(fs)) - lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs)) return ESTALE; return (ulfs_fhtovp(mp, &lfh.lfid_ufid, lktype, vpp)); } /* * Vnode pointer to File handle */ /* ARGSUSED */ int lfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) { struct inode *ip; struct lfid lfh; if (*fh_size < sizeof(struct lfid)) { *fh_size = sizeof(struct lfid); return E2BIG; } *fh_size = sizeof(struct lfid); ip = VTOI(vp); memset(&lfh, 0, sizeof(lfh)); lfh.lfid_len = sizeof(struct lfid); lfh.lfid_ino = ip->i_number; lfh.lfid_gen = ip->i_gen; lfh.lfid_ident = lfs_sb_getident(ip->i_lfs); memcpy(fhp, &lfh, sizeof(lfh)); return (0); } /* * ulfs_bmaparray callback function for writing. * * Since blocks will be written to the new segment anyway, * we don't care about current daddr of them. 
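 * All this callback has to report is "hole or not": any two resident
 * (or UNWRITTEN) blocks count as contiguous, any two holes count as
 * contiguous, and a hole next to a resident block does not.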
*/ static bool lfs_issequential_hole(const struct lfs *fs, daddr_t daddr0, daddr_t daddr1) { (void)fs; /* not used */ KASSERT(daddr0 == UNWRITTEN || (0 <= daddr0 && daddr0 <= LFS_MAX_DADDR(fs))); KASSERT(daddr1 == UNWRITTEN || (0 <= daddr1 && daddr1 <= LFS_MAX_DADDR(fs))); /* NOTE: all we want to know here is 'hole or not'. */ /* NOTE: UNASSIGNED is converted to 0 by ulfs_bmaparray. */ /* * treat UNWRITTENs and all resident blocks as 'contiguous' */ if (daddr0 != 0 && daddr1 != 0) return true; /* * both are in hole? */ if (daddr0 == 0 && daddr1 == 0) return true; /* all holes are 'contiguous' for us. */ return false; } /* * lfs_gop_write functions exactly like genfs_gop_write, except that * (1) it requires the seglock to be held by its caller, and sp->fip * to be properly initialized (it will return without re-initializing * sp->fip, and without calling lfs_writeseg). * (2) it uses the remaining space in the segment, rather than VOP_BMAP, * to determine how large a block it can write at once (though it does * still use VOP_BMAP to find holes in the file); * (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks * (leaving lfs_writeseg to deal with the cluster blocks, so we might * now have clusters of clusters, ick.) */ static int lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags) { int i, error, run, haveeof = 0; int fs_bshift; vaddr_t kva; off_t eof, offset, startoffset = 0; size_t bytes, iobytes, skipbytes; bool async = (flags & PGO_SYNCIO) == 0; daddr_t lbn, blkno; struct vm_page *pg; struct buf *mbp, *bp; struct vnode *devvp = VTOI(vp)->i_devvp; struct inode *ip = VTOI(vp); struct lfs *fs = ip->i_lfs; struct segment *sp = fs->lfs_sp; SEGSUM *ssp; UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist); const char * failreason = NULL; ASSERT_SEGLOCK(fs); /* The Ifile lives in the buffer cache */ KASSERT(vp != fs->lfs_ivnode); /* * We don't want to fill the disk before the cleaner has a chance * to make room for us. If we're in danger of doing that, fail * with EAGAIN. The caller will have to notice this, unlock * so the cleaner can run, relock and try again. * * We must write everything, however, if our vnode is being * reclaimed. */ mutex_enter(vp->v_interlock); if (LFS_STARVED_FOR_SEGS(fs) && vdead_check(vp, VDEAD_NOWAIT) == 0) { mutex_exit(vp->v_interlock); failreason = "Starved for segs and not flushing vp"; goto tryagain; } mutex_exit(vp->v_interlock); /* * Sometimes things slip past the filters in lfs_putpages, * and the pagedaemon tries to write pages---problem is * that the pagedaemon never acquires the segment lock. * * Alternatively, pages that were clean when we called * genfs_putpages may have become dirty in the meantime. In this * case the segment header is not properly set up for blocks * to be added to it. * * Unbusy and unclean the pages, and put them on the ACTIVE * queue under the hypothesis that they couldn't have got here * unless they were modified *quite* recently. * * XXXUBC that last statement is an oversimplification of course. 
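	 * The checks that follow bail out to the tryagain path when the
	 * segment lock is not held, when the inode is flagged
	 * LFSI_NO_GOP_WRITE, or when the first page is not block-aligned;
	 * in each case the pages are unbusied and the caller retries.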
*/ if (!LFS_SEGLOCK_HELD(fs)) { failreason = "Seglock not held"; goto tryagain; } if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) { failreason = "Inode with no_gop_write"; goto tryagain; } if ((pgs[0]->offset & lfs_sb_getbmask(fs)) != 0) { failreason = "Bad page offset"; goto tryagain; } UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx", (uintptr_t)vp, (uintptr_t)pgs, npages, flags); GOP_SIZE(vp, vp->v_size, &eof, 0); haveeof = 1; if (vp->v_type == VREG) fs_bshift = vp->v_mount->mnt_fs_bshift; else fs_bshift = DEV_BSHIFT; error = 0; pg = pgs[0]; startoffset = pg->offset; KASSERT(eof >= 0); if (startoffset >= eof) { failreason = "Offset beyond EOF"; goto tryagain; } else bytes = MIN(npages << PAGE_SHIFT, eof - startoffset); skipbytes = 0; KASSERT(bytes != 0); /* Swap PG_DELWRI for PG_PAGEOUT */ for (i = 0; i < npages; i++) { if (pgs[i]->flags & PG_DELWRI) { KASSERT(!(pgs[i]->flags & PG_PAGEOUT)); pgs[i]->flags &= ~PG_DELWRI; pgs[i]->flags |= PG_PAGEOUT; uvm_pageout_start(1); rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); uvm_pagelock(pgs[i]); uvm_pageunwire(pgs[i]); uvm_pageunlock(pgs[i]); rw_exit(vp->v_uobj.vmobjlock); } } /* * Check to make sure we're starting on a block boundary. * We'll check later to make sure we always write entire * blocks (or fragments). */ if (startoffset & lfs_sb_getbmask(fs)) printf("%" PRId64 " & %" PRIu64 " = %" PRId64 "\n", startoffset, lfs_sb_getbmask(fs), startoffset & lfs_sb_getbmask(fs)); KASSERT((startoffset & lfs_sb_getbmask(fs)) == 0); if (bytes & lfs_sb_getffmask(fs)) { printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes); panic("lfs_gop_write: non-integer blocks"); } /* * We could deadlock here on pager_map with UVMPAGER_MAPIN_WAITOK. * If we would, write what we have and try again. If we don't * have anything to write, we'll have to sleep. */ ssp = (SEGSUM *)sp->segsum; if ((kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE | (lfs_ss_getnfinfo(fs, ssp) < 1 ? UVMPAGER_MAPIN_WAITOK : 0))) == 0x0) { DLOG((DLOG_PAGE, "lfs_gop_write: forcing write\n")); #if 0 " with nfinfo=%d at offset 0x%jx\n", (int)lfs_ss_getnfinfo(fs, ssp), (uintmax_t)lfs_sb_getoffset(fs))); #endif lfs_updatemeta(sp); lfs_release_finfo(fs); (void) lfs_writeseg(fs, sp); lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); /* * Having given up all of the pager_map we were holding, * we can now wait for aiodoned to reclaim it for us * without fear of deadlock. */ kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK); } mbp = getiobuf(NULL, true); UVMHIST_LOG(ubchist, "vp %#jx mbp %#jx num now %jd bytes 0x%jx", (uintptr_t)vp, (uintptr_t)mbp, vp->v_numoutput, bytes); mbp->b_bufsize = npages << PAGE_SHIFT; mbp->b_data = (void *)kva; mbp->b_resid = mbp->b_bcount = bytes; mbp->b_cflags |= BC_BUSY|BC_AGE; mbp->b_iodone = uvm_aio_aiodone; bp = NULL; for (offset = startoffset; bytes > 0; offset += iobytes, bytes -= iobytes) { lbn = offset >> fs_bshift; error = ulfs_bmaparray(vp, lbn, &blkno, NULL, NULL, &run, lfs_issequential_hole); if (error) { UVMHIST_LOG(ubchist, "ulfs_bmaparray() -> %jd", error,0,0,0); skipbytes += bytes; bytes = 0; break; } iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, bytes); if (blkno == (daddr_t)-1) { skipbytes += iobytes; continue; } /* * Discover how much we can really pack into this buffer. 
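		 * iobytes is clipped both to the data space left in the
		 * current segment (seg_bytes_left) and to the number of
		 * block pointers that still fit in the segment summary
		 * (sum_bytes_left / sizeof(int32_t)); if either is
		 * exhausted, the segment is written out first.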
*/ /* If no room in the current segment, finish it up */ if (sp->sum_bytes_left < sizeof(int32_t) || sp->seg_bytes_left < (1 << lfs_sb_getbshift(fs))) { int vers; lfs_updatemeta(sp); vers = lfs_fi_getversion(fs, sp->fip); lfs_release_finfo(fs); (void) lfs_writeseg(fs, sp); lfs_acquire_finfo(fs, ip->i_number, vers); } /* Check both for space in segment and space in segsum */ iobytes = MIN(iobytes, (sp->seg_bytes_left >> fs_bshift) << fs_bshift); iobytes = MIN(iobytes, (sp->sum_bytes_left / sizeof(int32_t)) << fs_bshift); KASSERT(iobytes > 0); /* if it's really one i/o, don't make a second buf */ if (offset == startoffset && iobytes == bytes) { bp = mbp; /* * All the LFS output is done by the segwriter. It * will increment numoutput by one for all the bufs it * receives. However this buffer needs one extra to * account for aiodone. */ mutex_enter(vp->v_interlock); vp->v_numoutput++; mutex_exit(vp->v_interlock); } else { bp = getiobuf(NULL, true); UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd", (uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0); nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes); /* * LFS doesn't like async I/O here, dies with * an assert in lfs_bwrite(). Is that assert * valid? I retained non-async behaviour when * converted this to use nestiobuf --pooka */ bp->b_flags &= ~B_ASYNC; } /* XXX This is silly ... is this necessary? */ mutex_enter(&bufcache_lock); mutex_enter(vp->v_interlock); bgetvp(vp, bp); mutex_exit(vp->v_interlock); mutex_exit(&bufcache_lock); bp->b_lblkno = lfs_lblkno(fs, offset); bp->b_private = mbp; if (devvp->v_type == VBLK) { bp->b_dev = devvp->v_rdev; } VOP_BWRITE(bp->b_vp, bp); while (lfs_gatherblock(sp, bp, NULL)) continue; } nestiobuf_done(mbp, skipbytes, error); if (skipbytes) { UVMHIST_LOG(ubchist, "skipbytes %jd", skipbytes, 0,0,0); } UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0); if (!async) { /* Start a segment write. */ UVMHIST_LOG(ubchist, "flushing", 0,0,0,0); mutex_enter(&lfs_lock); lfs_flush(fs, 0, 1); mutex_exit(&lfs_lock); } if ((sp->seg_flags & SEGM_SINGLE) && lfs_sb_getcurseg(fs) != fs->lfs_startseg) return EAGAIN; return (0); tryagain: /* * We can't write the pages, for whatever reason. * Clean up after ourselves, and make the caller try again. */ mutex_enter(vp->v_interlock); /* Tell why we're here, if we know */ if (failreason != NULL) { DLOG((DLOG_PAGE, "lfs_gop_write: %s\n", failreason)); } if (haveeof && startoffset >= eof) { DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64 " eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number, pgs[0]->offset, eof, npages)); } for (i = 0; i < npages; i++) { pg = pgs[i]; if (pg->flags & PG_PAGEOUT) uvm_pageout_done(1); uvm_pagelock(pg); if (pg->flags & PG_DELWRI) { uvm_pageunwire(pg); } uvm_pageactivate(pg); uvm_pageunlock(pg); pg->flags &= ~(PG_DELWRI|PG_PAGEOUT|PG_RELEASED); uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); DLOG((DLOG_PAGE, "pg[%d] = %p (vp %p off %" PRIx64 ")\n", i, pg, vp, pg->offset)); DLOG((DLOG_PAGE, "pg[%d]->flags = %x\n", i, pg->flags)); DLOG((DLOG_PAGE, "pg[%d]->pqflags = %x\n", i, pg->pqflags)); DLOG((DLOG_PAGE, "pg[%d]->uanon = %p\n", i, pg->uanon)); DLOG((DLOG_PAGE, "pg[%d]->uobject = %p\n", i, pg->uobject)); DLOG((DLOG_PAGE, "pg[%d]->wire_count = %d\n", i, pg->wire_count)); DLOG((DLOG_PAGE, "pg[%d]->loan_count = %d\n", i, pg->loan_count)); } uvm_page_unbusy(pgs, npages); mutex_exit(vp->v_interlock); return EAGAIN; } /* * finish vnode/inode initialization. * used by lfs_vget. 
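 * (These days it is reached through lfs_loadvnode() and lfs_newvnode(),
 * which both call it after filling in the dinode.)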
*/ void lfs_vinit(struct mount *mp, struct vnode **vpp) { struct vnode *vp = *vpp; struct inode *ip = VTOI(vp); struct ulfsmount *ump = VFSTOULFS(mp); struct lfs *fs = ump->um_lfs; int i; ip->i_mode = lfs_dino_getmode(fs, ip->i_din); ip->i_nlink = lfs_dino_getnlink(fs, ip->i_din); ip->i_lfs_osize = ip->i_size = lfs_dino_getsize(fs, ip->i_din); ip->i_flags = lfs_dino_getflags(fs, ip->i_din); ip->i_gen = lfs_dino_getgen(fs, ip->i_din); ip->i_uid = lfs_dino_getuid(fs, ip->i_din); ip->i_gid = lfs_dino_getgid(fs, ip->i_din); ip->i_lfs_effnblks = lfs_dino_getblocks(fs, ip->i_din); ip->i_lfs_odnlink = lfs_dino_getnlink(fs, ip->i_din); /* * Initialize the vnode from the inode, check for aliases. In all * cases re-init ip, the underlying vnode/inode may have changed. */ ulfs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp); ip = VTOI(vp); memset(ip->i_lfs_fragsize, 0, ULFS_NDADDR * sizeof(*ip->i_lfs_fragsize)); if (vp->v_type != VLNK || ip->i_size >= ip->i_lfs->um_maxsymlinklen) { #ifdef DEBUG for (i = (ip->i_size + lfs_sb_getbsize(fs) - 1) >> lfs_sb_getbshift(fs); i < ULFS_NDADDR; i++) { if ((vp->v_type == VBLK || vp->v_type == VCHR) && i == 0) continue; if (lfs_dino_getdb(fs, ip->i_din, i) != 0) { lfs_dump_dinode(fs, ip->i_din); panic("inconsistent inode (direct)"); } } for ( ; i < ULFS_NDADDR + ULFS_NIADDR; i++) { if (lfs_dino_getib(fs, ip->i_din, i - ULFS_NDADDR) != 0) { lfs_dump_dinode(fs, ip->i_din); panic("inconsistent inode (indirect)"); } } #endif /* DEBUG */ for (i = 0; i < ULFS_NDADDR; i++) if (lfs_dino_getdb(fs, ip->i_din, i) != 0) ip->i_lfs_fragsize[i] = lfs_blksize(fs, ip, i); } KASSERTMSG((vp->v_type != VNON), "lfs_vinit: ino %llu is type VNON! (ifmt=%o)\n", (unsigned long long)ip->i_number, (ip->i_mode & LFS_IFMT) >> 12); /* * Finish inode initialization now that aliasing has been resolved. */ ip->i_devvp = fs->lfs_devvp; vref(ip->i_devvp); #if defined(LFS_QUOTA) || defined(LFS_QUOTA2) ulfsquota_init(ip); #endif genfs_node_init(vp, &lfs_genfsops); uvm_vnp_setsize(vp, ip->i_size); /* Initialize hiblk from file size */ ip->i_lfs_hiblk = lfs_lblkno(ip->i_lfs, ip->i_size + lfs_sb_getbsize(ip->i_lfs) - 1) - 1; *vpp = vp; } /* * Resize the filesystem to contain the specified number of segments. */ int lfs_resize_fs(struct lfs *fs, int newnsegs) { SEGUSE *sup; CLEANERINFO *cip; struct buf *bp, *obp; daddr_t olast, nlast, ilast, noff, start, end; struct vnode *ivp; struct inode *ip; int error, badnews, inc, oldnsegs; int sbbytes, csbbytes, gain, cgain; int i; /* Only support v2 and up */ if (lfs_sb_getversion(fs) < 2) return EOPNOTSUPP; /* If we're doing nothing, do it fast */ oldnsegs = lfs_sb_getnseg(fs); if (newnsegs == oldnsegs) return 0; /* We always have to have two superblocks */ if (newnsegs <= lfs_dtosn(fs, lfs_sb_getsboff(fs, 1))) /* XXX this error code is rather nonsense */ return EFBIG; ivp = fs->lfs_ivnode; ip = VTOI(ivp); error = 0; /* Take the segment lock so no one else calls lfs_newseg() */ lfs_seglock(fs, SEGM_PROT); /* * Make sure the segments we're going to be losing, if any, * are in fact empty. We hold the seglock, so their status * cannot change underneath us. Count the superblocks we lose, * while we're at it. 
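	 * A segment that still has live bytes, or that is not marked
	 * SEGUSE_INVAL, makes the shrink impossible and the resize fails
	 * with EBUSY below.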
*/ sbbytes = csbbytes = 0; cgain = 0; for (i = newnsegs; i < oldnsegs; i++) { LFS_SEGENTRY(sup, fs, i, bp); badnews = sup->su_nbytes || !(sup->su_flags & SEGUSE_INVAL); if (sup->su_flags & SEGUSE_SUPERBLOCK) sbbytes += LFS_SBPAD; if (!(sup->su_flags & SEGUSE_DIRTY)) { ++cgain; if (sup->su_flags & SEGUSE_SUPERBLOCK) csbbytes += LFS_SBPAD; } brelse(bp, 0); if (badnews) { error = EBUSY; goto out; } } /* Note old and new segment table endpoints, and old ifile size */ olast = lfs_sb_getcleansz(fs) + lfs_sb_getsegtabsz(fs); nlast = howmany(newnsegs, lfs_sb_getsepb(fs)) + lfs_sb_getcleansz(fs); ilast = ivp->v_size >> lfs_sb_getbshift(fs); noff = nlast - olast; /* * Make sure no one can use the Ifile while we change it around. * Even after taking the iflock we need to make sure no one still * is holding Ifile buffers, so we get each one, to drain them. * (XXX this could be done better.) */ rw_enter(&fs->lfs_iflock, RW_WRITER); for (i = 0; i < ilast; i++) { /* XXX what to do if bread fails? */ bread(ivp, i, lfs_sb_getbsize(fs), 0, &bp); brelse(bp, 0); } /* Allocate new Ifile blocks */ for (i = ilast; i < ilast + noff; i++) { if (lfs_balloc(ivp, i * lfs_sb_getbsize(fs), lfs_sb_getbsize(fs), NOCRED, 0, &bp) != 0) panic("balloc extending ifile"); memset(bp->b_data, 0, lfs_sb_getbsize(fs)); VOP_BWRITE(bp->b_vp, bp); } /* Register new ifile size */ ip->i_size += noff * lfs_sb_getbsize(fs); lfs_dino_setsize(fs, ip->i_din, ip->i_size); uvm_vnp_setsize(ivp, ip->i_size); /* Copy the inode table to its new position */ if (noff != 0) { if (noff < 0) { start = nlast; end = ilast + noff; inc = 1; } else { start = ilast + noff - 1; end = nlast - 1; inc = -1; } for (i = start; i != end; i += inc) { if (bread(ivp, i, lfs_sb_getbsize(fs), B_MODIFY, &bp) != 0) panic("resize: bread dst blk failed"); if (bread(ivp, i - noff, lfs_sb_getbsize(fs), 0, &obp)) panic("resize: bread src blk failed"); memcpy(bp->b_data, obp->b_data, lfs_sb_getbsize(fs)); VOP_BWRITE(bp->b_vp, bp); brelse(obp, 0); } } /* If we are expanding, write the new empty SEGUSE entries */ if (newnsegs > oldnsegs) { for (i = oldnsegs; i < newnsegs; i++) { if ((error = bread(ivp, i / lfs_sb_getsepb(fs) + lfs_sb_getcleansz(fs), lfs_sb_getbsize(fs), B_MODIFY, &bp)) != 0) panic("lfs: ifile read: %d", error); while ((i + 1) % lfs_sb_getsepb(fs) && i < newnsegs) { sup = &((SEGUSE *)bp->b_data)[i % lfs_sb_getsepb(fs)]; memset(sup, 0, sizeof(*sup)); i++; } VOP_BWRITE(bp->b_vp, bp); } } /* Zero out unused superblock offsets */ for (i = 2; i < LFS_MAXNUMSB; i++) if (lfs_dtosn(fs, lfs_sb_getsboff(fs, i)) >= newnsegs) lfs_sb_setsboff(fs, i, 0x0); /* * Correct superblock entries that depend on fs size. * The computations of these are as follows: * * size = lfs_segtod(fs, nseg) * dsize = lfs_segtod(fs, nseg - minfreeseg) - lfs_btofsb(#super * LFS_SBPAD) * bfree = dsize - lfs_btofsb(fs, bsize * nseg / 2) - blocks_actually_used * avail = lfs_segtod(fs, nclean) - lfs_btofsb(#clean_super * LFS_SBPAD) * + (lfs_segtod(fs, 1) - (offset - curseg)) * - lfs_segtod(fs, minfreeseg - (minfreeseg / 2)) * * XXX - we should probably adjust minfreeseg as well. 
*/ gain = (newnsegs - oldnsegs); lfs_sb_setnseg(fs, newnsegs); lfs_sb_setsegtabsz(fs, nlast - lfs_sb_getcleansz(fs)); lfs_sb_addsize(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs))); lfs_sb_adddsize(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - lfs_btofsb(fs, sbbytes)); lfs_sb_addbfree(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - lfs_btofsb(fs, sbbytes) - gain * lfs_btofsb(fs, lfs_sb_getbsize(fs) / 2)); if (gain > 0) { lfs_sb_addnclean(fs, gain); lfs_sb_addavail(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs))); } else { lfs_sb_subnclean(fs, cgain); lfs_sb_subavail(fs, cgain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - lfs_btofsb(fs, csbbytes)); } /* Resize segment flag cache */ fs->lfs_suflags[0] = realloc(fs->lfs_suflags[0], lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK); fs->lfs_suflags[1] = realloc(fs->lfs_suflags[1], lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK); for (i = oldnsegs; i < newnsegs; i++) fs->lfs_suflags[0][i] = fs->lfs_suflags[1][i] = 0x0; /* Truncate Ifile if necessary */ if (noff < 0) lfs_truncate(ivp, ivp->v_size + (noff << lfs_sb_getbshift(fs)), 0, NOCRED); /* Update cleaner info so the cleaner can die */ /* XXX what to do if bread fails? */ bread(ivp, 0, lfs_sb_getbsize(fs), B_MODIFY, &bp); cip = bp->b_data; lfs_ci_setclean(fs, cip, lfs_sb_getnclean(fs)); lfs_ci_setdirty(fs, cip, lfs_sb_getnseg(fs) - lfs_sb_getnclean(fs)); VOP_BWRITE(bp->b_vp, bp); /* Let Ifile accesses proceed */ rw_exit(&fs->lfs_iflock); out: lfs_segunlock(fs); return error; } /* * Extended attribute dispatch */ int lfs_extattrctl(struct mount *mp, int cmd, struct vnode *vp, int attrnamespace, const char *attrname) { #ifdef LFS_EXTATTR struct ulfsmount *ump; ump = VFSTOULFS(mp); if (ump->um_fstype == ULFS1) { return ulfs_extattrctl(mp, cmd, vp, attrnamespace, attrname); } #endif return vfs_stdextattrctl(mp, cmd, vp, attrnamespace, attrname); }
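/*
 * Illustrative sketch (not part of the NetBSD sources): the Ifile
 * segment-table sizing used by lfs_resize_fs() above.  The block index
 * just past the segment table is the number of cleaner-info blocks plus
 * ceil(nseg / sepb) segment-usage blocks, so noff is how many Ifile
 * blocks must be added (positive) or dropped (negative) for the new
 * segment count.  The constants below (cleansz, sepb, the segment
 * counts) are made-up sample values; build as an ordinary userland
 * program.
 */
#include <stdio.h>

#define HOWMANY(x, y)	(((x) + ((y) - 1)) / (y))

int
main(void)
{
	long cleansz = 1;	/* cleaner-info blocks (assumed) */
	long sepb = 113;	/* SEGUSE entries per Ifile block (assumed) */
	long oldnsegs = 500, newnsegs = 800;
	long olast, nlast, noff;

	/* Old and new end of the segment table, in Ifile blocks. */
	olast = cleansz + HOWMANY(oldnsegs, sepb);
	nlast = HOWMANY(newnsegs, sepb) + cleansz;
	noff = nlast - olast;

	printf("old segtab end %ld, new segtab end %ld, delta %ld blocks\n",
	    olast, nlast, noff);
	return 0;
}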
894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 /* $NetBSD: fdesc_vnops.c,v 1.140 2022/03/27 17:10:55 christos Exp $ */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fdesc_vnops.c 8.17 (Berkeley) 5/22/95 * * #Id: fdesc_vnops.c,v 1.12 1993/04/06 16:17:17 jsp Exp # */ /* * /dev/fd Filesystem */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: fdesc_vnops.c,v 1.140 2022/03/27 17:10:55 christos Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/socketvar.h> #include <sys/filedesc.h> #include <sys/vnode.h> #include <sys/malloc.h> #include <sys/conf.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/buf.h> #include <sys/dirent.h> #include <sys/tty.h> #include <sys/kauth.h> #include <sys/atomic.h> #include <miscfs/fdesc/fdesc.h> #include <miscfs/genfs/genfs.h> #define cttyvp(p) ((p)->p_lflag & PL_CONTROLT ? 
(p)->p_session->s_ttyvp : NULL) dev_t devctty; #if (FD_STDIN != FD_STDOUT-1) || (FD_STDOUT != FD_STDERR-1) FD_STDIN, FD_STDOUT, FD_STDERR must be a sequence n, n+1, n+2 #endif int fdesc_lookup(void *); int fdesc_open(void *); int fdesc_getattr(void *); int fdesc_setattr(void *); int fdesc_read(void *); int fdesc_write(void *); int fdesc_ioctl(void *); int fdesc_poll(void *); int fdesc_kqfilter(void *); int fdesc_readdir(void *); int fdesc_readlink(void *); int fdesc_inactive(void *); int fdesc_reclaim(void *); int fdesc_print(void *); int fdesc_pathconf(void *); static int fdesc_attr(int, struct vattr *, kauth_cred_t); int (**fdesc_vnodeop_p)(void *); const struct vnodeopv_entry_desc fdesc_vnodeop_entries[] = { { &vop_default_desc, vn_default_error }, { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ { &vop_lookup_desc, fdesc_lookup }, /* lookup */ { &vop_create_desc, genfs_eopnotsupp }, /* create */ { &vop_mknod_desc, genfs_eopnotsupp }, /* mknod */ { &vop_open_desc, fdesc_open }, /* open */ { &vop_close_desc, genfs_nullop }, /* close */ { &vop_access_desc, genfs_nullop }, /* access */ { &vop_accessx_desc, genfs_accessx }, /* accessx */ { &vop_getattr_desc, fdesc_getattr }, /* getattr */ { &vop_setattr_desc, fdesc_setattr }, /* setattr */ { &vop_read_desc, fdesc_read }, /* read */ { &vop_write_desc, fdesc_write }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */ { &vop_ioctl_desc, fdesc_ioctl }, /* ioctl */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_poll_desc, fdesc_poll }, /* poll */ { &vop_kqfilter_desc, fdesc_kqfilter }, /* kqfilter */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_mmap_desc, genfs_eopnotsupp }, /* mmap */ { &vop_fsync_desc, genfs_nullop }, /* fsync */ { &vop_seek_desc, genfs_seek }, /* seek */ { &vop_remove_desc, genfs_eopnotsupp }, /* remove */ { &vop_link_desc, genfs_erofs_link }, /* link */ { &vop_rename_desc, genfs_eopnotsupp }, /* rename */ { &vop_mkdir_desc, genfs_eopnotsupp }, /* mkdir */ { &vop_rmdir_desc, genfs_eopnotsupp }, /* rmdir */ { &vop_symlink_desc, genfs_erofs_symlink }, /* symlink */ { &vop_readdir_desc, fdesc_readdir }, /* readdir */ { &vop_readlink_desc, fdesc_readlink }, /* readlink */ { &vop_abortop_desc, genfs_abortop }, /* abortop */ { &vop_inactive_desc, fdesc_inactive }, /* inactive */ { &vop_reclaim_desc, fdesc_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, genfs_eopnotsupp }, /* bmap */ { &vop_strategy_desc, genfs_badop }, /* strategy */ { &vop_print_desc, fdesc_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, fdesc_pathconf }, /* pathconf */ { &vop_advlock_desc, genfs_einval }, /* advlock */ { &vop_bwrite_desc, genfs_eopnotsupp }, /* bwrite */ { &vop_putpages_desc, genfs_null_putpages }, /* putpages */ { NULL, NULL } }; const struct vnodeopv_desc fdesc_vnodeop_opv_desc = { &fdesc_vnodeop_p, fdesc_vnodeop_entries }; /* * Initialise cache headers */ void fdesc_init(void) { int cttymajor; /* locate the major number */ cttymajor = devsw_name2chr("ctty", NULL, 0); devctty = makedev(cttymajor, 0); } void fdesc_done(void) { } /* * vp is the current namei directory * ndp is the name to locate in that directory... 
*/ int fdesc_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap = v; struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct lwp *l = curlwp; const char *pname = cnp->cn_nameptr; struct proc *p = l->l_proc; unsigned fd = 0; int error, ix = -1; fdtab_t *dt; dt = atomic_load_consume(&curlwp->l_fd->fd_dt); if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; vref(dvp); return (0); } switch (VTOFDESC(dvp)->fd_type) { default: case Flink: case Fdesc: case Fctty: error = ENOTDIR; goto bad; case Froot: if (cnp->cn_namelen == 2 && memcmp(pname, "fd", 2) == 0) { ix = FD_DEVFD; goto good; } if (cnp->cn_namelen == 3 && memcmp(pname, "tty", 3) == 0) { struct vnode *ttyvp = cttyvp(p); if (ttyvp == NULL) { error = ENXIO; goto bad; } ix = FD_CTTY; goto good; } switch (cnp->cn_namelen) { case 5: if (memcmp(pname, "stdin", 5) == 0) { ix = FD_STDIN; goto good; } break; case 6: if (memcmp(pname, "stdout", 6) == 0) { ix = FD_STDOUT; goto good; } else if (memcmp(pname, "stderr", 6) == 0) { ix = FD_STDERR; goto good; } break; } error = ENOENT; goto bad; case Fdevfd: if (cnp->cn_namelen == 2 && memcmp(pname, "..", 2) == 0) { ix = FD_ROOT; goto good; } fd = 0; while (*pname >= '0' && *pname <= '9') { fd = 10 * fd + *pname++ - '0'; if (fd >= dt->dt_nfiles) break; } if (*pname != '\0') { error = ENOENT; goto bad; } if (fd >= dt->dt_nfiles || dt->dt_ff[fd] == NULL || dt->dt_ff[fd]->ff_file == NULL) { error = EBADF; goto bad; } ix = FD_DESC + fd; goto good; } bad: *vpp = NULL; return error; good: KASSERT(ix != -1); error = vcache_get(dvp->v_mount, &ix, sizeof(ix), vpp); if (error) return error; /* * Prevent returning VNON nodes. * Operation fdesc_inactive() will reset the type to VNON. */ if (ix == FD_CTTY) (*vpp)->v_type = VCHR; else if (ix >= FD_DESC) (*vpp)->v_type = VREG; KASSERT((*vpp)->v_type != VNON); return 0; } int fdesc_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; switch (VTOFDESC(vp)->fd_type) { case Fdesc: /* * XXX Kludge: set dupfd to contain the value of the * the file descriptor being sought for duplication. * The error return ensures that the vnode for this * device will be released by vn_open. vn_open will * then detect this special error and take the actions * in fd_dupopen. Other callers of vn_open or VOP_OPEN * not prepared to deal with this situation will * report a real error. */ curlwp->l_dupfd = VTOFDESC(vp)->fd_fd; /* XXX */ return EDUPFD; case Fctty: return cdev_open(devctty, ap->a_mode, 0, curlwp); case Froot: case Fdevfd: case Flink: break; } return (0); } static int fdesc_attr(int fd, struct vattr *vap, kauth_cred_t cred) { file_t *fp; struct stat stb; int error; if ((fp = fd_getfile(fd)) == NULL) return (EBADF); switch (fp->f_type) { case DTYPE_VNODE: vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); error = VOP_GETATTR(fp->f_vnode, vap, cred); VOP_UNLOCK(fp->f_vnode); if (error == 0 && vap->va_type == VDIR) { /* * directories can cause loops in the namespace, * so turn off the 'x' bits to avoid trouble. */ vap->va_mode &= ~(S_IXUSR|S_IXGRP|S_IXOTH); } break; default: memset(&stb, 0, sizeof(stb)); error = (*fp->f_ops->fo_stat)(fp, &stb); if (error) break; vattr_null(vap); switch(fp->f_type) { case DTYPE_SOCKET: vap->va_type = VSOCK; break; case DTYPE_PIPE: vap->va_type = VFIFO; break; default: /* use VNON perhaps? 
*/ vap->va_type = VBAD; break; } vap->va_mode = stb.st_mode; vap->va_nlink = stb.st_nlink; vap->va_uid = stb.st_uid; vap->va_gid = stb.st_gid; vap->va_fsid = stb.st_dev; vap->va_fileid = stb.st_ino; vap->va_size = stb.st_size; vap->va_blocksize = stb.st_blksize; vap->va_atime = stb.st_atimespec; vap->va_mtime = stb.st_mtimespec; vap->va_ctime = stb.st_ctimespec; vap->va_gen = stb.st_gen; vap->va_flags = stb.st_flags; vap->va_rdev = stb.st_rdev; vap->va_bytes = stb.st_blocks * stb.st_blksize; break; } fd_putfile(fd); return (error); } int fdesc_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; struct lwp *a_l; } */ *ap = v; struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; unsigned fd; int error = 0; struct timeval tv; switch (VTOFDESC(vp)->fd_type) { case Froot: case Fdevfd: case Flink: case Fctty: vattr_null(vap); vap->va_fileid = VTOFDESC(vp)->fd_ix; #define R_ALL (S_IRUSR|S_IRGRP|S_IROTH) #define W_ALL (S_IWUSR|S_IWGRP|S_IWOTH) #define X_ALL (S_IXUSR|S_IXGRP|S_IXOTH) switch (VTOFDESC(vp)->fd_type) { case Flink: vap->va_mode = R_ALL|X_ALL; vap->va_type = VLNK; vap->va_rdev = 0; vap->va_nlink = 1; vap->va_size = strlen(VTOFDESC(vp)->fd_link); break; case Fctty: vap->va_mode = R_ALL|W_ALL; vap->va_type = VCHR; vap->va_rdev = devctty; vap->va_nlink = 1; vap->va_size = 0; break; default: vap->va_mode = R_ALL|X_ALL; vap->va_type = VDIR; vap->va_rdev = 0; vap->va_nlink = 2; vap->va_size = DEV_BSIZE; break; } vap->va_uid = 0; vap->va_gid = 0; vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0]; vap->va_blocksize = DEV_BSIZE; getmicroboottime(&tv); vap->va_atime.tv_sec = tv.tv_sec; vap->va_atime.tv_nsec = 0; vap->va_mtime = vap->va_atime; vap->va_ctime = vap->va_mtime; vap->va_gen = 0; vap->va_flags = 0; vap->va_bytes = 0; break; case Fdesc: fd = VTOFDESC(vp)->fd_fd; error = fdesc_attr(fd, vap, ap->a_cred); break; default: panic("fdesc_getattr"); break; } if (error == 0) vp->v_type = vap->va_type; return (error); } int fdesc_setattr(void *v) { struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; file_t *fp; unsigned fd; /* * Can't mess with the root vnode */ switch (VTOFDESC(ap->a_vp)->fd_type) { case Fdesc: break; case Fctty: return (0); default: return (EACCES); } fd = VTOFDESC(ap->a_vp)->fd_fd; if ((fp = fd_getfile(fd)) == NULL) return (EBADF); /* * XXX: Can't reasonably set the attr's on any types currently. * On vnode's this will cause truncation and socket/pipes make * no sense. 
*/ fd_putfile(fd); return (0); } struct fdesc_target { ino_t ft_fileno; u_char ft_type; u_char ft_namlen; const char *ft_name; } fdesc_targets[] = { #define N(s) sizeof(s)-1, s { FD_DEVFD, DT_DIR, N("fd") }, { FD_STDIN, DT_LNK, N("stdin") }, { FD_STDOUT, DT_LNK, N("stdout") }, { FD_STDERR, DT_LNK, N("stderr") }, { FD_CTTY, DT_UNKNOWN, N("tty") }, #undef N #define UIO_MX _DIRENT_RECLEN((struct dirent *)NULL, sizeof("stderr") - 1) }; static int nfdesc_targets = sizeof(fdesc_targets) / sizeof(fdesc_targets[0]); int fdesc_readdir(void *v) { struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; int *a_eofflag; off_t **a_cookies; int *a_ncookies; } */ *ap = v; struct uio *uio = ap->a_uio; struct dirent d; off_t i; int j; int error; off_t *cookies = NULL; int ncookies; fdtab_t *dt; switch (VTOFDESC(ap->a_vp)->fd_type) { case Fctty: return 0; case Fdesc: return ENOTDIR; default: break; } dt = atomic_load_consume(&curlwp->l_fd->fd_dt); if (uio->uio_resid < UIO_MX) return EINVAL; if (uio->uio_offset < 0) return EINVAL; error = 0; i = uio->uio_offset; (void)memset(&d, 0, UIO_MX); d.d_reclen = UIO_MX; if (ap->a_ncookies) ncookies = uio->uio_resid / UIO_MX; else ncookies = 0; if (VTOFDESC(ap->a_vp)->fd_type == Froot) { struct fdesc_target *ft; if (i >= nfdesc_targets) return 0; if (ap->a_ncookies) { ncookies = uimin(ncookies, (nfdesc_targets - i)); cookies = malloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; *ap->a_ncookies = ncookies; } for (ft = &fdesc_targets[i]; uio->uio_resid >= UIO_MX && i < nfdesc_targets; ft++, i++) { switch (ft->ft_fileno) { case FD_CTTY: if (cttyvp(curproc) == NULL) continue; break; case FD_STDIN: case FD_STDOUT: case FD_STDERR: if ((ft->ft_fileno - FD_STDIN) >= dt->dt_nfiles) continue; if (dt->dt_ff[ft->ft_fileno - FD_STDIN] == NULL || dt->dt_ff[ft->ft_fileno - FD_STDIN]->ff_file == NULL) continue; break; } d.d_fileno = ft->ft_fileno; d.d_namlen = ft->ft_namlen; (void)memcpy(d.d_name, ft->ft_name, ft->ft_namlen + 1); d.d_type = ft->ft_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; } } else { if (ap->a_ncookies) { ncookies = uimin(ncookies, dt->dt_nfiles + 2); cookies = malloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; *ap->a_ncookies = ncookies; } for (; i - 2 < dt->dt_nfiles && uio->uio_resid >= UIO_MX; i++) { switch (i) { case 0: case 1: d.d_fileno = FD_ROOT; /* XXX */ d.d_namlen = i + 1; (void)memcpy(d.d_name, "..", d.d_namlen); d.d_name[i + 1] = '\0'; d.d_type = DT_DIR; break; default: j = (int)i - 2; if (dt->dt_ff[j] == NULL || dt->dt_ff[j]->ff_file == NULL) continue; d.d_fileno = j + FD_STDIN; d.d_namlen = snprintf(d.d_name, sizeof(d.d_name), "%d", j); d.d_type = DT_UNKNOWN; break; } if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; } } if (ap->a_ncookies && error) { free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } uio->uio_offset = i; return error; } int fdesc_readlink(void *v) { struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; int error; if (vp->v_type != VLNK) return (EPERM); if (VTOFDESC(vp)->fd_type == Flink) { const char *ln = VTOFDESC(vp)->fd_link; error = uiomove(__UNCONST(ln), strlen(ln), ap->a_uio); } else { error = EOPNOTSUPP; } return (error); } int fdesc_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; 
int error = EOPNOTSUPP; struct vnode *vp = ap->a_vp; switch (VTOFDESC(vp)->fd_type) { case Fctty: VOP_UNLOCK(vp); error = cdev_read(devctty, ap->a_uio, ap->a_ioflag); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); break; default: error = EOPNOTSUPP; break; } return (error); } int fdesc_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; int error = EOPNOTSUPP; struct vnode *vp = ap->a_vp; switch (VTOFDESC(vp)->fd_type) { case Fctty: VOP_UNLOCK(vp); error = cdev_write(devctty, ap->a_uio, ap->a_ioflag); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); break; default: error = EOPNOTSUPP; break; } return (error); } int fdesc_ioctl(void *v) { struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; void *a_data; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; int error = EOPNOTSUPP; switch (VTOFDESC(ap->a_vp)->fd_type) { case Fctty: error = cdev_ioctl(devctty, ap->a_command, ap->a_data, ap->a_fflag, curlwp); break; default: error = EOPNOTSUPP; break; } return (error); } int fdesc_poll(void *v) { struct vop_poll_args /* { struct vnode *a_vp; int a_events; } */ *ap = v; int revents; switch (VTOFDESC(ap->a_vp)->fd_type) { case Fctty: revents = cdev_poll(devctty, ap->a_events, curlwp); break; default: revents = genfs_poll(v); break; } return (revents); } int fdesc_kqfilter(void *v) { struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap = v; int error, fd; file_t *fp; switch (VTOFDESC(ap->a_vp)->fd_type) { case Fctty: error = cdev_kqfilter(devctty, ap->a_kn); break; case Fdesc: /* just invoke kqfilter for the underlying descriptor */ fd = VTOFDESC(ap->a_vp)->fd_fd; if ((fp = fd_getfile(fd)) == NULL) return (1); error = (*fp->f_ops->fo_kqfilter)(fp, ap->a_kn); fd_putfile(fd); break; default: return (genfs_kqfilter(v)); } return (error); } int fdesc_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; struct fdescnode *fd = VTOFDESC(vp); /* * Clear out the v_type field to avoid * nasty things happening on reclaim. */ if (fd->fd_type == Fctty || fd->fd_type == Fdesc) vp->v_type = VNON; return (0); } int fdesc_reclaim(void *v) { struct vop_reclaim_v2_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; struct fdescnode *fd = VTOFDESC(vp); VOP_UNLOCK(vp); vp->v_data = NULL; kmem_free(fd, sizeof(struct fdescnode)); return (0); } /* * Return POSIX pathconf information applicable to special devices. */ int fdesc_pathconf(void *v) { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_MAX_CANON: *ap->a_retval = MAX_CANON; return (0); case _PC_MAX_INPUT: *ap->a_retval = MAX_INPUT; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_VDISABLE: *ap->a_retval = _POSIX_VDISABLE; return (0); case _PC_SYNC_IO: *ap->a_retval = 1; return (0); default: return genfs_pathconf(ap); } /* NOTREACHED */ } /* * Print out the contents of a /dev/fd vnode. */ /* ARGSUSED */ int fdesc_print(void *v) { printf("tag VT_NON, fdesc vnode\n"); return (0); }
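/*
 * Illustrative sketch (not part of the NetBSD sources): the descriptor-name
 * parsing done by fdesc_lookup() above for /dev/fd/<n>.  Digits are folded
 * into fd one at a time; the loop bails out once fd reaches the table size,
 * so an arbitrarily long digit string cannot overflow, and any trailing
 * non-digit makes the whole name invalid.  nfiles and the sample names are
 * made-up stand-ins for dt->dt_nfiles and the componentname data.
 */
#include <stdio.h>
#include <errno.h>

static int
parse_fd_name(const char *pname, unsigned nfiles, unsigned *fdp)
{
	unsigned fd = 0;

	while (*pname >= '0' && *pname <= '9') {
		fd = 10 * fd + (unsigned)(*pname++ - '0');
		if (fd >= nfiles)
			break;		/* stop before fd can grow unbounded */
	}
	if (*pname != '\0')
		return ENOENT;		/* not a pure number: no such entry */
	if (fd >= nfiles)
		return EBADF;		/* number too large for the fd table */
	*fdp = fd;
	return 0;
}

int
main(void)
{
	const char *names[] = { "0", "27", "9999999999", "12x" };
	unsigned fd;

	for (unsigned i = 0; i < 4; i++) {
		int error = parse_fd_name(names[i], 64, &fd);
		if (error)
			printf("%-12s -> error %d\n", names[i], error);
		else
			printf("%-12s -> fd %u\n", names[i], fd);
	}
	return 0;
}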
914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 /* $NetBSD: exec_elf.c,v 1.105 2023/08/17 06:58:26 rin Exp $ */ /*- * Copyright (c) 1994, 2000, 2005, 2015, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christos Zoulas and Maxime Villard. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1996 Christopher G. Demetriou * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(1, "$NetBSD: exec_elf.c,v 1.105 2023/08/17 06:58:26 rin Exp $"); #ifdef _KERNEL_OPT #include "opt_pax.h" #endif /* _KERNEL_OPT */ #include <sys/param.h> #include <sys/proc.h> #include <sys/kmem.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/exec.h> #include <sys/exec_elf.h> #include <sys/syscall.h> #include <sys/signalvar.h> #include <sys/mount.h> #include <sys/stat.h> #include <sys/kauth.h> #include <sys/bitops.h> #include <sys/cpu.h> #include <machine/reg.h> #include <compat/common/compat_util.h> #include <sys/pax.h> #include <uvm/uvm_param.h> #define elf_check_header ELFNAME(check_header) #define elf_copyargs ELFNAME(copyargs) #define elf_populate_auxv ELFNAME(populate_auxv) #define elf_load_interp ELFNAME(load_interp) #define elf_load_psection ELFNAME(load_psection) #define exec_elf_makecmds ELFNAME2(exec,makecmds) #define netbsd_elf_signature ELFNAME2(netbsd,signature) #define netbsd_elf_note ELFNAME2(netbsd,note) #define netbsd_elf_probe ELFNAME2(netbsd,probe) #define coredump ELFNAMEEND(coredump) #define elf_free_emul_arg ELFNAME(free_emul_arg) static int elf_load_interp(struct lwp *, struct exec_package *, char *, struct exec_vmcmd_set *, u_long *, Elf_Addr *); static int elf_load_psection(struct exec_vmcmd_set *, struct vnode *, const Elf_Phdr *, Elf_Addr *, u_long *, int); int netbsd_elf_signature(struct lwp *, struct exec_package *, Elf_Ehdr *); int netbsd_elf_note(struct exec_package *, const Elf_Nhdr *, const char *, const char *); int netbsd_elf_probe(struct lwp *, struct exec_package *, void *, char *, vaddr_t *); static void elf_free_emul_arg(void *); #ifdef DEBUG_ELF #define DPRINTF(a, ...) printf("%s: " a "\n", __func__, ##__VA_ARGS__) #else #define DPRINTF(a, ...) #endif /* round up and down to page boundaries. 
*/ #define ELF_ROUND(a, b) (((a) + (b) - 1) & ~((b) - 1)) #define ELF_TRUNC(a, b) ((a) & ~((b) - 1)) static int elf_placedynexec(struct exec_package *epp, Elf_Ehdr *eh, Elf_Phdr *ph) { Elf_Addr align, offset; int i; for (align = 1, i = 0; i < eh->e_phnum; i++) if (ph[i].p_type == PT_LOAD && ph[i].p_align > align) align = ph[i].p_align; offset = (Elf_Addr)pax_aslr_exec_offset(epp, align); if (offset < epp->ep_vm_minaddr) offset = roundup(epp->ep_vm_minaddr, align); if ((offset & (align - 1)) != 0) { DPRINTF("bad offset=%#jx align=%#jx", (uintmax_t)offset, (uintmax_t)align); return EINVAL; } for (i = 0; i < eh->e_phnum; i++) ph[i].p_vaddr += offset; epp->ep_entryoffset = offset; eh->e_entry += offset; return 0; } int elf_populate_auxv(struct lwp *l, struct exec_package *pack, char **stackp) { size_t len, vlen; AuxInfo ai[ELF_AUX_ENTRIES], *a, *execname; struct elf_args *ap; char *path = l->l_proc->p_path; int error; execname = NULL; a = ai; memset(ai, 0, sizeof(ai)); /* * Push extra arguments on the stack needed by dynamically * linked binaries */ if ((ap = (struct elf_args *)pack->ep_emul_arg)) { struct vattr *vap = pack->ep_vap; a->a_type = AT_PHDR; a->a_v = ap->arg_phaddr; a++; a->a_type = AT_PHENT; a->a_v = ap->arg_phentsize; a++; a->a_type = AT_PHNUM; a->a_v = ap->arg_phnum; a++; a->a_type = AT_PAGESZ; a->a_v = PAGE_SIZE; a++; a->a_type = AT_BASE; a->a_v = ap->arg_interp; a++; a->a_type = AT_FLAGS; a->a_v = 0; a++; a->a_type = AT_ENTRY; a->a_v = ap->arg_entry; a++; a->a_type = AT_STACKBASE; a->a_v = l->l_proc->p_stackbase; a++; a->a_type = AT_EUID; if (vap->va_mode & S_ISUID) a->a_v = vap->va_uid; else a->a_v = kauth_cred_geteuid(l->l_cred); a++; a->a_type = AT_RUID; a->a_v = kauth_cred_getuid(l->l_cred); a++; a->a_type = AT_EGID; if (vap->va_mode & S_ISGID) a->a_v = vap->va_gid; else a->a_v = kauth_cred_getegid(l->l_cred); a++; a->a_type = AT_RGID; a->a_v = kauth_cred_getgid(l->l_cred); a++; /* "/" means fexecve(2) could not resolve the pathname */ if (path[0] == '/' && path[1] != '\0') { execname = a; a->a_type = AT_SUN_EXECNAME; a++; } exec_free_emul_arg(pack); } a->a_type = AT_NULL; a->a_v = 0; a++; vlen = (a - ai) * sizeof(ai[0]); KASSERT(vlen <= sizeof(ai)); if (execname) { execname->a_v = (uintptr_t)(*stackp + vlen); len = strlen(path) + 1; if ((error = copyout(path, (*stackp + vlen), len)) != 0) return error; len = ALIGN(len); } else { len = 0; } if ((error = copyout(ai, *stackp, vlen)) != 0) return error; *stackp += vlen + len; return 0; } /* * Copy arguments onto the stack in the normal way, but add some * extra information in case of dynamic binding. 
*/ int elf_copyargs(struct lwp *l, struct exec_package *pack, struct ps_strings *arginfo, char **stackp, void *argp) { int error; if ((error = copyargs(l, pack, arginfo, stackp, argp)) != 0) return error; return elf_populate_auxv(l, pack, stackp); } /* * elf_check_header(): * * Check header for validity; return 0 if ok, ENOEXEC if error */ int elf_check_header(Elf_Ehdr *eh) { if (memcmp(eh->e_ident, ELFMAG, SELFMAG) != 0 || eh->e_ident[EI_CLASS] != ELFCLASS) { DPRINTF("bad magic e_ident[EI_MAG0,EI_MAG3] %#x%x%x%x, " "e_ident[EI_CLASS] %#x", eh->e_ident[EI_MAG0], eh->e_ident[EI_MAG1], eh->e_ident[EI_MAG2], eh->e_ident[EI_MAG3], eh->e_ident[EI_CLASS]); return ENOEXEC; } switch (eh->e_machine) { ELFDEFNNAME(MACHDEP_ID_CASES) default: DPRINTF("bad machine %#x", eh->e_machine); return ENOEXEC; } if (ELF_EHDR_FLAGS_OK(eh) == 0) { DPRINTF("bad flags %#x", eh->e_flags); return ENOEXEC; } if (eh->e_shnum > ELF_MAXSHNUM || eh->e_phnum > ELF_MAXPHNUM) { DPRINTF("bad shnum/phnum %#x/%#x", eh->e_shnum, eh->e_phnum); return ENOEXEC; } return 0; } /* * elf_load_psection(): * * Load a psection at the appropriate address */ static int elf_load_psection(struct exec_vmcmd_set *vcset, struct vnode *vp, const Elf_Phdr *ph, Elf_Addr *addr, u_long *size, int flags) { u_long msize, psize, rm, rf; long diff, offset; int vmprot = 0; KASSERT(VOP_ISLOCKED(vp) != LK_NONE); /* * If the user specified an address, then we load there. */ if (*addr == ELFDEFNNAME(NO_ADDR)) *addr = ph->p_vaddr; if (ph->p_align > 1) { /* * Make sure we are virtually aligned as we are supposed to be. */ diff = ph->p_vaddr - ELF_TRUNC(ph->p_vaddr, ph->p_align); if (*addr - diff != ELF_TRUNC(*addr, ph->p_align)) { DPRINTF("bad alignment %#jx != %#jx\n", (uintptr_t)(*addr - diff), (uintptr_t)ELF_TRUNC(*addr, ph->p_align)); return EINVAL; } /* * But make sure to not map any pages before the start of the * psection by limiting the difference to within a page. */ diff &= PAGE_MASK; } else diff = 0; vmprot |= (ph->p_flags & PF_R) ? VM_PROT_READ : 0; vmprot |= (ph->p_flags & PF_W) ? VM_PROT_WRITE : 0; vmprot |= (ph->p_flags & PF_X) ? VM_PROT_EXECUTE : 0; /* * Adjust everything so it all starts on a page boundary. */ *addr -= diff; offset = ph->p_offset - diff; *size = ph->p_filesz + diff; msize = ph->p_memsz + diff; if (ph->p_align >= PAGE_SIZE) { if ((ph->p_flags & PF_W) != 0) { /* * Because the pagedvn pager can't handle zero fill * of the last data page if it's not page aligned we * map the last page readvn. */ psize = trunc_page(*size); } else { psize = round_page(*size); } } else { psize = *size; } if (psize > 0) { NEW_VMCMD2(vcset, ph->p_align < PAGE_SIZE ? vmcmd_map_readvn : vmcmd_map_pagedvn, psize, *addr, vp, offset, vmprot, flags); flags &= VMCMD_RELATIVE; } if (psize < *size) { NEW_VMCMD2(vcset, vmcmd_map_readvn, *size - psize, *addr + psize, vp, offset + psize, vmprot, flags); } /* * Check if we need to extend the size of the segment (does * bss extend page the next page boundary)? */ rm = round_page(*addr + msize); rf = round_page(*addr + *size); if (rm != rf) { NEW_VMCMD2(vcset, vmcmd_map_zero, rm - rf, rf, NULLVP, 0, vmprot, flags & VMCMD_RELATIVE); *size = msize; } return 0; } /* * elf_load_interp(): * * Load an interpreter pointed to by path. 
*/ static int elf_load_interp(struct lwp *l, struct exec_package *epp, char *path, struct exec_vmcmd_set *vcset, u_long *entryoff, Elf_Addr *last) { int error, i; struct vnode *vp; Elf_Ehdr eh; Elf_Phdr *ph = NULL; const Elf_Phdr *base_ph; const Elf_Phdr *last_ph; u_long phsize; Elf_Addr addr = *last; struct proc *p; bool use_topdown; p = l->l_proc; KASSERT(p->p_vmspace); KASSERT(p->p_vmspace != proc0.p_vmspace); #ifdef __USE_TOPDOWN_VM use_topdown = epp->ep_flags & EXEC_TOPDOWN_VM; #else use_topdown = false; #endif /* * 1. open file * 2. read filehdr * 3. map text, data, and bss out of it using VM_* */ vp = epp->ep_interp; if (vp == NULL) { error = emul_find_interp(l, epp, path); if (error != 0) return error; vp = epp->ep_interp; } /* We'll tidy this ourselves - otherwise we have locking issues */ epp->ep_interp = NULL; vn_lock(vp, LK_SHARED | LK_RETRY); /* * Similarly, if it's not marked as executable, or it's not a regular * file, we don't allow it to be used. */ if (vp->v_type != VREG) { error = EACCES; goto bad; } if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0) goto bad; /* * Check mount point. Though we're not trying to exec this binary, * we will be executing code from it, so if the mount point * disallows execution or set-id-ness, we punt or kill the set-id. */ if (vp->v_mount->mnt_flag & MNT_NOEXEC) { error = EACCES; goto bad; } if (vp->v_mount->mnt_flag & MNT_NOSUID) epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID); error = vn_marktext(vp); if (error) goto bad; error = exec_read(l, vp, 0, &eh, sizeof(eh), IO_NODELOCKED); if (error != 0) goto bad; if ((error = elf_check_header(&eh)) != 0) goto bad; if (eh.e_type != ET_DYN || eh.e_phnum == 0) { DPRINTF("bad interpreter type %#x", eh.e_type); error = ENOEXEC; goto bad; } phsize = eh.e_phnum * sizeof(Elf_Phdr); ph = kmem_alloc(phsize, KM_SLEEP); error = exec_read(l, vp, eh.e_phoff, ph, phsize, IO_NODELOCKED); if (error != 0) goto bad; #ifdef ELF_INTERP_NON_RELOCATABLE /* * Evil hack: Only MIPS should be non-relocatable, and the * psections should have a high address (typically 0x5ffe0000). * If it's now relocatable, it should be linked at 0 and the * psections should have zeros in the upper part of the address. * Otherwise, force the load at the linked address. */ if (*last == ELF_LINK_ADDR && (ph->p_vaddr & 0xffff0000) == 0) *last = ELFDEFNNAME(NO_ADDR); #endif /* * If no position to load the interpreter was set by a probe * function, pick the same address that a non-fixed mmap(0, ..) * would (i.e. something safely out of the way). */ if (*last == ELFDEFNNAME(NO_ADDR)) { u_long limit = 0; /* * Find the start and ending addresses of the psections to * be loaded. This will give us the size. */ for (i = 0, base_ph = NULL; i < eh.e_phnum; i++) { if (ph[i].p_type == PT_LOAD) { u_long psize = ph[i].p_vaddr + ph[i].p_memsz; if (base_ph == NULL) base_ph = &ph[i]; if (psize > limit) limit = psize; } } if (base_ph == NULL) { DPRINTF("no interpreter loadable sections"); error = ENOEXEC; goto bad; } /* * Now compute the size and load address. 
*/ addr = (*epp->ep_esch->es_emul->e_vm_default_addr)(p, epp->ep_daddr, round_page(limit) - trunc_page(base_ph->p_vaddr), use_topdown); addr += (Elf_Addr)pax_aslr_rtld_offset(epp, base_ph->p_align, use_topdown); } else { addr = *last; /* may be ELF_LINK_ADDR */ } /* * Load all the necessary sections */ for (i = 0, base_ph = NULL, last_ph = NULL; i < eh.e_phnum; i++) { switch (ph[i].p_type) { case PT_LOAD: { u_long size; int flags; if (base_ph == NULL) { /* * First encountered psection is always the * base psection. Make sure it's aligned * properly (align down for topdown and align * upwards for not topdown). */ base_ph = &ph[i]; flags = VMCMD_BASE; if (addr == ELF_LINK_ADDR) addr = ph[i].p_vaddr; if (use_topdown) addr = ELF_TRUNC(addr, ph[i].p_align); else addr = ELF_ROUND(addr, ph[i].p_align); } else { u_long limit = round_page(last_ph->p_vaddr + last_ph->p_memsz); u_long base = trunc_page(ph[i].p_vaddr); /* * If there is a gap in between the psections, * map it as inaccessible so nothing else * mmap'ed will be placed there. */ if (limit != base) { NEW_VMCMD2(vcset, vmcmd_map_zero, base - limit, limit - base_ph->p_vaddr, NULLVP, 0, VM_PROT_NONE, VMCMD_RELATIVE); } addr = ph[i].p_vaddr - base_ph->p_vaddr; flags = VMCMD_RELATIVE; } last_ph = &ph[i]; if ((error = elf_load_psection(vcset, vp, &ph[i], &addr, &size, flags)) != 0) goto bad; /* * If entry is within this psection then this * must contain the .text section. *entryoff is * relative to the base psection. */ if (eh.e_entry >= ph[i].p_vaddr && eh.e_entry < (ph[i].p_vaddr + size)) { *entryoff = eh.e_entry - base_ph->p_vaddr; } addr += size; break; } default: break; } } kmem_free(ph, phsize); /* * This value is ignored if TOPDOWN. */ *last = addr; vput(vp); return 0; bad: if (ph != NULL) kmem_free(ph, phsize); vput(vp); return error; } /* * exec_elf_makecmds(): Prepare an Elf binary's exec package * * First, set of the various offsets/lengths in the exec package. * * Then, mark the text image busy (so it can be demand paged) or error * out if this is not possible. Finally, set up vmcmds for the * text, data, bss, and stack segments. 
*/ int exec_elf_makecmds(struct lwp *l, struct exec_package *epp) { Elf_Ehdr *eh = epp->ep_hdr; Elf_Phdr *ph, *pp; Elf_Addr phdr = 0, computed_phdr = 0, pos = 0, end_text = 0; int error, i; char *interp = NULL; u_long phsize; struct elf_args *ap; bool is_dyn = false; if (epp->ep_hdrvalid < sizeof(Elf_Ehdr)) { DPRINTF("small header %#x", epp->ep_hdrvalid); return ENOEXEC; } if ((error = elf_check_header(eh)) != 0) return error; if (eh->e_type == ET_DYN) /* PIE, and some libs have an entry point */ is_dyn = true; else if (eh->e_type != ET_EXEC) { DPRINTF("bad type %#x", eh->e_type); return ENOEXEC; } if (eh->e_phnum == 0) { DPRINTF("no program headers"); return ENOEXEC; } /* XXX only LK_EXCLUSIVE to match all others - allow spinning */ vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY); error = vn_marktext(epp->ep_vp); if (error) { VOP_UNLOCK(epp->ep_vp); return error; } /* * Allocate space to hold all the program headers, and read them * from the file */ phsize = eh->e_phnum * sizeof(Elf_Phdr); ph = kmem_alloc(phsize, KM_SLEEP); error = exec_read(l, epp->ep_vp, eh->e_phoff, ph, phsize, IO_NODELOCKED); if (error != 0) { VOP_UNLOCK(epp->ep_vp); goto bad; } epp->ep_taddr = epp->ep_tsize = ELFDEFNNAME(NO_ADDR); epp->ep_daddr = epp->ep_dsize = ELFDEFNNAME(NO_ADDR); for (i = 0; i < eh->e_phnum; i++) { pp = &ph[i]; if (pp->p_type == PT_INTERP) { if (pp->p_filesz < 2 || pp->p_filesz > MAXPATHLEN) { DPRINTF("bad interpreter namelen %#jx", (uintmax_t)pp->p_filesz); error = ENOEXEC; VOP_UNLOCK(epp->ep_vp); goto bad; } interp = PNBUF_GET(); error = exec_read(l, epp->ep_vp, pp->p_offset, interp, pp->p_filesz, IO_NODELOCKED); if (error != 0) { VOP_UNLOCK(epp->ep_vp); goto bad; } /* Ensure interp is NUL-terminated and of the expected length */ if (strnlen(interp, pp->p_filesz) != pp->p_filesz - 1) { DPRINTF("bad interpreter name"); error = ENOEXEC; VOP_UNLOCK(epp->ep_vp); goto bad; } break; } } /* * On the same architecture, we may be emulating different systems. * See which one will accept this executable. * * Probe functions would normally see if the interpreter (if any) * exists. Emulation packages may possibly replace the interpreter in * interp with a changed path (/emul/xxx/<path>). */ pos = ELFDEFNNAME(NO_ADDR); if (epp->ep_esch->u.elf_probe_func) { vaddr_t startp = (vaddr_t)pos; error = (*epp->ep_esch->u.elf_probe_func)(l, epp, eh, interp, &startp); if (error) { VOP_UNLOCK(epp->ep_vp); goto bad; } pos = (Elf_Addr)startp; } if (is_dyn && (error = elf_placedynexec(epp, eh, ph)) != 0) { VOP_UNLOCK(epp->ep_vp); goto bad; } /* * Load all the necessary sections */ for (i = 0; i < eh->e_phnum; i++) { Elf_Addr addr = ELFDEFNNAME(NO_ADDR); u_long size = 0; switch (ph[i].p_type) { case PT_LOAD: if ((error = elf_load_psection(&epp->ep_vmcmds, epp->ep_vp, &ph[i], &addr, &size, VMCMD_FIXED)) != 0) { VOP_UNLOCK(epp->ep_vp); goto bad; } /* * Consider this as text segment, if it is executable. * If there is more than one text segment, pick the * largest. */ if (ph[i].p_flags & PF_X) { if (epp->ep_taddr == ELFDEFNNAME(NO_ADDR) || size > epp->ep_tsize) { epp->ep_taddr = addr; epp->ep_tsize = size; } end_text = addr + size; } else { epp->ep_daddr = addr; epp->ep_dsize = size; } if (ph[i].p_offset == 0) { computed_phdr = ph[i].p_vaddr + eh->e_phoff; } break; case PT_SHLIB: /* SCO has these sections. */ case PT_INTERP: /* Already did this one. 
*/ case PT_DYNAMIC: case PT_NOTE: break; case PT_PHDR: /* Note address of program headers (in text segment) */ phdr = ph[i].p_vaddr; break; default: /* * Not fatal; we don't need to understand everything. */ break; } } /* Now done with the vnode. */ VOP_UNLOCK(epp->ep_vp); if (epp->ep_vmcmds.evs_used == 0) { /* No VMCMD; there was no PT_LOAD section, or those * sections were empty */ DPRINTF("no vmcommands"); error = ENOEXEC; goto bad; } if (epp->ep_daddr == ELFDEFNNAME(NO_ADDR)) { epp->ep_daddr = round_page(end_text); epp->ep_dsize = 0; } /* * Check if we found a dynamically linked binary and arrange to load * its interpreter */ if (interp) { u_int nused = epp->ep_vmcmds.evs_used; u_long interp_offset = 0; if ((error = elf_load_interp(l, epp, interp, &epp->ep_vmcmds, &interp_offset, &pos)) != 0) { goto bad; } if (epp->ep_vmcmds.evs_used == nused) { /* elf_load_interp() has not set up any new VMCMD */ DPRINTF("no vmcommands for interpreter"); error = ENOEXEC; goto bad; } ap = kmem_alloc(sizeof(*ap), KM_SLEEP); ap->arg_interp = epp->ep_vmcmds.evs_cmds[nused].ev_addr; epp->ep_entryoffset = interp_offset; epp->ep_entry = ap->arg_interp + interp_offset; PNBUF_PUT(interp); interp = NULL; } else { epp->ep_entry = eh->e_entry; if (epp->ep_flags & EXEC_FORCEAUX) { ap = kmem_zalloc(sizeof(*ap), KM_SLEEP); ap->arg_interp = (vaddr_t)NULL; } else { ap = NULL; } } if (ap) { ap->arg_phaddr = phdr ? phdr : computed_phdr; ap->arg_phentsize = eh->e_phentsize; ap->arg_phnum = eh->e_phnum; ap->arg_entry = eh->e_entry; epp->ep_emul_arg = ap; epp->ep_emul_arg_free = elf_free_emul_arg; } #ifdef ELF_MAP_PAGE_ZERO /* Dell SVR4 maps page zero, yeuch! */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, PAGE_SIZE, 0, epp->ep_vp, 0, VM_PROT_READ); #endif error = (*epp->ep_esch->es_setup_stack)(l, epp); if (error) goto bad; kmem_free(ph, phsize); return 0; bad: if (interp) PNBUF_PUT(interp); exec_free_emul_arg(epp); kmem_free(ph, phsize); kill_vmcmds(&epp->ep_vmcmds); return error; } int netbsd_elf_signature(struct lwp *l, struct exec_package *epp, Elf_Ehdr *eh) { size_t i; Elf_Phdr *ph; size_t phsize; char *nbuf; int error; int isnetbsd = 0; epp->ep_pax_flags = 0; if (eh->e_phnum > ELF_MAXPHNUM || eh->e_phnum == 0) { DPRINTF("no signature %#x", eh->e_phnum); return ENOEXEC; } phsize = eh->e_phnum * sizeof(Elf_Phdr); ph = kmem_alloc(phsize, KM_SLEEP); error = exec_read(l, epp->ep_vp, eh->e_phoff, ph, phsize, IO_NODELOCKED); if (error) goto out; nbuf = kmem_alloc(ELF_MAXNOTESIZE, KM_SLEEP); for (i = 0; i < eh->e_phnum; i++) { const char *nptr; size_t nlen; if (ph[i].p_type != PT_NOTE || ph[i].p_filesz > ELF_MAXNOTESIZE) continue; nlen = ph[i].p_filesz; error = exec_read(l, epp->ep_vp, ph[i].p_offset, nbuf, nlen, IO_NODELOCKED); if (error) continue; nptr = nbuf; while (nlen > 0) { const Elf_Nhdr *np; const char *ndata, *ndesc; /* note header */ np = (const Elf_Nhdr *)nptr; if (nlen < sizeof(*np)) { break; } nptr += sizeof(*np); nlen -= sizeof(*np); /* note name */ ndata = nptr; if (nlen < roundup(np->n_namesz, 4)) { break; } nptr += roundup(np->n_namesz, 4); nlen -= roundup(np->n_namesz, 4); /* note description */ ndesc = nptr; if (nlen < roundup(np->n_descsz, 4)) { break; } nptr += roundup(np->n_descsz, 4); nlen -= roundup(np->n_descsz, 4); isnetbsd |= netbsd_elf_note(epp, np, ndata, ndesc); } } kmem_free(nbuf, ELF_MAXNOTESIZE); error = isnetbsd ? 
0 : ENOEXEC; #ifdef DEBUG_ELF if (error) DPRINTF("not netbsd"); #endif out: kmem_free(ph, phsize); return error; } int netbsd_elf_note(struct exec_package *epp, const Elf_Nhdr *np, const char *ndata, const char *ndesc) { int isnetbsd = 0; #ifdef DIAGNOSTIC const char *badnote; #define BADNOTE(n) badnote = (n) #else #define BADNOTE(n) #endif switch (np->n_type) { case ELF_NOTE_TYPE_NETBSD_TAG: /* It is us */ if (np->n_namesz == ELF_NOTE_NETBSD_NAMESZ && np->n_descsz == ELF_NOTE_NETBSD_DESCSZ && memcmp(ndata, ELF_NOTE_NETBSD_NAME, ELF_NOTE_NETBSD_NAMESZ) == 0) { memcpy(&epp->ep_osversion, ndesc, ELF_NOTE_NETBSD_DESCSZ); isnetbsd = 1; break; } /* * Ignore SuSE tags; SuSE's n_type is the same the * NetBSD one. */ if (np->n_namesz == ELF_NOTE_SUSE_NAMESZ && memcmp(ndata, ELF_NOTE_SUSE_NAME, ELF_NOTE_SUSE_NAMESZ) == 0) break; /* * Ignore old GCC */ if (np->n_namesz == ELF_NOTE_OGCC_NAMESZ && memcmp(ndata, ELF_NOTE_OGCC_NAME, ELF_NOTE_OGCC_NAMESZ) == 0) break; BADNOTE("NetBSD tag"); goto bad; case ELF_NOTE_TYPE_PAX_TAG: if (np->n_namesz == ELF_NOTE_PAX_NAMESZ && np->n_descsz == ELF_NOTE_PAX_DESCSZ && memcmp(ndata, ELF_NOTE_PAX_NAME, ELF_NOTE_PAX_NAMESZ) == 0) { uint32_t flags; memcpy(&flags, ndesc, sizeof(flags)); /* Convert the flags and insert them into * the exec package. */ pax_setup_elf_flags(epp, flags); break; } BADNOTE("PaX tag"); goto bad; case ELF_NOTE_TYPE_MARCH_TAG: /* Copy the machine arch into the package. */ if (np->n_namesz == ELF_NOTE_MARCH_NAMESZ && memcmp(ndata, ELF_NOTE_MARCH_NAME, ELF_NOTE_MARCH_NAMESZ) == 0) { /* Do not truncate the buffer */ if (np->n_descsz > sizeof(epp->ep_machine_arch)) { BADNOTE("description size limit"); goto bad; } /* * Ensure ndesc is NUL-terminated and of the * expected length. */ if (strnlen(ndesc, np->n_descsz) + 1 != np->n_descsz) { BADNOTE("description size"); goto bad; } strlcpy(epp->ep_machine_arch, ndesc, sizeof(epp->ep_machine_arch)); break; } BADNOTE("march tag"); goto bad; case ELF_NOTE_TYPE_MCMODEL_TAG: /* arch specific check for code model */ #ifdef ELF_MD_MCMODEL_CHECK if (np->n_namesz == ELF_NOTE_MCMODEL_NAMESZ && memcmp(ndata, ELF_NOTE_MCMODEL_NAME, ELF_NOTE_MCMODEL_NAMESZ) == 0) { ELF_MD_MCMODEL_CHECK(epp, ndesc, np->n_descsz); break; } BADNOTE("mcmodel tag"); goto bad; #endif break; case ELF_NOTE_TYPE_SUSE_VERSION_TAG: break; case ELF_NOTE_TYPE_GO_BUILDID_TAG: break; case ELF_NOTE_TYPE_FDO_PACKAGING_METADATA: break; case ELF_NOTE_TYPE_NETBSD_EMUL_TAG: /* Ancient NetBSD version tag */ break; default: BADNOTE("unknown tag"); bad: #ifdef DIAGNOSTIC /* Ignore GNU tags */ if (np->n_namesz == ELF_NOTE_GNU_NAMESZ && memcmp(ndata, ELF_NOTE_GNU_NAME, ELF_NOTE_GNU_NAMESZ) == 0) break; int ns = (int)np->n_namesz; printf("%s: Unknown elf note type %d (%s): " "[namesz=%d, descsz=%d name=%-*.*s]\n", epp->ep_kname, np->n_type, badnote, np->n_namesz, np->n_descsz, ns, ns, ndata); #endif break; } return isnetbsd; } int netbsd_elf_probe(struct lwp *l, struct exec_package *epp, void *eh, char *itp, vaddr_t *pos) { int error; if ((error = netbsd_elf_signature(l, epp, eh)) != 0) return error; #ifdef ELF_MD_PROBE_FUNC if ((error = ELF_MD_PROBE_FUNC(l, epp, eh, itp, pos)) != 0) return error; #elif defined(ELF_INTERP_NON_RELOCATABLE) *pos = ELF_LINK_ADDR; #endif epp->ep_flags |= EXEC_FORCEAUX; return 0; } void elf_free_emul_arg(void *arg) { struct elf_args *ap = arg; KASSERT(ap != NULL); kmem_free(ap, sizeof(*ap)); }
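/*
 * Illustrative sketch (not part of the NetBSD sources): the note-walking
 * pattern used by netbsd_elf_signature() above.  Each record is a fixed
 * header followed by a name and a descriptor, both padded to a 4-byte
 * boundary; the remaining length is checked before every step, so a
 * truncated or corrupt PT_NOTE segment cannot run the cursor off the end.
 * The nhdr struct and the hand-built buffer are simplified stand-ins for
 * Elf_Nhdr and a real note segment.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct nhdr {
	uint32_t n_namesz;
	uint32_t n_descsz;
	uint32_t n_type;
};

#define ROUNDUP4(x)	(((x) + 3u) & ~3u)

static void
walk_notes(const unsigned char *buf, size_t len)
{
	while (len > 0) {
		struct nhdr np;

		if (len < sizeof(np))			/* truncated header */
			break;
		memcpy(&np, buf, sizeof(np));
		buf += sizeof(np);
		len -= sizeof(np);

		if (len < ROUNDUP4(np.n_namesz))	/* truncated name */
			break;
		printf("note type %u name \"%.*s\" descsz %u\n",
		    (unsigned)np.n_type, (int)np.n_namesz,
		    (const char *)buf, (unsigned)np.n_descsz);
		buf += ROUNDUP4(np.n_namesz);
		len -= ROUNDUP4(np.n_namesz);

		if (len < ROUNDUP4(np.n_descsz))	/* truncated descriptor */
			break;
		buf += ROUNDUP4(np.n_descsz);
		len -= ROUNDUP4(np.n_descsz);
	}
}

int
main(void)
{
	/* One synthetic note: 5-byte name "Demo" padded to 8, 4-byte descriptor. */
	unsigned char buf[sizeof(struct nhdr) + 8 + 4];
	struct nhdr np = { 5, 4, 1 };

	memset(buf, 0, sizeof(buf));
	memcpy(buf, &np, sizeof(np));
	memcpy(buf + sizeof(np), "Demo", 5);
	walk_notes(buf, sizeof(buf));
	return 0;
}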
/* $NetBSD: userret.h,v 1.35 2024/01/28 10:06:19 skrll Exp $ */ /*- * Copyright (c) 1998, 2000, 2003, 2006, 2008, 2019, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum, and Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _SYS_USERRET_H_ #define _SYS_USERRET_H_ #include <sys/lockdebug.h> #include <sys/intr.h> #include <sys/psref.h> /* * Define the MI code needed before returning to user mode, for trap and * syscall. * * We handle "exceptional" events: pending signals, stop/exit actions, etc. * Note that the event must be flagged BEFORE any AST is posted as we are * reading unlocked. */ static __inline void mi_userret(struct lwp *l) { int exception; KPREEMPT_DISABLE(l); KASSERTMSG(l->l_cpu->ci_biglock_count == 0, "kernel_lock leaked"); KASSERT(l->l_blcnt == 0); exception = l->l_cpu->ci_want_resched | (l->l_flag & LW_USERRET); KPREEMPT_ENABLE(l); if (__predict_false(exception)) { lwp_userret(l); } LOCKDEBUG_BARRIER(NULL, 0); KASSERT(l->l_nopreempt == 0); PSREF_DEBUG_BARRIER(); KASSERT(l->l_psrefs == 0); } #endif /* !_SYS_USERRET_H_ */
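The comment above states the whole contract: whoever wants lwp_userret() to run must make the event visible before posting the AST that forces the thread through mi_userret(), because mi_userret() reads the flags without a lock. A generic userspace sketch of that ordering with C11 atomics follows; the flag and function names are illustrative, not NetBSD primitives.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool want_event;    /* stands in for LW_USERRET / ci_want_resched */
static atomic_bool ast_pending;   /* stands in for the posted AST */

static void
post_event(void)
{
        /* Producer: make the event visible BEFORE posting the AST. */
        atomic_store_explicit(&want_event, true, memory_order_release);
        atomic_store_explicit(&ast_pending, true, memory_order_release);
}

static void
userret_path(void)
{
        /* Consumer (the mi_userret() side): an unlocked read is enough,
         * because observing the AST implies observing the event flag. */
        if (atomic_exchange_explicit(&ast_pending, false, memory_order_acquire) &&
            atomic_load_explicit(&want_event, memory_order_relaxed))
                printf("handling deferred event, as lwp_userret() would\n");
}

int
main(void)
{
        post_event();
        userret_path();
        return 0;
}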
/* $NetBSD: bufq_disksort.c,v 1.14 2017/05/04 11:03:27 kamil Exp $ */ /* NetBSD: subr_disk.c,v 1.61 2004/09/25 03:30:44 thorpej Exp */ /*- * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: bufq_disksort.c,v 1.14 2017/05/04 11:03:27 kamil Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/bufq_impl.h> #include <sys/kmem.h> #include <sys/module.h> /* * Seek sort for disks. * * There are actually two queues, sorted in ascendening order. The first * queue holds those requests which are positioned after the current block; * the second holds requests which came in after their position was passed. * Thus we implement a one-way scan, retracting after reaching the end of * the drive to the first request on the second queue, at which time it * becomes the first queue. * * A one-way scan is natural because of the way UNIX read-ahead blocks are * allocated. */ struct bufq_disksort { TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */ }; static void bufq_disksort_init(struct bufq_state *); static void bufq_disksort_put(struct bufq_state *, struct buf *); static struct buf *bufq_disksort_get(struct bufq_state *, int); BUFQ_DEFINE(disksort, 20, bufq_disksort_init); static void bufq_disksort_put(struct bufq_state *bufq, struct buf *bp) { struct bufq_disksort *disksort = bufq_private(bufq); struct buf *bq, *nbq; int sortby; sortby = bufq->bq_flags & BUFQ_SORT_MASK; bq = TAILQ_FIRST(&disksort->bq_head); /* * If the queue is empty it's easy; we just go on the end. */ if (bq == NULL) { TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq); return; } /* * If we lie before the currently active request, then we * must locate the second request list and add ourselves to it. */ if (buf_inorder(bp, bq, sortby)) { while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) { /* * Check for an ``inversion'' in the normally ascending * block numbers, indicating the start of the second * request list. */ if (buf_inorder(nbq, bq, sortby)) { /* * Search the second request list for the first * request at a larger block number. We go * after that; if there is no such request, we * go at the end. */ do { if (buf_inorder(bp, nbq, sortby)) goto insert; bq = nbq; } while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL); goto insert; /* after last */ } bq = nbq; } /* * No inversions... we will go after the last, and * be the first request in the second request list. */ goto insert; } /* * Request is at/after the current request... * sort in the first request list. */ while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) { /* * We want to go after the current request if there is an * inversion after it (i.e. 
it is the end of the first * request list), or if the next request is a larger cylinder * than our request. */ if (buf_inorder(nbq, bq, sortby) || buf_inorder(bp, nbq, sortby)) goto insert; bq = nbq; } /* * Neither a second list nor a larger request... we go at the end of * the first list, which is the same as the end of the whole schebang. */ insert: TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq); } static struct buf * bufq_disksort_get(struct bufq_state *bufq, int remove) { struct bufq_disksort *disksort = bufq_private(bufq); struct buf *bp; bp = TAILQ_FIRST(&disksort->bq_head); if (bp != NULL && remove) TAILQ_REMOVE(&disksort->bq_head, bp, b_actq); return (bp); } static struct buf * bufq_disksort_cancel(struct bufq_state *bufq, struct buf *buf) { struct bufq_disksort *disksort = bufq_private(bufq); struct buf *bq; TAILQ_FOREACH(bq, &disksort->bq_head, b_actq) { if (bq == buf) { TAILQ_REMOVE(&disksort->bq_head, bq, b_actq); return buf; } } return NULL; } static void bufq_disksort_fini(struct bufq_state *bufq) { KASSERT(bufq->bq_private != NULL); kmem_free(bufq->bq_private, sizeof(struct bufq_disksort)); } static void bufq_disksort_init(struct bufq_state *bufq) { struct bufq_disksort *disksort; disksort = kmem_zalloc(sizeof(*disksort), KM_SLEEP); bufq->bq_private = disksort; bufq->bq_get = bufq_disksort_get; bufq->bq_put = bufq_disksort_put; bufq->bq_cancel = bufq_disksort_cancel; bufq->bq_fini = bufq_disksort_fini; TAILQ_INIT(&disksort->bq_head); } MODULE(MODULE_CLASS_BUFQ, bufq_disksort, NULL); static int bufq_disksort_modcmd(modcmd_t cmd, void *opaque) { switch (cmd) { case MODULE_CMD_INIT: return bufq_register(&bufq_strat_disksort); case MODULE_CMD_FINI: return bufq_unregister(&bufq_strat_disksort); default: return ENOTTY; } }
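A standalone sketch of the insertion rule implemented by bufq_disksort_put() above, using bare block numbers in place of struct buf and buf_inorder(). It reproduces the invariant described in the header comment: one ascending run of requests at or beyond the current position, followed by a second ascending run of requests that have already been passed.

#include <stdio.h>
#include <string.h>

/* Insert blk into q[0..n) following the disksort rule; return the new length. */
static int
disksort_insert(int q[], int n, int blk)
{
        int i = 0, pos;

        if (n == 0) {
                q[0] = blk;
                return 1;
        }
        if (blk < q[0]) {
                /* Before the active request: skip the first (ascending) run... */
                while (i + 1 < n && q[i + 1] >= q[i])
                        i++;
                /* ...then do an ordinary sorted insert into the second run. */
                while (i + 1 < n && q[i + 1] <= blk)
                        i++;
                pos = i + 1;
        } else {
                /* At/after the active request: stay in the first run, stopping
                 * at an inversion or at the first larger block number. */
                while (i + 1 < n && q[i + 1] >= q[i] && q[i + 1] <= blk)
                        i++;
                pos = i + 1;
        }
        memmove(&q[pos + 1], &q[pos], (n - pos) * sizeof(q[0]));
        q[pos] = blk;
        return n + 1;
}

int
main(void)
{
        int q[16], n = 0, i;
        const int blocks[] = { 50, 20, 75, 60, 10, 55, 30 };

        for (i = 0; i < (int)(sizeof(blocks) / sizeof(blocks[0])); i++)
                n = disksort_insert(q, n, blocks[i]);
        for (i = 0; i < n; i++)
                printf("%d ", q[i]);
        printf("\n");   /* prints: 50 55 60 75 10 20 30 (the one-way scan order) */
        return 0;
}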
/* $NetBSD: pckbc.c,v 1.65 2022/11/17 23:57:20 riastradh Exp $ */ /* * Copyright (c) 2004 Ben Harris. * Copyright (c) 1998 * Matthias Drochner. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: pckbc.c,v 1.65 2022/11/17 23:57:20 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/device.h> #include <sys/malloc.h> #include <sys/errno.h> #include <sys/queue.h> #include <sys/bus.h> #include <dev/ic/i8042reg.h> #include <dev/ic/pckbcvar.h> #include <dev/pckbport/pckbportvar.h> #include "locators.h" #include <sys/rndsource.h> /* data per slave device */ struct pckbc_slotdata { int polling; /* don't process data in interrupt handler */ int poll_data; /* data read from inr handler if polling */ int poll_stat; /* status read from inr handler if polling */ krndsource_t rnd_source; }; static void pckbc_init_slotdata(struct pckbc_slotdata *); static int pckbc_attach_slot(struct pckbc_softc *, pckbc_slot_t); struct pckbc_internal pckbc_consdata; int pckbc_console_attached; static int pckbc_console; static struct pckbc_slotdata pckbc_cons_slotdata; static int pckbc_xt_translation(void *, pckbport_slot_t, int); static int pckbc_send_devcmd(void *, pckbport_slot_t, u_char); static void pckbc_slot_enable(void *, pckbport_slot_t, int); static void pckbc_intr_establish(void *, pckbport_slot_t); static void pckbc_set_poll(void *, pckbc_slot_t, int on); static int pckbc_wait_output(bus_space_tag_t, bus_space_handle_t); static int pckbc_get8042cmd(struct pckbc_internal *); static int pckbc_put8042cmd(struct pckbc_internal *); void pckbc_cleanqueue(struct pckbc_slotdata *); void pckbc_cleanup(void *); int pckbc_cmdresponse(struct pckbc_internal *, pckbc_slot_t, u_char); void pckbc_start(struct pckbc_internal *, pckbc_slot_t); const char * const pckbc_slot_names[] = { "kbd", "aux" }; static const struct pckbport_accessops pckbc_ops = { .t_xt_translation = pckbc_xt_translation, .t_send_devcmd = pckbc_send_devcmd, .t_poll_data1 = pckbc_poll_data1, .t_slot_enable = pckbc_slot_enable, .t_intr_establish = pckbc_intr_establish, .t_set_poll = pckbc_set_poll, }; #define KBD_DELAY DELAY(8) static inline int pckbc_wait_output(bus_space_tag_t iot, bus_space_handle_t ioh_c) { u_int i; for (i = 100000; i; i--) if (!(bus_space_read_1(iot, ioh_c, 0) & KBS_IBF)) { KBD_DELAY; return (1); } return (0); } int pckbc_send_cmd(bus_space_tag_t iot, bus_space_handle_t ioh_c, u_char val) { if (!pckbc_wait_output(iot, ioh_c)) return (0); bus_space_write_1(iot, ioh_c, 0, val); return (1); } /* * Note: the spl games here are to deal with some strange PC kbd controllers * in some system configurations. * This is not canonical way to handle polling input. 
*/ int pckbc_poll_data1(void *pt, pckbc_slot_t slot) { struct pckbc_internal *t = pt; struct pckbc_slotdata *q = t->t_slotdata[slot]; int s; u_char stat, c; int i = 100; /* polls for ~100ms */ int checkaux = t->t_haveaux; s = splhigh(); if (q && q->polling && q->poll_data != -1 && q->poll_stat != -1) { stat = q->poll_stat; c = q->poll_data; q->poll_data = -1; q->poll_stat = -1; goto process; } for (; i; i--, delay(1000)) { stat = bus_space_read_1(t->t_iot, t->t_ioh_c, 0); if (stat & KBS_DIB) { c = bus_space_read_1(t->t_iot, t->t_ioh_d, 0); process: if (checkaux && (stat & 0x20)) { /* aux data */ if (slot != PCKBC_AUX_SLOT) { #ifdef PCKBCDEBUG printf("pckbc: lost aux 0x%x\n", c); #endif continue; } } else { if (slot == PCKBC_AUX_SLOT) { #ifdef PCKBCDEBUG printf("pckbc: lost kbd 0x%x\n", c); #endif continue; } } splx(s); return (c); } } splx(s); return (-1); } /* * Get the current command byte. */ static int pckbc_get8042cmd(struct pckbc_internal *t) { bus_space_tag_t iot = t->t_iot; bus_space_handle_t ioh_c = t->t_ioh_c; int data; if (!pckbc_send_cmd(iot, ioh_c, K_RDCMDBYTE)) return (0); data = pckbc_poll_data1(t, PCKBC_KBD_SLOT); if (data == -1) return (0); t->t_cmdbyte = data; return (1); } /* * Pass command byte to keyboard controller (8042). */ static int pckbc_put8042cmd(struct pckbc_internal *t) { bus_space_tag_t iot = t->t_iot; bus_space_handle_t ioh_d = t->t_ioh_d; bus_space_handle_t ioh_c = t->t_ioh_c; if (!pckbc_send_cmd(iot, ioh_c, K_LDCMDBYTE)) return (0); if (!pckbc_wait_output(iot, ioh_c)) return (0); bus_space_write_1(iot, ioh_d, 0, t->t_cmdbyte); return (1); } static int pckbc_send_devcmd(void *pt, pckbc_slot_t slot, u_char val) { struct pckbc_internal *t = pt; bus_space_tag_t iot = t->t_iot; bus_space_handle_t ioh_d = t->t_ioh_d; bus_space_handle_t ioh_c = t->t_ioh_c; if (slot == PCKBC_AUX_SLOT) { if (!pckbc_send_cmd(iot, ioh_c, KBC_AUXWRITE)) return (0); } if (!pckbc_wait_output(iot, ioh_c)) return (0); bus_space_write_1(iot, ioh_d, 0, val); return (1); } int pckbc_is_console(bus_space_tag_t iot, bus_addr_t addr) { if (pckbc_console && !pckbc_console_attached && bus_space_is_equal(pckbc_consdata.t_iot, iot) && pckbc_consdata.t_addr == addr) return (1); return (0); } static int pckbc_attach_slot(struct pckbc_softc *sc, pckbc_slot_t slot) { struct pckbc_internal *t = sc->id; void *sdata; device_t child; int alloced = 0; if (t->t_slotdata[slot] == NULL) { sdata = malloc(sizeof(struct pckbc_slotdata), M_DEVBUF, M_WAITOK); t->t_slotdata[slot] = sdata; pckbc_init_slotdata(t->t_slotdata[slot]); alloced++; } child = pckbport_attach_slot(sc->sc_dv, t->t_pt, slot); if (child == NULL && alloced) { free(t->t_slotdata[slot], M_DEVBUF); t->t_slotdata[slot] = NULL; } if (child != NULL && t->t_slotdata[slot] != NULL) { memset(&t->t_slotdata[slot]->rnd_source, 0, sizeof(t->t_slotdata[slot]->rnd_source)); rnd_attach_source(&t->t_slotdata[slot]->rnd_source, device_xname(child), RND_TYPE_TTY, RND_FLAG_DEFAULT); } return child != NULL; } void pckbc_attach(struct pckbc_softc *sc) { struct pckbc_internal *t; bus_space_tag_t iot; bus_space_handle_t ioh_d, ioh_c; int res; u_char cmdbits = 0; t = sc->id; iot = t->t_iot; ioh_d = t->t_ioh_d; ioh_c = t->t_ioh_c; t->t_pt = pckbport_attach(t, &pckbc_ops); if (t->t_pt == NULL) { aprint_error(": attach failed\n"); return; } /* flush */ (void) pckbc_poll_data1(t, PCKBC_KBD_SLOT); /* set initial cmd byte */ if (!pckbc_put8042cmd(t)) { aprint_error("pckbc: cmd word write error\n"); return; } /* * XXX Don't check the keyboard port. 
There are broken keyboard controllers * which don't pass the test but work normally otherwise. */ #if 0 /* * check kbd port ok */ if (!pckbc_send_cmd(iot, ioh_c, KBC_KBDTEST)) return; res = pckbc_poll_data1(t, PCKBC_KBD_SLOT); /* * Normally, we should get a "0" here. * But there are keyboard controllers behaving differently. */ if (!(res == 0 || res == 0xfa || res == 0x01 || res == 0xab)) { printf("pckbc: kbd port test: %x\n", res); return; } #ifdef PCKBCDEBUG if (res != 0) printf("pckbc: returned %x on kbd slot test\n", res); #endif #endif /* 0 */ if (pckbc_attach_slot(sc, PCKBC_KBD_SLOT)) cmdbits |= KC8_KENABLE; /* * Check aux port ok. * Avoid KBC_AUXTEST because it hangs some older controllers * (eg UMC880?). */ if (!pckbc_send_cmd(iot, ioh_c, KBC_AUXECHO)) { aprint_error("pckbc: aux echo error 1\n"); goto nomouse; } if (!pckbc_wait_output(iot, ioh_c)) { aprint_error("pckbc: aux echo error 2\n"); goto nomouse; } t->t_haveaux = 1; bus_space_write_1(iot, ioh_d, 0, 0x5a); /* a random value */ res = pckbc_poll_data1(t, PCKBC_AUX_SLOT); /* * The following is needed to find the aux port on the Tadpole * SPARCle. */ if (res == -1 && ISSET(t->t_flags, PCKBC_NEED_AUXWRITE)) { /* Read of aux echo timed out, try again */ if (!pckbc_send_cmd(iot, ioh_c, KBC_AUXWRITE)) goto nomouse; if (!pckbc_wait_output(iot, ioh_c)) goto nomouse; bus_space_write_1(iot, ioh_d, 0, 0x5a); res = pckbc_poll_data1(t, PCKBC_AUX_SLOT); } if (res != -1) { /* * In most cases, the 0x5a gets echoed. * Some older controllers (Gateway 2000 circa 1993) * return 0xfe here. * We are satisfied if there is anything in the * aux output buffer. */ if (pckbc_attach_slot(sc, PCKBC_AUX_SLOT)) cmdbits |= KC8_MENABLE; } else { #ifdef PCKBCDEBUG printf("pckbc: aux echo test failed\n"); #endif t->t_haveaux = 0; } nomouse: /* enable needed interrupts */ t->t_cmdbyte |= cmdbits; if (!pckbc_put8042cmd(t)) aprint_error("pckbc: cmd word write error\n"); } static void pckbc_init_slotdata(struct pckbc_slotdata *q) { q->polling = 0; } /* * switch scancode translation on / off * return nonzero on success */ static int pckbc_xt_translation(void *self, pckbc_slot_t slot, int on) { struct pckbc_internal *t = self; int ison; if (ISSET(t->t_flags, PCKBC_CANT_TRANSLATE)) return (-1); if (slot != PCKBC_KBD_SLOT) { /* translation only for kbd slot */ if (on) return (0); else return (1); } ison = t->t_cmdbyte & KC8_TRANS; if ((on && ison) || (!on && !ison)) return (1); t->t_cmdbyte ^= KC8_TRANS; if (!pckbc_put8042cmd(t)) return (0); /* read back to be sure */ if (!pckbc_get8042cmd(t)) return (0); ison = t->t_cmdbyte & KC8_TRANS; if ((on && ison) || (!on && !ison)) return (1); return (0); } static const struct pckbc_portcmd { u_char cmd_en, cmd_dis; } pckbc_portcmd[2] = { { KBC_KBDENABLE, KBC_KBDDISABLE, }, { KBC_AUXENABLE, KBC_AUXDISABLE, } }; void pckbc_slot_enable(void *self, pckbc_slot_t slot, int on) { struct pckbc_internal *t = (struct pckbc_internal *)self; const struct pckbc_portcmd *cmd; cmd = &pckbc_portcmd[slot]; if (!pckbc_send_cmd(t->t_iot, t->t_ioh_c, on ? cmd->cmd_en : cmd->cmd_dis)) printf("pckbc: pckbc_slot_enable(%d) failed\n", on); } static void pckbc_set_poll(void *self, pckbc_slot_t slot, int on) { struct pckbc_internal *t = (struct pckbc_internal *)self; t->t_slotdata[slot]->polling = on; if (on) { t->t_slotdata[slot]->poll_data = -1; t->t_slotdata[slot]->poll_stat = -1; } else { int s; /* * If disabling polling on a device that's been configured, * make sure there are no bytes left in the FIFO, holding up * the interrupt line. 
Otherwise we won't get any further * interrupts. */ if (t->t_sc) { s = spltty(); pckbcintr(t->t_sc); splx(s); } } } static void pckbc_intr_establish(void *pt, pckbport_slot_t slot) { struct pckbc_internal *t = pt; (*t->t_sc->intr_establish)(t->t_sc, slot); } int pckbcintr_hard(void *vsc) { struct pckbc_softc *sc = (struct pckbc_softc *)vsc; struct pckbc_internal *t = sc->id; u_char stat; pckbc_slot_t slot; struct pckbc_slotdata *q; int served = 0, data, next, s; for(;;) { stat = bus_space_read_1(t->t_iot, t->t_ioh_c, 0); if (!(stat & KBS_DIB)) break; served = 1; slot = (t->t_haveaux && (stat & 0x20)) ? PCKBC_AUX_SLOT : PCKBC_KBD_SLOT; q = t->t_slotdata[slot]; if (!q) { /* XXX do something for live insertion? */ printf("pckbc: no dev for slot %d\n", slot); (void) bus_space_read_1(t->t_iot, t->t_ioh_d, 0); continue; } data = bus_space_read_1(t->t_iot, t->t_ioh_d, 0); rnd_add_uint32(&q->rnd_source, (stat<<8)|data); if (q->polling) { q->poll_data = data; q->poll_stat = stat; break; /* pckbc_poll_data() will get it */ } #if 0 /* XXXBJH */ if (CMD_IN_QUEUE(q) && pckbc_cmdresponse(t, slot, data)) continue; #endif s = splhigh(); next = (t->rbuf_write+1) % PCKBC_RBUF_SIZE; if (next == t->rbuf_read) { splx(s); break; } t->rbuf[t->rbuf_write].data = data; t->rbuf[t->rbuf_write].slot = slot; t->rbuf_write = next; splx(s); } return (served); } void pckbcintr_soft(void *vsc) { struct pckbc_softc *sc = vsc; struct pckbc_internal *t = sc->id; int data, slot, s; #ifndef __GENERIC_SOFT_INTERRUPTS_ALL_LEVELS int st; st = spltty(); #endif s = splhigh(); while (t->rbuf_read != t->rbuf_write) { slot = t->rbuf[t->rbuf_read].slot; data = t->rbuf[t->rbuf_read].data; t->rbuf_read = (t->rbuf_read+1) % PCKBC_RBUF_SIZE; splx(s); pckbportintr(t->t_pt, slot, data); s = splhigh(); } splx(s); #ifndef __GENERIC_SOFT_INTERRUPTS_ALL_LEVELS splx(st); #endif } int pckbcintr(void *vsc) { struct pckbc_softc *sc = (struct pckbc_softc *)vsc; struct pckbc_internal *t = sc->id; u_char stat; pckbc_slot_t slot; struct pckbc_slotdata *q; int served = 0, data; for(;;) { stat = bus_space_read_1(t->t_iot, t->t_ioh_c, 0); if (!(stat & KBS_DIB)) break; slot = (t->t_haveaux && (stat & 0x20)) ? PCKBC_AUX_SLOT : PCKBC_KBD_SLOT; q = t->t_slotdata[slot]; if (q != NULL && q->polling) return 0; served = 1; data = bus_space_read_1(t->t_iot, t->t_ioh_d, 0); if (q != NULL) rnd_add_uint32(&q->rnd_source, (stat<<8)|data); pckbportintr(t->t_pt, slot, data); } return (served); } int pckbc_cnattach(bus_space_tag_t iot, bus_addr_t addr, bus_size_t cmd_offset, pckbc_slot_t slot, int flags) { bus_space_handle_t ioh_d, ioh_c; #ifdef PCKBC_CNATTACH_SELFTEST int reply; #endif int res = 0; if (bus_space_map(iot, addr + KBDATAP, 1, 0, &ioh_d)) return (ENXIO); if (bus_space_map(iot, addr + cmd_offset, 1, 0, &ioh_c)) { bus_space_unmap(iot, ioh_d, 1); return (ENXIO); } memset(&pckbc_consdata, 0, sizeof(pckbc_consdata)); pckbc_consdata.t_iot = iot; pckbc_consdata.t_ioh_d = ioh_d; pckbc_consdata.t_ioh_c = ioh_c; pckbc_consdata.t_addr = addr; pckbc_consdata.t_flags = flags; callout_init(&pckbc_consdata.t_cleanup, 0); /* flush */ (void) pckbc_poll_data1(&pckbc_consdata, PCKBC_KBD_SLOT); #ifdef PCKBC_CNATTACH_SELFTEST /* * In some machines (e.g. netwinder) pckbc refuses to talk at * all until we request a self-test. 
*/ if (!pckbc_send_cmd(iot, ioh_c, KBC_SELFTEST)) { printf("pckbc: unable to request selftest\n"); res = EIO; goto out; } reply = pckbc_poll_data1(&pckbc_consdata, PCKBC_KBD_SLOT); if (reply != 0x55) { printf("pckbc: selftest returned 0x%02x\n", reply); res = EIO; goto out; } #endif /* PCKBC_CNATTACH_SELFTEST */ /* init cmd byte, enable ports */ pckbc_consdata.t_cmdbyte = KC8_CPU; if (!pckbc_put8042cmd(&pckbc_consdata)) { printf("pckbc: cmd word write error\n"); res = EIO; goto out; } res = pckbport_cnattach(&pckbc_consdata, &pckbc_ops, slot); out: if (res) { bus_space_unmap(iot, pckbc_consdata.t_ioh_d, 1); bus_space_unmap(iot, pckbc_consdata.t_ioh_c, 1); } else { pckbc_consdata.t_slotdata[slot] = &pckbc_cons_slotdata; pckbc_init_slotdata(&pckbc_cons_slotdata); pckbc_console = 1; } return (res); } bool pckbc_resume(device_t dv, const pmf_qual_t *qual) { struct pckbc_softc *sc = device_private(dv); struct pckbc_internal *t; t = sc->id; (void)pckbc_poll_data1(t, PCKBC_KBD_SLOT); if (!pckbc_send_cmd(t->t_iot, t->t_ioh_c, KBC_SELFTEST)) return false; (void)pckbc_poll_data1(t, PCKBC_KBD_SLOT); (void)pckbc_put8042cmd(t); pckbcintr(t->t_sc); return true; }
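The polling helpers above (pckbc_wait_output() and pckbc_poll_data1()) implement the usual i8042 handshake: wait for the input-buffer-full bit to clear before writing, and for the output-buffer-full bit to be set before reading. A self-contained sketch of that handshake follows; the register accessors and the toy register model are invented for illustration, and the status-bit values are the conventional i8042 ones rather than the KBS_* macros from i8042reg.h.

#include <stdint.h>
#include <stdio.h>

/* Toy register model so the sketch runs; a real driver uses bus_space_read_1()
 * and bus_space_write_1() on the controller's command/status and data ports. */
static uint8_t fake_status = 0x01;      /* pretend one byte is waiting */
static uint8_t fake_data = 0xfa;        /* ACK */

static uint8_t kbc_read_status(void) { return fake_status; }
static uint8_t kbc_read_data(void)   { fake_status = 0; return fake_data; }

#define STAT_OBF 0x01   /* output buffer full: a byte is ready for the host */
#define STAT_IBF 0x02   /* input buffer full: controller not ready for a write */

/* Same handshake as pckbc_wait_output(): spin until the controller will
 * accept another byte. */
static int
wait_output(void)
{
        for (int i = 100000; i > 0; i--)
                if ((kbc_read_status() & STAT_IBF) == 0)
                        return 1;
        return 0;                       /* timed out */
}

/* Same idea as the polling loop in pckbc_poll_data1(), minus the kbd/aux
 * routing: poll the status register until a byte shows up, then read it. */
static int
poll_data(void)
{
        for (int i = 100; i > 0; i--) {
                if (kbc_read_status() & STAT_OBF)
                        return kbc_read_data();
                /* the driver delays ~1ms per iteration here */
        }
        return -1;
}

int
main(void)
{
        if (wait_output())
                printf("controller ready, polled byte 0x%02x\n", poll_data());
        return 0;
}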
/* $NetBSD: vfs_hooks.c,v 1.6 2009/03/15 17:14:40 cegger Exp $ */ /*- * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * VFS hooks. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_hooks.c,v 1.6 2009/03/15 17:14:40 cegger Exp $"); #include <sys/param.h> #include <sys/queue.h> #include <sys/mount.h> #include <sys/mutex.h> LIST_HEAD(vfs_hooks_head, vfs_hooks) vfs_hooks_head = LIST_HEAD_INITIALIZER(vfs_hooks_head); kmutex_t vfs_hooks_lock; void vfs_hooks_init(void) { mutex_init(&vfs_hooks_lock, MUTEX_DEFAULT, IPL_NONE); } int vfs_hooks_attach(struct vfs_hooks *vfs_hooks) { mutex_enter(&vfs_hooks_lock); LIST_INSERT_HEAD(&vfs_hooks_head, vfs_hooks, vfs_hooks_list); mutex_exit(&vfs_hooks_lock); return (0); } int vfs_hooks_detach(struct vfs_hooks *vfs_hooks) { struct vfs_hooks *hp; int ret = 0; mutex_enter(&vfs_hooks_lock); LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) { if (hp == vfs_hooks) { LIST_REMOVE(hp, vfs_hooks_list); break; } } if (hp == NULL) ret = ESRCH; mutex_exit(&vfs_hooks_lock); return (ret); } /* * Macro to be used in one of the vfs_hooks_* functions for hooks that * return an error code. Calls will stop as soon as one of the hooks * fails. */ #define VFS_HOOKS_W_ERROR(func, fargs, hook, hargs) \ int \ func fargs \ { \ int error; \ struct vfs_hooks *hp; \ \ error = EJUSTRETURN; \ \ mutex_enter(&vfs_hooks_lock); \ LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) { \ if (hp-> hook != NULL) { \ error = hp-> hook hargs; \ if (error != 0) \ break; \ } \ } \ mutex_exit(&vfs_hooks_lock); \ \ return error; \ } /* * Macro to be used in one of the vfs_hooks_* functions for hooks that * do not return any error code.
All hooks will be executed * unconditionally. */ #define VFS_HOOKS_WO_ERROR(func, fargs, hook, hargs) \ void \ func fargs \ { \ struct vfs_hooks *hp; \ \ mutex_enter(&vfs_hooks_lock); \ LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) { \ if (hp-> hook != NULL) \ hp-> hook hargs; \ } \ mutex_exit(&vfs_hooks_lock); \ } /* * Routines to iterate over VFS hooks lists and execute them. */ VFS_HOOKS_WO_ERROR(vfs_hooks_unmount, (struct mount *mp), vh_unmount, (mp)); VFS_HOOKS_W_ERROR(vfs_hooks_reexport, (struct mount *mp, const char *path, void *data), vh_reexport, (mp, path, data));
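A short usage sketch for the API above: a file system registers a struct vfs_hooks whose non-NULL members are called by the generated iterators. The "examplefs" names are hypothetical; the field and function names come from this file and are assumed to be declared via <sys/mount.h>.

#include <sys/param.h>
#include <sys/mount.h>

/* Hypothetical per-mount cleanup for an example file system. */
static void
examplefs_unmount_hook(struct mount *mp)
{
        /* tear down any per-mount state kept outside the file system proper */
}

static struct vfs_hooks examplefs_vfs_hooks = {
        .vh_unmount = examplefs_unmount_hook,
        /* hooks left NULL are simply skipped by the iterators */
};

static void
examplefs_hooks_init(void)
{
        vfs_hooks_attach(&examplefs_vfs_hooks);
}

static void
examplefs_hooks_fini(void)
{
        vfs_hooks_detach(&examplefs_vfs_hooks);
}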
/* $NetBSD: kern_rate.c,v 1.2 2012/12/12 11:10:56 pooka Exp $ */ /*- * Copyright (c) 2000, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christopher G. Demetriou. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_rate.c,v 1.2 2012/12/12 11:10:56 pooka Exp $"); #include <sys/param.h> #include <sys/time.h> /* * ratecheck(): simple time-based rate-limit checking. see ratecheck(9) * for usage and rationale. */ int ratecheck(struct timeval *lasttime, const struct timeval *mininterval) { struct timeval tv, delta; int rv = 0; getmicrouptime(&tv); timersub(&tv, lasttime, &delta); /* * the check for 0,0 is so that the message will be seen at least once, * even if the interval is huge. */ if (timercmp(&delta, mininterval, >=) || (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) { *lasttime = tv; rv = 1; } return (rv); } /* * ppsratecheck(): packets (or events) per second limitation. */ int ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps) { struct timeval tv, delta; int rv; getmicrouptime(&tv); timersub(&tv, lasttime, &delta); /* * the check for 0,0 is so that the message will be seen at least once. * if more than one second has passed since the last update of * lasttime, reset the counter. * * we do increment *curpps even in the *curpps < maxpps case, as some may * try to use *curpps for stat purposes as well. */ if ((lasttime->tv_sec == 0 && lasttime->tv_usec == 0) || delta.tv_sec >= 1) { *lasttime = tv; *curpps = 0; } if (maxpps < 0) rv = 1; else if (*curpps < maxpps) rv = 1; else rv = 0; #if 1 /*DIAGNOSTIC?*/ /* be careful about wrap-around */ if (__predict_true(*curpps != INT_MAX)) *curpps = *curpps + 1; #else /* * assume that there are not too many calls to this function. * it is not clear the assumption holds, as it depends on the *caller's* * behavior, not the behavior of this function. * IMHO it is wrong to make assumptions about the caller's behavior, * so the above #if is #if 1, not #ifdef DIAGNOSTIC. */ *curpps = *curpps + 1; #endif return (rv); }
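A usage sketch for the two functions above, in the style described by ratecheck(9): ratecheck() enforces a minimum interval between messages, while ppsratecheck() caps the number of events reported per second. The example_* callers are hypothetical.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>

/* At most one message every 10 seconds, however often the error occurs. */
static struct timeval error_lasttime;
static const struct timeval error_interval = { 10, 0 };

static void
example_report_error(int code)
{
        if (ratecheck(&error_lasttime, &error_interval))
                printf("example: device error %d (further messages suppressed)\n",
                    code);
}

/* At most 5 log lines per second; drop_count still counts every event seen
 * in the current second, even past the limit. */
static struct timeval drop_lasttime;
static int drop_count;

static void
example_report_drop(void)
{
        if (ppsratecheck(&drop_lasttime, &drop_count, 5))
                printf("example: packet dropped\n");
}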
/* $NetBSD: prop_kern.c,v 1.25 2022/08/03 21:13:46 riastradh Exp $ */ /*- * Copyright (c) 2006, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #if defined(__NetBSD__) #include <sys/types.h> #include <sys/ioctl.h> #include <prop/proplib.h> #if !defined(_KERNEL) && !defined(_STANDALONE) #include <sys/mman.h> #include <errno.h> #include <string.h> #include <stdlib.h> #include <stdio.h> #ifdef RUMP_ACTION #include <rump/rump_syscalls.h> #define ioctl(a,b,c) rump_sys_ioctl(a,b,c) #endif static int _prop_object_externalize_to_pref(prop_object_t obj, struct plistref *pref, char **bufp) { char *buf; switch (prop_object_type(obj)) { case PROP_TYPE_DICTIONARY: buf = prop_dictionary_externalize(obj); break; case PROP_TYPE_ARRAY: buf = prop_array_externalize(obj); break; default: return (ENOTSUP); } if (buf == NULL) { /* Assume we ran out of memory. */ return (ENOMEM); } pref->pref_plist = buf; pref->pref_len = strlen(buf) + 1; *bufp = buf; return (0); } bool prop_array_externalize_to_pref(prop_array_t array, struct plistref *prefp) { char *buf; int rv; rv = _prop_object_externalize_to_pref(array, prefp, &buf); if (rv != 0) errno = rv; /* pass up error value in errno */ return (rv == 0); } /* * prop_array_externalize_to_pref -- * Externalize an array into a plistref for sending to the kernel. */ int prop_array_send_syscall(prop_array_t array, struct plistref *prefp) { if (prop_array_externalize_to_pref(array, prefp)) return 0; else return errno; } bool prop_dictionary_externalize_to_pref(prop_dictionary_t dict, struct plistref *prefp) { char *buf; int rv; rv = _prop_object_externalize_to_pref(dict, prefp, &buf); if (rv != 0) errno = rv; /* pass up error value in errno */ return (rv == 0); } /* * prop_dictionary_externalize_to_pref -- * Externalize an dictionary into a plistref for sending to the kernel. */ int prop_dictionary_send_syscall(prop_dictionary_t dict, struct plistref *prefp) { if (prop_dictionary_externalize_to_pref(dict, prefp)) return 0; else return errno; } static int _prop_object_send_ioctl(prop_object_t obj, int fd, unsigned long cmd) { struct plistref pref; char *buf; int error; error = _prop_object_externalize_to_pref(obj, &pref, &buf); if (error) return (error); if (ioctl(fd, cmd, &pref) == -1) error = errno; else error = 0; free(buf); return (error); } /* * prop_array_send_ioctl -- * Send an array to the kernel using the specified ioctl. */ int prop_array_send_ioctl(prop_array_t array, int fd, unsigned long cmd) { int rv; rv = _prop_object_send_ioctl(array, fd, cmd); if (rv != 0) { errno = rv; /* pass up error value in errno */ return rv; } else return 0; } /* * prop_dictionary_send_ioctl -- * Send a dictionary to the kernel using the specified ioctl. 
*/ int prop_dictionary_send_ioctl(prop_dictionary_t dict, int fd, unsigned long cmd) { int rv; rv = _prop_object_send_ioctl(dict, fd, cmd); if (rv != 0) { errno = rv; /* pass up error value in errno */ return rv; } else return 0; } static int _prop_object_internalize_from_pref(const struct plistref *pref, prop_type_t type, prop_object_t *objp) { prop_object_t obj = NULL; char *buf; int error = 0; if (pref->pref_len == 0) { /* * This should never happen; we should always get the XML * for an empty dictionary if it's really empty. */ error = EIO; goto out; } else { buf = pref->pref_plist; buf[pref->pref_len - 1] = '\0'; /* extra insurance */ switch (type) { case PROP_TYPE_DICTIONARY: obj = prop_dictionary_internalize(buf); break; case PROP_TYPE_ARRAY: obj = prop_array_internalize(buf); break; default: error = ENOTSUP; } (void) munmap(buf, pref->pref_len); if (obj == NULL && error == 0) error = EIO; } out: if (error == 0) *objp = obj; return (error); } /* * prop_array_internalize_from_pref -- * Internalize a pref into a prop_array_t object. */ bool prop_array_internalize_from_pref(const struct plistref *prefp, prop_array_t *arrayp) { int rv; rv = _prop_object_internalize_from_pref(prefp, PROP_TYPE_ARRAY, (prop_object_t *)arrayp); if (rv != 0) errno = rv; /* pass up error value in errno */ return (rv == 0); } /* * prop_array_recv_syscall -- * Internalize an array received from the kernel as pref. */ int prop_array_recv_syscall(const struct plistref *prefp, prop_array_t *arrayp) { if (prop_array_internalize_from_pref(prefp, arrayp)) return 0; else return errno; } /* * prop_dictionary_internalize_from_pref -- * Internalize a pref into a prop_dictionary_t object. */ bool prop_dictionary_internalize_from_pref(const struct plistref *prefp, prop_dictionary_t *dictp) { int rv; rv = _prop_object_internalize_from_pref(prefp, PROP_TYPE_DICTIONARY, (prop_object_t *)dictp); if (rv != 0) errno = rv; /* pass up error value in errno */ return (rv == 0); } /* * prop_dictionary_recv_syscall -- * Internalize a dictionary received from the kernel as pref. */ int prop_dictionary_recv_syscall(const struct plistref *prefp, prop_dictionary_t *dictp) { if (prop_dictionary_internalize_from_pref(prefp, dictp)) return 0; else return errno; } /* * prop_array_recv_ioctl -- * Receive an array from the kernel using the specified ioctl. */ int prop_array_recv_ioctl(int fd, unsigned long cmd, prop_array_t *arrayp) { int rv; struct plistref pref; rv = ioctl(fd, cmd, &pref); if (rv == -1) return errno; rv = _prop_object_internalize_from_pref(&pref, PROP_TYPE_ARRAY, (prop_object_t *)arrayp); if (rv != 0) { errno = rv; /* pass up error value in errno */ return rv; } else return 0; } /* * prop_dictionary_recv_ioctl -- * Receive a dictionary from the kernel using the specified ioctl. */ int prop_dictionary_recv_ioctl(int fd, unsigned long cmd, prop_dictionary_t *dictp) { int rv; struct plistref pref; rv = ioctl(fd, cmd, &pref); if (rv == -1) return errno; rv = _prop_object_internalize_from_pref(&pref, PROP_TYPE_DICTIONARY, (prop_object_t *)dictp); if (rv != 0) { errno = rv; /* pass up error value in errno */ return rv; } else return 0; } /* * prop_dictionary_sendrecv_ioctl -- * Combination send/receive a dictionary to/from the kernel using * the specified ioctl. 
*/ int prop_dictionary_sendrecv_ioctl(prop_dictionary_t dict, int fd, unsigned long cmd, prop_dictionary_t *dictp) { struct plistref pref; char *buf; int error; error = _prop_object_externalize_to_pref(dict, &pref, &buf); if (error != 0) { errno = error; return error; } if (ioctl(fd, cmd, &pref) == -1) error = errno; else error = 0; free(buf); if (error != 0) return error; error = _prop_object_internalize_from_pref(&pref, PROP_TYPE_DICTIONARY, (prop_object_t *)dictp); if (error != 0) { errno = error; /* pass up error value in errno */ return error; } else return 0; } #endif /* !_KERNEL && !_STANDALONE */ #if defined(_KERNEL) #include <sys/param.h> #include <sys/mman.h> #include <sys/errno.h> #include <sys/malloc.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/resource.h> #include <sys/pool.h> #include <uvm/uvm_extern.h> #include "prop_object_impl.h" /* Arbitrary limit ioctl input to 128KB */ unsigned int prop_object_copyin_limit = 128 * 1024; /* initialize proplib for use in the kernel */ void prop_kern_init(void) { __link_set_decl(prop_linkpools, struct prop_pool_init); struct prop_pool_init * const *pi; __link_set_foreach(pi, prop_linkpools) pool_init((*pi)->pp, (*pi)->size, 0, 0, 0, (*pi)->wchan, &pool_allocator_nointr, IPL_NONE); } static int _prop_object_copyin(const struct plistref *pref, const prop_type_t type, prop_object_t *objp, size_t lim) { prop_object_t obj = NULL; char *buf; int error; if (pref->pref_len >= lim) return E2BIG; /* * Allocate an extra byte so we can guarantee NUL-termination. */ buf = malloc(pref->pref_len + 1, M_TEMP, M_WAITOK); if (buf == NULL) return (ENOMEM); error = copyin(pref->pref_plist, buf, pref->pref_len); if (error) { free(buf, M_TEMP); return (error); } buf[pref->pref_len] = '\0'; switch (type) { case PROP_TYPE_ARRAY: obj = prop_array_internalize(buf); break; case PROP_TYPE_DICTIONARY: obj = prop_dictionary_internalize(buf); break; default: error = ENOTSUP; } free(buf, M_TEMP); if (obj == NULL) { if (error == 0) error = EIO; } else { *objp = obj; } return (error); } static int _prop_object_copyin_ioctl(const struct plistref *pref, const prop_type_t type, const u_long cmd, prop_object_t *objp, size_t lim) { if ((cmd & IOC_IN) == 0) return (EFAULT); return _prop_object_copyin(pref, type, objp, lim); } /* * prop_array_copyin -- * Copy in an array passed as a syscall arg. */ int prop_array_copyin_size(const struct plistref *pref, prop_array_t *arrayp, size_t lim) { return _prop_object_copyin(pref, PROP_TYPE_ARRAY, (prop_object_t *)arrayp, lim); } int prop_array_copyin(const struct plistref *pref, prop_array_t *arrayp) { return prop_array_copyin_size(pref, arrayp, prop_object_copyin_limit); } /* * prop_dictionary_copyin -- * Copy in a dictionary passed as a syscall arg. */ int prop_dictionary_copyin_size(const struct plistref *pref, prop_dictionary_t *dictp, size_t lim) { return _prop_object_copyin(pref, PROP_TYPE_DICTIONARY, (prop_object_t *)dictp, lim); } int prop_dictionary_copyin(const struct plistref *pref, prop_dictionary_t *dictp) { return prop_dictionary_copyin_size(pref, dictp, prop_object_copyin_limit); } /* * prop_array_copyin_ioctl -- * Copy in an array send with an ioctl. 
*/ int prop_array_copyin_ioctl_size(const struct plistref *pref, const u_long cmd, prop_array_t *arrayp, size_t lim) { return _prop_object_copyin_ioctl(pref, PROP_TYPE_ARRAY, cmd, (prop_object_t *)arrayp, lim); } int prop_array_copyin_ioctl(const struct plistref *pref, const u_long cmd, prop_array_t *arrayp) { return prop_array_copyin_ioctl_size(pref, cmd, arrayp, prop_object_copyin_limit); } /* * prop_dictionary_copyin_ioctl -- * Copy in a dictionary sent with an ioctl. */ int prop_dictionary_copyin_ioctl_size(const struct plistref *pref, const u_long cmd, prop_dictionary_t *dictp, size_t lim) { return _prop_object_copyin_ioctl(pref, PROP_TYPE_DICTIONARY, cmd, (prop_object_t *)dictp, lim); } int prop_dictionary_copyin_ioctl(const struct plistref *pref, const u_long cmd, prop_dictionary_t *dictp) { return prop_dictionary_copyin_ioctl_size(pref, cmd, dictp, prop_object_copyin_limit); } static int _prop_object_copyout(struct plistref *pref, prop_object_t obj) { struct lwp *l = curlwp; /* XXX */ struct proc *p = l->l_proc; char *buf; void *uaddr; size_t len, rlen; int error = 0; switch (prop_object_type(obj)) { case PROP_TYPE_ARRAY: buf = prop_array_externalize(obj); break; case PROP_TYPE_DICTIONARY: buf = prop_dictionary_externalize(obj); break; default: return (ENOTSUP); } if (buf == NULL) return (ENOMEM); len = strlen(buf) + 1; rlen = round_page(len); uaddr = NULL; error = uvm_mmap_anon(p, &uaddr, rlen); if (error == 0) { error = copyout(buf, uaddr, len); if (error == 0) { pref->pref_plist = uaddr; pref->pref_len = len; } } free(buf, M_TEMP); return (error); } /* * prop_array_copyout -- * Copy out an array to a syscall arg. */ int prop_array_copyout(struct plistref *pref, prop_array_t array) { return (_prop_object_copyout(pref, array)); } /* * prop_dictionary_copyout -- * Copy out a dictionary to a syscall arg. */ int prop_dictionary_copyout(struct plistref *pref, prop_dictionary_t dict) { return (_prop_object_copyout(pref, dict)); } static int _prop_object_copyout_ioctl(struct plistref *pref, const u_long cmd, prop_object_t obj) { if ((cmd & IOC_OUT) == 0) return (EFAULT); return _prop_object_copyout(pref, obj); } /* * prop_array_copyout_ioctl -- * Copy out an array being received with an ioctl. */ int prop_array_copyout_ioctl(struct plistref *pref, const u_long cmd, prop_array_t array) { return (_prop_object_copyout_ioctl(pref, cmd, array)); } /* * prop_dictionary_copyout_ioctl -- * Copy out a dictionary being received with an ioctl. */ int prop_dictionary_copyout_ioctl(struct plistref *pref, const u_long cmd, prop_dictionary_t dict) { return ( _prop_object_copyout_ioctl(pref, cmd, dict)); } #endif /* _KERNEL */ #endif /* __NetBSD__ */
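The userland half of the exchange implemented above looks like this: externalize a dictionary, hand it to a driver with prop_dictionary_send_ioctl(), and read one back with prop_dictionary_recv_ioctl(). The device path and the EXAMPLEIOC_* ioctl numbers are hypothetical; the proplib calls are the ones defined in this file and in proplib(3).

#include <sys/types.h>
#include <sys/ioctl.h>
#include <prop/proplib.h>
#include <err.h>
#include <fcntl.h>
#include <stdbool.h>

/* Hypothetical driver ioctls carrying a struct plistref. */
#define EXAMPLEIOC_SETPROPS _IOW('E', 0, struct plistref)
#define EXAMPLEIOC_GETPROPS _IOR('E', 1, struct plistref)

int
main(void)
{
        prop_dictionary_t dict, reply;
        int fd, error;

        fd = open("/dev/example0", O_RDWR);     /* hypothetical device */
        if (fd == -1)
                err(1, "open");

        dict = prop_dictionary_create();
        prop_dictionary_set_bool(dict, "enable", true);

        error = prop_dictionary_send_ioctl(dict, fd, EXAMPLEIOC_SETPROPS);
        if (error)
                errx(1, "send: %d", error);

        error = prop_dictionary_recv_ioctl(fd, EXAMPLEIOC_GETPROPS, &reply);
        if (error)
                errx(1, "recv: %d", error);

        prop_object_release(dict);
        prop_object_release(reply);
        return 0;
}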
/* $NetBSD: strncmp.c,v 1.3 2018/02/04 20:22:17 mrg Exp $ */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> #if defined(LIBC_SCCS) && !defined(lint) #if 0 static char sccsid[] = "@(#)strncmp.c 8.1 (Berkeley) 6/4/93"; #else __RCSID("$NetBSD: strncmp.c,v 1.3 2018/02/04 20:22:17 mrg Exp $"); #endif #endif /* LIBC_SCCS and not lint */ #if !defined(_KERNEL) && !defined(_STANDALONE) #include <assert.h> #include <string.h> #else #include <lib/libkern/libkern.h> #endif int strncmp(const char *s1, const char *s2, size_t n) { if (n == 0) return (0); do { if (*s1 != *s2++) return (*(const unsigned char *)s1 - *(const unsigned char *)--s2); if (*s1++ == 0) break; } while (--n != 0); return (0); }
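A few calls illustrating the contract the loop above implements: comparison stops at the first differing byte, at a NUL, or after n bytes, and the sign of the result is the difference of the first differing bytes taken as unsigned char.

#include <assert.h>
#include <string.h>

int
main(void)
{
        assert(strncmp("abc", "abd", 2) == 0);    /* difference lies beyond n */
        assert(strncmp("abc", "abd", 3) < 0);     /* 'c' < 'd' */
        assert(strncmp("abc", "abcdef", 3) == 0); /* common prefix, n limits it */
        assert(strncmp("ab", "abcdef", 6) < 0);   /* NUL in s1 sorts first */
        assert(strncmp("\x80", "\x01", 1) > 0);   /* compared as unsigned char */
        return 0;
}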
/* $NetBSD: in6.h,v 1.101 2021/07/31 10:12:04 andvar Exp $ */ /* $KAME: in6.h,v 1.83 2001/03/29 02:55:07 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.h 8.3 (Berkeley) 1/3/94 */ #ifndef _NETINET6_IN6_H_ #define _NETINET6_IN6_H_ #include <sys/featuretest.h> #ifndef __KAME_NETINET_IN_H_INCLUDED_ #error "do not include netinet6/in6.h directly, include netinet/in.h. see RFC2553" #endif #include <sys/socket.h> #include <sys/endian.h> /* ntohl */ /* * Identification of the network protocol stack * for *BSD-current/release: http://www.kame.net/dev/cvsweb.cgi/kame/COVERAGE * has the table of implementation/integration differences. */ #define __KAME__ #define __KAME_VERSION "NetBSD-current" /* * Local port number conventions: * * Ports < IPPORT_RESERVED are reserved for privileged processes (e.g. root), * unless a kernel is compiled with IPNOPRIVPORTS defined. * * When a user does a bind(2) or connect(2) with a port number of zero, * a non-conflicting local port address is chosen. * * The default range is IPPORT_ANONMIN to IPPORT_ANONMAX, although * that is settable by sysctl(3); net.inet.ip.anonportmin and * net.inet.ip.anonportmax respectively. 
* * A user may set the IPPROTO_IP option IP_PORTRANGE to change this * default assignment range. * * The value IP_PORTRANGE_DEFAULT causes the default behavior. * * The value IP_PORTRANGE_HIGH is the same as IP_PORTRANGE_DEFAULT, * and exists only for FreeBSD compatibility purposes. * * The value IP_PORTRANGE_LOW changes the range to the "low" are * that is (by convention) restricted to privileged processes. * This convention is based on "vouchsafe" principles only. * It is only secure if you trust the remote host to restrict these ports. * The range is IPPORT_RESERVEDMIN to IPPORT_RESERVEDMAX. */ #if defined(_NETBSD_SOURCE) #define IPV6PORT_RESERVED 1024 #define IPV6PORT_ANONMIN 49152 #define IPV6PORT_ANONMAX 65535 #define IPV6PORT_RESERVEDMIN 600 #define IPV6PORT_RESERVEDMAX (IPV6PORT_RESERVED-1) #endif /* * IPv6 address */ struct in6_addr { union { __uint8_t __u6_addr8[16]; __uint16_t __u6_addr16[8]; uint32_t __u6_addr32[4]; } __u6_addr; /* 128-bit IP6 address */ }; #define s6_addr __u6_addr.__u6_addr8 #ifdef _KERNEL /* XXX nonstandard */ #define s6_addr8 __u6_addr.__u6_addr8 #define s6_addr16 __u6_addr.__u6_addr16 #define s6_addr32 __u6_addr.__u6_addr32 #endif #define INET6_ADDRSTRLEN 46 /* * Socket address for IPv6 */ #if defined(_NETBSD_SOURCE) #define SIN6_LEN #endif struct sockaddr_in6 { uint8_t sin6_len; /* length of this struct(socklen_t)*/ sa_family_t sin6_family; /* AF_INET6 (sa_family_t) */ in_port_t sin6_port; /* Transport layer port */ uint32_t sin6_flowinfo; /* IP6 flow information */ struct in6_addr sin6_addr; /* IP6 address */ uint32_t sin6_scope_id; /* scope zone index */ }; /* * Local definition for masks */ #ifdef _KERNEL /* XXX nonstandard */ #define IN6MASK0 {{{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }}} #define IN6MASK32 {{{ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}} #define IN6MASK64 {{{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}} #define IN6MASK96 {{{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }}} #define IN6MASK128 {{{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }}} #endif #ifdef _KERNEL extern const struct sockaddr_in6 sa6_any; extern const struct in6_addr in6mask0; extern const struct in6_addr in6mask32; extern const struct in6_addr in6mask64; extern const struct in6_addr in6mask96; extern const struct in6_addr in6mask128; #endif /* _KERNEL */ /* * Macros started with IPV6_ADDR is KAME local */ #ifdef _KERNEL /* XXX nonstandard */ #if BYTE_ORDER == BIG_ENDIAN #define IPV6_ADDR_INT32_ONE 1 #define IPV6_ADDR_INT32_TWO 2 #define IPV6_ADDR_INT32_MNL 0xff010000 #define IPV6_ADDR_INT32_MLL 0xff020000 #define IPV6_ADDR_INT32_SMP 0x0000ffff #define IPV6_ADDR_INT16_ULL 0xfe80 #define IPV6_ADDR_INT16_USL 0xfec0 #define IPV6_ADDR_INT16_MLL 0xff02 #elif BYTE_ORDER == LITTLE_ENDIAN #define IPV6_ADDR_INT32_ONE 0x01000000 #define IPV6_ADDR_INT32_TWO 0x02000000 #define IPV6_ADDR_INT32_MNL 0x000001ff #define IPV6_ADDR_INT32_MLL 0x000002ff #define IPV6_ADDR_INT32_SMP 0xffff0000 #define IPV6_ADDR_INT16_ULL 0x80fe #define IPV6_ADDR_INT16_USL 0xc0fe #define IPV6_ADDR_INT16_MLL 0x02ff #endif #endif /* * Definition of some useful macros to handle IP6 addresses */ #define IN6ADDR_ANY_INIT \ {{{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}} #define IN6ADDR_LOOPBACK_INIT \ {{{ 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}} #define IN6ADDR_NODELOCAL_ALLNODES_INIT \ {{{ 0xff, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}} #define IN6ADDR_LINKLOCAL_ALLNODES_INIT \ {{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}} #define IN6ADDR_LINKLOCAL_ALLROUTERS_INIT \ {{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 }}} extern const struct in6_addr in6addr_any; extern const struct in6_addr in6addr_loopback; extern const struct in6_addr in6addr_nodelocal_allnodes; extern const struct in6_addr in6addr_linklocal_allnodes; extern const struct in6_addr in6addr_linklocal_allrouters; #define IN6_ARE_ADDR_EQUAL(a, b) \ (memcmp(&(a)->s6_addr[0], &(b)->s6_addr[0], sizeof(struct in6_addr)) == 0) /* * Unspecified */ #define IN6_IS_ADDR_UNSPECIFIED(a) \ ((a)->__u6_addr.__u6_addr32[0] == 0 && \ (a)->__u6_addr.__u6_addr32[1] == 0 && \ (a)->__u6_addr.__u6_addr32[2] == 0 && \ (a)->__u6_addr.__u6_addr32[3] == 0) /* * Loopback */ #define IN6_IS_ADDR_LOOPBACK(a) \ ((a)->__u6_addr.__u6_addr32[0] == 0 && \ (a)->__u6_addr.__u6_addr32[1] == 0 && \ (a)->__u6_addr.__u6_addr32[2] == 0 && \ (a)->__u6_addr.__u6_addr32[3] == ntohl(1)) /* * IPv4 compatible */ #define IN6_IS_ADDR_V4COMPAT(a) \ ((a)->__u6_addr.__u6_addr32[0] == 0 && \ (a)->__u6_addr.__u6_addr32[1] == 0 && \ (a)->__u6_addr.__u6_addr32[2] == 0 && \ (a)->__u6_addr.__u6_addr32[3] != 0 && \ (a)->__u6_addr.__u6_addr32[3] != ntohl(1)) /* * Mapped */ #define IN6_IS_ADDR_V4MAPPED(a) \ ((a)->__u6_addr.__u6_addr32[0] == 0 && \ (a)->__u6_addr.__u6_addr32[1] == 0 && \ (a)->__u6_addr.__u6_addr32[2] == ntohl(0x0000ffff)) /* * KAME Scope Values */ #ifdef _KERNEL /* XXX nonstandard */ #define IPV6_ADDR_SCOPE_NODELOCAL 0x01 #define IPV6_ADDR_SCOPE_INTFACELOCAL 0x01 #define IPV6_ADDR_SCOPE_LINKLOCAL 0x02 #define IPV6_ADDR_SCOPE_SITELOCAL 0x05 #define IPV6_ADDR_SCOPE_ORGLOCAL 0x08 /* just used in this file */ #define IPV6_ADDR_SCOPE_GLOBAL 0x0e #else #define __IPV6_ADDR_SCOPE_NODELOCAL 0x01 #define __IPV6_ADDR_SCOPE_LINKLOCAL 0x02 #define __IPV6_ADDR_SCOPE_SITELOCAL 0x05 #define __IPV6_ADDR_SCOPE_ORGLOCAL 0x08 /* just used in this file */ #define __IPV6_ADDR_SCOPE_GLOBAL 0x0e #endif /* * Unicast Scope * Note that we must check topmost 10 bits only, not 16 bits (see RFC2373). 
*/ #define IN6_IS_ADDR_LINKLOCAL(a) \ (((a)->s6_addr[0] == 0xfe) && (((a)->s6_addr[1] & 0xc0) == 0x80)) #define IN6_IS_ADDR_SITELOCAL(a) \ (((a)->s6_addr[0] == 0xfe) && (((a)->s6_addr[1] & 0xc0) == 0xc0)) /* * Multicast */ #define IN6_IS_ADDR_MULTICAST(a) ((a)->s6_addr[0] == 0xff) #ifdef _KERNEL /* XXX nonstandard */ #define IPV6_ADDR_MC_SCOPE(a) ((a)->s6_addr[1] & 0x0f) #else #define __IPV6_ADDR_MC_SCOPE(a) ((a)->s6_addr[1] & 0x0f) #endif /* * Multicast Scope */ #ifdef _KERNEL /* refers nonstandard items */ #define IN6_IS_ADDR_MC_NODELOCAL(a) \ (IN6_IS_ADDR_MULTICAST(a) && \ (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_NODELOCAL)) #define IN6_IS_ADDR_MC_INTFACELOCAL(a) \ (IN6_IS_ADDR_MULTICAST(a) && \ (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_INTFACELOCAL)) #define IN6_IS_ADDR_MC_LINKLOCAL(a) \ (IN6_IS_ADDR_MULTICAST(a) && \ (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_LINKLOCAL)) #define IN6_IS_ADDR_MC_SITELOCAL(a) \ (IN6_IS_ADDR_MULTICAST(a) && \ (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_SITELOCAL)) #define IN6_IS_ADDR_MC_ORGLOCAL(a) \ (IN6_IS_ADDR_MULTICAST(a) && \ (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_ORGLOCAL)) #define IN6_IS_ADDR_MC_GLOBAL(a) \ (IN6_IS_ADDR_MULTICAST(a) && \ (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_GLOBAL)) #else #define IN6_IS_ADDR_MC_NODELOCAL(a) \ (IN6_IS_ADDR_MULTICAST(a) && \ (__IPV6_ADDR_MC_SCOPE(a) == __IPV6_ADDR_SCOPE_NODELOCAL)) #define IN6_IS_ADDR_MC_LINKLOCAL(a) \ (IN6_IS_ADDR_MULTICAST(a) && \ (__IPV6_ADDR_MC_SCOPE(a) == __IPV6_ADDR_SCOPE_LINKLOCAL)) #define IN6_IS_ADDR_MC_SITELOCAL(a) \ (IN6_IS_ADDR_MULTICAST(a) && \ (__IPV6_ADDR_MC_SCOPE(a) == __IPV6_ADDR_SCOPE_SITELOCAL)) #define IN6_IS_ADDR_MC_ORGLOCAL(a) \ (IN6_IS_ADDR_MULTICAST(a) && \ (__IPV6_ADDR_MC_SCOPE(a) == __IPV6_ADDR_SCOPE_ORGLOCAL)) #define IN6_IS_ADDR_MC_GLOBAL(a) \ (IN6_IS_ADDR_MULTICAST(a) && \ (__IPV6_ADDR_MC_SCOPE(a) == __IPV6_ADDR_SCOPE_GLOBAL)) #endif #ifdef _KERNEL /* nonstandard */ /* * KAME Scope */ #define IN6_IS_SCOPE_LINKLOCAL(a) \ ((IN6_IS_ADDR_LINKLOCAL(a)) || \ (IN6_IS_ADDR_MC_LINKLOCAL(a))) #define IN6_IS_SCOPE_EMBEDDABLE(__a) \ (IN6_IS_SCOPE_LINKLOCAL(__a) || IN6_IS_ADDR_MC_INTFACELOCAL(__a)) #define IFA6_IS_DEPRECATED(a) \ ((a)->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME && \ (u_int32_t)((time_uptime - (a)->ia6_updatetime)) > \ (a)->ia6_lifetime.ia6t_pltime) #define IFA6_IS_INVALID(a) \ ((a)->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME && \ (u_int32_t)((time_uptime - (a)->ia6_updatetime)) > \ (a)->ia6_lifetime.ia6t_vltime) #endif /* * Options for use with [gs]etsockopt at the IPV6 level. * First word of comment is data type; bool is stored in int. */ /* no hdrincl */ #if 0 /* These are deprecated non-standard options which are no longer supported. 
*/ #define IPV6_OPTIONS 1 /* buf/ip6_opts; set/get IP6 options */ #define IPV6_RECVOPTS 5 /* bool; receive all IP6 opts w/dgram */ #define IPV6_RECVRETOPTS 6 /* bool; receive IP6 opts for response */ #define IPV6_RECVDSTADDR 7 /* bool; receive IP6 dst addr w/dgram */ #define IPV6_RETOPTS 8 /* ip6_opts; set/get IP6 options */ #endif #define IPV6_SOCKOPT_RESERVED1 3 /* reserved for future use */ #define IPV6_UNICAST_HOPS 4 /* int; IP6 hops */ #define IPV6_MULTICAST_IF 9 /* u_int; set/get IP6 multicast i/f */ #define IPV6_MULTICAST_HOPS 10 /* int; set/get IP6 multicast hops */ #define IPV6_MULTICAST_LOOP 11 /* u_int; set/get IP6 multicast loopback */ /* The join and leave membership option numbers need to match with the v4 ones */ #define IPV6_JOIN_GROUP 12 /* ip6_mreq; join a group membership */ #define IPV6_LEAVE_GROUP 13 /* ip6_mreq; leave a group membership */ #define IPV6_PORTRANGE 14 /* int; range to choose for unspec port */ #if defined(_NETBSD_SOURCE) #define IPV6_PORTALGO 17 /* int; port selection algo (rfc6056) */ #define ICMP6_FILTER 18 /* icmp6_filter; icmp6 filter */ #endif /* RFC2292 options */ #ifdef _KERNEL #define IPV6_2292PKTINFO 19 /* bool; send/recv if, src/dst addr */ #define IPV6_2292HOPLIMIT 20 /* bool; hop limit */ #define IPV6_2292NEXTHOP 21 /* bool; next hop addr */ #define IPV6_2292HOPOPTS 22 /* bool; hop-by-hop option */ #define IPV6_2292DSTOPTS 23 /* bool; destination option */ #define IPV6_2292RTHDR 24 /* bool; routing header */ #define IPV6_2292PKTOPTIONS 25 /* buf/cmsghdr; set/get IPv6 options */ #endif #define IPV6_CHECKSUM 26 /* int; checksum offset for raw socket */ #define IPV6_V6ONLY 27 /* bool; make AF_INET6 sockets v6 only */ #define IPV6_IPSEC_POLICY 28 /* struct; get/set security policy */ #define IPV6_FAITH 29 /* bool; accept FAITH'ed connections */ /* new socket options introduced in RFC3542 */ #define IPV6_RTHDRDSTOPTS 35 /* ip6_dest; send dst option before rthdr */ #define IPV6_RECVPKTINFO 36 /* bool; recv if, dst addr */ #define IPV6_RECVHOPLIMIT 37 /* bool; recv hop limit */ #define IPV6_RECVRTHDR 38 /* bool; recv routing header */ #define IPV6_RECVHOPOPTS 39 /* bool; recv hop-by-hop option */ #define IPV6_RECVDSTOPTS 40 /* bool; recv dst option after rthdr */ #ifdef _KERNEL #define IPV6_RECVRTHDRDSTOPTS 41 /* bool; recv dst option before rthdr */ #endif #define IPV6_USE_MIN_MTU 42 /* bool; send packets at the minimum MTU */ #define IPV6_RECVPATHMTU 43 /* bool; notify an according MTU */ #define IPV6_PATHMTU 44 /* mtuinfo; get the current path MTU (sopt), 4 bytes int; MTU notification (cmsg) */ /* more new socket options introduced in RFC3542 */ #define IPV6_PKTINFO 46 /* in6_pktinfo; send if, src addr */ #define IPV6_HOPLIMIT 47 /* int; send hop limit */ #define IPV6_NEXTHOP 48 /* sockaddr; next hop addr */ #define IPV6_HOPOPTS 49 /* ip6_hbh; send hop-by-hop option */ #define IPV6_DSTOPTS 50 /* ip6_dest; send dst option before rthdr */ #define IPV6_RTHDR 51 /* ip6_rthdr; send routing header */ #define IPV6_RECVTCLASS 57 /* bool; recv traffic class values */ #ifdef _KERNEL #define IPV6_OTCLASS 58 /* u_int8_t; send traffic class value */ #endif #define IPV6_TCLASS 61 /* int; send traffic class value */ #define IPV6_DONTFRAG 62 /* bool; disable IPv6 fragmentation */ #define IPV6_PREFER_TEMPADDR 63 /* int; prefer temporary address as * the source address */ #define IPV6_BINDANY 64 /* bool: allow bind to any address */ /* to define items, should talk with KAME guys first, for *BSD compatibility */ #define IPV6_RTHDR_LOOSE 0 /* this hop need not be a 
neighbor. XXX old spec */ #define IPV6_RTHDR_STRICT 1 /* this hop must be a neighbor. XXX old spec */ #define IPV6_RTHDR_TYPE_0 0 /* IPv6 routing header type 0 */ /* * Defaults and limits for options */ #define IPV6_DEFAULT_MULTICAST_HOPS 1 /* normally limit m'casts to 1 hop */ #define IPV6_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */ /* * Argument structure for IPV6_JOIN_GROUP and IPV6_LEAVE_GROUP. */ struct ipv6_mreq { struct in6_addr ipv6mr_multiaddr; unsigned int ipv6mr_interface; }; /* * IPV6_PKTINFO: Packet information(RFC2292 sec 5) */ struct in6_pktinfo { struct in6_addr ipi6_addr; /* src/dst IPv6 address */ unsigned int ipi6_ifindex; /* send/recv interface index */ }; /* * Control structure for IPV6_RECVPATHMTU socket option. */ struct ip6_mtuinfo { struct sockaddr_in6 ip6m_addr; /* or sockaddr_storage? */ uint32_t ip6m_mtu; }; /* * Argument for IPV6_PORTRANGE: * - which range to search when port is unspecified at bind() or connect() */ #define IPV6_PORTRANGE_DEFAULT 0 /* default range */ #define IPV6_PORTRANGE_HIGH 1 /* "high" - request firewall bypass */ #define IPV6_PORTRANGE_LOW 2 /* "low" - vouchsafe security */ #if defined(_NETBSD_SOURCE) /* * Definitions for inet6 sysctl operations. * * Third level is protocol number. * Fourth level is desired variable within that protocol. */ /* * Names for IP sysctl objects */ #define IPV6CTL_FORWARDING 1 /* act as router */ #define IPV6CTL_SENDREDIRECTS 2 /* may send redirects when forwarding*/ #define IPV6CTL_DEFHLIM 3 /* default Hop-Limit */ /* IPV6CTL_DEFMTU=4, never implemented */ #define IPV6CTL_FORWSRCRT 5 /* forward source-routed dgrams */ #define IPV6CTL_STATS 6 /* stats */ #define IPV6CTL_MRTSTATS 7 /* multicast forwarding stats */ #define IPV6CTL_MRTPROTO 8 /* multicast routing protocol */ #define IPV6CTL_MAXFRAGPACKETS 9 /* max packets reassembly queue */ #define IPV6CTL_SOURCECHECK 10 /* verify source route and intf */ #define IPV6CTL_SOURCECHECK_LOGINT 11 /* minimum logging interval */ /* 12 was IPV6CTL_ACCEPT_RTADV */ #define IPV6CTL_KEEPFAITH 13 #define IPV6CTL_LOG_INTERVAL 14 #define IPV6CTL_HDRNESTLIMIT 15 #define IPV6CTL_DAD_COUNT 16 #define IPV6CTL_AUTO_FLOWLABEL 17 #define IPV6CTL_DEFMCASTHLIM 18 #define IPV6CTL_GIF_HLIM 19 /* default HLIM for gif encap packet */ #define IPV6CTL_KAME_VERSION 20 #define IPV6CTL_USE_DEPRECATED 21 /* use deprecated addr (RFC2462 5.5.4) */ /* 22 was IPV6CTL_RR_PRUNE */ /* 23: reserved */ #define IPV6CTL_V6ONLY 24 /* 25 to 27: reserved */ #define IPV6CTL_ANONPORTMIN 28 /* minimum ephemeral port */ #define IPV6CTL_ANONPORTMAX 29 /* maximum ephemeral port */ #define IPV6CTL_LOWPORTMIN 30 /* minimum reserved port */ #define IPV6CTL_LOWPORTMAX 31 /* maximum reserved port */ /* 32 to 34: reserved */ #define IPV6CTL_AUTO_LINKLOCAL 35 /* automatic link-local addr assign */ /* 36 to 37: reserved */ #define IPV6CTL_ADDRCTLPOLICY 38 /* get/set address selection policy */ #define IPV6CTL_USE_DEFAULTZONE 39 /* use default scope zone */ /* 40: reserved */ #define IPV6CTL_MAXFRAGS 41 /* max fragments */ #define IPV6CTL_IFQ 42 /* IPv6 packet input queue */ /* 43 was IPV6CTL_RTADV_MAXROUTES */ /* 44 was IPV6CTL_RTADV_NUMROUTES */ #define IPV6CTL_GIF_PMTU 45 /* gif(4) Path MTU setting */ #define IPV6CTL_IPSEC_HLIM 46 /* default HLIM for ipsecif encap packet */ #define IPV6CTL_IPSEC_PMTU 47 /* ipsecif(4) Path MTU setting */ #endif /* _NETBSD_SOURCE */ #ifdef _KERNEL struct cmsghdr; /* * in6_cksum_phdr: * * Compute significant parts of the IPv6 checksum pseudo-header * for use in a delayed 
TCP/UDP checksum calculation. * * Args: * * src Source IPv6 address * dst Destination IPv6 address * len htonl(proto-hdr-len) * nxt htonl(next-proto-number) * * NOTE: We expect the src and dst addresses to be 16-bit * aligned! */ static __inline u_int16_t __unused in6_cksum_phdr(const struct in6_addr *src, const struct in6_addr *dst, u_int32_t len, u_int32_t nxt) { u_int32_t sum = 0; const u_int16_t *w; /*LINTED*/ w = (const u_int16_t *) src; sum += w[0]; if (!IN6_IS_SCOPE_LINKLOCAL(src)) sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; /*LINTED*/ w = (const u_int16_t *) dst; sum += w[0]; if (!IN6_IS_SCOPE_LINKLOCAL(dst)) sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; sum += (u_int16_t)(len >> 16) + (u_int16_t)(len /*& 0xffff*/); sum += (u_int16_t)(nxt >> 16) + (u_int16_t)(nxt /*& 0xffff*/); sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/); if (sum > 0xffff) sum -= 0xffff; return (sum); } struct mbuf; struct ifnet; int sockaddr_in6_cmp(const struct sockaddr *, const struct sockaddr *); struct sockaddr *sockaddr_in6_externalize(struct sockaddr *, socklen_t, const struct sockaddr *); int in6_cksum(struct mbuf *, u_int8_t, u_int32_t, u_int32_t); int in6_localaddr(const struct in6_addr *); int in6_addrscope(const struct in6_addr *); struct in6_ifaddr *in6_ifawithifp(struct ifnet *, struct in6_addr *); extern void in6_if_link_up(struct ifnet *); extern void in6_if_link_down(struct ifnet *); extern void in6_if_link_state_change(struct ifnet *, int); extern void in6_if_up(struct ifnet *); extern void in6_if_down(struct ifnet *); extern void addrsel_policy_init(void); extern u_char ip6_protox[]; struct ip6_hdr; int in6_tunnel_validate(const struct ip6_hdr *, const struct in6_addr *, const struct in6_addr *); #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define satocsin6(sa) ((const struct sockaddr_in6 *)(sa)) #define sin6tosa(sin6) ((struct sockaddr *)(sin6)) #define sin6tocsa(sin6) ((const struct sockaddr *)(sin6)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) static __inline void sockaddr_in6_init1(struct sockaddr_in6 *sin6, const struct in6_addr *addr, in_port_t port, uint32_t flowinfo, uint32_t scope_id) { sin6->sin6_port = port; sin6->sin6_flowinfo = flowinfo; sin6->sin6_addr = *addr; sin6->sin6_scope_id = scope_id; } static __inline void sockaddr_in6_init(struct sockaddr_in6 *sin6, const struct in6_addr *addr, in_port_t port, uint32_t flowinfo, uint32_t scope_id) { sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sockaddr_in6_init1(sin6, addr, port, flowinfo, scope_id); } static __inline struct sockaddr * sockaddr_in6_alloc(const struct in6_addr *addr, in_port_t port, uint32_t flowinfo, uint32_t scope_id, int flags) { struct sockaddr *sa; if ((sa = sockaddr_alloc(AF_INET6, sizeof(struct sockaddr_in6), flags)) == NULL) return NULL; sockaddr_in6_init1(satosin6(sa), addr, port, flowinfo, scope_id); return sa; } #endif /* _KERNEL */ #if defined(_NETBSD_SOURCE) #include <machine/ansi.h> #ifdef _BSD_SIZE_T_ typedef _BSD_SIZE_T_ size_t; #define _SIZE_T #undef _BSD_SIZE_T_ #endif #include <sys/cdefs.h> __BEGIN_DECLS struct cmsghdr; void in6_in_2_v4mapin6(const struct in_addr *, struct in6_addr *); void in6_sin6_2_sin(struct sockaddr_in *, struct sockaddr_in6 *); void in6_sin_2_v4mapsin6(const struct sockaddr_in *, struct sockaddr_in6 *); void in6_sin6_2_sin_in_sock(struct sockaddr *); void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **); #define INET6_IS_ADDR_LINKLOCAL 1 #define 
INET6_IS_ADDR_MC_LINKLOCAL 2 #define INET6_IS_ADDR_SITELOCAL 4 void inet6_getscopeid(struct sockaddr_in6 *, int); void inet6_putscopeid(struct sockaddr_in6 *, int); extern int inet6_option_space(int); extern int inet6_option_init(void *, struct cmsghdr **, int); extern int inet6_option_append(struct cmsghdr *, const uint8_t *, int, int); extern uint8_t *inet6_option_alloc(struct cmsghdr *, int, int, int); extern int inet6_option_next(const struct cmsghdr *, uint8_t **); extern int inet6_option_find(const struct cmsghdr *, uint8_t **, int); extern size_t inet6_rthdr_space(int, int); extern struct cmsghdr *inet6_rthdr_init(void *, int); extern int inet6_rthdr_add(struct cmsghdr *, const struct in6_addr *, unsigned int); extern int inet6_rthdr_lasthop(struct cmsghdr *, unsigned int); #if 0 /* not implemented yet */ extern int inet6_rthdr_reverse(const struct cmsghdr *, struct cmsghdr *); #endif extern int inet6_rthdr_segments(const struct cmsghdr *); extern struct in6_addr *inet6_rthdr_getaddr(struct cmsghdr *, int); extern int inet6_rthdr_getflags(const struct cmsghdr *, int); extern int inet6_opt_init(void *, socklen_t); extern int inet6_opt_append(void *, socklen_t, int, uint8_t, socklen_t, uint8_t, void **); extern int inet6_opt_finish(void *, socklen_t, int); extern int inet6_opt_set_val(void *, int, void *, socklen_t); extern int inet6_opt_next(void *, socklen_t, int, uint8_t *, socklen_t *, void **); extern int inet6_opt_find(void *, socklen_t, int, uint8_t, socklen_t *, void **); extern int inet6_opt_get_val(void *, int, void *, socklen_t); extern socklen_t inet6_rth_space(int, int); extern void *inet6_rth_init(void *, socklen_t, int, int); extern int inet6_rth_add(void *, const struct in6_addr *); extern int inet6_rth_reverse(const void *, void *); extern int inet6_rth_segments(const void *); extern struct in6_addr *inet6_rth_getaddr(const void *, int); __END_DECLS #endif /* _NETBSD_SOURCE */ #if defined(_KERNEL) || defined(_TEST) int in6_print(char *, size_t, const struct in6_addr *); #define IN6_PRINT(b, a) (in6_print((b), sizeof(b), (a)), (b)) int sin6_print(char *, size_t, const void *); #endif #endif /* !_NETINET6_IN6_H_ */
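/*
 * Illustrative userland sketch (not part of the in6.h above): a few of
 * the interfaces declared in this header in use -- parsing text
 * addresses into struct in6_addr with inet_pton(3), classifying them
 * with the IN6_IS_ADDR_*() macros, filling in a struct sockaddr_in6 by
 * hand, and letting bind(2) with port 0 pick a port from the anonymous
 * range, which getsockname(2) then reports back.  Error handling is
 * abbreviated; this is only a sketch of the API shape.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void
classify(const char *text)
{
	struct in6_addr a;
	char buf[INET6_ADDRSTRLEN];

	if (inet_pton(AF_INET6, text, &a) != 1)
		return;
	printf("%s:%s%s%s%s\n", inet_ntop(AF_INET6, &a, buf, sizeof(buf)),
	    IN6_IS_ADDR_UNSPECIFIED(&a) ? " unspecified" : "",
	    IN6_IS_ADDR_LOOPBACK(&a) ? " loopback" : "",
	    IN6_IS_ADDR_V4MAPPED(&a) ? " v4-mapped" : "",
	    IN6_IS_ADDR_LINKLOCAL(&a) ? " link-local" : "");
}

static void
bind_anon_port(void)
{
	struct sockaddr_in6 sin6;
	socklen_t len = sizeof(sin6);
	int s = socket(AF_INET6, SOCK_DGRAM, 0);
	int on = 1;

	if (s < 0)
		return;
	(void)setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on));

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_len = sizeof(sin6);	/* BSD sockaddrs carry their length */
	sin6.sin6_family = AF_INET6;
	sin6.sin6_addr = in6addr_any;	/* :: */
	sin6.sin6_port = htons(0);	/* 0: kernel picks an anonymous port */

	if (bind(s, (struct sockaddr *)&sin6, sizeof(sin6)) == 0 &&
	    getsockname(s, (struct sockaddr *)&sin6, &len) == 0)
		printf("bound to port %u\n", ntohs(sin6.sin6_port));
	close(s);
}

int
main(void)
{
	classify("::1");
	classify("fe80::1");
	classify("::ffff:192.0.2.1");
	bind_anon_port();
	return 0;
}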
/* $NetBSD: sys_ptrace_common.c,v 
1.92 2021/08/09 20:49:10 andvar Exp $ */ /*- * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93 */ /*- * Copyright (c) 1993 Jan-Simon Pendry. 
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_ptrace_common.c,v 1.92 2021/08/09 20:49:10 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_ptrace.h" #include "opt_ktrace.h" #include "opt_pax.h" #include "opt_compat_netbsd32.h" #endif #if defined(__HAVE_COMPAT_NETBSD32) && !defined(COMPAT_NETBSD32) \ && !defined(_RUMPKERNEL) #define COMPAT_NETBSD32 #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/errno.h> #include <sys/exec.h> #include <sys/pax.h> #include <sys/ptrace.h> #include <sys/uio.h> #include <sys/ras.h> #include <sys/kmem.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <sys/module.h> #include <sys/condvar.h> #include <sys/mutex.h> #include <sys/compat_stub.h> #include <uvm/uvm_extern.h> #include <machine/reg.h> # ifdef PTRACE_DEBUG # define DPRINTF(a) uprintf a # else # define DPRINTF(a) # endif static kauth_listener_t ptrace_listener; static int process_auxv_offset(struct proc *, struct uio *); extern int user_va0_disable; #if 0 static int ptrace_cbref; static kmutex_t ptrace_mtx; static kcondvar_t ptrace_cv; #endif #ifdef PT_GETREGS # define case_PT_GETREGS case PT_GETREGS: #else # define case_PT_GETREGS #endif #ifdef PT_SETREGS # define case_PT_SETREGS case PT_SETREGS: #else # define case_PT_SETREGS #endif #ifdef PT_GETFPREGS # define case_PT_GETFPREGS case PT_GETFPREGS: #else # define case_PT_GETFPREGS #endif #ifdef PT_SETFPREGS # define case_PT_SETFPREGS case PT_SETFPREGS: #else # define case_PT_SETFPREGS #endif #ifdef PT_GETDBREGS # define case_PT_GETDBREGS case PT_GETDBREGS: #else # define case_PT_GETDBREGS #endif #ifdef PT_SETDBREGS # define case_PT_SETDBREGS case PT_SETDBREGS: #else # define case_PT_SETDBREGS #endif static int ptrace_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { struct proc *p; int result; #ifdef PT_SETDBREGS extern int user_set_dbregs; #endif result = KAUTH_RESULT_DEFER; p = arg0; #if 0 mutex_enter(&ptrace_mtx); ptrace_cbref++; mutex_exit(&ptrace_mtx); #endif if (action != KAUTH_PROCESS_PTRACE) goto out; switch ((u_long)arg1) { #ifdef PT_SETDBREGS case_PT_SETDBREGS if (kauth_cred_getuid(cred) != 0 && user_set_dbregs == 0) { result = KAUTH_RESULT_DENY; break; } #endif /* FALLTHROUGH */ case PT_TRACE_ME: case PT_ATTACH: case PT_WRITE_I: case PT_WRITE_D: case PT_READ_I: case PT_READ_D: case PT_IO: case_PT_GETREGS case_PT_SETREGS case_PT_GETFPREGS case_PT_SETFPREGS case_PT_GETDBREGS case PT_SET_EVENT_MASK: case PT_GET_EVENT_MASK: case PT_GET_PROCESS_STATE: case PT_SET_SIGINFO: case PT_GET_SIGINFO: #ifdef __HAVE_PTRACE_MACHDEP PTRACE_MACHDEP_REQUEST_CASES #endif if (kauth_cred_getuid(cred) != kauth_cred_getuid(p->p_cred) || ISSET(p->p_flag, PK_SUGID)) { break; } result = KAUTH_RESULT_ALLOW; break; #ifdef PT_STEP case PT_STEP: case PT_SETSTEP: case PT_CLEARSTEP: #endif case PT_CONTINUE: case PT_KILL: case PT_DETACH: case PT_LWPINFO: case PT_SYSCALL: case PT_SYSCALLEMU: case PT_DUMPCORE: case PT_RESUME: case PT_SUSPEND: case PT_STOP: case PT_LWPSTATUS: case PT_LWPNEXT: case PT_SET_SIGPASS: case PT_GET_SIGPASS: result = KAUTH_RESULT_ALLOW; break; default: break; } out: #if 0 mutex_enter(&ptrace_mtx); if (--ptrace_cbref == 0) cv_broadcast(&ptrace_cv); mutex_exit(&ptrace_mtx); #endif return result; } static struct proc * ptrace_find(struct lwp *l, int req, pid_t pid) { struct proc *t; /* "A foolish consistency..." 
XXX */ if (req == PT_TRACE_ME) { t = l->l_proc; mutex_enter(t->p_lock); return t; } /* Find the process we're supposed to be operating on. */ t = proc_find(pid); if (t == NULL) return NULL; /* XXX-elad */ mutex_enter(t->p_lock); int error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, t, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL); if (error) { mutex_exit(t->p_lock); return NULL; } return t; } static int ptrace_allowed(struct lwp *l, int req, struct proc *t, struct proc *p, bool *locked) { *locked = false; /* * Grab a reference on the process to prevent it from execing or * exiting. */ if (!rw_tryenter(&t->p_reflock, RW_READER)) return EBUSY; *locked = true; /* Make sure we can operate on it. */ switch (req) { case PT_TRACE_ME: /* * You can't say to the parent of a process to start tracing if: * (1) the parent is initproc, */ if (p->p_pptr == initproc) return EPERM; /* * (2) the process is initproc, or */ if (p == initproc) return EPERM; /* * (3) the child is already traced. */ if (ISSET(p->p_slflag, PSL_TRACED)) return EBUSY; return 0; case PT_ATTACH: /* * You can't attach to a process if: * (1) it's the process that's doing the attaching, */ if (t == p) return EINVAL; /* * (2) it's a system process, */ if (t->p_flag & PK_SYSTEM) return EPERM; /* * (3) the tracer is initproc, */ if (p == initproc) return EPERM; /* * (4) it's already being traced, */ if (ISSET(t->p_slflag, PSL_TRACED)) return EBUSY; /* * (5) it's a vfork(2)ed parent of the current process, or */ if (ISSET(p->p_lflag, PL_PPWAIT) && p->p_pptr == t) return EPERM; /* * (6) the tracer is chrooted, and its root directory is * not at or above the root directory of the tracee */ mutex_exit(t->p_lock); /* XXXSMP */ int tmp = proc_isunder(t, l); mutex_enter(t->p_lock); /* XXXSMP */ if (!tmp) return EPERM; return 0; case PT_READ_I: case PT_READ_D: case PT_WRITE_I: case PT_WRITE_D: case PT_IO: case PT_SET_SIGINFO: case PT_GET_SIGINFO: case_PT_GETREGS case_PT_SETREGS case_PT_GETFPREGS case_PT_SETFPREGS case_PT_GETDBREGS case_PT_SETDBREGS #ifdef __HAVE_PTRACE_MACHDEP PTRACE_MACHDEP_REQUEST_CASES #endif /* * You can't read/write the memory or registers of a process * if the tracer is chrooted, and its root directory is not at * or above the root directory of the tracee. */ mutex_exit(t->p_lock); /* XXXSMP */ tmp = proc_isunder(t, l); mutex_enter(t->p_lock); /* XXXSMP */ if (!tmp) return EPERM; /*FALLTHROUGH*/ case PT_CONTINUE: case PT_KILL: case PT_DETACH: case PT_LWPINFO: case PT_SYSCALL: case PT_SYSCALLEMU: case PT_DUMPCORE: #ifdef PT_STEP case PT_STEP: case PT_SETSTEP: case PT_CLEARSTEP: #endif case PT_SET_EVENT_MASK: case PT_GET_EVENT_MASK: case PT_GET_PROCESS_STATE: case PT_RESUME: case PT_SUSPEND: case PT_STOP: case PT_LWPSTATUS: case PT_LWPNEXT: case PT_SET_SIGPASS: case PT_GET_SIGPASS: /* * You can't do what you want to the process if: * (1) It's not being traced at all, */ if (!ISSET(t->p_slflag, PSL_TRACED)) return EPERM; /* * (2) it's not being traced by _you_, or */ if (t->p_pptr != p) { DPRINTF(("parent %d != %d\n", t->p_pptr->p_pid, p->p_pid)); return EBUSY; } /* * (3) it's not currently stopped. * * As an exception allow PT_KILL and PT_STOP here. */ if (req != PT_KILL && req != PT_STOP && (t->p_stat != SSTOP || !t->p_waited /* XXXSMP */)) { DPRINTF(("stat %d flag %d\n", t->p_stat, !t->p_waited)); return EBUSY; } return 0; default: /* It was not a legal request. 
*/ return EINVAL; } } static int ptrace_needs_hold(int req) { switch (req) { #ifdef PT_STEP case PT_STEP: #endif case PT_CONTINUE: case PT_DETACH: case PT_KILL: case PT_SYSCALL: case PT_SYSCALLEMU: case PT_ATTACH: case PT_TRACE_ME: case PT_GET_SIGINFO: case PT_SET_SIGINFO: case PT_STOP: return 1; default: return 0; } } static int ptrace_get_siginfo(struct proc *t, struct ptrace_methods *ptm, void *addr, size_t data) { struct ptrace_siginfo psi; memset(&psi, 0, sizeof(psi)); psi.psi_siginfo._info = t->p_sigctx.ps_info; psi.psi_lwpid = t->p_sigctx.ps_lwp; DPRINTF(("%s: lwp=%d signal=%d\n", __func__, psi.psi_lwpid, psi.psi_siginfo.si_signo)); return ptm->ptm_copyout_siginfo(&psi, addr, data); } static int ptrace_set_siginfo(struct proc *t, struct lwp **lt, struct ptrace_methods *ptm, void *addr, size_t data) { struct ptrace_siginfo psi; int error = ptm->ptm_copyin_siginfo(&psi, addr, data); if (error) return error; /* Check that the data is a valid signal number or zero. */ if (psi.psi_siginfo.si_signo < 0 || psi.psi_siginfo.si_signo >= NSIG) return EINVAL; t->p_sigctx.ps_faked = true; t->p_sigctx.ps_info = psi.psi_siginfo._info; t->p_sigctx.ps_lwp = psi.psi_lwpid; DPRINTF(("%s: lwp=%d signal=%d\n", __func__, psi.psi_lwpid, psi.psi_siginfo.si_signo)); return 0; } static int ptrace_get_sigpass(struct proc *t, void *addr, size_t data) { sigset_t set; if (data > sizeof(set) || data <= 0) { DPRINTF(("%s: invalid data: %zu < %zu <= 0\n", __func__, sizeof(set), data)); return EINVAL; } set = t->p_sigctx.ps_sigpass; return copyout(&set, addr, data); } static int ptrace_set_sigpass(struct proc *t, void *addr, size_t data) { sigset_t set; int error; if (data > sizeof(set) || data <= 0) { DPRINTF(("%s: invalid data: %zu < %zu <= 0\n", __func__, sizeof(set), data)); return EINVAL; } memset(&set, 0, sizeof(set)); if ((error = copyin(addr, &set, data))) return error; /* We catch SIGSTOP and cannot intercept SIGKILL. */ sigminusset(&sigcantmask, &set); t->p_sigctx.ps_sigpass = set; return 0; } static int ptrace_get_event_mask(struct proc *t, void *addr, size_t data) { struct ptrace_event pe; if (data != sizeof(pe)) { DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pe))); return EINVAL; } memset(&pe, 0, sizeof(pe)); pe.pe_set_event = ISSET(t->p_slflag, PSL_TRACEFORK) ? PTRACE_FORK : 0; pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEVFORK) ? PTRACE_VFORK : 0; pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEVFORK_DONE) ? PTRACE_VFORK_DONE : 0; pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACELWP_CREATE) ? PTRACE_LWP_CREATE : 0; pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACELWP_EXIT) ? PTRACE_LWP_EXIT : 0; pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEPOSIX_SPAWN) ? 
PTRACE_POSIX_SPAWN : 0; DPRINTF(("%s: lwp=%d event=%#x\n", __func__, t->p_sigctx.ps_lwp, pe.pe_set_event)); return copyout(&pe, addr, sizeof(pe)); } static int ptrace_set_event_mask(struct proc *t, void *addr, size_t data) { struct ptrace_event pe; int error; if (data != sizeof(pe)) { DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pe))); return EINVAL; } if ((error = copyin(addr, &pe, sizeof(pe))) != 0) return error; DPRINTF(("%s: lwp=%d event=%#x\n", __func__, t->p_sigctx.ps_lwp, pe.pe_set_event)); if (pe.pe_set_event & PTRACE_FORK) SET(t->p_slflag, PSL_TRACEFORK); else CLR(t->p_slflag, PSL_TRACEFORK); if (pe.pe_set_event & PTRACE_VFORK) SET(t->p_slflag, PSL_TRACEVFORK); else CLR(t->p_slflag, PSL_TRACEVFORK); if (pe.pe_set_event & PTRACE_VFORK_DONE) SET(t->p_slflag, PSL_TRACEVFORK_DONE); else CLR(t->p_slflag, PSL_TRACEVFORK_DONE); if (pe.pe_set_event & PTRACE_LWP_CREATE) SET(t->p_slflag, PSL_TRACELWP_CREATE); else CLR(t->p_slflag, PSL_TRACELWP_CREATE); if (pe.pe_set_event & PTRACE_LWP_EXIT) SET(t->p_slflag, PSL_TRACELWP_EXIT); else CLR(t->p_slflag, PSL_TRACELWP_EXIT); if (pe.pe_set_event & PTRACE_POSIX_SPAWN) SET(t->p_slflag, PSL_TRACEPOSIX_SPAWN); else CLR(t->p_slflag, PSL_TRACEPOSIX_SPAWN); return 0; } static int ptrace_get_process_state(struct proc *t, void *addr, size_t data) { struct _ksiginfo *si; struct ptrace_state ps; if (data != sizeof(ps)) { DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(ps))); return EINVAL; } if (t->p_sigctx.ps_info._signo != SIGTRAP || (t->p_sigctx.ps_info._code != TRAP_CHLD && t->p_sigctx.ps_info._code != TRAP_LWP)) { memset(&ps, 0, sizeof(ps)); } else { si = &t->p_sigctx.ps_info; KASSERT(si->_reason._ptrace_state._pe_report_event > 0); KASSERT(si->_reason._ptrace_state._option._pe_other_pid > 0); ps.pe_report_event = si->_reason._ptrace_state._pe_report_event; CTASSERT(sizeof(ps.pe_other_pid) == sizeof(ps.pe_lwp)); ps.pe_other_pid = si->_reason._ptrace_state._option._pe_other_pid; } DPRINTF(("%s: lwp=%d event=%#x pid=%d lwp=%d\n", __func__, t->p_sigctx.ps_lwp, ps.pe_report_event, ps.pe_other_pid, ps.pe_lwp)); return copyout(&ps, addr, sizeof(ps)); } static int ptrace_lwpinfo(struct proc *t, struct lwp **lt, void *addr, size_t data) { struct ptrace_lwpinfo pl; if (data != sizeof(pl)) { DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pl))); return EINVAL; } int error = copyin(addr, &pl, sizeof(pl)); if (error) return error; lwpid_t tmp = pl.pl_lwpid; lwp_delref(*lt); mutex_enter(t->p_lock); if (tmp == 0) *lt = lwp_find_first(t); else { *lt = lwp_find(t, tmp); if (*lt == NULL) { mutex_exit(t->p_lock); return ESRCH; } *lt = LIST_NEXT(*lt, l_sibling); } while (*lt != NULL && (!lwp_alive(*lt) || ((*lt)->l_flag & LW_SYSTEM) != 0)) *lt = LIST_NEXT(*lt, l_sibling); pl.pl_lwpid = 0; pl.pl_event = 0; if (*lt) { lwp_addref(*lt); pl.pl_lwpid = (*lt)->l_lid; if ((*lt)->l_flag & LW_WSUSPEND) pl.pl_event = PL_EVENT_SUSPENDED; /* * If we match the lwp, or it was sent to every lwp, * we set PL_EVENT_SIGNAL. * XXX: ps_lwp == 0 means everyone and noone, so * check ps_signo too. 
*/ else if ((*lt)->l_lid == t->p_sigctx.ps_lwp || (t->p_sigctx.ps_lwp == 0 && t->p_sigctx.ps_info._signo)) { DPRINTF(("%s: lwp=%d siglwp=%d signo %d\n", __func__, pl.pl_lwpid, t->p_sigctx.ps_lwp, t->p_sigctx.ps_info._signo)); pl.pl_event = PL_EVENT_SIGNAL; } } mutex_exit(t->p_lock); DPRINTF(("%s: lwp=%d event=%#x\n", __func__, pl.pl_lwpid, pl.pl_event)); return copyout(&pl, addr, sizeof(pl)); } static int ptrace_lwpstatus(struct proc *t, struct ptrace_methods *ptm, struct lwp **lt, void *addr, size_t data, bool next) { struct ptrace_lwpstatus pls; struct lwp *l; int error; if (data > sizeof(pls) || data < sizeof(lwpid_t)) { DPRINTF(("%s: invalid data: %zu < %zu < %zu\n", __func__, sizeof(lwpid_t), data, sizeof(pls))); return EINVAL; } error = copyin(addr, &pls.pl_lwpid, sizeof(lwpid_t)); if (error) return error; if (next) { lwp_delref(*lt); lwpid_t tmp = pls.pl_lwpid; mutex_enter(t->p_lock); if (tmp == 0) *lt = lwp_find_first(t); else { *lt = lwp_find(t, tmp); if (*lt == NULL) { mutex_exit(t->p_lock); return ESRCH; } *lt = LIST_NEXT(*lt, l_sibling); } while (*lt != NULL && (!lwp_alive(*lt) || ((*lt)->l_flag & LW_SYSTEM) != 0)) *lt = LIST_NEXT(*lt, l_sibling); if (*lt == NULL) { memset(&pls, 0, sizeof(pls)); mutex_exit(t->p_lock); goto out; } lwp_addref(*lt); mutex_exit(t->p_lock); pls.pl_lwpid = (*lt)->l_lid; } else { if ((error = ptrace_update_lwp(t, lt, pls.pl_lwpid)) != 0) return error; } l = *lt; ptrace_read_lwpstatus(l, &pls); out: DPRINTF(("%s: lwp=%d sigpend=%02x%02x%02x%02x sigmask=%02x%02x%02x%02x " "name='%s' private=%p\n", __func__, pls.pl_lwpid, pls.pl_sigpend.__bits[0], pls.pl_sigpend.__bits[1], pls.pl_sigpend.__bits[2], pls.pl_sigpend.__bits[3], pls.pl_sigmask.__bits[0], pls.pl_sigmask.__bits[1], pls.pl_sigmask.__bits[2], pls.pl_sigmask.__bits[3], pls.pl_name, pls.pl_private)); return ptm->ptm_copyout_lwpstatus(&pls, addr, data); } static int ptrace_startstop(struct proc *t, struct lwp **lt, int rq, void *addr, size_t data) { int error; if ((error = ptrace_update_lwp(t, lt, data)) != 0) return error; DPRINTF(("%s: lwp=%d request=%d\n", __func__, (*lt)->l_lid, rq)); lwp_lock(*lt); if (rq == PT_SUSPEND) (*lt)->l_flag |= LW_DBGSUSPEND; else { (*lt)->l_flag &= ~LW_DBGSUSPEND; if ((*lt)->l_flag != LSSUSPENDED) (*lt)->l_stat = LSSTOP; } lwp_unlock(*lt); return 0; } #ifdef PT_REGISTERS static int ptrace_uio_dir(int req) { switch (req) { case_PT_GETREGS case_PT_GETFPREGS case_PT_GETDBREGS return UIO_READ; case_PT_SETREGS case_PT_SETFPREGS case_PT_SETDBREGS return UIO_WRITE; default: return -1; } } static int ptrace_regs(struct lwp *l, struct lwp **lt, int rq, struct ptrace_methods *ptm, void *addr, size_t data) { int error; struct proc *p, *t; struct vmspace *vm; p = l->l_proc; /* tracer */ t = (*lt)->l_proc; /* traced */ if ((error = ptrace_update_lwp(t, lt, data)) != 0) return error; int dir = ptrace_uio_dir(rq); size_t size; int (*func)(struct lwp *, struct lwp *, struct uio *); DPRINTF(("%s: lwp=%d request=%d\n", __func__, l->l_lid, rq)); switch (rq) { #if defined(PT_SETREGS) || defined(PT_GETREGS) case_PT_GETREGS case_PT_SETREGS if (!process_validregs(*lt)) return EINVAL; size = PROC_REGSZ(p); func = ptm->ptm_doregs; break; #endif #if defined(PT_SETFPREGS) || defined(PT_GETFPREGS) case_PT_GETFPREGS case_PT_SETFPREGS if (!process_validfpregs(*lt)) return EINVAL; size = PROC_FPREGSZ(p); func = ptm->ptm_dofpregs; break; #endif #if defined(PT_SETDBREGS) || defined(PT_GETDBREGS) case_PT_GETDBREGS case_PT_SETDBREGS if (!process_validdbregs(*lt)) return EINVAL; size = PROC_DBREGSZ(p); 
func = ptm->ptm_dodbregs; break; #endif default: return EINVAL; } error = proc_vmspace_getref(l->l_proc, &vm); if (error) return error; struct uio uio; struct iovec iov; iov.iov_base = addr; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = iov.iov_len; uio.uio_rw = dir; uio.uio_vmspace = vm; error = (*func)(l, *lt, &uio); uvmspace_free(vm); return error; } #endif static int ptrace_sendsig(struct lwp *l, int req, struct proc *t, struct lwp *lt, int signo, int resume_all) { ksiginfo_t ksi; /* Finally, deliver the requested signal (or none). */ if (t->p_stat == SSTOP) { /* * Unstop the process. If it needs to take a * signal, make all efforts to ensure that at * an LWP runs to see it. */ t->p_xsig = signo; /* * signo > 0 check prevents a potential panic, as * sigismember(&...,0) is invalid check and signo * can be equal to 0 as a special case of no-signal. */ if (signo > 0 && sigismember(&stopsigmask, signo)) { t->p_waited = 0; child_psignal(t, 0); } else if (resume_all) proc_unstop(t); else lwp_unstop(lt); return 0; } KASSERT(req == PT_KILL || req == PT_STOP || req == PT_ATTACH); KSI_INIT(&ksi); ksi.ksi_signo = signo; ksi.ksi_code = SI_USER; ksi.ksi_pid = l->l_proc->p_pid; ksi.ksi_uid = kauth_cred_geteuid(l->l_cred); t->p_sigctx.ps_faked = false; DPRINTF(("%s: pid=%d.%d signal=%d resume_all=%d\n", __func__, t->p_pid, lt->l_lid, signo, resume_all)); return kpsignal2(t, &ksi); } static int ptrace_dumpcore(struct lwp *lt, char *path, size_t len) { int error; if (path != NULL) { if (len >= MAXPATHLEN) return EINVAL; char *src = path; path = kmem_alloc(len + 1, KM_SLEEP); error = copyin(src, path, len); if (error) goto out; path[len] = '\0'; } DPRINTF(("%s: lwp=%d\n", __func__, lt->l_lid)); MODULE_HOOK_CALL(coredump_hook, (lt, path), 0, error); out: if (path) kmem_free(path, len + 1); return error; } static int ptrace_doio(struct lwp *l, struct proc *t, struct lwp *lt, struct ptrace_io_desc *piod, void *addr, bool sysspace) { struct uio uio; struct iovec iov; int error, tmp; error = 0; iov.iov_base = piod->piod_addr; iov.iov_len = piod->piod_len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = (off_t)(unsigned long)piod->piod_offs; uio.uio_resid = piod->piod_len; DPRINTF(("%s: lwp=%d request=%d\n", __func__, l->l_lid, piod->piod_op)); switch (piod->piod_op) { case PIOD_READ_D: case PIOD_READ_I: uio.uio_rw = UIO_READ; break; case PIOD_WRITE_D: case PIOD_WRITE_I: /* * Can't write to a RAS */ if (ras_lookup(t, addr) != (void *)-1) { return EACCES; } uio.uio_rw = UIO_WRITE; break; case PIOD_READ_AUXV: uio.uio_rw = UIO_READ; tmp = t->p_execsw->es_arglen; if (uio.uio_offset > tmp) return EIO; if (uio.uio_resid > tmp - uio.uio_offset) uio.uio_resid = tmp - uio.uio_offset; piod->piod_len = iov.iov_len = uio.uio_resid; error = process_auxv_offset(t, &uio); break; default: error = EINVAL; break; } if (error) return error; if (sysspace) { uio.uio_vmspace = vmspace_kernel(); } else { error = proc_vmspace_getref(l->l_proc, &uio.uio_vmspace); if (error) return error; } error = process_domem(l, lt, &uio); if (!sysspace) uvmspace_free(uio.uio_vmspace); if (error) return error; piod->piod_len -= uio.uio_resid; return 0; } int do_ptrace(struct ptrace_methods *ptm, struct lwp *l, int req, pid_t pid, void *addr, int data, register_t *retval) { struct proc *p = l->l_proc; struct lwp *lt = NULL; struct lwp *lt2; struct proc *t; /* target process */ struct ptrace_io_desc piod; int error, write, tmp, pheld; int signo = 0; int resume_all; bool locked; error = 0; /* 
* If attaching or detaching, we need to get a write hold on the * proclist lock so that we can re-parent the target process. */ mutex_enter(&proc_lock); t = ptrace_find(l, req, pid); if (t == NULL) { mutex_exit(&proc_lock); return ESRCH; } pheld = 1; if ((error = ptrace_allowed(l, req, t, p, &locked)) != 0) goto out; if ((error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_PTRACE, t, KAUTH_ARG(req), NULL, NULL)) != 0) goto out; if ((lt = lwp_find_first(t)) == NULL) { error = ESRCH; goto out; } /* Do single-step fixup if needed. */ FIX_SSTEP(t); KASSERT(lt != NULL); lwp_addref(lt); /* * Which locks do we need held? XXX Ugly. */ if ((pheld = ptrace_needs_hold(req)) == 0) { mutex_exit(t->p_lock); mutex_exit(&proc_lock); } /* Now do the operation. */ write = 0; *retval = 0; tmp = 0; resume_all = 1; switch (req) { case PT_TRACE_ME: /* Just set the trace flag. */ SET(t->p_slflag, PSL_TRACED); t->p_opptr = t->p_pptr; break; /* * The I and D separate address space has been inherited from PDP-11. * The 16-bit UNIX started with a single address space per program, * but was extended to two 16-bit (2 x 64kb) address spaces. * * We no longer maintain this feature in maintained architectures, but * we keep the API for backward compatibility. Currently the I and D * operations are exactly the same and not distinguished in debuggers. */ case PT_WRITE_I: case PT_WRITE_D: write = 1; tmp = data; /* FALLTHROUGH */ case PT_READ_I: case PT_READ_D: piod.piod_addr = &tmp; piod.piod_len = sizeof(tmp); piod.piod_offs = addr; piod.piod_op = write ? PIOD_WRITE_D : PIOD_READ_D; if ((error = ptrace_doio(l, t, lt, &piod, addr, true)) != 0) break; /* * For legacy reasons we treat here two results as success: * - incomplete transfer piod.piod_len < sizeof(tmp) * - no transfer piod.piod_len == 0 * * This means that there is no way to determine whether * transfer operation was performed in PT_WRITE and PT_READ * calls. */ if (!write) *retval = tmp; break; case PT_IO: if ((error = ptm->ptm_copyin_piod(&piod, addr, data)) != 0) break; if (piod.piod_len < 1) { error = EINVAL; break; } if ((error = ptrace_doio(l, t, lt, &piod, addr, false)) != 0) break; /* * For legacy reasons we treat here two results as success: * - incomplete transfer piod.piod_len < sizeof(tmp) * - no transfer piod.piod_len == 0 */ error = ptm->ptm_copyout_piod(&piod, addr, data); break; case PT_DUMPCORE: error = ptrace_dumpcore(lt, addr, data); break; #ifdef PT_STEP case PT_STEP: /* * From the 4.4BSD PRM: * "Execution continues as in request PT_CONTINUE; however * as soon as possible after execution of at least one * instruction, execution stops again. [ ... ]" */ #endif case PT_CONTINUE: case PT_SYSCALL: case PT_DETACH: if (req == PT_SYSCALL) { if (!ISSET(t->p_slflag, PSL_SYSCALL)) { SET(t->p_slflag, PSL_SYSCALL); #ifdef __HAVE_SYSCALL_INTERN (*t->p_emul->e_syscall_intern)(t); #endif } } else { if (ISSET(t->p_slflag, PSL_SYSCALL)) { CLR(t->p_slflag, PSL_SYSCALL); #ifdef __HAVE_SYSCALL_INTERN (*t->p_emul->e_syscall_intern)(t); #endif } } t->p_trace_enabled = trace_is_enabled(t); /* * Pick up the LWPID, if supplied. There are two cases: * data < 0 : step or continue single thread, lwp = -data * data > 0 in PT_STEP : step this thread, continue others * For operations other than PT_STEP, data > 0 means * data is the signo to deliver to the process. 
*/ tmp = data; if (tmp >= 0) { #ifdef PT_STEP if (req == PT_STEP) signo = 0; else #endif { signo = tmp; tmp = 0; /* don't search for LWP */ } } else if (tmp == INT_MIN) { error = ESRCH; break; } else { tmp = -tmp; } if (tmp > 0) { if (req == PT_DETACH) { error = EINVAL; break; } lwp_delref2 (lt); lt = lwp_find(t, tmp); if (lt == NULL) { error = ESRCH; break; } lwp_addref(lt); resume_all = 0; signo = 0; } /* * From the 4.4BSD PRM: * "The data argument is taken as a signal number and the * child's execution continues at location addr as if it * incurred that signal. Normally the signal number will * be either 0 to indicate that the signal that caused the * stop should be ignored, or that value fetched out of * the process's image indicating which signal caused * the stop. If addr is (int *)1 then execution continues * from where it stopped." */ /* Check that the data is a valid signal number or zero. */ if (signo < 0 || signo >= NSIG) { error = EINVAL; break; } /* Prevent process deadlock */ if (resume_all) { #ifdef PT_STEP if (req == PT_STEP) { if (lt->l_flag & (LW_WSUSPEND | LW_DBGSUSPEND)) { error = EDEADLK; break; } } else #endif { error = EDEADLK; LIST_FOREACH(lt2, &t->p_lwps, l_sibling) { if ((lt2->l_flag & (LW_WSUSPEND | LW_DBGSUSPEND)) == 0 ) { error = 0; break; } } if (error != 0) break; } } else { if (lt->l_flag & (LW_WSUSPEND | LW_DBGSUSPEND)) { error = EDEADLK; break; } } /* * Reject setting program counter to 0x0 if VA0 is disabled. * * Not all kernels implement this feature to set Program * Counter in one go in PT_CONTINUE and similar operations. * This causes portability issues as passing address 0x0 * on these kernels is no-operation, but can cause failure * in most cases on NetBSD. */ if (user_va0_disable && addr == 0) { error = EINVAL; break; } /* If the address parameter is not (int *)1, set the pc. */ if ((int *)addr != (int *)1) { error = process_set_pc(lt, addr); if (error != 0) break; } #ifdef PT_STEP /* * Arrange for a single-step, if that's requested and possible. * More precisely, set the single step status as requested for * the requested thread, and clear it for other threads. */ LIST_FOREACH(lt2, &t->p_lwps, l_sibling) { error = process_sstep(lt2, ISSET(lt2->l_pflag, LP_SINGLESTEP)); if (error) break; } if (error) break; error = process_sstep(lt, ISSET(lt->l_pflag, LP_SINGLESTEP) || req == PT_STEP); if (error) break; #endif if (req == PT_DETACH) { CLR(t->p_slflag, PSL_TRACED|PSL_TRACEDCHILD|PSL_SYSCALL); /* clear sigpass mask */ sigemptyset(&t->p_sigctx.ps_sigpass); /* give process back to original parent or init */ if (t->p_opptr != t->p_pptr) { struct proc *pp = t->p_opptr; proc_reparent(t, pp ? pp : initproc); } /* not being traced any more */ t->p_opptr = NULL; /* clear single step */ LIST_FOREACH(lt2, &t->p_lwps, l_sibling) { CLR(lt2->l_pflag, LP_SINGLESTEP); } CLR(lt->l_pflag, LP_SINGLESTEP); } sendsig: error = ptrace_sendsig(l, req, t, lt, signo, resume_all); break; case PT_SYSCALLEMU: if (!ISSET(t->p_slflag, PSL_SYSCALL) || t->p_stat != SSTOP) { error = EINVAL; break; } SET(t->p_slflag, PSL_SYSCALLEMU); break; #ifdef PT_STEP case PT_SETSTEP: write = 1; /* FALLTHROUGH */ case PT_CLEARSTEP: /* write = 0 done above. */ if ((error = ptrace_update_lwp(t, &lt, data)) != 0) break; if (write) SET(lt->l_pflag, LP_SINGLESTEP); else CLR(lt->l_pflag, LP_SINGLESTEP); break; #endif case PT_KILL: /* just send the process a KILL signal. */ signo = SIGKILL; goto sendsig; /* in PT_CONTINUE, above. */ case PT_STOP: /* just send the process a STOP signal. 
*/ signo = SIGSTOP; goto sendsig; /* in PT_CONTINUE, above. */ case PT_ATTACH: /* * Go ahead and set the trace flag. * Save the old parent (it's reset in * _DETACH, and also in kern_exit.c:wait4() * Reparent the process so that the tracing * proc gets to see all the action. * Stop the target. */ proc_changeparent(t, p); signo = SIGSTOP; goto sendsig; case PT_GET_EVENT_MASK: error = ptrace_get_event_mask(t, addr, data); break; case PT_SET_EVENT_MASK: error = ptrace_set_event_mask(t, addr, data); break; case PT_GET_PROCESS_STATE: error = ptrace_get_process_state(t, addr, data); break; case PT_LWPINFO: error = ptrace_lwpinfo(t, &lt, addr, data); break; case PT_SET_SIGINFO: error = ptrace_set_siginfo(t, &lt, ptm, addr, data); break; case PT_GET_SIGINFO: error = ptrace_get_siginfo(t, ptm, addr, data); break; case PT_RESUME: case PT_SUSPEND: error = ptrace_startstop(t, &lt, req, addr, data); break; case PT_LWPSTATUS: error = ptrace_lwpstatus(t, ptm, &lt, addr, data, false); break; case PT_LWPNEXT: error = ptrace_lwpstatus(t, ptm, &lt, addr, data, true); break; case PT_SET_SIGPASS: error = ptrace_set_sigpass(t, addr, data); break; case PT_GET_SIGPASS: error = ptrace_get_sigpass(t, addr, data); break; #ifdef PT_REGISTERS case_PT_SETREGS case_PT_GETREGS case_PT_SETFPREGS case_PT_GETFPREGS case_PT_SETDBREGS case_PT_GETDBREGS error = ptrace_regs(l, &lt, req, ptm, addr, data); break; #endif #ifdef __HAVE_PTRACE_MACHDEP PTRACE_MACHDEP_REQUEST_CASES error = ptrace_machdep_dorequest(l, &lt, req, addr, data); break; #endif } out: if (pheld) { mutex_exit(t->p_lock); mutex_exit(&proc_lock); } if (lt != NULL) lwp_delref(lt); if (locked) rw_exit(&t->p_reflock); return error; } static int process_auxv_offset(struct proc *p, struct uio *uio) { struct ps_strings pss; int error; off_t off = (off_t)p->p_psstrp; if ((error = copyin_psstrings(p, &pss)) != 0) return error; if (pss.ps_envstr == NULL) return EIO; #ifdef COMPAT_NETBSD32 if (p->p_flag & PK_32) uio->uio_offset += (off_t)((vaddr_t)pss.ps_envstr + sizeof(uint32_t) * (pss.ps_nenvstr + 1)); else #endif uio->uio_offset += (off_t)(vaddr_t)(pss.ps_envstr + pss.ps_nenvstr + 1); #ifdef __MACHINE_STACK_GROWS_UP if (uio->uio_offset < off) return EIO; #else if (uio->uio_offset > off) return EIO; if ((uio->uio_offset + uio->uio_resid) > off) uio->uio_resid = off - uio->uio_offset; #endif return 0; } MODULE(MODULE_CLASS_EXEC, ptrace_common, NULL); static int ptrace_common_init(void) { #if 0 mutex_init(&ptrace_mtx, MUTEX_DEFAULT, IPL_NONE); cv_init(&ptrace_cv, "ptracecb"); ptrace_cbref = 0; #endif ptrace_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS, ptrace_listener_cb, NULL); return 0; } static int ptrace_common_fini(void) { kauth_unlisten_scope(ptrace_listener); #if 0 /* Make sure no-one is executing our kauth listener */ mutex_enter(&ptrace_mtx); while (ptrace_cbref != 0) cv_wait(&ptrace_cv, &ptrace_mtx); mutex_exit(&ptrace_mtx); mutex_destroy(&ptrace_mtx); cv_destroy(&ptrace_cv); #endif return 0; } static int ptrace_common_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = ptrace_common_init(); break; case MODULE_CMD_FINI: error = ptrace_common_fini(); break; default: ptrace_hooks(); error = ENOTTY; break; } return error; }
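/*
 * Illustrative sketch (not part of the original file): a minimal userland
 * fragment showing how the PT_ATTACH, PT_IO and PT_DETACH requests handled
 * by do_ptrace()/ptrace_doio() above are typically driven by a debugger.
 * The target pid and the address being read are placeholders supplied by
 * the caller; error handling is reduced to the bare minimum.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

void
peek_word(pid_t pid, void *addr)
{
	struct ptrace_io_desc piod;
	long word;
	int status;

	/* Attach: the target is stopped with SIGSTOP (see PT_ATTACH above). */
	if (ptrace(PT_ATTACH, pid, NULL, 0) == -1)
		err(EXIT_FAILURE, "PT_ATTACH");
	if (waitpid(pid, &status, 0) == -1)
		err(EXIT_FAILURE, "waitpid");

	/* Read sizeof(word) bytes from the target's data space via PT_IO. */
	piod.piod_op = PIOD_READ_D;
	piod.piod_offs = addr;		/* address in the traced process */
	piod.piod_addr = &word;		/* buffer in the tracing process */
	piod.piod_len = sizeof(word);
	if (ptrace(PT_IO, pid, &piod, 0) == -1)
		err(EXIT_FAILURE, "PT_IO");

	/* piod_len is updated to the number of bytes actually transferred. */
	printf("%p: %#lx (%zu bytes transferred)\n", addr,
	    (unsigned long)word, piod.piod_len);

	/* Detach; addr == (void *)1 continues from where the target stopped. */
	if (ptrace(PT_DETACH, pid, (void *)1, 0) == -1)
		err(EXIT_FAILURE, "PT_DETACH");
}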
/* $NetBSD: pktqueue.c,v 1.22 2023/05/28 08:09:34 andvar Exp $ */ /*- * Copyright (c) 2014 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * The packet queue (pktqueue) interface is a lockless IP input queue * which also abstracts and handles network ISR scheduling. It provides * a mechanism to enable receiver-side packet steering (RPS). */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: pktqueue.c,v 1.22 2023/05/28 08:09:34 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/types.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/pcq.h> #include <sys/intr.h> #include <sys/mbuf.h> #include <sys/proc.h> #include <sys/percpu.h> #include <sys/xcall.h> #include <sys/once.h> #include <sys/queue.h> #include <sys/rwlock.h> #include <net/pktqueue.h> #include <net/rss_config.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip6.h> struct pktqueue { /* * The lock used for a barrier mechanism. The barrier counter, * as well as the drop counter, are managed atomically though. * Ensure this group is in a separate cache line. */ union { struct { kmutex_t pq_lock; volatile u_int pq_barrier; }; uint8_t _pad[COHERENCY_UNIT]; }; /* The size of the queue, counters and the interrupt handler. */ u_int pq_maxlen; percpu_t * pq_counters; void * pq_sih; /* The per-CPU queues. */ struct percpu * pq_pcq; /* struct pcq * */ /* The linkage on the list of all pktqueues. */ LIST_ENTRY(pktqueue) pq_list; }; /* The counters of the packet queue. */ #define PQCNT_ENQUEUE 0 #define PQCNT_DEQUEUE 1 #define PQCNT_DROP 2 #define PQCNT_NCOUNTERS 3 typedef struct { uint64_t count[PQCNT_NCOUNTERS]; } pktq_counters_t; /* Special marker value used by pktq_barrier() mechanism. */ #define PKTQ_MARKER ((void *)(~0ULL)) /* * This is a list of all pktqueues. This list is used by * pktq_ifdetach() to issue a barrier on every pktqueue. * * The r/w lock is acquired for writing in pktq_create() and * pktq_destroy(), and for reading in pktq_ifdetach(). * * This list is not performance critical, and will seldom be * accessed. 
*/ static LIST_HEAD(, pktqueue) pktqueue_list __read_mostly; static krwlock_t pktqueue_list_lock __read_mostly; static once_t pktqueue_list_init_once __read_mostly; static int pktqueue_list_init(void) { LIST_INIT(&pktqueue_list); rw_init(&pktqueue_list_lock); return 0; } static void pktq_init_cpu(void *vqp, void *vpq, struct cpu_info *ci) { struct pcq **qp = vqp; struct pktqueue *pq = vpq; *qp = pcq_create(pq->pq_maxlen, KM_SLEEP); } static void pktq_fini_cpu(void *vqp, void *vpq, struct cpu_info *ci) { struct pcq **qp = vqp, *q = *qp; KASSERT(pcq_peek(q) == NULL); pcq_destroy(q); *qp = NULL; /* paranoia */ } static struct pcq * pktq_pcq(struct pktqueue *pq, struct cpu_info *ci) { struct pcq **qp, *q; /* * As long as preemption is disabled, the xcall to swap percpu * buffers can't complete, so it is safe to read the pointer. */ KASSERT(kpreempt_disabled()); qp = percpu_getptr_remote(pq->pq_pcq, ci); q = *qp; return q; } pktqueue_t * pktq_create(size_t maxlen, void (*intrh)(void *), void *sc) { const u_int sflags = SOFTINT_NET | SOFTINT_MPSAFE | SOFTINT_RCPU; pktqueue_t *pq; percpu_t *pc; void *sih; RUN_ONCE(&pktqueue_list_init_once, pktqueue_list_init); pc = percpu_alloc(sizeof(pktq_counters_t)); if ((sih = softint_establish(sflags, intrh, sc)) == NULL) { percpu_free(pc, sizeof(pktq_counters_t)); return NULL; } pq = kmem_zalloc(sizeof(*pq), KM_SLEEP); mutex_init(&pq->pq_lock, MUTEX_DEFAULT, IPL_NONE); pq->pq_maxlen = maxlen; pq->pq_counters = pc; pq->pq_sih = sih; pq->pq_pcq = percpu_create(sizeof(struct pcq *), pktq_init_cpu, pktq_fini_cpu, pq); rw_enter(&pktqueue_list_lock, RW_WRITER); LIST_INSERT_HEAD(&pktqueue_list, pq, pq_list); rw_exit(&pktqueue_list_lock); return pq; } void pktq_destroy(pktqueue_t *pq) { KASSERT(pktqueue_list_init_once.o_status == ONCE_DONE); rw_enter(&pktqueue_list_lock, RW_WRITER); LIST_REMOVE(pq, pq_list); rw_exit(&pktqueue_list_lock); percpu_free(pq->pq_pcq, sizeof(struct pcq *)); percpu_free(pq->pq_counters, sizeof(pktq_counters_t)); softint_disestablish(pq->pq_sih); mutex_destroy(&pq->pq_lock); kmem_free(pq, sizeof(*pq)); } /* * - pktq_inc_counter: increment the counter given an ID. * - pktq_collect_counts: handler to sum up the counts from each CPU. * - pktq_getcount: return the effective count given an ID. 
*/ static inline void pktq_inc_count(pktqueue_t *pq, u_int i) { percpu_t *pc = pq->pq_counters; pktq_counters_t *c; c = percpu_getref(pc); c->count[i]++; percpu_putref(pc); } static void pktq_collect_counts(void *mem, void *arg, struct cpu_info *ci) { const pktq_counters_t *c = mem; pktq_counters_t *sum = arg; int s = splnet(); for (u_int i = 0; i < PQCNT_NCOUNTERS; i++) { sum->count[i] += c->count[i]; } splx(s); } static uint64_t pktq_get_count(pktqueue_t *pq, pktq_count_t c) { pktq_counters_t sum; if (c != PKTQ_MAXLEN) { memset(&sum, 0, sizeof(sum)); percpu_foreach_xcall(pq->pq_counters, XC_HIGHPRI_IPL(IPL_SOFTNET), pktq_collect_counts, &sum); } switch (c) { case PKTQ_NITEMS: return sum.count[PQCNT_ENQUEUE] - sum.count[PQCNT_DEQUEUE]; case PKTQ_DROPS: return sum.count[PQCNT_DROP]; case PKTQ_MAXLEN: return pq->pq_maxlen; } return 0; } uint32_t pktq_rps_hash(const pktq_rps_hash_func_t *funcp, const struct mbuf *m) { pktq_rps_hash_func_t func = atomic_load_relaxed(funcp); KASSERT(func != NULL); return (*func)(m); } static uint32_t pktq_rps_hash_zero(const struct mbuf *m __unused) { return 0; } static uint32_t pktq_rps_hash_curcpu(const struct mbuf *m __unused) { return cpu_index(curcpu()); } static uint32_t pktq_rps_hash_toeplitz(const struct mbuf *m) { struct ip *ip; /* * Disable UDP port - IP fragments aren't currently being handled * and so we end up with a mix of 2-tuple and 4-tuple * traffic. */ const u_int flag = RSS_TOEPLITZ_USE_TCP_PORT; /* glance IP version */ if ((m->m_flags & M_PKTHDR) == 0) return 0; ip = mtod(m, struct ip *); if (ip->ip_v == IPVERSION) { if (__predict_false(m->m_len < sizeof(struct ip))) return 0; return rss_toeplitz_hash_from_mbuf_ipv4(m, flag); } else if (ip->ip_v == 6) { if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) return 0; return rss_toeplitz_hash_from_mbuf_ipv6(m, flag); } return 0; } /* * toeplitz without curcpu. * Generally, this has better performance than toeplitz. 
*/ static uint32_t pktq_rps_hash_toeplitz_othercpus(const struct mbuf *m) { uint32_t hash; if (ncpu == 1) return 0; hash = pktq_rps_hash_toeplitz(m); hash %= ncpu - 1; if (hash >= cpu_index(curcpu())) return hash + 1; else return hash; } static struct pktq_rps_hash_table { const char* prh_type; pktq_rps_hash_func_t prh_func; } const pktq_rps_hash_tab[] = { { "zero", pktq_rps_hash_zero }, { "curcpu", pktq_rps_hash_curcpu }, { "toeplitz", pktq_rps_hash_toeplitz }, { "toeplitz-othercpus", pktq_rps_hash_toeplitz_othercpus }, }; const pktq_rps_hash_func_t pktq_rps_hash_default = #ifdef NET_MPSAFE pktq_rps_hash_curcpu; #else pktq_rps_hash_zero; #endif static const char * pktq_get_rps_hash_type(pktq_rps_hash_func_t func) { for (int i = 0; i < __arraycount(pktq_rps_hash_tab); i++) { if (func == pktq_rps_hash_tab[i].prh_func) { return pktq_rps_hash_tab[i].prh_type; } } return NULL; } static int pktq_set_rps_hash_type(pktq_rps_hash_func_t *func, const char *type) { if (strcmp(type, pktq_get_rps_hash_type(*func)) == 0) return 0; for (int i = 0; i < __arraycount(pktq_rps_hash_tab); i++) { if (strcmp(type, pktq_rps_hash_tab[i].prh_type) == 0) { atomic_store_relaxed(func, pktq_rps_hash_tab[i].prh_func); return 0; } } return ENOENT; } int sysctl_pktq_rps_hash_handler(SYSCTLFN_ARGS) { struct sysctlnode node; pktq_rps_hash_func_t *func; int error; char type[PKTQ_RPS_HASH_NAME_LEN]; node = *rnode; func = node.sysctl_data; strlcpy(type, pktq_get_rps_hash_type(*func), PKTQ_RPS_HASH_NAME_LEN); node.sysctl_data = &type; node.sysctl_size = sizeof(type); error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; error = pktq_set_rps_hash_type(func, type); return error; } /* * pktq_enqueue: inject the packet into the end of the queue. * * => Must be called from the interrupt or with the preemption disabled. * => Consumes the packet and returns true on success. * => Returns false on failure; caller is responsible to free the packet. */ bool pktq_enqueue(pktqueue_t *pq, struct mbuf *m, const u_int hash __unused) { #if defined(_RUMPKERNEL) || defined(_RUMP_NATIVE_ABI) struct cpu_info *ci = curcpu(); #else struct cpu_info *ci = cpu_lookup(hash % ncpu); #endif KASSERT(kpreempt_disabled()); if (__predict_false(!pcq_put(pktq_pcq(pq, ci), m))) { pktq_inc_count(pq, PQCNT_DROP); return false; } softint_schedule_cpu(pq->pq_sih, ci); pktq_inc_count(pq, PQCNT_ENQUEUE); return true; } /* * pktq_dequeue: take a packet from the queue. * * => Must be called with preemption disabled. * => Must ensure there are not concurrent dequeue calls. */ struct mbuf * pktq_dequeue(pktqueue_t *pq) { struct cpu_info *ci = curcpu(); struct mbuf *m; KASSERT(kpreempt_disabled()); m = pcq_get(pktq_pcq(pq, ci)); if (__predict_false(m == PKTQ_MARKER)) { /* Note the marker entry. */ atomic_inc_uint(&pq->pq_barrier); /* Get the next queue entry. */ m = pcq_get(pktq_pcq(pq, ci)); /* * There can only be one barrier operation pending * on a pktqueue at any given time, so we can assert * that the next item is not a marker. */ KASSERT(m != PKTQ_MARKER); } if (__predict_true(m != NULL)) { pktq_inc_count(pq, PQCNT_DEQUEUE); } return m; } /* * pktq_barrier: waits for a grace period when all packets enqueued at * the moment of calling this routine will be processed. This is used * to ensure that e.g. packets referencing some interface were drained. 
*/ void pktq_barrier(pktqueue_t *pq) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; u_int pending = 0; mutex_enter(&pq->pq_lock); KASSERT(pq->pq_barrier == 0); for (CPU_INFO_FOREACH(cii, ci)) { struct pcq *q; kpreempt_disable(); q = pktq_pcq(pq, ci); kpreempt_enable(); /* If the queue is empty - nothing to do. */ if (pcq_peek(q) == NULL) { continue; } /* Otherwise, put the marker and entry. */ while (!pcq_put(q, PKTQ_MARKER)) { kpause("pktqsync", false, 1, NULL); } kpreempt_disable(); softint_schedule_cpu(pq->pq_sih, ci); kpreempt_enable(); pending++; } /* Wait for each queue to process the markers. */ while (pq->pq_barrier != pending) { kpause("pktqsync", false, 1, NULL); } pq->pq_barrier = 0; mutex_exit(&pq->pq_lock); } /* * pktq_ifdetach: issue a barrier on all pktqueues when a network * interface is detached. */ void pktq_ifdetach(void) { pktqueue_t *pq; /* Just in case no pktqueues have been created yet... */ RUN_ONCE(&pktqueue_list_init_once, pktqueue_list_init); rw_enter(&pktqueue_list_lock, RW_READER); LIST_FOREACH(pq, &pktqueue_list, pq_list) { pktq_barrier(pq); } rw_exit(&pktqueue_list_lock); } /* * pktq_flush: free mbufs in all queues. * * => The caller must ensure there are no concurrent writers or flush calls. */ void pktq_flush(pktqueue_t *pq) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; struct mbuf *m, *m0 = NULL; ASSERT_SLEEPABLE(); /* * Run a dummy softint at IPL_SOFTNET on all CPUs to ensure that any * already running handler for this pktqueue is no longer running. */ xc_barrier(XC_HIGHPRI_IPL(IPL_SOFTNET)); /* * Acquire the barrier lock. While the caller ensures that * no explicit pktq_barrier() calls will be issued, this holds * off any implicit pktq_barrier() calls that would happen * as the result of pktq_ifdetach(). */ mutex_enter(&pq->pq_lock); for (CPU_INFO_FOREACH(cii, ci)) { struct pcq *q; kpreempt_disable(); q = pktq_pcq(pq, ci); kpreempt_enable(); /* * Pull the packets off the pcq and chain them into * a list to be freed later. */ while ((m = pcq_get(q)) != NULL) { pktq_inc_count(pq, PQCNT_DEQUEUE); m->m_nextpkt = m0; m0 = m; } } mutex_exit(&pq->pq_lock); /* Free the packets now that the critical section is over. */ while ((m = m0) != NULL) { m0 = m->m_nextpkt; m_freem(m); } } static void pktq_set_maxlen_cpu(void *vpq, void *vqs) { struct pktqueue *pq = vpq; struct pcq **qp, *q, **qs = vqs; unsigned i = cpu_index(curcpu()); int s; s = splnet(); qp = percpu_getref(pq->pq_pcq); q = *qp; *qp = qs[i]; qs[i] = q; percpu_putref(pq->pq_pcq); splx(s); } /* * pktq_set_maxlen: create per-CPU queues using a new size and replace * the existing queues without losing any packets. * * XXX ncpu must remain stable throughout. */ int pktq_set_maxlen(pktqueue_t *pq, size_t maxlen) { const u_int slotbytes = ncpu * sizeof(pcq_t *); pcq_t **qs; if (!maxlen || maxlen > PCQ_MAXLEN) return EINVAL; if (pq->pq_maxlen == maxlen) return 0; /* First, allocate the new queues. */ qs = kmem_zalloc(slotbytes, KM_SLEEP); for (u_int i = 0; i < ncpu; i++) { qs[i] = pcq_create(maxlen, KM_SLEEP); } /* * Issue an xcall to replace the queue pointers on each CPU. * This implies all the necessary memory barriers. */ mutex_enter(&pq->pq_lock); xc_wait(xc_broadcast(XC_HIGHPRI, pktq_set_maxlen_cpu, pq, qs)); pq->pq_maxlen = maxlen; mutex_exit(&pq->pq_lock); /* * At this point, the new packets are flowing into the new * queues. However, the old queues may have some packets * present which are no longer being processed. We are going * to re-enqueue them. 
This may change the order of packet * arrival, but it is not considered an issue. * * There may be in-flight interrupts calling pktq_dequeue() * which reference the old queues. Issue a barrier to ensure * that we are going to be the only pcq_get() callers on the * old queues. */ pktq_barrier(pq); for (u_int i = 0; i < ncpu; i++) { struct pcq *q; struct mbuf *m; kpreempt_disable(); q = pktq_pcq(pq, cpu_lookup(i)); kpreempt_enable(); while ((m = pcq_get(qs[i])) != NULL) { while (!pcq_put(q, m)) { kpause("pktqrenq", false, 1, NULL); } } pcq_destroy(qs[i]); } /* Well, that was fun. */ kmem_free(qs, slotbytes); return 0; } static int sysctl_pktq_maxlen(SYSCTLFN_ARGS) { struct sysctlnode node = *rnode; pktqueue_t * const pq = node.sysctl_data; u_int nmaxlen = pktq_get_count(pq, PKTQ_MAXLEN); int error; node.sysctl_data = &nmaxlen; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; return pktq_set_maxlen(pq, nmaxlen); } static int sysctl_pktq_count(SYSCTLFN_ARGS, u_int count_id) { struct sysctlnode node = *rnode; pktqueue_t * const pq = node.sysctl_data; uint64_t count = pktq_get_count(pq, count_id); node.sysctl_data = &count; return sysctl_lookup(SYSCTLFN_CALL(&node)); } static int sysctl_pktq_nitems(SYSCTLFN_ARGS) { return sysctl_pktq_count(SYSCTLFN_CALL(rnode), PKTQ_NITEMS); } static int sysctl_pktq_drops(SYSCTLFN_ARGS) { return sysctl_pktq_count(SYSCTLFN_CALL(rnode), PKTQ_DROPS); } /* * pktqueue_sysctl_setup: set up the sysctl nodes for a pktqueue * using standardized names at the specified parent node and * node ID (or CTL_CREATE). */ void pktq_sysctl_setup(pktqueue_t * const pq, struct sysctllog ** const clog, const struct sysctlnode * const parent_node, const int qid) { const struct sysctlnode *rnode = parent_node, *cnode; KASSERT(pq != NULL); KASSERT(parent_node != NULL); KASSERT(qid == CTL_CREATE || qid >= 0); /* Create the "ifq" node below the parent node. */ sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ifq", SYSCTL_DESCR("Protocol input queue controls"), NULL, 0, NULL, 0, qid, CTL_EOL); /* Now create the standard child nodes below "ifq". */ rnode = cnode; sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "len", SYSCTL_DESCR("Current input queue length"), sysctl_pktq_nitems, 0, (void *)pq, 0, IFQCTL_LEN, CTL_EOL); sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "maxlen", SYSCTL_DESCR("Maximum allowed input queue length"), sysctl_pktq_maxlen, 0, (void *)pq, 0, IFQCTL_MAXLEN, CTL_EOL); sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "drops", SYSCTL_DESCR("Packets dropped due to full input queue"), sysctl_pktq_drops, 0, (void *)pq, 0, IFQCTL_DROPS, CTL_EOL); }
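/*
 * Illustrative sketch (not part of the original file): how a protocol input
 * path would typically wire up a pktqueue created by this file.  The names
 * example_pktq, exampleintr, example_input, example_init and example_enqueue
 * are placeholders, not existing kernel symbols; real users (e.g. the IP
 * input path) follow the same create/enqueue/softint-dequeue pattern.
 */
#if 0
static pktqueue_t *example_pktq __read_mostly;

static void example_input(struct mbuf *);	/* hypothetical per-packet handler */

static void
exampleintr(void *arg)
{
	struct mbuf *m;

	/* Softint handler established by pktq_create(): drain this CPU's queue. */
	while ((m = pktq_dequeue(example_pktq)) != NULL)
		example_input(m);
}

static void
example_init(void)
{
	/* One softint-backed queue, at most 256 packets per CPU. */
	example_pktq = pktq_create(256, exampleintr, NULL);
	KASSERT(example_pktq != NULL);
}

static void
example_enqueue(struct mbuf *m)
{
	uint32_t hash;

	/* Pick a target CPU via the configured RPS hash and enqueue. */
	kpreempt_disable();
	hash = pktq_rps_hash(&pktq_rps_hash_default, m);
	if (__predict_false(!pktq_enqueue(example_pktq, m, hash)))
		m_freem(m);	/* queue full: the caller still owns the mbuf */
	kpreempt_enable();
}
#endif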
/* $NetBSD: pad.c,v 1.85 2023/05/27 14:51:47 nat Exp $ */ /*- * Copyright (c) 2007 Jared D. McNeill <jmcneill@invisible.ca> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: pad.c,v 1.85 2023/05/27 14:51:47 nat Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/audioio.h> #include <sys/buf.h> #include <sys/condvar.h> #include <sys/conf.h> #include <sys/device.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/module.h> #include <sys/poll.h> #include <sys/proc.h> #include <sys/select.h> #include <sys/stat.h> #include <sys/vnode.h> #include <dev/audio/audio_if.h> #include <dev/audio/audiovar.h> #include <dev/pad/padvar.h> #include "ioconf.h" /* #define PAD_DEBUG */ #ifdef PAD_DEBUG #define DPRINTF(fmt...) printf(fmt) #else #define DPRINTF(fmt...) 
/**/ #endif #define PADFREQ 44100 #define PADCHAN 2 #define PADPREC 16 typedef struct pad_block { uint8_t *pb_ptr; int pb_len; } pad_block_t; enum { PAD_OUTPUT_CLASS, PAD_INPUT_CLASS, PAD_OUTPUT_MASTER_VOLUME, PAD_INPUT_DAC_VOLUME, PAD_ENUM_LAST, }; static int pad_match(device_t, cfdata_t, void *); static void pad_attach(device_t, device_t, void *); static int pad_detach(device_t, int); static void pad_childdet(device_t, device_t); static int pad_query_format(void *, audio_format_query_t *); static int pad_set_format(void *, int, const audio_params_t *, const audio_params_t *, audio_filter_reg_t *, audio_filter_reg_t *); static int pad_start_output(void *, void *, int, void (*)(void *), void *); static int pad_halt_output(void *); static int pad_getdev(void *, struct audio_device *); static int pad_set_port(void *, mixer_ctrl_t *); static int pad_get_port(void *, mixer_ctrl_t *); static int pad_query_devinfo(void *, mixer_devinfo_t *); static int pad_get_props(void *); static void pad_get_locks(void *, kmutex_t **, kmutex_t **); static void pad_done_output(void *); static void pad_swvol_codec(audio_filter_arg_t *); static void pad_close(struct pad_softc *); static int pad_read(struct pad_softc *, off_t *, struct uio *, kauth_cred_t, int); static int fops_pad_close(struct file *); static int fops_pad_read(struct file *, off_t *, struct uio *, kauth_cred_t, int); static int fops_pad_write(struct file *, off_t *, struct uio *, kauth_cred_t, int); static int fops_pad_ioctl(struct file *, u_long, void *); static int fops_pad_kqfilter(struct file *, struct knote *); static int fops_pad_poll(struct file *, int); static int fops_pad_stat(struct file *, struct stat *); static int fops_pad_mmap(struct file *, off_t *, size_t, int, int *, int *, struct uvm_object **, int *); static const struct audio_hw_if pad_hw_if = { .query_format = pad_query_format, .set_format = pad_set_format, .start_output = pad_start_output, .halt_output = pad_halt_output, .getdev = pad_getdev, .set_port = pad_set_port, .get_port = pad_get_port, .query_devinfo = pad_query_devinfo, .get_props = pad_get_props, .get_locks = pad_get_locks, }; #define PAD_NFORMATS 1 static const struct audio_format pad_formats[PAD_NFORMATS] = { { .mode = AUMODE_PLAY, .encoding = AUDIO_ENCODING_SLINEAR_LE, .validbits = PADPREC, .precision = PADPREC, .channels = PADCHAN, .channel_mask = AUFMT_STEREO, .frequency_type = 1, .frequency = { PADFREQ }, }, }; extern void padattach(int); static int pad_add_block(struct pad_softc *, uint8_t *, int); static int pad_get_block(struct pad_softc *, pad_block_t *, int, int); static dev_type_open(pad_open); const struct cdevsw pad_cdevsw = { .d_open = pad_open, .d_close = noclose, .d_read = noread, .d_write = nowrite, .d_ioctl = noioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE, }; const struct fileops pad_fileops = { .fo_name = "pad", .fo_read = fops_pad_read, .fo_write = fops_pad_write, .fo_ioctl = fops_pad_ioctl, .fo_fcntl = fnullop_fcntl, .fo_stat = fops_pad_stat, .fo_poll = fops_pad_poll, .fo_close = fops_pad_close, .fo_mmap = fops_pad_mmap, .fo_kqfilter = fops_pad_kqfilter, .fo_restart = fnullop_restart }; CFATTACH_DECL2_NEW(pad, sizeof(struct pad_softc), pad_match, pad_attach, pad_detach, NULL, NULL, pad_childdet); void padattach(int n) { int error; error = config_cfattach_attach(pad_cd.cd_name, &pad_ca); if (error) { aprint_error("%s: couldn't register cfattach: %d\n", pad_cd.cd_name, error); 
config_cfdriver_detach(&pad_cd); return; } } static int pad_match(device_t parent, cfdata_t data, void *opaque) { return 1; } static void pad_attach(device_t parent, device_t self, void *opaque) { struct pad_softc *sc = device_private(self); KASSERT(KERNEL_LOCKED_P()); aprint_normal_dev(self, "outputs: 44100Hz, 16-bit, stereo\n"); sc->sc_dev = self; cv_init(&sc->sc_condvar, device_xname(sc->sc_dev)); mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NONE); mutex_init(&sc->sc_intr_lock, MUTEX_DEFAULT, IPL_SOFTCLOCK); callout_init(&sc->sc_pcallout, CALLOUT_MPSAFE); callout_setfunc(&sc->sc_pcallout, pad_done_output, sc); sc->sc_swvol = 255; sc->sc_buflen = 0; sc->sc_rpos = sc->sc_wpos = 0; sc->sc_audiodev = audio_attach_mi(&pad_hw_if, sc, sc->sc_dev); if (!pmf_device_register(sc->sc_dev, NULL, NULL)) aprint_error_dev(sc->sc_dev, "couldn't establish power handler\n"); sc->sc_open = 1; } static int pad_detach(device_t self, int flags) { struct pad_softc *sc = device_private(self); int cmaj, mn; int error; KASSERT(KERNEL_LOCKED_P()); /* Prevent detach without going through close -- e.g., drvctl. */ if (sc->sc_open) return EBUSY; error = config_detach_children(self, flags); if (error) return error; cmaj = cdevsw_lookup_major(&pad_cdevsw); mn = device_unit(sc->sc_dev); vdevgone(cmaj, mn, mn, VCHR); pmf_device_deregister(sc->sc_dev); callout_destroy(&sc->sc_pcallout); mutex_destroy(&sc->sc_lock); mutex_destroy(&sc->sc_intr_lock); cv_destroy(&sc->sc_condvar); return 0; } static void pad_childdet(device_t self, device_t child) { struct pad_softc *sc = device_private(self); KASSERT(KERNEL_LOCKED_P()); if (child == sc->sc_audiodev) sc->sc_audiodev = NULL; } static int pad_add_block(struct pad_softc *sc, uint8_t *blk, int blksize) { int foff, flen, tlen; KASSERT(blksize >= 0); KASSERT(mutex_owned(&sc->sc_intr_lock)); if (blksize > PAD_BUFSIZE || sc->sc_buflen > PAD_BUFSIZE - (unsigned)blksize) return ENOBUFS; foff = sc->sc_wpos; if (sc->sc_wpos + blksize <= PAD_BUFSIZE) { flen = blksize; tlen = 0; } else { flen = PAD_BUFSIZE - sc->sc_wpos; tlen = blksize - flen; } sc->sc_wpos = foff + blksize; if (sc->sc_wpos >= PAD_BUFSIZE) sc->sc_wpos -= PAD_BUFSIZE; /* * release interrupt lock for bulk copy to audio buffer */ mutex_exit(&sc->sc_intr_lock); memcpy(sc->sc_audiobuf + foff, blk, flen); memcpy(sc->sc_audiobuf, blk + flen, tlen); mutex_enter(&sc->sc_intr_lock); sc->sc_buflen += blksize; cv_broadcast(&sc->sc_condvar); return 0; } static int pad_get_block(struct pad_softc *sc, pad_block_t *pb, int maxblksize, int dowait) { int l, blksize, error; KASSERT(maxblksize > 0); KASSERT(mutex_owned(&sc->sc_intr_lock)); if (sc->sc_buflen == 0 && !dowait) return EAGAIN; while (sc->sc_buflen == 0) { DPRINTF("%s: wait\n", __func__); error = cv_wait_sig(&sc->sc_condvar, &sc->sc_intr_lock); DPRINTF("%s: wake up %d\n", __func__, err); if (error) return error; } blksize = uimin(maxblksize, sc->sc_buflen); pb->pb_ptr = (sc->sc_audiobuf + sc->sc_rpos); if (sc->sc_rpos + blksize < PAD_BUFSIZE) { pb->pb_len = blksize; sc->sc_rpos += blksize; } else { l = PAD_BUFSIZE - sc->sc_rpos; pb->pb_len = l; sc->sc_rpos = 0; } sc->sc_buflen -= pb->pb_len; return 0; } static int pad_open(dev_t dev, int flags, int fmt, struct lwp *l) { struct file *fp = NULL; device_t self; struct pad_softc *sc = NULL; cfdata_t cf = NULL; int error, fd; error = fd_allocfile(&fp, &fd); if (error) goto out; cf = kmem_alloc(sizeof(*cf), KM_SLEEP); cf->cf_name = pad_cd.cd_name; cf->cf_atname = pad_cd.cd_name; cf->cf_unit = 0; cf->cf_fstate = FSTATE_STAR; self = 
config_attach_pseudo(cf); if (self == NULL) { error = ENXIO; goto out; } sc = device_private(self); KASSERT(sc->sc_dev == self); cf = NULL; error = fd_clone(fp, fd, flags, &pad_fileops, sc); KASSERT(error == EMOVEFD); fp = NULL; sc = NULL; out: if (sc) pad_close(sc); if (cf) kmem_free(cf, sizeof(*cf)); if (fp) fd_abort(curproc, fp, fd); return error; } static void pad_close(struct pad_softc *sc) { device_t self = sc->sc_dev; cfdata_t cf = device_cfdata(self); /* * XXX This is not quite enough to prevent racing with drvctl * detach. What can happen: * * cpu0 cpu1 * * pad_close * take kernel lock * sc->sc_open = 0 * drop kernel lock * wait for config_misc_lock * drvctl detach * take kernel lock * drop kernel lock * wait for config_misc_lock * retake kernel lock * drop config_misc_lock * take config_misc_lock * wait for kernel lock * pad_detach (sc_open=0 already) * free device * drop kernel lock * use device after free * * We need a way to grab a reference to the device so it won't * be freed until we're done -- it's OK if we config_detach * twice as long as it's idempotent, but not OK if the first * config_detach frees the struct device before the second one * has finished handling it. */ KERNEL_LOCK(1, NULL); KASSERT(sc->sc_open); sc->sc_open = 0; (void)config_detach(self, DETACH_FORCE); KERNEL_UNLOCK_ONE(NULL); kmem_free(cf, sizeof(*cf)); } static int fops_pad_close(struct file *fp) { struct pad_softc *sc = fp->f_pad; pad_close(sc); return 0; } static int fops_pad_poll(struct file *fp, int events) { return POLLERR; } static int fops_pad_kqfilter(struct file *fp, struct knote *kn) { struct pad_softc *sc = fp->f_pad; dev_t dev; dev = makedev(cdevsw_lookup_major(&pad_cdevsw), device_unit(sc->sc_dev)); return seltrue_kqfilter(dev, kn); } static int fops_pad_ioctl(struct file *fp, u_long cmd, void *data) { return ENODEV; } static int fops_pad_stat(struct file *fp, struct stat *st) { struct pad_softc *sc = fp->f_pad; memset(st, 0, sizeof(*st)); st->st_dev = makedev(cdevsw_lookup_major(&pad_cdevsw), device_unit(sc->sc_dev)); st->st_uid = kauth_cred_geteuid(fp->f_cred); st->st_gid = kauth_cred_getegid(fp->f_cred); st->st_mode = S_IFCHR; return 0; } static int fops_pad_mmap(struct file *fp, off_t *offp, size_t len, int prot, int *flagsp, int *advicep, struct uvm_object **uobjp, int *maxprotp) { return 1; } static int fops_pad_read(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, int ioflag) { struct pad_softc *sc = fp->f_pad; return pad_read(sc, offp, uio, cred, ioflag); } static int pad_read(struct pad_softc *sc, off_t *offp, struct uio *uio, kauth_cred_t cred, int ioflag) { pad_block_t pb; int err, first; err = 0; first = 1; DPRINTF("%s: resid=%zu\n", __func__, uio->uio_resid); while (uio->uio_resid > 0) { mutex_enter(&sc->sc_intr_lock); err = pad_get_block(sc, &pb, MIN(uio->uio_resid, INT_MAX), first); mutex_exit(&sc->sc_intr_lock); first = 0; if (err == EAGAIN) { err = 0; break; } if (err) break; DPRINTF("%s: move %d\n", __func__, pb.pb_len); err = uiomove(pb.pb_ptr, pb.pb_len, uio); if (err) break; } return err; } static int fops_pad_write(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, int ioflag) { return EOPNOTSUPP; } static int pad_query_format(void *opaque, audio_format_query_t *afp) { return audio_query_format(pad_formats, PAD_NFORMATS, afp); } static int pad_set_format(void *opaque, int setmode, const audio_params_t *play, const audio_params_t *rec, audio_filter_reg_t *pfil, audio_filter_reg_t *rfil) { struct pad_softc *sc = opaque; 
KASSERT(mutex_owned(&sc->sc_lock)); /* XXX playback only */ pfil->codec = pad_swvol_codec; pfil->context = sc; return 0; } static int pad_start_output(void *opaque, void *block, int blksize, void (*intr)(void *), void *intrarg) { struct pad_softc *sc = opaque; int err; u_int framesize; int ticks; KASSERT(mutex_owned(&sc->sc_intr_lock)); sc->sc_intr = intr; sc->sc_intrarg = intrarg; DPRINTF("%s: blksize=%d\n", __func__, blksize); err = pad_add_block(sc, block, blksize); if (err) { DPRINTF("%s: failed: %d\n", __func__, err); /* "Silently" drop overflows, but keep pace */ err = 0; } framesize = PADCHAN * (PADPREC / NBBY) * PADFREQ; sc->sc_resid += blksize; ticks = mstohz(sc->sc_resid * 1000 / framesize); sc->sc_resid -= hztoms(ticks) * framesize / 1000; DPRINTF("%s: callout ms=%d\n", __func__, ms); callout_schedule(&sc->sc_pcallout, ticks); return err; } static int pad_halt_output(void *opaque) { struct pad_softc *sc = opaque; DPRINTF("%s\n", __func__); KASSERT(mutex_owned(&sc->sc_intr_lock)); callout_halt(&sc->sc_pcallout, &sc->sc_intr_lock); sc->sc_intr = NULL; sc->sc_intrarg = NULL; sc->sc_buflen = 0; sc->sc_resid = 0; sc->sc_rpos = sc->sc_wpos = 0; return 0; } static void pad_done_output(void *arg) { struct pad_softc *sc = arg; DPRINTF("%s\n", __func__); mutex_enter(&sc->sc_intr_lock); (*sc->sc_intr)(sc->sc_intrarg); mutex_exit(&sc->sc_intr_lock); } static int pad_getdev(void *opaque, struct audio_device *ret) { strlcpy(ret->name, "Virtual Audio", sizeof(ret->name)); strlcpy(ret->version, osrelease, sizeof(ret->version)); strlcpy(ret->config, "pad", sizeof(ret->config)); return 0; } static int pad_set_port(void *opaque, mixer_ctrl_t *mc) { struct pad_softc *sc = opaque; KASSERT(mutex_owned(&sc->sc_lock)); switch (mc->dev) { case PAD_OUTPUT_MASTER_VOLUME: case PAD_INPUT_DAC_VOLUME: if (mc->un.value.num_channels != 1) return EINVAL; sc->sc_swvol = mc->un.value.level[AUDIO_MIXER_LEVEL_MONO]; return 0; } return ENXIO; } static int pad_get_port(void *opaque, mixer_ctrl_t *mc) { struct pad_softc *sc = opaque; KASSERT(mutex_owned(&sc->sc_lock)); switch (mc->dev) { case PAD_OUTPUT_MASTER_VOLUME: case PAD_INPUT_DAC_VOLUME: if (mc->un.value.num_channels != 1) return EINVAL; mc->un.value.level[AUDIO_MIXER_LEVEL_MONO] = sc->sc_swvol; return 0; } return ENXIO; } static int pad_query_devinfo(void *opaque, mixer_devinfo_t *di) { struct pad_softc *sc __diagused = opaque; KASSERT(mutex_owned(&sc->sc_lock)); switch (di->index) { case PAD_OUTPUT_CLASS: di->mixer_class = PAD_OUTPUT_CLASS; strcpy(di->label.name, AudioCoutputs); di->type = AUDIO_MIXER_CLASS; di->next = di->prev = AUDIO_MIXER_LAST; return 0; case PAD_INPUT_CLASS: di->mixer_class = PAD_INPUT_CLASS; strcpy(di->label.name, AudioCinputs); di->type = AUDIO_MIXER_CLASS; di->next = di->prev = AUDIO_MIXER_LAST; return 0; case PAD_OUTPUT_MASTER_VOLUME: di->mixer_class = PAD_OUTPUT_CLASS; strcpy(di->label.name, AudioNmaster); di->type = AUDIO_MIXER_VALUE; di->next = di->prev = AUDIO_MIXER_LAST; di->un.v.num_channels = 1; strcpy(di->un.v.units.name, AudioNvolume); return 0; case PAD_INPUT_DAC_VOLUME: di->mixer_class = PAD_INPUT_CLASS; strcpy(di->label.name, AudioNdac); di->type = AUDIO_MIXER_VALUE; di->next = di->prev = AUDIO_MIXER_LAST; di->un.v.num_channels = 1; strcpy(di->un.v.units.name, AudioNvolume); return 0; } return ENXIO; } static int pad_get_props(void *opaque) { return AUDIO_PROP_PLAYBACK; } static void pad_get_locks(void *opaque, kmutex_t **intr, kmutex_t **thread) { struct pad_softc *sc = opaque; *intr = &sc->sc_intr_lock; *thread = 
&sc->sc_lock; } static void pad_swvol_codec(audio_filter_arg_t *arg) { struct pad_softc *sc = arg->context; const uint8_t *src; uint8_t *dst; u_int sample_count; u_int i; u_int bits; src = arg->src; dst = arg->dst; sample_count = arg->count * arg->srcfmt->channels; bits = arg->srcfmt->precision; for (i = 0; i < sample_count; i++) { int64_t v; switch (howmany(bits, NBBY)) { case 2: /* AUDIO_INTERNAL_BITS == 16 */ v = *(const int16_t *)src; src += sizeof(int16_t); break; case 4: /* AUDIO_INTERNAL_BITS == 32 */ v = *(const int32_t *)src; src += sizeof(int32_t); break; default: v = 0; break; } v = v * sc->sc_swvol / 255; if (PADPREC > bits) v = v << (PADPREC - bits); else if (PADPREC < bits) v = v >> (bits - PADPREC); /* AUDIO_ENCODING_SLINEAR_LE */ #if PADPREC > 0 *dst++ = v; #endif #if PADPREC > 8 v >>= 8; *dst++ = v; #endif #if PADPREC > 16 v >>= 8; *dst++ = v; #endif #if PADPREC > 24 v >>= 8; *dst++ = v; #endif } } MODULE(MODULE_CLASS_DRIVER, pad, "audio"); #ifdef _MODULE #include "ioconf.c" devmajor_t cmajor = NODEVMAJOR, bmajor = NODEVMAJOR; /* * We need our own version of cfattach since config(1)'s ioconf does not * generate what we need */ static struct cfattach *pad_cfattachinit[] = { &pad_ca, NULL }; static struct cfattachinit pad_cfattach[] = { { "pad", pad_cfattachinit }, { NULL, NULL } }; #endif static int pad_modcmd(modcmd_t cmd, void *arg) { int error = 0; switch (cmd) { case MODULE_CMD_INIT: #ifdef _MODULE error = devsw_attach(pad_cd.cd_name, NULL, &bmajor, &pad_cdevsw, &cmajor); if (error) break; pad_cfattach[1] = cfattach_ioconf_pad[0]; error = config_init_component(cfdriver_ioconf_pad, pad_cfattach, cfdata_ioconf_pad); if (error) { devsw_detach(NULL, &pad_cdevsw); break; } #endif break; case MODULE_CMD_FINI: #ifdef _MODULE error = config_fini_component(cfdriver_ioconf_pad, pad_cfattach, cfdata_ioconf_pad); if (error == 0) devsw_detach(NULL, &pad_cdevsw); #endif break; default: error = ENOTTY; } return error; }
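/*
 * Illustrative sketch (not part of the original file): a userland reader for
 * the pad loopback implemented above.  Opening the pad character device
 * clones a new pseudo audio device (see pad_open()); whatever is played on
 * that audio device can then be read back here as 44100 Hz, 16-bit
 * little-endian stereo frames, matching pad_formats[].  The device node name
 * below is an assumption; adjust it to the local MAKEDEV layout.
 */
#include <sys/types.h>
#include <err.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

#define PAD_DEV "/dev/pad0"	/* assumed node name for the pad clone device */

int
main(void)
{
	char buf[65536];
	ssize_t n;
	int fd;

	fd = open(PAD_DEV, O_RDONLY);
	if (fd == -1)
		err(EXIT_FAILURE, "open %s", PAD_DEV);

	/* Capture the mixed-down playback stream to stdout. */
	while ((n = read(fd, buf, sizeof(buf))) > 0) {
		if (write(STDOUT_FILENO, buf, (size_t)n) != n)
			err(EXIT_FAILURE, "write");
	}
	if (n == -1)
		err(EXIT_FAILURE, "read");
	close(fd);
	return EXIT_SUCCESS;
}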
/* $NetBSD: kern_syscall.c,v 1.21 2020/08/31 19:51:30 christos Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software developed for The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_syscall.c,v 1.21 2020/08/31 19:51:30 christos Exp $"); #ifdef _KERNEL_OPT #include "opt_modular.h" #include "opt_syscall_debug.h" #include "opt_ktrace.h" #include "opt_ptrace.h" #include "opt_dtrace.h" #endif /* XXX To get syscall prototypes. */ #define SYSVSHM #define SYSVSEM #define SYSVMSG #include <sys/param.h> #include <sys/module.h> #include <sys/sched.h> #include <sys/syscall.h> #include <sys/syscallargs.h> #include <sys/syscallvar.h> #include <sys/systm.h> #include <sys/xcall.h> #include <sys/ktrace.h> #include <sys/ptrace.h> int sys_nomodule(struct lwp *l, const void *v, register_t *retval) { #ifdef MODULAR const struct sysent *sy; const struct emul *em; const struct sc_autoload *auto_list; u_int code; /* * Restart the syscall if we interrupted a module unload that * failed. 
Acquiring kernconfig_lock delays us until any unload * has been completed or rolled back. */ kernconfig_lock(); sy = l->l_sysent; if (sy->sy_call != sys_nomodule) { kernconfig_unlock(); return ERESTART; } /* * Try to autoload a module to satisfy the request. If it * works, retry the request. */ em = l->l_proc->p_emul; code = sy - em->e_sysent; if ((auto_list = em->e_sc_autoload) != NULL) for (; auto_list->al_code > 0; auto_list++) { if (auto_list->al_code != code) { continue; } if (module_autoload(auto_list->al_module, MODULE_CLASS_ANY) != 0 || sy->sy_call == sys_nomodule) { break; } kernconfig_unlock(); return ERESTART; } kernconfig_unlock(); #endif /* MODULAR */ return sys_nosys(l, v, retval); } int syscall_establish(const struct emul *em, const struct syscall_package *sp) { struct sysent *sy; int i; KASSERT(kernconfig_is_held()); if (em == NULL) { em = &emul_netbsd; } sy = em->e_sysent; /* * Ensure that all preconditions are valid, since this is * an all or nothing deal. Once a system call is entered, * it can become busy and we could be unable to remove it * on error. */ for (i = 0; sp[i].sp_call != NULL; i++) { if (sp[i].sp_code >= SYS_NSYSENT) return EINVAL; if (sy[sp[i].sp_code].sy_call != sys_nomodule && sy[sp[i].sp_code].sy_call != sys_nosys) { #ifdef DIAGNOSTIC printf("syscall %d is busy\n", sp[i].sp_code); #endif return EBUSY; } } /* Everything looks good, patch them in. */ for (i = 0; sp[i].sp_call != NULL; i++) { sy[sp[i].sp_code].sy_call = sp[i].sp_call; } return 0; } int syscall_disestablish(const struct emul *em, const struct syscall_package *sp) { struct sysent *sy; const uint32_t *sb; lwp_t *l; int i; KASSERT(kernconfig_is_held()); if (em == NULL) { em = &emul_netbsd; } sy = em->e_sysent; sb = em->e_nomodbits; /* * First, patch the system calls to sys_nomodule or sys_nosys * to gate further activity. */ for (i = 0; sp[i].sp_call != NULL; i++) { KASSERT(sy[sp[i].sp_code].sy_call == sp[i].sp_call); sy[sp[i].sp_code].sy_call = sb[sp[i].sp_code / 32] & (1 << (sp[i].sp_code % 32)) ? sys_nomodule : sys_nosys; } /* * Run a cross call to cycle through all CPUs. This does two * things: lock activity provides a barrier and makes our update * of sy_call visible to all CPUs, and upon return we can be sure * that we see pertinent values of l_sysent posted by remote CPUs. */ xc_barrier(0); /* * Now it's safe to check l_sysent. Run through all LWPs and see * if anyone is still using the system call. */ for (i = 0; sp[i].sp_call != NULL; i++) { mutex_enter(&proc_lock); LIST_FOREACH(l, &alllwp, l_list) { if (l->l_sysent == &sy[sp[i].sp_code]) { break; } } mutex_exit(&proc_lock); if (l == NULL) { continue; } /* * We lose: one or more calls are still in use. Put back * the old entrypoints and act like nothing happened. * When we drop kernconfig_lock, any system calls held in * sys_nomodule() will be restarted. */ for (i = 0; sp[i].sp_call != NULL; i++) { sy[sp[i].sp_code].sy_call = sp[i].sp_call; } return EBUSY; } return 0; } /* * Return true if system call tracing is enabled for the specified process. */ bool trace_is_enabled(struct proc *p) { #ifdef SYSCALL_DEBUG return (true); #endif #ifdef KTRACE if (ISSET(p->p_traceflag, (KTRFAC_SYSCALL | KTRFAC_SYSRET))) return (true); #endif #ifdef PTRACE if (ISSET(p->p_slflag, PSL_SYSCALL)) return (true); #endif return (false); } /* * Start trace of particular system call. If process is being traced, * this routine is called by MD syscall dispatch code just before * a system call is actually executed. 
*/ int trace_enter(register_t code, const struct sysent *sy, const void *args) { int error = 0; #if defined(PTRACE) || defined(KDTRACE_HOOKS) struct proc *p = curlwp->l_proc; #endif #ifdef KDTRACE_HOOKS if (sy->sy_entry) { struct emul *e = p->p_emul; if (e->e_dtrace_syscall) (*e->e_dtrace_syscall)(sy->sy_entry, code, sy, args, NULL, 0); } #endif #ifdef SYSCALL_DEBUG scdebug_call(code, args); #endif /* SYSCALL_DEBUG */ ktrsyscall(code, args, sy->sy_narg); #ifdef PTRACE if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED)) == (PSL_SYSCALL|PSL_TRACED)) { proc_stoptrace(TRAP_SCE, code, args, NULL, 0); if (curlwp->l_proc->p_slflag & PSL_SYSCALLEMU) { /* tracer will emulate syscall for us */ error = EJUSTRETURN; } } #endif return error; } /* * End trace of particular system call. If process is being traced, * this routine is called by MD syscall dispatch code just after * a system call finishes. * MD caller guarantees the passed 'code' is within the supported * system call number range for the emulation the process runs under. */ void trace_exit(register_t code, const struct sysent *sy, const void *args, register_t rval[], int error) { #if defined(PTRACE) || defined(KDTRACE_HOOKS) struct proc *p = curlwp->l_proc; #endif #ifdef KDTRACE_HOOKS if (sy->sy_return) { struct emul *e = p->p_emul; if (e->e_dtrace_syscall) (*p->p_emul->e_dtrace_syscall)(sy->sy_return, code, sy, args, rval, error); } #endif #ifdef SYSCALL_DEBUG scdebug_ret(code, error, rval); #endif /* SYSCALL_DEBUG */ ktrsysret(code, error, rval); #ifdef PTRACE if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED|PSL_SYSCALLEMU)) == (PSL_SYSCALL|PSL_TRACED)) { proc_stoptrace(TRAP_SCX, code, args, rval, error); } CLR(p->p_slflag, PSL_SYSCALLEMU); #endif }
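/*
 * Editor's note: a minimal, hedged sketch of how a kernel module might use
 * the syscall_establish()/syscall_disestablish() interface defined above.
 * The syscall slot number, module name, and handler are hypothetical; only
 * the sp_code/sp_call members of struct syscall_package (the ones the loops
 * above rely on) are used, and the array is terminated by an entry whose
 * sp_call is NULL, matching syscall_establish()'s termination test.  This
 * block is illustrative only and is not part of kern_syscall.c.
 */
#if 0	/* illustrative sketch only */
#include <sys/param.h>
#include <sys/module.h>
#include <sys/syscallvar.h>

#define EXAMPLE_SYSCALL_CODE	210	/* assumed-free slot, must be < SYS_NSYSENT */

static int
example_syscall(struct lwp *l, const void *uap, register_t *retval)
{

	/* Trivial handler: no arguments, always succeeds. */
	*retval = 0;
	return 0;
}

static const struct syscall_package example_syscalls[] = {
	{ .sp_code = EXAMPLE_SYSCALL_CODE, .sp_call = example_syscall },
	{ .sp_call = NULL },	/* terminator checked by syscall_establish() */
};

MODULE(MODULE_CLASS_MISC, example_syscall, NULL);

static int
example_syscall_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		/* kernconfig_lock is held by the module framework here. */
		return syscall_establish(NULL, example_syscalls);
	case MODULE_CMD_FINI:
		/* Fails with EBUSY if some LWP is still inside the syscall. */
		return syscall_disestablish(NULL, example_syscalls);
	default:
		return ENOTTY;
	}
}
#endif	/* illustrative sketch only */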
/* $NetBSD: tcp_var.h,v 1.198 2022/10/28 05:18:39 ozaki-r Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). */ /*- * Copyright (c) 1997, 1998, 1999, 2001, 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #if defined(_KERNEL_OPT) #include "opt_inet.h" #include "opt_mbuftrace.h" #endif /* * TCP kernel structures and variables. */ #include <sys/callout.h> #ifdef TCP_SIGNATURE /* * Defines which are needed by the xform_tcp module and tcp_[in|out]put * for SADB verification and lookup. */ #define TCP_SIGLEN 16 /* length of computed digest in bytes */ #define TCP_KEYLEN_MIN 1 /* minimum length of TCP-MD5 key */ #define TCP_KEYLEN_MAX 80 /* maximum length of TCP-MD5 key */ /* * Only a single SA per host may be specified at this time. An SPI is * needed in order for the KEY_LOOKUP_SA() lookup to work. */ #define TCP_SIG_SPI 0x1000 #endif /* TCP_SIGNATURE */ /* * Tcp+ip header, after ip options removed. */ struct tcpiphdr { struct ipovly ti_i; /* overlaid ip structure */ struct tcphdr ti_t; /* tcp header */ }; #ifdef CTASSERT CTASSERT(sizeof(struct tcpiphdr) == 40); #endif #define ti_x1 ti_i.ih_x1 #define ti_pr ti_i.ih_pr #define ti_len ti_i.ih_len #define ti_src ti_i.ih_src #define ti_dst ti_i.ih_dst #define ti_sport ti_t.th_sport #define ti_dport ti_t.th_dport #define ti_seq ti_t.th_seq #define ti_ack ti_t.th_ack #define ti_x2 ti_t.th_x2 #define ti_off ti_t.th_off #define ti_flags ti_t.th_flags #define ti_win ti_t.th_win #define ti_sum ti_t.th_sum #define ti_urp ti_t.th_urp /* * SACK option block. */ struct sackblk { tcp_seq left; /* Left edge of sack block. */ tcp_seq right; /* Right edge of sack block. */ }; TAILQ_HEAD(sackhead, sackhole); struct sackhole { tcp_seq start; tcp_seq end; tcp_seq rxmit; TAILQ_ENTRY(sackhole) sackhole_q; }; struct syn_cache; /* * Tcp control block, one per tcp; fields: */ struct tcpcb { int t_family; /* address family on the wire */ struct ipqehead segq; /* sequencing queue */ int t_segqlen; /* length of the above */ callout_t t_timer[TCPT_NTIMERS];/* tcp timers */ short t_state; /* state of this connection */ short t_rxtshift; /* log(2) of rexmt exp. backoff */ uint32_t t_rxtcur; /* current retransmit value */ short t_dupacks; /* consecutive dup acks recd */ /* * t_partialacks: * <0 not in fast recovery. * ==0 in fast recovery. has not received partial acks * >0 in fast recovery. 
has received partial acks */ short t_partialacks; /* partial acks during fast rexmit */ u_short t_peermss; /* peer's maximum segment size */ u_short t_ourmss; /* our maximum segment size */ u_short t_segsz; /* current segment size in use */ char t_force; /* 1 if forcing out a byte */ u_int t_flags; #define TF_ACKNOW 0x0001 /* ack peer immediately */ #define TF_DELACK 0x0002 /* ack, but try to delay it */ #define TF_NODELAY 0x0004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x0008 /* don't use tcp options */ #define TF_REQ_SCALE 0x0020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x0040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x0080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x0100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x0200 /* other side said I could SACK */ #define TF_SYN_REXMT 0x0400 /* rexmit timer fired on SYN */ #define TF_WILL_SACK 0x0800 /* try to use SACK */ #define TF_REASSEMBLING 0x1000 /* we're busy reassembling */ #define TF_DEAD 0x2000 /* dead and to-be-released */ #define TF_PMTUD_PEND 0x4000 /* Path MTU Discovery pending */ #define TF_ECN_PERMIT 0x10000 /* other side said it is ECN-ready */ #define TF_ECN_SND_CWR 0x20000 /* ECN CWR in queue */ #define TF_ECN_SND_ECE 0x40000 /* ECN ECE in queue */ #define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ struct mbuf *t_template; /* skeletal packet for transmit */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ callout_t t_delack_ch; /* delayed ACK callout */ /* * The following fields are used as in the protocol specification. * See RFC793, Dec. 1981, page 21. */ /* send sequence variables */ tcp_seq snd_una; /* send unacknowledged */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ tcp_seq snd_wl1; /* window update seg seq number */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq iss; /* initial send sequence number */ u_long snd_wnd; /* send window */ /* * snd_recover * it's basically the same as the "recover" variable in RFC 2582 (NewReno). * when entering fast retransmit, it's set to snd_max. * newreno uses this to detect partial ack. * snd_high * it's basically the same as the "send_high" variable in RFC 2582 (NewReno). * on each RTO, it's set to snd_max. * newreno uses this to avoid false fast retransmits. */ tcp_seq snd_recover; tcp_seq snd_high; /* receive sequence variables */ u_long rcv_wnd; /* receive window */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_up; /* receive urgent pointer */ tcp_seq irs; /* initial receive sequence number */ /* * Additional variables for this implementation. */ /* receive variables */ tcp_seq rcv_adv; /* advertised window */ /* * retransmit variables * * snd_max * the highest sequence number we've ever sent. * used to recognize retransmits. */ tcp_seq snd_max; /* congestion control (for slow start, source quench, retransmit after loss) */ u_long snd_cwnd; /* congestion-controlled window */ u_long snd_ssthresh; /* snd_cwnd size threshold for * slow start exponential to * linear switch */ /* auto-sizing variables */ u_int rfbuf_cnt; /* recv buffer autoscaling byte count */ uint32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ /* * transmit timing stuff. See below for scale of srtt and rttvar. * "Variance" is actually smoothed difference.
*/ uint32_t t_rcvtime; /* time last segment received */ uint32_t t_rtttime; /* time we started measuring rtt */ tcp_seq t_rtseq; /* sequence number being timed */ int32_t t_srtt; /* smoothed round-trip time */ int32_t t_rttvar; /* variance in round-trip time */ uint32_t t_rttmin; /* minimum rtt allowed */ u_long max_sndwnd; /* largest window peer has offered */ /* out-of-band data */ char t_oobflags; /* have some */ char t_iobc; /* input character */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 short t_softerror; /* possible error not yet reported */ /* RFC 1323 variables */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char request_r_scale; /* pending window scaling */ u_char requested_s_scale; u_int32_t ts_recent; /* timestamp echo data */ u_int32_t ts_recent_age; /* when last updated */ u_int32_t ts_timebase; /* our timebase */ tcp_seq last_ack_sent; /* RFC 3465 variables */ u_long t_bytes_acked; /* ABC "bytes_acked" parameter */ /* SACK stuff */ #define TCP_SACK_MAX 3 #define TCPSACK_NONE 0 #define TCPSACK_HAVED 1 u_char rcv_sack_flags; /* SACK flags. */ struct sackblk rcv_dsack_block; /* RX D-SACK block. */ struct ipqehead timeq; /* time sequenced queue. */ struct sackhead snd_holes; /* TX SACK holes. */ int snd_numholes; /* Number of TX SACK holes. */ tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/ tcp_seq sack_newdata; /* New data xmitted in this recovery episode starts at this seq number*/ tcp_seq snd_fack; /* FACK TCP. Forward-most data held by peer. */ /* CUBIC variables */ ulong snd_cubic_wmax; /* W_max */ ulong snd_cubic_wmax_last; /* Used for fast convergence */ ulong snd_cubic_ctime; /* Last congestion time */ /* pointer for syn cache entries*/ LIST_HEAD(, syn_cache) t_sc; /* list of entries by this tcb */ /* prediction of next mbuf when using large window sizes */ struct mbuf *t_lastm; /* last mbuf that data was sent from */ int t_inoff; /* data offset in previous mbuf */ int t_lastoff; /* last data address in mbuf chain */ int t_lastlen; /* last length read from mbuf chain */ /* Path-MTU discovery blackhole detection */ int t_mtudisc; /* perform mtudisc for this tcb */ /* Path-MTU Discovery Information */ u_int t_pmtud_mss_acked; /* MSS acked, lower bound for MTU */ u_int t_pmtud_mtu_sent; /* MTU used, upper bound for MTU */ tcp_seq t_pmtud_th_seq; /* TCP SEQ from ICMP payload */ u_int t_pmtud_nextmtu; /* Advertised Next-Hop MTU from ICMP */ u_short t_pmtud_ip_len; /* IP length from ICMP payload */ u_short t_pmtud_ip_hl; /* IP header length from ICMP payload */ uint8_t t_ecn_retries; /* # of ECN setup retries */ const struct tcp_congctl *t_congctl; /* per TCB congctl algorithm */ /* Keepalive per socket */ u_int t_keepinit; u_int t_keepidle; u_int t_keepintvl; u_int t_keepcnt; u_int t_maxidle; /* t_keepcnt * t_keepintvl */ u_int t_msl; /* MSL to use for this connexion */ /* maintain a few stats per connection: */ uint32_t t_rcvoopack; /* out-of-order packets received */ uint32_t t_sndrexmitpack; /* retransmit packets sent */ uint32_t t_sndzerowin; /* zero-window updates sent */ }; /* * Macros to aid ECN TCP. */ #define TCP_ECN_ALLOWED(tp) (tp->t_flags & TF_ECN_PERMIT) /* * Macros to aid SACK/FACK TCP. */ #define TCP_SACK_ENABLED(tp) (tp->t_flags & TF_WILL_SACK) #define TCP_FACK_FASTRECOV(tp) \ (TCP_SACK_ENABLED(tp) && \ (SEQ_GT(tp->snd_fack, tp->snd_una + tcprexmtthresh * tp->t_segsz))) #ifdef _KERNEL /* * TCP reassembly queue locks. 
*/ static __inline int tcp_reass_lock_try (struct tcpcb *) __unused; static __inline void tcp_reass_unlock (struct tcpcb *) __unused; static __inline int tcp_reass_lock_try(struct tcpcb *tp) { int s; /* * Use splvm() -- we're blocking things that would cause * mbuf allocation. */ s = splvm(); if (tp->t_flags & TF_REASSEMBLING) { splx(s); return (0); } tp->t_flags |= TF_REASSEMBLING; splx(s); return (1); } static __inline void tcp_reass_unlock(struct tcpcb *tp) { int s; s = splvm(); KASSERT((tp->t_flags & TF_REASSEMBLING) != 0); tp->t_flags &= ~TF_REASSEMBLING; splx(s); } #ifdef DIAGNOSTIC #define TCP_REASS_LOCK(tp) \ do { \ if (tcp_reass_lock_try(tp) == 0) { \ printf("%s:%d: tcpcb %p reass already locked\n", \ __FILE__, __LINE__, tp); \ panic("tcp_reass_lock"); \ } \ } while (/*CONSTCOND*/ 0) #define TCP_REASS_LOCK_CHECK(tp) \ do { \ if (((tp)->t_flags & TF_REASSEMBLING) == 0) { \ printf("%s:%d: tcpcb %p reass lock not held\n", \ __FILE__, __LINE__, tp); \ panic("tcp reass lock check"); \ } \ } while (/*CONSTCOND*/ 0) #else #define TCP_REASS_LOCK(tp) (void) tcp_reass_lock_try((tp)) #define TCP_REASS_LOCK_CHECK(tp) /* nothing */ #endif #define TCP_REASS_UNLOCK(tp) tcp_reass_unlock((tp)) #endif /* _KERNEL */ /* * Queue for delayed ACK processing. */ #ifdef _KERNEL extern int tcp_delack_ticks; void tcp_delack(void *); #define TCP_RESTART_DELACK(tp) \ callout_reset(&(tp)->t_delack_ch, tcp_delack_ticks, \ tcp_delack, tp) #define TCP_SET_DELACK(tp) \ do { \ if (((tp)->t_flags & TF_DELACK) == 0) { \ (tp)->t_flags |= TF_DELACK; \ TCP_RESTART_DELACK(tp); \ } \ } while (/*CONSTCOND*/0) #define TCP_CLEAR_DELACK(tp) \ do { \ if ((tp)->t_flags & TF_DELACK) { \ (tp)->t_flags &= ~TF_DELACK; \ callout_stop(&(tp)->t_delack_ch); \ } \ } while (/*CONSTCOND*/0) #endif /* _KERNEL */ /* * Compute the current timestamp for a connection. */ #define TCP_TIMESTAMP(tp) (tcp_now - (tp)->ts_timebase) /* * Handy way of passing around TCP option info. */ struct tcp_opt_info { int ts_present; u_int32_t ts_val; u_int32_t ts_ecr; u_int16_t maxseg; }; #define TOF_SIGNATURE 0x0040 /* signature option present */ #define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */ #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* * See RFC2988 for a discussion of RTO calculation; comments assume * familiarity with that document. * * The smoothed round-trip time and estimated variance are stored as * fixed point numbers. Historically, srtt was scaled by * TCP_RTT_SHIFT bits, and rttvar by TCP_RTTVAR_SHIFT bits. Because * the values coincide with the alpha and beta parameters suggested * for RTO calculation (1/8 for srtt, 1/4 for rttvar), the combination * of computing 1/8 of the new value and transforming it to the * fixed-point representation required zero instructions. However, * the storage representations no longer coincide with the alpha/beta * shifts; instead, more fractional bits are present. * * The storage representation of srtt is 1/32 slow ticks, or 1/64 s. * (The assumption that a slow tick is 500 ms should not be present in * the code.) * * The storage representation of rttvar is 1/16 slow ticks, or 1/32 s. * There may be some confusion about this in the code. * * For historical reasons, these scales are also used in smoothing the * average (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
* This results in alpha of 0.125 and beta of 0.25, following RFC2988 * section 2.3 * * XXX Change SHIFT values to LGWEIGHT and REP_SHIFT, and adjust * the code to use the correct ones. */ #define TCP_RTT_SHIFT 3 /* shift for srtt; 3 bits frac. */ #define TCP_RTTVAR_SHIFT 2 /* multiplier for rttvar; 2 bits */ /* * Compute TCP retransmission timer, following RFC2988. * This macro returns a value in slow timeout ticks. * * Section 2.2 requires that the RTO value be * srtt + max(G, 4*RTTVAR) * where G is the clock granularity. * * This comment has not necessarily been updated for the new storage * representation: * * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This macro assumes that the value of 1<<TCP_RTTVAR_SHIFT * is the same as the multiplier for rttvar. * * This macro appears to be wrong; it should be checking rttvar*4 in * ticks and making sure we use 1 instead if rttvar*4 rounds to 0. It * appears to be treating srtt as being in the old storage * representation, resulting in a factor of 4 extra. */ #define TCP_REXMTVAL(tp) \ ((((tp)->t_srtt >> TCP_RTT_SHIFT) + (tp)->t_rttvar) >> 2) /* * Compute the initial window for slow start. */ #define TCP_INITIAL_WINDOW(iw, segsz) \ uimin((iw) * (segsz), uimax(2 * (segsz), tcp_init_win_max[(iw)])) /* * TCP statistics. * Each counter is an unsigned 64-bit value. * * Many of these should be kept per connection, but that's inconvenient * at the moment. */ #define TCP_STAT_CONNATTEMPT 0 /* connections initiated */ #define TCP_STAT_ACCEPTS 1 /* connections accepted */ #define TCP_STAT_CONNECTS 2 /* connections established */ #define TCP_STAT_DROPS 3 /* connections dropped */ #define TCP_STAT_CONNDROPS 4 /* embryonic connections dropped */ #define TCP_STAT_CLOSED 5 /* conn. closed (includes drops) */ #define TCP_STAT_SEGSTIMED 6 /* segs where we tried to get rtt */ #define TCP_STAT_RTTUPDATED 7 /* times we succeeded */ #define TCP_STAT_DELACK 8 /* delayed ACKs sent */ #define TCP_STAT_TIMEOUTDROP 9 /* conn. 
dropped in rxmt timeout */ #define TCP_STAT_REXMTTIMEO 10 /* retransmit timeouts */ #define TCP_STAT_PERSISTTIMEO 11 /* persist timeouts */ #define TCP_STAT_KEEPTIMEO 12 /* keepalive timeouts */ #define TCP_STAT_KEEPPROBE 13 /* keepalive probes sent */ #define TCP_STAT_KEEPDROPS 14 /* connections dropped in keepalive */ #define TCP_STAT_PERSISTDROPS 15 /* connections dropped in persist */ #define TCP_STAT_CONNSDRAINED 16 /* connections drained due to memory shortage */ #define TCP_STAT_PMTUBLACKHOLE 17 /* PMTUD blackhole detected */ #define TCP_STAT_SNDTOTAL 18 /* total packets sent */ #define TCP_STAT_SNDPACK 19 /* data packets sent */ #define TCP_STAT_SNDBYTE 20 /* data bytes sent */ #define TCP_STAT_SNDREXMITPACK 21 /* data packets retransmitted */ #define TCP_STAT_SNDREXMITBYTE 22 /* data bytes retransmitted */ #define TCP_STAT_SNDACKS 23 /* ACK-only packets sent */ #define TCP_STAT_SNDPROBE 24 /* window probes sent */ #define TCP_STAT_SNDURG 25 /* packets sent with URG only */ #define TCP_STAT_SNDWINUP 26 /* window update-only packets sent */ #define TCP_STAT_SNDCTRL 27 /* control (SYN|FIN|RST) packets sent */ #define TCP_STAT_RCVTOTAL 28 /* total packets received */ #define TCP_STAT_RCVPACK 29 /* packets received in sequence */ #define TCP_STAT_RCVBYTE 30 /* bytes received in sequence */ #define TCP_STAT_RCVBADSUM 31 /* packets received with cksum errs */ #define TCP_STAT_RCVBADOFF 32 /* packets received with bad offset */ #define TCP_STAT_RCVMEMDROP 33 /* packets dropped for lack of memory */ #define TCP_STAT_RCVSHORT 34 /* packets received too short */ #define TCP_STAT_RCVDUPPACK 35 /* duplicate-only packets received */ #define TCP_STAT_RCVDUPBYTE 36 /* duplicate-only bytes received */ #define TCP_STAT_RCVPARTDUPPACK 37 /* packets with some duplicate data */ #define TCP_STAT_RCVPARTDUPBYTE 38 /* dup. bytes in part-dup.
packets */ #define TCP_STAT_RCVOOPACK 39 /* out-of-order packets received */ #define TCP_STAT_RCVOOBYTE 40 /* out-of-order bytes received */ #define TCP_STAT_RCVPACKAFTERWIN 41 /* packets with data after window */ #define TCP_STAT_RCVBYTEAFTERWIN 42 /* bytes received after window */ #define TCP_STAT_RCVAFTERCLOSE 43 /* packets received after "close" */ #define TCP_STAT_RCVWINPROBE 44 /* rcvd window probe packets */ #define TCP_STAT_RCVDUPACK 45 /* rcvd duplicate ACKs */ #define TCP_STAT_RCVACKTOOMUCH 46 /* rcvd ACKs for unsent data */ #define TCP_STAT_RCVACKPACK 47 /* rcvd ACK packets */ #define TCP_STAT_RCVACKBYTE 48 /* bytes ACKed by rcvd ACKs */ #define TCP_STAT_RCVWINUPD 49 /* rcvd window update packets */ #define TCP_STAT_PAWSDROP 50 /* segments dropped due to PAWS */ #define TCP_STAT_PREDACK 51 /* times hdr predict OK for ACKs */ #define TCP_STAT_PREDDAT 52 /* times hdr predict OK for data pkts */ #define TCP_STAT_PCBHASHMISS 53 /* input packets missing PCB hash */ #define TCP_STAT_NOPORT 54 /* no socket on port */ #define TCP_STAT_BADSYN 55 /* received ACK for which we have no SYN in compressed state */ #define TCP_STAT_DELAYED_FREE 56 /* delayed pool_put() of tcpcb */ #define TCP_STAT_SC_ADDED 57 /* # of sc entries added */ #define TCP_STAT_SC_COMPLETED 58 /* # of sc connections completed */ #define TCP_STAT_SC_TIMED_OUT 59 /* # of sc entries timed out */ #define TCP_STAT_SC_OVERFLOWED 60 /* # of sc drops due to overflow */ #define TCP_STAT_SC_RESET 61 /* # of sc drops due to RST */ #define TCP_STAT_SC_UNREACH 62 /* # of sc drops due to ICMP unreach */ #define TCP_STAT_SC_BUCKETOVERFLOW 63 /* # of sc drops due to bucket ovflow */ #define TCP_STAT_SC_ABORTED 64 /* # of sc entries aborted (no mem) */ #define TCP_STAT_SC_DUPESYN 65 /* # of duplicate SYNs received */ #define TCP_STAT_SC_DROPPED 66 /* # of SYNs dropped (no route/mem) */ #define TCP_STAT_SC_COLLISIONS 67 /* # of sc hash collisions */ #define TCP_STAT_SC_RETRANSMITTED 68 /* # of sc retransmissions */ #define TCP_STAT_SC_DELAYED_FREE 69 /* # of delayed pool_put()s */ #define TCP_STAT_SELFQUENCH 70 /* # of ENOBUFS we get on output */ #define TCP_STAT_BADSIG 71 /* # of drops due to bad signature */ #define TCP_STAT_GOODSIG 72 /* # of packets with good signature */ #define TCP_STAT_ECN_SHS 73 /* # of successful ECN handshakes */ #define TCP_STAT_ECN_CE 74 /* # of packets with CE bit */ #define TCP_STAT_ECN_ECT 75 /* # of packets with ECT(0) bit */ #define TCP_NSTATS 76 /* * Names for TCP sysctl objects. */ #define TCPCTL_RFC1323 1 /* RFC1323 timestamps/scaling */ #define TCPCTL_SENDSPACE 2 /* default send buffer */ #define TCPCTL_RECVSPACE 3 /* default recv buffer */ #define TCPCTL_MSSDFLT 4 /* default seg size */ #define TCPCTL_SYN_CACHE_LIMIT 5 /* max size of comp. state engine */ #define TCPCTL_SYN_BUCKET_LIMIT 6 /* max size of hash bucket */ #if 0 /*obsoleted*/ #define TCPCTL_SYN_CACHE_INTER 7 /* interval of comp. 
state timer */ #endif #define TCPCTL_INIT_WIN 8 /* initial window */ #define TCPCTL_MSS_IFMTU 9 /* mss from interface, not in_maxmtu */ #define TCPCTL_SACK 10 /* RFC2018 selective acknowledgement */ #define TCPCTL_WSCALE 11 /* RFC1323 window scaling */ #define TCPCTL_TSTAMP 12 /* RFC1323 timestamps */ #if 0 /*obsoleted*/ #define TCPCTL_COMPAT_42 13 /* 4.2BSD TCP bug work-arounds */ #endif #define TCPCTL_CWM 14 /* Congestion Window Monitoring */ #define TCPCTL_CWM_BURSTSIZE 15 /* burst size allowed by CWM */ #define TCPCTL_ACK_ON_PUSH 16 /* ACK immediately on PUSH */ #define TCPCTL_KEEPIDLE 17 /* keepalive idle time */ #define TCPCTL_KEEPINTVL 18 /* keepalive probe interval */ #define TCPCTL_KEEPCNT 19 /* keepalive count */ #define TCPCTL_SLOWHZ 20 /* PR_SLOWHZ (read-only) */ #define TCPCTL_NEWRENO 21 /* NewReno Congestion Control */ #define TCPCTL_LOG_REFUSED 22 /* Log refused connections */ #if 0 /*obsoleted*/ #define TCPCTL_RSTRATELIMIT 23 /* RST rate limit */ #endif #define TCPCTL_RSTPPSLIMIT 24 /* RST pps limit */ #define TCPCTL_DELACK_TICKS 25 /* # ticks to delay ACK */ #define TCPCTL_INIT_WIN_LOCAL 26 /* initial window for local nets */ #define TCPCTL_IDENT 27 /* rfc 931 identd */ #define TCPCTL_ACKDROPRATELIMIT 28 /* SYN/RST -> ACK rate limit */ #define TCPCTL_LOOPBACKCKSUM 29 /* do TCP checksum on loopback */ #define TCPCTL_STATS 30 /* TCP statistics */ #define TCPCTL_DEBUG 31 /* TCP debug sockets */ #define TCPCTL_DEBX 32 /* # of tcp debug sockets */ #define TCPCTL_DROP 33 /* drop tcp connection */ #define TCPCTL_MSL 34 /* Max Segment Life */ #ifdef _KERNEL extern struct inpcbtable tcbtable; /* head of queue of active tcpcb's */ extern const struct pr_usrreqs tcp_usrreqs; extern u_int32_t tcp_now; /* for RFC 1323 timestamps */ extern int tcp_do_rfc1323; /* enabled/disabled? */ extern int tcp_do_sack; /* SACK enabled/disabled? */ extern int tcp_do_win_scale; /* RFC1323 window scaling enabled/disabled? */ extern int tcp_do_timestamps; /* RFC1323 timestamps enabled/disabled? */ extern int tcp_mssdflt; /* default seg size */ extern int tcp_minmss; /* minimal seg size */ extern int tcp_msl; /* max segment life */ extern int tcp_init_win; /* initial window */ extern int tcp_init_win_local; /* initial window for local nets */ extern int tcp_init_win_max[11];/* max sizes for values of tcp_init_win_* */ extern int tcp_mss_ifmtu; /* take MSS from interface, not in_maxmtu */ extern int tcp_cwm; /* enable Congestion Window Monitoring */ extern int tcp_cwm_burstsize; /* burst size allowed by CWM */ extern int tcp_ack_on_push; /* ACK immediately on PUSH */ extern int tcp_log_refused; /* log refused connections */ extern int tcp_do_ecn; /* TCP ECN enabled/disabled? */ extern int tcp_ecn_maxretries; /* Max ECN setup retries */ extern int tcp_do_rfc1948; /* ISS by cryptographic hash */ extern int tcp_sack_tp_maxholes; /* Max holes per connection. */ extern int tcp_sack_globalmaxholes; /* Max holes per system. */ extern int tcp_sack_globalholes; /* Number of holes present. */ extern int tcp_do_abc; /* RFC3465 ABC enabled/disabled? 
*/ extern int tcp_abc_aggressive; /* 1: L=2*SMSS 0: L=1*SMSS */ extern int tcp_msl_enable; /* enable TIME_WAIT truncation */ extern int tcp_msl_loop; /* MSL for loopback */ extern int tcp_msl_local; /* MSL for 'local' */ extern int tcp_msl_remote; /* MSL otherwise */ extern int tcp_msl_remote_threshold; /* RTT threshold */ extern int tcp_rttlocal; /* Use RTT to decide who's 'local' */ extern int tcp4_vtw_enable; extern int tcp6_vtw_enable; extern int tcp_vtw_was_enabled; extern int tcp_vtw_entries; extern int tcp_rst_ppslim; extern int tcp_ackdrop_ppslim; #ifdef MBUFTRACE extern struct mowner tcp_rx_mowner; extern struct mowner tcp_tx_mowner; extern struct mowner tcp_reass_mowner; extern struct mowner tcp_sock_mowner; extern struct mowner tcp_sock_rx_mowner; extern struct mowner tcp_sock_tx_mowner; extern struct mowner tcp_mowner; #endif extern int tcp_do_autorcvbuf; extern int tcp_autorcvbuf_inc; extern int tcp_autorcvbuf_max; extern int tcp_do_autosndbuf; extern int tcp_autosndbuf_inc; extern int tcp_autosndbuf_max; struct secasvar; void tcp_canceltimers(struct tcpcb *); struct tcpcb * tcp_close(struct tcpcb *); int tcp_isdead(struct tcpcb *); #ifdef INET6 void *tcp6_ctlinput(int, const struct sockaddr *, void *); #endif void *tcp_ctlinput(int, const struct sockaddr *, void *); int tcp_ctloutput(int, struct socket *, struct sockopt *); struct tcpcb * tcp_disconnect1(struct tcpcb *); struct tcpcb * tcp_drop(struct tcpcb *, int); #ifdef TCP_SIGNATURE int tcp_signature_apply(void *, void *, u_int); struct secasvar *tcp_signature_getsav(struct mbuf *); int tcp_signature(struct mbuf *, struct tcphdr *, int, struct secasvar *, char *); #endif void tcp_drain(void); void tcp_drainstub(void); void tcp_established(struct tcpcb *); void tcp_init(void); void tcp_init_common(unsigned); #ifdef INET6 int tcp6_input(struct mbuf **, int *, int); #endif void tcp_input(struct mbuf *, int, int); u_int tcp_hdrsz(struct tcpcb *); u_long tcp_mss_to_advertise(const struct ifnet *, int); void tcp_mss_from_peer(struct tcpcb *, int); void tcp_tcpcb_template(void); struct tcpcb * tcp_newtcpcb(int, struct inpcb *); void tcp_notify(struct inpcb *, int); u_int tcp_optlen(struct tcpcb *); int tcp_output(struct tcpcb *); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void tcp_quench(struct inpcb *); void tcp_mtudisc(struct inpcb *, int); #ifdef INET6 void tcp6_mtudisc_callback(struct in6_addr *); #endif void tcpipqent_init(void); struct ipqent *tcpipqent_alloc(void); void tcpipqent_free(struct ipqent *); int tcp_respond(struct tcpcb *, struct mbuf *, struct mbuf *, struct tcphdr *, tcp_seq, tcp_seq, int); void tcp_rmx_rtt(struct tcpcb *); void tcp_setpersist(struct tcpcb *); #ifdef TCP_SIGNATURE int tcp_signature_compute(struct mbuf *, struct tcphdr *, int, int, int, u_char *, u_int); #endif void tcp_fasttimo(void); struct mbuf * tcp_template(struct tcpcb *); void tcp_trace(short, short, struct tcpcb *, struct mbuf *, int); struct tcpcb * tcp_usrclosed(struct tcpcb *); void tcp_usrreq_init(void); void tcp_xmit_timer(struct tcpcb *, uint32_t); tcp_seq tcp_new_iss(struct tcpcb *); tcp_seq tcp_new_iss1(void *, void *, u_int16_t, u_int16_t, size_t); void tcp_sack_init(void); void tcp_new_dsack(struct tcpcb *, tcp_seq, u_int32_t); void tcp_sack_option(struct tcpcb *, const struct tcphdr *, const u_char *, int); void tcp_del_sackholes(struct tcpcb *, const struct tcphdr *); void tcp_free_sackholes(struct tcpcb *); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct 
tcpcb *tp, int *sack_bytes_rexmt); int tcp_sack_numblks(const struct tcpcb *); #define TCP_SACK_OPTLEN(nblks) ((nblks) * 8 + 2 + 2) void tcp_statinc(u_int); void tcp_statadd(u_int, uint64_t); int tcp_input_checksum(int, struct mbuf *, const struct tcphdr *, int, int, int); int tcp_dooptions(struct tcpcb *, const u_char *, int, struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *); #endif #endif /* !_NETINET_TCP_VAR_H_ */
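/*
 * Editor's note: a small, self-contained sketch (not part of tcp_var.h)
 * that mechanically evaluates the arithmetic of TCP_REXMTVAL() with
 * made-up sample values, to make the fixed-point shift/scale discussion
 * above concrete.  It says nothing about whether the macro's units are
 * right; see the caveats in the comment that precedes TCP_REXMTVAL.
 */
#if 0	/* illustrative sketch only */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Hypothetical stored values, in the units described above:
	 * srtt in 1/32 slow ticks, rttvar in 1/16 slow ticks. */
	int32_t t_srtt = 96;	/* 3 slow ticks */
	int32_t t_rttvar = 16;	/* 1 slow tick */

	/* Same expression as TCP_REXMTVAL(tp): ((srtt >> 3) + rttvar) >> 2 */
	int32_t rexmt = ((t_srtt >> 3 /* TCP_RTT_SHIFT */) + t_rttvar) >> 2;

	printf("rexmt = %d slow ticks\n", rexmt);	/* (12 + 16) >> 2 = 7 */
	return 0;
}
#endif	/* illustrative sketch only */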
/* $NetBSD: vfs_quotactl.c,v 1.40 2014/06/28 22:27:50 dholland Exp $ */ /*- * Copyright (c) 2012 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by David A. Holland. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_quotactl.c,v 1.40 2014/06/28 22:27:50 dholland Exp $"); #include <sys/mount.h> #include <sys/quotactl.h> int vfs_quotactl_stat(struct mount *mp, struct quotastat *info) { struct quotactl_args args; args.qc_op = QUOTACTL_STAT; args.u.stat.qc_info = info; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_idtypestat(struct mount *mp, int idtype, struct quotaidtypestat *info) { struct quotactl_args args; args.qc_op = QUOTACTL_IDTYPESTAT; args.u.idtypestat.qc_idtype = idtype; args.u.idtypestat.qc_info = info; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_objtypestat(struct mount *mp, int objtype, struct quotaobjtypestat *info) { struct quotactl_args args; args.qc_op = QUOTACTL_OBJTYPESTAT; args.u.objtypestat.qc_objtype = objtype; args.u.objtypestat.qc_info = info; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_get(struct mount *mp, const struct quotakey *key, struct quotaval *val) { struct quotactl_args args; args.qc_op = QUOTACTL_GET; args.u.get.qc_key = key; args.u.get.qc_val = val; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_put(struct mount *mp, const struct quotakey *key, const struct quotaval *val) { struct quotactl_args args; args.qc_op = QUOTACTL_PUT; args.u.put.qc_key = key; args.u.put.qc_val = val; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_del(struct mount *mp, const struct quotakey *key) { struct quotactl_args args; args.qc_op = QUOTACTL_DEL; args.u.del.qc_key = key; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor) { struct quotactl_args args; args.qc_op = QUOTACTL_CURSOROPEN; args.u.cursoropen.qc_cursor = cursor; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor) { struct quotactl_args args; args.qc_op = QUOTACTL_CURSORCLOSE; args.u.cursorclose.qc_cursor = cursor; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_cursorskipidtype(struct mount *mp, struct quotakcursor *cursor, int idtype) { struct quotactl_args args; args.qc_op = QUOTACTL_CURSORSKIPIDTYPE; args.u.cursorskipidtype.qc_cursor = cursor; args.u.cursorskipidtype.qc_idtype = idtype; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor, struct quotakey *keys, struct quotaval *vals, unsigned maxnum, unsigned *ret) { struct quotactl_args args; args.qc_op = QUOTACTL_CURSORGET; args.u.cursorget.qc_cursor = cursor; args.u.cursorget.qc_keys = keys; args.u.cursorget.qc_vals = vals; args.u.cursorget.qc_maxnum = maxnum; args.u.cursorget.qc_ret = ret; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor, int *ret) { struct quotactl_args args; args.qc_op = QUOTACTL_CURSORATEND; args.u.cursoratend.qc_cursor = cursor; args.u.cursoratend.qc_ret = ret; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor) { struct quotactl_args args; args.qc_op = QUOTACTL_CURSORREWIND; args.u.cursorrewind.qc_cursor = cursor; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_quotaon(struct mount *mp, int idtype, const char *path) { struct quotactl_args args; args.qc_op = QUOTACTL_QUOTAON; args.u.quotaon.qc_idtype = idtype; args.u.quotaon.qc_quotafile = path; return VFS_QUOTACTL(mp, &args); } int vfs_quotactl_quotaoff(struct mount *mp, int idtype) { struct quotactl_args args; args.qc_op = QUOTACTL_QUOTAOFF; args.u.quotaoff.qc_idtype = idtype; return VFS_QUOTACTL(mp, &args); }
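/*
 * Editor's note: a hedged sketch (not part of vfs_quotactl.c) of how a
 * kernel-side caller might use one of the wrappers above.  It assumes the
 * struct quotakey member names qk_idtype/qk_id/qk_objtype and the
 * QUOTA_IDTYPE_USER/QUOTA_OBJTYPE_BLOCKS constants from <sys/quota.h>;
 * everything else is exactly the vfs_quotactl_get() defined above, which
 * packs the request into struct quotactl_args and hands it to the
 * filesystem via VFS_QUOTACTL().
 */
#if 0	/* illustrative sketch only */
#include <sys/quota.h>

static int
example_get_user_block_quota(struct mount *mp, uid_t uid, struct quotaval *qv)
{
	struct quotakey key;

	key.qk_idtype = QUOTA_IDTYPE_USER;	/* per-user quota */
	key.qk_id = uid;			/* which user */
	key.qk_objtype = QUOTA_OBJTYPE_BLOCKS;	/* block (not file) limits */

	/* Dispatches as QUOTACTL_GET through the wrapper defined above. */
	return vfs_quotactl_get(mp, &key, qv);
}
#endif	/* illustrative sketch only */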
/* $NetBSD: bpf.c,v 1.252 2023/07/31 17:41:18 christos Exp $ */ /* * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)bpf.c 8.4 (Berkeley) 1/9/95 * static char rcsid[] = * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp "; */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.252 2023/07/31 17:41:18 christos Exp $"); #if defined(_KERNEL_OPT) #include "opt_bpf.h" #include "sl.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/buf.h> #include <sys/time.h> #include <sys/proc.h> #include <sys/ioctl.h> #include <sys/conf.h> #include <sys/vnode.h> #include <sys/queue.h> #include <sys/stat.h> #include <sys/module.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/tty.h> #include <sys/uio.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/errno.h> #include <sys/kernel.h> #include <sys/poll.h> #include <sys/sysctl.h> #include <sys/kauth.h> #include <sys/syslog.h> #include <sys/percpu.h> #include <sys/pserialize.h> #include <sys/lwp.h> #include <sys/xcall.h> #include <net/if.h> #include <net/slip.h> #include <net/bpf.h> #include <net/bpfdesc.h> #include <net/bpfjit.h> #include <net/if_arc.h> #include <net/if_ether.h> #include <net/if_types.h> #include <netinet/in.h> #include <netinet/if_inarp.h> #include <compat/sys/sockio.h> #ifndef BPF_BUFSIZE /* * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k). */ # define BPF_BUFSIZE 32768 #endif #define PRINET 26 /* interruptible */ /* * The default read buffer size, and limit for BIOCSBLEN, is sysctl'able. * XXX the default values should be computed dynamically based * on available memory size and available mbuf clusters. */ static int bpf_bufsize = BPF_BUFSIZE; static int bpf_maxbufsize = BPF_DFLTBUFSIZE; /* XXX set dynamically, see above */ static bool bpf_jit = false; struct bpfjit_ops bpfjit_module_ops = { .bj_generate_code = NULL, .bj_free_code = NULL }; /* * Global BPF statistics returned by net.bpf.stats sysctl. 
*/ static struct percpu *bpf_gstats_percpu; /* struct bpf_stat */ #define BPF_STATINC(id) \ { \ struct bpf_stat *__stats = \ percpu_getref(bpf_gstats_percpu); \ __stats->bs_##id++; \ percpu_putref(bpf_gstats_percpu); \ } /* * Locking notes: * - bpf_mtx (adaptive mutex) protects: * - Gobal lists: bpf_iflist and bpf_dlist * - struct bpf_if * - bpf_close * - bpf_psz (pserialize) * - struct bpf_d has two mutexes: * - bd_buf_mtx (spin mutex) protects the buffers that can be accessed * on packet tapping * - bd_mtx (adaptive mutex) protects member variables other than the buffers * - Locking order: bpf_mtx => bpf_d#bd_mtx => bpf_d#bd_buf_mtx * - struct bpf_d obtained via fp->f_bpf in bpf_read and bpf_write is * never freed because struct bpf_d is only freed in bpf_close and * bpf_close never be called while executing bpf_read and bpf_write * - A filter that is assigned to bpf_d can be replaced with another filter * while tapping packets, so it needs to be done atomically * - struct bpf_d is iterated on bpf_dlist with psz * - struct bpf_if is iterated on bpf_iflist with psz or psref */ /* * Use a mutex to avoid a race condition between gathering the stats/peers * and opening/closing the device. */ static kmutex_t bpf_mtx; static struct psref_class *bpf_psref_class __read_mostly; static pserialize_t bpf_psz; static inline void bpf_if_acquire(struct bpf_if *bp, struct psref *psref) { psref_acquire(psref, &bp->bif_psref, bpf_psref_class); } static inline void bpf_if_release(struct bpf_if *bp, struct psref *psref) { psref_release(psref, &bp->bif_psref, bpf_psref_class); } /* * bpf_iflist is the list of interfaces; each corresponds to an ifnet * bpf_dtab holds the descriptors, indexed by minor device # */ static struct pslist_head bpf_iflist; static struct pslist_head bpf_dlist; /* Macros for bpf_d on bpf_dlist */ #define BPF_DLIST_WRITER_INSERT_HEAD(__d) \ PSLIST_WRITER_INSERT_HEAD(&bpf_dlist, (__d), bd_bpf_dlist_entry) #define BPF_DLIST_READER_FOREACH(__d) \ PSLIST_READER_FOREACH((__d), &bpf_dlist, struct bpf_d, \ bd_bpf_dlist_entry) #define BPF_DLIST_WRITER_FOREACH(__d) \ PSLIST_WRITER_FOREACH((__d), &bpf_dlist, struct bpf_d, \ bd_bpf_dlist_entry) #define BPF_DLIST_ENTRY_INIT(__d) \ PSLIST_ENTRY_INIT((__d), bd_bpf_dlist_entry) #define BPF_DLIST_WRITER_REMOVE(__d) \ PSLIST_WRITER_REMOVE((__d), bd_bpf_dlist_entry) #define BPF_DLIST_ENTRY_DESTROY(__d) \ PSLIST_ENTRY_DESTROY((__d), bd_bpf_dlist_entry) /* Macros for bpf_if on bpf_iflist */ #define BPF_IFLIST_WRITER_INSERT_HEAD(__bp) \ PSLIST_WRITER_INSERT_HEAD(&bpf_iflist, (__bp), bif_iflist_entry) #define BPF_IFLIST_READER_FOREACH(__bp) \ PSLIST_READER_FOREACH((__bp), &bpf_iflist, struct bpf_if, \ bif_iflist_entry) #define BPF_IFLIST_WRITER_FOREACH(__bp) \ PSLIST_WRITER_FOREACH((__bp), &bpf_iflist, struct bpf_if, \ bif_iflist_entry) #define BPF_IFLIST_WRITER_REMOVE(__bp) \ PSLIST_WRITER_REMOVE((__bp), bif_iflist_entry) #define BPF_IFLIST_ENTRY_INIT(__bp) \ PSLIST_ENTRY_INIT((__bp), bif_iflist_entry) #define BPF_IFLIST_ENTRY_DESTROY(__bp) \ PSLIST_ENTRY_DESTROY((__bp), bif_iflist_entry) /* Macros for bpf_d on bpf_if#bif_dlist_pslist */ #define BPFIF_DLIST_READER_FOREACH(__d, __bp) \ PSLIST_READER_FOREACH((__d), &(__bp)->bif_dlist_head, struct bpf_d, \ bd_bif_dlist_entry) #define BPFIF_DLIST_WRITER_INSERT_HEAD(__bp, __d) \ PSLIST_WRITER_INSERT_HEAD(&(__bp)->bif_dlist_head, (__d), \ bd_bif_dlist_entry) #define BPFIF_DLIST_WRITER_REMOVE(__d) \ PSLIST_WRITER_REMOVE((__d), bd_bif_dlist_entry) #define BPFIF_DLIST_ENTRY_INIT(__d) \ PSLIST_ENTRY_INIT((__d), 
bd_bif_dlist_entry) #define BPFIF_DLIST_READER_EMPTY(__bp) \ (PSLIST_READER_FIRST(&(__bp)->bif_dlist_head, struct bpf_d, \ bd_bif_dlist_entry) == NULL) #define BPFIF_DLIST_WRITER_EMPTY(__bp) \ (PSLIST_WRITER_FIRST(&(__bp)->bif_dlist_head, struct bpf_d, \ bd_bif_dlist_entry) == NULL) #define BPFIF_DLIST_ENTRY_DESTROY(__d) \ PSLIST_ENTRY_DESTROY((__d), bd_bif_dlist_entry) static int bpf_allocbufs(struct bpf_d *); static u_int bpf_xfilter(struct bpf_filter **, void *, u_int, u_int); static void bpf_deliver(struct bpf_if *, void *(*cpfn)(void *, const void *, size_t), void *, u_int, u_int, const u_int); static void bpf_freed(struct bpf_d *); static void bpf_free_filter(struct bpf_filter *); static void bpf_ifname(struct ifnet *, struct ifreq *); static void *bpf_mcpy(void *, const void *, size_t); static int bpf_movein(struct ifnet *, struct uio *, int, uint64_t, struct mbuf **, struct sockaddr *, struct bpf_filter **); static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static int bpf_setif(struct bpf_d *, struct ifreq *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long); static void bpf_timed_out(void *); static inline void bpf_wakeup(struct bpf_d *); static int bpf_hdrlen(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void *(*)(void *, const void *, size_t), struct timespec *); static void reset_d(struct bpf_d *); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static int bpf_read(struct file *, off_t *, struct uio *, kauth_cred_t, int); static int bpf_write(struct file *, off_t *, struct uio *, kauth_cred_t, int); static int bpf_ioctl(struct file *, u_long, void *); static int bpf_poll(struct file *, int); static int bpf_stat(struct file *, struct stat *); static int bpf_close(struct file *); static int bpf_kqfilter(struct file *, struct knote *); static const struct fileops bpf_fileops = { .fo_name = "bpf", .fo_read = bpf_read, .fo_write = bpf_write, .fo_ioctl = bpf_ioctl, .fo_fcntl = fnullop_fcntl, .fo_poll = bpf_poll, .fo_stat = bpf_stat, .fo_close = bpf_close, .fo_kqfilter = bpf_kqfilter, .fo_restart = fnullop_restart, }; dev_type_open(bpfopen); const struct cdevsw bpf_cdevsw = { .d_open = bpfopen, .d_close = noclose, .d_read = noread, .d_write = nowrite, .d_ioctl = noioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE }; bpfjit_func_t bpf_jit_generate(bpf_ctx_t *bc, void *code, size_t size) { struct bpfjit_ops *ops = &bpfjit_module_ops; bpfjit_func_t (*generate_code)(const bpf_ctx_t *, const struct bpf_insn *, size_t); generate_code = atomic_load_acquire(&ops->bj_generate_code); if (generate_code != NULL) { return generate_code(bc, code, size); } return NULL; } void bpf_jit_freecode(bpfjit_func_t jcode) { KASSERT(bpfjit_module_ops.bj_free_code != NULL); bpfjit_module_ops.bj_free_code(jcode); } static int bpf_movein(struct ifnet *ifp, struct uio *uio, int linktype, uint64_t mtu, struct mbuf **mp, struct sockaddr *sockp, struct bpf_filter **wfilter) { struct mbuf *m, *m0, *n; int error; size_t len; size_t hlen; size_t align; u_int slen; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. * In the case of SLIP, there is no header and the packet * is forwarded as is. 
* Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; align = 0; break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 0; align = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? */ /* 6(dst)+6(src)+2(type) */ hlen = sizeof(struct ether_header); align = 2; break; case DLT_ARCNET: sockp->sa_family = AF_UNSPEC; hlen = ARC_HDRLEN; align = 5; break; case DLT_FDDI: sockp->sa_family = AF_LINK; /* XXX 4(FORMAC)+6(dst)+6(src) */ hlen = 16; align = 0; break; case DLT_ECONET: sockp->sa_family = AF_UNSPEC; hlen = 6; align = 2; break; case DLT_NULL: sockp->sa_family = AF_UNSPEC; if (ifp->if_type == IFT_LOOP) { /* Set here to apply the following validations */ hlen = sizeof(uint32_t); } else hlen = 0; align = 0; break; default: return (EIO); } len = uio->uio_resid; /* * If there aren't enough bytes for a link level header or the * packet length exceeds the interface mtu, return an error. */ if (len - hlen > mtu) return (EMSGSIZE); m0 = m = m_gethdr(M_WAIT, MT_DATA); m_reset_rcvif(m); m->m_pkthdr.len = (int)(len - hlen); if (len + align > MHLEN) { m_clget(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) { error = ENOBUFS; goto bad; } } /* Ensure the data is properly aligned */ if (align > 0) m->m_data += align; for (;;) { len = M_TRAILINGSPACE(m); if (len > uio->uio_resid) len = uio->uio_resid; error = uiomove(mtod(m, void *), len, uio); if (error) goto bad; m->m_len = len; if (uio->uio_resid == 0) break; n = m_get(M_WAIT, MT_DATA); m_clget(n, M_WAIT); /* if fails, there is no problem */ m->m_next = n; m = n; } slen = bpf_xfilter(wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; } if (hlen != 0) { if (linktype == DLT_NULL && ifp->if_type == IFT_LOOP) { uint32_t af; /* the link header indicates the address family */ memcpy(&af, mtod(m0, void *), sizeof(af)); sockp->sa_family = af; } else { /* move link level header in the top of mbuf to sa_data */ memcpy(sockp->sa_data, mtod(m0, void *), hlen); } m0->m_data += hlen; m0->m_len -= hlen; } *mp = m0; return (0); bad: m_freem(m0); return (error); } /* * Attach file to the bpf interface, i.e. make d listen on bp. */ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { struct bpf_event_tracker *t; KASSERT(mutex_owned(&bpf_mtx)); KASSERT(mutex_owned(d->bd_mtx)); /* * Point d at bp, and add d to the interface's list of listeners. * Finally, point the driver's bpf cookie at the interface so * it will divert packets to bpf. */ d->bd_bif = bp; BPFIF_DLIST_WRITER_INSERT_HEAD(bp, d); *bp->bif_driverp = bp; SLIST_FOREACH(t, &bp->bif_trackers, bet_entries) { t->bet_notify(bp, bp->bif_ifp, bp->bif_dlt, BPF_TRACK_EVENT_ATTACH); } } /* * Detach a file from its interface. */ static void bpf_detachd(struct bpf_d *d) { struct bpf_if *bp; struct bpf_event_tracker *t; KASSERT(mutex_owned(&bpf_mtx)); KASSERT(mutex_owned(d->bd_mtx)); bp = d->bd_bif; /* * Check if this descriptor had requested promiscuous mode. * If so, turn it off. */ if (d->bd_promisc) { int error __diagused; d->bd_promisc = 0; /* * Take device out of promiscuous mode. Since we were * able to enter promiscuous mode, we should be able * to turn it off. But we can get an error if * the interface was configured down, so only panic * if we don't get an unexpected error. 
*/ KERNEL_LOCK_UNLESS_NET_MPSAFE(); error = ifpromisc(bp->bif_ifp, 0); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); #ifdef DIAGNOSTIC if (error) printf("%s: ifpromisc failed: %d", __func__, error); #endif } /* Remove d from the interface's descriptor list. */ BPFIF_DLIST_WRITER_REMOVE(d); pserialize_perform(bpf_psz); if (BPFIF_DLIST_WRITER_EMPTY(bp)) { /* * Let the driver know that there are no more listeners. */ *d->bd_bif->bif_driverp = NULL; } d->bd_bif = NULL; SLIST_FOREACH(t, &bp->bif_trackers, bet_entries) { t->bet_notify(bp, bp->bif_ifp, bp->bif_dlt, BPF_TRACK_EVENT_DETACH); } } static void bpf_init(void) { mutex_init(&bpf_mtx, MUTEX_DEFAULT, IPL_NONE); bpf_psz = pserialize_create(); bpf_psref_class = psref_class_create("bpf", IPL_SOFTNET); PSLIST_INIT(&bpf_iflist); PSLIST_INIT(&bpf_dlist); bpf_gstats_percpu = percpu_alloc(sizeof(struct bpf_stat)); return; } /* * bpfilterattach() is called at boot time. We don't need to do anything * here, since any initialization will happen as part of module init code. */ /* ARGSUSED */ void bpfilterattach(int n) { } /* * Open ethernet device. Clones. */ /* ARGSUSED */ int bpfopen(dev_t dev, int flag, int mode, struct lwp *l) { struct bpf_d *d; struct file *fp; int error, fd; /* falloc() will fill in the descriptor for us. */ if ((error = fd_allocfile(&fp, &fd)) != 0) return error; d = kmem_zalloc(sizeof(*d), KM_SLEEP); d->bd_bufsize = bpf_bufsize; d->bd_direction = BPF_D_INOUT; d->bd_feedback = 0; d->bd_pid = l->l_proc->p_pid; #ifdef _LP64 if (curproc->p_flag & PK_32) d->bd_compat32 = 1; #endif getnanotime(&d->bd_btime); d->bd_atime = d->bd_mtime = d->bd_btime; callout_init(&d->bd_callout, CALLOUT_MPSAFE); selinit(&d->bd_sel); d->bd_jitcode = NULL; d->bd_rfilter = NULL; d->bd_wfilter = NULL; d->bd_locked = 0; BPF_DLIST_ENTRY_INIT(d); BPFIF_DLIST_ENTRY_INIT(d); d->bd_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SOFTNET); d->bd_buf_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET); cv_init(&d->bd_cv, "bpf"); mutex_enter(&bpf_mtx); BPF_DLIST_WRITER_INSERT_HEAD(d); mutex_exit(&bpf_mtx); return fd_clone(fp, fd, flag, &bpf_fileops, d); } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. */ /* ARGSUSED */ static int bpf_close(struct file *fp) { struct bpf_d *d; mutex_enter(&bpf_mtx); if ((d = fp->f_bpf) == NULL) { mutex_exit(&bpf_mtx); return 0; } /* * Refresh the PID associated with this bpf file. */ d->bd_pid = curproc->p_pid; mutex_enter(d->bd_mtx); if (d->bd_state == BPF_WAITING) callout_halt(&d->bd_callout, d->bd_mtx); d->bd_state = BPF_IDLE; if (d->bd_bif) bpf_detachd(d); mutex_exit(d->bd_mtx); BPF_DLIST_WRITER_REMOVE(d); pserialize_perform(bpf_psz); mutex_exit(&bpf_mtx); BPFIF_DLIST_ENTRY_DESTROY(d); BPF_DLIST_ENTRY_DESTROY(d); fp->f_bpf = NULL; bpf_freed(d); callout_destroy(&d->bd_callout); seldestroy(&d->bd_sel); mutex_obj_free(d->bd_mtx); mutex_obj_free(d->bd_buf_mtx); cv_destroy(&d->bd_cv); kmem_free(d, sizeof(*d)); return (0); } /* * Rotate the packet buffers in descriptor d. Move the store buffer * into the hold slot, and the free buffer into the store slot. * Zero the length of the new store buffer. 
*/ #define ROTATE_BUFFERS(d) \ (d)->bd_hbuf = (d)->bd_sbuf; \ (d)->bd_hlen = (d)->bd_slen; \ (d)->bd_sbuf = (d)->bd_fbuf; \ (d)->bd_slen = 0; \ (d)->bd_fbuf = NULL; /* * bpfread - read next chunk of packets from buffers */ static int bpf_read(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, int flags) { struct bpf_d *d = fp->f_bpf; int timed_out; int error; /* * Refresh the PID associated with this bpf file. */ d->bd_pid = curproc->p_pid; getnanotime(&d->bd_atime); /* * Restrict application to use a buffer the same size as * the kernel buffers. */ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); mutex_enter(d->bd_mtx); if (d->bd_state == BPF_WAITING) callout_halt(&d->bd_callout, d->bd_mtx); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; mutex_exit(d->bd_mtx); /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ mutex_enter(d->bd_buf_mtx); while (d->bd_hbuf == NULL) { if (fp->f_flag & FNONBLOCK) { if (d->bd_slen == 0) { error = EWOULDBLOCK; goto out; } ROTATE_BUFFERS(d); break; } if ((d->bd_immediate || timed_out) && d->bd_slen != 0) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. * Rotate the buffers and return what's here. */ ROTATE_BUFFERS(d); break; } error = cv_timedwait_sig(&d->bd_cv, d->bd_buf_mtx, d->bd_rtout); if (error == EINTR || error == ERESTART) goto out; if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { error = 0; goto out; } ROTATE_BUFFERS(d); break; } if (error != 0) goto out; } /* * At this point, we know we have something in the hold slot. */ mutex_exit(d->bd_buf_mtx); /* * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. */ error = uiomove(d->bd_hbuf, d->bd_hlen, uio); mutex_enter(d->bd_buf_mtx); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; out: mutex_exit(d->bd_buf_mtx); return (error); } /* * If there are processes sleeping on this descriptor, wake them up. */ static inline void bpf_wakeup(struct bpf_d *d) { mutex_enter(d->bd_buf_mtx); cv_broadcast(&d->bd_cv); mutex_exit(d->bd_buf_mtx); if (d->bd_async) fownsignal(d->bd_pgid, SIGIO, 0, 0, NULL); selnotify(&d->bd_sel, 0, 0); } static void bpf_timed_out(void *arg) { struct bpf_d *d = arg; mutex_enter(d->bd_mtx); if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; if (d->bd_slen != 0) bpf_wakeup(d); } mutex_exit(d->bd_mtx); } static int bpf_write(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, int flags) { struct bpf_d *d = fp->f_bpf; struct bpf_if *bp; struct ifnet *ifp; struct mbuf *m, *mc; int error; static struct sockaddr_storage dst; struct psref psref; int bound; /* * Refresh the PID associated with this bpf file. 
*/ d->bd_pid = curproc->p_pid; m = NULL; /* XXX gcc */ bound = curlwp_bind(); mutex_enter(d->bd_mtx); bp = d->bd_bif; if (bp == NULL) { mutex_exit(d->bd_mtx); error = ENXIO; goto out_bindx; } bpf_if_acquire(bp, &psref); mutex_exit(d->bd_mtx); getnanotime(&d->bd_mtime); ifp = bp->bif_ifp; if (if_is_deactivated(ifp)) { error = ENXIO; goto out; } if (uio->uio_resid == 0) { error = 0; goto out; } error = bpf_movein(ifp, uio, (int)bp->bif_dlt, ifp->if_mtu, &m, (struct sockaddr *) &dst, &d->bd_wfilter); if (error) goto out; if (m->m_pkthdr.len > ifp->if_mtu) { m_freem(m); error = EMSGSIZE; goto out; } /* * If writing to a loopback interface, the address family has * already been specially computed in bpf_movein(), so don't * clobber it, or the loopback will reject it in looutput(). */ if (d->bd_hdrcmplt && ifp->if_type != IFT_LOOP) dst.ss_family = pseudo_AF_HDRCMPLT; if (d->bd_feedback) { mc = m_dup(m, 0, M_COPYALL, M_NOWAIT); if (mc != NULL) m_set_rcvif(mc, ifp); /* Set M_PROMISC for outgoing packets to be discarded. */ if (1 /*d->bd_direction == BPF_D_INOUT*/) m->m_flags |= M_PROMISC; } else mc = NULL; error = if_output_lock(ifp, ifp, m, (struct sockaddr *) &dst, NULL); if (mc != NULL) { if (error == 0) { int s = splsoftnet(); KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp); ifp->_if_input(ifp, mc); KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp); splx(s); } else m_freem(mc); } /* * The driver frees the mbuf. */ out: bpf_if_release(bp, &psref); out_bindx: curlwp_bindx(bound); return error; } /* * Reset a descriptor by flushing its packet buffer and clearing the * receive and drop counts. */ static void reset_d(struct bpf_d *d) { KASSERT(mutex_owned(d->bd_mtx)); mutex_enter(d->bd_buf_mtx); if (d->bd_hbuf) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; } d->bd_slen = 0; d->bd_hlen = 0; d->bd_rcount = 0; d->bd_dcount = 0; d->bd_ccount = 0; mutex_exit(d->bd_buf_mtx); } /* * FIONREAD Check for read packet available. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set ethernet read filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag. * BIOCSHDRCMPLT Set "header already complete" flag. * BIOCSFEEDBACK Set packet feedback mode. * BIOCGFEEDBACK Get packet feedback mode. * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag */ /* ARGSUSED */ static int bpf_ioctl(struct file *fp, u_long cmd, void *addr) { struct bpf_d *d = fp->f_bpf; int error = 0; /* * Refresh the PID associated with this bpf file. 
*/ d->bd_pid = curproc->p_pid; #ifdef _LP64 if (curproc->p_flag & PK_32) d->bd_compat32 = 1; else d->bd_compat32 = 0; #endif mutex_enter(d->bd_mtx); if (d->bd_state == BPF_WAITING) callout_halt(&d->bd_callout, d->bd_mtx); d->bd_state = BPF_IDLE; mutex_exit(d->bd_mtx); if (d->bd_locked) { switch (cmd) { case BIOCGBLEN: /* FALLTHROUGH */ case BIOCFLUSH: /* FALLTHROUGH */ case BIOCGDLT: /* FALLTHROUGH */ case BIOCGDLTLIST: /* FALLTHROUGH */ case BIOCGETIF: /* FALLTHROUGH */ case BIOCGRTIMEOUT: /* FALLTHROUGH */ case BIOCGSTATS: /* FALLTHROUGH */ case BIOCVERSION: /* FALLTHROUGH */ case BIOCGHDRCMPLT: /* FALLTHROUGH */ case FIONREAD: /* FALLTHROUGH */ case BIOCLOCK: /* FALLTHROUGH */ case BIOCSRTIMEOUT: /* FALLTHROUGH */ case BIOCIMMEDIATE: /* FALLTHROUGH */ case TIOCGPGRP: break; default: return EPERM; } } switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; mutex_enter(d->bd_buf_mtx); n = d->bd_slen; if (d->bd_hbuf) n += d->bd_hlen; mutex_exit(d->bd_buf_mtx); *(int *)addr = n; break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: *(u_int *)addr = d->bd_bufsize; break; /* * Set buffer length. */ case BIOCSBLEN: /* * Forbid to change the buffer length if buffers are already * allocated. */ mutex_enter(d->bd_mtx); mutex_enter(d->bd_buf_mtx); if (d->bd_bif != NULL || d->bd_sbuf != NULL) error = EINVAL; else { u_int size = *(u_int *)addr; if (size > bpf_maxbufsize) *(u_int *)addr = size = bpf_maxbufsize; else if (size < BPF_MINBUFSIZE) *(u_int *)addr = size = BPF_MINBUFSIZE; d->bd_bufsize = size; } mutex_exit(d->bd_buf_mtx); mutex_exit(d->bd_mtx); break; /* * Set link layer read filter. */ case BIOCSETF: /* FALLTHROUGH */ case BIOCSETWF: error = bpf_setf(d, addr, cmd); break; case BIOCLOCK: d->bd_locked = 1; break; /* * Flush read packet buffer. */ case BIOCFLUSH: mutex_enter(d->bd_mtx); reset_d(d); mutex_exit(d->bd_mtx); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: mutex_enter(d->bd_mtx); if (d->bd_bif == NULL) { mutex_exit(d->bd_mtx); /* * No interface attached yet. */ error = EINVAL; break; } if (d->bd_promisc == 0) { KERNEL_LOCK_UNLESS_NET_MPSAFE(); error = ifpromisc(d->bd_bif->bif_ifp, 1); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); if (error == 0) d->bd_promisc = 1; } mutex_exit(d->bd_mtx); break; /* * Get device parameters. */ case BIOCGDLT: mutex_enter(d->bd_mtx); if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; mutex_exit(d->bd_mtx); break; /* * Get a list of supported device parameters. */ case BIOCGDLTLIST: mutex_enter(d->bd_mtx); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, addr); mutex_exit(d->bd_mtx); break; /* * Set device parameters. */ case BIOCSDLT: mutex_enter(&bpf_mtx); mutex_enter(d->bd_mtx); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); mutex_exit(d->bd_mtx); mutex_exit(&bpf_mtx); break; /* * Set interface name. */ #ifdef OBIOCGETIF case OBIOCGETIF: #endif case BIOCGETIF: mutex_enter(d->bd_mtx); if (d->bd_bif == NULL) error = EINVAL; else bpf_ifname(d->bd_bif->bif_ifp, addr); mutex_exit(d->bd_mtx); break; /* * Set interface. */ #ifdef OBIOCSETIF case OBIOCSETIF: #endif case BIOCSETIF: mutex_enter(&bpf_mtx); error = bpf_setif(d, addr); mutex_exit(&bpf_mtx); break; /* * Set read timeout. */ case BIOCSRTIMEOUT: { struct timeval *tv = addr; /* Compute number of ticks. 
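	 * Illustrative example (assuming hz = 100, so one "tick" is
	 * 10000 microseconds): a 2.5 second timeout becomes
	 * 2 * 100 + 500000 / 10000 = 250 ticks, and a nonzero timeout
	 * that would round down to zero ticks is bumped to one tick below.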
*/ if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000) { error = EINVAL; break; } else if (tv->tv_sec > INT_MAX/hz - 1) { d->bd_rtout = INT_MAX; } else { d->bd_rtout = tv->tv_sec * hz + tv->tv_usec / tick; } if ((d->bd_rtout == 0) && (tv->tv_usec != 0)) d->bd_rtout = 1; break; } #ifdef BIOCGORTIMEOUT /* * Get read timeout. */ case BIOCGORTIMEOUT: { struct timeval50 *tv = addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; break; } #endif #ifdef BIOCSORTIMEOUT /* * Set read timeout. */ case BIOCSORTIMEOUT: { struct timeval50 *tv = addr; /* Compute number of ticks. */ if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000) { error = EINVAL; break; } else if (tv->tv_sec > INT_MAX/hz - 1) { d->bd_rtout = INT_MAX; } else { d->bd_rtout = tv->tv_sec * hz + tv->tv_usec / tick; } if ((d->bd_rtout == 0) && (tv->tv_usec != 0)) d->bd_rtout = 1; break; } #endif /* * Get read timeout. */ case BIOCGRTIMEOUT: { struct timeval *tv = addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; break; } /* * Get packet stats. */ case BIOCGSTATS: { struct bpf_stat *bs = addr; bs->bs_recv = d->bd_rcount; bs->bs_drop = d->bd_dcount; bs->bs_capt = d->bd_ccount; break; } case BIOCGSTATS_30: { struct bpf_stat30 *bs = addr; bs->bs_recv = d->bd_rcount; bs->bs_drop = d->bd_dcount; break; } /* * Set immediate mode. */ case BIOCIMMEDIATE: d->bd_immediate = *(u_int *)addr; break; case BIOCVERSION: { struct bpf_version *bv = addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } case BIOCGHDRCMPLT: /* get "header already complete" flag */ *(u_int *)addr = d->bd_hdrcmplt; break; case BIOCSHDRCMPLT: /* set "header already complete" flag */ d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0; break; /* * Get packet direction flag */ case BIOCGDIRECTION: *(u_int *)addr = d->bd_direction; break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: d->bd_direction = direction; break; default: error = EINVAL; } } break; /* * Set "feed packets from bpf back to input" mode */ case BIOCSFEEDBACK: d->bd_feedback = *(u_int *)addr; break; /* * Get "feed packets from bpf back to input" mode */ case BIOCGFEEDBACK: *(u_int *)addr = d->bd_feedback; break; case FIONBIO: /* Non-blocking I/O */ /* * No need to do anything special as we use IO_NDELAY in * bpfread() as an indication of whether or not to block * the read. */ break; case FIOASYNC: /* Send signal on receive packets */ mutex_enter(d->bd_mtx); d->bd_async = *(int *)addr; mutex_exit(d->bd_mtx); break; case TIOCSPGRP: /* Process or group to send signals to */ case FIOSETOWN: error = fsetown(&d->bd_pgid, cmd, addr); break; case TIOCGPGRP: case FIOGETOWN: error = fgetown(d->bd_pgid, cmd, addr); break; } return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. */ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) { struct bpf_insn *fcode; bpfjit_func_t jcode; size_t flen, size = 0; struct bpf_filter *oldf, *newf, **storef; jcode = NULL; flen = fp->bf_len; if ((fp->bf_insns == NULL && flen) || flen > BPF_MAXINSNS) { return EINVAL; } if (flen) { /* * Allocate the buffer, copy the byte-code from * userspace and validate it. 
*/ size = flen * sizeof(*fp->bf_insns); fcode = kmem_alloc(size, KM_SLEEP); if (copyin(fp->bf_insns, fcode, size) != 0 || !bpf_validate(fcode, (int)flen)) { kmem_free(fcode, size); return EINVAL; } if (bpf_jit) jcode = bpf_jit_generate(NULL, fcode, flen); } else { fcode = NULL; } newf = kmem_alloc(sizeof(*newf), KM_SLEEP); newf->bf_insn = fcode; newf->bf_size = size; newf->bf_jitcode = jcode; if (cmd == BIOCSETF) d->bd_jitcode = jcode; /* XXX just for kvm(3) users */ /* Need to hold bpf_mtx for pserialize_perform */ mutex_enter(&bpf_mtx); mutex_enter(d->bd_mtx); if (cmd == BIOCSETWF) { oldf = d->bd_wfilter; storef = &d->bd_wfilter; } else { oldf = d->bd_rfilter; storef = &d->bd_rfilter; } atomic_store_release(storef, newf); reset_d(d); pserialize_perform(bpf_psz); mutex_exit(d->bd_mtx); mutex_exit(&bpf_mtx); if (oldf != NULL) bpf_free_filter(oldf); return 0; } /* * Detach a file from its current interface (if attached at all) and attach * to the interface indicated by the name stored in ifr. * Return an errno or 0. */ static int bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; char *cp; int unit_seen, i, error; KASSERT(mutex_owned(&bpf_mtx)); /* * Make sure the provided name has a unit number, and default * it to '0' if not specified. * XXX This is ugly ... do this differently? */ unit_seen = 0; cp = ifr->ifr_name; cp[sizeof(ifr->ifr_name) - 1] = '\0'; /* sanity */ while (*cp++) if (*cp >= '0' && *cp <= '9') unit_seen = 1; if (!unit_seen) { /* Make sure to leave room for the '\0'. */ for (i = 0; i < (IFNAMSIZ - 1); ++i) { if ((ifr->ifr_name[i] >= 'a' && ifr->ifr_name[i] <= 'z') || (ifr->ifr_name[i] >= 'A' && ifr->ifr_name[i] <= 'Z')) continue; ifr->ifr_name[i] = '0'; } } /* * Look through attached interfaces for the named one. */ BPF_IFLIST_WRITER_FOREACH(bp) { struct ifnet *ifp = bp->bif_ifp; if (ifp == NULL || strcmp(ifp->if_xname, ifr->ifr_name) != 0) continue; /* skip additional entry */ if (bp->bif_driverp != &ifp->if_bpf) continue; /* * We found the requested interface. * Allocate the packet buffers if we need to. * If we're already attached to requested interface, * just flush the buffer. */ /* * bpf_allocbufs is called only here. bpf_mtx ensures that * no race condition happen on d->bd_sbuf. */ if (d->bd_sbuf == NULL) { error = bpf_allocbufs(d); if (error != 0) return (error); } mutex_enter(d->bd_mtx); if (bp != d->bd_bif) { if (d->bd_bif) { /* * Detach if attached to something else. */ bpf_detachd(d); BPFIF_DLIST_ENTRY_INIT(d); } bpf_attachd(d, bp); } reset_d(d); mutex_exit(d->bd_mtx); return (0); } /* Not found. */ return (ENXIO); } /* * Copy the interface name to the ifreq. */ static void bpf_ifname(struct ifnet *ifp, struct ifreq *ifr) { memcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ); } static int bpf_stat(struct file *fp, struct stat *st) { struct bpf_d *d = fp->f_bpf; (void)memset(st, 0, sizeof(*st)); mutex_enter(d->bd_mtx); st->st_dev = makedev(cdevsw_lookup_major(&bpf_cdevsw), d->bd_pid); st->st_atimespec = d->bd_atime; st->st_mtimespec = d->bd_mtime; st->st_ctimespec = st->st_birthtimespec = d->bd_btime; st->st_uid = kauth_cred_geteuid(fp->f_cred); st->st_gid = kauth_cred_getegid(fp->f_cred); st->st_mode = S_IFCHR; mutex_exit(d->bd_mtx); return 0; } /* * Support for poll() system call * * Return true iff the specific operation will not block indefinitely - with * the assumption that it is safe to positively acknowledge a request for the * ability to write to the BPF device. * Otherwise, return false but make a note that a selnotify() must be done. 
*/ static int bpf_poll(struct file *fp, int events) { struct bpf_d *d = fp->f_bpf; int revents; /* * Refresh the PID associated with this bpf file. */ mutex_enter(&bpf_mtx); d->bd_pid = curproc->p_pid; revents = events & (POLLOUT | POLLWRNORM); if (events & (POLLIN | POLLRDNORM)) { /* * An imitation of the FIONREAD ioctl code. */ mutex_enter(d->bd_mtx); if (d->bd_hlen != 0 || ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0)) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(curlwp, &d->bd_sel); /* Start the read timeout if necessary */ if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } } mutex_exit(d->bd_mtx); } mutex_exit(&bpf_mtx); return (revents); } static void filt_bpfrdetach(struct knote *kn) { struct bpf_d *d = kn->kn_hook; mutex_enter(d->bd_buf_mtx); selremove_knote(&d->bd_sel, kn); mutex_exit(d->bd_buf_mtx); } static int filt_bpfread(struct knote *kn, long hint) { struct bpf_d *d = kn->kn_hook; int rv; /* * Refresh the PID associated with this bpf file. */ d->bd_pid = curproc->p_pid; mutex_enter(d->bd_buf_mtx); kn->kn_data = d->bd_hlen; if (d->bd_immediate) kn->kn_data += d->bd_slen; rv = (kn->kn_data > 0); mutex_exit(d->bd_buf_mtx); return rv; } static const struct filterops bpfread_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_bpfrdetach, .f_event = filt_bpfread, }; static int bpf_kqfilter(struct file *fp, struct knote *kn) { struct bpf_d *d = fp->f_bpf; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &bpfread_filtops; break; default: return (EINVAL); } kn->kn_hook = d; mutex_enter(d->bd_buf_mtx); selrecord_knote(&d->bd_sel, kn); mutex_exit(d->bd_buf_mtx); return (0); } /* * Copy data from an mbuf chain into a buffer. This code is derived * from m_copydata in sys/uipc_mbuf.c. */ static void * bpf_mcpy(void *dst_arg, const void *src_arg, size_t len) { const struct mbuf *m; u_int count; u_char *dst; m = src_arg; dst = dst_arg; while (len > 0) { if (m == NULL) panic("bpf_mcpy"); count = uimin(m->m_len, len); memcpy(dst, mtod(m, const void *), count); m = m->m_next; dst += count; len -= count; } return dst_arg; } static inline u_int bpf_xfilter(struct bpf_filter **filter, void *pkt, u_int pktlen, u_int buflen) { struct bpf_filter *filt; uint32_t mem[BPF_MEMWORDS]; bpf_args_t args = { .pkt = (const uint8_t *)pkt, .wirelen = pktlen, .buflen = buflen, .mem = mem, .arg = NULL }; u_int slen; filt = atomic_load_consume(filter); if (filt == NULL) /* No filter means accept all. */ return (u_int)-1; if (filt->bf_jitcode != NULL) slen = filt->bf_jitcode(NULL, &args); else slen = bpf_filter_ext(NULL, filt->bf_insn, &args); return slen; } /* * Dispatch a packet to all the listeners on interface bp. * * pkt pointer to the packet, either a data buffer or an mbuf chain * buflen buffer length, if pkt is a data buffer * cpfn a function that can copy pkt into the listener's buffer * pktlen length of the packet * direction BPF_D_IN or BPF_D_OUT */ static inline void bpf_deliver(struct bpf_if *bp, void *(*cpfn)(void *, const void *, size_t), void *pkt, u_int pktlen, u_int buflen, const u_int direction) { bool gottime = false; struct timespec ts; struct bpf_d *d; int s; u_int slen; KASSERT(!cpu_intr_p()); /* * Note that the IPL does not have to be raised at this point. * The only problem that could arise here is that if two different * interfaces shared any data. This is not the case. 
*/ s = pserialize_read_enter(); BPFIF_DLIST_READER_FOREACH(d, bp) { if (direction == BPF_D_IN) { if (d->bd_direction == BPF_D_OUT) continue; } else { /* BPF_D_OUT */ if (d->bd_direction == BPF_D_IN) continue; } atomic_inc_ulong(&d->bd_rcount); BPF_STATINC(recv); slen = bpf_xfilter(&d->bd_rfilter, pkt, pktlen, buflen); if (slen == 0) continue; if (!gottime) { gottime = true; nanotime(&ts); } /* Assume catchpacket doesn't sleep */ catchpacket(d, pkt, pktlen, slen, cpfn, &ts); } pserialize_read_exit(s); } /* * Incoming linkage from device drivers, when the head of the packet is in * a buffer, and the tail is in an mbuf chain. */ static void _bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m, u_int direction) { u_int pktlen; struct mbuf mb; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif_index == 0) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m) + dlen; /* * Craft on-stack mbuf suitable for passing to bpf_filter. * Note that we cut corners here; we only set up what's * absolutely needed--this mbuf should never go anywhere else. */ (void)memset(&mb, 0, sizeof(mb)); mb.m_type = MT_DATA; mb.m_next = m; mb.m_data = data; mb.m_len = dlen; bpf_deliver(bp, bpf_mcpy, &mb, pktlen, 0, direction); } /* * Incoming linkage from device drivers, when packet is in an mbuf chain. */ static void _bpf_mtap(struct bpf_if *bp, struct mbuf *m, u_int direction) { void *(*cpfn)(void *, const void *, size_t); u_int pktlen, buflen; void *marg; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif_index == 0) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m); /* Skip zero-sized packets. */ if (__predict_false(pktlen == 0)) { return; } if (pktlen == m->m_len) { cpfn = (void *)memcpy; marg = mtod(m, void *); buflen = pktlen; KASSERT(buflen != 0); } else { cpfn = bpf_mcpy; marg = m; buflen = 0; } bpf_deliver(bp, cpfn, marg, pktlen, buflen, direction); } /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ static void _bpf_mtap_af(struct bpf_if *bp, uint32_t af, struct mbuf *m, u_int direction) { struct mbuf m0; m0.m_type = MT_DATA; m0.m_flags = 0; m0.m_next = m; m0.m_nextpkt = NULL; m0.m_owner = NULL; m0.m_len = 4; m0.m_data = (char *)&af; _bpf_mtap(bp, &m0, direction); } /* * Put the SLIP pseudo-"link header" in place. * Note this M_PREPEND() should never fail, * since we know we always have enough space * in the input buffer. */ static void _bpf_mtap_sl_in(struct bpf_if *bp, u_char *chdr, struct mbuf **m) { u_char *hp; M_PREPEND(*m, SLIP_HDRLEN, M_DONTWAIT); if (*m == NULL) return; hp = mtod(*m, u_char *); hp[SLX_DIR] = SLIPDIR_IN; (void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN); _bpf_mtap(bp, *m, BPF_D_IN); m_adj(*m, SLIP_HDRLEN); } /* * Put the SLIP pseudo-"link header" in * place. The compressed header is now * at the beginning of the mbuf. 
*/ static void _bpf_mtap_sl_out(struct bpf_if *bp, u_char *chdr, struct mbuf *m) { struct mbuf m0; u_char *hp; m0.m_type = MT_DATA; m0.m_flags = 0; m0.m_next = m; m0.m_nextpkt = NULL; m0.m_owner = NULL; m0.m_data = m0.m_dat; m0.m_len = SLIP_HDRLEN; hp = mtod(&m0, u_char *); hp[SLX_DIR] = SLIPDIR_OUT; (void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN); _bpf_mtap(bp, &m0, BPF_D_OUT); m_freem(m); } static struct mbuf * bpf_mbuf_enqueue(struct bpf_if *bp, struct mbuf *m) { struct mbuf *dup; dup = m_dup(m, 0, M_COPYALL, M_NOWAIT); if (dup == NULL) return NULL; if (bp->bif_mbuf_tail != NULL) { bp->bif_mbuf_tail->m_nextpkt = dup; } else { bp->bif_mbuf_head = dup; } bp->bif_mbuf_tail = dup; #ifdef BPF_MTAP_SOFTINT_DEBUG log(LOG_DEBUG, "%s: enqueued mbuf=%p to %s\n", __func__, dup, bp->bif_ifp->if_xname); #endif return dup; } static struct mbuf * bpf_mbuf_dequeue(struct bpf_if *bp) { struct mbuf *m; int s; /* XXX NOMPSAFE: assumed running on one CPU */ s = splnet(); m = bp->bif_mbuf_head; if (m != NULL) { bp->bif_mbuf_head = m->m_nextpkt; m->m_nextpkt = NULL; if (bp->bif_mbuf_head == NULL) bp->bif_mbuf_tail = NULL; #ifdef BPF_MTAP_SOFTINT_DEBUG log(LOG_DEBUG, "%s: dequeued mbuf=%p from %s\n", __func__, m, bp->bif_ifp->if_xname); #endif } splx(s); return m; } static void bpf_mtap_si(void *arg) { struct bpf_if *bp = arg; struct mbuf *m; while ((m = bpf_mbuf_dequeue(bp)) != NULL) { #ifdef BPF_MTAP_SOFTINT_DEBUG log(LOG_DEBUG, "%s: tapping mbuf=%p on %s\n", __func__, m, bp->bif_ifp->if_xname); #endif bpf_ops->bpf_mtap(bp, m, BPF_D_IN); m_freem(m); } } static void _bpf_mtap_softint(struct ifnet *ifp, struct mbuf *m) { struct bpf_if *bp = ifp->if_bpf; struct mbuf *dup; KASSERT(cpu_intr_p()); /* To avoid extra invocations of the softint */ if (BPFIF_DLIST_READER_EMPTY(bp)) return; KASSERT(bp->bif_si != NULL); dup = bpf_mbuf_enqueue(bp, m); if (dup != NULL) softint_schedule(bp->bif_si); } static int bpf_hdrlen(struct bpf_d *d) { int hdrlen = d->bd_bif->bif_hdrlen; /* * Compute the length of the bpf header. This is not necessarily * equal to SIZEOF_BPF_HDR because we want to insert spacing such * that the network layer header begins on a longword boundary (for * performance reasons and to alleviate alignment restrictions). */ #ifdef _LP64 if (d->bd_compat32) return (BPF_WORDALIGN32(hdrlen + SIZEOF_BPF_HDR32) - hdrlen); else #endif return (BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen); } /* * Move the packet data from interface memory (pkt) into the * store buffer. Call the wakeup functions if it's time to wake up * a listener (buffer full), "cpfn" is the routine called to do the * actual data transfer. memcpy is passed in to copy contiguous chunks, * while bpf_mcpy is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void *(*cpfn)(void *, const void *, size_t), struct timespec *ts) { char *h; int totlen, curlen, caplen; int hdrlen = bpf_hdrlen(d); int do_wakeup = 0; atomic_inc_ulong(&d->bd_ccount); BPF_STATINC(capt); /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). */ totlen = hdrlen + uimin(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * If we adjusted totlen to fit the bufsize, it could be that * totlen is smaller than hdrlen because of the link layer header. 
*/ caplen = totlen - hdrlen; if (caplen < 0) caplen = 0; mutex_enter(d->bd_buf_mtx); /* * Round up the end of the previous packet to the next longword. */ #ifdef _LP64 if (d->bd_compat32) curlen = BPF_WORDALIGN32(d->bd_slen); else #endif curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize) { /* * This packet will overflow the storage buffer. * Rotate the buffers if we can, then wakeup any * pending reads. */ if (d->bd_fbuf == NULL) { mutex_exit(d->bd_buf_mtx); /* * We haven't completed the previous read yet, * so drop the packet. */ atomic_inc_ulong(&d->bd_dcount); BPF_STATINC(drop); return; } ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) { /* * Immediate mode is set, or the read timeout has * already expired during a select call. A packet * arrived, so the reader should be woken up. */ do_wakeup = 1; } /* * Append the bpf header. */ h = (char *)d->bd_sbuf + curlen; #ifdef _LP64 if (d->bd_compat32) { struct bpf_hdr32 *hp32; hp32 = (struct bpf_hdr32 *)h; hp32->bh_tstamp.tv_sec = ts->tv_sec; hp32->bh_tstamp.tv_usec = ts->tv_nsec / 1000; hp32->bh_datalen = pktlen; hp32->bh_hdrlen = hdrlen; hp32->bh_caplen = caplen; } else #endif { struct bpf_hdr *hp; hp = (struct bpf_hdr *)h; hp->bh_tstamp.tv_sec = ts->tv_sec; hp->bh_tstamp.tv_usec = ts->tv_nsec / 1000; hp->bh_datalen = pktlen; hp->bh_hdrlen = hdrlen; hp->bh_caplen = caplen; } /* * Copy the packet data into the store buffer and update its length. */ (*cpfn)(h + hdrlen, pkt, caplen); d->bd_slen = curlen + totlen; mutex_exit(d->bd_buf_mtx); /* * Call bpf_wakeup after bd_slen has been updated so that kevent(2) * will cause filt_bpfread() to be called with it adjusted. */ if (do_wakeup) bpf_wakeup(d); } /* * Initialize all nonzero fields of a descriptor. */ static int bpf_allocbufs(struct bpf_d *d) { d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP); if (!d->bd_fbuf) return (ENOBUFS); d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP); if (!d->bd_sbuf) { kmem_free(d->bd_fbuf, d->bd_bufsize); return (ENOBUFS); } d->bd_slen = 0; d->bd_hlen = 0; return (0); } static void bpf_free_filter(struct bpf_filter *filter) { KASSERT(filter != NULL); if (filter->bf_insn != NULL) kmem_free(filter->bf_insn, filter->bf_size); if (filter->bf_jitcode != NULL) bpf_jit_freecode(filter->bf_jitcode); kmem_free(filter, sizeof(*filter)); } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpf_freed(struct bpf_d *d) { /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ if (d->bd_sbuf != NULL) { kmem_free(d->bd_sbuf, d->bd_bufsize); if (d->bd_hbuf != NULL) kmem_free(d->bd_hbuf, d->bd_bufsize); if (d->bd_fbuf != NULL) kmem_free(d->bd_fbuf, d->bd_bufsize); } if (d->bd_rfilter != NULL) { bpf_free_filter(d->bd_rfilter); d->bd_rfilter = NULL; } if (d->bd_wfilter != NULL) { bpf_free_filter(d->bd_wfilter); d->bd_wfilter = NULL; } d->bd_jitcode = NULL; } /* * Attach an interface to bpf. dlt is the link layer type; * hdrlen is the fixed size of the link header for the specified dlt * (variable length headers not yet supported). 
*/ static void _bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { struct bpf_if *bp; bp = kmem_alloc(sizeof(*bp), KM_SLEEP); mutex_enter(&bpf_mtx); bp->bif_driverp = driverp; bp->bif_ifp = ifp; bp->bif_dlt = dlt; bp->bif_si = NULL; BPF_IFLIST_ENTRY_INIT(bp); PSLIST_INIT(&bp->bif_dlist_head); psref_target_init(&bp->bif_psref, bpf_psref_class); SLIST_INIT(&bp->bif_trackers); BPF_IFLIST_WRITER_INSERT_HEAD(bp); *bp->bif_driverp = NULL; bp->bif_hdrlen = hdrlen; mutex_exit(&bpf_mtx); #if 0 printf("bpf: %s attached with dlt %x\n", ifp->if_xname, dlt); #endif } static void _bpf_mtap_softint_init(struct ifnet *ifp) { struct bpf_if *bp; mutex_enter(&bpf_mtx); BPF_IFLIST_WRITER_FOREACH(bp) { if (bp->bif_ifp != ifp) continue; bp->bif_mbuf_head = NULL; bp->bif_mbuf_tail = NULL; bp->bif_si = softint_establish(SOFTINT_NET, bpf_mtap_si, bp); if (bp->bif_si == NULL) panic("%s: softint_establish() failed", __func__); break; } mutex_exit(&bpf_mtx); if (bp == NULL) panic("%s: no bpf_if found for %s", __func__, ifp->if_xname); } /* * Remove an interface from bpf. */ static void _bpfdetach(struct ifnet *ifp) { struct bpf_if *bp; struct bpf_d *d; int s; mutex_enter(&bpf_mtx); /* Nuke the vnodes for any open instances */ again_d: BPF_DLIST_WRITER_FOREACH(d) { mutex_enter(d->bd_mtx); if (d->bd_bif != NULL && d->bd_bif->bif_ifp == ifp) { /* * Detach the descriptor from an interface now. * It will be free'ed later by close routine. */ bpf_detachd(d); mutex_exit(d->bd_mtx); goto again_d; } mutex_exit(d->bd_mtx); } again: BPF_IFLIST_WRITER_FOREACH(bp) { if (bp->bif_ifp == ifp) { BPF_IFLIST_WRITER_REMOVE(bp); pserialize_perform(bpf_psz); psref_target_destroy(&bp->bif_psref, bpf_psref_class); while (!SLIST_EMPTY(&bp->bif_trackers)) { struct bpf_event_tracker *t = SLIST_FIRST(&bp->bif_trackers); SLIST_REMOVE_HEAD(&bp->bif_trackers, bet_entries); kmem_free(t, sizeof(*t)); } BPF_IFLIST_ENTRY_DESTROY(bp); if (bp->bif_si != NULL) { /* XXX NOMPSAFE: assumed running on one CPU */ s = splnet(); while (bp->bif_mbuf_head != NULL) { struct mbuf *m = bp->bif_mbuf_head; bp->bif_mbuf_head = m->m_nextpkt; m_freem(m); } splx(s); softint_disestablish(bp->bif_si); } kmem_free(bp, sizeof(*bp)); goto again; } } mutex_exit(&bpf_mtx); } /* * Change the data link type of a interface. */ static void _bpf_change_type(struct ifnet *ifp, u_int dlt, u_int hdrlen) { struct bpf_if *bp; mutex_enter(&bpf_mtx); BPF_IFLIST_WRITER_FOREACH(bp) { if (bp->bif_driverp == &ifp->if_bpf) break; } if (bp == NULL) panic("bpf_change_type"); bp->bif_dlt = dlt; bp->bif_hdrlen = hdrlen; mutex_exit(&bpf_mtx); } /* * Get a list of available data link type of the interface. */ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { int n, error; struct ifnet *ifp; struct bpf_if *bp; int s, bound; KASSERT(mutex_owned(d->bd_mtx)); ifp = d->bd_bif->bif_ifp; n = 0; error = 0; bound = curlwp_bind(); s = pserialize_read_enter(); BPF_IFLIST_READER_FOREACH(bp) { if (bp->bif_ifp != ifp) continue; if (bfl->bfl_list != NULL) { struct psref psref; if (n >= bfl->bfl_len) { pserialize_read_exit(s); return ENOMEM; } bpf_if_acquire(bp, &psref); pserialize_read_exit(s); error = copyout(&bp->bif_dlt, bfl->bfl_list + n, sizeof(u_int)); s = pserialize_read_enter(); bpf_if_release(bp, &psref); } n++; } pserialize_read_exit(s); curlwp_bindx(bound); bfl->bfl_len = n; return error; } /* * Set the data link type of a BPF instance. 
*/ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; KASSERT(mutex_owned(&bpf_mtx)); KASSERT(mutex_owned(d->bd_mtx)); if (d->bd_bif->bif_dlt == dlt) return 0; ifp = d->bd_bif->bif_ifp; BPF_IFLIST_WRITER_FOREACH(bp) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } if (bp == NULL) return EINVAL; opromisc = d->bd_promisc; bpf_detachd(d); BPFIF_DLIST_ENTRY_INIT(d); bpf_attachd(d, bp); reset_d(d); if (opromisc) { KERNEL_LOCK_UNLESS_NET_MPSAFE(); error = ifpromisc(bp->bif_ifp, 1); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); if (error) printf("%s: bpf_setdlt: ifpromisc failed (%d)\n", bp->bif_ifp->if_xname, error); else d->bd_promisc = 1; } return 0; } static int sysctl_net_bpf_maxbufsize(SYSCTLFN_ARGS) { int newsize, error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = bpf_maxbufsize; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); if (newsize < BPF_MINBUFSIZE || newsize > BPF_MAXBUFSIZE) return (EINVAL); bpf_maxbufsize = newsize; return (0); } #if defined(MODULAR) || defined(BPFJIT) static int sysctl_net_bpf_jit(SYSCTLFN_ARGS) { bool newval; int error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newval; newval = bpf_jit; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error != 0 || newp == NULL) return error; bpf_jit = newval; if (newval && bpfjit_module_ops.bj_generate_code == NULL) { printf("JIT compilation is postponed " "until after bpfjit module is loaded\n"); } return 0; } #endif static int sysctl_net_bpf_peers(SYSCTLFN_ARGS) { int error, elem_count; struct bpf_d *dp; struct bpf_d_ext dpe; size_t len, needed, elem_size, out_size; char *sp; if (namelen == 1 && name[0] == CTL_QUERY) return (sysctl_query(SYSCTLFN_CALL(rnode))); if (namelen != 2) return (EINVAL); /* BPF peers is privileged information. */ error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_INTERFACE, KAUTH_REQ_NETWORK_INTERFACE_GETPRIV, NULL, NULL, NULL); if (error) return (EPERM); len = (oldp != NULL) ? 
*oldlenp : 0; sp = oldp; elem_size = name[0]; elem_count = name[1]; out_size = MIN(sizeof(dpe), elem_size); needed = 0; if (elem_size < 1 || elem_count < 0) return (EINVAL); mutex_enter(&bpf_mtx); BPF_DLIST_WRITER_FOREACH(dp) { if (len >= elem_size && elem_count > 0) { #define BPF_EXT(field) dpe.bde_ ## field = dp->bd_ ## field BPF_EXT(bufsize); BPF_EXT(promisc); BPF_EXT(state); BPF_EXT(immediate); BPF_EXT(hdrcmplt); BPF_EXT(direction); BPF_EXT(pid); BPF_EXT(rcount); BPF_EXT(dcount); BPF_EXT(ccount); #undef BPF_EXT mutex_enter(dp->bd_mtx); if (dp->bd_bif) (void)strlcpy(dpe.bde_ifname, dp->bd_bif->bif_ifp->if_xname, IFNAMSIZ - 1); else dpe.bde_ifname[0] = '\0'; dpe.bde_locked = dp->bd_locked; mutex_exit(dp->bd_mtx); error = copyout(&dpe, sp, out_size); if (error) break; sp += elem_size; len -= elem_size; } needed += elem_size; if (elem_count > 0 && elem_count != INT_MAX) elem_count--; } mutex_exit(&bpf_mtx); *oldlenp = needed; return (error); } static void bpf_stats(void *p, void *arg, struct cpu_info *ci __unused) { struct bpf_stat *const stats = p; struct bpf_stat *sum = arg; int s = splnet(); sum->bs_recv += stats->bs_recv; sum->bs_drop += stats->bs_drop; sum->bs_capt += stats->bs_capt; splx(s); } static int bpf_sysctl_gstats_handler(SYSCTLFN_ARGS) { struct sysctlnode node; int error; struct bpf_stat sum; memset(&sum, 0, sizeof(sum)); node = *rnode; percpu_foreach_xcall(bpf_gstats_percpu, XC_HIGHPRI_IPL(IPL_SOFTNET), bpf_stats, &sum); node.sysctl_data = &sum; node.sysctl_size = sizeof(sum); error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error != 0 || newp == NULL) return error; return 0; } SYSCTL_SETUP(sysctl_net_bpf_setup, "bpf sysctls") { const struct sysctlnode *node; node = NULL; sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "bpf", SYSCTL_DESCR("BPF options"), NULL, 0, NULL, 0, CTL_NET, CTL_CREATE, CTL_EOL); if (node != NULL) { #if defined(MODULAR) || defined(BPFJIT) sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "jit", SYSCTL_DESCR("Toggle Just-In-Time compilation"), sysctl_net_bpf_jit, 0, &bpf_jit, 0, CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL); #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxbufsize", SYSCTL_DESCR("Maximum size for data capture buffer"), sysctl_net_bpf_maxbufsize, 0, &bpf_maxbufsize, 0, CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("BPF stats"), bpf_sysctl_gstats_handler, 0, NULL, 0, CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "peers", SYSCTL_DESCR("BPF peers"), sysctl_net_bpf_peers, 0, NULL, 0, CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL); } } static int _bpf_register_track_event(struct bpf_if **driverp, void (*_fun)(struct bpf_if *, struct ifnet *, int, int)) { struct bpf_if *bp; struct bpf_event_tracker *t; int ret = ENOENT; t = kmem_zalloc(sizeof(*t), KM_SLEEP); if (!t) return ENOMEM; t->bet_notify = _fun; mutex_enter(&bpf_mtx); BPF_IFLIST_WRITER_FOREACH(bp) { if (bp->bif_driverp != driverp) continue; SLIST_INSERT_HEAD(&bp->bif_trackers, t, bet_entries); ret = 0; break; } mutex_exit(&bpf_mtx); return ret; } static int _bpf_deregister_track_event(struct bpf_if **driverp, void (*_fun)(struct bpf_if *, struct ifnet *, int, int)) { struct bpf_if *bp; struct bpf_event_tracker *t = NULL; int ret = ENOENT; mutex_enter(&bpf_mtx); BPF_IFLIST_WRITER_FOREACH(bp) { if 
(bp->bif_driverp != driverp) continue; SLIST_FOREACH(t, &bp->bif_trackers, bet_entries) { if (t->bet_notify == _fun) { ret = 0; break; } } if (ret == 0) break; } if (ret == 0 && t && t->bet_notify == _fun) { SLIST_REMOVE(&bp->bif_trackers, t, bpf_event_tracker, bet_entries); } mutex_exit(&bpf_mtx); if (ret == 0) kmem_free(t, sizeof(*t)); return ret; } struct bpf_ops bpf_ops_kernel = { .bpf_attach = _bpfattach, .bpf_detach = _bpfdetach, .bpf_change_type = _bpf_change_type, .bpf_register_track_event = _bpf_register_track_event, .bpf_deregister_track_event = _bpf_deregister_track_event, .bpf_mtap = _bpf_mtap, .bpf_mtap2 = _bpf_mtap2, .bpf_mtap_af = _bpf_mtap_af, .bpf_mtap_sl_in = _bpf_mtap_sl_in, .bpf_mtap_sl_out = _bpf_mtap_sl_out, .bpf_mtap_softint = _bpf_mtap_softint, .bpf_mtap_softint_init = _bpf_mtap_softint_init, }; MODULE(MODULE_CLASS_DRIVER, bpf, "bpf_filter"); static int bpf_modcmd(modcmd_t cmd, void *arg) { #ifdef _MODULE devmajor_t bmajor, cmajor; #endif int error = 0; switch (cmd) { case MODULE_CMD_INIT: bpf_init(); #ifdef _MODULE bmajor = cmajor = NODEVMAJOR; error = devsw_attach("bpf", NULL, &bmajor, &bpf_cdevsw, &cmajor); if (error) break; #endif bpf_ops_handover_enter(&bpf_ops_kernel); atomic_swap_ptr(&bpf_ops, &bpf_ops_kernel); bpf_ops_handover_exit(); break; case MODULE_CMD_FINI: /* * While there is no reference counting for bpf callers, * unload could at least in theory be done similarly to * system call disestablishment. This should even be * a little simpler: * * 1) replace op vector with stubs * 2) post update to all cpus with xc * 3) check that nobody is in bpf anymore * (it's doubtful we'd want something like l_sysent, * but we could do something like *signed* percpu * counters. if the sum is 0, we're good). * 4) if fail, unroll changes * * NOTE: change won't be atomic to the outside. some * packets may be not captured even if unload is * not successful. I think packet capture not working * is a perfectly logical consequence of trying to * disable packet capture. */ error = EOPNOTSUPP; break; default: error = ENOTTY; break; } return error; }
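/*
 * Illustrative userland sketch, not part of the kernel source above: one
 * plausible way to exercise the character-device interface that bpf.c
 * implements -- open a cloning bpf device, bind it to an interface with
 * BIOCSETIF, enable immediate mode, and walk the bpf_hdr records returned
 * by read(2).  The device path and the interface name "wm0" are assumptions;
 * error handling is deliberately minimal.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <net/bpf.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	struct bpf_hdr *bh;
	u_int bufsize, immediate = 1;
	char *buf, *p;
	ssize_t n;
	int fd;

	/* Opening the cloning device yields a fresh descriptor. */
	fd = open("/dev/bpf", O_RDONLY);
	if (fd == -1)
		err(EXIT_FAILURE, "open /dev/bpf");

	/* Attach the descriptor to an interface (name is an assumption). */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "wm0", sizeof(ifr.ifr_name));
	if (ioctl(fd, BIOCSETIF, &ifr) == -1)
		err(EXIT_FAILURE, "BIOCSETIF");

	/* Deliver packets as they arrive rather than when a buffer fills. */
	if (ioctl(fd, BIOCIMMEDIATE, &immediate) == -1)
		err(EXIT_FAILURE, "BIOCIMMEDIATE");

	/* bpf_read() requires reads of exactly the kernel buffer size. */
	if (ioctl(fd, BIOCGBLEN, &bufsize) == -1)
		err(EXIT_FAILURE, "BIOCGBLEN");
	if ((buf = malloc(bufsize)) == NULL)
		err(EXIT_FAILURE, "malloc");

	/* Blocks until at least one packet is captured. */
	n = read(fd, buf, bufsize);
	if (n == -1)
		err(EXIT_FAILURE, "read");

	/*
	 * Each record is a bpf_hdr followed by the captured bytes, padded
	 * to a word boundary (cf. catchpacket() and bpf_hdrlen() above).
	 */
	p = buf;
	while (p < buf + n) {
		bh = (struct bpf_hdr *)p;
		printf("captured %u of %u bytes\n", bh->bh_caplen,
		    bh->bh_datalen);
		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
	}

	free(buf);
	close(fd);
	return 0;
}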
/*	$NetBSD: kern_mutex_obj.c,v 1.15 2023/10/02 21:03:55 ad Exp $	*/

/*-
 * Copyright (c) 2008, 2019, 2023 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_mutex_obj.c,v 1.15 2023/10/02 21:03:55 ad Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/mutex.h>
#include <sys/kmem.h>

/* Mutex cache */
#define	MUTEX_OBJ_MAGIC	0x5aa3c85d
struct kmutexobj {
	kmutex_t	mo_lock;
	u_int		mo_magic;
	u_int		mo_refcnt;
	uint8_t		mo_pad[COHERENCY_UNIT - sizeof(kmutex_t) -
	    sizeof(u_int) * 2];
};

/*
 * mutex_obj_alloc:
 *
 *	Allocate a single lock object, waiting for memory if needed.
 */
kmutex_t *
mutex_obj_alloc(kmutex_type_t type, int ipl)
{
	struct kmutexobj *mo;

	mo = kmem_intr_alloc(sizeof(*mo), KM_SLEEP);
	KASSERT(ALIGNED_POINTER(mo, coherency_unit));
	_mutex_init(&mo->mo_lock, type, ipl,
	    (uintptr_t)__builtin_return_address(0));
	mo->mo_magic = MUTEX_OBJ_MAGIC;
	mo->mo_refcnt = 1;
	return (kmutex_t *)mo;
}

/*
 * mutex_obj_tryalloc:
 *
 *	Allocate a single lock object, failing if no memory available.
 */
kmutex_t *
mutex_obj_tryalloc(kmutex_type_t type, int ipl)
{
	struct kmutexobj *mo;

	mo = kmem_intr_alloc(sizeof(*mo), KM_NOSLEEP);
	KASSERT(ALIGNED_POINTER(mo, coherency_unit));
	if (__predict_true(mo != NULL)) {
		_mutex_init(&mo->mo_lock, type, ipl,
		    (uintptr_t)__builtin_return_address(0));
		mo->mo_magic = MUTEX_OBJ_MAGIC;
		mo->mo_refcnt = 1;
	}
	return (kmutex_t *)mo;
}

/*
 * mutex_obj_hold:
 *
 *	Add a single reference to a lock object.  A reference to the object
 *	must already be held, and must be held across this call.
*/ void mutex_obj_hold(kmutex_t *lock) { struct kmutexobj *mo = (struct kmutexobj *)lock; KASSERTMSG(mo->mo_magic == MUTEX_OBJ_MAGIC, "%s: lock %p: mo->mo_magic (%#x) != MUTEX_OBJ_MAGIC (%#x)", __func__, mo, mo->mo_magic, MUTEX_OBJ_MAGIC); KASSERTMSG(mo->mo_refcnt > 0, "%s: lock %p: mo->mo_refcnt (%#x) == 0", __func__, mo, mo->mo_refcnt); atomic_inc_uint(&mo->mo_refcnt); } /* * mutex_obj_free: * * Drop a reference from a lock object. If the last reference is being * dropped, free the object and return true. Otherwise, return false. */ bool mutex_obj_free(kmutex_t *lock) { struct kmutexobj *mo = (struct kmutexobj *)lock; KASSERTMSG(mo->mo_magic == MUTEX_OBJ_MAGIC, "%s: lock %p: mo->mo_magic (%#x) != MUTEX_OBJ_MAGIC (%#x)", __func__, mo, mo->mo_magic, MUTEX_OBJ_MAGIC); KASSERTMSG(mo->mo_refcnt > 0, "%s: lock %p: mo->mo_refcnt (%#x) == 0", __func__, mo, mo->mo_refcnt); membar_release(); if (atomic_dec_uint_nv(&mo->mo_refcnt) > 0) { return false; } membar_acquire(); mutex_destroy(&mo->mo_lock); kmem_intr_free(mo, sizeof(*mo)); return true; } /* * mutex_obj_refcnt: * * Return the reference count on a lock object. */ u_int mutex_obj_refcnt(kmutex_t *lock) { struct kmutexobj *mo = (struct kmutexobj *)lock; return mo->mo_refcnt; }
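/*
 * Editorial sketch (not part of kern_mutex_obj.c): typical lifecycle of a
 * reference-counted lock object as implemented above.  Two hypothetical
 * structures share one kmutex_t; each holder takes its own reference with
 * mutex_obj_hold(), and the storage is reclaimed only when mutex_obj_free()
 * drops the last one.  Kernel-context sketch only; the example_* names are
 * illustrative.
 */
struct example_a { kmutex_t *ea_lock; /* ... */ };
struct example_b { kmutex_t *eb_lock; /* ... */ };

static void
example_share_lock(struct example_a *a, struct example_b *b)
{
	/* One allocation; the reference count starts at 1 (held by 'a'). */
	a->ea_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);

	/* 'b' shares the same lock, so it takes an additional reference. */
	mutex_obj_hold(a->ea_lock);
	b->eb_lock = a->ea_lock;
}

static void
example_teardown(struct example_a *a, struct example_b *b)
{
	/* Whichever caller drops the last reference frees the object. */
	(void)mutex_obj_free(a->ea_lock);	/* refcnt 2 -> 1, returns false */
	(void)mutex_obj_free(b->eb_lock);	/* refcnt 1 -> 0, frees, returns true */
}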
/* $NetBSD: sysv_sem.c,v 1.98 2019/08/07 00:38:02 pgoyette Exp $ */ /*- * Copyright (c) 1999, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Implementation of SVID semaphores * * Author: Daniel Boulet * * This software is provided ``AS IS'' without any warranties of any kind.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sysv_sem.c,v 1.98 2019/08/07 00:38:02 pgoyette Exp $"); #ifdef _KERNEL_OPT #include "opt_sysv.h" #endif #include <sys/param.h> #include <sys/kernel.h> #include <sys/sem.h> #include <sys/sysctl.h> #include <sys/kmem.h> #include <sys/mount.h> /* XXX for <sys/syscallargs.h> */ #include <sys/syscallargs.h> #include <sys/kauth.h> #include <sys/once.h> /* * Memory areas: * 1st: Pool of semaphore identifiers * 2nd: Semaphores * 3rd: Conditional variables * 4th: Undo structures */ struct semid_ds * sema __read_mostly; static struct __sem * sem __read_mostly; static kcondvar_t * semcv __read_mostly; static int * semu __read_mostly; static kmutex_t semlock __cacheline_aligned; static bool sem_realloc_state __read_mostly; static kcondvar_t sem_realloc_cv; /* * List of active undo structures, total number of semaphores, * and total number of semop waiters. */ static struct sem_undo *semu_list __read_mostly; static u_int semtot __cacheline_aligned; static u_int sem_waiters __cacheline_aligned; /* Macro to find a particular sem_undo vector */ #define SEMU(s, ix) ((struct sem_undo *)(((long)s) + ix * seminfo.semusz)) #ifdef SEM_DEBUG #define SEM_PRINTF(a) printf a #else #define SEM_PRINTF(a) #endif void *hook; /* cookie from exithook_establish() */ extern int kern_has_sysvsem; SYSCTL_SETUP_PROTO(sysctl_ipc_sem_setup); struct sem_undo *semu_alloc(struct proc *); int semundo_adjust(struct proc *, struct sem_undo **, int, int, int); void semundo_clear(int, int); static ONCE_DECL(exithook_control); static int seminit_exithook(void); int seminit(void) { int i, sz; vaddr_t v; mutex_init(&semlock, MUTEX_DEFAULT, IPL_NONE); cv_init(&sem_realloc_cv, "semrealc"); sem_realloc_state = false; semtot = 0; sem_waiters = 0; /* Allocate the wired memory for our structures */ sz = ALIGN(seminfo.semmni * sizeof(struct semid_ds)) + ALIGN(seminfo.semmns * sizeof(struct __sem)) + ALIGN(seminfo.semmni * sizeof(kcondvar_t)) + ALIGN(seminfo.semmnu * seminfo.semusz); sz = round_page(sz); v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); if (v == 0) { printf("sysv_sem: cannot allocate memory"); return ENOMEM; } sema = (void *)v; sem = (void *)((uintptr_t)sema + ALIGN(seminfo.semmni * sizeof(struct semid_ds))); semcv = (void *)((uintptr_t)sem + ALIGN(seminfo.semmns * sizeof(struct __sem))); semu = (void *)((uintptr_t)semcv + ALIGN(seminfo.semmni * sizeof(kcondvar_t))); for (i = 0; i < seminfo.semmni; i++) { sema[i]._sem_base = 0; sema[i].sem_perm.mode = 0; cv_init(&semcv[i], "semwait"); } for (i = 0; i < seminfo.semmnu; i++) { struct sem_undo *suptr = SEMU(semu, i); suptr->un_proc = NULL; } semu_list = NULL; kern_has_sysvsem = 1; return 0; } static int seminit_exithook(void) { hook = exithook_establish(semexit, NULL); return 0; } int semfini(void) { int i, sz; vaddr_t v = (vaddr_t)sema; /* Don't allow module unload if we're busy */ mutex_enter(&semlock); if (semtot) { mutex_exit(&semlock); return 1; } /* Remove the exit hook */ if (hook) exithook_disestablish(hook); /* Destroy all our condvars */ for (i = 0; i < seminfo.semmni; i++) { cv_destroy(&semcv[i]); } /* Free the wired memory that we allocated */ sz = ALIGN(seminfo.semmni * sizeof(struct semid_ds)) + ALIGN(seminfo.semmns * sizeof(struct __sem)) + ALIGN(seminfo.semmni * sizeof(kcondvar_t)) + ALIGN(seminfo.semmnu * seminfo.semusz); sz = round_page(sz); uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED); /* Destroy the last cv and mutex */ cv_destroy(&sem_realloc_cv); mutex_exit(&semlock); mutex_destroy(&semlock); 
kern_has_sysvsem = 0; return 0; } static int semrealloc(int newsemmni, int newsemmns, int newsemmnu) { struct semid_ds *new_sema, *old_sema; struct __sem *new_sem; struct sem_undo *new_semu_list, *suptr, *nsuptr; int *new_semu; kcondvar_t *new_semcv; vaddr_t v; int i, j, lsemid, nmnus, sz; if (newsemmni < 1 || newsemmns < 1 || newsemmnu < 1) return EINVAL; /* Allocate the wired memory for our structures */ sz = ALIGN(newsemmni * sizeof(struct semid_ds)) + ALIGN(newsemmns * sizeof(struct __sem)) + ALIGN(newsemmni * sizeof(kcondvar_t)) + ALIGN(newsemmnu * seminfo.semusz); sz = round_page(sz); v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); if (v == 0) return ENOMEM; mutex_enter(&semlock); if (sem_realloc_state) { mutex_exit(&semlock); uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED); return EBUSY; } sem_realloc_state = true; if (sem_waiters) { /* * Mark reallocation state, wake-up all waiters, * and wait while they will all exit. */ for (i = 0; i < seminfo.semmni; i++) cv_broadcast(&semcv[i]); while (sem_waiters) cv_wait(&sem_realloc_cv, &semlock); } old_sema = sema; /* Get the number of last slot */ lsemid = 0; for (i = 0; i < seminfo.semmni; i++) if (sema[i].sem_perm.mode & SEM_ALLOC) lsemid = i; /* Get the number of currently used undo structures */ nmnus = 0; for (i = 0; i < seminfo.semmnu; i++) { suptr = SEMU(semu, i); if (suptr->un_proc == NULL) continue; nmnus++; } /* We cannot reallocate less memory than we use */ if (lsemid >= newsemmni || semtot > newsemmns || nmnus > newsemmnu) { mutex_exit(&semlock); uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED); return EBUSY; } new_sema = (void *)v; new_sem = (void *)((uintptr_t)new_sema + ALIGN(newsemmni * sizeof(struct semid_ds))); new_semcv = (void *)((uintptr_t)new_sem + ALIGN(newsemmns * sizeof(struct __sem))); new_semu = (void *)((uintptr_t)new_semcv + ALIGN(newsemmni * sizeof(kcondvar_t))); /* Initialize all semaphore identifiers and condvars */ for (i = 0; i < newsemmni; i++) { new_sema[i]._sem_base = 0; new_sema[i].sem_perm.mode = 0; cv_init(&new_semcv[i], "semwait"); } for (i = 0; i < newsemmnu; i++) { nsuptr = SEMU(new_semu, i); nsuptr->un_proc = NULL; } /* * Copy all identifiers, semaphores and list of the * undo structures to the new memory allocation. 
*/ j = 0; for (i = 0; i <= lsemid; i++) { if ((sema[i].sem_perm.mode & SEM_ALLOC) == 0) continue; memcpy(&new_sema[i], &sema[i], sizeof(struct semid_ds)); new_sema[i]._sem_base = &new_sem[j]; memcpy(new_sema[i]._sem_base, sema[i]._sem_base, (sizeof(struct __sem) * sema[i].sem_nsems)); j += sema[i].sem_nsems; } KASSERT(j == semtot); j = 0; new_semu_list = NULL; for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) { KASSERT(j < newsemmnu); nsuptr = SEMU(new_semu, j); memcpy(nsuptr, suptr, SEMUSZ); nsuptr->un_next = new_semu_list; new_semu_list = nsuptr; j++; } for (i = 0; i < seminfo.semmni; i++) { KASSERT(cv_has_waiters(&semcv[i]) == false); cv_destroy(&semcv[i]); } sz = ALIGN(seminfo.semmni * sizeof(struct semid_ds)) + ALIGN(seminfo.semmns * sizeof(struct __sem)) + ALIGN(seminfo.semmni * sizeof(kcondvar_t)) + ALIGN(seminfo.semmnu * seminfo.semusz); sz = round_page(sz); /* Set the pointers and update the new values */ sema = new_sema; sem = new_sem; semcv = new_semcv; semu = new_semu; semu_list = new_semu_list; seminfo.semmni = newsemmni; seminfo.semmns = newsemmns; seminfo.semmnu = newsemmnu; /* Reallocation completed - notify all waiters, if any */ sem_realloc_state = false; cv_broadcast(&sem_realloc_cv); mutex_exit(&semlock); uvm_km_free(kernel_map, (vaddr_t)old_sema, sz, UVM_KMF_WIRED); return 0; } /* * Placebo. */ int sys_semconfig(struct lwp *l, const struct sys_semconfig_args *uap, register_t *retval) { RUN_ONCE(&exithook_control, seminit_exithook); *retval = 0; return 0; } /* * Allocate a new sem_undo structure for a process. * => Returns NULL on failure. */ struct sem_undo * semu_alloc(struct proc *p) { struct sem_undo *suptr, **supptr; bool attempted = false; int i; KASSERT(mutex_owned(&semlock)); again: /* Look for a free structure. */ for (i = 0; i < seminfo.semmnu; i++) { suptr = SEMU(semu, i); if (suptr->un_proc == NULL) { /* Found. Fill it in and return. */ suptr->un_next = semu_list; semu_list = suptr; suptr->un_cnt = 0; suptr->un_proc = p; return suptr; } } /* Not found. Attempt to free some structures. */ if (!attempted) { bool freed = false; attempted = true; supptr = &semu_list; while ((suptr = *supptr) != NULL) { if (suptr->un_cnt == 0) { suptr->un_proc = NULL; *supptr = suptr->un_next; freed = true; } else { supptr = &suptr->un_next; } } if (freed) { goto again; } } return NULL; } /* * Adjust a particular entry for a particular proc */ int semundo_adjust(struct proc *p, struct sem_undo **supptr, int semid, int semnum, int adjval) { struct sem_undo *suptr; struct sem_undo_entry *sunptr; int i; KASSERT(mutex_owned(&semlock)); /* * Look for and remember the sem_undo if the caller doesn't * provide it */ suptr = *supptr; if (suptr == NULL) { for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) if (suptr->un_proc == p) break; if (suptr == NULL) { suptr = semu_alloc(p); if (suptr == NULL) return (ENOSPC); } *supptr = suptr; } /* * Look for the requested entry and adjust it (delete if * adjval becomes 0). 
*/ sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid || sunptr->un_num != semnum) continue; sunptr->un_adjval += adjval; if (sunptr->un_adjval == 0) { suptr->un_cnt--; if (i < suptr->un_cnt) suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; } return (0); } /* Didn't find the right entry - create it */ if (suptr->un_cnt == SEMUME) return (EINVAL); sunptr = &suptr->un_ent[suptr->un_cnt]; suptr->un_cnt++; sunptr->un_adjval = adjval; sunptr->un_id = semid; sunptr->un_num = semnum; return (0); } void semundo_clear(int semid, int semnum) { struct sem_undo *suptr; struct sem_undo_entry *sunptr, *sunend; KASSERT(mutex_owned(&semlock)); for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) for (sunptr = &suptr->un_ent[0], sunend = sunptr + suptr->un_cnt; sunptr < sunend;) { if (sunptr->un_id == semid) { if (semnum == -1 || sunptr->un_num == semnum) { suptr->un_cnt--; sunend--; if (sunptr != sunend) *sunptr = *sunend; if (semnum != -1) break; else continue; } } sunptr++; } } int sys_____semctl50(struct lwp *l, const struct sys_____semctl50_args *uap, register_t *retval) { /* { syscallarg(int) semid; syscallarg(int) semnum; syscallarg(int) cmd; syscallarg(union __semun *) arg; } */ struct semid_ds sembuf; int cmd, error; void *pass_arg; union __semun karg; RUN_ONCE(&exithook_control, seminit_exithook); cmd = SCARG(uap, cmd); pass_arg = get_semctl_arg(cmd, &sembuf, &karg); if (pass_arg) { error = copyin(SCARG(uap, arg), &karg, sizeof(karg)); if (error) return error; if (cmd == IPC_SET) { error = copyin(karg.buf, &sembuf, sizeof(sembuf)); if (error) return (error); } } error = semctl1(l, SCARG(uap, semid), SCARG(uap, semnum), cmd, pass_arg, retval); if (error == 0 && cmd == IPC_STAT) error = copyout(&sembuf, karg.buf, sizeof(sembuf)); return (error); } int semctl1(struct lwp *l, int semid, int semnum, int cmd, void *v, register_t *retval) { kauth_cred_t cred = l->l_cred; union __semun *arg = v; struct semid_ds *sembuf = v, *semaptr; int i, error, ix; SEM_PRINTF(("call to semctl(%d, %d, %d, %p)\n", semid, semnum, cmd, v)); mutex_enter(&semlock); ix = IPCID_TO_IX(semid); if (ix < 0 || ix >= seminfo.semmni) { mutex_exit(&semlock); return (EINVAL); } semaptr = &sema[ix]; if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || semaptr->sem_perm._seq != IPCID_TO_SEQ(semid)) { mutex_exit(&semlock); return (EINVAL); } switch (cmd) { case IPC_RMID: if ((error = ipcperm(cred, &semaptr->sem_perm, IPC_M)) != 0) break; semaptr->sem_perm.cuid = kauth_cred_geteuid(cred); semaptr->sem_perm.uid = kauth_cred_geteuid(cred); semtot -= semaptr->sem_nsems; for (i = semaptr->_sem_base - sem; i < semtot; i++) sem[i] = sem[i + semaptr->sem_nsems]; for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].sem_perm.mode & SEM_ALLOC) && sema[i]._sem_base > semaptr->_sem_base) sema[i]._sem_base -= semaptr->sem_nsems; } semaptr->sem_perm.mode = 0; semundo_clear(ix, -1); cv_broadcast(&semcv[ix]); break; case IPC_SET: if ((error = ipcperm(cred, &semaptr->sem_perm, IPC_M))) break; KASSERT(sembuf != NULL); semaptr->sem_perm.uid = sembuf->sem_perm.uid; semaptr->sem_perm.gid = sembuf->sem_perm.gid; semaptr->sem_perm.mode = (semaptr->sem_perm.mode & ~0777) | (sembuf->sem_perm.mode & 0777); semaptr->sem_ctime = time_second; break; case IPC_STAT: if ((error = ipcperm(cred, &semaptr->sem_perm, IPC_R))) break; KASSERT(sembuf != NULL); memset(sembuf, 0, sizeof *sembuf); sembuf->sem_perm = semaptr->sem_perm; sembuf->sem_perm.mode &= 0777; sembuf->sem_nsems = semaptr->sem_nsems; sembuf->sem_otime = 
semaptr->sem_otime; sembuf->sem_ctime = semaptr->sem_ctime; break; case GETNCNT: if ((error = ipcperm(cred, &semaptr->sem_perm, IPC_R))) break; if (semnum < 0 || semnum >= semaptr->sem_nsems) { error = EINVAL; break; } *retval = semaptr->_sem_base[semnum].semncnt; break; case GETPID: if ((error = ipcperm(cred, &semaptr->sem_perm, IPC_R))) break; if (semnum < 0 || semnum >= semaptr->sem_nsems) { error = EINVAL; break; } *retval = semaptr->_sem_base[semnum].sempid; break; case GETVAL: if ((error = ipcperm(cred, &semaptr->sem_perm, IPC_R))) break; if (semnum < 0 || semnum >= semaptr->sem_nsems) { error = EINVAL; break; } *retval = semaptr->_sem_base[semnum].semval; break; case GETALL: if ((error = ipcperm(cred, &semaptr->sem_perm, IPC_R))) break; KASSERT(arg != NULL); for (i = 0; i < semaptr->sem_nsems; i++) { error = copyout(&semaptr->_sem_base[i].semval, &arg->array[i], sizeof(arg->array[i])); if (error != 0) break; } break; case GETZCNT: if ((error = ipcperm(cred, &semaptr->sem_perm, IPC_R))) break; if (semnum < 0 || semnum >= semaptr->sem_nsems) { error = EINVAL; break; } *retval = semaptr->_sem_base[semnum].semzcnt; break; case SETVAL: if ((error = ipcperm(cred, &semaptr->sem_perm, IPC_W))) break; if (semnum < 0 || semnum >= semaptr->sem_nsems) { error = EINVAL; break; } KASSERT(arg != NULL); if ((unsigned int)arg->val > seminfo.semvmx) { error = ERANGE; break; } semaptr->_sem_base[semnum].semval = arg->val; semundo_clear(ix, semnum); cv_broadcast(&semcv[ix]); break; case SETALL: if ((error = ipcperm(cred, &semaptr->sem_perm, IPC_W))) break; KASSERT(arg != NULL); for (i = 0; i < semaptr->sem_nsems; i++) { unsigned short semval; error = copyin(&arg->array[i], &semval, sizeof(arg->array[i])); if (error != 0) break; if ((unsigned int)semval > seminfo.semvmx) { error = ERANGE; break; } semaptr->_sem_base[i].semval = semval; } semundo_clear(ix, -1); cv_broadcast(&semcv[ix]); break; default: error = EINVAL; break; } mutex_exit(&semlock); return (error); } int sys_semget(struct lwp *l, const struct sys_semget_args *uap, register_t *retval) { /* { syscallarg(key_t) key; syscallarg(int) nsems; syscallarg(int) semflg; } */ int semid, error = 0; int key = SCARG(uap, key); int nsems = SCARG(uap, nsems); int semflg = SCARG(uap, semflg); kauth_cred_t cred = l->l_cred; RUN_ONCE(&exithook_control, seminit_exithook); SEM_PRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg)); mutex_enter(&semlock); if (key != IPC_PRIVATE) { for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].sem_perm.mode & SEM_ALLOC) && sema[semid].sem_perm._key == key) break; } if (semid < seminfo.semmni) { SEM_PRINTF(("found public key\n")); if ((error = ipcperm(cred, &sema[semid].sem_perm, semflg & 0700))) goto out; if (nsems > 0 && sema[semid].sem_nsems < nsems) { SEM_PRINTF(("too small\n")); error = EINVAL; goto out; } if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { SEM_PRINTF(("not exclusive\n")); error = EEXIST; goto out; } goto found; } } SEM_PRINTF(("need to allocate the semid_ds\n")); if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { if (nsems <= 0 || nsems > seminfo.semmsl) { SEM_PRINTF(("nsems out of range (0<%d<=%d)\n", nsems, seminfo.semmsl)); error = EINVAL; goto out; } if (nsems > seminfo.semmns - semtot) { SEM_PRINTF(("not enough semaphores left " "(need %d, got %d)\n", nsems, seminfo.semmns - semtot)); error = ENOSPC; goto out; } for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].sem_perm.mode & SEM_ALLOC) == 0) break; } if (semid == seminfo.semmni) { SEM_PRINTF(("no more semid_ds's 
available\n")); error = ENOSPC; goto out; } SEM_PRINTF(("semid %d is available\n", semid)); sema[semid].sem_perm._key = key; sema[semid].sem_perm.cuid = kauth_cred_geteuid(cred); sema[semid].sem_perm.uid = kauth_cred_geteuid(cred); sema[semid].sem_perm.cgid = kauth_cred_getegid(cred); sema[semid].sem_perm.gid = kauth_cred_getegid(cred); sema[semid].sem_perm.mode = (semflg & 0777) | SEM_ALLOC; sema[semid].sem_perm._seq = (sema[semid].sem_perm._seq + 1) & 0x7fff; sema[semid].sem_nsems = nsems; sema[semid].sem_otime = 0; sema[semid].sem_ctime = time_second; sema[semid]._sem_base = &sem[semtot]; semtot += nsems; memset(sema[semid]._sem_base, 0, sizeof(sema[semid]._sem_base[0]) * nsems); SEM_PRINTF(("sembase = %p, next = %p\n", sema[semid]._sem_base, &sem[semtot])); } else { SEM_PRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto out; } found: *retval = IXSEQ_TO_IPCID(semid, sema[semid].sem_perm); out: mutex_exit(&semlock); return (error); } #define SMALL_SOPS 8 int sys_semop(struct lwp *l, const struct sys_semop_args *uap, register_t *retval) { /* { syscallarg(int) semid; syscallarg(struct sembuf *) sops; syscallarg(size_t) nsops; } */ struct proc *p = l->l_proc; int semid = SCARG(uap, semid), seq; size_t nsops = SCARG(uap, nsops); struct sembuf small_sops[SMALL_SOPS]; struct sembuf *sops; struct semid_ds *semaptr; struct sembuf *sopptr = NULL; struct __sem *semptr = NULL; struct sem_undo *suptr = NULL; kauth_cred_t cred = l->l_cred; int i, error; int do_wakeup, do_undos; RUN_ONCE(&exithook_control, seminit_exithook); SEM_PRINTF(("call to semop(%d, %p, %zd)\n", semid, SCARG(uap,sops), nsops)); if (__predict_false((p->p_flag & PK_SYSVSEM) == 0)) { mutex_enter(p->p_lock); p->p_flag |= PK_SYSVSEM; mutex_exit(p->p_lock); } restart: if (nsops <= SMALL_SOPS) { sops = small_sops; } else if (nsops <= seminfo.semopm) { sops = kmem_alloc(nsops * sizeof(*sops), KM_SLEEP); } else { SEM_PRINTF(("too many sops (max=%d, nsops=%zd)\n", seminfo.semopm, nsops)); return (E2BIG); } error = copyin(SCARG(uap, sops), sops, nsops * sizeof(sops[0])); if (error) { SEM_PRINTF(("error = %d from copyin(%p, %p, %zd)\n", error, SCARG(uap, sops), &sops, nsops * sizeof(sops[0]))); if (sops != small_sops) kmem_free(sops, nsops * sizeof(*sops)); return error; } mutex_enter(&semlock); /* In case of reallocation, we will wait for completion */ while (__predict_false(sem_realloc_state)) cv_wait(&sem_realloc_cv, &semlock); semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ if (semid < 0 || semid >= seminfo.semmni) { error = EINVAL; goto out; } semaptr = &sema[semid]; seq = IPCID_TO_SEQ(SCARG(uap, semid)); if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || semaptr->sem_perm._seq != seq) { error = EINVAL; goto out; } if ((error = ipcperm(cred, &semaptr->sem_perm, IPC_W))) { SEM_PRINTF(("error = %d from ipaccess\n", error)); goto out; } for (i = 0; i < nsops; i++) if (sops[i].sem_num >= semaptr->sem_nsems) { error = EFBIG; goto out; } /* * Loop trying to satisfy the vector of requests. * If we reach a point where we must wait, any requests already * performed are rolled back and we go to sleep until some other * process wakes us up. At this point, we start all over again. * * This ensures that from the perspective of other tasks, a set * of requests is atomic (never partially satisfied). 
*/ do_undos = 0; for (;;) { do_wakeup = 0; for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semaptr->_sem_base[sopptr->sem_num]; SEM_PRINTF(("semop: semaptr=%p, sem_base=%p, " "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n", semaptr, semaptr->_sem_base, semptr, sopptr->sem_num, semptr->semval, sopptr->sem_op, (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait")); if (sopptr->sem_op < 0) { if ((int)(semptr->semval + sopptr->sem_op) < 0) { SEM_PRINTF(("semop: " "can't do it now\n")); break; } else { semptr->semval += sopptr->sem_op; if (semptr->semval == 0 && semptr->semzcnt > 0) do_wakeup = 1; } if (sopptr->sem_flg & SEM_UNDO) do_undos = 1; } else if (sopptr->sem_op == 0) { if (semptr->semval > 0) { SEM_PRINTF(("semop: not zero now\n")); break; } } else { if (semptr->semncnt > 0) do_wakeup = 1; semptr->semval += sopptr->sem_op; if (sopptr->sem_flg & SEM_UNDO) do_undos = 1; } } /* * Did we get through the entire vector? */ if (i >= nsops) goto done; /* * No ... rollback anything that we've already done */ SEM_PRINTF(("semop: rollback 0 through %d\n", i - 1)); while (i-- > 0) semaptr->_sem_base[sops[i].sem_num].semval -= sops[i].sem_op; /* * If the request that we couldn't satisfy has the * NOWAIT flag set then return with EAGAIN. */ if (sopptr->sem_flg & IPC_NOWAIT) { error = EAGAIN; goto out; } if (sopptr->sem_op == 0) semptr->semzcnt++; else semptr->semncnt++; sem_waiters++; SEM_PRINTF(("semop: good night!\n")); error = cv_wait_sig(&semcv[semid], &semlock); SEM_PRINTF(("semop: good morning (error=%d)!\n", error)); sem_waiters--; /* Notify reallocator, if it is waiting */ cv_broadcast(&sem_realloc_cv); /* * Make sure that the semaphore still exists */ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || semaptr->sem_perm._seq != seq) { error = EIDRM; goto out; } /* * The semaphore is still alive. Readjust the count of * waiting processes. */ semptr = &semaptr->_sem_base[sopptr->sem_num]; if (sopptr->sem_op == 0) semptr->semzcnt--; else semptr->semncnt--; /* In case of such state, restart the call */ if (sem_realloc_state) { mutex_exit(&semlock); goto restart; } /* Is it really morning, or was our sleep interrupted? */ if (error != 0) { error = EINTR; goto out; } SEM_PRINTF(("semop: good morning!\n")); } done: /* * Process any SEM_UNDO requests. */ if (do_undos) { for (i = 0; i < nsops; i++) { /* * We only need to deal with SEM_UNDO's for non-zero * op's. */ int adjval; if ((sops[i].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[i].sem_op; if (adjval == 0) continue; error = semundo_adjust(p, &suptr, semid, sops[i].sem_num, -adjval); if (error == 0) continue; /* * Oh-Oh! We ran out of either sem_undo's or undo's. * Rollback the adjustments to this point and then * rollback the semaphore ups and down so we can return * with an error with all structures restored. We * rollback the undo's in the exact reverse order that * we applied them. This guarantees that we won't run * out of space as we roll things back out. 
*/ while (i-- > 0) { if ((sops[i].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[i].sem_op; if (adjval == 0) continue; if (semundo_adjust(p, &suptr, semid, sops[i].sem_num, adjval) != 0) panic("semop - can't undo undos"); } for (i = 0; i < nsops; i++) semaptr->_sem_base[sops[i].sem_num].semval -= sops[i].sem_op; SEM_PRINTF(("error = %d from semundo_adjust\n", error)); goto out; } /* loop through the sops */ } /* if (do_undos) */ /* We're definitely done - set the sempid's */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semaptr->_sem_base[sopptr->sem_num]; semptr->sempid = p->p_pid; } /* Update sem_otime */ semaptr->sem_otime = time_second; /* Do a wakeup if any semaphore was up'd. */ if (do_wakeup) { SEM_PRINTF(("semop: doing wakeup\n")); cv_broadcast(&semcv[semid]); SEM_PRINTF(("semop: back from wakeup\n")); } SEM_PRINTF(("semop: done\n")); *retval = 0; out: mutex_exit(&semlock); if (sops != small_sops) kmem_free(sops, nsops * sizeof(*sops)); return error; } /* * Go through the undo structures for this process and apply the * adjustments to semaphores. */ /*ARGSUSED*/ void semexit(struct proc *p, void *v) { struct sem_undo *suptr; struct sem_undo **supptr; if ((p->p_flag & PK_SYSVSEM) == 0) return; mutex_enter(&semlock); /* * Go through the chain of undo vectors looking for one * associated with this process. */ for (supptr = &semu_list; (suptr = *supptr) != NULL; supptr = &suptr->un_next) { if (suptr->un_proc == p) break; } /* * If there is no undo vector, skip to the end. */ if (suptr == NULL) { mutex_exit(&semlock); return; } /* * We now have an undo vector for this process. */ SEM_PRINTF(("proc @%p has undo structure with %d entries\n", p, suptr->un_cnt)); /* * If there are any active undo elements then process them. */ if (suptr->un_cnt > 0) { int ix; for (ix = 0; ix < suptr->un_cnt; ix++) { int semid = suptr->un_ent[ix].un_id; int semnum = suptr->un_ent[ix].un_num; int adjval = suptr->un_ent[ix].un_adjval; struct semid_ds *semaptr; semaptr = &sema[semid]; if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) panic("semexit - semid not allocated"); if (semnum >= semaptr->sem_nsems) panic("semexit - semnum out of range"); SEM_PRINTF(("semexit: %p id=%d num=%d(adj=%d) ; " "sem=%d\n", suptr->un_proc, suptr->un_ent[ix].un_id, suptr->un_ent[ix].un_num, suptr->un_ent[ix].un_adjval, semaptr->_sem_base[semnum].semval)); if (adjval < 0 && semaptr->_sem_base[semnum].semval < -adjval) semaptr->_sem_base[semnum].semval = 0; else semaptr->_sem_base[semnum].semval += adjval; cv_broadcast(&semcv[semid]); SEM_PRINTF(("semexit: back from wakeup\n")); } } /* * Deallocate the undo vector. */ SEM_PRINTF(("removing vector\n")); suptr->un_proc = NULL; *supptr = suptr->un_next; mutex_exit(&semlock); } /* * Sysctl initialization and nodes.
*/ static int sysctl_ipc_semmni(SYSCTLFN_ARGS) { int newsize, error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = seminfo.semmni; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; return semrealloc(newsize, seminfo.semmns, seminfo.semmnu); } static int sysctl_ipc_semmns(SYSCTLFN_ARGS) { int newsize, error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = seminfo.semmns; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; return semrealloc(seminfo.semmni, newsize, seminfo.semmnu); } static int sysctl_ipc_semmnu(SYSCTLFN_ARGS) { int newsize, error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = seminfo.semmnu; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; return semrealloc(seminfo.semmni, seminfo.semmns, newsize); } SYSCTL_SETUP(sysctl_ipc_sem_setup, "sysctl kern.ipc subtree setup") { const struct sysctlnode *node = NULL; sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ipc", SYSCTL_DESCR("SysV IPC options"), NULL, 0, NULL, 0, CTL_KERN, KERN_SYSVIPC, CTL_EOL); if (node == NULL) return; sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "semmni", SYSCTL_DESCR("Max number of semaphore identifiers"), sysctl_ipc_semmni, 0, &seminfo.semmni, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "semmns", SYSCTL_DESCR("Max number of semaphores in system"), sysctl_ipc_semmns, 0, &seminfo.semmns, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "semmnu", SYSCTL_DESCR("Max number of undo structures in system"), sysctl_ipc_semmnu, 0, &seminfo.semmnu, 0, CTL_CREATE, CTL_EOL); }
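/*
 * Editorial sketch (not part of sysv_sem.c): how the syscalls implemented
 * above are driven from userland.  A single semaphore is created, set to 1
 * with SETVAL, and then used as a mutex via semop(); SEM_UNDO asks the
 * kernel to roll the operation back in semexit() if the process dies while
 * holding it.  Error handling is abbreviated and all names are illustrative.
 */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <err.h>

union semun {		/* defined by the application, as POSIX specifies; */
	int val;	/* omit if your system headers already provide it  */
	struct semid_ds *buf;
	unsigned short *array;
};

int
main(void)
{
	union semun arg;
	struct sembuf down = { .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO };
	struct sembuf up   = { .sem_num = 0, .sem_op =  1, .sem_flg = SEM_UNDO };
	int id;

	/* One private semaphore set containing a single semaphore. */
	if ((id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600)) == -1)
		err(1, "semget");

	arg.val = 1;			/* initial value: unlocked */
	if (semctl(id, 0, SETVAL, arg) == -1)
		err(1, "semctl SETVAL");

	if (semop(id, &down, 1) == -1)	/* P(): may sleep in sys_semop() */
		err(1, "semop down");
	/* ... critical section ... */
	if (semop(id, &up, 1) == -1)	/* V(): wakes any waiters */
		err(1, "semop up");

	(void)semctl(id, 0, IPC_RMID);	/* remove the set */
	return 0;
}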
/* $NetBSD: vfs_mount.c,v 1.105 2024/04/19 00:45:41 riastradh Exp $ */ /*- * Copyright (c) 1997-2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.105 2024/04/19 00:45:41 riastradh Exp $"); #include "veriexec.h" #include <sys/param.h> #include <sys/kernel.h> #include <sys/atomic.h> #include <sys/buf.h> #include <sys/conf.h> #include <sys/fcntl.h> #include <sys/filedesc.h> #include <sys/device.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/module.h> #include <sys/mount.h> #include <sys/fstrans.h> #include <sys/namei.h> #include <sys/extattr.h> #include <sys/verified_exec.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/vfs_syscalls.h> #include <sys/vnode_impl.h> #include <miscfs/deadfs/deadfs.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <uvm/uvm_swap.h> enum mountlist_type { ME_MOUNT, ME_MARKER }; struct mountlist_entry { TAILQ_ENTRY(mountlist_entry) me_list; /* Mount list. */ struct mount *me_mount; /* Actual mount if ME_MOUNT, current mount else. */ enum mountlist_type me_type; /* Mount or marker. */ }; struct mount_iterator { struct mountlist_entry mi_entry; }; static struct vnode *vfs_vnode_iterator_next1(struct vnode_iterator *, bool (*)(void *, struct vnode *), void *, bool); /* Root filesystem. */ vnode_t * rootvnode; /* Mounted filesystem list. 
*/ static TAILQ_HEAD(mountlist, mountlist_entry) mountlist; static kmutex_t mountlist_lock __cacheline_aligned; int vnode_offset_next_by_lru /* XXX: ugly hack for pstat.c */ = offsetof(vnode_impl_t, vi_lrulist.tqe_next); kmutex_t vfs_list_lock __cacheline_aligned; static specificdata_domain_t mount_specificdata_domain; static kmutex_t mntid_lock; static kmutex_t mountgen_lock __cacheline_aligned; static uint64_t mountgen; void vfs_mount_sysinit(void) { TAILQ_INIT(&mountlist); mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE); mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE); mount_specificdata_domain = specificdata_domain_create(); mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE); mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE); mountgen = 0; } struct mount * vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp) { struct mount *mp; int error __diagused; mp = kmem_zalloc(sizeof(*mp), KM_SLEEP); mp->mnt_op = vfsops; mp->mnt_refcnt = 1; TAILQ_INIT(&mp->mnt_vnodelist); mp->mnt_renamelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); mp->mnt_vnodelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); mp->mnt_updating = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); mp->mnt_vnodecovered = vp; mount_initspecific(mp); error = fstrans_mount(mp); KASSERT(error == 0); mutex_enter(&mountgen_lock); mp->mnt_gen = mountgen++; mutex_exit(&mountgen_lock); return mp; } /* * vfs_rootmountalloc: lookup a filesystem type, and if found allocate and * initialize a mount structure for it. * * Devname is usually updated by mount(8) after booting. */ int vfs_rootmountalloc(const char *fstypename, const char *devname, struct mount **mpp) { struct vfsops *vfsp = NULL; struct mount *mp; int error __diagused; mutex_enter(&vfs_list_lock); LIST_FOREACH(vfsp, &vfs_list, vfs_list) if (!strncmp(vfsp->vfs_name, fstypename, sizeof(mp->mnt_stat.f_fstypename))) break; if (vfsp == NULL) { mutex_exit(&vfs_list_lock); return (ENODEV); } vfsp->vfs_refcount++; mutex_exit(&vfs_list_lock); if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL) return ENOMEM; error = vfs_busy(mp); KASSERT(error == 0); mp->mnt_flag = MNT_RDONLY; (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, sizeof(mp->mnt_stat.f_fstypename)); mp->mnt_stat.f_mntonname[0] = '/'; mp->mnt_stat.f_mntonname[1] = '\0'; mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] = '\0'; (void)copystr(devname, mp->mnt_stat.f_mntfromname, sizeof(mp->mnt_stat.f_mntfromname) - 1, 0); *mpp = mp; return 0; } /* * vfs_getnewfsid: get a new unique fsid. */ void vfs_getnewfsid(struct mount *mp) { static u_short xxxfs_mntid; struct mountlist_entry *me; fsid_t tfsid; int mtype; mutex_enter(&mntid_lock); if (xxxfs_mntid == 0) ++xxxfs_mntid; mtype = makefstype(mp->mnt_op->vfs_name); tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid); tfsid.__fsid_val[1] = mtype; /* Always increment to not return the same fsid to parallel mounts. */ xxxfs_mntid++; /* * Directly walk mountlist to prevent deadlock through * mountlist_iterator_next() -> vfs_busy(). 
*/ mutex_enter(&mountlist_lock); for (me = TAILQ_FIRST(&mountlist); me != TAILQ_END(&mountlist); ) { if (me->me_type == ME_MOUNT && me->me_mount->mnt_stat.f_fsidx.__fsid_val[0] == tfsid.__fsid_val[0] && me->me_mount->mnt_stat.f_fsidx.__fsid_val[1] == tfsid.__fsid_val[1]) { tfsid.__fsid_val[0]++; xxxfs_mntid++; me = TAILQ_FIRST(&mountlist); } else { me = TAILQ_NEXT(me, me_list); } } mutex_exit(&mountlist_lock); mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0]; mp->mnt_stat.f_fsidx.__fsid_val[1] = tfsid.__fsid_val[1]; mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; mutex_exit(&mntid_lock); } /* * Lookup a mount point by filesystem identifier. * * XXX Needs to add a reference to the mount point. */ struct mount * vfs_getvfs(fsid_t *fsid) { mount_iterator_t *iter; struct mount *mp; mountlist_iterator_init(&iter); while ((mp = mountlist_iterator_next(iter)) != NULL) { if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] && mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) { mountlist_iterator_destroy(iter); return mp; } } mountlist_iterator_destroy(iter); return NULL; } /* * Take a reference to a mount structure. */ void vfs_ref(struct mount *mp) { KASSERT(mp->mnt_refcnt > 0 || mutex_owned(&mountlist_lock)); atomic_inc_uint(&mp->mnt_refcnt); } /* * Drop a reference to a mount structure, freeing if the last reference. */ void vfs_rele(struct mount *mp) { membar_release(); if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) { return; } membar_acquire(); /* * Nothing else has visibility of the mount: we can now * free the data structures. */ KASSERT(mp->mnt_refcnt == 0); specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref); mutex_obj_free(mp->mnt_updating); mutex_obj_free(mp->mnt_renamelock); mutex_obj_free(mp->mnt_vnodelock); if (mp->mnt_op != NULL) { vfs_delref(mp->mnt_op); } fstrans_unmount(mp); /* * Final free of mp gets done from fstrans_mount_dtor(). * * Prevents this memory to be reused as a mount before * fstrans releases all references to it. */ } /* * Mark a mount point as busy, and gain a new reference to it. Used to * prevent the file system from being unmounted during critical sections. * * vfs_busy can be called multiple times and by multiple threads * and must be accompanied by the same number of vfs_unbusy calls. * * => The caller must hold a pre-existing reference to the mount. * => Will fail if the file system is being unmounted, or is unmounted. */ static inline int _vfs_busy(struct mount *mp, bool wait) { KASSERT(mp->mnt_refcnt > 0); if (wait) { fstrans_start(mp); } else { if (fstrans_start_nowait(mp)) return EBUSY; } if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) { fstrans_done(mp); return ENOENT; } vfs_ref(mp); return 0; } int vfs_busy(struct mount *mp) { return _vfs_busy(mp, true); } int vfs_trybusy(struct mount *mp) { return _vfs_busy(mp, false); } /* * Unbusy a busy filesystem. * * Every successful vfs_busy() call must be undone by a vfs_unbusy() call. */ void vfs_unbusy(struct mount *mp) { KASSERT(mp->mnt_refcnt > 0); fstrans_done(mp); vfs_rele(mp); } /* * Change a file systems lower mount. * Both the current and the new lower mount may be NULL. The caller * guarantees exclusive access to the mount and holds a pre-existing * reference to the new lower mount. */ int vfs_set_lowermount(struct mount *mp, struct mount *lowermp) { struct mount *oldlowermp; int error; #ifdef DEBUG /* * Limit the depth of file system stack so kernel sanitizers * may stress mount/unmount without exhausting the kernel stack. 
*/ int depth; struct mount *mp2; for (depth = 0, mp2 = lowermp; mp2; depth++, mp2 = mp2->mnt_lower) { if (depth == 23) return EINVAL; } #endif if (lowermp) { if (lowermp == dead_rootmount) return ENOENT; error = vfs_busy(lowermp); if (error) return error; vfs_ref(lowermp); } oldlowermp = mp->mnt_lower; mp->mnt_lower = lowermp; if (lowermp) vfs_unbusy(lowermp); if (oldlowermp) vfs_rele(oldlowermp); return 0; } struct vnode_iterator { vnode_impl_t vi_vnode; }; void vfs_vnode_iterator_init(struct mount *mp, struct vnode_iterator **vnip) { vnode_t *vp; vnode_impl_t *vip; vp = vnalloc_marker(mp); vip = VNODE_TO_VIMPL(vp); mutex_enter(mp->mnt_vnodelock); TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vip, vi_mntvnodes); vp->v_usecount = 1; mutex_exit(mp->mnt_vnodelock); *vnip = (struct vnode_iterator *)vip; } void vfs_vnode_iterator_destroy(struct vnode_iterator *vni) { vnode_impl_t *mvip = &vni->vi_vnode; vnode_t *mvp = VIMPL_TO_VNODE(mvip); kmutex_t *lock; KASSERT(vnis_marker(mvp)); if (vrefcnt(mvp) != 0) { lock = mvp->v_mount->mnt_vnodelock; mutex_enter(lock); TAILQ_REMOVE(&mvp->v_mount->mnt_vnodelist, mvip, vi_mntvnodes); mvp->v_usecount = 0; mutex_exit(lock); } vnfree_marker(mvp); } static struct vnode * vfs_vnode_iterator_next1(struct vnode_iterator *vni, bool (*f)(void *, struct vnode *), void *cl, bool do_wait) { vnode_impl_t *mvip = &vni->vi_vnode; struct mount *mp = VIMPL_TO_VNODE(mvip)->v_mount; vnode_t *vp; vnode_impl_t *vip; kmutex_t *lock; int error; KASSERT(vnis_marker(VIMPL_TO_VNODE(mvip))); lock = mp->mnt_vnodelock; do { mutex_enter(lock); vip = TAILQ_NEXT(mvip, vi_mntvnodes); TAILQ_REMOVE(&mp->mnt_vnodelist, mvip, vi_mntvnodes); VIMPL_TO_VNODE(mvip)->v_usecount = 0; again: if (vip == NULL) { mutex_exit(lock); return NULL; } vp = VIMPL_TO_VNODE(vip); KASSERT(vp != NULL); mutex_enter(vp->v_interlock); if (vnis_marker(vp) || vdead_check(vp, (do_wait ? 0 : VDEAD_NOWAIT)) || (f && !(*f)(cl, vp))) { mutex_exit(vp->v_interlock); vip = TAILQ_NEXT(vip, vi_mntvnodes); goto again; } TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vip, mvip, vi_mntvnodes); VIMPL_TO_VNODE(mvip)->v_usecount = 1; mutex_exit(lock); error = vcache_vget(vp); KASSERT(error == 0 || error == ENOENT); } while (error != 0); return vp; } struct vnode * vfs_vnode_iterator_next(struct vnode_iterator *vni, bool (*f)(void *, struct vnode *), void *cl) { return vfs_vnode_iterator_next1(vni, f, cl, false); } /* * Move a vnode from one mount queue to another. */ void vfs_insmntque(vnode_t *vp, struct mount *mp) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); struct mount *omp; kmutex_t *lock; KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 || vp->v_tag == VT_VFS); /* * Delete from old mount point vnode list, if on one. */ if ((omp = vp->v_mount) != NULL) { lock = omp->mnt_vnodelock; mutex_enter(lock); TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vip, vi_mntvnodes); mutex_exit(lock); } /* * Insert into list of vnodes for the new mount point, if * available. The caller must take a reference on the mount * structure and donate to the vnode. */ if ((vp->v_mount = mp) != NULL) { lock = mp->mnt_vnodelock; mutex_enter(lock); TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vip, vi_mntvnodes); mutex_exit(lock); } if (omp != NULL) { /* Release reference to old mount. */ vfs_rele(omp); } } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). 
If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. */ #ifdef DEBUG int busyprt = 0; /* print out busy vnodes */ struct ctldebug debug1 = { "busyprt", &busyprt }; #endif static vnode_t * vflushnext(struct vnode_iterator *marker, int *when) { if (getticks() > *when) { yield(); *when = getticks() + hz / 10; } preempt_point(); return vfs_vnode_iterator_next1(marker, NULL, NULL, true); } /* * Flush one vnode. Referenced on entry, unreferenced on return. */ static int vflush_one(vnode_t *vp, vnode_t *skipvp, int flags) { int error; struct vattr vattr; if (vp == skipvp || ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))) { vrele(vp); return 0; } /* * If WRITECLOSE is set, only flush out regular file * vnodes open for writing or open and unlinked. */ if ((flags & WRITECLOSE)) { if (vp->v_type != VREG) { vrele(vp); return 0; } error = vn_lock(vp, LK_EXCLUSIVE); if (error) { KASSERT(error == ENOENT); vrele(vp); return 0; } error = VOP_FSYNC(vp, curlwp->l_cred, FSYNC_WAIT, 0, 0); if (error == 0) error = VOP_GETATTR(vp, &vattr, curlwp->l_cred); VOP_UNLOCK(vp); if (error) { vrele(vp); return error; } if (vp->v_writecount == 0 && vattr.va_nlink > 0) { vrele(vp); return 0; } } /* * First try to recycle the vnode. */ if (vrecycle(vp)) return 0; /* * If FORCECLOSE is set, forcibly close the vnode. * For block or character devices, revert to an * anonymous device. For all other files, just * kill them. */ if (flags & FORCECLOSE) { if (vrefcnt(vp) > 1 && (vp->v_type == VBLK || vp->v_type == VCHR)) vcache_make_anon(vp); else vgone(vp); return 0; } vrele(vp); return EBUSY; } int vflush(struct mount *mp, vnode_t *skipvp, int flags) { vnode_t *vp; struct vnode_iterator *marker; int busy, error, when, retries = 2; do { busy = error = when = 0; /* * First, flush out any vnode references from the * deferred vrele list. */ vrele_flush(mp); vfs_vnode_iterator_init(mp, &marker); while ((vp = vflushnext(marker, &when)) != NULL) { error = vflush_one(vp, skipvp, flags); if (error == EBUSY) { error = 0; busy++; #ifdef DEBUG if (busyprt && retries == 0) vprint("vflush: busy vnode", vp); #endif } else if (error != 0) { break; } } vfs_vnode_iterator_destroy(marker); } while (error == 0 && busy > 0 && retries-- > 0); if (error) return error; if (busy) return EBUSY; return 0; } /* * Mount a file system. */ /* * Scan all active processes to see if any of them have a current or root * directory onto which the new filesystem has just been mounted. If so, * replace them with the new mount point. */ static void mount_checkdirs(vnode_t *olddp) { vnode_t *newdp, *rele1, *rele2; struct cwdinfo *cwdi; struct proc *p; bool retry; if (vrefcnt(olddp) == 1) { return; } if (VFS_ROOT(olddp->v_mountedhere, LK_EXCLUSIVE, &newdp)) panic("mount: lost mount"); do { retry = false; mutex_enter(&proc_lock); PROCLIST_FOREACH(p, &allproc) { if ((cwdi = p->p_cwdi) == NULL) continue; /* * Cannot change to the old directory any more, * so even if we see a stale value it is not a * problem. 
*/ if (cwdi->cwdi_cdir != olddp && cwdi->cwdi_rdir != olddp) continue; retry = true; rele1 = NULL; rele2 = NULL; atomic_inc_uint(&cwdi->cwdi_refcnt); mutex_exit(&proc_lock); rw_enter(&cwdi->cwdi_lock, RW_WRITER); if (cwdi->cwdi_cdir == olddp) { rele1 = cwdi->cwdi_cdir; vref(newdp); cwdi->cwdi_cdir = newdp; } if (cwdi->cwdi_rdir == olddp) { rele2 = cwdi->cwdi_rdir; vref(newdp); cwdi->cwdi_rdir = newdp; } rw_exit(&cwdi->cwdi_lock); cwdfree(cwdi); if (rele1 != NULL) vrele(rele1); if (rele2 != NULL) vrele(rele2); mutex_enter(&proc_lock); break; } mutex_exit(&proc_lock); } while (retry); if (rootvnode == olddp) { vrele(rootvnode); vref(newdp); rootvnode = newdp; } vput(newdp); } /* * Start extended attributes */ static int start_extattr(struct mount *mp) { int error; error = VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, NULL, 0, NULL); if (error) printf("%s: failed to start extattr: error = %d\n", mp->mnt_stat.f_mntonname, error); return error; } int mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops, const char *path, int flags, void *data, size_t *data_len) { vnode_t *vp = *vpp; struct mount *mp; struct pathbuf *pb; struct nameidata nd; int error, error2; error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data); if (error) { vfs_delref(vfsops); return error; } /* Cannot make a non-dir a mount-point (from here anyway). */ if (vp->v_type != VDIR) { vfs_delref(vfsops); return ENOTDIR; } if (flags & MNT_EXPORTED) { vfs_delref(vfsops); return EINVAL; } if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) { vfs_delref(vfsops); return ENOMEM; } mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred); /* * The underlying file system may refuse the mount for * various reasons. Allow the user to force it to happen. * * Set the mount level flags. */ mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE); error = VFS_MOUNT(mp, path, data, data_len); mp->mnt_flag &= ~MNT_OP_FLAGS; if (error != 0) { vfs_rele(mp); return error; } /* Suspend new file system before taking mnt_updating. */ do { error2 = vfs_suspend(mp, 0); } while (error2 == EINTR || error2 == ERESTART); KASSERT(error2 == 0 || error2 == EOPNOTSUPP); mutex_enter(mp->mnt_updating); /* * Validate and prepare the mount point. */ error = pathbuf_copyin(path, &pb); if (error != 0) { goto err_mounted; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); error = namei(&nd); pathbuf_destroy(pb); if (error != 0) { goto err_mounted; } if (nd.ni_vp != vp) { vput(nd.ni_vp); error = EINVAL; goto err_mounted; } if (vp->v_mountedhere != NULL) { vput(nd.ni_vp); error = EBUSY; goto err_mounted; } error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0); if (error != 0) { vput(nd.ni_vp); goto err_mounted; } /* * Put the new filesystem on the mount list after root. */ cache_purge(vp); mp->mnt_iflag &= ~IMNT_WANTRDWR; mountlist_append(mp); if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) vfs_syncer_add_to_worklist(mp); vp->v_mountedhere = mp; vput(nd.ni_vp); mount_checkdirs(vp); mutex_exit(mp->mnt_updating); if (error2 == 0) vfs_resume(mp); /* Hold an additional reference to the mount across VFS_START(). */ vfs_ref(mp); (void) VFS_STATVFS(mp, &mp->mnt_stat); error = VFS_START(mp, 0); if (error) { vrele(vp); } else if (flags & MNT_EXTATTR) { if (start_extattr(mp) != 0) mp->mnt_flag &= ~MNT_EXTATTR; } /* Drop reference held for VFS_START(). 
*/ vfs_rele(mp); *vpp = NULL; return error; err_mounted: if (VFS_UNMOUNT(mp, MNT_FORCE) != 0) panic("Unmounting fresh file system failed"); mutex_exit(mp->mnt_updating); if (error2 == 0) vfs_resume(mp); vfs_set_lowermount(mp, NULL); vfs_rele(mp); return error; } /* * Do the actual file system unmount. File system is assumed to have * been locked by the caller. * * => Caller hold reference to the mount, explicitly for dounmount(). */ int dounmount(struct mount *mp, int flags, struct lwp *l) { struct vnode *coveredvp, *vp; struct vnode_impl *vip; int error, async, used_syncer, used_extattr; const bool was_suspended = fstrans_is_owner(mp); #if NVERIEXEC > 0 error = veriexec_unmountchk(mp); if (error) return (error); #endif /* NVERIEXEC > 0 */ if (!was_suspended) { error = vfs_suspend(mp, 0); if (error) { return error; } } KASSERT((mp->mnt_iflag & IMNT_GONE) == 0); used_syncer = (mp->mnt_iflag & IMNT_ONWORKLIST) != 0; used_extattr = mp->mnt_flag & MNT_EXTATTR; mp->mnt_iflag |= IMNT_UNMOUNT; mutex_enter(mp->mnt_updating); async = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; cache_purgevfs(mp); /* remove cache entries for this file sys */ if (used_syncer) vfs_syncer_remove_from_worklist(mp); error = 0; if (((mp->mnt_flag & MNT_RDONLY) == 0) && ((flags & MNT_FORCE) == 0)) { error = VFS_SYNC(mp, MNT_WAIT, l->l_cred); } if (error == 0 || (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags); } if (error) { mp->mnt_iflag &= ~IMNT_UNMOUNT; if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) vfs_syncer_add_to_worklist(mp); mp->mnt_flag |= async; mutex_exit(mp->mnt_updating); if (!was_suspended) vfs_resume(mp); if (used_extattr) { if (start_extattr(mp) != 0) mp->mnt_flag &= ~MNT_EXTATTR; else mp->mnt_flag |= MNT_EXTATTR; } return (error); } mutex_exit(mp->mnt_updating); /* * mark filesystem as gone to prevent further umounts * after mnt_umounting lock is gone, this also prevents * vfs_busy() from succeeding. */ mp->mnt_iflag |= IMNT_GONE; if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) { coveredvp->v_mountedhere = NULL; } if (!was_suspended) vfs_resume(mp); mountlist_remove(mp); if ((vip = TAILQ_FIRST(&mp->mnt_vnodelist)) != NULL) { vp = VIMPL_TO_VNODE(vip); vprint("dangling", vp); panic("unmount: dangling vnode"); } vfs_hooks_unmount(mp); vfs_set_lowermount(mp, NULL); vfs_rele(mp); /* reference from mount() */ if (coveredvp != NULLVP) { vrele(coveredvp); } return (0); } /* * Unmount all file systems. * We traverse the list in reverse order under the assumption that doing so * will avoid needing to worry about dependencies. */ bool vfs_unmountall(struct lwp *l) { printf("unmounting file systems...\n"); return vfs_unmountall1(l, true, true); } static void vfs_unmount_print(struct mount *mp, const char *pfx) { aprint_verbose("%sunmounted %s on %s type %s\n", pfx, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); } /* * Return the mount with the highest generation less than "gen". 
*/ static struct mount * vfs_unmount_next(uint64_t gen) { mount_iterator_t *iter; struct mount *mp, *nmp; nmp = NULL; mountlist_iterator_init(&iter); while ((mp = mountlist_iterator_next(iter)) != NULL) { if ((nmp == NULL || mp->mnt_gen > nmp->mnt_gen) && mp->mnt_gen < gen) { if (nmp != NULL) vfs_rele(nmp); nmp = mp; vfs_ref(nmp); } } mountlist_iterator_destroy(iter); return nmp; } bool vfs_unmount_forceone(struct lwp *l) { struct mount *mp; int error; mp = vfs_unmount_next(mountgen); if (mp == NULL) { return false; } #ifdef DEBUG printf("forcefully unmounting %s (%s)...\n", mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname); #endif if ((error = dounmount(mp, MNT_FORCE, l)) == 0) { vfs_unmount_print(mp, "forcefully "); return true; } else { vfs_rele(mp); } #ifdef DEBUG printf("forceful unmount of %s failed with error %d\n", mp->mnt_stat.f_mntonname, error); #endif return false; } bool vfs_unmountall1(struct lwp *l, bool force, bool verbose) { struct mount *mp; mount_iterator_t *iter; bool any_error = false, progress = false; uint64_t gen; int error; gen = mountgen; for (;;) { mp = vfs_unmount_next(gen); if (mp == NULL) break; gen = mp->mnt_gen; #ifdef DEBUG printf("unmounting %p %s (%s)...\n", (void *)mp, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname); #endif if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) { vfs_unmount_print(mp, ""); progress = true; } else { vfs_rele(mp); if (verbose) { printf("unmount of %s failed with error %d\n", mp->mnt_stat.f_mntonname, error); } any_error = true; } } if (verbose) { printf("unmounting done\n"); } if (any_error && verbose) { printf("WARNING: some file systems would not unmount\n"); } /* If the mountlist is empty it is time to remove swap. */ mountlist_iterator_init(&iter); if (mountlist_iterator_next(iter) == NULL) { uvm_swap_shutdown(l); } mountlist_iterator_destroy(iter); return progress; } void vfs_sync_all(struct lwp *l) { printf("syncing disks... "); /* remove user processes from run queue */ suspendsched(); (void)spl0(); /* avoid coming back this way again if we panic. */ doing_shutdown = 1; do_sys_sync(l); /* Wait for sync to finish. */ if (vfs_syncwait() != 0) { #if defined(DDB) && defined(DEBUG_HALT_BUSY) Debugger(); #endif printf("giving up\n"); return; } else printf("done\n"); } /* * Sync and unmount file systems before shutting down. */ void vfs_shutdown(void) { lwp_t *l = curlwp; vfs_sync_all(l); /* * If we have panicked - do not make the situation potentially * worse by unmounting the file systems. */ if (panicstr != NULL) { return; } /* Unmount file systems. */ vfs_unmountall(l); } /* * Print a list of supported file system types (used by vfs_mountroot) */ static void vfs_print_fstypes(void) { struct vfsops *v; int cnt = 0; mutex_enter(&vfs_list_lock); LIST_FOREACH(v, &vfs_list, vfs_list) ++cnt; mutex_exit(&vfs_list_lock); if (cnt == 0) { printf("WARNING: No file system modules have been loaded.\n"); return; } printf("Supported file systems:"); mutex_enter(&vfs_list_lock); LIST_FOREACH(v, &vfs_list, vfs_list) { printf(" %s", v->vfs_name); } mutex_exit(&vfs_list_lock); printf("\n"); } /* * Mount the root file system. If the operator didn't specify a * file system to use, try all possible file systems until one * succeeds. 
*/ int vfs_mountroot(void) { struct vfsops *v; int error = ENODEV; if (root_device == NULL) panic("vfs_mountroot: root device unknown"); switch (device_class(root_device)) { case DV_IFNET: if (rootdev != NODEV) panic("vfs_mountroot: rootdev set for DV_IFNET " "(0x%llx -> %llu,%llu)", (unsigned long long)rootdev, (unsigned long long)major(rootdev), (unsigned long long)minor(rootdev)); break; case DV_DISK: if (rootdev == NODEV) panic("vfs_mountroot: rootdev not set for DV_DISK"); if (bdevvp(rootdev, &rootvp)) panic("vfs_mountroot: can't get vnode for rootdev"); vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_OPEN(rootvp, FREAD, FSCRED); VOP_UNLOCK(rootvp); if (error) { printf("vfs_mountroot: can't open root device\n"); return (error); } break; case DV_VIRTUAL: break; default: printf("%s: inappropriate for root file system\n", device_xname(root_device)); return (ENODEV); } /* * If user specified a root fs type, use it. Make sure the * specified type exists and has a mount_root() */ if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) { v = vfs_getopsbyname(rootfstype); error = EFTYPE; if (v != NULL) { if (v->vfs_mountroot != NULL) { error = (v->vfs_mountroot)(); } v->vfs_refcount--; } goto done; } /* * Try each file system currently configured into the kernel. */ mutex_enter(&vfs_list_lock); LIST_FOREACH(v, &vfs_list, vfs_list) { if (v->vfs_mountroot == NULL) continue; #ifdef DEBUG aprint_normal("mountroot: trying %s...\n", v->vfs_name); #endif v->vfs_refcount++; mutex_exit(&vfs_list_lock); error = (*v->vfs_mountroot)(); mutex_enter(&vfs_list_lock); v->vfs_refcount--; if (!error) { aprint_normal("root file system type: %s\n", v->vfs_name); break; } } mutex_exit(&vfs_list_lock); if (v == NULL) { vfs_print_fstypes(); printf("no file system for %s", device_xname(root_device)); if (device_class(root_device) == DV_DISK) printf(" (dev 0x%llx)", (unsigned long long)rootdev); printf("\n"); error = EFTYPE; } done: if (error && device_class(root_device) == DV_DISK) { vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY); VOP_CLOSE(rootvp, FREAD, FSCRED); VOP_UNLOCK(rootvp); vrele(rootvp); } if (error == 0) { mount_iterator_t *iter; struct mount *mp; mountlist_iterator_init(&iter); mp = mountlist_iterator_next(iter); KASSERT(mp != NULL); mountlist_iterator_destroy(iter); mp->mnt_flag |= MNT_ROOTFS; mp->mnt_op->vfs_refcount++; /* * Get the vnode for '/'. Set cwdi0.cwdi_cdir to * reference it, and donate it the reference grabbed * with VFS_ROOT(). */ error = VFS_ROOT(mp, LK_NONE, &rootvnode); if (error) panic("cannot find root vnode, error=%d", error); cwdi0.cwdi_cdir = rootvnode; cwdi0.cwdi_rdir = NULL; /* * Now that root is mounted, we can fixup initproc's CWD * info. All other processes are kthreads, which merely * share proc0's CWD info. */ initproc->p_cwdi->cwdi_cdir = rootvnode; vref(initproc->p_cwdi->cwdi_cdir); initproc->p_cwdi->cwdi_rdir = NULL; /* * Enable loading of modules from the filesystem */ module_load_vfs_init(); } return (error); } /* * mount_specific_key_create -- * Create a key for subsystem mount-specific data. */ int mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor) { return specificdata_key_create(mount_specificdata_domain, keyp, dtor); } /* * mount_specific_key_delete -- * Delete a key for subsystem mount-specific data. */ void mount_specific_key_delete(specificdata_key_t key) { specificdata_key_delete(mount_specificdata_domain, key); } /* * mount_initspecific -- * Initialize a mount's specificdata container. 
*/ void mount_initspecific(struct mount *mp) { int error __diagused; error = specificdata_init(mount_specificdata_domain, &mp->mnt_specdataref); KASSERT(error == 0); } /* * mount_finispecific -- * Finalize a mount's specificdata container. */ void mount_finispecific(struct mount *mp) { specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref); } /* * mount_getspecific -- * Return mount-specific data corresponding to the specified key. */ void * mount_getspecific(struct mount *mp, specificdata_key_t key) { return specificdata_getspecific(mount_specificdata_domain, &mp->mnt_specdataref, key); } /* * mount_setspecific -- * Set mount-specific data corresponding to the specified key. */ void mount_setspecific(struct mount *mp, specificdata_key_t key, void *data) { specificdata_setspecific(mount_specificdata_domain, &mp->mnt_specdataref, key, data); } /* * Check to see if a filesystem is mounted on a block device. */ int vfs_mountedon(vnode_t *vp) { vnode_t *vq; int error = 0; if (vp->v_type != VBLK) return ENOTBLK; if (spec_node_getmountedfs(vp) != NULL) return EBUSY; if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, VDEAD_NOWAIT, &vq) == 0) { if (spec_node_getmountedfs(vq) != NULL) error = EBUSY; vrele(vq); } return error; } /* * Check if a device pointed to by vp is mounted. * * Returns: * EINVAL if it's not a disk * EBUSY if it's a disk and mounted * 0 if it's a disk and not mounted */ int rawdev_mounted(vnode_t *vp, vnode_t **bvpp) { vnode_t *bvp; dev_t dev; int d_type; bvp = NULL; d_type = D_OTHER; if (iskmemvp(vp)) return EINVAL; switch (vp->v_type) { case VCHR: { const struct cdevsw *cdev; dev = vp->v_rdev; cdev = cdevsw_lookup(dev); if (cdev != NULL) { dev_t blkdev; blkdev = devsw_chr2blk(dev); if (blkdev != NODEV) { if (vfinddev(blkdev, VBLK, &bvp) != 0) { d_type = (cdev->d_flag & D_TYPEMASK); /* XXX: what if bvp disappears? */ vrele(bvp); } } } break; } case VBLK: { const struct bdevsw *bdev; dev = vp->v_rdev; bdev = bdevsw_lookup(dev); if (bdev != NULL) d_type = (bdev->d_flag & D_TYPEMASK); bvp = vp; break; } default: break; } if (d_type != D_DISK) return EINVAL; if (bvpp != NULL) *bvpp = bvp; /* * XXX: This is bogus. We should be failing the request * XXX: not only if this specific slice is mounted, but * XXX: if it's on a disk with any other mounted slice. */ if (vfs_mountedon(bvp)) return EBUSY; return 0; } /* * Make a 'unique' number from a mount type name. */ long makefstype(const char *type) { long rv; for (rv = 0; *type; type++) { rv <<= 2; rv ^= *type; } return rv; } static struct mountlist_entry * mountlist_alloc(enum mountlist_type type, struct mount *mp) { struct mountlist_entry *me; me = kmem_zalloc(sizeof(*me), KM_SLEEP); me->me_mount = mp; me->me_type = type; return me; } static void mountlist_free(struct mountlist_entry *me) { kmem_free(me, sizeof(*me)); } void mountlist_iterator_init(mount_iterator_t **mip) { struct mountlist_entry *me; me = mountlist_alloc(ME_MARKER, NULL); mutex_enter(&mountlist_lock); TAILQ_INSERT_HEAD(&mountlist, me, me_list); mutex_exit(&mountlist_lock); *mip = (mount_iterator_t *)me; } void mountlist_iterator_destroy(mount_iterator_t *mi) { struct mountlist_entry *marker = &mi->mi_entry; if (marker->me_mount != NULL) vfs_unbusy(marker->me_mount); mutex_enter(&mountlist_lock); TAILQ_REMOVE(&mountlist, marker, me_list); mutex_exit(&mountlist_lock); mountlist_free(marker); } /* * Return the next mount or NULL for this iterator. * Mark it busy on success. 
*/ static inline struct mount * _mountlist_iterator_next(mount_iterator_t *mi, bool wait) { struct mountlist_entry *me, *marker = &mi->mi_entry; struct mount *mp; int error; if (marker->me_mount != NULL) { vfs_unbusy(marker->me_mount); marker->me_mount = NULL; } mutex_enter(&mountlist_lock); for (;;) { KASSERT(marker->me_type == ME_MARKER); me = TAILQ_NEXT(marker, me_list); if (me == NULL) { /* End of list: keep marker and return. */ mutex_exit(&mountlist_lock); return NULL; } TAILQ_REMOVE(&mountlist, marker, me_list); TAILQ_INSERT_AFTER(&mountlist, me, marker, me_list); /* Skip other markers. */ if (me->me_type != ME_MOUNT) continue; /* Take an initial reference for vfs_busy() below. */ mp = me->me_mount; KASSERT(mp != NULL); vfs_ref(mp); mutex_exit(&mountlist_lock); /* Try to mark this mount busy and return on success. */ if (wait) error = vfs_busy(mp); else error = vfs_trybusy(mp); if (error == 0) { vfs_rele(mp); marker->me_mount = mp; return mp; } vfs_rele(mp); mutex_enter(&mountlist_lock); } } struct mount * mountlist_iterator_next(mount_iterator_t *mi) { return _mountlist_iterator_next(mi, true); } struct mount * mountlist_iterator_trynext(mount_iterator_t *mi) { return _mountlist_iterator_next(mi, false); } /* * Attach new mount to the end of the mount list. */ void mountlist_append(struct mount *mp) { struct mountlist_entry *me; me = mountlist_alloc(ME_MOUNT, mp); mutex_enter(&mountlist_lock); TAILQ_INSERT_TAIL(&mountlist, me, me_list); mutex_exit(&mountlist_lock); } /* * Remove mount from mount list. */void mountlist_remove(struct mount *mp) { struct mountlist_entry *me; mutex_enter(&mountlist_lock); TAILQ_FOREACH(me, &mountlist, me_list) if (me->me_type == ME_MOUNT && me->me_mount == mp) break; KASSERT(me != NULL); TAILQ_REMOVE(&mountlist, me, me_list); mutex_exit(&mountlist_lock); mountlist_free(me); } /* * Unlocked variant to traverse the mountlist. * To be used from DDB only. */ struct mount * _mountlist_next(struct mount *mp) { struct mountlist_entry *me; if (mp == NULL) { me = TAILQ_FIRST(&mountlist); } else { TAILQ_FOREACH(me, &mountlist, me_list) if (me->me_type == ME_MOUNT && me->me_mount == mp) break; if (me != NULL) me = TAILQ_NEXT(me, me_list); } while (me != NULL && me->me_type != ME_MOUNT) me = TAILQ_NEXT(me, me_list); return (me ? me->me_mount : NULL); }
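/*
 * Illustrative sketch, not part of the original vfs_mount.c: a minimal,
 * hypothetical consumer of the mountlist iterator API defined above,
 * modelled on vfs_getvfs().  Each mount returned by
 * mountlist_iterator_next() has already been marked busy by the
 * iterator; that busy reference is dropped when the iterator advances
 * or is destroyed.  The function name and the match on f_mntonname are
 * assumptions made for this example only, and - like vfs_getvfs() -
 * it returns the pointer without taking its own reference, which the
 * original code flags as an XXX.
 */
#if 0
static struct mount *
example_getvfs_by_path(const char *path)
{
	mount_iterator_t *iter;
	struct mount *mp;

	mountlist_iterator_init(&iter);
	while ((mp = mountlist_iterator_next(iter)) != NULL) {
		if (strcmp(mp->mnt_stat.f_mntonname, path) == 0)
			break;
	}
	mountlist_iterator_destroy(iter);
	return mp;	/* NULL if no mounted file system matched */
}
#endif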
/* $NetBSD: kern_ktrace.c,v 1.184 2023/10/17 10:27:34 riastradh Exp $ */ /*- * Copyright (c) 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_ktrace.c 8.5 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_ktrace.c,v 1.184 2023/10/17 10:27:34 riastradh Exp $"); #include <sys/param.h> #include <sys/callout.h> #include <sys/cpu.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/ioctl.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/kthread.h> #include <sys/ktrace.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/syncobj.h> #include <sys/syscallargs.h> #include <sys/syslog.h> #include <sys/systm.h> /* * TODO: * - need better error reporting? * - userland utility to sort ktrace.out by timestamp. * - keep minimum information in ktrace_entry when rest of alloc failed. * - per trace control of configurable parameters. 
*/ struct ktrace_entry { TAILQ_ENTRY(ktrace_entry) kte_list; struct ktr_header kte_kth; void *kte_buf; size_t kte_bufsz; #define KTE_SPACE 32 uint8_t kte_space[KTE_SPACE] __aligned(sizeof(register_t)); }; struct ktr_desc { TAILQ_ENTRY(ktr_desc) ktd_list; int ktd_flags; #define KTDF_WAIT 0x0001 #define KTDF_DONE 0x0002 #define KTDF_BLOCKING 0x0004 #define KTDF_INTERACTIVE 0x0008 int ktd_error; #define KTDE_ENOMEM 0x0001 #define KTDE_ENOSPC 0x0002 int ktd_errcnt; int ktd_ref; /* # of reference */ int ktd_qcount; /* # of entry in the queue */ /* * Params to control behaviour. */ int ktd_delayqcnt; /* # of entry allowed to delay */ int ktd_wakedelay; /* delay of wakeup in *tick* */ int ktd_intrwakdl; /* ditto, but when interactive */ file_t *ktd_fp; /* trace output file */ lwp_t *ktd_lwp; /* our kernel thread */ TAILQ_HEAD(, ktrace_entry) ktd_queue; callout_t ktd_wakch; /* delayed wakeup */ kcondvar_t ktd_sync_cv; kcondvar_t ktd_cv; }; static void ktrwrite(struct ktr_desc *, struct ktrace_entry *); static int ktrops(lwp_t *, struct proc *, int, int, struct ktr_desc *); static int ktrsetchildren(lwp_t *, struct proc *, int, int, struct ktr_desc *); static int ktrcanset(lwp_t *, struct proc *); static int ktrsamefile(file_t *, file_t *); static void ktr_kmem(lwp_t *, int, const void *, size_t); static void ktr_io(lwp_t *, int, enum uio_rw, struct iovec *, size_t); static struct ktr_desc * ktd_lookup(file_t *); static void ktdrel(struct ktr_desc *); static void ktdref(struct ktr_desc *); static void ktefree(struct ktrace_entry *); static void ktd_logerrl(struct ktr_desc *, int); static void ktrace_thread(void *); static int ktrderefall(struct ktr_desc *, int); /* * Default values. */ #define KTD_MAXENTRY 1000 /* XXX: tune */ #define KTD_TIMEOUT 5 /* XXX: tune */ #define KTD_DELAYQCNT 100 /* XXX: tune */ #define KTD_WAKEDELAY 5000 /* XXX: tune */ #define KTD_INTRWAKDL 100 /* XXX: tune */ /* * Patchable variables. */ int ktd_maxentry = KTD_MAXENTRY; /* max # of entry in the queue */ int ktd_timeout = KTD_TIMEOUT; /* timeout in seconds */ int ktd_delayqcnt = KTD_DELAYQCNT; /* # of entry allowed to delay */ int ktd_wakedelay = KTD_WAKEDELAY; /* delay of wakeup in *ms* */ int ktd_intrwakdl = KTD_INTRWAKDL; /* ditto, but when interactive */ kmutex_t ktrace_lock; int ktrace_on; static TAILQ_HEAD(, ktr_desc) ktdq = TAILQ_HEAD_INITIALIZER(ktdq); static pool_cache_t kte_cache; static kauth_listener_t ktrace_listener; static void ktd_wakeup(struct ktr_desc *ktd) { callout_stop(&ktd->ktd_wakch); cv_signal(&ktd->ktd_cv); } static void ktd_callout(void *arg) { mutex_enter(&ktrace_lock); ktd_wakeup(arg); mutex_exit(&ktrace_lock); } static void ktd_logerrl(struct ktr_desc *ktd, int error) { ktd->ktd_error |= error; ktd->ktd_errcnt++; } #if 0 static void ktd_logerr(struct proc *p, int error) { struct ktr_desc *ktd; KASSERT(mutex_owned(&ktrace_lock)); ktd = p->p_tracep; if (ktd == NULL) return; ktd_logerrl(ktd, error); } #endif static int ktrace_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { struct proc *p; int result; enum kauth_process_req req; result = KAUTH_RESULT_DEFER; p = arg0; if (action != KAUTH_PROCESS_KTRACE) return result; req = (enum kauth_process_req)(uintptr_t)arg1; /* Privileged; secmodel should handle these. 
*/ if (req == KAUTH_REQ_PROCESS_KTRACE_PERSISTENT) return result; if ((p->p_traceflag & KTRFAC_PERSISTENT) || (p->p_flag & PK_SUGID)) return result; if (kauth_cred_geteuid(cred) == kauth_cred_getuid(p->p_cred) && kauth_cred_getuid(cred) == kauth_cred_getsvuid(p->p_cred) && kauth_cred_getgid(cred) == kauth_cred_getgid(p->p_cred) && kauth_cred_getgid(cred) == kauth_cred_getsvgid(p->p_cred)) result = KAUTH_RESULT_ALLOW; return result; } /* * Initialise the ktrace system. */ void ktrinit(void) { mutex_init(&ktrace_lock, MUTEX_DEFAULT, IPL_NONE); kte_cache = pool_cache_init(sizeof(struct ktrace_entry), 0, 0, 0, "ktrace", &pool_allocator_nointr, IPL_NONE, NULL, NULL, NULL); ktrace_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS, ktrace_listener_cb, NULL); } /* * Release a reference. Called with ktrace_lock held. */ static void ktdrel(struct ktr_desc *ktd) { KASSERT(mutex_owned(&ktrace_lock)); KDASSERT(ktd->ktd_ref != 0); KASSERT(ktd->ktd_ref > 0); KASSERT(ktrace_on > 0); ktrace_on--; if (--ktd->ktd_ref <= 0) { ktd->ktd_flags |= KTDF_DONE; cv_signal(&ktd->ktd_cv); } } static void ktdref(struct ktr_desc *ktd) { KASSERT(mutex_owned(&ktrace_lock)); ktd->ktd_ref++; ktrace_on++; } static struct ktr_desc * ktd_lookup(file_t *fp) { struct ktr_desc *ktd; KASSERT(mutex_owned(&ktrace_lock)); for (ktd = TAILQ_FIRST(&ktdq); ktd != NULL; ktd = TAILQ_NEXT(ktd, ktd_list)) { if (ktrsamefile(ktd->ktd_fp, fp)) { ktdref(ktd); break; } } return (ktd); } void ktraddentry(lwp_t *l, struct ktrace_entry *kte, int flags) { struct proc *p = l->l_proc; struct ktr_desc *ktd; #ifdef DEBUG struct timeval t1, t2; #endif mutex_enter(&ktrace_lock); if (p->p_traceflag & KTRFAC_TRC_EMUL) { /* Add emulation trace before first entry for this process */ p->p_traceflag &= ~KTRFAC_TRC_EMUL; mutex_exit(&ktrace_lock); ktrexit(l); ktremul(); (void)ktrenter(l); mutex_enter(&ktrace_lock); } /* Tracing may have been cancelled. */ ktd = p->p_tracep; if (ktd == NULL) goto freekte; /* * Bump reference count so that the object will remain while * we are here. Note that the trace is controlled by other * process. */ ktdref(ktd); if (ktd->ktd_flags & KTDF_DONE) goto relktd; if (ktd->ktd_qcount > ktd_maxentry) { ktd_logerrl(ktd, KTDE_ENOSPC); goto relktd; } TAILQ_INSERT_TAIL(&ktd->ktd_queue, kte, kte_list); ktd->ktd_qcount++; if (ktd->ktd_flags & KTDF_BLOCKING) goto skip_sync; if (flags & KTA_WAITOK && (/* flags & KTA_LARGE */0 || ktd->ktd_flags & KTDF_WAIT || ktd->ktd_qcount > ktd_maxentry >> 1)) /* * Sync with writer thread since we're requesting rather * big one or many requests are pending. */ do { ktd->ktd_flags |= KTDF_WAIT; ktd_wakeup(ktd); #ifdef DEBUG getmicrouptime(&t1); #endif if (cv_timedwait(&ktd->ktd_sync_cv, &ktrace_lock, ktd_timeout * hz) != 0) { ktd->ktd_flags |= KTDF_BLOCKING; /* * Maybe the writer thread is blocking * completely for some reason, but * don't stop target process forever. */ log(LOG_NOTICE, "ktrace timeout\n"); break; } #ifdef DEBUG getmicrouptime(&t2); timersub(&t2, &t1, &t2); if (t2.tv_sec > 0) log(LOG_NOTICE, "ktrace long wait: %lld.%06ld\n", (long long)t2.tv_sec, (long)t2.tv_usec); #endif } while (p->p_tracep == ktd && (ktd->ktd_flags & (KTDF_WAIT | KTDF_DONE)) == KTDF_WAIT); else { /* Schedule delayed wakeup */ if (ktd->ktd_qcount > ktd->ktd_delayqcnt) ktd_wakeup(ktd); /* Wakeup now */ else if (!callout_pending(&ktd->ktd_wakch)) callout_reset(&ktd->ktd_wakch, ktd->ktd_flags & KTDF_INTERACTIVE ? 
ktd->ktd_intrwakdl : ktd->ktd_wakedelay, ktd_callout, ktd); } skip_sync: ktdrel(ktd); mutex_exit(&ktrace_lock); ktrexit(l); return; relktd: ktdrel(ktd); freekte: mutex_exit(&ktrace_lock); ktefree(kte); ktrexit(l); } static void ktefree(struct ktrace_entry *kte) { if (kte->kte_buf != kte->kte_space) kmem_free(kte->kte_buf, kte->kte_bufsz); pool_cache_put(kte_cache, kte); } /* * "deep" compare of two files for the purposes of clearing a trace. * Returns true if they're the same open file, or if they point at the * same underlying vnode/socket. */ static int ktrsamefile(file_t *f1, file_t *f2) { return ((f1 == f2) || ((f1 != NULL) && (f2 != NULL) && (f1->f_type == f2->f_type) && (f1->f_data == f2->f_data))); } void ktrderef(struct proc *p) { struct ktr_desc *ktd = p->p_tracep; KASSERT(mutex_owned(&ktrace_lock)); p->p_traceflag = 0; if (ktd == NULL) return; p->p_tracep = NULL; cv_broadcast(&ktd->ktd_sync_cv); ktdrel(ktd); } void ktradref(struct proc *p) { struct ktr_desc *ktd = p->p_tracep; KASSERT(mutex_owned(&ktrace_lock)); ktdref(ktd); } static int ktrderefall(struct ktr_desc *ktd, int auth) { lwp_t *curl = curlwp; struct proc *p; int error = 0; mutex_enter(&proc_lock); PROCLIST_FOREACH(p, &allproc) { if (p->p_tracep != ktd) continue; mutex_enter(p->p_lock); mutex_enter(&ktrace_lock); if (p->p_tracep == ktd) { if (!auth || ktrcanset(curl, p)) ktrderef(p); else error = EPERM; } mutex_exit(&ktrace_lock); mutex_exit(p->p_lock); } mutex_exit(&proc_lock); return error; } int ktealloc(struct ktrace_entry **ktep, void **bufp, lwp_t *l, int type, size_t sz) { struct proc *p = l->l_proc; struct ktrace_entry *kte; struct ktr_header *kth; void *buf; if (ktrenter(l)) return EAGAIN; kte = pool_cache_get(kte_cache, PR_WAITOK); if (sz > sizeof(kte->kte_space)) { buf = kmem_alloc(sz, KM_SLEEP); } else buf = kte->kte_space; kte->kte_bufsz = sz; kte->kte_buf = buf; kth = &kte->kte_kth; (void)memset(kth, 0, sizeof(*kth)); kth->ktr_len = sz; kth->ktr_type = type; kth->ktr_pid = p->p_pid; memcpy(kth->ktr_comm, p->p_comm, MAXCOMLEN); kth->ktr_version = KTRFAC_VERSION(p->p_traceflag); kth->ktr_lid = l->l_lid; nanotime(&kth->ktr_ts); *ktep = kte; *bufp = buf; return 0; } void ktesethdrlen(struct ktrace_entry *kte, size_t l) { kte->kte_kth.ktr_len = l; } void ktr_syscall(register_t code, const register_t args[], int narg) { lwp_t *l = curlwp; struct proc *p = l->l_proc; struct ktrace_entry *kte; struct ktr_syscall *ktp; register_t *argp; size_t len; u_int i; if (!KTRPOINT(p, KTR_SYSCALL)) return; len = sizeof(struct ktr_syscall) + narg * sizeof argp[0]; if (ktealloc(&kte, (void *)&ktp, l, KTR_SYSCALL, len)) return; ktp->ktr_code = code; ktp->ktr_argsize = narg * sizeof argp[0]; argp = (register_t *)(ktp + 1); for (i = 0; i < narg; i++) *argp++ = args[i]; ktraddentry(l, kte, KTA_WAITOK); } void ktr_sysret(register_t code, int error, register_t *retval) { lwp_t *l = curlwp; struct ktrace_entry *kte; struct ktr_sysret *ktp; if (!KTRPOINT(l->l_proc, KTR_SYSRET)) return; if (ktealloc(&kte, (void *)&ktp, l, KTR_SYSRET, sizeof(struct ktr_sysret))) return; ktp->ktr_code = code; ktp->ktr_eosys = 0; /* XXX unused */ ktp->ktr_error = error; ktp->ktr_retval = retval && error == 0 ? retval[0] : 0; ktp->ktr_retval_1 = retval && error == 0 ? 
retval[1] : 0; ktraddentry(l, kte, KTA_WAITOK); } void ktr_namei(const char *path, size_t pathlen) { lwp_t *l = curlwp; if (!KTRPOINT(l->l_proc, KTR_NAMEI)) return; ktr_kmem(l, KTR_NAMEI, path, pathlen); } void ktr_namei2(const char *eroot, size_t erootlen, const char *path, size_t pathlen) { lwp_t *l = curlwp; struct ktrace_entry *kte; void *buf; if (!KTRPOINT(l->l_proc, KTR_NAMEI)) return; if (ktealloc(&kte, &buf, l, KTR_NAMEI, erootlen + pathlen)) return; memcpy(buf, eroot, erootlen); buf = (char *)buf + erootlen; memcpy(buf, path, pathlen); ktraddentry(l, kte, KTA_WAITOK); } void ktr_emul(void) { lwp_t *l = curlwp; const char *emul = l->l_proc->p_emul->e_name; if (!KTRPOINT(l->l_proc, KTR_EMUL)) return; ktr_kmem(l, KTR_EMUL, emul, strlen(emul)); } void ktr_execarg(const void *bf, size_t len) { lwp_t *l = curlwp; if (!KTRPOINT(l->l_proc, KTR_EXEC_ARG)) return; ktr_kmem(l, KTR_EXEC_ARG, bf, len); } void ktr_execenv(const void *bf, size_t len) { lwp_t *l = curlwp; if (!KTRPOINT(l->l_proc, KTR_EXEC_ENV)) return; ktr_kmem(l, KTR_EXEC_ENV, bf, len); } void ktr_execfd(int fd, u_int dtype) { struct ktrace_entry *kte; struct ktr_execfd* ktp; lwp_t *l = curlwp; if (!KTRPOINT(l->l_proc, KTR_EXEC_FD)) return; if (ktealloc(&kte, (void *)&ktp, l, KTR_EXEC_FD, sizeof(*ktp))) return; ktp->ktr_fd = fd; ktp->ktr_dtype = dtype; ktraddentry(l, kte, KTA_WAITOK); } static void ktr_kmem(lwp_t *l, int type, const void *bf, size_t len) { struct ktrace_entry *kte; void *buf; if (ktealloc(&kte, &buf, l, type, len)) return; memcpy(buf, bf, len); ktraddentry(l, kte, KTA_WAITOK); } static void ktr_io(lwp_t *l, int fd, enum uio_rw rw, struct iovec *iov, size_t len) { struct ktrace_entry *kte; struct ktr_genio *ktp; size_t resid = len, cnt, buflen; char *cp; next: buflen = uimin(PAGE_SIZE, resid + sizeof(struct ktr_genio)); if (ktealloc(&kte, (void *)&ktp, l, KTR_GENIO, buflen)) return; ktp->ktr_fd = fd; ktp->ktr_rw = rw; cp = (void *)(ktp + 1); buflen -= sizeof(struct ktr_genio); kte->kte_kth.ktr_len = sizeof(struct ktr_genio); while (buflen > 0) { cnt = uimin(iov->iov_len, buflen); if (copyin(iov->iov_base, cp, cnt) != 0) goto out; kte->kte_kth.ktr_len += cnt; cp += cnt; buflen -= cnt; resid -= cnt; iov->iov_len -= cnt; if (iov->iov_len == 0) iov++; else iov->iov_base = (char *)iov->iov_base + cnt; } /* * Don't push so many entry at once. It will cause kmem map * shortage. 
*/ ktraddentry(l, kte, KTA_WAITOK | KTA_LARGE); if (resid > 0) { if (preempt_needed()) { (void)ktrenter(l); preempt(); ktrexit(l); } goto next; } return; out: ktefree(kte); ktrexit(l); } void ktr_genio(int fd, enum uio_rw rw, const void *addr, size_t len, int error) { lwp_t *l = curlwp; struct iovec iov; if (!KTRPOINT(l->l_proc, KTR_GENIO) || error != 0) return; iov.iov_base = __UNCONST(addr); iov.iov_len = len; ktr_io(l, fd, rw, &iov, len); } void ktr_geniov(int fd, enum uio_rw rw, struct iovec *iov, size_t len, int error) { lwp_t *l = curlwp; if (!KTRPOINT(l->l_proc, KTR_GENIO) || error != 0) return; ktr_io(l, fd, rw, iov, len); } void ktr_mibio(int fd, enum uio_rw rw, const void *addr, size_t len, int error) { lwp_t *l = curlwp; struct iovec iov; if (!KTRPOINT(l->l_proc, KTR_MIB) || error != 0) return; iov.iov_base = __UNCONST(addr); iov.iov_len = len; ktr_io(l, fd, rw, &iov, len); } void ktr_psig(int sig, sig_t action, const sigset_t *mask, const ksiginfo_t *ksi) { struct ktrace_entry *kte; lwp_t *l = curlwp; struct { struct ktr_psig kp; siginfo_t si; } *kbuf; if (!KTRPOINT(l->l_proc, KTR_PSIG)) return; if (ktealloc(&kte, (void *)&kbuf, l, KTR_PSIG, sizeof(*kbuf))) return; memset(&kbuf->kp, 0, sizeof(kbuf->kp)); kbuf->kp.signo = (char)sig; kbuf->kp.action = action; kbuf->kp.mask = *mask; if (ksi) { kbuf->kp.code = KSI_TRAPCODE(ksi); (void)memset(&kbuf->si, 0, sizeof(kbuf->si)); kbuf->si._info = ksi->ksi_info; kte->kte_kth.ktr_len = sizeof(*kbuf); } else { kbuf->kp.code = 0; kte->kte_kth.ktr_len = sizeof(struct ktr_psig); } ktraddentry(l, kte, KTA_WAITOK); } void ktr_csw(int out, int user, const struct syncobj *syncobj) { lwp_t *l = curlwp; struct proc *p = l->l_proc; struct ktrace_entry *kte; struct ktr_csw *kc; if (!KTRPOINT(p, KTR_CSW)) return; /* * Don't record context switches resulting from blocking on * locks; the results are not useful, and the mutex may be in a * softint, which would lead us to ktealloc in softint context, * which is forbidden. */ if (syncobj == &mutex_syncobj || syncobj == &rw_syncobj) return; KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); /* * We can't sleep if we're already going to sleep (if original * condition is met during sleep, we hang up). * * XXX This is not ideal: it would be better to maintain a pool * of ktes and actually push this to the kthread when context * switch happens, however given the points where we are called * from that is difficult to do. */ if (out) { if (ktrenter(l)) return; nanotime(&l->l_ktrcsw); l->l_pflag |= LP_KTRCSW; if (user) l->l_pflag |= LP_KTRCSWUSER; else l->l_pflag &= ~LP_KTRCSWUSER; ktrexit(l); return; } /* * On the way back in, we need to record twice: once for entry, and * once for exit. 
*/ if ((l->l_pflag & LP_KTRCSW) != 0) { struct timespec *ts; l->l_pflag &= ~LP_KTRCSW; if (ktealloc(&kte, (void *)&kc, l, KTR_CSW, sizeof(*kc))) return; kc->out = 1; kc->user = ((l->l_pflag & LP_KTRCSWUSER) != 0); ts = &l->l_ktrcsw; switch (KTRFAC_VERSION(p->p_traceflag)) { case 0: kte->kte_kth.ktr_otv.tv_sec = ts->tv_sec; kte->kte_kth.ktr_otv.tv_usec = ts->tv_nsec / 1000; break; case 1: kte->kte_kth.ktr_ots.tv_sec = ts->tv_sec; kte->kte_kth.ktr_ots.tv_nsec = ts->tv_nsec; break; case 2: kte->kte_kth.ktr_ts.tv_sec = ts->tv_sec; kte->kte_kth.ktr_ts.tv_nsec = ts->tv_nsec; break; default: break; } ktraddentry(l, kte, KTA_WAITOK); } if (ktealloc(&kte, (void *)&kc, l, KTR_CSW, sizeof(*kc))) return; kc->out = 0; kc->user = user; ktraddentry(l, kte, KTA_WAITOK); } bool ktr_point(int fac_bit) { return curlwp->l_proc->p_traceflag & fac_bit; } int ktruser(const char *id, void *addr, size_t len, int ustr) { struct ktrace_entry *kte; struct ktr_user *ktp; lwp_t *l = curlwp; void *user_dta; int error; if (!KTRPOINT(l->l_proc, KTR_USER)) return 0; if (len > KTR_USER_MAXLEN) return ENOSPC; error = ktealloc(&kte, (void *)&ktp, l, KTR_USER, sizeof(*ktp) + len); if (error != 0) return error; if (ustr) { if (copyinstr(id, ktp->ktr_id, KTR_USER_MAXIDLEN, NULL) != 0) ktp->ktr_id[0] = '\0'; } else strncpy(ktp->ktr_id, id, KTR_USER_MAXIDLEN); ktp->ktr_id[KTR_USER_MAXIDLEN-1] = '\0'; user_dta = (void *)(ktp + 1); if ((error = copyin(addr, user_dta, len)) != 0) kte->kte_kth.ktr_len = 0; ktraddentry(l, kte, KTA_WAITOK); return error; } void ktr_kuser(const char *id, const void *addr, size_t len) { struct ktrace_entry *kte; struct ktr_user *ktp; lwp_t *l = curlwp; int error; if (!KTRPOINT(l->l_proc, KTR_USER)) return; if (len > KTR_USER_MAXLEN) return; error = ktealloc(&kte, (void *)&ktp, l, KTR_USER, sizeof(*ktp) + len); if (error != 0) return; strncpy(ktp->ktr_id, id, KTR_USER_MAXIDLEN - 1); ktp->ktr_id[KTR_USER_MAXIDLEN - 1] = '\0'; memcpy(ktp + 1, addr, len); ktraddentry(l, kte, KTA_WAITOK); } void ktr_mib(const int *name, u_int namelen) { struct ktrace_entry *kte; int *namep; size_t size; lwp_t *l = curlwp; if (!KTRPOINT(l->l_proc, KTR_MIB)) return; size = namelen * sizeof(*name); if (ktealloc(&kte, (void *)&namep, l, KTR_MIB, size)) return; (void)memcpy(namep, name, namelen * sizeof(*name)); ktraddentry(l, kte, KTA_WAITOK); } /* Interface and common routines */ int ktrace_common(lwp_t *curl, int ops, int facs, int pid, file_t **fpp) { struct proc *p; struct pgrp *pg; struct ktr_desc *ktd = NULL, *nktd; file_t *fp = *fpp; int ret = 0; int error = 0; int descend; descend = ops & KTRFLAG_DESCEND; facs = facs & ~((unsigned) KTRFAC_PERSISTENT); (void)ktrenter(curl); switch (KTROP(ops)) { case KTROP_CLEARFILE: /* * Clear all uses of the tracefile */ mutex_enter(&ktrace_lock); ktd = ktd_lookup(fp); mutex_exit(&ktrace_lock); if (ktd == NULL) goto done; error = ktrderefall(ktd, 1); goto done; case KTROP_SET: mutex_enter(&ktrace_lock); ktd = ktd_lookup(fp); mutex_exit(&ktrace_lock); if (ktd == NULL) { nktd = kmem_alloc(sizeof(*nktd), KM_SLEEP); TAILQ_INIT(&nktd->ktd_queue); callout_init(&nktd->ktd_wakch, CALLOUT_MPSAFE); cv_init(&nktd->ktd_cv, "ktrwait"); cv_init(&nktd->ktd_sync_cv, "ktrsync"); nktd->ktd_flags = 0; nktd->ktd_qcount = 0; nktd->ktd_error = 0; nktd->ktd_errcnt = 0; nktd->ktd_delayqcnt = ktd_delayqcnt; nktd->ktd_wakedelay = mstohz(ktd_wakedelay); nktd->ktd_intrwakdl = mstohz(ktd_intrwakdl); nktd->ktd_ref = 0; nktd->ktd_fp = fp; mutex_enter(&ktrace_lock); ktdref(nktd); mutex_exit(&ktrace_lock); /* * XXX: not 
correct. needs an way to detect * whether ktruss or ktrace. */ if (fp->f_type == DTYPE_PIPE) nktd->ktd_flags |= KTDF_INTERACTIVE; mutex_enter(&fp->f_lock); fp->f_count++; mutex_exit(&fp->f_lock); error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, ktrace_thread, nktd, &nktd->ktd_lwp, "ktrace"); if (error != 0) { kmem_free(nktd, sizeof(*nktd)); nktd = NULL; mutex_enter(&fp->f_lock); fp->f_count--; mutex_exit(&fp->f_lock); goto done; } mutex_enter(&ktrace_lock); ktd = ktd_lookup(fp); if (ktd != NULL) { ktdrel(nktd); nktd = NULL; } else { TAILQ_INSERT_TAIL(&ktdq, nktd, ktd_list); ktd = nktd; } mutex_exit(&ktrace_lock); } break; case KTROP_CLEAR: break; } /* * need something to (un)trace (XXX - why is this here?) */ if (!facs) { error = EINVAL; *fpp = NULL; goto done; } /* * do it */ mutex_enter(&proc_lock); if (pid < 0) { /* * by process group */ pg = pgrp_find(-pid); if (pg == NULL) error = ESRCH; else { LIST_FOREACH(p, &pg->pg_members, p_pglist) { if (descend) ret |= ktrsetchildren(curl, p, ops, facs, ktd); else ret |= ktrops(curl, p, ops, facs, ktd); } } } else { /* * by pid */ p = proc_find(pid); if (p == NULL) error = ESRCH; else if (descend) ret |= ktrsetchildren(curl, p, ops, facs, ktd); else ret |= ktrops(curl, p, ops, facs, ktd); } mutex_exit(&proc_lock); if (error == 0 && !ret) error = EPERM; *fpp = NULL; done: if (ktd != NULL) { mutex_enter(&ktrace_lock); if (error != 0) { /* * Wakeup the thread so that it can be die if we * can't trace any process. */ ktd_wakeup(ktd); } if (KTROP(ops) == KTROP_SET || KTROP(ops) == KTROP_CLEARFILE) ktdrel(ktd); mutex_exit(&ktrace_lock); } ktrexit(curl); return (error); } /* * fktrace system call */ /* ARGSUSED */ int sys_fktrace(struct lwp *l, const struct sys_fktrace_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) ops; syscallarg(int) facs; syscallarg(int) pid; } */ file_t *fp; int error, fd; fd = SCARG(uap, fd); if ((fp = fd_getfile(fd)) == NULL) return (EBADF); if ((fp->f_flag & FWRITE) == 0) error = EBADF; else error = ktrace_common(l, SCARG(uap, ops), SCARG(uap, facs), SCARG(uap, pid), &fp); fd_putfile(fd); return error; } static int ktrops(lwp_t *curl, struct proc *p, int ops, int facs, struct ktr_desc *ktd) { int vers = ops & KTRFAC_VER_MASK; int error = 0; mutex_enter(p->p_lock); mutex_enter(&ktrace_lock); if (!ktrcanset(curl, p)) goto out; switch (vers) { case KTRFACv0: case KTRFACv1: case KTRFACv2: break; default: error = EINVAL; goto out; } if (KTROP(ops) == KTROP_SET) { if (p->p_tracep != ktd) { /* * if trace file already in use, relinquish */ ktrderef(p); p->p_tracep = ktd; ktradref(p); } p->p_traceflag |= facs; if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KTRACE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_KTRACE_PERSISTENT), NULL, NULL) == 0) p->p_traceflag |= KTRFAC_PERSISTENT; } else { /* KTROP_CLEAR */ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { /* no more tracing */ ktrderef(p); } } if (p->p_traceflag) p->p_traceflag |= vers; /* * Emit an emulation record, every time there is a ktrace * change/attach request. */ if (KTRPOINT(p, KTR_EMUL)) p->p_traceflag |= KTRFAC_TRC_EMUL; p->p_trace_enabled = trace_is_enabled(p); #ifdef __HAVE_SYSCALL_INTERN (*p->p_emul->e_syscall_intern)(p); #endif out: mutex_exit(&ktrace_lock); mutex_exit(p->p_lock); return error ? 
0 : 1; } static int ktrsetchildren(lwp_t *curl, struct proc *top, int ops, int facs, struct ktr_desc *ktd) { struct proc *p; int ret = 0; KASSERT(mutex_owned(&proc_lock)); p = top; for (;;) { ret |= ktrops(curl, p, ops, facs, ktd); /* * If this process has children, descend to them next, * otherwise do any siblings, and if done with this level, * follow back up the tree (but not past top). */ if (LIST_FIRST(&p->p_children) != NULL) { p = LIST_FIRST(&p->p_children); continue; } for (;;) { if (p == top) return (ret); if (LIST_NEXT(p, p_sibling) != NULL) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } /*NOTREACHED*/ } static void ktrwrite(struct ktr_desc *ktd, struct ktrace_entry *kte) { size_t hlen; struct uio auio; struct iovec aiov[64], *iov; struct ktrace_entry *top = kte; struct ktr_header *kth; file_t *fp = ktd->ktd_fp; int error; next: auio.uio_iov = iov = &aiov[0]; auio.uio_offset = 0; auio.uio_rw = UIO_WRITE; auio.uio_resid = 0; auio.uio_iovcnt = 0; UIO_SETUP_SYSSPACE(&auio); do { struct timespec ts; lwpid_t lid; kth = &kte->kte_kth; hlen = sizeof(struct ktr_header); switch (kth->ktr_version) { case 0: ts = kth->ktr_time; kth->ktr_otv.tv_sec = ts.tv_sec; kth->ktr_otv.tv_usec = ts.tv_nsec / 1000; kth->ktr_unused = NULL; hlen -= sizeof(kth->_v) - MAX(sizeof(kth->_v._v0), sizeof(kth->_v._v1)); break; case 1: ts = kth->ktr_time; lid = kth->ktr_lid; kth->ktr_ots.tv_sec = ts.tv_sec; kth->ktr_ots.tv_nsec = ts.tv_nsec; kth->ktr_olid = lid; hlen -= sizeof(kth->_v) - MAX(sizeof(kth->_v._v0), sizeof(kth->_v._v1)); break; } iov->iov_base = (void *)kth; iov++->iov_len = hlen; auio.uio_resid += hlen; auio.uio_iovcnt++; if (kth->ktr_len > 0) { iov->iov_base = kte->kte_buf; iov++->iov_len = kth->ktr_len; auio.uio_resid += kth->ktr_len; auio.uio_iovcnt++; } } while ((kte = TAILQ_NEXT(kte, kte_list)) != NULL && auio.uio_iovcnt < sizeof(aiov) / sizeof(aiov[0]) - 1); again: error = (*fp->f_ops->fo_write)(fp, &fp->f_offset, &auio, fp->f_cred, FOF_UPDATE_OFFSET); switch (error) { case 0: if (auio.uio_resid > 0) goto again; if (kte != NULL) goto next; break; case EWOULDBLOCK: kpause("ktrzzz", false, 1, NULL); goto again; default: /* * If error encountered, give up tracing on this * vnode. Don't report EPIPE as this can easily * happen with fktrace()/ktruss. 
*/ #ifndef DEBUG if (error != EPIPE) #endif log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", error); (void)ktrderefall(ktd, 0); } while ((kte = top) != NULL) { top = TAILQ_NEXT(top, kte_list); ktefree(kte); } } static void ktrace_thread(void *arg) { struct ktr_desc *ktd = arg; file_t *fp = ktd->ktd_fp; struct ktrace_entry *kte; int ktrerr, errcnt; mutex_enter(&ktrace_lock); for (;;) { kte = TAILQ_FIRST(&ktd->ktd_queue); if (kte == NULL) { if (ktd->ktd_flags & KTDF_WAIT) { ktd->ktd_flags &= ~(KTDF_WAIT | KTDF_BLOCKING); cv_broadcast(&ktd->ktd_sync_cv); } if (ktd->ktd_ref == 0) break; cv_wait(&ktd->ktd_cv, &ktrace_lock); continue; } TAILQ_INIT(&ktd->ktd_queue); ktd->ktd_qcount = 0; ktrerr = ktd->ktd_error; errcnt = ktd->ktd_errcnt; ktd->ktd_error = ktd->ktd_errcnt = 0; mutex_exit(&ktrace_lock); if (ktrerr) { log(LOG_NOTICE, "ktrace failed, fp %p, error 0x%x, total %d\n", fp, ktrerr, errcnt); } ktrwrite(ktd, kte); mutex_enter(&ktrace_lock); } if (ktd_lookup(ktd->ktd_fp) == ktd) { TAILQ_REMOVE(&ktdq, ktd, ktd_list); } else { /* nothing, collision in KTROP_SET */ } callout_halt(&ktd->ktd_wakch, &ktrace_lock); callout_destroy(&ktd->ktd_wakch); mutex_exit(&ktrace_lock); /* * ktrace file descriptor can't be watched (are not visible to * userspace), so no kqueue stuff here * XXX: The above comment is wrong, because the fktrace file * descriptor is available in userland. */ closef(fp); cv_destroy(&ktd->ktd_sync_cv); cv_destroy(&ktd->ktd_cv); kmem_free(ktd, sizeof(*ktd)); kthread_exit(0); } /* * Return true if caller has permission to set the ktracing state * of target. Essentially, the target can't possess any * more permissions than the caller. KTRFAC_PERSISTENT signifies that * the tracing will persist on sugid processes during exec; it is only * settable by a process with appropriate credentials. * * TODO: check groups. use caller effective gid. */ static int ktrcanset(lwp_t *calll, struct proc *targetp) { KASSERT(mutex_owned(targetp->p_lock)); KASSERT(mutex_owned(&ktrace_lock)); if (kauth_authorize_process(calll->l_cred, KAUTH_PROCESS_KTRACE, targetp, NULL, NULL, NULL) == 0) return (1); return (0); } /* * Put user defined entry to ktrace records. */ int sys_utrace(struct lwp *l, const struct sys_utrace_args *uap, register_t *retval) { /* { syscallarg(const char *) label; syscallarg(void *) addr; syscallarg(size_t) len; } */ return ktruser(SCARG(uap, label), SCARG(uap, addr), SCARG(uap, len), 1); }
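/*
 * Illustrative sketch, not part of the original kern_ktrace.c: how the
 * utrace(2) entry point implemented by sys_utrace() above is typically
 * driven from userland.  A process being traced with ktrace(1), with the
 * user trace point enabled, can insert its own KTR_USER records into the
 * trace; the label must fit in KTR_USER_MAXIDLEN and the payload in
 * KTR_USER_MAXLEN, matching the checks in ktruser().  The label "demo"
 * and the payload below are arbitrary example values.
 */
#if 0
#include <sys/ktrace.h>

#include <stdio.h>

int
main(void)
{
	int value = 42;

	/* Appears in the kdump(1) output as a user record labelled "demo". */
	if (utrace("demo", &value, sizeof(value)) == -1)
		perror("utrace");
	return 0;
}
#endif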
/* $NetBSD: kern_threadpool.c,v 1.23 2021/01/23 16:33:49 riastradh Exp $ */ /*- * Copyright (c) 2014, 2018 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell and Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Thread pools. * * A thread pool is a collection of worker threads idle or running * jobs, together with a dispatcher thread that does not run jobs but * can be given jobs to assign to a worker thread. Scheduling a job in * a thread pool does not allocate or even sleep at all, except perhaps * on an adaptive lock, unlike kthread_create. Jobs reuse threads, so * they do not incur the expense of creating and destroying kthreads * unless there is not much work to be done. * * A per-CPU thread pool (threadpool_percpu) is a collection of thread * pools, one per CPU bound to that CPU. For each priority level in * use, there is one shared unbound thread pool (i.e., pool of threads * not bound to any CPU) and one shared per-CPU thread pool.
* * To use the unbound thread pool at priority pri, call * threadpool_get(&pool, pri). When you're done, call * threadpool_put(pool, pri). * * To use the per-CPU thread pools at priority pri, call * threadpool_percpu_get(&pool_percpu, pri), and then use the thread * pool returned by threadpool_percpu_ref(pool_percpu) for the current * CPU, or by threadpool_percpu_ref_remote(pool_percpu, ci) for another * CPU. When you're done, call threadpool_percpu_put(pool_percpu, * pri). * * +--MACHINE-----------------------------------------------------+ * | +--CPU 0---------+ +--CPU 1---------+ +--CPU n---------+ | * | | <dispatcher 0> | | <dispatcher 1> | ... | <dispatcher n> | | * | | <idle 0a> | | <running 1a> | ... | <idle na> | | * | | <running 0b> | | <running 1b> | ... | <idle nb> | | * | | . | | . | ... | . | | * | | . | | . | ... | . | | * | | . | | . | ... | . | | * | +----------------+ +----------------+ +----------------+ | * | +--unbound-----------+ | * | | <dispatcher n+1> | | * | | <idle (n+1)a> | | * | | <running (n+1)b> | | * | +--------------------+ | * +--------------------------------------------------------------+ * * XXX Why one dispatcher per CPU? I did that originally to avoid * touching remote CPUs' memory when scheduling a job, but that still * requires interprocessor synchronization. Perhaps we could get by * with a single dispatcher thread, at the expense of another pointer * in struct threadpool_job to identify the CPU on which it must run in * order for the dispatcher to schedule it correctly. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_threadpool.c,v 1.23 2021/01/23 16:33:49 riastradh Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/atomic.h> #include <sys/condvar.h> #include <sys/cpu.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/kthread.h> #include <sys/mutex.h> #include <sys/once.h> #include <sys/percpu.h> #include <sys/pool.h> #include <sys/proc.h> #include <sys/queue.h> #include <sys/sdt.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/threadpool.h> /* Probes */ SDT_PROBE_DEFINE1(sdt, kernel, threadpool, get, "pri_t"/*pri*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, get__create, "pri_t"/*pri*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, get__race, "pri_t"/*pri*/); SDT_PROBE_DEFINE2(sdt, kernel, threadpool, put, "struct threadpool *"/*pool*/, "pri_t"/*pri*/); SDT_PROBE_DEFINE2(sdt, kernel, threadpool, put__destroy, "struct threadpool *"/*pool*/, "pri_t"/*pri*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, percpu__get, "pri_t"/*pri*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, percpu__get__create, "pri_t"/*pri*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, percpu__get__race, "pri_t"/*pri*/); SDT_PROBE_DEFINE2(sdt, kernel, threadpool, percpu__put, "struct threadpool *"/*pool*/, "pri_t"/*pri*/); SDT_PROBE_DEFINE2(sdt, kernel, threadpool, percpu__put__destroy, "struct threadpool *"/*pool*/, "pri_t"/*pri*/); SDT_PROBE_DEFINE2(sdt, kernel, threadpool, create, "struct cpu_info *"/*ci*/, "pri_t"/*pri*/); SDT_PROBE_DEFINE3(sdt, kernel, threadpool, create__success, "struct cpu_info *"/*ci*/, "pri_t"/*pri*/, "struct threadpool *"/*pool*/); SDT_PROBE_DEFINE3(sdt, kernel, threadpool, create__failure, "struct cpu_info *"/*ci*/, "pri_t"/*pri*/, "int"/*error*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, destroy, "struct threadpool *"/*pool*/); SDT_PROBE_DEFINE2(sdt, kernel, threadpool, destroy__wait, "struct threadpool *"/*pool*/, "uint64_t"/*refcnt*/); SDT_PROBE_DEFINE2(sdt, kernel, threadpool, schedule__job, 
"struct threadpool *"/*pool*/, "struct threadpool_job *"/*job*/); SDT_PROBE_DEFINE2(sdt, kernel, threadpool, schedule__job__running, "struct threadpool *"/*pool*/, "struct threadpool_job *"/*job*/); SDT_PROBE_DEFINE2(sdt, kernel, threadpool, schedule__job__dispatcher, "struct threadpool *"/*pool*/, "struct threadpool_job *"/*job*/); SDT_PROBE_DEFINE3(sdt, kernel, threadpool, schedule__job__thread, "struct threadpool *"/*pool*/, "struct threadpool_job *"/*job*/, "struct lwp *"/*thread*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, dispatcher__start, "struct threadpool *"/*pool*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, dispatcher__dying, "struct threadpool *"/*pool*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, dispatcher__spawn, "struct threadpool *"/*pool*/); SDT_PROBE_DEFINE2(sdt, kernel, threadpool, dispatcher__race, "struct threadpool *"/*pool*/, "struct threadpool_job *"/*job*/); SDT_PROBE_DEFINE3(sdt, kernel, threadpool, dispatcher__assign, "struct threadpool *"/*pool*/, "struct threadpool_job *"/*job*/, "struct lwp *"/*thread*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, dispatcher__exit, "struct threadpool *"/*pool*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, thread__start, "struct threadpool *"/*pool*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, thread__dying, "struct threadpool *"/*pool*/); SDT_PROBE_DEFINE2(sdt, kernel, threadpool, thread__job, "struct threadpool *"/*pool*/, "struct threadpool_job *"/*job*/); SDT_PROBE_DEFINE1(sdt, kernel, threadpool, thread__exit, "struct threadpool *"/*pool*/); /* Data structures */ TAILQ_HEAD(job_head, threadpool_job); TAILQ_HEAD(thread_head, threadpool_thread); struct threadpool_thread { struct lwp *tpt_lwp; char *tpt_lwp_savedname; struct threadpool *tpt_pool; struct threadpool_job *tpt_job; kcondvar_t tpt_cv; TAILQ_ENTRY(threadpool_thread) tpt_entry; }; struct threadpool { kmutex_t tp_lock; struct threadpool_thread tp_dispatcher; struct job_head tp_jobs; struct thread_head tp_idle_threads; uint64_t tp_refcnt; int tp_flags; #define THREADPOOL_DYING 0x01 struct cpu_info *tp_cpu; pri_t tp_pri; }; static void threadpool_hold(struct threadpool *); static void threadpool_rele(struct threadpool *); static int threadpool_percpu_create(struct threadpool_percpu **, pri_t); static void threadpool_percpu_destroy(struct threadpool_percpu *); static void threadpool_percpu_init(void *, void *, struct cpu_info *); static void threadpool_percpu_ok(void *, void *, struct cpu_info *); static void threadpool_percpu_fini(void *, void *, struct cpu_info *); static threadpool_job_fn_t threadpool_job_dead; static void threadpool_job_hold(struct threadpool_job *); static void threadpool_job_rele(struct threadpool_job *); static void threadpool_dispatcher_thread(void *) __dead; static void threadpool_thread(void *) __dead; static pool_cache_t threadpool_thread_pc __read_mostly; static kmutex_t threadpools_lock __cacheline_aligned; /* Default to 30 second idle timeout for pool threads. 
*/ static int threadpool_idle_time_ms = 30 * 1000; struct threadpool_unbound { struct threadpool tpu_pool; /* protected by threadpools_lock */ LIST_ENTRY(threadpool_unbound) tpu_link; uint64_t tpu_refcnt; }; static LIST_HEAD(, threadpool_unbound) unbound_threadpools; static struct threadpool_unbound * threadpool_lookup_unbound(pri_t pri) { struct threadpool_unbound *tpu; LIST_FOREACH(tpu, &unbound_threadpools, tpu_link) { if (tpu->tpu_pool.tp_pri == pri) return tpu; } return NULL; } static void threadpool_insert_unbound(struct threadpool_unbound *tpu) { KASSERT(threadpool_lookup_unbound(tpu->tpu_pool.tp_pri) == NULL); LIST_INSERT_HEAD(&unbound_threadpools, tpu, tpu_link); } static void threadpool_remove_unbound(struct threadpool_unbound *tpu) { KASSERT(threadpool_lookup_unbound(tpu->tpu_pool.tp_pri) == tpu); LIST_REMOVE(tpu, tpu_link); } struct threadpool_percpu { percpu_t * tpp_percpu; pri_t tpp_pri; /* protected by threadpools_lock */ LIST_ENTRY(threadpool_percpu) tpp_link; uint64_t tpp_refcnt; }; static LIST_HEAD(, threadpool_percpu) percpu_threadpools; static struct threadpool_percpu * threadpool_lookup_percpu(pri_t pri) { struct threadpool_percpu *tpp; LIST_FOREACH(tpp, &percpu_threadpools, tpp_link) { if (tpp->tpp_pri == pri) return tpp; } return NULL; } static void threadpool_insert_percpu(struct threadpool_percpu *tpp) { KASSERT(threadpool_lookup_percpu(tpp->tpp_pri) == NULL); LIST_INSERT_HEAD(&percpu_threadpools, tpp, tpp_link); } static void threadpool_remove_percpu(struct threadpool_percpu *tpp) { KASSERT(threadpool_lookup_percpu(tpp->tpp_pri) == tpp); LIST_REMOVE(tpp, tpp_link); } static int sysctl_kern_threadpool_idle_ms(SYSCTLFN_ARGS) { struct sysctlnode node; int val, error; node = *rnode; val = threadpool_idle_time_ms; node.sysctl_data = &val; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error == 0 && newp != NULL) { /* Disallow negative values and 0 (forever). 
*/ if (val < 1) error = EINVAL; else threadpool_idle_time_ms = val; } return error; } SYSCTL_SETUP_PROTO(sysctl_threadpool_setup); SYSCTL_SETUP(sysctl_threadpool_setup, "sysctl kern.threadpool subtree setup") { const struct sysctlnode *rnode, *cnode; int error __diagused; error = sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "threadpool", SYSCTL_DESCR("threadpool subsystem options"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); KASSERT(error == 0); error = sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "idle_ms", SYSCTL_DESCR("idle thread timeout in ms"), sysctl_kern_threadpool_idle_ms, 0, NULL, 0, CTL_CREATE, CTL_EOL); KASSERT(error == 0); } void threadpools_init(void) { threadpool_thread_pc = pool_cache_init(sizeof(struct threadpool_thread), 0, 0, 0, "thplthrd", NULL, IPL_NONE, NULL, NULL, NULL); LIST_INIT(&unbound_threadpools); LIST_INIT(&percpu_threadpools); mutex_init(&threadpools_lock, MUTEX_DEFAULT, IPL_NONE); } static void threadnamesuffix(char *buf, size_t buflen, struct cpu_info *ci, int pri) { buf[0] = '\0'; if (ci) snprintf(buf + strlen(buf), buflen - strlen(buf), "/%d", cpu_index(ci)); if (pri != PRI_NONE) snprintf(buf + strlen(buf), buflen - strlen(buf), "@%d", pri); } /* Thread pool creation */ static bool threadpool_pri_is_valid(pri_t pri) { return (pri == PRI_NONE || (pri >= PRI_USER && pri < PRI_COUNT)); } static int threadpool_create(struct threadpool *const pool, struct cpu_info *ci, pri_t pri) { struct lwp *lwp; char suffix[16]; int ktflags; int error; KASSERT(threadpool_pri_is_valid(pri)); SDT_PROBE2(sdt, kernel, threadpool, create, ci, pri); mutex_init(&pool->tp_lock, MUTEX_DEFAULT, IPL_VM); /* XXX dispatcher */ TAILQ_INIT(&pool->tp_jobs); TAILQ_INIT(&pool->tp_idle_threads); pool->tp_refcnt = 1; /* dispatcher's reference */ pool->tp_flags = 0; pool->tp_cpu = ci; pool->tp_pri = pri; pool->tp_dispatcher.tpt_lwp = NULL; pool->tp_dispatcher.tpt_pool = pool; pool->tp_dispatcher.tpt_job = NULL; cv_init(&pool->tp_dispatcher.tpt_cv, "pooldisp"); ktflags = 0; ktflags |= KTHREAD_MPSAFE; if (pri < PRI_KERNEL) ktflags |= KTHREAD_TS; threadnamesuffix(suffix, sizeof(suffix), ci, pri); error = kthread_create(pri, ktflags, ci, &threadpool_dispatcher_thread, &pool->tp_dispatcher, &lwp, "pooldisp%s", suffix); if (error) goto fail0; mutex_spin_enter(&pool->tp_lock); pool->tp_dispatcher.tpt_lwp = lwp; cv_broadcast(&pool->tp_dispatcher.tpt_cv); mutex_spin_exit(&pool->tp_lock); SDT_PROBE3(sdt, kernel, threadpool, create__success, ci, pri, pool); return 0; fail0: KASSERT(error); KASSERT(pool->tp_dispatcher.tpt_job == NULL); KASSERT(pool->tp_dispatcher.tpt_pool == pool); KASSERT(pool->tp_flags == 0); KASSERT(pool->tp_refcnt == 0); KASSERT(TAILQ_EMPTY(&pool->tp_idle_threads)); KASSERT(TAILQ_EMPTY(&pool->tp_jobs)); KASSERT(!cv_has_waiters(&pool->tp_dispatcher.tpt_cv)); cv_destroy(&pool->tp_dispatcher.tpt_cv); mutex_destroy(&pool->tp_lock); SDT_PROBE3(sdt, kernel, threadpool, create__failure, ci, pri, error); return error; } /* Thread pool destruction */ static void threadpool_destroy(struct threadpool *pool) { struct threadpool_thread *thread; SDT_PROBE1(sdt, kernel, threadpool, destroy, pool); /* Mark the pool dying and wait for threads to commit suicide. 
*/ mutex_spin_enter(&pool->tp_lock); KASSERT(TAILQ_EMPTY(&pool->tp_jobs)); pool->tp_flags |= THREADPOOL_DYING; cv_broadcast(&pool->tp_dispatcher.tpt_cv); TAILQ_FOREACH(thread, &pool->tp_idle_threads, tpt_entry) cv_broadcast(&thread->tpt_cv); while (0 < pool->tp_refcnt) { SDT_PROBE2(sdt, kernel, threadpool, destroy__wait, pool, pool->tp_refcnt); cv_wait(&pool->tp_dispatcher.tpt_cv, &pool->tp_lock); } mutex_spin_exit(&pool->tp_lock); KASSERT(pool->tp_dispatcher.tpt_job == NULL); KASSERT(pool->tp_dispatcher.tpt_pool == pool); KASSERT(pool->tp_flags == THREADPOOL_DYING); KASSERT(pool->tp_refcnt == 0); KASSERT(TAILQ_EMPTY(&pool->tp_idle_threads)); KASSERT(TAILQ_EMPTY(&pool->tp_jobs)); KASSERT(!cv_has_waiters(&pool->tp_dispatcher.tpt_cv)); cv_destroy(&pool->tp_dispatcher.tpt_cv); mutex_destroy(&pool->tp_lock); } static void threadpool_hold(struct threadpool *pool) { KASSERT(mutex_owned(&pool->tp_lock)); pool->tp_refcnt++; KASSERT(pool->tp_refcnt != 0); } static void threadpool_rele(struct threadpool *pool) { KASSERT(mutex_owned(&pool->tp_lock)); KASSERT(0 < pool->tp_refcnt); if (--pool->tp_refcnt == 0) cv_broadcast(&pool->tp_dispatcher.tpt_cv); } /* Unbound thread pools */ int threadpool_get(struct threadpool **poolp, pri_t pri) { struct threadpool_unbound *tpu, *tmp = NULL; int error; ASSERT_SLEEPABLE(); SDT_PROBE1(sdt, kernel, threadpool, get, pri); if (! threadpool_pri_is_valid(pri)) return EINVAL; mutex_enter(&threadpools_lock); tpu = threadpool_lookup_unbound(pri); if (tpu == NULL) { mutex_exit(&threadpools_lock); SDT_PROBE1(sdt, kernel, threadpool, get__create, pri); tmp = kmem_zalloc(sizeof(*tmp), KM_SLEEP); error = threadpool_create(&tmp->tpu_pool, NULL, pri); if (error) { kmem_free(tmp, sizeof(*tmp)); return error; } mutex_enter(&threadpools_lock); tpu = threadpool_lookup_unbound(pri); if (tpu == NULL) { tpu = tmp; tmp = NULL; threadpool_insert_unbound(tpu); } else { SDT_PROBE1(sdt, kernel, threadpool, get__race, pri); } } KASSERT(tpu != NULL); tpu->tpu_refcnt++; KASSERT(tpu->tpu_refcnt != 0); mutex_exit(&threadpools_lock); if (tmp != NULL) { threadpool_destroy(&tmp->tpu_pool); kmem_free(tmp, sizeof(*tmp)); } KASSERT(tpu != NULL); *poolp = &tpu->tpu_pool; return 0; } void threadpool_put(struct threadpool *pool, pri_t pri) { struct threadpool_unbound *tpu = container_of(pool, struct threadpool_unbound, tpu_pool); ASSERT_SLEEPABLE(); KASSERT(threadpool_pri_is_valid(pri)); SDT_PROBE2(sdt, kernel, threadpool, put, pool, pri); mutex_enter(&threadpools_lock); KASSERT(tpu == threadpool_lookup_unbound(pri)); KASSERT(0 < tpu->tpu_refcnt); if (--tpu->tpu_refcnt == 0) { SDT_PROBE2(sdt, kernel, threadpool, put__destroy, pool, pri); threadpool_remove_unbound(tpu); } else { tpu = NULL; } mutex_exit(&threadpools_lock); if (tpu) { threadpool_destroy(&tpu->tpu_pool); kmem_free(tpu, sizeof(*tpu)); } } /* Per-CPU thread pools */ int threadpool_percpu_get(struct threadpool_percpu **pool_percpup, pri_t pri) { struct threadpool_percpu *pool_percpu, *tmp = NULL; int error; ASSERT_SLEEPABLE(); SDT_PROBE1(sdt, kernel, threadpool, percpu__get, pri); if (! 
threadpool_pri_is_valid(pri)) return EINVAL; mutex_enter(&threadpools_lock); pool_percpu = threadpool_lookup_percpu(pri); if (pool_percpu == NULL) { mutex_exit(&threadpools_lock); SDT_PROBE1(sdt, kernel, threadpool, percpu__get__create, pri); error = threadpool_percpu_create(&tmp, pri); if (error) return error; KASSERT(tmp != NULL); mutex_enter(&threadpools_lock); pool_percpu = threadpool_lookup_percpu(pri); if (pool_percpu == NULL) { pool_percpu = tmp; tmp = NULL; threadpool_insert_percpu(pool_percpu); } else { SDT_PROBE1(sdt, kernel, threadpool, percpu__get__race, pri); } } KASSERT(pool_percpu != NULL); pool_percpu->tpp_refcnt++; KASSERT(pool_percpu->tpp_refcnt != 0); mutex_exit(&threadpools_lock); if (tmp != NULL) threadpool_percpu_destroy(tmp); KASSERT(pool_percpu != NULL); *pool_percpup = pool_percpu; return 0; } void threadpool_percpu_put(struct threadpool_percpu *pool_percpu, pri_t pri) { ASSERT_SLEEPABLE(); KASSERT(threadpool_pri_is_valid(pri)); SDT_PROBE2(sdt, kernel, threadpool, percpu__put, pool_percpu, pri); mutex_enter(&threadpools_lock); KASSERT(pool_percpu == threadpool_lookup_percpu(pri)); KASSERT(0 < pool_percpu->tpp_refcnt); if (--pool_percpu->tpp_refcnt == 0) { SDT_PROBE2(sdt, kernel, threadpool, percpu__put__destroy, pool_percpu, pri); threadpool_remove_percpu(pool_percpu); } else { pool_percpu = NULL; } mutex_exit(&threadpools_lock); if (pool_percpu) threadpool_percpu_destroy(pool_percpu); } struct threadpool * threadpool_percpu_ref(struct threadpool_percpu *pool_percpu) { struct threadpool **poolp, *pool; poolp = percpu_getref(pool_percpu->tpp_percpu); pool = *poolp; percpu_putref(pool_percpu->tpp_percpu); return pool; } struct threadpool * threadpool_percpu_ref_remote(struct threadpool_percpu *pool_percpu, struct cpu_info *ci) { struct threadpool **poolp, *pool; /* * As long as xcalls are blocked -- e.g., by kpreempt_disable * -- the percpu object will not be swapped and destroyed. We * can't write to it, because the data may have already been * moved to a new buffer, but we can safely read from it. */ kpreempt_disable(); poolp = percpu_getptr_remote(pool_percpu->tpp_percpu, ci); pool = *poolp; kpreempt_enable(); return pool; } static int threadpool_percpu_create(struct threadpool_percpu **pool_percpup, pri_t pri) { struct threadpool_percpu *pool_percpu; bool ok = true; pool_percpu = kmem_zalloc(sizeof(*pool_percpu), KM_SLEEP); pool_percpu->tpp_pri = pri; pool_percpu->tpp_percpu = percpu_create(sizeof(struct threadpool *), threadpool_percpu_init, threadpool_percpu_fini, (void *)(intptr_t)pri); /* * Verify that all of the CPUs were initialized. * * XXX What to do if we add CPU hotplug? */ percpu_foreach(pool_percpu->tpp_percpu, &threadpool_percpu_ok, &ok); if (!ok) goto fail; /* Success! 
*/ *pool_percpup = (struct threadpool_percpu *)pool_percpu; return 0; fail: percpu_free(pool_percpu->tpp_percpu, sizeof(struct threadpool *)); kmem_free(pool_percpu, sizeof(*pool_percpu)); return ENOMEM; } static void threadpool_percpu_destroy(struct threadpool_percpu *pool_percpu) { percpu_free(pool_percpu->tpp_percpu, sizeof(struct threadpool *)); kmem_free(pool_percpu, sizeof(*pool_percpu)); } static void threadpool_percpu_init(void *vpoolp, void *vpri, struct cpu_info *ci) { struct threadpool **const poolp = vpoolp; pri_t pri = (intptr_t)(void *)vpri; int error; *poolp = kmem_zalloc(sizeof(**poolp), KM_SLEEP); error = threadpool_create(*poolp, ci, pri); if (error) { KASSERT(error == ENOMEM); kmem_free(*poolp, sizeof(**poolp)); *poolp = NULL; } } static void threadpool_percpu_ok(void *vpoolp, void *vokp, struct cpu_info *ci) { struct threadpool **const poolp = vpoolp; bool *okp = vokp; if (*poolp == NULL) atomic_store_relaxed(okp, false); } static void threadpool_percpu_fini(void *vpoolp, void *vprip, struct cpu_info *ci) { struct threadpool **const poolp = vpoolp; if (*poolp == NULL) /* initialization failed */ return; threadpool_destroy(*poolp); kmem_free(*poolp, sizeof(**poolp)); } /* Thread pool jobs */ void __printflike(4,5) threadpool_job_init(struct threadpool_job *job, threadpool_job_fn_t fn, kmutex_t *lock, const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void)vsnprintf(job->job_name, sizeof(job->job_name), fmt, ap); va_end(ap); job->job_lock = lock; job->job_thread = NULL; job->job_refcnt = 0; cv_init(&job->job_cv, job->job_name); job->job_fn = fn; } static void threadpool_job_dead(struct threadpool_job *job) { panic("threadpool job %p ran after destruction", job); } void threadpool_job_destroy(struct threadpool_job *job) { ASSERT_SLEEPABLE(); KASSERTMSG((job->job_thread == NULL), "job %p still running", job); mutex_enter(job->job_lock); while (0 < atomic_load_relaxed(&job->job_refcnt)) cv_wait(&job->job_cv, job->job_lock); mutex_exit(job->job_lock); job->job_lock = NULL; KASSERT(job->job_thread == NULL); KASSERT(job->job_refcnt == 0); KASSERT(!cv_has_waiters(&job->job_cv)); cv_destroy(&job->job_cv); job->job_fn = threadpool_job_dead; (void)strlcpy(job->job_name, "deadjob", sizeof(job->job_name)); } static void threadpool_job_hold(struct threadpool_job *job) { unsigned int refcnt __diagused; refcnt = atomic_inc_uint_nv(&job->job_refcnt); KASSERT(refcnt != 0); } static void threadpool_job_rele(struct threadpool_job *job) { unsigned int refcnt; KASSERT(mutex_owned(job->job_lock)); refcnt = atomic_dec_uint_nv(&job->job_refcnt); KASSERT(refcnt != UINT_MAX); if (refcnt == 0) cv_broadcast(&job->job_cv); } void threadpool_job_done(struct threadpool_job *job) { KASSERT(mutex_owned(job->job_lock)); KASSERT(job->job_thread != NULL); KASSERT(job->job_thread->tpt_lwp == curlwp); /* * We can safely read this field; it's only modified right before * we call the job work function, and we are only preserving it * to use here; no one cares if it contains junk afterward. */ lwp_lock(curlwp); curlwp->l_name = job->job_thread->tpt_lwp_savedname; lwp_unlock(curlwp); /* * Inline the work of threadpool_job_rele(); the job is already * locked, the most likely scenario (XXXJRT only scenario?) is * that we're dropping the last reference (the one taken in * threadpool_schedule_job()), and we always do the cv_broadcast() * anyway. 
*/ KASSERT(0 < atomic_load_relaxed(&job->job_refcnt)); unsigned int refcnt __diagused = atomic_dec_uint_nv(&job->job_refcnt); KASSERT(refcnt != UINT_MAX); cv_broadcast(&job->job_cv); job->job_thread = NULL; } void threadpool_schedule_job(struct threadpool *pool, struct threadpool_job *job) { KASSERT(mutex_owned(job->job_lock)); SDT_PROBE2(sdt, kernel, threadpool, schedule__job, pool, job); /* * If the job's already running, let it keep running. The job * is guaranteed by the interlock not to end early -- if it had * ended early, threadpool_job_done would have set job_thread * to NULL under the interlock. */ if (__predict_true(job->job_thread != NULL)) { SDT_PROBE2(sdt, kernel, threadpool, schedule__job__running, pool, job); return; } threadpool_job_hold(job); /* Otherwise, try to assign a thread to the job. */ mutex_spin_enter(&pool->tp_lock); if (__predict_false(TAILQ_EMPTY(&pool->tp_idle_threads))) { /* Nobody's idle. Give it to the dispatcher. */ SDT_PROBE2(sdt, kernel, threadpool, schedule__job__dispatcher, pool, job); job->job_thread = &pool->tp_dispatcher; TAILQ_INSERT_TAIL(&pool->tp_jobs, job, job_entry); } else { /* Assign it to the first idle thread. */ job->job_thread = TAILQ_FIRST(&pool->tp_idle_threads); SDT_PROBE3(sdt, kernel, threadpool, schedule__job__thread, pool, job, job->job_thread->tpt_lwp); TAILQ_REMOVE(&pool->tp_idle_threads, job->job_thread, tpt_entry); job->job_thread->tpt_job = job; } /* Notify whomever we gave it to, dispatcher or idle thread. */ KASSERT(job->job_thread != NULL); cv_broadcast(&job->job_thread->tpt_cv); mutex_spin_exit(&pool->tp_lock); } bool threadpool_cancel_job_async(struct threadpool *pool, struct threadpool_job *job) { KASSERT(mutex_owned(job->job_lock)); /* * XXXJRT This fails (albeit safely) when all of the following * are true: * * => "pool" is something other than what the job was * scheduled on. This can legitimately occur if, * for example, a job is percpu-scheduled on CPU0 * and then CPU1 attempts to cancel it without taking * a remote pool reference. (this might happen by * "luck of the draw"). * * => "job" is not yet running, but is assigned to the * dispatcher. * * When this happens, this code makes the determination that * the job is already running. The failure mode is that the * caller is told the job is running, and thus has to wait. * The dispatcher will eventually get to it and the job will * proceed as if it had been already running. */ if (job->job_thread == NULL) { /* Nothing to do. Guaranteed not running. */ return true; } else if (job->job_thread == &pool->tp_dispatcher) { /* Take it off the list to guarantee it won't run. */ job->job_thread = NULL; mutex_spin_enter(&pool->tp_lock); TAILQ_REMOVE(&pool->tp_jobs, job, job_entry); mutex_spin_exit(&pool->tp_lock); threadpool_job_rele(job); return true; } else { /* Too late -- already running. */ return false; } } void threadpool_cancel_job(struct threadpool *pool, struct threadpool_job *job) { /* * We may sleep here, but we can't ASSERT_SLEEPABLE() because * the job lock (used to interlock the cv_wait()) may in fact * legitimately be a spin lock, so the assertion would fire * as a false-positive. */ KASSERT(mutex_owned(job->job_lock)); if (threadpool_cancel_job_async(pool, job)) return; /* Already running. Wait for it to complete. 
*/ while (job->job_thread != NULL) cv_wait(&job->job_cv, job->job_lock); } /* Thread pool dispatcher thread */ static void __dead threadpool_dispatcher_thread(void *arg) { struct threadpool_thread *const dispatcher = arg; struct threadpool *const pool = dispatcher->tpt_pool; struct lwp *lwp = NULL; int ktflags; char suffix[16]; int error; KASSERT((pool->tp_cpu == NULL) || (pool->tp_cpu == curcpu())); KASSERT((pool->tp_cpu == NULL) || (curlwp->l_pflag & LP_BOUND)); /* Wait until we're initialized. */ mutex_spin_enter(&pool->tp_lock); while (dispatcher->tpt_lwp == NULL) cv_wait(&dispatcher->tpt_cv, &pool->tp_lock); SDT_PROBE1(sdt, kernel, threadpool, dispatcher__start, pool); for (;;) { /* Wait until there's a job. */ while (TAILQ_EMPTY(&pool->tp_jobs)) { if (ISSET(pool->tp_flags, THREADPOOL_DYING)) { SDT_PROBE1(sdt, kernel, threadpool, dispatcher__dying, pool); break; } cv_wait(&dispatcher->tpt_cv, &pool->tp_lock); } if (__predict_false(TAILQ_EMPTY(&pool->tp_jobs))) break; /* If there are no threads, we'll have to try to start one. */ if (TAILQ_EMPTY(&pool->tp_idle_threads)) { SDT_PROBE1(sdt, kernel, threadpool, dispatcher__spawn, pool); threadpool_hold(pool); mutex_spin_exit(&pool->tp_lock); struct threadpool_thread *const thread = pool_cache_get(threadpool_thread_pc, PR_WAITOK); thread->tpt_lwp = NULL; thread->tpt_pool = pool; thread->tpt_job = NULL; cv_init(&thread->tpt_cv, "pooljob"); ktflags = 0; ktflags |= KTHREAD_MPSAFE; if (pool->tp_pri < PRI_KERNEL) ktflags |= KTHREAD_TS; threadnamesuffix(suffix, sizeof(suffix), pool->tp_cpu, pool->tp_pri); error = kthread_create(pool->tp_pri, ktflags, pool->tp_cpu, &threadpool_thread, thread, &lwp, "poolthread%s", suffix); mutex_spin_enter(&pool->tp_lock); if (error) { pool_cache_put(threadpool_thread_pc, thread); threadpool_rele(pool); /* XXX What to do to wait for memory? */ (void)kpause("thrdplcr", false, hz, &pool->tp_lock); continue; } /* * New kthread now owns the reference to the pool * taken above. */ KASSERT(lwp != NULL); TAILQ_INSERT_TAIL(&pool->tp_idle_threads, thread, tpt_entry); thread->tpt_lwp = lwp; lwp = NULL; cv_broadcast(&thread->tpt_cv); continue; } /* There are idle threads, so try giving one a job. */ struct threadpool_job *const job = TAILQ_FIRST(&pool->tp_jobs); /* * Take an extra reference on the job temporarily so that * it won't disappear on us while we have both locks dropped. */ threadpool_job_hold(job); mutex_spin_exit(&pool->tp_lock); mutex_enter(job->job_lock); /* If the job was cancelled, we'll no longer be its thread. */ if (__predict_true(job->job_thread == dispatcher)) { mutex_spin_enter(&pool->tp_lock); TAILQ_REMOVE(&pool->tp_jobs, job, job_entry); if (__predict_false( TAILQ_EMPTY(&pool->tp_idle_threads))) { /* * Someone else snagged the thread * first. We'll have to try again. */ SDT_PROBE2(sdt, kernel, threadpool, dispatcher__race, pool, job); TAILQ_INSERT_HEAD(&pool->tp_jobs, job, job_entry); } else { /* * Assign the job to the thread and * wake the thread so it starts work. 
*/ struct threadpool_thread *const thread = TAILQ_FIRST(&pool->tp_idle_threads); SDT_PROBE2(sdt, kernel, threadpool, dispatcher__assign, job, thread->tpt_lwp); KASSERT(thread->tpt_job == NULL); TAILQ_REMOVE(&pool->tp_idle_threads, thread, tpt_entry); thread->tpt_job = job; job->job_thread = thread; cv_broadcast(&thread->tpt_cv); } mutex_spin_exit(&pool->tp_lock); } threadpool_job_rele(job); mutex_exit(job->job_lock); mutex_spin_enter(&pool->tp_lock); } threadpool_rele(pool); mutex_spin_exit(&pool->tp_lock); SDT_PROBE1(sdt, kernel, threadpool, dispatcher__exit, pool); kthread_exit(0); } /* Thread pool thread */ static void __dead threadpool_thread(void *arg) { struct threadpool_thread *const thread = arg; struct threadpool *const pool = thread->tpt_pool; KASSERT((pool->tp_cpu == NULL) || (pool->tp_cpu == curcpu())); KASSERT((pool->tp_cpu == NULL) || (curlwp->l_pflag & LP_BOUND)); /* Wait until we're initialized and on the queue. */ mutex_spin_enter(&pool->tp_lock); while (thread->tpt_lwp == NULL) cv_wait(&thread->tpt_cv, &pool->tp_lock); SDT_PROBE1(sdt, kernel, threadpool, thread__start, pool); KASSERT(thread->tpt_lwp == curlwp); for (;;) { /* Wait until we are assigned a job. */ while (thread->tpt_job == NULL) { if (ISSET(pool->tp_flags, THREADPOOL_DYING)) { SDT_PROBE1(sdt, kernel, threadpool, thread__dying, pool); break; } if (cv_timedwait(&thread->tpt_cv, &pool->tp_lock, mstohz(threadpool_idle_time_ms))) break; } if (__predict_false(thread->tpt_job == NULL)) { TAILQ_REMOVE(&pool->tp_idle_threads, thread, tpt_entry); break; } struct threadpool_job *const job = thread->tpt_job; KASSERT(job != NULL); /* Set our lwp name to reflect what job we're doing. */ lwp_lock(curlwp); char *const lwp_name __diagused = curlwp->l_name; thread->tpt_lwp_savedname = curlwp->l_name; curlwp->l_name = job->job_name; lwp_unlock(curlwp); mutex_spin_exit(&pool->tp_lock); SDT_PROBE2(sdt, kernel, threadpool, thread__job, pool, job); /* Run the job. */ (*job->job_fn)(job); /* lwp name restored in threadpool_job_done(). */ KASSERTMSG((curlwp->l_name == lwp_name), "someone forgot to call threadpool_job_done()!"); /* * We can compare pointers, but we can no longer deference * job after this because threadpool_job_done() drops the * last reference on the job while the job is locked. */ mutex_spin_enter(&pool->tp_lock); KASSERT(thread->tpt_job == job); thread->tpt_job = NULL; TAILQ_INSERT_TAIL(&pool->tp_idle_threads, thread, tpt_entry); } threadpool_rele(pool); mutex_spin_exit(&pool->tp_lock); SDT_PROBE1(sdt, kernel, threadpool, thread__exit, pool); KASSERT(!cv_has_waiters(&thread->tpt_cv)); cv_destroy(&thread->tpt_cv); pool_cache_put(threadpool_thread_pc, thread); kthread_exit(0); }
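/*
 * Illustrative sketch, not part of the original file: the typical life
 * cycle of a client of the unbound thread pool API defined above.  The
 * "frobber" names are invented for the example; the calling conventions
 * (job lock held across threadpool_schedule_job() and
 * threadpool_cancel_job(), threadpool_job_done() called by the job
 * function with the lock held) follow the assertions in the code above.
 */
#if 0	/* example only, not compiled */
#include <sys/param.h>
#include <sys/mutex.h>
#include <sys/threadpool.h>

struct frobber {
	kmutex_t		f_lock;
	struct threadpool	*f_pool;
	struct threadpool_job	f_job;
};

static void
frobber_job(struct threadpool_job *job)
{
	struct frobber *f = container_of(job, struct frobber, f_job);

	/* ... do the actual work without any locks held ... */

	mutex_enter(&f->f_lock);
	threadpool_job_done(job);	/* restores the lwp name, releases the job */
	mutex_exit(&f->f_lock);
}

static int
frobber_init(struct frobber *f)
{
	int error;

	error = threadpool_get(&f->f_pool, PRI_NONE);
	if (error)
		return error;
	mutex_init(&f->f_lock, MUTEX_DEFAULT, IPL_NONE);
	threadpool_job_init(&f->f_job, frobber_job, &f->f_lock, "frobber");
	return 0;
}

static void
frobber_kick(struct frobber *f)
{

	mutex_enter(&f->f_lock);
	/* Scheduling an already-running or already-queued job is a no-op. */
	threadpool_schedule_job(f->f_pool, &f->f_job);
	mutex_exit(&f->f_lock);
}

static void
frobber_fini(struct frobber *f)
{

	mutex_enter(&f->f_lock);
	threadpool_cancel_job(f->f_pool, &f->f_job);	/* waits if running */
	mutex_exit(&f->f_lock);
	threadpool_job_destroy(&f->f_job);
	threadpool_put(f->f_pool, PRI_NONE);
	mutex_destroy(&f->f_lock);
}
#endif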
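/*
 * Illustrative sketch, continued: the per-CPU flavour.  A reference to
 * the whole per-CPU collection is taken once with threadpool_percpu_get();
 * each time a job is scheduled, the pool bound to the current CPU (or a
 * specific remote CPU) is looked up with threadpool_percpu_ref() or
 * threadpool_percpu_ref_remote().  Names are again invented.
 */
#if 0	/* example only, not compiled */
#include <sys/param.h>
#include <sys/mutex.h>
#include <sys/threadpool.h>

static struct threadpool_percpu *frob_percpu;

static int
frob_percpu_init(void)
{

	return threadpool_percpu_get(&frob_percpu, PRI_NONE);
}

static void
frob_percpu_kick(struct threadpool_job *job, kmutex_t *lock)
{
	struct threadpool *pool;

	/* Pool serving the CPU we are currently running on. */
	pool = threadpool_percpu_ref(frob_percpu);
	mutex_enter(lock);
	threadpool_schedule_job(pool, job);
	mutex_exit(lock);
}

static void
frob_percpu_fini(void)
{

	threadpool_percpu_put(frob_percpu, PRI_NONE);
}
#endif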
/* $NetBSD: sd.c,v 1.336 2024/02/24 22:06:49 mlelstv Exp $ */ /*- * Copyright (c) 1998, 2003, 2004 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Originally written by Julian Elischer (julian@dialix.oz.au) * for TRW Financial Systems for use under the MACH(2.5) operating system. * * TRW Financial Systems, in accordance with their agreement with Carnegie * Mellon University, makes this software available to CMU to distribute * or use in any manner that they see fit as long as this message is kept with * the software.
For this reason TFS also grants any other persons or * organisations permission to use or modify this software. * * TFS supplies this software to be publicly redistributed * on the understanding that TFS is not responsible for the correct * functioning of this software in any circumstances. * * Ported to run under 386BSD by Julian Elischer (julian@dialix.oz.au) Sept 1992 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sd.c,v 1.336 2024/02/24 22:06:49 mlelstv Exp $"); #ifdef _KERNEL_OPT #include "opt_scsi.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/scsiio.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/uio.h> #include <sys/malloc.h> #include <sys/errno.h> #include <sys/device.h> #include <sys/disklabel.h> #include <sys/disk.h> #include <sys/proc.h> #include <sys/conf.h> #include <sys/vnode.h> #include <dev/scsipi/scsi_spc.h> #include <dev/scsipi/scsipi_all.h> #include <dev/scsipi/scsi_all.h> #include <dev/scsipi/scsipi_disk.h> #include <dev/scsipi/scsi_disk.h> #include <dev/scsipi/scsiconf.h> #include <dev/scsipi/scsipi_base.h> #include <dev/scsipi/sdvar.h> #include <prop/proplib.h> #define SDUNIT(dev) DISKUNIT(dev) #define SDPART(dev) DISKPART(dev) #define SDMINOR(unit, part) DISKMINOR(unit, part) #define MAKESDDEV(maj, unit, part) MAKEDISKDEV(maj, unit, part) #define SDLABELDEV(dev) (MAKESDDEV(major(dev), SDUNIT(dev), RAW_PART)) #define SD_DEFAULT_BLKSIZE 512 static void sdminphys(struct buf *); static void sdstart(struct scsipi_periph *); static void sdrestart(void *); static void sddone(struct scsipi_xfer *, int); static bool sd_suspend(device_t, const pmf_qual_t *); static bool sd_shutdown(device_t, int); static int sd_interpret_sense(struct scsipi_xfer *); static int sd_diskstart(device_t, struct buf *); static int sd_dumpblocks(device_t, void *, daddr_t, int); static void sd_iosize(device_t, int *); static int sd_lastclose(device_t); static int sd_firstopen(device_t, dev_t, int, int); static void sd_label(device_t, struct disklabel *); static int sd_mode_sense(struct sd_softc *, u_int8_t, void *, size_t, int, int, int *); static int sd_mode_select(struct sd_softc *, u_int8_t, void *, size_t, int, int); static int sd_validate_blksize(struct scsipi_periph *, int); static u_int64_t sd_read_capacity(struct scsipi_periph *, int *, int flags); static int sd_get_simplifiedparms(struct sd_softc *, struct disk_parms *, int); static int sd_get_capacity(struct sd_softc *, struct disk_parms *, int); static int sd_get_parms(struct sd_softc *, struct disk_parms *, int); static int sd_get_parms_page4(struct sd_softc *, struct disk_parms *, int); static int sd_get_parms_page5(struct sd_softc *, struct disk_parms *, int); static int sd_flush(struct sd_softc *, int); static int sd_getcache(struct sd_softc *, int *); static int sd_setcache(struct sd_softc *, int); static int sdmatch(device_t, cfdata_t, void *); static void sdattach(device_t, device_t, void *); static int sddetach(device_t, int); static void sd_set_geometry(struct sd_softc *); CFATTACH_DECL3_NEW(sd, sizeof(struct sd_softc), sdmatch, sdattach, sddetach, NULL, NULL, NULL, DVF_DETACH_SHUTDOWN); extern struct cfdriver sd_cd; static const struct scsipi_inquiry_pattern sd_patterns[] = { {T_DIRECT, T_FIXED, "", "", ""}, {T_DIRECT, T_REMOV, "", "", ""}, {T_OPTICAL, T_FIXED, "", "", ""}, {T_OPTICAL, T_REMOV, "", "", ""}, {T_SIMPLE_DIRECT, T_FIXED, "", "", ""}, {T_SIMPLE_DIRECT, T_REMOV, "", "", ""}, }; static 
dev_type_open(sdopen); static dev_type_close(sdclose); static dev_type_read(sdread); static dev_type_write(sdwrite); static dev_type_ioctl(sdioctl); static dev_type_strategy(sdstrategy); static dev_type_dump(sddump); static dev_type_size(sdsize); const struct bdevsw sd_bdevsw = { .d_open = sdopen, .d_close = sdclose, .d_strategy = sdstrategy, .d_ioctl = sdioctl, .d_dump = sddump, .d_psize = sdsize, .d_discard = nodiscard, .d_cfdriver = &sd_cd, .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK | D_MPSAFE }; const struct cdevsw sd_cdevsw = { .d_open = sdopen, .d_close = sdclose, .d_read = sdread, .d_write = sdwrite, .d_ioctl = sdioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_cfdriver = &sd_cd, .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK | D_MPSAFE }; static const struct dkdriver sddkdriver = { .d_open = sdopen, .d_close = sdclose, .d_strategy = sdstrategy, .d_minphys = sdminphys, .d_diskstart = sd_diskstart, .d_dumpblocks = sd_dumpblocks, .d_iosize = sd_iosize, .d_firstopen = sd_firstopen, .d_lastclose = sd_lastclose, .d_label = sd_label, }; static const struct scsipi_periphsw sd_switch = { sd_interpret_sense, /* check our error handler first */ sdstart, /* have a queue, served by this */ NULL, /* have no async handler */ sddone, /* deal with stats at interrupt time */ }; struct sd_mode_sense_data { /* * XXX * We are not going to parse this as-is -- it just has to be large * enough. */ union { struct scsi_mode_parameter_header_6 small; struct scsi_mode_parameter_header_10 big; } header; struct scsi_general_block_descriptor blk_desc; union scsi_disk_pages pages; }; /* * The routine called by the low level scsi routine when it discovers * A device suitable for this driver */ static int sdmatch(device_t parent, cfdata_t match, void *aux) { struct scsipibus_attach_args *sa = aux; int priority; (void)scsipi_inqmatch(&sa->sa_inqbuf, sd_patterns, sizeof(sd_patterns) / sizeof(sd_patterns[0]), sizeof(sd_patterns[0]), &priority); return (priority); } /* * Attach routine common to atapi & scsi. */ static void sdattach(device_t parent, device_t self, void *aux) { struct sd_softc *sd = device_private(self); struct dk_softc *dksc = &sd->sc_dksc; struct scsipibus_attach_args *sa = aux; struct scsipi_periph *periph = sa->sa_periph; int error, result, dtype; struct disk_parms *dp = &sd->params; char pbuf[9]; SC_DEBUG(periph, SCSIPI_DB2, ("sdattach: ")); sd->type = (sa->sa_inqbuf.type & SID_TYPE); memcpy(sd->name, sa->sa_inqbuf.product, uimin(16, sizeof(sd->name))); memcpy(sd->typename, sa->sa_inqbuf.product, uimin(16, sizeof(sd->typename))); if (sd->type == T_SIMPLE_DIRECT) periph->periph_quirks |= PQUIRK_ONLYBIG | PQUIRK_NOBIGMODESENSE; switch (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(sa->sa_periph))) { case SCSIPI_BUSTYPE_SCSI: dtype = DKTYPE_SCSI; if (periph->periph_version == 0) sd->flags |= SDF_ANCIENT; break; case SCSIPI_BUSTYPE_ATAPI: dtype = DKTYPE_ATAPI; break; default: dtype = DKTYPE_UNKNOWN; break; } /* Initialize dk and disk structure. 
*/ dk_init(dksc, self, dtype); disk_init(&dksc->sc_dkdev, dksc->sc_xname, &sddkdriver); /* Attach dk and disk subsystems */ dk_attach(dksc); disk_attach(&dksc->sc_dkdev); bufq_alloc(&dksc->sc_bufq, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK); callout_init(&sd->sc_callout, 0); /* * Store information needed to contact our base driver */ sd->sc_periph = periph; periph->periph_dev = dksc->sc_dev; periph->periph_switch = &sd_switch; /* * Increase our openings to the maximum-per-periph * supported by the adapter. This will either be * clamped down or grown by the adapter if necessary. */ periph->periph_openings = SCSIPI_CHAN_MAX_PERIPH(periph->periph_channel); periph->periph_flags |= PERIPH_GROW_OPENINGS; /* * Use the subdriver to request information regarding the drive. */ aprint_naive("\n"); aprint_normal("\n"); if (periph->periph_quirks & PQUIRK_START) (void)scsipi_start(periph, SSS_START, XS_CTL_SILENT); error = scsipi_test_unit_ready(periph, XS_CTL_DISCOVERY | XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_MEDIA_CHANGE | XS_CTL_SILENT_NODEV); if (error) result = SDGP_RESULT_OFFLINE; else result = sd_get_parms(sd, &sd->params, XS_CTL_DISCOVERY); aprint_normal_dev(dksc->sc_dev, ""); switch (result) { case SDGP_RESULT_OK: format_bytes(pbuf, sizeof(pbuf), (u_int64_t)dp->disksize * dp->blksize); aprint_normal( "%s, %ld cyl, %ld head, %ld sec, %ld bytes/sect x %llu sectors", pbuf, dp->cyls, dp->heads, dp->sectors, dp->blksize, (unsigned long long)dp->disksize); break; case SDGP_RESULT_OFFLINE: aprint_normal("drive offline"); break; case SDGP_RESULT_UNFORMATTED: aprint_normal("unformatted media"); break; #ifdef DIAGNOSTIC default: panic("sdattach: unknown result from get_parms"); break; #endif } aprint_normal("\n"); /* Discover wedges on this disk if it is online */ if (result == SDGP_RESULT_OK) dkwedge_discover(&dksc->sc_dkdev); /* * Establish a shutdown hook so that we can ensure that * our data has actually made it onto the platter at * shutdown time. Note that this relies on the fact * that the shutdown hooks at the "leaves" of the device tree * are run, first (thus guaranteeing that our hook runs before * our ancestors'). */ if (!pmf_device_register1(self, sd_suspend, NULL, sd_shutdown)) aprint_error_dev(self, "couldn't establish power handler\n"); } static int sddetach(device_t self, int flags) { struct sd_softc *sd = device_private(self); struct dk_softc *dksc = &sd->sc_dksc; struct scsipi_periph *periph = sd->sc_periph; struct scsipi_channel *chan = periph->periph_channel; int bmaj, cmaj, i, mn, rc; if ((rc = disk_begindetach(&dksc->sc_dkdev, sd_lastclose, self, flags)) != 0) return rc; /* locate the major number */ bmaj = bdevsw_lookup_major(&sd_bdevsw); cmaj = cdevsw_lookup_major(&sd_cdevsw); /* Nuke the vnodes for any open instances */ for (i = 0; i < MAXPARTITIONS; i++) { mn = SDMINOR(device_unit(self), i); vdevgone(bmaj, mn, mn, VBLK); vdevgone(cmaj, mn, mn, VCHR); } /* kill any pending restart */ callout_halt(&sd->sc_callout, NULL); dk_drain(dksc); /* Kill off any pending commands. */ mutex_enter(chan_mtx(chan)); scsipi_kill_pending(periph); mutex_exit(chan_mtx(chan)); bufq_free(dksc->sc_bufq); /* Delete all of our wedges. */ dkwedge_delall(&dksc->sc_dkdev); /* Detach from the disk list. 
*/ disk_detach(&dksc->sc_dkdev); disk_destroy(&dksc->sc_dkdev); dk_detach(dksc); callout_destroy(&sd->sc_callout); pmf_device_deregister(self); return (0); } /* * Serialized by caller */ static int sd_firstopen(device_t self, dev_t dev, int flag, int fmt) { struct sd_softc *sd = device_private(self); struct scsipi_periph *periph = sd->sc_periph; struct scsipi_adapter *adapt = periph->periph_channel->chan_adapter; int error, silent; int part, removable; part = SDPART(dev); error = scsipi_adapter_addref(adapt); if (error) return error; if ((part == RAW_PART && fmt == S_IFCHR) || (flag & FSILENT)) silent = XS_CTL_SILENT; else silent = 0; /* Check that it is still responding and ok. */ error = scsipi_test_unit_ready(periph, XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_MEDIA_CHANGE | silent); /* * Start the pack spinning if necessary. Always allow the * raw partition to be opened, for raw IOCTLs. Data transfers * will check for SDEV_MEDIA_LOADED. */ if (error == EIO) { error = scsipi_start(periph, SSS_START, silent); if (error == EINVAL) error = EIO; } if (error) goto bad; removable = (periph->periph_flags & PERIPH_REMOVABLE) != 0; if (removable) { /* Lock the pack in. */ error = scsipi_prevent(periph, SPAMR_PREVENT_DT, XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_MEDIA_CHANGE | XS_CTL_SILENT); if (error) goto bad; } if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) { int param_error; /* * Load the physical device parameters. * * Note that if media is present but unformatted, * we allow the open (so that it can be formatted!). * The drive should refuse real I/O, if the media is * unformatted. */ param_error = sd_get_parms(sd, &sd->params, 0); if (param_error == SDGP_RESULT_OFFLINE) { error = ENXIO; goto bad2; } periph->periph_flags |= PERIPH_MEDIA_LOADED; SC_DEBUG(periph, SCSIPI_DB3, ("Params loaded ")); } periph->periph_flags |= PERIPH_OPEN; return 0; bad2: if (removable) scsipi_prevent(periph, SPAMR_ALLOW, XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_MEDIA_CHANGE | XS_CTL_SILENT); bad: scsipi_adapter_delref(adapt); return error; } /* * open the device. Make sure the partition info is a up-to-date as can be. */ static int sdopen(dev_t dev, int flag, int fmt, struct lwp *l) { struct sd_softc *sd; struct dk_softc *dksc; struct scsipi_periph *periph; int unit, part; int error; unit = SDUNIT(dev); sd = device_lookup_private(&sd_cd, unit); if (sd == NULL) return (ENXIO); dksc = &sd->sc_dksc; if (!device_is_active(dksc->sc_dev)) return (ENODEV); periph = sd->sc_periph; part = SDPART(dev); SC_DEBUG(periph, SCSIPI_DB1, ("sdopen: dev=0x%"PRIx64" (unit %d (of %d), partition %d)\n", dev, unit, sd_cd.cd_ndevs, SDPART(dev))); /* * If any partition is open, but the disk has been invalidated, * disallow further opens of non-raw partition */ if ((periph->periph_flags & (PERIPH_OPEN | PERIPH_MEDIA_LOADED)) == PERIPH_OPEN) { if (part != RAW_PART || fmt != S_IFCHR) return EIO; } error = dk_open(dksc, dev, flag, fmt, l); SC_DEBUG(periph, SCSIPI_DB3, ("open complete\n")); return error; } /* * Serialized by caller */ static int sd_lastclose(device_t self) { struct sd_softc *sd = device_private(self); struct dk_softc *dksc = &sd->sc_dksc; struct scsipi_periph *periph = sd->sc_periph; struct scsipi_adapter *adapt = periph->periph_channel->chan_adapter; /* * If the disk cache needs flushing, and the disk supports * it, do it now. 
*/ if ((sd->flags & SDF_DIRTY) != 0) { if (sd_flush(sd, 0)) { aprint_error_dev(dksc->sc_dev, "cache synchronization failed\n"); sd->flags &= ~SDF_FLUSHING; } else sd->flags &= ~(SDF_FLUSHING|SDF_DIRTY); } scsipi_wait_drain(periph); if (periph->periph_flags & PERIPH_REMOVABLE) scsipi_prevent(periph, SPAMR_ALLOW, XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY | XS_CTL_SILENT); periph->periph_flags &= ~PERIPH_OPEN; scsipi_wait_drain(periph); scsipi_adapter_delref(adapt); return 0; } /* * close the device.. only called if we are the LAST occurrence of an open * device. Convenient now but usually a pain. */ static int sdclose(dev_t dev, int flag, int fmt, struct lwp *l) { struct sd_softc *sd; struct dk_softc *dksc; int unit; unit = SDUNIT(dev); sd = device_lookup_private(&sd_cd, unit); dksc = &sd->sc_dksc; return dk_close(dksc, dev, flag, fmt, l); } /* * Actually translate the requested transfer into one the physical driver * can understand. The transfer is described by a buf and will include * only one physical transfer. */ static void sdstrategy(struct buf *bp) { struct sd_softc *sd = device_lookup_private(&sd_cd, SDUNIT(bp->b_dev)); struct dk_softc *dksc = &sd->sc_dksc; struct scsipi_periph *periph = sd->sc_periph; SC_DEBUG(sd->sc_periph, SCSIPI_DB2, ("sdstrategy ")); SC_DEBUG(sd->sc_periph, SCSIPI_DB1, ("%d bytes @ blk %" PRId64 "\n", bp->b_bcount, bp->b_blkno)); /* * If the device has been made invalid, error out */ if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0 || !device_is_active(dksc->sc_dev)) { if (periph->periph_flags & PERIPH_OPEN) bp->b_error = EIO; else bp->b_error = ENODEV; bp->b_resid = bp->b_bcount; biodone(bp); return; } dk_strategy(dksc, bp); } /* * Issue single I/O command * * Called from dk_start and implicitly from dk_strategy */ static int sd_diskstart(device_t dev, struct buf *bp) { struct sd_softc *sd = device_private(dev); struct scsipi_periph *periph = sd->sc_periph; struct scsipi_channel *chan = periph->periph_channel; struct scsipi_rw_16 cmd16; struct scsipi_rw_10 cmd_big; struct scsi_rw_6 cmd_small; struct scsipi_generic *cmdp; struct scsipi_xfer *xs; int error, flags, nblks, cmdlen; int cdb_flags; bool havefua = !(periph->periph_quirks & PQUIRK_NOFUA); mutex_enter(chan_mtx(chan)); if (periph->periph_active >= periph->periph_openings) { error = EAGAIN; goto out; } /* * there is excess capacity, but a special waits * It'll need the adapter as soon as we clear out of the * way and let it run (user level wait). */ if (periph->periph_flags & PERIPH_WAITING) { periph->periph_flags &= ~PERIPH_WAITING; cv_broadcast(periph_cv_periph(periph)); error = EAGAIN; goto out; } /* * If the device has become invalid, abort all the * reads and writes until all files have been closed and * re-opened. */ if (__predict_false( (periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)) { error = EIO; goto out; } /* * Mark the disk dirty so that the cache will be * flushed on close. */ if ((bp->b_flags & B_READ) == 0) sd->flags |= SDF_DIRTY; if (sd->params.blksize == DEV_BSIZE) nblks = bp->b_bcount >> DEV_BSHIFT; else nblks = howmany(bp->b_bcount, sd->params.blksize); /* * Pass FUA and/or DPO if requested. Must be done before CDB * selection, as 6-byte CDB doesn't support the flags. */ cdb_flags = 0; if (havefua) { if (bp->b_flags & B_MEDIA_FUA) cdb_flags |= SRWB_FUA; if (bp->b_flags & B_MEDIA_DPO) cdb_flags |= SRWB_DPO; } /* * Fill out the scsi command. Use the smallest CDB possible * (6-byte, 10-byte, or 16-byte). 
If we need FUA or DPO, * need to use 10-byte or bigger, as the 6-byte doesn't support * the flags. */ if (((bp->b_rawblkno & 0x1fffff) == bp->b_rawblkno) && ((nblks & 0xff) == nblks) && !(periph->periph_quirks & PQUIRK_ONLYBIG) && !cdb_flags) { /* 6-byte CDB */ memset(&cmd_small, 0, sizeof(cmd_small)); cmd_small.opcode = (bp->b_flags & B_READ) ? SCSI_READ_6_COMMAND : SCSI_WRITE_6_COMMAND; _lto3b(bp->b_rawblkno, cmd_small.addr); cmd_small.length = nblks & 0xff; cmdlen = sizeof(cmd_small); cmdp = (struct scsipi_generic *)&cmd_small; } else if ((bp->b_rawblkno & 0xffffffff) == bp->b_rawblkno) { /* 10-byte CDB */ memset(&cmd_big, 0, sizeof(cmd_big)); cmd_big.opcode = (bp->b_flags & B_READ) ? READ_10 : WRITE_10; _lto4b(bp->b_rawblkno, cmd_big.addr); _lto2b(nblks, cmd_big.length); cmdlen = sizeof(cmd_big); cmdp = (struct scsipi_generic *)&cmd_big; } else { /* 16-byte CDB */ memset(&cmd16, 0, sizeof(cmd16)); cmd16.opcode = (bp->b_flags & B_READ) ? READ_16 : WRITE_16; _lto8b(bp->b_rawblkno, cmd16.addr); _lto4b(nblks, cmd16.length); cmdlen = sizeof(cmd16); cmdp = (struct scsipi_generic *)&cmd16; } if (cdb_flags) cmdp->bytes[0] = cdb_flags; /* * Figure out what flags to use. */ flags = XS_CTL_NOSLEEP|XS_CTL_ASYNC|XS_CTL_SIMPLE_TAG; if (bp->b_flags & B_READ) flags |= XS_CTL_DATA_IN; else flags |= XS_CTL_DATA_OUT; /* * Call the routine that chats with the adapter. * Note: we cannot sleep as we may be an interrupt */ xs = scsipi_make_xs_locked(periph, cmdp, cmdlen, (u_char *)bp->b_data, bp->b_bcount, SDRETRIES, SD_IO_TIMEOUT, bp, flags); if (__predict_false(xs == NULL)) { /* * out of memory. Keep this buffer in the queue, and * retry later. */ callout_reset(&sd->sc_callout, hz / 2, sdrestart, sd); error = EAGAIN; goto out; } error = scsipi_execute_xs(xs); /* with a scsipi_xfer preallocated, scsipi_command can't fail */ KASSERT(error == 0); out: mutex_exit(chan_mtx(chan)); return error; } /* * Recover I/O request after memory shortage * * Called from callout */ static void sdrestart(void *v) { struct sd_softc *sd = v; struct dk_softc *dksc = &sd->sc_dksc; dk_start(dksc, NULL); } /* * Recover I/O request after memory shortage * * Called from scsipi midlayer when resources have been freed * with channel lock held */ static void sdstart(struct scsipi_periph *periph) { struct sd_softc *sd = device_private(periph->periph_dev); struct dk_softc *dksc = &sd->sc_dksc; struct scsipi_channel *chan = periph->periph_channel; /* * release channel lock as dk_start may need to acquire * other locks * * sdstart is called from scsipi_put_xs and all its callers * release the lock afterwards. So releasing it here * doesn't matter. */ mutex_exit(chan_mtx(chan)); dk_start(dksc, NULL); mutex_enter(chan_mtx(chan)); } static void sddone(struct scsipi_xfer *xs, int error) { struct sd_softc *sd = device_private(xs->xs_periph->periph_dev); struct dk_softc *dksc = &sd->sc_dksc; struct buf *bp = xs->bp; if (sd->flags & SDF_FLUSHING) { /* Flush completed, no longer dirty. */ sd->flags &= ~(SDF_FLUSHING|SDF_DIRTY); } if (bp) { bp->b_error = error; bp->b_resid = xs->resid; if (error) { /* on a read/write error bp->b_resid is zero, so fix */ bp->b_resid = bp->b_bcount; } dk_done(dksc, bp); /* dk_start is called from scsipi_complete */ } } static void sdminphys(struct buf *bp) { struct sd_softc *sd = device_lookup_private(&sd_cd, SDUNIT(bp->b_dev)); struct dk_softc *dksc = &sd->sc_dksc; long xmax; /* * If the device is ancient, we want to make sure that * the transfer fits into a 6-byte cdb. 
* * XXX Note that the SCSI-I spec says that 256-block transfers * are allowed in a 6-byte read/write, and are specified * by setting the "length" to 0. However, we're conservative * here, allowing only 255-block transfers in case an * ancient device gets confused by length == 0. A length of 0 * in a 10-byte read/write actually means 0 blocks. */ if ((sd->flags & SDF_ANCIENT) && ((sd->sc_periph->periph_flags & (PERIPH_REMOVABLE | PERIPH_MEDIA_LOADED)) != PERIPH_REMOVABLE)) { xmax = dksc->sc_dkdev.dk_geom.dg_secsize * 0xff; if (bp->b_bcount > xmax) bp->b_bcount = xmax; } scsipi_adapter_minphys(sd->sc_periph->periph_channel, bp); } static void sd_iosize(device_t dev, int *count) { struct buf B; int bmaj; bmaj = bdevsw_lookup_major(&sd_bdevsw); B.b_dev = MAKESDDEV(bmaj,device_unit(dev),RAW_PART); B.b_bcount = *count; sdminphys(&B); *count = B.b_bcount; } static int sdread(dev_t dev, struct uio *uio, int ioflag) { return (physio(sdstrategy, NULL, dev, B_READ, sdminphys, uio)); } static int sdwrite(dev_t dev, struct uio *uio, int ioflag) { return (physio(sdstrategy, NULL, dev, B_WRITE, sdminphys, uio)); } /* * Perform special action on behalf of the user * Knows about the internals of this device */ static int sdioctl(dev_t dev, u_long cmd, void *addr, int flag, struct lwp *l) { struct sd_softc *sd = device_lookup_private(&sd_cd, SDUNIT(dev)); struct dk_softc *dksc = &sd->sc_dksc; struct scsipi_periph *periph = sd->sc_periph; int part = SDPART(dev); int error; SC_DEBUG(sd->sc_periph, SCSIPI_DB2, ("sdioctl 0x%lx ", cmd)); /* * If the device is not valid, some IOCTLs can still be * handled on the raw partition. Check this here. */ if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0 && part != RAW_PART) return (EIO); switch (cmd) { case DIOCLOCK: if (periph->periph_flags & PERIPH_REMOVABLE) return (scsipi_prevent(periph, (*(int *)addr) ? SPAMR_PREVENT_DT : SPAMR_ALLOW, 0)); else return (ENOTTY); case DIOCEJECT: if ((periph->periph_flags & PERIPH_REMOVABLE) == 0) return (ENOTTY); if (*(int *)addr == 0) { int pmask = __BIT(part); /* * Don't force eject: check that we are the only * partition open. If so, unlock it. */ if (DK_BUSY(dksc, pmask) == 0) { error = scsipi_prevent(periph, SPAMR_ALLOW, XS_CTL_IGNORE_NOT_READY); if (error) return (error); } else { return (EBUSY); } } /* FALLTHROUGH */ case ODIOCEJECT: return ((periph->periph_flags & PERIPH_REMOVABLE) == 0 ? ENOTTY : scsipi_start(periph, SSS_STOP|SSS_LOEJ, 0)); case DIOCGCACHE: return (sd_getcache(sd, (int *) addr)); case DIOCSCACHE: if ((flag & FWRITE) == 0) return (EBADF); return (sd_setcache(sd, *(int *) addr)); case DIOCCACHESYNC: /* * XXX Do we really need to care about having a writable * file descriptor here? 
*/ if ((flag & FWRITE) == 0) return (EBADF); if (((sd->flags & SDF_DIRTY) != 0 || *(int *)addr != 0)) { error = sd_flush(sd, 0); if (error) { sd->flags &= ~SDF_FLUSHING; return (error); } sd->flags &= ~(SDF_FLUSHING|SDF_DIRTY); } return (0); default: error = dk_ioctl(dksc, dev, cmd, addr, flag, l); if (error == ENOTTY) error = scsipi_do_ioctl(periph, dev, cmd, addr, flag, l); return (error); } #ifdef DIAGNOSTIC panic("sdioctl: impossible"); #endif } static void sd_label(device_t self, struct disklabel *lp) { struct sd_softc *sd = device_private(self); strncpy(lp->d_typename, sd->name, 16); lp->d_rpm = sd->params.rot_rate; if (sd->sc_periph->periph_flags & PERIPH_REMOVABLE) lp->d_flags |= D_REMOVABLE; } static bool sd_shutdown(device_t self, int how) { struct sd_softc *sd = device_private(self); struct dk_softc *dksc = &sd->sc_dksc; /* * If the disk cache needs to be flushed, and the disk supports * it, flush it. We're cold at this point, so we poll for * completion. */ if ((sd->flags & SDF_DIRTY) != 0) { if (sd_flush(sd, XS_CTL_NOSLEEP|XS_CTL_POLL)) { aprint_error_dev(dksc->sc_dev, "cache synchronization failed\n"); sd->flags &= ~SDF_FLUSHING; } else sd->flags &= ~(SDF_FLUSHING|SDF_DIRTY); } return true; } static bool sd_suspend(device_t dv, const pmf_qual_t *qual) { return sd_shutdown(dv, boothowto); /* XXX no need to poll */ } /* * Check Errors */ static int sd_interpret_sense(struct scsipi_xfer *xs) { struct scsipi_periph *periph = xs->xs_periph; struct scsipi_channel *chan = periph->periph_channel; struct scsi_sense_data *sense = &xs->sense.scsi_sense; struct sd_softc *sd = device_private(periph->periph_dev); struct dk_softc *dksc = &sd->sc_dksc; int error, retval = EJUSTRETURN; /* * If the periph is already recovering, just do the normal * error processing. */ if (periph->periph_flags & PERIPH_RECOVERING) return (retval); /* * Ignore errors from accessing illegal fields (e.g. trying to * lock the door of a digicam, which doesn't have a door that * can be locked) for the SCSI_PREVENT_ALLOW_MEDIUM_REMOVAL command. */ if (xs->cmd->opcode == SCSI_PREVENT_ALLOW_MEDIUM_REMOVAL && SSD_SENSE_KEY(sense->flags) == SKEY_ILLEGAL_REQUEST && sense->asc == 0x24 && sense->ascq == 0x00) { /* Illegal field in CDB */ if (!(xs->xs_control & XS_CTL_SILENT)) { scsipi_printaddr(periph); printf("no door lock\n"); } xs->xs_control |= XS_CTL_IGNORE_ILLEGAL_REQUEST; return (retval); } /* * If the device is not open yet, let the generic code handle it. */ if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) return (retval); /* * If it isn't a extended or extended/deferred error, let * the generic code handle it. */ if (SSD_RCODE(sense->response_code) != SSD_RCODE_CURRENT && SSD_RCODE(sense->response_code) != SSD_RCODE_DEFERRED) return (retval); if (SSD_SENSE_KEY(sense->flags) == SKEY_NOT_READY && sense->asc == 0x4) { if (sense->ascq == 0x01) { /* * Unit In The Process Of Becoming Ready. 
*/ printf("%s: waiting for pack to spin up...\n", dksc->sc_xname); if (!callout_pending(&periph->periph_callout)) scsipi_periph_freeze(periph, 1); callout_reset(&periph->periph_callout, 5 * hz, scsipi_periph_timed_thaw, periph); retval = ERESTART; } else if (sense->ascq == 0x02) { printf("%s: pack is stopped, restarting...\n", dksc->sc_xname); mutex_enter(chan_mtx(chan)); periph->periph_flags |= PERIPH_RECOVERING; mutex_exit(chan_mtx(chan)); error = scsipi_start(periph, SSS_START, XS_CTL_URGENT|XS_CTL_HEAD_TAG| XS_CTL_THAW_PERIPH|XS_CTL_FREEZE_PERIPH); if (error) { aprint_error_dev(dksc->sc_dev, "unable to restart pack\n"); retval = error; } else retval = ERESTART; mutex_enter(chan_mtx(chan)); periph->periph_flags &= ~PERIPH_RECOVERING; mutex_exit(chan_mtx(chan)); } } if (SSD_SENSE_KEY(sense->flags) == SKEY_MEDIUM_ERROR && sense->asc == 0x31 && sense->ascq == 0x00) { /* maybe for any asq ? */ /* Medium Format Corrupted */ retval = EFTYPE; } return (retval); } static int sdsize(dev_t dev) { struct sd_softc *sd; struct dk_softc *dksc; int unit; unit = SDUNIT(dev); sd = device_lookup_private(&sd_cd, unit); if (sd == NULL) return (-1); dksc = &sd->sc_dksc; if (!device_is_active(dksc->sc_dev)) return (-1); return dk_size(dksc, dev); } /* #define SD_DUMP_NOT_TRUSTED if you just want to watch */ static struct scsipi_xfer sx; /* * dump all of physical memory into the partition specified, starting * at offset 'dumplo' into the partition. */ static int sddump(dev_t dev, daddr_t blkno, void *va, size_t size) { struct sd_softc *sd; struct dk_softc *dksc; struct scsipi_periph *periph; int unit; unit = SDUNIT(dev); if ((sd = device_lookup_private(&sd_cd, unit)) == NULL) return (ENXIO); dksc = &sd->sc_dksc; if (!device_is_active(dksc->sc_dev)) return (ENODEV); periph = sd->sc_periph; /* Make sure it was initialized. */ if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) return (ENXIO); return dk_dump(dksc, dev, blkno, va, size, 0); } static int sd_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk) { struct sd_softc *sd = device_private(dev); struct dk_softc *dksc = &sd->sc_dksc; struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; struct scsipi_rw_10 cmd; /* write command */ struct scsipi_xfer *xs; /* ... convenience */ struct scsipi_periph *periph; struct scsipi_channel *chan; size_t sectorsize; periph = sd->sc_periph; chan = periph->periph_channel; sectorsize = dg->dg_secsize; xs = &sx; #ifndef SD_DUMP_NOT_TRUSTED /* * Fill out the scsi command */ memset(&cmd, 0, sizeof(cmd)); cmd.opcode = WRITE_10; _lto4b(blkno, cmd.addr); _lto2b(nblk, cmd.length); /* * Fill out the scsipi_xfer structure * Note: we cannot sleep as we may be an interrupt * don't use scsipi_command() as it may want to wait * for an xs. */ memset(xs, 0, sizeof(sx)); xs->xs_control |= XS_CTL_NOSLEEP | XS_CTL_POLL | XS_CTL_DATA_OUT; xs->xs_status = 0; xs->xs_periph = periph; xs->xs_retries = SDRETRIES; xs->timeout = 10000; /* 10000 millisecs for a disk ! */ xs->cmd = (struct scsipi_generic *)&cmd; xs->cmdlen = sizeof(cmd); xs->resid = nblk * sectorsize; xs->error = XS_NOERROR; xs->bp = 0; xs->data = va; xs->datalen = nblk * sectorsize; callout_init(&xs->xs_callout, 0); /* * Pass all this info to the scsi driver. */ scsipi_adapter_request(chan, ADAPTER_REQ_RUN_XFER, xs); if ((xs->xs_status & XS_STS_DONE) == 0 || xs->error != XS_NOERROR) return (EIO); #else /* SD_DUMP_NOT_TRUSTED */ /* Let's just talk about this first... 
*/ printf("sd%d: dump addr 0x%x, blk %d\n", unit, va, blkno); delay(500 * 1000); /* half a second */ #endif /* SD_DUMP_NOT_TRUSTED */ return (0); } static int sd_mode_sense(struct sd_softc *sd, u_int8_t byte2, void *sense, size_t size, int page, int flags, int *big) { if ((sd->sc_periph->periph_quirks & PQUIRK_ONLYBIG) && !(sd->sc_periph->periph_quirks & PQUIRK_NOBIGMODESENSE)) { *big = 1; return scsipi_mode_sense_big(sd->sc_periph, byte2, page, sense, size + sizeof(struct scsi_mode_parameter_header_10), flags, SDRETRIES, 6000); } else { *big = 0; return scsipi_mode_sense(sd->sc_periph, byte2, page, sense, size + sizeof(struct scsi_mode_parameter_header_6), flags, SDRETRIES, 6000); } } static int sd_mode_select(struct sd_softc *sd, u_int8_t byte2, void *sense, size_t size, int flags, int big) { if (big) { struct scsi_mode_parameter_header_10 *header = sense; _lto2b(0, header->data_length); return scsipi_mode_select_big(sd->sc_periph, byte2, sense, size + sizeof(struct scsi_mode_parameter_header_10), flags, SDRETRIES, 6000); } else { struct scsi_mode_parameter_header_6 *header = sense; header->data_length = 0; return scsipi_mode_select(sd->sc_periph, byte2, sense, size + sizeof(struct scsi_mode_parameter_header_6), flags, SDRETRIES, 6000); } } /* * sd_validate_blksize: * * Validate the block size. Print error if periph is specified, */ static int sd_validate_blksize(struct scsipi_periph *periph, int len) { if (len >= 256 && powerof2(len) && len <= 4096) { return 1; } if (periph) { scsipi_printaddr(periph); printf("%s sector size: 0x%x. Defaulting to %d bytes.\n", !powerof2(len) ? "preposterous" : "unsupported", len, SD_DEFAULT_BLKSIZE); } return 0; } /* * sd_read_capacity: * * Find out from the device what its capacity is. */ static u_int64_t sd_read_capacity(struct scsipi_periph *periph, int *blksize, int flags) { union { struct scsipi_read_capacity_10 cmd; struct scsipi_read_capacity_16 cmd16; } cmd; union { struct scsipi_read_capacity_10_data data; struct scsipi_read_capacity_16_data data16; } *datap; uint64_t rv; memset(&cmd, 0, sizeof(cmd)); cmd.cmd.opcode = READ_CAPACITY_10; /* * Don't allocate data buffer on stack; * The lower driver layer might use the same stack and * if it uses region which is in the same cacheline, * cache flush ops against the data buffer won't work properly. */ datap = malloc(sizeof(*datap), M_TEMP, M_WAITOK); if (datap == NULL) return 0; /* * If the command works, interpret the result as a 4 byte * number of blocks */ rv = 0; memset(datap, 0, sizeof(datap->data)); if (scsipi_command(periph, (void *)&cmd.cmd, sizeof(cmd.cmd), (void *)datap, sizeof(datap->data), SCSIPIRETRIES, 20000, NULL, flags | XS_CTL_DATA_IN | XS_CTL_SILENT) != 0) goto out; if (_4btol(datap->data.addr) != 0xffffffff) { *blksize = _4btol(datap->data.length); rv = _4btol(datap->data.addr) + 1; goto out; } /* * Device is larger than can be reflected by READ CAPACITY (10). * Try READ CAPACITY (16). 
*/ memset(&cmd, 0, sizeof(cmd)); cmd.cmd16.opcode = READ_CAPACITY_16; cmd.cmd16.byte2 = SRC16_SERVICE_ACTION; _lto4b(sizeof(datap->data16), cmd.cmd16.len); memset(datap, 0, sizeof(datap->data16)); if (scsipi_command(periph, (void *)&cmd.cmd16, sizeof(cmd.cmd16), (void *)datap, sizeof(datap->data16), SCSIPIRETRIES, 20000, NULL, flags | XS_CTL_DATA_IN | XS_CTL_SILENT) != 0) goto out; *blksize = _4btol(datap->data16.length); rv = _8btol(datap->data16.addr) + 1; out: free(datap, M_TEMP); return rv; } static int sd_get_simplifiedparms(struct sd_softc *sd, struct disk_parms *dp, int flags) { struct { struct scsi_mode_parameter_header_6 header; /* no block descriptor */ u_int8_t pg_code; /* page code (should be 6) */ u_int8_t pg_length; /* page length (should be 11) */ u_int8_t wcd; /* bit0: cache disable */ u_int8_t lbs[2]; /* logical block size */ u_int8_t size[5]; /* number of log. blocks */ u_int8_t pp; /* power/performance */ u_int8_t flags; u_int8_t resvd; } scsipi_sense; u_int64_t blocks; int error, blksize; /* * sd_read_capacity (ie "read capacity") and mode sense page 6 * give the same information. Do both for now, and check * for consistency. * XXX probably differs for removable media */ dp->blksize = SD_DEFAULT_BLKSIZE; if ((blocks = sd_read_capacity(sd->sc_periph, &blksize, flags)) == 0) return (SDGP_RESULT_OFFLINE); /* XXX? */ error = scsipi_mode_sense(sd->sc_periph, SMS_DBD, 6, &scsipi_sense.header, sizeof(scsipi_sense), flags, SDRETRIES, 6000); if (error != 0) return (SDGP_RESULT_OFFLINE); /* XXX? */ dp->blksize = blksize; if (!sd_validate_blksize(NULL, dp->blksize)) dp->blksize = _2btol(scsipi_sense.lbs); if (!sd_validate_blksize(sd->sc_periph, dp->blksize)) dp->blksize = SD_DEFAULT_BLKSIZE; /* * Create a pseudo-geometry. */ dp->heads = 64; dp->sectors = 32; dp->cyls = blocks / (dp->heads * dp->sectors); dp->disksize = _5btol(scsipi_sense.size); if (dp->disksize <= UINT32_MAX && dp->disksize != blocks) { printf("RBC size: mode sense=%llu, get cap=%llu\n", (unsigned long long)dp->disksize, (unsigned long long)blocks); dp->disksize = blocks; } dp->disksize512 = (dp->disksize * dp->blksize) / DEV_BSIZE; return (SDGP_RESULT_OK); } /* * Get the scsi driver to send a full inquiry to the * device and use the * results to fill out the disk parameter structure. 
*/ static int sd_get_capacity(struct sd_softc *sd, struct disk_parms *dp, int flags) { u_int64_t blocks; int error, blksize; #if 0 int i; u_int8_t *p; #endif dp->disksize = blocks = sd_read_capacity(sd->sc_periph, &blksize, flags); if (blocks == 0) { struct scsipi_read_format_capacities cmd; struct { struct scsipi_capacity_list_header header; struct scsipi_capacity_descriptor desc; } __packed data; memset(&cmd, 0, sizeof(cmd)); memset(&data, 0, sizeof(data)); cmd.opcode = READ_FORMAT_CAPACITIES; _lto2b(sizeof(data), cmd.length); error = scsipi_command(sd->sc_periph, (void *)&cmd, sizeof(cmd), (void *)&data, sizeof(data), SDRETRIES, 20000, NULL, flags | XS_CTL_DATA_IN); if (error == EFTYPE) { /* Medium Format Corrupted, handle as not formatted */ return (SDGP_RESULT_UNFORMATTED); } if (error || data.header.length == 0) return (SDGP_RESULT_OFFLINE); #if 0 printf("rfc: length=%d\n", data.header.length); printf("rfc result:"); for (i = sizeof(struct scsipi_capacity_list_header) + data.header.length, p = (void *)&data; i; i--, p++) printf(" %02x", *p); printf("\n"); #endif switch (data.desc.byte5 & SCSIPI_CAP_DESC_CODE_MASK) { case SCSIPI_CAP_DESC_CODE_RESERVED: case SCSIPI_CAP_DESC_CODE_FORMATTED: break; case SCSIPI_CAP_DESC_CODE_UNFORMATTED: return (SDGP_RESULT_UNFORMATTED); case SCSIPI_CAP_DESC_CODE_NONE: return (SDGP_RESULT_OFFLINE); } dp->disksize = blocks = _4btol(data.desc.nblks); if (blocks == 0) return (SDGP_RESULT_OFFLINE); /* XXX? */ blksize = _3btol(data.desc.blklen); } else if (!sd_validate_blksize(NULL, blksize)) { struct sd_mode_sense_data scsipi_sense; int big, bsize; struct scsi_general_block_descriptor *bdesc; memset(&scsipi_sense, 0, sizeof(scsipi_sense)); error = sd_mode_sense(sd, 0, &scsipi_sense, sizeof(scsipi_sense.blk_desc), 0, flags | XS_CTL_SILENT, &big); if (!error) { if (big) { bdesc = (void *)(&scsipi_sense.header.big + 1); bsize = _2btol(scsipi_sense.header.big.blk_desc_len); } else { bdesc = (void *)(&scsipi_sense.header.small + 1); bsize = scsipi_sense.header.small.blk_desc_len; } #if 0 printf("page 0 sense:"); for (i = sizeof(scsipi_sense), p = (void *)&scsipi_sense; i; i--, p++) printf(" %02x", *p); printf("\n"); printf("page 0 bsize=%d\n", bsize); printf("page 0 ok\n"); #endif if (bsize >= 8) { blksize = _3btol(bdesc->blklen); } } } if (!sd_validate_blksize(sd->sc_periph, blksize)) blksize = SD_DEFAULT_BLKSIZE; dp->blksize = blksize; dp->disksize512 = (blocks * dp->blksize) / DEV_BSIZE; return (0); } static int sd_get_parms_page4(struct sd_softc *sd, struct disk_parms *dp, int flags) { struct sd_mode_sense_data scsipi_sense; int error; int big, byte2; size_t poffset; union scsi_disk_pages *pages; byte2 = SMS_DBD; again: memset(&scsipi_sense, 0, sizeof(scsipi_sense)); error = sd_mode_sense(sd, byte2, &scsipi_sense, (byte2 ? 
0 : sizeof(scsipi_sense.blk_desc)) + sizeof(scsipi_sense.pages.rigid_geometry), 4, flags | XS_CTL_SILENT, &big); if (error) { if (byte2 == SMS_DBD) { /* No result; try once more with DBD off */ byte2 = 0; goto again; } return (error); } if (big) { poffset = sizeof scsipi_sense.header.big; poffset += _2btol(scsipi_sense.header.big.blk_desc_len); } else { poffset = sizeof scsipi_sense.header.small; poffset += scsipi_sense.header.small.blk_desc_len; } if (poffset > sizeof(scsipi_sense) - sizeof(pages->rigid_geometry)) return ERESTART; pages = (void *)((u_long)&scsipi_sense + poffset); #if 0 { size_t i; u_int8_t *p; printf("page 4 sense:"); for (i = sizeof(scsipi_sense), p = (void *)&scsipi_sense; i; i--, p++) printf(" %02x", *p); printf("\n"); printf("page 4 pg_code=%d sense=%p/%p\n", pages->rigid_geometry.pg_code, &scsipi_sense, pages); } #endif if ((pages->rigid_geometry.pg_code & PGCODE_MASK) != 4) return (ERESTART); SC_DEBUG(sd->sc_periph, SCSIPI_DB3, ("%d cyls, %d heads, %d precomp, %d red_write, %d land_zone\n", _3btol(pages->rigid_geometry.ncyl), pages->rigid_geometry.nheads, _2btol(pages->rigid_geometry.st_cyl_wp), _2btol(pages->rigid_geometry.st_cyl_rwc), _2btol(pages->rigid_geometry.land_zone))); /* * KLUDGE!! (for zone recorded disks) * give a number of sectors so that sec * trks * cyls * is <= disk_size * can lead to wasted space! THINK ABOUT THIS ! */ dp->heads = pages->rigid_geometry.nheads; dp->cyls = _3btol(pages->rigid_geometry.ncyl); if (dp->heads == 0 || dp->cyls == 0) return (ERESTART); dp->sectors = dp->disksize / (dp->heads * dp->cyls); /* XXX */ dp->rot_rate = _2btol(pages->rigid_geometry.rpm); if (dp->rot_rate == 0) dp->rot_rate = 3600; #if 0 printf("page 4 ok\n"); #endif return (0); } static int sd_get_parms_page5(struct sd_softc *sd, struct disk_parms *dp, int flags) { struct sd_mode_sense_data scsipi_sense; int error; int big, byte2; size_t poffset; union scsi_disk_pages *pages; byte2 = SMS_DBD; again: memset(&scsipi_sense, 0, sizeof(scsipi_sense)); error = sd_mode_sense(sd, 0, &scsipi_sense, (byte2 ? 
0 : sizeof(scsipi_sense.blk_desc)) + sizeof(scsipi_sense.pages.flex_geometry), 5, flags | XS_CTL_SILENT, &big); if (error) { if (byte2 == SMS_DBD) { /* No result; try once more with DBD off */ byte2 = 0; goto again; } return (error); } if (big) { poffset = sizeof scsipi_sense.header.big; poffset += _2btol(scsipi_sense.header.big.blk_desc_len); } else { poffset = sizeof scsipi_sense.header.small; poffset += scsipi_sense.header.small.blk_desc_len; } if (poffset > sizeof(scsipi_sense) - sizeof(pages->flex_geometry)) return ERESTART; pages = (void *)((u_long)&scsipi_sense + poffset); #if 0 { size_t i; u_int8_t *p; printf("page 5 sense:"); for (i = sizeof(scsipi_sense), p = (void *)&scsipi_sense; i; i--, p++) printf(" %02x", *p); printf("\n"); printf("page 5 pg_code=%d sense=%p/%p\n", pages->flex_geometry.pg_code, &scsipi_sense, pages); } #endif if ((pages->flex_geometry.pg_code & PGCODE_MASK) != 5) return (ERESTART); SC_DEBUG(sd->sc_periph, SCSIPI_DB3, ("%d cyls, %d heads, %d sec, %d bytes/sec\n", _3btol(pages->flex_geometry.ncyl), pages->flex_geometry.nheads, pages->flex_geometry.ph_sec_tr, _2btol(pages->flex_geometry.bytes_s))); dp->heads = pages->flex_geometry.nheads; dp->cyls = _2btol(pages->flex_geometry.ncyl); dp->sectors = pages->flex_geometry.ph_sec_tr; if (dp->heads == 0 || dp->cyls == 0 || dp->sectors == 0) return (ERESTART); dp->rot_rate = _2btol(pages->rigid_geometry.rpm); if (dp->rot_rate == 0) dp->rot_rate = 3600; #if 0 printf("page 5 ok\n"); #endif return (0); } static int sd_get_parms(struct sd_softc *sd, struct disk_parms *dp, int flags) { struct dk_softc *dksc = &sd->sc_dksc; int error; /* * If offline, the SDEV_MEDIA_LOADED flag will be * cleared by the caller if necessary. */ if (sd->type == T_SIMPLE_DIRECT) { error = sd_get_simplifiedparms(sd, dp, flags); if (!error) goto setprops; return (error); } error = sd_get_capacity(sd, dp, flags); if (error) return (error); if (sd->type == T_OPTICAL) goto page0; if (sd->sc_periph->periph_flags & PERIPH_REMOVABLE) { if (!sd_get_parms_page5(sd, dp, flags) || !sd_get_parms_page4(sd, dp, flags)) goto setprops; } else { if (!sd_get_parms_page4(sd, dp, flags) || !sd_get_parms_page5(sd, dp, flags)) goto setprops; } page0: printf("%s: fabricating a geometry\n", dksc->sc_xname); /* Try calling driver's method for figuring out geometry. */ if (!sd->sc_periph->periph_channel->chan_adapter->adapt_getgeom || !(*sd->sc_periph->periph_channel->chan_adapter->adapt_getgeom) (sd->sc_periph, dp, dp->disksize)) { /* * Use adaptec standard fictitious geometry * this depends on which controller (e.g. 1542C is * different. but we have to put SOMETHING here..) */ dp->heads = 64; dp->sectors = 32; dp->cyls = dp->disksize / (64 * 32); } dp->rot_rate = 3600; setprops: sd_set_geometry(sd); return (SDGP_RESULT_OK); } static int sd_flush(struct sd_softc *sd, int flags) { struct scsipi_periph *periph = sd->sc_periph; struct scsi_synchronize_cache_10 cmd; /* * If the device is SCSI-2, issue a SYNCHRONIZE CACHE. * We issue with address 0 length 0, which should be * interpreted by the device as "all remaining blocks * starting at address 0". We ignore ILLEGAL REQUEST * in the event that the command is not supported by * the device, and poll for completion so that we know * that the cache has actually been flushed. * * Unless, that is, the device can't handle the SYNCHRONIZE CACHE * command, as indicated by our quirks flags. * * XXX What about older devices? 
*/ if (periph->periph_version < 2 || (periph->periph_quirks & PQUIRK_NOSYNCCACHE)) return (0); sd->flags |= SDF_FLUSHING; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SCSI_SYNCHRONIZE_CACHE_10; return (scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0, SDRETRIES, 100000, NULL, flags | XS_CTL_IGNORE_ILLEGAL_REQUEST)); } static int sd_getcache(struct sd_softc *sd, int *bitsp) { struct scsipi_periph *periph = sd->sc_periph; struct sd_mode_sense_data scsipi_sense; int error, bits = 0; int big; union scsi_disk_pages *pages; uint8_t dev_spec; /* only SCSI-2 and later supported */ if (periph->periph_version < 2) return (EOPNOTSUPP); memset(&scsipi_sense, 0, sizeof(scsipi_sense)); error = sd_mode_sense(sd, SMS_DBD, &scsipi_sense, sizeof(scsipi_sense.pages.caching_params), 8, XS_CTL_SILENT, &big); if (error) return (error); if (big) { pages = (void *)(&scsipi_sense.header.big + 1); dev_spec = scsipi_sense.header.big.dev_spec; } else { pages = (void *)(&scsipi_sense.header.small + 1); dev_spec = scsipi_sense.header.small.dev_spec; } if ((pages->caching_params.flags & CACHING_RCD) == 0) bits |= DKCACHE_READ; if (pages->caching_params.flags & CACHING_WCE) bits |= DKCACHE_WRITE; if (pages->caching_params.pg_code & PGCODE_PS) bits |= DKCACHE_SAVE; /* * Support for FUA/DPO, defined starting with SCSI-2. Use only * if device claims to support it, according to the MODE SENSE. */ if (!(periph->periph_quirks & PQUIRK_NOFUA) && ISSET(dev_spec, SMH_DSP_DPOFUA)) bits |= DKCACHE_FUA | DKCACHE_DPO; memset(&scsipi_sense, 0, sizeof(scsipi_sense)); error = sd_mode_sense(sd, SMS_DBD, &scsipi_sense, sizeof(scsipi_sense.pages.caching_params), SMS_PCTRL_CHANGEABLE|8, XS_CTL_SILENT, &big); if (error == 0) { if (big) pages = (void *)(&scsipi_sense.header.big + 1); else pages = (void *)(&scsipi_sense.header.small + 1); if (pages->caching_params.flags & CACHING_RCD) bits |= DKCACHE_RCHANGE; if (pages->caching_params.flags & CACHING_WCE) bits |= DKCACHE_WCHANGE; } *bitsp = bits; return (0); } static int sd_setcache(struct sd_softc *sd, int bits) { struct scsipi_periph *periph = sd->sc_periph; struct sd_mode_sense_data scsipi_sense; int error; uint8_t oflags, byte2 = 0; int big; union scsi_disk_pages *pages; if (periph->periph_version < 2) return (EOPNOTSUPP); memset(&scsipi_sense, 0, sizeof(scsipi_sense)); error = sd_mode_sense(sd, SMS_DBD, &scsipi_sense, sizeof(scsipi_sense.pages.caching_params), 8, 0, &big); if (error) return (error); if (big) pages = (void *)(&scsipi_sense.header.big + 1); else pages = (void *)(&scsipi_sense.header.small + 1); oflags = pages->caching_params.flags; if (bits & DKCACHE_READ) pages->caching_params.flags &= ~CACHING_RCD; else pages->caching_params.flags |= CACHING_RCD; if (bits & DKCACHE_WRITE) pages->caching_params.flags |= CACHING_WCE; else pages->caching_params.flags &= ~CACHING_WCE; if (oflags == pages->caching_params.flags) return (0); pages->caching_params.pg_code &= PGCODE_MASK; if (bits & DKCACHE_SAVE) byte2 |= SMS_SP; return (sd_mode_select(sd, byte2|SMS_PF, &scsipi_sense, sizeof(struct scsi_mode_page_header) + pages->caching_params.pg_length, 0, big)); } static void sd_set_geometry(struct sd_softc *sd) { struct dk_softc *dksc = &sd->sc_dksc; struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; memset(dg, 0, sizeof(*dg)); dg->dg_secperunit = sd->params.disksize; dg->dg_secsize = sd->params.blksize; dg->dg_nsectors = sd->params.sectors; dg->dg_ntracks = sd->params.heads; dg->dg_ncylinders = sd->params.cyls; disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, sd->typename); }
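The sdioctl() handler above services the disk-cache ioctls (DIOCGCACHE, DIOCSCACHE and DIOCCACHESYNC), backed by sd_getcache()/sd_setcache(). As an illustrative sketch only, not part of the driver, a userland program can query those cache bits through the raw partition; the device path /dev/rsd0d and raw-partition letter are assumptions for a typical port.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/dkio.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd, bits = 0;

	/* Open the raw partition; no data transfer is needed for ioctls. */
	if ((fd = open("/dev/rsd0d", O_RDONLY)) == -1)
		err(1, "open /dev/rsd0d");	/* hypothetical device path */

	/* DIOCGCACHE fills an int with DKCACHE_* flags (cf. sd_getcache()). */
	if (ioctl(fd, DIOCGCACHE, &bits) == -1)
		err(1, "DIOCGCACHE");

	printf("read cache: %s, write cache: %s%s\n",
	    (bits & DKCACHE_READ) ? "enabled" : "disabled",
	    (bits & DKCACHE_WRITE) ? "enabled" : "disabled",
	    (bits & DKCACHE_SAVE) ? " (save supported)" : "");

	close(fd);
	return 0;
}

The dkctl(8) utility's getcache/setcache commands exercise the same ioctls and are the usual administrative interface.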
/* $NetBSD: ext2fs_vfsops.c,v 1.225 2023/08/27 16:35:51 christos Exp $ */ /* * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.14 (Berkeley) 11/28/94 * Modified for ext2fs by Manuel Bouyer. */ /* * Copyright (c) 1997 Manuel Bouyer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.14 (Berkeley) 11/28/94 * Modified for ext2fs by Manuel Bouyer. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ext2fs_vfsops.c,v 1.225 2023/08/27 16:35:51 christos Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/namei.h> #include <sys/proc.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/socket.h> #include <sys/mount.h> #include <sys/buf.h> #include <sys/device.h> #include <sys/file.h> #include <sys/disklabel.h> #include <sys/ioctl.h> #include <sys/errno.h> #include <sys/pool.h> #include <sys/lock.h> #include <sys/conf.h> #include <sys/kauth.h> #include <sys/module.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ext2fs/ext2fs.h> #include <ufs/ext2fs/ext2fs_dir.h> #include <ufs/ext2fs/ext2fs_extern.h> MODULE(MODULE_CLASS_VFS, ext2fs, "ufs"); int ext2fs_sbupdate(struct ufsmount *, int); static int ext2fs_sbfill(struct m_ext2fs *, int); extern const struct vnodeopv_desc ext2fs_vnodeop_opv_desc; extern const struct vnodeopv_desc ext2fs_specop_opv_desc; extern const struct vnodeopv_desc ext2fs_fifoop_opv_desc; const struct vnodeopv_desc * const ext2fs_vnodeopv_descs[] = { &ext2fs_vnodeop_opv_desc, &ext2fs_specop_opv_desc, &ext2fs_fifoop_opv_desc, NULL, }; struct vfsops ext2fs_vfsops = { .vfs_name = MOUNT_EXT2FS, .vfs_min_mount_data = sizeof (struct ufs_args), .vfs_mount = ext2fs_mount, .vfs_start = ufs_start, .vfs_unmount = ext2fs_unmount, .vfs_root = ufs_root, .vfs_quotactl = ufs_quotactl, .vfs_statvfs = ext2fs_statvfs, .vfs_sync = ext2fs_sync, .vfs_vget = ufs_vget, .vfs_loadvnode = ext2fs_loadvnode, .vfs_newvnode = ext2fs_newvnode, .vfs_fhtovp = ext2fs_fhtovp, .vfs_vptofh = ext2fs_vptofh, .vfs_init = ext2fs_init, .vfs_reinit = ext2fs_reinit, .vfs_done = ext2fs_done, .vfs_mountroot = ext2fs_mountroot, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = ext2fs_vnodeopv_descs }; static const struct genfs_ops ext2fs_genfsops = { .gop_size = genfs_size, .gop_alloc = ext2fs_gop_alloc, .gop_write = genfs_gop_write, .gop_markupdate = ufs_gop_markupdate, .gop_putrange = genfs_gop_putrange, }; static const struct ufs_ops ext2fs_ufsops = { .uo_itimes = ext2fs_itimes, .uo_update = ext2fs_update, .uo_bufrd = ext2fs_bufrd, .uo_bufwr = ext2fs_bufwr, }; static void e2fs_cgload(const char *ondisk, struct ext2_gd *inmemory, int cg_size, int shift_cg_entry_size) { if (shift_cg_entry_size == 6) { memcpy(inmemory, ondisk, cg_size); return; } const char *iptr = ondisk; struct ext2_gd *optr = inmemory; int sh = 1 << shift_cg_entry_size; int lim = cg_size >> shift_cg_entry_size; if (shift_cg_entry_size > 6) { for (int i = 0; i < lim; i++, optr++, iptr += sh) { memcpy(optr, iptr, sizeof(*optr)); } } else { for (int i = 0; i < lim; i++, optr++, iptr += sh) { memcpy(optr, iptr, E2FS_REV0_GD_SIZE); memset((char *)optr + E2FS_REV0_GD_SIZE, 0, sizeof(*optr) - E2FS_REV0_GD_SIZE); } } } static void e2fs_cgsave(const struct ext2_gd *inmemory, char *ondisk, int cg_size, int shift_cg_entry_size) { if (shift_cg_entry_size == 6) { memcpy(ondisk, inmemory, cg_size); return; } const struct ext2_gd *iptr = inmemory; char *optr = ondisk; int sh = 1 << shift_cg_entry_size; int lim = cg_size >> 
shift_cg_entry_size; if (shift_cg_entry_size > 6) { for (int i = 0; i < lim; i++, iptr++, optr += sh) { memcpy(optr, iptr, sizeof(*iptr)); memset(optr + sizeof(*iptr), 0, sh - sizeof(*iptr)); } } else { for (int i = 0; i < lim; i++, iptr++, optr += sh) { memcpy(optr, iptr, E2FS_REV0_GD_SIZE); } } } /* Fill in the inode uid/gid from ext2 halves. */ void ext2fs_set_inode_guid(struct inode *ip) { ip->i_gid = ip->i_e2fs_gid; ip->i_uid = ip->i_e2fs_uid; if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) { ip->i_gid |= ip->i_e2fs_gid_high << 16; ip->i_uid |= ip->i_e2fs_uid_high << 16; } } SYSCTL_SETUP(ext2fs_sysctl_setup, "ext2fs sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ext2fs", SYSCTL_DESCR("Linux EXT2FS file system"), NULL, 0, NULL, 0, CTL_VFS, 17, CTL_EOL); /* * XXX the "17" above could be dynamic, thereby eliminating * one more instance of the "number to vfs" mapping problem, * but "17" is the order as taken from sys/mount.h */ } static int ext2fs_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&ext2fs_vfsops); if (error != 0) break; break; case MODULE_CMD_FINI: error = vfs_detach(&ext2fs_vfsops); if (error != 0) break; break; default: error = ENOTTY; break; } return error; } /* * XXX Same structure as FFS inodes? Should we share a common pool? */ struct pool ext2fs_inode_pool; extern u_long ext2gennumber; void ext2fs_init(void) { pool_init(&ext2fs_inode_pool, sizeof(struct inode), 0, 0, 0, "ext2fsinopl", &pool_allocator_nointr, IPL_NONE); ufs_init(); } void ext2fs_reinit(void) { ufs_reinit(); } void ext2fs_done(void) { ufs_done(); pool_destroy(&ext2fs_inode_pool); } static void ext2fs_sb_setmountinfo(struct m_ext2fs *fs, struct mount *mp) { (void)strlcpy(fs->e2fs_fsmnt, mp->mnt_stat.f_mntonname, sizeof(fs->e2fs_fsmnt)); if (fs->e2fs_ronly == 0 && fs->e2fs.e2fs_rev > E2FS_REV0) { (void)strlcpy(fs->e2fs.e2fs_fsmnt, mp->mnt_stat.f_mntonname, sizeof(fs->e2fs.e2fs_fsmnt)); fs->e2fs.e2fs_mtime = time_second; fs->e2fs.e2fs_mnt_count++; fs->e2fs_fmod = 1; } } /* * Called by main() when ext2fs is going to be mounted as root. * * Name is updated by mount(8) after booting. */ int ext2fs_mountroot(void) { extern struct vnode *rootvp; struct m_ext2fs *fs; struct mount *mp; struct ufsmount *ump; int error; if (device_class(root_device) != DV_DISK) return ENODEV; if ((error = vfs_rootmountalloc(MOUNT_EXT2FS, "root_device", &mp))) { vrele(rootvp); return error; } if ((error = ext2fs_mountfs(rootvp, mp)) != 0) { vfs_unbusy(mp); vfs_rele(mp); return error; } mountlist_append(mp); ump = VFSTOUFS(mp); fs = ump->um_e2fs; ext2fs_sb_setmountinfo(fs, mp); (void)ext2fs_statvfs(mp, &mp->mnt_stat); vfs_unbusy(mp); setrootfstime((time_t)fs->e2fs.e2fs_wtime); return 0; } /* * VFS Operations. * * mount system call */ int ext2fs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; struct vnode *devvp; struct ufs_args *args = data; struct ufsmount *ump = NULL; struct m_ext2fs *fs; int error = 0, flags, update; mode_t accessmode; if (args == NULL) return EINVAL; if (*data_len < sizeof *args) return EINVAL; if (mp->mnt_flag & MNT_GETARGS) { ump = VFSTOUFS(mp); if (ump == NULL) return EIO; memset(args, 0, sizeof *args); args->fspec = NULL; *data_len = sizeof *args; return 0; } update = mp->mnt_flag & MNT_UPDATE; /* Check arguments */ if (args->fspec != NULL) { /* * Look up the name and verify that it's sane. 
*/ error = namei_simple_user(args->fspec, NSM_FOLLOW_NOEMULROOT, &devvp); if (error != 0) return error; if (!update) { /* * Be sure this is a valid block device */ if (devvp->v_type != VBLK) error = ENOTBLK; else if (bdevsw_lookup(devvp->v_rdev) == NULL) error = ENXIO; } else { /* * Be sure we're still naming the same device * used for our initial mount */ ump = VFSTOUFS(mp); if (devvp != ump->um_devvp) { if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; else { vrele(devvp); devvp = ump->um_devvp; vref(devvp); } } } } else { if (!update) { /* New mounts must have a filename for the device */ return EINVAL; } else { ump = VFSTOUFS(mp); devvp = ump->um_devvp; vref(devvp); } } /* * If mount by non-root, then verify that user has necessary * permissions on the device. * * Permission to update a mount is checked higher, so here we presume * updating the mount is okay (for example, as far as securelevel goes) * which leaves us with the normal check. */ if (error == 0) { accessmode = VREAD; if (update ? (mp->mnt_iflag & IMNT_WANTRDWR) != 0 : (mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_DEVICE, mp, devvp, KAUTH_ARG(accessmode)); VOP_UNLOCK(devvp); } if (error) { vrele(devvp); return error; } if (!update) { int xflags; if (mp->mnt_flag & MNT_RDONLY) xflags = FREAD; else xflags = FREAD|FWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_OPEN(devvp, xflags, FSCRED); VOP_UNLOCK(devvp); if (error) goto fail; error = ext2fs_mountfs(devvp, mp); if (error) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); (void)VOP_CLOSE(devvp, xflags, NOCRED); VOP_UNLOCK(devvp); goto fail; } ump = VFSTOUFS(mp); fs = ump->um_e2fs; } else { /* * Update the mount. */ /* * The initial mount got a reference on this * device, so drop the one obtained via * namei(), above. */ vrele(devvp); ump = VFSTOUFS(mp); fs = ump->um_e2fs; if (fs->e2fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { /* * Changing from r/w to r/o */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = ext2fs_flushfiles(mp, flags); if (error == 0 && ext2fs_cgupdate(ump, MNT_WAIT) == 0 && (fs->e2fs.e2fs_state & E2FS_ERRORS) == 0) { fs->e2fs.e2fs_state = E2FS_ISCLEAN; (void) ext2fs_sbupdate(ump, MNT_WAIT); } if (error) return error; fs->e2fs_ronly = 1; } if (mp->mnt_flag & MNT_RELOAD) { error = ext2fs_reload(mp, l->l_cred, l); if (error) return error; } if (fs->e2fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { /* * Changing from read-only to read/write */ fs->e2fs_ronly = 0; if (fs->e2fs.e2fs_state == E2FS_ISCLEAN) fs->e2fs.e2fs_state = 0; else fs->e2fs.e2fs_state = E2FS_ERRORS; fs->e2fs_fmod = 1; } if (args->fspec == NULL) return 0; } error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); if (error == 0) ext2fs_sb_setmountinfo(fs, mp); if (fs->e2fs_fmod != 0) { /* XXX */ fs->e2fs_fmod = 0; if (fs->e2fs.e2fs_state == 0) fs->e2fs.e2fs_wtime = time_second; else printf("%s: file system not clean; please fsck(8)\n", mp->mnt_stat.f_mntfromname); (void) ext2fs_cgupdate(ump, MNT_WAIT); } return error; fail: vrele(devvp); return error; } /* * Sanity check the disk vnode content, and copy it over to inode structure. 
*/ static int ext2fs_loadvnode_content(struct m_ext2fs *fs, ino_t ino, struct buf *bp, struct inode *ip) { struct ext2fs_dinode *din; int error = 0; din = (struct ext2fs_dinode *)((char *)bp->b_data + (ino_to_fsbo(fs, ino) * EXT2_DINODE_SIZE(fs))); /* sanity checks - inode data NOT byteswapped at this point */ if (EXT2_DINODE_FITS(din, e2di_extra_isize, EXT2_DINODE_SIZE(fs)) && (EXT2_DINODE_SIZE(fs) - EXT2_REV0_DINODE_SIZE) < fs2h16(din->e2di_extra_isize)) { printf("ext2fs: inode %"PRIu64" bad extra_isize %u", ino, din->e2di_extra_isize); error = EINVAL; goto bad; } /* everything alright, proceed with copy */ if (ip->i_din.e2fs_din == NULL) ip->i_din.e2fs_din = kmem_alloc(EXT2_DINODE_SIZE(fs), KM_SLEEP); e2fs_iload(din, ip->i_din.e2fs_din, EXT2_DINODE_SIZE(fs)); ext2fs_set_inode_guid(ip); bad: return error; } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ int ext2fs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l) { struct vnode *vp, *devvp; struct inode *ip; struct buf *bp; struct m_ext2fs *fs; struct ext2fs *newfs; int i, error; struct ufsmount *ump; struct vnode_iterator *marker; if ((mp->mnt_flag & MNT_RDONLY) == 0) return EINVAL; ump = VFSTOUFS(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = ump->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = vinvalbuf(devvp, 0, cred, l, 0, 0); VOP_UNLOCK(devvp); if (error) panic("ext2fs_reload: dirty1"); fs = ump->um_e2fs; /* * Step 2: re-read superblock from disk. Copy in new superblock, and * compute in-memory values. */ error = bread(devvp, SBLOCK, SBSIZE, 0, &bp); if (error) return error; newfs = (struct ext2fs *)bp->b_data; e2fs_sbload(newfs, &fs->e2fs); brelse(bp, 0); error = ext2fs_sbfill(fs, (mp->mnt_flag & MNT_RDONLY) != 0); if (error) return error; /* * Step 3: re-read summary information from disk. */ for (i = 0; i < fs->e2fs_ngdb; i++) { error = bread(devvp , EXT2_FSBTODB(fs, fs->e2fs.e2fs_first_dblock + 1 /* superblock */ + i), fs->e2fs_bsize, 0, &bp); if (error) { return error; } e2fs_cgload(bp->b_data, &fs->e2fs_gd[i * fs->e2fs_bsize / sizeof(struct ext2_gd)], fs->e2fs_bsize, fs->e2fs_group_desc_shift); brelse(bp, 0); } vfs_vnode_iterator_init(mp, &marker); while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) { /* * Step 4: invalidate all inactive vnodes. */ if (vrecycle(vp)) continue; /* * Step 5: invalidate all cached file data. */ if (vn_lock(vp, LK_EXCLUSIVE)) { vrele(vp); continue; } if (vinvalbuf(vp, 0, cred, l, 0, 0)) panic("ext2fs_reload: dirty2"); /* * Step 6: re-read inode data for all active vnodes. 
*/ ip = VTOI(vp); error = bread(devvp, EXT2_FSBTODB(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->e2fs_bsize, 0, &bp); if (error) { vput(vp); break; } error = ext2fs_loadvnode_content(fs, ip->i_number, bp, ip); brelse(bp, 0); if (error) { vput(vp); break; } vput(vp); } vfs_vnode_iterator_destroy(marker); return error; } /* * Common code for mount and mountroot */ int ext2fs_mountfs(struct vnode *devvp, struct mount *mp) { struct lwp *l = curlwp; struct ufsmount *ump; struct buf *bp; struct ext2fs *fs; struct m_ext2fs *m_fs; dev_t dev; int error, i, ronly; kauth_cred_t cred; dev = devvp->v_rdev; cred = l->l_cred; /* Flush out any old buffers remaining from a previous use. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0); VOP_UNLOCK(devvp); if (error) return error; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; bp = NULL; ump = NULL; /* Read the superblock from disk, and swap it directly. */ error = bread(devvp, SBLOCK, SBSIZE, 0, &bp); if (error) goto out; fs = (struct ext2fs *)bp->b_data; m_fs = kmem_zalloc(sizeof(*m_fs), KM_SLEEP); e2fs_sbload(fs, &m_fs->e2fs); brelse(bp, 0); bp = NULL; /* Once swapped, validate and fill in the superblock. */ error = ext2fs_sbfill(m_fs, ronly); if (error) { kmem_free(m_fs, sizeof(*m_fs)); goto out; } m_fs->e2fs_ronly = ronly; ump = kmem_zalloc(sizeof(*ump), KM_SLEEP); ump->um_fstype = UFS1; ump->um_ops = &ext2fs_ufsops; ump->um_e2fs = m_fs; if (ronly == 0) { if (m_fs->e2fs.e2fs_state == E2FS_ISCLEAN) m_fs->e2fs.e2fs_state = 0; else m_fs->e2fs.e2fs_state = E2FS_ERRORS; m_fs->e2fs_fmod = 1; } int32_t sh = m_fs->e2fs_bsize >> m_fs->e2fs_group_desc_shift; /* XXX: should be added in ext2fs_sbfill()? */ m_fs->e2fs_gd = kmem_alloc(m_fs->e2fs_ngdb * sh * sizeof(struct ext2_gd), KM_SLEEP); for (i = 0; i < m_fs->e2fs_ngdb; i++) { error = bread(devvp, EXT2_FSBTODB(m_fs, m_fs->e2fs.e2fs_first_dblock + 1 /* superblock */ + i), m_fs->e2fs_bsize, 0, &bp); if (error) goto out1; e2fs_cgload(bp->b_data, &m_fs->e2fs_gd[i * m_fs->e2fs_bsize / sizeof(struct ext2_gd)], m_fs->e2fs_bsize, m_fs->e2fs_group_desc_shift); brelse(bp, 0); bp = NULL; } error = ext2fs_cg_verify_and_initialize(devvp, m_fs, ronly); if (error) goto out1; mp->mnt_data = ump; mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_EXT2FS); mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; mp->mnt_stat.f_namemax = EXT2FS_MAXNAMLEN; mp->mnt_flag |= MNT_LOCAL; mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ mp->mnt_fs_bshift = m_fs->e2fs_bshift; mp->mnt_iflag |= IMNT_DTYPE | IMNT_SHRLOOKUP; ump->um_flags = 0; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = EXT2_NINDIR(m_fs); ump->um_lognindir = ffs(EXT2_NINDIR(m_fs)) - 1; ump->um_bptrtodb = m_fs->e2fs_fsbtodb; ump->um_seqinc = 1; /* no frags */ ump->um_maxsymlinklen = EXT2_MAXSYMLINKLEN; ump->um_dirblksiz = m_fs->e2fs_bsize; ump->um_maxfilesize = ((uint64_t)0x80000000 * m_fs->e2fs_bsize - 1); spec_node_setmountedfs(devvp, mp); return 0; out1: kmem_free(m_fs->e2fs_gd, m_fs->e2fs_ngdb * sh * sizeof(struct ext2_gd)); out: if (bp != NULL) brelse(bp, 0); if (ump) { kmem_free(ump->um_e2fs, sizeof(*m_fs)); kmem_free(ump, sizeof(*ump)); mp->mnt_data = NULL; } return error; } /* * unmount system call */ int ext2fs_unmount(struct mount *mp, int mntflags) { struct ufsmount *ump; struct m_ext2fs *fs; int error, flags; flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if ((error = ext2fs_flushfiles(mp, flags)) != 0) return error; ump = VFSTOUFS(mp); 
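/*
 * If the filesystem was mounted read/write, flush the group descriptors;
 * when that succeeds and no errors have been recorded, mark the superblock
 * clean before the device is closed below.
 */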
fs = ump->um_e2fs; if (fs->e2fs_ronly == 0 && ext2fs_cgupdate(ump, MNT_WAIT) == 0 && (fs->e2fs.e2fs_state & E2FS_ERRORS) == 0) { fs->e2fs.e2fs_state = E2FS_ISCLEAN; (void) ext2fs_sbupdate(ump, MNT_WAIT); } if (ump->um_devvp->v_type != VBAD) spec_node_setmountedfs(ump->um_devvp, NULL); vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_CLOSE(ump->um_devvp, fs->e2fs_ronly ? FREAD : FREAD|FWRITE, NOCRED); vput(ump->um_devvp); int32_t sh = fs->e2fs_bsize >> fs->e2fs_group_desc_shift; kmem_free(fs->e2fs_gd, fs->e2fs_ngdb * sh * sizeof(struct ext2_gd)); kmem_free(fs, sizeof(*fs)); kmem_free(ump, sizeof(*ump)); mp->mnt_data = NULL; mp->mnt_flag &= ~MNT_LOCAL; return error; } /* * Flush out all the files in a filesystem. */ int ext2fs_flushfiles(struct mount *mp, int flags) { extern int doforce; int error; if (!doforce) flags &= ~FORCECLOSE; error = vflush(mp, NULLVP, flags); return error; } /* * Get file system statistics. */ int ext2fs_statvfs(struct mount *mp, struct statvfs *sbp) { struct ufsmount *ump; struct m_ext2fs *fs; uint32_t overhead, overhead_per_group, ngdb; int i, ngroups; ump = VFSTOUFS(mp); fs = ump->um_e2fs; if (fs->e2fs.e2fs_magic != E2FS_MAGIC) panic("ext2fs_statvfs"); /* * Compute the overhead (FS structures) */ overhead_per_group = 1 /* block bitmap */ + 1 /* inode bitmap */ + fs->e2fs_itpg; overhead = fs->e2fs.e2fs_first_dblock + fs->e2fs_ncg * overhead_per_group; if (EXT2F_HAS_COMPAT_FEATURE(fs, EXT2F_COMPAT_SPARSESUPER2)) { /* * Superblock and group descriptions is in group zero, * then optionally 0, 1 or 2 extra copies. */ ngroups = 1 + (fs->e2fs.e4fs_backup_bgs[0] ? 1 : 0) + (fs->e2fs.e4fs_backup_bgs[1] ? 1 : 0); } else if (EXT2F_HAS_ROCOMPAT_FEATURE(fs, EXT2F_ROCOMPAT_SPARSESUPER)) { for (i = 0, ngroups = 0; i < fs->e2fs_ncg; i++) { if (cg_has_sb(i)) ngroups++; } } else { ngroups = fs->e2fs_ncg; } ngdb = fs->e2fs_ngdb; if (EXT2F_HAS_COMPAT_FEATURE(fs, EXT2F_COMPAT_RESIZE)) ngdb += fs->e2fs.e2fs_reserved_ngdb; overhead += ngroups * (1 /* superblock */ + ngdb); sbp->f_bsize = fs->e2fs_bsize; sbp->f_frsize = MINBSIZE << fs->e2fs.e2fs_fsize; sbp->f_iosize = fs->e2fs_bsize; sbp->f_blocks = fs->e2fs.e2fs_bcount - overhead; sbp->f_bfree = fs->e2fs.e2fs_fbcount; sbp->f_bresvd = fs->e2fs.e2fs_rbcount; if (sbp->f_bfree > sbp->f_bresvd) sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd; else sbp->f_bavail = 0; sbp->f_files = fs->e2fs.e2fs_icount; sbp->f_ffree = fs->e2fs.e2fs_ficount; sbp->f_favail = fs->e2fs.e2fs_ficount; sbp->f_fresvd = 0; copy_statvfs_info(sbp, mp); return 0; } static bool ext2fs_sync_selector(void *cl, struct vnode *vp) { struct inode *ip; KASSERT(mutex_owned(vp->v_interlock)); ip = VTOI(vp); /* * Skip the vnode/inode if inaccessible. */ if (ip == NULL || vp->v_type == VNON) return false; if (((ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 && LIST_EMPTY(&vp->v_dirtyblkhd) && (vp->v_iflag & VI_ONWORKLST) == 0)) return false; return true; } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. 
*/ int ext2fs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) { struct vnode *vp; struct ufsmount *ump = VFSTOUFS(mp); struct m_ext2fs *fs; struct vnode_iterator *marker; int error, allerror = 0; fs = ump->um_e2fs; if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) { /* XXX */ printf("fs = %s\n", fs->e2fs_fsmnt); panic("update: rofs mod"); } /* * Write back each (modified) inode. */ vfs_vnode_iterator_init(mp, &marker); while ((vp = vfs_vnode_iterator_next(marker, ext2fs_sync_selector, NULL))) { error = vn_lock(vp, LK_EXCLUSIVE); if (error) { vrele(vp); continue; } if (vp->v_type == VREG && waitfor == MNT_LAZY) error = ext2fs_update(vp, NULL, NULL, 0); else error = VOP_FSYNC(vp, cred, waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0); if (error) allerror = error; vput(vp); } vfs_vnode_iterator_destroy(marker); /* * Force stale file system control information to be flushed. */ if (waitfor != MNT_LAZY) { vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_FSYNC(ump->um_devvp, cred, waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0)) != 0) allerror = error; VOP_UNLOCK(ump->um_devvp); } /* * Write back modified superblock. */ if (fs->e2fs_fmod != 0) { fs->e2fs_fmod = 0; fs->e2fs.e2fs_wtime = time_second; if ((error = ext2fs_cgupdate(ump, waitfor))) allerror = error; } return allerror; } /* * Load inode from disk and initialize vnode. */ static int ext2fs_init_vnode(struct ufsmount *ump, struct vnode *vp, ino_t ino) { struct m_ext2fs *fs; struct inode *ip; struct buf *bp; int error; fs = ump->um_e2fs; /* Read in the disk contents for the inode, copy into the inode. */ error = bread(ump->um_devvp, EXT2_FSBTODB(fs, ino_to_fsba(fs, ino)), (int)fs->e2fs_bsize, 0, &bp); if (error) return error; /* Allocate and initialize inode. */ ip = pool_get(&ext2fs_inode_pool, PR_WAITOK); memset(ip, 0, sizeof(struct inode)); ip->i_vnode = vp; ip->i_ump = ump; ip->i_e2fs = fs; ip->i_dev = ump->um_dev; ip->i_number = ino; ip->i_e2fs_last_lblk = 0; ip->i_e2fs_last_blk = 0; error = ext2fs_loadvnode_content(fs, ino, bp, ip); brelse(bp, 0); if (error) { pool_put(&ext2fs_inode_pool, ip); return error; } /* If the inode was deleted, reset all fields */ if (ip->i_e2fs_dtime != 0) { ip->i_e2fs_mode = 0; (void)ext2fs_setsize(ip, 0); (void)ext2fs_setnblock(ip, 0); memset(ip->i_e2fs_blocks, 0, sizeof(ip->i_e2fs_blocks)); } /* Initialise vnode with this inode. */ vp->v_tag = VT_EXT2FS; vp->v_op = ext2fs_vnodeop_p; vp->v_data = ip; /* Initialize genfs node. */ genfs_node_init(vp, &ext2fs_genfsops); return 0; } /* * Read an inode from disk and initialize this vnode / inode pair. * Caller assures no other thread will try to load this inode. */ int ext2fs_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { ino_t ino; struct inode *ip; struct ufsmount *ump; int error; KASSERT(key_len == sizeof(ino)); memcpy(&ino, key, key_len); ump = VFSTOUFS(mp); error = ext2fs_init_vnode(ump, vp, ino); if (error) return error; ip = VTOI(vp); /* Initialize the vnode from the inode. */ ext2fs_vinit(mp, ext2fs_specop_p, ext2fs_fifoop_p, &vp); /* Finish inode initialization. */ ip->i_devvp = ump->um_devvp; vref(ip->i_devvp); /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. 
*/ if (ip->i_e2fs_gen == 0) { if (++ext2gennumber < (u_long)time_second) ext2gennumber = time_second; ip->i_e2fs_gen = ext2gennumber; if ((mp->mnt_flag & MNT_RDONLY) == 0) ip->i_flag |= IN_MODIFIED; } uvm_vnp_setsize(vp, ext2fs_size(ip)); *new_key = &ip->i_number; return 0; } /* * Create a new inode on disk and initialize this vnode / inode pair. */ int ext2fs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp, struct vattr *vap, kauth_cred_t cred, void *extra, size_t *key_len, const void **new_key) { ino_t ino; struct inode *ip, *pdir; struct m_ext2fs *fs; struct ufsmount *ump; int error, mode; KASSERT(dvp->v_mount == mp); KASSERT(vap->va_type != VNON); *key_len = sizeof(ino); pdir = VTOI(dvp); fs = pdir->i_e2fs; ump = VFSTOUFS(mp); mode = MAKEIMODE(vap->va_type, vap->va_mode); /* Allocate fresh inode. */ error = ext2fs_valloc(dvp, mode, cred, &ino); if (error) return error; /* Attach inode to vnode. */ error = ext2fs_init_vnode(ump, vp, ino); if (error) { ext2fs_vfree(dvp, ino, mode); return error; } ip = VTOI(vp); KASSERT(!E2FS_HAS_GD_CSUM(fs) || (fs->e2fs_gd[ino_to_cg(fs, ino)].ext2bgd_flags & h2fs16(E2FS_BG_INODE_ZEROED)) != 0); /* check for already used inode; makes sense only for ZEROED itable */ if (__predict_false(ip->i_e2fs_mode && ip->i_e2fs_nlink != 0)) { printf("mode = 0%o, nlinks %d, inum = %llu, fs = %s\n", ip->i_e2fs_mode, ip->i_e2fs_nlink, (unsigned long long)ip->i_number, fs->e2fs_fsmnt); panic("ext2fs_valloc: dup alloc"); } memset(ip->i_din.e2fs_din, 0, EXT2_DINODE_SIZE(fs)); /* * Set up a new generation number for this inode. */ if (++ext2gennumber < time_second) ext2gennumber = time_second; ip->i_e2fs_gen = ext2gennumber; ip->i_uid = kauth_cred_geteuid(cred); ip->i_e2fs_uid = ip->i_uid & 0xffff; ip->i_e2fs_gid = pdir->i_e2fs_gid; if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) { ip->i_e2fs_uid_high = (ip->i_uid >> 16) & 0xffff; ip->i_e2fs_gid_high = pdir->i_e2fs_gid_high; } else { ip->i_e2fs_uid_high = 0; ip->i_e2fs_gid_high = 0; } ip->i_gid = ip->i_e2fs_gid | (ip->i_e2fs_gid_high << 16); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_e2fs_mode = mode; vp->v_type = IFTOVT(mode); ip->i_e2fs_nlink = 1; /* Authorize setting SGID if needed. */ if (ip->i_e2fs_mode & ISGID) { error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp, NULL, genfs_can_chmod(vp, cred, ip->i_uid, ip->i_gid, mode)); if (error) ip->i_e2fs_mode &= ~ISGID; } /* Initialize extra_isize according to what is set in superblock */ if (EXT2F_HAS_ROCOMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_EXTRA_ISIZE) && EXT2_DINODE_SIZE(ip->i_e2fs) > EXT2_REV0_DINODE_SIZE) { ip->i_din.e2fs_din->e2di_extra_isize = ip->i_e2fs->e2fs.e4fs_want_extra_isize; } /* Set create time if possible */ if (EXT2_DINODE_FITS(ip->i_din.e2fs_din, e2di_crtime, EXT2_DINODE_SIZE(ip->i_e2fs))) { struct timespec now; vfs_timestamp(&now); EXT2_DINODE_TIME_SET(&now, ip->i_din.e2fs_din, e2di_crtime, EXT2_DINODE_SIZE(ip->i_e2fs)); } /* Initialize the vnode from the inode. */ ext2fs_vinit(mp, ext2fs_specop_p, ext2fs_fifoop_p, &vp); /* Finish inode initialization. 
*/ ip->i_devvp = ump->um_devvp; vref(ip->i_devvp); uvm_vnp_setsize(vp, ext2fs_size(ip)); *new_key = &ip->i_number; return 0; } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ext2fs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) */ int ext2fs_fhtovp(struct mount *mp, struct fid *fhp, int lktype, struct vnode **vpp) { struct inode *ip; struct vnode *nvp; int error; struct ufid ufh; struct m_ext2fs *fs; if (fhp->fid_len != sizeof(struct ufid)) return EINVAL; memcpy(&ufh, fhp, sizeof(struct ufid)); fs = VFSTOUFS(mp)->um_e2fs; if ((ufh.ufid_ino < EXT2_FIRSTINO && ufh.ufid_ino != EXT2_ROOTINO) || ufh.ufid_ino >= fs->e2fs_ncg * fs->e2fs.e2fs_ipg) return ESTALE; if ((error = VFS_VGET(mp, ufh.ufid_ino, lktype, &nvp)) != 0) { *vpp = NULLVP; return error; } ip = VTOI(nvp); if (ip->i_e2fs_mode == 0 || ip->i_e2fs_dtime != 0 || ip->i_e2fs_gen != ufh.ufid_gen) { vput(nvp); *vpp = NULLVP; return ESTALE; } *vpp = nvp; return 0; } /* * Vnode pointer to File handle */ /* ARGSUSED */ int ext2fs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) { struct inode *ip; struct ufid ufh; if (*fh_size < sizeof(struct ufid)) { *fh_size = sizeof(struct ufid); return E2BIG; } *fh_size = sizeof(struct ufid); ip = VTOI(vp); memset(&ufh, 0, sizeof(ufh)); ufh.ufid_len = sizeof(struct ufid); ufh.ufid_ino = ip->i_number; ufh.ufid_gen = ip->i_e2fs_gen; memcpy(fhp, &ufh, sizeof(ufh)); return 0; } /* * Write a superblock and associated information back to disk. */ int ext2fs_sbupdate(struct ufsmount *mp, int waitfor) { struct m_ext2fs *fs = mp->um_e2fs; struct buf *bp; int error = 0; bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0); e2fs_sbsave(&fs->e2fs, (struct ext2fs*)bp->b_data); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); return error; } int ext2fs_cgupdate(struct ufsmount *mp, int waitfor) { struct m_ext2fs *fs = mp->um_e2fs; struct buf *bp; int i, error = 0, allerror = 0; allerror = ext2fs_sbupdate(mp, waitfor); for (i = 0; i < fs->e2fs_ngdb; i++) { bp = getblk(mp->um_devvp, EXT2_FSBTODB(fs, fs->e2fs.e2fs_first_dblock + 1 /* superblock */ + i), fs->e2fs_bsize, 0, 0); e2fs_cgsave(&fs->e2fs_gd[ i * fs->e2fs_bsize / sizeof(struct ext2_gd)], bp->b_data, fs->e2fs_bsize, fs->e2fs_group_desc_shift); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); } if (!allerror && error) allerror = error; return allerror; } /* * Fill in the m_fs structure, and validate the fields of the superblock. * NOTE: here, the superblock is already swapped. 
*/ static int ext2fs_sbfill(struct m_ext2fs *m_fs, int ronly) { uint32_t u32; struct ext2fs *fs = &m_fs->e2fs; /* * General sanity checks */ if (fs->e2fs_magic != E2FS_MAGIC) return EINVAL; if (fs->e2fs_rev > E2FS_REV1) { printf("ext2fs: unsupported revision number: %#x\n", fs->e2fs_rev); return EINVAL; } if (fs->e2fs_log_bsize > 2) { /* block size = 1024|2048|4096 */ printf("ext2fs: bad block size: %d\n", fs->e2fs_log_bsize); return EINVAL; } if (fs->e2fs_bpg == 0) { printf("ext2fs: zero blocks per group\n"); return EINVAL; } if (fs->e2fs_ipg == 0) { printf("ext2fs: zero inodes per group\n"); return EINVAL; } if (fs->e2fs_first_dblock >= fs->e2fs_bcount) { printf("ext2fs: invalid first data block\n"); return EINVAL; } if (fs->e2fs_rbcount > fs->e2fs_bcount || fs->e2fs_fbcount > fs->e2fs_bcount) { printf("ext2fs: invalid block count\n"); return EINVAL; } /* * Compute the fields of the superblock */ u32 = fs->e2fs_bcount - fs->e2fs_first_dblock; /* > 0 */ m_fs->e2fs_ncg = howmany(u32, fs->e2fs_bpg); if (m_fs->e2fs_ncg == 0) { printf("ext2fs: invalid number of cylinder groups\n"); return EINVAL; } m_fs->e2fs_fsbtodb = fs->e2fs_log_bsize + LOG_MINBSIZE - DEV_BSHIFT; m_fs->e2fs_bsize = MINBSIZE << fs->e2fs_log_bsize; m_fs->e2fs_bshift = LOG_MINBSIZE + fs->e2fs_log_bsize; m_fs->e2fs_qbmask = m_fs->e2fs_bsize - 1; m_fs->e2fs_bmask = ~m_fs->e2fs_qbmask; if (!(fs->e2fs_features_incompat & EXT2F_INCOMPAT_64BIT) || (fs->e2fs_rev == E2FS_REV0)) m_fs->e2fs_group_desc_shift = 5; else { for (m_fs->e2fs_group_desc_shift = 0; (1 << m_fs->e2fs_group_desc_shift) < fs->e3fs_desc_size; m_fs->e2fs_group_desc_shift++); } if ((u32 = (m_fs->e2fs_bsize >> m_fs->e2fs_group_desc_shift)) == 0) { /* Unlikely to happen */ printf("ext2fs: invalid block size\n"); return EINVAL; } m_fs->e2fs_ngdb = howmany(m_fs->e2fs_ncg, u32); if (m_fs->e2fs_ngdb == 0) { printf("ext2fs: invalid number of group descriptor blocks\n"); return EINVAL; } if (m_fs->e2fs_bsize < EXT2_DINODE_SIZE(m_fs)) { printf("ext2fs: invalid inode size\n"); return EINVAL; } m_fs->e2fs_ipb = m_fs->e2fs_bsize / EXT2_DINODE_SIZE(m_fs); m_fs->e2fs_itpg = fs->e2fs_ipg / m_fs->e2fs_ipb; /* * Revision-specific checks */ if (fs->e2fs_rev > E2FS_REV0) { char buf[256]; if (fs->e2fs_first_ino != EXT2_FIRSTINO) { printf("ext2fs: unsupported first inode position\n"); return EINVAL; } u32 = fs->e2fs_features_incompat & ~EXT2F_INCOMPAT_SUPP; if (u32) { snprintb(buf, sizeof(buf), EXT2F_INCOMPAT_BITS, u32); printf("ext2fs: unsupported incompat features: %s\n", buf); #ifndef EXT2_IGNORE_INCOMPAT_FEATURES return EINVAL; #endif } u32 = fs->e2fs_features_rocompat & ~EXT2F_ROCOMPAT_SUPP; if (!ronly && u32) { snprintb(buf, sizeof(buf), EXT2F_ROCOMPAT_BITS, u32); printf("ext2fs: unsupported ro-incompat features: %s\n", buf); #ifndef EXT2_IGNORE_ROCOMPAT_FEATURES return EROFS; #endif } if (fs->e2fs_inode_size == 0 || !powerof2(fs->e2fs_inode_size) || fs->e2fs_inode_size > m_fs->e2fs_bsize) { printf("ext2fs: bad inode size\n"); return EINVAL; } } return 0; }
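/*
 * Illustrative sketch (editorial addition, not part of the NetBSD sources):
 * the in-memory superblock fields that ext2fs_sbfill() derives above can be
 * reproduced in user space with the same arithmetic.  The on-disk values
 * below are hypothetical sample data; MINBSIZE is assumed to be 1024 as in
 * the kernel headers, and 32-byte group descriptors are assumed (no 64BIT
 * feature), matching the e2fs_group_desc_shift = 5 case.
 */
#include <inttypes.h>
#include <stdio.h>

#define MINBSIZE	1024		/* smallest ext2 block size (assumed) */

static uint32_t
howmany_u32(uint32_t x, uint32_t y)
{
	return (x + y - 1) / y;		/* round up, as howmany() does */
}

int
main(void)
{
	/* hypothetical rev. 1 superblock values */
	uint32_t log_bsize = 2;		/* 1024 << 2 = 4096-byte blocks */
	uint32_t bcount = 262144;	/* blocks in the filesystem */
	uint32_t first_dblock = 0;	/* 0 for block sizes larger than 1024 */
	uint32_t bpg = 32768;		/* blocks per group */
	uint32_t ipg = 8192;		/* inodes per group */
	uint32_t inode_size = 256;	/* EXT2_DINODE_SIZE for this sample fs */

	uint32_t bsize = MINBSIZE << log_bsize;
	uint32_t ncg = howmany_u32(bcount - first_dblock, bpg);
	uint32_t gd_per_blk = bsize / 32;	/* 32-byte descriptors assumed */
	uint32_t ngdb = howmany_u32(ncg, gd_per_blk);
	uint32_t ipb = bsize / inode_size;
	uint32_t itpg = ipg / ipb;

	printf("bsize %" PRIu32 ", ncg %" PRIu32 ", ngdb %" PRIu32
	    ", ipb %" PRIu32 ", itpg %" PRIu32 "\n",
	    bsize, ncg, ngdb, ipb, itpg);
	return 0;
}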
/* $NetBSD: subr_lockdebug.c,v 1.83 2022/09/02 06:01:38 nakayama Exp $ */ /*- * Copyright (c) 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Basic lock debugging code shared among lock primitives.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_lockdebug.c,v 1.83 2022/09/02 06:01:38 nakayama Exp $"); #ifdef _KERNEL_OPT #include "opt_ddb.h" #endif #include <sys/param.h> #include <sys/proc.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/lockdebug.h> #include <sys/sleepq.h> #include <sys/cpu.h> #include <sys/atomic.h> #include <sys/lock.h> #include <sys/rbtree.h> #include <sys/ksyms.h> #include <sys/kcov.h> #include <machine/lock.h> #ifdef DDB #include <machine/db_machdep.h> #include <ddb/db_interface.h> #include <ddb/db_access.h> #include <ddb/db_sym.h> #endif unsigned int ld_panic; #ifdef LOCKDEBUG #ifdef __ia64__ #define LD_BATCH_SHIFT 16 #else #define LD_BATCH_SHIFT 9 #endif #define LD_BATCH (1 << LD_BATCH_SHIFT) #define LD_BATCH_MASK (LD_BATCH - 1) #define LD_MAX_LOCKS 1048576 #define LD_SLOP 16 #define LD_LOCKED 0x01 #define LD_SLEEPER 0x02 #define LD_WRITE_LOCK 0x80000000 typedef struct lockdebug { struct rb_node ld_rb_node; __cpu_simple_lock_t ld_spinlock; _TAILQ_ENTRY(struct lockdebug, volatile) ld_chain; _TAILQ_ENTRY(struct lockdebug, volatile) ld_achain; volatile void *ld_lock; lockops_t *ld_lockops; struct lwp *ld_lwp; uintptr_t ld_locked; uintptr_t ld_unlocked; uintptr_t ld_initaddr; uint16_t ld_shares; uint16_t ld_cpu; uint8_t ld_flags; uint8_t ld_shwant; /* advisory */ uint8_t ld_exwant; /* advisory */ uint8_t ld_unused; } volatile lockdebug_t; typedef _TAILQ_HEAD(lockdebuglist, struct lockdebug, volatile) lockdebuglist_t; __cpu_simple_lock_t ld_mod_lk; lockdebuglist_t ld_free = TAILQ_HEAD_INITIALIZER(ld_free); #ifdef _KERNEL lockdebuglist_t ld_all = TAILQ_HEAD_INITIALIZER(ld_all); #else extern lockdebuglist_t ld_all; #define cpu_name(a) "?" #define cpu_index(a) -1 #define curlwp NULL #endif /* _KERNEL */ int ld_nfree; int ld_freeptr; int ld_recurse; bool ld_nomore; lockdebug_t ld_prime[LD_BATCH]; #ifdef _KERNEL static void lockdebug_abort1(const char *, size_t, lockdebug_t *, int, const char *, bool); static int lockdebug_more(int); static void lockdebug_init(void); static void lockdebug_dump(lwp_t *, lockdebug_t *, void (*)(const char *, ...) 
__printflike(1, 2)); static signed int ld_rbto_compare_nodes(void *ctx, const void *n1, const void *n2) { const lockdebug_t *ld1 = n1; const lockdebug_t *ld2 = n2; const uintptr_t a = (uintptr_t)ld1->ld_lock; const uintptr_t b = (uintptr_t)ld2->ld_lock; if (a < b) return -1; if (a > b) return 1; return 0; } static signed int ld_rbto_compare_key(void *ctx, const void *n, const void *key) { const lockdebug_t *ld = n; const uintptr_t a = (uintptr_t)ld->ld_lock; const uintptr_t b = (uintptr_t)key; if (a < b) return -1; if (a > b) return 1; return 0; } static rb_tree_t ld_rb_tree; static const rb_tree_ops_t ld_rb_tree_ops = { .rbto_compare_nodes = ld_rbto_compare_nodes, .rbto_compare_key = ld_rbto_compare_key, .rbto_node_offset = offsetof(lockdebug_t, ld_rb_node), .rbto_context = NULL }; static inline lockdebug_t * lockdebug_lookup1(const volatile void *lock) { lockdebug_t *ld; struct cpu_info *ci; ci = curcpu(); __cpu_simple_lock(&ci->ci_data.cpu_ld_lock); ld = rb_tree_find_node(&ld_rb_tree, (void *)(intptr_t)lock); __cpu_simple_unlock(&ci->ci_data.cpu_ld_lock); if (ld == NULL) { return NULL; } __cpu_simple_lock(&ld->ld_spinlock); return ld; } static void lockdebug_lock_cpus(void) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; for (CPU_INFO_FOREACH(cii, ci)) { __cpu_simple_lock(&ci->ci_data.cpu_ld_lock); } } static void lockdebug_unlock_cpus(void) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; for (CPU_INFO_FOREACH(cii, ci)) { __cpu_simple_unlock(&ci->ci_data.cpu_ld_lock); } } /* * lockdebug_lookup: * * Find a lockdebug structure by a pointer to a lock and return it locked. */ static inline lockdebug_t * lockdebug_lookup(const char *func, size_t line, const volatile void *lock, uintptr_t where) { lockdebug_t *ld; kcov_silence_enter(); ld = lockdebug_lookup1(lock); kcov_silence_leave(); if (__predict_false(ld == NULL)) { panic("%s,%zu: uninitialized lock (lock=%p, from=%08" PRIxPTR ")", func, line, lock, where); } return ld; } /* * lockdebug_init: * * Initialize the lockdebug system. Allocate an initial pool of * lockdebug structures before the VM system is up and running. */ static void lockdebug_init(void) { lockdebug_t *ld; int i; TAILQ_INIT(&curcpu()->ci_data.cpu_ld_locks); TAILQ_INIT(&curlwp->l_ld_locks); __cpu_simple_lock_init(&curcpu()->ci_data.cpu_ld_lock); __cpu_simple_lock_init(&ld_mod_lk); rb_tree_init(&ld_rb_tree, &ld_rb_tree_ops); ld = ld_prime; for (i = 1, ld++; i < LD_BATCH; i++, ld++) { __cpu_simple_lock_init(&ld->ld_spinlock); TAILQ_INSERT_TAIL(&ld_free, ld, ld_chain); TAILQ_INSERT_TAIL(&ld_all, ld, ld_achain); } ld_freeptr = 1; ld_nfree = LD_BATCH - 1; } /* * lockdebug_alloc: * * A lock is being initialized, so allocate an associated debug * structure. */ bool lockdebug_alloc(const char *func, size_t line, volatile void *lock, lockops_t *lo, uintptr_t initaddr) { struct cpu_info *ci; lockdebug_t *ld; int s; if (__predict_false(lo == NULL || panicstr != NULL || ld_panic)) return false; if (__predict_false(ld_freeptr == 0)) lockdebug_init(); s = splhigh(); __cpu_simple_lock(&ld_mod_lk); if (__predict_false((ld = lockdebug_lookup1(lock)) != NULL)) { __cpu_simple_unlock(&ld_mod_lk); lockdebug_abort1(func, line, ld, s, "already initialized", true); return false; } /* * Pinch a new debug structure. We may recurse because we call * kmem_alloc(), which may need to initialize new locks somewhere * down the path. If not recursing, we try to maintain at least * LD_SLOP structures free, which should hopefully be enough to * satisfy kmem_alloc(). 
If we can't provide a structure, not to * worry: we'll just mark the lock as not having an ID. */ ci = curcpu(); ci->ci_lkdebug_recurse++; if (TAILQ_EMPTY(&ld_free)) { if (ci->ci_lkdebug_recurse > 1 || ld_nomore) { ci->ci_lkdebug_recurse--; __cpu_simple_unlock(&ld_mod_lk); splx(s); return false; } s = lockdebug_more(s); } else if (ci->ci_lkdebug_recurse == 1 && ld_nfree < LD_SLOP) { s = lockdebug_more(s); } if (__predict_false((ld = TAILQ_FIRST(&ld_free)) == NULL)) { __cpu_simple_unlock(&ld_mod_lk); splx(s); return false; } TAILQ_REMOVE(&ld_free, ld, ld_chain); ld_nfree--; ci->ci_lkdebug_recurse--; if (__predict_false(ld->ld_lock != NULL)) { panic("%s,%zu: corrupt table ld %p", func, line, ld); } /* Initialise the structure. */ ld->ld_lock = lock; ld->ld_lockops = lo; ld->ld_locked = 0; ld->ld_unlocked = 0; ld->ld_lwp = NULL; ld->ld_initaddr = initaddr; ld->ld_flags = (lo->lo_type == LOCKOPS_SLEEP ? LD_SLEEPER : 0); lockdebug_lock_cpus(); (void)rb_tree_insert_node(&ld_rb_tree, __UNVOLATILE(ld)); lockdebug_unlock_cpus(); __cpu_simple_unlock(&ld_mod_lk); splx(s); return true; } /* * lockdebug_free: * * A lock is being destroyed, so release debugging resources. */ void lockdebug_free(const char *func, size_t line, volatile void *lock) { lockdebug_t *ld; int s; if (__predict_false(panicstr != NULL || ld_panic)) return; s = splhigh(); __cpu_simple_lock(&ld_mod_lk); ld = lockdebug_lookup(func, line, lock, (uintptr_t) __builtin_return_address(0)); if (__predict_false(ld == NULL)) { __cpu_simple_unlock(&ld_mod_lk); panic("%s,%zu: destroying uninitialized object %p" "(ld_lock=%p)", func, line, lock, ld->ld_lock); return; } if (__predict_false((ld->ld_flags & LD_LOCKED) != 0 || ld->ld_shares != 0)) { __cpu_simple_unlock(&ld_mod_lk); lockdebug_abort1(func, line, ld, s, "is locked or in use", true); return; } lockdebug_lock_cpus(); rb_tree_remove_node(&ld_rb_tree, __UNVOLATILE(ld)); lockdebug_unlock_cpus(); ld->ld_lock = NULL; TAILQ_INSERT_TAIL(&ld_free, ld, ld_chain); ld_nfree++; __cpu_simple_unlock(&ld->ld_spinlock); __cpu_simple_unlock(&ld_mod_lk); splx(s); } /* * lockdebug_more: * * Allocate a batch of debug structures and add to the free list. * Must be called with ld_mod_lk held. */ static int lockdebug_more(int s) { lockdebug_t *ld; void *block; int i, base, m; /* * Can't call kmem_alloc() if in interrupt context. XXX We could * deadlock, because we don't know which locks the caller holds. */ if (cpu_intr_p() || cpu_softintr_p()) { return s; } while (ld_nfree < LD_SLOP) { __cpu_simple_unlock(&ld_mod_lk); splx(s); block = kmem_zalloc(LD_BATCH * sizeof(lockdebug_t), KM_SLEEP); s = splhigh(); __cpu_simple_lock(&ld_mod_lk); if (ld_nfree > LD_SLOP) { /* Somebody beat us to it. */ __cpu_simple_unlock(&ld_mod_lk); splx(s); kmem_free(block, LD_BATCH * sizeof(lockdebug_t)); s = splhigh(); __cpu_simple_lock(&ld_mod_lk); continue; } base = ld_freeptr; ld_nfree += LD_BATCH; ld = block; base <<= LD_BATCH_SHIFT; m = uimin(LD_MAX_LOCKS, base + LD_BATCH); if (m == LD_MAX_LOCKS) ld_nomore = true; for (i = base; i < m; i++, ld++) { __cpu_simple_lock_init(&ld->ld_spinlock); TAILQ_INSERT_TAIL(&ld_free, ld, ld_chain); TAILQ_INSERT_TAIL(&ld_all, ld, ld_achain); } membar_producer(); } return s; } /* * lockdebug_wantlock: * * Process the preamble to a lock acquire. The "shared" * parameter controls which ld_{ex,sh}want counter is * updated; a negative value of shared updates neither. 
*/ void lockdebug_wantlock(const char *func, size_t line, const volatile void *lock, uintptr_t where, int shared) { struct lwp *l = curlwp; lockdebug_t *ld; bool recurse; int s; (void)shared; recurse = false; if (__predict_false(panicstr != NULL || ld_panic)) return; s = splhigh(); if ((ld = lockdebug_lookup(func, line, lock, where)) == NULL) { splx(s); return; } if ((ld->ld_flags & LD_LOCKED) != 0 || ld->ld_shares != 0) { if ((ld->ld_flags & LD_SLEEPER) != 0) { if (ld->ld_lwp == l) recurse = true; } else if (ld->ld_cpu == (uint16_t)cpu_index(curcpu())) recurse = true; } if (cpu_intr_p()) { if (__predict_false((ld->ld_flags & LD_SLEEPER) != 0)) { lockdebug_abort1(func, line, ld, s, "acquiring sleep lock from interrupt context", true); return; } } if (shared > 0) ld->ld_shwant++; else if (shared == 0) ld->ld_exwant++; if (__predict_false(recurse)) { lockdebug_abort1(func, line, ld, s, "locking against myself", true); return; } if (l->l_ld_wanted == NULL) { l->l_ld_wanted = ld; } __cpu_simple_unlock(&ld->ld_spinlock); splx(s); } /* * lockdebug_locked: * * Process a lock acquire operation. */ void lockdebug_locked(const char *func, size_t line, volatile void *lock, void *cvlock, uintptr_t where, int shared) { struct lwp *l = curlwp; lockdebug_t *ld; int s; if (__predict_false(panicstr != NULL || ld_panic)) return; s = splhigh(); if ((ld = lockdebug_lookup(func, line, lock, where)) == NULL) { splx(s); return; } if (shared) { l->l_shlocks++; ld->ld_locked = where; ld->ld_shares++; ld->ld_shwant--; } else { if (__predict_false((ld->ld_flags & LD_LOCKED) != 0)) { lockdebug_abort1(func, line, ld, s, "already locked", true); return; } ld->ld_flags |= LD_LOCKED; ld->ld_locked = where; ld->ld_exwant--; if ((ld->ld_flags & LD_SLEEPER) != 0) { TAILQ_INSERT_TAIL(&l->l_ld_locks, ld, ld_chain); } else { TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_ld_locks, ld, ld_chain); } } ld->ld_cpu = (uint16_t)cpu_index(curcpu()); ld->ld_lwp = l; __cpu_simple_unlock(&ld->ld_spinlock); if (l->l_ld_wanted == ld) { l->l_ld_wanted = NULL; } splx(s); } /* * lockdebug_unlocked: * * Process a lock release operation. 
*/ void lockdebug_unlocked(const char *func, size_t line, volatile void *lock, uintptr_t where, int shared) { struct lwp *l = curlwp; lockdebug_t *ld; int s; if (__predict_false(panicstr != NULL || ld_panic)) return; s = splhigh(); if ((ld = lockdebug_lookup(func, line, lock, where)) == NULL) { splx(s); return; } if (shared) { if (__predict_false(l->l_shlocks == 0)) { lockdebug_abort1(func, line, ld, s, "no shared locks held by LWP", true); return; } if (__predict_false(ld->ld_shares == 0)) { lockdebug_abort1(func, line, ld, s, "no shared holds on this lock", true); return; } l->l_shlocks--; ld->ld_shares--; if (ld->ld_lwp == l) { ld->ld_unlocked = where; ld->ld_lwp = NULL; } if (ld->ld_cpu == (uint16_t)cpu_index(curcpu())) ld->ld_cpu = (uint16_t)-1; } else { if (__predict_false((ld->ld_flags & LD_LOCKED) == 0)) { lockdebug_abort1(func, line, ld, s, "not locked", true); return; } if ((ld->ld_flags & LD_SLEEPER) != 0) { if (__predict_false(ld->ld_lwp != curlwp)) { lockdebug_abort1(func, line, ld, s, "not held by current LWP", true); return; } TAILQ_REMOVE(&l->l_ld_locks, ld, ld_chain); } else { uint16_t idx = (uint16_t)cpu_index(curcpu()); if (__predict_false(ld->ld_cpu != idx)) { lockdebug_abort1(func, line, ld, s, "not held by current CPU", true); return; } TAILQ_REMOVE(&curcpu()->ci_data.cpu_ld_locks, ld, ld_chain); } ld->ld_flags &= ~LD_LOCKED; ld->ld_unlocked = where; ld->ld_lwp = NULL; } __cpu_simple_unlock(&ld->ld_spinlock); splx(s); } /* * lockdebug_barrier: * * Panic if we hold more than one specified lock, and optionally, if we * hold any sleep locks. */ void lockdebug_barrier(const char *func, size_t line, volatile void *onelock, int slplocks) { struct lwp *l = curlwp; lockdebug_t *ld; int s; if (__predict_false(panicstr != NULL || ld_panic)) return; s = splhigh(); if ((l->l_pflag & LP_INTR) == 0) { TAILQ_FOREACH(ld, &curcpu()->ci_data.cpu_ld_locks, ld_chain) { if (ld->ld_lock == onelock) { continue; } __cpu_simple_lock(&ld->ld_spinlock); lockdebug_abort1(func, line, ld, s, "spin lock held", true); return; } } if (slplocks) { splx(s); return; } ld = TAILQ_FIRST(&l->l_ld_locks); if (__predict_false(ld != NULL && ld->ld_lock != onelock)) { __cpu_simple_lock(&ld->ld_spinlock); lockdebug_abort1(func, line, ld, s, "sleep lock held", true); return; } splx(s); if (l->l_shlocks != 0) { TAILQ_FOREACH(ld, &ld_all, ld_achain) { if (ld->ld_lock == onelock) { continue; } if (ld->ld_lwp == l) lockdebug_dump(l, ld, printf); } panic("%s,%zu: holding %d shared locks", func, line, l->l_shlocks); } } /* * lockdebug_mem_check: * * Check for in-use locks within a memory region that is * being freed. 
*/ void lockdebug_mem_check(const char *func, size_t line, void *base, size_t sz) { lockdebug_t *ld; struct cpu_info *ci; int s; if (__predict_false(panicstr != NULL || ld_panic)) return; kcov_silence_enter(); s = splhigh(); ci = curcpu(); __cpu_simple_lock(&ci->ci_data.cpu_ld_lock); ld = (lockdebug_t *)rb_tree_find_node_geq(&ld_rb_tree, base); if (ld != NULL) { const uintptr_t lock = (uintptr_t)ld->ld_lock; if (__predict_false((uintptr_t)base > lock)) panic("%s,%zu: corrupt tree ld=%p, base=%p, sz=%zu", func, line, ld, base, sz); if (lock >= (uintptr_t)base + sz) ld = NULL; } __cpu_simple_unlock(&ci->ci_data.cpu_ld_lock); if (__predict_false(ld != NULL)) { __cpu_simple_lock(&ld->ld_spinlock); lockdebug_abort1(func, line, ld, s, "allocation contains active lock", !cold); kcov_silence_leave(); return; } splx(s); kcov_silence_leave(); } #endif /* _KERNEL */ /* * lockdebug_dump: * * Dump information about a lock on panic, or for DDB. */ static void lockdebug_dump(lwp_t *l, lockdebug_t *ld, void (*pr)(const char *, ...) __printflike(1, 2)) { int sleeper = (ld->ld_flags & LD_SLEEPER); lockops_t *lo = ld->ld_lockops; char locksym[128], initsym[128], lockedsym[128], unlockedsym[128]; #ifdef DDB db_symstr(locksym, sizeof(locksym), (db_expr_t)(intptr_t)ld->ld_lock, DB_STGY_ANY); db_symstr(initsym, sizeof(initsym), (db_expr_t)ld->ld_initaddr, DB_STGY_PROC); db_symstr(lockedsym, sizeof(lockedsym), (db_expr_t)ld->ld_locked, DB_STGY_PROC); db_symstr(unlockedsym, sizeof(unlockedsym), (db_expr_t)ld->ld_unlocked, DB_STGY_PROC); #else snprintf(locksym, sizeof(locksym), "%#018lx", (unsigned long)ld->ld_lock); snprintf(initsym, sizeof(initsym), "%#018lx", (unsigned long)ld->ld_initaddr); snprintf(lockedsym, sizeof(lockedsym), "%#018lx", (unsigned long)ld->ld_locked); snprintf(unlockedsym, sizeof(unlockedsym), "%#018lx", (unsigned long)ld->ld_unlocked); #endif (*pr)( "lock address : %s\n" "type : %s\n" "initialized : %s", locksym, (sleeper ? "sleep/adaptive" : "spin"), initsym); #ifndef _KERNEL lockops_t los; lo = &los; db_read_bytes((db_addr_t)ld->ld_lockops, sizeof(los), (char *)lo); #endif (*pr)("\n" "shared holds : %18u exclusive: %18u\n" "shares wanted: %18u exclusive: %18u\n" "relevant cpu : %18u last held: %18u\n" "relevant lwp : %#018lx last held: %#018lx\n" "last locked%c : %s\n" "unlocked%c : %s\n", (unsigned)ld->ld_shares, ((ld->ld_flags & LD_LOCKED) != 0), (unsigned)ld->ld_shwant, (unsigned)ld->ld_exwant, (unsigned)cpu_index(l->l_cpu), (unsigned)ld->ld_cpu, (long)l, (long)ld->ld_lwp, ((ld->ld_flags & LD_LOCKED) ? '*' : ' '), lockedsym, ((ld->ld_flags & LD_LOCKED) ? ' ' : '*'), unlockedsym); #ifdef _KERNEL if (lo->lo_dump != NULL) (*lo->lo_dump)(ld->ld_lock, pr); if (sleeper) { turnstile_print(ld->ld_lock, pr); } #endif } #ifdef _KERNEL /* * lockdebug_abort1: * * An error has been trapped - dump lock info and panic. */ static void lockdebug_abort1(const char *func, size_t line, lockdebug_t *ld, int s, const char *msg, bool dopanic) { /* * Don't make the situation worse if the system is already going * down in flames. Once a panic is triggered, lockdebug state * becomes stale and cannot be trusted. 
*/ if (atomic_inc_uint_nv(&ld_panic) != 1) { __cpu_simple_unlock(&ld->ld_spinlock); splx(s); return; } printf("%s error: %s,%zu: %s\n\n", ld->ld_lockops->lo_name, func, line, msg); lockdebug_dump(curlwp, ld, printf); __cpu_simple_unlock(&ld->ld_spinlock); splx(s); printf("\n"); if (dopanic) panic("LOCKDEBUG: %s error: %s,%zu: %s", ld->ld_lockops->lo_name, func, line, msg); } #endif /* _KERNEL */ #endif /* LOCKDEBUG */ /* * lockdebug_lock_print: * * Handle the DDB 'show lock' command. */ #ifdef DDB void lockdebug_lock_print(void *addr, void (*pr)(const char *, ...) __printflike(1, 2)) { #ifdef LOCKDEBUG lockdebug_t *ld, lds; TAILQ_FOREACH(ld, &ld_all, ld_achain) { db_read_bytes((db_addr_t)ld, sizeof(lds), __UNVOLATILE(&lds)); ld = &lds; if (ld->ld_lock == NULL) continue; if (addr == NULL || ld->ld_lock == addr) { lockdebug_dump(curlwp, ld, pr); if (addr != NULL) return; } } if (addr != NULL) { (*pr)("Sorry, no record of a lock with address %p found.\n", addr); } #else char sym[128]; uintptr_t word; (*pr)("WARNING: lock print is unreliable without LOCKDEBUG\n"); db_symstr(sym, sizeof(sym), (db_expr_t)(intptr_t)addr, DB_STGY_ANY); db_read_bytes((db_addr_t)addr, sizeof(word), (char *)&word); (*pr)("%s: possible owner: %p, bits: 0x%" PRIxPTR "\n", sym, (void *)(word & ~(uintptr_t)ALIGNBYTES), word & ALIGNBYTES); #endif /* LOCKDEBUG */ } #ifdef _KERNEL #ifdef LOCKDEBUG static void lockdebug_show_one(lwp_t *l, lockdebug_t *ld, int i, void (*pr)(const char *, ...) __printflike(1, 2)) { char sym[128]; #ifdef DDB db_symstr(sym, sizeof(sym), (db_expr_t)ld->ld_initaddr, DB_STGY_PROC); #else snprintf(sym, sizeof(sym), "%p", (void *)ld->ld_initaddr); #endif (*pr)("* Lock %d (initialized at %s)\n", i++, sym); lockdebug_dump(l, ld, pr); } static void lockdebug_show_trace(const void *ptr, void (*pr)(const char *, ...) __printflike(1, 2)) { db_stack_trace_print((db_expr_t)(intptr_t)ptr, true, 32, "a", pr); } static void lockdebug_show_all_locks_lwp(void (*pr)(const char *, ...) __printflike(1, 2), bool show_trace) { struct proc *p; LIST_FOREACH(p, &allproc, p_list) { struct lwp *l; LIST_FOREACH(l, &p->p_lwps, l_sibling) { lockdebug_t *ld; int i = 0; if (TAILQ_EMPTY(&l->l_ld_locks) && l->l_ld_wanted == NULL) { continue; } (*pr)("\n****** LWP %d.%d (%s) @ %p, l_stat=%d\n", p->p_pid, l->l_lid, l->l_name ? l->l_name : p->p_comm, l, l->l_stat); if (!TAILQ_EMPTY(&l->l_ld_locks)) { (*pr)("\n*** Locks held: \n"); TAILQ_FOREACH(ld, &l->l_ld_locks, ld_chain) { (*pr)("\n"); lockdebug_show_one(l, ld, i++, pr); } } else { (*pr)("\n*** Locks held: none\n"); } if (l->l_ld_wanted != NULL) { (*pr)("\n*** Locks wanted: \n\n"); lockdebug_show_one(l, l->l_ld_wanted, 0, pr); } else { (*pr)("\n*** Locks wanted: none\n"); } if (show_trace) { (*pr)("\n*** Traceback: \n\n"); lockdebug_show_trace(l, pr); (*pr)("\n"); } } } } static void lockdebug_show_all_locks_cpu(void (*pr)(const char *, ...) 
__printflike(1, 2), bool show_trace) { lockdebug_t *ld; CPU_INFO_ITERATOR cii; struct cpu_info *ci; for (CPU_INFO_FOREACH(cii, ci)) { int i = 0; if (TAILQ_EMPTY(&ci->ci_data.cpu_ld_locks)) continue; (*pr)("\n******* Locks held on %s:\n", cpu_name(ci)); TAILQ_FOREACH(ld, &ci->ci_data.cpu_ld_locks, ld_chain) { (*pr)("\n"); #ifdef MULTIPROCESSOR lockdebug_show_one(ci->ci_curlwp, ld, i++, pr); if (show_trace) lockdebug_show_trace(ci->ci_curlwp, pr); #else lockdebug_show_one(curlwp, ld, i++, pr); if (show_trace) lockdebug_show_trace(curlwp, pr); #endif } } } #endif /* _KERNEL */ #endif /* LOCKDEBUG */ #ifdef _KERNEL void lockdebug_show_all_locks(void (*pr)(const char *, ...) __printflike(1, 2), const char *modif) { #ifdef LOCKDEBUG bool show_trace = false; if (modif[0] == 't') show_trace = true; (*pr)("[Locks tracked through LWPs]\n"); lockdebug_show_all_locks_lwp(pr, show_trace); (*pr)("\n"); (*pr)("[Locks tracked through CPUs]\n"); lockdebug_show_all_locks_cpu(pr, show_trace); (*pr)("\n"); #else (*pr)("Sorry, kernel not built with the LOCKDEBUG option.\n"); #endif /* LOCKDEBUG */ } void lockdebug_show_lockstats(void (*pr)(const char *, ...) __printflike(1, 2)) { #ifdef LOCKDEBUG lockdebug_t *ld; void *_ld; uint32_t n_null = 0; uint32_t n_spin_mutex = 0; uint32_t n_adaptive_mutex = 0; uint32_t n_rwlock = 0; uint32_t n_others = 0; RB_TREE_FOREACH(_ld, &ld_rb_tree) { ld = _ld; if (ld->ld_lock == NULL) { n_null++; continue; } if (ld->ld_lockops->lo_name[0] == 'M') { if (ld->ld_lockops->lo_type == LOCKOPS_SLEEP) n_adaptive_mutex++; else n_spin_mutex++; continue; } if (ld->ld_lockops->lo_name[0] == 'R') { n_rwlock++; continue; } n_others++; } (*pr)( "spin mutex: %u\n" "adaptive mutex: %u\n" "rwlock: %u\n" "null locks: %u\n" "others: %u\n", n_spin_mutex, n_adaptive_mutex, n_rwlock, n_null, n_others); #else (*pr)("Sorry, kernel not built with the LOCKDEBUG option.\n"); #endif /* LOCKDEBUG */ } #endif /* _KERNEL */ #endif /* DDB */ #ifdef _KERNEL /* * lockdebug_dismiss: * * The system is rebooting, and potentially from an unsafe * place so avoid any future aborts. */ void lockdebug_dismiss(void) { atomic_inc_uint_nv(&ld_panic); } /* * lockdebug_abort: * * An error has been trapped - dump lock info and call panic(). */ void lockdebug_abort(const char *func, size_t line, const volatile void *lock, lockops_t *ops, const char *msg) { #ifdef LOCKDEBUG lockdebug_t *ld; int s; s = splhigh(); if ((ld = lockdebug_lookup(func, line, lock, (uintptr_t) __builtin_return_address(0))) != NULL) { lockdebug_abort1(func, line, ld, s, msg, true); return; } splx(s); #endif /* LOCKDEBUG */ /* * Don't make the situation worse if the system is already going * down in flames. Once a panic is triggered, lockdebug state * becomes stale and cannot be trusted. */ if (atomic_inc_uint_nv(&ld_panic) > 1) return; char locksym[128]; #ifdef DDB db_symstr(locksym, sizeof(locksym), (db_expr_t)(intptr_t)lock, DB_STGY_ANY); #else snprintf(locksym, sizeof(locksym), "%#018lx", (unsigned long)lock); #endif printf("%s error: %s,%zu: %s\n\n" "lock address : %s\n" "current cpu : %18d\n" "current lwp : %#018lx\n", ops->lo_name, func, line, msg, locksym, (int)cpu_index(curcpu()), (long)curlwp); (*ops->lo_dump)(lock, printf); printf("\n"); panic("lock error: %s: %s,%zu: %s: lock %p cpu %d lwp %p", ops->lo_name, func, line, msg, lock, cpu_index(curcpu()), curlwp); } #endif /* _KERNEL */
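/*
 * Illustrative sketch (editorial addition, not part of the NetBSD sources):
 * the check made by lockdebug_mem_check() above boils down to "is there a
 * tracked lock whose address falls inside [base, base + sz)?".  The kernel
 * answers this with rb_tree_find_node_geq() on a tree keyed by lock address;
 * the stand-in below uses a sorted array and a linear scan purely to show
 * the interval test.  All names and addresses here are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Sorted addresses of "initialized locks", lowest first (sample data). */
static const uintptr_t tracked_locks[] = { 0x1000, 0x2040, 0x90c0 };

static bool
region_contains_lock(uintptr_t base, size_t sz)
{
	size_t i;

	/* Find the lowest tracked address >= base (the "geq" lookup). */
	for (i = 0; i < sizeof(tracked_locks) / sizeof(tracked_locks[0]); i++) {
		if (tracked_locks[i] >= base)
			return tracked_locks[i] < base + sz;
	}
	return false;
}

int
main(void)
{
	/* Freeing 0x2000..0x2fff would strand the lock at 0x2040. */
	printf("0x2000+0x1000: %d\n", region_contains_lock(0x2000, 0x1000));
	/* Freeing 0x3000..0x3fff contains no tracked lock. */
	printf("0x3000+0x1000: %d\n", region_contains_lock(0x3000, 0x1000));
	return 0;
}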
/* $NetBSD: vfs_syscalls_43.c,v 1.68 2021/09/07 11:43:02 riastradh Exp $ */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.28 (Berkeley) 12/10/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_43.c,v 1.68 2021/09/07 11:43:02 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/dirent.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/stat.h> #include <sys/malloc.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/syslog.h> #include <sys/unistd.h> #include <sys/resourcevar.h> #include <sys/mount.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <sys/vfs_syscalls.h> #include <compat/sys/stat.h> #include <compat/sys/mount.h> #include <compat/sys/dirent.h> #include <compat/common/compat_util.h> #include <compat/common/compat_mod.h> static struct syscall_package vfs_syscalls_43_syscalls[] = { { SYS_compat_43_oquota, 0, (sy_call_t *)compat_43_sys_quota }, { SYS_compat_43_stat43, 0, (sy_call_t *)compat_43_sys_stat }, { SYS_compat_43_lstat43, 0, (sy_call_t *)compat_43_sys_lstat }, { SYS_compat_43_fstat43, 0, (sy_call_t *)compat_43_sys_fstat }, { SYS_compat_43_otruncate, 0, (sy_call_t *)compat_43_sys_ftruncate }, { SYS_compat_43_oftruncate, 0, (sy_call_t *)compat_43_sys_ftruncate }, { SYS_compat_43_olseek, 0, (sy_call_t *)compat_43_sys_lseek }, { SYS_compat_43_ocreat, 0, (sy_call_t *)compat_43_sys_creat }, { SYS_compat_43_ogetdirentries, 0, (sy_call_t *)compat_43_sys_getdirentries }, { 0, 0, NULL } }; /* * Convert from an old to a new timespec structure. */ static void cvttimespec(struct timespec50 *ots, const struct timespec *ts) { if (ts->tv_sec > INT_MAX) { #if defined(DEBUG) || 1 static bool first = true; if (first) { first = false; printf("%s[%s:%d]: time_t does not fit\n", __func__, curlwp->l_proc->p_comm, curlwp->l_lid); } #endif ots->tv_sec = INT_MAX; } else ots->tv_sec = ts->tv_sec; ots->tv_nsec = ts->tv_nsec; } /* * Convert from an old to a new stat structure. */ static void cvtstat(struct stat43 *ost, const struct stat *st) { /* Handle any padding. */ memset(ost, 0, sizeof(*ost)); ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode & 0xffff; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; cvttimespec(&ost->st_atimespec, &st->st_atimespec); cvttimespec(&ost->st_mtimespec, &st->st_mtimespec); cvttimespec(&ost->st_ctimespec, &st->st_ctimespec); ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } /* * Get file status; this version follows links. 
*/ /* ARGSUSED */ int compat_43_sys_stat(struct lwp *l, const struct compat_43_sys_stat_args *uap, register_t *retval) { /* { syscallarg(char *) path; syscallarg(struct stat43 *) ub; } */ struct stat sb; struct stat43 osb; int error; error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb); if (error) return error; cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap, ub), sizeof(osb)); } /* * Get file status; this version does not follow links. */ /* ARGSUSED */ int compat_43_sys_lstat(struct lwp *l, const struct compat_43_sys_lstat_args *uap, register_t *retval) { /* { syscallarg(char *) path; syscallarg(struct stat43 *) ub; } */ struct stat sb; struct stat43 osb; int error; error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb); if (error) return error; /* * For symbolic links, BSD4.3 returned the attributes of its * containing directory, except for mode, size, and links. * This is no longer emulated, the parent directory is not consulted. */ cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap, ub), sizeof(osb)); } /* * Return status information about a file descriptor. */ /* ARGSUSED */ int compat_43_sys_fstat(struct lwp *l, const struct compat_43_sys_fstat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(struct stat43 *) sb; } */ struct stat sb; struct stat43 osb; int error; error = do_sys_fstat(SCARG(uap, fd), &sb); if (error) return error; cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap, sb), sizeof(osb)); } /* * Truncate a file given a file descriptor. */ /* ARGSUSED */ int compat_43_sys_ftruncate(struct lwp *l, const struct compat_43_sys_ftruncate_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(long) length; } */ struct sys_ftruncate_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) length; } */ nuap; SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, length) = SCARG(uap, length); return sys_ftruncate(l, &nuap, retval); } /* * Truncate a file given its path name. */ /* ARGSUSED */ int compat_43_sys_truncate(struct lwp *l, const struct compat_43_sys_truncate_args *uap, register_t *retval) { /* { syscallarg(char *) path; syscallarg(long) length; } */ struct sys_truncate_args /* { syscallarg(char *) path; syscallarg(int) pad; syscallarg(off_t) length; } */ nuap; SCARG(&nuap, path) = SCARG(uap, path); SCARG(&nuap, length) = SCARG(uap, length); return (sys_truncate(l, &nuap, retval)); } /* * Reposition read/write file offset. */ int compat_43_sys_lseek(struct lwp *l, const struct compat_43_sys_lseek_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(long) offset; syscallarg(int) whence; } */ struct sys_lseek_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) offset; syscallarg(int) whence; } */ nuap; off_t qret; int error; SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, offset) = SCARG(uap, offset); SCARG(&nuap, whence) = SCARG(uap, whence); error = sys_lseek(l, &nuap, (register_t *)&qret); *(long *)retval = qret; return (error); } /* * Create a file. 
*/ int compat_43_sys_creat(struct lwp *l, const struct compat_43_sys_creat_args *uap, register_t *retval) { /* { syscallarg(char *) path; syscallarg(int) mode; } */ struct sys_open_args /* { syscallarg(char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ nuap; SCARG(&nuap, path) = SCARG(uap, path); SCARG(&nuap, mode) = SCARG(uap, mode); SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; return (sys_open(l, &nuap, retval)); } /*ARGSUSED*/ int compat_43_sys_quota(struct lwp *l, const void *v, register_t *retval) { return (ENOSYS); } /* * Read a block of directory entries in a file system independent format. */ int compat_43_sys_getdirentries(struct lwp *l, const struct compat_43_sys_getdirentries_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; syscallarg(long *) basep; } */ struct dirent *bdp; struct vnode *vp; void *tbuf; /* Current-format */ char *inp; /* Current-format */ int len, reclen; /* Current-format */ char *outp; /* Dirent12-format */ int resid, old_reclen = 0; /* Dirent12-format */ struct file *fp; struct uio auio; struct iovec aiov; struct dirent43 idb; off_t off; /* true file offset */ int buflen, error, eofflag, nbytes; struct vattr va; off_t *cookiebuf = NULL, *cookie; int ncookies; long loff; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { error = EBADF; goto out1; } vp = fp->f_vnode; if (vp->v_type != VDIR) { error = ENOTDIR; goto out1; } vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, l->l_cred); VOP_UNLOCK(vp); if (error) goto out1; loff = fp->f_offset; nbytes = SCARG(uap, count); buflen = uimin(MAXBSIZE, nbytes); if (buflen < va.va_blocksize) buflen = va.va_blocksize; tbuf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); off = fp->f_offset; again: aiov.iov_base = tbuf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_resid = buflen; auio.uio_offset = off; UIO_SETUP_SYSSPACE(&auio); /* * First we read into the malloc'ed buffer, then * we massage it into user space, one record at a time. */ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &cookiebuf, &ncookies); if (error) goto out; inp = (char *)tbuf; outp = SCARG(uap, buf); resid = nbytes; if ((len = buflen - auio.uio_resid) == 0) goto eof; for (cookie = cookiebuf; len > 0; len -= reclen) { bdp = (struct dirent *)inp; reclen = bdp->d_reclen; if (reclen & 3) { error = EIO; goto out; } if (bdp->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ if (cookie) off = *cookie++; else off += reclen; continue; } memset(&idb, 0, sizeof(idb)); if (bdp->d_namlen >= sizeof(idb.d_name)) idb.d_namlen = sizeof(idb.d_name) - 1; else idb.d_namlen = bdp->d_namlen; old_reclen = _DIRENT_RECLEN(&idb, bdp->d_namlen); if (reclen > len || resid < old_reclen) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make a Dirent12-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). 
*/ idb.d_fileno = (uint32_t)bdp->d_fileno; idb.d_reclen = (uint16_t)old_reclen; idb.d_fileno = (uint32_t)bdp->d_fileno; (void)memcpy(idb.d_name, bdp->d_name, idb.d_namlen); memset(idb.d_name + idb.d_namlen, 0, idb.d_reclen - _DIRENT_NAMEOFF(&idb) - idb.d_namlen); if ((error = copyout(&idb, outp, old_reclen))) goto out; /* advance past this real entry */ inp += reclen; if (cookie) off = *cookie++; /* each entry points to itself */ else off += reclen; /* advance output past Dirent12-shaped entry */ outp += old_reclen; resid -= old_reclen; } /* if we squished out the whole block, try again */ if (outp == SCARG(uap, buf)) { if (cookiebuf) free(cookiebuf, M_TEMP); cookiebuf = NULL; goto again; } fp->f_offset = off; /* update the vnode offset */ eof: *retval = nbytes - resid; out: VOP_UNLOCK(vp); if (cookiebuf) free(cookiebuf, M_TEMP); free(tbuf, M_TEMP); out1: fd_putfile(SCARG(uap, fd)); if (error) return error; return copyout(&loff, SCARG(uap, basep), sizeof(loff)); } int vfs_syscalls_43_init(void) { return syscall_establish(NULL, vfs_syscalls_43_syscalls); } int vfs_syscalls_43_fini(void) { return syscall_disestablish(NULL, vfs_syscalls_43_syscalls); }
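/*
 * Illustrative sketch, not part of the kernel file above: the old-stat
 * compatibility code clamps values that no longer fit the narrow 4.3BSD
 * fields, as cvttimespec() and cvtstat() do.  A standalone userland
 * program with hypothetical sample values shows the same clamping rules.
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t tv_sec = (int64_t)INT_MAX + 12345;	/* does not fit a 32-bit time_t */
	int64_t st_size = (int64_t)1 << 33;		/* >= 2^32 bytes */
	int32_t osec, osize;

	/* Seconds are clamped at INT_MAX, as in cvttimespec(). */
	osec = (tv_sec > INT_MAX) ? INT_MAX : (int32_t)tv_sec;

	/* Sizes of 2^32 or more become -2, as in cvtstat(). */
	osize = (st_size < (int64_t)1 << 32) ? (int32_t)st_size : -2;

	printf("old tv_sec=%d old st_size=%d\n", osec, osize);
	return 0;
}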
/* $NetBSD: in_var.h,v 1.103 2022/11/19 08:00:51 yamt Exp $ */ /*- * Copyright (c) 1998 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Public Access Networks Corporation ("Panix"). It was developed under * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.2 (Berkeley) 1/9/95 */ #ifndef _NETINET_IN_VAR_H_ #define _NETINET_IN_VAR_H_ #include <sys/queue.h> #define IN_IFF_TENTATIVE 0x01 /* tentative address */ #define IN_IFF_DUPLICATED 0x02 /* DAD detected duplicate */ #define IN_IFF_DETACHED 0x04 /* may be detached from the link */ #define IN_IFF_TRYTENTATIVE 0x08 /* intent to try DAD */ #define IN_IFFBITS \ "\020\1TENTATIVE\2DUPLICATED\3DETACHED\4TRYTENTATIVE" /* do not input/output */ #define IN_IFF_NOTREADY \ (IN_IFF_TRYTENTATIVE | IN_IFF_TENTATIVE | IN_IFF_DUPLICATED) /* * Interface address, Internet version. One of these structures * is allocated for each interface with an Internet address. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. 
*/ struct in_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags /* ia_{,sub}net{,mask} in host order */ u_int32_t ia_net; /* network number of interface */ u_int32_t ia_netmask; /* mask of net part */ u_int32_t ia_subnet; /* subnet number, including net */ u_int32_t ia_subnetmask; /* mask of subnet part */ struct in_addr ia_netbroadcast; /* to recognize net broadcasts */ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ TAILQ_ENTRY(in_ifaddr) ia_list; /* list of internet addresses */ struct sockaddr_in ia_addr; /* reserve space for interface name */ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ #define ia_broadaddr ia_dstaddr struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ LIST_HEAD(, in_multi) ia_multiaddrs; /* list of multicast addresses */ struct in_multi *ia_allhosts; /* multicast address record for the allhosts multicast group */ uint16_t ia_idsalt; /* ip_id salt for this ia */ int ia4_flags; /* address flags */ void (*ia_dad_start) (struct ifaddr *); /* DAD start function */ void (*ia_dad_stop) (struct ifaddr *); /* DAD stop function */ time_t ia_dad_defended; /* last time of DAD defence */ #ifdef _KERNEL struct pslist_entry ia_hash_pslist_entry; struct pslist_entry ia_pslist_entry; #endif }; struct in_nbrinfo { char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */ struct in_addr addr; /* IPv4 address of the neighbor */ long asked; /* number of queries already sent for this addr */ int state; /* reachability state */ int expire; /* lifetime for NDP state transition */ }; #ifdef _KERNEL static __inline void ia4_acquire(struct in_ifaddr *ia, struct psref *psref) { KASSERT(ia != NULL); ifa_acquire(&ia->ia_ifa, psref); } static __inline void ia4_release(struct in_ifaddr *ia, struct psref *psref) { if (ia == NULL) return; ifa_release(&ia->ia_ifa, psref); } #endif struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr_in ifra_addr; struct sockaddr_in ifra_dstaddr; #define ifra_broadaddr ifra_dstaddr struct sockaddr_in ifra_mask; }; /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. */ #define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) #ifdef _KERNEL /* Note: 61, 127, 251, 509, 1021, 2039 are good. */ #ifndef IN_IFADDR_HASH_SIZE #define IN_IFADDR_HASH_SIZE 509 #endif /* * This is a bit unconventional, and wastes a little bit of space, but * because we want a very even hash function we don't use & in_ifaddrhash * here, but rather % the hash size, which should obviously be prime. 
*/ #define IN_IFADDR_HASH(x) in_ifaddrhashtbl[(u_long)(x) % IN_IFADDR_HASH_SIZE] LIST_HEAD(in_ifaddrhashhead, in_ifaddr); /* Type of the hash head */ TAILQ_HEAD(in_ifaddrhead, in_ifaddr); /* Type of the list head */ extern u_long in_ifaddrhash; /* size of hash table - 1 */ extern struct in_ifaddrhashhead *in_ifaddrhashtbl; /* Hash table head */ extern struct in_ifaddrhead in_ifaddrhead; /* List head (in ip_input) */ extern pserialize_t in_ifaddrhash_psz; extern struct pslist_head *in_ifaddrhashtbl_pslist; extern u_long in_ifaddrhash_pslist; extern struct pslist_head in_ifaddrhead_pslist; #define IN_IFADDR_HASH_PSLIST(x) \ in_ifaddrhashtbl_pslist[(u_long)(x) % IN_IFADDR_HASH_SIZE] #define IN_ADDRHASH_READER_FOREACH(__ia, __addr) \ PSLIST_READER_FOREACH((__ia), &IN_IFADDR_HASH_PSLIST(__addr), \ struct in_ifaddr, ia_hash_pslist_entry) #define IN_ADDRHASH_WRITER_INSERT_HEAD(__ia) \ PSLIST_WRITER_INSERT_HEAD( \ &IN_IFADDR_HASH_PSLIST((__ia)->ia_addr.sin_addr.s_addr), \ (__ia), ia_hash_pslist_entry) #define IN_ADDRHASH_WRITER_REMOVE(__ia) \ PSLIST_WRITER_REMOVE((__ia), ia_hash_pslist_entry) #define IN_ADDRHASH_ENTRY_INIT(__ia) \ PSLIST_ENTRY_INIT((__ia), ia_hash_pslist_entry); #define IN_ADDRHASH_ENTRY_DESTROY(__ia) \ PSLIST_ENTRY_DESTROY((__ia), ia_hash_pslist_entry); #define IN_ADDRHASH_READER_NEXT(__ia) \ PSLIST_READER_NEXT((__ia), struct in_ifaddr, ia_hash_pslist_entry) #define IN_ADDRLIST_ENTRY_INIT(__ia) \ PSLIST_ENTRY_INIT((__ia), ia_pslist_entry) #define IN_ADDRLIST_ENTRY_DESTROY(__ia) \ PSLIST_ENTRY_DESTROY((__ia), ia_pslist_entry); #define IN_ADDRLIST_READER_EMPTY() \ (PSLIST_READER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \ ia_pslist_entry) == NULL) #define IN_ADDRLIST_READER_FIRST() \ PSLIST_READER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \ ia_pslist_entry) #define IN_ADDRLIST_READER_NEXT(__ia) \ PSLIST_READER_NEXT((__ia), struct in_ifaddr, ia_pslist_entry) #define IN_ADDRLIST_READER_FOREACH(__ia) \ PSLIST_READER_FOREACH((__ia), &in_ifaddrhead_pslist, \ struct in_ifaddr, ia_pslist_entry) #define IN_ADDRLIST_WRITER_INSERT_HEAD(__ia) \ PSLIST_WRITER_INSERT_HEAD(&in_ifaddrhead_pslist, (__ia), \ ia_pslist_entry) #define IN_ADDRLIST_WRITER_REMOVE(__ia) \ PSLIST_WRITER_REMOVE((__ia), ia_pslist_entry) #define IN_ADDRLIST_WRITER_FOREACH(__ia) \ PSLIST_WRITER_FOREACH((__ia), &in_ifaddrhead_pslist, \ struct in_ifaddr, ia_pslist_entry) #define IN_ADDRLIST_WRITER_FIRST() \ PSLIST_WRITER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \ ia_pslist_entry) #define IN_ADDRLIST_WRITER_NEXT(__ia) \ PSLIST_WRITER_NEXT((__ia), struct in_ifaddr, ia_pslist_entry) #define IN_ADDRLIST_WRITER_INSERT_AFTER(__ia, __new) \ PSLIST_WRITER_INSERT_AFTER((__ia), (__new), ia_pslist_entry) #define IN_ADDRLIST_WRITER_EMPTY() \ (PSLIST_WRITER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \ ia_pslist_entry) == NULL) #define IN_ADDRLIST_WRITER_INSERT_TAIL(__new) \ do { \ if (IN_ADDRLIST_WRITER_EMPTY()) { \ IN_ADDRLIST_WRITER_INSERT_HEAD((__new)); \ } else { \ struct in_ifaddr *__ia; \ IN_ADDRLIST_WRITER_FOREACH(__ia) { \ if (IN_ADDRLIST_WRITER_NEXT(__ia) == NULL) { \ IN_ADDRLIST_WRITER_INSERT_AFTER(__ia,\ (__new)); \ break; \ } \ } \ } \ } while (0) extern const int inetctlerrmap[]; /* * Find whether an internet address (in_addr) belongs to one * of our interfaces (in_ifaddr). NULL if the address isn't ours. 
*/ static __inline struct in_ifaddr * in_get_ia(struct in_addr addr) { struct in_ifaddr *ia; IN_ADDRHASH_READER_FOREACH(ia, addr.s_addr) { if (in_hosteq(ia->ia_addr.sin_addr, addr)) break; } return ia; } static __inline struct in_ifaddr * in_get_ia_psref(struct in_addr addr, struct psref *psref) { struct in_ifaddr *ia; int s; s = pserialize_read_enter(); ia = in_get_ia(addr); if (ia != NULL) ia4_acquire(ia, psref); pserialize_read_exit(s); return ia; } /* * Find whether an internet address (in_addr) belongs to a specified * interface. NULL if the address isn't ours. */ static __inline struct in_ifaddr * in_get_ia_on_iface(struct in_addr addr, struct ifnet *ifp) { struct in_ifaddr *ia; IN_ADDRHASH_READER_FOREACH(ia, addr.s_addr) { if (in_hosteq(ia->ia_addr.sin_addr, addr) && ia->ia_ifp == ifp) break; } return ia; } static __inline struct in_ifaddr * in_get_ia_on_iface_psref(struct in_addr addr, struct ifnet *ifp, struct psref *psref) { struct in_ifaddr *ia; int s; s = pserialize_read_enter(); ia = in_get_ia_on_iface(addr, ifp); if (ia != NULL) ia4_acquire(ia, psref); pserialize_read_exit(s); return ia; } /* * Find an internet address structure (in_ifaddr) corresponding * to a given interface (ifnet structure). */ static __inline struct in_ifaddr * in_get_ia_from_ifp(struct ifnet *ifp) { struct ifaddr *ifa; IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family == AF_INET) break; } return ifatoia(ifa); } static __inline struct in_ifaddr * in_get_ia_from_ifp_psref(struct ifnet *ifp, struct psref *psref) { struct in_ifaddr *ia; int s; s = pserialize_read_enter(); ia = in_get_ia_from_ifp(ifp); if (ia != NULL) ia4_acquire(ia, psref); pserialize_read_exit(s); return ia; } #include <netinet/in_selsrc.h> /* * IPv4 per-interface state. */ struct in_ifinfo { struct lltable *ii_llt; /* ARP state */ struct in_ifsysctl *ii_selsrc; #ifdef MBUFTRACE struct mowner ii_mowner; #endif }; #endif /* _KERNEL */ /* * Internet multicast address structure. There is one of these for each IP * multicast group to which this host belongs on a given network interface. * They are kept in a linked list, rooted in the interface's in_ifaddr * structure. */ struct router_info; struct in_multi { LIST_ENTRY(in_multi) inm_list; /* list of multicast addresses */ struct router_info *inm_rti; /* router version info */ struct ifnet *inm_ifp; /* back pointer to ifnet */ struct in_addr inm_addr; /* IP multicast address */ u_int inm_refcount; /* no. membership claims by sockets */ u_int inm_timer; /* IGMP membership report timer */ u_int inm_state; /* state of membership */ }; #ifdef _KERNEL #include <net/pktqueue.h> #include <sys/cprng.h> extern pktqueue_t *ip_pktq; extern int ip_dad_count; /* Duplicate Address Detection probes */ static inline bool ip_dad_enabled(void) { #if NARP > 0 return ip_dad_count > 0; #else return false; #endif } #if defined(INET) && NARP > 0 extern int arp_debug; #define ARPLOGADDR(a) IN_PRINT(_ipbuf, a) #define ARPLOG(level, fmt, args...) \ do { \ char _ipbuf[INET_ADDRSTRLEN]; \ (void)_ipbuf; \ if (arp_debug) \ log(level, "%s: " fmt, __func__, ##args); \ } while (/*CONSTCOND*/0) #else #define ARPLOG(level, fmt, args...) #endif /* * Structure used by functions below to remember position when stepping * through all of the in_multi records. 
*/ struct in_multistep { int i_n; struct in_multi *i_inm; }; bool in_multi_group(struct in_addr, struct ifnet *, int); struct in_multi *in_first_multi(struct in_multistep *); struct in_multi *in_next_multi(struct in_multistep *); struct in_multi *in_lookup_multi(struct in_addr, struct ifnet *); struct in_multi *in_addmulti(struct in_addr *, struct ifnet *); void in_delmulti(struct in_multi *); void in_multi_lock(int); void in_multi_unlock(void); int in_multi_lock_held(void); struct ifaddr; int in_ifinit(struct ifnet *, struct in_ifaddr *, const struct sockaddr_in *, const struct sockaddr_in *, int); void in_savemkludge(struct in_ifaddr *); void in_restoremkludge(struct in_ifaddr *, struct ifnet *); void in_purgemkludge(struct ifnet *); void in_setmaxmtu(void); int in_control(struct socket *, u_long, void *, struct ifnet *); void in_purgeaddr(struct ifaddr *); void in_purgeif(struct ifnet *); void in_addrhash_insert(struct in_ifaddr *); void in_addrhash_remove(struct in_ifaddr *); int ipflow_fastforward(struct mbuf *); extern uint16_t ip_id; extern int ip_do_randomid; static __inline uint16_t ip_randomid(void) { uint16_t id = (uint16_t)cprng_fast32(); return id ? id : 1; } /* * ip_newid_range: "allocate" num contiguous IP IDs. * * => Return the first ID. */ static __inline uint16_t ip_newid_range(const struct in_ifaddr *ia, u_int num) { uint16_t id; if (ip_do_randomid) { /* XXX ignore num */ return ip_randomid(); } /* Never allow an IP ID of 0 (detect wrap). */ if ((uint16_t)(ip_id + num) < ip_id) { ip_id = 1; } id = htons(ip_id); ip_id += num; return id; } static __inline uint16_t ip_newid(const struct in_ifaddr *ia) { return ip_newid_range(ia, 1); } #ifdef SYSCTLFN_PROTO int sysctl_inpcblist(SYSCTLFN_PROTO); #endif #define LLTABLE(ifp) \ ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt #endif /* !_KERNEL */ /* INET6 stuff */ #include <netinet6/in6_var.h> #endif /* !_NETINET_IN_VAR_H_ */
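/*
 * Illustrative sketch, not part of in_var.h above: the address hash in
 * that header buckets an IPv4 address by taking it modulo the (prime)
 * table size rather than masking with a power of two, to spread the
 * buckets evenly.  Standalone code with a hypothetical address shows
 * the bucket computation; the names below are examples only.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_HASH_SIZE	509	/* prime, as the header recommends */

static unsigned long
ex_in_ifaddr_hash(uint32_t s_addr)
{
	/* Same form as IN_IFADDR_HASH(): index = address % prime size. */
	return (unsigned long)s_addr % EX_HASH_SIZE;
}

int
main(void)
{
	uint32_t addr = 0xc0a80001u;	/* 192.168.0.1 in host order, example value */

	printf("bucket %lu of %d\n", ex_in_ifaddr_hash(addr), EX_HASH_SIZE);
	return 0;
}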
/* $NetBSD: secmodel_extensions.c,v 1.16 2023/04/22 13:54:19 riastradh Exp $ */ /*- * Copyright (c) 2011 Elad Efrat <elad@NetBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: secmodel_extensions.c,v 1.16 2023/04/22 13:54:19 riastradh Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/socketvar.h> #include <sys/sysctl.h> #include <sys/proc.h> #include <sys/ptrace.h> #include <sys/module.h> #include <secmodel/secmodel.h> #include <secmodel/extensions/extensions.h> #include <secmodel/extensions/extensions_impl.h> MODULE(MODULE_CLASS_SECMODEL, extensions, NULL); static int curtain; static int user_set_cpu_affinity; #ifdef PT_SETDBREGS int user_set_dbregs; #endif static kauth_listener_t l_process, l_network; static secmodel_t extensions_sm; static void secmodel_extensions_init(void); static void secmodel_extensions_start(void); static void secmodel_extensions_stop(void); static void sysctl_security_extensions_setup(struct sysctllog **); static int sysctl_extensions_curtain_handler(SYSCTLFN_PROTO); static bool is_securelevel_above(int); static int secmodel_extensions_process_cb(kauth_cred_t, kauth_action_t, void *, void *, void *, void *, void *); static int secmodel_extensions_network_cb(kauth_cred_t, kauth_action_t, void *, void *, void *, void *, void *); SYSCTL_SETUP(sysctl_security_extensions_setup, "security extensions sysctl") { const struct sysctlnode *rnode, *rnode2; sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "models", NULL, NULL, 0, NULL, 0, CTL_SECURITY, CTL_CREATE, CTL_EOL); /* Compatibility: security.models.bsd44 */ rnode2 = rnode; sysctl_createv(clog, 0, &rnode2, &rnode2, CTLFLAG_PERMANENT, CTLTYPE_NODE, "bsd44", NULL, NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); /* Compatibility: security.models.bsd44.curtain */ sysctl_createv(clog, 0, &rnode2, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "curtain", SYSCTL_DESCR("Curtain information about objects to "\ "users not owning them."), sysctl_extensions_curtain_handler, 0, &curtain, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "extensions", NULL, NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "name", NULL, NULL, 0, __UNCONST(SECMODEL_EXTENSIONS_NAME), 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "curtain", SYSCTL_DESCR("Curtain information about objects to "\ "users not owning them."), sysctl_extensions_curtain_handler, 0, &curtain, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "user_set_cpu_affinity", SYSCTL_DESCR("Whether unprivileged users may control "\ "CPU affinity."), sysctl_extensions_user_handler, 0, &user_set_cpu_affinity, 0, CTL_CREATE, CTL_EOL); #ifdef PT_SETDBREGS sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "user_set_dbregs", SYSCTL_DESCR("Whether unprivileged users may set "\ "CPU Debug Registers."), sysctl_extensions_user_handler, 0, &user_set_dbregs, 0, CTL_CREATE, CTL_EOL); #endif /* Compatibility: security.curtain */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "curtain", SYSCTL_DESCR("Curtain information about objects to "\ "users not owning them."), sysctl_extensions_curtain_handler, 0, &curtain, 0, CTL_SECURITY, CTL_CREATE, CTL_EOL); secmodel_extensions_vfs_sysctl(clog, rnode); } static int sysctl_extensions_curtain_handler(SYSCTLFN_ARGS) { struct sysctlnode node; int val, error; val = *(int 
*)rnode->sysctl_data; node = *rnode; node.sysctl_data = &val; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; /* shortcut */ if (val == *(int *)rnode->sysctl_data) return 0; /* curtain cannot be disabled when securelevel is above 0 */ if (val == 0 && is_securelevel_above(0)) { return EPERM; } *(int *)rnode->sysctl_data = val; return 0; } /* * Generic sysctl extensions handler for user mount and set CPU affinity * rights. Checks the following conditions: * - setting value to 0 is always permitted (decrease user rights) * - setting value != 0 is not permitted when securelevel is above 0 (increase * user rights). */ int sysctl_extensions_user_handler(SYSCTLFN_ARGS) { struct sysctlnode node; int val, error; val = *(int *)rnode->sysctl_data; node = *rnode; node.sysctl_data = &val; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; /* shortcut */ if (val == *(int *)rnode->sysctl_data) return 0; /* we cannot grant more rights to users when securelevel is above 0 */ if (val != 0 && is_securelevel_above(0)) { return EPERM; } *(int *)rnode->sysctl_data = val; return 0; } /* * Query secmodel_securelevel(9) to know whether securelevel is strictly * above 'level' or not. * Returns true if it is, false otherwise (when securelevel is absent or * securelevel is at or below 'level'). */ static bool is_securelevel_above(int level) { bool above; int error; error = secmodel_eval("org.netbsd.secmodel.securelevel", "is-securelevel-above", KAUTH_ARG(level), &above); if (error == 0 && above) return true; else return false; } static void secmodel_extensions_init(void) { curtain = 0; user_set_cpu_affinity = 0; #ifdef PT_SETDBREGS user_set_dbregs = 0; #endif } static void secmodel_extensions_start(void) { l_process = kauth_listen_scope(KAUTH_SCOPE_PROCESS, secmodel_extensions_process_cb, NULL); l_network = kauth_listen_scope(KAUTH_SCOPE_NETWORK, secmodel_extensions_network_cb, NULL); secmodel_extensions_vfs_start(); } static void secmodel_extensions_stop(void) { secmodel_extensions_vfs_stop(); kauth_unlisten_scope(l_process); kauth_unlisten_scope(l_network); } static int extensions_modcmd(modcmd_t cmd, void *arg) { int error = 0; switch (cmd) { case MODULE_CMD_INIT: error = secmodel_register(&extensions_sm, SECMODEL_EXTENSIONS_ID, SECMODEL_EXTENSIONS_NAME, NULL, NULL, NULL); if (error != 0) printf("extensions_modcmd::init: secmodel_register " "returned %d\n", error); secmodel_extensions_init(); secmodel_extensions_start(); break; case MODULE_CMD_FINI: secmodel_extensions_stop(); error = secmodel_deregister(extensions_sm); if (error != 0) printf("extensions_modcmd::fini: secmodel_deregister " "returned %d\n", error); break; case MODULE_CMD_AUTOUNLOAD: error = EPERM; break; default: error = ENOTTY; break; } return (error); } static int secmodel_extensions_process_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; enum kauth_process_req req; result = KAUTH_RESULT_DEFER; req = (enum kauth_process_req)(uintptr_t)arg1; switch (action) { case KAUTH_PROCESS_CANSEE: switch (req) { case KAUTH_REQ_PROCESS_CANSEE_ARGS: case KAUTH_REQ_PROCESS_CANSEE_ENTRY: case KAUTH_REQ_PROCESS_CANSEE_OPENFILES: case KAUTH_REQ_PROCESS_CANSEE_EPROC: if (curtain != 0) { struct proc *p = arg0; /* * Only process' owner and root can see * through curtain */ if (!kauth_cred_uidmatch(cred, p->p_cred)) { int error; bool isroot = false; error = secmodel_eval( "org.netbsd.secmodel.suser", "is-root", cred, 
&isroot); if (error == 0 && !isroot) result = KAUTH_RESULT_DENY; } } break; case KAUTH_REQ_PROCESS_CANSEE_KPTR: default: break; } break; case KAUTH_PROCESS_SCHEDULER_SETAFFINITY: if (user_set_cpu_affinity != 0) { struct proc *p = arg0; if (kauth_cred_uidmatch(cred, p->p_cred)) result = KAUTH_RESULT_ALLOW; } break; default: break; } return (result); } static int secmodel_extensions_network_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; enum kauth_network_req req; result = KAUTH_RESULT_DEFER; req = (enum kauth_network_req)(uintptr_t)arg0; if (action != KAUTH_NETWORK_SOCKET || req != KAUTH_REQ_NETWORK_SOCKET_CANSEE) return result; if (curtain != 0) { struct socket *so = (struct socket *)arg1; if (__predict_false(so == NULL || so->so_cred == NULL)) return KAUTH_RESULT_DENY; if (!kauth_cred_uidmatch(cred, so->so_cred)) { int error; bool isroot = false; error = secmodel_eval("org.netbsd.secmodel.suser", "is-root", cred, &isroot); if (error == 0 && !isroot) result = KAUTH_RESULT_DENY; } } return (result); }
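/*
 * Illustrative sketch, not part of secmodel_extensions.c above: both
 * sysctl handlers in that file follow the same shape -- copy the current
 * value, let the caller propose a new one, do nothing if it is unchanged,
 * and refuse to grant extra rights while securelevel is raised.  The
 * helper and variable names below are hypothetical stand-ins for the
 * kernel's sysctl and secmodel machinery.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int securelevel = 1;		/* pretend securelevel is raised */
static int user_set_cpu_affinity = 0;	/* the knob being edited */

static bool
example_is_securelevel_above(int level)
{
	return securelevel > level;
}

/* Returns 0 on success or an errno value, like the kernel handlers. */
static int
example_user_handler(int *knob, int newval)
{
	if (newval == *knob)
		return 0;			/* nothing to do */
	if (newval != 0 && example_is_securelevel_above(0))
		return EPERM;			/* cannot grant more rights */
	*knob = newval;
	return 0;
}

int
main(void)
{
	printf("set to 1: %d\n", example_user_handler(&user_set_cpu_affinity, 1));
	printf("set to 0: %d\n", example_user_handler(&user_set_cpu_affinity, 0));
	return 0;
}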
/* $NetBSD: popcount32.c,v 1.5 2015/05/29 19:39:41 matt Exp $ */ /*- * Copyright (c) 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Joerg Sonnenberger. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __RCSID("$NetBSD: popcount32.c,v 1.5 2015/05/29 19:39:41 matt Exp $"); #if !defined(_KERNEL) && !defined(_STANDALONE) #include <limits.h> #include <stdint.h> #include <strings.h> #else #include <lib/libkern/libkern.h> #include <machine/limits.h> #endif #ifndef popcount32 // might be a builtin /* * This is a hybrid algorithm for bit counting between parallel counting and * using multiplication. The idea is to sum up the bits in each Byte, so * that the final accumulation can be done with a single multiplication. * If the platform has a slow multiplication instruction, it can be replaced * by the commented out version below. */ unsigned int popcount32(uint32_t v) { unsigned int c; v = v - ((v >> 1) & 0x55555555U); v = (v & 0x33333333U) + ((v >> 2) & 0x33333333U); v = (v + (v >> 4)) & 0x0f0f0f0fU; c = (v * 0x01010101U) >> 24; /* * v = (v >> 16) + v; * v = (v >> 8) + v; * c = v & 255; */ return c; } #if UINT_MAX == 0xffffffffU __strong_alias(popcount, popcount32) #endif #if ULONG_MAX == 0xffffffffU __strong_alias(popcountl, popcount32) #endif #endif /* !popcount32 */
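/*
 * Illustrative sketch, not part of popcount32.c above: the hybrid SWAR
 * popcount can be checked against a naive one-bit-at-a-time loop.
 * Standalone userland code with example input values only.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned int
naive_popcount32(uint32_t v)
{
	unsigned int c = 0;

	while (v != 0) {
		c += v & 1;
		v >>= 1;
	}
	return c;
}

static unsigned int
swar_popcount32(uint32_t v)
{
	/* Same steps as popcount32() above. */
	v = v - ((v >> 1) & 0x55555555U);
	v = (v & 0x33333333U) + ((v >> 2) & 0x33333333U);
	v = (v + (v >> 4)) & 0x0f0f0f0fU;
	return (v * 0x01010101U) >> 24;
}

int
main(void)
{
	uint32_t samples[] = { 0, 1, 0xffffffffU, 0x12345678U };
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%#x: swar=%u naive=%u\n", (unsigned)samples[i],
		    swar_popcount32(samples[i]), naive_popcount32(samples[i]));
	return 0;
}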
/* $NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $ */ /*- * Copyright (c) 2006, 2007, 2008, 2019, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Kernel condition variable implementation. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $"); #include <sys/param.h> #include <sys/condvar.h> #include <sys/cpu.h> #include <sys/kernel.h> #include <sys/lockdebug.h> #include <sys/lwp.h> #include <sys/sleepq.h> #include <sys/syncobj.h> #include <sys/systm.h> /* * Accessors for the private contents of the kcondvar_t data type. * * cv_opaque[0] sleepq_t * cv_opaque[1] description for ps(1) * * cv_opaque[0] is protected by the interlock passed to cv_wait() (enqueue * only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue * and dequeue). * * cv_opaque[1] (the wmesg) is static and does not change throughout the life * of the CV. */ #define CV_SLEEPQ(cv) ((sleepq_t *)(cv)->cv_opaque) #define CV_WMESG(cv) ((const char *)(cv)->cv_opaque[1]) #define CV_SET_WMESG(cv, v) (cv)->cv_opaque[1] = __UNCONST(v) #define CV_DEBUG_P(cv) (CV_WMESG(cv) != nodebug) #define CV_RA ((uintptr_t)__builtin_return_address(0)) static void cv_unsleep(lwp_t *, bool); static inline void cv_wakeup_one(kcondvar_t *); static inline void cv_wakeup_all(kcondvar_t *); syncobj_t cv_syncobj = { .sobj_name = "cv", .sobj_flag = SOBJ_SLEEPQ_SORTED, .sobj_boostpri = PRI_KERNEL, .sobj_unsleep = cv_unsleep, .sobj_changepri = sleepq_changepri, .sobj_lendpri = sleepq_lendpri, .sobj_owner = syncobj_noowner, }; static const char deadcv[] = "deadcv"; /* * cv_init: * * Initialize a condition variable for use. */ void cv_init(kcondvar_t *cv, const char *wmesg) { KASSERT(wmesg != NULL); CV_SET_WMESG(cv, wmesg); sleepq_init(CV_SLEEPQ(cv)); } /* * cv_destroy: * * Tear down a condition variable. */ void cv_destroy(kcondvar_t *cv) { sleepq_destroy(CV_SLEEPQ(cv)); #ifdef DIAGNOSTIC KASSERT(cv_is_valid(cv)); KASSERT(!cv_has_waiters(cv)); CV_SET_WMESG(cv, deadcv); #endif } /* * cv_enter: * * Look up and lock the sleep queue corresponding to the given * condition variable, and increment the number of waiters. */ static inline int cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, bool catch_p) { sleepq_t *sq; kmutex_t *mp; int nlocks; KASSERT(cv_is_valid(cv)); KASSERT(!cpu_intr_p()); KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL); mp = sleepq_hashlock(cv); sq = CV_SLEEPQ(cv); nlocks = sleepq_enter(sq, l, mp); sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj, catch_p); mutex_exit(mtx); KASSERT(cv_has_waiters(cv)); return nlocks; } /* * cv_unsleep: * * Remove an LWP from the condition variable and sleep queue. This * is called when the LWP has not been awoken normally but instead * interrupted: for example, when a signal is received. Must be * called with the LWP locked. Will unlock if "unlock" is true. 
*/ static void cv_unsleep(lwp_t *l, bool unlock) { kcondvar_t *cv __diagused; cv = (kcondvar_t *)(uintptr_t)l->l_wchan; KASSERT(l->l_wchan == (wchan_t)cv); KASSERT(l->l_sleepq == CV_SLEEPQ(cv)); KASSERT(cv_is_valid(cv)); KASSERT(cv_has_waiters(cv)); sleepq_unsleep(l, unlock); } /* * cv_wait: * * Wait non-interruptably on a condition variable until awoken. */ void cv_wait(kcondvar_t *cv, kmutex_t *mtx) { lwp_t *l = curlwp; int nlocks; KASSERT(mutex_owned(mtx)); nlocks = cv_enter(cv, mtx, l, false); (void)sleepq_block(0, false, &cv_syncobj, nlocks); mutex_enter(mtx); } /* * cv_wait_sig: * * Wait on a condition variable until a awoken or a signal is received. * Will also return early if the process is exiting. Returns zero if * awoken normally, ERESTART if a signal was received and the system * call is restartable, or EINTR otherwise. */ int cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx) { lwp_t *l = curlwp; int error, nlocks; KASSERT(mutex_owned(mtx)); nlocks = cv_enter(cv, mtx, l, true); error = sleepq_block(0, true, &cv_syncobj, nlocks); mutex_enter(mtx); return error; } /* * cv_timedwait: * * Wait on a condition variable until awoken or the specified timeout * expires. Returns zero if awoken normally or EWOULDBLOCK if the * timeout expired. * * timo is a timeout in ticks. timo = 0 specifies an infinite timeout. */ int cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo) { lwp_t *l = curlwp; int error, nlocks; KASSERT(mutex_owned(mtx)); nlocks = cv_enter(cv, mtx, l, false); error = sleepq_block(timo, false, &cv_syncobj, nlocks); mutex_enter(mtx); return error; } /* * cv_timedwait_sig: * * Wait on a condition variable until a timeout expires, awoken or a * signal is received. Will also return early if the process is * exiting. Returns zero if awoken normally, EWOULDBLOCK if the * timeout expires, ERESTART if a signal was received and the system * call is restartable, or EINTR otherwise. * * timo is a timeout in ticks. timo = 0 specifies an infinite timeout. */ int cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo) { lwp_t *l = curlwp; int error, nlocks; KASSERT(mutex_owned(mtx)); nlocks = cv_enter(cv, mtx, l, true); error = sleepq_block(timo, true, &cv_syncobj, nlocks); mutex_enter(mtx); return error; } /* * Given a number of seconds, sec, and 2^64ths of a second, frac, we * want a number of ticks for a timeout: * * timo = hz*(sec + frac/2^64) * = hz*sec + hz*frac/2^64 * = hz*sec + hz*(frachi*2^32 + fraclo)/2^64 * = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64, * * where frachi is the high 32 bits of frac and fraclo is the * low 32 bits. * * We assume hz < INT_MAX/2 < UINT32_MAX, so * * hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1, * * since fraclo < 2^32. * * We clamp the result at INT_MAX/2 for a timeout in ticks, since we * can't represent timeouts higher than INT_MAX in cv_timedwait, and * spurious wakeup is OK. Moreover, we don't want to wrap around, * because we compute end - start in ticks in order to compute the * remaining timeout, and that difference cannot wrap around, so we use * a timeout less than INT_MAX. Using INT_MAX/2 provides plenty of * margin for paranoia and will exceed most waits in practice by far. */ static unsigned bintime2timo(const struct bintime *bt) { KASSERT(hz < INT_MAX/2); CTASSERT(INT_MAX/2 < UINT32_MAX); if (bt->sec > ((INT_MAX/2)/hz)) return INT_MAX/2; if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec)) return INT_MAX/2; return hz*bt->sec + (hz*(bt->frac >> 32) >> 32); } /* * timo is in units of ticks. We want units of seconds and 2^64ths of * a second. 
We know hz = 1 sec/tick, and 2^64 = 1 sec/(2^64th of a * second), from which we can conclude 2^64 / hz = 1 (2^64th of a * second)/tick. So for the fractional part, we compute * * frac = rem * 2^64 / hz * = ((rem * 2^32) / hz) * 2^32 * * Using truncating integer division instead of real division will * leave us with only about 32 bits of precision, which means about * 1/4-nanosecond resolution, which is good enough for our purposes. */ static struct bintime timo2bintime(unsigned timo) { return (struct bintime) { .sec = timo / hz, .frac = (((uint64_t)(timo % hz) << 32)/hz << 32), }; } /* * cv_timedwaitbt: * * Wait on a condition variable until awoken or the specified * timeout expires. Returns zero if awoken normally or * EWOULDBLOCK if the timeout expires. * * On entry, bt is a timeout in bintime. cv_timedwaitbt subtracts * the time slept, so on exit, bt is the time remaining after * sleeping, possibly negative if the complete time has elapsed. * No infinite timeout; use cv_wait_sig instead. * * epsilon is a requested maximum error in timeout (excluding * spurious wakeups). Currently not used, will be used in the * future to choose between low- and high-resolution timers. * Actual wakeup time will be somewhere in [t, t + max(e, r) + s) * where r is the finest resolution of clock available and s is * scheduling delays for scheduler overhead and competing threads. * Time is measured by the interrupt source implementing the * timeout, not by another timecounter. */ int cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt, const struct bintime *epsilon __diagused) { struct bintime slept; unsigned start, end; int timo; int error; KASSERTMSG(bt->sec >= 0, "negative timeout"); KASSERTMSG(epsilon != NULL, "specify maximum requested delay"); /* If there's nothing left to wait, time out. */ if (bt->sec == 0 && bt->frac == 0) return EWOULDBLOCK; /* Convert to ticks, but clamp to be >=1. */ timo = bintime2timo(bt); KASSERTMSG(timo >= 0, "negative ticks: %d", timo); if (timo == 0) timo = 1; /* * getticks() is technically int, but nothing special * happens instead of overflow, so we assume two's-complement * wraparound and just treat it as unsigned. */ start = getticks(); error = cv_timedwait(cv, mtx, timo); end = getticks(); /* * Set it to the time left, or zero, whichever is larger. We * do not fail with EWOULDBLOCK here because this may have been * an explicit wakeup, so the caller needs to check before they * give up or else cv_signal would be lost. */ slept = timo2bintime(end - start); if (bintimecmp(bt, &slept, <=)) { bt->sec = 0; bt->frac = 0; } else { /* bt := bt - slept */ bintime_sub(bt, &slept); } return error; } /* * cv_timedwaitbt_sig: * * Wait on a condition variable until awoken, the specified * timeout expires, or interrupted by a signal. Returns zero if * awoken normally, EWOULDBLOCK if the timeout expires, or * EINTR/ERESTART if interrupted by a signal. * * On entry, bt is a timeout in bintime. cv_timedwaitbt_sig * subtracts the time slept, so on exit, bt is the time remaining * after sleeping. No infinite timeout; use cv_wait instead. * * epsilon is a requested maximum error in timeout (excluding * spurious wakeups). Currently not used, will be used in the * future to choose between low- and high-resolution timers. 
*/ int cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt, const struct bintime *epsilon __diagused) { struct bintime slept; unsigned start, end; int timo; int error; KASSERTMSG(bt->sec >= 0, "negative timeout"); KASSERTMSG(epsilon != NULL, "specify maximum requested delay"); /* If there's nothing left to wait, time out. */ if (bt->sec == 0 && bt->frac == 0) return EWOULDBLOCK; /* Convert to ticks, but clamp to be >=1. */ timo = bintime2timo(bt); KASSERTMSG(timo >= 0, "negative ticks: %d", timo); if (timo == 0) timo = 1; /* * getticks() is technically int, but nothing special * happens instead of overflow, so we assume two's-complement * wraparound and just treat it as unsigned. */ start = getticks(); error = cv_timedwait_sig(cv, mtx, timo); end = getticks(); /* * Set it to the time left, or zero, whichever is larger. We * do not fail with EWOULDBLOCK here because this may have been * an explicit wakeup, so the caller needs to check before they * give up or else cv_signal would be lost. */ slept = timo2bintime(end - start); if (bintimecmp(bt, &slept, <=)) { bt->sec = 0; bt->frac = 0; } else { /* bt := bt - slept */ bintime_sub(bt, &slept); } return error; } /* * cv_signal: * * Wake the highest priority LWP waiting on a condition variable. Must * be called with the interlocking mutex held or just after it has been * released (so the awoken LWP will see the changed condition). */ void cv_signal(kcondvar_t *cv) { KASSERT(cv_is_valid(cv)); if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) { /* * Compiler turns into a tail call usually, i.e. jmp, * because the arguments are the same and no locals. */ cv_wakeup_one(cv); } } /* * cv_wakeup_one: * * Slow path for cv_signal(). Deliberately marked __noinline to * prevent the compiler pulling it in to cv_signal(), which adds * extra prologue and epilogue code. */ static __noinline void cv_wakeup_one(kcondvar_t *cv) { sleepq_t *sq; kmutex_t *mp; lwp_t *l; mp = sleepq_hashlock(cv); sq = CV_SLEEPQ(cv); if (__predict_true((l = LIST_FIRST(sq)) != NULL)) { KASSERT(l->l_sleepq == sq); KASSERT(l->l_mutex == mp); KASSERT(l->l_wchan == cv); sleepq_remove(sq, l, true); } mutex_spin_exit(mp); } /* * cv_broadcast: * * Wake all LWPs waiting on a condition variable. Must be called with * the interlocking mutex held or just after it has been released (so * the awoken LWP will see the changed condition). */ void cv_broadcast(kcondvar_t *cv) { KASSERT(cv_is_valid(cv)); if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) { /* * Compiler turns into a tail call usually, i.e. jmp, * because the arguments are the same and no locals. */ cv_wakeup_all(cv); } } /* * cv_wakeup_all: * * Slow path for cv_broadcast(). Deliberately marked __noinline to * prevent the compiler pulling it in to cv_broadcast(), which adds * extra prologue and epilogue code. */ static __noinline void cv_wakeup_all(kcondvar_t *cv) { sleepq_t *sq; kmutex_t *mp; lwp_t *l; mp = sleepq_hashlock(cv); sq = CV_SLEEPQ(cv); while ((l = LIST_FIRST(sq)) != NULL) { KASSERT(l->l_sleepq == sq); KASSERT(l->l_mutex == mp); KASSERT(l->l_wchan == cv); sleepq_remove(sq, l, true); } mutex_spin_exit(mp); } /* * cv_has_waiters: * * For diagnostic assertions: return non-zero if a condition * variable has waiters. */ bool cv_has_waiters(kcondvar_t *cv) { return !LIST_EMPTY(CV_SLEEPQ(cv)); } /* * cv_is_valid: * * For diagnostic assertions: return non-zero if a condition * variable appears to be valid. No locks need be held. 
*/ bool cv_is_valid(kcondvar_t *cv) { return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL; }
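/*
 * Illustrative sketch, not part of kern_condvar.c above: the tick
 * conversions in that file compute timo = hz*sec + hz*frachi/2^32 and,
 * for the reverse direction, frac = ((rem << 32) / hz) << 32, as the
 * comments derive.  Standalone code with an assumed hz value shows one
 * round trip; the kernel's INT_MAX/2 clamping is omitted here.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_HZ	100	/* assumed tick rate, for illustration only */

struct ex_bintime {
	int64_t sec;
	uint64_t frac;	/* 2^64ths of a second */
};

static unsigned
ex_bintime2timo(const struct ex_bintime *bt)
{
	/* hz*sec plus the contribution of the high 32 fraction bits. */
	return (unsigned)(EX_HZ * bt->sec + ((EX_HZ * (bt->frac >> 32)) >> 32));
}

static struct ex_bintime
ex_timo2bintime(unsigned timo)
{
	return (struct ex_bintime) {
		.sec = timo / EX_HZ,
		.frac = ((uint64_t)(timo % EX_HZ) << 32) / EX_HZ << 32,
	};
}

int
main(void)
{
	struct ex_bintime bt = { .sec = 1, .frac = UINT64_MAX / 2 };	/* about 1.5s */
	unsigned timo = ex_bintime2timo(&bt);
	struct ex_bintime back = ex_timo2bintime(timo);

	printf("timo=%u ticks, back=%llds + %llu/2^64\n", timo,
	    (long long)back.sec, (unsigned long long)back.frac);
	return 0;
}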
/* $NetBSD: scsi_base.c,v 1.93 2019/05/03 16:06:56 mlelstv Exp $ */ /*- * Copyright (c) 1998, 2004 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: scsi_base.c,v 1.93 2019/05/03 16:06:56 mlelstv Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/buf.h> #include <sys/uio.h> #include <sys/malloc.h> #include <sys/errno.h> #include <sys/device.h> #include <sys/proc.h> #include <dev/scsipi/scsipi_all.h> #include <dev/scsipi/scsi_all.h> #include <dev/scsipi/scsi_disk.h> #include <dev/scsipi/scsiconf.h> #include <dev/scsipi/scsipi_base.h> static void scsi_print_xfer_mode(struct scsipi_periph *); /* * Do a scsi operation, asking a device to run as SCSI-II if it can. */ int scsi_change_def(struct scsipi_periph *periph, int flags) { struct scsi_changedef cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SCSI_CHANGE_DEFINITION; cmd.how = SC_SCSI_2; return (scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0, SCSIPIRETRIES, 100000, NULL, flags)); } /* * ask the scsi driver to perform a command for us.
* tell it where to read/write the data, and how * long the data is supposed to be. If we have a buf * to associate with the transfer, we need that too. */ void scsi_scsipi_cmd(struct scsipi_xfer *xs) { struct scsipi_periph *periph = xs->xs_periph; SC_DEBUG(periph, SCSIPI_DB2, ("scsi_scsipi_cmd\n")); /* * Set the LUN in the CDB if we have an older device. We also * set it for more modern SCSI-2 devices "just in case". */ if (periph->periph_version <= 2) xs->cmd->bytes[0] |= ((periph->periph_lun << SCSI_CMD_LUN_SHIFT) & SCSI_CMD_LUN_MASK); } /* * Utility routines often used in SCSI stuff */ /* * Print out the periph's address info. */ void scsi_print_addr(struct scsipi_periph *periph) { struct scsipi_channel *chan = periph->periph_channel; struct scsipi_adapter *adapt = chan->chan_adapter; printf("%s(%s:%d:%d:%d): ", periph->periph_dev != NULL ? device_xname(periph->periph_dev) : "probe", device_xname(adapt->adapt_dev), chan->chan_channel, periph->periph_target, periph->periph_lun); } /* * Kill off all pending xfers for a periph. * * Must be called with channel lock held */ void scsi_kill_pending(struct scsipi_periph *periph) { struct scsipi_xfer *xs; TAILQ_FOREACH(xs, &periph->periph_xferq, device_q) { callout_stop(&xs->xs_callout); scsi_print_addr(periph); printf("killed "); scsipi_print_cdb(xs->cmd); xs->error = XS_DRIVER_STUFFUP; scsipi_done(xs); } } /* * scsi_print_xfer_mode: * * Print a parallel SCSI periph's capabilities. */ static void scsi_print_xfer_mode(struct scsipi_periph *periph) { struct scsipi_channel *chan = periph->periph_channel; struct scsipi_adapter *adapt = chan->chan_adapter; int period, freq, speed, mbs; if (periph->periph_dev) aprint_normal_dev(periph->periph_dev, ""); else aprint_normal("probe(%s:%d:%d:%d): ", device_xname(adapt->adapt_dev), chan->chan_channel, periph->periph_target, periph->periph_lun); if (periph->periph_mode & (PERIPH_CAP_SYNC | PERIPH_CAP_DT)) { period = scsipi_sync_factor_to_period(periph->periph_period); aprint_normal("sync (%d.%02dns offset %d)", period / 100, period % 100, periph->periph_offset); } else aprint_normal("async"); if (periph->periph_mode & PERIPH_CAP_WIDE32) aprint_normal(", 32-bit"); else if (periph->periph_mode & (PERIPH_CAP_WIDE16 | PERIPH_CAP_DT)) aprint_normal(", 16-bit"); else aprint_normal(", 8-bit"); if (periph->periph_mode & (PERIPH_CAP_SYNC | PERIPH_CAP_DT)) { freq = scsipi_sync_factor_to_freq(periph->periph_period); speed = freq; if (periph->periph_mode & PERIPH_CAP_WIDE32) speed *= 4; else if (periph->periph_mode & (PERIPH_CAP_WIDE16 | PERIPH_CAP_DT)) speed *= 2; mbs = speed / 1000; if (mbs > 0) { aprint_normal(" (%d.%03dMB/s)", mbs, speed % 1000); } else aprint_normal(" (%dKB/s)", speed % 1000); } aprint_normal(" transfers"); if (periph->periph_mode & PERIPH_CAP_TQING) aprint_normal(", tagged queueing"); aprint_normal("\n"); } /* * scsi_async_event_xfer_mode: * * Update the xfer mode for all parallel SCSI periphs sharing the * specified I_T Nexus. */ void scsi_async_event_xfer_mode(struct scsipi_channel *chan, void *arg) { struct scsipi_xfer_mode *xm = arg; struct scsipi_periph *periph; int lun, announce, mode, period, offset; for (lun = 0; lun < chan->chan_nluns; lun++) { periph = scsipi_lookup_periph_locked(chan, xm->xm_target, lun); if (periph == NULL) continue; announce = 0; /* * Clamp the xfer mode down to this periph's capabilities. 
*/ mode = xm->xm_mode & periph->periph_cap; if (mode & PERIPH_CAP_SYNC) { period = xm->xm_period; offset = xm->xm_offset; } else { period = 0; offset = 0; } /* * If we do not have a valid xfer mode yet, or the parameters * are different, announce them. */ if ((periph->periph_flags & PERIPH_MODE_VALID) == 0 || periph->periph_mode != mode || periph->periph_period != period || periph->periph_offset != offset) announce = 1; periph->periph_mode = mode; periph->periph_period = period; periph->periph_offset = offset; periph->periph_flags |= PERIPH_MODE_VALID; if (announce) scsi_print_xfer_mode(periph); } } /* * scsi_fc_sas_async_event_xfer_mode: * * Update the xfer mode for all SAS/FC periphs sharing the * specified I_T Nexus. */ void scsi_fc_sas_async_event_xfer_mode(struct scsipi_channel *chan, void *arg) { struct scsipi_xfer_mode *xm = arg; struct scsipi_periph *periph; int lun, announce, mode; for (lun = 0; lun < chan->chan_nluns; lun++) { periph = scsipi_lookup_periph_locked(chan, xm->xm_target, lun); if (periph == NULL) continue; announce = 0; /* * Clamp the xfer mode down to this periph's capabilities. */ mode = xm->xm_mode & periph->periph_cap; /* * If we do not have a valid xfer mode yet, or the parameters * are different, announce them. */ if ((periph->periph_flags & PERIPH_MODE_VALID) == 0 || periph->periph_mode != mode) announce = 1; periph->periph_mode = mode; periph->periph_flags |= PERIPH_MODE_VALID; if (announce && (periph->periph_mode & PERIPH_CAP_TQING) != 0) { aprint_normal_dev(periph->periph_dev, "tagged queueing\n"); } } }
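/*
 * A minimal worked example of the arithmetic used by scsi_print_xfer_mode()
 * above: the period is kept in hundredths of a nanosecond and the frequency
 * in kilotransfers per second, so the printed MB/s figure is freq times the
 * bus width in bytes, divided by 1000.  The 12.50ns / 80000k figures below
 * are assumed sample values for an Ultra160 target (sync factor 0x09,
 * 16-bit wide); the real mapping comes from scsipi_sync_factor_to_period()
 * and scsipi_sync_factor_to_freq().  Kept under #if 0 so it is never built.
 */
#if 0
static void
scsi_xfer_mode_example(void)
{
	int period = 1250;	/* 12.50ns, in 1/100ns units */
	int freq = 80000;	/* 80000 k-transfers/s */
	int speed, mbs;

	/* prints "sync (12.50ns)" */
	printf("sync (%d.%02dns)", period / 100, period % 100);

	/* a 16-bit wide bus moves two bytes per transfer */
	speed = freq * 2;	/* 160000 KB/s */
	mbs = speed / 1000;	/* 160 MB/s */

	/* prints " (160.000MB/s)" */
	printf(" (%d.%03dMB/s)", mbs, speed % 1000);
}
#endif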
/* $NetBSD: hci_ioctl.c,v 1.15 2021/09/21 15:03:08 christos Exp $ */ /*- * Copyright (c) 2005 Iain Hibbert. * Copyright (c) 2006 Itronix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of Itronix Inc. may not be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: hci_ioctl.c,v 1.15 2021/09/21 15:03:08 christos Exp $"); #include <sys/param.h> #include <sys/domain.h> #include <sys/ioctl.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/proc.h> #include <sys/systm.h> #include <netbt/bluetooth.h> #include <netbt/hci.h> #include <netbt/l2cap.h> #include <netbt/rfcomm.h> #ifdef BLUETOOTH_DEBUG #define BDADDR(bd) (bd).b[5], (bd).b[4], (bd).b[3], \ (bd).b[2], (bd).b[1], (bd).b[0] static void hci_dump(void) { struct hci_unit *unit; struct hci_link *link; struct l2cap_channel *chan; struct rfcomm_session *rs; struct rfcomm_dlc *dlc; uprintf("HCI:\n"); SIMPLEQ_FOREACH(unit, &hci_unit_list, hci_next) { uprintf("UNIT %s: flags 0x%4.4x, " "num_cmd=%d, num_acl=%d, num_sco=%d\n", device_xname(unit->hci_dev), unit->hci_flags, unit->hci_num_cmd_pkts, unit->hci_num_acl_pkts, unit->hci_num_sco_pkts); TAILQ_FOREACH(link, &unit->hci_links, hl_next) { uprintf("+HANDLE #%d: %s " "raddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x, " "state %d, refcnt %d\n", link->hl_handle, (link->hl_type == HCI_LINK_ACL ? "ACL":"SCO"), BDADDR(link->hl_bdaddr), link->hl_state, link->hl_refcnt); } } uprintf("L2CAP:\n"); LIST_FOREACH(chan, &l2cap_active_list, lc_ncid) { uprintf("CID #%d state %d, psm=0x%4.4x, " "laddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x, " "raddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n", chan->lc_lcid, chan->lc_state, chan->lc_raddr.bt_psm, BDADDR(chan->lc_laddr.bt_bdaddr), BDADDR(chan->lc_raddr.bt_bdaddr)); } LIST_FOREACH(chan, &l2cap_listen_list, lc_ncid) { uprintf("LISTEN psm=0x%4.4x, " "laddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n", chan->lc_laddr.bt_psm, BDADDR(chan->lc_laddr.bt_bdaddr)); } uprintf("RFCOMM:\n"); LIST_FOREACH(rs, &rfcomm_session_active, rs_next) { chan = rs->rs_l2cap; uprintf("SESSION: state=%d, flags=0x%4.4x, psm 0x%4.4x " "laddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x, " "raddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n", rs->rs_state, rs->rs_flags, chan->lc_raddr.bt_psm, BDADDR(chan->lc_laddr.bt_bdaddr), BDADDR(chan->lc_raddr.bt_bdaddr)); LIST_FOREACH(dlc, &rs->rs_dlcs, rd_next) { uprintf("+DLC channel=%d, dlci=%d, " "state=%d, flags=0x%4.4x, rxcred=%d, rxsize=%ld, " "txcred=%d, pending=%d, txqlen=%d\n", dlc->rd_raddr.bt_channel, dlc->rd_dlci, dlc->rd_state, dlc->rd_flags, dlc->rd_rxcred, (unsigned long)dlc->rd_rxsize, dlc->rd_txcred, dlc->rd_pending, (dlc->rd_txbuf ? dlc->rd_txbuf->m_pkthdr.len : 0)); } } LIST_FOREACH(rs, &rfcomm_session_listen, rs_next) { chan = rs->rs_l2cap; uprintf("LISTEN: psm 0x%4.4x, " "laddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n", chan->lc_laddr.bt_psm, BDADDR(chan->lc_laddr.bt_bdaddr)); LIST_FOREACH(dlc, &rs->rs_dlcs, rd_next) uprintf("+DLC channel=%d\n", dlc->rd_laddr.bt_channel); } } #undef BDADDR #endif int hci_ioctl_pcb(unsigned long cmd, void *data) { struct btreq *btr = data; struct hci_unit *unit; int err = 0; DPRINTFN(1, "cmd %#lx\n", cmd); switch(cmd) { #ifdef BLUETOOTH_DEBUG case SIOCBTDUMP: hci_dump(); return 0; #endif /* * Get unit info based on address rather than name */ case SIOCGBTINFOA: unit = hci_unit_lookup(&btr->btr_bdaddr); if (unit == NULL) return ENXIO; break; /* * The remaining ioctl's all use the same btreq structure and * index on the name of the device, so we look that up first. 
*/ case SIOCNBTINFO: /* empty name means give the first unit */ if (btr->btr_name[0] == '\0') { unit = NULL; break; } /* else fall through and look it up */ /* FALLTHROUGH */ case SIOCGBTINFO: case SIOCSBTFLAGS: case SIOCSBTPOLICY: case SIOCSBTPTYPE: case SIOCGBTSTATS: case SIOCZBTSTATS: case SIOCSBTSCOMTU: case SIOCGBTFEAT: SIMPLEQ_FOREACH(unit, &hci_unit_list, hci_next) { if (strncmp(device_xname(unit->hci_dev), btr->btr_name, HCI_DEVNAME_SIZE) == 0) break; } if (unit == NULL) return ENXIO; break; default: /* not one of mine */ return EPASSTHROUGH; } switch(cmd) { case SIOCNBTINFO: /* get next info */ if (unit) unit = SIMPLEQ_NEXT(unit, hci_next); else unit = SIMPLEQ_FIRST(&hci_unit_list); if (unit == NULL) { err = ENXIO; break; } /* FALLTHROUGH */ case SIOCGBTINFO: /* get unit info */ /* FALLTHROUGH */ case SIOCGBTINFOA: /* get info by address */ memset(btr, 0, sizeof(struct btreq)); strlcpy(btr->btr_name, device_xname(unit->hci_dev), HCI_DEVNAME_SIZE); bdaddr_copy(&btr->btr_bdaddr, &unit->hci_bdaddr); btr->btr_flags = unit->hci_flags; btr->btr_num_cmd = unit->hci_num_cmd_pkts; btr->btr_num_acl = unit->hci_num_acl_pkts; btr->btr_num_sco = unit->hci_num_sco_pkts; btr->btr_acl_mtu = unit->hci_max_acl_size; btr->btr_sco_mtu = unit->hci_max_sco_size; btr->btr_max_acl = unit->hci_max_acl_pkts; btr->btr_max_sco = unit->hci_max_sco_pkts; btr->btr_packet_type = unit->hci_packet_type; btr->btr_link_policy = unit->hci_link_policy; break; case SIOCSBTFLAGS: /* set unit flags (privileged) */ err = kauth_authorize_device(kauth_cred_get(), KAUTH_DEVICE_BLUETOOTH_SETPRIV, unit, KAUTH_ARG(cmd), btr, NULL); if (err) break; if ((unit->hci_flags & BTF_UP) && (btr->btr_flags & BTF_UP) == 0) { hci_disable(unit); unit->hci_flags &= ~BTF_UP; } unit->hci_flags &= ~BTF_MASTER; unit->hci_flags |= (btr->btr_flags & (BTF_INIT | BTF_MASTER)); if ((unit->hci_flags & BTF_UP) == 0 && (btr->btr_flags & BTF_UP)) { err = hci_enable(unit); if (err) break; unit->hci_flags |= BTF_UP; } btr->btr_flags = unit->hci_flags; break; case SIOCSBTPOLICY: /* set unit link policy (privileged) */ err = kauth_authorize_device(kauth_cred_get(), KAUTH_DEVICE_BLUETOOTH_SETPRIV, unit, KAUTH_ARG(cmd), btr, NULL); if (err) break; unit->hci_link_policy = btr->btr_link_policy; unit->hci_link_policy &= unit->hci_lmp_mask; btr->btr_link_policy = unit->hci_link_policy; break; case SIOCSBTPTYPE: /* set unit packet types (privileged) */ err = kauth_authorize_device(kauth_cred_get(), KAUTH_DEVICE_BLUETOOTH_SETPRIV, unit, KAUTH_ARG(cmd), btr, NULL); if (err) break; unit->hci_packet_type = btr->btr_packet_type; unit->hci_packet_type &= unit->hci_acl_mask; btr->btr_packet_type = unit->hci_packet_type; break; case SIOCGBTSTATS: /* get unit statistics */ (*unit->hci_if->get_stats)(unit->hci_dev, &btr->btr_stats, 0); break; case SIOCZBTSTATS: /* get & reset unit statistics */ err = kauth_authorize_device(kauth_cred_get(), KAUTH_DEVICE_BLUETOOTH_SETPRIV, unit, KAUTH_ARG(cmd), btr, NULL); if (err) break; (*unit->hci_if->get_stats)(unit->hci_dev, &btr->btr_stats, 1); break; case SIOCSBTSCOMTU: /* set sco_mtu value for unit */ /* * This is a temporary ioctl and may not be supported * in the future. The need is that if SCO packets are * sent to USB bluetooth controllers that are not an * integer number of frame sizes, the USB bus locks up. 
*/ err = kauth_authorize_device(kauth_cred_get(), KAUTH_DEVICE_BLUETOOTH_SETPRIV, unit, KAUTH_ARG(cmd), btr, NULL); if (err) break; unit->hci_max_sco_size = btr->btr_sco_mtu; break; case SIOCGBTFEAT: /* get unit features */ memset(btr, 0, sizeof(struct btreq)); strlcpy(btr->btr_name, device_xname(unit->hci_dev), HCI_DEVNAME_SIZE); memcpy(btr->btr_features0, unit->hci_feat0, HCI_FEATURES_SIZE); memcpy(btr->btr_features1, unit->hci_feat1, HCI_FEATURES_SIZE); memcpy(btr->btr_features2, unit->hci_feat2, HCI_FEATURES_SIZE); break; default: err = EFAULT; break; } return err; }
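/*
 * A minimal userland sketch of how the SIOCNBTINFO/SIOCGBTINFO handling in
 * hci_ioctl_pcb() above is normally driven: open a raw HCI socket and hand
 * it a struct btreq.  An empty btr_name asks for the first unit; feeding
 * the returned name back in asks for the next one, until ENXIO ends the
 * walk.  The AF_BLUETOOTH/BTPROTO_HCI constants and header paths are
 * assumed from the NetBSD Bluetooth stack; error handling is omitted.
 * Kept under #if 0 so it is never built here.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netbt/bluetooth.h>
#include <netbt/hci.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void
list_bt_units(void)
{
	struct btreq btr;
	int s;

	s = socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_HCI);

	/* empty name -> first unit; returned name -> next unit */
	memset(&btr, 0, sizeof(btr));
	while (ioctl(s, SIOCNBTINFO, &btr) == 0)
		printf("%s: flags 0x%4.4x\n", btr.btr_name, btr.btr_flags);

	close(s);
}
#endif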
/* $NetBSD: vfs_syscalls.c,v 1.561
2023/09/09 18:34:44 ad Exp $ */ /*- * Copyright (c) 2008, 2009, 2019, 2020, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95 */ /* * Virtual File System System Calls */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.561 2023/09/09 18:34:44 ad Exp $"); #ifdef _KERNEL_OPT #include "opt_fileassoc.h" #include "veriexec.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/fcntl.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/fstrans.h> #include <sys/proc.h> #include <sys/uio.h> #include <sys/kmem.h> #include <sys/dirent.h> #include <sys/sysctl.h> #include <sys/syscallargs.h> #include <sys/vfs_syscalls.h> #include <sys/quota.h> #include <sys/quotactl.h> #include <sys/ktrace.h> #ifdef FILEASSOC #include <sys/fileassoc.h> #endif /* FILEASSOC */ #include <sys/extattr.h> #include <sys/verified_exec.h> #include <sys/kauth.h> #include <sys/atomic.h> #include <sys/module.h> #include <sys/buf.h> #include <sys/event.h> #include <sys/compat_stub.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <nfs/rpcv2.h> #include <nfs/nfsproto.h> #include <nfs/nfs.h> #include <nfs/nfs_var.h> /* XXX this shouldn't be here */ #ifndef OFF_T_MAX #define OFF_T_MAX __type_max(off_t) #endif static int change_flags(struct vnode *, u_long, struct lwp *); static int change_mode(struct vnode *, int, struct lwp *); static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int); static int do_sys_openat(lwp_t *, int, const char *, int, int, int *); static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t, enum uio_seg); static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t); static int do_sys_symlinkat(struct lwp *, const char *, int, const char *, enum uio_seg); static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *, enum uio_seg, int); static int do_sys_readlinkat(struct lwp *, int, const char *, char *, size_t, register_t *); static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg); static int fd_nameiat(struct lwp *, int, struct nameidata *); static int fd_nameiat_simple_user(struct lwp *, int, const char *, namei_simple_flags_t, struct vnode **); /* * This table is used to maintain compatibility with 4.3BSD * and NetBSD 0.9 mount syscalls - and possibly other systems. * Note, the order is important! * * Do not modify this table. It should only contain filesystems * supported by NetBSD 0.9 and 4.3BSD. */ const char * const mountcompatnames[] = { NULL, /* 0 = MOUNT_NONE */ MOUNT_FFS, /* 1 = MOUNT_UFS */ MOUNT_NFS, /* 2 */ MOUNT_MFS, /* 3 */ MOUNT_MSDOS, /* 4 */ MOUNT_CD9660, /* 5 = MOUNT_ISOFS */ MOUNT_FDESC, /* 6 */ MOUNT_KERNFS, /* 7 */ NULL, /* 8 = MOUNT_DEVFS */ MOUNT_AFS, /* 9 */ }; const u_int nmountcompatnames = __arraycount(mountcompatnames); /* * Filter event method for EVFILT_FS. 
*/ static struct klist fs_klist; static kmutex_t fs_klist_lock; CTASSERT((NOTE_SUBMIT & VQ_MOUNT) == 0); CTASSERT((NOTE_SUBMIT & VQ_UNMOUNT) == 0); void vfs_evfilt_fs_init(void) { klist_init(&fs_klist); mutex_init(&fs_klist_lock, MUTEX_DEFAULT, IPL_NONE); } static int filt_fsattach(struct knote *kn) { mutex_enter(&fs_klist_lock); kn->kn_flags |= EV_CLEAR; klist_insert(&fs_klist, kn); mutex_exit(&fs_klist_lock); return 0; } static void filt_fsdetach(struct knote *kn) { mutex_enter(&fs_klist_lock); klist_remove(&fs_klist, kn); mutex_exit(&fs_klist_lock); } static int filt_fs(struct knote *kn, long hint) { int rv; if (hint & NOTE_SUBMIT) { KASSERT(mutex_owned(&fs_klist_lock)); kn->kn_fflags |= hint & ~NOTE_SUBMIT; } else { mutex_enter(&fs_klist_lock); } rv = (kn->kn_fflags != 0); if ((hint & NOTE_SUBMIT) == 0) { mutex_exit(&fs_klist_lock); } return rv; } /* referenced in kern_event.c */ const struct filterops fs_filtops = { .f_flags = FILTEROP_MPSAFE, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fs, }; static int fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp) { file_t *dfp; int error; if (fdat != AT_FDCWD) { if ((error = fd_getvnode(fdat, &dfp)) != 0) goto out; NDAT(ndp, dfp->f_vnode); } error = namei(ndp); if (fdat != AT_FDCWD) fd_putfile(fdat); out: return error; } static int fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path, namei_simple_flags_t sflags, struct vnode **vp_ret) { file_t *dfp; struct vnode *dvp; int error; if (fdat != AT_FDCWD) { if ((error = fd_getvnode(fdat, &dfp)) != 0) goto out; dvp = dfp->f_vnode; } else { dvp = NULL; } error = nameiat_simple_user(dvp, path, sflags, vp_ret); if (fdat != AT_FDCWD) fd_putfile(fdat); out: return error; } static int open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags) { int error; fp->f_flag = flags & FMASK; fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_vnode = vp; if (flags & (O_EXLOCK | O_SHLOCK)) { struct flock lf; int type; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (flags & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((flags & FNONBLOCK) == 0) type |= F_WAIT; VOP_UNLOCK(vp); error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type); if (error) { (void) vn_close(vp, fp->f_flag, fp->f_cred); fd_abort(l->l_proc, fp, indx); return error; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); atomic_or_uint(&fp->f_flag, FHASLOCK); } if (flags & O_CLOEXEC) fd_set_exclose(l, indx, true); return 0; } static int mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags, void *data, size_t *data_len) { struct mount *mp; int error = 0, saved_flags; mp = vp->v_mount; saved_flags = mp->mnt_flag; /* We can operate only on VV_ROOT nodes. */ if ((vp->v_vflag & VV_ROOT) == 0) { error = EINVAL; goto out; } /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. Additionally, we * prevent read-write to read-only downgrades. */ if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 && (mp->mnt_flag & MNT_RDONLY) == 0 && (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) { error = EOPNOTSUPP; /* Needs translation */ goto out; } /* * Enabling MNT_UNION requires a covered mountpoint and * must not happen on the root mount. 
*/ if ((flags & MNT_UNION) != 0 && mp->mnt_vnodecovered == NULLVP) { error = EOPNOTSUPP; goto out; } error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data); if (error) goto out; error = vfs_suspend(mp, 0); if (error) goto out; mutex_enter(mp->mnt_updating); mp->mnt_flag &= ~MNT_OP_FLAGS; mp->mnt_flag |= flags & MNT_OP_FLAGS; /* * Set the mount level flags. */ if ((flags & MNT_RDONLY) != (mp->mnt_flag & MNT_RDONLY)) { if ((flags & MNT_RDONLY)) mp->mnt_iflag |= IMNT_WANTRDONLY; else mp->mnt_iflag |= IMNT_WANTRDWR; } mp->mnt_flag &= ~MNT_BASIC_FLAGS; mp->mnt_flag |= flags & MNT_BASIC_FLAGS; if ((mp->mnt_iflag & IMNT_WANTRDONLY)) mp->mnt_flag &= ~MNT_RDONLY; error = VFS_MOUNT(mp, path, data, data_len); if (error && data != NULL) { int error2; /* * Update failed; let's try and see if it was an * export request. For compat with 3.0 and earlier. */ error2 = vfs_hooks_reexport(mp, path, data); /* * Only update error code if the export request was * understood but some problem occurred while * processing it. */ if (error2 != EJUSTRETURN) error = error2; } if (error == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY)) mp->mnt_flag |= MNT_RDONLY; if (error) mp->mnt_flag = saved_flags; mp->mnt_flag &= ~MNT_OP_FLAGS; mp->mnt_iflag &= ~(IMNT_WANTRDONLY | IMNT_WANTRDWR); if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) { if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0) vfs_syncer_add_to_worklist(mp); } else { if ((mp->mnt_iflag & IMNT_ONWORKLIST) != 0) vfs_syncer_remove_from_worklist(mp); } mutex_exit(mp->mnt_updating); vfs_resume(mp); if ((error == 0) && !(saved_flags & MNT_EXTATTR) && (flags & MNT_EXTATTR)) { if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, NULL, 0, NULL) != 0) { printf("%s: failed to start extattr, error = %d", mp->mnt_stat.f_mntonname, error); mp->mnt_flag &= ~MNT_EXTATTR; } } if ((error == 0) && (saved_flags & MNT_EXTATTR) && !(flags & MNT_EXTATTR)) { if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP, NULL, 0, NULL) != 0) { printf("%s: failed to stop extattr, error = %d", mp->mnt_stat.f_mntonname, error); mp->mnt_flag |= MNT_RDONLY; } } out: return (error); } static int mount_get_vfsops(const char *fstype, enum uio_seg type_seg, struct vfsops **vfsops) { char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)]; int error; if (type_seg == UIO_USERSPACE) { /* Copy file-system type from userspace. */ error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL); } else { error = copystr(fstype, fstypename, sizeof(fstypename), NULL); KASSERT(error == 0); } if (error) { /* * Historically, filesystem types were identified by numbers. * If we get an integer for the filesystem type instead of a * string, we check to see if it matches one of the historic * filesystem types. */ u_long fsindex = (u_long)fstype; if (fsindex >= nmountcompatnames || mountcompatnames[fsindex] == NULL) return ENODEV; strlcpy(fstypename, mountcompatnames[fsindex], sizeof(fstypename)); } /* Accept `ufs' as an alias for `ffs', for compatibility. */ if (strcmp(fstypename, "ufs") == 0) fstypename[0] = 'f'; if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL) return 0; /* If we can autoload a vfs module, try again */ (void)module_autoload(fstypename, MODULE_CLASS_VFS); if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL) return 0; return ENODEV; } static int mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags, void *data, size_t *data_len) { struct mount *mp; int error; /* If MNT_GETARGS is specified, it should be the only flag. 
*/ if (flags & ~MNT_GETARGS) return EINVAL; mp = vp->v_mount; /* XXX: probably some notion of "can see" here if we want isolation. */ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL); if (error) return error; if ((vp->v_vflag & VV_ROOT) == 0) return EINVAL; if (vfs_busy(mp)) return EPERM; mutex_enter(mp->mnt_updating); mp->mnt_flag &= ~MNT_OP_FLAGS; mp->mnt_flag |= MNT_GETARGS; error = VFS_MOUNT(mp, path, data, data_len); mp->mnt_flag &= ~MNT_OP_FLAGS; mutex_exit(mp->mnt_updating); vfs_unbusy(mp); return (error); } int sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval) { /* { syscallarg(const char *) type; syscallarg(const char *) path; syscallarg(int) flags; syscallarg(void *) data; syscallarg(size_t) data_len; } */ return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path), SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE, SCARG(uap, data_len), retval); } int do_sys_mount(struct lwp *l, const char *type, enum uio_seg type_seg, const char *path, int flags, void *data, enum uio_seg data_seg, size_t data_len, register_t *retval) { struct vfsops *vfsops = NULL; /* XXX gcc4.8 */ struct vnode *vp; void *data_buf = data; bool vfsopsrele = false; size_t alloc_sz = 0; int error; /* * Get vnode to be covered */ error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp); if (error != 0) { vp = NULL; goto done; } if (flags & (MNT_GETARGS | MNT_UPDATE)) { vfsops = vp->v_mount->mnt_op; } else { /* 'type' is userspace */ error = mount_get_vfsops(type, type_seg, &vfsops); if (error != 0) goto done; vfsopsrele = true; } /* * We allow data to be NULL, even for userspace. Some fs's don't need * it. The others will handle NULL. */ if (data != NULL && data_seg == UIO_USERSPACE) { if (data_len == 0) { /* No length supplied, use default for filesystem */ data_len = vfsops->vfs_min_mount_data; /* * Hopefully a longer buffer won't make copyin() fail. * For compatibility with 3.0 and earlier. */ if (flags & MNT_UPDATE && data_len < sizeof (struct mnt_export_args30)) data_len = sizeof (struct mnt_export_args30); } if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) { error = EINVAL; goto done; } alloc_sz = data_len; data_buf = kmem_alloc(alloc_sz, KM_SLEEP); /* NFS needs the buffer even for mnt_getargs .... */ error = copyin(data, data_buf, data_len); if (error != 0) goto done; } if (flags & MNT_GETARGS) { if (data_len == 0) { error = EINVAL; goto done; } error = mount_getargs(l, vp, path, flags, data_buf, &data_len); if (error != 0) goto done; if (data_seg == UIO_USERSPACE) error = copyout(data_buf, data, data_len); *retval = data_len; } else if (flags & MNT_UPDATE) { error = mount_update(l, vp, path, flags, data_buf, &data_len); } else { /* Locking is handled internally in mount_domount(). */ KASSERT(vfsopsrele == true); error = mount_domount(l, &vp, vfsops, path, flags, data_buf, &data_len); vfsopsrele = false; } if (!error) { mutex_enter(&fs_klist_lock); KNOTE(&fs_klist, NOTE_SUBMIT | VQ_MOUNT); mutex_exit(&fs_klist_lock); } done: if (vfsopsrele) vfs_delref(vfsops); if (vp != NULL) { vrele(vp); } if (data_buf != data) kmem_free(data_buf, alloc_sz); return (error); } /* * Unmount a file system. * * Note: unmount takes a path to the vnode mounted on as argument, * not special file (as before). 
*/ /* ARGSUSED */ int sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) flags; } */ struct vnode *vp; struct mount *mp; int error; struct pathbuf *pb; struct nameidata nd; error = pathbuf_copyin(SCARG(uap, path), &pb); if (error) { return error; } NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb); if ((error = namei(&nd)) != 0) { pathbuf_destroy(pb); return error; } vp = nd.ni_vp; pathbuf_destroy(pb); mp = vp->v_mount; vfs_ref(mp); VOP_UNLOCK(vp); error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL); if (error) { vrele(vp); vfs_rele(mp); return (error); } /* * Don't allow unmounting the root file system. */ if (mp->mnt_flag & MNT_ROOTFS) { vrele(vp); vfs_rele(mp); return (EINVAL); } /* * Must be the root of the filesystem */ if ((vp->v_vflag & VV_ROOT) == 0) { vrele(vp); vfs_rele(mp); return (EINVAL); } vrele(vp); error = dounmount(mp, SCARG(uap, flags), l); vfs_rele(mp); if (!error) { mutex_enter(&fs_klist_lock); KNOTE(&fs_klist, NOTE_SUBMIT | VQ_UNMOUNT); mutex_exit(&fs_klist_lock); } return error; } /* * Sync each mounted filesystem. */ #ifdef DEBUG int syncprt = 0; struct ctldebug debug0 = { "syncprt", &syncprt }; #endif void do_sys_sync(struct lwp *l) { mount_iterator_t *iter; struct mount *mp; int asyncflag; mountlist_iterator_init(&iter); while ((mp = mountlist_iterator_next(iter)) != NULL) { mutex_enter(mp->mnt_updating); if ((mp->mnt_flag & MNT_RDONLY) == 0) { asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; VFS_SYNC(mp, MNT_NOWAIT, l->l_cred); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; } mutex_exit(mp->mnt_updating); } mountlist_iterator_destroy(iter); #ifdef DEBUG if (syncprt) vfs_bufstats(); #endif /* DEBUG */ } static bool sync_vnode_filter(void *cookie, vnode_t *vp) { if (vp->v_numoutput > 0) { ++*(int *)cookie; } return false; } int vfs_syncwait(void) { int nbusy, nbusy_prev, iter; struct vnode_iterator *vniter; mount_iterator_t *mpiter; struct mount *mp; for (nbusy_prev = 0, iter = 0; iter < 20;) { nbusy = 0; mountlist_iterator_init(&mpiter); while ((mp = mountlist_iterator_next(mpiter)) != NULL) { vnode_t *vp __diagused; vfs_vnode_iterator_init(mp, &vniter); vp = vfs_vnode_iterator_next(vniter, sync_vnode_filter, &nbusy); KASSERT(vp == NULL); vfs_vnode_iterator_destroy(vniter); } mountlist_iterator_destroy(mpiter); if (nbusy == 0) break; if (nbusy_prev == 0) nbusy_prev = nbusy; printf("%d ", nbusy); kpause("syncwait", false, MAX(1, hz / 25 * iter), NULL); if (nbusy >= nbusy_prev) /* we didn't flush anything */ iter++; else nbusy_prev = nbusy; } if (nbusy) { #if defined(DEBUG) || defined(DEBUG_HALT_BUSY) printf("giving up\nPrinting vnodes for busy buffers\n"); mountlist_iterator_init(&mpiter); while ((mp = mountlist_iterator_next(mpiter)) != NULL) { vnode_t *vp; vfs_vnode_iterator_init(mp, &vniter); vp = vfs_vnode_iterator_next(vniter, NULL, NULL); mutex_enter(vp->v_interlock); if (vp->v_numoutput > 0) vprint(NULL, vp); mutex_exit(vp->v_interlock); vrele(vp); vfs_vnode_iterator_destroy(vniter); } mountlist_iterator_destroy(mpiter); #endif } return nbusy; } /* ARGSUSED */ int sys_sync(struct lwp *l, const void *v, register_t *retval) { do_sys_sync(l); return (0); } /* * Access or change filesystem quotas. 
* * (this is really 14 different calls bundled into one) */ static int do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u) { struct quotastat info_k; int error; /* ensure any padding bytes are cleared */ memset(&info_k, 0, sizeof(info_k)); error = vfs_quotactl_stat(mp, &info_k); if (error) { return error; } return copyout(&info_k, info_u, sizeof(info_k)); } static int do_sys_quotactl_idtypestat(struct mount *mp, int idtype, struct quotaidtypestat *info_u) { struct quotaidtypestat info_k; int error; /* ensure any padding bytes are cleared */ memset(&info_k, 0, sizeof(info_k)); error = vfs_quotactl_idtypestat(mp, idtype, &info_k); if (error) { return error; } return copyout(&info_k, info_u, sizeof(info_k)); } static int do_sys_quotactl_objtypestat(struct mount *mp, int objtype, struct quotaobjtypestat *info_u) { struct quotaobjtypestat info_k; int error; /* ensure any padding bytes are cleared */ memset(&info_k, 0, sizeof(info_k)); error = vfs_quotactl_objtypestat(mp, objtype, &info_k); if (error) { return error; } return copyout(&info_k, info_u, sizeof(info_k)); } static int do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u, struct quotaval *val_u) { struct quotakey key_k; struct quotaval val_k; int error; /* ensure any padding bytes are cleared */ memset(&val_k, 0, sizeof(val_k)); error = copyin(key_u, &key_k, sizeof(key_k)); if (error) { return error; } error = vfs_quotactl_get(mp, &key_k, &val_k); if (error) { return error; } return copyout(&val_k, val_u, sizeof(val_k)); } static int do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u, const struct quotaval *val_u) { struct quotakey key_k; struct quotaval val_k; int error; error = copyin(key_u, &key_k, sizeof(key_k)); if (error) { return error; } error = copyin(val_u, &val_k, sizeof(val_k)); if (error) { return error; } return vfs_quotactl_put(mp, &key_k, &val_k); } static int do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u) { struct quotakey key_k; int error; error = copyin(key_u, &key_k, sizeof(key_k)); if (error) { return error; } return vfs_quotactl_del(mp, &key_k); } static int do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u) { struct quotakcursor cursor_k; int error; /* ensure any padding bytes are cleared */ memset(&cursor_k, 0, sizeof(cursor_k)); error = vfs_quotactl_cursoropen(mp, &cursor_k); if (error) { return error; } return copyout(&cursor_k, cursor_u, sizeof(cursor_k)); } static int do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u) { struct quotakcursor cursor_k; int error; error = copyin(cursor_u, &cursor_k, sizeof(cursor_k)); if (error) { return error; } return vfs_quotactl_cursorclose(mp, &cursor_k); } static int do_sys_quotactl_cursorskipidtype(struct mount *mp, struct quotakcursor *cursor_u, int idtype) { struct quotakcursor cursor_k; int error; error = copyin(cursor_u, &cursor_k, sizeof(cursor_k)); if (error) { return error; } error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype); if (error) { return error; } return copyout(&cursor_k, cursor_u, sizeof(cursor_k)); } static int do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u, struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum, unsigned *ret_u) { #define CGET_STACK_MAX 8 struct quotakcursor cursor_k; struct quotakey stackkeys[CGET_STACK_MAX]; struct quotaval stackvals[CGET_STACK_MAX]; struct quotakey *keys_k; struct quotaval *vals_k; unsigned ret_k; int error; if (maxnum > 128) { maxnum = 128; } error = 
copyin(cursor_u, &cursor_k, sizeof(cursor_k)); if (error) { return error; } if (maxnum <= CGET_STACK_MAX) { keys_k = stackkeys; vals_k = stackvals; /* ensure any padding bytes are cleared */ memset(keys_k, 0, maxnum * sizeof(keys_k[0])); memset(vals_k, 0, maxnum * sizeof(vals_k[0])); } else { keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP); vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP); } error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum, &ret_k); if (error) { goto fail; } error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0])); if (error) { goto fail; } error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0])); if (error) { goto fail; } error = copyout(&ret_k, ret_u, sizeof(ret_k)); if (error) { goto fail; } /* do last to maximize the chance of being able to recover a failure */ error = copyout(&cursor_k, cursor_u, sizeof(cursor_k)); fail: if (keys_k != stackkeys) { kmem_free(keys_k, maxnum * sizeof(keys_k[0])); } if (vals_k != stackvals) { kmem_free(vals_k, maxnum * sizeof(vals_k[0])); } return error; } static int do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u, int *ret_u) { struct quotakcursor cursor_k; int ret_k; int error; error = copyin(cursor_u, &cursor_k, sizeof(cursor_k)); if (error) { return error; } error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k); if (error) { return error; } error = copyout(&ret_k, ret_u, sizeof(ret_k)); if (error) { return error; } return copyout(&cursor_k, cursor_u, sizeof(cursor_k)); } static int do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u) { struct quotakcursor cursor_k; int error; error = copyin(cursor_u, &cursor_k, sizeof(cursor_k)); if (error) { return error; } error = vfs_quotactl_cursorrewind(mp, &cursor_k); if (error) { return error; } return copyout(&cursor_k, cursor_u, sizeof(cursor_k)); } static int do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u) { char *path_k; int error; /* XXX this should probably be a struct pathbuf */ path_k = PNBUF_GET(); error = copyin(path_u, path_k, PATH_MAX); if (error) { PNBUF_PUT(path_k); return error; } error = vfs_quotactl_quotaon(mp, idtype, path_k); PNBUF_PUT(path_k); return error; } static int do_sys_quotactl_quotaoff(struct mount *mp, int idtype) { return vfs_quotactl_quotaoff(mp, idtype); } int do_sys_quotactl(const char *path_u, const struct quotactl_args *args) { struct mount *mp; struct vnode *vp; int error; error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp); if (error != 0) return (error); mp = vp->v_mount; switch (args->qc_op) { case QUOTACTL_STAT: error = do_sys_quotactl_stat(mp, args->u.stat.qc_info); break; case QUOTACTL_IDTYPESTAT: error = do_sys_quotactl_idtypestat(mp, args->u.idtypestat.qc_idtype, args->u.idtypestat.qc_info); break; case QUOTACTL_OBJTYPESTAT: error = do_sys_quotactl_objtypestat(mp, args->u.objtypestat.qc_objtype, args->u.objtypestat.qc_info); break; case QUOTACTL_GET: error = do_sys_quotactl_get(mp, args->u.get.qc_key, args->u.get.qc_val); break; case QUOTACTL_PUT: error = do_sys_quotactl_put(mp, args->u.put.qc_key, args->u.put.qc_val); break; case QUOTACTL_DEL: error = do_sys_quotactl_del(mp, args->u.del.qc_key); break; case QUOTACTL_CURSOROPEN: error = do_sys_quotactl_cursoropen(mp, args->u.cursoropen.qc_cursor); break; case QUOTACTL_CURSORCLOSE: error = do_sys_quotactl_cursorclose(mp, args->u.cursorclose.qc_cursor); break; case QUOTACTL_CURSORSKIPIDTYPE: error = do_sys_quotactl_cursorskipidtype(mp, 
args->u.cursorskipidtype.qc_cursor, args->u.cursorskipidtype.qc_idtype); break; case QUOTACTL_CURSORGET: error = do_sys_quotactl_cursorget(mp, args->u.cursorget.qc_cursor, args->u.cursorget.qc_keys, args->u.cursorget.qc_vals, args->u.cursorget.qc_maxnum, args->u.cursorget.qc_ret); break; case QUOTACTL_CURSORATEND: error = do_sys_quotactl_cursoratend(mp, args->u.cursoratend.qc_cursor, args->u.cursoratend.qc_ret); break; case QUOTACTL_CURSORREWIND: error = do_sys_quotactl_cursorrewind(mp, args->u.cursorrewind.qc_cursor); break; case QUOTACTL_QUOTAON: error = do_sys_quotactl_quotaon(mp, args->u.quotaon.qc_idtype, args->u.quotaon.qc_quotafile); break; case QUOTACTL_QUOTAOFF: error = do_sys_quotactl_quotaoff(mp, args->u.quotaoff.qc_idtype); break; default: error = EINVAL; break; } vrele(vp); return error; } /* ARGSUSED */ int sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(struct quotactl_args *) args; } */ struct quotactl_args args; int error; error = copyin(SCARG(uap, args), &args, sizeof(args)); if (error) { return error; } return do_sys_quotactl(SCARG(uap, path), &args); } int dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags, int root) { struct cwdinfo *cwdi = l->l_proc->p_cwdi; bool chrooted; int error = 0; KASSERT(l == curlwp); /* * This is safe unlocked. cwdi_rdir never goes non-NULL -> NULL, * since it would imply chroots can be escaped. Just make sure this * routine is self-consistent. */ chrooted = (atomic_load_relaxed(&cwdi->cwdi_rdir) != NULL); /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. MNT_WAIT or MNT_LAZY * overrides MNT_NOWAIT. */ if (flags == MNT_NOWAIT || flags == MNT_LAZY || (flags != MNT_WAIT && flags != 0)) { memcpy(sp, &mp->mnt_stat, sizeof(*sp)); } else { /* Get the filesystem stats now */ memset(sp, 0, sizeof(*sp)); if ((error = VFS_STATVFS(mp, sp)) != 0) return error; if (!chrooted) (void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat)); } if (chrooted) { size_t len; char *bp; char c; char *path = PNBUF_GET(); bp = path + MAXPATHLEN; *--bp = '\0'; rw_enter(&cwdi->cwdi_lock, RW_READER); error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path, MAXPATHLEN / 2, 0, l); rw_exit(&cwdi->cwdi_lock); if (error) { PNBUF_PUT(path); return error; } len = strlen(bp); if (len != 1) { /* * for mount points that are below our root, we can see * them, so we fix up the pathname and return them. The * rest we cannot see, so we don't allow viewing the * data. */ if (strncmp(bp, sp->f_mntonname, len) == 0 && ((c = sp->f_mntonname[len]) == '/' || c == '\0')) { (void)strlcpy(sp->f_mntonname, c == '\0' ? "/" : &sp->f_mntonname[len], sizeof(sp->f_mntonname)); } else { if (root) (void)strlcpy(sp->f_mntonname, "/", sizeof(sp->f_mntonname)); else error = EPERM; } } PNBUF_PUT(path); } sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK; return error; } /* * Get filesystem statistics by path. 
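 *
 * Illustrative userland counterpart (statvfs(2) ends up in
 * sys___statvfs190() below):
 *
 *	struct statvfs sv;
 *	if (statvfs("/usr", &sv) == 0)
 *		printf("%s on %s: %ju blocks free\n", sv.f_mntfromname,
 *		    sv.f_mntonname, (uintmax_t)sv.f_bfree);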
*/ int do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb) { struct mount *mp; int error; struct vnode *vp; error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp); if (error != 0) return error; mp = vp->v_mount; error = dostatvfs(mp, sb, l, flags, 1); vrele(vp); return error; } /* ARGSUSED */ int sys___statvfs190(struct lwp *l, const struct sys___statvfs190_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(struct statvfs *) buf; syscallarg(int) flags; } */ struct statvfs *sb; int error; sb = STATVFSBUF_GET(); error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb); if (error == 0) error = copyout(sb, SCARG(uap, buf), sizeof(*sb)); STATVFSBUF_PUT(sb); return error; } /* * Get filesystem statistics by fd. */ int do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb) { file_t *fp; struct mount *mp; int error; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(fd, &fp)) != 0) return (error); mp = fp->f_vnode->v_mount; error = dostatvfs(mp, sb, curlwp, flags, 1); fd_putfile(fd); return error; } /* ARGSUSED */ int sys___fstatvfs190(struct lwp *l, const struct sys___fstatvfs190_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(struct statvfs *) buf; syscallarg(int) flags; } */ struct statvfs *sb; int error; sb = STATVFSBUF_GET(); error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb); if (error == 0) error = copyout(sb, SCARG(uap, buf), sizeof(*sb)); STATVFSBUF_PUT(sb); return error; } /* * Get statistics on all filesystems. */ int do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags, int (*copyfn)(const void *, void *, size_t), size_t entry_sz, register_t *retval) { int root = 0; mount_iterator_t *iter; struct proc *p = l->l_proc; struct mount *mp; struct statvfs *sb; size_t count, maxcount; int error = 0; sb = STATVFSBUF_GET(); maxcount = bufsize / entry_sz; count = 0; mountlist_iterator_init(&iter); while ((mp = mountlist_iterator_next(iter)) != NULL) { if (sfsp && count < maxcount) { error = dostatvfs(mp, sb, l, flags, 0); if (error) { error = 0; continue; } error = copyfn(sb, sfsp, entry_sz); if (error) goto out; sfsp = (char *)sfsp + entry_sz; root |= strcmp(sb->f_mntonname, "/") == 0; } count++; } if (root == 0 && p->p_cwdi->cwdi_rdir) { /* * fake a root entry */ error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount, sb, l, flags, 1); if (error != 0) goto out; if (sfsp) { error = copyfn(sb, sfsp, entry_sz); if (error != 0) goto out; } count++; } if (sfsp && count > maxcount) *retval = maxcount; else *retval = count; out: mountlist_iterator_destroy(iter); STATVFSBUF_PUT(sb); return error; } int sys___getvfsstat90(struct lwp *l, const struct sys___getvfsstat90_args *uap, register_t *retval) { /* { syscallarg(struct statvfs *) buf; syscallarg(size_t) bufsize; syscallarg(int) flags; } */ return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize), SCARG(uap, flags), copyout, sizeof (struct statvfs), retval); } /* * Change current working directory to a given file descriptor. 
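 *
 * Illustrative userland use; the descriptor must refer to a directory
 * the caller may search, and any mounts stacked on it are crossed to
 * the covering filesystem's root:
 *
 *	int fd = open("/var/log", O_RDONLY);
 *	if (fd != -1 && fchdir(fd) == 0)
 *		printf("cwd is now /var/log\n");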
*/ int do_sys_fchdir(struct lwp *l, int fd, register_t *retval) { struct proc *p = l->l_proc; struct cwdinfo *cwdi; struct vnode *vp, *tdp; struct mount *mp; file_t *fp; int error; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(fd, &fp)) != 0) return error; vp = fp->f_vnode; vref(vp); vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, l->l_cred); if (error) { vput(vp); goto out; } while ((mp = vp->v_mountedhere) != NULL) { error = vfs_busy(mp); vput(vp); if (error != 0) goto out; error = VFS_ROOT(mp, LK_SHARED, &tdp); vfs_unbusy(mp); if (error) goto out; vp = tdp; } VOP_UNLOCK(vp); /* * Disallow changing to a directory not under the process's * current root directory (if there is one). */ cwdi = p->p_cwdi; rw_enter(&cwdi->cwdi_lock, RW_WRITER); if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) { vrele(vp); error = EPERM; /* operation not permitted */ } else { vrele(cwdi->cwdi_cdir); cwdi->cwdi_cdir = vp; } rw_exit(&cwdi->cwdi_lock); out: fd_putfile(fd); return error; } /* * Change current working directory to a given file descriptor. */ /* ARGSUSED */ int sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval) { /* { syscallarg(int) fd; } */ return do_sys_fchdir(l, SCARG(uap, fd), retval); } /* * Change this process's notion of the root directory to a given file * descriptor. */ int sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval) { struct vnode *vp; file_t *fp; int error, fd = SCARG(uap, fd); if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT, KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0) return error; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(fd, &fp)) != 0) return error; vp = fp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, l->l_cred); VOP_UNLOCK(vp); if (error) goto out; vref(vp); change_root(vp); out: fd_putfile(fd); return (error); } /* * Change current working directory (``.''). */ int do_sys_chdir(struct lwp *l, const char *path, enum uio_seg seg, register_t *retval) { struct proc *p = l->l_proc; struct cwdinfo * cwdi; int error; struct vnode *vp; if ((error = chdir_lookup(path, seg, &vp, l)) != 0) return error; cwdi = p->p_cwdi; rw_enter(&cwdi->cwdi_lock, RW_WRITER); vrele(cwdi->cwdi_cdir); cwdi->cwdi_cdir = vp; rw_exit(&cwdi->cwdi_lock); return 0; } /* * Change current working directory (``.''). */ /* ARGSUSED */ int sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval) { /* { syscallarg(const char *) path; } */ return do_sys_chdir(l, SCARG(uap, path), UIO_USERSPACE, retval); } /* * Change notion of root (``/'') directory. */ /* ARGSUSED */ int sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval) { /* { syscallarg(const char *) path; } */ int error; struct vnode *vp; if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT, KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0) return (error); error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE, &vp, l); if (error == 0) change_root(vp); return error; } /* * Common routine for chroot and fchroot. * NB: callers need to properly authorize the change root operation. 
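 *
 * The vnode passed in must already hold a reference; change_root()
 * takes over that reference when it installs vp as cwdi_rdir (see the
 * vref()/chdir_lookup() calls in sys_fchroot() and sys_chroot() above).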
*/ void change_root(struct vnode *vp) { kauth_cred_t ncred; struct lwp *l = curlwp; struct proc *p = l->l_proc; struct cwdinfo *cwdi = p->p_cwdi; ncred = kauth_cred_alloc(); rw_enter(&cwdi->cwdi_lock, RW_WRITER); if (cwdi->cwdi_rdir != NULL) vrele(cwdi->cwdi_rdir); cwdi->cwdi_rdir = vp; /* * Prevent escaping from chroot by putting the root under * the working directory. Silently chdir to / if we aren't * already there. */ if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) { /* * XXX would be more failsafe to change directory to a * deadfs node here instead */ vrele(cwdi->cwdi_cdir); vref(vp); cwdi->cwdi_cdir = vp; } rw_exit(&cwdi->cwdi_lock); /* Get a write lock on the process credential. */ proc_crmod_enter(); kauth_cred_clone(p->p_cred, ncred); kauth_proc_chroot(ncred, p->p_cwdi); /* Broadcast our credentials to the process and other LWPs. */ proc_crmod_leave(ncred, p->p_cred, true); } /* * Common routine for chroot and chdir. * XXX "where" should be enum uio_seg */ int chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l) { struct pathbuf *pb; struct nameidata nd; int error; error = pathbuf_maybe_copyin(path, where, &pb); if (error) { return error; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb); if ((error = namei(&nd)) != 0) { pathbuf_destroy(pb); return error; } *vpp = nd.ni_vp; pathbuf_destroy(pb); if ((*vpp)->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(*vpp, VEXEC, l->l_cred); if (error) vput(*vpp); else VOP_UNLOCK(*vpp); return (error); } /* * Internals of sys_open - path has already been converted into a pathbuf * (so we can easily reuse this function from other parts of the kernel, * like posix_spawn post-processing). */ int do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags, int open_mode, int *fd) { struct proc *p = l->l_proc; struct cwdinfo *cwdi = p->p_cwdi; file_t *fp; struct vnode *vp; int dupfd; bool dupfd_move; int flags, cmode; int indx, error; if (open_flags & O_SEARCH) { open_flags &= ~(int)O_SEARCH; } /* * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags * may be specified. */ if ((open_flags & O_EXEC) && (open_flags & O_ACCMODE)) return EINVAL; flags = FFLAGS(open_flags); if ((flags & (FREAD | FWRITE)) == 0) return EINVAL; if ((error = fd_allocfile(&fp, &indx)) != 0) { return error; } /* We're going to read cwdi->cwdi_cmask unlocked here. 
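 * For example, with the usual umask of 022, an open(..., O_CREAT, 0666)
 * yields a cmode of 0644; the sticky bit (S_ISTXT) is always stripped.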
*/ cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT; error = vn_open(dvp, pb, TRYEMULROOT, flags, cmode, &vp, &dupfd_move, &dupfd); if (error != 0) { fd_abort(p, fp, indx); return error; } if (vp == NULL) { fd_abort(p, fp, indx); error = fd_dupopen(dupfd, dupfd_move, flags, &indx); if (error) return error; *fd = indx; } else { error = open_setfp(l, fp, vp, indx, flags); if (error) return error; VOP_UNLOCK(vp); *fd = indx; fd_affix(p, fp, indx); } return 0; } int fd_open(const char *path, int open_flags, int open_mode, int *fd) { struct pathbuf *pb; int error, oflags; oflags = FFLAGS(open_flags); if ((oflags & (FREAD | FWRITE)) == 0) return EINVAL; pb = pathbuf_create(path); if (pb == NULL) return ENOMEM; error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd); pathbuf_destroy(pb); return error; } static int do_sys_openat(lwp_t *l, int fdat, const char *path, int flags, int mode, int *fd) { file_t *dfp = NULL; struct vnode *dvp = NULL; struct pathbuf *pb; const char *pathstring = NULL; int error; if (path == NULL) { MODULE_HOOK_CALL(vfs_openat_10_hook, (&pb), enosys(), error); if (error == ENOSYS) goto no_compat; if (error) return error; } else { no_compat: error = pathbuf_copyin(path, &pb); if (error) return error; } pathstring = pathbuf_stringcopy_get(pb); /* * fdat is ignored if: * 1) if fdat is AT_FDCWD, which means use current directory as base. * 2) if path is absolute, then fdat is useless. */ if (fdat != AT_FDCWD && pathstring[0] != '/') { /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(fdat, &dfp)) != 0) goto out; dvp = dfp->f_vnode; } error = do_open(l, dvp, pb, flags, mode, fd); if (dfp != NULL) fd_putfile(fdat); out: pathbuf_stringcopy_put(pb, pathstring); pathbuf_destroy(pb); return error; } int sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ int error; int fd; error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, flags), SCARG(uap, mode), &fd); if (error == 0) *retval = fd; return error; } int sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) path; syscallarg(int) oflags; syscallarg(int) mode; } */ int error; int fd; error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path), SCARG(uap, oflags), SCARG(uap, mode), &fd); if (error == 0) *retval = fd; return error; } static void vfs__fhfree(fhandle_t *fhp) { size_t fhsize; fhsize = FHANDLE_SIZE(fhp); kmem_free(fhp, fhsize); } /* * vfs_composefh: compose a filehandle. 
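 *
 * Callers may probe for the required size: with a too-small *fh_size
 * the routine typically fails with E2BIG but still writes the needed
 * size back through fh_size, so a second call with a large enough
 * buffer can succeed (vfs_composefh_alloc() below automates this).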
*/ int vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size) { struct mount *mp; struct fid *fidp; int error; size_t needfhsize; size_t fidsize; mp = vp->v_mount; fidp = NULL; if (*fh_size < FHANDLE_SIZE_MIN) { fidsize = 0; } else { fidsize = *fh_size - offsetof(fhandle_t, fh_fid); if (fhp != NULL) { memset(fhp, 0, *fh_size); fhp->fh_fsid = mp->mnt_stat.f_fsidx; fidp = &fhp->fh_fid; } } error = VFS_VPTOFH(vp, fidp, &fidsize); needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize); if (error == 0 && *fh_size < needfhsize) { error = E2BIG; } *fh_size = needfhsize; return error; } int vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp) { struct mount *mp; fhandle_t *fhp; size_t fhsize; size_t fidsize; int error; mp = vp->v_mount; fidsize = 0; error = VFS_VPTOFH(vp, NULL, &fidsize); KASSERT(error != 0); if (error != E2BIG) { goto out; } fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize); fhp = kmem_zalloc(fhsize, KM_SLEEP); fhp->fh_fsid = mp->mnt_stat.f_fsidx; error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize); if (error == 0) { KASSERT(FHANDLE_SIZE(fhp) == fhsize); KASSERT(FHANDLE_FILEID(fhp)->fid_len == fidsize); *fhpp = fhp; } else { kmem_free(fhp, fhsize); } out: return error; } void vfs_composefh_free(fhandle_t *fhp) { vfs__fhfree(fhp); } /* * vfs_fhtovp: lookup a vnode by a filehandle. */ int vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp) { struct mount *mp; int error; *vpp = NULL; mp = vfs_getvfs(FHANDLE_FSID(fhp)); if (mp == NULL) { error = ESTALE; goto out; } if (mp->mnt_op->vfs_fhtovp == NULL) { error = EOPNOTSUPP; goto out; } error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), LK_EXCLUSIVE, vpp); out: return error; } /* * vfs_copyinfh_alloc: allocate and copyin a filehandle, given * the needed size. */ int vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp) { fhandle_t *fhp; int error; if (fhsize > FHANDLE_SIZE_MAX) { return EINVAL; } if (fhsize < FHANDLE_SIZE_MIN) { return EINVAL; } again: fhp = kmem_alloc(fhsize, KM_SLEEP); error = copyin(ufhp, fhp, fhsize); if (error == 0) { /* XXX this check shouldn't be here */ if (FHANDLE_SIZE(fhp) == fhsize) { *fhpp = fhp; return 0; } else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) { /* * a kludge for nfsv2 padded handles. */ size_t sz; sz = FHANDLE_SIZE(fhp); kmem_free(fhp, fhsize); fhsize = sz; goto again; } else { /* * userland told us wrong size. 
*/ error = EINVAL; } } kmem_free(fhp, fhsize); return error; } void vfs_copyinfh_free(fhandle_t *fhp) { vfs__fhfree(fhp); } /* * Get file handle system call */ int sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval) { /* { syscallarg(char *) fname; syscallarg(fhandle_t *) fhp; syscallarg(size_t *) fh_size; } */ struct vnode *vp; fhandle_t *fh; int error; struct pathbuf *pb; struct nameidata nd; size_t sz; size_t usz; /* * Must be super user */ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL); if (error) return (error); error = pathbuf_copyin(SCARG(uap, fname), &pb); if (error) { return error; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); error = namei(&nd); if (error) { pathbuf_destroy(pb); return error; } vp = nd.ni_vp; pathbuf_destroy(pb); error = vfs_composefh_alloc(vp, &fh); vput(vp); if (error != 0) { return error; } error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t)); if (error != 0) { goto out; } sz = FHANDLE_SIZE(fh); error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t)); if (error != 0) { goto out; } if (usz >= sz) { error = copyout(fh, SCARG(uap, fhp), sz); } else { error = E2BIG; } out: vfs_composefh_free(fh); return (error); } /* * Open a file given a file handle. * * Check permissions, allocate an open file structure, * and call the device open routine if any. */ int dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags, register_t *retval) { file_t *fp; struct vnode *vp = NULL; kauth_cred_t cred = l->l_cred; file_t *nfp; int indx, error; struct vattr va; fhandle_t *fh; int flags; proc_t *p; p = curproc; /* * Must be super user */ if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL))) return (error); if (oflags & O_SEARCH) { oflags &= ~(int)O_SEARCH; } flags = FFLAGS(oflags); if ((flags & (FREAD | FWRITE)) == 0) return (EINVAL); if ((flags & O_CREAT)) return (EINVAL); if ((error = fd_allocfile(&nfp, &indx)) != 0) return (error); fp = nfp; error = vfs_copyinfh_alloc(ufhp, fhsize, &fh); if (error != 0) { goto bad; } error = vfs_fhtovp(fh, &vp); vfs_copyinfh_free(fh); if (error != 0) { goto bad; } /* Now do an effective vn_open */ if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } error = vn_openchk(vp, cred, flags); if (error != 0) goto bad; if (flags & O_TRUNC) { VOP_UNLOCK(vp); /* XXX */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX */ vattr_null(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, cred); if (error) goto bad; } if ((error = VOP_OPEN(vp, flags, cred)) != 0) goto bad; if (flags & FWRITE) { mutex_enter(vp->v_interlock); vp->v_writecount++; mutex_exit(vp->v_interlock); } /* done with modified vn_open, now finish what sys_open does. 
*/ if ((error = open_setfp(l, fp, vp, indx, flags))) return error; VOP_UNLOCK(vp); *retval = indx; fd_affix(p, fp, indx); return (0); bad: fd_abort(p, fp, indx); if (vp != NULL) vput(vp); if (error == EDUPFD || error == EMOVEFD) { /* XXX should probably close curlwp->l_dupfd */ error = EOPNOTSUPP; } return (error); } int sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval) { /* { syscallarg(const void *) fhp; syscallarg(size_t) fh_size; syscallarg(int) flags; } */ return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size), SCARG(uap, flags), retval); } int do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb) { int error; fhandle_t *fh; struct vnode *vp; /* * Must be super user */ if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL))) return (error); error = vfs_copyinfh_alloc(ufhp, fhsize, &fh); if (error != 0) return error; error = vfs_fhtovp(fh, &vp); vfs_copyinfh_free(fh); if (error != 0) return error; error = vn_stat(vp, sb); vput(vp); return error; } /* ARGSUSED */ int sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval) { /* { syscallarg(const void *) fhp; syscallarg(size_t) fh_size; syscallarg(struct stat *) sb; } */ struct stat sb; int error; error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb); if (error) return error; return copyout(&sb, SCARG(uap, sb), sizeof(sb)); } int do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb, int flags) { fhandle_t *fh; struct mount *mp; struct vnode *vp; int error; /* * Must be super user */ if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL))) return error; error = vfs_copyinfh_alloc(ufhp, fhsize, &fh); if (error != 0) return error; error = vfs_fhtovp(fh, &vp); vfs_copyinfh_free(fh); if (error != 0) return error; mp = vp->v_mount; error = dostatvfs(mp, sb, l, flags, 1); vput(vp); return error; } /* ARGSUSED */ int sys___fhstatvfs190(struct lwp *l, const struct sys___fhstatvfs190_args *uap, register_t *retval) { /* { syscallarg(const void *) fhp; syscallarg(size_t) fh_size; syscallarg(struct statvfs *) buf; syscallarg(int) flags; } */ struct statvfs *sb = STATVFSBUF_GET(); int error; error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb, SCARG(uap, flags)); if (error == 0) error = copyout(sb, SCARG(uap, buf), sizeof(*sb)); STATVFSBUF_PUT(sb); return error; } int do_posix_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode, dev_t dev) { /* * The POSIX mknod(2) call is an alias for mkfifo(2) for S_IFIFO * in mode and dev=0. * * In all the other cases it's implementation defined behavior. */ if ((mode & S_IFIFO) && dev == 0) return do_sys_mkfifoat(l, fdat, pathname, mode); else return do_sys_mknodat(l, fdat, pathname, mode, dev, UIO_USERSPACE); } /* * Create a special file. 
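 *
 * Illustrative userland use (device numbers are made up; creating
 * device nodes needs privilege, and the S_IFIFO case is routed to
 * mkfifo semantics by do_posix_mknodat() above):
 *
 *	mknod("/dev/mydev", S_IFCHR | 0600, makedev(4, 0));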
*/ /* ARGSUSED */ int sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(mode_t) mode; syscallarg(dev_t) dev; } */ return do_posix_mknodat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode), SCARG(uap, dev)); } int sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) path; syscallarg(mode_t) mode; syscallarg(int) pad; syscallarg(dev_t) dev; } */ return do_posix_mknodat(l, SCARG(uap, fd), SCARG(uap, path), SCARG(uap, mode), SCARG(uap, dev)); } int do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev, enum uio_seg seg) { return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, seg); } int do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode, dev_t dev, enum uio_seg seg) { struct proc *p = l->l_proc; struct vnode *vp; struct vattr vattr; int error, optype; struct pathbuf *pb; struct nameidata nd; const char *pathstring; if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD, 0, NULL, NULL, NULL)) != 0) return (error); optype = VOP_MKNOD_DESCOFFSET; error = pathbuf_maybe_copyin(pathname, seg, &pb); if (error) { return error; } pathstring = pathbuf_stringcopy_get(pb); if (pathstring == NULL) { pathbuf_destroy(pb); return ENOMEM; } NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb); if ((error = fd_nameiat(l, fdat, &nd)) != 0) goto out; vp = nd.ni_vp; if (vp != NULL) error = EEXIST; else { vattr_null(&vattr); /* We will read cwdi->cwdi_cmask unlocked. */ vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask; vattr.va_rdev = dev; switch (mode & S_IFMT) { case S_IFMT: /* used by badsect to flag bad sectors */ vattr.va_type = VBAD; break; case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: optype = VOP_WHITEOUT_DESCOFFSET; break; case S_IFREG: #if NVERIEXEC > 0 error = veriexec_openchk(l, nd.ni_vp, pathstring, O_CREAT); #endif /* NVERIEXEC > 0 */ vattr.va_type = VREG; vattr.va_rdev = VNOVAL; optype = VOP_CREATE_DESCOFFSET; break; default: error = EINVAL; break; } if (error == 0 && optype == VOP_MKNOD_DESCOFFSET && vattr.va_rdev == VNOVAL) error = EINVAL; } if (!error) { switch (optype) { case VOP_WHITEOUT_DESCOFFSET: error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); if (error) VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); vput(nd.ni_dvp); break; case VOP_MKNOD_DESCOFFSET: error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vrele(nd.ni_vp); vput(nd.ni_dvp); break; case VOP_CREATE_DESCOFFSET: error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vrele(nd.ni_vp); vput(nd.ni_dvp); break; } } else { VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (vp) vrele(vp); } out: pathbuf_stringcopy_put(pb, pathstring); pathbuf_destroy(pb); return (error); } /* * Create a named pipe. 
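 *
 * Illustrative userland use; the requested mode is masked by the
 * process umask exactly as for regular file creation:
 *
 *	if (mkfifo("/tmp/myfifo", 0666) == -1)
 *		warn("mkfifo");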
*/ /* ARGSUSED */ int sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) mode; } */ return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode)); } int sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) path; syscallarg(int) mode; } */ return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path), SCARG(uap, mode)); } static int do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode) { struct proc *p = l->l_proc; struct vattr vattr; int error; struct pathbuf *pb; struct nameidata nd; error = pathbuf_copyin(path, &pb); if (error) { return error; } NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb); if ((error = fd_nameiat(l, fdat, &nd)) != 0) { pathbuf_destroy(pb); return error; } if (nd.ni_vp != NULL) { VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); pathbuf_destroy(pb); return (EEXIST); } vattr_null(&vattr); vattr.va_type = VFIFO; /* We will read cwdi->cwdi_cmask unlocked. */ vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask; error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vrele(nd.ni_vp); vput(nd.ni_dvp); pathbuf_destroy(pb); return (error); } /* * Make a hard file link. */ /* ARGSUSED */ int do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink, const char *link, int follow, register_t *retval) { struct vnode *vp; struct pathbuf *linkpb; struct nameidata nd; namei_simple_flags_t ns_flags; int error; if (follow & AT_SYMLINK_FOLLOW) ns_flags = NSM_FOLLOW_TRYEMULROOT; else ns_flags = NSM_NOFOLLOW_TRYEMULROOT; error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp); if (error != 0) return (error); error = pathbuf_copyin(link, &linkpb); if (error) { goto out1; } NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb); if ((error = fd_nameiat(l, fdlink, &nd)) != 0) goto out2; if (nd.ni_vp) { error = EEXIST; goto abortop; } /* Prevent hard links on directories. */ if (vp->v_type == VDIR) { error = EPERM; goto abortop; } /* Prevent cross-mount operation. 
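 * A hard link must live on the same filesystem as its target, so the
 * request is rejected with EXDEV when the new name's parent directory
 * and the existing vnode belong to different mounts.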
*/ if (nd.ni_dvp->v_mount != vp->v_mount) { error = EXDEV; goto abortop; } error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_UNLOCK(nd.ni_dvp); vrele(nd.ni_dvp); out2: pathbuf_destroy(linkpb); out1: vrele(vp); return (error); abortop: VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp != NULL) vrele(nd.ni_vp); goto out2; } int sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const char *) link; } */ const char *path = SCARG(uap, path); const char *link = SCARG(uap, link); return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link, AT_SYMLINK_FOLLOW, retval); } int sys_linkat(struct lwp *l, const struct sys_linkat_args *uap, register_t *retval) { /* { syscallarg(int) fd1; syscallarg(const char *) name1; syscallarg(int) fd2; syscallarg(const char *) name2; syscallarg(int) flags; } */ int fd1 = SCARG(uap, fd1); const char *name1 = SCARG(uap, name1); int fd2 = SCARG(uap, fd2); const char *name2 = SCARG(uap, name2); int follow; follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW; return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval); } int do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg) { return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg); } static int do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat, const char *link, enum uio_seg seg) { struct proc *p = curproc; struct vattr vattr; char *path; int error; size_t len; struct pathbuf *linkpb; struct nameidata nd; KASSERT(l != NULL || fdat == AT_FDCWD); path = PNBUF_GET(); if (seg == UIO_USERSPACE) { if ((error = copyinstr(patharg, path, MAXPATHLEN, &len)) != 0) goto out1; if ((error = pathbuf_copyin(link, &linkpb)) != 0) goto out1; } else { len = strlen(patharg) + 1; KASSERT(len <= MAXPATHLEN); memcpy(path, patharg, len); linkpb = pathbuf_create(link); if (linkpb == NULL) { error = ENOMEM; goto out1; } } ktrkuser("symlink-target", path, len - 1); NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb); if ((error = fd_nameiat(l, fdat, &nd)) != 0) goto out2; if (nd.ni_vp) { VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); error = EEXIST; goto out2; } vattr_null(&vattr); vattr.va_type = VLNK; /* We will read cwdi->cwdi_cmask unlocked. */ vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask; error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); if (error == 0) vrele(nd.ni_vp); vput(nd.ni_dvp); out2: pathbuf_destroy(linkpb); out1: PNBUF_PUT(path); return (error); } /* * Make a symbolic link. */ /* ARGSUSED */ int sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const char *) link; } */ return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link), UIO_USERSPACE); } int sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap, register_t *retval) { /* { syscallarg(const char *) path1; syscallarg(int) fd; syscallarg(const char *) path2; } */ return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd), SCARG(uap, path2), UIO_USERSPACE); } /* * Delete a whiteout from the filesystem. 
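 *
 * Whiteouts only exist on union-style mounts, where they mark a name
 * from the lower layer as deleted; undelete(2) removes that marker so
 * the underlying object becomes visible again.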
*/ /* ARGSUSED */ int sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval) { /* { syscallarg(const char *) path; } */ int error; struct pathbuf *pb; struct nameidata nd; error = pathbuf_copyin(SCARG(uap, path), &pb); if (error) { return error; } NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb); error = namei(&nd); if (error) { pathbuf_destroy(pb); return (error); } if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); pathbuf_destroy(pb); return (EEXIST); } if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0) VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); vput(nd.ni_dvp); pathbuf_destroy(pb); return (error); } /* * Delete a name from the filesystem. */ /* ARGSUSED */ int sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval) { /* { syscallarg(const char *) path; } */ return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0, UIO_USERSPACE); } int sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) path; syscallarg(int) flag; } */ return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path), SCARG(uap, flag), UIO_USERSPACE); } int do_sys_unlink(const char *arg, enum uio_seg seg) { return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg); } static int do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags, enum uio_seg seg) { struct vnode *vp; int error; struct pathbuf *pb; struct nameidata nd; const char *pathstring; KASSERT(l != NULL || fdat == AT_FDCWD); error = pathbuf_maybe_copyin(arg, seg, &pb); if (error) { return error; } pathstring = pathbuf_stringcopy_get(pb); if (pathstring == NULL) { pathbuf_destroy(pb); return ENOMEM; } NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb); if ((error = fd_nameiat(l, fdat, &nd)) != 0) goto out; vp = nd.ni_vp; /* * The root of a mounted filesystem cannot be deleted. */ if ((vp->v_vflag & VV_ROOT) != 0) { error = EBUSY; goto abort; } if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) { error = EBUSY; goto abort; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto abort; } /* * AT_REMOVEDIR is required to remove a directory */ if (vp->v_type == VDIR) { if (!(flags & AT_REMOVEDIR)) { error = EPERM; goto abort; } else { error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vput(nd.ni_dvp); goto out; } } /* * Starting here we only deal with non directories. */ if (flags & AT_REMOVEDIR) { error = ENOTDIR; goto abort; } #if NVERIEXEC > 0 /* Handle remove requests for veriexec entries. */ if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) { goto abort; } #endif /* NVERIEXEC > 0 */ #ifdef FILEASSOC (void)fileassoc_file_delete(vp); #endif /* FILEASSOC */ error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vput(nd.ni_dvp); goto out; abort: VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); out: pathbuf_stringcopy_put(pb, pathstring); pathbuf_destroy(pb); return (error); } /* * Reposition read/write file offset. 
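 *
 * Illustrative userland use (fd is an assumed open descriptor); only
 * SEEK_SET, SEEK_CUR and SEEK_END are accepted, and objects without
 * an fo_seek method (pipes, sockets) fail with ESPIPE:
 *
 *	off_t end = lseek(fd, 0, SEEK_END);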
*/ int sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) offset; syscallarg(int) whence; } */ file_t *fp; int error, fd; switch (SCARG(uap, whence)) { case SEEK_CUR: case SEEK_END: case SEEK_SET: break; default: return EINVAL; } fd = SCARG(uap, fd); if ((fp = fd_getfile(fd)) == NULL) return (EBADF); if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } error = (*fp->f_ops->fo_seek)(fp, SCARG(uap, offset), SCARG(uap, whence), (off_t *)retval, FOF_UPDATE_OFFSET); out: fd_putfile(fd); return (error); } /* * Positional read system call. */ int sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(void *) buf; syscallarg(size_t) nbyte; syscallarg(off_t) offset; } */ file_t *fp; off_t offset; int error, fd = SCARG(uap, fd); if ((fp = fd_getfile(fd)) == NULL) return (EBADF); if ((fp->f_flag & FREAD) == 0) { fd_putfile(fd); return (EBADF); } if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } offset = SCARG(uap, offset); error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0); if (error) goto out; /* dofileread() will unuse the descriptor for us */ return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte), &offset, 0, retval)); out: fd_putfile(fd); return (error); } /* * Positional scatter read system call. */ int sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const struct iovec *) iovp; syscallarg(int) iovcnt; syscallarg(off_t) offset; } */ off_t offset = SCARG(uap, offset); return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp), SCARG(uap, iovcnt), &offset, 0, retval); } /* * Positional write system call. */ int sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const void *) buf; syscallarg(size_t) nbyte; syscallarg(off_t) offset; } */ file_t *fp; off_t offset; int error, fd = SCARG(uap, fd); if ((fp = fd_getfile(fd)) == NULL) return (EBADF); if ((fp->f_flag & FWRITE) == 0) { fd_putfile(fd); return (EBADF); } if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } offset = SCARG(uap, offset); error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0); if (error) goto out; /* dofilewrite() will unuse the descriptor for us */ return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte), &offset, 0, retval)); out: fd_putfile(fd); return (error); } /* * Positional gather write system call. */ int sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const struct iovec *) iovp; syscallarg(int) iovcnt; syscallarg(off_t) offset; } */ off_t offset = SCARG(uap, offset); return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp), SCARG(uap, iovcnt), &offset, 0, retval); } /* * Check access permissions. 
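 *
 * Illustrative userland use; plain access(2) checks against the real
 * uid/gid (do_sys_accessat() builds a temporary credential for that),
 * while faccessat(..., AT_EACCESS) checks the effective ids:
 *
 *	if (access("/etc/master.passwd", R_OK) == -1)
 *		warn("no read access");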
*/ int sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) flags; } */ return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, flags), 0); } int do_sys_accessat(struct lwp *l, int fdat, const char *path, int mode, int flags) { kauth_cred_t cred; struct vnode *vp; int error, nd_flag, vmode; struct pathbuf *pb; struct nameidata nd; CTASSERT(F_OK == 0); if ((mode & ~(R_OK | W_OK | X_OK)) != 0) { /* nonsense mode */ return EINVAL; } nd_flag = FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT; if (flags & AT_SYMLINK_NOFOLLOW) nd_flag &= ~FOLLOW; error = pathbuf_copyin(path, &pb); if (error) return error; NDINIT(&nd, LOOKUP, nd_flag, pb); /* Override default credentials */ if (!(flags & AT_EACCESS)) { cred = kauth_cred_dup(l->l_cred); kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred)); kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred)); } else cred = l->l_cred; nd.ni_cnd.cn_cred = cred; if ((error = fd_nameiat(l, fdat, &nd)) != 0) { pathbuf_destroy(pb); goto out; } vp = nd.ni_vp; pathbuf_destroy(pb); /* Flags == 0 means only check for existence. */ if (mode) { vmode = 0; if (mode & R_OK) vmode |= VREAD; if (mode & W_OK) vmode |= VWRITE; if (mode & X_OK) vmode |= VEXEC; error = VOP_ACCESS(vp, vmode, cred); if (!error && (vmode & VWRITE)) error = vn_writechk(vp); } vput(vp); out: if (!(flags & AT_EACCESS)) kauth_cred_free(cred); return (error); } int sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) path; syscallarg(int) amode; syscallarg(int) flag; } */ return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path), SCARG(uap, amode), SCARG(uap, flag)); } /* * Common code for all sys_stat functions, including compat versions. */ int do_sys_stat(const char *userpath, unsigned int nd_flag, struct stat *sb) { return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb); } int do_sys_statat(struct lwp *l, int fdat, const char *userpath, unsigned int nd_flag, struct stat *sb) { int error; struct pathbuf *pb; struct nameidata nd; KASSERT(l != NULL || fdat == AT_FDCWD); error = pathbuf_copyin(userpath, &pb); if (error) { return error; } NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb); error = fd_nameiat(l, fdat, &nd); if (error != 0) { pathbuf_destroy(pb); return error; } error = vn_stat(nd.ni_vp, sb); vput(nd.ni_vp); pathbuf_destroy(pb); return error; } /* * Get file status; this version follows links. */ /* ARGSUSED */ int sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(struct stat *) ub; } */ struct stat sb; int error; error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb); if (error) return error; return copyout(&sb, SCARG(uap, ub), sizeof(sb)); } /* * Get file status; this version does not follow links. 
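 *
 * Illustrative userland use; on a symbolic link this reports the link
 * itself rather than its target:
 *
 *	struct stat st;
 *	if (lstat("/etc/localtime", &st) == 0 && S_ISLNK(st.st_mode))
 *		printf("symlink, %jd bytes\n", (intmax_t)st.st_size);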
*/ /* ARGSUSED */ int sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(struct stat *) ub; } */ struct stat sb; int error; error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb); if (error) return error; return copyout(&sb, SCARG(uap, ub), sizeof(sb)); } int sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) path; syscallarg(struct stat *) buf; syscallarg(int) flag; } */ unsigned int nd_flag; struct stat sb; int error; if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) nd_flag = NOFOLLOW; else nd_flag = FOLLOW; error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag, &sb); if (error) return error; return copyout(&sb, SCARG(uap, buf), sizeof(sb)); } static int kern_pathconf(register_t *retval, const char *path, int name, int flag) { int error; struct pathbuf *pb; struct nameidata nd; error = pathbuf_copyin(path, &pb); if (error) { return error; } NDINIT(&nd, LOOKUP, flag | LOCKLEAF | TRYEMULROOT, pb); if ((error = namei(&nd)) != 0) { pathbuf_destroy(pb); return error; } error = VOP_PATHCONF(nd.ni_vp, name, retval); vput(nd.ni_vp); pathbuf_destroy(pb); return error; } /* * Get configurable pathname variables. */ /* ARGSUSED */ int sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) name; } */ return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name), FOLLOW); } /* ARGSUSED */ int sys_lpathconf(struct lwp *l, const struct sys_lpathconf_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) name; } */ return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name), NOFOLLOW); } /* * Return target name of a symbolic link. */ /* ARGSUSED */ int sys_readlink(struct lwp *l, const struct sys_readlink_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(char *) buf; syscallarg(size_t) count; } */ return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, buf), SCARG(uap, count), retval); } static int do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf, size_t count, register_t *retval) { struct vnode *vp; struct iovec aiov; struct uio auio; int error; struct pathbuf *pb; struct nameidata nd; error = pathbuf_copyin(path, &pb); if (error) { return error; } NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb); if ((error = fd_nameiat(l, fdat, &nd)) != 0) { pathbuf_destroy(pb); return error; } vp = nd.ni_vp; pathbuf_destroy(pb); if (vp->v_type != VLNK) error = EINVAL; else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) || (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) { aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; KASSERT(l == curlwp); auio.uio_vmspace = l->l_proc->p_vmspace; auio.uio_resid = count; if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0) *retval = count - auio.uio_resid; } vput(vp); return (error); } int sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) path; syscallarg(char *) buf; syscallarg(size_t) bufsize; } */ return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path), SCARG(uap, buf), SCARG(uap, bufsize), retval); } /* * Change flags of a file given a path name. 
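 *
 * Illustrative userland use (user flags such as UF_IMMUTABLE come from
 * <sys/stat.h>; the SF_* system flags additionally require privilege):
 *
 *	if (chflags("/var/db/important", UF_IMMUTABLE) == -1)
 *		warn("chflags");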
*/ /* ARGSUSED */ int sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(u_long) flags; } */ struct vnode *vp; int error; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_TRYEMULROOT, &vp); if (error != 0) return (error); error = change_flags(vp, SCARG(uap, flags), l); vput(vp); return (error); } /* * Change flags of a file given a file descriptor. */ /* ARGSUSED */ int sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(u_long) flags; } */ struct vnode *vp; file_t *fp; int error; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); vp = fp->f_vnode; error = change_flags(vp, SCARG(uap, flags), l); VOP_UNLOCK(vp); fd_putfile(SCARG(uap, fd)); return (error); } /* * Change flags of a file given a path name; this version does * not follow links. */ int sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(u_long) flags; } */ struct vnode *vp; int error; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_TRYEMULROOT, &vp); if (error != 0) return (error); error = change_flags(vp, SCARG(uap, flags), l); vput(vp); return (error); } /* * Common routine to change flags of a file. */ int change_flags(struct vnode *vp, u_long flags, struct lwp *l) { struct vattr vattr; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vattr_null(&vattr); vattr.va_flags = flags; error = VOP_SETATTR(vp, &vattr, l->l_cred); return (error); } /* * Change mode of a file given path name; this version follows links. */ /* ARGSUSED */ int sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) mode; } */ return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode), 0); } int do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags) { int error; struct vnode *vp; namei_simple_flags_t ns_flag; if (flags & AT_SYMLINK_NOFOLLOW) ns_flag = NSM_NOFOLLOW_TRYEMULROOT; else ns_flag = NSM_FOLLOW_TRYEMULROOT; error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp); if (error != 0) return error; error = change_mode(vp, mode, l); vrele(vp); return (error); } /* * Change mode of a file given a file descriptor. */ /* ARGSUSED */ int sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) mode; } */ file_t *fp; int error; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); error = change_mode(fp->f_vnode, SCARG(uap, mode), l); fd_putfile(SCARG(uap, fd)); return (error); } int sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) path; syscallarg(int) mode; syscallarg(int) flag; } */ return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path), SCARG(uap, mode), SCARG(uap, flag)); } /* * Change mode of a file given path name; this version does not follow links. 
*/ /* ARGSUSED */ int sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) mode; } */ int error; struct vnode *vp; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_TRYEMULROOT, &vp); if (error != 0) return (error); error = change_mode(vp, SCARG(uap, mode), l); vrele(vp); return (error); } /* * Common routine to set mode given a vnode. */ static int change_mode(struct vnode *vp, int mode, struct lwp *l) { struct vattr vattr; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vattr_null(&vattr); vattr.va_mode = mode & ALLPERMS; error = VOP_SETATTR(vp, &vattr, l->l_cred); VOP_UNLOCK(vp); return (error); } /* * Set ownership given a path name; this version follows links. */ /* ARGSUSED */ int sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(uid_t) uid; syscallarg(gid_t) gid; } */ return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid), SCARG(uap, gid), 0); } int do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid, gid_t gid, int flags) { int error; struct vnode *vp; namei_simple_flags_t ns_flag; if (flags & AT_SYMLINK_NOFOLLOW) ns_flag = NSM_NOFOLLOW_TRYEMULROOT; else ns_flag = NSM_FOLLOW_TRYEMULROOT; error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp); if (error != 0) return error; error = change_owner(vp, uid, gid, l, 0); vrele(vp); return (error); } /* * Set ownership given a path name; this version follows links. * Provides POSIX semantics. */ /* ARGSUSED */ int sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(uid_t) uid; syscallarg(gid_t) gid; } */ int error; struct vnode *vp; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_TRYEMULROOT, &vp); if (error != 0) return (error); error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1); vrele(vp); return (error); } /* * Set ownership given a file descriptor. */ /* ARGSUSED */ int sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(uid_t) uid; syscallarg(gid_t) gid; } */ int error; file_t *fp; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid), l, 0); fd_putfile(SCARG(uap, fd)); return (error); } int sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) path; syscallarg(uid_t) owner; syscallarg(gid_t) group; syscallarg(int) flag; } */ return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path), SCARG(uap, owner), SCARG(uap, group), SCARG(uap, flag)); } /* * Set ownership given a file descriptor, providing POSIX/XPG semantics. */ /* ARGSUSED */ int sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(uid_t) uid; syscallarg(gid_t) gid; } */ int error; file_t *fp; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid), l, 1); fd_putfile(SCARG(uap, fd)); return (error); } /* * Set ownership given a path name; this version does not follow links. 
*/ /* ARGSUSED */ int sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(uid_t) uid; syscallarg(gid_t) gid; } */ int error; struct vnode *vp; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_TRYEMULROOT, &vp); if (error != 0) return (error); error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0); vrele(vp); return (error); } /* * Set ownership given a path name; this version does not follow links. * Provides POSIX/XPG semantics. */ /* ARGSUSED */ int sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(uid_t) uid; syscallarg(gid_t) gid; } */ int error; struct vnode *vp; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_TRYEMULROOT, &vp); if (error != 0) return (error); error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1); vrele(vp); return (error); } /* * Common routine to set ownership given a vnode. */ static int change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l, int posix_semantics) { struct vattr vattr; mode_t newmode; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0) goto out; #define CHANGED(x) ((int)(x) != -1) newmode = vattr.va_mode; if (posix_semantics) { /* * POSIX/XPG semantics: if the caller is not the super-user, * clear set-user-id and set-group-id bits. Both POSIX and * the XPG consider the behaviour for calls by the super-user * implementation-defined; we leave the set-user-id and set- * group-id settings intact in that case. */ if (vattr.va_mode & S_ISUID) { if (kauth_authorize_vnode(l->l_cred, KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0) newmode &= ~S_ISUID; } if (vattr.va_mode & S_ISGID) { if (kauth_authorize_vnode(l->l_cred, KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0) newmode &= ~S_ISGID; } } else { /* * NetBSD semantics: when changing owner and/or group, * clear the respective bit(s). */ if (CHANGED(uid)) newmode &= ~S_ISUID; if (CHANGED(gid)) newmode &= ~S_ISGID; } /* Update va_mode iff altered. */ if (vattr.va_mode == newmode) newmode = VNOVAL; vattr_null(&vattr); vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL; vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL; vattr.va_mode = newmode; error = VOP_SETATTR(vp, &vattr, l->l_cred); #undef CHANGED out: VOP_UNLOCK(vp); return (error); } /* * Set the access and modification times given a path name; this * version follows links. */ /* ARGSUSED */ int sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const struct timeval *) tptr; } */ return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW, SCARG(uap, tptr), UIO_USERSPACE); } /* * Set the access and modification times given a file descriptor. 
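 *
 * Illustrative userland use of the timespec interface handled below;
 * UTIME_NOW in tv_nsec means "stamp with the current time" and
 * UTIME_OMIT means "leave this timestamp unchanged" (fd is an assumed
 * open descriptor):
 *
 *	struct timespec ts[2];
 *	ts[0].tv_nsec = UTIME_OMIT;	leave the access time alone
 *	ts[1].tv_nsec = UTIME_NOW;	set the modification time to now
 *	futimens(fd, ts);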
*/ /* ARGSUSED */ int sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const struct timeval *) tptr; } */ int error; file_t *fp; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr), UIO_USERSPACE); fd_putfile(SCARG(uap, fd)); return (error); } int sys_futimens(struct lwp *l, const struct sys_futimens_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const struct timespec *) tptr; } */ int error; file_t *fp; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0, SCARG(uap, tptr), UIO_USERSPACE); fd_putfile(SCARG(uap, fd)); return (error); } /* * Set the access and modification times given a path name; this * version does not follow links. */ int sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const struct timeval *) tptr; } */ return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW, SCARG(uap, tptr), UIO_USERSPACE); } int sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) path; syscallarg(const struct timespec *) tptr; syscallarg(int) flag; } */ int follow; const struct timespec *tptr; int error; tptr = SCARG(uap, tptr); follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; error = do_sys_utimensat(l, SCARG(uap, fd), NULL, SCARG(uap, path), follow, tptr, UIO_USERSPACE); return error; } /* * Common routine to set access and modification times given a vnode. */ int do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag, const struct timespec *tptr, enum uio_seg seg) { return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg); } int do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp, const char *path, int flag, const struct timespec *tptr, enum uio_seg seg) { struct vattr vattr; int error, dorele = 0; namei_simple_flags_t sflags; bool vanull, setbirthtime; struct timespec ts[2]; KASSERT(l != NULL || fdat == AT_FDCWD); /* * I have checked all callers and they pass either FOLLOW, * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW * is 0. More to the point, they don't pass anything else. * Let's keep it that way at least until the namei interfaces * are fully sanitized. */ KASSERT(flag == NOFOLLOW || flag == FOLLOW); sflags = (flag == FOLLOW) ? 
NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT; if (tptr == NULL) { vanull = true; nanotime(&ts[0]); ts[1] = ts[0]; } else { vanull = false; if (seg != UIO_SYSSPACE) { error = copyin(tptr, ts, sizeof (ts)); if (error != 0) return error; } else { ts[0] = tptr[0]; ts[1] = tptr[1]; } } if (ts[0].tv_nsec == UTIME_NOW) { nanotime(&ts[0]); if (ts[1].tv_nsec == UTIME_NOW) { vanull = true; ts[1] = ts[0]; } } else if (ts[1].tv_nsec == UTIME_NOW) nanotime(&ts[1]); if (vp == NULL) { /* note: SEG describes TPTR, not PATH; PATH is always user */ error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp); if (error != 0) return error; dorele = 1; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 && timespeccmp(&ts[1], &vattr.va_birthtime, <)); vattr_null(&vattr); if (ts[0].tv_nsec != UTIME_OMIT) vattr.va_atime = ts[0]; if (ts[1].tv_nsec != UTIME_OMIT) { vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; } if (vanull) vattr.va_vaflags |= VA_UTIMES_NULL; error = VOP_SETATTR(vp, &vattr, l->l_cred); VOP_UNLOCK(vp); if (dorele != 0) vrele(vp); return error; } int do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag, const struct timeval *tptr, enum uio_seg seg) { struct timespec ts[2]; struct timespec *tsptr = NULL; int error; if (tptr != NULL) { struct timeval tv[2]; if (seg != UIO_SYSSPACE) { error = copyin(tptr, tv, sizeof(tv)); if (error != 0) return error; tptr = tv; } if ((tptr[0].tv_usec == UTIME_NOW) || (tptr[0].tv_usec == UTIME_OMIT)) ts[0].tv_nsec = tptr[0].tv_usec; else { if (tptr[0].tv_usec < 0 || tptr[0].tv_usec >= 1000000) return EINVAL; TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]); } if ((tptr[1].tv_usec == UTIME_NOW) || (tptr[1].tv_usec == UTIME_OMIT)) ts[1].tv_nsec = tptr[1].tv_usec; else { if (tptr[1].tv_usec < 0 || tptr[1].tv_usec >= 1000000) return EINVAL; TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]); } tsptr = &ts[0]; } return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE); } /* * Truncate a file given its path name. */ /* ARGSUSED */ int sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) pad; syscallarg(off_t) length; } */ struct vnode *vp; struct vattr vattr; int error; if (SCARG(uap, length) < 0) return EINVAL; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_TRYEMULROOT, &vp); if (error != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) error = EISDIR; else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) { vattr_null(&vattr); vattr.va_size = SCARG(uap, length); error = VOP_SETATTR(vp, &vattr, l->l_cred); } vput(vp); return (error); } /* * Truncate a file given a file descriptor. */ /* ARGSUSED */ int sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) length; } */ file_t *fp; int error, fd = SCARG(uap, fd); fp = fd_getfile(fd); if (fp == NULL) return EBADF; if (fp->f_ops->fo_truncate == NULL) error = EOPNOTSUPP; else error = (*fp->f_ops->fo_truncate)(fp, SCARG(uap, length)); fd_putfile(fd); return error; } /* * Sync an open file. 
*/ /* ARGSUSED */ int sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval) { /* { syscallarg(int) fd; } */ struct vnode *vp; file_t *fp; int error; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); vp = fp->f_vnode; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0); VOP_UNLOCK(vp); fd_putfile(SCARG(uap, fd)); return (error); } /* * Sync a range of file data. API modeled after that found in AIX. * * FDATASYNC indicates that we need only save enough metadata to be able * to re-read the written data. */ /* ARGSUSED */ int sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) flags; syscallarg(off_t) start; syscallarg(off_t) length; } */ struct vnode *vp; file_t *fp; int flags, nflags; off_t s, e, len; int error; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); if ((fp->f_flag & FWRITE) == 0) { error = EBADF; goto out; } flags = SCARG(uap, flags); if (((flags & (FDATASYNC | FFILESYNC)) == 0) || ((~flags & (FDATASYNC | FFILESYNC)) == 0)) { error = EINVAL; goto out; } /* Now set up the flags for value(s) to pass to VOP_FSYNC() */ if (flags & FDATASYNC) nflags = FSYNC_DATAONLY | FSYNC_WAIT; else nflags = FSYNC_WAIT; if (flags & FDISKSYNC) nflags |= FSYNC_CACHE; len = SCARG(uap, length); /* If length == 0, we do the whole file, and s = e = 0 will do that */ if (len) { s = SCARG(uap, start); if (s < 0 || len < 0 || len > OFF_T_MAX - s) { error = EINVAL; goto out; } e = s + len; KASSERT(s <= e); } else { e = 0; s = 0; } vp = fp->f_vnode; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e); VOP_UNLOCK(vp); out: fd_putfile(SCARG(uap, fd)); return (error); } /* * Sync the data of an open file. */ /* ARGSUSED */ int sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval) { /* { syscallarg(int) fd; } */ struct vnode *vp; file_t *fp; int error; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); vp = fp->f_vnode; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0); VOP_UNLOCK(vp); fd_putfile(SCARG(uap, fd)); return (error); } /* * Rename files, (standard) BSD semantics frontend. */ /* ARGSUSED */ int sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval) { /* { syscallarg(const char *) from; syscallarg(const char *) to; } */ return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD, SCARG(uap, to), UIO_USERSPACE, 0)); } int sys_renameat(struct lwp *l, const struct sys_renameat_args *uap, register_t *retval) { /* { syscallarg(int) fromfd; syscallarg(const char *) from; syscallarg(int) tofd; syscallarg(const char *) to; } */ return (do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from), SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0)); } /* * Rename files, POSIX semantics frontend. */ /* ARGSUSED */ int sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval) { /* { syscallarg(const char *) from; syscallarg(const char *) to; } */ return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD, SCARG(uap, to), UIO_USERSPACE, 1)); } /* * Rename files. Source and destination must either both be directories, * or both not be directories. 
If target is a directory, it must be empty. * If `from' and `to' refer to the same object, the value of the `retain' * argument is used to determine whether `from' will be * * (retain == 0) deleted unless `from' and `to' refer to the same * object in the file system's name space (BSD). * (retain == 1) always retained (POSIX). * * XXX Synchronize with nfsrv_rename in nfs_serv.c. */ int do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain) { return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg, retain); } static int do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd, const char *to, enum uio_seg seg, int retain) { struct pathbuf *fpb, *tpb; struct nameidata fnd, tnd; struct vnode *fdvp, *fvp; struct vnode *tdvp, *tvp; struct mount *mp, *tmp; int error; KASSERT(l != NULL || fromfd == AT_FDCWD); KASSERT(l != NULL || tofd == AT_FDCWD); error = pathbuf_maybe_copyin(from, seg, &fpb); if (error) goto out0; KASSERT(fpb != NULL); error = pathbuf_maybe_copyin(to, seg, &tpb); if (error) goto out1; KASSERT(tpb != NULL); /* * Lookup from. * * XXX LOCKPARENT is wrong because we don't actually want it * locked yet, but (a) namei is insane, and (b) VOP_RENAME is * insane, so for the time being we need to leave it like this. */ NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb); if ((error = fd_nameiat(l, fromfd, &fnd)) != 0) goto out2; /* * Pull out the important results of the lookup, fdvp and fvp. * Of course, fvp is bogus because we're about to unlock fdvp. */ fdvp = fnd.ni_dvp; fvp = fnd.ni_vp; mp = fdvp->v_mount; KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE)); /* * Bracket the operation with fstrans_start()/fstrans_done(). * * Inside the bracket this file system cannot be unmounted so * a vnode on this file system cannot change its v_mount. * A vnode on another file system may still change to dead mount. */ fstrans_start(mp); /* * Make sure neither fdvp nor fvp is locked. */ if (fdvp != fvp) VOP_UNLOCK(fdvp); /* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */ /* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */ /* * Reject renaming `.' and `..'. Can't do this until after * namei because we need namei's parsing to find the final * component name. (namei should just leave us with the final * component name and not look it up itself, but anyway...) * * This was here before because we used to relookup from * instead of to and relookup requires the caller to check * this, but now file systems may depend on this check, so we * must retain it until the file systems are all rototilled. */ if (((fnd.ni_cnd.cn_namelen == 1) && (fnd.ni_cnd.cn_nameptr[0] == '.')) || ((fnd.ni_cnd.cn_namelen == 2) && (fnd.ni_cnd.cn_nameptr[0] == '.') && (fnd.ni_cnd.cn_nameptr[1] == '.'))) { error = EINVAL; /* XXX EISDIR? */ goto abort0; } /* * Lookup to. * * XXX LOCKPARENT is wrong, but...insanity, &c. Also, using * fvp here to decide whether to add CREATEDIR is a load of * bollocks because fvp might be the wrong node by now, since * fdvp is unlocked. * * XXX Why not pass CREATEDIR always? */ NDINIT(&tnd, RENAME, (LOCKPARENT | NOCACHE | TRYEMULROOT | ((fvp->v_type == VDIR)? CREATEDIR : 0)), tpb); if ((error = fd_nameiat(l, tofd, &tnd)) != 0) goto abort0; /* * Pull out the important results of the lookup, tdvp and tvp. * Of course, tvp is bogus because we're about to unlock tdvp. 
*/ tdvp = tnd.ni_dvp; tvp = tnd.ni_vp; KASSERT(tdvp != NULL); KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE)); if (fvp->v_type == VDIR) tnd.ni_cnd.cn_flags |= WILLBEDIR; /* * Make sure neither tdvp nor tvp is locked. */ if (tdvp != tvp) VOP_UNLOCK(tdvp); /* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */ /* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */ /* * Reject renaming onto `.' or `..'. relookup is unhappy with * these, which is why we must do this here. Once upon a time * we relooked up from instead of to, and consequently didn't * need this check, but now that we relookup to instead of * from, we need this; and we shall need it forever forward * until the VOP_RENAME protocol changes, because file systems * will no doubt begin to depend on this check. */ if ((tnd.ni_cnd.cn_namelen == 1) && (tnd.ni_cnd.cn_nameptr[0] == '.')) { error = EISDIR; goto abort1; } if ((tnd.ni_cnd.cn_namelen == 2) && (tnd.ni_cnd.cn_nameptr[0] == '.') && (tnd.ni_cnd.cn_nameptr[1] == '.')) { error = EINVAL; goto abort1; } /* * Make sure the mount points match. Although we don't hold * any vnode locks, the v_mount on fdvp file system are stable. * * Unmounting another file system at an inopportune moment may * cause tdvp to disappear and change its v_mount to dead. * * So in either case different v_mount means cross-device rename. */ KASSERT(mp != NULL); tmp = tdvp->v_mount; if (mp != tmp) { error = EXDEV; goto abort1; } /* * Take the vfs rename lock to avoid cross-directory screw cases. * Nothing is locked currently, so taking this lock is safe. */ error = VFS_RENAMELOCK_ENTER(mp); if (error) goto abort1; /* * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced, * and nothing is locked except for the vfs rename lock. * * The next step is a little rain dance to conform to the * insane lock protocol, even though it does nothing to ward * off race conditions. * * We need tdvp and tvp to be locked. However, because we have * unlocked tdvp in order to hold no locks while we take the * vfs rename lock, tvp may be wrong here, and we can't safely * lock it even if the sensible file systems will just unlock * it straight away. Consequently, we must lock tdvp and then * relookup tvp to get it locked. * * Finally, because the VOP_RENAME protocol is brain-damaged * and various file systems insanely depend on the semantics of * this brain damage, the lookup of to must be the last lookup * before VOP_RENAME. */ vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0); if (error) goto abort2; /* * Drop the old tvp and pick up the new one -- which might be * the same, but that doesn't matter to us. After this, tdvp * and tvp should both be locked. */ if (tvp != NULL) vrele(tvp); tvp = tnd.ni_vp; KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); /* * The old do_sys_rename had various consistency checks here * involving fvp and tvp. fvp is bogus already here, and tvp * will become bogus soon in any sensible file system, so the * only purpose in putting these checks here is to give lip * service to these screw cases and to acknowledge that they * exist, not actually to handle them, but here you go * anyway... */ /* * Acknowledge that directories and non-directories aren't * supposed to mix. 
*/ if (tvp != NULL) { if ((fvp->v_type == VDIR) && (tvp->v_type != VDIR)) { error = ENOTDIR; goto abort3; } else if ((fvp->v_type != VDIR) && (tvp->v_type == VDIR)) { error = EISDIR; goto abort3; } } /* * Acknowledge some random screw case, among the dozens that * might arise. */ if (fvp == tdvp) { error = EINVAL; goto abort3; } /* * Acknowledge that POSIX has a wacky screw case. * * XXX Eventually the retain flag needs to be passed on to * VOP_RENAME. */ if (fvp == tvp) { if (retain) { error = 0; goto abort3; } else if ((fdvp == tdvp) && (fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen) && (0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr, fnd.ni_cnd.cn_namelen))) { error = 0; goto abort3; } } /* * Make sure veriexec can screw us up. (But a race can screw * up veriexec, of course -- remember, fvp and (soon) tvp are * bogus.) */ #if NVERIEXEC > 0 { char *f1, *f2; size_t f1_len; size_t f2_len; f1_len = fnd.ni_cnd.cn_namelen + 1; f1 = kmem_alloc(f1_len, KM_SLEEP); strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len); f2_len = tnd.ni_cnd.cn_namelen + 1; f2 = kmem_alloc(f2_len, KM_SLEEP); strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len); error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2); kmem_free(f1, f1_len); kmem_free(f2, f2_len); if (error) goto abort3; } #endif /* NVERIEXEC > 0 */ /* * All ready. Incant the rename vop. */ /* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */ /* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */ KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd); /* * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks * tdvp and tvp. But we can't assert any of that. */ /* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */ /* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */ /* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */ /* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */ /* * So all we have left to do is to drop the rename lock and * destroy the pathbufs. */ VFS_RENAMELOCK_EXIT(mp); fstrans_done(mp); goto out2; abort3: if ((tvp != NULL) && (tvp != tdvp)) VOP_UNLOCK(tvp); abort2: VOP_UNLOCK(tdvp); VFS_RENAMELOCK_EXIT(mp); abort1: VOP_ABORTOP(tdvp, &tnd.ni_cnd); vrele(tdvp); if (tvp != NULL) vrele(tvp); abort0: VOP_ABORTOP(fdvp, &fnd.ni_cnd); vrele(fdvp); vrele(fvp); fstrans_done(mp); out2: pathbuf_destroy(tpb); out1: pathbuf_destroy(fpb); out0: return error; } /* * Make a directory file. 
*/ /* ARGSUSED */ int sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) mode; } */ return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode), UIO_USERSPACE); } int sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) path; syscallarg(int) mode; } */ return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path), SCARG(uap, mode), UIO_USERSPACE); } int do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg) { return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, seg); } static int do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode, enum uio_seg seg) { struct proc *p = curlwp->l_proc; struct vnode *vp; struct vattr vattr; int error; struct pathbuf *pb; struct nameidata nd; KASSERT(l != NULL || fdat == AT_FDCWD); /* XXX bollocks, should pass in a pathbuf */ error = pathbuf_maybe_copyin(path, seg, &pb); if (error) { return error; } NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb); if ((error = fd_nameiat(l, fdat, &nd)) != 0) { pathbuf_destroy(pb); return (error); } vp = nd.ni_vp; if (vp != NULL) { VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); pathbuf_destroy(pb); return (EEXIST); } vattr_null(&vattr); vattr.va_type = VDIR; /* We will read cwdi->cwdi_cmask unlocked. */ vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask; nd.ni_cnd.cn_flags |= WILLBEDIR; error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (!error) vrele(nd.ni_vp); vput(nd.ni_dvp); pathbuf_destroy(pb); return (error); } /* * Remove a directory file. */ /* ARGSUSED */ int sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval) { return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), AT_REMOVEDIR, UIO_USERSPACE); } /* * Read a block of directory entries in a file system independent format. */ int sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(size_t) count; } */ file_t *fp; int error, done; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { error = EBADF; goto out; } error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE, SCARG(uap, count), &done, l, 0, 0); ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error); *retval = done; out: fd_putfile(SCARG(uap, fd)); return (error); } /* * Set the mode mask for creation of filesystem nodes. */ int sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval) { /* { syscallarg(mode_t) newmask; } */ /* * cwdi->cwdi_cmask will be read unlocked elsewhere, and no kind of * serialization with those reads is required. It's important to * return a coherent answer for the caller of umask() though, and * the atomic operation accomplishes that. */ *retval = atomic_swap_uint(&curproc->p_cwdi->cwdi_cmask, SCARG(uap, newmask) & ALLPERMS); return (0); } int dorevoke(struct vnode *vp, kauth_cred_t cred) { struct vattr vattr; int error, fs_decision; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_GETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); if (error != 0) return error; fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 
0 : EPERM; error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL, fs_decision); if (!error) VOP_REVOKE(vp, REVOKEALL); return (error); } /* * Void all references to file by ripping underlying filesystem * away from vnode. */ /* ARGSUSED */ int sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval) { /* { syscallarg(const char *) path; } */ struct vnode *vp; int error; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_TRYEMULROOT, &vp); if (error != 0) return (error); error = dorevoke(vp, l->l_cred); vrele(vp); return (error); } /* * Allocate backing store for a file, filling a hole without having to * explicitly write anything out. */ /* ARGSUSED */ int sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(off_t) pos; syscallarg(off_t) len; } */ int fd; off_t pos, len; struct file *fp; struct vnode *vp; int error; fd = SCARG(uap, fd); pos = SCARG(uap, pos); len = SCARG(uap, len); if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) { *retval = EINVAL; return 0; } error = fd_getvnode(fd, &fp); if (error) { *retval = error; return 0; } if ((fp->f_flag & FWRITE) == 0) { error = EBADF; goto fail; } vp = fp->f_vnode; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) { error = EISDIR; } else { error = VOP_FALLOCATE(vp, pos, len); } VOP_UNLOCK(vp); fail: fd_putfile(fd); *retval = error; return 0; } /* * Deallocate backing store for a file, creating a hole. Also used for * invoking TRIM on disks. */ /* ARGSUSED */ int sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(off_t) pos; syscallarg(off_t) len; } */ int fd; off_t pos, len; struct file *fp; struct vnode *vp; int error; fd = SCARG(uap, fd); pos = SCARG(uap, pos); len = SCARG(uap, len); if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) { return EINVAL; } error = fd_getvnode(fd, &fp); if (error) { return error; } if ((fp->f_flag & FWRITE) == 0) { error = EBADF; goto fail; } vp = fp->f_vnode; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) { error = EISDIR; } else { error = VOP_FDISCARD(vp, pos, len); } VOP_UNLOCK(vp); fail: fd_putfile(fd); return error; }
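/*
 * Illustrative userland sketch (not part of the kernel sources above or
 * below): a minimal caller's-eye view of the UTIME_NOW/UTIME_OMIT handling
 * implemented by do_sys_utimensat() above.  The file name "example.txt" is
 * a placeholder chosen for this example.
 */
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	struct timespec ts[2];

	/* Leave the access time untouched; set the modification time to "now". */
	ts[0].tv_nsec = UTIME_OMIT;	/* kernel skips va_atime for this slot */
	ts[1].tv_nsec = UTIME_NOW;	/* kernel substitutes nanotime() */

	if (utimensat(AT_FDCWD, "example.txt", ts, 0) == -1) {
		perror("utimensat");
		return 1;
	}
	return 0;
}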
/* $NetBSD: ufs_wapbl.h,v 1.19 2020/04/11 17:43:54 jdolecek Exp $ */ /*- * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _UFS_UFS_UFS_WAPBL_H_ #define _UFS_UFS_UFS_WAPBL_H_ #if defined(_KERNEL_OPT) #include "opt_wapbl.h" #endif /* * Information for the journal location stored in the superblock. * We store the journal version, some flags, the journal location * type, and some location specific "locators" that identify where * the log itself is located. */ /* fs->fs_journal_version */ #define UFS_WAPBL_VERSION 1 /* fs->fs_journal_location */ #define UFS_WAPBL_JOURNALLOC_NONE 0 #define UFS_WAPBL_JOURNALLOC_END_PARTITION 1 #define UFS_WAPBL_EPART_ADDR 0 /* locator slots */ #define UFS_WAPBL_EPART_COUNT 1 #define UFS_WAPBL_EPART_BLKSZ 2 #define UFS_WAPBL_EPART_UNUSED 3 #define UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM 2 #define UFS_WAPBL_INFS_ADDR 0 /* locator slots */ #define UFS_WAPBL_INFS_COUNT 1 #define UFS_WAPBL_INFS_BLKSZ 2 #define UFS_WAPBL_INFS_INO 3 /* fs->fs_journal_flags */ #define UFS_WAPBL_FLAGS_CREATE_LOG 0x1 #define UFS_WAPBL_FLAGS_CLEAR_LOG 0x2 /* * The journal size is limited to between 1MB and 64MB. * The default journal size is the filesystem size divided by * the scale factor - this is 1M of journal per 1GB of filesystem * space. * * XXX: Is 64MB too limiting? If user explicitly asks for more, allow it?
*/ #define UFS_WAPBL_JOURNAL_SCALE 1024 #define UFS_WAPBL_MIN_JOURNAL_SIZE (1024 * 1024) #define UFS_WAPBL_MAX_JOURNAL_SIZE (64 * 1024 * 1024) #if defined(WAPBL) static __inline int ufs_wapbl_begin(struct mount *mp, const char *file, int line) { if (mp->mnt_wapbl) { int error; error = wapbl_begin(mp->mnt_wapbl, file, line); if (error) return error; } return 0; } static __inline void ufs_wapbl_end(struct mount *mp) { if (mp->mnt_wapbl) { wapbl_end(mp->mnt_wapbl); } } #define UFS_WAPBL_BEGIN(mp) \ ufs_wapbl_begin(mp, __func__, __LINE__) #define UFS_WAPBL_END(mp) ufs_wapbl_end(mp) #define UFS_WAPBL_UPDATE(vp, access, modify, flags) \ if ((vp)->v_mount->mnt_wapbl) { \ UFS_UPDATE(vp, access, modify, flags); \ } #ifdef DIAGNOSTIC #define UFS_WAPBL_JLOCK_ASSERT(mp) \ if (mp->mnt_wapbl) wapbl_jlock_assert(mp->mnt_wapbl) #define UFS_WAPBL_JUNLOCK_ASSERT(mp) \ if (mp->mnt_wapbl) wapbl_junlock_assert(mp->mnt_wapbl) #else #define UFS_WAPBL_JLOCK_ASSERT(mp) #define UFS_WAPBL_JUNLOCK_ASSERT(mp) #endif #define UFS_WAPBL_REGISTER_INODE(mp, ino, mode) \ if (mp->mnt_wapbl) wapbl_register_inode(mp->mnt_wapbl, ino, mode) #define UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode) \ if (mp->mnt_wapbl) wapbl_unregister_inode(mp->mnt_wapbl, ino, mode) #define UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len, cookiep) \ (mp->mnt_wapbl) \ ? wapbl_register_deallocation(mp->mnt_wapbl, blk, len, \ false, cookiep) \ : 0 #define UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(mp, blk, len) \ ( \ (mp->mnt_wapbl) \ ? wapbl_register_deallocation(mp->mnt_wapbl, blk, len, \ true, NULL) \ : 0 \ ) #define UFS_WAPBL_UNREGISTER_DEALLOCATION(mp, cookie) \ if (mp->mnt_wapbl) wapbl_unregister_deallocation(mp->mnt_wapbl, cookie) #else /* ! WAPBL */ #define UFS_WAPBL_BEGIN(mp) (__USE(mp), 0) #define UFS_WAPBL_END(mp) do { } while (0) #define UFS_WAPBL_UPDATE(vp, access, modify, flags) do { } while (0) #define UFS_WAPBL_JLOCK_ASSERT(mp) #define UFS_WAPBL_JUNLOCK_ASSERT(mp) #define UFS_WAPBL_REGISTER_INODE(mp, ino, mode) do { } while (0) #define UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode) do { } while (0) #define UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len, cookiep) 0 #define UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(mp, blk, len) 0 #define UFS_WAPBL_UNREGISTER_DEALLOCATION(mp, cookie) do { } while (0) #endif #endif /* !_UFS_UFS_UFS_WAPBL_H_ */
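/*
 * Illustrative sketch (not part of ufs_wapbl.h): how the journal sizing
 * constants above combine -- 1 MB of journal per 1 GB of file system,
 * clamped to the [1 MB, 64 MB] range.  The helper name is made up for
 * this example; the real sizing logic lives in the FFS WAPBL code, not
 * in this header.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t
example_journal_size(uint64_t fs_bytes)
{
	uint64_t sz = fs_bytes / 1024;		/* UFS_WAPBL_JOURNAL_SCALE */

	if (sz < 1024 * 1024)			/* UFS_WAPBL_MIN_JOURNAL_SIZE */
		sz = 1024 * 1024;
	if (sz > 64 * 1024 * 1024)		/* UFS_WAPBL_MAX_JOURNAL_SIZE */
		sz = 64 * 1024 * 1024;
	return sz;
}

int
main(void)
{
	/* A 10 GB file system gets a 10 MB journal; 1 TB is capped at 64 MB. */
	printf("%" PRIu64 "\n", example_journal_size(10ULL << 30));
	printf("%" PRIu64 "\n", example_journal_size(1ULL << 40));
	return 0;
}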
/* $NetBSD: prop_number.c,v 1.34 2022/08/03 21:13:46 riastradh Exp $ */ /*- * Copyright (c) 2006, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "prop_object_impl.h" #include <prop/prop_number.h> #include <sys/rbtree.h> #if defined(_KERNEL) #include <sys/systm.h> #elif defined(_STANDALONE) #include <sys/param.h> #include <lib/libkern/libkern.h> #else #include <errno.h> #include <limits.h> #include <stdlib.h> #endif struct _prop_number_value { union { int64_t pnu_signed; uint64_t pnu_unsigned; } pnv_un; #define pnv_signed pnv_un.pnu_signed #define pnv_unsigned pnv_un.pnu_unsigned unsigned int pnv_is_unsigned :1, :31; }; struct _prop_number { struct _prop_object pn_obj; struct rb_node pn_link; struct _prop_number_value pn_value; }; _PROP_POOL_INIT(_prop_number_pool, sizeof(struct _prop_number), "propnmbr") static _prop_object_free_rv_t _prop_number_free(prop_stack_t, prop_object_t *); static bool _prop_number_externalize( struct _prop_object_externalize_context *, void *); static _prop_object_equals_rv_t _prop_number_equals(prop_object_t, prop_object_t, void **, void **, prop_object_t *, prop_object_t *); static void _prop_number_lock(void); static void _prop_number_unlock(void); static const struct _prop_object_type _prop_object_type_number = { .pot_type = PROP_TYPE_NUMBER, .pot_free = _prop_number_free, .pot_extern = _prop_number_externalize, .pot_equals = _prop_number_equals, .pot_lock = _prop_number_lock, .pot_unlock = _prop_number_unlock, }; #define prop_object_is_number(x) \ ((x) != NULL && (x)->pn_obj.po_type == &_prop_object_type_number) /* * Number objects are immutable, and we are likely to have many number * objects that have the same value. So, to save memory, we unique'ify * numbers so we only have one copy of each. */ static int _prop_number_compare_values(const struct _prop_number_value *pnv1, const struct _prop_number_value *pnv2) { /* Signed numbers are sorted before unsigned numbers. */ if (pnv1->pnv_is_unsigned) { if (! 
pnv2->pnv_is_unsigned) return (1); if (pnv1->pnv_unsigned < pnv2->pnv_unsigned) return (-1); if (pnv1->pnv_unsigned > pnv2->pnv_unsigned) return (1); return (0); } if (pnv2->pnv_is_unsigned) return (-1); if (pnv1->pnv_signed < pnv2->pnv_signed) return (-1); if (pnv1->pnv_signed > pnv2->pnv_signed) return (1); return (0); } static int /*ARGSUSED*/ _prop_number_rb_compare_nodes(void *ctx _PROP_ARG_UNUSED, const void *n1, const void *n2) { const struct _prop_number *pn1 = n1; const struct _prop_number *pn2 = n2; return _prop_number_compare_values(&pn1->pn_value, &pn2->pn_value); } static int /*ARGSUSED*/ _prop_number_rb_compare_key(void *ctx _PROP_ARG_UNUSED, const void *n, const void *v) { const struct _prop_number *pn = n; const struct _prop_number_value *pnv = v; return _prop_number_compare_values(&pn->pn_value, pnv); } static const rb_tree_ops_t _prop_number_rb_tree_ops = { .rbto_compare_nodes = _prop_number_rb_compare_nodes, .rbto_compare_key = _prop_number_rb_compare_key, .rbto_node_offset = offsetof(struct _prop_number, pn_link), .rbto_context = NULL }; static struct rb_tree _prop_number_tree; _PROP_MUTEX_DECL_STATIC(_prop_number_tree_mutex) /* ARGSUSED */ static _prop_object_free_rv_t _prop_number_free(prop_stack_t stack, prop_object_t *obj) { prop_number_t pn = *obj; rb_tree_remove_node(&_prop_number_tree, pn); _PROP_POOL_PUT(_prop_number_pool, pn); return (_PROP_OBJECT_FREE_DONE); } _PROP_ONCE_DECL(_prop_number_init_once) static int _prop_number_init(void) { _PROP_MUTEX_INIT(_prop_number_tree_mutex); rb_tree_init(&_prop_number_tree, &_prop_number_rb_tree_ops); return 0; } static void _prop_number_lock(void) { /* XXX: init necessary? */ _PROP_ONCE_RUN(_prop_number_init_once, _prop_number_init); _PROP_MUTEX_LOCK(_prop_number_tree_mutex); } static void _prop_number_unlock(void) { _PROP_MUTEX_UNLOCK(_prop_number_tree_mutex); } static bool _prop_number_externalize(struct _prop_object_externalize_context *ctx, void *v) { prop_number_t pn = v; char tmpstr[32]; /* * For unsigned numbers, we output in hex. For signed numbers, * we output in decimal. */ if (pn->pn_value.pnv_is_unsigned) snprintf(tmpstr, sizeof(tmpstr), "0x%" PRIx64, pn->pn_value.pnv_unsigned); else snprintf(tmpstr, sizeof(tmpstr), "%" PRIi64, pn->pn_value.pnv_signed); if (_prop_object_externalize_start_tag(ctx, "integer") == false || _prop_object_externalize_append_cstring(ctx, tmpstr) == false || _prop_object_externalize_end_tag(ctx, "integer") == false) return (false); return (true); } /* ARGSUSED */ static _prop_object_equals_rv_t _prop_number_equals(prop_object_t v1, prop_object_t v2, void **stored_pointer1, void **stored_pointer2, prop_object_t *next_obj1, prop_object_t *next_obj2) { prop_number_t num1 = v1; prop_number_t num2 = v2; /* * There is only ever one copy of a number object at any given * time, so we can reduce this to a simple pointer equality check * in the common case. */ if (num1 == num2) return (_PROP_OBJECT_EQUALS_TRUE); /* * If the numbers are the same signed-ness, then we know they * cannot be equal because they would have had pointer equality. */ if (num1->pn_value.pnv_is_unsigned == num2->pn_value.pnv_is_unsigned) return (_PROP_OBJECT_EQUALS_FALSE); /* * We now have one signed value and one unsigned value. We can * compare them iff: * - The unsigned value is not larger than the signed value * can represent. * - The signed value is not smaller than the unsigned value * can represent. */ if (num1->pn_value.pnv_is_unsigned) { /* * num1 is unsigned and num2 is signed. 
*/ if (num1->pn_value.pnv_unsigned > INTMAX_MAX) return (_PROP_OBJECT_EQUALS_FALSE); if (num2->pn_value.pnv_signed < 0) return (_PROP_OBJECT_EQUALS_FALSE); } else { /* * num1 is signed and num2 is unsigned. */ if (num1->pn_value.pnv_signed < 0) return (_PROP_OBJECT_EQUALS_FALSE); if (num2->pn_value.pnv_unsigned > INTMAX_MAX) return (_PROP_OBJECT_EQUALS_FALSE); } if (num1->pn_value.pnv_signed == num2->pn_value.pnv_signed) return _PROP_OBJECT_EQUALS_TRUE; else return _PROP_OBJECT_EQUALS_FALSE; } static prop_number_t _prop_number_alloc(const struct _prop_number_value *pnv) { prop_number_t opn, pn, rpn; _PROP_ONCE_RUN(_prop_number_init_once, _prop_number_init); /* * Check to see if this already exists in the tree. If it does, * we just retain it and return it. */ _PROP_MUTEX_LOCK(_prop_number_tree_mutex); opn = rb_tree_find_node(&_prop_number_tree, pnv); if (opn != NULL) { prop_object_retain(opn); _PROP_MUTEX_UNLOCK(_prop_number_tree_mutex); return (opn); } _PROP_MUTEX_UNLOCK(_prop_number_tree_mutex); /* * Not in the tree. Create it now. */ pn = _PROP_POOL_GET(_prop_number_pool); if (pn == NULL) return (NULL); _prop_object_init(&pn->pn_obj, &_prop_object_type_number); pn->pn_value = *pnv; /* * We dropped the mutex when we allocated the new object, so * we have to check again if it is in the tree. */ _PROP_MUTEX_LOCK(_prop_number_tree_mutex); opn = rb_tree_find_node(&_prop_number_tree, pnv); if (opn != NULL) { prop_object_retain(opn); _PROP_MUTEX_UNLOCK(_prop_number_tree_mutex); _PROP_POOL_PUT(_prop_number_pool, pn); return (opn); } rpn = rb_tree_insert_node(&_prop_number_tree, pn); _PROP_ASSERT(rpn == pn); _PROP_MUTEX_UNLOCK(_prop_number_tree_mutex); return (rpn); } /* * prop_number_create_signed -- * Create a prop_number_t and initialize it with the * provided signed value. */ prop_number_t prop_number_create_signed(intmax_t val) { struct _prop_number_value pnv; memset(&pnv, 0, sizeof(pnv)); pnv.pnv_signed = val; pnv.pnv_is_unsigned = false; return (_prop_number_alloc(&pnv)); } _PROP_DEPRECATED(prop_number_create_integer, "this program uses prop_number_create_integer(), " "which is deprecated; use prop_number_create_signed() instead.") prop_number_t prop_number_create_integer(int64_t val) { return prop_number_create_signed(val); } /* * prop_number_create_unsigned -- * Create a prop_number_t and initialize it with the * provided unsigned value. */ prop_number_t prop_number_create_unsigned(uintmax_t val) { struct _prop_number_value pnv; memset(&pnv, 0, sizeof(pnv)); pnv.pnv_unsigned = val; pnv.pnv_is_unsigned = true; return (_prop_number_alloc(&pnv)); } _PROP_DEPRECATED(prop_number_create_unsigned_integer, "this program uses prop_number_create_unsigned_integer(), " "which is deprecated; use prop_number_create_unsigned() instead.") prop_number_t prop_number_create_unsigned_integer(uint64_t val) { return prop_number_create_unsigned(val); } /* * prop_number_copy -- * Copy a prop_number_t. */ prop_number_t prop_number_copy(prop_number_t opn) { if (! prop_object_is_number(opn)) return (NULL); /* * Because we only ever allocate one object for any given * value, this can be reduced to a simple retain operation. */ prop_object_retain(opn); return (opn); } /* * prop_number_unsigned -- * Returns true if the prop_number_t has an unsigned value. */ bool prop_number_unsigned(prop_number_t pn) { return (pn->pn_value.pnv_is_unsigned); } /* * prop_number_size -- * Return the size, in bits, required to hold the value of * the specified number. 
*/ int prop_number_size(prop_number_t pn) { struct _prop_number_value *pnv; if (! prop_object_is_number(pn)) return (0); pnv = &pn->pn_value; if (pnv->pnv_is_unsigned) { if (pnv->pnv_unsigned > UINT32_MAX) return (64); if (pnv->pnv_unsigned > UINT16_MAX) return (32); if (pnv->pnv_unsigned > UINT8_MAX) return (16); return (8); } if (pnv->pnv_signed > INT32_MAX || pnv->pnv_signed < INT32_MIN) return (64); if (pnv->pnv_signed > INT16_MAX || pnv->pnv_signed < INT16_MIN) return (32); if (pnv->pnv_signed > INT8_MAX || pnv->pnv_signed < INT8_MIN) return (16); return (8); } /* * prop_number_signed_value -- * Get the signed value of a prop_number_t. */ intmax_t prop_number_signed_value(prop_number_t pn) { /* * XXX Impossible to distinguish between "not a prop_number_t" * XXX and "prop_number_t has a value of 0". */ if (! prop_object_is_number(pn)) return (0); return (pn->pn_value.pnv_signed); } _PROP_DEPRECATED(prop_number_integer_value, "this program uses prop_number_integer_value(), " "which is deprecated; use prop_number_signed_value() instead.") int64_t prop_number_integer_value(prop_number_t pn) { return prop_number_signed_value(pn); } /* * prop_number_unsigned_value -- * Get the unsigned value of a prop_number_t. */ uintmax_t prop_number_unsigned_value(prop_number_t pn) { /* * XXX Impossible to distinguish between "not a prop_number_t" * XXX and "prop_number_t has a value of 0". */ if (! prop_object_is_number(pn)) return (0); return (pn->pn_value.pnv_unsigned); } _PROP_DEPRECATED(prop_number_unsigned_integer_value, "this program uses prop_number_unsigned_integer_value(), " "which is deprecated; use prop_number_unsigned_value() instead.") uint64_t prop_number_unsigned_integer_value(prop_number_t pn) { return prop_number_unsigned_value(pn); } /* * prop_number_[...]_value -- * Retrieve the bounds-checked value as the specified type. * Returns true if successful. */ #define TEMPLATE(name, typ, minv, maxv) \ bool \ prop_number_ ## name ## _value(prop_number_t pn, typ * const valp) \ { \ \ if (! prop_object_is_number(pn)) \ return (false); \ \ if (pn->pn_value.pnv_is_unsigned) { \ if (pn->pn_value.pnv_unsigned > (maxv)) \ return (false); \ *valp = (typ) pn->pn_value.pnv_unsigned; \ } else { \ if ((pn->pn_value.pnv_signed > 0 && \ (uintmax_t)pn->pn_value.pnv_signed > (maxv)) || \ pn->pn_value.pnv_signed < (minv)) \ return (false); \ *valp = (typ) pn->pn_value.pnv_signed; \ } \ \ return (true); \ } TEMPLATE(schar, signed char, SCHAR_MIN, SCHAR_MAX) TEMPLATE(short, short, SHRT_MIN, SHRT_MAX) TEMPLATE(int, int, INT_MIN, INT_MAX) TEMPLATE(long, long, LONG_MIN, LONG_MAX) TEMPLATE(longlong, long long, LLONG_MIN, LLONG_MAX) TEMPLATE(intptr, intptr_t, INTPTR_MIN, INTPTR_MAX) TEMPLATE(int8, int8_t, INT8_MIN, INT8_MAX) TEMPLATE(int16, int16_t, INT16_MIN, INT16_MAX) TEMPLATE(int32, int32_t, INT32_MIN, INT32_MAX) TEMPLATE(int64, int64_t, INT64_MIN, INT64_MAX) TEMPLATE(uchar, unsigned char, 0, UCHAR_MAX) TEMPLATE(ushort, unsigned short, 0, USHRT_MAX) TEMPLATE(uint, unsigned int, 0, UINT_MAX) TEMPLATE(ulong, unsigned long, 0, ULONG_MAX) TEMPLATE(ulonglong, unsigned long long, 0, ULLONG_MAX) TEMPLATE(uintptr, uintptr_t, 0, UINTPTR_MAX) TEMPLATE(uint8, uint8_t, 0, UINT8_MAX) TEMPLATE(uint16, uint16_t, 0, UINT16_MAX) TEMPLATE(uint32, uint32_t, 0, UINT32_MAX) TEMPLATE(uint64, uint64_t, 0, UINT64_MAX) #undef TEMPLATE /* * prop_number_equals -- * Return true if two numbers are equivalent. 
*/ bool prop_number_equals(prop_number_t num1, prop_number_t num2) { if (!prop_object_is_number(num1) || !prop_object_is_number(num2)) return (false); return (prop_object_equals(num1, num2)); } /* * prop_number_equals_signed -- * Return true if the number is equivalent to the specified signed * value. */ bool prop_number_equals_signed(prop_number_t pn, intmax_t val) { if (! prop_object_is_number(pn)) return (false); if (pn->pn_value.pnv_is_unsigned && (pn->pn_value.pnv_unsigned > INTMAX_MAX || val < 0)) return (false); return (pn->pn_value.pnv_signed == val); } _PROP_DEPRECATED(prop_number_equals_integer, "this program uses prop_number_equals_integer(), " "which is deprecated; use prop_number_equals_signed() instead.") bool prop_number_equals_integer(prop_number_t pn, int64_t val) { return prop_number_equals_signed(pn, val); } /* * prop_number_equals_unsigned -- * Return true if the number is equivalent to the specified * unsigned value. */ bool prop_number_equals_unsigned(prop_number_t pn, uintmax_t val) { if (! prop_object_is_number(pn)) return (false); if (! pn->pn_value.pnv_is_unsigned && (pn->pn_value.pnv_signed < 0 || val > INT64_MAX)) return (false); return (pn->pn_value.pnv_unsigned == val); } _PROP_DEPRECATED(prop_number_equals_unsigned_integer, "this program uses prop_number_equals_unsigned_integer(), " "which is deprecated; use prop_number_equals_unsigned() instead.") bool prop_number_equals_unsigned_integer(prop_number_t pn, uint64_t val) { return prop_number_equals_unsigned(pn, val); } static bool _prop_number_internalize_unsigned(struct _prop_object_internalize_context *ctx, struct _prop_number_value *pnv) { char *cp; _PROP_ASSERT(/*CONSTCOND*/sizeof(unsigned long long) == sizeof(uint64_t)); #ifndef _KERNEL errno = 0; #endif pnv->pnv_unsigned = (uint64_t) strtoull(ctx->poic_cp, &cp, 0); #ifndef _KERNEL /* XXX can't check for ERANGE in the kernel */ if (pnv->pnv_unsigned == UINT64_MAX && errno == ERANGE) return (false); #endif pnv->pnv_is_unsigned = true; ctx->poic_cp = cp; return (true); } static bool _prop_number_internalize_signed(struct _prop_object_internalize_context *ctx, struct _prop_number_value *pnv) { char *cp; _PROP_ASSERT(/*CONSTCOND*/sizeof(long long) == sizeof(int64_t)); #ifndef _KERNEL errno = 0; #endif pnv->pnv_signed = (int64_t) strtoll(ctx->poic_cp, &cp, 0); #ifndef _KERNEL /* XXX can't check for ERANGE in the kernel */ if ((pnv->pnv_signed == INT64_MAX || pnv->pnv_signed == INT64_MIN) && errno == ERANGE) return (false); #endif pnv->pnv_is_unsigned = false; ctx->poic_cp = cp; return (true); } /* * _prop_number_internalize -- * Parse a <number>...</number> and return the object created from * the external representation. */ /* ARGSUSED */ bool _prop_number_internalize(prop_stack_t stack, prop_object_t *obj, struct _prop_object_internalize_context *ctx) { struct _prop_number_value pnv; memset(&pnv, 0, sizeof(pnv)); /* No attributes, no empty elements. */ if (ctx->poic_tagattr != NULL || ctx->poic_is_empty_element) return (true); /* * If the first character is '-', then we treat as signed. * If the first two characters are "0x" (i.e. the number is * in hex), then we treat as unsigned. Otherwise, we try * signed first, and if that fails (presumably due to ERANGE), * then we switch to unsigned. 
*/ if (ctx->poic_cp[0] == '-') { if (_prop_number_internalize_signed(ctx, &pnv) == false) return (true); } else if (ctx->poic_cp[0] == '0' && ctx->poic_cp[1] == 'x') { if (_prop_number_internalize_unsigned(ctx, &pnv) == false) return (true); } else { if (_prop_number_internalize_signed(ctx, &pnv) == false && _prop_number_internalize_unsigned(ctx, &pnv) == false) return (true); } if (_prop_object_internalize_find_tag(ctx, "integer", _PROP_TAG_TYPE_END) == false) return (true); *obj = _prop_number_alloc(&pnv); return (true); }
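/*
 * Illustrative userland sketch (not part of prop_number.c): exercising the
 * number uniquing and the bounds-checked accessors defined above.  Built
 * against libprop (-lprop).  The pointer-equality assertion relies on the
 * "one copy per value" behaviour described in the implementation above.
 */
#include <prop/proplib.h>
#include <assert.h>
#include <stdio.h>

int
main(void)
{
	prop_number_t a, b;
	int v;

	/* Equal values yield the same uniqued object. */
	a = prop_number_create_signed(42);
	b = prop_number_create_signed(42);
	assert(a == b);
	assert(prop_number_equals(a, b));

	/* Bounds-checked extraction: 42 fits in an int. */
	if (prop_number_int_value(a, &v))
		printf("value: %d\n", v);

	prop_object_release(a);
	prop_object_release(b);
	return 0;
}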
/* $NetBSD: tcp_sack.c,v 1.36 2018/05/18 18:58:51 maxv Exp $ */ /* * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Kentaro A. Kurahone. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $ */ /* * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.36 2018/05/18 18:58:51 maxv Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_inet_csum.h" #include "opt_tcp_debug.h" #include "opt_ddb.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/errno.h> #include <sys/syslog.h> #include <sys/pool.h> #include <sys/domain.h> #include <sys/kernel.h> #include <net/if.h> #include <net/route.h> #include <net/if_types.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/in_var.h> #include <netinet/ip_var.h> #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_pcb.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_var.h> #include <netinet/icmp6.h> #endif #ifndef INET6 #include <netinet/ip6.h> #endif #include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_debug.h> /* SACK block pool. */ static struct pool sackhole_pool; void tcp_sack_init(void) { pool_init(&sackhole_pool, sizeof(struct sackhole), 0, 0, 0, "sackholepl", NULL, IPL_SOFTNET); } static struct sackhole * sack_allochole(struct tcpcb *tp) { struct sackhole *hole; if (tp->snd_numholes >= tcp_sack_tp_maxholes || tcp_sack_globalholes >= tcp_sack_globalmaxholes) { return NULL; } hole = pool_get(&sackhole_pool, PR_NOWAIT); if (hole == NULL) { return NULL; } tp->snd_numholes++; tcp_sack_globalholes++; return hole; } static struct sackhole * sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end, struct sackhole *prev) { struct sackhole *hole; hole = sack_allochole(tp); if (hole == NULL) { return NULL; } hole->start = hole->rxmit = start; hole->end = end; if (prev != NULL) { TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q); } else { TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q); } return hole; } static struct sackhole * sack_removehole(struct tcpcb *tp, struct sackhole *hole) { struct sackhole *next; next = TAILQ_NEXT(hole, sackhole_q); tp->snd_numholes--; tcp_sack_globalholes--; TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q); pool_put(&sackhole_pool, hole); return next; } /* * tcp_new_dsack: record the reception of a duplicated segment. 
*/ void tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len) { if (TCP_SACK_ENABLED(tp)) { tp->rcv_dsack_block.left = seq; tp->rcv_dsack_block.right = seq + len; tp->rcv_sack_flags |= TCPSACK_HAVED; } } /* * tcp_sack_option: parse the given SACK option and update the scoreboard. */ void tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp, int optlen) { struct sackblk t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)]; struct sackblk *sack = NULL; struct sackhole *cur = NULL; struct sackhole *tmp = NULL; const char *lp = cp + 2; int i, j, num_sack_blks; tcp_seq left, right, acked; /* * If we aren't processing SACK responses, this is not an ACK * or the peer sends us a sack option with invalid length, don't * update the scoreboard. */ if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) || (optlen % 8 != 2 || optlen < 10)) { return; } /* * If we don't want any SACK holes to be allocated, just return. */ if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) { return; } /* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */ if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max)) return; /* * Extract SACK blocks. * * Note that t_sack_block is sorted so that we only need to do * one pass over the sequence number space. (SACK "fast-path") */ num_sack_blks = optlen / 8; acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una; for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) { memcpy(&left, lp, sizeof(uint32_t)); memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t)); left = ntohl(left); right = ntohl(right); if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) || SEQ_GEQ(left, right)) { /* SACK entry that's old, or invalid. */ i--; num_sack_blks--; continue; } /* Insertion sort. */ for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left); j--) { t_sack_block[j].left = t_sack_block[j - 1].left; t_sack_block[j].right = t_sack_block[j - 1].right; } t_sack_block[j].left = left; t_sack_block[j].right = right; } /* Update the scoreboard. */ cur = TAILQ_FIRST(&tp->snd_holes); for (i = 0; i < num_sack_blks; i++) { sack = &t_sack_block[i]; /* * FACK TCP. Update snd_fack so we can enter Fast * Recovery early. */ if (SEQ_GEQ(sack->right, tp->snd_fack)) tp->snd_fack = sack->right; if (TAILQ_EMPTY(&tp->snd_holes)) { /* First hole. */ cur = sack_inserthole(tp, th->th_ack, sack->left, NULL); if (cur == NULL) { /* ENOBUFS, bail out*/ return; } tp->rcv_lastsack = sack->right; continue; /* With next sack block */ } /* Go through the list of holes. */ while (cur) { if (SEQ_LEQ(sack->right, cur->start)) /* SACKs data before the current hole */ break; /* No use going through more holes */ if (SEQ_GEQ(sack->left, cur->end)) { /* SACKs data beyond the current hole */ cur = TAILQ_NEXT(cur, sackhole_q); continue; } if (SEQ_LEQ(sack->left, cur->start)) { /* Data acks at least the beginning of hole */ if (SEQ_GEQ(sack->right, cur->end)) { /* Acks entire hole, so delete hole */ cur = sack_removehole(tp, cur); break; } /* Otherwise, move start of hole forward */ cur->start = sack->right; cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); break; } if (SEQ_GEQ(sack->right, cur->end)) { /* Move end of hole backward. 
*/ cur->end = sack->left; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); cur = TAILQ_NEXT(cur, sackhole_q); break; } if (SEQ_LT(cur->start, sack->left) && SEQ_GT(cur->end, sack->right)) { /* * ACKs some data in middle of a hole; need to * split current hole */ tmp = sack_inserthole(tp, sack->right, cur->end, cur); if (tmp == NULL) { return; } tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start); cur->end = sack->left; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); cur = tmp; break; } } /* At this point, we have reached the tail of the list. */ if (SEQ_LT(tp->rcv_lastsack, sack->left)) { /* * Need to append new hole at end. */ cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left, NULL); if (cur == NULL) { return; } } if (SEQ_LT(tp->rcv_lastsack, sack->right)) { tp->rcv_lastsack = sack->right; } } } /* * tcp_del_sackholes: remove holes covered by a cumulative ACK. */ void tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th) { /* Max because this could be an older ack that just arrived. */ tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? th->th_ack : tp->snd_una; struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); while (cur) { if (SEQ_LEQ(cur->end, lastack)) { cur = sack_removehole(tp, cur); } else if (SEQ_LT(cur->start, lastack)) { cur->start = lastack; if (SEQ_LT(cur->rxmit, cur->start)) cur->rxmit = cur->start; break; } else break; } } /* * tcp_free_sackholes: clear the scoreboard. */ void tcp_free_sackholes(struct tcpcb *tp) { struct sackhole *sack; /* Free up the SACK hole list. */ while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) { sack_removehole(tp, sack); } KASSERT(tp->snd_numholes == 0); } /* * Returns pointer to a sackhole if there are any pending retransmissions; * NULL otherwise. */ struct sackhole * tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *cur = NULL; if (!TCP_SACK_ENABLED(tp)) return (NULL); *sack_bytes_rexmt = 0; TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) { if (SEQ_LT(cur->rxmit, cur->end)) { if (SEQ_LT(cur->rxmit, tp->snd_una)) { /* old SACK hole */ continue; } *sack_bytes_rexmt += (cur->rxmit - cur->start); break; } *sack_bytes_rexmt += (cur->rxmit - cur->start); } return (cur); } /* * After a timeout, the SACK list may be rebuilt. This SACK information * should be used to avoid retransmitting SACKed data. This function * traverses the SACK list to see if snd_nxt should be moved forward. */ void tcp_sack_adjust(struct tcpcb *tp) { struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); struct sackhole *n = NULL; if (TAILQ_EMPTY(&tp->snd_holes)) return; /* No holes */ if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack)) return; /* We're already beyond any SACKed blocks */ /* * Two cases for which we want to advance snd_nxt: * i) snd_nxt lies between end of one hole and beginning of another * ii) snd_nxt lies between end of last hole and rcv_lastsack */ while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) { if (SEQ_LT(tp->snd_nxt, cur->end)) return; if (SEQ_GEQ(tp->snd_nxt, n->start)) cur = n; else { tp->snd_nxt = n->start; return; } } if (SEQ_LT(tp->snd_nxt, cur->end)) return; tp->snd_nxt = tp->rcv_lastsack; return; } /* * tcp_sack_numblks: return the number of SACK blocks to send. */ int tcp_sack_numblks(const struct tcpcb *tp) { int numblks; if (!TCP_SACK_ENABLED(tp)) { return 0; } numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 
1 : 0) + tp->t_segqlen; if (numblks == 0) { return 0; } if (numblks > TCP_SACK_MAX) { numblks = TCP_SACK_MAX; } return numblks; } #if defined(DDB) void sack_dump(const struct tcpcb *); void sack_dump(const struct tcpcb *tp) { const struct sackhole *cur; printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n", tp->snd_una, tp->snd_max); printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n", tp->rcv_lastsack, tp->snd_fack); printf("numholes=%d\n", tp->snd_numholes); TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) { printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n", cur->start, cur->end, cur->rxmit); } } #endif /* defined(DDB) */
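The SACK scoreboard logic above depends on two things it never spells out in one place: wraparound-safe sequence comparisons (SEQ_LT, SEQ_LEQ, SEQ_GEQ) and holes kept as half-open [start, end) ranges with a rxmit pointer. The following is a minimal user-space sketch of those two ideas, not kernel code: the comparison macros mirror the usual <netinet/tcp_seq.h> definitions, and trim_hole() is a simplified, illustrative stand-in for the front/tail-trim cases of tcp_sack_option(); the real code additionally deletes fully covered holes and splits holes in the middle.

/*
 * Minimal user-space sketch (an illustrative assumption, not the kernel
 * code above): wraparound-safe sequence comparison and trimming one
 * scoreboard hole against a single SACK block.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

typedef uint32_t tcp_seq;

/* Same idea as <netinet/tcp_seq.h>: compare via signed 32-bit difference. */
#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)
#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)

struct hole {			/* [start, end) not yet SACKed */
	tcp_seq start, end, rxmit;
};

/*
 * Trim a hole against one SACK block [left, right).  Only the two
 * partial-overlap cases are shown; tcp_sack_option() also removes fully
 * covered holes and splits holes SACKed in the middle.
 */
static void
trim_hole(struct hole *h, tcp_seq left, tcp_seq right)
{

	if (SEQ_LEQ(left, h->start) && SEQ_LT(right, h->end)) {
		/* Block covers the front of the hole: move start forward. */
		h->start = right;
		if (SEQ_LT(h->rxmit, h->start))
			h->rxmit = h->start;
	} else if (SEQ_GEQ(right, h->end) && SEQ_LT(h->start, left)) {
		/* Block covers the tail of the hole: move end backward. */
		h->end = left;
		if (SEQ_LT(h->end, h->rxmit))
			h->rxmit = h->end;
	}
}

int
main(void)
{
	struct hole h = { 1000, 3000, 1000 };

	/* Sequence space wraps: 0xfffffff0 compares as less than 0x10. */
	printf("SEQ_LT near wrap: %d\n", SEQ_LT(0xfffffff0U, 0x10U));

	trim_hole(&h, 1000, 2000);	/* peer SACKed [1000, 2000) */
	printf("hole now [%" PRIu32 ", %" PRIu32 "), rxmit=%" PRIu32 "\n",
	    h.start, h.end, h.rxmit);
	return 0;
}

Run standalone, this prints that 0xfffffff0 is "less than" 0x10 (the signed-difference trick) and that the hole [1000, 3000) shrinks to [2000, 3000) with rxmit advanced, which is exactly the bookkeeping tcp_sack_option() performs per hole.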
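The uvm_map.c code that follows keeps its map entries in a red-black tree augmented with per-entry gap/maxgap fields (see uvm_rb_gap() and uvm_rb_maxgap() below): each entry caches the free gap to its successor, and each tree node caches the largest gap anywhere in its subtree, so a free-space search can skip subtrees whose maxgap is too small. A minimal sketch of that recurrence over a plain binary-tree node (an illustrative assumption, not the kernel's struct vm_map_entry or rb-tree types) looks like this:

/*
 * Illustrative sketch of the gap/maxgap augmentation used by uvm_map:
 * maxgap(n) = max(gap(n), maxgap(left), maxgap(right)).
 */
#include <stddef.h>

struct node {
	struct node *left, *right;
	unsigned long gap;	/* free space between this entry and the next */
	unsigned long maxgap;	/* largest gap in this subtree (cached) */
};

/* Same recurrence as uvm_rb_maxgap(): our gap vs. the children's caches. */
static unsigned long
node_maxgap(const struct node *n)
{
	unsigned long m = n->gap;

	if (n->left != NULL && n->left->maxgap > m)
		m = n->left->maxgap;
	if (n->right != NULL && n->right->maxgap > m)
		m = n->right->maxgap;
	return m;
}

Because each child already caches its own subtree maximum, the computation is O(1) per node; uvm_rb_fixup() below re-runs it from a changed entry up to the root so the cached values stay consistent after inserts, removals, and rebalancing.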
/* $NetBSD: uvm_map.c,v 1.411 2024/02/09 22:08:38 andvar Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. * * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
*/ /* * uvm_map.c: uvm map operations */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v 1.411 2024/02/09 22:08:38 andvar Exp $"); #include "opt_ddb.h" #include "opt_pax.h" #include "opt_uvmhist.h" #include "opt_uvm.h" #include "opt_sysv.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/mman.h> #include <sys/proc.h> #include <sys/pool.h> #include <sys/kernel.h> #include <sys/mount.h> #include <sys/pax.h> #include <sys/vnode.h> #include <sys/filedesc.h> #include <sys/lockdebug.h> #include <sys/atomic.h> #include <sys/sysctl.h> #ifndef __USER_VA0_IS_SAFE #include <sys/kauth.h> #include "opt_user_va0_disable_default.h" #endif #include <sys/shm.h> #include <uvm/uvm.h> #include <uvm/uvm_readahead.h> #if defined(DDB) || defined(DEBUGPRINT) #include <uvm/uvm_ddb.h> #endif #ifdef UVMHIST #ifndef UVMHIST_MAPHIST_SIZE #define UVMHIST_MAPHIST_SIZE 100 #endif static struct kern_history_ent maphistbuf[UVMHIST_MAPHIST_SIZE]; UVMHIST_DEFINE(maphist) = UVMHIST_INITIALIZER(maphist, maphistbuf); #endif #if !defined(UVMMAP_COUNTERS) #define UVMMAP_EVCNT_DEFINE(name) /* nothing */ #define UVMMAP_EVCNT_INCR(ev) /* nothing */ #define UVMMAP_EVCNT_DECR(ev) /* nothing */ #else /* defined(UVMMAP_NOCOUNTERS) */ #include <sys/evcnt.h> #define UVMMAP_EVCNT_DEFINE(name) \ struct evcnt uvmmap_evcnt_##name = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, \ "uvmmap", #name); \ EVCNT_ATTACH_STATIC(uvmmap_evcnt_##name); #define UVMMAP_EVCNT_INCR(ev) uvmmap_evcnt_##ev.ev_count++ #define UVMMAP_EVCNT_DECR(ev) uvmmap_evcnt_##ev.ev_count-- #endif /* defined(UVMMAP_NOCOUNTERS) */ UVMMAP_EVCNT_DEFINE(ubackmerge) UVMMAP_EVCNT_DEFINE(uforwmerge) UVMMAP_EVCNT_DEFINE(ubimerge) UVMMAP_EVCNT_DEFINE(unomerge) UVMMAP_EVCNT_DEFINE(kbackmerge) UVMMAP_EVCNT_DEFINE(kforwmerge) UVMMAP_EVCNT_DEFINE(kbimerge) UVMMAP_EVCNT_DEFINE(knomerge) UVMMAP_EVCNT_DEFINE(map_call) UVMMAP_EVCNT_DEFINE(mlk_call) UVMMAP_EVCNT_DEFINE(mlk_hint) UVMMAP_EVCNT_DEFINE(mlk_tree) UVMMAP_EVCNT_DEFINE(mlk_treeloop) const char vmmapbsy[] = "vmmapbsy"; /* * cache for dynamically-allocated map entries. */ static struct pool_cache uvm_map_entry_cache; #ifdef PMAP_GROWKERNEL /* * This global represents the end of the kernel virtual address * space. If we want to exceed this, we must grow the kernel * virtual address space dynamically. * * Note, this variable is locked by kernel_map's lock. 
*/ vaddr_t uvm_maxkaddr; #endif #ifndef __USER_VA0_IS_SAFE #ifndef __USER_VA0_DISABLE_DEFAULT #define __USER_VA0_DISABLE_DEFAULT 1 #endif #ifdef USER_VA0_DISABLE_DEFAULT /* kernel config option overrides */ #undef __USER_VA0_DISABLE_DEFAULT #define __USER_VA0_DISABLE_DEFAULT USER_VA0_DISABLE_DEFAULT #endif int user_va0_disable = __USER_VA0_DISABLE_DEFAULT; #endif /* * macros */ /* * uvm_map_align_va: round down or up virtual address */ static __inline void uvm_map_align_va(vaddr_t *vap, vsize_t align, int topdown) { KASSERT(powerof2(align)); if (align != 0 && (*vap & (align - 1)) != 0) { if (topdown) *vap = rounddown2(*vap, align); else *vap = roundup2(*vap, align); } } /* * UVM_ET_ISCOMPATIBLE: check some requirements for map entry merging */ extern struct vm_map *pager_map; #define UVM_ET_ISCOMPATIBLE(ent, type, uobj, meflags, \ prot, maxprot, inh, adv, wire) \ ((ent)->etype == (type) && \ (((ent)->flags ^ (meflags)) & (UVM_MAP_NOMERGE)) == 0 && \ (ent)->object.uvm_obj == (uobj) && \ (ent)->protection == (prot) && \ (ent)->max_protection == (maxprot) && \ (ent)->inheritance == (inh) && \ (ent)->advice == (adv) && \ (ent)->wired_count == (wire)) /* * uvm_map_entry_link: insert entry into a map * * => map must be locked */ #define uvm_map_entry_link(map, after_where, entry) do { \ uvm_mapent_check(entry); \ (map)->nentries++; \ (entry)->prev = (after_where); \ (entry)->next = (after_where)->next; \ (entry)->prev->next = (entry); \ (entry)->next->prev = (entry); \ uvm_rb_insert((map), (entry)); \ } while (/*CONSTCOND*/ 0) /* * uvm_map_entry_unlink: remove entry from a map * * => map must be locked */ #define uvm_map_entry_unlink(map, entry) do { \ KASSERT((entry) != (map)->first_free); \ KASSERT((entry) != (map)->hint); \ uvm_mapent_check(entry); \ (map)->nentries--; \ (entry)->next->prev = (entry)->prev; \ (entry)->prev->next = (entry)->next; \ uvm_rb_remove((map), (entry)); \ } while (/*CONSTCOND*/ 0) /* * SAVE_HINT: saves the specified entry as the hint for future lookups. * * => map need not be locked. */ #define SAVE_HINT(map, check, value) do { \ if ((map)->hint == (check)) \ (map)->hint = (value); \ } while (/*CONSTCOND*/ 0) /* * clear_hints: ensure that hints don't point to the entry. * * => map must be write-locked. 
*/ static void clear_hints(struct vm_map *map, struct vm_map_entry *ent) { SAVE_HINT(map, ent, ent->prev); if (map->first_free == ent) { map->first_free = ent->prev; } } /* * VM_MAP_RANGE_CHECK: check and correct range * * => map must at least be read locked */ #define VM_MAP_RANGE_CHECK(map, start, end) do { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } while (/*CONSTCOND*/ 0) /* * local prototypes */ static struct vm_map_entry * uvm_mapent_alloc(struct vm_map *, int); static void uvm_mapent_copy(struct vm_map_entry *, struct vm_map_entry *); static void uvm_mapent_free(struct vm_map_entry *); #if defined(DEBUG) static void _uvm_mapent_check(const struct vm_map_entry *, int); #define uvm_mapent_check(map) _uvm_mapent_check(map, __LINE__) #else /* defined(DEBUG) */ #define uvm_mapent_check(e) /* nothing */ #endif /* defined(DEBUG) */ static void uvm_map_entry_unwire(struct vm_map *, struct vm_map_entry *); static void uvm_map_reference_amap(struct vm_map_entry *, int); static int uvm_map_space_avail(vaddr_t *, vsize_t, voff_t, vsize_t, int, int, struct vm_map_entry *); static void uvm_map_unreference_amap(struct vm_map_entry *, int); int _uvm_map_sanity(struct vm_map *); int _uvm_tree_sanity(struct vm_map *); static vsize_t uvm_rb_maxgap(const struct vm_map_entry *); #define ROOT_ENTRY(map) ((struct vm_map_entry *)(map)->rb_tree.rbt_root) #define LEFT_ENTRY(entry) ((struct vm_map_entry *)(entry)->rb_node.rb_left) #define RIGHT_ENTRY(entry) ((struct vm_map_entry *)(entry)->rb_node.rb_right) #define PARENT_ENTRY(map, entry) \ (ROOT_ENTRY(map) == (entry) \ ? NULL : (struct vm_map_entry *)RB_FATHER(&(entry)->rb_node)) /* * These get filled in if/when SYSVSHM shared memory code is loaded * * We do this with function pointers rather the #ifdef SYSVSHM so the * SYSVSHM code can be loaded and unloaded */ void (*uvm_shmexit)(struct vmspace *) = NULL; void (*uvm_shmfork)(struct vmspace *, struct vmspace *) = NULL; static int uvm_map_compare_nodes(void *ctx, const void *nparent, const void *nkey) { const struct vm_map_entry *eparent = nparent; const struct vm_map_entry *ekey = nkey; KASSERT(eparent->start < ekey->start || eparent->start >= ekey->end); KASSERT(ekey->start < eparent->start || ekey->start >= eparent->end); if (eparent->start < ekey->start) return -1; if (eparent->end >= ekey->start) return 1; return 0; } static int uvm_map_compare_key(void *ctx, const void *nparent, const void *vkey) { const struct vm_map_entry *eparent = nparent; const vaddr_t va = *(const vaddr_t *) vkey; if (eparent->start < va) return -1; if (eparent->end >= va) return 1; return 0; } static const rb_tree_ops_t uvm_map_tree_ops = { .rbto_compare_nodes = uvm_map_compare_nodes, .rbto_compare_key = uvm_map_compare_key, .rbto_node_offset = offsetof(struct vm_map_entry, rb_node), .rbto_context = NULL }; /* * uvm_rb_gap: return the gap size between our entry and next entry. */ static inline vsize_t uvm_rb_gap(const struct vm_map_entry *entry) { KASSERT(entry->next != NULL); return entry->next->start - entry->end; } static vsize_t uvm_rb_maxgap(const struct vm_map_entry *entry) { struct vm_map_entry *child; vsize_t maxgap = entry->gap; /* * We need maxgap to be the largest gap of us or any of our * descendents. Since each of our children's maxgap is the * cached value of their largest gap of themselves or their * descendents, we can just use that value and avoid recursing * down the tree to calculate it. 
*/ if ((child = LEFT_ENTRY(entry)) != NULL && maxgap < child->maxgap) maxgap = child->maxgap; if ((child = RIGHT_ENTRY(entry)) != NULL && maxgap < child->maxgap) maxgap = child->maxgap; return maxgap; } static void uvm_rb_fixup(struct vm_map *map, struct vm_map_entry *entry) { struct vm_map_entry *parent; KASSERT(entry->gap == uvm_rb_gap(entry)); entry->maxgap = uvm_rb_maxgap(entry); while ((parent = PARENT_ENTRY(map, entry)) != NULL) { struct vm_map_entry *brother; vsize_t maxgap = parent->gap; unsigned int which; KDASSERT(parent->gap == uvm_rb_gap(parent)); if (maxgap < entry->maxgap) maxgap = entry->maxgap; /* * Since we work towards the root, we know entry's maxgap * value is OK, but its brothers may now be out-of-date due * to rebalancing. So refresh it. */ which = RB_POSITION(&entry->rb_node) ^ RB_DIR_OTHER; brother = (struct vm_map_entry *)parent->rb_node.rb_nodes[which]; if (brother != NULL) { KDASSERT(brother->gap == uvm_rb_gap(brother)); brother->maxgap = uvm_rb_maxgap(brother); if (maxgap < brother->maxgap) maxgap = brother->maxgap; } parent->maxgap = maxgap; entry = parent; } } static void uvm_rb_insert(struct vm_map *map, struct vm_map_entry *entry) { struct vm_map_entry *ret __diagused; entry->gap = entry->maxgap = uvm_rb_gap(entry); if (entry->prev != &map->header) entry->prev->gap = uvm_rb_gap(entry->prev); ret = rb_tree_insert_node(&map->rb_tree, entry); KASSERTMSG(ret == entry, "uvm_rb_insert: map %p: duplicate entry %p", map, ret); /* * If the previous entry is not our immediate left child, then it's an * ancestor and will be fixed up on the way to the root. We don't * have to check entry->prev against &map->header since &map->header * will never be in the tree. */ uvm_rb_fixup(map, LEFT_ENTRY(entry) == entry->prev ? entry->prev : entry); } static void uvm_rb_remove(struct vm_map *map, struct vm_map_entry *entry) { struct vm_map_entry *prev_parent = NULL, *next_parent = NULL; /* * If we are removing an interior node, then an adjacent node will * be used to replace its position in the tree. Therefore we will * need to fixup the tree starting at the parent of the replacement * node. So record their parents for later use. */ if (entry->prev != &map->header) prev_parent = PARENT_ENTRY(map, entry->prev); if (entry->next != &map->header) next_parent = PARENT_ENTRY(map, entry->next); rb_tree_remove_node(&map->rb_tree, entry); /* * If the previous node has a new parent, fixup the tree starting * at the previous node's old parent. */ if (entry->prev != &map->header) { /* * Update the previous entry's gap due to our absence. */ entry->prev->gap = uvm_rb_gap(entry->prev); uvm_rb_fixup(map, entry->prev); if (prev_parent != NULL && prev_parent != entry && prev_parent != PARENT_ENTRY(map, entry->prev)) uvm_rb_fixup(map, prev_parent); } /* * If the next node has a new parent, fixup the tree starting * at the next node's old parent. 
*/ if (entry->next != &map->header) { uvm_rb_fixup(map, entry->next); if (next_parent != NULL && next_parent != entry && next_parent != PARENT_ENTRY(map, entry->next)) uvm_rb_fixup(map, next_parent); } } #if defined(DEBUG) int uvm_debug_check_map = 0; int uvm_debug_check_rbtree = 0; #define uvm_map_check(map, name) \ _uvm_map_check((map), (name), __FILE__, __LINE__) static void _uvm_map_check(struct vm_map *map, const char *name, const char *file, int line) { if ((uvm_debug_check_map && _uvm_map_sanity(map)) || (uvm_debug_check_rbtree && _uvm_tree_sanity(map))) { panic("uvm_map_check failed: \"%s\" map=%p (%s:%d)", name, map, file, line); } } #else /* defined(DEBUG) */ #define uvm_map_check(map, name) /* nothing */ #endif /* defined(DEBUG) */ #if defined(DEBUG) || defined(DDB) int _uvm_map_sanity(struct vm_map *map) { bool first_free_found = false; bool hint_found = false; const struct vm_map_entry *e; struct vm_map_entry *hint = map->hint; e = &map->header; for (;;) { if (map->first_free == e) { first_free_found = true; } else if (!first_free_found && e->next->start > e->end) { printf("first_free %p should be %p\n", map->first_free, e); return -1; } if (hint == e) { hint_found = true; } e = e->next; if (e == &map->header) { break; } } if (!first_free_found) { printf("stale first_free\n"); return -1; } if (!hint_found) { printf("stale hint\n"); return -1; } return 0; } int _uvm_tree_sanity(struct vm_map *map) { struct vm_map_entry *tmp, *trtmp; int n = 0, i = 1; for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) { if (tmp->gap != uvm_rb_gap(tmp)) { printf("%d/%d gap %#lx != %#lx %s\n", n + 1, map->nentries, (ulong)tmp->gap, (ulong)uvm_rb_gap(tmp), tmp->next == &map->header ? "(last)" : ""); goto error; } /* * If any entries are out of order, tmp->gap will be unsigned * and will likely exceed the size of the map. */ if (tmp->gap >= vm_map_max(map) - vm_map_min(map)) { printf("too large gap %zu\n", (size_t)tmp->gap); goto error; } n++; } if (n != map->nentries) { printf("nentries: %d vs %d\n", n, map->nentries); goto error; } trtmp = NULL; for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) { if (tmp->maxgap != uvm_rb_maxgap(tmp)) { printf("maxgap %#lx != %#lx\n", (ulong)tmp->maxgap, (ulong)uvm_rb_maxgap(tmp)); goto error; } if (trtmp != NULL && trtmp->start >= tmp->start) { printf("corrupt: 0x%"PRIxVADDR"x >= 0x%"PRIxVADDR"x\n", trtmp->start, tmp->start); goto error; } trtmp = tmp; } for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next, i++) { trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_LEFT); if (trtmp == NULL) trtmp = &map->header; if (tmp->prev != trtmp) { printf("lookup: %d: %p->prev=%p: %p\n", i, tmp, tmp->prev, trtmp); goto error; } trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_RIGHT); if (trtmp == NULL) trtmp = &map->header; if (tmp->next != trtmp) { printf("lookup: %d: %p->next=%p: %p\n", i, tmp, tmp->next, trtmp); goto error; } trtmp = rb_tree_find_node(&map->rb_tree, &tmp->start); if (trtmp != tmp) { printf("lookup: %d: %p - %p: %p\n", i, tmp, trtmp, PARENT_ENTRY(map, tmp)); goto error; } } return (0); error: return (-1); } #endif /* defined(DEBUG) || defined(DDB) */ /* * vm_map_lock: acquire an exclusive (write) lock on a map. * * => The locking protocol provides for guaranteed upgrade from shared -> * exclusive by whichever thread currently has the map marked busy. * See "LOCKING PROTOCOL NOTES" in uvm_map.h. This is horrible; among * other problems, it defeats any fairness guarantees provided by RW * locks. 
*/ void vm_map_lock(struct vm_map *map) { for (;;) { rw_enter(&map->lock, RW_WRITER); if (map->busy == NULL || map->busy == curlwp) { break; } mutex_enter(&map->misc_lock); rw_exit(&map->lock); if (map->busy != NULL) { cv_wait(&map->cv, &map->misc_lock); } mutex_exit(&map->misc_lock); } map->timestamp++; } /* * vm_map_lock_try: try to lock a map, failing if it is already locked. */ bool vm_map_lock_try(struct vm_map *map) { if (!rw_tryenter(&map->lock, RW_WRITER)) { return false; } if (map->busy != NULL) { rw_exit(&map->lock); return false; } map->timestamp++; return true; } /* * vm_map_unlock: release an exclusive lock on a map. */ void vm_map_unlock(struct vm_map *map) { KASSERT(rw_write_held(&map->lock)); KASSERT(map->busy == NULL || map->busy == curlwp); rw_exit(&map->lock); } /* * vm_map_unbusy: mark the map as unbusy, and wake any waiters that * want an exclusive lock. */ void vm_map_unbusy(struct vm_map *map) { KASSERT(map->busy == curlwp); /* * Safe to clear 'busy' and 'waiters' with only a read lock held: * * o they can only be set with a write lock held * o writers are blocked out with a read or write hold * o at any time, only one thread owns the set of values */ mutex_enter(&map->misc_lock); map->busy = NULL; cv_broadcast(&map->cv); mutex_exit(&map->misc_lock); } /* * vm_map_lock_read: acquire a shared (read) lock on a map. */ void vm_map_lock_read(struct vm_map *map) { rw_enter(&map->lock, RW_READER); } /* * vm_map_unlock_read: release a shared lock on a map. */ void vm_map_unlock_read(struct vm_map *map) { rw_exit(&map->lock); } /* * vm_map_busy: mark a map as busy. * * => the caller must hold the map write locked */ void vm_map_busy(struct vm_map *map) { KASSERT(rw_write_held(&map->lock)); KASSERT(map->busy == NULL); map->busy = curlwp; } /* * vm_map_locked_p: return true if the map is write locked. * * => only for debug purposes like KASSERTs. * => should not be used to verify that a map is not locked. */ bool vm_map_locked_p(struct vm_map *map) { return rw_write_held(&map->lock); } /* * uvm_mapent_alloc: allocate a map entry */ static struct vm_map_entry * uvm_mapent_alloc(struct vm_map *map, int flags) { struct vm_map_entry *me; int pflags = (flags & UVM_FLAG_NOWAIT) ? 
PR_NOWAIT : PR_WAITOK; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); me = pool_cache_get(&uvm_map_entry_cache, pflags); if (__predict_false(me == NULL)) { return NULL; } me->flags = 0; UVMHIST_LOG(maphist, "<- new entry=%#jx [kentry=%jd]", (uintptr_t)me, (map == kernel_map), 0, 0); return me; } /* * uvm_mapent_free: free map entry */ static void uvm_mapent_free(struct vm_map_entry *me) { UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"<- freeing map entry=%#jx [flags=%#jx]", (uintptr_t)me, me->flags, 0, 0); pool_cache_put(&uvm_map_entry_cache, me); } /* * uvm_mapent_copy: copy a map entry, preserving flags */ static inline void uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst) { memcpy(dst, src, sizeof(*dst)); dst->flags = 0; } #if defined(DEBUG) static void _uvm_mapent_check(const struct vm_map_entry *entry, int line) { if (entry->start >= entry->end) { goto bad; } if (UVM_ET_ISOBJ(entry)) { if (entry->object.uvm_obj == NULL) { goto bad; } } else if (UVM_ET_ISSUBMAP(entry)) { if (entry->object.sub_map == NULL) { goto bad; } } else { if (entry->object.uvm_obj != NULL || entry->object.sub_map != NULL) { goto bad; } } if (!UVM_ET_ISOBJ(entry)) { if (entry->offset != 0) { goto bad; } } return; bad: panic("%s: bad entry %p, line %d", __func__, entry, line); } #endif /* defined(DEBUG) */ /* * uvm_map_entry_unwire: unwire a map entry * * => map should be locked by caller */ static inline void uvm_map_entry_unwire(struct vm_map *map, struct vm_map_entry *entry) { entry->wired_count = 0; uvm_fault_unwire_locked(map, entry->start, entry->end); } /* * wrapper for calling amap_ref() */ static inline void uvm_map_reference_amap(struct vm_map_entry *entry, int flags) { amap_ref(entry->aref.ar_amap, entry->aref.ar_pageoff, (entry->end - entry->start) >> PAGE_SHIFT, flags); } /* * wrapper for calling amap_unref() */ static inline void uvm_map_unreference_amap(struct vm_map_entry *entry, int flags) { amap_unref(entry->aref.ar_amap, entry->aref.ar_pageoff, (entry->end - entry->start) >> PAGE_SHIFT, flags); } /* * uvm_map_init: init mapping system at boot time. */ void uvm_map_init(void) { /* * first, init logging system. */ UVMHIST_FUNC(__func__); UVMHIST_LINK_STATIC(maphist); UVMHIST_LINK_STATIC(pdhist); UVMHIST_CALLED(maphist); UVMHIST_LOG(maphist,"<starting uvm map system>", 0, 0, 0, 0); /* * initialize the global lock for kernel map entry. */ mutex_init(&uvm_kentry_lock, MUTEX_DRIVER, IPL_VM); } /* * uvm_map_init_caches: init mapping system caches. */ void uvm_map_init_caches(void) { /* * initialize caches. */ pool_cache_bootstrap(&uvm_map_entry_cache, sizeof(struct vm_map_entry), coherency_unit, 0, PR_LARGECACHE, "vmmpepl", NULL, IPL_NONE, NULL, NULL, NULL); } /* * clippers */ /* * uvm_mapent_splitadj: adjust map entries for splitting, after uvm_mapent_copy. */ static void uvm_mapent_splitadj(struct vm_map_entry *entry1, struct vm_map_entry *entry2, vaddr_t splitat) { vaddr_t adj; KASSERT(entry1->start < splitat); KASSERT(splitat < entry1->end); adj = splitat - entry1->start; entry1->end = entry2->start = splitat; if (entry1->aref.ar_amap) { amap_splitref(&entry1->aref, &entry2->aref, adj); } if (UVM_ET_ISSUBMAP(entry1)) { /* ... 
unlikely to happen, but play it safe */ uvm_map_reference(entry1->object.sub_map); } else if (UVM_ET_ISOBJ(entry1)) { KASSERT(entry1->object.uvm_obj != NULL); /* suppress coverity */ entry2->offset += adj; if (entry1->object.uvm_obj->pgops && entry1->object.uvm_obj->pgops->pgo_reference) entry1->object.uvm_obj->pgops->pgo_reference( entry1->object.uvm_obj); } } /* * uvm_map_clip_start: ensure that the entry begins at or after * the starting address, if it doesn't we split the entry. * * => caller should use UVM_MAP_CLIP_START macro rather than calling * this directly * => map must be locked by caller */ void uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry, vaddr_t start) { struct vm_map_entry *new_entry; /* uvm_map_simplify_entry(map, entry); */ /* XXX */ uvm_map_check(map, "clip_start entry"); uvm_mapent_check(entry); /* * Split off the front portion. note that we must insert the new * entry BEFORE this one, so that this entry has the specified * starting address. */ new_entry = uvm_mapent_alloc(map, 0); uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */ uvm_mapent_splitadj(new_entry, entry, start); uvm_map_entry_link(map, entry->prev, new_entry); uvm_map_check(map, "clip_start leave"); } /* * uvm_map_clip_end: ensure that the entry ends at or before * the ending address, if it doesn't we split the entry. * * => caller should use UVM_MAP_CLIP_END macro rather than calling * this directly * => map must be locked by caller */ void uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t end) { struct vm_map_entry *new_entry; uvm_map_check(map, "clip_end entry"); uvm_mapent_check(entry); /* * Create a new entry and insert it * AFTER the specified entry */ new_entry = uvm_mapent_alloc(map, 0); uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */ uvm_mapent_splitadj(entry, new_entry, end); uvm_map_entry_link(map, entry, new_entry); uvm_map_check(map, "clip_end leave"); } /* * M A P - m a i n e n t r y p o i n t */ /* * uvm_map: establish a valid mapping in a map * * => assume startp is page aligned. * => assume size is a multiple of PAGE_SIZE. * => assume sys_mmap provides enough of a "hint" to have us skip * over text/data/bss area. * => map must be unlocked (we will lock it) * => <uobj,uoffset> value meanings (4 cases): * [1] <NULL,uoffset> == uoffset is a hint for PMAP_PREFER * [2] <NULL,UVM_UNKNOWN_OFFSET> == don't PMAP_PREFER * [3] <uobj,uoffset> == normal mapping * [4] <uobj,UVM_UNKNOWN_OFFSET> == uvm_map finds offset based on VA * * case [4] is for kernel mappings where we don't know the offset until * we've found a virtual address. note that kernel object offsets are * always relative to vm_map_min(kernel_map). * * => if `align' is non-zero, we align the virtual address to the specified * alignment. * this is provided as a mechanism for large pages. * * => XXXCDC: need way to map in external amap? */ int uvm_map(struct vm_map *map, vaddr_t *startp /* IN/OUT */, vsize_t size, struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags) { struct uvm_map_args args; struct vm_map_entry *new_entry; int error; KASSERT((size & PAGE_MASK) == 0); KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0); /* * for pager_map, allocate the new entry first to avoid sleeping * for memory while we have the map locked.
*/ new_entry = NULL; if (map == pager_map) { new_entry = uvm_mapent_alloc(map, (flags & UVM_FLAG_NOWAIT)); if (__predict_false(new_entry == NULL)) return ENOMEM; } if (map == pager_map) flags |= UVM_FLAG_NOMERGE; error = uvm_map_prepare(map, *startp, size, uobj, uoffset, align, flags, &args); if (!error) { error = uvm_map_enter(map, &args, new_entry); *startp = args.uma_start; } else if (new_entry) { uvm_mapent_free(new_entry); } #if defined(DEBUG) if (!error && VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) { uvm_km_check_empty(map, *startp, *startp + size); } #endif /* defined(DEBUG) */ return error; } /* * uvm_map_prepare: * * called with map unlocked. * on success, returns the map locked. */ int uvm_map_prepare(struct vm_map *map, vaddr_t start, vsize_t size, struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags, struct uvm_map_args *args) { struct vm_map_entry *prev_entry; vm_prot_t prot = UVM_PROTECTION(flags); vm_prot_t maxprot = UVM_MAXPROTECTION(flags); UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%jx, flags=%#jx)", (uintptr_t)map, start, size, flags); UVMHIST_LOG(maphist, " uobj/offset %#jx/%jd", (uintptr_t)uobj, uoffset,0,0); /* * detect a popular device driver bug. */ KASSERT(doing_shutdown || curlwp != NULL); /* * zero-sized mapping doesn't make any sense. */ KASSERT(size > 0); KASSERT((~flags & (UVM_FLAG_NOWAIT | UVM_FLAG_WAITVA)) != 0); uvm_map_check(map, "map entry"); /* * check sanity of protection code */ if ((prot & maxprot) != prot) { UVMHIST_LOG(maphist, "<- prot. failure: prot=%#jx, max=%#jx", prot, maxprot,0,0); return EACCES; } /* * figure out where to put new VM range */ retry: if (vm_map_lock_try(map) == false) { if ((flags & UVM_FLAG_TRYLOCK) != 0) { return EAGAIN; } vm_map_lock(map); /* could sleep here */ } if (flags & UVM_FLAG_UNMAP) { KASSERT(flags & UVM_FLAG_FIXED); KASSERT((flags & UVM_FLAG_NOWAIT) == 0); /* * Set prev_entry to what it will need to be after any existing * entries are removed later in uvm_map_enter(). */ if (uvm_map_lookup_entry(map, start, &prev_entry)) { if (start == prev_entry->start) prev_entry = prev_entry->prev; else UVM_MAP_CLIP_END(map, prev_entry, start); SAVE_HINT(map, map->hint, prev_entry); } } else { prev_entry = uvm_map_findspace(map, start, size, &start, uobj, uoffset, align, flags); } if (prev_entry == NULL) { unsigned int timestamp; timestamp = map->timestamp; UVMHIST_LOG(maphist,"waiting va timestamp=%#jx", timestamp,0,0,0); map->flags |= VM_MAP_WANTVA; vm_map_unlock(map); /* * try to reclaim kva and wait until someone does unmap. * fragile locking here, so we awaken every second to * recheck the condition. */ mutex_enter(&map->misc_lock); while ((map->flags & VM_MAP_WANTVA) != 0 && map->timestamp == timestamp) { if ((flags & UVM_FLAG_WAITVA) == 0) { mutex_exit(&map->misc_lock); UVMHIST_LOG(maphist, "<- uvm_map_findspace failed!", 0,0,0,0); return ENOMEM; } else { cv_timedwait(&map->cv, &map->misc_lock, hz); } } mutex_exit(&map->misc_lock); goto retry; } #ifdef PMAP_GROWKERNEL /* * If the kernel pmap can't map the requested space, * then allocate more resources for it. */ if (map == kernel_map && uvm_maxkaddr < (start + size)) uvm_maxkaddr = pmap_growkernel(start + size); #endif UVMMAP_EVCNT_INCR(map_call); /* * if uobj is null, then uoffset is either a VAC hint for PMAP_PREFER * [typically from uvm_map_reserve] or it is UVM_UNKNOWN_OFFSET. 
in * either case we want to zero it before storing it in the map entry * (because it looks strange and confusing when debugging...) * * if uobj is not null * if uoffset is not UVM_UNKNOWN_OFFSET then we have a normal mapping * and we do not need to change uoffset. * if uoffset is UVM_UNKNOWN_OFFSET then we need to find the offset * now (based on the starting address of the map). this case is * for kernel object mappings where we don't know the offset until * the virtual address is found (with uvm_map_findspace). the * offset is the distance we are from the start of the map. */ if (uobj == NULL) { uoffset = 0; } else { if (uoffset == UVM_UNKNOWN_OFFSET) { KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj)); uoffset = start - vm_map_min(kernel_map); } } args->uma_flags = flags; args->uma_prev = prev_entry; args->uma_start = start; args->uma_size = size; args->uma_uobj = uobj; args->uma_uoffset = uoffset; UVMHIST_LOG(maphist, "<- done!", 0,0,0,0); return 0; } /* * uvm_map_enter: * * called with map locked. * unlock the map before returning. */ int uvm_map_enter(struct vm_map *map, const struct uvm_map_args *args, struct vm_map_entry *new_entry) { struct vm_map_entry *prev_entry = args->uma_prev; struct vm_map_entry *dead = NULL, *dead_entries = NULL; const uvm_flag_t flags = args->uma_flags; const vm_prot_t prot = UVM_PROTECTION(flags); const vm_prot_t maxprot = UVM_MAXPROTECTION(flags); const vm_inherit_t inherit = UVM_INHERIT(flags); const int amapwaitflag = (flags & UVM_FLAG_NOWAIT) ? AMAP_EXTEND_NOWAIT : 0; const int advice = UVM_ADVICE(flags); vaddr_t start = args->uma_start; vsize_t size = args->uma_size; struct uvm_object *uobj = args->uma_uobj; voff_t uoffset = args->uma_uoffset; const int kmap = (vm_map_pmap(map) == pmap_kernel()); int merged = 0; int error; int newetype; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%ju, flags=%#jx)", (uintptr_t)map, start, size, flags); UVMHIST_LOG(maphist, " uobj/offset %#jx/%jd", (uintptr_t)uobj, uoffset,0,0); KASSERT(map->hint == prev_entry); /* bimerge case assumes this */ KASSERT(vm_map_locked_p(map)); KASSERT((flags & (UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP)) != (UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP)); if (uobj) newetype = UVM_ET_OBJ; else newetype = 0; if (flags & UVM_FLAG_COPYONW) { newetype |= UVM_ET_COPYONWRITE; if ((flags & UVM_FLAG_OVERLAY) == 0) newetype |= UVM_ET_NEEDSCOPY; } /* * For mappings with unmap, remove any old entries now. Adding the new * entry cannot fail because that can only happen if UVM_FLAG_NOWAIT * is set, and we do not support nowait and unmap together. */ if (flags & UVM_FLAG_UNMAP) { KASSERT(flags & UVM_FLAG_FIXED); uvm_unmap_remove(map, start, start + size, &dead_entries, 0); #ifdef DEBUG struct vm_map_entry *tmp_entry __diagused; bool rv __diagused; rv = uvm_map_lookup_entry(map, start, &tmp_entry); KASSERT(!rv); KASSERTMSG(prev_entry == tmp_entry, "args %p prev_entry %p tmp_entry %p", args, prev_entry, tmp_entry); #endif SAVE_HINT(map, map->hint, prev_entry); } /* * try and insert in map by extending previous entry, if possible. * XXX: we don't try and pull back the next entry. might be useful * for a stack, but we are currently allocating our stack in advance. */ if (flags & UVM_FLAG_NOMERGE) goto nomerge; if (prev_entry->end == start && prev_entry != &map->header && UVM_ET_ISCOMPATIBLE(prev_entry, newetype, uobj, 0, prot, maxprot, inherit, advice, 0)) { if (uobj && prev_entry->offset + (prev_entry->end - prev_entry->start) != uoffset) goto forwardmerge; /* * can't extend a shared amap. 
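* (some other map entry still references the amap, so growing it here would resize it under that user too.)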
note: no need to lock amap to * look at refs since we don't care about its exact value. * if it is one (i.e. we have only reference) it will stay there */ if (prev_entry->aref.ar_amap && amap_refs(prev_entry->aref.ar_amap) != 1) { goto forwardmerge; } if (prev_entry->aref.ar_amap) { error = amap_extend(prev_entry, size, amapwaitflag | AMAP_EXTEND_FORWARDS); if (error) goto nomerge; } if (kmap) { UVMMAP_EVCNT_INCR(kbackmerge); } else { UVMMAP_EVCNT_INCR(ubackmerge); } UVMHIST_LOG(maphist," starting back merge", 0, 0, 0, 0); /* * drop our reference to uobj since we are extending a reference * that we already have (the ref count can not drop to zero). */ if (uobj && uobj->pgops->pgo_detach) uobj->pgops->pgo_detach(uobj); /* * Now that we've merged the entries, note that we've grown * and our gap has shrunk. Then fix the tree. */ prev_entry->end += size; prev_entry->gap -= size; uvm_rb_fixup(map, prev_entry); uvm_map_check(map, "map backmerged"); UVMHIST_LOG(maphist,"<- done (via backmerge)!", 0, 0, 0, 0); merged++; } forwardmerge: if (prev_entry->next->start == (start + size) && prev_entry->next != &map->header && UVM_ET_ISCOMPATIBLE(prev_entry->next, newetype, uobj, 0, prot, maxprot, inherit, advice, 0)) { if (uobj && prev_entry->next->offset != uoffset + size) goto nomerge; /* * can't extend a shared amap. note: no need to lock amap to * look at refs since we don't care about its exact value. * if it is one (i.e. we have only reference) it will stay there. * * note that we also can't merge two amaps, so if we * merged with the previous entry which has an amap, * and the next entry also has an amap, we give up. * * Interesting cases: * amap, new, amap -> give up second merge (single fwd extend) * amap, new, none -> double forward extend (extend again here) * none, new, amap -> double backward extend (done here) * uobj, new, amap -> single backward extend (done here) * * XXX should we attempt to deal with someone refilling * the deallocated region between two entries that are * backed by the same amap (ie, arefs is 2, "prev" and * "next" refer to it, and adding this allocation will * close the hole, thus restoring arefs to 1 and * deallocating the "next" vm_map_entry)? -- @@@ */ if (prev_entry->next->aref.ar_amap && (amap_refs(prev_entry->next->aref.ar_amap) != 1 || (merged && prev_entry->aref.ar_amap))) { goto nomerge; } if (merged) { /* * Try to extend the amap of the previous entry to * cover the next entry as well. If it doesn't work * just skip on, don't actually give up, since we've * already completed the back merge. */ if (prev_entry->aref.ar_amap) { if (amap_extend(prev_entry, prev_entry->next->end - prev_entry->next->start, amapwaitflag | AMAP_EXTEND_FORWARDS)) goto nomerge; } /* * Try to extend the amap of the *next* entry * back to cover the new allocation *and* the * previous entry as well (the previous merge * didn't have an amap already otherwise we * wouldn't be checking here for an amap). If * it doesn't work just skip on, again, don't * actually give up, since we've already * completed the back merge. */ else if (prev_entry->next->aref.ar_amap) { if (amap_extend(prev_entry->next, prev_entry->end - prev_entry->start, amapwaitflag | AMAP_EXTEND_BACKWARDS)) goto nomerge; } } else { /* * Pull the next entry's amap backwards to cover this * new allocation. 
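* (the new range then becomes part of the next entry when its start address is pulled back below.)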
*/ if (prev_entry->next->aref.ar_amap) { error = amap_extend(prev_entry->next, size, amapwaitflag | AMAP_EXTEND_BACKWARDS); if (error) goto nomerge; } } if (merged) { if (kmap) { UVMMAP_EVCNT_DECR(kbackmerge); UVMMAP_EVCNT_INCR(kbimerge); } else { UVMMAP_EVCNT_DECR(ubackmerge); UVMMAP_EVCNT_INCR(ubimerge); } } else { if (kmap) { UVMMAP_EVCNT_INCR(kforwmerge); } else { UVMMAP_EVCNT_INCR(uforwmerge); } } UVMHIST_LOG(maphist," starting forward merge", 0, 0, 0, 0); /* * drop our reference to uobj since we are extending a reference * that we already have (the ref count can not drop to zero). */ if (uobj && uobj->pgops->pgo_detach) uobj->pgops->pgo_detach(uobj); if (merged) { dead = prev_entry->next; prev_entry->end = dead->end; uvm_map_entry_unlink(map, dead); if (dead->aref.ar_amap != NULL) { prev_entry->aref = dead->aref; dead->aref.ar_amap = NULL; } } else { prev_entry->next->start -= size; if (prev_entry != &map->header) { prev_entry->gap -= size; KASSERT(prev_entry->gap == uvm_rb_gap(prev_entry)); uvm_rb_fixup(map, prev_entry); } if (uobj) prev_entry->next->offset = uoffset; } uvm_map_check(map, "map forwardmerged"); UVMHIST_LOG(maphist,"<- done forwardmerge", 0, 0, 0, 0); merged++; } nomerge: if (!merged) { UVMHIST_LOG(maphist," allocating new map entry", 0, 0, 0, 0); if (kmap) { UVMMAP_EVCNT_INCR(knomerge); } else { UVMMAP_EVCNT_INCR(unomerge); } /* * allocate new entry and link it in. */ if (new_entry == NULL) { new_entry = uvm_mapent_alloc(map, (flags & UVM_FLAG_NOWAIT)); if (__predict_false(new_entry == NULL)) { error = ENOMEM; goto done; } } new_entry->start = start; new_entry->end = new_entry->start + size; new_entry->object.uvm_obj = uobj; new_entry->offset = uoffset; new_entry->etype = newetype; if (flags & UVM_FLAG_NOMERGE) { new_entry->flags |= UVM_MAP_NOMERGE; } new_entry->protection = prot; new_entry->max_protection = maxprot; new_entry->inheritance = inherit; new_entry->wired_count = 0; new_entry->advice = advice; if (flags & UVM_FLAG_OVERLAY) { /* * to_add: for BSS we overallocate a little since we * are likely to extend */ vaddr_t to_add = (flags & UVM_FLAG_AMAPPAD) ? 
UVM_AMAP_CHUNK << PAGE_SHIFT : 0; struct vm_amap *amap = amap_alloc(size, to_add, (flags & UVM_FLAG_NOWAIT)); if (__predict_false(amap == NULL)) { error = ENOMEM; goto done; } new_entry->aref.ar_pageoff = 0; new_entry->aref.ar_amap = amap; } else { new_entry->aref.ar_pageoff = 0; new_entry->aref.ar_amap = NULL; } uvm_map_entry_link(map, prev_entry, new_entry); /* * Update the free space hint */ if ((map->first_free == prev_entry) && (prev_entry->end >= new_entry->start)) map->first_free = new_entry; new_entry = NULL; } map->size += size; UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0); error = 0; done: vm_map_unlock(map); if (new_entry) { uvm_mapent_free(new_entry); } if (dead) { KDASSERT(merged); uvm_mapent_free(dead); } if (dead_entries) uvm_unmap_detach(dead_entries, 0); return error; } /* * uvm_map_lookup_entry_bytree: lookup an entry in tree */ static inline bool uvm_map_lookup_entry_bytree(struct vm_map *map, vaddr_t address, struct vm_map_entry **entry /* OUT */) { struct vm_map_entry *prev = &map->header; struct vm_map_entry *cur = ROOT_ENTRY(map); while (cur) { UVMMAP_EVCNT_INCR(mlk_treeloop); if (address >= cur->start) { if (address < cur->end) { *entry = cur; return true; } prev = cur; cur = RIGHT_ENTRY(cur); } else cur = LEFT_ENTRY(cur); } *entry = prev; return false; } /* * uvm_map_lookup_entry: find map entry at or before an address * * => map must at least be read-locked by caller * => entry is returned in "entry" * => return value is true if address is in the returned entry */ bool uvm_map_lookup_entry(struct vm_map *map, vaddr_t address, struct vm_map_entry **entry /* OUT */) { struct vm_map_entry *cur; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(map=%#jx,addr=%#jx,ent=%#jx)", (uintptr_t)map, address, (uintptr_t)entry, 0); /* * make a quick check to see if we are already looking at * the entry we want (which is usually the case). note also * that we don't need to save the hint here... it is the * same hint (unless we are at the header, in which case the * hint didn't buy us anything anyway). */ cur = map->hint; UVMMAP_EVCNT_INCR(mlk_call); if (cur != &map->header && address >= cur->start && cur->end > address) { UVMMAP_EVCNT_INCR(mlk_hint); *entry = cur; UVMHIST_LOG(maphist,"<- got it via hint (%#jx)", (uintptr_t)cur, 0, 0, 0); uvm_mapent_check(*entry); return (true); } uvm_map_check(map, __func__); /* * lookup in the tree. */ UVMMAP_EVCNT_INCR(mlk_tree); if (__predict_true(uvm_map_lookup_entry_bytree(map, address, entry))) { SAVE_HINT(map, map->hint, *entry); UVMHIST_LOG(maphist,"<- search got it (%#jx)", (uintptr_t)cur, 0, 0, 0); KDASSERT((*entry)->start <= address); KDASSERT(address < (*entry)->end); uvm_mapent_check(*entry); return (true); } SAVE_HINT(map, map->hint, *entry); UVMHIST_LOG(maphist,"<- failed!",0,0,0,0); KDASSERT((*entry) == &map->header || (*entry)->end <= address); KDASSERT((*entry)->next == &map->header || address < (*entry)->next->start); return (false); } /* * See if the range between start and start + length fits in the gap * entry->next->start and entry->end. Returns 1 if fits, 0 if doesn't * fit, and -1 address wraps around. */ static int uvm_map_space_avail(vaddr_t *start, vsize_t length, voff_t uoffset, vsize_t align, int flags, int topdown, struct vm_map_entry *entry) { vaddr_t end; #ifdef PMAP_PREFER /* * push start address forward as needed to avoid VAC alias problems. * we only do this if a valid offset is specified. 
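* (PMAP_PREFER is defined only by ports that need this placement, typically those with virtually-indexed caches; elsewhere this block compiles away.)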
*/ if (uoffset != UVM_UNKNOWN_OFFSET) PMAP_PREFER(uoffset, start, length, topdown); #endif if ((flags & UVM_FLAG_COLORMATCH) != 0) { KASSERT(align < uvmexp.ncolors); if (uvmexp.ncolors > 1) { const u_int colormask = uvmexp.colormask; const u_int colorsize = colormask + 1; vaddr_t hint = atop(*start); const u_int color = hint & colormask; if (color != align) { hint -= color; /* adjust to color boundary */ KASSERT((hint & colormask) == 0); if (topdown) { if (align > color) hint -= colorsize; } else { if (align < color) hint += colorsize; } *start = ptoa(hint + align); /* adjust to color */ } } } else { KASSERT(powerof2(align)); uvm_map_align_va(start, align, topdown); /* * XXX Should we PMAP_PREFER() here again? * eh...i think we're okay */ } /* * Find the end of the proposed new region. Be sure we didn't * wrap around the address; if so, we lose. Otherwise, if the * proposed new region fits before the next entry, we win. */ end = *start + length; if (end < *start) return (-1); if (entry->next->start >= end && *start >= entry->end) return (1); return (0); } static void uvm_findspace_invariants(struct vm_map *map, vaddr_t orig_hint, vaddr_t length, struct uvm_object *uobj, voff_t uoffset, vsize_t align, int flags, vaddr_t hint, struct vm_map_entry *entry, int line) { const int topdown = map->flags & VM_MAP_TOPDOWN; KASSERTMSG( topdown || hint >= orig_hint, "map=%p hint=%#"PRIxVADDR" orig_hint=%#"PRIxVADDR " length=%#"PRIxVSIZE" uobj=%p uoffset=%#llx align=%"PRIxVSIZE " flags=%#x entry=%p (uvm_map_findspace line %d)", map, hint, orig_hint, length, uobj, (unsigned long long)uoffset, align, flags, entry, line); #ifndef __sh3__ /* XXXRO: kern/51254 */ KASSERTMSG(!topdown || hint <= orig_hint, #else if (__predict_false(!(!topdown || hint <= orig_hint))) printf( #endif "map=%p hint=%#"PRIxVADDR" orig_hint=%#"PRIxVADDR " length=%#"PRIxVSIZE" uobj=%p uoffset=%#llx align=%"PRIxVSIZE " flags=%#x entry=%p (uvm_map_findspace line %d)", map, hint, orig_hint, length, uobj, (unsigned long long)uoffset, align, flags, entry, line); } /* * uvm_map_findspace: find "length" sized space in "map". * * => "hint" is a hint about where we want it, unless UVM_FLAG_FIXED is * set in "flags" (in which case we insist on using "hint"). * => "result" is VA returned * => uobj/uoffset are to be used to handle VAC alignment, if required * => if "align" is non-zero, we attempt to align to that value. * => caller must at least have read-locked map * => returns NULL on failure, or pointer to prev. 
map entry if success * => note this is a cross between the old vm_map_findspace and vm_map_find */ struct vm_map_entry * uvm_map_findspace(struct vm_map *map, vaddr_t hint, vsize_t length, vaddr_t *result /* OUT */, struct uvm_object *uobj, voff_t uoffset, vsize_t align, int flags) { #define INVARIANTS() \ uvm_findspace_invariants(map, orig_hint, length, uobj, uoffset, align,\ flags, hint, entry, __LINE__) struct vm_map_entry *entry = NULL; struct vm_map_entry *child, *prev, *tmp; vaddr_t orig_hint __diagused; const int topdown = map->flags & VM_MAP_TOPDOWN; int avail; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, "(map=%#jx, hint=%#jx, len=%ju, flags=%#jx...", (uintptr_t)map, hint, length, flags); UVMHIST_LOG(maphist, " uobj=%#jx, uoffset=%#jx, align=%#jx)", (uintptr_t)uobj, uoffset, align, 0); KASSERT((flags & UVM_FLAG_COLORMATCH) != 0 || powerof2(align)); KASSERT((flags & UVM_FLAG_COLORMATCH) == 0 || align < uvmexp.ncolors); KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0); uvm_map_check(map, "map_findspace entry"); /* * Clamp the hint to the VM map's min/max address, and remmeber * the clamped original hint. Remember the original hint, * clamped to the min/max address. If we are aligning, then we * may have to try again with no alignment constraint if we * fail the first time. * * We use the original hint to verify later that the search has * been monotonic -- that is, nonincreasing or nondecreasing, * according to topdown or !topdown respectively. But the * clamping is not monotonic. */ if (hint < vm_map_min(map)) { /* check ranges ... */ if (flags & UVM_FLAG_FIXED) { UVMHIST_LOG(maphist,"<- VA below map range",0,0,0,0); return (NULL); } hint = vm_map_min(map); } if (hint > vm_map_max(map)) { UVMHIST_LOG(maphist,"<- VA %#jx > range [%#jx->%#jx]", hint, vm_map_min(map), vm_map_max(map), 0); return (NULL); } orig_hint = hint; INVARIANTS(); UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]", hint, vm_map_min(map), vm_map_max(map), 0); /* * hint may not be aligned properly; we need round up or down it * before proceeding further. */ if ((flags & UVM_FLAG_COLORMATCH) == 0) { uvm_map_align_va(&hint, align, topdown); INVARIANTS(); } UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]", hint, vm_map_min(map), vm_map_max(map), 0); /* * Look for the first possible address; if there's already * something at this address, we have to start after it. */ /* * @@@: there are four, no, eight cases to consider. * * 0: found, fixed, bottom up -> fail * 1: found, fixed, top down -> fail * 2: found, not fixed, bottom up -> start after entry->end, * loop up * 3: found, not fixed, top down -> start before entry->start, * loop down * 4: not found, fixed, bottom up -> check entry->next->start, fail * 5: not found, fixed, top down -> check entry->next->start, fail * 6: not found, not fixed, bottom up -> check entry->next->start, * loop up * 7: not found, not fixed, top down -> check entry->next->start, * loop down * * as you can see, it reduces to roughly five cases, and that * adding top down mapping only adds one unique case (without * it, there would be four cases). */ if ((flags & UVM_FLAG_FIXED) == 0 && hint == (topdown ? vm_map_max(map) : vm_map_min(map))) { /* * The uvm_map_findspace algorithm is monotonic -- for * topdown VM it starts with a high hint and returns a * lower free address; for !topdown VM it starts with a * low hint and returns a higher free address. As an * optimization, start with the first (highest for * topdown, lowest for !topdown) free address. 
* * XXX This `optimization' probably doesn't actually do * much in practice unless userland explicitly passes * the VM map's minimum or maximum address, which * varies from machine to machine (VM_MAX/MIN_ADDRESS, * e.g. 0x7fbfdfeff000 on amd64 but 0xfffffffff000 on * aarch64) and may vary according to other factors * like sysctl vm.user_va0_disable. In particular, if * the user specifies 0 as a hint to mmap, then mmap * will choose a default address which is usually _not_ * VM_MAX/MIN_ADDRESS but something else instead like * VM_MAX_ADDRESS - stack size - guard page overhead, * in which case this branch is never hit. * * In fact, this branch appears to have been broken for * two decades between when topdown was introduced in * ~2003 and when it was adapted to handle the topdown * case without violating the monotonicity assertion in * 2022. Maybe Someone^TM should either ditch the * optimization or find a better way to do it. */ entry = map->first_free; } else { if (uvm_map_lookup_entry(map, hint, &entry)) { /* "hint" address already in use ... */ if (flags & UVM_FLAG_FIXED) { UVMHIST_LOG(maphist, "<- fixed & VA in use", 0, 0, 0, 0); return (NULL); } if (topdown) /* Start from lower gap. */ entry = entry->prev; } else if (flags & UVM_FLAG_FIXED) { if (entry->next->start >= hint + length && hint + length > hint) goto found; /* "hint" address is gap but too small */ UVMHIST_LOG(maphist, "<- fixed mapping failed", 0, 0, 0, 0); return (NULL); /* only one shot at it ... */ } else { /* * See if given hint fits in this gap. */ avail = uvm_map_space_avail(&hint, length, uoffset, align, flags, topdown, entry); INVARIANTS(); switch (avail) { case 1: goto found; case -1: goto wraparound; } if (topdown) { /* * Still there is a chance to fit * if hint > entry->end. */ } else { /* Start from higher gap. */ entry = entry->next; if (entry == &map->header) goto notfound; goto nextgap; } } } /* * Note that all UVM_FLAGS_FIXED case is already handled. */ KDASSERT((flags & UVM_FLAG_FIXED) == 0); /* Try to find the space in the red-black tree */ /* Check slot before any entry */ if (topdown) { KASSERTMSG(entry->next->start >= vm_map_min(map), "map=%p entry=%p entry->next=%p" " entry->next->start=0x%"PRIxVADDR" min=0x%"PRIxVADDR, map, entry, entry->next, entry->next->start, vm_map_min(map)); if (length > entry->next->start - vm_map_min(map)) hint = vm_map_min(map); /* XXX goto wraparound? 
*/ else hint = entry->next->start - length; KASSERT(hint >= vm_map_min(map)); } else { hint = entry->end; } INVARIANTS(); avail = uvm_map_space_avail(&hint, length, uoffset, align, flags, topdown, entry); INVARIANTS(); switch (avail) { case 1: goto found; case -1: goto wraparound; } nextgap: KDASSERT((flags & UVM_FLAG_FIXED) == 0); /* If there is not enough space in the whole tree, we fail */ tmp = ROOT_ENTRY(map); if (tmp == NULL || tmp->maxgap < length) goto notfound; prev = NULL; /* previous candidate */ /* Find an entry close to hint that has enough space */ for (; tmp;) { KASSERT(tmp->next->start == tmp->end + tmp->gap); if (topdown) { if (tmp->next->start < hint + length && (prev == NULL || tmp->end > prev->end)) { if (tmp->gap >= length) prev = tmp; else if ((child = LEFT_ENTRY(tmp)) != NULL && child->maxgap >= length) prev = tmp; } } else { if (tmp->end >= hint && (prev == NULL || tmp->end < prev->end)) { if (tmp->gap >= length) prev = tmp; else if ((child = RIGHT_ENTRY(tmp)) != NULL && child->maxgap >= length) prev = tmp; } } if (tmp->next->start < hint + length) child = RIGHT_ENTRY(tmp); else if (tmp->end > hint) child = LEFT_ENTRY(tmp); else { if (tmp->gap >= length) break; if (topdown) child = LEFT_ENTRY(tmp); else child = RIGHT_ENTRY(tmp); } if (child == NULL || child->maxgap < length) break; tmp = child; } if (tmp != NULL && tmp->start < hint && hint < tmp->next->start) { /* * Check if the entry that we found satifies the * space requirement */ if (topdown) { if (hint > tmp->next->start - length) hint = tmp->next->start - length; } else { if (hint < tmp->end) hint = tmp->end; } INVARIANTS(); avail = uvm_map_space_avail(&hint, length, uoffset, align, flags, topdown, tmp); INVARIANTS(); switch (avail) { case 1: entry = tmp; goto found; case -1: goto wraparound; } if (tmp->gap >= length) goto listsearch; } if (prev == NULL) goto notfound; if (topdown) { KASSERT(orig_hint >= prev->next->start - length || prev->next->start - length > prev->next->start); hint = prev->next->start - length; } else { KASSERT(orig_hint <= prev->end); hint = prev->end; } INVARIANTS(); avail = uvm_map_space_avail(&hint, length, uoffset, align, flags, topdown, prev); INVARIANTS(); switch (avail) { case 1: entry = prev; goto found; case -1: goto wraparound; } if (prev->gap >= length) goto listsearch; if (topdown) tmp = LEFT_ENTRY(prev); else tmp = RIGHT_ENTRY(prev); for (;;) { KASSERT(tmp); KASSERTMSG(tmp->maxgap >= length, "tmp->maxgap=0x%"PRIxVSIZE" length=0x%"PRIxVSIZE, tmp->maxgap, length); if (topdown) child = RIGHT_ENTRY(tmp); else child = LEFT_ENTRY(tmp); if (child && child->maxgap >= length) { tmp = child; continue; } if (tmp->gap >= length) break; if (topdown) tmp = LEFT_ENTRY(tmp); else tmp = RIGHT_ENTRY(tmp); } if (topdown) { KASSERT(orig_hint >= tmp->next->start - length || tmp->next->start - length > tmp->next->start); hint = tmp->next->start - length; } else { KASSERT(orig_hint <= tmp->end); hint = tmp->end; } INVARIANTS(); avail = uvm_map_space_avail(&hint, length, uoffset, align, flags, topdown, tmp); INVARIANTS(); switch (avail) { case 1: entry = tmp; goto found; case -1: goto wraparound; } /* * The tree fails to find an entry because of offset or alignment * restrictions. Search the list instead. */ listsearch: /* * Look through the rest of the map, trying to fit a new region in * the gap between existing regions, or after the very last region. 
* note: entry->end = base VA of current gap, * entry->next->start = VA of end of current gap */ INVARIANTS(); for (;;) { /* Update hint for current gap. */ hint = topdown ? entry->next->start - length : entry->end; INVARIANTS(); /* See if it fits. */ avail = uvm_map_space_avail(&hint, length, uoffset, align, flags, topdown, entry); INVARIANTS(); switch (avail) { case 1: goto found; case -1: goto wraparound; } /* Advance to next/previous gap */ if (topdown) { if (entry == &map->header) { UVMHIST_LOG(maphist, "<- failed (off start)", 0,0,0,0); goto notfound; } entry = entry->prev; } else { entry = entry->next; if (entry == &map->header) { UVMHIST_LOG(maphist, "<- failed (off end)", 0,0,0,0); goto notfound; } } } found: SAVE_HINT(map, map->hint, entry); *result = hint; UVMHIST_LOG(maphist,"<- got it! (result=%#jx)", hint, 0,0,0); INVARIANTS(); KASSERT(entry->end <= hint); KASSERT(hint + length <= entry->next->start); return (entry); wraparound: UVMHIST_LOG(maphist, "<- failed (wrap around)", 0,0,0,0); return (NULL); notfound: UVMHIST_LOG(maphist, "<- failed (notfound)", 0,0,0,0); return (NULL); #undef INVARIANTS } /* * U N M A P - m a i n h e l p e r f u n c t i o n s */ /* * uvm_unmap_remove: remove mappings from a vm_map (from "start" up to "stop") * * => caller must check alignment and size * => map must be locked by caller * => we return a list of map entries that we've remove from the map * in "entry_list" */ void uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end, struct vm_map_entry **entry_list /* OUT */, int flags) { struct vm_map_entry *entry, *first_entry, *next; vaddr_t len; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(map=%#jx, start=%#jx, end=%#jx)", (uintptr_t)map, start, end, 0); VM_MAP_RANGE_CHECK(map, start, end); uvm_map_check(map, "unmap_remove entry"); /* * find first entry */ if (uvm_map_lookup_entry(map, start, &first_entry) == true) { /* clip and go... */ entry = first_entry; UVM_MAP_CLIP_START(map, entry, start); /* critical! prevents stale hint */ SAVE_HINT(map, entry, entry->prev); } else { entry = first_entry->next; } /* * save the free space hint */ if (map->first_free != &map->header && map->first_free->start >= start) map->first_free = entry->prev; /* * note: we now re-use first_entry for a different task. we remove * a number of map entries from the map and save them in a linked * list headed by "first_entry". once we remove them from the map * the caller should unlock the map and drop the references to the * backing objects [c.f. uvm_unmap_detach]. the object is to * separate unmapping from reference dropping. why? * [1] the map has to be locked for unmapping * [2] the map need not be locked for reference dropping * [3] dropping references may trigger pager I/O, and if we hit * a pager that does synchronous I/O we may have to wait for it. * [4] we would like all waiting for I/O to occur with maps unlocked * so that we don't block other threads. */ first_entry = NULL; *entry_list = NULL; /* * break up the area into map entry sized regions and unmap. 
note * that all mappings have to be removed before we can even consider * dropping references to amaps or VM objects (otherwise we could end * up with a mapping to a page on the free list which would be very bad) */ while ((entry != &map->header) && (entry->start < end)) { KASSERT((entry->flags & UVM_MAP_STATIC) == 0); UVM_MAP_CLIP_END(map, entry, end); next = entry->next; len = entry->end - entry->start; /* * unwire before removing addresses from the pmap; otherwise * unwiring will put the entries back into the pmap (XXX). */ if (VM_MAPENT_ISWIRED(entry)) { uvm_map_entry_unwire(map, entry); } if (flags & UVM_FLAG_VAONLY) { /* nothing */ } else if ((map->flags & VM_MAP_PAGEABLE) == 0) { /* * if the map is non-pageable, any pages mapped there * must be wired and entered with pmap_kenter_pa(), * and we should free any such pages immediately. * this is mostly used for kmem_map. */ KASSERT(vm_map_pmap(map) == pmap_kernel()); uvm_km_pgremove_intrsafe(map, entry->start, entry->end); } else if (UVM_ET_ISOBJ(entry) && UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) { panic("%s: kernel object %p %p\n", __func__, map, entry); } else if (UVM_ET_ISOBJ(entry) || entry->aref.ar_amap) { /* * remove mappings the standard way. lock object * and/or amap to ensure vm_page state does not * change while in pmap_remove(). */ #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */ uvm_map_lock_entry(entry, RW_WRITER); #else uvm_map_lock_entry(entry, RW_READER); #endif pmap_remove(map->pmap, entry->start, entry->end); /* * note: if map is dying, leave pmap_update() for * later. if the map is to be reused (exec) then * pmap_update() will be called. if the map is * being disposed of (exit) then pmap_destroy() * will be called. */ if ((map->flags & VM_MAP_DYING) == 0) { pmap_update(vm_map_pmap(map)); } else { KASSERT(vm_map_pmap(map) != pmap_kernel()); } uvm_map_unlock_entry(entry); } #if defined(UVMDEBUG) /* * check if there's remaining mapping, * which is a bug in caller. */ vaddr_t va; for (va = entry->start; va < entry->end; va += PAGE_SIZE) { if (pmap_extract(vm_map_pmap(map), va, NULL)) { panic("%s: %#"PRIxVADDR" has mapping", __func__, va); } } if (VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) { uvm_km_check_empty(map, entry->start, entry->end); } #endif /* defined(UVMDEBUG) */ /* * remove entry from map and put it on our list of entries * that we've nuked. then go to next entry. */ UVMHIST_LOG(maphist, " removed map entry %#jx", (uintptr_t)entry, 0, 0, 0); /* critical! prevents stale hint */ SAVE_HINT(map, entry, entry->prev); uvm_map_entry_unlink(map, entry); KASSERT(map->size >= len); map->size -= len; entry->prev = NULL; entry->next = first_entry; first_entry = entry; entry = next; } uvm_map_check(map, "unmap_remove leave"); /* * now we've cleaned up the map and are ready for the caller to drop * references to the mapped objects. */ *entry_list = first_entry; UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0); if (map->flags & VM_MAP_WANTVA) { mutex_enter(&map->misc_lock); map->flags &= ~VM_MAP_WANTVA; cv_broadcast(&map->cv); mutex_exit(&map->misc_lock); } } /* * uvm_unmap_detach: drop references in a chain of map entries * * => we will free the map entries as we traverse the list. 
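* => the entries have already been removed from the map (e.g. by uvm_unmap_remove), so no map lock is needed to drop the references.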
*/ void uvm_unmap_detach(struct vm_map_entry *first_entry, int flags) { struct vm_map_entry *next_entry; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); while (first_entry) { KASSERT(!VM_MAPENT_ISWIRED(first_entry)); UVMHIST_LOG(maphist, " detach %#jx: amap=%#jx, obj=%#jx, submap?=%jd", (uintptr_t)first_entry, (uintptr_t)first_entry->aref.ar_amap, (uintptr_t)first_entry->object.uvm_obj, UVM_ET_ISSUBMAP(first_entry)); /* * drop reference to amap, if we've got one */ if (first_entry->aref.ar_amap) uvm_map_unreference_amap(first_entry, flags); /* * drop reference to our backing object, if we've got one */ KASSERT(!UVM_ET_ISSUBMAP(first_entry)); if (UVM_ET_ISOBJ(first_entry) && first_entry->object.uvm_obj->pgops->pgo_detach) { (*first_entry->object.uvm_obj->pgops->pgo_detach) (first_entry->object.uvm_obj); } next_entry = first_entry->next; uvm_mapent_free(first_entry); first_entry = next_entry; } UVMHIST_LOG(maphist, "<- done", 0,0,0,0); } /* * E X T R A C T I O N F U N C T I O N S */ /* * uvm_map_reserve: reserve space in a vm_map for future use. * * => we reserve space in a map by putting a dummy map entry in the * map (dummy means obj=NULL, amap=NULL, prot=VM_PROT_NONE) * => map should be unlocked (we will write lock it) * => we return true if we were able to reserve space * => XXXCDC: should be inline? */ int uvm_map_reserve(struct vm_map *map, vsize_t size, vaddr_t offset /* hint for pmap_prefer */, vsize_t align /* alignment */, vaddr_t *raddr /* IN:hint, OUT: reserved VA */, uvm_flag_t flags /* UVM_FLAG_FIXED or UVM_FLAG_COLORMATCH or 0 */) { UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, "(map=%#jx, size=%#jx, offset=%#jx, addr=%#jx)", (uintptr_t)map, size, offset, (uintptr_t)raddr); size = round_page(size); /* * reserve some virtual space. */ if (uvm_map(map, raddr, size, NULL, offset, align, UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE, UVM_ADV_RANDOM, UVM_FLAG_NOMERGE|flags)) != 0) { UVMHIST_LOG(maphist, "<- done (no VM)", 0,0,0,0); return (false); } UVMHIST_LOG(maphist, "<- done (*raddr=%#jx)", *raddr,0,0,0); return (true); } /* * uvm_map_replace: replace a reserved (blank) area of memory with * real mappings. 
* * => caller must WRITE-LOCK the map * => we return true if replacement was a success * => we expect the newents chain to have nnewents entrys on it and * we expect newents->prev to point to the last entry on the list * => note newents is allowed to be NULL */ static int uvm_map_replace(struct vm_map *map, vaddr_t start, vaddr_t end, struct vm_map_entry *newents, int nnewents, vsize_t nsize, struct vm_map_entry **oldentryp) { struct vm_map_entry *oldent, *last; uvm_map_check(map, "map_replace entry"); /* * first find the blank map entry at the specified address */ if (!uvm_map_lookup_entry(map, start, &oldent)) { return (false); } /* * check to make sure we have a proper blank entry */ if (end < oldent->end) { UVM_MAP_CLIP_END(map, oldent, end); } if (oldent->start != start || oldent->end != end || oldent->object.uvm_obj != NULL || oldent->aref.ar_amap != NULL) { return (false); } #ifdef DIAGNOSTIC /* * sanity check the newents chain */ { struct vm_map_entry *tmpent = newents; int nent = 0; vsize_t sz = 0; vaddr_t cur = start; while (tmpent) { nent++; sz += tmpent->end - tmpent->start; if (tmpent->start < cur) panic("uvm_map_replace1"); if (tmpent->start >= tmpent->end || tmpent->end > end) { panic("uvm_map_replace2: " "tmpent->start=%#"PRIxVADDR ", tmpent->end=%#"PRIxVADDR ", end=%#"PRIxVADDR, tmpent->start, tmpent->end, end); } cur = tmpent->end; if (tmpent->next) { if (tmpent->next->prev != tmpent) panic("uvm_map_replace3"); } else { if (newents->prev != tmpent) panic("uvm_map_replace4"); } tmpent = tmpent->next; } if (nent != nnewents) panic("uvm_map_replace5"); if (sz != nsize) panic("uvm_map_replace6"); } #endif /* * map entry is a valid blank! replace it. (this does all the * work of map entry link/unlink...). */ if (newents) { last = newents->prev; /* critical: flush stale hints out of map */ SAVE_HINT(map, map->hint, newents); if (map->first_free == oldent) map->first_free = last; last->next = oldent->next; last->next->prev = last; /* Fix RB tree */ uvm_rb_remove(map, oldent); newents->prev = oldent->prev; newents->prev->next = newents; map->nentries = map->nentries + (nnewents - 1); /* Fixup the RB tree */ { int i; struct vm_map_entry *tmp; tmp = newents; for (i = 0; i < nnewents && tmp; i++) { uvm_rb_insert(map, tmp); tmp = tmp->next; } } } else { /* NULL list of new entries: just remove the old one */ clear_hints(map, oldent); uvm_map_entry_unlink(map, oldent); } map->size -= end - start - nsize; uvm_map_check(map, "map_replace leave"); /* * now we can free the old blank entry and return. 
*/ *oldentryp = oldent; return (true); } /* * uvm_map_extract: extract a mapping from a map and put it somewhere * (maybe removing the old mapping) * * => maps should be unlocked (we will write lock them) * => returns 0 on success, error code otherwise * => start must be page aligned * => len must be page sized * => flags: * UVM_EXTRACT_REMOVE: remove mappings from srcmap * UVM_EXTRACT_CONTIG: abort if unmapped area (advisory only) * UVM_EXTRACT_QREF: for a temporary extraction do quick obj refs * UVM_EXTRACT_FIXPROT: set prot to maxprot as we go * UVM_EXTRACT_PROT_ALL: set prot to UVM_PROT_ALL as we go * >>>NOTE: if you set REMOVE, you are not allowed to use CONTIG or QREF!<<< * >>>NOTE: QREF's must be unmapped via the QREF path, thus should only * be used from within the kernel in a kernel level map <<< */ int uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len, struct vm_map *dstmap, vaddr_t *dstaddrp, int flags) { vaddr_t dstaddr, end, newend, oldoffset, fudge, orig_fudge; struct vm_map_entry *chain, *endchain, *entry, *orig_entry, *newentry, *deadentry, *oldentry; struct vm_map_entry *resentry = NULL; /* a dummy reservation entry */ vsize_t elen __unused; int nchain, error, copy_ok; vsize_t nsize; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(srcmap=%#jx,start=%#jx, len=%#jx", (uintptr_t)srcmap, start, len, 0); UVMHIST_LOG(maphist," ...,dstmap=%#jx, flags=%#jx)", (uintptr_t)dstmap, flags, 0, 0); /* * step 0: sanity check: start must be on a page boundary, length * must be page sized. can't ask for CONTIG/QREF if you asked for * REMOVE. */ KASSERTMSG((start & PAGE_MASK) == 0, "start=0x%"PRIxVADDR, start); KASSERTMSG((len & PAGE_MASK) == 0, "len=0x%"PRIxVADDR, len); KASSERT((flags & UVM_EXTRACT_REMOVE) == 0 || (flags & (UVM_EXTRACT_CONTIG|UVM_EXTRACT_QREF)) == 0); /* * step 1: reserve space in the target map for the extracted area */ if ((flags & UVM_EXTRACT_RESERVED) == 0) { dstaddr = vm_map_min(dstmap); if (!uvm_map_reserve(dstmap, len, start, atop(start) & uvmexp.colormask, &dstaddr, UVM_FLAG_COLORMATCH)) return (ENOMEM); KASSERT((atop(start ^ dstaddr) & uvmexp.colormask) == 0); *dstaddrp = dstaddr; /* pass address back to caller */ UVMHIST_LOG(maphist, " dstaddr=%#jx", dstaddr,0,0,0); } else { dstaddr = *dstaddrp; } /* * step 2: setup for the extraction process loop by init'ing the * map entry chain, locking src map, and looking up the first useful * entry in the map. */ end = start + len; newend = dstaddr + len; chain = endchain = NULL; nchain = 0; nsize = 0; vm_map_lock(srcmap); if (uvm_map_lookup_entry(srcmap, start, &entry)) { /* "start" is within an entry */ if (flags & UVM_EXTRACT_QREF) { /* * for quick references we don't clip the entry, so * the entry may map space "before" the starting * virtual address... this is the "fudge" factor * (which can be non-zero only the first time * through the "while" loop in step 3). */ fudge = start - entry->start; } else { /* * normal reference: we clip the map to fit (thus * fudge is zero) */ UVM_MAP_CLIP_START(srcmap, entry, start); SAVE_HINT(srcmap, srcmap->hint, entry->prev); fudge = 0; } } else { /* "start" is not within an entry ... skip to next entry */ if (flags & UVM_EXTRACT_CONTIG) { error = EINVAL; goto bad; /* definite hole here ... */ } entry = entry->next; fudge = 0; } /* save values from srcmap for step 6 */ orig_entry = entry; orig_fudge = fudge; /* * step 3: now start looping through the map entries, extracting * as we go. 
*/ while (entry->start < end && entry != &srcmap->header) { /* if we are not doing a quick reference, clip it */ if ((flags & UVM_EXTRACT_QREF) == 0) UVM_MAP_CLIP_END(srcmap, entry, end); /* clear needs_copy (allow chunking) */ if (UVM_ET_ISNEEDSCOPY(entry)) { amap_copy(srcmap, entry, AMAP_COPY_NOWAIT|AMAP_COPY_NOMERGE, start, end); if (UVM_ET_ISNEEDSCOPY(entry)) { /* failed? */ error = ENOMEM; goto bad; } /* amap_copy could clip (during chunk)! update fudge */ if (fudge) { fudge = start - entry->start; orig_fudge = fudge; } } /* calculate the offset of this from "start" */ oldoffset = (entry->start + fudge) - start; /* allocate a new map entry */ newentry = uvm_mapent_alloc(dstmap, 0); if (newentry == NULL) { error = ENOMEM; goto bad; } /* set up new map entry */ newentry->next = NULL; newentry->prev = endchain; newentry->start = dstaddr + oldoffset; newentry->end = newentry->start + (entry->end - (entry->start + fudge)); if (newentry->end > newend || newentry->end < newentry->start) newentry->end = newend; newentry->object.uvm_obj = entry->object.uvm_obj; if (newentry->object.uvm_obj) { if (newentry->object.uvm_obj->pgops->pgo_reference) newentry->object.uvm_obj->pgops-> pgo_reference(newentry->object.uvm_obj); newentry->offset = entry->offset + fudge; } else { newentry->offset = 0; } newentry->etype = entry->etype; if (flags & UVM_EXTRACT_PROT_ALL) { newentry->protection = newentry->max_protection = UVM_PROT_ALL; } else { newentry->protection = (flags & UVM_EXTRACT_FIXPROT) ? entry->max_protection : entry->protection; newentry->max_protection = entry->max_protection; } newentry->inheritance = entry->inheritance; newentry->wired_count = 0; newentry->aref.ar_amap = entry->aref.ar_amap; if (newentry->aref.ar_amap) { newentry->aref.ar_pageoff = entry->aref.ar_pageoff + (fudge >> PAGE_SHIFT); uvm_map_reference_amap(newentry, AMAP_SHARED | ((flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0)); } else { newentry->aref.ar_pageoff = 0; } newentry->advice = entry->advice; if ((flags & UVM_EXTRACT_QREF) != 0) { newentry->flags |= UVM_MAP_NOMERGE; } /* now link it on the chain */ nchain++; nsize += newentry->end - newentry->start; if (endchain == NULL) { chain = endchain = newentry; } else { endchain->next = newentry; endchain = newentry; } /* end of 'while' loop! */ if ((flags & UVM_EXTRACT_CONTIG) && entry->end < end && (entry->next == &srcmap->header || entry->next->start != entry->end)) { error = EINVAL; goto bad; } entry = entry->next; fudge = 0; } /* * step 4: close off chain (in format expected by uvm_map_replace) */ if (chain) chain->prev = endchain; /* * step 5: attempt to lock the dest map so we can pmap_copy. 
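* we only try-lock dstmap here because srcmap is still locked; if the try fails we skip pmap_copy and defer the replacement to step 7.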
* note usage of copy_ok: * 1 => dstmap locked, pmap_copy ok, and we "replace" here (step 5) * 0 => dstmap unlocked, NO pmap_copy, and we will "replace" in step 7 */ if (srcmap == dstmap || vm_map_lock_try(dstmap) == true) { copy_ok = 1; if (!uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain, nchain, nsize, &resentry)) { if (srcmap != dstmap) vm_map_unlock(dstmap); error = EIO; goto bad; } } else { copy_ok = 0; /* replace deferred until step 7 */ } /* * step 6: traverse the srcmap a second time to do the following: * - if we got a lock on the dstmap do pmap_copy * - if UVM_EXTRACT_REMOVE remove the entries * we make use of orig_entry and orig_fudge (saved in step 2) */ if (copy_ok || (flags & UVM_EXTRACT_REMOVE)) { /* purge possible stale hints from srcmap */ if (flags & UVM_EXTRACT_REMOVE) { SAVE_HINT(srcmap, srcmap->hint, orig_entry->prev); if (srcmap->first_free != &srcmap->header && srcmap->first_free->start >= start) srcmap->first_free = orig_entry->prev; } entry = orig_entry; fudge = orig_fudge; deadentry = NULL; /* for UVM_EXTRACT_REMOVE */ while (entry->start < end && entry != &srcmap->header) { if (copy_ok) { oldoffset = (entry->start + fudge) - start; elen = MIN(end, entry->end) - (entry->start + fudge); pmap_copy(dstmap->pmap, srcmap->pmap, dstaddr + oldoffset, elen, entry->start + fudge); } /* we advance "entry" in the following if statement */ if (flags & UVM_EXTRACT_REMOVE) { #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */ uvm_map_lock_entry(entry, RW_WRITER); #else uvm_map_lock_entry(entry, RW_READER); #endif pmap_remove(srcmap->pmap, entry->start, entry->end); uvm_map_unlock_entry(entry); oldentry = entry; /* save entry */ entry = entry->next; /* advance */ uvm_map_entry_unlink(srcmap, oldentry); /* add to dead list */ oldentry->next = deadentry; deadentry = oldentry; } else { entry = entry->next; /* advance */ } /* end of 'while' loop */ fudge = 0; } pmap_update(srcmap->pmap); /* * unlock dstmap. we will dispose of deadentry in * step 7 if needed */ if (copy_ok && srcmap != dstmap) vm_map_unlock(dstmap); } else { deadentry = NULL; } /* * step 7: we are done with the source map, unlock. if copy_ok * is 0 then we have not replaced the dummy mapping in dstmap yet * and we need to do so now. */ vm_map_unlock(srcmap); if ((flags & UVM_EXTRACT_REMOVE) && deadentry) uvm_unmap_detach(deadentry, 0); /* dispose of old entries */ /* now do the replacement if we didn't do it in step 5 */ if (copy_ok == 0) { vm_map_lock(dstmap); error = uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain, nchain, nsize, &resentry); vm_map_unlock(dstmap); if (error == false) { error = EIO; goto bad2; } } if (resentry != NULL) uvm_mapent_free(resentry); return (0); /* * bad: failure recovery */ bad: vm_map_unlock(srcmap); bad2: /* src already unlocked */ if (chain) uvm_unmap_detach(chain, (flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0); if (resentry != NULL) uvm_mapent_free(resentry); if ((flags & UVM_EXTRACT_RESERVED) == 0) { uvm_unmap(dstmap, dstaddr, dstaddr+len); /* ??? */ } return (error); } /* end of extraction functions */ /* * uvm_map_submap: punch down part of a map into a submap * * => only the kernel_map is allowed to be submapped * => the purpose of submapping is to break up the locking granularity * of a larger map * => the range specified must have been mapped previously with a uvm_map() * call [with uobj==NULL] to create a blank map entry in the main map. * [And it had better still be blank!] * => maps which contain submaps should never be copied or forked. 
* => to remove a submap, use uvm_unmap() on the main map * and then uvm_map_deallocate() the submap. * => main map must be unlocked. * => submap must have been init'd and have a zero reference count. * [need not be locked as we don't actually reference it] */ int uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end, struct vm_map *submap) { struct vm_map_entry *entry; int error; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (uvm_map_lookup_entry(map, start, &entry)) { UVM_MAP_CLIP_START(map, entry, start); UVM_MAP_CLIP_END(map, entry, end); /* to be safe */ } else { entry = NULL; } if (entry != NULL && entry->start == start && entry->end == end && entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL && !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) { entry->etype |= UVM_ET_SUBMAP; entry->object.sub_map = submap; entry->offset = 0; uvm_map_reference(submap); error = 0; } else { error = EINVAL; } vm_map_unlock(map); return error; } /* * uvm_map_protect_user: change map protection on behalf of the user. * Enforces PAX settings as necessary. */ int uvm_map_protect_user(struct lwp *l, vaddr_t start, vaddr_t end, vm_prot_t new_prot) { int error; if ((error = PAX_MPROTECT_VALIDATE(l, new_prot))) return error; return uvm_map_protect(&l->l_proc->p_vmspace->vm_map, start, end, new_prot, false); } /* * uvm_map_protect: change map protection * * => set_max means set max_protection. * => map must be unlocked. */ #define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \ ~VM_PROT_WRITE : VM_PROT_ALL) int uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end, vm_prot_t new_prot, bool set_max) { struct vm_map_entry *current, *entry; int error = 0; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_prot=%#jx)", (uintptr_t)map, start, end, new_prot); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (uvm_map_lookup_entry(map, start, &entry)) { UVM_MAP_CLIP_START(map, entry, start); } else { entry = entry->next; } /* * make a first pass to check for protection violations. */ current = entry; while ((current != &map->header) && (current->start < end)) { if (UVM_ET_ISSUBMAP(current)) { error = EINVAL; goto out; } if ((new_prot & current->max_protection) != new_prot) { error = EACCES; goto out; } /* * Don't allow VM_PROT_EXECUTE to be set on entries that * point to vnodes that are associated with a NOEXEC file * system. */ if (UVM_ET_ISOBJ(current) && UVM_OBJ_IS_VNODE(current->object.uvm_obj)) { struct vnode *vp = (struct vnode *) current->object.uvm_obj; if ((new_prot & VM_PROT_EXECUTE) != 0 && (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) { error = EACCES; goto out; } } current = current->next; } /* go back and fix up protections (no need to clip this time). */ current = entry; while ((current != &map->header) && (current->start < end)) { vm_prot_t old_prot; UVM_MAP_CLIP_END(map, current, end); old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * update physical map if necessary. worry about copy-on-write * here -- CHECK THIS XXX */ if (current->protection != old_prot) { /* update pmap! 
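* MASK() withholds VM_PROT_WRITE from the pmap for copy-on-write entries, so the first write still faults and triggers the copy.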
*/ #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */ uvm_map_lock_entry(current, RW_WRITER); #else uvm_map_lock_entry(current, RW_READER); #endif pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(current)); uvm_map_unlock_entry(current); /* * If this entry points at a vnode, and the * protection includes VM_PROT_EXECUTE, mark * the vnode as VEXECMAP. */ if (UVM_ET_ISOBJ(current)) { struct uvm_object *uobj = current->object.uvm_obj; if (UVM_OBJ_IS_VNODE(uobj) && (current->protection & VM_PROT_EXECUTE)) { vn_markexec((struct vnode *) uobj); } } } /* * If the map is configured to lock any future mappings, * wire this entry now if the old protection was VM_PROT_NONE * and the new protection is not VM_PROT_NONE. */ if ((map->flags & VM_MAP_WIREFUTURE) != 0 && VM_MAPENT_ISWIRED(current) == 0 && old_prot == VM_PROT_NONE && new_prot != VM_PROT_NONE) { /* * We must call pmap_update() here because the * pmap_protect() call above might have removed some * pmap entries and uvm_map_pageable() might create * some new pmap entries that rely on the prior * removals being completely finished. */ pmap_update(map->pmap); if (uvm_map_pageable(map, current->start, current->end, false, UVM_LK_ENTER|UVM_LK_EXIT) != 0) { /* * If locking the entry fails, remember the * error if it's the first one. Note we * still continue setting the protection in * the map, but will return the error * condition regardless. * * XXX Ignore what the actual error is, * XXX just call it a resource shortage * XXX so that it doesn't get confused * XXX what uvm_map_protect() itself would * XXX normally return. */ error = ENOMEM; } } current = current->next; } pmap_update(map->pmap); out: vm_map_unlock(map); UVMHIST_LOG(maphist, "<- done, error=%jd",error,0,0,0); return error; } #undef MASK /* * uvm_map_inherit: set inheritance code for range of addrs in map. * * => map must be unlocked * => note that the inherit code is used during a "fork". see fork * code for details. */ int uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end, vm_inherit_t new_inheritance) { struct vm_map_entry *entry, *temp_entry; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_inh=%#jx)", (uintptr_t)map, start, end, new_inheritance); switch (new_inheritance) { case MAP_INHERIT_NONE: case MAP_INHERIT_COPY: case MAP_INHERIT_SHARE: case MAP_INHERIT_ZERO: break; default: UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0); return EINVAL; } vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (uvm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; UVM_MAP_CLIP_START(map, entry, start); } else { entry = temp_entry->next; } while ((entry != &map->header) && (entry->start < end)) { UVM_MAP_CLIP_END(map, entry, end); entry->inheritance = new_inheritance; entry = entry->next; } vm_map_unlock(map); UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0); return 0; } /* * uvm_map_advice: set advice code for range of addrs in map. * * => map must be unlocked */ int uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice) { struct vm_map_entry *entry, *temp_entry; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_adv=%#jx)", (uintptr_t)map, start, end, new_advice); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (uvm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; UVM_MAP_CLIP_START(map, entry, start); } else { entry = temp_entry->next; } /* * XXXJRT: disallow holes? 
*/ while ((entry != &map->header) && (entry->start < end)) { UVM_MAP_CLIP_END(map, entry, end); switch (new_advice) { case MADV_NORMAL: case MADV_RANDOM: case MADV_SEQUENTIAL: /* nothing special here */ break; default: vm_map_unlock(map); UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0); return EINVAL; } entry->advice = new_advice; entry = entry->next; } vm_map_unlock(map); UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0); return 0; } /* * uvm_map_willneed: apply MADV_WILLNEED */ int uvm_map_willneed(struct vm_map *map, vaddr_t start, vaddr_t end) { struct vm_map_entry *entry; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx)", (uintptr_t)map, start, end, 0); vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!uvm_map_lookup_entry(map, start, &entry)) { entry = entry->next; } while (entry->start < end) { struct vm_amap * const amap = entry->aref.ar_amap; struct uvm_object * const uobj = entry->object.uvm_obj; KASSERT(entry != &map->header); KASSERT(start < entry->end); /* * For now, we handle only the easy but commonly-requested case. * ie. start prefetching of backing uobj pages. * * XXX It might be useful to pmap_enter() the already-in-core * pages by inventing a "weak" mode for uvm_fault() which would * only do the PGO_LOCKED pgo_get(). */ if (UVM_ET_ISOBJ(entry) && amap == NULL && uobj != NULL) { off_t offset; off_t size; offset = entry->offset; if (start < entry->start) { offset += entry->start - start; } size = entry->offset + (entry->end - entry->start); if (entry->end < end) { size -= end - entry->end; } uvm_readahead(uobj, offset, size); } entry = entry->next; } vm_map_unlock_read(map); UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0); return 0; } /* * uvm_map_pageable: sets the pageability of a range in a map. * * => wires map entries. should not be used for transient page locking. * for that, use uvm_fault_wire()/uvm_fault_unwire() (see uvm_vslock()). * => regions specified as not pageable require lock-down (wired) memory * and page tables. * => map must never be read-locked * => if islocked is true, map is already write-locked * => we always unlock the map, since we must downgrade to a read-lock * to call uvm_fault_wire() * => XXXCDC: check this and try and clean it up. */ int uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end, bool new_pageable, int lockflags) { struct vm_map_entry *entry, *start_entry, *failed_entry; int rv; #ifdef DIAGNOSTIC u_int timestamp_save; #endif UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_pageable=%ju)", (uintptr_t)map, start, end, new_pageable); KASSERT(map->flags & VM_MAP_PAGEABLE); if ((lockflags & UVM_LK_ENTER) == 0) vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); /* * only one pageability change may take place at one time, since * uvm_fault_wire assumes it will be called only once for each * wiring/unwiring. therefore, we have to make sure we're actually * changing the pageability for the entire region. we do so before * making any changes. */ if (uvm_map_lookup_entry(map, start, &start_entry) == false) { if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map); UVMHIST_LOG(maphist,"<- done (fault)",0,0,0,0); return EFAULT; } entry = start_entry; if (start == end) { /* nothing required */ if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map); UVMHIST_LOG(maphist,"<- done (nothing)",0,0,0,0); return 0; } /* * handle wiring and unwiring separately. */ if (new_pageable) { /* unwire */ UVM_MAP_CLIP_START(map, entry, start); /* * unwiring. 
first ensure that the range to be unwired is * really wired down and that there are no holes. */ while ((entry != &map->header) && (entry->start < end)) { if (entry->wired_count == 0 || (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map); UVMHIST_LOG(maphist, "<- done (INVAL)",0,0,0,0); return EINVAL; } entry = entry->next; } /* * POSIX 1003.1b - a single munlock call unlocks a region, * regardless of the number of mlock calls made on that * region. */ entry = start_entry; while ((entry != &map->header) && (entry->start < end)) { UVM_MAP_CLIP_END(map, entry, end); if (VM_MAPENT_ISWIRED(entry)) uvm_map_entry_unwire(map, entry); entry = entry->next; } if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map); UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0); return 0; } /* * wire case: in two passes [XXXCDC: ugly block of code here] * * 1: holding the write lock, we create any anonymous maps that need * to be created. then we clip each map entry to the region to * be wired and increment its wiring count. * * 2: we downgrade to a read lock, and call uvm_fault_wire to fault * in the pages for any newly wired area (wired_count == 1). * * downgrading to a read lock for uvm_fault_wire avoids a possible * deadlock with another thread that may have faulted on one of * the pages to be wired (it would mark the page busy, blocking * us, then in turn block on the map lock that we hold). because * of problems in the recursive lock package, we cannot upgrade * to a write lock in vm_map_lookup. thus, any actions that * require the write lock must be done beforehand. because we * keep the read lock on the map, the copy-on-write status of the * entries we modify here cannot change. */ while ((entry != &map->header) && (entry->start < end)) { if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */ /* * perform actions of vm_map_lookup that need the * write lock on the map: create an anonymous map * for a copy-on-write region, or an anonymous map * for a zero-fill region. (XXXCDC: submap case * ok?) */ if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */ if (UVM_ET_ISNEEDSCOPY(entry) && ((entry->max_protection & VM_PROT_WRITE) || (entry->object.uvm_obj == NULL))) { amap_copy(map, entry, 0, start, end); /* XXXCDC: wait OK? */ } } } UVM_MAP_CLIP_START(map, entry, start); UVM_MAP_CLIP_END(map, entry, end); entry->wired_count++; /* * Check for holes */ if (entry->protection == VM_PROT_NONE || (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { /* * found one. amap creation actions do not need to * be undone, but the wired counts need to be restored. */ while (entry != &map->header && entry->end > start) { entry->wired_count--; entry = entry->prev; } if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map); UVMHIST_LOG(maphist,"<- done (INVALID WIRE)",0,0,0,0); return EINVAL; } entry = entry->next; } /* * Pass 2. */ #ifdef DIAGNOSTIC timestamp_save = map->timestamp; #endif vm_map_busy(map); vm_map_unlock(map); rv = 0; entry = start_entry; while (entry != &map->header && entry->start < end) { if (entry->wired_count == 1) { rv = uvm_fault_wire(map, entry->start, entry->end, entry->max_protection, 1); if (rv) { /* * wiring failed. break out of the loop. * we'll clean up the map below, once we * have a write lock again. */ break; } } entry = entry->next; } if (rv) { /* failed? */ /* * Get back to an exclusive (write) lock. 
*/ vm_map_lock(map); vm_map_unbusy(map); #ifdef DIAGNOSTIC if (timestamp_save + 1 != map->timestamp) panic("uvm_map_pageable: stale map"); #endif /* * first drop the wiring count on all the entries * which haven't actually been wired yet. */ failed_entry = entry; while (entry != &map->header && entry->start < end) { entry->wired_count--; entry = entry->next; } /* * now, unwire all the entries that were successfully * wired above. */ entry = start_entry; while (entry != failed_entry) { entry->wired_count--; if (VM_MAPENT_ISWIRED(entry) == 0) uvm_map_entry_unwire(map, entry); entry = entry->next; } if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map); UVMHIST_LOG(maphist, "<- done (RV=%jd)", rv,0,0,0); return (rv); } if ((lockflags & UVM_LK_EXIT) == 0) { vm_map_unbusy(map); } else { /* * Get back to an exclusive (write) lock. */ vm_map_lock(map); vm_map_unbusy(map); } UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0); return 0; } /* * uvm_map_pageable_all: special case of uvm_map_pageable - affects * all mapped regions. * * => map must not be locked. * => if no flags are specified, all regions are unwired. * => XXXJRT: has some of the same problems as uvm_map_pageable() above. */ int uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit) { struct vm_map_entry *entry, *failed_entry; vsize_t size; int rv; #ifdef DIAGNOSTIC u_int timestamp_save; #endif UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(map=%#jx,flags=%#jx)", (uintptr_t)map, flags, 0, 0); KASSERT(map->flags & VM_MAP_PAGEABLE); vm_map_lock(map); /* * handle wiring and unwiring separately. */ if (flags == 0) { /* unwire */ /* * POSIX 1003.1b -- munlockall unlocks all regions, * regardless of how many times mlockall has been called. */ for (entry = map->header.next; entry != &map->header; entry = entry->next) { if (VM_MAPENT_ISWIRED(entry)) uvm_map_entry_unwire(map, entry); } map->flags &= ~VM_MAP_WIREFUTURE; vm_map_unlock(map); UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0); return 0; } if (flags & MCL_FUTURE) { /* * must wire all future mappings; remember this. */ map->flags |= VM_MAP_WIREFUTURE; } if ((flags & MCL_CURRENT) == 0) { /* * no more work to do! */ UVMHIST_LOG(maphist,"<- done (OK no wire)",0,0,0,0); vm_map_unlock(map); return 0; } /* * wire case: in three passes [XXXCDC: ugly block of code here] * * 1: holding the write lock, count all pages mapped by non-wired * entries. if this would cause us to go over our limit, we fail. * * 2: still holding the write lock, we create any anonymous maps that * need to be created. then we increment its wiring count. * * 3: we downgrade to a read lock, and call uvm_fault_wire to fault * in the pages for any newly wired area (wired_count == 1). * * downgrading to a read lock for uvm_fault_wire avoids a possible * deadlock with another thread that may have faulted on one of * the pages to be wired (it would mark the page busy, blocking * us, then in turn block on the map lock that we hold). because * of problems in the recursive lock package, we cannot upgrade * to a write lock in vm_map_lookup. thus, any actions that * require the write lock must be done beforehand. because we * keep the read lock on the map, the copy-on-write status of the * entries we modify here cannot change. */ for (size = 0, entry = map->header.next; entry != &map->header; entry = entry->next) { if (entry->protection != VM_PROT_NONE && VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? 
*/ size += entry->end - entry->start; } } if (atop(size) + uvmexp.wired > uvmexp.wiredmax) { vm_map_unlock(map); return ENOMEM; } if (limit != 0 && (size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit)) { vm_map_unlock(map); return ENOMEM; } /* * Pass 2. */ for (entry = map->header.next; entry != &map->header; entry = entry->next) { if (entry->protection == VM_PROT_NONE) continue; if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */ /* * perform actions of vm_map_lookup that need the * write lock on the map: create an anonymous map * for a copy-on-write region, or an anonymous map * for a zero-fill region. (XXXCDC: submap case * ok?) */ if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */ if (UVM_ET_ISNEEDSCOPY(entry) && ((entry->max_protection & VM_PROT_WRITE) || (entry->object.uvm_obj == NULL))) { amap_copy(map, entry, 0, entry->start, entry->end); /* XXXCDC: wait OK? */ } } } entry->wired_count++; } /* * Pass 3. */ #ifdef DIAGNOSTIC timestamp_save = map->timestamp; #endif vm_map_busy(map); vm_map_unlock(map); rv = 0; for (entry = map->header.next; entry != &map->header; entry = entry->next) { if (entry->wired_count == 1) { rv = uvm_fault_wire(map, entry->start, entry->end, entry->max_protection, 1); if (rv) { /* * wiring failed. break out of the loop. * we'll clean up the map below, once we * have a write lock again. */ break; } } } if (rv) { /* * Get back an exclusive (write) lock. */ vm_map_lock(map); vm_map_unbusy(map); #ifdef DIAGNOSTIC if (timestamp_save + 1 != map->timestamp) panic("uvm_map_pageable_all: stale map"); #endif /* * first drop the wiring count on all the entries * which haven't actually been wired yet. * * Skip VM_PROT_NONE entries like we did above. */ failed_entry = entry; for (/* nothing */; entry != &map->header; entry = entry->next) { if (entry->protection == VM_PROT_NONE) continue; entry->wired_count--; } /* * now, unwire all the entries that were successfully * wired above. * * Skip VM_PROT_NONE entries like we did above. 
*/ for (entry = map->header.next; entry != failed_entry; entry = entry->next) { if (entry->protection == VM_PROT_NONE) continue; entry->wired_count--; if (VM_MAPENT_ISWIRED(entry)) uvm_map_entry_unwire(map, entry); } vm_map_unlock(map); UVMHIST_LOG(maphist,"<- done (RV=%jd)", rv,0,0,0); return (rv); } vm_map_unbusy(map); UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0); return 0; } /* * uvm_map_clean: clean out a map range * * => valid flags: * if (flags & PGO_CLEANIT): dirty pages are cleaned first * if (flags & PGO_SYNCIO): dirty pages are written synchronously * if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean * if (flags & PGO_FREE): any cached pages are freed after clean * => returns an error if any part of the specified range isn't mapped * => never a need to flush amap layer since the anonymous memory has * no permanent home, but may deactivate pages there * => called from sys_msync() and sys_madvise() * => caller must not have map locked */ int uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags) { struct vm_map_entry *current, *entry; struct uvm_object *uobj; struct vm_amap *amap; struct vm_anon *anon; struct vm_page *pg; vaddr_t offset; vsize_t size; voff_t uoff; int error, refs; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,flags=%#jx)", (uintptr_t)map, start, end, flags); KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) != (PGO_FREE|PGO_DEACTIVATE)); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!uvm_map_lookup_entry(map, start, &entry)) { vm_map_unlock(map); return EFAULT; } /* * Make a first pass to check for holes and wiring problems. */ for (current = entry; current->start < end; current = current->next) { if (UVM_ET_ISSUBMAP(current)) { vm_map_unlock(map); return EINVAL; } if ((flags & PGO_FREE) != 0 && VM_MAPENT_ISWIRED(entry)) { vm_map_unlock(map); return EBUSY; } if (end <= current->end) { break; } if (current->end != current->next->start) { vm_map_unlock(map); return EFAULT; } } vm_map_busy(map); vm_map_unlock(map); error = 0; for (current = entry; start < end; current = current->next) { amap = current->aref.ar_amap; /* upper layer */ uobj = current->object.uvm_obj; /* lower layer */ KASSERT(start >= current->start); /* * No amap cleaning necessary if: * * (1) There's no amap. * * (2) We're not deactivating or freeing pages. */ if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) goto flush_object; offset = start - current->start; size = MIN(end, current->end) - start; amap_lock(amap, RW_WRITER); for ( ; size != 0; size -= PAGE_SIZE, offset += PAGE_SIZE) { anon = amap_lookup(&current->aref, offset); if (anon == NULL) continue; KASSERT(anon->an_lock == amap->am_lock); pg = anon->an_page; if (pg == NULL) { continue; } if (pg->flags & PG_BUSY) { continue; } switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) { /* * In these first 3 cases, we just deactivate the page. */ case PGO_CLEANIT|PGO_FREE: case PGO_CLEANIT|PGO_DEACTIVATE: case PGO_DEACTIVATE: deactivate_it: /* * skip the page if it's loaned or wired, * since it shouldn't be on a paging queue * at all in these cases. */ if (pg->loan_count != 0 || pg->wire_count != 0) { continue; } KASSERT(pg->uanon == anon); uvm_pagelock(pg); uvm_pagedeactivate(pg); uvm_pageunlock(pg); continue; case PGO_FREE: /* * If there are multiple references to * the amap, just deactivate the page. 
*/ if (amap_refs(amap) > 1) goto deactivate_it; /* skip the page if it's wired */ if (pg->wire_count != 0) { continue; } amap_unadd(&current->aref, offset); refs = --anon->an_ref; if (refs == 0) { uvm_anfree(anon); } continue; } } amap_unlock(amap); flush_object: /* * flush pages if we've got a valid backing object. * note that we must always clean object pages before * freeing them since otherwise we could reveal stale * data from files. */ uoff = current->offset + (start - current->start); size = MIN(end, current->end) - start; if (uobj != NULL) { rw_enter(uobj->vmobjlock, RW_WRITER); if (uobj->pgops->pgo_put != NULL) error = (uobj->pgops->pgo_put)(uobj, uoff, uoff + size, flags | PGO_CLEANIT); else error = 0; } start += size; } vm_map_unbusy(map); return error; } /* * uvm_map_checkprot: check protection in map * * => must allow specified protection in a fully allocated region. * => map must be read or write locked by caller. */ bool uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end, vm_prot_t protection) { struct vm_map_entry *entry; struct vm_map_entry *tmp_entry; if (!uvm_map_lookup_entry(map, start, &tmp_entry)) { return (false); } entry = tmp_entry; while (start < end) { if (entry == &map->header) { return (false); } /* * no holes allowed */ if (start < entry->start) { return (false); } /* * check protection associated with entry */ if ((entry->protection & protection) != protection) { return (false); } start = entry->end; entry = entry->next; } return (true); } /* * uvmspace_alloc: allocate a vmspace structure. * * - structure includes vm_map and pmap * - XXX: no locking on this structure * - refcnt set to 1, rest must be init'd by caller */ struct vmspace * uvmspace_alloc(vaddr_t vmin, vaddr_t vmax, bool topdown) { struct vmspace *vm; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); vm = kmem_alloc(sizeof(*vm), KM_SLEEP); uvmspace_init(vm, NULL, vmin, vmax, topdown); UVMHIST_LOG(maphist,"<- done (vm=%#jx)", (uintptr_t)vm, 0, 0, 0); return (vm); } /* * uvmspace_init: initialize a vmspace structure. * * - XXX: no locking on this structure * - refcnt set to 1, rest must be init'd by caller */ void uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin, vaddr_t vmax, bool topdown) { UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, "(vm=%#jx, pmap=%#jx, vmin=%#jx, vmax=%#jx", (uintptr_t)vm, (uintptr_t)pmap, vmin, vmax); UVMHIST_LOG(maphist, " topdown=%ju)", topdown, 0, 0, 0); memset(vm, 0, sizeof(*vm)); uvm_map_setup(&vm->vm_map, vmin, vmax, VM_MAP_PAGEABLE | (topdown ? VM_MAP_TOPDOWN : 0) ); if (pmap) pmap_reference(pmap); else pmap = pmap_create(); vm->vm_map.pmap = pmap; vm->vm_refcnt = 1; UVMHIST_LOG(maphist,"<- done",0,0,0,0); } /* * uvmspace_share: share a vmspace between two processes * * - used for vfork, threads(?) 
*/ void uvmspace_share(struct proc *p1, struct proc *p2) { uvmspace_addref(p1->p_vmspace); p2->p_vmspace = p1->p_vmspace; } #if 0 /* * uvmspace_unshare: ensure that process "p" has its own, unshared, vmspace * * - XXX: no locking on vmspace */ void uvmspace_unshare(struct lwp *l) { struct proc *p = l->l_proc; struct vmspace *nvm, *ovm = p->p_vmspace; if (ovm->vm_refcnt == 1) /* nothing to do: vmspace isn't shared in the first place */ return; /* make a new vmspace, still holding old one */ nvm = uvmspace_fork(ovm); kpreempt_disable(); pmap_deactivate(l); /* unbind old vmspace */ p->p_vmspace = nvm; pmap_activate(l); /* switch to new vmspace */ kpreempt_enable(); uvmspace_free(ovm); /* drop reference to old vmspace */ } #endif /* * uvmspace_spawn: a new process has been spawned and needs a vmspace */ void uvmspace_spawn(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown) { struct proc *p = l->l_proc; struct vmspace *nvm; #ifdef __HAVE_CPU_VMSPACE_EXEC cpu_vmspace_exec(l, start, end); #endif nvm = uvmspace_alloc(start, end, topdown); kpreempt_disable(); p->p_vmspace = nvm; pmap_activate(l); kpreempt_enable(); } /* * uvmspace_exec: the process wants to exec a new program */ void uvmspace_exec(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown) { struct proc *p = l->l_proc; struct vmspace *nvm, *ovm = p->p_vmspace; struct vm_map *map; int flags; KASSERT(ovm != NULL); #ifdef __HAVE_CPU_VMSPACE_EXEC cpu_vmspace_exec(l, start, end); #endif map = &ovm->vm_map; /* * see if more than one process is using this vmspace... */ if (ovm->vm_refcnt == 1 && topdown == ((ovm->vm_map.flags & VM_MAP_TOPDOWN) != 0)) { /* * if p is the only process using its vmspace then we can safely * recycle that vmspace for the program that is being exec'd. * But only if TOPDOWN matches the requested value for the new * vm space! */ /* * SYSV SHM semantics require us to kill all segments on an exec */ if (uvm_shmexit && ovm->vm_shm) (*uvm_shmexit)(ovm); /* * POSIX 1003.1b -- "lock future mappings" is revoked * when a process execs another program image. */ map->flags &= ~VM_MAP_WIREFUTURE; /* * now unmap the old program. * * XXX set VM_MAP_DYING for the duration, so pmap_update() * is not called until the pmap has been totally cleared out * after pmap_remove_all(), or it can confuse some pmap * implementations. it would be nice to handle this by * deferring the pmap_update() while it is known the address * space is not visible to any user LWP other than curlwp, * but there isn't an elegant way of inferring that right * now. */ flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0; map->flags |= VM_MAP_DYING; uvm_unmap1(map, vm_map_min(map), vm_map_max(map), flags); map->flags &= ~VM_MAP_DYING; pmap_update(map->pmap); KASSERT(map->header.prev == &map->header); KASSERT(map->nentries == 0); /* * resize the map */ vm_map_setmin(map, start); vm_map_setmax(map, end); } else { /* * p's vmspace is being shared, so we can't reuse it for p since * it is still being used for others. allocate a new vmspace * for p */ nvm = uvmspace_alloc(start, end, topdown); /* * install new vmspace and drop our ref to the old one. */ kpreempt_disable(); pmap_deactivate(l); p->p_vmspace = nvm; pmap_activate(l); kpreempt_enable(); uvmspace_free(ovm); } } /* * uvmspace_addref: add a reference to a vmspace. 
*/ void uvmspace_addref(struct vmspace *vm) { KASSERT((vm->vm_map.flags & VM_MAP_DYING) == 0); KASSERT(vm->vm_refcnt > 0); atomic_inc_uint(&vm->vm_refcnt); } /* * uvmspace_free: free a vmspace data structure */ void uvmspace_free(struct vmspace *vm) { struct vm_map_entry *dead_entries; struct vm_map *map = &vm->vm_map; int flags; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(vm=%#jx) ref=%jd", (uintptr_t)vm, vm->vm_refcnt, 0, 0); membar_release(); if (atomic_dec_uint_nv(&vm->vm_refcnt) > 0) return; membar_acquire(); /* * at this point, there should be no other references to the map. * delete all of the mappings, then destroy the pmap. */ map->flags |= VM_MAP_DYING; flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0; /* Get rid of any SYSV shared memory segments. */ if (uvm_shmexit && vm->vm_shm != NULL) (*uvm_shmexit)(vm); if (map->nentries) { uvm_unmap_remove(map, vm_map_min(map), vm_map_max(map), &dead_entries, flags); if (dead_entries != NULL) uvm_unmap_detach(dead_entries, 0); } KASSERT(map->nentries == 0); KASSERT(map->size == 0); mutex_destroy(&map->misc_lock); rw_destroy(&map->lock); cv_destroy(&map->cv); pmap_destroy(map->pmap); kmem_free(vm, sizeof(*vm)); } static struct vm_map_entry * uvm_mapent_clone(struct vm_map *new_map, struct vm_map_entry *old_entry, int flags) { struct vm_map_entry *new_entry; new_entry = uvm_mapent_alloc(new_map, 0); /* old_entry -> new_entry */ uvm_mapent_copy(old_entry, new_entry); /* new pmap has nothing wired in it */ new_entry->wired_count = 0; /* * gain reference to object backing the map (can't * be a submap, already checked this case). */ if (new_entry->aref.ar_amap) uvm_map_reference_amap(new_entry, flags); if (new_entry->object.uvm_obj && new_entry->object.uvm_obj->pgops->pgo_reference) new_entry->object.uvm_obj->pgops->pgo_reference( new_entry->object.uvm_obj); /* insert entry at end of new_map's entry list */ uvm_map_entry_link(new_map, new_map->header.prev, new_entry); return new_entry; } /* * share the mapping: this means we want the old and * new entries to share amaps and backing objects. */ static void uvm_mapent_forkshared(struct vm_map *new_map, struct vm_map *old_map, struct vm_map_entry *old_entry) { /* * if the old_entry needs a new amap (due to prev fork) * then we need to allocate it now so that we have * something we own to share with the new_entry. [in * other words, we need to clear needs_copy] */ if (UVM_ET_ISNEEDSCOPY(old_entry)) { /* get our own amap, clears needs_copy */ amap_copy(old_map, old_entry, AMAP_COPY_NOCHUNK, 0, 0); /* XXXCDC: WAITOK??? */ } uvm_mapent_clone(new_map, old_entry, AMAP_SHARED); } static void uvm_mapent_forkcopy(struct vm_map *new_map, struct vm_map *old_map, struct vm_map_entry *old_entry) { struct vm_map_entry *new_entry; /* * copy-on-write the mapping (using mmap's * MAP_PRIVATE semantics) * * allocate new_entry, adjust reference counts. * (note that new references are read-only). */ new_entry = uvm_mapent_clone(new_map, old_entry, 0); new_entry->etype |= (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY); /* * the new entry will need an amap. it will either * need to be copied from the old entry or created * from scratch (if the old entry does not have an * amap). can we defer this process until later * (by setting "needs_copy") or do we need to copy * the amap now? * * we must copy the amap now if any of the following * conditions hold: * 1. the old entry has an amap and that amap is * being shared. this means that the old (parent) * process is sharing the amap with another * process. 
if we do not clear needs_copy here * we will end up in a situation where both the * parent and child process are referring to the * same amap with "needs_copy" set. if the * parent write-faults, the fault routine will * clear "needs_copy" in the parent by allocating * a new amap. this is wrong because the * parent is supposed to be sharing the old amap * and the new amap will break that. * * 2. if the old entry has an amap and a non-zero * wire count then we are going to have to call * amap_cow_now to avoid page faults in the * parent process. since amap_cow_now requires * "needs_copy" to be clear we might as well * clear it here as well. * */ if (old_entry->aref.ar_amap != NULL) { if ((amap_flags(old_entry->aref.ar_amap) & AMAP_SHARED) != 0 || VM_MAPENT_ISWIRED(old_entry)) { amap_copy(new_map, new_entry, AMAP_COPY_NOCHUNK, 0, 0); /* XXXCDC: M_WAITOK ... ok? */ } } /* * if the parent's entry is wired down, then the * parent process does not want page faults on * access to that memory. this means that we * cannot do copy-on-write because we can't write * protect the old entry. in this case we * resolve all copy-on-write faults now, using * amap_cow_now. note that we have already * allocated any needed amap (above). */ if (VM_MAPENT_ISWIRED(old_entry)) { /* * resolve all copy-on-write faults now * (note that there is nothing to do if * the old mapping does not have an amap). */ if (old_entry->aref.ar_amap) amap_cow_now(new_map, new_entry); } else { /* * setup mappings to trigger copy-on-write faults * we must write-protect the parent if it has * an amap and it is not already "needs_copy"... * if it is already "needs_copy" then the parent * has already been write-protected by a previous * fork operation. */ if (old_entry->aref.ar_amap && !UVM_ET_ISNEEDSCOPY(old_entry)) { if (old_entry->max_protection & VM_PROT_WRITE) { #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */ uvm_map_lock_entry(old_entry, RW_WRITER); #else uvm_map_lock_entry(old_entry, RW_READER); #endif pmap_protect(old_map->pmap, old_entry->start, old_entry->end, old_entry->protection & ~VM_PROT_WRITE); uvm_map_unlock_entry(old_entry); } old_entry->etype |= UVM_ET_NEEDSCOPY; } } } /* * zero the mapping: the new entry will be zero initialized */ static void uvm_mapent_forkzero(struct vm_map *new_map, struct vm_map *old_map, struct vm_map_entry *old_entry) { struct vm_map_entry *new_entry; new_entry = uvm_mapent_clone(new_map, old_entry, 0); new_entry->etype |= (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY); if (new_entry->aref.ar_amap) { uvm_map_unreference_amap(new_entry, 0); new_entry->aref.ar_pageoff = 0; new_entry->aref.ar_amap = NULL; } if (UVM_ET_ISOBJ(new_entry)) { if (new_entry->object.uvm_obj->pgops->pgo_detach) new_entry->object.uvm_obj->pgops->pgo_detach( new_entry->object.uvm_obj); new_entry->object.uvm_obj = NULL; new_entry->offset = 0; new_entry->etype &= ~UVM_ET_OBJ; } } /* * F O R K - m a i n e n t r y p o i n t */ /* * uvmspace_fork: fork a process' main map * * => create a new vmspace for child process from parent. * => parent's map must not be locked. 
*/ struct vmspace * uvmspace_fork(struct vmspace *vm1) { struct vmspace *vm2; struct vm_map *old_map = &vm1->vm_map; struct vm_map *new_map; struct vm_map_entry *old_entry; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); vm_map_lock(old_map); vm2 = uvmspace_alloc(vm_map_min(old_map), vm_map_max(old_map), vm1->vm_map.flags & VM_MAP_TOPDOWN); memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy, (char *) (vm1 + 1) - (char *) &vm1->vm_startcopy); new_map = &vm2->vm_map; /* XXX */ old_entry = old_map->header.next; new_map->size = old_map->size; /* * go entry-by-entry */ while (old_entry != &old_map->header) { /* * first, some sanity checks on the old entry */ KASSERT(!UVM_ET_ISSUBMAP(old_entry)); KASSERT(UVM_ET_ISCOPYONWRITE(old_entry) || !UVM_ET_ISNEEDSCOPY(old_entry)); switch (old_entry->inheritance) { case MAP_INHERIT_NONE: /* * drop the mapping, modify size */ new_map->size -= old_entry->end - old_entry->start; break; case MAP_INHERIT_SHARE: uvm_mapent_forkshared(new_map, old_map, old_entry); break; case MAP_INHERIT_COPY: uvm_mapent_forkcopy(new_map, old_map, old_entry); break; case MAP_INHERIT_ZERO: uvm_mapent_forkzero(new_map, old_map, old_entry); break; default: KASSERT(0); break; } old_entry = old_entry->next; } pmap_update(old_map->pmap); vm_map_unlock(old_map); if (uvm_shmfork && vm1->vm_shm) (*uvm_shmfork)(vm1, vm2); #ifdef PMAP_FORK pmap_fork(vm1->vm_map.pmap, vm2->vm_map.pmap); #endif UVMHIST_LOG(maphist,"<- done",0,0,0,0); return (vm2); } /* * uvm_mapent_trymerge: try to merge an entry with its neighbors. * * => called with map locked. * => return non zero if successfully merged. */ int uvm_mapent_trymerge(struct vm_map *map, struct vm_map_entry *entry, int flags) { struct uvm_object *uobj; struct vm_map_entry *next; struct vm_map_entry *prev; vsize_t size; int merged = 0; bool copying; int newetype; if (entry->aref.ar_amap != NULL) { return 0; } if ((entry->flags & UVM_MAP_NOMERGE) != 0) { return 0; } uobj = entry->object.uvm_obj; size = entry->end - entry->start; copying = (flags & UVM_MERGE_COPYING) != 0; newetype = copying ? 
(entry->etype & ~UVM_ET_NEEDSCOPY) : entry->etype; next = entry->next; if (next != &map->header && next->start == entry->end && ((copying && next->aref.ar_amap != NULL && amap_refs(next->aref.ar_amap) == 1) || (!copying && next->aref.ar_amap == NULL)) && UVM_ET_ISCOMPATIBLE(next, newetype, uobj, entry->flags, entry->protection, entry->max_protection, entry->inheritance, entry->advice, entry->wired_count) && (uobj == NULL || entry->offset + size == next->offset)) { int error; if (copying) { error = amap_extend(next, size, AMAP_EXTEND_NOWAIT|AMAP_EXTEND_BACKWARDS); } else { error = 0; } if (error == 0) { if (uobj) { if (uobj->pgops->pgo_detach) { uobj->pgops->pgo_detach(uobj); } } entry->end = next->end; clear_hints(map, next); uvm_map_entry_unlink(map, next); if (copying) { entry->aref = next->aref; entry->etype &= ~UVM_ET_NEEDSCOPY; } uvm_map_check(map, "trymerge forwardmerge"); uvm_mapent_free(next); merged++; } } prev = entry->prev; if (prev != &map->header && prev->end == entry->start && ((copying && !merged && prev->aref.ar_amap != NULL && amap_refs(prev->aref.ar_amap) == 1) || (!copying && prev->aref.ar_amap == NULL)) && UVM_ET_ISCOMPATIBLE(prev, newetype, uobj, entry->flags, entry->protection, entry->max_protection, entry->inheritance, entry->advice, entry->wired_count) && (uobj == NULL || prev->offset + prev->end - prev->start == entry->offset)) { int error; if (copying) { error = amap_extend(prev, size, AMAP_EXTEND_NOWAIT|AMAP_EXTEND_FORWARDS); } else { error = 0; } if (error == 0) { if (uobj) { if (uobj->pgops->pgo_detach) { uobj->pgops->pgo_detach(uobj); } entry->offset = prev->offset; } entry->start = prev->start; clear_hints(map, prev); uvm_map_entry_unlink(map, prev); if (copying) { entry->aref = prev->aref; entry->etype &= ~UVM_ET_NEEDSCOPY; } uvm_map_check(map, "trymerge backmerge"); uvm_mapent_free(prev); merged++; } } return merged; } /* * uvm_map_setup: init map * * => map must not be in service yet. */ void uvm_map_setup(struct vm_map *map, vaddr_t vmin, vaddr_t vmax, int flags) { rb_tree_init(&map->rb_tree, &uvm_map_tree_ops); map->header.next = map->header.prev = &map->header; map->nentries = 0; map->size = 0; map->ref_count = 1; vm_map_setmin(map, vmin); vm_map_setmax(map, vmax); map->flags = flags; map->first_free = &map->header; map->hint = &map->header; map->timestamp = 0; map->busy = NULL; rw_init(&map->lock); cv_init(&map->cv, "vm_map"); mutex_init(&map->misc_lock, MUTEX_DRIVER, IPL_NONE); } /* * U N M A P - m a i n e n t r y p o i n t */ /* * uvm_unmap1: remove mappings from a vm_map (from "start" up to "stop") * * => caller must check alignment and size * => map must be unlocked (we will lock it) * => flags is UVM_FLAG_QUANTUM or 0. */ void uvm_unmap1(struct vm_map *map, vaddr_t start, vaddr_t end, int flags) { struct vm_map_entry *dead_entries; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, " (map=%#jx, start=%#jx, end=%#jx)", (uintptr_t)map, start, end, 0); KASSERTMSG(start < end, "%s: map %p: start %#jx < end %#jx", __func__, map, (uintmax_t)start, (uintmax_t)end); if (map == kernel_map) { LOCKDEBUG_MEM_CHECK((void *)start, end - start); } /* * work now done by helper functions. wipe the pmap's and then * detach from the dead entries... 
*/ vm_map_lock(map); uvm_unmap_remove(map, start, end, &dead_entries, flags); vm_map_unlock(map); if (dead_entries != NULL) uvm_unmap_detach(dead_entries, 0); UVMHIST_LOG(maphist, "<- done", 0,0,0,0); } /* * uvm_map_reference: add reference to a map * * => map need not be locked */ void uvm_map_reference(struct vm_map *map) { atomic_inc_uint(&map->ref_count); } void uvm_map_lock_entry(struct vm_map_entry *entry, krw_t op) { if (entry->aref.ar_amap != NULL) { amap_lock(entry->aref.ar_amap, op); } if (UVM_ET_ISOBJ(entry)) { rw_enter(entry->object.uvm_obj->vmobjlock, op); } } void uvm_map_unlock_entry(struct vm_map_entry *entry) { if (UVM_ET_ISOBJ(entry)) { rw_exit(entry->object.uvm_obj->vmobjlock); } if (entry->aref.ar_amap != NULL) { amap_unlock(entry->aref.ar_amap); } } #define UVM_VOADDR_TYPE_MASK 0x3UL #define UVM_VOADDR_TYPE_UOBJ 0x1UL #define UVM_VOADDR_TYPE_ANON 0x2UL #define UVM_VOADDR_OBJECT_MASK ~UVM_VOADDR_TYPE_MASK #define UVM_VOADDR_GET_TYPE(voa) \ ((voa)->object & UVM_VOADDR_TYPE_MASK) #define UVM_VOADDR_GET_OBJECT(voa) \ ((voa)->object & UVM_VOADDR_OBJECT_MASK) #define UVM_VOADDR_SET_OBJECT(voa, obj, type) \ do { \ KASSERT(((uintptr_t)(obj) & UVM_VOADDR_TYPE_MASK) == 0); \ (voa)->object = ((uintptr_t)(obj)) | (type); \ } while (/*CONSTCOND*/0) #define UVM_VOADDR_GET_UOBJ(voa) \ ((struct uvm_object *)UVM_VOADDR_GET_OBJECT(voa)) #define UVM_VOADDR_SET_UOBJ(voa, uobj) \ UVM_VOADDR_SET_OBJECT(voa, uobj, UVM_VOADDR_TYPE_UOBJ) #define UVM_VOADDR_GET_ANON(voa) \ ((struct vm_anon *)UVM_VOADDR_GET_OBJECT(voa)) #define UVM_VOADDR_SET_ANON(voa, anon) \ UVM_VOADDR_SET_OBJECT(voa, anon, UVM_VOADDR_TYPE_ANON) /* * uvm_voaddr_acquire: returns the virtual object address corresponding * to the specified virtual address. * * => resolves COW so the true page identity is tracked. * * => acquires a reference on the page's owner (uvm_object or vm_anon) */ bool uvm_voaddr_acquire(struct vm_map * const map, vaddr_t const va, struct uvm_voaddr * const voaddr) { struct vm_map_entry *entry; struct vm_anon *anon = NULL; bool result = false; bool exclusive = false; void (*unlock_fn)(struct vm_map *); UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); UVMHIST_LOG(maphist,"(map=%#jx,va=%#jx)", (uintptr_t)map, va, 0, 0); const vaddr_t start = trunc_page(va); const vaddr_t end = round_page(va+1); lookup_again: if (__predict_false(exclusive)) { vm_map_lock(map); unlock_fn = vm_map_unlock; } else { vm_map_lock_read(map); unlock_fn = vm_map_unlock_read; } if (__predict_false(!uvm_map_lookup_entry(map, start, &entry))) { unlock_fn(map); UVMHIST_LOG(maphist,"<- done (no entry)",0,0,0,0); return false; } if (__predict_false(entry->protection == VM_PROT_NONE)) { unlock_fn(map); UVMHIST_LOG(maphist,"<- done (PROT_NONE)",0,0,0,0); return false; } /* * We have a fast path for the common case of "no COW resolution * needed" whereby we have taken a read lock on the map and if * we don't encounter any need to create a vm_anon then great! * But if we do, we loop around again, instead taking an exclusive * lock so that we can perform the fault. * * In the event that we have to resolve the fault, we do nearly the * same work as uvm_map_pageable() does: * * 1: holding the write lock, we create any anonymous maps that need * to be created. however, we do NOT need to clip the map entries * in this case. * * 2: we downgrade to a read lock, and call uvm_fault_wire to fault * in the page (assuming the entry is not already wired). this * is done because we need the vm_anon to be present. 
*/ if (__predict_true(!VM_MAPENT_ISWIRED(entry))) { bool need_fault = false; /* * perform the action of vm_map_lookup that need the * write lock on the map: create an anonymous map for * a copy-on-write region, or an anonymous map for * a zero-fill region. */ if (__predict_false(UVM_ET_ISSUBMAP(entry))) { unlock_fn(map); UVMHIST_LOG(maphist,"<- done (submap)",0,0,0,0); return false; } if (__predict_false(UVM_ET_ISNEEDSCOPY(entry) && ((entry->max_protection & VM_PROT_WRITE) || (entry->object.uvm_obj == NULL)))) { if (!exclusive) { /* need to take the slow path */ KASSERT(unlock_fn == vm_map_unlock_read); vm_map_unlock_read(map); exclusive = true; goto lookup_again; } need_fault = true; amap_copy(map, entry, 0, start, end); /* XXXCDC: wait OK? */ } /* * do a quick check to see if the fault has already * been resolved to the upper layer. */ if (__predict_true(entry->aref.ar_amap != NULL && need_fault == false)) { amap_lock(entry->aref.ar_amap, RW_WRITER); anon = amap_lookup(&entry->aref, start - entry->start); if (__predict_true(anon != NULL)) { /* amap unlocked below */ goto found_anon; } amap_unlock(entry->aref.ar_amap); need_fault = true; } /* * we predict this test as false because if we reach * this point, then we are likely dealing with a * shared memory region backed by a uvm_object, in * which case a fault to create the vm_anon is not * necessary. */ if (__predict_false(need_fault)) { if (exclusive) { vm_map_busy(map); vm_map_unlock(map); unlock_fn = vm_map_unbusy; } if (uvm_fault_wire(map, start, end, entry->max_protection, 1)) { /* wiring failed */ unlock_fn(map); UVMHIST_LOG(maphist,"<- done (wire failed)", 0,0,0,0); return false; } /* * now that we have resolved the fault, we can unwire * the page. */ if (exclusive) { vm_map_lock(map); vm_map_unbusy(map); unlock_fn = vm_map_unlock; } uvm_fault_unwire_locked(map, start, end); } } /* check the upper layer */ if (entry->aref.ar_amap) { amap_lock(entry->aref.ar_amap, RW_WRITER); anon = amap_lookup(&entry->aref, start - entry->start); if (anon) { found_anon: KASSERT(anon->an_lock == entry->aref.ar_amap->am_lock); anon->an_ref++; rw_obj_hold(anon->an_lock); KASSERT(anon->an_ref != 0); UVM_VOADDR_SET_ANON(voaddr, anon); voaddr->offset = va & PAGE_MASK; result = true; } amap_unlock(entry->aref.ar_amap); } /* check the lower layer */ if (!result && UVM_ET_ISOBJ(entry)) { struct uvm_object *uobj = entry->object.uvm_obj; KASSERT(uobj != NULL); (*uobj->pgops->pgo_reference)(uobj); UVM_VOADDR_SET_UOBJ(voaddr, uobj); voaddr->offset = entry->offset + (va - entry->start); result = true; } unlock_fn(map); if (result) { UVMHIST_LOG(maphist, "<- done OK (type=%jd,owner=%#jx,offset=%#jx)", UVM_VOADDR_GET_TYPE(voaddr), UVM_VOADDR_GET_OBJECT(voaddr), voaddr->offset, 0); } else { UVMHIST_LOG(maphist,"<- done (failed)",0,0,0,0); } return result; } /* * uvm_voaddr_release: release the references held by the * vitual object address. 
*/ void uvm_voaddr_release(struct uvm_voaddr * const voaddr) { switch (UVM_VOADDR_GET_TYPE(voaddr)) { case UVM_VOADDR_TYPE_UOBJ: { struct uvm_object * const uobj = UVM_VOADDR_GET_UOBJ(voaddr); KASSERT(uobj != NULL); KASSERT(uobj->pgops->pgo_detach != NULL); (*uobj->pgops->pgo_detach)(uobj); break; } case UVM_VOADDR_TYPE_ANON: { struct vm_anon * const anon = UVM_VOADDR_GET_ANON(voaddr); krwlock_t *lock; KASSERT(anon != NULL); rw_enter((lock = anon->an_lock), RW_WRITER); KASSERT(anon->an_ref > 0); if (--anon->an_ref == 0) { uvm_anfree(anon); } rw_exit(lock); rw_obj_free(lock); break; } default: panic("uvm_voaddr_release: bad type"); } memset(voaddr, 0, sizeof(*voaddr)); } /* * uvm_voaddr_compare: compare two uvm_voaddr objects. * * => memcmp() semantics */ int uvm_voaddr_compare(const struct uvm_voaddr * const voaddr1, const struct uvm_voaddr * const voaddr2) { const uintptr_t type1 = UVM_VOADDR_GET_TYPE(voaddr1); const uintptr_t type2 = UVM_VOADDR_GET_TYPE(voaddr2); KASSERT(type1 == UVM_VOADDR_TYPE_UOBJ || type1 == UVM_VOADDR_TYPE_ANON); KASSERT(type2 == UVM_VOADDR_TYPE_UOBJ || type2 == UVM_VOADDR_TYPE_ANON); if (type1 < type2) return -1; if (type1 > type2) return 1; const uintptr_t addr1 = UVM_VOADDR_GET_OBJECT(voaddr1); const uintptr_t addr2 = UVM_VOADDR_GET_OBJECT(voaddr2); if (addr1 < addr2) return -1; if (addr1 > addr2) return 1; if (voaddr1->offset < voaddr2->offset) return -1; if (voaddr1->offset > voaddr2->offset) return 1; return 0; } #if defined(DDB) || defined(DEBUGPRINT) /* * uvm_map_printit: actually prints the map */ void uvm_map_printit(struct vm_map *map, bool full, void (*pr)(const char *, ...)) { struct vm_map_entry *entry; (*pr)("MAP %p: [%#lx->%#lx]\n", map, vm_map_min(map), vm_map_max(map)); (*pr)("\t#ent=%d, sz=%d, ref=%d, version=%d, flags=%#x\n", map->nentries, map->size, map->ref_count, map->timestamp, map->flags); (*pr)("\tpmap=%p(resident=%ld, wired=%ld)\n", map->pmap, pmap_resident_count(map->pmap), pmap_wired_count(map->pmap)); if (!full) return; for (entry = map->header.next; entry != &map->header; entry = entry->next) { (*pr)(" - %p: %#lx->%#lx: obj=%p/%#llx, amap=%p/%d\n", entry, entry->start, entry->end, entry->object.uvm_obj, (long long)entry->offset, entry->aref.ar_amap, entry->aref.ar_pageoff); (*pr)( "\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, " "wc=%d, adv=%d%s\n", (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F', (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F', (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F', entry->protection, entry->max_protection, entry->inheritance, entry->wired_count, entry->advice, entry == map->first_free ? 
" (first_free)" : ""); } } void uvm_whatis(uintptr_t addr, void (*pr)(const char *, ...)) { struct vm_map *map; for (map = kernel_map;;) { struct vm_map_entry *entry; if (!uvm_map_lookup_entry_bytree(map, (vaddr_t)addr, &entry)) { break; } (*pr)("%p is %p+%zu from VMMAP %p\n", (void *)addr, (void *)entry->start, (size_t)(addr - (uintptr_t)entry->start), map); if (!UVM_ET_ISSUBMAP(entry)) { break; } map = entry->object.sub_map; } } #endif /* DDB || DEBUGPRINT */ #ifndef __USER_VA0_IS_SAFE static int sysctl_user_va0_disable(SYSCTLFN_ARGS) { struct sysctlnode node; int t, error; node = *rnode; node.sysctl_data = &t; t = user_va0_disable; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); if (!t && user_va0_disable && kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MAP_VA_ZERO, 0, NULL, NULL, NULL)) return EPERM; user_va0_disable = !!t; return 0; } #endif static int fill_vmentry(struct lwp *l, struct proc *p, struct kinfo_vmentry *kve, struct vm_map *m, struct vm_map_entry *e) { #ifndef _RUMPKERNEL int error; memset(kve, 0, sizeof(*kve)); KASSERT(e != NULL); if (UVM_ET_ISOBJ(e)) { struct uvm_object *uobj = e->object.uvm_obj; KASSERT(uobj != NULL); kve->kve_ref_count = uobj->uo_refs; kve->kve_count = uobj->uo_npages; if (UVM_OBJ_IS_VNODE(uobj)) { struct vattr va; struct vnode *vp = (struct vnode *)uobj; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, l->l_cred); VOP_UNLOCK(vp); kve->kve_type = KVME_TYPE_VNODE; if (error == 0) { kve->kve_vn_size = vp->v_size; kve->kve_vn_type = (int)vp->v_type; kve->kve_vn_mode = va.va_mode; kve->kve_vn_rdev = va.va_rdev; kve->kve_vn_fileid = va.va_fileid; kve->kve_vn_fsid = va.va_fsid; error = vnode_to_path(kve->kve_path, sizeof(kve->kve_path) / 2, vp, l, p); } } else if (UVM_OBJ_IS_KERN_OBJECT(uobj)) { kve->kve_type = KVME_TYPE_KERN; } else if (UVM_OBJ_IS_DEVICE(uobj)) { kve->kve_type = KVME_TYPE_DEVICE; } else if (UVM_OBJ_IS_AOBJ(uobj)) { kve->kve_type = KVME_TYPE_ANON; } else { kve->kve_type = KVME_TYPE_OBJECT; } } else if (UVM_ET_ISSUBMAP(e)) { struct vm_map *map = e->object.sub_map; KASSERT(map != NULL); kve->kve_ref_count = map->ref_count; kve->kve_count = map->nentries; kve->kve_type = KVME_TYPE_SUBMAP; } else kve->kve_type = KVME_TYPE_UNKNOWN; kve->kve_start = e->start; kve->kve_end = e->end; kve->kve_offset = e->offset; kve->kve_wired_count = e->wired_count; kve->kve_inheritance = e->inheritance; kve->kve_attributes = 0; /* unused */ kve->kve_advice = e->advice; #define PROT(p) (((p) & VM_PROT_READ) ? KVME_PROT_READ : 0) | \ (((p) & VM_PROT_WRITE) ? KVME_PROT_WRITE : 0) | \ (((p) & VM_PROT_EXECUTE) ? KVME_PROT_EXEC : 0) kve->kve_protection = PROT(e->protection); kve->kve_max_protection = PROT(e->max_protection); kve->kve_flags |= (e->etype & UVM_ET_COPYONWRITE) ? KVME_FLAG_COW : 0; kve->kve_flags |= (e->etype & UVM_ET_NEEDSCOPY) ? KVME_FLAG_NEEDS_COPY : 0; kve->kve_flags |= (m->flags & VM_MAP_TOPDOWN) ? KVME_FLAG_GROWS_DOWN : KVME_FLAG_GROWS_UP; kve->kve_flags |= (m->flags & VM_MAP_PAGEABLE) ? 
KVME_FLAG_PAGEABLE : 0; #endif return 0; } static int fill_vmentries(struct lwp *l, pid_t pid, u_int elem_size, void *oldp, size_t *oldlenp) { int error; struct proc *p; struct kinfo_vmentry *vme; struct vmspace *vm; struct vm_map *map; struct vm_map_entry *entry; char *dp; size_t count, vmesize; if (elem_size == 0 || elem_size > 2 * sizeof(*vme)) return EINVAL; if (oldp) { if (*oldlenp > 10UL * 1024UL * 1024UL) return E2BIG; count = *oldlenp / elem_size; if (count == 0) return ENOMEM; vmesize = count * sizeof(*vme); } else vmesize = 0; if ((error = proc_find_locked(l, &p, pid)) != 0) return error; vme = NULL; count = 0; if ((error = proc_vmspace_getref(p, &vm)) != 0) goto out; map = &vm->vm_map; vm_map_lock_read(map); dp = oldp; if (oldp) vme = kmem_alloc(vmesize, KM_SLEEP); for (entry = map->header.next; entry != &map->header; entry = entry->next) { if (oldp && (dp - (char *)oldp) < vmesize) { error = fill_vmentry(l, p, &vme[count], map, entry); if (error) goto out; dp += elem_size; } count++; } vm_map_unlock_read(map); uvmspace_free(vm); out: if (pid != -1) mutex_exit(p->p_lock); if (error == 0) { const u_int esize = uimin(sizeof(*vme), elem_size); dp = oldp; for (size_t i = 0; i < count; i++) { if (oldp && (dp - (char *)oldp) < vmesize) { error = sysctl_copyout(l, &vme[i], dp, esize); if (error) break; dp += elem_size; } else break; } count *= elem_size; if (oldp != NULL && *oldlenp < count) error = ENOSPC; *oldlenp = count; } if (vme) kmem_free(vme, vmesize); return error; } static int sysctl_vmproc(SYSCTLFN_ARGS) { int error; if (namelen == 1 && name[0] == CTL_QUERY) return (sysctl_query(SYSCTLFN_CALL(rnode))); if (namelen == 0) return EINVAL; switch (name[0]) { case VM_PROC_MAP: if (namelen != 3) return EINVAL; sysctl_unlock(); error = fill_vmentries(l, name[1], name[2], oldp, oldlenp); sysctl_relock(); return error; default: return EINVAL; } } SYSCTL_SETUP(sysctl_uvmmap_setup, "sysctl uvmmap setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "proc", SYSCTL_DESCR("Process vm information"), sysctl_vmproc, 0, NULL, 0, CTL_VM, VM_PROC, CTL_EOL); #ifndef __USER_VA0_IS_SAFE sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "user_va0_disable", SYSCTL_DESCR("Disable VA 0"), sysctl_user_va0_disable, 0, &user_va0_disable, 0, CTL_VM, CTL_CREATE, CTL_EOL); #endif }
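/*
 * Illustrative sketch (not part of the original source): how a kernel
 * caller might use the uvm_voaddr interface defined above to decide
 * whether two user virtual addresses resolve to the same backing page
 * identity (anon, or uobj + offset), in the style of a futex-like
 * consumer.  The function name example_voaddr_same_identity and its
 * calling convention are hypothetical; only uvm_voaddr_acquire(),
 * uvm_voaddr_compare() and uvm_voaddr_release(), defined earlier in
 * this file, are real.  Kept under #if 0 so it is never compiled.
 */
#if 0
static bool
example_voaddr_same_identity(struct vm_map *map, vaddr_t va1, vaddr_t va2)
{
	struct uvm_voaddr voa1, voa2;
	bool same;

	/* Resolve each VA to its virtual object address (may fault). */
	if (!uvm_voaddr_acquire(map, va1, &voa1))
		return false;
	if (!uvm_voaddr_acquire(map, va2, &voa2)) {
		uvm_voaddr_release(&voa1);
		return false;
	}

	/* memcmp()-style comparison: 0 means same owner and offset. */
	same = (uvm_voaddr_compare(&voa1, &voa2) == 0);

	/* Drop the references taken by uvm_voaddr_acquire(). */
	uvm_voaddr_release(&voa2);
	uvm_voaddr_release(&voa1);
	return same;
}
#endif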
/*	$NetBSD: pmap_pvt.c,v 1.15 2022/05/08 22:03:02 rin Exp $	*/

/*-
 * Copyright (c) 2014, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__RCSID("$NetBSD: pmap_pvt.c,v 1.15 2022/05/08 22:03:02 rin Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/pserialize.h>

#include <uvm/uvm.h>
#include <uvm/pmap/pmap_pvt.h>

#if !defined(PMAP_PV_TRACK_ONLY_STUBS)
/*
 * unmanaged pv-tracked ranges
 *
 * This is a linear list for now because the only users are the DRM
 * graphics drivers, with a single tracked range per device, for the
 * graphics aperture, so there are expected to be few of them.
 *
 * This is used only after the VM system is initialized well enough
 * that we can use kmem_alloc.
 */

struct pv_track {
	paddr_t			pvt_start;
	psize_t			pvt_size;
	struct pv_track		*pvt_next;
	struct pmap_page	pvt_pages[];
};

static struct {
	kmutex_t	lock;
	pserialize_t	psz;
	struct pv_track	*list;
} pv_unmanaged __cacheline_aligned;

void
pmap_pv_init(void)
{

	mutex_init(&pv_unmanaged.lock, MUTEX_DEFAULT, IPL_NONE);
	pv_unmanaged.psz = pserialize_create();
	pv_unmanaged.list = NULL;
}

void
pmap_pv_track(paddr_t start, psize_t size)
{
	struct pv_track *pvt;
	size_t npages;

	KASSERT(start == trunc_page(start));
	KASSERT(size == trunc_page(size));

	/* We may sleep for allocation. */
	ASSERT_SLEEPABLE();

	npages = size >> PAGE_SHIFT;
	pvt = kmem_zalloc(offsetof(struct pv_track, pvt_pages[npages]),
	    KM_SLEEP);
	pvt->pvt_start = start;
	pvt->pvt_size = size;

#ifdef PMAP_PAGE_INIT
	for (size_t i = 0; i < npages; i++)
		PMAP_PAGE_INIT(&pvt->pvt_pages[i]);
#endif

	mutex_enter(&pv_unmanaged.lock);
	pvt->pvt_next = pv_unmanaged.list;
	atomic_store_release(&pv_unmanaged.list, pvt);
	mutex_exit(&pv_unmanaged.lock);
}

void
pmap_pv_untrack(paddr_t start, psize_t size)
{
	struct pv_track **pvtp, *pvt;
	size_t npages;

	KASSERT(start == trunc_page(start));
	KASSERT(size == trunc_page(size));

	/* We may sleep for pserialize_perform. */
	ASSERT_SLEEPABLE();

	mutex_enter(&pv_unmanaged.lock);
	for (pvtp = &pv_unmanaged.list;
	     (pvt = *pvtp) != NULL;
	     pvtp = &pvt->pvt_next) {
		if (pvt->pvt_start != start)
			continue;
		if (pvt->pvt_size != size)
			panic("pmap_pv_untrack: pv-tracking at 0x%"PRIxPADDR
			    ": 0x%"PRIxPSIZE" bytes, not 0x%"PRIxPSIZE" bytes",
			    pvt->pvt_start, pvt->pvt_size, size);

		/*
		 * Remove from list.  Readers can safely see the old
		 * and new states of the list.
		 */
		atomic_store_relaxed(pvtp, pvt->pvt_next);

		/* Wait for readers who can see the old state to finish. */
		pserialize_perform(pv_unmanaged.psz);

		/*
		 * We now have exclusive access to pvt and can destroy
		 * it.  Poison it to catch bugs.
		 */
		explicit_memset(&pvt->pvt_next, 0x1a, sizeof pvt->pvt_next);
		goto out;
	}
	panic("pmap_pv_untrack: pages not pv-tracked at 0x%"PRIxPADDR
	    " (0x%"PRIxPSIZE" bytes)", start, size);
out:	mutex_exit(&pv_unmanaged.lock);

	npages = size >> PAGE_SHIFT;
	kmem_free(pvt, offsetof(struct pv_track, pvt_pages[npages]));
}

struct pmap_page *
pmap_pv_tracked(paddr_t pa)
{
	struct pv_track *pvt;
	size_t pgno;
	int s;

	KASSERT(pa == trunc_page(pa));

	s = pserialize_read_enter();
	for (pvt = atomic_load_consume(&pv_unmanaged.list);
	     pvt != NULL;
	     pvt = pvt->pvt_next) {
		if ((pvt->pvt_start <= pa) &&
		    ((pa - pvt->pvt_start) < pvt->pvt_size))
			break;
	}
	pserialize_read_exit(s);

	if (pvt == NULL)
		return NULL;
	KASSERT(pvt->pvt_start <= pa);
	KASSERT((pa - pvt->pvt_start) < pvt->pvt_size);
	pgno = (pa - pvt->pvt_start) >> PAGE_SHIFT;
	return &pvt->pvt_pages[pgno];
}

#else /* PMAP_PV_TRACK_ONLY_STUBS */

/*
 * Provide empty stubs just for MODULAR kernels.
 */

void
pmap_pv_init(void)
{

}

struct pmap_page *
pmap_pv_tracked(paddr_t pa)
{

	return NULL;
}

#if notdef
/*
 * pmap_pv_{,un}track() are intentionally commented out.  If modules
 * called these functions, the result would be an inconsistent state.
 *
 * Such modules require real PV-tracking support.  Let us make the
 * two symbols undefined, and prevent these modules from being loaded.
 */
void
pmap_pv_track(paddr_t start, psize_t size)
{

	panic("PV-tracking not supported");
}

void
pmap_pv_untrack(paddr_t start, psize_t size)
{

	panic("PV-tracking not supported");
}
#endif /* notdef */

#endif /* PMAP_PV_TRACK_ONLY_STUBS */
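/*
 * Illustrative sketch (not part of the original source): the expected
 * calling pattern for the pv-tracking interface above, roughly as a
 * DRM-style driver might use it for a graphics aperture.  The names
 * example_aperture_attach/detach/example_is_tracked and their argument
 * conventions are hypothetical; pmap_pv_track(), pmap_pv_untrack() and
 * pmap_pv_tracked() are the real entry points defined above.  Kept
 * under #if 0 so it is never compiled.
 */
#if 0
static void
example_aperture_attach(paddr_t aperture_pa, psize_t aperture_size)
{

	/* Both arguments must be page-aligned; may sleep for allocation. */
	pmap_pv_track(aperture_pa, aperture_size);
}

static void
example_aperture_detach(paddr_t aperture_pa, psize_t aperture_size)
{

	/* Must name the exact range previously passed to pmap_pv_track(). */
	pmap_pv_untrack(aperture_pa, aperture_size);
}

static bool
example_is_tracked(paddr_t pa)
{

	/* pmap_pv_tracked() returns the pmap_page for a tracked PA, or NULL. */
	return pmap_pv_tracked(pa) != NULL;
}
#endif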
/*	$NetBSD: kern_fileassoc.c,v 1.38 2023/12/28 12:49:06 hannken Exp $	*/

/*-
 * Copyright (c) 2006 Elad Efrat <elad@NetBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_fileassoc.c,v 1.38 2023/12/28 12:49:06 hannken Exp $"); #include "opt_fileassoc.h" #include <sys/param.h> #include <sys/mount.h> #include <sys/queue.h> #include <sys/vnode.h> #include <sys/errno.h> #include <sys/fileassoc.h> #include <sys/specificdata.h> #include <sys/hash.h> #include <sys/kmem.h> #include <sys/once.h> #include <sys/mutex.h> #include <sys/xcall.h> #define FILEASSOC_INITIAL_TABLESIZE 128 static specificdata_domain_t fileassoc_domain = NULL; static specificdata_key_t fileassoc_mountspecific_key; static ONCE_DECL(control); /* * Assoc entry. * Includes the assoc name for identification and private clear callback. */ struct fileassoc { LIST_ENTRY(fileassoc) assoc_list; const char *assoc_name; /* Name. */ fileassoc_cleanup_cb_t assoc_cleanup_cb; /* Clear callback. */ specificdata_key_t assoc_key; }; static LIST_HEAD(, fileassoc) fileassoc_list; /* An entry in the per-mount hash table. */ struct fileassoc_file { fhandle_t *faf_handle; /* File handle */ specificdata_reference faf_data; /* Assoc data. */ u_int faf_nassocs; /* # of assocs. */ LIST_ENTRY(fileassoc_file) faf_list; /* List pointer. */ }; LIST_HEAD(fileassoc_hash_entry, fileassoc_file); struct fileassoc_table { struct fileassoc_hash_entry *tbl_hash; u_long tbl_mask; /* Hash table mask. */ size_t tbl_nslots; /* Number of slots. */ size_t tbl_nused; /* # of used slots. */ specificdata_reference tbl_data; }; /* * Hashing function: Takes a number modulus the mask to give back an * index into the hash table. */ #define FILEASSOC_HASH(tbl, handle) \ (hash32_buf((handle), FHANDLE_SIZE(handle), HASH32_BUF_INIT) \ & ((tbl)->tbl_mask)) /* * Global usage counting. This is bad for parallelism of updates, but * good for avoiding calls to fileassoc when it's not in use. Unclear * if parallelism of updates matters much. If you want to improve * fileassoc(9) update performance, feel free to rip this out as long * as you don't cause the fast paths to take any global locks or incur * memory barriers when fileassoc(9) is not in use. 
*/ static struct { kmutex_t lock; uint64_t nassocs; volatile bool inuse; } fileassoc_global __cacheline_aligned; static void fileassoc_incuse(void) { mutex_enter(&fileassoc_global.lock); if (fileassoc_global.nassocs++ == 0) { KASSERT(!fileassoc_global.inuse); atomic_store_relaxed(&fileassoc_global.inuse, true); xc_barrier(0); } mutex_exit(&fileassoc_global.lock); } static void fileassoc_decuse(void) { mutex_enter(&fileassoc_global.lock); KASSERT(fileassoc_global.nassocs > 0); KASSERT(fileassoc_global.inuse); if (--fileassoc_global.nassocs == 0) atomic_store_relaxed(&fileassoc_global.inuse, false); mutex_exit(&fileassoc_global.lock); } static bool fileassoc_inuse(void) { return __predict_false(atomic_load_relaxed(&fileassoc_global.inuse)); } static void * file_getdata(struct fileassoc_file *faf, const struct fileassoc *assoc) { return specificdata_getspecific(fileassoc_domain, &faf->faf_data, assoc->assoc_key); } static void file_setdata(struct fileassoc_file *faf, const struct fileassoc *assoc, void *data) { specificdata_setspecific(fileassoc_domain, &faf->faf_data, assoc->assoc_key, data); } static void file_cleanup(struct fileassoc_file *faf, const struct fileassoc *assoc) { fileassoc_cleanup_cb_t cb; void *data; cb = assoc->assoc_cleanup_cb; if (cb == NULL) { return; } data = file_getdata(faf, assoc); (*cb)(data); } static void file_free(struct fileassoc_file *faf) { struct fileassoc *assoc; LIST_REMOVE(faf, faf_list); LIST_FOREACH(assoc, &fileassoc_list, assoc_list) { file_cleanup(faf, assoc); fileassoc_decuse(); } vfs_composefh_free(faf->faf_handle); specificdata_fini(fileassoc_domain, &faf->faf_data); kmem_free(faf, sizeof(*faf)); } static void table_dtor(void *v) { struct fileassoc_table *tbl = v; u_long i; /* Remove all entries from the table and lists */ for (i = 0; i < tbl->tbl_nslots; i++) { struct fileassoc_file *faf; while ((faf = LIST_FIRST(&tbl->tbl_hash[i])) != NULL) { file_free(faf); } } /* Remove hash table and sysctl node */ hashdone(tbl->tbl_hash, HASH_LIST, tbl->tbl_mask); specificdata_fini(fileassoc_domain, &tbl->tbl_data); kmem_free(tbl, sizeof(*tbl)); } /* * Initialize the fileassoc subsystem. */ static int fileassoc_init(void) { int error; error = mount_specific_key_create(&fileassoc_mountspecific_key, table_dtor); if (error) { return error; } fileassoc_domain = specificdata_domain_create(); mutex_init(&fileassoc_global.lock, MUTEX_DEFAULT, IPL_NONE); return 0; } /* * Register a new assoc. */ int fileassoc_register(const char *name, fileassoc_cleanup_cb_t cleanup_cb, fileassoc_t *result) { int error; specificdata_key_t key; struct fileassoc *assoc; error = RUN_ONCE(&control, fileassoc_init); if (error) { return error; } error = specificdata_key_create(fileassoc_domain, &key, NULL); if (error) { return error; } assoc = kmem_alloc(sizeof(*assoc), KM_SLEEP); assoc->assoc_name = name; assoc->assoc_cleanup_cb = cleanup_cb; assoc->assoc_key = key; LIST_INSERT_HEAD(&fileassoc_list, assoc, assoc_list); *result = assoc; return 0; } /* * Deregister an assoc. */ int fileassoc_deregister(fileassoc_t assoc) { LIST_REMOVE(assoc, assoc_list); specificdata_key_delete(fileassoc_domain, assoc->assoc_key); kmem_free(assoc, sizeof(*assoc)); return 0; } /* * Get the hash table for the specified device. 
*/ static struct fileassoc_table * fileassoc_table_lookup(struct mount *mp) { int error; if (!fileassoc_inuse()) return NULL; error = RUN_ONCE(&control, fileassoc_init); if (error) { return NULL; } return mount_getspecific(mp, fileassoc_mountspecific_key); } /* * Perform a lookup on a hash table. If hint is non-zero then use the value * of the hint as the identifier instead of performing a lookup for the * fileid. */ static struct fileassoc_file * fileassoc_file_lookup(struct vnode *vp, fhandle_t *hint) { struct fileassoc_table *tbl; struct fileassoc_hash_entry *hash_entry; struct fileassoc_file *faf; size_t indx; fhandle_t *th; int error; tbl = fileassoc_table_lookup(vp->v_mount); if (tbl == NULL) { return NULL; } if (hint == NULL) { error = vfs_composefh_alloc(vp, &th); if (error) return (NULL); } else { th = hint; } indx = FILEASSOC_HASH(tbl, th); hash_entry = &(tbl->tbl_hash[indx]); LIST_FOREACH(faf, hash_entry, faf_list) { if (((FHANDLE_FILEID(faf->faf_handle)->fid_len == FHANDLE_FILEID(th)->fid_len)) && (memcmp(FHANDLE_FILEID(faf->faf_handle), FHANDLE_FILEID(th), (FHANDLE_FILEID(th))->fid_len) == 0)) { break; } } if (hint == NULL) vfs_composefh_free(th); return faf; } /* * Return assoc data associated with a vnode. */ void * fileassoc_lookup(struct vnode *vp, fileassoc_t assoc) { struct fileassoc_file *faf; faf = fileassoc_file_lookup(vp, NULL); if (faf == NULL) return (NULL); return file_getdata(faf, assoc); } static struct fileassoc_table * fileassoc_table_resize(struct fileassoc_table *tbl) { struct fileassoc_table *newtbl; u_long i; /* * Allocate a new table. Like the condition in fileassoc_file_add(), * this is also temporary -- just double the number of slots. */ newtbl = kmem_zalloc(sizeof(*newtbl), KM_SLEEP); newtbl->tbl_nslots = (tbl->tbl_nslots * 2); if (newtbl->tbl_nslots < tbl->tbl_nslots) newtbl->tbl_nslots = tbl->tbl_nslots; newtbl->tbl_hash = hashinit(newtbl->tbl_nslots, HASH_LIST, true, &newtbl->tbl_mask); newtbl->tbl_nused = 0; specificdata_init(fileassoc_domain, &newtbl->tbl_data); /* XXX we need to make sure nothing uses fileassoc here! */ for (i = 0; i < tbl->tbl_nslots; i++) { struct fileassoc_file *faf; while ((faf = LIST_FIRST(&tbl->tbl_hash[i])) != NULL) { struct fileassoc_hash_entry *hash_entry; size_t indx; LIST_REMOVE(faf, faf_list); indx = FILEASSOC_HASH(newtbl, faf->faf_handle); hash_entry = &(newtbl->tbl_hash[indx]); LIST_INSERT_HEAD(hash_entry, faf, faf_list); newtbl->tbl_nused++; } } if (tbl->tbl_nused != newtbl->tbl_nused) panic("fileassoc_table_resize: inconsistency detected! " "needed %zu entries, got %zu", tbl->tbl_nused, newtbl->tbl_nused); hashdone(tbl->tbl_hash, HASH_LIST, tbl->tbl_mask); specificdata_fini(fileassoc_domain, &tbl->tbl_data); kmem_free(tbl, sizeof(*tbl)); return (newtbl); } /* * Create a new fileassoc table. */ static struct fileassoc_table * fileassoc_table_add(struct mount *mp) { struct fileassoc_table *tbl; /* Check for existing table for device. */ tbl = fileassoc_table_lookup(mp); if (tbl != NULL) return (tbl); /* Allocate and initialize a table. */ tbl = kmem_zalloc(sizeof(*tbl), KM_SLEEP); tbl->tbl_nslots = FILEASSOC_INITIAL_TABLESIZE; tbl->tbl_hash = hashinit(tbl->tbl_nslots, HASH_LIST, true, &tbl->tbl_mask); tbl->tbl_nused = 0; specificdata_init(fileassoc_domain, &tbl->tbl_data); mount_setspecific(mp, fileassoc_mountspecific_key, tbl); return (tbl); } /* * Delete a table. 
*/ int fileassoc_table_delete(struct mount *mp) { struct fileassoc_table *tbl; tbl = fileassoc_table_lookup(mp); if (tbl == NULL) return (EEXIST); mount_setspecific(mp, fileassoc_mountspecific_key, NULL); table_dtor(tbl); return (0); } /* * Run a callback for each assoc in a table. */ int fileassoc_table_run(struct mount *mp, fileassoc_t assoc, fileassoc_cb_t cb, void *cookie) { struct fileassoc_table *tbl; u_long i; tbl = fileassoc_table_lookup(mp); if (tbl == NULL) return (EEXIST); for (i = 0; i < tbl->tbl_nslots; i++) { struct fileassoc_file *faf; LIST_FOREACH(faf, &tbl->tbl_hash[i], faf_list) { void *data; data = file_getdata(faf, assoc); if (data != NULL) cb(data, cookie); } } return (0); } /* * Clear a table for a given assoc. */ int fileassoc_table_clear(struct mount *mp, fileassoc_t assoc) { struct fileassoc_table *tbl; u_long i; tbl = fileassoc_table_lookup(mp); if (tbl == NULL) return (EEXIST); for (i = 0; i < tbl->tbl_nslots; i++) { struct fileassoc_file *faf; LIST_FOREACH(faf, &tbl->tbl_hash[i], faf_list) { file_cleanup(faf, assoc); file_setdata(faf, assoc, NULL); /* XXX missing faf->faf_nassocs--? */ fileassoc_decuse(); } } return (0); } /* * Add a file entry to a table. */ static struct fileassoc_file * fileassoc_file_add(struct vnode *vp, fhandle_t *hint) { struct fileassoc_table *tbl; struct fileassoc_hash_entry *hash_entry; struct fileassoc_file *faf; size_t indx; fhandle_t *th; int error; if (hint == NULL) { error = vfs_composefh_alloc(vp, &th); if (error) return (NULL); } else th = hint; faf = fileassoc_file_lookup(vp, th); if (faf != NULL) { if (hint == NULL) vfs_composefh_free(th); return (faf); } tbl = fileassoc_table_lookup(vp->v_mount); if (tbl == NULL) { tbl = fileassoc_table_add(vp->v_mount); } indx = FILEASSOC_HASH(tbl, th); hash_entry = &(tbl->tbl_hash[indx]); faf = kmem_zalloc(sizeof(*faf), KM_SLEEP); faf->faf_handle = th; specificdata_init(fileassoc_domain, &faf->faf_data); LIST_INSERT_HEAD(hash_entry, faf, faf_list); /* * This decides when we need to resize the table. For now, * resize it whenever we "filled" up the number of slots it * has. That's not really true unless of course we had zero * collisions. Think positive! :) */ if (++(tbl->tbl_nused) == tbl->tbl_nslots) { struct fileassoc_table *newtbl; newtbl = fileassoc_table_resize(tbl); mount_setspecific(vp->v_mount, fileassoc_mountspecific_key, newtbl); } return (faf); } /* * Delete a file entry from a table. */ int fileassoc_file_delete(struct vnode *vp) { struct fileassoc_table *tbl; struct fileassoc_file *faf; if (!fileassoc_inuse()) return ENOENT; KERNEL_LOCK(1, NULL); faf = fileassoc_file_lookup(vp, NULL); if (faf == NULL) { KERNEL_UNLOCK_ONE(NULL); return (ENOENT); } file_free(faf); tbl = fileassoc_table_lookup(vp->v_mount); KASSERT(tbl != NULL); --(tbl->tbl_nused); /* XXX gc? */ KERNEL_UNLOCK_ONE(NULL); return (0); } /* * Add an assoc to a vnode. */ int fileassoc_add(struct vnode *vp, fileassoc_t assoc, void *data) { struct fileassoc_file *faf; void *olddata; faf = fileassoc_file_lookup(vp, NULL); if (faf == NULL) { faf = fileassoc_file_add(vp, NULL); if (faf == NULL) return (ENOTDIR); } olddata = file_getdata(faf, assoc); if (olddata != NULL) return (EEXIST); fileassoc_incuse(); file_setdata(faf, assoc, data); faf->faf_nassocs++; return (0); } /* * Clear an assoc from a vnode. 
*/ int fileassoc_clear(struct vnode *vp, fileassoc_t assoc) { struct fileassoc_file *faf; faf = fileassoc_file_lookup(vp, NULL); if (faf == NULL) return (ENOENT); file_cleanup(faf, assoc); file_setdata(faf, assoc, NULL); --(faf->faf_nassocs); /* XXX gc? */ fileassoc_decuse(); return (0); }
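/*
 * Illustrative consumer of the fileassoc(9) interface implemented above; a
 * minimal sketch, not part of the sources.  The "example" names and the tag
 * structure are hypothetical.  The cleanup callback frees the datum, which is
 * what file_free()/fileassoc_clear() above expect of it.
 */
#include <sys/fileassoc.h>
#include <sys/kmem.h>
#include <sys/vnode.h>

struct example_tag {
	int	et_flags;
};

static fileassoc_t example_assoc;

static void
example_cleanup(void *data)
{
	/* Called when the file entry or table holding our datum goes away. */
	if (data != NULL)
		kmem_free(data, sizeof(struct example_tag));
}

static int
example_init(void)
{
	return fileassoc_register("example", example_cleanup, &example_assoc);
}

static int
example_tag_vnode(struct vnode *vp, int flags)
{
	struct example_tag *tag;
	int error;

	tag = kmem_zalloc(sizeof(*tag), KM_SLEEP);
	tag->et_flags = flags;
	error = fileassoc_add(vp, example_assoc, tag);	/* EEXIST if tagged */
	if (error)
		kmem_free(tag, sizeof(*tag));
	return error;
}

static int
example_check_vnode(struct vnode *vp)
{
	struct example_tag *tag = fileassoc_lookup(vp, example_assoc);

	return tag != NULL ? tag->et_flags : 0;
}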
/* $NetBSD: kern_time_60.c,v 1.3 2020/01/29 15:47:51 ad Exp $ */ /*- * Copyright (c) 2013, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christos Zoulas. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_time_60.c,v 1.3 2020/01/29 15:47:51 ad Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/lwp.h> #include <sys/time.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <compat/common/compat_util.h> #include <compat/common/compat_mod.h> static const struct syscall_package compat_60_syscalls[] = { { SYS_compat_60__lwp_park, 0, (sy_call_t *)compat_60_sys__lwp_park }, { 0, 0, NULL } }; int compat_60_sys__lwp_park(struct lwp *l, const struct compat_60_sys__lwp_park_args *uap, register_t *retval) { /* { syscallarg(const struct timespec *) ts; syscallarg(lwpid_t) unpark; syscallarg(const void *) hint; syscallarg(const void *) unparkhint; } */ int error; struct timespec ts, *tsp; if (SCARG(uap, ts) == NULL) tsp = NULL; else { error = copyin(SCARG(uap, ts), &ts, sizeof(ts)); if (error != 0) return error; tsp = &ts; } if (SCARG(uap, unpark) != 0) { error = lwp_unpark(&SCARG(uap, unpark), 1); if (error != 0) return error; } return lwp_park(CLOCK_REALTIME, TIMER_ABSTIME, tsp); } int kern_time_60_init(void) { return syscall_establish(NULL, compat_60_syscalls); } int kern_time_60_fini(void) { return syscall_disestablish(NULL, compat_60_syscalls); }
/* $NetBSD: vm_43.c,v 1.21 2019/01/27 02:08:39 pgoyette Exp $ */ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94 */ /* * Mapped file (mmap) interface to VM */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vm_43.c,v 1.21 2019/01/27 02:08:39 pgoyette Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/resourcevar.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/file.h> #include <sys/mman.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <miscfs/specfs/specdev.h> #include <compat/common/compat_mod.h> static struct syscall_package vm_43_syscalls[] = { { SYS_compat_43_ogetpagesize, 0, (sy_call_t *)compat_43_sys_getpagesize }, { SYS_compat_43_ommap, 0, (sy_call_t *)compat_43_sys_mmap }, { 0, 0, NULL } }; /* ARGSUSED */ int compat_43_sys_getpagesize(struct lwp *l, const void *v, register_t *retval) { *retval = PAGE_SIZE; return (0); } int compat_43_sys_mmap(struct lwp *l, const struct compat_43_sys_mmap_args *uap, register_t *retval) { /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(int) prot; syscallarg(int) flags; syscallarg(int) fd; syscallarg(long) pos; } */ struct sys_mmap_args /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(int) prot; syscallarg(int) flags; syscallarg(int) fd; syscallarg(long) pad; syscallarg(off_t) pos; } */ nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC|PROT_WRITE, PROT_READ, PROT_EXEC|PROT_READ, PROT_WRITE|PROT_READ, PROT_EXEC|PROT_WRITE|PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 #define OMAP_INHERIT 0x0800 SCARG(&nargs, addr) = SCARG(uap, addr); SCARG(&nargs, len) = SCARG(uap, len); /* Note: index using prot is sign-safe due to mask */ SCARG(&nargs, prot) = cvtbsdprot[SCARG(uap, prot)&0x7]; SCARG(&nargs, flags) = 0; if (SCARG(uap, flags) & OMAP_ANON) SCARG(&nargs, flags) |= MAP_ANON; if (SCARG(uap, flags) & OMAP_SHARED) SCARG(&nargs, flags) |= MAP_SHARED; else SCARG(&nargs, flags) |= MAP_PRIVATE; if (SCARG(uap, flags) & OMAP_COPY) { SCARG(&nargs, flags) |= MAP_PRIVATE; #if defined(COMPAT_10) && defined(__i386__) /* * Ancient kernel on x86 did not obey PROT_EXEC on i386 at least * and ld.so did not turn it on. We take care of this on amd64 * in compat32. */ SCARG(&nargs, prot) |= PROT_EXEC; #endif } if (SCARG(uap, flags) & OMAP_FIXED) SCARG(&nargs, flags) |= MAP_FIXED; if (SCARG(uap, flags) & OMAP_INHERIT) SCARG(&nargs, flags) |= MAP_INHERIT; SCARG(&nargs, fd) = SCARG(uap, fd); SCARG(&nargs, pos) = SCARG(uap, pos); return (sys_mmap(l, &nargs, retval)); } int vm_43_init(void) { return syscall_establish(NULL, vm_43_syscalls); } int vm_43_fini(void) { return syscall_disestablish(NULL, vm_43_syscalls); }
/* $NetBSD: uvm_glue.c,v 1.182 2023/10/04 20:34:19 ad Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. * * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * from: Id: uvm_glue.c,v 1.1.2.8 1998/02/07 01:16:54 chs Exp * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.182 2023/10/04 20:34:19 ad Exp $"); #include "opt_kgdb.h" #include "opt_kstack.h" #include "opt_uvmhist.h" /* * uvm_glue.c: glue functions */ #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/buf.h> #include <sys/syncobj.h> #include <sys/cpu.h> #include <sys/atomic.h> #include <sys/lwp.h> #include <sys/asan.h> #include <uvm/uvm.h> #include <uvm/uvm_pdpolicy.h> #include <uvm/uvm_pgflcache.h> /* * uvm_kernacc: test if kernel can access a memory region. * * => Currently used only by /dev/kmem driver (dev/mm.c). */ bool uvm_kernacc(void *addr, size_t len, vm_prot_t prot) { vaddr_t saddr = trunc_page((vaddr_t)addr); vaddr_t eaddr = round_page(saddr + len); bool rv; vm_map_lock_read(kernel_map); rv = uvm_map_checkprot(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return rv; } #ifdef KGDB /* * Change protections on kernel pages from addr to addr+len * (presumably so debugger can plant a breakpoint). * * We force the protection change at the pmap level. If we were * to use vm_map_protect a change to allow writing would be lazily- * applied meaning we would still take a protection fault, something * we really don't want to do. It would also fragment the kernel * map unnecessarily. We cannot use pmap_protect since it also won't * enforce a write-enable request. Using pmap_enter is the only way * we can ensure the change takes place properly. */ void uvm_chgkprot(void *addr, size_t len, int rw) { vm_prot_t prot; paddr_t pa; vaddr_t sva, eva; prot = rw == B_READ ? VM_PROT_READ : VM_PROT_READ|VM_PROT_WRITE; eva = round_page((vaddr_t)addr + len); for (sva = trunc_page((vaddr_t)addr); sva < eva; sva += PAGE_SIZE) { /* * Extract physical address for the page. 
*/ if (pmap_extract(pmap_kernel(), sva, &pa) == false) panic("%s: invalid page", __func__); pmap_enter(pmap_kernel(), sva, pa, prot, PMAP_WIRED); } pmap_update(pmap_kernel()); } #endif /* * uvm_vslock: wire user memory for I/O * * - called from physio and sys___sysctl * - XXXCDC: consider nuking this (or making it a macro?) */ int uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access_type) { struct vm_map *map; vaddr_t start, end; int error; map = &vs->vm_map; start = trunc_page((vaddr_t)addr); end = round_page((vaddr_t)addr + len); error = uvm_fault_wire(map, start, end, access_type, 0); return error; } /* * uvm_vsunlock: unwire user memory wired by uvm_vslock() * * - called from physio and sys___sysctl * - XXXCDC: consider nuking this (or making it a macro?) */ void uvm_vsunlock(struct vmspace *vs, void *addr, size_t len) { uvm_fault_unwire(&vs->vm_map, trunc_page((vaddr_t)addr), round_page((vaddr_t)addr + len)); } /* * uvm_proc_fork: fork a virtual address space * * - the address space is copied as per parent map's inherit values */ void uvm_proc_fork(struct proc *p1, struct proc *p2, bool shared) { if (shared == true) { p2->p_vmspace = NULL; uvmspace_share(p1, p2); } else { p2->p_vmspace = uvmspace_fork(p1->p_vmspace); } cpu_proc_fork(p1, p2); } /* * uvm_lwp_fork: fork a thread * * - a new PCB structure is allocated for the child process, * and filled in by MD layer * - if specified, the child gets a new user stack described by * stack and stacksize * - NOTE: the kernel stack may be at a different location in the child * process, and thus addresses of automatic variables may be invalid * after cpu_lwp_fork returns in the child process. We do nothing here * after cpu_lwp_fork returns. */ void uvm_lwp_fork(struct lwp *l1, struct lwp *l2, void *stack, size_t stacksize, void (*func)(void *), void *arg) { /* Fill stack with magic number. */ kstack_setup_magic(l2); /* * cpu_lwp_fork() copy and update the pcb, and make the child ready * to run. If this is a normal user fork, the child will exit * directly to user mode via child_return() on its first time * slice and will not return here. If this is a kernel thread, * the specified entry point will be executed. 
*/ cpu_lwp_fork(l1, l2, stack, stacksize, func, arg); } #ifndef USPACE_ALIGN #define USPACE_ALIGN 0 #endif static pool_cache_t uvm_uarea_cache; #if defined(__HAVE_CPU_UAREA_ROUTINES) static pool_cache_t uvm_uarea_system_cache; #else #define uvm_uarea_system_cache uvm_uarea_cache #endif static void * uarea_poolpage_alloc(struct pool *pp, int flags) { KASSERT((flags & PR_WAITOK) != 0); #if defined(PMAP_MAP_POOLPAGE) while (USPACE == PAGE_SIZE && (USPACE_ALIGN == 0 || USPACE_ALIGN == PAGE_SIZE)) { struct vm_page *pg; vaddr_t va; #if defined(PMAP_ALLOC_POOLPAGE) pg = PMAP_ALLOC_POOLPAGE(0); #else pg = uvm_pagealloc(NULL, 0, NULL, 0); #endif if (pg == NULL) { uvm_wait("uarea"); continue; } va = PMAP_MAP_POOLPAGE(VM_PAGE_TO_PHYS(pg)); KASSERT(va != 0); return (void *)va; } #endif #if defined(__HAVE_CPU_UAREA_ROUTINES) void *va = cpu_uarea_alloc(false); if (va) return (void *)va; #endif return (void *)uvm_km_alloc(kernel_map, pp->pr_alloc->pa_pagesz, USPACE_ALIGN, UVM_KMF_WIRED | UVM_KMF_WAITVA); } static void uarea_poolpage_free(struct pool *pp, void *addr) { #if defined(PMAP_MAP_POOLPAGE) if (USPACE == PAGE_SIZE && (USPACE_ALIGN == 0 || USPACE_ALIGN == PAGE_SIZE)) { paddr_t pa; pa = PMAP_UNMAP_POOLPAGE((vaddr_t) addr); KASSERT(pa != 0); uvm_pagefree(PHYS_TO_VM_PAGE(pa)); return; } #endif #if defined(__HAVE_CPU_UAREA_ROUTINES) if (cpu_uarea_free(addr)) return; #endif uvm_km_free(kernel_map, (vaddr_t)addr, pp->pr_alloc->pa_pagesz, UVM_KMF_WIRED); } static struct pool_allocator uvm_uarea_allocator = { .pa_alloc = uarea_poolpage_alloc, .pa_free = uarea_poolpage_free, .pa_pagesz = USPACE, }; #if defined(__HAVE_CPU_UAREA_ROUTINES) static void * uarea_system_poolpage_alloc(struct pool *pp, int flags) { void * const va = cpu_uarea_alloc(true); if (va != NULL) return va; return (void *)uvm_km_alloc(kernel_map, pp->pr_alloc->pa_pagesz, USPACE_ALIGN, UVM_KMF_WIRED | ((flags & PR_WAITOK) ? UVM_KMF_WAITVA : (UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK))); } static void uarea_system_poolpage_free(struct pool *pp, void *addr) { if (cpu_uarea_free(addr)) return; uvm_km_free(kernel_map, (vaddr_t)addr, pp->pr_alloc->pa_pagesz, UVM_KMF_WIRED); } static struct pool_allocator uvm_uarea_system_allocator = { .pa_alloc = uarea_system_poolpage_alloc, .pa_free = uarea_system_poolpage_free, .pa_pagesz = USPACE, }; #endif /* __HAVE_CPU_UAREA_ROUTINES */ void uvm_uarea_init(void) { int flags = PR_NOTOUCH; /* * specify PR_NOALIGN unless the alignment provided by * the backend (USPACE_ALIGN) is sufficient to provide * pool page size (UPSACE) alignment. 
*/ if ((USPACE_ALIGN == 0 && USPACE != PAGE_SIZE) || (USPACE_ALIGN % USPACE) != 0) { flags |= PR_NOALIGN; } uvm_uarea_cache = pool_cache_init(USPACE, USPACE_ALIGN, 0, flags, "uarea", &uvm_uarea_allocator, IPL_NONE, NULL, NULL, NULL); #if defined(__HAVE_CPU_UAREA_ROUTINES) uvm_uarea_system_cache = pool_cache_init(USPACE, USPACE_ALIGN, 0, flags, "uareasys", &uvm_uarea_system_allocator, IPL_NONE, NULL, NULL, NULL); #endif } /* * uvm_uarea_alloc: allocate a u-area */ vaddr_t uvm_uarea_alloc(void) { return (vaddr_t)pool_cache_get(uvm_uarea_cache, PR_WAITOK); } vaddr_t uvm_uarea_system_alloc(struct cpu_info *ci) { #ifdef __HAVE_CPU_UAREA_ALLOC_IDLELWP if (__predict_false(ci != NULL)) return cpu_uarea_alloc_idlelwp(ci); #endif return (vaddr_t)pool_cache_get(uvm_uarea_system_cache, PR_WAITOK); } /* * uvm_uarea_free: free a u-area */ void uvm_uarea_free(vaddr_t uaddr) { kasan_mark((void *)uaddr, USPACE, USPACE, 0); pool_cache_put(uvm_uarea_cache, (void *)uaddr); } void uvm_uarea_system_free(vaddr_t uaddr) { kasan_mark((void *)uaddr, USPACE, USPACE, 0); pool_cache_put(uvm_uarea_system_cache, (void *)uaddr); } vaddr_t uvm_lwp_getuarea(lwp_t *l) { return (vaddr_t)l->l_addr - UAREA_PCB_OFFSET; } void uvm_lwp_setuarea(lwp_t *l, vaddr_t addr) { l->l_addr = (void *)(addr + UAREA_PCB_OFFSET); } /* * uvm_proc_exit: exit a virtual address space * * - borrow proc0's address space because freeing the vmspace * of the dead process may block. */ void uvm_proc_exit(struct proc *p) { struct lwp *l = curlwp; /* XXX */ struct vmspace *ovm; KASSERT(p == l->l_proc); ovm = p->p_vmspace; KASSERT(ovm != NULL); if (__predict_false(ovm == proc0.p_vmspace)) return; /* * borrow proc0's address space. */ kpreempt_disable(); pmap_deactivate(l); p->p_vmspace = proc0.p_vmspace; pmap_activate(l); kpreempt_enable(); uvmspace_free(ovm); } void uvm_lwp_exit(struct lwp *l) { vaddr_t va = uvm_lwp_getuarea(l); bool system = (l->l_flag & LW_SYSTEM) != 0; if (system) uvm_uarea_system_free(va); else uvm_uarea_free(va); #ifdef DIAGNOSTIC uvm_lwp_setuarea(l, (vaddr_t)NULL); #endif } /* * uvm_init_limit: init per-process VM limits * * - called for process 0 and then inherited by all others. */ void uvm_init_limits(struct proc *p) { /* * Set up the initial limits on process VM. Set the maximum * resident set size to be all of (reasonably) available memory. * This causes any single, large process to start random page * replacement once it fills memory. */ p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ; p->p_rlimit[RLIMIT_STACK].rlim_max = maxsmap; p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ; p->p_rlimit[RLIMIT_DATA].rlim_max = maxdmap; p->p_rlimit[RLIMIT_AS].rlim_cur = RLIM_INFINITY; p->p_rlimit[RLIMIT_AS].rlim_max = RLIM_INFINITY; p->p_rlimit[RLIMIT_RSS].rlim_cur = MIN(VM_MAXUSER_ADDRESS, ctob((rlim_t)uvm_availmem(false))); } /* * uvm_scheduler: process zero main loop. */ extern struct loadavg averunnable; void uvm_scheduler(void) { lwp_t *l = curlwp; lwp_lock(l); l->l_class = SCHED_FIFO; lwp_changepri(l, PRI_VM); lwp_unlock(l); /* Start the freelist cache. */ uvm_pgflcache_start(); for (;;) { /* Update legacy stats for post-mortem debugging. */ uvm_update_uvmexp(); /* See if the pagedaemon needs to generate some free pages. */ uvm_kick_pdaemon(); /* Calculate process statistics. */ sched_pstats(); (void)kpause("uvm", false, hz, NULL); } } /* * uvm_idle: called from the idle loop. */ void uvm_idle(void) { struct cpu_info *ci = curcpu(); struct uvm_cpu *ucpu = ci->ci_data.cpu_uvm; KASSERT(kpreempt_disabled()); uvmpdpol_idle(ucpu); }
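/*
 * Sketch of the uvm_vslock()/uvm_vsunlock() pattern described above (the
 * callers named there are physio and sys___sysctl); the surrounding function
 * is hypothetical and only illustrates bracketing direct access to a user
 * buffer with wire/unwire calls.
 */
#include <sys/param.h>
#include <sys/proc.h>
#include <uvm/uvm_extern.h>

static int
example_touch_user_buffer(void *uaddr, size_t len)
{
	struct vmspace *vs = curproc->p_vmspace;
	int error;

	/* Wire the user pages so they cannot be paged out underneath us. */
	error = uvm_vslock(vs, uaddr, len, VM_PROT_READ | VM_PROT_WRITE);
	if (error)
		return error;

	/* ... access the buffer directly, e.g. hand it to a device ... */

	/* Unwire once the access is complete. */
	uvm_vsunlock(vs, uaddr, len);
	return 0;
}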
/* $NetBSD: wsevent.c,v 1.47 2021/09/26 01:16:10 thorpej Exp $ */ /*- * Copyright (c) 2006, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1996, 1997 Christopher G. Demetriou. All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christopher G. Demetriou * for the NetBSD Project. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)event.c 8.1 (Berkeley) 6/11/93 */ /* * Internal "wscons_event" queue interface for the keyboard and mouse drivers. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: wsevent.c,v 1.47 2021/09/26 01:16:10 thorpej Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_modular.h" #endif #include <sys/param.h> #include <sys/kernel.h> #include <sys/fcntl.h> #include <sys/kmem.h> #include <sys/proc.h> #include <sys/systm.h> #include <sys/vnode.h> #include <sys/select.h> #include <sys/poll.h> #include <sys/compat_stub.h> #include <sys/sysctl.h> #include <dev/wscons/wsconsio.h> #include <dev/wscons/wseventvar.h> /* * Size of a wsevent queue (measured in number of events). * Should be a power of two so that `%' is fast. * At the moment, the value below makes the queues use 2 Kbytes each; this * value may need tuning. */ #define WSEVENT_QSIZE 256 #define EVSIZE(ver) ((ver) == WSEVENT_VERSION ? \ sizeof(struct wscons_event) : \ sizeof(struct owscons_event)) #define EVARRAY(ev, idx) (&(ev)->q[(idx)]) static int wsevent_default_version = WSEVENT_VERSION; /* * Priority of code managing wsevent queues. PWSEVENT is set just above * PSOCK, which is just above TTIPRI, on the theory that mouse and keyboard * `user' input should be quick. */ #define PWSEVENT 23 #define splwsevent() spltty() static void wsevent_intr(void *); /* * Initialize a wscons_event queue. */ void wsevent_init(struct wseventvar *ev, struct proc *p) { if (ev->q != NULL) { #ifdef DIAGNOSTIC printf("wsevent_init: already init\n"); #endif return; } /* * For binary compat set default version and either build with * COMPAT_50 or load COMPAT_50 module to include the compatibility * code. */ if (wsevent_default_version >= 0 && wsevent_default_version < WSEVENT_VERSION) ev->version = wsevent_default_version; else ev->version = WSEVENT_VERSION; ev->get = ev->put = 0; ev->q = kmem_alloc(WSEVENT_QSIZE * sizeof(*ev->q), KM_SLEEP); selinit(&ev->sel); ev->io = p; ev->sih = softint_establish(SOFTINT_MPSAFE | SOFTINT_CLOCK, wsevent_intr, ev); } /* * Tear down a wscons_event queue. */ void wsevent_fini(struct wseventvar *ev) { if (ev->q == NULL) { #ifdef DIAGNOSTIC printf("wsevent_fini: already fini\n"); #endif return; } seldestroy(&ev->sel); kmem_free(ev->q, WSEVENT_QSIZE * sizeof(*ev->q)); ev->q = NULL; softint_disestablish(ev->sih); } static int wsevent_copyout_events(const struct wscons_event *events, int cnt, struct uio *uio, int ver) { int error; switch (ver) { case 0: MODULE_HOOK_CALL(wscons_copyout_events_50_hook, (events, cnt, uio), enosys(), error); if (error == ENOSYS) error = EINVAL; return error; case WSEVENT_VERSION: return uiomove(__UNCONST(events), cnt * sizeof(*events), uio); default: panic("%s: unknown version %d", __func__, ver); } } /* * User-level interface: read, poll. * (User cannot write an event queue.) */ int wsevent_read(struct wseventvar *ev, struct uio *uio, int flags) { int s, n, cnt, error; const int ver = ev->version; const size_t evsize = EVSIZE(ver); /* * Make sure we can return at least 1. */ if (uio->uio_resid < evsize) return (EMSGSIZE); /* ??? */ s = splwsevent(); while (ev->get == ev->put) { if (flags & IO_NDELAY) { splx(s); return (EWOULDBLOCK); } ev->wanted = 1; error = tsleep(ev, PWSEVENT | PCATCH, "wsevent_read", 0); if (error) { splx(s); return (error); } } /* * Move wscons_event from tail end of queue (there is at least one * there). 
*/ if (ev->put < ev->get) cnt = WSEVENT_QSIZE - ev->get; /* events in [get..QSIZE) */ else cnt = ev->put - ev->get; /* events in [get..put) */ splx(s); n = howmany(uio->uio_resid, evsize); if (cnt > n) cnt = n; error = wsevent_copyout_events(EVARRAY(ev, ev->get), cnt, uio, ver); n -= cnt; /* * If we do not wrap to 0, used up all our space, or had an error, * stop. Otherwise move from front of queue to put index, if there * is anything there to move. */ if ((ev->get = (ev->get + cnt) % WSEVENT_QSIZE) != 0 || n == 0 || error || (cnt = ev->put) == 0) return (error); if (cnt > n) cnt = n; error = wsevent_copyout_events(EVARRAY(ev, 0), cnt, uio, ver); ev->get = cnt; return (error); } int wsevent_poll(struct wseventvar *ev, int events, struct lwp *l) { int revents = 0; int s = splwsevent(); if (events & (POLLIN | POLLRDNORM)) { if (ev->get != ev->put) revents |= events & (POLLIN | POLLRDNORM); else selrecord(l, &ev->sel); } splx(s); return (revents); } static void filt_wseventrdetach(struct knote *kn) { struct wseventvar *ev = kn->kn_hook; int s; s = splwsevent(); selremove_knote(&ev->sel, kn); splx(s); } static int filt_wseventread(struct knote *kn, long hint) { struct wseventvar *ev = kn->kn_hook; if (ev->get == ev->put) return (0); if (ev->get < ev->put) kn->kn_data = ev->put - ev->get; else kn->kn_data = (WSEVENT_QSIZE - ev->get) + ev->put; kn->kn_data *= EVSIZE(ev->version); return (1); } static const struct filterops wsevent_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_wseventrdetach, .f_event = filt_wseventread, }; int wsevent_kqfilter(struct wseventvar *ev, struct knote *kn) { int s; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &wsevent_filtops; break; default: return (EINVAL); } kn->kn_hook = ev; s = splwsevent(); selrecord_knote(&ev->sel, kn); splx(s); return (0); } /* * Wakes up all listener of the 'ev' queue. */ void wsevent_wakeup(struct wseventvar *ev) { selnotify(&ev->sel, 0, 0); if (ev->wanted) { ev->wanted = 0; wakeup(ev); } if (ev->async) { softint_schedule(ev->sih); } } /* * Soft interrupt handler: sends signal to async proc. */ static void wsevent_intr(void *cookie) { struct wseventvar *ev; ev = cookie; if (ev->async) { mutex_enter(&proc_lock); psignal(ev->io, SIGIO); mutex_exit(&proc_lock); } } /* * Injects the set of events given in 'events', whose size is 'nevents', * into the 'ev' queue. If there is not enough free space to inject them * all, returns ENOSPC and the queue is left intact; otherwise returns 0 * and wakes up all listeners. */ int wsevent_inject(struct wseventvar *ev, struct wscons_event *events, size_t nevents) { size_t avail, i; struct timespec t; /* Calculate number of free slots in the queue. */ if (ev->put < ev->get) avail = ev->get - ev->put; else avail = WSEVENT_QSIZE - (ev->put - ev->get); KASSERT(avail <= WSEVENT_QSIZE); /* Fail if there is all events will not fit in the queue. */ if (avail < nevents) return ENOSPC; /* Use the current time for all events. */ getnanotime(&t); /* Inject the events. 
*/ for (i = 0; i < nevents; i++) { struct wscons_event *we; we = EVARRAY(ev, ev->put); we->type = events[i].type; we->value = events[i].value; we->time = t; ev->put = (ev->put + 1) % WSEVENT_QSIZE; } wsevent_wakeup(ev); return 0; } int wsevent_setversion(struct wseventvar *ev, int vers) { if (ev == NULL) return EINVAL; switch (vers) { case 0: case WSEVENT_VERSION: break; default: return EINVAL; } if (vers == ev->version) return 0; ev->get = ev->put = 0; ev->version = vers; return 0; } SYSCTL_SETUP(sysctl_wsevent_setup, "sysctl hw.wsevent subtree setup") { const struct sysctlnode *node = NULL; if (sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "wsevent", NULL, NULL, 0, NULL, 0, CTL_HW, CTL_CREATE, CTL_EOL) != 0) return; sysctl_createv(clog, 0, &node, NULL, CTLFLAG_READWRITE, CTLTYPE_INT, "default_version", SYSCTL_DESCR("Set default event version for compatibility"), NULL, 0, &wsevent_default_version, 0, CTL_CREATE, CTL_EOL); }
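/*
 * Sketch of a producer using the wsevent queue interface above; the driver
 * softc, function name, and the particular event type chosen are hypothetical
 * and only illustrate the wsevent_inject() pattern.
 */
#include <sys/param.h>
#include <dev/wscons/wsconsio.h>
#include <dev/wscons/wseventvar.h>

struct example_softc {
	struct wseventvar sc_events;	/* initialized via wsevent_init() on open */
};

static void
example_post_event(struct example_softc *sc, int value)
{
	struct wscons_event ev;

	if (sc->sc_events.q == NULL)
		return;		/* queue not initialized, nobody listening */

	ev.type = WSCONS_EVENT_KEY_DOWN;	/* any wscons event type */
	ev.value = value;

	/* wsevent_inject() stamps the timestamp and wakes up readers. */
	(void)wsevent_inject(&sc->sc_events, &ev, 1);
}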
/* $NetBSD: subr_specificdata.c,v 1.14 2017/06/01 02:45:13 chs Exp $ */ /*- * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2006 YAMAMOTO Takashi. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_specificdata.c,v 1.14 2017/06/01 02:45:13 chs Exp $"); #include <sys/param.h> #include <sys/kmem.h> #include <sys/specificdata.h> #include <sys/queue.h> #include <sys/mutex.h> /* * Locking notes: * * The specdataref_container pointer in the specificdata_reference * is volatile. To read it, you must hold EITHER the domain lock * or the ref lock. To write it, you must hold BOTH the domain lock * and the ref lock. The locks must be acquired in the following * order: * domain -> ref */ typedef struct { specificdata_dtor_t ski_dtor; } specificdata_key_impl; struct specificdata_container { size_t sc_nkey; LIST_ENTRY(specificdata_container) sc_list; void * sc_data[]; /* variable length */ }; #define SPECIFICDATA_CONTAINER_BYTESIZE(n) \ (sizeof(struct specificdata_container) + ((n) * sizeof(void *))) struct specificdata_domain { kmutex_t sd_lock; unsigned int sd_nkey; LIST_HEAD(, specificdata_container) sd_list; specificdata_key_impl *sd_keys; }; static void specificdata_container_link(specificdata_domain_t sd, specificdata_container_t sc) { LIST_INSERT_HEAD(&sd->sd_list, sc, sc_list); } static void specificdata_container_unlink(specificdata_domain_t sd, specificdata_container_t sc) { LIST_REMOVE(sc, sc_list); } static void specificdata_destroy_datum(specificdata_domain_t sd, specificdata_container_t sc, specificdata_key_t key) { specificdata_dtor_t dtor; void *data; if (key >= sc->sc_nkey) return; KASSERT(key < sd->sd_nkey); data = sc->sc_data[key]; dtor = sd->sd_keys[key].ski_dtor; if (dtor != NULL) { if (data != NULL) { sc->sc_data[key] = NULL; (*dtor)(data); } } else { KASSERT(data == NULL); } } static void specificdata_noop_dtor(void *data) { /* nothing */ } /* * specificdata_domain_create -- * Create a specificdata domain. */ specificdata_domain_t specificdata_domain_create(void) { specificdata_domain_t sd; sd = kmem_zalloc(sizeof(*sd), KM_SLEEP); mutex_init(&sd->sd_lock, MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&sd->sd_list); return (sd); } /* * specificdata_domain_delete -- * Destroy a specificdata domain. */ void specificdata_domain_delete(specificdata_domain_t sd) { panic("specificdata_domain_delete: not implemented"); } /* * specificdata_key_create -- * Create a specificdata key for a domain. * * Note: This is a rare operation. 
*/ int specificdata_key_create(specificdata_domain_t sd, specificdata_key_t *keyp, specificdata_dtor_t dtor) { specificdata_key_impl *newkeys; specificdata_key_t key = 0; size_t nsz; ASSERT_SLEEPABLE(); if (dtor == NULL) dtor = specificdata_noop_dtor; mutex_enter(&sd->sd_lock); if (sd->sd_keys == NULL) goto needalloc; for (; key < sd->sd_nkey; key++) { if (sd->sd_keys[key].ski_dtor == NULL) goto gotit; } needalloc: nsz = (sd->sd_nkey + 1) * sizeof(*newkeys); /* XXXSMP allocating memory while holding a lock. */ newkeys = kmem_zalloc(nsz, KM_SLEEP); if (sd->sd_keys != NULL) { size_t osz = sd->sd_nkey * sizeof(*newkeys); memcpy(newkeys, sd->sd_keys, osz); kmem_free(sd->sd_keys, osz); } sd->sd_keys = newkeys; sd->sd_nkey++; gotit: sd->sd_keys[key].ski_dtor = dtor; mutex_exit(&sd->sd_lock); *keyp = key; return (0); } /* * specificdata_key_delete -- * Destroy a specificdata key for a domain. * * Note: This is a rare operation. */ void specificdata_key_delete(specificdata_domain_t sd, specificdata_key_t key) { specificdata_container_t sc; mutex_enter(&sd->sd_lock); if (key >= sd->sd_nkey) goto out; /* * Traverse all of the specificdata containers in the domain * and the destroy the datum for the dying key. */ LIST_FOREACH(sc, &sd->sd_list, sc_list) { specificdata_destroy_datum(sd, sc, key); } sd->sd_keys[key].ski_dtor = NULL; out: mutex_exit(&sd->sd_lock); } /* * specificdata_init -- * Initialize a specificdata container for operation in the * specified domain. */ int specificdata_init(specificdata_domain_t sd, specificdata_reference *ref) { /* * Just NULL-out the container pointer; we'll allocate the * container the first time specificdata is put into it. */ ref->specdataref_container = NULL; mutex_init(&ref->specdataref_lock, MUTEX_DEFAULT, IPL_NONE); return (0); } /* * specificdata_fini -- * Destroy a specificdata container. We destroy all of the datums * stuffed into the container just as if the key were destroyed. */ void specificdata_fini(specificdata_domain_t sd, specificdata_reference *ref) { specificdata_container_t sc; specificdata_key_t key; ASSERT_SLEEPABLE(); mutex_destroy(&ref->specdataref_lock); sc = ref->specdataref_container; if (sc == NULL) return; ref->specdataref_container = NULL; mutex_enter(&sd->sd_lock); specificdata_container_unlink(sd, sc); for (key = 0; key < sc->sc_nkey; key++) { specificdata_destroy_datum(sd, sc, key); } mutex_exit(&sd->sd_lock); kmem_free(sc, SPECIFICDATA_CONTAINER_BYTESIZE(sc->sc_nkey)); } /* * specificdata_getspecific -- * Get a datum from a container. */ void * specificdata_getspecific(specificdata_domain_t sd, specificdata_reference *ref, specificdata_key_t key) { specificdata_container_t sc; void *data = NULL; mutex_enter(&ref->specdataref_lock); sc = ref->specdataref_container; if (sc != NULL && key < sc->sc_nkey) data = sc->sc_data[key]; mutex_exit(&ref->specdataref_lock); return (data); } /* * specificdata_getspecific_unlocked -- * Get a datum from a container in a lockless fashion. * * Note: When using this routine, care must be taken to ensure * that no other thread could cause the specificdata_reference * to become invalid (i.e. point at the wrong container) by * issuing a setspecific call or destroying the container. 
*/ void * specificdata_getspecific_unlocked(specificdata_domain_t sd, specificdata_reference *ref, specificdata_key_t key) { specificdata_container_t sc; sc = ref->specdataref_container; if (sc != NULL && key < sc->sc_nkey) return (sc->sc_data[key]); return (NULL); } /* * specificdata_setspecific -- * Put a datum into a container. */ void specificdata_setspecific(specificdata_domain_t sd, specificdata_reference *ref, specificdata_key_t key, void *data) { specificdata_container_t sc, newsc; size_t newnkey, sz; ASSERT_SLEEPABLE(); mutex_enter(&ref->specdataref_lock); sc = ref->specdataref_container; if (__predict_true(sc != NULL && key < sc->sc_nkey)) { sc->sc_data[key] = data; mutex_exit(&ref->specdataref_lock); return; } mutex_exit(&ref->specdataref_lock); /* * Slow path: need to resize. */ mutex_enter(&sd->sd_lock); newnkey = sd->sd_nkey; if (key >= newnkey) { mutex_exit(&sd->sd_lock); panic("specificdata_setspecific"); } sz = SPECIFICDATA_CONTAINER_BYTESIZE(newnkey); newsc = kmem_zalloc(sz, KM_SLEEP); newsc->sc_nkey = newnkey; mutex_enter(&ref->specdataref_lock); sc = ref->specdataref_container; if (sc != NULL) { if (key < sc->sc_nkey) { /* * Someone beat us to the punch. Unwind and put * the object into the now large enough container. */ sc->sc_data[key] = data; mutex_exit(&ref->specdataref_lock); mutex_exit(&sd->sd_lock); kmem_free(newsc, sz); return; } specificdata_container_unlink(sd, sc); memcpy(newsc->sc_data, sc->sc_data, sc->sc_nkey * sizeof(void *)); } newsc->sc_data[key] = data; specificdata_container_link(sd, newsc); ref->specdataref_container = newsc; mutex_exit(&ref->specdataref_lock); mutex_exit(&sd->sd_lock); if (sc != NULL) kmem_free(sc, SPECIFICDATA_CONTAINER_BYTESIZE(sc->sc_nkey)); }
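/*
 * Illustrative sketch (not part of the original file): one way a kernel
 * subsystem might use the specificdata(9) interfaces above to hang private,
 * dynamically-keyed data off its own objects.  The "frobber" structure,
 * frob_domain, frob_key and frob_dtor() are hypothetical; only the
 * specificdata_*() calls and their signatures come from the code above.
 */
#include <sys/kmem.h>
#include <sys/specificdata.h>

static specificdata_domain_t	frob_domain;
static specificdata_key_t	frob_key;

struct frobber {
	specificdata_reference	f_specdataref;
	/* ... other members ... */
};

static void
frob_dtor(void *data)
{

	/* Called for a non-NULL datum when its key or container dies. */
	kmem_free(data, sizeof(int));
}

static void
frob_example(struct frobber *f)
{
	int *datum;

	/* One-time setup, typically done at subsystem initialization. */
	frob_domain = specificdata_domain_create();
	(void)specificdata_key_create(frob_domain, &frob_key, frob_dtor);

	/* Per-object setup and use. */
	(void)specificdata_init(frob_domain, &f->f_specdataref);
	datum = kmem_alloc(sizeof(*datum), KM_SLEEP);
	*datum = 42;
	specificdata_setspecific(frob_domain, &f->f_specdataref, frob_key,
	    datum);
	datum = specificdata_getspecific(frob_domain, &f->f_specdataref,
	    frob_key);

	/* Teardown: remaining datums are destroyed through their dtors. */
	specificdata_fini(frob_domain, &f->f_specdataref);
	specificdata_key_delete(frob_domain, frob_key);
}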
/* $NetBSD: l2cap_socket.c,v 1.36 2019/01/28 12:53:01 martin Exp $ */ /*- * Copyright (c) 2005 Iain Hibbert. * Copyright (c) 2006 Itronix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of Itronix Inc. may not be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC.
BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: l2cap_socket.c,v 1.36 2019/01/28 12:53:01 martin Exp $"); /* load symbolic names */ #ifdef BLUETOOTH_DEBUG #define PRUREQUESTS #define PRCOREQUESTS #endif #include <sys/param.h> #include <sys/domain.h> #include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/proc.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <netbt/bluetooth.h> #include <netbt/l2cap.h> /* * L2CAP Sockets * * SOCK_SEQPACKET - normal L2CAP connection * * SOCK_DGRAM - connectionless L2CAP - XXX not yet */ static void l2cap_connecting(void *); static void l2cap_connected(void *); static void l2cap_disconnected(void *, int); static void *l2cap_newconn(void *, struct sockaddr_bt *, struct sockaddr_bt *); static void l2cap_complete(void *, int); static void l2cap_linkmode(void *, int); static void l2cap_input(void *, struct mbuf *); static const struct btproto l2cap_proto = { l2cap_connecting, l2cap_connected, l2cap_disconnected, l2cap_newconn, l2cap_complete, l2cap_linkmode, l2cap_input, }; /* sysctl variables */ int l2cap_sendspace = 4096; int l2cap_recvspace = 4096; static int l2cap_attach(struct socket *so, int proto) { int error; KASSERT(so->so_pcb == NULL); if (so->so_lock == NULL) { mutex_obj_hold(bt_lock); so->so_lock = bt_lock; solock(so); } KASSERT(solocked(so)); /* * For L2CAP socket PCB we just use an l2cap_channel structure * since we have nothing to add.. 
*/ error = soreserve(so, l2cap_sendspace, l2cap_recvspace); if (error) return error; return l2cap_attach_pcb((struct l2cap_channel **)&so->so_pcb, &l2cap_proto, so); } static void l2cap_detach(struct socket *so) { KASSERT(so->so_pcb != NULL); l2cap_detach_pcb((struct l2cap_channel **)&so->so_pcb); KASSERT(so->so_pcb == NULL); } static int l2cap_accept(struct socket *so, struct sockaddr *nam) { struct l2cap_channel *pcb = so->so_pcb; KASSERT(solocked(so)); KASSERT(nam != NULL); if (pcb == NULL) return EINVAL; return l2cap_peeraddr_pcb(pcb, (struct sockaddr_bt *)nam); } static int l2cap_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct l2cap_channel *pcb = so->so_pcb; struct sockaddr_bt *sa = (struct sockaddr_bt *)nam; KASSERT(solocked(so)); KASSERT(nam != NULL); if (pcb == NULL) return EINVAL; if (sa->bt_len != sizeof(struct sockaddr_bt)) return EINVAL; if (sa->bt_family != AF_BLUETOOTH) return EAFNOSUPPORT; return l2cap_bind_pcb(pcb, sa); } static int l2cap_listen(struct socket *so, struct lwp *l) { struct l2cap_channel *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; return l2cap_listen_pcb(pcb); } static int l2cap_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct l2cap_channel *pcb = so->so_pcb; struct sockaddr_bt *sa = (struct sockaddr_bt *)nam; KASSERT(solocked(so)); KASSERT(nam != NULL); if (pcb == NULL) return EINVAL; if (sa->bt_len != sizeof(struct sockaddr_bt)) return EINVAL; if (sa->bt_family != AF_BLUETOOTH) return EAFNOSUPPORT; soisconnecting(so); return l2cap_connect_pcb(pcb, sa); } static int l2cap_connect2(struct socket *so, struct socket *so2) { KASSERT(solocked(so)); if (so->so_pcb == NULL) return EINVAL; return EOPNOTSUPP; } static int l2cap_disconnect(struct socket *so) { struct l2cap_channel *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; soisdisconnecting(so); return l2cap_disconnect_pcb(pcb, so->so_linger); } static int l2cap_shutdown(struct socket *so) { KASSERT(solocked(so)); socantsendmore(so); return 0; } static int l2cap_abort(struct socket *so) { struct l2cap_channel *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; l2cap_disconnect_pcb(pcb, 0); soisdisconnected(so); l2cap_detach(so); return 0; } static int l2cap_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp) { return EPASSTHROUGH; } static int l2cap_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); return 0; } static int l2cap_peeraddr(struct socket *so, struct sockaddr *nam) { struct l2cap_channel *pcb = so->so_pcb; KASSERT(solocked(so)); KASSERT(pcb != NULL); KASSERT(nam != NULL); return l2cap_peeraddr_pcb(pcb, (struct sockaddr_bt *)nam); } static int l2cap_sockaddr(struct socket *so, struct sockaddr *nam) { struct l2cap_channel *pcb = so->so_pcb; KASSERT(solocked(so)); KASSERT(pcb != NULL); KASSERT(nam != NULL); return l2cap_sockaddr_pcb(pcb, (struct sockaddr_bt *)nam); } static int l2cap_rcvd(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int l2cap_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int l2cap_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct l2cap_channel *pcb = so->so_pcb; struct mbuf *m0; int error = 0; KASSERT(solocked(so)); KASSERT(m != NULL); if (control) m_freem(control); if (pcb == NULL) { error = EINVAL; goto release; } if (m->m_pkthdr.len == 0) goto release; if 
(m->m_pkthdr.len > pcb->lc_omtu) { error = EMSGSIZE; goto release; } m0 = m_copypacket(m, M_DONTWAIT); if (m0 == NULL) { error = ENOMEM; goto release; } sbappendrecord(&so->so_snd, m); return l2cap_send_pcb(pcb, m0); release: if (m) m_freem(m); return error; } static int l2cap_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int l2cap_purgeif(struct socket *so, struct ifnet *ifp) { return EOPNOTSUPP; } /* * l2cap_ctloutput(req, socket, sockopt) * * Apply configuration commands to channel. This corresponds to * "Reconfigure Channel Request" in the L2CAP specification. */ int l2cap_ctloutput(int req, struct socket *so, struct sockopt *sopt) { struct l2cap_channel *pcb = so->so_pcb; int err = 0; DPRINTFN(2, "%s\n", prcorequests[req]); if (pcb == NULL) return EINVAL; if (sopt->sopt_level != BTPROTO_L2CAP) return ENOPROTOOPT; switch(req) { case PRCO_GETOPT: err = l2cap_getopt(pcb, sopt); break; case PRCO_SETOPT: err = l2cap_setopt(pcb, sopt); break; default: err = ENOPROTOOPT; break; } return err; } /********************************************************************** * * L2CAP Protocol socket callbacks * */ static void l2cap_connecting(void *arg) { struct socket *so = arg; DPRINTF("Connecting\n"); soisconnecting(so); } static void l2cap_connected(void *arg) { struct socket *so = arg; DPRINTF("Connected\n"); soisconnected(so); } static void l2cap_disconnected(void *arg, int err) { struct socket *so = arg; DPRINTF("Disconnected (%d)\n", err); so->so_error = err; soisdisconnected(so); } static void * l2cap_newconn(void *arg, struct sockaddr_bt *laddr, struct sockaddr_bt *raddr) { struct socket *so = arg; DPRINTF("New Connection\n"); so = sonewconn(so, false); if (so == NULL) return NULL; soisconnecting(so); return so->so_pcb; } static void l2cap_complete(void *arg, int count) { struct socket *so = arg; while (count-- > 0) sbdroprecord(&so->so_snd); sowwakeup(so); } static void l2cap_linkmode(void *arg, int new) { struct socket *so = arg; struct sockopt sopt; int mode; DPRINTF("auth %s, encrypt %s, secure %s\n", (new & L2CAP_LM_AUTH ? "on" : "off"), (new & L2CAP_LM_ENCRYPT ? "on" : "off"), (new & L2CAP_LM_SECURE ? 
"on" : "off")); sockopt_init(&sopt, BTPROTO_L2CAP, SO_L2CAP_LM, 0); (void)l2cap_getopt(so->so_pcb, &sopt); (void)sockopt_getint(&sopt, &mode); sockopt_destroy(&sopt); if (((mode & L2CAP_LM_AUTH) && !(new & L2CAP_LM_AUTH)) || ((mode & L2CAP_LM_ENCRYPT) && !(new & L2CAP_LM_ENCRYPT)) || ((mode & L2CAP_LM_SECURE) && !(new & L2CAP_LM_SECURE))) l2cap_disconnect_pcb(so->so_pcb, 0); } static void l2cap_input(void *arg, struct mbuf *m) { struct socket *so = arg; if (m->m_pkthdr.len > sbspace(&so->so_rcv)) { printf("%s: packet (%d bytes) dropped (socket buffer full)\n", __func__, m->m_pkthdr.len); m_freem(m); return; } DPRINTFN(10, "received %d bytes\n", m->m_pkthdr.len); sbappendrecord(&so->so_rcv, m); sorwakeup(so); } PR_WRAP_USRREQS(l2cap) #define l2cap_attach l2cap_attach_wrapper #define l2cap_detach l2cap_detach_wrapper #define l2cap_accept l2cap_accept_wrapper #define l2cap_bind l2cap_bind_wrapper #define l2cap_listen l2cap_listen_wrapper #define l2cap_connect l2cap_connect_wrapper #define l2cap_connect2 l2cap_connect2_wrapper #define l2cap_disconnect l2cap_disconnect_wrapper #define l2cap_shutdown l2cap_shutdown_wrapper #define l2cap_abort l2cap_abort_wrapper #define l2cap_ioctl l2cap_ioctl_wrapper #define l2cap_stat l2cap_stat_wrapper #define l2cap_peeraddr l2cap_peeraddr_wrapper #define l2cap_sockaddr l2cap_sockaddr_wrapper #define l2cap_rcvd l2cap_rcvd_wrapper #define l2cap_recvoob l2cap_recvoob_wrapper #define l2cap_send l2cap_send_wrapper #define l2cap_sendoob l2cap_sendoob_wrapper #define l2cap_purgeif l2cap_purgeif_wrapper const struct pr_usrreqs l2cap_usrreqs = { .pr_attach = l2cap_attach, .pr_detach = l2cap_detach, .pr_accept = l2cap_accept, .pr_bind = l2cap_bind, .pr_listen = l2cap_listen, .pr_connect = l2cap_connect, .pr_connect2 = l2cap_connect2, .pr_disconnect = l2cap_disconnect, .pr_shutdown = l2cap_shutdown, .pr_abort = l2cap_abort, .pr_ioctl = l2cap_ioctl, .pr_stat = l2cap_stat, .pr_peeraddr = l2cap_peeraddr, .pr_sockaddr = l2cap_sockaddr, .pr_rcvd = l2cap_rcvd, .pr_recvoob = l2cap_recvoob, .pr_send = l2cap_send, .pr_sendoob = l2cap_sendoob, .pr_purgeif = l2cap_purgeif, };
23 21 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* $NetBSD: in6_cksum.c,v 1.28 2011/04/25 22:05:05 yamt Exp $ */ /*- * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in6_cksum.c,v 1.28 2011/04/25 22:05:05 yamt Exp $"); #include <sys/param.h> #include <sys/mbuf.h> #include <netinet/in.h> #include <netinet/ip6.h> /* * Checksum of the IPv6 pseudo header. * * off is supposed to be the skipped IPv6 header, len is the payload size. 
*/ int in6_cksum(struct mbuf *m, u_int8_t nxt, uint32_t off, uint32_t len) { union { uint16_t words[16]; struct { struct in6_addr ip6_src; struct in6_addr ip6_dst; } addrs; } u; const struct in6_addr *in6_src; const struct in6_addr *in6_dst; const struct ip6_hdr *ip6; uint32_t sum; const uint16_t *w; const char *cp; if (nxt == 0) return cpu_in_cksum(m, len, off, 0); if (__predict_false(off < sizeof(struct ip6_hdr))) panic("in6_cksum: offset too short for IPv6 header"); if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) panic("in6_cksum: mbuf too short for IPv6 header"); /* * Compute the equivalent of: * struct ip6_hdr_pseudo ip6; * * bzero(sizeof(*ip6)); * ip6.ip6ph_nxt = nxt; * ip6.ip6ph_len = htonl(len); * ipv6.ip6ph_src = mtod(m, struct ip6_hdr *)->ip6_src; * in6_clearscope(&ip6->ip6ph_src); * ipv6.ip6ph_dst = mtod(m, struct ip6_hdr *)->ip6_dst; * in6_clearscope(&ip6->ip6ph_dst); * sum = one_add(&ip6); */ #if BYTE_ORDER == LITTLE_ENDIAN sum = ((len & 0xffff) + ((len >> 16) & 0xffff) + nxt) << 8; #else sum = (len & 0xffff) + ((len >> 16) & 0xffff) + nxt; #endif cp = mtod(m, const char *); w = (const uint16_t *)(cp + offsetof(struct ip6_hdr, ip6_src)); ip6 = (const void *)cp; if (__predict_true((uintptr_t)w % 2 == 0)) { in6_src = &ip6->ip6_src; in6_dst = &ip6->ip6_dst; } else { memcpy(&u, &ip6->ip6_src, 32); w = u.words; in6_src = &u.addrs.ip6_src; in6_dst = &u.addrs.ip6_dst; } sum += w[0]; if (!IN6_IS_SCOPE_EMBEDDABLE(in6_src)) sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; w += 8; sum += w[0]; if (!IN6_IS_SCOPE_EMBEDDABLE(in6_dst)) sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; return cpu_in_cksum(m, len, off, sum); }
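/*
 * Illustrative sketch (not part of the original file): a plain, portable
 * rendition of the IPv6 pseudo-header sum that in6_cksum() above computes
 * with endianness and alignment tricks.  It accumulates src, dst, payload
 * length and next-header as 16-bit big-endian words (RFC 1071 style) and
 * ignores the embedded-scope special case; ip6_pseudo_hdr_sum() is a
 * hypothetical helper, not a kernel interface.
 */
#include <stddef.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <netinet/in.h>

static uint32_t
add16(uint32_t sum, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len >= 2) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len != 0)
		sum += (uint32_t)p[0] << 8;
	return sum;
}

uint16_t
ip6_pseudo_hdr_sum(const struct in6_addr *src, const struct in6_addr *dst,
    uint32_t plen, uint8_t nxt)
{
	uint32_t sum = 0;
	uint32_t plen_n = htonl(plen);
	uint32_t nxt_n = htonl((uint32_t)nxt);

	sum = add16(sum, src, sizeof(*src));
	sum = add16(sum, dst, sizeof(*dst));
	sum = add16(sum, &plen_n, sizeof(plen_n));
	sum = add16(sum, &nxt_n, sizeof(nxt_n));

	while (sum > 0xffff)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);

	/*
	 * A complete transport checksum would now add the payload words the
	 * same way and take the one's complement of the final fold; the
	 * kernel delegates that step to cpu_in_cksum().
	 */
	return (uint16_t)sum;
}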
/* $NetBSD: exec_subr.c,v 1.88 2023/11/21 14:35:36 riastradh Exp $ */ /* * Copyright (c) 1993, 1994, 1996 Christopher G. Demetriou * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christopher G. Demetriou. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: exec_subr.c,v 1.88 2023/11/21 14:35:36 riastradh Exp $"); #include "opt_pax.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/kmem.h> #include <sys/vnode.h> #include <sys/filedesc.h> #include <sys/exec.h> #include <sys/mman.h> #include <sys/resourcevar.h> #include <sys/device.h> #include <sys/pax.h> #include <uvm/uvm_extern.h> #define VMCMD_EVCNT_DECL(name) \ static struct evcnt vmcmd_ev_##name = \ EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "vmcmd", #name); \ EVCNT_ATTACH_STATIC(vmcmd_ev_##name) #define VMCMD_EVCNT_INCR(name) \ vmcmd_ev_##name.ev_count++ VMCMD_EVCNT_DECL(calls); VMCMD_EVCNT_DECL(extends); VMCMD_EVCNT_DECL(kills); #ifdef DEBUG_STACK #define DPRINTF(a) uprintf a #else #define DPRINTF(a) #endif unsigned int user_stack_guard_size = 1024 * 1024; unsigned int user_thread_stack_guard_size = 64 * 1024; /* * new_vmcmd(): * create a new vmcmd structure and fill in its fields based * on function call arguments. make sure objects ref'd by * the vmcmd are 'held'. */ void new_vmcmd(struct exec_vmcmd_set *evsp, int (*proc)(struct lwp * l, struct exec_vmcmd *), vsize_t len, vaddr_t addr, struct vnode *vp, u_long offset, u_int prot, int flags) { struct exec_vmcmd *vcp; VMCMD_EVCNT_INCR(calls); KASSERT(proc != vmcmd_map_pagedvn || (vp->v_iflag & VI_TEXT)); KASSERT(vp == NULL || vrefcnt(vp) > 0); if (evsp->evs_used >= evsp->evs_cnt) vmcmdset_extend(evsp); vcp = &evsp->evs_cmds[evsp->evs_used++]; vcp->ev_proc = proc; vcp->ev_len = len; vcp->ev_addr = addr; if ((vcp->ev_vp = vp) != NULL) vref(vp); vcp->ev_offset = offset; vcp->ev_prot = prot; vcp->ev_flags = flags; } void vmcmdset_extend(struct exec_vmcmd_set *evsp) { struct exec_vmcmd *nvcp; u_int ocnt; #ifdef DIAGNOSTIC if (evsp->evs_used < evsp->evs_cnt) panic("vmcmdset_extend: not necessary"); #endif /* figure out number of entries in new set */ if ((ocnt = evsp->evs_cnt) != 0) { evsp->evs_cnt += ocnt; VMCMD_EVCNT_INCR(extends); } else evsp->evs_cnt = EXEC_DEFAULT_VMCMD_SETSIZE; /* allocate it */ nvcp = kmem_alloc(evsp->evs_cnt * sizeof(struct exec_vmcmd), KM_SLEEP); /* free the old struct, if there was one, and record the new one */ if (ocnt) { memcpy(nvcp, evsp->evs_cmds, (ocnt * sizeof(struct exec_vmcmd))); kmem_free(evsp->evs_cmds, ocnt * sizeof(struct exec_vmcmd)); } evsp->evs_cmds = nvcp; } void kill_vmcmds(struct exec_vmcmd_set *evsp) { struct exec_vmcmd *vcp; u_int i; VMCMD_EVCNT_INCR(kills); if (evsp->evs_cnt == 0) return; for (i = 0; i < evsp->evs_used; i++) { vcp = &evsp->evs_cmds[i]; if (vcp->ev_vp != NULL) vrele(vcp->ev_vp); } kmem_free(evsp->evs_cmds, evsp->evs_cnt * sizeof(struct exec_vmcmd)); evsp->evs_used = evsp->evs_cnt = 0; } /* * vmcmd_map_pagedvn(): * handle vmcmd which specifies that a vnode should be mmap'd. * appropriate for handling demand-paged text and data segments. */ static int vmcmd_get_prot(struct lwp *l, const struct exec_vmcmd *cmd, vm_prot_t *prot, vm_prot_t *maxprot) { vm_prot_t extraprot = PROT_MPROTECT_EXTRACT(cmd->ev_prot); *prot = cmd->ev_prot & UVM_PROT_ALL; *maxprot = PAX_MPROTECT_MAXPROTECT(l, *prot, extraprot, UVM_PROT_ALL); if ((*prot & *maxprot) != *prot) return EACCES; return PAX_MPROTECT_VALIDATE(l, *prot); } int vmcmd_map_pagedvn(struct lwp *l, struct exec_vmcmd *cmd) { struct uvm_object *uobj; struct vnode *vp = cmd->ev_vp; struct proc *p = l->l_proc; int error; vm_prot_t prot, maxprot; KASSERT(vp->v_iflag & VI_TEXT); /* * map the vnode in using uvm_map. 
*/ if (cmd->ev_len == 0) return 0; if (cmd->ev_offset & PAGE_MASK) return EINVAL; if (cmd->ev_addr & PAGE_MASK) return EINVAL; if (cmd->ev_len & PAGE_MASK) return EINVAL; if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0) return error; /* * check the file system's opinion about mmapping the file */ error = VOP_MMAP(vp, prot, l->l_cred); if (error) return error; if ((vp->v_vflag & VV_MAPPED) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_MAPPED; VOP_UNLOCK(vp); } /* * do the map, reference the object for this map entry */ uobj = &vp->v_uobj; vref(vp); error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr, cmd->ev_len, uobj, cmd->ev_offset, 0, UVM_MAPFLAG(prot, maxprot, UVM_INH_COPY, UVM_ADV_NORMAL, UVM_FLAG_COPYONW|UVM_FLAG_FIXED)); if (error) { uobj->pgops->pgo_detach(uobj); } return error; } /* * vmcmd_map_readvn(): * handle vmcmd which specifies that a vnode should be read from. * appropriate for non-demand-paged text/data segments, i.e. impure * objects (a la OMAGIC and NMAGIC). */ int vmcmd_map_readvn(struct lwp *l, struct exec_vmcmd *cmd) { struct proc *p = l->l_proc; int error; long diff; if (cmd->ev_len == 0) return 0; diff = cmd->ev_addr - trunc_page(cmd->ev_addr); cmd->ev_addr -= diff; /* required by uvm_map */ cmd->ev_offset -= diff; cmd->ev_len += diff; error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr, round_page(cmd->ev_len), NULL, UVM_UNKNOWN_OFFSET, 0, UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_COPY, UVM_ADV_NORMAL, UVM_FLAG_FIXED|UVM_FLAG_OVERLAY|UVM_FLAG_COPYONW)); if (error) return error; return vmcmd_readvn(l, cmd); } int vmcmd_readvn(struct lwp *l, struct exec_vmcmd *cmd) { struct proc *p = l->l_proc; int error; vm_prot_t prot, maxprot; error = vn_rdwr(UIO_READ, cmd->ev_vp, (void *)cmd->ev_addr, cmd->ev_len, cmd->ev_offset, UIO_USERSPACE, IO_UNIT, l->l_cred, NULL, l); if (error) return error; if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0) return error; #ifdef PMAP_NEED_PROCWR /* * we had to write the process, make sure the pages are synched * with the instruction cache. */ if (prot & VM_PROT_EXECUTE) pmap_procwr(p, cmd->ev_addr, cmd->ev_len); #endif /* * we had to map in the area at PROT_ALL so that vn_rdwr() * could write to it. however, the caller seems to want * it mapped read-only, so now we are going to have to call * uvm_map_protect() to fix up the protection. ICK. */ if (maxprot != VM_PROT_ALL) { error = uvm_map_protect(&p->p_vmspace->vm_map, trunc_page(cmd->ev_addr), round_page(cmd->ev_addr + cmd->ev_len), maxprot, true); if (error) return error; } if (prot != maxprot) { error = uvm_map_protect(&p->p_vmspace->vm_map, trunc_page(cmd->ev_addr), round_page(cmd->ev_addr + cmd->ev_len), prot, false); if (error) return error; } return 0; } /* * vmcmd_map_zero(): * handle vmcmd which specifies a zero-filled address space region. The * address range must be first allocated, then protected appropriately. 
*/ int vmcmd_map_zero(struct lwp *l, struct exec_vmcmd *cmd) { struct proc *p = l->l_proc; int error; long diff; vm_prot_t prot, maxprot; diff = cmd->ev_addr - trunc_page(cmd->ev_addr); cmd->ev_addr -= diff; /* required by uvm_map */ cmd->ev_len += diff; if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0) return error; error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr, round_page(cmd->ev_len), NULL, UVM_UNKNOWN_OFFSET, 0, UVM_MAPFLAG(prot, maxprot, UVM_INH_COPY, UVM_ADV_NORMAL, UVM_FLAG_FIXED|UVM_FLAG_COPYONW)); if (cmd->ev_flags & VMCMD_STACK) curproc->p_vmspace->vm_issize += atop(round_page(cmd->ev_len)); return error; } /* * exec_read(): * * Read from vnode into buffer at offset. */ int exec_read(struct lwp *l, struct vnode *vp, u_long off, void *bf, size_t size, int ioflg) { int error; size_t resid; KASSERT((ioflg & IO_NODELOCKED) == 0 || VOP_ISLOCKED(vp) != LK_NONE); if ((error = vn_rdwr(UIO_READ, vp, bf, size, off, UIO_SYSSPACE, ioflg, l->l_cred, &resid, NULL)) != 0) return error; /* * See if we got all of it */ if (resid != 0) return ENOEXEC; return 0; } /* * exec_setup_stack(): Set up the stack segment for an elf * executable. * * Note that the ep_ssize parameter must be set to be the current stack * limit; this is adjusted in the body of execve() to yield the * appropriate stack segment usage once the argument length is * calculated. * * This function returns an int for uniformity with other (future) formats' * stack setup functions. They might have errors to return. */ int exec_setup_stack(struct lwp *l, struct exec_package *epp) { vsize_t max_stack_size; vaddr_t access_linear_min; vsize_t access_size; vaddr_t noaccess_linear_min; vsize_t noaccess_size; #ifndef USRSTACK32 #define USRSTACK32 (0x00000000ffffffffL&~PGOFSET) #endif #ifndef MAXSSIZ32 #define MAXSSIZ32 (MAXSSIZ >> 2) #endif if (epp->ep_flags & EXEC_32) { epp->ep_minsaddr = USRSTACK32; max_stack_size = MAXSSIZ32; } else { epp->ep_minsaddr = USRSTACK; max_stack_size = MAXSSIZ; } DPRINTF(("ep_minsaddr=%#jx max_stack_size=%#jx\n", (uintmax_t)epp->ep_minsaddr, (uintmax_t)max_stack_size)); pax_aslr_stack(epp, &max_stack_size); DPRINTF(("[RLIMIT_STACK].lim_cur=%#jx max_stack_size=%#jx\n", (uintmax_t)l->l_proc->p_rlimit[RLIMIT_STACK].rlim_cur, (uintmax_t)max_stack_size)); epp->ep_ssize = MIN(l->l_proc->p_rlimit[RLIMIT_STACK].rlim_cur, max_stack_size); l->l_proc->p_stackbase = epp->ep_minsaddr; epp->ep_maxsaddr = (vaddr_t)STACK_GROW(epp->ep_minsaddr, max_stack_size); DPRINTF(("ep_ssize=%#jx ep_minsaddr=%#jx ep_maxsaddr=%#jx\n", (uintmax_t)epp->ep_ssize, (uintmax_t)epp->ep_minsaddr, (uintmax_t)epp->ep_maxsaddr)); /* * set up commands for stack. note that this takes *two*, one to * map the part of the stack which we can access, and one to map * the part which we can't. 
* * arguably, it could be made into one, but that would require the * addition of another mapping proc, which is unnecessary */ access_size = epp->ep_ssize; access_linear_min = (vaddr_t)STACK_ALLOC(epp->ep_minsaddr, access_size); noaccess_size = max_stack_size - access_size; noaccess_linear_min = (vaddr_t)STACK_ALLOC(STACK_GROW(epp->ep_minsaddr, access_size), noaccess_size); DPRINTF(("access_size=%#jx, access_linear_min=%#jx, " "noaccess_size=%#jx, noaccess_linear_min=%#jx\n", (uintmax_t)access_size, (uintmax_t)access_linear_min, (uintmax_t)noaccess_size, (uintmax_t)noaccess_linear_min)); if (user_stack_guard_size > 0) { #ifdef __MACHINE_STACK_GROWS_UP vsize_t guard_size = MIN(VM_MAXUSER_ADDRESS - epp->ep_maxsaddr, user_stack_guard_size); if (guard_size > 0) NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, guard_size, epp->ep_maxsaddr, NULL, 0, VM_PROT_NONE); #else NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, user_stack_guard_size, epp->ep_maxsaddr - user_stack_guard_size, NULL, 0, VM_PROT_NONE); #endif } if (noaccess_size > 0 && noaccess_size <= MAXSSIZ) { NEW_VMCMD2(&epp->ep_vmcmds, vmcmd_map_zero, noaccess_size, noaccess_linear_min, NULL, 0, VM_PROT_NONE | PROT_MPROTECT(VM_PROT_READ | VM_PROT_WRITE), VMCMD_STACK); } KASSERT(access_size > 0); KASSERT(access_size <= MAXSSIZ); NEW_VMCMD2(&epp->ep_vmcmds, vmcmd_map_zero, access_size, access_linear_min, NULL, 0, VM_PROT_READ | VM_PROT_WRITE, VMCMD_STACK); return 0; }
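/*
 * Illustrative sketch (not part of the original file): how a hypothetical
 * executable format's "makecmds" hook might queue vmcmds for the handlers
 * above.  struct myhdr, its fields and myfmt_makecmds() are invented; the
 * NEW_VMCMD() macro, exec_setup_stack() and the vmcmd_map_*() handlers are
 * used as in the code above.  Segment addresses, lengths and file offsets
 * must satisfy the page-alignment checks in vmcmd_map_pagedvn().
 */
struct myhdr {				/* hypothetical on-disk header */
	vaddr_t		h_textva, h_datava;
	vsize_t		h_textsz, h_datasz, h_bsssz;
	u_long		h_textoff, h_dataoff;
};

static int
myfmt_makecmds(struct lwp *l, struct exec_package *epp,
    const struct myhdr *hdr)
{

	/* Demand-page the text segment straight from the vnode. */
	NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, hdr->h_textsz,
	    hdr->h_textva, epp->ep_vp, hdr->h_textoff,
	    VM_PROT_READ | VM_PROT_EXECUTE);

	/* Read the writable ("impure") data segment into place. */
	NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, hdr->h_datasz,
	    hdr->h_datava, epp->ep_vp, hdr->h_dataoff,
	    VM_PROT_READ | VM_PROT_WRITE);

	/* Zero-fill the bss directly after the data segment. */
	NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, hdr->h_bsssz,
	    hdr->h_datava + hdr->h_datasz, NULL, 0,
	    VM_PROT_READ | VM_PROT_WRITE);

	/* Let the common code append the stack vmcmds described above. */
	return exec_setup_stack(l, epp);
}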
18 49 49 72 70 72 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 /* $NetBSD: genfs_vfsops.c,v 1.11 2022/07/08 07:42:06 hannken Exp $ */ /*- * Copyright (c) 2008, 2009, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: genfs_vfsops.c,v 1.11 2022/07/08 07:42:06 hannken Exp $"); #include <sys/types.h> #include <sys/mount.h> #include <sys/fstrans.h> #include <sys/statvfs.h> #include <sys/vnode.h> #include <miscfs/genfs/genfs.h> #include <miscfs/genfs/genfs_node.h> int genfs_statvfs(struct mount *mp, struct statvfs *sbp) { sbp->f_bsize = DEV_BSIZE; sbp->f_frsize = DEV_BSIZE; sbp->f_iosize = DEV_BSIZE; sbp->f_blocks = 2; /* 1k to keep df happy */ sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_bresvd = 0; sbp->f_files = 0; sbp->f_ffree = 0; sbp->f_favail = 0; sbp->f_fresvd = 0; copy_statvfs_info(sbp, mp); return 0; } int genfs_renamelock_enter(struct mount *mp) { mutex_enter(mp->mnt_renamelock); /* Preserve possible error return in case we become interruptible. */ return 0; } void genfs_renamelock_exit(struct mount *mp) { mutex_exit(mp->mnt_renamelock); } int genfs_suspendctl(struct mount *mp, int cmd) { int error; switch (cmd) { case SUSPEND_SUSPEND: error = fstrans_setstate(mp, FSTRANS_SUSPENDING); if (error) return error; error = fstrans_setstate(mp, FSTRANS_SUSPENDED); return error; case SUSPEND_RESUME: error = fstrans_setstate(mp, FSTRANS_NORMAL); KASSERT(error == 0); return 0; default: panic("%s: bogus command %d", __func__, cmd); } }
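/*
 * Illustrative sketch (not part of the original file): a simple file system
 * can point members of its struct vfsops at the genfs defaults above rather
 * than supplying its own.  "myfs" and the omitted members are hypothetical;
 * the member names are assumed to follow <sys/mount.h>.
 */
struct vfsops myfs_vfsops = {
	.vfs_name		= "myfs",
	.vfs_statvfs		= genfs_statvfs,	/* fixed 1k statvfs */
	.vfs_suspendctl		= genfs_suspendctl,	/* fstrans-based */
	.vfs_renamelock_enter	= genfs_renamelock_enter,
	.vfs_renamelock_exit	= genfs_renamelock_exit,
	/* ... mount/unmount/root/sync etc. supplied by the file system ... */
};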
/* $NetBSD: sys_generic.c,v 1.134 2022/07/10 23:12:12 riastradh Exp $ */ /*- * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95 */ /* * System calls relating to files. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.134 2022/07/10 23:12:12 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/ioctl.h> #include <sys/file.h> #include <sys/proc.h> #include <sys/socketvar.h> #include <sys/signalvar.h> #include <sys/uio.h> #include <sys/kernel.h> #include <sys/stat.h> #include <sys/kmem.h> #include <sys/poll.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <sys/ktrace.h> #include <sys/atomic.h> #include <sys/disklabel.h> /* * Read system call. 
*/ /* ARGSUSED */ int sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(void *) buf; syscallarg(size_t) nbyte; } */ file_t *fp; int fd; fd = SCARG(uap, fd); if ((fp = fd_getfile(fd)) == NULL) return (EBADF); if ((fp->f_flag & FREAD) == 0) { fd_putfile(fd); return (EBADF); } /* dofileread() will unuse the descriptor for us */ return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte), &fp->f_offset, FOF_UPDATE_OFFSET, retval)); } int dofileread(int fd, struct file *fp, void *buf, size_t nbyte, off_t *offset, int flags, register_t *retval) { struct iovec aiov; struct uio auio; size_t cnt; int error; lwp_t *l; l = curlwp; aiov.iov_base = (void *)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = nbyte; auio.uio_rw = UIO_READ; auio.uio_vmspace = l->l_proc->p_vmspace; /* * Reads return ssize_t because -1 is returned on error. Therefore * we must restrict the length to SSIZE_MAX to avoid garbage return * values. */ if (auio.uio_resid > SSIZE_MAX) { error = EINVAL; goto out; } cnt = auio.uio_resid; error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags); if (error) if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; cnt -= auio.uio_resid; ktrgenio(fd, UIO_READ, buf, cnt, error); *retval = cnt; out: fd_putfile(fd); return (error); } /* * Scatter read system call. */ int sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const struct iovec *) iovp; syscallarg(int) iovcnt; } */ return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp), SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval); } int do_filereadv(int fd, const struct iovec *iovp, int iovcnt, off_t *offset, int flags, register_t *retval) { struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; int i, error; size_t cnt; u_int iovlen; struct file *fp; struct iovec *ktriov = NULL; if (iovcnt == 0) return EINVAL; if ((fp = fd_getfile(fd)) == NULL) return EBADF; if ((fp->f_flag & FREAD) == 0) { fd_putfile(fd); return EBADF; } if (offset == NULL) offset = &fp->f_offset; else { /* * Caller must not specify &fp->f_offset -- we can't * safely dereference it for the call to fo_seek * without holding some underlying object lock. */ KASSERT(offset != &fp->f_offset); if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } error = (*fp->f_ops->fo_seek)(fp, *offset, SEEK_SET, NULL, 0); if (error != 0) goto out; } iovlen = iovcnt * sizeof(struct iovec); if (flags & FOF_IOV_SYSSPACE) iov = __UNCONST(iovp); else { iov = aiov; if ((u_int)iovcnt > UIO_SMALLIOV) { if ((u_int)iovcnt > IOV_MAX) { error = EINVAL; goto out; } iov = kmem_alloc(iovlen, KM_SLEEP); needfree = iov; } error = copyin(iovp, iov, iovlen); if (error) goto done; } auio.uio_iov = iov; auio.uio_iovcnt = iovcnt; auio.uio_rw = UIO_READ; auio.uio_vmspace = curproc->p_vmspace; auio.uio_resid = 0; for (i = 0; i < iovcnt; i++, iov++) { auio.uio_resid += iov->iov_len; /* * Reads return ssize_t because -1 is returned on error. * Therefore we must restrict the length to SSIZE_MAX to * avoid garbage return values. 
*/ if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX - iov->iov_len) { error = EINVAL; goto done; } } /* * if tracing, save a copy of iovec */ if (ktrpoint(KTR_GENIO)) { ktriov = kmem_alloc(iovlen, KM_SLEEP); memcpy(ktriov, auio.uio_iov, iovlen); } cnt = auio.uio_resid; error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags); if (error) if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; cnt -= auio.uio_resid; *retval = cnt; if (ktriov != NULL) { ktrgeniov(fd, UIO_READ, ktriov, cnt, error); kmem_free(ktriov, iovlen); } done: if (needfree) kmem_free(needfree, iovlen); out: fd_putfile(fd); return (error); } /* * Write system call */ int sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const void *) buf; syscallarg(size_t) nbyte; } */ file_t *fp; int fd; fd = SCARG(uap, fd); if ((fp = fd_getfile(fd)) == NULL) return (EBADF); if ((fp->f_flag & FWRITE) == 0) { fd_putfile(fd); return (EBADF); } /* dofilewrite() will unuse the descriptor for us */ return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte), &fp->f_offset, FOF_UPDATE_OFFSET, retval)); } int dofilewrite(int fd, struct file *fp, const void *buf, size_t nbyte, off_t *offset, int flags, register_t *retval) { struct iovec aiov; struct uio auio; size_t cnt; int error; aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */ aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = nbyte; auio.uio_rw = UIO_WRITE; auio.uio_vmspace = curproc->p_vmspace; /* * Writes return ssize_t because -1 is returned on error. Therefore * we must restrict the length to SSIZE_MAX to avoid garbage return * values. */ if (auio.uio_resid > SSIZE_MAX) { error = EINVAL; goto out; } cnt = auio.uio_resid; error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags); if (error) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE && !(fp->f_flag & FNOSIGPIPE)) { mutex_enter(&proc_lock); psignal(curproc, SIGPIPE); mutex_exit(&proc_lock); } } cnt -= auio.uio_resid; ktrgenio(fd, UIO_WRITE, buf, cnt, error); *retval = cnt; out: fd_putfile(fd); return (error); } /* * Gather write system call */ int sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const struct iovec *) iovp; syscallarg(int) iovcnt; } */ return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp), SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval); } int do_filewritev(int fd, const struct iovec *iovp, int iovcnt, off_t *offset, int flags, register_t *retval) { struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; int i, error; size_t cnt; u_int iovlen; struct file *fp; struct iovec *ktriov = NULL; if (iovcnt == 0) return EINVAL; if ((fp = fd_getfile(fd)) == NULL) return EBADF; if ((fp->f_flag & FWRITE) == 0) { fd_putfile(fd); return EBADF; } if (offset == NULL) offset = &fp->f_offset; else { /* * Caller must not specify &fp->f_offset -- we can't * safely dereference it for the call to fo_seek * without holding some underlying object lock. 
*/ KASSERT(offset != &fp->f_offset); if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } error = (*fp->f_ops->fo_seek)(fp, *offset, SEEK_SET, NULL, 0); if (error != 0) goto out; } iovlen = iovcnt * sizeof(struct iovec); if (flags & FOF_IOV_SYSSPACE) iov = __UNCONST(iovp); else { iov = aiov; if ((u_int)iovcnt > UIO_SMALLIOV) { if ((u_int)iovcnt > IOV_MAX) { error = EINVAL; goto out; } iov = kmem_alloc(iovlen, KM_SLEEP); needfree = iov; } error = copyin(iovp, iov, iovlen); if (error) goto done; } auio.uio_iov = iov; auio.uio_iovcnt = iovcnt; auio.uio_rw = UIO_WRITE; auio.uio_vmspace = curproc->p_vmspace; auio.uio_resid = 0; for (i = 0; i < iovcnt; i++, iov++) { auio.uio_resid += iov->iov_len; /* * Writes return ssize_t because -1 is returned on error. * Therefore we must restrict the length to SSIZE_MAX to * avoid garbage return values. */ if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX - iov->iov_len) { error = EINVAL; goto done; } } /* * if tracing, save a copy of iovec */ if (ktrpoint(KTR_GENIO)) { ktriov = kmem_alloc(iovlen, KM_SLEEP); memcpy(ktriov, auio.uio_iov, iovlen); } cnt = auio.uio_resid; error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags); if (error) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE && !(fp->f_flag & FNOSIGPIPE)) { mutex_enter(&proc_lock); psignal(curproc, SIGPIPE); mutex_exit(&proc_lock); } } cnt -= auio.uio_resid; *retval = cnt; if (ktriov != NULL) { ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error); kmem_free(ktriov, iovlen); } done: if (needfree) kmem_free(needfree, iovlen); out: fd_putfile(fd); return (error); } /* * Ioctl system call */ /* ARGSUSED */ int sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(u_long) com; syscallarg(void *) data; } */ struct file *fp; proc_t *p; u_long com; int error; size_t size, alloc_size; void *data, *memp; #define STK_PARAMS 128 u_long stkbuf[STK_PARAMS/sizeof(u_long)]; #if __TMPBIGMAXPARTITIONS > MAXPARTITIONS size_t zero_last = 0; #define zero_size(SZ) ((SZ)+zero_last) #else #define zero_size(SZ) (SZ) #endif memp = NULL; alloc_size = 0; error = 0; p = l->l_proc; if ((fp = fd_getfile(SCARG(uap, fd))) == NULL) return (EBADF); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { error = EBADF; com = 0; goto out; } switch (com = SCARG(uap, com)) { case FIONCLEX: case FIOCLEX: fd_set_exclose(l, SCARG(uap, fd), com == FIOCLEX); goto out; } /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. */ size = IOCPARM_LEN(com); alloc_size = size; /* * The disklabel is now padded to a multiple of 8 bytes however the old * disklabel on 32bit platforms wasn't. This leaves a difference in * size of 4 bytes between the two but are otherwise identical. * To deal with this, we allocate enough space for the new disklabel * but only copyin/out the smaller amount. */ if (IOCGROUP(com) == 'd') { #if __TMPBIGMAXPARTITIONS > MAXPARTITIONS u_long ocom = com; #endif u_long ncom = com ^ (DIOCGDINFO ^ DIOCGDINFO32); #if __TMPBIGMAXPARTITIONS > MAXPARTITIONS /* * Userland might use struct disklabel that is bigger than the * the kernel version (historic accident) - alloc userland * size and zero unused part on copyout. 
*/ #define DISKLABELLENDIFF (sizeof(struct partition) \ *(__TMPBIGMAXPARTITIONS-MAXPARTITIONS)) #define IOCFIXUP(NIOC) ((NIOC&~(IOCPARM_MASK<<IOCPARM_SHIFT)) | \ (IOCPARM_LEN(NIOC)-DISKLABELLENDIFF)<<IOCPARM_SHIFT) switch (IOCFIXUP(ocom)) { case DIOCGDINFO: case DIOCWDINFO: case DIOCSDINFO: case DIOCGDEFLABEL: com = ncom = IOCFIXUP(ocom); zero_last = DISKLABELLENDIFF; size -= DISKLABELLENDIFF; goto done; } #endif switch (ncom) { case DIOCGDINFO: case DIOCWDINFO: case DIOCSDINFO: case DIOCGDEFLABEL: com = ncom; if (IOCPARM_LEN(DIOCGDINFO32) < IOCPARM_LEN(DIOCGDINFO)) alloc_size = IOCPARM_LEN(DIOCGDINFO); break; } #if __TMPBIGMAXPARTITIONS > MAXPARTITIONS done: ; #endif } if (size > IOCPARM_MAX) { error = ENOTTY; goto out; } memp = NULL; if ((com >> IOCPARM_SHIFT) == 0) { /* UNIX-style ioctl. */ data = SCARG(uap, data); } else { if (alloc_size > sizeof(stkbuf)) { memp = kmem_alloc(alloc_size, KM_SLEEP); data = memp; } else { data = (void *)stkbuf; } if (com&IOC_IN) { if (size) { error = copyin(SCARG(uap, data), data, size); if (error) { goto out; } /* * The data between size and alloc_size has * not been overwritten. It shouldn't matter * but let's clear that anyway. */ if (__predict_false(size < alloc_size)) { memset((char *)data+size, 0, alloc_size - size); } ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data), size, 0); } else { *(void **)data = SCARG(uap, data); } } else if ((com&IOC_OUT) && size) { /* * Zero the buffer so the user always * gets back something deterministic. */ memset(data, 0, zero_size(size)); } else if (com&IOC_VOID) { *(void **)data = SCARG(uap, data); } } switch (com) { case FIONBIO: /* XXX Code block is not atomic */ if (*(int *)data != 0) atomic_or_uint(&fp->f_flag, FNONBLOCK); else atomic_and_uint(&fp->f_flag, ~FNONBLOCK); error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data); break; case FIOASYNC: /* XXX Code block is not atomic */ if (*(int *)data != 0) atomic_or_uint(&fp->f_flag, FASYNC); else atomic_and_uint(&fp->f_flag, ~FASYNC); error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data); break; default: error = (*fp->f_ops->fo_ioctl)(fp, com, data); /* * Copy any data to user, size was * already set and checked above. */ if (error == 0 && (com&IOC_OUT) && size) { error = copyout(data, SCARG(uap, data), zero_size(size)); ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data), size, error); } break; } out: if (memp) kmem_free(memp, alloc_size); fd_putfile(SCARG(uap, fd)); switch (error) { case -1: printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: " "pid=%d comm=%s\n", (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "", (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com), p->p_pid, p->p_comm); /* FALLTHROUGH */ case EPASSTHROUGH: error = ENOTTY; /* FALLTHROUGH */ default: return (error); } }
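/*
 * Illustrative userland sketch (not part of the original file): how an ioctl
 * command's encoding lines up with the IOC_IN/IOC_OUT and IOCPARM_LEN()
 * decoding in sys_ioctl() above.  MYDEVGETSTATE, struct mydev_state and
 * /dev/mydev0 are hypothetical; _IOR() comes from <sys/ioccom.h> via
 * <sys/ioctl.h>.
 */
#include <sys/ioctl.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

struct mydev_state {
	int	ms_flags;
	int	ms_count;
};

/* Encodes IOC_OUT, group 'm', command 17 and sizeof(struct mydev_state). */
#define MYDEVGETSTATE	_IOR('m', 17, struct mydev_state)

int
ioctl_example(void)
{
	struct mydev_state st;
	int fd;

	fd = open("/dev/mydev0", O_RDWR);
	if (fd == -1)
		err(1, "open");

	/*
	 * Because MYDEVGETSTATE carries IOC_OUT and the structure size,
	 * sys_ioctl() allocates a kernel buffer of IOCPARM_LEN(com) bytes,
	 * zeroes it, calls the driver's ioctl routine, then copies the
	 * result back out to &st.
	 */
	if (ioctl(fd, MYDEVGETSTATE, &st) == -1)
		err(1, "ioctl");

	printf("flags=%d count=%d\n", st.ms_flags, st.ms_count);
	close(fd);
	return 0;
}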
/* $NetBSD: vnode.h,v 1.304 2022/10/26 23:40:30 riastradh Exp $ */ /*- * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.17 (Berkeley) 5/20/95 */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include <sys/event.h> #include <sys/queue.h> #include <sys/condvar.h> #include <sys/rwlock.h> #include <sys/mutex.h> #include <sys/time.h> #include <sys/acl.h> /* XXX: clean up includes later */ #include <uvm/uvm_param.h> /* XXX */ #if defined(_KERNEL) || defined(_KMEMUSER) #include <uvm/uvm_pglist.h> /* XXX */ #include <uvm/uvm_object.h> /* XXX */ #include <uvm/uvm_extern.h> /* XXX */ struct uvm_ractx; #endif /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD }; #define VNODE_TYPES \ "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" /* * Vnode tag types. * These are for the benefit of external programs only (e.g., pstat) * and should NEVER be inspected by the kernel. 
*/ enum vtagtype { VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_MSDOSFS, VT_LFS, VT_LOFS, VT_FDESC, VT_PORTAL, VT_NULL, VT_UMAP, VT_KERNFS, VT_PROCFS, VT_AFS, VT_ISOFS, VT_UNION, VT_ADOSFS, VT_EXT2FS, VT_CODA, VT_FILECORE, VT_NTFS, VT_VFS, VT_OVERLAY, VT_SMBFS, VT_PTYFS, VT_TMPFS, VT_UDF, VT_SYSVBFS, VT_PUFFS, VT_HFS, VT_EFS, VT_ZFS, VT_RUMP, VT_NILFS, VT_V7FS, VT_CHFS, VT_AUTOFS }; #define VNODE_TAGS \ "VT_NON", "VT_UFS", "VT_NFS", "VT_MFS", "VT_MSDOSFS", "VT_LFS", "VT_LOFS", \ "VT_FDESC", "VT_PORTAL", "VT_NULL", "VT_UMAP", "VT_KERNFS", "VT_PROCFS", \ "VT_AFS", "VT_ISOFS", "VT_UNION", "VT_ADOSFS", "VT_EXT2FS", "VT_CODA", \ "VT_FILECORE", "VT_NTFS", "VT_VFS", "VT_OVERLAY", "VT_SMBFS", "VT_PTYFS", \ "VT_TMPFS", "VT_UDF", "VT_SYSVBFS", "VT_PUFFS", "VT_HFS", "VT_EFS", \ "VT_ZFS", "VT_RUMP", "VT_NILFS", "VT_V7FS", "VT_CHFS", "VT_AUTOFS" #if defined(_KERNEL) || defined(_KMEMUSER) struct vnode; struct buf; LIST_HEAD(buflists, buf); /* * Reading or writing any of these items requires holding the appropriate * lock. Field markings and the corresponding locks: * * - stable, reference to the vnode is required * b bufcache_lock * e exec_lock * f vnode_free_list_lock, or vrele_lock for vrele_list * i v_interlock * i+b v_interlock + bufcache_lock to modify, either to inspect * i+u v_interlock + v_uobj.vmobjlock to modify, either to inspect * k locked by underlying filesystem (maybe kernel_lock) * u v_uobj.vmobjlock * v vnode lock * * Each underlying filesystem allocates its own private area and hangs * it from v_data. */ struct vnode { /* * VM system related items. */ struct uvm_object v_uobj; /* u the VM object */ voff_t v_size; /* i+u size of file */ voff_t v_writesize; /* i+u new size after write */ /* * Unstable items get their own cache line. * On _LP64 this fills the space nicely. */ kcondvar_t v_cv /* i synchronization */ __aligned(COHERENCY_UNIT); int v_iflag; /* i+u VI_* flags */ int v_uflag; /* k VU_* flags */ int v_usecount; /* i reference count */ int v_numoutput; /* i # of pending writes */ int v_writecount; /* i ref count of writers */ int v_holdcnt; /* i page & buffer refs */ struct buflists v_cleanblkhd; /* i+b clean blocklist head */ struct buflists v_dirtyblkhd; /* i+b dirty blocklist head */ /* * The remaining items are largely stable. */ int v_vflag /* v VV_* flags */ __aligned(COHERENCY_UNIT); kmutex_t *v_interlock; /* - vnode interlock */ struct mount *v_mount; /* v ptr to vfs we are in */ int (**v_op)(void *); /* : vnode operations vector */ union { struct mount *vu_mountedhere;/* v ptr to vfs (VDIR) */ struct socket *vu_socket; /* v unix ipc (VSOCK) */ struct specnode *vu_specnode; /* v device (VCHR, VBLK) */ struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ struct uvm_ractx *vu_ractx; /* u read-ahead ctx (VREG) */ } v_un; enum vtype v_type; /* - vnode type */ enum vtagtype v_tag; /* - type of underlying data */ void *v_data; /* - private data for fs */ struct vnode_klist *v_klist; /* i kevent / knote info */ void *v_segvguard; /* e for PAX_SEGVGUARD */ }; #define v_mountedhere v_un.vu_mountedhere #define v_socket v_un.vu_socket #define v_specnode v_un.vu_specnode #define v_fifoinfo v_un.vu_fifoinfo #define v_ractx v_un.vu_ractx typedef struct vnode vnode_t; /* * Structure that encompasses the kevent state for a vnode. This is * carved out as a separate structure because some vnodes may share * this state with one another. * * N.B. if two vnodes share a vnode_klist, then they must also share * v_interlock. 
*/ struct vnode_klist { struct klist vk_klist; /* i notes attached to vnode */ long vk_interest; /* i what the notes are interested in */ }; #endif /* * Vnode flags. The first set are locked by vnode lock or are stable. * VSYSTEM is only used to skip vflush()ing quota files. VISTTY is used * when reading dead vnodes. */ #define VV_ROOT 0x00000001 /* root of its file system */ #define VV_SYSTEM 0x00000002 /* vnode being used by kernel */ #define VV_ISTTY 0x00000004 /* vnode represents a tty */ #define VV_MAPPED 0x00000008 /* vnode might have user mappings */ #define VV_MPSAFE 0x00000010 /* file system code is MP safe */ /* * The second set are locked by vp->v_interlock. VI_TEXT and VI_EXECMAP are * typically updated with vp->v_uobj.vmobjlock also held as the VM system * uses them for accounting purposes. */ #define VI_TEXT 0x00000100 /* vnode is a pure text prototype */ #define VI_EXECMAP 0x00000200 /* might have PROT_EXEC mappings */ #define VI_WRMAP 0x00000400 /* might have PROT_WRITE u. mappings */ #define VI_PAGES 0x00000800 /* UVM object has >0 pages */ #define VI_ONWORKLST 0x00004000 /* On syncer work-list */ #define VI_DEADCHECK 0x00008000 /* UVM: need to call vdead_check() */ /* * The third set are locked by the underlying file system. */ #define VU_DIROP 0x01000000 /* LFS: involved in a directory op */ #define VNODE_FLAGBITS \ "\20\1ROOT\2SYSTEM\3ISTTY\4MAPPED\5MPSAFE\11TEXT\12EXECMAP" \ "\13WRMAP\14PAGES\17ONWORKLST\20DEADCHECK\31DIROP" #define VSIZENOTSET ((voff_t)-1) /* * vnode lock flags */ #define LK_NONE 0x00000000 /* no lock - for VOP_ISLOCKED() */ #define LK_SHARED 0x00000001 /* shared lock */ #define LK_EXCLUSIVE 0x00000002 /* exclusive lock */ #define LK_UPGRADE 0x00000010 /* upgrade shared -> exclusive */ #define LK_DOWNGRADE 0x00000020 /* downgrade exclusive -> shared */ #define LK_NOWAIT 0x00000100 /* do not sleep to await lock */ #define LK_RETRY 0x00000200 /* vn_lock: retry until locked */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ mode_t va_mode; /* files access mode and type */ nlink_t va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ dev_t va_fsid; /* file system id (dev for now) */ ino_t va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ unsigned int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #ifdef _KERNEL /* * Flags for ioflag. 
*/ #define IO_UNIT 0x00010 /* do I/O as atomic unit */ #define IO_APPEND 0x00020 /* append write to end */ #define IO_SYNC (0x40|IO_DSYNC) /* sync I/O file integrity completion */ #define IO_NODELOCKED 0x00080 /* underlying node already locked */ #define IO_NDELAY 0x00100 /* FNDELAY flag set in file table */ #define IO_DSYNC 0x00200 /* sync I/O data integrity completion */ #define IO_ALTSEMANTICS 0x00400 /* use alternate i/o semantics */ #define IO_NORMAL 0x00800 /* operate on regular data */ #define IO_EXT 0x01000 /* operate on extended attributes */ #define IO_DIRECT 0x02000 /* direct I/O hint */ #define IO_JOURNALLOCKED 0x04000 /* journal is already locked */ #define IO_ADV_MASK 0x00003 /* access pattern hint */ #define IO_ADV_SHIFT 0 #define IO_ADV_ENCODE(adv) (((adv) << IO_ADV_SHIFT) & IO_ADV_MASK) #define IO_ADV_DECODE(ioflag) (((ioflag) & IO_ADV_MASK) >> IO_ADV_SHIFT) /* * Flags for accmode_t. */ #define VEXEC 000000000100 /* execute/search permission */ #define VWRITE 000000000200 /* write permission */ #define VREAD 000000000400 /* read permission */ #define VADMIN 000000010000 /* being the file owner */ #define VAPPEND 000000040000 /* permission to write/append */ /* * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL, * and 0 otherwise. This never happens with ordinary unix access rights * or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with * some other V* constant. */ #define VEXPLICIT_DENY 000000100000 #define VREAD_NAMED_ATTRS 000000200000 /* not used */ #define VWRITE_NAMED_ATTRS 000000400000 /* not used */ #define VDELETE_CHILD 000001000000 #define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */ #define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */ #define VDELETE 000010000000 #define VREAD_ACL 000020000000 /* read ACL and file mode */ #define VWRITE_ACL 000040000000 /* change ACL and/or file mode */ #define VWRITE_OWNER 000100000000 /* change file owner */ #define VSYNCHRONIZE 000200000000 /* not used */ #define VCREAT 000400000000 /* creating new file */ #define VVERIFY 001000000000 /* verification required */ #define __VNODE_PERM_BITS \ "\10" \ "\07VEXEC" \ "\10VWRITE" \ "\11VREAD" \ "\15VADMIN" \ "\17VAPPEND" \ "\20VEXPLICIT_DENY" \ "\21VREAD_NAMED_ATTRS" \ "\22VWRITE_NAMED_ATTRS" \ "\23VDELETE_CHILD" \ "\24VREAD_ATTRIBUTES" \ "\25VWRITE_ATTRIBUTES" \ "\26VDELETE" \ "\27VREAD_ACL" \ "\30VWRITE_ACL" \ "\31VWRITE_OWNER" \ "\32VSYNCHRONIZE" \ "\33VCREAT" \ "\34VVERIFY" /* * Permissions that were traditionally granted only to the file owner. */ #define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \ VWRITE_OWNER) /* * Permissions that were traditionally granted to everyone. */ #define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL) /* * Permissions that allow to change the state of the file in any way. */ #define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \ VDELETE) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) #define VNOVALSIZE ((u_quad_t)-1) #define VNOVALFLAGS ((u_long)-1) /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern const enum vtype iftovt_tab[]; extern const int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. 
*/ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closeure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ /* * Flags to various vnode operations. */ #define REVOKEALL 0x0001 /* revoke: revoke all aliases */ #define FSYNC_WAIT 0x0001 /* fsync: wait for completion */ #define FSYNC_DATAONLY 0x0002 /* fsync: hint: sync file data only */ #define FSYNC_RECLAIM 0x0004 /* fsync: hint: vnode is being reclaimed */ #define FSYNC_LAZY 0x0008 /* fsync: lazy sync (trickle) */ #define FSYNC_NOLOG 0x0010 /* fsync: do not flush the log */ #define FSYNC_CACHE 0x0100 /* fsync: flush disk caches too */ #define UPDATE_WAIT 0x0001 /* update: wait for completion */ #define UPDATE_DIROP 0x0002 /* update: hint to fs to wait or not */ #define UPDATE_CLOSE 0x0004 /* update: clean up on close */ #define VDEAD_NOWAIT 0x0001 /* vdead_check: do not sleep */ void holdrelel(struct vnode *); void holdrele(struct vnode *); void vholdl(struct vnode *); void vhold(struct vnode *); void vref(struct vnode *); #define NULLVP ((struct vnode *)NULL) /* * Macro to determine kevent interest on a vnode. */ #define _VN_KEVENT_INTEREST(vp, n) \ (((vp)->v_klist->vk_interest & (n)) != 0) static inline bool VN_KEVENT_INTEREST(struct vnode *vp, long hint) { mutex_enter(vp->v_interlock); bool rv = _VN_KEVENT_INTEREST(vp, hint); mutex_exit(vp->v_interlock); return rv; } static inline void VN_KNOTE(struct vnode *vp, long hint) { mutex_enter(vp->v_interlock); if (__predict_false(_VN_KEVENT_INTEREST(vp, hint))) { knote(&vp->v_klist->vk_klist, hint); } mutex_exit(vp->v_interlock); } void vn_knote_attach(struct vnode *, struct knote *); void vn_knote_detach(struct vnode *, struct knote *); /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern int desiredvnodes; /* number of vnodes desired */ extern unsigned int numvnodes; /* current number of vnodes */ #endif /* _KERNEL */ /* * Mods for exensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 8 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x00000001 #define VDESC_VP1_WILLRELE 0x00000002 #define VDESC_VP2_WILLRELE 0x00000004 #define VDESC_VP3_WILLRELE 0x00000008 #define VDESC_VP0_WILLPUT 0x00000101 #define VDESC_VP1_WILLPUT 0x00000202 #define VDESC_VP2_WILLPUT 0x00000404 #define VDESC_VP3_WILLPUT 0x00000808 /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { int vdesc_offset; /* offset in vector--first for speed */ const char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ const int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL extern const struct vnodeop_desc * const vfs_op_descs[]; /* * Union filesystem hook for vn_readdir(). 
*/ extern int (*vn_union_readdir_hook) (struct vnode **, struct file *, struct lwp *); /* * Macros for offsets in the vdesc struct. */ #define VOPARG_OFFSETOF(type, member) offsetof(type, member) #define VOPARG_OFFSETTO(type,offset,sp) ((type)(((char *)(sp)) + (offset))) /* * This structure is used to configure the new vnodeops vector. */ struct vnodeopv_entry_desc { const struct vnodeop_desc *opve_op; /* which operation this is */ int (*opve_impl)(void *); /* code implementing this operation */ }; struct vnodeopv_desc { /* ptr to the ptr to the vector where op should go */ int (***opv_desc_vector_p)(void *); const struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */ }; /* * A default routine which just returns an error. */ int vn_default_error(void *); /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; /* * VOCALL calls an op given an ops vector. We break it out because BSD's * vclean changes the ops vector and then wants to call ops with the old * vector. */ /* * actually, vclean doesn't use it anymore, but nfs does, * for device specials and fifos. */ #define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP)) /* * This call works for vnodes in the kernel. */ #define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP)) #define VDESC(OP) (& __CONCAT(OP,_desc)) #define VOFFSET(OP) (VDESC(OP)->vdesc_offset) /* XXX This include should go away */ #include <sys/mount.h> /* * Finally, include the default set of vnode operations. */ #include <sys/vnode_if.h> /* * Public vnode manipulation functions. */ struct file; struct filedesc; struct nameidata; struct pathbuf; struct proc; struct stat; struct uio; struct vattr; struct vnode; /* see vnode(9) */ void vfs_vnode_sysinit(void); int bdevvp(dev_t, struct vnode **); int cdevvp(dev_t, struct vnode **); void vattr_null(struct vattr *); void vdevgone(int, int, int, enum vtype); int vfinddev(dev_t, enum vtype, struct vnode **); int vflush(struct mount *, struct vnode *, int); int vflushbuf(struct vnode *, int); void vgone(struct vnode *); int vinvalbuf(struct vnode *, int, kauth_cred_t, struct lwp *, bool, int); void vprint(const char *, struct vnode *); void vput(struct vnode *); bool vrecycle(struct vnode *); void vrele(struct vnode *); void vrele_async(struct vnode *); void vrele_flush(struct mount *); int vtruncbuf(struct vnode *, daddr_t, bool, int); void vwakeup(struct buf *); int vdead_check(struct vnode *, int); void vrevoke(struct vnode *); void vremfree(struct vnode *); void vshareilock(struct vnode *, struct vnode *); void vshareklist(struct vnode *, struct vnode *); int vrefcnt(struct vnode *); int vcache_get(struct mount *, const void *, size_t, struct vnode **); int vcache_new(struct mount *, struct vnode *, struct vattr *, kauth_cred_t, void *, struct vnode **); int vcache_rekey_enter(struct mount *, struct vnode *, const void *, size_t, const void *, size_t); void vcache_rekey_exit(struct mount *, struct vnode *, const void *, size_t, const void *, size_t); /* see vnsubr(9) */ int vn_bwrite(void *); int vn_close(struct vnode *, int, kauth_cred_t); int vn_isunder(struct vnode *, struct vnode *, struct lwp *); int vn_lock(struct vnode *, int); void vn_markexec(struct vnode *); int vn_marktext(struct vnode *); int vn_open(struct vnode *, struct pathbuf *, int, int, int, struct vnode **, bool *, int *); int vn_rdwr(enum uio_rw, struct vnode *, void *, int, off_t, enum uio_seg, int, 
kauth_cred_t, size_t *, struct lwp *); int vn_readdir(struct file *, char *, int, unsigned int, int *, struct lwp *, off_t **, int *); int vn_stat(struct vnode *, struct stat *); int vn_kqfilter(struct file *, struct knote *); int vn_writechk(struct vnode *); int vn_openchk(struct vnode *, kauth_cred_t, int); int vn_extattr_get(struct vnode *, int, int, const char *, size_t *, void *, struct lwp *); int vn_extattr_set(struct vnode *, int, int, const char *, size_t, const void *, struct lwp *); int vn_extattr_rm(struct vnode *, int, int, const char *, struct lwp *); int vn_fifo_bypass(void *); int vn_bdev_open(dev_t, struct vnode **, struct lwp *); int vn_bdev_openpath(struct pathbuf *pb, struct vnode **, struct lwp *); /* initialise global vnode management */ void vntblinit(void); /* misc stuff */ void sched_sync(void *); void vn_syncer_add_to_worklist(struct vnode *, int); void vn_syncer_remove_from_worklist(struct vnode *); int dorevoke(struct vnode *, kauth_cred_t); int rawdev_mounted(struct vnode *, struct vnode **); uint8_t vtype2dt(enum vtype); /* see vfssubr(9) */ int vfs_unixify_accmode(accmode_t *); void vfs_getnewfsid(struct mount *); void vfs_timestamp(struct timespec *); #if defined(DDB) || defined(DEBUGPRINT) void vfs_vnode_print(struct vnode *, int, void (*)(const char *, ...) __printflike(1, 2)); void vfs_vnode_lock_print(void *, int, void (*)(const char *, ...) __printflike(1, 2)); void vfs_mount_print(struct mount *, int, void (*)(const char *, ...) __printflike(1, 2)); void vfs_mount_print_all(int, void (*)(const char *, ...) __printflike(1, 2)); #endif /* DDB */ #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */
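As a quick orientation to the declarations above, here is an illustrative kernel-side sketch (not taken from the NetBSD tree) of the usual reference and lock discipline: vref() to hold the vnode, vn_lock() with an LK_* flag, the operation itself, then vput() to unlock and release in one step. VOP_GETATTR() comes from <sys/vnode_if.h>, which this header includes; the exact signature used here is an assumption.

/* Illustrative only: fetch a vnode's size via the vnode.h API. */
static int
example_getsize(struct vnode *vp, kauth_cred_t cred, u_quad_t *sizep)
{
	struct vattr va;
	int error;

	vref(vp);				/* take a reference */
	error = vn_lock(vp, LK_SHARED);		/* may fail if vnode is dying */
	if (error) {
		vrele(vp);			/* drop the reference, unlocked */
		return error;
	}
	error = VOP_GETATTR(vp, &va, cred);	/* assumed signature */
	if (error == 0)
		*sizep = va.va_size;
	vput(vp);				/* unlock and vrele in one call */
	return error;
}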
/* $NetBSD: subr_localcount.c,v 1.7 2017/11/17 09:26:36 ozaki-r Exp $ */ /*- * Copyright (c) 2016 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * CPU-local reference counts * * localcount(9) is a reference-counting scheme that involves no * interprocessor synchronization most of the time, at the cost of * eight bytes of memory per CPU per object and at the cost of * expensive interprocessor synchronization to drain references. * * localcount(9) references may be held across sleeps, may be * transferred from CPU to CPU or thread to thread: they behave * semantically like typical reference counts, with different * pragmatic performance characteristics.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_localcount.c,v 1.7 2017/11/17 09:26:36 ozaki-r Exp $"); #include <sys/param.h> #include <sys/localcount.h> #include <sys/types.h> #include <sys/condvar.h> #include <sys/errno.h> #include <sys/mutex.h> #include <sys/percpu.h> #include <sys/xcall.h> #if defined(DEBUG) && defined(LOCKDEBUG) #include <sys/atomic.h> #endif static void localcount_xc(void *, void *); /* * localcount_init(lc) * * Initialize a localcount object. Returns 0 on success, error * code on failure. May fail to allocate memory for percpu(9). * * The caller must call localcount_drain and then localcount_fini * when done with lc. */ void localcount_init(struct localcount *lc) { lc->lc_totalp = NULL; lc->lc_percpu = percpu_alloc(sizeof(int64_t)); } /* * localcount_drain(lc, cv, interlock) * * Wait for all acquired references to lc to drain. Caller must * hold interlock; localcount_drain releases it during cross-calls * and waits on cv. The cv and interlock passed here must be the * same as are passed to localcount_release for this lc. * * Caller must guarantee that no new references can be acquired * with localcount_acquire before calling localcount_drain. For * example, any object that may be found in a list and acquired * must be removed from the list before localcount_drain. * * The localcount object lc may be used only with localcount_fini * after this, unless reinitialized after localcount_fini with * localcount_init. */ void localcount_drain(struct localcount *lc, kcondvar_t *cv, kmutex_t *interlock) { int64_t total = 0; KASSERT(mutex_owned(interlock)); KASSERT(lc->lc_totalp == NULL); /* Mark it draining. */ lc->lc_totalp = &total; /* * Count up all references on all CPUs. * * This serves as a global memory barrier: after xc_wait, all * CPUs will have witnessed the nonnull value of lc->lc_totalp, * so that it is safe to wait on the cv for them. */ mutex_exit(interlock); xc_wait(xc_broadcast(0, &localcount_xc, lc, interlock)); mutex_enter(interlock); /* Wait for remaining references to drain. */ while (total != 0) { /* * At this point, now that we have added up all * references on all CPUs, the total had better be * nonnegative. */ KASSERTMSG((0 < total), "negatively referenced localcount: %p, %"PRId64, lc, total); cv_wait(cv, interlock); } /* Paranoia: Cause any further use of lc->lc_totalp to crash. */ lc->lc_totalp = (void *)(uintptr_t)1; } /* * localcount_fini(lc) * * Finalize a localcount object, releasing any memory allocated * for it. The localcount object must already have been drained. */ void localcount_fini(struct localcount *lc) { KASSERT(lc->lc_totalp == (void *)(uintptr_t)1); percpu_free(lc->lc_percpu, sizeof(uint64_t)); } /* * localcount_xc(cookie0, cookie1) * * Accumulate and transfer the per-CPU reference counts to a * global total, resetting the per-CPU counter to zero. Once * localcount_drain() has started, we only maintain the total * count in localcount_release(). */ static void localcount_xc(void *cookie0, void *cookie1) { struct localcount *lc = cookie0; kmutex_t *interlock = cookie1; int64_t *localp; mutex_enter(interlock); localp = percpu_getref(lc->lc_percpu); *lc->lc_totalp += *localp; *localp -= *localp; /* ie, *localp = 0; */ percpu_putref(lc->lc_percpu); mutex_exit(interlock); } /* * localcount_adjust(lc, delta) * * Add delta -- positive or negative -- to the local CPU's count * for lc. 
*/ static void localcount_adjust(struct localcount *lc, int delta) { int64_t *localp; localp = percpu_getref(lc->lc_percpu); *localp += delta; percpu_putref(lc->lc_percpu); } /* * localcount_acquire(lc) * * Acquire a reference to lc. * * The reference may be held across sleeps and may be migrated * from CPU to CPU, or even thread to thread -- it is only * counted, not associated with a particular concrete owner. * * Involves no interprocessor synchronization. May be used in any * context: while a lock is held, within a pserialize(9) read * section, in hard interrupt context (provided other users block * hard interrupts), in soft interrupt context, in thread context, * &c. * * Caller must guarantee that there is no concurrent * localcount_drain. For example, any object that may be found in * a list and acquired must be removed from the list before * localcount_drain. */ void localcount_acquire(struct localcount *lc) { KASSERT(lc->lc_totalp == NULL); localcount_adjust(lc, +1); #if defined(DEBUG) && defined(LOCKDEBUG) if (atomic_inc_32_nv(&lc->lc_refcnt) == 0) panic("counter overflow"); #endif } /* * localcount_release(lc, cv, interlock) * * Release a reference to lc. If there is a concurrent * localcount_drain and this may be the last reference, notify * localcount_drain by acquiring interlock, waking cv, and * releasing interlock. The cv and interlock passed here must be * the same as are passed to localcount_drain for this lc. * * Involves no interprocessor synchronization unless there is a * concurrent localcount_drain in progress. */ void localcount_release(struct localcount *lc, kcondvar_t *cv, kmutex_t *interlock) { /* * Block xcall so that if someone begins draining after we see * lc->lc_totalp as null, then they won't start cv_wait until * after they have counted this CPU's contributions. * * Otherwise, localcount_drain may notice an extant reference * from this CPU and cv_wait for it, but having seen * lc->lc_totalp as null, this CPU will not wake * localcount_drain. */ kpreempt_disable(); KDASSERT(mutex_ownable(interlock)); if (__predict_false(lc->lc_totalp != NULL)) { /* * Slow path -- wake localcount_drain in case this is * the last reference. */ mutex_enter(interlock); if (--*lc->lc_totalp == 0) cv_broadcast(cv); mutex_exit(interlock); goto out; } localcount_adjust(lc, -1); #if defined(DEBUG) && defined(LOCKDEBUG) if (atomic_dec_32_nv(&lc->lc_refcnt) == UINT_MAX) panic("counter underflow"); #endif out: kpreempt_enable(); } /* * localcount_debug_refcnt(lc) * * Return a total reference count of lc. It returns a correct value * only if DEBUG and LOCKDEBUG enabled. Otherwise always return 0. */ uint32_t localcount_debug_refcnt(const struct localcount *lc) { #if defined(DEBUG) && defined(LOCKDEBUG) return lc->lc_refcnt; #else return 0; #endif }
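The comments above fully describe the localcount(9) contract; the sketch below is a hypothetical (not from the NetBSD tree) illustration of how an object found on a list would typically use it. All of the names -- struct frob, frob_list, frob_lock, frob_cv and the frob_* functions -- are invented; frob_lock and frob_cv are assumed to be set up elsewhere with mutex_init() and cv_init(), and each frob's f_refs with localcount_init().

#include <sys/param.h>
#include <sys/condvar.h>
#include <sys/localcount.h>
#include <sys/mutex.h>
#include <sys/queue.h>

struct frob {
	LIST_ENTRY(frob)	f_entry;
	struct localcount	f_refs;	/* localcount_init()ed at creation */
};

static LIST_HEAD(, frob) frob_list = LIST_HEAD_INITIALIZER(frob_list);
static kmutex_t frob_lock;	/* protects frob_list; drain/release interlock */
static kcondvar_t frob_cv;	/* wait channel for localcount_drain() */

/* Look up a frob and acquire a cheap, CPU-local reference to it. */
struct frob *
frob_lookup(void)
{
	struct frob *f;

	mutex_enter(&frob_lock);
	f = LIST_FIRST(&frob_list);
	if (f != NULL)
		localcount_acquire(&f->f_refs);
	mutex_exit(&frob_lock);
	return f;
}

/* Drop a reference taken by frob_lookup(). */
void
frob_put(struct frob *f)
{

	localcount_release(&f->f_refs, &frob_cv, &frob_lock);
}

/* Unlink a frob so no new references can be taken, then drain and free. */
void
frob_destroy(struct frob *f)
{

	mutex_enter(&frob_lock);
	LIST_REMOVE(f, f_entry);
	localcount_drain(&f->f_refs, &frob_cv, &frob_lock);
	mutex_exit(&frob_lock);
	localcount_fini(&f->f_refs);
}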
/* $NetBSD: uvm_io.c,v 1.30 2024/05/03 07:09:20 skrll Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: Id: uvm_io.c,v 1.1.2.2 1997/12/30 12:02:00 mrg Exp */ /* * uvm_io.c: uvm i/o ops */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_io.c,v 1.30 2024/05/03 07:09:20 skrll Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/mman.h> #include <sys/uio.h> #include <uvm/uvm.h> /* * functions */ /* * uvm_io: perform I/O on a map * * => caller must have a reference to "map" so that it doesn't go away * while we are working. */ int uvm_io(struct vm_map *map, struct uio *uio, int flags) { vaddr_t baseva, endva, pageoffset, kva; vsize_t chunksz, togo, sz; struct vm_map_entry *dead_entries; int error; /* * step 0: sanity checks and set up for copy loop. start with a * large chunk size. if we have trouble finding vm space we will * reduce it. */ if (uio->uio_resid == 0) return 0; togo = uio->uio_resid; baseva = (vaddr_t) uio->uio_offset; endva = baseva + (togo - 1); if (endva < baseva) /* wrap around? */ return EIO; if (baseva >= VM_MAXUSER_ADDRESS) return 0; if (endva >= VM_MAXUSER_ADDRESS) /* EOF truncate */ togo = togo - (endva - VM_MAXUSER_ADDRESS + 1); pageoffset = baseva & PAGE_MASK; baseva = trunc_page(baseva); chunksz = MIN(round_page(togo + pageoffset), trunc_page(MAXPHYS)); error = 0; flags |= UVM_EXTRACT_QREF | UVM_EXTRACT_CONTIG | UVM_EXTRACT_FIXPROT; /* XXX cannot use QREF without AMAP_REFALL, and REFALL is unsafe */ flags &= ~UVM_EXTRACT_QREF; /* * step 1: main loop... while we've got data to move */ for (/*null*/; togo > 0 ; pageoffset = 0) { /* * step 2: extract mappings from the map into kernel_map */ error = uvm_map_extract(map, baseva, chunksz, kernel_map, &kva, flags); if (error) { /* retry with a smaller chunk...
*/ if (error == ENOMEM && chunksz > PAGE_SIZE) { chunksz = trunc_page(chunksz / 2); if (chunksz < PAGE_SIZE) chunksz = PAGE_SIZE; continue; } break; } /* * step 3: move a chunk of data */ sz = chunksz - pageoffset; if (sz > togo) sz = togo; error = uiomove((void *) (kva + pageoffset), sz, uio); togo -= sz; baseva += chunksz; /* * step 4: unmap the area of kernel memory */ vm_map_lock(kernel_map); uvm_unmap_remove(kernel_map, kva, kva + chunksz, &dead_entries, 0); vm_map_unlock(kernel_map); if (dead_entries != NULL) uvm_unmap_detach(dead_entries, AMAP_REFALL); if (error) break; } return error; }
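uvm_io() above takes a struct uio describing a kernel-side buffer and an offset that is interpreted as a virtual address in the target map. The sketch below shows how a kernel caller might read bytes out of another process's address space with it; example_read_user_va is an invented name, and the UIO_SETUP_SYSSPACE() helper from <sys/uio.h> is an assumption about how the kernel-space uio is normally set up.

#include <sys/param.h>
#include <sys/uio.h>
#include <uvm/uvm_extern.h>

/* Illustrative only: copy "len" bytes at user VA "uva" in "map" into "buf". */
static int
example_read_user_va(struct vm_map *map, vaddr_t uva, void *buf, size_t len)
{
	struct iovec iov;
	struct uio uio;

	iov.iov_base = buf;
	iov.iov_len = len;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = (off_t)uva;	/* address in the target map */
	uio.uio_resid = len;
	uio.uio_rw = UIO_READ;		/* read out of "map" into "buf" */
	UIO_SETUP_SYSSPACE(&uio);	/* assumed: mark uio as kernel-space */

	/* Addresses at or above VM_MAXUSER_ADDRESS are truncated, see above. */
	return uvm_io(map, &uio, 0);
}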
/* $NetBSD: sysv_msg_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $ */ /*- * Copyright (c) 1999 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sysv_msg_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/signal.h> #include <sys/proc.h> #include <sys/msg.h> #include <compat/sys/msg.h> #ifndef SYSVMSG #define SYSVMSG #endif #include <sys/syscallargs.h> int compat_50_sys___msgctl13(struct lwp *l, const struct compat_50_sys___msgctl13_args *uap, register_t *retval) { /* { syscallarg(int) msqid; syscallarg(int) cmd; syscallarg(struct msqid_ds13 *) buf; } */ struct msqid_ds msqbuf; struct msqid_ds13 omsqbuf; int cmd, error; cmd = SCARG(uap, cmd); if (cmd == IPC_SET) { error = copyin(SCARG(uap, buf), &omsqbuf, sizeof(omsqbuf)); if (error) return (error); __msqid_ds13_to_native(&omsqbuf, &msqbuf); } error = msgctl1(l, SCARG(uap, msqid), cmd, (cmd == IPC_SET || cmd == IPC_STAT) ? &msqbuf : NULL); if (error == 0 && cmd == IPC_STAT) { __native_to_msqid_ds13(&msqbuf, &omsqbuf); error = copyout(&omsqbuf, SCARG(uap, buf), sizeof(omsqbuf)); } return (error); }
/* $NetBSD: uvm_mmap.c,v 1.185 2023/11/21 14:35:36 riastradh Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993 The Regents of the University of California. * Copyright (c) 1988 University of Utah. * * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp */ /* * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap * function.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.185 2023/11/21 14:35:36 riastradh Exp $"); #include "opt_compat_netbsd.h" #include "opt_pax.h" #include <sys/param.h> #include <sys/types.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/resourcevar.h> #include <sys/mman.h> #include <sys/pax.h> #include <sys/syscallargs.h> #include <uvm/uvm.h> #include <uvm/uvm_device.h> static int uvm_mmap(struct vm_map *, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int, int, struct uvm_object *, voff_t, vsize_t); static int range_test(const struct vm_map *map, vaddr_t addr, vsize_t size, bool ismmap) { vaddr_t vm_min_address = vm_map_min(map); vaddr_t vm_max_address = vm_map_max(map); vaddr_t eaddr = addr + size; int res = 0; if (addr < vm_min_address) return EINVAL; if (eaddr > vm_max_address) return ismmap ? EFBIG : EINVAL; if (addr > eaddr) /* no wrapping! */ return ismmap ? EOVERFLOW : EINVAL; #ifdef MD_MMAP_RANGE_TEST res = MD_MMAP_RANGE_TEST(addr, eaddr); #endif return res; } /* * align the address to a page boundary, and adjust the size accordingly */ static int round_and_check(const struct vm_map *map, vaddr_t *addr, vsize_t *size) { const vsize_t pageoff = (vsize_t)(*addr & PAGE_MASK); *addr -= pageoff; if (*size != 0) { *size += pageoff; *size = (vsize_t)round_page(*size); } else if (*addr + *size < *addr) { return ENOMEM; } return range_test(map, *addr, *size, false); } /* * sys_mincore: determine if pages are in core or not. */ /* ARGSUSED */ int sys_mincore(struct lwp *l, const struct sys_mincore_args *uap, register_t *retval) { /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(char *) vec; } */ struct proc *p = l->l_proc; struct vm_page *pg; char *vec, pgi; struct uvm_object *uobj; struct vm_amap *amap; struct vm_anon *anon; struct vm_map_entry *entry; vaddr_t start, end, lim; struct vm_map *map; vsize_t len; int error = 0; size_t npgs; map = &p->p_vmspace->vm_map; start = (vaddr_t)SCARG(uap, addr); len = SCARG(uap, len); vec = SCARG(uap, vec); if (start & PAGE_MASK) return EINVAL; len = round_page(len); end = start + len; if (end <= start) return EINVAL; /* * Lock down vec, so our returned status isn't outdated by * storing the status byte for a page. */ npgs = len >> PAGE_SHIFT; error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE); if (error) { return error; } vm_map_lock_read(map); if (uvm_map_lookup_entry(map, start, &entry) == false) { error = ENOMEM; goto out; } for (/* nothing */; entry != &map->header && entry->start < end; entry = entry->next) { KASSERT(!UVM_ET_ISSUBMAP(entry)); KASSERT(start >= entry->start); /* Make sure there are no holes. */ if (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end)) { error = ENOMEM; goto out; } lim = end < entry->end ? end : entry->end; /* * Special case for objects with no "real" pages. Those * are always considered resident (mapped devices). */ if (UVM_ET_ISOBJ(entry)) { KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)); if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) { for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) ustore_char(vec, 1); continue; } } amap = entry->aref.ar_amap; /* upper layer */ uobj = entry->object.uvm_obj; /* lower layer */ if (amap != NULL) amap_lock(amap, RW_READER); if (uobj != NULL) rw_enter(uobj->vmobjlock, RW_READER); for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) { pgi = 0; if (amap != NULL) { /* Check the upper layer first. 
*/ anon = amap_lookup(&entry->aref, start - entry->start); /* Don't need to lock anon here. */ if (anon != NULL && anon->an_page != NULL) { /* * Anon has the page for this entry * offset. */ pgi = 1; } } if (uobj != NULL && pgi == 0) { /* Check the lower layer. */ pg = uvm_pagelookup(uobj, entry->offset + (start - entry->start)); if (pg != NULL) { /* * Object has the page for this entry * offset. */ pgi = 1; } } (void) ustore_char(vec, pgi); } if (uobj != NULL) rw_exit(uobj->vmobjlock); if (amap != NULL) amap_unlock(amap); } out: vm_map_unlock_read(map); uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs); return error; } /* * sys_mmap: mmap system call. * * => file offset and address may not be page aligned * - if MAP_FIXED, offset and address must have remainder mod PAGE_SIZE * - if address isn't page aligned the mapping starts at trunc_page(addr) * and the return value is adjusted up by the page offset. */ int sys_mmap(struct lwp *l, const struct sys_mmap_args *uap, register_t *retval) { /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(int) prot; syscallarg(int) flags; syscallarg(int) fd; syscallarg(long) pad; syscallarg(off_t) pos; } */ struct proc *p = l->l_proc; vaddr_t addr; off_t pos; vsize_t size, pageoff; vm_prot_t prot, maxprot, extraprot; int flags, fd, advice; vaddr_t defaddr = 0; /* XXXGCC */ bool addrhint = false; struct file *fp = NULL; struct uvm_object *uobj; int error; vaddr_t orig_addr; /* * first, extract syscall args from the uap. */ addr = (vaddr_t)SCARG(uap, addr); size = (vsize_t)SCARG(uap, len); prot = SCARG(uap, prot) & VM_PROT_ALL; extraprot = PROT_MPROTECT_EXTRACT(SCARG(uap, prot)); flags = SCARG(uap, flags); fd = SCARG(uap, fd); pos = SCARG(uap, pos); orig_addr = addr; if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE)) return EINVAL; if (size == 0 && (flags & MAP_ANON) == 0) return EINVAL; /* * Align file position and save offset into page. Adjust size * so that it is an integral multiple of the page size. */ pageoff = pos & PAGE_MASK; pos -= pageoff; KASSERT(PAGE_MASK <= __type_max(vsize_t)); KASSERT((__type_max(vsize_t) - PAGE_SIZE + 1) % PAGE_SIZE == 0); if (size > __type_max(vsize_t) - PAGE_SIZE + 1 - pageoff) return ENOMEM; /* * size + pageoff <= VSIZE_MAX + 1 - PAGE_SIZE, and the * right-hand side is an integral multiple of the page size, so * round_page(size + pageoff) <= VSIZE_MAX + 1 - PAGE_SIZE. */ size = round_page(size + pageoff); /* * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr" */ if (flags & MAP_FIXED) { /* ensure address and file offset are aligned properly */ addr -= pageoff; if (addr & PAGE_MASK) return EINVAL; error = range_test(&p->p_vmspace->vm_map, addr, size, true); if (error) { return error; } } else if (addr == 0 || !(flags & MAP_TRYFIXED)) { /* * not fixed: make sure we skip over the largest * possible heap for non-topdown mapping arrangements. * we will refine our guess later (e.g. to account for * VAC, etc) */ defaddr = p->p_emul->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr, size, p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN); if (addr == 0 || !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN)) addr = MAX(addr, defaddr); else addr = MIN(addr, defaddr); /* * If addr is nonzero and not the default, then the * address is a hint. */ addrhint = (addr != 0 && addr != defaddr); } /* * check for file mappings (i.e. not anonymous) and verify file. 
*/ advice = UVM_ADV_NORMAL; if ((flags & MAP_ANON) == 0) { KASSERT(size != 0); if ((fp = fd_getfile(fd)) == NULL) return EBADF; if (fp->f_ops->fo_mmap == NULL) { error = ENODEV; goto out; } error = (*fp->f_ops->fo_mmap)(fp, &pos, size, prot, &flags, &advice, &uobj, &maxprot); if (error) { goto out; } if (uobj == NULL) { flags |= MAP_ANON; fd_putfile(fd); fp = NULL; goto is_anon; } } else { /* MAP_ANON case */ /* * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0? */ if (fd != -1) return EINVAL; is_anon: /* label for SunOS style /dev/zero */ uobj = NULL; maxprot = VM_PROT_ALL; pos = 0; } maxprot = PAX_MPROTECT_MAXPROTECT(l, prot, extraprot, maxprot); if (((prot | extraprot) & maxprot) != (prot | extraprot)) { error = EACCES; goto out; } if ((error = PAX_MPROTECT_VALIDATE(l, prot))) goto out; pax_aslr_mmap(l, &addr, orig_addr, flags); /* * Now let kernel internal function uvm_mmap do the work. * * If the user provided a hint, take a reference to uobj in * case the first attempt to satisfy the hint fails, so we can * try again with the default address. */ if (addrhint) { if (uobj) (*uobj->pgops->pgo_reference)(uobj); } error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot, flags, advice, uobj, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur); if (addrhint) { if (error) { addr = defaddr; pax_aslr_mmap(l, &addr, orig_addr, flags); error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot, flags, advice, uobj, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur); } else if (uobj) { /* Release the exta reference we took. */ (*uobj->pgops->pgo_detach)(uobj); } } /* remember to add offset */ *retval = (register_t)(addr + pageoff); out: if (fp != NULL) fd_putfile(fd); return error; } /* * sys___msync13: the msync system call (a front-end for flush) */ int sys___msync13(struct lwp *l, const struct sys___msync13_args *uap, register_t *retval) { /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(int) flags; } */ struct proc *p = l->l_proc; vaddr_t addr; vsize_t size; struct vm_map *map; int error, flags, uvmflags; bool rv; /* * extract syscall args from the uap */ addr = (vaddr_t)SCARG(uap, addr); size = (vsize_t)SCARG(uap, len); flags = SCARG(uap, flags); /* sanity check flags */ if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 || (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 || (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC)) return EINVAL; if ((flags & (MS_ASYNC | MS_SYNC)) == 0) flags |= MS_SYNC; /* * get map */ map = &p->p_vmspace->vm_map; if (round_and_check(map, &addr, &size)) return ENOMEM; /* * XXXCDC: do we really need this semantic? * * XXX Gak! If size is zero we are supposed to sync "all modified * pages with the region containing addr". Unfortunately, we * don't really keep track of individual mmaps so we approximate * by flushing the range of the map entry containing addr. * This can be incorrect if the region splits or is coalesced * with a neighbor. 
*/ if (size == 0) { struct vm_map_entry *entry; vm_map_lock_read(map); rv = uvm_map_lookup_entry(map, addr, &entry); if (rv == true) { addr = entry->start; size = entry->end - entry->start; } vm_map_unlock_read(map); if (rv == false) return EINVAL; } /* * translate MS_ flags into PGO_ flags */ uvmflags = PGO_CLEANIT; if (flags & MS_INVALIDATE) uvmflags |= PGO_FREE; if (flags & MS_SYNC) uvmflags |= PGO_SYNCIO; error = uvm_map_clean(map, addr, addr+size, uvmflags); return error; } /* * sys_munmap: unmap a users memory */ int sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval) { /* { syscallarg(void *) addr; syscallarg(size_t) len; } */ struct proc *p = l->l_proc; vaddr_t addr; vsize_t size; struct vm_map *map; struct vm_map_entry *dead_entries; /* * get syscall args. */ addr = (vaddr_t)SCARG(uap, addr); size = (vsize_t)SCARG(uap, len); map = &p->p_vmspace->vm_map; if (round_and_check(map, &addr, &size)) return EINVAL; if (size == 0) return 0; vm_map_lock(map); #if 0 /* * interesting system call semantic: make sure entire range is * allocated before allowing an unmap. */ if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) { vm_map_unlock(map); return EINVAL; } #endif uvm_unmap_remove(map, addr, addr + size, &dead_entries, 0); vm_map_unlock(map); if (dead_entries != NULL) uvm_unmap_detach(dead_entries, 0); return 0; } /* * sys_mprotect: the mprotect system call */ int sys_mprotect(struct lwp *l, const struct sys_mprotect_args *uap, register_t *retval) { /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(int) prot; } */ struct proc *p = l->l_proc; vaddr_t addr; vsize_t size; vm_prot_t prot; int error; /* * extract syscall args from uap */ addr = (vaddr_t)SCARG(uap, addr); size = (vsize_t)SCARG(uap, len); prot = SCARG(uap, prot) & VM_PROT_ALL; if (round_and_check(&p->p_vmspace->vm_map, &addr, &size)) return EINVAL; error = uvm_map_protect_user(l, addr, addr + size, prot); return error; } /* * sys_minherit: the minherit system call */ int sys_minherit(struct lwp *l, const struct sys_minherit_args *uap, register_t *retval) { /* { syscallarg(void *) addr; syscallarg(int) len; syscallarg(int) inherit; } */ struct proc *p = l->l_proc; vaddr_t addr; vsize_t size; vm_inherit_t inherit; int error; addr = (vaddr_t)SCARG(uap, addr); size = (vsize_t)SCARG(uap, len); inherit = SCARG(uap, inherit); if (round_and_check(&p->p_vmspace->vm_map, &addr, &size)) return EINVAL; error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size, inherit); return error; } /* * sys_madvise: give advice about memory usage. */ /* ARGSUSED */ int sys_madvise(struct lwp *l, const struct sys_madvise_args *uap, register_t *retval) { /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(int) behav; } */ struct proc *p = l->l_proc; vaddr_t addr; vsize_t size; int advice, error; addr = (vaddr_t)SCARG(uap, addr); size = (vsize_t)SCARG(uap, len); advice = SCARG(uap, behav); if (round_and_check(&p->p_vmspace->vm_map, &addr, &size)) return EINVAL; switch (advice) { case MADV_NORMAL: case MADV_RANDOM: case MADV_SEQUENTIAL: error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size, advice); break; case MADV_WILLNEED: /* * Activate all these pages, pre-faulting them in if * necessary. */ error = uvm_map_willneed(&p->p_vmspace->vm_map, addr, addr + size); break; case MADV_DONTNEED: /* * Deactivate all these pages. We don't need them * any more. We don't, however, toss the data in * the pages. 
*/ error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size, PGO_DEACTIVATE); break; case MADV_FREE: /* * These pages contain no valid data, and may be * garbage-collected. Toss all resources, including * any swap space in use. */ error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size, PGO_FREE); break; case MADV_SPACEAVAIL: /* * XXXMRG What is this? I think it's: * * Ensure that we have allocated backing-store * for these pages. * * This is going to require changes to the page daemon, * as it will free swap space allocated to pages in core. * There's also what to do for device/file/anonymous memory. */ return EINVAL; default: return EINVAL; } return error; } /* * sys_mlock: memory lock */ int sys_mlock(struct lwp *l, const struct sys_mlock_args *uap, register_t *retval) { /* { syscallarg(const void *) addr; syscallarg(size_t) len; } */ struct proc *p = l->l_proc; vaddr_t addr; vsize_t size; int error; /* * extract syscall args from uap */ addr = (vaddr_t)SCARG(uap, addr); size = (vsize_t)SCARG(uap, len); if (round_and_check(&p->p_vmspace->vm_map, &addr, &size)) return ENOMEM; if (atop(size) + uvmexp.wired > uvmexp.wiredmax) return EAGAIN; if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) > p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) return EAGAIN; error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, false, 0); if (error == EFAULT) error = ENOMEM; return error; } /* * sys_munlock: unlock wired pages */ int sys_munlock(struct lwp *l, const struct sys_munlock_args *uap, register_t *retval) { /* { syscallarg(const void *) addr; syscallarg(size_t) len; } */ struct proc *p = l->l_proc; vaddr_t addr; vsize_t size; /* * extract syscall args from uap */ addr = (vaddr_t)SCARG(uap, addr); size = (vsize_t)SCARG(uap, len); if (round_and_check(&p->p_vmspace->vm_map, &addr, &size)) return ENOMEM; if (uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, true, 0)) return ENOMEM; return 0; } /* * sys_mlockall: lock all pages mapped into an address space. */ int sys_mlockall(struct lwp *l, const struct sys_mlockall_args *uap, register_t *retval) { /* { syscallarg(int) flags; } */ struct proc *p = l->l_proc; int error, flags; flags = SCARG(uap, flags); if (flags == 0 || (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0) return EINVAL; error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur); return error; } /* * sys_munlockall: unlock all pages mapped into an address space. */ int sys_munlockall(struct lwp *l, const void *v, register_t *retval) { struct proc *p = l->l_proc; (void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0); return 0; } /* * uvm_mmap: internal version of mmap * * - used by sys_mmap and various framebuffers * - uobj is a struct uvm_object pointer or NULL for MAP_ANON * - caller must page-align the file offset * * XXX This appears to leak the uobj in various error branches? Need * to clean up the contract around uobj reference. */ static int uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, int advice, struct uvm_object *uobj, voff_t foff, vsize_t locklimit) { vaddr_t align = 0; int error; uvm_flag_t uvmflag = 0; /* * check params */ if (size == 0) return 0; if (foff & PAGE_MASK) return EINVAL; if ((prot & maxprot) != prot) return EINVAL; /* * for non-fixed mappings, round off the suggested address. * for fixed mappings, check alignment. 
*/ if ((flags & MAP_FIXED) == 0) { *addr = round_page(*addr); } else { if (*addr & PAGE_MASK) return EINVAL; uvmflag |= UVM_FLAG_FIXED | UVM_FLAG_UNMAP; } /* * Try to see if any requested alignment can even be attempted. * Make sure we can express the alignment (asking for a >= 4GB * alignment on an ILP32 architecture makes no sense) and that the * alignment is at least a page-sized quantity. If the * request was for a fixed mapping, make sure the supplied address * adheres to the requested alignment. */ align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT; if (align) { if (align >= sizeof(vaddr_t) * NBBY) return EINVAL; align = 1UL << align; if (align < PAGE_SIZE) return EINVAL; if (align >= vm_map_max(map)) return ENOMEM; if (flags & MAP_FIXED) { if ((*addr & (align-1)) != 0) return EINVAL; align = 0; } } /* * check resource limits */ if (!VM_MAP_IS_KERNEL(map) && (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) > curproc->p_rlimit[RLIMIT_AS].rlim_cur)) return ENOMEM; /* * handle anon vs. non-anon mappings. for non-anon mappings attach * to underlying vm object. */ if (flags & MAP_ANON) { KASSERT(uobj == NULL); foff = UVM_UNKNOWN_OFFSET; if ((flags & MAP_SHARED) == 0) /* XXX: defer amap create */ uvmflag |= UVM_FLAG_COPYONW; else /* shared: create amap now */ uvmflag |= UVM_FLAG_OVERLAY; } else { KASSERT(uobj != NULL); if ((flags & MAP_SHARED) == 0) { uvmflag |= UVM_FLAG_COPYONW; } } uvmflag = UVM_MAPFLAG(prot, maxprot, (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY, advice, uvmflag); error = uvm_map(map, addr, size, uobj, foff, align, uvmflag); if (error) { if (uobj) uobj->pgops->pgo_detach(uobj); return error; } /* * POSIX 1003.1b -- if our address space was configured * to lock all future mappings, wire the one we just made. * * Also handle the MAP_WIRED flag here. */ if (prot == VM_PROT_NONE) { /* * No more work to do in this case. */ return 0; } if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) { vm_map_lock(map); if (atop(size) + uvmexp.wired > uvmexp.wiredmax || (locklimit != 0 && size + ptoa(pmap_wired_count(vm_map_pmap(map))) > locklimit)) { vm_map_unlock(map); uvm_unmap(map, *addr, *addr + size); return ENOMEM; } /* * uvm_map_pageable() always returns the map unlocked.
*/ error = uvm_map_pageable(map, *addr, *addr + size, false, UVM_LK_ENTER); if (error) { uvm_unmap(map, *addr, *addr + size); return error; } return 0; } return 0; } vaddr_t uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz, int topdown) { if (topdown) return VM_DEFAULT_ADDRESS_TOPDOWN(base, sz); else return VM_DEFAULT_ADDRESS_BOTTOMUP(base, sz); } int uvm_mmap_dev(struct proc *p, void **addrp, size_t len, dev_t dev, off_t off) { struct uvm_object *uobj; int error, flags, prot; KASSERT(len > 0); flags = MAP_SHARED; prot = VM_PROT_READ | VM_PROT_WRITE; if (*addrp) flags |= MAP_FIXED; else *addrp = (void *)p->p_emul->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr, len, p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN); uobj = udv_attach(dev, prot, off, len); if (uobj == NULL) return EINVAL; error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp, (vsize_t)len, prot, prot, flags, UVM_ADV_RANDOM, uobj, off, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur); return error; } int uvm_mmap_anon(struct proc *p, void **addrp, size_t len) { int error, flags, prot; flags = MAP_PRIVATE | MAP_ANON; prot = VM_PROT_READ | VM_PROT_WRITE; if (*addrp) flags |= MAP_FIXED; else *addrp = (void *)p->p_emul->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr, len, p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN); error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp, (vsize_t)len, prot, prot, flags, UVM_ADV_NORMAL, NULL, 0, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur); return error; }
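/*
 * [Editorial example, not part of the original uvm_mmap.c] A minimal
 * userland sketch that exercises the sys_mmap()/sys___msync13()/
 * sys_munmap() paths above: the request length is deliberately not
 * page aligned to show the round_page() handling in sys_mmap(), and
 * MS_SYNC is the flag that sys___msync13() turns into PGO_SYNCIO.
 */
#include <sys/mman.h>

#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 1000;	/* deliberately not a multiple of the page size */
	long pagesz = sysconf(_SC_PAGESIZE);
	char *p;

	/* Anonymous, private mapping: takes the MAP_ANON branch above. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 'x', len);

	/* MS_SYNC is translated to PGO_CLEANIT|PGO_SYNCIO in sys___msync13(). */
	if (msync(p, len, MS_SYNC) == -1)
		perror("msync");

	/* The kernel rounds the unmap length up to a page boundary. */
	if (munmap(p, len) == -1)
		perror("munmap");

	printf("page size %ld, requested %zu bytes\n", pagesz, len);
	return 0;
}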
/* $NetBSD: sys_process_lwpstatus.c,v 1.4 2022/07/10 17:47:58 riastradh Exp $ */ /*- * Copyright (c) 2019 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_process_lwpstatus.c,v 1.4 2022/07/10 17:47:58 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_ptrace.h" #include "opt_ktrace.h" #include "opt_pax.h" #include "opt_compat_netbsd32.h" #endif #if defined(__HAVE_COMPAT_NETBSD32) && !defined(COMPAT_NETBSD32) \ && !defined(_RUMPKERNEL) #define COMPAT_NETBSD32 #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/errno.h> #include <sys/lwp.h> #include <sys/proc.h> #include <sys/ptrace.h> #ifndef PTRACE_REGS_ALIGN #define PTRACE_REGS_ALIGN /* nothing */ #endif void ptrace_read_lwpstatus(struct lwp *l, struct ptrace_lwpstatus *pls) { pls->pl_lwpid = l->l_lid; memcpy(&pls->pl_sigmask, &l->l_sigmask, sizeof(pls->pl_sigmask)); memcpy(&pls->pl_sigpend, &l->l_sigpend.sp_set, sizeof(pls->pl_sigpend)); if (l->l_name == NULL) memset(&pls->pl_name, 0, PL_LNAMELEN); else { KASSERT(strlen(l->l_name) < PL_LNAMELEN); strncpy(pls->pl_name, l->l_name, PL_LNAMELEN); } #ifdef PTRACE_LWP_GETPRIVATE pls->pl_private = (void *)(intptr_t)PTRACE_LWP_GETPRIVATE(l); #else pls->pl_private = l->l_private; #endif } void process_read_lwpstatus(struct lwp *l, struct ptrace_lwpstatus *pls) { ptrace_read_lwpstatus(l, pls); } int ptrace_update_lwp(struct proc *t, struct lwp **lt, lwpid_t lid) { if (lid == 0 || lid == (*lt)->l_lid || t->p_nlwps == 1) return 0; mutex_enter(t->p_lock); lwp_delref2(*lt); *lt = lwp_find(t, lid); if (*lt == NULL) { mutex_exit(t->p_lock); return ESRCH; } if ((*lt)->l_flag & LW_SYSTEM) { mutex_exit(t->p_lock); *lt = NULL; return EINVAL; } lwp_addref(*lt); mutex_exit(t->p_lock); return 0; } int process_validfpregs(struct lwp *l) { #if defined(PT_SETFPREGS) || defined(PT_GETFPREGS) return (l->l_flag & LW_SYSTEM) == 0; #else return 0; #endif } int process_validregs(struct lwp *l) { #if defined(PT_SETREGS) || defined(PT_GETREGS) return (l->l_flag & LW_SYSTEM) == 0; #else return 0; #endif } int process_validdbregs(struct lwp *l) { #if defined(PT_SETDBREGS) || defined(PT_GETDBREGS) return (l->l_flag & LW_SYSTEM) == 0; #else return 0; #endif } #ifdef PT_REGISTERS static int proc_regio(struct lwp *l, struct uio *uio, size_t ks, ptrace_regrfunc_t r, ptrace_regwfunc_t w) { char buf[1024] PTRACE_REGS_ALIGN; int error; char *kv; size_t kl; if (ks > sizeof(buf)) return E2BIG; if (uio->uio_offset < 0 || uio->uio_offset > (off_t)ks) return EINVAL; kv = buf + uio->uio_offset; kl = ks - uio->uio_offset; if (kl > uio->uio_resid) kl = uio->uio_resid; error = (*r)(l, buf, &ks); if (error == 0) error = uiomove(kv, kl, uio); if (error == 0 && uio->uio_rw == UIO_WRITE) { if (l->l_stat != LSSTOP) error = EBUSY; else error = (*w)(l, buf, ks); } uio->uio_offset = 0; return error; } #endif int process_doregs(struct lwp *curl /*tracer*/, struct lwp *l /*traced*/, struct uio *uio) { #if defined(PT_GETREGS) || defined(PT_SETREGS) size_t s; ptrace_regrfunc_t r; ptrace_regwfunc_t w; #ifdef COMPAT_NETBSD32 const bool pk32 = (curl->l_proc->p_flag & PK_32) != 0; if (__predict_false(pk32)) { if ((l->l_proc->p_flag & PK_32) == 0) { // 32 bit tracer can't trace 64 bit process return EINVAL; } s = sizeof(process_reg32); r = __FPTRCAST(ptrace_regrfunc_t, process_read_regs32); w = __FPTRCAST(ptrace_regwfunc_t, process_write_regs32); } else #endif { s = sizeof(struct reg); r = __FPTRCAST(ptrace_regrfunc_t, process_read_regs); w = __FPTRCAST(ptrace_regwfunc_t, process_write_regs); } return proc_regio(l, uio, s, r, w); #else return EINVAL; #endif } int process_dofpregs(struct lwp *curl /*tracer*/, struct lwp *l /*traced*/, 
struct uio *uio) { #if defined(PT_GETFPREGS) || defined(PT_SETFPREGS) size_t s; ptrace_regrfunc_t r; ptrace_regwfunc_t w; #ifdef COMPAT_NETBSD32 const bool pk32 = (curl->l_proc->p_flag & PK_32) != 0; if (__predict_false(pk32)) { if ((l->l_proc->p_flag & PK_32) == 0) { // 32 bit tracer can't trace 64 bit process return EINVAL; } s = sizeof(process_fpreg32); r = (ptrace_regrfunc_t)process_read_fpregs32; w = (ptrace_regwfunc_t)process_write_fpregs32; } else #endif { s = sizeof(struct fpreg); r = (ptrace_regrfunc_t)process_read_fpregs; w = (ptrace_regwfunc_t)process_write_fpregs; } return proc_regio(l, uio, s, r, w); #else return EINVAL; #endif } int process_dodbregs(struct lwp *curl /*tracer*/, struct lwp *l /*traced*/, struct uio *uio) { #if defined(PT_GETDBREGS) || defined(PT_SETDBREGS) size_t s; ptrace_regrfunc_t r; ptrace_regwfunc_t w; #ifdef COMPAT_NETBSD32 const bool pk32 = (curl->l_proc->p_flag & PK_32) != 0; if (__predict_false(pk32)) { if ((l->l_proc->p_flag & PK_32) == 0) { // 32 bit tracer can't trace 64 bit process return EINVAL; } s = sizeof(process_dbreg32); r = (ptrace_regrfunc_t)process_read_dbregs32; w = (ptrace_regwfunc_t)process_write_dbregs32; } else #endif { s = sizeof(struct dbreg); r = (ptrace_regrfunc_t)process_read_dbregs; w = (ptrace_regwfunc_t)process_write_dbregs; } return proc_regio(l, uio, s, r, w); #else return EINVAL; #endif }
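/*
 * [Editorial example, not part of the original file] A standalone
 * restatement of the bounds clamping that proc_regio() performs before
 * calling uiomove(): "off" and "resid" stand in for uio_offset and
 * uio_resid, and "ks" is the size of the register block.  The name
 * regio_span() is made up for this sketch.
 */
#include <stddef.h>
#include <stdio.h>

static size_t
regio_span(size_t ks, size_t off, size_t resid)
{
	size_t kl;

	if (off > ks)		/* proc_regio() rejects this with EINVAL */
		return 0;
	kl = ks - off;		/* bytes remaining in the register block */
	if (kl > resid)		/* never move more than the caller asked for */
		kl = resid;
	return kl;
}

int
main(void)
{

	/* e.g. a 512-byte struct reg, reading 128 bytes at offset 448 */
	printf("%zu\n", regio_span(512, 448, 128));	/* prints 64 */
	return 0;
}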
/* $NetBSD: scsipi_base.h,v 1.24 2017/02/26 10:58:47 maya Exp $ */ /*- * Copyright (c) 1998, 2004 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _DEV_SCSIPI_SCSIPI_BASE_H_ #define _DEV_SCSIPI_SCSIPI_BASE_H_ struct scsipi_xfer *scsipi_get_xs(struct scsipi_periph *, int); void scsipi_put_xs(struct scsipi_xfer *); static __inline struct scsipi_xfer *scsipi_make_xs_internal(struct scsipi_periph *, struct scsipi_generic *, int cmdlen, u_char *data_addr, int datalen, int retries, int timeout, struct buf *, int flags) __unused; static __inline struct scsipi_xfer *scsipi_make_xs_unlocked(struct scsipi_periph *, struct scsipi_generic *, int cmdlen, u_char *data_addr, int datalen, int retries, int timeout, struct buf *, int flags) __unused; static __inline struct scsipi_xfer *scsipi_make_xs_locked(struct scsipi_periph *, struct scsipi_generic *, int cmdlen, u_char *data_addr, int datalen, int retries, int timeout, struct buf *, int flags) __unused; /* * Make a scsipi_xfer, and return a pointer to it. */ static __inline struct scsipi_xfer * scsipi_make_xs_internal(struct scsipi_periph *periph, struct scsipi_generic *cmd, int cmdlen, u_char *data_addr, int datalen, int retries, int timeout, struct buf *bp, int flags) { struct scsipi_xfer *xs; if ((xs = scsipi_get_xs(periph, flags)) == NULL) return (NULL); /* * Fill out the scsipi_xfer structure. We don't know whose context * the cmd is in, so copy it.
*/ memcpy(&xs->cmdstore, cmd, cmdlen); xs->cmd = &xs->cmdstore; xs->cmdlen = cmdlen; xs->data = data_addr; xs->datalen = datalen; xs->xs_retries = retries; xs->timeout = timeout; xs->bp = bp; return (xs); } static __inline struct scsipi_xfer * scsipi_make_xs_unlocked(struct scsipi_periph *periph, struct scsipi_generic *cmd, int cmdlen, u_char *data_addr, int datalen, int retries, int timeout, struct buf *bp, int flags) { return scsipi_make_xs_internal(periph, cmd, cmdlen, data_addr, datalen, retries, timeout, bp, flags & ~XS_CTL_NOSLEEP); } static __inline struct scsipi_xfer * scsipi_make_xs_locked(struct scsipi_periph *periph, struct scsipi_generic *cmd, int cmdlen, u_char *data_addr, int datalen, int retries, int timeout, struct buf *bp, int flags) { KDASSERT(mutex_owned(chan_mtx(periph->periph_channel))); return scsipi_make_xs_internal(periph, cmd, cmdlen, data_addr, datalen, retries, timeout, bp, flags | XS_CTL_NOSLEEP); } #endif /* _DEV_SCSIPI_SCSIPI_BASE_H_ */
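/*
 * [Editorial example, not part of scsipi_base.h] A hypothetical caller
 * showing when a periph driver would pick the _locked vs. _unlocked
 * wrapper above.  "my_build_xs", "my_periph" and "my_cmd" are made-up
 * names, and the retry count and timeout are placeholders; this is a
 * sketch only, not code from the scsipi(9) tree.
 */
static struct scsipi_xfer *
my_build_xs(struct scsipi_periph *my_periph, struct scsipi_generic *my_cmd,
    int cmdlen, int flags)
{

	/*
	 * With the channel mutex already held we must not sleep, so use
	 * the _locked variant (it forces XS_CTL_NOSLEEP).  Otherwise the
	 * _unlocked variant may sleep waiting for a free scsipi_xfer.
	 */
	if (mutex_owned(chan_mtx(my_periph->periph_channel)))
		return scsipi_make_xs_locked(my_periph, my_cmd, cmdlen,
		    NULL, 0, 2, 10000, NULL, flags);
	return scsipi_make_xs_unlocked(my_periph, my_cmd, cmdlen,
	    NULL, 0, 2, 10000, NULL, flags);
}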
/* $NetBSD: uvm_vnode.c,v 1.121 2024/04/05 13:05:41 riastradh Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993 * The Regents of the University of California. * Copyright (c) 1990 University of Utah. * * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode_pager.c 8.8 (Berkeley) 2/13/94 * from: Id: uvm_vnode.c,v 1.1.2.26 1998/02/02 20:38:07 chuck Exp */ /* * uvm_vnode.c: the vnode pager. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_vnode.c,v 1.121 2024/04/05 13:05:41 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_uvmhist.h" #endif #include <sys/atomic.h> #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/disklabel.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/conf.h> #include <sys/pool.h> #include <sys/mount.h> #include <miscfs/specfs/specdev.h> #include <uvm/uvm.h> #include <uvm/uvm_readahead.h> #include <uvm/uvm_page_array.h> #ifdef UVMHIST UVMHIST_DEFINE(ubchist); #endif /* * functions */ static void uvn_alloc_ractx(struct uvm_object *); static void uvn_detach(struct uvm_object *); static int uvn_get(struct uvm_object *, voff_t, struct vm_page **, int *, int, vm_prot_t, int, int); static void uvn_markdirty(struct uvm_object *); static int uvn_put(struct uvm_object *, voff_t, voff_t, int); static void uvn_reference(struct uvm_object *); static int uvn_findpage(struct uvm_object *, voff_t, struct vm_page **, unsigned int, struct uvm_page_array *a, unsigned int); /* * master pager structure */ const struct uvm_pagerops uvm_vnodeops = { .pgo_reference = uvn_reference, .pgo_detach = uvn_detach, .pgo_get = uvn_get, .pgo_put = uvn_put, .pgo_markdirty = uvn_markdirty, }; /* * the ops! */ /* * uvn_reference * * duplicate a reference to a VM object. Note that the reference * count must already be at least one (the passed in reference) so * there is no chance of the uvn being killed or locked out here. * * => caller must call with object unlocked. * => caller must be using the same accessprot as was used at attach time */ static void uvn_reference(struct uvm_object *uobj) { vref((struct vnode *)uobj); } /* * uvn_detach * * remove a reference to a VM object. * * => caller must call with object unlocked and map locked. */ static void uvn_detach(struct uvm_object *uobj) { vrele((struct vnode *)uobj); } /* * uvn_put: flush page data to backing store. * * => object must be locked on entry! VOP_PUTPAGES must unlock it. * => flags: PGO_SYNCIO -- use sync. I/O */ static int uvn_put(struct uvm_object *uobj, voff_t offlo, voff_t offhi, int flags) { struct vnode *vp = (struct vnode *)uobj; int error; KASSERT(rw_write_held(uobj->vmobjlock)); error = VOP_PUTPAGES(vp, offlo, offhi, flags); return error; } /* * uvn_get: get pages (synchronously) from backing store * * => prefer map unlocked (not required) * => object must be locked! we will _unlock_ it before starting any I/O. 
* => flags: PGO_LOCKED: fault data structures are locked * => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx] * => NOTE: caller must check for released pages!! */ static int uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps /* IN/OUT */, int *npagesp /* IN (OUT if PGO_LOCKED)*/, int centeridx, vm_prot_t access_type, int advice, int flags) { struct vnode *vp = (struct vnode *)uobj; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(ubchist, "vp %#jx off %#jx", (uintptr_t)vp, offset, 0, 0); if (vp->v_type == VREG && (access_type & VM_PROT_WRITE) == 0 && (flags & PGO_LOCKED) == 0 && vp->v_tag != VT_TMPFS) { uvn_alloc_ractx(uobj); uvm_ra_request(vp->v_ractx, advice, uobj, offset, *npagesp << PAGE_SHIFT); } error = VOP_GETPAGES(vp, offset, pps, npagesp, centeridx, access_type, advice, flags); if (flags & PGO_LOCKED) KASSERT(rw_lock_held(uobj->vmobjlock)); return error; } /* * uvn_markdirty: called when the object gains first dirty page * * => uobj must be write locked. */ static void uvn_markdirty(struct uvm_object *uobj) { struct vnode *vp = (struct vnode *)uobj; KASSERT(rw_write_held(uobj->vmobjlock)); mutex_enter(vp->v_interlock); if ((vp->v_iflag & VI_ONWORKLST) == 0) { vn_syncer_add_to_worklist(vp, filedelay); } mutex_exit(vp->v_interlock); } /* * uvn_findpages: * return the page for the uobj and offset requested, allocating if needed. * => uobj must be locked. * => returned pages will be BUSY. */ int uvn_findpages(struct uvm_object *uobj, voff_t offset, unsigned int *npagesp, struct vm_page **pgs, struct uvm_page_array *a, unsigned int flags) { unsigned int count, found, npages; int i, rv; struct uvm_page_array a_store; if (a == NULL) { /* * XXX fragile API * note that the array can be the one supplied by the caller of * uvn_findpages. in that case, fillflags used by the caller * might not match strictly with ours. * in particular, the caller might have filled the array * without DENSE but passed us UFP_DIRTYONLY (thus DENSE). */ const unsigned int fillflags = ((flags & UFP_BACKWARD) ? UVM_PAGE_ARRAY_FILL_BACKWARD : 0) | ((flags & UFP_DIRTYONLY) ? (UVM_PAGE_ARRAY_FILL_DIRTY|UVM_PAGE_ARRAY_FILL_DENSE) : 0); a = &a_store; uvm_page_array_init(a, uobj, fillflags); } count = found = 0; npages = *npagesp; if (flags & UFP_BACKWARD) { for (i = npages - 1; i >= 0; i--, offset -= PAGE_SIZE) { rv = uvn_findpage(uobj, offset, &pgs[i], flags, a, i + 1); if (rv == 0) { if (flags & UFP_DIRTYONLY) break; } else found++; count++; } } else { for (i = 0; i < npages; i++, offset += PAGE_SIZE) { rv = uvn_findpage(uobj, offset, &pgs[i], flags, a, npages - i); if (rv == 0) { if (flags & UFP_DIRTYONLY) break; } else found++; count++; } } if (a == &a_store) { uvm_page_array_fini(a); } *npagesp = count; return (found); } /* * uvn_findpage: find a single page * * if a suitable page was found, put it in *pgp and return 1. * otherwise return 0. */ static int uvn_findpage(struct uvm_object *uobj, voff_t offset, struct vm_page **pgp, unsigned int flags, struct uvm_page_array *a, unsigned int nleft) { struct vm_page *pg; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(ubchist, "vp %#jx off %#jx", (uintptr_t)uobj, offset, 0, 0); /* * NOBUSY must come with NOWAIT and NOALLOC. if NOBUSY is * specified, this may be called with a reader lock. 
*/ KASSERT(rw_lock_held(uobj->vmobjlock)); KASSERT((flags & UFP_NOBUSY) == 0 || (flags & UFP_NOWAIT) != 0); KASSERT((flags & UFP_NOBUSY) == 0 || (flags & UFP_NOALLOC) != 0); KASSERT((flags & UFP_NOBUSY) != 0 || rw_write_held(uobj->vmobjlock)); if (*pgp != NULL) { UVMHIST_LOG(ubchist, "dontcare", 0,0,0,0); goto skip_offset; } for (;;) { /* * look for an existing page. */ pg = uvm_page_array_fill_and_peek(a, offset, nleft); if (pg != NULL && pg->offset != offset) { struct vm_page __diagused *tpg; KASSERT( ((a->ar_flags & UVM_PAGE_ARRAY_FILL_BACKWARD) != 0) == (pg->offset < offset)); KASSERT((tpg = uvm_pagelookup(uobj, offset)) == NULL || ((a->ar_flags & UVM_PAGE_ARRAY_FILL_DIRTY) != 0 && !uvm_obj_page_dirty_p(tpg))); pg = NULL; if ((a->ar_flags & UVM_PAGE_ARRAY_FILL_DENSE) != 0) { UVMHIST_LOG(ubchist, "dense", 0,0,0,0); return 0; } } /* nope? allocate one now */ if (pg == NULL) { if (flags & UFP_NOALLOC) { UVMHIST_LOG(ubchist, "noalloc", 0,0,0,0); return 0; } pg = uvm_pagealloc(uobj, offset, NULL, UVM_FLAG_COLORMATCH); if (pg == NULL) { if (flags & UFP_NOWAIT) { UVMHIST_LOG(ubchist, "nowait",0,0,0,0); return 0; } rw_exit(uobj->vmobjlock); uvm_wait("uvnfp1"); uvm_page_array_clear(a); rw_enter(uobj->vmobjlock, RW_WRITER); continue; } UVMHIST_LOG(ubchist, "alloced %#jx (color %ju)", (uintptr_t)pg, VM_PGCOLOR(pg), 0, 0); KASSERTMSG(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN, "page %p not clean", pg); break; } else if (flags & UFP_NOCACHE) { UVMHIST_LOG(ubchist, "nocache",0,0,0,0); goto skip; } /* page is there, see if we need to wait on it */ if ((pg->flags & PG_BUSY) != 0) { if (flags & UFP_NOWAIT) { UVMHIST_LOG(ubchist, "nowait",0,0,0,0); goto skip; } UVMHIST_LOG(ubchist, "wait %#jx (color %ju)", (uintptr_t)pg, VM_PGCOLOR(pg), 0, 0); uvm_pagewait(pg, uobj->vmobjlock, "uvnfp2"); uvm_page_array_clear(a); rw_enter(uobj->vmobjlock, RW_WRITER); continue; } /* skip PG_RDONLY pages if requested */ if ((flags & UFP_NORDONLY) && (pg->flags & PG_RDONLY)) { UVMHIST_LOG(ubchist, "nordonly",0,0,0,0); goto skip; } /* stop on clean pages if requested */ if (flags & UFP_DIRTYONLY) { const bool dirty = uvm_pagecheckdirty(pg, false); if (!dirty) { UVMHIST_LOG(ubchist, "dirtonly", 0,0,0,0); return 0; } } /* mark the page BUSY and we're done. */ if ((flags & UFP_NOBUSY) == 0) { pg->flags |= PG_BUSY; UVM_PAGE_OWN(pg, "uvn_findpage"); } UVMHIST_LOG(ubchist, "found %#jx (color %ju)", (uintptr_t)pg, VM_PGCOLOR(pg), 0, 0); uvm_page_array_advance(a); break; } *pgp = pg; return 1; skip_offset: /* * skip this offset */ pg = uvm_page_array_peek(a); if (pg != NULL) { if (pg->offset == offset) { uvm_page_array_advance(a); } else { KASSERT((a->ar_flags & UVM_PAGE_ARRAY_FILL_DENSE) == 0); } } return 0; skip: /* * skip this page */ KASSERT(pg != NULL); uvm_page_array_advance(a); return 0; } /* * uvm_vnp_setsize: grow or shrink a vnode uobj * * grow => just update size value * shrink => toss un-needed pages * * => we assume that the caller has a reference of some sort to the * vnode in question so that it will not be yanked out from under * us. */ void uvm_vnp_setsize(struct vnode *vp, voff_t newsize) { struct uvm_object *uobj = &vp->v_uobj; voff_t pgend = round_page(newsize); voff_t oldsize; UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); rw_enter(uobj->vmobjlock, RW_WRITER); UVMHIST_LOG(ubchist, "vp %#jx old %#jx new %#jx", (uintptr_t)vp, vp->v_size, newsize, 0); /* * now check if the size has changed: if we shrink we had better * toss some pages... 
*/ KASSERT(newsize != VSIZENOTSET); KASSERT(newsize >= 0); KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p" " v_size=0x%llx v_writesize=0x%llx", vp, (unsigned long long)vp->v_size, (unsigned long long)vp->v_writesize); KASSERTMSG((vp->v_size == vp->v_writesize || newsize == vp->v_writesize || newsize <= vp->v_size), "vp=%p v_size=0x%llx v_writesize=0x%llx newsize=0x%llx", vp, (unsigned long long)vp->v_size, (unsigned long long)vp->v_writesize, (unsigned long long)newsize); oldsize = vp->v_writesize; /* * check whether size shrinks * if old size hasn't been set, there are no pages to drop * if there was an integer overflow in pgend, then this is no shrink */ if (oldsize > pgend && oldsize != VSIZENOTSET && pgend >= 0) { (void) uvn_put(uobj, pgend, 0, PGO_FREE | PGO_SYNCIO); rw_enter(uobj->vmobjlock, RW_WRITER); } mutex_enter(vp->v_interlock); vp->v_size = vp->v_writesize = newsize; mutex_exit(vp->v_interlock); rw_exit(uobj->vmobjlock); } void uvm_vnp_setwritesize(struct vnode *vp, voff_t newsize) { rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); KASSERT(newsize != VSIZENOTSET); KASSERT(newsize >= 0); KASSERT(vp->v_size != VSIZENOTSET); KASSERT(vp->v_writesize != VSIZENOTSET); KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p" " v_size=0x%llx v_writesize=0x%llx newsize=0x%llx", vp, (unsigned long long)vp->v_size, (unsigned long long)vp->v_writesize, (unsigned long long)newsize); KASSERTMSG(vp->v_size <= newsize, "vp=%p" " v_size=0x%llx v_writesize=0x%llx newsize=0x%llx", vp, (unsigned long long)vp->v_size, (unsigned long long)vp->v_writesize, (unsigned long long)newsize); mutex_enter(vp->v_interlock); vp->v_writesize = newsize; mutex_exit(vp->v_interlock); rw_exit(vp->v_uobj.vmobjlock); } bool uvn_text_p(struct uvm_object *uobj) { struct vnode *vp = (struct vnode *)uobj; int iflag; /* * v_interlock is not held here, but VI_EXECMAP is only ever changed * with the vmobjlock held too. */ iflag = atomic_load_relaxed(&vp->v_iflag); return (iflag & VI_EXECMAP) != 0; } static void uvn_alloc_ractx(struct uvm_object *uobj) { struct vnode *vp = (struct vnode *)uobj; struct uvm_ractx *ra = NULL; KASSERT(rw_write_held(uobj->vmobjlock)); if (vp->v_type != VREG) { return; } if (vp->v_ractx != NULL) { return; } if (vp->v_ractx == NULL) { rw_exit(uobj->vmobjlock); ra = uvm_ra_allocctx(); rw_enter(uobj->vmobjlock, RW_WRITER); if (ra != NULL && vp->v_ractx == NULL) { vp->v_ractx = ra; ra = NULL; } } if (ra != NULL) { uvm_ra_freectx(ra); } }
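/*
 * [Editorial example, not part of uvm_vnode.c] The unlock/allocate/
 * relock/re-check pattern used by uvn_alloc_ractx() above, restated
 * with pthreads so it can be compiled and read outside the kernel.
 * "struct obj", "obj_alloc_slot" and the 64-byte allocation are
 * made-up placeholders for this sketch.
 */
#include <pthread.h>
#include <stdlib.h>

struct obj {
	pthread_mutex_t lock;	/* plays the role of uobj->vmobjlock */
	void *slot;		/* lazily allocated, like vp->v_ractx */
};

static void
obj_alloc_slot(struct obj *o)
{
	void *ctx = NULL;

	pthread_mutex_lock(&o->lock);
	if (o->slot == NULL) {
		/* Drop the lock across the potentially sleeping allocation. */
		pthread_mutex_unlock(&o->lock);
		ctx = malloc(64);
		pthread_mutex_lock(&o->lock);
		/* Another thread may have raced us; install only if still empty. */
		if (ctx != NULL && o->slot == NULL) {
			o->slot = ctx;
			ctx = NULL;
		}
	}
	pthread_mutex_unlock(&o->lock);
	if (ctx != NULL)
		free(ctx);	/* lost the race: discard our allocation */
}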
/* $NetBSD: pci_usrreq.c,v 1.31 2021/09/05 03:47:24 mrg Exp $ */ /* * Copyright 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Jason R. Thorpe for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * User -> kernel interface for PCI bus access.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: pci_usrreq.c,v 1.31 2021/09/05 03:47:24 mrg Exp $"); #ifdef _KERNEL_OPT #include "opt_pci.h" #endif #include <sys/param.h> #include <sys/conf.h> #include <sys/device.h> #include <sys/ioctl.h> #include <sys/proc.h> #include <sys/systm.h> #include <sys/errno.h> #include <sys/fcntl.h> #include <sys/kauth.h> #include <dev/pci/pcireg.h> #include <dev/pci/pcivar.h> #include <dev/pci/pciio.h> static int pciopen(dev_t dev, int flags, int mode, struct lwp *l) { device_t dv; dv = device_lookup(&pci_cd, minor(dev)); if (dv == NULL) return ENXIO; return 0; } static int pciioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { struct pci_softc *sc = device_lookup_private(&pci_cd, minor(dev)); struct pci_child *child; struct pciio_bdf_cfgreg *bdfr; struct pciio_businfo *binfo; struct pciio_drvname *dname; struct pciio_drvnameonbus *dnameonbus; pcitag_t tag; switch (cmd) { case PCI_IOC_BDF_CFGREAD: case PCI_IOC_BDF_CFGWRITE: bdfr = data; if (bdfr->bus > 255 || bdfr->device >= sc->sc_maxndevs || bdfr->function > 7 || ISSET(bdfr->cfgreg.reg, 3)) return EINVAL; tag = pci_make_tag(sc->sc_pc, bdfr->bus, bdfr->device, bdfr->function); if (cmd == PCI_IOC_BDF_CFGREAD) { bdfr->cfgreg.val = pci_conf_read(sc->sc_pc, tag, bdfr->cfgreg.reg); } else { if ((flag & FWRITE) == 0) return EBADF; pci_conf_write(sc->sc_pc, tag, bdfr->cfgreg.reg, bdfr->cfgreg.val); } return 0; case PCI_IOC_BUSINFO: binfo = data; binfo->busno = sc->sc_bus; binfo->maxdevs = sc->sc_maxndevs; return 0; case PCI_IOC_DRVNAME: dname = data; if (dname->device >= sc->sc_maxndevs || dname->function > 7) return EINVAL; child = &sc->PCI_SC_DEVICESC(dname->device, dname->function); if (!child->c_dev) return ENXIO; strlcpy(dname->name, device_xname(child->c_dev), sizeof dname->name); return 0; case PCI_IOC_DRVNAMEONBUS: dnameonbus = data; int i; for (i = 0; i < pci_cd.cd_ndevs; i++) { sc = device_lookup_private(&pci_cd, i); if (sc == NULL) continue; if (sc->sc_bus == dnameonbus->bus) break; /* found the right bus */ } if (i == pci_cd.cd_ndevs || sc == NULL) return ENXIO; if (dnameonbus->device >= sc->sc_maxndevs || dnameonbus->function > 7) return EINVAL; child = &sc->PCI_SC_DEVICESC(dnameonbus->device, dnameonbus->function); if (!child->c_dev) return ENXIO; strlcpy(dnameonbus->name, device_xname(child->c_dev), sizeof dnameonbus->name); return 0; default: return ENOTTY; } } static paddr_t pcimmap(dev_t dev, off_t offset, int prot) { struct pci_softc *sc = device_lookup_private(&pci_cd, minor(dev)); struct pci_child *c; struct pci_range *r; int flags = 0; int device, range; if (kauth_authorize_machdep(kauth_cred_get(), KAUTH_MACHDEP_UNMANAGEDMEM, NULL, NULL, NULL, NULL) != 0) { return -1; } /* * Since we allow mapping of the entire bus, we * take the offset to be the address on the bus, * and pass 0 as the offset into that range. * * XXX Need a way to deal with linear/etc. * * XXX we rely on MD mmap() methods to enforce limits since these * are hidden in *_tag_t structs if they exist at all */ #ifdef PCI_MAGIC_IO_RANGE /* * first, check if someone's trying to map the IO range * XXX this assumes 64kB IO space even though some machines can have * significantly more than that - macppc's bandit host bridge allows * 8MB IO space and sparc64 may have the entire 4GB available. 
The * firmware on both tries to use the lower 64kB first though and * exausting it is pretty difficult so we should be safe */ if ((offset >= PCI_MAGIC_IO_RANGE) && (offset < (PCI_MAGIC_IO_RANGE + 0x10000))) { return bus_space_mmap(sc->sc_iot, offset - PCI_MAGIC_IO_RANGE, 0, prot, 0); } #endif /* PCI_MAGIC_IO_RANGE */ for (device = 0; device < __arraycount(sc->sc_devices); device++) { c = &sc->sc_devices[device]; if (c->c_dev == NULL) continue; for (range = 0; range < __arraycount(c->c_range); range++) { r = &c->c_range[range]; if (r->r_size == 0) break; if (offset >= r->r_offset && offset < r->r_offset + r->r_size) { flags = r->r_flags; break; } } } return bus_space_mmap(sc->sc_memt, offset, 0, prot, flags); } const struct cdevsw pci_cdevsw = { .d_open = pciopen, .d_close = nullclose, .d_read = noread, .d_write = nowrite, .d_ioctl = pciioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = pcimmap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER }; /* * pci_devioctl: * * PCI ioctls that can be performed on devices directly. */ int pci_devioctl(pci_chipset_tag_t pc, pcitag_t tag, u_long cmd, void *data, int flag, struct lwp *l) { struct pciio_cfgreg *r = (void *) data; switch (cmd) { case PCI_IOC_CFGREAD: r->val = pci_conf_read(pc, tag, r->reg); break; case PCI_IOC_CFGWRITE: if ((flag & FWRITE) == 0) return EBADF; pci_conf_write(pc, tag, r->reg, r->val); break; default: return EPASSTHROUGH; } return 0; }
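/*
 * [Editorial example, not part of pci_usrreq.c] A small userland client
 * for the ioctls handled by pciioctl() above.  The device path /dev/pci0
 * and the use of config register 0 (the vendor/device ID word) are
 * assumptions made for this sketch.
 */
#include <sys/types.h>
#include <sys/ioctl.h>

#include <dev/pci/pciio.h>

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct pciio_businfo binfo = { 0 };
	struct pciio_bdf_cfgreg bdfr;
	int fd;

	if ((fd = open("/dev/pci0", O_RDONLY)) == -1) {
		perror("open");
		return 1;
	}

	/* PCI_IOC_BUSINFO: bus number and number of device slots. */
	if (ioctl(fd, PCI_IOC_BUSINFO, &binfo) == -1)
		perror("PCI_IOC_BUSINFO");
	else
		printf("bus %u, %u device slots\n", binfo.busno,
		    binfo.maxdevs);

	/* PCI_IOC_BDF_CFGREAD: read config register 0 of device 0, function 0. */
	bdfr.bus = binfo.busno;
	bdfr.device = 0;
	bdfr.function = 0;
	bdfr.cfgreg.reg = 0;	/* vendor/device ID register */
	if (ioctl(fd, PCI_IOC_BDF_CFGREAD, &bdfr) == -1)
		perror("PCI_IOC_BDF_CFGREAD");
	else
		printf("id register: 0x%08x\n", (unsigned)bdfr.cfgreg.val);

	close(fd);
	return 0;
}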
/* $NetBSD: dead_vfsops.c,v 1.13 2022/10/26 23:39:43 riastradh Exp $ */ /*- * Copyright (c) 2014 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Juergen Hannken-Illjes. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: dead_vfsops.c,v 1.13 2022/10/26 23:39:43 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/vnode.h> #include <sys/mount.h> #include <miscfs/deadfs/deadfs.h> #include <miscfs/specfs/specdev.h> VFS_PROTOS(dead); static void dead_panic(void); static const struct vnodeopv_desc * const dead_vnodeopv_descs[] = { &dead_vnodeop_opv_desc, NULL }; struct mount *dead_rootmount; struct vfsops dead_vfsops = { .vfs_name = "dead", .vfs_min_mount_data = 0, .vfs_mount = (void *)dead_panic, .vfs_start = (void *)dead_panic, .vfs_unmount = (void *)dead_panic, .vfs_root = (void *)dead_panic, .vfs_quotactl = (void *)dead_panic, .vfs_statvfs = (void *)eopnotsupp, .vfs_sync = (void *)dead_panic, .vfs_vget = (void *)dead_panic, .vfs_loadvnode = (void *)dead_panic, .vfs_newvnode = dead_newvnode, .vfs_fhtovp = (void *)dead_panic, .vfs_vptofh = (void *)eopnotsupp, .vfs_init = (void *)dead_panic, .vfs_reinit = (void *)dead_panic, .vfs_done = (void *)dead_panic, .vfs_mountroot = (void *)dead_panic, .vfs_snapshot = (void *)dead_panic, .vfs_extattrctl = (void *)dead_panic, .vfs_suspendctl = (void *)dead_panic, .vfs_renamelock_enter = (void *)dead_panic, .vfs_renamelock_exit = (void *)dead_panic, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = dead_vnodeopv_descs }; static void dead_panic(void) { panic("dead fs operation used"); } /* * Create a new anonymous device vnode.
*/ int dead_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp, struct vattr *vap, kauth_cred_t cred, void *extra, size_t *key_len, const void **new_key) { KASSERT(mp == dead_rootmount); KASSERT(dvp == NULL); KASSERT(vap->va_type == VCHR || vap->va_type == VBLK); KASSERT(vap->va_rdev != VNOVAL); vp->v_tag = VT_NON; vp->v_type = vap->va_type; vp->v_op = spec_vnodeop_p; vp->v_vflag |= VV_MPSAFE; uvm_vnp_setsize(vp, 0); spec_node_init(vp, vap->va_rdev); *key_len = 0; *new_key = NULL; return 0; }
/* $NetBSD: uvm_physseg.c,v 1.20 2024/01/13 09:44:42 tnn Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. * * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_page.h 7.3 (Berkeley) 4/21/91 * from: Id: uvm_page.h,v 1.1.2.6 1998/02/04 02:31:42 chuck Exp * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Consolidated API from uvm_page.c and others. * Consolidated and designed by Cherry G. Mathew <cherry@zyx.in> * rbtree(3) backing implementation by: * Santhosh N. Raju <santhosh.raju@gmail.com> */ #ifdef _KERNEL_OPT #include "opt_uvm.h" #endif #include <sys/param.h> #include <sys/types.h> #include <sys/extent.h> #include <sys/kmem.h> #include <uvm/uvm.h> #include <uvm/uvm_page.h> #include <uvm/uvm_param.h> #include <uvm/uvm_pdpolicy.h> #include <uvm/uvm_physseg.h> /* * uvm_physseg: describes one segment of physical memory */ struct uvm_physseg { /* used during RB tree lookup for PHYS_TO_VM_PAGE(). */ #if defined(UVM_HOTPLUG) struct rb_node rb_node; /* tree information */ #endif paddr_t start; /* PF# of first page in segment */ paddr_t end; /* (PF# of last page in segment) + 1 */ struct vm_page *pgs; /* vm_page structures (from start) */ /* less performance sensitive fields. */ paddr_t avail_start; /* PF# of first free page in segment */ paddr_t avail_end; /* (PF# of last free page in segment) +1 */ struct extent *ext; /* extent(9) structure to manage pgs[] */ int free_list; /* which free list they belong on */ u_long start_hint; /* start looking for free pages here */ #ifdef __HAVE_PMAP_PHYSSEG struct pmap_physseg pmseg; /* pmap specific (MD) data */ #endif }; /* * These functions are reserved for uvm(9) internal use and are not * exported in the header file uvm_physseg.h * * Thus they are redefined here. 
*/ void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *); void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t); /* returns a pgs array */ struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t); #if defined(UVM_HOTPLUG) /* rbtree impementation */ #define HANDLE_TO_PHYSSEG_NODE(h) ((struct uvm_physseg *)(h)) #define PHYSSEG_NODE_TO_HANDLE(u) ((uvm_physseg_t)(u)) struct uvm_physseg_graph { struct rb_tree rb_tree; /* Tree for entries */ int nentries; /* Number of entries */ } __aligned(COHERENCY_UNIT); static struct uvm_physseg_graph uvm_physseg_graph __read_mostly; /* * Note on kmem(9) allocator usage: * We take the conservative approach that plug/unplug are allowed to * fail in high memory stress situations. * * We want to avoid re-entrant situations in which one plug/unplug * operation is waiting on a previous one to complete, since this * makes the design more complicated than necessary. * * We may review this and change its behaviour, once the use cases * become more obvious. */ /* * Special alloc()/free() functions for boot time support: * We assume that alloc() at boot time is only for new 'vm_physseg's * This allows us to use a static array for memory allocation at boot * time. Thus we avoid using kmem(9) which is not ready at this point * in boot. * * After kmem(9) is ready, we use it. We currently discard any free()s * to this static array, since the size is small enough to be a * trivial waste on all architectures we run on. */ static size_t nseg = 0; static struct uvm_physseg uvm_physseg[VM_PHYSSEG_MAX]; static void * uvm_physseg_alloc(size_t sz) { /* * During boot time, we only support allocating vm_physseg * entries from the static array. * We need to assert for this. */ if (__predict_false(uvm.page_init_done == false)) { if (sz % sizeof(struct uvm_physseg)) panic("%s: tried to alloc size other than multiple" " of struct uvm_physseg at boot\n", __func__); size_t n = sz / sizeof(struct uvm_physseg); nseg += n; KASSERT(nseg > 0); KASSERT(nseg <= VM_PHYSSEG_MAX); return &uvm_physseg[nseg - n]; } return kmem_zalloc(sz, KM_NOSLEEP); } static void uvm_physseg_free(void *p, size_t sz) { /* * This is a bit tricky. We do allow simulation of free() * during boot (for eg: when MD code is "steal"ing memory, * and the segment has been exhausted (and thus needs to be * free() - ed. * free() also complicates things because we leak the * free(). Therefore calling code can't assume that free()-ed * memory is available for alloc() again, at boot time. * * Thus we can't explicitly disallow free()s during * boot time. However, the same restriction for alloc() * applies to free(). We only allow uvm_physseg related free()s * via this function during boot time. 
*/ if (__predict_false(uvm.page_init_done == false)) { if (sz % sizeof(struct uvm_physseg)) panic("%s: tried to free size other than struct uvm_physseg" " at boot\n", __func__); } /* * Could have been in a single if(){} block - split for * clarity */ if ((struct uvm_physseg *)p >= uvm_physseg && (struct uvm_physseg *)p < (uvm_physseg + VM_PHYSSEG_MAX)) { if (sz % sizeof(struct uvm_physseg)) panic("%s: tried to free() other than struct uvm_physseg" " from static array\n", __func__); if ((sz / sizeof(struct uvm_physseg)) >= VM_PHYSSEG_MAX) panic("%s: tried to free() the entire static array!", __func__); return; /* Nothing to free */ } kmem_free(p, sz); } /* XXX: Multi page size */ bool uvm_physseg_plug(paddr_t pfn, size_t pages, uvm_physseg_t *psp) { int preload; size_t slabpages; struct uvm_physseg *ps, *current_ps = NULL; struct vm_page *slab = NULL, *pgs = NULL; #ifdef DEBUG paddr_t off; uvm_physseg_t upm; upm = uvm_physseg_find(pfn, &off); ps = HANDLE_TO_PHYSSEG_NODE(upm); if (ps != NULL) /* XXX; do we allow "update" plugs ? */ return false; #endif /* * do we have room? */ ps = uvm_physseg_alloc(sizeof (struct uvm_physseg)); if (ps == NULL) { printf("uvm_page_physload: unable to load physical memory " "segment\n"); printf("\t%d segments allocated, ignoring 0x%"PRIxPADDR" -> 0x%"PRIxPADDR"\n", VM_PHYSSEG_MAX, pfn, pfn + pages + 1); printf("\tincrease VM_PHYSSEG_MAX\n"); return false; } /* span init */ ps->start = pfn; ps->end = pfn + pages; /* * XXX: Ugly hack because uvmexp.npages accounts for only * those pages in the segment included below as well - this * should be legacy and removed. */ ps->avail_start = ps->start; ps->avail_end = ps->end; /* * check to see if this is a "preload" (i.e. uvm_page_init hasn't been * called yet, so kmem is not available). */ preload = 1; /* We are going to assume it is a preload */ RB_TREE_FOREACH(current_ps, &(uvm_physseg_graph.rb_tree)) { /* If there are non NULL pages then we are not in a preload */ if (current_ps->pgs != NULL) { preload = 0; /* Try to scavenge from earlier unplug()s. */ pgs = uvm_physseg_seg_alloc_from_slab(current_ps, pages); if (pgs != NULL) { break; } } } /* * if VM is already running, attempt to kmem_alloc vm_page structures */ if (!preload) { if (pgs == NULL) { /* Brand new */ /* Iteratively try alloc down from uvmexp.npages */ for (slabpages = (size_t) uvmexp.npages; slabpages >= pages; slabpages--) { slab = kmem_zalloc(sizeof *pgs * (long unsigned int)slabpages, KM_NOSLEEP); if (slab != NULL) break; } if (slab == NULL) { uvm_physseg_free(ps, sizeof(struct uvm_physseg)); return false; } uvm_physseg_seg_chomp_slab(ps, slab, (size_t) slabpages); /* We allocate enough for this plug */ pgs = uvm_physseg_seg_alloc_from_slab(ps, pages); if (pgs == NULL) { printf("unable to uvm_physseg_seg_alloc_from_slab() from backend\n"); return false; } } else { /* Reuse scavenged extent */ ps->ext = current_ps->ext; } physmem += pages; uvmpdpol_reinit(); } else { /* Boot time - see uvm_page.c:uvm_page_init() */ pgs = NULL; ps->pgs = pgs; } /* * now insert us in the proper place in uvm_physseg_graph.rb_tree */ current_ps = rb_tree_insert_node(&(uvm_physseg_graph.rb_tree), ps); if (current_ps != ps) { panic("uvm_page_physload: Duplicate address range detected!"); } uvm_physseg_graph.nentries++; /* * uvm_pagefree() requires the PHYS_TO_VM_PAGE(pgs[i]) on the * newly allocated pgs[] to return the correct value. This is * a bit of a chicken and egg problem, since it needs * uvm_physseg_find() to succeed. 
For this, the node needs to * be inserted *before* uvm_physseg_init_seg() happens. * * During boot, this happens anyway, since * uvm_physseg_init_seg() is called later on and separately * from uvm_page.c:uvm_page_init(). * In the case of hotplug we need to ensure this. */ if (__predict_true(!preload)) uvm_physseg_init_seg(ps, pgs); if (psp != NULL) *psp = ps; return true; } static int uvm_physseg_compare_nodes(void *ctx, const void *nnode1, const void *nnode2) { const struct uvm_physseg *enode1 = nnode1; const struct uvm_physseg *enode2 = nnode2; KASSERT(enode1->start < enode2->start || enode1->start >= enode2->end); KASSERT(enode2->start < enode1->start || enode2->start >= enode1->end); if (enode1->start < enode2->start) return -1; if (enode1->start >= enode2->end) return 1; return 0; } static int uvm_physseg_compare_key(void *ctx, const void *nnode, const void *pkey) { const struct uvm_physseg *enode = nnode; const paddr_t pa = *(const paddr_t *) pkey; if(enode->start <= pa && pa < enode->end) return 0; if (enode->start < pa) return -1; if (enode->end > pa) return 1; return 0; } static const rb_tree_ops_t uvm_physseg_tree_ops = { .rbto_compare_nodes = uvm_physseg_compare_nodes, .rbto_compare_key = uvm_physseg_compare_key, .rbto_node_offset = offsetof(struct uvm_physseg, rb_node), .rbto_context = NULL }; /* * uvm_physseg_init: init the physmem * * => physmem unit should not be in use at this point */ void uvm_physseg_init(void) { rb_tree_init(&(uvm_physseg_graph.rb_tree), &uvm_physseg_tree_ops); uvm_physseg_graph.nentries = 0; } uvm_physseg_t uvm_physseg_get_next(uvm_physseg_t upm) { /* next of invalid is invalid, not fatal */ if (uvm_physseg_valid_p(upm) == false) return UVM_PHYSSEG_TYPE_INVALID; return (uvm_physseg_t) rb_tree_iterate(&(uvm_physseg_graph.rb_tree), upm, RB_DIR_RIGHT); } uvm_physseg_t uvm_physseg_get_prev(uvm_physseg_t upm) { /* prev of invalid is invalid, not fatal */ if (uvm_physseg_valid_p(upm) == false) return UVM_PHYSSEG_TYPE_INVALID; return (uvm_physseg_t) rb_tree_iterate(&(uvm_physseg_graph.rb_tree), upm, RB_DIR_LEFT); } uvm_physseg_t uvm_physseg_get_last(void) { return (uvm_physseg_t) RB_TREE_MAX(&(uvm_physseg_graph.rb_tree)); } uvm_physseg_t uvm_physseg_get_first(void) { return (uvm_physseg_t) RB_TREE_MIN(&(uvm_physseg_graph.rb_tree)); } paddr_t uvm_physseg_get_highest_frame(void) { struct uvm_physseg *ps = (uvm_physseg_t) RB_TREE_MAX(&(uvm_physseg_graph.rb_tree)); return ps->end - 1; } /* * uvm_page_physunload: unload physical memory and return it to * caller. */ bool uvm_page_physunload(uvm_physseg_t upm, int freelist, paddr_t *paddrp) { struct uvm_physseg *seg; if (__predict_true(uvm.page_init_done == true)) panic("%s: unload attempted after uvm_page_init()\n", __func__); seg = HANDLE_TO_PHYSSEG_NODE(upm); if (seg->free_list != freelist) { return false; } /* * During cold boot, what we're about to unplug hasn't been * put on the uvm freelist, nor has uvmexp.npages been * updated. (This happens in uvm_page.c:uvm_page_init()) * * For hotplug, we assume here that the pages being unloaded * here are completely out of sight of uvm (ie; not on any uvm * lists), and that uvmexp.npages has been suitably * decremented before we're called. * * XXX: will avail_end == start if avail_start < avail_end? 
*/ /* try from front */ if (seg->avail_start == seg->start && seg->avail_start < seg->avail_end) { *paddrp = ctob(seg->avail_start); return uvm_physseg_unplug(seg->avail_start, 1); } /* try from rear */ if (seg->avail_end == seg->end && seg->avail_start < seg->avail_end) { *paddrp = ctob(seg->avail_end - 1); return uvm_physseg_unplug(seg->avail_end - 1, 1); } return false; } bool uvm_page_physunload_force(uvm_physseg_t upm, int freelist, paddr_t *paddrp) { struct uvm_physseg *seg; seg = HANDLE_TO_PHYSSEG_NODE(upm); if (__predict_true(uvm.page_init_done == true)) panic("%s: unload attempted after uvm_page_init()\n", __func__); /* any room in this bank? */ if (seg->avail_start >= seg->avail_end) { return false; /* nope */ } *paddrp = ctob(seg->avail_start); /* Always unplug from front */ return uvm_physseg_unplug(seg->avail_start, 1); } /* * vm_physseg_find: find vm_physseg structure that belongs to a PA */ uvm_physseg_t uvm_physseg_find(paddr_t pframe, psize_t *offp) { struct uvm_physseg * ps = NULL; ps = rb_tree_find_node(&(uvm_physseg_graph.rb_tree), &pframe); if(ps != NULL && offp != NULL) *offp = pframe - ps->start; return ps; } #else /* UVM_HOTPLUG */ /* * physical memory config is stored in vm_physmem. */ #define VM_PHYSMEM_PTR(i) (&vm_physmem[i]) #if VM_PHYSSEG_MAX == 1 #define VM_PHYSMEM_PTR_SWAP(i, j) /* impossible */ #else #define VM_PHYSMEM_PTR_SWAP(i, j) \ do { vm_physmem[(i)] = vm_physmem[(j)]; } while (0) #endif #define HANDLE_TO_PHYSSEG_NODE(h) (VM_PHYSMEM_PTR((int)h)) #define PHYSSEG_NODE_TO_HANDLE(u) ((int)((vsize_t) (u - vm_physmem) / sizeof(struct uvm_physseg))) /* XXXCDC: uvm.physmem */ static struct uvm_physseg vm_physmem[VM_PHYSSEG_MAX] __read_mostly; /* XXXCDC: uvm.nphysseg */ static int vm_nphysseg __read_mostly = 0; #define vm_nphysmem vm_nphysseg void uvm_physseg_init(void) { /* XXX: Provisioning for rb_tree related init(s) */ return; } int uvm_physseg_get_next(uvm_physseg_t lcv) { /* next of invalid is invalid, not fatal */ if (uvm_physseg_valid_p(lcv) == false) return UVM_PHYSSEG_TYPE_INVALID; return (lcv + 1); } int uvm_physseg_get_prev(uvm_physseg_t lcv) { /* prev of invalid is invalid, not fatal */ if (uvm_physseg_valid_p(lcv) == false) return UVM_PHYSSEG_TYPE_INVALID; return (lcv - 1); } int uvm_physseg_get_last(void) { return (vm_nphysseg - 1); } int uvm_physseg_get_first(void) { return 0; } paddr_t uvm_physseg_get_highest_frame(void) { int lcv; paddr_t last = 0; struct uvm_physseg *ps; for (lcv = 0; lcv < vm_nphysseg; lcv++) { ps = VM_PHYSMEM_PTR(lcv); if (last < ps->end) last = ps->end; } return last; } static struct vm_page * uvm_post_preload_check(void) { int preload, lcv; /* * check to see if this is a "preload" (i.e. uvm_page_init hasn't been * called yet, so kmem is not available). */ for (lcv = 0 ; lcv < vm_nphysmem ; lcv++) { if (VM_PHYSMEM_PTR(lcv)->pgs) break; } preload = (lcv == vm_nphysmem); /* * if VM is already running, attempt to kmem_alloc vm_page structures */ if (!preload) { panic("Tried to add RAM after uvm_page_init"); } return NULL; } /* * uvm_page_physunload: unload physical memory and return it to * caller. */ bool uvm_page_physunload(uvm_physseg_t psi, int freelist, paddr_t *paddrp) { int x; struct uvm_physseg *seg; uvm_post_preload_check(); seg = VM_PHYSMEM_PTR(psi); if (seg->free_list != freelist) { return false; } /* try from front */ if (seg->avail_start == seg->start && seg->avail_start < seg->avail_end) { *paddrp = ctob(seg->avail_start); seg->avail_start++; seg->start++; /* nothing left? 
nuke it */ if (seg->avail_start == seg->end) { if (vm_nphysmem == 1) panic("uvm_page_physget: out of memory!"); vm_nphysmem--; for (x = psi ; x < vm_nphysmem ; x++) /* structure copy */ VM_PHYSMEM_PTR_SWAP(x, x + 1); } return (true); } /* try from rear */ if (seg->avail_end == seg->end && seg->avail_start < seg->avail_end) { *paddrp = ctob(seg->avail_end - 1); seg->avail_end--; seg->end--; /* nothing left? nuke it */ if (seg->avail_end == seg->start) { if (vm_nphysmem == 1) panic("uvm_page_physget: out of memory!"); vm_nphysmem--; for (x = psi ; x < vm_nphysmem ; x++) /* structure copy */ VM_PHYSMEM_PTR_SWAP(x, x + 1); } return (true); } return false; } bool uvm_page_physunload_force(uvm_physseg_t psi, int freelist, paddr_t *paddrp) { int x; struct uvm_physseg *seg; uvm_post_preload_check(); seg = VM_PHYSMEM_PTR(psi); /* any room in this bank? */ if (seg->avail_start >= seg->avail_end) { return false; /* nope */ } *paddrp = ctob(seg->avail_start); seg->avail_start++; /* truncate! */ seg->start = seg->avail_start; /* nothing left? nuke it */ if (seg->avail_start == seg->end) { if (vm_nphysmem == 1) panic("uvm_page_physget: out of memory!"); vm_nphysmem--; for (x = psi ; x < vm_nphysmem ; x++) /* structure copy */ VM_PHYSMEM_PTR_SWAP(x, x + 1); } return (true); } bool uvm_physseg_plug(paddr_t pfn, size_t pages, uvm_physseg_t *psp) { int lcv; struct vm_page *pgs; struct uvm_physseg *ps; #ifdef DEBUG paddr_t off; uvm_physseg_t upm; upm = uvm_physseg_find(pfn, &off); if (uvm_physseg_valid_p(upm)) /* XXX; do we allow "update" plugs ? */ return false; #endif paddr_t start = pfn; paddr_t end = pfn + pages; paddr_t avail_start = start; paddr_t avail_end = end; if (uvmexp.pagesize == 0) panic("uvm_page_physload: page size not set!"); /* * do we have room? */ if (vm_nphysmem == VM_PHYSSEG_MAX) { printf("uvm_page_physload: unable to load physical memory " "segment\n"); printf("\t%d segments allocated, ignoring 0x%llx -> 0x%llx\n", VM_PHYSSEG_MAX, (long long)start, (long long)end); printf("\tincrease VM_PHYSSEG_MAX\n"); if (psp != NULL) *psp = UVM_PHYSSEG_TYPE_INVALID_OVERFLOW; return false; } /* * check to see if this is a "preload" (i.e. uvm_page_init hasn't been * called yet, so kmem is not available). */ pgs = uvm_post_preload_check(); /* * now insert us in the proper place in vm_physmem[] */ #if (VM_PHYSSEG_STRAT == VM_PSTRAT_RANDOM) /* random: put it at the end (easy!) */ ps = VM_PHYSMEM_PTR(vm_nphysmem); lcv = vm_nphysmem; #elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH) { int x; /* sort by address for binary search */ for (lcv = 0 ; lcv < vm_nphysmem ; lcv++) if (start < VM_PHYSMEM_PTR(lcv)->start) break; ps = VM_PHYSMEM_PTR(lcv); /* move back other entries, if necessary ... */ for (x = vm_nphysmem ; x > lcv ; x--) /* structure copy */ VM_PHYSMEM_PTR_SWAP(x, x - 1); } #elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) { int x; /* sort by largest segment first */ for (lcv = 0 ; lcv < vm_nphysmem ; lcv++) if ((end - start) > (VM_PHYSMEM_PTR(lcv)->end - VM_PHYSMEM_PTR(lcv)->start)) break; ps = VM_PHYSMEM_PTR(lcv); /* move back other entries, if necessary ... 
*/ for (x = vm_nphysmem ; x > lcv ; x--) /* structure copy */ VM_PHYSMEM_PTR_SWAP(x, x - 1); } #else panic("uvm_page_physload: unknown physseg strategy selected!"); #endif ps->start = start; ps->end = end; ps->avail_start = avail_start; ps->avail_end = avail_end; ps->pgs = pgs; vm_nphysmem++; if (psp != NULL) *psp = lcv; return true; } /* * when VM_PHYSSEG_MAX is 1, we can simplify these functions */ #if VM_PHYSSEG_MAX == 1 static inline int vm_physseg_find_contig(struct uvm_physseg *, int, paddr_t, psize_t *); #elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH) static inline int vm_physseg_find_bsearch(struct uvm_physseg *, int, paddr_t, psize_t *); #else static inline int vm_physseg_find_linear(struct uvm_physseg *, int, paddr_t, psize_t *); #endif /* * vm_physseg_find: find vm_physseg structure that belongs to a PA */ inline int uvm_physseg_find(paddr_t pframe, psize_t *offp) { #if VM_PHYSSEG_MAX == 1 return vm_physseg_find_contig(vm_physmem, vm_nphysseg, pframe, offp); #elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH) return vm_physseg_find_bsearch(vm_physmem, vm_nphysseg, pframe, offp); #else return vm_physseg_find_linear(vm_physmem, vm_nphysseg, pframe, offp); #endif } #if VM_PHYSSEG_MAX == 1 static inline int vm_physseg_find_contig(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp) { /* 'contig' case */ if (pframe >= segs[0].start && pframe < segs[0].end) { if (offp) *offp = pframe - segs[0].start; return(0); } return(-1); } #elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH) static inline int vm_physseg_find_bsearch(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp) { /* binary search for it */ int start, len, guess; /* * if try is too large (thus target is less than try) we reduce * the length to trunc(len/2) [i.e. everything smaller than "try"] * * if the try is too small (thus target is greater than try) then * we set the new start to be (try + 1). this means we need to * reduce the length to (round(len/2) - 1). * * note "adjust" below which takes advantage of the fact that * (round(len/2) - 1) == trunc((len - 1) / 2) * for any value of len we may have */ for (start = 0, len = nsegs ; len != 0 ; len = len / 2) { guess = start + (len / 2); /* try in the middle */ /* start past our try? */ if (pframe >= segs[guess].start) { /* was try correct? */ if (pframe < segs[guess].end) { if (offp) *offp = pframe - segs[guess].start; return guess; /* got it */ } start = guess + 1; /* next time, start here */ len--; /* "adjust" */ } else { /* * pframe before try, just reduce length of * region, done in "for" loop */ } } return(-1); } #else static inline int vm_physseg_find_linear(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp) { /* linear search for it */ int lcv; for (lcv = 0; lcv < nsegs; lcv++) { if (pframe >= segs[lcv].start && pframe < segs[lcv].end) { if (offp) *offp = pframe - segs[lcv].start; return(lcv); /* got it */ } } return(-1); } #endif #endif /* UVM_HOTPLUG */ /* * PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages * back from an I/O mapping (ugh!). used in some MD code as well. it can * be prominent in flamegraphs, so optimise it and try to make it easy for * the compiler by including next to the inline lookup routines. 
*/ struct vm_page * uvm_phys_to_vm_page(paddr_t pa) { #if VM_PHYSSEG_STRAT != VM_PSTRAT_BSEARCH /* 'contig' and linear cases */ KASSERT(vm_nphysseg > 0); struct uvm_physseg *ps = &vm_physmem[0]; struct uvm_physseg *end = &vm_physmem[vm_nphysseg]; paddr_t pframe = atop(pa); do { if (pframe >= ps->start && pframe < ps->end) { return &ps->pgs[pframe - ps->start]; } } while (VM_PHYSSEG_MAX > 1 && __predict_false(++ps < end)); return NULL; #else /* binary search for it */ paddr_t pf = atop(pa); paddr_t off; uvm_physseg_t upm; upm = uvm_physseg_find(pf, &off); if (upm != UVM_PHYSSEG_TYPE_INVALID) return uvm_physseg_get_pg(upm, off); return(NULL); #endif } bool uvm_physseg_valid_p(uvm_physseg_t upm) { struct uvm_physseg *ps; if (upm == UVM_PHYSSEG_TYPE_INVALID || upm == UVM_PHYSSEG_TYPE_INVALID_EMPTY || upm == UVM_PHYSSEG_TYPE_INVALID_OVERFLOW) return false; /* * This is the delicate init dance - * needs to go with the dance. */ if (uvm.page_init_done != true) return true; ps = HANDLE_TO_PHYSSEG_NODE(upm); /* Extra checks needed only post uvm_page_init() */ if (ps->pgs == NULL) return false; /* XXX: etc. */ return true; } /* * Boot protocol dictates that these must be able to return partially * initialised segments. */ paddr_t uvm_physseg_get_start(uvm_physseg_t upm) { if (uvm_physseg_valid_p(upm) == false) return (paddr_t) -1; return HANDLE_TO_PHYSSEG_NODE(upm)->start; } paddr_t uvm_physseg_get_end(uvm_physseg_t upm) { if (uvm_physseg_valid_p(upm) == false) return (paddr_t) -1; return HANDLE_TO_PHYSSEG_NODE(upm)->end; } paddr_t uvm_physseg_get_avail_start(uvm_physseg_t upm) { if (uvm_physseg_valid_p(upm) == false) return (paddr_t) -1; return HANDLE_TO_PHYSSEG_NODE(upm)->avail_start; } #if defined(UVM_PHYSSEG_LEGACY) void uvm_physseg_set_avail_start(uvm_physseg_t upm, paddr_t avail_start) { struct uvm_physseg *ps = HANDLE_TO_PHYSSEG_NODE(upm); #if defined(DIAGNOSTIC) paddr_t avail_end; avail_end = uvm_physseg_get_avail_end(upm); KASSERT(uvm_physseg_valid_p(upm)); KASSERT(avail_start < avail_end); KASSERT(avail_start >= ps->start); #endif ps->avail_start = avail_start; } void uvm_physseg_set_avail_end(uvm_physseg_t upm, paddr_t avail_end) { struct uvm_physseg *ps = HANDLE_TO_PHYSSEG_NODE(upm); #if defined(DIAGNOSTIC) paddr_t avail_start; avail_start = uvm_physseg_get_avail_start(upm); KASSERT(uvm_physseg_valid_p(upm)); KASSERT(avail_end > avail_start); KASSERT(avail_end <= ps->end); #endif ps->avail_end = avail_end; } #endif /* UVM_PHYSSEG_LEGACY */ paddr_t uvm_physseg_get_avail_end(uvm_physseg_t upm) { if (uvm_physseg_valid_p(upm) == false) return (paddr_t) -1; return HANDLE_TO_PHYSSEG_NODE(upm)->avail_end; } inline struct vm_page * uvm_physseg_get_pg(uvm_physseg_t upm, paddr_t idx) { KASSERT(uvm_physseg_valid_p(upm)); return &HANDLE_TO_PHYSSEG_NODE(upm)->pgs[idx]; } #ifdef __HAVE_PMAP_PHYSSEG struct pmap_physseg * uvm_physseg_get_pmseg(uvm_physseg_t upm) { KASSERT(uvm_physseg_valid_p(upm)); return &(HANDLE_TO_PHYSSEG_NODE(upm)->pmseg); } #endif int uvm_physseg_get_free_list(uvm_physseg_t upm) { KASSERT(uvm_physseg_valid_p(upm)); return HANDLE_TO_PHYSSEG_NODE(upm)->free_list; } u_long uvm_physseg_get_start_hint(uvm_physseg_t upm) { KASSERT(uvm_physseg_valid_p(upm)); return HANDLE_TO_PHYSSEG_NODE(upm)->start_hint; } bool uvm_physseg_set_start_hint(uvm_physseg_t upm, u_long start_hint) { if (uvm_physseg_valid_p(upm) == false) return false; HANDLE_TO_PHYSSEG_NODE(upm)->start_hint = start_hint; return true; } void uvm_physseg_init_seg(uvm_physseg_t upm, struct vm_page *pgs) { psize_t i; psize_t n; 
paddr_t paddr; struct uvm_physseg *seg; struct vm_page *pg; KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID); KASSERT(pgs != NULL); seg = HANDLE_TO_PHYSSEG_NODE(upm); KASSERT(seg != NULL); KASSERT(seg->pgs == NULL); n = seg->end - seg->start; seg->pgs = pgs; /* init and free vm_pages (we've already zeroed them) */ paddr = ctob(seg->start); for (i = 0 ; i < n ; i++, paddr += PAGE_SIZE) { pg = &seg->pgs[i]; pg->phys_addr = paddr; #ifdef __HAVE_VM_PAGE_MD VM_MDPAGE_INIT(pg); #endif if (atop(paddr) >= seg->avail_start && atop(paddr) < seg->avail_end) { uvmexp.npages++; /* add page to free pool */ uvm_page_set_freelist(pg, uvm_page_lookup_freelist(pg)); /* Disable LOCKDEBUG: too many and too early. */ mutex_init(&pg->interlock, MUTEX_NODEBUG, IPL_NONE); uvm_pagefree(pg); } } } void uvm_physseg_seg_chomp_slab(uvm_physseg_t upm, struct vm_page *pgs, size_t n) { struct uvm_physseg *seg = HANDLE_TO_PHYSSEG_NODE(upm); /* max number of pre-boot unplug()s allowed */ #define UVM_PHYSSEG_BOOT_UNPLUG_MAX VM_PHYSSEG_MAX static char btslab_ex_storage[EXTENT_FIXED_STORAGE_SIZE(UVM_PHYSSEG_BOOT_UNPLUG_MAX)]; if (__predict_false(uvm.page_init_done == false)) { seg->ext = extent_create("Boot time slab", (u_long) pgs, (u_long) (pgs + n), (void *)btslab_ex_storage, sizeof(btslab_ex_storage), 0); } else { seg->ext = extent_create("Hotplug slab", (u_long) pgs, (u_long) (pgs + n), NULL, 0, 0); } KASSERT(seg->ext != NULL); } struct vm_page * uvm_physseg_seg_alloc_from_slab(uvm_physseg_t upm, size_t pages) { int err; struct uvm_physseg *seg; struct vm_page *pgs = NULL; KASSERT(pages > 0); seg = HANDLE_TO_PHYSSEG_NODE(upm); if (__predict_false(seg->ext == NULL)) { /* * This is a situation unique to boot time. * It shouldn't happen at any point other than from * the first uvm_page.c:uvm_page_init() call * Since we're in a loop, we can get away with the * below. 
*/ KASSERT(uvm.page_init_done != true); uvm_physseg_t upmp = uvm_physseg_get_prev(upm); KASSERT(upmp != UVM_PHYSSEG_TYPE_INVALID); seg->ext = HANDLE_TO_PHYSSEG_NODE(upmp)->ext; KASSERT(seg->ext != NULL); } /* We allocate enough for this segment */ err = extent_alloc(seg->ext, sizeof(*pgs) * pages, 1, 0, EX_BOUNDZERO, (u_long *)&pgs); if (err != 0) { #ifdef DEBUG printf("%s: extent_alloc failed with error: %d \n", __func__, err); #endif } return pgs; } /* * uvm_page_physload: load physical memory into VM system * * => all args are PFs * => all pages in start/end get vm_page structures * => areas marked by avail_start/avail_end get added to the free page pool * => we are limited to VM_PHYSSEG_MAX physical memory segments */ uvm_physseg_t uvm_page_physload(paddr_t start, paddr_t end, paddr_t avail_start, paddr_t avail_end, int free_list) { struct uvm_physseg *ps; uvm_physseg_t upm; if (__predict_true(uvm.page_init_done == true)) panic("%s: unload attempted after uvm_page_init()\n", __func__); if (uvmexp.pagesize == 0) panic("uvm_page_physload: page size not set!"); if (free_list >= VM_NFREELIST || free_list < VM_FREELIST_DEFAULT) panic("uvm_page_physload: bad free list %d", free_list); if (start >= end) panic("uvm_page_physload: start[%" PRIxPADDR "] >= end[%" PRIxPADDR "]", start, end); if (uvm_physseg_plug(start, end - start, &upm) == false) { panic("uvm_physseg_plug() failed at boot."); /* NOTREACHED */ return UVM_PHYSSEG_TYPE_INVALID; /* XXX: correct type */ } ps = HANDLE_TO_PHYSSEG_NODE(upm); /* Legacy */ ps->avail_start = avail_start; ps->avail_end = avail_end; ps->free_list = free_list; /* XXX: */ return upm; } bool uvm_physseg_unplug(paddr_t pfn, size_t pages) { uvm_physseg_t upm; paddr_t off = 0, start __diagused, end; struct uvm_physseg *seg; upm = uvm_physseg_find(pfn, &off); if (!uvm_physseg_valid_p(upm)) { printf("%s: Tried to unplug from unknown offset\n", __func__); return false; } seg = HANDLE_TO_PHYSSEG_NODE(upm); start = uvm_physseg_get_start(upm); end = uvm_physseg_get_end(upm); if (end < (pfn + pages)) { printf("%s: Tried to unplug oversized span \n", __func__); return false; } KASSERT(pfn == start + off); /* sanity */ if (__predict_true(uvm.page_init_done == true)) { /* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */ if (extent_free(seg->ext, (u_long)(seg->pgs + off), sizeof(struct vm_page) * pages, EX_MALLOCOK | EX_NOWAIT) != 0) return false; } if (off == 0 && (pfn + pages) == end) { #if defined(UVM_HOTPLUG) /* rbtree implementation */ int segcount = 0; struct uvm_physseg *current_ps; /* Complete segment */ if (uvm_physseg_graph.nentries == 1) panic("%s: out of memory!", __func__); if (__predict_true(uvm.page_init_done == true)) { RB_TREE_FOREACH(current_ps, &(uvm_physseg_graph.rb_tree)) { if (seg->ext == current_ps->ext) segcount++; } KASSERT(segcount > 0); if (segcount == 1) { extent_destroy(seg->ext); } /* * We assume that the unplug will succeed from * this point onwards */ uvmexp.npages -= (int) pages; } rb_tree_remove_node(&(uvm_physseg_graph.rb_tree), upm); memset(seg, 0, sizeof(struct uvm_physseg)); uvm_physseg_free(seg, sizeof(struct uvm_physseg)); uvm_physseg_graph.nentries--; #else /* UVM_HOTPLUG */ int x; if (vm_nphysmem == 1) panic("uvm_page_physget: out of memory!"); vm_nphysmem--; for (x = upm ; x < vm_nphysmem ; x++) /* structure copy */ VM_PHYSMEM_PTR_SWAP(x, x + 1); #endif /* UVM_HOTPLUG */ /* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */ return true; } if (off > 0 && (pfn + pages) < end) { #if defined(UVM_HOTPLUG) /* rbtree 
implementation */ /* middle chunk - need a new segment */ struct uvm_physseg *ps, *current_ps; ps = uvm_physseg_alloc(sizeof (struct uvm_physseg)); if (ps == NULL) { printf("%s: Unable to allocated new fragment vm_physseg \n", __func__); return false; } /* Remove middle chunk */ if (__predict_true(uvm.page_init_done == true)) { KASSERT(seg->ext != NULL); ps->ext = seg->ext; /* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */ /* * We assume that the unplug will succeed from * this point onwards */ uvmexp.npages -= (int) pages; } ps->start = pfn + pages; ps->avail_start = ps->start; /* XXX: Legacy */ ps->end = seg->end; ps->avail_end = ps->end; /* XXX: Legacy */ seg->end = pfn; seg->avail_end = seg->end; /* XXX: Legacy */ /* * The new pgs array points to the beginning of the * tail fragment. */ if (__predict_true(uvm.page_init_done == true)) ps->pgs = seg->pgs + off + pages; current_ps = rb_tree_insert_node(&(uvm_physseg_graph.rb_tree), ps); if (current_ps != ps) { panic("uvm_page_physload: Duplicate address range detected!"); } uvm_physseg_graph.nentries++; #else /* UVM_HOTPLUG */ panic("%s: can't unplug() from the middle of a segment without" " UVM_HOTPLUG\n", __func__); /* NOTREACHED */ #endif /* UVM_HOTPLUG */ return true; } if (off == 0 && (pfn + pages) < end) { /* Remove front chunk */ if (__predict_true(uvm.page_init_done == true)) { /* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */ /* * We assume that the unplug will succeed from * this point onwards */ uvmexp.npages -= (int) pages; } /* Truncate */ seg->start = pfn + pages; seg->avail_start = seg->start; /* XXX: Legacy */ /* * Move the pgs array start to the beginning of the * tail end. */ if (__predict_true(uvm.page_init_done == true)) seg->pgs += pages; return true; } if (off > 0 && (pfn + pages) == end) { /* back chunk */ /* Truncate! */ seg->end = pfn; seg->avail_end = seg->end; /* XXX: Legacy */ uvmexp.npages -= (int) pages; return true; } printf("%s: Tried to unplug unknown range \n", __func__); return false; }
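That is the end of uvm_physseg.c. As a hedged illustration of how the accessor API defined above is typically consumed (the function name uvm_physseg_dump below is hypothetical and not part of the NetBSD source), a caller can walk every plugged segment with the get_first/get_next/valid_p trio and read the span bounds through the getters:

/*
 * Sketch only: iterate over all physical segments and print their spans.
 * Assumes a kernel context after uvm_physseg_init()/uvm_physseg_plug().
 */
static void
uvm_physseg_dump(void)
{
	uvm_physseg_t upm;

	for (upm = uvm_physseg_get_first();
	     uvm_physseg_valid_p(upm);
	     upm = uvm_physseg_get_next(upm)) {
		printf("physseg 0x%" PRIxPADDR "-0x%" PRIxPADDR
		    " avail 0x%" PRIxPADDR "-0x%" PRIxPADDR "\n",
		    uvm_physseg_get_start(upm),
		    uvm_physseg_get_end(upm),
		    uvm_physseg_get_avail_start(upm),
		    uvm_physseg_get_avail_end(upm));
	}
}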
/* $NetBSD: uvm_page.h,v 1.109 2020/12/20 16:38:26 skrll Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. * * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_page.h 7.3 (Berkeley) 4/21/91 * from: Id: uvm_page.h,v 1.1.2.6 1998/02/04 02:31:42 chuck Exp * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #ifndef _UVM_UVM_PAGE_H_ #define _UVM_UVM_PAGE_H_ #ifdef _KERNEL_OPT #include "opt_uvm_page_trkown.h" #endif #include <sys/rwlock.h> #include <uvm/uvm_extern.h> #include <uvm/uvm_pglist.h> /* * Management of resident (logical) pages. * * Each resident page has a vm_page structure, indexed by page number. * There are several lists in the structure: * * - A red-black tree rooted with the containing object is used to * quickly perform object+offset lookups. * - A list of all pages for a given object, for a quick deactivation * at a time of deallocation. * - An ordered list of pages due for pageout. * * In addition, the structure contains the object and offset to which * this page belongs (for pageout) and sundry status bits. * * Note that the page structure has no lock of its own. The page is * generally protected by its owner's lock (UVM object or amap/anon). * It should be noted that UVM has to serialize pmap(9) operations on * the managed pages, e.g. for pmap_enter() calls. Hence, the lock * order is as follows: * * [vmpage-owner-lock] -> * any pmap locks (e.g. PV hash lock) * * Since the kernel is always self-consistent, no serialization is * required for unmanaged mappings, e.g. for pmap_kenter_pa() calls. * * Field markings and the corresponding locks: * * f: free page queue lock, uvm_fpageqlock * o: page owner (uvm_object::vmobjlock, vm_amap::am_lock, vm_anon::an_lock) * i: vm_page::interlock * => flags set and cleared only with o&i held can * safely be tested for with only o held. * o,i: o|i for read, o&i for write (depends on context - if could be loaned) * => see uvm_loan.c * w: wired page queue or uvm_pglistalloc: * => wired page queue: o&i to change, stable from wire to unwire * XXX What about concurrent or nested wire? 
* => uvm_pglistalloc: owned by caller * ?: locked by pmap or assumed page owner's lock * p: locked by pagedaemon policy module (pdpolicy) * c: cpu private * s: stable, does not change * * UVM and pmap(9) may use uvm_page_owner_locked_p() to assert whether the * page owner's lock is acquired. * * A page can have one of four identities: * * o free * => pageq.list is entry on global free page queue * => uanon is unused (or (void *)0xdeadbeef for DEBUG) * => uobject is unused (or (void *)0xdeadbeef for DEBUG) * => PG_FREE is set in flags * o owned by a uvm_object * => pageq.queue is entry on wired page queue, if any * => uanon is NULL or the vm_anon to which it has been O->A loaned * => uobject is owner * o owned by a vm_anon * => pageq is unused (XXX correct?) * => uanon is owner * => uobject is NULL * => PG_ANON is set in flags * o allocated by uvm_pglistalloc * => pageq.queue is entry on resulting pglist, owned by caller * => uanon is unused * => uobject is unused * * The following transitions are allowed: * * - uvm_pagealloc: free -> owned by a uvm_object/vm_anon * - uvm_pagefree: owned by a uvm_object/vm_anon -> free * - uvm_pglistalloc: free -> allocated by uvm_pglistalloc * - uvm_pglistfree: allocated by uvm_pglistalloc -> free * * On the ordering of fields: * * The fields most heavily used during fault processing are clustered * together at the start of the structure to reduce cache misses. * XXX This entire thing should be shrunk to fit in one cache line. */ struct vm_page { /* _LP64: first cache line */ union { TAILQ_ENTRY(vm_page) queue; /* w: wired page queue * or uvm_pglistalloc output */ LIST_ENTRY(vm_page) list; /* f: global free page queue */ } pageq; uint32_t pqflags; /* i: pagedaemon flags */ uint32_t flags; /* o: object flags */ paddr_t phys_addr; /* o: physical address of pg */ uint32_t loan_count; /* o,i: num. active loans */ uint32_t wire_count; /* o,i: wired down map refs */ struct vm_anon *uanon; /* o,i: anon */ struct uvm_object *uobject; /* o,i: object */ voff_t offset; /* o: offset into object */ /* _LP64: second cache line */ kmutex_t interlock; /* s: lock on identity */ TAILQ_ENTRY(vm_page) pdqueue; /* p: pagedaemon queue */ #ifdef __HAVE_VM_PAGE_MD struct vm_page_md mdpage; /* ?: pmap-specific data */ #endif #if defined(UVM_PAGE_TRKOWN) /* debugging fields to track page ownership */ pid_t owner; /* proc that set PG_BUSY */ lwpid_t lowner; /* lwp that set PG_BUSY */ const char *owner_tag; /* why it was set busy */ #endif }; /* * Overview of UVM page flags, stored in pg->flags. * * Locking notes: * * PG_, struct vm_page::flags => locked by owner * PG_AOBJ => additionally locked by vm_page::interlock * PG_ANON => additionally locked by vm_page::interlock * PG_FREE => additionally locked by uvm_fpageqlock * for uvm_pglistalloc() * * Flag descriptions: * * PG_CLEAN: * Page is known clean. * The contents of the page is consistent with its backing store. * * PG_DIRTY: * Page is known dirty. * To avoid losing data, the contents of the page should be written * back to the backing store before freeing the page. * * PG_BUSY: * Page is long-term locked, usually because of I/O (transfer from the * page memory to the backing store) is in progress. LWP attempting * to access the page shall set PQ_WANTED and wait. PG_BUSY may only * be set with a write lock held on the object. * * PG_PAGEOUT: * Indicates that the page is being paged-out in preparation for * being freed. 
* * PG_RELEASED: * Indicates that the page, which is currently PG_BUSY, should be freed * after the release of long-term lock. It is responsibility of the * owning LWP (i.e. which set PG_BUSY) to do it. * * PG_FAKE: * Page has been allocated, but not yet initialised. The flag is used * to avoid overwriting of valid data, e.g. to prevent read from the * backing store when in-core data is newer. * * PG_RDONLY: * Indicates that the page must be mapped read-only. * * PG_MARKER: * Dummy marker page, generally used for list traversal. */ /* * if you want to renumber PG_CLEAN and PG_DIRTY, check __CTASSERTs in * uvm_page_status.c first. */ #define PG_CLEAN 0x00000001 /* page is known clean */ #define PG_DIRTY 0x00000002 /* page is known dirty */ #define PG_BUSY 0x00000004 /* page is locked */ #define PG_PAGEOUT 0x00000010 /* page to be freed for pagedaemon */ #define PG_RELEASED 0x00000020 /* page to be freed when unbusied */ #define PG_FAKE 0x00000040 /* page is not yet initialized */ #define PG_RDONLY 0x00000080 /* page must be mapped read-only */ #define PG_TABLED 0x00000200 /* page is tabled in object */ #define PG_AOBJ 0x00000400 /* page is part of an anonymous uvm_object */ #define PG_ANON 0x00000800 /* page is part of an anon, rather than an uvm_object */ #define PG_FILE 0x00001000 /* file backed (non-anonymous) */ #define PG_READAHEAD 0x00002000 /* read-ahead but not "hit" yet */ #define PG_FREE 0x00004000 /* page is on free list */ #define PG_MARKER 0x00008000 /* dummy marker page */ #define PG_PAGER1 0x00010000 /* pager-specific flag */ #define PG_PGLCA 0x00020000 /* allocated by uvm_pglistalloc_contig */ #define PG_STAT (PG_ANON|PG_AOBJ|PG_FILE) #define PG_SWAPBACKED (PG_ANON|PG_AOBJ) #define UVM_PGFLAGBITS \ "\20\1CLEAN\2DIRTY\3BUSY" \ "\5PAGEOUT\6RELEASED\7FAKE\10RDONLY" \ "\11ZERO\12TABLED\13AOBJ\14ANON" \ "\15FILE\16READAHEAD\17FREE\20MARKER" \ "\21PAGER1\22PGLCA" /* * Flags stored in pg->pqflags, which is protected by pg->interlock. * * PQ_PRIVATE: * ... is for uvmpdpol to do whatever it wants with. * * PQ_INTENT_SET: * Indicates that the intent set on the page has not yet been realized. * * PQ_INTENT_QUEUED: * Indicates that the page is, or will soon be, on a per-CPU queue for * the intent to be realized. * * PQ_WANTED: * Indicates that the page, which is currently PG_BUSY, is wanted by * some other LWP. The page owner (i.e. LWP which set PG_BUSY) is * responsible to clear both flags and wake up any waiters once it has * released the long-term lock (PG_BUSY). 
*/ #define PQ_INTENT_A 0x00000000 /* intend activation */ #define PQ_INTENT_I 0x00000001 /* intend deactivation */ #define PQ_INTENT_E 0x00000002 /* intend enqueue */ #define PQ_INTENT_D 0x00000003 /* intend dequeue */ #define PQ_INTENT_MASK 0x00000003 /* mask of intended state */ #define PQ_INTENT_SET 0x00000004 /* not realized yet */ #define PQ_INTENT_QUEUED 0x00000008 /* queued for processing */ #define PQ_PRIVATE 0x00000ff0 /* private for pdpolicy */ #define PQ_WANTED 0x00001000 /* someone is waiting for page */ #define UVM_PQFLAGBITS \ "\20\1INTENT_0\2INTENT_1\3INTENT_SET\4INTENT_QUEUED" \ "\5PRIVATE1\6PRIVATE2\7PRIVATE3\10PRIVATE4" \ "\11PRIVATE5\12PRIVATE6\13PRIVATE7\14PRIVATE8" \ "\15WANTED" /* * physical memory layout structure * * MD vmparam.h must #define: * VM_PHYSEG_MAX = max number of physical memory segments we support * (if this is "1" then we revert to a "contig" case) * VM_PHYSSEG_STRAT: memory sort/search options (for VM_PHYSEG_MAX > 1) * - VM_PSTRAT_RANDOM: linear search (random order) * - VM_PSTRAT_BSEARCH: binary search (sorted by address) * - VM_PSTRAT_BIGFIRST: linear search (sorted by largest segment first) * - others? * XXXCDC: eventually we should purge all left-over global variables... */ #define VM_PSTRAT_RANDOM 1 #define VM_PSTRAT_BSEARCH 2 #define VM_PSTRAT_BIGFIRST 3 #ifdef _KERNEL /* * prototypes: the following prototypes define the interface to pages */ void uvm_page_init(vaddr_t *, vaddr_t *); void uvm_pglistalloc_init(void); #if defined(UVM_PAGE_TRKOWN) void uvm_page_own(struct vm_page *, const char *); #endif #if !defined(PMAP_STEAL_MEMORY) bool uvm_page_physget(paddr_t *); #endif void uvm_page_recolor(int); void uvm_page_rebucket(void); void uvm_pageactivate(struct vm_page *); vaddr_t uvm_pageboot_alloc(vsize_t); void uvm_pagecopy(struct vm_page *, struct vm_page *); void uvm_pagedeactivate(struct vm_page *); void uvm_pagedequeue(struct vm_page *); void uvm_pageenqueue(struct vm_page *); void uvm_pagefree(struct vm_page *); void uvm_pagelock(struct vm_page *); void uvm_pagelock2(struct vm_page *, struct vm_page *); void uvm_pageunlock(struct vm_page *); void uvm_pageunlock2(struct vm_page *, struct vm_page *); void uvm_page_unbusy(struct vm_page **, int); struct vm_page *uvm_pagelookup(struct uvm_object *, voff_t); void uvm_pageunwire(struct vm_page *); void uvm_pagewire(struct vm_page *); void uvm_pagezero(struct vm_page *); bool uvm_pageismanaged(paddr_t); bool uvm_page_owner_locked_p(struct vm_page *, bool); void uvm_pgfl_lock(void); void uvm_pgfl_unlock(void); unsigned int uvm_pagegetdirty(struct vm_page *); void uvm_pagemarkdirty(struct vm_page *, unsigned int); bool uvm_pagecheckdirty(struct vm_page *, bool); bool uvm_pagereadonly_p(struct vm_page *); bool uvm_page_locked_p(struct vm_page *); void uvm_pagewakeup(struct vm_page *); bool uvm_pagewanted_p(struct vm_page *); void uvm_pagewait(struct vm_page *, krwlock_t *, const char *); int uvm_page_lookup_freelist(struct vm_page *); struct vm_page *uvm_phys_to_vm_page(paddr_t); paddr_t uvm_vm_page_to_phys(const struct vm_page *); #if defined(PMAP_DIRECT) extern bool ubc_direct; int uvm_direct_process(struct vm_page **, u_int, voff_t, vsize_t, int (*)(void *, size_t, void *), void *); #endif /* * page dirtiness status for uvm_pagegetdirty and uvm_pagemarkdirty * * UNKNOWN means that we need to consult pmap to know if the page is * dirty or not. * basically, UVM_PAGE_STATUS_CLEAN implies that the page has no writable * mapping. 
* * if you want to renumber these, check __CTASSERTs in * uvm_page_status.c first. */ #define UVM_PAGE_STATUS_UNKNOWN 0 #define UVM_PAGE_STATUS_CLEAN 1 #define UVM_PAGE_STATUS_DIRTY 2 #define UVM_PAGE_NUM_STATUS 3 /* * macros */ #define VM_PAGE_TO_PHYS(entry) uvm_vm_page_to_phys(entry) #ifdef __HAVE_VM_PAGE_MD #define VM_PAGE_TO_MD(pg) (&(pg)->mdpage) #define VM_MD_TO_PAGE(md) (container_of((md), struct vm_page, mdpage)) #endif /* * Compute the page color for a given page. */ #define VM_PGCOLOR(pg) \ (atop(VM_PAGE_TO_PHYS((pg))) & uvmexp.colormask) #define PHYS_TO_VM_PAGE(pa) uvm_phys_to_vm_page(pa) /* * VM_PAGE_IS_FREE() can't tell if the page is on global free list, or a * per-CPU cache. If you need to be certain, pause caching. */ #define VM_PAGE_IS_FREE(entry) ((entry)->flags & PG_FREE) /* * Use the lower 10 bits of pg->phys_addr to cache some some locators for * the page. This implies that the smallest possible page size is 1kB, and * that nobody should use pg->phys_addr directly (use VM_PAGE_TO_PHYS()). * * - 5 bits for the freelist index, because uvm_page_lookup_freelist() * traverses an rbtree and therefore features prominently in traces * captured during performance test. It would probably be more useful to * cache physseg index here because freelist can be inferred from physseg, * but it requires changes to allocation for UVM_HOTPLUG, so for now we'll * go with freelist. * * - 5 bits for "bucket", a way for us to categorise pages further as * needed (e.g. NUMA node). * * None of this is set in stone; it can be adjusted as needed. */ #define UVM_PHYSADDR_FREELIST __BITS(0,4) #define UVM_PHYSADDR_BUCKET __BITS(5,9) static inline unsigned uvm_page_get_freelist(struct vm_page *pg) { unsigned fl = __SHIFTOUT(pg->phys_addr, UVM_PHYSADDR_FREELIST); KASSERT(fl == (unsigned)uvm_page_lookup_freelist(pg)); return fl; } static inline unsigned uvm_page_get_bucket(struct vm_page *pg) { return __SHIFTOUT(pg->phys_addr, UVM_PHYSADDR_BUCKET); } static inline void uvm_page_set_freelist(struct vm_page *pg, unsigned fl) { KASSERT(fl < 32); pg->phys_addr &= ~UVM_PHYSADDR_FREELIST; pg->phys_addr |= __SHIFTIN(fl, UVM_PHYSADDR_FREELIST); } static inline void uvm_page_set_bucket(struct vm_page *pg, unsigned b) { KASSERT(b < 32); pg->phys_addr &= ~UVM_PHYSADDR_BUCKET; pg->phys_addr |= __SHIFTIN(b, UVM_PHYSADDR_BUCKET); } #endif /* _KERNEL */ #endif /* _UVM_UVM_PAGE_H_ */
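The uvm_page.h header ends here. As a hedged usage sketch (illustration only, not from the header; the function name example_page_locators is made up), the locator bits cached in pg->phys_addr are read back with the inline helpers defined above, while the physical address itself is still obtained through the VM_PAGE_TO_PHYS() macro rather than by touching pg->phys_addr directly:

/*
 * Sketch only: read the cached freelist/bucket locators for a page.
 * Assumes a kernel context and an initialised struct vm_page.
 */
static inline void
example_page_locators(const char *tag, struct vm_page *pg)
{
	unsigned fl = uvm_page_get_freelist(pg);
	unsigned bucket = uvm_page_get_bucket(pg);
	paddr_t pa = VM_PAGE_TO_PHYS(pg);

	printf("%s: pa=0x%" PRIxPADDR " freelist=%u bucket=%u color=%u\n",
	    tag, pa, fl, bucket, (unsigned)VM_PGCOLOR(pg));
}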
/* $NetBSD: rtbl.c,v 1.7 2017/06/01 02:45:14 chs Exp $ */ /*- * Copyright (c) 1998, 2008, 2011 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Kevin M. Lahey of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3 (Berkeley) 1/9/95 */ #if defined(_KERNEL) && defined(_KERNEL_OPT) #include "opt_route.h" #endif /* _KERNEL && _KERNEL_OPT */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: rtbl.c,v 1.7 2017/06/01 02:45:14 chs Exp $"); #include <sys/param.h> #include <sys/kmem.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/proc.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/domain.h> #include <sys/kernel.h> #include <sys/ioctl.h> #include <sys/pool.h> #include <sys/kauth.h> #include <net/if.h> #include <net/if_dl.h> #include <net/route.h> #include <net/raw_cb.h> static rtbl_t *rt_tables[AF_MAX+1]; int rt_inithead(rtbl_t **tp, int off) { rtbl_t *t; if (*tp != NULL) return 1; t = kmem_alloc(sizeof(*t), KM_SLEEP); *tp = t; return rn_inithead0(&t->t_rnh, off); } struct rtentry * rt_matchaddr(rtbl_t *t, const struct sockaddr *dst) { struct radix_node_head *rnh = &t->t_rnh; struct radix_node *rn; rn = rnh->rnh_matchaddr(dst, rnh); if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0) return NULL; return (struct rtentry *)rn; } int rt_addaddr(rtbl_t *t, struct rtentry *rt, const struct sockaddr *netmask) { struct radix_node_head *rnh = &t->t_rnh; struct radix_node *rn; rn = rnh->rnh_addaddr(rt_getkey(rt), netmask, rnh, rt->rt_nodes); return (rn == NULL) ? 
EEXIST : 0; } struct rtentry * rt_lookup(rtbl_t *t, const struct sockaddr *dst, const struct sockaddr *netmask) { struct radix_node_head *rnh = &t->t_rnh; struct radix_node *rn; rn = rnh->rnh_lookup(dst, netmask, rnh); if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0) return NULL; return (struct rtentry *)rn; } struct rtentry * rt_deladdr(rtbl_t *t, const struct sockaddr *dst, const struct sockaddr *netmask) { struct radix_node_head *rnh = &t->t_rnh; struct radix_node *rn; if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == NULL) return NULL; if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("%s", __func__); return (struct rtentry *)rn; } static int rt_walktree_visitor(struct radix_node *rn, void *v) { struct rtwalk *rw = (struct rtwalk *)v; return (*rw->rw_f)((struct rtentry *)rn, rw->rw_v); } int rtbl_walktree(sa_family_t family, int (*f)(struct rtentry *, void *), void *v) { rtbl_t *t = rt_tables[family]; struct rtwalk rw; if (t == NULL) return 0; rw.rw_f = f; rw.rw_v = v; return rn_walktree(&t->t_rnh, rt_walktree_visitor, &rw); } struct rtentry * rtbl_search_matched_entry(sa_family_t family, int (*f)(struct rtentry *, void *), void *v) { rtbl_t *t = rt_tables[family]; struct rtwalk rw; if (t == NULL) return 0; rw.rw_f = f; rw.rw_v = v; return (struct rtentry *) rn_search_matched(&t->t_rnh, rt_walktree_visitor, &rw); } rtbl_t * rt_gettable(sa_family_t af) { if (af >= __arraycount(rt_tables)) return NULL; return rt_tables[af]; } void rtbl_init(void) { struct domain *dom; DOMAIN_FOREACH(dom) if (dom->dom_rtattach) dom->dom_rtattach(&rt_tables[dom->dom_family], dom->dom_rtoffset); } void rt_assert_inactive(const struct rtentry *rt) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtfree 2"); } int rt_refines(const struct sockaddr *m_sa, const struct sockaddr *n_sa) { return rn_refines(m_sa, n_sa); }
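rtbl.c ends here. As a hedged usage sketch (the visitor and counter below are hypothetical, not part of the file), rtbl_walktree() drives a caller-supplied callback over every route in one address family's table; returning non-zero from the visitor aborts the walk with that error, following the rn_walktree() convention:

/*
 * Sketch only: count the routes installed for one address family.
 */
static int
count_rt_visitor(struct rtentry *rt, void *arg)
{
	int *countp = arg;

	(void)rt;		/* the entry itself is not inspected here */
	(*countp)++;
	return 0;		/* keep walking; non-zero would stop the walk */
}

static int
count_routes(sa_family_t family)
{
	int count = 0;

	(void)rtbl_walktree(family, count_rt_visitor, &count);
	return count;
}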
/* $NetBSD: kern_sig.c,v 1.409 2024/02/10 09:24:18 andvar Exp $ */ /*- * Copyright (c) 2006, 2007, 2008, 2019, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.14 (Berkeley) 5/14/95 */ /* * Signal subsystem. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_sig.c,v 1.409 2024/02/10 09:24:18 andvar Exp $"); #include "opt_execfmt.h" #include "opt_ptrace.h" #include "opt_dtrace.h" #include "opt_compat_sunos.h" #include "opt_compat_netbsd.h" #include "opt_compat_netbsd32.h" #include "opt_pax.h" #define SIGPROP /* include signal properties table */ #include <sys/param.h> #include <sys/signalvar.h> #include <sys/proc.h> #include <sys/ptrace.h> #include <sys/systm.h> #include <sys/wait.h> #include <sys/ktrace.h> #include <sys/syslog.h> #include <sys/filedesc.h> #include <sys/file.h> #include <sys/pool.h> #include <sys/ucontext.h> #include <sys/exec.h> #include <sys/kauth.h> #include <sys/acct.h> #include <sys/callout.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/module.h> #include <sys/sdt.h> #include <sys/exec_elf.h> #include <sys/compat_stub.h> #ifdef PAX_SEGVGUARD #include <sys/pax.h> #endif /* PAX_SEGVGUARD */ #include <uvm/uvm_extern.h> /* Many hard-coded assumptions that there are <= 4 x 32bit signal mask bits */ __CTASSERT(NSIG <= 128); #define SIGQUEUE_MAX 32 static pool_cache_t sigacts_cache __read_mostly; static pool_cache_t ksiginfo_cache __read_mostly; static callout_t proc_stop_ch __cacheline_aligned; sigset_t contsigmask __cacheline_aligned; sigset_t stopsigmask __cacheline_aligned; static sigset_t vforksigmask __cacheline_aligned; sigset_t sigcantmask __cacheline_aligned; static void ksiginfo_exechook(struct proc *, void *); static void proc_stop(struct proc *, int); static void proc_stop_done(struct proc *, int); static void proc_stop_callout(void *); static int sigchecktrace(void); static int sigpost(struct lwp *, sig_t, int, int); static int sigput(sigpend_t *, struct proc *, ksiginfo_t *); static int sigunwait(struct proc *, const ksiginfo_t *); static void sigswitch(int, int, bool); static void sigswitch_unlock_and_switch_away(struct lwp *); static void sigacts_poolpage_free(struct pool *, void *); static void *sigacts_poolpage_alloc(struct pool *, int); /* * DTrace SDT provider definitions */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, kernel, , signal__send, "struct lwp *", /* target thread */ "struct proc *", /* target process */ "int"); /* signal */ SDT_PROBE_DEFINE3(proc, kernel, , signal__discard, "struct lwp *", /* target thread */ "struct proc *", /* target process */ "int"); /* signal */ SDT_PROBE_DEFINE3(proc, kernel, , signal__handle, "int", /* signal */ "ksiginfo_t *", /* signal info */ "void (*)(void)"); /* handler address */ static struct pool_allocator sigactspool_allocator = { .pa_alloc = sigacts_poolpage_alloc, .pa_free = sigacts_poolpage_free }; #ifdef DEBUG int kern_logsigexit = 1; #else int kern_logsigexit = 0; #endif static const char logcoredump[] = "pid %d (%s), uid %d: exited on signal %d (core dumped)\n"; static const char lognocoredump[] = "pid %d (%s), uid %d: exited on signal %d (core not dumped, err = %d)\n"; static kauth_listener_t signal_listener; static int signal_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { struct proc *p; int result, signum; result = KAUTH_RESULT_DEFER; p = arg0; signum = (int)(unsigned long)arg1; if (action != KAUTH_PROCESS_SIGNAL) return result; if (kauth_cred_uidmatch(cred, p->p_cred) || (signum == SIGCONT && (curproc->p_session == p->p_session))) result = KAUTH_RESULT_ALLOW; return result; } static int sigacts_ctor(void *arg __unused, void *obj, int flags __unused) { memset(obj, 0, sizeof(struct sigacts)); return 0; } /* * 
signal_init: * * Initialize global signal-related data structures. */ void signal_init(void) { sigactspool_allocator.pa_pagesz = (PAGE_SIZE)*2; sigacts_cache = pool_cache_init(sizeof(struct sigacts), 0, 0, 0, "sigacts", sizeof(struct sigacts) > PAGE_SIZE ? &sigactspool_allocator : NULL, IPL_NONE, sigacts_ctor, NULL, NULL); ksiginfo_cache = pool_cache_init(sizeof(ksiginfo_t), 0, 0, 0, "ksiginfo", NULL, IPL_VM, NULL, NULL, NULL); exechook_establish(ksiginfo_exechook, NULL); callout_init(&proc_stop_ch, CALLOUT_MPSAFE); callout_setfunc(&proc_stop_ch, proc_stop_callout, NULL); signal_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS, signal_listener_cb, NULL); } /* * sigacts_poolpage_alloc: * * Allocate a page for the sigacts memory pool. */ static void * sigacts_poolpage_alloc(struct pool *pp, int flags) { return (void *)uvm_km_alloc(kernel_map, PAGE_SIZE * 2, PAGE_SIZE * 2, ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) | UVM_KMF_WIRED); } /* * sigacts_poolpage_free: * * Free a page on behalf of the sigacts memory pool. */ static void sigacts_poolpage_free(struct pool *pp, void *v) { uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * 2, UVM_KMF_WIRED); } /* * sigactsinit: * * Create an initial sigacts structure, using the same signal state * as of specified process. If 'share' is set, share the sigacts by * holding a reference, otherwise just copy it from parent. */ struct sigacts * sigactsinit(struct proc *pp, int share) { struct sigacts *ps = pp->p_sigacts, *ps2; if (__predict_false(share)) { atomic_inc_uint(&ps->sa_refcnt); return ps; } ps2 = pool_cache_get(sigacts_cache, PR_WAITOK); mutex_init(&ps2->sa_mutex, MUTEX_DEFAULT, IPL_SCHED); ps2->sa_refcnt = 1; mutex_enter(&ps->sa_mutex); memcpy(ps2->sa_sigdesc, ps->sa_sigdesc, sizeof(ps2->sa_sigdesc)); mutex_exit(&ps->sa_mutex); return ps2; } /* * sigactsunshare: * * Make this process not share its sigacts, maintaining all signal state. */ void sigactsunshare(struct proc *p) { struct sigacts *ps, *oldps = p->p_sigacts; if (__predict_true(oldps->sa_refcnt == 1)) return; ps = pool_cache_get(sigacts_cache, PR_WAITOK); mutex_init(&ps->sa_mutex, MUTEX_DEFAULT, IPL_SCHED); memcpy(ps->sa_sigdesc, oldps->sa_sigdesc, sizeof(ps->sa_sigdesc)); ps->sa_refcnt = 1; p->p_sigacts = ps; sigactsfree(oldps); } /* * sigactsfree; * * Release a sigacts structure. */ void sigactsfree(struct sigacts *ps) { membar_release(); if (atomic_dec_uint_nv(&ps->sa_refcnt) == 0) { membar_acquire(); mutex_destroy(&ps->sa_mutex); pool_cache_put(sigacts_cache, ps); } } /* * siginit: * * Initialize signal state for process 0; set to ignore signals that * are ignored by default and disable the signal stack. Locking not * required as the system is still cold. 
*/ void siginit(struct proc *p) { struct lwp *l; struct sigacts *ps; int signo, prop; ps = p->p_sigacts; sigemptyset(&contsigmask); sigemptyset(&stopsigmask); sigemptyset(&vforksigmask); sigemptyset(&sigcantmask); for (signo = 1; signo < NSIG; signo++) { prop = sigprop[signo]; if (prop & SA_CONT) sigaddset(&contsigmask, signo); if (prop & SA_STOP) sigaddset(&stopsigmask, signo); if (prop & SA_STOP && signo != SIGSTOP) sigaddset(&vforksigmask, signo); if (prop & SA_CANTMASK) sigaddset(&sigcantmask, signo); if (prop & SA_IGNORE && signo != SIGCONT) sigaddset(&p->p_sigctx.ps_sigignore, signo); sigemptyset(&SIGACTION_PS(ps, signo).sa_mask); SIGACTION_PS(ps, signo).sa_flags = SA_RESTART; } sigemptyset(&p->p_sigctx.ps_sigcatch); p->p_sflag &= ~PS_NOCLDSTOP; ksiginfo_queue_init(&p->p_sigpend.sp_info); sigemptyset(&p->p_sigpend.sp_set); /* * Reset per LWP state. */ l = LIST_FIRST(&p->p_lwps); l->l_sigwaited = NULL; l->l_sigstk = SS_INIT; ksiginfo_queue_init(&l->l_sigpend.sp_info); sigemptyset(&l->l_sigpend.sp_set); /* One reference. */ ps->sa_refcnt = 1; } /* * execsigs: * * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { struct sigacts *ps; struct lwp *l; int signo, prop; sigset_t tset; ksiginfoq_t kq; KASSERT(p->p_nlwps == 1); sigactsunshare(p); ps = p->p_sigacts; /* * Reset caught signals. Held signals remain held through * l->l_sigmask (unless they were caught, and are now ignored * by default). * * No need to lock yet, the process has only one LWP and * at this point the sigacts are private to the process. */ sigemptyset(&tset); for (signo = 1; signo < NSIG; signo++) { if (sigismember(&p->p_sigctx.ps_sigcatch, signo)) { prop = sigprop[signo]; if (prop & SA_IGNORE) { if ((prop & SA_CONT) == 0) sigaddset(&p->p_sigctx.ps_sigignore, signo); sigaddset(&tset, signo); } SIGACTION_PS(ps, signo).sa_handler = SIG_DFL; } sigemptyset(&SIGACTION_PS(ps, signo).sa_mask); SIGACTION_PS(ps, signo).sa_flags = SA_RESTART; } ksiginfo_queue_init(&kq); mutex_enter(p->p_lock); sigclearall(p, &tset, &kq); sigemptyset(&p->p_sigctx.ps_sigcatch); /* * Reset no zombies if child dies flag as Solaris does. */ p->p_flag &= ~(PK_NOCLDWAIT | PK_CLDSIGIGN); if (SIGACTION_PS(ps, SIGCHLD).sa_handler == SIG_IGN) SIGACTION_PS(ps, SIGCHLD).sa_handler = SIG_DFL; /* * Reset per-LWP state. */ l = LIST_FIRST(&p->p_lwps); l->l_sigwaited = NULL; l->l_sigstk = SS_INIT; ksiginfo_queue_init(&l->l_sigpend.sp_info); sigemptyset(&l->l_sigpend.sp_set); mutex_exit(p->p_lock); ksiginfo_queue_drain(&kq); } /* * ksiginfo_exechook: * * Free all pending ksiginfo entries from a process on exec. * Additionally, drain any unused ksiginfo structures in the * system back to the pool. * * XXX This should not be a hook, every process has signals. */ static void ksiginfo_exechook(struct proc *p, void *v) { ksiginfoq_t kq; ksiginfo_queue_init(&kq); mutex_enter(p->p_lock); sigclearall(p, NULL, &kq); mutex_exit(p->p_lock); ksiginfo_queue_drain(&kq); } /* * ksiginfo_alloc: * * Allocate a new ksiginfo structure from the pool, and optionally copy * an existing one. If the existing ksiginfo_t is from the pool, and * has not been queued somewhere, then just return it. Additionally, * if the existing ksiginfo_t does not contain any information beyond * the signal number, then just return it. 
*/ ksiginfo_t * ksiginfo_alloc(struct proc *p, ksiginfo_t *ok, int flags) { ksiginfo_t *kp; if (ok != NULL) { if ((ok->ksi_flags & (KSI_QUEUED | KSI_FROMPOOL)) == KSI_FROMPOOL) return ok; if (KSI_EMPTY_P(ok)) return ok; } kp = pool_cache_get(ksiginfo_cache, flags); if (kp == NULL) { #ifdef DIAGNOSTIC printf("Out of memory allocating ksiginfo for pid %d\n", p->p_pid); #endif return NULL; } if (ok != NULL) { memcpy(kp, ok, sizeof(*kp)); kp->ksi_flags &= ~KSI_QUEUED; } else KSI_INIT_EMPTY(kp); kp->ksi_flags |= KSI_FROMPOOL; return kp; } /* * ksiginfo_free: * * If the given ksiginfo_t is from the pool and has not been queued, * then free it. */ void ksiginfo_free(ksiginfo_t *kp) { if ((kp->ksi_flags & (KSI_QUEUED | KSI_FROMPOOL)) != KSI_FROMPOOL) return; pool_cache_put(ksiginfo_cache, kp); } /* * ksiginfo_queue_drain: * * Drain a non-empty ksiginfo_t queue. */ void ksiginfo_queue_drain0(ksiginfoq_t *kq) { ksiginfo_t *ksi; KASSERT(!TAILQ_EMPTY(kq)); while (!TAILQ_EMPTY(kq)) { ksi = TAILQ_FIRST(kq); TAILQ_REMOVE(kq, ksi, ksi_list); pool_cache_put(ksiginfo_cache, ksi); } } static int siggetinfo(sigpend_t *sp, ksiginfo_t *out, int signo) { ksiginfo_t *ksi, *nksi; if (sp == NULL) goto out; /* Find siginfo and copy it out. */ int count = 0; TAILQ_FOREACH_SAFE(ksi, &sp->sp_info, ksi_list, nksi) { if (ksi->ksi_signo != signo) continue; if (count++ > 0) /* Only remove the first, count all of them */ continue; TAILQ_REMOVE(&sp->sp_info, ksi, ksi_list); KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0); KASSERT((ksi->ksi_flags & KSI_QUEUED) != 0); ksi->ksi_flags &= ~KSI_QUEUED; if (out != NULL) { memcpy(out, ksi, sizeof(*out)); out->ksi_flags &= ~(KSI_FROMPOOL | KSI_QUEUED); } ksiginfo_free(ksi); } if (count) return count; out: /* If there is no siginfo, then manufacture it. */ if (out != NULL) { KSI_INIT(out); out->ksi_info._signo = signo; out->ksi_info._code = SI_NOINFO; } return 0; } /* * sigget: * * Fetch the first pending signal from a set. Optionally, also fetch * or manufacture a ksiginfo element. Returns the number of the first * pending signal, or zero. */ int sigget(sigpend_t *sp, ksiginfo_t *out, int signo, const sigset_t *mask) { sigset_t tset; int count; /* If there's no pending set, the signal is from the debugger. */ if (sp == NULL) goto out; /* Construct mask from signo, and 'mask'. */ if (signo == 0) { if (mask != NULL) { tset = *mask; __sigandset(&sp->sp_set, &tset); } else tset = sp->sp_set; /* If there are no signals pending - return. */ if ((signo = firstsig(&tset)) == 0) goto out; } else { KASSERT(sigismember(&sp->sp_set, signo)); } sigdelset(&sp->sp_set, signo); out: count = siggetinfo(sp, out, signo); if (count > 1) sigaddset(&sp->sp_set, signo); return signo; } /* * sigput: * * Append a new ksiginfo element to the list of pending ksiginfo's. */ static int sigput(sigpend_t *sp, struct proc *p, ksiginfo_t *ksi) { ksiginfo_t *kp; KASSERT(mutex_owned(p->p_lock)); KASSERT((ksi->ksi_flags & KSI_QUEUED) == 0); sigaddset(&sp->sp_set, ksi->ksi_signo); /* * If there is no siginfo, we are done. 
*/ if (KSI_EMPTY_P(ksi)) return 0; KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0); size_t count = 0; TAILQ_FOREACH(kp, &sp->sp_info, ksi_list) { count++; if (ksi->ksi_signo >= SIGRTMIN && ksi->ksi_signo <= SIGRTMAX) continue; if (kp->ksi_signo == ksi->ksi_signo) { KSI_COPY(ksi, kp); kp->ksi_flags |= KSI_QUEUED; return 0; } } if (count >= SIGQUEUE_MAX) { #ifdef DIAGNOSTIC printf("%s(%d): Signal queue is full signal=%d\n", p->p_comm, p->p_pid, ksi->ksi_signo); #endif return EAGAIN; } ksi->ksi_flags |= KSI_QUEUED; TAILQ_INSERT_TAIL(&sp->sp_info, ksi, ksi_list); return 0; } /* * sigclear: * * Clear all pending signals in the specified set. */ void sigclear(sigpend_t *sp, const sigset_t *mask, ksiginfoq_t *kq) { ksiginfo_t *ksi, *next; if (mask == NULL) sigemptyset(&sp->sp_set); else sigminusset(mask, &sp->sp_set); TAILQ_FOREACH_SAFE(ksi, &sp->sp_info, ksi_list, next) { if (mask == NULL || sigismember(mask, ksi->ksi_signo)) { TAILQ_REMOVE(&sp->sp_info, ksi, ksi_list); KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0); KASSERT((ksi->ksi_flags & KSI_QUEUED) != 0); TAILQ_INSERT_TAIL(kq, ksi, ksi_list); } } } /* * sigclearall: * * Clear all pending signals in the specified set from a process and * its LWPs. */ void sigclearall(struct proc *p, const sigset_t *mask, ksiginfoq_t *kq) { struct lwp *l; KASSERT(mutex_owned(p->p_lock)); sigclear(&p->p_sigpend, mask, kq); LIST_FOREACH(l, &p->p_lwps, l_sibling) { sigclear(&l->l_sigpend, mask, kq); } } /* * sigispending: * * Return the first signal number if there are pending signals for the * current LWP. May be called unlocked provided that LW_PENDSIG is set, * and that the signal has been posted to the appopriate queue before * LW_PENDSIG is set. * * This should only ever be called with (l == curlwp), unless the * result does not matter (procfs, sysctl). */ int sigispending(struct lwp *l, int signo) { struct proc *p = l->l_proc; sigset_t tset; membar_consumer(); tset = l->l_sigpend.sp_set; sigplusset(&p->p_sigpend.sp_set, &tset); sigminusset(&p->p_sigctx.ps_sigignore, &tset); sigminusset(&l->l_sigmask, &tset); if (signo == 0) { return firstsig(&tset); } return sigismember(&tset, signo) ? signo : 0; } void getucontext(struct lwp *l, ucontext_t *ucp) { struct proc *p = l->l_proc; KASSERT(mutex_owned(p->p_lock)); ucp->uc_flags = 0; ucp->uc_link = l->l_ctxlink; ucp->uc_sigmask = l->l_sigmask; ucp->uc_flags |= _UC_SIGMASK; /* * The (unsupplied) definition of the `current execution stack' * in the System V Interface Definition appears to allow returning * the main context stack. */ if ((l->l_sigstk.ss_flags & SS_ONSTACK) == 0) { ucp->uc_stack.ss_sp = (void *)l->l_proc->p_stackbase; ucp->uc_stack.ss_size = ctob(l->l_proc->p_vmspace->vm_ssize); ucp->uc_stack.ss_flags = 0; /* XXX, def. is Very Fishy */ } else { /* Simply copy alternate signal execution stack. */ ucp->uc_stack = l->l_sigstk; } ucp->uc_flags |= _UC_STACK; mutex_exit(p->p_lock); cpu_getmcontext(l, &ucp->uc_mcontext, &ucp->uc_flags); mutex_enter(p->p_lock); } int setucontext(struct lwp *l, const ucontext_t *ucp) { struct proc *p = l->l_proc; int error; KASSERT(mutex_owned(p->p_lock)); if ((ucp->uc_flags & _UC_SIGMASK) != 0) { error = sigprocmask1(l, SIG_SETMASK, &ucp->uc_sigmask, NULL); if (error != 0) return error; } mutex_exit(p->p_lock); error = cpu_setmcontext(l, &ucp->uc_mcontext, ucp->uc_flags); mutex_enter(p->p_lock); if (error != 0) return (error); l->l_ctxlink = ucp->uc_link; /* * If there was stack information, update whether or not we are * still running on an alternate signal stack. 
*/ if ((ucp->uc_flags & _UC_STACK) != 0) { if (ucp->uc_stack.ss_flags & SS_ONSTACK) l->l_sigstk.ss_flags |= SS_ONSTACK; else l->l_sigstk.ss_flags &= ~SS_ONSTACK; } return 0; } /* * killpg1: common code for kill process group/broadcast kill. */ int killpg1(struct lwp *l, ksiginfo_t *ksi, int pgid, int all) { struct proc *p, *cp; kauth_cred_t pc; struct pgrp *pgrp; int nfound; int signo = ksi->ksi_signo; cp = l->l_proc; pc = l->l_cred; nfound = 0; mutex_enter(&proc_lock); if (all) { /* * Broadcast. */ PROCLIST_FOREACH(p, &allproc) { if (p->p_pid <= 1 || p == cp || (p->p_flag & PK_SYSTEM) != 0) continue; mutex_enter(p->p_lock); if (kauth_authorize_process(pc, KAUTH_PROCESS_SIGNAL, p, KAUTH_ARG(signo), NULL, NULL) == 0) { nfound++; if (signo) kpsignal2(p, ksi); } mutex_exit(p->p_lock); } } else { if (pgid == 0) /* Zero pgid means send to my process group. */ pgrp = cp->p_pgrp; else { pgrp = pgrp_find(pgid); if (pgrp == NULL) goto out; } LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { if (p->p_pid <= 1 || p->p_flag & PK_SYSTEM) continue; mutex_enter(p->p_lock); if (kauth_authorize_process(pc, KAUTH_PROCESS_SIGNAL, p, KAUTH_ARG(signo), NULL, NULL) == 0) { nfound++; if (signo && P_ZOMBIE(p) == 0) kpsignal2(p, ksi); } mutex_exit(p->p_lock); } } out: mutex_exit(&proc_lock); return nfound ? 0 : ESRCH; } /* * Send a signal to a process group. If checktty is set, limit to members * which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty) { ksiginfo_t ksi; KASSERT(!cpu_intr_p()); KASSERT(mutex_owned(&proc_lock)); KSI_INIT_EMPTY(&ksi); ksi.ksi_signo = sig; kpgsignal(pgrp, &ksi, NULL, checkctty); } void kpgsignal(struct pgrp *pgrp, ksiginfo_t *ksi, void *data, int checkctty) { struct proc *p; KASSERT(!cpu_intr_p()); KASSERT(mutex_owned(&proc_lock)); KASSERT(pgrp != NULL); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) if (checkctty == 0 || p->p_lflag & PL_CONTROLT) kpsignal(p, ksi, data); } /* * Send a signal caused by a trap to the current LWP. If it will be caught * immediately, deliver it with correct code. Otherwise, post it normally. */ void trapsignal(struct lwp *l, ksiginfo_t *ksi) { struct proc *p; struct sigacts *ps; int signo = ksi->ksi_signo; sigset_t *mask; sig_t action; KASSERT(KSI_TRAP_P(ksi)); ksi->ksi_lid = l->l_lid; p = l->l_proc; KASSERT(!cpu_intr_p()); mutex_enter(&proc_lock); mutex_enter(p->p_lock); repeat: /* * If we are exiting, demise now. * * This avoids notifying tracer and deadlocking. */ if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) { mutex_exit(p->p_lock); mutex_exit(&proc_lock); lwp_exit(l); panic("trapsignal"); /* NOTREACHED */ } /* * The process is already stopping. 
*/ if ((p->p_sflag & PS_STOPPING) != 0) { mutex_exit(&proc_lock); sigswitch_unlock_and_switch_away(l); mutex_enter(&proc_lock); mutex_enter(p->p_lock); goto repeat; } mask = &l->l_sigmask; ps = p->p_sigacts; action = SIGACTION_PS(ps, signo).sa_handler; if (ISSET(p->p_slflag, PSL_TRACED) && !(p->p_pptr == p->p_opptr && ISSET(p->p_lflag, PL_PPWAIT)) && p->p_xsig != SIGKILL && !sigismember(&p->p_sigpend.sp_set, SIGKILL)) { p->p_xsig = signo; p->p_sigctx.ps_faked = true; p->p_sigctx.ps_lwp = ksi->ksi_lid; p->p_sigctx.ps_info = ksi->ksi_info; sigswitch(0, signo, true); if (ktrpoint(KTR_PSIG)) { if (p->p_emul->e_ktrpsig) p->p_emul->e_ktrpsig(signo, action, mask, ksi); else ktrpsig(signo, action, mask, ksi); } return; } const bool caught = sigismember(&p->p_sigctx.ps_sigcatch, signo); const bool masked = sigismember(mask, signo); if (caught && !masked) { mutex_exit(&proc_lock); l->l_ru.ru_nsignals++; kpsendsig(l, ksi, mask); mutex_exit(p->p_lock); if (ktrpoint(KTR_PSIG)) { if (p->p_emul->e_ktrpsig) p->p_emul->e_ktrpsig(signo, action, mask, ksi); else ktrpsig(signo, action, mask, ksi); } return; } /* * If the signal is masked or ignored, then unmask it and * reset it to the default action so that the process or * its tracer will be notified. */ const bool ignored = action == SIG_IGN; if (masked || ignored) { mutex_enter(&ps->sa_mutex); sigdelset(mask, signo); sigdelset(&p->p_sigctx.ps_sigcatch, signo); sigdelset(&p->p_sigctx.ps_sigignore, signo); sigdelset(&SIGACTION_PS(ps, signo).sa_mask, signo); SIGACTION_PS(ps, signo).sa_handler = SIG_DFL; mutex_exit(&ps->sa_mutex); } kpsignal2(p, ksi); mutex_exit(p->p_lock); mutex_exit(&proc_lock); } /* * Fill in signal information and signal the parent for a child status change. */ void child_psignal(struct proc *p, int mask) { ksiginfo_t ksi; struct proc *q; int xsig; KASSERT(mutex_owned(&proc_lock)); KASSERT(mutex_owned(p->p_lock)); xsig = p->p_xsig; KSI_INIT(&ksi); ksi.ksi_signo = SIGCHLD; ksi.ksi_code = (xsig == SIGCONT ? CLD_CONTINUED : CLD_STOPPED); ksi.ksi_pid = p->p_pid; ksi.ksi_uid = kauth_cred_geteuid(p->p_cred); ksi.ksi_status = xsig; ksi.ksi_utime = p->p_stats->p_ru.ru_utime.tv_sec; ksi.ksi_stime = p->p_stats->p_ru.ru_stime.tv_sec; q = p->p_pptr; mutex_exit(p->p_lock); mutex_enter(q->p_lock); if ((q->p_sflag & mask) == 0) kpsignal2(q, &ksi); mutex_exit(q->p_lock); mutex_enter(p->p_lock); } void psignal(struct proc *p, int signo) { ksiginfo_t ksi; KASSERT(!cpu_intr_p()); KASSERT(mutex_owned(&proc_lock)); KSI_INIT_EMPTY(&ksi); ksi.ksi_signo = signo; mutex_enter(p->p_lock); kpsignal2(p, &ksi); mutex_exit(p->p_lock); } void kpsignal(struct proc *p, ksiginfo_t *ksi, void *data) { fdfile_t *ff; file_t *fp; fdtab_t *dt; KASSERT(!cpu_intr_p()); KASSERT(mutex_owned(&proc_lock)); if ((p->p_sflag & PS_WEXIT) == 0 && data) { size_t fd; filedesc_t *fdp = p->p_fd; /* XXXSMP locking */ ksi->ksi_fd = -1; dt = atomic_load_consume(&fdp->fd_dt); for (fd = 0; fd < dt->dt_nfiles; fd++) { if ((ff = dt->dt_ff[fd]) == NULL) continue; if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) continue; if (fp->f_data == data) { ksi->ksi_fd = fd; break; } } } mutex_enter(p->p_lock); kpsignal2(p, ksi); mutex_exit(p->p_lock); } /* * sigismasked: * * Returns true if signal is ignored or masked for the specified LWP. */ int sigismasked(struct lwp *l, int sig) { struct proc *p = l->l_proc; return sigismember(&p->p_sigctx.ps_sigignore, sig) || sigismember(&l->l_sigmask, sig); } /* * sigpost: * * Post a pending signal to an LWP. 
Returns non-zero if the LWP may * be able to take the signal. */ static int sigpost(struct lwp *l, sig_t action, int prop, int sig) { int rv, masked; struct proc *p = l->l_proc; KASSERT(mutex_owned(p->p_lock)); /* * If the LWP is on the way out, sigclear() will be busy draining all * pending signals. Don't give it more. */ if (l->l_stat == LSZOMB) return 0; SDT_PROBE(proc, kernel, , signal__send, l, p, sig, 0, 0); lwp_lock(l); if (__predict_false((l->l_flag & LW_DBGSUSPEND) != 0)) { if ((prop & SA_KILL) != 0) l->l_flag &= ~LW_DBGSUSPEND; else { lwp_unlock(l); return 0; } } /* * Have the LWP check for signals. This ensures that even if no LWP * is found to take the signal immediately, it should be taken soon. */ signotify(l); /* * SIGCONT can be masked, but if LWP is stopped, it needs restart. * Note: SIGKILL and SIGSTOP cannot be masked. */ masked = sigismember(&l->l_sigmask, sig); if (masked && ((prop & SA_CONT) == 0 || l->l_stat != LSSTOP)) { lwp_unlock(l); return 0; } /* * If killing the process, make it run fast. */ if (__predict_false((prop & SA_KILL) != 0) && action == SIG_DFL && l->l_priority < MAXPRI_USER) { KASSERT(l->l_class == SCHED_OTHER); lwp_changepri(l, MAXPRI_USER); } /* * If the LWP is running or on a run queue, then we win. If it's * sleeping interruptably, wake it and make it take the signal. If * the sleep isn't interruptable, then the chances are it will get * to see the signal soon anyhow. If suspended, it can't take the * signal right now. If it's LWP private or for all LWPs, save it * for later; otherwise punt. */ rv = 0; switch (l->l_stat) { case LSRUN: case LSONPROC: rv = 1; break; case LSSLEEP: if ((l->l_flag & LW_SINTR) != 0) { /* setrunnable() will release the lock. */ setrunnable(l); return 1; } break; case LSSUSPENDED: if ((prop & SA_KILL) != 0 && (l->l_flag & LW_WCORE) != 0) { /* lwp_continue() will release the lock. */ lwp_continue(l); return 1; } break; case LSSTOP: if ((prop & SA_STOP) != 0) break; /* * If the LWP is stopped and we are sending a continue * signal, then start it again. */ if ((prop & SA_CONT) != 0) { if (l->l_wchan != NULL) { l->l_stat = LSSLEEP; p->p_nrlwps++; rv = 1; break; } /* setrunnable() will release the lock. */ setrunnable(l); return 1; } else if (l->l_wchan == NULL || (l->l_flag & LW_SINTR) != 0) { /* setrunnable() will release the lock. */ setrunnable(l); return 1; } break; default: break; } lwp_unlock(l); return rv; } /* * Notify an LWP that it has a pending signal. */ void signotify(struct lwp *l) { KASSERT(lwp_locked(l, NULL)); l->l_flag |= LW_PENDSIG; lwp_need_userret(l); } /* * Find an LWP within process p that is waiting on signal ksi, and hand * it on. */ static int sigunwait(struct proc *p, const ksiginfo_t *ksi) { struct lwp *l; int signo; KASSERT(mutex_owned(p->p_lock)); signo = ksi->ksi_signo; if (ksi->ksi_lid != 0) { /* * Signal came via _lwp_kill(). Find the LWP and see if * it's interested. */ if ((l = lwp_find(p, ksi->ksi_lid)) == NULL) return 0; if (l->l_sigwaited == NULL || !sigismember(&l->l_sigwaitset, signo)) return 0; } else { /* * Look for any LWP that may be interested. */ LIST_FOREACH(l, &p->p_sigwaiters, l_sigwaiter) { KASSERT(l->l_sigwaited != NULL); if (sigismember(&l->l_sigwaitset, signo)) break; } } if (l != NULL) { l->l_sigwaited->ksi_info = ksi->ksi_info; l->l_sigwaited = NULL; LIST_REMOVE(l, l_sigwaiter); cv_signal(&l->l_sigcv); return 1; } return 0; } /* * Send the signal to the process. 
If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. */ int kpsignal2(struct proc *p, ksiginfo_t *ksi) { int prop, signo = ksi->ksi_signo; struct lwp *l = NULL; ksiginfo_t *kp; lwpid_t lid; sig_t action; bool toall; bool traced; int error = 0; KASSERT(!cpu_intr_p()); KASSERT(mutex_owned(&proc_lock)); KASSERT(mutex_owned(p->p_lock)); KASSERT((ksi->ksi_flags & KSI_QUEUED) == 0); KASSERT(signo > 0); KASSERT(signo < NSIG); /* * If the process is being created by fork, is a zombie or is * exiting, then just drop the signal here and bail out. */ if (p->p_stat != SACTIVE && p->p_stat != SSTOP) return 0; /* * Notify any interested parties of the signal. */ KNOTE(&p->p_klist, NOTE_SIGNAL | signo); /* * Some signals including SIGKILL must act on the entire process. */ kp = NULL; prop = sigprop[signo]; toall = ((prop & SA_TOALL) != 0); lid = toall ? 0 : ksi->ksi_lid; traced = ISSET(p->p_slflag, PSL_TRACED) && !sigismember(&p->p_sigctx.ps_sigpass, signo); /* * If proc is traced, always give parent a chance. */ if (traced) { action = SIG_DFL; if (lid == 0) { /* * If the process is being traced and the signal * is being caught, make sure to save any ksiginfo. */ if ((kp = ksiginfo_alloc(p, ksi, PR_NOWAIT)) == NULL) goto discard; if ((error = sigput(&p->p_sigpend, p, kp)) != 0) goto out; } } else { /* * If the signal is being ignored, then drop it. Note: we * don't set SIGCONT in ps_sigignore, and if it is set to * SIG_IGN, action will be SIG_DFL here. */ if (sigismember(&p->p_sigctx.ps_sigignore, signo)) goto discard; else if (sigismember(&p->p_sigctx.ps_sigcatch, signo)) action = SIG_CATCH; else { action = SIG_DFL; /* * If sending a tty stop signal to a member of an * orphaned process group, discard the signal here if * the action is default; don't stop the process below * if sleeping, and don't clear any pending SIGCONT. */ if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0) goto discard; if (prop & SA_KILL && p->p_nice > NZERO) p->p_nice = NZERO; } } /* * If stopping or continuing a process, discard any pending * signals that would do the inverse. */ if ((prop & (SA_CONT | SA_STOP)) != 0) { ksiginfoq_t kq; ksiginfo_queue_init(&kq); if ((prop & SA_CONT) != 0) sigclear(&p->p_sigpend, &stopsigmask, &kq); if ((prop & SA_STOP) != 0) sigclear(&p->p_sigpend, &contsigmask, &kq); ksiginfo_queue_drain(&kq); /* XXXSMP */ } /* * If the signal doesn't have SA_CANTMASK (no override for SIGKILL, * please!), check if any LWPs are waiting on it. If yes, pass on * the signal info. The signal won't be processed further here. */ if ((prop & SA_CANTMASK) == 0 && !LIST_EMPTY(&p->p_sigwaiters) && p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0 && sigunwait(p, ksi)) goto discard; /* * XXXSMP Should be allocated by the caller, we're holding locks * here. */ if (kp == NULL && (kp = ksiginfo_alloc(p, ksi, PR_NOWAIT)) == NULL) goto discard; /* * LWP private signals are easy - just find the LWP and post * the signal to it. 
*/ if (lid != 0) { l = lwp_find(p, lid); if (l != NULL) { if ((error = sigput(&l->l_sigpend, p, kp)) != 0) goto out; membar_producer(); if (sigpost(l, action, prop, kp->ksi_signo) != 0) signo = -1; } goto out; } /* * Some signals go to all LWPs, even if posted with _lwp_kill() * or for an SA process. */ if (p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0) { if (traced) goto deliver; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SA_CONT) != 0 && action == SIG_DFL) goto out; } else { /* * Process is stopped or stopping. * - If traced, then no action is needed, unless killing. * - Run the process only if sending SIGCONT or SIGKILL. */ if (traced && signo != SIGKILL) { goto out; } if ((prop & SA_CONT) != 0 || signo == SIGKILL) { /* * Re-adjust p_nstopchild if the process was * stopped but not yet collected by its parent. */ if (p->p_stat == SSTOP && !p->p_waited) p->p_pptr->p_nstopchild--; p->p_stat = SACTIVE; p->p_sflag &= ~PS_STOPPING; if (traced) { KASSERT(signo == SIGKILL); goto deliver; } /* * Do not make signal pending if SIGCONT is default. * * If the process catches SIGCONT, let it handle the * signal itself (if waiting on event - process runs, * otherwise continues sleeping). */ if ((prop & SA_CONT) != 0) { p->p_xsig = SIGCONT; p->p_sflag |= PS_CONTINUED; child_psignal(p, 0); if (action == SIG_DFL) { KASSERT(signo != SIGKILL); goto deliver; } } } else if ((prop & SA_STOP) != 0) { /* * Already stopped, don't need to stop again. * (If we did the shell could get confused.) */ goto out; } } /* * Make signal pending. */ KASSERT(!traced); if ((error = sigput(&p->p_sigpend, p, kp)) != 0) goto out; deliver: /* * Before we set LW_PENDSIG on any LWP, ensure that the signal is * visible on the per process list (for sigispending()). This * is unlikely to be needed in practice, but... */ membar_producer(); /* * Try to find an LWP that can take the signal. */ LIST_FOREACH(l, &p->p_lwps, l_sibling) { if (sigpost(l, action, prop, kp->ksi_signo) && !toall) break; } signo = -1; out: /* * If the ksiginfo wasn't used, then bin it. XXXSMP freeing memory * with locks held. The caller should take care of this. */ ksiginfo_free(kp); if (signo == -1) return error; discard: SDT_PROBE(proc, kernel, , signal__discard, l, p, signo, 0, 0); return error; } void kpsendsig(struct lwp *l, const ksiginfo_t *ksi, const sigset_t *mask) { struct proc *p = l->l_proc; KASSERT(mutex_owned(p->p_lock)); (*p->p_emul->e_sendsig)(ksi, mask); } /* * Stop any LWPs sleeping interruptably. */ static void proc_stop_lwps(struct proc *p) { struct lwp *l; KASSERT(mutex_owned(p->p_lock)); KASSERT((p->p_sflag & PS_STOPPING) != 0); LIST_FOREACH(l, &p->p_lwps, l_sibling) { lwp_lock(l); if (l->l_stat == LSSLEEP && (l->l_flag & LW_SINTR) != 0) { l->l_stat = LSSTOP; p->p_nrlwps--; } lwp_unlock(l); } } /* * Finish stopping of a process. Mark it stopped and notify the parent. * * Drop p_lock briefly if ppsig is true. */ static void proc_stop_done(struct proc *p, int ppmask) { KASSERT(mutex_owned(&proc_lock)); KASSERT(mutex_owned(p->p_lock)); KASSERT((p->p_sflag & PS_STOPPING) != 0); KASSERT(p->p_nrlwps == 0 || p->p_nrlwps == 1); KASSERT(p->p_nrlwps == 0 || p == curproc); p->p_sflag &= ~PS_STOPPING; p->p_stat = SSTOP; p->p_waited = 0; p->p_pptr->p_nstopchild++; /* child_psignal drops p_lock briefly. 
*/ child_psignal(p, ppmask); cv_broadcast(&p->p_pptr->p_waitcv); } /* * Stop the current process and switch away to the debugger notifying * an event specific to a traced process only. */ void eventswitch(int code, int pe_report_event, int entity) { struct lwp *l = curlwp; struct proc *p = l->l_proc; struct sigacts *ps; sigset_t *mask; sig_t action; ksiginfo_t ksi; const int signo = SIGTRAP; KASSERT(mutex_owned(&proc_lock)); KASSERT(mutex_owned(p->p_lock)); KASSERT(p->p_pptr != initproc); KASSERT(l->l_stat == LSONPROC); KASSERT(ISSET(p->p_slflag, PSL_TRACED)); KASSERT(!ISSET(l->l_flag, LW_SYSTEM)); KASSERT(p->p_nrlwps > 0); KASSERT((code == TRAP_CHLD) || (code == TRAP_LWP) || (code == TRAP_EXEC)); KASSERT((code != TRAP_CHLD) || (entity > 1)); /* prevent pid1 */ KASSERT((code != TRAP_LWP) || (entity > 0)); repeat: /* * If we are exiting, demise now. * * This avoids notifying tracer and deadlocking. */ if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) { mutex_exit(p->p_lock); mutex_exit(&proc_lock); if (pe_report_event == PTRACE_LWP_EXIT) { /* Avoid double lwp_exit() and panic. */ return; } lwp_exit(l); panic("eventswitch"); /* NOTREACHED */ } /* * If we are no longer traced, abandon this event signal. * * This avoids killing a process after detaching the debugger. */ if (__predict_false(!ISSET(p->p_slflag, PSL_TRACED))) { mutex_exit(p->p_lock); mutex_exit(&proc_lock); return; } /* * If there's a pending SIGKILL process it immediately. */ if (p->p_xsig == SIGKILL || sigismember(&p->p_sigpend.sp_set, SIGKILL)) { mutex_exit(p->p_lock); mutex_exit(&proc_lock); return; } /* * The process is already stopping. */ if ((p->p_sflag & PS_STOPPING) != 0) { mutex_exit(&proc_lock); sigswitch_unlock_and_switch_away(l); mutex_enter(&proc_lock); mutex_enter(p->p_lock); goto repeat; } KSI_INIT_TRAP(&ksi); ksi.ksi_lid = l->l_lid; ksi.ksi_signo = signo; ksi.ksi_code = code; ksi.ksi_pe_report_event = pe_report_event; CTASSERT(sizeof(ksi.ksi_pe_other_pid) == sizeof(ksi.ksi_pe_lwp)); ksi.ksi_pe_other_pid = entity; /* Needed for ktrace */ ps = p->p_sigacts; action = SIGACTION_PS(ps, signo).sa_handler; mask = &l->l_sigmask; p->p_xsig = signo; p->p_sigctx.ps_faked = true; p->p_sigctx.ps_lwp = ksi.ksi_lid; p->p_sigctx.ps_info = ksi.ksi_info; sigswitch(0, signo, true); if (code == TRAP_CHLD) { mutex_enter(&proc_lock); while (l->l_vforkwaiting) cv_wait(&l->l_waitcv, &proc_lock); mutex_exit(&proc_lock); } if (ktrpoint(KTR_PSIG)) { if (p->p_emul->e_ktrpsig) p->p_emul->e_ktrpsig(signo, action, mask, &ksi); else ktrpsig(signo, action, mask, &ksi); } } void eventswitchchild(struct proc *p, int code, int pe_report_event) { mutex_enter(&proc_lock); mutex_enter(p->p_lock); if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) != (PSL_TRACED|PSL_TRACEDCHILD)) { mutex_exit(p->p_lock); mutex_exit(&proc_lock); return; } eventswitch(code, pe_report_event, p->p_oppid); } /* * Stop the current process and switch away when being stopped or traced. */ static void sigswitch(int ppmask, int signo, bool proc_lock_held) { struct lwp *l = curlwp; struct proc *p = l->l_proc; KASSERT(mutex_owned(p->p_lock)); KASSERT(l->l_stat == LSONPROC); KASSERT(p->p_nrlwps > 0); if (proc_lock_held) { KASSERT(mutex_owned(&proc_lock)); } else { KASSERT(!mutex_owned(&proc_lock)); } /* * On entry we know that the process needs to stop. If it's * the result of a 'sideways' stop signal that has been sourced * through issignal(), then stop other LWPs in the process too. 
*/ if (p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0) { KASSERT(signo != 0); proc_stop(p, signo); KASSERT(p->p_nrlwps > 0); } /* * If we are the last live LWP, and the stop was a result of * a new signal, then signal the parent. */ if ((p->p_sflag & PS_STOPPING) != 0) { if (!proc_lock_held && !mutex_tryenter(&proc_lock)) { mutex_exit(p->p_lock); mutex_enter(&proc_lock); mutex_enter(p->p_lock); } if (p->p_nrlwps == 1 && (p->p_sflag & PS_STOPPING) != 0) { /* * Note that proc_stop_done() can drop * p->p_lock briefly. */ proc_stop_done(p, ppmask); } mutex_exit(&proc_lock); } sigswitch_unlock_and_switch_away(l); } /* * Unlock and switch away. */ static void sigswitch_unlock_and_switch_away(struct lwp *l) { struct proc *p; p = l->l_proc; KASSERT(mutex_owned(p->p_lock)); KASSERT(!mutex_owned(&proc_lock)); KASSERT(l->l_stat == LSONPROC); KASSERT(p->p_nrlwps > 0); KASSERT(l->l_blcnt == 0); if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) { p->p_nrlwps--; lwp_lock(l); KASSERT(l->l_stat == LSONPROC || l->l_stat == LSSLEEP); l->l_stat = LSSTOP; lwp_unlock(l); } mutex_exit(p->p_lock); lwp_lock(l); spc_lock(l->l_cpu); mi_switch(l); } /* * Check for a signal from the debugger. */ static int sigchecktrace(void) { struct lwp *l = curlwp; struct proc *p = l->l_proc; int signo; KASSERT(mutex_owned(p->p_lock)); /* If there's a pending SIGKILL, process it immediately. */ if (sigismember(&p->p_sigpend.sp_set, SIGKILL)) return 0; /* * If we are no longer being traced, or the parent didn't * give us a signal, or we're stopping, look for more signals. */ if ((p->p_slflag & PSL_TRACED) == 0 || p->p_xsig == 0 || (p->p_sflag & PS_STOPPING) != 0) return 0; /* * If the new signal is being masked, look for other signals. * `p->p_sigctx.ps_siglist |= mask' is done in setrunnable(). */ signo = p->p_xsig; p->p_xsig = 0; if (sigismember(&l->l_sigmask, signo)) { signo = 0; } return signo; } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap. * * We will also return -1 if the process is exiting and the current LWP must * follow suit. */ int issignal(struct lwp *l) { struct proc *p; int siglwp, signo, prop; sigpend_t *sp; sigset_t ss; bool traced; p = l->l_proc; sp = NULL; signo = 0; KASSERT(p == curproc); KASSERT(mutex_owned(p->p_lock)); for (;;) { /* Discard any signals that we have decided not to take. */ if (signo != 0) { (void)sigget(sp, NULL, signo, NULL); } /* * If the process is stopped/stopping, then stop ourselves * now that we're on the kernel/userspace boundary. When * we awaken, check for a signal from the debugger. */ if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) { sigswitch_unlock_and_switch_away(l); mutex_enter(p->p_lock); continue; } else if (p->p_stat == SACTIVE) signo = sigchecktrace(); else signo = 0; /* Signals from the debugger are "out of band". */ sp = NULL; /* * If the debugger didn't provide a signal, find a pending * signal from our set. Check per-LWP signals first, and * then per-process. 
*/ if (signo == 0) { sp = &l->l_sigpend; ss = sp->sp_set; siglwp = l->l_lid; if ((p->p_lflag & PL_PPWAIT) != 0) sigminusset(&vforksigmask, &ss); sigminusset(&l->l_sigmask, &ss); if ((signo = firstsig(&ss)) == 0) { sp = &p->p_sigpend; ss = sp->sp_set; siglwp = 0; if ((p->p_lflag & PL_PPWAIT) != 0) sigminusset(&vforksigmask, &ss); sigminusset(&l->l_sigmask, &ss); if ((signo = firstsig(&ss)) == 0) { /* * No signal pending - clear the * indicator and bail out. */ lwp_lock(l); l->l_flag &= ~LW_PENDSIG; lwp_unlock(l); sp = NULL; break; } } } traced = ISSET(p->p_slflag, PSL_TRACED) && !sigismember(&p->p_sigctx.ps_sigpass, signo); if (sp) { /* Overwrite process' signal context to correspond * to the currently reported LWP. This is necessary * for PT_GET_SIGINFO to report the correct signal when * multiple LWPs have pending signals. We do this only * when the signal comes from the queue, for signals * created by the debugger we assume it set correct * siginfo. */ ksiginfo_t *ksi = TAILQ_FIRST(&sp->sp_info); if (ksi) { p->p_sigctx.ps_lwp = ksi->ksi_lid; p->p_sigctx.ps_info = ksi->ksi_info; } else { p->p_sigctx.ps_lwp = siglwp; memset(&p->p_sigctx.ps_info, 0, sizeof(p->p_sigctx.ps_info)); p->p_sigctx.ps_info._signo = signo; p->p_sigctx.ps_info._code = SI_NOINFO; } } /* * We should see pending but ignored signals only if * we are being traced. */ if (sigismember(&p->p_sigctx.ps_sigignore, signo) && !traced) { /* Discard the signal. */ continue; } /* * If traced, always stop, and stay stopped until released * by the debugger. If the our parent is our debugger waiting * for us and we vforked, don't hang as we could deadlock. */ if (traced && signo != SIGKILL && !(ISSET(p->p_lflag, PL_PPWAIT) && (p->p_pptr == p->p_opptr))) { /* * Take the signal, but don't remove it from the * siginfo queue, because the debugger can send * it later. */ if (sp) sigdelset(&sp->sp_set, signo); p->p_xsig = signo; /* Handling of signal trace */ sigswitch(0, signo, false); mutex_enter(p->p_lock); /* Check for a signal from the debugger. */ if ((signo = sigchecktrace()) == 0) continue; /* Signals from the debugger are "out of band". */ sp = NULL; } prop = sigprop[signo]; /* * Decide whether the signal should be returned. */ switch ((long)SIGACTION(p, signo).sa_handler) { case (long)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf_nolog("Process (pid %d) got sig %d\n", p->p_pid, signo); #endif continue; } /* * If there is a pending stop signal to process with * default action, stop here, then clear the signal. * However, if process is member of an orphaned * process group, ignore tty stop signals. */ if (prop & SA_STOP) { /* * XXX Don't hold proc_lock for p_lflag, * but it's not a big deal. */ if ((traced && !(ISSET(p->p_lflag, PL_PPWAIT) && (p->p_pptr == p->p_opptr))) || ((p->p_lflag & PL_ORPHANPG) != 0 && prop & SA_TTYSTOP)) { /* Ignore the signal. */ continue; } /* Take the signal. */ (void)sigget(sp, NULL, signo, NULL); p->p_xsig = signo; p->p_sflag &= ~PS_CONTINUED; signo = 0; sigswitch(PS_NOCLDSTOP, p->p_xsig, false); mutex_enter(p->p_lock); } else if (prop & SA_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ continue; } break; case (long)SIG_IGN: #ifdef DEBUG_ISSIGNAL /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. 
*/ if ((prop & SA_CONT) == 0 && !traced) printf_nolog("issignal\n"); #endif continue; default: /* * This signal has an action, let postsig() process * it. */ break; } break; } l->l_sigpendset = sp; return signo; } /* * Take the action for the specified signal * from the current set of pending signals. */ void postsig(int signo) { struct lwp *l; struct proc *p; struct sigacts *ps; sig_t action; sigset_t *returnmask; ksiginfo_t ksi; l = curlwp; p = l->l_proc; ps = p->p_sigacts; KASSERT(mutex_owned(p->p_lock)); KASSERT(signo > 0); /* * Set the new mask value and also defer further occurrences of this * signal. * * Special case: user has done a sigsuspend. Here the current mask is * not of interest, but rather the mask from before the sigsuspend is * what we want restored after the signal processing is completed. */ if (l->l_sigrestore) { returnmask = &l->l_sigoldmask; l->l_sigrestore = 0; } else returnmask = &l->l_sigmask; /* * Commit to taking the signal before releasing the mutex. */ action = SIGACTION_PS(ps, signo).sa_handler; l->l_ru.ru_nsignals++; if (l->l_sigpendset == NULL) { /* From the debugger */ if (p->p_sigctx.ps_faked && signo == p->p_sigctx.ps_info._signo) { KSI_INIT(&ksi); ksi.ksi_info = p->p_sigctx.ps_info; ksi.ksi_lid = p->p_sigctx.ps_lwp; p->p_sigctx.ps_faked = false; } else { if (!siggetinfo(&l->l_sigpend, &ksi, signo)) (void)siggetinfo(&p->p_sigpend, &ksi, signo); } } else sigget(l->l_sigpendset, &ksi, signo, NULL); if (ktrpoint(KTR_PSIG)) { mutex_exit(p->p_lock); if (p->p_emul->e_ktrpsig) p->p_emul->e_ktrpsig(signo, action, returnmask, &ksi); else ktrpsig(signo, action, returnmask, &ksi); mutex_enter(p->p_lock); } SDT_PROBE(proc, kernel, , signal__handle, signo, &ksi, action, 0, 0); if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ sigexit(l, signo); return; } /* * If we get here, the signal must be caught. */ #ifdef DIAGNOSTIC if (action == SIG_IGN || sigismember(&l->l_sigmask, signo)) panic("postsig action"); #endif kpsendsig(l, &ksi, returnmask); } /* * sendsig: * * Default signal delivery method for NetBSD. */ void sendsig(const struct ksiginfo *ksi, const sigset_t *mask) { struct sigacts *sa; int sig; sig = ksi->ksi_signo; sa = curproc->p_sigacts; switch (sa->sa_sigdesc[sig].sd_vers) { case __SIGTRAMP_SIGCODE_VERSION: #ifdef __HAVE_STRUCT_SIGCONTEXT case __SIGTRAMP_SIGCONTEXT_VERSION_MIN ... __SIGTRAMP_SIGCONTEXT_VERSION_MAX: /* Compat for 1.6 and earlier. */ MODULE_HOOK_CALL_VOID(sendsig_sigcontext_16_hook, (ksi, mask), break); return; #endif /* __HAVE_STRUCT_SIGCONTEXT */ case __SIGTRAMP_SIGINFO_VERSION_MIN ... __SIGTRAMP_SIGINFO_VERSION_MAX: sendsig_siginfo(ksi, mask); return; default: break; } printf("sendsig: bad version %d\n", sa->sa_sigdesc[sig].sd_vers); sigexit(curlwp, SIGILL); } /* * sendsig_reset: * * Reset the signal action. Called from emulation specific sendsig() * before unlocking to deliver the signal. 
*/ void sendsig_reset(struct lwp *l, int signo) { struct proc *p = l->l_proc; struct sigacts *ps = p->p_sigacts; KASSERT(mutex_owned(p->p_lock)); p->p_sigctx.ps_lwp = 0; memset(&p->p_sigctx.ps_info, 0, sizeof(p->p_sigctx.ps_info)); mutex_enter(&ps->sa_mutex); sigplusset(&SIGACTION_PS(ps, signo).sa_mask, &l->l_sigmask); if (SIGACTION_PS(ps, signo).sa_flags & SA_RESETHAND) { sigdelset(&p->p_sigctx.ps_sigcatch, signo); if (signo != SIGCONT && sigprop[signo] & SA_IGNORE) sigaddset(&p->p_sigctx.ps_sigignore, signo); SIGACTION_PS(ps, signo).sa_handler = SIG_DFL; } mutex_exit(&ps->sa_mutex); } /* * Kill the current process for stated reason. */ void killproc(struct proc *p, const char *why) { KASSERT(mutex_owned(&proc_lock)); log(LOG_ERR, "pid %d was killed: %s\n", p->p_pid, why); uprintf_locked("sorry, pid %d was killed: %s\n", p->p_pid, why); psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught * signals, allowing unrecoverable failures to terminate the process without * changing signal state. Mark the accounting record with the signal * termination. If dumping core, save the signal number for the debugger. * Calls exit and does not return. */ void sigexit(struct lwp *l, int signo) { int exitsig, error, docore; struct proc *p; struct lwp *t; p = l->l_proc; KASSERT(mutex_owned(p->p_lock)); KASSERT(l->l_blcnt == 0); /* * Don't permit coredump() multiple times in the same process. * Call back into sigexit, where we will be suspended until * the deed is done. Note that this is a recursive call, but * LW_WCORE will prevent us from coming back this way. */ if ((p->p_sflag & PS_WCORE) != 0) { lwp_lock(l); l->l_flag |= (LW_WCORE | LW_WEXIT | LW_WSUSPEND); lwp_need_userret(l); lwp_unlock(l); mutex_exit(p->p_lock); lwp_userret(l); panic("sigexit 1"); /* NOTREACHED */ } /* If process is already on the way out, then bail now. */ if ((p->p_sflag & PS_WEXIT) != 0) { mutex_exit(p->p_lock); lwp_exit(l); panic("sigexit 2"); /* NOTREACHED */ } /* * Prepare all other LWPs for exit. If dumping core, suspend them * so that their registers are available long enough to be dumped. */ if ((docore = (sigprop[signo] & SA_CORE)) != 0) { p->p_sflag |= PS_WCORE; for (;;) { LIST_FOREACH(t, &p->p_lwps, l_sibling) { lwp_lock(t); if (t == l) { t->l_flag &= ~(LW_WSUSPEND | LW_DBGSUSPEND); lwp_unlock(t); continue; } t->l_flag |= (LW_WCORE | LW_WEXIT); lwp_need_userret(t); lwp_suspend(l, t); } if (p->p_nrlwps == 1) break; /* * Kick any LWPs sitting in lwp_wait1(), and wait * for everyone else to stop before proceeding. */ p->p_nlwpwait++; cv_broadcast(&p->p_lwpcv); cv_wait(&p->p_lwpcv, p->p_lock); p->p_nlwpwait--; } } exitsig = signo; p->p_acflag |= AXSIG; memset(&p->p_sigctx.ps_info, 0, sizeof(p->p_sigctx.ps_info)); p->p_sigctx.ps_info._signo = signo; p->p_sigctx.ps_info._code = SI_NOINFO; if (docore) { mutex_exit(p->p_lock); MODULE_HOOK_CALL(coredump_hook, (l, NULL), enosys(), error); if (kern_logsigexit) { int uid = l->l_cred ? (int)kauth_cred_geteuid(l->l_cred) : -1; if (error) log(LOG_INFO, lognocoredump, p->p_pid, p->p_comm, uid, signo, error); else log(LOG_INFO, logcoredump, p->p_pid, p->p_comm, uid, signo); } #ifdef PAX_SEGVGUARD rw_enter(&exec_lock, RW_WRITER); pax_segvguard(l, p->p_textvp, p->p_comm, true); rw_exit(&exec_lock); #endif /* PAX_SEGVGUARD */ /* Acquire the sched state mutex. exit1() will release it. 
*/ mutex_enter(p->p_lock); if (error == 0) p->p_sflag |= PS_COREDUMP; } /* No longer dumping core. */ p->p_sflag &= ~PS_WCORE; exit1(l, 0, exitsig); /* NOTREACHED */ } /* * Since the "real" code may (or may not) be present in loadable module, * we provide routines here which calls the module hooks. */ int coredump_netbsd(struct lwp *l, struct coredump_iostate *iocookie) { int retval; MODULE_HOOK_CALL(coredump_netbsd_hook, (l, iocookie), ENOSYS, retval); return retval; } int coredump_netbsd32(struct lwp *l, struct coredump_iostate *iocookie) { int retval; MODULE_HOOK_CALL(coredump_netbsd32_hook, (l, iocookie), ENOSYS, retval); return retval; } int coredump_elf32(struct lwp *l, struct coredump_iostate *iocookie) { int retval; MODULE_HOOK_CALL(coredump_elf32_hook, (l, iocookie), ENOSYS, retval); return retval; } int coredump_elf64(struct lwp *l, struct coredump_iostate *iocookie) { int retval; MODULE_HOOK_CALL(coredump_elf64_hook, (l, iocookie), ENOSYS, retval); return retval; } /* * Put process 'p' into the stopped state and optionally, notify the parent. */ void proc_stop(struct proc *p, int signo) { struct lwp *l; KASSERT(mutex_owned(p->p_lock)); /* * First off, set the stopping indicator and bring all sleeping * LWPs to a halt so they are included in p->p_nrlwps. We mustn't * unlock between here and the p->p_nrlwps check below. */ p->p_sflag |= PS_STOPPING; membar_producer(); proc_stop_lwps(p); /* * If there are no LWPs available to take the signal, then we * signal the parent process immediately. Otherwise, the last * LWP to stop will take care of it. */ if (p->p_nrlwps == 0) { proc_stop_done(p, PS_NOCLDSTOP); } else { /* * Have the remaining LWPs come to a halt, and trigger * proc_stop_callout() to ensure that they do. */ LIST_FOREACH(l, &p->p_lwps, l_sibling) { sigpost(l, SIG_DFL, SA_STOP, signo); } callout_schedule(&proc_stop_ch, 1); } } /* * When stopping a process, we do not immediately set sleeping LWPs stopped, * but wait for them to come to a halt at the kernel-user boundary. This is * to allow LWPs to release any locks that they may hold before stopping. * * Non-interruptable sleeps can be long, and there is the potential for an * LWP to begin sleeping interruptably soon after the process has been set * stopping (PS_STOPPING). These LWPs will not notice that the process is * stopping, and so complete halt of the process and the return of status * information to the parent could be delayed indefinitely. * * To handle this race, proc_stop_callout() runs once per tick while there * are stopping processes in the system. It sets LWPs that are sleeping * interruptably into the LSSTOP state. * * Note that we are not concerned about keeping all LWPs stopped while the * process is stopped: stopped LWPs can awaken briefly to handle signals. * What we do need to ensure is that all LWPs in a stopping process have * stopped at least once, so that notification can be sent to the parent * process. */ static void proc_stop_callout(void *cookie) { bool more, restart; struct proc *p; (void)cookie; do { restart = false; more = false; mutex_enter(&proc_lock); PROCLIST_FOREACH(p, &allproc) { mutex_enter(p->p_lock); if ((p->p_sflag & PS_STOPPING) == 0) { mutex_exit(p->p_lock); continue; } /* Stop any LWPs sleeping interruptably. */ proc_stop_lwps(p); if (p->p_nrlwps == 0) { /* * We brought the process to a halt. * Mark it as stopped and notify the * parent. * * Note that proc_stop_done() will * drop p->p_lock briefly. * Arrange to restart and check * all processes again. 
*/ restart = true; proc_stop_done(p, PS_NOCLDSTOP); } else more = true; mutex_exit(p->p_lock); if (restart) break; } mutex_exit(&proc_lock); } while (restart); /* * If we noted processes that are stopping but still have * running LWPs, then arrange to check again in 1 tick. */ if (more) callout_schedule(&proc_stop_ch, 1); } /* * Given a process in state SSTOP, set the state back to SACTIVE and * move LSSTOP'd LWPs to LSSLEEP or make them runnable. */ void proc_unstop(struct proc *p) { struct lwp *l; int sig; KASSERT(mutex_owned(&proc_lock)); KASSERT(mutex_owned(p->p_lock)); p->p_stat = SACTIVE; p->p_sflag &= ~PS_STOPPING; sig = p->p_xsig; if (!p->p_waited) p->p_pptr->p_nstopchild--; LIST_FOREACH(l, &p->p_lwps, l_sibling) { lwp_lock(l); if (l->l_stat != LSSTOP || (l->l_flag & LW_DBGSUSPEND) != 0) { lwp_unlock(l); continue; } if (l->l_wchan == NULL) { setrunnable(l); continue; } if (sig && (l->l_flag & LW_SINTR) != 0) { setrunnable(l); sig = 0; } else { l->l_stat = LSSLEEP; p->p_nrlwps++; lwp_unlock(l); } } } void proc_stoptrace(int trapno, int sysnum, const register_t args[], const register_t *ret, int error) { struct lwp *l = curlwp; struct proc *p = l->l_proc; struct sigacts *ps; sigset_t *mask; sig_t action; ksiginfo_t ksi; size_t i, sy_narg; const int signo = SIGTRAP; KASSERT((trapno == TRAP_SCE) || (trapno == TRAP_SCX)); KASSERT(p->p_pptr != initproc); KASSERT(ISSET(p->p_slflag, PSL_TRACED)); KASSERT(ISSET(p->p_slflag, PSL_SYSCALL)); sy_narg = p->p_emul->e_sysent[sysnum].sy_narg; KSI_INIT_TRAP(&ksi); ksi.ksi_lid = l->l_lid; ksi.ksi_signo = signo; ksi.ksi_code = trapno; ksi.ksi_sysnum = sysnum; if (trapno == TRAP_SCE) { ksi.ksi_retval[0] = 0; ksi.ksi_retval[1] = 0; ksi.ksi_error = 0; } else { ksi.ksi_retval[0] = ret[0]; ksi.ksi_retval[1] = ret[1]; ksi.ksi_error = error; } memset(ksi.ksi_args, 0, sizeof(ksi.ksi_args)); for (i = 0; i < sy_narg; i++) ksi.ksi_args[i] = args[i]; mutex_enter(p->p_lock); repeat: /* * If we are exiting, demise now. * * This avoids notifying tracer and deadlocking. */ if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) { mutex_exit(p->p_lock); lwp_exit(l); panic("proc_stoptrace"); /* NOTREACHED */ } /* * If there's a pending SIGKILL process it immediately. */ if (p->p_xsig == SIGKILL || sigismember(&p->p_sigpend.sp_set, SIGKILL)) { mutex_exit(p->p_lock); return; } /* * If we are no longer traced, abandon this event signal. * * This avoids killing a process after detaching the debugger. */ if (__predict_false(!ISSET(p->p_slflag, PSL_TRACED))) { mutex_exit(p->p_lock); return; } /* * The process is already stopping. 
*/ if ((p->p_sflag & PS_STOPPING) != 0) { sigswitch_unlock_and_switch_away(l); mutex_enter(p->p_lock); goto repeat; } /* Needed for ktrace */ ps = p->p_sigacts; action = SIGACTION_PS(ps, signo).sa_handler; mask = &l->l_sigmask; p->p_xsig = signo; p->p_sigctx.ps_lwp = ksi.ksi_lid; p->p_sigctx.ps_info = ksi.ksi_info; sigswitch(0, signo, false); if (ktrpoint(KTR_PSIG)) { if (p->p_emul->e_ktrpsig) p->p_emul->e_ktrpsig(signo, action, mask, &ksi); else ktrpsig(signo, action, mask, &ksi); } } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_obj = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ mutex_enter(p->p_lock); klist_insert(&p->p_klist, kn); mutex_exit(p->p_lock); return 0; } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_obj; mutex_enter(p->p_lock); klist_remove(&p->p_klist, kn); mutex_exit(p->p_lock); } /* * Signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } const struct filterops sig_filtops = { .f_flags = FILTEROP_MPSAFE, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, };
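/*
 * Illustrative userland sketch (not part of the kernel source above): a
 * minimal consumer of the EVFILT_SIGNAL filter implemented by
 * sig_filtops/filt_signal().  It assumes only the standard kqueue(2),
 * kevent(2), sigprocmask(2) and kill(2) interfaces; the choice of SIGUSR1
 * is arbitrary.  Because filt_sigattach() forces EV_CLEAR, ev.data reports
 * the number of occurrences since the previous kevent() call.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <err.h>

int
main(void)
{
        struct kevent kev, ev;
        sigset_t ss;
        int kq;

        /* Block SIGUSR1 so it is observed through the kqueue only. */
        sigemptyset(&ss);
        sigaddset(&ss, SIGUSR1);
        if (sigprocmask(SIG_BLOCK, &ss, NULL) == -1)
                err(EXIT_FAILURE, "sigprocmask");

        if ((kq = kqueue()) == -1)
                err(EXIT_FAILURE, "kqueue");

        /* Register interest in SIGUSR1 deliveries. */
        EV_SET(&kev, SIGUSR1, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
                err(EXIT_FAILURE, "kevent: register");

        /* Post the signal to ourselves: it stays pending, but the knote fires. */
        if (kill(getpid(), SIGUSR1) == -1)
                err(EXIT_FAILURE, "kill");

        if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
                err(EXIT_FAILURE, "kevent: wait");
        printf("signal %ld delivered %ld time(s)\n",
            (long)ev.ident, (long)ev.data);
        return EXIT_SUCCESS;
}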
/* $NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_prof.c 8.4 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $"); #ifdef _KERNEL_OPT #include "opt_gprof.h" #include "opt_multiprocessor.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/cpu.h> #ifdef GPROF #include <sys/malloc.h> #include <sys/gmon.h> #include <sys/xcall.h> MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer"); static int sysctl_kern_profiling(SYSCTLFN_ARGS); #ifdef MULTIPROCESSOR void _gmonparam_merge(struct gmonparam *, struct gmonparam *); #endif /* * Froms is actually a bunch of unsigned shorts indexing tos */ struct gmonparam _gmonparam = { .state = GMON_PROF_OFF }; /* Actual start of the kernel text segment. */ extern char kernel_text[]; extern char etext[]; void kmstartup(void) { char *cp; struct gmonparam *p = &_gmonparam; unsigned long size; /* * Round lowpc and highpc to multiples of the density we're using * so the rest of the scaling (here and in gprof) stays in ints. */ p->lowpc = rounddown(((u_long)kernel_text), HISTFRACTION * sizeof(HISTCOUNTER)); p->highpc = roundup((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER)); p->textsize = p->highpc - p->lowpc; printf("Profiling kernel, textsize=%ld [%lx..%lx]\n", p->textsize, p->lowpc, p->highpc); p->kcountsize = p->textsize / HISTFRACTION; p->hashfraction = HASHFRACTION; p->fromssize = p->textsize / HASHFRACTION; p->tolimit = p->textsize * ARCDENSITY / 100; if (p->tolimit < MINARCS) p->tolimit = MINARCS; else if (p->tolimit > MAXARCS) p->tolimit = MAXARCS; p->tossize = p->tolimit * sizeof(struct tostruct); size = p->kcountsize + p->fromssize + p->tossize; #ifdef MULTIPROCESSOR CPU_INFO_ITERATOR cii; struct cpu_info *ci; for (CPU_INFO_FOREACH(cii, ci)) { p = malloc(sizeof(struct gmonparam) + size, M_GPROF, M_NOWAIT | M_ZERO); if (p == NULL) { printf("No memory for profiling on %s\n", cpu_name(ci)); /* cannot profile on this cpu */ continue; } memcpy(p, &_gmonparam, sizeof(_gmonparam)); ci->ci_gmon = p; /* * To allow profiling to be controlled only by the global * _gmonparam.state, set the default value for each CPU to * GMON_PROF_ON. If _gmonparam.state is not ON, mcount will * not be executed. * This is For compatibility of the kgmon(8) kmem interface. 
*/ p->state = GMON_PROF_ON; cp = (char *)(p + 1); p->tos = (struct tostruct *)cp; p->kcount = (u_short *)(cp + p->tossize); p->froms = (u_short *)(cp + p->tossize + p->kcountsize); } sysctl_createv(NULL, 0, NULL, NULL, 0, CTLTYPE_NODE, "percpu", SYSCTL_DESCR("per cpu profiling information"), NULL, 0, NULL, 0, CTL_KERN, KERN_PROF, GPROF_PERCPU, CTL_EOL); for (CPU_INFO_FOREACH(cii, ci)) { if (ci->ci_gmon == NULL) continue; sysctl_createv(NULL, 0, NULL, NULL, 0, CTLTYPE_NODE, cpu_name(ci), NULL, NULL, 0, NULL, 0, CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci), CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_READWRITE, CTLTYPE_INT, "state", SYSCTL_DESCR("Profiling state"), sysctl_kern_profiling, 0, (void *)ci, 0, CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci), GPROF_STATE, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_READWRITE, CTLTYPE_STRUCT, "count", SYSCTL_DESCR("Array of statistical program counters"), sysctl_kern_profiling, 0, (void *)ci, 0, CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci), GPROF_COUNT, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_READWRITE, CTLTYPE_STRUCT, "froms", SYSCTL_DESCR("Array indexed by program counter of " "call-from points"), sysctl_kern_profiling, 0, (void *)ci, 0, CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci), GPROF_FROMS, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_READWRITE, CTLTYPE_STRUCT, "tos", SYSCTL_DESCR("Array of structures describing " "destination of calls and their counts"), sysctl_kern_profiling, 0, (void *)ci, 0, CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci), GPROF_TOS, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_READWRITE, CTLTYPE_STRUCT, "gmonparam", SYSCTL_DESCR("Structure giving the sizes of the above " "arrays"), sysctl_kern_profiling, 0, (void *)ci, 0, CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci), GPROF_GMONPARAM, CTL_EOL); } /* * For minimal compatibility of the kgmon(8) kmem interface, * the _gmonparam and cpu0:ci_gmon share buffers. */ p = curcpu()->ci_gmon; if (p != NULL) { _gmonparam.tos = p->tos; _gmonparam.kcount = p->kcount; _gmonparam.froms = p->froms; } #else /* MULTIPROCESSOR */ cp = malloc(size, M_GPROF, M_NOWAIT | M_ZERO); if (cp == 0) { printf("No memory for profiling.\n"); return; } p->tos = (struct tostruct *)cp; cp += p->tossize; p->kcount = (u_short *)cp; cp += p->kcountsize; p->froms = (u_short *)cp; #endif /* MULTIPROCESSOR */ } #ifdef MULTIPROCESSOR static void prof_set_state_xc(void *arg1, void *arg2 __unused) { int state = PTRTOUINT64(arg1); struct gmonparam *gp = curcpu()->ci_gmon; if (gp != NULL) gp->state = state; } #endif /* MULTIPROCESSOR */ /* * Return kernel profiling information. */ /* * sysctl helper routine for kern.profiling subtree. enables/disables * kernel profiling and gives out copies of the profiling data. 
*/ static int sysctl_kern_profiling(SYSCTLFN_ARGS) { struct sysctlnode node = *rnode; struct gmonparam *gp; int error; #ifdef MULTIPROCESSOR CPU_INFO_ITERATOR cii; struct cpu_info *ci, *target_ci; uint64_t where; int state; bool prof_on, do_merge; target_ci = (struct cpu_info *)rnode->sysctl_data; do_merge = (oldp != NULL) && (target_ci == NULL) && ((node.sysctl_num == GPROF_COUNT) || (node.sysctl_num == GPROF_FROMS) || (node.sysctl_num == GPROF_TOS)); if (do_merge) { /* kern.profiling.{count,froms,tos} */ unsigned long size; char *cp; /* allocate temporary gmonparam, and merge results of all CPU */ size = _gmonparam.kcountsize + _gmonparam.fromssize + _gmonparam.tossize; gp = malloc(sizeof(struct gmonparam) + size, M_GPROF, M_NOWAIT | M_ZERO); if (gp == NULL) return ENOMEM; memcpy(gp, &_gmonparam, sizeof(_gmonparam)); cp = (char *)(gp + 1); gp->tos = (struct tostruct *)cp; gp->kcount = (u_short *)(cp + gp->tossize); gp->froms = (u_short *)(cp + gp->tossize + gp->kcountsize); for (CPU_INFO_FOREACH(cii, ci)) { if (ci->ci_gmon == NULL) continue; _gmonparam_merge(gp, ci->ci_gmon); } } else if (target_ci != NULL) { /* kern.profiling.percpu.* */ gp = target_ci->ci_gmon; } else { /* kern.profiling.{state,gmonparam} */ gp = &_gmonparam; } #else /* MULTIPROCESSOR */ gp = &_gmonparam; #endif switch (node.sysctl_num) { case GPROF_STATE: #ifdef MULTIPROCESSOR /* * if _gmonparam.state is OFF, the state of each CPU is * considered to be OFF, even if it is actually ON. */ if (_gmonparam.state == GMON_PROF_OFF || gp->state == GMON_PROF_OFF) state = GMON_PROF_OFF; else state = GMON_PROF_ON; node.sysctl_data = &state; #else node.sysctl_data = &gp->state; #endif break; case GPROF_COUNT: node.sysctl_data = gp->kcount; node.sysctl_size = gp->kcountsize; break; case GPROF_FROMS: node.sysctl_data = gp->froms; node.sysctl_size = gp->fromssize; break; case GPROF_TOS: node.sysctl_data = gp->tos; node.sysctl_size = gp->tossize; break; case GPROF_GMONPARAM: node.sysctl_data = gp; node.sysctl_size = sizeof(*gp); break; default: return (EOPNOTSUPP); } error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) goto done; #ifdef MULTIPROCESSOR switch (node.sysctl_num) { case GPROF_STATE: if (target_ci != NULL) { where = xc_unicast(0, prof_set_state_xc, UINT64TOPTR(state), NULL, target_ci); xc_wait(where); /* if even one CPU being profiled, enable perfclock. */ prof_on = false; for (CPU_INFO_FOREACH(cii, ci)) { if (ci->ci_gmon == NULL) continue; if (ci->ci_gmon->state != GMON_PROF_OFF) { prof_on = true; break; } } mutex_spin_enter(&proc0.p_stmutex); if (prof_on) startprofclock(&proc0); else stopprofclock(&proc0); mutex_spin_exit(&proc0.p_stmutex); if (prof_on) { _gmonparam.state = GMON_PROF_ON; } else { _gmonparam.state = GMON_PROF_OFF; /* * when _gmonparam.state and all CPU gmon state * are OFF, all CPU states should be ON so that * the entire CPUs profiling can be controlled * by _gmonparam.state only. 
*/ for (CPU_INFO_FOREACH(cii, ci)) { if (ci->ci_gmon == NULL) continue; ci->ci_gmon->state = GMON_PROF_ON; } } } else { _gmonparam.state = state; where = xc_broadcast(0, prof_set_state_xc, UINT64TOPTR(state), NULL); xc_wait(where); mutex_spin_enter(&proc0.p_stmutex); if (state == GMON_PROF_OFF) stopprofclock(&proc0); else startprofclock(&proc0); mutex_spin_exit(&proc0.p_stmutex); } break; case GPROF_COUNT: /* * if 'kern.profiling.{count,froms,tos}' is written, the same * data will be written to 'kern.profiling.percpu.cpuN.xxx' */ if (target_ci == NULL) { for (CPU_INFO_FOREACH(cii, ci)) { if (ci->ci_gmon == NULL) continue; memmove(ci->ci_gmon->kcount, gp->kcount, newlen); } } break; case GPROF_FROMS: if (target_ci == NULL) { for (CPU_INFO_FOREACH(cii, ci)) { if (ci->ci_gmon == NULL) continue; memmove(ci->ci_gmon->froms, gp->froms, newlen); } } break; case GPROF_TOS: if (target_ci == NULL) { for (CPU_INFO_FOREACH(cii, ci)) { if (ci->ci_gmon == NULL) continue; memmove(ci->ci_gmon->tos, gp->tos, newlen); } } break; } #else if (node.sysctl_num == GPROF_STATE) { mutex_spin_enter(&proc0.p_stmutex); if (gp->state == GMON_PROF_OFF) stopprofclock(&proc0); else startprofclock(&proc0); mutex_spin_exit(&proc0.p_stmutex); } #endif done: #ifdef MULTIPROCESSOR if (do_merge) free(gp, M_GPROF); #endif return error; } SYSCTL_SETUP(sysctl_kern_gprof_setup, "sysctl kern.profiling subtree setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "profiling", SYSCTL_DESCR("Profiling information (available)"), NULL, 0, NULL, 0, CTL_KERN, KERN_PROF, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "state", SYSCTL_DESCR("Profiling state"), sysctl_kern_profiling, 0, NULL, 0, CTL_KERN, KERN_PROF, GPROF_STATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRUCT, "count", SYSCTL_DESCR("Array of statistical program counters"), sysctl_kern_profiling, 0, NULL, 0, CTL_KERN, KERN_PROF, GPROF_COUNT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRUCT, "froms", SYSCTL_DESCR("Array indexed by program counter of " "call-from points"), sysctl_kern_profiling, 0, NULL, 0, CTL_KERN, KERN_PROF, GPROF_FROMS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRUCT, "tos", SYSCTL_DESCR("Array of structures describing " "destination of calls and their counts"), sysctl_kern_profiling, 0, NULL, 0, CTL_KERN, KERN_PROF, GPROF_TOS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "gmonparam", SYSCTL_DESCR("Structure giving the sizes of the above " "arrays"), sysctl_kern_profiling, 0, NULL, 0, CTL_KERN, KERN_PROF, GPROF_GMONPARAM, CTL_EOL); } #endif /* GPROF */ /* * Profiling system call. * * The scale factor is a fixed point number with 16 bits of fraction, so that * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling. */ /* ARGSUSED */ int sys_profil(struct lwp *l, const struct sys_profil_args *uap, register_t *retval) { /* { syscallarg(char *) samples; syscallarg(size_t) size; syscallarg(u_long) offset; syscallarg(u_int) scale; } */ struct proc *p = l->l_proc; struct uprof *upp; if (SCARG(uap, scale) > (1 << 16)) return (EINVAL); if (SCARG(uap, scale) == 0) { mutex_spin_enter(&p->p_stmutex); stopprofclock(p); mutex_spin_exit(&p->p_stmutex); return (0); } upp = &p->p_stats->p_prof; /* Block profile interrupts while changing state. 
*/ mutex_spin_enter(&p->p_stmutex); upp->pr_off = SCARG(uap, offset); upp->pr_scale = SCARG(uap, scale); upp->pr_base = SCARG(uap, samples); upp->pr_size = SCARG(uap, size); startprofclock(p); mutex_spin_exit(&p->p_stmutex); return (0); } /* * Scale is a fixed-point number with the binary point 16 bits * into the value, and is <= 1.0. pc is at most 32 bits, so the * intermediate result is at most 48 bits. */ #define PC_TO_INDEX(pc, prof) \ ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \ (u_quad_t)((prof)->pr_scale)) >> 16) & ~1) /* * Collect user-level profiling statistics; called on a profiling tick, * when a process is running in user-mode. This routine may be called * from an interrupt context. We schedule an AST that will vector us * to trap() with a context in which copyin and copyout will work. * Trap will then call addupc_task(). * * XXX We could use ufetch/ustore here if the profile buffers were * wired. * * Note that we may (rarely) not get around to the AST soon enough, and * lose profile ticks when the next tick overwrites this one, but in this * case the system is overloaded and the profile is probably already * inaccurate. */ void addupc_intr(struct lwp *l, u_long pc) { struct uprof *prof; struct proc *p; u_int i; p = l->l_proc; KASSERT(mutex_owned(&p->p_stmutex)); prof = &p->p_stats->p_prof; if (pc < prof->pr_off || (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) return; /* out of range; ignore */ mutex_spin_exit(&p->p_stmutex); /* XXXSMP */ prof->pr_addr = pc; prof->pr_ticks++; cpu_need_proftick(l); mutex_spin_enter(&p->p_stmutex); } /* * Much like before, but we can afford to take faults here. If the * update fails, we simply turn off profiling. */ void addupc_task(struct lwp *l, u_long pc, u_int ticks) { struct uprof *prof; struct proc *p; void *addr; int error; u_int i; u_short v; p = l->l_proc; if (ticks == 0) return; mutex_spin_enter(&p->p_stmutex); prof = &p->p_stats->p_prof; /* Testing P_PROFIL may be unnecessary, but is certainly safe. */ if ((p->p_stflag & PST_PROFIL) == 0 || pc < prof->pr_off || (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) { mutex_spin_exit(&p->p_stmutex); return; } addr = prof->pr_base + i; mutex_spin_exit(&p->p_stmutex); if ((error = copyin(addr, (void *)&v, sizeof(v))) == 0) { v += ticks; error = copyout((void *)&v, addr, sizeof(v)); } if (error != 0) { mutex_spin_enter(&p->p_stmutex); stopprofclock(p); mutex_spin_exit(&p->p_stmutex); } }
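/*
 * Illustrative userland sketch (not part of the kernel source above):
 * self-profiling through profil(2), the syscall implemented by
 * sys_profil()/addupc_intr()/addupc_task().  It assumes the standard
 * profil(2) prototype from <unistd.h>; the buffer size, loop count and
 * the use of main() as the base address are arbitrary example choices,
 * and the function-pointer cast is non-portable but conventional.
 * A scale of 0x10000 is fixed-point 1.0, so pc offsets map 1:1 into the
 * buffer of u_short tick counters; a scale of 0 turns profiling off.
 */
#include <sys/types.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <err.h>

static volatile unsigned long sink;

int
main(void)
{
        size_t size = 64 * 1024;                /* bytes of u_short tick buckets */
        u_short *samples = calloc(1, size);
        u_long off = (u_long)(uintptr_t)&main;
        unsigned long i;
        size_t n, used = 0;

        if (samples == NULL)
                err(EXIT_FAILURE, "calloc");

        /* Scale 0x10000 (1.0): each even pc offset gets its own bucket. */
        if (profil((char *)samples, size, off, 0x10000) == -1)
                err(EXIT_FAILURE, "profil");

        for (i = 0; i < 200000000UL; i++)       /* burn CPU so ticks accumulate */
                sink += i;

        profil(NULL, 0, 0, 0);                  /* scale 0 stops the profile clock */

        for (n = 0; n < size / sizeof(u_short); n++)
                if (samples[n] != 0)
                        used++;
        printf("%zu buckets collected ticks\n", used);
        free(samples);
        return EXIT_SUCCESS;
}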
/* $NetBSD: sys_sig.c,v 1.57 2023/10/04 20:42:38 ad Exp $ */ /*- * Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved.
* * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_sig.c 8.14 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_sig.c,v 1.57 2023/10/04 20:42:38 ad Exp $"); #include "opt_dtrace.h" #include <sys/param.h> #include <sys/kernel.h> #include <sys/signalvar.h> #include <sys/proc.h> #include <sys/pool.h> #include <sys/syscallargs.h> #include <sys/kauth.h> #include <sys/wait.h> #include <sys/kmem.h> #include <sys/module.h> #include <sys/sdt.h> #include <sys/compat_stub.h> SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE2(proc, kernel, , signal__clear, "int", /* signal */ "ksiginfo_t *"); /* signal-info */ int sys___sigaction_sigtramp(struct lwp *l, const struct sys___sigaction_sigtramp_args *uap, register_t *retval) { /* { syscallarg(int) signum; syscallarg(const struct sigaction *) nsa; syscallarg(struct sigaction *) osa; syscallarg(void *) tramp; syscallarg(int) vers; } */ struct sigaction nsa, osa; int error; if (SCARG(uap, nsa)) { error = copyin(SCARG(uap, nsa), &nsa, sizeof(nsa)); if (error) return (error); } error = sigaction1(l, SCARG(uap, signum), SCARG(uap, nsa) ? &nsa : 0, SCARG(uap, osa) ? &osa : 0, SCARG(uap, tramp), SCARG(uap, vers)); if (error) return (error); if (SCARG(uap, osa)) { error = copyout(&osa, SCARG(uap, osa), sizeof(osa)); if (error) return (error); } return 0; } /* * Manipulate signal mask. Note that we receive new mask, not pointer, and * return old mask as return value; the library stub does the rest. */ int sys___sigprocmask14(struct lwp *l, const struct sys___sigprocmask14_args *uap, register_t *retval) { /* { syscallarg(int) how; syscallarg(const sigset_t *) set; syscallarg(sigset_t *) oset; } */ struct proc *p = l->l_proc; sigset_t nss, oss; int error; if (SCARG(uap, set)) { error = copyin(SCARG(uap, set), &nss, sizeof(nss)); if (error) return error; } mutex_enter(p->p_lock); error = sigprocmask1(l, SCARG(uap, how), SCARG(uap, set) ? &nss : 0, SCARG(uap, oset) ? &oss : 0); mutex_exit(p->p_lock); if (error) return error; if (SCARG(uap, oset)) { error = copyout(&oss, SCARG(uap, oset), sizeof(oss)); if (error) return error; } return 0; } int sys___sigpending14(struct lwp *l, const struct sys___sigpending14_args *uap, register_t *retval) { /* { syscallarg(sigset_t *) set; } */ sigset_t ss; sigpending1(l, &ss); return copyout(&ss, SCARG(uap, set), sizeof(ss)); } /* * Suspend process until signal, providing mask to be set in the meantime. * Note nonstandard calling convention: libc stub passes mask, not pointer, * to save a copyin. */ int sys___sigsuspend14(struct lwp *l, const struct sys___sigsuspend14_args *uap, register_t *retval) { /* { syscallarg(const sigset_t *) set; } */ sigset_t ss; int error; if (SCARG(uap, set)) { error = copyin(SCARG(uap, set), &ss, sizeof(ss)); if (error) return error; } return sigsuspend1(l, SCARG(uap, set) ? &ss : 0); } int sys___sigaltstack14(struct lwp *l, const struct sys___sigaltstack14_args *uap, register_t *retval) { /* { syscallarg(const struct sigaltstack *) nss; syscallarg(struct sigaltstack *) oss; } */ stack_t nss, oss; int error; if (SCARG(uap, nss)) { error = copyin(SCARG(uap, nss), &nss, sizeof(nss)); if (error) return error; } error = sigaltstack1(l, SCARG(uap, nss) ? &nss : 0, SCARG(uap, oss) ? 
&oss : 0); if (error) return error; if (SCARG(uap, oss)) { error = copyout(&oss, SCARG(uap, oss), sizeof(oss)); if (error) return error; } return 0; } int kill1(struct lwp *l, pid_t pid, ksiginfo_t *ksi, register_t *retval) { int error; struct proc *p; if ((u_int)ksi->ksi_signo >= NSIG) return EINVAL; if (pid != l->l_proc->p_pid) { if (ksi->ksi_pid != l->l_proc->p_pid) return EPERM; if (ksi->ksi_uid != kauth_cred_geteuid(l->l_cred)) return EPERM; switch (ksi->ksi_code) { case SI_USER: case SI_QUEUE: break; default: return EPERM; } } if (pid > 0) { /* kill single process */ mutex_enter(&proc_lock); p = proc_find_raw(pid); if (p == NULL || (p->p_stat != SACTIVE && p->p_stat != SSTOP)) { mutex_exit(&proc_lock); /* IEEE Std 1003.1-2001: return success for zombies */ return p ? 0 : ESRCH; } mutex_enter(p->p_lock); error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_SIGNAL, p, KAUTH_ARG(ksi->ksi_signo), NULL, NULL); if (!error && ksi->ksi_signo) { error = kpsignal2(p, ksi); } mutex_exit(p->p_lock); mutex_exit(&proc_lock); return error; } switch (pid) { case -1: /* broadcast signal */ return killpg1(l, ksi, 0, 1); case 0: /* signal own process group */ return killpg1(l, ksi, 0, 0); default: /* negative explicit process group */ return killpg1(l, ksi, -pid, 0); } /* NOTREACHED */ } int sys_sigqueueinfo(struct lwp *l, const struct sys_sigqueueinfo_args *uap, register_t *retval) { /* { syscallarg(pid_t int) pid; syscallarg(const siginfo_t *) info; } */ ksiginfo_t ksi; int error; KSI_INIT(&ksi); if ((error = copyin(&SCARG(uap, info)->_info, &ksi.ksi_info, sizeof(ksi.ksi_info))) != 0) return error; return kill1(l, SCARG(uap, pid), &ksi, retval); } int sys_kill(struct lwp *l, const struct sys_kill_args *uap, register_t *retval) { /* { syscallarg(pid_t) pid; syscallarg(int) signum; } */ ksiginfo_t ksi; KSI_INIT(&ksi); ksi.ksi_signo = SCARG(uap, signum); ksi.ksi_code = SI_USER; ksi.ksi_pid = l->l_proc->p_pid; ksi.ksi_uid = kauth_cred_geteuid(l->l_cred); return kill1(l, SCARG(uap, pid), &ksi, retval); } int sys_getcontext(struct lwp *l, const struct sys_getcontext_args *uap, register_t *retval) { /* { syscallarg(struct __ucontext *) ucp; } */ struct proc *p = l->l_proc; ucontext_t uc; memset(&uc, 0, sizeof(uc)); mutex_enter(p->p_lock); getucontext(l, &uc); mutex_exit(p->p_lock); return copyout(&uc, SCARG(uap, ucp), sizeof (*SCARG(uap, ucp))); } int sys_setcontext(struct lwp *l, const struct sys_setcontext_args *uap, register_t *retval) { /* { syscallarg(const ucontext_t *) ucp; } */ struct proc *p = l->l_proc; ucontext_t uc; int error; error = copyin(SCARG(uap, ucp), &uc, sizeof (uc)); if (error) return error; if ((uc.uc_flags & _UC_CPU) == 0) return EINVAL; mutex_enter(p->p_lock); error = setucontext(l, &uc); mutex_exit(p->p_lock); if (error) return error; return EJUSTRETURN; } /* * sigtimedwait(2) system call, used also for implementation * of sigwaitinfo() and sigwait(). * * This only handles single LWP in signal wait. libpthread provides * its own sigtimedwait() wrapper to DTRT WRT individual threads. 
*/ int sys_____sigtimedwait50(struct lwp *l, const struct sys_____sigtimedwait50_args *uap, register_t *retval) { return sigtimedwait1(l, uap, retval, copyin, copyout, copyin, copyout); } int sigaction1(struct lwp *l, int signum, const struct sigaction *nsa, struct sigaction *osa, const void *tramp, int vers) { struct proc *p; struct sigacts *ps; sigset_t tset; int prop, error; ksiginfoq_t kq; static bool v0v1valid; if (signum <= 0 || signum >= NSIG) return EINVAL; p = l->l_proc; error = 0; ksiginfo_queue_init(&kq); /* * Trampoline ABI version __SIGTRAMP_SIGCODE_VERSION (0) is reserved * for the legacy kernel provided on-stack trampoline. Conversely, * if we are using a non-0 ABI version, we must have a trampoline. * Only validate the vers if a new sigaction was supplied and there * was an actual handler specified (not SIG_IGN or SIG_DFL), which * don't require a trampoline. Emulations use legacy kernel * trampolines with version 0, alternatively check for that too. * * If version < __SIGTRAMP_SIGINFO_VERSION_MIN (usually 2), we try * to autoload the compat module. Note that we interlock with the * unload check in compat_modcmd() using kernconfig_lock. If the * autoload fails, we don't try it again for this process. */ if (nsa != NULL && nsa->sa_handler != SIG_IGN && nsa->sa_handler != SIG_DFL) { if (__predict_false(vers < __SIGTRAMP_SIGINFO_VERSION_MIN)) { if (vers == __SIGTRAMP_SIGCODE_VERSION && p->p_sigctx.ps_sigcode != NULL) { /* * if sigcode is used for this emulation, * version 0 is allowed. */ } #ifdef __HAVE_STRUCT_SIGCONTEXT else if (p->p_flag & PK_32) { /* * The 32-bit compat module will have * pre-validated this for us. */ v0v1valid = true; } else if ((p->p_lflag & PL_SIGCOMPAT) == 0) { kernconfig_lock(); (void)module_autoload("compat_16", MODULE_CLASS_ANY); if (sendsig_sigcontext_16_hook.hooked) { /* * We need to remember if the * sigcontext method may be useable, * because libc may use it even * if siginfo is available. */ v0v1valid = true; } mutex_enter(&proc_lock); /* * Prevent unload of compat module while * this process remains. */ p->p_lflag |= PL_SIGCOMPAT; mutex_exit(&proc_lock); kernconfig_unlock(); } #endif /* __HAVE_STRUCT_SIGCONTEXT */ } switch (vers) { case __SIGTRAMP_SIGCODE_VERSION: /* kernel supplied trampoline. */ if (tramp != NULL || (p->p_sigctx.ps_sigcode == NULL && !v0v1valid)) { return EINVAL; } break; #ifdef __HAVE_STRUCT_SIGCONTEXT case __SIGTRAMP_SIGCONTEXT_VERSION_MIN ... __SIGTRAMP_SIGCONTEXT_VERSION_MAX: /* sigcontext, user supplied trampoline. */ if (tramp == NULL || !v0v1valid) { return EINVAL; } break; #endif /* __HAVE_STRUCT_SIGCONTEXT */ case __SIGTRAMP_SIGINFO_VERSION_MIN ... __SIGTRAMP_SIGINFO_VERSION_MAX: /* siginfo, user supplied trampoline. */ if (tramp == NULL) { return EINVAL; } break; default: /* Invalid trampoline version. 
*/ return EINVAL; } } mutex_enter(p->p_lock); ps = p->p_sigacts; if (osa) sigaction_copy(osa, &SIGACTION_PS(ps, signum)); if (!nsa) goto out; prop = sigprop[signum]; if ((nsa->sa_flags & ~SA_ALLBITS) || (prop & SA_CANTMASK)) { error = EINVAL; goto out; } sigaction_copy(&SIGACTION_PS(ps, signum), nsa); ps->sa_sigdesc[signum].sd_tramp = tramp; ps->sa_sigdesc[signum].sd_vers = vers; sigminusset(&sigcantmask, &SIGACTION_PS(ps, signum).sa_mask); if ((prop & SA_NORESET) != 0) SIGACTION_PS(ps, signum).sa_flags &= ~SA_RESETHAND; if (signum == SIGCHLD) { if (nsa->sa_flags & SA_NOCLDSTOP) p->p_sflag |= PS_NOCLDSTOP; else p->p_sflag &= ~PS_NOCLDSTOP; if (nsa->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented by * reparenting the dying child to PID 1 (and trust * it to reap the zombie), PID 1 itself is forbidden * to set SA_NOCLDWAIT. */ if (p->p_pid == 1) p->p_flag &= ~PK_NOCLDWAIT; else p->p_flag |= PK_NOCLDWAIT; } else p->p_flag &= ~PK_NOCLDWAIT; if (nsa->sa_handler == SIG_IGN) { /* * Paranoia: same as above. */ if (p->p_pid == 1) p->p_flag &= ~PK_CLDSIGIGN; else p->p_flag |= PK_CLDSIGIGN; } else p->p_flag &= ~PK_CLDSIGIGN; } if ((nsa->sa_flags & SA_NODEFER) == 0) sigaddset(&SIGACTION_PS(ps, signum).sa_mask, signum); else sigdelset(&SIGACTION_PS(ps, signum).sa_mask, signum); /* * Set bit in p_sigctx.ps_sigignore for signals that are set to * SIG_IGN, and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in p_sigctx.ps_sigignore, as * we have to restart the process. */ if (nsa->sa_handler == SIG_IGN || (nsa->sa_handler == SIG_DFL && (prop & SA_IGNORE) != 0)) { /* Never to be seen again. */ sigemptyset(&tset); sigaddset(&tset, signum); sigclearall(p, &tset, &kq); if (signum != SIGCONT) { /* Easier in psignal */ sigaddset(&p->p_sigctx.ps_sigignore, signum); } sigdelset(&p->p_sigctx.ps_sigcatch, signum); } else { sigdelset(&p->p_sigctx.ps_sigignore, signum); if (nsa->sa_handler == SIG_DFL) sigdelset(&p->p_sigctx.ps_sigcatch, signum); else sigaddset(&p->p_sigctx.ps_sigcatch, signum); } /* * Previously held signals may now have become visible. Ensure that * we check for them before returning to userspace. */ if (sigispending(l, 0)) { lwp_lock(l); l->l_flag |= LW_PENDSIG; lwp_need_userret(l); lwp_unlock(l); } out: mutex_exit(p->p_lock); ksiginfo_queue_drain(&kq); return error; } int sigprocmask1(struct lwp *l, int how, const sigset_t *nss, sigset_t *oss) { sigset_t *mask = &l->l_sigmask; bool more; KASSERT(mutex_owned(l->l_proc->p_lock)); if (oss) { *oss = *mask; } if (nss == NULL) { return 0; } switch (how) { case SIG_BLOCK: sigplusset(nss, mask); more = false; break; case SIG_UNBLOCK: sigminusset(nss, mask); more = true; break; case SIG_SETMASK: *mask = *nss; more = true; break; default: return EINVAL; } sigminusset(&sigcantmask, mask); if (more && sigispending(l, 0)) { /* * Check for pending signals on return to user. */ lwp_lock(l); l->l_flag |= LW_PENDSIG; lwp_need_userret(l); lwp_unlock(l); } return 0; } void sigpending1(struct lwp *l, sigset_t *ss) { struct proc *p = l->l_proc; mutex_enter(p->p_lock); *ss = l->l_sigpend.sp_set; sigplusset(&p->p_sigpend.sp_set, ss); mutex_exit(p->p_lock); } void sigsuspendsetup(struct lwp *l, const sigset_t *ss) { struct proc *p = l->l_proc; /* * When returning from sigsuspend/pselect/pollts, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigctx structure * to indicate this. 
*/ mutex_enter(p->p_lock); l->l_sigrestore = 1; l->l_sigoldmask = l->l_sigmask; l->l_sigmask = *ss; sigminusset(&sigcantmask, &l->l_sigmask); /* Check for pending signals when sleeping. */ if (sigispending(l, 0)) { lwp_lock(l); l->l_flag |= LW_PENDSIG; lwp_need_userret(l); lwp_unlock(l); } mutex_exit(p->p_lock); } void sigsuspendteardown(struct lwp *l) { struct proc *p = l->l_proc; mutex_enter(p->p_lock); /* Check for pending signals when sleeping. */ if (l->l_sigrestore) { if (sigispending(l, 0)) { lwp_lock(l); l->l_flag |= LW_PENDSIG; lwp_need_userret(l); lwp_unlock(l); } else { l->l_sigrestore = 0; l->l_sigmask = l->l_sigoldmask; } } mutex_exit(p->p_lock); } int sigsuspend1(struct lwp *l, const sigset_t *ss) { if (ss) sigsuspendsetup(l, ss); while (kpause("pause", true, 0, NULL) == 0) ; /* always return EINTR rather than ERESTART... */ return EINTR; } int sigaltstack1(struct lwp *l, const stack_t *nss, stack_t *oss) { struct proc *p = l->l_proc; int error = 0; mutex_enter(p->p_lock); if (oss) *oss = l->l_sigstk; if (nss) { if (nss->ss_flags & ~SS_ALLBITS) error = EINVAL; else if (nss->ss_flags & SS_DISABLE) { if (l->l_sigstk.ss_flags & SS_ONSTACK) error = EINVAL; } else if (nss->ss_size < MINSIGSTKSZ) error = ENOMEM; if (!error) l->l_sigstk = *nss; } mutex_exit(p->p_lock); return error; } int sigtimedwait1(struct lwp *l, const struct sys_____sigtimedwait50_args *uap, register_t *retval, copyin_t fetchss, copyout_t storeinf, copyin_t fetchts, copyout_t storets) { /* { syscallarg(const sigset_t *) set; syscallarg(siginfo_t *) info; syscallarg(struct timespec *) timeout; } */ struct proc *p = l->l_proc; int error, signum, timo; struct timespec ts, tsstart, tsnow; ksiginfo_t ksi; /* * Calculate timeout, if it was specified. * * NULL pointer means an infinite timeout. * {.tv_sec = 0, .tv_nsec = 0} means do not block. */ if (SCARG(uap, timeout)) { error = (*fetchts)(SCARG(uap, timeout), &ts, sizeof(ts)); if (error) return error; if ((error = itimespecfix(&ts)) != 0) return error; timo = tstohz(&ts); if (timo == 0) { if (ts.tv_sec == 0 && ts.tv_nsec == 0) timo = -1; /* do not block */ else timo = 1; /* the shortest possible timeout */ } /* * Remember current uptime, it would be used in * ECANCELED/ERESTART case. */ getnanouptime(&tsstart); } else { memset(&tsstart, 0, sizeof(tsstart)); /* XXXgcc */ timo = 0; /* infinite timeout */ } error = (*fetchss)(SCARG(uap, set), &l->l_sigwaitset, sizeof(l->l_sigwaitset)); if (error) return error; /* * Silently ignore SA_CANTMASK signals. psignal1() would ignore * SA_CANTMASK signals in waitset, we do this only for the below * siglist check. */ sigminusset(&sigcantmask, &l->l_sigwaitset); memset(&ksi.ksi_info, 0, sizeof(ksi.ksi_info)); mutex_enter(p->p_lock); /* Check for pending signals in the process, if no - then in LWP. */ if ((signum = sigget(&p->p_sigpend, &ksi, 0, &l->l_sigwaitset)) == 0) signum = sigget(&l->l_sigpend, &ksi, 0, &l->l_sigwaitset); if (signum != 0) { /* If found a pending signal, just copy it out to the user. */ mutex_exit(p->p_lock); goto out; } if (timo < 0) { /* If not allowed to block, return an error */ mutex_exit(p->p_lock); return EAGAIN; } /* * Set up the sigwait list and wait for signal to arrive. * We can either be woken up or time out. */ l->l_sigwaited = &ksi; LIST_INSERT_HEAD(&p->p_sigwaiters, l, l_sigwaiter); error = cv_timedwait_sig(&l->l_sigcv, p->p_lock, timo); /* * Need to find out if we woke as a result of _lwp_wakeup() or a * signal outside our wait set. 
*/ if (l->l_sigwaited != NULL) { if (error == EINTR) { /* Wakeup via _lwp_wakeup(). */ error = ECANCELED; } else if (!error) { /* Spurious wakeup - arrange for syscall restart. */ error = ERESTART; } l->l_sigwaited = NULL; LIST_REMOVE(l, l_sigwaiter); } mutex_exit(p->p_lock); /* * If the sleep was interrupted (either by signal or wakeup), update * the timeout and copyout new value back. It would be used when * the syscall would be restarted or called again. */ if (timo && (error == ERESTART || error == ECANCELED)) { getnanouptime(&tsnow); /* Compute how much time has passed since start. */ timespecsub(&tsnow, &tsstart, &tsnow); /* Subtract passed time from timeout. */ timespecsub(&ts, &tsnow, &ts); if (ts.tv_sec < 0) error = EAGAIN; else { /* Copy updated timeout to userland. */ error = (*storets)(&ts, SCARG(uap, timeout), sizeof(ts)); } } out: /* * If a signal from the wait set arrived, copy it to userland. * Copy only the used part of siginfo, the padding part is * left unchanged (userland is not supposed to touch it anyway). */ if (error == 0 && SCARG(uap, info)) { error = (*storeinf)(&ksi.ksi_info, SCARG(uap, info), sizeof(ksi.ksi_info)); } if (error == 0) { *retval = ksi.ksi_info._signo; SDT_PROBE(proc, kernel, , signal__clear, *retval, &ksi, 0, 0, 0); } return error; }
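/*
 * Illustrative userland sketch (not part of the kernel source above):
 * the timeout semantics handled by sigtimedwait1().  It uses only the
 * standard sigprocmask(2), kill(2) and sigtimedwait(2) interfaces; the
 * choice of SIGUSR1 and the two-second timeout are arbitrary.  The signal
 * is blocked first so it stays queued rather than being delivered; a zero
 * timespec means "do not block" (EAGAIN if nothing is pending), while a
 * NULL timeout would wait indefinitely.
 */
#include <sys/types.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <err.h>

int
main(void)
{
        struct timespec zero = { 0, 0 };
        struct timespec two = { 2, 0 };
        siginfo_t info;
        sigset_t set;
        int sig;

        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);
        if (sigprocmask(SIG_BLOCK, &set, NULL) == -1)
                err(EXIT_FAILURE, "sigprocmask");

        /* Nothing pending and a zero timeout: the call polls and fails with EAGAIN. */
        if (sigtimedwait(&set, &info, &zero) == -1 && errno == EAGAIN)
                printf("no signal pending, as expected\n");

        /* Queue SIGUSR1 to ourselves; being blocked, it stays pending. */
        if (kill(getpid(), SIGUSR1) == -1)
                err(EXIT_FAILURE, "kill");

        /* The pending signal is returned immediately, well before the timeout. */
        if ((sig = sigtimedwait(&set, &info, &two)) == -1)
                err(EXIT_FAILURE, "sigtimedwait");
        printf("got signal %d from pid %ld\n", sig, (long)info.si_pid);
        return EXIT_SUCCESS;
}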
/* $NetBSD: kern_exec.c,v 1.521 2023/10/08 12:38:58 ad Exp $ */ /*- * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou * Copyright (C) 1992 Wolfgang Solfrank. * Copyright (C) 1992 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.521 2023/10/08 12:38:58 ad Exp $"); #include "opt_exec.h" #include "opt_execfmt.h" #include "opt_ktrace.h" #include "opt_modular.h" #include "opt_syscall_debug.h" #include "veriexec.h" #include "opt_pax.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/ptrace.h> #include <sys/mount.h> #include <sys/kmem.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/acct.h> #include <sys/atomic.h> #include <sys/exec.h> #include <sys/futex.h> #include <sys/ktrace.h> #include <sys/uidinfo.h> #include <sys/wait.h> #include <sys/mman.h> #include <sys/ras.h> #include <sys/signalvar.h> #include <sys/stat.h> #include <sys/syscall.h> #include <sys/kauth.h> #include <sys/lwpctl.h> #include <sys/pax.h> #include <sys/cpu.h> #include <sys/module.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <sys/vfs_syscalls.h> #if NVERIEXEC > 0 #include <sys/verified_exec.h> #endif /* NVERIEXEC > 0 */ #include <sys/sdt.h> #include <sys/spawn.h> #include <sys/prot.h> #include <sys/cprng.h> #include <uvm/uvm_extern.h> #include <machine/reg.h> #include <compat/common/compat_util.h> #ifndef MD_TOPDOWN_INIT #ifdef __USE_TOPDOWN_VM #define MD_TOPDOWN_INIT(epp) (epp)->ep_flags |= EXEC_TOPDOWN_VM #else #define MD_TOPDOWN_INIT(epp) #endif #endif struct execve_data; extern int user_va0_disable; static size_t calcargs(struct execve_data * restrict, const size_t); static size_t calcstack(struct execve_data * restrict, const size_t); static int copyoutargs(struct execve_data * restrict, struct lwp *, char * const); static int copyoutpsstrs(struct execve_data * restrict, struct proc *); static int copyinargs(struct execve_data * restrict, char * const *, char * const *, execve_fetch_element_t, char **); static int copyinargstrs(struct execve_data * restrict, char * const *, execve_fetch_element_t, char **, size_t *, void (*)(const void *, size_t)); static int exec_sigcode_map(struct proc *, const struct emul *); #if defined(DEBUG) && !defined(DEBUG_EXEC) #define DEBUG_EXEC #endif #ifdef DEBUG_EXEC #define DPRINTF(a) printf a #define COPYPRINTF(s, a, b) printf("%s, %d: copyout%s @%p %zu\n", __func__, \ __LINE__, (s), (a), (b)) static void dump_vmcmds(const struct exec_package * const, size_t, int); #define DUMPVMCMDS(p, x, e) do { dump_vmcmds((p), (x), (e)); } while (0) #else #define DPRINTF(a) #define COPYPRINTF(s, a, b) #define DUMPVMCMDS(p, x, e) do {} while (0) #endif /* DEBUG_EXEC */ /* * DTrace SDT provider definitions */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, kernel, , exec, "char *"); SDT_PROBE_DEFINE1(proc, kernel, , exec__success, "char *"); SDT_PROBE_DEFINE1(proc, kernel, , exec__failure, "int"); /* * Exec function switch: * * Note that each makecmds function is responsible for loading the * exec package with the necessary functions for any exec-type-specific * handling. * * Functions for specific exec types should be defined in their own * header file. 
*/ static const struct execsw **execsw = NULL; static int nexecs; u_int exec_maxhdrsz; /* must not be static - used by netbsd32 */ /* list of dynamically loaded execsw entries */ static LIST_HEAD(execlist_head, exec_entry) ex_head = LIST_HEAD_INITIALIZER(ex_head); struct exec_entry { LIST_ENTRY(exec_entry) ex_list; SLIST_ENTRY(exec_entry) ex_slist; const struct execsw *ex_sw; }; #ifndef __HAVE_SYSCALL_INTERN void syscall(void); #endif /* NetBSD autoloadable syscalls */ #ifdef MODULAR #include <kern/syscalls_autoload.c> #endif /* NetBSD emul struct */ struct emul emul_netbsd = { .e_name = "netbsd", #ifdef EMUL_NATIVEROOT .e_path = EMUL_NATIVEROOT, #else .e_path = NULL, #endif #ifndef __HAVE_MINIMAL_EMUL .e_flags = EMUL_HAS_SYS___syscall, .e_errno = NULL, .e_nosys = SYS_syscall, .e_nsysent = SYS_NSYSENT, #endif #ifdef MODULAR .e_sc_autoload = netbsd_syscalls_autoload, #endif .e_sysent = sysent, .e_nomodbits = sysent_nomodbits, #ifdef SYSCALL_DEBUG .e_syscallnames = syscallnames, #else .e_syscallnames = NULL, #endif .e_sendsig = sendsig, .e_trapsignal = trapsignal, .e_sigcode = NULL, .e_esigcode = NULL, .e_sigobject = NULL, .e_setregs = setregs, .e_proc_exec = NULL, .e_proc_fork = NULL, .e_proc_exit = NULL, .e_lwp_fork = NULL, .e_lwp_exit = NULL, #ifdef __HAVE_SYSCALL_INTERN .e_syscall_intern = syscall_intern, #else .e_syscall = syscall, #endif .e_sysctlovly = NULL, .e_vm_default_addr = uvm_default_mapaddr, .e_usertrap = NULL, .e_ucsize = sizeof(ucontext_t), .e_startlwp = startlwp }; /* * Exec lock. Used to control access to execsw[] structures. * This must not be static so that netbsd32 can access it, too. */ krwlock_t exec_lock __cacheline_aligned; /* * Data used between a loadvm and execve part of an "exec" operation */ struct execve_data { struct exec_package ed_pack; struct pathbuf *ed_pathbuf; struct vattr ed_attr; struct ps_strings ed_arginfo; char *ed_argp; const char *ed_pathstring; char *ed_resolvedname; size_t ed_ps_strings_sz; int ed_szsigcode; size_t ed_argslen; long ed_argc; long ed_envc; }; /* * data passed from parent lwp to child during a posix_spawn() */ struct spawn_exec_data { struct execve_data sed_exec; struct posix_spawn_file_actions *sed_actions; struct posix_spawnattr *sed_attrs; struct proc *sed_parent; kcondvar_t sed_cv_child_ready; kmutex_t sed_mtx_child; int sed_error; volatile uint32_t sed_refcnt; }; static struct vm_map *exec_map; static struct pool exec_pool; static void * exec_pool_alloc(struct pool *pp, int flags) { return (void *)uvm_km_alloc(exec_map, NCARGS, 0, UVM_KMF_PAGEABLE | UVM_KMF_WAITVA); } static void exec_pool_free(struct pool *pp, void *addr) { uvm_km_free(exec_map, (vaddr_t)addr, NCARGS, UVM_KMF_PAGEABLE); } static struct pool_allocator exec_palloc = { .pa_alloc = exec_pool_alloc, .pa_free = exec_pool_free, .pa_pagesz = NCARGS }; static void exec_path_free(struct execve_data *data) { pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring); pathbuf_destroy(data->ed_pathbuf); if (data->ed_resolvedname) PNBUF_PUT(data->ed_resolvedname); } static int exec_resolvename(struct lwp *l, struct exec_package *epp, struct vnode *vp, char **rpath) { int error; char *p; KASSERT(rpath != NULL); *rpath = PNBUF_GET(); error = vnode_to_path(*rpath, MAXPATHLEN, vp, l, l->l_proc); if (error) { DPRINTF(("%s: can't resolve name for %s, error %d\n", __func__, epp->ep_kname, error)); PNBUF_PUT(*rpath); *rpath = NULL; return error; } epp->ep_resolvedname = *rpath; if ((p = strrchr(*rpath, '/')) != NULL) epp->ep_kname = p + 1; return 0; } /* * check exec: * given 
an "executable" described in the exec package's namei info, * see what we can do with it. * * ON ENTRY: * exec package with appropriate namei info * lwp pointer of exec'ing lwp * NO SELF-LOCKED VNODES * * ON EXIT: * error: nothing held, etc. exec header still allocated. * ok: filled exec package, executable's vnode (unlocked). * * EXEC SWITCH ENTRY: * Locked vnode to check, exec package, proc. * * EXEC SWITCH EXIT: * ok: return 0, filled exec package, executable's vnode (unlocked). * error: destructive: * everything deallocated execept exec header. * non-destructive: * error code, executable's vnode (unlocked), * exec header unmodified. */ int /*ARGSUSED*/ check_exec(struct lwp *l, struct exec_package *epp, struct pathbuf *pb, char **rpath) { int error, i; struct vnode *vp; size_t resid; if (epp->ep_resolvedname) { struct nameidata nd; // grab the absolute pathbuf here before namei() trashes it. pathbuf_copystring(pb, epp->ep_resolvedname, PATH_MAX); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); /* first get the vnode */ if ((error = namei(&nd)) != 0) return error; epp->ep_vp = vp = nd.ni_vp; #ifdef DIAGNOSTIC /* paranoia (take this out once namei stuff stabilizes) */ memset(nd.ni_pnbuf, '~', PATH_MAX); #endif } else { struct file *fp; if ((error = fd_getvnode(epp->ep_xfd, &fp)) != 0) return error; epp->ep_vp = vp = fp->f_vnode; vref(vp); fd_putfile(epp->ep_xfd); if ((error = exec_resolvename(l, epp, vp, rpath)) != 0) return error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } /* check access and type */ if (vp->v_type != VREG) { error = EACCES; goto bad1; } if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0) goto bad1; /* get attributes */ /* XXX VOP_GETATTR is the only thing that needs LK_EXCLUSIVE here */ if ((error = VOP_GETATTR(vp, epp->ep_vap, l->l_cred)) != 0) goto bad1; /* Check mount point */ if (vp->v_mount->mnt_flag & MNT_NOEXEC) { error = EACCES; goto bad1; } if (vp->v_mount->mnt_flag & MNT_NOSUID) epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID); /* try to open it */ if ((error = VOP_OPEN(vp, FREAD, l->l_cred)) != 0) goto bad1; /* now we have the file, get the exec header */ error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0, UIO_SYSSPACE, IO_NODELOCKED, l->l_cred, &resid, NULL); if (error) goto bad1; /* unlock vp, since we need it unlocked from here on out. */ VOP_UNLOCK(vp); #if NVERIEXEC > 0 error = veriexec_verify(l, vp, epp->ep_resolvedname ? epp->ep_resolvedname : epp->ep_kname, epp->ep_flags & EXEC_INDIR ? VERIEXEC_INDIRECT : VERIEXEC_DIRECT, NULL); if (error) goto bad2; #endif /* NVERIEXEC > 0 */ #ifdef PAX_SEGVGUARD error = pax_segvguard(l, vp, epp->ep_resolvedname, false); if (error) goto bad2; #endif /* PAX_SEGVGUARD */ epp->ep_hdrvalid = epp->ep_hdrlen - resid; /* * Set up default address space limits. Can be overridden * by individual exec packages. 
*/ epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS); epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS; /* * set up the vmcmds for creation of the process * address space */ error = ENOEXEC; for (i = 0; i < nexecs; i++) { int newerror; epp->ep_esch = execsw[i]; newerror = (*execsw[i]->es_makecmds)(l, epp); if (!newerror) { /* Seems ok: check that entry point is not too high */ if (epp->ep_entry >= epp->ep_vm_maxaddr) { #ifdef DIAGNOSTIC printf("%s: rejecting %p due to " "too high entry address (>= %p)\n", __func__, (void *)epp->ep_entry, (void *)epp->ep_vm_maxaddr); #endif error = ENOEXEC; break; } /* Seems ok: check that entry point is not too low */ if (epp->ep_entry < epp->ep_vm_minaddr) { #ifdef DIAGNOSTIC printf("%s: rejecting %p due to " "too low entry address (< %p)\n", __func__, (void *)epp->ep_entry, (void *)epp->ep_vm_minaddr); #endif error = ENOEXEC; break; } /* check limits */ #ifdef DIAGNOSTIC #define LMSG "%s: rejecting due to %s limit (%ju > %ju)\n" #endif #ifdef MAXTSIZ if (epp->ep_tsize > MAXTSIZ) { #ifdef DIAGNOSTIC printf(LMSG, __func__, "text", (uintmax_t)epp->ep_tsize, (uintmax_t)MAXTSIZ); #endif error = ENOMEM; break; } #endif vsize_t dlimit = (vsize_t)l->l_proc->p_rlimit[RLIMIT_DATA].rlim_cur; if (epp->ep_dsize > dlimit) { #ifdef DIAGNOSTIC printf(LMSG, __func__, "data", (uintmax_t)epp->ep_dsize, (uintmax_t)dlimit); #endif error = ENOMEM; break; } return 0; } /* * Reset all the fields that may have been modified by the * loader. */ KASSERT(epp->ep_emul_arg == NULL); if (epp->ep_emul_root != NULL) { vrele(epp->ep_emul_root); epp->ep_emul_root = NULL; } if (epp->ep_interp != NULL) { vrele(epp->ep_interp); epp->ep_interp = NULL; } epp->ep_pax_flags = 0; /* make sure the first "interesting" error code is saved. */ if (error == ENOEXEC) error = newerror; if (epp->ep_flags & EXEC_DESTR) /* Error from "#!" code, tidied up by recursive call */ return error; } /* not found, error */ /* * free any vmspace-creation commands, * and release their references */ kill_vmcmds(&epp->ep_vmcmds); #if NVERIEXEC > 0 || defined(PAX_SEGVGUARD) bad2: #endif /* * close and release the vnode, restore the old one, free the * pathname buf, and punt. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_CLOSE(vp, FREAD, l->l_cred); vput(vp); return error; bad1: /* * free the namei pathname buffer, and put the vnode * (which we don't yet have open). */ vput(vp); /* was still locked */ return error; } #ifdef __MACHINE_STACK_GROWS_UP #define STACK_PTHREADSPACE NBPG #else #define STACK_PTHREADSPACE 0 #endif static int execve_fetch_element(char * const *array, size_t index, char **value) { return copyin(array + index, value, sizeof(*value)); } /* * exec system call */ int sys_execve(struct lwp *l, const struct sys_execve_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(char * const *) argp; syscallarg(char * const *) envp; } */ return execve1(l, true, SCARG(uap, path), -1, SCARG(uap, argp), SCARG(uap, envp), execve_fetch_element); } int sys_fexecve(struct lwp *l, const struct sys_fexecve_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(char * const *) argp; syscallarg(char * const *) envp; } */ return execve1(l, false, NULL, SCARG(uap, fd), SCARG(uap, argp), SCARG(uap, envp), execve_fetch_element); } /* * Load modules to try and execute an image that we do not understand. * If no execsw entries are present, we load those likely to be needed * in order to run native images only. Otherwise, we autoload all * possible modules that could let us run the binary. 
XXX lame */ static void exec_autoload(void) { #ifdef MODULAR static const char * const native[] = { "exec_elf32", "exec_elf64", "exec_script", NULL }; static const char * const compat[] = { "exec_elf32", "exec_elf64", "exec_script", "exec_aout", "exec_coff", "exec_ecoff", "compat_aoutm68k", "compat_netbsd32", #if 0 "compat_linux", "compat_linux32", #endif "compat_sunos", "compat_sunos32", "compat_ultrix", NULL }; char const * const *list; int i; list = nexecs == 0 ? native : compat; for (i = 0; list[i] != NULL; i++) { if (module_autoload(list[i], MODULE_CLASS_EXEC) != 0) { continue; } yield(); } #endif } /* * Copy the user or kernel supplied upath to the allocated pathbuffer pbp * making it absolute in the process, by prepending the current working * directory if it is not. If offs is supplied it will contain the offset * where the original supplied copy of upath starts. */ int exec_makepathbuf(struct lwp *l, const char *upath, enum uio_seg seg, struct pathbuf **pbp, size_t *offs) { char *path, *bp; size_t len, tlen; int error; struct cwdinfo *cwdi; path = PNBUF_GET(); if (seg == UIO_SYSSPACE) { error = copystr(upath, path, MAXPATHLEN, &len); } else { error = copyinstr(upath, path, MAXPATHLEN, &len); } if (error) goto err; if (path[0] == '/') { if (offs) *offs = 0; goto out; } len++; if (len + 1 >= MAXPATHLEN) { error = ENAMETOOLONG; goto err; } bp = path + MAXPATHLEN - len; memmove(bp, path, len); *(--bp) = '/'; cwdi = l->l_proc->p_cwdi; rw_enter(&cwdi->cwdi_lock, RW_READER); error = getcwd_common(cwdi->cwdi_cdir, NULL, &bp, path, MAXPATHLEN / 2, GETCWD_CHECK_ACCESS, l); rw_exit(&cwdi->cwdi_lock); if (error) goto err; tlen = path + MAXPATHLEN - bp; memmove(path, bp, tlen); path[tlen - 1] = '\0'; if (offs) *offs = tlen - len; out: *pbp = pathbuf_assimilate(path); return 0; err: PNBUF_PUT(path); return error; } vaddr_t exec_vm_minaddr(vaddr_t va_min) { /* * Increase va_min if we don't want NULL to be mappable by the * process. */ #define VM_MIN_GUARD PAGE_SIZE if (user_va0_disable && (va_min < VM_MIN_GUARD)) return VM_MIN_GUARD; return va_min; } static int execve_loadvm(struct lwp *l, bool has_path, const char *path, int fd, char * const *args, char * const *envs, execve_fetch_element_t fetch_element, struct execve_data * restrict data) { struct exec_package * const epp = &data->ed_pack; int error; struct proc *p; char *dp; u_int modgen; KASSERT(data != NULL); p = l->l_proc; modgen = 0; SDT_PROBE(proc, kernel, , exec, path, 0, 0, 0, 0); /* * Check if we have exceeded our number of processes limit. * This is so that we handle the case where a root daemon * forked, ran setuid to become the desired user and is trying * to exec. The obvious place to do the reference counting check * is setuid(), but we don't do the reference counting check there * like other OS's do because then all the programs that use setuid() * must be modified to check the return code of setuid() and exit(). * It is dangerous to make setuid() fail, because it fails open and * the program will continue to run as root. If we make it succeed * and return an error code, again we are not enforcing the limit. * The best place to enforce the limit is here, when the process tries * to execute a new image, because eventually the process will need * to call exec in order to do something useful. 
*/ retry: if (p->p_flag & PK_SUGID) { if (kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT, p, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS), &p->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != 0 && chgproccnt(kauth_cred_getuid(l->l_cred), 0) > p->p_rlimit[RLIMIT_NPROC].rlim_cur) return EAGAIN; } /* * Drain existing references and forbid new ones. The process * should be left alone until we're done here. This is necessary * to avoid race conditions - e.g. in ptrace() - that might allow * a local user to illicitly obtain elevated privileges. */ rw_enter(&p->p_reflock, RW_WRITER); if (has_path) { size_t offs; /* * Init the namei data to point the file user's program name. * This is done here rather than in check_exec(), so that it's * possible to override this settings if any of makecmd/probe * functions call check_exec() recursively - for example, * see exec_script_makecmds(). */ if ((error = exec_makepathbuf(l, path, UIO_USERSPACE, &data->ed_pathbuf, &offs)) != 0) goto clrflg; data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf); epp->ep_kname = data->ed_pathstring + offs; data->ed_resolvedname = PNBUF_GET(); epp->ep_resolvedname = data->ed_resolvedname; epp->ep_xfd = -1; } else { data->ed_pathbuf = pathbuf_assimilate(strcpy(PNBUF_GET(), "/")); data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf); epp->ep_kname = "*fexecve*"; data->ed_resolvedname = NULL; epp->ep_resolvedname = NULL; epp->ep_xfd = fd; } /* * initialize the fields of the exec package. */ epp->ep_hdr = kmem_alloc(exec_maxhdrsz, KM_SLEEP); epp->ep_hdrlen = exec_maxhdrsz; epp->ep_hdrvalid = 0; epp->ep_emul_arg = NULL; epp->ep_emul_arg_free = NULL; memset(&epp->ep_vmcmds, 0, sizeof(epp->ep_vmcmds)); epp->ep_vap = &data->ed_attr; epp->ep_flags = (p->p_flag & PK_32) ? EXEC_FROM32 : 0; MD_TOPDOWN_INIT(epp); epp->ep_emul_root = NULL; epp->ep_interp = NULL; epp->ep_esch = NULL; epp->ep_pax_flags = 0; memset(epp->ep_machine_arch, 0, sizeof(epp->ep_machine_arch)); rw_enter(&exec_lock, RW_READER); /* see if we can run it. */ if ((error = check_exec(l, epp, data->ed_pathbuf, &data->ed_resolvedname)) != 0) { if (error != ENOENT && error != EACCES && error != ENOEXEC) { DPRINTF(("%s: check exec failed for %s, error %d\n", __func__, epp->ep_kname, error)); } goto freehdr; } /* allocate an argument buffer */ data->ed_argp = pool_get(&exec_pool, PR_WAITOK); KASSERT(data->ed_argp != NULL); dp = data->ed_argp; if ((error = copyinargs(data, args, envs, fetch_element, &dp)) != 0) { goto bad; } /* * Calculate the new stack size. */ #ifdef __MACHINE_STACK_GROWS_UP /* * copyargs() fills argc/argv/envp from the lower address even on * __MACHINE_STACK_GROWS_UP machines. Reserve a few words just below the SP * so that _rtld() use it. 
*/ #define RTLD_GAP 32 #else #define RTLD_GAP 0 #endif const size_t argenvstrlen = (char *)ALIGN(dp) - data->ed_argp; data->ed_argslen = calcargs(data, argenvstrlen); const size_t len = calcstack(data, pax_aslr_stack_gap(epp) + RTLD_GAP); if (len > epp->ep_ssize) { /* in effect, compare to initial limit */ DPRINTF(("%s: stack limit exceeded %zu\n", __func__, len)); error = ENOMEM; goto bad; } /* adjust "active stack depth" for process VSZ */ epp->ep_ssize = len; return 0; bad: /* free the vmspace-creation commands, and release their references */ kill_vmcmds(&epp->ep_vmcmds); /* kill any opened file descriptor, if necessary */ if (epp->ep_flags & EXEC_HASFD) { epp->ep_flags &= ~EXEC_HASFD; fd_close(epp->ep_fd); } /* close and put the exec'd file */ vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY); VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred); vput(epp->ep_vp); pool_put(&exec_pool, data->ed_argp); freehdr: kmem_free(epp->ep_hdr, epp->ep_hdrlen); if (epp->ep_emul_root != NULL) vrele(epp->ep_emul_root); if (epp->ep_interp != NULL) vrele(epp->ep_interp); rw_exit(&exec_lock); exec_path_free(data); clrflg: rw_exit(&p->p_reflock); if (modgen != module_gen && error == ENOEXEC) { modgen = module_gen; exec_autoload(); goto retry; } SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0); return error; } static int execve_dovmcmds(struct lwp *l, struct execve_data * restrict data) { struct exec_package * const epp = &data->ed_pack; struct proc *p = l->l_proc; struct exec_vmcmd *base_vcp; int error = 0; size_t i; /* record proc's vnode, for use by procfs and others */ if (p->p_textvp) vrele(p->p_textvp); vref(epp->ep_vp); p->p_textvp = epp->ep_vp; /* create the new process's VM space by running the vmcmds */ KASSERTMSG(epp->ep_vmcmds.evs_used != 0, "%s: no vmcmds", __func__); #ifdef TRACE_EXEC DUMPVMCMDS(epp, 0, 0); #endif base_vcp = NULL; for (i = 0; i < epp->ep_vmcmds.evs_used && !error; i++) { struct exec_vmcmd *vcp; vcp = &epp->ep_vmcmds.evs_cmds[i]; if (vcp->ev_flags & VMCMD_RELATIVE) { KASSERTMSG(base_vcp != NULL, "%s: relative vmcmd with no base", __func__); KASSERTMSG((vcp->ev_flags & VMCMD_BASE) == 0, "%s: illegal base & relative vmcmd", __func__); vcp->ev_addr += base_vcp->ev_addr; } error = (*vcp->ev_proc)(l, vcp); if (error) DUMPVMCMDS(epp, i, error); if (vcp->ev_flags & VMCMD_BASE) base_vcp = vcp; } /* free the vmspace-creation commands, and release their references */ kill_vmcmds(&epp->ep_vmcmds); vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY); VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred); vput(epp->ep_vp); /* if an error happened, deallocate and punt */ if (error != 0) { DPRINTF(("%s: vmcmd %zu failed: %d\n", __func__, i - 1, error)); } return error; } static void execve_free_data(struct execve_data *data) { struct exec_package * const epp = &data->ed_pack; /* free the vmspace-creation commands, and release their references */ kill_vmcmds(&epp->ep_vmcmds); /* kill any opened file descriptor, if necessary */ if (epp->ep_flags & EXEC_HASFD) { epp->ep_flags &= ~EXEC_HASFD; fd_close(epp->ep_fd); } /* close and put the exec'd file */ vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY); VOP_CLOSE(epp->ep_vp, FREAD, curlwp->l_cred); vput(epp->ep_vp); pool_put(&exec_pool, data->ed_argp); kmem_free(epp->ep_hdr, epp->ep_hdrlen); if (epp->ep_emul_root != NULL) vrele(epp->ep_emul_root); if (epp->ep_interp != NULL) vrele(epp->ep_interp); exec_path_free(data); } static void pathexec(struct proc *p, const char *resolvedname) { /* set command name & other accounting info */ const char *cmdname; if (resolvedname == NULL) 
{ cmdname = "*fexecve*"; resolvedname = "/"; } else { cmdname = strrchr(resolvedname, '/') + 1; } KASSERTMSG(resolvedname[0] == '/', "bad resolvedname `%s'", resolvedname); strlcpy(p->p_comm, cmdname, sizeof(p->p_comm)); kmem_strfree(p->p_path); p->p_path = kmem_strdupsize(resolvedname, NULL, KM_SLEEP); } /* XXX elsewhere */ static int credexec(struct lwp *l, struct execve_data *data) { struct proc *p = l->l_proc; struct vattr *attr = &data->ed_attr; int error; /* * Deal with set[ug]id. MNT_NOSUID has already been used to disable * s[ug]id. It's OK to check for PSL_TRACED here as we have blocked * out additional references on the process for the moment. */ if ((p->p_slflag & PSL_TRACED) == 0 && (((attr->va_mode & S_ISUID) != 0 && kauth_cred_geteuid(l->l_cred) != attr->va_uid) || ((attr->va_mode & S_ISGID) != 0 && kauth_cred_getegid(l->l_cred) != attr->va_gid))) { /* * Mark the process as SUGID before we do * anything that might block. */ proc_crmod_enter(); proc_crmod_leave(NULL, NULL, true); if (data->ed_argc == 0) { DPRINTF(( "%s: not executing set[ug]id binary with no args\n", __func__)); return EINVAL; } /* Make sure file descriptors 0..2 are in use. */ if ((error = fd_checkstd()) != 0) { DPRINTF(("%s: fdcheckstd failed %d\n", __func__, error)); return error; } /* * Copy the credential so other references don't see our * changes. */ l->l_cred = kauth_cred_copy(l->l_cred); #ifdef KTRACE /* * If the persistent trace flag isn't set, turn off. */ if (p->p_tracep) { mutex_enter(&ktrace_lock); if (!(p->p_traceflag & KTRFAC_PERSISTENT)) ktrderef(p); mutex_exit(&ktrace_lock); } #endif if (attr->va_mode & S_ISUID) kauth_cred_seteuid(l->l_cred, attr->va_uid); if (attr->va_mode & S_ISGID) kauth_cred_setegid(l->l_cred, attr->va_gid); } else { if (kauth_cred_geteuid(l->l_cred) == kauth_cred_getuid(l->l_cred) && kauth_cred_getegid(l->l_cred) == kauth_cred_getgid(l->l_cred)) p->p_flag &= ~PK_SUGID; } /* * Copy the credential so other references don't see our changes. * Test to see if this is necessary first, since in the common case * we won't need a private reference. */ if (kauth_cred_geteuid(l->l_cred) != kauth_cred_getsvuid(l->l_cred) || kauth_cred_getegid(l->l_cred) != kauth_cred_getsvgid(l->l_cred)) { l->l_cred = kauth_cred_copy(l->l_cred); kauth_cred_setsvuid(l->l_cred, kauth_cred_geteuid(l->l_cred)); kauth_cred_setsvgid(l->l_cred, kauth_cred_getegid(l->l_cred)); } /* Update the master credentials. */ if (l->l_cred != p->p_cred) { kauth_cred_t ocred; mutex_enter(p->p_lock); ocred = p->p_cred; p->p_cred = kauth_cred_hold(l->l_cred); mutex_exit(p->p_lock); kauth_cred_free(ocred); } return 0; } static void emulexec(struct lwp *l, struct exec_package *epp) { struct proc *p = l->l_proc; /* The emulation root will usually have been found when we looked * for the elf interpreter (or similar), if not look now. */ if (epp->ep_esch->es_emul->e_path != NULL && epp->ep_emul_root == NULL) emul_find_root(l, epp); /* Any old emulation root got removed by fdcloseexec */ rw_enter(&p->p_cwdi->cwdi_lock, RW_WRITER); p->p_cwdi->cwdi_edir = epp->ep_emul_root; rw_exit(&p->p_cwdi->cwdi_lock); epp->ep_emul_root = NULL; if (epp->ep_interp != NULL) vrele(epp->ep_interp); /* * Call emulation specific exec hook. This can setup per-process * p->p_emuldata or do any other per-process stuff an emulation needs. * * If we are executing process of different emulation than the * original forked process, call e_proc_exit() of the old emulation * first, then e_proc_exec() of new emulation. 
If the emulation is * same, the exec hook code should deallocate any old emulation * resources held previously by this process. */ if (p->p_emul && p->p_emul->e_proc_exit && p->p_emul != epp->ep_esch->es_emul) (*p->p_emul->e_proc_exit)(p); /* * Call exec hook. Emulation code may NOT store reference to anything * from &pack. */ if (epp->ep_esch->es_emul->e_proc_exec) (*epp->ep_esch->es_emul->e_proc_exec)(p, epp); /* update p_emul, the old value is no longer needed */ p->p_emul = epp->ep_esch->es_emul; /* ...and the same for p_execsw */ p->p_execsw = epp->ep_esch; #ifdef __HAVE_SYSCALL_INTERN (*p->p_emul->e_syscall_intern)(p); #endif ktremul(); } static int execve_runproc(struct lwp *l, struct execve_data * restrict data, bool no_local_exec_lock, bool is_spawn) { struct exec_package * const epp = &data->ed_pack; int error = 0; struct proc *p; struct vmspace *vm; /* * In case of a posix_spawn operation, the child doing the exec * might not hold the reader lock on exec_lock, but the parent * will do this instead. */ KASSERT(no_local_exec_lock || rw_lock_held(&exec_lock)); KASSERT(!no_local_exec_lock || is_spawn); KASSERT(data != NULL); p = l->l_proc; /* Get rid of other LWPs. */ if (p->p_nlwps > 1) { mutex_enter(p->p_lock); exit_lwps(l); mutex_exit(p->p_lock); } KDASSERT(p->p_nlwps == 1); /* * All of the other LWPs got rid of their robust futexes * when they exited above, but we might still have some * to dispose of. Do that now. */ if (__predict_false(l->l_robust_head != 0)) { futex_release_all_lwp(l); /* * Since this LWP will live on with a different * program image, we need to clear the robust * futex list pointer here. */ l->l_robust_head = 0; } /* Destroy any lwpctl info. */ if (p->p_lwpctl != NULL) lwp_ctl_exit(); /* Remove POSIX timers */ ptimers_free(p, TIMERS_POSIX); /* Set the PaX flags. */ pax_set_flags(epp, p); /* * Do whatever is necessary to prepare the address space * for remapping. Note that this might replace the current * vmspace with another! * * vfork(): do not touch any user space data in the new child * until we have awoken the parent below, or it will defeat * lazy pmap switching (on x86). */ if (is_spawn) uvmspace_spawn(l, epp->ep_vm_minaddr, epp->ep_vm_maxaddr, epp->ep_flags & EXEC_TOPDOWN_VM); else uvmspace_exec(l, epp->ep_vm_minaddr, epp->ep_vm_maxaddr, epp->ep_flags & EXEC_TOPDOWN_VM); vm = p->p_vmspace; vm->vm_taddr = (void *)epp->ep_taddr; vm->vm_tsize = btoc(epp->ep_tsize); vm->vm_daddr = (void*)epp->ep_daddr; vm->vm_dsize = btoc(epp->ep_dsize); vm->vm_ssize = btoc(epp->ep_ssize); vm->vm_issize = 0; vm->vm_maxsaddr = (void *)epp->ep_maxsaddr; vm->vm_minsaddr = (void *)epp->ep_minsaddr; pax_aslr_init_vm(l, vm, epp); cwdexec(p); fd_closeexec(); /* handle close on exec */ if (__predict_false(ktrace_on)) fd_ktrexecfd(); execsigs(p); /* reset caught signals */ mutex_enter(p->p_lock); l->l_ctxlink = NULL; /* reset ucontext link */ p->p_acflag &= ~AFORK; p->p_flag |= PK_EXEC; mutex_exit(p->p_lock); error = credexec(l, data); if (error) goto exec_abort; #if defined(__HAVE_RAS) /* * Remove all RASs from the address space. */ ras_purgeall(); #endif /* * Stop profiling. */ if ((p->p_stflag & PST_PROFIL) != 0) { mutex_spin_enter(&p->p_stmutex); stopprofclock(p); mutex_spin_exit(&p->p_stmutex); } /* * It's OK to test PL_PPWAIT unlocked here, as other LWPs have * exited and exec()/exit() are the only places it will be cleared. * * Once the parent has been awoken, curlwp may teleport to a new CPU * in sched_vforkexec(), and it's then OK to start messing with user * data. 
See comment above. */ if ((p->p_lflag & PL_PPWAIT) != 0) { bool samecpu; lwp_t *lp; mutex_enter(&proc_lock); lp = p->p_vforklwp; p->p_vforklwp = NULL; l->l_lwpctl = NULL; /* was on loan from blocked parent */ /* Clear flags after cv_broadcast() (scheduler needs them). */ p->p_lflag &= ~PL_PPWAIT; lp->l_vforkwaiting = false; /* If parent is still on same CPU, teleport curlwp elsewhere. */ samecpu = (lp->l_cpu == curlwp->l_cpu); cv_broadcast(&lp->l_waitcv); mutex_exit(&proc_lock); /* Give the parent its CPU back - find a new home. */ KASSERT(!is_spawn); sched_vforkexec(l, samecpu); } /* Now map address space. */ error = execve_dovmcmds(l, data); if (error != 0) goto exec_abort; pathexec(p, epp->ep_resolvedname); char * const newstack = STACK_GROW(vm->vm_minsaddr, epp->ep_ssize); error = copyoutargs(data, l, newstack); if (error != 0) goto exec_abort; doexechooks(p); /* * Set initial SP at the top of the stack. * * Note that on machines where stack grows up (e.g. hppa), SP points to * the end of arg/env strings. Userland guesses the address of argc * via ps_strings::ps_argvstr. */ /* Setup new registers and do misc. setup. */ (*epp->ep_esch->es_emul->e_setregs)(l, epp, (vaddr_t)newstack); if (epp->ep_esch->es_setregs) (*epp->ep_esch->es_setregs)(l, epp, (vaddr_t)newstack); /* Provide a consistent LWP private setting */ (void)lwp_setprivate(l, NULL); /* Discard all PCU state; need to start fresh */ pcu_discard_all(l); /* map the process's signal trampoline code */ if ((error = exec_sigcode_map(p, epp->ep_esch->es_emul)) != 0) { DPRINTF(("%s: map sigcode failed %d\n", __func__, error)); goto exec_abort; } pool_put(&exec_pool, data->ed_argp); /* * Notify anyone who might care that we've exec'd. * * This is slightly racy; someone could sneak in and * attach a knote after we've decided not to notify, * or vice-versa, but that's not particularly bothersome. * knote_proc_exec() will acquire p->p_lock as needed. */ if (!SLIST_EMPTY(&p->p_klist)) { knote_proc_exec(p); } kmem_free(epp->ep_hdr, epp->ep_hdrlen); SDT_PROBE(proc, kernel, , exec__success, epp->ep_kname, 0, 0, 0, 0); emulexec(l, epp); /* Allow new references from the debugger/procfs. */ rw_exit(&p->p_reflock); if (!no_local_exec_lock) rw_exit(&exec_lock); mutex_enter(&proc_lock); /* posix_spawn(3) reports a single event with implied exec(3) */ if ((p->p_slflag & PSL_TRACED) && !is_spawn) { mutex_enter(p->p_lock); eventswitch(TRAP_EXEC, 0, 0); mutex_enter(&proc_lock); } if (p->p_sflag & PS_STOPEXEC) { ksiginfoq_t kq; KASSERT(l->l_blcnt == 0); p->p_pptr->p_nstopchild++; p->p_waited = 0; mutex_enter(p->p_lock); ksiginfo_queue_init(&kq); sigclearall(p, &contsigmask, &kq); lwp_lock(l); l->l_stat = LSSTOP; p->p_stat = SSTOP; p->p_nrlwps--; lwp_unlock(l); mutex_exit(p->p_lock); mutex_exit(&proc_lock); lwp_lock(l); spc_lock(l->l_cpu); mi_switch(l); ksiginfo_queue_drain(&kq); } else { mutex_exit(&proc_lock); } exec_path_free(data); #ifdef TRACE_EXEC DPRINTF(("%s finished\n", __func__)); #endif return EJUSTRETURN; exec_abort: SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0); rw_exit(&p->p_reflock); if (!no_local_exec_lock) rw_exit(&exec_lock); exec_path_free(data); /* * the old process doesn't exist anymore. exit gracefully. 
* get rid of the (new) address space we have created, if any, get rid * of our namei data and vnode, and exit noting failure */ if (vm != NULL) { uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS); } exec_free_emul_arg(epp); pool_put(&exec_pool, data->ed_argp); kmem_free(epp->ep_hdr, epp->ep_hdrlen); if (epp->ep_emul_root != NULL) vrele(epp->ep_emul_root); if (epp->ep_interp != NULL) vrele(epp->ep_interp); /* Acquire the sched-state mutex (exit1() will release it). */ if (!is_spawn) { mutex_enter(p->p_lock); exit1(l, error, SIGABRT); } return error; } int execve1(struct lwp *l, bool has_path, const char *path, int fd, char * const *args, char * const *envs, execve_fetch_element_t fetch_element) { struct execve_data data; int error; error = execve_loadvm(l, has_path, path, fd, args, envs, fetch_element, &data); if (error) return error; error = execve_runproc(l, &data, false, false); return error; } static size_t fromptrsz(const struct exec_package *epp) { return (epp->ep_flags & EXEC_FROM32) ? sizeof(int) : sizeof(char *); } static size_t ptrsz(const struct exec_package *epp) { return (epp->ep_flags & EXEC_32) ? sizeof(int) : sizeof(char *); } static size_t calcargs(struct execve_data * restrict data, const size_t argenvstrlen) { struct exec_package * const epp = &data->ed_pack; const size_t nargenvptrs = 1 + /* long argc */ data->ed_argc + /* char *argv[] */ 1 + /* \0 */ data->ed_envc + /* char *env[] */ 1; /* \0 */ return (nargenvptrs * ptrsz(epp)) /* pointers */ + argenvstrlen /* strings */ + epp->ep_esch->es_arglen; /* auxinfo */ } static size_t calcstack(struct execve_data * restrict data, const size_t gaplen) { struct exec_package * const epp = &data->ed_pack; data->ed_szsigcode = epp->ep_esch->es_emul->e_esigcode - epp->ep_esch->es_emul->e_sigcode; data->ed_ps_strings_sz = (epp->ep_flags & EXEC_32) ? sizeof(struct ps_strings32) : sizeof(struct ps_strings); const size_t sigcode_psstr_sz = data->ed_szsigcode + /* sigcode */ data->ed_ps_strings_sz + /* ps_strings */ STACK_PTHREADSPACE; /* pthread space */ const size_t stacklen = data->ed_argslen + gaplen + sigcode_psstr_sz; /* make the stack "safely" aligned */ return STACK_LEN_ALIGN(stacklen, STACK_ALIGNBYTES); } static int copyoutargs(struct execve_data * restrict data, struct lwp *l, char * const newstack) { struct exec_package * const epp = &data->ed_pack; struct proc *p = l->l_proc; int error; memset(&data->ed_arginfo, 0, sizeof(data->ed_arginfo)); /* remember information about the process */ data->ed_arginfo.ps_nargvstr = data->ed_argc; data->ed_arginfo.ps_nenvstr = data->ed_envc; /* * Allocate the stack address passed to the newly execve()'ed process. * * The new stack address will be set to the SP (stack pointer) register * in setregs(). 
*/ char *newargs = STACK_ALLOC( STACK_SHRINK(newstack, data->ed_argslen), data->ed_argslen); error = (*epp->ep_esch->es_copyargs)(l, epp, &data->ed_arginfo, &newargs, data->ed_argp); if (error) { DPRINTF(("%s: copyargs failed %d\n", __func__, error)); return error; } error = copyoutpsstrs(data, p); if (error != 0) return error; return 0; } static int copyoutpsstrs(struct execve_data * restrict data, struct proc *p) { struct exec_package * const epp = &data->ed_pack; struct ps_strings32 arginfo32; void *aip; int error; /* fill process ps_strings info */ p->p_psstrp = (vaddr_t)STACK_ALLOC(STACK_GROW(epp->ep_minsaddr, STACK_PTHREADSPACE), data->ed_ps_strings_sz); if (epp->ep_flags & EXEC_32) { aip = &arginfo32; arginfo32.ps_argvstr = (vaddr_t)data->ed_arginfo.ps_argvstr; arginfo32.ps_nargvstr = data->ed_arginfo.ps_nargvstr; arginfo32.ps_envstr = (vaddr_t)data->ed_arginfo.ps_envstr; arginfo32.ps_nenvstr = data->ed_arginfo.ps_nenvstr; } else aip = &data->ed_arginfo; /* copy out the process's ps_strings structure */ if ((error = copyout(aip, (void *)p->p_psstrp, data->ed_ps_strings_sz)) != 0) { DPRINTF(("%s: ps_strings copyout %p->%p size %zu failed\n", __func__, aip, (void *)p->p_psstrp, data->ed_ps_strings_sz)); return error; } return 0; } static int copyinargs(struct execve_data * restrict data, char * const *args, char * const *envs, execve_fetch_element_t fetch_element, char **dpp) { struct exec_package * const epp = &data->ed_pack; char *dp; size_t i; int error; dp = *dpp; data->ed_argc = 0; /* copy the fake args list, if there's one, freeing it as we go */ if (epp->ep_flags & EXEC_HASARGL) { struct exec_fakearg *fa = epp->ep_fa; while (fa->fa_arg != NULL) { const size_t maxlen = ARG_MAX - (dp - data->ed_argp); size_t len; len = strlcpy(dp, fa->fa_arg, maxlen); /* Count NUL into len. */ if (len < maxlen) len++; else { while (fa->fa_arg != NULL) { kmem_free(fa->fa_arg, fa->fa_len); fa++; } kmem_free(epp->ep_fa, epp->ep_fa_len); epp->ep_flags &= ~EXEC_HASARGL; return E2BIG; } ktrexecarg(fa->fa_arg, len - 1); dp += len; kmem_free(fa->fa_arg, fa->fa_len); fa++; data->ed_argc++; } kmem_free(epp->ep_fa, epp->ep_fa_len); epp->ep_flags &= ~EXEC_HASARGL; } /* * Read and count argument strings from user. */ if (args == NULL) { DPRINTF(("%s: null args\n", __func__)); return EINVAL; } if (epp->ep_flags & EXEC_SKIPARG) args = (const void *)((const char *)args + fromptrsz(epp)); i = 0; error = copyinargstrs(data, args, fetch_element, &dp, &i, ktr_execarg); if (error != 0) { DPRINTF(("%s: copyin arg %d\n", __func__, error)); return error; } data->ed_argc += i; /* * Read and count environment strings from user. 
*/ data->ed_envc = 0; /* environment need not be there */ if (envs == NULL) goto done; i = 0; error = copyinargstrs(data, envs, fetch_element, &dp, &i, ktr_execenv); if (error != 0) { DPRINTF(("%s: copyin env %d\n", __func__, error)); return error; } data->ed_envc += i; done: *dpp = dp; return 0; } static int copyinargstrs(struct execve_data * restrict data, char * const *strs, execve_fetch_element_t fetch_element, char **dpp, size_t *ip, void (*ktr)(const void *, size_t)) { char *dp, *sp; size_t i; int error; dp = *dpp; i = 0; while (1) { const size_t maxlen = ARG_MAX - (dp - data->ed_argp); size_t len; if ((error = (*fetch_element)(strs, i, &sp)) != 0) { return error; } if (!sp) break; if ((error = copyinstr(sp, dp, maxlen, &len)) != 0) { if (error == ENAMETOOLONG) error = E2BIG; return error; } if (__predict_false(ktrace_on)) (*ktr)(dp, len - 1); dp += len; i++; } *dpp = dp; *ip = i; return 0; } /* * Copy argv and env strings from kernel buffer (argp) to the new stack. * Those strings are located just after auxinfo. */ int copyargs(struct lwp *l, struct exec_package *pack, struct ps_strings *arginfo, char **stackp, void *argp) { char **cpp, *dp, *sp; size_t len; void *nullp; long argc, envc; int error; cpp = (char **)*stackp; nullp = NULL; argc = arginfo->ps_nargvstr; envc = arginfo->ps_nenvstr; /* argc on stack is long */ CTASSERT(sizeof(*cpp) == sizeof(argc)); dp = (char *)(cpp + 1 + /* long argc */ argc + /* char *argv[] */ 1 + /* \0 */ envc + /* char *env[] */ 1) + /* \0 */ pack->ep_esch->es_arglen; /* auxinfo */ sp = argp; if ((error = copyout(&argc, cpp++, sizeof(argc))) != 0) { COPYPRINTF("", cpp - 1, sizeof(argc)); return error; } /* XXX don't copy them out, remap them! */ arginfo->ps_argvstr = cpp; /* remember location of argv for later */ for (; --argc >= 0; sp += len, dp += len) { if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) { COPYPRINTF("", cpp - 1, sizeof(dp)); return error; } if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) { COPYPRINTF("str", dp, (size_t)ARG_MAX); return error; } } if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) { COPYPRINTF("", cpp - 1, sizeof(nullp)); return error; } arginfo->ps_envstr = cpp; /* remember location of envp for later */ for (; --envc >= 0; sp += len, dp += len) { if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) { COPYPRINTF("", cpp - 1, sizeof(dp)); return error; } if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) { COPYPRINTF("str", dp, (size_t)ARG_MAX); return error; } } if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) { COPYPRINTF("", cpp - 1, sizeof(nullp)); return error; } *stackp = (char *)cpp; return 0; } /* * Add execsw[] entries. */ int exec_add(struct execsw *esp, int count) { struct exec_entry *it; int i, error = 0; if (count == 0) { return 0; } /* Check for duplicates. */ rw_enter(&exec_lock, RW_WRITER); for (i = 0; i < count; i++) { LIST_FOREACH(it, &ex_head, ex_list) { /* assume unique (makecmds, probe_func, emulation) */ if (it->ex_sw->es_makecmds == esp[i].es_makecmds && it->ex_sw->u.elf_probe_func == esp[i].u.elf_probe_func && it->ex_sw->es_emul == esp[i].es_emul) { rw_exit(&exec_lock); return EEXIST; } } } /* Allocate new entries. */ for (i = 0; i < count; i++) { it = kmem_alloc(sizeof(*it), KM_SLEEP); it->ex_sw = &esp[i]; error = exec_sigcode_alloc(it->ex_sw->es_emul); if (error != 0) { kmem_free(it, sizeof(*it)); break; } LIST_INSERT_HEAD(&ex_head, it, ex_list); } /* If even one fails, remove them all back. 
*/ if (error != 0) { for (i--; i >= 0; i--) { it = LIST_FIRST(&ex_head); LIST_REMOVE(it, ex_list); exec_sigcode_free(it->ex_sw->es_emul); kmem_free(it, sizeof(*it)); } return error; } /* update execsw[] */ exec_init(0); rw_exit(&exec_lock); return 0; } /* * Remove execsw[] entry. */ int exec_remove(struct execsw *esp, int count) { struct exec_entry *it, *next; int i; const struct proclist_desc *pd; proc_t *p; if (count == 0) { return 0; } /* Abort if any are busy. */ rw_enter(&exec_lock, RW_WRITER); for (i = 0; i < count; i++) { mutex_enter(&proc_lock); for (pd = proclists; pd->pd_list != NULL; pd++) { PROCLIST_FOREACH(p, pd->pd_list) { if (p->p_execsw == &esp[i]) { mutex_exit(&proc_lock); rw_exit(&exec_lock); return EBUSY; } } } mutex_exit(&proc_lock); } /* None are busy, so remove them all. */ for (i = 0; i < count; i++) { for (it = LIST_FIRST(&ex_head); it != NULL; it = next) { next = LIST_NEXT(it, ex_list); if (it->ex_sw == &esp[i]) { LIST_REMOVE(it, ex_list); exec_sigcode_free(it->ex_sw->es_emul); kmem_free(it, sizeof(*it)); break; } } } /* update execsw[] */ exec_init(0); rw_exit(&exec_lock); return 0; } /* * Initialize exec structures. If init_boot is true, also does necessary * one-time initialization (it's called from main() that way). * Once system is multiuser, this should be called with exec_lock held, * i.e. via exec_{add|remove}(). */ int exec_init(int init_boot) { const struct execsw **sw; struct exec_entry *ex; SLIST_HEAD(,exec_entry) first; SLIST_HEAD(,exec_entry) any; SLIST_HEAD(,exec_entry) last; int i, sz; if (init_boot) { /* do one-time initializations */ vaddr_t vmin = 0, vmax; rw_init(&exec_lock); exec_map = uvm_km_suballoc(kernel_map, &vmin, &vmax, maxexec*NCARGS, VM_MAP_PAGEABLE, false, NULL); pool_init(&exec_pool, NCARGS, 0, 0, PR_NOALIGN|PR_NOTOUCH, "execargs", &exec_palloc, IPL_NONE); pool_sethardlimit(&exec_pool, maxexec, "should not happen", 0); } else { KASSERT(rw_write_held(&exec_lock)); } /* Sort each entry onto the appropriate queue. */ SLIST_INIT(&first); SLIST_INIT(&any); SLIST_INIT(&last); sz = 0; LIST_FOREACH(ex, &ex_head, ex_list) { switch(ex->ex_sw->es_prio) { case EXECSW_PRIO_FIRST: SLIST_INSERT_HEAD(&first, ex, ex_slist); break; case EXECSW_PRIO_ANY: SLIST_INSERT_HEAD(&any, ex, ex_slist); break; case EXECSW_PRIO_LAST: SLIST_INSERT_HEAD(&last, ex, ex_slist); break; default: panic("%s", __func__); break; } sz++; } /* * Create new execsw[]. Ensure we do not try a zero-sized * allocation. */ sw = kmem_alloc(sz * sizeof(struct execsw *) + 1, KM_SLEEP); i = 0; SLIST_FOREACH(ex, &first, ex_slist) { sw[i++] = ex->ex_sw; } SLIST_FOREACH(ex, &any, ex_slist) { sw[i++] = ex->ex_sw; } SLIST_FOREACH(ex, &last, ex_slist) { sw[i++] = ex->ex_sw; } /* Replace old execsw[] and free used memory. */ if (execsw != NULL) { kmem_free(__UNCONST(execsw), nexecs * sizeof(struct execsw *) + 1); } execsw = sw; nexecs = sz; /* Figure out the maximum size of an exec header. */ exec_maxhdrsz = sizeof(int); for (i = 0; i < nexecs; i++) { if (execsw[i]->es_hdrsz > exec_maxhdrsz) exec_maxhdrsz = execsw[i]->es_hdrsz; } return 0; } int exec_sigcode_alloc(const struct emul *e) { vaddr_t va; vsize_t sz; int error; struct uvm_object *uobj; KASSERT(rw_lock_held(&exec_lock)); if (e == NULL || e->e_sigobject == NULL) return 0; sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode; if (sz == 0) return 0; /* * Create a sigobject for this emulation. 
* * sigobject is an anonymous memory object (just like SYSV shared * memory) that we keep a permanent reference to and that we map * in all processes that need this sigcode. The creation is simple, * we create an object, add a permanent reference to it, map it in * kernel space, copy out the sigcode to it and unmap it. * We map it with PROT_READ|PROT_EXEC into the process just * the way sys_mmap() would map it. */ if (*e->e_sigobject == NULL) { uobj = uao_create(sz, 0); (*uobj->pgops->pgo_reference)(uobj); va = vm_map_min(kernel_map); if ((error = uvm_map(kernel_map, &va, round_page(sz), uobj, 0, 0, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_SHARE, UVM_ADV_RANDOM, 0)))) { printf("sigcode kernel mapping failed %d\n", error); (*uobj->pgops->pgo_detach)(uobj); return error; } memcpy((void *)va, e->e_sigcode, sz); #ifdef PMAP_NEED_PROCWR pmap_procwr(&proc0, va, sz); #endif uvm_unmap(kernel_map, va, va + round_page(sz)); *e->e_sigobject = uobj; KASSERT(uobj->uo_refs == 1); } else { /* if already created, reference++ */ uobj = *e->e_sigobject; (*uobj->pgops->pgo_reference)(uobj); } return 0; } void exec_sigcode_free(const struct emul *e) { struct uvm_object *uobj; KASSERT(rw_lock_held(&exec_lock)); if (e == NULL || e->e_sigobject == NULL) return; uobj = *e->e_sigobject; if (uobj == NULL) return; if (uobj->uo_refs == 1) *e->e_sigobject = NULL; /* I'm the last person to reference. */ (*uobj->pgops->pgo_detach)(uobj); } static int exec_sigcode_map(struct proc *p, const struct emul *e) { vaddr_t va; vsize_t sz; int error; struct uvm_object *uobj; sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode; if (e->e_sigobject == NULL || sz == 0) return 0; uobj = *e->e_sigobject; if (uobj == NULL) return 0; /* Just a hint to uvm_map where to put it. */ va = e->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr, round_page(sz), p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN); #ifdef __alpha__ /* * Tru64 puts /sbin/loader at the end of user virtual memory, * which causes the above calculation to put the sigcode at * an invalid address. Put it just below the text instead. */ if (va == (vaddr_t)vm_map_max(&p->p_vmspace->vm_map)) { va = (vaddr_t)p->p_vmspace->vm_taddr - round_page(sz); } #endif (*uobj->pgops->pgo_reference)(uobj); error = uvm_map(&p->p_vmspace->vm_map, &va, round_page(sz), uobj, 0, 0, UVM_MAPFLAG(UVM_PROT_RX, UVM_PROT_RX, UVM_INH_SHARE, UVM_ADV_RANDOM, 0)); if (error) { DPRINTF(("%s, %d: map %p " "uvm_map %#"PRIxVSIZE"@%#"PRIxVADDR" failed %d\n", __func__, __LINE__, &p->p_vmspace->vm_map, round_page(sz), va, error)); (*uobj->pgops->pgo_detach)(uobj); return error; } p->p_sigctx.ps_sigcode = (void *)va; return 0; } /* * Release a refcount on spawn_exec_data and destroy memory, if this * was the last one. 
*/ static void spawn_exec_data_release(struct spawn_exec_data *data) { membar_release(); if (atomic_dec_32_nv(&data->sed_refcnt) != 0) return; membar_acquire(); cv_destroy(&data->sed_cv_child_ready); mutex_destroy(&data->sed_mtx_child); if (data->sed_actions) posix_spawn_fa_free(data->sed_actions, data->sed_actions->len); if (data->sed_attrs) kmem_free(data->sed_attrs, sizeof(*data->sed_attrs)); kmem_free(data, sizeof(*data)); } static int handle_posix_spawn_file_actions(struct posix_spawn_file_actions *actions) { struct lwp *l = curlwp; register_t retval; int error, newfd; if (actions == NULL) return 0; for (size_t i = 0; i < actions->len; i++) { const struct posix_spawn_file_actions_entry *fae = &actions->fae[i]; switch (fae->fae_action) { case FAE_OPEN: if (fd_getfile(fae->fae_fildes) != NULL) { error = fd_close(fae->fae_fildes); if (error) return error; } error = fd_open(fae->fae_path, fae->fae_oflag, fae->fae_mode, &newfd); if (error) return error; if (newfd != fae->fae_fildes) { error = dodup(l, newfd, fae->fae_fildes, 0, &retval); if (fd_getfile(newfd) != NULL) fd_close(newfd); } break; case FAE_DUP2: error = dodup(l, fae->fae_fildes, fae->fae_newfildes, 0, &retval); break; case FAE_CLOSE: if (fd_getfile(fae->fae_fildes) == NULL) { return EBADF; } error = fd_close(fae->fae_fildes); break; case FAE_CHDIR: error = do_sys_chdir(l, fae->fae_chdir_path, UIO_SYSSPACE, &retval); break; case FAE_FCHDIR: error = do_sys_fchdir(l, fae->fae_fildes, &retval); break; } if (error) return error; } return 0; } static int handle_posix_spawn_attrs(struct posix_spawnattr *attrs, struct proc *parent) { struct sigaction sigact; int error; struct proc *p = curproc; struct lwp *l = curlwp; if (attrs == NULL) return 0; memset(&sigact, 0, sizeof(sigact)); sigact._sa_u._sa_handler = SIG_DFL; sigact.sa_flags = 0; /* * set state to SSTOP so that this proc can be found by pid. * see proc_enterprp, do_sched_setparam below */ mutex_enter(&proc_lock); /* * p_stat should be SACTIVE, so we need to adjust the * parent's p_nstopchild here. For safety, just make * we're on the good side of SDEAD before we adjust. */ int ostat = p->p_stat; KASSERT(ostat < SSTOP); p->p_stat = SSTOP; p->p_waited = 0; p->p_pptr->p_nstopchild++; mutex_exit(&proc_lock); /* Set process group */ if (attrs->sa_flags & POSIX_SPAWN_SETPGROUP) { pid_t mypid = p->p_pid; pid_t pgrp = attrs->sa_pgroup; if (pgrp == 0) pgrp = mypid; error = proc_enterpgrp(parent, mypid, pgrp, false); if (error) goto out; } /* Set scheduler policy */ if (attrs->sa_flags & POSIX_SPAWN_SETSCHEDULER) error = do_sched_setparam(p->p_pid, 0, attrs->sa_schedpolicy, &attrs->sa_schedparam); else if (attrs->sa_flags & POSIX_SPAWN_SETSCHEDPARAM) { error = do_sched_setparam(parent->p_pid, 0, SCHED_NONE, &attrs->sa_schedparam); } if (error) goto out; /* Reset user ID's */ if (attrs->sa_flags & POSIX_SPAWN_RESETIDS) { error = do_setresgid(l, -1, kauth_cred_getgid(l->l_cred), -1, ID_E_EQ_R | ID_E_EQ_S); if (error) return error; error = do_setresuid(l, -1, kauth_cred_getuid(l->l_cred), -1, ID_E_EQ_R | ID_E_EQ_S); if (error) goto out; } /* Set signal masks/defaults */ if (attrs->sa_flags & POSIX_SPAWN_SETSIGMASK) { mutex_enter(p->p_lock); error = sigprocmask1(l, SIG_SETMASK, &attrs->sa_sigmask, NULL); mutex_exit(p->p_lock); if (error) goto out; } if (attrs->sa_flags & POSIX_SPAWN_SETSIGDEF) { /* * The following sigaction call is using a sigaction * version 0 trampoline which is in the compatibility * code only. 
This is not a problem because for SIG_DFL * and SIG_IGN, the trampolines are now ignored. If they * were not, this would be a problem because we are * holding the exec_lock, and the compat code needs * to do the same in order to replace the trampoline * code of the process. */ for (int i = 1; i <= NSIG; i++) { if (sigismember(&attrs->sa_sigdefault, i)) sigaction1(l, i, &sigact, NULL, NULL, 0); } } error = 0; out: mutex_enter(&proc_lock); p->p_stat = ostat; p->p_pptr->p_nstopchild--; mutex_exit(&proc_lock); return error; } /* * A child lwp of a posix_spawn operation starts here and ends up in * cpu_spawn_return, dealing with all filedescriptor and scheduler * manipulations in between. * The parent waits for the child, as it is not clear whether the child * will be able to acquire its own exec_lock. If it can, the parent can * be released early and continue running in parallel. If not (or if the * magic debug flag is passed in the scheduler attribute struct), the * child rides on the parent's exec lock until it is ready to return to * to userland - and only then releases the parent. This method loses * concurrency, but improves error reporting. */ static void spawn_return(void *arg) { struct spawn_exec_data *spawn_data = arg; struct lwp *l = curlwp; struct proc *p = l->l_proc; int error; bool have_reflock; bool parent_is_waiting = true; /* * Check if we can release parent early. * We either need to have no sed_attrs, or sed_attrs does not * have POSIX_SPAWN_RETURNERROR or one of the flags, that require * safe access to the parent proc (passed in sed_parent). * We then try to get the exec_lock, and only if that works, we can * release the parent here already. */ struct posix_spawnattr *attrs = spawn_data->sed_attrs; if ((!attrs || (attrs->sa_flags & (POSIX_SPAWN_RETURNERROR|POSIX_SPAWN_SETPGROUP)) == 0) && rw_tryenter(&exec_lock, RW_READER)) { parent_is_waiting = false; mutex_enter(&spawn_data->sed_mtx_child); cv_signal(&spawn_data->sed_cv_child_ready); mutex_exit(&spawn_data->sed_mtx_child); } /* don't allow debugger access yet */ rw_enter(&p->p_reflock, RW_WRITER); have_reflock = true; /* handle posix_spawnattr */ error = handle_posix_spawn_attrs(attrs, spawn_data->sed_parent); if (error) goto report_error; /* handle posix_spawn_file_actions */ error = handle_posix_spawn_file_actions(spawn_data->sed_actions); if (error) goto report_error; /* now do the real exec */ error = execve_runproc(l, &spawn_data->sed_exec, parent_is_waiting, true); have_reflock = false; if (error == EJUSTRETURN) error = 0; else if (error) goto report_error; if (parent_is_waiting) { mutex_enter(&spawn_data->sed_mtx_child); cv_signal(&spawn_data->sed_cv_child_ready); mutex_exit(&spawn_data->sed_mtx_child); } /* release our refcount on the data */ spawn_exec_data_release(spawn_data); if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) == (PSL_TRACED|PSL_TRACEDCHILD)) { eventswitchchild(p, TRAP_CHLD, PTRACE_POSIX_SPAWN); } /* and finally: leave to userland for the first time */ cpu_spawn_return(l); /* NOTREACHED */ return; report_error: if (have_reflock) { /* * We have not passed through execve_runproc(), * which would have released the p_reflock and also * taken ownership of the sed_exec part of spawn_data, * so release/free both here. 
*/ rw_exit(&p->p_reflock); execve_free_data(&spawn_data->sed_exec); } if (parent_is_waiting) { /* pass error to parent */ mutex_enter(&spawn_data->sed_mtx_child); spawn_data->sed_error = error; cv_signal(&spawn_data->sed_cv_child_ready); mutex_exit(&spawn_data->sed_mtx_child); } else { rw_exit(&exec_lock); } /* release our refcount on the data */ spawn_exec_data_release(spawn_data); /* done, exit */ mutex_enter(p->p_lock); /* * Posix explicitly asks for an exit code of 127 if we report * errors from the child process - so, unfortunately, there * is no way to report a more exact error code. * A NetBSD specific workaround is POSIX_SPAWN_RETURNERROR as * flag bit in the attrp argument to posix_spawn(2), see above. */ exit1(l, 127, 0); } static __inline char ** posix_spawn_fae_path(struct posix_spawn_file_actions_entry *fae) { switch (fae->fae_action) { case FAE_OPEN: return &fae->fae_path; case FAE_CHDIR: return &fae->fae_chdir_path; default: return NULL; } } void posix_spawn_fa_free(struct posix_spawn_file_actions *fa, size_t len) { for (size_t i = 0; i < len; i++) { char **pathp = posix_spawn_fae_path(&fa->fae[i]); if (pathp) kmem_strfree(*pathp); } if (fa->len > 0) kmem_free(fa->fae, sizeof(*fa->fae) * fa->len); kmem_free(fa, sizeof(*fa)); } static int posix_spawn_fa_alloc(struct posix_spawn_file_actions **fap, const struct posix_spawn_file_actions *ufa, rlim_t lim) { struct posix_spawn_file_actions *fa; struct posix_spawn_file_actions_entry *fae; char *pbuf = NULL; int error; size_t i = 0; fa = kmem_alloc(sizeof(*fa), KM_SLEEP); error = copyin(ufa, fa, sizeof(*fa)); if (error || fa->len == 0) { kmem_free(fa, sizeof(*fa)); return error; /* 0 if not an error, and len == 0 */ } if (fa->len > lim) { kmem_free(fa, sizeof(*fa)); return EINVAL; } fa->size = fa->len; size_t fal = fa->len * sizeof(*fae); fae = fa->fae; fa->fae = kmem_alloc(fal, KM_SLEEP); error = copyin(fae, fa->fae, fal); if (error) goto out; pbuf = PNBUF_GET(); for (; i < fa->len; i++) { char **pathp = posix_spawn_fae_path(&fa->fae[i]); if (pathp == NULL) continue; error = copyinstr(*pathp, pbuf, MAXPATHLEN, &fal); if (error) goto out; *pathp = kmem_alloc(fal, KM_SLEEP); memcpy(*pathp, pbuf, fal); } PNBUF_PUT(pbuf); *fap = fa; return 0; out: if (pbuf) PNBUF_PUT(pbuf); posix_spawn_fa_free(fa, i); return error; } /* * N.B. increments nprocs upon success. Callers need to drop nprocs if * they fail for some other reason. */ int check_posix_spawn(struct lwp *l1) { int error, tnprocs, count; uid_t uid; struct proc *p1; p1 = l1->l_proc; uid = kauth_cred_getuid(l1->l_cred); tnprocs = atomic_inc_uint_nv(&nprocs); /* * Although process entries are dynamically created, we still keep * a global limit on the maximum number we will create. */ if (__predict_false(tnprocs >= maxproc)) error = -1; else error = kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL); if (error) { atomic_dec_uint(&nprocs); return EAGAIN; } /* * Enforce limits. 
*/ count = chgproccnt(uid, 1); if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT, p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS), &p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != 0 && __predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) { (void)chgproccnt(uid, -1); atomic_dec_uint(&nprocs); return EAGAIN; } return 0; } int do_posix_spawn(struct lwp *l1, pid_t *pid_res, bool *child_ok, const char *path, struct posix_spawn_file_actions *fa, struct posix_spawnattr *sa, char *const *argv, char *const *envp, execve_fetch_element_t fetch) { struct proc *p1, *p2; struct lwp *l2; int error; struct spawn_exec_data *spawn_data; vaddr_t uaddr = 0; pid_t pid; bool have_exec_lock = false; p1 = l1->l_proc; /* Allocate and init spawn_data */ spawn_data = kmem_zalloc(sizeof(*spawn_data), KM_SLEEP); spawn_data->sed_refcnt = 1; /* only parent so far */ cv_init(&spawn_data->sed_cv_child_ready, "pspawn"); mutex_init(&spawn_data->sed_mtx_child, MUTEX_DEFAULT, IPL_NONE); mutex_enter(&spawn_data->sed_mtx_child); /* * Do the first part of the exec now, collect state * in spawn_data. */ error = execve_loadvm(l1, true, path, -1, argv, envp, fetch, &spawn_data->sed_exec); if (error == EJUSTRETURN) error = 0; else if (error) goto error_exit; have_exec_lock = true; /* * Allocate virtual address space for the U-area now, while it * is still easy to abort the fork operation if we're out of * kernel virtual address space. */ uaddr = uvm_uarea_alloc(); if (__predict_false(uaddr == 0)) { error = ENOMEM; goto error_exit; } /* * Allocate new proc. Borrow proc0 vmspace for it, we will * replace it with its own before returning to userland * in the child. */ p2 = proc_alloc(); if (p2 == NULL) { /* We were unable to allocate a process ID. */ error = EAGAIN; goto error_exit; } /* * This is a point of no return, we will have to go through * the child proc to properly clean it up past this point. */ pid = p2->p_pid; /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ memset(&p2->p_startzero, 0, (unsigned) ((char *)&p2->p_endzero - (char *)&p2->p_startzero)); memcpy(&p2->p_startcopy, &p1->p_startcopy, (unsigned) ((char *)&p2->p_endcopy - (char *)&p2->p_startcopy)); p2->p_vmspace = proc0.p_vmspace; TAILQ_INIT(&p2->p_sigpend.sp_info); LIST_INIT(&p2->p_lwps); LIST_INIT(&p2->p_sigwaiters); /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. * Inherit flags we want to keep. The flags related to SIGCHLD * handling are important in order to keep a consistent behaviour * for the child after the fork. If we are a 32-bit process, the * child will be too. */ p2->p_flag = p1->p_flag & (PK_SUGID | PK_NOCLDWAIT | PK_CLDSIGIGN | PK_32); p2->p_emul = p1->p_emul; p2->p_execsw = p1->p_execsw; mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH); mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE); rw_init(&p2->p_reflock); cv_init(&p2->p_waitcv, "wait"); cv_init(&p2->p_lwpcv, "lwpwait"); p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); kauth_proc_fork(p1, p2); p2->p_raslist = NULL; p2->p_fd = fd_copy(); /* XXX racy */ p2->p_mqueue_cnt = p1->p_mqueue_cnt; p2->p_cwdi = cwdinit(); /* * Note: p_limit (rlimit stuff) is copy-on-write, so normally * we just need increase pl_refcnt. 
*/ if (!p1->p_limit->pl_writeable) { lim_addref(p1->p_limit); p2->p_limit = p1->p_limit; } else { p2->p_limit = lim_copy(p1->p_limit); } p2->p_lflag = 0; l1->l_vforkwaiting = false; p2->p_sflag = 0; p2->p_slflag = 0; p2->p_pptr = p1; p2->p_ppid = p1->p_pid; LIST_INIT(&p2->p_children); p2->p_aio = NULL; #ifdef KTRACE /* * Copy traceflag and tracefile if enabled. * If not inherited, these were zeroed above. */ if (p1->p_traceflag & KTRFAC_INHERIT) { mutex_enter(&ktrace_lock); p2->p_traceflag = p1->p_traceflag; if ((p2->p_tracep = p1->p_tracep) != NULL) ktradref(p2); mutex_exit(&ktrace_lock); } #endif /* * Create signal actions for the child process. */ p2->p_sigacts = sigactsinit(p1, 0); mutex_enter(p1->p_lock); p2->p_sflag |= (p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP)); sched_proc_fork(p1, p2); mutex_exit(p1->p_lock); p2->p_stflag = p1->p_stflag; /* * p_stats. * Copy parts of p_stats, and zero out the rest. */ p2->p_stats = pstatscopy(p1->p_stats); /* copy over machdep flags to the new proc */ cpu_proc_fork(p1, p2); /* * Prepare remaining parts of spawn data */ spawn_data->sed_actions = fa; spawn_data->sed_attrs = sa; spawn_data->sed_parent = p1; /* create LWP */ lwp_create(l1, p2, uaddr, 0, NULL, 0, spawn_return, spawn_data, &l2, l1->l_class, &l1->l_sigmask, &l1->l_sigstk); l2->l_ctxlink = NULL; /* reset ucontext link */ /* * Copy the credential so other references don't see our changes. * Test to see if this is necessary first, since in the common case * we won't need a private reference. */ if (kauth_cred_geteuid(l2->l_cred) != kauth_cred_getsvuid(l2->l_cred) || kauth_cred_getegid(l2->l_cred) != kauth_cred_getsvgid(l2->l_cred)) { l2->l_cred = kauth_cred_copy(l2->l_cred); kauth_cred_setsvuid(l2->l_cred, kauth_cred_geteuid(l2->l_cred)); kauth_cred_setsvgid(l2->l_cred, kauth_cred_getegid(l2->l_cred)); } /* Update the master credentials. */ if (l2->l_cred != p2->p_cred) { kauth_cred_t ocred; mutex_enter(p2->p_lock); ocred = p2->p_cred; p2->p_cred = kauth_cred_hold(l2->l_cred); mutex_exit(p2->p_lock); kauth_cred_free(ocred); } *child_ok = true; spawn_data->sed_refcnt = 2; /* child gets it as well */ #if 0 l2->l_nopreempt = 1; /* start it non-preemptable */ #endif /* * It's now safe for the scheduler and other processes to see the * child process. */ mutex_enter(&proc_lock); if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT) p2->p_lflag |= PL_CONTROLT; LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling); p2->p_exitsig = SIGCHLD; /* signal for parent on exit */ if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) == (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) { proc_changeparent(p2, p1->p_pptr); SET(p2->p_slflag, PSL_TRACEDCHILD); } p2->p_oppid = p1->p_pid; /* Remember the original parent id. */ LIST_INSERT_AFTER(p1, p2, p_pglist); LIST_INSERT_HEAD(&allproc, p2, p_list); p2->p_trace_enabled = trace_is_enabled(p2); #ifdef __HAVE_SYSCALL_INTERN (*p2->p_emul->e_syscall_intern)(p2); #endif /* * Make child runnable, set start time, and add to run queue except * if the parent requested the child to start in SSTOP state. 
*/ mutex_enter(p2->p_lock); getmicrotime(&p2->p_stats->p_start); lwp_lock(l2); KASSERT(p2->p_nrlwps == 1); KASSERT(l2->l_stat == LSIDL); p2->p_nrlwps = 1; p2->p_stat = SACTIVE; setrunnable(l2); /* LWP now unlocked */ mutex_exit(p2->p_lock); mutex_exit(&proc_lock); cv_wait(&spawn_data->sed_cv_child_ready, &spawn_data->sed_mtx_child); error = spawn_data->sed_error; mutex_exit(&spawn_data->sed_mtx_child); spawn_exec_data_release(spawn_data); rw_exit(&p1->p_reflock); rw_exit(&exec_lock); have_exec_lock = false; *pid_res = pid; if (error) return error; if (p1->p_slflag & PSL_TRACED) { /* Paranoid check */ mutex_enter(&proc_lock); if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) != (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) { mutex_exit(&proc_lock); return 0; } mutex_enter(p1->p_lock); eventswitch(TRAP_CHLD, PTRACE_POSIX_SPAWN, pid); } return 0; error_exit: if (have_exec_lock) { execve_free_data(&spawn_data->sed_exec); rw_exit(&p1->p_reflock); rw_exit(&exec_lock); } mutex_exit(&spawn_data->sed_mtx_child); spawn_exec_data_release(spawn_data); if (uaddr != 0) uvm_uarea_free(uaddr); return error; } int sys_posix_spawn(struct lwp *l1, const struct sys_posix_spawn_args *uap, register_t *retval) { /* { syscallarg(pid_t *) pid; syscallarg(const char *) path; syscallarg(const struct posix_spawn_file_actions *) file_actions; syscallarg(const struct posix_spawnattr *) attrp; syscallarg(char *const *) argv; syscallarg(char *const *) envp; } */ int error; struct posix_spawn_file_actions *fa = NULL; struct posix_spawnattr *sa = NULL; pid_t pid; bool child_ok = false; rlim_t max_fileactions; proc_t *p = l1->l_proc; /* check_posix_spawn() increments nprocs for us. */ error = check_posix_spawn(l1); if (error) { *retval = error; return 0; } /* copy in file_actions struct */ if (SCARG(uap, file_actions) != NULL) { max_fileactions = 2 * uimin(p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); error = posix_spawn_fa_alloc(&fa, SCARG(uap, file_actions), max_fileactions); if (error) goto error_exit; } /* copyin posix_spawnattr struct */ if (SCARG(uap, attrp) != NULL) { sa = kmem_alloc(sizeof(*sa), KM_SLEEP); error = copyin(SCARG(uap, attrp), sa, sizeof(*sa)); if (error) goto error_exit; } /* * Do the spawn */ error = do_posix_spawn(l1, &pid, &child_ok, SCARG(uap, path), fa, sa, SCARG(uap, argv), SCARG(uap, envp), execve_fetch_element); if (error) goto error_exit; if (error == 0 && SCARG(uap, pid) != NULL) error = copyout(&pid, SCARG(uap, pid), sizeof(pid)); *retval = error; return 0; error_exit: if (!child_ok) { (void)chgproccnt(kauth_cred_getuid(l1->l_cred), -1); atomic_dec_uint(&nprocs); if (sa) kmem_free(sa, sizeof(*sa)); if (fa) posix_spawn_fa_free(fa, fa->len); } *retval = error; return 0; } void exec_free_emul_arg(struct exec_package *epp) { if (epp->ep_emul_arg_free != NULL) { KASSERT(epp->ep_emul_arg != NULL); (*epp->ep_emul_arg_free)(epp->ep_emul_arg); epp->ep_emul_arg_free = NULL; epp->ep_emul_arg = NULL; } else { KASSERT(epp->ep_emul_arg == NULL); } } #ifdef DEBUG_EXEC static void dump_vmcmds(const struct exec_package * const epp, size_t x, int error) { struct exec_vmcmd *vp = &epp->ep_vmcmds.evs_cmds[0]; size_t j; if (error == 0) DPRINTF(("vmcmds %u\n", epp->ep_vmcmds.evs_used)); else DPRINTF(("vmcmds %zu/%u, error %d\n", x, epp->ep_vmcmds.evs_used, error)); for (j = 0; j < epp->ep_vmcmds.evs_used; j++) { DPRINTF(("vmcmd[%zu] = vmcmd_map_%s %#" PRIxVADDR"/%#"PRIxVSIZE" fd@%#" PRIxVSIZE" prot=0%o flags=%d\n", j, vp[j].ev_proc == vmcmd_map_pagedvn ? "pagedvn" : vp[j].ev_proc == vmcmd_map_readvn ? 
"readvn" : vp[j].ev_proc == vmcmd_map_zero ? "zero" : "*unknown*", vp[j].ev_addr, vp[j].ev_len, vp[j].ev_offset, vp[j].ev_prot, vp[j].ev_flags)); if (error != 0 && j == x) DPRINTF((" ^--- failed\n")); } } #endif
/* $NetBSD: bitops.h,v 1.15 2021/09/12 15:22:05 rillig Exp $ */ /*- * Copyright (c) 2007, 2010 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christos Zoulas and Joerg Sonnenberger. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #ifndef _SYS_BITOPS_H_ #define _SYS_BITOPS_H_ #include <sys/stdint.h> /* * Find First Set functions */ #ifndef ffs32 static __inline int __unused ffs32(uint32_t _n) { int _v; if (!_n) return 0; _v = 1; if ((_n & 0x0000FFFFU) == 0) { _n >>= 16; _v += 16; } if ((_n & 0x000000FFU) == 0) { _n >>= 8; _v += 8; } if ((_n & 0x0000000FU) == 0) { _n >>= 4; _v += 4; } if ((_n & 0x00000003U) == 0) { _n >>= 2; _v += 2; } if ((_n & 0x00000001U) == 0) { _n >>= 1; _v += 1; } return _v; } #endif #ifndef ffs64 static __inline int __unused ffs64(uint64_t _n) { int _v; if (!_n) return 0; _v = 1; if ((_n & 0x00000000FFFFFFFFULL) == 0) { _n >>= 32; _v += 32; } if ((_n & 0x000000000000FFFFULL) == 0) { _n >>= 16; _v += 16; } if ((_n & 0x00000000000000FFULL) == 0) { _n >>= 8; _v += 8; } if ((_n & 0x000000000000000FULL) == 0) { _n >>= 4; _v += 4; } if ((_n & 0x0000000000000003ULL) == 0) { _n >>= 2; _v += 2; } if ((_n & 0x0000000000000001ULL) == 0) { _n >>= 1; _v += 1; } return _v; } #endif /* * Find Last Set functions */ #ifndef fls32 static __inline int __unused fls32(uint32_t _n) { int _v; if (!_n) return 0; _v = 32; if ((_n & 0xFFFF0000U) == 0) { _n <<= 16; _v -= 16; } if ((_n & 0xFF000000U) == 0) { _n <<= 8; _v -= 8; } if ((_n & 0xF0000000U) == 0) { _n <<= 4; _v -= 4; } if ((_n & 0xC0000000U) == 0) { _n <<= 2; _v -= 2; } if ((_n & 0x80000000U) == 0) { _n <<= 1; _v -= 1; } return _v; } #endif #ifndef fls64 static __inline int __unused fls64(uint64_t _n) { int _v; if (!_n) return 0; _v = 64; if ((_n & 0xFFFFFFFF00000000ULL) == 0) { _n <<= 32; _v -= 32; } if ((_n & 0xFFFF000000000000ULL) == 0) { _n <<= 16; _v -= 16; } if ((_n & 0xFF00000000000000ULL) == 0) { _n <<= 8; _v -= 8; } if ((_n & 0xF000000000000000ULL) == 0) { _n <<= 4; _v -= 4; } if ((_n & 0xC000000000000000ULL) == 0) { _n <<= 2; _v -= 2; } if ((_n & 0x8000000000000000ULL) == 0) { _n <<= 1; _v -= 1; } return _v; } #endif /* * Integer logarithm, returns -1 on error. Inspired by the linux * version written by David Howells. */ #define _ilog2_helper(_n, _x) ((_n) & (1ULL << (_x))) ? 
_x : #define _ilog2_const(_n) ( \ _ilog2_helper(_n, 63) \ _ilog2_helper(_n, 62) \ _ilog2_helper(_n, 61) \ _ilog2_helper(_n, 60) \ _ilog2_helper(_n, 59) \ _ilog2_helper(_n, 58) \ _ilog2_helper(_n, 57) \ _ilog2_helper(_n, 56) \ _ilog2_helper(_n, 55) \ _ilog2_helper(_n, 54) \ _ilog2_helper(_n, 53) \ _ilog2_helper(_n, 52) \ _ilog2_helper(_n, 51) \ _ilog2_helper(_n, 50) \ _ilog2_helper(_n, 49) \ _ilog2_helper(_n, 48) \ _ilog2_helper(_n, 47) \ _ilog2_helper(_n, 46) \ _ilog2_helper(_n, 45) \ _ilog2_helper(_n, 44) \ _ilog2_helper(_n, 43) \ _ilog2_helper(_n, 42) \ _ilog2_helper(_n, 41) \ _ilog2_helper(_n, 40) \ _ilog2_helper(_n, 39) \ _ilog2_helper(_n, 38) \ _ilog2_helper(_n, 37) \ _ilog2_helper(_n, 36) \ _ilog2_helper(_n, 35) \ _ilog2_helper(_n, 34) \ _ilog2_helper(_n, 33) \ _ilog2_helper(_n, 32) \ _ilog2_helper(_n, 31) \ _ilog2_helper(_n, 30) \ _ilog2_helper(_n, 29) \ _ilog2_helper(_n, 28) \ _ilog2_helper(_n, 27) \ _ilog2_helper(_n, 26) \ _ilog2_helper(_n, 25) \ _ilog2_helper(_n, 24) \ _ilog2_helper(_n, 23) \ _ilog2_helper(_n, 22) \ _ilog2_helper(_n, 21) \ _ilog2_helper(_n, 20) \ _ilog2_helper(_n, 19) \ _ilog2_helper(_n, 18) \ _ilog2_helper(_n, 17) \ _ilog2_helper(_n, 16) \ _ilog2_helper(_n, 15) \ _ilog2_helper(_n, 14) \ _ilog2_helper(_n, 13) \ _ilog2_helper(_n, 12) \ _ilog2_helper(_n, 11) \ _ilog2_helper(_n, 10) \ _ilog2_helper(_n, 9) \ _ilog2_helper(_n, 8) \ _ilog2_helper(_n, 7) \ _ilog2_helper(_n, 6) \ _ilog2_helper(_n, 5) \ _ilog2_helper(_n, 4) \ _ilog2_helper(_n, 3) \ _ilog2_helper(_n, 2) \ _ilog2_helper(_n, 1) \ _ilog2_helper(_n, 0) \ -1) #define ilog2(_n) \ ( \ __builtin_constant_p(_n) ? _ilog2_const(_n) : \ ((sizeof(_n) > 4 ? fls64(_n) : fls32(_n)) - 1) \ ) static __inline void fast_divide32_prepare(uint32_t _div, uint32_t * __restrict _m, uint8_t *__restrict _s1, uint8_t *__restrict _s2) { uint64_t _mt; int _l; _l = fls32(_div - 1); _mt = (uint64_t)(0x100000000ULL * ((1ULL << _l) - _div)); *_m = (uint32_t)(_mt / _div + 1); *_s1 = (_l > 1) ? 1U : (uint8_t)_l; *_s2 = (_l == 0) ? 0 : (uint8_t)(_l - 1); } /* ARGSUSED */ static __inline uint32_t fast_divide32(uint32_t _v, uint32_t _div __unused, uint32_t _m, uint8_t _s1, uint8_t _s2) { uint32_t _t; _t = (uint32_t)(((uint64_t)_v * _m) >> 32); return (_t + ((_v - _t) >> _s1)) >> _s2; } static __inline uint32_t fast_remainder32(uint32_t _v, uint32_t _div, uint32_t _m, uint8_t _s1, uint8_t _s2) { return _v - _div * fast_divide32(_v, _div, _m, _s1, _s2); } #define __BITMAP_TYPE(__s, __t, __n) struct __s { \ __t _b[__BITMAP_SIZE(__t, __n)]; \ } #define __BITMAP_BITS(__t) (sizeof(__t) * NBBY) #define __BITMAP_SHIFT(__t) (ilog2(__BITMAP_BITS(__t))) #define __BITMAP_MASK(__t) (__BITMAP_BITS(__t) - 1) #define __BITMAP_SIZE(__t, __n) \ (((__n) + (__BITMAP_BITS(__t) - 1)) / __BITMAP_BITS(__t)) #define __BITMAP_BIT(__n, __v) \ ((__typeof__((__v)->_b[0]))1 << ((__n) & __BITMAP_MASK(*(__v)->_b))) #define __BITMAP_WORD(__n, __v) \ ((__n) >> __BITMAP_SHIFT(*(__v)->_b)) #define __BITMAP_SET(__n, __v) \ ((__v)->_b[__BITMAP_WORD(__n, __v)] |= __BITMAP_BIT(__n, __v)) #define __BITMAP_CLR(__n, __v) \ ((__v)->_b[__BITMAP_WORD(__n, __v)] &= ~__BITMAP_BIT(__n, __v)) #define __BITMAP_ISSET(__n, __v) \ ((__v)->_b[__BITMAP_WORD(__n, __v)] & __BITMAP_BIT(__n, __v)) #if __GNUC_PREREQ__(2, 95) #define __BITMAP_ZERO(__v) \ (void)__builtin_memset((__v), 0, sizeof(*__v)) #else #define __BITMAP_ZERO(__v) do { \ size_t __i; \ for (__i = 0; __i < __arraycount((__v)->_b); __i++) \ (__v)->_b[__i] = 0; \ } while (/* CONSTCOND */ 0) #endif /* GCC 2.95 */ #endif /* _SYS_BITOPS_H_ */
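/*
 * Editor's note -- illustrative addition, not part of the original
 * <sys/bitops.h>.  A short sketch of how the helpers above compose,
 * assuming the header can be included outside the kernel (as it can on
 * NetBSD) and that <sys/param.h> supplies NBBY for the __BITMAP macros.
 */
#include <sys/param.h>
#include <sys/bitops.h>
#include <assert.h>

static void
bitops_example(void)
{
	uint32_t m;
	uint8_t s1, s2;
	__BITMAP_TYPE(example_bitmap, uint32_t, 64) bm;

	/*
	 * Reciprocal division: turn the divisor 10 into a multiplier and
	 * two shift counts once, then divide repeatedly without a
	 * hardware divide instruction.
	 */
	fast_divide32_prepare(10, &m, &s1, &s2);
	assert(fast_divide32(1234, 10, m, s1, s2) == 123);
	assert(fast_remainder32(1234, 10, m, s1, s2) == 4);

	/* ffs/fls are 1-based; ilog2() folds to a constant when it can. */
	assert(ffs32(0x10) == 5);
	assert(fls32(0x80000000U) == 32);
	assert(ilog2(4096) == 12);

	/* A 64-bit bitmap made of two uint32_t words. */
	__BITMAP_ZERO(&bm);
	__BITMAP_SET(42, &bm);
	assert(__BITMAP_ISSET(42, &bm));
	__BITMAP_CLR(42, &bm);
	assert(!__BITMAP_ISSET(42, &bm));
}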
/* $NetBSD: kern_module_hook.c,v 1.4 2019/12/13 08:02:53 skrll Exp $ */ /*- * Copyright (c) 2019 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Kernel module support. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_module_hook.c,v 1.4 2019/12/13 08:02:53 skrll Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/condvar.h> #include <sys/module_hook.h> #include <sys/mutex.h> #include <sys/pserialize.h> #include <uvm/uvm_extern.h> /* Locking/synchronization stuff for module hooks */ static struct { kmutex_t mtx; kcondvar_t cv; pserialize_t psz; } module_hook __cacheline_aligned; /* * We use pserialize_perform() to issue a memory barrier on the current * CPU and on all other CPUs so that all prior memory operations on the * current CPU globally happen before all subsequent memory operations * on the current CPU, as perceived by any other CPU. * * pserialize_perform() might be rather heavy-weight here, but it only * happens during module loading, and it allows MODULE_HOOK_CALL() to * work without any other memory barriers. */ void module_hook_set(bool *hooked, struct localcount *lc) { KASSERT(kernconfig_is_held()); KASSERT(!*hooked); localcount_init(lc); /* Wait until setup has been witnessed by all CPUs. */ pserialize_perform(module_hook.psz); /* Let others use it */ atomic_store_relaxed(hooked, true); } void module_hook_unset(bool *hooked, struct localcount *lc) { KASSERT(kernconfig_is_held()); KASSERT(*hooked); /* Get exclusive with pserialize and localcount. */ mutex_enter(&module_hook.mtx); /* Prevent new calls to module_hook_tryenter(). */ atomic_store_relaxed(hooked, false); /* Wait for existing calls to module_hook_tryenter(). */ pserialize_perform(module_hook.psz); /* Wait for module_hook_exit.
*/ localcount_drain(lc, &module_hook.cv, &module_hook.mtx); /* All done! */ mutex_exit(&module_hook.mtx); localcount_fini(lc); } bool module_hook_tryenter(bool *hooked, struct localcount *lc) { bool call_hook; int s; s = pserialize_read_enter(); call_hook = atomic_load_relaxed(hooked); if (call_hook) localcount_acquire(lc); pserialize_read_exit(s); return call_hook; } void module_hook_exit(struct localcount *lc) { localcount_release(lc, &module_hook.cv, &module_hook.mtx); } void module_hook_init(void) { mutex_init(&module_hook.mtx, MUTEX_DEFAULT, IPL_NONE); cv_init(&module_hook.cv, "mod_hook"); module_hook.psz = pserialize_create(); }
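/*
 * Editor's note -- illustrative addition, not part of the original
 * kern_module_hook.c.  A sketch of the calling pattern the primitives
 * above are designed for.  The example_hook_* names are hypothetical;
 * real users normally go through the MODULE_HOOK()/MODULE_HOOK_CALL()
 * macros in <sys/module_hook.h> rather than open-coding this.
 */
#include <sys/module_hook.h>
#include <sys/localcount.h>

static bool example_hook_hooked;		/* true while the module is loaded */
static struct localcount example_hook_lc;	/* counts in-flight callers */
static int (*example_hook_fn)(int);		/* the hooked function */

/* Module side, called from MODULE_CMD_INIT (kernconfig lock held). */
static void
example_hook_install(int (*fn)(int))
{
	example_hook_fn = fn;
	module_hook_set(&example_hook_hooked, &example_hook_lc);
}

/* Module side, called from MODULE_CMD_FINI (kernconfig lock held). */
static void
example_hook_uninstall(void)
{
	module_hook_unset(&example_hook_hooked, &example_hook_lc);
	example_hook_fn = NULL;
}

/* Base-kernel call site: invoke the hook only if the module is present. */
static int
example_hook_call(int arg, int default_result)
{
	int rv = default_result;

	if (module_hook_tryenter(&example_hook_hooked, &example_hook_lc)) {
		rv = (*example_hook_fn)(arg);
		module_hook_exit(&example_hook_lc);
	}
	return rv;
}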
/* $NetBSD: l2cap_upper.c,v 1.19 2016/12/12 15:58:45 maya Exp $ */ /*- * Copyright (c) 2005 Iain Hibbert. * Copyright (c) 2006 Itronix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of Itronix Inc. may not be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC.
BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: l2cap_upper.c,v 1.19 2016/12/12 15:58:45 maya Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/proc.h> #include <sys/queue.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <netbt/bluetooth.h> #include <netbt/hci.h> #include <netbt/l2cap.h> /******************************************************************************* * * L2CAP Channel - Upper Protocol API */ /* * l2cap_attach_pcb(handle, btproto, upper) * * attach new l2cap_channel to handle, populate * with reasonable defaults */ int l2cap_attach_pcb(struct l2cap_channel **handle, const struct btproto *proto, void *upper) { struct l2cap_channel *chan; KASSERT(handle != NULL); KASSERT(proto != NULL); KASSERT(upper != NULL); chan = malloc(sizeof(struct l2cap_channel), M_BLUETOOTH, M_NOWAIT | M_ZERO); if (chan == NULL) return ENOMEM; chan->lc_proto = proto; chan->lc_upper = upper; chan->lc_state = L2CAP_CLOSED; chan->lc_lcid = L2CAP_NULL_CID; chan->lc_rcid = L2CAP_NULL_CID; chan->lc_laddr.bt_len = sizeof(struct sockaddr_bt); chan->lc_laddr.bt_family = AF_BLUETOOTH; chan->lc_laddr.bt_psm = L2CAP_PSM_ANY; chan->lc_raddr.bt_len = sizeof(struct sockaddr_bt); chan->lc_raddr.bt_family = AF_BLUETOOTH; chan->lc_raddr.bt_psm = L2CAP_PSM_ANY; chan->lc_imtu = L2CAP_MTU_DEFAULT; chan->lc_omtu = L2CAP_MTU_DEFAULT; chan->lc_flush = L2CAP_FLUSH_TIMO_DEFAULT; memcpy(&chan->lc_iqos, &l2cap_default_qos, sizeof(l2cap_qos_t)); memcpy(&chan->lc_oqos, &l2cap_default_qos, sizeof(l2cap_qos_t)); MBUFQ_INIT(&chan->lc_txq); *handle = chan; return 0; } /* * l2cap_bind_pcb(l2cap_channel, sockaddr) * * set local address of channel */ int l2cap_bind_pcb(struct l2cap_channel *chan, struct sockaddr_bt *addr) { if (chan->lc_lcid != L2CAP_NULL_CID) return EINVAL; memcpy(&chan->lc_laddr, addr, sizeof(struct sockaddr_bt)); return 0; } /* * l2cap_sockaddr_pcb(l2cap_channel, sockaddr) * * get local address of channel */ int l2cap_sockaddr_pcb(struct l2cap_channel *chan, struct sockaddr_bt *addr) { memcpy(addr, &chan->lc_laddr, sizeof(struct sockaddr_bt)); return 0; } /* * l2cap_connect_pcb(l2cap_channel, sockaddr) * * Initiate a connection to destination. 
This corresponds to * "Open Channel Request" in the L2CAP specification and will * result in one of the following: * * proto->connected(upper) * proto->disconnected(upper, error) * * and, optionally * proto->connecting(upper) */ int l2cap_connect_pcb(struct l2cap_channel *chan, struct sockaddr_bt *dest) { struct hci_unit *unit; int err; memcpy(&chan->lc_raddr, dest, sizeof(struct sockaddr_bt)); if (L2CAP_PSM_INVALID(chan->lc_raddr.bt_psm)) return EINVAL; if (bdaddr_any(&chan->lc_raddr.bt_bdaddr)) return EDESTADDRREQ; /* set local address if it needs setting */ if (bdaddr_any(&chan->lc_laddr.bt_bdaddr)) { err = hci_route_lookup(&chan->lc_laddr.bt_bdaddr, &chan->lc_raddr.bt_bdaddr); if (err) return err; } unit = hci_unit_lookup(&chan->lc_laddr.bt_bdaddr); if (unit == NULL) return EHOSTUNREACH; /* attach to active list */ err = l2cap_cid_alloc(chan); if (err) return err; /* open link to remote device */ chan->lc_link = hci_acl_open(unit, &chan->lc_raddr.bt_bdaddr); if (chan->lc_link == NULL) return EHOSTUNREACH; /* set the link mode */ err = l2cap_setmode(chan); if (err == EINPROGRESS) { chan->lc_state = L2CAP_WAIT_SEND_CONNECT_REQ; (*chan->lc_proto->connecting)(chan->lc_upper); return 0; } if (err) goto fail; /* * We can queue a connect request now even though the link may * not yet be open; Our mode setting is assured, and the queue * will be started automatically at the right time. */ chan->lc_state = L2CAP_WAIT_RECV_CONNECT_RSP; err = l2cap_send_connect_req(chan); if (err) goto fail; return 0; fail: chan->lc_state = L2CAP_CLOSED; hci_acl_close(chan->lc_link, err); chan->lc_link = NULL; return err; } /* * l2cap_peeraddr_pcb(l2cap_channel, sockaddr) * * get remote address of channel */ int l2cap_peeraddr_pcb(struct l2cap_channel *chan, struct sockaddr_bt *addr) { memcpy(addr, &chan->lc_raddr, sizeof(struct sockaddr_bt)); return 0; } /* * l2cap_disconnect_pcb(l2cap_channel, linger) * * Initiate L2CAP disconnection. This corresponds to * "Close Channel Request" in the L2CAP specification * and will result in a call to * * proto->disconnected(upper, error) * * when the disconnection is complete. If linger is set, * the call will not be made until data has flushed from * the queue. */ int l2cap_disconnect_pcb(struct l2cap_channel *chan, int linger) { int err = 0; if (chan->lc_state == L2CAP_CLOSED || chan->lc_state == L2CAP_WAIT_DISCONNECT) return EINVAL; chan->lc_flags |= L2CAP_SHUTDOWN; /* * no need to do anything unless the queue is empty or * we are not lingering.. */ if ((MBUFQ_FIRST(&chan->lc_txq) == NULL && chan->lc_pending == 0) || linger == 0) { chan->lc_state = L2CAP_WAIT_DISCONNECT; err = l2cap_send_disconnect_req(chan); if (err) l2cap_close(chan, err); } return err; } /* * l2cap_detach_pcb(handle) * * Detach l2cap channel from handle & close it down */ void l2cap_detach_pcb(struct l2cap_channel **handle) { struct l2cap_channel *chan; chan = *handle; *handle = NULL; if (chan->lc_state != L2CAP_CLOSED) l2cap_close(chan, 0); if (chan->lc_lcid != L2CAP_NULL_CID) { LIST_REMOVE(chan, lc_ncid); chan->lc_lcid = L2CAP_NULL_CID; } MBUFQ_DRAIN(&chan->lc_txq); /* * Could implement some kind of delayed expunge to make sure that the * CID is really dead before it becomes available for reuse? */ free(chan, M_BLUETOOTH); } /* * l2cap_listen_pcb(l2cap_channel) * * Use this channel as a listening post (until detached). This will * result in calls to: * * proto->newconn(upper, laddr, raddr) * * for incoming connections matching the psm and local address of * the channel. 
NULL address is permitted and matches any device. * If L2CAP_PSM_ANY is bound the next higher unused value from the * dynamic range (above 0x1001) will be selected. * * The upper layer should create and return a new channel. * * You cannot use this channel for anything else subsequent to this call */ int l2cap_listen_pcb(struct l2cap_channel *chan) { struct l2cap_channel *used, *prev = NULL; uint32_t psm; if (chan->lc_lcid != L2CAP_NULL_CID) return EINVAL; /* * This is simplistic but its not really worth spending a * lot of time looking for an unused PSM.. */ if (chan->lc_laddr.bt_psm == L2CAP_PSM_ANY) { psm = 0x1001; used = LIST_FIRST(&l2cap_listen_list); if (used != NULL && used->lc_laddr.bt_psm >= psm) { psm = used->lc_laddr.bt_psm + 0x0002; if ((psm & 0x0100) != 0) psm += 0x0100; if (psm > UINT16_MAX) return EADDRNOTAVAIL; } chan->lc_laddr.bt_psm = psm; } else if (L2CAP_PSM_INVALID(chan->lc_laddr.bt_psm)) return EINVAL; /* * This CID is irrelevant, as the channel is not stored on the active * list and the socket code does not allow operations on listening * sockets, but we set it so the detach code knows to LIST_REMOVE the * channel. */ chan->lc_lcid = L2CAP_SIGNAL_CID; /* * The list of listening channels is stored in an order such that new * listeners dont usurp current listeners, but that specific listening * takes precedence over promiscuous, and the connect request code can * easily use the first matching entry. */ LIST_FOREACH(used, &l2cap_listen_list, lc_ncid) { if (used->lc_laddr.bt_psm < chan->lc_laddr.bt_psm) break; if (used->lc_laddr.bt_psm == chan->lc_laddr.bt_psm && bdaddr_any(&used->lc_laddr.bt_bdaddr) && !bdaddr_any(&chan->lc_laddr.bt_bdaddr)) break; prev = used; } if (prev == NULL) LIST_INSERT_HEAD(&l2cap_listen_list, chan, lc_ncid); else LIST_INSERT_AFTER(prev, chan, lc_ncid); return 0; } /* * l2cap_send_pcb(l2cap_channel, mbuf) * * Output SDU on channel described by channel. This corresponds * to "Send Data Request" in the L2CAP specification. The upper * layer will be notified when SDU's have completed sending by a * call to: * * proto->complete(upper, n) * * (currently n == 1) * * Note: I'm not sure how this will work out, but I think that * if outgoing Retransmission Mode or Flow Control Mode is * negotiated then this call will not be made until the SDU has * been acknowledged by the peer L2CAP entity. For 'Best Effort' * it will be made when the packet has cleared the controller * buffers. * * We only support Basic mode so far, so encapsulate with a * B-Frame header and start sending if we are not already */ int l2cap_send_pcb(struct l2cap_channel *chan, struct mbuf *m) { l2cap_hdr_t *hdr; int plen; if (chan->lc_state == L2CAP_CLOSED) { m_freem(m); return ENOTCONN; } plen = m->m_pkthdr.len; DPRINTFN(5, "send %d bytes on CID #%d (pending = %d)\n", plen, chan->lc_lcid, chan->lc_pending); /* Encapsulate with B-Frame */ M_PREPEND(m, sizeof(l2cap_hdr_t), M_DONTWAIT); if (m == NULL) return ENOMEM; hdr = mtod(m, l2cap_hdr_t *); hdr->length = htole16(plen); hdr->dcid = htole16(chan->lc_rcid); /* Queue it on our list */ MBUFQ_ENQUEUE(&chan->lc_txq, m); /* If we are not sending, then start doing so */ if (chan->lc_pending == 0) return l2cap_start(chan); return 0; } /* * l2cap_setopt(l2cap_channel, sopt) * * Apply configuration options to channel. This corresponds to * "Configure Channel Request" in the L2CAP specification. * * for SO_L2CAP_LM, the settings will take effect when the * channel is established. 
If the channel is already open, * a call to * proto->linkmode(upper, new) * * will be made when the change is complete. */ int l2cap_setopt(struct l2cap_channel *chan, const struct sockopt *sopt) { int mode, err = 0; uint16_t mtu; switch (sopt->sopt_name) { case SO_L2CAP_IMTU: /* set Incoming MTU */ err = sockopt_get(sopt, &mtu, sizeof(mtu)); if (err) break; if (mtu < L2CAP_MTU_MINIMUM) err = EINVAL; else if (chan->lc_state == L2CAP_CLOSED) chan->lc_imtu = mtu; else err = EBUSY; break; case SO_L2CAP_LM: /* set link mode */ err = sockopt_getint(sopt, &mode); if (err) break; mode &= (L2CAP_LM_SECURE | L2CAP_LM_ENCRYPT | L2CAP_LM_AUTH); if (mode & L2CAP_LM_SECURE) mode |= L2CAP_LM_ENCRYPT; if (mode & L2CAP_LM_ENCRYPT) mode |= L2CAP_LM_AUTH; chan->lc_mode = mode; if (chan->lc_state == L2CAP_OPEN) err = l2cap_setmode(chan); break; case SO_L2CAP_OQOS: /* set Outgoing QoS flow spec */ case SO_L2CAP_FLUSH: /* set Outgoing Flush Timeout */ default: err = ENOPROTOOPT; break; } return err; } /* * l2cap_getopt(l2cap_channel, sopt) * * Return configuration parameters. */ int l2cap_getopt(struct l2cap_channel *chan, struct sockopt *sopt) { switch (sopt->sopt_name) { case SO_L2CAP_IMTU: /* get Incoming MTU */ return sockopt_set(sopt, &chan->lc_imtu, sizeof(uint16_t)); case SO_L2CAP_OMTU: /* get Outgoing MTU */ return sockopt_set(sopt, &chan->lc_omtu, sizeof(uint16_t)); case SO_L2CAP_IQOS: /* get Incoming QoS flow spec */ return sockopt_set(sopt, &chan->lc_iqos, sizeof(l2cap_qos_t)); case SO_L2CAP_OQOS: /* get Outgoing QoS flow spec */ return sockopt_set(sopt, &chan->lc_oqos, sizeof(l2cap_qos_t)); case SO_L2CAP_FLUSH: /* get Flush Timeout */ return sockopt_set(sopt, &chan->lc_flush, sizeof(uint16_t)); case SO_L2CAP_LM: /* get link mode */ return sockopt_setint(sopt, chan->lc_mode); default: break; } return ENOPROTOOPT; }
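/*
 * Editor's note -- illustrative addition, not part of the original
 * l2cap_upper.c.  The channel operations above back the AF_BLUETOOTH
 * L2CAP socket layer; the sketch below shows a minimal userland client,
 * assuming NetBSD's <bluetooth.h> and bt_aton(3) from libbluetooth
 * (link with -lbluetooth).  The peer address and PSM are caller-supplied
 * examples.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <bluetooth.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

static void
l2cap_client_example(const char *peer, uint16_t psm)
{
	struct sockaddr_bt sa;
	int s;

	s = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP);
	if (s == -1)
		err(1, "socket");

	memset(&sa, 0, sizeof(sa));
	sa.bt_len = sizeof(sa);
	sa.bt_family = AF_BLUETOOTH;
	sa.bt_psm = psm;		/* e.g. 0x0001 for SDP */
	if (!bt_aton(peer, &sa.bt_bdaddr))
		errx(1, "bad bdaddr: %s", peer);

	/* The socket layer ends up in l2cap_connect_pcb() above. */
	if (connect(s, (struct sockaddr *)&sa, sizeof(sa)) == -1)
		err(1, "connect");

	/* Each write() becomes one SDU queued via l2cap_send_pcb(). */
	if (write(s, "hello", 5) == -1)
		err(1, "write");

	close(s);
}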
/* $NetBSD: in6.c,v 1.292 2024/03/01 23:50:27 riastradh Exp $ */ /* $KAME: in6.c,v 1.198 2001/07/18 09:12:38 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in6.c,v 1.292 2024/03/01 23:50:27 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_compat_netbsd.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/ioctl.h> #include <sys/errno.h> #include <sys/malloc.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sockio.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/kauth.h> #include <sys/cprng.h> #include <sys/kmem.h> #include <net/if.h> #include <net/if_types.h> #include <net/if_llatbl.h> #include <net/if_ether.h> #include <net/if_dl.h> #include <net/pfil.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/nd6.h> #include <netinet6/mld6_var.h> #include <netinet6/ip6_mroute.h> #include <netinet6/in6_ifattach.h> #include <netinet6/scope6_var.h> #include <compat/netinet6/in6_var.h> #include <compat/netinet6/nd6.h> MALLOC_DEFINE(M_IP6OPT, "ip6_options", "IPv6 options"); /* enable backward compatibility code for obsoleted ioctls */ #define COMPAT_IN6IFIOCTL #ifdef IN6_DEBUG #define IN6_DPRINTF(__fmt, ...) printf(__fmt, __VA_ARGS__) #else #define IN6_DPRINTF(__fmt, ...) do { } while (/*CONSTCOND*/0) #endif /* IN6_DEBUG */ /* * Definitions of some constant IP6 addresses. 
*/ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = {sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0}; struct pslist_head in6_ifaddr_list; kmutex_t in6_ifaddr_lock; static int in6_lifaddr_ioctl(struct socket *, u_long, void *, struct ifnet *); static int in6_ifaddprefix(struct in6_ifaddr *); static int in6_ifremprefix(struct in6_ifaddr *); static int in6_ifinit(struct ifnet *, struct in6_ifaddr *, const struct sockaddr_in6 *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_update_ifa1(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr **, struct psref *, int); void in6_init(void) { PSLIST_INIT(&in6_ifaddr_list); mutex_init(&in6_ifaddr_lock, MUTEX_DEFAULT, IPL_NONE); in6_sysctl_multicast_setup(NULL); } /* * Add ownaddr as loopback rtentry. We previously add the route only if * necessary (ex. on a p2p link). However, since we now manage addresses * separately from prefixes, we should always add the route. We can't * rely on the cloning mechanism from the corresponding interface route * any more. */ void in6_ifaddlocal(struct ifaddr *ifa) { if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &in6addr_any) || (ifa->ifa_ifp->if_flags & IFF_POINTOPOINT && IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), IFA_DSTIN6(ifa)))) { rt_addrmsg(RTM_NEWADDR, ifa); return; } rt_ifa_addlocal(ifa); } /* * Remove loopback rtentry of ownaddr generated by in6_ifaddlocal(), * if it exists. */ void in6_ifremlocal(struct ifaddr *ifa) { struct in6_ifaddr *ia; struct ifaddr *alt_ifa = NULL; int ia_count = 0; struct psref psref; int s; /* * Some of BSD variants do not remove cloned routes * from an interface direct route, when removing the direct route * (see comments in net/net_osdep.h). Even for variants that do remove * cloned routes, they could fail to remove the cloned routes when * we handle multiple addresses that share a common prefix. * So, we should remove the route corresponding to the deleted address. */ /* * Delete the entry only if exactly one ifaddr matches the * address, ifa->ifa_addr. * * If more than one ifaddr matches, replace the ifaddr in * the routing table, rt_ifa, with a different ifaddr than * the one we are purging, ifa. It is important to do * this, or else the routing table can accumulate dangling * pointers rt->rt_ifa->ifa_ifp to destroyed interfaces, * which will lead to crashes, later. (More than one ifaddr * can match if we assign the same address to multiple---probably * p2p---interfaces.) * * XXX An old comment at this place said, "we should avoid * XXX such a configuration [i.e., interfaces with the same * XXX addressed assigned --ed.] in IPv6...". I do not * XXX agree, especially now that I have fixed the dangling * XXX ifp-pointers bug. 
*/ s = pserialize_read_enter(); IN6_ADDRLIST_READER_FOREACH(ia) { if (!IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &ia->ia_addr.sin6_addr)) continue; if (ia->ia_ifp != ifa->ifa_ifp) alt_ifa = &ia->ia_ifa; if (++ia_count > 1 && alt_ifa != NULL) break; } if (ia_count > 1 && alt_ifa != NULL) ifa_acquire(alt_ifa, &psref); pserialize_read_exit(s); if (ia_count == 0) return; rt_ifa_remlocal(ifa, ia_count == 1 ? NULL : alt_ifa); if (ia_count > 1 && alt_ifa != NULL) ifa_release(alt_ifa, &psref); } /* Add prefix route for the network. */ static int in6_ifaddprefix(struct in6_ifaddr *ia) { int error, flags = 0; if (in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) == 128) { if (ia->ia_dstaddr.sin6_family != AF_INET6) /* We don't need to install a host route. */ return 0; flags |= RTF_HOST; } /* Is this a connected route for neighbour discovery? */ if (nd6_need_cache(ia->ia_ifp)) flags |= RTF_CONNECTED; if ((error = rtinit(&ia->ia_ifa, RTM_ADD, RTF_UP | flags)) == 0) ia->ia_flags |= IFA_ROUTE; else if (error == EEXIST) /* Existence of the route is not an error. */ error = 0; return error; } static int in6_rt_ifa_matcher(struct rtentry *rt, void *v) { struct ifaddr *ifa = v; if (rt->rt_ifa == ifa) return 1; else return 0; } /* Delete network prefix route if present. * Re-add it to another address if the prefix matches. */ static int in6_ifremprefix(struct in6_ifaddr *target) { int error, s; struct in6_ifaddr *ia; if ((target->ia_flags & IFA_ROUTE) == 0) return 0; s = pserialize_read_enter(); IN6_ADDRLIST_READER_FOREACH(ia) { if (target->ia_dstaddr.sin6_len) { if (ia->ia_dstaddr.sin6_len == 0 || !IN6_ARE_ADDR_EQUAL(&ia->ia_dstaddr.sin6_addr, &target->ia_dstaddr.sin6_addr)) continue; } else { if (!IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &target->ia_addr.sin6_addr, &target->ia_prefixmask.sin6_addr)) continue; } /* * if we got a matching prefix route, move IFA_ROUTE to him */ if ((ia->ia_flags & IFA_ROUTE) == 0) { struct psref psref; int bound = curlwp_bind(); ia6_acquire(ia, &psref); pserialize_read_exit(s); rtinit(&target->ia_ifa, RTM_DELETE, 0); target->ia_flags &= ~IFA_ROUTE; error = in6_ifaddprefix(ia); if (!ISSET(target->ia_ifa.ifa_flags, IFA_DESTROYING)) goto skip; /* * Replace rt_ifa of routes that have the removing address * with the new address. */ rt_replace_ifa_matched_entries(AF_INET6, in6_rt_ifa_matcher, &target->ia_ifa, &ia->ia_ifa); skip: ia6_release(ia, &psref); curlwp_bindx(bound); return error; } } pserialize_read_exit(s); /* * noone seem to have prefix route. remove it. */ rtinit(&target->ia_ifa, RTM_DELETE, 0); target->ia_flags &= ~IFA_ROUTE; if (ISSET(target->ia_ifa.ifa_flags, IFA_DESTROYING)) { /* Remove routes that have the removing address as rt_ifa. */ rt_delete_matched_entries(AF_INET6, in6_rt_ifa_matcher, &target->ia_ifa, true); } return 0; } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < NBBY; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. 
*/ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return -1; for (p = p + 1; p < lim; p++) if (*p != 0) return -1; } return x * NBBY + y; } #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) static int in6_control1(struct socket *so, u_long cmd, void *data, struct ifnet *ifp) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int error, bound; struct psref psref; switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: /* Privileged. */ return in6_src_ioctl(cmd, data); /* * XXX: Fix me, once we fix SIOCSIFADDR, SIOCIFDSTADDR, etc. */ case SIOCSIFADDR: case SIOCSIFDSTADDR: case SIOCSIFBRDADDR: case SIOCSIFNETMASK: return EOPNOTSUPP; case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: return mrt6_ioctl(cmd, data); case SIOCGIFADDRPREF: case SIOCSIFADDRPREF: if (ifp == NULL) return EINVAL; return ifaddrpref_ioctl(so, cmd, data, ifp); } if (ifp == NULL) return EOPNOTSUPP; switch (cmd) { #ifdef OSIOCSIFINFO_IN6_90 case OSIOCSIFINFO_FLAGS_90: case OSIOCSIFINFO_IN6_90: case OSIOCSDEFIFACE_IN6: case OSIOCSNDFLUSH_IN6: case OSIOCSPFXFLUSH_IN6: case OSIOCSRTRFLUSH_IN6: #endif case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: /* Privileged. */ /* FALLTHROUGH */ #ifdef OSIOCGIFINFO_IN6 case OSIOCGIFINFO_IN6: #endif #ifdef OSIOCGIFINFO_IN6_90 case OSIOCGDRLST_IN6: case OSIOCGPRLST_IN6: case OSIOCGIFINFO_IN6_90: case OSIOCGDEFIFACE_IN6: #endif case SIOCGIFINFO_IN6: case SIOCGNBRINFO_IN6: return nd6_ioctl(cmd, data, ifp); } switch (cmd) { case SIOCALIFADDR: case SIOCDLIFADDR: /* Privileged. */ /* FALLTHROUGH */ case SIOCGLIFADDR: return in6_lifaddr_ioctl(so, cmd, data, ifp); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: #ifdef OSIOCAIFADDR_IN6 case OSIOCAIFADDR_IN6: #endif #ifdef OSIOCSIFPHYADDR_IN6 case OSIOCSIFPHYADDR_IN6: #endif case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCGIFALIFETIME_IN6: #ifdef OSIOCGIFALIFETIME_IN6 case OSIOCGIFALIFETIME_IN6: #endif case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; default: sa6 = NULL; break; } error = 0; bound = curlwp_bind(); if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) goto out; ia = in6ifa_ifpwithaddr_psref(ifp, &sa6->sin6_addr, &psref); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. 
*/ error = EINVAL; goto release; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } #ifdef OSIOCAIFADDR_IN6 /* FALLTHROUGH */ case OSIOCAIFADDR_IN6: #endif /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto release; } /* Privileged. */ break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: #ifdef OSIOCGIFALIFETIME_IN6 case OSIOCGIFALIFETIME_IN6: #endif /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; error = sa6_recoverscope(&ifr->ifr_addr); break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } /* * XXX: should we check if ifa_dstaddr is NULL and return * an error? */ ifr->ifr_dstaddr = ia->ia_dstaddr; error = sa6_recoverscope(&ifr->ifr_dstaddr); break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: if (ifp == NULL) { error = EINVAL; break; } memset(&ifr->ifr_ifru.ifru_stat, 0, sizeof(ifr->ifr_ifru.ifru_stat)); ifr->ifr_ifru.ifru_stat = *((struct in6_ifextra *)ifp->if_afdata[AF_INET6])->in6_ifstat; break; case SIOCGIFSTAT_ICMP6: if (ifp == NULL) { error = EINVAL; break; } memset(&ifr->ifr_ifru.ifru_icmp6stat, 0, sizeof(ifr->ifr_ifru.ifru_icmp6stat)); ifr->ifr_ifru.ifru_icmp6stat = *((struct in6_ifextra *)ifp->if_afdata[AF_INET6])->icmp6_ifstat; break; #ifdef OSIOCGIFALIFETIME_IN6 case OSIOCGIFALIFETIME_IN6: #endif case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = ((time_t)~0) & (time_t)~(1ULL << ((sizeof(maxexpire) * NBBY) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; retlt->ia6t_expire = retlt->ia6t_expire ? time_mono_to_wall(retlt->ia6t_expire) : 0; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = ((time_t)~0) & (time_t)~(1ULL << ((sizeof(maxexpire) * NBBY) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; retlt->ia6t_preferred = retlt->ia6t_preferred ? 
time_mono_to_wall(retlt->ia6t_preferred) : 0; } else retlt->ia6t_preferred = maxexpire; } #ifdef OSIOCFIFALIFETIME_IN6 if (cmd == OSIOCFIFALIFETIME_IN6) in6_addrlifetime_to_in6_addrlifetime50( &ifr->ifru.ifru_lifetime); #endif break; #ifdef OSIOCAIFADDR_IN6 case OSIOCAIFADDR_IN6: in6_aliasreq50_to_in6_aliasreq(ifra); #endif /*FALLTHROUGH*/ case SIOCAIFADDR_IN6: { struct in6_addrlifetime *lt; /* reject read-only flags */ if ((ifra->ifra_flags & IN6_IFF_DUPLICATED) != 0 || (ifra->ifra_flags & IN6_IFF_DETACHED) != 0 || (ifra->ifra_flags & IN6_IFF_TENTATIVE) != 0 || (ifra->ifra_flags & IN6_IFF_NODAD) != 0) { error = EINVAL; break; } /* * ia6t_expire and ia6t_preferred won't be used for now, * so just in case. */ lt = &ifra->ifra_lifetime; if (lt->ia6t_expire != 0) lt->ia6t_expire = time_wall_to_mono(lt->ia6t_expire); if (lt->ia6t_preferred != 0) lt->ia6t_preferred = time_wall_to_mono(lt->ia6t_preferred); /* * make (ia == NULL) or update (ia != NULL) the interface * address structure, and link it to the list. */ int s = splsoftnet(); error = in6_update_ifa1(ifp, ifra, &ia, &psref, 0); splx(s); /* * in6_update_ifa1 doesn't create the address if its * valid lifetime (vltime) is zero, since we would just * delete the address immediately in that case anyway. * So it may succeed but return null ia. In that case, * nothing left to do. */ if (error || ia == NULL) break; pfil_run_addrhooks(if_pfil, cmd, &ia->ia_ifa); break; } case SIOCDIFADDR_IN6: ia6_release(ia, &psref); ifaref(&ia->ia_ifa); in6_purgeaddr(&ia->ia_ifa); pfil_run_addrhooks(if_pfil, cmd, &ia->ia_ifa); ifafree(&ia->ia_ifa); ia = NULL; break; default: error = ENOTTY; } release: ia6_release(ia, &psref); out: curlwp_bindx(bound); return error; } int in6_control(struct socket *so, u_long cmd, void *data, struct ifnet *ifp) { int error, s; switch (cmd) { #ifdef OSIOCSIFINFO_IN6_90 case OSIOCSIFINFO_FLAGS_90: case OSIOCSIFINFO_IN6_90: case OSIOCSDEFIFACE_IN6: case OSIOCSNDFLUSH_IN6: case OSIOCSPFXFLUSH_IN6: case OSIOCSRTRFLUSH_IN6: #endif case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: case SIOCALIFADDR: case SIOCDLIFADDR: case SIOCDIFADDR_IN6: #ifdef OSIOCAIFADDR_IN6 case OSIOCAIFADDR_IN6: #endif case SIOCAIFADDR_IN6: case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (kauth_authorize_network(kauth_cred_get(), KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_SETPRIV, so, NULL, NULL)) return EPERM; break; } s = splsoftnet(); #ifndef NET_MPSAFE KASSERT(KERNEL_LOCKED_P()); #endif error = in6_control1(so , cmd, data, ifp); splx(s); return error; } static int in6_get_llsol_addr(struct in6_addr *llsol, struct ifnet *ifp, struct in6_addr *ip6) { int error; memset(llsol, 0, sizeof(struct in6_addr)); llsol->s6_addr16[0] = htons(0xff02); llsol->s6_addr32[1] = 0; llsol->s6_addr32[2] = htonl(1); llsol->s6_addr32[3] = ip6->s6_addr32[3]; llsol->s6_addr8[12] = 0xff; error = in6_setscope(llsol, ifp, NULL); if (error != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); } return error; } static int in6_join_mcastgroups(struct in6_aliasreq *ifra, struct in6_ifaddr *ia, struct ifnet *ifp, int flags) { int error; struct sockaddr_in6 mltaddr, mltmask; struct in6_multi_mship *imm; struct in6_addr llsol; struct rtentry *rt; int dad_delay; char ip6buf[INET6_ADDRSTRLEN]; /* join solicited multicast addr for new host id */ error = in6_get_llsol_addr(&llsol, ifp, &ifra->ifra_addr.sin6_addr); if (error != 0) goto out; dad_delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address * being 
configured. It also means delaying * transmission of the corresponding MLD report to * avoid report collision. * [draft-ietf-ipv6-rfc2462bis-02.txt] */ dad_delay = cprng_fast32() % (MAX_RTR_SOLICITATION_DELAY * hz); } #define MLTMASK_LEN 4 /* mltmask's masklen (=32bit=4octet) */ /* join solicited multicast addr for new host id */ imm = in6_joingroup(ifp, &llsol, &error, dad_delay); if (!imm) { nd6log(LOG_ERR, "addmulti failed for %s on %s (errno=%d)\n", IN6_PRINT(ip6buf, &llsol), if_name(ifp), error); goto out; } mutex_enter(&in6_ifaddr_lock); LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); mutex_exit(&in6_ifaddr_lock); sockaddr_in6_init(&mltmask, &in6mask32, 0, 0, 0); /* * join link-local all-nodes address */ sockaddr_in6_init(&mltaddr, &in6addr_linklocal_allnodes, 0, 0, 0); if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) != 0) goto out; /* XXX: should not fail */ /* * XXX: do we really need this automatic routes? * We should probably reconsider this stuff. Most applications * actually do not need the routes, since they usually specify * the outgoing interface. */ rt = rtalloc1(sin6tosa(&mltaddr), 0); if (rt) { if (memcmp(&mltaddr.sin6_addr, &satocsin6(rt_getkey(rt))->sin6_addr, MLTMASK_LEN)) { rt_unref(rt); rt = NULL; } else if (rt->rt_ifp != ifp) { IN6_DPRINTF("%s: rt_ifp %p -> %p (%s) " "network %04x:%04x::/32 = %04x:%04x::/32\n", __func__, rt->rt_ifp, ifp, ifp->if_xname, ntohs(mltaddr.sin6_addr.s6_addr16[0]), ntohs(mltaddr.sin6_addr.s6_addr16[1]), satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[0], satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[1]); #ifdef NET_MPSAFE error = rt_update_prepare(rt); if (error == 0) { rt_replace_ifa(rt, &ia->ia_ifa); rt->rt_ifp = ifp; rt_update_finish(rt); } else { /* * If error != 0, the rtentry is being * destroyed, so doing nothing doesn't * matter. */ } #else rt_replace_ifa(rt, &ia->ia_ifa); rt->rt_ifp = ifp; #endif } } if (!rt) { struct rt_addrinfo info; memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = sin6tosa(&mltaddr); info.rti_info[RTAX_GATEWAY] = sin6tosa(&ia->ia_addr); info.rti_info[RTAX_NETMASK] = sin6tosa(&mltmask); info.rti_info[RTAX_IFA] = sin6tosa(&ia->ia_addr); /* XXX: we need RTF_CONNECTED to fake nd6_rtrequest */ info.rti_flags = RTF_UP | RTF_CONNECTED; error = rtrequest1(RTM_ADD, &info, NULL); if (error) goto out; } else { rt_unref(rt); } imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0); if (!imm) { nd6log(LOG_WARNING, "addmulti failed for %s on %s (errno=%d)\n", IN6_PRINT(ip6buf, &mltaddr.sin6_addr), if_name(ifp), error); goto out; } mutex_enter(&in6_ifaddr_lock); LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); mutex_exit(&in6_ifaddr_lock); /* * join node information group address */ dad_delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec doesn't say anything about delay for this * group, but the same logic should apply. */ dad_delay = cprng_fast32() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr) != 0) ; else if ((imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, dad_delay)) == NULL) { /* XXX jinmei */ nd6log(LOG_WARNING, "addmulti failed for %s on %s (errno=%d)\n", IN6_PRINT(ip6buf, &mltaddr.sin6_addr), if_name(ifp), error); /* XXX not very fatal, go on... */ } else { mutex_enter(&in6_ifaddr_lock); LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); mutex_exit(&in6_ifaddr_lock); } /* * join interface-local all-nodes address. 
* (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr.sin6_addr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) != 0) goto out; /* XXX: should not fail */ /* XXX: again, do we really need the route? */ rt = rtalloc1(sin6tosa(&mltaddr), 0); if (rt) { /* 32bit came from "mltmask" */ if (memcmp(&mltaddr.sin6_addr, &satocsin6(rt_getkey(rt))->sin6_addr, 32 / NBBY)) { rt_unref(rt); rt = NULL; } else if (rt->rt_ifp != ifp) { IN6_DPRINTF("%s: rt_ifp %p -> %p (%s) " "network %04x:%04x::/32 = %04x:%04x::/32\n", __func__, rt->rt_ifp, ifp, ifp->if_xname, ntohs(mltaddr.sin6_addr.s6_addr16[0]), ntohs(mltaddr.sin6_addr.s6_addr16[1]), satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[0], satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[1]); #ifdef NET_MPSAFE error = rt_update_prepare(rt); if (error == 0) { rt_replace_ifa(rt, &ia->ia_ifa); rt->rt_ifp = ifp; rt_update_finish(rt); } else { /* * If error != 0, the rtentry is being * destroyed, so doing nothing doesn't * matter. */ } #else rt_replace_ifa(rt, &ia->ia_ifa); rt->rt_ifp = ifp; #endif } } if (!rt) { struct rt_addrinfo info; memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = sin6tosa(&mltaddr); info.rti_info[RTAX_GATEWAY] = sin6tosa(&ia->ia_addr); info.rti_info[RTAX_NETMASK] = sin6tosa(&mltmask); info.rti_info[RTAX_IFA] = sin6tosa(&ia->ia_addr); info.rti_flags = RTF_UP | RTF_CONNECTED; error = rtrequest1(RTM_ADD, &info, NULL); if (error) goto out; #undef MLTMASK_LEN } else { rt_unref(rt); } imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0); if (!imm) { nd6log(LOG_WARNING, "addmulti failed for %s on %s (errno=%d)\n", IN6_PRINT(ip6buf, &mltaddr.sin6_addr), if_name(ifp), error); goto out; } else { mutex_enter(&in6_ifaddr_lock); LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); mutex_exit(&in6_ifaddr_lock); } return 0; out: KASSERT(error != 0); return error; } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). * XXX: should this be performed under splsoftnet()? */ static int in6_update_ifa1(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr **iap, struct psref *psref, int flags) { int error = 0, hostIsNew = 0, plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; int dad_delay, was_tentative; struct in6_ifaddr *ia = iap ? *iap : NULL; char ip6buf[INET6_ADDRSTRLEN]; bool addrmaskNotChanged = false; bool send_rtm_newaddr = (ip6_param_rt_msg == 1); int saved_flags = 0; KASSERT((iap == NULL && psref == NULL) || (iap != NULL && psref != NULL)); /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return EINVAL; /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return EAFNOSUPPORT; /* * validate ifra_prefixmask. don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return EINVAL; /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. * We also require the prefix (when specified) mask is valid, and thus * reject a non-consecutive mask. 
*/ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return EINVAL; if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return EINVAL; } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return EINVAL; /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return EINVAL; } else /* user omit to specify the ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return EINVAL; /* XXX: should be impossible */ } /* * The destination address can be specified only for a p2p or a * loopback interface. If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { #ifdef FORCE_P2PPLEN int i; #endif if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log(LOG_INFO, "a destination can " "be specified for a p2p or a loopback IF only\n"); return EINVAL; } if (plen != 128) { nd6log(LOG_INFO, "prefixlen should " "be 128 when dstaddr is specified\n"); #ifdef FORCE_P2PPLEN /* * To be compatible with old configurations, * such as ifconfig gif0 inet6 2001::1 2001::2 * prefixlen 126, we override the specified * prefixmask as if the prefix length was 128. */ ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); for (i = 0; i < 4; i++) ifra->ifra_prefixmask.sin6_addr.s6_addr32[i] = 0xffffffff; plen = 128; #else return EINVAL; #endif } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return EINVAL; if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log(LOG_INFO, "valid lifetime is 0 for %s\n", IN6_PRINT(ip6buf, &ifra->ifra_addr.sin6_addr)); if (ia == NULL) return 0; /* there's nothing to do */ } #define sin6eq(a, b) \ ((a)->sin6_len == sizeof(struct sockaddr_in6) && \ (b)->sin6_len == sizeof(struct sockaddr_in6) && \ IN6_ARE_ADDR_EQUAL(&(a)->sin6_addr, &(b)->sin6_addr)) if (!send_rtm_newaddr) { if (ia != NULL && sin6eq(&ifra->ifra_addr, &ia->ia_addr) && sin6eq(&ifra->ifra_prefixmask, &ia->ia_prefixmask)) { addrmaskNotChanged = true; saved_flags = ia->ia6_flags; /* check it later */ } } #undef sin6eq /* * If this is a new address, allocate a new ifaddr and link it * into chains. */ if (ia == NULL) { hostIsNew = 1; /* * When in6_update_ifa() is called in a process of a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. 
*/ ia = malloc(sizeof(*ia), M_IFADDR, M_NOWAIT|M_ZERO); if (ia == NULL) return ENOBUFS; LIST_INIT(&ia->ia6_memberships); /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = sin6tosa(&ia->ia_addr); ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); ia->ia6_createtime = time_uptime; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * XXX: some functions expect that ifa_dstaddr is not * NULL for p2p interfaces. */ ia->ia_ifa.ifa_dstaddr = sin6tosa(&ia->ia_dstaddr); } else { ia->ia_ifa.ifa_dstaddr = NULL; } ia->ia_ifa.ifa_netmask = sin6tosa(&ia->ia_prefixmask); ia->ia_ifp = ifp; IN6_ADDRLIST_ENTRY_INIT(ia); ifa_psref_init(&ia->ia_ifa); } /* update timestamp */ ia->ia6_updatetime = time_uptime; /* set prefix mask */ if (ifra->ifra_prefixmask.sin6_len) { if (ia->ia_prefixmask.sin6_len) { if (!IN6_ARE_ADDR_EQUAL(&ia->ia_prefixmask.sin6_addr, &ifra->ifra_prefixmask.sin6_addr)) in6_ifremprefix(ia); } ia->ia_prefixmask = ifra->ifra_prefixmask; } /* Set destination address. */ if (dst6.sin6_family == AF_INET6) { if (!IN6_ARE_ADDR_EQUAL(&dst6.sin6_addr, &ia->ia_dstaddr.sin6_addr)) in6_ifremprefix(ia); ia->ia_dstaddr = dst6; } /* * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred * to see if the address is deprecated or invalidated, but initialize * these members for applications. */ ia->ia6_lifetime = ifra->ifra_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; /* * configure address flags. * We need to preserve tentative state so DAD works if * something adds the same address before DAD finishes. */ was_tentative = ia->ia6_flags & (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED); ia->ia6_flags = ifra->ifra_flags; /* * Make the address tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ if (ifp->if_link_state == LINK_STATE_DOWN) { ia->ia6_flags |= IN6_IFF_DETACHED; ia->ia6_flags &= ~IN6_IFF_TENTATIVE; } else if ((hostIsNew || was_tentative) && if_do_dad(ifp) && ip6_dad_enabled()) { ia->ia6_flags |= IN6_IFF_TENTATIVE; } /* * backward compatibility - if IN6_IFF_DEPRECATED is set from the * userland, make it deprecated. */ if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { ia->ia6_lifetime.ia6t_pltime = 0; ia->ia6_lifetime.ia6t_preferred = time_uptime; } if (!send_rtm_newaddr) { /* * We will not send RTM_NEWADDR if the only difference between * ia and ifra is preferred/valid lifetimes, because it is not * very useful for userland programs to be notified of that * changes. */ if (addrmaskNotChanged && ia->ia6_flags == saved_flags) return 0; } if (hostIsNew) { /* * We need a reference to ia before calling in6_ifinit. * Otherwise ia can be freed in in6_ifinit accidentally. */ ifaref(&ia->ia_ifa); } /* Must execute in6_ifinit and ifa_insert atomically */ mutex_enter(&in6_ifaddr_lock); /* reset the interface and routing table appropriately. */ error = in6_ifinit(ifp, ia, &ifra->ifra_addr, hostIsNew); if (error != 0) { if (hostIsNew) free(ia, M_IFADDR); mutex_exit(&in6_ifaddr_lock); return error; } /* * We are done if we have simply modified an existing address. 
*/ if (!hostIsNew) { mutex_exit(&in6_ifaddr_lock); return error; } /* * Insert ia to the global list and ifa to the interface's list. * A reference to it is already gained above. */ IN6_ADDRLIST_WRITER_INSERT_TAIL(ia); ifa_insert(ifp, &ia->ia_ifa); mutex_exit(&in6_ifaddr_lock); /* * Beyond this point, we should call in6_purgeaddr upon an error, * not just go to unlink. */ /* join necessary multicast groups */ if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_join_mcastgroups(ifra, ia, ifp, flags); if (error != 0) goto cleanup; } if (nd6_need_cache(ifp)) { /* XXX maybe unnecessary */ ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; ia->ia_ifa.ifa_flags |= RTF_CONNECTED; } /* * Perform DAD, if needed. * XXX It may be of use, if we can administratively * disable DAD. */ if (hostIsNew && if_do_dad(ifp) && ((ifra->ifra_flags & IN6_IFF_NODAD) == 0) && (ia->ia6_flags & IN6_IFF_TENTATIVE)) { int mindelay, maxdelay; dad_delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { struct in6_addr llsol; struct in6_multi *in6m_sol = NULL; /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). */ mindelay = 0; error = in6_get_llsol_addr(&llsol, ifp, &ifra->ifra_addr.sin6_addr); in6_multi_lock(RW_READER); if (error == 0) in6m_sol = in6_lookup_multi(&llsol, ifp); if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTPENDING) { mindelay = in6m_sol->in6m_timer; } in6_multi_unlock(); maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) dad_delay = 0; else { dad_delay = (cprng_fast32() % (maxdelay - mindelay)) + mindelay; } } /* +1 ensures callout is always used */ nd6_dad_start(&ia->ia_ifa, dad_delay + 1); } if (iap != NULL) { *iap = ia; if (hostIsNew) ia6_acquire(ia, psref); } return 0; cleanup: in6_purgeaddr(&ia->ia_ifa); return error; } int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { int rc, s; s = splsoftnet(); rc = in6_update_ifa1(ifp, ifra, NULL, NULL, flags); splx(s); return rc; } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; /* KASSERT(!ifa_held(ifa)); XXX need ifa_not_held (psref_not_held) */ KASSERT(IFNET_LOCKED(ifp)); ifa->ifa_flags |= IFA_DESTROYING; /* stop DAD processing */ nd6_dad_stop(ifa); /* Delete any network route. */ in6_ifremprefix(ia); /* Remove ownaddr's loopback rtentry, if it exists. */ in6_ifremlocal(&(ia->ia_ifa)); /* * leave from multicast groups we have joined for the interface */ again: mutex_enter(&in6_ifaddr_lock); while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { struct in6_multi *in6m __diagused = imm->i6mm_maddr; KASSERTMSG(in6m == NULL || in6m->in6m_ifp == ifp, "in6m_ifp=%s ifp=%s", in6m ? in6m->in6m_ifp->if_xname : NULL, ifp->if_xname); LIST_REMOVE(imm, i6mm_chain); mutex_exit(&in6_ifaddr_lock); in6_leavegroup(imm); goto again; } mutex_exit(&in6_ifaddr_lock); in6_unlink_ifa(ia, ifp); } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { int s = splsoftnet(); mutex_enter(&in6_ifaddr_lock); IN6_ADDRLIST_WRITER_REMOVE(ia); ifa_remove(ifp, &ia->ia_ifa); /* Assume ifa_remove called pserialize_perform and psref_destroy */ mutex_exit(&in6_ifaddr_lock); IN6_ADDRLIST_ENTRY_DESTROY(ia); /* * release another refcnt for the link from in6_ifaddr. 
* Note that we should decrement the refcnt at least once for all *BSD. */ ifafree(&ia->ia_ifa); splx(s); } void in6_purgeif(struct ifnet *ifp) { IFNET_LOCK(ifp); in6_ifdetach(ifp); IFNET_UNLOCK(ifp); } void in6_purge_mcast_references(struct in6_multi *in6m) { struct in6_ifaddr *ia; KASSERT(in6_multi_locked(RW_WRITER)); mutex_enter(&in6_ifaddr_lock); IN6_ADDRLIST_WRITER_FOREACH(ia) { struct in6_multi_mship *imm; LIST_FOREACH(imm, &ia->ia6_memberships, i6mm_chain) { if (imm->i6mm_maddr == in6m) imm->i6mm_maddr = NULL; } } mutex_exit(&in6_ifaddr_lock); } /* * SIOC[GAD]LIFADDR. * SIOCGLIFADDR: get first address. (?) * SIOCGLIFADDR with IFLR_PREFIX: * get first address that matches the specified prefix. * SIOCALIFADDR: add the specified address. * SIOCALIFADDR with IFLR_PREFIX: * add the specified prefix, filling hostid part from * the first link-local address. prefixlen must be <= 64. * SIOCDLIFADDR: delete the specified address. * SIOCDLIFADDR with IFLR_PREFIX: * delete the first address that matches the specified prefix. * return values: * EINVAL on invalid parameters * EADDRNOTAVAIL on prefix match failed/specified address not found * other values may be returned from in6_ioctl() * * NOTE: SIOCALIFADDR(with IFLR_PREFIX set) allows prefixlen less than 64. * this is to accommodate address naming scheme other than RFC2374, * in the future. * RFC2373 defines interface id to be 64bit, but it allows non-RFC2374 * address encoding scheme. (see figure on page 8) */ static int in6_lifaddr_ioctl(struct socket *so, u_long cmd, void *data, struct ifnet *ifp) { struct in6_ifaddr *ia = NULL; /* XXX gcc 4.8 maybe-uninitialized */ struct if_laddrreq *iflr = (struct if_laddrreq *)data; struct ifaddr *ifa; struct sockaddr *sa; /* sanity checks */ if (!data || !ifp) { panic("invalid argument to in6_lifaddr_ioctl"); /* NOTREACHED */ } switch (cmd) { case SIOCGLIFADDR: /* address must be specified on GET with IFLR_PREFIX */ if ((iflr->flags & IFLR_PREFIX) == 0) break; /* FALLTHROUGH */ case SIOCALIFADDR: case SIOCDLIFADDR: /* address must be specified on ADD and DELETE */ sa = (struct sockaddr *)&iflr->addr; if (sa->sa_family != AF_INET6) return EINVAL; if (sa->sa_len != sizeof(struct sockaddr_in6)) return EINVAL; /* XXX need improvement */ sa = (struct sockaddr *)&iflr->dstaddr; if (sa->sa_family && sa->sa_family != AF_INET6) return EINVAL; if (sa->sa_len && sa->sa_len != sizeof(struct sockaddr_in6)) return EINVAL; break; default: /* shouldn't happen */ #if 0 panic("invalid cmd to in6_lifaddr_ioctl"); /* NOTREACHED */ #else return EOPNOTSUPP; #endif } if (sizeof(struct in6_addr) * NBBY < iflr->prefixlen) return EINVAL; switch (cmd) { case SIOCALIFADDR: { struct in6_aliasreq ifra; struct in6_addr *xhostid = NULL; int prefixlen; int bound = curlwp_bind(); struct psref psref; if ((iflr->flags & IFLR_PREFIX) != 0) { struct sockaddr_in6 *sin6; /* * xhostid is to fill in the hostid part of the * address. xhostid points to the first link-local * address attached to the interface. */ ia = in6ifa_ifpforlinklocal_psref(ifp, 0, &psref); if (ia == NULL) { curlwp_bindx(bound); return EADDRNOTAVAIL; } xhostid = IFA_IN6(&ia->ia_ifa); /* prefixlen must be <= 64. */ if (64 < iflr->prefixlen) { ia6_release(ia, &psref); curlwp_bindx(bound); return EINVAL; } prefixlen = iflr->prefixlen; /* hostid part must be zero. 
*/ sin6 = (struct sockaddr_in6 *)&iflr->addr; if (sin6->sin6_addr.s6_addr32[2] != 0 || sin6->sin6_addr.s6_addr32[3] != 0) { ia6_release(ia, &psref); curlwp_bindx(bound); return EINVAL; } } else prefixlen = iflr->prefixlen; /* copy args to in6_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */ memset(&ifra, 0, sizeof(ifra)); memcpy(ifra.ifra_name, iflr->iflr_name, sizeof(ifra.ifra_name)); memcpy(&ifra.ifra_addr, &iflr->addr, ((struct sockaddr *)&iflr->addr)->sa_len); if (xhostid) { /* fill in hostid part */ ifra.ifra_addr.sin6_addr.s6_addr32[2] = xhostid->s6_addr32[2]; ifra.ifra_addr.sin6_addr.s6_addr32[3] = xhostid->s6_addr32[3]; } if (((struct sockaddr *)&iflr->dstaddr)->sa_family) { /* XXX */ memcpy(&ifra.ifra_dstaddr, &iflr->dstaddr, ((struct sockaddr *)&iflr->dstaddr)->sa_len); if (xhostid) { ifra.ifra_dstaddr.sin6_addr.s6_addr32[2] = xhostid->s6_addr32[2]; ifra.ifra_dstaddr.sin6_addr.s6_addr32[3] = xhostid->s6_addr32[3]; } } if (xhostid) { ia6_release(ia, &psref); ia = NULL; } curlwp_bindx(bound); ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); in6_prefixlen2mask(&ifra.ifra_prefixmask.sin6_addr, prefixlen); ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; ifra.ifra_flags = iflr->flags & ~IFLR_PREFIX; return in6_control(so, SIOCAIFADDR_IN6, &ifra, ifp); } case SIOCGLIFADDR: case SIOCDLIFADDR: { struct in6_addr mask, candidate, match; struct sockaddr_in6 *sin6; int cmp; int error, s; memset(&mask, 0, sizeof(mask)); if (iflr->flags & IFLR_PREFIX) { /* lookup a prefix rather than address. */ in6_prefixlen2mask(&mask, iflr->prefixlen); sin6 = (struct sockaddr_in6 *)&iflr->addr; memcpy(&match, &sin6->sin6_addr, sizeof(match)); match.s6_addr32[0] &= mask.s6_addr32[0]; match.s6_addr32[1] &= mask.s6_addr32[1]; match.s6_addr32[2] &= mask.s6_addr32[2]; match.s6_addr32[3] &= mask.s6_addr32[3]; /* if you set extra bits, that's wrong */ if (memcmp(&match, &sin6->sin6_addr, sizeof(match))) return EINVAL; cmp = 1; } else { if (cmd == SIOCGLIFADDR) { /* on getting an address, take the 1st match */ cmp = 0; /* XXX */ } else { /* on deleting an address, do exact match */ in6_prefixlen2mask(&mask, 128); sin6 = (struct sockaddr_in6 *)&iflr->addr; memcpy(&match, &sin6->sin6_addr, sizeof(match)); cmp = 1; } } s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (!cmp) break; /* * XXX: this is adhoc, but is necessary to allow * a user to specify fe80::/64 (not /10) for a * link-local address. 
*/ memcpy(&candidate, IFA_IN6(ifa), sizeof(candidate)); in6_clearscope(&candidate); candidate.s6_addr32[0] &= mask.s6_addr32[0]; candidate.s6_addr32[1] &= mask.s6_addr32[1]; candidate.s6_addr32[2] &= mask.s6_addr32[2]; candidate.s6_addr32[3] &= mask.s6_addr32[3]; if (IN6_ARE_ADDR_EQUAL(&candidate, &match)) break; } if (!ifa) { error = EADDRNOTAVAIL; goto error; } ia = ifa2ia6(ifa); if (cmd == SIOCGLIFADDR) { /* fill in the if_laddrreq structure */ memcpy(&iflr->addr, &ia->ia_addr, ia->ia_addr.sin6_len); error = sa6_recoverscope( (struct sockaddr_in6 *)&iflr->addr); if (error != 0) goto error; if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { memcpy(&iflr->dstaddr, &ia->ia_dstaddr, ia->ia_dstaddr.sin6_len); error = sa6_recoverscope( (struct sockaddr_in6 *)&iflr->dstaddr); if (error != 0) goto error; } else memset(&iflr->dstaddr, 0, sizeof(iflr->dstaddr)); iflr->prefixlen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); iflr->flags = ia->ia6_flags; /* XXX */ error = 0; } else { struct in6_aliasreq ifra; /* fill in6_aliasreq and do ioctl(SIOCDIFADDR_IN6) */ memset(&ifra, 0, sizeof(ifra)); memcpy(ifra.ifra_name, iflr->iflr_name, sizeof(ifra.ifra_name)); memcpy(&ifra.ifra_addr, &ia->ia_addr, ia->ia_addr.sin6_len); if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { memcpy(&ifra.ifra_dstaddr, &ia->ia_dstaddr, ia->ia_dstaddr.sin6_len); } else { memset(&ifra.ifra_dstaddr, 0, sizeof(ifra.ifra_dstaddr)); } memcpy(&ifra.ifra_dstaddr, &ia->ia_prefixmask, ia->ia_prefixmask.sin6_len); ifra.ifra_flags = ia->ia6_flags; pserialize_read_exit(s); return in6_control(so, SIOCDIFADDR_IN6, &ifra, ifp); } error: pserialize_read_exit(s); return error; } } return EOPNOTSUPP; /* just for safety */ } /* * Initialize an interface's internet6 address * and routing table entry. */ static int in6_ifinit(struct ifnet *ifp, struct in6_ifaddr *ia, const struct sockaddr_in6 *sin6, int newhost) { int error = 0, ifacount = 0; int s; struct ifaddr *ifa; KASSERT(mutex_owned(&in6_ifaddr_lock)); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } pserialize_read_exit(s); ia->ia_addr = *sin6; if (ifacount == 0 && (error = if_addr_init(ifp, &ia->ia_ifa, true)) != 0) { return error; } ia->ia_ifa.ifa_metric = ifp->if_metric; /* we could do in(6)_socktrim here, but just omit it at this moment. */ /* Add ownaddr as loopback rtentry, if necessary (ex. on p2p link). */ if (newhost) { /* set the rtrequest function to create llinfo */ if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_ifa.ifa_rtrequest = p2p_rtrequest; else if ((ifp->if_flags & IFF_LOOPBACK) == 0) ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; in6_ifaddlocal(&ia->ia_ifa); } else { /* Inform the routing socket of new flags/timings */ rt_addrmsg(RTM_NEWADDR, &ia->ia_ifa); } /* Add the network prefix route. */ if ((error = in6_ifaddprefix(ia)) != 0) { if (newhost) in6_ifremlocal(&ia->ia_ifa); return error; } return error; } static struct ifaddr * bestifa(struct ifaddr *best_ifa, struct ifaddr *ifa) { if (best_ifa == NULL || best_ifa->ifa_preference < ifa->ifa_preference) return ifa; return best_ifa; } /* * Find an IPv6 interface link-local address specific to an interface. 
*/ struct in6_ifaddr * in6ifa_ifpforlinklocal(const struct ifnet *ifp, const int ignoreflags) { struct ifaddr *best_ifa = NULL, *ifa; IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (!IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) continue; if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; best_ifa = bestifa(best_ifa, ifa); } return (struct in6_ifaddr *)best_ifa; } struct in6_ifaddr * in6ifa_ifpforlinklocal_psref(const struct ifnet *ifp, const int ignoreflags, struct psref *psref) { struct in6_ifaddr *ia; int s = pserialize_read_enter(); ia = in6ifa_ifpforlinklocal(ifp, ignoreflags); if (ia != NULL) ia6_acquire(ia, psref); pserialize_read_exit(s); return ia; } /* * find the internet address corresponding to a given address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) { struct in6_ifaddr *ia; int s; s = pserialize_read_enter(); IN6_ADDRLIST_READER_FOREACH(ia) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; ifaref(&ia->ia_ifa); break; } } pserialize_read_exit(s); return ia; } /* * find the internet address corresponding to a given interface and address. */ struct in6_ifaddr * in6ifa_ifpwithaddr(const struct ifnet *ifp, const struct in6_addr *addr) { struct ifaddr *best_ifa = NULL, *ifa; IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (!IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) continue; best_ifa = bestifa(best_ifa, ifa); } return (struct in6_ifaddr *)best_ifa; } struct in6_ifaddr * in6ifa_ifpwithaddr_psref(const struct ifnet *ifp, const struct in6_addr *addr, struct psref *psref) { struct in6_ifaddr *ia; int s = pserialize_read_enter(); ia = in6ifa_ifpwithaddr(ifp, addr); if (ia != NULL) ia6_acquire(ia, psref); pserialize_read_exit(s); return ia; } static struct in6_ifaddr * bestia(struct in6_ifaddr *best_ia, struct in6_ifaddr *ia) { if (best_ia == NULL || best_ia->ia_ifa.ifa_preference < ia->ia_ifa.ifa_preference) return ia; return best_ia; } /* * Determine if an address is on a local network. */ int in6_localaddr(const struct in6_addr *in6) { struct in6_ifaddr *ia; int s; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; s = pserialize_read_enter(); IN6_ADDRLIST_READER_FOREACH(ia) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { pserialize_read_exit(s); return 1; } } pserialize_read_exit(s); return 0; } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct in6_ifaddr *ia; int s; s = pserialize_read_enter(); IN6_ADDRLIST_READER_FOREACH(ia) { if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &sa6->sin6_addr) && #ifdef SCOPEDROUTING ia->ia_addr.sin6_scope_id == sa6->sin6_scope_id && #endif (ia->ia6_flags & IN6_IFF_DEPRECATED) != 0) { pserialize_read_exit(s); return 1; /* true */ } /* XXX: do we still have to go thru the rest of the list? */ } pserialize_read_exit(s); return 0; /* false */ } /* * return length of part which dst and src are equal * hard coding... 
*/ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += NBBY; return match; } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { static const u_char maskarray[NBBY] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (len < 0 || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } memset(maskp, 0, sizeof(*maskp)); bytelen = len / NBBY; bitlen = len % NBBY; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *best_ia = NULL, *ia; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (ia->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (ia->ia6_flags & IN6_IFF_DETACHED) continue; if (ia->ia6_flags & IN6_IFF_DEPRECATED) { if (ip6_use_deprecated) dep[0] = ia; continue; } if (dst_scope != in6_addrscope(IFA_IN6(ifa))) continue; /* * call in6_matchlen() as few as possible */ if (best_ia == NULL) { best_ia = ia; continue; } if (blen == -1) blen = in6_matchlen(&best_ia->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; best_ia = ia; } else if (tlen == blen) best_ia = bestia(best_ia, ia); } if (best_ia != NULL) return best_ia; IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (ia->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (ia->ia6_flags & IN6_IFF_DETACHED) continue; if (ia->ia6_flags & IN6_IFF_DEPRECATED) { if (ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } best_ia = bestia(best_ia, ia); } if (best_ia != NULL) return best_ia; /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) return dep[0]; if (dep[1]) return dep[1]; return NULL; } /* * perform DAD when interface becomes IFF_UP. 
*/ void in6_if_link_up(struct ifnet *ifp) { struct ifaddr *ifa; struct in6_ifaddr *ia; int s, bound; char ip6buf[INET6_ADDRSTRLEN]; /* Ensure it's sane to run DAD */ if (ifp->if_link_state == LINK_STATE_DOWN) return; if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) return; bound = curlwp_bind(); s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { struct psref psref; if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa_acquire(ifa, &psref); pserialize_read_exit(s); ia = (struct in6_ifaddr *)ifa; /* If detached then mark as tentative */ if (ia->ia6_flags & IN6_IFF_DETACHED) { ia->ia6_flags &= ~IN6_IFF_DETACHED; if (ip6_dad_enabled() && if_do_dad(ifp)) { ia->ia6_flags |= IN6_IFF_TENTATIVE; nd6log(LOG_ERR, "%s marked tentative\n", IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr)); } else if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) rt_addrmsg(RTM_NEWADDR, ifa); } if (ia->ia6_flags & IN6_IFF_TENTATIVE) { int rand_delay; /* Clear the duplicated flag as we're starting DAD. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ rand_delay = cprng_fast32() % (MAX_RTR_SOLICITATION_DELAY * hz); /* +1 ensures callout is always used */ nd6_dad_start(ifa, rand_delay + 1); } s = pserialize_read_enter(); ifa_release(ifa, &psref); } pserialize_read_exit(s); curlwp_bindx(bound); } void in6_if_up(struct ifnet *ifp) { /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); /* interface may not support link state, so bring it up also */ in6_if_link_up(ifp); } /* * Mark all addresses as detached. */ void in6_if_link_down(struct ifnet *ifp) { struct ifaddr *ifa; struct in6_ifaddr *ia; int s, bound; char ip6buf[INET6_ADDRSTRLEN]; bound = curlwp_bind(); s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { struct psref psref; if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa_acquire(ifa, &psref); pserialize_read_exit(s); ia = (struct in6_ifaddr *)ifa; /* Stop DAD processing */ nd6_dad_stop(ifa); /* * Mark the address as detached. * This satisfies RFC4862 Section 5.3, but we should apply * this logic to all addresses to be a good citizen and * avoid potential duplicated addresses. * When the interface comes up again, detached addresses * are marked tentative and DAD commences. */ if (!(ia->ia6_flags & IN6_IFF_DETACHED)) { nd6log(LOG_DEBUG, "%s marked detached\n", IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr)); ia->ia6_flags |= IN6_IFF_DETACHED; ia->ia6_flags &= ~(IN6_IFF_TENTATIVE | IN6_IFF_DUPLICATED); rt_addrmsg(RTM_NEWADDR, ifa); } s = pserialize_read_enter(); ifa_release(ifa, &psref); } pserialize_read_exit(s); curlwp_bindx(bound); /* Clear ND6_IFF_IFDISABLED to allow DAD again on link-up. */ if (ifp->if_afdata[AF_INET6] != NULL) ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED; } void in6_if_down(struct ifnet *ifp) { in6_if_link_down(ifp); lltable_purge_entries(LLTABLE6(ifp)); } void in6_if_link_state_change(struct ifnet *ifp, int link_state) { /* * Treat LINK_STATE_UNKNOWN as UP. * LINK_STATE_UNKNOWN transitions to LINK_STATE_DOWN when * if_link_state_change() transitions to LINK_STATE_UP. 
*/ if (link_state == LINK_STATE_DOWN) in6_if_link_down(ifp); else in6_if_link_up(ifp); } int in6_tunnel_validate(const struct ip6_hdr *ip6, const struct in6_addr *src, const struct in6_addr *dst) { /* check for address match */ if (!IN6_ARE_ADDR_EQUAL(src, &ip6->ip6_dst) || !IN6_ARE_ADDR_EQUAL(dst, &ip6->ip6_src)) return 0; /* martian filters on outer source - done in ip6_input */ /* NOTE: the packet may be dropped by uRPF. */ /* return valid bytes length */ return sizeof(*src) + sizeof(*dst); } #define IN6_LLTBL_DEFAULT_HSIZE 32 #define IN6_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in6_lltable_destroy_lle(struct llentry *lle) { KASSERTMSG(lle->la_numheld == 0, "la_numheld=%d", lle->la_numheld); LLE_WUNLOCK(lle); LLE_LOCK_DESTROY(lle); llentry_pool_put(lle); } static struct llentry * in6_lltable_new(const struct in6_addr *addr6, u_int flags) { struct llentry *lle; lle = llentry_pool_get(PR_NOWAIT); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->r_l3addr.addr6 = *addr6; lle->lle_refcnt = 1; lle->lle_free = in6_lltable_destroy_lle; LLE_LOCK_INIT(lle); callout_init(&lle->lle_timer, CALLOUT_MPSAFE); return lle; } static int in6_lltable_match_prefix(const struct sockaddr *prefix, const struct sockaddr *mask, u_int flags, struct llentry *lle) { const struct sockaddr_in6 *pfx = (const struct sockaddr_in6 *)prefix; const struct sockaddr_in6 *msk = (const struct sockaddr_in6 *)mask; if (IN6_ARE_MASKED_ADDR_EQUAL(&lle->r_l3addr.addr6, &pfx->sin6_addr, &msk->sin6_addr) && ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC))) return 1; return 0; } static void in6_lltable_free_entry(struct lltable *llt, struct llentry *lle) { LLE_WLOCK_ASSERT(lle); (void) llentry_free(lle); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr, const struct rtentry *rt) { char ip6buf[INET6_ADDRSTRLEN]; if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) || rt->rt_ifp != ifp) { int s; struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. 
*/ /* XXX ifaof_ifpforaddr should take a const param */ s = pserialize_read_enter(); ifa = ifaof_ifpforaddr(l3addr, ifp); if (ifa != NULL) { pserialize_read_exit(s); return 0; } pserialize_read_exit(s); log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", IN6_PRINT(ip6buf, &((const struct sockaddr_in6 *)l3addr)->sin6_addr)); return EINVAL; } return 0; } static inline uint32_t in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize) { return IN6_LLTBL_HASH(dst->s6_addr32[3], hsize); } static uint32_t in6_lltable_hash(const struct llentry *lle, uint32_t hsize) { return in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize); } static void in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = lle->r_l3addr.addr6; } static inline struct llentry * in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst)) break; } return lle; } static int in6_lltable_delete(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct llentry *lle; IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); KASSERTMSG(l3addr->sa_family == AF_INET6, "sin_family %d", l3addr->sa_family); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle == NULL) { #ifdef LLTABLE_DEBUG char buf[64]; sockaddr_format(l3addr, buf, sizeof(buf)); log(LOG_INFO, "%s: cache for %s is not found\n", __func__, buf); #endif return ENOENT; } LLE_WLOCK(lle); #ifdef LLTABLE_DEBUG { char buf[64]; sockaddr_format(l3addr, buf, sizeof(buf)); log(LOG_INFO, "%s: cache for %s (%p) is deleted\n", __func__, buf, lle); } #endif llentry_free(lle); return 0; } static struct llentry * in6_lltable_create(struct lltable *llt, u_int flags, const struct sockaddr *l3addr, const struct rtentry *rt) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; IF_AFDATA_WLOCK_ASSERT(ifp); KASSERTMSG(l3addr->sa_family == AF_INET6, "sin_family %d", l3addr->sa_family); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle != NULL) { LLE_WLOCK(lle); return lle; } /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. 
*/ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr, rt) != 0) return NULL; lle = in6_lltable_new(&sin6->sin6_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return NULL; } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { memcpy(&lle->ll_addr, CLLADDR(ifp->if_sadl), ifp->if_addrlen); lle->la_flags |= LLE_VALID; } lltable_link_entry(llt, lle); LLE_WLOCK(lle); return lle; } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERTMSG(l3addr->sa_family == AF_INET6, "sin_family %d", l3addr->sa_family); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle == NULL) return NULL; if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return lle; } static int in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct rt_walkarg *w) { struct sockaddr_in6 sin6; LLTABLE_LOCK_ASSERT(); /* skip deleted entries */ if (lle->la_flags & LLE_DELETED) return 0; sockaddr_in6_init(&sin6, &lle->r_l3addr.addr6, 0, 0, 0); return lltable_dump_entry(llt, lle, w, sin6tosa(&sin6)); } static struct lltable * in6_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET6; llt->llt_ifp = ifp; llt->llt_lookup = in6_lltable_lookup; llt->llt_create = in6_lltable_create; llt->llt_delete = in6_lltable_delete; llt->llt_dump_entry = in6_lltable_dump_entry; llt->llt_hash = in6_lltable_hash; llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry; llt->llt_free_entry = in6_lltable_free_entry; llt->llt_match_prefix = in6_lltable_match_prefix; lltable_link(llt); return llt; } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; ext = malloc(sizeof(*ext), M_IFADDR, M_WAITOK|M_ZERO); ext->in6_ifstat = malloc(sizeof(struct in6_ifstat), M_IFADDR, M_WAITOK|M_ZERO); ext->icmp6_ifstat = malloc(sizeof(struct icmp6_ifstat), M_IFADDR, M_WAITOK|M_ZERO); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = in6_lltattach(ifp); return ext; } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; lltable_free(ext->lltable); ext->lltable = NULL; SOFTNET_LOCK_UNLESS_NET_MPSAFE(); nd6_ifdetach(ifp, ext); SOFTNET_UNLOCK_UNLESS_NET_MPSAFE(); free(ext->in6_ifstat, M_IFADDR); free(ext->icmp6_ifstat, M_IFADDR); scope6_ifdetach(ext->scope6_id); free(ext, M_IFADDR); } /* * Convert IPv4 address stored in struct in_addr to IPv4-Mapped IPv6 address * stored in struct in6_addr as defined in RFC 4921 section 2.5.5.2. */ void in6_in_2_v4mapin6(const struct in_addr *in, struct in6_addr *in6) { in6->s6_addr32[0] = 0; in6->s6_addr32[1] = 0; in6->s6_addr32[2] = IPV6_ADDR_INT32_SMP; in6->s6_addr32[3] = in->s_addr; } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { memset(sin, 0, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. 
*/ void in6_sin_2_v4mapsin6(const struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { memset(sin6, 0, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; in6_in_2_v4mapin6(&sin->sin_addr, &sin6->sin6_addr); } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof(*sin6_p), M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = sin6tosa(sin6_p); }
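/*
 * The conversion helpers above build and unpack IPv4-mapped IPv6 addresses
 * (::ffff:a.b.c.d, RFC 4291 section 2.5.5.2).  The following is a small,
 * userland-only sketch of the same mapping; it is not kernel code, and the
 * helper name v4_to_mapped() is a made-up stand-in for in6_in_2_v4mapin6().
 * The byte-wise stores stand in for the kernel's s6_addr32 word stores.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>

/* Build ::ffff:a.b.c.d from an IPv4 address, as in6_in_2_v4mapin6() does. */
static void
v4_to_mapped(const struct in_addr *in, struct in6_addr *in6)
{
	memset(in6, 0, sizeof(*in6));
	in6->s6_addr[10] = 0xff;			/* bits 80..95: all ones */
	in6->s6_addr[11] = 0xff;
	memcpy(&in6->s6_addr[12], &in->s_addr, 4);	/* low 32 bits: v4 addr */
}

int
main(void)
{
	struct in_addr in;
	struct in6_addr in6;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.1", &in);
	v4_to_mapped(&in, &in6);
	/* Prints "::ffff:192.0.2.1"; IN6_IS_ADDR_V4MAPPED() holds for it. */
	printf("%s mapped=%d\n", inet_ntop(AF_INET6, &in6, buf, sizeof(buf)),
	    IN6_IS_ADDR_V4MAPPED(&in6));
	return 0;
}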
/* $NetBSD: vfs_cwd.c,v 1.11 2023/09/23 18:21:11 ad Exp $ */ /*- * Copyright (c) 2008, 2020, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Current working directory. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_cwd.c,v 1.11 2023/09/23 18:21:11 ad Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/filedesc.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/kmem.h> /* * Create an initial cwdinfo structure, using the same current and root * directories as curproc. */ struct cwdinfo * cwdinit(void) { struct cwdinfo *cwdi; struct cwdinfo *copy; cwdi = kmem_alloc(sizeof(*cwdi), KM_SLEEP); KASSERT(ALIGNED_POINTER(cwdi, COHERENCY_UNIT)); rw_init(&cwdi->cwdi_lock); copy = curproc->p_cwdi; rw_enter(&copy->cwdi_lock, RW_READER); cwdi->cwdi_cdir = copy->cwdi_cdir; if (cwdi->cwdi_cdir) vref(cwdi->cwdi_cdir); cwdi->cwdi_rdir = copy->cwdi_rdir; if (cwdi->cwdi_rdir) vref(cwdi->cwdi_rdir); cwdi->cwdi_edir = copy->cwdi_edir; if (cwdi->cwdi_edir) vref(cwdi->cwdi_edir); rw_exit(&copy->cwdi_lock); cwdi->cwdi_cmask = copy->cwdi_cmask; cwdi->cwdi_refcnt = 1; return (cwdi); } /* * Make p2 share p1's cwdinfo. */ void cwdshare(struct proc *p2) { struct cwdinfo *cwdi; cwdi = curproc->p_cwdi; atomic_inc_uint(&cwdi->cwdi_refcnt); p2->p_cwdi = cwdi; } /* * Make sure proc has only one reference to its cwdi, creating * a new one if necessary. */ void cwdunshare(struct proc *p) { struct cwdinfo *cwdi = p->p_cwdi; if (cwdi->cwdi_refcnt > 1) { cwdi = cwdinit(); cwdfree(p->p_cwdi); p->p_cwdi = cwdi; } } /* * Release a cwdinfo structure.
*/ void cwdfree(struct cwdinfo *cwdi) { membar_release(); if (atomic_dec_uint_nv(&cwdi->cwdi_refcnt) > 0) return; membar_acquire(); vrele(cwdi->cwdi_cdir); rw_destroy(&cwdi->cwdi_lock); if (cwdi->cwdi_rdir) vrele(cwdi->cwdi_rdir); if (cwdi->cwdi_edir) vrele(cwdi->cwdi_edir); kmem_free(cwdi, sizeof(*cwdi)); } void cwdexec(struct proc *p) { cwdunshare(p); if (p->p_cwdi->cwdi_edir) { vrele(p->p_cwdi->cwdi_edir); } }
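/*
 * The cwdinit()/cwdshare()/cwdunshare()/cwdfree() routines above implement a
 * copy-on-unshare reference count: sharers bump the count, and a process that
 * needs a private copy makes one only when other holders remain.  Below is a
 * minimal userland sketch of that pattern using C11 atomics; the names
 * shared_t, share(), unshare() and release() are hypothetical stand-ins, not
 * kernel interfaces, and seq_cst atomics take the place of the kernel's
 * explicit membar_release()/membar_acquire() pairing around the decrement.
 */
#include <stdatomic.h>
#include <stdlib.h>

typedef struct shared {
	atomic_uint refcnt;
	int payload;			/* stands in for cdir/rdir/edir */
} shared_t;

static shared_t *
shared_new(int payload)
{
	shared_t *s = malloc(sizeof(*s));
	atomic_init(&s->refcnt, 1);
	s->payload = payload;
	return s;
}

/* Like cwdshare(): another holder, so just bump the count. */
static shared_t *
share(shared_t *s)
{
	atomic_fetch_add(&s->refcnt, 1);
	return s;
}

/* Like cwdfree(): drop one reference, free on the last. */
static void
release(shared_t *s)
{
	if (atomic_fetch_sub(&s->refcnt, 1) == 1)
		free(s);
}

/* Like cwdunshare(): make a private copy only if someone else holds it. */
static shared_t *
unshare(shared_t *s)
{
	if (atomic_load(&s->refcnt) > 1) {
		shared_t *copy = shared_new(s->payload);
		release(s);
		return copy;
	}
	return s;
}

int
main(void)
{
	shared_t *a = shared_new(42);
	shared_t *b = share(a);		/* two holders, as after fork() */
	b = unshare(b);			/* b now points at a private copy */
	release(a);
	release(b);
	return 0;
}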
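/*
 * Before the source-selection code that follows, a brief userland sketch of
 * the two prefix helpers defined in in6.c above: in6_matchlen(), which counts
 * how many leading bits two addresses share (reused by "Rule 14" of the
 * source selection below), and in6_prefixlen2mask(), which expands a prefix
 * length into a netmask.  matchlen() and prefixlen2mask() here are
 * illustrative stand-ins built on the same XOR/shift logic, not the kernel
 * functions themselves.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>

/* Number of leading bits src and dst have in common (0..128). */
static int
matchlen(const struct in6_addr *src, const struct in6_addr *dst)
{
	int match = 0;

	for (int i = 0; i < 16; i++) {
		unsigned char r = src->s6_addr[i] ^ dst->s6_addr[i];
		if (r == 0) {
			match += 8;
			continue;
		}
		while ((r & 0x80) == 0) {	/* count equal leading bits */
			match++;
			r <<= 1;
		}
		break;
	}
	return match;
}

/* Expand a prefix length (0..128) into a netmask, one byte at a time. */
static void
prefixlen2mask(struct in6_addr *mask, int len)
{
	static const unsigned char maskarray[8] =
	    { 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };

	memset(mask, 0, sizeof(*mask));
	for (int i = 0; i < len / 8; i++)
		mask->s6_addr[i] = 0xff;
	if (len % 8)
		mask->s6_addr[len / 8] = maskarray[len % 8 - 1];
}

int
main(void)
{
	struct in6_addr a, b, m;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET6, "2001:db8::1", &a);
	inet_pton(AF_INET6, "2001:db8:1::1", &b);
	prefixlen2mask(&m, 48);
	/* Prints "matchlen=47 /48 mask=ffff:ffff:ffff::" */
	printf("matchlen=%d /48 mask=%s\n", matchlen(&a, &b),
	    inet_ntop(AF_INET6, &m, buf, sizeof(buf)));
	return 0;
}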
/* $NetBSD: in6_src.c,v 1.92 2023/08/03 04:24:55 ozaki-r Exp $ */ /* $KAME: in6_src.c,v 1.159 2005/10/19 01:40:32 t-momose Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in6_src.c,v 1.92 2023/08/03 04:24:55 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/ioctl.h> #include <sys/errno.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/kauth.h> #include <net/if.h> #include <net/if_types.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/portalgo.h> #include <netinet6/in6_var.h> #include <netinet/ip6.h> #include <netinet6/in6_pcb.h> #include <netinet6/ip6_var.h> #include <netinet6/ip6_private.h> #include <netinet6/nd6.h> #include <netinet6/scope6_var.h> #ifdef MIP6 #include <netinet6/mip6.h> #include <netinet6/mip6_var.h> #include "mip.h" #if NMIP > 0 #include <net/if_mip.h> #endif /* NMIP > 0 */ #endif /* MIP6 */ #include <netinet/tcp_vtw.h> #define ADDR_LABEL_NOTAPP (-1) struct in6_addrpolicy defaultaddrpolicy; int ip6_prefer_tempaddr = 0; static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route *, struct ifnet **, struct psref *); static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *); static void init_policy_queue(void); static int add_addrsel_policyent(struct in6_addrpolicy *); static int delete_addrsel_policyent(struct in6_addrpolicy *); static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *), void *); static int dump_addrsel_policyent(struct in6_addrpolicy *, void *); static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *); #define IFA6_IS_VALIDATED(ia) \ (((ia)->ia6_flags & (IN6_IFF_TENTATIVE | IN6_IFF_DETACHED)) == 0) /* * Return an IPv6 address, which is the most appropriate for a given * destination and user specified options. * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. 
*/ #if 0 /* disabled ad-hoc */ #define REPLACE(r) do {\ char _buf1[INET6_ADDRSTRLEN], _buf2[INET6_ADDRSTRLEN]; \ if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ ip6stat.ip6s_sources_rule[(r)]++; \ printf("%s: replace %s with %s by %d\n", __func__, ia_best ? \ IN6_PRINT(_buf1, &ia_best->ia_addr.sin6_addr) : "none", \ IN6_PRINT(_buf2, &ia->ia_addr.sin6_addr), (r)); \ goto replace; \ } while(/*CONSTCOND*/0) #define NEXT(r) do {\ if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ ip6stat.ip6s_sources_rule[(r)]++; \ printf("%s: keep %s against %s by %d\n", ia_best ? \ IN6_PRINT(_buf1, &ia_best->ia_addr.sin6_addr) : "none", \ IN6_PRINT(_buf2, &ia->ia_addr.sin6_addr), (r)); \ goto next; /* XXX: we can't use 'continue' here */ \ } while(/*CONSTCOND*/0) #define BREAK(r) do { \ if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ ip6stat.ip6s_sources_rule[(r)]++; \ goto out; /* XXX: we can't use 'break' here */ \ } while(/*CONSTCOND*/0) #else #define REPLACE(r) goto replace #define NEXT(r) goto next #define BREAK(r) goto out #endif /* * Called inside pserialize critical section. Don't sleep/block. */ static struct in6_ifaddr * in6_select_best_ia(struct sockaddr_in6 *dstsock, struct in6_addr *dst, const struct ifnet *ifp, const struct ip6_pktopts *opts, const u_int32_t odstzone) { struct in6_ifaddr *ia, *ia_best = NULL; int dst_scope = -1, best_scope = -1, best_matchlen = -1; struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL; IN6_ADDRLIST_READER_FOREACH(ia) { int new_scope = -1, new_matchlen = -1; struct in6_addrpolicy *new_policy = NULL; u_int32_t srczone, osrczone, dstzone; struct in6_addr src; struct ifnet *ifp1 = ia->ia_ifp; int prefer_tempaddr; /* * We'll never take an address that breaks the scope zone * of the destination. We also skip an address if its zone * does not contain the outgoing interface. * XXX: we should probably use sin6_scope_id here. */ if (in6_setscope(dst, ifp1, &dstzone) || odstzone != dstzone) { continue; } src = ia->ia_addr.sin6_addr; /* Skip the scope test in impossible cases */ if (!(ifp->if_flags & IFF_LOOPBACK) && IN6_IS_ADDR_LOOPBACK(&src)) continue; if (in6_setscope(&src, ifp, &osrczone) || in6_setscope(&src, ifp1, &srczone) || osrczone != srczone) { continue; } /* avoid unusable addresses */ if ((ia->ia6_flags & (IN6_IFF_DUPLICATED | IN6_IFF_ANYCAST))) continue; if (!ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) continue; #if defined(MIP6) && NMIP > 0 /* avoid unusable home addresses. */ if ((ia->ia6_flags & IN6_IFF_HOME) && !mip6_ifa6_is_addr_valid_hoa(ia)) continue; #endif /* MIP6 && NMIP > 0 */ /* Rule 1: Prefer same address */ if (IN6_ARE_ADDR_EQUAL(dst, &ia->ia_addr.sin6_addr)) { ia_best = ia; BREAK(1); /* there should be no better candidate */ } if (ia_best == NULL) REPLACE(1); /* Rule 2: Prefer appropriate scope */ if (dst_scope < 0) dst_scope = in6_addrscope(dst); new_scope = in6_addrscope(&ia->ia_addr.sin6_addr); if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) { if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0) REPLACE(2); NEXT(2); } else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) { if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0) NEXT(2); REPLACE(2); } /* * Rule 3: Avoid deprecated addresses. Note that the case of * !ip6_use_deprecated is already rejected above. * Treat unvalidated addresses as deprecated here. 
*/ if (IFA6_IS_VALIDATED(ia_best) && !IFA6_IS_VALIDATED(ia)) NEXT(3); if (!IFA6_IS_VALIDATED(ia_best) && IFA6_IS_VALIDATED(ia)) REPLACE(3); if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia)) NEXT(3); if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) REPLACE(3); /* Rule 4: Prefer home addresses */ #if defined(MIP6) && NMIP > 0 if (!MIP6_IS_MN) goto skip_rule4; if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 && (ia->ia6_flags & IN6_IFF_HOME) == 0) { /* both address are not home addresses. */ goto skip_rule4; } /* * If SA is simultaneously a home address and care-of * address and SB is not, then prefer SA. Similarly, * if SB is simultaneously a home address and care-of * address and SA is not, then prefer SB. */ if (((ia_best->ia6_flags & IN6_IFF_HOME) != 0 && ia_best->ia_ifp->if_type != IFT_MIP) && ((ia->ia6_flags & IN6_IFF_HOME) != 0 && ia->ia_ifp->if_type == IFT_MIP)) NEXT(4); if (((ia_best->ia6_flags & IN6_IFF_HOME) != 0 && ia_best->ia_ifp->if_type == IFT_MIP) && ((ia->ia6_flags & IN6_IFF_HOME) != 0 && ia->ia_ifp->if_type != IFT_MIP)) REPLACE(4); if (ip6po_usecoa == 0) { /* * If SA is just a home address and SB is just * a care-of address, then prefer * SA. Similarly, if SB is just a home address * and SA is just a care-of address, then * prefer SB. */ if ((ia_best->ia6_flags & IN6_IFF_HOME) != 0 && (ia->ia6_flags & IN6_IFF_HOME) == 0) { NEXT(4); } if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 && (ia->ia6_flags & IN6_IFF_HOME) != 0) { REPLACE(4); } } else { /* * a sender don't want to use a home address * because: * * 1) we cannot use. (ex. NS or NA to global * addresses.) * * 2) a user specified not to use. * (ex. mip6control -u) */ if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 && (ia->ia6_flags & IN6_IFF_HOME) != 0) { /* XXX breaks stat */ NEXT(0); } if ((ia_best->ia6_flags & IN6_IFF_HOME) != 0 && (ia->ia6_flags & IN6_IFF_HOME) == 0) { /* XXX breaks stat */ REPLACE(0); } } skip_rule4: #endif /* MIP6 && NMIP > 0 */ /* Rule 5: Prefer outgoing interface */ if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp) NEXT(5); if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp) REPLACE(5); /* * Rule 6: Prefer matching label * Note that best_policy should be non-NULL here. */ if (dst_policy == NULL) dst_policy = lookup_addrsel_policy(dstsock); if (dst_policy->label != ADDR_LABEL_NOTAPP) { new_policy = lookup_addrsel_policy(&ia->ia_addr); if (dst_policy->label == best_policy->label && dst_policy->label != new_policy->label) NEXT(6); if (dst_policy->label != best_policy->label && dst_policy->label == new_policy->label) REPLACE(6); } /* * Rule 7: Prefer public addresses. * We allow users to reverse the logic by configuring * a sysctl variable, so that privacy conscious users can * always prefer temporary addresses. */ if (opts == NULL || opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) { prefer_tempaddr = ip6_prefer_tempaddr; } else if (opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_NOTPREFER) { prefer_tempaddr = 0; } else prefer_tempaddr = 1; if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) && (ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) REPLACE(7); else NEXT(7); } if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) && !(ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) NEXT(7); else REPLACE(7); } /* * Rule 8: prefer addresses on alive interfaces. * This is a KAME specific rule. 
*/ if ((ia_best->ia_ifp->if_flags & IFF_UP) && !(ia->ia_ifp->if_flags & IFF_UP)) NEXT(8); if (!(ia_best->ia_ifp->if_flags & IFF_UP) && (ia->ia_ifp->if_flags & IFF_UP)) REPLACE(8); /* * Rule 9: prefer addresses on "preferred" interfaces. * This is a KAME specific rule. */ #ifdef notyet /* until introducing address selection */ #define NDI_BEST ND_IFINFO(ia_best->ia_ifp) #define NDI_NEW ND_IFINFO(ia->ia_ifp) if ((NDI_BEST->flags & ND6_IFF_PREFER_SOURCE) && !(NDI_NEW->flags & ND6_IFF_PREFER_SOURCE)) NEXT(9); if (!(NDI_BEST->flags & ND6_IFF_PREFER_SOURCE) && (NDI_NEW->flags & ND6_IFF_PREFER_SOURCE)) REPLACE(9); #undef NDI_BEST #undef NDI_NEW #endif /* * Rule 14: Use longest matching prefix. * Note: in the address selection draft, this rule is * documented as "Rule 8". However, since it is also * documented that this rule can be overridden, we assign * a large number so that it is easy to assign smaller numbers * to more preferred rules. */ new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, dst); if (best_matchlen < new_matchlen) REPLACE(14); if (new_matchlen < best_matchlen) NEXT(14); /* Rule 15 is reserved. */ /* * Last resort: just keep the current candidate. * Or, do we need more rules? */ continue; replace: ia_best = ia; best_scope = (new_scope >= 0 ? new_scope : in6_addrscope(&ia_best->ia_addr.sin6_addr)); best_policy = (new_policy ? new_policy : lookup_addrsel_policy(&ia_best->ia_addr)); best_matchlen = (new_matchlen >= 0 ? new_matchlen : in6_matchlen(&ia_best->ia_addr.sin6_addr, dst)); next: continue; out: break; } return ia_best; } #undef REPLACE #undef BREAK #undef NEXT int in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route *ro, struct in6_addr *laddr, struct ifnet **ifpp, struct psref *psref, struct in6_addr *ret_ia6) { struct in6_addr dst; struct ifnet *ifp = NULL; struct in6_ifaddr *ia = NULL; struct in6_pktinfo *pi = NULL; u_int32_t odstzone; int error = 0, iferror; #if defined(MIP6) && NMIP > 0 u_int8_t ip6po_usecoa = 0; #endif /* MIP6 && NMIP > 0 */ struct psref local_psref; int bound = curlwp_bind(); #define PSREF (psref == NULL) ? &local_psref : psref int s; KASSERT((ifpp != NULL && psref != NULL) || (ifpp == NULL && psref == NULL)); dst = dstsock->sin6_addr; /* make a copy for local operation */ if (ifpp) *ifpp = NULL; /* * Try to determine the outgoing interface for the given destination. * We do this regardless of whether the socket is bound, since the * caller may need this information as a side effect of the call * to this function (e.g., for identifying the appropriate scope zone * ID). */ iferror = in6_selectif(dstsock, opts, mopts, ro, &ifp, PSREF); if (ifpp != NULL) *ifpp = ifp; /* * If the source address is explicitly specified by the caller, * check if the requested source address is indeed a unicast address * assigned to the node, and can be used as the packet's source * address. If everything is okay, use the address as source. */ if (opts && (pi = opts->ip6po_pktinfo) && !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) { struct sockaddr_in6 srcsock; struct in6_ifaddr *ia6; int _s; struct ifaddr *ifa; /* * Determine the appropriate zone id of the source based on * the zone of the destination and the outgoing interface. * If the specified address is ambiguous wrt the scope zone, * the interface must be specified; otherwise, ifa_ifwithaddr() * will fail matching the address. 
*/ memset(&srcsock, 0, sizeof(srcsock)); srcsock.sin6_family = AF_INET6; srcsock.sin6_len = sizeof(srcsock); srcsock.sin6_addr = pi->ipi6_addr; if (ifp) { error = in6_setscope(&srcsock.sin6_addr, ifp, NULL); if (error != 0) goto exit; } _s = pserialize_read_enter(); ifa = ifa_ifwithaddr(sin6tosa(&srcsock)); if ((ia6 = ifatoia6(ifa)) == NULL || ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) { pserialize_read_exit(_s); error = EADDRNOTAVAIL; goto exit; } pi->ipi6_addr = srcsock.sin6_addr; /* XXX: this overrides pi */ if (ifpp) *ifpp = ifp; *ret_ia6 = ia6->ia_addr.sin6_addr; pserialize_read_exit(_s); goto exit; } /* * If the socket has already bound the source, just use it. We don't * care at the moment whether in6_selectif() succeeded above, even * though it would eventually cause an error. */ if (laddr && !IN6_IS_ADDR_UNSPECIFIED(laddr)) { *ret_ia6 = *laddr; goto exit; } /* * The outgoing interface is crucial in the general selection procedure * below. If it is not known at this point, we fail. */ if (ifp == NULL) { error = iferror; goto exit; } /* * If the address is not yet determined, choose the best one based on * the outgoing interface and the destination address. */ #if defined(MIP6) && NMIP > 0 /* * a caller can specify IP6PO_USECOA to not to use a home * address. for example, the case that the neighbour * unreachability detection to the global address. */ if (opts != NULL && (opts->ip6po_flags & IP6PO_USECOA) != 0) { ip6po_usecoa = 1; } #endif /* MIP6 && NMIP > 0 */ error = in6_setscope(&dst, ifp, &odstzone); if (error != 0) goto exit; s = pserialize_read_enter(); ia = in6_select_best_ia(dstsock, &dst, ifp, opts, odstzone); if (ia == NULL) { pserialize_read_exit(s); error = EADDRNOTAVAIL; goto exit; } *ret_ia6 = ia->ia_addr.sin6_addr; pserialize_read_exit(s); exit: if (ifpp == NULL) if_put(ifp, PSREF); curlwp_bindx(bound); return error; #undef PSREF } int in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct route **ro, struct rtentry **retrt, bool count_discard) { int error = 0; struct rtentry *rt = NULL; union { struct sockaddr dst; struct sockaddr_in dst4; struct sockaddr_in6 dst6; } u; KASSERT(ro != NULL); KASSERT(*ro != NULL); KASSERT(retrt != NULL); #if 0 if (dstsock->sin6_addr.s6_addr32[0] == 0 && dstsock->sin6_addr.s6_addr32[1] == 0 && !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) { char ip6buf[INET6_ADDRSTRLEN]; printf("%s: strange destination %s\n", __func__, IN6_PRINT(ip6buf, &dstsock->sin6_addr)); } else { char ip6buf[INET6_ADDRSTRLEN]; printf("%s: destination = %s%%%d\n", __func__, IN6_PRINT(ip6buf, &dstsock->sin6_addr), dstsock->sin6_scope_id); /* for debug */ } #endif /* * If the next hop address for the packet is specified by the caller, * use it as the gateway. */ if (opts && opts->ip6po_nexthop) { struct route *ron; struct sockaddr_in6 *sin6_next; sin6_next = satosin6(opts->ip6po_nexthop); /* at this moment, we only support AF_INET6 next hops */ if (sin6_next->sin6_family != AF_INET6) { IP6_STATINC(IP6_STAT_ODROPPED); error = EAFNOSUPPORT; /* or should we proceed? */ goto done; } /* * If the next hop is an IPv6 address, then the node identified * by that address must be a neighbor of the sending host. 
*/ ron = &opts->ip6po_nextroute; rt = rtcache_lookup(ron, sin6tosa(sin6_next)); if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) != 0 || !nd6_is_addr_neighbor(sin6_next, rt->rt_ifp)) { if (rt != NULL) { if (count_discard) in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard); rtcache_unref(rt, ron); rt = NULL; } rtcache_free(ron); error = EHOSTUNREACH; goto done; } *ro = ron; goto done; } /* * Use a cached route if it exists and is valid, else try to allocate * a new one. Note that we should check the address family of the * cached destination, in case of sharing the cache with IPv4. * * for V4 mapped addresses we want to pick up the v4 route * see PR kern/56348 */ if (IN6_IS_ADDR_V4MAPPED(&dstsock->sin6_addr)) { in6_sin6_2_sin(&u.dst4, dstsock); } else { u.dst6 = *dstsock; u.dst6.sin6_scope_id = 0; } rt = rtcache_lookup1(*ro, &u.dst, 1); if (rt == NULL) error = EHOSTUNREACH; /* * Check if the outgoing interface conflicts with * the interface specified by ipi6_ifindex (if specified). * Note that loopback interface is always okay. * (this may happen when we are sending a packet to one of * our own addresses.) */ if (opts && opts->ip6po_pktinfo && opts->ip6po_pktinfo->ipi6_ifindex) { if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_LOOPBACK) && rt->rt_ifp->if_index != opts->ip6po_pktinfo->ipi6_ifindex) { if (count_discard) in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard); error = EHOSTUNREACH; rtcache_unref(rt, *ro); rt = NULL; } } done: if (error == EHOSTUNREACH) IP6_STATINC(IP6_STAT_NOROUTE); *retrt = rt; return error; } static int in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route *ro, struct ifnet **retifp, struct psref *psref) { int error = 0; struct rtentry *rt = NULL; struct in6_addr *dst; struct in6_pktinfo *pi = NULL; KASSERT(retifp != NULL); *retifp = NULL; dst = &dstsock->sin6_addr; /* If the caller specify the outgoing interface explicitly, use it. */ if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) { /* XXX boundary check is assumed to be already done. */ *retifp = if_get_byindex(pi->ipi6_ifindex, psref); if (*retifp != NULL) return 0; goto getroute; } /* * If the destination address is a multicast address and the outgoing * interface for the address is specified by the caller, use it. */ if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL) { *retifp = if_get_byindex(mopts->im6o_multicast_if_index, psref); if (*retifp != NULL) return 0; /* we do not need a route for multicast. */ } getroute: error = in6_selectroute(dstsock, opts, &ro, &rt, false); if (error != 0) return error; *retifp = if_get_byindex(rt->rt_ifp->if_index, psref); /* * do not use a rejected or black hole route. * XXX: this check should be done in the L2 output routine. * However, if we skipped this check here, we'd see the following * scenario: * - install a rejected route for a scoped address prefix * (like fe80::/10) * - send a packet to a destination that matches the scoped prefix, * with ambiguity about the scope zone. * - pick the outgoing interface from the route, and disambiguate the * scope zone with the interface. * - ip6_output() would try to get another route with the "new" * destination, which may be valid. * - we'd see no error on output. * Although this may not be very harmful, it should still be confusing. * We thus reject the case here. */ if ((rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) { error = (rt->rt_flags & RTF_HOST ? 
EHOSTUNREACH : ENETUNREACH); /* XXX: ifp can be returned with psref even if error */ goto out; } /* * Adjust the "outgoing" interface. If we're going to loop the packet * back to ourselves, the ifp would be the loopback interface. * However, we'd rather know the interface associated to the * destination address (which should probably be one of our own * addresses.) */ if (rt->rt_ifa->ifa_ifp != *retifp && !if_is_deactivated(rt->rt_ifa->ifa_ifp)) { if_put(*retifp, psref); *retifp = rt->rt_ifa->ifa_ifp; if_acquire(*retifp, psref); } out: rtcache_unref(rt, ro); return error; } /* * Default hop limit selection. The precedence is as follows: * 1. Hoplimit value specified via ioctl. * 2. (If the outgoing interface is detected) the current * hop limit of the interface specified by router advertisement. * 3. The system default hoplimit. */ int in6pcb_selecthlim(struct inpcb *inp, struct ifnet *ifp) { if (inp && in6p_hops6(inp) >= 0) return in6p_hops6(inp); else if (ifp) return (ND_IFINFO(ifp)->chlim); else return (ip6_defhlim); } int in6pcb_selecthlim_rt(struct inpcb *inp) { struct rtentry *rt; if (inp == NULL) return in6pcb_selecthlim(inp, NULL); rt = rtcache_validate(&inp->inp_route); if (rt != NULL) { int ret = in6pcb_selecthlim(inp, rt->rt_ifp); rtcache_unref(rt, &inp->inp_route); return ret; } else return in6pcb_selecthlim(inp, NULL); } /* * Find an empty port and set it to the specified PCB. */ int in6pcb_set_port(struct sockaddr_in6 *sin6, struct inpcb *inp, struct lwp *l) { struct socket *so = inp->inp_socket; struct inpcbtable *table = inp->inp_table; u_int16_t lport, *lastport; enum kauth_network_req req; int error = 0; if (inp->inp_flags & IN6P_LOWPORT) { #ifndef IPNOPRIVPORTS req = KAUTH_REQ_NETWORK_BIND_PRIVPORT; #else req = KAUTH_REQ_NETWORK_BIND_PORT; #endif lastport = &table->inpt_lastlow; } else { req = KAUTH_REQ_NETWORK_BIND_PORT; lastport = &table->inpt_lastport; } /* XXX-kauth: KAUTH_REQ_NETWORK_BIND_AUTOASSIGN_{,PRIV}PORT */ error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_BIND, req, so, sin6, NULL); if (error) return (EACCES); /* * Use RFC6056 randomized port selection */ error = portalgo_randport(&lport, inp, l->l_cred); if (error) return error; inp->inp_flags |= IN6P_ANONPORT; *lastport = lport; inp->inp_lport = htons(lport); in6pcb_set_state(inp, INP_BOUND); return (0); /* success */ } void addrsel_policy_init(void) { init_policy_queue(); /* initialize the "last resort" policy */ memset(&defaultaddrpolicy, 0, sizeof(defaultaddrpolicy)); defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; } /* * XXX: NOMPSAFE if a policy is set */ static struct in6_addrpolicy * lookup_addrsel_policy(struct sockaddr_in6 *key) { struct in6_addrpolicy *match = NULL; match = match_addrsel_policy(key); if (match == NULL) match = &defaultaddrpolicy; else match->use++; return (match); } /* * Subroutines to manage the address selection policy table via sysctl. 
*/ struct sel_walkarg { size_t w_total; size_t w_given; void * w_where; void *w_limit; }; int sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS); int sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS) { int error = 0; int s; s = splsoftnet(); if (newp) { error = EPERM; goto end; } if (oldp && oldlenp == NULL) { error = EINVAL; goto end; } if (oldp || oldlenp) { struct sel_walkarg w; size_t oldlen = *oldlenp; memset(&w, 0, sizeof(w)); w.w_given = oldlen; w.w_where = oldp; if (oldp) w.w_limit = (char *)oldp + oldlen; error = walk_addrsel_policy(dump_addrsel_policyent, &w); *oldlenp = w.w_total; if (oldp && w.w_total > oldlen && error == 0) error = ENOMEM; } end: splx(s); return (error); } int in6_src_ioctl(u_long cmd, void *data) { int i; struct in6_addrpolicy ent0; if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY) return (EOPNOTSUPP); /* check for safety */ ent0 = *(struct in6_addrpolicy *)data; if (ent0.label == ADDR_LABEL_NOTAPP) return (EINVAL); /* check if the prefix mask is consecutive. */ if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0) return (EINVAL); /* clear trailing garbages (if any) of the prefix address. */ for (i = 0; i < 4; i++) { ent0.addr.sin6_addr.s6_addr32[i] &= ent0.addrmask.sin6_addr.s6_addr32[i]; } ent0.use = 0; switch (cmd) { case SIOCAADDRCTL_POLICY: return (add_addrsel_policyent(&ent0)); case SIOCDADDRCTL_POLICY: return (delete_addrsel_policyent(&ent0)); } return (0); /* XXX: compromise compilers */ } /* * The followings are implementation of the policy table using a * simple tail queue. * XXX such details should be hidden. * XXX implementation using binary tree should be more efficient. */ struct addrsel_policyent { TAILQ_ENTRY(addrsel_policyent) ape_entry; struct in6_addrpolicy ape_policy; }; TAILQ_HEAD(addrsel_policyhead, addrsel_policyent); struct addrsel_policyhead addrsel_policytab; static void init_policy_queue(void) { TAILQ_INIT(&addrsel_policytab); } static int add_addrsel_policyent(struct in6_addrpolicy *newpolicy) { struct addrsel_policyent *newpol, *pol; /* duplication check */ TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { return (EEXIST); /* or override it? 
*/ } } newpol = malloc(sizeof(*newpol), M_IFADDR, M_WAITOK|M_ZERO); /* XXX: should validate entry */ newpol->ape_policy = *newpolicy; TAILQ_INSERT_TAIL(&addrsel_policytab, newpol, ape_entry); return (0); } static int delete_addrsel_policyent(struct in6_addrpolicy *key) { struct addrsel_policyent *pol; /* search for the entry in the table */ for (pol = TAILQ_FIRST(&addrsel_policytab); pol; pol = TAILQ_NEXT(pol, ape_entry)) { if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { break; } } if (pol == NULL) { return (ESRCH); } TAILQ_REMOVE(&addrsel_policytab, pol, ape_entry); return (0); } static int walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w) { struct addrsel_policyent *pol; int error = 0; TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { if ((error = (*callback)(&pol->ape_policy, w)) != 0) return error; } return error; } static int dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg) { int error = 0; struct sel_walkarg *w = arg; if (w->w_where && (char *)w->w_where + sizeof(*pol) <= (char *)w->w_limit) { if ((error = copyout(pol, w->w_where, sizeof(*pol))) != 0) return error; w->w_where = (char *)w->w_where + sizeof(*pol); } w->w_total += sizeof(*pol); return error; } static struct in6_addrpolicy * match_addrsel_policy(struct sockaddr_in6 *key) { struct addrsel_policyent *pent; struct in6_addrpolicy *bestpol = NULL, *pol; int matchlen, bestmatchlen = -1; u_char *mp, *ep, *k, *p, m; for (pent = TAILQ_FIRST(&addrsel_policytab); pent; pent = TAILQ_NEXT(pent, ape_entry)) { matchlen = 0; pol = &pent->ape_policy; mp = (u_char *)&pol->addrmask.sin6_addr; ep = mp + 16; /* XXX: scope field? */ k = (u_char *)&key->sin6_addr; p = (u_char *)&pol->addr.sin6_addr; for (; mp < ep && *mp; mp++, k++, p++) { m = *mp; if ((*k & m) != *p) goto next; /* not match */ if (m == 0xff) /* short cut for a typical case */ matchlen += 8; else { while (m >= 0x80) { matchlen++; m <<= 1; } } } /* matched. check if this is better than the current best. */ if (bestpol == NULL || matchlen > bestmatchlen) { bestpol = pol; bestmatchlen = matchlen; } next: continue; } return (bestpol); }
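/*
 * match_addrsel_policy() above picks the policy entry whose masked prefix
 * covers the key with the longest match.  The following userland sketch
 * reproduces that walk over a tiny hard-coded table; policy_lookup() and the
 * table contents are made up for the example (the entries shown are a subset
 * of the RFC 6724 default policy table), and the loop counts mask bits the
 * same way the kernel routine does.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>

struct policy {
	const char *prefix;	/* textual prefix, converted below */
	int plen;		/* prefix length */
	int label;		/* RFC 6724-style label */
};

static const struct policy tab[] = {
	{ "::1",	128,	0 },
	{ "::",		0,	1 },
	{ "::ffff:0:0",	96,	4 },
	{ "fc00::",	7,	13 },
};

/* Return the label of the longest-matching entry, or -1 for no match. */
static int
policy_lookup(const struct in6_addr *key)
{
	int bestlen = -1, bestlabel = -1;

	for (size_t i = 0; i < sizeof(tab) / sizeof(tab[0]); i++) {
		struct in6_addr p, m;
		int matched = 1, len = 0;

		inet_pton(AF_INET6, tab[i].prefix, &p);
		memset(&m, 0, sizeof(m));
		for (int b = 0; b < tab[i].plen; b++)	/* build the mask */
			m.s6_addr[b / 8] |= 0x80 >> (b % 8);
		for (int b = 0; b < 16 && m.s6_addr[b]; b++) {
			unsigned char mb = m.s6_addr[b];
			if ((key->s6_addr[b] & mb) != p.s6_addr[b]) {
				matched = 0;
				break;
			}
			if (mb == 0xff)
				len += 8;
			else
				for (; mb & 0x80; mb <<= 1)	/* partial byte */
					len++;
		}
		if (matched && len > bestlen) {
			bestlen = len;
			bestlabel = tab[i].label;
		}
	}
	return bestlabel;
}

int
main(void)
{
	struct in6_addr k;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET6, "::ffff:192.0.2.1", &k);
	/* The v4-mapped prefix ::ffff:0:0/96 wins, so this prints label 4. */
	printf("%s -> label %d\n", inet_ntop(AF_INET6, &k, buf, sizeof(buf)),
	    policy_lookup(&k));
	return 0;
}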
/* $NetBSD: kern_tc.c,v 1.77 2024/05/11 06:34:45 andvar Exp $ */ /*- * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * --------------------------------------------------------------------------- */ /* * https://papers.freebsd.org/2002/phk-timecounters.files/timecounter.pdf */ #include <sys/cdefs.h> /* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */ __KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.77 2024/05/11 06:34:45 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_ntp.h" #endif #include <sys/param.h> #include <sys/atomic.h> #include <sys/evcnt.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/reboot.h> /* XXX just to get AB_VERBOSE */ #include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/systm.h> #include <sys/timepps.h> #include <sys/timetc.h> #include <sys/timex.h> #include <sys/xcall.h> /* * A large step happens on boot. This constant detects such steps. * It is relatively small so that ntp_update_second gets called enough * in the typical 'missed a couple of seconds' case, but doesn't loop * forever when the time step is large. */ #define LARGE_STEP 200 /* * Implement a dummy timecounter which we can use until we get a real one * in the air. This allows the console and other early stuff to use * time services. */ static u_int dummy_get_timecount(struct timecounter *tc) { static u_int now; return ++now; } static struct timecounter dummy_timecounter = { .tc_get_timecount = dummy_get_timecount, .tc_counter_mask = ~0u, .tc_frequency = 1000000, .tc_name = "dummy", .tc_quality = -1000000, .tc_priv = NULL, }; struct timehands { /* These fields must be initialized by the driver. 
*/ struct timecounter *th_counter; /* active timecounter */ int64_t th_adjustment; /* frequency adjustment */ /* (NTP/adjtime) */ uint64_t th_scale; /* scale factor (counter */ /* tick->time) */ uint64_t th_offset_count; /* offset at last time */ /* update (tc_windup()) */ struct bintime th_offset; /* bin (up)time at windup */ struct timeval th_microtime; /* cached microtime */ struct timespec th_nanotime; /* cached nanotime */ /* Fields not to be copied in tc_windup start with th_generation. */ volatile u_int th_generation; /* current generation */ struct timehands *th_next; /* next timehand */ }; static struct timehands th0; static struct timehands th9 = { .th_next = &th0, }; static struct timehands th8 = { .th_next = &th9, }; static struct timehands th7 = { .th_next = &th8, }; static struct timehands th6 = { .th_next = &th7, }; static struct timehands th5 = { .th_next = &th6, }; static struct timehands th4 = { .th_next = &th5, }; static struct timehands th3 = { .th_next = &th4, }; static struct timehands th2 = { .th_next = &th3, }; static struct timehands th1 = { .th_next = &th2, }; static struct timehands th0 = { .th_counter = &dummy_timecounter, .th_scale = (uint64_t)-1 / 1000000, .th_offset = { .sec = 1, .frac = 0 }, .th_generation = 1, .th_next = &th1, }; static struct timehands *volatile timehands = &th0; struct timecounter *timecounter = &dummy_timecounter; static struct timecounter *timecounters = &dummy_timecounter; /* used by savecore(8) */ time_t time_second_legacy asm("time_second"); #ifdef __HAVE_ATOMIC64_LOADSTORE volatile time_t time__second __cacheline_aligned = 1; volatile time_t time__uptime __cacheline_aligned = 1; #else static volatile struct { uint32_t lo, hi; } time__uptime32 __cacheline_aligned = { .lo = 1, }, time__second32 __cacheline_aligned = { .lo = 1, }; #endif static struct { struct bintime bin; volatile unsigned gen; /* even when stable, odd when changing */ } timebase __cacheline_aligned; static int timestepwarnings; kmutex_t timecounter_lock; static u_int timecounter_mods; static volatile int timecounter_removals = 1; static u_int timecounter_bad; #ifdef __HAVE_ATOMIC64_LOADSTORE static inline void setrealuptime(time_t second, time_t uptime) { time_second_legacy = second; atomic_store_relaxed(&time__second, second); atomic_store_relaxed(&time__uptime, uptime); } #else static inline void setrealuptime(time_t second, time_t uptime) { uint32_t seclo = second & 0xffffffff, sechi = second >> 32; uint32_t uplo = uptime & 0xffffffff, uphi = uptime >> 32; KDASSERT(mutex_owned(&timecounter_lock)); time_second_legacy = second; /* * Fast path -- no wraparound, just updating the low bits, so * no need for seqlocked access.
*/ if (__predict_true(sechi == time__second32.hi) && __predict_true(uphi == time__uptime32.hi)) { atomic_store_relaxed(&time__second32.lo, seclo); atomic_store_relaxed(&time__uptime32.lo, uplo); return; } atomic_store_relaxed(&time__second32.hi, 0xffffffff); atomic_store_relaxed(&time__uptime32.hi, 0xffffffff); membar_producer(); atomic_store_relaxed(&time__second32.lo, seclo); atomic_store_relaxed(&time__uptime32.lo, uplo); membar_producer(); atomic_store_relaxed(&time__second32.hi, sechi); atomic_store_relaxed(&time__uptime32.hi, uphi); } time_t getrealtime(void) { uint32_t lo, hi; do { for (;;) { hi = atomic_load_relaxed(&time__second32.hi); if (__predict_true(hi != 0xffffffff)) break; SPINLOCK_BACKOFF_HOOK; } membar_consumer(); lo = atomic_load_relaxed(&time__second32.lo); membar_consumer(); } while (hi != atomic_load_relaxed(&time__second32.hi)); return ((time_t)hi << 32) | lo; } time_t getuptime(void) { uint32_t lo, hi; do { for (;;) { hi = atomic_load_relaxed(&time__uptime32.hi); if (__predict_true(hi != 0xffffffff)) break; SPINLOCK_BACKOFF_HOOK; } membar_consumer(); lo = atomic_load_relaxed(&time__uptime32.lo); membar_consumer(); } while (hi != atomic_load_relaxed(&time__uptime32.hi)); return ((time_t)hi << 32) | lo; } time_t getboottime(void) { return getrealtime() - getuptime(); } uint32_t getuptime32(void) { return atomic_load_relaxed(&time__uptime32.lo); } #endif /* !defined(__HAVE_ATOMIC64_LOADSTORE) */ /* * sysctl helper routine for kern.timecounter.hardware */ static int sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS) { struct sysctlnode node; int error; char newname[MAX_TCNAMELEN]; struct timecounter *newtc, *tc; tc = timecounter; strlcpy(newname, tc->tc_name, sizeof(newname)); node = *rnode; node.sysctl_data = newname; node.sysctl_size = sizeof(newname); error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL || strncmp(newname, tc->tc_name, sizeof(newname)) == 0) return error; if (l != NULL && (error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname, NULL, NULL)) != 0) return error; if (!cold) mutex_spin_enter(&timecounter_lock); error = EINVAL; for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { if (strcmp(newname, newtc->tc_name) != 0) continue; /* Warm up new timecounter.
*/ (void)newtc->tc_get_timecount(newtc); (void)newtc->tc_get_timecount(newtc); timecounter = newtc; error = 0; break; } if (!cold) mutex_spin_exit(&timecounter_lock); return error; } static int sysctl_kern_timecounter_choice(SYSCTLFN_ARGS) { char buf[MAX_TCNAMELEN+48]; char *where; const char *spc; struct timecounter *tc; size_t needed, left, slen; int error, mods; if (newp != NULL) return EPERM; if (namelen != 0) return EINVAL; mutex_spin_enter(&timecounter_lock); retry: spc = ""; error = 0; needed = 0; left = *oldlenp; where = oldp; for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) { if (where == NULL) { needed += sizeof(buf); /* be conservative */ } else { slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64 " Hz)", spc, tc->tc_name, tc->tc_quality, tc->tc_frequency); if (left < slen + 1) break; mods = timecounter_mods; mutex_spin_exit(&timecounter_lock); error = copyout(buf, where, slen + 1); mutex_spin_enter(&timecounter_lock); if (mods != timecounter_mods) { goto retry; } spc = " "; where += slen; needed += slen; left -= slen; } } mutex_spin_exit(&timecounter_lock); *oldlenp = needed; return error; } SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup") { const struct sysctlnode *node; sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "timecounter", SYSCTL_DESCR("time counter information"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); if (node != NULL) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "choice", SYSCTL_DESCR("available counters"), sysctl_kern_timecounter_choice, 0, NULL, 0, CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRING, "hardware", SYSCTL_DESCR("currently active time counter"), sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN, CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "timestepwarnings", SYSCTL_DESCR("log time steps"), NULL, 0, &timestepwarnings, 0, CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); } } #ifdef TC_COUNTERS #define TC_STATS(name) \ static struct evcnt n##name = \ EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name); \ EVCNT_ATTACH_STATIC(n##name) TC_STATS(binuptime); TC_STATS(nanouptime); TC_STATS(microuptime); TC_STATS(bintime); TC_STATS(nanotime); TC_STATS(microtime); TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime); TC_STATS(getbintime); TC_STATS(getnanotime); TC_STATS(getmicrotime); TC_STATS(setclock); #define TC_COUNT(var) var.ev_count++ #undef TC_STATS #else #define TC_COUNT(var) /* nothing */ #endif /* TC_COUNTERS */ static void tc_windup(void); /* * Return the difference between the timehands' counter value now and what * was when we copied it to the timehands' offset_count. */ static inline u_int tc_delta(struct timehands *th) { struct timecounter *tc; tc = th->th_counter; return (tc->tc_get_timecount(tc) - th->th_offset_count) & tc->tc_counter_mask; } /* * Functions for reading the time. We have to loop until we are sure that * the timehands that we operated on was not updated under our feet. See * the comment in <sys/timevar.h> for a description of these 12 functions. */ void binuptime(struct bintime *bt) { struct timehands *th; lwp_t *l; u_int lgen, gen; TC_COUNT(nbinuptime); /* * Provide exclusion against tc_detach(). * * We record the number of timecounter removals before accessing * timecounter state. 
Note that the LWP can be using multiple * "generations" at once, due to interrupts (interrupted while in * this function). Hardware interrupts will borrow the interrupted * LWP's l_tcgen value for this purpose, and can themselves be * interrupted by higher priority interrupts. In this case we need * to ensure that the oldest generation in use is recorded. * * splsched() is too expensive to use, so we take care to structure * this code in such a way that it is not required. Likewise, we * do not disable preemption. * * Memory barriers are also too expensive to use for such a * performance critical function. The good news is that we do not * need memory barriers for this type of exclusion, as the thread * updating timecounter_removals will issue a broadcast cross call * before inspecting our l_tcgen value (this elides memory ordering * issues). * * XXX If the author of the above comment knows how to make it * safe to avoid memory barriers around the access to * th->th_generation, I'm all ears. */ l = curlwp; lgen = l->l_tcgen; if (__predict_true(lgen == 0)) { l->l_tcgen = timecounter_removals; } __insn_barrier(); do { th = atomic_load_consume(&timehands); gen = th->th_generation; membar_consumer(); *bt = th->th_offset; bintime_addx(bt, th->th_scale * tc_delta(th)); membar_consumer(); } while (gen == 0 || gen != th->th_generation); __insn_barrier(); l->l_tcgen = lgen; } void nanouptime(struct timespec *tsp) { struct bintime bt; TC_COUNT(nnanouptime); binuptime(&bt); bintime2timespec(&bt, tsp); } void microuptime(struct timeval *tvp) { struct bintime bt; TC_COUNT(nmicrouptime); binuptime(&bt); bintime2timeval(&bt, tvp); } void bintime(struct bintime *bt) { struct bintime boottime; TC_COUNT(nbintime); binuptime(bt); getbinboottime(&boottime); bintime_add(bt, &boottime); } void nanotime(struct timespec *tsp) { struct bintime bt; TC_COUNT(nnanotime); bintime(&bt); bintime2timespec(&bt, tsp); } void microtime(struct timeval *tvp) { struct bintime bt; TC_COUNT(nmicrotime); bintime(&bt); bintime2timeval(&bt, tvp); } void getbinuptime(struct bintime *bt) { struct timehands *th; u_int gen; TC_COUNT(ngetbinuptime); do { th = atomic_load_consume(&timehands); gen = th->th_generation; membar_consumer(); *bt = th->th_offset; membar_consumer(); } while (gen == 0 || gen != th->th_generation); } void getnanouptime(struct timespec *tsp) { struct timehands *th; u_int gen; TC_COUNT(ngetnanouptime); do { th = atomic_load_consume(&timehands); gen = th->th_generation; membar_consumer(); bintime2timespec(&th->th_offset, tsp); membar_consumer(); } while (gen == 0 || gen != th->th_generation); } void getmicrouptime(struct timeval *tvp) { struct timehands *th; u_int gen; TC_COUNT(ngetmicrouptime); do { th = atomic_load_consume(&timehands); gen = th->th_generation; membar_consumer(); bintime2timeval(&th->th_offset, tvp); membar_consumer(); } while (gen == 0 || gen != th->th_generation); } void getbintime(struct bintime *bt) { struct timehands *th; struct bintime boottime; u_int gen; TC_COUNT(ngetbintime); do { th = atomic_load_consume(&timehands); gen = th->th_generation; membar_consumer(); *bt = th->th_offset; membar_consumer(); } while (gen == 0 || gen != th->th_generation); getbinboottime(&boottime); bintime_add(bt, &boottime); } static inline void dogetnanotime(struct timespec *tsp) { struct timehands *th; u_int gen; TC_COUNT(ngetnanotime); do { th = atomic_load_consume(&timehands); gen = th->th_generation; membar_consumer(); *tsp = th->th_nanotime; membar_consumer(); } while (gen == 0 || gen != th->th_generation); 
} void getnanotime(struct timespec *tsp) { dogetnanotime(tsp); } void dtrace_getnanotime(struct timespec *tsp); void dtrace_getnanotime(struct timespec *tsp) { dogetnanotime(tsp); } void getmicrotime(struct timeval *tvp) { struct timehands *th; u_int gen; TC_COUNT(ngetmicrotime); do { th = atomic_load_consume(&timehands); gen = th->th_generation; membar_consumer(); *tvp = th->th_microtime; membar_consumer(); } while (gen == 0 || gen != th->th_generation); } void getnanoboottime(struct timespec *tsp) { struct bintime bt; getbinboottime(&bt); bintime2timespec(&bt, tsp); } void getmicroboottime(struct timeval *tvp) { struct bintime bt; getbinboottime(&bt); bintime2timeval(&bt, tvp); } void getbinboottime(struct bintime *basep) { struct bintime base; unsigned gen; do { /* Spin until the timebase isn't changing. */ while ((gen = atomic_load_relaxed(&timebase.gen)) & 1) SPINLOCK_BACKOFF_HOOK; /* Read out a snapshot of the timebase. */ membar_consumer(); base = timebase.bin; membar_consumer(); /* Restart if it changed while we were reading. */ } while (gen != atomic_load_relaxed(&timebase.gen)); *basep = base; } /* * Initialize a new timecounter and possibly use it. */ void tc_init(struct timecounter *tc) { u_int u; KASSERTMSG(tc->tc_next == NULL, "timecounter %s already initialised", tc->tc_name); u = tc->tc_frequency / tc->tc_counter_mask; /* XXX: We need some margin here, 10% is a guess */ u *= 11; u /= 10; if (u > hz && tc->tc_quality >= 0) { tc->tc_quality = -2000; aprint_verbose( "timecounter: Timecounter \"%s\" frequency %ju Hz", tc->tc_name, (uintmax_t)tc->tc_frequency); aprint_verbose(" -- Insufficient hz, needs at least %u\n", u); } else if (tc->tc_quality >= 0 || bootverbose) { aprint_verbose( "timecounter: Timecounter \"%s\" frequency %ju Hz " "quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency, tc->tc_quality); } mutex_spin_enter(&timecounter_lock); tc->tc_next = timecounters; timecounters = tc; timecounter_mods++; /* * Never automatically use a timecounter with negative quality. * Even though we run on the dummy counter, switching here may be * worse since this timecounter may not be monotonous. */ if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality || (tc->tc_quality == timecounter->tc_quality && tc->tc_frequency > timecounter->tc_frequency))) { (void)tc->tc_get_timecount(tc); (void)tc->tc_get_timecount(tc); timecounter = tc; tc_windup(); } mutex_spin_exit(&timecounter_lock); } /* * Pick a new timecounter due to the existing counter going bad. */ static void tc_pick(void) { struct timecounter *best, *tc; KASSERT(mutex_owned(&timecounter_lock)); for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) { if (tc->tc_quality > best->tc_quality) best = tc; else if (tc->tc_quality < best->tc_quality) continue; else if (tc->tc_frequency > best->tc_frequency) best = tc; } (void)best->tc_get_timecount(best); (void)best->tc_get_timecount(best); timecounter = best; } /* * A timecounter has gone bad, arrange to pick a new one at the next * clock tick. */ void tc_gonebad(struct timecounter *tc) { tc->tc_quality = -100; membar_producer(); atomic_inc_uint(&timecounter_bad); } /* * Stop using a timecounter and remove it from the timecounters list. */ int tc_detach(struct timecounter *target) { struct timecounter *tc; struct timecounter **tcp = NULL; int removals; lwp_t *l; /* First, find the timecounter. 
*/ mutex_spin_enter(&timecounter_lock); for (tcp = &timecounters, tc = timecounters; tc != NULL; tcp = &tc->tc_next, tc = tc->tc_next) { if (tc == target) break; } if (tc == NULL) { mutex_spin_exit(&timecounter_lock); return ESRCH; } /* And now, remove it. */ *tcp = tc->tc_next; if (timecounter == target) { tc_pick(); tc_windup(); } timecounter_mods++; removals = timecounter_removals++; mutex_spin_exit(&timecounter_lock); /* * We now have to determine if any threads in the system are still * making use of this timecounter. * * We issue a broadcast cross call to elide memory ordering issues, * then scan all LWPs in the system looking at each's timecounter * generation number. We need to see a value of zero (not actively * using a timecounter) or a value greater than our removal value. * * We may race with threads that read `timecounter_removals' and * then get preempted before updating `l_tcgen'. This is not * a problem, since it means that these threads have not yet started * accessing timecounter state. All we do need is one clean * snapshot of the system where every thread appears not to be using * old timecounter state. */ for (;;) { xc_barrier(0); mutex_enter(&proc_lock); LIST_FOREACH(l, &alllwp, l_list) { if (l->l_tcgen == 0 || l->l_tcgen > removals) { /* * Not using timecounter or old timecounter * state at time of our xcall or later. */ continue; } break; } mutex_exit(&proc_lock); /* * If the timecounter is still in use, wait at least 10ms * before retrying. */ if (l == NULL) { break; } (void)kpause("tcdetach", false, mstohz(10), NULL); } tc->tc_next = NULL; return 0; } /* Report the frequency of the current timecounter. */ uint64_t tc_getfrequency(void) { return atomic_load_consume(&timehands)->th_counter->tc_frequency; } /* * Step our concept of UTC. This is done by modifying our estimate of * when we booted. */ void tc_setclock(const struct timespec *ts) { struct timespec ts2; struct bintime bt, bt2; mutex_spin_enter(&timecounter_lock); TC_COUNT(nsetclock); binuptime(&bt2); timespec2bintime(ts, &bt); bintime_sub(&bt, &bt2); bintime_add(&bt2, &timebase.bin); timebase.gen |= 1; /* change in progress */ membar_producer(); timebase.bin = bt; membar_producer(); timebase.gen++; /* commit change */ tc_windup(); mutex_spin_exit(&timecounter_lock); if (timestepwarnings) { bintime2timespec(&bt2, &ts2); log(LOG_INFO, "Time stepped from %lld.%09ld to %lld.%09ld\n", (long long)ts2.tv_sec, ts2.tv_nsec, (long long)ts->tv_sec, ts->tv_nsec); } } /* * Initialize the next struct timehands in the ring and make * it the active timehands. Along the way we might switch to a different * timecounter and/or do seconds processing in NTP. Slightly magic. */ static void tc_windup(void) { struct bintime bt; struct timehands *th, *tho; uint64_t scale; u_int delta, ncount, ogen; int i, s_update; time_t t; KASSERT(mutex_owned(&timecounter_lock)); s_update = 0; /* * Make the next timehands a copy of the current one, but do not * overwrite the generation or next pointer. While we update * the contents, the generation must be zero. Ensure global * visibility of the generation before proceeding. */ tho = timehands; th = tho->th_next; ogen = th->th_generation; th->th_generation = 0; membar_producer(); bcopy(tho, th, offsetof(struct timehands, th_generation)); /* * Capture a timecounter delta on the current timecounter and if * changing timecounters, a counter value from the new timecounter. * Update the offset fields accordingly.
*/ delta = tc_delta(th); if (th->th_counter != timecounter) ncount = timecounter->tc_get_timecount(timecounter); else ncount = 0; th->th_offset_count += delta; bintime_addx(&th->th_offset, th->th_scale * delta); /* * Hardware latching timecounters may not generate interrupts on * PPS events, so instead we poll them. There is a finite risk that * the hardware might capture a count which is later than the one we * got above, and therefore possibly in the next NTP second which might * have a different rate than the current NTP second. It doesn't * matter in practice. */ if (tho->th_counter->tc_poll_pps) tho->th_counter->tc_poll_pps(tho->th_counter); /* * Deal with NTP second processing. The for loop normally * iterates at most once, but in extreme situations it might * keep NTP sane if timeouts are not run for several seconds. * At boot, the time step can be large when the TOD hardware * has been read, so on really large steps, we call * ntp_update_second only twice. We need to call it twice in * case we missed a leap second. * If NTP is not compiled in ntp_update_second still calculates * the adjustment resulting from adjtime() calls. */ bt = th->th_offset; bintime_add(&bt, &timebase.bin); i = bt.sec - tho->th_microtime.tv_sec; if (i > LARGE_STEP) i = 2; for (; i > 0; i--) { t = bt.sec; ntp_update_second(&th->th_adjustment, &bt.sec); s_update = 1; if (bt.sec != t) { timebase.gen |= 1; /* change in progress */ membar_producer(); timebase.bin.sec += bt.sec - t; membar_producer(); timebase.gen++; /* commit change */ } } /* Update the UTC timestamps used by the get*() functions. */ /* XXX shouldn't do this here. Should force non-`get' versions. */ bintime2timeval(&bt, &th->th_microtime); bintime2timespec(&bt, &th->th_nanotime); /* Now is a good time to change timecounters. */ if (th->th_counter != timecounter) { th->th_counter = timecounter; th->th_offset_count = ncount; s_update = 1; } /*- * Recalculate the scaling factor. We want the number of 1/2^64 * fractions of a second per period of the hardware counter, taking * into account the th_adjustment factor which the NTP PLL/adjtime(2) * processing provides us with. * * The th_adjustment is nanoseconds per second with 32 bit binary * fraction and we want 64 bit binary fraction of second: * * x = a * 2^32 / 10^9 = a * 4.294967296 * * The range of th_adjustment is +/- 5000PPM so inside a 64bit int * we can only multiply by about 850 without overflowing, but that * leaves suitably precise fractions for multiply before divide. * * Divide before multiply with a fraction of 2199/512 results in a * systematic undercompensation of 10PPM of th_adjustment. On a * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. * * We happily sacrifice the lowest of the 64 bits of our result * to the goddess of code clarity. * */ if (s_update) { scale = (uint64_t)1 << 63; scale += (th->th_adjustment / 1024) * 2199; scale /= th->th_counter->tc_frequency; th->th_scale = scale * 2; } /* * Now that the struct timehands is again consistent, set the new * generation number, making sure to not make it zero. Ensure * changes are globally visible before changing. */ if (++ogen == 0) ogen = 1; membar_producer(); th->th_generation = ogen; /* * Go live with the new struct timehands. Ensure changes are * globally visible before changing. */ setrealuptime(th->th_microtime.tv_sec, th->th_offset.sec); atomic_store_release(&timehands, th); /* * Force users of the old timehand to move on. 
This is * necessary for MP systems; we need to ensure that the * consumers will move away from the old timehand before * we begin updating it again when we eventually wrap * around. */ if (++tho->th_generation == 0) tho->th_generation = 1; } /* * RFC 2783 PPS-API implementation. */ int pps_ioctl(u_long cmd, void *data, struct pps_state *pps) { pps_params_t *app; pps_info_t *pipi; #ifdef PPS_SYNC int *epi; #endif KASSERT(mutex_owned(&timecounter_lock)); KASSERT(pps != NULL); switch (cmd) { case PPS_IOC_CREATE: return 0; case PPS_IOC_DESTROY: return 0; case PPS_IOC_SETPARAMS: app = (pps_params_t *)data; if (app->mode & ~pps->ppscap) return EINVAL; pps->ppsparam = *app; return 0; case PPS_IOC_GETPARAMS: app = (pps_params_t *)data; *app = pps->ppsparam; app->api_version = PPS_API_VERS_1; return 0; case PPS_IOC_GETCAP: *(int*)data = pps->ppscap; return 0; case PPS_IOC_FETCH: pipi = (pps_info_t *)data; pps->ppsinfo.current_mode = pps->ppsparam.mode; *pipi = pps->ppsinfo; return 0; case PPS_IOC_KCBIND: #ifdef PPS_SYNC epi = (int *)data; /* XXX Only root should be able to do this */ if (*epi & ~pps->ppscap) return EINVAL; pps->kcmode = *epi; return 0; #else return EOPNOTSUPP; #endif default: return EPASSTHROUGH; } } void pps_init(struct pps_state *pps) { KASSERT(mutex_owned(&timecounter_lock)); pps->ppscap |= PPS_TSFMT_TSPEC; if (pps->ppscap & PPS_CAPTUREASSERT) pps->ppscap |= PPS_OFFSETASSERT; if (pps->ppscap & PPS_CAPTURECLEAR) pps->ppscap |= PPS_OFFSETCLEAR; } /* * capture a timestamp in the pps structure */ void pps_capture(struct pps_state *pps) { struct timehands *th; KASSERT(mutex_owned(&timecounter_lock)); KASSERT(pps != NULL); th = timehands; pps->capgen = th->th_generation; pps->capth = th; pps->capcount = (uint64_t)tc_delta(th) + th->th_offset_count; if (pps->capgen != th->th_generation) pps->capgen = 0; } #ifdef PPS_DEBUG int ppsdebug = 0; #endif /* * process a pps_capture()ed event */ void pps_event(struct pps_state *pps, int event) { pps_ref_event(pps, event, NULL, PPS_REFEVNT_PPS|PPS_REFEVNT_CAPTURE); } /* * extended pps api / kernel pll/fll entry point * * feed reference time stamps to PPS engine * * will simulate a PPS event and feed * the NTP PLL/FLL if requested. * * the ref time stamps should be roughly once * a second but do not need to be exactly in phase * with the UTC second but should be close to it. * this relaxation of requirements allows callout * driven timestamping mechanisms to feed to pps * capture/kernel pll logic. 
* * calling pattern is: * pps_capture() (for PPS_REFEVNT_{CAPTURE|CAPCUR}) * read timestamp from reference source * pps_ref_event() * * supported refmodes: * PPS_REFEVNT_CAPTURE * use system timestamp of pps_capture() * PPS_REFEVNT_CURRENT * use system timestamp of this call * PPS_REFEVNT_CAPCUR * use average of read capture and current system time stamp * PPS_REFEVNT_PPS * assume timestamp on second mark - ref_ts is ignored * */ void pps_ref_event(struct pps_state *pps, int event, struct bintime *ref_ts, int refmode ) { struct bintime bt; /* current time */ struct bintime btd; /* time difference */ struct bintime bt_ref; /* reference time */ struct timespec ts, *tsp, *osp; struct timehands *th; uint64_t tcount, acount, dcount, *pcount; int foff, gen; #ifdef PPS_SYNC int fhard; #endif pps_seq_t *pseq; KASSERT(mutex_owned(&timecounter_lock)); KASSERT(pps != NULL); /* pick up current time stamp if needed */ if (refmode & (PPS_REFEVNT_CURRENT|PPS_REFEVNT_CAPCUR)) { /* pick up current time stamp */ th = timehands; gen = th->th_generation; tcount = (uint64_t)tc_delta(th) + th->th_offset_count; if (gen != th->th_generation) gen = 0; /* If the timecounter was wound up underneath us, bail out. */ if (pps->capgen == 0 || pps->capgen != pps->capth->th_generation || gen == 0 || gen != pps->capgen) { #ifdef PPS_DEBUG if (ppsdebug & 0x1) { log(LOG_DEBUG, "pps_ref_event(pps=%p, event=%d, ...): DROP (wind-up)\n", pps, event); } #endif return; } } else { tcount = 0; /* keep GCC happy */ } #ifdef PPS_DEBUG if (ppsdebug & 0x1) { struct timespec tmsp; if (ref_ts == NULL) { tmsp.tv_sec = 0; tmsp.tv_nsec = 0; } else { bintime2timespec(ref_ts, &tmsp); } log(LOG_DEBUG, "pps_ref_event(pps=%p, event=%d, ref_ts=%"PRIi64 ".%09"PRIi32", refmode=0x%1x)\n", pps, event, tmsp.tv_sec, (int32_t)tmsp.tv_nsec, refmode); } #endif /* setup correct event references */ if (event == PPS_CAPTUREASSERT) { tsp = &pps->ppsinfo.assert_timestamp; osp = &pps->ppsparam.assert_offset; foff = pps->ppsparam.mode & PPS_OFFSETASSERT; #ifdef PPS_SYNC fhard = pps->kcmode & PPS_CAPTUREASSERT; #endif pcount = &pps->ppscount[0]; pseq = &pps->ppsinfo.assert_sequence; } else { tsp = &pps->ppsinfo.clear_timestamp; osp = &pps->ppsparam.clear_offset; foff = pps->ppsparam.mode & PPS_OFFSETCLEAR; #ifdef PPS_SYNC fhard = pps->kcmode & PPS_CAPTURECLEAR; #endif pcount = &pps->ppscount[1]; pseq = &pps->ppsinfo.clear_sequence; } /* determine system time stamp according to refmode */ dcount = 0; /* keep GCC happy */ switch (refmode & PPS_REFEVNT_RMASK) { case PPS_REFEVNT_CAPTURE: acount = pps->capcount; /* use capture timestamp */ break; case PPS_REFEVNT_CURRENT: acount = tcount; /* use current timestamp */ break; case PPS_REFEVNT_CAPCUR: /* * calculate counter value between pps_capture() and * pps_ref_event() */ dcount = tcount - pps->capcount; acount = (dcount / 2) + pps->capcount; break; default: /* ignore call error silently */ return; } /* * If the timecounter changed, we cannot compare the count values, so * we have to drop the rest of the PPS-stuff until the next event. */ if (pps->ppstc != pps->capth->th_counter) { pps->ppstc = pps->capth->th_counter; pps->capcount = acount; *pcount = acount; pps->ppscount[2] = acount; #ifdef PPS_DEBUG if (ppsdebug & 0x1) { log(LOG_DEBUG, "pps_ref_event(pps=%p, event=%d, ...): DROP (time-counter change)\n", pps, event); } #endif return; } pps->capcount = acount; /* Convert the count to a bintime. 
*/ bt = pps->capth->th_offset; bintime_addx(&bt, pps->capth->th_scale * (acount - pps->capth->th_offset_count)); bintime_add(&bt, &timebase.bin); if ((refmode & PPS_REFEVNT_PPS) == 0) { /* determine difference to reference time stamp */ bt_ref = *ref_ts; btd = bt; bintime_sub(&btd, &bt_ref); /* * simulate a PPS timestamp by dropping the fraction * and applying the offset */ if (bt.frac >= (uint64_t)1<<63) /* skip to nearest second */ bt.sec++; bt.frac = 0; bintime_add(&bt, &btd); } else { /* * create ref_ts from current time - * we are supposed to be called on * the second mark */ bt_ref = bt; if (bt_ref.frac >= (uint64_t)1<<63) /* skip to nearest second */ bt_ref.sec++; bt_ref.frac = 0; } /* convert bintime to timestamp */ bintime2timespec(&bt, &ts); /* If the timecounter was wound up underneath us, bail out. */ if (pps->capgen != pps->capth->th_generation) return; /* store time stamp */ *pcount = pps->capcount; (*pseq)++; *tsp = ts; /* add offset correction */ if (foff) { timespecadd(tsp, osp, tsp); if (tsp->tv_nsec < 0) { tsp->tv_nsec += 1000000000; tsp->tv_sec -= 1; } } #ifdef PPS_DEBUG if (ppsdebug & 0x2) { struct timespec ts2; struct timespec ts3; bintime2timespec(&bt_ref, &ts2); bt.sec = 0; bt.frac = 0; if (refmode & PPS_REFEVNT_CAPCUR) { bintime_addx(&bt, pps->capth->th_scale * dcount); } bintime2timespec(&bt, &ts3); log(LOG_DEBUG, "ref_ts=%"PRIi64".%09"PRIi32 ", ts=%"PRIi64".%09"PRIi32", read latency=%"PRIi64" ns\n", ts2.tv_sec, (int32_t)ts2.tv_nsec, tsp->tv_sec, (int32_t)tsp->tv_nsec, timespec2ns(&ts3)); } #endif #ifdef PPS_SYNC if (fhard) { uint64_t scale; uint64_t div; /* * Feed the NTP PLL/FLL. * The FLL wants to know how many (hardware) nanoseconds * elapsed since the previous event (mod 1 second) thus * we are actually looking at the frequency difference scaled * in nsec. * As the counter time stamps are not truly at 1Hz * we need to scale the count by the elapsed * reference time. * valid sampling interval: [0.5..2[ sec */ /* calculate elapsed raw count */ tcount = pps->capcount - pps->ppscount[2]; pps->ppscount[2] = pps->capcount; tcount &= pps->capth->th_counter->tc_counter_mask; /* calculate elapsed ref time */ btd = bt_ref; bintime_sub(&btd, &pps->ref_time); pps->ref_time = bt_ref; /* check that we stay below 2 sec */ if (btd.sec < 0 || btd.sec > 1) return; /* we want at least 0.5 sec between samples */ if (btd.sec == 0 && btd.frac < (uint64_t)1<<63) return; /* * calculate cycles per period by multiplying * the frequency with the elapsed period * we pick a fraction of 30 bits * ~1ns resolution for elapsed time */ div = (uint64_t)btd.sec << 30; div |= (btd.frac >> 34) & (((uint64_t)1 << 30) - 1); div *= pps->capth->th_counter->tc_frequency; div >>= 30; if (div == 0) /* safeguard */ return; scale = (uint64_t)1 << 63; scale /= div; scale *= 2; bt.sec = 0; bt.frac = 0; bintime_addx(&bt, scale * tcount); bintime2timespec(&bt, &ts); #ifdef PPS_DEBUG if (ppsdebug & 0x4) { struct timespec ts2; int64_t df; bintime2timespec(&bt_ref, &ts2); df = timespec2ns(&ts); if (df > 500000000) df -= 1000000000; log(LOG_DEBUG, "hardpps: ref_ts=%"PRIi64 ".%09"PRIi32", ts=%"PRIi64".%09"PRIi32 ", freqdiff=%"PRIi64" ns/s\n", ts2.tv_sec, (int32_t)ts2.tv_nsec, tsp->tv_sec, (int32_t)tsp->tv_nsec, df); } #endif hardpps(tsp, timespec2ns(&ts)); } #endif } /* * Timecounters need to be updated every so often to prevent the hardware * counter from overflowing. Updating also recalculates the cached values * used by the get*() family of functions, so their precision depends on * the update frequency. 
*/ static int tc_tick; void tc_ticktock(void) { static int count; if (++count < tc_tick) return; count = 0; mutex_spin_enter(&timecounter_lock); if (__predict_false(timecounter_bad != 0)) { /* An existing timecounter has gone bad, pick a new one. */ (void)atomic_swap_uint(&timecounter_bad, 0); if (timecounter->tc_quality < 0) { tc_pick(); } } tc_windup(); mutex_spin_exit(&timecounter_lock); } void inittimecounter(void) { u_int p; mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH); /* * Set the initial timeout to * max(1, <approx. number of hardclock ticks in a millisecond>). * People should probably not use the sysctl to set the timeout * to smaller than its initial value, since that value is the * smallest reasonable one. If they want better timestamps they * should use the non-"get"* functions. */ if (hz > 1000) tc_tick = (hz + 500) / 1000; else tc_tick = 1; p = (tc_tick * 1000000) / hz; aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); /* warm up new timecounter (again) and get rolling. */ (void)timecounter->tc_get_timecount(timecounter); (void)timecounter->tc_get_timecount(timecounter); }
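/*
 * Editor's illustration (not part of kern_tc.c): the generation protocol
 * used by tc_windup() and the binuptime()/get*() readers above can be hard
 * to follow through the kernel plumbing, so here is a minimal,
 * self-contained userland sketch of the same idea.  The writer zeroes the
 * generation, mutates the snapshot, then publishes a new non-zero
 * generation; readers retry until they observe the same non-zero generation
 * before and after copying.  C11 atomics and fences stand in for the
 * kernel's membar_producer()/membar_consumer() and atomic_load_consume();
 * the names (mini_timehands, mini_windup, mini_read) are invented for this
 * sketch and are not kernel APIs.  The kernel additionally rotates through
 * a ring of ten timehands, so a reader that is interrupted mid-copy still
 * finds a stable, if slightly stale, snapshot; that refinement is omitted
 * here.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct mini_timehands {
	_Atomic uint64_t	offset;	/* the data being snapshotted */
	_Atomic unsigned	gen;	/* 0 while an update is in flight */
};

static struct mini_timehands mth = { .offset = 0, .gen = 1 };

/* Writer side: the analogue of tc_windup(). */
static void
mini_windup(uint64_t delta)
{
	unsigned ogen = atomic_load_explicit(&mth.gen, memory_order_relaxed);

	/* Mark the snapshot inconsistent before touching it. */
	atomic_store_explicit(&mth.gen, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ membar_producer() */

	atomic_store_explicit(&mth.offset,
	    atomic_load_explicit(&mth.offset, memory_order_relaxed) + delta,
	    memory_order_relaxed);

	/* Publish a new non-zero generation, skipping 0 on wraparound. */
	if (++ogen == 0)
		ogen = 1;
	atomic_store_explicit(&mth.gen, ogen, memory_order_release);
}

/* Reader side: the analogue of getbinuptime()'s retry loop. */
static uint64_t
mini_read(void)
{
	unsigned gen;
	uint64_t snap;

	do {
		gen = atomic_load_explicit(&mth.gen, memory_order_acquire);
		snap = atomic_load_explicit(&mth.offset, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire); /* ~ membar_consumer() */
	} while (gen == 0 ||
	    gen != atomic_load_explicit(&mth.gen, memory_order_relaxed));

	return snap;
}

int
main(void)
{
	mini_windup(42);
	mini_windup(58);
	printf("offset = %llu\n", (unsigned long long)mini_read());
	return 0;
}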
/* $NetBSD: in.c,v 1.247 2022/11/25 08:39:32 knakahara Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1998 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Public Access Networks Corporation ("Panix"). It was developed under * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in.c,v 1.247 2022/11/25 08:39:32 knakahara Exp $"); #include "arp.h" #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_inet_conf.h" #include "opt_mrouting.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/ioctl.h> #include <sys/errno.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/syslog.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/cprng.h> #include <net/if.h> #include <net/route.h> #include <net/pfil.h> #include <net/if_arp.h> #include <net/if_ether.h> #include <net/if_types.h> #include <net/if_llatbl.h> #include <net/if_dl.h> #include <netinet/in_systm.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/in_ifattach.h> #include <netinet/in_pcb.h> #include <netinet/in_selsrc.h> #include <netinet/if_inarp.h> #include <netinet/ip_mroute.h> #include <netinet/igmp_var.h> #ifdef IPSELSRC #include <netinet/in_selsrc.h> #endif static u_int in_mask2len(struct in_addr *); static int in_lifaddr_ioctl(struct socket *, u_long, void *, struct ifnet *); static void in_addrhash_insert_locked(struct in_ifaddr *); static void in_addrhash_remove_locked(struct in_ifaddr *); static int in_addprefix(struct in_ifaddr *, int); static void in_scrubaddr(struct in_ifaddr *); static int in_scrubprefix(struct in_ifaddr *); static void in_sysctl_init(struct sysctllog **); #ifndef SUBNETSARELOCAL #define SUBNETSARELOCAL 1 #endif #ifndef HOSTZEROBROADCAST #define HOSTZEROBROADCAST 0 #endif /* Note: 61, 127, 251, 509, 1021, 2039 are good. */ #ifndef IN_MULTI_HASH_SIZE #define IN_MULTI_HASH_SIZE 509 #endif static int subnetsarelocal = SUBNETSARELOCAL; static int hostzeroisbroadcast = HOSTZEROBROADCAST; /* * This list is used to keep track of in_multi chains which belong to * deleted interface addresses. We use in_ifaddr so that a chain head * won't be deallocated until all multicast address records are deleted. */ LIST_HEAD(in_multihashhead, in_multi); /* Type of the hash head */ static struct pool inmulti_pool; static u_int in_multientries; static struct in_multihashhead *in_multihashtbl; static u_long in_multihash; static krwlock_t in_multilock; #define IN_MULTI_HASH(x, ifp) \ (in_multihashtbl[(u_long)((x) ^ (ifp->if_index)) % IN_MULTI_HASH_SIZE]) /* XXX DEPRECATED. Keep them to avoid breaking kvm(3) users. */ struct in_ifaddrhashhead * in_ifaddrhashtbl; u_long in_ifaddrhash; struct in_ifaddrhead in_ifaddrhead; static kmutex_t in_ifaddr_lock; pserialize_t in_ifaddrhash_psz; struct pslist_head * in_ifaddrhashtbl_pslist; u_long in_ifaddrhash_pslist; struct pslist_head in_ifaddrhead_pslist; void in_init(void) { pool_init(&inmulti_pool, sizeof(struct in_multi), 0, 0, 0, "inmltpl", NULL, IPL_SOFTNET); TAILQ_INIT(&in_ifaddrhead); PSLIST_INIT(&in_ifaddrhead_pslist); in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, true, &in_ifaddrhash); in_ifaddrhash_psz = pserialize_create(); in_ifaddrhashtbl_pslist = hashinit(IN_IFADDR_HASH_SIZE, HASH_PSLIST, true, &in_ifaddrhash_pslist); mutex_init(&in_ifaddr_lock, MUTEX_DEFAULT, IPL_NONE); in_multihashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, true, &in_multihash); rw_init(&in_multilock); in_sysctl_init(NULL); } /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection.
If subnetsarelocal * is true, this includes other subnets of the local net. * Otherwise, it includes only the directly-connected (sub)nets. */ int in_localaddr(struct in_addr in) { struct in_ifaddr *ia; int localaddr = 0; int s = pserialize_read_enter(); if (subnetsarelocal) { IN_ADDRLIST_READER_FOREACH(ia) { if ((in.s_addr & ia->ia_netmask) == ia->ia_net) { localaddr = 1; break; } } } else { IN_ADDRLIST_READER_FOREACH(ia) { if ((in.s_addr & ia->ia_subnetmask) == ia->ia_subnet) { localaddr = 1; break; } } } pserialize_read_exit(s); return localaddr; } /* * like in_localaddr() but can specify ifp. */ int in_direct(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; int localaddr = 0; int s; KASSERT(ifp != NULL); #define ia (ifatoia(ifa)) s = pserialize_read_enter(); if (subnetsarelocal) { IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family == AF_INET && ((in.s_addr & ia->ia_netmask) == ia->ia_net)) { localaddr = 1; break; } } } else { IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family == AF_INET && (in.s_addr & ia->ia_subnetmask) == ia->ia_subnet) { localaddr = 1; break; } } } pserialize_read_exit(s); return localaddr; #undef ia } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ int in_canforward(struct in_addr in) { u_int32_t net; if (IN_EXPERIMENTAL(in.s_addr) || IN_MULTICAST(in.s_addr)) return (0); if (IN_CLASSA(in.s_addr)) { net = in.s_addr & IN_CLASSA_NET; if (net == 0 || net == htonl(IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) return (0); } return (1); } /* * Trim a mask in a sockaddr */ void in_socktrim(struct sockaddr_in *ap) { char *cplim = (char *) &ap->sin_addr; char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Maintain the "in_maxmtu" variable, which is the largest * mtu for non-local interfaces with AF_INET addresses assigned * to them that are up. */ unsigned long in_maxmtu; void in_setmaxmtu(void) { struct in_ifaddr *ia; struct ifnet *ifp; unsigned long maxmtu = 0; int s = pserialize_read_enter(); IN_ADDRLIST_READER_FOREACH(ia) { if ((ifp = ia->ia_ifp) == 0) continue; if ((ifp->if_flags & (IFF_UP|IFF_LOOPBACK)) != IFF_UP) continue; if (ifp->if_mtu > maxmtu) maxmtu = ifp->if_mtu; } if (maxmtu) in_maxmtu = maxmtu; pserialize_read_exit(s); } static u_int in_mask2len(struct in_addr *mask) { u_int x, y; u_char *p; p = (u_char *)mask; for (x = 0; x < sizeof(*mask); x++) { if (p[x] != 0xff) break; } y = 0; if (x < sizeof(*mask)) { for (y = 0; y < NBBY; y++) { if ((p[x] & (0x80 >> y)) == 0) break; } } return x * NBBY + y; } void in_len2mask(struct in_addr *mask, u_int len) { u_int i; u_char *p; p = (u_char *)mask; memset(mask, 0, sizeof(*mask)); for (i = 0; i < len / NBBY; i++) p[i] = 0xff; if (len % NBBY) p[i] = (0xff00 >> (len % NBBY)) & 0xff; } /* * Generic internet control operations (ioctl's). * Ifp is 0 if not an interface-specific ioctl. 
*/ /* ARGSUSED */ static int in_control0(struct socket *so, u_long cmd, void *data, struct ifnet *ifp) { struct ifreq *ifr = (struct ifreq *)data; struct in_ifaddr *ia = NULL; struct in_aliasreq *ifra = (struct in_aliasreq *)data; struct sockaddr_in oldaddr, *new_dstaddr; int error, hostIsNew, maskIsNew; int newifaddr = 0; bool run_hook = false; bool need_reinsert = false; struct psref psref; int bound; switch (cmd) { case SIOCALIFADDR: case SIOCDLIFADDR: case SIOCGLIFADDR: if (ifp == NULL) return EINVAL; return in_lifaddr_ioctl(so, cmd, data, ifp); case SIOCGIFADDRPREF: case SIOCSIFADDRPREF: if (ifp == NULL) return EINVAL; return ifaddrpref_ioctl(so, cmd, data, ifp); #if NARP > 0 case SIOCGNBRINFO: { struct in_nbrinfo *nbi = (struct in_nbrinfo *)data; struct llentry *ln; struct in_addr nb_addr = nbi->addr; /* make local for safety */ ln = arplookup(ifp, &nb_addr, NULL, 0); if (ln == NULL) return EINVAL; nbi->state = ln->ln_state; nbi->asked = ln->ln_asked; nbi->expire = ln->ln_expire ? time_mono_to_wall(ln->ln_expire) : 0; LLE_RUNLOCK(ln); return 0; } #endif } bound = curlwp_bind(); /* * Find address for this interface, if it exists. */ if (ifp != NULL) ia = in_get_ia_from_ifp_psref(ifp, &psref); hostIsNew = 1; /* moved here to appease gcc */ switch (cmd) { case SIOCAIFADDR: case SIOCDIFADDR: case SIOCGIFALIAS: case SIOCGIFAFLAG_IN: if (ifra->ifra_addr.sin_family == AF_INET) { int s; if (ia != NULL) ia4_release(ia, &psref); s = pserialize_read_enter(); IN_ADDRHASH_READER_FOREACH(ia, ifra->ifra_addr.sin_addr.s_addr) { if (ia->ia_ifp == ifp && in_hosteq(ia->ia_addr.sin_addr, ifra->ifra_addr.sin_addr)) break; } if (ia != NULL) ia4_acquire(ia, &psref); pserialize_read_exit(s); } if ((cmd == SIOCDIFADDR || cmd == SIOCGIFALIAS || cmd == SIOCGIFAFLAG_IN) && ia == NULL) { error = EADDRNOTAVAIL; goto out; } if (cmd == SIOCDIFADDR && ifra->ifra_addr.sin_family == AF_UNSPEC) { ifra->ifra_addr.sin_family = AF_INET; } /* FALLTHROUGH */ case SIOCSIFADDR: if (ia == NULL || ia->ia_addr.sin_family != AF_INET) ; else if (ifra->ifra_addr.sin_len == 0) { ifra->ifra_addr = ia->ia_addr; hostIsNew = 0; } else if (in_hosteq(ia->ia_addr.sin_addr, ifra->ifra_addr.sin_addr)) hostIsNew = 0; if (ifra->ifra_addr.sin_family != AF_INET) { error = EAFNOSUPPORT; goto out; } /* FALLTHROUGH */ case SIOCSIFDSTADDR: if (cmd == SIOCSIFDSTADDR && ifreq_getaddr(cmd, ifr)->sa_family != AF_INET) { error = EAFNOSUPPORT; goto out; } /* FALLTHROUGH */ case SIOCSIFNETMASK: if (ifp == NULL) panic("in_control"); if (cmd == SIOCGIFALIAS || cmd == SIOCGIFAFLAG_IN) break; if (ia == NULL && (cmd == SIOCSIFNETMASK || cmd == SIOCSIFDSTADDR)) { error = EADDRNOTAVAIL; goto out; } if (kauth_authorize_network(kauth_cred_get(), KAUTH_NETWORK_INTERFACE, KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, (void *)cmd, NULL) != 0) { error = EPERM; goto out; } if (ia == NULL) { ia = malloc(sizeof(*ia), M_IFADDR, M_WAITOK|M_ZERO); if (ia == NULL) { error = ENOBUFS; goto out; } ia->ia_ifa.ifa_addr = sintosa(&ia->ia_addr); ia->ia_ifa.ifa_dstaddr = sintosa(&ia->ia_dstaddr); ia->ia_ifa.ifa_netmask = sintosa(&ia->ia_sockmask); #ifdef IPSELSRC ia->ia_ifa.ifa_getifa = in_getifa; #else /* IPSELSRC */ ia->ia_ifa.ifa_getifa = NULL; #endif /* IPSELSRC */ ia->ia_sockmask.sin_len = 8; ia->ia_sockmask.sin_family = AF_INET; if (ifp->if_flags & IFF_BROADCAST) { ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); ia->ia_broadaddr.sin_family = AF_INET; } ia->ia_ifp = ifp; ia->ia_idsalt = cprng_fast32() % 65535; LIST_INIT(&ia->ia_multiaddrs); IN_ADDRHASH_ENTRY_INIT(ia); 
IN_ADDRLIST_ENTRY_INIT(ia); ifa_psref_init(&ia->ia_ifa); /* * We need a reference to make ia survive over in_ifinit * that does ifaref and ifafree. */ ifaref(&ia->ia_ifa); newifaddr = 1; } break; case SIOCSIFBRDADDR: if (kauth_authorize_network(kauth_cred_get(), KAUTH_NETWORK_INTERFACE, KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, (void *)cmd, NULL) != 0) { error = EPERM; goto out; } /* FALLTHROUGH */ case SIOCGIFADDR: case SIOCGIFNETMASK: case SIOCGIFDSTADDR: case SIOCGIFBRDADDR: if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; } error = 0; switch (cmd) { case SIOCGIFADDR: ifreq_setaddr(cmd, ifr, sintocsa(&ia->ia_addr)); break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; goto out; } ifreq_setdstaddr(cmd, ifr, sintocsa(&ia->ia_broadaddr)); break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } ifreq_setdstaddr(cmd, ifr, sintocsa(&ia->ia_dstaddr)); break; case SIOCGIFNETMASK: /* * We keep the number of trailing zero bytes the sin_len field * of ia_sockmask, so we fix this before we pass it back to * userland. */ oldaddr = ia->ia_sockmask; oldaddr.sin_len = sizeof(struct sockaddr_in); ifreq_setaddr(cmd, ifr, (const void *)&oldaddr); break; case SIOCSIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } oldaddr = ia->ia_dstaddr; ia->ia_dstaddr = *satocsin(ifreq_getdstaddr(cmd, ifr)); if ((error = if_addr_init(ifp, &ia->ia_ifa, false)) != 0) { ia->ia_dstaddr = oldaddr; goto out; } if (ia->ia_flags & IFA_ROUTE) { ia->ia_ifa.ifa_dstaddr = sintosa(&oldaddr); rtinit(&ia->ia_ifa, RTM_DELETE, RTF_HOST); ia->ia_ifa.ifa_dstaddr = sintosa(&ia->ia_dstaddr); rtinit(&ia->ia_ifa, RTM_ADD, RTF_HOST|RTF_UP); } break; case SIOCSIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; goto out; } ia->ia_broadaddr = *satocsin(ifreq_getbroadaddr(cmd, ifr)); break; case SIOCSIFADDR: if (!newifaddr) { in_addrhash_remove(ia); need_reinsert = true; } error = in_ifinit(ifp, ia, satocsin(ifreq_getaddr(cmd, ifr)), NULL, 1); run_hook = true; break; case SIOCSIFNETMASK: in_scrubprefix(ia); ia->ia_sockmask = *satocsin(ifreq_getaddr(cmd, ifr)); ia->ia_subnetmask = ia->ia_sockmask.sin_addr.s_addr; if (!newifaddr) { in_addrhash_remove(ia); need_reinsert = true; } error = in_ifinit(ifp, ia, NULL, NULL, 0); break; case SIOCAIFADDR: maskIsNew = 0; if (ifra->ifra_mask.sin_len) { in_scrubprefix(ia); ia->ia_sockmask = ifra->ifra_mask; ia->ia_subnetmask = ia->ia_sockmask.sin_addr.s_addr; maskIsNew = 1; } if ((ifp->if_flags & IFF_POINTOPOINT) && (ifra->ifra_dstaddr.sin_family == AF_INET)) { new_dstaddr = &ifra->ifra_dstaddr; maskIsNew = 1; /* We lie; but the effect's the same */ } else new_dstaddr = NULL; if (ifra->ifra_addr.sin_family == AF_INET && (hostIsNew || maskIsNew)) { if (!newifaddr) { in_addrhash_remove(ia); need_reinsert = true; } error = in_ifinit(ifp, ia, &ifra->ifra_addr, new_dstaddr, 0); } if ((ifp->if_flags & IFF_BROADCAST) && (ifra->ifra_broadaddr.sin_family == AF_INET)) ia->ia_broadaddr = ifra->ifra_broadaddr; run_hook = true; break; case SIOCGIFALIAS: ifra->ifra_mask = ia->ia_sockmask; if ((ifp->if_flags & IFF_POINTOPOINT) && (ia->ia_dstaddr.sin_family == AF_INET)) ifra->ifra_dstaddr = ia->ia_dstaddr; else if ((ifp->if_flags & IFF_BROADCAST) && (ia->ia_broadaddr.sin_family == AF_INET)) ifra->ifra_broadaddr = ia->ia_broadaddr; else memset(&ifra->ifra_broadaddr, 0, sizeof(ifra->ifra_broadaddr)); break; case SIOCGIFAFLAG_IN: ifr->ifr_addrflags = ia->ia4_flags; break; case 
SIOCDIFADDR: ia4_release(ia, &psref); ifaref(&ia->ia_ifa); in_purgeaddr(&ia->ia_ifa); pfil_run_addrhooks(if_pfil, cmd, &ia->ia_ifa); ifafree(&ia->ia_ifa); ia = NULL; break; #ifdef MROUTING case SIOCGETVIFCNT: case SIOCGETSGCNT: error = mrt_ioctl(so, cmd, data); break; #endif /* MROUTING */ default: error = ENOTTY; goto out; } /* * XXX insert regardless of error to make in_purgeaddr below work. * Need to improve. */ if (newifaddr) { ifaref(&ia->ia_ifa); ifa_insert(ifp, &ia->ia_ifa); mutex_enter(&in_ifaddr_lock); TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_list); IN_ADDRLIST_WRITER_INSERT_TAIL(ia); in_addrhash_insert_locked(ia); /* Release a reference that is held just after creation. */ ifafree(&ia->ia_ifa); mutex_exit(&in_ifaddr_lock); } else if (need_reinsert) { in_addrhash_insert(ia); } if (error == 0) { if (run_hook) pfil_run_addrhooks(if_pfil, cmd, &ia->ia_ifa); } else if (newifaddr) { KASSERT(ia != NULL); in_purgeaddr(&ia->ia_ifa); ia = NULL; } out: if (!newifaddr && ia != NULL) ia4_release(ia, &psref); curlwp_bindx(bound); return error; } int in_control(struct socket *so, u_long cmd, void *data, struct ifnet *ifp) { int error; #ifndef NET_MPSAFE KASSERT(KERNEL_LOCKED_P()); #endif error = in_control0(so, cmd, data, ifp); return error; } /* Add ownaddr as loopback rtentry. */ static void in_ifaddlocal(struct ifaddr *ifa) { struct in_ifaddr *ia; ia = (struct in_ifaddr *)ifa; if ((ia->ia_ifp->if_flags & IFF_UNNUMBERED)) { rt_addrmsg(RTM_NEWADDR, ifa); return; } if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY || (ia->ia_ifp->if_flags & IFF_POINTOPOINT && in_hosteq(ia->ia_dstaddr.sin_addr, ia->ia_addr.sin_addr))) { rt_addrmsg(RTM_NEWADDR, ifa); return; } rt_ifa_addlocal(ifa); } /* Remove loopback entry of ownaddr */ static void in_ifremlocal(struct ifaddr *ifa) { struct in_ifaddr *ia, *p; struct ifaddr *alt_ifa = NULL; int ia_count = 0; int s; struct psref psref; int bound = curlwp_bind(); ia = (struct in_ifaddr *)ifa; if ((ia->ia_ifp->if_flags & IFF_UNNUMBERED)) { rt_addrmsg(RTM_DELADDR, ifa); goto out; } /* Delete the entry if exactly one ifaddr matches the * address, ifa->ifa_addr. */ s = pserialize_read_enter(); IN_ADDRLIST_READER_FOREACH(p) { if ((p->ia_ifp->if_flags & IFF_UNNUMBERED)) continue; if (!in_hosteq(p->ia_addr.sin_addr, ia->ia_addr.sin_addr)) continue; if (p->ia_ifp != ia->ia_ifp) alt_ifa = &p->ia_ifa; if (++ia_count > 1 && alt_ifa != NULL) break; } if (alt_ifa != NULL && ia_count > 1) ifa_acquire(alt_ifa, &psref); pserialize_read_exit(s); if (ia_count == 0) goto out; rt_ifa_remlocal(ifa, ia_count == 1 ? NULL : alt_ifa); if (alt_ifa != NULL && ia_count > 1) ifa_release(alt_ifa, &psref); out: curlwp_bindx(bound); } static void in_scrubaddr(struct in_ifaddr *ia) { /* stop DAD processing */ if (ia->ia_dad_stop != NULL) ia->ia_dad_stop(&ia->ia_ifa); in_scrubprefix(ia); in_ifremlocal(&ia->ia_ifa); mutex_enter(&in_ifaddr_lock); if (ia->ia_allhosts != NULL) { in_delmulti(ia->ia_allhosts); ia->ia_allhosts = NULL; } mutex_exit(&in_ifaddr_lock); } /* * Depends on it isn't called in concurrent. It should be guaranteed * by ifa->ifa_ifp's ioctl lock. The possible callers are in_control * and if_purgeaddrs; the former is called iva ifa->ifa_ifp's ioctl * and the latter is called via ifa->ifa_ifp's if_detach. The functions * never be executed in concurrent. 
*/ void in_purgeaddr(struct ifaddr *ifa) { struct in_ifaddr *ia = (void *) ifa; struct ifnet *ifp = ifa->ifa_ifp; /* KASSERT(!ifa_held(ifa)); XXX need ifa_not_held (psref_not_held) */ ifa->ifa_flags |= IFA_DESTROYING; in_scrubaddr(ia); mutex_enter(&in_ifaddr_lock); in_addrhash_remove_locked(ia); TAILQ_REMOVE(&in_ifaddrhead, ia, ia_list); IN_ADDRLIST_WRITER_REMOVE(ia); ifa_remove(ifp, &ia->ia_ifa); /* Assume ifa_remove called pserialize_perform and psref_destroy */ mutex_exit(&in_ifaddr_lock); IN_ADDRHASH_ENTRY_DESTROY(ia); IN_ADDRLIST_ENTRY_DESTROY(ia); ifafree(&ia->ia_ifa); in_setmaxmtu(); } static void in_addrhash_insert_locked(struct in_ifaddr *ia) { KASSERT(mutex_owned(&in_ifaddr_lock)); LIST_INSERT_HEAD(&IN_IFADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_ADDRHASH_ENTRY_INIT(ia); IN_ADDRHASH_WRITER_INSERT_HEAD(ia); } void in_addrhash_insert(struct in_ifaddr *ia) { mutex_enter(&in_ifaddr_lock); in_addrhash_insert_locked(ia); mutex_exit(&in_ifaddr_lock); } static void in_addrhash_remove_locked(struct in_ifaddr *ia) { KASSERT(mutex_owned(&in_ifaddr_lock)); LIST_REMOVE(ia, ia_hash); IN_ADDRHASH_WRITER_REMOVE(ia); } void in_addrhash_remove(struct in_ifaddr *ia) { mutex_enter(&in_ifaddr_lock); in_addrhash_remove_locked(ia); #ifdef NET_MPSAFE pserialize_perform(in_ifaddrhash_psz); #endif mutex_exit(&in_ifaddr_lock); IN_ADDRHASH_ENTRY_DESTROY(ia); } void in_purgeif(struct ifnet *ifp) /* MUST be called at splsoftnet() */ { IFNET_LOCK(ifp); if_purgeaddrs(ifp, AF_INET, in_purgeaddr); igmp_purgeif(ifp); /* manipulates pools */ #ifdef MROUTING ip_mrouter_detach(ifp); #endif IFNET_UNLOCK(ifp); } /* * SIOC[GAD]LIFADDR. * SIOCGLIFADDR: get first address. (???) * SIOCGLIFADDR with IFLR_PREFIX: * get first address that matches the specified prefix. * SIOCALIFADDR: add the specified address. * SIOCALIFADDR with IFLR_PREFIX: * EINVAL since we can't deduce hostid part of the address. * SIOCDLIFADDR: delete the specified address. * SIOCDLIFADDR with IFLR_PREFIX: * delete the first address that matches the specified prefix. * return values: * EINVAL on invalid parameters * EADDRNOTAVAIL on prefix match failed/specified address not found * other values may be returned from in_ioctl() */ static int in_lifaddr_ioctl(struct socket *so, u_long cmd, void *data, struct ifnet *ifp) { struct if_laddrreq *iflr = (struct if_laddrreq *)data; struct ifaddr *ifa; struct sockaddr *sa; /* sanity checks */ if (data == NULL || ifp == NULL) { panic("invalid argument to in_lifaddr_ioctl"); /*NOTRECHED*/ } switch (cmd) { case SIOCGLIFADDR: /* address must be specified on GET with IFLR_PREFIX */ if ((iflr->flags & IFLR_PREFIX) == 0) break; /*FALLTHROUGH*/ case SIOCALIFADDR: case SIOCDLIFADDR: /* address must be specified on ADD and DELETE */ sa = (struct sockaddr *)&iflr->addr; if (sa->sa_family != AF_INET) return EINVAL; if (sa->sa_len != sizeof(struct sockaddr_in)) return EINVAL; /* XXX need improvement */ sa = (struct sockaddr *)&iflr->dstaddr; if (sa->sa_family != AF_UNSPEC && sa->sa_family != AF_INET) return EINVAL; if (sa->sa_len != 0 && sa->sa_len != sizeof(struct sockaddr_in)) return EINVAL; break; default: /*shouldn't happen*/ #if 0 panic("invalid cmd to in_lifaddr_ioctl"); /*NOTREACHED*/ #else return EOPNOTSUPP; #endif } if (sizeof(struct in_addr) * NBBY < iflr->prefixlen) return EINVAL; switch (cmd) { case SIOCALIFADDR: { struct in_aliasreq ifra; if (iflr->flags & IFLR_PREFIX) return EINVAL; /* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR). 
*/ memset(&ifra, 0, sizeof(ifra)); memcpy(ifra.ifra_name, iflr->iflr_name, sizeof(ifra.ifra_name)); memcpy(&ifra.ifra_addr, &iflr->addr, ((struct sockaddr *)&iflr->addr)->sa_len); if (((struct sockaddr *)&iflr->dstaddr)->sa_family) { /*XXX*/ memcpy(&ifra.ifra_dstaddr, &iflr->dstaddr, ((struct sockaddr *)&iflr->dstaddr)->sa_len); } ifra.ifra_mask.sin_family = AF_INET; ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in); in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen); return in_control(so, SIOCAIFADDR, &ifra, ifp); } case SIOCGLIFADDR: case SIOCDLIFADDR: { struct in_ifaddr *ia; struct in_addr mask, candidate, match; struct sockaddr_in *sin; int cmp, s; memset(&mask, 0, sizeof(mask)); memset(&match, 0, sizeof(match)); /* XXX gcc */ if (iflr->flags & IFLR_PREFIX) { /* lookup a prefix rather than address. */ in_len2mask(&mask, iflr->prefixlen); sin = (struct sockaddr_in *)&iflr->addr; match.s_addr = sin->sin_addr.s_addr; match.s_addr &= mask.s_addr; /* if you set extra bits, that's wrong */ if (match.s_addr != sin->sin_addr.s_addr) return EINVAL; cmp = 1; } else { if (cmd == SIOCGLIFADDR) { /* on getting an address, take the 1st match */ cmp = 0; /*XXX*/ } else { /* on deleting an address, do exact match */ in_len2mask(&mask, 32); sin = (struct sockaddr_in *)&iflr->addr; match.s_addr = sin->sin_addr.s_addr; cmp = 1; } } s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET) continue; if (cmp == 0) break; candidate.s_addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr; candidate.s_addr &= mask.s_addr; if (candidate.s_addr == match.s_addr) break; } if (ifa == NULL) { pserialize_read_exit(s); return EADDRNOTAVAIL; } ia = (struct in_ifaddr *)ifa; if (cmd == SIOCGLIFADDR) { /* fill in the if_laddrreq structure */ memcpy(&iflr->addr, &ia->ia_addr, ia->ia_addr.sin_len); if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { memcpy(&iflr->dstaddr, &ia->ia_dstaddr, ia->ia_dstaddr.sin_len); } else memset(&iflr->dstaddr, 0, sizeof(iflr->dstaddr)); iflr->prefixlen = in_mask2len(&ia->ia_sockmask.sin_addr); iflr->flags = 0; /*XXX*/ pserialize_read_exit(s); return 0; } else { struct in_aliasreq ifra; /* fill in_aliasreq and do ioctl(SIOCDIFADDR) */ memset(&ifra, 0, sizeof(ifra)); memcpy(ifra.ifra_name, iflr->iflr_name, sizeof(ifra.ifra_name)); memcpy(&ifra.ifra_addr, &ia->ia_addr, ia->ia_addr.sin_len); if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { memcpy(&ifra.ifra_dstaddr, &ia->ia_dstaddr, ia->ia_dstaddr.sin_len); } memcpy(&ifra.ifra_dstaddr, &ia->ia_sockmask, ia->ia_sockmask.sin_len); pserialize_read_exit(s); return in_control(so, SIOCDIFADDR, &ifra, ifp); } } } return EOPNOTSUPP; /*just for safety*/ } /* * Initialize an interface's internet address * and routing table entry. */ int in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, const struct sockaddr_in *sin, const struct sockaddr_in *dst, int scrub) { u_int32_t i; struct sockaddr_in oldaddr, olddst; int s, oldflags, flags = RTF_UP, error, hostIsNew; if (sin == NULL) sin = &ia->ia_addr; if (dst == NULL) dst = &ia->ia_dstaddr; /* * Set up new addresses. */ oldaddr = ia->ia_addr; olddst = ia->ia_dstaddr; oldflags = ia->ia4_flags; ia->ia_addr = *sin; ia->ia_dstaddr = *dst; hostIsNew = oldaddr.sin_family != AF_INET || !in_hosteq(ia->ia_addr.sin_addr, oldaddr.sin_addr); if (!scrub) scrub = oldaddr.sin_family != ia->ia_dstaddr.sin_family || !in_hosteq(ia->ia_dstaddr.sin_addr, olddst.sin_addr); /* * Configure address flags. 
* We need to do this early because they may be adjusted * by if_addr_init depending on the address. */ if (ia->ia4_flags & IN_IFF_DUPLICATED) { ia->ia4_flags &= ~IN_IFF_DUPLICATED; hostIsNew = 1; } if (ifp->if_link_state == LINK_STATE_DOWN) { ia->ia4_flags |= IN_IFF_DETACHED; ia->ia4_flags &= ~IN_IFF_TENTATIVE; } else if (hostIsNew && if_do_dad(ifp) && ip_dad_enabled()) ia->ia4_flags |= IN_IFF_TRYTENTATIVE; /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ s = splsoftnet(); error = if_addr_init(ifp, &ia->ia_ifa, true); splx(s); /* Now clear the try tentative flag, its job is done. */ ia->ia4_flags &= ~IN_IFF_TRYTENTATIVE; if (error != 0) { ia->ia_addr = oldaddr; ia->ia_dstaddr = olddst; ia->ia4_flags = oldflags; return error; } /* * The interface which does not have IPv4 address is not required * to scrub old address. So, skip scrub such cases. */ if (oldaddr.sin_family == AF_INET && (scrub || hostIsNew)) { int newflags = ia->ia4_flags; ia->ia_ifa.ifa_addr = sintosa(&oldaddr); ia->ia_ifa.ifa_dstaddr = sintosa(&olddst); ia->ia4_flags = oldflags; if (hostIsNew) in_scrubaddr(ia); else if (scrub) in_scrubprefix(ia); ia->ia_ifa.ifa_addr = sintosa(&ia->ia_addr); ia->ia_ifa.ifa_dstaddr = sintosa(&ia->ia_dstaddr); ia->ia4_flags = newflags; } i = ia->ia_addr.sin_addr.s_addr; if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_netmask = INADDR_BROADCAST; /* default to /32 */ else if (IN_CLASSA(i)) ia->ia_netmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_netmask = IN_CLASSB_NET; else ia->ia_netmask = IN_CLASSC_NET; /* * The subnet mask usually includes at least the standard network part, * but may may be smaller in the case of supernetting. * If it is set, we believe it. */ if (ia->ia_subnetmask == 0) { ia->ia_subnetmask = ia->ia_netmask; ia->ia_sockmask.sin_addr.s_addr = ia->ia_subnetmask; } else ia->ia_netmask &= ia->ia_subnetmask; ia->ia_net = i & ia->ia_netmask; ia->ia_subnet = i & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); /* re-calculate the "in_maxmtu" value */ in_setmaxmtu(); ia->ia_ifa.ifa_metric = ifp->if_metric; if (ifp->if_flags & IFF_BROADCAST) { if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_netbroadcast.s_addr = INADDR_BROADCAST; } else { ia->ia_broadaddr.sin_addr.s_addr = ia->ia_subnet | ~ia->ia_subnetmask; ia->ia_netbroadcast.s_addr = ia->ia_net | ~ia->ia_netmask; } } else if (ifp->if_flags & IFF_LOOPBACK) { ia->ia_dstaddr = ia->ia_addr; flags |= RTF_HOST; } else if (ifp->if_flags & IFF_POINTOPOINT) { if (ia->ia_dstaddr.sin_family != AF_INET) return (0); flags |= RTF_HOST; } /* Add the local route to the address */ in_ifaddlocal(&ia->ia_ifa); /* Add the prefix route for the address */ error = in_addprefix(ia, flags); /* * If the interface supports multicast, join the "all hosts" * multicast group on that interface. */ mutex_enter(&in_ifaddr_lock); if ((ifp->if_flags & IFF_MULTICAST) != 0 && ia->ia_allhosts == NULL) { struct in_addr addr; addr.s_addr = INADDR_ALLHOSTS_GROUP; ia->ia_allhosts = in_addmulti(&addr, ifp); } mutex_exit(&in_ifaddr_lock); if (hostIsNew && ia->ia4_flags & IN_IFF_TENTATIVE && if_do_dad(ifp)) ia->ia_dad_start((struct ifaddr *)ia); return error; } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * add a route to prefix ("connected route" in cisco terminology). * does nothing if there's some interface address with the same prefix already. 
*/ static int in_addprefix(struct in_ifaddr *target, int flags) { struct in_ifaddr *ia; struct in_addr prefix, mask, p; int error; int s; if ((flags & RTF_HOST) != 0) prefix = target->ia_dstaddr.sin_addr; else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } s = pserialize_read_enter(); IN_ADDRLIST_READER_FOREACH(ia) { if (rtinitflags(ia)) p = ia->ia_dstaddr.sin_addr; else { p = ia->ia_addr.sin_addr; p.s_addr &= ia->ia_sockmask.sin_addr.s_addr; } if (prefix.s_addr != p.s_addr) continue; if ((ia->ia_ifp->if_flags & IFF_UNNUMBERED)) continue; /* * if we got a matching prefix route inserted by other * interface address, we don't need to bother * * XXX RADIX_MPATH implications here? -dyoung */ if (ia->ia_flags & IFA_ROUTE) { pserialize_read_exit(s); return 0; } } pserialize_read_exit(s); /* * noone seem to have prefix route. insert it. */ if (target->ia_ifa.ifa_ifp->if_flags & IFF_UNNUMBERED) { error = 0; } else { error = rtinit(&target->ia_ifa, RTM_ADD, flags); if (error == 0) target->ia_flags |= IFA_ROUTE; else if (error == EEXIST) { /* * the fact the route already exists is not an error. */ error = 0; } } return error; } static int in_rt_ifa_matcher(struct rtentry *rt, void *v) { struct ifaddr *ifa = v; if (rt->rt_ifa == ifa) return 1; else return 0; } /* * remove a route to prefix ("connected route" in cisco terminology). * re-installs the route by using another interface address, if there's one * with the same prefix (otherwise we lose the route mistakenly). */ static int in_scrubprefix(struct in_ifaddr *target) { struct in_ifaddr *ia; struct in_addr prefix, mask, p; int error; int s; /* If we don't have IFA_ROUTE we have nothing to do */ if ((target->ia_flags & IFA_ROUTE) == 0) return 0; if (rtinitflags(target)) prefix = target->ia_dstaddr.sin_addr; else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } s = pserialize_read_enter(); IN_ADDRLIST_READER_FOREACH(ia) { if (rtinitflags(ia)) p = ia->ia_dstaddr.sin_addr; else { p = ia->ia_addr.sin_addr; p.s_addr &= ia->ia_sockmask.sin_addr.s_addr; } if (prefix.s_addr != p.s_addr) continue; if ((ia->ia_ifp->if_flags & IFF_UNNUMBERED)) continue; /* * if we got a matching prefix route, move IFA_ROUTE to him */ if ((ia->ia_flags & IFA_ROUTE) == 0) { struct psref psref; int bound = curlwp_bind(); ia4_acquire(ia, &psref); pserialize_read_exit(s); rtinit(&target->ia_ifa, RTM_DELETE, rtinitflags(target)); target->ia_flags &= ~IFA_ROUTE; error = rtinit(&ia->ia_ifa, RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; if (!ISSET(target->ia_ifa.ifa_flags, IFA_DESTROYING)) goto skip; /* * Replace rt_ifa of routes that have the removing address * with the new address. */ rt_replace_ifa_matched_entries(AF_INET, in_rt_ifa_matcher, &target->ia_ifa, &ia->ia_ifa); skip: ia4_release(ia, &psref); curlwp_bindx(bound); return error; } } pserialize_read_exit(s); /* * noone seem to have prefix route. remove it. */ rtinit(&target->ia_ifa, RTM_DELETE, rtinitflags(target)); target->ia_flags &= ~IFA_ROUTE; if (ISSET(target->ia_ifa.ifa_flags, IFA_DESTROYING)) { /* Remove routes that have the removing address as rt_ifa. */ rt_delete_matched_entries(AF_INET, in_rt_ifa_matcher, &target->ia_ifa, true); } return 0; } #undef rtinitflags /* * Return 1 if the address might be a local broadcast address. 
*/ int in_broadcast(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; int s; KASSERT(ifp != NULL); if (in.s_addr == INADDR_BROADCAST || in_nullhost(in)) return 1; if ((ifp->if_flags & IFF_BROADCAST) == 0) return 0; /* * Look through the list of addresses for a match * with a broadcast address. */ #define ia (ifatoia(ifa)) s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family == AF_INET && !in_hosteq(in, ia->ia_addr.sin_addr) && (in_hosteq(in, ia->ia_broadaddr.sin_addr) || in_hosteq(in, ia->ia_netbroadcast) || (hostzeroisbroadcast && /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ ia->ia_subnetmask != IN_RFC3021_MASK && (in.s_addr == ia->ia_subnet || in.s_addr == ia->ia_net)))) { pserialize_read_exit(s); return 1; } } pserialize_read_exit(s); return (0); #undef ia } /* * perform DAD when interface becomes IFF_UP. */ void in_if_link_up(struct ifnet *ifp) { struct ifaddr *ifa; struct in_ifaddr *ia; int s, bound; /* Ensure it's sane to run DAD */ if (ifp->if_link_state == LINK_STATE_DOWN) return; if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) return; bound = curlwp_bind(); s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { struct psref psref; if (ifa->ifa_addr->sa_family != AF_INET) continue; ifa_acquire(ifa, &psref); pserialize_read_exit(s); ia = (struct in_ifaddr *)ifa; /* If detached then mark as tentative */ if (ia->ia4_flags & IN_IFF_DETACHED) { ia->ia4_flags &= ~IN_IFF_DETACHED; if (ip_dad_enabled() && if_do_dad(ifp) && ia->ia_dad_start != NULL) ia->ia4_flags |= IN_IFF_TENTATIVE; else if ((ia->ia4_flags & IN_IFF_TENTATIVE) == 0) rt_addrmsg(RTM_NEWADDR, ifa); } if (ia->ia4_flags & IN_IFF_TENTATIVE) { /* Clear the duplicated flag as we're starting DAD. */ ia->ia4_flags &= ~IN_IFF_DUPLICATED; ia->ia_dad_start(ifa); } s = pserialize_read_enter(); ifa_release(ifa, &psref); } pserialize_read_exit(s); curlwp_bindx(bound); } void in_if_up(struct ifnet *ifp) { /* interface may not support link state, so bring it up also */ in_if_link_up(ifp); } /* * Mark all addresses as detached. */ void in_if_link_down(struct ifnet *ifp) { struct ifaddr *ifa; struct in_ifaddr *ia; int s, bound; bound = curlwp_bind(); s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { struct psref psref; if (ifa->ifa_addr->sa_family != AF_INET) continue; ifa_acquire(ifa, &psref); pserialize_read_exit(s); ia = (struct in_ifaddr *)ifa; /* Stop DAD processing */ if (ia->ia_dad_stop != NULL) ia->ia_dad_stop(ifa); /* * Mark the address as detached. */ if (!(ia->ia4_flags & IN_IFF_DETACHED)) { ia->ia4_flags |= IN_IFF_DETACHED; ia->ia4_flags &= ~(IN_IFF_TENTATIVE | IN_IFF_DUPLICATED); rt_addrmsg(RTM_NEWADDR, ifa); } s = pserialize_read_enter(); ifa_release(ifa, &psref); } pserialize_read_exit(s); curlwp_bindx(bound); } void in_if_down(struct ifnet *ifp) { in_if_link_down(ifp); #if NARP > 0 lltable_purge_entries(LLTABLE(ifp)); #endif } void in_if_link_state_change(struct ifnet *ifp, int link_state) { /* * Treat LINK_STATE_UNKNOWN as UP. * LINK_STATE_UNKNOWN transitions to LINK_STATE_DOWN when * if_link_state_change() transitions to LINK_STATE_UP. */ if (link_state == LINK_STATE_DOWN) in_if_link_down(ifp); else in_if_link_up(ifp); } /* * in_lookup_multi: look up the in_multi record for a given IP * multicast address on a given interface. If no matching record is * found, return NULL. 
*/ struct in_multi * in_lookup_multi(struct in_addr addr, ifnet_t *ifp) { struct in_multi *inm; KASSERT(rw_lock_held(&in_multilock)); LIST_FOREACH(inm, &IN_MULTI_HASH(addr.s_addr, ifp), inm_list) { if (in_hosteq(inm->inm_addr, addr) && inm->inm_ifp == ifp) break; } return inm; } /* * in_multi_group: check whether the address belongs to an IP multicast * group we are joined on this interface. Returns true or false. */ bool in_multi_group(struct in_addr addr, ifnet_t *ifp, int flags) { bool ingroup; if (__predict_true(flags & IP_IGMP_MCAST) == 0) { rw_enter(&in_multilock, RW_READER); ingroup = in_lookup_multi(addr, ifp) != NULL; rw_exit(&in_multilock); } else { /* XXX Recursive call from ip_output(). */ KASSERT(rw_lock_held(&in_multilock)); ingroup = in_lookup_multi(addr, ifp) != NULL; } return ingroup; } /* * Add an address to the list of IP multicast addresses for a given interface. */ struct in_multi * in_addmulti(struct in_addr *ap, ifnet_t *ifp) { struct sockaddr_in sin; struct in_multi *inm; /* * See if address already in list. */ rw_enter(&in_multilock, RW_WRITER); inm = in_lookup_multi(*ap, ifp); if (inm != NULL) { /* * Found it; just increment the reference count. */ inm->inm_refcount++; rw_exit(&in_multilock); return inm; } /* * New address; allocate a new multicast record. */ inm = pool_get(&inmulti_pool, PR_NOWAIT); if (inm == NULL) { rw_exit(&in_multilock); return NULL; } inm->inm_addr = *ap; inm->inm_ifp = ifp; inm->inm_refcount = 1; /* * Ask the network driver to update its multicast reception * filter appropriately for the new address. */ sockaddr_in_init(&sin, ap, 0); if (if_mcast_op(ifp, SIOCADDMULTI, sintosa(&sin)) != 0) { rw_exit(&in_multilock); pool_put(&inmulti_pool, inm); return NULL; } /* * Let IGMP know that we have joined a new IP multicast group. */ if (igmp_joingroup(inm) != 0) { rw_exit(&in_multilock); pool_put(&inmulti_pool, inm); return NULL; } LIST_INSERT_HEAD( &IN_MULTI_HASH(inm->inm_addr.s_addr, ifp), inm, inm_list); in_multientries++; rw_exit(&in_multilock); return inm; } /* * Delete a multicast address record. */ void in_delmulti(struct in_multi *inm) { struct sockaddr_in sin; rw_enter(&in_multilock, RW_WRITER); if (--inm->inm_refcount > 0) { rw_exit(&in_multilock); return; } /* * No remaining claims to this record; let IGMP know that * we are leaving the multicast group. */ igmp_leavegroup(inm); /* * Notify the network driver to update its multicast reception * filter. */ sockaddr_in_init(&sin, &inm->inm_addr, 0); if_mcast_op(inm->inm_ifp, SIOCDELMULTI, sintosa(&sin)); /* * Unlink from list. */ LIST_REMOVE(inm, inm_list); in_multientries--; rw_exit(&in_multilock); pool_put(&inmulti_pool, inm); } /* * in_next_multi: step through all of the in_multi records, one at a time. * The current position is remembered in "step", which the caller must * provide. in_first_multi(), below, must be called to initialize "step" * and get the first record. Both macros return a NULL "inm" when there * are no remaining records. 
*/ struct in_multi * in_next_multi(struct in_multistep *step) { struct in_multi *inm; KASSERT(rw_lock_held(&in_multilock)); while (step->i_inm == NULL && step->i_n < IN_MULTI_HASH_SIZE) { step->i_inm = LIST_FIRST(&in_multihashtbl[++step->i_n]); } if ((inm = step->i_inm) != NULL) { step->i_inm = LIST_NEXT(inm, inm_list); } return inm; } struct in_multi * in_first_multi(struct in_multistep *step) { KASSERT(rw_lock_held(&in_multilock)); step->i_n = 0; step->i_inm = LIST_FIRST(&in_multihashtbl[0]); return in_next_multi(step); } void in_multi_lock(int op) { rw_enter(&in_multilock, op); } void in_multi_unlock(void) { rw_exit(&in_multilock); } int in_multi_lock_held(void) { return rw_lock_held(&in_multilock); } struct in_ifaddr * in_selectsrc(struct sockaddr_in *sin, struct route *ro, int soopts, struct ip_moptions *mopts, int *errorp, struct psref *psref) { struct rtentry *rt = NULL; struct in_ifaddr *ia = NULL; KASSERT(ISSET(curlwp->l_pflag, LP_BOUND)); /* * If route is known or can be allocated now, take the * source address from the interface. Otherwise, punt. */ if ((soopts & SO_DONTROUTE) != 0) rtcache_free(ro); else { union { struct sockaddr dst; struct sockaddr_in dst4; } u; sockaddr_in_init(&u.dst4, &sin->sin_addr, 0); rt = rtcache_lookup(ro, &u.dst); } /* * If we found a route, use the address * corresponding to the outgoing interface * unless it is the loopback (in case a route * to our address on another net goes to loopback). * * XXX Is this still true? Do we care? */ if (rt != NULL && (rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { int s; struct ifaddr *ifa; /* * Just in case. May not need to do this workaround. * Revisit when working on rtentry MP-ification. */ s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, rt->rt_ifp) { if (ifa == rt->rt_ifa) break; } if (ifa != NULL) ifa_acquire(ifa, psref); pserialize_read_exit(s); ia = ifatoia(ifa); } if (ia == NULL) { in_port_t fport = sin->sin_port; struct ifaddr *ifa; int s; sin->sin_port = 0; ifa = ifa_ifwithladdr_psref(sintosa(sin), psref); sin->sin_port = fport; if (ifa == NULL) { /* Find 1st non-loopback AF_INET address */ s = pserialize_read_enter(); IN_ADDRLIST_READER_FOREACH(ia) { if (!(ia->ia_ifp->if_flags & IFF_LOOPBACK)) break; } if (ia != NULL) ia4_acquire(ia, psref); pserialize_read_exit(s); } else { /* ia is already referenced by psref */ ia = ifatoia(ifa); } if (ia == NULL) { *errorp = EADDRNOTAVAIL; goto out; } } /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, use the * address of that interface as our source address. 
*/ if (IN_MULTICAST(sin->sin_addr.s_addr) && mopts != NULL) { struct ip_moptions *imo; imo = mopts; if (imo->imo_multicast_if_index != 0) { struct ifnet *ifp; int s; if (ia != NULL) ia4_release(ia, psref); s = pserialize_read_enter(); ifp = if_byindex(imo->imo_multicast_if_index); if (ifp != NULL) { /* XXX */ ia = in_get_ia_from_ifp_psref(ifp, psref); } else ia = NULL; if (ia == NULL || ia->ia4_flags & IN_IFF_NOTREADY) { pserialize_read_exit(s); if (ia != NULL) ia4_release(ia, psref); *errorp = EADDRNOTAVAIL; ia = NULL; goto out; } pserialize_read_exit(s); } } if (ia->ia_ifa.ifa_getifa != NULL) { ia = ifatoia((*ia->ia_ifa.ifa_getifa)(&ia->ia_ifa, sintosa(sin))); if (ia == NULL) { *errorp = EADDRNOTAVAIL; goto out; } /* FIXME NOMPSAFE */ ia4_acquire(ia, psref); } #ifdef GETIFA_DEBUG else printf("%s: missing ifa_getifa\n", __func__); #endif out: rtcache_unref(rt, ro); return ia; } int in_tunnel_validate(const struct ip *ip, struct in_addr src, struct in_addr dst) { struct in_ifaddr *ia4; int s; /* check for address match */ if (src.s_addr != ip->ip_dst.s_addr || dst.s_addr != ip->ip_src.s_addr) return 0; /* martian filters on outer source - NOT done in ip_input! */ if (IN_MULTICAST(ip->ip_src.s_addr)) return 0; switch ((ntohl(ip->ip_src.s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return 0; } /* reject packets with broadcast on source */ s = pserialize_read_enter(); IN_ADDRLIST_READER_FOREACH(ia4) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (ip->ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { pserialize_read_exit(s); return 0; } } pserialize_read_exit(s); /* NOTE: packet may dropped by uRPF */ /* return valid bytes length */ return sizeof(src) + sizeof(dst); } #if NARP > 0 #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in_lltable_destroy_lle(struct llentry *lle) { KASSERTMSG(lle->la_numheld == 0, "la_numheld=%d", lle->la_numheld); LLE_WUNLOCK(lle); LLE_LOCK_DESTROY(lle); llentry_pool_put(lle); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct llentry *lle; lle = llentry_pool_get(PR_NOWAIT); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->r_l3addr.addr4 = addr4; lle->lle_refcnt = 1; lle->lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(lle); callout_init(&lle->la_timer, CALLOUT_MPSAFE); return lle; } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((ntohl((d).s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 ) static int in_lltable_match_prefix(const struct sockaddr *prefix, const struct sockaddr *mask, u_int flags, struct llentry *lle) { const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix; const struct sockaddr_in *msk = (const struct sockaddr_in *)mask; struct in_addr lle_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); /* * (flags & LLE_STATIC) means deleting all entries * including static ARP entries. 
*/ if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, pfx, msk) && ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC))) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL); pkts_dropped = llentry_free(lle); arp_stat_add(ARP_STAT_DFRDROPPED, (uint64_t)pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr, const struct rtentry *rt) { int error = EINVAL; if (rt == NULL) return error; /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ if (rt->rt_flags & RTF_GATEWAY) { if (!(rt->rt_flags & RTF_HOST) || !rt->rt_ifp || rt->rt_ifp->if_type != IFT_ETHER || (rt->rt_ifp->if_flags & IFF_NOARP) != 0 || memcmp(rt->rt_gateway->sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { goto error; } } /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if (!(rt->rt_flags & RTF_HOST) && rt->rt_ifp != ifp) { const char *sa, *mask, *addr, *lim; int len; mask = (const char *)rt_mask(rt); /* * Just being extra cautious to avoid some custom * code getting into trouble. */ if (mask == NULL) goto error; sa = (const char *)rt_getkey(rt); addr = (const char *)l3addr; len = ((const struct sockaddr_in *)l3addr)->sin_len; lim = addr + len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n", inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr)); #endif goto error; } } } error = 0; error: return error; } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static int in_lltable_delete(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp __diagused = llt->llt_ifp; struct llentry *lle; IF_AFDATA_WLOCK_ASSERT(ifp); KASSERTMSG(l3addr->sa_family == AF_INET, "sin_family %d", l3addr->sa_family); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) { #ifdef LLTABLE_DEBUG char buf[64]; sockaddr_format(l3addr, buf, sizeof(buf)); log(LOG_INFO, "%s: cache for %s is not found\n", __func__, buf); #endif return (ENOENT); } LLE_WLOCK(lle); #ifdef LLTABLE_DEBUG { char buf[64]; sockaddr_format(l3addr, buf, 
sizeof(buf)); log(LOG_INFO, "%s: cache for %s (%p) is deleted\n", __func__, buf, lle); } #endif llentry_free(lle); return (0); } static struct llentry * in_lltable_create(struct lltable *llt, u_int flags, const struct sockaddr *l3addr, const struct rtentry *rt) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; IF_AFDATA_WLOCK_ASSERT(ifp); KASSERTMSG(l3addr->sa_family == AF_INET, "sin_family %d", l3addr->sa_family); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle != NULL) { LLE_WLOCK(lle); return (lle); } /* no existing record, we need to create new one */ /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr, rt) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { memcpy(&lle->ll_addr, CLLADDR(ifp->if_sadl), ifp->if_addrlen); lle->la_flags |= (LLE_VALID | LLE_STATIC); } lltable_link_entry(llt, lle); LLE_WLOCK(lle); return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERTMSG(l3addr->sa_family == AF_INET, "sin_family %d", l3addr->sa_family); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return NULL; if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return lle; } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct rt_walkarg *w) { struct sockaddr_in sin; LLTABLE_LOCK_ASSERT(); /* skip deleted entries */ if (lle->la_flags & LLE_DELETED) return 0; sockaddr_in_init(&sin, &lle->r_l3addr.addr4, 0); return lltable_dump_entry(llt, lle, w, sintosa(&sin)); } #endif /* NARP > 0 */ static int in_multicast_sysctl(SYSCTLFN_ARGS) { struct ifnet *ifp; struct ifaddr *ifa; struct in_ifaddr *ifa4; struct in_multi *inm; uint32_t tmp; int error; size_t written; struct psref psref; int bound; if (namelen != 1) return EINVAL; bound = curlwp_bind(); ifp = if_get_byindex(name[0], &psref); if (ifp == NULL) { curlwp_bindx(bound); return ENODEV; } if (oldp == NULL) { *oldlenp = 0; IFADDR_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ifa4 = (void *)ifa; LIST_FOREACH(inm, &ifa4->ia_multiaddrs, inm_list) { *oldlenp += 2 * sizeof(struct in_addr) + sizeof(uint32_t); } } if_put(ifp, &psref); curlwp_bindx(bound); return 0; } error = 0; written = 0; IFADDR_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ifa4 = (void *)ifa; LIST_FOREACH(inm, &ifa4->ia_multiaddrs, inm_list) { if (written + 2 * sizeof(struct in_addr) + sizeof(uint32_t) > *oldlenp) goto done; error = sysctl_copyout(l, &ifa4->ia_addr.sin_addr, oldp, sizeof(struct in_addr)); if (error) goto done; oldp = (char *)oldp + sizeof(struct in_addr); written += sizeof(struct in_addr); error = sysctl_copyout(l, &inm->inm_addr, oldp, sizeof(struct in_addr)); if (error) goto done; oldp = (char *)oldp + sizeof(struct in_addr); written += sizeof(struct in_addr); tmp = inm->inm_refcount; error = sysctl_copyout(l, &tmp, oldp, sizeof(tmp)); if (error) goto done; oldp = (char *)oldp + sizeof(tmp); written 
+= sizeof(tmp); } } done: if_put(ifp, &psref); curlwp_bindx(bound); *oldlenp = written; return error; } static void in_sysctl_init(struct sysctllog **clog) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet", SYSCTL_DESCR("PF_INET related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "multicast", SYSCTL_DESCR("Multicast information"), in_multicast_sysctl, 0, NULL, 0, CTL_NET, PF_INET, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ip", SYSCTL_DESCR("IPv4 related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "subnetsarelocal", SYSCTL_DESCR("Whether logical subnets are considered " "local"), NULL, 0, &subnetsarelocal, 0, CTL_NET, PF_INET, IPPROTO_IP, IPCTL_SUBNETSARELOCAL, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "hostzerobroadcast", SYSCTL_DESCR("All zeroes address is broadcast address"), NULL, 0, &hostzeroisbroadcast, 0, CTL_NET, PF_INET, IPPROTO_IP, IPCTL_HOSTZEROBROADCAST, CTL_EOL); } #if NARP > 0 static struct lltable * in_lltattach(struct ifnet *ifp, struct in_ifinfo *ii) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_create = in_lltable_create; llt->llt_delete = in_lltable_delete; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; #ifdef MBUFTRACE struct mowner *mowner = &ii->ii_mowner; mowner_init_owner(mowner, ifp->if_xname, "arp"); MOWNER_ATTACH(mowner); llt->llt_mowner = mowner; #endif lltable_link(llt); return (llt); } #endif /* NARP > 0 */ void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = kmem_zalloc(sizeof(struct in_ifinfo), KM_SLEEP); #if NARP > 0 ii->ii_llt = in_lltattach(ifp, ii); #endif #ifdef IPSELSRC ii->ii_selsrc = in_selsrc_domifattach(ifp); KASSERT(ii->ii_selsrc != NULL); #endif return ii; } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = aux; #ifdef IPSELSRC in_selsrc_domifdetach(ifp, ii->ii_selsrc); #endif #if NARP > 0 lltable_free(ii->ii_llt); #ifdef MBUFTRACE MOWNER_DETACH(&ii->ii_mowner); #endif #endif kmem_free(ii, sizeof(struct in_ifinfo)); }
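/*
 * Illustrative userland sketch, not part of the NetBSD sources above: it mirrors
 * the prefix-length-to-netmask conversion done by in_len2mask() and the
 * subnet/broadcast derivation that in_ifinit() stores in ia_subnet and
 * ia_broadaddr (ia_subnet = i & ia_subnetmask; ia_broadaddr = ia_subnet |
 * ~ia_subnetmask).  The function name prefix_to_mask and the sample address
 * are inventions for the example; only standard POSIX headers are assumed.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* Host-order equivalent of in_len2mask(): build a netmask from a prefix length. */
static uint32_t
prefix_to_mask(unsigned int len)
{

	return len == 0 ? 0 : (uint32_t)(0xffffffffUL << (32 - len));
}

int
main(void)
{
	struct in_addr a;
	unsigned int plen = 26;			/* e.g. 192.0.2.70/26 */
	uint32_t addr, mask, subnet, bcast;
	char buf[INET_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.70", &a);
	addr = ntohl(a.s_addr);
	mask = prefix_to_mask(plen);
	subnet = addr & mask;			/* like ia_subnet = i & ia_subnetmask */
	bcast = subnet | ~mask;			/* like ia_broadaddr = ia_subnet | ~ia_subnetmask */

	a.s_addr = htonl(subnet);
	printf("subnet    %s/%u\n", inet_ntop(AF_INET, &a, buf, sizeof(buf)), plen);
	a.s_addr = htonl(bcast);
	printf("broadcast %s\n", inet_ntop(AF_INET, &a, buf, sizeof(buf)));
	return 0;
}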
/* $NetBSD: kern_50.c,v 1.3 2020/01/29 15:47:51 ad Exp $ */ /*- * Copyright (c) 2008, 2009, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christos Zoulas. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_50.c,v 1.3 2020/01/29 15:47:51 ad Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/lwp.h> #include <sys/proc.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <compat/sys/resource.h> #include <compat/sys/time.h> #include <compat/common/compat_mod.h> static const struct syscall_package kern_50_syscalls[] = { { SYS_compat_50__lwp_park, 0, (sy_call_t *)compat_50_sys__lwp_park }, { SYS_compat_50___sigtimedwait, 0, (sy_call_t *)compat_50_sys___sigtimedwait }, { SYS_compat_50_wait4, 0, (sy_call_t *)compat_50_sys_wait4 }, { 0, 0, NULL } }; int compat_50_sys__lwp_park(struct lwp *l, const struct compat_50_sys__lwp_park_args *uap, register_t *retval) { /* { syscallarg(const struct timespec50 *) ts; syscallarg(lwpid_t) unpark; syscallarg(const void *) hint; syscallarg(const void *) unparkhint; } */ struct timespec ts, *tsp; struct timespec50 ts50; int error; if (SCARG(uap, ts) == NULL) tsp = NULL; else { error = copyin(SCARG(uap, ts), &ts50, sizeof(ts50)); if (error != 0) return error; timespec50_to_timespec(&ts50, &ts); tsp = &ts; } if (SCARG(uap, unpark) != 0) { error = lwp_unpark(&SCARG(uap, unpark), 1); if (error != 0) return error; } return lwp_park(CLOCK_REALTIME, TIMER_ABSTIME, tsp); } static int tscopyin(const void *u, void *s, size_t len) { struct timespec50 ts50; int error; KASSERT(len == sizeof(struct timespec)); error = copyin(u, &ts50, sizeof(ts50)); if (error) return error; timespec50_to_timespec(&ts50, s); return 0; } static int tscopyout(const void *s, void *u, size_t len) { struct timespec50 ts50; KASSERT(len == sizeof(struct timespec)); timespec_to_timespec50(s, &ts50); return copyout(&ts50, u, sizeof(ts50)); } int compat_50_sys___sigtimedwait(struct lwp *l, const struct compat_50_sys___sigtimedwait_args *uap, register_t *retval) { int res; res = sigtimedwait1(l, (const struct sys_____sigtimedwait50_args *)uap, retval, copyin, copyout, tscopyin, tscopyout); if (!res) *retval = 0; /* XXX NetBSD<=5 was not POSIX compliant */ return res; } int compat_50_sys_wait4(struct lwp *l, const struct compat_50_sys_wait4_args *uap, register_t *retval) { /* { syscallarg(int) pid; syscallarg(int *) status; syscallarg(int) options; syscallarg(struct rusage50 *) rusage; } */ int status, error, pid = SCARG(uap, pid); struct rusage50 ru50; struct rusage ru; error = do_sys_wait(&pid, &status, SCARG(uap, options), SCARG(uap, rusage) != NULL ? &ru : NULL); retval[0] = pid; if (pid == 0) return error; if (SCARG(uap, rusage)) { rusage_to_rusage50(&ru, &ru50); error = copyout(&ru50, SCARG(uap, rusage), sizeof(ru50)); } if (error == 0 && SCARG(uap, status)) error = copyout(&status, SCARG(uap, status), sizeof(status)); return error; } int kern_50_init(void) { return syscall_establish(NULL, kern_50_syscalls); } int kern_50_fini(void) { return syscall_disestablish(NULL, kern_50_syscalls); }
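/*
 * Illustrative userland sketch, not part of kern_50.c above: the timespec50 <->
 * timespec widening/narrowing that tscopyin()/tscopyout() wrap around
 * copyin()/copyout().  The struct timespec50 layout here (32-bit tv_sec) is an
 * assumption standing in for <compat/sys/time.h>; truncation of the narrow
 * tv_sec is exactly the limitation the compat syscalls carry forward.
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

/* Stand-in for the NetBSD<=5 ABI layout from <compat/sys/time.h>. */
struct timespec50 {
	int32_t	tv_sec;
	long	tv_nsec;
};

static void
timespec50_to_timespec(const struct timespec50 *ts50, struct timespec *ts)
{

	ts->tv_sec = ts50->tv_sec;		/* widen the 32-bit seconds field */
	ts->tv_nsec = ts50->tv_nsec;
}

static void
timespec_to_timespec50(const struct timespec *ts, struct timespec50 *ts50)
{

	ts50->tv_sec = (int32_t)ts->tv_sec;	/* truncates for dates past 2038 */
	ts50->tv_nsec = ts->tv_nsec;
}

int
main(void)
{
	struct timespec50 old = { 1000000000, 500 };	/* 2001-09-09 UTC */
	struct timespec now;

	timespec50_to_timespec(&old, &now);
	printf("widened:  %lld.%09ld\n", (long long)now.tv_sec, now.tv_nsec);

	now.tv_sec = (time_t)3000000000LL;		/* past 2038 */
	now.tv_nsec = 0;
	timespec_to_timespec50(&now, &old);
	printf("narrowed (truncated): %d\n", (int)old.tv_sec);
	return 0;
}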
/* $NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $ */ /*- * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Shared support code for kernels built with the DEBUG option. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $"); #include "opt_ddb.h" #include <sys/param.h> #include <sys/proc.h> #include <sys/systm.h> #include <sys/kmem.h> #include <sys/debug.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <uvm/uvm_extern.h> #include <machine/lock.h> /* * Allocation/free validation by pointer address. Introduces * significant overhead and is not enabled by default. Patch * `debug_freecheck' to 1 at boot time to enable. 
*/ #define FREECHECK_BYTES (8*1024*1024) typedef struct fcitem { void *i_addr; struct fcitem *i_next; } fcitem_t; fcitem_t *freecheck_free; __cpu_simple_lock_t freecheck_lock; u_int debug_freecheck; void debug_init(void) { size_t cnt; fcitem_t *i; __cpu_simple_lock_init(&freecheck_lock); if (debug_freecheck) { i = (fcitem_t *)uvm_km_alloc(kernel_map, FREECHECK_BYTES, 0, UVM_KMF_WIRED); if (i == NULL) { printf("freecheck_init: unable to allocate memory"); return; } for (cnt = FREECHECK_BYTES / sizeof(*i); cnt != 0; cnt--) { i->i_next = freecheck_free; freecheck_free = i++; } } } void freecheck_out(void **head, void *addr) { fcitem_t *i; int s; if (!debug_freecheck) return; s = splvm(); __cpu_simple_lock(&freecheck_lock); for (i = *head; i != NULL; i = i->i_next) { if (i->i_addr != addr) continue; __cpu_simple_unlock(&freecheck_lock); splx(s); panic("freecheck_out: %p already out", addr); } if ((i = freecheck_free) != NULL) { freecheck_free = i->i_next; i->i_addr = addr; i->i_next = *head; *head = i; } __cpu_simple_unlock(&freecheck_lock); splx(s); if (i == NULL) { if (atomic_swap_uint(&debug_freecheck, 1) == 0) printf("freecheck_out: no more slots\n"); } } void freecheck_in(void **head, void *addr) { fcitem_t *i; void *pp; int s; if (!debug_freecheck) return; s = splvm(); __cpu_simple_lock(&freecheck_lock); for (i = *head, pp = head; i != NULL; pp = &i->i_next, i = i->i_next) { if (i->i_addr == addr) { *(fcitem_t **)pp = i->i_next; i->i_next = freecheck_free; freecheck_free = i; break; } } __cpu_simple_unlock(&freecheck_lock); splx(s); if (i != NULL) return; #ifdef DDB printf("freecheck_in: %p not out\n", addr); Debugger(); #else panic("freecheck_in: %p not out", addr); #endif }
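/*
 * Illustrative userland sketch of the freecheck idea above, not NetBSD code:
 * every pointer handed out is recorded on a list, check_out() refuses to hand
 * out the same pointer twice, and check_in() catches a pointer that was never
 * out (a double free).  It uses plain malloc and no locking; the kernel
 * version preallocates fcitem_t slots and protects the list with a spin lock.
 * The names check_out/check_in are inventions for the example.
 */
#include <stdio.h>
#include <stdlib.h>

struct item {
	void		*addr;
	struct item	*next;
};

static struct item *outstanding;	/* pointers currently "out" */

/* Record addr as allocated; complain if it is already on the list. */
static void
check_out(void *addr)
{
	struct item *i;

	for (i = outstanding; i != NULL; i = i->next)
		if (i->addr == addr) {
			fprintf(stderr, "check_out: %p already out\n", addr);
			abort();
		}
	i = malloc(sizeof(*i));
	i->addr = addr;
	i->next = outstanding;
	outstanding = i;
}

/* Unlink addr; complain if it was never handed out (double free). */
static void
check_in(void *addr)
{
	struct item *i, **pp;

	for (pp = &outstanding; (i = *pp) != NULL; pp = &i->next)
		if (i->addr == addr) {
			*pp = i->next;
			free(i);
			return;
		}
	fprintf(stderr, "check_in: %p not out\n", addr);
	abort();
}

int
main(void)
{
	void *p = malloc(16);

	check_out(p);
	check_in(p);
	free(p);
	check_in(p);	/* caught: p is checked in a second time */
	return 0;
}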
/* $NetBSD: md_root.c,v 1.19 2015/08/30 05:24:03 uebayasi Exp $ */ /*- * Copyright (c) 1996 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Gordon W. Ross. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: md_root.c,v 1.19 2015/08/30 05:24:03 uebayasi Exp $"); #include "opt_md.h" #include "opt_memory_disk_image.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/reboot.h> #include <dev/md.h> #ifdef MEMORY_DISK_DYNAMIC #ifdef makeoptions_MEMORY_DISK_IMAGE #error MEMORY_DISK_DYNAMIC is not compatible with MEMORY_DISK_IMAGE #endif size_t md_root_size; char *md_root_image; #else /* MEMORY_DISK_DYNAMIC */ #ifdef makeoptions_MEMORY_DISK_IMAGE #ifdef MEMORY_DISK_ROOT_SIZE #error MEMORY_DISK_ROOT_SIZE is not compatible with MEMORY_DISK_IMAGE #endif char md_root_image[] = { #include "md_root_image.h" }; uint32_t md_root_size = sizeof(md_root_image) & ~(DEV_BSIZE - 1); #else /* makeoptions_MEMORY_DISK_IMAGE */ #ifndef MEMORY_DISK_ROOT_SIZE #define MEMORY_DISK_ROOT_SIZE 512 #endif #define ROOTBYTES (MEMORY_DISK_ROOT_SIZE << DEV_BSHIFT) /* * This array will be patched to contain a file-system image. * See the program mdsetimage(8) for details. */ uint32_t md_root_size = ROOTBYTES; char md_root_image[ROOTBYTES] = "|This is the root ramdisk!\n"; #endif /* makeoptions_MEMORY_DISK_IMAGE */ #endif /* MEMORY_DISK_DYNAMIC */ #ifndef MEMORY_DISK_RBFLAGS #define MEMORY_DISK_RBFLAGS RB_AUTOBOOT /* default boot mode */ #endif #ifdef MEMORY_DISK_DYNAMIC void md_root_setconf(char *addr, size_t size) { md_is_root = 1; md_root_image = addr; md_root_size = size; } #endif /* MEMORY_DISK_DYNAMIC */ /* * This is called during pseudo-device attachment. 
*/ #define PBUFLEN sizeof("99999 KB") void md_attach_hook(int unit, struct md_conf *md) { char pbuf[PBUFLEN]; if (unit == 0 && md_is_root) { /* Setup root ramdisk */ md->md_addr = (void *)md_root_image; md->md_size = (size_t)md_root_size; md->md_type = MD_KMEM_FIXED; format_bytes(pbuf, sizeof(pbuf), md->md_size); aprint_verbose("md%d: internal %s image area\n", unit, pbuf); } } /* * This is called during open (i.e. mountroot) */ void md_open_hook(int unit, struct md_conf *md) { if (unit == 0 && md_is_root) { boothowto |= MEMORY_DISK_RBFLAGS; } }
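/*
 * Editor's illustrative sketch (not from md_root.c): under options
 * MEMORY_DISK_DYNAMIC, machine-dependent early boot code can hand the md(4)
 * driver a ramdisk image that is already sitting in wired kernel memory by
 * calling md_root_setconf() before the pseudo-device attaches.  The symbols
 * `bootloader_ramdisk_addr' and `bootloader_ramdisk_size' are hypothetical
 * placeholders for whatever the boot path actually provides.
 */
#if 0	/* example only */
void
example_md_root_hook(char *bootloader_ramdisk_addr,
    size_t bootloader_ramdisk_size)
{

	if (bootloader_ramdisk_size != 0)
		md_root_setconf(bootloader_ramdisk_addr,
		    bootloader_ramdisk_size);
}
#endif	/* example only */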
/* $NetBSD: ufs_dirhash.c,v 1.41 2022/08/07 02:33:47 simonb Exp $ */ /* * Copyright (c) 2001, 2002 Ian Dowse. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.3.2.8 2004/12/08 11:54:13 dwmalone Exp $ */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ufs_dirhash.c,v 1.41 2022/08/07 02:33:47 simonb Exp $"); /* * This implements a hash-based lookup scheme for UFS directories.
*/ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/types.h> #include <sys/hash.h> #include <sys/proc.h> #include <sys/buf.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/pool.h> #include <sys/sysctl.h> #include <sys/atomic.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/dirhash.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_bswap.h> #include <ufs/ufs/ufs_extern.h> /* * Defaults for dirhash cache sizes: * - use up to 1/64th of system memory. * - disable dirhash (set the cache size to 0 bytes) if the * calculated value of hash is less than 2MB. * - cap maximum size of the dirhash cache at 32MB. */ #define DIRHASH_DEFAULT_DIVIDER 64 #define MIN_DEFAULT_DIRHASH_MEM (2 * 1024 * 1024) #define MAX_DEFAULT_DIRHASH_MEM (32 * 1024 * 1024) #define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1)) #define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1)) #define OFSFMT(ip) ((ip)->i_ump->um_maxsymlinklen <= 0) #define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n)) static u_int ufs_dirhashminblks = 5; static u_int ufs_dirhashmaxmem = 0; static u_int ufs_dirhashmem; static u_int ufs_dirhashcheck = 0; static int ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen); static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, int dirblksiz); static void ufsdirhash_delslot(struct dirhash *dh, int slot); static int ufsdirhash_findslot(struct dirhash *dh, const char *name, int namelen, doff_t offset); static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset, int dirblksiz); static int ufsdirhash_recycle(int wanted); static pool_cache_t ufsdirhashblk_cache; static pool_cache_t ufsdirhash_cache; #define DIRHASHLIST_LOCK() mutex_enter(&ufsdirhash_lock) #define DIRHASHLIST_UNLOCK() mutex_exit(&ufsdirhash_lock) #define DIRHASH_LOCK(dh) mutex_enter(&(dh)->dh_lock) #define DIRHASH_UNLOCK(dh) mutex_exit(&(dh)->dh_lock) #define DIRHASH_BLKALLOC() \ pool_cache_get(ufsdirhashblk_cache, PR_NOWAIT) #define DIRHASH_BLKFREE(ptr) \ pool_cache_put(ufsdirhashblk_cache, ptr) /* Dirhash list; recently-used entries are near the tail. */ static TAILQ_HEAD(, dirhash) ufsdirhash_list; /* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */ static kmutex_t ufsdirhash_lock; /* * Locking order: * ufsdirhash_lock * dh_lock * * The dh_lock mutex should be acquired either via the inode lock, or via * ufsdirhash_lock. Only the owner of the inode may free the associated * dirhash, but anything can steal its memory and set dh_hash to NULL. */ /* * Attempt to build up a hash table for the directory contents in * inode 'ip'. Returns 0 on success, or -1 of the operation failed. */ int ufsdirhash_build(struct inode *ip) { struct dirhash *dh; struct buf *bp = NULL; struct direct *ep; struct vnode *vp; doff_t bmask, pos; int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot; const int needswap = UFS_MPNEEDSWAP(ip->i_ump); int dirblksiz = ip->i_ump->um_dirblksiz; /* Check if we can/should use dirhash. */ if (ip->i_dirhash == NULL) { if (ufs_dirhashmaxmem == 0 || ip->i_size < (ufs_dirhashminblks * dirblksiz) || OFSFMT(ip)) return (-1); } else { /* Hash exists, but sysctls could have changed. */ if (ip->i_size < (ufs_dirhashminblks * dirblksiz) || ufs_dirhashmem > ufs_dirhashmaxmem) { ufsdirhash_free(ip); return (-1); } /* Check if hash exists and is intact (note: unlocked read). 
*/ if (ip->i_dirhash->dh_hash != NULL) return (0); /* Free the old, recycled hash and build a new one. */ ufsdirhash_free(ip); } /* Don't hash removed directories. */ if (ip->i_nlink == 0) return (-1); vp = ip->i_vnode; /* Allocate 50% more entries than this dir size could ever need. */ KASSERT(ip->i_size >= dirblksiz); nslots = ip->i_size / UFS_DIRECTSIZ(1); nslots = (nslots * 3 + 1) / 2; narrays = howmany(nslots, DH_NBLKOFF); nslots = narrays * DH_NBLKOFF; dirblocks = howmany(ip->i_size, dirblksiz); nblocks = (dirblocks * 3 + 1) / 2; memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) + narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + nblocks * sizeof(*dh->dh_blkfree); while (atomic_add_int_nv(&ufs_dirhashmem, memreqd) > ufs_dirhashmaxmem) { atomic_add_int(&ufs_dirhashmem, -memreqd); if (memreqd > ufs_dirhashmaxmem / 2) return (-1); /* Try to free some space. */ if (ufsdirhash_recycle(memreqd) != 0) return (-1); else DIRHASHLIST_UNLOCK(); } /* * Use non-blocking mallocs so that we will revert to a linear * lookup on failure rather than potentially blocking forever. */ dh = pool_cache_get(ufsdirhash_cache, PR_NOWAIT); if (dh == NULL) { atomic_add_int(&ufs_dirhashmem, -memreqd); return (-1); } memset(dh, 0, sizeof(*dh)); mutex_init(&dh->dh_lock, MUTEX_DEFAULT, IPL_NONE); DIRHASH_LOCK(dh); dh->dh_hashsz = narrays * sizeof(dh->dh_hash[0]); dh->dh_hash = kmem_zalloc(dh->dh_hashsz, KM_NOSLEEP); dh->dh_blkfreesz = nblocks * sizeof(dh->dh_blkfree[0]); dh->dh_blkfree = kmem_zalloc(dh->dh_blkfreesz, KM_NOSLEEP); if (dh->dh_hash == NULL || dh->dh_blkfree == NULL) goto fail; for (i = 0; i < narrays; i++) { if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL) goto fail; for (j = 0; j < DH_NBLKOFF; j++) dh->dh_hash[i][j] = DIRHASH_EMPTY; } /* Initialise the hash table and block statistics. */ dh->dh_narrays = narrays; dh->dh_hlen = nslots; dh->dh_nblk = nblocks; dh->dh_dirblks = dirblocks; for (i = 0; i < dirblocks; i++) dh->dh_blkfree[i] = dirblksiz / DIRALIGN; for (i = 0; i < DH_NFSTATS; i++) dh->dh_firstfree[i] = -1; dh->dh_firstfree[DH_NFSTATS] = 0; dh->dh_seqopt = 0; dh->dh_seqoff = 0; dh->dh_score = DH_SCOREINIT; ip->i_dirhash = dh; bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; pos = 0; while (pos < ip->i_size) { preempt_point(); /* If necessary, get the next directory block. */ if ((pos & bmask) == 0) { if (bp != NULL) brelse(bp, 0); if (ufs_blkatoff(vp, (off_t)pos, NULL, &bp, false) != 0) goto fail; } /* Add this entry to the hash. */ ep = (struct direct *)((char *)bp->b_data + (pos & bmask)); if (ep->d_reclen == 0 || ep->d_reclen > dirblksiz - (pos & (dirblksiz - 1))) { /* Corrupted directory. */ brelse(bp, 0); goto fail; } if (ep->d_ino != 0) { /* Add the entry (simplified ufsdirhash_add). 
*/ slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen); while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY) slot = WRAPINCR(slot, dh->dh_hlen); dh->dh_hused++; DH_ENTRY(dh, slot) = pos; ufsdirhash_adjfree(dh, pos, -UFS_DIRSIZ(0, ep, needswap), dirblksiz); } pos += ep->d_reclen; } if (bp != NULL) brelse(bp, 0); DIRHASHLIST_LOCK(); TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list); dh->dh_onlist = 1; DIRHASH_UNLOCK(dh); DIRHASHLIST_UNLOCK(); return (0); fail: ip->i_dirhash = NULL; DIRHASH_UNLOCK(dh); if (dh->dh_hash != NULL) { for (i = 0; i < narrays; i++) if (dh->dh_hash[i] != NULL) DIRHASH_BLKFREE(dh->dh_hash[i]); kmem_free(dh->dh_hash, dh->dh_hashsz); } if (dh->dh_blkfree != NULL) kmem_free(dh->dh_blkfree, dh->dh_blkfreesz); mutex_destroy(&dh->dh_lock); pool_cache_put(ufsdirhash_cache, dh); atomic_add_int(&ufs_dirhashmem, -memreqd); return (-1); } /* * Free any hash table associated with inode 'ip'. */ void ufsdirhash_free(struct inode *ip) { struct dirhash *dh; int i, mem; if ((dh = ip->i_dirhash) == NULL) return; ip->i_dirhash = NULL; DIRHASHLIST_LOCK(); if (dh->dh_onlist) TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); DIRHASHLIST_UNLOCK(); /* The dirhash pointed to by 'dh' is exclusively ours now. */ mem = sizeof(*dh); if (dh->dh_hash != NULL) { for (i = 0; i < dh->dh_narrays; i++) DIRHASH_BLKFREE(dh->dh_hash[i]); kmem_free(dh->dh_hash, dh->dh_hashsz); kmem_free(dh->dh_blkfree, dh->dh_blkfreesz); mem += dh->dh_hashsz; mem += dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash); mem += dh->dh_nblk * sizeof(*dh->dh_blkfree); } mutex_destroy(&dh->dh_lock); pool_cache_put(ufsdirhash_cache, dh); atomic_add_int(&ufs_dirhashmem, -mem); } /* * Find the offset of the specified name within the given inode. * Returns 0 on success, ENOENT if the entry does not exist, or * EJUSTRETURN if the caller should revert to a linear search. * * If successful, the directory offset is stored in *offp, and a * pointer to a struct buf containing the entry is stored in *bpp. If * prevoffp is non-NULL, the offset of the previous entry within * the UFS_DIRBLKSIZ-sized block is stored in *prevoffp (if the entry * is the first in a block, the start of the block is used). */ int ufsdirhash_lookup(struct inode *ip, const char *name, int namelen, doff_t *offp, struct buf **bpp, doff_t *prevoffp) { struct dirhash *dh, *dh_next; struct direct *dp; struct vnode *vp; struct buf *bp; doff_t blkoff, bmask, offset, prevoff; int i, slot; const int needswap = UFS_MPNEEDSWAP(ip->i_ump); int dirblksiz = ip->i_ump->um_dirblksiz; if ((dh = ip->i_dirhash) == NULL) return (EJUSTRETURN); /* * Move this dirhash towards the end of the list if it has a * score higher than the next entry, and acquire the dh_lock. * Optimise the case where it's already the last by performing * an unlocked read of the TAILQ_NEXT pointer. * * In both cases, end up holding just dh_lock. */ if (TAILQ_NEXT(dh, dh_list) != NULL) { DIRHASHLIST_LOCK(); DIRHASH_LOCK(dh); /* * If the new score will be greater than that of the next * entry, then move this entry past it. With both mutexes * held, dh_next won't go away, but its dh_score could * change; that's not important since it is just a hint. */ if (dh->dh_hash != NULL && (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL && dh->dh_score >= dh_next->dh_score) { KASSERT(dh->dh_onlist); TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh, dh_list); } DIRHASHLIST_UNLOCK(); } else { /* Already the last, though that could change as we wait. 
*/ DIRHASH_LOCK(dh); } if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return (EJUSTRETURN); } /* Update the score. */ if (dh->dh_score < DH_SCOREMAX) dh->dh_score++; vp = ip->i_vnode; bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; blkoff = -1; bp = NULL; restart: slot = ufsdirhash_hash(dh, name, namelen); if (dh->dh_seqopt) { /* * Sequential access optimisation. dh_seqoff contains the * offset of the directory entry immediately following * the last entry that was looked up. Check if this offset * appears in the hash chain for the name we are looking for. */ for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY; i = WRAPINCR(i, dh->dh_hlen)) if (offset == dh->dh_seqoff) break; if (offset == dh->dh_seqoff) { /* * We found an entry with the expected offset. This * is probably the entry we want, but if not, the * code below will turn off seqoff and retry. */ slot = i; } else dh->dh_seqopt = 0; } for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY; slot = WRAPINCR(slot, dh->dh_hlen)) { if (offset == DIRHASH_DEL) continue; if (offset < 0 || offset >= ip->i_size) panic("ufsdirhash_lookup: bad offset in hash array"); if ((offset & ~bmask) != blkoff) { if (bp != NULL) brelse(bp, 0); blkoff = offset & ~bmask; if (ufs_blkatoff(vp, (off_t)blkoff, NULL, &bp, false) != 0) { DIRHASH_UNLOCK(dh); return (EJUSTRETURN); } } dp = (struct direct *)((char *)bp->b_data + (offset & bmask)); if (dp->d_reclen == 0 || dp->d_reclen > dirblksiz - (offset & (dirblksiz - 1))) { /* Corrupted directory. */ DIRHASH_UNLOCK(dh); brelse(bp, 0); return (EJUSTRETURN); } if (dp->d_namlen == namelen && memcmp(dp->d_name, name, namelen) == 0) { /* Found. Get the prev offset if needed. */ if (prevoffp != NULL) { if (offset & (dirblksiz - 1)) { prevoff = ufsdirhash_getprev(dp, offset, dirblksiz); if (prevoff == -1) { brelse(bp, 0); return (EJUSTRETURN); } } else prevoff = offset; *prevoffp = prevoff; } /* Check for sequential access, and update offset. */ if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset) dh->dh_seqopt = 1; dh->dh_seqoff = offset + UFS_DIRSIZ(0, dp, needswap); DIRHASH_UNLOCK(dh); *bpp = bp; *offp = offset; return (0); } if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); if (bp != NULL) brelse(bp, 0); ufsdirhash_free(ip); return (EJUSTRETURN); } /* * When the name doesn't match in the seqopt case, go back * and search normally. */ if (dh->dh_seqopt) { dh->dh_seqopt = 0; goto restart; } } DIRHASH_UNLOCK(dh); if (bp != NULL) brelse(bp, 0); return (ENOENT); } /* * Find a directory block with room for 'slotneeded' bytes. Returns * the offset of the directory entry that begins the free space. * This will either be the offset of an existing entry that has free * space at the end, or the offset of an entry with d_ino == 0 at * the start of a UFS_DIRBLKSIZ block. * * To use the space, the caller may need to compact existing entries in * the directory. The total number of bytes in all of the entries involved * in the compaction is stored in *slotsize. In other words, all of * the entries that must be compacted are exactly contained in the * region beginning at the returned offset and spanning *slotsize bytes. * * Returns -1 if no space was found, indicating that the directory * must be extended. 
*/ doff_t ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize) { struct direct *dp; struct dirhash *dh; struct buf *bp; doff_t pos, slotstart; int dirblock, error, freebytes, i; const int needswap = UFS_MPNEEDSWAP(ip->i_ump); int dirblksiz = ip->i_ump->um_dirblksiz; if ((dh = ip->i_dirhash) == NULL) return (-1); DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return (-1); } /* Find a directory block with the desired free space. */ dirblock = -1; for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++) if ((dirblock = dh->dh_firstfree[i]) != -1) break; if (dirblock == -1) { DIRHASH_UNLOCK(dh); return (-1); } KASSERT(dirblock < dh->dh_nblk && dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN)); pos = dirblock * dirblksiz; error = ufs_blkatoff(ip->i_vnode, (off_t)pos, (void *)&dp, &bp, false); if (error) { DIRHASH_UNLOCK(dh); return (-1); } /* Find the first entry with free space. */ for (i = 0; i < dirblksiz; ) { if (dp->d_reclen == 0) { DIRHASH_UNLOCK(dh); brelse(bp, 0); return (-1); } if (dp->d_ino == 0 || dp->d_reclen > UFS_DIRSIZ(0, dp, needswap)) break; i += dp->d_reclen; dp = (struct direct *)((char *)dp + dp->d_reclen); } if (i > dirblksiz) { DIRHASH_UNLOCK(dh); brelse(bp, 0); return (-1); } slotstart = pos + i; /* Find the range of entries needed to get enough space */ freebytes = 0; while (i < dirblksiz && freebytes < slotneeded) { freebytes += dp->d_reclen; if (dp->d_ino != 0) freebytes -= UFS_DIRSIZ(0, dp, needswap); if (dp->d_reclen == 0) { DIRHASH_UNLOCK(dh); brelse(bp, 0); return (-1); } i += dp->d_reclen; dp = (struct direct *)((char *)dp + dp->d_reclen); } if (i > dirblksiz) { DIRHASH_UNLOCK(dh); brelse(bp, 0); return (-1); } if (freebytes < slotneeded) panic("ufsdirhash_findfree: free mismatch"); DIRHASH_UNLOCK(dh); brelse(bp, 0); *slotsize = pos + i - slotstart; return (slotstart); } /* * Return the start of the unused space at the end of a directory, or * -1 if there are no trailing unused blocks. */ doff_t ufsdirhash_enduseful(struct inode *ip) { struct dirhash *dh; int i; int dirblksiz = ip->i_ump->um_dirblksiz; if ((dh = ip->i_dirhash) == NULL) return (-1); DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return (-1); } if (dh->dh_blkfree[dh->dh_dirblks - 1] != dirblksiz / DIRALIGN) { DIRHASH_UNLOCK(dh); return (-1); } for (i = dh->dh_dirblks - 1; i >= 0; i--) if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN) break; DIRHASH_UNLOCK(dh); return ((doff_t)(i + 1) * dirblksiz); } /* * Insert information into the hash about a new directory entry. dirp * points to a struct direct containing the entry, and offset specifies * the offset of this entry. */ void ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset) { struct dirhash *dh; int slot; const int needswap = UFS_MPNEEDSWAP(ip->i_ump); int dirblksiz = ip->i_ump->um_dirblksiz; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } KASSERT(offset < dh->dh_dirblks * dirblksiz); /* * Normal hash usage is < 66%. If the usage gets too high then * remove the hash entirely and let it be rebuilt later. */ if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } /* Find a free hash slot (empty or deleted), and add the entry. 
*/ slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen); while (DH_ENTRY(dh, slot) >= 0) slot = WRAPINCR(slot, dh->dh_hlen); if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY) dh->dh_hused++; DH_ENTRY(dh, slot) = offset; /* Update the per-block summary info. */ ufsdirhash_adjfree(dh, offset, -UFS_DIRSIZ(0, dirp, needswap), dirblksiz); DIRHASH_UNLOCK(dh); } /* * Remove the specified directory entry from the hash. The entry to remove * is defined by the name in `dirp', which must exist at the specified * `offset' within the directory. */ void ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset) { struct dirhash *dh; int slot; const int needswap = UFS_MPNEEDSWAP(ip->i_ump); int dirblksiz = ip->i_ump->um_dirblksiz; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } KASSERT(offset < dh->dh_dirblks * dirblksiz); /* Find the entry */ slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset); /* Remove the hash entry. */ ufsdirhash_delslot(dh, slot); /* Update the per-block summary info. */ ufsdirhash_adjfree(dh, offset, UFS_DIRSIZ(0, dirp, needswap), dirblksiz); DIRHASH_UNLOCK(dh); } /* * Change the offset associated with a directory entry in the hash. Used * when compacting directory blocks. */ void ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff, doff_t newoff) { struct dirhash *dh; int slot; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } KASSERT(oldoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz && newoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz); /* Find the entry, and update the offset. */ slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff); DH_ENTRY(dh, slot) = newoff; DIRHASH_UNLOCK(dh); } /* * Inform dirhash that the directory has grown by one block that * begins at offset (i.e. the new length is offset + UFS_DIRBLKSIZ). */ void ufsdirhash_newblk(struct inode *ip, doff_t offset) { struct dirhash *dh; int block; int dirblksiz = ip->i_ump->um_dirblksiz; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } KASSERT(offset == dh->dh_dirblks * dirblksiz); block = offset / dirblksiz; if (block >= dh->dh_nblk) { /* Out of space; must rebuild. */ DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } dh->dh_dirblks = block + 1; /* Account for the new free block. */ dh->dh_blkfree[block] = dirblksiz / DIRALIGN; if (dh->dh_firstfree[DH_NFSTATS] == -1) dh->dh_firstfree[DH_NFSTATS] = block; DIRHASH_UNLOCK(dh); } /* * Inform dirhash that the directory is being truncated. */ void ufsdirhash_dirtrunc(struct inode *ip, doff_t offset) { struct dirhash *dh; int block, i; int dirblksiz = ip->i_ump->um_dirblksiz; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } KASSERT(offset <= dh->dh_dirblks * dirblksiz); block = howmany(offset, dirblksiz); /* * If the directory shrinks to less than 1/8 of dh_nblk blocks * (about 20% of its original size due to the 50% extra added in * ufsdirhash_build) then free it, and let the caller rebuild * if necessary. */ if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } /* * Remove any `first free' information pertaining to the * truncated blocks. All blocks we're removing should be * completely unused. 
*/ if (dh->dh_firstfree[DH_NFSTATS] >= block) dh->dh_firstfree[DH_NFSTATS] = -1; for (i = block; i < dh->dh_dirblks; i++) if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN) panic("ufsdirhash_dirtrunc: blocks in use"); for (i = 0; i < DH_NFSTATS; i++) if (dh->dh_firstfree[i] >= block) panic("ufsdirhash_dirtrunc: first free corrupt"); dh->dh_dirblks = block; DIRHASH_UNLOCK(dh); } /* * Debugging function to check that the dirhash information about * a directory block matches its actual contents. Panics if a mismatch * is detected. * * On entry, `sbuf' should point to the start of an in-core * DIRBLKSIZ-sized directory block, and `offset' should contain the * offset from the start of the directory of that block. */ void ufsdirhash_checkblock(struct inode *ip, char *sbuf, doff_t offset) { struct dirhash *dh; struct direct *dp; int block, ffslot, i, nfree; const int needswap = UFS_MPNEEDSWAP(ip->i_ump); int dirblksiz = ip->i_ump->um_dirblksiz; if (!ufs_dirhashcheck) return; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } block = offset / dirblksiz; if ((offset & (dirblksiz - 1)) != 0 || block >= dh->dh_dirblks) panic("ufsdirhash_checkblock: bad offset"); nfree = 0; for (i = 0; i < dirblksiz; i += dp->d_reclen) { dp = (struct direct *)(sbuf + i); if (dp->d_reclen == 0 || i + dp->d_reclen > dirblksiz) panic("ufsdirhash_checkblock: bad dir"); if (dp->d_ino == 0) { #if 0 /* * XXX entries with d_ino == 0 should only occur * at the start of a DIRBLKSIZ block. However the * ufs code is tolerant of such entries at other * offsets, and fsck does not fix them. */ if (i != 0) panic("ufsdirhash_checkblock: bad dir inode"); #endif nfree += dp->d_reclen; continue; } /* Check that the entry exists (will panic if it doesn't). */ ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i); nfree += dp->d_reclen - UFS_DIRSIZ(0, dp, needswap); } if (i != dirblksiz) panic("ufsdirhash_checkblock: bad dir end"); if (dh->dh_blkfree[block] * DIRALIGN != nfree) panic("ufsdirhash_checkblock: bad free count"); ffslot = BLKFREE2IDX(nfree / DIRALIGN); for (i = 0; i <= DH_NFSTATS; i++) if (dh->dh_firstfree[i] == block && i != ffslot) panic("ufsdirhash_checkblock: bad first-free"); if (dh->dh_firstfree[ffslot] == -1) panic("ufsdirhash_checkblock: missing first-free entry"); DIRHASH_UNLOCK(dh); } /* * Hash the specified filename into a dirhash slot. */ static int ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen) { u_int32_t hash; /* * We hash the name and then some other bit of data that is * invariant over the dirhash's lifetime. Otherwise names * differing only in the last byte are placed close to one * another in the table, which is bad for linear probing. */ hash = hash32_buf(name, namelen, HASH32_BUF_INIT); hash = hash32_buf(&dh, sizeof(dh), hash); return (hash % dh->dh_hlen); } /* * Adjust the number of free bytes in the block containing `offset' * by the value specified by `diff'. * * The caller must ensure we have exclusive access to `dh'; normally * that means that dh_lock should be held, but this is also called * from ufsdirhash_build() where exclusive access can be assumed. */ static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, int dirblksiz) { int block, i, nfidx, ofidx; KASSERT(mutex_owned(&dh->dh_lock)); /* Update the per-block summary info. 
*/ block = offset / dirblksiz; KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks); ofidx = BLKFREE2IDX(dh->dh_blkfree[block]); dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN); nfidx = BLKFREE2IDX(dh->dh_blkfree[block]); /* Update the `first free' list if necessary. */ if (ofidx != nfidx) { /* If removing, scan forward for the next block. */ if (dh->dh_firstfree[ofidx] == block) { for (i = block + 1; i < dh->dh_dirblks; i++) if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx) break; dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1; } /* Make this the new `first free' if necessary */ if (dh->dh_firstfree[nfidx] > block || dh->dh_firstfree[nfidx] == -1) dh->dh_firstfree[nfidx] = block; } } /* * Find the specified name which should have the specified offset. * Returns a slot number, and panics on failure. * * `dh' must be locked on entry and remains so on return. */ static int ufsdirhash_findslot(struct dirhash *dh, const char *name, int namelen, doff_t offset) { int slot; KASSERT(mutex_owned(&dh->dh_lock)); /* Find the entry. */ KASSERT(dh->dh_hused < dh->dh_hlen); slot = ufsdirhash_hash(dh, name, namelen); while (DH_ENTRY(dh, slot) != offset && DH_ENTRY(dh, slot) != DIRHASH_EMPTY) slot = WRAPINCR(slot, dh->dh_hlen); if (DH_ENTRY(dh, slot) != offset) panic("ufsdirhash_findslot: '%.*s' not found", namelen, name); return (slot); } /* * Remove the entry corresponding to the specified slot from the hash array. * * `dh' must be locked on entry and remains so on return. */ static void ufsdirhash_delslot(struct dirhash *dh, int slot) { int i; KASSERT(mutex_owned(&dh->dh_lock)); /* Mark the entry as deleted. */ DH_ENTRY(dh, slot) = DIRHASH_DEL; /* If this is the end of a chain of DIRHASH_DEL slots, remove them. */ for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) i = WRAPINCR(i, dh->dh_hlen); if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) { i = WRAPDECR(i, dh->dh_hlen); while (DH_ENTRY(dh, i) == DIRHASH_DEL) { DH_ENTRY(dh, i) = DIRHASH_EMPTY; dh->dh_hused--; i = WRAPDECR(i, dh->dh_hlen); } KASSERT(dh->dh_hused >= 0); } } /* * Given a directory entry and its offset, find the offset of the * previous entry in the same UFS_DIRBLKSIZ-sized block. Returns an * offset, or -1 if there is no previous entry in the block or some * other problem occurred. */ static doff_t ufsdirhash_getprev(struct direct *dirp, doff_t offset, int dirblksiz) { struct direct *dp; char *blkbuf; doff_t blkoff, prevoff; int entrypos, i; blkoff = offset & ~(dirblksiz - 1); /* offset of start of block */ entrypos = offset & (dirblksiz - 1); /* entry relative to block */ blkbuf = (char *)dirp - entrypos; prevoff = blkoff; /* If `offset' is the start of a block, there is no previous entry. */ if (entrypos == 0) return (-1); /* Scan from the start of the block until we get to the entry. */ for (i = 0; i < entrypos; i += dp->d_reclen) { dp = (struct direct *)(blkbuf + i); if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos) return (-1); /* Corrupted directory. */ prevoff = blkoff + i; } return (prevoff); } /* * Try to free up `wanted' bytes by stealing memory from existing * dirhashes. Returns zero with list locked if successful. */ static int ufsdirhash_recycle(int wanted) { struct dirhash *dh; doff_t **hash; u_int8_t *blkfree; int i, mem, narrays; size_t hashsz, blkfreesz; DIRHASHLIST_LOCK(); while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) { /* Find a dirhash, and lock it. 
*/ if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) { DIRHASHLIST_UNLOCK(); return (-1); } DIRHASH_LOCK(dh); KASSERT(dh->dh_hash != NULL); /* Decrement the score; only recycle if it becomes zero. */ if (--dh->dh_score > 0) { DIRHASH_UNLOCK(dh); DIRHASHLIST_UNLOCK(); return (-1); } /* Remove it from the list and detach its memory. */ TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); dh->dh_onlist = 0; hash = dh->dh_hash; hashsz = dh->dh_hashsz; dh->dh_hash = NULL; blkfree = dh->dh_blkfree; blkfreesz = dh->dh_blkfreesz; dh->dh_blkfree = NULL; narrays = dh->dh_narrays; mem = narrays * sizeof(*dh->dh_hash) + narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + dh->dh_nblk * sizeof(*dh->dh_blkfree); /* Unlock everything, free the detached memory. */ DIRHASH_UNLOCK(dh); DIRHASHLIST_UNLOCK(); for (i = 0; i < narrays; i++) DIRHASH_BLKFREE(hash[i]); kmem_free(hash, hashsz); kmem_free(blkfree, blkfreesz); /* Account for the returned memory, and repeat if necessary. */ DIRHASHLIST_LOCK(); atomic_add_int(&ufs_dirhashmem, -mem); } /* Success. */ return (0); } SYSCTL_SETUP(ufsdirhash_sysctl_init, "ufs_dirhash sysctl") { const struct sysctlnode *rnode, *cnode; sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ufs", SYSCTL_DESCR("ufs"), NULL, 0, NULL, 0, CTL_VFS, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "dirhash", SYSCTL_DESCR("dirhash"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "minblocks", SYSCTL_DESCR("minimum hashed directory size in blocks"), NULL, 0, &ufs_dirhashminblks, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxmem", SYSCTL_DESCR("maximum dirhash memory usage"), NULL, 0, &ufs_dirhashmaxmem, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, "memused", SYSCTL_DESCR("current dirhash memory usage"), NULL, 0, &ufs_dirhashmem, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "docheck", SYSCTL_DESCR("enable extra sanity checks"), NULL, 0, &ufs_dirhashcheck, 0, CTL_CREATE, CTL_EOL); } void ufsdirhash_init(void) { /* * Only initialise defaults for the dirhash size if it hasn't * hasn't been set. */ if (ufs_dirhashmaxmem == 0) { /* Use 64-bit math to avoid overflows. */ uint64_t physmem_bytes, hash_bytes; physmem_bytes = ctob((uint64_t)physmem); hash_bytes = physmem_bytes / DIRHASH_DEFAULT_DIVIDER; if (hash_bytes < MIN_DEFAULT_DIRHASH_MEM) hash_bytes = 0; if (hash_bytes > MAX_DEFAULT_DIRHASH_MEM) hash_bytes = MAX_DEFAULT_DIRHASH_MEM; ufs_dirhashmaxmem = (u_int)hash_bytes; } mutex_init(&ufsdirhash_lock, MUTEX_DEFAULT, IPL_NONE); ufsdirhashblk_cache = pool_cache_init(DH_NBLKOFF * sizeof(daddr_t), 0, 0, 0, "dirhashblk", NULL, IPL_NONE, NULL, NULL, NULL); ufsdirhash_cache = pool_cache_init(sizeof(struct dirhash), 0, 0, 0, "dirhash", NULL, IPL_NONE, NULL, NULL, NULL); TAILQ_INIT(&ufsdirhash_list); } void ufsdirhash_done(void) { KASSERT(TAILQ_EMPTY(&ufsdirhash_list)); pool_cache_destroy(ufsdirhashblk_cache); pool_cache_destroy(ufsdirhash_cache); mutex_destroy(&ufsdirhash_lock); }
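/*
 * Editor's illustrative sketch (not part of ufs_dirhash.c): the calling
 * pattern the interfaces above are designed for.  ufsdirhash_lookup()
 * returns 0 when it found the entry, ENOENT on a definitive miss, and
 * EJUSTRETURN when the caller should fall back to a linear scan of the
 * directory; the hypothetical wrapper `example_dirlookup' mirrors the way
 * the lookup path is meant to use these values.
 */
#if 0	/* example only */
static int
example_dirlookup(struct inode *dp, const char *name, int namelen,
    doff_t *offp, struct buf **bpp)
{
	int error;

	if (ufsdirhash_build(dp) == 0) {
		error = ufsdirhash_lookup(dp, name, namelen, offp, bpp, NULL);
		if (error != EJUSTRETURN)
			return error;	/* 0: found, ENOENT: not present */
	}
	/* No usable hash, or it vanished: caller does the linear scan. */
	return EJUSTRETURN;
}
#endif	/* example only */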
/* $NetBSD: machdep.c,v 1.368 2024/03/05 14:15:28 thorpej Exp $ */ /* * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace * Simulation Facility, NASA Ames Research Center. * * This code is derived from software contributed to The NetBSD Foundation * by Coyote Point Systems, Inc. which was written under contract to Coyote * Point by Jed Davis and Devon O'Dell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * Copyright (c) 2007 Manuel Bouyer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.368 2024/03/05 14:15:28 thorpej Exp $"); #include "opt_modular.h" #include "opt_user_ldt.h" #include "opt_ddb.h" #include "opt_kgdb.h" #include "opt_cpureset_delay.h" #include "opt_mtrr.h" #include "opt_realmem.h" #include "opt_xen.h" #include "opt_svs.h" #include "opt_kaslr.h" #ifndef XENPV #include "opt_physmem.h" #endif #include "isa.h" #include "pci.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/signal.h> #include <sys/signalvar.h> #include <sys/kernel.h> #include <sys/cpu.h> #include <sys/exec.h> #include <sys/exec_aout.h> /* for MID_* */ #include <sys/reboot.h> #include <sys/conf.h> #include <sys/msgbuf.h> #include <sys/mount.h> #include <sys/core.h> #include <sys/kcore.h> #include <sys/ucontext.h> #include <machine/kcore.h> #include <sys/ras.h> #include <sys/syscallargs.h> #include <sys/ksyms.h> #include <sys/device.h> #include <sys/lwp.h> #include <sys/proc.h> #include <sys/asan.h> #include <sys/csan.h> #include <sys/msan.h> #include <sys/module.h> #include <sys/timevar.h> #ifdef KGDB #include <sys/kgdb.h> #endif #include <lib/libkern/entpool.h> /* XXX */ #include <dev/cons.h> #include <dev/mm.h> #include <uvm/uvm.h> #include <uvm/uvm_page.h> #include <sys/sysctl.h> #include <machine/cpu.h> #include <machine/cpu_rng.h> #include <machine/cpufunc.h> #include <machine/gdt.h> #include <machine/intr.h> #include <machine/pio.h> #include <machine/psl.h> #include <machine/reg.h> #include <machine/specialreg.h> #include <machine/bootinfo.h> #include <x86/fpu.h> #include <x86/dbregs.h> #include <machine/mtrr.h> #include <machine/mpbiosvar.h> #include <machine/pmap_private.h> #include <x86/bootspace.h> #include <x86/cputypes.h> #include <x86/cpuvar.h> #include <x86/machdep.h> #include <x86/x86/tsc.h> #include <dev/isa/isareg.h> #include <machine/isa_machdep.h> #include <dev/ic/i8042reg.h> #ifdef XEN #include <xen/xen.h> #include <xen/hypervisor.h> #include <xen/evtchn.h> #include <xen/include/public/version.h> #include <xen/include/public/vcpu.h> #endif /* XEN */ #include <ddb/db_active.h> #ifdef DDB #include <machine/db_machdep.h> #include <ddb/db_extern.h> #include <ddb/db_output.h> #include <ddb/db_interface.h> #endif #include "acpica.h" #if NACPICA > 0 #include <dev/acpi/acpivar.h> #define ACPI_MACHDEP_PRIVATE #include <machine/acpi_machdep.h> #else #include <machine/i82489var.h> #endif #include "isa.h" #include "isadma.h" #include "ksyms.h" /* the following is used externally (sysctl_hw) */ char machine[] = "amd64"; /* CPU "architecture" */ char machine_arch[] = "x86_64"; /* machine == machine_arch */ #ifdef CPURESET_DELAY int cpureset_delay = CPURESET_DELAY; #else int cpureset_delay = 2000; /* default to 2s */ #endif int cpu_class = CPUCLASS_686; #ifdef MTRR const struct mtrr_funcs *mtrr_funcs; #endif int cpu_class; int use_pae; #ifndef NO_SPARSE_DUMP int sparse_dump = 1; paddr_t max_paddr = 0; unsigned char *sparse_dump_physmap; #endif char *dump_headerbuf, *dump_headerbuf_ptr; #define dump_headerbuf_size PAGE_SIZE #define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size) #define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr) daddr_t dump_header_blkno; size_t dump_nmemsegs; size_t dump_npages; size_t dump_header_size; size_t dump_totalbytesleft; vaddr_t idt_vaddr; paddr_t idt_paddr; vaddr_t gdt_vaddr; paddr_t gdt_paddr; vaddr_t ldt_vaddr; paddr_t ldt_paddr; static struct vm_map module_map_store; extern struct bootspace bootspace; extern 
struct slotspace slotspace; vaddr_t vm_min_kernel_address __read_mostly = VM_MIN_KERNEL_ADDRESS_DEFAULT; vaddr_t vm_max_kernel_address __read_mostly = VM_MAX_KERNEL_ADDRESS_DEFAULT; pd_entry_t *pte_base __read_mostly; struct vm_map *phys_map = NULL; extern paddr_t lowmem_rsvd; extern paddr_t avail_start, avail_end; #ifdef XENPV extern paddr_t pmap_pa_start, pmap_pa_end; #endif struct nmistore { uint64_t cr3; uint64_t scratch; } __packed; /* * Size of memory segments, before any memory is stolen. */ phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX]; int mem_cluster_cnt; int cpu_dump(void); int cpu_dumpsize(void); u_long cpu_dump_mempagecnt(void); void dodumpsys(void); void dumpsys(void); static void x86_64_proc0_pcb_ldt_init(void); void dump_misc_init(void); void dump_seg_prep(void); int dump_seg_iter(int (*)(paddr_t, paddr_t)); #ifndef NO_SPARSE_DUMP void sparse_dump_reset(void); void sparse_dump_mark(void); void cpu_dump_prep_sparse(void); #endif void dump_header_start(void); int dump_header_flush(void); int dump_header_addbytes(const void*, size_t); int dump_header_addseg(paddr_t, paddr_t); int dump_header_finish(void); int dump_seg_count_range(paddr_t, paddr_t); int dumpsys_seg(paddr_t, paddr_t); void init_bootspace(void); void init_slotspace(void); void init_x86_64(paddr_t); /* * Machine-dependent startup code */ void cpu_startup(void) { int x, y; vaddr_t minaddr, maxaddr; psize_t sz; /* * For console drivers that require uvm and pmap to be initialized, * we'll give them one more chance here... */ consinit(); /* * Initialize error message buffer (at end of core). */ if (msgbuf_p_cnt == 0) panic("msgbuf paddr map has not been set up"); for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz) continue; msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY); if (msgbuf_vaddr == 0) panic("failed to valloc msgbuf_vaddr"); for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) { for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE) pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz, msgbuf_p_seg[y].paddr + x * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, 0); } pmap_update(pmap_kernel()); initmsgbuf((void *)msgbuf_vaddr, round_page(sz)); minaddr = 0; /* * Allocate a submap for physio. */ phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, VM_PHYS_SIZE, 0, false, NULL); /* * Create the module map. * * The kernel uses RIP-relative addressing with a maximum offset of * 2GB. Because of that, we can't put the kernel modules in kernel_map * (like i386 does), since kernel_map is too far away in memory from * the kernel sections. So we have to create a special module_map. * * The module map is taken as what is left of the bootstrap memory * created in locore/prekern. */ uvm_map_setup(&module_map_store, bootspace.smodule, bootspace.emodule, 0); module_map_store.pmap = pmap_kernel(); module_map = &module_map_store; /* Say hello. */ banner(); #if NISA > 0 || NPCI > 0 /* Safe for i/o port / memory space allocation to use malloc now. 
*/ x86_bus_space_mallocok(); #endif #ifdef __HAVE_PCPU_AREA cpu_pcpuarea_init(&cpu_info_primary); #endif gdt_init(); x86_64_proc0_pcb_ldt_init(); cpu_init_tss(&cpu_info_primary); #if !defined(XENPV) ltr(cpu_info_primary.ci_tss_sel); #endif x86_startup(); } #ifdef XENPV /* used in assembly */ void hypervisor_callback(void); void failsafe_callback(void); void x86_64_switch_context(struct pcb *); void x86_64_tls_switch(struct lwp *); void x86_64_switch_context(struct pcb *new) { HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0); struct physdev_set_iopl set_iopl; set_iopl.iopl = new->pcb_iopl; HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); } void x86_64_tls_switch(struct lwp *l) { struct cpu_info *ci = curcpu(); struct pcb *pcb = lwp_getpcb(l); struct trapframe *tf = l->l_md.md_regs; uint64_t zero = 0; /* * Raise the IPL to IPL_HIGH. XXX Still needed? */ (void)splhigh(); /* Update segment registers */ if (pcb->pcb_flags & PCB_COMPAT32) { update_descriptor(&ci->ci_gdt[GUFS_SEL], &pcb->pcb_fs); update_descriptor(&ci->ci_gdt[GUGS_SEL], &pcb->pcb_gs); setds(GSEL(GUDATA32_SEL, SEL_UPL)); setes(GSEL(GUDATA32_SEL, SEL_UPL)); setfs(GSEL(GUDATA32_SEL, SEL_UPL)); HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs); } else { update_descriptor(&ci->ci_gdt[GUFS_SEL], &zero); update_descriptor(&ci->ci_gdt[GUGS_SEL], &zero); setds(GSEL(GUDATA_SEL, SEL_UPL)); setes(GSEL(GUDATA_SEL, SEL_UPL)); setfs(0); HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0); HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs); HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs); } } #endif /* XENPV */ /* * Set up proc0's PCB and LDT. */ static void x86_64_proc0_pcb_ldt_init(void) { struct lwp *l = &lwp0; struct pcb *pcb = lwp_getpcb(l); pcb->pcb_flags = 0; pcb->pcb_fs = 0; pcb->pcb_gs = 0; pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + USPACE - 16) & ~0xf; pcb->pcb_iopl = IOPL_KPL; pcb->pcb_dbregs = NULL; pcb->pcb_cr0 = rcr0() & ~CR0_TS; l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1; #if !defined(XENPV) lldt(GSYSSEL(GLDT_SEL, SEL_KPL)); #else xen_set_ldt((vaddr_t)ldtstore, LDT_SIZE >> 3); /* Reset TS bit and set kernel stack for interrupt handlers */ HYPERVISOR_fpu_taskswitch(1); HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0); struct physdev_set_iopl set_iopl; set_iopl.iopl = pcb->pcb_iopl; HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); #endif } /* * Set up TSS and I/O bitmap. 
*/ void cpu_init_tss(struct cpu_info *ci) { #ifdef __HAVE_PCPU_AREA const cpuid_t cid = cpu_index(ci); #endif struct cpu_tss *cputss; struct nmistore *store; uintptr_t p; #ifdef __HAVE_PCPU_AREA cputss = (struct cpu_tss *)&pcpuarea->ent[cid].tss; #else cputss = (struct cpu_tss *)uvm_km_alloc(kernel_map, sizeof(struct cpu_tss), 0, UVM_KMF_WIRED|UVM_KMF_ZERO); #endif cputss->tss.tss_iobase = IOMAP_INVALOFF << 16; /* DDB stack */ #ifdef __HAVE_PCPU_AREA p = (vaddr_t)&pcpuarea->ent[cid].ist0; #else p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); #endif cputss->tss.tss_ist[0] = p + PAGE_SIZE - 16; /* double fault */ #ifdef __HAVE_PCPU_AREA p = (vaddr_t)&pcpuarea->ent[cid].ist1; #else p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); #endif cputss->tss.tss_ist[1] = p + PAGE_SIZE - 16; /* NMI - store a structure at the top of the stack */ #ifdef __HAVE_PCPU_AREA p = (vaddr_t)&pcpuarea->ent[cid].ist2; #else p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); #endif cputss->tss.tss_ist[2] = p + PAGE_SIZE - sizeof(struct nmistore); store = (struct nmistore *)(p + PAGE_SIZE - sizeof(struct nmistore)); store->cr3 = pmap_pdirpa(pmap_kernel(), 0); /* DB */ #ifdef __HAVE_PCPU_AREA p = (vaddr_t)&pcpuarea->ent[cid].ist3; #else p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); #endif cputss->tss.tss_ist[3] = p + PAGE_SIZE - 16; ci->ci_tss = cputss; ci->ci_tss_sel = tss_alloc(&cputss->tss); } void buildcontext(struct lwp *l, void *catcher, void *f) { struct trapframe *tf = l->l_md.md_regs; tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL); tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL); tf->tf_rip = (uint64_t)catcher; tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); tf->tf_rflags &= ~PSL_CLEARSIG; tf->tf_rsp = (uint64_t)f; tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); /* Ensure FP state is sane */ fpu_sigreset(l); } void sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask) { printf("sendsig_sigcontext: illegal\n"); sigexit(curlwp, SIGILL); } void sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask) { struct lwp *l = curlwp; struct proc *p = l->l_proc; struct sigacts *ps = p->p_sigacts; int onstack, error; int sig = ksi->ksi_signo; struct sigframe_siginfo *fp, frame; sig_t catcher = SIGACTION(p, sig).sa_handler; struct trapframe *tf = l->l_md.md_regs; char *sp; KASSERT(mutex_owned(p->p_lock)); /* Do we need to jump onto the signal stack? */ onstack = (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0; /* Allocate space for the signal handler context. */ if (onstack) sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size); else /* AMD64 ABI 128-bytes "red zone". */ sp = (char *)tf->tf_rsp - 128; sp -= sizeof(struct sigframe_siginfo); /* Round down the stackpointer to a multiple of 16 for the ABI. */ fp = (struct sigframe_siginfo *)(((unsigned long)sp & ~15) - 8); memset(&frame, 0, sizeof(frame)); frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp; frame.sf_si._info = ksi->ksi_info; frame.sf_uc.uc_flags = _UC_SIGMASK; frame.sf_uc.uc_sigmask = *mask; frame.sf_uc.uc_link = l->l_ctxlink; frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK) ? _UC_SETSTACK : _UC_CLRSTACK; sendsig_reset(l, sig); mutex_exit(p->p_lock); cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags); /* Copyout all the fp regs, the signal handler might expect them. 
*/ error = copyout(&frame, fp, sizeof frame); mutex_enter(p->p_lock); if (error != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ sigexit(l, SIGILL); /* NOTREACHED */ } buildcontext(l, catcher, fp); tf->tf_rdi = sig; tf->tf_rsi = (uint64_t)&fp->sf_si; tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc; /* Remember that we're now on the signal stack. */ if (onstack) l->l_sigstk.ss_flags |= SS_ONSTACK; if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) { /* * process has given an invalid address for the * handler. Stop it, but do not do it before so * we can return the right info to userland (or in core dump) */ sigexit(l, SIGILL); /* NOTREACHED */ } } struct pcb dumppcb; void cpu_reboot(int howto, char *bootstr) { static bool syncdone = false; int s = IPL_NONE; __USE(s); /* ugly otherwise */ if (cold) { howto |= RB_HALT; goto haltsys; } boothowto = howto; /* i386 maybe_dump() */ /* * If we've panic'd, don't make the situation potentially * worse by syncing or unmounting the file systems. */ if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) { if (!syncdone) { syncdone = true; /* XXX used to force unmount as well, here */ vfs_sync_all(curlwp); } while (vfs_unmountall1(curlwp, false, false) || config_detach_all(boothowto) || vfs_unmount_forceone(curlwp)) ; /* do nothing */ } else { if (!db_active) suspendsched(); } pmf_system_shutdown(boothowto); /* Disable interrupts. */ s = splhigh(); /* Do a dump if requested. */ if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP) dumpsys(); haltsys: doshutdownhooks(); if ((howto & RB_POWERDOWN) == RB_POWERDOWN) { #if NACPICA > 0 if (s != IPL_NONE) splx(s); acpi_enter_sleep_state(ACPI_STATE_S5); #endif #ifdef XEN if (vm_guest == VM_GUEST_XENPV || vm_guest == VM_GUEST_XENPVH || vm_guest == VM_GUEST_XENPVHVM) HYPERVISOR_shutdown(); #endif /* XEN */ } cpu_broadcast_halt(); if (howto & RB_HALT) { #if NACPICA > 0 acpi_disable(); #endif printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); cnpollc(1); /* for proper keyboard command handling */ if (cngetc() == 0) { /* no console attached, so just hlt */ printf("No keyboard - cannot reboot after all.\n"); for(;;) { x86_hlt(); } } cnpollc(0); } printf("rebooting...\n"); if (cpureset_delay > 0) delay(cpureset_delay * 1000); cpu_reset(); for(;;) ; /*NOTREACHED*/ } /* * XXXfvdl share dumpcode. */ /* * Perform assorted dump-related initialization tasks. Assumes that * the maximum physical memory address will not increase afterwards. */ void dump_misc_init(void) { #ifndef NO_SPARSE_DUMP int i; #endif if (dump_headerbuf != NULL) return; /* already called */ #ifndef NO_SPARSE_DUMP for (i = 0; i < mem_cluster_cnt; ++i) { paddr_t top = mem_clusters[i].start + mem_clusters[i].size; if (max_paddr < top) max_paddr = top; } #ifdef DEBUG printf("dump_misc_init: max_paddr = 0x%lx\n", (unsigned long)max_paddr); #endif if (max_paddr == 0) { printf("Your machine does not initialize mem_clusters; " "sparse_dumps disabled\n"); sparse_dump = 0; } else { sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map, roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE), PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO); } #endif dump_headerbuf = (void *)uvm_km_alloc(kernel_map, dump_headerbuf_size, PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO); /* XXXjld should check for failure here, disable dumps if so. */ } #ifndef NO_SPARSE_DUMP /* * Clear the set of pages to include in a sparse dump. 
*/ void sparse_dump_reset(void) { memset(sparse_dump_physmap, 0, roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE)); } /* * Include or exclude pages in a sparse dump. */ void sparse_dump_mark(void) { paddr_t p, pstart, pend; struct vm_page *pg; int i; uvm_physseg_t upm; /* * Mark all memory pages, then unmark pages that are uninteresting. * Dereferenceing pg->uobject might crash again if another CPU * frees the object out from under us, but we can't lock anything * so it's a risk we have to take. */ for (i = 0; i < mem_cluster_cnt; ++i) { pstart = mem_clusters[i].start / PAGE_SIZE; pend = pstart + mem_clusters[i].size / PAGE_SIZE; for (p = pstart; p < pend; p++) { setbit(sparse_dump_physmap, p); } } for (upm = uvm_physseg_get_first(); uvm_physseg_valid_p(upm); upm = uvm_physseg_get_next(upm)) { paddr_t pfn; /* * We assume that seg->start to seg->end are * uvm_page_physload()ed */ for (pfn = uvm_physseg_get_start(upm); pfn < uvm_physseg_get_end(upm); pfn++) { pg = PHYS_TO_VM_PAGE(ptoa(pfn)); if (pg->uanon || (pg->flags & PG_FREE) || (pg->uobject && pg->uobject->pgops)) { p = VM_PAGE_TO_PHYS(pg) / PAGE_SIZE; clrbit(sparse_dump_physmap, p); } } } } /* * Machine-dependently decides on the contents of a sparse dump, using * the above. */ void cpu_dump_prep_sparse(void) { sparse_dump_reset(); /* XXX could the alternate recursive page table be skipped? */ sparse_dump_mark(); /* Memory for I/O buffers could be unmarked here, for example. */ /* The kernel text could also be unmarked, but gdb would be upset. */ } #endif /* * Abstractly iterate over the collection of memory segments to be * dumped; the callback lacks the customary environment-pointer * argument because none of the current users really need one. * * To be used only after dump_seg_prep is called to set things up. */ int dump_seg_iter(int (*callback)(paddr_t, paddr_t)) { int error, i; #define CALLBACK(start,size) do { \ error = callback(start,size); \ if (error) \ return error; \ } while(0) for (i = 0; i < mem_cluster_cnt; ++i) { #ifndef NO_SPARSE_DUMP /* * The bitmap is scanned within each memory segment, * rather than over its entire domain, in case any * pages outside of the memory proper have been mapped * into kva; they might be devices that wouldn't * appreciate being arbitrarily read, and including * them could also break the assumption that a sparse * dump will always be smaller than a full one. */ if (sparse_dump && sparse_dump_physmap) { paddr_t p, sp_start, sp_end; int lastset; sp_start = mem_clusters[i].start; sp_end = sp_start + mem_clusters[i].size; sp_start = rounddown(sp_start, PAGE_SIZE); /* unnecessary? */ lastset = 0; for (p = sp_start; p < sp_end; p += PAGE_SIZE) { int thisset = isset(sparse_dump_physmap, p/PAGE_SIZE); if (!lastset && thisset) sp_start = p; if (lastset && !thisset) CALLBACK(sp_start, p - sp_start); lastset = thisset; } if (lastset) CALLBACK(sp_start, p - sp_start); } else #endif CALLBACK(mem_clusters[i].start, mem_clusters[i].size); } return 0; #undef CALLBACK } /* * Prepare for an impending core dump: decide what's being dumped and * how much space it will take up. 
*/ void dump_seg_prep(void) { #ifndef NO_SPARSE_DUMP if (sparse_dump && sparse_dump_physmap) cpu_dump_prep_sparse(); #endif dump_nmemsegs = 0; dump_npages = 0; dump_seg_iter(dump_seg_count_range); dump_header_size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) + ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t)); dump_header_size = roundup(dump_header_size, dbtob(1)); /* * savecore(8) will read this to decide how many pages to * copy, and cpu_dumpconf has already used the pessimistic * value to set dumplo, so it's time to tell the truth. */ dumpsize = dump_npages; /* XXX could these just be one variable? */ } int dump_seg_count_range(paddr_t start, paddr_t size) { ++dump_nmemsegs; dump_npages += size / PAGE_SIZE; return 0; } /* * A sparse dump's header may be rather large, due to the number of * "segments" emitted. These routines manage a simple output buffer, * so that the header can be written to disk incrementally. */ void dump_header_start(void) { dump_headerbuf_ptr = dump_headerbuf; dump_header_blkno = dumplo; } int dump_header_flush(void) { const struct bdevsw *bdev; size_t to_write; int error; bdev = bdevsw_lookup(dumpdev); to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1)); error = bdev->d_dump(dumpdev, dump_header_blkno, dump_headerbuf, to_write); dump_header_blkno += btodb(to_write); dump_headerbuf_ptr = dump_headerbuf; return error; } int dump_header_addbytes(const void* vptr, size_t n) { const char* ptr = vptr; int error; while (n > dump_headerbuf_avail) { memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail); ptr += dump_headerbuf_avail; n -= dump_headerbuf_avail; dump_headerbuf_ptr = dump_headerbuf_end; error = dump_header_flush(); if (error) return error; } memcpy(dump_headerbuf_ptr, ptr, n); dump_headerbuf_ptr += n; return 0; } int dump_header_addseg(paddr_t start, paddr_t size) { phys_ram_seg_t seg = { start, size }; return dump_header_addbytes(&seg, sizeof(seg)); } int dump_header_finish(void) { memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail); return dump_header_flush(); } /* * These variables are needed by /sbin/savecore */ uint32_t dumpmag = 0x8fca0101; /* magic number */ int dumpsize = 0; /* pages */ long dumplo = 0; /* blocks */ /* * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers * for a full (non-sparse) dump. */ int cpu_dumpsize(void) { int size; size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) + ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t)); if (roundup(size, dbtob(1)) != dbtob(1)) return (-1); return (1); } /* * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped * for a full (non-sparse) dump. */ u_long cpu_dump_mempagecnt(void) { u_long i, n; n = 0; for (i = 0; i < mem_cluster_cnt; i++) n += atop(mem_clusters[i].size); return (n); } /* * cpu_dump: dump the machine-dependent kernel core dump headers. */ int cpu_dump(void) { kcore_seg_t seg; cpu_kcore_hdr_t cpuhdr; const struct bdevsw *bdev; bdev = bdevsw_lookup(dumpdev); if (bdev == NULL) return (ENXIO); /* * Generate a segment header. */ CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU); seg.c_size = dump_header_size - ALIGN(sizeof(seg)); (void)dump_header_addbytes(&seg, ALIGN(sizeof(seg))); /* * Add the machine-dependent header info. */ cpuhdr.ptdpaddr = PDPpaddr; cpuhdr.nmemsegs = dump_nmemsegs; (void)dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr))); /* * Write out the memory segment descriptors. 
*/ return dump_seg_iter(dump_header_addseg); } /* * Doadump comes here after turning off memory management and * getting on the dump stack, either when called above, or by * the auto-restart code. */ #define BYTES_PER_DUMP PAGE_SIZE /* must be a multiple of pagesize XXX small */ static vaddr_t dumpspace; vaddr_t reserve_dumppages(vaddr_t p) { dumpspace = p; return (p + BYTES_PER_DUMP); } int dumpsys_seg(paddr_t maddr, paddr_t bytes) { u_long i, m, n; daddr_t blkno; const struct bdevsw *bdev; int (*dump)(dev_t, daddr_t, void *, size_t); int error; if (dumpdev == NODEV) return ENODEV; bdev = bdevsw_lookup(dumpdev); if (bdev == NULL || bdev->d_psize == NULL) return ENODEV; dump = bdev->d_dump; blkno = dump_header_blkno; for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) { /* Print out how many MBs we have left to go. */ if ((dump_totalbytesleft % (1024*1024)) == 0) printf_nolog("%lu ", (unsigned long) (dump_totalbytesleft / (1024 * 1024))); /* Limit size for next transfer. */ n = bytes - i; if (n > BYTES_PER_DUMP) n = BYTES_PER_DUMP; for (m = 0; m < n; m += NBPG) pmap_kenter_pa(dumpspace + m, maddr + m, VM_PROT_READ, 0); pmap_update(pmap_kernel()); error = (*dump)(dumpdev, blkno, (void *)dumpspace, n); pmap_kremove_local(dumpspace, n); if (error) return error; maddr += n; blkno += btodb(n); /* XXX? */ #if 0 /* XXX this doesn't work. grr. */ /* operator aborting dump? */ if (sget() != NULL) return EINTR; #endif } dump_header_blkno = blkno; return 0; } void dodumpsys(void) { const struct bdevsw *bdev; int dumpend, psize; int error; if (dumpdev == NODEV) return; bdev = bdevsw_lookup(dumpdev); if (bdev == NULL || bdev->d_psize == NULL) return; /* * For dumps during autoconfiguration, * if dump device has already configured... */ if (dumpsize == 0) cpu_dumpconf(); printf("\ndumping to dev %llu,%llu (offset=%ld, size=%d):", (unsigned long long)major(dumpdev), (unsigned long long)minor(dumpdev), dumplo, dumpsize); if (dumplo <= 0 || dumpsize <= 0) { printf(" not possible\n"); return; } psize = bdev_size(dumpdev); printf("\ndump "); if (psize == -1) { printf("area unavailable\n"); return; } #if 0 /* XXX this doesn't work. grr. */ /* toss any characters present prior to dump */ while (sget() != NULL); /*syscons and pccons differ */ #endif dump_seg_prep(); dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages); if (dumpend > psize) { printf("failed: insufficient space (%d < %d)\n", psize, dumpend); goto failed; } dump_header_start(); if ((error = cpu_dump()) != 0) goto err; if ((error = dump_header_finish()) != 0) goto err; if (dump_header_blkno != dumplo + btodb(dump_header_size)) { printf("BAD header size (%ld [written] != %ld [expected])\n", (long)(dump_header_blkno - dumplo), (long)btodb(dump_header_size)); goto failed; } dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP); error = dump_seg_iter(dumpsys_seg); if (error == 0 && dump_header_blkno != dumpend) { printf("BAD dump size (%ld [written] != %ld [expected])\n", (long)(dumpend - dumplo), (long)(dump_header_blkno - dumplo)); goto failed; } err: switch (error) { case ENXIO: printf("device bad\n"); break; case EFAULT: printf("device not ready\n"); break; case EINVAL: printf("area improper\n"); break; case EIO: printf("i/o error\n"); break; case EINTR: printf("aborted from console\n"); break; case 0: printf("succeeded\n"); break; default: printf("error %d\n", error); break; } failed: printf("\n\n"); delay(5000000); /* 5 seconds */ } /* * This is called by main to set dumplo and dumpsize. 
* Dumps always skip the first PAGE_SIZE of disk space * in case there might be a disk label stored there. * If there is extra space, put dump at the end to * reduce the chance that swapping trashes it. * * Sparse dumps can't placed as close to the end as possible, because * savecore(8) has to know where to start reading in the dump device * before it has access to any of the crashed system's state. * * Note also that a sparse dump will never be larger than a full one: * in order to add a phys_ram_seg_t to the header, at least one page * must be removed. */ void cpu_dumpconf(void) { int nblks, dumpblks; /* size of dump area */ if (dumpdev == NODEV) goto bad; nblks = bdev_size(dumpdev); if (nblks <= ctod(1)) goto bad; dumpblks = cpu_dumpsize(); if (dumpblks < 0) goto bad; /* dumpsize is in page units, and doesn't include headers. */ dumpsize = cpu_dump_mempagecnt(); dumpblks += ctod(dumpsize); /* If dump won't fit (incl. room for possible label), punt. */ if (dumpblks > (nblks - ctod(1))) { #ifndef NO_SPARSE_DUMP /* A sparse dump might (and hopefully will) fit. */ dumplo = ctod(1); #else /* But if we're not configured for that, punt. */ goto bad; #endif } else { /* Put dump at end of partition */ dumplo = nblks - dumpblks; } /* Now that we've decided this will work, init ancillary stuff. */ dump_misc_init(); return; bad: dumpsize = 0; } /* * Clear registers on exec */ void setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack) { struct pcb *pcb = lwp_getpcb(l); struct trapframe *tf; #ifdef USER_LDT pmap_ldt_cleanup(l); #endif fpu_clear(l, pack->ep_osversion >= 699002600 ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__); x86_dbregs_clear(l); kpreempt_disable(); pcb->pcb_flags = 0; l->l_proc->p_flag &= ~PK_32; l->l_md.md_flags = MDL_IRET; cpu_segregs64_zero(l); kpreempt_enable(); tf = l->l_md.md_regs; tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); tf->tf_rdi = 0; tf->tf_rsi = 0; tf->tf_rbp = 0; tf->tf_rbx = l->l_proc->p_psstrp; tf->tf_rdx = 0; tf->tf_rcx = 0; tf->tf_rax = 0; tf->tf_rip = pack->ep_entry; tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL); tf->tf_rflags = PSL_USERSET; tf->tf_rsp = stack; tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL); } /* * Initialize segments and descriptor tables */ char *ldtstore; char *gdtstore; void setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl, int sel) { vaddr_t vaddr; vaddr = ((vaddr_t)gd) & ~PAGE_MASK; kpreempt_disable(); pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE); gd->gd_looffset = (uint64_t)func & 0xffff; gd->gd_selector = sel; gd->gd_ist = ist; gd->gd_type = type; gd->gd_dpl = dpl; gd->gd_p = 1; gd->gd_hioffset = (uint64_t)func >> 16; gd->gd_zero = 0; gd->gd_xx1 = 0; gd->gd_xx2 = 0; gd->gd_xx3 = 0; pmap_changeprot_local(vaddr, VM_PROT_READ); kpreempt_enable(); } void unsetgate(struct gate_descriptor *gd) { vaddr_t vaddr; vaddr = ((vaddr_t)gd) & ~PAGE_MASK; kpreempt_disable(); pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE); memset(gd, 0, sizeof (*gd)); pmap_changeprot_local(vaddr, VM_PROT_READ); kpreempt_enable(); } void setregion(struct region_descriptor *rd, void *base, uint16_t limit) { rd->rd_limit = limit; rd->rd_base = (uint64_t)base; } /* * Note that the base and limit fields are ignored in long mode. 
*/ void set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit, int type, int dpl, int gran, int def32, int is64) { sd->sd_lolimit = (unsigned)limit; sd->sd_lobase = (unsigned long)base; sd->sd_type = type; sd->sd_dpl = dpl; sd->sd_p = 1; sd->sd_hilimit = (unsigned)limit >> 16; sd->sd_avl = 0; sd->sd_long = is64; sd->sd_def32 = def32; sd->sd_gran = gran; sd->sd_hibase = (unsigned long)base >> 24; } void set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit, int type, int dpl, int gran) { memset(sd, 0, sizeof *sd); sd->sd_lolimit = (unsigned)limit; sd->sd_lobase = (uint64_t)base; sd->sd_type = type; sd->sd_dpl = dpl; sd->sd_p = 1; sd->sd_hilimit = (unsigned)limit >> 16; sd->sd_gran = gran; sd->sd_hibase = (uint64_t)base >> 24; } void cpu_init_idt(struct cpu_info *ci) { struct region_descriptor region; idt_descriptor_t *idt; idt = ci->ci_idtvec.iv_idt; setregion(&region, idt, NIDT * sizeof(idt[0]) - 1); lidt(&region); } #define IDTVEC(name) __CONCAT(X, name) typedef void (vector)(void); extern vector IDTVEC(syscall); extern vector IDTVEC(syscall32); extern vector IDTVEC(osyscall); extern vector *x86_exceptions[]; #ifndef XENPV static void init_x86_64_ksyms(void) { #if NKSYMS || defined(DDB) || defined(MODULAR) extern int end; extern int *esym; struct btinfo_symtab *symtab; vaddr_t tssym, tesym; #ifdef DDB db_machine_init(); #endif symtab = lookup_bootinfo(BTINFO_SYMTAB); if (symtab) { #ifdef KASLR tssym = bootspace.head.va; tesym = bootspace.head.va; /* (unused...) */ #else tssym = (vaddr_t)symtab->ssym + KERNBASE; tesym = (vaddr_t)symtab->esym + KERNBASE; #endif ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym); } else { uintptr_t endp = (uintptr_t)(void *)&end; ksyms_addsyms_elf(*(long *)endp, ((long *)endp) + 1, esym); } #endif } #endif /* XENPV */ void __noasan init_bootspace(void) { extern char __rodata_start; extern char __data_start; extern char __kernel_end; size_t i = 0; memset(&bootspace, 0, sizeof(bootspace)); bootspace.head.va = KERNTEXTOFF; bootspace.head.pa = KERNTEXTOFF - KERNBASE; bootspace.head.sz = 0; bootspace.segs[i].type = BTSEG_TEXT; bootspace.segs[i].va = KERNTEXTOFF; bootspace.segs[i].pa = KERNTEXTOFF - KERNBASE; bootspace.segs[i].sz = (size_t)&__rodata_start - KERNTEXTOFF; i++; bootspace.segs[i].type = BTSEG_RODATA; bootspace.segs[i].va = (vaddr_t)&__rodata_start; bootspace.segs[i].pa = (paddr_t)&__rodata_start - KERNBASE; bootspace.segs[i].sz = (size_t)&__data_start - (size_t)&__rodata_start; i++; bootspace.segs[i].type = BTSEG_DATA; bootspace.segs[i].va = (vaddr_t)&__data_start; bootspace.segs[i].pa = (paddr_t)&__data_start - KERNBASE; bootspace.segs[i].sz = (size_t)&__kernel_end - (size_t)&__data_start; i++; bootspace.boot.va = (vaddr_t)&__kernel_end; bootspace.boot.pa = (paddr_t)&__kernel_end - KERNBASE; bootspace.boot.sz = (size_t)(atdevbase + IOM_SIZE) - (size_t)&__kernel_end; /* In locore.S, we allocated a tmp va. We will use it now. */ bootspace.spareva = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2; /* Virtual address of the L4 page. */ bootspace.pdir = (vaddr_t)(PDPpaddr + KERNBASE); /* Kernel module map. 
*/ bootspace.smodule = (vaddr_t)atdevbase + IOM_SIZE; bootspace.emodule = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2; } static void init_pte(void) { #ifndef XENPV extern uint32_t nox_flag; pd_entry_t *pdir = (pd_entry_t *)bootspace.pdir; pdir[L4_SLOT_PTE] = PDPpaddr | PTE_W | ((uint64_t)nox_flag << 32) | PTE_P; #endif extern pd_entry_t *normal_pdes[3]; normal_pdes[0] = L2_BASE; normal_pdes[1] = L3_BASE; normal_pdes[2] = L4_BASE; } void init_slotspace(void) { /* * XXX Too early to use cprng(9), or even entropy_extract. */ struct entpool pool; size_t randhole; vaddr_t randva; uint64_t sample; vaddr_t va; memset(&pool, 0, sizeof pool); cpu_rng_early_sample(&sample); entpool_enter(&pool, &sample, sizeof sample); memset(&slotspace, 0, sizeof(slotspace)); /* User. [256, because we want to land in >= 256] */ slotspace.area[SLAREA_USER].sslot = 0; slotspace.area[SLAREA_USER].nslot = PDIR_SLOT_USERLIM+1; slotspace.area[SLAREA_USER].active = true; #ifdef XENPV /* PTE. */ slotspace.area[SLAREA_PTE].sslot = PDIR_SLOT_PTE; slotspace.area[SLAREA_PTE].nslot = 1; slotspace.area[SLAREA_PTE].active = true; #endif #ifdef __HAVE_PCPU_AREA /* Per-CPU. */ slotspace.area[SLAREA_PCPU].sslot = PDIR_SLOT_PCPU; slotspace.area[SLAREA_PCPU].nslot = 1; slotspace.area[SLAREA_PCPU].active = true; #endif #ifdef __HAVE_DIRECT_MAP /* Direct Map. [Randomized later] */ slotspace.area[SLAREA_DMAP].active = false; #endif #ifdef XENPV /* Hypervisor. */ slotspace.area[SLAREA_HYPV].sslot = 256; slotspace.area[SLAREA_HYPV].nslot = 17; slotspace.area[SLAREA_HYPV].active = true; #endif #ifdef KASAN /* ASAN. */ slotspace.area[SLAREA_ASAN].sslot = L4_SLOT_KASAN; slotspace.area[SLAREA_ASAN].nslot = NL4_SLOT_KASAN; slotspace.area[SLAREA_ASAN].active = true; #endif #ifdef KMSAN /* MSAN. */ slotspace.area[SLAREA_MSAN].sslot = L4_SLOT_KMSAN; slotspace.area[SLAREA_MSAN].nslot = NL4_SLOT_KMSAN; slotspace.area[SLAREA_MSAN].active = true; #endif /* Kernel. */ slotspace.area[SLAREA_KERN].sslot = L4_SLOT_KERNBASE; slotspace.area[SLAREA_KERN].nslot = 1; slotspace.area[SLAREA_KERN].active = true; /* Main. */ cpu_rng_early_sample(&sample); entpool_enter(&pool, &sample, sizeof sample); entpool_extract(&pool, &randhole, sizeof randhole); entpool_extract(&pool, &randva, sizeof randva); va = slotspace_rand(SLAREA_MAIN, NKL4_MAX_ENTRIES * NBPD_L4, NBPD_L4, randhole, randva); /* TODO: NBPD_L1 */ vm_min_kernel_address = va; vm_max_kernel_address = va + NKL4_MAX_ENTRIES * NBPD_L4; #ifndef XENPV /* PTE. */ cpu_rng_early_sample(&sample); entpool_enter(&pool, &sample, sizeof sample); entpool_extract(&pool, &randhole, sizeof randhole); entpool_extract(&pool, &randva, sizeof randva); va = slotspace_rand(SLAREA_PTE, NBPD_L4, NBPD_L4, randhole, randva); pte_base = (pd_entry_t *)va; #endif explicit_memset(&pool, 0, sizeof pool); } void init_x86_64(paddr_t first_avail) { extern void consinit(void); struct region_descriptor region; struct mem_segment_descriptor *ldt_segp; struct idt_vec *iv; idt_descriptor_t *idt; int x; struct pcb *pcb; extern vaddr_t lwp0uarea; #ifndef XENPV extern paddr_t local_apic_pa; #endif KASSERT(first_avail % PAGE_SIZE == 0); #ifdef XENPV KASSERT(HYPERVISOR_shared_info != NULL); cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0]; #endif #ifdef XEN if (vm_guest == VM_GUEST_XENPVH) xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL); #endif init_pte(); uvm_lwp_setuarea(&lwp0, lwp0uarea); cpu_probe(&cpu_info_primary); #ifdef SVS svs_init(); #endif /* * Initialize MSRs on cpu0: * * - Enables SYSCALL/SYSRET. 
* * - Sets up %fs and %gs so that %gs points to the current * struct cpu_info as needed for CPUVAR(...), curcpu(), and * curlwp. * * - Enables the no-execute bit if supported. * * Thus, after this point, CPUVAR(...), curcpu(), and curlwp * will work on cpu0. * * Note: The call to cpu_init_msrs for secondary CPUs happens * in cpu_hatch. */ cpu_init_msrs(&cpu_info_primary, true); #ifndef XENPV cpu_speculation_init(&cpu_info_primary); #endif use_pae = 1; /* PAE always enabled in long mode */ pcb = lwp_getpcb(&lwp0); #ifdef XENPV mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM); pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE; #else pcb->pcb_cr3 = PDPpaddr; #endif #if NISA > 0 || NPCI > 0 x86_bus_space_init(); #endif pat_init(&cpu_info_primary); consinit(); /* XXX SHOULD NOT BE DONE HERE */ /* * Initialize RNG to get entropy ASAP either from CPU * RDRAND/RDSEED or from seed on disk. Must happen after * cpu_init_msrs. Prefer to happen after consinit so we have * the opportunity to print useful feedback. */ cpu_rng_init(); x86_rndseed(); /* * Initialize PAGE_SIZE-dependent variables. */ uvm_md_init(); uvmexp.ncolors = 2; avail_start = first_avail; #ifndef XENPV /* * Low memory reservations: * Page 0: BIOS data * Page 1: BIOS callback (not used yet, for symmetry with i386) * Page 2: MP bootstrap code (MP_TRAMPOLINE) * Page 3: ACPI wakeup code (ACPI_WAKEUP_ADDR) * Page 4: Temporary page table for 0MB-4MB * Page 5: Temporary page directory * Page 6: Temporary page map level 3 * Page 7: Temporary page map level 4 */ lowmem_rsvd = 8 * PAGE_SIZE; /* Initialize the memory clusters (needed in pmap_bootstrap). */ init_x86_clusters(); #else /* Parse Xen command line (replace bootinfo) */ xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL); avail_end = ctob(xen_start_info.nr_pages); pmap_pa_start = (KERNTEXTOFF - KERNBASE); pmap_pa_end = avail_end; #endif /* * Call pmap initialization to make new kernel address space. * We must do this before loading pages into the VM system. */ pmap_bootstrap(VM_MIN_KERNEL_ADDRESS); #ifndef XENPV /* Internalize the physical pages into the VM system. */ init_x86_vm(avail_start); #else physmem = xen_start_info.nr_pages; uvm_page_physload(atop(avail_start), atop(avail_end), atop(avail_start), atop(avail_end), VM_FREELIST_DEFAULT); #endif init_x86_msgbuf(); kasan_init(); kcsan_init(); kmsan_init((void *)lwp0uarea); pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024); kpreempt_disable(); #ifndef XENPV pmap_kenter_pa(local_apic_va, local_apic_pa, VM_PROT_READ|VM_PROT_WRITE, 0); pmap_update(pmap_kernel()); memset((void *)local_apic_va, 0, PAGE_SIZE); #endif pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); pmap_update(pmap_kernel()); memset((void *)idt_vaddr, 0, PAGE_SIZE); memset((void *)gdt_vaddr, 0, PAGE_SIZE); memset((void *)ldt_vaddr, 0, PAGE_SIZE); #ifndef XENPV pmap_changeprot_local(idt_vaddr, VM_PROT_READ); #endif pmap_update(pmap_kernel()); iv = &(cpu_info_primary.ci_idtvec); idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary)); idt = iv->iv_idt; gdtstore = (char *)gdt_vaddr; ldtstore = (char *)ldt_vaddr; /* * Make GDT gates and memory segments. 
*/ set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0, 0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1); set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0, 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1); set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0, x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1); set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0, x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1); #ifndef XENPV set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore, LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0); #endif /* * Make LDT memory segments. */ *(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) = *GDT_ADDR_MEM(gdtstore, GUCODE_SEL); *(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) = *GDT_ADDR_MEM(gdtstore, GUDATA_SEL); /* * 32 bit GDT entries. */ set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0); set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); /* * 32 bit LDT entries. */ ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL); set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0); ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL); set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); /* CPU-specific IDT exceptions. */ for (x = 0; x < NCPUIDT; x++) { int sel, ist; /* Reset to default. Special cases below */ sel = SEL_KPL; ist = 0; idt_vec_reserve(iv, x); switch (x) { case 1: /* DB */ ist = 4; break; case 2: /* NMI */ ist = 3; break; case 3: case 4: sel = SEL_UPL; break; case 8: /* double fault */ ist = 2; break; #ifdef XENPV case 18: /* MCA */ sel |= 0x4; /* Auto EOI/mask */ break; #endif /* XENPV */ default: break; } set_idtgate(&idt[x], x86_exceptions[x], ist, SDT_SYS386IGT, sel, GSEL(GCODE_SEL, SEL_KPL)); } /* new-style interrupt gate for syscalls */ idt_vec_reserve(iv, 128); set_idtgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); kpreempt_enable(); setregion(&region, gdtstore, DYNSEL_START - 1); lgdt(&region); #ifdef XENPV /* Init Xen callbacks and syscall handlers */ if (HYPERVISOR_set_callbacks( (unsigned long) hypervisor_callback, (unsigned long) failsafe_callback, (unsigned long) Xsyscall)) panic("HYPERVISOR_set_callbacks() failed"); #endif /* XENPV */ cpu_init_idt(&cpu_info_primary); #ifdef XENPV xen_init_ksyms(); #else /* XENPV */ #ifdef XEN if (vm_guest == VM_GUEST_XENPVH) xen_init_ksyms(); else #endif /* XEN */ init_x86_64_ksyms(); #endif /* XENPV */ #ifndef XENPV intr_default_setup(); #else events_default_setup(); #endif splraise(IPL_HIGH); x86_enable_intr(); #ifdef DDB if (boothowto & RB_KDB) Debugger(); #endif #ifdef KGDB kgdb_port_init(); if (boothowto & RB_KDB) { kgdb_debug_init = 1; kgdb_connect(1); } #endif pcb->pcb_dbregs = NULL; x86_dbregs_init(); } void cpu_reset(void) { #ifndef XENPV idt_descriptor_t *idt; vaddr_t vaddr; idt = cpu_info_primary.ci_idtvec.iv_idt; vaddr = (vaddr_t)idt; #endif x86_disable_intr(); #ifdef XENPV HYPERVISOR_reboot(); #else x86_reset(); /* * Try to cause a triple fault and watchdog reset by making the IDT * invalid and causing a fault. 
*/ kpreempt_disable(); pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE); memset((void *)idt, 0, NIDT * sizeof(idt[0])); kpreempt_enable(); breakpoint(); #if 0 /* * Try to cause a triple fault and watchdog reset by unmapping the * entire address space and doing a TLB flush. */ memset((void *)PTD, 0, PAGE_SIZE); tlbflush(); #endif #endif /* XENPV */ for (;;); } void cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags) { const struct trapframe *tf = l->l_md.md_regs; __greg_t ras_rip; mcp->__gregs[_REG_RDI] = tf->tf_rdi; mcp->__gregs[_REG_RSI] = tf->tf_rsi; mcp->__gregs[_REG_RDX] = tf->tf_rdx; mcp->__gregs[_REG_R10] = tf->tf_r10; mcp->__gregs[_REG_R8] = tf->tf_r8; mcp->__gregs[_REG_R9] = tf->tf_r9; /* argX not touched */ mcp->__gregs[_REG_RCX] = tf->tf_rcx; mcp->__gregs[_REG_R11] = tf->tf_r11; mcp->__gregs[_REG_R12] = tf->tf_r12; mcp->__gregs[_REG_R13] = tf->tf_r13; mcp->__gregs[_REG_R14] = tf->tf_r14; mcp->__gregs[_REG_R15] = tf->tf_r15; mcp->__gregs[_REG_RBP] = tf->tf_rbp; mcp->__gregs[_REG_RBX] = tf->tf_rbx; mcp->__gregs[_REG_RAX] = tf->tf_rax; mcp->__gregs[_REG_GS] = 0; mcp->__gregs[_REG_FS] = 0; mcp->__gregs[_REG_ES] = GSEL(GUDATA_SEL, SEL_UPL); mcp->__gregs[_REG_DS] = GSEL(GUDATA_SEL, SEL_UPL); mcp->__gregs[_REG_TRAPNO] = tf->tf_trapno; mcp->__gregs[_REG_ERR] = tf->tf_err; mcp->__gregs[_REG_RIP] = tf->tf_rip; mcp->__gregs[_REG_CS] = LSEL(LUCODE_SEL, SEL_UPL); mcp->__gregs[_REG_RFLAGS] = tf->tf_rflags; mcp->__gregs[_REG_RSP] = tf->tf_rsp; mcp->__gregs[_REG_SS] = LSEL(LUDATA_SEL, SEL_UPL); if ((ras_rip = (__greg_t)ras_lookup(l->l_proc, (void *) mcp->__gregs[_REG_RIP])) != -1) mcp->__gregs[_REG_RIP] = ras_rip; *flags |= _UC_CPU; mcp->_mc_tlsbase = (uintptr_t)l->l_private; *flags |= _UC_TLSBASE; process_read_fpregs_xmm(l, (struct fxsave *)&mcp->__fpregs); *flags |= _UC_FPU; } int cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags) { struct trapframe *tf = l->l_md.md_regs; const __greg_t *gr = mcp->__gregs; struct proc *p = l->l_proc; int error; int64_t rflags; CTASSERT(sizeof (mcontext_t) == 26 * 8 + 8 + 512); if ((flags & _UC_CPU) != 0) { error = cpu_mcontext_validate(l, mcp); if (error != 0) return error; tf->tf_rdi = gr[_REG_RDI]; tf->tf_rsi = gr[_REG_RSI]; tf->tf_rdx = gr[_REG_RDX]; tf->tf_r10 = gr[_REG_R10]; tf->tf_r8 = gr[_REG_R8]; tf->tf_r9 = gr[_REG_R9]; /* argX not touched */ tf->tf_rcx = gr[_REG_RCX]; tf->tf_r11 = gr[_REG_R11]; tf->tf_r12 = gr[_REG_R12]; tf->tf_r13 = gr[_REG_R13]; tf->tf_r14 = gr[_REG_R14]; tf->tf_r15 = gr[_REG_R15]; tf->tf_rbp = gr[_REG_RBP]; tf->tf_rbx = gr[_REG_RBX]; tf->tf_rax = gr[_REG_RAX]; tf->tf_gs = 0; tf->tf_fs = 0; tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); /* trapno, err not touched */ tf->tf_rip = gr[_REG_RIP]; tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL); rflags = tf->tf_rflags; rflags &= ~PSL_USER; tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER); tf->tf_rsp = gr[_REG_RSP]; tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL); l->l_md.md_flags |= MDL_IRET; } if ((flags & _UC_FPU) != 0) process_write_fpregs_xmm(l, (const struct fxsave *)&mcp->__fpregs); if ((flags & _UC_TLSBASE) != 0) lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase); mutex_enter(p->p_lock); if (flags & _UC_SETSTACK) l->l_sigstk.ss_flags |= SS_ONSTACK; if (flags & _UC_CLRSTACK) l->l_sigstk.ss_flags &= ~SS_ONSTACK; mutex_exit(p->p_lock); return 0; } int cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp) { struct proc *p __diagused = l->l_proc; struct trapframe *tf = l->l_md.md_regs; const __greg_t *gr; 
uint16_t sel; KASSERT((p->p_flag & PK_32) == 0); gr = mcp->__gregs; if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0) return EINVAL; sel = gr[_REG_ES] & 0xffff; if (sel != 0 && !VALID_USER_DSEL(sel)) return EINVAL; sel = gr[_REG_FS] & 0xffff; if (sel != 0 && !VALID_USER_DSEL(sel)) return EINVAL; sel = gr[_REG_GS] & 0xffff; if (sel != 0 && !VALID_USER_DSEL(sel)) return EINVAL; sel = gr[_REG_DS] & 0xffff; if (!VALID_USER_DSEL(sel)) return EINVAL; #ifndef XENPV sel = gr[_REG_SS] & 0xffff; if (!VALID_USER_DSEL(sel)) return EINVAL; sel = gr[_REG_CS] & 0xffff; if (!VALID_USER_CSEL(sel)) return EINVAL; #endif if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS) return EINVAL; return 0; } int mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled) { const vaddr_t v = (vaddr_t)ptr; vaddr_t kva, kva_end; size_t i; kva = bootspace.head.va; kva_end = kva + bootspace.head.sz; if (v >= kva && v < kva_end) { *handled = true; return 0; } for (i = 0; i < BTSPACE_NSEGS; i++) { kva = bootspace.segs[i].va; kva_end = kva + bootspace.segs[i].sz; if (v < kva || v >= kva_end) continue; *handled = true; if (bootspace.segs[i].type == BTSEG_TEXT || bootspace.segs[i].type == BTSEG_RODATA) { if (prot & VM_PROT_WRITE) { return EFAULT; } } return 0; } kva = bootspace.boot.va; kva_end = kva + bootspace.boot.sz; if (v >= kva && v < kva_end) { *handled = true; return 0; } if (v >= bootspace.smodule && v < bootspace.emodule) { *handled = true; if (!uvm_map_checkprot(module_map, v, v + 1, prot)) { return EFAULT; } } else { *handled = false; } return 0; } /* * Zero out a 64bit LWP's segments registers. Used when exec'ing a new * 64bit program. */ void cpu_segregs64_zero(struct lwp *l) { struct trapframe * const tf = l->l_md.md_regs; struct pcb *pcb; uint64_t zero = 0; KASSERT(kpreempt_disabled()); KASSERT((l->l_proc->p_flag & PK_32) == 0); KASSERT(l == curlwp); pcb = lwp_getpcb(l); tf->tf_fs = 0; tf->tf_gs = 0; setds(GSEL(GUDATA_SEL, SEL_UPL)); setes(GSEL(GUDATA_SEL, SEL_UPL)); setfs(0); setusergs(0); #ifndef XENPV wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KERNELGSBASE, 0); #else HYPERVISOR_set_segment_base(SEGBASE_FS, 0); HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0); #endif pcb->pcb_fs = 0; pcb->pcb_gs = 0; update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero); update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero); } /* * Zero out a 32bit LWP's segments registers. Used when exec'ing a new * 32bit program. */ void cpu_segregs32_zero(struct lwp *l) { struct trapframe * const tf = l->l_md.md_regs; struct pcb *pcb; uint64_t zero = 0; KASSERT(kpreempt_disabled()); KASSERT(l->l_proc->p_flag & PK_32); KASSERT(l == curlwp); pcb = lwp_getpcb(l); tf->tf_fs = 0; tf->tf_gs = 0; setds(GSEL(GUDATA32_SEL, SEL_UPL)); setes(GSEL(GUDATA32_SEL, SEL_UPL)); setfs(0); setusergs(0); pcb->pcb_fs = 0; pcb->pcb_gs = 0; update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero); update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero); } /* * Load an LWP's TLS context, possibly changing the %fs and %gs selectors. * Used only for 32-bit processes. 
*/ void cpu_fsgs_reload(struct lwp *l, int fssel, int gssel) { struct trapframe *tf; struct pcb *pcb; KASSERT(l->l_proc->p_flag & PK_32); KASSERT(l == curlwp); tf = l->l_md.md_regs; fssel &= 0xFFFF; gssel &= 0xFFFF; pcb = lwp_getpcb(l); kpreempt_disable(); update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs); update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs); #ifdef XENPV setusergs(gssel); #endif tf->tf_fs = fssel; tf->tf_gs = gssel; kpreempt_enable(); } bool mm_md_direct_mapped_io(void *addr, paddr_t *paddr) { vaddr_t va = (vaddr_t)addr; #ifdef __HAVE_DIRECT_MAP if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { *paddr = PMAP_DIRECT_UNMAP(va); return true; } #else __USE(va); #endif return false; } bool mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr) { #ifdef __HAVE_DIRECT_MAP *vaddr = PMAP_DIRECT_MAP(paddr); return true; #else return false; #endif } static void idt_vec_copy(struct idt_vec *dst, struct idt_vec *src) { idt_descriptor_t *idt_dst; idt_dst = dst->iv_idt; kpreempt_disable(); pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ|VM_PROT_WRITE); memcpy(idt_dst, src->iv_idt, PAGE_SIZE); memcpy(dst->iv_allocmap, src->iv_allocmap, sizeof(dst->iv_allocmap)); pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ); kpreempt_enable(); } void idt_vec_init_cpu_md(struct idt_vec *iv, cpuid_t cid) { vaddr_t va; if (cid != cpu_index(&cpu_info_primary) && idt_vec_is_pcpu()) { #ifdef __HAVE_PCPU_AREA va = (vaddr_t)&pcpuarea->ent[cid].idt; #else struct vm_page *pg; va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY); pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); if (pg == NULL) { panic("failed to allocate a page for IDT"); } pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), VM_PROT_READ|VM_PROT_WRITE, 0); pmap_update(pmap_kernel()); #endif memset((void *)va, 0, PAGE_SIZE); #ifndef XENPV pmap_changeprot_local(va, VM_PROT_READ); #endif pmap_update(pmap_kernel()); iv->iv_idt = (void *)va; idt_vec_copy(iv, &(cpu_info_primary.ci_idtvec)); } else { iv->iv_idt = (void *)idt_vaddr; } }
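The dump header routines above (dump_header_start, dump_header_addbytes, dump_header_flush) stage an arbitrarily large header through a single page-sized buffer so it can be written to the dump device incrementally. Below is a minimal standalone sketch of that buffering pattern only; HDRBUF_SIZE and write_cb() are assumptions standing in for PAGE_SIZE and the bdev->d_dump() call, and the rounding of each flush up to a device block is omitted.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define HDRBUF_SIZE 4096		/* stand-in for PAGE_SIZE */

static char hdrbuf[HDRBUF_SIZE];
static size_t hdrbuf_used;

/* Hypothetical sink; the kernel hands the buffer to bdev->d_dump(). */
static int
write_cb(const void *p, size_t n)
{
	(void)p;
	printf("flush %zu bytes\n", n);
	return 0;
}

static int
hdr_flush(void)
{
	int error = write_cb(hdrbuf, hdrbuf_used);

	hdrbuf_used = 0;		/* reuse the buffer from the start */
	return error;
}

static int
hdr_addbytes(const void *vptr, size_t n)
{
	const char *ptr = vptr;
	int error;

	/* While the input does not fit, top the buffer off and flush it. */
	while (n > HDRBUF_SIZE - hdrbuf_used) {
		size_t avail = HDRBUF_SIZE - hdrbuf_used;

		memcpy(hdrbuf + hdrbuf_used, ptr, avail);
		ptr += avail;
		n -= avail;
		hdrbuf_used = HDRBUF_SIZE;
		if ((error = hdr_flush()) != 0)
			return error;
	}
	memcpy(hdrbuf + hdrbuf_used, ptr, n);
	hdrbuf_used += n;
	return 0;
}

int
main(void)
{
	char blob[10000] = { 0 };

	/* Fills and flushes the buffer twice, then leaves a partial tail. */
	(void)hdr_addbytes(blob, sizeof blob);
	(void)hdr_flush();		/* final partial flush */
	return 0;
}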
/* $NetBSD: cons.c,v 1.95 2023/09/02 17:44:59 riastradh Exp $ */ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: cons.c 1.7 92/01/21$ * * @(#)cons.c 8.2 (Berkeley) 1/12/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: cons.c,v 1.95 2023/09/02 17:44:59 riastradh Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/buf.h> #include <sys/conf.h> #include <sys/file.h> #include <sys/heartbeat.h> #include <sys/ioctl.h> #include <sys/kauth.h> #include <sys/module.h> #include <sys/mutex.h> #include <sys/poll.h> #include <sys/proc.h> #include <sys/pserialize.h> #include <sys/systm.h> #include <sys/tty.h> #include <sys/vnode.h> #include <dev/cons.h> #include "nullcons.h" dev_type_open(cnopen); dev_type_close(cnclose); dev_type_read(cnread); dev_type_write(cnwrite); dev_type_ioctl(cnioctl); dev_type_poll(cnpoll); dev_type_kqfilter(cnkqfilter); static bool cn_redirect(dev_t *, int, int *, struct tty **); static void cn_release(struct tty *); const struct cdevsw cons_cdevsw = { .d_open = cnopen, .d_close = cnclose, .d_read = cnread, .d_write = cnwrite, .d_ioctl = cnioctl, .d_stop = nostop, .d_tty = notty, .d_poll = cnpoll, .d_mmap = nommap, .d_kqfilter = cnkqfilter, .d_discard = nodiscard, .d_flag = D_TTY|D_MPSAFE, }; static struct kmutex cn_lock; struct tty *volatile constty; /* virtual console output device */ struct consdev *cn_tab; /* physical console device info */ struct vnode *cn_devvp[2]; /* vnode for underlying device. */ void cn_set_tab(struct consdev *tab) { /* * This is a point that we should have KASSERT(cold) or add * synchronization in case this can happen after cold boot. * However, cn_tab initialization is so critical to any * diagnostics or debugging that we need to tread carefully * about introducing new ways to crash. So let's put the * assertion in only after we've audited most or all of the * cn_tab updates. */ cn_tab = tab; } int cnopen(dev_t dev, int flag, int mode, struct lwp *l) { dev_t cndev; int unit, error; unit = minor(dev); if (unit > 1) return ENODEV; mutex_enter(&cn_lock); if (cn_tab == NULL) { error = 0; goto out; } /* * always open the 'real' console device, so we don't get nailed * later. This follows normal device semantics; they always get * open() calls. */ cndev = cn_tab->cn_dev; #if NNULLCONS > 0 if (cndev == NODEV) { nullconsattach(0); } #else /* NNULLCONS > 0 */ if (cndev == NODEV) { /* * This is most likely an error in the console attach * code. Panicking looks better than jumping into nowhere * through cdevsw below.... */ panic("cnopen: no console device"); } #endif /* NNULLCONS > 0 */ if (dev == cndev) { /* * This causes cnopen() to be called recursively, which * is generally a bad thing. It is often caused when * dev == 0 and cn_dev has not been set, but was probably * initialised to 0. 
*/ panic("cnopen: cn_tab->cn_dev == dev"); } if (cn_devvp[unit] != NULLVP) { error = 0; goto out; } if ((error = cdevvp(cndev, &cn_devvp[unit])) != 0) { printf("cnopen: unable to get vnode reference\n"); goto out; } vn_lock(cn_devvp[unit], LK_EXCLUSIVE | LK_RETRY); error = VOP_OPEN(cn_devvp[unit], flag, kauth_cred_get()); VOP_UNLOCK(cn_devvp[unit]); out: mutex_exit(&cn_lock); return error; } int cnclose(dev_t dev, int flag, int mode, struct lwp *l) { struct vnode *vp; int unit, error; unit = minor(dev); if (unit > 1) return ENODEV; mutex_enter(&cn_lock); if (cn_tab == NULL) { error = 0; goto out; } vp = cn_devvp[unit]; cn_devvp[unit] = NULL; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_CLOSE(vp, flag, kauth_cred_get()); VOP_UNLOCK(vp); vrele(vp); out: mutex_exit(&cn_lock); return error; } int cnread(dev_t dev, struct uio *uio, int flag) { struct tty *ctp = NULL; int error; /* * If we would redirect input, punt. This will keep strange * things from happening to people who are using the real * console. Nothing should be using /dev/console for * input (except a shell in single-user mode, but then, * one wouldn't TIOCCONS then). */ if (!cn_redirect(&dev, 1, &error, &ctp)) return error; error = cdev_read(dev, uio, flag); cn_release(ctp); return error; } int cnwrite(dev_t dev, struct uio *uio, int flag) { struct tty *ctp = NULL; int error; /* Redirect output, if that's appropriate. */ if (!cn_redirect(&dev, 0, &error, &ctp)) return error; error = cdev_write(dev, uio, flag); cn_release(ctp); return error; } int cnioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { struct tty *ctp = NULL; int error; error = 0; /* * Superuser can always use this to wrest control of console * output from the "virtual" console. */ if (cmd == TIOCCONS) { struct tty *tp; mutex_enter(&constty_lock); tp = atomic_load_relaxed(&constty); if (tp == NULL) { mutex_exit(&constty_lock); goto passthrough; /* XXX ??? */ } error = kauth_authorize_device_tty(l->l_cred, KAUTH_DEVICE_TTY_VIRTUAL, tp); if (!error) atomic_store_relaxed(&constty, NULL); mutex_exit(&constty_lock); return error; } passthrough: /* * Redirect the ioctl, if that's appropriate. * Note that strange things can happen, if a program does * ioctls on /dev/console, then the console is redirected * out from under it. */ if (!cn_redirect(&dev, 0, &error, &ctp)) return error; error = cdev_ioctl(dev, cmd, data, flag, l); cn_release(ctp); return error; } /*ARGSUSED*/ int cnpoll(dev_t dev, int events, struct lwp *l) { struct tty *ctp = NULL; int error; /* * Redirect the poll, if that's appropriate. * I don't want to think of the possible side effects * of console redirection here. */ if (!cn_redirect(&dev, 0, &error, &ctp)) return POLLHUP; error = cdev_poll(dev, events, l); cn_release(ctp); return error; } /*ARGSUSED*/ int cnkqfilter(dev_t dev, struct knote *kn) { struct tty *ctp = NULL; int error; /* * Redirect the kqfilter, if that's appropriate. * I don't want to think of the possible side effects * of console redirection here. 
*/ if (!cn_redirect(&dev, 0, &error, &ctp)) return error; error = cdev_kqfilter(dev, kn); cn_release(ctp); return error; } int cngetc(void) { if (cn_tab == NULL) return (0); int s = splhigh(); for (;;) { const int rv = (*cn_tab->cn_getc)(cn_tab->cn_dev); if (rv >= 0) { splx(s); return rv; } docritpollhooks(); } } int cngetsn(char *cp, int size) { char *lp; int c, len; cnpollc(1); lp = cp; len = 0; for (;;) { c = cngetc(); switch (c) { case '\n': case '\r': printf("\n"); *lp++ = '\0'; cnpollc(0); return (len); case '\b': case '\177': case '#': if (len) { --len; --lp; printf("\b \b"); } continue; case '@': case 'u'&037: /* CTRL-u */ len = 0; lp = cp; printf("\n"); continue; default: if (len + 1 >= size || c < ' ') { printf("\007"); continue; } printf("%c", c); ++len; *lp++ = c; } } } void cnputc(int c) { if (cn_tab == NULL) return; /* * XXX * for some reason this causes ARCS firmware to output an endless stream of * whitespaces with n32 kernels, so use the pre-1.74 code for now until I can * figure out why this happens */ #ifndef sgimips if (c) { if (c == '\n') { (*cn_tab->cn_putc)(cn_tab->cn_dev, '\r'); docritpollhooks(); } (*cn_tab->cn_putc)(cn_tab->cn_dev, c); } #else if (c) { (*cn_tab->cn_putc)(cn_tab->cn_dev, c); if (c == '\n') { docritpollhooks(); (*cn_tab->cn_putc)(cn_tab->cn_dev, '\r'); } } #endif } void cnpollc(int on) { static int refcount = 0; if (cn_tab == NULL) return; if (!on) --refcount; if (refcount == 0) { if (on) { /* * Bind to the current CPU by disabling * preemption (more convenient than finding a * place to store a stack to unwind for * curlwp_bind/bindx, and preemption wouldn't * happen anyway while spinning at high IPL in * cngetc) so that curcpu() is stable so that * we can suspend heartbeat checks for it. */ kpreempt_disable(); heartbeat_suspend(); } (*cn_tab->cn_pollc)(cn_tab->cn_dev, on); if (!on) { heartbeat_resume(); kpreempt_enable(); } } if (on) ++refcount; } void nullcnpollc(dev_t dev, int on) { } void cnbell(u_int pitch, u_int period, u_int volume) { if (cn_tab == NULL || cn_tab->cn_bell == NULL) return; (*cn_tab->cn_bell)(cn_tab->cn_dev, pitch, period, volume); } void cnflush(void) { if (cn_tab == NULL || cn_tab->cn_flush == NULL) return; (*cn_tab->cn_flush)(cn_tab->cn_dev); } void cnhalt(void) { if (cn_tab == NULL || cn_tab->cn_halt == NULL) return; (*cn_tab->cn_halt)(cn_tab->cn_dev); } /* * Redirect output, if that's appropriate. If there's no real console, * return ENXIO. */ static bool cn_redirect(dev_t *devp, int is_read, int *error, struct tty **ctpp) { dev_t dev = *devp; struct tty *ctp; int s; bool ok = false; *error = ENXIO; *ctpp = NULL; s = pserialize_read_enter(); if ((ctp = atomic_load_consume(&constty)) != NULL && minor(dev) == 0 && (cn_tab == NULL || (cn_tab->cn_pri != CN_REMOTE))) { if (is_read) { *error = 0; goto out; } tty_acquire(ctp); *ctpp = ctp; dev = ctp->t_dev; } else if (cn_tab == NULL) goto out; else dev = cn_tab->cn_dev; ok = true; *devp = dev; out: pserialize_read_exit(s); return ok; } static void cn_release(struct tty *ctp) { if (ctp == NULL) return; tty_release(ctp); } MODULE(MODULE_CLASS_DRIVER, cons, NULL); static int cons_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: mutex_init(&cn_lock, MUTEX_DEFAULT, IPL_NONE); return 0; case MODULE_CMD_FINI: mutex_destroy(&cn_lock); return 0; default: return ENOTTY; } }
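/*
 * Illustrative userland sketch (not part of cons.c): how a program can
 * exercise the TIOCCONS console-redirection path that cnioctl() and
 * cn_redirect() above deal with.  The device path /dev/ttyE1 and the
 * int-argument form of the ioctl are assumptions for this sketch; check
 * tty(4) and <sys/ttycom.h> on the target system before relying on them.
 */
#include <sys/ioctl.h>
#include <sys/ttycom.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int on = 1;
	int fd = open("/dev/ttyE1", O_RDWR);	/* hypothetical target tty */

	if (fd == -1) {
		perror("open");
		return 1;
	}
	/* Ask the kernel to send /dev/console output to this tty. */
	if (ioctl(fd, TIOCCONS, &on) == -1) {
		perror("ioctl(TIOCCONS)");
		return 1;
	}
	/*
	 * While this descriptor stays open, cn_redirect() above steers
	 * writes to /dev/console to the tty captured here (constty).
	 */
	pause();
	return 0;
}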
/* $NetBSD: kern_module.c,v 1.161 2023/01/31 13:21:37 riastradh Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software developed for The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Kernel module support.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_module.c,v 1.161 2023/01/31 13:21:37 riastradh Exp $"); #define _MODULE_INTERNAL #ifdef _KERNEL_OPT #include "opt_ddb.h" #include "opt_modular.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/lwp.h> #include <sys/kauth.h> #include <sys/kobj.h> #include <sys/kmem.h> #include <sys/module.h> #include <sys/module_hook.h> #include <sys/kthread.h> #include <sys/sysctl.h> #include <sys/lock.h> #include <sys/evcnt.h> #include <uvm/uvm_extern.h> struct vm_map *module_map; const char *module_machine; char module_base[MODULE_BASE_SIZE]; struct modlist module_list = TAILQ_HEAD_INITIALIZER(module_list); struct modlist module_builtins = TAILQ_HEAD_INITIALIZER(module_builtins); static struct modlist module_bootlist = TAILQ_HEAD_INITIALIZER(module_bootlist); struct module_callbacks { TAILQ_ENTRY(module_callbacks) modcb_list; void (*modcb_load)(struct module *); void (*modcb_unload)(struct module *); }; TAILQ_HEAD(modcblist, module_callbacks); static struct modcblist modcblist; static module_t *module_netbsd; static const modinfo_t module_netbsd_modinfo = { .mi_version = __NetBSD_Version__, .mi_class = MODULE_CLASS_MISC, .mi_name = "netbsd" }; static module_t *module_active; #ifdef MODULAR_DEFAULT_VERBOSE bool module_verbose_on = true; #else bool module_verbose_on = false; #endif #ifdef MODULAR_DEFAULT_AUTOLOAD bool module_autoload_on = true; #else bool module_autoload_on = false; #endif bool module_autounload_unsafe = 0; u_int module_count; u_int module_builtinlist; u_int module_autotime = 10; u_int module_gen = 1; static kcondvar_t module_thread_cv; static kmutex_t module_thread_lock; static int module_thread_ticks; int (*module_load_vfs_vec)(const char *, int, bool, module_t *, prop_dictionary_t *) = (void *)eopnotsupp; static kauth_listener_t module_listener; static specificdata_domain_t module_specificdata_domain; /* Ensure that the kernel's link set isn't empty. */ static modinfo_t module_dummy; __link_set_add_rodata(modules, module_dummy); static module_t *module_newmodule(modsrc_t); static void module_free(module_t *); static void module_require_force(module_t *); static int module_do_load(const char *, bool, int, prop_dictionary_t, module_t **, modclass_t modclass, bool); static int module_do_unload(const char *, bool); static int module_do_builtin(const module_t *, const char *, module_t **, prop_dictionary_t); static int module_fetch_info(module_t *); static void module_thread(void *); static module_t *module_lookup(const char *); static void module_enqueue(module_t *); static bool module_merge_dicts(prop_dictionary_t, const prop_dictionary_t); static void sysctl_module_setup(void); static int sysctl_module_autotime(SYSCTLFN_PROTO); static void module_callback_load(struct module *); static void module_callback_unload(struct module *); #define MODULE_CLASS_MATCH(mi, modclass) \ ((modclass) == MODULE_CLASS_ANY || (modclass) == (mi)->mi_class) static void module_incompat(const modinfo_t *mi, int modclass) { module_error("incompatible module class %d for `%s' (wanted %d)", mi->mi_class, mi->mi_name, modclass); } struct module * module_kernel(void) { return module_netbsd; } /* * module_error: * * Utility function: log an error. */ void module_error(const char *fmt, ...) { va_list ap; va_start(ap, fmt); printf("WARNING: module error: "); vprintf(fmt, ap); printf("\n"); va_end(ap); } /* * module_print: * * Utility function: log verbose output. 
*/ void module_print(const char *fmt, ...) { va_list ap; if (module_verbose_on) { va_start(ap, fmt); printf("DEBUG: module: "); vprintf(fmt, ap); printf("\n"); va_end(ap); } } /* * module_name: * * Utility function: return the module's name. */ const char * module_name(struct module *mod) { return mod->mod_info->mi_name; } /* * module_source: * * Utility function: return the module's source. */ modsrc_t module_source(struct module *mod) { return mod->mod_source; } static int module_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; result = KAUTH_RESULT_DEFER; if (action != KAUTH_SYSTEM_MODULE) return result; if ((uintptr_t)arg2 != 0) /* autoload */ result = KAUTH_RESULT_ALLOW; return result; } /* * Allocate a new module_t */ static module_t * module_newmodule(modsrc_t source) { module_t *mod; mod = kmem_zalloc(sizeof(*mod), KM_SLEEP); mod->mod_source = source; specificdata_init(module_specificdata_domain, &mod->mod_sdref); return mod; } /* * Free a module_t */ static void module_free(module_t *mod) { specificdata_fini(module_specificdata_domain, &mod->mod_sdref); if (mod->mod_required) kmem_free(mod->mod_required, mod->mod_arequired * sizeof(module_t *)); kmem_free(mod, sizeof(*mod)); } /* * Require the -f (force) flag to load a module */ static void module_require_force(struct module *mod) { SET(mod->mod_flags, MODFLG_MUST_FORCE); } /* * Add modules to the builtin list. This can done at boottime or * at runtime if the module is linked into the kernel with an * external linker. All or none of the input will be handled. * Optionally, the modules can be initialized. If they are not * initialized, module_init_class() or module_load() can be used * later, but these are not guaranteed to give atomic results. 
*/ int module_builtin_add(modinfo_t *const *mip, size_t nmodinfo, bool init) { struct module **modp = NULL, *mod_iter; int rv = 0, i, mipskip; if (init) { rv = kauth_authorize_system(kauth_cred_get(), KAUTH_SYSTEM_MODULE, 0, (void *)(uintptr_t)MODCTL_LOAD, (void *)(uintptr_t)1, NULL); if (rv) { return rv; } } for (i = 0, mipskip = 0; i < nmodinfo; i++) { if (mip[i] == &module_dummy) { KASSERT(nmodinfo > 0); nmodinfo--; } } if (nmodinfo == 0) return 0; modp = kmem_zalloc(sizeof(*modp) * nmodinfo, KM_SLEEP); for (i = 0, mipskip = 0; i < nmodinfo; i++) { if (mip[i+mipskip] == &module_dummy) { mipskip++; continue; } modp[i] = module_newmodule(MODULE_SOURCE_KERNEL); modp[i]->mod_info = mip[i+mipskip]; } kernconfig_lock(); /* do this in three stages for error recovery and atomicity */ /* first check for presence */ for (i = 0; i < nmodinfo; i++) { TAILQ_FOREACH(mod_iter, &module_builtins, mod_chain) { if (strcmp(mod_iter->mod_info->mi_name, modp[i]->mod_info->mi_name) == 0) break; } if (mod_iter) { rv = EEXIST; goto out; } if (module_lookup(modp[i]->mod_info->mi_name) != NULL) { rv = EEXIST; goto out; } } /* then add to list */ for (i = 0; i < nmodinfo; i++) { TAILQ_INSERT_TAIL(&module_builtins, modp[i], mod_chain); module_builtinlist++; } /* finally, init (if required) */ if (init) { for (i = 0; i < nmodinfo; i++) { rv = module_do_builtin(modp[i], modp[i]->mod_info->mi_name, NULL, NULL); /* throw in the towel, recovery hard & not worth it */ if (rv) panic("%s: builtin module \"%s\" init failed:" " %d", __func__, modp[i]->mod_info->mi_name, rv); } } out: kernconfig_unlock(); if (rv != 0) { for (i = 0; i < nmodinfo; i++) { if (modp[i]) module_free(modp[i]); } } kmem_free(modp, sizeof(*modp) * nmodinfo); return rv; } /* * Optionally fini and remove builtin module from the kernel. * Note: the module will now be unreachable except via mi && builtin_add. */ int module_builtin_remove(modinfo_t *mi, bool fini) { struct module *mod; int rv = 0; if (fini) { rv = kauth_authorize_system(kauth_cred_get(), KAUTH_SYSTEM_MODULE, 0, (void *)(uintptr_t)MODCTL_UNLOAD, NULL, NULL); if (rv) return rv; kernconfig_lock(); rv = module_do_unload(mi->mi_name, true); if (rv) { goto out; } } else { kernconfig_lock(); } TAILQ_FOREACH(mod, &module_builtins, mod_chain) { if (strcmp(mod->mod_info->mi_name, mi->mi_name) == 0) break; } if (mod) { TAILQ_REMOVE(&module_builtins, mod, mod_chain); module_builtinlist--; } else { KASSERT(fini == false); rv = ENOENT; } out: kernconfig_unlock(); return rv; } /* * module_init: * * Initialize the module subsystem. 
*/ void module_init(void) { __link_set_decl(modules, modinfo_t); modinfo_t *const *mip; int rv; if (module_map == NULL) { module_map = kernel_map; } cv_init(&module_thread_cv, "mod_unld"); mutex_init(&module_thread_lock, MUTEX_DEFAULT, IPL_NONE); TAILQ_INIT(&modcblist); #ifdef MODULAR /* XXX */ module_init_md(); #endif #ifdef KERNEL_DIR const char *booted_kernel = get_booted_kernel(); if (booted_kernel) { char *ptr = strrchr(booted_kernel, '/'); snprintf(module_base, sizeof(module_base), "/%.*s/modules", (int)(ptr - booted_kernel), booted_kernel); } else { strlcpy(module_base, "/netbsd/modules", sizeof(module_base)); printf("Cannot find kernel name, loading modules from \"%s\"\n", module_base); } #else if (!module_machine) module_machine = machine; #if __NetBSD_Version__ / 1000000 % 100 == 99 /* -current */ snprintf(module_base, sizeof(module_base), "/stand/%s/%s/modules", module_machine, osrelease); #else /* release */ snprintf(module_base, sizeof(module_base), "/stand/%s/%d.%d/modules", module_machine, __NetBSD_Version__ / 100000000, __NetBSD_Version__ / 1000000 % 100); #endif #endif module_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, module_listener_cb, NULL); __link_set_foreach(mip, modules) { if ((rv = module_builtin_add(mip, 1, false)) != 0) module_error("builtin %s failed: %d\n", (*mip)->mi_name, rv); } sysctl_module_setup(); module_specificdata_domain = specificdata_domain_create(); module_netbsd = module_newmodule(MODULE_SOURCE_KERNEL); module_netbsd->mod_refcnt = 1; module_netbsd->mod_info = &module_netbsd_modinfo; } /* * module_start_unload_thread: * * Start the auto unload kthread. */ void module_start_unload_thread(void) { int error; error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, module_thread, NULL, NULL, "modunload"); if (error != 0) panic("%s: %d", __func__, error); } /* * module_builtin_require_force * * Require MODCTL_MUST_FORCE to load any built-in modules that have * not yet been initialized */ void module_builtin_require_force(void) { module_t *mod; kernconfig_lock(); TAILQ_FOREACH(mod, &module_builtins, mod_chain) { module_require_force(mod); } kernconfig_unlock(); } static struct sysctllog *module_sysctllog; static int sysctl_module_autotime(SYSCTLFN_ARGS) { struct sysctlnode node; int t, error; t = *(int *)rnode->sysctl_data; node = *rnode; node.sysctl_data = &t; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); if (t < 0) return (EINVAL); *(int *)rnode->sysctl_data = t; return (0); } static void sysctl_module_setup(void) { const struct sysctlnode *node = NULL; sysctl_createv(&module_sysctllog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "module", SYSCTL_DESCR("Module options"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); if (node == NULL) return; sysctl_createv(&module_sysctllog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_BOOL, "autoload", SYSCTL_DESCR("Enable automatic load of modules"), NULL, 0, &module_autoload_on, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&module_sysctllog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_BOOL, "autounload_unsafe", SYSCTL_DESCR("Enable automatic unload of unaudited modules"), NULL, 0, &module_autounload_unsafe, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&module_sysctllog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_BOOL, "verbose", SYSCTL_DESCR("Enable verbose output"), NULL, 0, &module_verbose_on, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&module_sysctllog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READONLY, 
CTLTYPE_STRING, "path", SYSCTL_DESCR("Default module load path"), NULL, 0, module_base, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&module_sysctllog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "autotime", SYSCTL_DESCR("Auto-unload delay"), sysctl_module_autotime, 0, &module_autotime, 0, CTL_CREATE, CTL_EOL); } /* * module_init_class: * * Initialize all built-in and pre-loaded modules of the * specified class. */ void module_init_class(modclass_t modclass) { TAILQ_HEAD(, module) bi_fail = TAILQ_HEAD_INITIALIZER(bi_fail); module_t *mod; modinfo_t *mi; kernconfig_lock(); /* * Builtins first. These will not depend on pre-loaded modules * (because the kernel would not link). */ do { TAILQ_FOREACH(mod, &module_builtins, mod_chain) { mi = mod->mod_info; if (!MODULE_CLASS_MATCH(mi, modclass)) continue; /* * If initializing a builtin module fails, don't try * to load it again. But keep it around and queue it * on the builtins list after we're done with module * init. Don't set it to MODFLG_MUST_FORCE in case a * future attempt to initialize can be successful. * (If the module has previously been set to * MODFLG_MUST_FORCE, don't try to override that!) */ if (ISSET(mod->mod_flags, MODFLG_MUST_FORCE) || module_do_builtin(mod, mi->mi_name, NULL, NULL) != 0) { TAILQ_REMOVE(&module_builtins, mod, mod_chain); TAILQ_INSERT_TAIL(&bi_fail, mod, mod_chain); } break; } } while (mod != NULL); /* * Now preloaded modules. These will be pulled off the * list as we call module_do_load(); */ do { TAILQ_FOREACH(mod, &module_bootlist, mod_chain) { mi = mod->mod_info; if (!MODULE_CLASS_MATCH(mi, modclass)) continue; module_do_load(mi->mi_name, false, 0, NULL, NULL, modclass, false); break; } } while (mod != NULL); /* return failed builtin modules to builtin list */ while ((mod = TAILQ_FIRST(&bi_fail)) != NULL) { TAILQ_REMOVE(&bi_fail, mod, mod_chain); TAILQ_INSERT_TAIL(&module_builtins, mod, mod_chain); } kernconfig_unlock(); } /* * module_compatible: * * Return true if the two supplied kernel versions are said to * have the same binary interface for kernel code. The entire * version is signficant for the development tree (-current), * major and minor versions are significant for official * releases of the system. */ bool module_compatible(int v1, int v2) { #if __NetBSD_Version__ / 1000000 % 100 == 99 /* -current */ return v1 == v2; #else /* release */ return abs(v1 - v2) < 10000; #endif } /* * module_load: * * Load a single module from the file system. */ int module_load(const char *filename, int flags, prop_dictionary_t props, modclass_t modclass) { module_t *mod; int error; /* Test if we already have the module loaded before * authorizing so we have the opportunity to return EEXIST. */ kernconfig_lock(); mod = module_lookup(filename); if (mod != NULL) { module_print("%s module `%s' already loaded", "requested", filename); error = EEXIST; goto out; } /* Authorize. */ error = kauth_authorize_system(kauth_cred_get(), KAUTH_SYSTEM_MODULE, 0, (void *)(uintptr_t)MODCTL_LOAD, NULL, NULL); if (error != 0) goto out; error = module_do_load(filename, false, flags, props, NULL, modclass, false); out: kernconfig_unlock(); return error; } /* * module_autoload: * * Load a single module from the file system, system initiated. */ int module_autoload(const char *filename, modclass_t modclass) { int error; struct proc *p = curlwp->l_proc; kernconfig_lock(); /* Nothing if the user has disabled it. */ if (!module_autoload_on) { kernconfig_unlock(); return EPERM; } /* Disallow path separators and magic symlinks. 
*/ if (strchr(filename, '/') != NULL || strchr(filename, '@') != NULL || strchr(filename, '.') != NULL) { kernconfig_unlock(); return EPERM; } /* Authorize. */ error = kauth_authorize_system(kauth_cred_get(), KAUTH_SYSTEM_MODULE, 0, (void *)(uintptr_t)MODCTL_LOAD, (void *)(uintptr_t)1, NULL); if (error == 0) error = module_do_load(filename, false, 0, NULL, NULL, modclass, true); module_print("Autoload for `%s' requested by pid %d (%s), status %d", filename, p->p_pid, p->p_comm, error); kernconfig_unlock(); return error; } /* * module_unload: * * Find and unload a module by name. */ int module_unload(const char *name) { int error; /* Authorize. */ error = kauth_authorize_system(kauth_cred_get(), KAUTH_SYSTEM_MODULE, 0, (void *)(uintptr_t)MODCTL_UNLOAD, NULL, NULL); if (error != 0) { return error; } kernconfig_lock(); error = module_do_unload(name, true); kernconfig_unlock(); return error; } /* * module_lookup: * * Look up a module by name. */ module_t * module_lookup(const char *name) { module_t *mod; KASSERT(kernconfig_is_held()); TAILQ_FOREACH(mod, &module_list, mod_chain) { if (strcmp(mod->mod_info->mi_name, name) == 0) break; } return mod; } /* * module_hold: * * Add a single reference to a module. It's the caller's * responsibility to ensure that the reference is dropped * later. */ void module_hold(module_t *mod) { kernconfig_lock(); mod->mod_refcnt++; kernconfig_unlock(); } /* * module_rele: * * Release a reference acquired with module_hold(). */ void module_rele(module_t *mod) { kernconfig_lock(); KASSERT(mod->mod_refcnt > 0); mod->mod_refcnt--; kernconfig_unlock(); } /* * module_enqueue: * * Put a module onto the global list and update counters. */ void module_enqueue(module_t *mod) { int i; KASSERT(kernconfig_is_held()); /* * Put new entry at the head of the queue so autounload can unload * requisite modules with only one pass through the queue. */ TAILQ_INSERT_HEAD(&module_list, mod, mod_chain); if (mod->mod_nrequired) { /* Add references to the requisite modules. */ for (i = 0; i < mod->mod_nrequired; i++) { KASSERT((*mod->mod_required)[i] != NULL); (*mod->mod_required)[i]->mod_refcnt++; } } module_count++; module_gen++; } /* * Our array of required module pointers starts with zero entries. If we * need to add a new entry, and the list is already full, we reallocate a * larger array, adding MAXMODDEPS entries. */ static void alloc_required(module_t *mod) { module_t *(*new)[], *(*old)[]; int areq; int i; if (mod->mod_nrequired >= mod->mod_arequired) { areq = mod->mod_arequired + MAXMODDEPS; old = mod->mod_required; new = kmem_zalloc(areq * sizeof(module_t *), KM_SLEEP); for (i = 0; i < mod->mod_arequired; i++) (*new)[i] = (*old)[i]; mod->mod_required = new; if (old) kmem_free(old, mod->mod_arequired * sizeof(module_t *)); mod->mod_arequired = areq; } } /* * module_do_builtin: * * Initialize a module from the list of modules that are * already linked into the kernel. */ static int module_do_builtin(const module_t *pmod, const char *name, module_t **modp, prop_dictionary_t props) { const char *p, *s; char buf[MAXMODNAME]; modinfo_t *mi = NULL; module_t *mod, *mod2, *mod_loaded, *prev_active; size_t len; int error; KASSERT(kernconfig_is_held()); /* * Search the list to see if we have a module by this name. */ TAILQ_FOREACH(mod, &module_builtins, mod_chain) { if (strcmp(mod->mod_info->mi_name, name) == 0) { mi = mod->mod_info; break; } } /* * Check to see if already loaded. This might happen if we * were already loaded as a dependency. 
*/ if ((mod_loaded = module_lookup(name)) != NULL) { KASSERT(mod == NULL); if (modp) *modp = mod_loaded; return 0; } /* Note! This is from TAILQ, not immediate above */ if (mi == NULL) { /* * XXX: We'd like to panic here, but currently in some * cases (such as nfsserver + nfs), the dependee can be * successfully linked without the dependencies. */ module_error("built-in module %s can't find builtin " "dependency `%s'", pmod->mod_info->mi_name, name); return ENOENT; } /* * Initialize pre-requisites. */ KASSERT(mod->mod_required == NULL); KASSERT(mod->mod_arequired == 0); KASSERT(mod->mod_nrequired == 0); if (mi->mi_required != NULL) { for (s = mi->mi_required; *s != '\0'; s = p) { if (*s == ',') s++; p = s; while (*p != '\0' && *p != ',') p++; len = uimin(p - s + 1, sizeof(buf)); strlcpy(buf, s, len); if (buf[0] == '\0') break; alloc_required(mod); error = module_do_builtin(mod, buf, &mod2, NULL); if (error != 0) { module_error("built-in module %s prerequisite " "%s failed, error %d", name, buf, error); goto fail; } (*mod->mod_required)[mod->mod_nrequired++] = mod2; } } /* * Try to initialize the module. */ prev_active = module_active; module_active = mod; error = (*mi->mi_modcmd)(MODULE_CMD_INIT, props); module_active = prev_active; if (error != 0) { module_error("built-in module %s failed its MODULE_CMD_INIT, " "error %d", mi->mi_name, error); goto fail; } /* load always succeeds after this point */ TAILQ_REMOVE(&module_builtins, mod, mod_chain); module_builtinlist--; if (modp != NULL) { *modp = mod; } module_enqueue(mod); return 0; fail: if (mod->mod_required) kmem_free(mod->mod_required, mod->mod_arequired * sizeof(module_t *)); mod->mod_arequired = 0; mod->mod_nrequired = 0; mod->mod_required = NULL; return error; } /* * module_load_sysctl * * Check to see if a non-builtin module has any SYSCTL_SETUP() routine(s) * registered. If so, call it (them). */ static void module_load_sysctl(module_t *mod) { void (**ls_funcp)(struct sysctllog **); void *ls_start; size_t ls_size, count; int error; /* * Built-in modules don't have a mod_kobj so we cannot search * for their link_set_sysctl_funcs */ if (mod->mod_source == MODULE_SOURCE_KERNEL) return; error = kobj_find_section(mod->mod_kobj, "link_set_sysctl_funcs", &ls_start, &ls_size); if (error == 0) { count = ls_size / sizeof(ls_start); ls_funcp = ls_start; while (count--) { (**ls_funcp)(&mod->mod_sysctllog); ls_funcp++; } } } /* * module_load_evcnt * * Check to see if a non-builtin module has any static evcnt's defined; * if so, attach them. */ static void module_load_evcnt(module_t *mod) { struct evcnt * const *ls_evp; void *ls_start; size_t ls_size, count; int error; /* * Built-in modules' static evcnt stuff will be handled * automatically as part of general kernel initialization */ if (mod->mod_source == MODULE_SOURCE_KERNEL) return; error = kobj_find_section(mod->mod_kobj, "link_set_evcnts", &ls_start, &ls_size); if (error == 0) { count = ls_size / sizeof(*ls_evp); ls_evp = ls_start; while (count--) { evcnt_attach_static(*ls_evp++); } } } /* * module_unload_evcnt * * Check to see if a non-builtin module has any static evcnt's defined; * if so, detach them. 
*/ static void module_unload_evcnt(module_t *mod) { struct evcnt * const *ls_evp; void *ls_start; size_t ls_size, count; int error; /* * Built-in modules' static evcnt stuff will be handled * automatically as part of general kernel initialization */ if (mod->mod_source == MODULE_SOURCE_KERNEL) return; error = kobj_find_section(mod->mod_kobj, "link_set_evcnts", &ls_start, &ls_size); if (error == 0) { count = ls_size / sizeof(*ls_evp); ls_evp = (void *)((char *)ls_start + ls_size); while (count--) { evcnt_detach(*--ls_evp); } } } /* * module_do_load: * * Helper routine: load a module from the file system, or one * pushed by the boot loader. */ static int module_do_load(const char *name, bool isdep, int flags, prop_dictionary_t props, module_t **modp, modclass_t modclass, bool autoload) { /* The pending list for this level of recursion */ TAILQ_HEAD(pending_t, module); struct pending_t *pending; struct pending_t new_pending = TAILQ_HEAD_INITIALIZER(new_pending); /* The stack of pending lists */ static SLIST_HEAD(pend_head, pend_entry) pend_stack = SLIST_HEAD_INITIALIZER(pend_stack); struct pend_entry { SLIST_ENTRY(pend_entry) pe_entry; struct pending_t *pe_pending; } my_pend_entry; modinfo_t *mi; module_t *mod, *mod2, *prev_active; prop_dictionary_t filedict; char buf[MAXMODNAME]; const char *s, *p; int error; size_t len; KASSERT(kernconfig_is_held()); filedict = NULL; error = 0; /* * Set up the pending list for this entry. If this is an * internal entry (for a dependency), then use the same list * as for the outer call; otherwise, it's an external entry * (possibly recursive, ie a module's xxx_modcmd(init, ...) * routine called us), so use the locally allocated list. In * either case, add it to our stack. */ if (isdep) { KASSERT(SLIST_FIRST(&pend_stack) != NULL); pending = SLIST_FIRST(&pend_stack)->pe_pending; } else pending = &new_pending; my_pend_entry.pe_pending = pending; SLIST_INSERT_HEAD(&pend_stack, &my_pend_entry, pe_entry); /* * Search the list of disabled builtins first. */ TAILQ_FOREACH(mod, &module_builtins, mod_chain) { if (strcmp(mod->mod_info->mi_name, name) == 0) { break; } } if (mod) { if (ISSET(mod->mod_flags, MODFLG_MUST_FORCE) && !ISSET(flags, MODCTL_LOAD_FORCE)) { if (!autoload) { module_error("use -f to reinstate " "builtin module `%s'", name); } SLIST_REMOVE_HEAD(&pend_stack, pe_entry); return EPERM; } else { SLIST_REMOVE_HEAD(&pend_stack, pe_entry); error = module_do_builtin(mod, name, modp, props); return error; } } /* * Load the module and link. Before going to the file system, * scan the list of modules loaded by the boot loader. */ TAILQ_FOREACH(mod, &module_bootlist, mod_chain) { if (strcmp(mod->mod_info->mi_name, name) == 0) { TAILQ_REMOVE(&module_bootlist, mod, mod_chain); break; } } if (mod != NULL) { TAILQ_INSERT_TAIL(pending, mod, mod_chain); } else { /* * Check to see if module is already present. */ mod = module_lookup(name); if (mod != NULL) { if (modp != NULL) { *modp = mod; } module_print("%s module `%s' already loaded", isdep ? 
"dependent" : "requested", name); SLIST_REMOVE_HEAD(&pend_stack, pe_entry); return EEXIST; } mod = module_newmodule(MODULE_SOURCE_FILESYS); if (mod == NULL) { module_error("out of memory for `%s'", name); SLIST_REMOVE_HEAD(&pend_stack, pe_entry); return ENOMEM; } error = module_load_vfs_vec(name, flags, autoload, mod, &filedict); if (error != 0) { #ifdef DEBUG /* * The exec class of modules contains a list of * modules that is the union of all the modules * available for each architecture, so we don't * print an error if they are missing. */ if ((modclass != MODULE_CLASS_EXEC || error != ENOENT) && root_device != NULL) module_error("vfs load failed for `%s', " "error %d", name, error); #endif SLIST_REMOVE_HEAD(&pend_stack, pe_entry); module_free(mod); return error; } TAILQ_INSERT_TAIL(pending, mod, mod_chain); error = module_fetch_info(mod); if (error != 0) { module_error("cannot fetch info for `%s', error %d", name, error); goto fail; } } /* * Check compatibility. */ mi = mod->mod_info; if (strnlen(mi->mi_name, MAXMODNAME) >= MAXMODNAME) { error = EINVAL; module_error("module name `%s' longer than %d", mi->mi_name, MAXMODNAME); goto fail; } if (mi->mi_class <= MODULE_CLASS_ANY || mi->mi_class >= MODULE_CLASS_MAX) { error = EINVAL; module_error("module `%s' has invalid class %d", mi->mi_name, mi->mi_class); goto fail; } if (!module_compatible(mi->mi_version, __NetBSD_Version__)) { module_error("module `%s' built for `%d', system `%d'", mi->mi_name, mi->mi_version, __NetBSD_Version__); if (ISSET(flags, MODCTL_LOAD_FORCE)) { module_error("forced load, system may be unstable"); } else { error = EPROGMISMATCH; goto fail; } } /* * If a specific kind of module was requested, ensure that we have * a match. */ if (!MODULE_CLASS_MATCH(mi, modclass)) { module_incompat(mi, modclass); error = ENOENT; goto fail; } /* * If loading a dependency, `name' is a plain module name. * The name must match. */ if (isdep && strcmp(mi->mi_name, name) != 0) { module_error("dependency name mismatch (`%s' != `%s')", name, mi->mi_name); error = ENOENT; goto fail; } /* * If we loaded a module from the filesystem, check the actual * module name (from the modinfo_t) to ensure another module * with the same name doesn't already exist. (There's no * guarantee the filename will match the module name, and the * dup-symbols check may not be sufficient.) */ if (mod->mod_source == MODULE_SOURCE_FILESYS) { mod2 = module_lookup(mod->mod_info->mi_name); if ( mod2 && mod2 != mod) { module_error("module with name `%s' already loaded", mod2->mod_info->mi_name); error = EEXIST; if (modp != NULL) *modp = mod2; goto fail; } } /* * Block circular dependencies. */ TAILQ_FOREACH(mod2, pending, mod_chain) { if (mod == mod2) { continue; } if (strcmp(mod2->mod_info->mi_name, mi->mi_name) == 0) { error = EDEADLK; module_error("circular dependency detected for `%s'", mi->mi_name); goto fail; } } /* * Now try to load any requisite modules. 
*/ if (mi->mi_required != NULL) { mod->mod_arequired = 0; for (s = mi->mi_required; *s != '\0'; s = p) { if (*s == ',') s++; p = s; while (*p != '\0' && *p != ',') p++; len = p - s + 1; if (len >= MAXMODNAME) { error = EINVAL; module_error("required module name `%s' " "longer than %d", mi->mi_required, MAXMODNAME); goto fail; } strlcpy(buf, s, len); if (buf[0] == '\0') break; alloc_required(mod); if (strcmp(buf, mi->mi_name) == 0) { error = EDEADLK; module_error("self-dependency detected for " "`%s'", mi->mi_name); goto fail; } error = module_do_load(buf, true, flags, NULL, &mod2, MODULE_CLASS_ANY, true); if (error != 0 && error != EEXIST) { module_error("recursive load failed for `%s' " "(`%s' required), error %d", mi->mi_name, buf, error); goto fail; } (*mod->mod_required)[mod->mod_nrequired++] = mod2; } } /* * We loaded all needed modules successfully: perform global * relocations and initialize. */ { char xname[MAXMODNAME]; /* * In case of error the entire module is gone, so we * need to save its name for possible error report. */ strlcpy(xname, mi->mi_name, MAXMODNAME); error = kobj_affix(mod->mod_kobj, mi->mi_name); if (error != 0) { module_error("unable to affix module `%s', error %d", xname, error); goto fail2; } } if (filedict) { if (!module_merge_dicts(filedict, props)) { module_error("module properties failed for %s", name); error = EINVAL; goto fail; } } prev_active = module_active; module_active = mod; /* * Note that we handle sysctl and evcnt setup _before_ we * initialize the module itself. This maintains a consistent * order between built-in and run-time-loaded modules. If * initialization then fails, we'll need to undo these, too. */ module_load_sysctl(mod); /* Set-up module's sysctl if any */ module_load_evcnt(mod); /* Attach any static evcnt needed */ error = (*mi->mi_modcmd)(MODULE_CMD_INIT, filedict ? filedict : props); module_active = prev_active; if (filedict) { prop_object_release(filedict); filedict = NULL; } if (error != 0) { module_error("modcmd(CMD_INIT) failed for `%s', error %d", mi->mi_name, error); goto fail3; } /* * If a recursive load already added a module with the same * name, abort. */ mod2 = module_lookup(mi->mi_name); if (mod2 && mod2 != mod) { module_error("recursive load causes duplicate module `%s'", mi->mi_name); error = EEXIST; goto fail1; } /* * Good, the module loaded successfully. Put it onto the * list and add references to its requisite modules. */ TAILQ_REMOVE(pending, mod, mod_chain); module_enqueue(mod); if (modp != NULL) { *modp = mod; } if (autoload && module_autotime > 0) { /* * Arrange to try unloading the module after * a short delay unless auto-unload is disabled. */ mod->mod_autotime = time_second + module_autotime; SET(mod->mod_flags, MODFLG_AUTO_LOADED); module_thread_kick(); } SLIST_REMOVE_HEAD(&pend_stack, pe_entry); module_print("module `%s' loaded successfully", mi->mi_name); module_callback_load(mod); return 0; fail1: (*mi->mi_modcmd)(MODULE_CMD_FINI, NULL); fail3: /* * If there were any registered SYSCTL_SETUP funcs, make sure * we release the sysctl entries */ if (mod->mod_sysctllog) { sysctl_teardown(&mod->mod_sysctllog); } /* Also detach any static evcnt's */ module_unload_evcnt(mod); fail: kobj_unload(mod->mod_kobj); fail2: if (filedict != NULL) { prop_object_release(filedict); filedict = NULL; } TAILQ_REMOVE(pending, mod, mod_chain); SLIST_REMOVE_HEAD(&pend_stack, pe_entry); module_free(mod); return error; } /* * module_do_unload: * * Helper routine: do the dirty work of unloading a module. 
*/ static int module_do_unload(const char *name, bool load_requires_force) { module_t *mod, *prev_active; int error; u_int i; KASSERT(kernconfig_is_held()); KASSERT(name != NULL); module_print("unload requested for '%s' (%s)", name, load_requires_force ? "TRUE" : "FALSE"); mod = module_lookup(name); if (mod == NULL) { module_error("module `%s' not found", name); return ENOENT; } if (mod->mod_refcnt != 0) { module_print("module `%s' busy (%d refs)", name, mod->mod_refcnt); return EBUSY; } /* * Builtin secmodels are there to stay. */ if (mod->mod_source == MODULE_SOURCE_KERNEL && mod->mod_info->mi_class == MODULE_CLASS_SECMODEL) { module_print("cannot unload built-in secmodel module `%s'", name); return EPERM; } prev_active = module_active; module_active = mod; module_callback_unload(mod); /* let the module clean up after itself */ error = (*mod->mod_info->mi_modcmd)(MODULE_CMD_FINI, NULL); /* * If there were any registered SYSCTL_SETUP funcs, make sure * we release the sysctl entries. Same for static evcnt. */ if (error == 0) { if (mod->mod_sysctllog) { sysctl_teardown(&mod->mod_sysctllog); } module_unload_evcnt(mod); } module_active = prev_active; if (error != 0) { module_print("could not unload module `%s' error=%d", name, error); return error; } module_count--; TAILQ_REMOVE(&module_list, mod, mod_chain); for (i = 0; i < mod->mod_nrequired; i++) { (*mod->mod_required)[i]->mod_refcnt--; } module_print("unloaded module `%s'", name); if (mod->mod_kobj != NULL) { kobj_unload(mod->mod_kobj); } if (mod->mod_source == MODULE_SOURCE_KERNEL) { if (mod->mod_required != NULL) { /* * release "required" resources - will be re-parsed * if the module is re-enabled */ kmem_free(mod->mod_required, mod->mod_arequired * sizeof(module_t *)); mod->mod_nrequired = 0; mod->mod_arequired = 0; mod->mod_required = NULL; } if (load_requires_force) module_require_force(mod); TAILQ_INSERT_TAIL(&module_builtins, mod, mod_chain); module_builtinlist++; } else { module_free(mod); } module_gen++; return 0; } /* * module_prime: * * Push a module loaded by the bootloader onto our internal * list. */ int module_prime(const char *name, void *base, size_t size) { __link_set_decl(modules, modinfo_t); modinfo_t *const *mip; module_t *mod; int error; /* Check for module name same as a built-in module */ __link_set_foreach(mip, modules) { if (*mip == &module_dummy) continue; if (strcmp((*mip)->mi_name, name) == 0) { module_error("module `%s' pushed by boot loader " "already exists", name); return EEXIST; } } /* Also eliminate duplicate boolist entries */ TAILQ_FOREACH(mod, &module_bootlist, mod_chain) { if (strcmp(mod->mod_info->mi_name, name) == 0) { module_error("duplicate bootlist entry for module " "`%s'", name); return EEXIST; } } mod = module_newmodule(MODULE_SOURCE_BOOT); if (mod == NULL) { return ENOMEM; } error = kobj_load_mem(&mod->mod_kobj, name, base, size); if (error != 0) { module_free(mod); module_error("unable to load `%s' pushed by boot loader, " "error %d", name, error); return error; } error = module_fetch_info(mod); if (error != 0) { kobj_unload(mod->mod_kobj); module_free(mod); module_error("unable to fetch_info for `%s' pushed by boot " "loader, error %d", name, error); return error; } TAILQ_INSERT_TAIL(&module_bootlist, mod, mod_chain); return 0; } /* * module_fetch_into: * * Fetch modinfo record from a loaded module. */ static int module_fetch_info(module_t *mod) { int error; void *addr; size_t size; /* * Find module info record and check compatibility. 
*/ error = kobj_find_section(mod->mod_kobj, "link_set_modules", &addr, &size); if (error != 0) { module_error("`link_set_modules' section not present, " "error %d", error); return error; } if (size != sizeof(modinfo_t **)) { if (size > sizeof(modinfo_t **) && (size % sizeof(modinfo_t **)) == 0) { module_error("`link_set_modules' section wrong size " "(%zu different MODULE declarations?)", size / sizeof(modinfo_t **)); } else { module_error("`link_set_modules' section wrong size " "(got %zu, wanted %zu)", size, sizeof(modinfo_t **)); } return ENOEXEC; } mod->mod_info = *(modinfo_t **)addr; return 0; } /* * module_find_section: * * Allows a module that is being initialized to look up a section * within its ELF object. */ int module_find_section(const char *name, void **addr, size_t *size) { KASSERT(kernconfig_is_held()); KASSERT(module_active != NULL); return kobj_find_section(module_active->mod_kobj, name, addr, size); } /* * module_thread: * * Automatically unload modules. We try once to unload autoloaded * modules after module_autotime seconds. If the system is under * severe memory pressure, we'll try unloading all modules, else if * module_autotime is zero, we don't try to unload, even if the * module was previously scheduled for unload. */ static void module_thread(void *cookie) { module_t *mod, *next; modinfo_t *mi; int error; for (;;) { kernconfig_lock(); for (mod = TAILQ_FIRST(&module_list); mod != NULL; mod = next) { next = TAILQ_NEXT(mod, mod_chain); /* skip built-in modules */ if (mod->mod_source == MODULE_SOURCE_KERNEL) continue; /* skip modules that weren't auto-loaded */ if (!ISSET(mod->mod_flags, MODFLG_AUTO_LOADED)) continue; if (uvm_availmem(false) < uvmexp.freemin) { module_thread_ticks = hz; } else if (module_autotime == 0 || mod->mod_autotime == 0) { continue; } else if (time_second < mod->mod_autotime) { module_thread_ticks = hz; continue; } else { mod->mod_autotime = 0; } /* * Ask the module if it can be safely unloaded. * * - Modules which have been audited to be OK * with that will return 0. * * - Modules which have not been audited for * safe autounload will return ENOTTY. * * => With kern.module.autounload_unsafe=1, * we treat ENOTTY as acceptance. * * - Some modules would ping-ping in and out * because their use is transient but often. * Example: exec_script. Other modules may * still be in use. These modules can * prevent autounload in all cases by * returning EBUSY or some other error code. */ mi = mod->mod_info; error = (*mi->mi_modcmd)(MODULE_CMD_AUTOUNLOAD, NULL); if (error == 0 || (error == ENOTTY && module_autounload_unsafe)) { (void)module_do_unload(mi->mi_name, false); } else module_print("module `%s' declined to be " "auto-unloaded error=%d", mi->mi_name, error); } kernconfig_unlock(); mutex_enter(&module_thread_lock); (void)cv_timedwait(&module_thread_cv, &module_thread_lock, module_thread_ticks); module_thread_ticks = 0; mutex_exit(&module_thread_lock); } } /* * module_thread: * * Kick the module thread into action, perhaps because the * system is low on memory. */ void module_thread_kick(void) { mutex_enter(&module_thread_lock); module_thread_ticks = hz; cv_broadcast(&module_thread_cv); mutex_exit(&module_thread_lock); } #ifdef DDB /* * module_whatis: * * Helper routine for DDB. 
*/ void module_whatis(uintptr_t addr, void (*pr)(const char *, ...)) { module_t *mod; size_t msize; vaddr_t maddr; TAILQ_FOREACH(mod, &module_list, mod_chain) { if (mod->mod_kobj == NULL) { continue; } if (kobj_stat(mod->mod_kobj, &maddr, &msize) != 0) continue; if (addr < maddr || addr >= maddr + msize) { continue; } (*pr)("%p is %p+%zu, in kernel module `%s'\n", (void *)addr, (void *)maddr, (size_t)(addr - maddr), mod->mod_info->mi_name); } } /* * module_print_list: * * Helper routine for DDB. */ void module_print_list(void (*pr)(const char *, ...)) { const char *src; module_t *mod; size_t msize; vaddr_t maddr; (*pr)("%16s %16s %8s %8s\n", "NAME", "TEXT/DATA", "SIZE", "SOURCE"); TAILQ_FOREACH(mod, &module_list, mod_chain) { switch (mod->mod_source) { case MODULE_SOURCE_KERNEL: src = "builtin"; break; case MODULE_SOURCE_FILESYS: src = "filesys"; break; case MODULE_SOURCE_BOOT: src = "boot"; break; default: src = "unknown"; break; } if (mod->mod_kobj == NULL) { maddr = 0; msize = 0; } else if (kobj_stat(mod->mod_kobj, &maddr, &msize) != 0) continue; (*pr)("%16s %16lx %8ld %8s\n", mod->mod_info->mi_name, (long)maddr, (long)msize, src); } } #endif /* DDB */ static bool module_merge_dicts(prop_dictionary_t existing_dict, const prop_dictionary_t new_dict) { prop_dictionary_keysym_t props_keysym; prop_object_iterator_t props_iter; prop_object_t props_obj; const char *props_key; bool error; if (new_dict == NULL) { /* nothing to merge */ return true; } error = false; props_iter = prop_dictionary_iterator(new_dict); if (props_iter == NULL) { return false; } while ((props_obj = prop_object_iterator_next(props_iter)) != NULL) { props_keysym = (prop_dictionary_keysym_t)props_obj; props_key = prop_dictionary_keysym_value(props_keysym); props_obj = prop_dictionary_get_keysym(new_dict, props_keysym); if ((props_obj == NULL) || !prop_dictionary_set(existing_dict, props_key, props_obj)) { error = true; goto out; } } error = false; out: prop_object_iterator_release(props_iter); return !error; } /* * module_specific_key_create: * * Create a key for subsystem module-specific data. */ specificdata_key_t module_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor) { return specificdata_key_create(module_specificdata_domain, keyp, dtor); } /* * module_specific_key_delete: * * Delete a key for subsystem module-specific data. */ void module_specific_key_delete(specificdata_key_t key) { return specificdata_key_delete(module_specificdata_domain, key); } /* * module_getspecific: * * Return module-specific data corresponding to the specified key. */ void * module_getspecific(module_t *mod, specificdata_key_t key) { return specificdata_getspecific(module_specificdata_domain, &mod->mod_sdref, key); } /* * module_setspecific: * * Set module-specific data corresponding to the specified key. */ void module_setspecific(module_t *mod, specificdata_key_t key, void *data) { specificdata_setspecific(module_specificdata_domain, &mod->mod_sdref, key, data); } /* * module_register_callbacks: * * Register a new set of callbacks to be called on module load/unload. * Call the load callback on each existing module. * Return an opaque handle for unregistering these later. 
*/ void * module_register_callbacks(void (*load)(struct module *), void (*unload)(struct module *)) { struct module_callbacks *modcb; struct module *mod; modcb = kmem_alloc(sizeof(*modcb), KM_SLEEP); modcb->modcb_load = load; modcb->modcb_unload = unload; kernconfig_lock(); TAILQ_INSERT_TAIL(&modcblist, modcb, modcb_list); TAILQ_FOREACH_REVERSE(mod, &module_list, modlist, mod_chain) load(mod); kernconfig_unlock(); return modcb; } /* * module_unregister_callbacks: * * Unregister a previously-registered set of module load/unload callbacks. * Call the unload callback on each existing module. */ void module_unregister_callbacks(void *opaque) { struct module_callbacks *modcb; struct module *mod; modcb = opaque; kernconfig_lock(); TAILQ_FOREACH(mod, &module_list, mod_chain) modcb->modcb_unload(mod); TAILQ_REMOVE(&modcblist, modcb, modcb_list); kernconfig_unlock(); kmem_free(modcb, sizeof(*modcb)); } /* * module_callback_load: * * Helper routine: call all load callbacks on a module being loaded. */ static void module_callback_load(struct module *mod) { struct module_callbacks *modcb; TAILQ_FOREACH(modcb, &modcblist, modcb_list) { modcb->modcb_load(mod); } } /* * module_callback_unload: * * Helper routine: call all unload callbacks on a module being unloaded. */ static void module_callback_unload(struct module *mod) { struct module_callbacks *modcb; TAILQ_FOREACH(modcb, &modcblist, modcb_list) { modcb->modcb_unload(mod); } }
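/*
 * Hedged sketch (not part of kern_module.c): the modcmd protocol that
 * module_do_load(), module_do_unload() and the autounload thread above
 * drive.  The module name "example" and its dependency string "dep1,dep2"
 * are invented for illustration; MODULE(), modcmd_t and the MODULE_CMD_*
 * values are the ones used by the code above (compare cons_modcmd()).
 */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/module.h>

MODULE(MODULE_CLASS_MISC, example, "dep1,dep2");	/* hypothetical deps */

static int
example_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		/* Run by module_do_load() after "dep1" and "dep2" loaded. */
		return 0;
	case MODULE_CMD_FINI:
		/* Run by module_do_unload(); return EBUSY to refuse. */
		return 0;
	case MODULE_CMD_AUTOUNLOAD:
		/*
		 * Asked by module_thread() once module_autotime seconds
		 * have passed; ENOTTY means "not audited for autounload"
		 * and is treated as acceptance only when
		 * kern.module.autounload_unsafe=1 is set.
		 */
		return ENOTTY;
	default:
		return ENOTTY;
	}
}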
/* $NetBSD: mount.h,v 1.16 2024/01/19 18:39:15 christos Exp $ */ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mount.h 8.21 (Berkeley) 5/20/95 */ #ifndef _COMPAT_SYS_MOUNT_H_ #define _COMPAT_SYS_MOUNT_H_ #ifdef _KERNEL_OPT #include "opt_compat_43.h" #endif #define MFSNAMELEN 16 struct statfs12 { short f_type; /* type of file system */ u_short f_oflags; /* deprecated copy of mount flags */ long f_bsize; /* fundamental file system block size */ long f_iosize; /* optimal transfer block size */ long f_blocks; /* total data blocks in file system */ long f_bfree; /* free blocks in fs */ long f_bavail; /* free blocks avail to non-superuser */ long f_files; /* total file nodes in file system */ long f_ffree; /* free file nodes in fs */ fsid_t f_fsid; /* file system id */ uid_t f_owner; /* user that mounted the file system */ long f_flags; /* copy of mount flags */ long f_syncwrites; /* count of sync writes since mount */ long f_asyncwrites; /* count of async writes since mount */ long f_spare[1]; /* spare for later */ char f_fstypename[MFSNAMELEN]; /* fs type name */ char f_mntonname[MNAMELEN]; /* directory on which mounted */ char f_mntfromname[MNAMELEN]; /* mounted file system */ }; #ifndef _KERNEL #include <string.h> #endif /* * Operations supported on mounted file system.
*/ /* * Convert from a new statvfs to an old statfs structure. */ #define MOUNTNO_NONE 0 #define MOUNTNO_UFS 1 /* UNIX "Fast" Filesystem */ #define MOUNTNO_NFS 2 /* Network Filesystem */ #define MOUNTNO_MFS 3 /* Memory Filesystem */ #define MOUNTNO_MSDOS 4 /* MSDOS Filesystem */ #define MOUNTNO_CD9660 5 /* iso9660 cdrom */ #define MOUNTNO_FDESC 6 /* /dev/fd filesystem */ #define MOUNTNO_KERNFS 7 /* kernel variable filesystem */ #define MOUNTNO_DEVFS 8 /* device node filesystem */ #define MOUNTNO_AFS 9 /* AFS 3.x */ static const struct { const char *name; const int value; } __nv[] = { { MOUNT_UFS, MOUNTNO_UFS }, { MOUNT_NFS, MOUNTNO_NFS }, { MOUNT_MFS, MOUNTNO_MFS }, { MOUNT_MSDOS, MOUNTNO_MSDOS }, { MOUNT_CD9660, MOUNTNO_CD9660 }, { MOUNT_FDESC, MOUNTNO_FDESC }, { MOUNT_KERNFS, MOUNTNO_KERNFS }, { MOUNT_AFS, MOUNTNO_AFS }, }; static __inline void statvfs_to_statfs12(const struct statvfs *fs, struct statfs12 *s12) { size_t i = 0; memset(s12, 0, sizeof(*s12)); s12->f_type = 0; s12->f_oflags = (short)fs->f_flag; for (i = 0; i < sizeof(__nv) / sizeof(__nv[0]); i++) { if (strcmp(__nv[i].name, fs->f_fstypename) == 0) { s12->f_type = __nv[i].value; break; } } #define __STATFSCLAMP(a) (long)(((a) & ~LONG_MAX) ? LONG_MAX : (a)) s12->f_bsize = __STATFSCLAMP(fs->f_frsize); s12->f_iosize = __STATFSCLAMP(fs->f_iosize); s12->f_blocks = __STATFSCLAMP(fs->f_blocks); s12->f_bfree = __STATFSCLAMP(fs->f_bfree); if (fs->f_bfree > fs->f_bresvd) s12->f_bavail = __STATFSCLAMP(fs->f_bfree - fs->f_bresvd); else s12->f_bavail = -__STATFSCLAMP(fs->f_bresvd - fs->f_bfree); s12->f_files = __STATFSCLAMP(fs->f_files); s12->f_ffree = __STATFSCLAMP(fs->f_ffree); s12->f_fsid = fs->f_fsidx; s12->f_owner = fs->f_owner; s12->f_flags = (long)fs->f_flag; s12->f_syncwrites = __STATFSCLAMP(fs->f_syncwrites); s12->f_asyncwrites = __STATFSCLAMP(fs->f_asyncwrites); memcpy(s12->f_fstypename, fs->f_fstypename, sizeof(s12->f_fstypename)); memcpy(s12->f_mntonname, fs->f_mntonname, sizeof(s12->f_mntonname)); memcpy(s12->f_mntfromname, fs->f_mntfromname, sizeof(s12->f_mntfromname)); } #ifdef _KERNEL static __inline int statvfs_to_statfs12_copy(const void *vs, void *vs12, size_t l) { struct statfs12 *s12 = kmem_zalloc(sizeof(*s12), KM_SLEEP); int error; statvfs_to_statfs12(vs, s12); error = copyout(s12, vs12, sizeof(*s12)); kmem_free(s12, sizeof(*s12)); return error; } /* * Filesystem configuration information. Not used by NetBSD, but * defined here to provide a compatible sysctl interface to Lite2. 
*/ struct vfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ int (*vfc_mountroot)(void); /* if != NULL, routine to mount root */ struct vfsconf *vfc_next; /* next in list */ }; /* Old, fixed size filehandle structures (used upto (including) 3.x) */ struct compat_30_fid { unsigned short fid_len; unsigned short fid_reserved; char fid_data[16]; }; struct compat_30_fhandle { fsid_t fh_fsid; struct compat_30_fid fh_fid; }; #else __BEGIN_DECLS int __compat_fstatfs(int, struct statfs12 *) __dso_hidden; int __compat_getfsstat(struct statfs12 *, long, int) __dso_hidden; int __compat_statfs(const char *, struct statfs12 *) __dso_hidden; int __compat_getmntinfo(struct statfs12 **, int) __dso_hidden; #if defined(_NETBSD_SOURCE) struct compat_30_fhandle; int __compat_fhstatfs(const struct compat_30_fhandle *, struct statfs12 *) __dso_hidden; struct stat13; int __compat_fhstat(const struct compat_30_fhandle *, struct stat13 *) __dso_hidden; struct stat30; int __compat___fhstat30(const struct compat_30_fhandle *, struct stat30 *) __dso_hidden; int __compat___fhstat40(const void *, size_t, struct stat30 *) __dso_hidden; struct stat; int __fhstat50(const void *, size_t, struct stat *); int __fhopen40(const void *, size_t, int); int fhopen(const struct compat_30_fhandle *, int); int __getfh30(const char *, void*, size_t *); int getfh(const char *path, struct compat_30_fhandle *fhp); int mount(const char *, const char *, int, void *); int __mount50(const char *, const char *, int, void *, size_t); #endif /* _NETBSD_SOURCE */ __END_DECLS #endif /* _KERNEL */ #endif /* !_COMPAT_SYS_MOUNT_H_ */
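/*
 * Editor's illustration -- not part of the NetBSD sources.  A small
 * standalone program restating the saturation that __STATFSCLAMP above
 * applies when statvfs_to_statfs12() narrows the 64-bit statvfs counters
 * into the `long' fields of struct statfs12: any value with bits outside
 * the positive range of `long' becomes LONG_MAX instead of wrapping to a
 * negative number.  The helper name clamp_to_long() is invented here.
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

static long
clamp_to_long(uint64_t v)
{
	/* Same test as __STATFSCLAMP: high bits set => saturate. */
	return (v & ~(uint64_t)LONG_MAX) ? LONG_MAX : (long)v;
}

int
main(void)
{
	/* A block count that fits comfortably passes through unchanged. */
	printf("%ld\n", clamp_to_long(UINT64_C(1572864)));
	/* A counter with high bits set saturates rather than going negative. */
	printf("%ld\n", clamp_to_long(UINT64_MAX));
	return 0;
}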
/* $NetBSD: nfs_vfsops.c,v 1.245 2023/03/21 15:47:46 christos Exp $ */ /* * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)nfs_vfsops.c 8.12 (Berkeley) 5/20/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: nfs_vfsops.c,v 1.245 2023/03/21 15:47:46 christos Exp $"); #if defined(_KERNEL_OPT) #include "opt_nfs.h" #endif #include <sys/param.h> #include <sys/ioctl.h> #include <sys/signal.h> #include <sys/proc.h> #include <sys/namei.h> #include <sys/device.h> #include <sys/vnode.h> #include <sys/kernel.h> #include <sys/mount.h> #include <sys/buf.h> #include <sys/mbuf.h> #include <sys/dirent.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/timetc.h> #include <sys/kauth.h> #include <sys/module.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <nfs/rpcv2.h> #include <nfs/nfsproto.h> #include <nfs/nfsnode.h> #include <nfs/nfs.h> #include <nfs/nfsmount.h> #include <nfs/xdr_subs.h> #include <nfs/nfsm_subs.h> #include <nfs/nfsdiskless.h> #include <nfs/nfs_var.h> MODULE(MODULE_CLASS_VFS, nfs, NULL); extern struct nfsstats nfsstats; extern int nfs_ticks; /* * keep a count of the nfs mounts to generate ficticious drive names * for the per drive stats. */ unsigned int nfs_mount_count = 0; int nfs_commitsize; /* * nfs vfs operations. */ extern const struct vnodeopv_desc nfsv2_vnodeop_opv_desc; extern const struct vnodeopv_desc spec_nfsv2nodeop_opv_desc; extern const struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc; const struct vnodeopv_desc * const nfs_vnodeopv_descs[] = { &nfsv2_vnodeop_opv_desc, &spec_nfsv2nodeop_opv_desc, &fifo_nfsv2nodeop_opv_desc, NULL, }; struct vfsops nfs_vfsops = { .vfs_name = MOUNT_NFS, .vfs_min_mount_data = sizeof (struct nfs_args), .vfs_mount = nfs_mount, .vfs_start = nfs_start, .vfs_unmount = nfs_unmount, .vfs_root = nfs_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = nfs_statvfs, .vfs_sync = nfs_sync, .vfs_loadvnode = nfs_loadvnode, .vfs_vget = nfs_vget, .vfs_fhtovp = nfs_fhtovp, .vfs_vptofh = nfs_vptofh, .vfs_init = nfs_vfs_init, .vfs_done = nfs_vfs_done, .vfs_mountroot = nfs_mountroot, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = nfs_vnodeopv_descs }; extern u_int32_t nfs_procids[NFS_NPROCS]; extern u_int32_t nfs_prog, nfs_vers; static int nfs_mount_diskless(struct nfs_dlmount *, const char *, struct mount **, struct vnode **, struct lwp *); static int nfs_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&nfs_vfsops); return error; case MODULE_CMD_FINI: error = vfs_detach(&nfs_vfsops); return error; default: return ENOTTY; } } /* * nfs statvfs call */ int nfs_statvfs(struct mount *mp, struct statvfs *sbp) { struct lwp *l = curlwp; struct vnode *vp; struct nfs_statfs *sfp; char *cp; u_int32_t *tl; int32_t t1, t2; char *bpos, *dpos, *cp2; struct nfsmount *nmp = VFSTONFS(mp); int error = 0, retattr; #ifdef NFS_V2_ONLY const int v3 = 0; #else int v3 = (nmp->nm_flag & NFSMNT_NFSV3); #endif struct mbuf *mreq, *mrep = NULL, *md, *mb; kauth_cred_t cred; u_quad_t tquad; struct nfsnode *np; #ifndef nolint sfp = (struct nfs_statfs *)0; #endif vp = nmp->nm_vnode; np = VTONFS(vp); cred = kauth_cred_alloc(); #ifndef NFS_V2_ONLY if (v3 && (nmp->nm_iflag & NFSMNT_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, cred, l); #endif nfsstats.rpccnt[NFSPROC_FSSTAT]++; nfsm_reqhead(np, NFSPROC_FSSTAT, NFSX_FH(v3)); nfsm_fhtom(np, v3); nfsm_request(np, 
NFSPROC_FSSTAT, l, cred); if (v3) nfsm_postop_attr(vp, retattr, 0); if (error) { if (mrep != NULL) { if (mrep->m_next != NULL) printf("nfs_vfsops: nfs_statvfs would lose buffers\n"); m_freem(mrep); } goto nfsmout; } nfsm_dissect(sfp, struct nfs_statfs *, NFSX_STATFS(v3)); sbp->f_flag = nmp->nm_flag; sbp->f_iosize = uimin(nmp->nm_rsize, nmp->nm_wsize); if (v3) { sbp->f_frsize = sbp->f_bsize = NFS_FABLKSIZE; tquad = fxdr_hyper(&sfp->sf_tbytes); sbp->f_blocks = ((quad_t)tquad / (quad_t)NFS_FABLKSIZE); tquad = fxdr_hyper(&sfp->sf_fbytes); sbp->f_bfree = ((quad_t)tquad / (quad_t)NFS_FABLKSIZE); tquad = fxdr_hyper(&sfp->sf_abytes); tquad = ((quad_t)tquad / (quad_t)NFS_FABLKSIZE); sbp->f_bresvd = sbp->f_bfree - tquad; sbp->f_bavail = tquad; /* Handle older NFS servers returning negative values */ if ((quad_t)sbp->f_bavail < 0) sbp->f_bavail = 0; tquad = fxdr_hyper(&sfp->sf_tfiles); sbp->f_files = tquad; tquad = fxdr_hyper(&sfp->sf_ffiles); sbp->f_ffree = tquad; sbp->f_favail = tquad; sbp->f_fresvd = 0; } else { sbp->f_bsize = NFS_FABLKSIZE; sbp->f_frsize = fxdr_unsigned(int32_t, sfp->sf_bsize); sbp->f_blocks = fxdr_unsigned(int32_t, sfp->sf_blocks); sbp->f_bfree = fxdr_unsigned(int32_t, sfp->sf_bfree); sbp->f_bavail = fxdr_unsigned(int32_t, sfp->sf_bavail); sbp->f_fresvd = 0; sbp->f_files = 0; sbp->f_ffree = 0; sbp->f_favail = 0; sbp->f_fresvd = 0; } copy_statvfs_info(sbp, mp); nfsm_reqdone; kauth_cred_free(cred); return (error); } #ifndef NFS_V2_ONLY /* * nfs version 3 fsinfo rpc call */ int nfs_fsinfo(struct nfsmount *nmp, struct vnode *vp, kauth_cred_t cred, struct lwp *l) { struct nfsv3_fsinfo *fsp; char *cp; int32_t t1, t2; u_int32_t *tl, pref, xmax; char *bpos, *dpos, *cp2; int error = 0, retattr; struct mbuf *mreq, *mrep, *md, *mb; u_int64_t maxfsize; struct nfsnode *np = VTONFS(vp); nfsstats.rpccnt[NFSPROC_FSINFO]++; nfsm_reqhead(np, NFSPROC_FSINFO, NFSX_FH(1)); nfsm_fhtom(np, 1); nfsm_request(np, NFSPROC_FSINFO, l, cred); nfsm_postop_attr(vp, retattr, 0); if (!error) { nfsm_dissect(fsp, struct nfsv3_fsinfo *, NFSX_V3FSINFO); pref = fxdr_unsigned(u_int32_t, fsp->fs_wtpref); if ((nmp->nm_flag & NFSMNT_WSIZE) == 0 && pref < nmp->nm_wsize && pref >= NFS_FABLKSIZE) nmp->nm_wsize = (pref + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); xmax = fxdr_unsigned(u_int32_t, fsp->fs_wtmax); if (xmax < nmp->nm_wsize && xmax > 0) { nmp->nm_wsize = xmax & ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize == 0) nmp->nm_wsize = xmax; } pref = fxdr_unsigned(u_int32_t, fsp->fs_rtpref); if ((nmp->nm_flag & NFSMNT_RSIZE) == 0 && pref < nmp->nm_rsize && pref >= NFS_FABLKSIZE) nmp->nm_rsize = (pref + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); xmax = fxdr_unsigned(u_int32_t, fsp->fs_rtmax); if (xmax < nmp->nm_rsize && xmax > 0) { nmp->nm_rsize = xmax & ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize == 0) nmp->nm_rsize = xmax; } pref = fxdr_unsigned(u_int32_t, fsp->fs_dtpref); if (pref < nmp->nm_readdirsize && pref >= NFS_DIRFRAGSIZ) nmp->nm_readdirsize = (pref + NFS_DIRFRAGSIZ - 1) & ~(NFS_DIRFRAGSIZ - 1); if (xmax < nmp->nm_readdirsize && xmax > 0) { nmp->nm_readdirsize = xmax & ~(NFS_DIRFRAGSIZ - 1); if (nmp->nm_readdirsize == 0) nmp->nm_readdirsize = xmax; } nmp->nm_maxfilesize = 0xffffffffffffffffull; maxfsize = fxdr_hyper(&fsp->fs_maxfilesize); if (maxfsize > 0 && maxfsize < nmp->nm_maxfilesize) nmp->nm_maxfilesize = maxfsize; nmp->nm_mountp->mnt_fs_bshift = ffs(MIN(nmp->nm_rsize, nmp->nm_wsize)) - 1; nmp->nm_iflag |= NFSMNT_GOTFSINFO; } nfsm_reqdone; return (error); } #endif /* * Mount a remote root fs via. NFS. 
It goes like this: * - Call nfs_boot_init() to fill in the nfs_diskless struct * - build the rootfs mount point and call mountnfs() to do the rest. */ int nfs_mountroot(void) { struct timespec ts; struct nfs_diskless *nd; struct vattr attr; struct mount *mp; struct vnode *vp; struct lwp *l; time_t n; int error; l = curlwp; /* XXX */ if (device_class(root_device) != DV_IFNET) return (ENODEV); /* * XXX time must be non-zero when we init the interface or else * the arp code will wedge. [Fixed now in if_ether.c] * However, the NFS attribute cache gives false "hits" when the * current time < nfs_attrtimeo(nmp, np) so keep this in for now. */ if (time_second < NFS_MAXATTRTIMO) { ts.tv_sec = NFS_MAXATTRTIMO; ts.tv_nsec = 0; tc_setclock(&ts); } /* * Call nfs_boot_init() to fill in the nfs_diskless struct. * Side effect: Finds and configures a network interface. */ nd = kmem_zalloc(sizeof(*nd), KM_SLEEP); error = nfs_boot_init(nd, l); if (error) { kmem_free(nd, sizeof(*nd)); return (error); } /* * Create the root mount point. */ error = nfs_mount_diskless(&nd->nd_root, "/", &mp, &vp, l); if (error) goto out; printf("root on %s\n", nd->nd_root.ndm_host); /* * Link it into the mount list. */ mountlist_append(mp); rootvp = vp; mp->mnt_vnodecovered = NULLVP; vfs_unbusy(mp); /* Get root attributes (for the time). */ vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &attr, l->l_cred); VOP_UNLOCK(vp); if (error) panic("nfs_mountroot: getattr for root"); n = attr.va_atime.tv_sec; #ifdef DEBUG printf("root time: 0x%jx\n", (intmax_t)n); #endif setrootfstime(n); out: if (error) nfs_boot_cleanup(nd, l); kmem_free(nd, sizeof(*nd)); return (error); } /* * Internal version of mount system call for diskless setup. * Separate function because we used to call it twice. * (once for root and once for swap) */ static int nfs_mount_diskless(struct nfs_dlmount *ndmntp, const char *mntname, struct mount **mpp, struct vnode **vpp, struct lwp *l) /* mntname: mount point name */ { struct mount *mp; struct mbuf *m; int error; vfs_rootmountalloc(MOUNT_NFS, mntname, &mp); mp->mnt_op = &nfs_vfsops; /* * Historical practice expects NFS root file systems to * be initially mounted r/w. */ mp->mnt_flag &= ~MNT_RDONLY; /* Get mbuf for server sockaddr. */ m = m_get(M_WAIT, MT_SONAME); if (m == NULL) panic("nfs_mountroot: mget soname for %s", mntname); MCLAIM(m, &nfs_mowner); memcpy(mtod(m, void *), (void *)ndmntp->ndm_args.addr, (m->m_len = ndmntp->ndm_args.addr->sa_len)); error = mountnfs(&ndmntp->ndm_args, mp, m, mntname, ndmntp->ndm_args.hostname, vpp, l); if (error) { vfs_unbusy(mp); vfs_rele(mp); printf("nfs_mountroot: mount %s failed: %d\n", mntname, error); } else *mpp = mp; return (error); } void nfs_decode_args(struct nfsmount *nmp, struct nfs_args *argp, struct lwp *l) { int s; int adjsock; int maxio; s = splsoftnet(); /* * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes * no sense in that context. */ if (argp->sotype == SOCK_STREAM) argp->flags &= ~NFSMNT_NOCONN; /* * Cookie translation is not needed for v2, silently ignore it. */ if ((argp->flags & (NFSMNT_XLATECOOKIE|NFSMNT_NFSV3)) == NFSMNT_XLATECOOKIE) argp->flags &= ~NFSMNT_XLATECOOKIE; /* Re-bind if rsrvd port requested and wasn't on one */ adjsock = !(nmp->nm_flag & NFSMNT_RESVPORT) && (argp->flags & NFSMNT_RESVPORT); /* Also re-bind if we're switching to/from a connected UDP socket */ adjsock |= ((nmp->nm_flag & NFSMNT_NOCONN) != (argp->flags & NFSMNT_NOCONN)); /* Update flags. 
*/ nmp->nm_flag = argp->flags; splx(s); if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; if (nmp->nm_timeo < NFS_MINTIMEO) nmp->nm_timeo = NFS_MINTIMEO; else if (nmp->nm_timeo > NFS_MAXTIMEO) nmp->nm_timeo = NFS_MAXTIMEO; } if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { nmp->nm_retry = argp->retrans; if (nmp->nm_retry > NFS_MAXREXMIT) nmp->nm_retry = NFS_MAXREXMIT; } #ifndef NFS_V2_ONLY if (argp->flags & NFSMNT_NFSV3) { if (argp->sotype == SOCK_DGRAM) maxio = NFS_MAXDGRAMDATA; else maxio = NFS_MAXDATA; } else #endif maxio = NFS_V2MAXDATA; if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { int osize = nmp->nm_wsize; nmp->nm_wsize = argp->wsize; /* Round down to multiple of blocksize */ nmp->nm_wsize &= ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize <= 0) nmp->nm_wsize = NFS_FABLKSIZE; adjsock |= (nmp->nm_wsize != osize); } if (nmp->nm_wsize > maxio) nmp->nm_wsize = maxio; if (nmp->nm_wsize > MAXBSIZE) nmp->nm_wsize = MAXBSIZE; if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { int osize = nmp->nm_rsize; nmp->nm_rsize = argp->rsize; /* Round down to multiple of blocksize */ nmp->nm_rsize &= ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize <= 0) nmp->nm_rsize = NFS_FABLKSIZE; adjsock |= (nmp->nm_rsize != osize); } if (nmp->nm_rsize > maxio) nmp->nm_rsize = maxio; if (nmp->nm_rsize > MAXBSIZE) nmp->nm_rsize = MAXBSIZE; if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) { nmp->nm_readdirsize = argp->readdirsize; /* Round down to multiple of minimum blocksize */ nmp->nm_readdirsize &= ~(NFS_DIRFRAGSIZ - 1); if (nmp->nm_readdirsize < NFS_DIRFRAGSIZ) nmp->nm_readdirsize = NFS_DIRFRAGSIZ; /* Bigger than buffer size makes no sense */ if (nmp->nm_readdirsize > NFS_DIRBLKSIZ) nmp->nm_readdirsize = NFS_DIRBLKSIZ; } else if (argp->flags & NFSMNT_RSIZE) nmp->nm_readdirsize = nmp->nm_rsize; if (nmp->nm_readdirsize > maxio) nmp->nm_readdirsize = maxio; if ((argp->flags & NFSMNT_MAXGRPS) && argp->maxgrouplist >= 0 && argp->maxgrouplist <= NFS_MAXGRPS) nmp->nm_numgrps = argp->maxgrouplist; if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0 && argp->readahead <= NFS_MAXRAHEAD) nmp->nm_readahead = argp->readahead; if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 1 && argp->deadthresh <= NFS_NEVERDEAD) nmp->nm_deadthresh = argp->deadthresh; adjsock |= ((nmp->nm_sotype != argp->sotype) || (nmp->nm_soproto != argp->proto)); nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; if (nmp->nm_so && adjsock) { nfs_safedisconnect(nmp); if (nmp->nm_sotype == SOCK_DGRAM) while (nfs_connect(nmp, (struct nfsreq *)0, l)) { printf("nfs_args: retrying connect\n"); kpause("nfscn3", false, hz, NULL); } } } /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * memcpy() them in mountnfs(), but I wanted to detect errors before * doing the sockargs() call because sockargs() allocates an mbuf and * an error after that means that I have to release the mbuf. 
*/ /* ARGSUSED */ int nfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; int error; struct nfs_args *args = data; struct mbuf *nam; struct nfsmount *nmp = VFSTONFS(mp); struct sockaddr *sa; struct vnode *vp; char *pth, *hst; size_t len; u_char *nfh; if (args == NULL) return EINVAL; if (*data_len < sizeof *args) return EINVAL; if (mp->mnt_flag & MNT_GETARGS) { if (nmp == NULL) return (EIO); if (args->addr != NULL) { sa = mtod(nmp->nm_nam, struct sockaddr *); error = copyout(sa, args->addr, sa->sa_len); if (error) return (error); args->addrlen = sa->sa_len; } else args->addrlen = 0; args->version = NFS_ARGSVERSION; args->sotype = nmp->nm_sotype; args->proto = nmp->nm_soproto; args->fh = NULL; args->fhsize = 0; args->flags = nmp->nm_flag; args->wsize = nmp->nm_wsize; args->rsize = nmp->nm_rsize; args->readdirsize = nmp->nm_readdirsize; args->timeo = nmp->nm_timeo; args->retrans = nmp->nm_retry; args->maxgrouplist = nmp->nm_numgrps; args->readahead = nmp->nm_readahead; args->leaseterm = 0; /* dummy */ args->deadthresh = nmp->nm_deadthresh; args->hostname = NULL; *data_len = sizeof *args; return 0; } if (args->version != NFS_ARGSVERSION) return (EPROGMISMATCH); if (args->flags & (NFSMNT_NQNFS|NFSMNT_KERB)) return (EPROGUNAVAIL); #ifdef NFS_V2_ONLY if (args->flags & NFSMNT_NFSV3) return (EPROGMISMATCH); #endif if (mp->mnt_flag & MNT_UPDATE) { if (nmp == NULL) return (EIO); /* * When doing an update, we can't change from or to * v3, or change cookie translation */ args->flags = (args->flags & ~(NFSMNT_NFSV3|NFSMNT_XLATECOOKIE)) | (nmp->nm_flag & (NFSMNT_NFSV3|NFSMNT_XLATECOOKIE)); nfs_decode_args(nmp, args, l); return (0); } if (args->fhsize < 0 || args->fhsize > NFSX_V3FHMAX) return (EINVAL); nfh = malloc(NFSX_V3FHMAX, M_TEMP, M_WAITOK); error = copyin(args->fh, nfh, args->fhsize); if (error) goto free_nfh; pth = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(path, pth, MNAMELEN - 1, &len); if (error) goto free_pth; memset(&pth[len], 0, MNAMELEN - len); hst = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(args->hostname, hst, MNAMELEN - 1, &len); if (error) goto free_hst; memset(&hst[len], 0, MNAMELEN - len); /* sockargs() call must be after above copyin() calls */ error = sockargs(&nam, args->addr, args->addrlen, UIO_USERSPACE, MT_SONAME); if (error) goto free_hst; MCLAIM(nam, &nfs_mowner); args->fh = nfh; error = mountnfs(args, mp, nam, pth, hst, &vp, l); free_hst: free(hst, M_TEMP); free_pth: free(pth, M_TEMP); free_nfh: free(nfh, M_TEMP); return (error); } /* * Common code for mount and mountroot */ int mountnfs(struct nfs_args *argp, struct mount *mp, struct mbuf *nam, const char *pth, const char *hst, struct vnode **vpp, struct lwp *l) { struct nfsmount *nmp; struct nfsnode *np; struct vnode *vp; int error; struct vattr *attrs; kauth_cred_t cr; char iosname[IOSTATNAMELEN]; /* * If the number of nfs iothreads to use has never * been set, create a reasonable number of them. 
*/ if (nfs_niothreads < 0) { nfs_set_niothreads(NFS_DEFAULT_NIOTHREADS); } if (mp->mnt_flag & MNT_UPDATE) { nmp = VFSTONFS(mp); /* update paths, file handles, etc, here XXX */ m_freem(nam); return 0; } nmp = kmem_zalloc(sizeof(*nmp), KM_SLEEP); TAILQ_INIT(&nmp->nm_uidlruhead); TAILQ_INIT(&nmp->nm_bufq); rw_init(&nmp->nm_writeverflock); mutex_init(&nmp->nm_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&nmp->nm_rcvcv, "nfsrcv"); cv_init(&nmp->nm_sndcv, "nfssnd"); cv_init(&nmp->nm_aiocv, "nfsaio"); cv_init(&nmp->nm_disconcv, "nfsdis"); mp->mnt_data = nmp; mp->mnt_stat.f_namemax = NFS_MAXNAMLEN; vfs_getnewfsid(mp); nmp->nm_mountp = mp; #ifndef NFS_V2_ONLY if ((argp->flags & NFSMNT_NFSV3) == 0) #endif { if (argp->fhsize != NFSX_V2FH) { return EINVAL; } } /* * V2 can only handle 32 bit filesizes. For v3, nfs_fsinfo * will overwrite this. */ nmp->nm_maxfilesize = 0xffffffffLL; nmp->nm_timeo = NFS_TIMEO; nmp->nm_retry = NFS_RETRANS; nmp->nm_wsize = NFS_WSIZE; nmp->nm_rsize = NFS_RSIZE; nmp->nm_readdirsize = NFS_READDIRSIZE; nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_readahead = NFS_DEFRAHEAD; nmp->nm_deadthresh = NFS_DEFDEADTHRESH; error = set_statvfs_info(pth, UIO_SYSSPACE, hst, UIO_SYSSPACE, mp->mnt_op->vfs_name, mp, l); if (error) goto bad; nmp->nm_nam = nam; /* Set up the sockets and per-host congestion */ nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; nfs_decode_args(nmp, argp, l); mp->mnt_fs_bshift = ffs(MIN(nmp->nm_rsize, nmp->nm_wsize)) - 1; mp->mnt_dev_bshift = DEV_BSHIFT; /* * For Connection based sockets (TCP,...) defer the connect until * the first request, in case the server is not responding. */ if (nmp->nm_sotype == SOCK_DGRAM && (error = nfs_connect(nmp, (struct nfsreq *)0, l))) goto bad; /* * This is silly, but it has to be set so that vinifod() works. * We do not want to do an nfs_statvfs() here since we can get * stuck on a dead server and we are holding a lock on the mount * point. */ mp->mnt_stat.f_iosize = NFS_MAXDGRAMDATA; error = nfs_nget(mp, (nfsfh_t *)argp->fh, argp->fhsize, &np); if (error) goto bad; vp = NFSTOV(np); attrs = malloc(sizeof(struct vattr), M_TEMP, M_WAITOK); VOP_GETATTR(vp, attrs, l->l_cred); if ((nmp->nm_flag & NFSMNT_NFSV3) && (vp->v_type == VDIR)) { cr = kauth_cred_alloc(); kauth_cred_setuid(cr, attrs->va_uid); kauth_cred_seteuid(cr, attrs->va_uid); kauth_cred_setsvuid(cr, attrs->va_uid); kauth_cred_setgid(cr, attrs->va_gid); kauth_cred_setegid(cr, attrs->va_gid); kauth_cred_setsvgid(cr, attrs->va_gid); nfs_cookieheuristic(vp, &nmp->nm_iflag, l, cr); kauth_cred_free(cr); } free(attrs, M_TEMP); /* * A reference count is needed on the nfsnode representing the * remote root. If this object is not persistent, then backward * traversals of the mount point (i.e. "..") will not work if * the nfsnode gets flushed out of the cache. Ufs does not have * this problem, because one can identify root inodes by their * number == UFS_ROOTINO (2). So, just unlock, but no rele. 
*/ nmp->nm_vnode = vp; if (vp->v_type == VNON) vp->v_type = VDIR; vp->v_vflag |= VV_ROOT; VOP_UNLOCK(vp); *vpp = vp; snprintf(iosname, sizeof(iosname), "nfs%u", nfs_mount_count++); nmp->nm_stats = iostat_alloc(IOSTAT_NFS, nmp, iosname); return (0); bad: nfs_disconnect(nmp); rw_destroy(&nmp->nm_writeverflock); mutex_destroy(&nmp->nm_lock); cv_destroy(&nmp->nm_rcvcv); cv_destroy(&nmp->nm_sndcv); cv_destroy(&nmp->nm_aiocv); cv_destroy(&nmp->nm_disconcv); kmem_free(nmp, sizeof(*nmp)); m_freem(nam); return (error); } /* * unmount system call */ int nfs_unmount(struct mount *mp, int mntflags) { struct nfsmount *nmp = VFSTONFS(mp); struct vnode *vp; int error, flags = 0; if (mntflags & MNT_FORCE) { mutex_enter(&nmp->nm_lock); flags |= FORCECLOSE; nmp->nm_iflag |= NFSMNT_DISMNTFORCE; mutex_exit(&nmp->nm_lock); } /* * Goes something like this.. * - Check for activity on the root vnode (other than ourselves). * - Call vflush() to clear out vnodes for this file system, * except for the root vnode. * - Decrement reference on the vnode representing remote root. * - Close the socket * - Free up the data structures */ /* * We need to decrement the ref. count on the nfsnode representing * the remote root. See comment in mountnfs(). */ vp = nmp->nm_vnode; error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) goto err; if ((mntflags & MNT_FORCE) == 0 && vrefcnt(vp) > 1) { VOP_UNLOCK(vp); error = EBUSY; goto err; } error = vflush(mp, vp, flags); if (error) { VOP_UNLOCK(vp); goto err; } /* * We are now committed to the unmount; mark the mount structure * as doomed so that any sleepers kicked awake by nfs_disconnect * will go away cleanly. */ nmp->nm_iflag |= NFSMNT_DISMNT; /* * No new async I/O will be added, but await for pending * ones to drain. */ while (nfs_iodbusy(nmp)) kpause("nfsumnt", false, hz, NULL); /* * Clean up the stats... note that we carefully avoid decrementing * nfs_mount_count here for good reason - we may not be unmounting * the last thing mounted. */ iostat_free(nmp->nm_stats); /* * There is one reference count to get rid of here * (see comment in mountnfs()). */ VOP_UNLOCK(vp); vgone(vp); nfs_disconnect(nmp); m_freem(nmp->nm_nam); rw_destroy(&nmp->nm_writeverflock); mutex_destroy(&nmp->nm_lock); cv_destroy(&nmp->nm_rcvcv); cv_destroy(&nmp->nm_sndcv); cv_destroy(&nmp->nm_aiocv); cv_destroy(&nmp->nm_disconcv); kmem_free(nmp, sizeof(*nmp)); return (0); err: if (mntflags & MNT_FORCE) { mutex_enter(&nmp->nm_lock); nmp->nm_iflag &= ~NFSMNT_DISMNTFORCE; mutex_exit(&nmp->nm_lock); } return error; } /* * Return root of a filesystem */ int nfs_root(struct mount *mp, int lktype, struct vnode **vpp) { struct vnode *vp; struct nfsmount *nmp; int error; nmp = VFSTONFS(mp); vp = nmp->nm_vnode; vref(vp); error = vn_lock(vp, lktype | LK_RETRY); if (error != 0) { vrele(vp); return error; } *vpp = vp; return (0); } extern int syncprt; static bool nfs_sync_selector(void *cl, struct vnode *vp) { KASSERT(mutex_owned(vp->v_interlock)); return !LIST_EMPTY(&vp->v_dirtyblkhd) || (vp->v_iflag & VI_ONWORKLST) != 0; } /* * Flush out the buffer cache */ /* ARGSUSED */ int nfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) { struct vnode *vp; struct vnode_iterator *marker; int error, allerror = 0; /* * Force stale buffer cache information to be flushed. */ vfs_vnode_iterator_init(mp, &marker); while ((vp = vfs_vnode_iterator_next(marker, nfs_sync_selector, NULL))) { error = vn_lock(vp, LK_EXCLUSIVE); if (error) { vrele(vp); continue; } error = VOP_FSYNC(vp, cred, waitfor == MNT_WAIT ? 
FSYNC_WAIT : 0, 0, 0); if (error) allerror = error; vput(vp); } vfs_vnode_iterator_destroy(marker); return allerror; } /* * NFS flat namespace lookup. * Currently unsupported. */ /* ARGSUSED */ int nfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { return (EOPNOTSUPP); } /* * Do that sysctl thang... */ static int sysctl_vfs_nfs_iothreads(SYSCTLFN_ARGS) { struct sysctlnode node; int val; int error; val = nfs_niothreads; node = *rnode; node.sysctl_data = &val; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; return nfs_set_niothreads(val); } SYSCTL_SETUP(nfs_sysctl_init, "nfs sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "nfs", SYSCTL_DESCR("NFS vfs options"), NULL, 0, NULL, 0, CTL_VFS, 2, CTL_EOL); /* * XXX the "2" above could be dynamic, thereby eliminating one * more instance of the "number to vfs" mapping problem, but * "2" is the order as taken from sys/mount.h */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRUCT, "nfsstats", SYSCTL_DESCR("NFS operation statistics"), NULL, 0, &nfsstats, sizeof(nfsstats), CTL_VFS, 2, NFS_NFSSTATS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "iothreads", SYSCTL_DESCR("Number of NFS client processes desired"), sysctl_vfs_nfs_iothreads, 0, NULL, 0, CTL_VFS, 2, NFS_IOTHREADS, CTL_EOL); } /* ARGSUSED */ int nfs_fhtovp(struct mount *mp, struct fid *fid, int lktype, struct vnode **vpp) { size_t fidsize; size_t fhsize; struct nfsnode *np; int error; struct vattr va; fidsize = fid->fid_len; if (fidsize < sizeof(*fid)) { return EINVAL; } fhsize = fidsize - sizeof(*fid); if ((fhsize % NFSX_UNSIGNED) != 0) { return EINVAL; } if ((VFSTONFS(mp)->nm_flag & NFSMNT_NFSV3) != 0) { if (fhsize > NFSX_V3FHMAX || fhsize == 0) { return EINVAL; } } else { if (fhsize != NFSX_V2FH) { return EINVAL; } } /* XXX lktype ignored */ error = nfs_nget(mp, (void *)fid->fid_data, fhsize, &np); if (error) { return error; } *vpp = NFSTOV(np); error = VOP_GETATTR(*vpp, &va, kauth_cred_get()); if (error != 0) { vput(*vpp); *vpp = NULLVP; } return error; } /* ARGSUSED */ int nfs_vptofh(struct vnode *vp, struct fid *buf, size_t *bufsize) { struct nfsnode *np; struct fid *fid; size_t fidsize; int error = 0; np = VTONFS(vp); fidsize = sizeof(*fid) + np->n_fhsize; if (*bufsize < fidsize) { error = E2BIG; } *bufsize = fidsize; if (error == 0) { struct fid fid_store; fid = &fid_store; memset(fid, 0, sizeof(*fid)); fid->fid_len = fidsize; memcpy(buf, fid, sizeof(*fid)); memcpy(buf->fid_data, np->n_fhp, np->n_fhsize); } return error; } /* * Vfs start routine, a no-op. */ /* ARGSUSED */ int nfs_start(struct mount *mp, int flags) { return (0); } /* * Called once at VFS init to initialize client-specific data structures. */ void nfs_vfs_init(void) { unsigned scale; /* Initialize NFS server / client shared data. */ nfs_init(); nfs_node_init(); /* Initialize the kqueue structures */ nfs_kqinit(); /* Initialize the iod structures */ nfs_iodinit(); scale = PAGE_SHIFT - 4; nfs_commitsize = uimin(uvmexp.npages, INT_MAX >> scale) << scale; } void nfs_vfs_done(void) { nfs_node_done(); nfs_kqfini(); nfs_iodfini(); nfs_fini(); }
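/*
 * Editor's illustration -- not part of the NetBSD sources.  nfs_fsinfo()
 * and nfs_decode_args() above normalize read/write transfer sizes to
 * multiples of the fundamental block size with the usual power-of-two
 * mask idiom: server-preferred sizes are rounded up, user-supplied sizes
 * are rounded down but never below one block.  This standalone sketch
 * shows the two roundings in isolation; FABLKSIZE here merely stands in
 * for NFS_FABLKSIZE and the helper names are invented.
 */
#include <stdio.h>

#define FABLKSIZE 512		/* placeholder for NFS_FABLKSIZE */

static unsigned int
round_up(unsigned int pref)
{
	/* (pref + FABLKSIZE - 1) & ~(FABLKSIZE - 1): next multiple up. */
	return (pref + FABLKSIZE - 1) & ~(FABLKSIZE - 1);
}

static unsigned int
round_down(unsigned int size)
{
	/* Round down to a multiple, but keep at least one block. */
	unsigned int r = size & ~(FABLKSIZE - 1);
	return r != 0 ? r : FABLKSIZE;
}

int
main(void)
{
	printf("%u %u\n", round_up(1000), round_down(1000));	/* 1024 512 */
	printf("%u %u\n", round_up(100), round_down(100));	/* 512 512 */
	return 0;
}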
/* $NetBSD: sys_sched.c,v 1.50 2023/04/09 09:18:09 riastradh Exp $ */ /* * Copyright (c) 2008, 2011 Mindaugas Rasiukevicius <rmind at NetBSD org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * System calls relating to the scheduler. * * Lock order: * * cpu_lock -> * proc_lock -> * proc_t::p_lock -> * lwp_t::lwp_lock * * TODO: * - Handle pthread_setschedprio() as defined by POSIX; */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.50 2023/04/09 09:18:09 riastradh Exp $"); #include <sys/param.h> #include <sys/cpu.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/lwp.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/pset.h> #include <sys/sched.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/types.h> #include <sys/unistd.h> static struct sysctllog *sched_sysctl_log; static kauth_listener_t sched_listener; /* * Convert user priority or the in-kernel priority or convert the current * priority to the appropriate range according to the policy change. */ static pri_t convert_pri(lwp_t *l, int policy, pri_t pri) { /* Convert user priority to the in-kernel */ if (pri != PRI_NONE) { /* Only for real-time threads */ KASSERT(pri >= SCHED_PRI_MIN); KASSERT(pri <= SCHED_PRI_MAX); KASSERT(policy != SCHED_OTHER); return PRI_USER_RT + pri; } /* Neither policy, nor priority change */ if (l->l_class == policy) return l->l_priority; /* Time-sharing -> real-time */ if (l->l_class == SCHED_OTHER) { KASSERT(policy == SCHED_FIFO || policy == SCHED_RR); return PRI_USER_RT; } /* Real-time -> time-sharing */ if (policy == SCHED_OTHER) { KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR); /* * this is a bit arbitrary because the priority is dynamic * for SCHED_OTHER threads and will likely be changed by * the scheduler soon anyway. 
*/ return l->l_priority - PRI_USER_RT; } /* Real-time -> real-time */ return l->l_priority; } int do_sched_setparam(pid_t pid, lwpid_t lid, int policy, const struct sched_param *params) { struct proc *p; struct lwp *t; pri_t pri; u_int lcnt; int error; error = 0; pri = params->sched_priority; /* If no parameters specified, just return (this should not happen) */ if (pri == PRI_NONE && policy == SCHED_NONE) return 0; /* Validate scheduling class */ if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR)) return EINVAL; /* Validate priority */ if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX)) return EINVAL; if (pid != 0) { /* Find the process */ mutex_enter(&proc_lock); p = proc_find(pid); if (p == NULL) { mutex_exit(&proc_lock); return ESRCH; } mutex_enter(p->p_lock); mutex_exit(&proc_lock); /* Disallow modification of system processes */ if ((p->p_flag & PK_SYSTEM) != 0) { mutex_exit(p->p_lock); return EPERM; } } else { /* Use the calling process */ p = curlwp->l_proc; mutex_enter(p->p_lock); } /* Find the LWP(s) */ lcnt = 0; LIST_FOREACH(t, &p->p_lwps, l_sibling) { pri_t kpri; int lpolicy; if (lid && lid != t->l_lid) continue; lcnt++; lwp_lock(t); lpolicy = (policy == SCHED_NONE) ? t->l_class : policy; /* Disallow setting of priority for SCHED_OTHER threads */ if (lpolicy == SCHED_OTHER && pri != PRI_NONE) { lwp_unlock(t); error = EINVAL; break; } /* Convert priority, if needed */ kpri = convert_pri(t, lpolicy, pri); /* Check the permission */ error = kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy), KAUTH_ARG(kpri)); if (error) { lwp_unlock(t); break; } /* Set the scheduling class, change the priority */ t->l_class = lpolicy; lwp_changepri(t, kpri); lwp_unlock(t); } mutex_exit(p->p_lock); return (lcnt == 0) ? ESRCH : error; } /* * Set scheduling parameters. */ int sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap, register_t *retval) { /* { syscallarg(pid_t) pid; syscallarg(lwpid_t) lid; syscallarg(int) policy; syscallarg(const struct sched_param *) params; } */ struct sched_param params; int error; /* Get the parameters from the user-space */ error = copyin(SCARG(uap, params), &params, sizeof(params)); if (error) goto out; error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid), SCARG(uap, policy), &params); out: return error; } /* * do_sched_getparam: * * if lid=0, returns the parameter of the first LWP in the process. */ int do_sched_getparam(pid_t pid, lwpid_t lid, int *policy, struct sched_param *params) { struct sched_param lparams; struct lwp *t; int error, lpolicy; if (pid < 0 || lid < 0) return EINVAL; t = lwp_find2(pid, lid); /* acquire p_lock */ if (t == NULL) return ESRCH; /* Check the permission */ error = kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL); if (error != 0) { mutex_exit(t->l_proc->p_lock); return error; } lwp_lock(t); lparams.sched_priority = t->l_priority; lpolicy = t->l_class; lwp_unlock(t); mutex_exit(t->l_proc->p_lock); /* * convert to the user-visible priority value. * it's an inversion of convert_pri(). * * the SCHED_OTHER case is a bit arbitrary given that * - we don't allow setting the priority. * - the priority is dynamic. 
*/ switch (lpolicy) { case SCHED_OTHER: lparams.sched_priority -= PRI_USER; break; case SCHED_RR: case SCHED_FIFO: lparams.sched_priority -= PRI_USER_RT; break; } if (policy != NULL) *policy = lpolicy; if (params != NULL) *params = lparams; return error; } /* * Get scheduling parameters. */ int sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap, register_t *retval) { /* { syscallarg(pid_t) pid; syscallarg(lwpid_t) lid; syscallarg(int *) policy; syscallarg(struct sched_param *) params; } */ struct sched_param params; int error, policy; error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy, &params); if (error) goto out; error = copyout(&params, SCARG(uap, params), sizeof(params)); if (error == 0 && SCARG(uap, policy) != NULL) error = copyout(&policy, SCARG(uap, policy), sizeof(int)); out: return error; } /* * Allocate the CPU set, and get it from userspace. */ static int genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size) { kcpuset_t *kset; int error; kcpuset_create(&kset, true); error = kcpuset_copyin(sset, kset, size); if (error) { kcpuset_unuse(kset, NULL); } else { *dset = kset; } return error; } /* * Set affinity. */ int sys__sched_setaffinity(struct lwp *l, const struct sys__sched_setaffinity_args *uap, register_t *retval) { /* { syscallarg(pid_t) pid; syscallarg(lwpid_t) lid; syscallarg(size_t) size; syscallarg(const cpuset_t *) cpuset; } */ kcpuset_t *kcset, *kcpulst = NULL; struct cpu_info *ici, *ci; struct proc *p; struct lwp *t; CPU_INFO_ITERATOR cii; bool alloff; lwpid_t lid; u_int lcnt; int error; error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size)); if (error) return error; /* * Traverse _each_ CPU to: * - Check that CPUs in the mask have no assigned processor set. * - Check that at least one CPU from the mask is online. * - Find the first target CPU to migrate. * * To avoid the race with CPU online/offline calls and processor sets, * cpu_lock will be locked for the entire operation. */ ci = NULL; alloff = false; mutex_enter(&cpu_lock); for (CPU_INFO_FOREACH(cii, ici)) { struct schedstate_percpu *ispc; if (!kcpuset_isset(kcset, cpu_index(ici))) { continue; } ispc = &ici->ci_schedstate; /* Check that CPU is not in the processor-set */ if (ispc->spc_psid != PS_NONE) { error = EPERM; goto out; } /* Skip offline CPUs */ if (ispc->spc_flags & SPCF_OFFLINE) { alloff = true; continue; } /* Target CPU to migrate */ if (ci == NULL) { ci = ici; } } if (ci == NULL) { if (alloff) { /* All CPUs in the set are offline */ error = EPERM; goto out; } /* Empty set */ kcpuset_unuse(kcset, &kcpulst); kcset = NULL; } if (SCARG(uap, pid) != 0) { /* Find the process */ mutex_enter(&proc_lock); p = proc_find(SCARG(uap, pid)); if (p == NULL) { mutex_exit(&proc_lock); error = ESRCH; goto out; } mutex_enter(p->p_lock); mutex_exit(&proc_lock); /* Disallow modification of system processes. */ if ((p->p_flag & PK_SYSTEM) != 0) { mutex_exit(p->p_lock); error = EPERM; goto out; } } else { /* Use the calling process */ p = l->l_proc; mutex_enter(p->p_lock); } /* * Check the permission. */ error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL); if (error != 0) { mutex_exit(p->p_lock); goto out; } /* Iterate through LWP(s). */ lcnt = 0; lid = SCARG(uap, lid); LIST_FOREACH(t, &p->p_lwps, l_sibling) { if (lid && lid != t->l_lid) { continue; } lwp_lock(t); /* No affinity for zombie LWPs. */ if (t->l_stat == LSZOMB) { lwp_unlock(t); continue; } /* First, release existing affinity, if any. 
*/ if (t->l_affinity) { kcpuset_unuse(t->l_affinity, &kcpulst); } if (kcset) { /* * Hold a reference on affinity mask, assign mask to * LWP and migrate it to another CPU (unlocks LWP). */ kcpuset_use(kcset); t->l_affinity = kcset; lwp_migrate(t, ci); } else { /* Old affinity mask is released, just clear. */ t->l_affinity = NULL; lwp_unlock(t); } lcnt++; } mutex_exit(p->p_lock); if (lcnt == 0) { error = ESRCH; } out: mutex_exit(&cpu_lock); /* * Drop the initial reference (LWPs, if any, have the ownership now), * and destroy whatever is in the G/C list, if filled. */ if (kcset) { kcpuset_unuse(kcset, &kcpulst); } if (kcpulst) { kcpuset_destroy(kcpulst); } return error; } /* * Get affinity. */ int sys__sched_getaffinity(struct lwp *l, const struct sys__sched_getaffinity_args *uap, register_t *retval) { /* { syscallarg(pid_t) pid; syscallarg(lwpid_t) lid; syscallarg(size_t) size; syscallarg(cpuset_t *) cpuset; } */ struct lwp *t; kcpuset_t *kcset; int error; if (SCARG(uap, pid) < 0 || SCARG(uap, lid) < 0) return EINVAL; error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size)); if (error) return error; /* Locks the LWP */ t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid)); if (t == NULL) { error = ESRCH; goto out; } /* Check the permission */ if (kauth_authorize_process(l->l_cred, KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) { mutex_exit(t->l_proc->p_lock); error = EPERM; goto out; } lwp_lock(t); if (t->l_affinity) { kcpuset_copy(kcset, t->l_affinity); } else { kcpuset_zero(kcset); } lwp_unlock(t); mutex_exit(t->l_proc->p_lock); error = kcpuset_copyout(kcset, SCARG(uap, cpuset), SCARG(uap, size)); out: kcpuset_unuse(kcset, NULL); return error; } /* * Priority protection for PTHREAD_PRIO_PROTECT. This is a weak * analogue of priority inheritance: temp raise the priority * of the caller when accessing a protected resource. */ int sys__sched_protect(struct lwp *l, const struct sys__sched_protect_args *uap, register_t *retval) { /* { syscallarg(int) priority; syscallarg(int *) opriority; } */ int error; pri_t pri; KASSERT(l->l_inheritedprio == -1); KASSERT(l->l_auxprio == -1 || l->l_auxprio == l->l_protectprio); pri = SCARG(uap, priority); error = 0; lwp_lock(l); if (pri == -1) { /* back out priority changes */ switch(l->l_protectdepth) { case 0: error = EINVAL; break; case 1: l->l_protectdepth = 0; l->l_protectprio = -1; l->l_auxprio = -1; break; default: l->l_protectdepth--; break; } } else if (pri < 0) { /* Just retrieve the current value, for debugging */ if (l->l_protectprio == -1) error = ENOENT; else *retval = l->l_protectprio - PRI_USER_RT; } else if (__predict_false(pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX || l->l_priority > pri + PRI_USER_RT)) { /* must fail if existing priority is higher */ error = EPERM; } else { /* play along but make no changes if not a realtime LWP. */ l->l_protectdepth++; pri += PRI_USER_RT; if (__predict_true(l->l_class != SCHED_OTHER && pri > l->l_protectprio)) { l->l_protectprio = pri; l->l_auxprio = pri; } } lwp_unlock(l); return error; } /* * Yield. */ int sys_sched_yield(struct lwp *l, const void *v, register_t *retval) { yield(); return 0; } /* * Sysctl nodes and initialization. 
*/ static void sysctl_sched_setup(struct sysctllog **clog) { const struct sysctlnode *node = NULL; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "posix_sched", SYSCTL_DESCR("Version of IEEE Std 1003.1 and its " "Process Scheduling option to which the " "system attempts to conform"), NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "sched", SYSCTL_DESCR("Scheduler options"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); if (node == NULL) return; sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE, CTLTYPE_INT, "pri_min", SYSCTL_DESCR("Minimal POSIX real-time priority"), NULL, SCHED_PRI_MIN, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE, CTLTYPE_INT, "pri_max", SYSCTL_DESCR("Maximal POSIX real-time priority"), NULL, SCHED_PRI_MAX, NULL, 0, CTL_CREATE, CTL_EOL); } static int sched_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { struct proc *p; int result; result = KAUTH_RESULT_DEFER; p = arg0; switch (action) { case KAUTH_PROCESS_SCHEDULER_GETPARAM: if (kauth_cred_uidmatch(cred, p->p_cred)) result = KAUTH_RESULT_ALLOW; break; case KAUTH_PROCESS_SCHEDULER_SETPARAM: if (kauth_cred_uidmatch(cred, p->p_cred)) { struct lwp *l; int policy; pri_t priority; l = arg1; policy = (int)(unsigned long)arg2; priority = (pri_t)(unsigned long)arg3; if ((policy == l->l_class || (policy != SCHED_FIFO && policy != SCHED_RR)) && priority <= l->l_priority) result = KAUTH_RESULT_ALLOW; } break; case KAUTH_PROCESS_SCHEDULER_GETAFFINITY: result = KAUTH_RESULT_ALLOW; break; case KAUTH_PROCESS_SCHEDULER_SETAFFINITY: /* Privileged; we let the secmodel handle this. */ break; default: break; } return result; } void sched_init(void) { sysctl_sched_setup(&sched_sysctl_log); sched_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS, sched_listener_cb, NULL); }
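/*
 * Editor's illustration -- not part of the NetBSD sources.  The syscalls
 * above implement the kernel side of POSIX scheduling; userland normally
 * reaches them through the standard interfaces rather than calling
 * sys__sched_setparam() directly.  A minimal sketch follows (link with
 * -lpthread; entering a real-time class ordinarily requires privilege).
 */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct sched_param sp;
	int minpri, maxpri, error;

	/*
	 * Valid real-time priority range, cf. the kern.sched.pri_min and
	 * kern.sched.pri_max sysctl nodes created above.
	 */
	minpri = sched_get_priority_min(SCHED_RR);
	maxpri = sched_get_priority_max(SCHED_RR);
	printf("SCHED_RR priorities: %d .. %d\n", minpri, maxpri);

	/* Request round-robin scheduling at the lowest real-time priority. */
	memset(&sp, 0, sizeof(sp));
	sp.sched_priority = minpri;
	error = pthread_setschedparam(pthread_self(), SCHED_RR, &sp);
	if (error != 0)
		fprintf(stderr, "pthread_setschedparam: %s\n", strerror(error));
	return 0;
}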
/* $NetBSD: kern_mutex.c,v 1.112 2023/10/15 10:28:23 riastradh Exp $ */ /*- * Copyright (c) 2002, 2006, 2007, 2008, 2019, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe and Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Kernel mutex implementation, modeled after those found in Solaris, * a description of which can be found in: * * Solaris Internals: Core Kernel Architecture, Jim Mauro and * Richard McDougall. */ #define __MUTEX_PRIVATE #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_mutex.c,v 1.112 2023/10/15 10:28:23 riastradh Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/intr.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/lockdebug.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/pserialize.h> #include <sys/sched.h> #include <sys/sleepq.h> #include <sys/syncobj.h> #include <sys/systm.h> #include <sys/types.h> #include <dev/lockstat.h> #include <machine/lock.h> /* * When not running a debug kernel, spin mutexes are not much * more than an splraiseipl() and splx() pair. */ #if defined(DIAGNOSTIC) || defined(MULTIPROCESSOR) || defined(LOCKDEBUG) #define FULL #endif /* * Debugging support. 
*/ #define MUTEX_WANTLOCK(mtx) \ LOCKDEBUG_WANTLOCK(MUTEX_DEBUG_P(mtx), (mtx), \ (uintptr_t)__builtin_return_address(0), 0) #define MUTEX_TESTLOCK(mtx) \ LOCKDEBUG_WANTLOCK(MUTEX_DEBUG_P(mtx), (mtx), \ (uintptr_t)__builtin_return_address(0), -1) #define MUTEX_LOCKED(mtx) \ LOCKDEBUG_LOCKED(MUTEX_DEBUG_P(mtx), (mtx), NULL, \ (uintptr_t)__builtin_return_address(0), 0) #define MUTEX_UNLOCKED(mtx) \ LOCKDEBUG_UNLOCKED(MUTEX_DEBUG_P(mtx), (mtx), \ (uintptr_t)__builtin_return_address(0), 0) #define MUTEX_ABORT(mtx, msg) \ mutex_abort(__func__, __LINE__, mtx, msg) #if defined(LOCKDEBUG) #define MUTEX_DASSERT(mtx, cond) \ do { \ if (__predict_false(!(cond))) \ MUTEX_ABORT(mtx, "assertion failed: " #cond); \ } while (/* CONSTCOND */ 0) #else /* LOCKDEBUG */ #define MUTEX_DASSERT(mtx, cond) /* nothing */ #endif /* LOCKDEBUG */ #if defined(DIAGNOSTIC) #define MUTEX_ASSERT(mtx, cond) \ do { \ if (__predict_false(!(cond))) \ MUTEX_ABORT(mtx, "assertion failed: " #cond); \ } while (/* CONSTCOND */ 0) #else /* DIAGNOSTIC */ #define MUTEX_ASSERT(mtx, cond) /* nothing */ #endif /* DIAGNOSTIC */ /* * Some architectures can't use __cpu_simple_lock as is so allow a way * for them to use an alternate definition. */ #ifndef MUTEX_SPINBIT_LOCK_INIT #define MUTEX_SPINBIT_LOCK_INIT(mtx) __cpu_simple_lock_init(&(mtx)->mtx_lock) #endif #ifndef MUTEX_SPINBIT_LOCKED_P #define MUTEX_SPINBIT_LOCKED_P(mtx) __SIMPLELOCK_LOCKED_P(&(mtx)->mtx_lock) #endif #ifndef MUTEX_SPINBIT_LOCK_TRY #define MUTEX_SPINBIT_LOCK_TRY(mtx) __cpu_simple_lock_try(&(mtx)->mtx_lock) #endif #ifndef MUTEX_SPINBIT_LOCK_UNLOCK #define MUTEX_SPINBIT_LOCK_UNLOCK(mtx) __cpu_simple_unlock(&(mtx)->mtx_lock) #endif #ifndef MUTEX_INITIALIZE_SPIN_IPL #define MUTEX_INITIALIZE_SPIN_IPL(mtx, ipl) \ ((mtx)->mtx_ipl = makeiplcookie((ipl))) #endif /* * Spin mutex SPL save / restore. */ #define MUTEX_SPIN_SPLRAISE(mtx) \ do { \ const int s = splraiseipl(MUTEX_SPIN_IPL(mtx)); \ struct cpu_info * const x__ci = curcpu(); \ const int x__cnt = x__ci->ci_mtx_count--; \ __insn_barrier(); \ if (x__cnt == 0) \ x__ci->ci_mtx_oldspl = s; \ } while (/* CONSTCOND */ 0) #define MUTEX_SPIN_SPLRESTORE(mtx) \ do { \ struct cpu_info * const x__ci = curcpu(); \ const int s = x__ci->ci_mtx_oldspl; \ __insn_barrier(); \ if (++(x__ci->ci_mtx_count) == 0) \ splx(s); \ } while (/* CONSTCOND */ 0) /* * Memory barriers. */ #ifdef __HAVE_ATOMIC_AS_MEMBAR #define MUTEX_MEMBAR_ENTER() #else #define MUTEX_MEMBAR_ENTER() membar_enter() #endif /* * For architectures that provide 'simple' mutexes: they provide a * CAS function that is either MP-safe, or does not need to be MP * safe. Adaptive mutexes on these architectures do not require an * additional interlock. 
*/ #ifdef __HAVE_SIMPLE_MUTEXES #define MUTEX_OWNER(owner) \ (owner & MUTEX_THREAD) #define MUTEX_HAS_WAITERS(mtx) \ (((int)(mtx)->mtx_owner & MUTEX_BIT_WAITERS) != 0) #define MUTEX_INITIALIZE_ADAPTIVE(mtx, dodebug) \ do { \ if (!dodebug) \ (mtx)->mtx_owner |= MUTEX_BIT_NODEBUG; \ } while (/* CONSTCOND */ 0) #define MUTEX_INITIALIZE_SPIN(mtx, dodebug, ipl) \ do { \ (mtx)->mtx_owner = MUTEX_BIT_SPIN; \ if (!dodebug) \ (mtx)->mtx_owner |= MUTEX_BIT_NODEBUG; \ MUTEX_INITIALIZE_SPIN_IPL((mtx), (ipl)); \ MUTEX_SPINBIT_LOCK_INIT((mtx)); \ } while (/* CONSTCOND */ 0) #define MUTEX_DESTROY(mtx) \ do { \ (mtx)->mtx_owner = MUTEX_THREAD; \ } while (/* CONSTCOND */ 0) #define MUTEX_SPIN_P(owner) \ (((owner) & MUTEX_BIT_SPIN) != 0) #define MUTEX_ADAPTIVE_P(owner) \ (((owner) & MUTEX_BIT_SPIN) == 0) #ifndef MUTEX_CAS #define MUTEX_CAS(p, o, n) \ (atomic_cas_ulong((volatile unsigned long *)(p), (o), (n)) == (o)) #endif /* MUTEX_CAS */ #define MUTEX_DEBUG_P(mtx) (((mtx)->mtx_owner & MUTEX_BIT_NODEBUG) == 0) #if defined(LOCKDEBUG) #define MUTEX_OWNED(owner) (((owner) & ~MUTEX_BIT_NODEBUG) != 0) #define MUTEX_INHERITDEBUG(n, o) (n) |= (o) & MUTEX_BIT_NODEBUG #else /* defined(LOCKDEBUG) */ #define MUTEX_OWNED(owner) ((owner) != 0) #define MUTEX_INHERITDEBUG(n, o) /* nothing */ #endif /* defined(LOCKDEBUG) */ static inline int MUTEX_ACQUIRE(kmutex_t *mtx, uintptr_t curthread) { int rv; uintptr_t oldown = 0; uintptr_t newown = curthread; MUTEX_INHERITDEBUG(oldown, mtx->mtx_owner); MUTEX_INHERITDEBUG(newown, oldown); rv = MUTEX_CAS(&mtx->mtx_owner, oldown, newown); membar_acquire(); return rv; } static inline int MUTEX_SET_WAITERS(kmutex_t *mtx, uintptr_t owner) { int rv; rv = MUTEX_CAS(&mtx->mtx_owner, owner, owner | MUTEX_BIT_WAITERS); MUTEX_MEMBAR_ENTER(); return rv; } static inline void MUTEX_RELEASE(kmutex_t *mtx) { uintptr_t newown; newown = 0; MUTEX_INHERITDEBUG(newown, mtx->mtx_owner); atomic_store_release(&mtx->mtx_owner, newown); } #endif /* __HAVE_SIMPLE_MUTEXES */ /* * Patch in stubs via strong alias where they are not available. */ #if defined(LOCKDEBUG) #undef __HAVE_MUTEX_STUBS #undef __HAVE_SPIN_MUTEX_STUBS #endif #ifndef __HAVE_MUTEX_STUBS __strong_alias(mutex_enter,mutex_vector_enter); __strong_alias(mutex_exit,mutex_vector_exit); #endif #ifndef __HAVE_SPIN_MUTEX_STUBS __strong_alias(mutex_spin_enter,mutex_vector_enter); __strong_alias(mutex_spin_exit,mutex_vector_exit); #endif static void mutex_abort(const char *, size_t, volatile const kmutex_t *, const char *); static void mutex_dump(const volatile void *, lockop_printer_t); static lwp_t *mutex_owner(wchan_t); lockops_t mutex_spin_lockops = { .lo_name = "Mutex", .lo_type = LOCKOPS_SPIN, .lo_dump = mutex_dump, }; lockops_t mutex_adaptive_lockops = { .lo_name = "Mutex", .lo_type = LOCKOPS_SLEEP, .lo_dump = mutex_dump, }; syncobj_t mutex_syncobj = { .sobj_name = "mutex", .sobj_flag = SOBJ_SLEEPQ_SORTED, .sobj_boostpri = PRI_KERNEL, .sobj_unsleep = turnstile_unsleep, .sobj_changepri = turnstile_changepri, .sobj_lendpri = sleepq_lendpri, .sobj_owner = mutex_owner, }; /* * mutex_dump: * * Dump the contents of a mutex structure. */ static void mutex_dump(const volatile void *cookie, lockop_printer_t pr) { const volatile kmutex_t *mtx = cookie; uintptr_t owner = mtx->mtx_owner; pr("owner field : %#018lx wait/spin: %16d/%d\n", (long)MUTEX_OWNER(owner), MUTEX_HAS_WAITERS(mtx), MUTEX_SPIN_P(owner)); } /* * mutex_abort: * * Dump information about an error and panic the system. 
This * generates a lot of machine code in the DIAGNOSTIC case, so * we ask the compiler to not inline it. */ static void __noinline mutex_abort(const char *func, size_t line, volatile const kmutex_t *mtx, const char *msg) { LOCKDEBUG_ABORT(func, line, mtx, (MUTEX_SPIN_P(mtx->mtx_owner) ? &mutex_spin_lockops : &mutex_adaptive_lockops), msg); } /* * mutex_init: * * Initialize a mutex for use. Note that adaptive mutexes are in * essence spin mutexes that can sleep to avoid deadlock and wasting * CPU time. We can't easily provide a type of mutex that always * sleeps - see comments in mutex_vector_enter() about releasing * mutexes unlocked. */ void _mutex_init(kmutex_t *mtx, kmutex_type_t type, int ipl, uintptr_t return_address) { lockops_t *lockops __unused; bool dodebug; memset(mtx, 0, sizeof(*mtx)); if (ipl == IPL_NONE || ipl == IPL_SOFTCLOCK || ipl == IPL_SOFTBIO || ipl == IPL_SOFTNET || ipl == IPL_SOFTSERIAL) { lockops = (type == MUTEX_NODEBUG ? NULL : &mutex_adaptive_lockops); dodebug = LOCKDEBUG_ALLOC(mtx, lockops, return_address); MUTEX_INITIALIZE_ADAPTIVE(mtx, dodebug); } else { lockops = (type == MUTEX_NODEBUG ? NULL : &mutex_spin_lockops); dodebug = LOCKDEBUG_ALLOC(mtx, lockops, return_address); MUTEX_INITIALIZE_SPIN(mtx, dodebug, ipl); } } void mutex_init(kmutex_t *mtx, kmutex_type_t type, int ipl) { _mutex_init(mtx, type, ipl, (uintptr_t)__builtin_return_address(0)); } /* * mutex_destroy: * * Tear down a mutex. */ void mutex_destroy(kmutex_t *mtx) { uintptr_t owner = mtx->mtx_owner; if (MUTEX_ADAPTIVE_P(owner)) { MUTEX_ASSERT(mtx, !MUTEX_OWNED(owner)); MUTEX_ASSERT(mtx, !MUTEX_HAS_WAITERS(mtx)); } else { MUTEX_ASSERT(mtx, !MUTEX_SPINBIT_LOCKED_P(mtx)); } LOCKDEBUG_FREE(MUTEX_DEBUG_P(mtx), mtx); MUTEX_DESTROY(mtx); } #ifdef MULTIPROCESSOR /* * mutex_oncpu: * * Return true if an adaptive mutex owner is running on a CPU in the * system. If the target is waiting on the kernel big lock, then we * must release it. This is necessary to avoid deadlock. */ static bool mutex_oncpu(uintptr_t owner) { struct cpu_info *ci; lwp_t *l; KASSERT(kpreempt_disabled()); if (!MUTEX_OWNED(owner)) { return false; } /* * See lwp_dtor() why dereference of the LWP pointer is safe. * We must have kernel preemption disabled for that. */ l = (lwp_t *)MUTEX_OWNER(owner); ci = l->l_cpu; if (ci && ci->ci_curlwp == l) { /* Target is running; do we need to block? */ return (atomic_load_relaxed(&ci->ci_biglock_wanted) != l); } /* Not running. It may be safe to block now. */ return false; } #endif /* MULTIPROCESSOR */ /* * mutex_vector_enter: * * Support routine for mutex_enter() that must handle all cases. In * the LOCKDEBUG case, mutex_enter() is always aliased here, even if * fast-path stubs are available. If a mutex_spin_enter() stub is * not available, then it is also aliased directly here. */ void mutex_vector_enter(kmutex_t *mtx) { uintptr_t owner, curthread; turnstile_t *ts; #ifdef MULTIPROCESSOR u_int count; #endif LOCKSTAT_COUNTER(spincnt); LOCKSTAT_COUNTER(slpcnt); LOCKSTAT_TIMER(spintime); LOCKSTAT_TIMER(slptime); LOCKSTAT_FLAG(lsflag); /* * Handle spin mutexes. 
*/ KPREEMPT_DISABLE(curlwp); owner = mtx->mtx_owner; if (MUTEX_SPIN_P(owner)) { #if defined(LOCKDEBUG) && defined(MULTIPROCESSOR) u_int spins = 0; #endif KPREEMPT_ENABLE(curlwp); MUTEX_SPIN_SPLRAISE(mtx); MUTEX_WANTLOCK(mtx); #ifdef FULL if (MUTEX_SPINBIT_LOCK_TRY(mtx)) { MUTEX_LOCKED(mtx); return; } #if !defined(MULTIPROCESSOR) MUTEX_ABORT(mtx, "locking against myself"); #else /* !MULTIPROCESSOR */ LOCKSTAT_ENTER(lsflag); LOCKSTAT_START_TIMER(lsflag, spintime); count = SPINLOCK_BACKOFF_MIN; /* * Spin testing the lock word and do exponential backoff * to reduce cache line ping-ponging between CPUs. */ do { while (MUTEX_SPINBIT_LOCKED_P(mtx)) { SPINLOCK_SPIN_HOOK; SPINLOCK_BACKOFF(count); #ifdef LOCKDEBUG if (SPINLOCK_SPINOUT(spins)) MUTEX_ABORT(mtx, "spinout"); #endif /* LOCKDEBUG */ } } while (!MUTEX_SPINBIT_LOCK_TRY(mtx)); if (count != SPINLOCK_BACKOFF_MIN) { LOCKSTAT_STOP_TIMER(lsflag, spintime); LOCKSTAT_EVENT(lsflag, mtx, LB_SPIN_MUTEX | LB_SPIN, 1, spintime); } LOCKSTAT_EXIT(lsflag); #endif /* !MULTIPROCESSOR */ #endif /* FULL */ MUTEX_LOCKED(mtx); return; } curthread = (uintptr_t)curlwp; MUTEX_DASSERT(mtx, MUTEX_ADAPTIVE_P(owner)); MUTEX_ASSERT(mtx, curthread != 0); MUTEX_ASSERT(mtx, !cpu_intr_p()); MUTEX_WANTLOCK(mtx); if (__predict_true(panicstr == NULL)) { KDASSERT(pserialize_not_in_read_section()); LOCKDEBUG_BARRIER(&kernel_lock, 1); } LOCKSTAT_ENTER(lsflag); /* * Adaptive mutex; spin trying to acquire the mutex. If we * determine that the owner is not running on a processor, * then we stop spinning, and sleep instead. */ for (;;) { if (!MUTEX_OWNED(owner)) { /* * Mutex owner clear could mean two things: * * * The mutex has been released. * * The owner field hasn't been set yet. * * Try to acquire it again. If that fails, * we'll just loop again. */ if (MUTEX_ACQUIRE(mtx, curthread)) break; owner = mtx->mtx_owner; continue; } if (__predict_false(MUTEX_OWNER(owner) == curthread)) { MUTEX_ABORT(mtx, "locking against myself"); } #ifdef MULTIPROCESSOR /* * Check to see if the owner is running on a processor. * If so, then we should just spin, as the owner will * likely release the lock very soon. */ if (mutex_oncpu(owner)) { LOCKSTAT_START_TIMER(lsflag, spintime); count = SPINLOCK_BACKOFF_MIN; do { KPREEMPT_ENABLE(curlwp); SPINLOCK_BACKOFF(count); KPREEMPT_DISABLE(curlwp); owner = mtx->mtx_owner; } while (mutex_oncpu(owner)); LOCKSTAT_STOP_TIMER(lsflag, spintime); LOCKSTAT_COUNT(spincnt, 1); if (!MUTEX_OWNED(owner)) continue; } #endif ts = turnstile_lookup(mtx); /* * Once we have the turnstile chain interlock, mark the * mutex as having waiters. If that fails, spin again: * chances are that the mutex has been released. */ if (!MUTEX_SET_WAITERS(mtx, owner)) { turnstile_exit(mtx); owner = mtx->mtx_owner; continue; } #ifdef MULTIPROCESSOR /* * mutex_exit() is permitted to release the mutex without * any interlocking instructions, and the following can * occur as a result: * * CPU 1: MUTEX_SET_WAITERS() CPU2: mutex_exit() * ---------------------------- ---------------------------- * .. load mtx->mtx_owner * .. see has-waiters bit clear * set has-waiters bit .. * .. store mtx->mtx_owner := 0 * return success * * There is another race that can occur: a third CPU could * acquire the mutex as soon as it is released. Since * adaptive mutexes are primarily spin mutexes, this is not * something that we need to worry about too much. What we * do need to ensure is that the waiters bit gets set. 
* * To allow the unlocked release, we need to make some * assumptions here: * * o Release is the only non-atomic/unlocked operation * that can be performed on the mutex. (It must still * be atomic on the local CPU, e.g. in case interrupted * or preempted). * * o At any given time on each mutex, MUTEX_SET_WAITERS() * can only ever be in progress on one CPU in the * system - guaranteed by the turnstile chain lock. * * o No other operations other than MUTEX_SET_WAITERS() * and release can modify a mutex with a non-zero * owner field. * * o If the holding LWP switches away, it posts a store * fence before changing curlwp, ensuring that any * overwrite of the mutex waiters flag by mutex_exit() * completes before the modification of curlwp becomes * visible to this CPU. * * o cpu_switchto() posts a store fence after setting curlwp * and before resuming execution of an LWP. * * o _kernel_lock() posts a store fence before setting * curcpu()->ci_biglock_wanted, and after clearing it. * This ensures that any overwrite of the mutex waiters * flag by mutex_exit() completes before the modification * of ci_biglock_wanted becomes visible. * * After MUTEX_SET_WAITERS() succeeds, simultaneously * confirming that the same LWP still holds the mutex * since we took the turnstile lock and notifying it that * we're waiting, we check the lock holder's status again. * Some of the possible outcomes (not an exhaustive list; * XXX this should be made exhaustive): * * 1. The on-CPU check returns true: the holding LWP is * running again. The lock may be released soon and * we should spin. Importantly, we can't trust the * value of the waiters flag. * * 2. The on-CPU check returns false: the holding LWP is * not running. We now have the opportunity to check * if mutex_exit() has blatted the modifications made * by MUTEX_SET_WAITERS(). * * 3. The on-CPU check returns false: the holding LWP may * or may not be running. It has context switched at * some point during our check. Again, we have the * chance to see if the waiters bit is still set or * has been overwritten. * * 4. The on-CPU check returns false: the holding LWP is * running on a CPU, but wants the big lock. It's OK * to check the waiters field in this case. * * 5. The has-waiters check fails: the mutex has been * released, the waiters flag cleared and another LWP * now owns the mutex. * * 6. The has-waiters check fails: the mutex has been * released. * * If the waiters bit is not set it's unsafe to go asleep, * as we might never be awoken. */ if (mutex_oncpu(owner)) { turnstile_exit(mtx); owner = mtx->mtx_owner; continue; } membar_consumer(); if (!MUTEX_HAS_WAITERS(mtx)) { turnstile_exit(mtx); owner = mtx->mtx_owner; continue; } #endif /* MULTIPROCESSOR */ LOCKSTAT_START_TIMER(lsflag, slptime); turnstile_block(ts, TS_WRITER_Q, mtx, &mutex_syncobj); LOCKSTAT_STOP_TIMER(lsflag, slptime); LOCKSTAT_COUNT(slpcnt, 1); owner = mtx->mtx_owner; } KPREEMPT_ENABLE(curlwp); LOCKSTAT_EVENT(lsflag, mtx, LB_ADAPTIVE_MUTEX | LB_SLEEP1, slpcnt, slptime); LOCKSTAT_EVENT(lsflag, mtx, LB_ADAPTIVE_MUTEX | LB_SPIN, spincnt, spintime); LOCKSTAT_EXIT(lsflag); MUTEX_DASSERT(mtx, MUTEX_OWNER(mtx->mtx_owner) == curthread); MUTEX_LOCKED(mtx); } /* * mutex_vector_exit: * * Support routine for mutex_exit() that handles all cases. 
*/ void mutex_vector_exit(kmutex_t *mtx) { turnstile_t *ts; uintptr_t curthread; if (MUTEX_SPIN_P(mtx->mtx_owner)) { #ifdef FULL if (__predict_false(!MUTEX_SPINBIT_LOCKED_P(mtx))) { MUTEX_ABORT(mtx, "exiting unheld spin mutex"); } MUTEX_UNLOCKED(mtx); MUTEX_SPINBIT_LOCK_UNLOCK(mtx); #endif MUTEX_SPIN_SPLRESTORE(mtx); return; } #ifndef __HAVE_MUTEX_STUBS /* * On some architectures without mutex stubs, we can enter here to * release mutexes before interrupts and whatnot are up and running. * We need this hack to keep them sweet. */ if (__predict_false(cold)) { MUTEX_UNLOCKED(mtx); MUTEX_RELEASE(mtx); return; } #endif curthread = (uintptr_t)curlwp; MUTEX_DASSERT(mtx, curthread != 0); MUTEX_ASSERT(mtx, MUTEX_OWNER(mtx->mtx_owner) == curthread); MUTEX_UNLOCKED(mtx); #if !defined(LOCKDEBUG) __USE(curthread); #endif #ifdef LOCKDEBUG /* * Avoid having to take the turnstile chain lock every time * around. Raise the priority level to splhigh() in order * to disable preemption and so make the following atomic. * This also blocks out soft interrupts that could set the * waiters bit. */ { int s = splhigh(); if (!MUTEX_HAS_WAITERS(mtx)) { MUTEX_RELEASE(mtx); splx(s); return; } splx(s); } #endif /* * Get this lock's turnstile. This gets the interlock on * the sleep queue. Once we have that, we can clear the * lock. If there was no turnstile for the lock, there * were no waiters remaining. */ ts = turnstile_lookup(mtx); if (ts == NULL) { MUTEX_RELEASE(mtx); turnstile_exit(mtx); } else { MUTEX_RELEASE(mtx); turnstile_wakeup(ts, TS_WRITER_Q, TS_WAITERS(ts, TS_WRITER_Q), NULL); } } #ifndef __HAVE_SIMPLE_MUTEXES /* * mutex_wakeup: * * Support routine for mutex_exit() that wakes up all waiters. * We assume that the mutex has been released, but it need not * be. */ void mutex_wakeup(kmutex_t *mtx) { turnstile_t *ts; ts = turnstile_lookup(mtx); if (ts == NULL) { turnstile_exit(mtx); return; } MUTEX_CLEAR_WAITERS(mtx); turnstile_wakeup(ts, TS_WRITER_Q, TS_WAITERS(ts, TS_WRITER_Q), NULL); } #endif /* !__HAVE_SIMPLE_MUTEXES */ /* * mutex_owned: * * Return true if the current LWP (adaptive) or CPU (spin) * holds the mutex. */ int mutex_owned(const kmutex_t *mtx) { if (mtx == NULL) return 0; if (MUTEX_ADAPTIVE_P(mtx->mtx_owner)) return MUTEX_OWNER(mtx->mtx_owner) == (uintptr_t)curlwp; #ifdef FULL return MUTEX_SPINBIT_LOCKED_P(mtx); #else return 1; #endif } /* * mutex_owner: * * Return the current owner of an adaptive mutex. Used for * priority inheritance. */ static lwp_t * mutex_owner(wchan_t wchan) { volatile const kmutex_t *mtx = wchan; MUTEX_ASSERT(mtx, MUTEX_ADAPTIVE_P(mtx->mtx_owner)); return (struct lwp *)MUTEX_OWNER(mtx->mtx_owner); } /* * mutex_ownable: * * When compiled with DEBUG and LOCKDEBUG defined, ensure that * the mutex is available. We cannot use !mutex_owned() since * that won't work correctly for spin mutexes. */ int mutex_ownable(const kmutex_t *mtx) { #ifdef LOCKDEBUG MUTEX_TESTLOCK(mtx); #endif return 1; } /* * mutex_tryenter: * * Try to acquire the mutex; return non-zero if we did. */ int mutex_tryenter(kmutex_t *mtx) { uintptr_t curthread; /* * Handle spin mutexes. 
*/ if (MUTEX_SPIN_P(mtx->mtx_owner)) { MUTEX_SPIN_SPLRAISE(mtx); #ifdef FULL if (MUTEX_SPINBIT_LOCK_TRY(mtx)) { MUTEX_WANTLOCK(mtx); MUTEX_LOCKED(mtx); return 1; } MUTEX_SPIN_SPLRESTORE(mtx); #else MUTEX_WANTLOCK(mtx); MUTEX_LOCKED(mtx); return 1; #endif } else { curthread = (uintptr_t)curlwp; MUTEX_ASSERT(mtx, curthread != 0); if (MUTEX_ACQUIRE(mtx, curthread)) { MUTEX_WANTLOCK(mtx); MUTEX_LOCKED(mtx); MUTEX_DASSERT(mtx, MUTEX_OWNER(mtx->mtx_owner) == curthread); return 1; } } return 0; } #if defined(__HAVE_SPIN_MUTEX_STUBS) || defined(FULL) /* * mutex_spin_retry: * * Support routine for mutex_spin_enter(). Assumes that the caller * has already raised the SPL, and adjusted counters. */ void mutex_spin_retry(kmutex_t *mtx) { #ifdef MULTIPROCESSOR u_int count; LOCKSTAT_TIMER(spintime); LOCKSTAT_FLAG(lsflag); #ifdef LOCKDEBUG u_int spins = 0; #endif /* LOCKDEBUG */ MUTEX_WANTLOCK(mtx); LOCKSTAT_ENTER(lsflag); LOCKSTAT_START_TIMER(lsflag, spintime); count = SPINLOCK_BACKOFF_MIN; /* * Spin testing the lock word and do exponential backoff * to reduce cache line ping-ponging between CPUs. */ do { while (MUTEX_SPINBIT_LOCKED_P(mtx)) { SPINLOCK_BACKOFF(count); #ifdef LOCKDEBUG if (SPINLOCK_SPINOUT(spins)) MUTEX_ABORT(mtx, "spinout"); #endif /* LOCKDEBUG */ } } while (!MUTEX_SPINBIT_LOCK_TRY(mtx)); LOCKSTAT_STOP_TIMER(lsflag, spintime); LOCKSTAT_EVENT(lsflag, mtx, LB_SPIN_MUTEX | LB_SPIN, 1, spintime); LOCKSTAT_EXIT(lsflag); MUTEX_LOCKED(mtx); #else /* MULTIPROCESSOR */ MUTEX_ABORT(mtx, "locking against myself"); #endif /* MULTIPROCESSOR */ } #endif /* defined(__HAVE_SPIN_MUTEX_STUBS) || defined(FULL) */
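/*
 * Simplified, self-contained analogue (C11 atomics, userland) of the owner
 * word handled by MUTEX_ACQUIRE()/MUTEX_SET_WAITERS()/MUTEX_RELEASE() above:
 * the owner's id and a low "waiters" bit share a single word, acquisition is
 * a compare-and-swap from 0, and release hands the word back while reporting
 * whether anyone must be woken.  This is a sketch of the idea only - no
 * turnstiles, no spinning, no debug bits - not the kernel's implementation.
 * "self" must be non-zero with the low bit clear (e.g. an aligned pointer).
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define TOY_WAITERS	((uintptr_t)1)

struct toy_mutex {
	_Atomic uintptr_t owner;	/* 0 = unowned */
};

/* Try to take an unowned mutex; cf. MUTEX_ACQUIRE(). */
static bool
toy_acquire(struct toy_mutex *m, uintptr_t self)
{
	uintptr_t expected = 0;

	return atomic_compare_exchange_strong_explicit(&m->owner, &expected,
	    self, memory_order_acquire, memory_order_relaxed);
}

/* Mark a held mutex as contended; cf. MUTEX_SET_WAITERS(). */
static bool
toy_set_waiters(struct toy_mutex *m, uintptr_t owner)
{
	uintptr_t expected = owner;

	return atomic_compare_exchange_strong_explicit(&m->owner, &expected,
	    owner | TOY_WAITERS, memory_order_acq_rel, memory_order_relaxed);
}

/* Drop the mutex; cf. MUTEX_RELEASE().  Returns true if the caller must
 * wake waiters. */
static bool
toy_release(struct toy_mutex *m)
{
	uintptr_t old = atomic_exchange_explicit(&m->owner, 0,
	    memory_order_release);

	return (old & TOY_WAITERS) != 0;
}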
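/*
 * Sketch of the exponential-backoff spin that mutex_vector_enter() and
 * mutex_spin_retry() above perform with SPINLOCK_BACKOFF_MIN and
 * SPINLOCK_BACKOFF: spin on a plain load (test-and-test-and-set), pausing
 * for a bounded, doubling number of iterations between probes to reduce
 * cache-line ping-pong between CPUs.  The constants and the pause loop are
 * placeholders, not the kernel's primitives.
 */
#include <stdatomic.h>

#define TOY_BACKOFF_MIN	4u
#define TOY_BACKOFF_MAX	128u

static void
toy_pause(unsigned n)
{
	volatile unsigned spin = n;

	while (spin-- > 0)
		;	/* a real port would insert a CPU pause hint here */
}

static void
toy_spin_lock(_Atomic int *lock)
{
	unsigned count = TOY_BACKOFF_MIN;
	int expected;

	for (;;) {
		/* Spin on a read-only probe while the lock looks held. */
		while (atomic_load_explicit(lock, memory_order_relaxed) != 0) {
			toy_pause(count);
			if (count < TOY_BACKOFF_MAX)
				count += count;	/* exponential backoff */
		}
		expected = 0;
		if (atomic_compare_exchange_weak_explicit(lock, &expected, 1,
		    memory_order_acquire, memory_order_relaxed))
			return;
	}
}

static void
toy_spin_unlock(_Atomic int *lock)
{
	atomic_store_explicit(lock, 0, memory_order_release);
}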
/* $NetBSD: sys_select.c,v 1.66 2023/10/15 10:29:34 riastradh Exp $ */ /*- * Copyright (c) 2007, 2008, 2009, 2010, 2019, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran and Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95 */ /* * System calls of synchronous I/O multiplexing subsystem. * * Locking * * Two locks are used: <object-lock> and selcluster_t::sc_lock. * * The <object-lock> might be a device driver or another subsystem, e.g. * socket or pipe. This lock is not exported, and thus invisible to this * subsystem. Mainly, synchronisation between selrecord() and selnotify() * routines depends on this lock, as it will be described in the comments. * * Lock order * * <object-lock> -> * selcluster_t::sc_lock */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.66 2023/10/15 10:29:34 riastradh Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/bitops.h> #include <sys/cpu.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/lwp.h> #include <sys/mount.h> #include <sys/poll.h> #include <sys/proc.h> #include <sys/signalvar.h> #include <sys/sleepq.h> #include <sys/socketvar.h> #include <sys/socketvar.h> #include <sys/syncobj.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/uio.h> /* Flags for lwp::l_selflag. */ #define SEL_RESET 0 /* awoken, interrupted, or not yet polling */ #define SEL_SCANNING 1 /* polling descriptors */ #define SEL_BLOCKING 2 /* blocking and waiting for event */ #define SEL_EVENT 3 /* interrupted, events set directly */ /* * Per-cluster state for select()/poll(). For a system with fewer * than 64 CPUs, this gives us per-CPU clusters. */ #define SELCLUSTERS 64 #define SELCLUSTERMASK (SELCLUSTERS - 1) typedef struct selcluster { kmutex_t *sc_lock; sleepq_t sc_sleepq; uint64_t sc_mask; int sc_ncoll; } selcluster_t; static inline int selscan(char *, const int, const size_t, register_t *); static inline int pollscan(struct pollfd *, const int, register_t *); static void selclear(void); static const int sel_flag[] = { POLLRDNORM | POLLHUP | POLLERR, POLLWRNORM | POLLHUP | POLLERR, POLLRDBAND }; /* * LWPs are woken using the sleep queue only due to a collision, the case * with the maximum Suck Factor. Save the cost of sorting for named waiters * by inserting in LIFO order. In the future it would be preferable to not * enqueue LWPs at all, unless subject to a collision. */ syncobj_t select_sobj = { .sobj_name = "select", .sobj_flag = SOBJ_SLEEPQ_LIFO, .sobj_boostpri = PRI_KERNEL, .sobj_unsleep = sleepq_unsleep, .sobj_changepri = sleepq_changepri, .sobj_lendpri = sleepq_lendpri, .sobj_owner = syncobj_noowner, }; static selcluster_t *selcluster[SELCLUSTERS] __read_mostly; static int direct_select __read_mostly = 0; /* Operations: either select() or poll(). */ const char selop_select[] = "select"; const char selop_poll[] = "poll"; /* * Select system call. 
*/ int sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap, register_t *retval) { /* { syscallarg(int) nd; syscallarg(fd_set *) in; syscallarg(fd_set *) ou; syscallarg(fd_set *) ex; syscallarg(const struct timespec *) ts; syscallarg(sigset_t *) mask; } */ struct timespec ats, *ts = NULL; sigset_t amask, *mask = NULL; int error; if (SCARG(uap, ts)) { error = copyin(SCARG(uap, ts), &ats, sizeof(ats)); if (error) return error; ts = &ats; } if (SCARG(uap, mask) != NULL) { error = copyin(SCARG(uap, mask), &amask, sizeof(amask)); if (error) return error; mask = &amask; } return selcommon(retval, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou), SCARG(uap, ex), ts, mask); } int sys___select50(struct lwp *l, const struct sys___select50_args *uap, register_t *retval) { /* { syscallarg(int) nd; syscallarg(fd_set *) in; syscallarg(fd_set *) ou; syscallarg(fd_set *) ex; syscallarg(struct timeval *) tv; } */ struct timeval atv; struct timespec ats, *ts = NULL; int error; if (SCARG(uap, tv)) { error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv)); if (error) return error; if (atv.tv_usec < 0 || atv.tv_usec >= 1000000) return EINVAL; TIMEVAL_TO_TIMESPEC(&atv, &ats); ts = &ats; } return selcommon(retval, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou), SCARG(uap, ex), ts, NULL); } /* * sel_do_scan: common code to perform the scan on descriptors. */ static int sel_do_scan(const char *opname, void *fds, const int nf, const size_t ni, struct timespec *ts, sigset_t *mask, register_t *retval) { lwp_t * const l = curlwp; selcluster_t *sc; kmutex_t *lock; struct timespec sleepts; int error, timo; timo = 0; if (ts && inittimeleft(ts, &sleepts) == -1) { return EINVAL; } if (__predict_false(mask)) sigsuspendsetup(l, mask); /* * We may context switch during or at any time after picking a CPU * and cluster to associate with, but it doesn't matter. In the * unlikely event we migrate elsewhere all we risk is a little lock * contention; correctness is not sacrificed. */ sc = curcpu()->ci_data.cpu_selcluster; lock = sc->sc_lock; l->l_selcluster = sc; if (opname == selop_select) { l->l_selbits = fds; l->l_selni = ni; } else { l->l_selbits = NULL; } for (;;) { int ncoll; SLIST_INIT(&l->l_selwait); l->l_selret = 0; /* * No need to lock. If this is overwritten by another value * while scanning, we will retry below. We only need to see * exact state from the descriptors that we are about to poll, * and lock activity resulting from fo_poll is enough to * provide an up to date value for new polling activity. */ if (ts && (ts->tv_sec | ts->tv_nsec | direct_select) == 0) { /* Non-blocking: no need for selrecord()/selclear() */ l->l_selflag = SEL_RESET; } else { l->l_selflag = SEL_SCANNING; } ncoll = sc->sc_ncoll; membar_release(); if (opname == selop_select) { error = selscan((char *)fds, nf, ni, retval); } else { error = pollscan((struct pollfd *)fds, nf, retval); } if (error || *retval) break; if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0) break; /* * Acquire the lock and perform the (re)checks. Note, if * collision has occurred, then our state does not matter, * as we must perform re-scan. Therefore, check it first. */ state_check: mutex_spin_enter(lock); if (__predict_false(sc->sc_ncoll != ncoll)) { /* Collision: perform re-scan. */ mutex_spin_exit(lock); selclear(); continue; } if (__predict_true(l->l_selflag == SEL_EVENT)) { /* Events occurred, they are set directly. */ mutex_spin_exit(lock); break; } if (__predict_true(l->l_selflag == SEL_RESET)) { /* Events occurred, but re-scan is requested. 
*/ mutex_spin_exit(lock); selclear(); continue; } /* Nothing happen, therefore - sleep. */ l->l_selflag = SEL_BLOCKING; KASSERT(l->l_blcnt == 0); (void)sleepq_enter(&sc->sc_sleepq, l, lock); sleepq_enqueue(&sc->sc_sleepq, sc, opname, &select_sobj, true); error = sleepq_block(timo, true, &select_sobj, 0); if (error != 0) { break; } /* Awoken: need to check the state. */ goto state_check; } selclear(); /* Add direct events if any. */ if (l->l_selflag == SEL_EVENT) { KASSERT(l->l_selret != 0); *retval += l->l_selret; } if (__predict_false(mask)) sigsuspendteardown(l); /* select and poll are not restarted after signals... */ if (error == ERESTART) return EINTR; if (error == EWOULDBLOCK) return 0; return error; } int selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou, fd_set *u_ex, struct timespec *ts, sigset_t *mask) { char smallbits[howmany(FD_SETSIZE, NFDBITS) * sizeof(fd_mask) * 6]; char *bits; int error, nf; size_t ni; if (nd < 0) return (EINVAL); nf = atomic_load_consume(&curlwp->l_fd->fd_dt)->dt_nfiles; if (nd > nf) { /* forgiving; slightly wrong */ nd = nf; } ni = howmany(nd, NFDBITS) * sizeof(fd_mask); if (ni * 6 > sizeof(smallbits)) bits = kmem_alloc(ni * 6, KM_SLEEP); else bits = smallbits; #define getbits(name, x) \ if (u_ ## name) { \ error = copyin(u_ ## name, bits + ni * x, ni); \ if (error) \ goto fail; \ } else \ memset(bits + ni * x, 0, ni); getbits(in, 0); getbits(ou, 1); getbits(ex, 2); #undef getbits error = sel_do_scan(selop_select, bits, nd, ni, ts, mask, retval); if (error == 0 && u_in != NULL) error = copyout(bits + ni * 3, u_in, ni); if (error == 0 && u_ou != NULL) error = copyout(bits + ni * 4, u_ou, ni); if (error == 0 && u_ex != NULL) error = copyout(bits + ni * 5, u_ex, ni); fail: if (bits != smallbits) kmem_free(bits, ni * 6); return (error); } static inline int selscan(char *bits, const int nfd, const size_t ni, register_t *retval) { fd_mask *ibitp, *obitp; int msk, i, j, fd, n; file_t *fp; lwp_t *l; ibitp = (fd_mask *)(bits + ni * 0); obitp = (fd_mask *)(bits + ni * 3); n = 0; l = curlwp; memset(obitp, 0, ni * 3); for (msk = 0; msk < 3; msk++) { for (i = 0; i < nfd; i += NFDBITS) { fd_mask ibits, obits; ibits = *ibitp; obits = 0; while ((j = ffs(ibits)) && (fd = i + --j) < nfd) { ibits &= ~(1U << j); if ((fp = fd_getfile(fd)) == NULL) return (EBADF); /* * Setup an argument to selrecord(), which is * a file descriptor number. */ l->l_selrec = fd; if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) { if (!direct_select) { /* * Have events: do nothing in * selrecord(). */ l->l_selflag = SEL_RESET; } obits |= (1U << j); n++; } fd_putfile(fd); } if (obits != 0) { if (direct_select) { kmutex_t *lock; lock = l->l_selcluster->sc_lock; mutex_spin_enter(lock); *obitp |= obits; mutex_spin_exit(lock); } else { *obitp |= obits; } } ibitp++; obitp++; } } *retval = n; return (0); } /* * Poll system call. */ int sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval) { /* { syscallarg(struct pollfd *) fds; syscallarg(u_int) nfds; syscallarg(int) timeout; } */ struct timespec ats, *ts = NULL; if (SCARG(uap, timeout) != INFTIM) { ats.tv_sec = SCARG(uap, timeout) / 1000; ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000; ts = &ats; } return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL); } /* * Poll system call. 
*/ int sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap, register_t *retval) { /* { syscallarg(struct pollfd *) fds; syscallarg(u_int) nfds; syscallarg(const struct timespec *) ts; syscallarg(const sigset_t *) mask; } */ struct timespec ats, *ts = NULL; sigset_t amask, *mask = NULL; int error; if (SCARG(uap, ts)) { error = copyin(SCARG(uap, ts), &ats, sizeof(ats)); if (error) return error; ts = &ats; } if (SCARG(uap, mask)) { error = copyin(SCARG(uap, mask), &amask, sizeof(amask)); if (error) return error; mask = &amask; } return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask); } int pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds, struct timespec *ts, sigset_t *mask) { struct pollfd smallfds[32]; struct pollfd *fds; int error; size_t ni; if (nfds > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + 1000) { /* * Prevent userland from causing over-allocation. * Raising the default limit too high can still cause * a lot of memory to be allocated, but this also means * that the file descriptor array will also be large. * * To reduce the memory requirements here, we could * process the 'fds' array in chunks, but that * is a lot of code that isn't normally useful. * (Or just move the copyin/out into pollscan().) * * Historically the code silently truncated 'fds' to * dt_nfiles entries - but that does cause issues. * * Using the max limit equivalent to sysctl * kern.maxfiles is the moral equivalent of OPEN_MAX * as specified by POSIX. * * We add a slop of 1000 in case the resource limit was * changed after opening descriptors or the same descriptor * was specified more than once. */ return EINVAL; } ni = nfds * sizeof(struct pollfd); if (ni > sizeof(smallfds)) fds = kmem_alloc(ni, KM_SLEEP); else fds = smallfds; error = copyin(u_fds, fds, ni); if (error) goto fail; error = sel_do_scan(selop_poll, fds, nfds, ni, ts, mask, retval); if (error == 0) error = copyout(fds, u_fds, ni); fail: if (fds != smallfds) kmem_free(fds, ni); return (error); } static inline int pollscan(struct pollfd *fds, const int nfd, register_t *retval) { file_t *fp; int i, n = 0, revents; for (i = 0; i < nfd; i++, fds++) { fds->revents = 0; if (fds->fd < 0) { revents = 0; } else if ((fp = fd_getfile(fds->fd)) == NULL) { revents = POLLNVAL; } else { /* * Perform poll: registers select request or returns * the events which are set. Setup an argument for * selrecord(), which is a pointer to struct pollfd. */ curlwp->l_selrec = (uintptr_t)fds; revents = (*fp->f_ops->fo_poll)(fp, fds->events | POLLERR | POLLHUP); fd_putfile(fds->fd); } if (revents) { if (!direct_select) { /* Have events: do nothing in selrecord(). */ curlwp->l_selflag = SEL_RESET; } fds->revents = revents; n++; } } *retval = n; return (0); } int seltrue(dev_t dev, int events, lwp_t *l) { return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Record a select request. Concurrency issues: * * The caller holds the same lock across calls to selrecord() and * selnotify(), so we don't need to consider a concurrent wakeup * while in this routine. * * The only activity we need to guard against is selclear(), called by * another thread that is exiting sel_do_scan(). * `sel_lwp' can only become non-NULL while the caller's lock is held, * so it cannot become non-NULL due to a change made by another thread * while we are in this routine. It can only become _NULL_ due to a * call to selclear(). * * If it is non-NULL and != selector there is the potential for * selclear() to be called by another thread. 
If either of those * conditions are true, we're not interested in touching the `named * waiter' part of the selinfo record because we need to record a * collision. Hence there is no need for additional locking in this * routine. */ void selrecord(lwp_t *selector, struct selinfo *sip) { selcluster_t *sc; lwp_t *other; KASSERT(selector == curlwp); sc = selector->l_selcluster; other = sip->sel_lwp; if (selector->l_selflag == SEL_RESET) { /* 0. We're not going to block - will poll again if needed. */ } else if (other == selector) { /* 1. We (selector) already claimed to be the first LWP. */ KASSERT(sip->sel_cluster == sc); } else if (other == NULL) { /* * 2. No first LWP, therefore we (selector) are the first. * * There may be unnamed waiters (collisions). Issue a memory * barrier to ensure that we access sel_lwp (above) before * other fields - this guards against a call to selclear(). */ membar_acquire(); sip->sel_lwp = selector; SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain); /* Copy the argument, which is for selnotify(). */ sip->sel_fdinfo = selector->l_selrec; /* Replace selinfo's lock with the chosen cluster's lock. */ sip->sel_cluster = sc; } else { /* 3. Multiple waiters: record a collision. */ sip->sel_collision |= sc->sc_mask; KASSERT(sip->sel_cluster != NULL); } } /* * Record a knote. * * The caller holds the same lock as for selrecord(). */ void selrecord_knote(struct selinfo *sip, struct knote *kn) { klist_insert(&sip->sel_klist, kn); } /* * Remove a knote. * * The caller holds the same lock as for selrecord(). * * Returns true if the last knote was removed and the list * is now empty. */ bool selremove_knote(struct selinfo *sip, struct knote *kn) { return klist_remove(&sip->sel_klist, kn); } /* * sel_setevents: a helper function for selnotify(), to set the events * for LWP sleeping in selcommon() or pollcommon(). */ static inline bool sel_setevents(lwp_t *l, struct selinfo *sip, const int events) { const int oflag = l->l_selflag; int ret = 0; /* * If we require re-scan or it was required by somebody else, * then just (re)set SEL_RESET and return. */ if (__predict_false(events == 0 || oflag == SEL_RESET)) { l->l_selflag = SEL_RESET; return true; } /* * Direct set. Note: select state of LWP is locked. First, * determine whether it is selcommon() or pollcommon(). */ if (l->l_selbits != NULL) { const size_t ni = l->l_selni; fd_mask *fds = (fd_mask *)l->l_selbits; fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3); const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK); const int idx = fd >> __NFDSHIFT; int n; for (n = 0; n < 3; n++) { if ((fds[idx] & fbit) != 0 && (ofds[idx] & fbit) == 0 && (sel_flag[n] & events)) { ofds[idx] |= fbit; ret++; } fds = (fd_mask *)((char *)fds + ni); ofds = (fd_mask *)((char *)ofds + ni); } } else { struct pollfd *pfd = (void *)sip->sel_fdinfo; int revents = events & (pfd->events | POLLERR | POLLHUP); if (revents) { if (pfd->revents == 0) ret = 1; pfd->revents |= revents; } } /* Check whether there are any events to return. */ if (!ret) { return false; } /* Indicate direct set and note the event (cluster lock is held). */ l->l_selflag = SEL_EVENT; l->l_selret += ret; return true; } /* * Do a wakeup when a selectable event occurs. Concurrency issues: * * As per selrecord(), the caller's object lock is held. If there * is a named waiter, we must acquire the associated selcluster's lock * in order to synchronize with selclear() and pollers going to sleep * in sel_do_scan(). 
* * sip->sel_cluster cannot change at this point, as it is only changed * in selrecord(), and concurrent calls to selrecord() are locked * out by the caller. */ void selnotify(struct selinfo *sip, int events, long knhint) { selcluster_t *sc; uint64_t mask; int index, oflag; lwp_t *l; kmutex_t *lock; KNOTE(&sip->sel_klist, knhint); if (sip->sel_lwp != NULL) { /* One named LWP is waiting. */ sc = sip->sel_cluster; lock = sc->sc_lock; mutex_spin_enter(lock); /* Still there? */ if (sip->sel_lwp != NULL) { /* * Set the events for our LWP and indicate that. * Otherwise, request a full re-scan. */ l = sip->sel_lwp; oflag = l->l_selflag; if (!direct_select) { l->l_selflag = SEL_RESET; } else if (!sel_setevents(l, sip, events)) { /* No events to return. */ mutex_spin_exit(lock); return; } /* * If the thread is sleeping, wake it up. If it's not * yet asleep, it will notice the change in state * and will re-poll the descriptors. */ if (oflag == SEL_BLOCKING && l->l_mutex == lock) { KASSERT(l->l_wchan == sc); sleepq_remove(l->l_sleepq, l, true); } } mutex_spin_exit(lock); } if ((mask = sip->sel_collision) != 0) { /* * There was a collision (multiple waiters): we must * inform all potentially interested waiters. */ sip->sel_collision = 0; do { index = ffs64(mask) - 1; mask ^= __BIT(index); sc = selcluster[index]; lock = sc->sc_lock; mutex_spin_enter(lock); sc->sc_ncoll++; sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock); } while (__predict_false(mask != 0)); } } /* * Remove an LWP from all objects that it is waiting for. Concurrency * issues: * * The object owner's (e.g. device driver) lock is not held here. Calls * can be made to selrecord() and we do not synchronize against those * directly using locks. However, we use `sel_lwp' to lock out changes. * Before clearing it we must use memory barriers to ensure that we can * safely traverse the list of selinfo records. */ static void selclear(void) { struct selinfo *sip, *next; selcluster_t *sc; lwp_t *l; kmutex_t *lock; l = curlwp; sc = l->l_selcluster; lock = sc->sc_lock; /* * If the request was non-blocking, or we found events on the first * descriptor, there will be no need to clear anything - avoid * taking the lock. */ if (SLIST_EMPTY(&l->l_selwait)) { return; } mutex_spin_enter(lock); for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) { KASSERT(sip->sel_lwp == l); KASSERT(sip->sel_cluster == l->l_selcluster); /* * Read link to next selinfo record, if any. * It's no longer safe to touch `sip' after clearing * `sel_lwp', so ensure that the read of `sel_chain' * completes before the clearing of sel_lwp becomes * globally visible. */ next = SLIST_NEXT(sip, sel_chain); /* Release the record for another named waiter to use. */ atomic_store_release(&sip->sel_lwp, NULL); } mutex_spin_exit(lock); } /* * Initialize the select/poll system calls. Called once for each * CPU in the system, as they are attached. */ void selsysinit(struct cpu_info *ci) { selcluster_t *sc; u_int index; /* If a cluster is already in place for this bit, re-use it. */ index = cpu_index(ci) & SELCLUSTERMASK; sc = selcluster[index]; if (sc == NULL) { sc = kmem_alloc(roundup2(sizeof(selcluster_t), coherency_unit) + coherency_unit, KM_SLEEP); sc = (void *)roundup2((uintptr_t)sc, coherency_unit); sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); sleepq_init(&sc->sc_sleepq); sc->sc_ncoll = 0; sc->sc_mask = __BIT(index); selcluster[index] = sc; } ci->ci_data.cpu_selcluster = sc; } /* * Initialize a selinfo record. 
*/ void selinit(struct selinfo *sip) { memset(sip, 0, sizeof(*sip)); klist_init(&sip->sel_klist); } /* * Destroy a selinfo record. The owning object must not gain new * references while this is in progress: all activity on the record * must be stopped. * * Concurrency issues: we only need guard against a call to selclear() * by a thread exiting sel_do_scan(). The caller has prevented further * references being made to the selinfo record via selrecord(), and it * will not call selnotify() again. */ void seldestroy(struct selinfo *sip) { selcluster_t *sc; kmutex_t *lock; lwp_t *l; klist_fini(&sip->sel_klist); if (sip->sel_lwp == NULL) return; /* * Lock out selclear(). The selcluster pointer can't change while * we are here since it is only ever changed in selrecord(), * and that will not be entered again for this record because * it is dying. */ KASSERT(sip->sel_cluster != NULL); sc = sip->sel_cluster; lock = sc->sc_lock; mutex_spin_enter(lock); if ((l = sip->sel_lwp) != NULL) { /* * This should rarely happen, so although SLIST_REMOVE() * is slow, using it here is not a problem. */ KASSERT(l->l_selcluster == sc); SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain); sip->sel_lwp = NULL; } mutex_spin_exit(lock); } /* * System control nodes. */ SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "direct_select", SYSCTL_DESCR("Enable/disable direct select (for testing)"), NULL, 0, &direct_select, 0, CTL_KERN, CTL_CREATE, CTL_EOL); }
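/*
 * Sketch of the driver side of the machinery above, following the usual
 * select(9) pattern: a device's poll routine calls selrecord() when no event
 * is pending, and its data-arrival path calls selnotify() to wake anyone
 * recorded.  "mydev", its softc and cfdriver glue are hypothetical; sc_rsel
 * is assumed to have been set up with selinit() at attach time (and torn
 * down with seldestroy()), and sc_lock initialized at an IPL suitable for
 * the context that calls mydev_dataready().
 */
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/select.h>

struct mydev_softc {
	kmutex_t	sc_lock;	/* also covers sc_rsel */
	struct selinfo	sc_rsel;	/* readers in select/poll */
	int		sc_havedata;
};

extern struct cfdriver mydev_cd;	/* hypothetical autoconf glue */

static int
mydev_poll(dev_t dev, int events, struct lwp *l)
{
	struct mydev_softc *sc = device_lookup_private(&mydev_cd, minor(dev));
	int revents = 0;

	mutex_enter(&sc->sc_lock);
	if (events & (POLLIN | POLLRDNORM)) {
		if (sc->sc_havedata)
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(l, &sc->sc_rsel);	/* register waiter */
	}
	mutex_exit(&sc->sc_lock);
	return revents;
}

static void
mydev_dataready(struct mydev_softc *sc)
{
	mutex_enter(&sc->sc_lock);
	sc->sc_havedata = 1;
	/* Wake select/poll waiters; the hint is forwarded to KNOTE(). */
	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
	mutex_exit(&sc->sc_lock);
}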
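/*
 * Layout sketch for the scratch buffer that selcommon() above hands to
 * sel_do_scan()/selscan(): a single allocation of 6 * ni bytes, with
 * ni = howmany(nd, NFDBITS) * sizeof(fd_mask).  Regions 0-2 hold the
 * copied-in read/write/except sets and regions 3-5 accumulate the results
 * copied back out (hence the "+ ni * 3" offsets in selscan() and
 * sel_setevents()).  The helper and names below are illustrative only.
 */
#include <sys/param.h>		/* howmany() */
#include <sys/select.h>		/* fd_mask, NFDBITS */

struct selbits_layout {
	fd_mask *in[3];		/* input:  read, write, except */
	fd_mask *out[3];	/* output: read, write, except */
};

static void
selbits_index(char *bits, int nd, struct selbits_layout *sl)
{
	const size_t ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	int i;

	for (i = 0; i < 3; i++) {
		sl->in[i] = (fd_mask *)(bits + ni * i);
		sl->out[i] = (fd_mask *)(bits + ni * (3 + i));
	}
}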
/* $NetBSD: nd6_nbr.c,v 1.183 2023/03/29 13:01:44 kardel Exp $ */ /* $KAME: nd6_nbr.c,v 1.61 2001/02/10 16:06:14 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: nd6_nbr.c,v 1.183 2023/03/29 13:01:44 kardel Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kmem.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sockio.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/syslog.h> #include <sys/queue.h> #include <sys/callout.h> #include <sys/cprng.h> #include <net/if.h> #include <net/if_types.h> #include <net/if_dl.h> #include <net/if_llatbl.h> #include <net/nd.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet6/in6_var.h> #include <netinet6/in6_ifattach.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #include <netinet6/nd6.h> #include <netinet/icmp6.h> #include <netinet6/icmp6_private.h> #include "carp.h" #if NCARP > 0 #include <netinet/ip_carp.h> #endif struct dadq; static struct dadq *nd6_dad_find(struct ifaddr *, struct nd_opt_nonce *, bool *); static bool nd6_dad_ownnonce(struct ifaddr *, struct nd_opt_nonce *nonce); static void nd6_dad_starttimer(struct dadq *, int); static void nd6_dad_destroytimer(struct dadq *); static void nd6_dad_timer(struct dadq *); static void nd6_dad_ns_output(struct dadq *, struct ifaddr *); static void nd6_dad_input(struct ifaddr *, struct nd_opt_nonce *, const struct sockaddr_dl *); static void nd6_dad_duplicated(struct ifaddr *, struct dadq *, const struct sockaddr_dl *); static int dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ /* * Input a Neighbor Solicitation Message. 
* * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) */ void nd6_ns_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp, *ifpc; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_solicit *nd_ns; struct in6_addr saddr6 = ip6->ip6_src; struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; struct in6_addr myaddr6; char *lladdr = NULL; struct ifaddr *ifa = NULL; int lladdrlen = 0; int anycast = 0, proxy = 0, tentative = 0; int router = ip6_forwarding; int tlladdr; union nd_opts ndopts; const struct sockaddr_dl *proxydl = NULL; struct psref psref; struct psref psref_c; struct psref psref_ia; char ip6buf[INET6_ADDRSTRLEN], ip6buf2[INET6_ADDRSTRLEN]; ifp = ifpc = m_get_rcvif_psref(m, &psref); if (ifp == NULL) goto freeit; IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len); if (nd_ns == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); m_put_rcvif_psref(ifp, &psref); return; } ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */ taddr6 = nd_ns->nd_ns_target; if (in6_setscope(&taddr6, ifp, NULL) != 0) goto bad; if (ip6->ip6_hlim != 255) { nd6log(LOG_ERR, "invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, IN6_PRINT(ip6buf, &ip6->ip6_src), IN6_PRINT(ip6buf2, &ip6->ip6_dst), if_name(ifp)); goto bad; } if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { /* dst has to be a solicited node multicast address. */ /* don't check ifindex portion */ if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL && daddr6.s6_addr32[1] == 0 && daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE && daddr6.s6_addr8[12] == 0xff) { ; /* good */ } else { nd6log(LOG_INFO, "bad DAD packet (wrong ip6 dst)\n"); goto bad; } } else { struct sockaddr_in6 ssin6; /* * Make sure the source address is from a neighbor's address. */ sockaddr_in6_init(&ssin6, &saddr6, 0, 0, 0); if (nd6_is_addr_neighbor(&ssin6, ifp) == 0) { nd6log(LOG_INFO, "NS packet from non-neighbor %s on %s\n", IN6_PRINT(ip6buf, &saddr6), if_name(ifp)); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log(LOG_INFO, "bad NS target (multicast)\n"); goto bad; } icmp6len -= sizeof(*nd_ns); nd6_option_init(nd_ns + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log(LOG_INFO, "invalid ND option, ignored\n"); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) { nd6log(LOG_INFO, "bad DAD packet (link-layer address option)\n"); goto bad; } /* * Attaching target link-layer address to the NA? * (RFC 2461 7.2.4) * * NS IP dst is multicast MUST add * Otherwise MAY be omitted * * In this implementation, we omit the target link-layer address * in the "MAY" case. */ #if 0 /* too much! */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &daddr6); if (ifa && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)) tlladdr = 0; else #endif if (!IN6_IS_ADDR_MULTICAST(&daddr6)) tlladdr = 0; else tlladdr = 1; /* * Target address (taddr6) must be either: * (1) Valid unicast/anycast address for my receiving interface, * (2) Unicast address for which I'm offering proxy service, or * (3) "tentative" address on which DAD is being performed. */ /* (1) and (3) check. 
*/ #if NCARP > 0 if (ifp->if_carp && ifp->if_type != IFT_CARP) { int s = pserialize_read_enter(); ifa = carp_iamatch6(ifp->if_carp, &taddr6); if (ifa != NULL) { ifa_acquire(ifa, &psref_ia); if (ifa->ifa_ifp && ifa->ifa_ifp != ifp) { ifpc = ifa->ifa_ifp; if_acquire(ifpc, &psref_c); } } pserialize_read_exit(s); } else ifa = NULL; if (!ifa) ifa = (struct ifaddr *)in6ifa_ifpwithaddr_psref(ifp, &taddr6, &psref_ia); #else ifa = (struct ifaddr *)in6ifa_ifpwithaddr_psref(ifp, &taddr6, &psref_ia); #endif /* (2) check. */ if (ifa == NULL) { struct rtentry *rt; struct sockaddr_in6 tsin6; sockaddr_in6_init(&tsin6, &taddr6, 0, 0, 0); rt = rtalloc1(sin6tosa(&tsin6), 0); if (rt && (rt->rt_flags & RTF_ANNOUNCE) != 0 && rt->rt_gateway->sa_family == AF_LINK) { /* * proxy NDP for single entry */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal_psref(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST, &psref_ia); if (ifa) { proxy = 1; proxydl = satocsdl(rt->rt_gateway); router = 0; /* XXX */ } } if (rt) rt_unref(rt); } if (ifa == NULL) { /* * We've got an NS packet, and we don't have that address * assigned for us. We MUST silently ignore it. * See RFC2461 7.2.3. */ goto freeit; } myaddr6 = *IFA_IN6(ifa); anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST; tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) goto freeit; if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log(LOG_INFO, "lladdrlen mismatch for %s " "(if %d, NS packet %d)\n", IN6_PRINT(ip6buf, &taddr6), ifp->if_addrlen, lladdrlen - 2); goto bad; } if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) { nd6log(LOG_INFO, "duplicate IP6 address %s\n", IN6_PRINT(ip6buf, &saddr6)); goto freeit; } /* * We have neighbor solicitation packet, with target address equals to * one of my tentative address. * * src addr how to process? * --- --- * multicast of course, invalid (rejected in ip6_input) * unicast somebody is doing address resolution -> ignore * unspec dup address detection * * The processing is defined in RFC 2462. */ if (tentative) { /* * If source address is unspecified address, it is for * duplicate address detection. * * If not, the packet is for address resolution; * silently ignore it. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { struct sockaddr_dl sdl, *sdlp; if (lladdr != NULL) sdlp = sockaddr_dl_init(&sdl, sizeof(sdl), ifp->if_index, ifp->if_type, NULL, 0, lladdr, lladdrlen); else sdlp = NULL; nd6_dad_input(ifa, ndopts.nd_opts_nonce, sdlp); } goto freeit; } /* * It looks that sender is performing DAD. * Check that the nonce is not being used by the same address * on another interface. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6) && ndopts.nd_opts_nonce != NULL) { if (nd6_dad_ownnonce(ifa, ndopts.nd_opts_nonce)) goto freeit; } ifa_release(ifa, &psref_ia); ifa = NULL; /* * If the source address is unspecified address, entries must not * be created or updated. * It looks that sender is performing DAD. Output NA toward * all-node multicast address, to tell the sender that I'm using * the address. * S bit ("solicited") must be zero. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { struct in6_addr in6_all; in6_all = in6addr_linklocal_allnodes; if (in6_setscope(&in6_all, ifp, NULL) != 0) goto bad; nd6_na_output(ifpc, &in6_all, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | (ip6_forwarding ? 
ND_NA_FLAG_ROUTER : 0), tlladdr, (const struct sockaddr *)proxydl); goto freeit; } nd6_cache_lladdr(ifpc, &saddr6, lladdr, lladdrlen, ND_NEIGHBOR_SOLICIT, 0); nd6_na_output(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | (router ? ND_NA_FLAG_ROUTER : 0) | ND_NA_FLAG_SOLICITED, tlladdr, (const struct sockaddr *)proxydl); freeit: ifa_release(ifa, &psref_ia); m_put_rcvif_psref(ifp, &psref); if (ifp != ifpc) if_put(ifpc, &psref_c); m_freem(m); return; bad: nd6log(LOG_ERR, "src=%s\n", IN6_PRINT(ip6buf, &saddr6)); nd6log(LOG_ERR, "dst=%s\n", IN6_PRINT(ip6buf, &daddr6)); nd6log(LOG_ERR, "tgt=%s\n", IN6_PRINT(ip6buf, &taddr6)); ICMP6_STATINC(ICMP6_STAT_BADNS); ifa_release(ifa, &psref_ia); m_put_rcvif_psref(ifp, &psref); m_freem(m); } /* * Output a Neighbor Solicitation Message. Caller specifies: * - ICMP6 header source IP6 address * - ND6 header target IP6 address * - ND6 header source datalink address * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) */ void nd6_ns_output(struct ifnet *ifp, const struct in6_addr *daddr6, const struct in6_addr *taddr6, const struct in6_addr *hsrc, const uint8_t *nonce /* duplicate address detection */) { struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; const struct in6_addr *src; struct in6_addr src_in; struct ip6_moptions im6o; int icmp6len; int maxlen; const void *mac; struct route ro; if (IN6_IS_ADDR_MULTICAST(taddr6)) return; memset(&ro, 0, sizeof(ro)); /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_ns); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; KASSERTMSG(max_linkhdr + maxlen <= MCLBYTES, "max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)", max_linkhdr, maxlen, MCLBYTES); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && max_linkhdr + maxlen >= MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return; m_reset_rcvif(m); if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_if_index = if_get_index(ifp); im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_ns); m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len; m->m_data += max_linkhdr; /* or m_align() equivalent? */ /* fill neighbor solicitation packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (daddr6) ip6->ip6_dst = *daddr6; else { ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; ip6->ip6_dst.s6_addr16[1] = 0; ip6->ip6_dst.s6_addr32[1] = 0; ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE; ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3]; ip6->ip6_dst.s6_addr8[12] = 0xff; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) goto bad; } if (nonce == NULL) { int s; /* * RFC2461 7.2.2: * "If the source address of the packet prompting the * solicitation is the same as one of the addresses assigned * to the outgoing interface, that address SHOULD be placed * in the IP Source Address of the outgoing solicitation. * Otherwise, any one of the addresses assigned to the * interface should be used." * * We use the source address for the prompting packet * (hsrc), if: * - hsrc is given from the caller (by giving "ln"), and * - hsrc belongs to the outgoing interface. * Otherwise, we perform the source address selection as usual. 
*/ s = pserialize_read_enter(); if (hsrc && in6ifa_ifpwithaddr(ifp, hsrc)) { pserialize_read_exit(s); src = hsrc; } else { int error; struct sockaddr_in6 dst_sa; pserialize_read_exit(s); sockaddr_in6_init(&dst_sa, &ip6->ip6_dst, 0, 0, 0); error = in6_selectsrc(&dst_sa, NULL, NULL, &ro, NULL, NULL, NULL, &src_in); if (error != 0) { char ip6buf[INET6_ADDRSTRLEN]; nd6log(LOG_DEBUG, "source can't be " "determined: dst=%s, error=%d\n", IN6_PRINT(ip6buf, &dst_sa.sin6_addr), error); goto bad; } src = &src_in; } } else { /* * Source address for DAD packet must always be IPv6 * unspecified address. (0::0) * We actually don't have to 0-clear the address (we did it * above), but we do so here explicitly to make the intention * clearer. */ memset(&src_in, 0, sizeof(src_in)); src = &src_in; } ip6->ip6_src = *src; nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1); nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT; nd_ns->nd_ns_code = 0; nd_ns->nd_ns_reserved = 0; nd_ns->nd_ns_target = *taddr6; in6_clearscope(&nd_ns->nd_ns_target); /* XXX */ /* * Add source link-layer address option. * * spec implementation * --- --- * DAD packet MUST NOT do not add the option * there's no link layer address: * impossible do not add the option * there's link layer address: * Multicast NS MUST add one add the option * Unicast NS SHOULD add one add the option */ if (nonce == NULL && (mac = nd6_ifptomac(ifp))) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); /* 8 byte alignments... */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; memset((void *)nd_opt, 0, optlen); nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; memcpy((void *)(nd_opt + 1), mac, ifp->if_addrlen); } /* Add a nonce option (RFC 3971) to detect looped back NS messages. * This behavior is documented in RFC 7527. */ if (nonce != NULL) { int optlen = sizeof(struct nd_opt_hdr) + ND_OPT_NONCE_LEN; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); /* 8-byte alignment is required. */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; memset(nd_opt, 0, optlen); nd_opt->nd_opt_type = ND_OPT_NONCE; nd_opt->nd_opt_len = optlen >> 3; memcpy(nd_opt + 1, nonce, ND_OPT_NONCE_LEN); } ip6->ip6_plen = htons((u_int16_t)icmp6len); nd_ns->nd_ns_cksum = 0; nd_ns->nd_ns_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len); ip6_output(m, NULL, &ro, nonce != NULL ? IPV6_UNSPECSRC : 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit); ICMP6_STATINC(ICMP6_STAT_OUTHIST + ND_NEIGHBOR_SOLICIT); rtcache_free(&ro); return; bad: rtcache_free(&ro); m_freem(m); return; } /* * Neighbor advertisement input handling. 
* * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void nd6_na_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_advert *nd_na; struct in6_addr saddr6 = ip6->ip6_src; struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; int flags; int is_router; int is_solicited; int is_override; int rt_cmd; char *lladdr = NULL; int lladdrlen = 0; struct ifaddr *ifa; struct llentry *ln = NULL; union nd_opts ndopts; struct sockaddr_in6 ssin6; struct psref psref; struct psref psref_ia; char ip6buf[INET6_ADDRSTRLEN], ip6buf2[INET6_ADDRSTRLEN]; ifp = m_get_rcvif_psref(m, &psref); if (ifp == NULL) goto freeit; if (ip6->ip6_hlim != 255) { nd6log(LOG_ERR, "invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, IN6_PRINT(ip6buf, &ip6->ip6_src), IN6_PRINT(ip6buf2, &ip6->ip6_dst), if_name(ifp)); goto bad; } IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len); if (nd_na == NULL) { m_put_rcvif_psref(ifp, &psref); ICMP6_STATINC(ICMP6_STAT_TOOSHORT); return; } flags = nd_na->nd_na_flags_reserved; is_router = ((flags & ND_NA_FLAG_ROUTER) != 0); is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0); is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0); taddr6 = nd_na->nd_na_target; if (in6_setscope(&taddr6, ifp, NULL)) { goto bad; } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log(LOG_ERR, "invalid target address %s\n", IN6_PRINT(ip6buf, &taddr6)); goto bad; } if (is_solicited && IN6_IS_ADDR_MULTICAST(&daddr6)) { nd6log(LOG_ERR, "a solicited adv is multicasted\n"); goto bad; } icmp6len -= sizeof(*nd_na); nd6_option_init(nd_na + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log(LOG_INFO, "invalid ND option, ignored\n"); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr != NULL) { struct ifnet *ifp_ll; struct psref psref_ll; lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log(LOG_INFO, "lladdrlen mismatch for %s " "(if %d, NA packet %d)\n", IN6_PRINT(ip6buf, &taddr6), ifp->if_addrlen, lladdrlen - 2); goto bad; } ifp_ll = if_get_bylla(lladdr, ifp->if_addrlen, &psref_ll); if (ifp_ll != NULL) { /* it's from me, ignore it. */ if_put(ifp_ll, &psref_ll); goto freeit; } } ifa = (struct ifaddr *)in6ifa_ifpwithaddr_psref(ifp, &taddr6, &psref_ia); /* * Target address matches one of my interface address. * * If my address is tentative, this means that there's somebody * already using the same address as mine. This indicates DAD failure. * This is defined in RFC 2462. * * Otherwise, process as defined in RFC 2461. */ if (ifa) { if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE) { struct sockaddr_dl sdl, *sdlp; if (lladdr != NULL) sdlp = sockaddr_dl_init(&sdl, sizeof(sdl), ifp->if_index, ifp->if_type, NULL, 0, lladdr, lladdrlen); else sdlp = NULL; nd6_dad_input(ifa, NULL, sdlp); } else log(LOG_ERR, "nd6_na_input: duplicate IP6 address %s\n", IN6_PRINT(ip6buf, &taddr6)); ifa_release(ifa, &psref_ia); ifa = NULL; goto freeit; } /* * Make sure the source address is from a neighbor's address. 
*/ sockaddr_in6_init(&ssin6, &saddr6, 0, 0, 0); if (nd6_is_addr_neighbor(&ssin6, ifp) == 0) { nd6log(LOG_INFO, "ND packet from non-neighbor %s on %s\n", IN6_PRINT(ip6buf, &saddr6), if_name(ifp)); goto bad; } /* * If no neighbor cache entry is found, NA SHOULD silently be * discarded. */ ln = nd6_lookup(&taddr6, ifp, true); if (ln == NULL) goto freeit; rt_cmd = 0; if (ln->ln_state <= ND_LLINFO_INCOMPLETE) { /* * If the link-layer has address, and no lladdr option came, * discard the packet. */ if (ifp->if_addrlen && !lladdr) goto freeit; /* * Record link-layer address, and update the state. */ memcpy(&ln->ll_addr, lladdr, ifp->if_addrlen); ln->la_flags |= LLE_VALID; rt_cmd = RTM_ADD; if (is_solicited) { ln->ln_state = ND_LLINFO_REACHABLE; ln->ln_byhint = 0; if (!ND_IS_LLINFO_PERMANENT(ln)) nd_set_timer(ln, ND_TIMER_REACHABLE); } else { ln->ln_state = ND_LLINFO_STALE; nd_set_timer(ln, ND_TIMER_GC); } } else { bool llchange; /* * Check if the link-layer address has changed or not. */ if (lladdr == NULL) llchange = false; else { if (ln->la_flags & LLE_VALID) { if (memcmp(lladdr, &ln->ll_addr, ifp->if_addrlen)) llchange = true; else llchange = false; } else llchange = true; } if (llchange) rt_cmd = RTM_CHANGE; /* * This is VERY complex. Look at it with care. * * override solicit lladdr llchange action * (L: record lladdr) * * 0 0 n -- (2c) * 0 0 y n (2b) L * 0 0 y y (1) REACHABLE->STALE * 0 1 n -- (2c) *->REACHABLE * 0 1 y n (2b) L *->REACHABLE * 0 1 y y (1) REACHABLE->STALE * 1 0 n -- (2a) * 1 0 y n (2a) L * 1 0 y y (2a) L *->STALE * 1 1 n -- (2a) *->REACHABLE * 1 1 y n (2a) L *->REACHABLE * 1 1 y y (2a) L *->REACHABLE */ if (!is_override && lladdr != NULL && llchange) { /* (1) */ /* * If state is REACHABLE, make it STALE. * no other updates should be done. */ if (ln->ln_state == ND_LLINFO_REACHABLE) { ln->ln_state = ND_LLINFO_STALE; nd_set_timer(ln, ND_TIMER_GC); } goto freeit; } else if (is_override /* (2a) */ || (!is_override && lladdr != NULL && !llchange) /* (2b) */ || lladdr == NULL) { /* (2c) */ /* * Update link-local address, if any. */ if (lladdr != NULL) { memcpy(&ln->ll_addr, lladdr, ifp->if_addrlen); ln->la_flags |= LLE_VALID; } /* * If solicited, make the state REACHABLE. * If not solicited and the link-layer address was * changed, make it STALE. */ if (is_solicited) { ln->ln_state = ND_LLINFO_REACHABLE; ln->ln_byhint = 0; if (!ND_IS_LLINFO_PERMANENT(ln)) nd_set_timer(ln, ND_TIMER_REACHABLE); } else { if (lladdr && llchange) { ln->ln_state = ND_LLINFO_STALE; nd_set_timer(ln, ND_TIMER_GC); } } } ln->ln_router = is_router; } /* * XXX: does this matter? * rt->rt_flags &= ~RTF_REJECT; */ ln->ln_asked = 0; nd6_llinfo_release_pkts(ln, ifp); if (rt_cmd != 0) { struct sockaddr_in6 sin6; sockaddr_in6_init(&sin6, &ln->r_l3addr.addr6, 0, 0, 0); rt_clonedmsg(rt_cmd, sin6tosa(&ssin6), sin6tosa(&sin6), (char *)&ln->ll_addr, ln->lle_tbl->llt_ifp); } freeit: if (ln != NULL) LLE_WUNLOCK(ln); m_put_rcvif_psref(ifp, &psref); m_freem(m); return; bad: if (ln != NULL) LLE_WUNLOCK(ln); ICMP6_STATINC(ICMP6_STAT_BADNA); m_put_rcvif_psref(ifp, &psref); m_freem(m); } /* * Neighbor advertisement output handling. 
* * Based on RFC 2461 * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void nd6_na_output( struct ifnet *ifp, const struct in6_addr *daddr6_0, const struct in6_addr *taddr6, u_long flags, int tlladdr, /* 1 if include target link-layer address */ const struct sockaddr *sdl0) /* sockaddr_dl (= proxy NA) or NULL */ { struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_advert *nd_na; struct ip6_moptions im6o; struct sockaddr *dst; union { struct sockaddr dst; struct sockaddr_in6 dst6; } u; struct in6_addr daddr6; int icmp6len, maxlen, error; const void *mac; struct route ro; mac = NULL; memset(&ro, 0, sizeof(ro)); daddr6 = *daddr6_0; /* make a local copy for modification */ /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_na); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; KASSERTMSG(max_linkhdr + maxlen <= MCLBYTES, "max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)", max_linkhdr, maxlen, MCLBYTES); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && max_linkhdr + maxlen >= MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return; m_reset_rcvif(m); if (IN6_IS_ADDR_MULTICAST(&daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_if_index = if_get_index(ifp); im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_na); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len; m->m_data += max_linkhdr; /* or m_align() equivalent? */ /* fill neighbor advertisement packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) { /* reply to DAD */ daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; daddr6.s6_addr16[1] = 0; daddr6.s6_addr32[1] = 0; daddr6.s6_addr32[2] = 0; daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE; if (in6_setscope(&daddr6, ifp, NULL)) goto bad; flags &= ~ND_NA_FLAG_SOLICITED; } ip6->ip6_dst = daddr6; sockaddr_in6_init(&u.dst6, &daddr6, 0, 0, 0); dst = &u.dst; if (rtcache_setdst(&ro, dst) != 0) goto bad; /* * Select a source whose scope is the same as that of the dest. */ error = in6_selectsrc(satosin6(dst), NULL, NULL, &ro, NULL, NULL, NULL, &ip6->ip6_src); if (error != 0) { char ip6buf[INET6_ADDRSTRLEN]; nd6log(LOG_DEBUG, "source can't be " "determined: dst=%s, error=%d\n", IN6_PRINT(ip6buf, &satocsin6(dst)->sin6_addr), error); goto bad; } nd_na = (struct nd_neighbor_advert *)(ip6 + 1); nd_na->nd_na_type = ND_NEIGHBOR_ADVERT; nd_na->nd_na_code = 0; nd_na->nd_na_target = *taddr6; in6_clearscope(&nd_na->nd_na_target); /* XXX */ /* * "tlladdr" indicates NS's condition for adding tlladdr or not. * see nd6_ns_input() for details. * Basically, if NS packet is sent to unicast/anycast addr, * target lladdr option SHOULD NOT be included. */ if (tlladdr) { /* * sdl0 != NULL indicates proxy NA. If we do proxy, use * lladdr in sdl0. If we are not proxying (sending NA for * my address) use lladdr configured for the interface. */ if (sdl0 == NULL) mac = nd6_ifptomac(ifp); else if (sdl0->sa_family == AF_LINK) { const struct sockaddr_dl *sdl; sdl = satocsdl(sdl0); if (sdl->sdl_alen == ifp->if_addrlen) mac = CLLADDR(sdl); } } if (tlladdr && mac) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1); /* roundup to 8 bytes alignment! 
*/ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; memset((void *)nd_opt, 0, optlen); nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; memcpy((void *)(nd_opt + 1), mac, ifp->if_addrlen); } else flags &= ~ND_NA_FLAG_OVERRIDE; ip6->ip6_plen = htons((u_int16_t)icmp6len); nd_na->nd_na_flags_reserved = flags; nd_na->nd_na_cksum = 0; nd_na->nd_na_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len); ip6_output(m, NULL, NULL, 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert); ICMP6_STATINC(ICMP6_STAT_OUTHIST + ND_NEIGHBOR_ADVERT); rtcache_free(&ro); return; bad: rtcache_free(&ro); m_freem(m); return; } const void * nd6_ifptomac(const struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_IEEE1394: case IFT_PROPVIRTUAL: case IFT_CARP: case IFT_L2VLAN: case IFT_IEEE80211: return CLLADDR(ifp->if_sadl); default: return NULL; } } TAILQ_HEAD(dadq_head, dadq); struct dadq { TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; int dad_count; /* max NS to send */ int dad_ns_tcount; /* # of trials to send NS */ int dad_ns_ocount; /* NS sent so far */ int dad_ns_lcount; /* looped back NS */ struct callout dad_timer_ch; #define ND_OPT_NONCE_STORE 3 /* dad_count should not exceed this */ /* * The default ip6_dad_count is 1 as specified by RFC 4862 and * practically must users won't exceed this. * A storage of 3 is defaulted to here, in-case the administrator wants * to match the equivalent behaviour in our ARP implementation. * This constraint could be removed by sending the on wire nonce as * hmac(key, dad_ns_ocount), but that would increase the nonce size * sent on the wire. */ uint8_t dad_nonce[ND_OPT_NONCE_STORE][ND_OPT_NONCE_LEN]; }; static struct dadq_head dadq; static kmutex_t nd6_dad_lock; void nd6_nbr_init(void) { TAILQ_INIT(&dadq); mutex_init(&nd6_dad_lock, MUTEX_DEFAULT, IPL_NONE); } static struct dadq * nd6_dad_find(struct ifaddr *ifa, struct nd_opt_nonce *nonce, bool *found_nonce) { struct in6_addr *myaddr6, *dadaddr6; bool match_ifa; struct dadq *dp; int i, nonce_max; KASSERT(mutex_owned(&nd6_dad_lock)); KASSERT(ifa != NULL); myaddr6 = IFA_IN6(ifa); if (nonce != NULL && nonce->nd_opt_nonce_len != (ND_OPT_NONCE_LEN + 2) / 8) nonce = NULL; match_ifa = nonce == NULL || found_nonce == NULL || *found_nonce == false; if (found_nonce != NULL) *found_nonce = false; TAILQ_FOREACH(dp, &dadq, dad_list) { if (match_ifa) { if (dp->dad_ifa != ifa) continue; } else { dadaddr6 = IFA_IN6(dp->dad_ifa); if (!IN6_ARE_ADDR_EQUAL(myaddr6, dadaddr6)) continue; } if (nonce == NULL) break; nonce_max = MIN(dp->dad_ns_ocount, ND_OPT_NONCE_STORE); for (i = 0; i < nonce_max; i++) { if (memcmp(nonce->nd_opt_nonce, dp->dad_nonce[i], ND_OPT_NONCE_LEN) == 0) break; } if (i < nonce_max) { char ip6buf[INET6_ADDRSTRLEN]; *found_nonce = true; log(LOG_DEBUG, "%s: detected a looped back NS message for %s\n", if_name(ifa->ifa_ifp), IN6_PRINT(ip6buf, myaddr6)); dp->dad_ns_lcount++; continue; } break; } return dp; } static bool nd6_dad_ownnonce(struct ifaddr *ifa, struct nd_opt_nonce *nonce) { bool found_nonce = true; mutex_enter(&nd6_dad_lock); nd6_dad_find(ifa, nonce, &found_nonce); mutex_exit(&nd6_dad_lock); return found_nonce; } static void nd6_dad_starttimer(struct dadq *dp, int ticks) { callout_reset(&dp->dad_timer_ch, ticks, (void (*)(void *))nd6_dad_timer, dp); } static void nd6_dad_stoptimer(struct dadq *dp) { KASSERT(mutex_owned(&nd6_dad_lock)); 
TAILQ_REMOVE(&dadq, dp, dad_list); /* Tell the timer that dp is being destroyed. */ dp->dad_ifa = NULL; callout_halt(&dp->dad_timer_ch, &nd6_dad_lock); } static void nd6_dad_destroytimer(struct dadq *dp) { KASSERT(dp->dad_ifa == NULL); callout_destroy(&dp->dad_timer_ch); kmem_intr_free(dp, sizeof(*dp)); } /* * Start Duplicate Address Detection (DAD) for specified interface address. * * Note that callout is used when xtick > 0 and not when xtick == 0. * * xtick: minimum delay ticks for IFF_UP event */ void nd6_dad_start(struct ifaddr *ifa, int xtick) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; /* * If we don't need DAD, don't do it. * There are several cases: * - DAD is disabled * - the interface address is anycast */ if (!(ia->ia6_flags & IN6_IFF_TENTATIVE)) { log(LOG_DEBUG, "nd6_dad_start: called with non-tentative address " "%s(%s)\n", IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr), if_name(ifa->ifa_ifp)); return; } if (ia->ia6_flags & IN6_IFF_ANYCAST || !ip6_dad_enabled()) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; rt_addrmsg(RTM_NEWADDR, ifa); return; } if (!(ifa->ifa_ifp->if_flags & IFF_UP)) return; dp = kmem_intr_alloc(sizeof(*dp), KM_NOSLEEP); mutex_enter(&nd6_dad_lock); if (nd6_dad_find(ifa, NULL, NULL) != NULL) { mutex_exit(&nd6_dad_lock); /* DAD already in progress */ if (dp != NULL) kmem_intr_free(dp, sizeof(*dp)); return; } if (dp == NULL) { mutex_exit(&nd6_dad_lock); log(LOG_ERR, "nd6_dad_start: memory allocation failed for " "%s(%s)\n", IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr), if_name(ifa->ifa_ifp)); return; } /* * Send NS packet for DAD, ip6_dad_count times. * Note that we must delay the first transmission, if this is the * first packet to be sent from the interface after interface * (re)initialization. */ callout_init(&dp->dad_timer_ch, CALLOUT_MPSAFE); dp->dad_ifa = ifa; ifaref(ifa); /* just for safety */ dp->dad_count = ip6_dad_count; dp->dad_ns_ocount = dp->dad_ns_tcount = 0; dp->dad_ns_lcount = 0; TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); nd6log(LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr)); if (xtick == 0) { nd6_dad_ns_output(dp, ifa); nd6_dad_starttimer(dp, (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000); } else nd6_dad_starttimer(dp, xtick); mutex_exit(&nd6_dad_lock); } /* * terminate DAD unconditionally. used for address removals. */ void nd6_dad_stop(struct ifaddr *ifa) { struct dadq *dp; mutex_enter(&nd6_dad_lock); dp = nd6_dad_find(ifa, NULL, NULL); if (dp == NULL) { mutex_exit(&nd6_dad_lock); /* DAD wasn't started yet */ return; } /* Prevent the timer from running anymore. */ nd6_dad_stoptimer(dp); mutex_exit(&nd6_dad_lock); nd6_dad_destroytimer(dp); ifafree(ifa); } static void nd6_dad_timer(struct dadq *dp) { struct ifaddr *ifa; struct in6_ifaddr *ia; char ip6buf[INET6_ADDRSTRLEN]; bool need_free = false; KERNEL_LOCK_UNLESS_NET_MPSAFE(); mutex_enter(&nd6_dad_lock); ifa = dp->dad_ifa; if (ifa == NULL) { /* dp is being destroyed by someone. Do nothing. 
*/ goto done; } ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_DUPLICATED) { log(LOG_ERR, "nd6_dad_timer: called with duplicate address " "%s(%s)\n", IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr), if_name(ifa->ifa_ifp)); goto done; } if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) { log(LOG_ERR, "nd6_dad_timer: called with non-tentative address " "%s(%s)\n", IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr), if_name(ifa->ifa_ifp)); goto done; } /* timeouted with IFF_{RUNNING,UP} check */ if (dp->dad_ns_tcount > dad_maxtry) { nd6log(LOG_INFO, "%s: could not run DAD, driver problem?\n", if_name(ifa->ifa_ifp)); nd6_dad_stoptimer(dp); need_free = true; goto done; } /* Need more checks? */ if (dp->dad_ns_ocount < dp->dad_count) { /* * We have more NS to go. Send NS packet for DAD. */ nd6_dad_ns_output(dp, ifa); nd6_dad_starttimer(dp, (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000); } else { /* * We are done with DAD. No NA came, no NS came. * No duplicate address found. */ ia->ia6_flags &= ~IN6_IFF_TENTATIVE; rt_addrmsg(RTM_NEWADDR, ifa); nd6log(LOG_DEBUG, "%s: DAD complete for %s - no duplicates found\n", if_name(ifa->ifa_ifp), IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr)); nd6_dad_stoptimer(dp); need_free = true; } done: mutex_exit(&nd6_dad_lock); if (need_free) { nd6_dad_destroytimer(dp); KASSERT(ifa != NULL); ifafree(ifa); } KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } static void nd6_dad_duplicated(struct ifaddr *ifa, struct dadq *dp, const struct sockaddr_dl *from) { struct in6_ifaddr *ia; struct ifnet *ifp; char ip6buf[INET6_ADDRSTRLEN], llabuf[LLA_ADDRSTRLEN], *llastr; KASSERT(mutex_owned(&nd6_dad_lock)); KASSERT(ifa != NULL); ifp = ifa->ifa_ifp; ia = (struct in6_ifaddr *)ifa; ia->ia6_flags &= ~IN6_IFF_TENTATIVE; ia->ia6_flags |= IN6_IFF_DUPLICATED; if (__predict_false(from == NULL)) llastr = NULL; else llastr = lla_snprintf(llabuf, sizeof(llabuf), CLLADDR(from), from->sdl_alen); log(LOG_ERR, "%s: DAD duplicate address %s from %s\n", if_name(ifp), IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr), llastr); /* Inform the routing socket that DAD has completed */ rt_addrmsg_src(RTM_NEWADDR, ifa, (const struct sockaddr *)from); /* * If the address is a link-local address formed from an interface * identifier based on the hardware address which is supposed to be * uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP * operation on the interface SHOULD be disabled. * [rfc2462bis-03 Section 5.4.5] */ if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) { struct in6_addr in6; /* * To avoid over-reaction, we only apply this logic when we are * very sure that hardware addresses are supposed to be unique. 
*/ switch (ifp->if_type) { case IFT_ETHER: case IFT_ATM: case IFT_IEEE1394: case IFT_IEEE80211: in6 = ia->ia_addr.sin6_addr; if (in6_get_hw_ifid(ifp, &in6) == 0 && IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "%s: possible hardware address " "duplication detected, disable IPv6\n", if_name(ifp)); } break; } } } static void nd6_dad_ns_output(struct dadq *dp, struct ifaddr *ifa) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp = ifa->ifa_ifp; uint8_t *nonce; dp->dad_ns_tcount++; if ((ifp->if_flags & IFF_UP) == 0) { #if 0 printf("%s: interface down?\n", if_name(ifp)); #endif return; } if ((ifp->if_flags & IFF_RUNNING) == 0) { #if 0 printf("%s: interface not running?\n", if_name(ifp)); #endif return; } dp->dad_ns_tcount = 0; nonce = dp->dad_nonce[dp->dad_ns_ocount % ND_OPT_NONCE_STORE]; cprng_fast(nonce, ND_OPT_NONCE_LEN); dp->dad_ns_ocount++; nd6_ns_output(ifp, NULL, &ia->ia_addr.sin6_addr, NULL, nonce); } static void nd6_dad_input(struct ifaddr *ifa, struct nd_opt_nonce *nonce, const struct sockaddr_dl *from) { struct dadq *dp; bool found_nonce = false; KASSERT(ifa != NULL); mutex_enter(&nd6_dad_lock); dp = nd6_dad_find(ifa, nonce, &found_nonce); if (!found_nonce) { nd6_dad_duplicated(ifa, dp, from); if (dp != NULL) nd6_dad_stoptimer(dp); } mutex_exit(&nd6_dad_lock); if (dp != NULL) { nd6_dad_destroytimer(dp); ifafree(ifa); } }
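/*
 * Illustrative sketch, not part of nd6_nbr.c: nd6_ns_output() and
 * nd6_na_output() above build the IPv6 solicited-node multicast address
 * ff02::1:ffXX:XXXX by assigning the individual s6_addr16/s6_addr32 words
 * of the destination.  The standalone helper below (hypothetical name,
 * plain byte stores) shows the same mapping: the low-order 24 bits of the
 * target address are appended to the fixed ff02::1:ff00:0/104 prefix.
 */
#include <string.h>
#include <netinet/in.h>

static struct in6_addr
solicited_node_mcast(const struct in6_addr *target)
{
	struct in6_addr dst;

	memset(&dst, 0, sizeof(dst));
	dst.s6_addr[0] = 0xff;	/* ff02::/16, link-local scope multicast */
	dst.s6_addr[1] = 0x02;
	dst.s6_addr[11] = 0x01;	/* ::1:ff00:0/104 solicited-node prefix */
	dst.s6_addr[12] = 0xff;
	dst.s6_addr[13] = target->s6_addr[13];	/* low 24 bits of target */
	dst.s6_addr[14] = target->s6_addr[14];
	dst.s6_addr[15] = target->s6_addr[15];
	return dst;
}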
/* $NetBSD: tty_conf.c,v 1.57 2021/08/09 20:49:10 andvar Exp $ */ /*- * Copyright (c) 2005, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tty_conf.c 8.5 (Berkeley) 1/9/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tty_conf.c,v 1.57 2021/08/09 20:49:10 andvar Exp $"); #define TTY_ALLOW_PRIVATE #include <sys/param.h> #include <sys/systm.h> #include <sys/poll.h> #include <sys/proc.h> #include <sys/tty.h> #include <sys/ttycom.h> #include <sys/conf.h> #include <sys/mutex.h> #include <sys/queue.h> static struct linesw termios_disc = { .l_name = "termios", .l_open = ttylopen, .l_close = ttylclose, .l_read = ttread, .l_write = ttwrite, .l_ioctl = ttynullioctl, .l_rint = ttyinput, .l_start = ttstart, .l_modem = ttymodem, .l_poll = ttpoll }; /* * This is for the benefit of old BSD TTY compatibility, but since it is * identical to termios (except for the name), don't bother conditionalizing * it. */ static struct linesw ntty_disc = { /* old NTTYDISC */ .l_name = "ntty", .l_open = ttylopen, .l_close = ttylclose, .l_read = ttread, .l_write = ttwrite, .l_ioctl = ttynullioctl, .l_rint = ttyinput, .l_start = ttstart, .l_modem = ttymodem, .l_poll = ttpoll }; static LIST_HEAD(, linesw) ttyldisc_list = LIST_HEAD_INITIALIZER(ttyldisc_head); /* * Note: We don't bother refcounting termios_disc and ntty_disc; they can't * be removed from the list, and termios_disc is likely to have very many * references (could we overflow the count?). */ #define TTYLDISC_ISSTATIC(disc) \ ((disc) == &termios_disc || (disc) == &ntty_disc) #define TTYLDISC_HOLD(disc) \ do { \ if (! TTYLDISC_ISSTATIC(disc)) { \ KASSERT((disc)->l_refcnt != UINT_MAX); \ (disc)->l_refcnt++; \ } \ } while (/*CONSTCOND*/0) #define TTYLDISC_RELE(disc) \ do { \ if (! TTYLDISC_ISSTATIC(disc)) { \ KASSERT((disc)->l_refcnt != 0); \ (disc)->l_refcnt--; \ } \ } while (/*CONSTCOND*/0) #define TTYLDISC_ISINUSE(disc) \ (TTYLDISC_ISSTATIC(disc) || (disc)->l_refcnt != 0) /* * Do nothing specific version of line * discipline specific ioctl command. */ /*ARGSUSED*/ int ttynullioctl(struct tty *tp, u_long cmd, void *data, int flags, struct lwp *l) { return (EPASSTHROUGH); } /* * Return error to line discipline * specific poll call. 
*/ /*ARGSUSED*/ int ttyerrpoll(struct tty *tp, int events, struct lwp *l) { return (POLLERR); } void ttyldisc_init(void) { if (ttyldisc_attach(&termios_disc) != 0) panic("ttyldisc_init: termios_disc"); if (ttyldisc_attach(&ntty_disc) != 0) panic("ttyldisc_init: ntty_disc"); } static struct linesw * ttyldisc_lookup_locked(const char *name) { struct linesw *disc; LIST_FOREACH(disc, &ttyldisc_list, l_list) { if (strcmp(name, disc->l_name) == 0) return (disc); } return (NULL); } /* * Look up a line discipline by its name. Caller holds a reference on * the returned line discipline. */ struct linesw * ttyldisc_lookup(const char *name) { struct linesw *disc; mutex_spin_enter(&tty_lock); disc = ttyldisc_lookup_locked(name); if (disc != NULL) TTYLDISC_HOLD(disc); mutex_spin_exit(&tty_lock); return (disc); } /* * Look up a line discipline by its legacy number. Caller holds a * reference on the returned line discipline. */ struct linesw * ttyldisc_lookup_bynum(int num) { struct linesw *disc; mutex_spin_enter(&tty_lock); LIST_FOREACH(disc, &ttyldisc_list, l_list) { if (disc->l_no == num) { TTYLDISC_HOLD(disc); mutex_spin_exit(&tty_lock); return (disc); } } mutex_spin_exit(&tty_lock); return (NULL); } /* * Release a reference on a line discipline previously added by * ttyldisc_lookup() or ttyldisc_lookup_bynum(). */ void ttyldisc_release(struct linesw *disc) { if (disc == NULL) return; mutex_spin_enter(&tty_lock); TTYLDISC_RELE(disc); mutex_spin_exit(&tty_lock); } #define TTYLDISC_LEGACY_NUMBER_MIN 10 #define TTYLDISC_LEGACY_NUMBER_MAX INT_MAX static void ttyldisc_assign_legacy_number(struct linesw *disc) { static const struct { const char *name; int num; } table[] = { { "termios", TTYDISC }, { "ntty", 2 /* XXX old NTTYDISC */ }, { "tablet", TABLDISC }, { "slip", SLIPDISC }, { "ppp", PPPDISC }, { "strip", STRIPDISC }, { "hdlc", HDLCDISC }, { NULL, 0 } }; struct linesw *ldisc; int i; for (i = 0; table[i].name != NULL; i++) { if (strcmp(disc->l_name, table[i].name) == 0) { disc->l_no = table[i].num; return; } } disc->l_no = TTYLDISC_LEGACY_NUMBER_MIN; LIST_FOREACH(ldisc, &ttyldisc_list, l_list) { if (disc->l_no == ldisc->l_no) { KASSERT(disc->l_no < TTYLDISC_LEGACY_NUMBER_MAX); disc->l_no++; } } } /* * Register a line discipline. */ int ttyldisc_attach(struct linesw *disc) { KASSERT(disc->l_name != NULL); KASSERT(disc->l_open != NULL); KASSERT(disc->l_close != NULL); KASSERT(disc->l_read != NULL); KASSERT(disc->l_write != NULL); KASSERT(disc->l_ioctl != NULL); KASSERT(disc->l_rint != NULL); KASSERT(disc->l_start != NULL); KASSERT(disc->l_modem != NULL); KASSERT(disc->l_poll != NULL); /* You are not allowed to exceed TTLINEDNAMELEN */ if (strlen(disc->l_name) >= TTLINEDNAMELEN) return (ENAMETOOLONG); mutex_spin_enter(&tty_lock); if (ttyldisc_lookup_locked(disc->l_name) != NULL) { mutex_spin_exit(&tty_lock); return (EEXIST); } ttyldisc_assign_legacy_number(disc); LIST_INSERT_HEAD(&ttyldisc_list, disc, l_list); mutex_spin_exit(&tty_lock); return (0); } /* * Remove a line discipline. */ int ttyldisc_detach(struct linesw *disc) { #ifdef DIAGNOSTIC struct linesw *ldisc = ttyldisc_lookup(disc->l_name); KASSERT(ldisc != NULL); KASSERT(ldisc == disc); ttyldisc_release(ldisc); #endif mutex_spin_enter(&tty_lock); if (TTYLDISC_ISINUSE(disc)) { mutex_spin_exit(&tty_lock); return (EBUSY); } LIST_REMOVE(disc, l_list); mutex_spin_exit(&tty_lock); return (0); } /* * Return the default line discipline. */ struct linesw * ttyldisc_default(void) { return (&termios_disc); }
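/*
 * Illustrative sketch, not part of tty_conf.c: a module providing its own
 * line discipline registers it with ttyldisc_attach() and removes it with
 * ttyldisc_detach(), filling in the same struct linesw entry points used by
 * termios_disc above.  The "example" discipline name and the two wrapper
 * functions are hypothetical; the sketch assumes the kernel headers that
 * tty_conf.c already includes (<sys/tty.h>, <sys/conf.h>).
 */
static struct linesw example_disc = {
	.l_name = "example",
	.l_open = ttylopen,
	.l_close = ttylclose,
	.l_read = ttread,
	.l_write = ttwrite,
	.l_ioctl = ttynullioctl,
	.l_rint = ttyinput,
	.l_start = ttstart,
	.l_modem = ttymodem,
	.l_poll = ttpoll
};

static int
example_disc_attach(void)
{
	/*
	 * Fails with EEXIST if the name is already registered and with
	 * ENAMETOOLONG if the name reaches TTLINEDNAMELEN.
	 */
	return ttyldisc_attach(&example_disc);
}

static int
example_disc_detach(void)
{
	/* Fails with EBUSY while any tty still holds a reference. */
	return ttyldisc_detach(&example_disc);
}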
/* $NetBSD: sys_syscall.c,v 1.15 2022/06/29 16:33:09 hannken Exp $ */ /*- * Copyright (c) 2006 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by David Laight. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_syscall.c,v 1.15 2022/06/29 16:33:09 hannken Exp $"); #include <sys/syscall_stats.h> #include <sys/syscallvar.h> /* * MI indirect system call support. * Included from sys_indirect.c and compat/netbsd32/netbsd32_indirect.c * * SYS_SYSCALL is set to the required function name.
*/ #define CONCAT(a,b) __CONCAT(a,b) static void CONCAT(SYS_SYSCALL, _biglockcheck)(struct proc *p, int code) { #ifdef DIAGNOSTIC kpreempt_disable(); /* make curcpu() stable */ KASSERTMSG(curcpu()->ci_biglock_count == 0, "syscall %ld of emul %s leaked %d kernel locks", (long)code, p->p_emul->e_name, curcpu()->ci_biglock_count); kpreempt_enable(); #endif } int SYS_SYSCALL(struct lwp *l, const struct CONCAT(SYS_SYSCALL, _args) *uap, register_t *rval) { /* { syscallarg(int) code; syscallarg(register_t) args[SYS_MAXSYSARGS]; } */ const struct sysent *callp; struct proc *p = l->l_proc; int code; int error; #ifdef NETBSD32_SYSCALL register_t args64[SYS_MAXSYSARGS]; int i, narg; #define TRACE_ARGS args64 #else #define TRACE_ARGS &SCARG(uap, args[0]) #endif callp = p->p_emul->e_sysent; code = SCARG(uap, code) & (SYS_NSYSENT - 1); SYSCALL_COUNT(syscall_counts, code); callp += code; if (__predict_false(callp->sy_flags & SYCALL_INDIRECT)) return ENOSYS; if (__predict_true(!p->p_trace_enabled)) { error = sy_call(callp, l, &uap->args, rval); CONCAT(SYS_SYSCALL, _biglockcheck)(p, code); return error; } #ifdef NETBSD32_SYSCALL narg = callp->sy_narg; for (i = 0; i < narg; i++) args64[i] = SCARG(uap, args[i]); #endif error = trace_enter(code, callp, TRACE_ARGS); if (__predict_true(error == 0)) error = sy_call(callp, l, &uap->args, rval); trace_exit(code, callp, &uap->args, rval, error); CONCAT(SYS_SYSCALL, _biglockcheck)(p, code); return error; #undef TRACE_ARGS }
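/*
 * Illustrative sketch, not part of sys_syscall.c: from userland, the
 * indirect system call dispatched above is reached through syscall(2),
 * which takes the real system call number as its first argument.  The
 * fragment below invokes getpid(2) both directly and via the indirect
 * path and checks that the two results agree; the function name is
 * hypothetical.
 */
#include <sys/types.h>
#include <sys/syscall.h>
#include <assert.h>
#include <unistd.h>

static void
indirect_getpid_check(void)
{
	pid_t direct = getpid();
	pid_t indirect = (pid_t)syscall(SYS_getpid);

	assert(direct == indirect);
}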
/* $NetBSD: ip_output.c,v 1.326 2023/04/19 22:00:18 mlelstv Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1998 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Public Access Networks Corporation ("Panix"). It was developed under * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ip_output.c,v 1.326 2023/04/19 22:00:18 mlelstv Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_mrouting.h" #include "opt_net_mpsafe.h" #include "opt_mpls.h" #endif #include "arp.h" #include <sys/param.h> #include <sys/kmem.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/kauth.h> #include <sys/systm.h> #include <sys/syslog.h> #include <net/if.h> #include <net/if_types.h> #include <net/route.h> #include <net/pfil.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/in_var.h> #include <netinet/ip_var.h> #include <netinet/ip_private.h> #include <netinet/in_offload.h> #include <netinet/portalgo.h> #include <netinet/udp.h> #include <netinet/udp_var.h> #ifdef INET6 #include <netinet6/ip6_var.h> #endif #ifdef MROUTING #include <netinet/ip_mroute.h> #endif #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/key.h> #endif #ifdef MPLS #include <netmpls/mpls.h> #include <netmpls/mpls_var.h> #endif static int ip_pcbopts(struct inpcb *, const struct sockopt *); static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); static struct ifnet *ip_multicast_if(struct in_addr *, int *); static void ip_mloopback(struct ifnet *, struct mbuf *, const struct sockaddr_in *); static int ip_ifaddrvalid(const struct in_ifaddr *); extern pfil_head_t *inet_pfil_hook; /* XXX */ int ip_do_loopback_cksum = 0; static int ip_mark_mpls(struct ifnet * const ifp, struct mbuf * const m, const struct rtentry *rt) { int error = 0; #ifdef MPLS union mpls_shim msh; if (rt == NULL || rt_gettag(rt) == NULL || rt_gettag(rt)->sa_family != AF_MPLS || (m->m_flags & (M_MCAST | M_BCAST)) != 0 || ifp->if_type != IFT_ETHER) return 0; msh.s_addr = MPLS_GETSADDR(rt); if (msh.shim.label != MPLS_LABEL_IMPLNULL) { struct m_tag *mtag; /* * XXX tentative solution to tell ether_output * it's MPLS. Need some more efficient solution. */ mtag = m_tag_get(PACKET_TAG_MPLS, sizeof(int) /* dummy */, M_NOWAIT); if (mtag == NULL) return ENOMEM; m_tag_prepend(m, mtag); } #endif return error; } /* * Send an IP packet to a host. */ int ip_if_output(struct ifnet * const ifp, struct mbuf * const m, const struct sockaddr * const dst, const struct rtentry *rt) { int error = 0; if (rt != NULL) { error = rt_check_reject_route(rt, ifp); if (error != 0) { IP_STATINC(IP_STAT_RTREJECT); m_freem(m); return error; } } error = ip_mark_mpls(ifp, m, rt); if (error != 0) { m_freem(m); return error; } error = if_output_lock(ifp, ifp, m, dst, rt); return error; } /* * IP output. 
The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. */ int ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct rtentry *rt; struct ip *ip; struct ifnet *ifp, *mifp = NULL; struct mbuf *m = m0; int len, hlen, error = 0; struct route iproute; const struct sockaddr_in *dst; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; int isbroadcast; int sw_csum; u_long mtu; bool natt_frag = false; bool rtmtu_nolock; union { struct sockaddr sa; struct sockaddr_in sin; } udst, usrc; struct sockaddr *rdst = &udst.sa; /* real IP destination, as * opposed to the nexthop */ struct psref psref, psref_ia; int bound; bool bind_need_restore = false; const struct sockaddr *sa; len = 0; MCLAIM(m, &ip_tx_mowner); KASSERT((m->m_flags & M_PKTHDR) != 0); KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) == 0); KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) != (M_CSUM_TCPv4|M_CSUM_UDPv4)); KASSERT(m->m_len >= sizeof(struct ip)); hlen = sizeof(struct ip); if (opt) { m = ip_insertoptions(m, opt, &len); hlen = len; } ip = mtod(m, struct ip *); /* * Fill in IP header. */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_off = htons(0); /* ip->ip_id filled in after we find out source ia */ ip->ip_hl = hlen >> 2; IP_STATINC(IP_STAT_LOCALOUT); } else { hlen = ip->ip_hl << 2; } /* * Route packet. */ if (ro == NULL) { memset(&iproute, 0, sizeof(iproute)); ro = &iproute; } sockaddr_in_init(&udst.sin, &ip->ip_dst, 0); dst = satocsin(rtcache_getdst(ro)); /* * If there is a cached route, check that it is to the same * destination and is still up. If not, free it and try again. * The address family should also be checked in case of sharing * the cache with IPv6. */ if (dst && (dst->sin_family != AF_INET || !in_hosteq(dst->sin_addr, ip->ip_dst))) rtcache_free(ro); /* XXX must be before rtcache operations */ bound = curlwp_bind(); bind_need_restore = true; if ((rt = rtcache_validate(ro)) == NULL && (rt = rtcache_update(ro, 1)) == NULL) { dst = &udst.sin; error = rtcache_setdst(ro, &udst.sa); if (error != 0) { IP_STATINC(IP_STAT_ODROPPED); goto bad; } } /* * If routing to interface only, short circuit routing lookup. 
*/ if (flags & IP_ROUTETOIF) { ifa = ifa_ifwithladdr_psref(sintocsa(dst), &psref_ia); if (ifa == NULL) { IP_STATINC(IP_STAT_NOROUTE); error = ENETUNREACH; goto bad; } /* ia is already referenced by psref_ia */ ia = ifatoia(ifa); ifp = ia->ia_ifp; mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (((IN_MULTICAST(ip->ip_dst.s_addr) || ip->ip_dst.s_addr == INADDR_BROADCAST) || (flags & IP_ROUTETOIFINDEX)) && imo != NULL && imo->imo_multicast_if_index != 0) { ifp = mifp = if_get_byindex(imo->imo_multicast_if_index, &psref); if (ifp == NULL) { IP_STATINC(IP_STAT_NOROUTE); error = ENETUNREACH; goto bad; } mtu = ifp->if_mtu; ia = in_get_ia_from_ifp_psref(ifp, &psref_ia); if (IN_MULTICAST(ip->ip_dst.s_addr) || ip->ip_dst.s_addr == INADDR_BROADCAST) { isbroadcast = 0; } else { /* IP_ROUTETOIFINDEX */ isbroadcast = in_broadcast(dst->sin_addr, ifp); if ((isbroadcast == 0) && ((ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) == 0) && (in_direct(dst->sin_addr, ifp) == 0)) { /* gateway address required */ if (rt == NULL) rt = rtcache_init(ro); if (rt == NULL || rt->rt_ifp != ifp) { IP_STATINC(IP_STAT_NOROUTE); error = EHOSTUNREACH; goto bad; } rt->rt_use++; if (rt->rt_flags & RTF_GATEWAY) dst = satosin(rt->rt_gateway); if (rt->rt_flags & RTF_HOST) isbroadcast = rt->rt_flags & RTF_BROADCAST; } } } else { if (rt == NULL) rt = rtcache_init(ro); if (rt == NULL) { IP_STATINC(IP_STAT_NOROUTE); error = EHOSTUNREACH; goto bad; } if (ifa_is_destroying(rt->rt_ifa)) { rtcache_unref(rt, ro); rt = NULL; IP_STATINC(IP_STAT_NOROUTE); error = EHOSTUNREACH; goto bad; } ifa_acquire(rt->rt_ifa, &psref_ia); ia = ifatoia(rt->rt_ifa); ifp = rt->rt_ifp; if ((mtu = rt->rt_rmx.rmx_mtu) == 0) mtu = ifp->if_mtu; rt->rt_use++; if (rt->rt_flags & RTF_GATEWAY) dst = satosin(rt->rt_gateway); if (rt->rt_flags & RTF_HOST) isbroadcast = rt->rt_flags & RTF_BROADCAST; else isbroadcast = in_broadcast(dst->sin_addr, ifp); } rtmtu_nolock = rt && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0; if (IN_MULTICAST(ip->ip_dst.s_addr) || (ip->ip_dst.s_addr == INADDR_BROADCAST)) { bool inmgroup; m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ? M_BCAST : M_MCAST; /* * See if the caller provided any multicast options */ if (imo != NULL) ip->ip_ttl = imo->imo_multicast_ttl; else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * if we don't know the outgoing ifp yet, we can't generate * output */ if (!ifp) { IP_STATINC(IP_STAT_NOROUTE); error = ENETUNREACH; goto bad; } /* * If the packet is multicast or broadcast, confirm that * the outgoing interface can transmit it. */ if (((m->m_flags & M_MCAST) && (ifp->if_flags & IFF_MULTICAST) == 0) || ((m->m_flags & M_BCAST) && (ifp->if_flags & (IFF_BROADCAST|IFF_POINTOPOINT)) == 0)) { IP_STATINC(IP_STAT_NOROUTE); error = ENETUNREACH; goto bad; } /* * If source address not specified yet, use an address * of outgoing interface. 
*/ if (in_nullhost(ip->ip_src)) { struct in_ifaddr *xia; struct ifaddr *xifa; struct psref _psref; xia = in_get_ia_from_ifp_psref(ifp, &_psref); if (!xia) { IP_STATINC(IP_STAT_IFNOADDR); error = EADDRNOTAVAIL; goto bad; } xifa = &xia->ia_ifa; if (xifa->ifa_getifa != NULL) { ia4_release(xia, &_psref); /* FIXME ifa_getifa is NOMPSAFE */ xia = ifatoia((*xifa->ifa_getifa)(xifa, rdst)); if (xia == NULL) { IP_STATINC(IP_STAT_IFNOADDR); error = EADDRNOTAVAIL; goto bad; } ia4_acquire(xia, &_psref); } ip->ip_src = xia->ia_addr.sin_addr; ia4_release(xia, &_psref); } inmgroup = in_multi_group(ip->ip_dst, ifp, flags); if (inmgroup && (imo == NULL || imo->imo_multicast_loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip_mloopback(ifp, m, &udst.sin); } #ifdef MROUTING else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ extern struct socket *ip_mrouter; if (ip_mrouter && (flags & IP_FORWARDING) == 0) { if (ip_mforward(m, ifp) != 0) { m_freem(m); goto done; } } } #endif /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) { IP_STATINC(IP_STAT_ODROPPED); m_freem(m); goto done; } goto sendit; } /* * If source address not specified yet, use address * of outgoing interface. */ if (in_nullhost(ip->ip_src)) { struct ifaddr *xifa; xifa = &ia->ia_ifa; if (xifa->ifa_getifa != NULL) { ia4_release(ia, &psref_ia); /* FIXME ifa_getifa is NOMPSAFE */ ia = ifatoia((*xifa->ifa_getifa)(xifa, rdst)); if (ia == NULL) { error = EADDRNOTAVAIL; goto bad; } ia4_acquire(ia, &psref_ia); } ip->ip_src = ia->ia_addr.sin_addr; } /* * Packets with Class-D address as source are not valid per * RFC1112. */ if (IN_MULTICAST(ip->ip_src.s_addr)) { IP_STATINC(IP_STAT_ODROPPED); error = EADDRNOTAVAIL; goto bad; } /* * Look for broadcast address and verify user is allowed to * send such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { IP_STATINC(IP_STAT_BCASTDENIED); error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { IP_STATINC(IP_STAT_BCASTDENIED); error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ntohs(ip->ip_len) > ifp->if_mtu) { IP_STATINC(IP_STAT_BCASTDENIED); error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else m->m_flags &= ~M_BCAST; sendit: if ((flags & (IP_FORWARDING|IP_NOIPNEWID)) == 0) { if (m->m_pkthdr.len < IP_MINFRAGSIZE) { ip->ip_id = 0; } else if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) { ip->ip_id = ip_newid(ia); } else { /* * TSO capable interfaces (typically?) increment * ip_id for each segment. * "allocate" enough ids here to increase the chance * for them to be unique. * * note that the following calculation is not * needed to be precise. wasting some ip_id is fine. 
*/ unsigned int segsz = m->m_pkthdr.segsz; unsigned int datasz = ntohs(ip->ip_len) - hlen; unsigned int num = howmany(datasz, segsz); ip->ip_id = ip_newid_range(ia, num); } } if (ia != NULL) { ia4_release(ia, &psref_ia); ia = NULL; } /* * If we're doing Path MTU Discovery, we need to set DF unless * the route's MTU is locked. */ if ((flags & IP_MTUDISC) != 0 && rtmtu_nolock) { ip->ip_off |= htons(IP_DF); } #ifdef IPSEC if (ipsec_used) { bool ipsec_done = false; bool count_drop = false; /* Perform IPsec processing, if any. */ error = ipsec4_output(m, inp, flags, &mtu, &natt_frag, &ipsec_done, &count_drop); if (count_drop) IP_STATINC(IP_STAT_IPSECDROP_OUT); if (error || ipsec_done) goto done; } if (!ipsec_used || !natt_frag) #endif { /* * Run through list of hooks for output packets. */ error = pfil_run_hooks(inet_pfil_hook, &m, ifp, PFIL_OUT); if (error || m == NULL) { IP_STATINC(IP_STAT_PFILDROP_OUT); goto done; } } ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; m->m_pkthdr.csum_data |= hlen << 16; /* * search for the source address structure to * maintain output statistics, and verify address * validity */ KASSERT(ia == NULL); sockaddr_in_init(&usrc.sin, &ip->ip_src, 0); ifa = ifaof_ifpforaddr_psref(&usrc.sa, ifp, &psref_ia); if (ifa != NULL) ia = ifatoia(ifa); /* * Ensure we only send from a valid address. * A NULL address is valid because the packet could be * generated from a packet filter. */ if (ia != NULL && (flags & IP_FORWARDING) == 0 && (error = ip_ifaddrvalid(ia)) != 0) { ARPLOG(LOG_ERR, "refusing to send from invalid address %s (pid %d)\n", ARPLOGADDR(&ip->ip_src), curproc->p_pid); IP_STATINC(IP_STAT_ODROPPED); if (error == 1) /* * Address exists, but is tentative or detached. * We can't send from it because it's invalid, * so we drop the packet. */ error = 0; else error = EADDRNOTAVAIL; goto bad; } /* Maybe skip checksums on loopback interfaces. */ if (IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) { m->m_pkthdr.csum_flags |= M_CSUM_IPv4; } sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx; /* Need to fragment the packet */ if (ntohs(ip->ip_len) > mtu && (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) { goto fragment; } #if IFA_STATS if (ia) ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len); #endif /* * Always initialize the sum to 0! Some HW assisted * checksumming requires this. */ ip->ip_sum = 0; if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) { /* * Perform any checksums that the hardware can't do * for us. * * XXX Does any hardware require the {th,uh}_sum * XXX fields to be 0? */ if (sw_csum & M_CSUM_IPv4) { KASSERT(IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)); ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4; } if (sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { if (IN_NEED_CHECKSUM(ifp, sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4))) { in_undefer_cksum_tcpudp(m); } m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4); } } sa = (m->m_flags & M_MCAST) ? sintocsa(rdst) : sintocsa(dst); /* Send it */ if (__predict_false(sw_csum & M_CSUM_TSOv4)) { /* * TSO4 is required by a packet, but disabled for * the interface. */ error = ip_tso_output(ifp, m, sa, rt); } else error = ip_if_output(ifp, m, sa, rt); goto done; fragment: /* * We can't use HW checksumming if we're about to fragment the packet. * * XXX Some hardware can do this. 
*/ if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { if (IN_NEED_CHECKSUM(ifp, m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4))) { in_undefer_cksum_tcpudp(m); } m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4); } /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. */ if (ntohs(ip->ip_off) & IP_DF) { if (flags & IP_RETURNMTU) { KASSERT(inp != NULL); in4p_errormtu(inp) = mtu; } error = EMSGSIZE; IP_STATINC(IP_STAT_CANTFRAG); goto bad; } error = ip_fragment(m, ifp, mtu); if (error) { m = NULL; goto bad; } for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = NULL; if (error) { m_freem(m); continue; } #if IFA_STATS if (ia) ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len); #endif /* * If we get there, the packet has not been handled by * IPsec whereas it should have. Now that it has been * fragmented, re-inject it in ip_output so that IPsec * processing can occur. */ if (natt_frag) { error = ip_output(m, opt, NULL, flags | IP_RAWOUTPUT | IP_NOIPNEWID, imo, inp); } else { KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_UDPv4 | M_CSUM_TCPv4)) == 0); error = ip_if_output(ifp, m, (m->m_flags & M_MCAST) ? sintocsa(rdst) : sintocsa(dst), rt); } } if (error == 0) { IP_STATINC(IP_STAT_FRAGMENTED); } done: ia4_release(ia, &psref_ia); rtcache_unref(rt, ro); if (ro == &iproute) { rtcache_free(&iproute); } if (mifp != NULL) { if_put(mifp, &psref); } if (bind_need_restore) curlwp_bindx(bound); return error; bad: m_freem(m); goto done; } int ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu) { struct ip *ip, *mhip; struct mbuf *m0; int len, hlen, off; int mhlen, firstlen; struct mbuf **mnext; int sw_csum = m->m_pkthdr.csum_flags; int fragments = 0; int error = 0; int ipoff, ipflg; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; /* Preserve the offset and flags. */ ipoff = ntohs(ip->ip_off) & IP_OFFMASK; ipflg = ntohs(ip->ip_off) & (IP_RF|IP_DF|IP_MF); if (ifp != NULL) sw_csum &= ~ifp->if_csum_flags_tx; len = (mtu - hlen) &~ 7; if (len < 8) { IP_STATINC(IP_STAT_CANTFRAG); m_freem(m); return EMSGSIZE; } firstlen = len; mnext = &m->m_nextpkt; /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. */ m0 = m; mhlen = sizeof(struct ip); for (off = hlen + len; off < ntohs(ip->ip_len); off += len) { MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) { error = ENOBUFS; IP_STATINC(IP_STAT_ODROPPED); goto sendorfree; } MCLAIM(m, m0->m_owner); *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; /* we must inherit the flags */ m->m_flags |= m0->m_flags & M_COPYFLAGS; if (hlen > sizeof(struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip); mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; mhip->ip_off = ((off - hlen) >> 3) + ipoff; mhip->ip_off |= ipflg; if (off + len >= ntohs(ip->ip_len)) len = ntohs(ip->ip_len) - off; else mhip->ip_off |= IP_MF; HTONS(mhip->ip_off); mhip->ip_len = htons((u_int16_t)(len + mhlen)); m->m_next = m_copym(m0, off, len, M_DONTWAIT); if (m->m_next == NULL) { error = ENOBUFS; IP_STATINC(IP_STAT_ODROPPED); goto sendorfree; } m->m_pkthdr.len = mhlen + len; m_reset_rcvif(m); mhip->ip_sum = 0; KASSERT((m->m_pkthdr.csum_flags & M_CSUM_IPv4) == 0); if (sw_csum & M_CSUM_IPv4) { mhip->ip_sum = in_cksum(m, mhlen); } else { /* * checksum is hw-offloaded or not necessary. 
*/ m->m_pkthdr.csum_flags |= m0->m_pkthdr.csum_flags & M_CSUM_IPv4; m->m_pkthdr.csum_data |= mhlen << 16; KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) || (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0); } IP_STATINC(IP_STAT_OFRAGMENTS); fragments++; } /* * Update first fragment by trimming what's been copied out * and updating header, then send each fragment (in order). */ m = m0; m_adj(m, hlen + firstlen - ntohs(ip->ip_len)); m->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_int16_t)m->m_pkthdr.len); ip->ip_off |= htons(IP_MF); ip->ip_sum = 0; if (sw_csum & M_CSUM_IPv4) { ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4; } else { /* * checksum is hw-offloaded or not necessary. */ KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) || (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0); KASSERT(M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data) >= sizeof(struct ip)); } sendorfree: /* * If there is no room for all the fragments, don't queue * any of them. */ if (ifp != NULL) { IFQ_LOCK(&ifp->if_snd); if (ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len < fragments && error == 0) { error = ENOBUFS; IP_STATINC(IP_STAT_ODROPPED); IFQ_INC_DROPS(&ifp->if_snd); } IFQ_UNLOCK(&ifp->if_snd); } if (error) { for (m = m0; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); } } return error; } /* * Determine the maximum length of the options to be inserted; * we would far rather allocate too much space rather than too little. */ u_int ip_optlen(struct inpcb *inp) { struct mbuf *m = inp->inp_options; if (m && m->m_len > offsetof(struct ipoption, ipopt_dst)) { return (m->m_len - offsetof(struct ipoption, ipopt_dst)); } return 0; } /* * Insert IP options into preformed packet. * Adjust IP destination as required for IP source routing, * as indicated by a non-zero in_addr at the start of the options. */ static struct mbuf * ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) { struct ipoption *p = mtod(opt, struct ipoption *); struct mbuf *n; struct ip *ip = mtod(m, struct ip *); unsigned optlen; optlen = opt->m_len - sizeof(p->ipopt_dst); KASSERT(optlen % 4 == 0); if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) return m; /* XXX should fail */ if (!in_nullhost(p->ipopt_dst)) ip->ip_dst = p->ipopt_dst; if (M_READONLY(m) || M_LEADINGSPACE(m) < optlen) { MGETHDR(n, M_DONTWAIT, MT_HEADER); if (n == NULL) return m; MCLAIM(n, m->m_owner); m_move_pkthdr(n, m); m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); n->m_next = m; n->m_len = optlen + sizeof(struct ip); n->m_data += max_linkhdr; memcpy(mtod(n, void *), ip, sizeof(struct ip)); m = n; } else { m->m_data -= optlen; m->m_len += optlen; memmove(mtod(m, void *), ip, sizeof(struct ip)); } m->m_pkthdr.len += optlen; ip = mtod(m, struct ip *); memcpy(ip + 1, p->ipopt_list, optlen); *phlen = sizeof(struct ip) + optlen; ip->ip_len = htons(ntohs(ip->ip_len) + optlen); return m; } /* * Copy options from ipsrc to ipdst, omitting those not copied during * fragmentation. */ int ip_optcopy(struct ip *ipsrc, struct ip *ipdst) { u_char *cp, *dp; int opt, optlen, cnt; cp = (u_char *)(ipsrc + 1); dp = (u_char *)(ipdst + 1); cnt = (ipsrc->ip_hl << 2) - sizeof(struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) { /* Preserve for IP mcast tunnel's LSRR alignment. 
*/ *dp++ = IPOPT_NOP; optlen = 1; continue; } KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp)); optlen = cp[IPOPT_OLEN]; KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen < cnt); /* Invalid lengths should have been caught by ip_dooptions. */ if (optlen > cnt) optlen = cnt; if (IPOPT_COPIED(opt)) { bcopy((void *)cp, (void *)dp, (unsigned)optlen); dp += optlen; } } for (optlen = dp - (u_char *)(ipdst+1); optlen & 0x3; optlen++) { *dp++ = IPOPT_EOL; } return optlen; } /* * IP socket option processing. */ int ip_ctloutput(int op, struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); struct ip *ip = &in4p_ip(inp); int inpflags = inp->inp_flags; int optval = 0, error = 0; struct in_pktinfo pktinfo; KASSERT(solocked(so)); if (sopt->sopt_level != IPPROTO_IP) { if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) return 0; return ENOPROTOOPT; } switch (op) { case PRCO_SETOPT: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif error = ip_pcbopts(inp, sopt); break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVIF: case IP_RECVPKTINFO: case IP_RECVTTL: case IP_BINDANY: error = sockopt_getint(sopt, &optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: ip->ip_tos = optval; break; case IP_TTL: ip->ip_ttl = optval; break; case IP_MINTTL: if (optval > 0 && optval <= MAXTTL) in4p_ip_minttl(inp) = optval; else error = EINVAL; break; #define OPTSET(bit) \ if (optval) \ inpflags |= bit; \ else \ inpflags &= ~bit; case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVPKTINFO: OPTSET(INP_RECVPKTINFO); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_BINDANY: error = kauth_authorize_network( kauth_cred_get(), KAUTH_NETWORK_BIND, KAUTH_REQ_NETWORK_BIND_ANYADDR, so, NULL, NULL); if (error == 0) { OPTSET(INP_BINDANY); } break; } break; case IP_PKTINFO: error = sockopt_getint(sopt, &optval); if (!error) { /* Linux compatibility */ OPTSET(INP_RECVPKTINFO); break; } error = sockopt_get(sopt, &pktinfo, sizeof(pktinfo)); if (error) break; if (pktinfo.ipi_ifindex == 0) { in4p_prefsrcip(inp) = pktinfo.ipi_addr; break; } /* Solaris compatibility */ struct ifnet *ifp; struct in_ifaddr *ia; int s; /* pick up primary address */ s = pserialize_read_enter(); ifp = if_byindex(pktinfo.ipi_ifindex); if (ifp == NULL) { pserialize_read_exit(s); error = EADDRNOTAVAIL; break; } ia = in_get_ia_from_ifp(ifp); if (ia == NULL) { pserialize_read_exit(s); error = EADDRNOTAVAIL; break; } in4p_prefsrcip(inp) = IA_SIN(ia)->sin_addr; pserialize_read_exit(s); break; break; #undef OPTSET case IP_MULTICAST_IF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_setmoptions(&inp->inp_moptions, sopt); break; case IP_PORTRANGE: error = sockopt_getint(sopt, &optval); if (error) break; switch (optval) { case IP_PORTRANGE_DEFAULT: case IP_PORTRANGE_HIGH: inpflags &= ~(INP_LOWPORT); break; case IP_PORTRANGE_LOW: inpflags |= INP_LOWPORT; break; default: error = EINVAL; break; } break; case IP_PORTALGO: error = sockopt_getint(sopt, &optval); if (error) break; error = portalgo_algo_index_select(inp, optval); break; #if defined(IPSEC) case IP_IPSEC_POLICY: if (ipsec_enabled) { error = ipsec_set_policy(inp, sopt->sopt_data, sopt->sopt_size, curlwp->l_cred); } else error = ENOPROTOOPT; break; #endif 
/* IPSEC */ default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: { struct mbuf *mopts = inp->inp_options; if (mopts) { struct mbuf *m; m = m_copym(mopts, 0, M_COPYALL, M_DONTWAIT); if (m == NULL) { error = ENOBUFS; break; } error = sockopt_setmbuf(sopt, m); } break; } case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVIF: case IP_RECVPKTINFO: case IP_RECVTTL: case IP_ERRORMTU: case IP_BINDANY: switch (sopt->sopt_name) { case IP_TOS: optval = ip->ip_tos; break; case IP_TTL: optval = ip->ip_ttl; break; case IP_MINTTL: optval = in4p_ip_minttl(inp); break; case IP_ERRORMTU: optval = in4p_errormtu(inp); break; #define OPTBIT(bit) (inpflags & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVPKTINFO: optval = OPTBIT(INP_RECVPKTINFO); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_BINDANY: optval = OPTBIT(INP_BINDANY); break; } error = sockopt_setint(sopt, optval); break; case IP_PKTINFO: switch (sopt->sopt_size) { case sizeof(int): /* Linux compatibility */ optval = OPTBIT(INP_RECVPKTINFO); error = sockopt_setint(sopt, optval); break; case sizeof(struct in_pktinfo): /* Solaris compatibility */ pktinfo.ipi_ifindex = 0; pktinfo.ipi_addr = in4p_prefsrcip(inp); error = sockopt_set(sopt, &pktinfo, sizeof(pktinfo)); break; default: /* * While size is stuck at 0, and, later, if * the caller doesn't use an exactly sized * recipient for the data, default to Linux * compatibility */ optval = OPTBIT(INP_RECVPKTINFO); error = sockopt_setint(sopt, optval); break; } break; #if 0 /* defined(IPSEC) */ case IP_IPSEC_POLICY: { struct mbuf *m = NULL; /* XXX this will return EINVAL as sopt is empty */ error = ipsec_get_policy(inp, sopt->sopt_data, sopt->sopt_size, &m); if (error == 0) error = sockopt_setmbuf(sopt, m); break; } #endif /*IPSEC*/ case IP_MULTICAST_IF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_getmoptions(inp->inp_moptions, sopt); break; case IP_PORTRANGE: if (inpflags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = IP_PORTRANGE_DEFAULT; error = sockopt_setint(sopt, optval); break; case IP_PORTALGO: optval = inp->inp_portalgo; error = sockopt_setint(sopt, optval); break; default: error = ENOPROTOOPT; break; } break; } if (!error) { inp->inp_flags = inpflags; } return error; } static int ip_pktinfo_prepare(const struct inpcb *inp, const struct in_pktinfo *pktinfo, struct ip_pktopts *pktopts, int *flags, kauth_cred_t cred) { struct ip_moptions *imo; int error = 0; bool addrset = false; if (!in_nullhost(pktinfo->ipi_addr)) { pktopts->ippo_laddr.sin_addr = pktinfo->ipi_addr; /* EADDRNOTAVAIL? 
*/ error = inpcb_bindableaddr(inp, &pktopts->ippo_laddr, cred); if (error != 0) return error; addrset = true; } if (pktinfo->ipi_ifindex != 0) { if (!addrset) { struct ifnet *ifp; struct in_ifaddr *ia; int s; /* pick up primary address */ s = pserialize_read_enter(); ifp = if_byindex(pktinfo->ipi_ifindex); if (ifp == NULL) { pserialize_read_exit(s); return EADDRNOTAVAIL; } ia = in_get_ia_from_ifp(ifp); if (ia == NULL) { pserialize_read_exit(s); return EADDRNOTAVAIL; } pktopts->ippo_laddr.sin_addr = IA_SIN(ia)->sin_addr; pserialize_read_exit(s); } /* * If specified ipi_ifindex, * use copied or locally initialized ip_moptions. * Original ip_moptions must not be modified. */ imo = &pktopts->ippo_imobuf; /* local buf in pktopts */ if (pktopts->ippo_imo != NULL) { memcpy(imo, pktopts->ippo_imo, sizeof(*imo)); } else { memset(imo, 0, sizeof(*imo)); imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; } imo->imo_multicast_if_index = pktinfo->ipi_ifindex; pktopts->ippo_imo = imo; *flags |= IP_ROUTETOIFINDEX; } return error; } /* * Set up IP outgoing packet options. Even if control is NULL, * pktopts->ippo_laddr and pktopts->ippo_imo are set and used. */ int ip_setpktopts(struct mbuf *control, struct ip_pktopts *pktopts, int *flags, struct inpcb *inp, kauth_cred_t cred) { struct cmsghdr *cm; struct in_pktinfo pktinfo; int error; pktopts->ippo_imo = inp->inp_moptions; struct in_addr *ia = in_nullhost(in4p_prefsrcip(inp)) ? &in4p_laddr(inp) : &in4p_prefsrcip(inp); sockaddr_in_init(&pktopts->ippo_laddr, ia, 0); if (control == NULL) return 0; /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. */ if (control->m_next) return EINVAL; for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if ((control->m_len < sizeof(*cm)) || (cm->cmsg_len == 0) || (cm->cmsg_len > control->m_len)) { return EINVAL; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_PKTINFO: if (cm->cmsg_len != CMSG_LEN(sizeof(pktinfo))) return EINVAL; memcpy(&pktinfo, CMSG_DATA(cm), sizeof(pktinfo)); error = ip_pktinfo_prepare(inp, &pktinfo, pktopts, flags, cred); if (error) return error; break; case IP_SENDSRCADDR: /* FreeBSD compatibility */ if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) return EINVAL; pktinfo.ipi_ifindex = 0; pktinfo.ipi_addr = ((struct in_pktinfo *)CMSG_DATA(cm))->ipi_addr; error = ip_pktinfo_prepare(inp, &pktinfo, pktopts, flags, cred); if (error) return error; break; default: return ENOPROTOOPT; } } return 0; } /* * Set up IP options in pcb for insertion in output packets. * Store in mbuf with pointer in pcbopt, adding pseudo-option * with destination address if source routed. */ static int ip_pcbopts(struct inpcb *inp, const struct sockopt *sopt) { struct mbuf *m; const u_char *cp; u_char *dp; int cnt; KASSERT(inp_locked(inp)); /* Turn off any old options. */ if (inp->inp_options) { m_free(inp->inp_options); } inp->inp_options = NULL; if ((cnt = sopt->sopt_size) == 0) { /* Only turning off any previous options. */ return 0; } cp = sopt->sopt_data; if (cnt % 4) { /* Must be 4-byte aligned, because there's no padding. */ return EINVAL; } m = m_get(M_DONTWAIT, MT_SOOPTS); if (m == NULL) return ENOBUFS; dp = mtod(m, u_char *); memset(dp, 0, sizeof(struct in_addr)); dp += sizeof(struct in_addr); m->m_len = sizeof(struct in_addr); /* * IP option list according to RFC791. 
Each option is of the form * * [optval] [olen] [(olen - 2) data bytes] * * We validate the list and copy options to an mbuf for prepending * to data packets. The IP first-hop destination address will be * stored before actual options and is zero if unset. */ while (cnt > 0) { uint8_t optval, olen, offset; optval = cp[IPOPT_OPTVAL]; if (optval == IPOPT_EOL || optval == IPOPT_NOP) { olen = 1; } else { if (cnt < IPOPT_OLEN + 1) goto bad; olen = cp[IPOPT_OLEN]; if (olen < IPOPT_OLEN + 1 || olen > cnt) goto bad; } if (optval == IPOPT_LSRR || optval == IPOPT_SSRR) { /* * user process specifies route as: * ->A->B->C->D * D must be our final destination (but we can't * check that since we may not have connected yet). * A is first hop destination, which doesn't appear in * actual IP option, but is stored before the options. */ if (olen < IPOPT_OFFSET + 1 + sizeof(struct in_addr)) goto bad; offset = cp[IPOPT_OFFSET]; memcpy(mtod(m, u_char *), cp + IPOPT_OFFSET + 1, sizeof(struct in_addr)); cp += sizeof(struct in_addr); cnt -= sizeof(struct in_addr); olen -= sizeof(struct in_addr); if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr)) goto bad; memcpy(dp, cp, olen); dp[IPOPT_OPTVAL] = optval; dp[IPOPT_OLEN] = olen; dp[IPOPT_OFFSET] = offset; break; } else { if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr)) goto bad; memcpy(dp, cp, olen); break; } dp += olen; m->m_len += olen; if (optval == IPOPT_EOL) break; cp += olen; cnt -= olen; } inp->inp_options = m; return 0; bad: (void)m_free(m); return EINVAL; } /* * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. * Must be called in a pserialize critical section. */ static struct ifnet * ip_multicast_if(struct in_addr *a, int *ifindexp) { int ifindex; struct ifnet *ifp = NULL; struct in_ifaddr *ia; if (ifindexp) *ifindexp = 0; if (ntohl(a->s_addr) >> 24 == 0) { ifindex = ntohl(a->s_addr) & 0xffffff; ifp = if_byindex(ifindex); if (!ifp) return NULL; if (ifindexp) *ifindexp = ifindex; } else { IN_ADDRHASH_READER_FOREACH(ia, a->s_addr) { if (in_hosteq(ia->ia_addr.sin_addr, *a) && (ia->ia_ifp->if_flags & IFF_MULTICAST) != 0) { ifp = ia->ia_ifp; if (if_is_deactivated(ifp)) ifp = NULL; break; } } } return ifp; } static int ip_getoptval(const struct sockopt *sopt, u_int8_t *val, u_int maxval) { u_int tval; u_char cval; int error; if (sopt == NULL) return EINVAL; switch (sopt->sopt_size) { case sizeof(u_char): error = sockopt_get(sopt, &cval, sizeof(u_char)); tval = cval; break; case sizeof(u_int): error = sockopt_get(sopt, &tval, sizeof(u_int)); break; default: error = EINVAL; } if (error) return error; if (tval > maxval) return EINVAL; *val = tval; return 0; } static int ip_get_membership(const struct sockopt *sopt, struct ifnet **ifp, struct psref *psref, struct in_addr *ia, bool add) { int error; struct ip_mreq mreq; error = sockopt_get(sopt, &mreq, sizeof(mreq)); if (error) return error; if (!IN_MULTICAST(mreq.imr_multiaddr.s_addr)) return EINVAL; memcpy(ia, &mreq.imr_multiaddr, sizeof(*ia)); if (in_nullhost(mreq.imr_interface)) { union { struct sockaddr dst; struct sockaddr_in dst4; } u; struct route ro; if (!add) { *ifp = NULL; return 0; } /* * If no interface address was provided, use the interface of * the route to the given multicast address. */ struct rtentry *rt; memset(&ro, 0, sizeof(ro)); sockaddr_in_init(&u.dst4, ia, 0); error = rtcache_setdst(&ro, &u.dst); if (error != 0) return error; *ifp = (rt = rtcache_init(&ro)) != NULL ? 
rt->rt_ifp : NULL; if (*ifp != NULL) { if (if_is_deactivated(*ifp)) *ifp = NULL; else if_acquire(*ifp, psref); } rtcache_unref(rt, &ro); rtcache_free(&ro); } else { int s = pserialize_read_enter(); *ifp = ip_multicast_if(&mreq.imr_interface, NULL); if (!add && *ifp == NULL) { pserialize_read_exit(s); return EADDRNOTAVAIL; } if (*ifp != NULL) { if (if_is_deactivated(*ifp)) *ifp = NULL; else if_acquire(*ifp, psref); } pserialize_read_exit(s); } return 0; } /* * Add a multicast group membership. * Group must be a valid IP multicast address. */ static int ip_add_membership(struct ip_moptions *imo, const struct sockopt *sopt) { struct ifnet *ifp = NULL; // XXX: gcc [ppc] struct in_addr ia; int i, error, bound; struct psref psref; /* imo is protected by solock or referenced only by the caller */ bound = curlwp_bind(); if (sopt->sopt_size == sizeof(struct ip_mreq)) error = ip_get_membership(sopt, &ifp, &psref, &ia, true); else { #ifdef INET6 error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia)); #else error = EINVAL; #endif } if (error) goto out; /* * See if we found an interface, and confirm that it * supports multicast. */ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; goto out; } /* * See if the membership already exists or if all the * membership slots are full. */ for (i = 0; i < imo->imo_num_memberships; ++i) { if (imo->imo_membership[i]->inm_ifp == ifp && in_hosteq(imo->imo_membership[i]->inm_addr, ia)) break; } if (i < imo->imo_num_memberships) { error = EADDRINUSE; goto out; } if (i == IP_MAX_MEMBERSHIPS) { error = ETOOMANYREFS; goto out; } /* * Everything looks good; add a new record to the multicast * address list for the given interface. */ imo->imo_membership[i] = in_addmulti(&ia, ifp); if (imo->imo_membership[i] == NULL) { error = ENOBUFS; goto out; } ++imo->imo_num_memberships; error = 0; out: if_put(ifp, &psref); curlwp_bindx(bound); return error; } /* * Drop a multicast group membership. * Group must be a valid IP multicast address. */ static int ip_drop_membership(struct ip_moptions *imo, const struct sockopt *sopt) { struct in_addr ia = { .s_addr = 0 }; // XXX: gcc [ppc] struct ifnet *ifp = NULL; // XXX: gcc [ppc] int i, error, bound; struct psref psref; /* imo is protected by solock or referenced only by the caller */ bound = curlwp_bind(); if (sopt->sopt_size == sizeof(struct ip_mreq)) error = ip_get_membership(sopt, &ifp, &psref, &ia, false); else { #ifdef INET6 error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia)); #else error = EINVAL; #endif } if (error) goto out; /* * Find the membership in the membership array. */ for (i = 0; i < imo->imo_num_memberships; ++i) { if ((ifp == NULL || imo->imo_membership[i]->inm_ifp == ifp) && in_hosteq(imo->imo_membership[i]->inm_addr, ia)) break; } if (i == imo->imo_num_memberships) { error = EADDRNOTAVAIL; goto out; } /* * Give up the multicast address record to which the * membership points. */ in_delmulti(imo->imo_membership[i]); /* * Remove the gap in the membership array. */ for (++i; i < imo->imo_num_memberships; ++i) imo->imo_membership[i-1] = imo->imo_membership[i]; --imo->imo_num_memberships; error = 0; out: if_put(ifp, &psref); curlwp_bindx(bound); return error; } /* * Set the IP multicast options in response to user setsockopt(). 
*/ int ip_setmoptions(struct ip_moptions **pimo, const struct sockopt *sopt) { struct ip_moptions *imo = *pimo; struct in_addr addr; struct ifnet *ifp; int ifindex, error = 0; /* The passed imo isn't NULL, it should be protected by solock */ if (!imo) { /* * No multicast option buffer attached to the pcb; * allocate one and initialize to default values. */ imo = kmem_intr_alloc(sizeof(*imo), KM_NOSLEEP); if (imo == NULL) return ENOBUFS; imo->imo_multicast_if_index = 0; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; imo->imo_num_memberships = 0; *pimo = imo; } switch (sopt->sopt_name) { case IP_MULTICAST_IF: { int s; /* * Select the interface for outgoing multicast packets. */ error = sockopt_get(sopt, &addr, sizeof(addr)); if (error) break; /* * INADDR_ANY is used to remove a previous selection. * When no interface is selected, a default one is * chosen every time a multicast packet is sent. */ if (in_nullhost(addr)) { imo->imo_multicast_if_index = 0; break; } /* * The selected interface is identified by its local * IP address. Find the interface and confirm that * it supports multicasting. */ s = pserialize_read_enter(); ifp = ip_multicast_if(&addr, &ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { pserialize_read_exit(s); error = EADDRNOTAVAIL; break; } imo->imo_multicast_if_index = ifp->if_index; pserialize_read_exit(s); if (ifindex) imo->imo_multicast_addr = addr; else imo->imo_multicast_addr.s_addr = INADDR_ANY; break; } case IP_MULTICAST_TTL: /* * Set the IP time-to-live for outgoing multicast packets. */ error = ip_getoptval(sopt, &imo->imo_multicast_ttl, MAXTTL); break; case IP_MULTICAST_LOOP: /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ error = ip_getoptval(sopt, &imo->imo_multicast_loop, 1); break; case IP_ADD_MEMBERSHIP: /* IPV6_JOIN_GROUP */ error = ip_add_membership(imo, sopt); break; case IP_DROP_MEMBERSHIP: /* IPV6_LEAVE_GROUP */ error = ip_drop_membership(imo, sopt); break; default: error = EOPNOTSUPP; break; } /* * If all options have default values, no need to keep the mbuf. */ if (imo->imo_multicast_if_index == 0 && imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && imo->imo_num_memberships == 0) { kmem_intr_free(imo, sizeof(*imo)); *pimo = NULL; } return error; } /* * Return the IP multicast options in response to user getsockopt(). */ int ip_getmoptions(struct ip_moptions *imo, struct sockopt *sopt) { struct in_addr addr; uint8_t optval; int error = 0; /* imo is protected by solock or referenced only by the caller */ switch (sopt->sopt_name) { case IP_MULTICAST_IF: if (imo == NULL || imo->imo_multicast_if_index == 0) addr = zeroin_addr; else if (imo->imo_multicast_addr.s_addr) { /* return the value user has set */ addr = imo->imo_multicast_addr; } else { struct ifnet *ifp; struct in_ifaddr *ia = NULL; int s = pserialize_read_enter(); ifp = if_byindex(imo->imo_multicast_if_index); if (ifp != NULL) { ia = in_get_ia_from_ifp(ifp); } addr = ia ? ia->ia_addr.sin_addr : zeroin_addr; pserialize_read_exit(s); } error = sockopt_set(sopt, &addr, sizeof(addr)); break; case IP_MULTICAST_TTL: optval = imo ? imo->imo_multicast_ttl : IP_DEFAULT_MULTICAST_TTL; error = sockopt_set(sopt, &optval, sizeof(optval)); break; case IP_MULTICAST_LOOP: optval = imo ? 
imo->imo_multicast_loop : IP_DEFAULT_MULTICAST_LOOP; error = sockopt_set(sopt, &optval, sizeof(optval)); break; default: error = EOPNOTSUPP; } return error; } /* * Discard the IP multicast options. */ void ip_freemoptions(struct ip_moptions *imo) { int i; /* The owner of imo (inp) should be protected by solock */ if (imo != NULL) { for (i = 0; i < imo->imo_num_memberships; ++i) { struct in_multi *inm = imo->imo_membership[i]; in_delmulti(inm); /* ifp should not leave thanks to solock */ } kmem_intr_free(imo, sizeof(*imo)); } } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be lo0ifp -- easier than replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *dst) { struct ip *ip; struct mbuf *copym; copym = m_copypacket(m, M_DONTWAIT); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < sizeof(struct ip))) copym = m_pullup(copym, sizeof(struct ip)); if (copym == NULL) return; /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); if (copym->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { in_undefer_cksum_tcpudp(copym); copym->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4); } ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, ip->ip_hl << 2); KERNEL_LOCK_UNLESS_NET_MPSAFE(); (void)looutput(ifp, copym, sintocsa(dst), NULL); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } /* * Ensure sending address is valid. * Returns 0 on success, -1 if an error should be sent back or 1 * if the packet could be dropped without error (protocol dependent). */ static int ip_ifaddrvalid(const struct in_ifaddr *ia) { if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY) return 0; if (ia->ia4_flags & IN_IFF_DUPLICATED) return -1; else if (ia->ia4_flags & (IN_IFF_TENTATIVE | IN_IFF_DETACHED)) return 1; return 0; }
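/*
 * Illustrative userspace sketch (not part of ip_output.c): how the
 * multicast socket options handled above by ip_setmoptions() and
 * ip_getmoptions() are typically exercised from an application via
 * setsockopt(2).  The group address, interface address, port, and TTL
 * below are arbitrary example values, not taken from this file.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s == -1) {
		perror("socket");
		return 1;
	}

	/*
	 * Join 239.0.0.1 on the interface owning 192.0.2.1; in the
	 * kernel this lands in ip_add_membership() via ip_setmoptions().
	 */
	struct ip_mreq mreq;
	memset(&mreq, 0, sizeof(mreq));
	mreq.imr_multiaddr.s_addr = inet_addr("239.0.0.1");
	mreq.imr_interface.s_addr = inet_addr("192.0.2.1");
	if (setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP,
	    &mreq, sizeof(mreq)) == -1)
		perror("IP_ADD_MEMBERSHIP");

	/*
	 * Select the outgoing interface and TTL for multicast sends
	 * (the IP_MULTICAST_IF and IP_MULTICAST_TTL cases above).
	 */
	struct in_addr ifaddr;
	ifaddr.s_addr = inet_addr("192.0.2.1");
	unsigned char ttl = 8;
	if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF,
	    &ifaddr, sizeof(ifaddr)) == -1)
		perror("IP_MULTICAST_IF");
	if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL,
	    &ttl, sizeof(ttl)) == -1)
		perror("IP_MULTICAST_TTL");

	/*
	 * Datagrams sent to the group now go out through ip_output()
	 * using the ip_moptions configured above.
	 */
	struct sockaddr_in dst;
	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(12345);
	dst.sin_addr.s_addr = inet_addr("239.0.0.1");
	(void)sendto(s, "hello", 5, 0,
	    (const struct sockaddr *)&dst, sizeof(dst));

	close(s);
	return 0;
}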
/* $NetBSD: sysmon_wdog.c,v 1.30 2021/12/31 11:05:41 riastradh Exp $ */ /*- * Copyright (c) 2000 Zembu Labs, Inc. * All rights reserved. * * Author: Jason R. Thorpe <thorpej@zembu.com> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Zembu Labs, Inc. * 4. Neither the name of Zembu Labs nor the names of its employees may * be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY ZEMBU LABS, INC. ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WAR- * RANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DIS- * CLAIMED.
IN NO EVENT SHALL ZEMBU LABS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Watchdog timer framework for sysmon. Hardware (and software) * watchdog timers can register themselves here to provide a * watchdog function, which provides an abstract interface to the * user. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sysmon_wdog.c,v 1.30 2021/12/31 11:05:41 riastradh Exp $"); #include <sys/param.h> #include <sys/conf.h> #include <sys/errno.h> #include <sys/fcntl.h> #include <sys/condvar.h> #include <sys/mutex.h> #include <sys/callout.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/module.h> #include <sys/once.h> #include <dev/sysmon/sysmonvar.h> static LIST_HEAD(, sysmon_wdog) sysmon_wdog_list = LIST_HEAD_INITIALIZER(&sysmon_wdog_list); static int sysmon_wdog_count; static kmutex_t sysmon_wdog_list_mtx, sysmon_wdog_mtx; static kcondvar_t sysmon_wdog_cv; static struct sysmon_wdog *sysmon_armed_wdog; static callout_t sysmon_wdog_callout; static void *sysmon_wdog_sdhook; static void *sysmon_wdog_cphook; struct sysmon_wdog *sysmon_wdog_find(const char *); void sysmon_wdog_release(struct sysmon_wdog *); int sysmon_wdog_setmode(struct sysmon_wdog *, int, u_int); void sysmon_wdog_ktickle(void *); void sysmon_wdog_critpoll(void *); void sysmon_wdog_shutdown(void *); void sysmon_wdog_ref(struct sysmon_wdog *); static struct sysmon_opvec sysmon_wdog_opvec = { sysmonopen_wdog, sysmonclose_wdog, sysmonioctl_wdog, NULL, NULL, NULL }; MODULE(MODULE_CLASS_DRIVER, sysmon_wdog, "sysmon"); ONCE_DECL(once_wdog); static int wdog_preinit(void) { mutex_init(&sysmon_wdog_list_mtx, MUTEX_DEFAULT, IPL_NONE); mutex_init(&sysmon_wdog_mtx, MUTEX_DEFAULT, IPL_SOFTCLOCK); cv_init(&sysmon_wdog_cv, "wdogref"); callout_init(&sysmon_wdog_callout, 0); return 0; } int sysmon_wdog_init(void) { int error; (void)RUN_ONCE(&once_wdog, wdog_preinit); sysmon_wdog_sdhook = shutdownhook_establish(sysmon_wdog_shutdown, NULL); if (sysmon_wdog_sdhook == NULL) printf("WARNING: unable to register watchdog shutdown hook\n"); sysmon_wdog_cphook = critpollhook_establish(sysmon_wdog_critpoll, NULL); if (sysmon_wdog_cphook == NULL) printf("WARNING: unable to register watchdog critpoll hook\n"); error = sysmon_attach_minor(SYSMON_MINOR_WDOG, &sysmon_wdog_opvec); return error; } int sysmon_wdog_fini(void) { int error; if ( ! LIST_EMPTY(&sysmon_wdog_list)) return EBUSY; error = sysmon_attach_minor(SYSMON_MINOR_WDOG, NULL); if (error == 0) { callout_destroy(&sysmon_wdog_callout); critpollhook_disestablish(sysmon_wdog_cphook); shutdownhook_disestablish(sysmon_wdog_sdhook); cv_destroy(&sysmon_wdog_cv); mutex_destroy(&sysmon_wdog_mtx); mutex_destroy(&sysmon_wdog_list_mtx); } return error; } /* * sysmonopen_wdog: * * Open the system monitor device. */ int sysmonopen_wdog(dev_t dev, int flag, int mode, struct lwp *l) { return 0; } /* * sysmonclose_wdog: * * Close the system monitor device. 
*/ int sysmonclose_wdog(dev_t dev, int flag, int mode, struct lwp *l) { struct sysmon_wdog *smw; int error = 0; /* * If this is the last close, and there is a watchdog * running in UTICKLE mode, we need to disable it, * otherwise the system will reset in short order. * * XXX Maybe we should just go into KTICKLE mode? */ mutex_enter(&sysmon_wdog_mtx); if ((smw = sysmon_armed_wdog) != NULL) { if ((smw->smw_mode & WDOG_MODE_MASK) == WDOG_MODE_UTICKLE) { error = sysmon_wdog_setmode(smw, WDOG_MODE_DISARMED, smw->smw_period); if (error) { printf("WARNING: UNABLE TO DISARM " "WATCHDOG %s ON CLOSE!\n", smw->smw_name); /* * ...we will probably reboot soon. */ } } } mutex_exit(&sysmon_wdog_mtx); return error; } /* * sysmonioctl_wdog: * * Perform a watchdog control request. */ int sysmonioctl_wdog(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { struct sysmon_wdog *smw; int error = 0; switch (cmd) { case WDOGIOC_GMODE: { struct wdog_mode *wm = (void *) data; wm->wm_name[sizeof(wm->wm_name) - 1] = '\0'; smw = sysmon_wdog_find(wm->wm_name); if (smw == NULL) { error = ESRCH; break; } wm->wm_mode = smw->smw_mode; wm->wm_period = smw->smw_period; sysmon_wdog_release(smw); break; } case WDOGIOC_SMODE: { struct wdog_mode *wm = (void *) data; if ((flag & FWRITE) == 0) { error = EPERM; break; } wm->wm_name[sizeof(wm->wm_name) - 1] = '\0'; smw = sysmon_wdog_find(wm->wm_name); if (smw == NULL) { error = ESRCH; break; } if (wm->wm_mode & ~(WDOG_MODE_MASK|WDOG_FEATURE_MASK)) error = EINVAL; else { mutex_enter(&sysmon_wdog_mtx); error = sysmon_wdog_setmode(smw, wm->wm_mode, wm->wm_period); mutex_exit(&sysmon_wdog_mtx); } sysmon_wdog_release(smw); break; } case WDOGIOC_WHICH: { struct wdog_mode *wm = (void *) data; mutex_enter(&sysmon_wdog_mtx); if ((smw = sysmon_armed_wdog) != NULL) { strcpy(wm->wm_name, smw->smw_name); wm->wm_mode = smw->smw_mode; wm->wm_period = smw->smw_period; } else error = ESRCH; mutex_exit(&sysmon_wdog_mtx); break; } case WDOGIOC_TICKLE: if ((flag & FWRITE) == 0) { error = EPERM; break; } mutex_enter(&sysmon_wdog_mtx); if ((smw = sysmon_armed_wdog) != NULL) { error = (*smw->smw_tickle)(smw); if (error == 0) smw->smw_tickler = l->l_proc->p_pid; } else error = ESRCH; mutex_exit(&sysmon_wdog_mtx); break; case WDOGIOC_GTICKLER: if ((smw = sysmon_armed_wdog) != NULL) *(pid_t *)data = smw->smw_tickler; else error = ESRCH; break; case WDOGIOC_GWDOGS: { struct wdog_conf *wc = (void *) data; char *cp; int i; mutex_enter(&sysmon_wdog_list_mtx); if (wc->wc_names == NULL) wc->wc_count = sysmon_wdog_count; else { for (i = 0, cp = wc->wc_names, smw = LIST_FIRST(&sysmon_wdog_list); i < sysmon_wdog_count && smw != NULL && error == 0; i++, cp += WDOG_NAMESIZE, smw = LIST_NEXT(smw, smw_list)) error = copyout(smw->smw_name, cp, strlen(smw->smw_name) + 1); wc->wc_count = i; } mutex_exit(&sysmon_wdog_list_mtx); break; } default: error = ENOTTY; } return error; } /* * sysmon_wdog_register: * * Register a watchdog device. */ int sysmon_wdog_register(struct sysmon_wdog *smw) { struct sysmon_wdog *lsmw; int error = 0; (void)RUN_ONCE(&once_wdog, wdog_preinit); mutex_enter(&sysmon_wdog_list_mtx); LIST_FOREACH(lsmw, &sysmon_wdog_list, smw_list) { if (strcmp(lsmw->smw_name, smw->smw_name) == 0) { error = EEXIST; goto out; } } smw->smw_mode = WDOG_MODE_DISARMED; smw->smw_tickler = (pid_t) -1; smw->smw_refcnt = 0; sysmon_wdog_count++; LIST_INSERT_HEAD(&sysmon_wdog_list, smw, smw_list); out: mutex_exit(&sysmon_wdog_list_mtx); return error; } /* * sysmon_wdog_unregister: * * Unregister a watchdog device. 
*/ int sysmon_wdog_unregister(struct sysmon_wdog *smw) { int rc = 0; mutex_enter(&sysmon_wdog_list_mtx); while (smw->smw_refcnt > 0 && rc == 0) { aprint_debug("%s: %d users remain\n", smw->smw_name, smw->smw_refcnt); rc = cv_wait_sig(&sysmon_wdog_cv, &sysmon_wdog_list_mtx); } if (rc == 0) { sysmon_wdog_count--; LIST_REMOVE(smw, smw_list); } mutex_exit(&sysmon_wdog_list_mtx); return rc; } /* * sysmon_wdog_critpoll: * * Perform critical operations during long polling periods */ void sysmon_wdog_critpoll(void *arg) { struct sysmon_wdog *smw = sysmon_armed_wdog; if (smw == NULL) return; if ((smw->smw_mode & WDOG_MODE_MASK) == WDOG_MODE_KTICKLE) { if ((*smw->smw_tickle)(smw) != 0) { printf("WARNING: KERNEL TICKLE OF WATCHDOG %s " "FAILED!\n", smw->smw_name); } } } /* * sysmon_wdog_find: * * Find a watchdog device. We increase the reference * count on a match. */ struct sysmon_wdog * sysmon_wdog_find(const char *name) { struct sysmon_wdog *smw; mutex_enter(&sysmon_wdog_list_mtx); LIST_FOREACH(smw, &sysmon_wdog_list, smw_list) { if (strcmp(smw->smw_name, name) == 0) break; } if (smw != NULL) smw->smw_refcnt++; mutex_exit(&sysmon_wdog_list_mtx); return smw; } /* * sysmon_wdog_release: * * Release a watchdog device. */ void sysmon_wdog_release(struct sysmon_wdog *smw) { mutex_enter(&sysmon_wdog_list_mtx); KASSERT(smw->smw_refcnt != 0); smw->smw_refcnt--; cv_signal(&sysmon_wdog_cv); mutex_exit(&sysmon_wdog_list_mtx); } void sysmon_wdog_ref(struct sysmon_wdog *smw) { mutex_enter(&sysmon_wdog_list_mtx); smw->smw_refcnt++; mutex_exit(&sysmon_wdog_list_mtx); } /* * sysmon_wdog_setmode: * * Set the mode of a watchdog device. */ int sysmon_wdog_setmode(struct sysmon_wdog *smw, int mode, u_int period) { u_int operiod = smw->smw_period; int omode = smw->smw_mode; int error = 0; smw->smw_period = period; smw->smw_mode = mode; switch (mode & WDOG_MODE_MASK) { case WDOG_MODE_DISARMED: if (smw != sysmon_armed_wdog) { error = EINVAL; goto out; } break; case WDOG_MODE_KTICKLE: case WDOG_MODE_UTICKLE: case WDOG_MODE_ETICKLE: if (sysmon_armed_wdog != NULL) { error = EBUSY; goto out; } break; default: error = EINVAL; goto out; } error = (*smw->smw_setmode)(smw); out: if (error) { smw->smw_period = operiod; smw->smw_mode = omode; } else { if ((mode & WDOG_MODE_MASK) == WDOG_MODE_DISARMED) { sysmon_armed_wdog = NULL; smw->smw_tickler = (pid_t) -1; sysmon_wdog_release(smw); if ((omode & WDOG_MODE_MASK) == WDOG_MODE_KTICKLE) callout_stop(&sysmon_wdog_callout); } else { sysmon_armed_wdog = smw; sysmon_wdog_ref(smw); if ((mode & WDOG_MODE_MASK) == WDOG_MODE_KTICKLE) { callout_reset(&sysmon_wdog_callout, WDOG_PERIOD_TO_TICKS(smw->smw_period) / 2, sysmon_wdog_ktickle, NULL); } } } return error; } /* * sysmon_wdog_ktickle: * * Kernel watchdog tickle routine. */ void sysmon_wdog_ktickle(void *arg) { struct sysmon_wdog *smw; mutex_enter(&sysmon_wdog_mtx); if ((smw = sysmon_armed_wdog) != NULL) { if ((*smw->smw_tickle)(smw) != 0) { printf("WARNING: KERNEL TICKLE OF WATCHDOG %s " "FAILED!\n", smw->smw_name); /* * ...we will probably reboot soon. */ } callout_reset(&sysmon_wdog_callout, WDOG_PERIOD_TO_TICKS(smw->smw_period) / 2, sysmon_wdog_ktickle, NULL); } mutex_exit(&sysmon_wdog_mtx); } /* * sysmon_wdog_shutdown: * * Perform shutdown-time operations. */ void sysmon_wdog_shutdown(void *arg) { struct sysmon_wdog *smw; /* * XXX Locking here? I don't think it's necessary. 
*/ if ((smw = sysmon_armed_wdog) != NULL) { if (sysmon_wdog_setmode(smw, WDOG_MODE_DISARMED, smw->smw_period)) printf("WARNING: FAILED TO SHUTDOWN WATCHDOG %s!\n", smw->smw_name); } } static int sysmon_wdog_modcmd(modcmd_t cmd, void *arg) { int ret; switch (cmd) { case MODULE_CMD_INIT: ret = sysmon_wdog_init(); break; case MODULE_CMD_FINI: ret = sysmon_wdog_fini(); break; case MODULE_CMD_STAT: default: ret = ENOTTY; } return ret; }
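The framework above is driven by hardware (or software) watchdog drivers that fill in a struct sysmon_wdog and register it. The following is a minimal, hypothetical sketch of such a driver's attachment path; the "mywdog" names, the softc layout, and the 30-second period are assumptions for illustration only and are not part of the sysmon sources.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/wdog.h>
#include <dev/sysmon/sysmonvar.h>

/* Hypothetical softc for an imaginary "mywdog" device. */
struct mywdog_softc {
	struct sysmon_wdog sc_smw;
	/* ... hardware registers, bus handles, etc. ... */
};

static int
mywdog_setmode(struct sysmon_wdog *smw)
{

	if ((smw->smw_mode & WDOG_MODE_MASK) == WDOG_MODE_DISARMED) {
		/* stop the hardware timer here */
	} else {
		if (smw->smw_period == WDOG_PERIOD_DEFAULT)
			smw->smw_period = 30;	/* assumed default, seconds */
		/* program and start the hardware timer here */
	}
	return 0;
}

static int
mywdog_tickle(struct sysmon_wdog *smw)
{

	/* reset the hardware countdown here */
	return 0;
}

static void
mywdog_attach(struct mywdog_softc *sc)
{

	sc->sc_smw.smw_name = "mywdog";
	sc->sc_smw.smw_cookie = sc;
	sc->sc_smw.smw_setmode = mywdog_setmode;
	sc->sc_smw.smw_tickle = mywdog_tickle;
	sc->sc_smw.smw_period = 30;

	if (sysmon_wdog_register(&sc->sc_smw) != 0)
		printf("mywdog: unable to register with sysmon\n");
}

Once registered, a userland daemon (or the kernel itself in KTICKLE mode) arms and tickles the timer through the WDOGIOC_SMODE and WDOGIOC_TICKLE ioctls handled above.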
/* $NetBSD: uvm_page_array.c,v 1.9 2020/05/26 21:52:12 ad Exp $ */ /*- * Copyright (c)2011 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_page_array.c,v 1.9 2020/05/26 21:52:12 ad Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <uvm/uvm_extern.h> #include <uvm/uvm_object.h> #include <uvm/uvm_page.h> #include <uvm/uvm_page_array.h> /* * uvm_page_array_init: initialize the array. */ void uvm_page_array_init(struct uvm_page_array *ar, struct uvm_object *uobj, unsigned int flags) { ar->ar_idx = 0; ar->ar_npages = 0; ar->ar_uobj = uobj; ar->ar_flags = flags; } /* * uvm_page_array_fini: clean up the array. */ void uvm_page_array_fini(struct uvm_page_array *ar) { /* * currently nothing to do. */ #if defined(DIAGNOSTIC) /* * poison to trigger assertion in uvm_page_array_peek to * detect usage errors. */ ar->ar_npages = 1; ar->ar_idx = 1000; #endif /* defined(DIAGNOSTIC) */ } /* * uvm_page_array_clear: forget the cached pages and initialize the array. */ void uvm_page_array_clear(struct uvm_page_array *ar) { KASSERT(ar->ar_idx <= ar->ar_npages); ar->ar_idx = 0; ar->ar_npages = 0; } /* * uvm_page_array_peek: return the next cached page.
*/ struct vm_page * uvm_page_array_peek(struct uvm_page_array *ar) { KASSERT(ar->ar_idx <= ar->ar_npages); if (ar->ar_idx == ar->ar_npages) { return NULL; } return ar->ar_pages[ar->ar_idx]; } /* * uvm_page_array_advance: advance the array to the next cached page */ void uvm_page_array_advance(struct uvm_page_array *ar) { KASSERT(ar->ar_idx <= ar->ar_npages); ar->ar_idx++; KASSERT(ar->ar_idx <= ar->ar_npages); } /* * uvm_page_array_fill: lookup pages and keep them cached. * * return 0 on success. in that case, cache the result in the array * so that they will be picked by later uvm_page_array_peek. * * nwant is a number of pages to fetch. a caller should consider it a hint. * nwant == 0 means a caller have no specific idea. * * return ENOENT if no pages are found. * * called with object lock held. */ int uvm_page_array_fill(struct uvm_page_array *ar, voff_t off, unsigned int nwant) { unsigned int npages; #if defined(DEBUG) unsigned int i; #endif /* defined(DEBUG) */ unsigned int maxpages = __arraycount(ar->ar_pages); struct uvm_object *uobj = ar->ar_uobj; const int flags = ar->ar_flags; const bool dense = (flags & UVM_PAGE_ARRAY_FILL_DENSE) != 0; const bool backward = (flags & UVM_PAGE_ARRAY_FILL_BACKWARD) != 0; int error = 0; if (nwant != 0 && nwant < maxpages) { maxpages = nwant; } #if 0 /* called from DDB for "show obj/f" without lock */ KASSERT(rw_lock_held(uobj->vmobjlock)); #endif KASSERT(uvm_page_array_peek(ar) == NULL); if ((flags & UVM_PAGE_ARRAY_FILL_DIRTY) != 0) { unsigned int tagmask = UVM_PAGE_DIRTY_TAG; if ((flags & UVM_PAGE_ARRAY_FILL_WRITEBACK) != 0) { tagmask |= UVM_PAGE_WRITEBACK_TAG; } npages = (backward ? radix_tree_gang_lookup_tagged_node_reverse : radix_tree_gang_lookup_tagged_node)( &uobj->uo_pages, off >> PAGE_SHIFT, (void **)ar->ar_pages, maxpages, dense, tagmask); } else { npages = (backward ? radix_tree_gang_lookup_node_reverse : radix_tree_gang_lookup_node)( &uobj->uo_pages, off >> PAGE_SHIFT, (void **)ar->ar_pages, maxpages, dense); } if (npages == 0) { if (flags != 0) { /* * if dense or looking for tagged entries (or * working backwards), fail right away. */ npages = 0; } else { /* * there's nothing else to be found with the current * set of arguments, in the current version of the * tree. * * minimize repeated tree lookups by "finding" a * null pointer, in case the caller keeps looping (a * common use case). */ npages = 1; ar->ar_pages[0] = NULL; } error = ENOENT; } KASSERT(npages <= maxpages); ar->ar_npages = npages; ar->ar_idx = 0; #if defined(DEBUG) for (i = 0; error == 0 && i < ar->ar_npages; i++) { struct vm_page * const pg = ar->ar_pages[i]; KASSERT(pg != NULL); KDASSERT(pg->uobject == uobj); if (backward) { KDASSERT(pg->offset <= off); KDASSERT(i == 0 || pg->offset < ar->ar_pages[i - 1]->offset); } else { KDASSERT(pg->offset >= off); KDASSERT(i == 0 || pg->offset > ar->ar_pages[i - 1]->offset); } } #endif /* defined(DEBUG) */ return error; } /* * uvm_page_array_fill_and_peek: * same as uvm_page_array_peek except that, if the array is empty, try to fill * it first. */ struct vm_page * uvm_page_array_fill_and_peek(struct uvm_page_array *ar, voff_t off, unsigned int nwant) { int error; if (ar->ar_idx != ar->ar_npages) { return ar->ar_pages[ar->ar_idx]; } error = uvm_page_array_fill(ar, off, nwant); if (error != 0) { return NULL; } return uvm_page_array_peek(ar); }
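For context, a typical consumer drives the array with the init/fill_and_peek/advance/fini calls above while holding the object's lock. The helper below is a hedged sketch of that pattern; the name visit_pages and its callback are invented for illustration, and it assumes forward, non-dense iteration.

#include <sys/param.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
#include <uvm/uvm_page.h>
#include <uvm/uvm_page_array.h>

/*
 * Hypothetical helper: apply "visit" to every resident page of uobj at
 * or after "start", in ascending offset order.  Assumes the caller
 * holds uobj->vmobjlock across the whole walk.
 */
static void
visit_pages(struct uvm_object *uobj, voff_t start,
    void (*visit)(struct vm_page *))
{
	struct uvm_page_array a;
	struct vm_page *pg;

	uvm_page_array_init(&a, uobj, 0);
	while ((pg = uvm_page_array_fill_and_peek(&a, start, 0)) != NULL) {
		(*visit)(pg);
		/* refill from just past this page once the cache runs dry */
		start = pg->offset + PAGE_SIZE;
		uvm_page_array_advance(&a);
	}
	uvm_page_array_fini(&a);
}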
/* $NetBSD: in_pcb.c,v 1.202 2022/11/04 09:05:41 ozaki-r Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1998, 2011 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Coyote Point Systems, Inc. * This code is derived from software contributed to The NetBSD Foundation * by Public Access Networks Corporation ("Panix").
It was developed under * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in_pcb.c,v 1.202 2022/11/04 09:05:41 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_ipsec.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/ioctl.h> #include <sys/errno.h> #include <sys/time.h> #include <sys/once.h> #include <sys/pool.h> #include <sys/proc.h> #include <sys/kauth.h> #include <sys/uidinfo.h> #include <sys/domain.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/in_var.h> #include <netinet/ip_var.h> #include <netinet/portalgo.h> #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_pcb.h> #endif #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/key.h> #endif /* IPSEC */ #include <netinet/tcp_vtw.h> struct in_addr zeroin_addr; #define INPCBHASH_PORT(table, lport) \ &(table)->inpt_porthashtbl[ntohs(lport) & (table)->inpt_porthash] #define INPCBHASH_BIND(table, laddr, lport) \ &(table)->inpt_bindhashtbl[ \ ((ntohl((laddr).s_addr) + ntohs(lport))) & (table)->inpt_bindhash] #define INPCBHASH_CONNECT(table, faddr, fport, laddr, lport) \ &(table)->inpt_connecthashtbl[ \ ((ntohl((faddr).s_addr) + ntohs(fport)) + \ (ntohl((laddr).s_addr) + ntohs(lport))) & (table)->inpt_connecthash] int anonportmin = IPPORT_ANONMIN; int anonportmax = IPPORT_ANONMAX; int lowportmin = IPPORT_RESERVEDMIN; int lowportmax = IPPORT_RESERVEDMAX; static pool_cache_t in4pcb_pool_cache; #ifdef INET6 static pool_cache_t in6pcb_pool_cache; #endif static int inpcb_poolinit(void) { in4pcb_pool_cache = pool_cache_init(sizeof(struct in4pcb), coherency_unit, 0, 0, "in4pcbpl", NULL, IPL_NET, NULL, NULL, NULL); #ifdef INET6 in6pcb_pool_cache = pool_cache_init(sizeof(struct in6pcb), coherency_unit, 0, 0, "in6pcbpl", NULL, IPL_NET, NULL, NULL, NULL); #endif return 0; } void inpcb_init(struct inpcbtable *table, int bindhashsize, int connecthashsize) { static ONCE_DECL(control); TAILQ_INIT(&table->inpt_queue); table->inpt_porthashtbl = hashinit(bindhashsize, HASH_LIST, true, &table->inpt_porthash); table->inpt_bindhashtbl = hashinit(bindhashsize, HASH_LIST, true, &table->inpt_bindhash); table->inpt_connecthashtbl = hashinit(connecthashsize, HASH_LIST, true, &table->inpt_connecthash); table->inpt_lastlow = IPPORT_RESERVEDMAX; table->inpt_lastport = (in_port_t)anonportmax; RUN_ONCE(&control, inpcb_poolinit); } /* * inpcb_create: construct a new PCB and associated with a given socket. * Sets the PCB state to INP_ATTACHED and makes PCB globally visible. 
*/ int inpcb_create(struct socket *so, void *v) { struct inpcbtable *table = v; struct inpcb *inp; int s; #ifdef INET6 KASSERT(soaf(so) == AF_INET || soaf(so) == AF_INET6); if (soaf(so) == AF_INET) inp = pool_cache_get(in4pcb_pool_cache, PR_NOWAIT); else inp = pool_cache_get(in6pcb_pool_cache, PR_NOWAIT); #else KASSERT(soaf(so) == AF_INET); inp = pool_cache_get(in4pcb_pool_cache, PR_NOWAIT); #endif if (inp == NULL) return ENOBUFS; if (soaf(so) == AF_INET) memset(inp, 0, sizeof(struct in4pcb)); #ifdef INET6 else memset(inp, 0, sizeof(struct in6pcb)); #endif inp->inp_af = soaf(so); inp->inp_table = table; inp->inp_socket = so; inp->inp_portalgo = PORTALGO_DEFAULT; inp->inp_bindportonsend = false; if (inp->inp_af == AF_INET) { in4p_errormtu(inp) = -1; in4p_prefsrcip(inp).s_addr = INADDR_ANY; } #ifdef INET6 else { in6p_hops6(inp) = -1; /* use kernel default */ if (ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif #if defined(IPSEC) if (ipsec_enabled) { int error = ipsec_init_pcbpolicy(so, &inp->inp_sp); if (error != 0) { #ifdef INET6 if (inp->inp_af == AF_INET) pool_cache_put(in4pcb_pool_cache, inp); else pool_cache_put(in6pcb_pool_cache, inp); #else KASSERT(inp->inp_af == AF_INET); pool_cache_put(in4pcb_pool_cache, inp); #endif return error; } inp->inp_sp->sp_inp = inp; } #endif so->so_pcb = inp; s = splsoftnet(); TAILQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue); LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), inp, inp_lhash); inpcb_set_state(inp, INP_ATTACHED); splx(s); return 0; } static int inpcb_set_port(struct sockaddr_in *sin, struct inpcb *inp, kauth_cred_t cred) { struct inpcbtable *table = inp->inp_table; struct socket *so = inp->inp_socket; in_port_t *lastport; in_port_t lport = 0; enum kauth_network_req req; int error; if (inp->inp_flags & INP_LOWPORT) { #ifndef IPNOPRIVPORTS req = KAUTH_REQ_NETWORK_BIND_PRIVPORT; #else req = KAUTH_REQ_NETWORK_BIND_PORT; #endif lastport = &table->inpt_lastlow; } else { req = KAUTH_REQ_NETWORK_BIND_PORT; lastport = &table->inpt_lastport; } /* XXX-kauth: KAUTH_REQ_NETWORK_BIND_AUTOASSIGN_{,PRIV}PORT */ error = kauth_authorize_network(cred, KAUTH_NETWORK_BIND, req, so, sin, NULL); if (error) return EACCES; /* * Use RFC6056 randomized port selection */ error = portalgo_randport(&lport, inp, cred); if (error) return error; inp->inp_flags |= INP_ANONPORT; *lastport = lport; lport = htons(lport); inp->inp_lport = lport; inpcb_set_state(inp, INP_BOUND); return 0; } int inpcb_bindableaddr(const struct inpcb *inp, struct sockaddr_in *sin, kauth_cred_t cred) { int error = EADDRNOTAVAIL; struct ifaddr *ifa = NULL; int s; if (sin->sin_family != AF_INET) return EAFNOSUPPORT; s = pserialize_read_enter(); if (IN_MULTICAST(sin->sin_addr.s_addr)) { /* Always succeed; port reuse handled in inpcb_bind_port(). 
*/ } else if (!in_nullhost(sin->sin_addr)) { struct in_ifaddr *ia; ia = in_get_ia(sin->sin_addr); /* check for broadcast addresses */ if (ia == NULL) { ifa = ifa_ifwithaddr(sintosa(sin)); if (ifa != NULL) ia = ifatoia(ifa); else if ((inp->inp_flags & INP_BINDANY) != 0) { error = 0; goto error; } } if (ia == NULL) goto error; if (ia->ia4_flags & IN_IFF_DUPLICATED) goto error; } error = 0; error: pserialize_read_exit(s); return error; } static int inpcb_bind_addr(struct inpcb *inp, struct sockaddr_in *sin, kauth_cred_t cred) { int error; error = inpcb_bindableaddr(inp, sin, cred); if (error == 0) in4p_laddr(inp) = sin->sin_addr; return error; } static int inpcb_bind_port(struct inpcb *inp, struct sockaddr_in *sin, kauth_cred_t cred) { struct inpcbtable *table = inp->inp_table; struct socket *so = inp->inp_socket; int reuseport = (so->so_options & SO_REUSEPORT); int wild = 0, error; if (IN_MULTICAST(sin->sin_addr.s_addr)) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if (so->so_options & (SO_REUSEADDR | SO_REUSEPORT)) reuseport = SO_REUSEADDR|SO_REUSEPORT; } if (sin->sin_port == 0) { error = inpcb_set_port(sin, inp, cred); if (error) return error; } else { struct inpcb *t; vestigial_inpcb_t vestige; #ifdef INET6 struct inpcb *t6; struct in6_addr mapped; #endif enum kauth_network_req req; if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) wild = 1; #ifndef IPNOPRIVPORTS if (ntohs(sin->sin_port) < IPPORT_RESERVED) req = KAUTH_REQ_NETWORK_BIND_PRIVPORT; else #endif /* !IPNOPRIVPORTS */ req = KAUTH_REQ_NETWORK_BIND_PORT; error = kauth_authorize_network(cred, KAUTH_NETWORK_BIND, req, so, sin, NULL); if (error) return EACCES; #ifdef INET6 in6_in_2_v4mapin6(&sin->sin_addr, &mapped); t6 = in6pcb_lookup_local(table, &mapped, sin->sin_port, wild, &vestige); if (t6 && (reuseport & t6->inp_socket->so_options) == 0) return EADDRINUSE; if (!t6 && vestige.valid) { if (!!reuseport != !!vestige.reuse_port) { return EADDRINUSE; } } #endif /* XXX-kauth */ if (so->so_uidinfo->ui_uid && !IN_MULTICAST(sin->sin_addr.s_addr)) { t = inpcb_lookup_local(table, sin->sin_addr, sin->sin_port, 1, &vestige); /* * XXX: investigate ramifications of loosening this * restriction so that as long as both ports have * SO_REUSEPORT allow the bind */ if (t && (!in_nullhost(sin->sin_addr) || !in_nullhost(in4p_laddr(t)) || (t->inp_socket->so_options & SO_REUSEPORT) == 0) && (so->so_uidinfo->ui_uid != t->inp_socket->so_uidinfo->ui_uid)) { return EADDRINUSE; } if (!t && vestige.valid) { if ((!in_nullhost(sin->sin_addr) || !in_nullhost(vestige.laddr.v4) || !vestige.reuse_port) && so->so_uidinfo->ui_uid != vestige.uid) { return EADDRINUSE; } } } t = inpcb_lookup_local(table, sin->sin_addr, sin->sin_port, wild, &vestige); if (t && (reuseport & t->inp_socket->so_options) == 0) return EADDRINUSE; if (!t && vestige.valid && !(reuseport && vestige.reuse_port)) return EADDRINUSE; inp->inp_lport = sin->sin_port; inpcb_set_state(inp, INP_BOUND); } LIST_REMOVE(inp, inp_lhash); LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), inp, inp_lhash); return 0; } /* * inpcb_bind: assign a local IP address and port number to the PCB. * * If the address is not a wildcard, verify that it corresponds to a * local interface. If a port is specified and it is privileged, then * check the permission. 
Check whether the address or port is in use, * and if so, whether we can re-use them. */ int inpcb_bind(void *v, struct sockaddr_in *sin, struct lwp *l) { struct inpcb *inp = v; struct sockaddr_in lsin; int error; if (inp->inp_af != AF_INET) return EINVAL; if (inp->inp_lport || !in_nullhost(in4p_laddr(inp))) return EINVAL; if (NULL != sin) { if (sin->sin_len != sizeof(*sin)) return EINVAL; } else { lsin = *((const struct sockaddr_in *) inp->inp_socket->so_proto->pr_domain->dom_sa_any); sin = &lsin; } /* Bind address. */ error = inpcb_bind_addr(inp, sin, l->l_cred); if (error) return error; /* Bind port. */ error = inpcb_bind_port(inp, sin, l->l_cred); if (error) { in4p_laddr(inp).s_addr = INADDR_ANY; return error; } return 0; } /* * inpcb_connect: connect from a socket to a specified address, i.e., * assign a foreign IP address and port number to the PCB. * * Both address and port must be specified in the name argument. * If there is no local address for this socket yet, then pick one. */ int inpcb_connect(void *v, struct sockaddr_in *sin, struct lwp *l) { struct inpcb *inp = v; vestigial_inpcb_t vestige; int error; struct in_addr laddr; if (inp->inp_af != AF_INET) return EINVAL; if (sin->sin_len != sizeof (*sin)) return EINVAL; if (sin->sin_family != AF_INET) return EAFNOSUPPORT; if (sin->sin_port == 0) return EADDRNOTAVAIL; if (IN_MULTICAST(sin->sin_addr.s_addr) && inp->inp_socket->so_type == SOCK_STREAM) return EADDRNOTAVAIL; if (!IN_ADDRLIST_READER_EMPTY()) { /* * If the destination address is INADDR_ANY, * use any local address (likely loopback). * If the supplied address is INADDR_BROADCAST, * use the broadcast address of an interface * which supports broadcast. (loopback does not) */ if (in_nullhost(sin->sin_addr)) { /* XXX racy */ sin->sin_addr = IN_ADDRLIST_READER_FIRST()->ia_addr.sin_addr; } else if (sin->sin_addr.s_addr == INADDR_BROADCAST) { struct in_ifaddr *ia; int s = pserialize_read_enter(); IN_ADDRLIST_READER_FOREACH(ia) { if (ia->ia_ifp->if_flags & IFF_BROADCAST) { sin->sin_addr = ia->ia_broadaddr.sin_addr; break; } } pserialize_read_exit(s); } } /* * If we haven't bound which network number to use as ours, * we will use the number of the outgoing interface. * This depends on having done a routing lookup, which * we will probably have to do anyway, so we might * as well do it now. On the other hand if we are * sending to multiple destinations we may have already * done the lookup, so see if we can use the route * from before. In any case, we only * chose a port number once, even if sending to multiple * destinations. 
*/ if (in_nullhost(in4p_laddr(inp))) { int xerror; struct in_ifaddr *ia, *_ia; int s; struct psref psref; int bound; bound = curlwp_bind(); ia = in_selectsrc(sin, &inp->inp_route, inp->inp_socket->so_options, inp->inp_moptions, &xerror, &psref); if (ia == NULL) { curlwp_bindx(bound); if (xerror == 0) xerror = EADDRNOTAVAIL; return xerror; } s = pserialize_read_enter(); _ia = in_get_ia(IA_SIN(ia)->sin_addr); if (_ia == NULL && (inp->inp_flags & INP_BINDANY) == 0) { pserialize_read_exit(s); ia4_release(ia, &psref); curlwp_bindx(bound); return EADDRNOTAVAIL; } pserialize_read_exit(s); laddr = IA_SIN(ia)->sin_addr; ia4_release(ia, &psref); curlwp_bindx(bound); } else laddr = in4p_laddr(inp); if (inpcb_lookup(inp->inp_table, sin->sin_addr, sin->sin_port, laddr, inp->inp_lport, &vestige) != NULL || vestige.valid) { return EADDRINUSE; } if (in_nullhost(in4p_laddr(inp))) { if (inp->inp_lport == 0) { error = inpcb_bind(inp, NULL, l); /* * This used to ignore the return value * completely, but we need to check for * ephemeral port shortage. * And attempts to request low ports if not root. */ if (error != 0) return error; } in4p_laddr(inp) = laddr; } in4p_faddr(inp) = sin->sin_addr; inp->inp_fport = sin->sin_port; /* Late bind, if needed */ if (inp->inp_bindportonsend) { struct sockaddr_in lsin = *((const struct sockaddr_in *) inp->inp_socket->so_proto->pr_domain->dom_sa_any); lsin.sin_addr = in4p_laddr(inp); lsin.sin_port = 0; if ((error = inpcb_bind_port(inp, &lsin, l->l_cred)) != 0) return error; } inpcb_set_state(inp, INP_CONNECTED); #if defined(IPSEC) if (ipsec_enabled && inp->inp_socket->so_type == SOCK_STREAM) ipsec_pcbconn(inp->inp_sp); #endif return 0; } /* * inpcb_disconnect: remove any foreign IP/port association. * * Note: destroys the PCB if socket was closed. */ void inpcb_disconnect(void *v) { struct inpcb *inp = v; if (inp->inp_af != AF_INET) return; in4p_faddr(inp) = zeroin_addr; inp->inp_fport = 0; inpcb_set_state(inp, INP_BOUND); #if defined(IPSEC) if (ipsec_enabled) ipsec_pcbdisconn(inp->inp_sp); #endif if (inp->inp_socket->so_state & SS_NOFDREF) inpcb_destroy(inp); } /* * inpcb_destroy: destroy PCB as well as the associated socket. */ void inpcb_destroy(void *v) { struct inpcb *inp = v; struct socket *so = inp->inp_socket; int s; KASSERT(inp->inp_af == AF_INET || inp->inp_af == AF_INET6); #if defined(IPSEC) if (ipsec_enabled) ipsec_delete_pcbpolicy(inp); #endif so->so_pcb = NULL; s = splsoftnet(); inpcb_set_state(inp, INP_ATTACHED); LIST_REMOVE(inp, inp_lhash); TAILQ_REMOVE(&inp->inp_table->inpt_queue, inp, inp_queue); splx(s); if (inp->inp_options) { m_free(inp->inp_options); } rtcache_free(&inp->inp_route); ip_freemoptions(inp->inp_moptions); #ifdef INET6 if (inp->inp_af == AF_INET6) { if (in6p_outputopts(inp) != NULL) { ip6_clearpktopts(in6p_outputopts(inp), -1); free(in6p_outputopts(inp), M_IP6OPT); } ip6_freemoptions(in6p_moptions(inp)); } #endif sofree(so); /* drops the socket's lock */ #ifdef INET6 if (inp->inp_af == AF_INET) pool_cache_put(in4pcb_pool_cache, inp); else pool_cache_put(in6pcb_pool_cache, inp); #else KASSERT(inp->inp_af == AF_INET); pool_cache_put(in4pcb_pool_cache, inp); #endif mutex_enter(softnet_lock); /* reacquire the softnet_lock */ } /* * inpcb_fetch_sockaddr: fetch the local IP address and port number. */ void inpcb_fetch_sockaddr(struct inpcb *inp, struct sockaddr_in *sin) { if (inp->inp_af != AF_INET) return; sockaddr_in_init(sin, &in4p_laddr(inp), inp->inp_lport); } /* * inpcb_fetch_peeraddr: fetch the foreign IP address and port number. 
*/ void inpcb_fetch_peeraddr(struct inpcb *inp, struct sockaddr_in *sin) { if (inp->inp_af != AF_INET) return; sockaddr_in_init(sin, &in4p_faddr(inp), inp->inp_fport); } /* * inpcb_notify: pass some notification to all connections of a protocol * associated with destination address. The local address and/or port * numbers may be specified to limit the search. The "usual action" will * be taken, depending on the command. * * The caller must filter any commands that are not interesting (e.g., * no error in the map). Call the protocol specific routine (if any) to * report any errors for each matching socket. * * Must be called at splsoftnet. */ int inpcb_notify(struct inpcbtable *table, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int errno, void (*notify)(struct inpcb *, int)) { struct inpcbhead *head; struct inpcb *inp; in_port_t fport = fport_arg, lport = lport_arg; int nmatch; if (in_nullhost(faddr) || notify == NULL) return 0; nmatch = 0; head = INPCBHASH_CONNECT(table, faddr, fport, laddr, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET) continue; if (in_hosteq(in4p_faddr(inp), faddr) && inp->inp_fport == fport && inp->inp_lport == lport && in_hosteq(in4p_laddr(inp), laddr)) { (*notify)(inp, errno); nmatch++; } } return nmatch; } void inpcb_notifyall(struct inpcbtable *table, struct in_addr faddr, int errno, void (*notify)(struct inpcb *, int)) { struct inpcb *inp; if (in_nullhost(faddr) || notify == NULL) return; TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { if (inp->inp_af != AF_INET) continue; if (in_hosteq(in4p_faddr(inp), faddr)) (*notify)(inp, errno); } } void in_purgeifmcast(struct ip_moptions *imo, struct ifnet *ifp) { int i, gap; /* The owner of imo should be protected by solock */ KASSERT(ifp != NULL); if (imo == NULL) return; /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_if_index == ifp->if_index) imo->imo_multicast_if_index = 0; /* * Drop multicast group membership if we joined * through the interface being detached. */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { in_delmulti(imo->imo_membership[i]); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } void inpcb_purgeif0(struct inpcbtable *table, struct ifnet *ifp) { struct inpcb *inp; TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { bool need_unlock = false; if (inp->inp_af != AF_INET) continue; /* The caller holds either one of inps' lock */ if (!inp_locked(inp)) { inp_lock(inp); need_unlock = true; } in_purgeifmcast(inp->inp_moptions, ifp); if (need_unlock) inp_unlock(inp); } } void inpcb_purgeif(struct inpcbtable *table, struct ifnet *ifp) { struct rtentry *rt; struct inpcb *inp; TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { if (inp->inp_af != AF_INET) continue; if ((rt = rtcache_validate(&inp->inp_route)) != NULL && rt->rt_ifp == ifp) { rtcache_unref(rt, &inp->inp_route); inpcb_rtchange(inp, 0); } else rtcache_unref(rt, &inp->inp_route); } } /* * inpcb_losing: check for alternatives when higher level complains about * service problems. For now, invalidate cached routing information. * If the route was created dynamically (by a redirect), time to try a * default gateway again. 
*/ void inpcb_losing(struct inpcb *inp) { struct rtentry *rt; struct rt_addrinfo info; if (inp->inp_af != AF_INET) return; if ((rt = rtcache_validate(&inp->inp_route)) == NULL) return; memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = rtcache_getdst(&inp->inp_route); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0); if (rt->rt_flags & RTF_DYNAMIC) { int error; struct rtentry *nrt; error = rtrequest(RTM_DELETE, rt_getkey(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, &nrt); rtcache_unref(rt, &inp->inp_route); if (error == 0) { rt_newmsg_dynamic(RTM_DELETE, nrt); rt_free(nrt); } } else rtcache_unref(rt, &inp->inp_route); /* * A new route can be allocated * the next time output is attempted. */ rtcache_free(&inp->inp_route); } /* * inpcb_rtchange: after a routing change, flush old routing. * A new route can be allocated the next time output is attempted. */ void inpcb_rtchange(struct inpcb *inp, int errno) { if (inp->inp_af != AF_INET) return; rtcache_free(&inp->inp_route); /* XXX SHOULD NOTIFY HIGHER-LEVEL PROTOCOLS */ } /* * inpcb_lookup_local: find a PCB by looking at the local port and matching * the local address or resolving the wildcards. Primarily used to detect * when the local address is already in use. */ struct inpcb * inpcb_lookup_local(struct inpcbtable *table, struct in_addr laddr, u_int lport_arg, int lookup_wildcard, vestigial_inpcb_t *vp) { struct inpcbhead *head; struct inpcb *inp; struct inpcb *match = NULL; int matchwild = 3; int wildcard; in_port_t lport = lport_arg; if (vp) vp->valid = 0; head = INPCBHASH_PORT(table, lport); LIST_FOREACH(inp, head, inp_lhash) { if (inp->inp_af != AF_INET) continue; if (inp->inp_lport != lport) continue; /* * check if inp's faddr and laddr match with ours. * our faddr is considered null. * count the number of wildcard matches. (0 - 2) * * null null match * A null wildcard match * null B wildcard match * A B non match * A A match */ wildcard = 0; if (!in_nullhost(in4p_faddr(inp))) wildcard++; if (in_nullhost(in4p_laddr(inp))) { if (!in_nullhost(laddr)) wildcard++; } else { if (in_nullhost(laddr)) wildcard++; else { if (!in_hosteq(in4p_laddr(inp), laddr)) continue; } } if (wildcard && !lookup_wildcard) continue; /* * prefer an address with less wildcards. */ if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } if (match && matchwild == 0) return match; if (vp && table->vestige) { void *state = (*table->vestige->init_ports4)(laddr, lport_arg, lookup_wildcard); vestigial_inpcb_t better; bool has_better = false; while (table->vestige && (*table->vestige->next_port4)(state, vp)) { if (vp->lport != lport) continue; wildcard = 0; if (!in_nullhost(vp->faddr.v4)) wildcard++; if (in_nullhost(vp->laddr.v4)) { if (!in_nullhost(laddr)) wildcard++; } else { if (in_nullhost(laddr)) wildcard++; else { if (!in_hosteq(vp->laddr.v4, laddr)) continue; } } if (wildcard && !lookup_wildcard) continue; if (wildcard < matchwild) { better = *vp; has_better = true; matchwild = wildcard; if (matchwild == 0) break; } } if (has_better) { *vp = better; return 0; } } return match; } #ifdef DIAGNOSTIC int inpcb_notifymiss = 0; #endif /* * inpcb_lookup: perform a full 4-tuple PCB lookup. 
*/ struct inpcb * inpcb_lookup(struct inpcbtable *table, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, vestigial_inpcb_t *vp) { struct inpcbhead *head; struct inpcb *inp; in_port_t fport = fport_arg, lport = lport_arg; if (vp) vp->valid = 0; head = INPCBHASH_CONNECT(table, faddr, fport, laddr, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET) continue; if (in_hosteq(in4p_faddr(inp), faddr) && inp->inp_fport == fport && inp->inp_lport == lport && in_hosteq(in4p_laddr(inp), laddr)) goto out; } if (vp && table->vestige) { if ((*table->vestige->lookup4)(faddr, fport_arg, laddr, lport_arg, vp)) return 0; } #ifdef DIAGNOSTIC if (inpcb_notifymiss) { printf("inpcb_lookup: faddr=%08x fport=%d laddr=%08x lport=%d\n", ntohl(faddr.s_addr), ntohs(fport), ntohl(laddr.s_addr), ntohs(lport)); } #endif return 0; out: /* Move this PCB to the head of hash chain. */ if (inp != LIST_FIRST(head)) { LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); } return inp; } /* * inpcb_lookup_bound: find a PCB by looking at the local address and port. * Primarily used to find the listening (i.e., already bound) socket. */ struct inpcb * inpcb_lookup_bound(struct inpcbtable *table, struct in_addr laddr, u_int lport_arg) { struct inpcbhead *head; struct inpcb *inp; in_port_t lport = lport_arg; head = INPCBHASH_BIND(table, laddr, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET) continue; if (inp->inp_lport == lport && in_hosteq(in4p_laddr(inp), laddr)) goto out; } head = INPCBHASH_BIND(table, zeroin_addr, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET) continue; if (inp->inp_lport == lport && in_hosteq(in4p_laddr(inp), zeroin_addr)) goto out; } #ifdef DIAGNOSTIC if (inpcb_notifymiss) { printf("inpcb_lookup_bound: laddr=%08x lport=%d\n", ntohl(laddr.s_addr), ntohs(lport)); } #endif return 0; out: /* Move this PCB to the head of hash chain. */ if (inp != LIST_FIRST(head)) { LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); } return inp; } void inpcb_set_state(struct inpcb *inp, int state) { #ifdef INET6 if (inp->inp_af == AF_INET6) { in6pcb_set_state(inp, state); return; } #else if (inp->inp_af != AF_INET) return; #endif if (inp->inp_state > INP_ATTACHED) LIST_REMOVE(inp, inp_hash); switch (state) { case INP_BOUND: LIST_INSERT_HEAD(INPCBHASH_BIND(inp->inp_table, in4p_laddr(inp), inp->inp_lport), inp, inp_hash); break; case INP_CONNECTED: LIST_INSERT_HEAD(INPCBHASH_CONNECT(inp->inp_table, in4p_faddr(inp), inp->inp_fport, in4p_laddr(inp), inp->inp_lport), inp, inp_hash); break; } inp->inp_state = state; } struct rtentry * inpcb_rtentry(struct inpcb *inp) { struct route *ro; union { struct sockaddr dst; struct sockaddr_in dst4; } u; #ifdef INET6 if (inp->inp_af == AF_INET6) return in6pcb_rtentry(inp); #endif if (inp->inp_af != AF_INET) return NULL; ro = &inp->inp_route; sockaddr_in_init(&u.dst4, &in4p_faddr(inp), 0); return rtcache_lookup(ro, &u.dst); } void inpcb_rtentry_unref(struct rtentry *rt, struct inpcb *inp) { rtcache_unref(rt, &inp->inp_route); }
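To place the functions above in context, here is a hedged sketch of how a protocol's user-request code might drive the PCB life cycle: attach creates the PCB (INP_ATTACHED), bind fixes the local half (INP_BOUND), connect fixes the foreign half (INP_CONNECTED), and detach destroys it. The table my_table, the my_* wrappers, and the hash sizes are illustrative stand-ins for a real per-protocol inpcbtable such as the TCP or UDP one; locking and most error handling are omitted.

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>

static struct inpcbtable my_table;	/* hypothetical per-protocol table */

static void
my_proto_init(void)
{

	/* hash table sizes are illustrative */
	inpcb_init(&my_table, 128, 128);
}

static int
my_attach(struct socket *so)
{

	/* PCB allocated, set to INP_ATTACHED, placed on the global queue */
	return inpcb_create(so, &my_table);
}

static int
my_bind(struct socket *so, struct sockaddr_in *sin, struct lwp *l)
{

	/* local address/port chosen; PCB moves to INP_BOUND */
	return inpcb_bind(so->so_pcb, sin, l);
}

static int
my_connect(struct socket *so, struct sockaddr_in *sin, struct lwp *l)
{

	/* foreign address/port fixed; PCB moves to INP_CONNECTED */
	return inpcb_connect(so->so_pcb, sin, l);
}

static void
my_detach(struct socket *so)
{

	/* removes the PCB from the hashes and frees it */
	inpcb_destroy(so->so_pcb);
}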
/* $NetBSD: tmpfs_specops.c,v 1.16 2021/07/19 01:30:25 dholland Exp $ */ /* * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * tmpfs vnode interface for special devices. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tmpfs_specops.c,v 1.16 2021/07/19 01:30:25 dholland Exp $"); #include <sys/param.h> #include <sys/vnode.h> #include <fs/tmpfs/tmpfs.h> #include <fs/tmpfs/tmpfs_specops.h> /* * vnode operations vector used for special devices stored in a tmpfs * file system.
*/ int (**tmpfs_specop_p)(void *); const struct vnodeopv_entry_desc tmpfs_specop_entries[] = { { &vop_default_desc, vn_default_error }, GENFS_SPECOP_ENTRIES, { &vop_close_desc, tmpfs_spec_close }, { &vop_access_desc, tmpfs_access }, { &vop_accessx_desc, genfs_accessx }, { &vop_getattr_desc, tmpfs_getattr }, { &vop_setattr_desc, tmpfs_setattr }, { &vop_read_desc, tmpfs_spec_read }, { &vop_write_desc, tmpfs_spec_write }, { &vop_fcntl_desc, genfs_fcntl }, { &vop_fsync_desc, spec_fsync }, { &vop_inactive_desc, tmpfs_inactive }, { &vop_reclaim_desc, tmpfs_reclaim }, { &vop_lock_desc, genfs_lock }, { &vop_unlock_desc, genfs_unlock }, { &vop_print_desc, tmpfs_print }, { &vop_islocked_desc, genfs_islocked }, { &vop_bwrite_desc, vn_bwrite }, { NULL, NULL } }; const struct vnodeopv_desc tmpfs_specop_opv_desc = { &tmpfs_specop_p, tmpfs_specop_entries }; int tmpfs_spec_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap __unused = v; return VOCALL(spec_vnodeop_p, VOFFSET(vop_close), v); } int tmpfs_spec_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp = ap->a_vp; tmpfs_update(vp, TMPFS_UPDATE_ATIME); return VOCALL(spec_vnodeop_p, VOFFSET(vop_read), v); } int tmpfs_spec_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp = ap->a_vp; tmpfs_update(vp, TMPFS_UPDATE_MTIME); return VOCALL(spec_vnodeop_p, VOFFSET(vop_write), v); }
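The wrappers above all follow the same shape: perform any tmpfs-side bookkeeping, then pass the call through to the special-device vnode operations with VOCALL; tmpfs_spec_read and tmpfs_spec_write exist so tmpfs can update its timestamps before delegating. The function below is a hedged sketch of that delegation pattern only; it is purely illustrative and is not part of tmpfs.

int
tmpfs_spec_example_ioctl(void *v)
{

	/* no tmpfs-specific state to update; just delegate to spec ops */
	return VOCALL(spec_vnodeop_p, VOFFSET(vop_ioctl), v);
}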
/* $NetBSD: ufs_quota1.c,v 1.26 2023/02/22 21:49:45 riastradh Exp $ */ /* * Copyright (c)
1982, 1986, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ufs_quota1.c,v 1.26 2023/02/22 21:49:45 riastradh Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/file.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/kauth.h> #include <ufs/ufs/quota1.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_quota.h> static int chkdqchg(struct inode *, int64_t, kauth_cred_t, int); static int chkiqchg(struct inode *, int32_t, kauth_cred_t, int); /* * Update disk usage, and take corrective action. 
*/ int chkdq1(struct inode *ip, int64_t change, kauth_cred_t cred, int flags) { struct dquot *dq; int i; int ncurblocks, error; if ((error = getinoquota(ip)) != 0) return error; if (change == 0) return (0); if (change < 0) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; mutex_enter(&dq->dq_interlock); ncurblocks = dq->dq_curblocks + change; if (ncurblocks >= 0) dq->dq_curblocks = ncurblocks; else dq->dq_curblocks = 0; dq->dq_flags &= ~DQ_WARN(QL_BLOCK); dq->dq_flags |= DQ_MOD; mutex_exit(&dq->dq_interlock); } return (0); } for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; if ((flags & FORCE) == 0 && kauth_authorize_system(cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT, KAUTH_ARG(i), KAUTH_ARG(QL_BLOCK), NULL) != 0) { mutex_enter(&dq->dq_interlock); error = chkdqchg(ip, change, cred, i); mutex_exit(&dq->dq_interlock); if (error != 0) return (error); } } for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; mutex_enter(&dq->dq_interlock); dq->dq_curblocks += change; dq->dq_flags |= DQ_MOD; mutex_exit(&dq->dq_interlock); } return (0); } /* * Check for a valid change to a users allocation. * Issue an error message if appropriate. */ static int chkdqchg(struct inode *ip, int64_t change, kauth_cred_t cred, int type) { struct dquot *dq = ip->i_dquot[type]; long ncurblocks = dq->dq_curblocks + change; KASSERT(mutex_owned(&dq->dq_interlock)); /* * If user would exceed their hard limit, disallow space allocation. */ if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) { if ((dq->dq_flags & DQ_WARN(QL_BLOCK)) == 0 && ip->i_uid == kauth_cred_geteuid(cred)) { uprintf("\n%s: write failed, %s disk limit reached\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type]); dq->dq_flags |= DQ_WARN(QL_BLOCK); } return (EDQUOT); } /* * If user is over their soft limit for too long, disallow space * allocation. Reset time limit as they cross their soft limit. */ if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { if (dq->dq_curblocks < dq->dq_bsoftlimit) { dq->dq_btime = time_second + ip->i_ump->umq1_btime[type]; if (ip->i_uid == kauth_cred_geteuid(cred)) uprintf("\n%s: warning, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "disk quota exceeded"); return (0); } if (time_second > dq->dq_btime) { if ((dq->dq_flags & DQ_WARN(QL_BLOCK)) == 0 && ip->i_uid == kauth_cred_geteuid(cred)) { uprintf("\n%s: write failed, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "disk quota exceeded for too long"); dq->dq_flags |= DQ_WARN(QL_BLOCK); } return (EDQUOT); } } return (0); } /* * Check the inode limit, applying corrective action. 
*/ int chkiq1(struct inode *ip, int32_t change, kauth_cred_t cred, int flags) { struct dquot *dq; int i; int ncurinodes, error; if ((error = getinoquota(ip)) != 0) return error; if (change == 0) return (0); if (change < 0) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; mutex_enter(&dq->dq_interlock); ncurinodes = dq->dq_curinodes + change; if (ncurinodes >= 0) dq->dq_curinodes = ncurinodes; else dq->dq_curinodes = 0; dq->dq_flags &= ~DQ_WARN(QL_FILE); dq->dq_flags |= DQ_MOD; mutex_exit(&dq->dq_interlock); } return (0); } for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; if ((flags & FORCE) == 0 && kauth_authorize_system(cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT, KAUTH_ARG(i), KAUTH_ARG(QL_FILE), NULL) != 0) { mutex_enter(&dq->dq_interlock); error = chkiqchg(ip, change, cred, i); mutex_exit(&dq->dq_interlock); if (error != 0) return (error); } } for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; mutex_enter(&dq->dq_interlock); dq->dq_curinodes += change; dq->dq_flags |= DQ_MOD; mutex_exit(&dq->dq_interlock); } return (0); } /* * Check for a valid change to a users allocation. * Issue an error message if appropriate. */ static int chkiqchg(struct inode *ip, int32_t change, kauth_cred_t cred, int type) { struct dquot *dq = ip->i_dquot[type]; long ncurinodes = dq->dq_curinodes + change; KASSERT(mutex_owned(&dq->dq_interlock)); /* * If user would exceed their hard limit, disallow inode allocation. */ if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { if ((dq->dq_flags & DQ_WARN(QL_FILE)) == 0 && ip->i_uid == kauth_cred_geteuid(cred)) { uprintf("\n%s: write failed, %s inode limit reached\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type]); dq->dq_flags |= DQ_WARN(QL_FILE); } return (EDQUOT); } /* * If user is over their soft limit for too long, disallow inode * allocation. Reset time limit as they cross their soft limit. */ if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { if (dq->dq_curinodes < dq->dq_isoftlimit) { dq->dq_itime = time_second + ip->i_ump->umq1_itime[type]; if (ip->i_uid == kauth_cred_geteuid(cred)) uprintf("\n%s: warning, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "inode quota exceeded"); return (0); } if (time_second > dq->dq_itime) { if ((dq->dq_flags & DQ_WARN(QL_FILE)) == 0 && ip->i_uid == kauth_cred_geteuid(cred)) { uprintf("\n%s: write failed, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "inode quota exceeded for too long"); dq->dq_flags |= DQ_WARN(QL_FILE); } return (EDQUOT); } } return (0); } int quota1_umount(struct mount *mp, int flags) { int i, error; struct ufsmount *ump = VFSTOUFS(mp); struct lwp *l = curlwp; if ((ump->um_flags & UFS_QUOTA) == 0) return 0; if ((error = vflush(mp, NULLVP, SKIPSYSTEM | flags)) != 0) return (error); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] != NULLVP) { quota1_handle_cmd_quotaoff(l, ump, i); } } return 0; } /* * Code to process quotactl commands. */ /* * set up a quota file for a particular file system. 
*/ int quota1_handle_cmd_quotaon(struct lwp *l, struct ufsmount *ump, int type, const char *fname) { struct mount *mp = ump->um_mountp; struct vnode *vp, **vpp; struct vnode_iterator *marker; struct dquot *dq; int error; struct pathbuf *pb; if (type < 0 || type >= MAXQUOTAS) return EINVAL; if (ump->um_flags & UFS_QUOTA2) { uprintf("%s: quotas v2 already enabled\n", mp->mnt_stat.f_mntonname); return (EBUSY); } if (mp->mnt_wapbl != NULL) { printf("%s: quota v1 cannot be used with -o log\n", mp->mnt_stat.f_mntonname); return (EOPNOTSUPP); } vpp = &ump->um_quotas[type]; pb = pathbuf_create(fname); if (pb == NULL) { return ENOMEM; } error = vn_open(NULL, pb, 0, FREAD|FWRITE, 0, &vp, NULL, NULL); if (error != 0) { pathbuf_destroy(pb); return error; } pathbuf_destroy(pb); VOP_UNLOCK(vp); if (vp->v_type != VREG) { (void) vn_close(vp, FREAD|FWRITE, l->l_cred); return (EACCES); } if (*vpp != vp) quota1_handle_cmd_quotaoff(l, ump, type); mutex_enter(&dqlock); while ((ump->umq1_qflags[type] & (QTF_CLOSING | QTF_OPENING)) != 0) cv_wait(&dqcv, &dqlock); ump->umq1_qflags[type] |= QTF_OPENING; mutex_exit(&dqlock); mp->mnt_flag |= MNT_QUOTA; vp->v_vflag |= VV_SYSTEM; /* XXXSMP */ *vpp = vp; /* * Save the credential of the process that turned on quotas. * Set up the time limits for this quota. */ kauth_cred_hold(l->l_cred); ump->um_cred[type] = l->l_cred; ump->umq1_btime[type] = MAX_DQ_TIME; ump->umq1_itime[type] = MAX_IQ_TIME; if (dqget(NULLVP, 0, ump, type, &dq) == 0) { if (dq->dq_btime > 0) ump->umq1_btime[type] = dq->dq_btime; if (dq->dq_itime > 0) ump->umq1_itime[type] = dq->dq_itime; dqrele(NULLVP, dq); } /* * Search vnodes associated with this mount point, * adding references to quota file being opened. * NB: only need to add dquot's for inodes being modified. */ vfs_vnode_iterator_init(mp, &marker); while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) { error = vn_lock(vp, LK_EXCLUSIVE); if (error) { vrele(vp); continue; } mutex_enter(vp->v_interlock); if (VTOI(vp) == NULL || vp->v_type == VNON || vp->v_writecount == 0) { mutex_exit(vp->v_interlock); vput(vp); continue; } mutex_exit(vp->v_interlock); if ((error = getinoquota(VTOI(vp))) != 0) { vput(vp); break; } vput(vp); } vfs_vnode_iterator_destroy(marker); mutex_enter(&dqlock); ump->umq1_qflags[type] &= ~QTF_OPENING; cv_broadcast(&dqcv); if (error == 0) ump->um_flags |= UFS_QUOTA; mutex_exit(&dqlock); if (error) quota1_handle_cmd_quotaoff(l, ump, type); return (error); } /* * turn off disk quotas for a filesystem. */ int quota1_handle_cmd_quotaoff(struct lwp *l, struct ufsmount *ump, int type) { struct mount *mp = ump->um_mountp; struct vnode *vp; struct vnode *qvp; struct vnode_iterator *marker; struct dquot *dq; struct inode *ip; kauth_cred_t cred; int i, error; if (type < 0 || type >= MAXQUOTAS) return EINVAL; mutex_enter(&dqlock); while ((ump->umq1_qflags[type] & (QTF_CLOSING | QTF_OPENING)) != 0) cv_wait(&dqcv, &dqlock); if ((qvp = ump->um_quotas[type]) == NULLVP) { mutex_exit(&dqlock); return (0); } ump->umq1_qflags[type] |= QTF_CLOSING; mutex_exit(&dqlock); /* * Search vnodes associated with this mount point, * deleting any references to quota file being closed. 
*/ vfs_vnode_iterator_init(mp, &marker); while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) { error = vn_lock(vp, LK_EXCLUSIVE); if (error) { vrele(vp); continue; } ip = VTOI(vp); if (ip == NULL || vp->v_type == VNON) { vput(vp); continue; } dq = ip->i_dquot[type]; ip->i_dquot[type] = NODQUOT; dqrele(vp, dq); vput(vp); } vfs_vnode_iterator_destroy(marker); #ifdef DIAGNOSTIC dqflush(qvp); #endif qvp->v_vflag &= ~VV_SYSTEM; error = vn_close(qvp, FREAD|FWRITE, l->l_cred); mutex_enter(&dqlock); ump->um_quotas[type] = NULLVP; cred = ump->um_cred[type]; ump->um_cred[type] = NOCRED; for (i = 0; i < MAXQUOTAS; i++) if (ump->um_quotas[i] != NULLVP) break; ump->umq1_qflags[type] &= ~QTF_CLOSING; if (i == MAXQUOTAS) ump->um_flags &= ~UFS_QUOTA; cv_broadcast(&dqcv); mutex_exit(&dqlock); kauth_cred_free(cred); if (i == MAXQUOTAS) mp->mnt_flag &= ~MNT_QUOTA; return (error); } int quota1_handle_cmd_get(struct ufsmount *ump, const struct quotakey *qk, struct quotaval *qv) { struct dquot *dq; int error; struct quotaval blocks, files; int idtype; id_t id; idtype = qk->qk_idtype; id = qk->qk_id; if (ump->um_quotas[idtype] == NULLVP) return ENODEV; if (id == QUOTA_DEFAULTID) { /* we want the grace period of id 0 */ if ((error = dqget(NULLVP, 0, ump, idtype, &dq)) != 0) return error; } else { if ((error = dqget(NULLVP, id, ump, idtype, &dq)) != 0) return error; } dqblk_to_quotavals(&dq->dq_un.dq1_dqb, &blocks, &files); dqrele(NULLVP, dq); if (id == QUOTA_DEFAULTID) { if (blocks.qv_expiretime > 0) blocks.qv_grace = blocks.qv_expiretime; else blocks.qv_grace = MAX_DQ_TIME; if (files.qv_expiretime > 0) files.qv_grace = files.qv_expiretime; else files.qv_grace = MAX_DQ_TIME; } switch (qk->qk_objtype) { case QUOTA_OBJTYPE_BLOCKS: *qv = blocks; break; case QUOTA_OBJTYPE_FILES: *qv = files; break; default: return EINVAL; } return 0; } static uint32_t quota1_encode_limit(uint64_t lim) { if (lim == QUOTA_NOLIMIT || lim >= 0xffffffff) { return 0; } return lim; } int quota1_handle_cmd_put(struct ufsmount *ump, const struct quotakey *key, const struct quotaval *val) { struct dquot *dq; struct dqblk dqb; int error; switch (key->qk_idtype) { case QUOTA_IDTYPE_USER: case QUOTA_IDTYPE_GROUP: break; default: return EINVAL; } switch (key->qk_objtype) { case QUOTA_OBJTYPE_BLOCKS: case QUOTA_OBJTYPE_FILES: break; default: return EINVAL; } if (ump->um_quotas[key->qk_idtype] == NULLVP) return ENODEV; if (key->qk_id == QUOTA_DEFAULTID) { /* just update grace times */ id_t id = 0; if ((error = dqget(NULLVP, id, ump, key->qk_idtype, &dq)) != 0) return error; mutex_enter(&dq->dq_interlock); if (val->qv_grace != QUOTA_NOTIME) { if (key->qk_objtype == QUOTA_OBJTYPE_BLOCKS) ump->umq1_btime[key->qk_idtype] = dq->dq_btime = val->qv_grace; if (key->qk_objtype == QUOTA_OBJTYPE_FILES) ump->umq1_itime[key->qk_idtype] = dq->dq_itime = val->qv_grace; } dq->dq_flags |= DQ_MOD; mutex_exit(&dq->dq_interlock); dqrele(NULLVP, dq); return 0; } if ((error = dqget(NULLVP, key->qk_id, ump, key->qk_idtype, &dq)) != 0) return (error); mutex_enter(&dq->dq_interlock); /* * Copy all but the current values. * Reset time limit if previously had no soft limit or were * under it, but now have a soft limit and are over it. 
*/ dqb.dqb_curblocks = dq->dq_curblocks; dqb.dqb_curinodes = dq->dq_curinodes; dqb.dqb_btime = dq->dq_btime; dqb.dqb_itime = dq->dq_itime; if (key->qk_objtype == QUOTA_OBJTYPE_BLOCKS) { dqb.dqb_bsoftlimit = quota1_encode_limit(val->qv_softlimit); dqb.dqb_bhardlimit = quota1_encode_limit(val->qv_hardlimit); dqb.dqb_isoftlimit = dq->dq_isoftlimit; dqb.dqb_ihardlimit = dq->dq_ihardlimit; } else { KASSERT(key->qk_objtype == QUOTA_OBJTYPE_FILES); dqb.dqb_bsoftlimit = dq->dq_bsoftlimit; dqb.dqb_bhardlimit = dq->dq_bhardlimit; dqb.dqb_isoftlimit = quota1_encode_limit(val->qv_softlimit); dqb.dqb_ihardlimit = quota1_encode_limit(val->qv_hardlimit); } if (dq->dq_id == 0 && val->qv_grace != QUOTA_NOTIME) { /* also update grace time if available */ if (key->qk_objtype == QUOTA_OBJTYPE_BLOCKS) { ump->umq1_btime[key->qk_idtype] = dqb.dqb_btime = val->qv_grace; } if (key->qk_objtype == QUOTA_OBJTYPE_FILES) { ump->umq1_itime[key->qk_idtype] = dqb.dqb_itime = val->qv_grace; } } if (dqb.dqb_bsoftlimit && dq->dq_curblocks >= dqb.dqb_bsoftlimit && (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) dqb.dqb_btime = time_second + ump->umq1_btime[key->qk_idtype]; if (dqb.dqb_isoftlimit && dq->dq_curinodes >= dqb.dqb_isoftlimit && (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) dqb.dqb_itime = time_second + ump->umq1_itime[key->qk_idtype]; dq->dq_un.dq1_dqb = dqb; if (dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_WARN(QL_BLOCK); if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_WARN(QL_FILE); if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; else dq->dq_flags &= ~DQ_FAKE; dq->dq_flags |= DQ_MOD; mutex_exit(&dq->dq_interlock); dqrele(NULLVP, dq); return (0); } #if 0 /* * Q_SETQUOTA - assign an entire dqblk structure. */ int setquota1(struct mount *mp, u_long id, int type, struct dqblk *dqb) { struct dquot *dq; struct dquot *ndq; struct ufsmount *ump = VFSTOUFS(mp); if ((error = dqget(NULLVP, id, ump, type, &ndq)) != 0) return (error); dq = ndq; mutex_enter(&dq->dq_interlock); /* * Copy all but the current values. * Reset time limit if previously had no soft limit or were * under it, but now have a soft limit and are over it. */ dqb->dqb_curblocks = dq->dq_curblocks; dqb->dqb_curinodes = dq->dq_curinodes; if (dq->dq_id != 0) { dqb->dqb_btime = dq->dq_btime; dqb->dqb_itime = dq->dq_itime; } if (dqb->dqb_bsoftlimit && dq->dq_curblocks >= dqb->dqb_bsoftlimit && (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) dqb->dqb_btime = time_second + ump->umq1_btime[type]; if (dqb->dqb_isoftlimit && dq->dq_curinodes >= dqb->dqb_isoftlimit && (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) dqb->dqb_itime = time_second + ump->umq1_itime[type]; dq->dq_un.dq1_dqb = *dqb; if (dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_WARN(QL_BLOCK); if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_WARN(QL_FILE); if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; else dq->dq_flags &= ~DQ_FAKE; dq->dq_flags |= DQ_MOD; mutex_exit(&dq->dq_interlock); dqrele(NULLVP, dq); return (0); } /* * Q_SETUSE - set current inode and block usage. 
*/ int setuse(struct mount *mp, u_long id, int type, void *addr) { struct dquot *dq; struct ufsmount *ump = VFSTOUFS(mp); struct dquot *ndq; struct dqblk usage; int error; error = copyin(addr, (void *)&usage, sizeof (struct dqblk)); if (error) return (error); if ((error = dqget(NULLVP, id, ump, type, &ndq)) != 0) return (error); dq = ndq; mutex_enter(&dq->dq_interlock); /* * Reset time limit if have a soft limit and were * previously under it, but are now over it. */ if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit && usage.dqb_curblocks >= dq->dq_bsoftlimit) dq->dq_btime = time_second + ump->umq1_btime[type]; if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && usage.dqb_curinodes >= dq->dq_isoftlimit) dq->dq_itime = time_second + ump->umq1_itime[type]; dq->dq_curblocks = usage.dqb_curblocks; dq->dq_curinodes = usage.dqb_curinodes; if (dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_WARN(QL_BLOCK); if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_WARN(QL_FILE); dq->dq_flags |= DQ_MOD; mutex_exit(&dq->dq_interlock); dqrele(NULLVP, dq); return (0); } #endif /* * Q_SYNC - sync quota files to disk. */ int q1sync(struct mount *mp) { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *vp; struct vnode_iterator *marker; struct dquot *dq; int i, error; /* * Check if the mount point has any quotas. * If not, simply return. */ for (i = 0; i < MAXQUOTAS; i++) if (ump->um_quotas[i] != NULLVP) break; if (i == MAXQUOTAS) return (0); /* * Search vnodes associated with this mount point, * synchronizing any modified dquot structures. */ vfs_vnode_iterator_init(mp, &marker); while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) { error = vn_lock(vp, LK_EXCLUSIVE); if (error) { vrele(vp); continue; } if (VTOI(vp) == NULL || vp->v_type == VNON) { vput(vp); continue; } for (i = 0; i < MAXQUOTAS; i++) { dq = VTOI(vp)->i_dquot[i]; if (dq == NODQUOT) continue; mutex_enter(&dq->dq_interlock); if (dq->dq_flags & DQ_MOD) dq1sync(vp, dq); mutex_exit(&dq->dq_interlock); } vput(vp); } vfs_vnode_iterator_destroy(marker); return (0); } /* * Obtain a dquot structure for the specified identifier and quota file * reading the information from the file if necessary. */ int dq1get(struct vnode *dqvp, u_long id, struct ufsmount *ump, int type, struct dquot *dq) { struct iovec aiov; struct uio auio; int error; KASSERT(mutex_owned(&dq->dq_interlock)); vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)&dq->dq_un.dq1_dqb; aiov.iov_len = sizeof (struct dqblk); auio.uio_resid = sizeof (struct dqblk); auio.uio_offset = (off_t)id * sizeof (struct dqblk); auio.uio_rw = UIO_READ; UIO_SETUP_SYSSPACE(&auio); error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); if (auio.uio_resid == sizeof(struct dqblk) && error == 0) memset((void *)&dq->dq_un.dq1_dqb, 0, sizeof(struct dqblk)); VOP_UNLOCK(dqvp); /* * I/O error in reading quota file, release * quota structure and reflect problem to caller. */ if (error) return (error); /* * Check for no limit to enforce. * Initialize time values if necessary. */ if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; if (dq->dq_id != 0) { if (dq->dq_btime == 0) dq->dq_btime = time_second + ump->umq1_btime[type]; if (dq->dq_itime == 0) dq->dq_itime = time_second + ump->umq1_itime[type]; } return (0); } /* * Update the disk quota in the quota file. 
*/ int dq1sync(struct vnode *vp, struct dquot *dq) { struct vnode *dqvp; struct iovec aiov; struct uio auio; int error; if (dq == NODQUOT) panic("dq1sync: dquot"); KASSERT(mutex_owned(&dq->dq_interlock)); if ((dq->dq_flags & DQ_MOD) == 0) return (0); if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP) panic("dq1sync: file"); KASSERT(dqvp != vp); vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)&dq->dq_un.dq1_dqb; aiov.iov_len = sizeof (struct dqblk); auio.uio_resid = sizeof (struct dqblk); auio.uio_offset = (off_t)dq->dq_id * sizeof (struct dqblk); auio.uio_rw = UIO_WRITE; UIO_SETUP_SYSSPACE(&auio); error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); if (auio.uio_resid && error == 0) error = EIO; dq->dq_flags &= ~DQ_MOD; VOP_UNLOCK(dqvp); return (error); }
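/*
 * Context for dq1get()/dq1sync() above: the quota1 on-disk format is a flat
 * array of struct dqblk records indexed by uid or gid, so a record lives at
 * byte offset id * sizeof(struct dqblk), and an all-zero record is what the
 * kernel marks DQ_FAKE (nothing to enforce).  The following is a hypothetical
 * userland sketch of the same lookup; the struct layout is an assumption
 * modelled on <ufs/ufs/quota1.h> (include the real header on a real system),
 * and the file/id arguments are illustrative only.
 */
#include <sys/types.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct dqblk_v1 {			/* assumed on-disk record layout */
	uint32_t dqb_bhardlimit;	/* absolute limit on disk blocks */
	uint32_t dqb_bsoftlimit;	/* preferred limit on disk blocks */
	uint32_t dqb_curblocks;		/* blocks currently allocated */
	uint32_t dqb_ihardlimit;	/* absolute limit on inodes */
	uint32_t dqb_isoftlimit;	/* preferred limit on inodes */
	uint32_t dqb_curinodes;		/* inodes currently allocated */
	int32_t  dqb_btime;		/* block grace expiry (grace period for id 0) */
	int32_t  dqb_itime;		/* inode grace expiry (grace period for id 0) */
};

int
main(int argc, char **argv)
{
	struct dqblk_v1 dqb;
	unsigned long id;
	FILE *fp;

	if (argc != 3) {
		fprintf(stderr, "usage: %s quotafile id\n", argv[0]);
		return 1;
	}
	id = strtoul(argv[2], NULL, 10);
	if ((fp = fopen(argv[1], "rb")) == NULL)
		return 1;

	/* Same offset computation as dq1get(): id * sizeof(struct dqblk). */
	if (fseeko(fp, (off_t)id * sizeof(dqb), SEEK_SET) != 0 ||
	    fread(&dqb, sizeof(dqb), 1, fp) != 1)
		memset(&dqb, 0, sizeof(dqb));	/* missing record reads as zero */
	fclose(fp);

	/* All-zero limits correspond to DQ_FAKE: no limit to enforce. */
	if (dqb.dqb_bsoftlimit == 0 && dqb.dqb_bhardlimit == 0 &&
	    dqb.dqb_isoftlimit == 0 && dqb.dqb_ihardlimit == 0)
		printf("id %lu: no limits set\n", id);
	else
		printf("id %lu: %" PRIu32 "/%" PRIu32 " blocks, %" PRIu32 "/%" PRIu32 " inodes\n",
		    id, dqb.dqb_curblocks, dqb.dqb_bsoftlimit,
		    dqb.dqb_curinodes, dqb.dqb_isoftlimit);
	return 0;
}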
/* $NetBSD: clock_subr.c,v 1.27 2016/08/15 15:51:39 jakllsch Exp $ */ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1982, 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: clock.c 1.18 91/01/21$ * * @(#)clock.c 8.2 (Berkeley) 1/12/94 */ /* * Generic routines to convert between a POSIX date * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec * Derived from arch/hp300/hp300/clock.c */ #if HAVE_NBTOOL_CONFIG_H #include "nbtool_config.h" #endif /* HAVE_NBTOOL_CONFIG_H */ #ifdef _KERNEL #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: clock_subr.c,v 1.27 2016/08/15 15:51:39 jakllsch Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/errno.h> #else /* ! _KERNEL */ #include <string.h> #include <time.h> #include <errno.h> #endif /* !
_KERNEL */ #include "../sys/clock.h" #include <dev/clock_subr.h> #define FEBRUARY 2 /* for easier alignment: * time from the epoch to 2001 (there were 8 leap years): */ #define DAYSTO2001 (365*31+8) /* 4 year intervals include 1 leap year */ #define DAYS4YEARS (365*4+1) /* 100 year intervals include 24 leap years */ #define DAYS100YEARS (365*100+24) /* 400 year intervals include 97 leap years */ #define DAYS400YEARS (365*400+97) time_t clock_ymdhms_to_secs(struct clock_ymdhms *dt) { uint64_t secs, i, year, days; year = dt->dt_year; /* * Compute days since start of time * First from years, then from months. */ if (year < POSIX_BASE_YEAR) return -1; days = 0; if (is_leap_year(year) && dt->dt_mon > FEBRUARY) days++; if (year < 2001) { /* simple way for early years */ for (i = POSIX_BASE_YEAR; i < year; i++) days += days_per_year(i); } else { /* years are properly aligned */ days += DAYSTO2001; year -= 2001; i = year / 400; days += i * DAYS400YEARS; year -= i * 400; i = year / 100; days += i * DAYS100YEARS; year -= i * 100; i = year / 4; days += i * DAYS4YEARS; year -= i * 4; for (i = dt->dt_year-year; i < dt->dt_year; i++) days += days_per_year(i); } /* Months */ for (i = 1; i < dt->dt_mon; i++) days += days_in_month(i); days += (dt->dt_day - 1); /* Add hours, minutes, seconds. */ secs = (((uint64_t)days * 24 + dt->dt_hour) * 60 + dt->dt_min) * 60 + dt->dt_sec; if ((time_t)secs < 0 || secs > __type_max(time_t)) return -1; return secs; } int clock_secs_to_ymdhms(time_t secs, struct clock_ymdhms *dt) { int leap; uint64_t i; time_t days; time_t rsec; /* remainder seconds */ if (secs < 0) return EINVAL; days = secs / SECS_PER_DAY; rsec = secs % SECS_PER_DAY; /* Day of week (Note: 1/1/1970 was a Thursday) */ dt->dt_wday = (days + 4) % 7; if (days >= DAYSTO2001) { days -= DAYSTO2001; dt->dt_year = 2001; i = days / DAYS400YEARS; days -= i*DAYS400YEARS; dt->dt_year += i*400; i = days / DAYS100YEARS; days -= i*DAYS100YEARS; dt->dt_year += i*100; i = days / DAYS4YEARS; days -= i*DAYS4YEARS; dt->dt_year += i*4; for (i = dt->dt_year; days >= days_per_year(i); i++) days -= days_per_year(i); dt->dt_year = i; } else { /* Subtract out whole years, counting them in i. */ for (i = POSIX_BASE_YEAR; days >= days_per_year(i); i++) days -= days_per_year(i); dt->dt_year = i; } /* Subtract out whole months, counting them in i. */ for (leap = 0, i = 1; days >= days_in_month(i)+leap; i++) { days -= days_in_month(i)+leap; if (i == 1 && is_leap_year(dt->dt_year)) leap = 1; else leap = 0; } dt->dt_mon = i; /* Days are what is left over (+1) from all that. */ dt->dt_day = days + 1; /* Hours, minutes, seconds are easy */ dt->dt_hour = rsec / SECS_PER_HOUR; rsec = rsec % SECS_PER_HOUR; dt->dt_min = rsec / SECS_PER_MINUTE; rsec = rsec % SECS_PER_MINUTE; dt->dt_sec = rsec; return 0; }
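/*
 * Worked example for the conversion above (a sketch, assuming it is built
 * somewhere that <dev/clock_subr.h> is usable, e.g. the kernel or the tools
 * build): 2001-01-01 00:00:00 UTC lies exactly DAYSTO2001 = 365*31 + 8 =
 * 11323 days after the epoch, i.e. 11323 * 86400 = 978307200 seconds, the
 * familiar Unix timestamp for that instant.  Converting back should
 * reproduce the same calendar date (a Monday, so dt_wday == 1 with
 * Sunday == 0).
 */
#include <sys/types.h>
#include <dev/clock_subr.h>

static int
clock_subr_selftest(void)
{
	struct clock_ymdhms dt = {
		.dt_year = 2001, .dt_mon = 1, .dt_day = 1,
		.dt_hour = 0, .dt_min = 0, .dt_sec = 0,
	};
	struct clock_ymdhms back;
	time_t secs;

	secs = clock_ymdhms_to_secs(&dt);
	if (secs != 978307200)			/* (365*31 + 8) * 86400 */
		return -1;
	if (clock_secs_to_ymdhms(secs, &back) != 0)
		return -1;
	if (back.dt_year != 2001 || back.dt_mon != 1 || back.dt_day != 1 ||
	    back.dt_wday != 1)
		return -1;
	return 0;
}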
/* $NetBSD: ipsec.h,v 1.93 2022/10/28 05:23:09 ozaki-r Exp $ */ /* $FreeBSD: ipsec.h,v 1.2.4.2 2004/02/14 22:23:23 bms Exp $ */ /* $KAME: ipsec.h,v 1.53 2001/11/20 08:32:38 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETIPSEC_IPSEC_H_ #define _NETIPSEC_IPSEC_H_ #if defined(_KERNEL_OPT) #include "opt_inet.h" #include "opt_ipsec.h" #endif #include <net/pfkeyv2.h> #ifdef _KERNEL #include <sys/socketvar.h> #include <sys/localcount.h> #include <netinet/in_pcb.h> #include <netipsec/keydb.h> /* * Security Policy Index * Ensure that both address families in the "src" and "dst" are same.
* When the value of the ul_proto is ICMPv6, the port field in "src" * specifies ICMPv6 type, and the port field in "dst" specifies ICMPv6 code. */ struct secpolicyindex { u_int8_t dir; /* direction of packet flow, see blow */ union sockaddr_union src; /* IP src address for SP */ union sockaddr_union dst; /* IP dst address for SP */ u_int8_t prefs; /* prefix length in bits for src */ u_int8_t prefd; /* prefix length in bits for dst */ u_int16_t ul_proto; /* upper layer Protocol */ }; /* Security Policy Data Base */ struct secpolicy { struct pslist_entry pslist_entry; struct localcount localcount; /* reference count */ struct secpolicyindex spidx; /* selector */ u_int32_t id; /* It's unique number on the system. */ u_int state; /* 0: dead, others: alive */ #define IPSEC_SPSTATE_DEAD 0 #define IPSEC_SPSTATE_ALIVE 1 u_int origin; /* who generate this SP. */ #define IPSEC_SPORIGIN_USER 0 #define IPSEC_SPORIGIN_KERNEL 1 u_int policy; /* DISCARD, NONE or IPSEC, see keyv2.h */ struct ipsecrequest *req; /* pointer to the ipsec request tree, */ /* if policy == IPSEC else this value == NULL.*/ /* * lifetime handler. * the policy can be used without limitiation if both lifetime and * validtime are zero. * "lifetime" is passed by sadb_lifetime.sadb_lifetime_addtime. * "validtime" is passed by sadb_lifetime.sadb_lifetime_usetime. */ time_t created; /* time created the policy */ time_t lastused; /* updated every when kernel sends a packet */ time_t lifetime; /* duration of the lifetime of this policy */ time_t validtime; /* duration this policy is valid without use */ }; /* Request for IPsec */ struct ipsecrequest { struct ipsecrequest *next; /* pointer to next structure */ /* If NULL, it means the end of chain. */ struct secasindex saidx;/* hint for search proper SA */ /* if __ss_len == 0 then no address specified.*/ u_int level; /* IPsec level defined below. */ struct secpolicy *sp; /* back pointer to SP */ }; /* security policy in PCB */ struct inpcbpolicy { struct secpolicy *sp_in; struct secpolicy *sp_out; int priv; /* privileged socket ? */ /* cached policy */ struct { struct secpolicy *cachesp; struct secpolicyindex cacheidx; int cachehint; /* processing requirement hint: */ #define IPSEC_PCBHINT_UNKNOWN 0 /* Unknown */ #define IPSEC_PCBHINT_YES 1 /* IPsec processing is required */ #define IPSEC_PCBHINT_NO 2 /* IPsec processing not required */ u_int cachegen; /* spdgen when cache filled */ } sp_cache[3]; /* XXX 3 == IPSEC_DIR_MAX */ int sp_cacheflags; #define IPSEC_PCBSP_CONNECTED 1 struct inpcb *sp_inp; /* back pointer */ }; extern u_int ipsec_spdgen; static __inline bool ipsec_pcb_skip_ipsec(struct inpcbpolicy *pcbsp, int dir) { KASSERT(inp_locked(pcbsp->sp_inp)); return pcbsp->sp_cache[(dir)].cachehint == IPSEC_PCBHINT_NO && pcbsp->sp_cache[(dir)].cachegen == ipsec_spdgen; } /* SP acquiring list table. */ struct secspacq { LIST_ENTRY(secspacq) chain; struct secpolicyindex spidx; time_t created; /* for lifetime */ int count; /* for lifetime */ /* XXX: here is mbuf place holder to be sent ? */ }; #endif /* _KERNEL */ /* buffer size for formatted output of ipsec address (addr + '%' + scope_id?) */ #define IPSEC_ADDRSTRLEN (INET6_ADDRSTRLEN + 11) /* buffer size for ipsec_logsastr() */ #define IPSEC_LOGSASTRLEN 192 /* according to IANA assignment, port 0x0000 and proto 0xff are reserved. */ #define IPSEC_PORT_ANY 0 #define IPSEC_ULPROTO_ANY 255 #define IPSEC_PROTO_ANY 255 /* mode of security protocol */ /* NOTE: DON'T use IPSEC_MODE_ANY at SPD. 
It's only use in SAD */ #define IPSEC_MODE_ANY 0 /* i.e. wildcard. */ #define IPSEC_MODE_TRANSPORT 1 #define IPSEC_MODE_TUNNEL 2 #define IPSEC_MODE_TCPMD5 3 /* TCP MD5 mode */ /* * Direction of security policy. * NOTE: Since INVALID is used just as flag. * The other are used for loop counter too. */ #define IPSEC_DIR_ANY 0 #define IPSEC_DIR_INBOUND 1 #define IPSEC_DIR_OUTBOUND 2 #define IPSEC_DIR_MAX 3 #define IPSEC_DIR_INVALID 4 #define IPSEC_DIR_IS_VALID(dir) ((dir) >= 0 && (dir) <= IPSEC_DIR_MAX) #define IPSEC_DIR_IS_INOROUT(dir) ((dir) == IPSEC_DIR_INBOUND || \ (dir) == IPSEC_DIR_OUTBOUND) /* Policy level */ /* * IPSEC, ENTRUST and BYPASS are allowed for setsockopt() in PCB, * DISCARD, IPSEC and NONE are allowed for setkey() in SPD. * DISCARD and NONE are allowed for system default. */ #define IPSEC_POLICY_DISCARD 0 /* discarding packet */ #define IPSEC_POLICY_NONE 1 /* through IPsec engine */ #define IPSEC_POLICY_IPSEC 2 /* do IPsec */ #define IPSEC_POLICY_ENTRUST 3 /* consulting SPD if present. */ #define IPSEC_POLICY_BYPASS 4 /* only for privileged socket. */ /* Security protocol level */ #define IPSEC_LEVEL_DEFAULT 0 /* reference to system default */ #define IPSEC_LEVEL_USE 1 /* use SA if present. */ #define IPSEC_LEVEL_REQUIRE 2 /* require SA. */ #define IPSEC_LEVEL_UNIQUE 3 /* unique SA. */ #define IPSEC_MANUAL_REQID_MAX 0x3fff /* * if security policy level == unique, this id * indicate to a relative SA for use, else is * zero. * 1 - 0x3fff are reserved for manual keying. * 0 are reserved for above reason. Others is * for kernel use. * Note that this id doesn't identify SA * by only itself. */ #define IPSEC_REPLAYWSIZE 32 #ifdef _KERNEL extern int ipsec_debug; #ifdef IPSEC_DEBUG extern int ipsec_replay; extern int ipsec_integrity; #endif extern struct secpolicy ip4_def_policy; extern int ip4_esp_trans_deflev; extern int ip4_esp_net_deflev; extern int ip4_ah_trans_deflev; extern int ip4_ah_net_deflev; extern int ip4_ah_cleartos; extern int ip4_ah_offsetmask; extern int ip4_ipsec_dfbit; extern int ip4_ipsec_ecn; extern int crypto_support; #include <sys/syslog.h> #define DPRINTF(fmt, args...) \ do { \ if (ipsec_debug) \ log(LOG_DEBUG, "%s: " fmt, __func__, ##args); \ } while (/*CONSTCOND*/0) #define IPSECLOG(level, fmt, args...) 
\ do { \ if (ipsec_debug) \ log(level, "%s: " fmt, __func__, ##args); \ } while (/*CONSTCOND*/0) #define ipsec_indone(m) \ ((m->m_flags & M_AUTHIPHDR) || (m->m_flags & M_DECRYPTED)) #define ipsec_outdone(m) \ (m_tag_find((m), PACKET_TAG_IPSEC_OUT_DONE) != NULL) static __inline bool ipsec_skip_pfil(struct mbuf *m) { bool rv; if (ipsec_indone(m) && ((m->m_pkthdr.pkthdr_flags & PKTHDR_FLAG_IPSEC_SKIP_PFIL) != 0)) { m->m_pkthdr.pkthdr_flags &= ~PKTHDR_FLAG_IPSEC_SKIP_PFIL; rv = true; } else { rv = false; } return rv; } void ipsec_pcbconn(struct inpcbpolicy *); void ipsec_pcbdisconn(struct inpcbpolicy *); void ipsec_invalpcbcacheall(void); struct inpcb; int ipsec4_output(struct mbuf *, struct inpcb *, int, u_long *, bool *, bool *, bool *); int ipsec_ip_input_checkpolicy(struct mbuf *, bool); void ipsec_mtu(struct mbuf *, int *); #ifdef INET6 void ipsec6_udp_cksum(struct mbuf *); #endif struct inpcb; int ipsec_init_pcbpolicy(struct socket *so, struct inpcbpolicy **); int ipsec_copy_policy(const struct inpcbpolicy *, struct inpcbpolicy *); u_int ipsec_get_reqlevel(const struct ipsecrequest *); int ipsec_set_policy(struct inpcb *, const void *, size_t, kauth_cred_t); int ipsec_get_policy(struct inpcb *, const void *, size_t, struct mbuf **); int ipsec_delete_pcbpolicy(struct inpcb *); int ipsec_in_reject(struct mbuf *, struct inpcb *); struct secasvar *ipsec_lookup_sa(const struct ipsecrequest *, const struct mbuf *); struct secas; struct tcpcb; int ipsec_chkreplay(u_int32_t, const struct secasvar *); int ipsec_updatereplay(u_int32_t, const struct secasvar *); size_t ipsec_hdrsiz(struct mbuf *, u_int, struct inpcb *); size_t ipsec4_hdrsiz_tcp(struct tcpcb *); union sockaddr_union; const char *ipsec_address(const union sockaddr_union* sa, char *, size_t); const char *ipsec_logsastr(const struct secasvar *, char *, size_t); /* NetBSD protosw ctlin entrypoint */ void *esp4_ctlinput(int, const struct sockaddr *, void *); void *ah4_ctlinput(int, const struct sockaddr *, void *); void ipsec_output_init(void); struct m_tag; void ipsec4_common_input(struct mbuf *m, int, int); int ipsec4_common_input_cb(struct mbuf *, struct secasvar *, int, int); int ipsec4_process_packet(struct mbuf *, const struct ipsecrequest *, u_long *); int ipsec_process_done(struct mbuf *, const struct ipsecrequest *, struct secasvar *, int); struct mbuf *m_clone(struct mbuf *); struct mbuf *m_makespace(struct mbuf *, int, int, int *); void *m_pad(struct mbuf *, int); int m_striphdr(struct mbuf *, int, int); extern int ipsec_used __read_mostly; extern int ipsec_enabled __read_mostly; #endif /* _KERNEL */ #ifndef _KERNEL char *ipsec_set_policy(const char *, int); int ipsec_get_policylen(char *); char *ipsec_dump_policy(char *, const char *); const char *ipsec_strerror(void); #endif /* !_KERNEL */ #ifdef _KERNEL /* External declarations of per-file init functions */ void ah_attach(void); void esp_attach(void); void ipcomp_attach(void); void ipe4_attach(void); void tcpsignature_attach(void); void ipsec_attach(void); void sysctl_net_inet_ipsec_setup(struct sysctllog **); #ifdef INET6 void sysctl_net_inet6_ipsec6_setup(struct sysctllog **); #endif #endif /* _KERNEL */ #endif /* !_NETIPSEC_IPSEC_H_ */
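/*
 * Hedged userland sketch of the non-kernel API declared above
 * (ipsec_set_policy(), ipsec_get_policylen(), ipsec_dump_policy(),
 * ipsec_strerror()): compile a textual policy specification into its binary
 * form and print it back.  The policy string syntax and the need to link
 * against -lipsec follow ipsec_set_policy(3); the exact strings are
 * illustrative only.  The compiled buffer is what a program would typically
 * hand to the kernel through the IP_IPSEC_POLICY socket option.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <netipsec/ipsec.h>

int
main(void)
{
	const char *spec = "out ipsec esp/transport//require";
	char *policy, *text;

	/* Compile the text form into a struct sadb_x_policy blob. */
	policy = ipsec_set_policy(spec, strlen(spec));
	if (policy == NULL) {
		fprintf(stderr, "ipsec_set_policy: %s\n", ipsec_strerror());
		return 1;
	}
	printf("compiled policy is %d bytes\n", ipsec_get_policylen(policy));

	/* Convert the blob back into a readable string. */
	text = ipsec_dump_policy(policy, " ");
	if (text != NULL) {
		printf("%s\n", text);
		free(text);
	}
	free(policy);
	return 0;
}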
/* $NetBSD: kern_veriexec.c,v 1.27 2023/04/09 09:18:09 riastradh Exp $ */ /*- * Copyright (c) 2005, 2006 Elad Efrat <elad@NetBSD.org> * Copyright (c) 2005, 2006 Brett Lymn <blymn@NetBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_veriexec.c,v 1.27 2023/04/09 09:18:09 riastradh Exp $"); #include "opt_veriexec.h" #include <sys/param.h> #include <sys/mount.h> #include <sys/kmem.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/once.h> #include <sys/proc.h> #include <sys/rwlock.h> #include <sys/syslog.h> #include <sys/sysctl.h> #include <sys/inttypes.h> #include <sys/verified_exec.h> #include <sys/sha1.h> #include <sys/sha2.h> #include <sys/rmd160.h> #include <sys/md5.h> #include <sys/fileassoc.h> #include <sys/kauth.h> #include <sys/conf.h> #include <miscfs/specfs/specdev.h> #include <prop/proplib.h> #include <sys/fcntl.h> /* Readable values for veriexec_file_report(). */ #define REPORT_ALWAYS 0x01 /* Always print */ #define REPORT_VERBOSE 0x02 /* Print when verbose >= 1 */ #define REPORT_DEBUG 0x04 /* Print when verbose >= 2 (debug) */ #define REPORT_PANIC 0x08 /* Call panic() */ #define REPORT_ALARM 0x10 /* Alarm - also print pid/uid/.. */ #define REPORT_LOGMASK (REPORT_ALWAYS|REPORT_VERBOSE|REPORT_DEBUG) /* state of locking for veriexec_file_verify */ #define VERIEXEC_UNLOCKED 0x00 /* Nothing locked, callee does it */ #define VERIEXEC_LOCKED 0x01 /* Global op lock held */ /* state of file locking for veriexec_file_verify */ #define VERIEXEC_FILE_UNLOCKED 0x02 /* Nothing locked, callee does it */ #define VERIEXEC_FILE_LOCKED 0x04 /* File locked */ #define VERIEXEC_RW_UPGRADE(lock) while((rw_tryupgrade(lock)) == 0){}; struct veriexec_fpops { const char *type; size_t hash_len; size_t context_size; veriexec_fpop_init_t init; veriexec_fpop_update_t update; veriexec_fpop_final_t final; LIST_ENTRY(veriexec_fpops) entries; }; /* Veriexec per-file entry data. */ struct veriexec_file_entry { krwlock_t lock; /* r/w lock */ u_char *filename; /* File name. */ u_char type; /* Entry type. */ u_char status; /* Evaluation status. */ u_char *fp; /* Fingerprint. */ struct veriexec_fpops *ops; /* Fingerprint ops vector*/ size_t filename_len; /* Length of filename. */ }; /* Veriexec per-table data. */ struct veriexec_table_entry { uint64_t vte_count; /* Number of Veriexec entries. */ const struct sysctlnode *vte_node; }; static int veriexec_verbose; static int veriexec_strict; static int veriexec_bypass = 1; static char *veriexec_fp_names = NULL; static size_t veriexec_name_max = 0; static const struct sysctlnode *veriexec_count_node; static fileassoc_t veriexec_hook; static specificdata_key_t veriexec_mountspecific_key; static LIST_HEAD(, veriexec_fpops) veriexec_fpops_list = LIST_HEAD_INITIALIZER(veriexec_fpops_list); static int veriexec_raw_cb(kauth_cred_t, kauth_action_t, void *, void *, void *, void *, void *); static struct veriexec_fpops *veriexec_fpops_lookup(const char *); static void veriexec_file_free(struct veriexec_file_entry *); static unsigned int veriexec_tablecount = 0; /* * Veriexec operations global lock - most ops hold this as a read * lock, it is upgraded to a write lock when destroying veriexec file * table entries. 
*/ static krwlock_t veriexec_op_lock; /* * Sysctl helper routine for Veriexec. */ static int sysctl_kern_veriexec_algorithms(SYSCTLFN_ARGS) { size_t len; int error; const char *p; if (newp != NULL) return EPERM; if (namelen != 0) return EINVAL; p = veriexec_fp_names == NULL ? "" : veriexec_fp_names; len = strlen(p) + 1; if (*oldlenp < len && oldp) return ENOMEM; if (oldp && (error = copyout(p, oldp, len)) != 0) return error; *oldlenp = len; return 0; } static int sysctl_kern_veriexec_strict(SYSCTLFN_ARGS) { struct sysctlnode node; int error, newval; node = *rnode; node.sysctl_data = &newval; newval = veriexec_strict; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (newval < veriexec_strict) return EPERM; veriexec_strict = newval; return 0; } SYSCTL_SETUP(sysctl_kern_veriexec_setup, "sysctl kern.veriexec setup") { const struct sysctlnode *rnode = NULL; sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "veriexec", SYSCTL_DESCR("Veriexec"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "verbose", SYSCTL_DESCR("Veriexec verbose level"), NULL, 0, &veriexec_verbose, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "strict", SYSCTL_DESCR("Veriexec strict level"), sysctl_kern_veriexec_strict, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "algorithms", SYSCTL_DESCR("Veriexec supported hashing " "algorithms"), sysctl_kern_veriexec_algorithms, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &veriexec_count_node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "count", SYSCTL_DESCR("Number of fingerprints on mount(s)"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); } /* * Add ops to the fingerprint ops vector list. */ int veriexec_fpops_add(const char *fp_type, size_t hash_len, size_t ctx_size, veriexec_fpop_init_t init, veriexec_fpop_update_t update, veriexec_fpop_final_t final) { struct veriexec_fpops *ops; KASSERT(init != NULL); KASSERT(update != NULL); KASSERT(final != NULL); KASSERT(hash_len != 0); KASSERT(ctx_size != 0); KASSERT(fp_type != NULL); if (veriexec_fpops_lookup(fp_type) != NULL) return (EEXIST); ops = kmem_alloc(sizeof(*ops), KM_SLEEP); ops->type = fp_type; ops->hash_len = hash_len; ops->context_size = ctx_size; ops->init = init; ops->update = update; ops->final = final; LIST_INSERT_HEAD(&veriexec_fpops_list, ops, entries); /* * If we don't have space for any names, allocate enough for six * which should be sufficient. (it's also enough for all algorithms * we can support at the moment) */ if (veriexec_fp_names == NULL) { veriexec_name_max = 64; veriexec_fp_names = kmem_zalloc(veriexec_name_max, KM_SLEEP); } /* * If we're running out of space for storing supported algorithms, * extend the buffer with space for four names. */ while (veriexec_name_max - (strlen(veriexec_fp_names) + 1) < strlen(fp_type)) { char *newp; unsigned int new_max; /* Add space for four algorithm names. 
*/ new_max = veriexec_name_max + 64; newp = kmem_zalloc(new_max, KM_SLEEP); strlcpy(newp, veriexec_fp_names, new_max); kmem_free(veriexec_fp_names, veriexec_name_max); veriexec_fp_names = newp; veriexec_name_max = new_max; } if (*veriexec_fp_names != '\0') strlcat(veriexec_fp_names, " ", veriexec_name_max); strlcat(veriexec_fp_names, fp_type, veriexec_name_max); return (0); } static void veriexec_mountspecific_dtor(void *v) { struct veriexec_table_entry *vte = v; if (vte == NULL) { return; } sysctl_free(__UNCONST(vte->vte_node)); veriexec_tablecount--; kmem_free(vte, sizeof(*vte)); } static int veriexec_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; enum kauth_system_req req; if (action != KAUTH_SYSTEM_VERIEXEC) return KAUTH_RESULT_DEFER; result = KAUTH_RESULT_DEFER; req = (enum kauth_system_req)(uintptr_t)arg0; if (req == KAUTH_REQ_SYSTEM_VERIEXEC_MODIFY && veriexec_strict > VERIEXEC_LEARNING) { log(LOG_WARNING, "Veriexec: Strict mode, modifying " "tables not permitted.\n"); result = KAUTH_RESULT_DENY; } return result; } /* * Initialise Veriexec. */ void veriexec_init(void) { int error; /* Register a fileassoc for Veriexec. */ error = fileassoc_register("veriexec", (fileassoc_cleanup_cb_t)veriexec_file_free, &veriexec_hook); if (error) panic("Veriexec: Can't register fileassoc: error=%d", error); /* Register listener to handle raw disk access. */ if (kauth_listen_scope(KAUTH_SCOPE_DEVICE, veriexec_raw_cb, NULL) == NULL) panic("Veriexec: Can't listen on device scope"); error = mount_specific_key_create(&veriexec_mountspecific_key, veriexec_mountspecific_dtor); if (error) panic("Veriexec: Can't create mountspecific key"); if (kauth_listen_scope(KAUTH_SCOPE_SYSTEM, veriexec_listener_cb, NULL) == NULL) panic("Veriexec: Can't listen on system scope"); rw_init(&veriexec_op_lock); #define FPOPS_ADD(a, b, c, d, e, f) \ veriexec_fpops_add(a, b, c, \ __FPTRCAST(veriexec_fpop_init_t, d), \ __FPTRCAST(veriexec_fpop_update_t, e), \ __FPTRCAST(veriexec_fpop_final_t, f)) #ifdef VERIFIED_EXEC_FP_SHA256 FPOPS_ADD("SHA256", SHA256_DIGEST_LENGTH, sizeof(SHA256_CTX), SHA256_Init, SHA256_Update, SHA256_Final); #endif /* VERIFIED_EXEC_FP_SHA256 */ #ifdef VERIFIED_EXEC_FP_SHA384 FPOPS_ADD("SHA384", SHA384_DIGEST_LENGTH, sizeof(SHA384_CTX), SHA384_Init, SHA384_Update, SHA384_Final); #endif /* VERIFIED_EXEC_FP_SHA384 */ #ifdef VERIFIED_EXEC_FP_SHA512 FPOPS_ADD("SHA512", SHA512_DIGEST_LENGTH, sizeof(SHA512_CTX), SHA512_Init, SHA512_Update, SHA512_Final); #endif /* VERIFIED_EXEC_FP_SHA512 */ #undef FPOPS_ADD } static struct veriexec_fpops * veriexec_fpops_lookup(const char *name) { struct veriexec_fpops *ops; if (name == NULL) return (NULL); LIST_FOREACH(ops, &veriexec_fpops_list, entries) { if (strcasecmp(name, ops->type) == 0) return (ops); } return (NULL); } /* * Calculate fingerprint. Information on hash length and routines used is * extracted from veriexec_hash_list according to the hash type. * * NOTE: vfe is assumed to be locked for writing on entry. 
*/ static int veriexec_fp_calc(struct lwp *l, struct vnode *vp, int file_lock_state, struct veriexec_file_entry *vfe, u_char *fp) { struct vattr va; void *ctx; u_char *buf; off_t offset, len; size_t resid; int error; KASSERT(file_lock_state != VERIEXEC_LOCKED); KASSERT(file_lock_state != VERIEXEC_UNLOCKED); if (file_lock_state == VERIEXEC_FILE_UNLOCKED) vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, l->l_cred); if (file_lock_state == VERIEXEC_FILE_UNLOCKED) VOP_UNLOCK(vp); if (error) return (error); ctx = kmem_alloc(vfe->ops->context_size, KM_SLEEP); buf = kmem_alloc(PAGE_SIZE, KM_SLEEP); (vfe->ops->init)(ctx); len = 0; error = 0; for (offset = 0; offset < va.va_size; offset += PAGE_SIZE) { len = ((va.va_size - offset) < PAGE_SIZE) ? (va.va_size - offset) : PAGE_SIZE; error = vn_rdwr(UIO_READ, vp, buf, len, offset, UIO_SYSSPACE, ((file_lock_state == VERIEXEC_FILE_LOCKED)? IO_NODELOCKED : 0), l->l_cred, &resid, NULL); if (error) { goto bad; } (vfe->ops->update)(ctx, buf, (unsigned int) len); if (len != PAGE_SIZE) break; } (vfe->ops->final)(fp, ctx); bad: kmem_free(ctx, vfe->ops->context_size); kmem_free(buf, PAGE_SIZE); return (error); } /* Compare two fingerprints of the same type. */ static int veriexec_fp_cmp(struct veriexec_fpops *ops, u_char *fp1, u_char *fp2) { if (veriexec_verbose >= 2) { int i; printf("comparing hashes...\n"); printf("fp1: "); for (i = 0; i < ops->hash_len; i++) { printf("%02x", fp1[i]); } printf("\nfp2: "); for (i = 0; i < ops->hash_len; i++) { printf("%02x", fp2[i]); } printf("\n"); } return (memcmp(fp1, fp2, ops->hash_len)); } static int veriexec_fp_status(struct lwp *l, struct vnode *vp, int file_lock_state, struct veriexec_file_entry *vfe, u_char *status) { size_t hash_len = vfe->ops->hash_len; u_char *digest; int error; digest = kmem_zalloc(hash_len, KM_SLEEP); error = veriexec_fp_calc(l, vp, file_lock_state, vfe, digest); if (error) goto out; /* Compare fingerprint with loaded data. */ if (veriexec_fp_cmp(vfe->ops, vfe->fp, digest) == 0) *status = FINGERPRINT_VALID; else *status = FINGERPRINT_NOMATCH; out: kmem_free(digest, hash_len); return error; } static struct veriexec_table_entry * veriexec_table_lookup(struct mount *mp) { /* XXX: From raidframe init */ if (mp == NULL) return NULL; return mount_getspecific(mp, veriexec_mountspecific_key); } static struct veriexec_file_entry * veriexec_get(struct vnode *vp) { return (fileassoc_lookup(vp, veriexec_hook)); } bool veriexec_lookup(struct vnode *vp) { return (veriexec_get(vp) == NULL ? false : true); } /* * Routine for maintaining mostly consistent message formats in Veriexec. */ static void veriexec_file_report(struct veriexec_file_entry *vfe, const u_char *msg, const u_char *filename, struct lwp *l, int f) { if (vfe != NULL && vfe->filename != NULL) filename = vfe->filename; if (filename == NULL) return; if (((f & REPORT_LOGMASK) >> 1) <= veriexec_verbose) { if (!(f & REPORT_ALARM) || (l == NULL)) log(LOG_NOTICE, "Veriexec: %s [%s]\n", msg, filename); else log(LOG_ALERT, "Veriexec: %s [%s, prog=%s pid=%u, " "uid=%u, gid=%u]\n", msg, filename, l->l_proc->p_comm, l->l_proc->p_pid, kauth_cred_getuid(l->l_cred), kauth_cred_getgid(l->l_cred)); } if (f & REPORT_PANIC) panic("Veriexec: Unrecoverable error."); } /* * Verify the fingerprint of the given file. If we're called directly from * sys_execve(), 'flag' will be VERIEXEC_DIRECT. If we're called from * exec_script(), 'flag' will be VERIEXEC_INDIRECT. If we are called from * vn_open(), 'flag' will be VERIEXEC_FILE. 
* * 'veriexec_op_lock' must be locked (and remains locked). * * NOTE: The veriexec file entry pointer (vfep) will be returned LOCKED * on no error. */ static int veriexec_file_verify(struct lwp *l, struct vnode *vp, const u_char *name, int flag, int file_lock_state, struct veriexec_file_entry **vfep) { struct veriexec_file_entry *vfe; int error = 0; KASSERT(rw_lock_held(&veriexec_op_lock)); KASSERT(file_lock_state != VERIEXEC_LOCKED); KASSERT(file_lock_state != VERIEXEC_UNLOCKED); #define VFE_NEEDS_EVAL(vfe) ((vfe->status == FINGERPRINT_NOTEVAL) || \ (vfe->type & VERIEXEC_UNTRUSTED)) if (vfep != NULL) *vfep = NULL; if (vp->v_type != VREG) return (0); /* Lookup veriexec table entry, save pointer if requested. */ vfe = veriexec_get(vp); if (vfep != NULL) *vfep = vfe; /* No entry in the veriexec tables. */ if (vfe == NULL) { veriexec_file_report(NULL, "No entry.", name, l, REPORT_VERBOSE); /* * Lockdown mode: Deny access to non-monitored files. * IPS mode: Deny execution of non-monitored files. */ if ((veriexec_strict >= VERIEXEC_LOCKDOWN) || ((veriexec_strict >= VERIEXEC_IPS) && (flag != VERIEXEC_FILE))) return (EPERM); return (0); } /* * Grab the lock for the entry, if we need to do an evaluation * then the lock is a write lock, after we have the write * lock, check if we really need it - some other thread may * have already done the work for us. */ if (VFE_NEEDS_EVAL(vfe)) { rw_enter(&vfe->lock, RW_WRITER); if (!VFE_NEEDS_EVAL(vfe)) rw_downgrade(&vfe->lock); } else rw_enter(&vfe->lock, RW_READER); /* Evaluate fingerprint if needed. */ if (VFE_NEEDS_EVAL(vfe)) { u_char status; error = veriexec_fp_status(l, vp, file_lock_state, vfe, &status); if (error) { veriexec_file_report(vfe, "Fingerprint calculation error.", name, NULL, REPORT_ALWAYS); rw_exit(&vfe->lock); return (error); } vfe->status = status; rw_downgrade(&vfe->lock); } if (!(vfe->type & flag)) { veriexec_file_report(vfe, "Incorrect access type.", name, l, REPORT_ALWAYS|REPORT_ALARM); /* IPS mode: Enforce access type. */ if (veriexec_strict >= VERIEXEC_IPS) { rw_exit(&vfe->lock); return (EPERM); } } switch (vfe->status) { case FINGERPRINT_NOTEVAL: /* Should not happen. */ rw_exit(&vfe->lock); veriexec_file_report(vfe, "Not-evaluated status " "post evaluation; inconsistency detected.", name, NULL, REPORT_ALWAYS|REPORT_PANIC); __builtin_unreachable(); /* NOTREACHED */ case FINGERPRINT_VALID: /* Valid fingerprint. */ veriexec_file_report(vfe, "Match.", name, NULL, REPORT_VERBOSE); break; case FINGERPRINT_NOMATCH: /* Fingerprint mismatch. */ veriexec_file_report(vfe, "Mismatch.", name, NULL, REPORT_ALWAYS|REPORT_ALARM); /* IDS mode: Deny access on fingerprint mismatch. */ if (veriexec_strict >= VERIEXEC_IDS) { rw_exit(&vfe->lock); error = EPERM; } break; default: /* Should never happen. */ rw_exit(&vfe->lock); veriexec_file_report(vfe, "Invalid status " "post evaluation.", name, NULL, REPORT_ALWAYS|REPORT_PANIC); /* NOTREACHED */ } return (error); } int veriexec_verify(struct lwp *l, struct vnode *vp, const u_char *name, int flag, bool *found) { struct veriexec_file_entry *vfe; int r; if (veriexec_bypass && (veriexec_strict == VERIEXEC_LEARNING)) return 0; rw_enter(&veriexec_op_lock, RW_READER); r = veriexec_file_verify(l, vp, name, flag, VERIEXEC_FILE_UNLOCKED, &vfe); rw_exit(&veriexec_op_lock); if ((r == 0) && (vfe != NULL)) rw_exit(&vfe->lock); if (found != NULL) *found = (vfe != NULL) ? true : false; return (r); } /* * Veriexec remove policy code. 
*/ int veriexec_removechk(struct lwp *l, struct vnode *vp, const char *pathbuf) { struct veriexec_file_entry *vfe; int error; if (veriexec_bypass && (veriexec_strict == VERIEXEC_LEARNING)) return 0; rw_enter(&veriexec_op_lock, RW_READER); vfe = veriexec_get(vp); rw_exit(&veriexec_op_lock); if (vfe == NULL) { /* Lockdown mode: Deny access to non-monitored files. */ if (veriexec_strict >= VERIEXEC_LOCKDOWN) return (EPERM); return (0); } veriexec_file_report(vfe, "Remove request.", pathbuf, l, REPORT_ALWAYS|REPORT_ALARM); /* IDS mode: Deny removal of monitored files. */ if (veriexec_strict >= VERIEXEC_IDS) error = EPERM; else error = veriexec_file_delete(l, vp); return error; } /* * Veriexec rename policy. * * XXX: Once there's a way to hook after a successful rename, it would be * XXX: nice to update vfe->filename to the new name if it's not NULL and * XXX: the new name is absolute (ie., starts with a slash). */ int veriexec_renamechk(struct lwp *l, struct vnode *fromvp, const char *fromname, struct vnode *tovp, const char *toname) { struct veriexec_file_entry *fvfe = NULL, *tvfe = NULL; if (veriexec_bypass && (veriexec_strict == VERIEXEC_LEARNING)) return 0; rw_enter(&veriexec_op_lock, RW_READER); if (veriexec_strict >= VERIEXEC_LOCKDOWN) { log(LOG_ALERT, "Veriexec: Preventing rename of `%s' to " "`%s', uid=%u, pid=%u: Lockdown mode.\n", fromname, toname, kauth_cred_geteuid(l->l_cred), l->l_proc->p_pid); rw_exit(&veriexec_op_lock); return (EPERM); } fvfe = veriexec_get(fromvp); if (tovp != NULL) tvfe = veriexec_get(tovp); if ((fvfe == NULL) && (tvfe == NULL)) { /* None of them is monitored */ rw_exit(&veriexec_op_lock); return 0; } if (veriexec_strict >= VERIEXEC_IPS) { log(LOG_ALERT, "Veriexec: Preventing rename of `%s' " "to `%s', uid=%u, pid=%u: IPS mode, %s " "monitored.\n", fromname, toname, kauth_cred_geteuid(l->l_cred), l->l_proc->p_pid, (fvfe != NULL && tvfe != NULL) ? "files" : "file"); rw_exit(&veriexec_op_lock); return (EPERM); } if (fvfe != NULL) { /* * Monitored file is renamed; filename no longer relevant. */ /* * XXX: We could keep the buffer, and when (and if) updating the * XXX: filename post-rename, re-allocate it only if it's not * XXX: big enough for the new filename. */ /* XXX: Get write lock on fvfe here? */ VERIEXEC_RW_UPGRADE(&veriexec_op_lock); /* once we have the op lock in write mode * there should be no locks on any file * entries so we can destroy the object. */ if (fvfe->filename_len > 0) kmem_free(fvfe->filename, fvfe->filename_len); fvfe->filename = NULL; fvfe->filename_len = 0; rw_downgrade(&veriexec_op_lock); } log(LOG_NOTICE, "Veriexec: %s file `%s' renamed to " "%s file `%s', uid=%u, pid=%u.\n", (fvfe != NULL) ? "Monitored" : "Non-monitored", fromname, (tvfe != NULL) ? "monitored" : "non-monitored", toname, kauth_cred_geteuid(l->l_cred), l->l_proc->p_pid); rw_exit(&veriexec_op_lock); if (tvfe != NULL) { /* * Monitored file is overwritten. Remove the entry. 
*/ (void)veriexec_file_delete(l, tovp); } return (0); } static void veriexec_file_free(struct veriexec_file_entry *vfe) { if (vfe != NULL) { if (vfe->fp != NULL) kmem_free(vfe->fp, vfe->ops->hash_len); if (vfe->filename != NULL) kmem_free(vfe->filename, vfe->filename_len); rw_destroy(&vfe->lock); kmem_free(vfe, sizeof(*vfe)); } } static void veriexec_file_purge(struct veriexec_file_entry *vfe, int have_lock) { if (vfe == NULL) return; if (have_lock == VERIEXEC_UNLOCKED) rw_enter(&vfe->lock, RW_WRITER); else VERIEXEC_RW_UPGRADE(&vfe->lock); vfe->status = FINGERPRINT_NOTEVAL; if (have_lock == VERIEXEC_UNLOCKED) rw_exit(&vfe->lock); else rw_downgrade(&vfe->lock); } static void veriexec_file_purge_cb(struct veriexec_file_entry *vfe, void *cookie) { veriexec_file_purge(vfe, VERIEXEC_UNLOCKED); } /* * Invalidate a Veriexec file entry. * XXX: This should be updated when per-page fingerprints are added. */ void veriexec_purge(struct vnode *vp) { rw_enter(&veriexec_op_lock, RW_READER); veriexec_file_purge(veriexec_get(vp), VERIEXEC_UNLOCKED); rw_exit(&veriexec_op_lock); } /* * Enforce raw disk access policy. * * IDS mode: Invalidate fingerprints on a mount if it's opened for writing. * IPS mode: Don't allow raw writing to disks we monitor. * Lockdown mode: Don't allow raw writing to all disks. * * XXX: This is bogus. There's an obvious race condition between the time * XXX: the disk is open for writing, in which an attacker can access a * XXX: monitored file to get its signature cached again, and when the raw * XXX: file is overwritten on disk. * XXX: * XXX: To solve this, we need something like the following: * XXX: open raw disk: * XXX: - raise refcount, * XXX: - invalidate fingerprints, * XXX: - mark all entries for that disk with "no cache" flag * XXX: * XXX: veriexec_verify: * XXX: - if "no cache", don't cache evaluation result * XXX: * XXX: close raw disk: * XXX: - lower refcount, * XXX: - if refcount == 0, remove "no cache" flag from all entries */ static int veriexec_raw_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; enum kauth_device_req req; struct veriexec_table_entry *vte; result = KAUTH_RESULT_DENY; req = (enum kauth_device_req)(uintptr_t)arg0; switch (action) { case KAUTH_DEVICE_RAWIO_SPEC: { struct vnode *vp, *bvp; int error; if (req == KAUTH_REQ_DEVICE_RAWIO_SPEC_READ) { result = KAUTH_RESULT_DEFER; break; } vp = arg1; KASSERT(vp != NULL); /* Handle /dev/mem and /dev/kmem. */ if (iskmemvp(vp)) { if (veriexec_strict < VERIEXEC_IPS) result = KAUTH_RESULT_DEFER; break; } error = rawdev_mounted(vp, &bvp); if (error == EINVAL) { result = KAUTH_RESULT_DEFER; break; } /* * XXX: See vfs_mountedon() comment in rawdev_mounted(). */ vte = veriexec_table_lookup(bvp->v_mount); if (vte == NULL) { result = KAUTH_RESULT_DEFER; break; } switch (veriexec_strict) { case VERIEXEC_LEARNING: case VERIEXEC_IDS: result = KAUTH_RESULT_DEFER; rw_enter(&veriexec_op_lock, RW_WRITER); fileassoc_table_run(bvp->v_mount, veriexec_hook, (fileassoc_cb_t)veriexec_file_purge_cb, NULL); rw_exit(&veriexec_op_lock); break; case VERIEXEC_IPS: result = KAUTH_RESULT_DENY; break; case VERIEXEC_LOCKDOWN: result = KAUTH_RESULT_DENY; break; } break; } case KAUTH_DEVICE_RAWIO_PASSTHRU: /* XXX What can we do here? */ if (veriexec_strict < VERIEXEC_IPS) result = KAUTH_RESULT_DEFER; break; default: result = KAUTH_RESULT_DEFER; break; } return (result); } /* * Create a new Veriexec table. 
*/ static struct veriexec_table_entry * veriexec_table_add(struct lwp *l, struct mount *mp) { struct veriexec_table_entry *vte; u_char buf[16]; vte = kmem_zalloc(sizeof(*vte), KM_SLEEP); mount_setspecific(mp, veriexec_mountspecific_key, vte); snprintf(buf, sizeof(buf), "table%u", veriexec_tablecount++); sysctl_createv(NULL, 0, &veriexec_count_node, &vte->vte_node, 0, CTLTYPE_NODE, buf, NULL, NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(NULL, 0, &vte->vte_node, NULL, CTLFLAG_READONLY, CTLTYPE_STRING, "mntpt", NULL, NULL, 0, mp->mnt_stat.f_mntonname, 0, CTL_CREATE, CTL_EOL); sysctl_createv(NULL, 0, &vte->vte_node, NULL, CTLFLAG_READONLY, CTLTYPE_STRING, "fstype", NULL, NULL, 0, mp->mnt_stat.f_fstypename, 0, CTL_CREATE, CTL_EOL); sysctl_createv(NULL, 0, &vte->vte_node, NULL, CTLFLAG_READONLY, CTLTYPE_QUAD, "nentries", NULL, NULL, 0, &vte->vte_count, 0, CTL_CREATE, CTL_EOL); return (vte); } /* * Add a file to be monitored by Veriexec. * * Expected elements in dict: * file, fp, fp-type, entry-type, keep-filename, eval-on-load. */ int veriexec_file_add(struct lwp *l, prop_dictionary_t dict) { struct veriexec_table_entry *vte; struct veriexec_file_entry *vfe = NULL; struct veriexec_file_entry *ovfe; struct vnode *vp; const char *file, *fp_type; int error; bool ignore_dup = false; if (!prop_dictionary_get_string(dict, "file", &file)) return (EINVAL); error = namei_simple_kernel(file, NSM_FOLLOW_NOEMULROOT, &vp); if (error) return (error); /* Add only regular files. */ if (vp->v_type != VREG) { log(LOG_ERR, "Veriexec: Not adding `%s': Not a regular file.\n", file); error = EBADF; goto out; } vfe = kmem_zalloc(sizeof(*vfe), KM_SLEEP); rw_init(&vfe->lock); /* Lookup fingerprint hashing algorithm. */ fp_type = prop_string_value(prop_dictionary_get(dict, "fp-type")); if ((vfe->ops = veriexec_fpops_lookup(fp_type)) == NULL) { log(LOG_ERR, "Veriexec: Invalid or unknown fingerprint type " "`%s' for file `%s'.\n", fp_type, file); error = EOPNOTSUPP; goto out; } if (prop_data_size(prop_dictionary_get(dict, "fp")) != vfe->ops->hash_len) { log(LOG_ERR, "Veriexec: Bad fingerprint length for `%s'.\n", file); error = EINVAL; goto out; } vfe->fp = kmem_alloc(vfe->ops->hash_len, KM_SLEEP); memcpy(vfe->fp, prop_data_value(prop_dictionary_get(dict, "fp")), vfe->ops->hash_len); rw_enter(&veriexec_op_lock, RW_WRITER); /* Continue entry initialization. */ if (prop_dictionary_get_uint8(dict, "entry-type", &vfe->type) == FALSE) vfe->type = 0; else { uint8_t extra_flags; extra_flags = vfe->type & ~(VERIEXEC_DIRECT | VERIEXEC_INDIRECT | VERIEXEC_FILE | VERIEXEC_UNTRUSTED); if (extra_flags) { log(LOG_NOTICE, "Veriexec: Contaminated flags `0x%x' " "for `%s', skipping.\n", extra_flags, file); error = EINVAL; goto unlock_out; } } if (!(vfe->type & (VERIEXEC_DIRECT | VERIEXEC_INDIRECT | VERIEXEC_FILE))) vfe->type |= VERIEXEC_DIRECT; vfe->status = FINGERPRINT_NOTEVAL; if (prop_bool_true(prop_dictionary_get(dict, "keep-filename"))) { vfe->filename = kmem_strdupsize(file, &vfe->filename_len, KM_SLEEP); } else vfe->filename = NULL; if (prop_bool_true(prop_dictionary_get(dict, "eval-on-load")) || (vfe->type & VERIEXEC_UNTRUSTED)) { u_char status; error = veriexec_fp_status(l, vp, VERIEXEC_FILE_UNLOCKED, vfe, &status); if (error) goto unlock_out; vfe->status = status; } /* * If we already have an entry for this file, and it matches * the new entry exactly (except for the filename, which may * hard-linked!), we just ignore the new entry. If the new * entry differs, report the error. 
*/ if ((ovfe = veriexec_get(vp)) != NULL) { error = EEXIST; if (vfe->type == ovfe->type && vfe->status == ovfe->status && vfe->ops == ovfe->ops && memcmp(vfe->fp, ovfe->fp, vfe->ops->hash_len) == 0) ignore_dup = true; goto unlock_out; } vte = veriexec_table_lookup(vp->v_mount); if (vte == NULL) vte = veriexec_table_add(l, vp->v_mount); /* XXX if we bail below this, we might want to gc newly created vtes. */ error = fileassoc_add(vp, veriexec_hook, vfe); if (error) goto unlock_out; vte->vte_count++; veriexec_file_report(NULL, "New entry.", file, NULL, REPORT_DEBUG); veriexec_bypass = 0; unlock_out: rw_exit(&veriexec_op_lock); out: vrele(vp); if (error) veriexec_file_free(vfe); if (ignore_dup && error == EEXIST) error = 0; return (error); } int veriexec_table_delete(struct lwp *l, struct mount *mp) { struct veriexec_table_entry *vte; vte = veriexec_table_lookup(mp); if (vte == NULL) return (ENOENT); veriexec_mountspecific_dtor(vte); mount_setspecific(mp, veriexec_mountspecific_key, NULL); return (fileassoc_table_clear(mp, veriexec_hook)); } int veriexec_file_delete(struct lwp *l, struct vnode *vp) { struct veriexec_table_entry *vte; int error; vte = veriexec_table_lookup(vp->v_mount); if (vte == NULL) return (ENOENT); rw_enter(&veriexec_op_lock, RW_WRITER); error = fileassoc_clear(vp, veriexec_hook); rw_exit(&veriexec_op_lock); if (!error) { KASSERT(vte->vte_count > 0); vte->vte_count--; } return (error); } /* * Convert Veriexec entry data to a dictionary readable by userland tools. */ static void veriexec_file_convert(struct veriexec_file_entry *vfe, prop_dictionary_t rdict) { if (vfe->filename) prop_dictionary_set(rdict, "file", prop_string_create_copy(vfe->filename)); prop_dictionary_set_uint8(rdict, "entry-type", vfe->type); prop_dictionary_set_uint8(rdict, "status", vfe->status); prop_dictionary_set(rdict, "fp-type", prop_string_create_copy(vfe->ops->type)); prop_dictionary_set(rdict, "fp", prop_data_create_copy(vfe->fp, vfe->ops->hash_len)); } int veriexec_convert(struct vnode *vp, prop_dictionary_t rdict) { struct veriexec_file_entry *vfe; rw_enter(&veriexec_op_lock, RW_READER); vfe = veriexec_get(vp); if (vfe == NULL) { rw_exit(&veriexec_op_lock); return (ENOENT); } rw_enter(&vfe->lock, RW_READER); veriexec_file_convert(vfe, rdict); rw_exit(&vfe->lock); rw_exit(&veriexec_op_lock); return (0); } int veriexec_unmountchk(struct mount *mp) { int error; if ((veriexec_bypass && (veriexec_strict == VERIEXEC_LEARNING)) || doing_shutdown) return (0); rw_enter(&veriexec_op_lock, RW_READER); switch (veriexec_strict) { case VERIEXEC_LEARNING: error = 0; break; case VERIEXEC_IDS: if (veriexec_table_lookup(mp) != NULL) { log(LOG_INFO, "Veriexec: IDS mode, allowing unmount " "of \"%s\".\n", mp->mnt_stat.f_mntonname); } error = 0; break; case VERIEXEC_IPS: { struct veriexec_table_entry *vte; vte = veriexec_table_lookup(mp); if ((vte != NULL) && (vte->vte_count > 0)) { log(LOG_ALERT, "Veriexec: IPS mode, preventing" " unmount of \"%s\" with monitored files.\n", mp->mnt_stat.f_mntonname); error = EPERM; } else error = 0; break; } case VERIEXEC_LOCKDOWN: default: log(LOG_ALERT, "Veriexec: Lockdown mode, preventing unmount " "of \"%s\".\n", mp->mnt_stat.f_mntonname); error = EPERM; break; } rw_exit(&veriexec_op_lock); return (error); } int veriexec_openchk(struct lwp *l, struct vnode *vp, const char *path, int fmode) { struct veriexec_file_entry *vfe = NULL; int error = 0; if (veriexec_bypass && (veriexec_strict == VERIEXEC_LEARNING)) return 0; if (vp == NULL) { /* If no creation requested, let this 
fail normally. */ if (!(fmode & O_CREAT)) goto out; /* Lockdown mode: Prevent creation of new files. */ if (veriexec_strict >= VERIEXEC_LOCKDOWN) { log(LOG_ALERT, "Veriexec: Preventing new file " "creation in `%s'.\n", path); error = EPERM; } goto out; } rw_enter(&veriexec_op_lock, RW_READER); error = veriexec_file_verify(l, vp, path, VERIEXEC_FILE, VERIEXEC_FILE_LOCKED, &vfe); if (error) { rw_exit(&veriexec_op_lock); goto out; } if ((vfe != NULL) && ((fmode & FWRITE) || (fmode & O_TRUNC))) { veriexec_file_report(vfe, "Write access request.", path, l, REPORT_ALWAYS | REPORT_ALARM); /* IPS mode: Deny write access to monitored files. */ if (veriexec_strict >= VERIEXEC_IPS) error = EPERM; else veriexec_file_purge(vfe, VERIEXEC_LOCKED); } if (vfe != NULL) rw_exit(&vfe->lock); rw_exit(&veriexec_op_lock); out: return (error); } static void veriexec_file_dump(struct veriexec_file_entry *vfe, prop_array_t entries) { prop_dictionary_t entry; /* If we don't have a filename, this is meaningless. */ if (vfe->filename == NULL) return; entry = prop_dictionary_create(); veriexec_file_convert(vfe, entry); prop_array_add(entries, entry); } int veriexec_dump(struct lwp *l, prop_array_t rarray) { mount_iterator_t *iter; struct mount *mp; mountlist_iterator_init(&iter); while ((mp = mountlist_iterator_next(iter)) != NULL) { fileassoc_table_run(mp, veriexec_hook, (fileassoc_cb_t)veriexec_file_dump, rarray); } mountlist_iterator_destroy(iter); return (0); } int veriexec_flush(struct lwp *l) { mount_iterator_t *iter; struct mount *mp; int error = 0; mountlist_iterator_init(&iter); while ((mp = mountlist_iterator_next(iter)) != NULL) { int lerror; lerror = veriexec_table_delete(l, mp); if (lerror && lerror != ENOENT) error = lerror; } mountlist_iterator_destroy(iter); return (error); }
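/*
 * Usage sketch: building the proplib dictionary that veriexec_file_add()
 * consumes.  The path, digest buffer and flag choices below are
 * hypothetical placeholders (in practice veriexecctl(8) constructs the
 * dictionary from the signatures file and hands it to the kernel); only
 * the dictionary keys "file", "fp", "fp-type", "entry-type",
 * "keep-filename" and "eval-on-load", and the veriexec_file_add() call
 * itself, are taken from the code above.  The digest is assumed to be
 * SHA256_DIGEST_LENGTH bytes so the fingerprint length check passes.
 */
static int __unused
veriexec_file_add_example(struct lwp *l, const char *path,
    const uint8_t *digest, size_t digest_len)
{
	prop_dictionary_t dict;
	prop_object_t obj;
	int error;

	dict = prop_dictionary_create();
	if (dict == NULL)
		return ENOMEM;

	/* Mandatory: file path, fingerprint type and fingerprint data. */
	obj = prop_string_create_copy(path);
	prop_dictionary_set(dict, "file", obj);
	prop_object_release(obj);

	obj = prop_string_create_copy("SHA256");
	prop_dictionary_set(dict, "fp-type", obj);
	prop_object_release(obj);

	obj = prop_data_create_copy(digest, digest_len);
	prop_dictionary_set(dict, "fp", obj);
	prop_object_release(obj);

	/* Optional: access type and evaluation policy. */
	prop_dictionary_set_uint8(dict, "entry-type", VERIEXEC_DIRECT);
	prop_dictionary_set_bool(dict, "keep-filename", true);
	prop_dictionary_set_bool(dict, "eval-on-load", true);

	error = veriexec_file_add(l, dict);
	prop_object_release(dict);

	return error;
}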
/* $NetBSD: nfs_export.c,v 1.63 2021/06/04 10:44:58 hannken Exp $ */ /*-
* Copyright (c) 1997, 1998, 2004, 2005, 2008, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 */ /* * VFS exports list management. * * Lock order: vfs_busy -> mnt_updating -> netexport_lock. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: nfs_export.c,v 1.63 2021/06/04 10:44:58 hannken Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/queue.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/errno.h> #include <sys/malloc.h> #include <sys/domain.h> #include <sys/mbuf.h> #include <sys/dirent.h> #include <sys/socket.h> /* XXX for AF_MAX */ #include <sys/kauth.h> #include <net/radix.h> #include <netinet/in.h> #include <nfs/rpcv2.h> #include <nfs/nfsproto.h> #include <nfs/nfs.h> #include <nfs/nfs_var.h> /* * Network address lookup element. */ struct netcred { struct radix_node netc_rnodes[2]; int netc_refcnt; int netc_exflags; kauth_cred_t netc_anon; }; /* * Network export information. */ struct netexport { TAILQ_ENTRY(netexport) ne_list; struct mount *ne_mount; struct netcred ne_defexported; /* Default export */ struct radix_node_head *ne_rtable[AF_MAX+1]; /* Individual exports */ }; TAILQ_HEAD(, netexport) netexport_list = TAILQ_HEAD_INITIALIZER(netexport_list); /* Publicly exported file system. */ struct nfs_public nfs_pub; /* * Local prototypes. */ static int init_exports(struct mount *, struct netexport **); static int hang_addrlist(struct mount *, struct netexport *, const struct export_args *); static int sacheck(struct sockaddr *); static int free_netcred(struct radix_node *, void *); static int export(struct netexport *, const struct export_args *); static int setpublicfs(struct mount *, struct netexport *, const struct export_args *); static struct netcred *netcred_lookup(struct netexport *, struct mbuf *); static struct netexport *netexport_lookup(const struct mount *); static struct netexport *netexport_lookup_byfsid(const fsid_t *); static void netexport_clear(struct netexport *); static void netexport_insert(struct netexport *); static void netexport_remove(struct netexport *); static void netexport_wrlock(void); static void netexport_wrunlock(void); static int nfs_export_update_30(struct mount *mp, const char *path, void *); static krwlock_t netexport_lock; /* * PUBLIC INTERFACE */ /* * Declare and initialize the file system export hooks. */ static void netexport_unmount(struct mount *); struct vfs_hooks nfs_export_hooks = { { NULL, NULL }, .vh_unmount = netexport_unmount, .vh_reexport = nfs_export_update_30, }; /* * VFS unmount hook for NFS exports. * * Releases NFS exports list resources if the given mount point has some. * As allocation happens lazily, it may be that it doesn't have this * information, although it theoretically should. 
*/ static void netexport_unmount(struct mount *mp) { struct netexport *ne; KASSERT(mp != NULL); netexport_wrlock(); ne = netexport_lookup(mp); if (ne == NULL) { netexport_wrunlock(); return; } netexport_clear(ne); netexport_remove(ne); netexport_wrunlock(); kmem_free(ne, sizeof(*ne)); } void netexport_init(void) { rw_init(&netexport_lock); } void netexport_fini(void) { struct netexport *ne; struct mount *mp; int error; while (!TAILQ_EMPTY(&netexport_list)) { netexport_wrlock(); ne = TAILQ_FIRST(&netexport_list); mp = ne->ne_mount; error = vfs_busy(mp); netexport_wrunlock(); if (error != 0) { kpause("nfsfini", false, hz, NULL); continue; } mutex_enter(mp->mnt_updating); /* mnt_flag */ netexport_unmount(mp); mutex_exit(mp->mnt_updating); /* mnt_flag */ vfs_unbusy(mp); } rw_destroy(&netexport_lock); } /* * Atomically set the NFS exports list of the given file system, replacing * it with a new list of entries. * * Returns zero on success or an appropriate error code otherwise. * * Helper function for the nfssvc(2) system call (NFSSVC_SETEXPORTSLIST * and NFSSVC_REPLACEEXPORTSLIST command). */ int mountd_set_exports_list(const struct mountd_exports_list *mel, struct lwp *l, struct mount *nmp, int cmd) { int error; size_t i; struct mount *mp; struct netexport *ne; struct pathbuf *pb; struct nameidata nd; struct vnode *vp; size_t fid_size; if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_NFS, KAUTH_REQ_NETWORK_NFS_EXPORT, NULL, NULL, NULL) != 0) return EPERM; /* Look up the file system path. */ error = pathbuf_copyin(mel->mel_path, &pb); if (error) { return error; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pb); error = namei(&nd); if (error != 0) { pathbuf_destroy(pb); return error; } vp = nd.ni_vp; mp = vp->v_mount; KASSERT(nmp == NULL || nmp == mp); pathbuf_destroy(pb); /* * Make sure the file system can do vptofh. If the file system * knows the handle's size, just trust it's able to do the * actual translation also (otherwise we should check fhtovp * also, and that's getting a wee bit ridiculous). */ fid_size = 0; if ((error = VFS_VPTOFH(vp, NULL, &fid_size)) != E2BIG) { vput(vp); return EOPNOTSUPP; } /* Mark the file system busy. 
*/ error = vfs_busy(mp); vput(vp); if (error != 0) return error; if (nmp == NULL) mutex_enter(mp->mnt_updating); /* mnt_flag */ netexport_wrlock(); ne = netexport_lookup(mp); if (ne == NULL) { error = init_exports(mp, &ne); if (error != 0) { goto out; } } KASSERT(ne != NULL); KASSERT(ne->ne_mount == mp); if (cmd == NFSSVC_SETEXPORTSLIST) { if (mel->mel_nexports == 0) netexport_clear(ne); else if (mel->mel_nexports == 1) error = export(ne, &mel->mel_exports[0]); else { printf("%s: Cannot set more than one " "entry at once (unimplemented)\n", __func__); error = EOPNOTSUPP; } } else if (cmd == NFSSVC_REPLACEEXPORTSLIST) { netexport_clear(ne); for (i = 0; error == 0 && i < mel->mel_nexports; i++) error = export(ne, &mel->mel_exports[i]); } else { printf("%s: Command %#x not implemented\n", __func__, cmd); error = EOPNOTSUPP; } out: netexport_wrunlock(); if (nmp == NULL) mutex_exit(mp->mnt_updating); /* mnt_flag */ vfs_unbusy(mp); return error; } static void netexport_insert(struct netexport *ne) { TAILQ_INSERT_HEAD(&netexport_list, ne, ne_list); } static void netexport_remove(struct netexport *ne) { TAILQ_REMOVE(&netexport_list, ne, ne_list); } static struct netexport * netexport_lookup(const struct mount *mp) { struct netexport *ne; TAILQ_FOREACH(ne, &netexport_list, ne_list) { if (ne->ne_mount == mp) { goto done; } } ne = NULL; done: return ne; } static struct netexport * netexport_lookup_byfsid(const fsid_t *fsid) { struct netexport *ne; TAILQ_FOREACH(ne, &netexport_list, ne_list) { const struct mount *mp = ne->ne_mount; if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] && mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) { goto done; } } ne = NULL; done: return ne; } /* * Check if the file system specified by the 'mp' mount structure is * exported to a client with 'anon' anonymous credentials. The 'mb' * argument is an mbuf containing the network address of the client. * The return parameters for the export flags for the client are returned * in the address specified by 'wh'. * * This function is used exclusively by the NFS server. It is generally * invoked before VFS_FHTOVP to validate that a client has access to the * file system. */ int netexport_check(const fsid_t *fsid, struct mbuf *mb, struct mount **mpp, int *wh, kauth_cred_t *anon) { struct netexport *ne; struct netcred *np; ne = netexport_lookup_byfsid(fsid); if (ne == NULL) { return EACCES; } np = netcred_lookup(ne, mb); if (np == NULL) { return EACCES; } *mpp = ne->ne_mount; *wh = np->netc_exflags; *anon = np->netc_anon; return 0; } /* * Handles legacy export requests. In this case, the export information * is hardcoded in a specific place of the mount arguments structure (given * in data); the request for an update is given through the fspec field * (also in a known location), which must be a null pointer. * * Returns EJUSTRETURN if the given command was not a export request. * Otherwise, returns 0 on success or an appropriate error code otherwise. */ static int nfs_export_update_30(struct mount *mp, const char *path, void *data) { struct mountd_exports_list mel; struct mnt_export_args30 *args; args = data; mel.mel_path = path; if (args->fspec != NULL) return EJUSTRETURN; if (args->eargs.ex_flags & 0x00020000) { /* Request to delete exports. The mask above holds the * value that used to be in MNT_DELEXPORT. */ mel.mel_nexports = 0; } else { /* * The following code assumes export_args has not * changed since export_args30, so check that. 
*/ __CTASSERT(sizeof(args->eargs) == sizeof(*mel.mel_exports)); mel.mel_nexports = 1; mel.mel_exports = (void *)&args->eargs; } return mountd_set_exports_list(&mel, curlwp, mp, NFSSVC_SETEXPORTSLIST); } /* * INTERNAL FUNCTIONS */ /* * Initializes NFS exports for the mountpoint given in 'mp'. * If successful, returns 0 and sets *nep to the address of the new * netexport item; otherwise returns an appropriate error code * and *nep remains unmodified. */ static int init_exports(struct mount *mp, struct netexport **nep) { int error; struct export_args ea; struct netexport *ne; KASSERT(mp != NULL); /* Ensure that we do not already have this mount point. */ KASSERT(netexport_lookup(mp) == NULL); ne = kmem_zalloc(sizeof(*ne), KM_SLEEP); ne->ne_mount = mp; /* Set the default export entry. Handled internally by export upon * first call. */ memset(&ea, 0, sizeof(ea)); ea.ex_root = -2; if (mp->mnt_flag & MNT_RDONLY) ea.ex_flags |= MNT_EXRDONLY; error = export(ne, &ea); if (error != 0) { kmem_free(ne, sizeof(*ne)); } else { netexport_insert(ne); *nep = ne; } return error; } /* * Build hash lists of net addresses and hang them off the mount point. * Called by export() to set up a new entry in the lists of export * addresses. */ static int hang_addrlist(struct mount *mp, struct netexport *nep, const struct export_args *argp) { int error, i; struct netcred *np, *enp; struct radix_node_head *rnh; struct sockaddr *saddr, *smask; struct domain *dom; smask = NULL; if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) return EPERM; np = &nep->ne_defexported; KASSERT(np->netc_anon == NULL); np->netc_anon = kauth_cred_alloc(); np->netc_exflags = argp->ex_flags; kauth_uucred_to_cred(np->netc_anon, &argp->ex_anon); mp->mnt_flag |= MNT_DEFEXPORTED; return 0; } if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN) return EINVAL; i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = malloc(i, M_NETADDR, M_WAITOK | M_ZERO); np->netc_anon = kauth_cred_alloc(); saddr = (struct sockaddr *)(np + 1); error = copyin(argp->ex_addr, saddr, argp->ex_addrlen); if (error) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (sacheck(saddr) == -1) { error = EINVAL; goto out; } if (argp->ex_masklen) { smask = (struct sockaddr *)((char *)saddr + argp->ex_addrlen); error = copyin(argp->ex_mask, smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; if (smask->sa_family != saddr->sa_family) { error = EINVAL; goto out; } if (sacheck(smask) == -1) { error = EINVAL; goto out; } } i = saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* * Seems silly to initialize every AF when most are not * used, do so on demand here. 
*/ DOMAIN_FOREACH(dom) { if (dom->dom_family == i && dom->dom_rtattach) { rn_inithead((void **)&nep->ne_rtable[i], dom->dom_rtoffset); break; } } if ((rnh = nep->ne_rtable[i]) == 0) { error = ENOBUFS; goto out; } } enp = (struct netcred *)(*rnh->rnh_addaddr)(saddr, smask, rnh, np->netc_rnodes); if (enp != np) { if (enp == NULL) { enp = (struct netcred *)(*rnh->rnh_lookup)(saddr, smask, rnh); if (enp == NULL) { error = EPERM; goto out; } } else enp->netc_refcnt++; goto check; } else enp->netc_refcnt = 1; np->netc_exflags = argp->ex_flags; kauth_uucred_to_cred(np->netc_anon, &argp->ex_anon); return 0; check: if (enp->netc_exflags != argp->ex_flags || kauth_cred_uucmp(enp->netc_anon, &argp->ex_anon) != 0) error = EPERM; else error = 0; out: KASSERT(np->netc_anon != NULL); kauth_cred_free(np->netc_anon); free(np, M_NETADDR); return error; } /* * Ensure that the address stored in 'sa' is valid. * Returns zero on success, otherwise -1. */ static int sacheck(struct sockaddr *sa) { switch (sa->sa_family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sa; char *p = (char *)sin->sin_zero; size_t i; if (sin->sin_len != sizeof(*sin)) return -1; if (sin->sin_port != 0) return -1; for (i = 0; i < sizeof(sin->sin_zero); i++) if (*p++ != '\0') return -1; return 0; } case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; if (sin6->sin6_len != sizeof(*sin6)) return -1; if (sin6->sin6_port != 0) return -1; return 0; } default: return -1; } } /* * Free the netcred object pointed to by the 'rn' radix node. * 'w' holds a pointer to the radix tree head. */ static int free_netcred(struct radix_node *rn, void *w) { struct radix_node_head *rnh = (struct radix_node_head *)w; struct netcred *np = (struct netcred *)(void *)rn; (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh); if (--(np->netc_refcnt) <= 0) { KASSERT(np->netc_anon != NULL); kauth_cred_free(np->netc_anon); free(np, M_NETADDR); } return 0; } /* * Clears the exports list for a given file system. */ static void netexport_clear(struct netexport *ne) { struct radix_node_head *rnh; struct mount *mp = ne->ne_mount; int i; if (mp->mnt_flag & MNT_EXPUBLIC) { setpublicfs(NULL, NULL, NULL); mp->mnt_flag &= ~MNT_EXPUBLIC; } for (i = 0; i <= AF_MAX; i++) { if ((rnh = ne->ne_rtable[i]) != NULL) { rn_walktree(rnh, free_netcred, rnh); free(rnh, M_RTABLE); ne->ne_rtable[i] = NULL; } } if ((mp->mnt_flag & MNT_DEFEXPORTED) != 0) { struct netcred *np = &ne->ne_defexported; KASSERT(np->netc_anon != NULL); kauth_cred_free(np->netc_anon); np->netc_anon = NULL; } else { KASSERT(ne->ne_defexported.netc_anon == NULL); } mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } /* * Add a new export entry (described by an export_args structure) to the * given file system. */ static int export(struct netexport *nep, const struct export_args *argp) { struct mount *mp = nep->ne_mount; int error; if (argp->ex_flags & MNT_EXPORTED) { if (argp->ex_flags & MNT_EXPUBLIC) { if ((error = setpublicfs(mp, nep, argp)) != 0) return error; mp->mnt_flag |= MNT_EXPUBLIC; } if ((error = hang_addrlist(mp, nep, argp)) != 0) return error; mp->mnt_flag |= MNT_EXPORTED; } return 0; } /* * Set the publicly exported filesystem (WebNFS). Currently, only * one public filesystem is possible in the spec (RFC 2054 and 2055) */ static int setpublicfs(struct mount *mp, struct netexport *nep, const struct export_args *argp) { char *cp; int error; struct vnode *rvp; size_t fhsize; /* * mp == NULL --> invalidate the current info; the FS is * no longer exported. 
May be called from either export * or unmount, so check if it hasn't already been done. */ if (mp == NULL) { if (nfs_pub.np_valid) { nfs_pub.np_valid = 0; if (nfs_pub.np_handle != NULL) { free(nfs_pub.np_handle, M_TEMP); nfs_pub.np_handle = NULL; } if (nfs_pub.np_index != NULL) { free(nfs_pub.np_index, M_TEMP); nfs_pub.np_index = NULL; } } return 0; } /* * Only one allowed at a time. */ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) return EBUSY; /* * Get real filehandle for root of exported FS. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp))) return error; fhsize = 0; error = vfs_composefh(rvp, NULL, &fhsize); if (error != E2BIG) return error; nfs_pub.np_handle = malloc(fhsize, M_TEMP, M_NOWAIT); if (nfs_pub.np_handle == NULL) error = ENOMEM; else error = vfs_composefh(rvp, nfs_pub.np_handle, &fhsize); if (error) return error; vput(rvp); /* * If an indexfile was specified, pull it in. */ if (argp->ex_indexfile != NULL) { nfs_pub.np_index = malloc(NFS_MAXNAMLEN + 1, M_TEMP, M_WAITOK); error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, NFS_MAXNAMLEN, (size_t *)0); if (!error) { /* * Check for illegal filenames. */ for (cp = nfs_pub.np_index; *cp; cp++) { if (*cp == '/') { error = EINVAL; break; } } } if (error) { free(nfs_pub.np_index, M_TEMP); return error; } } nfs_pub.np_mount = mp; nfs_pub.np_valid = 1; return 0; } /* * Look up an export entry in the exports list that matches the address * stored in 'nam'. If no entry is found, the default one is used instead * (if available). */ static struct netcred * netcred_lookup(struct netexport *ne, struct mbuf *nam) { struct netcred *np; struct radix_node_head *rnh; struct sockaddr *saddr; if ((ne->ne_mount->mnt_flag & MNT_EXPORTED) == 0) { return NULL; } /* * Look in the export list first. */ np = NULL; if (nam != NULL) { saddr = mtod(nam, struct sockaddr *); rnh = ne->ne_rtable[saddr->sa_family]; if (rnh != NULL) { np = (struct netcred *) (*rnh->rnh_matchaddr)((void *)saddr, rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. */ if (np == NULL && ne->ne_mount->mnt_flag & MNT_DEFEXPORTED) np = &ne->ne_defexported; return np; } void netexport_rdlock(void) { rw_enter(&netexport_lock, RW_READER); } void netexport_rdunlock(void) { rw_exit(&netexport_lock); } static void netexport_wrlock(void) { rw_enter(&netexport_lock, RW_WRITER); } static void netexport_wrunlock(void) { rw_exit(&netexport_lock); } bool netexport_hasexports(void) { return nfs_pub.np_valid || !TAILQ_EMPTY(&netexport_list); }
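/*
 * Usage sketch for netexport_check(): an approximation of what the NFS
 * server's file handle translation path does before VFS_FHTOVP.  The
 * function name, flow and the read-locking discipline shown are
 * assumptions for illustration; only netexport_check(),
 * netexport_rdlock() and netexport_rdunlock() come from this file.
 */
static int __unused
netexport_check_example(const fsid_t *fsid, struct mbuf *nam)
{
	struct mount *mp;
	kauth_cred_t anon;
	int exflags;
	int error;

	/* Assumed: callers serialize against export updates with the read lock. */
	netexport_rdlock();
	error = netexport_check(fsid, nam, &mp, &exflags, &anon);
	if (error != 0) {
		/* EACCES: file system not exported to this client address. */
		netexport_rdunlock();
		return error;
	}

	/*
	 * 'mp' is the exported mount, 'exflags' carries MNT_EXRDONLY and
	 * friends, and 'anon' is the credential to substitute for
	 * unauthenticated requests.  A real caller would now translate
	 * the client's file handle on 'mp' (VFS_FHTOVP) and honour
	 * 'exflags' while serving the request.
	 */
	(void)mp;
	(void)exflags;
	(void)anon;
	netexport_rdunlock();

	return 0;
}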
/* $NetBSD: subr_pcq.c,v 1.20 2023/02/24 11:02:27 riastradh Exp $ */ /*- * Copyright (c) 2009, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Lockless producer/consumer queue. * * Summary of the producer algorithm in pcq_put (may run many in * parallel with each other and with a consumer): * * P1. initialize an item * * P2. atomic_cas(&pcq->pcq_pc) loop to advance the producer * pointer, reserving a space at c (fails if not enough space) * * P3. atomic_store_release(&pcq->pcq_items[c], item) to publish * the item in the space it reserved * * Summary of the consumer algorithm in pcq_get (must be serialized by * caller with other consumers, may run in parallel with any number of * producers): * * C1. atomic_load_relaxed(&pcq->pcq_pc) to get the consumer * pointer and a snapshot of the producer pointer, which may * point to null items or point to initialized items (fails if * no space reserved for published items yet) * * C2.
atomic_load_consume(&pcq->pcq_items[c]) to get the next * unconsumed but potentially published item (fails if item * not published yet) * * C3. pcq->pcq_items[c] = NULL to consume the next unconsumed but * published item * * C4. membar_producer * * C5. atomic_cas(&pcq->pcq_pc) loop to advance the consumer * pointer * * C6. use the item * * Note that there is a weird bare membar_producer which is not matched * by membar_consumer. This is one of the rare cases of a memory * barrier on one side that is not matched by a memory barrier on * another side, but the ordering works out, with a somewhat more * involved proof. * * Some properties that need to be proved: * * Theorem 1. For pcq_put call that leads into pcq_get: * Initializing item at P1 is dependency-ordered before usage of * item at C6, so items placed by pcq_put can be safely used by * the caller of pcq_get. * * Proof sketch. * * Assume load/store P2 synchronizes with load/store C1 * (if not, pcq_get fails in `if (p == c) return NULL'). * * Assume store-release P3 synchronizes with load-consume * C2 (if not, pcq_get fails in `if (item == NULL) return * NULL'). * * Then: * * - P1 is sequenced before store-release P3 * - store-release P3 synchronizes with load-consume C2 * - load-consume C2 is dependency-ordered before C6 * * Hence transitively, P1 is dependency-ordered before C6, * QED. * * Theorem 2. For pcq_get call followed by pcq_put: Nulling out * location at store C3 happens before placing a new item in the * same location at store P3, so items are not lost. * * Proof sketch. * * Assume load/store C5 synchronizes with load/store P2 * (otherwise pcq_peek starts over the CAS loop or fails). * * Then: * * - store C3 is sequenced before membar_producer C4 * - membar_producer C4 is sequenced before load/store C5 * - load/store C5 synchronizes with load/store P2 at &pcq->pcq_pc * - P2 is sequenced before store-release P3 * * Hence transitively, store C3 happens before * store-release P3, QED. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_pcq.c,v 1.20 2023/02/24 11:02:27 riastradh Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/atomic.h> #include <sys/kmem.h> #include <sys/pcq.h> /* * Internal producer-consumer queue structure. Note: providing a separate * cache-line both for pcq_t::pcq_pc and pcq_t::pcq_items. */ struct pcq { u_int pcq_nitems; uint8_t pcq_pad1[COHERENCY_UNIT - sizeof(u_int)]; volatile uint32_t pcq_pc; uint8_t pcq_pad2[COHERENCY_UNIT - sizeof(uint32_t)]; void * volatile pcq_items[]; }; /* * Producer (p) - stored in the lower 16 bits of pcq_t::pcq_pc. * Consumer (c) - in the higher 16 bits. * * We have a limitation of 16 bits i.e. 0xffff items in the queue. * The PCQ_MAXLEN constant is set accordingly. */ static inline void pcq_split(uint32_t v, u_int *p, u_int *c) { *p = v & 0xffff; *c = v >> 16; } static inline uint32_t pcq_combine(u_int p, u_int c) { return p | (c << 16); } static inline u_int pcq_advance(pcq_t *pcq, u_int pc) { if (__predict_false(++pc == pcq->pcq_nitems)) { return 0; } return pc; } /* * pcq_put: place an item at the end of the queue. */ bool pcq_put(pcq_t *pcq, void *item) { uint32_t v, nv; u_int op, p, c; KASSERT(item != NULL); do { v = atomic_load_relaxed(&pcq->pcq_pc); pcq_split(v, &op, &c); p = pcq_advance(pcq, op); if (p == c) { /* Queue is full. */ return false; } nv = pcq_combine(p, c); } while (atomic_cas_32(&pcq->pcq_pc, v, nv) != v); /* * Ensure that the update to pcq_pc is globally visible before the * data item. See pcq_get(). 
This also ensures that any changes * that the caller made to the data item are globally visible * before we put it onto the list. */ atomic_store_release(&pcq->pcq_items[op], item); /* * Synchronization activity to wake up the consumer will ensure * that the update to pcq_items[] is visible before the wakeup * arrives. So, we do not need an additional memory barrier here. */ return true; } /* * pcq_peek: return the next item from the queue without removal. */ void * pcq_peek(pcq_t *pcq) { const uint32_t v = atomic_load_relaxed(&pcq->pcq_pc); u_int p, c; pcq_split(v, &p, &c); /* See comment on race below in pcq_get(). */ return (p == c) ? NULL : atomic_load_consume(&pcq->pcq_items[c]); } /* * pcq_get: remove and return the next item for consumption or NULL if empty. * * => The caller must prevent concurrent gets from occurring. */ void * pcq_get(pcq_t *pcq) { uint32_t v, nv; u_int p, c; void *item; v = atomic_load_relaxed(&pcq->pcq_pc); pcq_split(v, &p, &c); if (p == c) { /* Queue is empty: nothing to return. */ return NULL; } item = atomic_load_consume(&pcq->pcq_items[c]); if (item == NULL) { /* * Raced with sender: we rely on a notification (e.g. softint * or wakeup) being generated after the producer's pcq_put(), * causing us to retry pcq_get() later. */ return NULL; } /* * We have exclusive access to this slot, so no need for * atomic_store_*. */ pcq->pcq_items[c] = NULL; c = pcq_advance(pcq, c); nv = pcq_combine(p, c); /* * Ensure that update to pcq_items[c] becomes globally visible * before the update to pcq_pc. If it were reordered to occur * after it, we could in theory wipe out a modification made * to pcq_items[c] by pcq_put(). * * No need for load-before-store ordering of membar_release * because the only load we need to ensure happens first is the * load of pcq->pcq_items[c], but that necessarily happens * before the store to pcq->pcq_items[c] to null it out because * it is at the same memory location. Yes, this is a bare * membar_producer with no matching membar_consumer. */ membar_producer(); while (__predict_false(atomic_cas_32(&pcq->pcq_pc, v, nv) != v)) { v = atomic_load_relaxed(&pcq->pcq_pc); pcq_split(v, &p, &c); c = pcq_advance(pcq, c); nv = pcq_combine(p, c); } return item; } pcq_t * pcq_create(size_t nitems, km_flag_t kmflags) { pcq_t *pcq; KASSERT(nitems > 0); KASSERT(nitems <= PCQ_MAXLEN); pcq = kmem_zalloc(offsetof(pcq_t, pcq_items[nitems]), kmflags); if (pcq != NULL) { pcq->pcq_nitems = nitems; } return pcq; } void pcq_destroy(pcq_t *pcq) { kmem_free(pcq, offsetof(pcq_t, pcq_items[pcq->pcq_nitems])); } size_t pcq_maxitems(pcq_t *pcq) { return pcq->pcq_nitems; }
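/*
 * Usage sketch for the queue implemented above: any number of producers
 * may call pcq_put() (for example from interrupt handlers), while a
 * single serialized consumer drains the queue.  The work item structure
 * and function names are hypothetical; the pcq_create()/pcq_put()/
 * pcq_get()/pcq_destroy() calls and their semantics come from this file.
 */

struct pcq_example_work {
	int	pew_id;
};

static pcq_t *pcq_example_queue;

static void __unused
pcq_example_init(void)
{
	/* Room for up to 128 outstanding items; KM_SLEEP may block. */
	pcq_example_queue = pcq_create(128, KM_SLEEP);
}

static bool __unused
pcq_example_produce(struct pcq_example_work *pew)
{
	/* Returns false, leaving the item unqueued, if the queue is full. */
	return pcq_put(pcq_example_queue, pew);
}

static void __unused
pcq_example_consume(void)
{
	struct pcq_example_work *pew;

	/* The caller must ensure only one consumer runs at a time. */
	while ((pew = pcq_get(pcq_example_queue)) != NULL) {
		/* ... process *pew ... */
	}
}

static void __unused
pcq_example_fini(void)
{
	pcq_destroy(pcq_example_queue);
}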
/* $NetBSD: pmap.h,v 1.134 2022/08/20 23:49:31 riastradh Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Frank van der Linden for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * pmap.h: see pmap.c for the history of this pmap module. */ #ifndef _X86_PMAP_H_ #define _X86_PMAP_H_ #if defined(_KERNEL) #include <x86/pmap_pv.h> #include <uvm/pmap/pmap_pvt.h> /* * MD flags that we use for pmap_enter and pmap_kenter_pa: */ /* * macros */ #define pmap_clear_modify(pg) pmap_clear_attrs(pg, PP_ATTRS_D) #define pmap_clear_reference(pg) pmap_clear_attrs(pg, PP_ATTRS_A) #define pmap_copy(DP,SP,D,L,S) __USE(L) #define pmap_is_modified(pg) pmap_test_attrs(pg, PP_ATTRS_D) #define pmap_is_referenced(pg) pmap_test_attrs(pg, PP_ATTRS_A) #define pmap_move(DP,SP,D,L,S) #define pmap_phys_address(ppn) (x86_ptob(ppn) & ~X86_MMAP_FLAG_MASK) #define pmap_mmap_flags(ppn) x86_mmap_flags(ppn) #if defined(__x86_64__) || defined(PAE) #define X86_MMAP_FLAG_SHIFT (64 - PGSHIFT) #else #define X86_MMAP_FLAG_SHIFT (32 - PGSHIFT) #endif #define X86_MMAP_FLAG_MASK 0xf #define X86_MMAP_FLAG_PREFETCH 0x1 /* * prototypes */ void pmap_activate(struct lwp *); void pmap_bootstrap(vaddr_t); bool pmap_clear_attrs(struct vm_page *, unsigned); bool pmap_pv_clear_attrs(paddr_t, unsigned); void pmap_deactivate(struct lwp *); void pmap_page_remove(struct vm_page *); void pmap_pv_remove(paddr_t); void pmap_remove(struct pmap *, vaddr_t, vaddr_t); bool pmap_test_attrs(struct vm_page *, unsigned); void pmap_write_protect(struct pmap *, vaddr_t, vaddr_t, vm_prot_t); void pmap_load(void); paddr_t pmap_init_tmp_pgtbl(paddr_t); bool pmap_remove_all(struct pmap *); void pmap_ldt_cleanup(struct lwp *); void pmap_ldt_sync(struct pmap *); void pmap_kremove_local(vaddr_t, vsize_t); #define __HAVE_PMAP_PV_TRACK 1 void pmap_pv_init(void); void pmap_pv_track(paddr_t, psize_t); void pmap_pv_untrack(paddr_t, psize_t); u_int x86_mmap_flags(paddr_t); #define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */ #define PMAP_FORK /* turn on pmap_fork interface */ /* * inline functions */ /* * pmap_page_protect: change the protection of all recorded mappings * of a managed page * * => this function is a frontend for pmap_page_remove/pmap_clear_attrs * => we only have to worry about making the page more protected. * unprotecting a page is done on-demand at fault time. */ __inline static void __unused pmap_page_protect(struct vm_page *pg, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) { (void)pmap_clear_attrs(pg, PP_ATTRS_W); } else { pmap_page_remove(pg); } } } /* * pmap_pv_protect: change the protection of all recorded mappings * of an unmanaged page */ __inline static void __unused pmap_pv_protect(paddr_t pa, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) { (void)pmap_pv_clear_attrs(pa, PP_ATTRS_W); } else { pmap_pv_remove(pa); } } } /* * pmap_protect: change the protection of pages in a pmap * * => this function is a frontend for pmap_remove/pmap_write_protect * => we only have to worry about making the page more protected. * unprotecting a page is done on-demand at fault time. 
*/ __inline static void __unused pmap_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) { pmap_write_protect(pmap, sva, eva, prot); } else { pmap_remove(pmap, sva, eva); } } } paddr_t vtophys(vaddr_t); vaddr_t pmap_map(vaddr_t, paddr_t, paddr_t, vm_prot_t); void pmap_cpu_init_late(struct cpu_info *); /* pmap functions with machine addresses */ void pmap_kenter_ma(vaddr_t, paddr_t, vm_prot_t, u_int); int pmap_enter_ma(struct pmap *, vaddr_t, paddr_t, paddr_t, vm_prot_t, u_int, int); bool pmap_extract_ma(pmap_t, vaddr_t, paddr_t *); paddr_t pmap_get_physpage(void); /* * Hooks for the pool allocator. */ #define POOL_VTOPHYS(va) vtophys((vaddr_t) (va)) #ifdef __HAVE_DIRECT_MAP extern vaddr_t pmap_direct_base; extern vaddr_t pmap_direct_end; #define PMAP_DIRECT_BASE pmap_direct_base #define PMAP_DIRECT_END pmap_direct_end #define PMAP_DIRECT_MAP(pa) ((vaddr_t)PMAP_DIRECT_BASE + (pa)) #define PMAP_DIRECT_UNMAP(va) ((paddr_t)(va) - PMAP_DIRECT_BASE) /* * Alternate mapping hooks for pool pages. */ #define PMAP_MAP_POOLPAGE(pa) PMAP_DIRECT_MAP((pa)) #define PMAP_UNMAP_POOLPAGE(va) PMAP_DIRECT_UNMAP((va)) #endif /* __HAVE_DIRECT_MAP */ #define __HAVE_VM_PAGE_MD #define VM_MDPAGE_INIT(pg) \ memset(&(pg)->mdpage, 0, sizeof((pg)->mdpage)); \ PMAP_PAGE_INIT(&(pg)->mdpage.mp_pp) struct vm_page_md { struct pmap_page mp_pp; }; #endif /* _KERNEL */ #endif /* _X86_PMAP_H_ */
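/*
 * Illustrative sketch, not part of the header above: how the protection
 * frontends defined in it are meant to be used.  Callers only ever tighten
 * protections this way; the "example_" names are hypothetical.
 */
static __inline void
example_revoke_write(struct vm_page *pg)
{

	/* Keeps read/execute mappings, clears PP_ATTRS_W so writes fault. */
	pmap_page_protect(pg, VM_PROT_READ);
}

static __inline void
example_unmap_all(struct vm_page *pg)
{

	/* VM_PROT_NONE falls through to pmap_page_remove(). */
	pmap_page_protect(pg, VM_PROT_NONE);
}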
/* $NetBSD: radio.c,v 1.31 2021/08/07 16:19:08 thorpej Exp $ */ /* $OpenBSD: radio.c,v 1.2 2001/12/05 10:27:06 mickey Exp $ */ /* $RuOBSD: radio.c,v 1.7 2001/12/04 06:03:05 tm Exp $ */ /* * Copyright (c) 2001 Maxim Tsyplakov <tm@oganer.net> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ /* This is the /dev/radio driver from OpenBSD */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: radio.c,v 1.31 2021/08/07 16:19:08 thorpej Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/device.h> #include <sys/vnode.h> #include <sys/radioio.h> #include <sys/conf.h> #include <dev/radio_if.h> #include "ioconf.h" struct radio_softc { void *hw_hdl; /* hardware driver handle */ device_t sc_dev; /* hardware device struct */ const struct radio_hw_if *hw_if; /* hardware interface */ }; static int radioprobe(device_t, cfdata_t, void *); static void radioattach(device_t, device_t, void *); static int radioprint(void *, const char *); static int radiodetach(device_t, int); CFATTACH_DECL_NEW(radio, sizeof(struct radio_softc), radioprobe, radioattach, radiodetach, NULL); static dev_type_open(radioopen); static dev_type_close(radioclose); static dev_type_ioctl(radioioctl); const struct cdevsw radio_cdevsw = { .d_open = radioopen, .d_close = radioclose, .d_read = noread, .d_write = nowrite, .d_ioctl = radioioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER, }; static int radioprobe(device_t parent, cfdata_t match, void *aux) { return (1); } static void radioattach(device_t parent, device_t self, void *aux) { struct radio_softc *sc = device_private(self); struct radio_attach_args *sa = aux; const struct radio_hw_if *hwp = sa->hwif; void *hdlp = sa->hdl; aprint_naive("\n"); aprint_normal("\n"); sc->hw_if = hwp; sc->hw_hdl = hdlp; sc->sc_dev = self; } static int radioopen(dev_t dev, int flags, int fmt, struct lwp *l) { int unit; struct radio_softc *sc; unit = RADIOUNIT(dev); sc = device_lookup_private(&radio_cd, unit); if (sc == NULL || sc->hw_if == NULL) return (ENXIO); if (sc->hw_if->open != NULL) return (sc->hw_if->open(sc->hw_hdl, flags, fmt, l->l_proc)); else return (0); } static int radioclose(dev_t dev, int flags, int fmt, struct lwp *l) { struct radio_softc *sc; sc = device_lookup_private(&radio_cd, RADIOUNIT(dev)); if (sc->hw_if->close != NULL) return (sc->hw_if->close(sc->hw_hdl, flags, fmt, l->l_proc)); else return (0); } static int radioioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l) { struct radio_softc *sc; int unit, error; unit = RADIOUNIT(dev); sc = device_lookup_private(&radio_cd, unit); if (sc == NULL || sc->hw_if == NULL) return (ENXIO); error = EOPNOTSUPP; switch (cmd) { case RIOCGINFO: if (sc->hw_if->get_info) error = (sc->hw_if->get_info)(sc->hw_hdl, (struct radio_info *)data); break; case RIOCSINFO: if (sc->hw_if->set_info) error = (sc->hw_if->set_info)(sc->hw_hdl, (struct radio_info *)data); break; case RIOCSSRCH: if (sc->hw_if->search) error = (sc->hw_if->search)(sc->hw_hdl, *(int *)data); break; default: error = EINVAL; } return (error); } /* * Called from hardware driver. This is where the MI radio driver gets * probed/attached to the hardware driver */ device_t radio_attach_mi(const struct radio_hw_if *rhwp, void *hdlp, device_t dev) { struct radio_attach_args arg; arg.hwif = rhwp; arg.hdl = hdlp; return (config_found(dev, &arg, radioprint, CFARGS_NONE)); } static int radioprint(void *aux, const char *pnp) { if (pnp != NULL) aprint_normal("radio at %s", pnp); return (UNCONF); } static int radiodetach(device_t self, int flags) { int maj, mn; /* locate the major number */ maj = cdevsw_lookup_major(&radio_cdevsw); /* Nuke the vnodes for any open instances (calls close). 
*/ mn = device_unit(self); vdevgone(maj, mn, mn, VCHR); return (0); }
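/*
 * Illustrative sketch, not from the NetBSD tree: how a hypothetical tuner
 * driver ("xxradio") hooks into the MI layer above.  The radio_hw_if
 * members used here (get_info/set_info) match the calls made by
 * radioopen/radioioctl; every "xxradio" identifier is made up.
 */
static int xxradio_get_info(void *, struct radio_info *);
static int xxradio_set_info(void *, struct radio_info *);

static const struct radio_hw_if xxradio_hw_if = {
	.get_info = xxradio_get_info,
	.set_info = xxradio_set_info,
	/* open, close and search may stay NULL: radio.c checks for that */
};

static void
xxradio_attach(device_t parent, device_t self, void *aux)
{
	struct xxradio_softc *sc = device_private(self);

	/* ... map registers, reset the tuner ... */

	/* Creates the radio child device; radioattach() saves hw_if/hdl. */
	(void)radio_attach_mi(&xxradio_hw_if, sc, self);
}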
/* $NetBSD: subr_fault.c,v 1.2 2020/06/30 16:28:17 maxv Exp $ */ /* * Copyright (c) 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Maxime Villard. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_fault.c,v 1.2 2020/06/30 16:28:17 maxv Exp $"); #include <sys/module.h> #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/conf.h> #include <sys/types.h> #include <sys/specificdata.h> #include <sys/kmem.h> #include <sys/atomic.h> #include <sys/ioccom.h> #include <sys/lwp.h> #include <sys/fault.h> typedef struct { volatile bool enabled; volatile bool oneshot; volatile unsigned long nth; volatile unsigned long cnt; volatile unsigned long nfaults; } fault_t; static fault_t fault_global __cacheline_aligned = { .enabled = false, .oneshot = false, .nth = FAULT_NTH_MIN, .cnt = 0, .nfaults = 0 }; static kmutex_t fault_global_lock __cacheline_aligned; static specificdata_key_t fault_lwp_key; /* -------------------------------------------------------------------------- */ bool fault_inject(void) { volatile unsigned long cnt; fault_t *f; if (__predict_false(cold)) return false; if (__predict_false(atomic_load_acquire(&fault_global.enabled))) { f = &fault_global; } else { f = lwp_getspecific(fault_lwp_key); if (__predict_true(f == NULL)) return false; if (__predict_false(!f->enabled)) return false; } if (atomic_load_relaxed(&f->oneshot)) { if (__predict_true(atomic_load_relaxed(&f->nfaults) > 0)) return false; } cnt = atomic_inc_ulong_nv(&f->cnt); if (__predict_false(cnt % atomic_load_relaxed(&f->nth) == 0)) { atomic_inc_ulong(&f->nfaults); return true; } return false; } /* -------------------------------------------------------------------------- */ static int fault_open(dev_t dev, int flag, int mode, struct lwp *l) { return 0; } static int fault_close(dev_t dev, int flag, int mode, struct lwp *l) { return 0; } static int fault_ioc_enable(struct fault_ioc_enable *args) { fault_t *f; if (args->mode != FAULT_MODE_NTH_ONESHOT) return EINVAL; if (args->nth < FAULT_NTH_MIN) return EINVAL; switch (args->scope) { case FAULT_SCOPE_GLOBAL: mutex_enter(&fault_global_lock); if (fault_global.enabled) { mutex_exit(&fault_global_lock); return EEXIST; } fault_global.oneshot = true; atomic_store_relaxed(&fault_global.nth, args->nth); fault_global.cnt = 0; fault_global.nfaults = 0; atomic_store_release(&fault_global.enabled, true); mutex_exit(&fault_global_lock); break; case FAULT_SCOPE_LWP: f = lwp_getspecific(fault_lwp_key); if (f != NULL) { if (f->enabled) return EEXIST; } else { f = kmem_zalloc(sizeof(*f), KM_SLEEP); lwp_setspecific(fault_lwp_key, f); } f->oneshot = true; atomic_store_relaxed(&f->nth, args->nth); f->cnt = 0; f->nfaults = 0; atomic_store_release(&f->enabled, true); break; default: return EINVAL; } return 0; } static int fault_ioc_disable(struct fault_ioc_disable *args) { fault_t *f; switch (args->scope) { case FAULT_SCOPE_GLOBAL: mutex_enter(&fault_global_lock); if (!fault_global.enabled) { mutex_exit(&fault_global_lock); return ENOENT; } atomic_store_release(&fault_global.enabled, false); mutex_exit(&fault_global_lock); break; case FAULT_SCOPE_LWP: f = lwp_getspecific(fault_lwp_key); if (f == NULL) return ENOENT; if (!f->enabled) return ENOENT; atomic_store_release(&f->enabled, false); break; default: return EINVAL; } return 0; } static int fault_ioc_getinfo(struct fault_ioc_getinfo *args) { fault_t *f; switch (args->scope) { case FAULT_SCOPE_GLOBAL: args->nfaults = atomic_load_relaxed(&fault_global.nfaults); break; case FAULT_SCOPE_LWP: f = lwp_getspecific(fault_lwp_key); if (f == NULL) return ENOENT; args->nfaults = atomic_load_relaxed(&f->nfaults); break; default: return EINVAL; } return 0; } static int 
fault_ioctl(dev_t dev, u_long cmd, void *addr, int flag, struct lwp *l) { switch (cmd) { case FAULT_IOC_ENABLE: return fault_ioc_enable(addr); case FAULT_IOC_DISABLE: return fault_ioc_disable(addr); case FAULT_IOC_GETINFO: return fault_ioc_getinfo(addr); default: return EINVAL; } } const struct cdevsw fault_cdevsw = { .d_open = fault_open, .d_close = fault_close, .d_read = noread, .d_write = nowrite, .d_ioctl = fault_ioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE }; /* -------------------------------------------------------------------------- */ MODULE(MODULE_CLASS_MISC, fault, NULL); static void fault_lwp_free(void *arg) { fault_t *f = (fault_t *)arg; if (f == NULL) { return; } kmem_free(f, sizeof(*f)); } static void fault_init(void) { mutex_init(&fault_global_lock, MUTEX_DEFAULT, IPL_NONE); lwp_specific_key_create(&fault_lwp_key, fault_lwp_free); } static int fault_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: fault_init(); return 0; case MODULE_CMD_FINI: return EINVAL; default: return ENOTTY; } }
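/*
 * Illustrative userland sketch, not part of subr_fault.c: driving the
 * ioctls above so that the 5th fault_inject() call in the calling LWP
 * fails once (the driver forces one-shot mode).  The "/dev/fault" node
 * name and an installed <sys/fault.h> are assumptions; the FAULT_*
 * constants and fault_ioc_* fields come from the code above.
 */
#include <sys/ioctl.h>
#include <sys/fault.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
example_enable_fault_injection(void)
{
	struct fault_ioc_enable en = {
		.scope = FAULT_SCOPE_LWP,
		.mode = FAULT_MODE_NTH_ONESHOT,
		.nth = 5,			/* must be >= FAULT_NTH_MIN */
	};
	struct fault_ioc_getinfo info = { .scope = FAULT_SCOPE_LWP };
	int fd = open("/dev/fault", O_RDWR);

	if (fd == -1)
		return -1;
	if (ioctl(fd, FAULT_IOC_ENABLE, &en) == -1) {
		close(fd);
		return -1;
	}

	/* ... exercise the code path under test here ... */

	if (ioctl(fd, FAULT_IOC_GETINFO, &info) == 0)
		printf("faults injected: %lu\n", (unsigned long)info.nfaults);
	close(fd);
	return 0;
}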
/* $NetBSD: keysock.c,v 1.70 2019/06/12 22:23:50 christos Exp $ */ /* $FreeBSD: keysock.c,v 1.3.2.1 2003/01/24 05:11:36 sam Exp $ */ /* $KAME: keysock.c,v 1.25 2001/08/13 20:07:41 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: keysock.c,v 1.70 2019/06/12 22:23:50 christos Exp $"); /* This code has derived from sys/net/rtsock.c on FreeBSD2.2.5 */ #include <sys/types.h> #include <sys/param.h> #include <sys/domain.h> #include <sys/errno.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/signalvar.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/cpu.h> #include <sys/syslog.h> #include <net/raw_cb.h> #include <net/route.h> #include <net/pfkeyv2.h> #include <netipsec/key.h> #include <netipsec/keysock.h> #include <netipsec/key_debug.h> #include <netipsec/ipsec_private.h> struct key_cb { int key_count; int any_count; }; static struct key_cb key_cb; static struct sockaddr key_dst = { .sa_len = 2, .sa_family = PF_KEY, }; static struct sockaddr key_src = { .sa_len = 2, .sa_family = PF_KEY, }; static const struct protosw keysw[]; static int key_sendup0(struct rawcb *, struct mbuf *, int, int); int key_registered_sb_max = (2048 * MHLEN); /* XXX arbitrary */ static kmutex_t *key_so_mtx; static struct rawcbhead key_rawcb; void key_init_so(void) { key_so_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); } static void key_pr_init(void) { LIST_INIT(&key_rawcb); } /* * key_output() */ static int key_output(struct mbuf *m, struct socket *so) { struct sadb_msg *msg; int len, error = 0; int s; KASSERT(m != NULL); { uint64_t *ps = PFKEY_STAT_GETREF(); ps[PFKEY_STAT_OUT_TOTAL]++; ps[PFKEY_STAT_OUT_BYTES] += m->m_pkthdr.len; PFKEY_STAT_PUTREF(); } len = m->m_pkthdr.len; if (len < sizeof(struct sadb_msg)) { PFKEY_STATINC(PFKEY_STAT_OUT_TOOSHORT); error = EINVAL; goto end; } if (m->m_len < sizeof(struct sadb_msg)) { if ((m = m_pullup(m, sizeof(struct sadb_msg))) == 0) { PFKEY_STATINC(PFKEY_STAT_OUT_NOMEM); error = ENOBUFS; goto end; } } KASSERT((m->m_flags & M_PKTHDR) != 0); if (KEYDEBUG_ON(KEYDEBUG_KEY_DUMP)) kdebug_mbuf(__func__, m); msg = mtod(m, struct sadb_msg *); PFKEY_STATINC(PFKEY_STAT_OUT_MSGTYPE + msg->sadb_msg_type); if (len != PFKEY_UNUNIT64(msg->sadb_msg_len)) { PFKEY_STATINC(PFKEY_STAT_OUT_INVLEN); error = EINVAL; goto end; } /*XXX giant lock*/ s = splsoftnet(); error = key_parse(m, so); m = NULL; splx(s); end: if (m) m_freem(m); return error; } /* * send message to the socket. 
*/ static int key_sendup0( struct rawcb *rp, struct mbuf *m, int promisc, int sbprio ) { int error; int ok; if (promisc) { struct sadb_msg *pmsg; M_PREPEND(m, sizeof(struct sadb_msg), M_DONTWAIT); if (m && m->m_len < sizeof(struct sadb_msg)) m = m_pullup(m, sizeof(struct sadb_msg)); if (!m) { PFKEY_STATINC(PFKEY_STAT_IN_NOMEM); return ENOBUFS; } m->m_pkthdr.len += sizeof(*pmsg); pmsg = mtod(m, struct sadb_msg *); memset(pmsg, 0, sizeof(*pmsg)); pmsg->sadb_msg_version = PF_KEY_V2; pmsg->sadb_msg_type = SADB_X_PROMISC; pmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len); /* pid and seq? */ PFKEY_STATINC(PFKEY_STAT_IN_MSGTYPE + pmsg->sadb_msg_type); } if (sbprio == 0) ok = sbappendaddr(&rp->rcb_socket->so_rcv, (struct sockaddr *)&key_src, m, NULL); else ok = sbappendaddrchain(&rp->rcb_socket->so_rcv, (struct sockaddr *)&key_src, m, sbprio); if (!ok) { log(LOG_WARNING, "%s: couldn't send PF_KEY message to the socket\n", __func__); PFKEY_STATINC(PFKEY_STAT_IN_NOMEM); m_freem(m); /* Don't call soroverflow because we're returning this * error directly to the sender. */ rp->rcb_socket->so_rcv.sb_overflowed++; error = ENOBUFS; } else { sorwakeup(rp->rcb_socket); error = 0; } return error; } /* so can be NULL if target != KEY_SENDUP_ONE */ static int _key_sendup_mbuf(struct socket *so, struct mbuf *m, int target/*, sbprio */) { struct mbuf *n; struct keycb *kp; int sendup; struct rawcb *rp; int error = 0; int sbprio = 0; /* XXX should be a parameter */ KASSERT(m != NULL); KASSERT(so != NULL || target != KEY_SENDUP_ONE); /* * RFC 2367 says ACQUIRE and other kernel-generated messages * are special. We treat all KEY_SENDUP_REGISTERED messages * as special, delivering them to all registered sockets * even if the socket is at or above its so->so_rcv.sb_max limits. * The only constraint is that the so_rcv data fall below * key_registered_sb_max. * Doing that check here avoids reworking every key_sendup_mbuf() * in the short term. . The rework will be done after a technical * conensus that this approach is appropriate. */ if (target == KEY_SENDUP_REGISTERED) { sbprio = SB_PRIO_BESTEFFORT; } { uint64_t *ps = PFKEY_STAT_GETREF(); ps[PFKEY_STAT_IN_TOTAL]++; ps[PFKEY_STAT_IN_BYTES] += m->m_pkthdr.len; PFKEY_STAT_PUTREF(); } if (m->m_len < sizeof(struct sadb_msg)) { #if 1 m = m_pullup(m, sizeof(struct sadb_msg)); if (m == NULL) { PFKEY_STATINC(PFKEY_STAT_IN_NOMEM); return ENOBUFS; } #else /* don't bother pulling it up just for stats */ #endif } if (m->m_len >= sizeof(struct sadb_msg)) { struct sadb_msg *msg; msg = mtod(m, struct sadb_msg *); PFKEY_STATINC(PFKEY_STAT_IN_MSGTYPE + msg->sadb_msg_type); } LIST_FOREACH(rp, &key_rawcb, rcb_list) { struct socket * kso = rp->rcb_socket; if (rp->rcb_proto.sp_family != PF_KEY) continue; if (rp->rcb_proto.sp_protocol && rp->rcb_proto.sp_protocol != PF_KEY_V2) { continue; } kp = (struct keycb *)rp; /* * If you are in promiscuous mode, and when you get broadcasted * reply, you'll get two PF_KEY messages. 
* (based on pf_key@inner.net message on 14 Oct 1998) */ if (((struct keycb *)rp)->kp_promisc) { if ((n = m_copym(m, 0, (int)M_COPYALL, M_DONTWAIT)) != NULL) { (void)key_sendup0(rp, n, 1, 0); n = NULL; } } /* the exact target will be processed later */ if (so && sotorawcb(so) == rp) continue; sendup = 0; switch (target) { case KEY_SENDUP_ONE: /* the statement has no effect */ if (so && sotorawcb(so) == rp) sendup++; break; case KEY_SENDUP_ALL: sendup++; break; case KEY_SENDUP_REGISTERED: if (kp->kp_registered) { if (kso->so_rcv.sb_cc <= key_registered_sb_max) sendup++; else printf("keysock: " "registered sendup dropped, " "sb_cc %ld max %d\n", kso->so_rcv.sb_cc, key_registered_sb_max); } break; } PFKEY_STATINC(PFKEY_STAT_IN_MSGTARGET + target); if (!sendup) continue; if ((n = m_copym(m, 0, (int)M_COPYALL, M_DONTWAIT)) == NULL) { m_freem(m); PFKEY_STATINC(PFKEY_STAT_IN_NOMEM); return ENOBUFS; } if ((error = key_sendup0(rp, n, 0, 0)) != 0) { m_freem(m); return error; } n = NULL; } /* The 'later' time for processing the exact target has arrived */ if (so) { error = key_sendup0(sotorawcb(so), m, 0, sbprio); m = NULL; } else { error = 0; m_freem(m); } return error; } int key_sendup_mbuf(struct socket *so, struct mbuf *m, int target/*, sbprio */) { int error; if (so == NULL) mutex_enter(key_so_mtx); else KASSERT(solocked(so)); error = _key_sendup_mbuf(so, m, target); if (so == NULL) mutex_exit(key_so_mtx); return error; } static int key_attach(struct socket *so, int proto) { struct keycb *kp; int s, error; KASSERT(sotorawcb(so) == NULL); kp = kmem_zalloc(sizeof(*kp), KM_SLEEP); kp->kp_raw.rcb_len = sizeof(*kp); so->so_pcb = kp; s = splsoftnet(); if (so->so_lock != key_so_mtx) { KASSERT(so->so_lock == NULL); mutex_obj_hold(key_so_mtx); so->so_lock = key_so_mtx; solock(so); } error = raw_attach(so, proto, &key_rawcb); if (error) { PFKEY_STATINC(PFKEY_STAT_SOCKERR); kmem_free(kp, sizeof(*kp)); so->so_pcb = NULL; goto out; } kp->kp_promisc = kp->kp_registered = 0; if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ key_cb.key_count++; key_cb.any_count++; kp->kp_raw.rcb_laddr = &key_src; kp->kp_raw.rcb_faddr = &key_dst; soisconnected(so); so->so_options |= SO_USELOOPBACK; out: KASSERT(solocked(so)); splx(s); return error; } static void key_detach(struct socket *so) { struct keycb *kp = (struct keycb *)sotorawcb(so); int s; KASSERT(!cpu_softintr_p()); KASSERT(solocked(so)); KASSERT(kp != NULL); s = splsoftnet(); if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ key_cb.key_count--; key_cb.any_count--; key_freereg(so); raw_detach(so); splx(s); } static int key_accept(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); panic("%s: unsupported", __func__); return EOPNOTSUPP; } static int key_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int key_listen(struct socket *so, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int key_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int key_connect2(struct socket *so, struct socket *so2) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int key_disconnect(struct socket *so) { struct rawcb *rp = sotorawcb(so); int s; KASSERT(solocked(so)); KASSERT(rp != NULL); s = splsoftnet(); soisdisconnected(so); raw_disconnect(rp); splx(s); return 0; } static int key_shutdown(struct socket *so) { int s; KASSERT(solocked(so)); /* * Mark the connection as being incapable of further 
input. */ s = splsoftnet(); socantsendmore(so); splx(s); return 0; } static int key_abort(struct socket *so) { KASSERT(solocked(so)); panic("%s: unsupported", __func__); return EOPNOTSUPP; } static int key_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp) { return EOPNOTSUPP; } static int key_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); return 0; } static int key_peeraddr(struct socket *so, struct sockaddr *nam) { struct rawcb *rp = sotorawcb(so); KASSERT(solocked(so)); KASSERT(rp != NULL); KASSERT(nam != NULL); if (rp->rcb_faddr == NULL) return ENOTCONN; raw_setpeeraddr(rp, nam); return 0; } static int key_sockaddr(struct socket *so, struct sockaddr *nam) { struct rawcb *rp = sotorawcb(so); KASSERT(solocked(so)); KASSERT(rp != NULL); KASSERT(nam != NULL); if (rp->rcb_faddr == NULL) return ENOTCONN; raw_setsockaddr(rp, nam); return 0; } static int key_rcvd(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int key_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int key_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { int error = 0; int s; KASSERT(solocked(so)); KASSERT(so->so_proto == &keysw[0]); s = splsoftnet(); error = raw_send(so, m, nam, control, l, &key_output); splx(s); return error; } static int key_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int key_purgeif(struct socket *so, struct ifnet *ifa) { panic("%s: unsupported", __func__); return EOPNOTSUPP; } /* * Definitions of protocols supported in the KEY domain. */ DOMAIN_DEFINE(keydomain); PR_WRAP_USRREQS(key) #define key_attach key_attach_wrapper #define key_detach key_detach_wrapper #define key_accept key_accept_wrapper #define key_bind key_bind_wrapper #define key_listen key_listen_wrapper #define key_connect key_connect_wrapper #define key_connect2 key_connect2_wrapper #define key_disconnect key_disconnect_wrapper #define key_shutdown key_shutdown_wrapper #define key_abort key_abort_wrapper #define key_ioctl key_ioctl_wrapper #define key_stat key_stat_wrapper #define key_peeraddr key_peeraddr_wrapper #define key_sockaddr key_sockaddr_wrapper #define key_rcvd key_rcvd_wrapper #define key_recvoob key_recvoob_wrapper #define key_send key_send_wrapper #define key_sendoob key_sendoob_wrapper #define key_purgeif key_purgeif_wrapper static const struct pr_usrreqs key_usrreqs = { .pr_attach = key_attach, .pr_detach = key_detach, .pr_accept = key_accept, .pr_bind = key_bind, .pr_listen = key_listen, .pr_connect = key_connect, .pr_connect2 = key_connect2, .pr_disconnect = key_disconnect, .pr_shutdown = key_shutdown, .pr_abort = key_abort, .pr_ioctl = key_ioctl, .pr_stat = key_stat, .pr_peeraddr = key_peeraddr, .pr_sockaddr = key_sockaddr, .pr_rcvd = key_rcvd, .pr_recvoob = key_recvoob, .pr_send = key_send, .pr_sendoob = key_sendoob, .pr_purgeif = key_purgeif, }; static const struct protosw keysw[] = { { .pr_type = SOCK_RAW, .pr_domain = &keydomain, .pr_protocol = PF_KEY_V2, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctlinput = raw_ctlinput, .pr_usrreqs = &key_usrreqs, .pr_init = key_pr_init, } }; struct domain keydomain = { .dom_family = PF_KEY, .dom_name = "key", .dom_init = key_init, .dom_protosw = keysw, .dom_protoswNPROTOSW = &keysw[__arraycount(keysw)], };
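/*
 * Illustrative userland sketch, not part of keysock.c: the kind of
 * RFC 2367 client served by the keysw[]/keydomain definitions above.
 * It opens a PF_KEY socket and writes a bare SADB_DUMP header, which
 * key_output() accounts in the PFKEY_STAT_OUT_* counters.  sadb_msg_len
 * is expressed in 64-bit units; the "example_" name is made up.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <net/pfkeyv2.h>
#include <string.h>
#include <unistd.h>

int
example_pfkey_dump(void)
{
	struct sadb_msg msg;
	int s = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);

	if (s == -1)
		return -1;

	memset(&msg, 0, sizeof(msg));
	msg.sadb_msg_version = PF_KEY_V2;
	msg.sadb_msg_type = SADB_DUMP;
	msg.sadb_msg_satype = SADB_SATYPE_UNSPEC;
	msg.sadb_msg_len = sizeof(msg) / 8;	/* 64-bit units */
	msg.sadb_msg_pid = getpid();

	if (write(s, &msg, sizeof(msg)) != (ssize_t)sizeof(msg)) {
		close(s);
		return -1;
	}
	close(s);
	return 0;
}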
/* $NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/kernel.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/kmem.h> #include <sys/kauth.h> #include <sys/fstrans.h> #include <sys/buf.h> #include <sys/atomic.h> #include <miscfs/genfs/genfs.h> #include <miscfs/genfs/genfs_node.h> #include <miscfs/specfs/specdev.h> #include <uvm/uvm.h> #include <uvm/uvm_pager.h> #include <uvm/uvm_page_array.h> static int genfs_do_directio(struct vmspace *, vaddr_t, size_t, struct vnode *, off_t, enum uio_rw); static void genfs_dio_iodone(struct buf *); static int genfs_getpages_read(struct vnode *, struct vm_page **, int, off_t, off_t, bool, bool, bool, bool); static int genfs_do_io(struct vnode *, off_t, vaddr_t, size_t, int, enum uio_rw, void (*)(struct buf *)); static void genfs_rel_pages(struct vm_page **, unsigned int); int genfs_maxdio = MAXPHYS; static void genfs_rel_pages(struct vm_page **pgs, unsigned int npages) { unsigned int i; for (i = 0; i < npages; i++) { struct vm_page *pg = pgs[i]; if (pg == NULL || pg == PGO_DONTCARE) continue; KASSERT(uvm_page_owner_locked_p(pg, true)); if (pg->flags & PG_FAKE) { pg->flags |= PG_RELEASED; } } uvm_page_unbusy(pgs, npages); } /* * generic VM getpages routine. * Return PG_BUSY pages for the given range, * reading from backing store if necessary. */ int genfs_getpages(void *v) { struct vop_getpages_args /* { struct vnode *a_vp; voff_t a_offset; struct vm_page **a_m; int *a_count; int a_centeridx; vm_prot_t a_access_type; int a_advice; int a_flags; } */ * const ap = v; off_t diskeof, memeof; int i, error, npages, iflag; const int flags = ap->a_flags; struct vnode * const vp = ap->a_vp; struct uvm_object * const uobj = &vp->v_uobj; const bool async = (flags & PGO_SYNCIO) == 0; const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0; const bool overwrite = (flags & PGO_OVERWRITE) != 0; const bool blockalloc = memwrite && (flags & PGO_NOBLOCKALLOC) == 0; const bool need_wapbl = (vp->v_mount->mnt_wapbl && (flags & PGO_JOURNALLOCKED) == 0); const bool glocked = (flags & PGO_GLOCKHELD) != 0; bool holds_wapbl = false; struct mount *trans_mount = NULL; UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist); UVMHIST_LOG(ubchist, "vp %#jx off 0x%jx/%jx count %jd", (uintptr_t)vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count); KASSERT(memwrite >= overwrite); KASSERT(vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VLNK || vp->v_type == VBLK); /* * the object must be locked. it can only be a read lock when * processing a read fault with PGO_LOCKED. */ KASSERT(rw_lock_held(uobj->vmobjlock)); KASSERT(rw_write_held(uobj->vmobjlock) || ((flags & PGO_LOCKED) != 0 && !memwrite)); #ifdef DIAGNOSTIC if ((flags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl) WAPBL_JLOCK_ASSERT(vp->v_mount); #endif /* * check for reclaimed vnode. 
v_interlock is not held here, but * VI_DEADCHECK is set with vmobjlock held. */ iflag = atomic_load_relaxed(&vp->v_iflag); if (__predict_false((iflag & VI_DEADCHECK) != 0)) { mutex_enter(vp->v_interlock); error = vdead_check(vp, VDEAD_NOWAIT); mutex_exit(vp->v_interlock); if (error) { if ((flags & PGO_LOCKED) == 0) rw_exit(uobj->vmobjlock); return error; } } startover: error = 0; const voff_t origvsize = vp->v_size; const off_t origoffset = ap->a_offset; const int orignpages = *ap->a_count; GOP_SIZE(vp, origvsize, &diskeof, 0); if (flags & PGO_PASTEOF) { off_t newsize; #if defined(DIAGNOSTIC) off_t writeeof; #endif /* defined(DIAGNOSTIC) */ newsize = MAX(origvsize, origoffset + (orignpages << PAGE_SHIFT)); GOP_SIZE(vp, newsize, &memeof, GOP_SIZE_MEM); #if defined(DIAGNOSTIC) GOP_SIZE(vp, vp->v_writesize, &writeeof, GOP_SIZE_MEM); if (newsize > round_page(writeeof)) { panic("%s: past eof: %" PRId64 " vs. %" PRId64, __func__, newsize, round_page(writeeof)); } #endif /* defined(DIAGNOSTIC) */ } else { GOP_SIZE(vp, origvsize, &memeof, GOP_SIZE_MEM); } KASSERT(ap->a_centeridx >= 0 || ap->a_centeridx <= orignpages); KASSERT((origoffset & (PAGE_SIZE - 1)) == 0); KASSERT(origoffset >= 0); KASSERT(orignpages > 0); /* * Bounds-check the request. */ if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) { if ((flags & PGO_LOCKED) == 0) { rw_exit(uobj->vmobjlock); } UVMHIST_LOG(ubchist, "off 0x%jx count %jd goes past EOF 0x%jx", origoffset, *ap->a_count, memeof,0); error = EINVAL; goto out_err; } /* uobj is locked */ if ((flags & PGO_NOTIMESTAMP) == 0 && (vp->v_type != VBLK || (vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) { int updflags = 0; if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) { updflags = GOP_UPDATE_ACCESSED; } if (memwrite) { updflags |= GOP_UPDATE_MODIFIED; } if (updflags != 0) { GOP_MARKUPDATE(vp, updflags); } } /* * For PGO_LOCKED requests, just return whatever's in memory. */ if (flags & PGO_LOCKED) { int nfound; struct vm_page *pg; KASSERT(!glocked); npages = *ap->a_count; #if defined(DEBUG) for (i = 0; i < npages; i++) { pg = ap->a_m[i]; KASSERT(pg == NULL || pg == PGO_DONTCARE); } #endif /* defined(DEBUG) */ nfound = uvn_findpages(uobj, origoffset, &npages, ap->a_m, NULL, UFP_NOWAIT | UFP_NOALLOC | UFP_NOBUSY | (memwrite ? UFP_NORDONLY : 0)); KASSERT(npages == *ap->a_count); if (nfound == 0) { error = EBUSY; goto out_err; } /* * lock and unlock g_glock to ensure that no one is truncating * the file behind us. */ if (!genfs_node_rdtrylock(vp)) { /* * restore the array. */ for (i = 0; i < npages; i++) { pg = ap->a_m[i]; if (pg != NULL && pg != PGO_DONTCARE) { ap->a_m[i] = NULL; } KASSERT(ap->a_m[i] == NULL || ap->a_m[i] == PGO_DONTCARE); } } else { genfs_node_unlock(vp); } error = (ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0); if (error == 0 && memwrite) { for (i = 0; i < npages; i++) { pg = ap->a_m[i]; if (pg == NULL || pg == PGO_DONTCARE) { continue; } if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) { uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN); } } } goto out_err; } rw_exit(uobj->vmobjlock); /* * find the requested pages and make some simple checks. * leave space in the page array for a whole block. */ const int fs_bshift = (vp->v_type != VBLK) ? 
vp->v_mount->mnt_fs_bshift : DEV_BSHIFT; const int fs_bsize = 1 << fs_bshift; #define blk_mask (fs_bsize - 1) #define trunc_blk(x) ((x) & ~blk_mask) #define round_blk(x) (((x) + blk_mask) & ~blk_mask) const int orignmempages = MIN(orignpages, round_page(memeof - origoffset) >> PAGE_SHIFT); npages = orignmempages; const off_t startoffset = trunc_blk(origoffset); const off_t endoffset = MIN( round_page(round_blk(origoffset + (npages << PAGE_SHIFT))), round_page(memeof)); const int ridx = (origoffset - startoffset) >> PAGE_SHIFT; const int pgs_size = sizeof(struct vm_page *) * ((endoffset - startoffset) >> PAGE_SHIFT); struct vm_page **pgs, *pgs_onstack[UBC_MAX_PAGES]; if (pgs_size > sizeof(pgs_onstack)) { pgs = kmem_zalloc(pgs_size, async ? KM_NOSLEEP : KM_SLEEP); if (pgs == NULL) { pgs = pgs_onstack; error = ENOMEM; goto out_err; } } else { pgs = pgs_onstack; (void)memset(pgs, 0, pgs_size); } UVMHIST_LOG(ubchist, "ridx %jd npages %jd startoff %#jx endoff %#jx", ridx, npages, startoffset, endoffset); if (trans_mount == NULL) { trans_mount = vp->v_mount; fstrans_start(trans_mount); /* * check if this vnode is still valid. */ mutex_enter(vp->v_interlock); error = vdead_check(vp, 0); mutex_exit(vp->v_interlock); if (error) goto out_err_free; /* * XXX: This assumes that we come here only via * the mmio path */ if (blockalloc && need_wapbl) { error = WAPBL_BEGIN(trans_mount); if (error) goto out_err_free; holds_wapbl = true; } } /* * hold g_glock to prevent a race with truncate. * * check if our idea of v_size is still valid. */ KASSERT(!glocked || genfs_node_wrlocked(vp)); if (!glocked) { if (blockalloc) { genfs_node_wrlock(vp); } else { genfs_node_rdlock(vp); } } rw_enter(uobj->vmobjlock, RW_WRITER); if (vp->v_size < origvsize) { if (!glocked) { genfs_node_unlock(vp); } if (pgs != pgs_onstack) kmem_free(pgs, pgs_size); goto startover; } if (uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], NULL, async ? UFP_NOWAIT : UFP_ALL) != orignmempages) { if (!glocked) { genfs_node_unlock(vp); } KASSERT(async != 0); genfs_rel_pages(&pgs[ridx], orignmempages); rw_exit(uobj->vmobjlock); error = EBUSY; goto out_err_free; } /* * if PGO_OVERWRITE is set, don't bother reading the pages. */ if (overwrite) { if (!glocked) { genfs_node_unlock(vp); } UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0); for (i = 0; i < npages; i++) { struct vm_page *pg = pgs[ridx + i]; /* * it's caller's responsibility to allocate blocks * beforehand for the overwrite case. */ KASSERT((pg->flags & PG_RDONLY) == 0 || !blockalloc); pg->flags &= ~PG_RDONLY; /* * mark the page DIRTY. * otherwise another thread can do putpages and pull * our vnode from syncer's queue before our caller does * ubc_release. note that putpages won't see CLEAN * pages even if they are BUSY. */ uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); } npages += ridx; goto out; } /* * if the pages are already resident, just return them. */ for (i = 0; i < npages; i++) { struct vm_page *pg = pgs[ridx + i]; if ((pg->flags & PG_FAKE) || (blockalloc && (pg->flags & PG_RDONLY) != 0)) { break; } } if (i == npages) { if (!glocked) { genfs_node_unlock(vp); } UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0); npages += ridx; goto out; } /* * the page wasn't resident and we're not overwriting, * so we're going to have to do some i/o. * find any additional pages needed to cover the expanded range. 
*/ npages = (endoffset - startoffset) >> PAGE_SHIFT; if (startoffset != origoffset || npages != orignmempages) { int npgs; /* * we need to avoid deadlocks caused by locking * additional pages at lower offsets than pages we * already have locked. unlock them all and start over. */ genfs_rel_pages(&pgs[ridx], orignmempages); memset(pgs, 0, pgs_size); UVMHIST_LOG(ubchist, "reset npages start 0x%jx end 0x%jx", startoffset, endoffset, 0,0); npgs = npages; if (uvn_findpages(uobj, startoffset, &npgs, pgs, NULL, async ? UFP_NOWAIT : UFP_ALL) != npages) { if (!glocked) { genfs_node_unlock(vp); } KASSERT(async != 0); genfs_rel_pages(pgs, npages); rw_exit(uobj->vmobjlock); error = EBUSY; goto out_err_free; } } rw_exit(uobj->vmobjlock); error = genfs_getpages_read(vp, pgs, npages, startoffset, diskeof, async, memwrite, blockalloc, glocked); if (!glocked) { genfs_node_unlock(vp); } if (error == 0 && async) goto out_err_free; rw_enter(uobj->vmobjlock, RW_WRITER); /* * we're almost done! release the pages... * for errors, we free the pages. * otherwise we activate them and mark them as valid and clean. * also, unbusy pages that were not actually requested. */ if (error) { genfs_rel_pages(pgs, npages); rw_exit(uobj->vmobjlock); UVMHIST_LOG(ubchist, "returning error %jd", error,0,0,0); goto out_err_free; } out: UVMHIST_LOG(ubchist, "succeeding, npages %jd", npages,0,0,0); error = 0; for (i = 0; i < npages; i++) { struct vm_page *pg = pgs[i]; if (pg == NULL) { continue; } UVMHIST_LOG(ubchist, "examining pg %#jx flags 0x%jx", (uintptr_t)pg, pg->flags, 0,0); if (pg->flags & PG_FAKE && !overwrite) { /* * we've read page's contents from the backing storage. * * for a read fault, we keep them CLEAN; if we * encountered a hole while reading, the pages can * already been dirtied with zeros. */ KASSERTMSG(blockalloc || uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN, "page %p not clean", pg); pg->flags &= ~PG_FAKE; } KASSERT(!memwrite || !blockalloc || (pg->flags & PG_RDONLY) == 0); if (i < ridx || i >= ridx + orignmempages || async) { UVMHIST_LOG(ubchist, "unbusy pg %#jx offset 0x%jx", (uintptr_t)pg, pg->offset,0,0); if (pg->flags & PG_FAKE) { KASSERT(overwrite); uvm_pagezero(pg); } if (pg->flags & PG_RELEASED) { uvm_pagefree(pg); continue; } uvm_pagelock(pg); uvm_pageenqueue(pg); uvm_pagewakeup(pg); uvm_pageunlock(pg); pg->flags &= ~(PG_BUSY|PG_FAKE); UVM_PAGE_OWN(pg, NULL); } else if (memwrite && !overwrite && uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) { /* * for a write fault, start dirtiness tracking of * requested pages. */ uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN); } } rw_exit(uobj->vmobjlock); if (ap->a_m != NULL) { memcpy(ap->a_m, &pgs[ridx], orignmempages * sizeof(struct vm_page *)); } out_err_free: if (pgs != NULL && pgs != pgs_onstack) kmem_free(pgs, pgs_size); out_err: if (trans_mount != NULL) { if (holds_wapbl) WAPBL_END(trans_mount); fstrans_done(trans_mount); } return error; } /* * genfs_getpages_read: Read the pages in with VOP_BMAP/VOP_STRATEGY. * * "glocked" (which is currently not actually used) tells us not whether * the genfs_node is locked on entry (it always is) but whether it was * locked on entry to genfs_getpages. */ static int genfs_getpages_read(struct vnode *vp, struct vm_page **pgs, int npages, off_t startoffset, off_t diskeof, bool async, bool memwrite, bool blockalloc, bool glocked) { struct uvm_object * const uobj = &vp->v_uobj; const int fs_bshift = (vp->v_type != VBLK) ? vp->v_mount->mnt_fs_bshift : DEV_BSHIFT; const int dev_bshift = (vp->v_type != VBLK) ? 
vp->v_mount->mnt_dev_bshift : DEV_BSHIFT; kauth_cred_t const cred = curlwp->l_cred; /* XXXUBC curlwp */ size_t bytes, iobytes, tailstart, tailbytes, totalbytes, skipbytes; vaddr_t kva; struct buf *bp, *mbp; bool sawhole = false; int i; int error = 0; UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); /* * read the desired page(s). */ totalbytes = npages << PAGE_SHIFT; bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0)); tailbytes = totalbytes - bytes; skipbytes = 0; kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_READ | (async ? 0 : UVMPAGER_MAPIN_WAITOK)); if (kva == 0) return EBUSY; mbp = getiobuf(vp, true); mbp->b_bufsize = totalbytes; mbp->b_data = (void *)kva; mbp->b_resid = mbp->b_bcount = bytes; mbp->b_cflags |= BC_BUSY; if (async) { mbp->b_flags = B_READ | B_ASYNC; mbp->b_iodone = uvm_aio_aiodone; } else { mbp->b_flags = B_READ; mbp->b_iodone = NULL; } if (async) BIO_SETPRIO(mbp, BPRIO_TIMELIMITED); else BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL); /* * if EOF is in the middle of the range, zero the part past EOF. * skip over pages which are not PG_FAKE since in that case they have * valid data that we need to preserve. */ tailstart = bytes; while (tailbytes > 0) { const int len = PAGE_SIZE - (tailstart & PAGE_MASK); KASSERT(len <= tailbytes); if ((pgs[tailstart >> PAGE_SHIFT]->flags & PG_FAKE) != 0) { memset((void *)(kva + tailstart), 0, len); UVMHIST_LOG(ubchist, "tailbytes %#jx 0x%jx 0x%jx", (uintptr_t)kva, tailstart, len, 0); } tailstart += len; tailbytes -= len; } /* * now loop over the pages, reading as needed. */ bp = NULL; off_t offset; for (offset = startoffset; bytes > 0; offset += iobytes, bytes -= iobytes) { int run; daddr_t lbn, blkno; int pidx; struct vnode *devvp; /* * skip pages which don't need to be read. */ pidx = (offset - startoffset) >> PAGE_SHIFT; while ((pgs[pidx]->flags & PG_FAKE) == 0) { size_t b; KASSERT((offset & (PAGE_SIZE - 1)) == 0); if ((pgs[pidx]->flags & PG_RDONLY)) { sawhole = true; } b = MIN(PAGE_SIZE, bytes); offset += b; bytes -= b; skipbytes += b; pidx++; UVMHIST_LOG(ubchist, "skipping, new offset 0x%jx", offset, 0,0,0); if (bytes == 0) { goto loopdone; } } /* * bmap the file to find out the blkno to read from and * how much we can read in one i/o. if bmap returns an error, * skip the rest of the top-level i/o. */ lbn = offset >> fs_bshift; error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run); if (error) { UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd", lbn,error,0,0); skipbytes += bytes; bytes = 0; goto loopdone; } /* * see how many pages can be read with this i/o. * reduce the i/o size if necessary to avoid * overwriting pages with valid data. */ iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, bytes); if (offset + iobytes > round_page(offset)) { int pcount; pcount = 1; while (pidx + pcount < npages && pgs[pidx + pcount]->flags & PG_FAKE) { pcount++; } iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) - (offset - trunc_page(offset))); } /* * if this block isn't allocated, zero it instead of * reading it. unless we are going to allocate blocks, * mark the pages we zeroed PG_RDONLY. 
*/ if (blkno == (daddr_t)-1) { int holepages = (round_page(offset + iobytes) - trunc_page(offset)) >> PAGE_SHIFT; UVMHIST_LOG(ubchist, "lbn 0x%jx -> HOLE", lbn,0,0,0); sawhole = true; memset((char *)kva + (offset - startoffset), 0, iobytes); skipbytes += iobytes; if (!blockalloc) { rw_enter(uobj->vmobjlock, RW_WRITER); for (i = 0; i < holepages; i++) { pgs[pidx + i]->flags |= PG_RDONLY; } rw_exit(uobj->vmobjlock); } continue; } /* * allocate a sub-buf for this piece of the i/o * (or just use mbp if there's only 1 piece), * and start it going. */ if (offset == startoffset && iobytes == bytes) { bp = mbp; } else { UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd", (uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0); bp = getiobuf(vp, true); nestiobuf_setup(mbp, bp, offset - startoffset, iobytes); } bp->b_lblkno = 0; /* adjust physical blkno for partial blocks */ bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >> dev_bshift); UVMHIST_LOG(ubchist, "bp %#jx offset 0x%x bcount 0x%x blkno 0x%x", (uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno); VOP_STRATEGY(devvp, bp); } loopdone: nestiobuf_done(mbp, skipbytes, error); if (async) { UVMHIST_LOG(ubchist, "returning 0 (async)",0,0,0,0); return 0; } if (bp != NULL) { error = biowait(mbp); } /* Remove the mapping (make KVA available as soon as possible) */ uvm_pagermapout(kva, npages); /* * if this we encountered a hole then we have to do a little more work. * for read faults, we marked the page PG_RDONLY so that future * write accesses to the page will fault again. * for write faults, we must make sure that the backing store for * the page is completely allocated while the pages are locked. */ if (!error && sawhole && blockalloc) { error = GOP_ALLOC(vp, startoffset, npages << PAGE_SHIFT, 0, cred); UVMHIST_LOG(ubchist, "gop_alloc off 0x%jx/0x%jx -> %jd", startoffset, npages << PAGE_SHIFT, error,0); if (!error) { rw_enter(uobj->vmobjlock, RW_WRITER); for (i = 0; i < npages; i++) { struct vm_page *pg = pgs[i]; if (pg == NULL) { continue; } pg->flags &= ~PG_RDONLY; uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); UVMHIST_LOG(ubchist, "mark dirty pg %#jx", (uintptr_t)pg, 0, 0, 0); } rw_exit(uobj->vmobjlock); } } putiobuf(mbp); return error; } /* * generic VM putpages routine. * Write the given range of pages to backing store. * * => "offhi == 0" means flush all pages at or after "offlo". * => object should be locked by caller. we return with the * object unlocked. * => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O). * thus, a caller might want to unlock higher level resources * (e.g. vm_map) before calling flush. * => if neither PGO_CLEANIT nor PGO_SYNCIO is set, we will not block * => if PGO_ALLPAGES is set, then all pages in the object will be processed. * * note on "cleaning" object and PG_BUSY pages: * this routine is holding the lock on the object. the only time * that it can run into a PG_BUSY page that it does not own is if * some other process has started I/O on the page (e.g. either * a pagein, or a pageout). if the PG_BUSY page is being paged * in, then it can not be dirty (!UVM_PAGE_STATUS_CLEAN) because no * one has had a chance to modify it yet. if the PG_BUSY page is * being paged out then it means that someone else has already started * cleaning the page for us (how nice!). in this case, if we * have syncio specified, then after we make our pass through the * object we need to wait for the other PG_BUSY pages to clear * off (i.e. we need to do an iosync). 
also note that once a * page is PG_BUSY it must stay in its object until it is un-busyed. */ int genfs_putpages(void *v) { struct vop_putpages_args /* { struct vnode *a_vp; voff_t a_offlo; voff_t a_offhi; int a_flags; } */ * const ap = v; return genfs_do_putpages(ap->a_vp, ap->a_offlo, ap->a_offhi, ap->a_flags, NULL); } int genfs_do_putpages(struct vnode *vp, off_t startoff, off_t endoff, int origflags, struct vm_page **busypg) { struct uvm_object * const uobj = &vp->v_uobj; krwlock_t * const slock = uobj->vmobjlock; off_t nextoff; int i, error, npages, nback; int freeflag; /* * This array is larger than it should so that it's size is constant. * The right size is MAXPAGES. */ struct vm_page *pgs[MAXPHYS / MIN_PAGE_SIZE]; #define MAXPAGES (MAXPHYS / PAGE_SIZE) struct vm_page *pg, *tpg; struct uvm_page_array a; bool wasclean, needs_clean; bool async = (origflags & PGO_SYNCIO) == 0; bool pagedaemon = curlwp == uvm.pagedaemon_lwp; struct mount *trans_mp; int flags; bool modified; /* if we write out any pages */ bool holds_wapbl; bool cleanall; /* try to pull off from the syncer's list */ bool onworklst; bool nodirty; const bool dirtyonly = (origflags & (PGO_DEACTIVATE|PGO_FREE)) == 0; UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist); KASSERT(origflags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)); KASSERT((startoff & PAGE_MASK) == 0); KASSERT((endoff & PAGE_MASK) == 0); KASSERT(startoff < endoff || endoff == 0); KASSERT(rw_write_held(slock)); UVMHIST_LOG(ubchist, "vp %#jx pages %jd off 0x%jx len 0x%jx", (uintptr_t)vp, uobj->uo_npages, startoff, endoff - startoff); #ifdef DIAGNOSTIC if ((origflags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl) WAPBL_JLOCK_ASSERT(vp->v_mount); #endif trans_mp = NULL; holds_wapbl = false; retry: modified = false; flags = origflags; /* * shortcut if we have no pages to process. */ nodirty = uvm_obj_clean_p(uobj); #ifdef DIAGNOSTIC mutex_enter(vp->v_interlock); KASSERT((vp->v_iflag & VI_ONWORKLST) != 0 || nodirty); mutex_exit(vp->v_interlock); #endif if (uobj->uo_npages == 0 || (dirtyonly && nodirty)) { mutex_enter(vp->v_interlock); if (vp->v_iflag & VI_ONWORKLST && LIST_EMPTY(&vp->v_dirtyblkhd)) { vn_syncer_remove_from_worklist(vp); } mutex_exit(vp->v_interlock); if (trans_mp) { if (holds_wapbl) WAPBL_END(trans_mp); fstrans_done(trans_mp); } rw_exit(slock); return (0); } /* * the vnode has pages, set up to process the request. */ if (trans_mp == NULL && (flags & PGO_CLEANIT) != 0) { if (pagedaemon) { /* Pagedaemon must not sleep here. */ trans_mp = vp->v_mount; error = fstrans_start_nowait(trans_mp); if (error) { rw_exit(slock); return error; } } else { /* * Cannot use vdeadcheck() here as this operation * usually gets used from VOP_RECLAIM(). Test for * change of v_mount instead and retry on change. */ rw_exit(slock); trans_mp = vp->v_mount; fstrans_start(trans_mp); if (vp->v_mount != trans_mp) { fstrans_done(trans_mp); trans_mp = NULL; } else { holds_wapbl = (trans_mp->mnt_wapbl && (origflags & PGO_JOURNALLOCKED) == 0); if (holds_wapbl) { error = WAPBL_BEGIN(trans_mp); if (error) { fstrans_done(trans_mp); return error; } } } rw_enter(slock, RW_WRITER); goto retry; } } error = 0; wasclean = uvm_obj_nowriteback_p(uobj); nextoff = startoff; if (endoff == 0 || flags & PGO_ALLPAGES) { endoff = trunc_page(LLONG_MAX); } /* * if this vnode is known not to have dirty pages, * don't bother to clean it out. */ if (nodirty) { /* We handled the dirtyonly && nodirty case above. */ KASSERT(!dirtyonly); flags &= ~PGO_CLEANIT; } /* * start the loop to scan pages. 
*/ cleanall = true; freeflag = pagedaemon ? PG_PAGEOUT : PG_RELEASED; uvm_page_array_init(&a, uobj, dirtyonly ? (UVM_PAGE_ARRAY_FILL_DIRTY | (!async ? UVM_PAGE_ARRAY_FILL_WRITEBACK : 0)) : 0); for (;;) { bool pgprotected; /* * if !dirtyonly, iterate over all resident pages in the range. * * if dirtyonly, only possibly dirty pages are interesting. * however, if we are asked to sync for integrity, we should * wait on pages being written back by other threads as well. */ pg = uvm_page_array_fill_and_peek(&a, nextoff, 0); if (pg == NULL) { break; } KASSERT(pg->uobject == uobj); KASSERT((pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 || (pg->flags & (PG_BUSY)) != 0); KASSERT(pg->offset >= startoff); KASSERT(pg->offset >= nextoff); KASSERT(!dirtyonly || uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN || uvm_obj_page_writeback_p(pg)); if (pg->offset >= endoff) { break; } /* * a preempt point. */ if (preempt_needed()) { nextoff = pg->offset; /* visit this page again */ rw_exit(slock); preempt(); /* * as we dropped the object lock, our cached pages can * be stale. */ uvm_page_array_clear(&a); rw_enter(slock, RW_WRITER); continue; } /* * if the current page is busy, wait for it to become unbusy. */ if ((pg->flags & PG_BUSY) != 0) { UVMHIST_LOG(ubchist, "busy %#jx", (uintptr_t)pg, 0, 0, 0); if ((pg->flags & (PG_RELEASED|PG_PAGEOUT)) != 0 && (flags & PGO_BUSYFAIL) != 0) { UVMHIST_LOG(ubchist, "busyfail %#jx", (uintptr_t)pg, 0, 0, 0); error = EDEADLK; if (busypg != NULL) *busypg = pg; break; } if (pagedaemon) { /* * someone has taken the page while we * dropped the lock for fstrans_start. */ break; } /* * don't bother to wait on other's activities * unless we are asked to sync for integrity. */ if (!async && (flags & PGO_RECLAIM) == 0) { wasclean = false; nextoff = pg->offset + PAGE_SIZE; uvm_page_array_advance(&a); continue; } nextoff = pg->offset; /* visit this page again */ uvm_pagewait(pg, slock, "genput"); /* * as we dropped the object lock, our cached pages can * be stale. */ uvm_page_array_clear(&a); rw_enter(slock, RW_WRITER); continue; } nextoff = pg->offset + PAGE_SIZE; uvm_page_array_advance(&a); /* * if we're freeing, remove all mappings of the page now. * if we're cleaning, check if the page is needs to be cleaned. */ pgprotected = false; if (flags & PGO_FREE) { pmap_page_protect(pg, VM_PROT_NONE); pgprotected = true; } else if (flags & PGO_CLEANIT) { /* * if we still have some hope to pull this vnode off * from the syncer queue, write-protect the page. */ if (cleanall && wasclean) { /* * uobj pages get wired only by uvm_fault * where uobj is locked. */ if (pg->wire_count == 0) { pmap_page_protect(pg, VM_PROT_READ|VM_PROT_EXECUTE); pgprotected = true; } else { cleanall = false; } } } if (flags & PGO_CLEANIT) { needs_clean = uvm_pagecheckdirty(pg, pgprotected); } else { needs_clean = false; } /* * if we're cleaning, build a cluster. * the cluster will consist of pages which are currently dirty. * if not cleaning, just operate on the one page. */ if (needs_clean) { wasclean = false; memset(pgs, 0, sizeof(pgs)); pg->flags |= PG_BUSY; UVM_PAGE_OWN(pg, "genfs_putpages"); /* * let the fs constrain the offset range of the cluster. * we additionally constrain the range here such that * it fits in the "pgs" pages array. 
*/ off_t fslo, fshi, genlo, lo, off = pg->offset; GOP_PUTRANGE(vp, off, &fslo, &fshi); KASSERT(fslo == trunc_page(fslo)); KASSERT(fslo <= off); KASSERT(fshi == trunc_page(fshi)); KASSERT(fshi == 0 || off < fshi); if (off > MAXPHYS / 2) genlo = trunc_page(off - (MAXPHYS / 2)); else genlo = 0; lo = MAX(fslo, genlo); /* * first look backward. */ npages = (off - lo) >> PAGE_SHIFT; nback = npages; uvn_findpages(uobj, off - PAGE_SIZE, &nback, &pgs[0], NULL, UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD); if (nback) { memmove(&pgs[0], &pgs[npages - nback], nback * sizeof(pgs[0])); if (npages - nback < nback) memset(&pgs[nback], 0, (npages - nback) * sizeof(pgs[0])); else memset(&pgs[npages - nback], 0, nback * sizeof(pgs[0])); } /* * then plug in our page of interest. */ pgs[nback] = pg; /* * then look forward to fill in the remaining space in * the array of pages. * * pass our cached array of pages so that hopefully * uvn_findpages can find some good pages in it. * the array a was filled above with the one of * following sets of flags: * 0 * UVM_PAGE_ARRAY_FILL_DIRTY * UVM_PAGE_ARRAY_FILL_DIRTY|WRITEBACK * * XXX this is fragile but it'll work: the array * was earlier filled sparsely, but UFP_DIRTYONLY * implies dense. see corresponding comment in * uvn_findpages(). */ npages = MAXPAGES - nback - 1; if (fshi) npages = MIN(npages, (fshi - off - 1) >> PAGE_SHIFT); uvn_findpages(uobj, off + PAGE_SIZE, &npages, &pgs[nback + 1], &a, UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY); npages += nback + 1; } else { pgs[0] = pg; npages = 1; nback = 0; } /* * apply FREE or DEACTIVATE options if requested. */ for (i = 0; i < npages; i++) { tpg = pgs[i]; KASSERT(tpg->uobject == uobj); KASSERT(i == 0 || pgs[i-1]->offset + PAGE_SIZE == tpg->offset); KASSERT(!needs_clean || uvm_pagegetdirty(pgs[i]) != UVM_PAGE_STATUS_DIRTY); if (needs_clean) { /* * mark pages as WRITEBACK so that concurrent * fsync can find and wait for our activities. */ uvm_obj_page_set_writeback(pgs[i]); } if (tpg->offset < startoff || tpg->offset >= endoff) continue; if (flags & PGO_DEACTIVATE && tpg->wire_count == 0) { uvm_pagelock(tpg); uvm_pagedeactivate(tpg); uvm_pageunlock(tpg); } else if (flags & PGO_FREE) { pmap_page_protect(tpg, VM_PROT_NONE); if (tpg->flags & PG_BUSY) { tpg->flags |= freeflag; if (pagedaemon) { uvm_pageout_start(1); uvm_pagelock(tpg); uvm_pagedequeue(tpg); uvm_pageunlock(tpg); } } else { /* * ``page is not busy'' * implies that npages is 1 * and needs_clean is false. */ KASSERT(npages == 1); KASSERT(!needs_clean); KASSERT(pg == tpg); KASSERT(nextoff == tpg->offset + PAGE_SIZE); uvm_pagefree(tpg); if (pagedaemon) uvmexp.pdfreed++; } } } if (needs_clean) { modified = true; KASSERT(nextoff == pg->offset + PAGE_SIZE); KASSERT(nback < npages); nextoff = pg->offset + ((npages - nback) << PAGE_SHIFT); KASSERT(pgs[nback] == pg); KASSERT(nextoff == pgs[npages - 1]->offset + PAGE_SIZE); /* * start the i/o. */ rw_exit(slock); error = GOP_WRITE(vp, pgs, npages, flags); /* * as we dropped the object lock, our cached pages can * be stale. */ uvm_page_array_clear(&a); rw_enter(slock, RW_WRITER); if (error) { break; } } } uvm_page_array_fini(&a); /* * update ctime/mtime if the modification we started writing out might * be from mmap'ed write. * * this is necessary when an application keeps a file mmaped and * repeatedly modifies it via the window. note that, because we * don't always write-protect pages when cleaning, such modifications * might not involve any page faults. 
*/ mutex_enter(vp->v_interlock); if (modified && (vp->v_iflag & VI_WRMAP) != 0 && (vp->v_type != VBLK || (vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) { GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED); } /* * if we no longer have any possibly dirty pages, take us off the * syncer list. */ if ((vp->v_iflag & VI_ONWORKLST) != 0 && uvm_obj_clean_p(uobj) && LIST_EMPTY(&vp->v_dirtyblkhd)) { vn_syncer_remove_from_worklist(vp); } /* Wait for output to complete. */ rw_exit(slock); if (!wasclean && !async && vp->v_numoutput != 0) { while (vp->v_numoutput != 0) cv_wait(&vp->v_cv, vp->v_interlock); } onworklst = (vp->v_iflag & VI_ONWORKLST) != 0; mutex_exit(vp->v_interlock); if ((flags & PGO_RECLAIM) != 0 && onworklst) { /* * in the case of PGO_RECLAIM, ensure to make the vnode clean. * retrying is not a big deal because, in many cases, * uobj->uo_npages is already 0 here. */ rw_enter(slock, RW_WRITER); goto retry; } if (trans_mp) { if (holds_wapbl) WAPBL_END(trans_mp); fstrans_done(trans_mp); } return (error); } /* * Default putrange method for file systems that do not care * how many pages are given to one GOP_WRITE() call. */ void genfs_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip) { *lop = 0; *hip = 0; } int genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags) { off_t off; vaddr_t kva; size_t len; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx", (uintptr_t)vp, (uintptr_t)pgs, npages, flags); off = pgs[0]->offset; kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK); len = npages << PAGE_SHIFT; error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE, uvm_aio_aiodone); return error; } /* * genfs_gop_write_rwmap: * * a variant of genfs_gop_write. it's used by UDF for its directory buffers. * this maps pages with PROT_WRITE so that VOP_STRATEGY can modifies * the contents before writing it out to the underlying storage. */ int genfs_gop_write_rwmap(struct vnode *vp, struct vm_page **pgs, int npages, int flags) { off_t off; vaddr_t kva; size_t len; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx", (uintptr_t)vp, (uintptr_t)pgs, npages, flags); off = pgs[0]->offset; kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK); len = npages << PAGE_SHIFT; error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE, uvm_aio_aiodone); return error; } /* * Backend routine for doing I/O to vnode pages. Pages are already locked * and mapped into kernel memory. Here we just look up the underlying * device block addresses and call the strategy routine. */ static int genfs_do_io(struct vnode *vp, off_t off, vaddr_t kva, size_t len, int flags, enum uio_rw rw, void (*iodone)(struct buf *)) { int s, error; int fs_bshift, dev_bshift; off_t eof, offset, startoffset; size_t bytes, iobytes, skipbytes; struct buf *mbp, *bp; const bool async = (flags & PGO_SYNCIO) == 0; const bool lazy = (flags & PGO_LAZY) == 0; const bool iowrite = rw == UIO_WRITE; const int brw = iowrite ? 
B_WRITE : B_READ; UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); UVMHIST_LOG(ubchist, "vp %#jx kva %#jx len 0x%jx flags 0x%jx", (uintptr_t)vp, (uintptr_t)kva, len, flags); KASSERT(vp->v_size != VSIZENOTSET); KASSERT(vp->v_writesize != VSIZENOTSET); KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p" " v_size=0x%llx v_writesize=0x%llx", vp, (unsigned long long)vp->v_size, (unsigned long long)vp->v_writesize); GOP_SIZE(vp, vp->v_writesize, &eof, 0); if (vp->v_type != VBLK) { fs_bshift = vp->v_mount->mnt_fs_bshift; dev_bshift = vp->v_mount->mnt_dev_bshift; } else { fs_bshift = DEV_BSHIFT; dev_bshift = DEV_BSHIFT; } error = 0; startoffset = off; bytes = MIN(len, eof - startoffset); skipbytes = 0; KASSERT(bytes != 0); if (iowrite) { /* * why += 2? * 1 for biodone, 1 for uvm_aio_aiodone. */ mutex_enter(vp->v_interlock); vp->v_numoutput += 2; mutex_exit(vp->v_interlock); } mbp = getiobuf(vp, true); UVMHIST_LOG(ubchist, "vp %#jx mbp %#jx num now %jd bytes 0x%jx", (uintptr_t)vp, (uintptr_t)mbp, vp->v_numoutput, bytes); mbp->b_bufsize = len; mbp->b_data = (void *)kva; mbp->b_resid = mbp->b_bcount = bytes; mbp->b_cflags |= BC_BUSY | BC_AGE; if (async) { mbp->b_flags = brw | B_ASYNC; mbp->b_iodone = iodone; } else { mbp->b_flags = brw; mbp->b_iodone = NULL; } if (curlwp == uvm.pagedaemon_lwp) BIO_SETPRIO(mbp, BPRIO_TIMELIMITED); else if (async || lazy) BIO_SETPRIO(mbp, BPRIO_TIMENONCRITICAL); else BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL); bp = NULL; for (offset = startoffset; bytes > 0; offset += iobytes, bytes -= iobytes) { int run; daddr_t lbn, blkno; struct vnode *devvp; /* * bmap the file to find out the blkno to read from and * how much we can read in one i/o. if bmap returns an error, * skip the rest of the top-level i/o. */ lbn = offset >> fs_bshift; error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run); if (error) { UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd", lbn, error, 0, 0); skipbytes += bytes; bytes = 0; goto loopdone; } /* * see how many pages can be read with this i/o. * reduce the i/o size if necessary to avoid * overwriting pages with valid data. */ iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, bytes); /* * if this block isn't allocated, zero it instead of * reading it. unless we are going to allocate blocks, * mark the pages we zeroed PG_RDONLY. */ if (blkno == (daddr_t)-1) { if (!iowrite) { memset((char *)kva + (offset - startoffset), 0, iobytes); } skipbytes += iobytes; continue; } /* * allocate a sub-buf for this piece of the i/o * (or just use mbp if there's only 1 piece), * and start it going. 
*/ if (offset == startoffset && iobytes == bytes) { bp = mbp; } else { UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd", (uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0); bp = getiobuf(vp, true); nestiobuf_setup(mbp, bp, offset - startoffset, iobytes); } bp->b_lblkno = 0; /* adjust physical blkno for partial blocks */ bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >> dev_bshift); UVMHIST_LOG(ubchist, "bp %#jx offset 0x%jx bcount 0x%jx blkno 0x%jx", (uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno); VOP_STRATEGY(devvp, bp); } loopdone: if (skipbytes) { UVMHIST_LOG(ubchist, "skipbytes %jd", skipbytes, 0,0,0); } nestiobuf_done(mbp, skipbytes, error); if (async) { UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0); return (0); } UVMHIST_LOG(ubchist, "waiting for mbp %#jx", (uintptr_t)mbp, 0, 0, 0); error = biowait(mbp); s = splbio(); (*iodone)(mbp); splx(s); UVMHIST_LOG(ubchist, "returning, error %jd", error, 0, 0, 0); return (error); } int genfs_compat_getpages(void *v) { struct vop_getpages_args /* { struct vnode *a_vp; voff_t a_offset; struct vm_page **a_m; int *a_count; int a_centeridx; vm_prot_t a_access_type; int a_advice; int a_flags; } */ *ap = v; off_t origoffset; struct vnode *vp = ap->a_vp; struct uvm_object *uobj = &vp->v_uobj; struct vm_page *pg, **pgs; vaddr_t kva; int i, error, orignpages, npages; struct iovec iov; struct uio uio; kauth_cred_t cred = curlwp->l_cred; const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0; error = 0; origoffset = ap->a_offset; orignpages = *ap->a_count; pgs = ap->a_m; if (ap->a_flags & PGO_LOCKED) { uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, NULL, UFP_NOWAIT|UFP_NOALLOC| (memwrite ? UFP_NORDONLY : 0)); error = ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0; return error; } if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= vp->v_size) { rw_exit(uobj->vmobjlock); return EINVAL; } if ((ap->a_flags & PGO_SYNCIO) == 0) { rw_exit(uobj->vmobjlock); return 0; } npages = orignpages; uvn_findpages(uobj, origoffset, &npages, pgs, NULL, UFP_ALL); rw_exit(uobj->vmobjlock); kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK); for (i = 0; i < npages; i++) { pg = pgs[i]; if ((pg->flags & PG_FAKE) == 0) { continue; } iov.iov_base = (char *)kva + (i << PAGE_SHIFT); iov.iov_len = PAGE_SIZE; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = origoffset + (i << PAGE_SHIFT); uio.uio_rw = UIO_READ; uio.uio_resid = PAGE_SIZE; UIO_SETUP_SYSSPACE(&uio); /* XXX vn_lock */ error = VOP_READ(vp, &uio, 0, cred); if (error) { break; } if (uio.uio_resid) { memset(iov.iov_base, 0, uio.uio_resid); } } uvm_pagermapout(kva, npages); rw_enter(uobj->vmobjlock, RW_WRITER); for (i = 0; i < npages; i++) { pg = pgs[i]; if (error && (pg->flags & PG_FAKE) != 0) { pg->flags |= PG_RELEASED; } else { uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN); uvm_pagelock(pg); uvm_pageactivate(pg); uvm_pageunlock(pg); } } if (error) { uvm_page_unbusy(pgs, npages); } rw_exit(uobj->vmobjlock); return error; } int genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags) { off_t offset; struct iovec iov; struct uio uio; kauth_cred_t cred = curlwp->l_cred; struct buf *bp; vaddr_t kva; int error; offset = pgs[0]->offset; kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK); iov.iov_base = (void *)kva; iov.iov_len = npages << PAGE_SHIFT; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_rw = UIO_WRITE; uio.uio_resid = npages << PAGE_SHIFT; UIO_SETUP_SYSSPACE(&uio); 
/* XXX vn_lock */ error = VOP_WRITE(vp, &uio, 0, cred); mutex_enter(vp->v_interlock); vp->v_numoutput++; mutex_exit(vp->v_interlock); bp = getiobuf(vp, true); bp->b_cflags |= BC_BUSY | BC_AGE; bp->b_lblkno = offset >> vp->v_mount->mnt_fs_bshift; bp->b_data = (char *)kva; bp->b_bcount = npages << PAGE_SHIFT; bp->b_bufsize = npages << PAGE_SHIFT; bp->b_resid = 0; bp->b_error = error; uvm_aio_aiodone(bp); return (error); } /* * Process a uio using direct I/O. If we reach a part of the request * which cannot be processed in this fashion for some reason, just return. * The caller must handle some additional part of the request using * buffered I/O before trying direct I/O again. */ void genfs_directio(struct vnode *vp, struct uio *uio, int ioflag) { struct vmspace *vs; struct iovec *iov; vaddr_t va; size_t len; const int mask = DEV_BSIZE - 1; int error; bool need_wapbl = (vp->v_mount && vp->v_mount->mnt_wapbl && (ioflag & IO_JOURNALLOCKED) == 0); #ifdef DIAGNOSTIC if ((ioflag & IO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl) WAPBL_JLOCK_ASSERT(vp->v_mount); #endif /* * We only support direct I/O to user space for now. */ if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) { return; } /* * If the vnode is mapped, we would need to get the getpages lock * to stabilize the bmap, but then we would get into trouble while * locking the pages if the pages belong to this same vnode (or a * multi-vnode cascade to the same effect). Just fall back to * buffered I/O if the vnode is mapped to avoid this mess. */ if (vp->v_vflag & VV_MAPPED) { return; } if (need_wapbl) { error = WAPBL_BEGIN(vp->v_mount); if (error) return; } /* * Do as much of the uio as possible with direct I/O. */ vs = uio->uio_vmspace; while (uio->uio_resid) { iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; continue; } va = (vaddr_t)iov->iov_base; len = MIN(iov->iov_len, genfs_maxdio); len &= ~mask; /* * If the next chunk is smaller than DEV_BSIZE or extends past * the current EOF, then fall back to buffered I/O. */ if (len == 0 || uio->uio_offset + len > vp->v_size) { break; } /* * Check alignment. The file offset must be at least * sector-aligned. The exact constraint on memory alignment * is very hardware-dependent, but requiring sector-aligned * addresses there too is safe. */ if (uio->uio_offset & mask || va & mask) { break; } error = genfs_do_directio(vs, va, len, vp, uio->uio_offset, uio->uio_rw); if (error) { break; } iov->iov_base = (char *)iov->iov_base + len; iov->iov_len -= len; uio->uio_offset += len; uio->uio_resid -= len; } if (need_wapbl) WAPBL_END(vp->v_mount); } /* * Iodone routine for direct I/O. We don't do much here since the request is * always synchronous, so the caller will do most of the work after biowait(). */ static void genfs_dio_iodone(struct buf *bp) { KASSERT((bp->b_flags & B_ASYNC) == 0); if ((bp->b_flags & B_READ) == 0 && (bp->b_cflags & BC_AGE) != 0) { mutex_enter(bp->b_objlock); vwakeup(bp); mutex_exit(bp->b_objlock); } putiobuf(bp); } /* * Process one chunk of a direct I/O request. */ static int genfs_do_directio(struct vmspace *vs, vaddr_t uva, size_t len, struct vnode *vp, off_t off, enum uio_rw rw) { struct vm_map *map; struct pmap *upm, *kpm __unused; size_t klen = round_page(uva + len) - trunc_page(uva); off_t spoff, epoff; vaddr_t kva, puva; paddr_t pa; vm_prot_t prot; int error, rv __diagused, poff, koff; const int pgoflags = PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED | (rw == UIO_WRITE ? 
PGO_FREE : 0); /* * For writes, verify that this range of the file already has fully * allocated backing store. If there are any holes, just punt and * make the caller take the buffered write path. */ if (rw == UIO_WRITE) { daddr_t lbn, elbn, blkno; int bsize, bshift, run; bshift = vp->v_mount->mnt_fs_bshift; bsize = 1 << bshift; lbn = off >> bshift; elbn = (off + len + bsize - 1) >> bshift; while (lbn < elbn) { error = VOP_BMAP(vp, lbn, NULL, &blkno, &run); if (error) { return error; } if (blkno == (daddr_t)-1) { return ENOSPC; } lbn += 1 + run; } } /* * Flush any cached pages for parts of the file that we're about to * access. If we're writing, invalidate pages as well. */ spoff = trunc_page(off); epoff = round_page(off + len); rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); error = VOP_PUTPAGES(vp, spoff, epoff, pgoflags); if (error) { return error; } /* * Wire the user pages and remap them into kernel memory. */ prot = rw == UIO_READ ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ; error = uvm_vslock(vs, (void *)uva, len, prot); if (error) { return error; } map = &vs->vm_map; upm = vm_map_pmap(map); kpm = vm_map_pmap(kernel_map); puva = trunc_page(uva); kva = uvm_km_alloc(kernel_map, klen, atop(puva) & uvmexp.colormask, UVM_KMF_VAONLY | UVM_KMF_WAITVA | UVM_KMF_COLORMATCH); for (poff = 0; poff < klen; poff += PAGE_SIZE) { rv = pmap_extract(upm, puva + poff, &pa); KASSERT(rv); pmap_kenter_pa(kva + poff, pa, prot, PMAP_WIRED); } pmap_update(kpm); /* * Do the I/O. */ koff = uva - trunc_page(uva); error = genfs_do_io(vp, off, kva + koff, len, PGO_SYNCIO, rw, genfs_dio_iodone); /* * Tear down the kernel mapping. */ pmap_kremove(kva, klen); pmap_update(kpm); uvm_km_free(kernel_map, kva, klen, UVM_KMF_VAONLY); /* * Unwire the user pages. */ uvm_vsunlock(vs, (void *)uva, len); return error; }
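/*
 * Illustrative sketch (not part of the original NetBSD sources): genfs_directio()
 * above only hands a chunk to genfs_do_directio() when the file offset and the
 * user address are sector aligned and the length, once truncated to a sector
 * multiple, is still non-zero and does not extend past EOF.  The self-contained
 * helper below restates just that alignment arithmetic; the ex_ names and the
 * fixed 512-byte sector size are hypothetical and exist only for this example.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define EX_DEV_BSIZE	512			/* assumed sector size */

static bool
ex_dio_chunk_ok(uint64_t off, uintptr_t va, size_t *lenp, uint64_t filesize)
{
	const uint64_t mask = EX_DEV_BSIZE - 1;

	*lenp &= ~(size_t)mask;			/* truncate to a sector multiple */
	if (*lenp == 0 || off + *lenp > filesize)
		return false;			/* too small or past EOF: buffered path */
	if ((off & mask) != 0 || ((uint64_t)va & mask) != 0)
		return false;			/* misaligned: buffered path */
	return true;				/* eligible for direct I/O */
}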
/* $NetBSD: ffs_balloc.c,v 1.66 2022/11/17 06:40:40 chs Exp $ */ /* * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ffs_balloc.c,v 1.66 2022/11/17 06:40:40 chs Exp $"); #if defined(_KERNEL_OPT) #include "opt_quota.h" #include "opt_uvmhist.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/file.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/kauth.h> #include <sys/fstrans.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_bswap.h> #include <ufs/ffs/fs.h> #include <ufs/ffs/ffs_extern.h> #ifdef UVMHIST #include <uvm/uvm.h> #endif #include <uvm/uvm_extern.h> #include <uvm/uvm_stat.h> static int ffs_balloc_ufs1(struct vnode *, off_t, int, kauth_cred_t, int, struct buf **); static int ffs_balloc_ufs2(struct vnode *, off_t, int, kauth_cred_t, int, struct buf **); static daddr_t ffs_extb(struct fs *fs, struct ufs2_dinode *dp, daddr_t nb) { return ufs_rw64(dp->di_extb[nb], UFS_FSNEEDSWAP(fs)); } /* * Balloc defines the structure of file system storage * by allocating the physical blocks on a device given * the inode and the logical block number in a file. */ int ffs_balloc(struct vnode *vp, off_t off, int size, kauth_cred_t cred, int flags, struct buf **bpp) { int error; if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC) error = ffs_balloc_ufs2(vp, off, size, cred, flags, bpp); else error = ffs_balloc_ufs1(vp, off, size, cred, flags, bpp); if (error == 0 && bpp != NULL && (error = fscow_run(*bpp, false)) != 0) brelse(*bpp, 0); return error; } static int ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred, int flags, struct buf **bpp) { daddr_t lbn, lastlbn; struct buf *bp, *nbp; struct inode *ip = VTOI(vp); struct fs *fs = ip->i_fs; struct ufsmount *ump = ip->i_ump; struct indir indirs[UFS_NIADDR + 2]; daddr_t newb, pref, nb; int32_t *bap; /* XXX ondisk32 */ int deallocated, osize, nsize, num, i, error; int32_t *blkp, *allocblk, allociblk[UFS_NIADDR + 1]; int32_t *allocib; int unwindidx = -1; const int needswap = UFS_FSNEEDSWAP(fs); UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist); lbn = ffs_lblkno(fs, off); size = ffs_blkoff(fs, off) + size; if (size > fs->fs_bsize) panic("ffs_balloc: blk too big"); if (bpp != NULL) { *bpp = NULL; } UVMHIST_LOG(ubchist, "vp %#jx lbn 0x%jx size 0x%jx", (uintptr_t)vp, lbn, size, 0); if (lbn < 0) return (EFBIG); /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. 
*/ lastlbn = ffs_lblkno(fs, ip->i_size); if (lastlbn < UFS_NDADDR && lastlbn < lbn) { nb = lastlbn; osize = ffs_blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { mutex_enter(&ump->um_lock); error = ffs_realloccg(ip, nb, ffs_getdb(fs, ip, nb), ffs_blkpref_ufs1(ip, lastlbn, nb, flags, &ip->i_ffs1_db[0]), osize, (int)fs->fs_bsize, flags, cred, bpp, &newb); if (error) return (error); ip->i_size = ffs_lblktosize(fs, nb + 1); ip->i_ffs1_size = ip->i_size; uvm_vnp_setsize(vp, ip->i_ffs1_size); ip->i_ffs1_db[nb] = ufs_rw32((u_int32_t)newb, needswap); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (bpp && *bpp) { if (flags & B_SYNC) bwrite(*bpp); else bawrite(*bpp); } } } /* * The first UFS_NDADDR blocks are direct blocks */ if (lbn < UFS_NDADDR) { nb = ufs_rw32(ip->i_ffs1_db[lbn], needswap); if (nb != 0 && ip->i_size >= ffs_lblktosize(fs, lbn + 1)) { /* * The block is an already-allocated direct block * and the file already extends past this block, * thus this must be a whole block. * Just read the block (if requested). */ if (bpp != NULL) { error = bread(vp, lbn, fs->fs_bsize, B_MODIFY, bpp); if (error) { return (error); } } return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = ffs_fragroundup(fs, ffs_blkoff(fs, ip->i_size)); nsize = ffs_fragroundup(fs, size); if (nsize <= osize) { /* * The existing block is already * at least as big as we want. * Just read the block (if requested). */ if (bpp != NULL) { error = bread(vp, lbn, osize, B_MODIFY, bpp); if (error) { return (error); } } return 0; } else { /* * The existing block is smaller than we want, * grow it. */ mutex_enter(&ump->um_lock); error = ffs_realloccg(ip, lbn, ffs_getdb(fs, ip, lbn), ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags, &ip->i_ffs1_db[0]), osize, nsize, flags, cred, bpp, &newb); if (error) return (error); } } else { /* * the block was not previously allocated, * allocate a new block or fragment. */ if (ip->i_size < ffs_lblktosize(fs, lbn + 1)) nsize = ffs_fragroundup(fs, size); else nsize = fs->fs_bsize; mutex_enter(&ump->um_lock); error = ffs_alloc(ip, lbn, ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags, &ip->i_ffs1_db[0]), nsize, flags, cred, &newb); if (error) return (error); if (bpp != NULL) { error = ffs_getblk(vp, lbn, FFS_FSBTODB(fs, newb), nsize, (flags & B_CLRBUF) != 0, bpp); if (error) return error; } } ip->i_ffs1_db[lbn] = ufs_rw32((u_int32_t)newb, needswap); ip->i_flag |= IN_CHANGE | IN_UPDATE; return (0); } /* * Determine the number of levels of indirection. */ pref = 0; if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) return (error); /* * Fetch the first indirect block allocating if necessary. */ --num; nb = ufs_rw32(ip->i_ffs1_ib[indirs[0].in_off], needswap); allocib = NULL; allocblk = allociblk; if (nb == 0) { mutex_enter(&ump->um_lock); pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, NULL); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | B_METAONLY, cred, &newb); if (error) goto fail; nb = newb; *allocblk++ = nb; error = ffs_getblk(vp, indirs[1].in_lbn, FFS_FSBTODB(fs, nb), fs->fs_bsize, true, &bp); if (error) goto fail; /* * Write synchronously so that indirect blocks * never point at garbage. */ if ((error = bwrite(bp)) != 0) goto fail; unwindidx = 0; allocib = &ip->i_ffs1_ib[indirs[0].in_off]; *allocib = ufs_rw32(nb, needswap); ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Fetch through the indirect blocks, allocating as necessary. 
*/ for (i = 1;;) { error = bread(vp, indirs[i].in_lbn, (int)fs->fs_bsize, 0, &bp); if (error) { goto fail; } bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ nb = ufs_rw32(bap[indirs[i].in_off], needswap); if (i == num) break; i++; if (nb != 0) { brelse(bp, 0); continue; } if (fscow_run(bp, true) != 0) { brelse(bp, 0); goto fail; } mutex_enter(&ump->um_lock); /* Try to keep snapshot indirect blocks contiguous. */ if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0) pref = ffs_blkpref_ufs1(ip, lbn, indirs[i-1].in_off, flags | B_METAONLY, &bap[0]); if (pref == 0) pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, NULL); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | B_METAONLY, cred, &newb); if (error) { brelse(bp, 0); goto fail; } nb = newb; *allocblk++ = nb; error = ffs_getblk(vp, indirs[i].in_lbn, FFS_FSBTODB(fs, nb), fs->fs_bsize, true, &nbp); if (error) { brelse(bp, 0); goto fail; } /* * Write synchronously so that indirect blocks * never point at garbage. */ if ((error = bwrite(nbp)) != 0) { brelse(bp, 0); goto fail; } if (unwindidx < 0) unwindidx = i - 1; bap[indirs[i - 1].in_off] = ufs_rw32(nb, needswap); /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & B_SYNC) { bwrite(bp); } else { bdwrite(bp); } } if (flags & B_METAONLY) { KASSERT(bpp != NULL); *bpp = bp; return (0); } /* * Get the data block, allocating if necessary. */ if (nb == 0) { if (fscow_run(bp, true) != 0) { brelse(bp, 0); goto fail; } mutex_enter(&ump->um_lock); pref = ffs_blkpref_ufs1(ip, lbn, indirs[num].in_off, flags, &bap[0]); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, &newb); if (error) { brelse(bp, 0); goto fail; } nb = newb; *allocblk++ = nb; if (bpp != NULL) { error = ffs_getblk(vp, lbn, FFS_FSBTODB(fs, nb), fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp); if (error) { brelse(bp, 0); goto fail; } } bap[indirs[num].in_off] = ufs_rw32(nb, needswap); if (allocib == NULL && unwindidx < 0) { unwindidx = i - 1; } /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & B_SYNC) { bwrite(bp); } else { bdwrite(bp); } return (0); } brelse(bp, 0); if (bpp != NULL) { if (flags & B_CLRBUF) { error = bread(vp, lbn, (int)fs->fs_bsize, B_MODIFY, &nbp); if (error) { goto fail; } } else { error = ffs_getblk(vp, lbn, FFS_FSBTODB(fs, nb), fs->fs_bsize, true, &nbp); if (error) goto fail; } *bpp = nbp; } return (0); fail: /* * If we have failed part way through block allocation, we * have to deallocate any indirect blocks that we have allocated. */ if (unwindidx >= 0) { /* * First write out any buffers we've created to resolve their * softdeps. This must be done in reverse order of creation * so that we resolve the dependencies in one pass. * Write the cylinder group buffers for these buffers too. */ for (i = num; i >= unwindidx; i--) { if (i == 0) { break; } if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, fs->fs_bsize, false, &bp) != 0) continue; if (bp->b_oflags & BO_DELWRI) { nb = FFS_FSBTODB(fs, cgtod(fs, dtog(fs, FFS_DBTOFSB(fs, bp->b_blkno)))); bwrite(bp); if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK, fs->fs_cgsize, false, &bp) != 0) continue; if (bp->b_oflags & BO_DELWRI) { bwrite(bp); } else { brelse(bp, BC_INVAL); } } else { brelse(bp, BC_INVAL); } } /* * Undo the partial allocation. 
*/ if (unwindidx == 0) { *allocib = 0; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { int r; r = bread(vp, indirs[unwindidx].in_lbn, (int)fs->fs_bsize, 0, &bp); if (r) { panic("Could not unwind indirect block, error %d", r); } else { bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ bap[indirs[unwindidx].in_off] = 0; bwrite(bp); } } for (i = unwindidx + 1; i <= num; i++) { if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, fs->fs_bsize, false, &bp) == 0) brelse(bp, BC_INVAL); } } for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) { ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number); deallocated += fs->fs_bsize; } if (deallocated) { #if defined(QUOTA) || defined(QUOTA2) /* * Restore user's disk quota because allocation failed. */ (void)chkdq(ip, -btodb(deallocated), cred, FORCE); #endif ip->i_ffs1_blocks -= btodb(deallocated); ip->i_flag |= IN_CHANGE | IN_UPDATE; } return (error); } static int ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, int flags, struct buf **bpp) { daddr_t lbn, lastlbn; struct buf *bp, *nbp; struct inode *ip = VTOI(vp); struct fs *fs = ip->i_fs; struct ufsmount *ump = ip->i_ump; struct indir indirs[UFS_NIADDR + 2]; daddr_t newb, pref, nb; int64_t *bap; int deallocated, osize, nsize, num, i, error; daddr_t *blkp, *allocblk, allociblk[UFS_NIADDR + 1]; int64_t *allocib; int unwindidx = -1; const int needswap = UFS_FSNEEDSWAP(fs); UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist); KASSERT((ump->um_flags & UFS_EA) != 0 || (flags & IO_EXT) == 0); lbn = ffs_lblkno(fs, off); size = ffs_blkoff(fs, off) + size; if (size > fs->fs_bsize) panic("ffs_balloc: blk too big"); if (bpp != NULL) { *bpp = NULL; } UVMHIST_LOG(ubchist, "vp %#jx lbn 0x%jx size 0x%jx", (uintptr_t)vp, lbn, size, 0); if (lbn < 0) return (EFBIG); /* * Check for allocating external data. */ if (flags & IO_EXT) { struct ufs2_dinode *dp = ip->i_din.ffs2_din; if (lbn >= UFS_NXADDR) return (EFBIG); /* * If the next write will extend the data into a new block, * and the data is currently composed of a fragment * this fragment has to be extended to be a full block. */ lastlbn = ffs_lblkno(fs, dp->di_extsize); if (lastlbn < lbn) { nb = lastlbn; osize = ffs_sblksize(fs, dp->di_extsize, nb); if (osize < fs->fs_bsize && osize > 0) { mutex_enter(&ump->um_lock); error = ffs_realloccg(ip, -1 - nb, ffs_extb(fs, dp, nb), ffs_blkpref_ufs2(ip, lastlbn, (int)nb, flags, &dp->di_extb[0]), osize, (int)fs->fs_bsize, flags, cred, &bp, &newb); if (error) return (error); dp->di_extsize = ffs_lblktosize(fs, nb + 1); dp->di_extb[nb] = FFS_DBTOFSB(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (flags & IO_SYNC) bwrite(bp); else bawrite(bp); } } /* * All blocks are direct blocks */ nb = dp->di_extb[lbn]; if (nb != 0 && dp->di_extsize >= ffs_lblktosize(fs, lbn + 1)) { error = bread(vp, -1 - lbn, fs->fs_bsize, 0, &bp); if (error) { return (error); } mutex_enter(bp->b_objlock); bp->b_blkno = FFS_FSBTODB(fs, nb); mutex_exit(bp->b_objlock); *bpp = bp; return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. 
*/ osize = ffs_fragroundup(fs, ffs_blkoff(fs, dp->di_extsize)); nsize = ffs_fragroundup(fs, size); if (nsize <= osize) { error = bread(vp, -1 - lbn, osize, 0, &bp); if (error) { return (error); } mutex_enter(bp->b_objlock); bp->b_blkno = FFS_FSBTODB(fs, nb); mutex_exit(bp->b_objlock); } else { mutex_enter(&ump->um_lock); error = ffs_realloccg(ip, -1 - lbn, ffs_extb(fs, dp, lbn), ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, &dp->di_extb[0]), osize, nsize, flags, cred, &bp, &newb); if (error) return (error); } } else { if (dp->di_extsize < ffs_lblktosize(fs, lbn + 1)) nsize = ffs_fragroundup(fs, size); else nsize = fs->fs_bsize; mutex_enter(&ump->um_lock); error = ffs_alloc(ip, lbn, ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, &dp->di_extb[0]), nsize, flags, cred, &newb); if (error) return (error); error = ffs_getblk(vp, -1 - lbn, FFS_FSBTODB(fs, newb), nsize, (flags & B_CLRBUF) != 0, &bp); if (error) return error; } dp->di_extb[lbn] = FFS_DBTOFSB(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bpp = bp; return (0); } /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. */ lastlbn = ffs_lblkno(fs, ip->i_size); if (lastlbn < UFS_NDADDR && lastlbn < lbn) { nb = lastlbn; osize = ffs_blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { mutex_enter(&ump->um_lock); error = ffs_realloccg(ip, nb, ffs_getdb(fs, ip, lbn), ffs_blkpref_ufs2(ip, lastlbn, nb, flags, &ip->i_ffs2_db[0]), osize, (int)fs->fs_bsize, flags, cred, bpp, &newb); if (error) return (error); ip->i_size = ffs_lblktosize(fs, nb + 1); ip->i_ffs2_size = ip->i_size; uvm_vnp_setsize(vp, ip->i_size); ip->i_ffs2_db[nb] = ufs_rw64(newb, needswap); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (bpp) { if (flags & B_SYNC) bwrite(*bpp); else bawrite(*bpp); } } } /* * The first UFS_NDADDR blocks are direct blocks */ if (lbn < UFS_NDADDR) { nb = ufs_rw64(ip->i_ffs2_db[lbn], needswap); if (nb != 0 && ip->i_size >= ffs_lblktosize(fs, lbn + 1)) { /* * The block is an already-allocated direct block * and the file already extends past this block, * thus this must be a whole block. * Just read the block (if requested). */ if (bpp != NULL) { error = bread(vp, lbn, fs->fs_bsize, B_MODIFY, bpp); if (error) { return (error); } } return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = ffs_fragroundup(fs, ffs_blkoff(fs, ip->i_size)); nsize = ffs_fragroundup(fs, size); if (nsize <= osize) { /* * The existing block is already * at least as big as we want. * Just read the block (if requested). */ if (bpp != NULL) { error = bread(vp, lbn, osize, B_MODIFY, bpp); if (error) { return (error); } } return 0; } else { /* * The existing block is smaller than we want, * grow it. */ mutex_enter(&ump->um_lock); error = ffs_realloccg(ip, lbn, ffs_getdb(fs, ip, lbn), ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, &ip->i_ffs2_db[0]), osize, nsize, flags, cred, bpp, &newb); if (error) return (error); } } else { /* * the block was not previously allocated, * allocate a new block or fragment. 
*/ if (ip->i_size < ffs_lblktosize(fs, lbn + 1)) nsize = ffs_fragroundup(fs, size); else nsize = fs->fs_bsize; mutex_enter(&ump->um_lock); error = ffs_alloc(ip, lbn, ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, &ip->i_ffs2_db[0]), nsize, flags, cred, &newb); if (error) return (error); if (bpp != NULL) { error = ffs_getblk(vp, lbn, FFS_FSBTODB(fs, newb), nsize, (flags & B_CLRBUF) != 0, bpp); if (error) return error; } } ip->i_ffs2_db[lbn] = ufs_rw64(newb, needswap); ip->i_flag |= IN_CHANGE | IN_UPDATE; return (0); } /* * Determine the number of levels of indirection. */ pref = 0; if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) return (error); /* * Fetch the first indirect block allocating if necessary. */ --num; nb = ufs_rw64(ip->i_ffs2_ib[indirs[0].in_off], needswap); allocib = NULL; allocblk = allociblk; if (nb == 0) { mutex_enter(&ump->um_lock); pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, NULL); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | B_METAONLY, cred, &newb); if (error) goto fail; nb = newb; *allocblk++ = nb; error = ffs_getblk(vp, indirs[1].in_lbn, FFS_FSBTODB(fs, nb), fs->fs_bsize, true, &bp); if (error) goto fail; /* * Write synchronously so that indirect blocks * never point at garbage. */ if ((error = bwrite(bp)) != 0) goto fail; unwindidx = 0; allocib = &ip->i_ffs2_ib[indirs[0].in_off]; *allocib = ufs_rw64(nb, needswap); ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Fetch through the indirect blocks, allocating as necessary. */ for (i = 1;;) { error = bread(vp, indirs[i].in_lbn, (int)fs->fs_bsize, 0, &bp); if (error) { goto fail; } bap = (int64_t *)bp->b_data; nb = ufs_rw64(bap[indirs[i].in_off], needswap); if (i == num) break; i++; if (nb != 0) { brelse(bp, 0); continue; } if (fscow_run(bp, true) != 0) { brelse(bp, 0); goto fail; } mutex_enter(&ump->um_lock); /* Try to keep snapshot indirect blocks contiguous. */ if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0) pref = ffs_blkpref_ufs2(ip, lbn, indirs[i-1].in_off, flags | B_METAONLY, &bap[0]); if (pref == 0) pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, NULL); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | B_METAONLY, cred, &newb); if (error) { brelse(bp, 0); goto fail; } nb = newb; *allocblk++ = nb; error = ffs_getblk(vp, indirs[i].in_lbn, FFS_FSBTODB(fs, nb), fs->fs_bsize, true, &nbp); if (error) { brelse(bp, 0); goto fail; } /* * Write synchronously so that indirect blocks * never point at garbage. */ if ((error = bwrite(nbp)) != 0) { brelse(bp, 0); goto fail; } if (unwindidx < 0) unwindidx = i - 1; bap[indirs[i - 1].in_off] = ufs_rw64(nb, needswap); /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & B_SYNC) { bwrite(bp); } else { bdwrite(bp); } } if (flags & B_METAONLY) { KASSERT(bpp != NULL); *bpp = bp; return (0); } /* * Get the data block, allocating if necessary. */ if (nb == 0) { if (fscow_run(bp, true) != 0) { brelse(bp, 0); goto fail; } mutex_enter(&ump->um_lock); pref = ffs_blkpref_ufs2(ip, lbn, indirs[num].in_off, flags, &bap[0]); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, &newb); if (error) { brelse(bp, 0); goto fail; } nb = newb; *allocblk++ = nb; if (bpp != NULL) { error = ffs_getblk(vp, lbn, FFS_FSBTODB(fs, nb), fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp); if (error) { brelse(bp, 0); goto fail; } } bap[indirs[num].in_off] = ufs_rw64(nb, needswap); if (allocib == NULL && unwindidx < 0) { unwindidx = i - 1; } /* * If required, write synchronously, otherwise use * delayed write. 
*/ if (flags & B_SYNC) { bwrite(bp); } else { bdwrite(bp); } return (0); } brelse(bp, 0); if (bpp != NULL) { if (flags & B_CLRBUF) { error = bread(vp, lbn, (int)fs->fs_bsize, B_MODIFY, &nbp); if (error) { goto fail; } } else { error = ffs_getblk(vp, lbn, FFS_FSBTODB(fs, nb), fs->fs_bsize, true, &nbp); if (error) goto fail; } *bpp = nbp; } return (0); fail: /* * If we have failed part way through block allocation, we * have to deallocate any indirect blocks that we have allocated. */ if (unwindidx >= 0) { /* * First write out any buffers we've created to resolve their * softdeps. This must be done in reverse order of creation * so that we resolve the dependencies in one pass. * Write the cylinder group buffers for these buffers too. */ for (i = num; i >= unwindidx; i--) { if (i == 0) { break; } if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, fs->fs_bsize, false, &bp) != 0) continue; if (bp->b_oflags & BO_DELWRI) { nb = FFS_FSBTODB(fs, cgtod(fs, dtog(fs, FFS_DBTOFSB(fs, bp->b_blkno)))); bwrite(bp); if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK, fs->fs_cgsize, false, &bp) != 0) continue; if (bp->b_oflags & BO_DELWRI) { bwrite(bp); } else { brelse(bp, BC_INVAL); } } else { brelse(bp, BC_INVAL); } } /* * Now that any dependencies that we created have been * resolved, we can undo the partial allocation. */ if (unwindidx == 0) { *allocib = 0; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { int r; r = bread(vp, indirs[unwindidx].in_lbn, (int)fs->fs_bsize, 0, &bp); if (r) { panic("Could not unwind indirect block, error %d", r); } else { bap = (int64_t *)bp->b_data; bap[indirs[unwindidx].in_off] = 0; bwrite(bp); } } for (i = unwindidx + 1; i <= num; i++) { if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, fs->fs_bsize, false, &bp) == 0) brelse(bp, BC_INVAL); } } for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) { ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number); deallocated += fs->fs_bsize; } if (deallocated) { #if defined(QUOTA) || defined(QUOTA2) /* * Restore user's disk quota because allocation failed. */ (void)chkdq(ip, -btodb(deallocated), cred, FORCE); #endif ip->i_ffs2_blocks -= btodb(deallocated); ip->i_flag |= IN_CHANGE | IN_UPDATE; } return (error); }
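/*
 * Illustrative sketch (not part of the original NetBSD sources): both
 * ffs_balloc_ufs1() and ffs_balloc_ufs2() above first convert the byte offset
 * into a logical block number with ffs_lblkno() and, for writes landing in the
 * last partial block, round the size up to whole fragments with
 * ffs_fragroundup().  The fragment below restates that arithmetic with plain
 * shifts and masks; the ex_ names and the fixed 8 KB block / 1 KB fragment
 * geometry are hypothetical and exist only for this example.
 */
#include <stdint.h>

#define EX_BSHIFT	13			/* assumed fs_bshift: 8 KB blocks */
#define EX_BSIZE	(1 << EX_BSHIFT)
#define EX_FSIZE	1024			/* assumed fragment size */

static int64_t
ex_lblkno(int64_t off)
{
	return off >> EX_BSHIFT;		/* logical block containing 'off' */
}

static int64_t
ex_blkoff(int64_t off)
{
	return off & (EX_BSIZE - 1);		/* byte offset within that block */
}

static int64_t
ex_fragroundup(int64_t size)
{
	/* round a partial-block size up to a whole number of fragments */
	return (size + EX_FSIZE - 1) & ~((int64_t)EX_FSIZE - 1);
}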
806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 
1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 /* $NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $ */ /*- * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008, 2019, 2020 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, by Charles M. Hannum, by Andrew Doran, * by Marshall Kirk McKusick and Greg Ganger at the University of Michigan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. 
or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_43.h" #include "opt_compat_netbsd.h" #include "opt_ddb.h" #endif #include <sys/param.h> #include <sys/types.h> #include <sys/buf.h> #include <sys/conf.h> #include <sys/dirent.h> #include <sys/errno.h> #include <sys/filedesc.h> #include <sys/fstrans.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/module.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/stat.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/vnode_impl.h> #include <miscfs/deadfs/deadfs.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <uvm/uvm_ddb.h> SDT_PROBE_DEFINE3(vfs, syncer, worklist, vnode__add, "struct vnode *"/*vp*/, "int"/*delayx*/, "int"/*slot*/); SDT_PROBE_DEFINE4(vfs, syncer, worklist, vnode__update, "struct vnode *"/*vp*/, "int"/*delayx*/, "int"/*oslot*/, "int"/*nslot*/); SDT_PROBE_DEFINE1(vfs, syncer, worklist, vnode__remove, "struct vnode *"/*vp*/); SDT_PROBE_DEFINE3(vfs, syncer, worklist, mount__add, "struct mount *"/*mp*/, "int"/*vdelay*/, "int"/*slot*/); SDT_PROBE_DEFINE4(vfs, syncer, worklist, mount__update, "struct mount *"/*vp*/, "int"/*vdelay*/, "int"/*oslot*/, "int"/*nslot*/); SDT_PROBE_DEFINE1(vfs, syncer, worklist, mount__remove, "struct mount *"/*mp*/); SDT_PROBE_DEFINE1(vfs, syncer, sync, start, "int"/*starttime*/); SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__start, "struct mount *"/*mp*/); SDT_PROBE_DEFINE2(vfs, syncer, sync, mount__done, "struct mount *"/*mp*/, "int"/*error*/); SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__skip, "struct mount *"/*mp*/); SDT_PROBE_DEFINE1(vfs, syncer, sync, vnode__start, "struct vnode *"/*vp*/); SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__done, "struct vnode *"/*vp*/, "int"/*error*/); SDT_PROBE_DEFINE2(vfs, syncer, 
sync, vnode__fail__lock, "struct vnode *"/*vp*/, "int"/*error*/); SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__fail__vget, "struct vnode *"/*vp*/, "int"/*error*/); SDT_PROBE_DEFINE2(vfs, syncer, sync, done, "int"/*starttime*/, "int"/*endtime*/); const enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; const int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; /* * Insq/Remq for the vnode usage lists. */ #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) #define bufremvn(bp) { \ LIST_REMOVE(bp, b_vnbufs); \ (bp)->b_vnbufs.le_next = NOLIST; \ } int doforce = 1; /* 1 => permit forcible unmounting */ /* * Local declarations. */ static void vn_initialize_syncerd(void); /* * Initialize the vnode management data structures. */ void vntblinit(void) { vn_initialize_syncerd(); vfs_mount_sysinit(); vfs_vnode_sysinit(); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying vnode locked, which should prevent new dirty * buffers from being queued. */ int vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l, bool catch_p, int slptimeo) { struct buf *bp, *nbp; int error; int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO | (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0); /* XXXUBC this doesn't look at flags or slp* */ rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); error = VOP_PUTPAGES(vp, 0, 0, flushflags); if (error) { return error; } if (flags & V_SAVE) { error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0); if (error) return (error); KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd)); } mutex_enter(&bufcache_lock); restart: for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { KASSERT(bp->b_vp == vp); nbp = LIST_NEXT(bp, b_vnbufs); error = bbusy(bp, catch_p, slptimeo, NULL); if (error != 0) { if (error == EPASSTHROUGH) goto restart; mutex_exit(&bufcache_lock); return (error); } brelsel(bp, BC_INVAL | BC_VFLUSH); } for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { KASSERT(bp->b_vp == vp); nbp = LIST_NEXT(bp, b_vnbufs); error = bbusy(bp, catch_p, slptimeo, NULL); if (error != 0) { if (error == EPASSTHROUGH) goto restart; mutex_exit(&bufcache_lock); return (error); } /* * XXX Since there are no node locks for NFS, I believe * there is a slight chance that a delayed write will * occur while sleeping just above, so check for it. */ if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) { #ifdef DEBUG printf("buffer still DELWRI\n"); #endif bp->b_cflags |= BC_BUSY | BC_VFLUSH; mutex_exit(&bufcache_lock); VOP_BWRITE(bp->b_vp, bp); mutex_enter(&bufcache_lock); goto restart; } brelsel(bp, BC_INVAL | BC_VFLUSH); } #ifdef DIAGNOSTIC if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd)) panic("vinvalbuf: flush failed, vp %p", vp); #endif mutex_exit(&bufcache_lock); return (0); } /* * Destroy any in core blocks past the truncation length. * Called with the underlying vnode locked, which should prevent new dirty * buffers from being queued. 
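* Buffers whose logical block number is below lbn are left untouched.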
*/ int vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch_p, int slptimeo) { struct buf *bp, *nbp; int error; voff_t off; off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); if (error) { return error; } mutex_enter(&bufcache_lock); restart: for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { KASSERT(bp->b_vp == vp); nbp = LIST_NEXT(bp, b_vnbufs); if (bp->b_lblkno < lbn) continue; error = bbusy(bp, catch_p, slptimeo, NULL); if (error != 0) { if (error == EPASSTHROUGH) goto restart; mutex_exit(&bufcache_lock); return (error); } brelsel(bp, BC_INVAL | BC_VFLUSH); } for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { KASSERT(bp->b_vp == vp); nbp = LIST_NEXT(bp, b_vnbufs); if (bp->b_lblkno < lbn) continue; error = bbusy(bp, catch_p, slptimeo, NULL); if (error != 0) { if (error == EPASSTHROUGH) goto restart; mutex_exit(&bufcache_lock); return (error); } brelsel(bp, BC_INVAL | BC_VFLUSH); } mutex_exit(&bufcache_lock); return (0); } /* * Flush all dirty buffers from a vnode. * Called with the underlying vnode locked, which should prevent new dirty * buffers from being queued. */ int vflushbuf(struct vnode *vp, int flags) { struct buf *bp, *nbp; int error, pflags; bool dirty, sync; sync = (flags & FSYNC_WAIT) != 0; pflags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0) | ((flags & FSYNC_LAZY) ? PGO_LAZY : 0); rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); (void) VOP_PUTPAGES(vp, 0, 0, pflags); loop: mutex_enter(&bufcache_lock); for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { KASSERT(bp->b_vp == vp); nbp = LIST_NEXT(bp, b_vnbufs); if ((bp->b_cflags & BC_BUSY)) continue; if ((bp->b_oflags & BO_DELWRI) == 0) panic("vflushbuf: not dirty, bp %p", bp); bp->b_cflags |= BC_BUSY | BC_VFLUSH; mutex_exit(&bufcache_lock); /* * Wait for I/O associated with indirect blocks to complete, * since there is no way to quickly wait for them below. */ if (bp->b_vp == vp || !sync) (void) bawrite(bp); else { error = bwrite(bp); if (error) return error; } goto loop; } mutex_exit(&bufcache_lock); if (!sync) return 0; mutex_enter(vp->v_interlock); while (vp->v_numoutput != 0) cv_wait(&vp->v_cv, vp->v_interlock); dirty = !LIST_EMPTY(&vp->v_dirtyblkhd); mutex_exit(vp->v_interlock); if (dirty) { vprint("vflushbuf: dirty", vp); goto loop; } return 0; } /* * Create a vnode for a block device. * Used for root filesystem and swap areas. * Also used for memory file system special devices. */ int bdevvp(dev_t dev, vnode_t **vpp) { struct vattr va; vattr_null(&va); va.va_type = VBLK; va.va_rdev = dev; return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp); } /* * Create a vnode for a character device. * Used for kernfs and some console handling. */ int cdevvp(dev_t dev, vnode_t **vpp) { struct vattr va; vattr_null(&va); va.va_type = VCHR; va.va_rdev = dev; return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp); } /* * Associate a buffer with a vnode. There must already be a hold on * the vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { KASSERT(bp->b_vp == NULL); KASSERT(bp->b_objlock == &buffer_lock); KASSERT(mutex_owned(vp->v_interlock)); KASSERT(mutex_owned(&bufcache_lock)); KASSERT((bp->b_cflags & BC_BUSY) != 0); KASSERT(!cv_has_waiters(&bp->b_done)); vholdl(vp); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; /* * Insert onto list for new vnode. 
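* A newly attached buffer always starts on the clean list; reassignbuf() moves it to the dirty list once it is marked BO_DELWRI.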
*/ bufinsvn(bp, &vp->v_cleanblkhd); bp->b_objlock = vp->v_interlock; } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct vnode *vp = bp->b_vp; KASSERT(vp != NULL); KASSERT(bp->b_objlock == vp->v_interlock); KASSERT(mutex_owned(vp->v_interlock)); KASSERT(mutex_owned(&bufcache_lock)); KASSERT((bp->b_cflags & BC_BUSY) != 0); KASSERT(!cv_has_waiters(&bp->b_done)); /* * Delete from old vnode list, if on one. */ if (LIST_NEXT(bp, b_vnbufs) != NOLIST) bufremvn(bp); if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) == VI_ONWORKLST && LIST_FIRST(&vp->v_dirtyblkhd) == NULL) vn_syncer_remove_from_worklist(vp); bp->b_objlock = &buffer_lock; bp->b_vp = NULL; holdrelel(vp); } /* * Reassign a buffer from one vnode list to another. * The list reassignment must be within the same vnode. * Used to assign file specific control information * (indirect blocks) to the list to which they belong. */ void reassignbuf(struct buf *bp, struct vnode *vp) { struct buflists *listheadp; int delayx; KASSERT(mutex_owned(&bufcache_lock)); KASSERT(bp->b_objlock == vp->v_interlock); KASSERT(mutex_owned(vp->v_interlock)); KASSERT((bp->b_cflags & BC_BUSY) != 0); /* * Delete from old vnode list, if on one. */ if (LIST_NEXT(bp, b_vnbufs) != NOLIST) bufremvn(bp); /* * If dirty, put on list of dirty buffers; * otherwise insert onto list of clean buffers. */ if ((bp->b_oflags & BO_DELWRI) == 0) { listheadp = &vp->v_cleanblkhd; if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) == VI_ONWORKLST && LIST_FIRST(&vp->v_dirtyblkhd) == NULL) vn_syncer_remove_from_worklist(vp); } else { listheadp = &vp->v_dirtyblkhd; if ((vp->v_iflag & VI_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delayx = dirdelay; break; case VBLK: if (spec_node_getmountedfs(vp) != NULL) { delayx = metadelay; break; } /* fall through */ default: delayx = filedelay; break; } if (!vp->v_mount || (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) vn_syncer_add_to_worklist(vp, delayx); } } bufinsvn(bp, listheadp); } /* * Lookup a vnode by device number and return it referenced. */ int vfinddev(dev_t dev, enum vtype type, vnode_t **vpp) { return (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, vpp) == 0); } /* * Revoke all the vnodes corresponding to the specified minor number * range (endpoints inclusive) of the specified major. */ void vdevgone(int maj, int minl, int minh, enum vtype type) { vnode_t *vp; dev_t dev; int mn; for (mn = minl; mn <= minh; mn++) { dev = makedev(maj, mn); /* * Notify anyone trying to get at this device that it * has been detached, and then revoke it. */ switch (type) { case VBLK: bdev_detached(dev); break; case VCHR: cdev_detached(dev); break; default: panic("invalid specnode type: %d", type); } /* * Passing 0 as flags, instead of VDEAD_NOWAIT, means * spec_node_lookup_by_dev will wait for vnodes it * finds concurrently being revoked before returning. */ while (spec_node_lookup_by_dev(type, dev, 0, &vp) == 0) { VOP_REVOKE(vp, REVOKEALL); vrele(vp); } } } /* * The filesystem synchronizer mechanism - syncer. * * It is useful to delay writes of file data and filesystem metadata for * a certain amount of time so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To implement this, * vnodes are appended to a "workitem" queue. * * Most pending metadata should not wait for more than ten seconds. Thus, * mounted on block devices are delayed only about a half the time that file * data is delayed. 
Similarly, directory updates are more critical, so are * only delayed about a third the time that file data is delayed. * * There are SYNCER_MAXDELAY queues that are processed in a round-robin * manner at a rate of one each second (driven off the filesystem syncer * thread). The syncer_delayno variable indicates the next queue that is * to be processed. Items that need to be processed soon are placed in * this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of e.g. fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) % syncer_last] * * Flag VI_ONWORKLST indicates that the vnode has been added to the queue. */ #define SYNCER_MAXDELAY 32 typedef TAILQ_HEAD(synclist, vnode_impl) synclist_t; static void vn_syncer_add1(struct vnode *, int); static void sysctl_vfs_syncfs_setup(struct sysctllog **); /* * Defines and variables for the syncer process. */ int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ time_t syncdelay = 30; /* max time to delay syncing data */ time_t filedelay = 30; /* time to delay syncing files */ time_t dirdelay = 15; /* time to delay syncing directories */ time_t metadelay = 10; /* time to delay syncing metadata */ time_t lockdelay = 1; /* time to delay if locking fails */ static kmutex_t syncer_data_lock; /* short term lock on data structs */ static int syncer_delayno = 0; static long syncer_last; static synclist_t * syncer_workitem_pending; static void vn_initialize_syncerd(void) { int i; syncer_last = SYNCER_MAXDELAY + 2; sysctl_vfs_syncfs_setup(NULL); syncer_workitem_pending = kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP); for (i = 0; i < syncer_last; i++) TAILQ_INIT(&syncer_workitem_pending[i]); mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE); } /* * Return delay factor appropriate for the given file system. For * WAPBL we use the sync vnode to burst out metadata updates: sync * those file systems more frequently. */ static inline int sync_delay(struct mount *mp) { return mp->mnt_wapbl != NULL ? metadelay : syncdelay; } /* * Compute the next slot index from delay. */ static inline int sync_delay_slot(int delayx) { if (delayx > syncer_maxdelay - 2) delayx = syncer_maxdelay - 2; return (syncer_delayno + delayx) % syncer_last; } /* * Add an item to the syncer work queue. */ static void vn_syncer_add1(struct vnode *vp, int delayx) { synclist_t *slp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); KASSERT(mutex_owned(&syncer_data_lock)); if (vp->v_iflag & VI_ONWORKLST) { /* * Remove in order to adjust the position of the vnode. * Note: called from sched_sync(), which will not hold * interlock, therefore we cannot modify v_iflag here. */ slp = &syncer_workitem_pending[vip->vi_synclist_slot]; TAILQ_REMOVE(slp, vip, vi_synclist); } else { KASSERT(mutex_owned(vp->v_interlock)); vp->v_iflag |= VI_ONWORKLST; } vip->vi_synclist_slot = sync_delay_slot(delayx); slp = &syncer_workitem_pending[vip->vi_synclist_slot]; TAILQ_INSERT_TAIL(slp, vip, vi_synclist); } void vn_syncer_add_to_worklist(struct vnode *vp, int delayx) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); KASSERT(mutex_owned(vp->v_interlock)); mutex_enter(&syncer_data_lock); vn_syncer_add1(vp, delayx); SDT_PROBE3(vfs, syncer, worklist, vnode__add, vp, delayx, vip->vi_synclist_slot); mutex_exit(&syncer_data_lock); } /* * Remove an item from the syncer work queue.
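* Called with the vnode interlock held; clears VI_ONWORKLST if it is set.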
*/ void vn_syncer_remove_from_worklist(struct vnode *vp) { synclist_t *slp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); KASSERT(mutex_owned(vp->v_interlock)); if (vp->v_iflag & VI_ONWORKLST) { mutex_enter(&syncer_data_lock); SDT_PROBE1(vfs, syncer, worklist, vnode__remove, vp); vp->v_iflag &= ~VI_ONWORKLST; slp = &syncer_workitem_pending[vip->vi_synclist_slot]; TAILQ_REMOVE(slp, vip, vi_synclist); mutex_exit(&syncer_data_lock); } } /* * Add this mount point to the syncer. */ void vfs_syncer_add_to_worklist(struct mount *mp) { static int start, incr, next; int vdelay; KASSERT(mutex_owned(mp->mnt_updating)); KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) == 0); /* * We attempt to scatter the mount points on the list * so that they will go off at evenly distributed times * even if all the filesystems are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } mp->mnt_iflag |= IMNT_ONWORKLIST; vdelay = sync_delay(mp); mp->mnt_synclist_slot = vdelay > 0 ? next % vdelay : 0; SDT_PROBE3(vfs, syncer, worklist, mount__add, mp, vdelay, mp->mnt_synclist_slot); } /* * Remove the mount point from the syncer. */ void vfs_syncer_remove_from_worklist(struct mount *mp) { KASSERT(mutex_owned(mp->mnt_updating)); KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) != 0); SDT_PROBE1(vfs, syncer, worklist, mount__remove, mp); mp->mnt_iflag &= ~IMNT_ONWORKLIST; } /* * Try lazy sync, return true on success. */ static bool lazy_sync_vnode(struct vnode *vp) { bool synced; int error; KASSERT(mutex_owned(&syncer_data_lock)); synced = false; if ((error = vcache_tryvget(vp)) == 0) { mutex_exit(&syncer_data_lock); if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT)) == 0) { synced = true; SDT_PROBE1(vfs, syncer, sync, vnode__start, vp); error = VOP_FSYNC(vp, curlwp->l_cred, FSYNC_LAZY, 0, 0); SDT_PROBE2(vfs, syncer, sync, vnode__done, vp, error); vput(vp); } else { SDT_PROBE2(vfs, syncer, sync, vnode__fail__lock, vp, error); vrele(vp); } mutex_enter(&syncer_data_lock); } else { SDT_PROBE2(vfs, syncer, sync, vnode__fail__vget, vp, error); } return synced; } /* * System filesystem synchronizer daemon. */ void sched_sync(void *arg) { mount_iterator_t *iter; synclist_t *slp; struct vnode_impl *vi; struct vnode *vp; struct mount *mp; time_t starttime, endtime; int vdelay, oslot, nslot, delayx; bool synced; int error; for (;;) { starttime = time_second; SDT_PROBE1(vfs, syncer, sync, start, starttime); /* * Sync mounts whose dirty time has expired. */ mountlist_iterator_init(&iter); while ((mp = mountlist_iterator_trynext(iter)) != NULL) { if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0 || mp->mnt_synclist_slot != syncer_delayno) { SDT_PROBE1(vfs, syncer, sync, mount__skip, mp); continue; } vdelay = sync_delay(mp); oslot = mp->mnt_synclist_slot; nslot = sync_delay_slot(vdelay); mp->mnt_synclist_slot = nslot; SDT_PROBE4(vfs, syncer, worklist, mount__update, mp, vdelay, oslot, nslot); SDT_PROBE1(vfs, syncer, sync, mount__start, mp); error = VFS_SYNC(mp, MNT_LAZY, curlwp->l_cred); SDT_PROBE2(vfs, syncer, sync, mount__done, mp, error); } mountlist_iterator_destroy(iter); mutex_enter(&syncer_data_lock); /* * Push files whose dirty time has expired. 
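* The slot for the current second is drained below; vnodes that are still dirty afterwards are put back on the worklist at a later slot.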
*/ slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno >= syncer_last) syncer_delayno = 0; while ((vi = TAILQ_FIRST(slp)) != NULL) { vp = VIMPL_TO_VNODE(vi); synced = lazy_sync_vnode(vp); /* * XXX The vnode may have been recycled, in which * case it may have a new identity. */ vi = TAILQ_FIRST(slp); if (vi != NULL && VIMPL_TO_VNODE(vi) == vp) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. * * Try again sooner rather than later if * we were unable to lock the vnode. Lock * failure should not prevent us from doing * the sync "soon". * * If we locked it yet arrive here, it's * likely that lazy sync is in progress and * so the vnode still has dirty metadata. * syncdelay is mainly to get this vnode out * of the way so we do not consider it again * "soon" in this loop, so the delay time is * not critical as long as it is not "soon". * While write-back strategy is the file * system's domain, we expect write-back to * occur no later than syncdelay seconds * into the future. */ delayx = synced ? syncdelay : lockdelay; oslot = vi->vi_synclist_slot; vn_syncer_add1(vp, delayx); nslot = vi->vi_synclist_slot; SDT_PROBE4(vfs, syncer, worklist, vnode__update, vp, delayx, oslot, nslot); } } endtime = time_second; SDT_PROBE2(vfs, syncer, sync, done, starttime, endtime); /* * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (endtime == starttime) { kpause("syncer", false, hz, &syncer_data_lock); } mutex_exit(&syncer_data_lock); } } static void sysctl_vfs_syncfs_setup(struct sysctllog **clog) { const struct sysctlnode *rnode, *cnode; sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "sync", SYSCTL_DESCR("syncer options"), NULL, 0, NULL, 0, CTL_VFS, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_QUAD, "delay", SYSCTL_DESCR("max time to delay syncing data"), NULL, 0, &syncdelay, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_QUAD, "filedelay", SYSCTL_DESCR("time to delay syncing files"), NULL, 0, &filedelay, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_QUAD, "dirdelay", SYSCTL_DESCR("time to delay syncing directories"), NULL, 0, &dirdelay, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_QUAD, "metadelay", SYSCTL_DESCR("time to delay syncing metadata"), NULL, 0, &metadelay, 0, CTL_CREATE, CTL_EOL); } /* * sysctl helper routine to return list of supported fstypes */ int sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS) { char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)]; char *where = oldp; struct vfsops *v; size_t needed, left, slen; int error, first; if (newp != NULL) return (EPERM); if (namelen != 0) return (EINVAL); first = 1; error = 0; needed = 0; left = *oldlenp; sysctl_unlock(); mutex_enter(&vfs_list_lock); LIST_FOREACH(v, &vfs_list, vfs_list) { if (where == NULL) needed += strlen(v->vfs_name) + 1; else { memset(bf, 0, sizeof(bf)); if (first) { strncpy(bf, v->vfs_name, sizeof(bf)); first = 0; } else { bf[0] = ' '; strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1); } 
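/* strncpy() may not NUL-terminate the buffer, so force it. */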
bf[sizeof(bf)-1] = '\0'; slen = strlen(bf); if (left < slen + 1) break; v->vfs_refcount++; mutex_exit(&vfs_list_lock); /* +1 to copy out the trailing NUL byte */ error = copyout(bf, where, slen + 1); mutex_enter(&vfs_list_lock); v->vfs_refcount--; if (error) break; where += slen; needed += slen; left -= slen; } } mutex_exit(&vfs_list_lock); sysctl_relock(); *oldlenp = needed; return (error); } int kinfo_vdebug = 1; int kinfo_vgetfailed; #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. */ int sysctl_kern_vnode(SYSCTLFN_ARGS) { char *where = oldp; size_t *sizep = oldlenp; struct mount *mp; vnode_t *vp, vbuf; mount_iterator_t *iter; struct vnode_iterator *marker; char *bp = where; char *ewhere; int error; if (namelen != 0) return (EOPNOTSUPP); if (newp != NULL) return (EPERM); #define VPTRSZ sizeof(vnode_t *) #define VNODESZ sizeof(vnode_t) if (where == NULL) { *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); return (0); } ewhere = where + *sizep; sysctl_unlock(); mountlist_iterator_init(&iter); while ((mp = mountlist_iterator_next(iter)) != NULL) { vfs_vnode_iterator_init(mp, &marker); while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) { if (bp + VPTRSZ + VNODESZ > ewhere) { vrele(vp); vfs_vnode_iterator_destroy(marker); mountlist_iterator_destroy(iter); sysctl_relock(); *sizep = bp - where; return (ENOMEM); } memcpy(&vbuf, vp, VNODESZ); if ((error = copyout(&vp, bp, VPTRSZ)) || (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) { vrele(vp); vfs_vnode_iterator_destroy(marker); mountlist_iterator_destroy(iter); sysctl_relock(); return (error); } vrele(vp); bp += VPTRSZ + VNODESZ; } vfs_vnode_iterator_destroy(marker); } mountlist_iterator_destroy(iter); sysctl_relock(); *sizep = bp - where; return (0); } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { memset(vap, 0, sizeof(*vap)); vap->va_type = VNON; /* * Assign individually so that it is safe even if size and * sign of each member are varied. */ vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_size = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_atime.tv_sec = vap->va_mtime.tv_sec = vap->va_ctime.tv_sec = vap->va_birthtime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = vap->va_mtime.tv_nsec = vap->va_ctime.tv_nsec = vap->va_birthtime.tv_nsec = VNOVAL; vap->va_gen = VNOVAL; vap->va_flags = VNOVAL; vap->va_rdev = VNOVAL; vap->va_bytes = VNOVAL; } /* * Vnode state to string. */ const char * vstate_name(enum vnode_state state) { switch (state) { case VS_ACTIVE: return "ACTIVE"; case VS_MARKER: return "MARKER"; case VS_LOADING: return "LOADING"; case VS_LOADED: return "LOADED"; case VS_BLOCKED: return "BLOCKED"; case VS_RECLAIMING: return "RECLAIMING"; case VS_RECLAIMED: return "RECLAIMED"; default: return "ILLEGAL"; } } /* * Print a description of a vnode (common part). */ static void vprint_common(struct vnode *vp, const char *prefix, void (*pr)(const char *, ...) __printflike(1, 2)) { int n; char bf[96]; const uint8_t *cp; vnode_impl_t *vip; const char * const vnode_tags[] = { VNODE_TAGS }; const char * const vnode_types[] = { VNODE_TYPES }; const char vnode_flagbits[] = VNODE_FLAGBITS; #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) #define ARRAY_PRINT(idx, arr) \ ((unsigned int)(idx) < ARRAY_SIZE(arr) ? 
(arr)[(idx)] : "UNKNOWN") vip = VNODE_TO_VIMPL(vp); snprintb(bf, sizeof(bf), vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag); (*pr)("vnode %p flags %s\n", vp, bf); (*pr)("%stag %s(%d) type %s(%d) mount %p typedata %p\n", prefix, ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag, ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type, vp->v_mount, vp->v_mountedhere); (*pr)("%susecount %d writecount %d holdcount %d\n", prefix, vrefcnt(vp), vp->v_writecount, vp->v_holdcnt); (*pr)("%ssize %" PRIx64 " writesize %" PRIx64 " numoutput %d\n", prefix, vp->v_size, vp->v_writesize, vp->v_numoutput); (*pr)("%sdata %p lock %p\n", prefix, vp->v_data, &vip->vi_lock); (*pr)("%sstate %s key(%p %zd)", prefix, vstate_name(vip->vi_state), vip->vi_key.vk_mount, vip->vi_key.vk_key_len); n = vip->vi_key.vk_key_len; cp = vip->vi_key.vk_key; while (n-- > 0) (*pr)(" %02x", *cp++); (*pr)("\n"); (*pr)("%slrulisthd %p\n", prefix, vip->vi_lrulisthd); #undef ARRAY_PRINT #undef ARRAY_SIZE } /* * Print out a description of a vnode. */ void vprint(const char *label, struct vnode *vp) { if (label != NULL) printf("%s: ", label); vprint_common(vp, "\t", printf); if (vp->v_data != NULL) { printf("\t"); VOP_PRINT(vp); } } /* * Given a file system name, look up the vfsops for that * file system, or return NULL if file system isn't present * in the kernel. */ struct vfsops * vfs_getopsbyname(const char *name) { struct vfsops *v; mutex_enter(&vfs_list_lock); LIST_FOREACH(v, &vfs_list, vfs_list) { if (strcmp(v->vfs_name, name) == 0) break; } if (v != NULL) v->vfs_refcount++; mutex_exit(&vfs_list_lock); return (v); } void copy_statvfs_info(struct statvfs *sbp, const struct mount *mp) { const struct statvfs *mbp; if (sbp == (mbp = &mp->mnt_stat)) return; (void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx)); sbp->f_fsid = mbp->f_fsid; sbp->f_owner = mbp->f_owner; sbp->f_flag = mbp->f_flag; sbp->f_syncwrites = mbp->f_syncwrites; sbp->f_asyncwrites = mbp->f_asyncwrites; sbp->f_syncreads = mbp->f_syncreads; sbp->f_asyncreads = mbp->f_asyncreads; (void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare)); (void)memcpy(sbp->f_fstypename, mbp->f_fstypename, sizeof(sbp->f_fstypename)); (void)memcpy(sbp->f_mntonname, mbp->f_mntonname, sizeof(sbp->f_mntonname)); (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, sizeof(sbp->f_mntfromname)); (void)memcpy(sbp->f_mntfromlabel, mp->mnt_stat.f_mntfromlabel, sizeof(sbp->f_mntfromlabel)); sbp->f_namemax = mbp->f_namemax; } int set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom, const char *vfsname, struct mount *mp, struct lwp *l) { int error; size_t size; struct statvfs *sfs = &mp->mnt_stat; int (*fun)(const void *, void *, size_t, size_t *); (void)strlcpy(mp->mnt_stat.f_fstypename, vfsname, sizeof(mp->mnt_stat.f_fstypename)); if (onp) { struct cwdinfo *cwdi = l->l_proc->p_cwdi; fun = (ukon == UIO_SYSSPACE) ? 
copystr : copyinstr; if (cwdi->cwdi_rdir != NULL) { size_t len; char *bp; char *path = PNBUF_GET(); bp = path + MAXPATHLEN; *--bp = '\0'; rw_enter(&cwdi->cwdi_lock, RW_READER); error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path, MAXPATHLEN / 2, 0, l); rw_exit(&cwdi->cwdi_lock); if (error) { PNBUF_PUT(path); return error; } len = strlen(bp); if (len > sizeof(sfs->f_mntonname) - 1) len = sizeof(sfs->f_mntonname) - 1; (void)strncpy(sfs->f_mntonname, bp, len); PNBUF_PUT(path); if (len < sizeof(sfs->f_mntonname) - 1) { error = (*fun)(onp, &sfs->f_mntonname[len], sizeof(sfs->f_mntonname) - len - 1, &size); if (error) return error; size += len; } else { size = len; } } else { error = (*fun)(onp, &sfs->f_mntonname, sizeof(sfs->f_mntonname) - 1, &size); if (error) return error; } (void)memset(sfs->f_mntonname + size, 0, sizeof(sfs->f_mntonname) - size); } if (fromp) { fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr; error = (*fun)(fromp, sfs->f_mntfromname, sizeof(sfs->f_mntfromname) - 1, &size); if (error) return error; (void)memset(sfs->f_mntfromname + size, 0, sizeof(sfs->f_mntfromname) - size); } return 0; } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; int vfs_timestamp_precision __read_mostly = TSP_NSEC; void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (vfs_timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 
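* Those bits are therefore simply cleared before the caller's usual checks.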
*/ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } time_t rootfstime; /* recorded root fs time, if known */ void setrootfstime(time_t t) { rootfstime = t; } static const uint8_t vttodt_tab[ ] = { [VNON] = DT_UNKNOWN, [VREG] = DT_REG, [VDIR] = DT_DIR, [VBLK] = DT_BLK, [VCHR] = DT_CHR, [VLNK] = DT_LNK, [VSOCK] = DT_SOCK, [VFIFO] = DT_FIFO, [VBAD] = DT_UNKNOWN }; uint8_t vtype2dt(enum vtype vt) { CTASSERT(VBAD == __arraycount(vttodt_tab) - 1); return vttodt_tab[vt]; } int VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c) { int mpsafe = mp->mnt_iflag & IMNT_MPSAFE; int error; /* * Note: The first time through, the vfs_mount function may set * IMNT_MPSAFE, so we have to cache it on entry in order to * avoid leaking a kernel lock. * * XXX Maybe the MPSAFE bit should be set in struct vfsops and * not in struct mount. */ if (!mpsafe) { KERNEL_LOCK(1, NULL); } error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c); if (!mpsafe) { KERNEL_UNLOCK_ONE(NULL); } return error; } int VFS_START(struct mount *mp, int a) { int error; if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_LOCK(1, NULL); } error = (*(mp->mnt_op->vfs_start))(mp, a); if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_UNLOCK_ONE(NULL); } return error; } int VFS_UNMOUNT(struct mount *mp, int a) { int error; if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_LOCK(1, NULL); } error = (*(mp->mnt_op->vfs_unmount))(mp, a); if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_UNLOCK_ONE(NULL); } return error; } int VFS_ROOT(struct mount *mp, int lktype, struct vnode **a) { int error; if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_LOCK(1, NULL); } error = (*(mp->mnt_op->vfs_root))(mp, lktype, a); if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_UNLOCK_ONE(NULL); } return error; } int VFS_QUOTACTL(struct mount *mp, struct quotactl_args *args) { int error; if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_LOCK(1, NULL); } error = (*(mp->mnt_op->vfs_quotactl))(mp, args); if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_UNLOCK_ONE(NULL); } return error; } int VFS_STATVFS(struct mount *mp, struct statvfs *a) { int error; if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_LOCK(1, NULL); } error = (*(mp->mnt_op->vfs_statvfs))(mp, a); if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_UNLOCK_ONE(NULL); } return error; } int VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b) { int error; if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_LOCK(1, NULL); } error = (*(mp->mnt_op->vfs_sync))(mp, a, b); if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_UNLOCK_ONE(NULL); } return error; } int VFS_FHTOVP(struct mount *mp, struct fid *a, int b, struct vnode **c) { int error; if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_LOCK(1, NULL); } error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b, c); if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_UNLOCK_ONE(NULL); } return error; } int VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b) { int error; if ((vp->v_vflag & VV_MPSAFE) == 0) { KERNEL_LOCK(1, NULL); } error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b); if ((vp->v_vflag & VV_MPSAFE) == 0) { KERNEL_UNLOCK_ONE(NULL); } return error; } int VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b) { int error; if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_LOCK(1, NULL); } error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b); if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_UNLOCK_ONE(NULL); } return error; } int VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d) { int error; KERNEL_LOCK(1, NULL); /* XXXSMP check ffs */
error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d); KERNEL_UNLOCK_ONE(NULL); /* XXX */ return error; } int VFS_SUSPENDCTL(struct mount *mp, int a) { int error; if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_LOCK(1, NULL); } error = (*(mp->mnt_op->vfs_suspendctl))(mp, a); if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { KERNEL_UNLOCK_ONE(NULL); } return error; } #if defined(DDB) || defined(DEBUGPRINT) static const char buf_flagbits[] = BUF_FLAGBITS; void vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...)) { char bf[1024]; (*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%" PRIx64 " dev 0x%x\n", bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev); snprintb(bf, sizeof(bf), buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags); (*pr)(" error %d flags %s\n", bp->b_error, bf); (*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n", bp->b_bufsize, bp->b_bcount, bp->b_resid); (*pr)(" data %p saveaddr %p\n", bp->b_data, bp->b_saveaddr); (*pr)(" iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock); } void vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...)) { uvm_object_printit(&vp->v_uobj, full, pr); (*pr)("\n"); vprint_common(vp, "", pr); if (full) { struct buf *bp; (*pr)("clean bufs:\n"); LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) { (*pr)(" bp %p\n", bp); vfs_buf_print(bp, full, pr); } (*pr)("dirty bufs:\n"); LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { (*pr)(" bp %p\n", bp); vfs_buf_print(bp, full, pr); } } } void vfs_vnode_lock_print(void *vlock, int full, void (*pr)(const char *, ...)) { struct mount *mp; vnode_impl_t *vip; for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) { TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) { if (&vip->vi_lock == vlock || VIMPL_TO_VNODE(vip)->v_interlock == vlock) vfs_vnode_print(VIMPL_TO_VNODE(vip), full, pr); } } } void vfs_mount_print_all(int full, void (*pr)(const char *, ...)) { struct mount *mp; for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) vfs_mount_print(mp, full, pr); } void vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...)) { char sbuf[256]; (*pr)("vnodecovered = %p data = %p\n", mp->mnt_vnodecovered, mp->mnt_data); (*pr)("fs_bshift %d dev_bshift = %d\n", mp->mnt_fs_bshift, mp->mnt_dev_bshift); snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag); (*pr)("flag = %s\n", sbuf); snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag); (*pr)("iflag = %s\n", sbuf); (*pr)("refcnt = %d updating @ %p\n", mp->mnt_refcnt, mp->mnt_updating); (*pr)("statvfs cache:\n"); (*pr)("\tbsize = %lu\n", mp->mnt_stat.f_bsize); (*pr)("\tfrsize = %lu\n", mp->mnt_stat.f_frsize); (*pr)("\tiosize = %lu\n", mp->mnt_stat.f_iosize); (*pr)("\tblocks = %"PRIu64"\n", mp->mnt_stat.f_blocks); (*pr)("\tbfree = %"PRIu64"\n", mp->mnt_stat.f_bfree); (*pr)("\tbavail = %"PRIu64"\n", mp->mnt_stat.f_bavail); (*pr)("\tbresvd = %"PRIu64"\n", mp->mnt_stat.f_bresvd); (*pr)("\tfiles = %"PRIu64"\n", mp->mnt_stat.f_files); (*pr)("\tffree = %"PRIu64"\n", mp->mnt_stat.f_ffree); (*pr)("\tfavail = %"PRIu64"\n", mp->mnt_stat.f_favail); (*pr)("\tfresvd = %"PRIu64"\n", mp->mnt_stat.f_fresvd); (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n", mp->mnt_stat.f_fsidx.__fsid_val[0], mp->mnt_stat.f_fsidx.__fsid_val[1]); (*pr)("\towner = %"PRIu32"\n", mp->mnt_stat.f_owner); (*pr)("\tnamemax = %lu\n", mp->mnt_stat.f_namemax); snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag); (*pr)("\tflag = %s\n", sbuf); (*pr)("\tsyncwrites = %" PRIu64 "\n", 
mp->mnt_stat.f_syncwrites); (*pr)("\tasyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_asyncwrites); (*pr)("\tsyncreads = %" PRIu64 "\n", mp->mnt_stat.f_syncreads); (*pr)("\tasyncreads = %" PRIu64 "\n", mp->mnt_stat.f_asyncreads); (*pr)("\tfstypename = %s\n", mp->mnt_stat.f_fstypename); (*pr)("\tmntonname = %s\n", mp->mnt_stat.f_mntonname); (*pr)("\tmntfromname = %s\n", mp->mnt_stat.f_mntfromname); { int cnt = 0; vnode_t *vp; vnode_impl_t *vip; (*pr)("locked vnodes ="); TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) { vp = VIMPL_TO_VNODE(vip); if (VOP_ISLOCKED(vp)) { if ((++cnt % 6) == 0) { (*pr)(" %p,\n\t", vp); } else { (*pr)(" %p,", vp); } } } (*pr)("\n"); } if (full) { int cnt = 0; vnode_t *vp; vnode_impl_t *vip; (*pr)("all vnodes ="); TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) { vp = VIMPL_TO_VNODE(vip); if (!TAILQ_NEXT(vip, vi_mntvnodes)) { (*pr)(" %p", vp); } else if ((++cnt % 6) == 0) { (*pr)(" %p,\n\t", vp); } else { (*pr)(" %p,", vp); } } (*pr)("\n"); } } /* * List all of the locked vnodes in the system. */ void printlockedvnodes(void); void printlockedvnodes(void) { struct mount *mp; vnode_t *vp; vnode_impl_t *vip; printf("Locked vnodes\n"); for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) { TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) { vp = VIMPL_TO_VNODE(vip); if (VOP_ISLOCKED(vp)) vprint(NULL, vp); } } } #endif /* DDB || DEBUGPRINT */
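/*
 * A minimal, self-contained sketch of the flag-caching idiom used by the
 * VFS_MOUNT() wrapper above: the MPSAFE flag is read once on entry so the
 * lock/unlock pair stays balanced even if the callee changes the flag
 * mid-call.  struct fs_stub, take_biglock(), drop_biglock() and nop_op()
 * are hypothetical illustrations, not kernel interfaces.
 */
struct fs_stub {
	int mpsafe;			/* may be set by (*op)() itself */
};

static void take_biglock(void) { }
static void drop_biglock(void) { }

static int
call_fs_op(struct fs_stub *fs, int (*op)(struct fs_stub *))
{
	int mpsafe = fs->mpsafe;	/* cache: op() may flip fs->mpsafe */
	int error;

	if (!mpsafe)
		take_biglock();
	error = (*op)(fs);
	if (!mpsafe)			/* use the cached value, not fs->mpsafe */
		drop_biglock();
	return error;
}

static int
nop_op(struct fs_stub *fs)
{
	fs->mpsafe = 1;			/* the callee flips the flag mid-call */
	return 0;
}

int
main(void)
{
	struct fs_stub fs = { .mpsafe = 0 };

	return call_fs_op(&fs, nop_op);
}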
/* $NetBSD: uipc_sem.c,v 1.60 2020/12/14 23:12:12 chs Exp $ */ /*- * Copyright (c) 2011, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Mindaugas Rasiukevicius and Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2002 Alfred Perlstein <alfred@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Implementation of POSIX semaphore. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uipc_sem.c,v 1.60 2020/12/14 23:12:12 chs Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/atomic.h> #include <sys/proc.h> #include <sys/lwp.h> #include <sys/ksem.h> #include <sys/syscall.h> #include <sys/stat.h> #include <sys/kmem.h> #include <sys/fcntl.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/kauth.h> #include <sys/module.h> #include <sys/mount.h> #include <sys/mutex.h> #include <sys/rwlock.h> #include <sys/semaphore.h> #include <sys/syscall.h> #include <sys/syscallargs.h> #include <sys/syscallvar.h> #include <sys/sysctl.h> #include <sys/uidinfo.h> #include <sys/cprng.h> MODULE(MODULE_CLASS_MISC, ksem, NULL); #define SEM_MAX_NAMELEN NAME_MAX #define KS_UNLINKED 0x01 static kmutex_t ksem_lock __cacheline_aligned; static LIST_HEAD(,ksem) ksem_head __cacheline_aligned; static u_int nsems_total __cacheline_aligned; static u_int nsems __cacheline_aligned; static krwlock_t ksem_pshared_lock __cacheline_aligned; static LIST_HEAD(, ksem) *ksem_pshared_hashtab __cacheline_aligned; static u_long ksem_pshared_hashmask __read_mostly; #define KSEM_PSHARED_HASHSIZE 32 static kauth_listener_t ksem_listener; static int ksem_sysinit(void); static int ksem_sysfini(bool); static int ksem_modcmd(modcmd_t, void *); static void ksem_release(ksem_t *, int); static int ksem_close_fop(file_t *); static int ksem_stat_fop(file_t *, struct stat *); static int ksem_read_fop(file_t *, off_t *, struct uio *, kauth_cred_t, int); static const struct fileops semops = { .fo_name = "sem", .fo_read = ksem_read_fop, .fo_write = fbadop_write, .fo_ioctl = fbadop_ioctl, .fo_fcntl = fnullop_fcntl, .fo_poll = fnullop_poll, .fo_stat = ksem_stat_fop, .fo_close = ksem_close_fop, .fo_kqfilter = fnullop_kqfilter, .fo_restart = fnullop_restart, }; static const struct syscall_package ksem_syscalls[] = { { SYS__ksem_init, 0, (sy_call_t *)sys__ksem_init }, { SYS__ksem_open, 0, (sy_call_t *)sys__ksem_open }, { SYS__ksem_unlink, 0, (sy_call_t *)sys__ksem_unlink }, { SYS__ksem_close, 0, (sy_call_t *)sys__ksem_close }, { SYS__ksem_post, 0, (sy_call_t *)sys__ksem_post }, { SYS__ksem_wait, 0, (sy_call_t *)sys__ksem_wait }, { SYS__ksem_trywait, 0, (sy_call_t *)sys__ksem_trywait }, { SYS__ksem_getvalue, 0, (sy_call_t *)sys__ksem_getvalue }, { SYS__ksem_destroy, 0, (sy_call_t *)sys__ksem_destroy }, { SYS__ksem_timedwait, 0, (sy_call_t *)sys__ksem_timedwait }, { 0, 0, NULL }, }; struct sysctllog *ksem_clog; int ksem_max = KSEM_MAX; static int name_copyin(const char *uname, char **name) { *name = kmem_alloc(SEM_MAX_NAMELEN, KM_SLEEP); int error = copyinstr(uname, *name, SEM_MAX_NAMELEN, NULL); if (error) kmem_free(*name, SEM_MAX_NAMELEN); return error; } static void name_destroy(char **name) { if (!*name) return; kmem_free(*name, SEM_MAX_NAMELEN); *name = NULL; } static int ksem_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { ksem_t 
*ks; mode_t mode; if (action != KAUTH_SYSTEM_SEMAPHORE) return KAUTH_RESULT_DEFER; ks = arg1; mode = ks->ks_mode; if ((kauth_cred_geteuid(cred) == ks->ks_uid && (mode & S_IWUSR) != 0) || (kauth_cred_getegid(cred) == ks->ks_gid && (mode & S_IWGRP) != 0) || (mode & S_IWOTH) != 0) return KAUTH_RESULT_ALLOW; return KAUTH_RESULT_DEFER; } static int ksem_sysinit(void) { int error; const struct sysctlnode *rnode; mutex_init(&ksem_lock, MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&ksem_head); nsems_total = 0; nsems = 0; rw_init(&ksem_pshared_lock); ksem_pshared_hashtab = hashinit(KSEM_PSHARED_HASHSIZE, HASH_LIST, true, &ksem_pshared_hashmask); KASSERT(ksem_pshared_hashtab != NULL); ksem_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, ksem_listener_cb, NULL); /* Define module-specific sysctl tree */ ksem_clog = NULL; sysctl_createv(&ksem_clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "posix", SYSCTL_DESCR("POSIX options"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(&ksem_clog, 0, &rnode, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "semmax", SYSCTL_DESCR("Maximal number of semaphores"), NULL, 0, &ksem_max, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&ksem_clog, 0, &rnode, NULL, CTLFLAG_PERMANENT | CTLFLAG_READONLY, CTLTYPE_INT, "semcnt", SYSCTL_DESCR("Current number of semaphores"), NULL, 0, &nsems, 0, CTL_CREATE, CTL_EOL); error = syscall_establish(NULL, ksem_syscalls); if (error) { (void)ksem_sysfini(false); } return error; } static int ksem_sysfini(bool interface) { int error; if (interface) { error = syscall_disestablish(NULL, ksem_syscalls); if (error != 0) { return error; } /* * Make sure that no semaphores are in use. Note: semops * must be unused at this point. */ if (nsems_total) { error = syscall_establish(NULL, ksem_syscalls); KASSERT(error == 0); return EBUSY; } } kauth_unlisten_scope(ksem_listener); hashdone(ksem_pshared_hashtab, HASH_LIST, ksem_pshared_hashmask); rw_destroy(&ksem_pshared_lock); mutex_destroy(&ksem_lock); sysctl_teardown(&ksem_clog); return 0; } static int ksem_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return ksem_sysinit(); case MODULE_CMD_FINI: return ksem_sysfini(true); default: return ENOTTY; } } static ksem_t * ksem_lookup(const char *name) { ksem_t *ks; KASSERT(mutex_owned(&ksem_lock)); LIST_FOREACH(ks, &ksem_head, ks_entry) { if (strcmp(ks->ks_name, name) == 0) { mutex_enter(&ks->ks_lock); return ks; } } return NULL; } static int ksem_perm(lwp_t *l, ksem_t *ks) { kauth_cred_t uc = l->l_cred; KASSERT(mutex_owned(&ks->ks_lock)); if (kauth_authorize_system(uc, KAUTH_SYSTEM_SEMAPHORE, 0, ks, NULL, NULL) != 0) return EACCES; return 0; } /* * Bits 1..23 are random, just pluck a few of those and assume the * distribution is going to be pretty good. */ #define KSEM_PSHARED_HASH(id) (((id) >> 1) & ksem_pshared_hashmask) static void ksem_remove_pshared(ksem_t *ksem) { rw_enter(&ksem_pshared_lock, RW_WRITER); LIST_REMOVE(ksem, ks_entry); rw_exit(&ksem_pshared_lock); } static ksem_t * ksem_lookup_pshared_locked(intptr_t id) { u_long bucket = KSEM_PSHARED_HASH(id); ksem_t *ksem = NULL; /* ksem_t is locked and referenced upon return. */ LIST_FOREACH(ksem, &ksem_pshared_hashtab[bucket], ks_entry) { if (ksem->ks_pshared_id == id) { mutex_enter(&ksem->ks_lock); if (ksem->ks_pshared_proc == NULL) { /* * This entry is dead, and in the process * of being torn down; skip it. 
*/ mutex_exit(&ksem->ks_lock); continue; } ksem->ks_ref++; KASSERT(ksem->ks_ref != 0); return ksem; } } return NULL; } static ksem_t * ksem_lookup_pshared(intptr_t id) { rw_enter(&ksem_pshared_lock, RW_READER); ksem_t *ksem = ksem_lookup_pshared_locked(id); rw_exit(&ksem_pshared_lock); return ksem; } static void ksem_alloc_pshared_id(ksem_t *ksem) { ksem_t *ksem0; uint32_t try; KASSERT(ksem->ks_pshared_proc != NULL); rw_enter(&ksem_pshared_lock, RW_WRITER); for (;;) { try = (cprng_fast32() & ~KSEM_MARKER_MASK) | KSEM_PSHARED_MARKER; if ((ksem0 = ksem_lookup_pshared_locked(try)) == NULL) { /* Got it! */ break; } ksem_release(ksem0, -1); } ksem->ks_pshared_id = try; u_long bucket = KSEM_PSHARED_HASH(ksem->ks_pshared_id); LIST_INSERT_HEAD(&ksem_pshared_hashtab[bucket], ksem, ks_entry); rw_exit(&ksem_pshared_lock); } /* * ksem_get: get the semaphore from the descriptor. * * => locks the semaphore, if found, and holds an extra reference. * => holds a reference on the file descriptor. */ static int ksem_get(intptr_t id, ksem_t **ksret, int *fdp) { ksem_t *ks; int fd; if ((id & KSEM_MARKER_MASK) == KSEM_PSHARED_MARKER) { /* * ksem_lookup_pshared() returns the ksem_t * * locked and referenced. */ ks = ksem_lookup_pshared(id); if (ks == NULL) return EINVAL; KASSERT(ks->ks_pshared_id == id); KASSERT(ks->ks_pshared_proc != NULL); fd = -1; } else if (id <= INT_MAX) { fd = (int)id; file_t *fp = fd_getfile(fd); if (__predict_false(fp == NULL)) return EINVAL; if (__predict_false(fp->f_type != DTYPE_SEM)) { fd_putfile(fd); return EINVAL; } ks = fp->f_ksem; mutex_enter(&ks->ks_lock); ks->ks_ref++; } else { return EINVAL; } *ksret = ks; *fdp = fd; return 0; } /* * ksem_create: allocate and setup a new semaphore structure. */ static int ksem_create(lwp_t *l, const char *name, ksem_t **ksret, mode_t mode, u_int val) { ksem_t *ks; kauth_cred_t uc; char *kname; size_t len; /* Pre-check for the limit. */ if (nsems >= ksem_max) { return ENFILE; } if (val > SEM_VALUE_MAX) { return EINVAL; } if (name != NULL) { len = strlen(name); if (len > SEM_MAX_NAMELEN) { return ENAMETOOLONG; } /* Name must start with a '/' but not contain one. */ if (*name != '/' || len < 2 || strchr(name + 1, '/') != NULL) { return EINVAL; } kname = kmem_alloc(++len, KM_SLEEP); strlcpy(kname, name, len); } else { kname = NULL; len = 0; } ks = kmem_zalloc(sizeof(ksem_t), KM_SLEEP); mutex_init(&ks->ks_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&ks->ks_cv, "psem"); ks->ks_name = kname; ks->ks_namelen = len; ks->ks_mode = mode; ks->ks_value = val; ks->ks_ref = 1; uc = l->l_cred; ks->ks_uid = kauth_cred_geteuid(uc); ks->ks_gid = kauth_cred_getegid(uc); chgsemcnt(ks->ks_uid, 1); atomic_inc_uint(&nsems_total); *ksret = ks; return 0; } static void ksem_free(ksem_t *ks) { KASSERT(!cv_has_waiters(&ks->ks_cv)); chgsemcnt(ks->ks_uid, -1); atomic_dec_uint(&nsems_total); if (ks->ks_pshared_id) { KASSERT(ks->ks_pshared_proc == NULL); ksem_remove_pshared(ks); } if (ks->ks_name) { KASSERT(ks->ks_namelen > 0); kmem_free(ks->ks_name, ks->ks_namelen); } mutex_destroy(&ks->ks_lock); cv_destroy(&ks->ks_cv); kmem_free(ks, sizeof(ksem_t)); } #define KSEM_ID_IS_PSHARED(id) \ (((id) & KSEM_MARKER_MASK) == KSEM_PSHARED_MARKER) static void ksem_release(ksem_t *ksem, int fd) { bool destroy = false; KASSERT(mutex_owned(&ksem->ks_lock)); KASSERT(ksem->ks_ref > 0); if (--ksem->ks_ref == 0) { /* * Destroy if the last reference and semaphore is unnamed, * or unlinked (for named semaphore). 
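 * Anonymous semaphores (those with no name) are destroyed as soon as
 * their last reference is dropped; named semaphores persist until
 * sys__ksem_unlink() marks them KS_UNLINKED, after which the final
 * release frees them here.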
*/ destroy = (ksem->ks_flags & KS_UNLINKED) || (ksem->ks_name == NULL); } mutex_exit(&ksem->ks_lock); if (destroy) { ksem_free(ksem); } if (fd != -1) { fd_putfile(fd); } } int sys__ksem_init(struct lwp *l, const struct sys__ksem_init_args *uap, register_t *retval) { /* { unsigned int value; intptr_t *idp; } */ return do_ksem_init(l, SCARG(uap, value), SCARG(uap, idp), copyin, copyout); } int do_ksem_init(lwp_t *l, u_int val, intptr_t *idp, copyin_t docopyin, copyout_t docopyout) { proc_t *p = l->l_proc; ksem_t *ks; file_t *fp; intptr_t id, arg; int fd, error; /* * Newer versions of librt / libpthread pass us 'PSRD' in *idp to * indicate that a pshared semaphore is wanted. In that case we * allocate globally unique ID and return that, rather than the * process-scoped file descriptor ID. */ error = (*docopyin)(idp, &arg, sizeof(*idp)); if (error) { return error; } error = fd_allocfile(&fp, &fd); if (error) { return error; } fp->f_type = DTYPE_SEM; fp->f_flag = FREAD | FWRITE; fp->f_ops = &semops; if (fd >= KSEM_MARKER_MIN) { /* * This is super-unlikely, but we check for it anyway * because potential collisions with the pshared marker * would be bad. */ fd_abort(p, fp, fd); return EMFILE; } /* Note the mode does not matter for anonymous semaphores. */ error = ksem_create(l, NULL, &ks, 0, val); if (error) { fd_abort(p, fp, fd); return error; } if (arg == KSEM_PSHARED) { ks->ks_pshared_proc = curproc; ks->ks_pshared_fd = fd; ksem_alloc_pshared_id(ks); id = ks->ks_pshared_id; } else { id = (intptr_t)fd; } error = (*docopyout)(&id, idp, sizeof(*idp)); if (error) { ksem_free(ks); fd_abort(p, fp, fd); return error; } fp->f_ksem = ks; fd_affix(p, fp, fd); return error; } int sys__ksem_open(struct lwp *l, const struct sys__ksem_open_args *uap, register_t *retval) { /* { const char *name; int oflag; mode_t mode; unsigned int value; intptr_t *idp; } */ return do_ksem_open(l, SCARG(uap, name), SCARG(uap, oflag), SCARG(uap, mode), SCARG(uap, value), SCARG(uap, idp), copyout); } int do_ksem_open(struct lwp *l, const char *semname, int oflag, mode_t mode, unsigned int value, intptr_t *idp, copyout_t docopyout) { char *name; proc_t *p = l->l_proc; ksem_t *ksnew = NULL, *ks; file_t *fp; intptr_t id; int fd, error; error = name_copyin(semname, &name); if (error) { return error; } error = fd_allocfile(&fp, &fd); if (error) { name_destroy(&name); return error; } fp->f_type = DTYPE_SEM; fp->f_flag = FREAD | FWRITE; fp->f_ops = &semops; if (fd >= KSEM_MARKER_MIN) { /* * This is super-unlikely, but we check for it anyway * because potential collisions with the pshared marker * would be bad. */ fd_abort(p, fp, fd); return EMFILE; } /* * The ID (file descriptor number) can be stored early. * Note that zero is a special value for libpthread. */ id = (intptr_t)fd; error = (*docopyout)(&id, idp, sizeof(*idp)); if (error) { goto err; } if (oflag & O_CREAT) { /* Create a new semaphore. */ error = ksem_create(l, name, &ksnew, mode, value); if (error) { goto err; } KASSERT(ksnew != NULL); } /* Lookup for a semaphore with such name. */ mutex_enter(&ksem_lock); ks = ksem_lookup(name); name_destroy(&name); if (ks) { KASSERT(mutex_owned(&ks->ks_lock)); mutex_exit(&ksem_lock); /* Check for exclusive create. */ if (oflag & O_EXCL) { mutex_exit(&ks->ks_lock); error = EEXIST; goto err; } /* * Verify permissions. If we can access it, * add the reference of this thread. */ error = ksem_perm(l, ks); if (error == 0) { ks->ks_ref++; } mutex_exit(&ks->ks_lock); if (error) { goto err; } } else { /* Fail if not found and not creating. 
*/ if ((oflag & O_CREAT) == 0) { mutex_exit(&ksem_lock); KASSERT(ksnew == NULL); error = ENOENT; goto err; } /* Check for the limit locked. */ if (nsems >= ksem_max) { mutex_exit(&ksem_lock); error = ENFILE; goto err; } /* * Finally, insert semaphore into the list. * Note: it already has the initial reference. */ ks = ksnew; LIST_INSERT_HEAD(&ksem_head, ks, ks_entry); nsems++; mutex_exit(&ksem_lock); ksnew = NULL; } KASSERT(ks != NULL); fp->f_ksem = ks; fd_affix(p, fp, fd); err: name_destroy(&name); if (error) { fd_abort(p, fp, fd); } if (ksnew) { ksem_free(ksnew); } return error; } int sys__ksem_close(struct lwp *l, const struct sys__ksem_close_args *uap, register_t *retval) { /* { intptr_t id; } */ intptr_t id = SCARG(uap, id); int fd, error; ksem_t *ks; error = ksem_get(id, &ks, &fd); if (error) { return error; } /* This is only for named semaphores. */ if (ks->ks_name == NULL) { error = EINVAL; } ksem_release(ks, -1); if (error) { if (fd != -1) fd_putfile(fd); return error; } return fd_close(fd); } static int ksem_read_fop(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, int flags) { size_t len; char *name; ksem_t *ks = fp->f_ksem; mutex_enter(&ks->ks_lock); len = ks->ks_namelen; name = ks->ks_name; mutex_exit(&ks->ks_lock); if (name == NULL || len == 0) return 0; return uiomove(name, len, uio); } static int ksem_stat_fop(file_t *fp, struct stat *ub) { ksem_t *ks = fp->f_ksem; mutex_enter(&ks->ks_lock); memset(ub, 0, sizeof(*ub)); ub->st_mode = ks->ks_mode | ((ks->ks_name && ks->ks_namelen) ? _S_IFLNK : _S_IFREG); ub->st_uid = ks->ks_uid; ub->st_gid = ks->ks_gid; ub->st_size = ks->ks_value; ub->st_blocks = (ub->st_size) ? 1 : 0; ub->st_nlink = ks->ks_ref; ub->st_blksize = 4096; nanotime(&ub->st_atimespec); ub->st_mtimespec = ub->st_ctimespec = ub->st_birthtimespec = ub->st_atimespec; /* * Left as 0: st_dev, st_ino, st_rdev, st_flags, st_gen. * XXX (st_dev, st_ino) should be unique. */ mutex_exit(&ks->ks_lock); return 0; } static int ksem_close_fop(file_t *fp) { ksem_t *ks = fp->f_ksem; mutex_enter(&ks->ks_lock); if (ks->ks_pshared_id) { if (ks->ks_pshared_proc != curproc) { /* Do nothing if this is not the creator. */ mutex_exit(&ks->ks_lock); return 0; } /* Mark this semaphore as dead. */ ks->ks_pshared_proc = NULL; } ksem_release(ks, -1); return 0; } int sys__ksem_unlink(struct lwp *l, const struct sys__ksem_unlink_args *uap, register_t *retval) { /* { const char *name; } */ char *name; ksem_t *ks; u_int refcnt; int error; error = name_copyin(SCARG(uap, name), &name); if (error) return error; mutex_enter(&ksem_lock); ks = ksem_lookup(name); name_destroy(&name); if (ks == NULL) { mutex_exit(&ksem_lock); return ENOENT; } KASSERT(mutex_owned(&ks->ks_lock)); /* Verify permissions. */ error = ksem_perm(l, ks); if (error) { mutex_exit(&ks->ks_lock); mutex_exit(&ksem_lock); return error; } /* Remove from the global list. */ LIST_REMOVE(ks, ks_entry); nsems--; mutex_exit(&ksem_lock); refcnt = ks->ks_ref; if (refcnt) { /* Mark as unlinked, if there are references. 
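 * The final ksem_release() on the semaphore will then free it.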
*/ ks->ks_flags |= KS_UNLINKED; } mutex_exit(&ks->ks_lock); if (refcnt == 0) { ksem_free(ks); } return 0; } int sys__ksem_post(struct lwp *l, const struct sys__ksem_post_args *uap, register_t *retval) { /* { intptr_t id; } */ int fd, error; ksem_t *ks; error = ksem_get(SCARG(uap, id), &ks, &fd); if (error) { return error; } KASSERT(mutex_owned(&ks->ks_lock)); if (ks->ks_value == SEM_VALUE_MAX) { error = EOVERFLOW; goto out; } ks->ks_value++; if (ks->ks_waiters) { cv_broadcast(&ks->ks_cv); } out: ksem_release(ks, fd); return error; } int do_ksem_wait(lwp_t *l, intptr_t id, bool try_p, struct timespec *abstime) { int fd, error, timeo; ksem_t *ks; error = ksem_get(id, &ks, &fd); if (error) { return error; } KASSERT(mutex_owned(&ks->ks_lock)); while (ks->ks_value == 0) { ks->ks_waiters++; if (!try_p && abstime != NULL) { error = ts2timo(CLOCK_REALTIME, TIMER_ABSTIME, abstime, &timeo, NULL); if (error != 0) goto out; } else { timeo = 0; } error = try_p ? EAGAIN : cv_timedwait_sig(&ks->ks_cv, &ks->ks_lock, timeo); ks->ks_waiters--; if (error) goto out; } ks->ks_value--; out: ksem_release(ks, fd); return error; } int sys__ksem_wait(struct lwp *l, const struct sys__ksem_wait_args *uap, register_t *retval) { /* { intptr_t id; } */ return do_ksem_wait(l, SCARG(uap, id), false, NULL); } int sys__ksem_timedwait(struct lwp *l, const struct sys__ksem_timedwait_args *uap, register_t *retval) { /* { intptr_t id; const struct timespec *abstime; } */ struct timespec ts; int error; error = copyin(SCARG(uap, abstime), &ts, sizeof(ts)); if (error != 0) return error; if (ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) return EINVAL; error = do_ksem_wait(l, SCARG(uap, id), false, &ts); if (error == EWOULDBLOCK) error = ETIMEDOUT; return error; } int sys__ksem_trywait(struct lwp *l, const struct sys__ksem_trywait_args *uap, register_t *retval) { /* { intptr_t id; } */ return do_ksem_wait(l, SCARG(uap, id), true, NULL); } int sys__ksem_getvalue(struct lwp *l, const struct sys__ksem_getvalue_args *uap, register_t *retval) { /* { intptr_t id; unsigned int *value; } */ int fd, error; ksem_t *ks; unsigned int val; error = ksem_get(SCARG(uap, id), &ks, &fd); if (error) { return error; } KASSERT(mutex_owned(&ks->ks_lock)); val = ks->ks_value; ksem_release(ks, fd); return copyout(&val, SCARG(uap, value), sizeof(val)); } int sys__ksem_destroy(struct lwp *l, const struct sys__ksem_destroy_args *uap, register_t *retval) { /* { intptr_t id; } */ int fd, error; ksem_t *ks; intptr_t id = SCARG(uap, id); error = ksem_get(id, &ks, &fd); if (error) { return error; } KASSERT(mutex_owned(&ks->ks_lock)); /* Operation is only for unnamed semaphores. */ if (ks->ks_name != NULL) { error = EINVAL; goto out; } /* Cannot destroy if there are waiters. */ if (ks->ks_waiters) { error = EBUSY; goto out; } if (KSEM_ID_IS_PSHARED(id)) { /* Cannot destroy if we did't create it. */ KASSERT(fd == -1); KASSERT(ks->ks_pshared_proc != NULL); if (ks->ks_pshared_proc != curproc) { error = EINVAL; goto out; } fd = ks->ks_pshared_fd; /* Mark it dead so subsequent lookups fail. */ ks->ks_pshared_proc = NULL; /* Do an fd_getfile() to for the benefit of fd_close(). */ file_t *fp __diagused = fd_getfile(fd); KASSERT(fp != NULL); KASSERT(fp->f_ksem == ks); } out: ksem_release(ks, -1); if (error) { if (!KSEM_ID_IS_PSHARED(id)) fd_putfile(fd); return error; } return fd_close(fd); }
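/*
 * Illustrative userland sketch (not part of this file): the _ksem_*
 * syscalls above are the kernel side of the POSIX sem_*(3) interface
 * provided by librt/libpthread.  A minimal, hypothetical consumer of a
 * named semaphore might look like the following; the name "/example",
 * the function name, and the error handling are placeholders only.
 */
#if 0	/* example only, never compiled */
#include <semaphore.h>
#include <fcntl.h>

static int
example_consumer(void)
{
	/* sem_open() with O_CREAT reaches sys__ksem_open() above. */
	sem_t *sem = sem_open("/example", O_CREAT | O_EXCL, 0600, 1);
	if (sem == SEM_FAILED)
		return -1;
	sem_wait(sem);			/* sys__ksem_wait() */
	/* ... critical section ... */
	sem_post(sem);			/* sys__ksem_post() */
	sem_close(sem);			/* sys__ksem_close() */
	sem_unlink("/example");		/* sys__ksem_unlink() */
	return 0;
}
#endif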
/* $NetBSD: ufs_extattr.c,v 1.55 2024/02/10 18:43:53 andvar Exp $ */ /*- * Copyright (c) 1999-2002 Robert N. M. Watson * Copyright (c) 2002-2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project.
* * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Support for file system extended attributes on the UFS1 file system. * * Extended attributes are defined in the form name=value, where name is * a nul-terminated string in the style of a file name, and value is a * binary blob of zero or more bytes. The UFS1 extended attribute service * layers support for extended attributes onto a backing file, in the style * of the quota implementation, meaning that it requires no underlying format * changes to the file system. This design choice exchanges simplicity, * usability, and easy deployment for performance. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ufs_extattr.c,v 1.55 2024/02/10 18:43:53 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_ffs.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/reboot.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/namei.h> #include <sys/kmem.h> #include <sys/fcntl.h> #include <sys/lwp.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/lock.h> #include <sys/dirent.h> #include <sys/extattr.h> #include <sys/sysctl.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/extattr.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufs_bswap.h> #include <ufs/ufs/ufs_extern.h> int ufs_extattr_sync = 1; int ufs_extattr_autocreate = 1024; static int ufs_extattr_valid_attrname(int attrnamespace, const char *attrname); static int ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, int attrnamespace, const char *attrname, struct lwp *l); static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct vnode *backing_vnode, struct lwp *l); static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct lwp *l); static int ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, size_t *size, kauth_cred_t cred, struct lwp *l); static int ufs_extattr_list(struct vnode *vp, int attrnamespace, struct uio *uio, size_t *size, int flag, kauth_cred_t cred, struct lwp *l); static int ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, kauth_cred_t cred, struct lwp *l); static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, kauth_cred_t cred, struct lwp *l); static struct ufs_extattr_list_entry *ufs_extattr_find_attr(struct ufsmount *, int, const char *); static int ufs_extattr_get_header(struct vnode *, struct ufs_extattr_list_entry *, struct ufs_extattr_header *, off_t *); /* * Per-FS attribute lock protecting attribute operations. * XXX Right now there is a lot of lock contention due to having a single * lock per-FS; really, this should be far more fine-grained. */ static void ufs_extattr_uepm_lock(struct ufsmount *ump) { /* * XXX This needs to be recursive for the following reasons: * - it is taken in ufs_extattr_vnode_inactive * - which is called from VOP_INACTIVE * - which can be triggered by any vrele, vput, or vn_close * - several of these can happen while it's held */ if (mutex_owned(&ump->um_extattr.uepm_lock)) { ump->um_extattr.uepm_lockcnt++; return; } mutex_enter(&ump->um_extattr.uepm_lock); } static void ufs_extattr_uepm_unlock(struct ufsmount *ump) { if (ump->um_extattr.uepm_lockcnt != 0) { KASSERT(mutex_owned(&ump->um_extattr.uepm_lock)); ump->um_extattr.uepm_lockcnt--; return; } mutex_exit(&ump->um_extattr.uepm_lock); } /*- * Determine whether the name passed is a valid name for an actual * attribute. 
* * Invalid currently consists of: * NULL pointer for attrname * zero-length attrname (used to retrieve application attribute list) */ static int ufs_extattr_valid_attrname(int attrnamespace, const char *attrname) { if (attrname == NULL) return 0; if (strlen(attrname) == 0) return 0; return 1; } /* * Autocreate an attribute storage */ static int ufs_extattr_autocreate_attr(struct vnode *vp, int attrnamespace, const char *attrname, struct lwp *l, struct ufs_extattr_list_entry **uelep) { struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct vnode *backing_vp; struct pathbuf *pb; char *path; struct ufs_extattr_fileheader uef; struct ufs_extattr_list_entry *uele; int error; path = PNBUF_GET(); /* * We only support system and user namespace autocreation */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: (void)snprintf(path, PATH_MAX, "%s/%s/%s/%s", mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, UFS_EXTATTR_SUBDIR_SYSTEM, attrname); break; case EXTATTR_NAMESPACE_USER: (void)snprintf(path, PATH_MAX, "%s/%s/%s/%s", mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, UFS_EXTATTR_SUBDIR_USER, attrname); break; default: PNBUF_PUT(path); *uelep = NULL; return EINVAL; break; } /* * Release extended attribute mount lock, otherwise * we can deadlock with another thread that would lock * vp after we unlock it below, and call * ufs_extattr_uepm_lock(ump), for instance * in ufs_getextattr(). */ ufs_extattr_uepm_unlock(ump); /* * XXX unlock/lock should only be done when setting extattr * on backing store or one of its parent directory * including root, but we always do it for now. */ KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); VOP_UNLOCK(vp); pb = pathbuf_create(path); /* * Since we do not hold ufs_extattr_uepm_lock anymore, * another thread may race with us for backend creation, * but only one can succeed here thanks to O_EXCL. * * backing_vp is the backing store. */ error = vn_open(NULL, pb, 0, O_CREAT|O_EXCL|O_RDWR, 0600, &backing_vp, NULL, NULL); /* * Reacquire the lock on the vnode */ KASSERT(VOP_ISLOCKED(vp) == 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); ufs_extattr_uepm_lock(ump); if (error != 0) { pathbuf_destroy(pb); PNBUF_PUT(path); *uelep = NULL; return error; } KASSERT(backing_vp != NULL); KASSERT(VOP_ISLOCKED(backing_vp) == LK_EXCLUSIVE); pathbuf_destroy(pb); PNBUF_PUT(path); uef.uef_magic = UFS_EXTATTR_MAGIC; uef.uef_version = UFS_EXTATTR_VERSION; uef.uef_size = ufs_extattr_autocreate; error = vn_rdwr(UIO_WRITE, backing_vp, &uef, sizeof(uef), 0, UIO_SYSSPACE, IO_NODELOCKED|IO_APPEND, l->l_cred, NULL, l); VOP_UNLOCK(backing_vp); if (error != 0) { printf("%s: write uef header failed for `%s' (%d)\n", __func__, attrname, error); vn_close(backing_vp, FREAD|FWRITE, l->l_cred); *uelep = NULL; return error; } /* * Now enable attribute. */ error = ufs_extattr_enable(ump,attrnamespace, attrname, backing_vp, l); KASSERT(VOP_ISLOCKED(backing_vp) == 0); if (error != 0) { printf("%s: enable `%s' failed (%d)\n", __func__, attrname, error); vn_close(backing_vp, FREAD|FWRITE, l->l_cred); *uelep = NULL; return error; } uele = ufs_extattr_find_attr(ump, attrnamespace, attrname); if (uele == NULL) { printf("%s: attribute `%s' created but not found!\n", __func__, attrname); vn_close(backing_vp, FREAD|FWRITE, l->l_cred); *uelep = NULL; return ESRCH; /* really internal error */ } printf("%s: EA backing store autocreated for %s\n", mp->mnt_stat.f_mntonname, attrname); *uelep = uele; return 0; } /* * Locate an attribute given a name and mountpoint. 
* Must be holding uepm lock for the mount point. */ static struct ufs_extattr_list_entry * ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace, const char *attrname) { struct ufs_extattr_list_entry *search_attribute; for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list); search_attribute != NULL; search_attribute = LIST_NEXT(search_attribute, uele_entries)) { if (!(strncmp(attrname, search_attribute->uele_attrname, UFS_EXTATTR_MAXEXTATTRNAME)) && (attrnamespace == search_attribute->uele_attrnamespace)) { return search_attribute; } } return 0; } /* * Initialize per-FS structures supporting extended attributes. Do not * start extended attributes yet. */ void ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm) { uepm->uepm_flags = 0; uepm->uepm_lockcnt = 0; LIST_INIT(&uepm->uepm_list); mutex_init(&uepm->uepm_lock, MUTEX_DEFAULT, IPL_NONE); uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED; } /* * Destroy per-FS structures supporting extended attributes. Assumes * that EAs have already been stopped, and will panic if not. */ void ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm) { if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) panic("ufs_extattr_uepm_destroy: not initialized"); if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED)) panic("ufs_extattr_uepm_destroy: called while still started"); /* * It's not clear that either order for the next three lines is * ideal, and it should never be a problem if this is only called * during unmount, and with vfs_busy(). */ uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED; uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED; mutex_destroy(&uepm->uepm_lock); } /* * Start extended attribute support on an FS. */ int ufs_extattr_start(struct mount *mp, struct lwp *l) { struct ufsmount *ump; int error = 0; ump = VFSTOUFS(mp); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) ufs_extattr_uepm_init(&ump->um_extattr); ufs_extattr_uepm_lock(ump); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) { error = EOPNOTSUPP; goto unlock; } if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) { error = EBUSY; goto unlock; } ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED; ump->um_extattr.uepm_ucred = l->l_cred; kauth_cred_hold(ump->um_extattr.uepm_ucred); unlock: ufs_extattr_uepm_unlock(ump); return error; } /* * Helper routine: given a locked parent directory and filename, return * the locked vnode of the inode associated with the name. Will not * follow symlinks, may return any type of vnode. Lock on parent will * be released even in the event of a failure. In the event that the * target is the parent (i.e., "."), there will be two references and * one lock, requiring the caller to possibly special-case. 
*/ static int ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, const char *dirname, struct vnode **vp, struct lwp *l) { struct vop_lookup_v2_args vargs; struct componentname cnp; struct vnode *target_vp; char *pnbuf; int error; KASSERT(VOP_ISLOCKED(start_dvp) == LK_EXCLUSIVE); pnbuf = PNBUF_GET(); memset(&cnp, 0, sizeof(cnp)); cnp.cn_nameiop = LOOKUP; cnp.cn_flags = ISLASTCN | lockparent; cnp.cn_cred = l->l_cred; cnp.cn_nameptr = pnbuf; error = copystr(dirname, pnbuf, MAXPATHLEN, &cnp.cn_namelen); if (error) { if (lockparent == 0) { VOP_UNLOCK(start_dvp); } PNBUF_PUT(pnbuf); printf("%s: copystr failed (%d)\n", __func__, error); return error; } cnp.cn_namelen--; /* trim nul termination */ vargs.a_desc = NULL; vargs.a_dvp = start_dvp; vargs.a_vpp = &target_vp; vargs.a_cnp = &cnp; error = ufs_lookup(&vargs); PNBUF_PUT(pnbuf); if (error) { if (lockparent == 0) { VOP_UNLOCK(start_dvp); } return error; } #if 0 if (target_vp == start_dvp) panic("%s: target_vp == start_dvp", __func__); #endif if (target_vp != start_dvp) { error = vn_lock(target_vp, LK_EXCLUSIVE); if (lockparent == 0) VOP_UNLOCK(start_dvp); if (error) { vrele(target_vp); return error; } } KASSERT(VOP_ISLOCKED(target_vp) == LK_EXCLUSIVE); *vp = target_vp; return 0; } /* * Enable an EA using the passed filesystem, backing vnode, attribute name, * namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp * to be locked when passed in. The vnode will be returned unlocked, * regardless of success/failure of the function. As a result, the caller * will always need to vrele(), but not vput(). */ static int ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, int attrnamespace, const char *attrname, struct lwp *l) { int error; error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred); if (error) { printf("%s: VOP_OPEN(): failed (%d)\n", __func__, error); VOP_UNLOCK(vp); return error; } mutex_enter(vp->v_interlock); vp->v_writecount++; mutex_exit(vp->v_interlock); vref(vp); VOP_UNLOCK(vp); error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, l); if (error != 0) vn_close(vp, FREAD|FWRITE, l->l_cred); return error; } /* * Given a locked directory vnode, iterate over the names in the directory * and use ufs_extattr_lookup() to retrieve locked vnodes of potential * attribute files. Then invoke ufs_extattr_enable_with_open() on each * to attempt to start the attribute. Leaves the directory locked on * exit. */ static int ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp, int attrnamespace, struct lwp *l) { struct vop_readdir_args vargs; struct statvfs *sbp = &ump->um_mountp->mnt_stat; struct dirent *dp, *edp; struct vnode *attr_vp; struct uio auio; struct iovec aiov; char *dirbuf; int error, eofflag = 0; if (dvp->v_type != VDIR) return ENOTDIR; dirbuf = kmem_alloc(UFS_DIRBLKSIZ, KM_SLEEP); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_offset = 0; UIO_SETUP_SYSSPACE(&auio); vargs.a_desc = NULL; vargs.a_vp = dvp; vargs.a_uio = &auio; vargs.a_cred = l->l_cred; vargs.a_eofflag = &eofflag; vargs.a_ncookies = NULL; vargs.a_cookies = NULL; while (!eofflag) { auio.uio_resid = UFS_DIRBLKSIZ; aiov.iov_base = dirbuf; aiov.iov_len = UFS_DIRBLKSIZ; error = ufs_readdir(&vargs); if (error) { printf("%s: ufs_readdir (%d)\n", __func__, error); return error; } /* * XXXRW: While in UFS, we always get UFS_DIRBLKSIZ returns from * the directory code on success, on other file systems this * may not be the case. 
For portability, we should check the * read length on return from ufs_readdir(). */ edp = (struct dirent *)&dirbuf[UFS_DIRBLKSIZ]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { if (dp->d_reclen == 0) break; /* Skip "." and ".." */ if (dp->d_name[0] == '.' && (dp->d_name[1] == '\0' || (dp->d_name[1] == '.' && dp->d_name[2] == '\0'))) goto next; error = ufs_extattr_lookup(dvp, LOCKPARENT, dp->d_name, &attr_vp, l); if (error == ENOENT) { goto next; /* keep silent */ } else if (error) { printf("%s: lookup `%s' (%d)\n", __func__, dp->d_name, error); } else if (attr_vp == dvp) { vrele(attr_vp); } else if (attr_vp->v_type != VREG) { vput(attr_vp); } else { error = ufs_extattr_enable_with_open(ump, attr_vp, attrnamespace, dp->d_name, l); vrele(attr_vp); if (error) { printf("%s: enable `%s' (%d)\n", __func__, dp->d_name, error); } else if (bootverbose) { printf("%s: EA %s loaded\n", sbp->f_mntonname, dp->d_name); } } next: dp = (struct dirent *) ((char *)dp + dp->d_reclen); if (dp >= edp) break; } } kmem_free(dirbuf, UFS_DIRBLKSIZ); return 0; } static int ufs_extattr_subdir(struct lwp *l, struct mount *mp, struct vnode *attr_dvp, const char *subdir, int namespace) { int error; struct vnode *attr_sub; error = ufs_extattr_lookup(attr_dvp, LOCKPARENT, subdir, &attr_sub, l); KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE); if (error) { printf("%s: Can't find `%s/%s/%s' (%d)\n", __func__, mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, subdir, error); return error; } KASSERT(VOP_ISLOCKED(attr_sub) == LK_EXCLUSIVE); error = ufs_extattr_iterate_directory(VFSTOUFS(mp), attr_sub, namespace, l); if (error) { printf("%s: ufs_extattr_iterate_directory `%s/%s/%s' (%d)\n", __func__, mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, subdir, error); } KASSERT(VOP_ISLOCKED(attr_sub) == LK_EXCLUSIVE); vput(attr_sub); return error; } /* * Auto-start of extended attributes, to be executed (optionally) at * mount-time. */ int ufs_extattr_autostart(struct mount *mp, struct lwp *l) { struct vnode *rvp, *attr_dvp; int error; /* * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root? * If so, automatically start EA's. */ error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp); if (error) { printf("%s: VFS_ROOT() (%d)\n", __func__, error); return error; } KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE); error = ufs_extattr_lookup(rvp, 0, UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, l); if (error) { /* rvp ref'd but now unlocked */ KASSERT(VOP_ISLOCKED(rvp) == 0); vrele(rvp); printf("%s: lookup `%s/%s' (%d)\n", __func__, mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, error); return error; } if (rvp == attr_dvp) { /* Should never happen. */ KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE); vrele(attr_dvp); vput(rvp); printf("%s: `/' == `%s/%s' (%d)\n", __func__, mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, EINVAL); return EINVAL; } KASSERT(VOP_ISLOCKED(rvp) == 0); vrele(rvp); KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE); if (attr_dvp->v_type != VDIR) { printf("%s: `%s/%s' is not a directory\n", __func__, mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR); goto return_vput_attr_dvp; } error = ufs_extattr_start(mp, l); if (error) { printf("%s: ufs_extattr_start failed (%d)\n", __func__, error); goto return_vput_attr_dvp; } /* * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM, * UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory, * and start with appropriate type. Failures in either don't * result in an over-all failure. attr_dvp is left locked to * be cleaned up on exit. 
*/ error = ufs_extattr_subdir(l, mp, attr_dvp, UFS_EXTATTR_SUBDIR_SYSTEM, EXTATTR_NAMESPACE_SYSTEM); error = ufs_extattr_subdir(l, mp, attr_dvp, UFS_EXTATTR_SUBDIR_USER, EXTATTR_NAMESPACE_USER); /* Mask startup failures in sub-directories. */ error = 0; return_vput_attr_dvp: KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE); vput(attr_dvp); return error; } /* * Stop extended attribute support on an FS. */ void ufs_extattr_stop(struct mount *mp, struct lwp *l) { struct ufs_extattr_list_entry *uele; struct ufsmount *ump = VFSTOUFS(mp); ufs_extattr_uepm_lock(ump); /* * If we haven't been started, no big deal. Just short-circuit * the processing work. */ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { goto unlock; } while (LIST_FIRST(&ump->um_extattr.uepm_list) != NULL) { uele = LIST_FIRST(&ump->um_extattr.uepm_list); ufs_extattr_disable(ump, uele->uele_attrnamespace, uele->uele_attrname, l); } ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED; kauth_cred_free(ump->um_extattr.uepm_ucred); ump->um_extattr.uepm_ucred = NULL; unlock: ufs_extattr_uepm_unlock(ump); } /* * Enable a named attribute on the specified filesystem; provide an * unlocked backing vnode to hold the attribute data. */ static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct vnode *backing_vnode, struct lwp *l) { struct ufs_extattr_list_entry *attribute; struct iovec aiov; struct uio auio; int error = 0; if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) return EINVAL; if (backing_vnode->v_type != VREG) return EINVAL; attribute = kmem_zalloc(sizeof(*attribute), KM_SLEEP); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { error = EOPNOTSUPP; goto free_exit; } if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) { error = EEXIST; goto free_exit; } strncpy(attribute->uele_attrname, attrname, UFS_EXTATTR_MAXEXTATTRNAME); attribute->uele_attrnamespace = attrnamespace; memset(&attribute->uele_fileheader, 0, sizeof(struct ufs_extattr_fileheader)); attribute->uele_backing_vnode = backing_vnode; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *) &attribute->uele_fileheader; aiov.iov_len = sizeof(struct ufs_extattr_fileheader); auio.uio_resid = sizeof(struct ufs_extattr_fileheader); auio.uio_offset = (off_t) 0; auio.uio_rw = UIO_READ; UIO_SETUP_SYSSPACE(&auio); vn_lock(backing_vnode, LK_SHARED | LK_RETRY); error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto unlock_free_exit; if (auio.uio_resid != 0) { printf("%s: malformed attribute header\n", __func__); error = EINVAL; goto unlock_free_exit; } /* * Try to determine the byte order of the attribute file. 
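 * If the raw magic value read from the backing file does not match
 * UFS_EXTATTR_MAGIC in host byte order, the file is assumed to have been
 * written on an opposite-endian system: UELE_F_NEEDSWAP is recorded and
 * ufs_rw32() is used to swap the header fields; if the magic still does
 * not match after swapping, the header is rejected as invalid.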
*/ if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) { attribute->uele_flags |= UELE_F_NEEDSWAP; attribute->uele_fileheader.uef_magic = ufs_rw32(attribute->uele_fileheader.uef_magic, UELE_NEEDSWAP(attribute)); if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) { printf("%s: invalid attribute header magic\n", __func__); error = EINVAL; goto unlock_free_exit; } } attribute->uele_fileheader.uef_version = ufs_rw32(attribute->uele_fileheader.uef_version, UELE_NEEDSWAP(attribute)); attribute->uele_fileheader.uef_size = ufs_rw32(attribute->uele_fileheader.uef_size, UELE_NEEDSWAP(attribute)); if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) { printf("%s: incorrect attribute header version %d != %d\n", __func__, attribute->uele_fileheader.uef_version, UFS_EXTATTR_VERSION); error = EINVAL; goto unlock_free_exit; } LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute, uele_entries); VOP_UNLOCK(backing_vnode); return 0; unlock_free_exit: VOP_UNLOCK(backing_vnode); free_exit: kmem_free(attribute, sizeof(*attribute)); return error; } /* * Disable extended attribute support on an FS. */ static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct lwp *l) { struct ufs_extattr_list_entry *uele; int error = 0; if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) return EINVAL; uele = ufs_extattr_find_attr(ump, attrnamespace, attrname); if (!uele) return ENODATA; LIST_REMOVE(uele, uele_entries); error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE, l->l_cred); kmem_free(uele, sizeof(*uele)); return error; } /* * VFS call to manage extended attributes in UFS. If filename_vp is * non-NULL, it must be passed in locked, and regardless of errors in * processing, will be unlocked. */ int ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { struct lwp *l = curlwp; struct ufsmount *ump = VFSTOUFS(mp); int error; /* * Only privileged processes can configure extended attributes. */ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_EXTATTR, 0, mp, NULL, NULL); if (error) { if (filename_vp != NULL) VOP_UNLOCK(filename_vp); return error; } switch(cmd) { case UFS_EXTATTR_CMD_START: case UFS_EXTATTR_CMD_STOP: case UFS_EXTATTR_CMD_ENABLE: case UFS_EXTATTR_CMD_DISABLE: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp); return EINVAL; } if (attrname != NULL) return EINVAL; break; default: return EINVAL; } switch(cmd) { case UFS_EXTATTR_CMD_START: error = ufs_extattr_autostart(mp, l); return error; case UFS_EXTATTR_CMD_STOP: ufs_extattr_stop(mp, l); return 0; case UFS_EXTATTR_CMD_ENABLE: /* * ufs_extattr_enable_with_open() will always unlock the * vnode, regardless of failure. */ ufs_extattr_uepm_lock(ump); error = ufs_extattr_enable_with_open(ump, filename_vp, attrnamespace, attrname, l); ufs_extattr_uepm_unlock(ump); return error; case UFS_EXTATTR_CMD_DISABLE: ufs_extattr_uepm_lock(ump); error = ufs_extattr_disable(ump, attrnamespace, attrname, l); ufs_extattr_uepm_unlock(ump); return error; default: return EINVAL; } } /* * Read extended attribute header for a given vnode and attribute. * Backing vnode should be locked and unlocked by caller. 
*/ static int ufs_extattr_get_header(struct vnode *vp, struct ufs_extattr_list_entry *uele, struct ufs_extattr_header *ueh, off_t *bap) { struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; struct iovec aiov; struct uio aio; int error; /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + uele->uele_fileheader.uef_size); /* * Read in the data header to see if the data is defined, and if so * how much. */ memset(ueh, 0, sizeof(struct ufs_extattr_header)); aiov.iov_base = ueh; aiov.iov_len = sizeof(struct ufs_extattr_header); aio.uio_iov = &aiov; aio.uio_iovcnt = 1; aio.uio_rw = UIO_READ; aio.uio_offset = base_offset; aio.uio_resid = sizeof(struct ufs_extattr_header); UIO_SETUP_SYSSPACE(&aio); error = VOP_READ(uele->uele_backing_vnode, &aio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) return error; /* * Attribute headers are kept in file system byte order. * XXX What about the blob of data? */ ueh->ueh_flags = ufs_rw32(ueh->ueh_flags, UELE_NEEDSWAP(uele)); ueh->ueh_len = ufs_rw32(ueh->ueh_len, UELE_NEEDSWAP(uele)); ueh->ueh_i_gen = ufs_rw32(ueh->ueh_i_gen, UELE_NEEDSWAP(uele)); /* Defined? */ if ((ueh->ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) return ENODATA; /* Valid for the current inode generation? */ if (ueh->ueh_i_gen != ip->i_gen) { /* * The inode itself has a different generation number * than the uele data. For now, the best solution * is to coerce this to undefined, and let it get cleaned * up by the next write or extattrctl clean. */ printf("%s: %s: inode gen inconsistency (%u, %jd)\n", __func__, mp->mnt_stat.f_mntonname, ueh->ueh_i_gen, (intmax_t)ip->i_gen); return ENODATA; } /* Local size consistency check. */ if (ueh->ueh_len > uele->uele_fileheader.uef_size) return ENXIO; /* Return base offset */ if (bap != NULL) *bap = base_offset; return 0; } /* * Vnode operation to retrieve a named extended attribute. */ int ufs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN kauth_cred_t a_cred; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return EOPNOTSUPP; ufs_extattr_uepm_lock(ump); error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size, ap->a_cred, curlwp); ufs_extattr_uepm_unlock(ump); return error; } /* * Real work associated with retrieving a named attribute--assumes that * the attribute lock has already been grabbed. */ static int ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, size_t *size, kauth_cred_t cred, struct lwp *l) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); off_t base_offset; size_t len, old_len; int error = 0; if (strlen(name) == 0) return EINVAL; error = extattr_check_cred(vp, attrnamespace, cred, VREAD); if (error) return error; attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return ENODATA; /* * Allow only offsets of zero to encourage the read/replace * extended attribute semantic. 
Otherwise we can't guarantee * atomicity, as we don't provide locks for extended attributes. */ if (uio != NULL && uio->uio_offset != 0) return ENXIO; /* * Don't need to get a lock on the backing file if the getattr is * being applied to the backing file, as the lock is already held. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY); error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset); if (error) goto vopunlock_exit; /* Return full data size if caller requested it. */ if (size != NULL) *size = ueh.ueh_len; /* Return data if the caller requested it. */ if (uio != NULL) { /* Allow for offset into the attribute data. */ uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); /* * Figure out maximum to transfer -- use buffer size and * local data limit. */ len = MIN(uio->uio_resid, ueh.ueh_len); old_len = uio->uio_resid; uio->uio_resid = len; error = VOP_READ(attribute->uele_backing_vnode, uio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; uio->uio_resid = old_len - (len - uio->uio_resid); } vopunlock_exit: if (uio != NULL) uio->uio_offset = 0; if (attribute->uele_backing_vnode != vp) VOP_UNLOCK(attribute->uele_backing_vnode); return error; } /* * Vnode operation to list extended attribute for a vnode */ int ufs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN int flag; IN kauth_cred_t a_cred; struct proc *a_p; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return EOPNOTSUPP; ufs_extattr_uepm_lock(ump); error = ufs_extattr_list(ap->a_vp, ap->a_attrnamespace, ap->a_uio, ap->a_size, ap->a_flag, ap->a_cred, curlwp); ufs_extattr_uepm_unlock(ump); return error; } /* * Real work associated with retrieving list of attributes--assumes that * the attribute lock has already been grabbed. */ static int ufs_extattr_list(struct vnode *vp, int attrnamespace, struct uio *uio, size_t *size, int flag, kauth_cred_t cred, struct lwp *l) { struct ufs_extattr_list_entry *uele; struct ufs_extattr_header ueh; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); size_t listsize = 0; int error = 0; /* * XXX: We can move this inside the loop and iterate on individual * attributes. */ error = extattr_check_cred(vp, attrnamespace, cred, VREAD); if (error) return error; LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) { unsigned char attrnamelen; if (uele->uele_attrnamespace != attrnamespace) continue; error = ufs_extattr_get_header(vp, uele, &ueh, NULL); if (error == ENODATA) continue; if (error != 0) return error; /* * Don't need to get a lock on the backing file if * the listattr is being applied to the backing file, * as the lock is already held. */ if (uele->uele_backing_vnode != vp) vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY); /* * +1 for trailing NUL (listxattr flavor) * or leading name length (extattr_list_file flavor) */ attrnamelen = strlen(uele->uele_attrname); listsize += attrnamelen + 1; /* Return data if the caller requested it. */ if (uio != NULL) { /* * We support two flavors. Either NUL-terminated * strings (a la listxattr), or non NUL-terminated, * one byte length prefixed strings (for * extattr_list_file). EXTATTR_LIST_LENPREFIX switches * that second behavior. 
*/ if (flag & EXTATTR_LIST_LENPREFIX) { uint8_t len = (uint8_t)attrnamelen; /* Copy leading name length */ error = uiomove(&len, sizeof(len), uio); if (error != 0) break; } else { /* Include trailing NULL */ attrnamelen++; } error = uiomove(uele->uele_attrname, (size_t)attrnamelen, uio); if (error != 0) break; } if (uele->uele_backing_vnode != vp) VOP_UNLOCK(uele->uele_backing_vnode); if (error != 0) return error; } if (uio != NULL) uio->uio_offset = 0; /* Return full data size if caller requested it. */ if (size != NULL) *size = listsize; return 0; } /* * Vnode operation to remove a named attribute. */ int ufs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN kauth_cred_t a_cred; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return EOPNOTSUPP; ufs_extattr_uepm_lock(ump); error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_cred, curlwp); ufs_extattr_uepm_unlock(ump); return error; } /* * Vnode operation to set a named attribute. */ int ufs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN kauth_cred_t a_cred; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return EOPNOTSUPP; ufs_extattr_uepm_lock(ump); /* * XXX: No longer a supported way to delete extended attributes. */ if (ap->a_uio == NULL) { ufs_extattr_uepm_unlock(ump); return EINVAL; } error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_cred, curlwp); ufs_extattr_uepm_unlock(ump); return error; } /* * Real work associated with setting a vnode's extended attributes; * assumes that the attribute lock has already been grabbed. */ static int ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, kauth_cred_t cred, struct lwp *l) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; int error = 0, ioflag; if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; if (!ufs_extattr_valid_attrname(attrnamespace, name)) return EINVAL; error = extattr_check_cred(vp, attrnamespace, cred, VWRITE); if (error) return error; attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) { error = ufs_extattr_autocreate_attr(vp, attrnamespace, name, l, &attribute); if (error == EEXIST) { /* Another thread raced us for backend creation */ error = 0; attribute = ufs_extattr_find_attr(ump, attrnamespace, name); } if (error || !attribute) return ENODATA; } /* * Early rejection of invalid offsets/length. * Reject: any offset but 0 (replace) * Any size greater than attribute size limit */ if (uio->uio_offset != 0 || uio->uio_resid > attribute->uele_fileheader.uef_size) return ENXIO; /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Write out a data header for the data. 
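 * The header records the attribute length, the INUSE flag, and the
 * inode's current generation number, so that a record left behind by a
 * since-recycled inode can later be detected (see the generation check
 * in ufs_extattr_get_header()) and treated as undefined.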
*/ ueh.ueh_len = ufs_rw32((uint32_t) uio->uio_resid, UELE_NEEDSWAP(attribute)); ueh.ueh_flags = ufs_rw32(UFS_EXTATTR_ATTR_FLAG_INUSE, UELE_NEEDSWAP(attribute)); ueh.ueh_i_gen = ufs_rw32(ip->i_gen, UELE_NEEDSWAP(attribute)); local_aiov.iov_base = &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_WRITE; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); UIO_SETUP_SYSSPACE(&local_aio); /* * Don't need to get a lock on the backing file if the setattr is * being applied to the backing file, as the lock is already held. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; if (local_aio.uio_resid != 0) { error = ENXIO; goto vopunlock_exit; } /* * Write out user data. * XXX NOT ATOMIC WITH RESPECT TO THE HEADER. */ uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag, ump->um_extattr.uepm_ucred); vopunlock_exit: uio->uio_offset = 0; if (attribute->uele_backing_vnode != vp) VOP_UNLOCK(attribute->uele_backing_vnode); return error; } /* * Real work associated with removing an extended attribute from a vnode. * Assumes the attribute lock has already been grabbed. */ static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, kauth_cred_t cred, struct lwp *l) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct iovec local_aiov; struct uio local_aio; off_t base_offset; int error = 0, ioflag; if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; if (!ufs_extattr_valid_attrname(attrnamespace, name)) return EINVAL; error = extattr_check_cred(vp, attrnamespace, cred, VWRITE); if (error) return error; attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return ENODATA; /* * Don't need to get a lock on the backing file if the getattr is * being applied to the backing file, as the lock is already held. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset); if (error) goto vopunlock_exit; /* Flag it as not in use. */ ueh.ueh_flags = 0; /* No need to byte swap 0 */ ueh.ueh_len = 0; /* ...ditto... */ local_aiov.iov_base = &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_WRITE; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); UIO_SETUP_SYSSPACE(&local_aio); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; if (local_aio.uio_resid != 0) error = ENXIO; vopunlock_exit: VOP_UNLOCK(attribute->uele_backing_vnode); return error; } /* * Called by UFS when an inode is no longer active and should have its * attributes stripped. 
*/ void ufs_extattr_vnode_inactive(struct vnode *vp, struct lwp *l) { struct ufs_extattr_list_entry *uele; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); /* * If extended attributes are not yet initialized, we cannot take the * per-mount lock. There should be no active vnodes on the fs while it * is not yet initialized (even if it is about to be), so these checks * can safely go unlocked. */ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) return; if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return; ufs_extattr_uepm_lock(ump); LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) ufs_extattr_rm(vp, uele->uele_attrnamespace, uele->uele_attrname, lwp0.l_cred, l); ufs_extattr_uepm_unlock(ump); } void ufs_extattr_init(void) { } void ufs_extattr_done(void) { }
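/*
 * Illustrative sketch, not part of ufs_extattr.c: the routines above keep
 * each attribute in its own backing file, laid out as one
 * struct ufs_extattr_fileheader followed by fixed-size per-inode slots,
 * where each slot is a struct ufs_extattr_header plus uef_size bytes of
 * data, so a slot lives at
 *     sizeof(fileheader) + i_number * (sizeof(header) + uef_size).
 * The standalone userland sketch below only walks that arithmetic; the
 * struct definitions are simplified stand-ins for illustration (not the
 * real ones from ufs/ufs/extattr.h) and byte-order handling (ufs_rw32)
 * is omitted.
 */
#include <sys/types.h>
#include <stdint.h>
#include <stdio.h>

struct ea_fileheader {		/* stand-in for struct ufs_extattr_fileheader */
	uint32_t uef_magic;
	uint32_t uef_version;
	uint32_t uef_size;	/* maximum data bytes per attribute slot */
};

struct ea_header {		/* stand-in for struct ufs_extattr_header */
	uint32_t ueh_flags;	/* UFS_EXTATTR_ATTR_FLAG_INUSE when valid */
	uint32_t ueh_len;	/* actual data length for this inode */
	uint32_t ueh_i_gen;	/* inode generation, guards stale slots */
};

/* Byte offset of the slot header for inode "ino", as computed above. */
static off_t
ea_slot_offset(const struct ea_fileheader *fh, uint64_t ino)
{
	return (off_t)sizeof(*fh) +
	    (off_t)ino * (sizeof(struct ea_header) + fh->uef_size);
}

int
main(void)
{
	struct ea_fileheader fh = { .uef_size = 1024 };
	off_t hdr = ea_slot_offset(&fh, 42);

	printf("inode 42: slot header at %lld, data at %lld\n",
	    (long long)hdr, (long long)(hdr + sizeof(struct ea_header)));
	return 0;
}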
/* $NetBSD: md.c,v 1.87 2023/01/13 15:46:40 hannken Exp $ */ /* * Copyright (c) 1995 Gordon W. Ross, Leo Weppelman. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This implements a general-purpose memory-disk. * See md.h for notes on the config types. * * Note that this driver provides the same functionality * as the MFS filesystem hack, but this is better because * you can use this for any filesystem type you'd like! * * Credit for most of the kmem ramdisk code goes to: * Leo Weppelman (atari) and Phil Nelson (pc532) * Credit for the ideas behind the "user space memory" code goes * to the authors of the MFS implementation. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: md.c,v 1.87 2023/01/13 15:46:40 hannken Exp $"); #ifdef _KERNEL_OPT #include "opt_md.h" #else #define MEMORY_DISK_SERVER 1 #endif #include <sys/param.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/device.h> #include <sys/disk.h> #include <sys/stat.h> #include <sys/proc.h> #include <sys/conf.h> #include <sys/disklabel.h> #include <uvm/uvm_extern.h> #include <dev/md.h> #include "ioconf.h" /* * The user-space functionality is included by default. * Use `options MEMORY_DISK_SERVER=0' to turn it off. */ #ifndef MEMORY_DISK_SERVER #error MEMORY_DISK_SERVER should be defined by opt_md.h #endif /* MEMORY_DISK_SERVER */ /* * We should use the raw partition for ioctl. */ #define MD_UNIT(unit) DISKUNIT(unit) /* autoconfig stuff... */ struct md_softc { device_t sc_dev; /* Self. */ struct disk sc_dkdev; /* hook for generic disk handling */ struct md_conf sc_md; kmutex_t sc_lock; /* Protect self. */ kcondvar_t sc_cv; /* Wait here for work. */ struct bufq_state *sc_buflist; }; /* shorthand for fields in sc_md: */ #define sc_addr sc_md.md_addr #define sc_size sc_md.md_size #define sc_type sc_md.md_type static void md_attach(device_t, device_t, void *); static int md_detach(device_t, int); static dev_type_open(mdopen); static dev_type_close(mdclose); static dev_type_read(mdread); static dev_type_write(mdwrite); static dev_type_ioctl(mdioctl); static dev_type_strategy(mdstrategy); static dev_type_size(mdsize); const struct bdevsw md_bdevsw = { .d_open = mdopen, .d_close = mdclose, .d_strategy = mdstrategy, .d_ioctl = mdioctl, .d_dump = nodump, .d_psize = mdsize, .d_discard = nodiscard, .d_flag = D_DISK | D_MPSAFE }; const struct cdevsw md_cdevsw = { .d_open = mdopen, .d_close = mdclose, .d_read = mdread, .d_write = mdwrite, .d_ioctl = mdioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_DISK | D_MPSAFE }; static const struct dkdriver mddkdriver = { .d_strategy = mdstrategy, .d_minphys = minphys }; CFATTACH_DECL3_NEW(md, sizeof(struct md_softc), 0, md_attach, md_detach, NULL, NULL, NULL, DVF_DETACH_SHUTDOWN); static kmutex_t md_device_lock; /* Protect unit creation / deletion. 
*/ extern size_t md_root_size; static void md_set_disklabel(struct md_softc *); /* * This is called if we are configured as a pseudo-device */ void mdattach(int n) { mutex_init(&md_device_lock, MUTEX_DEFAULT, IPL_NONE); if (config_cfattach_attach(md_cd.cd_name, &md_ca)) { aprint_error("%s: cfattach_attach failed\n", md_cd.cd_name); return; } } static void md_attach(device_t parent, device_t self, void *aux) { struct md_softc *sc = device_private(self); sc->sc_dev = self; sc->sc_type = MD_UNCONFIGURED; mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&sc->sc_cv, "mdidle"); bufq_alloc(&sc->sc_buflist, "fcfs", 0); /* XXX - Could accept aux info here to set the config. */ #ifdef MEMORY_DISK_HOOKS /* * This external function might setup a pre-loaded disk. * All it would need to do is setup the md_conf struct. * See sys/dev/md_root.c for an example. */ md_attach_hook(device_unit(self), &sc->sc_md); #endif /* * Initialize and attach the disk structure. */ disk_init(&sc->sc_dkdev, device_xname(self), &mddkdriver); disk_attach(&sc->sc_dkdev); if (sc->sc_type != MD_UNCONFIGURED) md_set_disklabel(sc); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); } static int md_detach(device_t self, int flags) { struct md_softc *sc = device_private(self); int rc; rc = 0; mutex_enter(&sc->sc_dkdev.dk_openlock); if (sc->sc_dkdev.dk_openmask == 0 && sc->sc_type == MD_UNCONFIGURED) ; /* nothing to do */ else if ((flags & DETACH_FORCE) == 0) rc = EBUSY; mutex_exit(&sc->sc_dkdev.dk_openlock); if (rc != 0) return rc; pmf_device_deregister(self); disk_detach(&sc->sc_dkdev); disk_destroy(&sc->sc_dkdev); bufq_free(sc->sc_buflist); mutex_destroy(&sc->sc_lock); cv_destroy(&sc->sc_cv); return 0; } /* * operational routines: * open, close, read, write, strategy, * ioctl, dump, size */ #if MEMORY_DISK_SERVER static int md_server_loop(struct md_softc *sc); static int md_ioctl_server(struct md_softc *sc, struct md_conf *umd, struct lwp *l); #endif /* MEMORY_DISK_SERVER */ static int md_ioctl_kalloc(struct md_softc *sc, struct md_conf *umd, struct lwp *l); static int mdsize(dev_t dev) { struct md_softc *sc; int res; sc = device_lookup_private(&md_cd, MD_UNIT(dev)); if (sc == NULL) return 0; mutex_enter(&sc->sc_lock); if (sc->sc_type == MD_UNCONFIGURED) res = 0; else res = sc->sc_size >> DEV_BSHIFT; mutex_exit(&sc->sc_lock); return res; } static int mdopen(dev_t dev, int flag, int fmt, struct lwp *l) { int unit; int part = DISKPART(dev); int pmask = 1 << part; cfdata_t cf; struct md_softc *sc; struct disk *dk; #ifdef MEMORY_DISK_HOOKS bool configured; #endif mutex_enter(&md_device_lock); unit = MD_UNIT(dev); sc = device_lookup_private(&md_cd, unit); if (sc == NULL) { if (part != RAW_PART) { mutex_exit(&md_device_lock); return ENXIO; } cf = kmem_zalloc(sizeof(*cf), KM_SLEEP); cf->cf_name = md_cd.cd_name; cf->cf_atname = md_cd.cd_name; cf->cf_unit = unit; cf->cf_fstate = FSTATE_STAR; sc = device_private(config_attach_pseudo(cf)); if (sc == NULL) { mutex_exit(&md_device_lock); return ENOMEM; } } dk = &sc->sc_dkdev; /* * The raw partition is used for ioctl to configure. */ if (part == RAW_PART) goto ok; #ifdef MEMORY_DISK_HOOKS /* Call the open hook to allow loading the device. */ configured = (sc->sc_type != MD_UNCONFIGURED); md_open_hook(unit, &sc->sc_md); /* initialize disklabel if the device is configured in open hook */ if (!configured && sc->sc_type != MD_UNCONFIGURED) md_set_disklabel(sc); #endif /* * This is a normal, "slave" device, so * enforce initialized. 
*/ if (sc->sc_type == MD_UNCONFIGURED) { mutex_exit(&md_device_lock); return ENXIO; } ok: /* XXX duplicates code in dk_open(). Call dk_open(), instead? */ mutex_enter(&dk->dk_openlock); /* Mark our unit as open. */ switch (fmt) { case S_IFCHR: dk->dk_copenmask |= pmask; break; case S_IFBLK: dk->dk_bopenmask |= pmask; break; } dk->dk_openmask = dk->dk_copenmask | dk->dk_bopenmask; mutex_exit(&dk->dk_openlock); mutex_exit(&md_device_lock); return 0; } static int mdclose(dev_t dev, int flag, int fmt, struct lwp *l) { int part = DISKPART(dev); int pmask = 1 << part; int error; cfdata_t cf; struct md_softc *sc; struct disk *dk; sc = device_lookup_private(&md_cd, MD_UNIT(dev)); if (sc == NULL) return ENXIO; dk = &sc->sc_dkdev; mutex_enter(&dk->dk_openlock); switch (fmt) { case S_IFCHR: dk->dk_copenmask &= ~pmask; break; case S_IFBLK: dk->dk_bopenmask &= ~pmask; break; } dk->dk_openmask = dk->dk_copenmask | dk->dk_bopenmask; if (dk->dk_openmask != 0) { mutex_exit(&dk->dk_openlock); return 0; } mutex_exit(&dk->dk_openlock); mutex_enter(&md_device_lock); cf = device_cfdata(sc->sc_dev); error = config_detach(sc->sc_dev, DETACH_QUIET); if (! error) kmem_free(cf, sizeof(*cf)); mutex_exit(&md_device_lock); return error; } static int mdread(dev_t dev, struct uio *uio, int flags) { struct md_softc *sc; sc = device_lookup_private(&md_cd, MD_UNIT(dev)); if (sc == NULL || sc->sc_type == MD_UNCONFIGURED) return ENXIO; return (physio(mdstrategy, NULL, dev, B_READ, minphys, uio)); } static int mdwrite(dev_t dev, struct uio *uio, int flags) { struct md_softc *sc; sc = device_lookup_private(&md_cd, MD_UNIT(dev)); if (sc == NULL || sc->sc_type == MD_UNCONFIGURED) return ENXIO; return (physio(mdstrategy, NULL, dev, B_WRITE, minphys, uio)); } /* * Handle I/O requests, either directly, or * by passing them to the server process. */ static void mdstrategy(struct buf *bp) { struct md_softc *sc; void * addr; size_t off, xfer; bool is_read; sc = device_lookup_private(&md_cd, MD_UNIT(bp->b_dev)); if (sc == NULL || sc->sc_type == MD_UNCONFIGURED) { bp->b_error = ENXIO; goto done; } mutex_enter(&sc->sc_lock); switch (sc->sc_type) { #if MEMORY_DISK_SERVER case MD_UMEM_SERVER: /* Just add this job to the server's queue. */ bufq_put(sc->sc_buflist, bp); cv_signal(&sc->sc_cv); mutex_exit(&sc->sc_lock); /* see md_server_loop() */ /* no biodone in this case */ return; #endif /* MEMORY_DISK_SERVER */ case MD_KMEM_FIXED: case MD_KMEM_ALLOCATED: /* These are in kernel space. Access directly. */ is_read = ((bp->b_flags & B_READ) == B_READ); bp->b_resid = bp->b_bcount; off = (bp->b_blkno << DEV_BSHIFT); if (off >= sc->sc_size) { if (is_read) break; /* EOF */ goto set_eio; } xfer = bp->b_resid; if (xfer > (sc->sc_size - off)) xfer = (sc->sc_size - off); addr = (char *)sc->sc_addr + off; disk_busy(&sc->sc_dkdev); if (is_read) memcpy(bp->b_data, addr, xfer); else memcpy(addr, bp->b_data, xfer); disk_unbusy(&sc->sc_dkdev, xfer, is_read); bp->b_resid -= xfer; break; default: bp->b_resid = bp->b_bcount; set_eio: bp->b_error = EIO; break; } mutex_exit(&sc->sc_lock); done: biodone(bp); } static int mdioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { struct md_softc *sc; struct md_conf *umd; int error; if ((sc = device_lookup_private(&md_cd, MD_UNIT(dev))) == NULL) return ENXIO; if (sc->sc_type != MD_UNCONFIGURED) { error = disk_ioctl(&sc->sc_dkdev, dev, cmd, data, flag, l); if (error != EPASSTHROUGH) { return error; } } /* If this is not the raw partition, punt! 
*/ if (DISKPART(dev) != RAW_PART) { return ENOTTY; } mutex_enter(&sc->sc_lock); umd = (struct md_conf *)data; error = EINVAL; switch (cmd) { case MD_GETCONF: *umd = sc->sc_md; error = 0; break; case MD_SETCONF: /* Can only set it once. */ if (sc->sc_type != MD_UNCONFIGURED) break; switch (umd->md_type) { case MD_KMEM_ALLOCATED: error = md_ioctl_kalloc(sc, umd, l); break; #if MEMORY_DISK_SERVER case MD_UMEM_SERVER: error = md_ioctl_server(sc, umd, l); break; #endif /* MEMORY_DISK_SERVER */ default: break; } break; } mutex_exit(&sc->sc_lock); return error; } static void md_set_disklabel(struct md_softc *sc) { struct disk_geom *dg = &sc->sc_dkdev.dk_geom; struct disklabel *lp = sc->sc_dkdev.dk_label; struct partition *pp; memset(lp, 0, sizeof(*lp)); lp->d_secsize = DEV_BSIZE; lp->d_secperunit = sc->sc_size / DEV_BSIZE; if (lp->d_secperunit >= (32*64)) { lp->d_nsectors = 32; lp->d_ntracks = 64; lp->d_ncylinders = lp->d_secperunit / (32*64); } else { lp->d_nsectors = 1; lp->d_ntracks = 1; lp->d_ncylinders = lp->d_secperunit; } lp->d_secpercyl = lp->d_ntracks*lp->d_nsectors; strncpy(lp->d_typename, md_cd.cd_name, sizeof(lp->d_typename)); lp->d_type = DKTYPE_MD; strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); lp->d_rpm = 3600; lp->d_interleave = 1; lp->d_flags = 0; pp = &lp->d_partitions[0]; pp->p_offset = 0; pp->p_size = lp->d_secperunit; pp->p_fstype = FS_BSDFFS; pp = &lp->d_partitions[RAW_PART]; pp->p_offset = 0; pp->p_size = lp->d_secperunit; pp->p_fstype = FS_UNUSED; lp->d_npartitions = RAW_PART+1; lp->d_magic = DISKMAGIC; lp->d_magic2 = DISKMAGIC; lp->d_checksum = dkcksum(lp); memset(dg, 0, sizeof(*dg)); dg->dg_secsize = lp->d_secsize; dg->dg_secperunit = lp->d_secperunit; dg->dg_nsectors = lp->d_nsectors; dg->dg_ntracks = lp->d_ntracks; dg->dg_ncylinders = lp->d_ncylinders; disk_set_info(sc->sc_dev, &sc->sc_dkdev, NULL); } /* * Handle ioctl MD_SETCONF for (sc_type == MD_KMEM_ALLOCATED) * Just allocate some kernel memory and return. */ static int md_ioctl_kalloc(struct md_softc *sc, struct md_conf *umd, struct lwp *l) { vaddr_t addr; vsize_t size; /* Sanity check the size. */ size = umd->md_size; if (size < DEV_BSIZE || (size % DEV_BSIZE) != 0) return EINVAL; mutex_exit(&sc->sc_lock); addr = uvm_km_alloc(kernel_map, size, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); mutex_enter(&sc->sc_lock); if (!addr) return ENOMEM; /* If another thread beat us to configure this unit: fail. */ if (sc->sc_type != MD_UNCONFIGURED) { uvm_km_free(kernel_map, addr, size, UVM_KMF_WIRED); return EINVAL; } /* This unit is now configured. */ sc->sc_addr = (void *)addr; /* kernel space */ sc->sc_size = (size_t)size; sc->sc_type = MD_KMEM_ALLOCATED; md_set_disklabel(sc); return 0; } #if MEMORY_DISK_SERVER /* * Handle ioctl MD_SETCONF for (sc_type == MD_UMEM_SERVER) * Set config, then become the I/O server for this unit. */ static int md_ioctl_server(struct md_softc *sc, struct md_conf *umd, struct lwp *l) { vaddr_t end; int error; KASSERT(mutex_owned(&sc->sc_lock)); /* Sanity check addr, size. */ end = (vaddr_t) ((char *)umd->md_addr + umd->md_size); if ( #ifndef _RUMPKERNEL /* * On some architectures (e.g. powerpc) rump kernel provides * "safe" low defaults which make this test fail since malloc * does return higher addresses than the "safe" default. */ (end >= VM_MAXUSER_ADDRESS) || #endif (end < ((vaddr_t) umd->md_addr))) return EINVAL; /* This unit is now configured.
*/ sc->sc_addr = umd->md_addr; /* user space */ sc->sc_size = umd->md_size; sc->sc_type = MD_UMEM_SERVER; md_set_disklabel(sc); /* Become the server daemon */ error = md_server_loop(sc); /* This server is now going away! */ sc->sc_type = MD_UNCONFIGURED; sc->sc_addr = 0; sc->sc_size = 0; return (error); } static int md_server_loop(struct md_softc *sc) { struct buf *bp; void *addr; /* user space address */ size_t off; /* offset into "device" */ size_t xfer; /* amount to transfer */ int error; bool is_read; KASSERT(mutex_owned(&sc->sc_lock)); for (;;) { /* Wait for some work to arrive. */ while ((bp = bufq_get(sc->sc_buflist)) == NULL) { error = cv_wait_sig(&sc->sc_cv, &sc->sc_lock); if (error) return error; } /* Do the transfer to/from user space. */ mutex_exit(&sc->sc_lock); error = 0; is_read = ((bp->b_flags & B_READ) == B_READ); bp->b_resid = bp->b_bcount; off = (bp->b_blkno << DEV_BSHIFT); if (off >= sc->sc_size) { if (is_read) goto done; /* EOF (not an error) */ error = EIO; goto done; } xfer = bp->b_resid; if (xfer > (sc->sc_size - off)) xfer = (sc->sc_size - off); addr = (char *)sc->sc_addr + off; disk_busy(&sc->sc_dkdev); if (is_read) error = copyin(addr, bp->b_data, xfer); else error = copyout(bp->b_data, addr, xfer); disk_unbusy(&sc->sc_dkdev, (error ? 0 : xfer), is_read); if (!error) bp->b_resid -= xfer; done: if (error) { bp->b_error = error; } biodone(bp); mutex_enter(&sc->sc_lock); } } #endif /* MEMORY_DISK_SERVER */
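/*
 * Illustrative sketch, not part of md.c: how the MD_UMEM_SERVER path above
 * is meant to be driven from user space.  A process allocates ordinary
 * memory, opens the raw partition of an unconfigured md(4) unit and issues
 * MD_SETCONF with md_type = MD_UMEM_SERVER; mdioctl() then enters
 * md_ioctl_server(), which does not return while the unit stays configured:
 * the calling thread becomes the I/O server, and md_server_loop() copies
 * block data in and out of md_addr.  This is a hedged sketch; in particular
 * the device path "/dev/rmd0d" is an assumption, since the raw partition
 * letter differs between architectures.
 */
#include <sys/ioctl.h>
#include <dev/md.h>

#include <err.h>
#include <fcntl.h>
#include <stdlib.h>

int
main(void)
{
	struct md_conf conf;
	int fd;

	conf.md_size = 4 * 1024 * 1024;			/* 4 MB ramdisk */
	conf.md_addr = calloc(1, conf.md_size);		/* user-space backing store */
	if (conf.md_addr == NULL)
		err(1, "calloc");
	conf.md_type = MD_UMEM_SERVER;

	fd = open("/dev/rmd0d", O_RDWR);		/* raw partition; name assumed */
	if (fd == -1)
		err(1, "open");

	/*
	 * Configure the unit and become its server.  The ioctl only
	 * returns once the server loop exits (e.g. on a signal).
	 */
	if (ioctl(fd, MD_SETCONF, &conf) == -1)
		err(1, "MD_SETCONF");
	return 0;
}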
/* $NetBSD: usbdi.c,v 1.253 2024/04/05 18:57:10 riastradh Exp $ */ /* * Copyright (c) 1998, 2012, 2015 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology, Matthew R. Green (mrg@eterna23.net), * and Nick Hudson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: usbdi.c,v 1.253 2024/04/05 18:57:10 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_usb.h" #include "opt_compat_netbsd.h" #include "usb_dma.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/device.h> #include <sys/kmem.h> #include <sys/proc.h> #include <sys/bus.h> #include <sys/cpu.h> #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> #include <dev/usb/usbdi_util.h> #include <dev/usb/usbdivar.h> #include <dev/usb/usb_mem.h> #include <dev/usb/usb_quirks.h> #include <dev/usb/usb_sdt.h> #include <dev/usb/usbhist.h> /* UTF-8 encoding stuff */ #include <fs/unicode.h> SDT_PROBE_DEFINE5(usb, device, pipe, open, "struct usbd_interface *"/*iface*/, "uint8_t"/*address*/, "uint8_t"/*flags*/, "int"/*ival*/, "struct usbd_pipe *"/*pipe*/); SDT_PROBE_DEFINE7(usb, device, pipe, open__intr, "struct usbd_interface *"/*iface*/, "uint8_t"/*address*/, "uint8_t"/*flags*/, "int"/*ival*/, "usbd_callback"/*cb*/, "void *"/*cookie*/, "struct usbd_pipe *"/*pipe*/); SDT_PROBE_DEFINE2(usb, device, pipe, transfer__start, "struct usbd_pipe *"/*pipe*/, "struct usbd_xfer *"/*xfer*/); SDT_PROBE_DEFINE3(usb, device, pipe, transfer__done, "struct usbd_pipe *"/*pipe*/, "struct usbd_xfer *"/*xfer*/, "usbd_status"/*err*/); SDT_PROBE_DEFINE2(usb, device, pipe, start, "struct usbd_pipe *"/*pipe*/, "struct usbd_xfer *"/*xfer*/); SDT_PROBE_DEFINE1(usb, device, pipe, close, "struct usbd_pipe *"/*pipe*/); SDT_PROBE_DEFINE1(usb, device, pipe, abort__start, "struct usbd_pipe *"/*pipe*/); SDT_PROBE_DEFINE1(usb, device, pipe, abort__done, "struct usbd_pipe *"/*pipe*/); SDT_PROBE_DEFINE1(usb, device, pipe, clear__endpoint__stall, "struct usbd_pipe *"/*pipe*/); SDT_PROBE_DEFINE1(usb, device, pipe, clear__endpoint__toggle, "struct usbd_pipe *"/*pipe*/); SDT_PROBE_DEFINE5(usb, device, xfer, create, "struct usbd_xfer *"/*xfer*/, "struct usbd_pipe *"/*pipe*/, "size_t"/*len*/, "unsigned int"/*flags*/, "unsigned int"/*nframes*/); SDT_PROBE_DEFINE1(usb, device, xfer, start, "struct usbd_xfer *"/*xfer*/); SDT_PROBE_DEFINE1(usb, device, xfer, preabort, "struct usbd_xfer *"/*xfer*/); SDT_PROBE_DEFINE1(usb, device, xfer, abort, "struct usbd_xfer *"/*xfer*/); SDT_PROBE_DEFINE1(usb, device, xfer, timeout, "struct usbd_xfer *"/*xfer*/); SDT_PROBE_DEFINE2(usb, device, xfer, done, "struct usbd_xfer *"/*xfer*/, "usbd_status"/*status*/); SDT_PROBE_DEFINE1(usb, device, xfer, destroy, "struct usbd_xfer *"/*xfer*/); SDT_PROBE_DEFINE5(usb, device, request, start, "struct usbd_device *"/*dev*/, "usb_device_request_t *"/*req*/, "size_t"/*len*/, "int"/*flags*/, "uint32_t"/*timeout*/); SDT_PROBE_DEFINE7(usb, device, request, done, "struct usbd_device *"/*dev*/, "usb_device_request_t *"/*req*/, "size_t"/*actlen*/, "int"/*flags*/, "uint32_t"/*timeout*/, "void *"/*data*/, "usbd_status"/*status*/); Static void usbd_ar_pipe(struct usbd_pipe *); Static void usbd_start_next(struct usbd_pipe *); Static usbd_status usbd_open_pipe_ival (struct usbd_interface *, 
uint8_t, uint8_t, struct usbd_pipe **, int); static void *usbd_alloc_buffer(struct usbd_xfer *, uint32_t); static void usbd_free_buffer(struct usbd_xfer *); static struct usbd_xfer *usbd_alloc_xfer(struct usbd_device *, unsigned int); static void usbd_free_xfer(struct usbd_xfer *); static void usbd_xfer_timeout(void *); static void usbd_xfer_timeout_task(void *); static bool usbd_xfer_probe_timeout(struct usbd_xfer *); static void usbd_xfer_cancel_timeout_async(struct usbd_xfer *); #if defined(USB_DEBUG) void usbd_dump_iface(struct usbd_interface *iface) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "iface %#jx", (uintptr_t)iface, 0, 0, 0); if (iface == NULL) return; USBHIST_LOG(usbdebug, " device = %#jx idesc = %#jx index = %jd", (uintptr_t)iface->ui_dev, (uintptr_t)iface->ui_idesc, iface->ui_index, 0); USBHIST_LOG(usbdebug, " altindex=%jd", iface->ui_altindex, 0, 0, 0); } void usbd_dump_device(struct usbd_device *dev) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev = %#jx", (uintptr_t)dev, 0, 0, 0); if (dev == NULL) return; USBHIST_LOG(usbdebug, " bus = %#jx default_pipe = %#jx", (uintptr_t)dev->ud_bus, (uintptr_t)dev->ud_pipe0, 0, 0); USBHIST_LOG(usbdebug, " address = %jd config = %jd depth = %jd ", dev->ud_addr, dev->ud_config, dev->ud_depth, 0); USBHIST_LOG(usbdebug, " speed = %jd self_powered = %jd " "power = %jd langid = %jd", dev->ud_speed, dev->ud_selfpowered, dev->ud_power, dev->ud_langid); } void usbd_dump_endpoint(struct usbd_endpoint *endp) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "endp = %#jx", (uintptr_t)endp, 0, 0, 0); if (endp == NULL) return; USBHIST_LOG(usbdebug, " edesc = %#jx refcnt = %jd", (uintptr_t)endp->ue_edesc, endp->ue_refcnt, 0, 0); if (endp->ue_edesc) USBHIST_LOG(usbdebug, " bEndpointAddress=0x%02jx", endp->ue_edesc->bEndpointAddress, 0, 0, 0); } void usbd_dump_queue(struct usbd_pipe *pipe) { struct usbd_xfer *xfer; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "pipe = %#jx", (uintptr_t)pipe, 0, 0, 0); SIMPLEQ_FOREACH(xfer, &pipe->up_queue, ux_next) { USBHIST_LOG(usbdebug, " xfer = %#jx", (uintptr_t)xfer, 0, 0, 0); } } void usbd_dump_pipe(struct usbd_pipe *pipe) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "pipe = %#jx", (uintptr_t)pipe, 0, 0, 0); if (pipe == NULL) return; usbd_dump_iface(pipe->up_iface); usbd_dump_device(pipe->up_dev); usbd_dump_endpoint(pipe->up_endpoint); USBHIST_LOG(usbdebug, "(usbd_dump_pipe)", 0, 0, 0, 0); USBHIST_LOG(usbdebug, " running = %jd aborting = %jd", pipe->up_running, pipe->up_aborting, 0, 0); USBHIST_LOG(usbdebug, " intrxfer = %#jx, repeat = %jd, " "interval = %jd", (uintptr_t)pipe->up_intrxfer, pipe->up_repeat, pipe->up_interval, 0); } #endif usbd_status usbd_open_pipe(struct usbd_interface *iface, uint8_t address, uint8_t flags, struct usbd_pipe **pipe) { return (usbd_open_pipe_ival(iface, address, flags, pipe, USBD_DEFAULT_INTERVAL)); } usbd_status usbd_open_pipe_ival(struct usbd_interface *iface, uint8_t address, uint8_t flags, struct usbd_pipe **pipe, int ival) { struct usbd_pipe *p = NULL; struct usbd_endpoint *ep = NULL /* XXXGCC */; bool piperef = false; usbd_status err; int i; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "iface = %#jx address = %#jx flags = %#jx", (uintptr_t)iface, address, flags, 0); /* * Block usbd_set_interface so we have a snapshot of the * interface endpoints. They will remain stable until we drop * the reference in usbd_close_pipe (or on failure here). */ err = usbd_iface_piperef(iface); if (err) goto out; piperef = true; /* Find the endpoint at this address. 
*/ for (i = 0; i < iface->ui_idesc->bNumEndpoints; i++) { ep = &iface->ui_endpoints[i]; if (ep->ue_edesc == NULL) { err = USBD_IOERROR; goto out; } if (ep->ue_edesc->bEndpointAddress == address) break; } if (i == iface->ui_idesc->bNumEndpoints) { err = USBD_BAD_ADDRESS; goto out; } /* Set up the pipe with this endpoint. */ err = usbd_setup_pipe_flags(iface->ui_dev, iface, ep, ival, &p, flags); if (err) goto out; /* Success! */ *pipe = p; p = NULL; /* handed off to caller */ piperef = false; /* handed off to pipe */ SDT_PROBE5(usb, device, pipe, open, iface, address, flags, ival, p); err = USBD_NORMAL_COMPLETION; out: if (p) usbd_close_pipe(p); if (piperef) usbd_iface_pipeunref(iface); return err; } usbd_status usbd_open_pipe_intr(struct usbd_interface *iface, uint8_t address, uint8_t flags, struct usbd_pipe **pipe, void *priv, void *buffer, uint32_t len, usbd_callback cb, int ival) { usbd_status err; struct usbd_xfer *xfer; struct usbd_pipe *ipipe; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "address = %#jx flags = %#jx len = %jd", address, flags, len, 0); err = usbd_open_pipe_ival(iface, address, USBD_EXCLUSIVE_USE | (flags & USBD_MPSAFE), &ipipe, ival); if (err) return err; err = usbd_create_xfer(ipipe, len, flags, 0, &xfer); if (err) goto bad1; usbd_setup_xfer(xfer, priv, buffer, len, flags, USBD_NO_TIMEOUT, cb); ipipe->up_intrxfer = xfer; ipipe->up_repeat = 1; err = usbd_transfer(xfer); *pipe = ipipe; if (err != USBD_IN_PROGRESS) goto bad3; SDT_PROBE7(usb, device, pipe, open__intr, iface, address, flags, ival, cb, priv, ipipe); return USBD_NORMAL_COMPLETION; bad3: ipipe->up_intrxfer = NULL; ipipe->up_repeat = 0; usbd_destroy_xfer(xfer); bad1: usbd_close_pipe(ipipe); return err; } void usbd_close_pipe(struct usbd_pipe *pipe) { USBHIST_FUNC(); USBHIST_CALLED(usbdebug); KASSERT(pipe != NULL); usbd_lock_pipe(pipe); SDT_PROBE1(usb, device, pipe, close, pipe); if (!SIMPLEQ_EMPTY(&pipe->up_queue)) { printf("WARNING: pipe closed with active xfers on addr %d\n", pipe->up_dev->ud_addr); usbd_ar_pipe(pipe); } KASSERT(SIMPLEQ_EMPTY(&pipe->up_queue)); pipe->up_methods->upm_close(pipe); usbd_unlock_pipe(pipe); cv_destroy(&pipe->up_callingcv); if (pipe->up_intrxfer) usbd_destroy_xfer(pipe->up_intrxfer); usb_rem_task_wait(pipe->up_dev, &pipe->up_async_task, USB_TASKQ_DRIVER, NULL); usbd_endpoint_release(pipe->up_dev, pipe->up_endpoint); if (pipe->up_iface) usbd_iface_pipeunref(pipe->up_iface); kmem_free(pipe, pipe->up_dev->ud_bus->ub_pipesize); } usbd_status usbd_transfer(struct usbd_xfer *xfer) { struct usbd_pipe *pipe = xfer->ux_pipe; usbd_status err; unsigned int size, flags; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "xfer = %#jx, flags = %#jx, pipe = %#jx, running = %jd", (uintptr_t)xfer, xfer->ux_flags, (uintptr_t)pipe, pipe->up_running); KASSERT(xfer->ux_status == USBD_NOT_STARTED); SDT_PROBE1(usb, device, xfer, start, xfer); #ifdef USB_DEBUG if (usbdebug > 5) usbd_dump_queue(pipe); #endif xfer->ux_done = 0; KASSERT(xfer->ux_length == 0 || xfer->ux_buf != NULL); size = xfer->ux_length; flags = xfer->ux_flags; if (size != 0) { /* * Use the xfer buffer if none specified in transfer setup. * isoc transfers always use the xfer buffer, i.e. * ux_buffer is always NULL for isoc. 
*/ if (xfer->ux_buffer == NULL) { xfer->ux_buffer = xfer->ux_buf; } /* * If not using the xfer buffer copy data to the * xfer buffer for OUT transfers of >0 length */ if (xfer->ux_buffer != xfer->ux_buf) { KASSERT(xfer->ux_buf); if (!usbd_xfer_isread(xfer)) { memcpy(xfer->ux_buf, xfer->ux_buffer, size); } } } if (pipe->up_dev->ud_bus->ub_usepolling == 0) usbd_lock_pipe(pipe); if (pipe->up_aborting) { /* * XXX For synchronous transfers this is fine. What to * do for asynchronous transfers? The callback is * never run, not even with status USBD_CANCELLED. */ KASSERT(pipe->up_dev->ud_bus->ub_usepolling == 0); usbd_unlock_pipe(pipe); USBHIST_LOG(usbdebug, "<- done xfer %#jx, aborting", (uintptr_t)xfer, 0, 0, 0); SDT_PROBE2(usb, device, xfer, done, xfer, USBD_CANCELLED); return USBD_CANCELLED; } /* xfer is not valid after the transfer method unless synchronous */ SDT_PROBE2(usb, device, pipe, transfer__start, pipe, xfer); do { #ifdef DIAGNOSTIC xfer->ux_state = XFER_ONQU; #endif SIMPLEQ_INSERT_TAIL(&pipe->up_queue, xfer, ux_next); if (pipe->up_running && pipe->up_serialise) { err = USBD_IN_PROGRESS; } else { pipe->up_running = 1; err = USBD_NORMAL_COMPLETION; } if (err) break; err = pipe->up_methods->upm_transfer(xfer); } while (0); SDT_PROBE3(usb, device, pipe, transfer__done, pipe, xfer, err); if (pipe->up_dev->ud_bus->ub_usepolling == 0) usbd_unlock_pipe(pipe); if (err != USBD_IN_PROGRESS && err) { /* * The transfer made it onto the pipe queue, but didn't get * accepted by the HCD for some reason. It needs removing * from the pipe queue. */ USBHIST_LOG(usbdebug, "xfer failed: %jd, reinserting", err, 0, 0, 0); if (pipe->up_dev->ud_bus->ub_usepolling == 0) usbd_lock_pipe(pipe); SDT_PROBE1(usb, device, xfer, preabort, xfer); #ifdef DIAGNOSTIC xfer->ux_state = XFER_BUSY; #endif SIMPLEQ_REMOVE_HEAD(&pipe->up_queue, ux_next); if (pipe->up_serialise) usbd_start_next(pipe); if (pipe->up_dev->ud_bus->ub_usepolling == 0) usbd_unlock_pipe(pipe); } if (!(flags & USBD_SYNCHRONOUS)) { USBHIST_LOG(usbdebug, "<- done xfer %#jx, not sync (err %jd)", (uintptr_t)xfer, err, 0, 0); KASSERTMSG(err != USBD_NORMAL_COMPLETION, "asynchronous xfer %p completed synchronously", xfer); return err; } if (err != USBD_IN_PROGRESS) { USBHIST_LOG(usbdebug, "<- done xfer %#jx, sync (err %jd)", (uintptr_t)xfer, err, 0, 0); SDT_PROBE2(usb, device, xfer, done, xfer, err); return err; } /* Sync transfer, wait for completion. */ if (pipe->up_dev->ud_bus->ub_usepolling == 0) usbd_lock_pipe(pipe); while (!xfer->ux_done) { if (pipe->up_dev->ud_bus->ub_usepolling) panic("usbd_transfer: not done"); USBHIST_LOG(usbdebug, "<- sleeping on xfer %#jx", (uintptr_t)xfer, 0, 0, 0); err = 0; if ((flags & USBD_SYNCHRONOUS_SIG) != 0) { err = cv_wait_sig(&xfer->ux_cv, pipe->up_dev->ud_bus->ub_lock); } else { cv_wait(&xfer->ux_cv, pipe->up_dev->ud_bus->ub_lock); } if (err) { if (!xfer->ux_done) { SDT_PROBE1(usb, device, xfer, abort, xfer); pipe->up_methods->upm_abort(xfer); } break; } } err = xfer->ux_status; SDT_PROBE2(usb, device, xfer, done, xfer, err); if (pipe->up_dev->ud_bus->ub_usepolling == 0) usbd_unlock_pipe(pipe); return err; } /* Like usbd_transfer(), but waits for completion. */ usbd_status usbd_sync_transfer(struct usbd_xfer *xfer) { xfer->ux_flags |= USBD_SYNCHRONOUS; return usbd_transfer(xfer); } /* Like usbd_transfer(), but waits for completion and listens for signals. 
*/ usbd_status usbd_sync_transfer_sig(struct usbd_xfer *xfer) { xfer->ux_flags |= USBD_SYNCHRONOUS | USBD_SYNCHRONOUS_SIG; return usbd_transfer(xfer); } static void * usbd_alloc_buffer(struct usbd_xfer *xfer, uint32_t size) { KASSERT(xfer->ux_buf == NULL); KASSERT(size != 0); xfer->ux_bufsize = 0; #if NUSB_DMA > 0 struct usbd_bus *bus = xfer->ux_bus; if (bus->ub_usedma) { usb_dma_t *dmap = &xfer->ux_dmabuf; KASSERT((bus->ub_dmaflags & USBMALLOC_COHERENT) == 0); int err = usb_allocmem(bus->ub_dmatag, size, 0, bus->ub_dmaflags, dmap); if (err) { return NULL; } xfer->ux_buf = KERNADDR(&xfer->ux_dmabuf, 0); xfer->ux_bufsize = size; return xfer->ux_buf; } #endif KASSERT(xfer->ux_bus->ub_usedma == false); xfer->ux_buf = kmem_alloc(size, KM_SLEEP); xfer->ux_bufsize = size; return xfer->ux_buf; } static void usbd_free_buffer(struct usbd_xfer *xfer) { KASSERT(xfer->ux_buf != NULL); KASSERT(xfer->ux_bufsize != 0); void *buf = xfer->ux_buf; uint32_t size = xfer->ux_bufsize; xfer->ux_buf = NULL; xfer->ux_bufsize = 0; #if NUSB_DMA > 0 struct usbd_bus *bus = xfer->ux_bus; if (bus->ub_usedma) { usb_dma_t *dmap = &xfer->ux_dmabuf; usb_freemem(dmap); return; } #endif KASSERT(xfer->ux_bus->ub_usedma == false); kmem_free(buf, size); } void * usbd_get_buffer(struct usbd_xfer *xfer) { return xfer->ux_buf; } struct usbd_pipe * usbd_get_pipe0(struct usbd_device *dev) { return dev->ud_pipe0; } static struct usbd_xfer * usbd_alloc_xfer(struct usbd_device *dev, unsigned int nframes) { struct usbd_xfer *xfer; USBHIST_FUNC(); ASSERT_SLEEPABLE(); xfer = dev->ud_bus->ub_methods->ubm_allocx(dev->ud_bus, nframes); if (xfer == NULL) goto out; xfer->ux_bus = dev->ud_bus; callout_init(&xfer->ux_callout, CALLOUT_MPSAFE); callout_setfunc(&xfer->ux_callout, usbd_xfer_timeout, xfer); cv_init(&xfer->ux_cv, "usbxfer"); usb_init_task(&xfer->ux_aborttask, usbd_xfer_timeout_task, xfer, USB_TASKQ_MPSAFE); out: USBHIST_CALLARGS(usbdebug, "returns %#jx", (uintptr_t)xfer, 0, 0, 0); return xfer; } static void usbd_free_xfer(struct usbd_xfer *xfer) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "%#jx", (uintptr_t)xfer, 0, 0, 0); if (xfer->ux_buf) { usbd_free_buffer(xfer); } /* Wait for any straggling timeout to complete. 
*/ mutex_enter(xfer->ux_bus->ub_lock); xfer->ux_timeout_reset = false; /* do not resuscitate */ callout_halt(&xfer->ux_callout, xfer->ux_bus->ub_lock); usb_rem_task_wait(xfer->ux_pipe->up_dev, &xfer->ux_aborttask, USB_TASKQ_HC, xfer->ux_bus->ub_lock); mutex_exit(xfer->ux_bus->ub_lock); cv_destroy(&xfer->ux_cv); xfer->ux_bus->ub_methods->ubm_freex(xfer->ux_bus, xfer); } int usbd_create_xfer(struct usbd_pipe *pipe, size_t len, unsigned int flags, unsigned int nframes, struct usbd_xfer **xp) { KASSERT(xp != NULL); void *buf = NULL; struct usbd_xfer *xfer = usbd_alloc_xfer(pipe->up_dev, nframes); if (xfer == NULL) return ENOMEM; xfer->ux_pipe = pipe; xfer->ux_flags = flags; xfer->ux_nframes = nframes; xfer->ux_methods = pipe->up_methods; if (len) { buf = usbd_alloc_buffer(xfer, len); if (!buf) { usbd_free_xfer(xfer); return ENOMEM; } } if (xfer->ux_methods->upm_init) { int err = xfer->ux_methods->upm_init(xfer); if (err) { usbd_free_xfer(xfer); return err; } } *xp = xfer; SDT_PROBE5(usb, device, xfer, create, xfer, pipe, len, flags, nframes); return 0; } void usbd_destroy_xfer(struct usbd_xfer *xfer) { SDT_PROBE1(usb, device, xfer, destroy, xfer); if (xfer->ux_methods->upm_fini) xfer->ux_methods->upm_fini(xfer); usbd_free_xfer(xfer); } void usbd_setup_xfer(struct usbd_xfer *xfer, void *priv, void *buffer, uint32_t length, uint16_t flags, uint32_t timeout, usbd_callback callback) { KASSERT(xfer->ux_pipe); xfer->ux_priv = priv; xfer->ux_buffer = buffer; xfer->ux_length = length; xfer->ux_actlen = 0; xfer->ux_flags = flags; xfer->ux_timeout = timeout; xfer->ux_status = USBD_NOT_STARTED; xfer->ux_callback = callback; xfer->ux_rqflags &= ~URQ_REQUEST; xfer->ux_nframes = 0; } void usbd_setup_default_xfer(struct usbd_xfer *xfer, struct usbd_device *dev, void *priv, uint32_t timeout, usb_device_request_t *req, void *buffer, uint32_t length, uint16_t flags, usbd_callback callback) { KASSERT(xfer->ux_pipe == dev->ud_pipe0); xfer->ux_priv = priv; xfer->ux_buffer = buffer; xfer->ux_length = length; xfer->ux_actlen = 0; xfer->ux_flags = flags; xfer->ux_timeout = timeout; xfer->ux_status = USBD_NOT_STARTED; xfer->ux_callback = callback; xfer->ux_request = *req; xfer->ux_rqflags |= URQ_REQUEST; xfer->ux_nframes = 0; } void usbd_setup_isoc_xfer(struct usbd_xfer *xfer, void *priv, uint16_t *frlengths, uint32_t nframes, uint16_t flags, usbd_callback callback) { xfer->ux_priv = priv; xfer->ux_buffer = NULL; xfer->ux_length = 0; xfer->ux_actlen = 0; xfer->ux_flags = flags; xfer->ux_timeout = USBD_NO_TIMEOUT; xfer->ux_status = USBD_NOT_STARTED; xfer->ux_callback = callback; xfer->ux_rqflags &= ~URQ_REQUEST; xfer->ux_frlengths = frlengths; xfer->ux_nframes = nframes; for (size_t i = 0; i < xfer->ux_nframes; i++) xfer->ux_length += xfer->ux_frlengths[i]; } void usbd_get_xfer_status(struct usbd_xfer *xfer, void **priv, void **buffer, uint32_t *count, usbd_status *status) { if (priv != NULL) *priv = xfer->ux_priv; if (buffer != NULL) *buffer = xfer->ux_buffer; if (count != NULL) *count = xfer->ux_actlen; if (status != NULL) *status = xfer->ux_status; } usb_config_descriptor_t * usbd_get_config_descriptor(struct usbd_device *dev) { KASSERT(dev != NULL); return dev->ud_cdesc; } usb_interface_descriptor_t * usbd_get_interface_descriptor(struct usbd_interface *iface) { KASSERT(iface != NULL); return iface->ui_idesc; } usb_device_descriptor_t * usbd_get_device_descriptor(struct usbd_device *dev) { KASSERT(dev != NULL); return &dev->ud_ddesc; } usb_endpoint_descriptor_t * usbd_interface2endpoint_descriptor(struct 
usbd_interface *iface, uint8_t index) { if (index >= iface->ui_idesc->bNumEndpoints) return NULL; return iface->ui_endpoints[index].ue_edesc; } /* Some drivers may wish to abort requests on the default pipe, * * but there is no mechanism for getting a handle on it. */ void usbd_abort_default_pipe(struct usbd_device *device) { usbd_abort_pipe(device->ud_pipe0); } void usbd_abort_pipe(struct usbd_pipe *pipe) { usbd_suspend_pipe(pipe); usbd_resume_pipe(pipe); } void usbd_suspend_pipe(struct usbd_pipe *pipe) { usbd_lock_pipe(pipe); usbd_ar_pipe(pipe); usbd_unlock_pipe(pipe); } void usbd_resume_pipe(struct usbd_pipe *pipe) { usbd_lock_pipe(pipe); KASSERT(SIMPLEQ_EMPTY(&pipe->up_queue)); pipe->up_aborting = 0; usbd_unlock_pipe(pipe); } usbd_status usbd_clear_endpoint_stall(struct usbd_pipe *pipe) { struct usbd_device *dev = pipe->up_dev; usbd_status err; USBHIST_FUNC(); USBHIST_CALLED(usbdebug); SDT_PROBE1(usb, device, pipe, clear__endpoint__stall, pipe); /* * Clearing en endpoint stall resets the endpoint toggle, so * do the same to the HC toggle. */ SDT_PROBE1(usb, device, pipe, clear__endpoint__toggle, pipe); pipe->up_methods->upm_cleartoggle(pipe); err = usbd_clear_endpoint_feature(dev, pipe->up_endpoint->ue_edesc->bEndpointAddress, UF_ENDPOINT_HALT); #if 0 XXX should we do this? if (!err) { pipe->state = USBD_PIPE_ACTIVE; /* XXX activate pipe */ } #endif return err; } void usbd_clear_endpoint_stall_task(void *arg) { struct usbd_pipe *pipe = arg; struct usbd_device *dev = pipe->up_dev; SDT_PROBE1(usb, device, pipe, clear__endpoint__stall, pipe); SDT_PROBE1(usb, device, pipe, clear__endpoint__toggle, pipe); pipe->up_methods->upm_cleartoggle(pipe); (void)usbd_clear_endpoint_feature(dev, pipe->up_endpoint->ue_edesc->bEndpointAddress, UF_ENDPOINT_HALT); } void usbd_clear_endpoint_stall_async(struct usbd_pipe *pipe) { usb_add_task(pipe->up_dev, &pipe->up_async_task, USB_TASKQ_DRIVER); } void usbd_clear_endpoint_toggle(struct usbd_pipe *pipe) { SDT_PROBE1(usb, device, pipe, clear__endpoint__toggle, pipe); pipe->up_methods->upm_cleartoggle(pipe); } usbd_status usbd_endpoint_count(struct usbd_interface *iface, uint8_t *count) { KASSERT(iface != NULL); KASSERT(iface->ui_idesc != NULL); *count = iface->ui_idesc->bNumEndpoints; return USBD_NORMAL_COMPLETION; } usbd_status usbd_interface_count(struct usbd_device *dev, uint8_t *count) { if (dev->ud_cdesc == NULL) return USBD_NOT_CONFIGURED; *count = dev->ud_cdesc->bNumInterface; return USBD_NORMAL_COMPLETION; } void usbd_interface2device_handle(struct usbd_interface *iface, struct usbd_device **dev) { *dev = iface->ui_dev; } usbd_status usbd_device2interface_handle(struct usbd_device *dev, uint8_t ifaceno, struct usbd_interface **iface) { if (dev->ud_cdesc == NULL) return USBD_NOT_CONFIGURED; if (ifaceno >= dev->ud_cdesc->bNumInterface) return USBD_INVAL; *iface = &dev->ud_ifaces[ifaceno]; return USBD_NORMAL_COMPLETION; } struct usbd_device * usbd_pipe2device_handle(struct usbd_pipe *pipe) { KASSERT(pipe != NULL); return pipe->up_dev; } /* XXXX use altno */ usbd_status usbd_set_interface(struct usbd_interface *iface, int altidx) { bool locked = false; usb_device_request_t req; usbd_status err; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "iface %#jx", (uintptr_t)iface, 0, 0, 0); err = usbd_iface_lock(iface); if (err) goto out; locked = true; err = usbd_fill_iface_data(iface->ui_dev, iface->ui_index, altidx); if (err) goto out; req.bmRequestType = UT_WRITE_INTERFACE; req.bRequest = UR_SET_INTERFACE; USETW(req.wValue, iface->ui_idesc->bAlternateSetting); 
USETW(req.wIndex, iface->ui_idesc->bInterfaceNumber); USETW(req.wLength, 0); err = usbd_do_request(iface->ui_dev, &req, 0); out: /* XXX back out iface data? */ if (locked) usbd_iface_unlock(iface); return err; } int usbd_get_no_alts(usb_config_descriptor_t *cdesc, int ifaceno) { char *p = (char *)cdesc; char *end = p + UGETW(cdesc->wTotalLength); usb_descriptor_t *desc; usb_interface_descriptor_t *idesc; int n; for (n = 0; end - p >= sizeof(*desc); p += desc->bLength) { desc = (usb_descriptor_t *)p; if (desc->bLength < sizeof(*desc) || desc->bLength > end - p) break; if (desc->bDescriptorType != UDESC_INTERFACE) continue; if (desc->bLength < sizeof(*idesc)) break; idesc = (usb_interface_descriptor_t *)desc; if (idesc->bInterfaceNumber == ifaceno) { n++; if (n == INT_MAX) break; } } return n; } int usbd_get_interface_altindex(struct usbd_interface *iface) { return iface->ui_altindex; } usbd_status usbd_get_interface(struct usbd_interface *iface, uint8_t *aiface) { usb_device_request_t req; req.bmRequestType = UT_READ_INTERFACE; req.bRequest = UR_GET_INTERFACE; USETW(req.wValue, 0); USETW(req.wIndex, iface->ui_idesc->bInterfaceNumber); USETW(req.wLength, 1); return usbd_do_request(iface->ui_dev, &req, aiface); } /*** Internal routines ***/ /* Dequeue all pipe operations, called with bus lock held. */ Static void usbd_ar_pipe(struct usbd_pipe *pipe) { struct usbd_xfer *xfer; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "pipe = %#jx", (uintptr_t)pipe, 0, 0, 0); SDT_PROBE1(usb, device, pipe, abort__start, pipe); ASSERT_SLEEPABLE(); KASSERT(mutex_owned(pipe->up_dev->ud_bus->ub_lock)); KASSERT(pipe->up_dev->ud_bus->ub_usepolling == 0); /* * Allow only one thread at a time to abort the pipe, so we * don't get confused if upm_abort drops the lock in the middle * of the abort to wait for hardware completion softints to * stop using the xfer before returning. */ KASSERTMSG(pipe->up_abortlwp == NULL, "pipe->up_abortlwp=%p", pipe->up_abortlwp); pipe->up_abortlwp = curlwp; #ifdef USB_DEBUG if (usbdebug > 5) usbd_dump_queue(pipe); #endif pipe->up_repeat = 0; pipe->up_running = 0; pipe->up_aborting = 1; while ((xfer = SIMPLEQ_FIRST(&pipe->up_queue)) != NULL) { USBHIST_LOG(usbdebug, "pipe = %#jx xfer = %#jx " "(methods = %#jx)", (uintptr_t)pipe, (uintptr_t)xfer, (uintptr_t)pipe->up_methods, 0); if (xfer->ux_status == USBD_NOT_STARTED) { SDT_PROBE1(usb, device, xfer, preabort, xfer); #ifdef DIAGNOSTIC xfer->ux_state = XFER_BUSY; #endif SIMPLEQ_REMOVE_HEAD(&pipe->up_queue, ux_next); } else { /* Make the HC abort it (and invoke the callback). */ SDT_PROBE1(usb, device, xfer, abort, xfer); pipe->up_methods->upm_abort(xfer); while (pipe->up_callingxfer == xfer) { USBHIST_LOG(usbdebug, "wait for callback" "pipe = %#jx xfer = %#jx", (uintptr_t)pipe, (uintptr_t)xfer, 0, 0); cv_wait(&pipe->up_callingcv, pipe->up_dev->ud_bus->ub_lock); } /* XXX only for non-0 usbd_clear_endpoint_stall(pipe); */ } } /* * There may be an xfer callback already in progress which was * taken off the queue before we got to it. We must wait for * the callback to finish before returning control to the * caller. 
*/ while (pipe->up_callingxfer) { USBHIST_LOG(usbdebug, "wait for callback" "pipe = %#jx xfer = %#jx", (uintptr_t)pipe, (uintptr_t)pipe->up_callingxfer, 0, 0); cv_wait(&pipe->up_callingcv, pipe->up_dev->ud_bus->ub_lock); } KASSERT(mutex_owned(pipe->up_dev->ud_bus->ub_lock)); KASSERTMSG(pipe->up_abortlwp == curlwp, "pipe->up_abortlwp=%p", pipe->up_abortlwp); pipe->up_abortlwp = NULL; SDT_PROBE1(usb, device, pipe, abort__done, pipe); } /* Called with USB lock held. */ void usb_transfer_complete(struct usbd_xfer *xfer) { struct usbd_pipe *pipe = xfer->ux_pipe; struct usbd_bus *bus = pipe->up_dev->ud_bus; int sync = xfer->ux_flags & USBD_SYNCHRONOUS; int erred; int polling = bus->ub_usepolling; int repeat = pipe->up_repeat; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "pipe = %#jx xfer = %#jx status = %jd " "actlen = %jd", (uintptr_t)pipe, (uintptr_t)xfer, xfer->ux_status, xfer->ux_actlen); KASSERT(polling || mutex_owned(pipe->up_dev->ud_bus->ub_lock)); KASSERTMSG(xfer->ux_state == XFER_ONQU, "xfer %p state is %x", xfer, xfer->ux_state); KASSERT(pipe != NULL); /* * If device is known to miss out ack, then pretend that * output timeout is a success. Userland should handle * the logic to verify that the operation succeeded. */ if (pipe->up_dev->ud_quirks && pipe->up_dev->ud_quirks->uq_flags & UQ_MISS_OUT_ACK && xfer->ux_status == USBD_TIMEOUT && !usbd_xfer_isread(xfer)) { USBHIST_LOG(usbdebug, "Possible output ack miss for xfer %#jx: " "hiding write timeout to %jd.%jd for %ju bytes written", (uintptr_t)xfer, curlwp->l_proc->p_pid, curlwp->l_lid, xfer->ux_length); xfer->ux_status = USBD_NORMAL_COMPLETION; xfer->ux_actlen = xfer->ux_length; } erred = xfer->ux_status == USBD_CANCELLED || xfer->ux_status == USBD_TIMEOUT; if (!repeat) { /* Remove request from queue. */ KASSERTMSG(!SIMPLEQ_EMPTY(&pipe->up_queue), "pipe %p is empty, but xfer %p wants to complete", pipe, xfer); KASSERTMSG(xfer == SIMPLEQ_FIRST(&pipe->up_queue), "xfer %p is not start of queue (%p is at start)", xfer, SIMPLEQ_FIRST(&pipe->up_queue)); #ifdef DIAGNOSTIC xfer->ux_state = XFER_BUSY; #endif SIMPLEQ_REMOVE_HEAD(&pipe->up_queue, ux_next); } USBHIST_LOG(usbdebug, "xfer %#jx: repeat %jd new head = %#jx", (uintptr_t)xfer, repeat, (uintptr_t)SIMPLEQ_FIRST(&pipe->up_queue), 0); /* Count completed transfers. 
*/ ++pipe->up_dev->ud_bus->ub_stats.uds_requests [pipe->up_endpoint->ue_edesc->bmAttributes & UE_XFERTYPE]; xfer->ux_done = 1; if (!xfer->ux_status && xfer->ux_actlen < xfer->ux_length && !(xfer->ux_flags & USBD_SHORT_XFER_OK)) { USBHIST_LOG(usbdebug, "short transfer %jd < %jd", xfer->ux_actlen, xfer->ux_length, 0, 0); xfer->ux_status = USBD_SHORT_XFER; } USBHIST_LOG(usbdebug, "xfer %#jx doing done %#jx", (uintptr_t)xfer, (uintptr_t)pipe->up_methods->upm_done, 0, 0); SDT_PROBE2(usb, device, xfer, done, xfer, xfer->ux_status); pipe->up_methods->upm_done(xfer); if (xfer->ux_length != 0 && xfer->ux_buffer != xfer->ux_buf) { KDASSERTMSG(xfer->ux_actlen <= xfer->ux_length, "actlen %d length %d",xfer->ux_actlen, xfer->ux_length); /* Only if IN transfer */ if (usbd_xfer_isread(xfer)) { memcpy(xfer->ux_buffer, xfer->ux_buf, xfer->ux_actlen); } } USBHIST_LOG(usbdebug, "xfer %#jx doing callback %#jx status %jd", (uintptr_t)xfer, (uintptr_t)xfer->ux_callback, xfer->ux_status, 0); if (xfer->ux_callback) { if (!polling) { KASSERT(pipe->up_callingxfer == NULL); pipe->up_callingxfer = xfer; mutex_exit(pipe->up_dev->ud_bus->ub_lock); if (!(pipe->up_flags & USBD_MPSAFE)) KERNEL_LOCK(1, curlwp); } xfer->ux_callback(xfer, xfer->ux_priv, xfer->ux_status); if (!polling) { if (!(pipe->up_flags & USBD_MPSAFE)) KERNEL_UNLOCK_ONE(curlwp); mutex_enter(pipe->up_dev->ud_bus->ub_lock); KASSERT(pipe->up_callingxfer == xfer); pipe->up_callingxfer = NULL; cv_broadcast(&pipe->up_callingcv); } } if (sync && !polling) { USBHIST_LOG(usbdebug, "<- done xfer %#jx, wakeup", (uintptr_t)xfer, 0, 0, 0); cv_broadcast(&xfer->ux_cv); } if (repeat) { xfer->ux_actlen = 0; xfer->ux_status = USBD_NOT_STARTED; } else { /* XXX should we stop the queue on all errors? */ if (erred && pipe->up_iface != NULL) /* not control pipe */ pipe->up_running = 0; } if (pipe->up_running && pipe->up_serialise) usbd_start_next(pipe); } /* Called with USB lock held. */ void usbd_start_next(struct usbd_pipe *pipe) { struct usbd_xfer *xfer; usbd_status err; USBHIST_FUNC(); KASSERT(pipe != NULL); KASSERT(pipe->up_methods != NULL); KASSERT(pipe->up_methods->upm_start != NULL); KASSERT(pipe->up_serialise == true); int polling = pipe->up_dev->ud_bus->ub_usepolling; KASSERT(polling || mutex_owned(pipe->up_dev->ud_bus->ub_lock)); /* Get next request in queue. */ xfer = SIMPLEQ_FIRST(&pipe->up_queue); USBHIST_CALLARGS(usbdebug, "pipe = %#jx, xfer = %#jx", (uintptr_t)pipe, (uintptr_t)xfer, 0, 0); if (xfer == NULL) { pipe->up_running = 0; } else { SDT_PROBE2(usb, device, pipe, start, pipe, xfer); err = pipe->up_methods->upm_start(xfer); if (err != USBD_IN_PROGRESS) { USBHIST_LOG(usbdebug, "error = %jd", err, 0, 0, 0); pipe->up_running = 0; /* XXX do what? 
*/ } } KASSERT(polling || mutex_owned(pipe->up_dev->ud_bus->ub_lock)); } usbd_status usbd_do_request(struct usbd_device *dev, usb_device_request_t *req, void *data) { return usbd_do_request_flags(dev, req, data, 0, 0, USBD_DEFAULT_TIMEOUT); } usbd_status usbd_do_request_flags(struct usbd_device *dev, usb_device_request_t *req, void *data, uint16_t flags, int *actlen, uint32_t timeout) { size_t len = UGETW(req->wLength); return usbd_do_request_len(dev, req, len, data, flags, actlen, timeout); } usbd_status usbd_do_request_len(struct usbd_device *dev, usb_device_request_t *req, size_t len, void *data, uint16_t flags, int *actlen, uint32_t timeout) { struct usbd_xfer *xfer; usbd_status err; KASSERT(len >= UGETW(req->wLength)); USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev=%#jx req=%jx flags=%jx len=%jx", (uintptr_t)dev, (uintptr_t)req, flags, len); ASSERT_SLEEPABLE(); SDT_PROBE5(usb, device, request, start, dev, req, len, flags, timeout); int error = usbd_create_xfer(dev->ud_pipe0, len, 0, 0, &xfer); if (error) { SDT_PROBE7(usb, device, request, done, dev, req, /*actlen*/0, flags, timeout, data, USBD_NOMEM); return USBD_NOMEM; } usbd_setup_default_xfer(xfer, dev, 0, timeout, req, data, UGETW(req->wLength), flags, NULL); KASSERT(xfer->ux_pipe == dev->ud_pipe0); err = usbd_sync_transfer(xfer); #if defined(USB_DEBUG) || defined(DIAGNOSTIC) if (xfer->ux_actlen > xfer->ux_length) { USBHIST_LOG(usbdebug, "overrun addr = %jd type = 0x%02jx", dev->ud_addr, xfer->ux_request.bmRequestType, 0, 0); USBHIST_LOG(usbdebug, " req = 0x%02jx val = %jd " "index = %jd", xfer->ux_request.bRequest, UGETW(xfer->ux_request.wValue), UGETW(xfer->ux_request.wIndex), 0); USBHIST_LOG(usbdebug, " rlen = %jd length = %jd " "actlen = %jd", UGETW(xfer->ux_request.wLength), xfer->ux_length, xfer->ux_actlen, 0); } #endif if (actlen != NULL) *actlen = xfer->ux_actlen; usbd_destroy_xfer(xfer); SDT_PROBE7(usb, device, request, done, dev, req, xfer->ux_actlen, flags, timeout, data, err); if (err) { USBHIST_LOG(usbdebug, "returning err = %jd", err, 0, 0, 0); } return err; } const struct usbd_quirks * usbd_get_quirks(struct usbd_device *dev) { #ifdef DIAGNOSTIC if (dev == NULL) { printf("usbd_get_quirks: dev == NULL\n"); return 0; } #endif return dev->ud_quirks; } /* XXX do periodic free() of free list */ /* * Called from keyboard driver when in polling mode. */ void usbd_dopoll(struct usbd_interface *iface) { iface->ui_dev->ud_bus->ub_methods->ubm_dopoll(iface->ui_dev->ud_bus); } /* * This is for keyboard driver as well, which only operates in polling * mode from the ask root, etc., prompt and from DDB. */ void usbd_set_polling(struct usbd_device *dev, int on) { mutex_enter(dev->ud_bus->ub_lock); if (on) { /* * Enabling polling. If we're enabling for the first * time, call the softint routine on transition while * we hold the lock and polling is still disabled, and * then enable polling -- once polling is enabled, we * must not hold the lock when we call the softint * routine. */ KASSERT(dev->ud_bus->ub_usepolling < __type_max(char)); if (dev->ud_bus->ub_usepolling == 0) dev->ud_bus->ub_methods->ubm_softint(dev->ud_bus); dev->ud_bus->ub_usepolling++; } else { /* * Disabling polling. If we're disabling polling for * the last time, disable polling first and then call * the softint routine while we hold the lock -- until * polling is disabled, we must not hold the lock when * we call the softint routine. 
*/ KASSERT(dev->ud_bus->ub_usepolling > 0); dev->ud_bus->ub_usepolling--; if (dev->ud_bus->ub_usepolling == 0) dev->ud_bus->ub_methods->ubm_softint(dev->ud_bus); } mutex_exit(dev->ud_bus->ub_lock); } usb_endpoint_descriptor_t * usbd_get_endpoint_descriptor(struct usbd_interface *iface, uint8_t address) { struct usbd_endpoint *ep; int i; for (i = 0; i < iface->ui_idesc->bNumEndpoints; i++) { ep = &iface->ui_endpoints[i]; if (ep->ue_edesc->bEndpointAddress == address) return iface->ui_endpoints[i].ue_edesc; } return NULL; } /* * usbd_ratecheck() can limit the number of error messages that occurs. * When a device is unplugged it may take up to 0.25s for the hub driver * to notice it. If the driver continuously tries to do I/O operations * this can generate a large number of messages. */ int usbd_ratecheck(struct timeval *last) { static struct timeval errinterval = { 0, 250000 }; /* 0.25 s*/ return ratecheck(last, &errinterval); } /* * Search for a vendor/product pair in an array. The item size is * given as an argument. */ const struct usb_devno * usb_match_device(const struct usb_devno *tbl, u_int nentries, u_int sz, uint16_t vendor, uint16_t product) { while (nentries-- > 0) { uint16_t tproduct = tbl->ud_product; if (tbl->ud_vendor == vendor && (tproduct == product || tproduct == USB_PRODUCT_ANY)) return tbl; tbl = (const struct usb_devno *)((const char *)tbl + sz); } return NULL; } usbd_status usbd_get_string(struct usbd_device *dev, int si, char *buf) { return usbd_get_string0(dev, si, buf, 1); } usbd_status usbd_get_string0(struct usbd_device *dev, int si, char *buf, int unicode) { int swap = dev->ud_quirks->uq_flags & UQ_SWAP_UNICODE; usb_string_descriptor_t us; char *s; int i, n; uint16_t c; usbd_status err; int size; USBHIST_FUNC(); USBHIST_CALLED(usbdebug); buf[0] = '\0'; if (si == 0) return USBD_INVAL; if (dev->ud_quirks->uq_flags & UQ_NO_STRINGS) return USBD_STALLED; if (dev->ud_langid == USBD_NOLANG) { /* Set up default language */ err = usbd_get_string_desc(dev, USB_LANGUAGE_TABLE, 0, &us, &size); if (err || size < 4) { USBHIST_LOG(usbdebug, "getting lang failed, using 0", 0, 0, 0, 0); dev->ud_langid = 0; /* Well, just pick something then */ } else { /* Pick the first language as the default. */ dev->ud_langid = UGETW(us.bString[0]); } } err = usbd_get_string_desc(dev, si, dev->ud_langid, &us, &size); if (err) return err; s = buf; n = size / 2 - 1; if (unicode) { for (i = 0; i < n; i++) { c = UGETW(us.bString[i]); if (swap) c = (c >> 8) | (c << 8); s += wput_utf8(s, 3, c); } *s++ = 0; } #ifdef COMPAT_30 else { for (i = 0; i < n; i++) { c = UGETW(us.bString[i]); if (swap) c = (c >> 8) | (c << 8); *s++ = (c < 0x80) ? c : '?'; } *s++ = 0; } #endif return USBD_NORMAL_COMPLETION; } /* * usbd_xfer_trycomplete(xfer) * * Try to claim xfer for completion. Return true if successful, * false if the xfer has been synchronously aborted or has timed * out. * * If this returns true, caller is responsible for setting * xfer->ux_status and calling usb_transfer_complete. To be used * in a host controller interrupt handler. * * Caller must either hold the bus lock or have the bus in polling * mode. If this succeeds, caller must proceed to call * usb_complete_transfer under the bus lock or with polling * enabled -- must not release and reacquire the bus lock in the * meantime. Failing to heed this rule may lead to catastrophe * with abort or timeout. 
*/ bool usbd_xfer_trycomplete(struct usbd_xfer *xfer) { struct usbd_bus *bus __diagused = xfer->ux_bus; KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock)); USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "xfer %#jx status %jd", (uintptr_t)xfer, xfer->ux_status, 0, 0); /* * If software has completed it, either by synchronous abort or * by timeout, too late. */ if (xfer->ux_status != USBD_IN_PROGRESS) return false; /* * We are completing the xfer. Cancel the timeout if we can, * but only asynchronously. See usbd_xfer_cancel_timeout_async * for why we need not wait for the callout or task here. */ usbd_xfer_cancel_timeout_async(xfer); /* Success! Note: Caller must set xfer->ux_status afterwar. */ return true; } /* * usbd_xfer_abort(xfer) * * Try to claim xfer to abort. If successful, mark it completed * with USBD_CANCELLED and call the bus-specific method to abort * at the hardware level. * * To be called in thread context from struct * usbd_pipe_methods::upm_abort. * * Caller must hold the bus lock. */ void usbd_xfer_abort(struct usbd_xfer *xfer) { struct usbd_bus *bus = xfer->ux_bus; KASSERT(mutex_owned(bus->ub_lock)); USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "xfer %#jx status %jd", (uintptr_t)xfer, xfer->ux_status, 0, 0); /* * If host controller interrupt or timer interrupt has * completed it, too late. But the xfer cannot be * cancelled already -- only one caller can synchronously * abort. */ KASSERT(xfer->ux_status != USBD_CANCELLED); if (xfer->ux_status != USBD_IN_PROGRESS) return; /* * Cancel the timeout if we can, but only asynchronously; see * usbd_xfer_cancel_timeout_async for why we need not wait for * the callout or task here. */ usbd_xfer_cancel_timeout_async(xfer); /* * We beat everyone else. Claim the status as cancelled, do * the bus-specific dance to abort the hardware, and complete * the xfer. */ xfer->ux_status = USBD_CANCELLED; bus->ub_methods->ubm_abortx(xfer); usb_transfer_complete(xfer); } /* * usbd_xfer_timeout(xfer) * * Called at IPL_SOFTCLOCK when too much time has elapsed waiting * for xfer to complete. Since we can't abort the xfer at * IPL_SOFTCLOCK, defer to a usb_task to run it in thread context, * unless the xfer has completed or aborted concurrently -- and if * the xfer has also been resubmitted, take care of rescheduling * the callout. */ static void usbd_xfer_timeout(void *cookie) { struct usbd_xfer *xfer = cookie; struct usbd_bus *bus = xfer->ux_bus; struct usbd_device *dev = xfer->ux_pipe->up_dev; /* Acquire the lock so we can transition the timeout state. */ mutex_enter(bus->ub_lock); USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "xfer %#jx status %jd", (uintptr_t)xfer, xfer->ux_status, 0, 0); /* * Use usbd_xfer_probe_timeout to check whether the timeout is * still valid, or to reschedule the callout if necessary. If * it is still valid, schedule the task. */ if (usbd_xfer_probe_timeout(xfer)) { USBHIST_LOG(usbdebug, "xfer %#jx schedule timeout task", (uintptr_t)xfer, 0, 0, 0); usb_add_task(dev, &xfer->ux_aborttask, USB_TASKQ_HC); } else { USBHIST_LOG(usbdebug, "xfer %#jx timeout cancelled", (uintptr_t)xfer, 0, 0, 0); } /* * Notify usbd_xfer_cancel_timeout_async that we may have * scheduled the task. This causes callout_invoking to return * false in usbd_xfer_cancel_timeout_async so that it can tell * which stage in the callout->task->abort process we're at. */ callout_ack(&xfer->ux_callout); /* All done -- release the lock. 
*/ mutex_exit(bus->ub_lock); } /* * usbd_xfer_timeout_task(xfer) * * Called in thread context when too much time has elapsed waiting * for xfer to complete. Abort the xfer with USBD_TIMEOUT, unless * it has completed or aborted concurrently -- and if the xfer has * also been resubmitted, take care of rescheduling the callout. */ static void usbd_xfer_timeout_task(void *cookie) { struct usbd_xfer *xfer = cookie; struct usbd_bus *bus = xfer->ux_bus; /* Acquire the lock so we can transition the timeout state. */ mutex_enter(bus->ub_lock); USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "xfer %#jx status %jd", (uintptr_t)xfer, xfer->ux_status, 0, 0); /* * Use usbd_xfer_probe_timeout to check whether the timeout is * still valid, or to reschedule the callout if necessary. If * it is not valid -- the timeout has been asynchronously * cancelled, or the xfer has already been resubmitted -- then * we're done here. */ if (!usbd_xfer_probe_timeout(xfer)) { USBHIST_LOG(usbdebug, "xfer %#jx timeout cancelled", (uintptr_t)xfer, 0, 0, 0); goto out; } /* * After this point, no further timeout probing will happen for * the current incarnation of the timeout, so make the next * usbd_xfer_schedule_timeout schedule a new callout. * usbd_xfer_probe_timeout has already processed any reset. */ KASSERT(!xfer->ux_timeout_reset); xfer->ux_timeout_set = false; /* * May have completed or been aborted, but we're the only one * who can time it out. If it has completed or been aborted, * no need to timeout. */ KASSERT(xfer->ux_status != USBD_TIMEOUT); if (xfer->ux_status != USBD_IN_PROGRESS) { USBHIST_LOG(usbdebug, "xfer %#jx timeout raced", (uintptr_t)xfer, 0, 0, 0); goto out; } /* * We beat everyone else. Claim the status as timed out, do * the bus-specific dance to abort the hardware, and complete * the xfer. */ USBHIST_LOG(usbdebug, "xfer %#jx timed out", (uintptr_t)xfer, 0, 0, 0); xfer->ux_status = USBD_TIMEOUT; bus->ub_methods->ubm_abortx(xfer); usb_transfer_complete(xfer); out: /* All done -- release the lock. */ mutex_exit(bus->ub_lock); } /* * usbd_xfer_probe_timeout(xfer) * * Probe the status of xfer's timeout. Acknowledge and process a * request to reschedule. Return true if the timeout is still * valid and the caller should take further action (queueing a * task or aborting the xfer), false if it must stop here. */ static bool usbd_xfer_probe_timeout(struct usbd_xfer *xfer) { struct usbd_bus *bus = xfer->ux_bus; bool valid; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "xfer %#jx timeout %jdms" " set %jd reset %jd", (uintptr_t)xfer, xfer->ux_timeout, xfer->ux_timeout_set, xfer->ux_timeout_reset); KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock)); /* The timeout must be set. */ KASSERT(xfer->ux_timeout_set); /* * Neither callout nor task may be pending; they execute * alternately in lock step. */ KASSERT(!callout_pending(&xfer->ux_callout)); KASSERT(!usb_task_pending(xfer->ux_pipe->up_dev, &xfer->ux_aborttask)); /* There are a few cases... */ if (bus->ub_methods->ubm_dying(bus)) { /* Host controller dying. Drop it all on the floor. */ USBHIST_LOG(usbdebug, "xfer %#jx bus dying, not rescheduling", (uintptr_t)xfer, 0, 0, 0); xfer->ux_timeout_set = false; xfer->ux_timeout_reset = false; valid = false; } else if (xfer->ux_timeout_reset) { /* * The xfer completed _and_ got resubmitted while we * waited for the lock. Acknowledge the request to * reschedule, and reschedule it if there is a timeout * and the bus is not polling. 
*/ xfer->ux_timeout_reset = false; if (xfer->ux_timeout && !bus->ub_usepolling) { USBHIST_LOG(usbdebug, "xfer %#jx resubmitted," " rescheduling timer for %jdms", (uintptr_t)xfer, xfer->ux_timeout, 0, 0); KASSERT(xfer->ux_timeout_set); callout_schedule(&xfer->ux_callout, mstohz(xfer->ux_timeout)); } else { /* No more callout or task scheduled. */ USBHIST_LOG(usbdebug, "xfer %#jx resubmitted" " and completed, not rescheduling", (uintptr_t)xfer, 0, 0, 0); xfer->ux_timeout_set = false; } valid = false; } else if (xfer->ux_status != USBD_IN_PROGRESS) { /* * The xfer has completed by hardware completion or by * software abort, and has not been resubmitted, so the * timeout must be unset, and is no longer valid for * the caller. */ USBHIST_LOG(usbdebug, "xfer %#jx timeout lost race," " status=%jd, not rescheduling", (uintptr_t)xfer, xfer->ux_status, 0, 0); xfer->ux_timeout_set = false; valid = false; } else { /* * The xfer has not yet completed, so the timeout is * valid. */ USBHIST_LOG(usbdebug, "xfer %#jx timing out", (uintptr_t)xfer, 0, 0, 0); valid = true; } /* Any reset must have been processed. */ KASSERT(!xfer->ux_timeout_reset); /* * Either we claim the timeout is set, or the callout is idle. * If the timeout is still set, we may be handing off to the * task instead, so this is an if but not an iff. */ KASSERT(xfer->ux_timeout_set || !callout_pending(&xfer->ux_callout)); /* * The task must be idle now. * * - If the caller is the callout, _and_ the timeout is still * valid, the caller will schedule it, but it hasn't been * scheduled yet. (If the timeout is not valid, the task * should not be scheduled.) * * - If the caller is the task, it cannot be scheduled again * until the callout runs again, which won't happen until we * next release the lock. */ KASSERT(!usb_task_pending(xfer->ux_pipe->up_dev, &xfer->ux_aborttask)); KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock)); return valid; } /* * usbd_xfer_schedule_timeout(xfer) * * Ensure that xfer has a timeout. If the callout is already * queued or the task is already running, request that they * reschedule the callout. If not, and if we're not polling, * schedule the callout anew. * * To be called in thread context from struct * usbd_pipe_methods::upm_start. */ void usbd_xfer_schedule_timeout(struct usbd_xfer *xfer) { struct usbd_bus *bus = xfer->ux_bus; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "xfer %#jx timeout %jdms" " set %jd reset %jd", (uintptr_t)xfer, xfer->ux_timeout, xfer->ux_timeout_set, xfer->ux_timeout_reset); KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock)); KASSERTMSG(xfer->ux_status == USBD_IN_PROGRESS, "xfer=%p status=%d", xfer, xfer->ux_status); if (xfer->ux_timeout_set) { /* * Callout or task has fired from a prior completed * xfer but has not yet noticed that the xfer is done. * Ask it to reschedule itself to ux_timeout. */ xfer->ux_timeout_reset = true; } else if (xfer->ux_timeout && !bus->ub_usepolling) { /* Callout is not scheduled. Schedule it. */ KASSERT(!callout_pending(&xfer->ux_callout)); callout_schedule(&xfer->ux_callout, mstohz(xfer->ux_timeout)); xfer->ux_timeout_set = true; } KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock)); } /* * usbd_xfer_cancel_timeout_async(xfer) * * Cancel the callout and the task of xfer, which have not yet run * to completion, but don't wait for the callout or task to finish * running. * * If they have already fired, at worst they are waiting for the * bus lock. 
They will see that the xfer is no longer in progress * and give up, or they will see that the xfer has been * resubmitted with a new timeout and reschedule the callout. * * If a resubmitted request completed so fast that the callout * didn't have time to process a timer reset, just cancel the * timer reset. */ static void usbd_xfer_cancel_timeout_async(struct usbd_xfer *xfer) { struct usbd_bus *bus __diagused = xfer->ux_bus; KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock)); USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "xfer %#jx timeout %jdms" " set %jd reset %jd", (uintptr_t)xfer, xfer->ux_timeout, xfer->ux_timeout_set, xfer->ux_timeout_reset); /* * If the timer wasn't running anyway, forget about it. This * can happen if we are completing an isochronous transfer * which doesn't use the same timeout logic. */ if (!xfer->ux_timeout_set) { USBHIST_LOG(usbdebug, "xfer %#jx timer not running", (uintptr_t)xfer, 0, 0, 0); return; } xfer->ux_timeout_reset = false; if (!callout_stop(&xfer->ux_callout)) { /* * We stopped the callout before it ran. The timeout * is no longer set. */ USBHIST_LOG(usbdebug, "xfer %#jx timer stopped", (uintptr_t)xfer, 0, 0, 0); xfer->ux_timeout_set = false; } else if (callout_invoking(&xfer->ux_callout)) { /* * The callout has begun to run but it has not yet * acquired the lock and called callout_ack. The task * cannot be queued yet, and the callout cannot have * been rescheduled yet. * * By the time the callout acquires the lock, we will * have transitioned from USBD_IN_PROGRESS to a * completed status, and possibly also resubmitted the * xfer and set xfer->ux_timeout_reset = true. In both * cases, the callout will DTRT, so no further action * is needed here. */ USBHIST_LOG(usbdebug, "xfer %#jx timer fired", (uintptr_t)xfer, 0, 0, 0); } else if (usb_rem_task(xfer->ux_pipe->up_dev, &xfer->ux_aborttask)) { /* * The callout had fired and scheduled the task, but we * stopped the task before it could run. The timeout * is therefore no longer set -- the next resubmission * of the xfer must schedule a new timeout. * * The callout should not be pending at this point: * it is scheduled only under the lock, and only when * xfer->ux_timeout_set is false, or by the callout or * task itself when xfer->ux_timeout_reset is true. */ USBHIST_LOG(usbdebug, "xfer %#jx task fired", (uintptr_t)xfer, 0, 0, 0); xfer->ux_timeout_set = false; } else { USBHIST_LOG(usbdebug, "xfer %#jx task stopped", (uintptr_t)xfer, 0, 0, 0); } /* * The callout cannot be scheduled and the task cannot be * queued at this point. Either we cancelled them, or they are * already running and waiting for the bus lock. */ KASSERT(!callout_pending(&xfer->ux_callout)); KASSERT(!usb_task_pending(xfer->ux_pipe->up_dev, &xfer->ux_aborttask)); KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock)); }
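/*
 * Illustrative sketch, not part of the code above: how a hypothetical
 * host controller interrupt handler might use usbd_xfer_trycomplete()
 * under the contract documented with that function -- claim the xfer
 * while holding the bus lock (or with polling enabled), then set
 * ux_status and call usb_transfer_complete() without dropping the lock
 * in between.  The function name exhc_xfer_done is an assumption made
 * for this example only.
 */
#if 0	/* example only */
static void
exhc_xfer_done(struct usbd_xfer *xfer, uint32_t actlen)
{
	struct usbd_bus *bus __diagused = xfer->ux_bus;

	KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock));

	/* Bail out if a synchronous abort or timeout already claimed it. */
	if (!usbd_xfer_trycomplete(xfer))
		return;

	/* We won the race: set the status and complete the transfer. */
	xfer->ux_actlen = actlen;
	xfer->ux_status = USBD_NORMAL_COMPLETION;
	usb_transfer_complete(xfer);
}
#endif	/* example only */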
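/*
 * Illustrative sketch, not part of the code above: rate-limiting error
 * messages with usbd_ratecheck(), as suggested by its comment -- useful
 * when a detached device keeps failing I/O in the interval before the
 * hub driver notices the disconnect.  The names exdrv_lasterr and
 * exdrv_log_ioerror are assumptions made for this example only.
 */
#if 0	/* example only */
static struct timeval exdrv_lasterr;

static void
exdrv_log_ioerror(usbd_status err)
{
	/* At most one message per errinterval (0.25 s as set above). */
	if (usbd_ratecheck(&exdrv_lasterr))
		printf("exdrv0: transfer failed, status %d\n", (int)err);
}
#endif	/* example only */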
/* $NetBSD: kernfs_vnops.c,v 1.174 2022/03/27 17:10:56 christos Exp $ */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 */ /* * Kernel parameter filesystem (/kern) */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kernfs_vnops.c,v 1.174 2022/03/27 17:10:56 christos Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/vmmeter.h> #include <sys/time.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/malloc.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/buf.h> #include <sys/dirent.h> #include <sys/msgbuf.h> #include <miscfs/genfs/genfs.h> #include <miscfs/kernfs/kernfs.h> #include <miscfs/specfs/specdev.h> #include <uvm/uvm_extern.h> #define KSTRING 256 /* Largest I/O available via this filesystem */ #define UIO_MX 32 #define READ_MODE (S_IRUSR|S_IRGRP|S_IROTH) #define WRITE_MODE (S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH) #define UREAD_MODE (S_IRUSR) #define DIR_MODE (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) #define UDIR_MODE (S_IRUSR|S_IXUSR) #define N(s) sizeof(s)-1, s const struct kern_target kern_targets[] = { /* NOTE: The name must be less than UIO_MX-16 chars in length */ /* name data tag type ro/rw */ { DT_DIR, N("."), 0, KFSkern, VDIR, DIR_MODE }, { DT_DIR, N(".."), 0, KFSroot, VDIR, DIR_MODE }, { DT_REG, N("boottime"), 0, KFSboottime, VREG, READ_MODE }, /* XXXUNCONST */ { DT_REG, N("copyright"), __UNCONST(copyright), KFSstring, VREG, READ_MODE }, { DT_REG, N("hostname"), 0, KFShostname, VREG, WRITE_MODE }, { DT_REG, N("hz"), &hz, KFSint, VREG, READ_MODE }, { DT_REG, N("loadavg"), 0, KFSavenrun, VREG, READ_MODE }, { DT_REG, N("msgbuf"), 0, KFSmsgbuf, VREG, READ_MODE }, { DT_REG, N("pagesize"), &uvmexp.pagesize, KFSint, VREG, READ_MODE }, { DT_REG, N("physmem"), &physmem, KFSint, VREG, READ_MODE }, #if 0 { DT_DIR, N("root"), 0, KFSnull, VDIR, DIR_MODE }, #endif { DT_BLK, N("rootdev"), &rootdev, KFSdevice, VBLK, UREAD_MODE }, { DT_CHR, N("rrootdev"), &rrootdev, KFSdevice, VCHR, UREAD_MODE }, { DT_REG, N("time"), 0, KFStime, VREG, READ_MODE }, /* XXXUNCONST */ { DT_REG, N("version"), __UNCONST(version), KFSstring, VREG, READ_MODE }, }; const struct kern_target subdir_targets[] = { /* NOTE: The name must be less than UIO_MX-16 chars in length */ /* name data tag type ro/rw */ { DT_DIR, N("."), 0, KFSsubdir, VDIR, DIR_MODE }, { DT_DIR, N(".."), 0, KFSkern, VDIR, DIR_MODE }, }; #undef N SIMPLEQ_HEAD(,dyn_kern_target) dyn_kern_targets = SIMPLEQ_HEAD_INITIALIZER(dyn_kern_targets); int nkern_targets = sizeof(kern_targets) / sizeof(kern_targets[0]); const int static_nkern_targets = sizeof(kern_targets) / sizeof(kern_targets[0]); int nkern_dirs = 2; int kernfs_try_fileop(kfstype, kfsfileop, void *, int); int kernfs_try_xread(kfstype, const struct kernfs_node *, char **, size_t, int); int kernfs_try_xwrite(kfstype, const struct kernfs_node *, char *, size_t, int); static int kernfs_default_xread(void *v); static int kernfs_default_xwrite(void *v); static int kernfs_default_fileop_getattr(void *); /* must include all fileop's */ const struct kernfs_fileop kernfs_default_fileops[] = { { .kf_fileop = KERNFS_XREAD }, { .kf_fileop = KERNFS_XWRITE }, { .kf_fileop = KERNFS_FILEOP_OPEN }, { .kf_fileop = KERNFS_FILEOP_GETATTR, .kf_vop = kernfs_default_fileop_getattr }, { .kf_fileop = KERNFS_FILEOP_IOCTL }, { .kf_fileop = KERNFS_FILEOP_CLOSE }, { .kf_fileop = KERNFS_FILEOP_READ, .kf_vop = kernfs_default_xread }, { .kf_fileop = KERNFS_FILEOP_WRITE, .kf_vop = kernfs_default_xwrite }, }; int kernfs_lookup(void *); int kernfs_open(void *); int kernfs_close(void *); int 
kernfs_access(void *); int kernfs_getattr(void *); int kernfs_setattr(void *); int kernfs_read(void *); int kernfs_write(void *); int kernfs_ioctl(void *); int kernfs_readdir(void *); int kernfs_inactive(void *); int kernfs_reclaim(void *); int kernfs_print(void *); int kernfs_pathconf(void *); int kernfs_getpages(void *); static int kernfs_xread(struct kernfs_node *, int, char **, size_t, size_t *); static int kernfs_xwrite(const struct kernfs_node *, char *, size_t); int (**kernfs_vnodeop_p)(void *); const struct vnodeopv_entry_desc kernfs_vnodeop_entries[] = { { &vop_default_desc, vn_default_error }, { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ { &vop_lookup_desc, kernfs_lookup }, /* lookup */ { &vop_create_desc, genfs_eopnotsupp }, /* create */ { &vop_mknod_desc, genfs_eopnotsupp }, /* mknod */ { &vop_open_desc, kernfs_open }, /* open */ { &vop_close_desc, kernfs_close }, /* close */ { &vop_access_desc, kernfs_access }, /* access */ { &vop_accessx_desc, genfs_accessx }, /* accessx */ { &vop_getattr_desc, kernfs_getattr }, /* getattr */ { &vop_setattr_desc, kernfs_setattr }, /* setattr */ { &vop_read_desc, kernfs_read }, /* read */ { &vop_write_desc, kernfs_write }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_ioctl_desc, kernfs_ioctl }, /* ioctl */ { &vop_poll_desc, genfs_poll }, /* poll */ { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_fsync_desc, genfs_nullop }, /* fsync */ { &vop_seek_desc, genfs_nullop }, /* seek */ { &vop_remove_desc, genfs_eopnotsupp }, /* remove */ { &vop_link_desc, genfs_erofs_link }, /* link */ { &vop_rename_desc, genfs_eopnotsupp }, /* rename */ { &vop_mkdir_desc, genfs_eopnotsupp }, /* mkdir */ { &vop_rmdir_desc, genfs_eopnotsupp }, /* rmdir */ { &vop_symlink_desc, genfs_erofs_symlink }, /* symlink */ { &vop_readdir_desc, kernfs_readdir }, /* readdir */ { &vop_readlink_desc, genfs_eopnotsupp }, /* readlink */ { &vop_abortop_desc, genfs_abortop }, /* abortop */ { &vop_inactive_desc, kernfs_inactive }, /* inactive */ { &vop_reclaim_desc, kernfs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, genfs_eopnotsupp }, /* bmap */ { &vop_strategy_desc, genfs_eopnotsupp }, /* strategy */ { &vop_print_desc, kernfs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, kernfs_pathconf }, /* pathconf */ { &vop_advlock_desc, genfs_einval }, /* advlock */ { &vop_bwrite_desc, genfs_eopnotsupp }, /* bwrite */ { &vop_getpages_desc, kernfs_getpages }, /* getpages */ { &vop_putpages_desc, genfs_putpages }, /* putpages */ { NULL, NULL } }; const struct vnodeopv_desc kernfs_vnodeop_opv_desc = { &kernfs_vnodeop_p, kernfs_vnodeop_entries }; int (**kernfs_specop_p)(void *); const struct vnodeopv_entry_desc kernfs_specop_entries[] = { { &vop_default_desc, vn_default_error }, GENFS_SPECOP_ENTRIES, { &vop_close_desc, spec_close }, /* close */ { &vop_access_desc, kernfs_access }, /* access */ { &vop_accessx_desc, genfs_accessx }, /* accessx */ { &vop_getattr_desc, kernfs_getattr }, /* getattr */ { &vop_setattr_desc, kernfs_setattr }, /* setattr */ { &vop_read_desc, spec_read }, /* read */ { &vop_write_desc, spec_write }, /* write */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_fsync_desc, spec_fsync }, /* fsync */ { &vop_inactive_desc, 
kernfs_inactive }, /* inactive */ { &vop_reclaim_desc, kernfs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_print_desc, kernfs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ { NULL, NULL } }; const struct vnodeopv_desc kernfs_specop_opv_desc = { &kernfs_specop_p, kernfs_specop_entries }; static inline int kernfs_fileop_compare(struct kernfs_fileop *a, struct kernfs_fileop *b) { if (a->kf_type < b->kf_type) return -1; if (a->kf_type > b->kf_type) return 1; if (a->kf_fileop < b->kf_fileop) return -1; if (a->kf_fileop > b->kf_fileop) return 1; return (0); } SPLAY_HEAD(kfsfileoptree, kernfs_fileop) kfsfileoptree = SPLAY_INITIALIZER(kfsfileoptree); SPLAY_PROTOTYPE(kfsfileoptree, kernfs_fileop, kf_node, kernfs_fileop_compare); SPLAY_GENERATE(kfsfileoptree, kernfs_fileop, kf_node, kernfs_fileop_compare); kfstype kernfs_alloctype(int nkf, const struct kernfs_fileop *kf) { static u_char nextfreetype = KFSlasttype; struct kernfs_fileop *dkf, *fkf, skf; int i; /* XXX need to keep track of dkf's memory if we support deallocating types */ dkf = malloc(sizeof(kernfs_default_fileops), M_TEMP, M_WAITOK); memcpy(dkf, kernfs_default_fileops, sizeof(kernfs_default_fileops)); for (i = 0; i < sizeof(kernfs_default_fileops) / sizeof(kernfs_default_fileops[0]); i++) { dkf[i].kf_type = nextfreetype; SPLAY_INSERT(kfsfileoptree, &kfsfileoptree, &dkf[i]); } for (i = 0; i < nkf; i++) { skf.kf_type = nextfreetype; skf.kf_fileop = kf[i].kf_fileop; if ((fkf = SPLAY_FIND(kfsfileoptree, &kfsfileoptree, &skf))) fkf->kf_vop = kf[i].kf_vop; } return nextfreetype++; } int kernfs_try_fileop(kfstype type, kfsfileop fileop, void *v, int error) { struct kernfs_fileop *kf, skf; skf.kf_type = type; skf.kf_fileop = fileop; if ((kf = SPLAY_FIND(kfsfileoptree, &kfsfileoptree, &skf))) if (kf->kf_vop) return kf->kf_vop(v); return error; } int kernfs_try_xread(kfstype type, const struct kernfs_node *kfs, char **bfp, size_t len, int error) { struct kernfs_fileop *kf, skf; skf.kf_type = type; skf.kf_fileop = KERNFS_XREAD; if ((kf = SPLAY_FIND(kfsfileoptree, &kfsfileoptree, &skf))) if (kf->kf_xread) return kf->kf_xread(kfs, bfp, len); return error; } int kernfs_try_xwrite(kfstype type, const struct kernfs_node *kfs, char *bf, size_t len, int error) { struct kernfs_fileop *kf, skf; skf.kf_type = type; skf.kf_fileop = KERNFS_XWRITE; if ((kf = SPLAY_FIND(kfsfileoptree, &kfsfileoptree, &skf))) if (kf->kf_xwrite) return kf->kf_xwrite(kfs, bf, len); return error; } int kernfs_addentry(kernfs_parentdir_t *pkt, kernfs_entry_t *dkt) { struct kernfs_subdir *ks, *parent; if (pkt == NULL) { SIMPLEQ_INSERT_TAIL(&dyn_kern_targets, dkt, dkt_queue); nkern_targets++; if (dkt->dkt_kt.kt_vtype == VDIR) nkern_dirs++; } else { parent = (struct kernfs_subdir *)pkt->kt_data; SIMPLEQ_INSERT_TAIL(&parent->ks_entries, dkt, dkt_queue); parent->ks_nentries++; if (dkt->dkt_kt.kt_vtype == VDIR) parent->ks_dirs++; } if (dkt->dkt_kt.kt_vtype == VDIR && dkt->dkt_kt.kt_data == NULL) { ks = malloc(sizeof(struct kernfs_subdir), M_TEMP, M_WAITOK); SIMPLEQ_INIT(&ks->ks_entries); ks->ks_nentries = 2; /* . and .. */ ks->ks_dirs = 2; ks->ks_parent = pkt ? 
pkt : &kern_targets[0]; dkt->dkt_kt.kt_data = ks; } return 0; } static int kernfs_xread(struct kernfs_node *kfs, int off, char **bufp, size_t len, size_t *wrlen) { const struct kern_target *kt; int err; kt = kfs->kfs_kt; switch (kfs->kfs_type) { case KFStime: { struct timeval tv; microtime(&tv); snprintf(*bufp, len, "%lld %ld\n", (long long)tv.tv_sec, (long)tv.tv_usec); break; } case KFSboottime: { struct timeval tv; /* * Historically, /kern/boottime only contained seconds. */ getmicroboottime(&tv); snprintf(*bufp, len, "%lld\n", (long long)tv.tv_sec); break; } case KFSint: { int *ip = kt->kt_data; snprintf(*bufp, len, "%d\n", *ip); break; } case KFSstring: { char *cp = kt->kt_data; *bufp = cp; break; } case KFSmsgbuf: { long n; /* * deal with cases where the message buffer has * become corrupted. */ if (!logenabled(msgbufp)) { msgbufenabled = 0; return (ENXIO); } /* * Note that reads of /kern/msgbuf won't necessarily yield * consistent results, if the message buffer is modified * while the read is in progress. The worst that can happen * is that incorrect data will be read. There's no way * that this can crash the system unless the values in the * message buffer header are corrupted, but that'll cause * the system to die anyway. */ if (off >= msgbufp->msg_bufs) { *wrlen = 0; return (0); } n = msgbufp->msg_bufx + off; if (n >= msgbufp->msg_bufs) n -= msgbufp->msg_bufs; len = uimin(msgbufp->msg_bufs - n, msgbufp->msg_bufs - off); *bufp = msgbufp->msg_bufc + n; *wrlen = len; return (0); } case KFShostname: { char *cp = hostname; size_t xlen = hostnamelen; if (xlen >= (len - 2)) return (EINVAL); memcpy(*bufp, cp, xlen); (*bufp)[xlen] = '\n'; (*bufp)[xlen+1] = '\0'; break; } case KFSavenrun: averunnable.fscale = FSCALE; snprintf(*bufp, len, "%d %d %d %ld\n", averunnable.ldavg[0], averunnable.ldavg[1], averunnable.ldavg[2], averunnable.fscale); break; default: err = kernfs_try_xread(kfs->kfs_type, kfs, bufp, len, EOPNOTSUPP); if (err) return err; } len = strlen(*bufp); if (len <= off) *wrlen = 0; else { *bufp += off; *wrlen = len - off; } return (0); } static int kernfs_xwrite(const struct kernfs_node *kfs, char *bf, size_t len) { switch (kfs->kfs_type) { case KFShostname: if (bf[len-1] == '\n') --len; memcpy(hostname, bf, len); hostname[len] = '\0'; hostnamelen = (size_t) len; return (0); default: return kernfs_try_xwrite(kfs->kfs_type, kfs, bf, len, EIO); } } /* * vp is the current namei directory * ndp is the name to locate in that directory... */ int kernfs_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap = v; struct componentname *cnp = ap->a_cnp; struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; const char *pname = cnp->cn_nameptr; const struct kernfs_node *kfs; const struct kern_target *kt; const struct dyn_kern_target *dkt; const struct kernfs_subdir *ks; int error, i; *vpp = NULLVP; if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) return (EROFS); if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; vref(dvp); return (0); } kfs = VTOKERN(dvp); switch (kfs->kfs_type) { case KFSkern: /* * Shouldn't get here with .. in the root node. 
*/ if (cnp->cn_flags & ISDOTDOT) return (EIO); for (i = 0; i < static_nkern_targets; i++) { kt = &kern_targets[i]; if (cnp->cn_namelen == kt->kt_namlen && memcmp(kt->kt_name, pname, cnp->cn_namelen) == 0) goto found; } SIMPLEQ_FOREACH(dkt, &dyn_kern_targets, dkt_queue) { if (cnp->cn_namelen == dkt->dkt_kt.kt_namlen && memcmp(dkt->dkt_kt.kt_name, pname, cnp->cn_namelen) == 0) { kt = &dkt->dkt_kt; goto found; } } break; found: error = vcache_get(dvp->v_mount, &kt, sizeof(kt), vpp); return error; case KFSsubdir: ks = (struct kernfs_subdir *)kfs->kfs_kt->kt_data; if (cnp->cn_flags & ISDOTDOT) { kt = ks->ks_parent; goto found; } SIMPLEQ_FOREACH(dkt, &ks->ks_entries, dkt_queue) { if (cnp->cn_namelen == dkt->dkt_kt.kt_namlen && memcmp(dkt->dkt_kt.kt_name, pname, cnp->cn_namelen) == 0) { kt = &dkt->dkt_kt; goto found; } } break; default: return (ENOTDIR); } return (cnp->cn_nameiop == LOOKUP ? ENOENT : EROFS); } int kernfs_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; struct kernfs_node *kfs = VTOKERN(ap->a_vp); return kernfs_try_fileop(kfs->kfs_type, KERNFS_FILEOP_OPEN, v, 0); } int kernfs_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct kernfs_node *kfs = VTOKERN(ap->a_vp); return kernfs_try_fileop(kfs->kfs_type, KERNFS_FILEOP_CLOSE, v, 0); } int kernfs_access(void *v) { struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; kauth_cred_t a_cred; } */ *ap = v; struct vattr va; int error; if ((error = VOP_GETATTR(ap->a_vp, &va, ap->a_cred)) != 0) return (error); return kauth_authorize_vnode(ap->a_cred, KAUTH_ACCESS_ACTION(ap->a_accmode, ap->a_vp->v_type, va.va_mode), ap->a_vp, NULL, genfs_can_access(ap->a_vp, ap->a_cred, va.va_uid, va.va_gid, va.va_mode, NULL, ap->a_accmode)); } static int kernfs_default_fileop_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; struct vattr *vap = ap->a_vap; vap->va_nlink = 1; vap->va_bytes = vap->va_size = 0; return 0; } int kernfs_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; struct kernfs_node *kfs = VTOKERN(ap->a_vp); struct kernfs_subdir *ks; struct vattr *vap = ap->a_vap; int error = 0; char strbuf[KSTRING], *bf; size_t nread, total; vattr_null(vap); vap->va_type = ap->a_vp->v_type; vap->va_uid = 0; vap->va_gid = 0; vap->va_mode = kfs->kfs_mode; vap->va_fileid = kfs->kfs_fileno; vap->va_flags = 0; vap->va_size = 0; vap->va_blocksize = DEV_BSIZE; /* Make all times be current TOD, except for the "boottime" node. 
*/ if (kfs->kfs_kt->kt_namlen == 8 && !memcmp(kfs->kfs_kt->kt_name, "boottime", 8)) { getnanoboottime(&vap->va_ctime); } else { getnanotime(&vap->va_ctime); } vap->va_atime = vap->va_mtime = vap->va_ctime; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = 0; vap->va_bytes = 0; switch (kfs->kfs_type) { case KFSkern: vap->va_nlink = nkern_dirs; vap->va_bytes = vap->va_size = DEV_BSIZE; break; case KFSdevice: vap->va_nlink = 1; vap->va_rdev = ap->a_vp->v_rdev; break; case KFSroot: vap->va_nlink = 1; vap->va_bytes = vap->va_size = DEV_BSIZE; break; case KFSsubdir: ks = (struct kernfs_subdir *)kfs->kfs_kt->kt_data; vap->va_nlink = ks->ks_dirs; vap->va_bytes = vap->va_size = DEV_BSIZE; break; case KFSnull: case KFStime: case KFSboottime: case KFSint: case KFSstring: case KFShostname: case KFSavenrun: case KFSmsgbuf: vap->va_nlink = 1; total = 0; do { bf = strbuf; error = kernfs_xread(kfs, total, &bf, sizeof(strbuf), &nread); total += nread; } while (error == 0 && nread != 0); vap->va_bytes = vap->va_size = total; break; default: error = kernfs_try_fileop(kfs->kfs_type, KERNFS_FILEOP_GETATTR, v, EINVAL); break; } return (error); } /*ARGSUSED*/ int kernfs_setattr(void *v) { /* * Silently ignore attribute changes. * This allows for open with truncate to have no * effect until some data is written. I want to * do it this way because all writes are atomic. */ return (0); } int kernfs_default_xread(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; struct uio *uio = ap->a_uio; struct kernfs_node *kfs = VTOKERN(ap->a_vp); char strbuf[KSTRING], *bf; int off; size_t len; int error; if (ap->a_vp->v_type == VDIR) return EISDIR; off = (int)uio->uio_offset; /* Don't allow negative offsets */ if (off < 0) return EINVAL; bf = strbuf; if ((error = kernfs_xread(kfs, off, &bf, sizeof(strbuf), &len)) == 0) error = uiomove(bf, len, uio); return (error); } int kernfs_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap = v; struct kernfs_node *kfs = VTOKERN(ap->a_vp); if (kfs->kfs_type < KFSlasttype) { /* use default function */ return kernfs_default_xread(v); } return kernfs_try_fileop(kfs->kfs_type, KERNFS_FILEOP_READ, v, EOPNOTSUPP); } static int kernfs_default_xwrite(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; struct kernfs_node *kfs = VTOKERN(ap->a_vp); struct uio *uio = ap->a_uio; int error; size_t xlen; char strbuf[KSTRING]; if (uio->uio_offset != 0) return (EINVAL); xlen = uimin(uio->uio_resid, KSTRING-1); if ((error = uiomove(strbuf, xlen, uio)) != 0) return (error); if (uio->uio_resid != 0) return (EIO); strbuf[xlen] = '\0'; xlen = strlen(strbuf); return (kernfs_xwrite(kfs, strbuf, xlen)); } int kernfs_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; struct kernfs_node *kfs = VTOKERN(ap->a_vp); if (kfs->kfs_type < KFSlasttype) { /* use default function */ return kernfs_default_xwrite(v); } return kernfs_try_fileop(kfs->kfs_type, KERNFS_FILEOP_WRITE, v, EOPNOTSUPP); } int kernfs_ioctl(void *v) { struct vop_ioctl_args /* { const struct vnodeop_desc *a_desc; struct vnode *a_vp; u_long a_command; void *a_data; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct kernfs_node *kfs = VTOKERN(ap->a_vp); return kernfs_try_fileop(kfs->kfs_type, KERNFS_FILEOP_IOCTL, v, EPASSTHROUGH); } static int 
kernfs_setdirentfileno_kt(struct dirent *d, const struct kern_target *kt, struct vop_readdir_args *ap) { struct kernfs_node *kfs; struct vnode *vp; int error; if ((error = vcache_get(ap->a_vp->v_mount, &kt, sizeof(kt), &vp)) != 0) return error; kfs = VTOKERN(vp); d->d_fileno = kfs->kfs_fileno; vrele(vp); return 0; } static int kernfs_setdirentfileno(struct dirent *d, off_t entry, struct kernfs_node *thisdir_kfs, const struct kern_target *parent_kt, const struct kern_target *kt, struct vop_readdir_args *ap) { const struct kern_target *ikt; int error; switch (entry) { case 0: d->d_fileno = thisdir_kfs->kfs_fileno; return 0; case 1: ikt = parent_kt; break; default: ikt = kt; break; } if (ikt != thisdir_kfs->kfs_kt) { if ((error = kernfs_setdirentfileno_kt(d, ikt, ap)) != 0) return error; } else d->d_fileno = thisdir_kfs->kfs_fileno; return 0; } int kernfs_readdir(void *v) { struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; int *a_eofflag; off_t **a_cookies; int a_*ncookies; } */ *ap = v; struct uio *uio = ap->a_uio; struct dirent d; struct kernfs_node *kfs = VTOKERN(ap->a_vp); const struct kern_target *kt; const struct dyn_kern_target *dkt = NULL; const struct kernfs_subdir *ks; off_t i, j; int error; off_t *cookies = NULL; int ncookies = 0, n; if (uio->uio_resid < UIO_MX) return (EINVAL); if (uio->uio_offset < 0) return (EINVAL); error = 0; i = uio->uio_offset; memset(&d, 0, sizeof(d)); d.d_reclen = UIO_MX; ncookies = uio->uio_resid / UIO_MX; switch (kfs->kfs_type) { case KFSkern: if (i >= nkern_targets) return (0); if (ap->a_ncookies) { ncookies = uimin(ncookies, (nkern_targets - i)); cookies = malloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } n = 0; for (; i < nkern_targets && uio->uio_resid >= UIO_MX; i++) { if (i < static_nkern_targets) kt = &kern_targets[i]; else { if (dkt == NULL) { dkt = SIMPLEQ_FIRST(&dyn_kern_targets); for (j = static_nkern_targets; j < i && dkt != NULL; j++) dkt = SIMPLEQ_NEXT(dkt, dkt_queue); if (j != i) break; } else { dkt = SIMPLEQ_NEXT(dkt, dkt_queue); } if (dkt == NULL) break; kt = &dkt->dkt_kt; } if (kt->kt_tag == KFSmsgbuf) { if (!logenabled(msgbufp)) { continue; } } d.d_namlen = kt->kt_namlen; if ((error = kernfs_setdirentfileno(&d, i, kfs, &kern_targets[0], kt, ap)) != 0) break; memcpy(d.d_name, kt->kt_name, kt->kt_namlen + 1); d.d_type = kt->kt_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; n++; } ncookies = n; break; case KFSroot: if (i >= 2) return 0; if (ap->a_ncookies) { ncookies = uimin(ncookies, (2 - i)); cookies = malloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } n = 0; for (; i < 2 && uio->uio_resid >= UIO_MX; i++) { kt = &kern_targets[i]; d.d_namlen = kt->kt_namlen; d.d_fileno = KERNFS_FILENO(kt, kt->kt_tag, 0); memcpy(d.d_name, kt->kt_name, kt->kt_namlen + 1); d.d_type = kt->kt_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; n++; } ncookies = n; break; case KFSsubdir: ks = (struct kernfs_subdir *)kfs->kfs_kt->kt_data; if (i >= ks->ks_nentries) return (0); if (ap->a_ncookies) { ncookies = uimin(ncookies, (ks->ks_nentries - i)); cookies = malloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } dkt = SIMPLEQ_FIRST(&ks->ks_entries); for (j = 0; j < i && dkt != NULL; j++) dkt = SIMPLEQ_NEXT(dkt, dkt_queue); n = 0; for (; i < ks->ks_nentries && uio->uio_resid >= UIO_MX; i++) { if (i < 2) kt = &subdir_targets[i]; else { /* check if ks_nentries 
lied to us */ if (dkt == NULL) break; kt = &dkt->dkt_kt; dkt = SIMPLEQ_NEXT(dkt, dkt_queue); } d.d_namlen = kt->kt_namlen; if ((error = kernfs_setdirentfileno(&d, i, kfs, ks->ks_parent, kt, ap)) != 0) break; memcpy(d.d_name, kt->kt_name, kt->kt_namlen + 1); d.d_type = kt->kt_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; n++; } ncookies = n; break; default: error = ENOTDIR; break; } if (ap->a_ncookies) { if (error) { if (cookies) free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } else *ap->a_ncookies = ncookies; } uio->uio_offset = i; return (error); } int kernfs_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; bool *a_recycle; } */ *ap = v; *ap->a_recycle = false; return (0); } int kernfs_reclaim(void *v) { struct vop_reclaim_v2_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; struct kernfs_node *kfs = VTOKERN(vp); VOP_UNLOCK(vp); vp->v_data = NULL; mutex_enter(&kfs_lock); TAILQ_REMOVE(&VFSTOKERNFS(vp->v_mount)->nodelist, kfs, kfs_list); mutex_exit(&kfs_lock); kmem_free(kfs, sizeof(struct kernfs_node)); return 0; } /* * Return POSIX pathconf information applicable to special devices. */ int kernfs_pathconf(void *v) { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_MAX_CANON: *ap->a_retval = MAX_CANON; return (0); case _PC_MAX_INPUT: *ap->a_retval = MAX_INPUT; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_VDISABLE: *ap->a_retval = _POSIX_VDISABLE; return (0); case _PC_SYNC_IO: *ap->a_retval = 1; return (0); default: return genfs_pathconf(ap); } /* NOTREACHED */ } /* * Print out the contents of a /dev/fd vnode. */ /* ARGSUSED */ int kernfs_print(void *v) { printf("tag VT_KERNFS, kernfs vnode\n"); return (0); } int kernfs_getpages(void *v) { struct vop_getpages_args /* { struct vnode *a_vp; voff_t a_offset; struct vm_page **a_m; int *a_count; int a_centeridx; vm_prot_t a_access_type; int a_advice; int a_flags; } */ *ap = v; if ((ap->a_flags & PGO_LOCKED) == 0) rw_exit(ap->a_vp->v_uobj.vmobjlock); return (EFAULT); }
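/*
 * Illustrative sketch, not part of the code above: how a hypothetical
 * caller could allocate a dynamic kernfs node type whose read handler
 * overrides kernfs_default_xread(), using kernfs_alloctype() and the
 * fileop dispatch in kernfs_try_fileop().  The names example_read,
 * example_fileops, example_type, example_init and the message text are
 * assumptions made for this example only.
 */
#if 0	/* example only */
static int
example_read(void *v)
{
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	static const char msg[] = "hello from a dynamic kernfs node\n";
	struct uio *uio = ap->a_uio;
	const size_t len = sizeof(msg) - 1;

	/* Serve the static string, honouring the read offset. */
	if (uio->uio_offset < 0)
		return EINVAL;
	if ((size_t)uio->uio_offset >= len)
		return 0;
	return uiomove(__UNCONST(msg + uio->uio_offset),
	    len - uio->uio_offset, uio);
}

static const struct kernfs_fileop example_fileops[] = {
	{ .kf_fileop = KERNFS_FILEOP_READ, .kf_vop = example_read },
};

static kfstype example_type;

static void
example_init(void)
{
	/* Types >= KFSlasttype are dispatched through kernfs_try_fileop(). */
	example_type = kernfs_alloctype(__arraycount(example_fileops),
	    example_fileops);
}
#endif	/* example only */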
/* $NetBSD: sys_ptrace.c,v 1.12 2022/07/10 14:07:55 riastradh Exp $ */ /*- * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_ptrace.c,v 1.12 2022/07/10 14:07:55 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_ptrace.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/errno.h> #include <sys/exec.h> #include <sys/pax.h> #include <sys/ptrace.h> #include <sys/uio.h> #include <sys/ras.h> #include <sys/kmem.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <sys/syscallvar.h> #include <sys/syscall.h> #include <sys/module.h> #include <uvm/uvm_extern.h> #include <machine/reg.h> /* * PTRACE methods */ static int ptrace_copyin_piod(struct ptrace_io_desc *piod, const void *addr, size_t len) { if (len != 0 && sizeof(*piod) != len) return EINVAL; return copyin(addr, piod, sizeof(*piod)); } static int ptrace_copyout_piod(const struct ptrace_io_desc *piod, void *addr, size_t len) { if (len != 0 && sizeof(*piod) != len) return EINVAL; return copyout(piod, addr, sizeof(*piod)); } static int ptrace_copyin_siginfo(struct ptrace_siginfo *psi, const void *addr, size_t len) { if (sizeof(*psi) != len) return EINVAL; return copyin(addr, psi, sizeof(*psi)); } static int ptrace_copyout_siginfo(const struct ptrace_siginfo *psi, void *addr, size_t len) { if (sizeof(*psi) != len) return EINVAL; return copyout(psi, addr, sizeof(*psi)); } static int ptrace_copyout_lwpstatus(const struct ptrace_lwpstatus *pls, void *addr, size_t len) { return copyout(pls, addr, len); } static struct ptrace_methods native_ptm = { .ptm_copyin_piod = ptrace_copyin_piod, .ptm_copyout_piod = ptrace_copyout_piod, .ptm_copyin_siginfo = ptrace_copyin_siginfo, .ptm_copyout_siginfo = ptrace_copyout_siginfo, .ptm_copyout_lwpstatus = ptrace_copyout_lwpstatus, .ptm_doregs = process_doregs, .ptm_dofpregs = process_dofpregs, .ptm_dodbregs = process_dodbregs, }; static const struct syscall_package ptrace_syscalls[] = { { SYS_ptrace, 0, (sy_call_t *)sys_ptrace }, { 0, 0, NULL }, }; /* * Process debugging system call. 
*/ int sys_ptrace(struct lwp *l, const struct sys_ptrace_args *uap, register_t *retval) { /* { syscallarg(int) req; syscallarg(pid_t) pid; syscallarg(void *) addr; syscallarg(int) data; } */ return do_ptrace(&native_ptm, l, SCARG(uap, req), SCARG(uap, pid), SCARG(uap, addr), SCARG(uap, data), retval); } #define DEPS "ptrace_common" MODULE(MODULE_CLASS_EXEC, ptrace, DEPS); static int ptrace_init(void) { int error; error = syscall_establish(&emul_netbsd, ptrace_syscalls); return error; } static int ptrace_fini(void) { int error; error = syscall_disestablish(&emul_netbsd, ptrace_syscalls); return error; } static int ptrace_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = ptrace_init(); break; case MODULE_CMD_FINI: error = ptrace_fini(); break; default: error = ENOTTY; break; } return error; }
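The ptm_copyin_piod/ptm_copyout_piod hooks above marshal a struct ptrace_io_desc between userland and the kernel for PT_IO requests; note that they accept a length of either 0 or sizeof(struct ptrace_io_desc). The sketch below is a hypothetical userland helper (read_remote is not part of the sources) showing what such a request looks like from the tracing process; it assumes the caller has already attached with PT_ATTACH and waited for the target to stop.

#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

/* hypothetical helper: read len bytes at offset off in a stopped tracee */
static int
read_remote(pid_t pid, void *off, void *buf, size_t len)
{
	struct ptrace_io_desc pio = {
		.piod_op = PIOD_READ_D,	/* read from the tracee's data space */
		.piod_offs = off,	/* address in the traced process */
		.piod_addr = buf,	/* buffer in the tracing process */
		.piod_len = len,
	};

	/* ptrace_copyin_piod() above accepts data == 0 or sizeof(pio) */
	if (ptrace(PT_IO, pid, &pio, (int)sizeof(pio)) == -1)
		return -1;
	/* on success pio.piod_len holds the number of bytes transferred */
	return 0;
}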
/* $NetBSD: uvm_50.c,v 1.3 2020/09/05 16:30:10 riastradh Exp $ */ /*- * Copyright (c) 2018 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christos Zoulas. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_50.c,v 1.3 2020/09/05 16:30:10 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #if defined(_KERNEL) || defined(_MODULE) #if defined(_KERNEL_OPT) #include "opt_vmswap.h" #else #define VMSWAP /* XXX */ #endif #endif #include <sys/param.h> #include <sys/types.h> #include <sys/systm.h> #include <sys/syscallargs.h> #include <sys/swap.h> #include <uvm/uvm_swap.h> #include <compat/sys/uvm.h> static void swapent50_cvt(void *p, const struct swapent *se) { struct swapent50 *sep50 = p; sep50->se50_dev = se->se_dev; sep50->se50_flags = se->se_flags; sep50->se50_nblks = se->se_nblks; sep50->se50_inuse = se->se_inuse; sep50->se50_priority = se->se_priority; KASSERT(sizeof(se->se_path) <= sizeof(sep50->se50_path)); strcpy(sep50->se50_path, se->se_path); } static int compat_uvm_swap_stats50(const struct sys_swapctl_args *uap, register_t *retval) { return uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc), swapent50_cvt, sizeof(struct swapent50), retval); } void uvm_50_init(void) { uvm_swap_stats50 = compat_uvm_swap_stats50; } void uvm_50_fini(void) { uvm_swap_stats50 = (void *)enosys; }
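uvm_50.c works by handing uvm_swap_stats() a per-version conversion callback (swapent50_cvt) together with the size of the legacy record, so the shared code can emit old-layout entries without knowing about them. The sketch below restates that idiom with hypothetical types (struct item, struct item_v1) and a hypothetical dumper (dump_items); it is illustrative only and does not reproduce the actual uvm_swap_stats() implementation.

#include <stddef.h>
#include <string.h>

/* current and legacy record layouts (hypothetical stand-ins for
 * struct swapent and struct swapent50) */
struct item    { int dev; int flags; int nblks; char path[64]; };
struct item_v1 { int dev; int flags; char path[32]; };

/* legacy converter, same shape as swapent50_cvt() above */
static void
item_v1_cvt(void *p, const struct item *it)
{
	struct item_v1 *v1 = p;

	v1->dev = it->dev;
	v1->flags = it->flags;
	strncpy(v1->path, it->path, sizeof(v1->path) - 1);
	v1->path[sizeof(v1->path) - 1] = '\0';
}

/*
 * Generic dumper in the style of uvm_swap_stats(): walks the current
 * records and emits one esize-sized converted record per entry.
 */
static size_t
dump_items(const struct item *items, size_t n, void *out, size_t esize,
    void (*cvt)(void *, const struct item *))
{
	char *p = out;
	size_t i;

	for (i = 0; i < n; i++, p += esize)
		cvt(p, &items[i]);
	return n;
}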
/* $NetBSD: mld6.c,v 1.101 2019/09/25 09:53:38 ozaki-r Exp $ */ /* $KAME: mld6.c,v 1.25 2001/01/16 14:14:18 itojun Exp $ */ /* * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Copyright (c) 1988 Stephen Deering. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: mld6.c,v 1.101 2019/09/25 09:53:38 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/syslog.h> #include <sys/sysctl.h> #include <sys/kernel.h> #include <sys/callout.h> #include <sys/cprng.h> #include <sys/rwlock.h> #include <net/if.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet6/in6_var.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #include <netinet/icmp6.h> #include <netinet6/icmp6_private.h> #include <netinet6/mld6_var.h> static krwlock_t in6_multilock __cacheline_aligned; /* * Protocol constants */ /* * time between repetitions of a node's initial report of interest in a * multicast address(in seconds) */ #define MLD_UNSOLICITED_REPORT_INTERVAL 10 static struct ip6_pktopts ip6_opts; static void mld_start_listening(struct in6_multi *); static void mld_stop_listening(struct in6_multi *); static struct mld_hdr *mld_allocbuf(struct mbuf **, struct in6_multi *, int); static void mld_sendpkt(struct in6_multi *, int, const struct in6_addr *); static void mld_starttimer(struct in6_multi *); static void mld_stoptimer(struct in6_multi *); static u_long mld_timerresid(struct in6_multi *); static void in6m_ref(struct in6_multi *); static void in6m_unref(struct in6_multi *); static void in6m_destroy(struct in6_multi *); void mld_init(void) { static u_int8_t hbh_buf[8]; struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf; u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD); /* ip6h_nxt will be fill in later */ hbh->ip6h_len = 0; /* (8 >> 3) - 1 */ /* XXX: grotty hard coding... */ hbh_buf[2] = IP6OPT_PADN; /* 2 byte padding */ hbh_buf[3] = 0; hbh_buf[4] = IP6OPT_RTALERT; hbh_buf[5] = IP6OPT_RTALERT_LEN - 2; memcpy(&hbh_buf[6], (void *)&rtalert_code, sizeof(u_int16_t)); ip6_opts.ip6po_hbh = hbh; /* We will specify the hoplimit by a multicast option. */ ip6_opts.ip6po_hlim = -1; ip6_opts.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER; rw_init(&in6_multilock); } static void mld_starttimer(struct in6_multi *in6m) { struct timeval now; KASSERT(rw_write_held(&in6_multilock)); KASSERTMSG(in6m->in6m_timer != IN6M_TIMER_UNDEF, "in6m_timer=%d", in6m->in6m_timer); microtime(&now); in6m->in6m_timer_expire.tv_sec = now.tv_sec + in6m->in6m_timer / hz; in6m->in6m_timer_expire.tv_usec = now.tv_usec + (in6m->in6m_timer % hz) * (1000000 / hz); if (in6m->in6m_timer_expire.tv_usec > 1000000) { in6m->in6m_timer_expire.tv_sec++; in6m->in6m_timer_expire.tv_usec -= 1000000; } /* start or restart the timer */ callout_schedule(&in6m->in6m_timer_ch, in6m->in6m_timer); } /* * mld_stoptimer releases in6_multilock when calling callout_halt. * The caller must ensure in6m won't be freed while releasing the lock. 
*/ static void mld_stoptimer(struct in6_multi *in6m) { KASSERT(rw_write_held(&in6_multilock)); if (in6m->in6m_timer == IN6M_TIMER_UNDEF) return; rw_exit(&in6_multilock); callout_halt(&in6m->in6m_timer_ch, NULL); rw_enter(&in6_multilock, RW_WRITER); in6m->in6m_timer = IN6M_TIMER_UNDEF; } static void mld_timeo(void *arg) { struct in6_multi *in6m = arg; KASSERTMSG(in6m->in6m_refcount > 0, "in6m_refcount=%d", in6m->in6m_refcount); KERNEL_LOCK_UNLESS_NET_MPSAFE(); rw_enter(&in6_multilock, RW_WRITER); if (in6m->in6m_timer == IN6M_TIMER_UNDEF) goto out; in6m->in6m_timer = IN6M_TIMER_UNDEF; switch (in6m->in6m_state) { case MLD_REPORTPENDING: mld_start_listening(in6m); break; default: mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); break; } out: rw_exit(&in6_multilock); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } static u_long mld_timerresid(struct in6_multi *in6m) { struct timeval now, diff; microtime(&now); if (now.tv_sec > in6m->in6m_timer_expire.tv_sec || (now.tv_sec == in6m->in6m_timer_expire.tv_sec && now.tv_usec > in6m->in6m_timer_expire.tv_usec)) { return (0); } diff = in6m->in6m_timer_expire; diff.tv_sec -= now.tv_sec; diff.tv_usec -= now.tv_usec; if (diff.tv_usec < 0) { diff.tv_sec--; diff.tv_usec += 1000000; } /* return the remaining time in milliseconds */ return diff.tv_sec * 1000 + diff.tv_usec / 1000; } static void mld_start_listening(struct in6_multi *in6m) { struct in6_addr all_in6; KASSERT(rw_write_held(&in6_multilock)); /* * RFC2710 page 10: * The node never sends a Report or Done for the link-scope all-nodes * address. * MLD messages are never sent for multicast addresses whose scope is 0 * (reserved) or 1 (node-local). */ all_in6 = in6addr_linklocal_allnodes; if (in6_setscope(&all_in6, in6m->in6m_ifp, NULL)) { /* XXX: this should not happen! */ in6m->in6m_timer = 0; in6m->in6m_state = MLD_OTHERLISTENER; } if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) || IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) { in6m->in6m_timer = IN6M_TIMER_UNDEF; in6m->in6m_state = MLD_OTHERLISTENER; } else { mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); in6m->in6m_timer = cprng_fast32() % (MLD_UNSOLICITED_REPORT_INTERVAL * hz); in6m->in6m_state = MLD_IREPORTEDLAST; mld_starttimer(in6m); } } static void mld_stop_listening(struct in6_multi *in6m) { struct in6_addr allnode, allrouter; KASSERT(rw_lock_held(&in6_multilock)); allnode = in6addr_linklocal_allnodes; if (in6_setscope(&allnode, in6m->in6m_ifp, NULL)) { /* XXX: this should not happen! */ return; } allrouter = in6addr_linklocal_allrouters; if (in6_setscope(&allrouter, in6m->in6m_ifp, NULL)) { /* XXX impossible */ return; } if (in6m->in6m_state == MLD_IREPORTEDLAST && (!IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &allnode)) && IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) > IPV6_ADDR_SCOPE_INTFACELOCAL) { mld_sendpkt(in6m, MLD_LISTENER_DONE, &allrouter); } } void mld_input(struct mbuf *m, int off) { struct ip6_hdr *ip6; struct mld_hdr *mldh; struct ifnet *ifp; struct in6_multi *in6m = NULL; struct in6_addr mld_addr, all_in6; u_long timer = 0; /* timer value in the MLD query header */ struct psref psref; ifp = m_get_rcvif_psref(m, &psref); if (__predict_false(ifp == NULL)) goto out; IP6_EXTHDR_GET(mldh, struct mld_hdr *, m, off, sizeof(*mldh)); if (mldh == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); goto out_nodrop; } ip6 = mtod(m, struct ip6_hdr *); /* source address validation */ if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) { /* * RFC3590 allows the IPv6 unspecified address as the source * address of MLD report and done messages. 
However, as this * same document says, this special rule is for snooping * switches and the RFC requires routers to discard MLD packets * with the unspecified source address. The RFC only talks * about hosts receiving an MLD query or report in Security * Considerations, but this is probably the correct intention. * RFC3590 does not talk about other cases than link-local and * the unspecified source addresses, but we believe the same * rule should be applied. * As a result, we only allow link-local addresses as the * source address; otherwise, simply discard the packet. */ #if 0 /* * XXX: do not log in an input path to avoid log flooding, * though RFC3590 says "SHOULD log" if the source of a query * is the unspecified address. */ char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufm[INET6_ADDRSTRLEN]; log(LOG_INFO, "mld_input: src %s is not link-local (grp=%s)\n", IN6_PRINT(ip6bufs,&ip6->ip6_src), IN6_PRINT(ip6bufm, &mldh->mld_addr)); #endif goto out; } /* * make a copy for local work (in6_setscope() may modify the 1st arg) */ mld_addr = mldh->mld_addr; if (in6_setscope(&mld_addr, ifp, NULL)) { /* XXX: this should not happen! */ goto out; } /* * In the MLD specification, there are 3 states and a flag. * * In Non-Listener state, we simply don't have a membership record. * In Delaying Listener state, our timer is running (in6m->in6m_timer) * In Idle Listener state, our timer is not running * (in6m->in6m_timer==IN6M_TIMER_UNDEF) * * The flag is in6m->in6m_state, it is set to MLD_OTHERLISTENER if * we have heard a report from another member, or MLD_IREPORTEDLAST * if we sent the last report. */ switch (mldh->mld_type) { case MLD_LISTENER_QUERY: { struct in6_multi *next; if (ifp->if_flags & IFF_LOOPBACK) break; if (!IN6_IS_ADDR_UNSPECIFIED(&mld_addr) && !IN6_IS_ADDR_MULTICAST(&mld_addr)) break; /* print error or log stat? */ all_in6 = in6addr_linklocal_allnodes; if (in6_setscope(&all_in6, ifp, NULL)) { /* XXX: this should not happen! */ break; } /* * - Start the timers in all of our membership records * that the query applies to for the interface on * which the query arrived excl. those that belong * to the "all-nodes" group (ff02::1). * - Restart any timer that is already running but has * a value longer than the requested timeout. * - Use the value specified in the query message as * the maximum timeout. */ timer = ntohs(mldh->mld_maxdelay); rw_enter(&in6_multilock, RW_WRITER); /* * mld_stoptimer and mld_sendpkt release in6_multilock * temporarily, so we have to prevent in6m from being freed * while releasing the lock by having an extra reference to it. * * Also in6_purge_multi might remove items from the list of the * ifp while releasing the lock. Fortunately in6_purge_multi is * never executed as long as we have a psref of the ifp. 
*/ LIST_FOREACH_SAFE(in6m, &ifp->if_multiaddrs, in6m_entry, next) { if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) || IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) continue; if (in6m->in6m_state == MLD_REPORTPENDING) continue; /* we are not yet ready */ if (!IN6_IS_ADDR_UNSPECIFIED(&mld_addr) && !IN6_ARE_ADDR_EQUAL(&mld_addr, &in6m->in6m_addr)) continue; if (timer == 0) { in6m_ref(in6m); /* send a report immediately */ mld_stoptimer(in6m); mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); in6m->in6m_state = MLD_IREPORTEDLAST; in6m_unref(in6m); /* May free in6m */ } else if (in6m->in6m_timer == IN6M_TIMER_UNDEF || mld_timerresid(in6m) > timer) { in6m->in6m_timer = 1 + (cprng_fast32() % timer) * hz / 1000; mld_starttimer(in6m); } } rw_exit(&in6_multilock); break; } case MLD_LISTENER_REPORT: /* * For fast leave to work, we have to know that we are the * last person to send a report for this group. Reports * can potentially get looped back if we are a multicast * router, so discard reports sourced by me. * Note that it is impossible to check IFF_LOOPBACK flag of * ifp for this purpose, since ip6_mloopback pass the physical * interface to looutput. */ if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */ break; if (!IN6_IS_ADDR_MULTICAST(&mldh->mld_addr)) break; /* * If we belong to the group being reported, stop * our timer for that group. */ rw_enter(&in6_multilock, RW_WRITER); in6m = in6_lookup_multi(&mld_addr, ifp); if (in6m) { in6m_ref(in6m); mld_stoptimer(in6m); /* transit to idle state */ in6m->in6m_state = MLD_OTHERLISTENER; /* clear flag */ in6m_unref(in6m); in6m = NULL; /* in6m might be freed */ } rw_exit(&in6_multilock); break; default: /* this is impossible */ #if 0 /* * this case should be impossible because of filtering in * icmp6_input(). But we explicitly disabled this part * just in case. */ log(LOG_ERR, "mld_input: illegal type(%d)", mldh->mld_type); #endif break; } out: m_freem(m); out_nodrop: m_put_rcvif_psref(ifp, &psref); } /* * XXX mld_sendpkt must be called with in6_multilock held and * will release in6_multilock before calling ip6_output and * returning to avoid locking against myself in ip6_output. */ static void mld_sendpkt(struct in6_multi *in6m, int type, const struct in6_addr *dst) { struct mbuf *mh; struct mld_hdr *mldh; struct ip6_hdr *ip6 = NULL; struct ip6_moptions im6o; struct in6_ifaddr *ia = NULL; struct ifnet *ifp = in6m->in6m_ifp; int ignflags; struct psref psref; int bound; KASSERT(rw_write_held(&in6_multilock)); /* * At first, find a link local address on the outgoing interface * to use as the source address of the MLD packet. * We do not reject tentative addresses for MLD report to deal with * the case where we first join a link-local address. */ ignflags = (IN6_IFF_NOTREADY|IN6_IFF_ANYCAST) & ~IN6_IFF_TENTATIVE; bound = curlwp_bind(); ia = in6ifa_ifpforlinklocal_psref(ifp, ignflags, &psref); if (ia == NULL) { curlwp_bindx(bound); return; } if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) { ia6_release(ia, &psref); ia = NULL; } /* Allocate two mbufs to store IPv6 header and MLD header */ mldh = mld_allocbuf(&mh, in6m, type); if (mldh == NULL) { ia6_release(ia, &psref); curlwp_bindx(bound); return; } /* fill src/dst here */ ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any; ip6->ip6_dst = dst ? 
*dst : in6m->in6m_addr; ia6_release(ia, &psref); curlwp_bindx(bound); mldh->mld_addr = in6m->in6m_addr; in6_clearscope(&mldh->mld_addr); /* XXX */ mldh->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mld_hdr)); /* construct multicast option */ memset(&im6o, 0, sizeof(im6o)); im6o.im6o_multicast_if_index = if_get_index(ifp); im6o.im6o_multicast_hlim = 1; /* * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing daemon can hear it. */ im6o.im6o_multicast_loop = (ip6_mrouter != NULL); /* increment output statistics */ ICMP6_STATINC(ICMP6_STAT_OUTHIST + type); icmp6_ifstat_inc(ifp, ifs6_out_msg); switch (type) { case MLD_LISTENER_QUERY: icmp6_ifstat_inc(ifp, ifs6_out_mldquery); break; case MLD_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_out_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(ifp, ifs6_out_mlddone); break; } /* XXX we cannot call ip6_output with holding in6_multilock */ rw_exit(&in6_multilock); ip6_output(mh, &ip6_opts, NULL, ia ? 0 : IPV6_UNSPECSRC, &im6o, NULL, NULL); rw_enter(&in6_multilock, RW_WRITER); } static struct mld_hdr * mld_allocbuf(struct mbuf **mh, struct in6_multi *in6m, int type) { struct mbuf *md; struct mld_hdr *mldh; struct ip6_hdr *ip6; /* * Allocate mbufs to store ip6 header and MLD header. * We allocate 2 mbufs and make chain in advance because * it is more convenient when inserting the hop-by-hop option later. */ MGETHDR(*mh, M_DONTWAIT, MT_HEADER); if (*mh == NULL) return NULL; MGET(md, M_DONTWAIT, MT_DATA); if (md == NULL) { m_free(*mh); *mh = NULL; return NULL; } (*mh)->m_next = md; md->m_next = NULL; m_reset_rcvif((*mh)); (*mh)->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr); (*mh)->m_len = sizeof(struct ip6_hdr); m_align(*mh, sizeof(struct ip6_hdr)); /* fill in the ip6 header */ ip6 = mtod(*mh, struct ip6_hdr *); memset(ip6, 0, sizeof(*ip6)); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; /* ip6_hlim will be set by im6o.im6o_multicast_hlim */ /* ip6_src/dst will be set by mld_sendpkt() or mld_sendbuf() */ /* fill in the MLD header as much as possible */ md->m_len = sizeof(struct mld_hdr); mldh = mtod(md, struct mld_hdr *); memset(mldh, 0, sizeof(struct mld_hdr)); mldh->mld_type = type; return mldh; } static void in6m_ref(struct in6_multi *in6m) { KASSERT(rw_write_held(&in6_multilock)); in6m->in6m_refcount++; } static void in6m_unref(struct in6_multi *in6m) { KASSERT(rw_write_held(&in6_multilock)); if (--in6m->in6m_refcount == 0) in6m_destroy(in6m); } /* * Add an address to the list of IP6 multicast addresses for a given interface. */ struct in6_multi * in6_addmulti(struct in6_addr *maddr6, struct ifnet *ifp, int *errorp, int timer) { struct sockaddr_in6 sin6; struct in6_multi *in6m; *errorp = 0; rw_enter(&in6_multilock, RW_WRITER); /* * See if address already in list. */ in6m = in6_lookup_multi(maddr6, ifp); if (in6m != NULL) { /* * Found it; just increment the reference count. */ in6m->in6m_refcount++; } else { /* * New address; allocate a new multicast record * and link it into the interface's multicast list. 
*/ in6m = malloc(sizeof(*in6m), M_IPMADDR, M_NOWAIT|M_ZERO); if (in6m == NULL) { *errorp = ENOBUFS; goto out; } in6m->in6m_addr = *maddr6; in6m->in6m_ifp = ifp; in6m->in6m_refcount = 1; in6m->in6m_timer = IN6M_TIMER_UNDEF; callout_init(&in6m->in6m_timer_ch, CALLOUT_MPSAFE); callout_setfunc(&in6m->in6m_timer_ch, mld_timeo, in6m); LIST_INSERT_HEAD(&ifp->if_multiaddrs, in6m, in6m_entry); /* * Ask the network driver to update its multicast reception * filter appropriately for the new address. */ sockaddr_in6_init(&sin6, maddr6, 0, 0, 0); *errorp = if_mcast_op(ifp, SIOCADDMULTI, sin6tosa(&sin6)); if (*errorp) { callout_destroy(&in6m->in6m_timer_ch); LIST_REMOVE(in6m, in6m_entry); free(in6m, M_IPMADDR); in6m = NULL; goto out; } in6m->in6m_timer = timer; if (in6m->in6m_timer > 0) { in6m->in6m_state = MLD_REPORTPENDING; mld_starttimer(in6m); goto out; } /* * Let MLD6 know that we have joined a new IP6 multicast * group. */ mld_start_listening(in6m); } out: rw_exit(&in6_multilock); return in6m; } static void in6m_destroy(struct in6_multi *in6m) { struct sockaddr_in6 sin6; KASSERT(rw_write_held(&in6_multilock)); KASSERTMSG(in6m->in6m_refcount == 0, "in6m_refcount=%d", in6m->in6m_refcount); /* * Unlink from list if it's listed. This must be done before * mld_stop_listening because it releases in6_multilock and that allows * someone to look up the removing in6m from the list and add a * reference to the entry unexpectedly. */ if (in6_lookup_multi(&in6m->in6m_addr, in6m->in6m_ifp) != NULL) LIST_REMOVE(in6m, in6m_entry); /* * No remaining claims to this record; let MLD6 know * that we are leaving the multicast group. */ mld_stop_listening(in6m); /* * Delete all references of this multicasting group from * the membership arrays */ in6_purge_mcast_references(in6m); /* * Notify the network driver to update its multicast * reception filter. */ sockaddr_in6_init(&sin6, &in6m->in6m_addr, 0, 0, 0); if_mcast_op(in6m->in6m_ifp, SIOCDELMULTI, sin6tosa(&sin6)); /* Tell mld_timeo we're halting the timer */ in6m->in6m_timer = IN6M_TIMER_UNDEF; rw_exit(&in6_multilock); callout_halt(&in6m->in6m_timer_ch, NULL); callout_destroy(&in6m->in6m_timer_ch); free(in6m, M_IPMADDR); rw_enter(&in6_multilock, RW_WRITER); } /* * Delete a multicast address record. */ void in6_delmulti_locked(struct in6_multi *in6m) { KASSERT(rw_write_held(&in6_multilock)); KASSERTMSG(in6m->in6m_refcount > 0, "in6m_refcount=%d", in6m->in6m_refcount); /* * The caller should have a reference to in6m. So we don't need to care * of releasing the lock in mld_stoptimer. */ mld_stoptimer(in6m); if (--in6m->in6m_refcount == 0) in6m_destroy(in6m); } void in6_delmulti(struct in6_multi *in6m) { rw_enter(&in6_multilock, RW_WRITER); in6_delmulti_locked(in6m); rw_exit(&in6_multilock); } /* * Look up the in6_multi record for a given IP6 multicast address * on a given interface. If no matching record is found, "in6m" * returns NULL. 
*/ struct in6_multi * in6_lookup_multi(const struct in6_addr *addr, const struct ifnet *ifp) { struct in6_multi *in6m; KASSERT(rw_lock_held(&in6_multilock)); LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) { if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, addr)) break; } return in6m; } void in6_lookup_and_delete_multi(const struct in6_addr *addr, const struct ifnet *ifp) { struct in6_multi *in6m; rw_enter(&in6_multilock, RW_WRITER); in6m = in6_lookup_multi(addr, ifp); if (in6m != NULL) in6_delmulti_locked(in6m); rw_exit(&in6_multilock); } bool in6_multi_group(const struct in6_addr *addr, const struct ifnet *ifp) { bool ingroup; rw_enter(&in6_multilock, RW_READER); ingroup = in6_lookup_multi(addr, ifp) != NULL; rw_exit(&in6_multilock); return ingroup; } /* * Purge in6_multi records associated to the interface. */ void in6_purge_multi(struct ifnet *ifp) { struct in6_multi *in6m, *next; rw_enter(&in6_multilock, RW_WRITER); LIST_FOREACH_SAFE(in6m, &ifp->if_multiaddrs, in6m_entry, next) { LIST_REMOVE(in6m, in6m_entry); /* * Normally multicast addresses are already purged at this * point. Remaining references aren't accessible via ifp, * so what we can do here is to prevent ifp from being * accessed via in6m by removing it from the list of ifp. */ mld_stoptimer(in6m); } rw_exit(&in6_multilock); } void in6_multi_lock(int op) { rw_enter(&in6_multilock, op); } void in6_multi_unlock(void) { rw_exit(&in6_multilock); } bool in6_multi_locked(int op) { switch (op) { case RW_READER: return rw_read_held(&in6_multilock); case RW_WRITER: return rw_write_held(&in6_multilock); default: return rw_lock_held(&in6_multilock); } } struct in6_multi_mship * in6_joingroup(struct ifnet *ifp, struct in6_addr *addr, int *errorp, int timer) { struct in6_multi_mship *imm; imm = malloc(sizeof(*imm), M_IPMADDR, M_NOWAIT|M_ZERO); if (imm == NULL) { *errorp = ENOBUFS; return NULL; } imm->i6mm_maddr = in6_addmulti(addr, ifp, errorp, timer); if (!imm->i6mm_maddr) { /* *errorp is already set */ free(imm, M_IPMADDR); return NULL; } return imm; } int in6_leavegroup(struct in6_multi_mship *imm) { struct in6_multi *in6m; rw_enter(&in6_multilock, RW_WRITER); in6m = imm->i6mm_maddr; imm->i6mm_maddr = NULL; if (in6m != NULL) { in6_delmulti_locked(in6m); } rw_exit(&in6_multilock); free(imm, M_IPMADDR); return 0; } /* * DEPRECATED: keep it just to avoid breaking old sysctl users. 
*/ static int in6_mkludge_sysctl(SYSCTLFN_ARGS) { if (namelen != 1) return EINVAL; *oldlenp = 0; return 0; } static int in6_multicast_sysctl(SYSCTLFN_ARGS) { struct ifnet *ifp; struct ifaddr *ifa; struct in6_ifaddr *ia6; struct in6_multi *in6m; uint32_t tmp; int error; size_t written; struct psref psref, psref_ia; int bound, s; if (namelen != 1) return EINVAL; rw_enter(&in6_multilock, RW_READER); bound = curlwp_bind(); ifp = if_get_byindex(name[0], &psref); if (ifp == NULL) { curlwp_bindx(bound); rw_exit(&in6_multilock); return ENODEV; } if (oldp == NULL) { *oldlenp = 0; s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) { *oldlenp += 2 * sizeof(struct in6_addr) + sizeof(uint32_t); } } pserialize_read_exit(s); if_put(ifp, &psref); curlwp_bindx(bound); rw_exit(&in6_multilock); return 0; } error = 0; written = 0; s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa_acquire(ifa, &psref_ia); pserialize_read_exit(s); ia6 = ifatoia6(ifa); LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) { if (written + 2 * sizeof(struct in6_addr) + sizeof(uint32_t) > *oldlenp) goto done; /* * XXX return the first IPv6 address to keep backward * compatibility, however now multicast addresses * don't belong to any IPv6 addresses so it should be * unnecessary. */ error = sysctl_copyout(l, &ia6->ia_addr.sin6_addr, oldp, sizeof(struct in6_addr)); if (error) goto done; oldp = (char *)oldp + sizeof(struct in6_addr); written += sizeof(struct in6_addr); error = sysctl_copyout(l, &in6m->in6m_addr, oldp, sizeof(struct in6_addr)); if (error) goto done; oldp = (char *)oldp + sizeof(struct in6_addr); written += sizeof(struct in6_addr); tmp = in6m->in6m_refcount; error = sysctl_copyout(l, &tmp, oldp, sizeof(tmp)); if (error) goto done; oldp = (char *)oldp + sizeof(tmp); written += sizeof(tmp); } s = pserialize_read_enter(); break; } pserialize_read_exit(s); done: ifa_release(ifa, &psref_ia); if_put(ifp, &psref); curlwp_bindx(bound); rw_exit(&in6_multilock); *oldlenp = written; return error; } void in6_sysctl_multicast_setup(struct sysctllog **clog) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet6", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET6, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "multicast", SYSCTL_DESCR("Multicast information"), in6_multicast_sysctl, 0, NULL, 0, CTL_NET, PF_INET6, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "multicast_kludge", SYSCTL_DESCR("multicast kludge information"), in6_mkludge_sysctl, 0, NULL, 0, CTL_NET, PF_INET6, CTL_CREATE, CTL_EOL); }
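in6_joingroup() and in6_leavegroup() above wrap the reference-counted in6_addmulti()/in6_delmulti() machinery in a small membership handle. The following is a minimal sketch, under stated assumptions, of how a kernel consumer might hold such a membership: struct softc, example_join and example_leave are hypothetical, and the group address is assumed to already carry its scope zone (via in6_setscope), as the real callers ensure.

#include <sys/systm.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>

struct softc {				/* hypothetical consumer state */
	struct in6_multi_mship *sc_imm;
};

static int
example_join(struct ifnet *ifp, struct softc *sc)
{
	/* example group; assumed to already have its scope zone embedded */
	struct in6_addr maddr = in6addr_linklocal_allnodes;
	int error;

	/* timer 0: start listening (and report) right away; a positive
	 * timer defers the first report (MLD_REPORTPENDING) */
	sc->sc_imm = in6_joingroup(ifp, &maddr, &error, 0);
	if (sc->sc_imm == NULL)
		return error;
	return 0;
}

static void
example_leave(struct softc *sc)
{
	if (sc->sc_imm != NULL) {
		/* drops the in6_multi reference and frees the handle */
		in6_leavegroup(sc->sc_imm);
		sc->sc_imm = NULL;
	}
}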
/* $NetBSD: strlcat.c,v 1.4 2013/01/23 07:57:27 matt Exp $ */ /* $OpenBSD: strlcat.c,v 1.10 2003/04/12 21:56:39 millert Exp $ */ /* * Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND TODD C. MILLER DISCLAIMS ALL * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL TODD C. MILLER BE LIABLE * FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #if !defined(_KERNEL) && !defined(_STANDALONE) #if HAVE_NBTOOL_CONFIG_H #include "nbtool_config.h" #endif #include <sys/cdefs.h> #if defined(LIBC_SCCS) && !defined(lint) __RCSID("$NetBSD: strlcat.c,v 1.4 2013/01/23 07:57:27 matt Exp $"); #endif /* LIBC_SCCS and not lint */ #ifdef _LIBC #include "namespace.h" #endif #include <sys/types.h> #include <assert.h> #include <string.h> #ifdef _LIBC # ifdef __weak_alias __weak_alias(strlcat, _strlcat) # endif #endif #else #include <lib/libkern/libkern.h> #endif /* !_KERNEL && !_STANDALONE */ #if !HAVE_STRLCAT /* * Appends src to string dst of size siz (unlike strncat, siz is the * full size of dst, not space left). At most siz-1 characters * will be copied. Always NUL terminates (unless siz <= strlen(dst)). * Returns strlen(src) + MIN(siz, strlen(initial dst)). * If retval >= siz, truncation occurred. */ size_t strlcat(char *dst, const char *src, size_t siz) { #if 1 char *d = dst; const char *s = src; size_t n = siz; size_t dlen; _DIAGASSERT(dst != NULL); _DIAGASSERT(src != NULL); /* Find the end of dst and adjust bytes left but don't go past end */ while (n-- != 0 && *d != '\0') d++; dlen = d - dst; n = siz - dlen; if (n == 0) return(dlen + strlen(s)); while (*s != '\0') { if (n != 1) { *d++ = *s; n--; } s++; } *d = '\0'; return(dlen + (s - src)); /* count does not include NUL */ #else _DIAGASSERT(dst != NULL); _DIAGASSERT(src != NULL); /* * Find length of string in dst (maxing out at siz). */ size_t dlen = strnlen(dst, siz); /* * Copy src into any remaining space in dst (truncating if needed). * Note strlcpy(dst, src, 0) returns strlen(src). */ return dlen + strlcpy(dst + dlen, src, siz - dlen); #endif } #endif
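As the comment above notes, strlcat() always takes the full size of the destination buffer, and its return value is the length the result would have needed, so truncation is detected by comparing the return value against the buffer size. A small userland usage example:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[16] = "/usr";

	/* pass the full size of buf, not the space remaining in it */
	if (strlcat(buf, "/pkg/bin/longname", sizeof(buf)) >= sizeof(buf))
		printf("truncated: \"%s\"\n", buf);	/* still NUL terminated */
	else
		printf("ok: \"%s\"\n", buf);
	return 0;
}

The same check works for strlcpy(), which shares the size and return-value convention.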
/* $NetBSD: bpf.h,v 1.82 2023/08/23 13:21:17 rin Exp $ */ /* * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)bpf.h 8.2 (Berkeley) 1/9/95 * @(#) Header: bpf.h,v 1.36 97/06/12 14:29:53 leres Exp (LBL) */ #ifndef _NET_BPF_H_ #define _NET_BPF_H_ #include <sys/ioccom.h> #include <sys/time.h> /* BSD style release date */ #define BPF_RELEASE 199606 /* Date when COP instructions and external memory have been released. */ #define BPF_COP_EXTMEM_RELEASE 20140624 __BEGIN_DECLS typedef int bpf_int32; typedef u_int bpf_u_int32; /* * Alignment macros. BPF_WORDALIGN rounds up to the next * even multiple of BPF_ALIGNMENT. */ #define BPF_ALIGNMENT sizeof(long) #define BPF_ALIGNMENT32 sizeof(int) #define BPF_WORDALIGN(x) (((x)+(BPF_ALIGNMENT-1))&~(BPF_ALIGNMENT-1)) #define BPF_WORDALIGN32(x) (((x)+(BPF_ALIGNMENT32-1))&~(BPF_ALIGNMENT32-1)) #define BPF_MAXINSNS 512 #define BPF_DFLTBUFSIZE (1024*1024) /* default static upper limit */ #define BPF_MAXBUFSIZE (1024*1024*16) /* hard limit on sysctl'able value */ #define BPF_MINBUFSIZE 32 /* * Structure for BIOCSETF. */ struct bpf_program { u_int bf_len; struct bpf_insn *bf_insns; }; /* * Struct returned by BIOCGSTATS and net.bpf.stats sysctl. */ struct bpf_stat { uint64_t bs_recv; /* number of packets received */ uint64_t bs_drop; /* number of packets dropped */ uint64_t bs_capt; /* number of packets captured */ uint64_t bs_padding[13]; }; /* * Struct returned by BIOCGSTATS_30. */ struct bpf_stat30 { u_int bs_recv; /* number of packets received */ u_int bs_drop; /* number of packets dropped */ }; /* * Struct return by BIOCVERSION. This represents the version number of * the filter language described by the instruction encodings below. * bpf understands a program iff kernel_major == filter_major && * kernel_minor >= filter_minor, that is, if the value returned by the * running kernel has the same major number and a minor number equal * equal to or less than the filter being downloaded. Otherwise, the * results are undefined, meaning an error may be returned or packets * may be accepted haphazardly. * It has nothing to do with the source code version. */ struct bpf_version { u_short bv_major; u_short bv_minor; }; /* Current version number of filter architecture. */ #define BPF_MAJOR_VERSION 1 #define BPF_MINOR_VERSION 1 /* * BPF ioctls * * The first set is for compatibility with Sun's pcc style * header files. If your using gcc, we assume that you * have run fixincludes so the latter set should work. 
*/ #define BIOCGBLEN _IOR('B', 102, u_int) #define BIOCSBLEN _IOWR('B', 102, u_int) #define BIOCSETF _IOW('B', 103, struct bpf_program) #define BIOCFLUSH _IO('B', 104) #define BIOCPROMISC _IO('B', 105) #define BIOCGDLT _IOR('B', 106, u_int) #define BIOCGETIF _IOR('B', 107, struct ifreq) #define BIOCSETIF _IOW('B', 108, struct ifreq) #ifdef COMPAT_50 #include <compat/sys/time.h> #define BIOCSORTIMEOUT _IOW('B', 109, struct timeval50) #define BIOCGORTIMEOUT _IOR('B', 110, struct timeval50) #endif #define BIOCGSTATS _IOR('B', 111, struct bpf_stat) #define BIOCGSTATS_30 _IOR('B', 111, struct bpf_stat30) #define BIOCIMMEDIATE _IOW('B', 112, u_int) #define BIOCVERSION _IOR('B', 113, struct bpf_version) #define BIOCSTCPF _IOW('B', 114, struct bpf_program) #define BIOCSUDPF _IOW('B', 115, struct bpf_program) #define BIOCGHDRCMPLT _IOR('B', 116, u_int) #define BIOCSHDRCMPLT _IOW('B', 117, u_int) #define BIOCSDLT _IOW('B', 118, u_int) #define BIOCGDLTLIST _IOWR('B', 119, struct bpf_dltlist) #define BIOCGDIRECTION _IOR('B', 120, u_int) #define BIOCSDIRECTION _IOW('B', 121, u_int) #define BIOCSRTIMEOUT _IOW('B', 122, struct timeval) #define BIOCGRTIMEOUT _IOR('B', 123, struct timeval) #define BIOCGFEEDBACK _IOR('B', 124, u_int) #define BIOCSFEEDBACK _IOW('B', 125, u_int) #define BIOCFEEDBACK BIOCSFEEDBACK /* FreeBSD name */ #define BIOCLOCK _IO('B', 126) #define BIOCSETWF _IOW('B', 127, struct bpf_program) /* Obsolete */ #define BIOCGSEESENT BIOCGDIRECTION #define BIOCSSEESENT BIOCSDIRECTION /* * Packet directions. * BPF_D_IN = 0, BPF_D_INOUT =1 for backward compatibility of BIOC[GS]SEESENT. */ #define BPF_D_IN 0 /* See incoming packets */ #define BPF_D_INOUT 1 /* See incoming and outgoing packets */ #define BPF_D_OUT 2 /* See outgoing packets */ /* * Structure prepended to each packet. This is "wire" format, so we * cannot change it unfortunately to 64 bit times on 32 bit systems [yet]. */ struct bpf_timeval { long tv_sec; long tv_usec; }; struct bpf_timeval32 { int32_t tv_sec; int32_t tv_usec; }; struct bpf_hdr { struct bpf_timeval bh_tstamp; /* time stamp */ uint32_t bh_caplen; /* length of captured portion */ uint32_t bh_datalen; /* original length of packet */ uint16_t bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; struct bpf_hdr32 { struct bpf_timeval32 bh_tstamp; /* time stamp */ uint32_t bh_caplen; /* length of captured portion */ uint32_t bh_datalen; /* original length of packet */ uint16_t bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; /* * Because the structure above is not a multiple of 4 bytes, some compilers * will insist on inserting padding; hence, sizeof(struct bpf_hdr) won't work. * Only the kernel needs to know about it; applications use bh_hdrlen. * XXX To save a few bytes on 32-bit machines, we avoid end-of-struct * XXX padding by using the size of the header data elements. This is * XXX fail-safe: on new machines, we just use the 'safe' sizeof. */ #ifdef _KERNEL #if defined(__mips64) #define SIZEOF_BPF_HDR sizeof(struct bpf_hdr) #define SIZEOF_BPF_HDR32 18 #elif defined(__arm32__) || defined(__i386__) || defined(__m68k__) || \ defined(__mips__) || defined(__ns32k__) || defined(__vax__) || \ defined(__sh__) || (defined(__sparc__) && !defined(__sparc64__)) #define SIZEOF_BPF_HDR 18 #define SIZEOF_BPF_HDR32 18 #else #define SIZEOF_BPF_HDR sizeof(struct bpf_hdr) #define SIZEOF_BPF_HDR32 sizeof(struct bpf_hdr32) #endif #endif /* Pull in data-link level type codes. */ #include <net/dlt.h> /* * The instruction encodings. 
*/ /* instruction classes */ #define BPF_CLASS(code) ((code) & 0x07) #define BPF_LD 0x00 #define BPF_LDX 0x01 #define BPF_ST 0x02 #define BPF_STX 0x03 #define BPF_ALU 0x04 #define BPF_JMP 0x05 #define BPF_RET 0x06 #define BPF_MISC 0x07 /* ld/ldx fields */ #define BPF_SIZE(code) ((code) & 0x18) #define BPF_W 0x00 #define BPF_H 0x08 #define BPF_B 0x10 /* 0x18 reserved; used by BSD/OS */ #define BPF_MODE(code) ((code) & 0xe0) #define BPF_IMM 0x00 #define BPF_ABS 0x20 #define BPF_IND 0x40 #define BPF_MEM 0x60 #define BPF_LEN 0x80 #define BPF_MSH 0xa0 /* 0xc0 reserved; used by BSD/OS */ /* 0xe0 reserved; used by BSD/OS */ /* alu/jmp fields */ #define BPF_OP(code) ((code) & 0xf0) #define BPF_ADD 0x00 #define BPF_SUB 0x10 #define BPF_MUL 0x20 #define BPF_DIV 0x30 #define BPF_OR 0x40 #define BPF_AND 0x50 #define BPF_LSH 0x60 #define BPF_RSH 0x70 #define BPF_NEG 0x80 #define BPF_MOD 0x90 #define BPF_XOR 0xa0 /* 0xb0 reserved */ /* 0xc0 reserved */ /* 0xd0 reserved */ /* 0xe0 reserved */ /* 0xf0 reserved */ #define BPF_JA 0x00 #define BPF_JEQ 0x10 #define BPF_JGT 0x20 #define BPF_JGE 0x30 #define BPF_JSET 0x40 /* 0x50 reserved; used by BSD/OS */ /* 0x60 reserved */ /* 0x70 reserved */ /* 0x80 reserved */ /* 0x90 reserved */ /* 0xa0 reserved */ /* 0xb0 reserved */ /* 0xc0 reserved */ /* 0xd0 reserved */ /* 0xe0 reserved */ /* 0xf0 reserved */ #define BPF_SRC(code) ((code) & 0x08) #define BPF_K 0x00 #define BPF_X 0x08 /* ret - BPF_K and BPF_X also apply */ #define BPF_RVAL(code) ((code) & 0x18) #define BPF_A 0x10 /* 0x18 reserved */ /* misc */ #define BPF_MISCOP(code) ((code) & 0xf8) #define BPF_TAX 0x00 /* 0x10 reserved */ /* 0x18 reserved */ #define BPF_COP 0x20 /* 0x28 reserved */ /* 0x30 reserved */ /* 0x38 reserved */ #define BPF_COPX 0x40 /* XXX: also used by BSD/OS */ /* 0x48 reserved */ /* 0x50 reserved */ /* 0x58 reserved */ /* 0x60 reserved */ /* 0x68 reserved */ /* 0x70 reserved */ /* 0x78 reserved */ #define BPF_TXA 0x80 /* 0x88 reserved */ /* 0x90 reserved */ /* 0x98 reserved */ /* 0xa0 reserved */ /* 0xa8 reserved */ /* 0xb0 reserved */ /* 0xb8 reserved */ /* 0xc0 reserved; used by BSD/OS */ /* 0xc8 reserved */ /* 0xd0 reserved */ /* 0xd8 reserved */ /* 0xe0 reserved */ /* 0xe8 reserved */ /* 0xf0 reserved */ /* 0xf8 reserved */ /* * The instruction data structure. */ struct bpf_insn { uint16_t code; u_char jt; u_char jf; uint32_t k; }; /* * Auxiliary data, for use when interpreting a filter intended for the * Linux kernel when the kernel rejects the filter (requiring us to * run it in userland). It contains VLAN tag information. */ struct bpf_aux_data { u_short vlan_tag_present; u_short vlan_tag; }; /* * Macros for insn array initializers. */ #define BPF_STMT(code, k) { (uint16_t)(code), 0, 0, k } #define BPF_JUMP(code, k, jt, jf) { (uint16_t)(code), jt, jf, k } /* * Number of scratch memory words (for BPF_LD|BPF_MEM and BPF_ST). */ #define BPF_MEMWORDS 16 /* * bpf_memword_init_t: bits indicate which words in the external memory * store will be initialised by the caller before BPF program execution. */ typedef uint32_t bpf_memword_init_t; #define BPF_MEMWORD_INIT(k) (UINT32_C(1) << (k)) /* Note: two most significant bits are reserved by bpfjit. */ __CTASSERT(BPF_MEMWORDS + 2 <= sizeof(bpf_memword_init_t) * NBBY); #ifdef _KERNEL /* * Max number of external memory words (for BPF_LD|BPF_MEM and BPF_ST). 
*/ #define BPF_MAX_MEMWORDS 30 __CTASSERT(BPF_MAX_MEMWORDS >= BPF_MEMWORDS); __CTASSERT(BPF_MAX_MEMWORDS + 2 <= sizeof(bpf_memword_init_t) * NBBY); #endif /* * Structure to retrieve available DLTs for the interface. */ struct bpf_dltlist { u_int bfl_len; /* number of bfd_list array */ u_int *bfl_list; /* array of DLTs */ }; struct bpf_ctx; typedef struct bpf_ctx bpf_ctx_t; typedef struct bpf_args { const uint8_t * pkt; size_t wirelen; size_t buflen; /* * The following arguments are used only by some kernel * subsystems. * They aren't required for classical bpf filter programs. * For such programs, bpfjit generated code doesn't read * those arguments at all. Note however that bpf interpreter * always needs a pointer to memstore. */ uint32_t * mem; /* pointer to external memory store */ void * arg; /* auxiliary argument for a copfunc */ } bpf_args_t; #if defined(_KERNEL) || defined(__BPF_PRIVATE) typedef uint32_t (*bpf_copfunc_t)(const bpf_ctx_t *, bpf_args_t *, uint32_t); struct bpf_ctx { /* * BPF coprocessor functions and the number of them. */ const bpf_copfunc_t * copfuncs; size_t nfuncs; /* * The number of memory words in the external memory store. * There may be up to BPF_MAX_MEMWORDS words; if zero is set, * then the internal memory store is used which has a fixed * number of words (BPF_MEMWORDS). */ size_t extwords; /* * The bitmask indicating which words in the external memstore * will be initialised by the caller. */ bpf_memword_init_t preinited; }; #endif #ifdef _KERNEL #include <net/bpfjit.h> #include <net/if.h> struct bpf_if; struct bpf_ops { void (*bpf_attach)(struct ifnet *, u_int, u_int, struct bpf_if **); void (*bpf_detach)(struct ifnet *); void (*bpf_change_type)(struct ifnet *, u_int, u_int); void (*bpf_mtap)(struct bpf_if *, struct mbuf *, u_int); void (*bpf_mtap2)(struct bpf_if *, void *, u_int, struct mbuf *, u_int); void (*bpf_mtap_af)(struct bpf_if *, uint32_t, struct mbuf *, u_int); void (*bpf_mtap_sl_in)(struct bpf_if *, u_char *, struct mbuf **); void (*bpf_mtap_sl_out)(struct bpf_if *, u_char *, struct mbuf *); void (*bpf_mtap_softint_init)(struct ifnet *); void (*bpf_mtap_softint)(struct ifnet *, struct mbuf *); int (*bpf_register_track_event)(struct bpf_if **, void (*)(struct bpf_if *, struct ifnet *, int, int)); int (*bpf_deregister_track_event)(struct bpf_if **, void (*)(struct bpf_if *, struct ifnet *, int, int)); }; extern struct bpf_ops *bpf_ops; static __inline void bpf_attach(struct ifnet *_ifp, u_int _dlt, u_int _hdrlen) { bpf_ops->bpf_attach(_ifp, _dlt, _hdrlen, &_ifp->if_bpf); } static __inline void bpf_attach2(struct ifnet *_ifp, u_int _dlt, u_int _hdrlen, struct bpf_if **_dp) { bpf_ops->bpf_attach(_ifp, _dlt, _hdrlen, _dp); } static __inline void bpf_mtap(struct ifnet *_ifp, struct mbuf *_m, u_int _direction) { if (_ifp->if_bpf) { if (_ifp->if_bpf_mtap) { _ifp->if_bpf_mtap(_ifp->if_bpf, _m, _direction); } else { bpf_ops->bpf_mtap(_ifp->if_bpf, _m, _direction); } } } static __inline void bpf_mtap2(struct bpf_if *_bpf, void *_data, u_int _dlen, struct mbuf *_m, u_int _direction) { bpf_ops->bpf_mtap2(_bpf, _data, _dlen, _m, _direction); } static __inline void bpf_mtap3(struct bpf_if *_bpf, struct mbuf *_m, u_int _direction) { if (_bpf) bpf_ops->bpf_mtap(_bpf, _m, _direction); } static __inline void bpf_mtap_af(struct ifnet *_ifp, uint32_t _af, struct mbuf *_m, u_int _direction) { if (_ifp->if_bpf) bpf_ops->bpf_mtap_af(_ifp->if_bpf, _af, _m, _direction); } static __inline void bpf_change_type(struct ifnet *_ifp, u_int _dlt, u_int _hdrlen) { 
bpf_ops->bpf_change_type(_ifp, _dlt, _hdrlen); } static __inline bool bpf_peers_present(struct bpf_if *dp) { /* * Our code makes sure the driver visible pointer is NULL * whenever there is no listener on this tap. */ return dp != NULL; } static __inline void bpf_detach(struct ifnet *_ifp) { bpf_ops->bpf_detach(_ifp); } static __inline void bpf_mtap_sl_in(struct ifnet *_ifp, u_char *_hdr, struct mbuf **_m) { bpf_ops->bpf_mtap_sl_in(_ifp->if_bpf, _hdr, _m); } static __inline void bpf_mtap_sl_out(struct ifnet *_ifp, u_char *_hdr, struct mbuf *_m) { if (_ifp->if_bpf) bpf_ops->bpf_mtap_sl_out(_ifp->if_bpf, _hdr, _m); } static __inline void bpf_mtap_softint_init(struct ifnet *_ifp) { bpf_ops->bpf_mtap_softint_init(_ifp); } static __inline void bpf_mtap_softint(struct ifnet *_ifp, struct mbuf *_m) { if (_ifp->if_bpf) bpf_ops->bpf_mtap_softint(_ifp, _m); } static __inline int bpf_register_track_event(struct bpf_if **_dp, void (*_fun)(struct bpf_if *, struct ifnet *, int, int)) { if (bpf_ops->bpf_register_track_event == NULL) return ENXIO; return bpf_ops->bpf_register_track_event(_dp, _fun); } static __inline int bpf_deregister_track_event(struct bpf_if **_dp, void (*_fun)(struct bpf_if *, struct ifnet *, int, int)) { if (bpf_ops->bpf_deregister_track_event == NULL) return ENXIO; return bpf_ops->bpf_deregister_track_event(_dp, _fun); } void bpf_setops(void); void bpf_ops_handover_enter(struct bpf_ops *); void bpf_ops_handover_exit(void); void bpfilterattach(int); bpf_ctx_t *bpf_create(void); void bpf_destroy(bpf_ctx_t *); int bpf_set_cop(bpf_ctx_t *, const bpf_copfunc_t *, size_t); int bpf_set_extmem(bpf_ctx_t *, size_t, bpf_memword_init_t); u_int bpf_filter_ext(const bpf_ctx_t *, const struct bpf_insn *, bpf_args_t *); int bpf_validate_ext(const bpf_ctx_t *, const struct bpf_insn *, int); bpfjit_func_t bpf_jit_generate(bpf_ctx_t *, void *, size_t); void bpf_jit_freecode(bpfjit_func_t); #endif int bpf_validate(const struct bpf_insn *, int); u_int bpf_filter(const struct bpf_insn *, const u_char *, u_int, u_int); u_int bpf_filter_with_aux_data(const struct bpf_insn *, const u_char *, u_int, u_int, const struct bpf_aux_data *); /* * events to be tracked by bpf_register_track_event callbacks */ #define BPF_TRACK_EVENT_ATTACH 1 #define BPF_TRACK_EVENT_DETACH 2 void bpf_dump(const struct bpf_program *, int); char *bpf_image(const struct bpf_insn *, int); __END_DECLS #if 1 /* XXX: remove me, for the benefit of sanitizers */ #define BIOCGSTATSOLD BIOCGSTATS_30 #define bpf_stat_old bpf_stat30 #endif #endif /* !_NET_BPF_H_ */
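The instruction macros and ioctls above are all a userland program needs to build and install a classic BPF filter. Below is a minimal sketch, not taken from the NetBSD sources: the helper name install_ipv4_filter, the already-opened bpf_fd descriptor (assumed bound to an Ethernet interface), and the literal 0x0800 IPv4 ethertype are illustrative assumptions; BPF_STMT, BPF_JUMP, BIOCSETF and struct bpf_program are as declared above. Error handling is left to the caller.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/bpf.h>

/* Accept IPv4 packets only: load the ethertype at offset 12 and compare. */
static struct bpf_insn ipv4_only[] = {
	BPF_STMT(BPF_LD + BPF_H + BPF_ABS, 12),
	BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0x0800, 0, 1),
	BPF_STMT(BPF_RET + BPF_K, (u_int)-1),	/* match: return whole packet */
	BPF_STMT(BPF_RET + BPF_K, 0),		/* no match: drop */
};

static int
install_ipv4_filter(int bpf_fd)
{
	struct bpf_program prog = {
		.bf_len = sizeof(ipv4_only) / sizeof(ipv4_only[0]),
		.bf_insns = ipv4_only,
	};

	/* BIOCSETF hands the program to the kernel, which validates it. */
	return ioctl(bpf_fd, BIOCSETF, &prog);
}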
/* $NetBSD: kern_prot.c,v 1.122 2020/05/23 23:42:43 ad Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_prot.c 8.9 (Berkeley) 2/14/95 */ /* * System calls related to processes and protection */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_prot.c,v 1.122 2020/05/23 23:42:43 ad Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_43.h" #endif #include <sys/param.h> #include <sys/acct.h> #include <sys/systm.h> #include <sys/ucred.h> #include <sys/proc.h> #include <sys/timeb.h> #include <sys/times.h> #include <sys/pool.h> #include <sys/prot.h> #include <sys/syslog.h> #include <sys/uidinfo.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/syscallargs.h> int sys_getpid(struct lwp *, const void *, register_t *); int sys_getpid_with_ppid(struct lwp *, const void *, register_t *); int sys_getuid(struct lwp *, const void *, register_t *); int sys_getuid_with_euid(struct lwp *, const void *, register_t *); int sys_getgid(struct lwp *, const void *, register_t *); int sys_getgid_with_egid(struct lwp *, const void *, register_t *); /* ARGSUSED */ int sys_getpid(struct lwp *l, const void *v, register_t *retval) { struct proc *p = l->l_proc; *retval = p->p_pid; return (0); } /* ARGSUSED */ int sys_getpid_with_ppid(struct lwp *l, const void *v, register_t *retval) { struct proc *p = l->l_proc; retval[0] = p->p_pid; retval[1] = p->p_ppid; return (0); } /* ARGSUSED */ int sys_getppid(struct lwp *l, const void *v, register_t *retval) { struct proc *p = l->l_proc; *retval = p->p_ppid; return (0); } /* Get process group ID; note that POSIX getpgrp takes no parameter */ int sys_getpgrp(struct lwp *l, const void *v, register_t *retval) { struct proc *p = l->l_proc; mutex_enter(&proc_lock); *retval = p->p_pgrp->pg_id; mutex_exit(&proc_lock); return (0); } /* * Return the process group ID of the session leader (session ID) * for the specified process. 
*/ int sys_getsid(struct lwp *l, const struct sys_getsid_args *uap, register_t *retval) { /* { syscalldarg(pid_t) pid; } */ pid_t pid = SCARG(uap, pid); struct proc *p; int error = 0; mutex_enter(&proc_lock); if (pid == 0) *retval = l->l_proc->p_session->s_sid; else if ((p = proc_find(pid)) != NULL) *retval = p->p_session->s_sid; else error = ESRCH; mutex_exit(&proc_lock); return error; } int sys_getpgid(struct lwp *l, const struct sys_getpgid_args *uap, register_t *retval) { /* { syscallarg(pid_t) pid; } */ pid_t pid = SCARG(uap, pid); struct proc *p; int error = 0; mutex_enter(&proc_lock); if (pid == 0) *retval = l->l_proc->p_pgid; else if ((p = proc_find(pid)) != NULL) *retval = p->p_pgid; else error = ESRCH; mutex_exit(&proc_lock); return error; } /* ARGSUSED */ int sys_getuid(struct lwp *l, const void *v, register_t *retval) { *retval = kauth_cred_getuid(l->l_cred); return (0); } /* ARGSUSED */ int sys_getuid_with_euid(struct lwp *l, const void *v, register_t *retval) { retval[0] = kauth_cred_getuid(l->l_cred); retval[1] = kauth_cred_geteuid(l->l_cred); return (0); } /* ARGSUSED */ int sys_geteuid(struct lwp *l, const void *v, register_t *retval) { *retval = kauth_cred_geteuid(l->l_cred); return (0); } /* ARGSUSED */ int sys_getgid(struct lwp *l, const void *v, register_t *retval) { *retval = kauth_cred_getgid(l->l_cred); return (0); } /* ARGSUSED */ int sys_getgid_with_egid(struct lwp *l, const void *v, register_t *retval) { retval[0] = kauth_cred_getgid(l->l_cred); retval[1] = kauth_cred_getegid(l->l_cred); return (0); } /* * Get effective group ID. The "egid" is groups[0], and could be obtained * via getgroups. This syscall exists because it is somewhat painful to do * correctly in a library function. */ /* ARGSUSED */ int sys_getegid(struct lwp *l, const void *v, register_t *retval) { *retval = kauth_cred_getegid(l->l_cred); return (0); } int sys_getgroups(struct lwp *l, const struct sys_getgroups_args *uap, register_t *retval) { /* { syscallarg(int) gidsetsize; syscallarg(gid_t *) gidset; } */ *retval = kauth_cred_ngroups(l->l_cred); if (SCARG(uap, gidsetsize) == 0) return 0; if (SCARG(uap, gidsetsize) < (int)*retval) return EINVAL; return kauth_cred_getgroups(l->l_cred, SCARG(uap, gidset), *retval, UIO_USERSPACE); } int sys_setsid(struct lwp *l, const void *v, register_t *retval) { struct proc *p = l->l_proc; int error; error = proc_enterpgrp(p, p->p_pid, p->p_pid, true); *retval = p->p_pid; return (error); } /* * set process group (setpgid/old setpgrp) * * caller does setpgid(targpid, targpgid) * * pgid must be in valid range (EINVAL) * pid must be caller or child of caller (ESRCH) * if a child * pid must be in same session (EPERM) * pid can't have done an exec (EACCES) * if pgid != pid * there must exist some pid in same session having pgid (EPERM) * pid must not be session leader (EPERM) * * Permission checks now in proc_enterpgrp() */ int sys_setpgid(struct lwp *l, const struct sys_setpgid_args *uap, register_t *retval) { /* { syscallarg(int) pid; syscallarg(int) pgid; } */ struct proc *p = l->l_proc; pid_t targp, pgid; if (SCARG(uap, pgid) < 0) return EINVAL; if ((targp = SCARG(uap, pid)) == 0) targp = p->p_pid; if ((pgid = SCARG(uap, pgid)) == 0) pgid = targp; return proc_enterpgrp(p, targp, pgid, false); } /* * Set real, effective and saved uids to the requested values. * non-root callers can only ever change uids to values that match * one of the processes current uid values. * This is further restricted by the flags argument. 
*/ int do_setresuid(struct lwp *l, uid_t r, uid_t e, uid_t sv, u_int flags) { struct proc *p = l->l_proc; kauth_cred_t cred, ncred; ncred = kauth_cred_alloc(); /* Get a write lock on the process credential. */ proc_crmod_enter(); cred = p->p_cred; /* * Check that the new value is one of the allowed existing values, * or that we have root privilege. */ if ((r != -1 && !((flags & ID_R_EQ_R) && r == kauth_cred_getuid(cred)) && !((flags & ID_R_EQ_E) && r == kauth_cred_geteuid(cred)) && !((flags & ID_R_EQ_S) && r == kauth_cred_getsvuid(cred))) || (e != -1 && !((flags & ID_E_EQ_R) && e == kauth_cred_getuid(cred)) && !((flags & ID_E_EQ_E) && e == kauth_cred_geteuid(cred)) && !((flags & ID_E_EQ_S) && e == kauth_cred_getsvuid(cred))) || (sv != -1 && !((flags & ID_S_EQ_R) && sv == kauth_cred_getuid(cred)) && !((flags & ID_S_EQ_E) && sv == kauth_cred_geteuid(cred)) && !((flags & ID_S_EQ_S) && sv == kauth_cred_getsvuid(cred)))) { int error; error = kauth_authorize_process(cred, KAUTH_PROCESS_SETID, p, NULL, NULL, NULL); if (error != 0) { proc_crmod_leave(cred, ncred, false); return error; } } /* If nothing has changed, short circuit the request */ if ((r == -1 || r == kauth_cred_getuid(cred)) && (e == -1 || e == kauth_cred_geteuid(cred)) && (sv == -1 || sv == kauth_cred_getsvuid(cred))) { proc_crmod_leave(cred, ncred, false); return 0; } kauth_cred_clone(cred, ncred); if (r != -1 && r != kauth_cred_getuid(ncred)) { u_long nlwps; /* Update count of processes for this user. */ (void)chgproccnt(kauth_cred_getuid(ncred), -1); (void)chgproccnt(r, 1); /* The first LWP of a process is excluded. */ KASSERT(mutex_owned(p->p_lock)); nlwps = p->p_nlwps - 1; (void)chglwpcnt(kauth_cred_getuid(ncred), -nlwps); (void)chglwpcnt(r, nlwps); kauth_cred_setuid(ncred, r); } if (sv != -1) kauth_cred_setsvuid(ncred, sv); if (e != -1) kauth_cred_seteuid(ncred, e); /* Broadcast our credentials to the process and other LWPs. */ proc_crmod_leave(ncred, cred, true); return 0; } /* * Set real, effective and saved gids to the requested values. * non-root callers can only ever change gids to values that match * one of the processes current gid values. * This is further restricted by the flags argument. */ int do_setresgid(struct lwp *l, gid_t r, gid_t e, gid_t sv, u_int flags) { struct proc *p = l->l_proc; kauth_cred_t cred, ncred; ncred = kauth_cred_alloc(); /* Get a write lock on the process credential. */ proc_crmod_enter(); cred = p->p_cred; /* * check new value is one of the allowed existing values. * otherwise, check if we have root privilege. 
*/ if ((r != -1 && !((flags & ID_R_EQ_R) && r == kauth_cred_getgid(cred)) && !((flags & ID_R_EQ_E) && r == kauth_cred_getegid(cred)) && !((flags & ID_R_EQ_S) && r == kauth_cred_getsvgid(cred))) || (e != -1 && !((flags & ID_E_EQ_R) && e == kauth_cred_getgid(cred)) && !((flags & ID_E_EQ_E) && e == kauth_cred_getegid(cred)) && !((flags & ID_E_EQ_S) && e == kauth_cred_getsvgid(cred))) || (sv != -1 && !((flags & ID_S_EQ_R) && sv == kauth_cred_getgid(cred)) && !((flags & ID_S_EQ_E) && sv == kauth_cred_getegid(cred)) && !((flags & ID_S_EQ_S) && sv == kauth_cred_getsvgid(cred)))) { int error; error = kauth_authorize_process(cred, KAUTH_PROCESS_SETID, p, NULL, NULL, NULL); if (error != 0) { proc_crmod_leave(cred, ncred, false); return error; } } /* If nothing has changed, short circuit the request */ if ((r == -1 || r == kauth_cred_getgid(cred)) && (e == -1 || e == kauth_cred_getegid(cred)) && (sv == -1 || sv == kauth_cred_getsvgid(cred))) { proc_crmod_leave(cred, ncred, false); return 0; } kauth_cred_clone(cred, ncred); if (r != -1) kauth_cred_setgid(ncred, r); if (sv != -1) kauth_cred_setsvgid(ncred, sv); if (e != -1) kauth_cred_setegid(ncred, e); /* Broadcast our credentials to the process and other LWPs. */ proc_crmod_leave(ncred, cred, true); return 0; } /* ARGSUSED */ int sys_setuid(struct lwp *l, const struct sys_setuid_args *uap, register_t *retval) { /* { syscallarg(uid_t) uid; } */ uid_t uid = SCARG(uap, uid); return do_setresuid(l, uid, uid, uid, ID_R_EQ_R | ID_E_EQ_R | ID_S_EQ_R); } /* ARGSUSED */ int sys_seteuid(struct lwp *l, const struct sys_seteuid_args *uap, register_t *retval) { /* { syscallarg(uid_t) euid; } */ return do_setresuid(l, -1, SCARG(uap, euid), -1, ID_E_EQ_R | ID_E_EQ_S); } int sys_setreuid(struct lwp *l, const struct sys_setreuid_args *uap, register_t *retval) { /* { syscallarg(uid_t) ruid; syscallarg(uid_t) euid; } */ kauth_cred_t cred = l->l_cred; uid_t ruid, euid, svuid; ruid = SCARG(uap, ruid); euid = SCARG(uap, euid); if (ruid == -1) ruid = kauth_cred_getuid(cred); if (euid == -1) euid = kauth_cred_geteuid(cred); /* Saved uid is set to the new euid if the ruid changed */ svuid = (ruid == kauth_cred_getuid(cred)) ? -1 : euid; return do_setresuid(l, ruid, euid, svuid, ID_R_EQ_R | ID_R_EQ_E | ID_E_EQ_R | ID_E_EQ_E | ID_E_EQ_S | ID_S_EQ_R | ID_S_EQ_E | ID_S_EQ_S); } /* ARGSUSED */ int sys_setgid(struct lwp *l, const struct sys_setgid_args *uap, register_t *retval) { /* { syscallarg(gid_t) gid; } */ gid_t gid = SCARG(uap, gid); return do_setresgid(l, gid, gid, gid, ID_R_EQ_R | ID_E_EQ_R | ID_S_EQ_R); } /* ARGSUSED */ int sys_setegid(struct lwp *l, const struct sys_setegid_args *uap, register_t *retval) { /* { syscallarg(gid_t) egid; } */ return do_setresgid(l, -1, SCARG(uap, egid), -1, ID_E_EQ_R | ID_E_EQ_S); } int sys_setregid(struct lwp *l, const struct sys_setregid_args *uap, register_t *retval) { /* { syscallarg(gid_t) rgid; syscallarg(gid_t) egid; } */ kauth_cred_t cred = l->l_cred; gid_t rgid, egid, svgid; rgid = SCARG(uap, rgid); egid = SCARG(uap, egid); if (rgid == -1) rgid = kauth_cred_getgid(cred); if (egid == -1) egid = kauth_cred_getegid(cred); /* Saved gid is set to the new egid if the rgid changed */ svgid = rgid == kauth_cred_getgid(cred) ? 
-1 : egid; return do_setresgid(l, rgid, egid, svgid, ID_R_EQ_R | ID_R_EQ_E | ID_E_EQ_R | ID_E_EQ_E | ID_E_EQ_S | ID_S_EQ_R | ID_S_EQ_E | ID_S_EQ_S); } int sys_issetugid(struct lwp *l, const void *v, register_t *retval) { struct proc *p = l->l_proc; /* * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, * we use PK_SUGID because we consider changing the owners as * "tainting" as well. * This is significant for procs that start as root and "become" * a user without an exec - programs cannot know *everything* * that libc *might* have put in their data segment. */ *retval = (p->p_flag & PK_SUGID) != 0; return (0); } /* ARGSUSED */ int sys_setgroups(struct lwp *l, const struct sys_setgroups_args *uap, register_t *retval) { /* { syscallarg(int) gidsetsize; syscallarg(const gid_t *) gidset; } */ kauth_cred_t ncred; int error; ncred = kauth_cred_alloc(); error = kauth_cred_setgroups(ncred, SCARG(uap, gidset), SCARG(uap, gidsetsize), -1, UIO_USERSPACE); if (error != 0) { kauth_cred_free(ncred); return error; } return kauth_proc_setgroups(l, ncred); } /* * Get login name, if available. */ /* ARGSUSED */ int sys___getlogin(struct lwp *l, const struct sys___getlogin_args *uap, register_t *retval) { /* { syscallarg(char *) namebuf; syscallarg(size_t) namelen; } */ struct proc *p = l->l_proc; char login[sizeof(p->p_session->s_login)]; size_t namelen = SCARG(uap, namelen); if (namelen > sizeof(login)) namelen = sizeof(login); mutex_enter(&proc_lock); memcpy(login, p->p_session->s_login, namelen); mutex_exit(&proc_lock); return (copyout(login, (void *)SCARG(uap, namebuf), namelen)); } /* * Set login name. */ /* ARGSUSED */ int sys___setlogin(struct lwp *l, const struct sys___setlogin_args *uap, register_t *retval) { /* { syscallarg(const char *) namebuf; } */ struct proc *p = l->l_proc; struct session *sp; char newname[sizeof sp->s_login + 1]; int error; if ((error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_SETID, p, NULL, NULL, NULL)) != 0) return (error); error = copyinstr(SCARG(uap, namebuf), newname, sizeof newname, NULL); if (error != 0) return (error == ENAMETOOLONG ? EINVAL : error); mutex_enter(&proc_lock); sp = p->p_session; if (sp->s_flags & S_LOGIN_SET && p->p_pid != sp->s_sid && strncmp(newname, sp->s_login, sizeof sp->s_login) != 0) log(LOG_WARNING, "%s (pid %d) changing logname from " "%.*s to %s\n", p->p_comm, p->p_pid, (int)sizeof sp->s_login, sp->s_login, newname); sp->s_flags |= S_LOGIN_SET; strncpy(sp->s_login, newname, sizeof sp->s_login); mutex_exit(&proc_lock); return (0); }
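sys_getgroups() above honors the usual convention that a gidsetsize of zero only reports the number of groups without copying anything out. A small userland sketch of how a caller typically uses that to size its buffer follows; the helper name fetch_groups is hypothetical, while getgroups(2) is the system call implemented above.

#include <sys/types.h>
#include <stdlib.h>
#include <unistd.h>

static gid_t *
fetch_groups(int *ngroupsp)
{
	/* First call: gidsetsize == 0, so only the count comes back. */
	int n = getgroups(0, NULL);
	gid_t *gids;

	if (n <= 0)
		return NULL;
	gids = calloc((size_t)n, sizeof(*gids));
	if (gids == NULL)
		return NULL;
	/* Second call: a buffer of at least n entries is required. */
	if (getgroups(n, gids) == -1) {
		free(gids);
		return NULL;
	}
	*ngroupsp = n;
	return gids;
}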
/* $NetBSD: clockctl_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $ */ /*- * Copyright (c) 2001 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Emmanuel Dreyfus. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: clockctl_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/device.h> #include <sys/time.h> #include <sys/conf.h> #include <sys/timex.h> #include <sys/kauth.h> #include <sys/module.h> #include <sys/mutex.h> #include <sys/compat_stub.h> #include <sys/clockctl.h> #include <compat/sys/clockctl.h> #include <compat/sys/time_types.h> int compat50_clockctlioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l) { int error = 0; const struct cdevsw *cd = cdevsw_lookup(dev); if (cd == NULL || cd->d_ioctl == NULL) return ENXIO; switch (cmd) { case CLOCKCTL_OSETTIMEOFDAY: { struct timeval50 tv50; struct timeval tv; struct clockctl50_settimeofday *args = data; error = copyin(args->tv, &tv50, sizeof(tv50)); if (error) return (error); timeval50_to_timeval(&tv50, &tv); error = settimeofday1(&tv, false, args->tzp, l, false); break; } case CLOCKCTL_OADJTIME: { struct timeval atv, oldatv; struct timeval50 atv50; struct clockctl50_adjtime *args = data; if (args->delta) { error = copyin(args->delta, &atv50, sizeof(atv50)); if (error) return (error); timeval50_to_timeval(&atv50, &atv); } adjtime1(args->delta ? &atv : NULL, args->olddelta ?
&oldatv : NULL, l->l_proc); if (args->olddelta) { timeval_to_timeval50(&oldatv, &atv50); error = copyout(&atv50, args->olddelta, sizeof(atv50)); } break; } case CLOCKCTL_OCLOCK_SETTIME: { struct timespec50 tp50; struct timespec tp; struct clockctl50_clock_settime *args = data; error = copyin(args->tp, &tp50, sizeof(tp50)); if (error) return (error); timespec50_to_timespec(&tp50, &tp); error = clock_settime1(l->l_proc, args->clock_id, &tp, true); break; } case CLOCKCTL_ONTP_ADJTIME: { if (vec_ntp_timestatus == NULL) { error = ENOTTY; break; } /* The ioctl number changed but the data did not change. */ error = (cd->d_ioctl)(dev, CLOCKCTL_NTP_ADJTIME, data, flags, l); break; } default: error = ENOTTY; } return (error); } void clockctl_50_init(void) { MODULE_HOOK_SET(clockctl_ioctl_50_hook, compat50_clockctlioctl); } void clockctl_50_fini(void) { MODULE_HOOK_UNSET(clockctl_ioctl_50_hook); }
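The translation above relies on timeval50_to_timeval(), timespec50_to_timespec() and their inverses from <compat/sys/time_types.h>; their bodies are not reproduced in this file. Conceptually they are plain field-for-field copies that widen (or narrow) the old pre-NetBSD-6.0 time fields, roughly as in the sketch below. The sketch_* names are hypothetical and the bodies are an assumption about what the conversion amounts to, not the actual compat implementation.

/* Illustrative only: the real conversions live in the compat headers. */
static void
sketch_timeval50_to_timeval(const struct timeval50 *tv50, struct timeval *tv)
{
	/* Field-for-field copy; the old tv_sec type is narrower than time_t. */
	tv->tv_sec = tv50->tv_sec;
	tv->tv_usec = tv50->tv_usec;
}

static void
sketch_timeval_to_timeval50(const struct timeval *tv, struct timeval50 *tv50)
{
	/* Reverse direction, used when copying results back to old binaries. */
	tv50->tv_sec = tv->tv_sec;
	tv50->tv_usec = tv->tv_usec;
}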
/*
$NetBSD: ffs_inode.c,v 1.131 2020/07/31 04:07:30 chs Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.131 2020/07/31 04:07:30 chs Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" #include "opt_quota.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/file.h> #include <sys/fstrans.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/trace.h> #include <sys/vnode.h> #include <sys/wapbl.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_bswap.h> #include <ufs/ufs/ufs_wapbl.h> #include <ufs/ffs/fs.h> #include <ufs/ffs/ffs_extern.h> static int ffs_indirtrunc(struct inode *, daddr_t, daddr_t, daddr_t, int, int64_t *); /* * Update the access, modified, and inode change times as specified * by the IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. * The IN_MODIFIED flag is used to specify that the inode needs to be * updated but that the times have already been set. The access * and modified times are taken from the second and third parameters; * the inode change time is always taken from the current time. If * UPDATE_WAIT flag is set, or UPDATE_DIROP is set then wait for the * disk write of the inode to complete. */ int ffs_update(struct vnode *vp, const struct timespec *acc, const struct timespec *mod, int updflags) { struct fs *fs; struct buf *bp; struct inode *ip; int error; void *cp; int waitfor, flags; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (0); ip = VTOI(vp); FFS_ITIMES(ip, acc, mod, NULL); if (updflags & UPDATE_CLOSE) flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED); else flags = ip->i_flag & IN_MODIFIED; if (flags == 0) return (0); fs = ip->i_fs; if ((flags & IN_MODIFIED) != 0 && (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) { waitfor = updflags & UPDATE_WAIT; if ((updflags & UPDATE_DIROP) != 0) waitfor |= UPDATE_WAIT; } else waitfor = 0; /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. 
*/ if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_ffs1_ouid = ip->i_uid; /* XXX */ ip->i_ffs1_ogid = ip->i_gid; /* XXX */ } /* XXX */ error = bread(ip->i_devvp, FFS_FSBTODB(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, B_MODIFY, &bp); if (error) { return (error); } ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED); /* Keep unlinked inode list up to date */ KDASSERTMSG(DIP(ip, nlink) == ip->i_nlink, "DIP(ip, nlink) [%d] == ip->i_nlink [%d]", DIP(ip, nlink), ip->i_nlink); if (ip->i_mode) { if (ip->i_nlink > 0) { UFS_WAPBL_UNREGISTER_INODE(ip->i_ump->um_mountp, ip->i_number, ip->i_mode); } else { UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, ip->i_number, ip->i_mode); } } if (fs->fs_magic == FS_UFS1_MAGIC) { cp = (char *)bp->b_data + (ino_to_fsbo(fs, ip->i_number) * DINODE1_SIZE); #ifdef FFS_EI if (UFS_FSNEEDSWAP(fs)) ffs_dinode1_swap(ip->i_din.ffs1_din, (struct ufs1_dinode *)cp); else #endif memcpy(cp, ip->i_din.ffs1_din, DINODE1_SIZE); } else { cp = (char *)bp->b_data + (ino_to_fsbo(fs, ip->i_number) * DINODE2_SIZE); #ifdef FFS_EI if (UFS_FSNEEDSWAP(fs)) ffs_dinode2_swap(ip->i_din.ffs2_din, (struct ufs2_dinode *)cp); else #endif memcpy(cp, ip->i_din.ffs2_din, DINODE2_SIZE); } if (waitfor) { return (bwrite(bp)); } else { bdwrite(bp); return (0); } } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Truncate the inode oip to at most length size, freeing the * disk blocks. */ int ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred) { daddr_t lastblock; struct inode *oip = VTOI(ovp); struct mount *omp = ovp->v_mount; daddr_t bn, lastiblock[UFS_NIADDR], indir_lbn[UFS_NIADDR]; daddr_t blks[UFS_NDADDR + UFS_NIADDR], oldblks[UFS_NDADDR + UFS_NIADDR]; struct fs *fs; int extblocks; int offset, pgoffset, level; int64_t blocksreleased = 0, datablocks; int i, aflag, nblocks; int error, allerror = 0; off_t osize; int sync; struct ufsmount *ump = oip->i_ump; void *dcookie; long bsize; bool wapbl = omp->mnt_wapbl != NULL; UFS_WAPBL_JLOCK_ASSERT(ump->um_mountp); if (ovp->v_type == VCHR || ovp->v_type == VBLK || ovp->v_type == VFIFO || ovp->v_type == VSOCK) { KASSERT(oip->i_size == 0); return 0; } if (length < 0) return (EINVAL); /* * Historically clients did not have to specify which data * they were truncating. So, if not specified, we assume * traditional behavior, e.g., just the normal data. 
*/ if ((ioflag & (IO_EXT | IO_NORMAL)) == 0) ioflag |= IO_NORMAL; fs = oip->i_fs; #define i_din2 i_din.ffs2_din extblocks = 0; datablocks = DIP(oip, blocks); if (fs->fs_magic == FS_UFS2_MAGIC && oip->i_din2->di_extsize > 0) { extblocks = btodb(ffs_fragroundup(fs, oip->i_din2->di_extsize)); datablocks -= extblocks; } if ((ioflag & IO_EXT) && extblocks > 0) { if (length != 0) panic("ffs_truncate: partial trunc of extdata"); { #ifdef QUOTA (void) chkdq(oip, -extblocks, NOCRED, FORCE); #endif osize = oip->i_din2->di_extsize; oip->i_din2->di_blocks -= extblocks; oip->i_din2->di_extsize = 0; for (i = 0; i < UFS_NXADDR; i++) { binvalbuf(ovp, -1 - i); oldblks[i] = oip->i_din2->di_extb[i]; oip->i_din2->di_extb[i] = 0; } oip->i_flag |= IN_CHANGE; if ((error = ffs_update(ovp, NULL, NULL, 0))) return (error); for (i = 0; i < UFS_NXADDR; i++) { if (oldblks[i] == 0) continue; bsize = ffs_sblksize(fs, osize, i); if (wapbl) { error = UFS_WAPBL_REGISTER_DEALLOCATION(omp, FFS_FSBTODB(fs, oldblks[i]), bsize, NULL); if (error) return error; } else ffs_blkfree(fs, oip->i_devvp, oldblks[i], bsize, oip->i_number); } extblocks = 0; } } if ((ioflag & IO_NORMAL) == 0) return (0); if (ovp->v_type == VLNK && (oip->i_size < ump->um_maxsymlinklen || (ump->um_maxsymlinklen == 0 && datablocks == 0))) { KDASSERT(length == 0); memset(SHORTLINK(oip), 0, (size_t)oip->i_size); oip->i_size = 0; DIP_ASSIGN(oip, size, 0); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ffs_update(ovp, NULL, NULL, 0)); } if (oip->i_size == length) { /* still do a uvm_vnp_setsize() as writesize may be larger */ uvm_vnp_setsize(ovp, length); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ffs_update(ovp, NULL, NULL, 0)); } if (length > ump->um_maxfilesize) return (EFBIG); if ((oip->i_flags & SF_SNAPSHOT) != 0) ffs_snapremove(ovp); osize = oip->i_size; aflag = ioflag & IO_SYNC ? B_SYNC : 0; /* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1. */ if (osize < length) { if (ffs_lblkno(fs, osize) < UFS_NDADDR && ffs_lblkno(fs, osize) != ffs_lblkno(fs, length) && ffs_blkroundup(fs, osize) != osize) { off_t eob; eob = ffs_blkroundup(fs, osize); uvm_vnp_setwritesize(ovp, eob); error = ufs_balloc_range(ovp, osize, eob - osize, cred, aflag); if (error) { (void) ffs_truncate(ovp, osize, ioflag & IO_SYNC, cred); return error; } if (ioflag & IO_SYNC) { rw_enter(ovp->v_uobj.vmobjlock, RW_WRITER); VOP_PUTPAGES(ovp, trunc_page(osize & fs->fs_bmask), round_page(eob), PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); } } uvm_vnp_setwritesize(ovp, length); error = ufs_balloc_range(ovp, length - 1, 1, cred, aflag); if (error) { (void) ffs_truncate(ovp, osize, ioflag & IO_SYNC, cred); return (error); } uvm_vnp_setsize(ovp, length); oip->i_flag |= IN_CHANGE | IN_UPDATE; KASSERT(ovp->v_size == oip->i_size); return (ffs_update(ovp, NULL, NULL, 0)); } /* * When truncating a regular file down to a non-block-aligned size, * we must zero the part of last block which is past the new EOF. * We must synchronously flush the zeroed pages to disk * since the new pages will be invalidated as soon as we * inform the VM system of the new, smaller size. * We must do this before acquiring the GLOCK, since fetching * the pages will acquire the GLOCK internally. * So there is a window where another thread could see a whole * zeroed page past EOF, but that's life. 
*/ offset = ffs_blkoff(fs, length); pgoffset = length & PAGE_MASK; if (ovp->v_type == VREG && (pgoffset != 0 || offset != 0) && osize > length) { daddr_t lbn; voff_t eoz; int size; if (offset != 0) { error = ufs_balloc_range(ovp, length - 1, 1, cred, aflag); if (error) return error; } lbn = ffs_lblkno(fs, length); size = ffs_blksize(fs, oip, lbn); eoz = MIN(MAX(ffs_lblktosize(fs, lbn) + size, round_page(pgoffset)), osize); ubc_zerorange(&ovp->v_uobj, length, eoz - length, UBC_VNODE_FLAGS(ovp)); if (round_page(eoz) > round_page(length)) { rw_enter(ovp->v_uobj.vmobjlock, RW_WRITER); error = VOP_PUTPAGES(ovp, round_page(length), round_page(eoz), PGO_CLEANIT | PGO_DEACTIVATE | PGO_JOURNALLOCKED | ((ioflag & IO_SYNC) ? PGO_SYNCIO : 0)); if (error) return error; } } genfs_node_wrlock(ovp); oip->i_size = length; DIP_ASSIGN(oip, size, length); uvm_vnp_setsize(ovp, length); /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = ffs_lblkno(fs, length + fs->fs_bsize - 1) - 1; lastiblock[SINGLE] = lastblock - UFS_NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - FFS_NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - FFS_NINDIR(fs) * FFS_NINDIR(fs); nblocks = btodb(fs->fs_bsize); /* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ffs_indirtrunc below. */ sync = 0; for (level = TRIPLE; level >= SINGLE; level--) { blks[UFS_NDADDR + level] = DIP(oip, ib[level]); if (lastiblock[level] < 0 && blks[UFS_NDADDR + level] != 0) { sync = 1; DIP_ASSIGN(oip, ib[level], 0); lastiblock[level] = -1; } } for (i = 0; i < UFS_NDADDR; i++) { blks[i] = DIP(oip, db[i]); if (i > lastblock && blks[i] != 0) { sync = 1; DIP_ASSIGN(oip, db[i], 0); } } oip->i_flag |= IN_CHANGE | IN_UPDATE; if (sync) { error = ffs_update(ovp, NULL, NULL, UPDATE_WAIT); if (error && !allerror) allerror = error; } /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ for (i = 0; i < UFS_NDADDR; i++) { bn = DIP(oip, db[i]); DIP_ASSIGN(oip, db[i], blks[i]); blks[i] = bn; } for (i = 0; i < UFS_NIADDR; i++) { bn = DIP(oip, ib[i]); DIP_ASSIGN(oip, ib[i], blks[UFS_NDADDR + i]); blks[UFS_NDADDR + i] = bn; } oip->i_size = osize; DIP_ASSIGN(oip, size, osize); error = vtruncbuf(ovp, lastblock + 1, 0, 0); if (error && !allerror) allerror = error; /* * Indirect blocks first. 
*/ indir_lbn[SINGLE] = -UFS_NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - FFS_NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - FFS_NINDIR(fs) * FFS_NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = ffs_getib(fs, oip, level); if (bn != 0) { if (lastiblock[level] < 0 && oip->i_ump->um_mountp->mnt_wapbl) { error = UFS_WAPBL_REGISTER_DEALLOCATION( oip->i_ump->um_mountp, FFS_FSBTODB(fs, bn), fs->fs_bsize, &dcookie); if (error) goto out; } else { dcookie = NULL; } error = ffs_indirtrunc(oip, indir_lbn[level], FFS_FSBTODB(fs, bn), lastiblock[level], level, &blocksreleased); if (error) { if (dcookie) { UFS_WAPBL_UNREGISTER_DEALLOCATION( oip->i_ump->um_mountp, dcookie); } goto out; } if (lastiblock[level] < 0) { if (!dcookie) ffs_blkfree(fs, oip->i_devvp, bn, fs->fs_bsize, oip->i_number); DIP_ASSIGN(oip, ib[level], 0); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ for (i = UFS_NDADDR - 1; i > lastblock; i--) { bn = ffs_getdb(fs, oip, i); if (bn == 0) continue; bsize = ffs_blksize(fs, oip, i); if ((oip->i_ump->um_mountp->mnt_wapbl) && (ovp->v_type != VREG)) { error = UFS_WAPBL_REGISTER_DEALLOCATION( oip->i_ump->um_mountp, FFS_FSBTODB(fs, bn), bsize, NULL); if (error) goto out; } else ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number); DIP_ASSIGN(oip, db[i], 0); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = ffs_getdb(fs, oip, lastblock); if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = ffs_blksize(fs, oip, lastblock); oip->i_size = length; DIP_ASSIGN(oip, size, length); newspace = ffs_blksize(fs, oip, lastblock); if (newspace == 0) panic("itrunc: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. */ bn += ffs_numfrags(fs, newspace); if ((oip->i_ump->um_mountp->mnt_wapbl) && (ovp->v_type != VREG)) { error = UFS_WAPBL_REGISTER_DEALLOCATION( oip->i_ump->um_mountp, FFS_FSBTODB(fs, bn), oldspace - newspace, NULL); if (error) goto out; } else ffs_blkfree(fs, oip->i_devvp, bn, oldspace - newspace, oip->i_number); blocksreleased += btodb(oldspace - newspace); } } done: for (level = SINGLE; level <= TRIPLE; level++) KASSERTMSG((blks[UFS_NDADDR + level] == DIP(oip, ib[level])), "itrunc1 blk mismatch: %jx != %jx", (uintmax_t)blks[UFS_NDADDR + level], (uintmax_t)DIP(oip, ib[level])); for (i = 0; i < UFS_NDADDR; i++) KASSERTMSG((blks[i] == DIP(oip, db[i])), "itrunc2 blk mismatch: %jx != %jx", (uintmax_t)blks[i], (uintmax_t)DIP(oip, db[i])); KASSERTMSG((length != 0 || extblocks || LIST_EMPTY(&ovp->v_cleanblkhd)), "itrunc3: zero length and nonempty cleanblkhd"); KASSERTMSG((length != 0 || extblocks || LIST_EMPTY(&ovp->v_dirtyblkhd)), "itrunc3: zero length and nonempty dirtyblkhd"); out: /* * Set length back to old size if deallocation failed. Some indirect * blocks were deallocated creating a hole, but that is okay. */ if (error == EAGAIN) { if (!allerror) allerror = error; length = osize; uvm_vnp_setsize(ovp, length); } /* * Put back the real size. 
*/ oip->i_size = length; DIP_ASSIGN(oip, size, length); DIP_ADD(oip, blocks, -blocksreleased); genfs_node_unlock(ovp); oip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(ovp, NULL, NULL, 0); #if defined(QUOTA) || defined(QUOTA2) (void) chkdq(oip, -blocksreleased, NOCRED, 0); #endif KASSERT(ovp->v_type != VREG || ovp->v_size == oip->i_size); return (allerror); } /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * NB: triple indirect blocks are untested. */ static int ffs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn, int level, int64_t *countp) { int i; struct buf *bp; struct fs *fs = ip->i_fs; int32_t *bap1 = NULL; int64_t *bap2 = NULL; struct vnode *vp; daddr_t nb, nlbn, last; char *copy = NULL; int64_t factor; int64_t nblocks; int error = 0, allerror = 0; const int needswap = UFS_FSNEEDSWAP(fs); const int wapbl = (ip->i_ump->um_mountp->mnt_wapbl != NULL); void *dcookie; #define RBAP(ip, i) (((ip)->i_ump->um_fstype == UFS1) ? \ ufs_rw32(bap1[i], needswap) : ufs_rw64(bap2[i], needswap)) #define BAP_ASSIGN(ip, i, value) \ do { \ if ((ip)->i_ump->um_fstype == UFS1) \ bap1[i] = (value); \ else \ bap2[i] = (value); \ } while(0) /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = 1; for (i = SINGLE; i < level; i++) factor *= FFS_NINDIR(fs); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->fs_bsize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have * the on disk address, so we have to set the b_blkno field * explicitly instead of letting bread do everything for us. */ vp = ITOV(ip); error = ffs_getblk(vp, lbn, FFS_NOBLK, fs->fs_bsize, false, &bp); if (error) return error; if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { /* Braces must be here in case trace evaluates to nothing. */ trace(TR_BREADHIT, pack(vp, fs->fs_bsize), lbn); } else { trace(TR_BREADMISS, pack(vp, fs->fs_bsize), lbn); curlwp->l_ru.ru_inblock++; /* pay for read */ bp->b_flags |= B_READ; bp->b_flags &= ~B_COWDONE; /* we change blkno below */ if (bp->b_bcount > bp->b_bufsize) panic("ffs_indirtrunc: bad buffer size"); bp->b_blkno = dbn; BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); VOP_STRATEGY(vp, bp); error = biowait(bp); if (error == 0) error = fscow_run(bp, true); } if (error) { brelse(bp, 0); return error; } /* * Clear reference to blocks to be removed on disk, before actually * reclaiming them, so that fsck is more likely to be able to recover * the filesystem if system goes down during the truncate process. * This assumes the truncate process would not fail, contrary * to the wapbl case. */ if (ip->i_ump->um_fstype == UFS1) bap1 = (int32_t *)bp->b_data; else bap2 = (int64_t *)bp->b_data; if (lastbn >= 0 && !wapbl) { copy = kmem_alloc(fs->fs_bsize, KM_SLEEP); memcpy((void *)copy, bp->b_data, (u_int)fs->fs_bsize); for (i = last + 1; i < FFS_NINDIR(fs); i++) BAP_ASSIGN(ip, i, 0); error = bwrite(bp); if (error) allerror = error; if (ip->i_ump->um_fstype == UFS1) bap1 = (int32_t *)copy; else bap2 = (int64_t *)copy; } /* * Recursively free totally unused blocks. 
*/ for (i = FFS_NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = RBAP(ip, i); if (nb == 0) continue; if ((ip->i_ump->um_mountp->mnt_wapbl) && ((level > SINGLE) || (ITOV(ip)->v_type != VREG))) { error = UFS_WAPBL_REGISTER_DEALLOCATION( ip->i_ump->um_mountp, FFS_FSBTODB(fs, nb), fs->fs_bsize, &dcookie); if (error) goto out; } else { dcookie = NULL; } if (level > SINGLE) { error = ffs_indirtrunc(ip, nlbn, FFS_FSBTODB(fs, nb), (daddr_t)-1, level - 1, countp); if (error) { if (dcookie) { UFS_WAPBL_UNREGISTER_DEALLOCATION( ip->i_ump->um_mountp, dcookie); } goto out; } } if (!dcookie) ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize, ip->i_number); BAP_ASSIGN(ip, i, 0); *countp += nblocks; } /* * Recursively free blocks on the now last partial indirect block. */ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = RBAP(ip, i); if (nb != 0) { error = ffs_indirtrunc(ip, nlbn, FFS_FSBTODB(fs, nb), last, level - 1, countp); if (error) goto out; } } out: if (error && !allerror) allerror = error; if (copy != NULL) { kmem_free(copy, fs->fs_bsize); } else if (lastbn < 0 && error == 0) { /* all freed, release without writing back */ brelse(bp, BC_INVAL); } else if (wapbl) { /* only partially freed, write the updated block */ error = bwrite(bp); if (!allerror) allerror = error; } return (allerror); } void ffs_itimes(struct inode *ip, const struct timespec *acc, const struct timespec *mod, const struct timespec *cre) { struct timespec now; if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) { return; } vfs_timestamp(&now); if (ip->i_flag & IN_ACCESS) { if (acc == NULL) acc = &now; DIP_ASSIGN(ip, atime, acc->tv_sec); DIP_ASSIGN(ip, atimensec, acc->tv_nsec); } if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) { if ((ip->i_flags & SF_SNAPSHOT) == 0) { if (mod == NULL) mod = &now; DIP_ASSIGN(ip, mtime, mod->tv_sec); DIP_ASSIGN(ip, mtimensec, mod->tv_nsec); } ip->i_modrev++; } if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) { if (cre == NULL) cre = &now; DIP_ASSIGN(ip, ctime, cre->tv_sec); DIP_ASSIGN(ip, ctimensec, cre->tv_nsec); } if (ip->i_flag & (IN_ACCESS | IN_MODIFY)) ip->i_flag |= IN_ACCESSED; if (ip->i_flag & (IN_UPDATE | IN_CHANGE)) ip->i_flag |= IN_MODIFIED; ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY); }
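To make the index arithmetic at the heart of ffs_truncate() concrete, here is a worked example. The parameters are assumptions chosen for illustration (an 8192-byte block size and 12 direct block pointers), not values read from this file.

/*
 * Truncating to length = 100000 with fs_bsize = 8192 and UFS_NDADDR = 12:
 *
 *   lastblock          = ffs_lblkno(fs, 100000 + 8192 - 1) - 1
 *                      = (108191 / 8192) - 1 = 13 - 1 = 12,
 *                        i.e. block 12 holds byte 99999, the last byte kept.
 *   lastiblock[SINGLE] = 12 - 12 = 0, so the single indirect block survives
 *                        but only its first entry is still referenced.
 *   lastiblock[DOUBLE] = 0 - FFS_NINDIR(fs), which is negative, and
 *   lastiblock[TRIPLE] is negative as well, so the double and triple
 *                        indirect chains are freed whole; those levels reach
 *                        ffs_indirtrunc() with lastbn normalized to -1, as
 *                        the comments above note.
 */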
/* $NetBSD: dtrace_bsd.h,v 1.9 2018/04/19 21:19:07 christos Exp $ */ /*- * Copyright (c) 2007-2008 John Birrell (jb@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: src/sys/sys/dtrace_bsd.h,v 1.3.2.1 2009/08/03 08:13:06 kensmith Exp $ * * This file contains BSD shims for Sun's DTrace code. */ #ifndef _SYS_DTRACE_BSD_H #define _SYS_DTRACE_BSD_H #if defined(_KERNEL_OPT) #include "opt_dtrace.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/proc.h> /* Forward definitions: */ struct mbuf; struct trapframe; struct lwp; struct vattr; struct vnode; struct ucred; /* * Cyclic clock function type definition used to hook the cyclic * subsystem into the appropriate timer interrupt. */ typedef void (*cyclic_clock_func_t)(struct clockframe *); extern cyclic_clock_func_t cyclic_clock_func[]; /* * The dtrace module handles traps that occur during a DTrace probe. * This type definition is used in the trap handler to provide a * hook for the dtrace module to register its handler with. */ typedef int (*dtrace_trap_func_t)(struct trapframe *, u_int); int dtrace_trap(struct trapframe *, u_int); extern dtrace_trap_func_t dtrace_trap_func; /* Used by the machine dependent trap() code.
*/ typedef int (*dtrace_invop_func_t)(uintptr_t, uintptr_t *, uintptr_t); typedef void (*dtrace_doubletrap_func_t)(void); /* Global variables in trap.c */ extern dtrace_invop_func_t dtrace_invop_func; extern dtrace_doubletrap_func_t dtrace_doubletrap_func; /* Virtual time hook function type. */ typedef void (*dtrace_vtime_switch_func_t)(struct lwp *); extern int dtrace_vtime_active; extern dtrace_vtime_switch_func_t dtrace_vtime_switch_func; /* The fasttrap module hooks into the fork, exit and exit. */ typedef void (*dtrace_fork_func_t)(struct proc *, struct proc *); typedef void (*dtrace_execexit_func_t)(struct proc *); /* Global variable in kern_fork.c */ extern dtrace_fork_func_t dtrace_fasttrap_fork; /* Global variable in kern_exec.c */ extern dtrace_execexit_func_t dtrace_fasttrap_exec; /* Global variable in kern_exit.c */ extern dtrace_execexit_func_t dtrace_fasttrap_exit; /* The dtmalloc provider hooks into malloc. */ typedef void (*dtrace_malloc_probe_func_t)(u_int32_t, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4); extern dtrace_malloc_probe_func_t dtrace_malloc_probe; /* dtnfsclient NFSv3 access cache provider hooks. */ typedef void (*dtrace_nfsclient_accesscache_flush_probe_func_t)(uint32_t, struct vnode *); extern dtrace_nfsclient_accesscache_flush_probe_func_t dtrace_nfsclient_accesscache_flush_done_probe; typedef void (*dtrace_nfsclient_accesscache_get_probe_func_t)(uint32_t, struct vnode *, uid_t, uint32_t); extern dtrace_nfsclient_accesscache_get_probe_func_t dtrace_nfsclient_accesscache_get_hit_probe, dtrace_nfsclient_accesscache_get_miss_probe; typedef void (*dtrace_nfsclient_accesscache_load_probe_func_t)(uint32_t, struct vnode *, uid_t, uint32_t, int); extern dtrace_nfsclient_accesscache_load_probe_func_t dtrace_nfsclient_accesscache_load_done_probe; /* dtnfsclient NFSv[23] attribute cache provider hooks. */ typedef void (*dtrace_nfsclient_attrcache_flush_probe_func_t)(uint32_t, struct vnode *); extern dtrace_nfsclient_attrcache_flush_probe_func_t dtrace_nfsclient_attrcache_flush_done_probe; typedef void (*dtrace_nfsclient_attrcache_get_hit_probe_func_t)(uint32_t, struct vnode *, struct vattr *); extern dtrace_nfsclient_attrcache_get_hit_probe_func_t dtrace_nfsclient_attrcache_get_hit_probe; typedef void (*dtrace_nfsclient_attrcache_get_miss_probe_func_t)(uint32_t, struct vnode *); extern dtrace_nfsclient_attrcache_get_miss_probe_func_t dtrace_nfsclient_attrcache_get_miss_probe; typedef void (*dtrace_nfsclient_attrcache_load_probe_func_t)(uint32_t, struct vnode *, struct vattr *, int); extern dtrace_nfsclient_attrcache_load_probe_func_t dtrace_nfsclient_attrcache_load_done_probe; /* dtnfsclient NFSv[23] RPC provider hooks. */ typedef void (*dtrace_nfsclient_nfs23_start_probe_func_t)(uint32_t, struct vnode *, struct mbuf *, struct ucred *, int); extern dtrace_nfsclient_nfs23_start_probe_func_t dtrace_nfsclient_nfs23_start_probe; typedef void (*dtrace_nfsclient_nfs23_done_probe_func_t)(uint32_t, struct vnode *, struct mbuf *, struct ucred *, int, int); extern dtrace_nfsclient_nfs23_done_probe_func_t dtrace_nfsclient_nfs23_done_probe; /* * OpenSolaris compatible time functions returning nanoseconds. * On OpenSolaris these return hrtime_t which we define as uint64_t. 
*/ uint64_t dtrace_gethrtime(void); uint64_t dtrace_gethrestime(void); /* sizes based on DTrace structure requirements */ #define KDTRACE_PROC_SIZE 64 #define KDTRACE_PROC_ZERO 8 #define KDTRACE_THREAD_SIZE 256 #define KDTRACE_THREAD_ZERO 64 /* * Functions for managing the opaque DTrace memory areas for * processes and lwps. */ static __inline size_t kdtrace_proc_size(void); static __inline void kdtrace_proc_ctor(void *, struct proc *); static __inline void kdtrace_proc_dtor(void *, struct proc *); static __inline size_t kdtrace_thread_size(void); static __inline void kdtrace_thread_ctor(void *, struct lwp *); static __inline void kdtrace_thread_dtor(void *, struct lwp *); /* Return the DTrace process data size compiled in the kernel hooks. */ static __inline size_t kdtrace_proc_size(void) { return KDTRACE_PROC_SIZE; } /* Return the DTrace thread data size compiled in the kernel hooks. */ static __inline size_t kdtrace_thread_size(void) { return KDTRACE_THREAD_SIZE; } static __inline void kdtrace_proc_ctor(void *arg, struct proc *p) { #ifdef KDTRACE_HOOKS p->p_dtrace = kmem_zalloc(KDTRACE_PROC_SIZE, KM_SLEEP); #endif } static __inline void kdtrace_proc_dtor(void *arg, struct proc *p) { #ifdef KDTRACE_HOOKS if (p->p_dtrace != NULL) { kmem_free(p->p_dtrace, KDTRACE_PROC_SIZE); p->p_dtrace = NULL; } #endif } static __inline void kdtrace_thread_ctor(void *arg, struct lwp *l) { #ifdef KDTRACE_HOOKS l->l_dtrace = kmem_zalloc(KDTRACE_THREAD_SIZE, KM_SLEEP); #endif } static __inline void kdtrace_thread_dtor(void *arg, struct lwp *l) { #ifdef KDTRACE_HOOKS if (l->l_dtrace != NULL) { kmem_free(l->l_dtrace, KDTRACE_THREAD_SIZE); l->l_dtrace = NULL; } #endif } #endif /* _SYS_DTRACE_BSD_H */
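/*
 * Illustrative sketch, not part of the header above: how the inline
 * constructor/destructor helpers are intended to be paired.  A process or
 * LWP creation path allocates the opaque DTrace area with the ctor and the
 * corresponding teardown path releases it with the dtor; when the kernel is
 * built without KDTRACE_HOOKS all four calls compile away to nothing.  The
 * functions example_proc_setup() and example_proc_teardown() are
 * hypothetical names used only for this sketch.
 */
static void
example_proc_setup(struct proc *p, struct lwp *l)
{

	kdtrace_proc_ctor(NULL, p);	/* p->p_dtrace gets a zeroed area */
	kdtrace_thread_ctor(NULL, l);	/* l->l_dtrace gets a zeroed area */
}

static void
example_proc_teardown(struct proc *p, struct lwp *l)
{

	kdtrace_thread_dtor(NULL, l);	/* frees and clears l->l_dtrace */
	kdtrace_proc_dtor(NULL, p);	/* frees and clears p->p_dtrace */
}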
/* $NetBSD: overlay_vfsops.c,v 1.73 2022/11/04 11:20:39 hannken Exp $ */ /* * Copyright (c) 1999, 2000 National Aeronautics & Space Administration * All rights reserved. * * This software was written by William Studenmund of the * Numerical Aerospace Simulation Facility, NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the National Aeronautics & Space Administration * nor the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB- * UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp * from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92 * @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95 */ /* * Overlay Layer * (See overlay_vnops.c for a description of what this does.) */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: overlay_vfsops.c,v 1.73 2022/11/04 11:20:39 hannken Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/time.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/module.h> #include <miscfs/overlay/overlay.h> #include <miscfs/genfs/layer_extern.h> MODULE(MODULE_CLASS_VFS, overlay, "layerfs"); VFS_PROTOS(ov); #define NOVERLAYNODECACHE 16 /* * Mount overlay layer */ int ov_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; int error = 0; struct overlay_args *args = data; struct vnode *lowerrootvp, *vp; struct overlay_mount *nmp; struct layer_mount *lmp; #ifdef OVERLAYFS_DIAGNOSTIC printf("ov_mount(mp = %p)\n", mp); #endif if (args == NULL) return EINVAL; if (*data_len < sizeof *args) return EINVAL; if (mp->mnt_flag & MNT_GETARGS) { lmp = MOUNTTOLAYERMOUNT(mp); if (lmp == NULL) return EIO; args->la.target = NULL; *data_len = sizeof *args; return 0; } /* * Update is not supported */ if (mp->mnt_flag & MNT_UPDATE) return EOPNOTSUPP; /* * Find lower node */ lowerrootvp = mp->mnt_vnodecovered; vref(lowerrootvp); if ((error = vn_lock(lowerrootvp, LK_EXCLUSIVE))) { vrele(lowerrootvp); return (error); } /* * First cut at fixing up upper mount point */ nmp = kmem_zalloc(sizeof(struct overlay_mount), KM_SLEEP); mp->mnt_data = nmp; /* * Make sure that the mount point is sufficiently initialized * that the node create call will work. */ vfs_getnewfsid(mp); error = vfs_set_lowermount(mp, lowerrootvp->v_mount); if (error) { vput(lowerrootvp); kmem_free(nmp, sizeof(struct overlay_mount)); return error; } nmp->ovm_size = sizeof (struct overlay_node); nmp->ovm_tag = VT_OVERLAY; nmp->ovm_bypass = layer_bypass; nmp->ovm_vnodeop_p = overlay_vnodeop_p; /* * Fix up overlay node for root vnode */ VOP_UNLOCK(lowerrootvp); error = layer_node_create(mp, lowerrootvp, &vp); /* * Make sure the fixup worked */ if (error) { vrele(lowerrootvp); kmem_free(nmp, sizeof(struct overlay_mount)); return error; } /* * Keep a held reference to the root vnode. 
* It is vrele'd in ov_unmount. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_ROOT; nmp->ovm_rootvp = vp; VOP_UNLOCK(vp); error = set_statvfs_info(path, UIO_USERSPACE, args->la.target, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); if (error) return error; if (mp->mnt_lower->mnt_flag & MNT_LOCAL) mp->mnt_flag |= MNT_LOCAL; #ifdef OVERLAYFS_DIAGNOSTIC printf("ov_mount: lower %s, alias at %s\n", mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); #endif return 0; } /* * Free reference to overlay layer */ int ov_unmount(struct mount *mp, int mntflags) { struct vnode *overlay_rootvp = MOUNTTOOVERLAYMOUNT(mp)->ovm_rootvp; struct overlay_mount *omp; int error; int flags = 0; #ifdef OVERLAYFS_DIAGNOSTIC printf("ov_unmount(mp = %p)\n", mp); #endif if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if (vrefcnt(overlay_rootvp) > 1 && (mntflags & MNT_FORCE) == 0) return (EBUSY); if ((error = vflush(mp, overlay_rootvp, flags)) != 0) return (error); #ifdef OVERLAYFS_DIAGNOSTIC vprint("alias root of lower", overlay_rootvp); #endif /* * Blow it away for future re-use */ vgone(overlay_rootvp); /* * Finally, throw away the overlay_mount structure */ omp = mp->mnt_data; kmem_free(omp, sizeof(struct overlay_mount)); mp->mnt_data = NULL; return 0; } extern const struct vnodeopv_desc overlay_vnodeop_opv_desc; const struct vnodeopv_desc * const ov_vnodeopv_descs[] = { &overlay_vnodeop_opv_desc, NULL, }; struct vfsops overlay_vfsops = { .vfs_name = MOUNT_OVERLAY, .vfs_min_mount_data = sizeof (struct overlay_args), .vfs_mount = ov_mount, .vfs_start = layerfs_start, .vfs_unmount = ov_unmount, .vfs_root = layerfs_root, .vfs_quotactl = layerfs_quotactl, .vfs_statvfs = layerfs_statvfs, .vfs_sync = layerfs_sync, .vfs_loadvnode = layerfs_loadvnode, .vfs_vget = layerfs_vget, .vfs_fhtovp = layerfs_fhtovp, .vfs_vptofh = layerfs_vptofh, .vfs_init = layerfs_init, .vfs_done = layerfs_done, .vfs_snapshot = layerfs_snapshot, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = layerfs_suspendctl, .vfs_renamelock_enter = layerfs_renamelock_enter, .vfs_renamelock_exit = layerfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = ov_vnodeopv_descs }; SYSCTL_SETUP(overlay_sysctl_setup, "overlay fs sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "overlay", SYSCTL_DESCR("Overlay file system"), NULL, 0, NULL, 0, CTL_VFS, CTL_CREATE, CTL_EOL); } static int overlay_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&overlay_vfsops); if (error != 0) break; break; case MODULE_CMD_FINI: error = vfs_detach(&overlay_vfsops); if (error != 0) break; break; default: error = ENOTTY; break; } return (error); }
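/*
 * Illustrative sketch, not part of the file above: the minimal module
 * command pattern that overlay_modcmd() follows.  A VFS module only needs
 * to attach its vfsops table on MODULE_CMD_INIT and detach it on
 * MODULE_CMD_FINI; any other command is rejected.  The name "examplefs"
 * and the example_vfsops table are assumptions made for this sketch only.
 */
extern struct vfsops example_vfsops;	/* hypothetical vfsops table */

static int
examplefs_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		return vfs_attach(&example_vfsops);
	case MODULE_CMD_FINI:
		return vfs_detach(&example_vfsops);
	default:
		return ENOTTY;
	}
}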
/* $NetBSD: kern_exit.c,v 1.298 2023/10/08 12:38:58 ad Exp $ */ /*- * Copyright (c) 1998, 1999, 2006, 2007, 2008, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_exit.c 8.10 (Berkeley) 2/23/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_exit.c,v 1.298 2023/10/08 12:38:58 ad Exp $"); #include "opt_ktrace.h" #include "opt_dtrace.h" #include "opt_sysv.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/ioctl.h> #include <sys/tty.h> #include <sys/time.h> #include <sys/resource.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/buf.h> #include <sys/wait.h> #include <sys/file.h> #include <sys/fstrans.h> #include <sys/vnode.h> #include <sys/syslog.h> #include <sys/pool.h> #include <sys/uidinfo.h> #include <sys/ptrace.h> #include <sys/acct.h> #include <sys/filedesc.h> #include <sys/ras.h> #include <sys/signalvar.h> #include <sys/sched.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <sys/kauth.h> #include <sys/sleepq.h> #include <sys/lock.h> #include <sys/lockdebug.h> #include <sys/ktrace.h> #include <sys/cpu.h> #include <sys/lwpctl.h> #include <sys/atomic.h> #include <sys/sdt.h> #include <sys/psref.h> #include <uvm/uvm_extern.h> #ifdef DEBUG_EXIT int debug_exit = 0; #define DPRINTF(x) if (debug_exit) printf x #else #define DPRINTF(x) #endif static int find_stopped_child(struct proc *, idtype_t, id_t, int, struct proc **, struct wrusage *, siginfo_t *); static void proc_free(struct proc *, struct wrusage *); /* * DTrace SDT provider definitions */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, kernel, , exit, "int"); /* * Fill in the appropriate signal information, and signal the parent. */ /* XXX noclone works around a gcc 4.5 bug on arm */ static void __noclone exit_psignal(struct proc *p, struct proc *pp, ksiginfo_t *ksi) { KSI_INIT(ksi); if ((ksi->ksi_signo = P_EXITSIG(p)) == SIGCHLD) { if (p->p_xsig) { if (p->p_sflag & PS_COREDUMP) ksi->ksi_code = CLD_DUMPED; else ksi->ksi_code = CLD_KILLED; ksi->ksi_status = p->p_xsig; } else { ksi->ksi_code = CLD_EXITED; ksi->ksi_status = p->p_xexit; } } else { ksi->ksi_code = SI_USER; ksi->ksi_status = p->p_xsig; } /* * We fill those in, even for non-SIGCHLD. * It's safe to access p->p_cred unlocked here. */ ksi->ksi_pid = p->p_pid; ksi->ksi_uid = kauth_cred_geteuid(p->p_cred); /* XXX: is this still valid? */ ksi->ksi_utime = p->p_stats->p_ru.ru_utime.tv_sec; ksi->ksi_stime = p->p_stats->p_ru.ru_stime.tv_sec; } /* * exit -- * Death of process. */ int sys_exit(struct lwp *l, const struct sys_exit_args *uap, register_t *retval) { /* { syscallarg(int) rval; } */ struct proc *p = l->l_proc; /* Don't call exit1() multiple times in the same process. */ mutex_enter(p->p_lock); if (p->p_sflag & PS_WEXIT) { mutex_exit(p->p_lock); lwp_exit(l); } /* exit1() will release the mutex. */ exit1(l, SCARG(uap, rval), 0); /* NOTREACHED */ return (0); } /* * Exit: deallocate address space and other resources, change proc state * to zombie, and unlink proc from allproc and parent's lists. Save exit * status and rusage for wait(). Check for child processes and orphan them. * * Must be called with p->p_lock held. Does not return. */ void exit1(struct lwp *l, int exitcode, int signo) { struct proc *p, *child, *next_child, *old_parent, *new_parent; struct pgrp *pgrp; ksiginfo_t ksi; ksiginfoq_t kq; int wakeinit; p = l->l_proc; /* Verify that we hold no locks other than p->p_lock. */ LOCKDEBUG_BARRIER(p->p_lock, 0); /* XXX Temporary: something is leaking kernel_lock. 
*/ KERNEL_UNLOCK_ALL(l, NULL); KASSERT(mutex_owned(p->p_lock)); KASSERT(p->p_vmspace != NULL); if (__predict_false(p == initproc)) { panic("init died (signal %d, exit %d)", signo, exitcode); } p->p_sflag |= PS_WEXIT; /* * Force all other LWPs to exit before we do. Only then can we * begin to tear down the rest of the process state. */ if (p->p_nlwps > 1) { exit_lwps(l); } ksiginfo_queue_init(&kq); /* * If we have been asked to stop on exit, do so now. */ if (__predict_false(p->p_sflag & PS_STOPEXIT)) { KASSERT(l->l_blcnt == 0); sigclearall(p, &contsigmask, &kq); if (!mutex_tryenter(&proc_lock)) { mutex_exit(p->p_lock); mutex_enter(&proc_lock); mutex_enter(p->p_lock); } p->p_waited = 0; p->p_pptr->p_nstopchild++; p->p_stat = SSTOP; mutex_exit(&proc_lock); lwp_lock(l); p->p_nrlwps--; l->l_stat = LSSTOP; lwp_unlock(l); mutex_exit(p->p_lock); lwp_lock(l); spc_lock(l->l_cpu); mi_switch(l); mutex_enter(p->p_lock); } /* * Bin any remaining signals and mark the process as dying so it will * not be found for, e.g. signals. */ sigfillset(&p->p_sigctx.ps_sigignore); sigclearall(p, NULL, &kq); p->p_stat = SDYING; /* * Perform any required thread cleanup. Do this early so * anyone wanting to look us up by our global thread ID * will fail to find us. * * N.B. this will unlock p->p_lock on our behalf. */ lwp_thread_cleanup(l); ksiginfo_queue_drain(&kq); /* Destroy any lwpctl info. */ if (p->p_lwpctl != NULL) lwp_ctl_exit(); /* * Drain all remaining references that procfs, ptrace and others may * have on the process. */ rw_enter(&p->p_reflock, RW_WRITER); DPRINTF(("%s: %d.%d exiting.\n", __func__, p->p_pid, l->l_lid)); ptimers_free(p, TIMERS_ALL); #if defined(__HAVE_RAS) ras_purgeall(); #endif /* * Close open files, release open-file table and free signal * actions. This may block! */ fd_free(); cwdfree(p->p_cwdi); p->p_cwdi = NULL; doexithooks(p); sigactsfree(p->p_sigacts); /* * Write out accounting data. */ (void)acct_process(l); #ifdef KTRACE /* * Release trace file. */ if (p->p_tracep != NULL) { mutex_enter(&ktrace_lock); ktrderef(p); mutex_exit(&ktrace_lock); } #endif p->p_xexit = exitcode; p->p_xsig = signo; /* * If emulation has process exit hook, call it now. * Set the exit status now so that the exit hook has * an opportunity to tweak it (COMPAT_LINUX requires * this for thread group emulation) */ if (p->p_emul->e_proc_exit) (*p->p_emul->e_proc_exit)(p); /* * Free the VM resources we're still holding on to. * We must do this from a valid thread because doing * so may block. This frees vmspace, which we don't * need anymore. The only remaining lwp is the one * we run at this moment, nothing runs in userland * anymore. */ ruspace(p); /* Update our vm resource use */ uvm_proc_exit(p); /* * Stop profiling. */ if (__predict_false((p->p_stflag & PST_PROFIL) != 0)) { mutex_spin_enter(&p->p_stmutex); stopprofclock(p); mutex_spin_exit(&p->p_stmutex); } /* * If parent is waiting for us to exit or exec, PL_PPWAIT is set; we * wake up the parent early to avoid deadlock. We can do this once * the VM resources are released. */ mutex_enter(&proc_lock); if (p->p_lflag & PL_PPWAIT) { lwp_t *lp; l->l_lwpctl = NULL; /* was on loan from blocked parent */ p->p_lflag &= ~PL_PPWAIT; lp = p->p_vforklwp; p->p_vforklwp = NULL; lp->l_vforkwaiting = false; cv_broadcast(&lp->l_waitcv); } if (SESS_LEADER(p)) { struct vnode *vprele = NULL, *vprevoke = NULL; struct session *sp = p->p_session; struct tty *tp; if (sp->s_ttyvp) { /* * Controlling process. 
* Signal foreground pgrp, * drain controlling terminal * and revoke access to controlling terminal. */ tp = sp->s_ttyp; mutex_spin_enter(&tty_lock); if (tp->t_session == sp) { /* we can't guarantee the revoke will do this */ pgrp = tp->t_pgrp; tp->t_pgrp = NULL; tp->t_session = NULL; mutex_spin_exit(&tty_lock); if (pgrp != NULL) { pgsignal(pgrp, SIGHUP, 1); } mutex_exit(&proc_lock); (void) ttywait(tp); mutex_enter(&proc_lock); /* The tty could have been revoked. */ vprevoke = sp->s_ttyvp; } else mutex_spin_exit(&tty_lock); vprele = sp->s_ttyvp; sp->s_ttyvp = NULL; /* * s_ttyp is not zero'd; we use this to indicate * that the session once had a controlling terminal. * (for logging and informational purposes) */ } sp->s_leader = NULL; if (vprevoke != NULL || vprele != NULL) { if (vprevoke != NULL) { /* Releases proc_lock. */ proc_sessrele(sp); VOP_REVOKE(vprevoke, REVOKEALL); } else mutex_exit(&proc_lock); if (vprele != NULL) vrele(vprele); mutex_enter(&proc_lock); } } fixjobc(p, p->p_pgrp, 0); /* Release fstrans private data. */ fstrans_lwp_dtor(l); /* * Finalize the last LWP's specificdata, as well as the * specificdata for the proc itself. */ lwp_finispecific(l); proc_finispecific(p); /* * Reset p_opptr pointer of all former children which got * traced by another process and were reparented. We reset * it to NULL here; the trace detach code then reparents * the child to initproc. We only check allproc list, since * eventual former children on zombproc list won't reference * p_opptr anymore. */ if (__predict_false(p->p_slflag & PSL_CHTRACED)) { struct proc *q; PROCLIST_FOREACH(q, &allproc) { if (q->p_opptr == p) q->p_opptr = NULL; } PROCLIST_FOREACH(q, &zombproc) { if (q->p_opptr == p) q->p_opptr = NULL; } } /* * Give orphaned children to init(8). */ child = LIST_FIRST(&p->p_children); wakeinit = (child != NULL); for (; child != NULL; child = next_child) { next_child = LIST_NEXT(child, p_sibling); /* * Traced processes are killed since their existence * means someone is screwing up. Since we reset the * trace flags, the logic in sys_wait4() would not be * triggered to reparent the process to its * original parent, so we must do this here. */ if (__predict_false(child->p_slflag & PSL_TRACED)) { mutex_enter(p->p_lock); child->p_slflag &= ~(PSL_TRACED|PSL_SYSCALL); mutex_exit(p->p_lock); if (child->p_opptr != child->p_pptr) { struct proc *t = child->p_opptr; proc_reparent(child, t ? t : initproc); child->p_opptr = NULL; } else proc_reparent(child, initproc); killproc(child, "orphaned traced process"); } else proc_reparent(child, initproc); } /* * Move proc from allproc to zombproc, it's now nearly ready to be * collected by parent. */ LIST_REMOVE(l, l_list); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); /* * Mark the process as dead. We must do this before we signal * the parent. */ p->p_stat = SDEAD; /* * Let anyone watching this DTrace probe know what we're * on our way out. */ SDT_PROBE(proc, kernel, , exit, ((p->p_sflag & PS_COREDUMP) ? CLD_DUMPED : (p->p_xsig ? CLD_KILLED : CLD_EXITED)), 0,0,0,0); /* Put in front of parent's sibling list for parent to collect it */ old_parent = p->p_pptr; old_parent->p_nstopchild++; if (LIST_FIRST(&old_parent->p_children) != p) { /* Put child where it can be found quickly */ LIST_REMOVE(p, p_sibling); LIST_INSERT_HEAD(&old_parent->p_children, p, p_sibling); } /* * Notify parent that we're gone. If parent has the P_NOCLDWAIT * flag set, notify init instead (and hope it will handle * this situation). 
*/ if (old_parent->p_flag & (PK_NOCLDWAIT|PK_CLDSIGIGN)) { proc_reparent(p, initproc); wakeinit = 1; /* * If this was the last child of our parent, notify * parent, so in case he was wait(2)ing, he will * continue. */ if (LIST_FIRST(&old_parent->p_children) == NULL) cv_broadcast(&old_parent->p_waitcv); } /* Reload parent pointer, since p may have been reparented above */ new_parent = p->p_pptr; if (__predict_false(p->p_exitsig != 0)) { exit_psignal(p, new_parent, &ksi); kpsignal(new_parent, &ksi, NULL); } /* Calculate the final rusage info. */ calcru(p, &p->p_stats->p_ru.ru_utime, &p->p_stats->p_ru.ru_stime, NULL, NULL); callout_destroy(&l->l_timeout_ch); /* * Release any PCU resources before becoming a zombie. */ pcu_discard_all(l); /* * Notify other processes tracking us with a knote that * we're exiting. * * N.B. we do this here because the process is now SDEAD, * and thus cannot have any more knotes attached. Also, * knote_proc_exit() expects that p->p_lock is already * held (and will assert so). */ mutex_enter(p->p_lock); if (!SLIST_EMPTY(&p->p_klist)) { knote_proc_exit(p); } /* Free the LWP ID */ proc_free_lwpid(p, l->l_lid); lwp_drainrefs(l); lwp_lock(l); l->l_prflag &= ~LPR_DETACHED; l->l_stat = LSZOMB; lwp_unlock(l); KASSERT(curlwp == l); KASSERT(p->p_nrlwps == 1); KASSERT(p->p_nlwps == 1); p->p_stat = SZOMB; p->p_nrlwps--; p->p_nzlwps++; p->p_ndlwps = 0; mutex_exit(p->p_lock); /* * Signal the parent to collect us, and drop the proclist lock. * Drop debugger/procfs lock; no new references can be gained. */ rw_exit(&p->p_reflock); cv_broadcast(&p->p_pptr->p_waitcv); mutex_exit(&proc_lock); if (wakeinit) cv_broadcast(&initproc->p_waitcv); /* * NOTE: WE ARE NO LONGER ALLOWED TO SLEEP! */ /* * Give machine-dependent code a chance to free any MD LWP * resources. This must be done before uvm_lwp_exit(), in * case these resources are in the PCB. */ cpu_lwp_free(l, 1); /* Switch away into oblivion. */ lwp_lock(l); spc_lock(l->l_cpu); mi_switch(l); panic("exit1"); } void exit_lwps(struct lwp *l) { proc_t *p = l->l_proc; lwp_t *l2; retry: KASSERT(mutex_owned(p->p_lock)); /* * Interrupt LWPs in interruptable sleep, unsuspend suspended * LWPs and then wait for everyone else to finish. */ LIST_FOREACH(l2, &p->p_lwps, l_sibling) { if (l2 == l) continue; lwp_lock(l2); l2->l_flag |= LW_WEXIT; lwp_need_userret(l2); if ((l2->l_stat == LSSLEEP && (l2->l_flag & LW_SINTR)) || l2->l_stat == LSSUSPENDED || l2->l_stat == LSSTOP) { l2->l_flag &= ~LW_DBGSUSPEND; /* setrunnable() will release the lock. */ setrunnable(l2); continue; } lwp_unlock(l2); } /* * Wait for every LWP to exit. Note: LWPs can get suspended/slept * behind us or there may even be new LWPs created. Therefore, a * full retry is required on error. */ while (p->p_nlwps > 1) { if (lwp_wait(l, 0, NULL, true)) { goto retry; } } KASSERT(p->p_nlwps == 1); } int do_sys_waitid(idtype_t idtype, id_t id, int *pid, int *status, int options, struct wrusage *wru, siginfo_t *si) { proc_t *child; int error; if (wru != NULL) memset(wru, 0, sizeof(*wru)); if (si != NULL) memset(si, 0, sizeof(*si)); mutex_enter(&proc_lock); error = find_stopped_child(curproc, idtype, id, options, &child, wru, si); if (child == NULL) { mutex_exit(&proc_lock); *pid = 0; *status = 0; return error; } *pid = child->p_pid; if (child->p_stat == SZOMB) { /* Child is exiting */ *status = P_WAITSTATUS(child); /* proc_free() will release the proc_lock. 
*/ if (options & WNOWAIT) { mutex_exit(&proc_lock); } else { proc_free(child, wru); } } else { /* Don't mark SIGCONT if we are being stopped */ *status = (child->p_xsig == SIGCONT && child->p_stat != SSTOP) ? W_CONTCODE() : W_STOPCODE(child->p_xsig); mutex_exit(&proc_lock); } return 0; } int do_sys_wait(int *pid, int *status, int options, struct rusage *ru) { idtype_t idtype; id_t id; int ret; struct wrusage wru; /* * Translate the special pid values into the (idtype, pid) * pair for wait6. The WAIT_MYPGRP case is handled by * find_stopped_child() on its own. */ if (*pid == WAIT_ANY) { idtype = P_ALL; id = 0; } else if (*pid < 0) { idtype = P_PGID; id = (id_t)-*pid; } else { idtype = P_PID; id = (id_t)*pid; } options |= WEXITED | WTRAPPED; ret = do_sys_waitid(idtype, id, pid, status, options, ru ? &wru : NULL, NULL); if (ru) *ru = wru.wru_self; return ret; } int sys___wait450(struct lwp *l, const struct sys___wait450_args *uap, register_t *retval) { /* { syscallarg(int) pid; syscallarg(int *) status; syscallarg(int) options; syscallarg(struct rusage *) rusage; } */ int error, status, pid = SCARG(uap, pid); struct rusage ru; error = do_sys_wait(&pid, &status, SCARG(uap, options), SCARG(uap, rusage) != NULL ? &ru : NULL); retval[0] = pid; if (pid == 0) { return error; } if (SCARG(uap, status)) { error = copyout(&status, SCARG(uap, status), sizeof(status)); } if (SCARG(uap, rusage) && error == 0) { error = copyout(&ru, SCARG(uap, rusage), sizeof(ru)); } return error; } int sys_wait6(struct lwp *l, const struct sys_wait6_args *uap, register_t *retval) { /* { syscallarg(idtype_t) idtype; syscallarg(id_t) id; syscallarg(int *) status; syscallarg(int) options; syscallarg(struct wrusage *) wru; syscallarg(siginfo_t *) si; } */ struct wrusage wru, *wrup; siginfo_t si, *sip; idtype_t idtype; int pid; id_t id; int error, status; idtype = SCARG(uap, idtype); id = SCARG(uap, id); if (SCARG(uap, wru) != NULL) wrup = &wru; else wrup = NULL; if (SCARG(uap, info) != NULL) sip = &si; else sip = NULL; /* * We expect all callers of wait6() to know about WEXITED and * WTRAPPED. */ error = do_sys_waitid(idtype, id, &pid, &status, SCARG(uap, options), wrup, sip); retval[0] = pid; /* tell userland who it was */ #if 0 /* * should we copyout if there was no process, hence no useful data? * We don't for an old style wait4() (etc) but I believe * FreeBSD does for wait6(), so a tossup... Go with FreeBSD for now. */ if (pid == 0) return error; #endif if (SCARG(uap, status) != NULL && error == 0) error = copyout(&status, SCARG(uap, status), sizeof(status)); if (SCARG(uap, wru) != NULL && error == 0) error = copyout(&wru, SCARG(uap, wru), sizeof(wru)); if (SCARG(uap, info) != NULL && error == 0) error = copyout(&si, SCARG(uap, info), sizeof(si)); return error; } /* * Find a process that matches the provided criteria, and fill siginfo * and resources if found. 
* Returns: * -1: Not found, abort early * 0: Not matched * 1: Matched, there might be more matches * 2: This is the only match */ static int match_process(const struct proc *pp, struct proc **q, idtype_t idtype, id_t id, int options, struct wrusage *wrusage, siginfo_t *siginfo) { struct rusage *rup; struct proc *p = *q; int rv = 1; switch (idtype) { case P_ALL: mutex_enter(p->p_lock); break; case P_PID: if (p->p_pid != (pid_t)id) { p = *q = proc_find_raw((pid_t)id); if (p == NULL || p->p_stat == SIDL || p->p_pptr != pp) { *q = NULL; return -1; } } mutex_enter(p->p_lock); rv++; break; case P_PGID: if (p->p_pgid != (pid_t)id) return 0; mutex_enter(p->p_lock); break; case P_SID: if (p->p_session->s_sid != (pid_t)id) return 0; mutex_enter(p->p_lock); break; case P_UID: mutex_enter(p->p_lock); if (kauth_cred_geteuid(p->p_cred) != (uid_t)id) { mutex_exit(p->p_lock); return 0; } break; case P_GID: mutex_enter(p->p_lock); if (kauth_cred_getegid(p->p_cred) != (gid_t)id) { mutex_exit(p->p_lock); return 0; } break; case P_CID: case P_PSETID: case P_CPUID: /* XXX: Implement me */ default: return 0; } if ((options & WEXITED) == 0 && p->p_stat == SZOMB) { mutex_exit(p->p_lock); return 0; } if (siginfo != NULL) { siginfo->si_errno = 0; /* * SUSv4 requires that the si_signo value is always * SIGCHLD. Obey it despite the rfork(2) interface * allows to request other signal for child exit * notification. */ siginfo->si_signo = SIGCHLD; /* * This is still a rough estimate. We will fix the * cases TRAPPED, STOPPED, and CONTINUED later. */ if (p->p_sflag & PS_COREDUMP) { siginfo->si_code = CLD_DUMPED; siginfo->si_status = p->p_xsig; } else if (p->p_xsig) { siginfo->si_code = CLD_KILLED; siginfo->si_status = p->p_xsig; } else { siginfo->si_code = CLD_EXITED; siginfo->si_status = p->p_xexit; } siginfo->si_pid = p->p_pid; siginfo->si_uid = kauth_cred_geteuid(p->p_cred); siginfo->si_utime = p->p_stats->p_ru.ru_utime.tv_sec; siginfo->si_stime = p->p_stats->p_ru.ru_stime.tv_sec; } /* * There should be no reason to limit resources usage info to * exited processes only. A snapshot about any resources used * by a stopped process may be exactly what is needed. */ if (wrusage != NULL) { rup = &wrusage->wru_self; *rup = p->p_stats->p_ru; calcru(p, &rup->ru_utime, &rup->ru_stime, NULL, NULL); rup = &wrusage->wru_children; *rup = p->p_stats->p_cru; calcru(p, &rup->ru_utime, &rup->ru_stime, NULL, NULL); } mutex_exit(p->p_lock); return rv; } /* * Determine if there are existing processes being debugged * that used to be (and sometime later will be again) children * of a specific parent (while matching wait criteria) */ static bool debugged_child_exists(idtype_t idtype, id_t id, int options, siginfo_t *si, const struct proc *parent) { struct proc *pp; /* * If we are searching for a specific pid, we can optimise a little */ if (idtype == P_PID) { /* * Check the specific process to see if its real parent is us */ pp = proc_find_raw((pid_t)id); if (pp != NULL && pp->p_stat != SIDL && pp->p_opptr == parent) { /* * using P_ALL here avoids match_process() doing the * same work that we just did, but incorrectly for * this scenario. */ if (match_process(parent, &pp, P_ALL, id, options, NULL, si)) return true; } return false; } /* * For the hard cases, just look everywhere to see if some * stolen (reparented) process is really our lost child. * Then check if that process could satisfy the wait conditions. */ /* * XXX inefficient, but hopefully fairly rare. * XXX should really use a list of reparented processes. 
*/ PROCLIST_FOREACH(pp, &allproc) { if (pp->p_stat == SIDL) /* XXX impossible ?? */ continue; if (pp->p_opptr == parent && match_process(parent, &pp, idtype, id, options, NULL, si)) return true; } PROCLIST_FOREACH(pp, &zombproc) { if (pp->p_stat == SIDL) /* XXX impossible ?? */ continue; if (pp->p_opptr == parent && match_process(parent, &pp, idtype, id, options, NULL, si)) return true; } return false; } /* * Scan list of child processes for a child process that has stopped or * exited. Used by sys_wait4 and 'compat' equivalents. * * Must be called with the proc_lock held, and may release while waiting. */ static int find_stopped_child(struct proc *parent, idtype_t idtype, id_t id, int options, struct proc **child_p, struct wrusage *wru, siginfo_t *si) { struct proc *child, *dead; int error; KASSERT(mutex_owned(&proc_lock)); if (options & ~WALLOPTS) { *child_p = NULL; return EINVAL; } if ((options & WSELECTOPTS) == 0) { /* * We will be unable to find any matching processes, * because there are no known events to look for. * Prefer to return error instead of blocking * indefinitely. */ *child_p = NULL; return EINVAL; } if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) { id = (id_t)parent->p_pgid; idtype = P_PGID; } for (;;) { error = ECHILD; dead = NULL; LIST_FOREACH(child, &parent->p_children, p_sibling) { int rv = match_process(parent, &child, idtype, id, options, wru, si); if (rv == -1) break; if (rv == 0) continue; /* * Wait for processes with p_exitsig != SIGCHLD * processes only if WALTSIG is set; wait for * processes with p_exitsig == SIGCHLD only * if WALTSIG is clear. */ if (((options & WALLSIG) == 0) && (options & WALTSIG ? child->p_exitsig == SIGCHLD : P_EXITSIG(child) != SIGCHLD)){ if (rv == 2) { child = NULL; break; } continue; } error = 0; if ((options & WNOZOMBIE) == 0) { if (child->p_stat == SZOMB) break; if (child->p_stat == SDEAD) { /* * We may occasionally arrive here * after receiving a signal, but * immediately before the child * process is zombified. The wait * will be short, so avoid returning * to userspace. */ dead = child; } } if ((options & WCONTINUED) != 0 && child->p_xsig == SIGCONT && (child->p_sflag & PS_CONTINUED)) { if ((options & WNOWAIT) == 0) { child->p_sflag &= ~PS_CONTINUED; child->p_waited = 1; parent->p_nstopchild--; } if (si) { si->si_status = child->p_xsig; si->si_code = CLD_CONTINUED; } break; } if ((options & (WTRAPPED|WSTOPPED)) != 0 && child->p_stat == SSTOP && child->p_waited == 0 && ((child->p_slflag & PSL_TRACED) || options & (WUNTRACED|WSTOPPED))) { if ((options & WNOWAIT) == 0) { child->p_waited = 1; parent->p_nstopchild--; } if (si) { si->si_status = child->p_xsig; si->si_code = (child->p_slflag & PSL_TRACED) ? CLD_TRAPPED : CLD_STOPPED; } break; } if (parent->p_nstopchild == 0 || rv == 2) { child = NULL; break; } } /* * If we found nothing, but we are the bereaved parent * of a stolen child, look and see if that child (or * one of them) meets our search criteria. If so, then * we cannot succeed, but we can hang (wait...), * or if WNOHANG, return 0 instead of ECHILD */ if (child == NULL && error == ECHILD && (parent->p_slflag & PSL_CHTRACED) && debugged_child_exists(idtype, id, options, si, parent)) error = 0; if (child != NULL || error != 0 || ((options & WNOHANG) != 0 && dead == NULL)) { *child_p = child; return error; } /* * Wait for another child process to stop. 
*/ error = cv_wait_sig(&parent->p_waitcv, &proc_lock); if (error != 0) { *child_p = NULL; return error; } } } /* * Free a process after parent has taken all the state info. Must be called * with the proclist lock held, and will release before returning. * * *ru is returned to the caller, and must be freed by the caller. */ static void proc_free(struct proc *p, struct wrusage *wru) { struct proc *parent = p->p_pptr; struct lwp *l; ksiginfo_t ksi; kauth_cred_t cred1, cred2; uid_t uid; KASSERT(mutex_owned(&proc_lock)); KASSERT(p->p_nlwps == 1); KASSERT(p->p_nzlwps == 1); KASSERT(p->p_nrlwps == 0); KASSERT(p->p_stat == SZOMB); /* * If we got the child via ptrace(2) or procfs, and * the parent is different (meaning the process was * attached, rather than run as a child), then we need * to give it back to the old parent, and send the * parent the exit signal. The rest of the cleanup * will be done when the old parent waits on the child. */ if ((p->p_slflag & PSL_TRACED) != 0 && p->p_opptr != parent) { mutex_enter(p->p_lock); p->p_slflag &= ~(PSL_TRACED|PSL_SYSCALL); mutex_exit(p->p_lock); parent = (p->p_opptr == NULL) ? initproc : p->p_opptr; proc_reparent(p, parent); p->p_opptr = NULL; if (p->p_exitsig != 0) { exit_psignal(p, parent, &ksi); kpsignal(parent, &ksi, NULL); } cv_broadcast(&parent->p_waitcv); mutex_exit(&proc_lock); return; } sched_proc_exit(parent, p); /* * Add child times of exiting process onto its own times. * This cannot be done any earlier else it might get done twice. */ l = LIST_FIRST(&p->p_lwps); ruadd(&p->p_stats->p_ru, &l->l_ru); ruadd(&p->p_stats->p_ru, &p->p_stats->p_cru); ruadd(&parent->p_stats->p_cru, &p->p_stats->p_ru); if (wru != NULL) { wru->wru_self = p->p_stats->p_ru; wru->wru_children = p->p_stats->p_cru; } p->p_xsig = 0; p->p_xexit = 0; /* * At this point we are going to start freeing the final resources. * If anyone tries to access the proc structure after here they will * get a shock - bits are missing. Attempt to make it hard! We * don't bother with any further locking past this point. */ p->p_stat = SIDL; /* not even a zombie any more */ LIST_REMOVE(p, p_list); /* off zombproc */ parent->p_nstopchild--; LIST_REMOVE(p, p_sibling); /* * Let pid be reallocated. */ proc_free_pid(p->p_pid); atomic_dec_uint(&nprocs); /* * Unlink process from its process group. * Releases the proc_lock. */ proc_leavepgrp(p); /* * Delay release until after lwp_free. */ cred2 = l->l_cred; /* * Free the last LWP's resources. * * lwp_free ensures the LWP is no longer running on another CPU. */ lwp_free(l, false, true); /* * Now no one except us can reach the process p. */ /* * Decrement the count of procs running with this uid. */ cred1 = p->p_cred; uid = kauth_cred_getuid(cred1); (void)chgproccnt(uid, -1); /* * Release substructures. */ lim_free(p->p_limit); pstatsfree(p->p_stats); kauth_cred_free(cred1); kauth_cred_free(cred2); /* * Release reference to text vnode */ if (p->p_textvp) vrele(p->p_textvp); kmem_strfree(p->p_path); mutex_destroy(&p->p_auxlock); mutex_obj_free(p->p_lock); mutex_destroy(&p->p_stmutex); cv_destroy(&p->p_waitcv); cv_destroy(&p->p_lwpcv); rw_destroy(&p->p_reflock); proc_free_mem(p); } /* * Change the parent of a process for tracing purposes. 
*/ void proc_changeparent(struct proc *t, struct proc *p) { SET(t->p_slflag, PSL_TRACED); t->p_opptr = t->p_pptr; if (t->p_pptr == p) return; struct proc *parent = t->p_pptr; if (parent->p_lock < t->p_lock) { if (!mutex_tryenter(parent->p_lock)) { mutex_exit(t->p_lock); mutex_enter(parent->p_lock); mutex_enter(t->p_lock); } } else if (parent->p_lock > t->p_lock) { mutex_enter(parent->p_lock); } parent->p_slflag |= PSL_CHTRACED; proc_reparent(t, p); if (parent->p_lock != t->p_lock) mutex_exit(parent->p_lock); } /* * make process 'parent' the new parent of process 'child'. * * Must be called with proc_lock held. */ void proc_reparent(struct proc *child, struct proc *parent) { KASSERT(mutex_owned(&proc_lock)); if (child->p_pptr == parent) return; if (child->p_stat == SZOMB || child->p_stat == SDEAD || (child->p_stat == SSTOP && !child->p_waited)) { child->p_pptr->p_nstopchild--; parent->p_nstopchild++; } if (parent == initproc) { child->p_exitsig = SIGCHLD; child->p_ppid = parent->p_pid; } LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); child->p_pptr = parent; }
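/*
 * Illustrative sketch, not part of the file above: the pid encoding that
 * do_sys_wait() earlier in this file translates into wait6()-style
 * (idtype, id) pairs:
 *
 *	pid == WAIT_ANY (-1)	-> P_ALL,  id unused
 *	pid <  -1		-> P_PGID, id = -pid (a process group)
 *	pid >=  0		-> P_PID,  id = pid  (a single process)
 *
 * WAIT_MYPGRP (pid == 0) reaches find_stopped_child() as P_PID/0 and is
 * rewritten there to the caller's own process group.  The helper
 * example_wait_idtype() is a hypothetical restatement of that mapping.
 */
static idtype_t
example_wait_idtype(pid_t pid, id_t *idp)
{

	if (pid == WAIT_ANY) {
		*idp = 0;
		return P_ALL;
	}
	if (pid < 0) {
		*idp = (id_t)-pid;
		return P_PGID;
	}
	*idp = (id_t)pid;
	return P_PID;
}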
/* $NetBSD: ufs_bmap.c,v 1.54 2022/11/17 06:40:40 chs Exp $ */ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ufs_bmap.c 8.8 (Berkeley) 8/11/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ufs_bmap.c,v 1.54 2022/11/17 06:40:40 chs Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/resourcevar.h> #include <sys/trace.h> #include <miscfs/specfs/specdev.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_bswap.h> static bool ufs_issequential(const struct ufsmount *ump, daddr_t daddr0, daddr_t daddr1) { /* for ufs, blocks in a hole is not 'contiguous'. */ if (daddr0 == 0) return false; return (daddr0 + ump->um_seqinc == daddr1); } /* * Bmap converts the logical block number of a file to its physical block * number on the disk. The conversion is done by using the logical block * number to index into the array of block pointers described by the dinode. */ int ufs_bmap(void *v) { struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; } */ *ap = v; int error; /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. */ if (ap->a_vpp != NULL) *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; if (ap->a_bnp == NULL) return (0); error = ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, ap->a_runp, ufs_issequential); return error; } /* * Indirect blocks are now on the vnode for the file. They are given negative * logical block numbers. Indirect blocks are addressed by the negative * address of the first data block to which they point. Double indirect blocks * are addressed by one less than the address of the first indirect block to * which they point. Triple indirect blocks are addressed by one less than * the address of the first double indirect block to which they point. * * ufs_bmaparray does the bmap conversion, and if requested returns the * array of logical blocks which must be traversed to get to a block. * Each entry contains the offset into that block that gets you to the * next block and the disk address of the block (if it is assigned). */ int ufs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap, int *nump, int *runp, ufs_issequential_callback_t is_sequential) { struct inode *ip; struct buf *bp, *cbp; struct ufsmount *ump; struct mount *mp; struct indir a[UFS_NIADDR + 1], *xap; daddr_t daddr; daddr_t metalbn; int error, maxrun = 0, num; ip = VTOI(vp); mp = vp->v_mount; ump = ip->i_ump; KASSERTMSG(((ap == NULL) == (nump == NULL)), "ufs_bmaparray: invalid arguments: ap = %p, nump = %p", ap, nump); if (runp) { /* * XXX * If MAXBSIZE is the largest transfer the disks can handle, * we probably want maxrun to be 1 block less so that we * don't create a block larger than the device can handle. */ *runp = 0; maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1; } if (bn >= 0 && bn < UFS_NDADDR) { if (nump != NULL) *nump = 0; if (ump->um_fstype == UFS1) daddr = ufs_rw32(ip->i_ffs1_db[bn], UFS_MPNEEDSWAP(ump)); else daddr = ufs_rw64(ip->i_ffs2_db[bn], UFS_MPNEEDSWAP(ump)); *bnp = blkptrtodb(ump, daddr); /* * Since this is FFS independent code, we are out of * scope for the definitions of BLK_NOCOPY and * BLK_SNAP, but we do know that they will fall in * the range 1..um_seqinc, so we use that test and * return a request for a zeroed out buffer if attempts * are made to read a BLK_NOCOPY or BLK_SNAP block. 
*/ if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT && daddr > 0 && daddr < ump->um_seqinc) { *bnp = -1; } else if (*bnp == 0) { if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT) { *bnp = blkptrtodb(ump, bn * ump->um_seqinc); } else { *bnp = -1; } } else if (runp) { if (ump->um_fstype == UFS1) { for (++bn; bn < UFS_NDADDR && *runp < maxrun && is_sequential(ump, ufs_rw32(ip->i_ffs1_db[bn - 1], UFS_MPNEEDSWAP(ump)), ufs_rw32(ip->i_ffs1_db[bn], UFS_MPNEEDSWAP(ump))); ++bn, ++*runp); } else { for (++bn; bn < UFS_NDADDR && *runp < maxrun && is_sequential(ump, ufs_rw64(ip->i_ffs2_db[bn - 1], UFS_MPNEEDSWAP(ump)), ufs_rw64(ip->i_ffs2_db[bn], UFS_MPNEEDSWAP(ump))); ++bn, ++*runp); } } return (0); } else if (bn < 0 && bn >= -UFS_NXADDR) { KASSERT(ump->um_fstype == UFS2 && (ump->um_flags & UFS_EA) != 0); daddr = ufs_rw64(ip->i_ffs2_extb[-1 - bn], UFS_MPNEEDSWAP(ump)); *bnp = blkptrtodb(ump, daddr); if (*bnp == 0) *bnp = -1; return 0; } xap = ap == NULL ? a : ap; if (!nump) nump = &num; if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0) return (error); num = *nump; /* Get disk address out of indirect block array */ if (ump->um_fstype == UFS1) daddr = ufs_rw32(ip->i_ffs1_ib[xap->in_off], UFS_MPNEEDSWAP(ump)); else daddr = ufs_rw64(ip->i_ffs2_ib[xap->in_off], UFS_MPNEEDSWAP(ump)); for (bp = NULL, ++xap; --num; ++xap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = xap->in_lbn; if (metalbn == bn) break; if (daddr == 0) { mutex_enter(&bufcache_lock); cbp = incore(vp, metalbn); mutex_exit(&bufcache_lock); if (cbp == NULL) break; } /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it. */ if (bp) brelse(bp, 0); xap->in_exists = 1; bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); if (bp == NULL) { /* * getblk() above returns NULL only iff we are * pagedaemon. See the implementation of getblk * for detail. 
*/ return (ENOMEM); } if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { trace(TR_BREADHIT, pack(vp, size), metalbn); } else { KASSERTMSG((daddr != 0), "ufs_bmaparray: indirect block not in cache"); trace(TR_BREADMISS, pack(vp, size), metalbn); bp->b_blkno = blkptrtodb(ump, daddr); bp->b_flags |= B_READ; BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); VOP_STRATEGY(vp, bp); curlwp->l_ru.ru_inblock++; /* XXX */ if ((error = biowait(bp)) != 0) { brelse(bp, 0); return (error); } } if (ump->um_fstype == UFS1) { daddr = ufs_rw32(((u_int32_t *)bp->b_data)[xap->in_off], UFS_MPNEEDSWAP(ump)); if (num == 1 && daddr && runp) { for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ufs_rw32(((int32_t *)bp->b_data)[bn-1], UFS_MPNEEDSWAP(ump)), ufs_rw32(((int32_t *)bp->b_data)[bn], UFS_MPNEEDSWAP(ump))); ++bn, ++*runp); } } else { daddr = ufs_rw64(((u_int64_t *)bp->b_data)[xap->in_off], UFS_MPNEEDSWAP(ump)); if (num == 1 && daddr && runp) { for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ufs_rw64(((int64_t *)bp->b_data)[bn-1], UFS_MPNEEDSWAP(ump)), ufs_rw64(((int64_t *)bp->b_data)[bn], UFS_MPNEEDSWAP(ump))); ++bn, ++*runp); } } } if (bp) brelse(bp, 0); /* * Since this is FFS independent code, we are out of scope for the * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they * will fall in the range 1..um_seqinc, so we use that test and * return a request for a zeroed out buffer if attempts are made * to read a BLK_NOCOPY or BLK_SNAP block. */ if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT && daddr > 0 && daddr < ump->um_seqinc) { *bnp = -1; return (0); } *bnp = blkptrtodb(ump, daddr); if (*bnp == 0) { if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT) { *bnp = blkptrtodb(ump, bn * ump->um_seqinc); } else { *bnp = -1; } } return (0); } /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. * Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ffs1_ib and * once with the offset into the page itself. */ int ufs_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump) { daddr_t metalbn, realbn; struct ufsmount *ump; int64_t blockcnt; int lbc; int i, numlevels, off; ump = VFSTOUFS(vp->v_mount); if (nump) *nump = 0; numlevels = 0; realbn = bn; if (bn < 0) bn = -bn; KASSERT(bn >= UFS_NDADDR); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible * at the given level of indirection, and UFS_NIADDR - i is the number * of levels of indirection needed to locate the requested block. */ bn -= UFS_NDADDR; for (lbc = 0, i = UFS_NIADDR;; i--, bn -= blockcnt) { if (i == 0) return (EFBIG); lbc += ump->um_lognindir; blockcnt = (int64_t)1 << lbc; if (bn < blockcnt) break; } /* Calculate the address of the first meta-block. */ metalbn = -((realbn >= 0 ? realbn : -realbn) - bn + UFS_NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. 
*/ ap->in_lbn = metalbn; ap->in_off = off = UFS_NIADDR - i; ap->in_exists = 0; ap++; for (++numlevels; i <= UFS_NIADDR; i++) { /* If searching for a meta-data block, quit when found. */ if (metalbn == realbn) break; lbc -= ump->um_lognindir; off = (bn >> lbc) & (MNINDIR(ump) - 1); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; ap->in_exists = 0; ++ap; metalbn -= -1 + ((int64_t)off << lbc); } if (nump) *nump = numlevels; return (0); }
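/*
 * Illustrative sketch (not part of ufs_bmap.c): a minimal userland
 * rendering of the level-of-indirection arithmetic that ufs_getlbns()
 * performs above.  NDADDR_DEMO and NINDIR_DEMO are hypothetical
 * stand-ins for UFS_NDADDR and MNINDIR(ump); compile and run it
 * standalone to see which logical block numbers need single, double
 * or triple indirect blocks.
 */
#include <stdint.h>
#include <stdio.h>

#define NDADDR_DEMO	12	/* direct block pointers in the inode */
#define NINDIR_DEMO	2048	/* block pointers per indirect block */

/*
 * Return 0 for a direct block, 1..3 for single/double/triple
 * indirection, or -1 when the block is out of range (the kernel
 * version returns EFBIG in that case).
 */
static int
indirection_levels(int64_t bn)
{
	int64_t blockcnt = 1;
	int level;

	if (bn < NDADDR_DEMO)
		return 0;
	bn -= NDADDR_DEMO;
	for (level = 1; level <= 3; level++) {
		blockcnt *= NINDIR_DEMO;
		if (bn < blockcnt)
			return level;
		bn -= blockcnt;		/* skip the blocks this level maps */
	}
	return -1;
}

int
main(void)
{
	const int64_t samples[] = { 0, 11, 12, 2059, 2060, 4196364 };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("lbn %lld -> %d levels of indirection\n",
		    (long long)samples[i], indirection_levels(samples[i]));
	return 0;
}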
/* $NetBSD: kern_kthread.c,v 1.49 2023/09/23 14:40:42 ad Exp $ */ /*- * Copyright (c) 1998, 1999, 2007, 2009, 2019, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_kthread.c,v 1.49 2023/09/23 14:40:42 ad Exp $"); #include <sys/param.h> #include <sys/cpu.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kthread.h> #include <sys/mutex.h> #include <sys/sched.h> #include <sys/kmem.h> #include <sys/msan.h> #include <uvm/uvm_extern.h> static kmutex_t kthread_lock; static kcondvar_t kthread_cv; void kthread_sysinit(void) { mutex_init(&kthread_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&kthread_cv, "kthrwait"); } /* * kthread_create: create a kernel thread, that is, system-only LWP. */ int kthread_create(pri_t pri, int flag, struct cpu_info *ci, void (*func)(void *), void *arg, lwp_t **lp, const char *fmt, ...)
{ lwp_t *l; vaddr_t uaddr; int error, lc; va_list ap; KASSERT((flag & KTHREAD_INTR) == 0 || (flag & KTHREAD_MPSAFE) != 0); uaddr = uvm_uarea_system_alloc( (flag & (KTHREAD_INTR|KTHREAD_IDLE)) == KTHREAD_IDLE ? ci : NULL); if (uaddr == 0) { return ENOMEM; } kmsan_orig((void *)uaddr, USPACE, KMSAN_TYPE_POOL, __RET_ADDR); if ((flag & KTHREAD_TS) != 0) { lc = SCHED_OTHER; } else { lc = SCHED_RR; } error = lwp_create(&lwp0, &proc0, uaddr, LWP_DETACHED, NULL, 0, func, arg, &l, lc, &lwp0.l_sigmask, &lwp0.l_sigstk); if (error) { uvm_uarea_system_free(uaddr); return error; } if (fmt != NULL) { l->l_name = kmem_alloc(MAXCOMLEN, KM_SLEEP); va_start(ap, fmt); vsnprintf(l->l_name, MAXCOMLEN, fmt, ap); va_end(ap); } /* * Set parameters. */ if (pri == PRI_NONE) { if ((flag & KTHREAD_TS) != 0) { /* Maximum user priority level. */ pri = MAXPRI_USER; } else { /* Minimum kernel priority level. */ pri = PRI_KTHREAD; } } mutex_enter(proc0.p_lock); lwp_lock(l); lwp_changepri(l, pri); if (ci != NULL) { if (ci != l->l_cpu) { lwp_unlock_to(l, ci->ci_schedstate.spc_lwplock); lwp_lock(l); l->l_cpu = ci; } l->l_pflag |= LP_BOUND; } if ((flag & KTHREAD_MUSTJOIN) != 0) { KASSERT(lp != NULL); l->l_pflag |= LP_MUSTJOIN; } if ((flag & KTHREAD_INTR) != 0) { l->l_pflag |= LP_INTR; } if ((flag & KTHREAD_MPSAFE) == 0) { l->l_pflag &= ~LP_MPSAFE; } /* * Set the new LWP running, unless the caller has requested * otherwise. */ KASSERT(l->l_stat == LSIDL); if ((flag & KTHREAD_IDLE) == 0) { setrunnable(l); /* LWP now unlocked */ } else { lwp_unlock(l); } mutex_exit(proc0.p_lock); /* All done! */ if (lp != NULL) { *lp = l; } return 0; } /* * Cause a kernel thread to exit. Assumes the exiting thread is the * current context. */ void kthread_exit(int ecode) { const char *name; lwp_t *l = curlwp; /* If the kernel lock is held, we need to drop it now. */ if ((l->l_pflag & LP_MPSAFE) == 0) { KERNEL_UNLOCK_LAST(l); } /* We can't do much with the exit code, so just report it. */ if (ecode != 0) { if ((name = l->l_name) == NULL) name = "unnamed"; printf("WARNING: kthread `%s' (%d) exits with status %d\n", name, l->l_lid, ecode); } /* Barrier for joining. */ if (l->l_pflag & LP_MUSTJOIN) { bool *exitedp; mutex_enter(&kthread_lock); while ((exitedp = l->l_private) == NULL) { cv_wait(&kthread_cv, &kthread_lock); } KASSERT(!*exitedp); *exitedp = true; cv_broadcast(&kthread_cv); mutex_exit(&kthread_lock); } /* And exit.. */ lwp_exit(l); panic("kthread_exit"); } /* * Wait for a kthread to exit, as pthread_join(). */ int kthread_join(lwp_t *l) { bool exited = false; KASSERT((l->l_flag & LW_SYSTEM) != 0); KASSERT((l->l_pflag & LP_MUSTJOIN) != 0); /* * - Ask the kthread to write to `exited'. * - After this, touching l is forbidden -- it may be freed. * - Wait until the kthread has written to `exited'. */ mutex_enter(&kthread_lock); KASSERT(l->l_private == NULL); l->l_private = &exited; cv_broadcast(&kthread_cv); while (!exited) { cv_wait(&kthread_cv, &kthread_lock); } mutex_exit(&kthread_lock); return 0; } /* * kthread_fpu_enter() * * Allow the current lwp, which must be a kthread, to use the FPU. * Return a cookie that must be passed to kthread_fpu_exit when * done. Must be used only in thread context. Recursive -- you * can call kthread_fpu_enter several times in a row as long as * you pass the cookies in reverse order to kthread_fpu_exit. 
*/ int kthread_fpu_enter(void) { struct lwp *l = curlwp; int s; KASSERTMSG(!cpu_intr_p(), "%s is not allowed in interrupt context", __func__); KASSERTMSG(!cpu_softintr_p(), "%s is not allowed in interrupt context", __func__); /* * Remember whether this thread already had FPU access, and * mark this thread as having FPU access. */ lwp_lock(l); KASSERTMSG(l->l_flag & LW_SYSTEM, "%s is allowed only in kthreads", __func__); s = l->l_flag & LW_SYSTEM_FPU; l->l_flag |= LW_SYSTEM_FPU; lwp_unlock(l); /* Take MD steps to enable the FPU if necessary. */ if (s == 0) kthread_fpu_enter_md(); return s; } /* * kthread_fpu_exit(s) * * Restore the current lwp's FPU access to what it was before the * matching call to kthread_fpu_enter() that returned s. Must be * used only in thread context. */ void kthread_fpu_exit(int s) { struct lwp *l = curlwp; KASSERT(s == (s & LW_SYSTEM_FPU)); KASSERTMSG(!cpu_intr_p(), "%s is not allowed in interrupt context", __func__); KASSERTMSG(!cpu_softintr_p(), "%s is not allowed in interrupt context", __func__); lwp_lock(l); KASSERTMSG(l->l_flag & LW_SYSTEM, "%s is allowed only in kthreads", __func__); KASSERT(l->l_flag & LW_SYSTEM_FPU); l->l_flag ^= s ^ LW_SYSTEM_FPU; lwp_unlock(l); /* Take MD steps to zero and disable the FPU if necessary. */ if (s == 0) kthread_fpu_exit_md(); }
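/*
 * Illustrative sketch (not part of kern_kthread.c): how other kernel
 * code might use the API above to run a worker and wait for it.  The
 * names example_worker/example_run_worker are hypothetical; the
 * kthread_* calls follow the signatures defined in this file and
 * <sys/kthread.h>.
 */
#include <sys/kthread.h>

static void
example_worker(void *arg)
{

	/* ... do the work ... */
	kthread_exit(0);	/* terminates the lwp; never returns */
}

static int
example_run_worker(void *arg)
{
	lwp_t *l;
	int error;

	/*
	 * KTHREAD_MUSTJOIN makes the worker rendezvous in kthread_exit()
	 * until we call kthread_join(), so `l' stays valid here.
	 */
	error = kthread_create(PRI_NONE, KTHREAD_MPSAFE | KTHREAD_MUSTJOIN,
	    NULL, example_worker, arg, &l, "exampleworker");
	if (error)
		return error;

	/* ... tell the worker to finish, then reap it ... */
	return kthread_join(l);
}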
/* $NetBSD: dead_vnops.c,v 1.67 2022/10/26 23:39:43 riastradh Exp $ */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)dead_vnops.c 8.2 (Berkeley) 11/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: dead_vnops.c,v 1.67 2022/10/26 23:39:43 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/vnode.h> #include <sys/errno.h> #include <sys/namei.h> #include <sys/buf.h> #include <sys/proc.h> #include <miscfs/deadfs/deadfs.h> #include <miscfs/genfs/genfs.h> /* * Prototypes for dead operations on vnodes.
*/ int dead_lookup(void *); int dead_open(void *); int dead_read(void *); int dead_write(void *); int dead_ioctl(void *); int dead_poll(void *); int dead_remove(void *); int dead_link(void *); int dead_rename(void *); int dead_rmdir(void *); int dead_inactive(void *); int dead_bmap(void *); int dead_strategy(void *); int dead_print(void *); int dead_getpages(void *); int dead_putpages(void *); int dead_default_error(void *); int (**dead_vnodeop_p)(void *); static const struct vnodeopv_entry_desc dead_vnodeop_entries[] = { { &vop_default_desc, dead_default_error }, { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ { &vop_lookup_desc, dead_lookup }, /* lookup */ { &vop_open_desc, dead_open }, /* open */ { &vop_close_desc, genfs_nullop }, /* close */ { &vop_read_desc, dead_read }, /* read */ { &vop_write_desc, dead_write }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */ { &vop_fcntl_desc, genfs_nullop }, /* fcntl */ { &vop_ioctl_desc, dead_ioctl }, /* ioctl */ { &vop_poll_desc, dead_poll }, /* poll */ { &vop_remove_desc, dead_remove }, /* remove */ { &vop_link_desc, dead_link }, /* link */ { &vop_rename_desc, dead_rename }, /* rename */ { &vop_rmdir_desc, dead_rmdir }, /* rmdir */ { &vop_fsync_desc, genfs_nullop }, /* fsync */ { &vop_seek_desc, genfs_nullop }, /* seek */ { &vop_inactive_desc, dead_inactive }, /* inactive */ { &vop_reclaim_desc, genfs_nullop }, /* reclaim */ { &vop_lock_desc, genfs_deadlock }, /* lock */ { &vop_unlock_desc, genfs_deadunlock }, /* unlock */ { &vop_bmap_desc, dead_bmap }, /* bmap */ { &vop_strategy_desc, dead_strategy }, /* strategy */ { &vop_print_desc, dead_print }, /* print */ { &vop_islocked_desc, genfs_deadislocked }, /* islocked */ { &vop_revoke_desc, genfs_nullop }, /* revoke */ { &vop_getpages_desc, dead_getpages }, /* getpages */ { &vop_putpages_desc, dead_putpages }, /* putpages */ { NULL, NULL } }; const struct vnodeopv_desc dead_vnodeop_opv_desc = { &dead_vnodeop_p, dead_vnodeop_entries }; /* ARGSUSED */ int dead_default_error(void *v) { return EBADF; } int dead_bmap(void *v) { struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; } */ *ap = v; (void)ap; return (EIO); } int dead_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap = v; *(ap->a_vpp) = NULL; return ENOENT; } int dead_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; (void)ap; return (ENXIO); } int dead_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; /* * Return EOF for tty devices, EIO for others */ if ((ap->a_vp->v_vflag & VV_ISTTY) == 0) return (EIO); return (0); } int dead_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; (void)ap; return (EIO); } int dead_ioctl(void *v) { struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; void *a_data; int a_fflag; kauth_cred_t a_cred; struct lwp *a_l; } */ *ap = v; (void)ap; return (EBADF); } int dead_poll(void *v) { struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct lwp *a_l; } */ *ap = v; /* * Let the user find out that the descriptor is gone. 
*/ return (ap->a_events); } int dead_remove(void *v) { struct vop_remove_v3_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; nlink_t ctx_vp_new_nlink; } */ *ap = v; vput(ap->a_vp); return EIO; } int dead_link(void *v) { struct vop_link_v2_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; (void)ap; return EIO; } int dead_rename(void *v) { struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap = v; vrele(ap->a_fdvp); vrele(ap->a_fvp); if (ap->a_tvp != NULL && ap->a_tvp != ap->a_tdvp) VOP_UNLOCK(ap->a_tvp); vput(ap->a_tdvp); if (ap->a_tvp != NULL) vrele(ap->a_tvp); return EIO; } int dead_rmdir(void *v) { struct vop_rmdir_v2_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; vput(ap->a_vp); return EIO; } int dead_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; bool *a_recycle; } */ *ap = v; *ap->a_recycle = false; return 0; } int dead_strategy(void *v) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap = v; struct buf *bp; bp = ap->a_bp; bp->b_error = EIO; bp->b_resid = bp->b_bcount; biodone(ap->a_bp); return (EIO); } /* ARGSUSED */ int dead_print(void *v) { printf("tag VT_NON, dead vnode\n"); return 0; } int dead_getpages(void *v) { struct vop_getpages_args /* { struct vnode *a_vp; voff_t a_offset; struct vm_page **a_m; int *a_count; int a_centeridx; vm_prot_t a_access_type; int a_advice; int a_flags; } */ *ap = v; if ((ap->a_flags & PGO_LOCKED) == 0) rw_exit(ap->a_vp->v_uobj.vmobjlock); return (EFAULT); } int dead_putpages(void *v) { struct vop_putpages_args /* { struct vnode *a_vp; voff_t a_offlo; voff_t a_offhi; int a_flags; } */ *ap = v; rw_exit(ap->a_vp->v_uobj.vmobjlock); return (EFAULT); }
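/*
 * Illustrative sketch (not part of dead_vnops.c): a standalone
 * userland model of the dispatch idea behind dead_vnodeop_entries --
 * a table of (operation, handler) pairs in which a "default" entry
 * answers every operation that is not explicitly listed, much as
 * dead_default_error returns EBADF for unlisted vnode operations.
 * All names and return values here are hypothetical.
 */
#include <stdio.h>
#include <string.h>

typedef int (*op_fn)(void);

static int demo_default(void) { return -1; }	/* cf. dead_default_error */
static int demo_read(void) { return 0; }	/* cf. dead_read's quiet EOF */

struct op_entry {
	const char *name;	/* stands in for the vnodeop_desc pointer */
	op_fn fn;
};

static const struct op_entry demo_entries[] = {
	{ "default", demo_default },
	{ "read", demo_read },
	{ NULL, NULL }
};

static op_fn
demo_lookup(const char *name)
{

	for (const struct op_entry *e = &demo_entries[1]; e->name != NULL; e++)
		if (strcmp(e->name, name) == 0)
			return e->fn;
	return demo_entries[0].fn;	/* fall back to the default entry */
}

int
main(void)
{

	printf("read  -> %d\n", demo_lookup("read")());
	printf("ioctl -> %d\n", demo_lookup("ioctl")());
	return 0;
}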
/* $NetBSD: umap_vnops.c,v 1.62 2021/10/20 03:08:18 thorpej Exp $ */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)umap_vnops.c 8.6 (Berkeley) 5/22/95 */ /* * Umap Layer */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: umap_vnops.c,v 1.62 2021/10/20 03:08:18 thorpej Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/buf.h> #include <sys/kauth.h> #include <miscfs/umapfs/umap.h> #include <miscfs/genfs/genfs.h> #include <miscfs/genfs/layer_extern.h> /* * Note: If the LAYERFS_MBYPASSDEBUG flag is set, it is possible * that the debug printing will bomb out, because kauth routines * do not handle NOCRED or FSCRED like other credentials and end * up dereferencing an inappropriate pointer. * * That should be fixed in kauth rather than here. */ int umap_lookup(void *); int umap_getattr(void *); int umap_print(void *); int umap_rename(void *); /* * Global vfs data structures */ /* * XXX - strategy, bwrite are hand coded currently. They should * go away with a merged buffer/block cache. * */ int (**umap_vnodeop_p)(void *); const struct vnodeopv_entry_desc umap_vnodeop_entries[] = { { &vop_default_desc, umap_bypass }, { &vop_lookup_desc, umap_lookup }, { &vop_getattr_desc, umap_getattr }, { &vop_print_desc, umap_print }, { &vop_rename_desc, umap_rename }, { &vop_fsync_desc, layer_fsync }, { &vop_inactive_desc, layer_inactive }, { &vop_reclaim_desc, layer_reclaim }, { &vop_open_desc, layer_open }, { &vop_close_desc, layer_close }, { &vop_setattr_desc, layer_setattr }, { &vop_access_desc, layer_access }, { &vop_accessx_desc, genfs_accessx }, { &vop_remove_desc, layer_remove }, { &vop_revoke_desc, layer_revoke }, { &vop_rmdir_desc, layer_rmdir }, { &vop_bmap_desc, layer_bmap }, { &vop_getpages_desc, layer_getpages }, { &vop_putpages_desc, layer_putpages }, { NULL, NULL } }; const struct vnodeopv_desc umapfs_vnodeop_opv_desc = { &umap_vnodeop_p, umap_vnodeop_entries }; /* * This is the 08-June-1999 bypass routine. * See layer_vnops.c:layer_bypass for more details. */ int umap_bypass(void *v) { struct vop_generic_args /* { struct vnodeop_desc *a_desc; <other random data follows, presumably> } */ *ap = v; int (**our_vnodeop_p)(void *); kauth_cred_t *credpp = NULL, credp = 0; kauth_cred_t savecredp = 0, savecompcredp = 0; kauth_cred_t compcredp = 0; struct vnode **this_vp_p; int error; struct vnode *old_vps[VDESC_MAX_VPS], *vp0; struct vnode **vps_p[VDESC_MAX_VPS]; struct vnode ***vppp; struct vnodeop_desc *descp = ap->a_desc; int reles, i, flags; struct componentname **compnamepp = 0; #ifdef DIAGNOSTIC /* * We require at least one vp. */ if (descp->vdesc_vp_offsets == NULL || descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET) panic("%s: no vp's in map.\n", __func__); #endif vps_p[0] = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap); vp0 = *vps_p[0]; flags = MOUNTTOUMAPMOUNT(vp0->v_mount)->umapm_flags; our_vnodeop_p = vp0->v_op; if (flags & LAYERFS_MBYPASSDEBUG) printf("%s: %s\n", __func__, descp->vdesc_name); /* * Map the vnodes going in. 
* Later, we'll invoke the operation based on * the first mapped vnode's operation vector. */ reles = descp->vdesc_flags; for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) break; /* bail out at end of list */ vps_p[i] = this_vp_p = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[i], ap); /* * We're not guaranteed that any but the first vnode * are of our type. Check for and don't map any * that aren't. (We must always map first vp or vclean fails.) */ if (i && (*this_vp_p == NULL || (*this_vp_p)->v_op != our_vnodeop_p)) { old_vps[i] = NULL; } else { old_vps[i] = *this_vp_p; *(vps_p[i]) = UMAPVPTOLOWERVP(*this_vp_p); /* * XXX - Several operations have the side effect * of vrele'ing their vp's. We must account for * that. (This should go away in the future.) */ if (reles & VDESC_VP0_WILLRELE) vref(*this_vp_p); } } /* * Fix the credentials. (That's the purpose of this layer.) */ if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) { credpp = VOPARG_OFFSETTO(kauth_cred_t*, descp->vdesc_cred_offset, ap); /* Save old values */ savecredp = *credpp; if (savecredp != NOCRED && savecredp != FSCRED) *credpp = kauth_cred_dup(savecredp); credp = *credpp; if ((flags & LAYERFS_MBYPASSDEBUG) && kauth_cred_geteuid(credp) != 0) printf("umap_bypass: user was %d, group %d\n", kauth_cred_geteuid(credp), kauth_cred_getegid(credp)); /* Map all ids in the credential structure. */ umap_mapids(vp0->v_mount, credp); if ((flags & LAYERFS_MBYPASSDEBUG) && kauth_cred_geteuid(credp) != 0) printf("umap_bypass: user now %d, group %d\n", kauth_cred_geteuid(credp), kauth_cred_getegid(credp)); } /* BSD often keeps a credential in the componentname structure * for speed. If there is one, it better get mapped, too. */ if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) { compnamepp = VOPARG_OFFSETTO(struct componentname**, descp->vdesc_componentname_offset, ap); savecompcredp = (*compnamepp)->cn_cred; if (savecompcredp != NOCRED && savecompcredp != FSCRED) (*compnamepp)->cn_cred = kauth_cred_dup(savecompcredp); compcredp = (*compnamepp)->cn_cred; if ((flags & LAYERFS_MBYPASSDEBUG) && kauth_cred_geteuid(compcredp) != 0) printf("umap_bypass: component credit user was %d, group %d\n", kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp)); /* Map all ids in the credential structure. */ umap_mapids(vp0->v_mount, compcredp); if ((flags & LAYERFS_MBYPASSDEBUG) && kauth_cred_geteuid(compcredp) != 0) printf("umap_bypass: component credit user now %d, group %d\n", kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp)); } /* * Call the operation on the lower layer * with the modified argument structure. */ error = VCALL(*vps_p[0], descp->vdesc_offset, ap); /* * Maintain the illusion of call-by-value * by restoring vnodes in the argument structure * to their original value. */ reles = descp->vdesc_flags; for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) break; /* bail out at end of list */ if (old_vps[i]) { *(vps_p[i]) = old_vps[i]; if (reles & VDESC_VP0_WILLRELE) vrele(*(vps_p[i])); } } /* * Map the possible out-going vpp * (Assumes that the lower layer always returns * a VREF'ed vpp unless it gets an error.) */ if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !error) { vppp = VOPARG_OFFSETTO(struct vnode***, descp->vdesc_vpp_offset, ap); /* * Only vop_lookup, vop_create, vop_makedir, vop_mknod * and vop_symlink return vpp's. vop_lookup doesn't call bypass * as a lookup on "." would generate a locking error. 
* So all the calls which get us here have a unlocked vpp. :-) */ error = layer_node_create(old_vps[0]->v_mount, **vppp, *vppp); if (error) { vrele(**vppp); **vppp = NULL; } } /* * Free duplicate cred structure and restore old one. */ if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) { if ((flags & LAYERFS_MBYPASSDEBUG) && credp && kauth_cred_geteuid(credp) != 0) printf("umap_bypass: returning-user was %d\n", kauth_cred_geteuid(credp)); if (savecredp != NOCRED && savecredp != FSCRED && credpp) { kauth_cred_free(credp); *credpp = savecredp; if ((flags & LAYERFS_MBYPASSDEBUG) && credpp && kauth_cred_geteuid(*credpp) != 0) printf("umap_bypass: returning-user now %d\n\n", kauth_cred_geteuid(savecredp)); } } if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) { if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp && kauth_cred_geteuid(compcredp) != 0) printf("umap_bypass: returning-component-user was %d\n", kauth_cred_geteuid(compcredp)); if (savecompcredp != NOCRED && savecompcredp != FSCRED) { kauth_cred_free(compcredp); (*compnamepp)->cn_cred = savecompcredp; if ((flags & LAYERFS_MBYPASSDEBUG) && savecompcredp && kauth_cred_geteuid(savecompcredp) != 0) printf("umap_bypass: returning-component-user now %d\n", kauth_cred_geteuid(savecompcredp)); } } return (error); } /* * This is based on the 08-June-1999 bypass routine. * See layer_vnops.c:layer_bypass for more details. */ int umap_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnodeop_desc *a_desc; struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap = v; struct componentname *cnp = ap->a_cnp; kauth_cred_t savecompcredp = NULL; kauth_cred_t compcredp = NULL; struct vnode *dvp, *vp, *ldvp; struct mount *mp; int error; int flags, cnf = cnp->cn_flags; dvp = ap->a_dvp; mp = dvp->v_mount; if ((cnf & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); flags = MOUNTTOUMAPMOUNT(mp)->umapm_flags; ldvp = UMAPVPTOLOWERVP(dvp); if (flags & LAYERFS_MBYPASSDEBUG) printf("umap_lookup\n"); /* * Fix the credentials. (That's the purpose of this layer.) * * BSD often keeps a credential in the componentname structure * for speed. If there is one, it better get mapped, too. */ if ((savecompcredp = cnp->cn_cred)) { compcredp = kauth_cred_dup(savecompcredp); cnp->cn_cred = compcredp; if ((flags & LAYERFS_MBYPASSDEBUG) && kauth_cred_geteuid(compcredp) != 0) printf("umap_lookup: component credit user was %d, group %d\n", kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp)); /* Map all ids in the credential structure. */ umap_mapids(mp, compcredp); } if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp && kauth_cred_geteuid(compcredp) != 0) printf("umap_lookup: component credit user now %d, group %d\n", kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp)); ap->a_dvp = ldvp; error = VCALL(ldvp, ap->a_desc->vdesc_offset, ap); vp = *ap->a_vpp; *ap->a_vpp = NULL; if (error == EJUSTRETURN && (cnf & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME)) error = EROFS; /* Do locking fixup as appropriate. See layer_lookup() for info */ if (ldvp == vp) { *ap->a_vpp = dvp; vref(dvp); vrele(vp); } else if (vp != NULL) { error = layer_node_create(mp, vp, ap->a_vpp); if (error) { vrele(vp); } } /* * Free duplicate cred structure and restore old one. 
*/ if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp && kauth_cred_geteuid(compcredp) != 0) printf("umap_lookup: returning-component-user was %d\n", kauth_cred_geteuid(compcredp)); if (savecompcredp != NOCRED && savecompcredp != FSCRED) { if (compcredp) kauth_cred_free(compcredp); cnp->cn_cred = savecompcredp; if ((flags & LAYERFS_MBYPASSDEBUG) && savecompcredp && kauth_cred_geteuid(savecompcredp) != 0) printf("umap_lookup: returning-component-user now %d\n", kauth_cred_geteuid(savecompcredp)); } return (error); } /* * We handle getattr to change the fsid. */ int umap_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; struct lwp *a_l; } */ *ap = v; uid_t uid; gid_t gid; int error, tmpid, nentries, gnentries, flags; u_long (*mapdata)[2]; u_long (*gmapdata)[2]; struct vnode **vp1p; const struct vnodeop_desc *descp = ap->a_desc; if ((error = umap_bypass(ap)) != 0) return (error); /* Requires that arguments be restored. */ ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0]; flags = MOUNTTOUMAPMOUNT(ap->a_vp->v_mount)->umapm_flags; /* * Umap needs to map the uid and gid returned by a stat * into the proper values for this site. This involves * finding the returned uid in the mapping information, * translating it into the uid on the other end, * and filling in the proper field in the vattr * structure pointed to by ap->a_vap. The group * is easier, since currently all groups will be * translate to the NULLGROUP. */ /* Find entry in map */ uid = ap->a_vap->va_uid; gid = ap->a_vap->va_gid; if ((flags & LAYERFS_MBYPASSDEBUG)) printf("umap_getattr: mapped uid = %d, mapped gid = %d\n", uid, gid); vp1p = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap); nentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_nentries; mapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_mapdata); gnentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gnentries; gmapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gmapdata); /* Reverse map the uid for the vnode. Since it's a reverse map, we can't use umap_mapids() to do it. */ tmpid = umap_reverse_findid(uid, mapdata, nentries); if (tmpid != -1) { ap->a_vap->va_uid = (uid_t) tmpid; if ((flags & LAYERFS_MBYPASSDEBUG)) printf("umap_getattr: original uid = %d\n", uid); } else ap->a_vap->va_uid = (uid_t) NOBODY; /* Reverse map the gid for the vnode. */ tmpid = umap_reverse_findid(gid, gmapdata, gnentries); if (tmpid != -1) { ap->a_vap->va_gid = (gid_t) tmpid; if ((flags & LAYERFS_MBYPASSDEBUG)) printf("umap_getattr: original gid = %d\n", gid); } else ap->a_vap->va_gid = (gid_t) NULLGROUP; return (0); } int umap_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; printf("\ttag VT_UMAPFS, vp=%p, lowervp=%p\n", vp, UMAPVPTOLOWERVP(vp)); return (0); } int umap_rename(void *v) { struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap = v; int error, flags; struct componentname *compnamep; kauth_cred_t compcredp, savecompcredp; struct vnode *vp; struct vnode *tvp; /* * Rename is irregular, having two componentname structures. * We need to map the cre in the second structure, * and then bypass takes care of the rest. 
*/ vp = ap->a_fdvp; flags = MOUNTTOUMAPMOUNT(vp->v_mount)->umapm_flags; compnamep = ap->a_tcnp; compcredp = compnamep->cn_cred; savecompcredp = compcredp; compcredp = compnamep->cn_cred = kauth_cred_dup(savecompcredp); if ((flags & LAYERFS_MBYPASSDEBUG) && kauth_cred_geteuid(compcredp) != 0) printf("umap_rename: rename component credit user was %d, group %d\n", kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp)); /* Map all ids in the credential structure. */ umap_mapids(vp->v_mount, compcredp); if ((flags & LAYERFS_MBYPASSDEBUG) && kauth_cred_geteuid(compcredp) != 0) printf("umap_rename: rename component credit user now %d, group %d\n", kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp)); tvp = ap->a_tvp; if (tvp) { if (tvp->v_mount != vp->v_mount) tvp = NULL; else vref(tvp); } error = umap_bypass(ap); if (tvp) { if (error == 0) VTOLAYER(tvp)->layer_flags |= LAYERFS_REMOVED; vrele(tvp); } /* Restore the additional mapped componentname cred structure. */ kauth_cred_free(compcredp); compnamep->cn_cred = savecompcredp; return error; }
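/*
 * Illustrative sketch (not part of umap_vnops.c): a standalone model
 * of the id translation umapfs performs.  Credential ids headed to
 * the lower layer are mapped forward through a table of id pairs (the
 * job of umap_mapids()), while ids coming back from a stat are mapped
 * in reverse (cf. umap_reverse_findid() in umap_getattr()).  The
 * table contents and names below are hypothetical.
 */
#include <stdio.h>

struct idmap { unsigned long outside, inside; };

static const struct idmap demo_map[] = {
	{ 1000, 2000 },
	{ 1001, 2001 },
};
static const int demo_nentries = sizeof(demo_map) / sizeof(demo_map[0]);

/* Forward map an id for the lower layer; -1 if it has no mapping. */
static long
map_forward(unsigned long id)
{

	for (int i = 0; i < demo_nentries; i++)
		if (demo_map[i].outside == id)
			return (long)demo_map[i].inside;
	return -1;
}

/*
 * Reverse map an id returned by the lower layer; -1 if unmapped (the
 * kernel substitutes NOBODY/NULLGROUP in that case).
 */
static long
map_reverse(unsigned long id)
{

	for (int i = 0; i < demo_nentries; i++)
		if (demo_map[i].inside == id)
			return (long)demo_map[i].outside;
	return -1;
}

int
main(void)
{

	printf("forward 1000 -> %ld\n", map_forward(1000));
	printf("reverse 2001 -> %ld\n", map_reverse(2001));
	printf("reverse 9999 -> %ld\n", map_reverse(9999));
	return 0;
}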
/* $NetBSD: kern_ktrace_vfs.c,v 1.3 2021/06/29 22:40:53 dholland Exp $ */ /*- * Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_ktrace.c 8.5 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_ktrace_vfs.c,v 1.3 2021/06/29 22:40:53 dholland Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/kernel.h> #include <sys/ktrace.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/syscallargs.h> /* * ktrace system call, the part of the ktrace framework that * explicitly interacts with VFS */ /* ARGSUSED */ int sys_ktrace(struct lwp *l, const struct sys_ktrace_args *uap, register_t *retval) { /* { syscallarg(const char *) fname; syscallarg(int) ops; syscallarg(int) facs; syscallarg(int) pid; } */ struct vnode *vp = NULL; file_t *fp = NULL; struct pathbuf *pb; int error = 0; int fd; if (ktrenter(l)) return EAGAIN; if (KTROP(SCARG(uap, ops)) != KTROP_CLEAR) { /* * an operation which requires a file argument. */ error = pathbuf_copyin(SCARG(uap, fname), &pb); if (error) { ktrexit(l); return (error); } error = vn_open(NULL, pb, 0, FREAD|FWRITE, 0, &vp, NULL, NULL); if (error != 0) { pathbuf_destroy(pb); ktrexit(l); return (error); } pathbuf_destroy(pb); VOP_UNLOCK(vp); if (vp->v_type != VREG) { vn_close(vp, FREAD|FWRITE, l->l_cred); ktrexit(l); return (EACCES); } /* * This uses up a file descriptor slot in the * tracing process for the duration of this syscall. * This is not expected to be a problem. */ if ((error = fd_allocfile(&fp, &fd)) != 0) { vn_close(vp, FWRITE, l->l_cred); ktrexit(l); return error; } fp->f_flag = FWRITE; fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_vnode = vp; vp = NULL; } error = ktrace_common(l, SCARG(uap, ops), SCARG(uap, facs), SCARG(uap, pid), &fp); if (KTROP(SCARG(uap, ops)) != KTROP_CLEAR) fd_abort(curproc, fp, fd); return (error); }
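/*
 * Illustrative sketch (not part of kern_ktrace_vfs.c): the userland
 * side of the handler above, using the ktrace(2) syscall to start and
 * stop tracing the current process.  The trace file path and the
 * KTRFAC_* facility mask are illustrative; see ktrace(2) for the full
 * set of facilities.
 */
#include <sys/param.h>
#include <sys/ktrace.h>

#include <err.h>
#include <unistd.h>

int
main(void)
{
	const int facs = KTRFAC_SYSCALL | KTRFAC_SYSRET;

	/*
	 * KTROP_SET takes a file argument, matching the vn_open() path
	 * in sys_ktrace() above.
	 */
	if (ktrace("demo.ktrace", KTROP_SET, facs, getpid()) == -1)
		err(1, "ktrace");

	/* ... syscalls made here are recorded ... */

	/* KTROP_CLEAR never looks at the file name, so NULL is fine. */
	if (ktrace(NULL, KTROP_CLEAR, facs, getpid()) == -1)
		err(1, "ktrace clear");
	return 0;
}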
/* $NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $ */ /* NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $ */ /*- * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. * * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)vm_pageout.c 8.5 (Berkeley) 2/14/94 * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #if defined(PDSIM) #include "pdsim.h" #else /* defined(PDSIM) */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $"); #include <sys/param.h> #include <sys/proc.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/atomic.h> #include <uvm/uvm.h> #include <uvm/uvm_pdpolicy.h> #include <uvm/uvm_pdpolicy_impl.h> #include <uvm/uvm_stat.h> #endif /* defined(PDSIM) */ /* * per-CPU queue of pending page status changes. 128 entries makes for a * 1kB queue on _LP64 and has been found to be a reasonable compromise that * keeps lock contention events and wait times low, while not using too much * memory nor allowing global state to fall too far behind. 
*/ #if !defined(CLOCK_PDQ_SIZE) #define CLOCK_PDQ_SIZE 128 #endif /* !defined(CLOCK_PDQ_SIZE) */ #define PQ_INACTIVE 0x00000010 /* page is in inactive list */ #define PQ_ACTIVE 0x00000020 /* page is in active list */ #if !defined(CLOCK_INACTIVEPCT) #define CLOCK_INACTIVEPCT 33 #endif /* !defined(CLOCK_INACTIVEPCT) */ struct uvmpdpol_globalstate { kmutex_t lock; /* lock on state */ /* <= compiler pads here */ struct pglist s_activeq /* allocated pages, in use */ __aligned(COHERENCY_UNIT); struct pglist s_inactiveq; /* pages between the clock hands */ int s_active; int s_inactive; int s_inactarg; struct uvm_pctparam s_anonmin; struct uvm_pctparam s_filemin; struct uvm_pctparam s_execmin; struct uvm_pctparam s_anonmax; struct uvm_pctparam s_filemax; struct uvm_pctparam s_execmax; struct uvm_pctparam s_inactivepct; }; struct uvmpdpol_scanstate { bool ss_anonreact, ss_filereact, ss_execreact; struct vm_page ss_marker; }; static void uvmpdpol_pageactivate_locked(struct vm_page *); static void uvmpdpol_pagedeactivate_locked(struct vm_page *); static void uvmpdpol_pagedequeue_locked(struct vm_page *); static bool uvmpdpol_pagerealize_locked(struct vm_page *); static struct uvm_cpu *uvmpdpol_flush(void); static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned; static struct uvmpdpol_scanstate pdpol_scanstate; PDPOL_EVCNT_DEFINE(reactexec) PDPOL_EVCNT_DEFINE(reactfile) PDPOL_EVCNT_DEFINE(reactanon) static void clock_tune(void) { struct uvmpdpol_globalstate *s = &pdpol_state; s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct, s->s_active + s->s_inactive); if (s->s_inactarg <= uvmexp.freetarg) { s->s_inactarg = uvmexp.freetarg + 1; } } void uvmpdpol_scaninit(void) { struct uvmpdpol_globalstate *s = &pdpol_state; struct uvmpdpol_scanstate *ss = &pdpol_scanstate; int t; bool anonunder, fileunder, execunder; bool anonover, fileover, execover; bool anonreact, filereact, execreact; int64_t freepg, anonpg, filepg, execpg; /* * decide which types of pages we want to reactivate instead of freeing * to keep usage within the minimum and maximum usage limits. * uvm_availmem() will sync the counters. 
*/ freepg = uvm_availmem(false); anonpg = cpu_count_get(CPU_COUNT_ANONCLEAN) + cpu_count_get(CPU_COUNT_ANONDIRTY) + cpu_count_get(CPU_COUNT_ANONUNKNOWN); execpg = cpu_count_get(CPU_COUNT_EXECPAGES); filepg = cpu_count_get(CPU_COUNT_FILECLEAN) + cpu_count_get(CPU_COUNT_FILEDIRTY) + cpu_count_get(CPU_COUNT_FILEUNKNOWN) - execpg; mutex_enter(&s->lock); t = s->s_active + s->s_inactive + freepg; anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t); fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t); execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t); anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t); fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t); execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t); anonreact = anonunder || (!anonover && (fileover || execover)); filereact = fileunder || (!fileover && (anonover || execover)); execreact = execunder || (!execover && (anonover || fileover)); if (filereact && execreact && (anonreact || uvm_swapisfull())) { anonreact = filereact = execreact = false; } ss->ss_anonreact = anonreact; ss->ss_filereact = filereact; ss->ss_execreact = execreact; memset(&ss->ss_marker, 0, sizeof(ss->ss_marker)); ss->ss_marker.flags = PG_MARKER; TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue); mutex_exit(&s->lock); } void uvmpdpol_scanfini(void) { struct uvmpdpol_globalstate *s = &pdpol_state; struct uvmpdpol_scanstate *ss = &pdpol_scanstate; mutex_enter(&s->lock); TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue); mutex_exit(&s->lock); } struct vm_page * uvmpdpol_selectvictim(krwlock_t **plock) { struct uvmpdpol_globalstate *s = &pdpol_state; struct uvmpdpol_scanstate *ss = &pdpol_scanstate; struct vm_page *pg; krwlock_t *lock; mutex_enter(&s->lock); while (/* CONSTCOND */ 1) { struct vm_anon *anon; struct uvm_object *uobj; pg = TAILQ_NEXT(&ss->ss_marker, pdqueue); if (pg == NULL) { break; } KASSERT((pg->flags & PG_MARKER) == 0); uvmexp.pdscans++; /* * acquire interlock to stabilize page identity. * if we have caught the page in a state of flux * deal with it and retry. */ mutex_enter(&pg->interlock); if (uvmpdpol_pagerealize_locked(pg)) { mutex_exit(&pg->interlock); continue; } /* * now prepare to move on to the next page. */ TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue); TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg, &ss->ss_marker, pdqueue); /* * enforce the minimum thresholds on different * types of memory usage. if reusing the current * page would reduce that type of usage below its * minimum, reactivate the page instead and move * on to the next page. */ anon = pg->uanon; uobj = pg->uobject; if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) { uvmpdpol_pageactivate_locked(pg); mutex_exit(&pg->interlock); PDPOL_EVCNT_INCR(reactexec); continue; } if (uobj && UVM_OBJ_IS_VNODE(uobj) && !UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) { uvmpdpol_pageactivate_locked(pg); mutex_exit(&pg->interlock); PDPOL_EVCNT_INCR(reactfile); continue; } if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) { uvmpdpol_pageactivate_locked(pg); mutex_exit(&pg->interlock); PDPOL_EVCNT_INCR(reactanon); continue; } /* * try to lock the object that owns the page. * * with the page interlock held, we can drop s->lock, which * could otherwise serve as a barrier to us getting the * object locked, because the owner of the object's lock may * be blocked on s->lock (i.e. a deadlock). * * whatever happens, uvmpd_trylockowner() will release the * interlock. 
with the interlock dropped we can then * re-acquire our own lock. the order is: * * object -> pdpol -> interlock. */ mutex_exit(&s->lock); lock = uvmpd_trylockowner(pg); /* pg->interlock now released */ mutex_enter(&s->lock); if (lock == NULL) { /* didn't get it - try the next page. */ continue; } /* * move referenced pages back to active queue and skip to * next page. */ if (pmap_is_referenced(pg)) { mutex_enter(&pg->interlock); uvmpdpol_pageactivate_locked(pg); mutex_exit(&pg->interlock); uvmexp.pdreact++; rw_exit(lock); continue; } /* we have a potential victim. */ *plock = lock; break; } mutex_exit(&s->lock); return pg; } void uvmpdpol_balancequeue(int swap_shortage) { struct uvmpdpol_globalstate *s = &pdpol_state; int inactive_shortage; struct vm_page *p, marker; krwlock_t *lock; /* * we have done the scan to get free pages. now we work on meeting * our inactive target. */ memset(&marker, 0, sizeof(marker)); marker.flags = PG_MARKER; mutex_enter(&s->lock); TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue); for (;;) { inactive_shortage = pdpol_state.s_inactarg - pdpol_state.s_inactive; if (inactive_shortage <= 0 && swap_shortage <= 0) { break; } p = TAILQ_NEXT(&marker, pdqueue); if (p == NULL) { break; } KASSERT((p->flags & PG_MARKER) == 0); /* * acquire interlock to stabilize page identity. * if we have caught the page in a state of flux * deal with it and retry. */ mutex_enter(&p->interlock); if (uvmpdpol_pagerealize_locked(p)) { mutex_exit(&p->interlock); continue; } /* * now prepare to move on to the next page. */ TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue); TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker, pdqueue); /* * try to lock the object that owns the page. see comments * in uvmpdol_selectvictim(). */ mutex_exit(&s->lock); lock = uvmpd_trylockowner(p); /* p->interlock now released */ mutex_enter(&s->lock); if (lock == NULL) { /* didn't get it - try the next page. */ continue; } /* * if there's a shortage of swap slots, try to free it. */ if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 && (p->flags & PG_BUSY) == 0) { if (uvmpd_dropswap(p)) { swap_shortage--; } } /* * if there's a shortage of inactive pages, deactivate. */ if (inactive_shortage > 0) { pmap_clear_reference(p); mutex_enter(&p->interlock); uvmpdpol_pagedeactivate_locked(p); mutex_exit(&p->interlock); uvmexp.pddeact++; inactive_shortage--; } rw_exit(lock); } TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue); mutex_exit(&s->lock); } static void uvmpdpol_pagedeactivate_locked(struct vm_page *pg) { struct uvmpdpol_globalstate *s __diagused = &pdpol_state; KASSERT(mutex_owned(&s->lock)); KASSERT(mutex_owned(&pg->interlock)); KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) != (PQ_INTENT_D | PQ_INTENT_SET)); if (pg->pqflags & PQ_ACTIVE) { TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue); KASSERT(pdpol_state.s_active > 0); pdpol_state.s_active--; } if ((pg->pqflags & PQ_INACTIVE) == 0) { KASSERT(pg->wire_count == 0); TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue); pdpol_state.s_inactive++; } pg->pqflags &= ~(PQ_ACTIVE | PQ_INTENT_SET); pg->pqflags |= PQ_INACTIVE; } void uvmpdpol_pagedeactivate(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, false)); KASSERT(mutex_owned(&pg->interlock)); /* * we have to clear the reference bit now, as when it comes time to * realize the intent we won't have the object locked any more. 
*/ pmap_clear_reference(pg); uvmpdpol_set_intent(pg, PQ_INTENT_I); } static void uvmpdpol_pageactivate_locked(struct vm_page *pg) { struct uvmpdpol_globalstate *s __diagused = &pdpol_state; KASSERT(mutex_owned(&s->lock)); KASSERT(mutex_owned(&pg->interlock)); KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) != (PQ_INTENT_D | PQ_INTENT_SET)); uvmpdpol_pagedequeue_locked(pg); TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue); pdpol_state.s_active++; pg->pqflags &= ~(PQ_INACTIVE | PQ_INTENT_SET); pg->pqflags |= PQ_ACTIVE; } void uvmpdpol_pageactivate(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, false)); KASSERT(mutex_owned(&pg->interlock)); uvmpdpol_set_intent(pg, PQ_INTENT_A); } static void uvmpdpol_pagedequeue_locked(struct vm_page *pg) { struct uvmpdpol_globalstate *s __diagused = &pdpol_state; KASSERT(mutex_owned(&s->lock)); KASSERT(mutex_owned(&pg->interlock)); if (pg->pqflags & PQ_ACTIVE) { TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue); KASSERT((pg->pqflags & PQ_INACTIVE) == 0); KASSERT(pdpol_state.s_active > 0); pdpol_state.s_active--; } else if (pg->pqflags & PQ_INACTIVE) { TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue); KASSERT(pdpol_state.s_inactive > 0); pdpol_state.s_inactive--; } pg->pqflags &= ~(PQ_ACTIVE | PQ_INACTIVE | PQ_INTENT_SET); } void uvmpdpol_pagedequeue(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, true)); KASSERT(mutex_owned(&pg->interlock)); uvmpdpol_set_intent(pg, PQ_INTENT_D); } void uvmpdpol_pageenqueue(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, false)); KASSERT(mutex_owned(&pg->interlock)); uvmpdpol_set_intent(pg, PQ_INTENT_E); } void uvmpdpol_anfree(struct vm_anon *an) { } bool uvmpdpol_pageisqueued_p(struct vm_page *pg) { uint32_t pqflags; /* * if there's an intent set, we have to consider it. otherwise, * return the actual state. we may be called unlocked for the * purpose of assertions, which is safe due to the page lifecycle. */ pqflags = atomic_load_relaxed(&pg->pqflags); if ((pqflags & PQ_INTENT_SET) != 0) { return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D; } else { return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0; } } bool uvmpdpol_pageactivate_p(struct vm_page *pg) { uint32_t pqflags; /* consider intent in preference to actual state. */ pqflags = atomic_load_relaxed(&pg->pqflags); if ((pqflags & PQ_INTENT_SET) != 0) { pqflags &= PQ_INTENT_MASK; return pqflags != PQ_INTENT_A && pqflags != PQ_INTENT_E; } else { /* * TODO: Enabling this may be too much of a big hammer, * since we do get useful information from activations. * Think about it more and maybe come up with a heuristic * or something. * * return (pqflags & PQ_ACTIVE) == 0; */ return true; } } void uvmpdpol_estimatepageable(int *active, int *inactive) { struct uvmpdpol_globalstate *s = &pdpol_state; /* * Don't take any locks here. This can be called from DDB, and in * any case the numbers are stale the instant the lock is dropped, * so it just doesn't matter. 
*/ if (active) { *active = s->s_active; } if (inactive) { *inactive = s->s_inactive; } } #if !defined(PDSIM) static int min_check(struct uvm_pctparam *pct, int t) { struct uvmpdpol_globalstate *s = &pdpol_state; int total = t; if (pct != &s->s_anonmin) { total += uvm_pctparam_get(&s->s_anonmin); } if (pct != &s->s_filemin) { total += uvm_pctparam_get(&s->s_filemin); } if (pct != &s->s_execmin) { total += uvm_pctparam_get(&s->s_execmin); } if (total > 95) { return EINVAL; } return 0; } #endif /* !defined(PDSIM) */ void uvmpdpol_init(void) { struct uvmpdpol_globalstate *s = &pdpol_state; mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE); TAILQ_INIT(&s->s_activeq); TAILQ_INIT(&s->s_inactiveq); uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL); uvm_pctparam_init(&s->s_anonmin, 10, min_check); uvm_pctparam_init(&s->s_filemin, 10, min_check); uvm_pctparam_init(&s->s_execmin, 5, min_check); uvm_pctparam_init(&s->s_anonmax, 80, NULL); uvm_pctparam_init(&s->s_filemax, 50, NULL); uvm_pctparam_init(&s->s_execmax, 30, NULL); } void uvmpdpol_init_cpu(struct uvm_cpu *ucpu) { ucpu->pdq = kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP); ucpu->pdqhead = CLOCK_PDQ_SIZE; ucpu->pdqtail = CLOCK_PDQ_SIZE; } void uvmpdpol_reinit(void) { } bool uvmpdpol_needsscan_p(void) { /* * this must be an unlocked check: can be called from interrupt. */ return pdpol_state.s_inactive < pdpol_state.s_inactarg; } void uvmpdpol_tune(void) { struct uvmpdpol_globalstate *s = &pdpol_state; mutex_enter(&s->lock); clock_tune(); mutex_exit(&s->lock); } /* * uvmpdpol_pagerealize_locked: take the intended state set on a page and * make it real. return true if any work was done. */ static bool uvmpdpol_pagerealize_locked(struct vm_page *pg) { struct uvmpdpol_globalstate *s __diagused = &pdpol_state; KASSERT(mutex_owned(&s->lock)); KASSERT(mutex_owned(&pg->interlock)); switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) { case PQ_INTENT_A | PQ_INTENT_SET: case PQ_INTENT_E | PQ_INTENT_SET: uvmpdpol_pageactivate_locked(pg); return true; case PQ_INTENT_I | PQ_INTENT_SET: uvmpdpol_pagedeactivate_locked(pg); return true; case PQ_INTENT_D | PQ_INTENT_SET: uvmpdpol_pagedequeue_locked(pg); return true; default: return false; } } /* * uvmpdpol_flush: return the current uvm_cpu with all of its pending * updates flushed to the global queues. this routine may block, and * so can switch cpu. the idea is to empty to queue on whatever cpu * we finally end up on. */ static struct uvm_cpu * uvmpdpol_flush(void) { struct uvmpdpol_globalstate *s __diagused = &pdpol_state; struct uvm_cpu *ucpu; struct vm_page *pg; KASSERT(kpreempt_disabled()); mutex_enter(&s->lock); for (;;) { /* * prefer scanning forwards (even though mutex_enter() is * serializing) so as to not defeat any prefetch logic in * the CPU. that means elsewhere enqueuing backwards, like * a stack, but not so important there as pages are being * added singularly. * * prefetch the next "struct vm_page" while working on the * current one. this has a measurable and very positive * effect in reducing the amount of time spent here under * the global lock. 
*/ ucpu = curcpu()->ci_data.cpu_uvm; KASSERT(ucpu->pdqhead <= ucpu->pdqtail); if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) { break; } pg = ucpu->pdq[ucpu->pdqhead++]; if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) { __builtin_prefetch(ucpu->pdq[ucpu->pdqhead]); } mutex_enter(&pg->interlock); pg->pqflags &= ~PQ_INTENT_QUEUED; (void)uvmpdpol_pagerealize_locked(pg); mutex_exit(&pg->interlock); } mutex_exit(&s->lock); return ucpu; } /* * uvmpdpol_pagerealize: realize any intent set on the page. in this * implementation, that means putting the page on a per-CPU queue to be * dealt with later. */ void uvmpdpol_pagerealize(struct vm_page *pg) { struct uvm_cpu *ucpu; /* * drain the per per-CPU queue if full, then enter the page. */ kpreempt_disable(); ucpu = curcpu()->ci_data.cpu_uvm; if (__predict_false(ucpu->pdqhead == 0)) { ucpu = uvmpdpol_flush(); } ucpu->pdq[--(ucpu->pdqhead)] = pg; kpreempt_enable(); } /* * uvmpdpol_idle: called from the system idle loop. periodically purge any * pending updates back to the global queues. */ void uvmpdpol_idle(struct uvm_cpu *ucpu) { struct uvmpdpol_globalstate *s = &pdpol_state; struct vm_page *pg; KASSERT(kpreempt_disabled()); /* * if no pages in the queue, we have nothing to do. */ if (ucpu->pdqhead == ucpu->pdqtail) { ucpu->pdqtime = getticks(); return; } /* * don't do this more than ~8 times a second as it would needlessly * exert pressure. */ if (getticks() - ucpu->pdqtime < (hz >> 3)) { return; } /* * the idle LWP can't block, so we have to try for the lock. if we * get it, purge the per-CPU pending update queue. continually * check for a pending resched: in that case exit immediately. */ if (mutex_tryenter(&s->lock)) { while (ucpu->pdqhead != ucpu->pdqtail) { pg = ucpu->pdq[ucpu->pdqhead]; if (!mutex_tryenter(&pg->interlock)) { break; } ucpu->pdqhead++; pg->pqflags &= ~PQ_INTENT_QUEUED; (void)uvmpdpol_pagerealize_locked(pg); mutex_exit(&pg->interlock); if (curcpu()->ci_want_resched) { break; } } if (ucpu->pdqhead == ucpu->pdqtail) { ucpu->pdqtime = getticks(); } mutex_exit(&s->lock); } } #if !defined(PDSIM) #include <sys/sysctl.h> /* XXX SYSCTL_DESCR */ void uvmpdpol_sysctlsetup(void) { struct uvmpdpol_globalstate *s = &pdpol_state; uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin", SYSCTL_DESCR("Percentage of physical memory reserved " "for anonymous application data")); uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin", SYSCTL_DESCR("Percentage of physical memory reserved " "for cached file data")); uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin", SYSCTL_DESCR("Percentage of physical memory reserved " "for cached executable data")); uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax", SYSCTL_DESCR("Percentage of physical memory which will " "be reclaimed from other usage for " "anonymous application data")); uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax", SYSCTL_DESCR("Percentage of physical memory which will " "be reclaimed from other usage for cached " "file data")); uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax", SYSCTL_DESCR("Percentage of physical memory which will " "be reclaimed from other usage for cached " "executable data")); uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct", SYSCTL_DESCR("Percentage of inactive queue of " "the entire (active + inactive) queue")); } #endif /* !defined(PDSIM) */ #if defined(PDSIM) void pdsim_dump(const char *id) { #if defined(DEBUG) /* XXX */ #endif /* defined(DEBUG) */ } #endif /* defined(PDSIM) */
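The tunables registered by uvmpdpol_sysctlsetup() above are plain integer percentages. As a rough userland sketch (not part of the file above), they can be read with sysctlbyname(3); the vm.* node names below are assumed to match the strings passed to uvm_pctparam_createsysctlnode() on a stock NetBSD kernel.

/* Userland sketch: dump the clock-policy percentage tunables. */
#include <sys/param.h>
#include <sys/sysctl.h>

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	static const char *nodes[] = {
		"vm.anonmin", "vm.filemin", "vm.execmin",
		"vm.anonmax", "vm.filemax", "vm.execmax",
		"vm.inactivepct",	/* assumed to sit under vm.* too */
	};

	for (size_t i = 0; i < sizeof(nodes) / sizeof(nodes[0]); i++) {
		int pct;
		size_t len = sizeof(pct);

		if (sysctlbyname(nodes[i], &pct, &len, NULL, 0) == -1) {
			perror(nodes[i]);
			continue;
		}
		printf("%s = %d%%\n", nodes[i], pct);
	}
	return EXIT_SUCCESS;
}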
/* $NetBSD: strncpy.c,v 1.4 2018/02/04 01:13:45 mrg Exp $ */ /*- * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Chris Torek. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> #if defined(LIBC_SCCS) && !defined(lint) #if 0 static char sccsid[] = "@(#)strncpy.c 8.1 (Berkeley) 6/4/93"; #else __RCSID("$NetBSD: strncpy.c,v 1.4 2018/02/04 01:13:45 mrg Exp $"); #endif #endif /* LIBC_SCCS and not lint */ #if !defined(_KERNEL) && !defined(_STANDALONE) #include <assert.h> #include <string.h> #else #include <lib/libkern/libkern.h> #endif #ifdef _FORTIFY_SOURCE #undef strncpy #endif /* * Copy src to dst, truncating or null-padding to always copy n bytes. * Return dst. */ char * strncpy(char *dst, const char *src, size_t n) { if (n != 0) { char *d = dst; const char *s = src; do { if ((*d++ = *s++) == 0) { /* NUL pad the remaining n-1 bytes */ while (--n != 0) *d++ = 0; break; } } while (--n != 0); } return (dst); }
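A minimal userland check of the truncate-or-pad contract documented in the comment above; it is illustrative only and assumes nothing beyond standard strncpy(3) semantics.

/* Sketch: strncpy() truncates without a NUL, or zero-pads the remainder. */
#include <assert.h>
#include <string.h>

int
main(void)
{
	char small[4], big[8];

	/* Truncation: exactly n bytes copied, no terminating NUL added. */
	memset(small, 'x', sizeof(small));
	strncpy(small, "NetBSD", sizeof(small));
	assert(memcmp(small, "NetB", 4) == 0);

	/* Padding: the rest of the destination is zero-filled. */
	memset(big, 'x', sizeof(big));
	strncpy(big, "ab", sizeof(big));
	assert(big[1] == 'b' && big[2] == '\0' && big[7] == '\0');

	return 0;
}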
/* $NetBSD: in_cksum.c,v 1.22 2008/01/25 21:12:15 joerg Exp $ */ /* * Copyright (c) 1988, 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in_cksum.c,v 1.22 2008/01/25 21:12:15 joerg Exp $"); #include <sys/param.h> #include <netinet/in.h> int in_cksum(struct mbuf *m, int len) { KASSERT(len >= 0); return cpu_in_cksum(m, len, 0, 0); }
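in_cksum() above defers to the machine-dependent cpu_in_cksum() to sum an mbuf chain. The following portable sketch computes the same RFC 1071 ones'-complement checksum over a flat buffer; it is an illustration, not the kernel's optimized routine, and makes no claim about matching cpu_in_cksum() byte order details.

/* Sketch: RFC 1071 Internet checksum over a contiguous buffer. */
#include <stddef.h>
#include <stdint.h>

uint16_t
in_cksum_flat(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;

	/* Sum 16-bit words in network byte order, carries accumulate above. */
	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len == 1)			/* pad an odd trailing byte with zero */
		sum += (uint32_t)p[0] << 8;

	/* Fold the carries back in and return the ones' complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}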
/* $NetBSD: sys_getrandom.c,v 1.2 2021/12/28 13:22:43 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * getrandom() system call */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_getrandom.c,v 1.2 2021/12/28 13:22:43 riastradh Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/atomic.h> #include <sys/cprng.h> #include <sys/entropy.h> #include <sys/kmem.h> #include <sys/lwp.h> #include <sys/proc.h> #include <sys/random.h> #include <sys/sched.h> #include <sys/signalvar.h> #include <sys/syscallargs.h> #include <sys/uio.h> #include <crypto/nist_hash_drbg/nist_hash_drbg.h> #define RANDOM_BUFSIZE 512 int dogetrandom(struct uio *uio, unsigned int flags) { uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES] = {0}; struct nist_hash_drbg drbg; uint8_t *buf; int extractflags = 0; int error; KASSERT((flags & ~(GRND_RANDOM|GRND_INSECURE|GRND_NONBLOCK)) == 0); KASSERT((flags & (GRND_RANDOM|GRND_INSECURE)) != (GRND_RANDOM|GRND_INSECURE)); /* Get a buffer for transfers. */ buf = kmem_alloc(RANDOM_BUFSIZE, KM_SLEEP); /* * Fast path: for short reads other than from /dev/random, if * seeded or if INSECURE, just draw from per-CPU cprng_strong.
*/ if (uio->uio_resid <= RANDOM_BUFSIZE && !ISSET(flags, GRND_RANDOM) && (entropy_ready() || ISSET(flags, GRND_INSECURE))) { /* Generate data and transfer it out. */ cprng_strong(user_cprng, buf, uio->uio_resid, 0); error = uiomove(buf, uio->uio_resid, uio); goto out; } /* * Try to get a seed from the entropy pool. Fail if we would * block. If GRND_INSECURE, always return something even if it * is partial entropy; if !GRND_INSECURE, set ENTROPY_HARDFAIL * in order to tell entropy_extract not to bother drawing * anything from a partial pool if we can't get full entropy. */ if (!ISSET(flags, GRND_NONBLOCK) && !ISSET(flags, GRND_INSECURE)) extractflags |= ENTROPY_WAIT|ENTROPY_SIG; if (!ISSET(flags, GRND_INSECURE)) extractflags |= ENTROPY_HARDFAIL; error = entropy_extract(seed, sizeof seed, extractflags); if (error && !ISSET(flags, GRND_INSECURE)) goto out; /* Instantiate the DRBG. */ if (nist_hash_drbg_instantiate(&drbg, seed, sizeof seed, NULL, 0, NULL, 0)) panic("nist_hash_drbg_instantiate"); /* Promptly zero the seed. */ explicit_memset(seed, 0, sizeof seed); /* Generate data. */ error = 0; while (uio->uio_resid) { size_t n = MIN(uio->uio_resid, RANDOM_BUFSIZE); /* * Clamp /dev/random output to the entropy capacity and * seed size. Programs can't rely on long reads. */ if (ISSET(flags, GRND_RANDOM)) { n = MIN(n, ENTROPY_CAPACITY); n = MIN(n, sizeof seed); /* * Guarantee never to return more than one * buffer in this case to minimize bookkeeping. */ CTASSERT(ENTROPY_CAPACITY <= RANDOM_BUFSIZE); CTASSERT(sizeof seed <= RANDOM_BUFSIZE); } /* * Try to generate a block of data, but if we've hit * the DRBG reseed interval, reseed. */ if (nist_hash_drbg_generate(&drbg, buf, n, NULL, 0)) { /* * Get a fresh seed without blocking -- we have * already generated some output so it is not * useful to block. This can fail only if the * request is obscenely large, so it is OK for * either /dev/random or /dev/urandom to fail: * we make no promises about gigabyte-sized * reads happening all at once. */ error = entropy_extract(seed, sizeof seed, ENTROPY_HARDFAIL); if (error) break; /* Reseed and try again. */ if (nist_hash_drbg_reseed(&drbg, seed, sizeof seed, NULL, 0)) panic("nist_hash_drbg_reseed"); /* Promptly zero the seed. */ explicit_memset(seed, 0, sizeof seed); /* If it fails now, that's a bug. */ if (nist_hash_drbg_generate(&drbg, buf, n, NULL, 0)) panic("nist_hash_drbg_generate"); } /* Transfer n bytes out. */ error = uiomove(buf, n, uio); if (error) break; /* * If this is /dev/random, stop here, return what we * have, and force the next read to reseed. Programs * can't rely on /dev/random for long reads. */ if (ISSET(flags, GRND_RANDOM)) { error = 0; break; } /* Now's a good time to yield if needed. */ preempt_point(); /* Check for interruption after at least 256 bytes. */ CTASSERT(RANDOM_BUFSIZE >= 256); if (__predict_false(curlwp->l_flag & LW_PENDSIG) && sigispending(curlwp, 0)) { error = EINTR; break; } } out: /* Zero the buffer and free it. */ explicit_memset(buf, 0, RANDOM_BUFSIZE); kmem_free(buf, RANDOM_BUFSIZE); return error; } int sys_getrandom(struct lwp *l, const struct sys_getrandom_args *uap, register_t *retval) { /* { syscallarg(void *) buf; syscallarg(size_t) buflen; syscallarg(unsigned) flags; } */ void *buf = SCARG(uap, buf); size_t buflen = SCARG(uap, buflen); int flags = SCARG(uap, flags); int error; /* Set up an iov and uio to read into the user's buffer. 
*/ struct iovec iov = { .iov_base = buf, .iov_len = buflen }; struct uio uio = { .uio_iov = &iov, .uio_iovcnt = 1, .uio_offset = 0, .uio_resid = buflen, .uio_rw = UIO_READ, .uio_vmspace = curproc->p_vmspace, }; /* Validate the flags. */ if (flags & ~(GRND_RANDOM|GRND_INSECURE|GRND_NONBLOCK)) { /* Unknown flags. */ error = EINVAL; goto out; } if ((flags & (GRND_RANDOM|GRND_INSECURE)) == (GRND_RANDOM|GRND_INSECURE)) { /* Nonsensical combination. */ error = EINVAL; goto out; } /* Do it. */ error = dogetrandom(&uio, flags); out: /* * If we transferred anything, return the number of bytes * transferred and suppress error; otherwise return the error. */ *retval = buflen - uio.uio_resid; if (*retval) error = 0; return error; }
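A hedged userland sketch of calling the system call implemented above. It assumes the NetBSD <sys/random.h> declaration of getrandom(2) and the NetBSD-specific explicit_memset(3); the retry-on-EINTR loop mirrors the interruptible wait in dogetrandom().

/* Userland sketch: fetch 32 bytes of key material via getrandom(2). */
#include <sys/random.h>

#include <err.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	unsigned char key[32];
	ssize_t n;

	/* Block until the entropy pool is seeded, but allow signals. */
	do {
		n = getrandom(key, sizeof(key), 0);
	} while (n == -1 && errno == EINTR);
	if (n == -1)
		err(1, "getrandom");

	/* Short reads are possible in principle; treat them as fatal here. */
	if ((size_t)n != sizeof(key))
		errx(1, "short read: %zd", n);

	printf("got %zd random bytes\n", n);
	explicit_memset(key, 0, sizeof(key));	/* NetBSD-specific zeroing */
	return 0;
}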
/* $NetBSD: sys_lwp.c,v 1.89 2023/10/15 10:29:24 riastradh Exp $ */ /*- * Copyright (c) 2001, 2006, 2007, 2008, 2019, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Nathan J. Williams, and Andrew Doran.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Lightweight process (LWP) system calls. See kern_lwp.c for a description * of LWPs. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_lwp.c,v 1.89 2023/10/15 10:29:24 riastradh Exp $"); #include <sys/param.h> #include <sys/cpu.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/lwpctl.h> #include <sys/pool.h> #include <sys/proc.h> #include <sys/pserialize.h> #include <sys/ptrace.h> #include <sys/sleepq.h> #include <sys/syncobj.h> #include <sys/syscallargs.h> #include <sys/systm.h> #include <sys/types.h> #include <uvm/uvm_extern.h> #define LWP_UNPARK_MAX 1024 static const stack_t lwp_ss_init = SS_INIT; /* * Parked LWPs get no priority boost on awakening as they blocked on * user space objects. Maybe revisit? 
*/ syncobj_t lwp_park_syncobj = { .sobj_name = "lwp_park", .sobj_flag = SOBJ_SLEEPQ_NULL, .sobj_boostpri = PRI_USER, .sobj_unsleep = sleepq_unsleep, .sobj_changepri = sleepq_changepri, .sobj_lendpri = sleepq_lendpri, .sobj_owner = syncobj_noowner, }; static void mi_startlwp(void *arg) { struct lwp *l = curlwp; struct proc *p = l->l_proc; (p->p_emul->e_startlwp)(arg); /* If the process is traced, report lwp creation to a debugger */ if ((p->p_slflag & (PSL_TRACED|PSL_TRACELWP_CREATE)) == (PSL_TRACED|PSL_TRACELWP_CREATE)) { /* Paranoid check */ mutex_enter(&proc_lock); if ((p->p_slflag & (PSL_TRACED|PSL_TRACELWP_CREATE)) != (PSL_TRACED|PSL_TRACELWP_CREATE)) { mutex_exit(&proc_lock); return; } mutex_enter(p->p_lock); eventswitch(TRAP_LWP, PTRACE_LWP_CREATE, l->l_lid); } } int do_lwp_create(lwp_t *l, void *arg, u_long flags, lwp_t **l2, const sigset_t *sigmask, const stack_t *sigstk) { struct proc *p = l->l_proc; vaddr_t uaddr; int error; /* XXX check against resource limits */ uaddr = uvm_uarea_alloc(); if (__predict_false(uaddr == 0)) return ENOMEM; error = lwp_create(l, p, uaddr, flags & LWP_DETACHED, NULL, 0, mi_startlwp, arg, l2, l->l_class, sigmask, &lwp_ss_init); if (__predict_false(error)) { uvm_uarea_free(uaddr); return error; } return 0; } int sys__lwp_create(struct lwp *l, const struct sys__lwp_create_args *uap, register_t *retval) { /* { syscallarg(const ucontext_t *) ucp; syscallarg(u_long) flags; syscallarg(lwpid_t *) new_lwp; } */ struct proc *p = l->l_proc; ucontext_t *newuc; lwp_t *l2; int error; newuc = kmem_alloc(sizeof(ucontext_t), KM_SLEEP); error = copyin(SCARG(uap, ucp), newuc, p->p_emul->e_ucsize); if (error) goto fail; /* validate the ucontext */ if ((newuc->uc_flags & _UC_CPU) == 0) { error = EINVAL; goto fail; } error = cpu_mcontext_validate(l, &newuc->uc_mcontext); if (error) goto fail; const sigset_t *sigmask = newuc->uc_flags & _UC_SIGMASK ? &newuc->uc_sigmask : &l->l_sigmask; error = do_lwp_create(l, newuc, SCARG(uap, flags), &l2, sigmask, &SS_INIT); if (error) goto fail; error = copyout(&l2->l_lid, SCARG(uap, new_lwp), sizeof(l2->l_lid)); if (error == 0) { lwp_start(l2, SCARG(uap, flags)); return 0; } lwp_exit(l2); fail: kmem_free(newuc, sizeof(ucontext_t)); return error; } int sys__lwp_exit(struct lwp *l, const void *v, register_t *retval) { lwp_exit(l); return 0; } int sys__lwp_self(struct lwp *l, const void *v, register_t *retval) { *retval = l->l_lid; return 0; } int sys__lwp_getprivate(struct lwp *l, const void *v, register_t *retval) { *retval = (uintptr_t)l->l_private; return 0; } int sys__lwp_setprivate(struct lwp *l, const struct sys__lwp_setprivate_args *uap, register_t *retval) { /* { syscallarg(void *) ptr; } */ return lwp_setprivate(l, SCARG(uap, ptr)); } int sys__lwp_suspend(struct lwp *l, const struct sys__lwp_suspend_args *uap, register_t *retval) { /* { syscallarg(lwpid_t) target; } */ struct proc *p = l->l_proc; struct lwp *t; int error; mutex_enter(p->p_lock); if ((t = lwp_find(p, SCARG(uap, target))) == NULL) { mutex_exit(p->p_lock); return ESRCH; } /* * Check for deadlock, which is only possible when we're suspending * ourself. XXX There is a short race here, as p_nrlwps is only * incremented when an LWP suspends itself on the kernel/user * boundary. It's still possible to kill -9 the process so we * don't bother checking further. */ lwp_lock(t); if ((t == l && p->p_nrlwps == 1) || (l->l_flag & (LW_WCORE | LW_WEXIT)) != 0) { lwp_unlock(t); mutex_exit(p->p_lock); return EDEADLK; } /* * Suspend the LWP. 
XXX If it's on a different CPU, we should wait * for it to be preempted, where it will put itself to sleep. * * Suspension of the current LWP will happen on return to userspace. */ error = lwp_suspend(l, t); if (error) { mutex_exit(p->p_lock); return error; } /* * Wait for: * o process exiting * o target LWP suspended * o target LWP not suspended and L_WSUSPEND clear * o target LWP exited */ for (;;) { error = cv_wait_sig(&p->p_lwpcv, p->p_lock); if (error) { error = ERESTART; break; } if (lwp_find(p, SCARG(uap, target)) == NULL) { error = ESRCH; break; } if ((l->l_flag | t->l_flag) & (LW_WCORE | LW_WEXIT)) { error = ERESTART; break; } if (t->l_stat == LSSUSPENDED || (t->l_flag & LW_WSUSPEND) == 0) break; } mutex_exit(p->p_lock); return error; } int sys__lwp_continue(struct lwp *l, const struct sys__lwp_continue_args *uap, register_t *retval) { /* { syscallarg(lwpid_t) target; } */ int error; struct proc *p = l->l_proc; struct lwp *t; error = 0; mutex_enter(p->p_lock); if ((t = lwp_find(p, SCARG(uap, target))) == NULL) { mutex_exit(p->p_lock); return ESRCH; } lwp_lock(t); lwp_continue(t); mutex_exit(p->p_lock); return error; } int sys__lwp_wakeup(struct lwp *l, const struct sys__lwp_wakeup_args *uap, register_t *retval) { /* { syscallarg(lwpid_t) target; } */ struct lwp *t; struct proc *p; int error; p = l->l_proc; mutex_enter(p->p_lock); if ((t = lwp_find(p, SCARG(uap, target))) == NULL) { mutex_exit(p->p_lock); return ESRCH; } lwp_lock(t); t->l_flag |= (LW_CANCELLED | LW_UNPARKED); if (t->l_stat != LSSLEEP) { lwp_unlock(t); error = ENODEV; } else if ((t->l_flag & LW_SINTR) == 0) { lwp_unlock(t); error = EBUSY; } else { /* Wake it up. lwp_unsleep() will release the LWP lock. */ lwp_unsleep(t, true); error = 0; } mutex_exit(p->p_lock); return error; } int sys__lwp_wait(struct lwp *l, const struct sys__lwp_wait_args *uap, register_t *retval) { /* { syscallarg(lwpid_t) wait_for; syscallarg(lwpid_t *) departed; } */ struct proc *p = l->l_proc; int error; lwpid_t dep; mutex_enter(p->p_lock); error = lwp_wait(l, SCARG(uap, wait_for), &dep, false); mutex_exit(p->p_lock); if (!error && SCARG(uap, departed)) { error = copyout(&dep, SCARG(uap, departed), sizeof(dep)); } return error; } int sys__lwp_kill(struct lwp *l, const struct sys__lwp_kill_args *uap, register_t *retval) { /* { syscallarg(lwpid_t) target; syscallarg(int) signo; } */ struct proc *p = l->l_proc; struct lwp *t; ksiginfo_t ksi; int signo = SCARG(uap, signo); int error = 0; if ((u_int)signo >= NSIG) return EINVAL; KSI_INIT(&ksi); ksi.ksi_signo = signo; ksi.ksi_code = SI_LWP; ksi.ksi_pid = p->p_pid; ksi.ksi_uid = kauth_cred_geteuid(l->l_cred); ksi.ksi_lid = SCARG(uap, target); mutex_enter(&proc_lock); mutex_enter(p->p_lock); if ((t = lwp_find(p, ksi.ksi_lid)) == NULL) error = ESRCH; else if (signo != 0) kpsignal2(p, &ksi); mutex_exit(p->p_lock); mutex_exit(&proc_lock); return error; } int sys__lwp_detach(struct lwp *l, const struct sys__lwp_detach_args *uap, register_t *retval) { /* { syscallarg(lwpid_t) target; } */ struct proc *p; struct lwp *t; lwpid_t target; int error; target = SCARG(uap, target); p = l->l_proc; mutex_enter(p->p_lock); if (l->l_lid == target) t = l; else { /* * We can't use lwp_find() here because the target might * be a zombie. */ t = proc_find_lwp(p, target); KASSERT(t == NULL || t->l_lid == target); } /* * If the LWP is already detached, there's nothing to do. * If it's a zombie, we need to clean up after it. LSZOMB * is visible with the proc mutex held. 
* * After we have detached or released the LWP, kick any * other LWPs that may be sitting in _lwp_wait(), waiting * for the target LWP to exit. */ if (t != NULL && t->l_stat != LSIDL) { if ((t->l_prflag & LPR_DETACHED) == 0) { p->p_ndlwps++; t->l_prflag |= LPR_DETACHED; if (t->l_stat == LSZOMB) { /* Releases proc mutex. */ lwp_free(t, false, false); return 0; } error = 0; /* * Have any LWPs sleeping in lwp_wait() recheck * for deadlock. */ cv_broadcast(&p->p_lwpcv); } else error = EINVAL; } else error = ESRCH; mutex_exit(p->p_lock); return error; } int lwp_unpark(const lwpid_t *tp, const u_int ntargets) { u_int target; kmutex_t *mp; int error, s; proc_t *p; lwp_t *t; p = curproc; error = 0; s = pserialize_read_enter(); for (target = 0; target < ntargets; target++) { t = proc_find_lwp_unlocked(p, tp[target]); if (__predict_false(t == NULL)) { error = ESRCH; continue; } KASSERT(lwp_locked(t, NULL)); if (__predict_true(t->l_syncobj == &lwp_park_syncobj)) { /* As expected it's parked, so wake it up. */ mp = t->l_mutex; sleepq_remove(NULL, t, true); mutex_spin_exit(mp); } else if (__predict_false(t->l_stat == LSZOMB)) { lwp_unlock(t); error = ESRCH; } else { /* * It hasn't parked yet because the wakeup side won * the race, or something else has happened to make * the thread not park. Why doesn't really matter. * Set the operation pending, so that the next call * to _lwp_park() in the LWP returns early. If it * turns out to be a spurious wakeup, no harm done. */ t->l_flag |= LW_UNPARKED; lwp_unlock(t); } } pserialize_read_exit(s); return error; } int lwp_park(clockid_t clock_id, int flags, struct timespec *ts) { int timo, error; struct timespec start; lwp_t *l; bool timeremain = !(flags & TIMER_ABSTIME) && ts; if (ts != NULL) { if ((error = ts2timo(clock_id, flags, ts, &timo, timeremain ? &start : NULL)) != 0) return error; KASSERT(timo != 0); } else { timo = 0; } /* * Before going the full route and blocking, check to see if an * unpark op is pending. */ l = curlwp; lwp_lock(l); if ((l->l_flag & (LW_CANCELLED | LW_UNPARKED)) != 0) { l->l_flag &= ~(LW_CANCELLED | LW_UNPARKED); lwp_unlock(l); return EALREADY; } sleepq_enqueue(NULL, l, "parked", &lwp_park_syncobj, true); error = sleepq_block(timo, true, &lwp_park_syncobj, 0); switch (error) { case EWOULDBLOCK: error = ETIMEDOUT; if (timeremain) memset(ts, 0, sizeof(*ts)); break; case ERESTART: error = EINTR; /*FALLTHROUGH*/ default: if (timeremain) clock_timeleft(clock_id, ts, &start); break; } return error; } /* * 'park' an LWP waiting on a user-level synchronisation object. The LWP * will remain parked until another LWP in the same process calls in and * requests that it be unparked. 
*/ int sys____lwp_park60(struct lwp *l, const struct sys____lwp_park60_args *uap, register_t *retval) { /* { syscallarg(clockid_t) clock_id; syscallarg(int) flags; syscallarg(struct timespec *) ts; syscallarg(lwpid_t) unpark; syscallarg(const void *) hint; syscallarg(const void *) unparkhint; } */ struct timespec ts, *tsp; int error; if (SCARG(uap, ts) == NULL) tsp = NULL; else { error = copyin(SCARG(uap, ts), &ts, sizeof(ts)); if (error != 0) return error; tsp = &ts; } if (SCARG(uap, unpark) != 0) { error = lwp_unpark(&SCARG(uap, unpark), 1); if (error != 0) return error; } error = lwp_park(SCARG(uap, clock_id), SCARG(uap, flags), tsp); if (SCARG(uap, ts) != NULL && (SCARG(uap, flags) & TIMER_ABSTIME) == 0) (void)copyout(tsp, SCARG(uap, ts), sizeof(*tsp)); return error; } int sys__lwp_unpark(struct lwp *l, const struct sys__lwp_unpark_args *uap, register_t *retval) { /* { syscallarg(lwpid_t) target; syscallarg(const void *) hint; } */ return lwp_unpark(&SCARG(uap, target), 1); } int sys__lwp_unpark_all(struct lwp *l, const struct sys__lwp_unpark_all_args *uap, register_t *retval) { /* { syscallarg(const lwpid_t *) targets; syscallarg(size_t) ntargets; syscallarg(const void *) hint; } */ lwpid_t targets[32], *tp; int error; u_int ntargets; size_t sz; ntargets = SCARG(uap, ntargets); if (SCARG(uap, targets) == NULL) { /* * Let the caller know how much we are willing to do, and * let it unpark the LWPs in blocks. */ *retval = LWP_UNPARK_MAX; return 0; } if (ntargets > LWP_UNPARK_MAX || ntargets == 0) return EINVAL; /* * Copy in the target array. If it's a small number of LWPs, then * place the numbers on the stack. */ sz = sizeof(lwpid_t) * ntargets; if (sz <= sizeof(targets)) tp = targets; else tp = kmem_alloc(sz, KM_SLEEP); error = copyin(SCARG(uap, targets), tp, sz); if (error != 0) { if (tp != targets) { kmem_free(tp, sz); } return error; } error = lwp_unpark(tp, ntargets); if (tp != targets) kmem_free(tp, sz); return error; } int sys__lwp_setname(struct lwp *l, const struct sys__lwp_setname_args *uap, register_t *retval) { /* { syscallarg(lwpid_t) target; syscallarg(const char *) name; } */ char *name, *oname; lwpid_t target; proc_t *p; lwp_t *t; int error; if ((target = SCARG(uap, target)) == 0) target = l->l_lid; name = kmem_alloc(MAXCOMLEN, KM_SLEEP); error = copyinstr(SCARG(uap, name), name, MAXCOMLEN, NULL); switch (error) { case ENAMETOOLONG: case 0: name[MAXCOMLEN - 1] = '\0'; break; default: kmem_free(name, MAXCOMLEN); return error; } p = curproc; mutex_enter(p->p_lock); if ((t = lwp_find(p, target)) == NULL) { mutex_exit(p->p_lock); kmem_free(name, MAXCOMLEN); return ESRCH; } lwp_lock(t); oname = t->l_name; t->l_name = name; lwp_unlock(t); mutex_exit(p->p_lock); if (oname != NULL) kmem_free(oname, MAXCOMLEN); return 0; } int sys__lwp_getname(struct lwp *l, const struct sys__lwp_getname_args *uap, register_t *retval) { /* { syscallarg(lwpid_t) target; syscallarg(char *) name; syscallarg(size_t) len; } */ char name[MAXCOMLEN]; lwpid_t target; size_t len; proc_t *p; lwp_t *t; if ((target = SCARG(uap, target)) == 0) target = l->l_lid; p = curproc; mutex_enter(p->p_lock); if ((t = lwp_find(p, target)) == NULL) { mutex_exit(p->p_lock); return ESRCH; } lwp_lock(t); if (t->l_name == NULL) name[0] = '\0'; else strlcpy(name, t->l_name, sizeof(name)); lwp_unlock(t); mutex_exit(p->p_lock); len = uimin(SCARG(uap, len), sizeof(name)); return copyoutstr(name, SCARG(uap, name), len, NULL); } int sys__lwp_ctl(struct lwp *l, const struct sys__lwp_ctl_args *uap, register_t *retval) { /* { 
syscallarg(int) features; syscallarg(struct lwpctl **) address; } */ int error, features; vaddr_t vaddr; features = SCARG(uap, features); features &= ~(LWPCTL_FEATURE_CURCPU | LWPCTL_FEATURE_PCTR); if (features != 0) return ENODEV; if ((error = lwp_ctl_alloc(&vaddr)) != 0) return error; return copyout(&vaddr, SCARG(uap, address), sizeof(void *)); }
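A small userland sketch touching a few of the LWP system calls above through their libc stubs. The <lwp.h> prototypes of _lwp_self(), _lwp_setname() and _lwp_getname() are assumptions here; a target of 0 names the calling LWP, as the target == 0 handling in the kernel code above shows.

/* Userland sketch: name the calling LWP and read the name back. */
#include <lwp.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	char name[32];
	lwpid_t self = _lwp_self();

	if (_lwp_setname(0, "worker") == -1)	/* 0 == calling LWP */
		err(1, "_lwp_setname");
	if (_lwp_getname(0, name, sizeof(name)) == -1)
		err(1, "_lwp_getname");

	printf("lwp %d is named \"%s\"\n", (int)self, name);
	return 0;
}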
/* $NetBSD: syscallvar.h,v 1.12 2018/04/19 21:19:07 christos Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software developed for The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _SYS_SYSCALLVAR_H_ #define _SYS_SYSCALLVAR_H_ #ifndef _KERNEL #error nothing of interest to userspace here #endif #if defined(_KERNEL) && defined(_KERNEL_OPT) #include "opt_dtrace.h" #endif #include <sys/systm.h> #include <sys/proc.h> extern struct emul emul_netbsd; struct syscall_package { u_short sp_code; u_short sp_flags; sy_call_t *sp_call; }; void syscall_init(void); int syscall_establish(const struct emul *, const struct syscall_package *); int syscall_disestablish(const struct emul *, const struct syscall_package *); static __inline int sy_call(const struct sysent *sy, struct lwp *l, const void *uap, register_t *rval) { int error; l->l_sysent = sy; error = (*sy->sy_call)(l, uap, rval); l->l_sysent = NULL; return error; } static __inline int sy_invoke(const struct sysent *sy, struct lwp *l, const void *uap, register_t *rval, int code) { const bool do_trace = l->l_proc->p_trace_enabled && (sy->sy_flags & SYCALL_INDIRECT) == 0; int error; #ifdef KDTRACE_HOOKS #define KDTRACE_ENTRY(a) (a) #else #define KDTRACE_ENTRY(a) (0) #endif if (__predict_true(!(do_trace || KDTRACE_ENTRY(sy->sy_entry))) || (error = trace_enter(code, sy, uap)) == 0) { rval[0] = 0; #if !defined(__mips__) && !defined(__m68k__) /* * Due to the mips userland code for SYS_break needing v1 to be * preserved, we can't clear this on mips.
*/ rval[1] = 0; #endif error = sy_call(sy, l, uap, rval); } if (__predict_false(do_trace || KDTRACE_ENTRY(sy->sy_return))) { trace_exit(code, sy, uap, rval, error); } return error; } /* inclusion in the kernel currently depends on SYSCALL_DEBUG */ extern const char * const syscallnames[]; extern const char * const altsyscallnames[]; #endif /* _SYS_SYSCALLVAR_H_ */
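A sketch of how a kernel module might use the syscall_package interface declared above. SYS_mysyscall, the handler and the module hooks are hypothetical; the all-zero terminating entry and the use of a NULL emul to mean the native emulation are assumptions about syscall_establish()'s contract, not guarantees made by this header.

/* Kernel-side sketch (hypothetical names): register one dynamic syscall. */
#include <sys/types.h>
#include <sys/syscallvar.h>

#define	SYS_mysyscall	500	/* hypothetical syscall number */

static int
sys_mysyscall(struct lwp *l, const void *uap, register_t *retval)
{
	*retval = 0;		/* no-op placeholder handler */
	return 0;
}

static const struct syscall_package mysyscall_package[] = {
	{ SYS_mysyscall, 0, (sy_call_t *)sys_mysyscall },
	{ 0, 0, NULL },		/* assumed sentinel entry */
};

/* At module load: hook the handler into the native syscall table. */
static int
mymod_establish(void)
{
	return syscall_establish(NULL, mysyscall_package);
}

/* At module unload: remove it again; may fail if the call is in use. */
static int
mymod_disestablish(void)
{
	return syscall_disestablish(NULL, mysyscall_package);
}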
/* $NetBSD: raw_cb.c,v 1.24 2017/09/25 01:56:22 ozaki-r Exp $ */ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_cb.c 8.1 (Berkeley) 6/10/93 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: raw_cb.c,v 1.24 2017/09/25 01:56:22 ozaki-r Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/kmem.h> #include <net/if.h> #include <net/route.h> #include <net/raw_cb.h> #include <netinet/in.h> /* * Routines to manage the raw protocol control blocks. * * TODO: * hash lookups by protocol family/protocol + address family * take care of unique address problems per AF? * redo address binding to allow wildcards */ static u_long raw_sendspace = RAWSNDQ; static u_long raw_recvspace = RAWRCVQ; /* * Allocate a nominal amount of buffer space for the socket. */ int raw_attach(struct socket *so, int proto, struct rawcbhead *rawcbhead) { struct rawcb *rp; int error; /* * It is assumed that raw_attach() is called after space has been * allocated for the rawcb; consumer protocols may simply allocate * type struct rawcb, or a wrapper data structure that begins with a * struct rawcb.
*/ rp = sotorawcb(so); KASSERT(rp != NULL); sosetlock(so); if ((error = soreserve(so, raw_sendspace, raw_recvspace)) != 0) { return error; } rp->rcb_socket = so; rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family; rp->rcb_proto.sp_protocol = proto; LIST_INSERT_HEAD(rawcbhead, rp, rcb_list); KASSERT(solocked(so)); return 0; } /* * Detach the raw connection block and discard socket resources. */ void raw_detach(struct socket *so) { struct rawcb *rp = sotorawcb(so); const size_t rcb_len = rp->rcb_len; KASSERT(rp != NULL); KASSERT(solocked(so)); /* Remove the last reference. */ LIST_REMOVE(rp, rcb_list); so->so_pcb = NULL; /* Note: sofree() drops the socket's lock. */ sofree(so); kmem_free(rp, rcb_len); if (so->so_lock != softnet_lock) { so->so_lock = softnet_lock; mutex_obj_hold(softnet_lock); } mutex_enter(softnet_lock); } /* * Disconnect and possibly release resources. */ void raw_disconnect(struct rawcb *rp) { struct socket *so = rp->rcb_socket; if (so->so_state & SS_NOFDREF) { raw_detach(so); } }
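The raw_attach() comment above allows a consumer protocol to wrap struct rawcb in a larger PCB, provided the rawcb comes first and rcb_len records the full allocation so raw_detach() frees the right size. A minimal sketch of that layout follows; the wrapper name, the extra field and the attach routine are illustrative only, not code from the tree.

/* Sketch: a consumer PCB that embeds struct rawcb as its first member. */
#include <sys/kmem.h>
#include <sys/queue.h>
#include <sys/socketvar.h>
#include <net/raw_cb.h>

struct myproto_rawcb {
	struct rawcb	mrc_rawcb;	/* must come first: sotorawcb() */
	int		mrc_flags;	/* hypothetical protocol-private state */
};

static struct rawcbhead myproto_rawcbhead =
    LIST_HEAD_INITIALIZER(myproto_rawcbhead);

static int
myproto_attach(struct socket *so, int proto)
{
	struct myproto_rawcb *mrc;

	/* Allocate the wrapper and record its full size for raw_detach(). */
	mrc = kmem_zalloc(sizeof(*mrc), KM_SLEEP);
	mrc->mrc_rawcb.rcb_len = sizeof(*mrc);
	so->so_pcb = mrc;
	return raw_attach(so, proto, &myproto_rawcbhead);
}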
/* $NetBSD: subr_devsw.c,v 1.51 2023/02/15 13:12:45 riastradh Exp $ */ /*- * Copyright (c) 2001, 2002, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by MAEKAWA Masahide <gehenna@NetBSD.org>, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Overview * * subr_devsw.c: registers device drivers by name and by major * number, and provides wrapper methods for performing I/O and * other tasks on device drivers, keying on the device number * (dev_t). * * When the system is built, the config(8) command generates * static tables of device drivers built into the kernel image * along with their associated methods. These are recorded in * the cdevsw0 and bdevsw0 tables. Drivers can also be added to * and removed from the system dynamically. * * Allocation * * When the system initially boots only the statically allocated * indexes (bdevsw0, cdevsw0) are used. If these overflow due to * allocation, we allocate a fixed block of memory to hold the new, * expanded index. This "fork" of the table is only ever performed * once in order to guarantee that other threads may safely access * the device tables: * * o Once a thread has a "reference" to the table via an earlier * open() call, we know that the entry in the table must exist * and so it is safe to access it. * * o Regardless of whether other threads see the old or new * pointers, they will point to a correct device switch * structure for the operation being performed. * * XXX Currently, the wrapper methods such as cdev_read() verify * that a device driver does in fact exist before calling the * associated driver method.
This should be changed so that * once the device is has been referenced by a vnode (opened), * calling the other methods should be valid until that reference * is dropped. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.51 2023/02/15 13:12:45 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_dtrace.h" #endif #include <sys/param.h> #include <sys/conf.h> #include <sys/kmem.h> #include <sys/systm.h> #include <sys/poll.h> #include <sys/tty.h> #include <sys/cpu.h> #include <sys/buf.h> #include <sys/reboot.h> #include <sys/sdt.h> #include <sys/atomic.h> #include <sys/localcount.h> #include <sys/pserialize.h> #include <sys/xcall.h> #include <sys/device.h> #ifdef DEVSW_DEBUG #define DPRINTF(x) printf x #else /* DEVSW_DEBUG */ #define DPRINTF(x) #endif /* DEVSW_DEBUG */ #define MAXDEVSW 512 /* the maximum of major device number */ #define BDEVSW_SIZE (sizeof(struct bdevsw *)) #define CDEVSW_SIZE (sizeof(struct cdevsw *)) #define DEVSWCONV_SIZE (sizeof(struct devsw_conv)) struct devswref { struct localcount *dr_lc; }; /* XXX bdevsw, cdevsw, max_bdevsws, and max_cdevsws should be volatile */ extern const struct bdevsw **bdevsw, *bdevsw0[]; extern const struct cdevsw **cdevsw, *cdevsw0[]; extern struct devsw_conv *devsw_conv, devsw_conv0[]; extern const int sys_bdevsws, sys_cdevsws; extern int max_bdevsws, max_cdevsws, max_devsw_convs; static struct devswref *cdevswref; static struct devswref *bdevswref; static kcondvar_t devsw_cv; static int bdevsw_attach(const struct bdevsw *, devmajor_t *); static int cdevsw_attach(const struct cdevsw *, devmajor_t *); static void devsw_detach_locked(const struct bdevsw *, const struct cdevsw *); kmutex_t device_lock; void (*biodone_vfs)(buf_t *) = (void *)nullop; /* * bdev probes */ SDT_PROBE_DEFINE6(sdt, bdev, open, acquire, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/, "int"/*unit*/, "device_t"/*dv*/); SDT_PROBE_DEFINE4(sdt, bdev, open, entry, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/); SDT_PROBE_DEFINE5(sdt, bdev, open, return, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/, "int"/*error*/); SDT_PROBE_DEFINE6(sdt, bdev, open, release, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/, "int"/*unit*/, "device_t"/*dv*/); SDT_PROBE_DEFINE4(sdt, bdev, cancel, entry, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/); SDT_PROBE_DEFINE5(sdt, bdev, cancel, return, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/, "int"/*error*/); SDT_PROBE_DEFINE4(sdt, bdev, close, entry, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/); SDT_PROBE_DEFINE5(sdt, bdev, close, return, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/, "int"/*error*/); SDT_PROBE_DEFINE3(sdt, bdev, strategy, entry, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "struct buf *"/*bp*/); SDT_PROBE_DEFINE3(sdt, bdev, strategy, return, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "struct buf *"/*bp*/); SDT_PROBE_DEFINE5(sdt, bdev, ioctl, entry, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "unsigned long"/*cmd*/, "void *"/*data*/, "int"/*flag*/); SDT_PROBE_DEFINE6(sdt, bdev, ioctl, return, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "unsigned long"/*cmd*/, "void *"/*data*/, "int"/*flag*/, "int"/*error*/); SDT_PROBE_DEFINE2(sdt, bdev, psize, entry, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/); SDT_PROBE_DEFINE3(sdt, bdev, psize, return, "struct bdevsw 
*"/*bdevsw*/, "dev_t"/*dev*/, "int"/*psize*/); SDT_PROBE_DEFINE4(sdt, bdev, discard, entry, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "off_t"/*pos*/, "off_t"/*len*/); SDT_PROBE_DEFINE5(sdt, bdev, discard, return, "struct bdevsw *"/*bdevsw*/, "dev_t"/*dev*/, "off_t"/*pos*/, "off_t"/*len*/, "int"/*error*/); /* * cdev probes */ SDT_PROBE_DEFINE6(sdt, cdev, open, acquire, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/, "int"/*unit*/, "device_t"/*dv*/); SDT_PROBE_DEFINE4(sdt, cdev, open, entry, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/); SDT_PROBE_DEFINE5(sdt, cdev, open, return, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/, "int"/*error*/); SDT_PROBE_DEFINE6(sdt, cdev, open, release, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/, "int"/*unit*/, "device_t"/*dv*/); SDT_PROBE_DEFINE4(sdt, cdev, cancel, entry, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/); SDT_PROBE_DEFINE5(sdt, cdev, cancel, return, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/, "int"/*error*/); SDT_PROBE_DEFINE4(sdt, cdev, close, entry, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/); SDT_PROBE_DEFINE5(sdt, cdev, close, return, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "int"/*flag*/, "int"/*devtype*/, "int"/*error*/); SDT_PROBE_DEFINE4(sdt, cdev, read, entry, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "struct uio *"/*uio*/, "int"/*flag*/); SDT_PROBE_DEFINE5(sdt, cdev, read, return, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "struct uio *"/*uio*/, "int"/*flag*/, "int"/*error*/); SDT_PROBE_DEFINE4(sdt, cdev, write, entry, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "struct uio *"/*uio*/, "int"/*flag*/); SDT_PROBE_DEFINE5(sdt, cdev, write, return, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "struct uio *"/*uio*/, "int"/*flag*/, "int"/*error*/); SDT_PROBE_DEFINE5(sdt, cdev, ioctl, entry, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "unsigned long"/*cmd*/, "void *"/*data*/, "int"/*flag*/); SDT_PROBE_DEFINE6(sdt, cdev, ioctl, return, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "unsigned long"/*cmd*/, "void *"/*data*/, "int"/*flag*/, "int"/*error*/); SDT_PROBE_DEFINE4(sdt, cdev, stop, entry, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "struct tty *"/*tp*/, "int"/*flag*/); SDT_PROBE_DEFINE4(sdt, cdev, stop, return, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "struct tty *"/*tp*/, "int"/*flag*/); SDT_PROBE_DEFINE3(sdt, cdev, poll, entry, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "int"/*events*/); SDT_PROBE_DEFINE4(sdt, cdev, poll, return, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "int"/*events*/, "int"/*revents*/); SDT_PROBE_DEFINE4(sdt, cdev, mmap, entry, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "off_t"/*off*/, "int"/*flag*/); SDT_PROBE_DEFINE5(sdt, cdev, mmap, return, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "off_t"/*off*/, "int"/*flag*/, "paddr_t"/*mmapcookie*/); SDT_PROBE_DEFINE3(sdt, cdev, kqfilter, entry, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "struct knote *"/*kn*/); SDT_PROBE_DEFINE4(sdt, cdev, kqfilter, return, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "struct knote *"/*kn*/, "int"/*error*/); SDT_PROBE_DEFINE4(sdt, cdev, discard, entry, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "off_t"/*pos*/, "off_t"/*len*/); SDT_PROBE_DEFINE5(sdt, cdev, discard, return, "struct cdevsw *"/*cdevsw*/, "dev_t"/*dev*/, "off_t"/*pos*/, "off_t"/*len*/, "int"/*error*/); void devsw_init(void) { 
KASSERT(sys_bdevsws < MAXDEVSW - 1); KASSERT(sys_cdevsws < MAXDEVSW - 1); mutex_init(&device_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&devsw_cv, "devsw"); } int devsw_attach(const char *devname, const struct bdevsw *bdev, devmajor_t *bmajor, const struct cdevsw *cdev, devmajor_t *cmajor) { struct devsw_conv *conv; char *name; int error, i; if (devname == NULL || cdev == NULL) return EINVAL; mutex_enter(&device_lock); for (i = 0; i < max_devsw_convs; i++) { conv = &devsw_conv[i]; if (conv->d_name == NULL || strcmp(devname, conv->d_name) != 0) continue; if ((bdev != NULL) && (*bmajor < 0)) *bmajor = conv->d_bmajor; if (*cmajor < 0) *cmajor = conv->d_cmajor; if (*bmajor != conv->d_bmajor || *cmajor != conv->d_cmajor) { error = EINVAL; goto out; } if ((*bmajor >= 0 && bdev == NULL) || *cmajor < 0) { error = EINVAL; goto out; } if ((*bmajor >= 0 && bdevsw[*bmajor] != NULL) || cdevsw[*cmajor] != NULL) { error = EEXIST; goto out; } break; } /* * XXX This should allocate what it needs up front so we never * need to flail around trying to unwind. */ error = bdevsw_attach(bdev, bmajor); if (error != 0) goto out; error = cdevsw_attach(cdev, cmajor); if (error != 0) { devsw_detach_locked(bdev, NULL); goto out; } /* * If we already found a conv, we're done. Otherwise, find an * empty slot or extend the table. */ if (i < max_devsw_convs) { error = 0; goto out; } for (i = 0; i < max_devsw_convs; i++) { if (devsw_conv[i].d_name == NULL) break; } if (i == max_devsw_convs) { struct devsw_conv *newptr; int old_convs, new_convs; old_convs = max_devsw_convs; new_convs = old_convs + 1; newptr = kmem_zalloc(new_convs * DEVSWCONV_SIZE, KM_NOSLEEP); if (newptr == NULL) { devsw_detach_locked(bdev, cdev); error = ENOMEM; goto out; } newptr[old_convs].d_name = NULL; newptr[old_convs].d_bmajor = -1; newptr[old_convs].d_cmajor = -1; memcpy(newptr, devsw_conv, old_convs * DEVSWCONV_SIZE); if (devsw_conv != devsw_conv0) kmem_free(devsw_conv, old_convs * DEVSWCONV_SIZE); devsw_conv = newptr; max_devsw_convs = new_convs; } name = kmem_strdupsize(devname, NULL, KM_NOSLEEP); if (name == NULL) { devsw_detach_locked(bdev, cdev); error = ENOMEM; goto out; } devsw_conv[i].d_name = name; devsw_conv[i].d_bmajor = *bmajor; devsw_conv[i].d_cmajor = *cmajor; error = 0; out: mutex_exit(&device_lock); return error; } static int bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) { const struct bdevsw **newbdevsw = NULL; struct devswref *newbdevswref = NULL; struct localcount *lc; devmajor_t bmajor; int i; KASSERT(mutex_owned(&device_lock)); if (devsw == NULL) return 0; if (*devmajor < 0) { for (bmajor = sys_bdevsws; bmajor < max_bdevsws; bmajor++) { if (bdevsw[bmajor] != NULL) continue; for (i = 0; i < max_devsw_convs; i++) { if (devsw_conv[i].d_bmajor == bmajor) break; } if (i != max_devsw_convs) continue; break; } *devmajor = bmajor; } if (*devmajor >= MAXDEVSW) { printf("%s: block majors exhausted\n", __func__); return ENOMEM; } if (bdevswref == NULL) { newbdevswref = kmem_zalloc(MAXDEVSW * sizeof(newbdevswref[0]), KM_NOSLEEP); if (newbdevswref == NULL) return ENOMEM; atomic_store_release(&bdevswref, newbdevswref); } if (*devmajor >= max_bdevsws) { KASSERT(bdevsw == bdevsw0); newbdevsw = kmem_zalloc(MAXDEVSW * sizeof(newbdevsw[0]), KM_NOSLEEP); if (newbdevsw == NULL) return ENOMEM; memcpy(newbdevsw, bdevsw, max_bdevsws * sizeof(bdevsw[0])); atomic_store_release(&bdevsw, newbdevsw); atomic_store_release(&max_bdevsws, MAXDEVSW); } if (bdevsw[*devmajor] != NULL) return EEXIST; KASSERT(bdevswref[*devmajor].dr_lc == NULL); lc 
= kmem_zalloc(sizeof(*lc), KM_SLEEP); localcount_init(lc); bdevswref[*devmajor].dr_lc = lc; atomic_store_release(&bdevsw[*devmajor], devsw); return 0; } static int cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) { const struct cdevsw **newcdevsw = NULL; struct devswref *newcdevswref = NULL; struct localcount *lc; devmajor_t cmajor; int i; KASSERT(mutex_owned(&device_lock)); if (*devmajor < 0) { for (cmajor = sys_cdevsws; cmajor < max_cdevsws; cmajor++) { if (cdevsw[cmajor] != NULL) continue; for (i = 0; i < max_devsw_convs; i++) { if (devsw_conv[i].d_cmajor == cmajor) break; } if (i != max_devsw_convs) continue; break; } *devmajor = cmajor; } if (*devmajor >= MAXDEVSW) { printf("%s: character majors exhausted\n", __func__); return ENOMEM; } if (cdevswref == NULL) { newcdevswref = kmem_zalloc(MAXDEVSW * sizeof(newcdevswref[0]), KM_NOSLEEP); if (newcdevswref == NULL) return ENOMEM; atomic_store_release(&cdevswref, newcdevswref); } if (*devmajor >= max_cdevsws) { KASSERT(cdevsw == cdevsw0); newcdevsw = kmem_zalloc(MAXDEVSW * sizeof(newcdevsw[0]), KM_NOSLEEP); if (newcdevsw == NULL) return ENOMEM; memcpy(newcdevsw, cdevsw, max_cdevsws * sizeof(cdevsw[0])); atomic_store_release(&cdevsw, newcdevsw); atomic_store_release(&max_cdevsws, MAXDEVSW); } if (cdevsw[*devmajor] != NULL) return EEXIST; KASSERT(cdevswref[*devmajor].dr_lc == NULL); lc = kmem_zalloc(sizeof(*lc), KM_SLEEP); localcount_init(lc); cdevswref[*devmajor].dr_lc = lc; atomic_store_release(&cdevsw[*devmajor], devsw); return 0; } static void devsw_detach_locked(const struct bdevsw *bdev, const struct cdevsw *cdev) { int bi, ci = -1/*XXXGCC*/, di; struct cfdriver *cd; device_t dv; KASSERT(mutex_owned(&device_lock)); /* * If this is wired to an autoconf device, make sure the device * has no more instances. No locking here because under * correct use of devsw_detach, none of this state can change * at this point. */ if (cdev != NULL && (cd = cdev->d_cfdriver) != NULL) { for (di = 0; di < cd->cd_ndevs; di++) { KASSERTMSG((dv = cd->cd_devs[di]) == NULL, "detaching character device driver %s" " still has attached unit %s", cd->cd_name, device_xname(dv)); } } if (bdev != NULL && (cd = bdev->d_cfdriver) != NULL) { for (di = 0; di < cd->cd_ndevs; di++) { KASSERTMSG((dv = cd->cd_devs[di]) == NULL, "detaching block device driver %s" " still has attached unit %s", cd->cd_name, device_xname(dv)); } } /* Prevent new references. */ if (bdev != NULL) { for (bi = 0; bi < max_bdevsws; bi++) { if (bdevsw[bi] != bdev) continue; atomic_store_relaxed(&bdevsw[bi], NULL); break; } KASSERT(bi < max_bdevsws); } if (cdev != NULL) { for (ci = 0; ci < max_cdevsws; ci++) { if (cdevsw[ci] != cdev) continue; atomic_store_relaxed(&cdevsw[ci], NULL); break; } KASSERT(ci < max_cdevsws); } if (bdev == NULL && cdev == NULL) /* XXX possible? */ return; /* * Wait for all bdevsw_lookup_acquire, cdevsw_lookup_acquire * calls to notice that the devsw is gone. * * XXX Despite the use of the pserialize_read_enter/exit API * elsewhere in this file, we use xc_barrier here instead of * pserialize_perform -- because devsw_init is too early for * pserialize_create. Either pserialize_create should be made * to work earlier, or it should be nixed altogether. Until * that is fixed, xc_barrier will serve the same purpose. */ xc_barrier(0); /* * Wait for all references to drain. It is the caller's * responsibility to ensure that at this point, there are no * extant open instances and all new d_open calls will fail. 
* * Note that localcount_drain may release and reacquire * device_lock. */ if (bdev != NULL) { localcount_drain(bdevswref[bi].dr_lc, &devsw_cv, &device_lock); localcount_fini(bdevswref[bi].dr_lc); kmem_free(bdevswref[bi].dr_lc, sizeof(*bdevswref[bi].dr_lc)); bdevswref[bi].dr_lc = NULL; } if (cdev != NULL) { localcount_drain(cdevswref[ci].dr_lc, &devsw_cv, &device_lock); localcount_fini(cdevswref[ci].dr_lc); kmem_free(cdevswref[ci].dr_lc, sizeof(*cdevswref[ci].dr_lc)); cdevswref[ci].dr_lc = NULL; } } void devsw_detach(const struct bdevsw *bdev, const struct cdevsw *cdev) { mutex_enter(&device_lock); devsw_detach_locked(bdev, cdev); mutex_exit(&device_lock); } /* * Look up a block device by number. * * => Caller must ensure that the device is attached. */ const struct bdevsw * bdevsw_lookup(dev_t dev) { devmajor_t bmajor; if (dev == NODEV) return NULL; bmajor = major(dev); if (bmajor < 0 || bmajor >= atomic_load_relaxed(&max_bdevsws)) return NULL; return atomic_load_consume(&bdevsw)[bmajor]; } static const struct bdevsw * bdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) { devmajor_t bmajor; const struct bdevsw *bdev = NULL, *const *curbdevsw; struct devswref *curbdevswref; int s; if (dev == NODEV) return NULL; bmajor = major(dev); if (bmajor < 0) return NULL; s = pserialize_read_enter(); /* * max_bdevsws never goes down, so it is safe to rely on this * condition without any locking for the array access below. * Test sys_bdevsws first so we can avoid the memory barrier in * that case. */ if (bmajor >= sys_bdevsws && bmajor >= atomic_load_acquire(&max_bdevsws)) goto out; curbdevsw = atomic_load_consume(&bdevsw); if ((bdev = atomic_load_consume(&curbdevsw[bmajor])) == NULL) goto out; curbdevswref = atomic_load_consume(&bdevswref); if (curbdevswref == NULL) { *lcp = NULL; } else if ((*lcp = curbdevswref[bmajor].dr_lc) != NULL) { localcount_acquire(*lcp); } out: pserialize_read_exit(s); return bdev; } static void bdevsw_release(const struct bdevsw *bdev, struct localcount *lc) { if (lc == NULL) return; localcount_release(lc, &devsw_cv, &device_lock); } /* * Look up a character device by number. * * => Caller must ensure that the device is attached. */ const struct cdevsw * cdevsw_lookup(dev_t dev) { devmajor_t cmajor; if (dev == NODEV) return NULL; cmajor = major(dev); if (cmajor < 0 || cmajor >= atomic_load_relaxed(&max_cdevsws)) return NULL; return atomic_load_consume(&cdevsw)[cmajor]; } static const struct cdevsw * cdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) { devmajor_t cmajor; const struct cdevsw *cdev = NULL, *const *curcdevsw; struct devswref *curcdevswref; int s; if (dev == NODEV) return NULL; cmajor = major(dev); if (cmajor < 0) return NULL; s = pserialize_read_enter(); /* * max_cdevsws never goes down, so it is safe to rely on this * condition without any locking for the array access below. * Test sys_cdevsws first so we can avoid the memory barrier in * that case. 
*/ if (cmajor >= sys_cdevsws && cmajor >= atomic_load_acquire(&max_cdevsws)) goto out; curcdevsw = atomic_load_consume(&cdevsw); if ((cdev = atomic_load_consume(&curcdevsw[cmajor])) == NULL) goto out; curcdevswref = atomic_load_consume(&cdevswref); if (curcdevswref == NULL) { *lcp = NULL; } else if ((*lcp = curcdevswref[cmajor].dr_lc) != NULL) { localcount_acquire(*lcp); } out: pserialize_read_exit(s); return cdev; } static void cdevsw_release(const struct cdevsw *cdev, struct localcount *lc) { if (lc == NULL) return; localcount_release(lc, &devsw_cv, &device_lock); } /* * Look up a block device by reference to its operations set. * * => Caller must ensure that the device is not detached, and therefore * that the returned major is still valid when dereferenced. */ devmajor_t bdevsw_lookup_major(const struct bdevsw *bdev) { const struct bdevsw *const *curbdevsw; devmajor_t bmajor, bmax; bmax = atomic_load_acquire(&max_bdevsws); curbdevsw = atomic_load_consume(&bdevsw); for (bmajor = 0; bmajor < bmax; bmajor++) { if (atomic_load_relaxed(&curbdevsw[bmajor]) == bdev) return bmajor; } return NODEVMAJOR; } /* * Look up a character device by reference to its operations set. * * => Caller must ensure that the device is not detached, and therefore * that the returned major is still valid when dereferenced. */ devmajor_t cdevsw_lookup_major(const struct cdevsw *cdev) { const struct cdevsw *const *curcdevsw; devmajor_t cmajor, cmax; cmax = atomic_load_acquire(&max_cdevsws); curcdevsw = atomic_load_consume(&cdevsw); for (cmajor = 0; cmajor < cmax; cmajor++) { if (atomic_load_relaxed(&curcdevsw[cmajor]) == cdev) return cmajor; } return NODEVMAJOR; } /* * Convert from block major number to name. * * => Caller must ensure that the device is not detached, and therefore * that the name pointer is still valid when dereferenced. */ const char * devsw_blk2name(devmajor_t bmajor) { const char *name; devmajor_t cmajor; int i; name = NULL; cmajor = -1; mutex_enter(&device_lock); if (bmajor < 0 || bmajor >= max_bdevsws || bdevsw[bmajor] == NULL) { mutex_exit(&device_lock); return NULL; } for (i = 0; i < max_devsw_convs; i++) { if (devsw_conv[i].d_bmajor == bmajor) { cmajor = devsw_conv[i].d_cmajor; break; } } if (cmajor >= 0 && cmajor < max_cdevsws && cdevsw[cmajor] != NULL) name = devsw_conv[i].d_name; mutex_exit(&device_lock); return name; } /* * Convert char major number to device driver name. */ const char * cdevsw_getname(devmajor_t major) { const char *name; int i; name = NULL; if (major < 0) return NULL; mutex_enter(&device_lock); for (i = 0; i < max_devsw_convs; i++) { if (devsw_conv[i].d_cmajor == major) { name = devsw_conv[i].d_name; break; } } mutex_exit(&device_lock); return name; } /* * Convert block major number to device driver name. */ const char * bdevsw_getname(devmajor_t major) { const char *name; int i; name = NULL; if (major < 0) return NULL; mutex_enter(&device_lock); for (i = 0; i < max_devsw_convs; i++) { if (devsw_conv[i].d_bmajor == major) { name = devsw_conv[i].d_name; break; } } mutex_exit(&device_lock); return name; } /* * Convert from device name to block major number. * * => Caller must ensure that the device is not detached, and therefore * that the major number is still valid when dereferenced. 
*/ devmajor_t devsw_name2blk(const char *name, char *devname, size_t devnamelen) { struct devsw_conv *conv; devmajor_t bmajor; int i; if (name == NULL) return NODEVMAJOR; mutex_enter(&device_lock); for (i = 0; i < max_devsw_convs; i++) { size_t len; conv = &devsw_conv[i]; if (conv->d_name == NULL) continue; len = strlen(conv->d_name); if (strncmp(conv->d_name, name, len) != 0) continue; if (name[len] != '\0' && !isdigit((unsigned char)name[len])) continue; bmajor = conv->d_bmajor; if (bmajor < 0 || bmajor >= max_bdevsws || bdevsw[bmajor] == NULL) break; if (devname != NULL) { #ifdef DEVSW_DEBUG if (strlen(conv->d_name) >= devnamelen) printf("%s: too short buffer\n", __func__); #endif /* DEVSW_DEBUG */ strncpy(devname, conv->d_name, devnamelen); devname[devnamelen - 1] = '\0'; } mutex_exit(&device_lock); return bmajor; } mutex_exit(&device_lock); return NODEVMAJOR; } /* * Convert from device name to char major number. * * => Caller must ensure that the device is not detached, and therefore * that the major number is still valid when dereferenced. */ devmajor_t devsw_name2chr(const char *name, char *devname, size_t devnamelen) { struct devsw_conv *conv; devmajor_t cmajor; int i; if (name == NULL) return NODEVMAJOR; mutex_enter(&device_lock); for (i = 0; i < max_devsw_convs; i++) { size_t len; conv = &devsw_conv[i]; if (conv->d_name == NULL) continue; len = strlen(conv->d_name); if (strncmp(conv->d_name, name, len) != 0) continue; if (name[len] != '\0' && !isdigit((unsigned char)name[len])) continue; cmajor = conv->d_cmajor; if (cmajor < 0 || cmajor >= max_cdevsws || cdevsw[cmajor] == NULL) break; if (devname != NULL) { #ifdef DEVSW_DEBUG if (strlen(conv->d_name) >= devnamelen) printf("%s: too short buffer", __func__); #endif /* DEVSW_DEBUG */ strncpy(devname, conv->d_name, devnamelen); devname[devnamelen - 1] = '\0'; } mutex_exit(&device_lock); return cmajor; } mutex_exit(&device_lock); return NODEVMAJOR; } /* * Convert from character dev_t to block dev_t. * * => Caller must ensure that the device is not detached, and therefore * that the major number is still valid when dereferenced. */ dev_t devsw_chr2blk(dev_t cdev) { devmajor_t bmajor, cmajor; int i; dev_t rv; cmajor = major(cdev); bmajor = NODEVMAJOR; rv = NODEV; mutex_enter(&device_lock); if (cmajor < 0 || cmajor >= max_cdevsws || cdevsw[cmajor] == NULL) { mutex_exit(&device_lock); return NODEV; } for (i = 0; i < max_devsw_convs; i++) { if (devsw_conv[i].d_cmajor == cmajor) { bmajor = devsw_conv[i].d_bmajor; break; } } if (bmajor >= 0 && bmajor < max_bdevsws && bdevsw[bmajor] != NULL) rv = makedev(bmajor, minor(cdev)); mutex_exit(&device_lock); return rv; } /* * Convert from block dev_t to character dev_t. * * => Caller must ensure that the device is not detached, and therefore * that the major number is still valid when dereferenced. */ dev_t devsw_blk2chr(dev_t bdev) { devmajor_t bmajor, cmajor; int i; dev_t rv; bmajor = major(bdev); cmajor = NODEVMAJOR; rv = NODEV; mutex_enter(&device_lock); if (bmajor < 0 || bmajor >= max_bdevsws || bdevsw[bmajor] == NULL) { mutex_exit(&device_lock); return NODEV; } for (i = 0; i < max_devsw_convs; i++) { if (devsw_conv[i].d_bmajor == bmajor) { cmajor = devsw_conv[i].d_cmajor; break; } } if (cmajor >= 0 && cmajor < max_cdevsws && cdevsw[cmajor] != NULL) rv = makedev(cmajor, minor(bdev)); mutex_exit(&device_lock); return rv; } /* * Device access methods. 
*/ #define DEV_LOCK(d) \ if ((mpflag = (d->d_flag & D_MPSAFE)) == 0) { \ KERNEL_LOCK(1, NULL); \ } #define DEV_UNLOCK(d) \ if (mpflag == 0) { \ KERNEL_UNLOCK_ONE(NULL); \ } int bdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct bdevsw *d; struct localcount *lc; device_t dv = NULL/*XXXGCC*/; int unit = -1/*XXXGCC*/, rv, mpflag; d = bdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; if (d->d_devtounit) { /* * If the device node corresponds to an autoconf device * instance, acquire a reference to it so that during * d_open, device_lookup is stable. * * XXX This should also arrange to instantiate cloning * pseudo-devices if appropriate, but that requires * reviewing them all to find and verify a common * pattern. */ if ((unit = (*d->d_devtounit)(dev)) == -1) return ENXIO; if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) return ENXIO; SDT_PROBE6(sdt, bdev, open, acquire, d, dev, flag, devtype, unit, dv); } DEV_LOCK(d); SDT_PROBE4(sdt, bdev, open, entry, d, dev, flag, devtype); rv = (*d->d_open)(dev, flag, devtype, l); SDT_PROBE5(sdt, bdev, open, return, d, dev, flag, devtype, rv); DEV_UNLOCK(d); if (d->d_devtounit) { SDT_PROBE6(sdt, bdev, open, release, d, dev, flag, devtype, unit, dv); device_release(dv); } bdevsw_release(d, lc); return rv; } int bdev_cancel(dev_t dev, int flag, int devtype, struct lwp *l) { const struct bdevsw *d; int rv, mpflag; if ((d = bdevsw_lookup(dev)) == NULL) return ENXIO; if (d->d_cancel == NULL) return ENODEV; DEV_LOCK(d); SDT_PROBE4(sdt, bdev, cancel, entry, d, dev, flag, devtype); rv = (*d->d_cancel)(dev, flag, devtype, l); SDT_PROBE5(sdt, bdev, cancel, return, d, dev, flag, devtype, rv); DEV_UNLOCK(d); return rv; } int bdev_close(dev_t dev, int flag, int devtype, lwp_t *l) { const struct bdevsw *d; int rv, mpflag; if ((d = bdevsw_lookup(dev)) == NULL) return ENXIO; DEV_LOCK(d); SDT_PROBE4(sdt, bdev, close, entry, d, dev, flag, devtype); rv = (*d->d_close)(dev, flag, devtype, l); SDT_PROBE5(sdt, bdev, close, return, d, dev, flag, devtype, rv); DEV_UNLOCK(d); return rv; } SDT_PROVIDER_DECLARE(io); SDT_PROBE_DEFINE1(io, kernel, , start, "struct buf *"/*bp*/); void bdev_strategy(struct buf *bp) { const struct bdevsw *d; int mpflag; SDT_PROBE1(io, kernel, , start, bp); if ((d = bdevsw_lookup(bp->b_dev)) == NULL) { bp->b_error = ENXIO; bp->b_resid = bp->b_bcount; biodone_vfs(bp); /* biodone() iff vfs present */ return; } DEV_LOCK(d); SDT_PROBE3(sdt, bdev, strategy, entry, d, bp->b_dev, bp); (*d->d_strategy)(bp); SDT_PROBE3(sdt, bdev, strategy, return, d, bp->b_dev, bp); DEV_UNLOCK(d); } int bdev_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l) { const struct bdevsw *d; int rv, mpflag; if ((d = bdevsw_lookup(dev)) == NULL) return ENXIO; DEV_LOCK(d); SDT_PROBE5(sdt, bdev, ioctl, entry, d, dev, cmd, data, flag); rv = (*d->d_ioctl)(dev, cmd, data, flag, l); SDT_PROBE6(sdt, bdev, ioctl, return, d, dev, cmd, data, flag, rv); DEV_UNLOCK(d); return rv; } int bdev_dump(dev_t dev, daddr_t addr, void *data, size_t sz) { const struct bdevsw *d; int rv; /* * Dump can be called without the device open. Since it can * currently only be called with the system paused (and in a * potentially unstable state), we don't perform any locking. 
*/ if ((d = bdevsw_lookup(dev)) == NULL) return ENXIO; /* DEV_LOCK(d); */ rv = (*d->d_dump)(dev, addr, data, sz); /* DEV_UNLOCK(d); */ return rv; } int bdev_flags(dev_t dev) { const struct bdevsw *d; if ((d = bdevsw_lookup(dev)) == NULL) return 0; return d->d_flag & ~D_TYPEMASK; } int bdev_type(dev_t dev) { const struct bdevsw *d; if ((d = bdevsw_lookup(dev)) == NULL) return D_OTHER; return d->d_flag & D_TYPEMASK; } int bdev_size(dev_t dev) { const struct bdevsw *d; int rv, mpflag = 0; if ((d = bdevsw_lookup(dev)) == NULL || d->d_psize == NULL) return -1; /* * Don't try to lock the device if we're dumping. * XXX: is there a better way to test this? */ if ((boothowto & RB_DUMP) == 0) DEV_LOCK(d); SDT_PROBE2(sdt, bdev, psize, entry, d, dev); rv = (*d->d_psize)(dev); SDT_PROBE3(sdt, bdev, psize, return, d, dev, rv); if ((boothowto & RB_DUMP) == 0) DEV_UNLOCK(d); return rv; } int bdev_discard(dev_t dev, off_t pos, off_t len) { const struct bdevsw *d; int rv, mpflag; if ((d = bdevsw_lookup(dev)) == NULL) return ENXIO; DEV_LOCK(d); SDT_PROBE4(sdt, bdev, discard, entry, d, dev, pos, len); rv = (*d->d_discard)(dev, pos, len); SDT_PROBE5(sdt, bdev, discard, return, d, dev, pos, len, rv); DEV_UNLOCK(d); return rv; } void bdev_detached(dev_t dev) { const struct bdevsw *d; device_t dv; int unit; if ((d = bdevsw_lookup(dev)) == NULL) return; if (d->d_devtounit == NULL) return; if ((unit = (*d->d_devtounit)(dev)) == -1) return; if ((dv = device_lookup(d->d_cfdriver, unit)) == NULL) return; config_detach_commit(dv); } int cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct cdevsw *d; struct localcount *lc; device_t dv = NULL/*XXXGCC*/; int unit = -1/*XXXGCC*/, rv, mpflag; d = cdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; if (d->d_devtounit) { /* * If the device node corresponds to an autoconf device * instance, acquire a reference to it so that during * d_open, device_lookup is stable. * * XXX This should also arrange to instantiate cloning * pseudo-devices if appropriate, but that requires * reviewing them all to find and verify a common * pattern. 
*/ if ((unit = (*d->d_devtounit)(dev)) == -1) return ENXIO; if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) return ENXIO; SDT_PROBE6(sdt, cdev, open, acquire, d, dev, flag, devtype, unit, dv); } DEV_LOCK(d); SDT_PROBE4(sdt, cdev, open, entry, d, dev, flag, devtype); rv = (*d->d_open)(dev, flag, devtype, l); SDT_PROBE5(sdt, cdev, open, return, d, dev, flag, devtype, rv); DEV_UNLOCK(d); if (d->d_devtounit) { SDT_PROBE6(sdt, cdev, open, release, d, dev, flag, devtype, unit, dv); device_release(dv); } cdevsw_release(d, lc); return rv; } int cdev_cancel(dev_t dev, int flag, int devtype, struct lwp *l) { const struct cdevsw *d; int rv, mpflag; if ((d = cdevsw_lookup(dev)) == NULL) return ENXIO; if (d->d_cancel == NULL) return ENODEV; DEV_LOCK(d); SDT_PROBE4(sdt, cdev, cancel, entry, d, dev, flag, devtype); rv = (*d->d_cancel)(dev, flag, devtype, l); SDT_PROBE5(sdt, cdev, cancel, return, d, dev, flag, devtype, rv); DEV_UNLOCK(d); return rv; } int cdev_close(dev_t dev, int flag, int devtype, lwp_t *l) { const struct cdevsw *d; int rv, mpflag; if ((d = cdevsw_lookup(dev)) == NULL) return ENXIO; DEV_LOCK(d); SDT_PROBE4(sdt, cdev, close, entry, d, dev, flag, devtype); rv = (*d->d_close)(dev, flag, devtype, l); SDT_PROBE5(sdt, cdev, close, return, d, dev, flag, devtype, rv); DEV_UNLOCK(d); return rv; } int cdev_read(dev_t dev, struct uio *uio, int flag) { const struct cdevsw *d; int rv, mpflag; if ((d = cdevsw_lookup(dev)) == NULL) return ENXIO; DEV_LOCK(d); SDT_PROBE4(sdt, cdev, read, entry, d, dev, uio, flag); rv = (*d->d_read)(dev, uio, flag); SDT_PROBE5(sdt, cdev, read, return, d, dev, uio, flag, rv); DEV_UNLOCK(d); return rv; } int cdev_write(dev_t dev, struct uio *uio, int flag) { const struct cdevsw *d; int rv, mpflag; if ((d = cdevsw_lookup(dev)) == NULL) return ENXIO; DEV_LOCK(d); SDT_PROBE4(sdt, cdev, write, entry, d, dev, uio, flag); rv = (*d->d_write)(dev, uio, flag); SDT_PROBE5(sdt, cdev, write, return, d, dev, uio, flag, rv); DEV_UNLOCK(d); return rv; } int cdev_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l) { const struct cdevsw *d; int rv, mpflag; if ((d = cdevsw_lookup(dev)) == NULL) return ENXIO; DEV_LOCK(d); SDT_PROBE5(sdt, cdev, ioctl, entry, d, dev, cmd, data, flag); rv = (*d->d_ioctl)(dev, cmd, data, flag, l); SDT_PROBE6(sdt, cdev, ioctl, return, d, dev, cmd, data, flag, rv); DEV_UNLOCK(d); return rv; } void cdev_stop(struct tty *tp, int flag) { const struct cdevsw *d; int mpflag; if ((d = cdevsw_lookup(tp->t_dev)) == NULL) return; DEV_LOCK(d); SDT_PROBE4(sdt, cdev, stop, entry, d, tp->t_dev, tp, flag); (*d->d_stop)(tp, flag); SDT_PROBE4(sdt, cdev, stop, return, d, tp->t_dev, tp, flag); DEV_UNLOCK(d); } struct tty * cdev_tty(dev_t dev) { const struct cdevsw *d; if ((d = cdevsw_lookup(dev)) == NULL) return NULL; /* XXX Check if necessary. 
*/ if (d->d_tty == NULL) return NULL; return (*d->d_tty)(dev); } int cdev_poll(dev_t dev, int flag, lwp_t *l) { const struct cdevsw *d; int rv, mpflag; if ((d = cdevsw_lookup(dev)) == NULL) return POLLERR; DEV_LOCK(d); SDT_PROBE3(sdt, cdev, poll, entry, d, dev, flag); rv = (*d->d_poll)(dev, flag, l); SDT_PROBE4(sdt, cdev, poll, return, d, dev, flag, rv); DEV_UNLOCK(d); return rv; } paddr_t cdev_mmap(dev_t dev, off_t off, int flag) { const struct cdevsw *d; paddr_t rv; int mpflag; if ((d = cdevsw_lookup(dev)) == NULL) return (paddr_t)-1LL; DEV_LOCK(d); SDT_PROBE4(sdt, cdev, mmap, entry, d, dev, off, flag); rv = (*d->d_mmap)(dev, off, flag); SDT_PROBE5(sdt, cdev, mmap, return, d, dev, off, flag, rv); DEV_UNLOCK(d); return rv; } int cdev_kqfilter(dev_t dev, struct knote *kn) { const struct cdevsw *d; int rv, mpflag; if ((d = cdevsw_lookup(dev)) == NULL) return ENXIO; DEV_LOCK(d); SDT_PROBE3(sdt, cdev, kqfilter, entry, d, dev, kn); rv = (*d->d_kqfilter)(dev, kn); SDT_PROBE4(sdt, cdev, kqfilter, return, d, dev, kn, rv); DEV_UNLOCK(d); return rv; } int cdev_discard(dev_t dev, off_t pos, off_t len) { const struct cdevsw *d; int rv, mpflag; if ((d = cdevsw_lookup(dev)) == NULL) return ENXIO; DEV_LOCK(d); SDT_PROBE4(sdt, cdev, discard, entry, d, dev, pos, len); rv = (*d->d_discard)(dev, pos, len); SDT_PROBE5(sdt, cdev, discard, return, d, dev, pos, len, rv); DEV_UNLOCK(d); return rv; } int cdev_flags(dev_t dev) { const struct cdevsw *d; if ((d = cdevsw_lookup(dev)) == NULL) return 0; return d->d_flag & ~D_TYPEMASK; } int cdev_type(dev_t dev) { const struct cdevsw *d; if ((d = cdevsw_lookup(dev)) == NULL) return D_OTHER; return d->d_flag & D_TYPEMASK; } void cdev_detached(dev_t dev) { const struct cdevsw *d; device_t dv; int unit; if ((d = cdevsw_lookup(dev)) == NULL) return; if (d->d_devtounit == NULL) return; if ((unit = (*d->d_devtounit)(dev)) == -1) return; if ((dv = device_lookup(d->d_cfdriver, unit)) == NULL) return; config_detach_commit(dv); } /* * nommap(dev, off, prot) * * mmap routine that always fails, for non-mmappable devices. */ paddr_t nommap(dev_t dev, off_t off, int prot) { return (paddr_t)-1; } /* * dev_minor_unit(dev) * * Returns minor(dev) as an int. Intended for use with struct * bdevsw, cdevsw::d_devtounit for drivers whose /dev nodes are * implemented by reference to an autoconf instance with the minor * number. */ int dev_minor_unit(dev_t dev) { return minor(dev); }
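/*
 * A minimal usage sketch (not part of the NetBSD tree) of the registration
 * interface above: a hypothetical character-only driver "mydev" attaches its
 * cdevsw with devsw_attach() and tears it down with devsw_detach().  The
 * mydev_* names are invented for illustration only, and a real driver would
 * fill in every cdevsw entry point rather than leaving them NULL.
 */
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/systm.h>

static int
mydev_open(dev_t dev, int flag, int mode, struct lwp *l)
{
	return 0;			/* nothing to set up in this sketch */
}

static int
mydev_close(dev_t dev, int flag, int mode, struct lwp *l)
{
	return 0;
}

static const struct cdevsw mydev_cdevsw = {
	.d_open = mydev_open,
	.d_close = mydev_close,
	.d_mmap = nommap,		/* stub defined at the end of subr_devsw.c */
	.d_flag = D_OTHER | D_MPSAFE,	/* D_MPSAFE skips KERNEL_LOCK in DEV_LOCK() */
	/* remaining entry points omitted for brevity */
};

static int
mydev_register(void)
{
	devmajor_t bmajor = NODEVMAJOR, cmajor = NODEVMAJOR;

	/* No block device: pass NULL for the bdevsw and let a major be chosen. */
	return devsw_attach("mydev", NULL, &bmajor, &mydev_cdevsw, &cmajor);
}

static void
mydev_unregister(void)
{
	devsw_detach(NULL, &mydev_cdevsw);
}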
/* $NetBSD: subr_hash.c,v 1.12 2021/06/13 14:58:49 simonb Exp $ */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_subr.c 8.4 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_hash.c,v 1.12 2021/06/13 14:58:49 simonb Exp $"); #include <sys/param.h> #include <sys/bitops.h> #include <sys/kmem.h> #include <sys/systm.h> #include <sys/pslist.h> #include <sys/rwlock.h> #include <sys/sysctl.h> static int hashstat_sysctl(SYSCTLFN_PROTO); static size_t hash_list_size(enum hashtype htype) { LIST_HEAD(, generic) *hashtbl_list; SLIST_HEAD(, generic) *hashtbl_slist; TAILQ_HEAD(, generic) *hashtbl_tailq; struct pslist_head *hashtbl_pslist; size_t esize; switch (htype) { case HASH_LIST: esize = sizeof(*hashtbl_list); break; case HASH_PSLIST: esize = sizeof(*hashtbl_pslist); break; case HASH_SLIST: esize = sizeof(*hashtbl_slist); break; case HASH_TAILQ: esize = sizeof(*hashtbl_tailq); break; default: panic("hashdone: invalid table type"); } return esize; } /* * General routine to allocate a hash table. * Allocate enough memory to hold at least `elements' list-head pointers. * Return a pointer to the allocated space and set *hashmask to a pattern * suitable for masking a value to use as an index into the returned array. */ void * hashinit(u_int elements, enum hashtype htype, bool waitok, u_long *hashmask) { LIST_HEAD(, generic) *hashtbl_list; SLIST_HEAD(, generic) *hashtbl_slist; TAILQ_HEAD(, generic) *hashtbl_tailq; struct pslist_head *hashtbl_pslist; u_long hashsize, i; size_t esize; void *p; KASSERT(elements > 0); #define MAXELEMENTS (1U << ((sizeof(elements) * NBBY) - 1)) if (elements > MAXELEMENTS) elements = MAXELEMENTS; hashsize = 1UL << (ilog2(elements - 1) + 1); esize = hash_list_size(htype); p = kmem_alloc(hashsize * esize, waitok ? KM_SLEEP : KM_NOSLEEP); if (p == NULL) return NULL; switch (htype) { case HASH_LIST: hashtbl_list = p; for (i = 0; i < hashsize; i++) LIST_INIT(&hashtbl_list[i]); break; case HASH_PSLIST: hashtbl_pslist = p; for (i = 0; i < hashsize; i++) PSLIST_INIT(&hashtbl_pslist[i]); break; case HASH_SLIST: hashtbl_slist = p; for (i = 0; i < hashsize; i++) SLIST_INIT(&hashtbl_slist[i]); break; case HASH_TAILQ: hashtbl_tailq = p; for (i = 0; i < hashsize; i++) TAILQ_INIT(&hashtbl_tailq[i]); break; } *hashmask = hashsize - 1; return p; } /* * Free memory from hash table previously allocated via hashinit(). */ void hashdone(void *hashtbl, enum hashtype htype, u_long hashmask) { const size_t esize = hash_list_size(htype); kmem_free(hashtbl, esize * (hashmask + 1)); } /* * Support for hash statistics (vmstat -H / vmstat -h hashname). */ struct hashstat { const char *hs_name; hashstat_func_t hs_func; TAILQ_ENTRY(hashstat) hs_next; }; TAILQ_HEAD(, hashstat) hashstat_list = TAILQ_HEAD_INITIALIZER(hashstat_list); static krwlock_t hashstat_lock; void hashstat_register(const char *name, hashstat_func_t func) { struct hashstat *hs; hs = kmem_alloc(sizeof(*hs), KM_SLEEP); hs->hs_name = name; hs->hs_func = func; rw_enter(&hashstat_lock, RW_WRITER); TAILQ_INSERT_TAIL(&hashstat_list, hs, hs_next); rw_exit(&hashstat_lock); } /* * sysctl support for returning kernel hash statistics. * * We (ab)use CTL_DESCRIBE and CTL_QUERY: * When passed an OID of CTL_DESCRIBE, return a list and description * of the available hashes. * When passed an OID of CTL_QUERY, use the hash name passed in the * "new" hash input as the name of a single hash to return stats on. 
*/ static int hashstat_sysctl(SYSCTLFN_ARGS) { struct hashstat_sysctl hs; struct hashstat *hash; char queryname[SYSCTL_NAMELEN]; size_t written; bool fill, query; int error; if (oldp == NULL) { *oldlenp = 0; TAILQ_FOREACH(hash, &hashstat_list, hs_next) *oldlenp += sizeof(hs); return 0; } error = 0; written = 0; if (namelen > 0 && name[0] == CTL_DESCRIBE) fill = false; else fill = true; if (namelen > 0 && name[0] == CTL_QUERY) { const struct hashstat_sysctl *h = newp; size_t s; if (h == NULL) { /* Can't QUERY one hash without supplying the hash name. */ return EINVAL; } query = true; error = sysctl_copyinstr(l, h->hash_name, queryname, sizeof(queryname), &s); if (error) return error; } else { query = false; } sysctl_unlock(); rw_enter(&hashstat_lock, RW_READER); TAILQ_FOREACH(hash, &hashstat_list, hs_next) { if (query && (strcmp(hash->hs_name, queryname) != 0)) { continue; } memset(&hs, 0, sizeof(hs)); error = hash->hs_func(&hs, fill); if (error) break; error = sysctl_copyout(l, &hs, oldp, sizeof(hs)); if (error) break; written += sizeof(hs); oldp = (char *)oldp + sizeof(hs); } rw_exit(&hashstat_lock); sysctl_relock(); if (query && written == 0) /* query not found? */ error = ENOENT; *oldlenp = written; return error; } SYSCTL_SETUP(sysctl_hash_setup, "sysctl hash stats setup") { rw_init(&hashstat_lock); /* as good a place as any for this */ sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRUCT, "hashstat", SYSCTL_DESCR("kernel hash statistics"), hashstat_sysctl, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); }
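/*
 * A minimal usage sketch (not part of the NetBSD tree) of hashinit() and
 * hashdone() above: a hypothetical "foo" cache hashes entries by an integer
 * key.  The struct foo and foohash names are invented for illustration.
 */
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/systm.h>

struct foo {
	LIST_ENTRY(foo)	f_hash;		/* linkage within one bucket */
	uint64_t	f_key;
};

static LIST_HEAD(foobucket, foo) *foohashtbl;
static u_long foohashmask;

static void
foohash_init(void)
{
	/* Ask for ~64 buckets; hashinit rounds up to a power of two. */
	foohashtbl = hashinit(64, HASH_LIST, true, &foohashmask);
}

static void
foohash_insert(struct foo *fp)
{
	/* Mask the key with *hashmask to index the returned bucket array. */
	LIST_INSERT_HEAD(&foohashtbl[fp->f_key & foohashmask], fp, f_hash);
}

static void
foohash_fini(void)
{
	hashdone(foohashtbl, HASH_LIST, foohashmask);
}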
/* $NetBSD: subr_iostat.c,v 1.26 2024/05/04 13:33:18 mlelstv Exp $ */ /* NetBSD: subr_disk.c,v 1.69 2005/05/29 22:24:15 christos Exp */ /*- * Copyright (c) 1996, 1997, 1999, 2000, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_iostat.c,v 1.26 2024/05/04 13:33:18 mlelstv Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/iostat.h> #include <sys/sysctl.h> #include <sys/rwlock.h> /* * Function prototypes for sysctl nodes */ static int sysctl_hw_disknames(SYSCTLFN_PROTO); static int sysctl_hw_iostatnames(SYSCTLFN_PROTO); static int sysctl_hw_iostats(SYSCTLFN_PROTO); static int iostati_getnames(int disk_only, char *oldp, size_t *oldlenp, const void *newp, u_int namelen); /* * A global list of all drives attached to the system. May grow or * shrink over time. */ struct iostatlist_head iostatlist = TAILQ_HEAD_INITIALIZER(iostatlist); int iostat_count; /* number of drives in global drivelist */ krwlock_t iostatlist_lock; static void sysctl_io_stats_setup(struct sysctllog **); /* * Initialise the iostat subsystem. */ void iostat_init(void) { rw_init(&iostatlist_lock); sysctl_io_stats_setup(NULL); } /* * Searches the iostatlist for the iostat corresponding to the * name provided. 
*/ struct io_stats * iostat_find(const char *name) { struct io_stats *iostatp; KASSERT(name != NULL); rw_enter(&iostatlist_lock, RW_READER); TAILQ_FOREACH(iostatp, &iostatlist, io_link) { if (strcmp(iostatp->io_name, name) == 0) { break; } } rw_exit(&iostatlist_lock); return iostatp; } /* * Allocate and initialise memory for the i/o statistics. */ struct io_stats * iostat_alloc(int32_t type, void *parent, const char *name) { struct io_stats *stats; stats = kmem_zalloc(sizeof(*stats), KM_SLEEP); stats->io_type = type; stats->io_parent = parent; (void)strlcpy(stats->io_name, name, sizeof(stats->io_name)); /* * Set the attached timestamp. */ getmicrouptime(&stats->io_attachtime); /* * Link into the drivelist. */ rw_enter(&iostatlist_lock, RW_WRITER); TAILQ_INSERT_TAIL(&iostatlist, stats, io_link); iostat_count++; rw_exit(&iostatlist_lock); return stats; } /* * Remove i/o from stats collection. */ void iostat_free(struct io_stats *stats) { /* * Remove from the iostat list. */ if (iostat_count == 0) panic("iostat_free: iostat_count == 0"); rw_enter(&iostatlist_lock, RW_WRITER); TAILQ_REMOVE(&iostatlist, stats, io_link); iostat_count--; rw_exit(&iostatlist_lock); kmem_free(stats, sizeof(*stats)); } /* * Rename i/o stats. */ void iostat_rename(struct io_stats *stats, const char *name) { rw_enter(&iostatlist_lock, RW_WRITER); (void)strlcpy(stats->io_name, name, sizeof(stats->io_name)); rw_exit(&iostatlist_lock); } /* * multiply timeval by unsigned integer and add to result */ static void timermac(struct timeval *a, uint64_t count, struct timeval *res) { struct timeval part = *a; while (count) { if (count & 1) timeradd(res, &part, res); timeradd(&part, &part, &part); count >>= 1; } } /* * Increment the iostat wait counter. * Accumulate wait time and timesum. * * Wait time is spent in the device bufq. */ void iostat_wait(struct io_stats *stats) { struct timeval dv_time, diff_time; int32_t count; KASSERT(stats->io_wait >= 0); getmicrouptime(&dv_time); timersub(&dv_time, &stats->io_waitstamp, &diff_time); count = stats->io_wait++; if (count != 0) { timermac(&diff_time, count, &stats->io_waitsum); timeradd(&stats->io_waittime, &diff_time, &stats->io_waittime); } stats->io_waitstamp = dv_time; } /* * Decrement the iostat wait counter. * Increment the iostat busy counter. * Accumulate wait and busy times and timesums. * * Busy time is spent being processed by the device. * * Old devices do not yet measure wait time, so skip * processing it if the counter is still zero. */ void iostat_busy(struct io_stats *stats) { struct timeval dv_time, diff_time; int32_t count; KASSERT(stats->io_wait >= 0); /* > 0 when iostat_wait is used */ KASSERT(stats->io_busy >= 0); getmicrouptime(&dv_time); timersub(&dv_time, &stats->io_waitstamp, &diff_time); if (stats->io_wait != 0) { count = stats->io_wait--; timermac(&diff_time, count, &stats->io_waitsum); timeradd(&stats->io_waittime, &diff_time, &stats->io_waittime); } stats->io_waitstamp = dv_time; timersub(&dv_time, &stats->io_busystamp, &diff_time); count = stats->io_busy++; if (count != 0) { timermac(&diff_time, count, &stats->io_busysum); timeradd(&stats->io_busytime, &diff_time, &stats->io_busytime); } stats->io_busystamp = dv_time; } /* * Decrement the iostat busy counter, increment the byte count. * Accumulate busy time and timesum. 
*/ void iostat_unbusy(struct io_stats *stats, long bcount, int read) { struct timeval dv_time, diff_time; int32_t count; KASSERT(stats->io_busy > 0); getmicrouptime(&dv_time); stats->io_timestamp = dv_time; /* any op */ timersub(&dv_time, &stats->io_busystamp, &diff_time); count = stats->io_busy--; timermac(&diff_time, count, &stats->io_busysum); timeradd(&stats->io_busytime, &diff_time, &stats->io_busytime); stats->io_busystamp = dv_time; if (bcount > 0) { if (read) { stats->io_rbytes += bcount; stats->io_rxfer++; } else { stats->io_wbytes += bcount; stats->io_wxfer++; } } } /* * Return non-zero if a device has an I/O request in flight. */ bool iostat_isbusy(struct io_stats *stats) { return stats->io_busy != 0; } /* * Increment the seek counter. This does look almost redundant but it * abstracts the stats gathering. */ void iostat_seek(struct io_stats *stats) { stats->io_seek++; } static int sysctl_hw_disknames(SYSCTLFN_ARGS) { return iostati_getnames(1, oldp, oldlenp, newp, namelen); } static int sysctl_hw_iostatnames(SYSCTLFN_ARGS) { return iostati_getnames(0, oldp, oldlenp, newp, namelen); } static int iostati_getnames(int disk_only, char *oldp, size_t *oldlenp, const void *newp, u_int namelen) { char bf[IOSTATNAMELEN + 1]; char *where = oldp; struct io_stats *stats; size_t needed, left, slen; int error, first; if (newp != NULL) return (EPERM); if (namelen != 0) return (EINVAL); first = 1; error = 0; needed = 0; left = *oldlenp; rw_enter(&iostatlist_lock, RW_READER); for (stats = TAILQ_FIRST(&iostatlist); stats != NULL; stats = TAILQ_NEXT(stats, io_link)) { if ((disk_only == 1) && (stats->io_type != IOSTAT_DISK)) continue; if (where == NULL) needed += strlen(stats->io_name) + 1; else { memset(bf, 0, sizeof(bf)); if (first) { strncpy(bf, stats->io_name, sizeof(bf)); /* account for trailing NUL byte */ needed += 1; first = 0; } else { bf[0] = ' '; strncpy(bf + 1, stats->io_name, sizeof(bf) - 1); } bf[IOSTATNAMELEN] = '\0'; slen = strlen(bf); if (left < slen + 1) break; /* +1 to copy out the trailing NUL byte */ error = copyout(bf, where, slen + 1); if (error) break; where += slen; needed += slen; left -= slen; } } rw_exit(&iostatlist_lock); *oldlenp = needed; return (error); } static int sysctl_hw_iostats(SYSCTLFN_ARGS) { struct io_sysctl sdrive; struct io_stats *stats; char *where = oldp; size_t tocopy, left; int error; if (newp != NULL) return (EPERM); /* * The original hw.diskstats call was broken and did not require * the userland to pass in its size of struct disk_sysctl. This * was fixed after NetBSD 1.6 was released. 
*/ if (namelen == 0) tocopy = offsetof(struct io_sysctl, busy); else tocopy = name[0]; if (where == NULL) { *oldlenp = iostat_count * tocopy; return (0); } error = 0; left = *oldlenp; memset(&sdrive, 0, sizeof(sdrive)); *oldlenp = 0; rw_enter(&iostatlist_lock, RW_READER); TAILQ_FOREACH(stats, &iostatlist, io_link) { if (left < tocopy) break; strncpy(sdrive.name, stats->io_name, sizeof(sdrive.name)); sdrive.attachtime_sec = stats->io_attachtime.tv_sec; sdrive.attachtime_usec = stats->io_attachtime.tv_usec; sdrive.timestamp_sec = stats->io_busystamp.tv_sec; sdrive.timestamp_usec = stats->io_busystamp.tv_usec; sdrive.time_sec = stats->io_busytime.tv_sec; sdrive.time_usec = stats->io_busytime.tv_usec; sdrive.seek = stats->io_seek; sdrive.rxfer = stats->io_rxfer; sdrive.wxfer = stats->io_wxfer; sdrive.xfer = stats->io_rxfer + stats->io_wxfer; sdrive.rbytes = stats->io_rbytes; sdrive.wbytes = stats->io_wbytes; sdrive.bytes = stats->io_rbytes + stats->io_wbytes; sdrive.wait_sec = stats->io_waittime.tv_sec; sdrive.wait_usec = stats->io_waittime.tv_usec; sdrive.time_sec = stats->io_busytime.tv_sec; sdrive.time_usec = stats->io_busytime.tv_usec; sdrive.waitsum_sec = stats->io_waitsum.tv_sec; sdrive.waitsum_usec = stats->io_waitsum.tv_usec; sdrive.busysum_sec = stats->io_busysum.tv_sec; sdrive.busysum_usec = stats->io_busysum.tv_usec; sdrive.busy = stats->io_busy; error = copyout(&sdrive, where, uimin(tocopy, sizeof(sdrive))); if (error) break; where += tocopy; *oldlenp += tocopy; left -= tocopy; } rw_exit(&iostatlist_lock); return (error); } static void sysctl_io_stats_setup(struct sysctllog **clog) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "disknames", SYSCTL_DESCR("List of disk drives present"), sysctl_hw_disknames, 0, NULL, 0, CTL_HW, HW_DISKNAMES, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "iostatnames", SYSCTL_DESCR("I/O stats are being collected for these" " devices"), sysctl_hw_iostatnames, 0, NULL, 0, CTL_HW, HW_IOSTATNAMES, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "iostats", SYSCTL_DESCR("Statistics on device I/O operations"), sysctl_hw_iostats, 0, NULL, 0, CTL_HW, HW_IOSTATS, CTL_EOL); }
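The hw.disknames and hw.iostats nodes set up above are consumed from userland through sysctl(3). As a hedged illustration of the usual two-pass pattern against sysctl_hw_disknames(), the following hypothetical userland sketch (not part of the kernel sources reproduced here) first asks for the required length and then fetches the space-separated name list:

#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t len = 0;
	char *names;

	/* Size query: with oldp == NULL the handler only reports "needed". */
	if (sysctlbyname("hw.disknames", NULL, &len, NULL, 0) == -1)
		err(1, "hw.disknames: size query");

	if ((names = malloc(len)) == NULL)
		err(1, "malloc");

	/* Second pass copies out the NUL-terminated, space-separated list. */
	if (sysctlbyname("hw.disknames", names, &len, NULL, 0) == -1)
		err(1, "hw.disknames");

	printf("disks: %s\n", names);
	free(names);
	return 0;
}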
/* $NetBSD: genfs_vnops.c,v 1.220 2023/03/03 10:02:51 hannken Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.220 2023/03/03 10:02:51 hannken Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/kernel.h> #include <sys/mount.h> #include <sys/fstrans.h> #include <sys/namei.h> #include <sys/vnode_impl.h> #include <sys/fcntl.h> #include <sys/kmem.h> #include <sys/poll.h> #include <sys/mman.h> #include <sys/file.h> #include <sys/kauth.h> #include <sys/stat.h> #include <sys/extattr.h> #include <miscfs/genfs/genfs.h> #include <miscfs/genfs/genfs_node.h> #include <miscfs/specfs/specdev.h> static void filt_genfsdetach(struct knote *); static int filt_genfsread(struct knote *, long); static int filt_genfsvnode(struct knote *, long); /* * Find the end of the first path component in NAME and return its * length. 
*/ int genfs_parsepath(void *v) { struct vop_parsepath_args /* { struct vnode *a_dvp; const char *a_name; size_t *a_ret; } */ *ap = v; const char *name = ap->a_name; size_t pos; (void)ap->a_dvp; pos = 0; while (name[pos] != '\0' && name[pos] != '/') { pos++; } *ap->a_retval = pos; return 0; } int genfs_poll(void *v) { struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct lwp *a_l; } */ *ap = v; return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } int genfs_seek(void *v) { struct vop_seek_args /* { struct vnode *a_vp; off_t a_oldoff; off_t a_newoff; kauth_cred_t cred; } */ *ap = v; if (ap->a_newoff < 0) return (EINVAL); return (0); } int genfs_abortop(void *v) { struct vop_abortop_args /* { struct vnode *a_dvp; struct componentname *a_cnp; } */ *ap = v; (void)ap; return (0); } int genfs_fcntl(void *v) { struct vop_fcntl_args /* { struct vnode *a_vp; u_int a_command; void *a_data; int a_fflag; kauth_cred_t a_cred; struct lwp *a_l; } */ *ap = v; if (ap->a_command == F_SETFL) return (0); else return (EOPNOTSUPP); } /*ARGSUSED*/ int genfs_badop(void *v) { panic("genfs: bad op"); } /*ARGSUSED*/ int genfs_nullop(void *v) { return (0); } /*ARGSUSED*/ int genfs_einval(void *v) { return (EINVAL); } int genfs_erofs_link(void *v) { /* also for symlink */ struct vop_link_v2_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap = v; VOP_ABORTOP(ap->a_dvp, ap->a_cnp); return EROFS; } /* * Called when an fs doesn't support a particular vop. * This takes care to vrele, vput, or vunlock passed in vnodes * and calls VOP_ABORTOP for a componentname (in non-rename VOP). */ int genfs_eopnotsupp(void *v) { struct vop_generic_args /* struct vnodeop_desc *a_desc; / * other random data follows, presumably * / } */ *ap = v; struct vnodeop_desc *desc = ap->a_desc; struct vnode *vp, *vp_last = NULL; int flags, i, j, offset_cnp, offset_vp; KASSERT(desc->vdesc_offset != VOP_LOOKUP_DESCOFFSET); KASSERT(desc->vdesc_offset != VOP_ABORTOP_DESCOFFSET); /* * Abort any componentname that lookup potentially left state in. * * As is logical, componentnames for VOP_RENAME are handled by * the caller of VOP_RENAME. Yay, rename! */ if (desc->vdesc_offset != VOP_RENAME_DESCOFFSET && (offset_vp = desc->vdesc_vp_offsets[0]) != VDESC_NO_OFFSET && (offset_cnp = desc->vdesc_componentname_offset) != VDESC_NO_OFFSET){ struct componentname *cnp; struct vnode *dvp; dvp = *VOPARG_OFFSETTO(struct vnode **, offset_vp, ap); cnp = *VOPARG_OFFSETTO(struct componentname **, offset_cnp, ap); VOP_ABORTOP(dvp, cnp); } flags = desc->vdesc_flags; for (i = 0; i < VDESC_MAX_VPS; flags >>=1, i++) { if ((offset_vp = desc->vdesc_vp_offsets[i]) == VDESC_NO_OFFSET) break; /* stop at end of list */ if ((j = flags & VDESC_VP0_WILLPUT)) { vp = *VOPARG_OFFSETTO(struct vnode **, offset_vp, ap); /* Skip if NULL */ if (!vp) continue; switch (j) { case VDESC_VP0_WILLPUT: /* Check for dvp == vp cases */ if (vp == vp_last) vrele(vp); else { vput(vp); vp_last = vp; } break; case VDESC_VP0_WILLRELE: vrele(vp); break; } } } return (EOPNOTSUPP); } /*ARGSUSED*/ int genfs_ebadf(void *v) { return (EBADF); } /* ARGSUSED */ int genfs_enoioctl(void *v) { return (EPASSTHROUGH); } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. 
*/ int genfs_revoke(void *v) { struct vop_revoke_args /* { struct vnode *a_vp; int a_flags; } */ *ap = v; #ifdef DIAGNOSTIC if ((ap->a_flags & REVOKEALL) == 0) panic("genfs_revoke: not revokeall"); #endif vrevoke(ap->a_vp); return (0); } /* * Lock the node (for deadfs). */ int genfs_deadlock(void *v) { struct vop_lock_args /* { struct vnode *a_vp; int a_flags; } */ *ap = v; vnode_t *vp = ap->a_vp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); int flags = ap->a_flags; krw_t op; if (! ISSET(flags, LK_RETRY)) return ENOENT; if (ISSET(flags, LK_DOWNGRADE)) { rw_downgrade(&vip->vi_lock); } else if (ISSET(flags, LK_UPGRADE)) { KASSERT(ISSET(flags, LK_NOWAIT)); if (!rw_tryupgrade(&vip->vi_lock)) { return EBUSY; } } else if ((flags & (LK_EXCLUSIVE | LK_SHARED)) != 0) { op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER); if (ISSET(flags, LK_NOWAIT)) { if (!rw_tryenter(&vip->vi_lock, op)) return EBUSY; } else { rw_enter(&vip->vi_lock, op); } } VSTATE_ASSERT_UNLOCKED(vp, VS_RECLAIMED); return 0; } /* * Unlock the node (for deadfs). */ int genfs_deadunlock(void *v) { struct vop_unlock_args /* { struct vnode *a_vp; } */ *ap = v; vnode_t *vp = ap->a_vp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); rw_exit(&vip->vi_lock); return 0; } /* * Lock the node. */ int genfs_lock(void *v) { struct vop_lock_args /* { struct vnode *a_vp; int a_flags; } */ *ap = v; vnode_t *vp = ap->a_vp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); int flags = ap->a_flags; krw_t op; if (ISSET(flags, LK_DOWNGRADE)) { rw_downgrade(&vip->vi_lock); } else if (ISSET(flags, LK_UPGRADE)) { KASSERT(ISSET(flags, LK_NOWAIT)); if (!rw_tryupgrade(&vip->vi_lock)) { return EBUSY; } } else if ((flags & (LK_EXCLUSIVE | LK_SHARED)) != 0) { op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER); if (ISSET(flags, LK_NOWAIT)) { if (!rw_tryenter(&vip->vi_lock, op)) return EBUSY; } else { rw_enter(&vip->vi_lock, op); } } VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE); return 0; } /* * Unlock the node. */ int genfs_unlock(void *v) { struct vop_unlock_args /* { struct vnode *a_vp; } */ *ap = v; vnode_t *vp = ap->a_vp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); rw_exit(&vip->vi_lock); return 0; } /* * Return whether or not the node is locked. */ int genfs_islocked(void *v) { struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap = v; vnode_t *vp = ap->a_vp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); if (rw_write_held(&vip->vi_lock)) return LK_EXCLUSIVE; if (rw_read_held(&vip->vi_lock)) return LK_SHARED; return 0; } int genfs_mmap(void *v) { return (0); } /* * VOP_PUTPAGES() for vnodes which never have pages. 
*/ int genfs_null_putpages(void *v) { struct vop_putpages_args /* { struct vnode *a_vp; voff_t a_offlo; voff_t a_offhi; int a_flags; } */ *ap = v; struct vnode *vp = ap->a_vp; KASSERT(vp->v_uobj.uo_npages == 0); rw_exit(vp->v_uobj.vmobjlock); return (0); } void genfs_node_init(struct vnode *vp, const struct genfs_ops *ops) { struct genfs_node *gp = VTOG(vp); rw_init(&gp->g_glock); gp->g_op = ops; } void genfs_node_destroy(struct vnode *vp) { struct genfs_node *gp = VTOG(vp); rw_destroy(&gp->g_glock); } void genfs_size(struct vnode *vp, off_t size, off_t *eobp, int flags) { int bsize; bsize = 1 << vp->v_mount->mnt_fs_bshift; *eobp = (size + bsize - 1) & ~(bsize - 1); } static void filt_genfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; vn_knote_detach(vp, kn); } static int filt_genfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int rv; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ switch (hint) { case NOTE_REVOKE: KASSERT(mutex_owned(vp->v_interlock)); knote_set_eof(kn, EV_ONESHOT); return (1); case 0: mutex_enter(vp->v_interlock); kn->kn_data = vp->v_size - ((file_t *)kn->kn_obj)->f_offset; rv = (kn->kn_data != 0); mutex_exit(vp->v_interlock); return rv; default: KASSERT(mutex_owned(vp->v_interlock)); kn->kn_data = vp->v_size - ((file_t *)kn->kn_obj)->f_offset; return (kn->kn_data != 0); } } static int filt_genfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ switch (hint) { case NOTE_REVOKE: KASSERT(mutex_owned(vp->v_interlock)); knote_set_eof(kn, EV_ONESHOT); return (1); case 0: mutex_enter(vp->v_interlock); kn->kn_data = 0; mutex_exit(vp->v_interlock); return 1; default: KASSERT(mutex_owned(vp->v_interlock)); kn->kn_data = 0; return 1; } } static int filt_genfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int fflags; switch (hint) { case NOTE_REVOKE: KASSERT(mutex_owned(vp->v_interlock)); knote_set_eof(kn, 0); if ((kn->kn_sfflags & hint) != 0) kn->kn_fflags |= hint; return (1); case 0: mutex_enter(vp->v_interlock); fflags = kn->kn_fflags; mutex_exit(vp->v_interlock); break; default: KASSERT(mutex_owned(vp->v_interlock)); if ((kn->kn_sfflags & hint) != 0) kn->kn_fflags |= hint; fflags = kn->kn_fflags; break; } return (fflags != 0); } static const struct filterops genfsread_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_genfsdetach, .f_event = filt_genfsread, }; static const struct filterops genfswrite_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_genfsdetach, .f_event = filt_genfswrite, }; static const struct filterops genfsvnode_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_genfsdetach, .f_event = filt_genfsvnode, }; int genfs_kqfilter(void *v) { struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap = v; struct vnode *vp; struct knote *kn; vp = ap->a_vp; kn = ap->a_kn; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &genfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &genfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &genfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = vp; vn_knote_attach(vp, kn); return (0); } void genfs_node_wrlock(struct vnode *vp) { struct genfs_node *gp = VTOG(vp); rw_enter(&gp->g_glock, RW_WRITER); } 
void genfs_node_rdlock(struct vnode *vp) { struct genfs_node *gp = VTOG(vp); rw_enter(&gp->g_glock, RW_READER); } int genfs_node_rdtrylock(struct vnode *vp) { struct genfs_node *gp = VTOG(vp); return rw_tryenter(&gp->g_glock, RW_READER); } void genfs_node_unlock(struct vnode *vp) { struct genfs_node *gp = VTOG(vp); rw_exit(&gp->g_glock); } int genfs_node_wrlocked(struct vnode *vp) { struct genfs_node *gp = VTOG(vp); return rw_write_held(&gp->g_glock); } /* * Common filesystem object access control check routine. Accepts a * vnode, cred, uid, gid, mode, acl, requested access mode. * Returns 0 on success, or an errno on failure. */ int genfs_can_access(vnode_t *vp, kauth_cred_t cred, uid_t file_uid, gid_t file_gid, mode_t file_mode, struct acl *acl, accmode_t accmode) { accmode_t dac_granted; int error; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE)); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ dac_granted = 0; /* Check the owner. */ if (kauth_cred_geteuid(cred) == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); goto privchk; } /* Otherwise, check the groups (first match) */ /* Otherwise, check the groups. */ error = kauth_cred_groupmember(cred, file_gid); if (error > 0) return error; if (error == 0) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); goto privchk; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); privchk: if ((accmode & dac_granted) == accmode) return 0; return (accmode & VADMIN) ? EPERM : EACCES; } /* * Implement a version of genfs_can_access() that understands POSIX.1e ACL * semantics; * the access ACL has already been prepared for evaluation by the file system * and is passed via 'uid', 'gid', and 'acl'. Return 0 on success, else an * errno value. */ int genfs_can_access_acl_posix1e(vnode_t *vp, kauth_cred_t cred, uid_t file_uid, gid_t file_gid, mode_t file_mode, struct acl *acl, accmode_t accmode) { struct acl_entry *acl_other, *acl_mask; accmode_t dac_granted; accmode_t acl_mask_granted; int group_matched, i; int error; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE)); /* * The owner matches if the effective uid associated with the * credential matches that of the ACL_USER_OBJ entry. While we're * doing the first scan, also cache the location of the ACL_MASK and * ACL_OTHER entries, preventing some future iterations. */ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { struct acl_entry *ae = &acl->acl_entry[i]; switch (ae->ae_tag) { case ACL_USER_OBJ: if (kauth_cred_geteuid(cred) != file_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (ae->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (ae->ae_perm & ACL_READ) dac_granted |= VREAD; if (ae->ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); goto out; case ACL_MASK: acl_mask = ae; break; case ACL_OTHER: acl_other = ae; break; default: break; } } /* * An ACL_OTHER entry should always exist in a valid access ACL. If * it doesn't, then generate a serious failure. 
For now, this means * a debugging message and EPERM, but in the future should probably * be a panic. */ if (acl_other == NULL) { /* * XXX This should never happen */ printf("%s: ACL_OTHER missing\n", __func__); return EPERM; } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields are * masked by an ACL_MASK entry, if any. As such, first identify the * ACL_MASK field, then iterate through identifying potential user * matches, then group matches. If there is no ACL_MASK, assume that * the mask allows all requests to succeed. */ if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= (VWRITE | VAPPEND); } else acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND; /* * Check ACL_USER ACL entries. There will either be one or no * matches; if there is one, we accept or rejected based on the * match; otherwise, we continue on to groups. */ for (i = 0; i < acl->acl_cnt; i++) { struct acl_entry *ae = &acl->acl_entry[i]; switch (ae->ae_tag) { case ACL_USER: if (kauth_cred_geteuid(cred) != ae->ae_id) break; dac_granted = 0; if (ae->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (ae->ae_perm & ACL_READ) dac_granted |= VREAD; if (ae->ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; goto out; } } /* * Group match is best-match, not first-match, so find a "best" * match. Iterate across, testing each potential group match. Make * sure we keep track of whether we found a match or not, so that we * know if we should try again with any available privilege, or if we * should move on to ACL_OTHER. */ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { struct acl_entry *ae = &acl->acl_entry[i]; switch (ae->ae_tag) { case ACL_GROUP_OBJ: error = kauth_cred_groupmember(cred, file_gid); if (error > 0) return error; if (error) break; dac_granted = 0; if (ae->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (ae->ae_perm & ACL_READ) dac_granted |= VREAD; if (ae->ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; if ((accmode & dac_granted) == accmode) return 0; group_matched = 1; break; case ACL_GROUP: error = kauth_cred_groupmember(cred, ae->ae_id); if (error > 0) return error; if (error) break; dac_granted = 0; if (ae->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (ae->ae_perm & ACL_READ) dac_granted |= VREAD; if (ae->ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; if ((accmode & dac_granted) == accmode) return 0; group_matched = 1; break; default: break; } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via pure * DAC. Try again, this time with privilege. 
*/ for (i = 0; i < acl->acl_cnt; i++) { struct acl_entry *ae = &acl->acl_entry[i]; switch (ae->ae_tag) { case ACL_GROUP_OBJ: error = kauth_cred_groupmember(cred, file_gid); if (error > 0) return error; if (error) break; dac_granted = 0; if (ae->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (ae->ae_perm & ACL_READ) dac_granted |= VREAD; if (ae->ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; goto out; case ACL_GROUP: error = kauth_cred_groupmember(cred, ae->ae_id); if (error > 0) return error; if (error) break; dac_granted = 0; if (ae->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (ae->ae_perm & ACL_READ) dac_granted |= VREAD; if (ae->ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; goto out; default: break; } } /* * Even with privilege, group membership was not sufficient. * Return failure. */ dac_granted = 0; goto out; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. */ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); out: if ((accmode & dac_granted) == accmode) return 0; return (accmode & VADMIN) ? EPERM : EACCES; } static struct { accmode_t accmode; int mask; } accmode2mask[] = { { VREAD, ACL_READ_DATA }, { VWRITE, ACL_WRITE_DATA }, { VAPPEND, ACL_APPEND_DATA }, { VEXEC, ACL_EXECUTE }, { VREAD_NAMED_ATTRS, ACL_READ_NAMED_ATTRS }, { VWRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS }, { VDELETE_CHILD, ACL_DELETE_CHILD }, { VREAD_ATTRIBUTES, ACL_READ_ATTRIBUTES }, { VWRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES }, { VDELETE, ACL_DELETE }, { VREAD_ACL, ACL_READ_ACL }, { VWRITE_ACL, ACL_WRITE_ACL }, { VWRITE_OWNER, ACL_WRITE_OWNER }, { VSYNCHRONIZE, ACL_SYNCHRONIZE }, { 0, 0 }, }; static int _access_mask_from_accmode(accmode_t accmode) { int access_mask = 0, i; for (i = 0; accmode2mask[i].accmode != 0; i++) { if (accmode & accmode2mask[i].accmode) access_mask |= accmode2mask[i].mask; } /* * VAPPEND is just a modifier for VWRITE; if the caller asked * for 'VAPPEND | VWRITE', we want to check for ACL_APPEND_DATA only. */ if (access_mask & ACL_APPEND_DATA) access_mask &= ~ACL_WRITE_DATA; return (access_mask); } /* * Return 0, iff access is allowed, 1 otherwise. 
*/ static int _acl_denies(const struct acl *aclp, int access_mask, kauth_cred_t cred, int file_uid, int file_gid, int *denied_explicitly) { int i, error; const struct acl_entry *ae; if (denied_explicitly != NULL) *denied_explicitly = 0; KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES); for (i = 0; i < aclp->acl_cnt; i++) { ae = &(aclp->acl_entry[i]); if (ae->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && ae->ae_entry_type != ACL_ENTRY_TYPE_DENY) continue; if (ae->ae_flags & ACL_ENTRY_INHERIT_ONLY) continue; switch (ae->ae_tag) { case ACL_USER_OBJ: if (kauth_cred_geteuid(cred) != file_uid) continue; break; case ACL_USER: if (kauth_cred_geteuid(cred) != ae->ae_id) continue; break; case ACL_GROUP_OBJ: error = kauth_cred_groupmember(cred, file_gid); if (error > 0) return error; if (error != 0) continue; break; case ACL_GROUP: error = kauth_cred_groupmember(cred, ae->ae_id); if (error > 0) return error; if (error != 0) continue; break; default: KASSERT(ae->ae_tag == ACL_EVERYONE); } if (ae->ae_entry_type == ACL_ENTRY_TYPE_DENY) { if (ae->ae_perm & access_mask) { if (denied_explicitly != NULL) *denied_explicitly = 1; return (1); } } access_mask &= ~(ae->ae_perm); if (access_mask == 0) return (0); } if (access_mask == 0) return (0); return (1); } int genfs_can_access_acl_nfs4(vnode_t *vp, kauth_cred_t cred, uid_t file_uid, gid_t file_gid, mode_t file_mode, struct acl *aclp, accmode_t accmode) { int denied, explicitly_denied, access_mask, is_directory, must_be_owner = 0; file_mode = 0; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND | VEXPLICIT_DENY | VREAD_NAMED_ATTRS | VWRITE_NAMED_ATTRS | VDELETE_CHILD | VREAD_ATTRIBUTES | VWRITE_ATTRIBUTES | VDELETE | VREAD_ACL | VWRITE_ACL | VWRITE_OWNER | VSYNCHRONIZE)) == 0); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE)); if (accmode & VADMIN) must_be_owner = 1; /* * Ignore VSYNCHRONIZE permission. */ accmode &= ~VSYNCHRONIZE; access_mask = _access_mask_from_accmode(accmode); if (vp && vp->v_type == VDIR) is_directory = 1; else is_directory = 0; /* * File owner is always allowed to read and write the ACL * and basic attributes. This is to prevent a situation * where user would change ACL in a way that prevents him * from undoing the change. */ if (kauth_cred_geteuid(cred) == file_uid) access_mask &= ~(ACL_READ_ACL | ACL_WRITE_ACL | ACL_READ_ATTRIBUTES | ACL_WRITE_ATTRIBUTES); /* * Ignore append permission for regular files; use write * permission instead. */ if (!is_directory && (access_mask & ACL_APPEND_DATA)) { access_mask &= ~ACL_APPEND_DATA; access_mask |= ACL_WRITE_DATA; } denied = _acl_denies(aclp, access_mask, cred, file_uid, file_gid, &explicitly_denied); if (must_be_owner) { if (kauth_cred_geteuid(cred) != file_uid) denied = EPERM; } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. We have to check the mode here to stay * consistent with execve(2). See the test in * exec_check_permissions(). */ __acl_nfs4_sync_mode_from_acl(&file_mode, aclp); if (!denied && !is_directory && (accmode & VEXEC) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) denied = EACCES; if (!denied) return (0); /* * Access failed. Iff it was not denied explicitly and * VEXPLICIT_DENY flag was specified, allow access. */ if ((accmode & VEXPLICIT_DENY) && explicitly_denied == 0) return (0); accmode &= ~VEXPLICIT_DENY; if (accmode & (VADMIN_PERMS | VDELETE_CHILD | VDELETE)) denied = EPERM; else denied = EACCES; return (denied); } /* * Common routine to check if chmod() is allowed. 
* * Policy: * - You must own the file, and * - You must not set the "sticky" bit (meaningless, see chmod(2)) * - You must be a member of the group if you're trying to set the * SGIDf bit * * vp - vnode of the file-system object * cred - credentials of the invoker * cur_uid, cur_gid - current uid/gid of the file-system object * new_mode - new mode for the file-system object * * Returns 0 if the change is allowed, or an error value otherwise. */ int genfs_can_chmod(vnode_t *vp, kauth_cred_t cred, uid_t cur_uid, gid_t cur_gid, mode_t new_mode) { int error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred)) != 0) return (error); /* * Unprivileged users can't set the sticky bit on files. */ if ((vp->v_type != VDIR) && (new_mode & S_ISTXT)) return (EFTYPE); /* * If the invoker is trying to set the SGID bit on the file, * check group membership. */ if (new_mode & S_ISGID) { int ismember; error = kauth_cred_ismember_gid(cred, cur_gid, &ismember); if (error || !ismember) return (EPERM); } /* * Deny setting setuid if we are not the file owner. */ if ((new_mode & S_ISUID) && cur_uid != kauth_cred_geteuid(cred)) return (EPERM); return (0); } /* * Common routine to check if chown() is allowed. * * Policy: * - You must own the file, and * - You must not try to change ownership, and * - You must be member of the new group * * vp - vnode * cred - credentials of the invoker * cur_uid, cur_gid - current uid/gid of the file-system object * new_uid, new_gid - target uid/gid of the file-system object * * Returns 0 if the change is allowed, or an error value otherwise. */ int genfs_can_chown(vnode_t *vp, kauth_cred_t cred, uid_t cur_uid, gid_t cur_gid, uid_t new_uid, gid_t new_gid) { int error, ismember; /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred)) != 0) return (error); /* * You can only change ownership of a file if: * You own the file and... */ if (kauth_cred_geteuid(cred) == cur_uid) { /* * You don't try to change ownership, and... */ if (new_uid != cur_uid) return (EPERM); /* * You don't try to change group (no-op), or... */ if (new_gid == cur_gid) return (0); /* * Your effective gid is the new gid, or... */ if (kauth_cred_getegid(cred) == new_gid) return (0); /* * The new gid is one you're a member of. */ ismember = 0; error = kauth_cred_ismember_gid(cred, new_gid, &ismember); if (!error && ismember) return (0); } return (EPERM); } int genfs_can_chtimes(vnode_t *vp, kauth_cred_t cred, uid_t owner_uid, u_int vaflags) { int error; /* * Grant permission if the caller is the owner of the file, or * the super-user, or has ACL_WRITE_ATTRIBUTES permission on * on the file. If the time pointer is null, then write * permission on the file is also sufficient. * * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES * will be allowed to set the times [..] to the current * server time. */ error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred); if (error != 0 && (vaflags & VA_UTIMES_NULL) != 0) error = VOP_ACCESS(vp, VWRITE, cred); if (error) return (vaflags & VA_UTIMES_NULL) == 0 ? EPERM : EACCES; return 0; } /* * Common routine to check if chflags() is allowed. * * Policy: * - You must own the file, and * - You must not change system flags, and * - You must not change flags on character/block devices. 
* * vp - vnode * cred - credentials of the invoker * owner_uid - uid of the file-system object * changing_sysflags - true if the invoker wants to change system flags */ int genfs_can_chflags(vnode_t *vp, kauth_cred_t cred, uid_t owner_uid, bool changing_sysflags) { /* The user must own the file. */ if (kauth_cred_geteuid(cred) != owner_uid) { return EPERM; } if (changing_sysflags) { return EPERM; } /* * Unprivileged users cannot change the flags on devices, even if they * own them. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { return EPERM; } return 0; } /* * Common "sticky" policy. * * When a directory is "sticky" (as determined by the caller), this * function may help implementing the following policy: * - Renaming a file in it is only possible if the user owns the directory * or the file being renamed. * - Deleting a file from it is only possible if the user owns the * directory or the file being deleted. */ int genfs_can_sticky(vnode_t *vp, kauth_cred_t cred, uid_t dir_uid, uid_t file_uid) { if (kauth_cred_geteuid(cred) != dir_uid && kauth_cred_geteuid(cred) != file_uid) return EPERM; return 0; } int genfs_can_extattr(vnode_t *vp, kauth_cred_t cred, accmode_t accmode, int attrnamespace) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return 0; switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: return kauth_authorize_system(cred, KAUTH_SYSTEM_FS_EXTATTR, 0, vp->v_mount, NULL, NULL); case EXTATTR_NAMESPACE_USER: return VOP_ACCESS(vp, accmode, cred); default: return EPERM; } } int genfs_access(void *v) { struct vop_access_args *ap = v; KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0); return VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred); } int genfs_accessx(void *v) { struct vop_accessx_args *ap = v; int error; accmode_t accmode = ap->a_accmode; error = vfs_unixify_accmode(&accmode); if (error != 0) return error; if (accmode == 0) return 0; return VOP_ACCESS(ap->a_vp, accmode, ap->a_cred); } /* * genfs_pathconf: * * Standard implementation of POSIX pathconf, to get information about limits * for a filesystem. * Override per filesystem for the case where the filesystem has smaller * limits. */ int genfs_pathconf(void *v) { struct vop_pathconf_args *ap = v; switch (ap->a_name) { case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return 0; case _PC_ACL_EXTENDED: case _PC_ACL_NFS4: *ap->a_retval = 0; return 0; default: return EINVAL; } }
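The EVFILT_READ/EVFILT_WRITE/EVFILT_VNODE filter operations installed by genfs_kqfilter() earlier in this file are driven from userland through kqueue(2). The sketch below is a hypothetical consumer, not NetBSD code: on filesystems that use genfs_kqfilter(), registering EVFILT_VNODE on a file descriptor attaches a knote that is serviced by filt_genfsvnode(); the watched path is an assumption.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	struct kevent kev, ev;
	int fd, kq;

	/* The path being watched is only an example. */
	if ((fd = open("/tmp/watched-file", O_RDONLY)) == -1)
		err(1, "open");
	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* Register interest in vnode events on the open file. */
	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	    NOTE_WRITE | NOTE_DELETE | NOTE_RENAME, 0, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");

	/* Block until one of the requested events fires. */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
		err(1, "kevent: wait");

	printf("fflags %#x%s%s%s\n", (unsigned)ev.fflags,
	    (ev.fflags & NOTE_WRITE) ? " write" : "",
	    (ev.fflags & NOTE_DELETE) ? " delete" : "",
	    (ev.fflags & NOTE_RENAME) ? " rename" : "");
	return 0;
}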
/* $NetBSD: uvm_fault_i.h,v 1.33 2020/02/23 15:46:43 ad Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: Id: uvm_fault_i.h,v 1.1.6.1 1997/12/08 16:07:12 chuck Exp */ #ifndef _UVM_UVM_FAULT_I_H_ #define _UVM_UVM_FAULT_I_H_ /* * uvm_fault_i.h: fault inline functions */ void uvmfault_update_stats(struct uvm_faultinfo *); /* * uvmfault_unlockmaps: unlock the maps */ static __inline void uvmfault_unlockmaps(struct uvm_faultinfo *ufi, bool write_locked) { /* * ufi can be NULL when this isn't really a fault, * but merely paging in anon data. */ if (ufi == NULL) { return; } #ifndef __HAVE_NO_PMAP_STATS uvmfault_update_stats(ufi); #endif if (write_locked) { vm_map_unlock(ufi->map); } else { vm_map_unlock_read(ufi->map); } } /* * uvmfault_unlockall: unlock everything passed in. * * => maps must be read-locked (not write-locked). */ static __inline void uvmfault_unlockall(struct uvm_faultinfo *ufi, struct vm_amap *amap, struct uvm_object *uobj) { if (uobj) rw_exit(uobj->vmobjlock); if (amap) amap_unlock(amap); uvmfault_unlockmaps(ufi, false); } /* * uvmfault_lookup: lookup a virtual address in a map * * => caller must provide a uvm_faultinfo structure with the IN * params properly filled in * => we will lookup the map entry (handling submaps) as we go * => if the lookup is a success we will return with the maps locked * => if "write_lock" is true, we write_lock the map, otherwise we only * get a read lock. 
* => note that submaps can only appear in the kernel and they are * required to use the same virtual addresses as the map they * are referenced by (thus address translation between the main * map and the submap is unnecessary). */ static __inline bool uvmfault_lookup(struct uvm_faultinfo *ufi, bool write_lock) { struct vm_map *tmpmap; /* * init ufi values for lookup. */ ufi->map = ufi->orig_map; ufi->size = ufi->orig_size; /* * keep going down levels until we are done. note that there can * only be two levels so we won't loop very long. */ for (;;) { /* * lock map */ if (write_lock) { vm_map_lock(ufi->map); } else { vm_map_lock_read(ufi->map); } /* * lookup */ if (!uvm_map_lookup_entry(ufi->map, ufi->orig_rvaddr, &ufi->entry)) { uvmfault_unlockmaps(ufi, write_lock); return(false); } /* * reduce size if necessary */ if (ufi->entry->end - ufi->orig_rvaddr < ufi->size) ufi->size = ufi->entry->end - ufi->orig_rvaddr; /* * submap? replace map with the submap and lookup again. * note: VAs in submaps must match VAs in main map. */ if (UVM_ET_ISSUBMAP(ufi->entry)) { tmpmap = ufi->entry->object.sub_map; if (write_lock) { vm_map_unlock(ufi->map); } else { vm_map_unlock_read(ufi->map); } ufi->map = tmpmap; continue; } /* * got it! */ ufi->mapv = ufi->map->timestamp; return(true); } /* while loop */ /*NOTREACHED*/ } /* * uvmfault_relock: attempt to relock the same version of the map * * => fault data structures should be unlocked before calling. * => if a success (true) maps will be locked after call. */ static __inline bool uvmfault_relock(struct uvm_faultinfo *ufi) { /* * ufi can be NULL when this isn't really a fault, * but merely paging in anon data. */ if (ufi == NULL) { return true; } cpu_count(CPU_COUNT_FLTRELCK, 1); /* * relock map. fail if version mismatch (in which case nothing * gets locked). */ vm_map_lock_read(ufi->map); if (ufi->mapv != ufi->map->timestamp) { vm_map_unlock_read(ufi->map); return(false); } cpu_count(CPU_COUNT_FLTRELCKOK, 1); return(true); } #endif /* _UVM_UVM_FAULT_I_H_ */
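uvmfault_relock() detects concurrent map changes by comparing the map timestamp recorded at lookup time with the current one after re-taking the lock. The following sketch illustrates the same version-check pattern outside the kernel, assuming a hypothetical versioned_map structure protected by a pthread rwlock; all names in it are invented for the example.

#include <pthread.h>

#include <stdbool.h>
#include <stdio.h>

struct versioned_map {
	pthread_rwlock_t lock;
	unsigned long timestamp;	/* bumped on every write-locked change */
};

/*
 * Re-take the read lock and fail if the map version changed while the
 * lock was dropped; the caller must then redo its lookup, just as a
 * failed uvmfault_relock() forces the fault handler to restart.
 */
static bool
map_relock(struct versioned_map *m, unsigned long saved_version)
{
	pthread_rwlock_rdlock(&m->lock);
	if (m->timestamp != saved_version) {
		pthread_rwlock_unlock(&m->lock);
		return false;
	}
	return true;	/* still consistent; returns with the lock held */
}

int
main(void)
{
	struct versioned_map m;
	unsigned long v;
	bool ok;

	pthread_rwlock_init(&m.lock, NULL);
	m.timestamp = 0;

	/* Lookup phase: remember the version observed under the lock. */
	pthread_rwlock_rdlock(&m.lock);
	v = m.timestamp;
	pthread_rwlock_unlock(&m.lock);

	/* Relock phase: succeeds here because nothing modified the map. */
	ok = map_relock(&m, v);
	printf("relock %s\n", ok ? "ok" : "stale, retry lookup");
	if (ok)
		pthread_rwlock_unlock(&m.lock);
	return 0;
}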
/* $NetBSD: uhub.c,v 1.162 2024/05/04 12:49:15 mlelstv Exp $ */ /* $FreeBSD: src/sys/dev/usb/uhub.c,v 1.18 1999/11/17 22:33:43 n_hibma Exp $ */ /* $OpenBSD: uhub.c,v 1.86 2015/06/29 18:27:40 mpi Exp $ */ /* * Copyright (c) 1998, 2004 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ /* * USB spec: http://www.usb.org/developers/docs/usbspec.zip */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uhub.c,v 1.162 2024/05/04 12:49:15 mlelstv Exp $"); #ifdef _KERNEL_OPT #include "opt_usb.h" #endif #include <sys/param.h> #include <sys/bus.h> #include <sys/device.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/proc.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/kcov.h> #include <sys/sdt.h> #include <dev/usb/usb.h> #include <dev/usb/usb_sdt.h> #include <dev/usb/usbdi.h> #include <dev/usb/usbdi_util.h> #include <dev/usb/usbdivar.h> #include <dev/usb/usbhist.h> SDT_PROBE_DEFINE1(usb, hub, explore, start, "struct usbd_device *"/*hub*/); SDT_PROBE_DEFINE1(usb, hub, explore, done, "struct usbd_device *"/*hub*/); SDT_PROBE_DEFINE3(usb, hub, explore, rescan, "struct usbd_device *"/*hub*/, "int"/*portno*/, "struct usbd_port *"/*port*/); SDT_PROBE_DEFINE5(usb, hub, explore, portstat, "struct usbd_device *"/*hub*/, "int"/*portno*/, "int"/*status*/, "int"/*change*/, "int"/*reattach*/); SDT_PROBE_DEFINE3(usb, hub, explore, disconnect, "struct usbd_device *"/*hub*/, "int"/*portno*/, "struct usbd_port *"/*port*/); SDT_PROBE_DEFINE5(usb, hub, explore, reset, "struct usbd_device *"/*hub*/, "int"/*portno*/, "struct usbd_port *"/*port*/, "int"/*status*/, "int"/*change*/); SDT_PROBE_DEFINE4(usb, hub, explore, connect, "struct usbd_device *"/*hub*/, "int"/*portno*/, "struct usbd_port *"/*port*/, "int"/*speed*/); SDT_PROBE_DEFINE4(usb, hub, explore, connected, "struct usbd_device *"/*hub*/, "int"/*portno*/, "struct usbd_port *"/*port*/, "int"/*speed*/); SDT_PROBE_DEFINE2(usb, hub, interrupt, , "struct usbd_device *"/*hub*/, "usbd_status"/*status*/); #ifdef USB_DEBUG #ifndef UHUB_DEBUG #define uhubdebug 0 #else static int uhubdebug = 0; SYSCTL_SETUP(sysctl_hw_uhub_setup, "sysctl hw.uhub setup") { int err; const struct sysctlnode *rnode; const struct sysctlnode *cnode; err = sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "uhub", SYSCTL_DESCR("uhub global controls"), NULL, 0, NULL, 0, CTL_HW, CTL_CREATE, CTL_EOL); if (err) goto fail; /* control debugging printfs */ err = sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "debug", SYSCTL_DESCR("Enable debugging output"), NULL, 0, &uhubdebug, sizeof(uhubdebug), CTL_CREATE, CTL_EOL); if (err) goto fail; return; fail: aprint_error("%s: sysctl_createv failed (err = %d)\n", __func__, err); } #endif /* UHUB_DEBUG */ #endif /* USB_DEBUG */ #define DPRINTF(FMT,A,B,C,D) USBHIST_LOGN(uhubdebug,1,FMT,A,B,C,D) #define DPRINTFN(N,FMT,A,B,C,D) USBHIST_LOGN(uhubdebug,N,FMT,A,B,C,D) #define UHUBHIST_FUNC() USBHIST_FUNC() #define UHUBHIST_CALLED(name) USBHIST_CALLED(uhubdebug) #define UHUBHIST_CALLARGS(FMT,A,B,C,D) \ USBHIST_CALLARGS(uhubdebug,FMT,A,B,C,D) struct uhub_softc { device_t sc_dev; /* base device */ struct usbd_device *sc_hub; /* USB device */ int sc_proto; /* device protocol */ struct usbd_pipe *sc_ipipe; /* interrupt pipe */ kmutex_t sc_lock; kcondvar_t sc_cv; uint8_t *sc_statusbuf; uint8_t *sc_statuspend; uint8_t *sc_status; size_t sc_statuslen; bool sc_explorepending; bool sc_first_explore; bool sc_running; bool sc_rescan; struct lwp *sc_exploring; }; #define UHUB_IS_HIGH_SPEED(sc) \ ((sc)->sc_proto == UDPROTO_HSHUBSTT || (sc)->sc_proto == UDPROTO_HSHUBMTT) #define UHUB_IS_SINGLE_TT(sc) ((sc)->sc_proto == UDPROTO_HSHUBSTT) #define PORTSTAT_ISSET(sc, port) \ ((sc)->sc_status[(port) / 8] & (1 << ((port) % 8))) Static usbd_status uhub_explore(struct 
usbd_device *); Static void uhub_intr(struct usbd_xfer *, void *, usbd_status); /* * We need two attachment points: * hub to usb and hub to hub * Every other driver only connects to hubs */ static int uhub_match(device_t, cfdata_t, void *); static void uhub_attach(device_t, device_t, void *); static int uhub_rescan(device_t, const char *, const int *); static void uhub_childdet(device_t, device_t); static int uhub_detach(device_t, int); CFATTACH_DECL3_NEW(uhub, sizeof(struct uhub_softc), uhub_match, uhub_attach, uhub_detach, NULL, uhub_rescan, uhub_childdet, DVF_DETACH_SHUTDOWN); CFATTACH_DECL3_NEW(uroothub, sizeof(struct uhub_softc), uhub_match, uhub_attach, uhub_detach, NULL, uhub_rescan, uhub_childdet, DVF_DETACH_SHUTDOWN); /* * Setting this to 1 makes sure than an uhub attaches even at higher * priority than ugen when ugen_override is set to 1. This allows to * probe the whole USB bus and attach functions with ugen. */ int uhub_ubermatch = 0; static usbd_status usbd_get_hub_desc(struct usbd_device *dev, usb_hub_descriptor_t *hd, int speed) { usb_device_request_t req; usbd_status err; int nports; UHUBHIST_FUNC(); UHUBHIST_CALLED(); /* don't issue UDESC_HUB to SS hub, or it would stall */ if (dev->ud_depth != 0 && USB_IS_SS(dev->ud_speed)) { usb_hub_ss_descriptor_t hssd; int rmvlen; memset(&hssd, 0, sizeof(hssd)); req.bmRequestType = UT_READ_CLASS_DEVICE; req.bRequest = UR_GET_DESCRIPTOR; USETW2(req.wValue, UDESC_SS_HUB, 0); USETW(req.wIndex, 0); USETW(req.wLength, USB_HUB_SS_DESCRIPTOR_SIZE); DPRINTFN(1, "getting sshub descriptor", 0, 0, 0, 0); err = usbd_do_request(dev, &req, &hssd); nports = hssd.bNbrPorts; if (dev->ud_depth != 0 && nports > UHD_SS_NPORTS_MAX) { DPRINTF("num of ports %jd exceeds maxports %jd", nports, UHD_SS_NPORTS_MAX, 0, 0); nports = hd->bNbrPorts = UHD_SS_NPORTS_MAX; } rmvlen = (nports + 7) / 8; hd->bDescLength = USB_HUB_DESCRIPTOR_SIZE + (rmvlen > 1 ? rmvlen : 1) - 1; memcpy(hd->DeviceRemovable, hssd.DeviceRemovable, rmvlen); hd->bDescriptorType = hssd.bDescriptorType; hd->bNbrPorts = hssd.bNbrPorts; hd->wHubCharacteristics[0] = hssd.wHubCharacteristics[0]; hd->wHubCharacteristics[1] = hssd.wHubCharacteristics[1]; hd->bPwrOn2PwrGood = hssd.bPwrOn2PwrGood; hd->bHubContrCurrent = hssd.bHubContrCurrent; } else { req.bmRequestType = UT_READ_CLASS_DEVICE; req.bRequest = UR_GET_DESCRIPTOR; USETW2(req.wValue, UDESC_HUB, 0); USETW(req.wIndex, 0); USETW(req.wLength, USB_HUB_DESCRIPTOR_SIZE); DPRINTFN(1, "getting hub descriptor", 0, 0, 0, 0); err = usbd_do_request(dev, &req, hd); nports = hd->bNbrPorts; if (!err && nports > 7) { USETW(req.wLength, USB_HUB_DESCRIPTOR_SIZE + (nports+1) / 8); err = usbd_do_request(dev, &req, hd); } } return err; } static usbd_status usbd_set_hub_depth(struct usbd_device *dev, int depth) { usb_device_request_t req; req.bmRequestType = UT_WRITE_CLASS_DEVICE; req.bRequest = UR_SET_HUB_DEPTH; USETW(req.wValue, depth); USETW(req.wIndex, 0); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } static int uhub_match(device_t parent, cfdata_t match, void *aux) { struct usb_attach_arg *uaa = aux; int matchvalue; UHUBHIST_FUNC(); UHUBHIST_CALLED(); if (uhub_ubermatch) matchvalue = UMATCH_HIGHEST+1; else matchvalue = UMATCH_DEVCLASS_DEVSUBCLASS; DPRINTFN(5, "uaa=%#jx", (uintptr_t)uaa, 0, 0, 0); /* * The subclass for hubs seems to be 0 for some and 1 for others, * so we just ignore the subclass. 
*/ if (uaa->uaa_class == UDCLASS_HUB) return matchvalue; return UMATCH_NONE; } static void uhub_attach(device_t parent, device_t self, void *aux) { struct uhub_softc *sc = device_private(self); struct usb_attach_arg *uaa = aux; struct usbd_device *dev = uaa->uaa_device; char *devinfop; usbd_status err; struct usbd_hub *hub = NULL; usb_hub_descriptor_t hubdesc; int p, port, nports, nremov, pwrdly; struct usbd_interface *iface; usb_endpoint_descriptor_t *ed; struct usbd_tt *tts = NULL; UHUBHIST_FUNC(); UHUBHIST_CALLED(); KASSERT(usb_in_event_thread(parent)); config_pending_incr(self); sc->sc_dev = self; sc->sc_hub = dev; sc->sc_proto = uaa->uaa_proto; devinfop = usbd_devinfo_alloc(dev, 1); aprint_naive("\n"); aprint_normal(": %s\n", devinfop); usbd_devinfo_free(devinfop); if (dev->ud_depth > 0 && UHUB_IS_HIGH_SPEED(sc)) { aprint_normal_dev(self, "%s transaction translator%s\n", UHUB_IS_SINGLE_TT(sc) ? "single" : "multiple", UHUB_IS_SINGLE_TT(sc) ? "" : "s"); } err = usbd_set_config_index(dev, 0, 1); if (err) { DPRINTF("configuration failed, sc %#jx error %jd", (uintptr_t)sc, err, 0, 0); goto bad2; } if (dev->ud_depth > USB_HUB_MAX_DEPTH) { aprint_error_dev(self, "hub depth (%d) exceeded, hub ignored\n", USB_HUB_MAX_DEPTH); goto bad2; } /* Get hub descriptor. */ memset(&hubdesc, 0, sizeof(hubdesc)); err = usbd_get_hub_desc(dev, &hubdesc, dev->ud_speed); nports = hubdesc.bNbrPorts; if (err) { DPRINTF("getting hub descriptor failed, uhub%jd error %jd", device_unit(self), err, 0, 0); goto bad2; } for (nremov = 0, port = 1; port <= nports; port++) if (!UHD_NOT_REMOV(&hubdesc, port)) nremov++; aprint_verbose_dev(self, "%d port%s with %d removable, %s powered\n", nports, nports != 1 ? "s" : "", nremov, dev->ud_selfpowered ? "self" : "bus"); if (nports == 0) { aprint_debug_dev(self, "no ports, hub ignored\n"); goto bad; } hub = kmem_alloc(sizeof(*hub) + (nports-1) * sizeof(struct usbd_port), KM_SLEEP); dev->ud_hub = hub; dev->ud_hub->uh_hubsoftc = sc; hub->uh_explore = uhub_explore; hub->uh_hubdesc = hubdesc; if (USB_IS_SS(dev->ud_speed) && dev->ud_depth != 0) { aprint_debug_dev(self, "setting hub depth %u\n", dev->ud_depth - 1); err = usbd_set_hub_depth(dev, dev->ud_depth - 1); if (err) { aprint_error_dev(self, "can't set depth\n"); goto bad; } } /* Set up interrupt pipe. 
*/ err = usbd_device2interface_handle(dev, 0, &iface); if (err) { aprint_error_dev(self, "no interface handle\n"); goto bad; } if (UHUB_IS_HIGH_SPEED(sc) && !UHUB_IS_SINGLE_TT(sc)) { err = usbd_set_interface(iface, 1); if (err) aprint_error_dev(self, "can't enable multiple TTs\n"); } ed = usbd_interface2endpoint_descriptor(iface, 0); if (ed == NULL) { aprint_error_dev(self, "no endpoint descriptor\n"); goto bad; } if ((ed->bmAttributes & UE_XFERTYPE) != UE_INTERRUPT) { aprint_error_dev(self, "bad interrupt endpoint\n"); goto bad; } sc->sc_statuslen = (nports + 1 + 7) / 8; sc->sc_statusbuf = kmem_alloc(sc->sc_statuslen, KM_SLEEP); sc->sc_statuspend = kmem_zalloc(sc->sc_statuslen, KM_SLEEP); sc->sc_status = kmem_alloc(sc->sc_statuslen, KM_SLEEP); mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_SOFTUSB); cv_init(&sc->sc_cv, "uhubex"); /* force initial scan */ memset(sc->sc_status, 0xff, sc->sc_statuslen); sc->sc_explorepending = true; err = usbd_open_pipe_intr(iface, ed->bEndpointAddress, USBD_SHORT_XFER_OK|USBD_MPSAFE, &sc->sc_ipipe, sc, sc->sc_statusbuf, sc->sc_statuslen, uhub_intr, USBD_DEFAULT_INTERVAL); if (err) { aprint_error_dev(self, "cannot open interrupt pipe\n"); goto bad; } /* Wait with power off for a while if we are not a root hub */ if (dev->ud_powersrc->up_parent != NULL) usbd_delay_ms(dev, USB_POWER_DOWN_TIME); usbd_add_drv_event(USB_EVENT_DRIVER_ATTACH, dev, sc->sc_dev); /* * To have the best chance of success we do things in the exact same * order as Windows 98. This should not be necessary, but some * devices do not follow the USB specs to the letter. * * These are the events on the bus when a hub is attached: * Get device and config descriptors (see attach code) * Get hub descriptor (see above) * For all ports * turn on power * wait for power to become stable * (all below happens in explore code) * For all ports * clear C_PORT_CONNECTION * For all ports * get port status * if device connected * wait 100 ms * turn on reset * wait * clear C_PORT_RESET * get port status * proceed with device attachment */ if (UHUB_IS_HIGH_SPEED(sc) && nports > 0) { tts = kmem_alloc((UHUB_IS_SINGLE_TT(sc) ? 1 : nports) * sizeof(struct usbd_tt), KM_SLEEP); } /* Set up data structures */ for (p = 1; p <= nports; p++) { struct usbd_port *up = &hub->uh_ports[p - 1]; up->up_dev = NULL; up->up_parent = dev; up->up_portno = p; if (dev->ud_selfpowered) /* Self powered hub, give ports maximum current. */ up->up_power = USB_MAX_POWER; else up->up_power = USB_MIN_POWER; up->up_restartcnt = 0; up->up_reattach = 0; if (UHUB_IS_HIGH_SPEED(sc)) { up->up_tt = &tts[UHUB_IS_SINGLE_TT(sc) ? 0 : p - 1]; up->up_tt->utt_hub = hub; } else { up->up_tt = NULL; } } /* XXX should check for none, individual, or ganged power? */ pwrdly = dev->ud_hub->uh_hubdesc.bPwrOn2PwrGood * UHD_PWRON_FACTOR + USB_EXTRA_POWER_UP_TIME; for (port = 1; port <= nports; port++) { /* Turn the power on. */ err = usbd_set_port_feature(dev, port, UHF_PORT_POWER); if (err) aprint_error_dev(self, "port %d power on failed, %s\n", port, usbd_errstr(err)); DPRINTF("uhub%jd turn on port %jd power", device_unit(self), port, 0, 0); } /* Wait for stable power if we are not a root hub */ if (dev->ud_powersrc->up_parent != NULL) usbd_delay_ms(dev, pwrdly); /* The usual exploration will finish the setup. 
*/ sc->sc_running = true; sc->sc_first_explore = true; if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); return; bad: if (sc->sc_status) kmem_free(sc->sc_status, sc->sc_statuslen); if (sc->sc_statuspend) kmem_free(sc->sc_statuspend, sc->sc_statuslen); if (sc->sc_statusbuf) kmem_free(sc->sc_statusbuf, sc->sc_statuslen); if (hub) kmem_free(hub, sizeof(*hub) + (nports-1) * sizeof(struct usbd_port)); dev->ud_hub = NULL; bad2: config_pending_decr(self); } usbd_status uhub_explore(struct usbd_device *dev) { usb_hub_descriptor_t *hd = &dev->ud_hub->uh_hubdesc; struct uhub_softc *sc = dev->ud_hub->uh_hubsoftc; struct usbd_port *up; struct usbd_device *subdev; usbd_status err; int speed; int port; int change, status, reconnect, rescan; UHUBHIST_FUNC(); UHUBHIST_CALLARGS("uhub%jd dev=%#jx addr=%jd speed=%ju", device_unit(sc->sc_dev), (uintptr_t)dev, dev->ud_addr, dev->ud_speed); KASSERT(usb_in_event_thread(sc->sc_dev)); if (!sc->sc_running) return USBD_NOT_STARTED; /* Ignore hubs that are too deep. */ if (dev->ud_depth > USB_HUB_MAX_DEPTH) return USBD_TOO_DEEP; SDT_PROBE1(usb, hub, explore, start, dev); /* Process rescan if requested. */ mutex_enter(&sc->sc_lock); rescan = sc->sc_rescan; sc->sc_rescan = false; mutex_exit(&sc->sc_lock); if (rescan) { for (port = 1; port <= hd->bNbrPorts; port++) { SDT_PROBE3(usb, hub, explore, rescan, dev, port, &dev->ud_hub->uh_ports[port - 1]); subdev = dev->ud_hub->uh_ports[port - 1].up_dev; if (subdev == NULL) continue; usbd_reattach_device(sc->sc_dev, subdev, port, NULL); } } if (PORTSTAT_ISSET(sc, 0)) { /* hub status change */ usb_hub_status_t hs; err = usbd_get_hub_status(dev, &hs); if (err) { DPRINTF("uhub%jd get hub status failed, err %jd", device_unit(sc->sc_dev), err, 0, 0); } else { /* just acknowledge */ status = UGETW(hs.wHubStatus); change = UGETW(hs.wHubChange); SDT_PROBE5(usb, hub, explore, portstat, dev, /*portno*/0, status, change, /*reattach*/0); DPRINTF("uhub%jd s/c=%jx/%jx", device_unit(sc->sc_dev), status, change, 0); if (change & UHS_LOCAL_POWER) usbd_clear_hub_feature(dev, UHF_C_HUB_LOCAL_POWER); if (change & UHS_OVER_CURRENT) usbd_clear_hub_feature(dev, UHF_C_HUB_OVER_CURRENT); } } for (port = 1; port <= hd->bNbrPorts; port++) { up = &dev->ud_hub->uh_ports[port - 1]; /* reattach is needed after firmware upload */ reconnect = up->up_reattach; up->up_reattach = 0; status = change = 0; /* don't check if no change summary notification */ if (PORTSTAT_ISSET(sc, port) || reconnect) { err = usbd_get_port_status(dev, port, &up->up_status); if (err) { DPRINTF("uhub%jd get port stat failed, err %jd", device_unit(sc->sc_dev), err, 0, 0); continue; } status = UGETW(up->up_status.wPortStatus); change = UGETW(up->up_status.wPortChange); DPRINTF("uhub%jd port %jd: s/c=%jx/%jx", device_unit(sc->sc_dev), port, status, change); } SDT_PROBE5(usb, hub, explore, portstat, dev, port, status, change, reconnect); if (!change && !reconnect) { /* No status change, just do recursive explore. */ if (up->up_dev != NULL && up->up_dev->ud_hub != NULL) up->up_dev->ud_hub->uh_explore(up->up_dev); continue; } if (change & UPS_C_PORT_ENABLED) { DPRINTF("uhub%jd port %jd C_PORT_ENABLED", device_unit(sc->sc_dev), port, 0, 0); usbd_clear_port_feature(dev, port, UHF_C_PORT_ENABLE); if (change & UPS_C_CONNECT_STATUS) { /* Ignore the port error if the device vanished. */ } else if (status & UPS_PORT_ENABLED) { aprint_error_dev(sc->sc_dev, "illegal enable change, port %d\n", port); } else { /* Port error condition. 
*/ if (up->up_restartcnt) /* no message first time */ aprint_error_dev(sc->sc_dev, "port error, restarting port %d\n", port); if (up->up_restartcnt++ < USBD_RESTART_MAX) goto disco; else aprint_error_dev(sc->sc_dev, "port error, giving up port %d\n", port); } } if (change & UPS_C_PORT_RESET) { /* * some xHCs set PortResetChange instead of CSC * when port is reset. */ if ((status & UPS_CURRENT_CONNECT_STATUS) != 0) { change |= UPS_C_CONNECT_STATUS; } usbd_clear_port_feature(dev, port, UHF_C_PORT_RESET); } if (change & UPS_C_BH_PORT_RESET) { /* * some xHCs set WarmResetChange instead of CSC * when port is reset. */ if ((status & UPS_CURRENT_CONNECT_STATUS) != 0) { change |= UPS_C_CONNECT_STATUS; } usbd_clear_port_feature(dev, port, UHF_C_BH_PORT_RESET); } if (change & UPS_C_PORT_LINK_STATE) usbd_clear_port_feature(dev, port, UHF_C_PORT_LINK_STATE); if (change & UPS_C_PORT_CONFIG_ERROR) usbd_clear_port_feature(dev, port, UHF_C_PORT_CONFIG_ERROR); /* XXX handle overcurrent and resume events! */ if (!reconnect && !(change & UPS_C_CONNECT_STATUS)) { /* No status change, just do recursive explore. */ if (up->up_dev != NULL && up->up_dev->ud_hub != NULL) up->up_dev->ud_hub->uh_explore(up->up_dev); continue; } /* We have a connect status change, handle it. */ DPRINTF("uhub%jd status change port %jd", device_unit(sc->sc_dev), port, 0, 0); usbd_clear_port_feature(dev, port, UHF_C_PORT_CONNECTION); /* * If there is already a device on the port the change status * must mean that is has disconnected. Looking at the * current connect status is not enough to figure this out * since a new unit may have been connected before we handle * the disconnect. */ disco: if (up->up_dev != NULL) { /* Disconnected */ DPRINTF("uhub%jd device addr=%jd disappeared on " "port %jd", device_unit(sc->sc_dev), up->up_dev->ud_addr, port, 0); SDT_PROBE3(usb, hub, explore, disconnect, dev, port, up); usb_disconnect_port(up, sc->sc_dev, DETACH_FORCE); usbd_clear_port_feature(dev, port, UHF_C_PORT_CONNECTION); } if (!(status & UPS_CURRENT_CONNECT_STATUS)) { /* Nothing connected, just ignore it. */ DPRINTFN(3, "uhub%jd port %jd !CURRENT_CONNECT_STATUS", device_unit(sc->sc_dev), port, 0, 0); SDT_PROBE3(usb, hub, explore, disconnect, dev, port, up); usb_disconnect_port(up, sc->sc_dev, DETACH_FORCE); usbd_clear_port_feature(dev, port, UHF_C_PORT_CONNECTION); continue; } /* Connected */ DPRINTF("unit %jd dev->speed=%ju dev->depth=%ju", device_unit(sc->sc_dev), dev->ud_speed, dev->ud_depth, 0); /* Wait for maximum device power up time. */ usbd_delay_ms(dev, USB_PORT_POWERUP_DELAY); /* Reset port, which implies enabling it. */ if (usbd_reset_port(dev, port, &up->up_status)) { aprint_error_dev(sc->sc_dev, "port %d reset failed\n", port); continue; } #if 0 /* Get port status again, it might have changed during reset */ err = usbd_get_port_status(dev, port, &up->up_status); if (err) { DPRINTF("uhub%jd port %jd get port status failed, " "err %jd", device_unit(sc->sc_dev), port, err, 0); continue; } #endif /* * Use the port status from the reset to check for the device * disappearing, the port enable status, and the port speed */ status = UGETW(up->up_status.wPortStatus); change = UGETW(up->up_status.wPortChange); SDT_PROBE5(usb, hub, explore, reset, dev, port, up, status, change); DPRINTF("uhub%jd port %jd after reset: s/c=%jx/%jx", device_unit(sc->sc_dev), port, status, change); if (!(status & UPS_CURRENT_CONNECT_STATUS)) { /* Nothing connected, just ignore it. 
*/ #ifdef DIAGNOSTIC aprint_debug_dev(sc->sc_dev, "port %d, device disappeared after reset\n", port); #endif continue; } if (!(status & UPS_PORT_ENABLED)) { /* Not allowed send/receive packet. */ #ifdef DIAGNOSTIC printf("%s: port %d, device not enabled\n", device_xname(sc->sc_dev), port); #endif continue; } /* port reset may cause Warm Reset Change, drop it. */ if (change & UPS_C_BH_PORT_RESET) usbd_clear_port_feature(dev, port, UHF_C_BH_PORT_RESET); /* * Figure out device speed from power bit of port status. * USB 2.0 ch 11.24.2.7.1 * USB 3.1 ch 10.16.2.6.1 */ int sts = status; if ((sts & UPS_PORT_POWER) == 0) sts &= ~UPS_PORT_POWER_SS; if (sts & UPS_HIGH_SPEED) speed = USB_SPEED_HIGH; else if (sts & UPS_LOW_SPEED) speed = USB_SPEED_LOW; else { /* * If there is no power bit set, it is certainly * a Super Speed device, so use the speed of its * parent hub. */ if (sts & UPS_PORT_POWER) speed = USB_SPEED_FULL; else speed = dev->ud_speed; } /* * Reduce the speed, otherwise we won't setup the proper * transfer methods. */ if (speed > dev->ud_speed) speed = dev->ud_speed; DPRINTF("uhub%jd speed %ju", device_unit(sc->sc_dev), speed, 0, 0); /* * To check whether port has power, * check UPS_PORT_POWER_SS bit if port speed is SS, and * check UPS_PORT_POWER bit if port speed is HS/FS/LS. */ if (USB_IS_SS(speed)) { /* SS hub port */ if (!(status & UPS_PORT_POWER_SS)) aprint_normal_dev(sc->sc_dev, "strange, connected port %d has no power\n", port); } else { /* HS/FS/LS hub port */ if (!(status & UPS_PORT_POWER)) aprint_normal_dev(sc->sc_dev, "strange, connected port %d has no power\n", port); } if (dev->ud_bus->ub_hctype == USBHCTYPE_VHCI) { kcov_remote_enter(KCOV_REMOTE_VHCI, KCOV_REMOTE_VHCI_ID(dev->ud_bus->ub_busnum, port)); } SDT_PROBE4(usb, hub, explore, connect, dev, port, up, speed); /* Get device info and set its address. */ err = usbd_new_device(sc->sc_dev, dev->ud_bus, dev->ud_depth + 1, speed, port, up); if (dev->ud_bus->ub_hctype == USBHCTYPE_VHCI) { kcov_remote_leave(KCOV_REMOTE_VHCI, KCOV_REMOTE_VHCI_ID(dev->ud_bus->ub_busnum, port)); } /* XXX retry a few times? */ if (err) { DPRINTF("uhub%jd: usbd_new_device failed, error %jd", device_unit(sc->sc_dev), err, 0, 0); /* Avoid addressing problems by disabling. */ /* usbd_reset_port(dev, port, &up->status); */ /* * The unit refused to accept a new address, or had * some other serious problem. Since we cannot leave * at 0 we have to disable the port instead. */ device_printf(sc->sc_dev, "device problem, disabling port %d\n", port); usbd_clear_port_feature(dev, port, UHF_PORT_ENABLE); } else { SDT_PROBE4(usb, hub, explore, connected, dev, port, up, speed); /* The port set up succeeded, reset error count. */ up->up_restartcnt = 0; if (up->up_dev->ud_hub) up->up_dev->ud_hub->uh_explore(up->up_dev); } } mutex_enter(&sc->sc_lock); sc->sc_explorepending = false; for (int i = 0; i < sc->sc_statuslen; i++) { if (sc->sc_statuspend[i] != 0) { memcpy(sc->sc_status, sc->sc_statuspend, sc->sc_statuslen); memset(sc->sc_statuspend, 0, sc->sc_statuslen); usb_needs_explore(sc->sc_hub); break; } } mutex_exit(&sc->sc_lock); if (sc->sc_first_explore) { config_pending_decr(sc->sc_dev); sc->sc_first_explore = false; } SDT_PROBE1(usb, hub, explore, done, dev); return USBD_NORMAL_COMPLETION; } /* * Called from process context when the hub is gone. * Detach all devices on active ports. 
*/ static int uhub_detach(device_t self, int flags) { struct uhub_softc *sc = device_private(self); struct usbd_hub *hub = sc->sc_hub->ud_hub; struct usbd_port *rup; int nports, port, rc; UHUBHIST_FUNC(); UHUBHIST_CALLED(); DPRINTF("uhub%jd flags=%jd", device_unit(self), flags, 0, 0); if (hub == NULL) /* Must be partially working */ return 0; /* XXXSMP usb */ KERNEL_LOCK(1, curlwp); nports = hub->uh_hubdesc.bNbrPorts; for (port = 1; port <= nports; port++) { rup = &hub->uh_ports[port - 1]; if (rup->up_dev == NULL) continue; if ((rc = usb_disconnect_port(rup, self, flags)) != 0) { /* XXXSMP usb */ KERNEL_UNLOCK_ONE(curlwp); return rc; } } pmf_device_deregister(self); usbd_abort_pipe(sc->sc_ipipe); usbd_close_pipe(sc->sc_ipipe); usbd_add_drv_event(USB_EVENT_DRIVER_DETACH, sc->sc_hub, sc->sc_dev); if (hub->uh_ports[0].up_tt) kmem_free(hub->uh_ports[0].up_tt, (UHUB_IS_SINGLE_TT(sc) ? 1 : nports) * sizeof(struct usbd_tt)); kmem_free(hub, sizeof(*hub) + (nports-1) * sizeof(struct usbd_port)); sc->sc_hub->ud_hub = NULL; if (sc->sc_status) kmem_free(sc->sc_status, sc->sc_statuslen); if (sc->sc_statuspend) kmem_free(sc->sc_statuspend, sc->sc_statuslen); if (sc->sc_statusbuf) kmem_free(sc->sc_statusbuf, sc->sc_statuslen); cv_destroy(&sc->sc_cv); mutex_destroy(&sc->sc_lock); /* XXXSMP usb */ KERNEL_UNLOCK_ONE(curlwp); return 0; } static int uhub_rescan(device_t self, const char *ifattr, const int *locators) { struct uhub_softc *sc = device_private(self); UHUBHIST_FUNC(); UHUBHIST_CALLARGS("uhub%jd", device_unit(sc->sc_dev), 0, 0, 0); KASSERT(KERNEL_LOCKED_P()); /* Trigger bus exploration. */ /* XXX locators */ mutex_enter(&sc->sc_lock); sc->sc_rescan = true; mutex_exit(&sc->sc_lock); usb_needs_explore(sc->sc_hub); return 0; } /* Called when a device has been detached from it */ static void uhub_childdet(device_t self, device_t child) { struct uhub_softc *sc = device_private(self); struct usbd_device *devhub = sc->sc_hub; struct usbd_device *dev; int nports; int port; int i; KASSERT(KERNEL_LOCKED_P()); if (!devhub->ud_hub) /* should never happen; children are only created after init */ panic("hub not fully initialised, but child deleted?"); nports = devhub->ud_hub->uh_hubdesc.bNbrPorts; for (port = 1; port <= nports; port++) { dev = devhub->ud_hub->uh_ports[port - 1].up_dev; if (!dev || dev->ud_subdevlen == 0) continue; for (i = 0; i < dev->ud_subdevlen; i++) { if (dev->ud_subdevs[i] == child) { dev->ud_subdevs[i] = NULL; dev->ud_nifaces_claimed--; } } if (dev->ud_nifaces_claimed == 0) { kmem_free(dev->ud_subdevs, dev->ud_subdevlen * sizeof(device_t)); dev->ud_subdevs = NULL; dev->ud_subdevlen = 0; } } } /* * Hub interrupt. * This an indication that some port has changed status. * Notify the bus event handler thread that we need * to be explored again. */ void uhub_intr(struct usbd_xfer *xfer, void *addr, usbd_status status) { struct uhub_softc *sc = addr; UHUBHIST_FUNC(); UHUBHIST_CALLARGS("called! 
uhub%jd status=%jx", device_unit(sc->sc_dev), status, 0, 0); SDT_PROBE2(usb, hub, interrupt, , sc->sc_hub, status); if (status == USBD_STALLED) usbd_clear_endpoint_stall_async(sc->sc_ipipe); else if (status == USBD_NORMAL_COMPLETION) { mutex_enter(&sc->sc_lock); DPRINTFN(5, "uhub%jd: explore pending %jd", device_unit(sc->sc_dev), sc->sc_explorepending, 0, 0); /* merge port bitmap into pending interrupts list */ for (size_t i = 0; i < sc->sc_statuslen; i++) { sc->sc_statuspend[i] |= sc->sc_statusbuf[i]; DPRINTFN(5, "uhub%jd: pending/new ports " "[%jd] %#jx/%#jx", device_unit(sc->sc_dev), i, sc->sc_statuspend[i], sc->sc_statusbuf[i]); } if (!sc->sc_explorepending) { sc->sc_explorepending = true; memcpy(sc->sc_status, sc->sc_statuspend, sc->sc_statuslen); memset(sc->sc_statuspend, 0, sc->sc_statuslen); for (size_t i = 0; i < sc->sc_statuslen; i++) { DPRINTFN(5, "uhub%jd: exploring ports " "[%jd] %#jx", device_unit(sc->sc_dev), i, sc->sc_status[i], 0); } usb_needs_explore(sc->sc_hub); } mutex_exit(&sc->sc_lock); } }
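/*
 * Illustrative sketch (separate from uhub.c, hypothetical names): the hub
 * driver keeps one change bit per port, with bit 0 standing for the hub
 * itself.  PORTSTAT_ISSET() tests sc_status[port / 8] & (1 << (port % 8)),
 * and uhub_intr() ORs each new interrupt report into sc_statuspend before an
 * explore pass copies it to sc_status.  The small standalone userland
 * program below demonstrates that same test-and-merge bitmap pattern.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Test whether "port" (0 = hub, 1..n = ports) is flagged in the bitmap. */
static bool
port_change_isset(const uint8_t *bitmap, int port)
{
	return (bitmap[port / 8] & (1 << (port % 8))) != 0;
}

/* Merge a freshly received change bitmap into the pending one. */
static void
port_change_merge(uint8_t *pending, const uint8_t *fresh, size_t len)
{
	for (size_t i = 0; i < len; i++)
		pending[i] |= fresh[i];
}

int
main(void)
{
	uint8_t pending[2] = { 0, 0 };
	uint8_t fresh[2] = { 0x04, 0x01 };	/* ports 2 and 8 changed */

	port_change_merge(pending, fresh, sizeof(pending));
	printf("port 2 changed: %d\n", port_change_isset(pending, 2));	/* 1 */
	printf("port 3 changed: %d\n", port_change_isset(pending, 3));	/* 0 */
	return 0;
}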
/*	$NetBSD: vfs_syscalls_30.c,v 1.45 2022/03/12 20:46:03 riastradh Exp $	*/

/*-
 * Copyright (c) 2005, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Christos Zoulas.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_30.c,v 1.45 2022/03/12 20:46:03 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/socketvar.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/uio.h> #include <sys/dirent.h> #include <sys/malloc.h> #include <sys/kauth.h> #include <sys/vfs_syscalls.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <compat/common/compat_mod.h> #include <compat/common/compat_util.h> #include <compat/sys/stat.h> #include <compat/sys/dirent.h> #include <compat/sys/mount.h> #include <compat/sys/statvfs.h> static const struct syscall_package vfs_syscalls_30_syscalls[] = { { SYS_compat_30___fhstat30, 0, (sy_call_t *)compat_30_sys___fhstat30 }, { SYS_compat_30___fstat13, 0, (sy_call_t *)compat_30_sys___fstat13 }, { SYS_compat_30___lstat13, 0, (sy_call_t *)compat_30_sys___lstat13 }, { SYS_compat_30___stat13, 0, (sy_call_t *)compat_30_sys___stat13 }, { SYS_compat_30_fhopen, 0, (sy_call_t *)compat_30_sys_fhopen }, { SYS_compat_30_fhstat, 0, (sy_call_t *)compat_30_sys_fhstat }, { SYS_compat_30_fhstatvfs1, 0, (sy_call_t *)compat_30_sys_fhstatvfs1 }, { SYS_compat_30_getdents, 0, (sy_call_t *)compat_30_sys_getdents }, { SYS_compat_30_getfh, 0, (sy_call_t *)compat_30_sys_getfh }, { 0,0, NULL } }; /* * Convert from a new to an old stat structure. */ static void cvtstat(struct stat13 *ost, const struct stat *st) { /* Handle any padding. */ memset(ost, 0, sizeof(*ost)); ost->st_dev = st->st_dev; ost->st_ino = (uint32_t)st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; timespec_to_timespec50(&st->st_atimespec, &ost->st_atimespec); timespec_to_timespec50(&st->st_mtimespec, &ost->st_mtimespec); timespec_to_timespec50(&st->st_ctimespec, &ost->st_ctimespec); timespec_to_timespec50(&st->st_birthtimespec, &ost->st_birthtimespec); ost->st_size = st->st_size; ost->st_blocks = st->st_blocks; ost->st_blksize = st->st_blksize; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } /* * Get file status; this version follows links. */ /* ARGSUSED */ int compat_30_sys___stat13(struct lwp *l, const struct compat_30_sys___stat13_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(struct stat13 *) ub; } */ struct stat sb; struct stat13 osb; int error; error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb); if (error) return error; cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap, ub), sizeof(osb)); } /* * Get file status; this version does not follow links. 
*/ /* ARGSUSED */ int compat_30_sys___lstat13(struct lwp *l, const struct compat_30_sys___lstat13_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(struct stat13 *) ub; } */ struct stat sb; struct stat13 osb; int error; error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb); if (error) return error; cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap, ub), sizeof(osb)); } /* ARGSUSED */ int compat_30_sys_fhstat(struct lwp *l, const struct compat_30_sys_fhstat_args *uap, register_t *retval) { /* { syscallarg(const struct compat_30_fhandle *) fhp; syscallarg(struct stat13 *) sb; } */ struct stat sb; struct stat13 osb; int error; error = do_fhstat(l, SCARG(uap, fhp), sizeof(*SCARG(uap, fhp)), &sb); if (error) return error; cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap, sb), sizeof(osb)); } /* * Return status information about a file descriptor. */ /* ARGSUSED */ int compat_30_sys___fstat13(struct lwp *l, const struct compat_30_sys___fstat13_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(struct stat13 *) sb; } */ struct stat sb; struct stat13 osb; int error; error = do_sys_fstat(SCARG(uap, fd), &sb); if (error) return error; cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap, sb), sizeof(osb)); } /* * Read a block of directory entries in a file system independent format. */ int compat_30_sys_getdents(struct lwp *l, const struct compat_30_sys_getdents_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(size_t) count; } */ struct dirent *bdp; struct vnode *vp; char *inp, *tbuf; /* BSD-format */ int len, reclen; /* BSD-format */ char *outp; /* NetBSD-3.0-format */ int resid; struct file *fp; struct uio auio; struct iovec aiov; struct dirent12 idb; off_t off; /* true file offset */ int buflen, error, eofflag; off_t *cookiebuf = NULL, *cookie; int ncookies; bool any = false; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return error; if ((fp->f_flag & FREAD) == 0) { error = EBADF; goto out1; } vp = fp->f_vnode; if (vp->v_type != VDIR) { error = EINVAL; goto out1; } buflen = uimin(MAXBSIZE, SCARG(uap, count)); tbuf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); off = fp->f_offset; again: aiov.iov_base = tbuf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_resid = buflen; auio.uio_offset = off; UIO_SETUP_SYSSPACE(&auio); /* * First we read into the malloc'ed buffer, then * we massage it into user space, one record at a time. 
*/ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &cookiebuf, &ncookies); if (error) goto out; inp = tbuf; outp = SCARG(uap, buf); resid = SCARG(uap, count); if ((len = buflen - auio.uio_resid) == 0) goto eof; for (cookie = cookiebuf; len > 0; len -= reclen) { bdp = (struct dirent *)inp; reclen = bdp->d_reclen; if (reclen & _DIRENT_ALIGN(bdp)) panic("%s: bad reclen %d", __func__, reclen); if (cookie) off = *cookie++; /* each entry points to the next */ else off += reclen; if ((off >> 32) != 0) { compat_offseterr(vp, "netbsd30_getdents"); error = EINVAL; goto out; } memset(&idb, 0, sizeof(idb)); if (bdp->d_namlen >= sizeof(idb.d_name)) idb.d_namlen = sizeof(idb.d_name) - 1; else idb.d_namlen = bdp->d_namlen; idb.d_reclen = _DIRENT_SIZE(&idb); if (reclen > len || resid < idb.d_reclen) { /* entry too big for buffer, so just stop */ any = true; break; } /* * Massage in place to make a NetBSD-3.0-shaped dirent * (otherwise we have to worry about touching user memory * outside of the copyout() call). */ idb.d_fileno = (u_int32_t)bdp->d_fileno; idb.d_type = bdp->d_type; (void)memcpy(idb.d_name, bdp->d_name, idb.d_namlen); memset(idb.d_name + idb.d_namlen, 0, idb.d_reclen - _DIRENT_NAMEOFF(&idb) - idb.d_namlen); if ((error = copyout(&idb, outp, idb.d_reclen)) != 0) goto out; /* advance past this real entry */ inp += reclen; /* advance output past NetBSD-3.0-shaped entry */ outp += idb.d_reclen; resid -= idb.d_reclen; any = true; } /* if we squished out the whole block, try again */ if (!any) { if (cookiebuf) free(cookiebuf, M_TEMP); cookiebuf = NULL; goto again; } fp->f_offset = off; /* update the vnode offset */ eof: *retval = SCARG(uap, count) - resid; out: VOP_UNLOCK(vp); if (cookiebuf) free(cookiebuf, M_TEMP); free(tbuf, M_TEMP); out1: fd_putfile(SCARG(uap, fd)); return error; } /* * Get file handle system call */ int compat_30_sys_getfh(struct lwp *l, const struct compat_30_sys_getfh_args *uap, register_t *retval) { /* { syscallarg(char *) fname; syscallarg(struct compat_30_fhandle *) fhp; } */ struct vnode *vp; struct compat_30_fhandle fh; int error; struct pathbuf *pb; struct nameidata nd; size_t sz; /* * Must be super user */ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL); if (error) return (error); error = pathbuf_copyin(SCARG(uap, fname), &pb); if (error) { return error; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); error = namei(&nd); pathbuf_destroy(pb); if (error) return error; vp = nd.ni_vp; sz = sizeof(struct compat_30_fhandle); error = vfs_composefh(vp, (void *)&fh, &sz); vput(vp); CTASSERT(FHANDLE_SIZE_COMPAT == sizeof(struct compat_30_fhandle)); if (sz != FHANDLE_SIZE_COMPAT) { error = EINVAL; } if (error) return error; return copyout(&fh, SCARG(uap, fhp), sizeof(fh)); } /* * Open a file given a file handle. * * Check permissions, allocate an open file structure, * and call the device open routine if any. 
*/ int compat_30_sys_fhopen(struct lwp *l, const struct compat_30_sys_fhopen_args *uap, register_t *retval) { /* { syscallarg(const fhandle_t *) fhp; syscallarg(int) flags; } */ return dofhopen(l, SCARG(uap, fhp), FHANDLE_SIZE_COMPAT, SCARG(uap, flags), retval); } /* ARGSUSED */ int compat_30_sys___fhstat30(struct lwp *l, const struct compat_30_sys___fhstat30_args *uap_30, register_t *retval) { /* { syscallarg(const fhandle_t *) fhp; syscallarg(struct stat30 *) sb; } */ struct stat sb; struct stat13 osb; int error; error = do_fhstat(l, SCARG(uap_30, fhp), FHANDLE_SIZE_COMPAT, &sb); if (error) return error; cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap_30, sb), sizeof(osb)); } /* ARGSUSED */ int compat_30_sys_fhstatvfs1(struct lwp *l, const struct compat_30_sys_fhstatvfs1_args *uap, register_t *retval) { /* { syscallarg(const fhandle_t *) fhp; syscallarg(struct statvfs90 *) buf; syscallarg(int) flags; } */ struct statvfs *sb = STATVFSBUF_GET(); int error = do_fhstatvfs(l, SCARG(uap, fhp), FHANDLE_SIZE_COMPAT, sb, SCARG(uap, flags)); if (!error) { error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf), sizeof(struct statvfs90)); } STATVFSBUF_PUT(sb); return error; } int vfs_syscalls_30_init(void) { return syscall_establish(NULL, vfs_syscalls_30_syscalls); } int vfs_syscalls_30_fini(void) { return syscall_disestablish(NULL, vfs_syscalls_30_syscalls); }
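/*
 * Illustrative sketch (separate from vfs_syscalls_30.c): cvtstat() above
 * zeroes the old stat13 structure first so its padding is deterministic,
 * then copies fields, narrowing st_ino to 32 bits.  The simplified,
 * hypothetical structures below show the same zero-then-copy conversion
 * pattern in plain userland C.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct new_stat { uint64_t st_ino; uint32_t st_mode; int64_t st_size; };
struct old_stat { uint32_t st_ino; uint32_t st_mode; int64_t st_size; };

static void
cvt_old_stat(struct old_stat *ost, const struct new_stat *st)
{
	memset(ost, 0, sizeof(*ost));		/* handle any padding */
	ost->st_ino = (uint32_t)st->st_ino;	/* deliberately narrowed */
	ost->st_mode = st->st_mode;
	ost->st_size = st->st_size;
}

int
main(void)
{
	struct new_stat st = { .st_ino = 0x100000001ULL, .st_mode = 0644,
	    .st_size = 42 };
	struct old_stat ost;

	cvt_old_stat(&ost, &st);
	/* The 64-bit inode number 0x100000001 comes back truncated to 1. */
	printf("ino %u mode %o size %lld\n", (unsigned)ost.st_ino,
	    (unsigned)ost.st_mode, (long long)ost.st_size);
	return 0;
}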
/*	$NetBSD: kern_scdebug.c,v 1.2 2019/03/14 19:51:49 palle Exp $	*/

/*
 * Copyright (c) 2015 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_xxx.c 8.3 (Berkeley) 2/14/95 * from: NetBSD: kern_xxx.c,v 1.74 2017/10/28 00:37:11 pgoyette Exp */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_scdebug.c,v 1.2 2019/03/14 19:51:49 palle Exp $"); #ifdef _KERNEL_OPT #include "opt_syscall_debug.h" #include "opt_kernhist.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/sysctl.h> #include <sys/mount.h> #include <sys/syscall.h> #include <sys/syscallargs.h> #include <sys/kernhist.h> /* * Pull in the indirect syscall functions here. * They are only actually used if the ports syscall entry code * doesn't special-case SYS_SYSCALL and SYS___SYSCALL * * In some cases the generated code for the two functions is identical, * but there isn't a MI way of determining that - so we don't try. */ #define SYS_SYSCALL sys_syscall #include "sys_syscall.c" #undef SYS_SYSCALL #define SYS_SYSCALL sys___syscall #include "sys_syscall.c" #undef SYS_SYSCALL #ifdef SYSCALL_DEBUG #define SCDEBUG_CALLS 0x0001 /* show calls */ #define SCDEBUG_RETURNS 0x0002 /* show returns */ #define SCDEBUG_ALL 0x0004 /* even syscalls that are not implemented */ #define SCDEBUG_SHOWARGS 0x0008 /* show arguments to calls */ #define SCDEBUG_KERNHIST 0x0010 /* use kernhist instead of printf */ #ifndef SCDEBUG_DEFAULT #define SCDEBUG_DEFAULT (SCDEBUG_CALLS|SCDEBUG_RETURNS|SCDEBUG_SHOWARGS) #endif int scdebug = SCDEBUG_DEFAULT; #ifdef KERNHIST KERNHIST_DEFINE(scdebughist); #define SCDEBUG_KERNHIST_FUNC(a) KERNHIST_FUNC(a) #define SCDEBUG_KERNHIST_CALLED(a) KERNHIST_CALLED(a) #define SCDEBUG_KERNHIST_LOG(a,b,c,d,e,f) KERNHIST_LOG(a,b,c,d,e,f) #else #define SCDEBUG_KERNHIST_FUNC(a) {} /* nothing */ #define SCDEBUG_KERNHIST_CALLED(a) {} /* nothing */ #define SCDEBUG_KERNHIST_LOG(a,b,c,d,e,f) {} /* nothing */ /* The non-kernhist support version can elide all this code easily. */ #undef SCDEBUG_KERNHIST #define SCDEBUG_KERNHIST 0 #endif #ifdef __HAVE_MINIMAL_EMUL #define CODE_NOT_OK(code, em) ((int)(code) < 0) #else #define CODE_NOT_OK(code, em) (((int)(code) < 0) || \ ((int)(code) >= (em)->e_nsysent)) #endif void scdebug_call(register_t code, const register_t args[]) { SCDEBUG_KERNHIST_FUNC("scdebug_call"); struct lwp *l = curlwp; struct proc *p = l->l_proc; const struct sysent *sy; const struct emul *em; int i; if ((scdebug & SCDEBUG_CALLS) == 0) return; if (scdebug & SCDEBUG_KERNHIST) SCDEBUG_KERNHIST_CALLED(scdebughist); em = p->p_emul; sy = &em->e_sysent[code]; if ((scdebug & SCDEBUG_ALL) == 0 && (CODE_NOT_OK(code, em) || sy->sy_call == sys_nosys)) { if (scdebug & SCDEBUG_KERNHIST) SCDEBUG_KERNHIST_LOG(scdebughist, "", 0, 0, 0, 0); return; } /* * The kernhist version of scdebug needs to restrict the usage * compared to the normal version. 
histories must avoid these * sorts of usage: * * - the format string *must* be literal, as it is used * at display time in the kernel or userland * - strings in the format will cause vmstat -u to crash * so avoid using %s formats * * to avoid these, we have a fairly long block to print args * as the format needs to change for each, and we can't just * call printf() on each argument until we're done. */ if (scdebug & SCDEBUG_KERNHIST) { if (CODE_NOT_OK(code, em)) { SCDEBUG_KERNHIST_LOG(scdebughist, "pid %jd:%jd: OUT OF RANGE (%jd)", p->p_pid, l->l_lid, code, 0); } else { SCDEBUG_KERNHIST_LOG(scdebughist, "pid %jd:%jd: num %jd call %#jx", p->p_pid, l->l_lid, code, (uintptr_t)sy->sy_call); if ((scdebug & SCDEBUG_SHOWARGS) == 0) return; if (sy->sy_narg > 7) { SCDEBUG_KERNHIST_LOG(scdebughist, "args[4-7]: (%jx, %jx, %jx, %jx, ...)", (long)args[4], (long)args[5], (long)args[6], (long)args[7]); } else if (sy->sy_narg > 6) { SCDEBUG_KERNHIST_LOG(scdebughist, "args[4-6]: (%jx, %jx, %jx)", (long)args[4], (long)args[5], (long)args[6], 0); } else if (sy->sy_narg > 5) { SCDEBUG_KERNHIST_LOG(scdebughist, "args[4-5]: (%jx, %jx)", (long)args[4], (long)args[5], 0, 0); } else if (sy->sy_narg == 5) { SCDEBUG_KERNHIST_LOG(scdebughist, "args[4]: (%jx)", (long)args[4], 0, 0, 0); } if (sy->sy_narg > 3) { SCDEBUG_KERNHIST_LOG(scdebughist, "args[0-3]: (%jx, %jx, %jx, %jx, ...)", (long)args[0], (long)args[1], (long)args[2], (long)args[3]); } else if (sy->sy_narg > 2) { SCDEBUG_KERNHIST_LOG(scdebughist, "args[0-2]: (%jx, %jx, %jx)", (long)args[0], (long)args[1], (long)args[2], 0); } else if (sy->sy_narg > 1) { SCDEBUG_KERNHIST_LOG(scdebughist, "args[0-1]: (%jx, %jx)", (long)args[0], (long)args[1], 0, 0); } else if (sy->sy_narg == 1) { SCDEBUG_KERNHIST_LOG(scdebughist, "args[0]: (%jx)", (long)args[0], 0, 0, 0); } } return; } printf("proc %d (%s): %s num ", p->p_pid, p->p_comm, em->e_name); if (CODE_NOT_OK(code, em)) printf("OUT OF RANGE (%ld)", (long)code); else { printf("%ld call: %s", (long)code, em->e_syscallnames[code]); if (scdebug & SCDEBUG_SHOWARGS) { printf("("); for (i = 0; i < sy->sy_argsize/sizeof(register_t); i++) printf("%s0x%lx", i == 0 ? 
"" : ", ", (long)args[i]); printf(")"); } } printf("\n"); } void scdebug_ret(register_t code, int error, const register_t retval[]) { SCDEBUG_KERNHIST_FUNC("scdebug_ret"); struct lwp *l = curlwp; struct proc *p = l->l_proc; const struct sysent *sy; const struct emul *em; if ((scdebug & SCDEBUG_RETURNS) == 0) return; if (scdebug & SCDEBUG_KERNHIST) SCDEBUG_KERNHIST_CALLED(scdebughist); em = p->p_emul; sy = &em->e_sysent[code]; if ((scdebug & SCDEBUG_ALL) == 0 && (CODE_NOT_OK(code, em) || sy->sy_call == sys_nosys)) { if (scdebug & SCDEBUG_KERNHIST) SCDEBUG_KERNHIST_LOG(scdebughist, "", 0, 0, 0, 0); return; } if (scdebug & SCDEBUG_KERNHIST) { if (CODE_NOT_OK(code, em)) { SCDEBUG_KERNHIST_LOG(scdebughist, "pid %jd:%jd: OUT OF RANGE (%jd)", p->p_pid, l->l_lid, code, 0); } else { SCDEBUG_KERNHIST_LOG(scdebughist, "pid %jd:%jd: num %jd", p->p_pid, l->l_lid, code, 0); SCDEBUG_KERNHIST_LOG(scdebughist, "ret: err = %jd, rv = 0x%jx,0x%jx", error, (long)retval[0], (long)retval[1], 0); } return; } printf("proc %d (%s): %s num ", p->p_pid, p->p_comm, em->e_name); if (CODE_NOT_OK(code, em)) printf("OUT OF RANGE (%ld)", (long)code); else printf("%ld ret %s: err = %d, rv = 0x%lx,0x%lx", (long)code, em->e_syscallnames[code], error, (long)retval[0], (long)retval[1]); printf("\n"); } #endif /* SYSCALL_DEBUG */ #ifndef SCDEBUG_KERNHIST_SIZE #define SCDEBUG_KERNHIST_SIZE 500 #endif void scdebug_init(void) { #if defined(SYSCALL_DEBUG) && defined(KERNHIST) /* Setup scdebughist kernel history */ KERNHIST_INIT(scdebughist, SCDEBUG_KERNHIST_SIZE); #endif }
/*	$NetBSD: ffs_vnops.c,v 1.138 2021/12/14 11:06:12 chs Exp $	*/

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.138 2021/12/14 11:06:12 chs Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" #include "opt_wapbl.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/resourcevar.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/buf.h> #include <sys/event.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/pool.h> #include <sys/signalvar.h> #include <sys/kauth.h> #include <sys/wapbl.h> #include <miscfs/fifofs/fifo.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <ufs/ufs/acl.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_wapbl.h> #include <ufs/ffs/fs.h> #include <ufs/ffs/ffs_extern.h> /* Global vfs data structures for ufs. 
*/ int (**ffs_vnodeop_p)(void *); const struct vnodeopv_entry_desc ffs_vnodeop_entries[] = { { &vop_default_desc, vn_default_error }, { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ { &vop_lookup_desc, ufs_lookup }, /* lookup */ { &vop_create_desc, ufs_create }, /* create */ { &vop_whiteout_desc, ufs_whiteout }, /* whiteout */ { &vop_mknod_desc, ufs_mknod }, /* mknod */ { &vop_open_desc, ufs_open }, /* open */ { &vop_close_desc, ufs_close }, /* close */ { &vop_access_desc, genfs_access }, /* access */ { &vop_accessx_desc, ufs_accessx }, /* accessx */ { &vop_getattr_desc, ufs_getattr }, /* getattr */ { &vop_setattr_desc, ufs_setattr }, /* setattr */ { &vop_read_desc, ffs_read }, /* read */ { &vop_write_desc, ffs_write }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */ { &vop_ioctl_desc, genfs_enoioctl }, /* ioctl */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_poll_desc, genfs_poll }, /* poll */ { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_mmap_desc, genfs_mmap }, /* mmap */ { &vop_fsync_desc, ffs_fsync }, /* fsync */ { &vop_seek_desc, genfs_seek }, /* seek */ { &vop_remove_desc, ufs_remove }, /* remove */ { &vop_link_desc, ufs_link }, /* link */ { &vop_rename_desc, ufs_rename }, /* rename */ { &vop_mkdir_desc, ufs_mkdir }, /* mkdir */ { &vop_rmdir_desc, ufs_rmdir }, /* rmdir */ { &vop_symlink_desc, ufs_symlink }, /* symlink */ { &vop_readdir_desc, ufs_readdir }, /* readdir */ { &vop_readlink_desc, ufs_readlink }, /* readlink */ { &vop_abortop_desc, genfs_abortop }, /* abortop */ { &vop_inactive_desc, ufs_inactive }, /* inactive */ { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, ufs_bmap }, /* bmap */ { &vop_strategy_desc, ufs_strategy }, /* strategy */ { &vop_print_desc, ufs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ { &vop_advlock_desc, ufs_advlock }, /* advlock */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ { &vop_getpages_desc, genfs_getpages }, /* getpages */ { &vop_putpages_desc, genfs_putpages }, /* putpages */ { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */ { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */ { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */ { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */ { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */ { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */ { &vop_getacl_desc, ufs_getacl }, /* getacl */ { &vop_setacl_desc, ufs_setacl }, /* setacl */ { &vop_aclcheck_desc, ufs_aclcheck }, /* aclcheck */ { NULL, NULL } }; const struct vnodeopv_desc ffs_vnodeop_opv_desc = { &ffs_vnodeop_p, ffs_vnodeop_entries }; int (**ffs_specop_p)(void *); const struct vnodeopv_entry_desc ffs_specop_entries[] = { { &vop_default_desc, vn_default_error }, GENFS_SPECOP_ENTRIES, { &vop_close_desc, ufsspec_close }, /* close */ { &vop_access_desc, genfs_access }, /* access */ { &vop_accessx_desc, ufs_accessx }, /* accessx */ { &vop_getattr_desc, ufs_getattr }, /* getattr */ { &vop_setattr_desc, ufs_setattr }, /* setattr */ { &vop_read_desc, ufsspec_read }, /* read */ { &vop_write_desc, ufsspec_write }, /* write */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_fsync_desc, ffs_spec_fsync }, /* fsync */ { &vop_inactive_desc, 
ufs_inactive }, /* inactive */ { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_print_desc, ufs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */ { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */ { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */ { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */ { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */ { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */ { &vop_getacl_desc, ufs_getacl }, /* getacl */ { &vop_setacl_desc, ufs_setacl }, /* setacl */ { &vop_aclcheck_desc, ufs_aclcheck }, /* aclcheck */ { NULL, NULL } }; const struct vnodeopv_desc ffs_specop_opv_desc = { &ffs_specop_p, ffs_specop_entries }; int (**ffs_fifoop_p)(void *); const struct vnodeopv_entry_desc ffs_fifoop_entries[] = { { &vop_default_desc, vn_default_error }, GENFS_FIFOOP_ENTRIES, { &vop_close_desc, ufsfifo_close }, /* close */ { &vop_access_desc, genfs_access }, /* access */ { &vop_accessx_desc, ufs_accessx }, /* accessx */ { &vop_getattr_desc, ufs_getattr }, /* getattr */ { &vop_setattr_desc, ufs_setattr }, /* setattr */ { &vop_read_desc, ufsfifo_read }, /* read */ { &vop_write_desc, ufsfifo_write }, /* write */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_fsync_desc, ffs_fsync }, /* fsync */ { &vop_inactive_desc, ufs_inactive }, /* inactive */ { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, ufs_bmap }, /* bmap */ { &vop_strategy_desc, ffsext_strategy }, /* strategy */ { &vop_print_desc, ufs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */ { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */ { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */ { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */ { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */ { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */ { &vop_getacl_desc, ufs_getacl }, /* getacl */ { &vop_setacl_desc, ufs_setacl }, /* setacl */ { &vop_aclcheck_desc, ufs_aclcheck }, /* aclcheck */ { NULL, NULL } }; const struct vnodeopv_desc ffs_fifoop_opv_desc = { &ffs_fifoop_p, ffs_fifoop_entries }; #include <ufs/ufs/ufs_readwrite.c> int ffs_spec_fsync(void *v) { struct vop_fsync_args /* { struct vnode *a_vp; kauth_cred_t a_cred; int a_flags; off_t a_offlo; off_t a_offhi; struct lwp *a_l; } */ *ap = v; int error, flags, uflags; struct vnode *vp; flags = ap->a_flags; uflags = UPDATE_CLOSE | ((flags & FSYNC_WAIT) ? UPDATE_WAIT : 0); vp = ap->a_vp; error = spec_fsync(v); if (error) goto out; #ifdef WAPBL struct mount *mp = vp->v_mount; if (mp && mp->mnt_wapbl) { /* * Don't bother writing out metadata if the syncer is * making the request. We will let the sync vnode * write it out in a single burst through a call to * VFS_SYNC(). 
*/ if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) goto out; if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) { error = UFS_WAPBL_BEGIN(mp); if (error != 0) goto out; error = ffs_update(vp, NULL, NULL, uflags); UFS_WAPBL_END(mp); } goto out; } #endif /* WAPBL */ error = ffs_update(vp, NULL, NULL, uflags); out: return error; } int ffs_fsync(void *v) { struct vop_fsync_args /* { struct vnode *a_vp; kauth_cred_t a_cred; int a_flags; off_t a_offlo; off_t a_offhi; struct lwp *a_l; } */ *ap = v; struct buf *bp; int num, error, i; struct indir ia[UFS_NIADDR + 1]; int bsize; daddr_t blk_high; struct vnode *vp; struct mount *mp; vp = ap->a_vp; mp = vp->v_mount; if ((ap->a_offlo == 0 && ap->a_offhi == 0) || (vp->v_type != VREG)) { error = ffs_full_fsync(vp, ap->a_flags); goto out; } bsize = mp->mnt_stat.f_iosize; blk_high = ap->a_offhi / bsize; if (ap->a_offhi % bsize != 0) blk_high++; /* * First, flush all pages in range. */ rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo), round_page(ap->a_offhi), PGO_CLEANIT | ((ap->a_flags & FSYNC_WAIT) ? PGO_SYNCIO : 0)); if (error) { goto out; } #ifdef WAPBL KASSERT(vp->v_type == VREG); if (mp->mnt_wapbl) { /* * Don't bother writing out metadata if the syncer is * making the request. We will let the sync vnode * write it out in a single burst through a call to * VFS_SYNC(). */ if ((ap->a_flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) { return 0; } error = 0; if (vp->v_tag == VT_UFS && VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) { error = UFS_WAPBL_BEGIN(mp); if (error) { return error; } error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE | ((ap->a_flags & FSYNC_WAIT) ? UPDATE_WAIT : 0)); UFS_WAPBL_END(mp); } if (error || (ap->a_flags & FSYNC_NOLOG) != 0) { return error; } error = wapbl_flush(mp->mnt_wapbl, 0); return error; } #endif /* WAPBL */ /* * Then, flush indirect blocks. */ if (blk_high >= UFS_NDADDR) { error = ufs_getlbns(vp, blk_high, ia, &num); if (error) goto out; mutex_enter(&bufcache_lock); for (i = 0; i < num; i++) { if ((bp = incore(vp, ia[i].in_lbn)) == NULL) continue; if ((bp->b_cflags & BC_BUSY) != 0 || (bp->b_oflags & BO_DELWRI) == 0) continue; bp->b_cflags |= BC_BUSY | BC_VFLUSH; mutex_exit(&bufcache_lock); bawrite(bp); mutex_enter(&bufcache_lock); } mutex_exit(&bufcache_lock); } if (ap->a_flags & FSYNC_WAIT) { mutex_enter(vp->v_interlock); while (vp->v_numoutput > 0) cv_wait(&vp->v_cv, vp->v_interlock); mutex_exit(vp->v_interlock); } error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE | (((ap->a_flags & (FSYNC_WAIT | FSYNC_DATAONLY)) == FSYNC_WAIT) ? UPDATE_WAIT : 0)); if (error == 0 && ap->a_flags & FSYNC_CACHE) { int l = 0; VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE, curlwp->l_cred); } out: return error; } /* * Synch an open file. Called for VOP_FSYNC(). */ /* ARGSUSED */ int ffs_full_fsync(struct vnode *vp, int flags) { int error, i, uflags; KASSERT(vp->v_tag == VT_UFS); KASSERT(VTOI(vp) != NULL); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK); uflags = UPDATE_CLOSE | ((flags & FSYNC_WAIT) ? UPDATE_WAIT : 0); #ifdef WAPBL struct mount *mp = vp->v_mount; if (mp && mp->mnt_wapbl) { /* * Flush all dirty data associated with the vnode. 
*/ if (vp->v_type == VREG) { int pflags = PGO_ALLPAGES | PGO_CLEANIT; if ((flags & FSYNC_LAZY)) pflags |= PGO_LAZY; if ((flags & FSYNC_WAIT)) pflags |= PGO_SYNCIO; rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); error = VOP_PUTPAGES(vp, 0, 0, pflags); if (error) return error; } /* * Don't bother writing out metadata if the syncer is * making the request. We will let the sync vnode * write it out in a single burst through a call to * VFS_SYNC(). */ if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) return 0; if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) { error = UFS_WAPBL_BEGIN(mp); if (error) return error; error = ffs_update(vp, NULL, NULL, uflags); UFS_WAPBL_END(mp); } else { error = 0; } if (error || (flags & FSYNC_NOLOG) != 0) return error; /* * Don't flush the log if the vnode being flushed * contains no dirty buffers that could be in the log. */ if (!LIST_EMPTY(&vp->v_dirtyblkhd)) { error = wapbl_flush(mp->mnt_wapbl, 0); if (error) return error; } if ((flags & FSYNC_WAIT) != 0) { mutex_enter(vp->v_interlock); while (vp->v_numoutput != 0) cv_wait(&vp->v_cv, vp->v_interlock); mutex_exit(vp->v_interlock); } return error; } #endif /* WAPBL */ error = vflushbuf(vp, flags); if (error == 0) error = ffs_update(vp, NULL, NULL, uflags); if (error == 0 && (flags & FSYNC_CACHE) != 0) { i = 1; (void)VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &i, FWRITE, kauth_cred_get()); } return error; } /* * Reclaim an inode so that it can be used for other purposes. */ int ffs_reclaim(void *v) { struct vop_reclaim_v2_args /* { struct vnode *a_vp; struct lwp *a_l; } */ *ap = v; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct mount *mp = vp->v_mount; struct ufsmount *ump = ip->i_ump; void *data; int error; VOP_UNLOCK(vp); /* * The inode must be freed and updated before being removed * from its hash chain. Other threads trying to gain a hold * or lock on the inode will be stalled. */ error = UFS_WAPBL_BEGIN(mp); if (error) { return error; } if (ip->i_nlink <= 0 && ip->i_omode != 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) ffs_vfree(vp, ip->i_number, ip->i_omode); UFS_WAPBL_END(mp); if ((error = ufs_reclaim(vp)) != 0) { return (error); } if (ip->i_din.ffs1_din != NULL) { if (ump->um_fstype == UFS1) pool_cache_put(ffs_dinode1_cache, ip->i_din.ffs1_din); else pool_cache_put(ffs_dinode2_cache, ip->i_din.ffs2_din); } /* * To interlock with ffs_sync(). */ genfs_node_destroy(vp); mutex_enter(vp->v_interlock); data = vp->v_data; vp->v_data = NULL; mutex_exit(vp->v_interlock); /* * XXX MFS ends up here, too, to free an inode. Should we create * XXX a separate pool for MFS inodes? */ pool_cache_put(ffs_inode_cache, data); return (0); } /* * Return the last logical file offset that should be written for this file * if we're doing a write that ends at "size". */ void ffs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags) { struct inode *ip = VTOI(vp); struct fs *fs = ip->i_fs; daddr_t olbn, nlbn; olbn = ffs_lblkno(fs, ip->i_size); nlbn = ffs_lblkno(fs, size); if (nlbn < UFS_NDADDR && olbn <= nlbn) { *eobp = ffs_fragroundup(fs, size); } else { *eobp = ffs_blkroundup(fs, size); } }
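The ffs_vnodeop_entries table above is the pattern FFS repeats for every vnode flavor: each entry pairs a vnode-operation descriptor with the handler FFS wants for it, and a { NULL, NULL } entry terminates the table so the VFS layer knows where to stop when it fills in the ffs_vnodeop_p dispatch vector. The sketch below is a minimal, self-contained userland model of that descriptor-to-handler idea only; struct op_desc, struct op_entry, and dispatch() are invented names for illustration and are not the kernel's vnodeopv API.

/*
 * Illustrative sketch, not kernel code: a toy version of the
 * { &vop_xxx_desc, handler } table walk.  All names are invented.
 */
#include <stdio.h>

struct op_desc {			/* stands in for struct vnodeop_desc */
	const char *name;
};

struct op_entry {			/* stands in for vnodeopv_entry_desc */
	const struct op_desc *desc;
	int (*handler)(void *);
};

static const struct op_desc lookup_desc = { "lookup" };
static const struct op_desc read_desc = { "read" };

static int
do_lookup(void *v)
{
	(void)v;
	puts("lookup handler called");
	return 0;
}

static int
do_read(void *v)
{
	(void)v;
	puts("read handler called");
	return 0;
}

/* One table per vnode flavor, NULL-terminated like ffs_vnodeop_entries. */
static const struct op_entry entries[] = {
	{ &lookup_desc, do_lookup },
	{ &read_desc, do_read },
	{ NULL, NULL }
};

static int
dispatch(const struct op_desc *desc, void *arg)
{
	const struct op_entry *e;

	for (e = entries; e->desc != NULL; e++)
		if (e->desc == desc)
			return (*e->handler)(arg);
	return -1;			/* no handler registered for this op */
}

int
main(void)
{
	return dispatch(&read_desc, NULL);
}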
/* $NetBSD: kern_acct.c,v 1.99 2021/12/05 04:35:38 msaitoh Exp $ */ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_acct.c 8.8 (Berkeley) 5/14/95 */ /*- * Copyright (c) 1994 Christopher G. Demetriou * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_acct.c 8.8 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_acct.c,v 1.99 2021/12/05 04:35:38 msaitoh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/file.h> #include <sys/syslog.h> #include <sys/kernel.h> #include <sys/kthread.h> #include <sys/kmem.h> #include <sys/namei.h> #include <sys/errno.h> #include <sys/acct.h> #include <sys/resourcevar.h> #include <sys/ioctl.h> #include <sys/tty.h> #include <sys/kauth.h> #include <sys/syscallargs.h> /* * The routines implemented in this file are described in: * Leffler, et al.: The Design and Implementation of the 4.3BSD * UNIX Operating System (Addison Welley, 1989) * on pages 62-63. * * Arguably, to simplify accounting operations, this mechanism should * be replaced by one in which an accounting log file (similar to /dev/klog) * is read by a user process, etc. However, that has its own problems. */ /* * Lock to serialize system calls and kernel threads. */ krwlock_t acct_lock; /* * The global accounting state and related data. Gain the mutex before * accessing these variables. 
*/ static enum { ACCT_STOP, ACCT_ACTIVE, ACCT_SUSPENDED } acct_state; /* The current accounting state. */ static struct vnode *acct_vp; /* Accounting vnode pointer. */ static kauth_cred_t acct_cred; /* Credential of accounting file owner (i.e root). Used when accounting file i/o. */ static struct lwp *acct_dkwatcher; /* Free disk space checker. */ /* * Values associated with enabling and disabling accounting */ int acctsuspend = 2; /* stop accounting when < 2% free space left */ int acctresume = 4; /* resume when free space risen to > 4% */ int acctchkfreq = 15; /* frequency (in seconds) to check space */ /* * Encode_comp_t converts from ticks in seconds and microseconds * to ticks in 1/AHZ seconds. The encoding is described in * Leffler, et al., on page 63. */ #define MANTSIZE 13 /* 13 bit mantissa. */ #define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ #define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ static comp_t encode_comp_t(u_long s, u_long us) { int exp, rnd; exp = 0; rnd = 0; s *= AHZ; s += us / (1000000 / AHZ); /* Maximize precision. */ while (s > MAXFRACT) { rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */ s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ exp++; } /* If we need to round up, do it (and handle overflow correctly). */ if (rnd && (++s > MAXFRACT)) { s >>= EXPSIZE; exp++; } /* Clean it up and polish it off. */ exp <<= MANTSIZE; /* Shift the exponent into place */ exp += s; /* and add on the mantissa. */ return (exp); } static int acct_chkfree(void) { int error; struct statvfs *sb; fsblkcnt_t bavail; sb = kmem_alloc(sizeof(*sb), KM_SLEEP); error = VFS_STATVFS(acct_vp->v_mount, sb); if (error != 0) { kmem_free(sb, sizeof(*sb)); return (error); } if (sb->f_bfree < sb->f_bresvd) { bavail = 0; } else { bavail = sb->f_bfree - sb->f_bresvd; } switch (acct_state) { case ACCT_SUSPENDED: if (bavail > acctresume * sb->f_blocks / 100) { acct_state = ACCT_ACTIVE; log(LOG_NOTICE, "Accounting resumed\n"); } break; case ACCT_ACTIVE: if (bavail <= acctsuspend * sb->f_blocks / 100) { acct_state = ACCT_SUSPENDED; log(LOG_NOTICE, "Accounting suspended\n"); } break; case ACCT_STOP: break; } kmem_free(sb, sizeof(*sb)); return (0); } static void acct_stop(void) { int error; KASSERT(rw_write_held(&acct_lock)); if (acct_vp != NULLVP && acct_vp->v_type != VBAD) { error = vn_close(acct_vp, FWRITE, acct_cred); #ifdef DIAGNOSTIC if (error != 0) printf("acct_stop: failed to close, errno = %d\n", error); #else __USE(error); #endif acct_vp = NULLVP; } if (acct_cred != NULL) { kauth_cred_free(acct_cred); acct_cred = NULL; } acct_state = ACCT_STOP; } /* * Periodically check the file system to see if accounting * should be turned on or off. Beware the case where the vnode * has been vgone()'d out from underneath us, e.g. when the file * system containing the accounting file has been forcibly unmounted. 
*/ static void acctwatch(void *arg) { int error; log(LOG_NOTICE, "Accounting started\n"); rw_enter(&acct_lock, RW_WRITER); while (acct_state != ACCT_STOP) { if (acct_vp->v_type == VBAD) { log(LOG_NOTICE, "Accounting terminated\n"); acct_stop(); continue; } error = acct_chkfree(); #ifdef DIAGNOSTIC if (error != 0) printf("acctwatch: failed to statvfs, error = %d\n", error); #else __USE(error); #endif rw_exit(&acct_lock); error = kpause("actwat", false, acctchkfreq * hz, NULL); rw_enter(&acct_lock, RW_WRITER); #ifdef DIAGNOSTIC if (error != 0 && error != EWOULDBLOCK) printf("acctwatch: sleep error %d\n", error); #endif } acct_dkwatcher = NULL; rw_exit(&acct_lock); kthread_exit(0); } void acct_init(void) { acct_state = ACCT_STOP; acct_vp = NULLVP; acct_cred = NULL; rw_init(&acct_lock); } /* * Accounting system call. Written based on the specification and * previous implementation done by Mark Tinguely. */ int sys_acct(struct lwp *l, const struct sys_acct_args *uap, register_t *retval) { /* { syscallarg(const char *) path; } */ struct pathbuf *pb; struct vnode *vp; int error; /* Make sure that the caller is root. */ if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_ACCOUNTING, 0, NULL, NULL, NULL))) return (error); /* * If accounting is to be started to a file, open that file for * writing and make sure it's a 'normal'. */ if (SCARG(uap, path) != NULL) { struct vattr va; size_t pad; error = pathbuf_copyin(SCARG(uap, path), &pb); if (error) { return error; } error = vn_open(NULL, pb, TRYEMULROOT, FWRITE|O_APPEND, 0, &vp, NULL, NULL); if (error != 0) { pathbuf_destroy(pb); return error; } if (vp->v_type != VREG) { VOP_UNLOCK(vp); error = EACCES; goto bad; } if ((error = VOP_GETATTR(vp, &va, l->l_cred)) != 0) { VOP_UNLOCK(vp); goto bad; } if ((pad = (va.va_size % sizeof(struct acct))) != 0) { u_quad_t size = va.va_size - pad; #ifdef DIAGNOSTIC printf("Size of accounting file not a multiple of " "%lu - incomplete record truncated\n", (unsigned long)sizeof(struct acct)); #endif vattr_null(&va); va.va_size = size; error = VOP_SETATTR(vp, &va, l->l_cred); if (error != 0) { VOP_UNLOCK(vp); goto bad; } } VOP_UNLOCK(vp); } rw_enter(&acct_lock, RW_WRITER); /* * If accounting was previously enabled, kill the old space-watcher, * free credential for accounting file i/o, * ... (and, if no new file was specified, leave). */ acct_stop(); if (SCARG(uap, path) == NULL) goto out; /* * Save the new accounting file vnode and credential, * and schedule the new free space watcher. */ acct_state = ACCT_ACTIVE; acct_vp = vp; acct_cred = l->l_cred; kauth_cred_hold(acct_cred); pathbuf_destroy(pb); error = acct_chkfree(); /* Initial guess. */ if (error != 0) { acct_stop(); goto out; } if (acct_dkwatcher == NULL) { error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, acctwatch, NULL, &acct_dkwatcher, "acctwatch"); if (error != 0) acct_stop(); } out: rw_exit(&acct_lock); return (error); bad: vn_close(vp, FWRITE, l->l_cred); pathbuf_destroy(pb); return error; } /* * Write out process accounting information, on process exit. * Data to be written out is specified in Leffler, et al. * and are enumerated below. (They're also noted in the system * "acct.h" header file.) 
*/ int acct_process(struct lwp *l) { struct acct acct; struct timeval ut, st, tmp; struct rusage *r; int t, error = 0; struct rlimit orlim; struct proc *p = l->l_proc; if (acct_state != ACCT_ACTIVE) return 0; memset(&acct, 0, sizeof(acct)); /* to zerofill padded data */ rw_enter(&acct_lock, RW_READER); /* If accounting isn't enabled, don't bother */ if (acct_state != ACCT_ACTIVE) goto out; /* * Temporarily raise the file limit so that accounting can't * be stopped by the user. * * XXX We should think about the CPU limit, too. */ lim_privatise(p); orlim = p->p_rlimit[RLIMIT_FSIZE]; /* Set current and max to avoid illegal values */ p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; p->p_rlimit[RLIMIT_FSIZE].rlim_max = RLIM_INFINITY; /* * Get process accounting information. */ /* (1) The name of the command that ran */ strncpy(acct.ac_comm, p->p_comm, sizeof(acct.ac_comm)); /* (2) The amount of user and system time that was used */ mutex_enter(p->p_lock); calcru(p, &ut, &st, NULL, NULL); mutex_exit(p->p_lock); acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec); acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec); /* (3) The elapsed time the command ran (and its starting time) */ acct.ac_btime = p->p_stats->p_start.tv_sec; getmicrotime(&tmp); timersub(&tmp, &p->p_stats->p_start, &tmp); acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec); /* (4) The average amount of memory used */ r = &p->p_stats->p_ru; timeradd(&ut, &st, &tmp); t = tmp.tv_sec * hz + tmp.tv_usec / tick; if (t) acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t; else acct.ac_mem = 0; /* (5) The number of disk I/O operations done */ acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0); /* (6) The UID and GID of the process */ acct.ac_uid = kauth_cred_getuid(l->l_cred); acct.ac_gid = kauth_cred_getgid(l->l_cred); /* (7) The terminal from which the process was started */ mutex_enter(&proc_lock); if ((p->p_lflag & PL_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) acct.ac_tty = p->p_pgrp->pg_session->s_ttyp->t_dev; else acct.ac_tty = NODEV; mutex_exit(&proc_lock); /* (8) The boolean flags that tell how the process terminated, etc. */ acct.ac_flag = p->p_acflag; /* * Now, just write the accounting information to the file. */ error = vn_rdwr(UIO_WRITE, acct_vp, (void *)&acct, sizeof(acct), (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, acct_cred, NULL, NULL); if (error != 0) log(LOG_ERR, "Accounting: write failed %d\n", error); /* Restore limit - rather pointless since process is about to exit */ p->p_rlimit[RLIMIT_FSIZE] = orlim; out: rw_exit(&acct_lock); return (error); }
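encode_comp_t() above squeezes an elapsed time into a 16-bit comp_t: the low 13 bits hold a mantissa counted in 1/AHZ-second ticks, the top 3 bits hold a base-8 exponent, and the mantissa is rounded up each time precision is shed. A small userland decoder makes the layout concrete. This is a sketch only; decode_comp_t() and AHZ_EXAMPLE are invented names, and the assumed tick rate of 64 matches the traditional AHZ but should be checked against <sys/acct.h> on a real system.

/*
 * Illustrative sketch, not kernel code: decode the comp_t layout produced
 * by encode_comp_t() (13-bit mantissa, 3-bit base-8 exponent).
 */
#include <stdio.h>

#define AHZ_EXAMPLE	64	/* assumed accounting tick rate (see sys/acct.h) */
#define MANTSIZE	13	/* must match the kernel's encoding */

static double
decode_comp_t(unsigned short c)
{
	unsigned long long mant = c & ((1u << MANTSIZE) - 1);	/* low 13 bits */
	unsigned int exp = c >> MANTSIZE;			/* high 3 bits */

	/* Each exponent step is a shift by 3, i.e. a multiply by 8. */
	return (double)(mant << (3 * exp)) / AHZ_EXAMPLE;
}

int
main(void)
{
	unsigned short samples[] = { 0x0040, 0x2345, 0xffff };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("comp_t %#06x -> %.3f seconds\n",
		    (unsigned int)samples[i], decode_comp_t(samples[i]));
	return 0;
}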
/* $NetBSD: userret.h,v 1.13 2018/07/26 09:29:08 maxv Exp $ */ /* * XXXfvdl same as i386 counterpart, but should probably be independent. */ /*- * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include <sys/userret.h> static __inline void userret(struct lwp *); /* * Define the code needed before returning to user mode, for * trap and syscall. */ static __inline void userret(struct lwp *l) { /* Invoke MI userret code */ mi_userret(l); }
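userret() here is deliberately nothing but a funnel: every machine-dependent trap and syscall return path is expected to call it last, so that mi_userret() can handle pending work (signal delivery, preemption) in one machine-independent place before the thread re-enters user mode. The toy below models only that funnel idea in userland; struct fake_lwp, toy_trap(), and toy_userret() are invented names, and this is not the kernel's actual MD return path.

/*
 * Illustrative sketch, not kernel code: a userland toy of the "every exit
 * path funnels through userret()" pattern.  All names are invented.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_lwp {
	bool pending_signal;	/* stands in for deferred work (ASTs, signals) */
};

/* The single choke point run before "returning to user mode". */
static void
toy_userret(struct fake_lwp *l)
{
	if (l->pending_signal) {
		puts("delivering pending signal before user return");
		l->pending_signal = false;
	}
}

/* A fake trap handler: do the work, queue something, funnel out. */
static void
toy_trap(struct fake_lwp *l)
{
	puts("handling trap");
	l->pending_signal = true;
	toy_userret(l);		/* every return-to-user path ends here */
}

int
main(void)
{
	struct fake_lwp l = { .pending_signal = false };

	toy_trap(&l);
	return 0;
}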
/* $NetBSD: tcp_usrreq.c,v 1.238 2022/11/04 09:01:53 ozaki-r Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1997, 1998, 2005, 2006 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation * Facility, NASA Ames Research Center. * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * This code is derived from software contributed to The NetBSD Foundation * by Rui Paulo. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_usrreq.c 8.5 (Berkeley) 6/21/95 */ /* * TCP protocol interface to socket abstraction. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c,v 1.238 2022/11/04 09:01:53 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_tcp_debug.h" #include "opt_mbuftrace.h" #include "opt_tcp_space.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/protosw.h> #include <sys/errno.h> #include <sys/stat.h> #include <sys/proc.h> #include <sys/domain.h> #include <sys/sysctl.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/uidinfo.h> #include <net/if.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/ip_var.h> #include <netinet/in_offload.h> #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/in6_pcb.h> #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #endif #include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_private.h> #include <netinet/tcp_congctl.h> #include <netinet/tcp_debug.h> #include <netinet/tcp_vtw.h> #include <netinet/tcp_syncache.h> static int tcp_debug_capture(struct tcpcb *tp, int req) { #ifdef TCP_DEBUG return tp->t_state; #endif return 0; } static inline void tcp_debug_trace(struct socket *so, struct tcpcb *tp, int ostate, int req) { #ifdef TCP_DEBUG if (tp && (so->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, NULL, req); #endif } static void change_keepalive(struct socket *so, struct tcpcb *tp) { tp->t_maxidle = tp->t_keepcnt * MIN(tp->t_keepintvl, TCP_TIMER_MAXTICKS / tp->t_keepcnt); TCP_TIMER_DISARM(tp, TCPT_KEEP); TCP_TIMER_DISARM(tp, TCPT_2MSL); if (tp->t_state == TCPS_SYN_RECEIVED || tp->t_state == TCPS_SYN_SENT) { TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit); } else if (so->so_options & SO_KEEPALIVE && tp->t_state <= TCPS_CLOSE_WAIT) { TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepintvl); } else { TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle); } if ((tp->t_state == TCPS_FIN_WAIT_2) && (tp->t_maxidle > 0)) TCP_TIMER_ARM(tp, TCPT_2MSL, tp->t_maxidle); } /* * Export TCP internal state information via a struct tcp_info, based on the * Linux 2.6 API. Not ABI compatible as our constants are mapped differently * (TCP state machine, etc). We export all information using FreeBSD-native * constants -- for example, the numeric values for tcpi_state will differ * from Linux. 
*/ static void tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) { bzero(ti, sizeof(*ti)); ti->tcpi_state = tp->t_state; if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tp->t_flags & TF_SACK_PERMIT) ti->tcpi_options |= TCPI_OPT_SACK; if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { ti->tcpi_options |= TCPI_OPT_WSCALE; ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; } if (tp->t_flags & TF_ECN_PERMIT) { ti->tcpi_options |= TCPI_OPT_ECN; } ti->tcpi_rto = tp->t_rxtcur * tick; ti->tcpi_last_data_recv = (long)(getticks() - (int)tp->t_rcvtime) * tick; ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2); ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2); ti->tcpi_snd_ssthresh = tp->snd_ssthresh; /* Linux API wants these in # of segments, apparently */ ti->tcpi_snd_cwnd = tp->snd_cwnd / tp->t_segsz; ti->tcpi_snd_wnd = tp->snd_wnd / tp->t_segsz; /* * FreeBSD-specific extension fields for tcp_info. */ ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_rcv_nxt = tp->rcv_nxt; ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ ti->tcpi_snd_nxt = tp->snd_nxt; ti->tcpi_snd_mss = tp->t_segsz; ti->tcpi_rcv_mss = tp->t_segsz; #ifdef TF_TOE if (tp->t_flags & TF_TOE) ti->tcpi_options |= TCPI_OPT_TOE; #endif /* From the redundant department of redundancies... */ ti->__tcpi_retransmits = ti->__tcpi_retrans = ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; ti->tcpi_rcv_ooopack = tp->t_rcvoopack; ti->tcpi_snd_zerowin = tp->t_sndzerowin; } int tcp_ctloutput(int op, struct socket *so, struct sockopt *sopt) { int error = 0, s; struct inpcb *inp; struct tcpcb *tp; struct tcp_info ti; u_int ui; int family; /* family of the socket */ int level, optname, optval; level = sopt->sopt_level; optname = sopt->sopt_name; family = so->so_proto->pr_domain->dom_family; s = splsoftnet(); inp = sotoinpcb(so); if (inp == NULL) { splx(s); return ECONNRESET; } if (level != IPPROTO_TCP) { switch (family) { case PF_INET: error = ip_ctloutput(op, so, sopt); break; #ifdef INET6 case PF_INET6: error = ip6_ctloutput(op, so, sopt); break; #endif } splx(s); return error; } tp = intotcpcb(inp); switch (op) { case PRCO_SETOPT: switch (optname) { #ifdef TCP_SIGNATURE case TCP_MD5SIG: error = sockopt_getint(sopt, &optval); if (error) break; if (optval > 0) tp->t_flags |= TF_SIGNATURE; else tp->t_flags &= ~TF_SIGNATURE; break; #endif /* TCP_SIGNATURE */ case TCP_NODELAY: error = sockopt_getint(sopt, &optval); if (error) break; if (optval) tp->t_flags |= TF_NODELAY; else tp->t_flags &= ~TF_NODELAY; break; case TCP_MAXSEG: error = sockopt_getint(sopt, &optval); if (error) break; if (optval > 0 && optval <= tp->t_peermss) tp->t_peermss = optval; /* limit on send size */ else error = EINVAL; break; #ifdef notyet case TCP_CONGCTL: /* XXX string overflow XXX */ error = tcp_congctl_select(tp, sopt->sopt_data); break; #endif case TCP_KEEPIDLE: error = sockopt_get(sopt, &ui, sizeof(ui)); if (error) break; if (ui > 0 && ui <= TCP_TIMER_MAXTICKS) { tp->t_keepidle = ui; change_keepalive(so, tp); } else error = EINVAL; break; case TCP_KEEPINTVL: error = sockopt_get(sopt, &ui, sizeof(ui)); if (error) break; if (ui > 0 && ui <= TCP_TIMER_MAXTICKS) { tp->t_keepintvl = ui; change_keepalive(so, tp); } else error = EINVAL; break; case TCP_KEEPCNT: error = sockopt_get(sopt, &ui, sizeof(ui)); if (error) break; if (ui > 0 && ui <= TCP_TIMER_MAXTICKS) { tp->t_keepcnt = ui; change_keepalive(so, tp); 
} else error = EINVAL; break; case TCP_KEEPINIT: error = sockopt_get(sopt, &ui, sizeof(ui)); if (error) break; if (ui > 0 && ui <= TCP_TIMER_MAXTICKS) { tp->t_keepinit = ui; change_keepalive(so, tp); } else error = EINVAL; break; default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { #ifdef TCP_SIGNATURE case TCP_MD5SIG: optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; goto setval; #endif case TCP_NODELAY: optval = tp->t_flags & TF_NODELAY; goto setval; case TCP_MAXSEG: optval = tp->t_peermss; goto setval; case TCP_INFO: tcp_fill_info(tp, &ti); error = sockopt_set(sopt, &ti, sizeof ti); break; #ifdef notyet case TCP_CONGCTL: break; #endif case TCP_KEEPIDLE: optval = tp->t_keepidle; goto setval; case TCP_KEEPINTVL: optval = tp->t_keepintvl; goto setval; case TCP_KEEPCNT: optval = tp->t_keepcnt; goto setval; case TCP_KEEPINIT: optval = tp->t_keepinit; setval: error = sockopt_set(sopt, &optval, sizeof(optval)); break; default: error = ENOPROTOOPT; break; } break; } splx(s); return error; } #ifndef TCP_SENDSPACE #define TCP_SENDSPACE 1024*32 #endif int tcp_sendspace = TCP_SENDSPACE; #ifndef TCP_RECVSPACE #define TCP_RECVSPACE 1024*32 #endif int tcp_recvspace = TCP_RECVSPACE; /* * tcp_attach: attach TCP protocol to socket, allocating internet protocol * control block, TCP control block, buffer space and entering LISTEN state * if to accept connections. */ static int tcp_attach(struct socket *so, int proto) { struct tcpcb *tp; struct inpcb *inp; int s, error, family; /* Assign the lock (must happen even if we will error out). */ s = splsoftnet(); sosetlock(so); KASSERT(solocked(so)); KASSERT(sotoinpcb(so) == NULL); inp = sotoinpcb(so); KASSERT(inp == NULL); family = soaf(so); #ifdef MBUFTRACE so->so_mowner = &tcp_sock_mowner; so->so_rcv.sb_mowner = &tcp_sock_rx_mowner; so->so_snd.sb_mowner = &tcp_sock_tx_mowner; #endif if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, tcp_sendspace, tcp_recvspace); if (error) goto out; } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; error = inpcb_create(so, &tcbtable); if (error) goto out; inp = sotoinpcb(so); tp = tcp_newtcpcb(family, inp); if (tp == NULL) { int nofd = so->so_state & SS_NOFDREF; /* XXX */ so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ inpcb_destroy(inp); so->so_state |= nofd; error = ENOBUFS; goto out; } tp->t_state = TCPS_CLOSED; if ((so->so_options & SO_LINGER) && so->so_linger == 0) { so->so_linger = TCP_LINGERTIME; } out: KASSERT(solocked(so)); splx(s); return error; } static void tcp_detach(struct socket *so) { struct inpcb *inp; struct tcpcb *tp; int s; inp = sotoinpcb(so); if (inp == NULL) return; tp = intotcpcb(inp); s = splsoftnet(); (void)tcp_disconnect1(tp); splx(s); } static int tcp_accept(struct socket *so, struct sockaddr *nam) { struct inpcb *inp; struct tcpcb *tp; int ostate = 0; int s; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_ACCEPT); /* * Accept a connection. Essentially all the work is * done at higher levels; just return the address * of the peer, storing through addr. 
*/ s = splsoftnet(); if (inp->inp_af == AF_INET) { inpcb_fetch_peeraddr(inp, (struct sockaddr_in *)nam); } #ifdef INET6 else if (inp->inp_af == AF_INET6) { in6pcb_fetch_peeraddr(inp, (struct sockaddr_in6 *)nam); } #endif tcp_debug_trace(so, tp, ostate, PRU_ACCEPT); splx(s); return 0; } static int tcp_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct inpcb *inp = NULL; struct sockaddr_in *sin = (struct sockaddr_in *)nam; #ifdef INET6 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; #endif /* INET6 */ struct tcpcb *tp; int s; int error = 0; int ostate = 0; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_BIND); /* * Give the socket an address. */ s = splsoftnet(); switch (so->so_proto->pr_domain->dom_family) { case PF_INET: error = inpcb_bind(inp, sin, l); break; #ifdef INET6 case PF_INET6: error = in6pcb_bind(inp, sin6, l); if (!error) { /* mapped addr case */ if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) tp->t_family = AF_INET; else tp->t_family = AF_INET6; } break; #endif } tcp_debug_trace(so, tp, ostate, PRU_BIND); splx(s); return error; } static int tcp_listen(struct socket *so, struct lwp *l) { struct inpcb *inp; struct tcpcb *tp; int error = 0; int ostate = 0; int s; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_LISTEN); /* * Prepare to accept connections. */ s = splsoftnet(); if (inp->inp_af == AF_INET && inp->inp_lport == 0) { error = inpcb_bind(inp, NULL, l); if (error) goto release; } #ifdef INET6 if (inp->inp_af == AF_INET6 && inp->inp_lport == 0) { error = in6pcb_bind(inp, NULL, l); if (error) goto release; } #endif tp->t_state = TCPS_LISTEN; release: tcp_debug_trace(so, tp, ostate, PRU_LISTEN); splx(s); return error; } static int tcp_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct inpcb *inp; struct tcpcb *tp; int s; int error = 0; int ostate = 0; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_CONNECT); /* * Initiate connection to peer. * Create a template for use in transmissions on this connection. * Enter SYN_SENT state, and mark socket as connecting. * Start keep-alive timer, and seed output sequence space. * Send initial segment on connection. */ s = splsoftnet(); if (inp->inp_af == AF_INET) { if (inp->inp_lport == 0) { error = inpcb_bind(inp, NULL, l); if (error) goto release; } error = inpcb_connect(inp, (struct sockaddr_in *)nam, l); } #ifdef INET6 if (inp->inp_af == AF_INET6) { if (inp->inp_lport == 0) { error = in6pcb_bind(inp, NULL, l); if (error) goto release; } error = in6pcb_connect(inp, (struct sockaddr_in6 *)nam, l); if (!error) { /* mapped addr case */ if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) tp->t_family = AF_INET; else tp->t_family = AF_INET6; } } #endif if (error) goto release; tp->t_template = tcp_template(tp); if (tp->t_template == 0) { if (inp->inp_af == AF_INET) inpcb_disconnect(inp); #ifdef INET6 else if (inp->inp_af == AF_INET6) in6pcb_disconnect(inp); #endif error = ENOBUFS; goto release; } /* * Compute window scaling to request. * XXX: This should be moved to tcp_output(). 
*/ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(so); TCP_STATINC(TCP_STAT_CONNATTEMPT); tp->t_state = TCPS_SYN_SENT; TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit); tp->iss = tcp_new_iss(tp); tcp_sendseqinit(tp); error = tcp_output(tp); release: tcp_debug_trace(so, tp, ostate, PRU_CONNECT); splx(s); return error; } static int tcp_connect2(struct socket *so, struct socket *so2) { struct inpcb *inp; struct tcpcb *tp; int ostate = 0; KASSERT(solocked(so)); inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_CONNECT2); tcp_debug_trace(so, tp, ostate, PRU_CONNECT2); return EOPNOTSUPP; } static int tcp_disconnect(struct socket *so) { struct inpcb *inp; struct tcpcb *tp; int error = 0; int ostate = 0; int s; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_DISCONNECT); /* * Initiate disconnect from peer. * If connection never passed embryonic stage, just drop; * else if don't need to let data drain, then can just drop anyways, * else have to begin TCP shutdown process: mark socket disconnecting, * drain unread data, state switch to reflect user close, and * send segment (e.g. FIN) to peer. Socket will be really disconnected * when peer sends FIN and acks ours. * * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. */ s = splsoftnet(); tp = tcp_disconnect1(tp); tcp_debug_trace(so, tp, ostate, PRU_DISCONNECT); splx(s); return error; } static int tcp_shutdown(struct socket *so) { struct inpcb *inp; struct tcpcb *tp; int error = 0; int ostate = 0; int s; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_SHUTDOWN); /* * Mark the connection as being incapable of further output. */ s = splsoftnet(); socantsendmore(so); tp = tcp_usrclosed(tp); if (tp) error = tcp_output(tp); tcp_debug_trace(so, tp, ostate, PRU_SHUTDOWN); splx(s); return error; } static int tcp_abort(struct socket *so) { struct inpcb *inp; struct tcpcb *tp; int error = 0; int ostate = 0; int s; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_ABORT); /* * Abort the TCP. */ s = splsoftnet(); tp = tcp_drop(tp, ECONNABORTED); tcp_debug_trace(so, tp, ostate, PRU_ABORT); splx(s); return error; } static int tcp_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp) { switch (so->so_proto->pr_domain->dom_family) { case PF_INET: return in_control(so, cmd, nam, ifp); #ifdef INET6 case PF_INET6: return in6_control(so, cmd, nam, ifp); #endif default: return EAFNOSUPPORT; } } static int tcp_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); /* stat: don't bother with a blocksize. 
*/ return 0; } static int tcp_peeraddr(struct socket *so, struct sockaddr *nam) { struct inpcb *inp; struct tcpcb *tp; int ostate = 0; int s; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_PEERADDR); s = splsoftnet(); if (inp->inp_af == AF_INET) { inpcb_fetch_peeraddr(inp, (struct sockaddr_in *)nam); } #ifdef INET6 else if (inp->inp_af == AF_INET6) { in6pcb_fetch_peeraddr(inp, (struct sockaddr_in6 *)nam); } #endif tcp_debug_trace(so, tp, ostate, PRU_PEERADDR); splx(s); return 0; } static int tcp_sockaddr(struct socket *so, struct sockaddr *nam) { struct inpcb *inp; struct tcpcb *tp; int ostate = 0; int s; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_SOCKADDR); s = splsoftnet(); if (inp->inp_af == AF_INET) { inpcb_fetch_sockaddr(inp, (struct sockaddr_in *)nam); } #ifdef INET6 if (inp->inp_af == AF_INET6) { in6pcb_fetch_sockaddr(inp, (struct sockaddr_in6 *)nam); } #endif tcp_debug_trace(so, tp, ostate, PRU_SOCKADDR); splx(s); return 0; } static int tcp_rcvd(struct socket *so, int flags, struct lwp *l) { struct inpcb *inp; struct tcpcb *tp; int ostate = 0; int s; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_RCVD); /* * After a receive, possibly send window update to peer. * * soreceive() calls this function when a user receives * ancillary data on a listening socket. We don't call * tcp_output in such a case, since there is no header * template for a listening socket and hence the kernel * will panic. */ s = splsoftnet(); if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) (void) tcp_output(tp); splx(s); tcp_debug_trace(so, tp, ostate, PRU_RCVD); return 0; } static int tcp_recvoob(struct socket *so, struct mbuf *m, int flags) { struct inpcb *inp; struct tcpcb *tp; int ostate = 0; int s; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_RCVOOB); s = splsoftnet(); if ((so->so_oobmark == 0 && (so->so_state & SS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || tp->t_oobflags & TCPOOB_HADDATA) { splx(s); return EINVAL; } if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { splx(s); return EWOULDBLOCK; } m->m_len = 1; *mtod(m, char *) = tp->t_iobc; if ((flags & MSG_PEEK) == 0) { tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); so->so_state &= ~SS_POLLRDBAND; } tcp_debug_trace(so, tp, ostate, PRU_RCVOOB); splx(s); return 0; } static int tcp_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct inpcb *inp; struct tcpcb *tp; int ostate = 0; int error = 0; int s; inp = sotoinpcb(so); if (inp == NULL) return EINVAL; tp = intotcpcb(inp); ostate = tcp_debug_capture(tp, PRU_SEND); /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. 
*/ s = splsoftnet(); if (control && control->m_len) { m_freem(control); m_freem(m); tcp_debug_trace(so, tp, ostate, PRU_SEND); splx(s); return EINVAL; } sbappendstream(&so->so_snd, m); error = tcp_output(tp); tcp_debug_trace(so, tp, ostate, PRU_SEND); splx(s); return error; } static int tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { struct inpcb *inp = NULL; struct tcpcb *tp = NULL; int ostate = 0; int error = 0; int s; inp = sotoinpcb(so); if (inp == NULL) { m_freem(m); m_freem(control); return EINVAL; } tp = intotcpcb(inp); if (tp->t_template == NULL) { /* * XXX FreeBSD appears to open the connection * automagically in this case, but the socket address * isn't passed through here so we can't do that. */ m_freem(m); m_freem(control); return ENOTCONN; } ostate = tcp_debug_capture(tp, PRU_SENDOOB); s = splsoftnet(); if (sbspace_oob(&so->so_snd) == 0) { m_freem(m); m_freem(control); splx(s); return ENOBUFS; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ sbappendstream(&so->so_snd, m); tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_force = 1; error = tcp_output(tp); tp->t_force = 0; tcp_debug_trace(so, tp, ostate, PRU_SENDOOB); splx(s); m_freem(control); return error; } static int tcp_purgeif(struct socket *so, struct ifnet *ifp) { int s; int error = 0; s = splsoftnet(); mutex_enter(softnet_lock); switch (so->so_proto->pr_domain->dom_family) { case PF_INET: inpcb_purgeif0(&tcbtable, ifp); #ifdef NET_MPSAFE mutex_exit(softnet_lock); #endif in_purgeif(ifp); #ifdef NET_MPSAFE mutex_enter(softnet_lock); #endif inpcb_purgeif(&tcbtable, ifp); break; #ifdef INET6 case PF_INET6: in6pcb_purgeif0(&tcbtable, ifp); #ifdef NET_MPSAFE mutex_exit(softnet_lock); #endif in6_purgeif(ifp); #ifdef NET_MPSAFE mutex_enter(softnet_lock); #endif in6pcb_purgeif(&tcbtable, ifp); break; #endif default: error = EAFNOSUPPORT; break; } mutex_exit(softnet_lock); splx(s); return error; } /* * Initiate (or continue) disconnect. * If embryonic state, just send reset (once). * If in ``let data drain'' option and linger null, just drop. * Otherwise (hard), mark socket disconnecting and drop * current input data; switch states based on user close, and * send segment to peer (with FIN). */ struct tcpcb * tcp_disconnect1(struct tcpcb *tp) { struct socket *so; so = tp->t_inpcb->inp_socket; if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) tp = tcp_close(tp); else if ((so->so_options & SO_LINGER) && so->so_linger == 0) tp = tcp_drop(tp, 0); else { soisdisconnecting(so); sbflush(&so->so_rcv); tp = tcp_usrclosed(tp); if (tp) (void) tcp_output(tp); } return tp; } /* * User issued close, and wish to trail through shutdown states: * if never received SYN, just forget it. If got a SYN from peer, * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. * If already got a FIN from peer, then almost done; go to LAST_ACK * state. In all other cases, have already sent FIN to peer (e.g. * after PRU_SHUTDOWN), and just have to play tedious game waiting * for peer to send FIN or not respond to keep-alives, etc. * We can let the user exit from the close as soon as the FIN is acked. 
*/ struct tcpcb * tcp_usrclosed(struct tcpcb *tp) { switch (tp->t_state) { case TCPS_CLOSED: case TCPS_LISTEN: case TCPS_SYN_SENT: tp->t_state = TCPS_CLOSED; tp = tcp_close(tp); break; case TCPS_SYN_RECEIVED: case TCPS_ESTABLISHED: tp->t_state = TCPS_FIN_WAIT_1; break; case TCPS_CLOSE_WAIT: tp->t_state = TCPS_LAST_ACK; break; } if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { struct socket *so = tp->t_inpcb->inp_socket; if (so) soisdisconnected(so); /* * If we are in FIN_WAIT_2, we arrived here because the * application did a shutdown of the send side. Like the * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after * a full close, we start a timer to make sure sockets are * not left in FIN_WAIT_2 forever. */ if ((tp->t_state == TCPS_FIN_WAIT_2) && (tp->t_maxidle > 0)) TCP_TIMER_ARM(tp, TCPT_2MSL, tp->t_maxidle); else if (tp->t_state == TCPS_TIME_WAIT && ((tp->t_inpcb->inp_af == AF_INET && (tcp4_vtw_enable & 1) && vtw_add(AF_INET, tp)) || (tp->t_inpcb->inp_af == AF_INET6 && (tcp6_vtw_enable & 1) && vtw_add(AF_INET6, tp)))) { tp = 0; } } return tp; } /* * sysctl helper routine for net.inet.ip.mssdflt. it can't be less * than 32. */ static int sysctl_net_inet_tcp_mssdflt(SYSCTLFN_ARGS) { int error, mssdflt; struct sysctlnode node; mssdflt = tcp_mssdflt; node = *rnode; node.sysctl_data = &mssdflt; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (mssdflt < 32) return EINVAL; tcp_mssdflt = mssdflt; mutex_enter(softnet_lock); tcp_tcpcb_template(); mutex_exit(softnet_lock); return 0; } /* * sysctl helper for TCP CB template update */ static int sysctl_update_tcpcb_template(SYSCTLFN_ARGS) { int t, error; struct sysctlnode node; /* follow procedures in sysctl(9) manpage */ t = *(int *)rnode->sysctl_data; node = *rnode; node.sysctl_data = &t; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (t < 0) return EINVAL; *(int *)rnode->sysctl_data = t; mutex_enter(softnet_lock); tcp_tcpcb_template(); mutex_exit(softnet_lock); return 0; } /* * sysctl helper routine for setting port related values under * net.inet.ip and net.inet6.ip6. does basic range checking and does * additional checks for each type. this code has placed in * tcp_input.c since INET and INET6 both use the same tcp code. * * this helper is not static so that both inet and inet6 can use it. 
*/ int sysctl_net_inet_ip_ports(SYSCTLFN_ARGS) { int error, tmp; int apmin, apmax; #ifndef IPNOPRIVPORTS int lpmin, lpmax; #endif /* IPNOPRIVPORTS */ struct sysctlnode node; if (namelen != 0) return EINVAL; switch (name[-3]) { case PF_INET: apmin = anonportmin; apmax = anonportmax; #ifndef IPNOPRIVPORTS lpmin = lowportmin; lpmax = lowportmax; #endif /* IPNOPRIVPORTS */ break; #ifdef INET6 case PF_INET6: apmin = ip6_anonportmin; apmax = ip6_anonportmax; #ifndef IPNOPRIVPORTS lpmin = ip6_lowportmin; lpmax = ip6_lowportmax; #endif /* IPNOPRIVPORTS */ break; #endif /* INET6 */ default: return EINVAL; } /* * insert temporary copy into node, perform lookup on * temporary, then restore pointer */ node = *rnode; tmp = *(int*)rnode->sysctl_data; node.sysctl_data = &tmp; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; /* * simple port range check */ if (tmp < 0 || tmp > 65535) return EINVAL; /* * per-node range checks */ switch (rnode->sysctl_num) { case IPCTL_ANONPORTMIN: case IPV6CTL_ANONPORTMIN: if (tmp >= apmax) return EINVAL; #ifndef IPNOPRIVPORTS if (tmp < IPPORT_RESERVED) return EINVAL; #endif /* IPNOPRIVPORTS */ break; case IPCTL_ANONPORTMAX: case IPV6CTL_ANONPORTMAX: if (apmin >= tmp) return EINVAL; #ifndef IPNOPRIVPORTS if (tmp < IPPORT_RESERVED) return EINVAL; #endif /* IPNOPRIVPORTS */ break; #ifndef IPNOPRIVPORTS case IPCTL_LOWPORTMIN: case IPV6CTL_LOWPORTMIN: if (tmp >= lpmax || tmp > IPPORT_RESERVEDMAX || tmp < IPPORT_RESERVEDMIN) return EINVAL; break; case IPCTL_LOWPORTMAX: case IPV6CTL_LOWPORTMAX: if (lpmin >= tmp || tmp > IPPORT_RESERVEDMAX || tmp < IPPORT_RESERVEDMIN) return EINVAL; break; #endif /* IPNOPRIVPORTS */ default: return EINVAL; } *(int*)rnode->sysctl_data = tmp; return 0; } static inline int copyout_uid(struct socket *sockp, void *oldp, size_t *oldlenp) { if (oldp) { size_t sz; uid_t uid; int error; if (sockp->so_cred == NULL) return EPERM; uid = kauth_cred_geteuid(sockp->so_cred); sz = MIN(sizeof(uid), *oldlenp); if ((error = copyout(&uid, oldp, sz)) != 0) return error; } *oldlenp = sizeof(uid_t); return 0; } static inline int inet4_ident_core(struct in_addr raddr, u_int rport, struct in_addr laddr, u_int lport, void *oldp, size_t *oldlenp, struct lwp *l, int dodrop) { struct inpcb *inp; struct socket *sockp; inp = inpcb_lookup(&tcbtable, raddr, rport, laddr, lport, 0); if (inp == NULL || (sockp = inp->inp_socket) == NULL) return ESRCH; if (dodrop) { struct tcpcb *tp; int error; if (inp == NULL || (tp = intotcpcb(inp)) == NULL || (inp->inp_socket->so_options & SO_ACCEPTCONN) != 0) return ESRCH; error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_DROP, inp->inp_socket, tp, NULL); if (error) return error; (void)tcp_drop(tp, ECONNABORTED); return 0; } return copyout_uid(sockp, oldp, oldlenp); } #ifdef INET6 static inline int inet6_ident_core(struct in6_addr *raddr, u_int rport, struct in6_addr *laddr, u_int lport, void *oldp, size_t *oldlenp, struct lwp *l, int dodrop) { struct inpcb *inp; struct socket *sockp; inp = in6pcb_lookup(&tcbtable, raddr, rport, laddr, lport, 0, 0); if (inp == NULL || (sockp = inp->inp_socket) == NULL) return ESRCH; if (dodrop) { struct tcpcb *tp; int error; if (inp == NULL || (tp = intotcpcb(inp)) == NULL || (inp->inp_socket->so_options & SO_ACCEPTCONN) != 0) return ESRCH; error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_DROP, inp->inp_socket, tp, NULL); if (error) return error; (void)tcp_drop(tp, ECONNABORTED); return 0; } 
return copyout_uid(sockp, oldp, oldlenp); } #endif /* * sysctl helper routine for the net.inet.tcp.drop and * net.inet6.tcp6.drop nodes. */ #define sysctl_net_inet_tcp_drop sysctl_net_inet_tcp_ident /* * sysctl helper routine for the net.inet.tcp.ident and * net.inet6.tcp6.ident nodes. contains backwards compat code for the * old way of looking up the ident information for ipv4 which involves * stuffing the port/addr pairs into the mib lookup. */ static int sysctl_net_inet_tcp_ident(SYSCTLFN_ARGS) { struct sockaddr_in *si4[2]; #ifdef INET6 struct sockaddr_in6 *si6[2]; #endif struct sockaddr_storage sa[2]; int error, pf, dodrop; dodrop = name[-1] == TCPCTL_DROP; if (dodrop) { if (oldp != NULL || *oldlenp != 0) return EINVAL; if (newp == NULL) return EPERM; if (newlen < sizeof(sa)) return ENOMEM; } if (namelen != 4 && namelen != 0) return EINVAL; if (name[-2] != IPPROTO_TCP) return EINVAL; pf = name[-3]; /* old style lookup, ipv4 only */ if (namelen == 4) { struct in_addr laddr, raddr; u_int lport, rport; if (pf != PF_INET) return EPROTONOSUPPORT; raddr.s_addr = (uint32_t)name[0]; rport = (u_int)name[1]; laddr.s_addr = (uint32_t)name[2]; lport = (u_int)name[3]; mutex_enter(softnet_lock); error = inet4_ident_core(raddr, rport, laddr, lport, oldp, oldlenp, l, dodrop); mutex_exit(softnet_lock); return error; } if (newp == NULL || newlen != sizeof(sa)) return EINVAL; error = copyin(newp, &sa, newlen); if (error) return error; /* * requested families must match */ if (pf != sa[0].ss_family || sa[0].ss_family != sa[1].ss_family) return EINVAL; switch (pf) { #ifdef INET6 case PF_INET6: si6[0] = (struct sockaddr_in6*)&sa[0]; si6[1] = (struct sockaddr_in6*)&sa[1]; if (si6[0]->sin6_len != sizeof(*si6[0]) || si6[1]->sin6_len != sizeof(*si6[1])) return EINVAL; if (!IN6_IS_ADDR_V4MAPPED(&si6[0]->sin6_addr) && !IN6_IS_ADDR_V4MAPPED(&si6[1]->sin6_addr)) { error = sa6_embedscope(si6[0], ip6_use_defzone); if (error) return error; error = sa6_embedscope(si6[1], ip6_use_defzone); if (error) return error; mutex_enter(softnet_lock); error = inet6_ident_core(&si6[0]->sin6_addr, si6[0]->sin6_port, &si6[1]->sin6_addr, si6[1]->sin6_port, oldp, oldlenp, l, dodrop); mutex_exit(softnet_lock); return error; } if (IN6_IS_ADDR_V4MAPPED(&si6[0]->sin6_addr) != IN6_IS_ADDR_V4MAPPED(&si6[1]->sin6_addr)) return EINVAL; in6_sin6_2_sin_in_sock((struct sockaddr *)&sa[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&sa[1]); #endif /* INET6 */ /*FALLTHROUGH*/ case PF_INET: si4[0] = (struct sockaddr_in*)&sa[0]; si4[1] = (struct sockaddr_in*)&sa[1]; if (si4[0]->sin_len != sizeof(*si4[0]) || si4[0]->sin_len != sizeof(*si4[1])) return EINVAL; mutex_enter(softnet_lock); error = inet4_ident_core(si4[0]->sin_addr, si4[0]->sin_port, si4[1]->sin_addr, si4[1]->sin_port, oldp, oldlenp, l, dodrop); mutex_exit(softnet_lock); return error; default: return EPROTONOSUPPORT; } } /* * sysctl helper for the inet and inet6 pcblists. handles tcp/udp and * inet/inet6, as well as raw pcbs for each. specifically not * declared static so that raw sockets and udp/udp6 can use it as * well. 
*/ int sysctl_inpcblist(SYSCTLFN_ARGS) { const bool allowaddr = get_expose_address(curproc); struct sockaddr_in *in; const struct inpcb *inp; #ifdef INET6 struct sockaddr_in6 *in6; #endif struct inpcbtable *pcbtbl = __UNCONST(rnode->sysctl_data); struct tcpcb *tp; struct kinfo_pcb pcb; char *dp; size_t len, needed, elem_size, out_size; int error, elem_count, pf, proto, pf2; if (namelen != 4) return EINVAL; if (oldp != NULL) { len = *oldlenp; elem_size = name[2]; elem_count = name[3]; if (elem_size != sizeof(pcb)) return EINVAL; } else { len = 0; elem_count = INT_MAX; elem_size = sizeof(pcb); } error = 0; dp = oldp; out_size = elem_size; needed = 0; if (namelen == 1 && name[0] == CTL_QUERY) return (sysctl_query(SYSCTLFN_CALL(rnode))); if (name - oname != 4) return EINVAL; pf = oname[1]; proto = oname[2]; pf2 = (oldp != NULL) ? pf : 0; mutex_enter(softnet_lock); TAILQ_FOREACH(inp, &pcbtbl->inpt_queue, inp_queue) { if (inp->inp_af != pf) continue; if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_CANSEE, inp->inp_socket, NULL, NULL) != 0) continue; memset(&pcb, 0, sizeof(pcb)); pcb.ki_family = pf; pcb.ki_type = proto; switch (pf2) { case 0: /* just probing for size */ break; case PF_INET: pcb.ki_family = inp->inp_socket->so_proto-> pr_domain->dom_family; pcb.ki_type = inp->inp_socket->so_proto-> pr_type; pcb.ki_protocol = inp->inp_socket->so_proto-> pr_protocol; pcb.ki_pflags = inp->inp_flags; pcb.ki_sostate = inp->inp_socket->so_state; pcb.ki_prstate = inp->inp_state; if (proto == IPPROTO_TCP) { tp = intotcpcb(inp); pcb.ki_tstate = tp->t_state; pcb.ki_tflags = tp->t_flags; } COND_SET_VALUE(pcb.ki_pcbaddr, PTRTOUINT64(inp), allowaddr); COND_SET_VALUE(pcb.ki_ppcbaddr, PTRTOUINT64(inp->inp_ppcb), allowaddr); COND_SET_VALUE(pcb.ki_sockaddr, PTRTOUINT64(inp->inp_socket), allowaddr); pcb.ki_rcvq = inp->inp_socket->so_rcv.sb_cc; pcb.ki_sndq = inp->inp_socket->so_snd.sb_cc; in = satosin(&pcb.ki_src); in->sin_len = sizeof(*in); in->sin_family = pf; in->sin_port = inp->inp_lport; in->sin_addr = const_in4p_laddr(inp); if (pcb.ki_prstate >= INP_CONNECTED) { in = satosin(&pcb.ki_dst); in->sin_len = sizeof(*in); in->sin_family = pf; in->sin_port = inp->inp_fport; in->sin_addr = const_in4p_faddr(inp); } break; #ifdef INET6 case PF_INET6: pcb.ki_family = inp->inp_socket->so_proto-> pr_domain->dom_family; pcb.ki_type = inp->inp_socket->so_proto->pr_type; pcb.ki_protocol = inp->inp_socket->so_proto-> pr_protocol; pcb.ki_pflags = inp->inp_flags; pcb.ki_sostate = inp->inp_socket->so_state; pcb.ki_prstate = inp->inp_state; if (proto == IPPROTO_TCP) { tp = intotcpcb(inp); pcb.ki_tstate = tp->t_state; pcb.ki_tflags = tp->t_flags; } COND_SET_VALUE(pcb.ki_pcbaddr, PTRTOUINT64(inp), allowaddr); COND_SET_VALUE(pcb.ki_ppcbaddr, PTRTOUINT64(inp->inp_ppcb), allowaddr); COND_SET_VALUE(pcb.ki_sockaddr, PTRTOUINT64(inp->inp_socket), allowaddr); pcb.ki_rcvq = inp->inp_socket->so_rcv.sb_cc; pcb.ki_sndq = inp->inp_socket->so_snd.sb_cc; in6 = satosin6(&pcb.ki_src); in6->sin6_len = sizeof(*in6); in6->sin6_family = pf; in6->sin6_port = inp->inp_lport; in6->sin6_flowinfo = const_in6p_flowinfo(inp); in6->sin6_addr = const_in6p_laddr(inp); in6->sin6_scope_id = 0; /* XXX? */ if (pcb.ki_prstate >= INP_CONNECTED) { in6 = satosin6(&pcb.ki_dst); in6->sin6_len = sizeof(*in6); in6->sin6_family = pf; in6->sin6_port = inp->inp_fport; in6->sin6_flowinfo = const_in6p_flowinfo(inp); in6->sin6_addr = const_in6p_faddr(inp); in6->sin6_scope_id = 0; /* XXX? 
*/ } break; #endif } if (len >= elem_size && elem_count > 0) { error = copyout(&pcb, dp, out_size); if (error) { mutex_exit(softnet_lock); return error; } dp += elem_size; len -= elem_size; } needed += elem_size; if (elem_count > 0 && elem_count != INT_MAX) elem_count--; } *oldlenp = needed; if (oldp == NULL) *oldlenp += PCB_SLOP * sizeof(struct kinfo_pcb); mutex_exit(softnet_lock); return error; } static int sysctl_tcp_congctl(SYSCTLFN_ARGS) { struct sysctlnode node; int error; char newname[TCPCC_MAXLEN]; strlcpy(newname, tcp_congctl_global_name, sizeof(newname) - 1); node = *rnode; node.sysctl_data = newname; node.sysctl_size = sizeof(newname); error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL || strncmp(newname, tcp_congctl_global_name, sizeof(newname)) == 0) return error; mutex_enter(softnet_lock); error = tcp_congctl_select(NULL, newname); mutex_exit(softnet_lock); return error; } static int sysctl_tcp_init_win(SYSCTLFN_ARGS) { int error; u_int iw; struct sysctlnode node; iw = *(u_int *)rnode->sysctl_data; node = *rnode; node.sysctl_data = &iw; node.sysctl_size = sizeof(iw); error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (iw >= __arraycount(tcp_init_win_max)) return EINVAL; *(u_int *)rnode->sysctl_data = iw; return 0; } static int sysctl_tcp_keep(SYSCTLFN_ARGS) { int error; u_int tmp; struct sysctlnode node; node = *rnode; tmp = *(u_int *)rnode->sysctl_data; node.sysctl_data = &tmp; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (!(tmp > 0 && tmp <= TCP_TIMER_MAXTICKS)) return EINVAL; mutex_enter(softnet_lock); *(u_int *)rnode->sysctl_data = tmp; tcp_tcpcb_template(); /* update the template */ mutex_exit(softnet_lock); return 0; } static int sysctl_net_inet_tcp_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(tcpstat_percpu, TCP_NSTATS)); } /* * this (second stage) setup routine is a replacement for tcp_sysctl() * (which is currently used for ipv4 and ipv6) */ static void sysctl_net_inet_tcp_setup2(struct sysctllog **clog, int pf, const char *pfname, const char *tcpname) { const struct sysctlnode *sack_node; const struct sysctlnode *abc_node; const struct sysctlnode *ecn_node; const struct sysctlnode *congctl_node; const struct sysctlnode *mslt_node; const struct sysctlnode *vtw_node; #ifdef TCP_DEBUG extern struct tcp_debug tcp_debug[TCP_NDEBUG]; extern int tcp_debx; #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, pfname, NULL, NULL, 0, NULL, 0, CTL_NET, pf, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, tcpname, SYSCTL_DESCR("TCP related settings"), NULL, 0, NULL, 0, CTL_NET, pf, IPPROTO_TCP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "rfc1323", SYSCTL_DESCR("Enable RFC1323 TCP extensions"), sysctl_update_tcpcb_template, 0, &tcp_do_rfc1323, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_RFC1323, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "sendspace", SYSCTL_DESCR("Default TCP send buffer size"), NULL, 0, &tcp_sendspace, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_SENDSPACE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "recvspace", SYSCTL_DESCR("Default TCP receive buffer size"), NULL, 0, &tcp_recvspace, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_RECVSPACE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "mssdflt", SYSCTL_DESCR("Default maximum 
segment size"), sysctl_net_inet_tcp_mssdflt, 0, &tcp_mssdflt, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_MSSDFLT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "minmss", SYSCTL_DESCR("Lower limit for TCP maximum segment size"), NULL, 0, &tcp_minmss, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "msl", SYSCTL_DESCR("Maximum Segment Life"), NULL, 0, &tcp_msl, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_MSL, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "syn_cache_limit", SYSCTL_DESCR("Maximum number of entries in the TCP " "compressed state engine"), NULL, 0, &tcp_syn_cache_limit, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_SYN_CACHE_LIMIT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "syn_bucket_limit", SYSCTL_DESCR("Maximum number of entries per hash " "bucket in the TCP compressed state " "engine"), NULL, 0, &tcp_syn_bucket_limit, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_SYN_BUCKET_LIMIT, CTL_EOL); #if 0 /* obsoleted */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "syn_cache_interval", SYSCTL_DESCR("TCP compressed state engine's timer interval"), NULL, 0, &tcp_syn_cache_interval, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_SYN_CACHE_INTER, CTL_EOL); #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "init_win", SYSCTL_DESCR("Initial TCP congestion window"), sysctl_tcp_init_win, 0, &tcp_init_win, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_INIT_WIN, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "mss_ifmtu", SYSCTL_DESCR("Use interface MTU for calculating MSS"), NULL, 0, &tcp_mss_ifmtu, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_MSS_IFMTU, CTL_EOL); sysctl_createv(clog, 0, NULL, &sack_node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "sack", SYSCTL_DESCR("RFC2018 Selective ACKnowledgement tunables"), NULL, 0, NULL, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_EOL); /* Congctl subtree */ sysctl_createv(clog, 0, NULL, &congctl_node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "congctl", SYSCTL_DESCR("TCP Congestion Control"), NULL, 0, NULL, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &congctl_node, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "available", SYSCTL_DESCR("Available Congestion Control Mechanisms"), NULL, 0, tcp_congctl_avail, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &congctl_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRING, "selected", SYSCTL_DESCR("Selected Congestion Control Mechanism"), sysctl_tcp_congctl, 0, NULL, TCPCC_MAXLEN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "win_scale", SYSCTL_DESCR("Use RFC1323 window scale options"), sysctl_update_tcpcb_template, 0, &tcp_do_win_scale, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_WSCALE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "timestamps", SYSCTL_DESCR("Use RFC1323 time stamp options"), sysctl_update_tcpcb_template, 0, &tcp_do_timestamps, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_TSTAMP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "cwm", SYSCTL_DESCR("Hughes/Touch/Heidemann Congestion Window " "Monitoring"), NULL, 0, &tcp_cwm, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_CWM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, 
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "cwm_burstsize", SYSCTL_DESCR("Congestion Window Monitoring allowed " "burst count in packets"), NULL, 0, &tcp_cwm_burstsize, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_CWM_BURSTSIZE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ack_on_push", SYSCTL_DESCR("Immediately return ACK when PSH is " "received"), NULL, 0, &tcp_ack_on_push, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_ACK_ON_PUSH, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "keepidle", SYSCTL_DESCR("Allowed connection idle ticks before a " "keepalive probe is sent"), sysctl_tcp_keep, 0, &tcp_keepidle, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPIDLE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "keepintvl", SYSCTL_DESCR("Ticks before next keepalive probe is sent"), sysctl_tcp_keep, 0, &tcp_keepintvl, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPINTVL, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "keepcnt", SYSCTL_DESCR("Number of keepalive probes to send"), sysctl_tcp_keep, 0, &tcp_keepcnt, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPCNT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "slowhz", SYSCTL_DESCR("Keepalive ticks per second"), NULL, PR_SLOWHZ, NULL, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_SLOWHZ, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "log_refused", SYSCTL_DESCR("Log refused TCP connections"), NULL, 0, &tcp_log_refused, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_LOG_REFUSED, CTL_EOL); #if 0 /* obsoleted */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "rstratelimit", NULL, NULL, 0, &tcp_rst_ratelim, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_RSTRATELIMIT, CTL_EOL); #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "rstppslimit", SYSCTL_DESCR("Maximum number of RST packets to send " "per second"), NULL, 0, &tcp_rst_ppslim, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_RSTPPSLIMIT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "delack_ticks", SYSCTL_DESCR("Number of ticks to delay sending an ACK"), NULL, 0, &tcp_delack_ticks, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_DELACK_TICKS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "init_win_local", SYSCTL_DESCR("Initial TCP window size (in segments)"), sysctl_tcp_init_win, 0, &tcp_init_win_local, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_INIT_WIN_LOCAL, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRUCT, "ident", SYSCTL_DESCR("RFC1413 Identification Protocol lookups"), sysctl_net_inet_tcp_ident, 0, NULL, sizeof(uid_t), CTL_NET, pf, IPPROTO_TCP, TCPCTL_IDENT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "do_loopback_cksum", SYSCTL_DESCR("Perform TCP checksum on loopback"), NULL, 0, &tcp_do_loopback_cksum, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_LOOPBACKCKSUM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "pcblist", SYSCTL_DESCR("TCP protocol control block list"), sysctl_inpcblist, 0, &tcbtable, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "keepinit", SYSCTL_DESCR("Ticks before initial tcp 
connection times out"), sysctl_tcp_keep, 0, &tcp_keepinit, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); /* TCP socket buffers auto-sizing nodes */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "recvbuf_auto", SYSCTL_DESCR("Enable automatic receive " "buffer sizing (experimental)"), NULL, 0, &tcp_do_autorcvbuf, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "recvbuf_inc", SYSCTL_DESCR("Incrementor step size of " "automatic receive buffer"), NULL, 0, &tcp_autorcvbuf_inc, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "recvbuf_max", SYSCTL_DESCR("Max size of automatic receive buffer"), NULL, 0, &tcp_autorcvbuf_max, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "sendbuf_auto", SYSCTL_DESCR("Enable automatic send " "buffer sizing (experimental)"), NULL, 0, &tcp_do_autosndbuf, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "sendbuf_inc", SYSCTL_DESCR("Incrementor step size of " "automatic send buffer"), NULL, 0, &tcp_autosndbuf_inc, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "sendbuf_max", SYSCTL_DESCR("Max size of automatic send buffer"), NULL, 0, &tcp_autosndbuf_max, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); /* ECN subtree */ sysctl_createv(clog, 0, NULL, &ecn_node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ecn", SYSCTL_DESCR("RFC3168 Explicit Congestion Notification"), NULL, 0, NULL, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &ecn_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "enable", SYSCTL_DESCR("Enable TCP Explicit Congestion " "Notification"), NULL, 0, &tcp_do_ecn, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &ecn_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxretries", SYSCTL_DESCR("Number of times to retry ECN setup " "before disabling ECN on the connection"), NULL, 0, &tcp_ecn_maxretries, 0, CTL_CREATE, CTL_EOL); /* SACK gets its own little subtree. 
*/ sysctl_createv(clog, 0, NULL, &sack_node, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "enable", SYSCTL_DESCR("Enable RFC2018 Selective ACKnowledgement"), NULL, 0, &tcp_do_sack, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, &sack_node, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxholes", SYSCTL_DESCR("Maximum number of TCP SACK holes allowed per connection"), NULL, 0, &tcp_sack_tp_maxholes, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, &sack_node, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "globalmaxholes", SYSCTL_DESCR("Global maximum number of TCP SACK holes"), NULL, 0, &tcp_sack_globalmaxholes, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, &sack_node, CTLFLAG_PERMANENT, CTLTYPE_INT, "globalholes", SYSCTL_DESCR("Global number of TCP SACK holes"), NULL, 0, &tcp_sack_globalholes, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("TCP statistics"), sysctl_net_inet_tcp_stats, 0, NULL, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_STATS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "local_by_rtt", SYSCTL_DESCR("Use RTT estimator to decide which hosts " "are local"), NULL, 0, &tcp_rttlocal, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); #ifdef TCP_DEBUG sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "debug", SYSCTL_DESCR("TCP sockets debug information"), NULL, 0, &tcp_debug, sizeof(tcp_debug), CTL_NET, pf, IPPROTO_TCP, TCPCTL_DEBUG, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "debx", SYSCTL_DESCR("Number of TCP debug sockets messages"), NULL, 0, &tcp_debx, sizeof(tcp_debx), CTL_NET, pf, IPPROTO_TCP, TCPCTL_DEBX, CTL_EOL); #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRUCT, "drop", SYSCTL_DESCR("TCP drop connection"), sysctl_net_inet_tcp_drop, 0, NULL, 0, CTL_NET, pf, IPPROTO_TCP, TCPCTL_DROP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "iss_hash", SYSCTL_DESCR("Enable RFC 1948 ISS by cryptographic " "hash computation"), NULL, 0, &tcp_do_rfc1948, sizeof(tcp_do_rfc1948), CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); /* ABC subtree */ sysctl_createv(clog, 0, NULL, &abc_node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "abc", SYSCTL_DESCR("RFC3465 Appropriate Byte Counting (ABC)"), NULL, 0, NULL, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &abc_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "enable", SYSCTL_DESCR("Enable RFC3465 Appropriate Byte Counting"), NULL, 0, &tcp_do_abc, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &abc_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "aggressive", SYSCTL_DESCR("1: L=2*SMSS 0: L=1*SMSS"), NULL, 0, &tcp_abc_aggressive, 0, CTL_CREATE, CTL_EOL); /* MSL tuning subtree */ sysctl_createv(clog, 0, NULL, &mslt_node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "mslt", SYSCTL_DESCR("MSL Tuning for TIME_WAIT truncation"), NULL, 0, NULL, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &mslt_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "enable", SYSCTL_DESCR("Enable TIME_WAIT truncation"), NULL, 0, &tcp_msl_enable, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &mslt_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 
CTLTYPE_INT, "loopback", SYSCTL_DESCR("MSL value to use for loopback connections"), NULL, 0, &tcp_msl_loop, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &mslt_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "local", SYSCTL_DESCR("MSL value to use for local connections"), NULL, 0, &tcp_msl_local, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &mslt_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "remote", SYSCTL_DESCR("MSL value to use for remote connections"), NULL, 0, &tcp_msl_remote, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &mslt_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "remote_threshold", SYSCTL_DESCR("RTT estimate value to promote local to remote"), NULL, 0, &tcp_msl_remote_threshold, 0, CTL_CREATE, CTL_EOL); /* vestigial TIME_WAIT tuning subtree */ sysctl_createv(clog, 0, NULL, &vtw_node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "vtw", SYSCTL_DESCR("Tuning for Vestigial TIME_WAIT"), NULL, 0, NULL, 0, CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &vtw_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "enable", SYSCTL_DESCR("Enable Vestigial TIME_WAIT"), sysctl_tcp_vtw_enable, 0, (pf == AF_INET) ? &tcp4_vtw_enable : &tcp6_vtw_enable, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &vtw_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, "entries", SYSCTL_DESCR("Maximum number of vestigial TIME_WAIT entries"), NULL, 0, &tcp_vtw_entries, 0, CTL_CREATE, CTL_EOL); } void tcp_usrreq_init(void) { sysctl_net_inet_tcp_setup2(NULL, PF_INET, "inet", "tcp"); #ifdef INET6 sysctl_net_inet_tcp_setup2(NULL, PF_INET6, "inet6", "tcp6"); #endif } PR_WRAP_USRREQS(tcp) #define tcp_attach tcp_attach_wrapper #define tcp_detach tcp_detach_wrapper #define tcp_accept tcp_accept_wrapper #define tcp_bind tcp_bind_wrapper #define tcp_listen tcp_listen_wrapper #define tcp_connect tcp_connect_wrapper #define tcp_connect2 tcp_connect2_wrapper #define tcp_disconnect tcp_disconnect_wrapper #define tcp_shutdown tcp_shutdown_wrapper #define tcp_abort tcp_abort_wrapper #define tcp_ioctl tcp_ioctl_wrapper #define tcp_stat tcp_stat_wrapper #define tcp_peeraddr tcp_peeraddr_wrapper #define tcp_sockaddr tcp_sockaddr_wrapper #define tcp_rcvd tcp_rcvd_wrapper #define tcp_recvoob tcp_recvoob_wrapper #define tcp_send tcp_send_wrapper #define tcp_sendoob tcp_sendoob_wrapper #define tcp_purgeif tcp_purgeif_wrapper const struct pr_usrreqs tcp_usrreqs = { .pr_attach = tcp_attach, .pr_detach = tcp_detach, .pr_accept = tcp_accept, .pr_bind = tcp_bind, .pr_listen = tcp_listen, .pr_connect = tcp_connect, .pr_connect2 = tcp_connect2, .pr_disconnect = tcp_disconnect, .pr_shutdown = tcp_shutdown, .pr_abort = tcp_abort, .pr_ioctl = tcp_ioctl, .pr_stat = tcp_stat, .pr_peeraddr = tcp_peeraddr, .pr_sockaddr = tcp_sockaddr, .pr_rcvd = tcp_rcvd, .pr_recvoob = tcp_recvoob, .pr_send = tcp_send, .pr_sendoob = tcp_sendoob, .pr_purgeif = tcp_purgeif, };
/*	$NetBSD: subr_once.c,v 1.7 2019/03/19 08:16:51 ryo Exp $	*/

/*-
 * Copyright (c)2005 YAMAMOTO Takashi,
 * Copyright (c)2008 Antti Kantee,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_once.c,v 1.7 2019/03/19 08:16:51 ryo Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/once.h>

static kmutex_t oncemtx;
static kcondvar_t oncecv;

void
once_init(void)
{

	mutex_init(&oncemtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&oncecv, "runonce");
}

int
_init_once(once_t *o, int (*fn)(void))
{
	/* Fastpath handled by RUN_ONCE() */
	int error;

	mutex_enter(&oncemtx);
	while (o->o_status == ONCE_RUNNING)
		cv_wait(&oncecv, &oncemtx);
	if (o->o_refcnt++ == 0) {
		o->o_status = ONCE_RUNNING;
		mutex_exit(&oncemtx);
		o->o_error = fn();
		mutex_enter(&oncemtx);
		o->o_status = ONCE_DONE;
		cv_broadcast(&oncecv);
	}
	KASSERT(o->o_refcnt != 0);	/* detect overflow */

	while (o->o_status == ONCE_RUNNING)
		cv_wait(&oncecv, &oncemtx);
	error = o->o_error;
	mutex_exit(&oncemtx);

	return error;
}

void
_fini_once(once_t *o, void (*fn)(void))
{

	mutex_enter(&oncemtx);
	while (o->o_status == ONCE_RUNNING)
		cv_wait(&oncecv, &oncemtx);
	KASSERT(o->o_refcnt != 0);	/* we need to call _init_once() once */
	if (--o->o_refcnt == 0) {
		o->o_status = ONCE_RUNNING;
		mutex_exit(&oncemtx);
		fn();
		mutex_enter(&oncemtx);
		o->o_status = ONCE_VIRGIN;
		cv_broadcast(&oncecv);
	}
	mutex_exit(&oncemtx);
}
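/*
 * Minimal sketch of the consumer side of this facility, as exposed by
 * ONCE_DECL()/RUN_ONCE() in <sys/once.h>: RUN_ONCE() takes the fast path
 * once o_status is ONCE_DONE and otherwise falls through to _init_once()
 * above.  The names example_once, example_init() and example_open() are
 * hypothetical placeholders, not taken from the kernel sources.
 */
#include <sys/param.h>
#include <sys/once.h>

static ONCE_DECL(example_once);

static int
example_init(void)
{

	/* One-time initialisation; return 0 or an errno value. */
	return 0;
}

int
example_open(void)
{
	int error;

	/* Safe to call from multiple LWPs; only one runs example_init(). */
	error = RUN_ONCE(&example_once, example_init);
	if (error)
		return error;
	/* ... normal open path ... */
	return 0;
}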
/* $NetBSD: proc.h,v 1.373 2023/10/04 20:52:07 ad Exp $ */ /*- * Copyright (c) 2006, 2007, 2008, 2020, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include <sys/lwp.h> #if defined(_KMEMUSER) || defined(_KERNEL) #if defined(_KERNEL_OPT) #include "opt_multiprocessor.h" #include "opt_kstack.h" #include "opt_lockdebug.h" #endif #include <machine/proc.h> /* Machine-dependent proc substruct */ #include <machine/pcb.h> #include <sys/aio.h> #include <sys/idtype.h> #include <sys/rwlock.h> #include <sys/mqueue.h> #include <sys/mutex.h> #include <sys/condvar.h> #include <sys/queue.h> #include <sys/radixtree.h> #include <sys/signalvar.h> #include <sys/siginfo.h> #include <sys/event.h> #include <sys/specificdata.h> #ifdef _KERNEL #include <sys/resourcevar.h> #else #include <sys/time.h> #include <sys/resource.h> #endif /* * One structure allocated per session. 
*/ struct session { int s_count; /* Ref cnt; pgrps in session */ u_int s_flags; #define S_LOGIN_SET 1 /* s_login set in this session */ struct proc *s_leader; /* Session leader */ struct vnode *s_ttyvp; /* Vnode of controlling terminal */ struct tty *s_ttyp; /* Controlling terminal */ char s_login[MAXLOGNAME]; /* Setlogin() name */ pid_t s_sid; /* Session ID (pid of leader) */ }; /* * One structure allocated per process group. */ struct pgrp { LIST_HEAD(, proc) pg_members; /* Pointer to pgrp members */ struct session *pg_session; /* Pointer to session */ pid_t pg_id; /* Pgrp id */ int pg_jobc; /* * Number of processes qualifying * pgrp for job control */ }; /* * Autoloadable syscall definition */ struct sc_autoload { u_int al_code; const char *al_module; }; /* * One structure allocated per emulation. */ struct exec_package; struct ras; struct kauth_cred; struct emul { const char *e_name; /* Symbolic name */ const char *e_path; /* Extra emulation path (NULL if none)*/ #ifndef __HAVE_MINIMAL_EMUL int e_flags; /* Miscellaneous flags, see above */ /* Syscall handling function */ const int *e_errno; /* Errno array */ int e_nosys; /* Offset of the nosys() syscall */ int e_nsysent; /* Number of system call entries */ #endif struct sysent *e_sysent; /* System call array */ const uint32_t *e_nomodbits; /* sys_nosys/sys_nomodule flags * for syscall_disestablish() */ const char * const *e_syscallnames; /* System call name array */ struct sc_autoload *e_sc_autoload; /* List of autoloadable syscalls */ /* Signal sending function */ void (*e_sendsig)(const struct ksiginfo *, const sigset_t *); void (*e_trapsignal)(struct lwp *, struct ksiginfo *); char *e_sigcode; /* Start of sigcode */ char *e_esigcode; /* End of sigcode */ /* Set registers before execution */ struct uvm_object **e_sigobject;/* shared sigcode object */ void (*e_setregs)(struct lwp *, struct exec_package *, vaddr_t); /* Per-process hooks */ void (*e_proc_exec)(struct proc *, struct exec_package *); void (*e_proc_fork)(struct proc *, struct lwp *, int); void (*e_proc_exit)(struct proc *); void (*e_lwp_fork)(struct lwp *, struct lwp *); void (*e_lwp_exit)(struct lwp *); #ifdef __HAVE_SYSCALL_INTERN void (*e_syscall_intern)(struct proc *); #else void (*e_syscall)(void); #endif /* Emulation specific sysctl data */ struct sysctlnode *e_sysctlovly; vaddr_t (*e_vm_default_addr)(struct proc *, vaddr_t, vsize_t, int); /* Emulation-specific hook for userspace page faults */ int (*e_usertrap)(struct lwp *, vaddr_t, void *); size_t e_ucsize; /* size of ucontext_t */ void (*e_startlwp)(void *); /* Dtrace syscall probe */ void (*e_dtrace_syscall)(uint32_t, register_t, const struct sysent *, const void *, const register_t *, int); /* Emulation specific support for ktracing signal posts */ void (*e_ktrpsig)(int, sig_t, const sigset_t *, const struct ksiginfo *); }; /* * Emulation miscellaneous flags */ #define EMUL_HAS_SYS___syscall 0x001 /* Has SYS___syscall */ /* * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressible except for those marked "(PROC ONLY)" below, * which might be addressible only on a processor on which the process * is running. 
* * Field markings and the corresponding locks: * * a: p_auxlock * k: ktrace_mutex * l: proc_lock * t: p_stmutex * p: p_lock * (: updated atomically * :: unlocked, stable */ struct vmspace; struct proc { LIST_ENTRY(proc) p_list; /* l: List of all processes */ kmutex_t *p_lock; /* :: general mutex */ kcondvar_t p_waitcv; /* p: wait, stop CV on children */ kcondvar_t p_lwpcv; /* p: wait, stop CV on LWPs */ /* Substructures: */ struct kauth_cred *p_cred; /* p: Master copy of credentials */ struct filedesc *p_fd; /* :: Ptr to open files structure */ struct cwdinfo *p_cwdi; /* :: cdir/rdir/cmask info */ struct pstats *p_stats; /* :: Accounting/stats (PROC ONLY) */ struct plimit *p_limit; /* :: Process limits */ struct vmspace *p_vmspace; /* :: Address space */ struct sigacts *p_sigacts; /* :: Process sigactions */ struct aioproc *p_aio; /* p: Asynchronous I/O data */ u_int p_mqueue_cnt; /* (: Count of open message queues */ specificdata_reference p_specdataref; /* subsystem proc-specific data */ int p_exitsig; /* l: signal to send to parent on exit */ int p_flag; /* p: PK_* flags */ int p_sflag; /* p: PS_* flags */ int p_stflag; /* t: PST_* flags */ short p_slflag; /* l, p: PSL_* flags */ char p_stat; /* l: S* process status. */ char p_lflag; /* l: PL_* flags */ char p_trace_enabled;/* p: cached by syscall_intern() */ char p_pad1[3]; /* unused */ pid_t p_pid; /* :: Process identifier. */ LIST_ENTRY(proc) p_pglist; /* l: List of processes in pgrp. */ struct proc *p_pptr; /* l: Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* l: List of sibling processes. */ LIST_HEAD(, proc) p_children; /* l: List of children. */ LIST_HEAD(, lwp) p_lwps; /* p: List of LWPs. */ struct ras *p_raslist; /* a: List of RAS entries */ /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_nlwps int p_nlwps; /* p: Number of LWPs */ int p_nzlwps; /* p: Number of zombie LWPs */ int p_nrlwps; /* p: Number running/sleeping LWPs */ int p_nlwpwait; /* p: Number of LWPs in lwp_wait1() */ int p_ndlwps; /* p: Number of detached LWPs */ u_int p_nstopchild; /* l: Count of stopped/dead children */ u_int p_waited; /* l: parent has waited on child */ struct lwp *p_zomblwp; /* p: detached LWP to be reaped */ struct lwp *p_vforklwp; /* p: parent LWP waiting at vfork() */ /* scheduling */ void *p_sched_info; /* p: Scheduler-specific structure */ fixpt_t p_estcpu; /* p: Time avg. value of p_cpticks */ fixpt_t p_estcpu_inherited; /* p: cpu inherited from children */ unsigned int p_forktime; fixpt_t p_pctcpu; /* p: %cpu from dead LWPs */ struct proc *p_opptr; /* l: save parent during ptrace. 
*/ struct ptimers *p_timers; /* Timers: real, virtual, profiling */ struct bintime p_rtime; /* p: real time */ u_quad_t p_uticks; /* t: Statclock hits in user mode */ u_quad_t p_sticks; /* t: Statclock hits in system mode */ u_quad_t p_iticks; /* t: Statclock hits processing intr */ uint64_t p_xutime; /* p: utime exposed to userspace */ uint64_t p_xstime; /* p: stime exposed to userspace */ int p_traceflag; /* k: Kernel trace points */ void *p_tracep; /* k: Trace private data */ struct vnode *p_textvp; /* :: Vnode of executable */ struct emul *p_emul; /* :: emulation information */ void *p_emuldata; /* :: per-proc emul data, or NULL */ const struct execsw *p_execsw; /* :: exec package information */ struct klist p_klist; /* p: knotes attached to proc */ LIST_HEAD(, lwp) p_sigwaiters; /* p: LWPs waiting for signals */ sigpend_t p_sigpend; /* p: pending signals */ struct lcproc *p_lwpctl; /* p, a: _lwp_ctl() information */ pid_t p_ppid; /* :: cached parent pid */ pid_t p_oppid; /* :: cached original parent pid */ char *p_path; /* :: full pathname of executable */ /* * End area that is zeroed on creation */ #define p_endzero p_startcopy /* * The following fields are all copied upon creation in fork. */ #define p_startcopy p_sigctx struct sigctx p_sigctx; /* p: Shared signal state */ u_char p_nice; /* p: Process "nice" value */ char p_comm[MAXCOMLEN+1]; /* p: basename of last exec file */ struct pgrp *p_pgrp; /* l: Pointer to process group */ vaddr_t p_psstrp; /* :: address of process's ps_strings */ u_int p_pax; /* :: PAX flags */ int p_xexit; /* p: exit code */ /* * End area that is copied on creation */ #define p_endcopy p_xsig u_short p_xsig; /* p: stop signal */ u_short p_acflag; /* p: Acc. flags; see struct lwp also */ struct mdproc p_md; /* p: Any machine-dependent fields */ vaddr_t p_stackbase; /* :: ASLR randomized stack base */ struct kdtrace_proc *p_dtrace; /* :: DTrace-specific data. */ /* * Locks in their own cache line towards the end. */ kmutex_t p_auxlock /* :: secondary, longer term lock */ __aligned(COHERENCY_UNIT); kmutex_t p_stmutex; /* :: mutex on profiling state */ krwlock_t p_reflock; /* :: lock for debugger, procfs */ }; #define p_rlimit p_limit->pl_rlimit #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #endif /* _KMEMUSER || _KERNEL */ /* * Status values. */ #define SIDL 1 /* Process being created by fork */ #define SACTIVE 2 /* Process is not stopped */ #define SDYING 3 /* About to die */ #define SSTOP 4 /* Process debugging or suspension */ #define SZOMB 5 /* Awaiting collection by parent */ #define SDEAD 6 /* Almost a zombie */ #define P_ZOMBIE(p) \ ((p)->p_stat == SZOMB || (p)->p_stat == SDYING || (p)->p_stat == SDEAD) /* * These flags are kept in p_flag and are protected by p_lock. Access from * process context only. */ #define PK_ADVLOCK 0x00000001 /* Process may hold a POSIX advisory lock */ #define PK_SYSTEM 0x00000002 /* System process (kthread) */ #define PK_SYSVSEM 0x00000004 /* Used SysV semaphores */ #define PK_SUGID 0x00000100 /* Had set id privileges since last exec */ #define PK_KMEM 0x00000200 /* Has kmem access */ #define PK_EXEC 0x00004000 /* Process called exec */ #define PK_NOCLDWAIT 0x00020000 /* No zombies if child dies */ #define PK_32 0x00040000 /* 32-bit process (used on 64-bit kernels) */ #define PK_CLDSIGIGN 0x00080000 /* Process is ignoring SIGCHLD */ #define PK_MARKER 0x80000000 /* Is a dummy marker process */ /* * These flags are kept in p_sflag and are protected by p_lock. Access from * process context only. 
*/ #define PS_NOCLDSTOP 0x00000008 /* No SIGCHLD when children stop */ #define PS_RUMP_LWPEXIT 0x00000400 /* LWPs in RUMP kernel should exit for GC */ #define PS_WCORE 0x00001000 /* Process needs to dump core */ #define PS_WEXIT 0x00002000 /* Working on exiting */ #define PS_STOPFORK 0x00800000 /* Child will be stopped on fork(2) */ #define PS_STOPEXEC 0x01000000 /* Will be stopped on exec(2) */ #define PS_STOPEXIT 0x02000000 /* Will be stopped at process exit */ #define PS_COREDUMP 0x20000000 /* Process core-dumped */ #define PS_CONTINUED 0x40000000 /* Process is continued */ #define PS_STOPPING 0x80000000 /* Transitioning SACTIVE -> SSTOP */ /* * These flags are kept in p_slflag and are protected by the proc_lock * and p_lock. Access from process context only. */ #define PSL_TRACEFORK 0x00000001 /* traced process wants fork events */ #define PSL_TRACEVFORK 0x00000002 /* traced process wants vfork events */ #define PSL_TRACEVFORK_DONE \ 0x00000004 /* traced process wants vfork done events */ #define PSL_TRACELWP_CREATE \ 0x00000008 /* traced process wants LWP create events */ #define PSL_TRACELWP_EXIT \ 0x00000010 /* traced process wants LWP exit events */ #define PSL_TRACEPOSIX_SPAWN \ 0x00000020 /* traced process wants posix_spawn events */ #define PSL_TRACED 0x00000040 /* Debugged process being traced */ #define PSL_TRACEDCHILD 0x00000080 /* Report process birth */ #define PSL_CHTRACED 0x00000100 /* Child has been traced & reparented */ #define PSL_SYSCALL 0x00000200 /* process has PT_SYSCALL enabled */ #define PSL_SYSCALLEMU 0x00000400 /* cancel in-progress syscall */ /* * Kept in p_stflag and protected by p_stmutex. */ #define PST_PROFIL 0x00000020 /* Has started profiling */ /* * Kept in p_lflag and protected by the proc_lock. Access * from process context only. */ #define PL_CONTROLT 0x00000001 /* Has a controlling terminal */ #define PL_PPWAIT 0x00000002 /* Parent is waiting for child exec/exit */ #define PL_SIGCOMPAT 0x00000004 /* Has used compat signal trampoline */ #define PL_ORPHANPG 0x00000008 /* Member of an orphaned pgrp */ #if defined(_KMEMUSER) || defined(_KERNEL) /* * Macro to compute the exit signal to be delivered. */ #define P_EXITSIG(p) \ (((p)->p_slflag & PSL_TRACED) ? SIGCHLD : p->p_exitsig) /* * Compute a wait(2) 16 bit exit status code */ #define P_WAITSTATUS(p) W_EXITCODE((p)->p_xexit, ((p)->p_xsig | \ (((p)->p_sflag & PS_COREDUMP) ? WCOREFLAG : 0))) LIST_HEAD(proclist, proc); /* A list of processes */ /* * This structure associates a proclist with its lock. */ struct proclist_desc { struct proclist *pd_list; /* The list */ /* * XXX Add a pointer to the proclist's lock eventually. */ }; #ifdef _KERNEL /* * We use process IDs <= PID_MAX until there are > 16k processes. * NO_PGID is used to represent "no process group" for a tty. */ #define PID_MAX 30000 #define NO_PGID ((pid_t)-1) #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) /* * Flags passed to fork1(). 
*/ #define FORK_PPWAIT 0x0001 /* Block parent until child exit */ #define FORK_SHAREVM 0x0002 /* Share vmspace with parent */ #define FORK_SHARECWD 0x0004 /* Share cdir/rdir/cmask */ #define FORK_SHAREFILES 0x0008 /* Share file descriptors */ #define FORK_SHARESIGS 0x0010 /* Share signal actions */ #define FORK_NOWAIT 0x0020 /* Make init the parent of the child */ #define FORK_CLEANFILES 0x0040 /* Start with a clean descriptor set */ #define FORK_SYSTEM 0x0080 /* Fork a kernel thread */ extern struct proc proc0; /* Process slot for swapper */ extern u_int nprocs; /* Current number of procs */ extern int maxproc; /* Max number of procs */ #define vmspace_kernel() (proc0.p_vmspace) extern kmutex_t proc_lock; extern struct proclist allproc; /* List of all processes */ extern struct proclist zombproc; /* List of zombie processes */ extern struct proc *initproc; /* Process slots for init, pager */ extern const struct proclist_desc proclists[]; int proc_find_locked(struct lwp *, struct proc **, pid_t); proc_t * proc_find_raw(pid_t); proc_t * proc_find(pid_t); /* Find process by ID */ proc_t * proc_find_lwpid(pid_t); /* Find process by LWP ID */ struct lwp * proc_find_lwp(proc_t *, pid_t); /* Find LWP in proc by ID */ struct lwp * proc_find_lwp_unlocked(proc_t *, pid_t); /* Find LWP, acquire proc */ struct lwp * proc_find_lwp_acquire_proc(pid_t, proc_t **); struct pgrp * pgrp_find(pid_t); /* Find process group by ID */ void procinit(void); void procinit_sysctl(void); int proc_enterpgrp(struct proc *, pid_t, pid_t, bool); void proc_leavepgrp(struct proc *); void proc_sesshold(struct session *); void proc_sessrele(struct session *); void fixjobc(struct proc *, struct pgrp *, int); int tsleep(wchan_t, pri_t, const char *, int); int mtsleep(wchan_t, pri_t, const char *, int, kmutex_t *); void wakeup(wchan_t); int kpause(const char *, bool, int, kmutex_t *); void exit1(struct lwp *, int, int) __dead; int kill1(struct lwp *l, pid_t pid, ksiginfo_t *ksi, register_t *retval); int do_sys_wait(int *, int *, int, struct rusage *); int do_sys_waitid(idtype_t, id_t, int *, int *, int, struct wrusage *, siginfo_t *); struct proc *proc_alloc(void); void proc0_init(void); pid_t proc_alloc_pid(struct proc *); void proc_free_pid(pid_t); pid_t proc_alloc_lwpid(struct proc *, struct lwp *); void proc_free_lwpid(struct proc *, pid_t); void proc_free_mem(struct proc *); void exit_lwps(struct lwp *l); int fork1(struct lwp *, int, int, void *, size_t, void (*)(void *), void *, register_t *); int pgid_in_session(struct proc *, pid_t); void cpu_lwp_fork(struct lwp *, struct lwp *, void *, size_t, void (*)(void *), void *); void cpu_lwp_free(struct lwp *, int); void cpu_lwp_free2(struct lwp *); void cpu_spawn_return(struct lwp*); #ifdef __HAVE_SYSCALL_INTERN void syscall_intern(struct proc *); #endif void md_child_return(struct lwp *); void child_return(void *); int proc_isunder(struct proc *, struct lwp *); int proc_uidmatch(kauth_cred_t, kauth_cred_t); int proc_vmspace_getref(struct proc *, struct vmspace **); void proc_crmod_leave(kauth_cred_t, kauth_cred_t, bool); void proc_crmod_enter(void); int proc_getauxv(struct proc *, void **, size_t *); int proc_specific_key_create(specificdata_key_t *, specificdata_dtor_t); void proc_specific_key_delete(specificdata_key_t); void proc_initspecific(struct proc *); void proc_finispecific(struct proc *); void * proc_getspecific(struct proc *, specificdata_key_t); void proc_setspecific(struct proc *, specificdata_key_t, void *); int proc_compare(const struct proc *, const struct lwp 
*, const struct proc *, const struct lwp *); /* * Special handlers for delivering EVFILT_PROC notifications. These * exist to handle some of the special locking considerations around * processes. */ void knote_proc_exec(struct proc *); void knote_proc_fork(struct proc *, struct proc *); void knote_proc_exit(struct proc *); int proclist_foreach_call(struct proclist *, int (*)(struct proc *, void *arg), void *); static __inline struct proc * _proclist_skipmarker(struct proc *p0) { struct proc *p = p0; while (p != NULL && p->p_flag & PK_MARKER) p = LIST_NEXT(p, p_list); return p; } #define PROC_PTRSZ(p) (((p)->p_flag & PK_32) ? sizeof(int) : sizeof(void *)) #define PROC_REGSZ(p) (((p)->p_flag & PK_32) ? \ sizeof(process_reg32) : sizeof(struct reg)) #define PROC_FPREGSZ(p) (((p)->p_flag & PK_32) ? \ sizeof(process_fpreg32) : sizeof(struct fpreg)) #define PROC_DBREGSZ(p) (((p)->p_flag & PK_32) ? \ sizeof(process_dbreg32) : sizeof(struct dbreg)) #ifndef PROC_MACHINE_ARCH #define PROC_MACHINE_ARCH(p) machine_arch #endif /* * PROCLIST_FOREACH: iterate on the given proclist, skipping PK_MARKER ones. */ #define PROCLIST_FOREACH(var, head) \ for ((var) = LIST_FIRST(head); \ ((var) = _proclist_skipmarker(var)) != NULL; \ (var) = LIST_NEXT(var, p_list)) #ifdef KSTACK_CHECK_MAGIC void kstack_setup_magic(const struct lwp *); void kstack_check_magic(const struct lwp *); #else #define kstack_setup_magic(x) #define kstack_check_magic(x) #endif extern struct emul emul_netbsd; #endif /* _KERNEL */ /* * Kernel stack parameters. * * KSTACK_LOWEST_ADDR: return the lowest address of the LWP's kernel stack, * excluding red-zone. * * KSTACK_SIZE: the size kernel stack for a LWP, excluding red-zone. * * if <machine/proc.h> provides the MD definition, it will be used. */ #ifndef KSTACK_LOWEST_ADDR #define KSTACK_LOWEST_ADDR(l) ((void *)ALIGN((struct pcb *)((l)->l_addr) + 1)) #endif #ifndef KSTACK_SIZE #define KSTACK_SIZE (USPACE - ALIGN(sizeof(struct pcb))) #endif #endif /* _KMEMUSER || _KERNEL */ #endif /* !_SYS_PROC_H_ */
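/*
 * Illustrative sketch, not part of sys/proc.h: how the lock markings
 * above are honoured in practice.  proc_lock covers the "l:" fields and
 * the allproc list, p->p_lock covers the "p:" fields, and "::" fields
 * such as p_pid are stable for the life of the process.  Assumes kernel
 * context; example_count_lwps() is a hypothetical helper, not an
 * existing kernel function.
 */
static u_int
example_count_lwps(void)
{
	struct proc *p;
	u_int total = 0;

	mutex_enter(&proc_lock);
	PROCLIST_FOREACH(p, &allproc) {		/* skips PK_MARKER entries */
		mutex_enter(p->p_lock);		/* p_nlwps is a "p:" field */
		total += p->p_nlwps;
		mutex_exit(p->p_lock);
	}
	mutex_exit(&proc_lock);

	return total;
}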
7 7 9 9 9 9 7 7 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 /* $NetBSD: prop_dictionary_util.c,v 1.9 2022/08/03 21:13:46 riastradh Exp $ */ /*- * Copyright (c) 2006, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Utility routines to make it more convenient to work with values * stored in dictionaries. * * Note: There is no special magic going on here. We use the standard * proplib(3) APIs to do all of this work. Any application could do * exactly what we're doing here. */ #include "prop_object_impl.h" /* only to hide kernel vs. 
not-kernel */ #include <prop/proplib.h> bool prop_dictionary_get_dict(prop_dictionary_t dict, const char *key, prop_dictionary_t *dp) { prop_object_t o; o = prop_dictionary_get(dict, key); if (prop_object_type(o) != PROP_TYPE_DICTIONARY) return false; *dp = o; return true; } bool prop_dictionary_get_bool(prop_dictionary_t dict, const char *key, bool *valp) { prop_bool_t b; b = prop_dictionary_get(dict, key); if (prop_object_type(b) != PROP_TYPE_BOOL) return (false); *valp = prop_bool_true(b); return (true); } bool prop_dictionary_set_bool(prop_dictionary_t dict, const char *key, bool val) { return prop_dictionary_set_and_rel(dict, key, prop_bool_create(val)); } #define TEMPLATE(name, typ) \ bool \ prop_dictionary_get_ ## name (prop_dictionary_t dict, \ const char *key, \ typ *valp) \ { \ return prop_number_ ## name ## _value( \ prop_dictionary_get(dict, key), valp); \ } TEMPLATE(schar, signed char) TEMPLATE(short, short) TEMPLATE(int, int) TEMPLATE(long, long) TEMPLATE(longlong, long long) TEMPLATE(intptr, intptr_t) TEMPLATE(int8, int8_t) TEMPLATE(int16, int16_t) TEMPLATE(int32, int32_t) TEMPLATE(int64, int64_t) TEMPLATE(uchar, unsigned char) TEMPLATE(ushort, unsigned short) TEMPLATE(uint, unsigned int) TEMPLATE(ulong, unsigned long) TEMPLATE(ulonglong, unsigned long long) TEMPLATE(uintptr, uintptr_t) TEMPLATE(uint8, uint8_t) TEMPLATE(uint16, uint16_t) TEMPLATE(uint32, uint32_t) TEMPLATE(uint64, uint64_t) #undef TEMPLATE static bool prop_dictionary_set_signed_number(prop_dictionary_t dict, const char *key, intmax_t val) { return prop_dictionary_set_and_rel(dict, key, prop_number_create_signed(val)); } static bool prop_dictionary_set_unsigned_number(prop_dictionary_t dict, const char *key, uintmax_t val) { /*LINTED: for conversion from 'long long' to 'long'*/ \ return prop_dictionary_set_and_rel(dict, key, prop_number_create_unsigned(val)); } #define TEMPLATE(name, which, typ) \ bool \ prop_dictionary_set_ ## name (prop_dictionary_t dict, \ const char *key, \ typ val) \ { \ /*LINTED: for conversion from long long to 'long'*/ \ return prop_dictionary_set_ ## which ## _number(dict, key, val);\ } #define STEMPLATE(name, typ) TEMPLATE(name, signed, typ) #define UTEMPLATE(name, typ) TEMPLATE(name, unsigned, typ) STEMPLATE(schar, signed char) STEMPLATE(short, short) STEMPLATE(int, int) STEMPLATE(long, long) STEMPLATE(longlong, long long) STEMPLATE(intptr, intptr_t) STEMPLATE(int8, int8_t) STEMPLATE(int16, int16_t) STEMPLATE(int32, int32_t) STEMPLATE(int64, int64_t) UTEMPLATE(uchar, unsigned char) UTEMPLATE(ushort, unsigned short) UTEMPLATE(uint, unsigned int) UTEMPLATE(ulong, unsigned long) UTEMPLATE(ulonglong, unsigned long long) UTEMPLATE(uintptr, uintptr_t) UTEMPLATE(uint8, uint8_t) UTEMPLATE(uint16, uint16_t) UTEMPLATE(uint32, uint32_t) UTEMPLATE(uint64, uint64_t) #undef STEMPLATE #undef UTEMPLATE #undef TEMPLATE bool prop_dictionary_get_string(prop_dictionary_t dict, const char *key, const char **cpp) { prop_string_t str; const char *cp; str = prop_dictionary_get(dict, key); if (prop_object_type(str) != PROP_TYPE_STRING) return (false); cp = prop_string_value(str); if (cp == NULL) return (false); *cpp = cp; return (true); } bool prop_dictionary_set_string(prop_dictionary_t dict, const char *key, const char *cp) { return prop_dictionary_set_and_rel(dict, key, prop_string_create_copy(cp)); } bool prop_dictionary_set_string_nocopy(prop_dictionary_t dict, const char *key, const char *cp) { return prop_dictionary_set_and_rel(dict, key, prop_string_create_nocopy(cp)); } bool 
prop_dictionary_get_data(prop_dictionary_t dict, const char *key, const void **vp, size_t *sizep) { prop_data_t data; const void *v; data = prop_dictionary_get(dict, key); if (prop_object_type(data) != PROP_TYPE_DATA) return (false); v = prop_data_value(data); if (v == NULL) return (false); *vp = v; if (sizep != NULL) *sizep = prop_data_size(data); return (true); } bool prop_dictionary_set_data(prop_dictionary_t dict, const char *key, const void *v, size_t size) { return prop_dictionary_set_and_rel(dict, key, prop_data_create_copy(v, size)); } bool prop_dictionary_set_data_nocopy(prop_dictionary_t dict, const char *key, const void *v, size_t size) { return prop_dictionary_set_and_rel(dict, key, prop_data_create_nocopy(v, size)); } _PROP_DEPRECATED(prop_dictionary_get_cstring, "this program uses prop_dictionary_get_cstring(), " "which is deprecated; use prop_dictionary_get_string() and copy instead.") bool prop_dictionary_get_cstring(prop_dictionary_t dict, const char *key, char **cpp) { prop_string_t str; char *cp; size_t len; bool rv; str = prop_dictionary_get(dict, key); if (prop_object_type(str) != PROP_TYPE_STRING) return (false); len = prop_string_size(str); cp = _PROP_MALLOC(len + 1, M_TEMP); if (cp == NULL) return (false); rv = prop_string_copy_value(str, cp, len + 1); if (rv) *cpp = cp; else _PROP_FREE(cp, M_TEMP); return (rv); } _PROP_DEPRECATED(prop_string_get_cstring_nocopy, "this program uses prop_string_get_cstring_nocopy(), " "which is deprecated; use prop_dictionary_get_string() instead.") bool prop_dictionary_get_cstring_nocopy(prop_dictionary_t dict, const char *key, const char **cpp) { return prop_dictionary_get_string(dict, key, cpp); } _PROP_DEPRECATED(prop_dictionary_set_cstring, "this program uses prop_dictionary_set_cstring(), " "which is deprecated; use prop_dictionary_set_string() instead.") bool prop_dictionary_set_cstring(prop_dictionary_t dict, const char *key, const char *cp) { return prop_dictionary_set_string(dict, key, cp); } _PROP_DEPRECATED(prop_dictionary_set_cstring_nocopy, "this program uses prop_dictionary_set_cstring_nocopy(), " "which is deprecated; use prop_dictionary_set_string_nocopy() instead.") bool prop_dictionary_set_cstring_nocopy(prop_dictionary_t dict, const char *key, const char *cp) { return prop_dictionary_set_string_nocopy(dict, key, cp); } bool prop_dictionary_set_and_rel(prop_dictionary_t dict, const char *key, prop_object_t po) { bool rv; if (po == NULL) return false; rv = prop_dictionary_set(dict, key, po); prop_object_release(po); return rv; }
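/*
 * Illustrative userland sketch, not part of the library source:
 * exercising the typed dictionary accessors defined above.  On NetBSD
 * this builds against proplib (link with -lprop); the key names and
 * values are arbitrary examples.
 */
#include <prop/proplib.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	prop_dictionary_t d = prop_dictionary_create();
	bool on = false;
	uint32_t n = 0;
	const char *name = NULL;

	prop_dictionary_set_bool(d, "enabled", true);
	prop_dictionary_set_uint32(d, "count", 42);
	prop_dictionary_set_string(d, "name", "example");

	if (prop_dictionary_get_bool(d, "enabled", &on) &&
	    prop_dictionary_get_uint32(d, "count", &n) &&
	    prop_dictionary_get_string(d, "name", &name))
		printf("enabled=%d count=%u name=%s\n", on, n, name);

	prop_object_release(d);
	return 0;
}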
9 9 9 8 8 8 3 8 9 9 1 9 9 9 9 9 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 /* $NetBSD: prop_string.c,v 1.18 2023/11/17 21:29:33 thorpej Exp $ */ /*- * Copyright (c) 2006, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "prop_object_impl.h" #include <prop/prop_string.h> #include <sys/rbtree.h> #if defined(_KERNEL) || defined(_STANDALONE) #include <sys/stdarg.h> #else #include <stdarg.h> #endif /* _KERNEL || _STANDALONE */ struct _prop_string { struct _prop_object ps_obj; union { char * psu_mutable; const char * psu_immutable; } ps_un; #define ps_mutable ps_un.psu_mutable #define ps_immutable ps_un.psu_immutable size_t ps_size; /* not including \0 */ struct rb_node ps_link; int ps_flags; }; #define PS_F_NOCOPY 0x01 #define PS_F_MUTABLE 0x02 _PROP_POOL_INIT(_prop_string_pool, sizeof(struct _prop_string), "propstng") _PROP_MALLOC_DEFINE(M_PROP_STRING, "prop string", "property string container object") static _prop_object_free_rv_t _prop_string_free(prop_stack_t, prop_object_t *); static bool _prop_string_externalize( struct _prop_object_externalize_context *, void *); static _prop_object_equals_rv_t _prop_string_equals(prop_object_t, prop_object_t, void **, void **, prop_object_t *, prop_object_t *); static const struct _prop_object_type _prop_object_type_string = { .pot_type = PROP_TYPE_STRING, .pot_free = _prop_string_free, .pot_extern = _prop_string_externalize, .pot_equals = _prop_string_equals, }; #define prop_object_is_string(x) \ ((x) != NULL && (x)->ps_obj.po_type == &_prop_object_type_string) #define prop_string_contents(x) ((x)->ps_immutable ? (x)->ps_immutable : "") /* * In order to reduce memory usage, all immutable string objects are * de-duplicated. 
*/ static int /*ARGSUSED*/ _prop_string_rb_compare_nodes(void *ctx _PROP_ARG_UNUSED, const void *n1, const void *n2) { const struct _prop_string * const ps1 = n1; const struct _prop_string * const ps2 = n2; _PROP_ASSERT(ps1->ps_immutable != NULL); _PROP_ASSERT(ps2->ps_immutable != NULL); return strcmp(ps1->ps_immutable, ps2->ps_immutable); } static int /*ARGSUSED*/ _prop_string_rb_compare_key(void *ctx _PROP_ARG_UNUSED, const void *n, const void *v) { const struct _prop_string * const ps = n; const char * const cp = v; _PROP_ASSERT(ps->ps_immutable != NULL); return strcmp(ps->ps_immutable, cp); } static const rb_tree_ops_t _prop_string_rb_tree_ops = { .rbto_compare_nodes = _prop_string_rb_compare_nodes, .rbto_compare_key = _prop_string_rb_compare_key, .rbto_node_offset = offsetof(struct _prop_string, ps_link), .rbto_context = NULL }; static struct rb_tree _prop_string_tree; _PROP_ONCE_DECL(_prop_string_init_once) _PROP_MUTEX_DECL_STATIC(_prop_string_tree_mutex) static int _prop_string_init(void) { _PROP_MUTEX_INIT(_prop_string_tree_mutex); rb_tree_init(&_prop_string_tree, &_prop_string_rb_tree_ops); return 0; } /* ARGSUSED */ static _prop_object_free_rv_t _prop_string_free(prop_stack_t stack, prop_object_t *obj) { prop_string_t ps = *obj; if ((ps->ps_flags & PS_F_MUTABLE) == 0) { _PROP_MUTEX_LOCK(_prop_string_tree_mutex); /* * Double-check the retain count now that we've * acquired the tree lock; holding this lock prevents * new retains from coming in by finding it in the * tree. */ if (_PROP_ATOMIC_LOAD(&ps->ps_obj.po_refcnt) == 0) rb_tree_remove_node(&_prop_string_tree, ps); else ps = NULL; _PROP_MUTEX_UNLOCK(_prop_string_tree_mutex); if (ps == NULL) return (_PROP_OBJECT_FREE_DONE); } if ((ps->ps_flags & PS_F_NOCOPY) == 0 && ps->ps_mutable != NULL) _PROP_FREE(ps->ps_mutable, M_PROP_STRING); _PROP_POOL_PUT(_prop_string_pool, ps); return (_PROP_OBJECT_FREE_DONE); } static bool _prop_string_externalize(struct _prop_object_externalize_context *ctx, void *v) { prop_string_t ps = v; if (ps->ps_size == 0) return (_prop_object_externalize_empty_tag(ctx, "string")); if (_prop_object_externalize_start_tag(ctx, "string") == false || _prop_object_externalize_append_encoded_cstring(ctx, ps->ps_immutable) == false || _prop_object_externalize_end_tag(ctx, "string") == false) return (false); return (true); } /* ARGSUSED */ static _prop_object_equals_rv_t _prop_string_equals(prop_object_t v1, prop_object_t v2, void **stored_pointer1, void **stored_pointer2, prop_object_t *next_obj1, prop_object_t *next_obj2) { prop_string_t str1 = v1; prop_string_t str2 = v2; if (str1 == str2) return (_PROP_OBJECT_EQUALS_TRUE); if (str1->ps_size != str2->ps_size) return (_PROP_OBJECT_EQUALS_FALSE); if (strcmp(prop_string_contents(str1), prop_string_contents(str2))) return (_PROP_OBJECT_EQUALS_FALSE); else return (_PROP_OBJECT_EQUALS_TRUE); } static prop_string_t _prop_string_alloc(int const flags) { prop_string_t ps; ps = _PROP_POOL_GET(_prop_string_pool); if (ps != NULL) { _prop_object_init(&ps->ps_obj, &_prop_object_type_string); ps->ps_mutable = NULL; ps->ps_size = 0; ps->ps_flags = flags; } return (ps); } static prop_string_t _prop_string_instantiate(int const flags, const char * const str, size_t const len) { prop_string_t ps; _PROP_ONCE_RUN(_prop_string_init_once, _prop_string_init); ps = _prop_string_alloc(flags); if (ps != NULL) { ps->ps_immutable = str; ps->ps_size = len; if ((flags & PS_F_MUTABLE) == 0) { prop_string_t ops; _PROP_MUTEX_LOCK(_prop_string_tree_mutex); ops = rb_tree_insert_node(&_prop_string_tree, 
ps); if (ops != ps) { /* * Equivalent string object already exist; * free the new one and return a reference * to the existing object. */ prop_object_retain(ops); _PROP_MUTEX_UNLOCK(_prop_string_tree_mutex); if ((flags & PS_F_NOCOPY) == 0) { _PROP_FREE(ps->ps_mutable, M_PROP_STRING); } _PROP_POOL_PUT(_prop_string_pool, ps); ps = ops; } else { _PROP_MUTEX_UNLOCK(_prop_string_tree_mutex); } } } else if ((flags & PS_F_NOCOPY) == 0) { _PROP_FREE(__UNCONST(str), M_PROP_STRING); } return (ps); } _PROP_DEPRECATED(prop_string_create, "this program uses prop_string_create(); all functions " "supporting mutable prop_strings are deprecated.") prop_string_t prop_string_create(void) { return (_prop_string_alloc(PS_F_MUTABLE)); } _PROP_DEPRECATED(prop_string_create_cstring, "this program uses prop_string_create_cstring(); all functions " "supporting mutable prop_strings are deprecated.") prop_string_t prop_string_create_cstring(const char *str) { prop_string_t ps; char *cp; size_t len; _PROP_ASSERT(str != NULL); ps = _prop_string_alloc(PS_F_MUTABLE); if (ps != NULL) { len = strlen(str); cp = _PROP_MALLOC(len + 1, M_PROP_STRING); if (cp == NULL) { prop_object_release(ps); return (NULL); } strcpy(cp, str); ps->ps_mutable = cp; ps->ps_size = len; } return (ps); } _PROP_DEPRECATED(prop_string_create_cstring_nocopy, "this program uses prop_string_create_cstring_nocopy(), " "which is deprecated; use prop_string_create_nocopy() instead.") prop_string_t prop_string_create_cstring_nocopy(const char *str) { return prop_string_create_nocopy(str); } /* * prop_string_create_format -- * Create a string object using the provided format string. */ prop_string_t __printflike(1, 2) prop_string_create_format(const char *fmt, ...) { char *str = NULL; int len; size_t nlen; va_list ap; _PROP_ASSERT(fmt != NULL); va_start(ap, fmt); len = vsnprintf(NULL, 0, fmt, ap); va_end(ap); if (len < 0) return (NULL); nlen = len + 1; str = _PROP_MALLOC(nlen, M_PROP_STRING); if (str == NULL) return (NULL); va_start(ap, fmt); vsnprintf(str, nlen, fmt, ap); va_end(ap); return _prop_string_instantiate(0, str, (size_t)len); } /* * prop_string_create_copy -- * Create a string object by coping the provided constant string. */ prop_string_t prop_string_create_copy(const char *str) { return prop_string_create_format("%s", str); } /* * prop_string_create_nocopy -- * Create a string object using the provided external constant * string. */ prop_string_t prop_string_create_nocopy(const char *str) { _PROP_ASSERT(str != NULL); return _prop_string_instantiate(PS_F_NOCOPY, str, strlen(str)); } /* * prop_string_copy -- * Copy a string. This reduces to a retain in the common case. * Deprecated mutable string objects must be copied. */ prop_string_t prop_string_copy(prop_string_t ops) { char *cp; if (! prop_object_is_string(ops)) return (NULL); if ((ops->ps_flags & PS_F_MUTABLE) == 0) { prop_object_retain(ops); return (ops); } cp = _PROP_MALLOC(ops->ps_size + 1, M_PROP_STRING); if (cp == NULL) return NULL; strcpy(cp, prop_string_contents(ops)); return _prop_string_instantiate(PS_F_MUTABLE, cp, ops->ps_size); } _PROP_DEPRECATED(prop_string_copy_mutable, "this program uses prop_string_copy_mutable(); all functions " "supporting mutable prop_strings are deprecated.") prop_string_t prop_string_copy_mutable(prop_string_t ops) { char *cp; if (! 
prop_object_is_string(ops)) return (NULL); cp = _PROP_MALLOC(ops->ps_size + 1, M_PROP_STRING); if (cp == NULL) return NULL; strcpy(cp, prop_string_contents(ops)); return _prop_string_instantiate(PS_F_MUTABLE, cp, ops->ps_size); } /* * prop_string_size -- * Return the size of the string, not including the terminating NUL. */ size_t prop_string_size(prop_string_t ps) { if (! prop_object_is_string(ps)) return (0); return (ps->ps_size); } /* * prop_string_value -- * Returns a pointer to the string object's value. This pointer * remains valid only as long as the string object. */ const char * prop_string_value(prop_string_t ps) { if (! prop_object_is_string(ps)) return (NULL); if ((ps->ps_flags & PS_F_MUTABLE) == 0) return (ps->ps_immutable); return (prop_string_contents(ps)); } /* * prop_string_copy_value -- * Copy the string object's value into the supplied buffer. */ bool prop_string_copy_value(prop_string_t ps, void *buf, size_t buflen) { if (! prop_object_is_string(ps)) return (false); if (buf == NULL || buflen < ps->ps_size + 1) return (false); strcpy(buf, prop_string_contents(ps)); return (true); } _PROP_DEPRECATED(prop_string_mutable, "this program uses prop_string_mutable(); all functions " "supporting mutable prop_strings are deprecated.") bool prop_string_mutable(prop_string_t ps) { if (! prop_object_is_string(ps)) return (false); return ((ps->ps_flags & PS_F_MUTABLE) != 0); } _PROP_DEPRECATED(prop_string_cstring, "this program uses prop_string_cstring(), " "which is deprecated; use prop_string_copy_value() instead.") char * prop_string_cstring(prop_string_t ps) { char *cp; if (! prop_object_is_string(ps)) return (NULL); cp = _PROP_MALLOC(ps->ps_size + 1, M_TEMP); if (cp != NULL) strcpy(cp, prop_string_contents(ps)); return (cp); } _PROP_DEPRECATED(prop_string_cstring_nocopy, "this program uses prop_string_cstring_nocopy(), " "which is deprecated; use prop_string_value() instead.") const char * prop_string_cstring_nocopy(prop_string_t ps) { if (! prop_object_is_string(ps)) return (NULL); return (prop_string_contents(ps)); } _PROP_DEPRECATED(prop_string_append, "this program uses prop_string_append(); all functions " "supporting mutable prop_strings are deprecated.") bool prop_string_append(prop_string_t dst, prop_string_t src) { char *ocp, *cp; size_t len; if (! (prop_object_is_string(dst) && prop_object_is_string(src))) return (false); if ((dst->ps_flags & PS_F_MUTABLE) == 0) return (false); len = dst->ps_size + src->ps_size; cp = _PROP_MALLOC(len + 1, M_PROP_STRING); if (cp == NULL) return (false); snprintf(cp, len + 1, "%s%s", prop_string_contents(dst), prop_string_contents(src)); ocp = dst->ps_mutable; dst->ps_mutable = cp; dst->ps_size = len; if (ocp != NULL) _PROP_FREE(ocp, M_PROP_STRING); return (true); } _PROP_DEPRECATED(prop_string_append_cstring, "this program uses prop_string_append_cstring(); all functions " "supporting mutable prop_strings are deprecated.") bool prop_string_append_cstring(prop_string_t dst, const char *src) { char *ocp, *cp; size_t len; if (! prop_object_is_string(dst)) return (false); _PROP_ASSERT(src != NULL); if ((dst->ps_flags & PS_F_MUTABLE) == 0) return (false); len = dst->ps_size + strlen(src); cp = _PROP_MALLOC(len + 1, M_PROP_STRING); if (cp == NULL) return (false); snprintf(cp, len + 1, "%s%s", prop_string_contents(dst), src); ocp = dst->ps_mutable; dst->ps_mutable = cp; dst->ps_size = len; if (ocp != NULL) _PROP_FREE(ocp, M_PROP_STRING); return (true); } /* * prop_string_equals -- * Return true if two strings are equivalent. 
*/ bool prop_string_equals(prop_string_t str1, prop_string_t str2) { if (!prop_object_is_string(str1) || !prop_object_is_string(str2)) return (false); return prop_object_equals(str1, str2); } /* * prop_string_equals_string -- * Return true if the string object is equivalent to the specified * C string. */ bool prop_string_equals_string(prop_string_t ps, const char *cp) { if (! prop_object_is_string(ps)) return (false); return (strcmp(prop_string_contents(ps), cp) == 0); } _PROP_DEPRECATED(prop_string_equals_cstring, "this program uses prop_string_equals_cstring(), " "which is deprecated; prop_string_equals_string() instead.") bool prop_string_equals_cstring(prop_string_t ps, const char *cp) { return prop_string_equals_string(ps, cp); } /* * prop_string_compare -- * Compare two string objects, using strcmp() semantics. */ int prop_string_compare(prop_string_t ps1, prop_string_t ps2) { if (!prop_object_is_string(ps1) || !prop_object_is_string(ps2)) return (-666); /* arbitrary */ return (strcmp(prop_string_contents(ps1), prop_string_contents(ps2))); } /* * prop_string_compare_string -- * Compare a string object to the specified C string, using * strcmp() semantics. */ int prop_string_compare_string(prop_string_t ps, const char *cp) { if (!prop_object_is_string(ps)) return (-666); /* arbitrary */ return (strcmp(prop_string_contents(ps), cp)); } /* * _prop_string_internalize -- * Parse a <string>...</string> and return the object created from the * external representation. */ /* ARGSUSED */ bool _prop_string_internalize(prop_stack_t stack, prop_object_t *obj, struct _prop_object_internalize_context *ctx) { char *str; size_t len, alen; if (ctx->poic_is_empty_element) { *obj = prop_string_create(); return (true); } /* No attributes recognized here. */ if (ctx->poic_tagattr != NULL) return (true); /* Compute the length of the result. */ if (_prop_object_internalize_decode_string(ctx, NULL, 0, &len, NULL) == false) return (true); str = _PROP_MALLOC(len + 1, M_PROP_STRING); if (str == NULL) return (true); if (_prop_object_internalize_decode_string(ctx, str, len, &alen, &ctx->poic_cp) == false || alen != len) { _PROP_FREE(str, M_PROP_STRING); return (true); } str[len] = '\0'; if (_prop_object_internalize_find_tag(ctx, "string", _PROP_TAG_TYPE_END) == false) { _PROP_FREE(str, M_PROP_STRING); return (true); } *obj = _prop_string_instantiate(0, str, len); return (true); }
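/*
 * Illustrative userland sketch, not part of the library source: because
 * immutable prop_strings are de-duplicated through the red-black tree
 * above, creating the same text twice yields references to one shared
 * object.  The pointer comparison below demonstrates that
 * implementation detail; it is not an API guarantee.
 */
#include <prop/proplib.h>
#include <stdio.h>

int
main(void)
{
	prop_string_t a = prop_string_create_copy("hello");
	prop_string_t b = prop_string_create_copy("hello");

	printf("same object: %s\n", a == b ? "yes" : "no");
	printf("equal value: %s\n",
	    prop_string_equals(a, b) ? "yes" : "no");
	printf("value: %s (size %zu)\n",
	    prop_string_value(a), prop_string_size(a));

	prop_object_release(a);
	prop_object_release(b);
	return 0;
}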
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 /* $NetBSD: ccd_60.c,v 1.11 2019/12/12 02:15:42 pgoyette Exp $ */ /*- * Copyright (c) 2018 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ccd_60.c,v 1.11 2019/12/12 02:15:42 pgoyette Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/disk.h> #include <sys/lwp.h> #include <sys/compat_stub.h> #include <dev/ccdvar.h> #include <compat/sys/ccdvar.h> /* * Compat code must not be called if on a platform where * sizeof (size_t) == sizeof (uint64_t) as CCDIOCSET will * be the same as CCDIOCSET_60 */ static int compat_60_ccdioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l, int (*f)(dev_t, u_long, void *, int, struct lwp *)) { switch (cmd) { #ifdef CCDIOCSET_60 case CCDIOCSET_60: { if (data == NULL) return 0; struct ccd_ioctl ccio; struct ccd_ioctl_60 *ccio60 = data; ccio.ccio_disks = ccio60->ccio_disks; ccio.ccio_ndisks = ccio60->ccio_ndisks; ccio.ccio_ileave = ccio60->ccio_ileave; ccio.ccio_flags = ccio60->ccio_flags; ccio.ccio_unit = ccio60->ccio_unit; int error = (*f)(dev, CCDIOCSET, &ccio, flag, l); if (!error) { /* Copy data back, adjust types if necessary */ ccio60->ccio_disks = ccio.ccio_disks; ccio60->ccio_ndisks = ccio.ccio_ndisks; ccio60->ccio_ileave = ccio.ccio_ileave; ccio60->ccio_flags = ccio.ccio_flags; ccio60->ccio_unit = ccio.ccio_unit; ccio60->ccio_size = (size_t)ccio.ccio_size; } return error; } case CCDIOCCLR_60: if (data == NULL) return ENOSYS; /* * ccio_size member not used, so existing struct OK * drop through to existing non-compat version */ return (*f)(dev, CCDIOCCLR, data, flag, l); #endif default: return ENOSYS; } } void ccd_60_init(void) { MODULE_HOOK_SET(ccd_ioctl_60_hook, compat_60_ccdioctl); } void ccd_60_fini(void) { MODULE_HOOK_UNSET(ccd_ioctl_60_hook); }
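/*
 * Illustrative sketch of why the compat hook above is needed: BSD ioctl
 * command values encode the size of the argument structure, so when
 * struct ccd_ioctl's ccio_size member grew, CCDIOCSET changed value and
 * old binaries still issue the old command.  The structures and command
 * numbers below are simplified stand-ins, not the real ccd definitions.
 */
#include <sys/ioccom.h>
#include <stdio.h>

struct old_ioc { unsigned long size; };		/* stand-in for ccd_ioctl_60 */
struct new_ioc { unsigned long long size; };	/* stand-in for ccd_ioctl */

#define EXAMPLEIOC_OLD	_IOWR('F', 0, struct old_ioc)
#define EXAMPLEIOC_NEW	_IOWR('F', 0, struct new_ioc)

int
main(void)
{
	/* On ILP32 the two commands differ; on LP64 they coincide. */
	printf("old=%#lx new=%#lx\n",
	    (unsigned long)EXAMPLEIOC_OLD, (unsigned long)EXAMPLEIOC_NEW);
	return 0;
}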
915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 /* $NetBSD: if_sl.c,v 1.136 2022/10/26 23:42:42 riastradh Exp $ */ /* * Copyright (c) 1987, 1989, 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_sl.c 8.9 (Berkeley) 1/9/95 */ /* * Serial Line interface * * Rick Adams * Center for Seismic Studies * 1300 N 17th Street, Suite 1450 * Arlington, Virginia 22209 * (703)276-7900 * rick@seismo.ARPA * seismo!rick * * Pounded on heavily by Chris Torek (chris@mimsy.umd.edu, umcp-cs!chris). * N.B.: this belongs in netinet, not net, the way it stands now. * Should have a link-layer type designation, but wouldn't be * backwards-compatible. * * Converted to 4.3BSD Beta by Chris Torek. * Other changes made at Berkeley, based in part on code by Kirk Smith. * W. Jolitz added slip abort. * * Hacked almost beyond recognition by Van Jacobson (van@helios.ee.lbl.gov). * Added priority queuing for "interactive" traffic; hooks for TCP * header compression; ICMP filtering (at 2400 baud, some cretin * pinging you can use up all your bandwidth). Made low clist behavior * more robust and slightly less likely to hang serial line. * Sped up a bunch of things. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: if_sl.c,v 1.136 2022/10/26 23:42:42 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #endif #include <sys/param.h> #include <sys/proc.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/buf.h> #include <sys/dkstat.h> #include <sys/socket.h> #include <sys/ioctl.h> #include <sys/file.h> #include <sys/conf.h> #include <sys/tty.h> #include <sys/kernel.h> #include <sys/socketvar.h> #if __NetBSD__ #include <sys/systm.h> #include <sys/kauth.h> #endif #include <sys/cpu.h> #include <sys/intr.h> #include <sys/device.h> #include <sys/module.h> #include <net/if.h> #include <net/if_types.h> #include <net/route.h> #ifdef INET #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> #endif #include <net/slcompress.h> #include <net/if_slvar.h> #include <net/slip.h> #include <net/ppp_defs.h> #include <net/if_ppp.h> #include <sys/time.h> #include <net/bpf.h> #include "ioconf.h" /* * SLMAX is a hard limit on input packet size. To simplify the code * and improve performance, we require that packets fit in an mbuf * cluster, and if we get a compressed packet, there's enough extra * room to expand the header into a max length tcp/ip header (128 * bytes). So, SLMAX can be at most * MCLBYTES - 128 * * SLMTU is a hard limit on output packet size. To insure good * interactive response, SLMTU wants to be the smallest size that * amortizes the header cost. (Remember that even with * type-of-service queuing, we have to wait for any in-progress * packet to finish. I.e., we wait, on the average, 1/2 * mtu / * cps, where cps is the line speed in characters per second. * E.g., 533ms wait for a 1024 byte MTU on a 9600 baud line. The * average compressed header size is 6-8 bytes so any MTU > 90 * bytes will give us 90% of the line bandwidth. A 100ms wait is * tolerable (500ms is not), so want an MTU around 296. (Since TCP * will send 256 byte segments (to allow for 40 byte headers), the * typical packet size on the wire will be around 260 bytes). In * 4.3tahoe+ systems, we can set an MTU in a route so we do that & * leave the interface MTU relatively high (so we don't IP fragment * when acting as a gateway to someone using a stupid MTU). * * Similar considerations apply to SLIP_HIWAT: It's the amount of * data that will be queued 'downstream' of us (i.e., in clists * waiting to be picked up by the tty output interrupt). If we * queue a lot of data downstream, it's immune to our t.o.s. queuing. * E.g., if SLIP_HIWAT is 1024, the interactive traffic in mixed * telnet/ftp will see a 1 sec wait, independent of the mtu (the * wait is dependent on the ftp window size but that's typically * 1k - 4k). So, we want SLIP_HIWAT just big enough to amortize * the cost (in idle time on the wire) of the tty driver running * off the end of its clists & having to call back slstart for a * new packet. For a tty interface with any buffering at all, this * cost will be zero. Even with a totally brain dead interface (like * the one on a typical workstation), the cost will be <= 1 character * time. So, setting SLIP_HIWAT to ~100 guarantees that we'll lose * at most 1% while maintaining good interactive response. */ #define BUFOFFSET (128+sizeof(struct ifnet **)+SLIP_HDRLEN) #define SLMAX (MCLBYTES - BUFOFFSET) #define SLBUFSIZE (SLMAX + BUFOFFSET) #ifndef SLMTU #define SLMTU 296 #endif #if (SLMTU < 3) #error SLMTU way too small. 
#endif #define SLIP_HIWAT roundup(50, TTROUND) #ifndef __NetBSD__ /* XXX - cgd */ #define CLISTRESERVE 1024 /* Can't let clists get too low */ #endif /* !__NetBSD__ */ /* * SLIP ABORT ESCAPE MECHANISM: * (inspired by HAYES modem escape arrangement) * 1sec escape 1sec escape 1sec escape { 1sec escape 1sec escape } * within window time signals a "soft" exit from slip mode by remote end * if the IFF_DEBUG flag is on. */ #define ABT_ESC '\033' /* can't be t_intr - distant host must know it*/ #define ABT_IDLE 1 /* in seconds - idle before an escape */ #define ABT_COUNT 3 /* count of escapes for abort */ #define ABT_WINDOW (ABT_COUNT*2+2) /* in seconds - time to count */ static int sl_clone_create(struct if_clone *, int); static int sl_clone_destroy(struct ifnet *); static LIST_HEAD(, sl_softc) sl_softc_list; struct if_clone sl_cloner = IF_CLONE_INITIALIZER("sl", sl_clone_create, sl_clone_destroy); #define FRAME_END 0xc0 /* Frame End */ #define FRAME_ESCAPE 0xdb /* Frame Esc */ #define TRANS_FRAME_END 0xdc /* transposed frame end */ #define TRANS_FRAME_ESCAPE 0xdd /* transposed frame esc */ static void slintr(void *); static int slcreate(struct sl_softc *); static struct mbuf *sl_btom(struct sl_softc *, int); static int slclose(struct tty *, int); static int slinput(int, struct tty *); static int slioctl(struct ifnet *, u_long, void *); static int slopen(dev_t, struct tty *); static int sloutput(struct ifnet *, struct mbuf *, const struct sockaddr *, const struct rtentry *); static int slstart(struct tty *); static int sltioctl(struct tty *, u_long, void *, int, struct lwp *); static struct linesw slip_disc = { .l_name = "slip", .l_open = slopen, .l_close = slclose, .l_read = ttyerrio, .l_write = ttyerrio, .l_ioctl = sltioctl, .l_rint = slinput, .l_start = slstart, .l_modem = nullmodem, .l_poll = ttyerrpoll }; void slattach(int n __unused) { /* * Nothing to do here, initialization is handled by the * module initialization code in slinit() below). 
*/ } static void slinit(void) { if (ttyldisc_attach(&slip_disc) != 0) panic("%s", __func__); LIST_INIT(&sl_softc_list); if_clone_attach(&sl_cloner); } static int sldetach(void) { int error = 0; if (!LIST_EMPTY(&sl_softc_list)) error = EBUSY; if (error == 0) error = ttyldisc_detach(&slip_disc); if (error == 0) if_clone_detach(&sl_cloner); return error; } static int sl_clone_create(struct if_clone *ifc, int unit) { struct sl_softc *sc; sc = malloc(sizeof(*sc), M_DEVBUF, M_WAIT|M_ZERO); sc->sc_unit = unit; if_initname(&sc->sc_if, ifc->ifc_name, unit); sc->sc_if.if_softc = sc; sc->sc_if.if_mtu = SLMTU; sc->sc_if.if_flags = IFF_POINTOPOINT | SC_AUTOCOMP | IFF_MULTICAST; sc->sc_if.if_type = IFT_SLIP; sc->sc_if.if_ioctl = slioctl; sc->sc_if.if_output = sloutput; sc->sc_if.if_dlt = DLT_SLIP; IFQ_SET_MAXLEN(&sc->sc_fastq, 32); IFQ_LOCK_INIT(&sc->sc_fastq); IFQ_SET_READY(&sc->sc_if.if_snd); if_attach(&sc->sc_if); if_alloc_sadl(&sc->sc_if); bpf_attach(&sc->sc_if, DLT_SLIP, SLIP_HDRLEN); LIST_INSERT_HEAD(&sl_softc_list, sc, sc_iflist); return 0; } static int sl_clone_destroy(struct ifnet *ifp) { struct sl_softc *sc = (struct sl_softc *)ifp->if_softc; if (sc->sc_ttyp != NULL) return EBUSY; /* Not removing it */ LIST_REMOVE(sc, sc_iflist); bpf_detach(ifp); if_detach(ifp); IFQ_LOCK_DESTROY(&sc->sc_fastq); free(sc, M_DEVBUF); return 0; } static int slcreate(struct sl_softc *sc) { if (sc->sc_mbuf == NULL) { sc->sc_mbuf = m_gethdr(M_WAIT, MT_DATA); m_clget(sc->sc_mbuf, M_WAIT); } sc->sc_ep = (u_char *)sc->sc_mbuf->m_ext.ext_buf + sc->sc_mbuf->m_ext.ext_size; sc->sc_mp = sc->sc_pktstart = (u_char *)sc->sc_mbuf->m_ext.ext_buf + BUFOFFSET; #ifdef INET sl_compress_init(&sc->sc_comp); #endif return 1; } /* * Line specific open routine. * Attach the given tty to the first available sl unit. */ /* ARGSUSED */ static int slopen(dev_t dev, struct tty *tp) { struct lwp *l = curlwp; /* XXX */ struct sl_softc *sc; int error; error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_INTERFACE_SLIP, KAUTH_REQ_NETWORK_INTERFACE_SLIP_ADD, NULL, NULL, NULL); if (error) return error; if (tp->t_linesw == &slip_disc) return 0; LIST_FOREACH(sc, &sl_softc_list, sc_iflist) if (sc->sc_ttyp == NULL) { sc->sc_si = softint_establish(SOFTINT_NET, slintr, sc); if (sc->sc_si == NULL) return ENOMEM; if (slcreate(sc) == 0) { softint_disestablish(sc->sc_si); return ENOBUFS; } tp->t_sc = (void *)sc; sc->sc_ttyp = tp; sc->sc_if.if_baudrate = tp->t_ospeed; ttylock(tp); tp->t_state |= TS_ISOPEN | TS_XCLUDE; ttyflush(tp, FREAD | FWRITE); /* * make sure tty output queue is large enough * to hold a full-sized packet (including frame * end, and a possible extra frame end). full-sized * packet occupies a max of 2*SLMAX bytes (because * of possible escapes), and add two on for frame * ends. */ if (tp->t_outq.c_cn < 2 * SLMAX + 2) { sc->sc_oldbufsize = tp->t_outq.c_cn; sc->sc_oldbufquot = tp->t_outq.c_cq != 0; clfree(&tp->t_outq); ttyunlock(tp); error = clalloc(&tp->t_outq, 2 * SLMAX + 2, 0); if (error) { softint_disestablish(sc->sc_si); /* * clalloc() might return -1 which * is no good, so we need to return * something else. */ return ENOMEM; /* XXX ?! */ } } else { sc->sc_oldbufsize = sc->sc_oldbufquot = 0; ttyunlock(tp); } return 0; } return ENXIO; } /* * Line specific close routine. * Detach the tty from the sl unit. 
*/ static int slclose(struct tty *tp, int flag) { struct sl_softc *sc; int s; ttywflush(tp); sc = tp->t_sc; if (sc != NULL) { softint_disestablish(sc->sc_si); s = splnet(); if_down(&sc->sc_if); IF_PURGE(&sc->sc_fastq); splx(s); s = spltty(); ttyldisc_release(tp->t_linesw); tp->t_linesw = ttyldisc_default(); tp->t_state = 0; sc->sc_ttyp = NULL; tp->t_sc = NULL; m_freem(sc->sc_mbuf); sc->sc_mbuf = NULL; sc->sc_ep = sc->sc_mp = sc->sc_pktstart = NULL; IF_PURGE(&sc->sc_inq); /* * If necessary, install a new outq buffer of the * appropriate size. */ if (sc->sc_oldbufsize != 0) { clfree(&tp->t_outq); clalloc(&tp->t_outq, sc->sc_oldbufsize, sc->sc_oldbufquot); } splx(s); } return 0; } /* * Line specific (tty) ioctl routine. * Provide a way to get the sl unit number. */ /* ARGSUSED */ static int sltioctl(struct tty *tp, u_long cmd, void *data, int flag, struct lwp *l) { struct sl_softc *sc = (struct sl_softc *)tp->t_sc; /* * XXX * This function can be called without KERNEL_LOCK when caller's * struct cdevsw is set D_MPSAFE. Is KERNEL_LOCK required? */ switch (cmd) { case SLIOCGUNIT: *(int *)data = sc->sc_unit; /* XXX */ break; default: return EPASSTHROUGH; } return 0; } /* * Queue a packet. Start transmission if not active. * Compression happens in slintr(); if we do it here, IP TOS * will cause us to not compress "background" packets, because * ordering gets trashed. It can be done for all packets in slintr(). */ static int sloutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, const struct rtentry *rtp) { struct sl_softc *sc = ifp->if_softc; struct ip *ip; struct ifqueue *ifq = NULL; int s, error; IFQ_CLASSIFY(&ifp->if_snd, m, dst->sa_family); /* * `Cannot happen' (see slioctl). Someday we will extend * the line protocol to support other address families. */ if (dst->sa_family != AF_INET) { printf("%s: af%d not supported\n", sc->sc_if.if_xname, dst->sa_family); m_freem(m); if_statinc(&sc->sc_if, if_noproto); return EAFNOSUPPORT; } if (sc->sc_ttyp == NULL) { m_freem(m); return ENETDOWN; /* sort of */ } if ((sc->sc_ttyp->t_state & TS_CARR_ON) == 0 && (sc->sc_ttyp->t_cflag & CLOCAL) == 0) { m_freem(m); printf("%s: no carrier and not local\n", sc->sc_if.if_xname); return EHOSTUNREACH; } ip = mtod(m, struct ip *); #ifdef INET if (sc->sc_if.if_flags & SC_NOICMP && ip->ip_p == IPPROTO_ICMP) { m_freem(m); return ENETRESET; /* XXX ? */ } #endif s = spltty(); if (sc->sc_oqlen && sc->sc_ttyp->t_outq.c_cc == sc->sc_oqlen) { struct bintime bt; /* if output's been stalled for too long, and restart */ getbinuptime(&bt); bintime_sub(&bt, &sc->sc_lastpacket); if (bt.sec > 0) { sc->sc_otimeout++; slstart(sc->sc_ttyp); } } splx(s); s = splnet(); #ifdef INET if ((ip->ip_tos & IPTOS_LOWDELAY) != 0) ifq = &sc->sc_fastq; #endif if ((error = ifq_enqueue2(ifp, ifq, m)) != 0) { splx(s); return error; } getbinuptime(&sc->sc_lastpacket); splx(s); s = spltty(); if ((sc->sc_oqlen = sc->sc_ttyp->t_outq.c_cc) == 0) slstart(sc->sc_ttyp); splx(s); return 0; } /* * Start output on interface. Get another datagram * to send from the interface queue and map it to * the interface before starting output. */ static int slstart(struct tty *tp) { struct sl_softc *sc = tp->t_sc; /* * If there is more in the output queue, just send it now. * We are being called in lieu of ttstart and must do what * it would. */ if (tp->t_outq.c_cc != 0) { (*tp->t_oproc)(tp); if (tp->t_outq.c_cc > SLIP_HIWAT) return 0; } /* * This happens briefly when the line shuts down. 
*/ if (sc == NULL) return 0; softint_schedule(sc->sc_si); return 0; } /* * Copy data buffer to mbuf chain; add ifnet pointer. */ static struct mbuf * sl_btom(struct sl_softc *sc, int len) { struct mbuf *m; /* * Allocate a new input buffer and swap. */ m = sc->sc_mbuf; MGETHDR(sc->sc_mbuf, M_DONTWAIT, MT_DATA); if (sc->sc_mbuf == NULL) { sc->sc_mbuf = m; return NULL; } MCLGET(sc->sc_mbuf, M_DONTWAIT); if ((sc->sc_mbuf->m_flags & M_EXT) == 0) { m_freem(sc->sc_mbuf); sc->sc_mbuf = m; return NULL; } sc->sc_ep = (u_char *)sc->sc_mbuf->m_ext.ext_buf + sc->sc_mbuf->m_ext.ext_size; m->m_data = sc->sc_pktstart; m->m_pkthdr.len = m->m_len = len; m_set_rcvif(m, &sc->sc_if); return m; } /* * tty interface receiver interrupt. */ static int slinput(int c, struct tty *tp) { struct sl_softc *sc; struct mbuf *m; int len; tk_nin++; sc = (struct sl_softc *)tp->t_sc; if (sc == NULL) return 0; if ((c & TTY_ERRORMASK) || ((tp->t_state & TS_CARR_ON) == 0 && (tp->t_cflag & CLOCAL) == 0)) { sc->sc_flags |= SC_ERROR; return 0; } c &= TTY_CHARMASK; if_statinc(&sc->sc_if, if_ibytes); if (sc->sc_if.if_flags & IFF_DEBUG) { if (c == ABT_ESC) { /* * If we have a previous abort, see whether * this one is within the time limit. */ if (sc->sc_abortcount && time_second >= sc->sc_starttime + ABT_WINDOW) sc->sc_abortcount = 0; /* * If we see an abort after "idle" time, count it; * record when the first abort escape arrived. */ if (time_second >= sc->sc_lasttime + ABT_IDLE) { if (++sc->sc_abortcount == 1) sc->sc_starttime = time_second; if (sc->sc_abortcount >= ABT_COUNT) { slclose(tp, 0); return 0; } } } else sc->sc_abortcount = 0; sc->sc_lasttime = time_second; } switch (c) { case TRANS_FRAME_ESCAPE: if (sc->sc_escape) c = FRAME_ESCAPE; break; case TRANS_FRAME_END: if (sc->sc_escape) c = FRAME_END; break; case FRAME_ESCAPE: sc->sc_escape = 1; return 0; case FRAME_END: if (sc->sc_flags & SC_ERROR) { sc->sc_flags &= ~SC_ERROR; goto newpack; } len = sc->sc_mp - sc->sc_pktstart; if (len < 3) /* less than min length packet - ignore */ goto newpack; m = sl_btom(sc, len); if (m == NULL) goto error; IF_ENQUEUE(&sc->sc_inq, m); softint_schedule(sc->sc_si); goto newpack; } if (sc->sc_mp < sc->sc_ep) { *sc->sc_mp++ = c; sc->sc_escape = 0; return 0; } /* can't put lower; would miss an extra frame */ sc->sc_flags |= SC_ERROR; error: if_statinc(&sc->sc_if, if_ierrors); newpack: sc->sc_mp = sc->sc_pktstart = (u_char *)sc->sc_mbuf->m_ext.ext_buf + BUFOFFSET; sc->sc_escape = 0; return 0; } static void slintr(void *arg) { struct sl_softc *sc = arg; struct tty *tp = sc->sc_ttyp; struct mbuf *m, *n; int s, len; u_char *pktstart; u_char chdr[CHDR_LEN]; KASSERT(tp != NULL); /* * Output processing loop. */ mutex_enter(softnet_lock); for (;;) { struct mbuf *m2; struct mbuf *bpf_m; /* * Do not remove the packet from the queue if it * doesn't look like it will fit into the current * serial output queue. With a packet full of * escapes, this could be as bad as MTU*2+2. */ s = spltty(); if (tp->t_outq.c_cn - tp->t_outq.c_cc < 2 * sc->sc_if.if_mtu + 2) { splx(s); break; } splx(s); /* * Get a packet and send it to the interface. */ s = splnet(); IF_DEQUEUE(&sc->sc_fastq, m); if (m) if_statinc(&sc->sc_if, if_omcasts); /* XXX */ else IFQ_DEQUEUE(&sc->sc_if.if_snd, m); splx(s); if (m == NULL) break; /* * We do the header compression here rather than in * sloutput() because the packets will be out of order * if we are using TOS queueing, and the connection * ID compression will get munged when this happens. 
*/ if (sc->sc_if.if_bpf) { /* * We need to save the TCP/IP header before * it's compressed. To avoid complicated * code, we just make a deep copy of the * entire packet (since this is a serial * line, packets should be short and/or the * copy should be negligible cost compared * to the packet transmission time). */ bpf_m = m_dup(m, 0, M_COPYALL, M_DONTWAIT); } else bpf_m = NULL; #ifdef INET struct ip *ip; if ((ip = mtod(m, struct ip *))->ip_p == IPPROTO_TCP) { if (sc->sc_if.if_flags & SC_COMPRESS) *mtod(m, u_char *) |= sl_compress_tcp(m, ip, &sc->sc_comp, 1); } #endif if (bpf_m) bpf_mtap_sl_out(&sc->sc_if, mtod(m, u_char *), bpf_m); getbinuptime(&sc->sc_lastpacket); s = spltty(); /* * The extra FRAME_END will start up a new packet, * and thus will flush any accumulated garbage. We * do this whenever the line may have been idle for * some time. */ if (tp->t_outq.c_cc == 0) { if_statinc(&sc->sc_if, if_obytes); (void)putc(FRAME_END, &tp->t_outq); } while (m) { u_char *bp, *cp, *ep; bp = cp = mtod(m, u_char *); ep = cp + m->m_len; while (cp < ep) { /* * Find out how many bytes in the * string we can handle without * doing something special. */ while (cp < ep) { switch (*cp++) { case FRAME_ESCAPE: case FRAME_END: cp--; goto out; } } out: if (cp > bp) { /* * Put N characters at once * into the tty output queue. */ if (b_to_q(bp, cp - bp, &tp->t_outq)) break; if_statadd(&sc->sc_if, if_obytes, cp - bp); } /* * If there are characters left in * the mbuf, the first one must be * special.. Put it out in a different * form. */ if (cp < ep) { if (putc(FRAME_ESCAPE, &tp->t_outq)) break; if (putc(*cp++ == FRAME_ESCAPE ? TRANS_FRAME_ESCAPE : TRANS_FRAME_END, &tp->t_outq)) { (void)unputc(&tp->t_outq); break; } if_statadd(&sc->sc_if, if_obytes, 2); } bp = cp; } m = m2 = m_free(m); } if (putc(FRAME_END, &tp->t_outq)) { /* * Not enough room. Remove a char to make * room and end the packet normally. If * you get many collisions (more than one * or two a day), you probably do not have * enough clists and you should increase * "nclist" in param.c */ (void)unputc(&tp->t_outq); (void)putc(FRAME_END, &tp->t_outq); if_statinc(&sc->sc_if, if_collisions); } else { if_statadd2(&sc->sc_if, if_obytes, 1, if_opackets, 1); } /* * We now have characters in the output queue, * kick the serial port. */ (*tp->t_oproc)(tp); splx(s); } /* * Input processing loop. */ for (;;) { s = spltty(); IF_DEQUEUE(&sc->sc_inq, m); splx(s); if (m == NULL) break; pktstart = mtod(m, u_char *); len = m->m_pkthdr.len; if (sc->sc_if.if_bpf) { /* * Save the compressed header, so we * can tack it on later. Note that we * will end up copying garbage in some * cases but this is okay. We remember * where the buffer started so we can * compute the new header length. */ memcpy(chdr, pktstart, CHDR_LEN); } #ifdef INET u_char c; if ((c = (*pktstart & 0xf0)) != (IPVERSION << 4)) { if (c & 0x80) c = TYPE_COMPRESSED_TCP; else if (c == TYPE_UNCOMPRESSED_TCP) *pktstart &= 0x4f; /* XXX */ /* * We've got something that's not an IP * packet. If compression is enabled, * try to decompress it. Otherwise, if * `auto-enable' compression is on and * it's a reasonable packet, decompress * it and then enable compression. * Otherwise, drop it. 
*/ if (sc->sc_if.if_flags & SC_COMPRESS) { len = sl_uncompress_tcp(&pktstart, len, (u_int)c, &sc->sc_comp); if (len <= 0) { m_freem(m); continue; } } else if ((sc->sc_if.if_flags & SC_AUTOCOMP) && c == TYPE_UNCOMPRESSED_TCP && len >= 40) { len = sl_uncompress_tcp(&pktstart, len, (u_int)c, &sc->sc_comp); if (len <= 0) { m_freem(m); continue; } sc->sc_if.if_flags |= SC_COMPRESS; } else { m_freem(m); continue; } } #endif m->m_data = (void *) pktstart; m->m_pkthdr.len = m->m_len = len; if (sc->sc_if.if_bpf) { bpf_mtap_sl_in(&sc->sc_if, chdr, &m); if (m == NULL) continue; } /* * If the packet will fit into a single * header mbuf, try to copy it into one, * to save memory. */ if ((m->m_pkthdr.len < MHLEN) && (n = m_gethdr(M_DONTWAIT, MT_DATA))) { int pktlen; pktlen = m->m_pkthdr.len; m_move_pkthdr(n, m); memcpy(mtod(n, void *), mtod(m, void *), pktlen); n->m_len = m->m_len; m_freem(m); m = n; } if_statinc(&sc->sc_if, if_ipackets); getbinuptime(&sc->sc_lastpacket); #ifdef INET s = splnet(); if (__predict_false(!pktq_enqueue(ip_pktq, m, 0))) { if_statadd2(&sc->sc_if, if_ierrors, 1, if_iqdrops, 1); m_freem(m); } splx(s); #endif } mutex_exit(softnet_lock); } /* * Process an ioctl request. */ static int slioctl(struct ifnet *ifp, u_long cmd, void *data) { struct ifaddr *ifa = (struct ifaddr *)data; struct ifreq *ifr = (struct ifreq *)data; int s = splnet(), error = 0; struct sl_softc *sc = ifp->if_softc; struct ppp_stats *psp; struct ppp_comp_stats *pcp; switch (cmd) { case SIOCINITIFADDR: if (ifa->ifa_addr->sa_family == AF_INET) ifp->if_flags |= IFF_UP; else error = EAFNOSUPPORT; break; case SIOCSIFDSTADDR: if (ifreq_getaddr(cmd, ifr)->sa_family != AF_INET) error = EAFNOSUPPORT; break; case SIOCSIFMTU: if ((ifr->ifr_mtu < 3) || (ifr->ifr_mtu > SLMAX)) { error = EINVAL; break; } /*FALLTHROUGH*/ case SIOCGIFMTU: if ((error = ifioctl_common(&sc->sc_if, cmd, data)) == ENETRESET) error = 0; break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == 0) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifreq_getaddr(cmd, ifr)->sa_family) { #ifdef INET case AF_INET: break; #endif default: error = EAFNOSUPPORT; break; } break; case SIOCGPPPSTATS: { struct if_data ifi; if_export_if_data(&sc->sc_if, &ifi, false); psp = &((struct ifpppstatsreq *) data)->stats; (void)memset(psp, 0, sizeof(*psp)); psp->p.ppp_ibytes = ifi.ifi_ibytes; psp->p.ppp_ipackets = ifi.ifi_ipackets; psp->p.ppp_ierrors = ifi.ifi_ierrors; psp->p.ppp_obytes = ifi.ifi_obytes; psp->p.ppp_opackets = ifi.ifi_opackets; psp->p.ppp_oerrors = ifi.ifi_oerrors; #ifdef INET psp->vj.vjs_packets = sc->sc_comp.sls_packets; psp->vj.vjs_compressed = sc->sc_comp.sls_compressed; psp->vj.vjs_searches = sc->sc_comp.sls_searches; psp->vj.vjs_misses = sc->sc_comp.sls_misses; psp->vj.vjs_uncompressedin = sc->sc_comp.sls_uncompressedin; psp->vj.vjs_compressedin = sc->sc_comp.sls_compressedin; psp->vj.vjs_errorin = sc->sc_comp.sls_errorin; psp->vj.vjs_tossed = sc->sc_comp.sls_tossed; #endif } break; case SIOCGPPPCSTATS: pcp = &((struct ifpppcstatsreq *) data)->stats; (void)memset(pcp, 0, sizeof(*pcp)); break; default: error = ifioctl_common(ifp, cmd, data); break; } splx(s); return error; } /* * Module infrastructure */ #include "if_module.h" IF_MODULE(MODULE_CLASS_DRIVER, sl, "slcompress");
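/*
 * Editor's note: the block below is NOT part of if_sl.c.  It is a minimal,
 * self-contained sketch, added for illustration only, of the SLIP byte
 * stuffing that slintr() performs on output (and slinput() undoes on input).
 * The numeric values of FRAME_END, FRAME_ESCAPE, TRANS_FRAME_END and
 * TRANS_FRAME_ESCAPE are assumed from RFC 1055 and are not shown in this
 * excerpt; the EX_* names and slip_frame_example() are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

#define EX_FRAME_END		0xc0	/* assumed, per RFC 1055 */
#define EX_FRAME_ESCAPE		0xdb	/* assumed, per RFC 1055 */
#define EX_TRANS_FRAME_END	0xdc	/* assumed, per RFC 1055 */
#define EX_TRANS_FRAME_ESCAPE	0xdd	/* assumed, per RFC 1055 */

/*
 * Encode one packet into a SLIP frame: a leading FRAME_END flushes any
 * accumulated line noise, every FRAME_END/FRAME_ESCAPE byte in the payload
 * is escaped, and a trailing FRAME_END terminates the frame.  Returns the
 * number of bytes written, or 0 if 'out' is too small.  The worst case is
 * 2 * len + 2 bytes (every byte escaped plus two frame ends), which is why
 * slopen() above sizes the tty output queue to 2 * SLMAX + 2.
 */
static size_t
slip_frame_example(const uint8_t *in, size_t len, uint8_t *out, size_t outlen)
{
	size_t o = 0;

	if (outlen < 2 * len + 2)
		return 0;
	out[o++] = EX_FRAME_END;
	for (size_t i = 0; i < len; i++) {
		switch (in[i]) {
		case EX_FRAME_END:
			out[o++] = EX_FRAME_ESCAPE;
			out[o++] = EX_TRANS_FRAME_END;
			break;
		case EX_FRAME_ESCAPE:
			out[o++] = EX_FRAME_ESCAPE;
			out[o++] = EX_TRANS_FRAME_ESCAPE;
			break;
		default:
			out[o++] = in[i];
			break;
		}
	}
	out[o++] = EX_FRAME_END;
	return o;
}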
/* $NetBSD: kern_proc.c,v 1.274 2023/10/05 19:41:07 ad Exp $ */ /*- * Copyright (c) 1999, 2006, 2007, 2008, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_proc.c,v 1.274 2023/10/05 19:41:07 ad Exp $"); #ifdef _KERNEL_OPT #include "opt_kstack.h" #include "opt_maxuprc.h" #include "opt_dtrace.h" #include "opt_compat_netbsd32.h" #include "opt_kaslr.h" #endif #if defined(__HAVE_COMPAT_NETBSD32) && !defined(COMPAT_NETBSD32) \ && !defined(_RUMPKERNEL) #define COMPAT_NETBSD32 #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/buf.h> #include <sys/acct.h> #include <sys/wait.h> #include <sys/file.h> #include <ufs/ufs/quota.h> #include <sys/uio.h> #include <sys/pool.h> #include <sys/pset.h> #include <sys/ioctl.h> #include <sys/tty.h> #include <sys/signalvar.h> #include <sys/ras.h> #include <sys/filedesc.h> #include <sys/syscall_stats.h> #include <sys/kauth.h> #include <sys/sleepq.h> #include <sys/atomic.h> #include <sys/kmem.h> #include <sys/namei.h> #include <sys/dtrace_bsd.h> #include <sys/sysctl.h> #include <sys/exec.h> #include <sys/cpu.h> #include <sys/compat_stub.h> #include <sys/futex.h> #include <sys/pserialize.h> #include <uvm/uvm_extern.h> /* * Process lists. */ struct proclist allproc __cacheline_aligned; struct proclist zombproc __cacheline_aligned; kmutex_t proc_lock __cacheline_aligned; static pserialize_t proc_psz; /* * pid to lwp/proc lookup is done by indexing the pid_table array. * Since pid numbers are only allocated when an empty slot * has been found, there is no need to search any lists ever. * (an orphaned pgrp will lock the slot, a session will lock * the pgrp with the same number.) * If the table is too small it is reallocated with twice the * previous size and the entries 'unzipped' into the two halves. * A linked list of free entries is passed through the pt_lwp * field of 'free' items - set odd to be an invalid ptr. Two * additional bits are also used to indicate if the slot is * currently occupied by a proc or lwp, and if the PID is * hidden from certain kinds of lookups. We thus require a * minimum alignment for proc and lwp structures (LWPs are * at least 32-byte aligned). 
*/ struct pid_table { uintptr_t pt_slot; struct pgrp *pt_pgrp; pid_t pt_pid; }; #define PT_F_FREE ((uintptr_t)__BIT(0)) #define PT_F_LWP 0 /* pseudo-flag */ #define PT_F_PROC ((uintptr_t)__BIT(1)) #define PT_F_TYPEBITS (PT_F_FREE|PT_F_PROC) #define PT_F_ALLBITS (PT_F_FREE|PT_F_PROC) #define PT_VALID(s) (((s) & PT_F_FREE) == 0) #define PT_RESERVED(s) ((s) == 0) #define PT_NEXT(s) ((u_int)(s) >> 1) #define PT_SET_FREE(pid) (((pid) << 1) | PT_F_FREE) #define PT_SET_LWP(l) ((uintptr_t)(l)) #define PT_SET_PROC(p) (((uintptr_t)(p)) | PT_F_PROC) #define PT_SET_RESERVED 0 #define PT_GET_LWP(s) ((struct lwp *)((s) & ~PT_F_ALLBITS)) #define PT_GET_PROC(s) ((struct proc *)((s) & ~PT_F_ALLBITS)) #define PT_GET_TYPE(s) ((s) & PT_F_TYPEBITS) #define PT_IS_LWP(s) (PT_GET_TYPE(s) == PT_F_LWP && (s) != 0) #define PT_IS_PROC(s) (PT_GET_TYPE(s) == PT_F_PROC) #define MIN_PROC_ALIGNMENT (PT_F_ALLBITS + 1) /* * Table of process IDs (PIDs). */ static struct pid_table *pid_table __read_mostly; #define INITIAL_PID_TABLE_SIZE (1 << 5) /* Table mask, threshold for growing and number of allocated PIDs. */ static u_int pid_tbl_mask __read_mostly; static u_int pid_alloc_lim __read_mostly; static u_int pid_alloc_cnt __cacheline_aligned; /* Next free, last free and maximum PIDs. */ static u_int next_free_pt __cacheline_aligned; static u_int last_free_pt __cacheline_aligned; static pid_t pid_max __read_mostly; /* Components of the first process -- never freed. */ struct session session0 = { .s_count = 1, .s_sid = 0, }; struct pgrp pgrp0 = { .pg_members = LIST_HEAD_INITIALIZER(&pgrp0.pg_members), .pg_session = &session0, }; filedesc_t filedesc0; struct cwdinfo cwdi0 = { .cwdi_cmask = CMASK, .cwdi_refcnt = 1, }; struct plimit limit0; struct pstats pstat0; struct vmspace vmspace0; struct sigacts sigacts0; struct proc proc0 = { .p_lwps = LIST_HEAD_INITIALIZER(&proc0.p_lwps), .p_sigwaiters = LIST_HEAD_INITIALIZER(&proc0.p_sigwaiters), .p_nlwps = 1, .p_nrlwps = 1, .p_pgrp = &pgrp0, .p_comm = "system", /* * Set P_NOCLDWAIT so that kernel threads are reparented to init(8) * when they exit. init(8) can easily wait them out for us. */ .p_flag = PK_SYSTEM | PK_NOCLDWAIT, .p_stat = SACTIVE, .p_nice = NZERO, .p_emul = &emul_netbsd, .p_cwdi = &cwdi0, .p_limit = &limit0, .p_fd = &filedesc0, .p_vmspace = &vmspace0, .p_stats = &pstat0, .p_sigacts = &sigacts0, #ifdef PROC0_MD_INITIALIZERS PROC0_MD_INITIALIZERS #endif }; kauth_cred_t cred0; static const int nofile = NOFILE; static const int maxuprc = MAXUPRC; static int sysctl_doeproc(SYSCTLFN_PROTO); static int sysctl_kern_proc_args(SYSCTLFN_PROTO); static int sysctl_security_expose_address(SYSCTLFN_PROTO); #ifdef KASLR static int kern_expose_address = 0; #else static int kern_expose_address = 1; #endif /* * The process list descriptors, used during pid allocation and * by sysctl. No locking on this data structure is needed since * it is completely static. 
*/ const struct proclist_desc proclists[] = { { &allproc }, { &zombproc }, { NULL }, }; static struct pgrp * pg_remove(pid_t); static void pg_delete(pid_t); static void orphanpg(struct pgrp *); static specificdata_domain_t proc_specificdata_domain; static pool_cache_t proc_cache; static kauth_listener_t proc_listener; static void fill_proc(const struct proc *, struct proc *, bool); static int fill_pathname(struct lwp *, pid_t, void *, size_t *); static int fill_cwd(struct lwp *, pid_t, void *, size_t *); static int proc_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { struct proc *p; int result; result = KAUTH_RESULT_DEFER; p = arg0; switch (action) { case KAUTH_PROCESS_CANSEE: { enum kauth_process_req req; req = (enum kauth_process_req)(uintptr_t)arg1; switch (req) { case KAUTH_REQ_PROCESS_CANSEE_ARGS: case KAUTH_REQ_PROCESS_CANSEE_ENTRY: case KAUTH_REQ_PROCESS_CANSEE_OPENFILES: case KAUTH_REQ_PROCESS_CANSEE_EPROC: result = KAUTH_RESULT_ALLOW; break; case KAUTH_REQ_PROCESS_CANSEE_ENV: if (kauth_cred_getuid(cred) != kauth_cred_getuid(p->p_cred) || kauth_cred_getuid(cred) != kauth_cred_getsvuid(p->p_cred)) break; result = KAUTH_RESULT_ALLOW; break; case KAUTH_REQ_PROCESS_CANSEE_KPTR: if (!kern_expose_address) break; if (kern_expose_address == 1 && !(p->p_flag & PK_KMEM)) break; result = KAUTH_RESULT_ALLOW; break; default: break; } break; } case KAUTH_PROCESS_FORK: { int lnprocs = (int)(unsigned long)arg2; /* * Don't allow a nonprivileged user to use the last few * processes. The variable lnprocs is the current number of * processes, maxproc is the limit. */ if (__predict_false((lnprocs >= maxproc - 5))) break; result = KAUTH_RESULT_ALLOW; break; } case KAUTH_PROCESS_CORENAME: case KAUTH_PROCESS_STOPFLAG: if (proc_uidmatch(cred, p->p_cred) == 0) result = KAUTH_RESULT_ALLOW; break; default: break; } return result; } static int proc_ctor(void *arg __unused, void *obj, int flags __unused) { struct proc *p = obj; memset(p, 0, sizeof(*p)); klist_init(&p->p_klist); /* * There is no need for a proc_dtor() to do a klist_fini(), * since knote_proc_exit() ensures that p->p_klist is empty * when a process exits. */ return 0; } static pid_t proc_alloc_pid_slot(struct proc *, uintptr_t); /* * Initialize global process hashing structures. */ void procinit(void) { const struct proclist_desc *pd; u_int i; #define LINK_EMPTY ((PID_MAX + INITIAL_PID_TABLE_SIZE) & ~(INITIAL_PID_TABLE_SIZE - 1)) for (pd = proclists; pd->pd_list != NULL; pd++) LIST_INIT(pd->pd_list); mutex_init(&proc_lock, MUTEX_DEFAULT, IPL_NONE); proc_psz = pserialize_create(); pid_table = kmem_alloc(INITIAL_PID_TABLE_SIZE * sizeof(struct pid_table), KM_SLEEP); pid_tbl_mask = INITIAL_PID_TABLE_SIZE - 1; pid_max = PID_MAX; /* Set free list running through table... Preset 'use count' above PID_MAX so we allocate pid 1 next. */ for (i = 0; i <= pid_tbl_mask; i++) { pid_table[i].pt_slot = PT_SET_FREE(LINK_EMPTY + i + 1); pid_table[i].pt_pgrp = 0; pid_table[i].pt_pid = 0; } /* slot 0 is just grabbed */ next_free_pt = 1; /* Need to fix last entry. */ last_free_pt = pid_tbl_mask; pid_table[last_free_pt].pt_slot = PT_SET_FREE(LINK_EMPTY); /* point at which we grow table - to avoid reusing pids too often */ pid_alloc_lim = pid_tbl_mask - 1; #undef LINK_EMPTY /* Reserve PID 1 for init(8). 
*/ /* XXX slightly gross */ mutex_enter(&proc_lock); if (proc_alloc_pid_slot(&proc0, PT_SET_RESERVED) != 1) panic("failed to reserve PID 1 for init(8)"); mutex_exit(&proc_lock); proc_specificdata_domain = specificdata_domain_create(); KASSERT(proc_specificdata_domain != NULL); size_t proc_alignment = coherency_unit; if (proc_alignment < MIN_PROC_ALIGNMENT) proc_alignment = MIN_PROC_ALIGNMENT; proc_cache = pool_cache_init(sizeof(struct proc), proc_alignment, 0, 0, "procpl", NULL, IPL_NONE, proc_ctor, NULL, NULL); proc_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS, proc_listener_cb, NULL); } void procinit_sysctl(void) { static struct sysctllog *clog; sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "expose_address", SYSCTL_DESCR("Enable exposing kernel addresses"), sysctl_security_expose_address, 0, &kern_expose_address, 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "proc", SYSCTL_DESCR("System-wide process information"), sysctl_doeproc, 0, NULL, 0, CTL_KERN, KERN_PROC, CTL_EOL); sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "proc2", SYSCTL_DESCR("Machine-independent process information"), sysctl_doeproc, 0, NULL, 0, CTL_KERN, KERN_PROC2, CTL_EOL); sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "proc_args", SYSCTL_DESCR("Process argument information"), sysctl_kern_proc_args, 0, NULL, 0, CTL_KERN, KERN_PROC_ARGS, CTL_EOL); /* "nodes" under these: KERN_PROC_ALL KERN_PROC_PID pid KERN_PROC_PGRP pgrp KERN_PROC_SESSION sess KERN_PROC_TTY tty KERN_PROC_UID uid KERN_PROC_RUID uid KERN_PROC_GID gid KERN_PROC_RGID gid all in all, probably not worth the effort... */ } /* * Initialize process 0. */ void proc0_init(void) { struct proc *p; struct pgrp *pg; struct rlimit *rlim; rlim_t lim; int i; p = &proc0; pg = &pgrp0; mutex_init(&p->p_stmutex, MUTEX_DEFAULT, IPL_HIGH); mutex_init(&p->p_auxlock, MUTEX_DEFAULT, IPL_NONE); p->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); rw_init(&p->p_reflock); cv_init(&p->p_waitcv, "wait"); cv_init(&p->p_lwpcv, "lwpwait"); LIST_INSERT_HEAD(&p->p_lwps, &lwp0, l_sibling); KASSERT(lwp0.l_lid == 0); pid_table[lwp0.l_lid].pt_slot = PT_SET_LWP(&lwp0); LIST_INSERT_HEAD(&allproc, p, p_list); pid_table[lwp0.l_lid].pt_pgrp = pg; LIST_INSERT_HEAD(&pg->pg_members, p, p_pglist); #ifdef __HAVE_SYSCALL_INTERN (*p->p_emul->e_syscall_intern)(p); #endif /* Create credentials. */ cred0 = kauth_cred_alloc(); p->p_cred = cred0; /* Create the CWD info. */ rw_init(&cwdi0.cwdi_lock); /* Create the limits structures. */ mutex_init(&limit0.pl_lock, MUTEX_DEFAULT, IPL_NONE); rlim = limit0.pl_rlimit; for (i = 0; i < __arraycount(limit0.pl_rlimit); i++) { rlim[i].rlim_cur = RLIM_INFINITY; rlim[i].rlim_max = RLIM_INFINITY; } rlim[RLIMIT_NOFILE].rlim_max = maxfiles; rlim[RLIMIT_NOFILE].rlim_cur = maxfiles < nofile ? maxfiles : nofile; rlim[RLIMIT_NPROC].rlim_max = maxproc; rlim[RLIMIT_NPROC].rlim_cur = maxproc < maxuprc ? maxproc : maxuprc; lim = MIN(VM_MAXUSER_ADDRESS, ctob((rlim_t)uvm_availmem(false))); rlim[RLIMIT_RSS].rlim_max = lim; rlim[RLIMIT_MEMLOCK].rlim_max = lim; rlim[RLIMIT_MEMLOCK].rlim_cur = lim / 3; rlim[RLIMIT_NTHR].rlim_max = maxlwp; rlim[RLIMIT_NTHR].rlim_cur = maxlwp / 2; /* Note that default core name has zero length. */ limit0.pl_corename = defcorename; limit0.pl_cnlen = 0; limit0.pl_refcnt = 1; limit0.pl_writeable = false; limit0.pl_sv_limit = NULL; /* Configure virtual memory system, set vm rlimits. 
*/ uvm_init_limits(p); /* Initialize file descriptor table for proc0. */ fd_init(&filedesc0); /* * Initialize proc0's vmspace, which uses the kernel pmap. * All kernel processes (which never have user space mappings) * share proc0's vmspace, and thus, the kernel pmap. */ uvmspace_init(&vmspace0, pmap_kernel(), round_page(VM_MIN_ADDRESS), trunc_page(VM_MAXUSER_ADDRESS), #ifdef __USE_TOPDOWN_VM true #else false #endif ); /* Initialize signal state for proc0. XXX IPL_SCHED */ mutex_init(&p->p_sigacts->sa_mutex, MUTEX_DEFAULT, IPL_SCHED); siginit(p); proc_initspecific(p); kdtrace_proc_ctor(NULL, p); } /* * Session reference counting. */ void proc_sesshold(struct session *ss) { KASSERT(mutex_owned(&proc_lock)); ss->s_count++; } void proc_sessrele(struct session *ss) { struct pgrp *pg; KASSERT(mutex_owned(&proc_lock)); KASSERT(ss->s_count > 0); /* * We keep the pgrp with the same id as the session in order to * stop a process being given the same pid. Since the pgrp holds * a reference to the session, it must be a 'zombie' pgrp by now. */ if (--ss->s_count == 0) { pg = pg_remove(ss->s_sid); } else { pg = NULL; ss = NULL; } mutex_exit(&proc_lock); if (pg) kmem_free(pg, sizeof(struct pgrp)); if (ss) kmem_free(ss, sizeof(struct session)); } /* * Check that the specified process group is in the session of the * specified process. * Treats -ve ids as process ids. * Used to validate TIOCSPGRP requests. */ int pgid_in_session(struct proc *p, pid_t pg_id) { struct pgrp *pgrp; struct session *session; int error; if (pg_id == INT_MIN) return EINVAL; mutex_enter(&proc_lock); if (pg_id < 0) { struct proc *p1 = proc_find(-pg_id); if (p1 == NULL) { error = EINVAL; goto fail; } pgrp = p1->p_pgrp; } else { pgrp = pgrp_find(pg_id); if (pgrp == NULL) { error = EINVAL; goto fail; } } session = pgrp->pg_session; error = (session != p->p_pgrp->pg_session) ? EPERM : 0; fail: mutex_exit(&proc_lock); return error; } /* * p_inferior: is p an inferior of q? */ static inline bool p_inferior(struct proc *p, struct proc *q) { KASSERT(mutex_owned(&proc_lock)); for (; p != q; p = p->p_pptr) if (p->p_pid == 0) return false; return true; } /* * proc_find_lwp: locate an lwp in said proc by the ID. * * => Must be called with p::p_lock held. * => LSIDL lwps are not returned because they are only partially * constructed while occupying the slot. * => Callers need to be careful about lwp::l_stat of the returned * lwp. */ struct lwp * proc_find_lwp(proc_t *p, pid_t pid) { struct pid_table *pt; unsigned pt_mask; struct lwp *l = NULL; uintptr_t slot; int s; KASSERT(mutex_owned(p->p_lock)); /* * Look in the pid_table. This is done unlocked inside a * pserialize read section covering pid_table's memory * allocation only, so take care to read things in the correct * order: * * 1. First read the table mask -- this only ever increases, in * expand_pid_table, so a stale value is safely * conservative. * * 2. Next read the pid table -- this is always set _before_ * the mask increases, so if we see a new table and stale * mask, the mask is still valid for the table. */ s = pserialize_read_enter(); pt_mask = atomic_load_acquire(&pid_tbl_mask); pt = &atomic_load_consume(&pid_table)[pid & pt_mask]; slot = atomic_load_consume(&pt->pt_slot); if (__predict_false(!PT_IS_LWP(slot))) { pserialize_read_exit(s); return NULL; } /* * Check to see if the LWP is from the correct process. 
We won't * see entries in pid_table from a prior process that also used "p", * by virtue of the fact that allocating "p" means all prior updates * to dependant data structures are visible to this thread. */ l = PT_GET_LWP(slot); if (__predict_false(atomic_load_relaxed(&l->l_proc) != p)) { pserialize_read_exit(s); return NULL; } /* * We now know that p->p_lock holds this LWP stable. * * If the status is not LSIDL, it means the LWP is intended to be * findable by LID and l_lid cannot change behind us. * * No need to acquire the LWP's lock to check for LSIDL, as * p->p_lock must be held to transition in and out of LSIDL. * Any other observed state of is no particular interest. */ pserialize_read_exit(s); return l->l_stat != LSIDL && l->l_lid == pid ? l : NULL; } /* * proc_find_lwp_unlocked: locate an lwp in said proc by the ID. * * => Called in a pserialize read section with no locks held. * => LSIDL lwps are not returned because they are only partially * constructed while occupying the slot. * => Callers need to be careful about lwp::l_stat of the returned * lwp. * => If an LWP is found, it's returned locked. */ struct lwp * proc_find_lwp_unlocked(proc_t *p, pid_t pid) { struct pid_table *pt; unsigned pt_mask; struct lwp *l = NULL; uintptr_t slot; KASSERT(pserialize_in_read_section()); /* * Look in the pid_table. This is done unlocked inside a * pserialize read section covering pid_table's memory * allocation only, so take care to read things in the correct * order: * * 1. First read the table mask -- this only ever increases, in * expand_pid_table, so a stale value is safely * conservative. * * 2. Next read the pid table -- this is always set _before_ * the mask increases, so if we see a new table and stale * mask, the mask is still valid for the table. */ pt_mask = atomic_load_acquire(&pid_tbl_mask); pt = &atomic_load_consume(&pid_table)[pid & pt_mask]; slot = atomic_load_consume(&pt->pt_slot); if (__predict_false(!PT_IS_LWP(slot))) { return NULL; } /* * Lock the LWP we found to get it stable. If it's embryonic or * reaped (LSIDL) then none of the other fields can safely be * checked. */ l = PT_GET_LWP(slot); lwp_lock(l); if (__predict_false(l->l_stat == LSIDL)) { lwp_unlock(l); return NULL; } /* * l_proc and l_lid are now known stable because the LWP is not * LSIDL, so check those fields too to make sure we found the * right thing. */ if (__predict_false(l->l_proc != p || l->l_lid != pid)) { lwp_unlock(l); return NULL; } /* Everything checks out, return it locked. */ return l; } /* * proc_find_lwp_acquire_proc: locate an lwp and acquire a lock * on its containing proc. * * => Similar to proc_find_lwp(), but does not require you to have * the proc a priori. * => Also returns proc * to caller, with p::p_lock held. * => Same caveats apply. */ struct lwp * proc_find_lwp_acquire_proc(pid_t pid, struct proc **pp) { struct pid_table *pt; struct proc *p = NULL; struct lwp *l = NULL; uintptr_t slot; KASSERT(pp != NULL); mutex_enter(&proc_lock); pt = &pid_table[pid & pid_tbl_mask]; slot = pt->pt_slot; if (__predict_true(PT_IS_LWP(slot) && pt->pt_pid == pid)) { l = PT_GET_LWP(slot); p = l->l_proc; mutex_enter(p->p_lock); if (__predict_false(l->l_stat == LSIDL)) { mutex_exit(p->p_lock); l = NULL; p = NULL; } } mutex_exit(&proc_lock); KASSERT(p == NULL || mutex_owned(p->p_lock)); *pp = p; return l; } /* * proc_find_raw_pid_table_locked: locate a process by the ID. * * => Must be called with proc_lock held. 
*/ static proc_t * proc_find_raw_pid_table_locked(pid_t pid, bool any_lwpid) { struct pid_table *pt; proc_t *p = NULL; uintptr_t slot; /* No - used by DDB. KASSERT(mutex_owned(&proc_lock)); */ pt = &pid_table[pid & pid_tbl_mask]; slot = pt->pt_slot; if (__predict_true(PT_IS_LWP(slot) && pt->pt_pid == pid)) { /* * When looking up processes, require a direct match * on the PID assigned to the proc, not just one of * its LWPs. * * N.B. We require lwp::l_proc of LSIDL LWPs to be * valid here. */ p = PT_GET_LWP(slot)->l_proc; if (__predict_false(p->p_pid != pid && !any_lwpid)) p = NULL; } else if (PT_IS_PROC(slot) && pt->pt_pid == pid) { p = PT_GET_PROC(slot); } return p; } proc_t * proc_find_raw(pid_t pid) { return proc_find_raw_pid_table_locked(pid, false); } static proc_t * proc_find_internal(pid_t pid, bool any_lwpid) { proc_t *p; KASSERT(mutex_owned(&proc_lock)); p = proc_find_raw_pid_table_locked(pid, any_lwpid); if (__predict_false(p == NULL)) { return NULL; } /* * Only allow live processes to be found by PID. * XXX: p_stat might change, since proc unlocked. */ if (__predict_true(p->p_stat == SACTIVE || p->p_stat == SSTOP)) { return p; } return NULL; } proc_t * proc_find(pid_t pid) { return proc_find_internal(pid, false); } proc_t * proc_find_lwpid(pid_t pid) { return proc_find_internal(pid, true); } /* * pgrp_find: locate a process group by the ID. * * => Must be called with proc_lock held. */ struct pgrp * pgrp_find(pid_t pgid) { struct pgrp *pg; KASSERT(mutex_owned(&proc_lock)); pg = pid_table[pgid & pid_tbl_mask].pt_pgrp; /* * Cannot look up a process group that only exists because the * session has not died yet (traditional). */ if (pg == NULL || pg->pg_id != pgid || LIST_EMPTY(&pg->pg_members)) { return NULL; } return pg; } static void expand_pid_table(void) { size_t pt_size, tsz; struct pid_table *n_pt, *new_pt; uintptr_t slot; struct pgrp *pgrp; pid_t pid, rpid; u_int i; uint new_pt_mask; KASSERT(mutex_owned(&proc_lock)); /* Unlock the pid_table briefly to allocate memory. */ pt_size = pid_tbl_mask + 1; mutex_exit(&proc_lock); tsz = pt_size * 2 * sizeof(struct pid_table); new_pt = kmem_alloc(tsz, KM_SLEEP); new_pt_mask = pt_size * 2 - 1; /* XXX For now. The pratical limit is much lower anyway. */ KASSERT(new_pt_mask <= FUTEX_TID_MASK); mutex_enter(&proc_lock); if (pt_size != pid_tbl_mask + 1) { /* Another process beat us to it... */ mutex_exit(&proc_lock); kmem_free(new_pt, tsz); goto out; } /* * Copy entries from old table into new one. * If 'pid' is 'odd' we need to place in the upper half, * even pid's to the lower half. * Free items stay in the low half so we don't have to * fixup the reference to them. * We stuff free items on the front of the freelist * because we can't write to unmodified entries. * Processing the table backwards maintains a semblance * of issuing pid numbers that increase with time. 
*/ i = pt_size - 1; n_pt = new_pt + i; for (; ; i--, n_pt--) { slot = pid_table[i].pt_slot; pgrp = pid_table[i].pt_pgrp; if (!PT_VALID(slot)) { /* Up 'use count' so that link is valid */ pid = (PT_NEXT(slot) + pt_size) & ~pt_size; rpid = 0; slot = PT_SET_FREE(pid); if (pgrp) pid = pgrp->pg_id; } else { pid = pid_table[i].pt_pid; rpid = pid; } /* Save entry in appropriate half of table */ n_pt[pid & pt_size].pt_slot = slot; n_pt[pid & pt_size].pt_pgrp = pgrp; n_pt[pid & pt_size].pt_pid = rpid; /* Put other piece on start of free list */ pid = (pid ^ pt_size) & ~pid_tbl_mask; n_pt[pid & pt_size].pt_slot = PT_SET_FREE((pid & ~pt_size) | next_free_pt); n_pt[pid & pt_size].pt_pgrp = 0; n_pt[pid & pt_size].pt_pid = 0; next_free_pt = i | (pid & pt_size); if (i == 0) break; } /* Save old table size and switch tables */ tsz = pt_size * sizeof(struct pid_table); n_pt = pid_table; atomic_store_release(&pid_table, new_pt); KASSERT(new_pt_mask >= pid_tbl_mask); atomic_store_release(&pid_tbl_mask, new_pt_mask); /* * pid_max starts as PID_MAX (= 30000), once we have 16384 * allocated pids we need it to be larger! */ if (pid_tbl_mask > PID_MAX) { pid_max = pid_tbl_mask * 2 + 1; pid_alloc_lim |= pid_alloc_lim << 1; } else pid_alloc_lim <<= 1; /* doubles number of free slots... */ mutex_exit(&proc_lock); /* * Make sure that unlocked access to the old pid_table is complete * and then free it. */ pserialize_perform(proc_psz); kmem_free(n_pt, tsz); out: /* Return with proc_lock held again. */ mutex_enter(&proc_lock); } struct proc * proc_alloc(void) { struct proc *p; p = pool_cache_get(proc_cache, PR_WAITOK); p->p_stat = SIDL; /* protect against others */ proc_initspecific(p); kdtrace_proc_ctor(NULL, p); /* * Allocate a placeholder in the pid_table. When we create the * first LWP for this process, it will take ownership of the * slot. */ if (__predict_false(proc_alloc_pid(p) == -1)) { /* Allocating the PID failed; unwind. */ proc_finispecific(p); proc_free_mem(p); p = NULL; } return p; } /* * proc_alloc_pid_slot: allocate PID and record the occupant so that * proc_find_raw() can find it by the PID. */ static pid_t __noinline proc_alloc_pid_slot(struct proc *p, uintptr_t slot) { struct pid_table *pt; pid_t pid; int nxt; KASSERT(mutex_owned(&proc_lock)); for (;;expand_pid_table()) { if (__predict_false(pid_alloc_cnt >= pid_alloc_lim)) { /* ensure pids cycle through 2000+ values */ continue; } /* * The first user process *must* be given PID 1. * It has already been reserved for us. This * will be coming in from the proc_alloc() call * above, and the entry will be usurped later when * the first user LWP is created. * XXX this is slightly gross. */ if (__predict_false(PT_RESERVED(pid_table[1].pt_slot) && p != &proc0)) { KASSERT(PT_IS_PROC(slot)); pt = &pid_table[1]; pt->pt_slot = slot; return 1; } pt = &pid_table[next_free_pt]; #ifdef DIAGNOSTIC if (__predict_false(PT_VALID(pt->pt_slot) || pt->pt_pgrp)) panic("proc_alloc: slot busy"); #endif nxt = PT_NEXT(pt->pt_slot); if (nxt & pid_tbl_mask) break; /* Table full - expand (NB last entry not used....) */ } /* pid is 'saved use count' + 'size' + entry */ pid = (nxt & ~pid_tbl_mask) + pid_tbl_mask + 1 + next_free_pt; if ((uint)pid > (uint)pid_max) pid &= pid_tbl_mask; next_free_pt = nxt & pid_tbl_mask; /* XXX For now. The practical limit is much lower anyway.
*/ KASSERT(pid <= FUTEX_TID_MASK); /* Grab table slot */ pt->pt_slot = slot; KASSERT(pt->pt_pid == 0); pt->pt_pid = pid; pid_alloc_cnt++; return pid; } pid_t proc_alloc_pid(struct proc *p) { pid_t pid; KASSERT((((uintptr_t)p) & PT_F_ALLBITS) == 0); KASSERT(p->p_stat == SIDL); mutex_enter(&proc_lock); pid = proc_alloc_pid_slot(p, PT_SET_PROC(p)); if (pid != -1) p->p_pid = pid; mutex_exit(&proc_lock); return pid; } pid_t proc_alloc_lwpid(struct proc *p, struct lwp *l) { struct pid_table *pt; pid_t pid; KASSERT((((uintptr_t)l) & PT_F_ALLBITS) == 0); KASSERT(l->l_proc == p); KASSERT(l->l_stat == LSIDL); /* * For unlocked lookup in proc_find_lwp(), make sure l->l_proc * is globally visible before the LWP becomes visible via the * pid_table. */ #ifndef __HAVE_ATOMIC_AS_MEMBAR membar_producer(); #endif /* * If the slot for p->p_pid currently points to the proc, * then we should usurp this ID for the LWP. This happens * at least once per process (for the first LWP), and can * happen again if the first LWP for a process exits and * before the process creates another. */ mutex_enter(&proc_lock); pid = p->p_pid; pt = &pid_table[pid & pid_tbl_mask]; KASSERT(pt->pt_pid == pid); if (PT_IS_PROC(pt->pt_slot)) { KASSERT(PT_GET_PROC(pt->pt_slot) == p); l->l_lid = pid; pt->pt_slot = PT_SET_LWP(l); } else { /* Need to allocate a new slot. */ pid = proc_alloc_pid_slot(p, PT_SET_LWP(l)); if (pid != -1) l->l_lid = pid; } mutex_exit(&proc_lock); return pid; } static void __noinline proc_free_pid_internal(pid_t pid, uintptr_t type __diagused) { struct pid_table *pt; KASSERT(mutex_owned(&proc_lock)); pt = &pid_table[pid & pid_tbl_mask]; KASSERT(PT_GET_TYPE(pt->pt_slot) == type); KASSERT(pt->pt_pid == pid); /* save pid use count in slot */ pt->pt_slot = PT_SET_FREE(pid & ~pid_tbl_mask); pt->pt_pid = 0; if (pt->pt_pgrp == NULL) { /* link last freed entry onto ours */ pid &= pid_tbl_mask; pt = &pid_table[last_free_pt]; pt->pt_slot = PT_SET_FREE(PT_NEXT(pt->pt_slot) | pid); pt->pt_pid = 0; last_free_pt = pid; pid_alloc_cnt--; } } /* * Free a process id - called from proc_free (in kern_exit.c) * * Called with the proc_lock held. */ void proc_free_pid(pid_t pid) { KASSERT(mutex_owned(&proc_lock)); proc_free_pid_internal(pid, PT_F_PROC); } /* * Free a process id used by an LWP. If this was the process's * first LWP, we convert the slot to point to the process; the * entry will get cleaned up later when the process finishes exiting. * * If not, then it's the same as proc_free_pid(). */ void proc_free_lwpid(struct proc *p, pid_t pid) { KASSERT(mutex_owned(&proc_lock)); if (__predict_true(p->p_pid == pid)) { struct pid_table *pt; pt = &pid_table[pid & pid_tbl_mask]; KASSERT(pt->pt_pid == pid); KASSERT(PT_IS_LWP(pt->pt_slot)); KASSERT(PT_GET_LWP(pt->pt_slot)->l_proc == p); pt->pt_slot = PT_SET_PROC(p); return; } proc_free_pid_internal(pid, PT_F_LWP); } void proc_free_mem(struct proc *p) { kdtrace_proc_dtor(NULL, p); pool_cache_put(proc_cache, p); } /* * proc_enterpgrp: move p to a new or existing process group (and session). * * If we are creating a new pgrp, the pgid should equal * the calling process' pid. * If is only valid to enter a process group that is in the session * of the process. * Also mksess should only be set if we are creating a process group * * Only called from sys_setsid, sys_setpgid and posix_spawn/spawn_return. 
*/ int proc_enterpgrp(struct proc *curp, pid_t pid, pid_t pgid, bool mksess) { struct pgrp *new_pgrp, *pgrp; struct session *sess; struct proc *p; int rval; pid_t pg_id = NO_PGID; /* Allocate data areas we might need before doing any validity checks */ sess = mksess ? kmem_alloc(sizeof(*sess), KM_SLEEP) : NULL; new_pgrp = kmem_alloc(sizeof(*new_pgrp), KM_SLEEP); mutex_enter(&proc_lock); rval = EPERM; /* most common error (to save typing) */ /* Check pgrp exists or can be created */ pgrp = pid_table[pgid & pid_tbl_mask].pt_pgrp; if (pgrp != NULL && pgrp->pg_id != pgid) goto done; /* Can only set another process under restricted circumstances. */ if (pid != curp->p_pid) { /* Must exist and be one of our children... */ p = proc_find_internal(pid, false); if (p == NULL || !p_inferior(p, curp)) { rval = ESRCH; goto done; } /* ... in the same session... */ if (sess != NULL || p->p_session != curp->p_session) goto done; /* ... existing pgid must be in same session ... */ if (pgrp != NULL && pgrp->pg_session != p->p_session) goto done; /* ... and not done an exec. */ if (p->p_flag & PK_EXEC) { rval = EACCES; goto done; } } else { /* ... setsid() cannot re-enter a pgrp */ if (mksess && (curp->p_pgid == curp->p_pid || pgrp_find(curp->p_pid))) goto done; p = curp; } /* Changing the process group/session of a session leader is definitely off limits. */ if (SESS_LEADER(p)) { if (sess == NULL && p->p_pgrp == pgrp) /* unless it's a definite noop */ rval = 0; goto done; } /* Can only create a process group with id of process */ if (pgrp == NULL && pgid != pid) goto done; /* Can only create a session if creating pgrp */ if (sess != NULL && pgrp != NULL) goto done; /* Check we allocated memory for a pgrp... */ if (pgrp == NULL && new_pgrp == NULL) goto done; /* Don't attach to 'zombie' pgrp */ if (pgrp != NULL && LIST_EMPTY(&pgrp->pg_members)) goto done; /* Expect to succeed now */ rval = 0; if (pgrp == p->p_pgrp) /* nothing to do */ goto done; /* Ok all setup, link up required structures */ if (pgrp == NULL) { pgrp = new_pgrp; new_pgrp = NULL; if (sess != NULL) { sess->s_sid = p->p_pid; sess->s_leader = p; sess->s_count = 1; sess->s_ttyvp = NULL; sess->s_ttyp = NULL; sess->s_flags = p->p_session->s_flags & ~S_LOGIN_SET; memcpy(sess->s_login, p->p_session->s_login, sizeof(sess->s_login)); p->p_lflag &= ~PL_CONTROLT; } else { sess = p->p_pgrp->pg_session; proc_sesshold(sess); } pgrp->pg_session = sess; sess = NULL; pgrp->pg_id = pgid; LIST_INIT(&pgrp->pg_members); #ifdef DIAGNOSTIC if (__predict_false(pid_table[pgid & pid_tbl_mask].pt_pgrp)) panic("enterpgrp: pgrp table slot in use"); if (__predict_false(mksess && p != curp)) panic("enterpgrp: mksession and p != curproc"); #endif pid_table[pgid & pid_tbl_mask].pt_pgrp = pgrp; pgrp->pg_jobc = 0; } /* * Adjust eligibility of affected pgrps to participate in job control. * Increment eligibility counts before decrementing, otherwise we * could reach 0 spuriously during the first call. */ fixjobc(p, pgrp, 1); fixjobc(p, p->p_pgrp, 0); /* Interlock with ttread(). */ mutex_spin_enter(&tty_lock); /* Move process to requested group. */ LIST_REMOVE(p, p_pglist); if (LIST_EMPTY(&p->p_pgrp->pg_members)) /* defer delete until we've dumped the lock */ pg_id = p->p_pgrp->pg_id; p->p_pgrp = pgrp; LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); /* Done with the swap; we can release the tty mutex. */ mutex_spin_exit(&tty_lock); done: if (pg_id != NO_PGID) { /* Releases proc_lock. 
*/ pg_delete(pg_id); } else { mutex_exit(&proc_lock); } if (sess != NULL) kmem_free(sess, sizeof(*sess)); if (new_pgrp != NULL) kmem_free(new_pgrp, sizeof(*new_pgrp)); #ifdef DEBUG_PGRP if (__predict_false(rval)) printf("enterpgrp(%d,%d,%d), curproc %d, rval %d\n", pid, pgid, mksess, curp->p_pid, rval); #endif return rval; } /* * proc_leavepgrp: remove a process from its process group. * => must be called with the proc_lock held, which will be released; */ void proc_leavepgrp(struct proc *p) { struct pgrp *pgrp; KASSERT(mutex_owned(&proc_lock)); /* Interlock with ttread() */ mutex_spin_enter(&tty_lock); pgrp = p->p_pgrp; LIST_REMOVE(p, p_pglist); p->p_pgrp = NULL; mutex_spin_exit(&tty_lock); if (LIST_EMPTY(&pgrp->pg_members)) { /* Releases proc_lock. */ pg_delete(pgrp->pg_id); } else { mutex_exit(&proc_lock); } } /* * pg_remove: remove a process group from the table. * => must be called with the proc_lock held; * => returns process group to free; */ static struct pgrp * pg_remove(pid_t pg_id) { struct pgrp *pgrp; struct pid_table *pt; KASSERT(mutex_owned(&proc_lock)); pt = &pid_table[pg_id & pid_tbl_mask]; pgrp = pt->pt_pgrp; KASSERT(pgrp != NULL); KASSERT(pgrp->pg_id == pg_id); KASSERT(LIST_EMPTY(&pgrp->pg_members)); pt->pt_pgrp = NULL; if (!PT_VALID(pt->pt_slot)) { /* Orphaned pgrp, put slot onto free list. */ KASSERT((PT_NEXT(pt->pt_slot) & pid_tbl_mask) == 0); pg_id &= pid_tbl_mask; pt = &pid_table[last_free_pt]; pt->pt_slot = PT_SET_FREE(PT_NEXT(pt->pt_slot) | pg_id); KASSERT(pt->pt_pid == 0); last_free_pt = pg_id; pid_alloc_cnt--; } return pgrp; } /* * pg_delete: delete and free a process group. * => must be called with the proc_lock held, which will be released. */ static void pg_delete(pid_t pg_id) { struct pgrp *pg; struct tty *ttyp; struct session *ss; KASSERT(mutex_owned(&proc_lock)); pg = pid_table[pg_id & pid_tbl_mask].pt_pgrp; if (pg == NULL || pg->pg_id != pg_id || !LIST_EMPTY(&pg->pg_members)) { mutex_exit(&proc_lock); return; } ss = pg->pg_session; /* Remove reference (if any) from tty to this process group */ mutex_spin_enter(&tty_lock); ttyp = ss->s_ttyp; if (ttyp != NULL && ttyp->t_pgrp == pg) { ttyp->t_pgrp = NULL; KASSERT(ttyp->t_session == ss); } mutex_spin_exit(&tty_lock); /* * The leading process group in a session is freed by proc_sessrele(), * if last reference. It will also release the locks. */ pg = (ss->s_sid != pg->pg_id) ? pg_remove(pg_id) : NULL; proc_sessrele(ss); if (pg != NULL) { /* Free it, if was not done above. */ kmem_free(pg, sizeof(struct pgrp)); } } /* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" * the group for terminal job control (those with a parent in a different * process group of the same session). If that count reaches zero, the * process group becomes orphaned. Check both the specified process' * process group and that of its children. * entering == 0 => p is leaving specified group. * entering == 1 => p is entering specified group. * * Call with proc_lock held. */ void fixjobc(struct proc *p, struct pgrp *pgrp, int entering) { struct pgrp *hispgrp; struct session *mysession = pgrp->pg_session; struct proc *child; KASSERT(mutex_owned(&proc_lock)); /* * Check p's parent to see whether p qualifies its own process * group; if so, adjust count for p's process group. 
*/ hispgrp = p->p_pptr->p_pgrp; if (hispgrp != pgrp && hispgrp->pg_session == mysession) { if (entering) { pgrp->pg_jobc++; p->p_lflag &= ~PL_ORPHANPG; } else { /* KASSERT(pgrp->pg_jobc > 0); */ if (--pgrp->pg_jobc == 0) orphanpg(pgrp); } } /* * Check this process' children to see whether they qualify * their process groups; if so, adjust counts for children's * process groups. */ LIST_FOREACH(child, &p->p_children, p_sibling) { hispgrp = child->p_pgrp; if (hispgrp != pgrp && hispgrp->pg_session == mysession && !P_ZOMBIE(child)) { if (entering) { child->p_lflag &= ~PL_ORPHANPG; hispgrp->pg_jobc++; } else { KASSERT(hispgrp->pg_jobc > 0); if (--hispgrp->pg_jobc == 0) orphanpg(hispgrp); } } } } /* * A process group has become orphaned; * if there are any stopped processes in the group, * hang up all processes in that group. * * Call with proc_lock held. */ static void orphanpg(struct pgrp *pg) { struct proc *p; KASSERT(mutex_owned(&proc_lock)); LIST_FOREACH(p, &pg->pg_members, p_pglist) { if (p->p_stat == SSTOP) { p->p_lflag |= PL_ORPHANPG; psignal(p, SIGHUP); psignal(p, SIGCONT); } } } #ifdef DDB #include <ddb/db_output.h> void pidtbl_dump(void); void pidtbl_dump(void) { struct pid_table *pt; struct proc *p; struct pgrp *pgrp; uintptr_t slot; int id; db_printf("pid table %p size %x, next %x, last %x\n", pid_table, pid_tbl_mask+1, next_free_pt, last_free_pt); for (pt = pid_table, id = 0; id <= pid_tbl_mask; id++, pt++) { slot = pt->pt_slot; if (!PT_VALID(slot) && !pt->pt_pgrp) continue; if (PT_IS_LWP(slot)) { p = PT_GET_LWP(slot)->l_proc; } else if (PT_IS_PROC(slot)) { p = PT_GET_PROC(slot); } else { p = NULL; } db_printf(" id %x: ", id); if (p != NULL) db_printf("slotpid %d proc %p id %d (0x%x) %s\n", pt->pt_pid, p, p->p_pid, p->p_pid, p->p_comm); else db_printf("next %x use %x\n", PT_NEXT(slot) & pid_tbl_mask, PT_NEXT(slot) & ~pid_tbl_mask); if ((pgrp = pt->pt_pgrp)) { db_printf("\tsession %p, sid %d, count %d, login %s\n", pgrp->pg_session, pgrp->pg_session->s_sid, pgrp->pg_session->s_count, pgrp->pg_session->s_login); db_printf("\tpgrp %p, pg_id %d, pg_jobc %d, members %p\n", pgrp, pgrp->pg_id, pgrp->pg_jobc, LIST_FIRST(&pgrp->pg_members)); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { db_printf("\t\tpid %d addr %p pgrp %p %s\n", p->p_pid, p, p->p_pgrp, p->p_comm); } } } } #endif /* DDB */ #ifdef KSTACK_CHECK_MAGIC #define KSTACK_MAGIC 0xdeadbeaf /* XXX should be per process basis? */ static int kstackleftmin = KSTACK_SIZE; static int kstackleftthres = KSTACK_SIZE / 8; void kstack_setup_magic(const struct lwp *l) { uint32_t *ip; uint32_t const *end; KASSERT(l != NULL); KASSERT(l != &lwp0); /* * fill the whole stack with the magic number * so that later modifications to it can be detected. */ ip = (uint32_t *)KSTACK_LOWEST_ADDR(l); end = (uint32_t *)((char *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE); for (; ip < end; ip++) { *ip = KSTACK_MAGIC; } } void kstack_check_magic(const struct lwp *l) { uint32_t const *ip, *end; int stackleft; KASSERT(l != NULL); /* don't check proc0 */ /*XXX*/ if (l == &lwp0) return; #ifdef __MACHINE_STACK_GROWS_UP /* stack grows upwards (eg. hppa) */ ip = (uint32_t *)((void *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE); end = (uint32_t *)KSTACK_LOWEST_ADDR(l); for (ip--; ip >= end; ip--) if (*ip != KSTACK_MAGIC) break; stackleft = (void *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE - (void *)ip; #else /* __MACHINE_STACK_GROWS_UP */ /* stack grows downwards (eg.
i386) */ ip = (uint32_t *)KSTACK_LOWEST_ADDR(l); end = (uint32_t *)((char *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE); for (; ip < end; ip++) if (*ip != KSTACK_MAGIC) break; stackleft = ((const char *)ip) - (const char *)KSTACK_LOWEST_ADDR(l); #endif /* __MACHINE_STACK_GROWS_UP */ if (kstackleftmin > stackleft) { kstackleftmin = stackleft; if (stackleft < kstackleftthres) printf("warning: kernel stack left %d bytes" "(pid %u:lid %u)\n", stackleft, (u_int)l->l_proc->p_pid, (u_int)l->l_lid); } if (stackleft <= 0) { panic("magic on the top of kernel stack changed for " "pid %u, lid %u: maybe kernel stack overflow", (u_int)l->l_proc->p_pid, (u_int)l->l_lid); } } #endif /* KSTACK_CHECK_MAGIC */ int proclist_foreach_call(struct proclist *list, int (*callback)(struct proc *, void *arg), void *arg) { struct proc marker; struct proc *p; int ret = 0; marker.p_flag = PK_MARKER; mutex_enter(&proc_lock); for (p = LIST_FIRST(list); ret == 0 && p != NULL;) { if (p->p_flag & PK_MARKER) { p = LIST_NEXT(p, p_list); continue; } LIST_INSERT_AFTER(p, &marker, p_list); ret = (*callback)(p, arg); KASSERT(mutex_owned(&proc_lock)); p = LIST_NEXT(&marker, p_list); LIST_REMOVE(&marker, p_list); } mutex_exit(&proc_lock); return ret; } int proc_vmspace_getref(struct proc *p, struct vmspace **vm) { /* XXXCDC: how should locking work here? */ /* curproc exception is for coredump. */ if ((p != curproc && (p->p_sflag & PS_WEXIT) != 0) || (p->p_vmspace->vm_refcnt < 1)) { return EFAULT; } uvmspace_addref(p->p_vmspace); *vm = p->p_vmspace; return 0; } /* * Acquire a write lock on the process credential. */ void proc_crmod_enter(void) { struct lwp *l = curlwp; struct proc *p = l->l_proc; kauth_cred_t oc; /* Reset what needs to be reset in plimit. */ if (p->p_limit->pl_corename != defcorename) { lim_setcorename(p, defcorename, 0); } mutex_enter(p->p_lock); /* Ensure the LWP cached credentials are up to date. */ if ((oc = l->l_cred) != p->p_cred) { l->l_cred = kauth_cred_hold(p->p_cred); kauth_cred_free(oc); } } /* * Set in a new process credential, and drop the write lock. The credential * must have a reference already. Optionally, free a no-longer required * credential. */ void proc_crmod_leave(kauth_cred_t scred, kauth_cred_t fcred, bool sugid) { struct lwp *l = curlwp, *l2; struct proc *p = l->l_proc; kauth_cred_t oc; KASSERT(mutex_owned(p->p_lock)); /* Is there a new credential to set in? */ if (scred != NULL) { p->p_cred = scred; LIST_FOREACH(l2, &p->p_lwps, l_sibling) { if (l2 != l) { lwp_lock(l2); l2->l_flag |= LW_CACHECRED; lwp_need_userret(l2); lwp_unlock(l2); } } /* Ensure the LWP cached credentials are up to date. */ if ((oc = l->l_cred) != scred) { l->l_cred = kauth_cred_hold(scred); } } else oc = NULL; /* XXXgcc */ if (sugid) { /* * Mark process as having changed credentials, stops * tracing etc. */ p->p_flag |= PK_SUGID; } mutex_exit(p->p_lock); /* If there is a credential to be released, free it now. */ if (fcred != NULL) { KASSERT(scred != NULL); kauth_cred_free(fcred); if (oc != scred) kauth_cred_free(oc); } } /* * proc_specific_key_create -- * Create a key for subsystem proc-specific data. */ int proc_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor) { return (specificdata_key_create(proc_specificdata_domain, keyp, dtor)); } /* * proc_specific_key_delete -- * Delete a key for subsystem proc-specific data. */ void proc_specific_key_delete(specificdata_key_t key) { specificdata_key_delete(proc_specificdata_domain, key); } /* * proc_initspecific -- * Initialize a proc's specificdata container. 
*/ void proc_initspecific(struct proc *p) { int error __diagused; error = specificdata_init(proc_specificdata_domain, &p->p_specdataref); KASSERT(error == 0); } /* * proc_finispecific -- * Finalize a proc's specificdata container. */ void proc_finispecific(struct proc *p) { specificdata_fini(proc_specificdata_domain, &p->p_specdataref); } /* * proc_getspecific -- * Return proc-specific data corresponding to the specified key. */ void * proc_getspecific(struct proc *p, specificdata_key_t key) { return (specificdata_getspecific(proc_specificdata_domain, &p->p_specdataref, key)); } /* * proc_setspecific -- * Set proc-specific data corresponding to the specified key. */ void proc_setspecific(struct proc *p, specificdata_key_t key, void *data) { specificdata_setspecific(proc_specificdata_domain, &p->p_specdataref, key, data); } int proc_uidmatch(kauth_cred_t cred, kauth_cred_t target) { int r = 0; if (kauth_cred_getuid(cred) != kauth_cred_getuid(target) || kauth_cred_getuid(cred) != kauth_cred_getsvuid(target)) { /* * suid proc of ours or proc not ours */ r = EPERM; } else if (kauth_cred_getgid(target) != kauth_cred_getsvgid(target)) { /* * sgid proc has sgid back to us temporarily */ r = EPERM; } else { /* * our rgid must be in target's group list (ie, * sub-processes started by a sgid process) */ int ismember = 0; if (kauth_cred_ismember_gid(cred, kauth_cred_getgid(target), &ismember) != 0 || !ismember) r = EPERM; } return (r); } /* * sysctl stuff */ #define KERN_PROCSLOP (5 * sizeof(struct kinfo_proc)) static const u_int sysctl_flagmap[] = { PK_ADVLOCK, P_ADVLOCK, PK_EXEC, P_EXEC, PK_NOCLDWAIT, P_NOCLDWAIT, PK_32, P_32, PK_CLDSIGIGN, P_CLDSIGIGN, PK_SUGID, P_SUGID, 0 }; static const u_int sysctl_sflagmap[] = { PS_NOCLDSTOP, P_NOCLDSTOP, PS_WEXIT, P_WEXIT, PS_STOPFORK, P_STOPFORK, PS_STOPEXEC, P_STOPEXEC, PS_STOPEXIT, P_STOPEXIT, 0 }; static const u_int sysctl_slflagmap[] = { PSL_TRACED, P_TRACED, PSL_CHTRACED, P_CHTRACED, PSL_SYSCALL, P_SYSCALL, 0 }; static const u_int sysctl_lflagmap[] = { PL_CONTROLT, P_CONTROLT, PL_PPWAIT, P_PPWAIT, 0 }; static const u_int sysctl_stflagmap[] = { PST_PROFIL, P_PROFIL, 0 }; /* used by kern_lwp also */ const u_int sysctl_lwpflagmap[] = { LW_SINTR, L_SINTR, LW_SYSTEM, L_SYSTEM, 0 }; /* * Find the most ``active'' lwp of a process and return it for ps display * purposes */ static struct lwp * proc_active_lwp(struct proc *p) { static const int ostat[] = { 0, 2, /* LSIDL */ 6, /* LSRUN */ 5, /* LSSLEEP */ 4, /* LSSTOP */ 0, /* LSZOMB */ 1, /* LSDEAD */ 7, /* LSONPROC */ 3 /* LSSUSPENDED */ }; struct lwp *l, *lp = NULL; LIST_FOREACH(l, &p->p_lwps, l_sibling) { KASSERT(l->l_stat >= 0); KASSERT(l->l_stat < __arraycount(ostat)); if (lp == NULL || ostat[l->l_stat] > ostat[lp->l_stat] || (ostat[l->l_stat] == ostat[lp->l_stat] && l->l_cpticks > lp->l_cpticks)) { lp = l; continue; } } return lp; } static int sysctl_doeproc(SYSCTLFN_ARGS) { union { struct kinfo_proc kproc; struct kinfo_proc2 kproc2; } *kbuf; struct proc *p, *next, *marker; char *where, *dp; int type, op, arg, error; u_int elem_size, kelem_size, elem_count; size_t buflen, needed; bool match, zombie, mmmbrains; const bool allowaddr = get_expose_address(curproc); if (namelen == 1 && name[0] == CTL_QUERY) return (sysctl_query(SYSCTLFN_CALL(rnode))); dp = where = oldp; buflen = where != NULL ? 
*oldlenp : 0; error = 0; needed = 0; type = rnode->sysctl_num; if (type == KERN_PROC) { if (namelen == 0) return EINVAL; switch (op = name[0]) { case KERN_PROC_ALL: if (namelen != 1) return EINVAL; arg = 0; break; default: if (namelen != 2) return EINVAL; arg = name[1]; break; } elem_count = 0; /* Hush little compiler, don't you cry */ kelem_size = elem_size = sizeof(kbuf->kproc); } else { if (namelen != 4) return EINVAL; op = name[0]; arg = name[1]; elem_size = name[2]; elem_count = name[3]; kelem_size = sizeof(kbuf->kproc2); } sysctl_unlock(); kbuf = kmem_zalloc(sizeof(*kbuf), KM_SLEEP); marker = kmem_alloc(sizeof(*marker), KM_SLEEP); marker->p_flag = PK_MARKER; mutex_enter(&proc_lock); /* * Start with zombies to prevent reporting processes twice, in case they * are dying and being moved from the list of alive processes to zombies. */ mmmbrains = true; for (p = LIST_FIRST(&zombproc);; p = next) { if (p == NULL) { if (mmmbrains) { p = LIST_FIRST(&allproc); mmmbrains = false; } if (p == NULL) break; } next = LIST_NEXT(p, p_list); if ((p->p_flag & PK_MARKER) != 0) continue; /* * Skip embryonic processes. */ if (p->p_stat == SIDL) continue; mutex_enter(p->p_lock); error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_EPROC), NULL, NULL); if (error != 0) { mutex_exit(p->p_lock); continue; } /* * Handling all the operations in one switch, at the cost of * algorithm complexity, is deliberate. Splitting this function * into several similar copies would increase the maintenance * burden and code size, and the gain would be negligible on * practical systems. */ switch (op) { case KERN_PROC_PID: match = (p->p_pid == (pid_t)arg); break; case KERN_PROC_PGRP: match = (p->p_pgrp->pg_id == (pid_t)arg); break; case KERN_PROC_SESSION: match = (p->p_session->s_sid == (pid_t)arg); break; case KERN_PROC_TTY: match = true; if (arg == (int) KERN_PROC_TTY_REVOKE) { if ((p->p_lflag & PL_CONTROLT) == 0 || p->p_session->s_ttyp == NULL || p->p_session->s_ttyvp != NULL) { match = false; } } else if ((p->p_lflag & PL_CONTROLT) == 0 || p->p_session->s_ttyp == NULL) { if ((dev_t)arg != KERN_PROC_TTY_NODEV) { match = false; } } else if (p->p_session->s_ttyp->t_dev != (dev_t)arg) { match = false; } break; case KERN_PROC_UID: match = (kauth_cred_geteuid(p->p_cred) == (uid_t)arg); break; case KERN_PROC_RUID: match = (kauth_cred_getuid(p->p_cred) == (uid_t)arg); break; case KERN_PROC_GID: match = (kauth_cred_getegid(p->p_cred) == (uid_t)arg); break; case KERN_PROC_RGID: match = (kauth_cred_getgid(p->p_cred) == (uid_t)arg); break; case KERN_PROC_ALL: match = true; /* allow everything */ break; default: error = EINVAL; mutex_exit(p->p_lock); goto cleanup; } if (!match) { mutex_exit(p->p_lock); continue; } /* * Grab a hold on the process.
*/ if (mmmbrains) { zombie = true; } else { zombie = !rw_tryenter(&p->p_reflock, RW_READER); } if (zombie) { LIST_INSERT_AFTER(p, marker, p_list); } if (buflen >= elem_size && (type == KERN_PROC || elem_count > 0)) { ruspace(p); /* Update process vm resource use */ if (type == KERN_PROC) { fill_proc(p, &kbuf->kproc.kp_proc, allowaddr); fill_eproc(p, &kbuf->kproc.kp_eproc, zombie, allowaddr); } else { fill_kproc2(p, &kbuf->kproc2, zombie, allowaddr); elem_count--; } mutex_exit(p->p_lock); mutex_exit(&proc_lock); /* * Copy out elem_size, but not larger than kelem_size */ error = sysctl_copyout(l, kbuf, dp, uimin(kelem_size, elem_size)); mutex_enter(&proc_lock); if (error) { goto bah; } dp += elem_size; buflen -= elem_size; } else { mutex_exit(p->p_lock); } needed += elem_size; /* * Release reference to process. */ if (zombie) { next = LIST_NEXT(marker, p_list); LIST_REMOVE(marker, p_list); } else { rw_exit(&p->p_reflock); next = LIST_NEXT(p, p_list); } /* * Short-circuit break quickly! */ if (op == KERN_PROC_PID) break; } mutex_exit(&proc_lock); if (where != NULL) { *oldlenp = dp - where; if (needed > *oldlenp) { error = ENOMEM; goto out; } } else { needed += KERN_PROCSLOP; *oldlenp = needed; } kmem_free(kbuf, sizeof(*kbuf)); kmem_free(marker, sizeof(*marker)); sysctl_relock(); return 0; bah: if (zombie) LIST_REMOVE(marker, p_list); else rw_exit(&p->p_reflock); cleanup: mutex_exit(&proc_lock); out: kmem_free(kbuf, sizeof(*kbuf)); kmem_free(marker, sizeof(*marker)); sysctl_relock(); return error; } int copyin_psstrings(struct proc *p, struct ps_strings *arginfo) { #if !defined(_RUMPKERNEL) int retval; if (p->p_flag & PK_32) { MODULE_HOOK_CALL(kern_proc32_copyin_hook, (p, arginfo), enosys(), retval); return retval; } #endif /* !defined(_RUMPKERNEL) */ return copyin_proc(p, (void *)p->p_psstrp, arginfo, sizeof(*arginfo)); } static int copy_procargs_sysctl_cb(void *cookie_, const void *src, size_t off, size_t len) { void **cookie = cookie_; struct lwp *l = cookie[0]; char *dst = cookie[1]; return sysctl_copyout(l, src, dst + off, len); } /* * sysctl helper routine for kern.proc_args pseudo-subtree. */ static int sysctl_kern_proc_args(SYSCTLFN_ARGS) { struct ps_strings pss; struct proc *p; pid_t pid; int type, error; void *cookie[2]; if (namelen == 1 && name[0] == CTL_QUERY) return (sysctl_query(SYSCTLFN_CALL(rnode))); if (newp != NULL || namelen != 2) return (EINVAL); pid = name[0]; type = name[1]; switch (type) { case KERN_PROC_PATHNAME: sysctl_unlock(); error = fill_pathname(l, pid, oldp, oldlenp); sysctl_relock(); return error; case KERN_PROC_CWD: sysctl_unlock(); error = fill_cwd(l, pid, oldp, oldlenp); sysctl_relock(); return error; case KERN_PROC_ARGV: case KERN_PROC_NARGV: case KERN_PROC_ENV: case KERN_PROC_NENV: /* ok */ break; default: return (EINVAL); } sysctl_unlock(); /* check pid */ mutex_enter(&proc_lock); if ((p = proc_find(pid)) == NULL) { error = EINVAL; goto out_locked; } mutex_enter(p->p_lock); /* Check permission. 
*/ if (type == KERN_PROC_ARGV || type == KERN_PROC_NARGV) error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ARGS), NULL, NULL); else if (type == KERN_PROC_ENV || type == KERN_PROC_NENV) error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENV), NULL, NULL); else error = EINVAL; /* XXXGCC */ if (error) { mutex_exit(p->p_lock); goto out_locked; } if (oldp == NULL) { if (type == KERN_PROC_NARGV || type == KERN_PROC_NENV) *oldlenp = sizeof (int); else *oldlenp = ARG_MAX; /* XXX XXX XXX */ error = 0; mutex_exit(p->p_lock); goto out_locked; } /* * Zombies don't have a stack, so we can't read their psstrings. * System processes also don't have a user stack. */ if (P_ZOMBIE(p) || (p->p_flag & PK_SYSTEM) != 0) { error = EINVAL; mutex_exit(p->p_lock); goto out_locked; } error = rw_tryenter(&p->p_reflock, RW_READER) ? 0 : EBUSY; mutex_exit(p->p_lock); if (error) { goto out_locked; } mutex_exit(&proc_lock); if (type == KERN_PROC_NARGV || type == KERN_PROC_NENV) { int value; if ((error = copyin_psstrings(p, &pss)) == 0) { if (type == KERN_PROC_NARGV) value = pss.ps_nargvstr; else value = pss.ps_nenvstr; error = sysctl_copyout(l, &value, oldp, sizeof(value)); *oldlenp = sizeof(value); } } else { cookie[0] = l; cookie[1] = oldp; error = copy_procargs(p, type, oldlenp, copy_procargs_sysctl_cb, cookie); } rw_exit(&p->p_reflock); sysctl_relock(); return error; out_locked: mutex_exit(&proc_lock); sysctl_relock(); return error; } int copy_procargs(struct proc *p, int oid, size_t *limit, int (*cb)(void *, const void *, size_t, size_t), void *cookie) { struct ps_strings pss; size_t len, i, loaded, entry_len; struct uio auio; struct iovec aiov; int error, argvlen; char *arg; char **argv; vaddr_t user_argv; struct vmspace *vmspace; /* * Allocate a temporary buffer to hold the argument vector and * the arguments themselve. */ arg = kmem_alloc(PAGE_SIZE, KM_SLEEP); argv = kmem_alloc(PAGE_SIZE, KM_SLEEP); /* * Lock the process down in memory. */ vmspace = p->p_vmspace; uvmspace_addref(vmspace); /* * Read in the ps_strings structure. */ if ((error = copyin_psstrings(p, &pss)) != 0) goto done; /* * Now read the address of the argument vector. */ switch (oid) { case KERN_PROC_ARGV: user_argv = (uintptr_t)pss.ps_argvstr; argvlen = pss.ps_nargvstr; break; case KERN_PROC_ENV: user_argv = (uintptr_t)pss.ps_envstr; argvlen = pss.ps_nenvstr; break; default: error = EINVAL; goto done; } if (argvlen < 0) { error = EIO; goto done; } /* * Now copy each string. */ len = 0; /* bytes written to user buffer */ loaded = 0; /* bytes from argv already processed */ i = 0; /* To make compiler happy */ entry_len = PROC_PTRSZ(p); for (; argvlen; --argvlen) { int finished = 0; vaddr_t base; size_t xlen; int j; if (loaded == 0) { size_t rem = entry_len * argvlen; loaded = MIN(rem, PAGE_SIZE); error = copyin_vmspace(vmspace, (const void *)user_argv, argv, loaded); if (error) break; user_argv += loaded; i = 0; } #if !defined(_RUMPKERNEL) if (p->p_flag & PK_32) MODULE_HOOK_CALL(kern_proc32_base_hook, (argv, i++), 0, base); else #endif /* !defined(_RUMPKERNEL) */ base = (vaddr_t)argv[i++]; loaded -= entry_len; /* * The program has messed around with its arguments, * possibly deleting some, and replacing them with * NULL's. Treat this as the last argument and not * a failure. 
*/ if (base == 0) break; while (!finished) { xlen = PAGE_SIZE - (base & PAGE_MASK); aiov.iov_base = arg; aiov.iov_len = PAGE_SIZE; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = base; auio.uio_resid = xlen; auio.uio_rw = UIO_READ; UIO_SETUP_SYSSPACE(&auio); error = uvm_io(&vmspace->vm_map, &auio, 0); if (error) goto done; /* Look for the end of the string */ for (j = 0; j < xlen; j++) { if (arg[j] == '\0') { xlen = j + 1; finished = 1; break; } } /* Check for user buffer overflow */ if (len + xlen > *limit) { finished = 1; if (len > *limit) xlen = 0; else xlen = *limit - len; } /* Copyout the page */ error = (*cb)(cookie, arg, len, xlen); if (error) goto done; len += xlen; base += xlen; } } *limit = len; done: kmem_free(argv, PAGE_SIZE); kmem_free(arg, PAGE_SIZE); uvmspace_free(vmspace); return error; } /* * Fill in a proc structure for the specified process. */ static void fill_proc(const struct proc *psrc, struct proc *p, bool allowaddr) { COND_SET_STRUCT(p->p_list, psrc->p_list, allowaddr); memset(&p->p_auxlock, 0, sizeof(p->p_auxlock)); COND_SET_STRUCT(p->p_lock, psrc->p_lock, allowaddr); memset(&p->p_stmutex, 0, sizeof(p->p_stmutex)); memset(&p->p_reflock, 0, sizeof(p->p_reflock)); COND_SET_STRUCT(p->p_waitcv, psrc->p_waitcv, allowaddr); COND_SET_STRUCT(p->p_lwpcv, psrc->p_lwpcv, allowaddr); COND_SET_PTR(p->p_cred, psrc->p_cred, allowaddr); COND_SET_PTR(p->p_fd, psrc->p_fd, allowaddr); COND_SET_PTR(p->p_cwdi, psrc->p_cwdi, allowaddr); COND_SET_PTR(p->p_stats, psrc->p_stats, allowaddr); COND_SET_PTR(p->p_limit, psrc->p_limit, allowaddr); COND_SET_PTR(p->p_vmspace, psrc->p_vmspace, allowaddr); COND_SET_PTR(p->p_sigacts, psrc->p_sigacts, allowaddr); COND_SET_PTR(p->p_aio, psrc->p_aio, allowaddr); p->p_mqueue_cnt = psrc->p_mqueue_cnt; memset(&p->p_specdataref, 0, sizeof(p->p_specdataref)); p->p_exitsig = psrc->p_exitsig; p->p_flag = psrc->p_flag; p->p_sflag = psrc->p_sflag; p->p_slflag = psrc->p_slflag; p->p_lflag = psrc->p_lflag; p->p_stflag = psrc->p_stflag; p->p_stat = psrc->p_stat; p->p_trace_enabled = psrc->p_trace_enabled; p->p_pid = psrc->p_pid; COND_SET_STRUCT(p->p_pglist, psrc->p_pglist, allowaddr); COND_SET_PTR(p->p_pptr, psrc->p_pptr, allowaddr); COND_SET_STRUCT(p->p_sibling, psrc->p_sibling, allowaddr); COND_SET_STRUCT(p->p_children, psrc->p_children, allowaddr); COND_SET_STRUCT(p->p_lwps, psrc->p_lwps, allowaddr); COND_SET_PTR(p->p_raslist, psrc->p_raslist, allowaddr); p->p_nlwps = psrc->p_nlwps; p->p_nzlwps = psrc->p_nzlwps; p->p_nrlwps = psrc->p_nrlwps; p->p_nlwpwait = psrc->p_nlwpwait; p->p_ndlwps = psrc->p_ndlwps; p->p_nstopchild = psrc->p_nstopchild; p->p_waited = psrc->p_waited; COND_SET_PTR(p->p_zomblwp, psrc->p_zomblwp, allowaddr); COND_SET_PTR(p->p_vforklwp, psrc->p_vforklwp, allowaddr); COND_SET_PTR(p->p_sched_info, psrc->p_sched_info, allowaddr); p->p_estcpu = psrc->p_estcpu; p->p_estcpu_inherited = psrc->p_estcpu_inherited; p->p_forktime = psrc->p_forktime; p->p_pctcpu = psrc->p_pctcpu; COND_SET_PTR(p->p_opptr, psrc->p_opptr, allowaddr); COND_SET_PTR(p->p_timers, psrc->p_timers, allowaddr); p->p_rtime = psrc->p_rtime; p->p_uticks = psrc->p_uticks; p->p_sticks = psrc->p_sticks; p->p_iticks = psrc->p_iticks; p->p_xutime = psrc->p_xutime; p->p_xstime = psrc->p_xstime; p->p_traceflag = psrc->p_traceflag; COND_SET_PTR(p->p_tracep, psrc->p_tracep, allowaddr); COND_SET_PTR(p->p_textvp, psrc->p_textvp, allowaddr); COND_SET_PTR(p->p_emul, psrc->p_emul, allowaddr); COND_SET_PTR(p->p_emuldata, psrc->p_emuldata, allowaddr); COND_SET_CPTR(p->p_execsw, 
psrc->p_execsw, allowaddr); COND_SET_STRUCT(p->p_klist, psrc->p_klist, allowaddr); COND_SET_STRUCT(p->p_sigwaiters, psrc->p_sigwaiters, allowaddr); COND_SET_STRUCT(p->p_sigpend.sp_info, psrc->p_sigpend.sp_info, allowaddr); p->p_sigpend.sp_set = psrc->p_sigpend.sp_set; COND_SET_PTR(p->p_lwpctl, psrc->p_lwpctl, allowaddr); p->p_ppid = psrc->p_ppid; p->p_oppid = psrc->p_oppid; COND_SET_PTR(p->p_path, psrc->p_path, allowaddr); p->p_sigctx = psrc->p_sigctx; p->p_nice = psrc->p_nice; memcpy(p->p_comm, psrc->p_comm, sizeof(p->p_comm)); COND_SET_PTR(p->p_pgrp, psrc->p_pgrp, allowaddr); COND_SET_VALUE(p->p_psstrp, psrc->p_psstrp, allowaddr); p->p_pax = psrc->p_pax; p->p_xexit = psrc->p_xexit; p->p_xsig = psrc->p_xsig; p->p_acflag = psrc->p_acflag; COND_SET_STRUCT(p->p_md, psrc->p_md, allowaddr); p->p_stackbase = psrc->p_stackbase; COND_SET_PTR(p->p_dtrace, psrc->p_dtrace, allowaddr); } /* * Fill in an eproc structure for the specified process. */ void fill_eproc(struct proc *p, struct eproc *ep, bool zombie, bool allowaddr) { struct tty *tp; struct lwp *l; KASSERT(mutex_owned(&proc_lock)); KASSERT(mutex_owned(p->p_lock)); COND_SET_PTR(ep->e_paddr, p, allowaddr); COND_SET_PTR(ep->e_sess, p->p_session, allowaddr); if (p->p_cred) { kauth_cred_topcred(p->p_cred, &ep->e_pcred); kauth_cred_toucred(p->p_cred, &ep->e_ucred); } if (p->p_stat != SIDL && !P_ZOMBIE(p) && !zombie) { struct vmspace *vm = p->p_vmspace; ep->e_vm.vm_rssize = vm_resident_count(vm); ep->e_vm.vm_tsize = vm->vm_tsize; ep->e_vm.vm_dsize = vm->vm_dsize; ep->e_vm.vm_ssize = vm->vm_ssize; ep->e_vm.vm_map.size = vm->vm_map.size; /* Pick the primary (first) LWP */ l = proc_active_lwp(p); KASSERT(l != NULL); lwp_lock(l); if (l->l_wchan) strncpy(ep->e_wmesg, l->l_wmesg, WMESGLEN); lwp_unlock(l); } ep->e_ppid = p->p_ppid; if (p->p_pgrp && p->p_session) { ep->e_pgid = p->p_pgrp->pg_id; ep->e_jobc = p->p_pgrp->pg_jobc; ep->e_sid = p->p_session->s_sid; if ((p->p_lflag & PL_CONTROLT) && (tp = p->p_session->s_ttyp)) { ep->e_tdev = tp->t_dev; ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID; COND_SET_PTR(ep->e_tsess, tp->t_session, allowaddr); } else ep->e_tdev = (uint32_t)NODEV; ep->e_flag = p->p_session->s_ttyvp ? EPROC_CTTY : 0; if (SESS_LEADER(p)) ep->e_flag |= EPROC_SLEADER; strncpy(ep->e_login, p->p_session->s_login, MAXLOGNAME); } ep->e_xsize = ep->e_xrssize = 0; ep->e_xccount = ep->e_xswrss = 0; } /* * Fill in a kinfo_proc2 structure for the specified process. 
*/ void fill_kproc2(struct proc *p, struct kinfo_proc2 *ki, bool zombie, bool allowaddr) { struct tty *tp; struct lwp *l; struct timeval ut, st, rt; sigset_t ss1, ss2; struct rusage ru; struct vmspace *vm; KASSERT(mutex_owned(&proc_lock)); KASSERT(mutex_owned(p->p_lock)); sigemptyset(&ss1); sigemptyset(&ss2); COND_SET_VALUE(ki->p_paddr, PTRTOUINT64(p), allowaddr); COND_SET_VALUE(ki->p_fd, PTRTOUINT64(p->p_fd), allowaddr); COND_SET_VALUE(ki->p_cwdi, PTRTOUINT64(p->p_cwdi), allowaddr); COND_SET_VALUE(ki->p_stats, PTRTOUINT64(p->p_stats), allowaddr); COND_SET_VALUE(ki->p_limit, PTRTOUINT64(p->p_limit), allowaddr); COND_SET_VALUE(ki->p_vmspace, PTRTOUINT64(p->p_vmspace), allowaddr); COND_SET_VALUE(ki->p_sigacts, PTRTOUINT64(p->p_sigacts), allowaddr); COND_SET_VALUE(ki->p_sess, PTRTOUINT64(p->p_session), allowaddr); ki->p_tsess = 0; /* may be changed if controlling tty below */ COND_SET_VALUE(ki->p_ru, PTRTOUINT64(&p->p_stats->p_ru), allowaddr); ki->p_eflag = 0; ki->p_exitsig = p->p_exitsig; ki->p_flag = L_INMEM; /* Process never swapped out */ ki->p_flag |= sysctl_map_flags(sysctl_flagmap, p->p_flag); ki->p_flag |= sysctl_map_flags(sysctl_sflagmap, p->p_sflag); ki->p_flag |= sysctl_map_flags(sysctl_slflagmap, p->p_slflag); ki->p_flag |= sysctl_map_flags(sysctl_lflagmap, p->p_lflag); ki->p_flag |= sysctl_map_flags(sysctl_stflagmap, p->p_stflag); ki->p_pid = p->p_pid; ki->p_ppid = p->p_ppid; ki->p_uid = kauth_cred_geteuid(p->p_cred); ki->p_ruid = kauth_cred_getuid(p->p_cred); ki->p_gid = kauth_cred_getegid(p->p_cred); ki->p_rgid = kauth_cred_getgid(p->p_cred); ki->p_svuid = kauth_cred_getsvuid(p->p_cred); ki->p_svgid = kauth_cred_getsvgid(p->p_cred); ki->p_ngroups = kauth_cred_ngroups(p->p_cred); kauth_cred_getgroups(p->p_cred, ki->p_groups, uimin(ki->p_ngroups, sizeof(ki->p_groups) / sizeof(ki->p_groups[0])), UIO_SYSSPACE); ki->p_uticks = p->p_uticks; ki->p_sticks = p->p_sticks; ki->p_iticks = p->p_iticks; ki->p_tpgid = NO_PGID; /* may be changed if controlling tty below */ COND_SET_VALUE(ki->p_tracep, PTRTOUINT64(p->p_tracep), allowaddr); ki->p_traceflag = p->p_traceflag; memcpy(&ki->p_sigignore, &p->p_sigctx.ps_sigignore,sizeof(ki_sigset_t)); memcpy(&ki->p_sigcatch, &p->p_sigctx.ps_sigcatch, sizeof(ki_sigset_t)); ki->p_cpticks = 0; ki->p_pctcpu = p->p_pctcpu; ki->p_estcpu = 0; ki->p_stat = p->p_stat; /* Will likely be overridden by LWP status */ ki->p_realstat = p->p_stat; ki->p_nice = p->p_nice; ki->p_xstat = P_WAITSTATUS(p); ki->p_acflag = p->p_acflag; strncpy(ki->p_comm, p->p_comm, uimin(sizeof(ki->p_comm), sizeof(p->p_comm))); strncpy(ki->p_ename, p->p_emul->e_name, sizeof(ki->p_ename)); ki->p_nlwps = p->p_nlwps; ki->p_realflag = ki->p_flag; if (p->p_stat != SIDL && !P_ZOMBIE(p) && !zombie) { vm = p->p_vmspace; ki->p_vm_rssize = vm_resident_count(vm); ki->p_vm_tsize = vm->vm_tsize; ki->p_vm_dsize = vm->vm_dsize; ki->p_vm_ssize = vm->vm_ssize; ki->p_vm_vsize = atop(vm->vm_map.size); /* * Since the stack is initially mapped mostly with * PROT_NONE and grown as needed, adjust the "mapped size" * to skip the unused stack portion. 
*/ ki->p_vm_msize = atop(vm->vm_map.size) - vm->vm_issize + vm->vm_ssize; /* Pick the primary (first) LWP */ l = proc_active_lwp(p); KASSERT(l != NULL); lwp_lock(l); ki->p_nrlwps = p->p_nrlwps; ki->p_forw = 0; ki->p_back = 0; COND_SET_VALUE(ki->p_addr, PTRTOUINT64(l->l_addr), allowaddr); ki->p_stat = l->l_stat; ki->p_flag |= sysctl_map_flags(sysctl_lwpflagmap, l->l_flag); ki->p_swtime = l->l_swtime; ki->p_slptime = l->l_slptime; if (l->l_stat == LSONPROC) ki->p_schedflags = l->l_cpu->ci_schedstate.spc_flags; else ki->p_schedflags = 0; ki->p_priority = lwp_eprio(l); ki->p_usrpri = l->l_priority; if (l->l_wchan) strncpy(ki->p_wmesg, l->l_wmesg, sizeof(ki->p_wmesg)); COND_SET_VALUE(ki->p_wchan, PTRTOUINT64(l->l_wchan), allowaddr); ki->p_cpuid = cpu_index(l->l_cpu); lwp_unlock(l); LIST_FOREACH(l, &p->p_lwps, l_sibling) { /* This is hardly correct, but... */ sigplusset(&l->l_sigpend.sp_set, &ss1); sigplusset(&l->l_sigmask, &ss2); ki->p_cpticks += l->l_cpticks; ki->p_pctcpu += l->l_pctcpu; ki->p_estcpu += l->l_estcpu; } } sigplusset(&p->p_sigpend.sp_set, &ss1); memcpy(&ki->p_siglist, &ss1, sizeof(ki_sigset_t)); memcpy(&ki->p_sigmask, &ss2, sizeof(ki_sigset_t)); if (p->p_session != NULL) { ki->p_sid = p->p_session->s_sid; ki->p__pgid = p->p_pgrp->pg_id; if (p->p_session->s_ttyvp) ki->p_eflag |= EPROC_CTTY; if (SESS_LEADER(p)) ki->p_eflag |= EPROC_SLEADER; strncpy(ki->p_login, p->p_session->s_login, uimin(sizeof ki->p_login - 1, sizeof p->p_session->s_login)); ki->p_jobc = p->p_pgrp->pg_jobc; if ((p->p_lflag & PL_CONTROLT) && (tp = p->p_session->s_ttyp)) { ki->p_tdev = tp->t_dev; ki->p_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID; COND_SET_VALUE(ki->p_tsess, PTRTOUINT64(tp->t_session), allowaddr); } else { ki->p_tdev = (int32_t)NODEV; } } if (!P_ZOMBIE(p) && !zombie) { ki->p_uvalid = 1; ki->p_ustart_sec = p->p_stats->p_start.tv_sec; ki->p_ustart_usec = p->p_stats->p_start.tv_usec; calcru(p, &ut, &st, NULL, &rt); ki->p_rtime_sec = rt.tv_sec; ki->p_rtime_usec = rt.tv_usec; ki->p_uutime_sec = ut.tv_sec; ki->p_uutime_usec = ut.tv_usec; ki->p_ustime_sec = st.tv_sec; ki->p_ustime_usec = st.tv_usec; memcpy(&ru, &p->p_stats->p_ru, sizeof(ru)); rulwps(p, &ru); ki->p_uru_nvcsw = ru.ru_nvcsw; ki->p_uru_nivcsw = ru.ru_nivcsw; ki->p_uru_maxrss = ru.ru_maxrss; ki->p_uru_ixrss = ru.ru_ixrss; ki->p_uru_idrss = ru.ru_idrss; ki->p_uru_isrss = ru.ru_isrss; ki->p_uru_minflt = ru.ru_minflt; ki->p_uru_majflt = ru.ru_majflt; ki->p_uru_nswap = ru.ru_nswap; ki->p_uru_inblock = ru.ru_inblock; ki->p_uru_oublock = ru.ru_oublock; ki->p_uru_msgsnd = ru.ru_msgsnd; ki->p_uru_msgrcv = ru.ru_msgrcv; ki->p_uru_nsignals = ru.ru_nsignals; timeradd(&p->p_stats->p_cru.ru_utime, &p->p_stats->p_cru.ru_stime, &ut); ki->p_uctime_sec = ut.tv_sec; ki->p_uctime_usec = ut.tv_usec; } } int proc_find_locked(struct lwp *l, struct proc **p, pid_t pid) { int error; mutex_enter(&proc_lock); if (pid == -1) *p = l->l_proc; else *p = proc_find(pid); if (*p == NULL) { if (pid != -1) mutex_exit(&proc_lock); return ESRCH; } if (pid != -1) mutex_enter((*p)->p_lock); mutex_exit(&proc_lock); error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, *p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL); if (error) { if (pid != -1) mutex_exit((*p)->p_lock); } return error; } static int fill_pathname(struct lwp *l, pid_t pid, void *oldp, size_t *oldlenp) { int error; struct proc *p; if ((error = proc_find_locked(l, &p, pid)) != 0) return error; if (p->p_path == NULL) { if (pid != -1) mutex_exit(p->p_lock); return ENOENT; } size_t len = 
strlen(p->p_path) + 1; if (oldp != NULL) { size_t copylen = uimin(len, *oldlenp); error = sysctl_copyout(l, p->p_path, oldp, copylen); if (error == 0 && *oldlenp < len) error = ENOSPC; } *oldlenp = len; if (pid != -1) mutex_exit(p->p_lock); return error; } static int fill_cwd(struct lwp *l, pid_t pid, void *oldp, size_t *oldlenp) { int error; struct proc *p; char *path; char *bp, *bend; struct cwdinfo *cwdi; struct vnode *vp; size_t len, lenused; if ((error = proc_find_locked(l, &p, pid)) != 0) return error; len = MAXPATHLEN * 4; path = kmem_alloc(len, KM_SLEEP); bp = &path[len]; bend = bp; *(--bp) = '\0'; cwdi = p->p_cwdi; rw_enter(&cwdi->cwdi_lock, RW_READER); vp = cwdi->cwdi_cdir; error = getcwd_common(vp, NULL, &bp, path, len/2, 0, l); rw_exit(&cwdi->cwdi_lock); if (error) goto out; lenused = bend - bp; if (oldp != NULL) { size_t copylen = uimin(lenused, *oldlenp); error = sysctl_copyout(l, bp, oldp, copylen); if (error == 0 && *oldlenp < lenused) error = ENOSPC; } *oldlenp = lenused; out: if (pid != -1) mutex_exit(p->p_lock); kmem_free(path, len); return error; } int proc_getauxv(struct proc *p, void **buf, size_t *len) { struct ps_strings pss; int error; void *uauxv, *kauxv; size_t size; if ((error = copyin_psstrings(p, &pss)) != 0) return error; if (pss.ps_envstr == NULL) return EIO; size = p->p_execsw->es_arglen; if (size == 0) return EIO; size_t ptrsz = PROC_PTRSZ(p); uauxv = (void *)((char *)pss.ps_envstr + (pss.ps_nenvstr + 1) * ptrsz); kauxv = kmem_alloc(size, KM_SLEEP); error = copyin_proc(p, uauxv, kauxv, size); if (error) { kmem_free(kauxv, size); return error; } *buf = kauxv; *len = size; return 0; } static int sysctl_security_expose_address(SYSCTLFN_ARGS) { int expose_address, error; struct sysctlnode node; node = *rnode; node.sysctl_data = &expose_address; expose_address = *(int *)rnode->sysctl_data; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_KERNADDR, 0, NULL, NULL, NULL)) return EPERM; switch (expose_address) { case 0: case 1: case 2: break; default: return EINVAL; } *(int *)rnode->sysctl_data = expose_address; return 0; } bool get_expose_address(struct proc *p) { /* allow only if sysctl variable is set or privileged */ return kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_KPTR), NULL, NULL) == 0; }
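/*
 * Illustrative sketch, not part of the original kern_proc.c: one way a
 * userland program might consume the kern.proc_args handler implemented
 * above (sysctl_kern_proc_args) to fetch a process's executable path.
 * The MIB layout {CTL_KERN, KERN_PROC_ARGS, pid, KERN_PROC_PATHNAME}
 * mirrors the name[] checks in that handler; the helper name and the
 * minimal error handling are assumptions made for this example only.
 */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <unistd.h>

static int
print_proc_pathname(pid_t pid)
{
	char path[MAXPATHLEN];
	size_t len = sizeof(path);
	int mib[4] = { CTL_KERN, KERN_PROC_ARGS, (int)pid,
	    KERN_PROC_PATHNAME };

	/*
	 * fill_pathname() copies out at most *oldlenp bytes and reports
	 * the full length back through *oldlenp.
	 */
	if (sysctl(mib, __arraycount(mib), path, &len, NULL, 0) == -1)
		return -1;
	printf("pid %d: %s\n", (int)pid, path);
	return 0;
}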
/*- * Copyright (c) 2009-2019 The NetBSD Foundation, Inc. * All rights reserved.
* * This material is based upon work partially supported by The * NetBSD Foundation under a contract with Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * NPF tableset module. * * Notes * * The tableset is an array of tables. After the creation, the array * is immutable. The caller is responsible to synchronise the access * to the tableset. * * Warning (not applicable for the userspace npfkern): * * The thmap_put()/thmap_del() are not called from the interrupt * context and are protected by an IPL_NET mutex(9), therefore they * do not need SPL wrappers -- see the comment at the top of the * npf_conndb.c source file. */ #ifdef _KERNEL #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: npf_tableset.c,v 1.42 2023/02/24 11:03:01 riastradh Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/atomic.h> #include <sys/cdbr.h> #include <sys/kmem.h> #include <sys/pool.h> #include <sys/queue.h> #include <sys/mutex.h> #include <sys/thmap.h> #include "lpm.h" #endif #include "npf_impl.h" typedef struct npf_tblent { LIST_ENTRY(npf_tblent) te_listent; uint16_t te_preflen; uint16_t te_alen; npf_addr_t te_addr; } npf_tblent_t; #define NPF_ADDRLEN2IDX(alen) ((alen) >> 4) #define NPF_ADDR_SLOTS (2) struct npf_table { /* * The storage type can be: a) hashmap b) LPM c) cdb. * There are separate trees for IPv4 and IPv6. */ union { struct { thmap_t * t_map; LIST_HEAD(, npf_tblent) t_gc; }; lpm_t * t_lpm; struct { void * t_blob; size_t t_bsize; struct cdbr * t_cdb; }; struct { npf_tblent_t ** t_elements[NPF_ADDR_SLOTS]; unsigned t_allocated[NPF_ADDR_SLOTS]; unsigned t_used[NPF_ADDR_SLOTS]; }; } /* C11 */; LIST_HEAD(, npf_tblent) t_list; unsigned t_nitems; /* * Table ID, type and lock. The ID may change during the * config reload, it is protected by the npf_t::config_lock. */ int t_type; unsigned t_id; kmutex_t t_lock; /* Reference count and table name. */ unsigned t_refcnt; char t_name[NPF_TABLE_MAXNAMELEN]; }; struct npf_tableset { unsigned ts_nitems; npf_table_t * ts_map[]; }; #define NPF_TABLESET_SIZE(n) \ (offsetof(npf_tableset_t, ts_map[n]) * sizeof(npf_table_t *)) #define NPF_IFADDR_STEP 4 static pool_cache_t tblent_cache __read_mostly; /* * npf_table_sysinit: initialise tableset structures. 
*/ void npf_tableset_sysinit(void) { tblent_cache = pool_cache_init(sizeof(npf_tblent_t), 0, 0, 0, "npftblpl", NULL, IPL_NONE, NULL, NULL, NULL); } void npf_tableset_sysfini(void) { pool_cache_destroy(tblent_cache); } npf_tableset_t * npf_tableset_create(u_int nitems) { npf_tableset_t *ts = kmem_zalloc(NPF_TABLESET_SIZE(nitems), KM_SLEEP); ts->ts_nitems = nitems; return ts; } void npf_tableset_destroy(npf_tableset_t *ts) { /* * Destroy all tables (no references should be held, since the * ruleset should be destroyed before). */ for (u_int tid = 0; tid < ts->ts_nitems; tid++) { npf_table_t *t = ts->ts_map[tid]; if (t == NULL) continue; membar_release(); if (atomic_dec_uint_nv(&t->t_refcnt) > 0) continue; membar_acquire(); npf_table_destroy(t); } kmem_free(ts, NPF_TABLESET_SIZE(ts->ts_nitems)); } /* * npf_tableset_insert: insert the table into the specified tableset. * * => Returns 0 on success. Fails and returns error if ID is already used. */ int npf_tableset_insert(npf_tableset_t *ts, npf_table_t *t) { const u_int tid = t->t_id; int error; KASSERT((u_int)tid < ts->ts_nitems); if (ts->ts_map[tid] == NULL) { atomic_inc_uint(&t->t_refcnt); ts->ts_map[tid] = t; error = 0; } else { error = EEXIST; } return error; } npf_table_t * npf_tableset_swap(npf_tableset_t *ts, npf_table_t *newt) { const u_int tid = newt->t_id; npf_table_t *oldt = ts->ts_map[tid]; KASSERT(tid < ts->ts_nitems); KASSERT(oldt->t_id == newt->t_id); newt->t_refcnt = oldt->t_refcnt; oldt->t_refcnt = 0; membar_producer(); return atomic_swap_ptr(&ts->ts_map[tid], newt); } /* * npf_tableset_getbyname: look for a table in the set given the name. */ npf_table_t * npf_tableset_getbyname(npf_tableset_t *ts, const char *name) { npf_table_t *t; for (u_int tid = 0; tid < ts->ts_nitems; tid++) { if ((t = ts->ts_map[tid]) == NULL) continue; if (strcmp(name, t->t_name) == 0) return t; } return NULL; } npf_table_t * npf_tableset_getbyid(npf_tableset_t *ts, unsigned tid) { if (__predict_true(tid < ts->ts_nitems)) { return atomic_load_relaxed(&ts->ts_map[tid]); } return NULL; } /* * npf_tableset_reload: iterate all tables and if the new table is of the * same type and has no items, then we preserve the old one and its entries. * * => The caller is responsible for providing synchronisation. */ void npf_tableset_reload(npf_t *npf, npf_tableset_t *nts, npf_tableset_t *ots) { for (u_int tid = 0; tid < nts->ts_nitems; tid++) { npf_table_t *t, *ot; if ((t = nts->ts_map[tid]) == NULL) { continue; } /* If our table has entries, just load it. */ if (t->t_nitems) { continue; } /* Look for a currently existing table with such name. */ ot = npf_tableset_getbyname(ots, t->t_name); if (ot == NULL) { /* Not found: we have a new table. */ continue; } /* Found. Did the type change? */ if (t->t_type != ot->t_type) { /* Yes, load the new. */ continue; } /* * Preserve the current table. Acquire a reference since * we are keeping it in the old table set. Update its ID. */ atomic_inc_uint(&ot->t_refcnt); nts->ts_map[tid] = ot; KASSERT(npf_config_locked_p(npf)); ot->t_id = tid; /* Destroy the new table (we hold the only reference). 
*/ t->t_refcnt--; npf_table_destroy(t); } } int npf_tableset_export(npf_t *npf, const npf_tableset_t *ts, nvlist_t *nvl) { const npf_table_t *t; KASSERT(npf_config_locked_p(npf)); for (u_int tid = 0; tid < ts->ts_nitems; tid++) { nvlist_t *table; if ((t = ts->ts_map[tid]) == NULL) { continue; } table = nvlist_create(0); nvlist_add_string(table, "name", t->t_name); nvlist_add_number(table, "type", t->t_type); nvlist_add_number(table, "id", tid); nvlist_append_nvlist_array(nvl, "tables", table); nvlist_destroy(table); } return 0; } /* * Few helper routines. */ static void table_ipset_flush(npf_table_t *t) { npf_tblent_t *ent; while ((ent = LIST_FIRST(&t->t_list)) != NULL) { thmap_del(t->t_map, &ent->te_addr, ent->te_alen); LIST_REMOVE(ent, te_listent); pool_cache_put(tblent_cache, ent); } t->t_nitems = 0; } static void table_tree_flush(npf_table_t *t) { npf_tblent_t *ent; while ((ent = LIST_FIRST(&t->t_list)) != NULL) { LIST_REMOVE(ent, te_listent); pool_cache_put(tblent_cache, ent); } lpm_clear(t->t_lpm, NULL, NULL); t->t_nitems = 0; } static void table_ifaddr_flush(npf_table_t *t) { npf_tblent_t *ent; for (unsigned i = 0; i < NPF_ADDR_SLOTS; i++) { size_t len; if (!t->t_allocated[i]) { KASSERT(t->t_elements[i] == NULL); continue; } len = t->t_allocated[i] * sizeof(npf_tblent_t *); kmem_free(t->t_elements[i], len); t->t_elements[i] = NULL; t->t_allocated[i] = 0; t->t_used[i] = 0; } while ((ent = LIST_FIRST(&t->t_list)) != NULL) { LIST_REMOVE(ent, te_listent); pool_cache_put(tblent_cache, ent); } t->t_nitems = 0; } /* * npf_table_create: create table with a specified ID. */ npf_table_t * npf_table_create(const char *name, u_int tid, int type, const void *blob, size_t size) { npf_table_t *t; t = kmem_zalloc(sizeof(npf_table_t), KM_SLEEP); strlcpy(t->t_name, name, NPF_TABLE_MAXNAMELEN); switch (type) { case NPF_TABLE_LPM: t->t_lpm = lpm_create(KM_NOSLEEP); if (t->t_lpm == NULL) { goto out; } LIST_INIT(&t->t_list); break; case NPF_TABLE_IPSET: t->t_map = thmap_create(0, NULL, THMAP_NOCOPY); if (t->t_map == NULL) { goto out; } break; case NPF_TABLE_CONST: t->t_blob = kmem_alloc(size, KM_SLEEP); if (t->t_blob == NULL) { goto out; } memcpy(t->t_blob, blob, size); t->t_bsize = size; t->t_cdb = cdbr_open_mem(t->t_blob, size, CDBR_DEFAULT, NULL, NULL); if (t->t_cdb == NULL) { kmem_free(t->t_blob, t->t_bsize); goto out; } t->t_nitems = cdbr_entries(t->t_cdb); break; case NPF_TABLE_IFADDR: break; default: KASSERT(false); } mutex_init(&t->t_lock, MUTEX_DEFAULT, IPL_NET); t->t_type = type; t->t_id = tid; return t; out: kmem_free(t, sizeof(npf_table_t)); return NULL; } /* * npf_table_destroy: free all table entries and table itself. */ void npf_table_destroy(npf_table_t *t) { KASSERT(t->t_refcnt == 0); switch (t->t_type) { case NPF_TABLE_IPSET: table_ipset_flush(t); npf_table_gc(NULL, t); thmap_destroy(t->t_map); break; case NPF_TABLE_LPM: table_tree_flush(t); lpm_destroy(t->t_lpm); break; case NPF_TABLE_CONST: cdbr_close(t->t_cdb); kmem_free(t->t_blob, t->t_bsize); break; case NPF_TABLE_IFADDR: table_ifaddr_flush(t); break; default: KASSERT(false); } mutex_destroy(&t->t_lock); kmem_free(t, sizeof(npf_table_t)); } u_int npf_table_getid(npf_table_t *t) { return t->t_id; } /* * npf_table_check: validate the name, ID and type. 
*/ int npf_table_check(npf_tableset_t *ts, const char *name, uint64_t tid, uint64_t type, bool replacing) { const npf_table_t *t; if (tid >= ts->ts_nitems) { return EINVAL; } if (!replacing && ts->ts_map[tid] != NULL) { return EEXIST; } switch (type) { case NPF_TABLE_LPM: case NPF_TABLE_IPSET: case NPF_TABLE_CONST: case NPF_TABLE_IFADDR: break; default: return EINVAL; } if (strlen(name) >= NPF_TABLE_MAXNAMELEN) { return ENAMETOOLONG; } if ((t = npf_tableset_getbyname(ts, name)) != NULL) { if (!replacing || t->t_id != tid) { return EEXIST; } } return 0; } static int table_ifaddr_insert(npf_table_t *t, const int alen, npf_tblent_t *ent) { const unsigned aidx = NPF_ADDRLEN2IDX(alen); const unsigned allocated = t->t_allocated[aidx]; const unsigned used = t->t_used[aidx]; /* * No need to check for duplicates. */ if (allocated <= used) { npf_tblent_t **old_elements = t->t_elements[aidx]; npf_tblent_t **elements; size_t toalloc, newsize; toalloc = roundup2(allocated + 1, NPF_IFADDR_STEP); newsize = toalloc * sizeof(npf_tblent_t *); elements = kmem_zalloc(newsize, KM_NOSLEEP); if (elements == NULL) { return ENOMEM; } for (unsigned i = 0; i < used; i++) { elements[i] = old_elements[i]; } if (allocated) { const size_t len = allocated * sizeof(npf_tblent_t *); KASSERT(old_elements != NULL); kmem_free(old_elements, len); } t->t_elements[aidx] = elements; t->t_allocated[aidx] = toalloc; } t->t_elements[aidx][used] = ent; t->t_used[aidx]++; return 0; } /* * npf_table_insert: add an IP CIDR entry into the table. */ int npf_table_insert(npf_table_t *t, const int alen, const npf_addr_t *addr, const npf_netmask_t mask) { npf_tblent_t *ent; int error; error = npf_netmask_check(alen, mask); if (error) { return error; } ent = pool_cache_get(tblent_cache, PR_WAITOK); memcpy(&ent->te_addr, addr, alen); ent->te_alen = alen; ent->te_preflen = 0; /* * Insert the entry. Return an error on duplicate. */ mutex_enter(&t->t_lock); switch (t->t_type) { case NPF_TABLE_IPSET: /* * Hashmap supports only IPs. * * Note: the key must be already persistent, since we * use THMAP_NOCOPY. */ if (mask != NPF_NO_NETMASK) { error = EINVAL; break; } if (thmap_put(t->t_map, &ent->te_addr, alen, ent) == ent) { LIST_INSERT_HEAD(&t->t_list, ent, te_listent); t->t_nitems++; } else { error = EEXIST; } break; case NPF_TABLE_LPM: { const unsigned preflen = (mask == NPF_NO_NETMASK) ? (alen * 8) : mask; ent->te_preflen = preflen; if (lpm_lookup(t->t_lpm, addr, alen) == NULL && lpm_insert(t->t_lpm, addr, alen, preflen, ent) == 0) { LIST_INSERT_HEAD(&t->t_list, ent, te_listent); t->t_nitems++; error = 0; } else { error = EEXIST; } break; } case NPF_TABLE_CONST: error = EINVAL; break; case NPF_TABLE_IFADDR: if ((error = table_ifaddr_insert(t, alen, ent)) != 0) { break; } LIST_INSERT_HEAD(&t->t_list, ent, te_listent); t->t_nitems++; break; default: KASSERT(false); } mutex_exit(&t->t_lock); if (error) { pool_cache_put(tblent_cache, ent); } return error; } /* * npf_table_remove: remove the IP CIDR entry from the table. 
*/ int npf_table_remove(npf_table_t *t, const int alen, const npf_addr_t *addr, const npf_netmask_t mask) { npf_tblent_t *ent = NULL; int error; error = npf_netmask_check(alen, mask); if (error) { return error; } mutex_enter(&t->t_lock); switch (t->t_type) { case NPF_TABLE_IPSET: ent = thmap_del(t->t_map, addr, alen); if (__predict_true(ent != NULL)) { LIST_REMOVE(ent, te_listent); LIST_INSERT_HEAD(&t->t_gc, ent, te_listent); ent = NULL; // to be G/C'ed t->t_nitems--; } else { error = ENOENT; } break; case NPF_TABLE_LPM: ent = lpm_lookup(t->t_lpm, addr, alen); if (__predict_true(ent != NULL)) { LIST_REMOVE(ent, te_listent); lpm_remove(t->t_lpm, &ent->te_addr, ent->te_alen, ent->te_preflen); t->t_nitems--; } else { error = ENOENT; } break; case NPF_TABLE_CONST: case NPF_TABLE_IFADDR: error = EINVAL; break; default: KASSERT(false); ent = NULL; } mutex_exit(&t->t_lock); if (ent) { pool_cache_put(tblent_cache, ent); } return error; } /* * npf_table_lookup: find the table according to ID, lookup and match * the contents with the specified IP address. */ int npf_table_lookup(npf_table_t *t, const int alen, const npf_addr_t *addr) { const void *data; size_t dlen; bool found; int error; error = npf_netmask_check(alen, NPF_NO_NETMASK); if (error) { return error; } switch (t->t_type) { case NPF_TABLE_IPSET: /* Note: the caller is in the npf_config_read_enter(). */ found = thmap_get(t->t_map, addr, alen) != NULL; break; case NPF_TABLE_LPM: mutex_enter(&t->t_lock); found = lpm_lookup(t->t_lpm, addr, alen) != NULL; mutex_exit(&t->t_lock); break; case NPF_TABLE_CONST: if (cdbr_find(t->t_cdb, addr, alen, &data, &dlen) == 0) { found = dlen == (unsigned)alen && memcmp(addr, data, dlen) == 0; } else { found = false; } break; case NPF_TABLE_IFADDR: { const unsigned aidx = NPF_ADDRLEN2IDX(alen); found = false; for (unsigned i = 0; i < t->t_used[aidx]; i++) { const npf_tblent_t *elm = t->t_elements[aidx][i]; KASSERT(elm->te_alen == alen); if (memcmp(&elm->te_addr, addr, alen) == 0) { found = true; break; } } break; } default: KASSERT(false); found = false; } return found ? 0 : ENOENT; } npf_addr_t * npf_table_getsome(npf_table_t *t, const int alen, unsigned idx) { const unsigned aidx = NPF_ADDRLEN2IDX(alen); npf_tblent_t *elm; unsigned nitems; KASSERT(t->t_type == NPF_TABLE_IFADDR); KASSERT(aidx < NPF_ADDR_SLOTS); nitems = t->t_used[aidx]; if (nitems == 0) { return NULL; } /* * No need to acquire the lock, since the table is immutable. 
*/ elm = t->t_elements[aidx][idx % nitems]; return &elm->te_addr; } static int table_ent_copyout(const npf_addr_t *addr, const int alen, npf_netmask_t mask, void *ubuf, size_t len, size_t *off) { void *ubufp = (uint8_t *)ubuf + *off; npf_ioctl_ent_t uent; if ((*off += sizeof(npf_ioctl_ent_t)) > len) { return ENOMEM; } uent.alen = alen; memcpy(&uent.addr, addr, sizeof(npf_addr_t)); uent.mask = mask; return copyout(&uent, ubufp, sizeof(npf_ioctl_ent_t)); } static int table_generic_list(npf_table_t *t, void *ubuf, size_t len) { npf_tblent_t *ent; size_t off = 0; int error = 0; LIST_FOREACH(ent, &t->t_list, te_listent) { mutex_exit(&t->t_lock); error = table_ent_copyout(&ent->te_addr, ent->te_alen, ent->te_preflen, ubuf, len, &off); mutex_enter(&t->t_lock); if (error) break; } return error; } static int table_cdb_list(npf_table_t *t, void *ubuf, size_t len) { size_t off = 0, dlen; const void *data; int error = 0; for (size_t i = 0; i < t->t_nitems; i++) { if (cdbr_get(t->t_cdb, i, &data, &dlen) != 0) { return EINVAL; } error = table_ent_copyout(data, dlen, 0, ubuf, len, &off); if (error) break; } return error; } /* * npf_table_list: copy a list of all table entries into a userspace buffer. */ int npf_table_list(npf_table_t *t, void *ubuf, size_t len) { int error = 0; mutex_enter(&t->t_lock); switch (t->t_type) { case NPF_TABLE_IPSET: error = table_generic_list(t, ubuf, len); break; case NPF_TABLE_LPM: error = table_generic_list(t, ubuf, len); break; case NPF_TABLE_CONST: error = table_cdb_list(t, ubuf, len); break; case NPF_TABLE_IFADDR: error = table_generic_list(t, ubuf, len); break; default: KASSERT(false); } mutex_exit(&t->t_lock); return error; } /* * npf_table_flush: remove all table entries. */ int npf_table_flush(npf_table_t *t) { int error = 0; mutex_enter(&t->t_lock); switch (t->t_type) { case NPF_TABLE_IPSET: table_ipset_flush(t); break; case NPF_TABLE_LPM: table_tree_flush(t); break; case NPF_TABLE_CONST: case NPF_TABLE_IFADDR: error = EINVAL; break; default: KASSERT(false); } mutex_exit(&t->t_lock); return error; } void npf_table_gc(npf_t *npf, npf_table_t *t) { npf_tblent_t *ent; void *ref; if (t->t_type != NPF_TABLE_IPSET || LIST_EMPTY(&t->t_gc)) { return; } ref = thmap_stage_gc(t->t_map); if (npf) { npf_config_locked_p(npf); npf_config_sync(npf); } thmap_gc(t->t_map, ref); while ((ent = LIST_FIRST(&t->t_gc)) != NULL) { LIST_REMOVE(ent, te_listent); pool_cache_put(tblent_cache, ent); } }
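/*
 * Illustrative sketch, not part of the original npf_tableset.c: a minimal
 * use of the table API above with an NPF_TABLE_IPSET table.  The function
 * name is made up for the example; it assumes npf_addr_t is the usual
 * 16-byte npf address storage and that the caller provides the
 * synchronisation described in the notes at the top of this file.
 */
static int
npf_table_example_usage(void)
{
	npf_table_t *t;
	npf_addr_t addr;
	const uint32_t ip4 = htonl(0xc0000201);	/* 192.0.2.1 (TEST-NET-1) */
	int error;

	/* Table ID 0; no constant blob is needed for an IPSET table. */
	t = npf_table_create("example", 0, NPF_TABLE_IPSET, NULL, 0);
	if (t == NULL)
		return ENOMEM;

	memset(&addr, 0, sizeof(addr));
	memcpy(&addr, &ip4, sizeof(ip4));

	/* IPSET tables store single addresses only, hence NPF_NO_NETMASK. */
	error = npf_table_insert(t, sizeof(ip4), &addr, NPF_NO_NETMASK);
	if (error == 0)
		error = npf_table_lookup(t, sizeof(ip4), &addr);

	/* The table was never inserted into a tableset, so t_refcnt is 0. */
	npf_table_destroy(t);
	return error;
}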
/* $NetBSD: init_sysctl_base.c,v 1.9 2023/12/20 20:35:37 andvar Exp $ */ /*- * Copyright (c) 2003, 2007, 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Brown, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: init_sysctl_base.c,v 1.9 2023/12/20 20:35:37 andvar Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/sysctl.h> #include <sys/proc.h> #include <sys/cpu.h> #include <sys/kernel.h> #include <sys/disklabel.h> static int sysctl_setlen(SYSCTLFN_PROTO); /* * sets up the base nodes...
*/ void sysctl_basenode_init(void) { sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "kern", SYSCTL_DESCR("High kernel"), NULL, 0, NULL, 0, CTL_KERN, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "vm", SYSCTL_DESCR("Virtual memory"), NULL, 0, NULL, 0, CTL_VM, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "vfs", SYSCTL_DESCR("Filesystem"), NULL, 0, NULL, 0, CTL_VFS, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "net", SYSCTL_DESCR("Networking"), NULL, 0, NULL, 0, CTL_NET, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "debug", SYSCTL_DESCR("Debugging"), NULL, 0, NULL, 0, CTL_DEBUG, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "hw", SYSCTL_DESCR("Generic CPU, I/O"), NULL, 0, NULL, 0, CTL_HW, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "machdep", SYSCTL_DESCR("Machine dependent"), NULL, 0, NULL, 0, CTL_MACHDEP, CTL_EOL); /* * this node is inserted so that the sysctl nodes in libc can * operate. */ sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "user", SYSCTL_DESCR("User-level"), NULL, 0, NULL, 0, CTL_USER, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ddb", SYSCTL_DESCR("In-kernel debugger"), NULL, 0, NULL, 0, CTL_DDB, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "proc", SYSCTL_DESCR("Per-process"), NULL, 0, NULL, 0, CTL_PROC, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_NODE, "vendor", SYSCTL_DESCR("Vendor specific"), NULL, 0, NULL, 0, CTL_VENDOR, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "emul", SYSCTL_DESCR("Emulation settings"), NULL, 0, NULL, 0, CTL_EMUL, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "security", SYSCTL_DESCR("Security"), NULL, 0, NULL, 0, CTL_SECURITY, CTL_EOL); } /* * now add some nodes which both rump kernel and standard * NetBSD both need, as rump cannot use sys/kern/init_sysctl.c */ SYSCTL_SETUP(sysctl_kernbase_setup, "sysctl kern subtree base setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "ostype", SYSCTL_DESCR("Operating system type"), NULL, 0, __UNCONST(&ostype), 0, CTL_KERN, KERN_OSTYPE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "osrelease", SYSCTL_DESCR("Operating system release"), NULL, 0, __UNCONST(&osrelease), 0, CTL_KERN, KERN_OSRELEASE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "osrevision", SYSCTL_DESCR("Operating system revision"), NULL, __NetBSD_Version__, NULL, 0, CTL_KERN, KERN_OSREV, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "version", SYSCTL_DESCR("Kernel version"), NULL, 0, __UNCONST(&version), 0, CTL_KERN, KERN_VERSION, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRING, "hostname", SYSCTL_DESCR("System hostname"), sysctl_setlen, 0, hostname, MAXHOSTNAMELEN, CTL_KERN, KERN_HOSTNAME, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRING, "domainname", SYSCTL_DESCR("YP domain name"), sysctl_setlen, 0, domainname, MAXHOSTNAMELEN, CTL_KERN, KERN_DOMAINNAME, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "rawpartition", 
SYSCTL_DESCR("Raw partition of a disk"), NULL, RAW_PART, NULL, 0, CTL_KERN, KERN_RAWPARTITION, CTL_EOL); } static int sysctl_hw_machine_arch(SYSCTLFN_ARGS) { struct sysctlnode node = *rnode; #ifndef PROC_MACHINE_ARCH #define PROC_MACHINE_ARCH(P) machine_arch #endif node.sysctl_data = PROC_MACHINE_ARCH(l->l_proc); node.sysctl_size = strlen(node.sysctl_data) + 1; return sysctl_lookup(SYSCTLFN_CALL(&node)); } SYSCTL_SETUP(sysctl_hwbase_setup, "sysctl hw subtree base setup") { u_int u; u_quad_t q; const char *model = cpu_getmodel(); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "model", SYSCTL_DESCR("Machine model"), NULL, 0, __UNCONST(model), 0, CTL_HW, HW_MODEL, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "machine", SYSCTL_DESCR("Machine class"), NULL, 0, machine, 0, CTL_HW, HW_MACHINE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_STRING, "machine_arch", SYSCTL_DESCR("Machine CPU class"), sysctl_hw_machine_arch, 0, NULL, 0, CTL_HW, HW_MACHINE_ARCH, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "ncpu", SYSCTL_DESCR("Number of CPUs configured"), NULL, 0, &ncpu, 0, CTL_HW, HW_NCPU, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "byteorder", SYSCTL_DESCR("System byte order"), NULL, BYTE_ORDER, NULL, 0, CTL_HW, HW_BYTEORDER, CTL_EOL); u = ((u_int)physmem > (UINT_MAX / PAGE_SIZE)) ? UINT_MAX : physmem * PAGE_SIZE; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "physmem", SYSCTL_DESCR("Bytes of physical memory"), NULL, u, NULL, 0, CTL_HW, HW_PHYSMEM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "pagesize", SYSCTL_DESCR("Software page size"), NULL, PAGE_SIZE, NULL, 0, CTL_HW, HW_PAGESIZE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "alignbytes", SYSCTL_DESCR("Alignment constraint for all possible " "data types"), NULL, ALIGNBYTES, NULL, 0, CTL_HW, HW_ALIGNBYTES, CTL_EOL); q = (u_quad_t)physmem * PAGE_SIZE; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_QUAD, "physmem64", SYSCTL_DESCR("Bytes of physical memory"), NULL, q, NULL, 0, CTL_HW, HW_PHYSMEM64, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "ncpuonline", SYSCTL_DESCR("Number of CPUs online"), NULL, 0, &ncpuonline, 0, CTL_HW, HW_NCPUONLINE, CTL_EOL); } /* * sysctl helper function for kern.hostname and kern.domainname. * resets the relevant recorded length when the underlying name is * changed. */ static int sysctl_setlen(SYSCTLFN_ARGS) { int error; error = sysctl_lookup(SYSCTLFN_CALL(rnode)); if (error || newp == NULL) return (error); switch (rnode->sysctl_num) { case KERN_HOSTNAME: hostnamelen = strlen((const char*)rnode->sysctl_data); break; case KERN_DOMAINNAME: domainnamelen = strlen((const char*)rnode->sysctl_data); break; } return (0); }
/* $NetBSD: rtsock_50.c,v 1.16 2020/01/29 05:47:12 thorpej Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: rtsock_50.c,v 1.16 2020/01/29 05:47:12 thorpej Exp $"); #define COMPAT_RTSOCK /* Use the COMPATNAME/COMPATCALL macros and the * various other compat definitions - see * sys/net/rtsock_shared.c for details */ #include <net/rtsock_shared.c> #include <compat/net/route_50.h> static struct sysctllog *clog; void compat_50_rt_oifmsg(struct ifnet *ifp) { struct if_msghdr50 oifm; struct if_data ifi; struct mbuf *m; struct rt_addrinfo info; if (COMPATNAME(route_info).ri_cb.any_count == 0) return; (void)memset(&info, 0, sizeof(info)); (void)memset(&oifm, 0, sizeof(oifm)); if_export_if_data(ifp, &ifi, false); oifm.ifm_index = ifp->if_index; oifm.ifm_flags = ifp->if_flags; oifm.ifm_data.ifi_type = ifi.ifi_type; oifm.ifm_data.ifi_addrlen = ifi.ifi_addrlen; oifm.ifm_data.ifi_hdrlen = ifi.ifi_hdrlen; oifm.ifm_data.ifi_link_state = ifi.ifi_link_state; oifm.ifm_data.ifi_mtu = ifi.ifi_mtu; oifm.ifm_data.ifi_metric = ifi.ifi_metric; oifm.ifm_data.ifi_baudrate = ifi.ifi_baudrate; oifm.ifm_data.ifi_ipackets = ifi.ifi_ipackets; oifm.ifm_data.ifi_ierrors = ifi.ifi_ierrors; oifm.ifm_data.ifi_opackets = ifi.ifi_opackets; oifm.ifm_data.ifi_oerrors = ifi.ifi_oerrors; oifm.ifm_data.ifi_collisions = ifi.ifi_collisions; oifm.ifm_data.ifi_ibytes = ifi.ifi_ibytes; oifm.ifm_data.ifi_obytes = ifi.ifi_obytes; oifm.ifm_data.ifi_imcasts = ifi.ifi_imcasts; oifm.ifm_data.ifi_omcasts = ifi.ifi_omcasts; oifm.ifm_data.ifi_iqdrops = ifi.ifi_iqdrops; oifm.ifm_data.ifi_noproto = ifi.ifi_noproto; TIMESPEC_TO_TIMEVAL(&oifm.ifm_data.ifi_lastchange, &ifi.ifi_lastchange); oifm.ifm_addrs = 0; m = COMPATNAME(rt_msg1)(RTM_OIFINFO, &info, (void *)&oifm, sizeof(oifm)); if (m == NULL) return; COMPATNAME(route_enqueue)(m, 0); } int compat_50_iflist(struct ifnet *ifp, struct rt_walkarg *w, struct rt_addrinfo *info, size_t len) { struct if_msghdr50 *ifm; struct if_data ifi; int error; ifm = (struct if_msghdr50 *)w->w_tmem; if_export_if_data(ifp, &ifi, false); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags; ifm->ifm_data.ifi_type = ifi.ifi_type; ifm->ifm_data.ifi_addrlen = ifi.ifi_addrlen; ifm->ifm_data.ifi_hdrlen = ifi.ifi_hdrlen; ifm->ifm_data.ifi_link_state = ifi.ifi_link_state; ifm->ifm_data.ifi_mtu = ifi.ifi_mtu; ifm->ifm_data.ifi_metric = ifi.ifi_metric; ifm->ifm_data.ifi_baudrate = ifi.ifi_baudrate; ifm->ifm_data.ifi_ipackets = ifi.ifi_ipackets; ifm->ifm_data.ifi_ierrors = ifi.ifi_ierrors; ifm->ifm_data.ifi_opackets = ifi.ifi_opackets; ifm->ifm_data.ifi_oerrors = ifi.ifi_oerrors; ifm->ifm_data.ifi_collisions = ifi.ifi_collisions; ifm->ifm_data.ifi_ibytes = ifi.ifi_ibytes; ifm->ifm_data.ifi_obytes = ifi.ifi_obytes; ifm->ifm_data.ifi_imcasts = ifi.ifi_imcasts; ifm->ifm_data.ifi_omcasts = ifi.ifi_omcasts; ifm->ifm_data.ifi_iqdrops = ifi.ifi_iqdrops; ifm->ifm_data.ifi_noproto = ifi.ifi_noproto; TIMESPEC_TO_TIMEVAL(&ifm->ifm_data.ifi_lastchange, &ifi.ifi_lastchange); ifm->ifm_addrs = info->rti_addrs; error = copyout(ifm, 
w->w_where, len); if (error) return error; w->w_where = (char *)w->w_where + len; return 0; } void rtsock_50_init(void) { MODULE_HOOK_SET(rtsock_iflist_50_hook, compat_50_iflist); MODULE_HOOK_SET(rtsock_oifmsg_50_hook, compat_50_rt_oifmsg); MODULE_HOOK_SET(rtsock_rt_missmsg_50_hook, compat_50_rt_missmsg); MODULE_HOOK_SET(rtsock_rt_ifmsg_50_hook, compat_50_rt_ifmsg); MODULE_HOOK_SET(rtsock_rt_addrmsg_rt_50_hook, compat_50_rt_addrmsg_rt); MODULE_HOOK_SET(rtsock_rt_addrmsg_src_50_hook, compat_50_rt_addrmsg_src); MODULE_HOOK_SET(rtsock_rt_addrmsg_50_hook, compat_50_rt_addrmsg); MODULE_HOOK_SET(rtsock_rt_ifannouncemsg_50_hook, compat_50_rt_ifannouncemsg); MODULE_HOOK_SET(rtsock_rt_ieee80211msg_50_hook, compat_50_rt_ieee80211msg); sysctl_net_route_setup(&clog, PF_OROUTE, "ortable"); } void rtsock_50_fini(void) { sysctl_teardown(&clog); MODULE_HOOK_UNSET(rtsock_iflist_50_hook); MODULE_HOOK_UNSET(rtsock_oifmsg_50_hook); MODULE_HOOK_UNSET(rtsock_rt_missmsg_50_hook); MODULE_HOOK_UNSET(rtsock_rt_ifmsg_50_hook); MODULE_HOOK_UNSET(rtsock_rt_addrmsg_rt_50_hook); MODULE_HOOK_UNSET(rtsock_rt_addrmsg_src_50_hook); MODULE_HOOK_UNSET(rtsock_rt_addrmsg_50_hook); MODULE_HOOK_UNSET(rtsock_rt_ifannouncemsg_50_hook); MODULE_HOOK_UNSET(rtsock_rt_ieee80211msg_50_hook); }
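/* * Editorial sketch (not part of rtsock_50.c above): the init/fini pair follows the usual compat-module convention of setting every hook on load and clearing it again on unload, so the base kernel only calls into compat code while the module is present. The hook and function names below are placeholders for illustration, not real symbols. */
#if 0	/* illustrative example only, not compiled */
void
example_compat_init(void)
{
	/* Fill in the hook so the base kernel can reach the compat code. */
	MODULE_HOOK_SET(example_hook, example_impl);
}

void
example_compat_fini(void)
{
	/* Clear the hook again before the module text goes away. */
	MODULE_HOOK_UNSET(example_hook);
}
#endif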
/* $NetBSD: ufs_vnops.c,v 1.262 2022/03/27 16:24:59 christos Exp $ */ /*- * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.28 (Berkeley) 7/31/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.262 2022/03/27 16:24:59 christos Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" #include "opt_quota.h" #include "opt_uvmhist.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/resourcevar.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/buf.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/fstrans.h> #include <sys/kmem.h> #include <sys/malloc.h> #include <sys/dirent.h> #include <sys/lockf.h> #include <sys/kauth.h> #include <sys/wapbl.h> #include <miscfs/specfs/specdev.h> #include <miscfs/fifofs/fifo.h> #include <miscfs/genfs/genfs.h> #include <ufs/ufs/acl.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_bswap.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_wapbl.h> #ifdef UFS_DIRHASH #include <ufs/ufs/dirhash.h> #endif #include <ufs/ext2fs/ext2fs_extern.h> #include <ufs/ext2fs/ext2fs_dir.h> #include <ufs/ffs/ffs_extern.h> #include <ufs/lfs/lfs_extern.h> #include <ufs/lfs/lfs.h> #ifdef UVMHIST #include <uvm/uvm.h> #endif #include <uvm/uvm_extern.h> #include <uvm/uvm_stat.h> __CTASSERT(EXT2FS_MAXNAMLEN == FFS_MAXNAMLEN); __CTASSERT(LFS_MAXNAMLEN == FFS_MAXNAMLEN); static int ufs_chmod(struct vnode *, int, kauth_cred_t, struct lwp *); static int ufs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t, struct lwp *); static int ufs_makeinode(struct vattr *, struct vnode *, const struct ufs_lookup_results *, struct vnode **, struct componentname *); /* * A virgin directory (no blushing please). */ static const struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, ".", 0, UFS_DIRBLKSIZ - 12, DT_DIR, 2, ".." 
}; /* * Create a regular file */ int ufs_create(void *v) { struct vop_create_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; int error; struct vnode *dvp = ap->a_dvp; struct ufs_lookup_results *ulr; /* XXX should handle this material another way */ ulr = &VTOI(dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); /* * UFS_WAPBL_BEGIN(dvp->v_mount) performed by successful * ufs_makeinode */ error = ufs_makeinode(ap->a_vap, dvp, ulr, ap->a_vpp, ap->a_cnp); if (error) { return (error); } UFS_WAPBL_END(dvp->v_mount); VOP_UNLOCK(*ap->a_vpp); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ int ufs_mknod(void *v) { struct vop_mknod_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; struct vattr *vap; struct vnode **vpp; struct inode *ip; int error; struct ufs_lookup_results *ulr; vap = ap->a_vap; vpp = ap->a_vpp; /* XXX should handle this material another way */ ulr = &VTOI(ap->a_dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp)); /* * UFS_WAPBL_BEGIN(dvp->v_mount) performed by successful * ufs_makeinode */ if ((error = ufs_makeinode(vap, ap->a_dvp, ulr, vpp, ap->a_cnp)) != 0) goto out; ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; UFS_WAPBL_UPDATE(*vpp, NULL, NULL, 0); UFS_WAPBL_END(ap->a_dvp->v_mount); VOP_UNLOCK(*vpp); out: if (error != 0) { *vpp = NULL; return (error); } return (0); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ int ufs_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ int ufs_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp; vp = ap->a_vp; if (vrefcnt(vp) > 1) UFS_ITIMES(vp, NULL, NULL, NULL); return (0); } static int ufs_check_possible(struct vnode *vp, struct inode *ip, accmode_t accmode, kauth_cred_t cred) { #if defined(QUOTA) || defined(QUOTA2) int error; #endif /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (accmode & VMODIFY_PERMS) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; #if defined(QUOTA) || defined(QUOTA2) error = chkdq(ip, 0, cred, 0); if (error != 0) return error; #endif break; case VBAD: case VBLK: case VCHR: case VSOCK: case VFIFO: case VNON: default: break; } } /* If it is a snapshot, nobody gets access to it. */ if ((ip->i_flags & SF_SNAPSHOT)) return EPERM; /* * If immutable bit set, nobody gets to write it. "& ~VADMIN_PERMS" * permits the owner of the file to remove the IMMUTABLE flag. 
*/ if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) && (ip->i_flags & IMMUTABLE)) return EPERM; return 0; } static int ufs_check_permitted(struct vnode *vp, struct inode *ip, struct acl *acl, accmode_t accmode, kauth_cred_t cred, int (*func)(struct vnode *, kauth_cred_t, uid_t, gid_t, mode_t, struct acl *, accmode_t)) { return kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(accmode, vp->v_type, ip->i_mode & ALLPERMS), vp, NULL, (*func)(vp, cred, ip->i_uid, ip->i_gid, ip->i_mode & ALLPERMS, acl, accmode)); } int ufs_accessx(void *v) { struct vop_accessx_args /* { struct vnode *a_vp; accmode_t a_accmode; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; #ifdef UFS_ACL struct acl *acl; acl_type_t type; #endif error = ufs_check_possible(vp, ip, accmode, ap->a_cred); if (error) return error; #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & (MNT_POSIX1EACLS | MNT_NFS4ACLS)) != 0) { if (vp->v_mount->mnt_flag & MNT_NFS4ACLS) type = ACL_TYPE_NFS4; else type = ACL_TYPE_ACCESS; acl = acl_alloc(KM_SLEEP); if (type == ACL_TYPE_NFS4) error = ufs_getacl_nfs4_internal(vp, acl, curlwp); else error = VOP_GETACL(vp, type, acl, ap->a_cred); if (!error) { if (type == ACL_TYPE_NFS4) { error = ufs_check_permitted(vp, ip, acl, accmode, ap->a_cred, genfs_can_access_acl_nfs4); } else { error = vfs_unixify_accmode(&accmode); if (error == 0) error = ufs_check_permitted(vp, ip, acl, accmode, ap->a_cred, genfs_can_access_acl_posix1e); } acl_free(acl); return error; } if (error != EOPNOTSUPP) printf("%s: Error retrieving ACL: %d\n", __func__, error); /* * XXX: Fall back until debugged. Should * eventually possibly log an error, and return * EPERM for safety. */ acl_free(acl); } #endif /* !UFS_ACL */ error = vfs_unixify_accmode(&accmode); if (error) return error; return ufs_check_permitted(vp, ip, NULL, accmode, ap->a_cred, genfs_can_access); } /* ARGSUSED */ int ufs_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp; struct inode *ip; struct vattr *vap; vp = ap->a_vp; ip = VTOI(vp); vap = ap->a_vap; UFS_ITIMES(vp, NULL, NULL, NULL); /* * Copy from inode table */ vap->va_fsid = ip->i_dev; vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ALLPERMS; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_size = vp->v_size; if (ip->i_ump->um_fstype == UFS1) { switch (vp->v_type) { case VBLK: case VCHR: vap->va_rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev, UFS_MPNEEDSWAP(ip->i_ump)); break; default: vap->va_rdev = NODEV; break; } vap->va_atime.tv_sec = ip->i_ffs1_atime; vap->va_atime.tv_nsec = ip->i_ffs1_atimensec; vap->va_mtime.tv_sec = ip->i_ffs1_mtime; vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec; vap->va_ctime.tv_sec = ip->i_ffs1_ctime; vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec; vap->va_birthtime.tv_sec = 0; vap->va_birthtime.tv_nsec = 0; vap->va_bytes = dbtob((u_quad_t)ip->i_ffs1_blocks); } else { switch (vp->v_type) { case VBLK: case VCHR: vap->va_rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev, UFS_MPNEEDSWAP(ip->i_ump)); break; default: vap->va_rdev = NODEV; break; } vap->va_atime.tv_sec = ip->i_ffs2_atime; vap->va_atime.tv_nsec = ip->i_ffs2_atimensec; vap->va_mtime.tv_sec = ip->i_ffs2_mtime; vap->va_mtime.tv_nsec = ip->i_ffs2_mtimensec; vap->va_ctime.tv_sec = ip->i_ffs2_ctime; vap->va_ctime.tv_nsec = ip->i_ffs2_ctimensec; vap->va_birthtime.tv_sec = ip->i_ffs2_birthtime; vap->va_birthtime.tv_nsec = ip->i_ffs2_birthnsec; 
vap->va_bytes = dbtob(ip->i_ffs2_blocks); } vap->va_gen = ip->i_gen; vap->va_flags = ip->i_flags; /* this doesn't belong here */ if (vp->v_type == VBLK) vap->va_blocksize = BLKDEV_IOSIZE; else if (vp->v_type == VCHR) vap->va_blocksize = MAXBSIZE; else vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_type = vp->v_type; vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ int ufs_setattr(void *v) { struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; struct vattr *vap; struct vnode *vp; struct inode *ip; kauth_cred_t cred; struct lwp *l; int error; kauth_action_t action; bool changing_sysflags; vap = ap->a_vap; vp = ap->a_vp; ip = VTOI(vp); cred = ap->a_cred; l = curlwp; action = KAUTH_VNODE_WRITE_FLAGS; changing_sysflags = false; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount); if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } /* Snapshot flag cannot be set or cleared */ if ((vap->va_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) != (ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))) { error = EPERM; goto out; } if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) { action |= KAUTH_VNODE_HAS_SYSFLAGS; } if ((vap->va_flags & SF_SETTABLE) != (ip->i_flags & SF_SETTABLE)) { action |= KAUTH_VNODE_WRITE_SYSFLAGS; changing_sysflags = true; } error = kauth_authorize_vnode(cred, action, vp, NULL, genfs_can_chflags(vp, cred, ip->i_uid, changing_sysflags)); if (error) goto out; if (changing_sysflags) { error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) goto out; ip->i_flags = vap->va_flags; DIP_ASSIGN(ip, flags, ip->i_flags); } else { error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) goto out; ip->i_flags &= SF_SETTABLE; ip->i_flags |= (vap->va_flags & UF_SETTABLE); DIP_ASSIGN(ip, flags, ip->i_flags); } ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); UFS_WAPBL_END(vp->v_mount); if (vap->va_flags & (IMMUTABLE | APPEND)) { error = 0; goto out; } } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) goto out; error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, l); UFS_WAPBL_END(vp->v_mount); if (error) goto out; } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. 
*/ switch (vp->v_type) { case VDIR: error = EISDIR; goto out; case VCHR: case VBLK: case VFIFO: break; case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } if ((ip->i_flags & SF_SNAPSHOT) != 0) { error = EPERM; goto out; } error = ufs_truncate_retry(vp, 0, vap->va_size, cred); if (error) goto out; break; default: error = EOPNOTSUPP; goto out; } } ip = VTOI(vp); if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } if ((ip->i_flags & SF_SNAPSHOT) != 0) { error = EPERM; goto out; } error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, NULL, genfs_can_chtimes(vp, cred, ip->i_uid, vap->va_vaflags)); if (error) goto out; error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) goto out; if (vap->va_atime.tv_sec != VNOVAL) if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) ip->i_flag |= IN_ACCESS; if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (vp->v_mount->mnt_flag & MNT_RELATIME) ip->i_flag |= IN_ACCESS; } if (vap->va_birthtime.tv_sec != VNOVAL && ip->i_ump->um_fstype == UFS2) { ip->i_ffs2_birthtime = vap->va_birthtime.tv_sec; ip->i_ffs2_birthnsec = vap->va_birthtime.tv_nsec; } error = UFS_UPDATE(vp, &vap->va_atime, &vap->va_mtime, 0); UFS_WAPBL_END(vp->v_mount); if (error) goto out; } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH))) { error = EPERM; goto out; } error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) goto out; error = ufs_chmod(vp, (int)vap->va_mode, cred, l); UFS_WAPBL_END(vp->v_mount); } out: cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid, !HAS_ACLS(ip)); return (error); } #ifdef UFS_ACL static int ufs_update_nfs4_acl_after_mode_change(struct vnode *vp, int mode, int file_owner_id, kauth_cred_t cred, struct lwp *l) { int error; struct acl *aclp; aclp = acl_alloc(KM_SLEEP); error = ufs_getacl_nfs4_internal(vp, aclp, l); /* * We don't have to handle EOPNOTSUPP here, as the filesystem claims * it supports ACLs. */ if (error) goto out; acl_nfs4_sync_acl_from_mode(aclp, mode, file_owner_id); error = ufs_setacl_nfs4_internal(vp, aclp, l, false); out: acl_free(aclp); return (error); } #endif /* UFS_ACL */ /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l) { struct inode *ip; int error; UFS_WAPBL_JLOCK_ASSERT(vp->v_mount); ip = VTOI(vp); #ifdef UFS_ACL /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred)) != 0) return error; #endif error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp, NULL, genfs_can_chmod(vp, cred, ip->i_uid, ip->i_gid, mode)); if (error) return (error); #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & MNT_NFS4ACLS) != 0) { error = ufs_update_nfs4_acl_after_mode_change(vp, mode, ip->i_uid, cred, l); if (error) return error; } #endif ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; DIP_ASSIGN(ip, mode, ip->i_mode); UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid, !HAS_ACLS(ip)); return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. 
*/ static int ufs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, struct lwp *l) { struct inode *ip; int error = 0; #if defined(QUOTA) || defined(QUOTA2) uid_t ouid; gid_t ogid; int64_t change; #endif ip = VTOI(vp); error = 0; if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; #ifdef UFS_ACL /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred)) != 0) return error; #endif error = kauth_authorize_vnode(cred, KAUTH_VNODE_CHANGE_OWNERSHIP, vp, NULL, genfs_can_chown(vp, cred, ip->i_uid, ip->i_gid, uid, gid)); if (error) return (error); #if defined(QUOTA) || defined(QUOTA2) ogid = ip->i_gid; ouid = ip->i_uid; change = DIP(ip, blocks); (void) chkdq(ip, -change, cred, 0); (void) chkiq(ip, -1, cred, 0); #endif ip->i_gid = gid; DIP_ASSIGN(ip, gid, gid); ip->i_uid = uid; DIP_ASSIGN(ip, uid, uid); #if defined(QUOTA) || defined(QUOTA2) if ((error = chkdq(ip, change, cred, 0)) == 0) { if ((error = chkiq(ip, 1, cred, 0)) == 0) goto good; else (void) chkdq(ip, -change, cred, FORCE); } ip->i_gid = ogid; DIP_ASSIGN(ip, gid, ogid); ip->i_uid = ouid; DIP_ASSIGN(ip, uid, ouid); (void) chkdq(ip, change, cred, FORCE); (void) chkiq(ip, 1, cred, FORCE); return (error); good: #endif /* QUOTA || QUOTA2 */ ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid, !HAS_ACLS(ip)); return (0); } int ufs_remove(void *v) { struct vop_remove_v3_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; nlink_t ctx_vp_new_nlink; } */ *ap = v; struct vnode *vp, *dvp; struct inode *ip; struct mount *mp; int error; struct ufs_lookup_results *ulr; vp = ap->a_vp; dvp = ap->a_dvp; ip = VTOI(vp); mp = dvp->v_mount; KASSERT(mp == vp->v_mount); /* XXX Not stable without lock. */ #ifdef UFS_ACL #ifdef notyet /* We don't do this because if the filesystem is mounted without ACLs * this goes through vfs_unixify_accmode() and we get EPERM. */ error = VOP_ACCESSX(vp, VDELETE, ap->a_cnp->cn_cred); if (error) goto err; #endif #endif /* XXX should handle this material another way */ ulr = &VTOI(dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); if (vp->v_type == VDIR || (ip->i_flags & (IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) error = EPERM; else { error = UFS_WAPBL_BEGIN(mp); if (error == 0) { error = ufs_dirremove(dvp, ulr, ip, ap->a_cnp->cn_flags, 0); UFS_WAPBL_END(mp); if (error == 0) { ap->ctx_vp_new_nlink = ip->i_nlink; } } } #ifdef notyet err: #endif if (dvp == vp) vrele(vp); else vput(vp); return (error); } /* * ufs_link: create hard link. */ int ufs_link(void *v) { struct vop_link_v2_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; struct mount *mp = dvp->v_mount; struct inode *ip; struct direct *newdir; int error, abrt = 1; struct ufs_lookup_results *ulr; KASSERT(dvp != vp); KASSERT(vp->v_type != VDIR); KASSERT(mp == vp->v_mount); /* XXX Not stable without lock. 
*/ /* XXX should handle this material another way */ ulr = &VTOI(dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); error = vn_lock(vp, LK_EXCLUSIVE); if (error) goto out2; ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { error = EMLINK; goto out1; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out1; } error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_ADD_LINK, vp, dvp, 0); if (error) goto out1; error = UFS_WAPBL_BEGIN(mp); if (error) goto out1; ip->i_nlink++; DIP_ASSIGN(ip, nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; abrt = 0; error = UFS_UPDATE(vp, NULL, NULL, UPDATE_DIROP); if (!error) { newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); ufs_makedirentry(ip, cnp, newdir); error = ufs_direnter(dvp, ulr, vp, newdir, cnp, NULL); pool_cache_put(ufs_direct_cache, newdir); } if (error) { ip->i_nlink--; DIP_ASSIGN(ip, nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(vp, NULL, NULL, UPDATE_DIROP); } UFS_WAPBL_END(mp); out1: VOP_UNLOCK(vp); out2: if (abrt) VOP_ABORTOP(dvp, cnp); return (error); } /* * whiteout vnode call */ int ufs_whiteout(void *v) { struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap = v; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct direct *newdir; int error; struct ufsmount *ump = VFSTOUFS(dvp->v_mount); struct ufs_lookup_results *ulr; /* XXX should handle this material another way */ ulr = &VTOI(dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); error = 0; switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ if (ump->um_maxsymlinklen > 0) return (0); return (EOPNOTSUPP); case CREATE: /* create a new directory whiteout */ error = UFS_WAPBL_BEGIN(dvp->v_mount); if (error) break; KASSERTMSG((ump->um_maxsymlinklen > 0), "ufs_whiteout: old format filesystem"); newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); newdir->d_ino = UFS_WINO; newdir->d_namlen = cnp->cn_namelen; memcpy(newdir->d_name, cnp->cn_nameptr, (size_t)cnp->cn_namelen); /* NUL terminate and zero out padding */ memset(&newdir->d_name[cnp->cn_namelen], 0, UFS_NAMEPAD(cnp->cn_namelen)); newdir->d_type = DT_WHT; error = ufs_direnter(dvp, ulr, NULL, newdir, cnp, NULL); pool_cache_put(ufs_direct_cache, newdir); break; case DELETE: /* remove an existing directory whiteout */ error = UFS_WAPBL_BEGIN(dvp->v_mount); if (error) break; KASSERTMSG((ump->um_maxsymlinklen > 0), "ufs_whiteout: old format filesystem"); cnp->cn_flags &= ~DOWHITEOUT; error = ufs_dirremove(dvp, ulr, NULL, cnp->cn_flags, 0); break; default: panic("ufs_whiteout: unknown op"); /* NOTREACHED */ } UFS_WAPBL_END(dvp->v_mount); return (error); } #ifdef UFS_ACL static int ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, mode_t dmode, kauth_cred_t cred, struct lwp *l) { int error; struct inode *ip = VTOI(tvp); struct acl *dacl, *acl; acl = acl_alloc(KM_SLEEP); dacl = acl_alloc(KM_SLEEP); /* * Retrieve default ACL from parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. If the ACL is empty, fall through to * the "not defined or available" case. */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); ip->i_mode = dmode; DIP_ASSIGN(ip, mode, dmode); *dacl = *acl; ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. 
*/ ip->i_mode = dmode; DIP_ASSIGN(ip, mode, dmode); error = 0; goto out; default: goto out; } /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ UFS_WAPBL_END(tvp->v_mount); error = ufs_setacl_posix1e(tvp, ACL_TYPE_ACCESS, acl, cred, l); if (error == 0) error = ufs_setacl_posix1e(tvp, ACL_TYPE_DEFAULT, dacl, cred, l); UFS_WAPBL_BEGIN(tvp->v_mount); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above * was supposed to free acl. */ printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); /* panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); acl_free(dacl); return (error); } static int ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, mode_t mode, kauth_cred_t cred, struct lwp *l) { int error; struct inode *ip = VTOI(tvp); struct acl *acl; acl = acl_alloc(KM_SLEEP); /* * Retrieve default ACL for parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. */ if (acl->acl_cnt != 0) { /* * Two possible ways for default ACL to not * be present. First, the EA can be * undefined, or second, the default ACL can * be blank. If it's blank, fall through to * the it's not defined case. */ mode = acl_posix1e_newfilemode(mode, acl); ip->i_mode = mode; DIP_ASSIGN(ip, mode, mode); ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = mode; DIP_ASSIGN(ip, mode, mode); error = 0; goto out; default: goto out; } UFS_WAPBL_END(tvp->v_mount); /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred); UFS_WAPBL_BEGIN(tvp->v_mount); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above was * supposed to free acl. 
*/ printf("%s: VOP_GETACL() but no VOP_SETACL()\n", __func__); /* panic("%s: VOP_GETACL() but no VOP_SETACL()", __func__); */ break; default: goto out; } out: acl_free(acl); return (error); } static int ufs_do_nfs4_acl_inheritance(struct vnode *dvp, struct vnode *tvp, mode_t child_mode, kauth_cred_t cred, struct lwp *l) { int error; struct acl *parent_aclp, *child_aclp; parent_aclp = acl_alloc(KM_SLEEP); child_aclp = acl_alloc(KM_SLEEP); error = ufs_getacl_nfs4_internal(dvp, parent_aclp, l); if (error) goto out; acl_nfs4_compute_inherited_acl(parent_aclp, child_aclp, child_mode, VTOI(tvp)->i_uid, tvp->v_type == VDIR); error = ufs_setacl_nfs4_internal(tvp, child_aclp, l, false); if (error) goto out; out: acl_free(parent_aclp); acl_free(child_aclp); return (error); } #endif int ufs_mkdir(void *v) { struct vop_mkdir_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; struct vnode *dvp = ap->a_dvp, *tvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp = VTOI(dvp); struct buf *bp; struct dirtemplate dirtemplate; struct direct *newdir; int error; struct ufsmount *ump = dp->i_ump; int dirblksiz = ump->um_dirblksiz; struct ufs_lookup_results *ulr; /* XXX should handle this material another way */ ulr = &dp->i_crap; UFS_CHECK_CRAPCOUNTER(dp); KASSERT(vap->va_type == VDIR); if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, ap->a_vpp); if (error) goto out; error = vn_lock(*ap->a_vpp, LK_EXCLUSIVE); if (error) { vrele(*ap->a_vpp); *ap->a_vpp = NULL; goto out; } error = UFS_WAPBL_BEGIN(ap->a_dvp->v_mount); if (error) { vput(*ap->a_vpp); goto out; } tvp = *ap->a_vpp; ip = VTOI(tvp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_nlink = 2; DIP_ASSIGN(ip, nlink, 2); if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_ASSIGN(ip, flags, ip->i_flags); } /* * Bump link count in parent directory to reflect work done below. * Should be done before reference is created so cleanup is * possible if we crash. */ dp->i_nlink++; DIP_ASSIGN(dp, nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if ((error = UFS_UPDATE(dvp, NULL, NULL, UPDATE_DIROP)) != 0) goto bad; #ifdef UFS_ACL mode_t dmode = (vap->va_mode & 0777) | IFDIR; struct lwp *l = curlwp; if (dvp->v_mount->mnt_flag & MNT_POSIX1EACLS) { error = ufs_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode, cnp->cn_cred, l); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, dmode, cnp->cn_cred, l); if (error) goto bad; } #endif /* !UFS_ACL */ /* * Initialize directory with "." and ".." from static template. 
*/ dirtemplate = mastertemplate; dirtemplate.dotdot_reclen = dirblksiz - dirtemplate.dot_reclen; dirtemplate.dot_ino = ufs_rw32(ip->i_number, UFS_MPNEEDSWAP(ump)); dirtemplate.dotdot_ino = ufs_rw32(dp->i_number, UFS_MPNEEDSWAP(ump)); dirtemplate.dot_reclen = ufs_rw16(dirtemplate.dot_reclen, UFS_MPNEEDSWAP(ump)); dirtemplate.dotdot_reclen = ufs_rw16(dirtemplate.dotdot_reclen, UFS_MPNEEDSWAP(ump)); if (ump->um_maxsymlinklen <= 0) { #if BYTE_ORDER == LITTLE_ENDIAN if (UFS_MPNEEDSWAP(ump) == 0) #else if (UFS_MPNEEDSWAP(ump) != 0) #endif { dirtemplate.dot_type = dirtemplate.dot_namlen; dirtemplate.dotdot_type = dirtemplate.dotdot_namlen; dirtemplate.dot_namlen = dirtemplate.dotdot_namlen = 0; } else dirtemplate.dot_type = dirtemplate.dotdot_type = 0; } if ((error = UFS_BALLOC(tvp, (off_t)0, dirblksiz, cnp->cn_cred, B_CLRBUF, &bp)) != 0) goto bad; ip->i_size = dirblksiz; DIP_ASSIGN(ip, size, dirblksiz); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; uvm_vnp_setsize(tvp, ip->i_size); memcpy((void *)bp->b_data, (void *)&dirtemplate, sizeof dirtemplate); /* * Directory set up, now install its entry in the parent directory. * We must write out the buffer containing the new directory body * before entering the new name in the parent. */ if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) goto bad; if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0) { goto bad; } newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); ufs_makedirentry(ip, cnp, newdir); error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, bp); pool_cache_put(ufs_direct_cache, newdir); bad: if (error == 0) { VOP_UNLOCK(tvp); UFS_WAPBL_END(dvp->v_mount); } else { dp->i_nlink--; DIP_ASSIGN(dp, nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); /* * No need to do an explicit UFS_TRUNCATE here, vrele will * do this for us because we set the link count to 0. */ ip->i_nlink = 0; DIP_ASSIGN(ip, nlink, 0); ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(tvp, NULL, NULL, UPDATE_DIROP); UFS_WAPBL_END(dvp->v_mount); vput(tvp); } out: return (error); } int ufs_rmdir(void *v) { struct vop_rmdir_v2_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; struct vnode *vp, *dvp; struct componentname *cnp; struct inode *ip, *dp; int error; struct ufs_lookup_results *ulr; vp = ap->a_vp; dvp = ap->a_dvp; cnp = ap->a_cnp; ip = VTOI(vp); dp = VTOI(dvp); #ifdef UFS_ACL #ifdef notyet /* We don't do this because if the filesystem is mounted without ACLs * this goes through vfs_unixify_accmode() and we get EPERM. */ error = VOP_ACCESSX(vp, VDELETE, cnp->cn_cred); if (error) goto err; #endif #endif /* XXX should handle this material another way */ ulr = &dp->i_crap; UFS_CHECK_CRAPCOUNTER(dp); /* * No rmdir "." or of mounted directories please. */ if (dp == ip || vp->v_mountedhere != NULL) { error = EINVAL; goto err; } /* * Do not remove a directory that is in the process of being renamed. * Verify that the directory is empty (and valid). (Rmdir ".." won't * be valid since ".." will contain a reference to the current * directory and thus be non-empty.) */ error = 0; if (ip->i_nlink != 2 || !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (IMMUTABLE | APPEND))) { error = EPERM; goto out; } error = UFS_WAPBL_BEGIN(dvp->v_mount); if (error) goto out; /* * Delete reference to directory before purging * inode. 
If we crash in between, the directory * will be reattached to lost+found, */ error = ufs_dirremove(dvp, ulr, ip, cnp->cn_flags, 1); if (error) { UFS_WAPBL_END(dvp->v_mount); goto out; } cache_purge(dvp); /* * Truncate inode. The only stuff left in the directory is "." and * "..". The "." reference is inconsequential since we're quashing * it. */ dp->i_nlink--; DIP_ASSIGN(dp, nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); ip->i_nlink--; DIP_ASSIGN(ip, nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; (void) UFS_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred); cache_purge(vp); /* * Unlock the log while we still have reference to unlinked * directory vp so that it will not get locked for recycling */ UFS_WAPBL_END(dvp->v_mount); #ifdef UFS_DIRHASH if (ip->i_dirhash != NULL) ufsdirhash_free(ip); #endif out: vput(vp); return error; err: if (dp == ip) vrele(vp); else vput(vp); return error; } /* * symlink -- make a symbolic link */ int ufs_symlink(void *v) { struct vop_symlink_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap = v; struct vnode *vp, **vpp; struct inode *ip; int len, error; struct ufs_lookup_results *ulr; vpp = ap->a_vpp; /* XXX should handle this material another way */ ulr = &VTOI(ap->a_dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp)); /* * UFS_WAPBL_BEGIN(dvp->v_mount) performed by successful * ufs_makeinode */ KASSERT(ap->a_vap->va_type == VLNK); error = ufs_makeinode(ap->a_vap, ap->a_dvp, ulr, vpp, ap->a_cnp); if (error) goto out; vp = *vpp; len = strlen(ap->a_target); ip = VTOI(vp); /* * This test is off by one. um_maxsymlinklen contains the * number of bytes available, and we aren't storing a \0, so * the test should properly be <=. However, it cannot be * changed as this would break compatibility with existing fs * images -- see the way ufs_readlink() works. */ if (len < ip->i_ump->um_maxsymlinklen) { memcpy((char *)SHORTLINK(ip), ap->a_target, len); ip->i_size = len; DIP_ASSIGN(ip, size, len); uvm_vnp_setsize(vp, ip->i_size); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (vp->v_mount->mnt_flag & MNT_RELATIME) ip->i_flag |= IN_ACCESS; UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); } else error = ufs_bufio(UIO_WRITE, vp, ap->a_target, len, (off_t)0, IO_NODELOCKED | IO_JOURNALLOCKED, ap->a_cnp->cn_cred, NULL, NULL); UFS_WAPBL_END(ap->a_dvp->v_mount); VOP_UNLOCK(vp); if (error) vrele(vp); out: return (error); } /* * Vnode op for reading directories. * * This routine handles converting from the on-disk directory format * "struct direct" to the in-memory format "struct dirent" as well as * byte swapping the entries if necessary. 
*/ int ufs_readdir(void *v) { struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; int *a_eofflag; off_t **a_cookies; int *a_ncookies; } */ *ap = v; /* vnode and fs */ struct vnode *vp = ap->a_vp; struct ufsmount *ump = VFSTOUFS(vp->v_mount); int nswap = UFS_MPNEEDSWAP(ump); #if BYTE_ORDER == LITTLE_ENDIAN int needswap = ump->um_maxsymlinklen <= 0 && nswap == 0; #else int needswap = ump->um_maxsymlinklen <= 0 && nswap != 0; #endif /* caller's buffer */ struct uio *calleruio = ap->a_uio; off_t startoffset, endoffset; size_t callerbytes; off_t curoffset; /* dirent production buffer */ char *direntbuf; size_t direntbufmax; struct dirent *dirent, *stopdirent; /* output cookies array */ off_t *cookies; size_t numcookies, maxcookies; /* disk buffer */ off_t physstart, physend; size_t skipstart, dropend; char *rawbuf; size_t rawbufmax, rawbytes; struct uio rawuio; struct iovec rawiov; struct direct *rawdp, *stoprawdp; /* general */ int error; KASSERT(VOP_ISLOCKED(vp)); /* * Figure out where the user wants us to read and how much. * * XXX: there should probably be an upper bound on callerbytes * to avoid silliness trying to do large kernel allocations. */ callerbytes = calleruio->uio_resid; startoffset = calleruio->uio_offset; endoffset = startoffset + callerbytes; if (callerbytes < _DIRENT_MINSIZE(dirent)) { /* no room for even one struct dirent */ return EINVAL; } /* * Now figure out where to actually start reading. Round the * start down to a block boundary: we need to start at the * beginning of a block in order to read the directory * correctly. * * We also want to always read a whole number of blocks so * that the copying code below doesn't have to worry about * partial entries. (It used to try at one point, and was a * horrible mess.) * * Furthermore, since blocks have to be scanned from the * beginning, if we go partially into another block now we'll * just have to rescan it on the next readdir call, which * doesn't really serve any useful purpose. * * So, round down the end as well. It's ok to underpopulate * the transfer buffer, as long as we send back at least one * dirent so as to avoid giving a bogus EOF indication. * * Note that because dirents are larger than ffs struct * directs, despite the rounding down we may not be able to * send all the entries in the blocks we read and may have to * rescan some of them on the next call anyway. Alternatively * if there's empty space on disk we might have actually been * able to fit the next block in, and so forth. None of this * actually matters that much in practice. * * XXX: what does ffs do if a directory block becomes * completely empty, and what happens if all the blocks we * read are completely empty even though we aren't at EOF? As * of this writing I (dholland) can't remember the details. */ physstart = rounddown2(startoffset, ump->um_dirblksiz); physend = rounddown2(endoffset, ump->um_dirblksiz); if (physstart >= physend) { /* Need at least one block */ return EINVAL; } /* * skipstart is the number of bytes we need to read in * (because we need to start at the beginning of a block) but * not transfer to the user. * * dropend is the number of bytes to ignore at the end of the * user's buffer. */ skipstart = startoffset - physstart; dropend = endoffset - physend; /* * Make a transfer buffer. * * Note: rawbufmax = physend - physstart. 
Proof: * * physend - physstart = physend - physstart * = physend - physstart + startoffset - startoffset * = physend + (startoffset - physstart) - startoffset * = physend + skipstart - startoffset * = physend + skipstart - startoffset + endoffset - endoffset * = skipstart - startoffset + endoffset - (endoffset - physend) * = skipstart - startoffset + endoffset - dropend * = skipstart - startoffset + (startoffset + callerbytes) - dropend * = skipstart + callerbytes - dropend * = rawbufmax * Qed. * * XXX: this should just use physend - physstart. * * XXX: this should be rewritten to read the directs straight * out of bufferio buffers instead of copying twice. This would * also let us adapt better to the user's buffer size. */ /* Base buffer space for CALLERBYTES of new data */ rawbufmax = callerbytes + skipstart; if (rawbufmax < callerbytes) return EINVAL; rawbufmax -= dropend; if (rawbufmax < _DIRENT_MINSIZE(rawdp)) { /* no room for even one struct direct */ return EINVAL; } /* read it */ rawbuf = kmem_alloc(rawbufmax, KM_SLEEP); rawiov.iov_base = rawbuf; rawiov.iov_len = rawbufmax; rawuio.uio_iov = &rawiov; rawuio.uio_iovcnt = 1; rawuio.uio_offset = physstart; rawuio.uio_resid = rawbufmax; UIO_SETUP_SYSSPACE(&rawuio); rawuio.uio_rw = UIO_READ; error = UFS_BUFRD(vp, &rawuio, 0, ap->a_cred); if (error != 0) { kmem_free(rawbuf, rawbufmax); return error; } rawbytes = rawbufmax - rawuio.uio_resid; /* the raw entries to iterate over */ rawdp = (struct direct *)(void *)rawbuf; stoprawdp = (struct direct *)(void *)&rawbuf[rawbytes]; /* allocate space to produce dirents into */ direntbufmax = callerbytes; direntbuf = kmem_alloc(direntbufmax, KM_SLEEP); /* the dirents to iterate over */ dirent = (struct dirent *)(void *)direntbuf; stopdirent = (struct dirent *)(void *)&direntbuf[direntbufmax]; /* the output "cookies" (seek positions of directory entries) */ if (ap->a_cookies) { numcookies = 0; maxcookies = rawbytes / _DIRENT_RECLEN(rawdp, 1); cookies = malloc(maxcookies * sizeof(*cookies), M_TEMP, M_WAITOK); } else { /* XXX: GCC */ maxcookies = 0; cookies = NULL; } /* now produce the dirents */ curoffset = calleruio->uio_offset; while (rawdp < stoprawdp) { rawdp->d_reclen = ufs_rw16(rawdp->d_reclen, nswap); if (skipstart > 0) { /* drain skipstart */ if (rawdp->d_reclen <= skipstart) { skipstart -= rawdp->d_reclen; rawdp = _DIRENT_NEXT(rawdp); continue; } /* caller's start position wasn't on an entry */ error = EINVAL; goto out; } if (rawdp->d_reclen == 0) { struct dirent *save = dirent; dirent->d_reclen = _DIRENT_MINSIZE(dirent); dirent = _DIRENT_NEXT(dirent); save->d_reclen = 0; rawdp = stoprawdp; break; } /* copy the header */ if (needswap) { dirent->d_type = rawdp->d_namlen; dirent->d_namlen = rawdp->d_type; } else { dirent->d_type = rawdp->d_type; dirent->d_namlen = rawdp->d_namlen; } dirent->d_reclen = _DIRENT_RECLEN(dirent, dirent->d_namlen); /* stop if there isn't room for the name AND another header */ if ((char *)(void *)dirent + dirent->d_reclen + _DIRENT_MINSIZE(dirent) > (char *)(void *)stopdirent) break; /* copy the name (and inode (XXX: why after the test?)) */ dirent->d_fileno = ufs_rw32(rawdp->d_ino, nswap); (void)memcpy(dirent->d_name, rawdp->d_name, dirent->d_namlen); memset(&dirent->d_name[dirent->d_namlen], 0, dirent->d_reclen - _DIRENT_NAMEOFF(dirent) - dirent->d_namlen); /* onward */ curoffset += rawdp->d_reclen; if (ap->a_cookies) { KASSERT(numcookies < maxcookies); cookies[numcookies++] = curoffset; } dirent = _DIRENT_NEXT(dirent); rawdp = _DIRENT_NEXT(rawdp); } /* transfer 
the dirents to the caller's buffer */ callerbytes = ((char *)(void *)dirent - direntbuf); error = uiomove(direntbuf, callerbytes, calleruio); out: calleruio->uio_offset = curoffset; if (ap->a_cookies) { if (error) { free(cookies, M_TEMP); *ap->a_cookies = NULL; *ap->a_ncookies = 0; } else { *ap->a_cookies = cookies; *ap->a_ncookies = numcookies; } } kmem_free(direntbuf, direntbufmax); kmem_free(rawbuf, rawbufmax); *ap->a_eofflag = VTOI(vp)->i_size <= calleruio->uio_offset; return error; } /* * Return target name of a symbolic link */ int ufs_readlink(void *v) { struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ufsmount *ump = VFSTOUFS(vp->v_mount); int isize; /* * The test against um_maxsymlinklen is off by one; it should * theoretically be <=, not <. However, it cannot be changed * as that would break compatibility with existing fs images. */ isize = ip->i_size; if (isize < ump->um_maxsymlinklen || (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0)) { uiomove((char *)SHORTLINK(ip), isize, ap->a_uio); return (0); } return (UFS_BUFRD(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int ufs_strategy(void *v) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap = v; struct buf *bp; struct vnode *vp; struct inode *ip; struct mount *mp; int error; bp = ap->a_bp; vp = ap->a_vp; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ufs_strategy: spec"); KASSERT(fstrans_held(vp->v_mount)); KASSERT(bp->b_bcount != 0); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL); if (error) { bp->b_error = error; biodone(bp); return (error); } if (bp->b_blkno == -1) /* no valid data */ clrbuf(bp); } if (bp->b_blkno < 0) { /* block is not on disk */ biodone(bp); return (0); } vp = ip->i_devvp; error = VOP_STRATEGY(vp, bp); if (error) return error; if (!BUF_ISREAD(bp)) return 0; mp = wapbl_vptomp(vp); if (mp == NULL || mp->mnt_wapbl_replay == NULL || !WAPBL_REPLAY_ISOPEN(mp) || !WAPBL_REPLAY_CAN_READ(mp, bp->b_blkno, bp->b_bcount)) return 0; error = biowait(bp); if (error) return error; error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno, bp->b_bcount); if (error) { mutex_enter(&bufcache_lock); SET(bp->b_cflags, BC_INVAL); mutex_exit(&bufcache_lock); } return error; } /* * Print out the contents of an inode. */ int ufs_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp; struct inode *ip; vp = ap->a_vp; ip = VTOI(vp); printf("tag VT_UFS, ino %llu, on dev %llu, %llu", (unsigned long long)ip->i_number, (unsigned long long)major(ip->i_dev), (unsigned long long)minor(ip->i_dev)); printf(" flags 0x%x, nlink %d\n", ip->i_flag, ip->i_nlink); printf("\tmode 0%o, owner %d, group %d, size %qd", ip->i_mode, ip->i_uid, ip->i_gid, (long long)ip->i_size); if (vp->v_type == VFIFO) VOCALL(fifo_vnodeop_p, VOFFSET(vop_print), v); printf("\n"); return (0); } /* * Read wrapper for special devices. */ int ufsspec_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; /* * Set access flag. */ if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0) VTOI(ap->a_vp)->i_flag |= IN_ACCESS; return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for special devices. 
*/ int ufsspec_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; /* * Set update and change flags. */ if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0) VTOI(ap->a_vp)->i_flag |= IN_MODIFY; return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for special devices. * * Update the times on the inode then do device close. */ int ufsspec_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp; vp = ap->a_vp; if (vrefcnt(vp) > 1) UFS_ITIMES(vp, NULL, NULL, NULL); return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Read wrapper for fifo's */ int ufsfifo_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; /* * Set access flag. */ VTOI(ap->a_vp)->i_flag |= IN_ACCESS; return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for fifo's. */ int ufsfifo_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; /* * Set update and change flags. */ VTOI(ap->a_vp)->i_flag |= IN_MODIFY; return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for fifo's. * * Update the times on the inode then do device close. */ int ufsfifo_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp; vp = ap->a_vp; if (vrefcnt(ap->a_vp) > 1) UFS_ITIMES(vp, NULL, NULL, NULL); return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Return POSIX pathconf information applicable to ufs filesystems. */ int ufs_pathconf(void *v) { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_NAME_MAX: *ap->a_retval = FFS_MAXNAMLEN; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); #ifdef UFS_ACL case _PC_ACL_EXTENDED: if (ap->a_vp->v_mount->mnt_flag & MNT_POSIX1EACLS) *ap->a_retval = 1; else *ap->a_retval = 0; return 0; case _PC_ACL_NFS4: if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; return 0; #endif case _PC_ACL_PATH_MAX: #ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & (MNT_POSIX1EACLS | MNT_NFS4ACLS)) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; #else *ap->a_retval = 3; #endif return 0; case _PC_SYNC_IO: *ap->a_retval = 1; return (0); case _PC_FILESIZEBITS: *ap->a_retval = 42; return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_2_SYMLINKS: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Advisory record locking support */ int ufs_advlock(void *v) { struct vop_advlock_args /* { struct vnode *a_vp; void * a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap = v; struct inode *ip; ip = VTOI(ap->a_vp); return lf_advlock(ap, &ip->i_lockf, ip->i_size); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. 
*/ void ufs_vinit(struct mount *mntp, int (**specops)(void *), int (**fifoops)(void *), struct vnode **vpp) { struct timeval tv; struct inode *ip; struct vnode *vp; dev_t rdev; struct ufsmount *ump; vp = *vpp; ip = VTOI(vp); switch(vp->v_type = IFTOVT(ip->i_mode)) { case VCHR: case VBLK: vp->v_op = specops; ump = ip->i_ump; if (ump->um_fstype == UFS1) rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev, UFS_MPNEEDSWAP(ump)); else rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev, UFS_MPNEEDSWAP(ump)); spec_node_init(vp, rdev); break; case VFIFO: vp->v_op = fifoops; break; case VNON: case VBAD: case VSOCK: case VLNK: case VDIR: case VREG: break; } if (ip->i_number == UFS_ROOTINO) vp->v_vflag |= VV_ROOT; /* * Initialize modrev times */ getmicrouptime(&tv); ip->i_modrev = (uint64_t)(uint)tv.tv_sec << 32 | tv.tv_usec * 4294u; *vpp = vp; } /* * Allocate a new inode. */ static int ufs_makeinode(struct vattr *vap, struct vnode *dvp, const struct ufs_lookup_results *ulr, struct vnode **vpp, struct componentname *cnp) { struct inode *ip; struct direct *newdir; struct vnode *tvp; int error; UFS_WAPBL_JUNLOCK_ASSERT(dvp->v_mount); error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, &tvp); if (error) return error; error = vn_lock(tvp, LK_EXCLUSIVE); if (error) { vrele(tvp); return error; } *vpp = tvp; ip = VTOI(tvp); error = UFS_WAPBL_BEGIN(dvp->v_mount); if (error) { vput(tvp); return (error); } ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_nlink = 1; DIP_ASSIGN(ip, nlink, 1); /* Authorize setting SGID if needed. */ if (ip->i_mode & ISGID) { error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_WRITE_SECURITY, tvp, NULL, genfs_can_chmod(tvp, cnp->cn_cred, ip->i_uid, ip->i_gid, MAKEIMODE(vap->va_type, vap->va_mode))); if (error) { ip->i_mode &= ~ISGID; DIP_ASSIGN(ip, mode, ip->i_mode); } } if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_ASSIGN(ip, flags, ip->i_flags); } /* * Make sure inode goes to disk before directory entry. */ if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0) goto bad; #ifdef UFS_ACL struct lwp *l = curlwp; if (dvp->v_mount->mnt_flag & MNT_POSIX1EACLS) { error = ufs_do_posix1e_acl_inheritance_file(dvp, tvp, ip->i_mode, cnp->cn_cred, l); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, ip->i_mode, cnp->cn_cred, l); if (error) goto bad; } #endif /* !UFS_ACL */ newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); ufs_makedirentry(ip, cnp, newdir); error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, NULL); pool_cache_put(ufs_direct_cache, newdir); if (error) goto bad; *vpp = tvp; cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags); return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_nlink = 0; DIP_ASSIGN(ip, nlink, 0); ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(tvp, NULL, NULL, 0); UFS_WAPBL_END(dvp->v_mount); vput(tvp); return (error); } /* * Allocate len bytes at offset off. 
*/ int ufs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags, kauth_cred_t cred) { struct inode *ip = VTOI(vp); int error, delta, bshift, bsize; UVMHIST_FUNC("ufs_gop_alloc"); UVMHIST_CALLED(ubchist); error = 0; bshift = vp->v_mount->mnt_fs_bshift; bsize = 1 << bshift; delta = off & (bsize - 1); off -= delta; len += delta; while (len > 0) { bsize = MIN(bsize, len); error = UFS_BALLOC(vp, off, bsize, cred, flags, NULL); if (error) { goto out; } /* * increase file size now, UFS_BALLOC() requires that * EOF be up-to-date before each call. */ if (ip->i_size < off + bsize) { UVMHIST_LOG(ubchist, "vp %#jx old 0x%jx new 0x%x", (uintptr_t)vp, ip->i_size, off + bsize, 0); ip->i_size = off + bsize; DIP_ASSIGN(ip, size, ip->i_size); } off += bsize; len -= bsize; } out: UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); return error; } void ufs_gop_markupdate(struct vnode *vp, int flags) { u_int32_t mask = 0; if ((flags & GOP_UPDATE_ACCESSED) != 0) { mask = IN_ACCESS; } if ((flags & GOP_UPDATE_MODIFIED) != 0) { if (vp->v_type == VREG) { mask |= IN_CHANGE | IN_UPDATE; } else { mask |= IN_MODIFY; } } if (mask) { struct inode *ip = VTOI(vp); ip->i_flag |= mask; } } int ufs_bufio(enum uio_rw rw, struct vnode *vp, void *buf, size_t len, off_t off, int ioflg, kauth_cred_t cred, size_t *aresid, struct lwp *l) { struct iovec iov; struct uio uio; int error; KASSERT(ISSET(ioflg, IO_NODELOCKED)); KASSERT(VOP_ISLOCKED(vp)); KASSERT(rw != UIO_WRITE || VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(rw != UIO_WRITE || vp->v_mount->mnt_wapbl == NULL || ISSET(ioflg, IO_JOURNALLOCKED)); iov.iov_base = buf; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_resid = len; uio.uio_offset = off; uio.uio_rw = rw; UIO_SETUP_SYSSPACE(&uio); switch (rw) { case UIO_READ: error = UFS_BUFRD(vp, &uio, ioflg, cred); break; case UIO_WRITE: error = UFS_BUFWR(vp, &uio, ioflg, cred); break; default: panic("invalid uio rw: %d", (int)rw); } if (aresid) *aresid = uio.uio_resid; else if (uio.uio_resid && error == 0) error = EIO; KASSERT(VOP_ISLOCKED(vp)); KASSERT(rw != UIO_WRITE || VOP_ISLOCKED(vp) == LK_EXCLUSIVE); return error; }
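The transfer-buffer sizing argument in ufs_readdir() above is easier to follow with concrete numbers. Below is a small standalone userland sketch, not part of the kernel sources: DIRBLKSIZ and the sample offset and length are arbitrary illustration values, and rounddown2() is redefined locally as a stand-in for the kernel macro so the program is self-contained. It checks the identity rawbufmax = skipstart + callerbytes - dropend = physend - physstart for one example transfer (the overflow check done in the kernel is omitted).

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>

#define DIRBLKSIZ 512					/* hypothetical directory block size */
#define rounddown2(x, m) ((x) & ~((uintmax_t)(m) - 1))	/* local stand-in for the kernel macro */

int
main(void)
{
	off_t startoffset = 700;		/* example caller uio_offset */
	size_t callerbytes = 2000;		/* example caller uio_resid */
	off_t endoffset = startoffset + callerbytes;

	/* Round both ends down to a directory block boundary. */
	off_t physstart = rounddown2(startoffset, DIRBLKSIZ);
	off_t physend = rounddown2(endoffset, DIRBLKSIZ);

	/* Bytes read from disk but not handed to the caller, at either end. */
	size_t skipstart = startoffset - physstart;
	size_t dropend = endoffset - physend;

	/* Raw buffer size, computed the same way as in ufs_readdir(). */
	size_t rawbufmax = callerbytes + skipstart - dropend;

	assert(rawbufmax == (size_t)(physend - physstart));
	printf("physstart=%jd physend=%jd skipstart=%zu dropend=%zu rawbufmax=%zu\n",
	    (intmax_t)physstart, (intmax_t)physend, skipstart, dropend, rawbufmax);
	return 0;
}

With the example values this prints physstart=512 physend=2560 skipstart=188 dropend=140 rawbufmax=2048, matching physend - physstart as the comment's derivation promises.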
/* $NetBSD: clock.h,v 1.7 2023/10/27 14:34:58 jschauma Exp $ */ /*- * Copyright (c) 1996 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Gordon W. Ross * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _SYS_CLOCK_H_ #define _SYS_CLOCK_H_ #if !defined(_KERNEL) && !defined(_STANDALONE) #include <stdint.h> #endif /* Some handy constants. */ #define SECS_PER_MINUTE 60 #define SECS_PER_HOUR 3600 #define SECS_PER_DAY 86400 #define DAYS_PER_COMMON_YEAR 365 #define DAYS_PER_LEAP_YEAR 366 #define SECS_PER_COMMON_YEAR (SECS_PER_DAY * DAYS_PER_COMMON_YEAR) #define SECS_PER_LEAP_YEAR (SECS_PER_DAY * DAYS_PER_LEAP_YEAR) /* Traditional POSIX base year */ #define POSIX_BASE_YEAR 1970 /* Some handy functions */ static __inline int days_in_month(int m) { switch (m) { case 2: return 28; case 4: case 6: case 9: case 11: return 30; case 1: case 3: case 5: case 7: case 8: case 10: case 12: return 31; default: return -1; } } /* * This inline avoids some unnecessary modulo operations * as compared with the usual macro: * ( ((year % 4) == 0 && * (year % 100) != 0) || * ((year % 400) == 0) ) * It is otherwise equivalent. */ static __inline int is_leap_year(uint64_t year) { if ((year & 3) != 0) return 0; if (__predict_false((year % 100) != 0)) return 1; return __predict_false((year % 400) == 0); } static __inline int days_per_year(uint64_t year) { return is_leap_year(year) ? DAYS_PER_LEAP_YEAR : DAYS_PER_COMMON_YEAR; } #endif /* _SYS_CLOCK_H_ */
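As the comment above notes, is_leap_year() is just a cheaper form of the usual divisibility test, and days_in_month() deliberately reports 28 for February in every year, leaving the leap day to the caller. A minimal usage sketch follows; it assumes a NetBSD-style userland where <sys/clock.h> and __predict_false are available outside the kernel, and calendar_days() is a hypothetical helper written for this illustration, not part of the header.

#include <stdio.h>
#include <sys/clock.h>

/* Calendar length of a month: add the leap day that days_in_month() omits. */
static int
calendar_days(uint64_t year, int month)
{
	int days = days_in_month(month);

	if (month == 2 && is_leap_year(year))
		days++;
	return days;
}

int
main(void)
{
	printf("Feb 1900: %d days\n", calendar_days(1900, 2));	/* 28: divisible by 100, not 400 */
	printf("Feb 2000: %d days\n", calendar_days(2000, 2));	/* 29: divisible by 400 */
	printf("Feb 2024: %d days\n", calendar_days(2024, 2));	/* 29: divisible by 4 only */
	printf("2023: %d days\n", days_per_year(2023));		/* 365: common year */
	return 0;
}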
/* $NetBSD: hci_socket.c,v 1.47 2019/09/28 07:10:55 plunky Exp $ */ /*- * Copyright (c) 2005 Iain Hibbert. * Copyright (c) 2006 Itronix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of Itronix Inc. may not be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: hci_socket.c,v 1.47 2019/09/28 07:10:55 plunky Exp $"); /* load symbolic names */ #ifdef BLUETOOTH_DEBUG #define PRUREQUESTS #define PRCOREQUESTS #endif #include <sys/param.h> #include <sys/domain.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/mbuf.h> #include <sys/proc.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <netbt/bluetooth.h> #include <netbt/hci.h> /******************************************************************************* * * HCI SOCK_RAW Sockets - for control of Bluetooth Devices * */ /* * the raw HCI protocol control block */ struct hci_pcb { struct socket *hp_socket; /* socket */ kauth_cred_t hp_cred; /* owner credential */ unsigned int hp_flags; /* flags */ bdaddr_t hp_laddr; /* local address */ bdaddr_t hp_raddr; /* remote address */ struct hci_filter hp_efilter; /* user event filter */ struct hci_filter hp_pfilter; /* user packet filter */ LIST_ENTRY(hci_pcb) hp_next; /* next HCI pcb */ }; /* hp_flags */ #define HCI_DIRECTION (1<<1) /* direction control messages */ #define HCI_PROMISCUOUS (1<<2) /* listen to all units */ LIST_HEAD(hci_pcb_list, hci_pcb) hci_pcb = LIST_HEAD_INITIALIZER(hci_pcb); /* sysctl defaults */ int hci_sendspace = HCI_CMD_PKT_SIZE; int hci_recvspace = 4096; /* unprivileged commands opcode table */ static const struct { uint16_t opcode; uint8_t offs; /* 0 - 63 */ uint8_t mask; /* bit 0 - 7 */ uint8_t length; /* approved length */ } hci_cmds[] = { { HCI_CMD_INQUIRY, 0, 0x01, sizeof(hci_inquiry_cp) }, { HCI_CMD_REMOTE_NAME_REQ, 2, 0x08, sizeof(hci_remote_name_req_cp) }, { HCI_CMD_READ_REMOTE_FEATURES, 2, 0x20, sizeof(hci_read_remote_features_cp) }, {
HCI_CMD_READ_REMOTE_EXTENDED_FEATURES, 2, 0x40, sizeof(hci_read_remote_extended_features_cp) }, { HCI_CMD_READ_REMOTE_VER_INFO, 2, 0x80, sizeof(hci_read_remote_ver_info_cp) }, { HCI_CMD_READ_CLOCK_OFFSET, 3, 0x01, sizeof(hci_read_clock_offset_cp) }, { HCI_CMD_READ_LMP_HANDLE, 3, 0x02, sizeof(hci_read_lmp_handle_cp) }, { HCI_CMD_ROLE_DISCOVERY, 4, 0x80, sizeof(hci_role_discovery_cp) }, { HCI_CMD_READ_LINK_POLICY_SETTINGS, 5, 0x02, sizeof(hci_read_link_policy_settings_cp) }, { HCI_CMD_READ_DEFAULT_LINK_POLICY_SETTINGS, 5, 0x08, 0 }, { HCI_CMD_READ_PIN_TYPE, 6, 0x04, 0 }, { HCI_CMD_READ_LOCAL_NAME, 7, 0x02, 0 }, { HCI_CMD_READ_CON_ACCEPT_TIMEOUT, 7, 0x04, 0 }, { HCI_CMD_READ_PAGE_TIMEOUT, 7, 0x10, 0 }, { HCI_CMD_READ_SCAN_ENABLE, 7, 0x40, 0 }, { HCI_CMD_READ_PAGE_SCAN_ACTIVITY, 8, 0x01, 0 }, { HCI_CMD_READ_INQUIRY_SCAN_ACTIVITY, 8, 0x04, 0 }, { HCI_CMD_READ_AUTH_ENABLE, 8, 0x10, 0 }, { HCI_CMD_READ_ENCRYPTION_MODE, 8, 0x40, 0 }, { HCI_CMD_READ_UNIT_CLASS, 9, 0x01, 0 }, { HCI_CMD_READ_VOICE_SETTING, 9, 0x04, 0 }, { HCI_CMD_READ_AUTO_FLUSH_TIMEOUT, 9, 0x10, sizeof(hci_read_auto_flush_timeout_cp) }, { HCI_CMD_READ_NUM_BROADCAST_RETRANS, 9, 0x40, 0 }, { HCI_CMD_READ_HOLD_MODE_ACTIVITY, 10, 0x01, 0 }, { HCI_CMD_READ_XMIT_LEVEL, 10, 0x04, sizeof(hci_read_xmit_level_cp) }, { HCI_CMD_READ_SCO_FLOW_CONTROL, 10, 0x08, 0 }, { HCI_CMD_READ_LINK_SUPERVISION_TIMEOUT, 11, 0x01, sizeof(hci_read_link_supervision_timeout_cp) }, { HCI_CMD_READ_NUM_SUPPORTED_IAC, 11, 0x04, 0 }, { HCI_CMD_READ_IAC_LAP, 11, 0x08, 0 }, { HCI_CMD_READ_PAGE_SCAN_PERIOD, 11, 0x20, 0 }, { HCI_CMD_READ_PAGE_SCAN, 11, 0x80, 0 }, { HCI_CMD_READ_INQUIRY_SCAN_TYPE, 12, 0x10, 0 }, { HCI_CMD_READ_INQUIRY_MODE, 12, 0x40, 0 }, { HCI_CMD_READ_PAGE_SCAN_TYPE, 13, 0x01, 0 }, { HCI_CMD_READ_AFH_ASSESSMENT, 13, 0x04, 0 }, { HCI_CMD_READ_LOCAL_VER, 14, 0x08, 0 }, { HCI_CMD_READ_LOCAL_COMMANDS, 14, 0x10, 0 }, { HCI_CMD_READ_LOCAL_FEATURES, 14, 0x20, 0 }, { HCI_CMD_READ_LOCAL_EXTENDED_FEATURES, 14, 0x40, sizeof(hci_read_local_extended_features_cp) }, { HCI_CMD_READ_BUFFER_SIZE, 14, 0x80, 0 }, { HCI_CMD_READ_COUNTRY_CODE, 15, 0x01, 0 }, { HCI_CMD_READ_BDADDR, 15, 0x02, 0 }, { HCI_CMD_READ_FAILED_CONTACT_CNTR, 15, 0x04, sizeof(hci_read_failed_contact_cntr_cp) }, { HCI_CMD_READ_LINK_QUALITY, 15, 0x10, sizeof(hci_read_link_quality_cp) }, { HCI_CMD_READ_RSSI, 15, 0x20, sizeof(hci_read_rssi_cp) }, { HCI_CMD_READ_AFH_CHANNEL_MAP, 15, 0x40, sizeof(hci_read_afh_channel_map_cp) }, { HCI_CMD_READ_CLOCK, 15, 0x80, sizeof(hci_read_clock_cp) }, { HCI_CMD_READ_LOOPBACK_MODE, 16, 0x01, 0 }, { HCI_CMD_READ_EXTENDED_INQUIRY_RSP, 17, 0x01, 0 }, { HCI_CMD_READ_SIMPLE_PAIRING_MODE, 17, 0x20, 0 }, { HCI_CMD_READ_INQUIRY_RSP_XMIT_POWER, 18, 0x01, 0 }, { HCI_CMD_READ_DEFAULT_ERRDATA_REPORTING, 18, 0x04, 0 }, { HCI_CMD_READ_ENCRYPTION_KEY_SIZE, 20, 0x10, sizeof(hci_read_encryption_key_size_cp) }, }; /* * supply a basic device send/recv policy */ static int hci_device_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int i, result; result = KAUTH_RESULT_DEFER; switch (action) { case KAUTH_DEVICE_BLUETOOTH_SEND: { struct hci_unit *unit = (struct hci_unit *)arg0; hci_cmd_hdr_t *hdr = (hci_cmd_hdr_t *)arg1; /* * Allow sending unprivileged commands if the packet size * is correct and the unit claims to support it */ if (hdr->type != HCI_CMD_PKT) break; for (i = 0; i < __arraycount(hci_cmds); i++) { if (hdr->opcode == hci_cmds[i].opcode && hdr->length == hci_cmds[i].length && (unit->hci_cmds[hci_cmds[i].offs] & 
hci_cmds[i].mask)) { result = KAUTH_RESULT_ALLOW; break; } } break; } case KAUTH_DEVICE_BLUETOOTH_RECV: switch((uint8_t)(uintptr_t)arg0) { case HCI_CMD_PKT: { uint16_t opcode = (uint16_t)(uintptr_t)arg1; /* * Allow to see any unprivileged command packet */ for (i = 0; i < __arraycount(hci_cmds); i++) { if (opcode == hci_cmds[i].opcode) { result = KAUTH_RESULT_ALLOW; break; } } break; } case HCI_EVENT_PKT: { uint8_t event = (uint8_t)(uintptr_t)arg1; /* * Allow to receive most events */ switch (event) { case HCI_EVENT_RETURN_LINK_KEYS: case HCI_EVENT_LINK_KEY_NOTIFICATION: case HCI_EVENT_USER_CONFIRM_REQ: case HCI_EVENT_USER_PASSKEY_NOTIFICATION: case HCI_EVENT_VENDOR: break; default: result = KAUTH_RESULT_ALLOW; break; } break; } case HCI_ACL_DATA_PKT: case HCI_SCO_DATA_PKT: { /* uint16_t handle = (uint16_t)(uintptr_t)arg1; */ /* * don't normally allow receiving data packets */ break; } default: break; } break; default: break; } return result; } /* * HCI protocol init routine, * - set up a kauth listener to provide basic packet access policy */ void hci_init(void) { if (kauth_listen_scope(KAUTH_SCOPE_DEVICE, hci_device_cb, NULL) == NULL) panic("Bluetooth HCI: cannot listen on device scope"); } /* * When command packet reaches the device, we can drop * it from the socket buffer (called from hci_output_acl) */ void hci_drop(void *arg) { struct socket *so = arg; sbdroprecord(&so->so_snd); sowwakeup(so); } /* * HCI socket is going away and has some pending packets. We let them * go by design, but remove the context pointer as it will be invalid * and we no longer need to be notified. */ static void hci_cmdwait_flush(struct socket *so) { struct hci_unit *unit; struct socket *ctx; struct mbuf *m; DPRINTF("flushing %p\n", so); SIMPLEQ_FOREACH(unit, &hci_unit_list, hci_next) { m = MBUFQ_FIRST(&unit->hci_cmdwait); while (m != NULL) { ctx = M_GETCTX(m, struct socket *); if (ctx == so) M_SETCTX(m, NULL); m = MBUFQ_NEXT(m); } } } static int hci_attach(struct socket *so, int proto) { struct hci_pcb *pcb; int error; KASSERT(so->so_pcb == NULL); if (so->so_lock == NULL) { mutex_obj_hold(bt_lock); so->so_lock = bt_lock; solock(so); } KASSERT(solocked(so)); error = soreserve(so, hci_sendspace, hci_recvspace); if (error) { return error; } pcb = kmem_zalloc(sizeof(struct hci_pcb), KM_SLEEP); pcb->hp_cred = kauth_cred_dup(curlwp->l_cred); pcb->hp_socket = so; /* * Set default user filter. By default, socket only passes * Command_Complete and Command_Status Events. 
*/ hci_filter_set(HCI_EVENT_COMMAND_COMPL, &pcb->hp_efilter); hci_filter_set(HCI_EVENT_COMMAND_STATUS, &pcb->hp_efilter); hci_filter_set(HCI_EVENT_PKT, &pcb->hp_pfilter); LIST_INSERT_HEAD(&hci_pcb, pcb, hp_next); so->so_pcb = pcb; return 0; } static void hci_detach(struct socket *so) { struct hci_pcb *pcb; pcb = (struct hci_pcb *)so->so_pcb; KASSERT(pcb != NULL); if (so->so_snd.sb_mb != NULL) hci_cmdwait_flush(so); if (pcb->hp_cred != NULL) kauth_cred_free(pcb->hp_cred); so->so_pcb = NULL; LIST_REMOVE(pcb, hp_next); kmem_free(pcb, sizeof(*pcb)); } static int hci_accept(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int hci_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct hci_pcb *pcb = so->so_pcb; struct sockaddr_bt *sa = (struct sockaddr_bt *)nam; KASSERT(solocked(so)); KASSERT(pcb != NULL); KASSERT(nam != NULL); if (sa->bt_len != sizeof(struct sockaddr_bt)) return EINVAL; if (sa->bt_family != AF_BLUETOOTH) return EAFNOSUPPORT; bdaddr_copy(&pcb->hp_laddr, &sa->bt_bdaddr); if (bdaddr_any(&sa->bt_bdaddr)) pcb->hp_flags |= HCI_PROMISCUOUS; else pcb->hp_flags &= ~HCI_PROMISCUOUS; return 0; } static int hci_listen(struct socket *so, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int hci_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct hci_pcb *pcb = so->so_pcb; struct sockaddr_bt *sa = (struct sockaddr_bt *)nam; KASSERT(solocked(so)); KASSERT(pcb != NULL); KASSERT(nam != NULL); if (sa->bt_len != sizeof(struct sockaddr_bt)) return EINVAL; if (sa->bt_family != AF_BLUETOOTH) return EAFNOSUPPORT; if (hci_unit_lookup(&sa->bt_bdaddr) == NULL) return EADDRNOTAVAIL; bdaddr_copy(&pcb->hp_raddr, &sa->bt_bdaddr); soisconnected(so); return 0; } static int hci_connect2(struct socket *so, struct socket *so2) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int hci_disconnect(struct socket *so) { struct hci_pcb *pcb = so->so_pcb; KASSERT(solocked(so)); KASSERT(pcb != NULL); bdaddr_copy(&pcb->hp_raddr, BDADDR_ANY); /* XXX we cannot call soisdisconnected() here, as it sets * SS_CANTRCVMORE and SS_CANTSENDMORE. The problem being, * that soisconnected() does not clear these and if you * try to reconnect this socket (which is permitted) you * get a broken pipe when you try to write any data. 
*/ so->so_state &= ~SS_ISCONNECTED; return 0; } static int hci_shutdown(struct socket *so) { KASSERT(solocked(so)); socantsendmore(so); return 0; } static int hci_abort(struct socket *so) { KASSERT(solocked(so)); soisdisconnected(so); hci_detach(so); return 0; } static int hci_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp) { int err; mutex_enter(bt_lock); err = hci_ioctl_pcb(cmd, nam); mutex_exit(bt_lock); return err; } static int hci_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); return 0; } static int hci_peeraddr(struct socket *so, struct sockaddr *nam) { struct hci_pcb *pcb = (struct hci_pcb *)so->so_pcb; struct sockaddr_bt *sa = (struct sockaddr_bt *)nam; KASSERT(solocked(so)); KASSERT(pcb != NULL); KASSERT(nam != NULL); memset(sa, 0, sizeof(struct sockaddr_bt)); sa->bt_len = sizeof(struct sockaddr_bt); sa->bt_family = AF_BLUETOOTH; bdaddr_copy(&sa->bt_bdaddr, &pcb->hp_raddr); return 0; } static int hci_sockaddr(struct socket *so, struct sockaddr *nam) { struct hci_pcb *pcb = (struct hci_pcb *)so->so_pcb; struct sockaddr_bt *sa = (struct sockaddr_bt *)nam; KASSERT(solocked(so)); KASSERT(pcb != NULL); KASSERT(nam != NULL); memset(sa, 0, sizeof(struct sockaddr_bt)); sa->bt_len = sizeof(struct sockaddr_bt); sa->bt_family = AF_BLUETOOTH; bdaddr_copy(&sa->bt_bdaddr, &pcb->hp_laddr); return 0; } static int hci_rcvd(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int hci_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int hci_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct hci_pcb *pcb = so->so_pcb; struct sockaddr_bt *sa = (struct sockaddr_bt *)nam; struct hci_unit *unit; struct mbuf *m0; hci_cmd_hdr_t hdr; int err = 0; KASSERT(solocked(so)); KASSERT(pcb != NULL); KASSERT(m != NULL); if (control) /* have no use for this */ m_freem(control); if (sa) { if (sa->bt_len != sizeof(struct sockaddr_bt)) { err = EINVAL; goto bad; } if (sa->bt_family != AF_BLUETOOTH) { err = EAFNOSUPPORT; goto bad; } } /* * this came from userland, so we check it out first */ /* wants at least a header to start with */ if (m->m_pkthdr.len < sizeof(hdr)) { err = EMSGSIZE; goto bad; } m_copydata(m, 0, sizeof(hdr), &hdr); hdr.opcode = le16toh(hdr.opcode); /* only allows CMD packets to be sent */ if (hdr.type != HCI_CMD_PKT) { err = EINVAL; goto bad; } /* validates packet length */ if (m->m_pkthdr.len != sizeof(hdr) + hdr.length) { err = EMSGSIZE; goto bad; } /* finds destination */ unit = hci_unit_lookup((sa ? 
&sa->bt_bdaddr : &pcb->hp_raddr)); if (unit == NULL) { err = ENETDOWN; goto bad; } /* security checks for unprivileged users */ if (pcb->hp_cred != NULL && kauth_authorize_device(pcb->hp_cred, KAUTH_DEVICE_BLUETOOTH_SEND, unit, &hdr, NULL, NULL) != 0) { err = EPERM; goto bad; } /* makes a copy for precious to keep */ m0 = m_copypacket(m, M_DONTWAIT); if (m0 == NULL) { err = ENOMEM; goto bad; } sbappendrecord(&pcb->hp_socket->so_snd, m0); M_SETCTX(m, pcb->hp_socket); /* enable drop callback */ DPRINTFN(2, "(%s) opcode (%03x|%04x)\n", device_xname(unit->hci_dev), HCI_OGF(hdr.opcode), HCI_OCF(hdr.opcode)); /* Sends it */ if (unit->hci_num_cmd_pkts == 0) MBUFQ_ENQUEUE(&unit->hci_cmdwait, m); else hci_output_cmd(unit, m); return 0; bad: DPRINTF("packet (%d bytes) not sent (error %d)\n", m->m_pkthdr.len, err); if (m) m_freem(m); return err; } static int hci_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int hci_purgeif(struct socket *so, struct ifnet *ifp) { return EOPNOTSUPP; } /* * get/set socket options */ int hci_ctloutput(int req, struct socket *so, struct sockopt *sopt) { struct hci_pcb *pcb = (struct hci_pcb *)so->so_pcb; int optval, err = 0; DPRINTFN(2, "req %s\n", prcorequests[req]); if (pcb == NULL) return EINVAL; if (sopt->sopt_level != BTPROTO_HCI) return ENOPROTOOPT; switch(req) { case PRCO_GETOPT: switch (sopt->sopt_name) { case SO_HCI_EVT_FILTER: err = sockopt_set(sopt, &pcb->hp_efilter, sizeof(struct hci_filter)); break; case SO_HCI_PKT_FILTER: err = sockopt_set(sopt, &pcb->hp_pfilter, sizeof(struct hci_filter)); break; case SO_HCI_DIRECTION: err = sockopt_setint(sopt, (pcb->hp_flags & HCI_DIRECTION ? 1 : 0)); break; default: err = ENOPROTOOPT; break; } break; case PRCO_SETOPT: switch (sopt->sopt_name) { case SO_HCI_EVT_FILTER: /* set event filter */ err = sockopt_get(sopt, &pcb->hp_efilter, sizeof(pcb->hp_efilter)); break; case SO_HCI_PKT_FILTER: /* set packet filter */ err = sockopt_get(sopt, &pcb->hp_pfilter, sizeof(pcb->hp_pfilter)); break; case SO_HCI_DIRECTION: /* request direction ctl messages */ err = sockopt_getint(sopt, &optval); if (err) break; if (optval) pcb->hp_flags |= HCI_DIRECTION; else pcb->hp_flags &= ~HCI_DIRECTION; break; default: err = ENOPROTOOPT; break; } break; default: err = ENOPROTOOPT; break; } return err; } /* * HCI mbuf tap routine * * copy packets to any raw HCI sockets that wish (and are * permitted) to see them */ void hci_mtap(struct mbuf *m, struct hci_unit *unit) { struct hci_pcb *pcb; struct mbuf *m0, *ctlmsg, **ctl; struct sockaddr_bt sa; uint8_t type; uint8_t event; uint16_t arg1; KASSERT(m->m_len >= sizeof(type)); type = *mtod(m, uint8_t *); memset(&sa, 0, sizeof(sa)); sa.bt_len = sizeof(struct sockaddr_bt); sa.bt_family = AF_BLUETOOTH; bdaddr_copy(&sa.bt_bdaddr, &unit->hci_bdaddr); LIST_FOREACH(pcb, &hci_pcb, hp_next) { /* * filter according to source address */ if ((pcb->hp_flags & HCI_PROMISCUOUS) == 0 && bdaddr_same(&pcb->hp_laddr, &sa.bt_bdaddr) == 0) continue; /* * filter according to packet type filter */ if (hci_filter_test(type, &pcb->hp_pfilter) == 0) continue; /* * filter according to event/security filters */ switch(type) { case HCI_EVENT_PKT: KASSERT(m->m_len >= sizeof(hci_event_hdr_t)); event = mtod(m, hci_event_hdr_t *)->event; if (hci_filter_test(event, &pcb->hp_efilter) == 0) continue; arg1 = event; break; case HCI_CMD_PKT: KASSERT(m->m_len >= sizeof(hci_cmd_hdr_t)); arg1 = le16toh(mtod(m, hci_cmd_hdr_t *)->opcode); break;
case HCI_ACL_DATA_PKT: KASSERT(m->m_len >= sizeof(hci_acldata_hdr_t)); arg1 = le16toh(mtod(m, hci_acldata_hdr_t *)->con_handle); arg1 = HCI_CON_HANDLE(arg1); break; case HCI_SCO_DATA_PKT: KASSERT(m->m_len >= sizeof(hci_scodata_hdr_t)); arg1 = le16toh(mtod(m, hci_scodata_hdr_t *)->con_handle); arg1 = HCI_CON_HANDLE(arg1); break; default: arg1 = 0; break; } if (pcb->hp_cred != NULL && kauth_authorize_device(pcb->hp_cred, KAUTH_DEVICE_BLUETOOTH_RECV, KAUTH_ARG(type), KAUTH_ARG(arg1), NULL, NULL) != 0) continue; /* * create control messages */ ctlmsg = NULL; ctl = &ctlmsg; if (pcb->hp_flags & HCI_DIRECTION) { int dir = m->m_flags & M_LINK0 ? 1 : 0; *ctl = sbcreatecontrol(&dir, sizeof(dir), SCM_HCI_DIRECTION, BTPROTO_HCI); if (*ctl != NULL) ctl = &((*ctl)->m_next); } if (pcb->hp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; microtime(&tv); *ctl = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*ctl != NULL) ctl = &((*ctl)->m_next); } /* * copy to socket */ m0 = m_copypacket(m, M_DONTWAIT); if (m0 && sbappendaddr(&pcb->hp_socket->so_rcv, (struct sockaddr *)&sa, m0, ctlmsg)) { sorwakeup(pcb->hp_socket); } else { m_freem(ctlmsg); m_freem(m0); } } } PR_WRAP_USRREQS(hci) #define hci_attach hci_attach_wrapper #define hci_detach hci_detach_wrapper #define hci_accept hci_accept_wrapper #define hci_bind hci_bind_wrapper #define hci_listen hci_listen_wrapper #define hci_connect hci_connect_wrapper #define hci_connect2 hci_connect2_wrapper #define hci_disconnect hci_disconnect_wrapper #define hci_shutdown hci_shutdown_wrapper #define hci_abort hci_abort_wrapper #define hci_ioctl hci_ioctl_wrapper #define hci_stat hci_stat_wrapper #define hci_peeraddr hci_peeraddr_wrapper #define hci_sockaddr hci_sockaddr_wrapper #define hci_rcvd hci_rcvd_wrapper #define hci_recvoob hci_recvoob_wrapper #define hci_send hci_send_wrapper #define hci_sendoob hci_sendoob_wrapper #define hci_purgeif hci_purgeif_wrapper const struct pr_usrreqs hci_usrreqs = { .pr_attach = hci_attach, .pr_detach = hci_detach, .pr_accept = hci_accept, .pr_bind = hci_bind, .pr_listen = hci_listen, .pr_connect = hci_connect, .pr_connect2 = hci_connect2, .pr_disconnect = hci_disconnect, .pr_shutdown = hci_shutdown, .pr_abort = hci_abort, .pr_ioctl = hci_ioctl, .pr_stat = hci_stat, .pr_peeraddr = hci_peeraddr, .pr_sockaddr = hci_sockaddr, .pr_rcvd = hci_rcvd, .pr_recvoob = hci_recvoob, .pr_send = hci_send, .pr_sendoob = hci_sendoob, .pr_purgeif = hci_purgeif, };
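The per-opcode policy that hci_device_cb() applies above boils down to a table walk: each hci_cmds[] entry records a byte offset and bit within the unit's supported-commands bitmap together with the one parameter length acceptable from an unprivileged socket, and a send is allowed only when all three match. The following self-contained sketch mimics that check; the opcode, offset, mask and length values are illustrative stand-ins rather than the kernel's definitions, and the bitmap is filled in by hand instead of being read from a real controller.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct cmd_policy {
	uint16_t opcode;	/* HCI command opcode */
	uint8_t offs;		/* byte index into the supported-commands bitmap */
	uint8_t mask;		/* bit within that byte */
	uint8_t length;		/* approved parameter length */
};

/* Illustrative subset shaped like hci_cmds[] above. */
static const struct cmd_policy policy[] = {
	{ 0x0401, 0, 0x01, 5 },		/* Inquiry */
	{ 0x0419, 2, 0x08, 10 },	/* Remote Name Request */
};

static int
allow_send(const uint8_t *unit_cmds, uint16_t opcode, uint8_t length)
{
	size_t i;

	/* Allow only if the opcode is known, the length matches exactly,
	   and the unit's bitmap advertises support for the command. */
	for (i = 0; i < sizeof(policy) / sizeof(policy[0]); i++) {
		if (policy[i].opcode == opcode &&
		    policy[i].length == length &&
		    (unit_cmds[policy[i].offs] & policy[i].mask))
			return 1;
	}
	return 0;
}

int
main(void)
{
	uint8_t unit_cmds[64];

	memset(unit_cmds, 0, sizeof(unit_cmds));
	unit_cmds[0] = 0x01;	/* unit claims support for the first command only */

	printf("known opcode, correct length: %d\n", allow_send(unit_cmds, 0x0401, 5));
	printf("known opcode, wrong length:   %d\n", allow_send(unit_cmds, 0x0401, 4));
	printf("unsupported opcode:           %d\n", allow_send(unit_cmds, 0x0419, 10));
	return 0;
}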
/* $NetBSD: radix.c,v 1.49 2020/10/18 13:07:31 gson Exp $ */ /* * Copyright (c) 1988, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)radix.c 8.6 (Berkeley) 10/17/95 */ /* * Routines to build and maintain radix trees for routing lookups.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: radix.c,v 1.49 2020/10/18 13:07:31 gson Exp $"); #ifndef _NET_RADIX_H_ #include <sys/param.h> #include <sys/queue.h> #include <sys/kmem.h> #ifdef _KERNEL #ifdef _KERNEL_OPT #include "opt_inet.h" #endif #include <sys/systm.h> #include <sys/malloc.h> #define M_DONTWAIT M_NOWAIT #include <sys/domain.h> #else #include <stdlib.h> #endif #include <sys/syslog.h> #include <net/radix.h> #endif typedef void (*rn_printer_t)(void *, const char *fmt, ...); int max_keylen; struct radix_mask *rn_mkfreelist; struct radix_node_head *mask_rnhead; static char *addmask_key; static const char normal_chars[] = {0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, -1}; static char *rn_zeros, *rn_ones; #define rn_masktop (mask_rnhead->rnh_treetop) static int rn_satisfies_leaf(const char *, struct radix_node *, int); static int rn_lexobetter(const void *, const void *); static struct radix_mask *rn_new_radix_mask(struct radix_node *, struct radix_mask *); static struct radix_node *rn_walknext(struct radix_node *, rn_printer_t, void *); static struct radix_node *rn_walkfirst(struct radix_node *, rn_printer_t, void *); static void rn_nodeprint(struct radix_node *, rn_printer_t, void *, const char *); #define SUBTREE_OPEN "[ " #define SUBTREE_CLOSE " ]" #ifdef RN_DEBUG static void rn_treeprint(struct radix_node_head *, rn_printer_t, void *); #endif /* RN_DEBUG */ /* * The data structure for the keys is a radix tree with one way * branching removed. The index rn_b at an internal node n represents a bit * position to be tested. The tree is arranged so that all descendants * of a node n have keys whose bits all agree up to position rn_b - 1. * (We say the index of n is rn_b.) * * There is at least one descendant which has a one bit at position rn_b, * and at least one with a zero there. * * A route is determined by a pair of key and mask. We require that the * bit-wise logical and of the key and mask to be the key. * We define the index of a route to associated with the mask to be * the first bit number in the mask where 0 occurs (with bit number 0 * representing the highest order bit). * * We say a mask is normal if every bit is 0, past the index of the mask. * If a node n has a descendant (k, m) with index(m) == index(n) == rn_b, * and m is a normal mask, then the route applies to every descendant of n. * If the index(m) < rn_b, this implies the trailing last few bits of k * before bit b are all 0, (and hence consequently true of every descendant * of n), so the route applies to all descendants of the node as well. * * Similar logic shows that a non-normal mask m such that * index(m) <= index(n) could potentially apply to many children of n. * Thus, for each non-host route, we attach its mask to a list at an internal * node as high in the tree as we can go. * * The present version of the code makes use of normal routes in short- * circuiting an explicit mask and compare operation when testing whether * a key satisfies a normal route, and also in remembering the unique leaf * that governs a subtree. 
*/ struct radix_node * rn_search( const void *v_arg, struct radix_node *head) { const u_char * const v = v_arg; struct radix_node *x; for (x = head; x->rn_b >= 0;) { if (x->rn_bmask & v[x->rn_off]) x = x->rn_r; else x = x->rn_l; } return x; } struct radix_node * rn_search_m( const void *v_arg, struct radix_node *head, const void *m_arg) { struct radix_node *x; const u_char * const v = v_arg; const u_char * const m = m_arg; for (x = head; x->rn_b >= 0;) { if ((x->rn_bmask & m[x->rn_off]) && (x->rn_bmask & v[x->rn_off])) x = x->rn_r; else x = x->rn_l; } return x; } int rn_refines( const void *m_arg, const void *n_arg) { const char *m = m_arg; const char *n = n_arg; const char *lim = n + *(const u_char *)n; const char *lim2 = lim; int longer = (*(const u_char *)n++) - (int)(*(const u_char *)m++); int masks_are_equal = 1; if (longer > 0) lim -= longer; while (n < lim) { if (*n & ~(*m)) return 0; if (*n++ != *m++) masks_are_equal = 0; } while (n < lim2) if (*n++) return 0; if (masks_are_equal && (longer < 0)) for (lim2 = m - longer; m < lim2; ) if (*m++) return 1; return !masks_are_equal; } struct radix_node * rn_lookup( const void *v_arg, const void *m_arg, struct radix_node_head *head) { struct radix_node *x; const char *netmask = NULL; if (m_arg) { if ((x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_off)) == 0) return NULL; netmask = x->rn_key; } x = rn_match(v_arg, head); if (x != NULL && netmask != NULL) { while (x != NULL && x->rn_mask != netmask) x = x->rn_dupedkey; } return x; } static int rn_satisfies_leaf( const char *trial, struct radix_node *leaf, int skip) { const char *cp = trial; const char *cp2 = leaf->rn_key; const char *cp3 = leaf->rn_mask; const char *cplim; int length = uimin(*(const u_char *)cp, *(const u_char *)cp2); if (cp3 == 0) cp3 = rn_ones; else length = uimin(length, *(const u_char *)cp3); cplim = cp + length; cp3 += skip; cp2 += skip; for (cp += skip; cp < cplim; cp++, cp2++, cp3++) if ((*cp ^ *cp2) & *cp3) return 0; return 1; } struct radix_node * rn_match( const void *v_arg, struct radix_node_head *head) { const char * const v = v_arg; struct radix_node *t = head->rnh_treetop; struct radix_node *top = t; struct radix_node *x; struct radix_node *saved_t; const char *cp = v; const char *cp2; const char *cplim; int off = t->rn_off; int vlen = *(const u_char *)cp; int matched_off; int test, b, rn_b; /* * Open code rn_search(v, top) to avoid overhead of extra * subroutine call. */ for (; t->rn_b >= 0; ) { if (t->rn_bmask & cp[t->rn_off]) t = t->rn_r; else t = t->rn_l; } /* * See if we match exactly as a host destination * or at least learn how many bits match, for normal mask finesse. * * It doesn't hurt us to limit how many bytes to check * to the length of the mask, since if it matches we had a genuine * match and the leaf we have is the most specific one anyway; * if it didn't match with a shorter length it would fail * with a long one. This wins big for class B&C netmasks which * are probably the most common case... */ if (t->rn_mask) vlen = *(const u_char *)t->rn_mask; cp += off; cp2 = t->rn_key + off; cplim = v + vlen; for (; cp < cplim; cp++, cp2++) if (*cp != *cp2) goto on1; /* * This extra grot is in case we are explicitly asked * to look up the default. Ugh! 
*/ if ((t->rn_flags & RNF_ROOT) && t->rn_dupedkey) t = t->rn_dupedkey; return t; on1: test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */ for (b = 7; (test >>= 1) > 0;) b--; matched_off = cp - v; b += matched_off << 3; rn_b = -1 - b; /* * If there is a host route in a duped-key chain, it will be first. */ if ((saved_t = t)->rn_mask == 0) t = t->rn_dupedkey; for (; t; t = t->rn_dupedkey) /* * Even if we don't match exactly as a host, * we may match if the leaf we wound up at is * a route to a net. */ if (t->rn_flags & RNF_NORMAL) { if (rn_b <= t->rn_b) return t; } else if (rn_satisfies_leaf(v, t, matched_off)) return t; t = saved_t; /* start searching up the tree */ do { struct radix_mask *m; t = t->rn_p; m = t->rn_mklist; if (m) { /* * If non-contiguous masks ever become important * we can restore the masking and open coding of * the search and satisfaction test and put the * calculation of "off" back before the "do". */ do { if (m->rm_flags & RNF_NORMAL) { if (rn_b <= m->rm_b) return m->rm_leaf; } else { off = uimin(t->rn_off, matched_off); x = rn_search_m(v, t, m->rm_mask); while (x && x->rn_mask != m->rm_mask) x = x->rn_dupedkey; if (x && rn_satisfies_leaf(v, x, off)) return x; } m = m->rm_mklist; } while (m); } } while (t != top); return NULL; } static void rn_nodeprint(struct radix_node *rn, rn_printer_t printer, void *arg, const char *delim) { (*printer)(arg, "%s(%s%p: p<%p> l<%p> r<%p>)", delim, ((void *)rn == arg) ? "*" : "", rn, rn->rn_p, rn->rn_l, rn->rn_r); } #ifdef RN_DEBUG int rn_debug = 1; static void rn_dbg_print(void *arg, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vlog(LOG_DEBUG, fmt, ap); va_end(ap); } static void rn_treeprint(struct radix_node_head *h, rn_printer_t printer, void *arg) { struct radix_node *dup, *rn; const char *delim; if (printer == NULL) return; rn = rn_walkfirst(h->rnh_treetop, printer, arg); for (;;) { /* Process leaves */ delim = ""; for (dup = rn; dup != NULL; dup = dup->rn_dupedkey) { if ((dup->rn_flags & RNF_ROOT) != 0) continue; rn_nodeprint(dup, printer, arg, delim); delim = ", "; } rn = rn_walknext(rn, printer, arg); if (rn->rn_flags & RNF_ROOT) return; } /* NOTREACHED */ } #define traverse(__head, __rn) rn_treeprint((__head), rn_dbg_print, (__rn)) #endif /* RN_DEBUG */ struct radix_node * rn_newpair( const void *v, int b, struct radix_node nodes[2]) { struct radix_node *tt = nodes; struct radix_node *t = tt + 1; t->rn_b = b; t->rn_bmask = 0x80 >> (b & 7); t->rn_l = tt; t->rn_off = b >> 3; tt->rn_b = -1; tt->rn_key = v; tt->rn_p = t; tt->rn_flags = t->rn_flags = RNF_ACTIVE; return t; } struct radix_node * rn_insert( const void *v_arg, struct radix_node_head *head, int *dupentry, struct radix_node nodes[2]) { struct radix_node *top = head->rnh_treetop; struct radix_node *t = rn_search(v_arg, top); struct radix_node *tt; const char *v = v_arg; int head_off = top->rn_off; int vlen = *((const u_char *)v); const char *cp = v + head_off; int b; /* * Find first bit at which v and t->rn_key differ */ { const char *cp2 = t->rn_key + head_off; const char *cplim = v + vlen; int cmp_res; while (cp < cplim) if (*cp2++ != *cp++) goto on1; *dupentry = 1; return t; on1: *dupentry = 0; cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; for (b = (cp - v) << 3; cmp_res; b--) cmp_res >>= 1; } { struct radix_node *p, *x = top; cp = v; do { p = x; if (cp[x->rn_off] & x->rn_bmask) x = x->rn_r; else x = x->rn_l; } while (b > (unsigned) x->rn_b); /* x->rn_b < b && x->rn_b >= 0 */ #ifdef RN_DEBUG if (rn_debug) log(LOG_DEBUG, "%s: Going In:\n", __func__), 
traverse(head, p); #endif t = rn_newpair(v_arg, b, nodes); tt = t->rn_l; if ((cp[p->rn_off] & p->rn_bmask) == 0) p->rn_l = t; else p->rn_r = t; x->rn_p = t; t->rn_p = p; /* frees x, p as temp vars below */ if ((cp[t->rn_off] & t->rn_bmask) == 0) { t->rn_r = x; } else { t->rn_r = tt; t->rn_l = x; } #ifdef RN_DEBUG if (rn_debug) { log(LOG_DEBUG, "%s: Coming Out:\n", __func__), traverse(head, p); } #endif /* RN_DEBUG */ } return tt; } struct radix_node * rn_addmask( const void *n_arg, int search, int skip) { const char *netmask = n_arg; const char *cp; const char *cplim; struct radix_node *x; struct radix_node *saved_x; int b = 0, mlen, j; int maskduplicated, m0, isnormal; static int last_zeroed = 0; if ((mlen = *(const u_char *)netmask) > max_keylen) mlen = max_keylen; if (skip == 0) skip = 1; if (mlen <= skip) return mask_rnhead->rnh_nodes; if (skip > 1) memmove(addmask_key + 1, rn_ones + 1, skip - 1); if ((m0 = mlen) > skip) memmove(addmask_key + skip, netmask + skip, mlen - skip); /* * Trim trailing zeroes. */ for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;) cp--; mlen = cp - addmask_key; if (mlen <= skip) { if (m0 >= last_zeroed) last_zeroed = mlen; return mask_rnhead->rnh_nodes; } if (m0 < last_zeroed) memset(addmask_key + m0, 0, last_zeroed - m0); *addmask_key = last_zeroed = mlen; x = rn_search(addmask_key, rn_masktop); if (memcmp(addmask_key, x->rn_key, mlen) != 0) x = 0; if (x || search) return x; R_Malloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x)); if ((saved_x = x) == NULL) return NULL; memset(x, 0, max_keylen + 2 * sizeof (*x)); cp = netmask = (void *)(x + 2); memmove(x + 2, addmask_key, mlen); x = rn_insert(cp, mask_rnhead, &maskduplicated, x); if (maskduplicated) { log(LOG_ERR, "rn_addmask: mask impossibly already in tree\n"); Free(saved_x); return x; } /* * Calculate index of mask, and check for normalcy. */ cplim = netmask + mlen; isnormal = 1; for (cp = netmask + skip; (cp < cplim) && *(const u_char *)cp == 0xff;) cp++; if (cp != cplim) { for (j = 0x80; (j & *cp) != 0; j >>= 1) b++; if (*cp != normal_chars[b] || cp != (cplim - 1)) isnormal = 0; } b += (cp - netmask) << 3; x->rn_b = -1 - b; if (isnormal) x->rn_flags |= RNF_NORMAL; return x; } static int /* XXX: arbitrary ordering for non-contiguous masks */ rn_lexobetter( const void *m_arg, const void *n_arg) { const u_char *mp = m_arg; const u_char *np = n_arg; const u_char *lim; if (*mp > *np) return 1; /* not really, but need to check longer one first */ if (*mp == *np) for (lim = mp + *mp; mp < lim;) if (*mp++ > *np++) return 1; return 0; } static struct radix_mask * rn_new_radix_mask( struct radix_node *tt, struct radix_mask *next) { struct radix_mask *m; MKGet(m); if (m == NULL) { log(LOG_ERR, "Mask for route not entered\n"); return NULL; } memset(m, 0, sizeof(*m)); m->rm_b = tt->rn_b; m->rm_flags = tt->rn_flags; if (tt->rn_flags & RNF_NORMAL) m->rm_leaf = tt; else m->rm_mask = tt->rn_mask; m->rm_mklist = next; tt->rn_mklist = m; return m; } struct radix_node * rn_addroute( const void *v_arg, const void *n_arg, struct radix_node_head *head, struct radix_node treenodes[2]) { const char *v = v_arg, *netmask = n_arg; struct radix_node *t, *x = NULL, *tt; struct radix_node *saved_tt, *top = head->rnh_treetop; short b = 0, b_leaf = 0; int keyduplicated; const char *mmask; struct radix_mask *m, **mp; /* * In dealing with non-contiguous masks, there may be * many different routes which have the same mask. 
* We will find it useful to have a unique pointer to * the mask to speed avoiding duplicate references at * nodes and possibly save time in calculating indices. */ if (netmask != NULL) { if ((x = rn_addmask(netmask, 0, top->rn_off)) == NULL) return NULL; b_leaf = x->rn_b; b = -1 - x->rn_b; netmask = x->rn_key; } /* * Deal with duplicated keys: attach node to previous instance */ saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes); if (keyduplicated) { for (t = tt; tt != NULL; t = tt, tt = tt->rn_dupedkey) { if (tt->rn_mask == netmask) return NULL; if (netmask == NULL || (tt->rn_mask != NULL && (b_leaf < tt->rn_b || /* index(netmask) > node */ rn_refines(netmask, tt->rn_mask) || rn_lexobetter(netmask, tt->rn_mask)))) break; } /* * If the mask is not duplicated, we wouldn't * find it among possible duplicate key entries * anyway, so the above test doesn't hurt. * * We sort the masks for a duplicated key the same way as * in a masklist -- most specific to least specific. * This may require the unfortunate nuisance of relocating * the head of the list. * * We also reverse, or doubly link the list through the * parent pointer. */ if (tt == saved_tt) { struct radix_node *xx = x; /* link in at head of list */ (tt = treenodes)->rn_dupedkey = t; tt->rn_flags = t->rn_flags; tt->rn_p = x = t->rn_p; t->rn_p = tt; if (x->rn_l == t) x->rn_l = tt; else x->rn_r = tt; saved_tt = tt; x = xx; } else { (tt = treenodes)->rn_dupedkey = t->rn_dupedkey; t->rn_dupedkey = tt; tt->rn_p = t; if (tt->rn_dupedkey) tt->rn_dupedkey->rn_p = tt; } tt->rn_key = v; tt->rn_b = -1; tt->rn_flags = RNF_ACTIVE; } /* * Put mask in tree. */ if (netmask != NULL) { tt->rn_mask = netmask; tt->rn_b = x->rn_b; tt->rn_flags |= x->rn_flags & RNF_NORMAL; } t = saved_tt->rn_p; if (keyduplicated) goto on2; b_leaf = -1 - t->rn_b; if (t->rn_r == saved_tt) x = t->rn_l; else x = t->rn_r; /* Promote general routes from below */ if (x->rn_b < 0) { for (mp = &t->rn_mklist; x != NULL; x = x->rn_dupedkey) { if (x->rn_mask != NULL && x->rn_b >= b_leaf && x->rn_mklist == NULL) { *mp = m = rn_new_radix_mask(x, NULL); if (m != NULL) mp = &m->rm_mklist; } } } else if (x->rn_mklist != NULL) { /* * Skip over masks whose index is > that of new node */ for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) if (m->rm_b >= b_leaf) break; t->rn_mklist = m; *mp = NULL; } on2: /* Add new route to highest possible ancestor's list */ if (netmask == NULL || b > t->rn_b) return tt; /* can't lift at all */ b_leaf = tt->rn_b; do { x = t; t = t->rn_p; } while (b <= t->rn_b && x != top); /* * Search through routes associated with node to * insert new route according to index. * Need same criteria as when sorting dupedkeys to avoid * double loop on deletion. 
*/ for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) { if (m->rm_b < b_leaf) continue; if (m->rm_b > b_leaf) break; if (m->rm_flags & RNF_NORMAL) { mmask = m->rm_leaf->rn_mask; if (tt->rn_flags & RNF_NORMAL) { log(LOG_ERR, "Non-unique normal route," " mask not entered\n"); return tt; } } else mmask = m->rm_mask; if (mmask == netmask) { m->rm_refs++; tt->rn_mklist = m; return tt; } if (rn_refines(netmask, mmask) || rn_lexobetter(netmask, mmask)) break; } *mp = rn_new_radix_mask(tt, *mp); return tt; } struct radix_node * rn_delete1( const void *v_arg, const void *netmask_arg, struct radix_node_head *head, struct radix_node *rn) { struct radix_node *t, *p, *x, *tt; struct radix_mask *m, *saved_m, **mp; struct radix_node *dupedkey, *saved_tt, *top; const char *v, *netmask; int b, head_off, vlen; v = v_arg; netmask = netmask_arg; x = head->rnh_treetop; tt = rn_search(v, x); head_off = x->rn_off; vlen = *(const u_char *)v; saved_tt = tt; top = x; if (tt == NULL || memcmp(v + head_off, tt->rn_key + head_off, vlen - head_off) != 0) return NULL; /* * Delete our route from mask lists. */ if (netmask != NULL) { if ((x = rn_addmask(netmask, 1, head_off)) == NULL) return NULL; netmask = x->rn_key; while (tt->rn_mask != netmask) if ((tt = tt->rn_dupedkey) == NULL) return NULL; } if (tt->rn_mask == NULL || (saved_m = m = tt->rn_mklist) == NULL) goto on1; if (tt->rn_flags & RNF_NORMAL) { if (m->rm_leaf != tt || m->rm_refs > 0) { log(LOG_ERR, "rn_delete: inconsistent annotation\n"); return NULL; /* dangling ref could cause disaster */ } } else { if (m->rm_mask != tt->rn_mask) { log(LOG_ERR, "rn_delete: inconsistent annotation\n"); goto on1; } if (--m->rm_refs >= 0) goto on1; } b = -1 - tt->rn_b; t = saved_tt->rn_p; if (b > t->rn_b) goto on1; /* Wasn't lifted at all */ do { x = t; t = t->rn_p; } while (b <= t->rn_b && x != top); for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) { if (m == saved_m) { *mp = m->rm_mklist; MKFree(m); break; } } if (m == NULL) { log(LOG_ERR, "rn_delete: couldn't find our annotation\n"); if (tt->rn_flags & RNF_NORMAL) return NULL; /* Dangling ref to us */ } on1: /* * Eliminate us from tree */ if (tt->rn_flags & RNF_ROOT) return NULL; #ifdef RN_DEBUG if (rn_debug) log(LOG_DEBUG, "%s: Going In:\n", __func__), traverse(head, tt); #endif t = tt->rn_p; dupedkey = saved_tt->rn_dupedkey; if (dupedkey != NULL) { /* * Here, tt is the deletion target, and * saved_tt is the head of the dupedkey chain. */ if (tt == saved_tt) { x = dupedkey; x->rn_p = t; if (t->rn_l == tt) t->rn_l = x; else t->rn_r = x; } else { /* find node in front of tt on the chain */ for (x = p = saved_tt; p != NULL && p->rn_dupedkey != tt;) p = p->rn_dupedkey; if (p != NULL) { p->rn_dupedkey = tt->rn_dupedkey; if (tt->rn_dupedkey != NULL) tt->rn_dupedkey->rn_p = p; } else log(LOG_ERR, "rn_delete: couldn't find us\n"); } t = tt + 1; if (t->rn_flags & RNF_ACTIVE) { *++x = *t; p = t->rn_p; if (p->rn_l == t) p->rn_l = x; else p->rn_r = x; x->rn_l->rn_p = x; x->rn_r->rn_p = x; } goto out; } if (t->rn_l == tt) x = t->rn_r; else x = t->rn_l; p = t->rn_p; if (p->rn_r == t) p->rn_r = x; else p->rn_l = x; x->rn_p = p; /* * Demote routes attached to us. 
*/ if (t->rn_mklist == NULL) ; else if (x->rn_b >= 0) { for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) ; *mp = t->rn_mklist; } else { /* If there are any key,mask pairs in a sibling duped-key chain, some subset will appear sorted in the same order attached to our mklist */ for (m = t->rn_mklist; m != NULL && x != NULL; x = x->rn_dupedkey) { if (m == x->rn_mklist) { struct radix_mask *mm = m->rm_mklist; x->rn_mklist = NULL; if (--(m->rm_refs) < 0) MKFree(m); m = mm; } } if (m != NULL) { log(LOG_ERR, "rn_delete: Orphaned Mask %p at %p\n", m, x); } } /* * We may be holding an active internal node in the tree. */ x = tt + 1; if (t != x) { *t = *x; t->rn_l->rn_p = t; t->rn_r->rn_p = t; p = x->rn_p; if (p->rn_l == x) p->rn_l = t; else p->rn_r = t; } out: #ifdef RN_DEBUG if (rn_debug) { log(LOG_DEBUG, "%s: Coming Out:\n", __func__), traverse(head, tt); } #endif /* RN_DEBUG */ tt->rn_flags &= ~RNF_ACTIVE; tt[1].rn_flags &= ~RNF_ACTIVE; return tt; } struct radix_node * rn_delete( const void *v_arg, const void *netmask_arg, struct radix_node_head *head) { return rn_delete1(v_arg, netmask_arg, head, NULL); } static struct radix_node * rn_walknext(struct radix_node *rn, rn_printer_t printer, void *arg) { /* If at right child go back up, otherwise, go right */ while (rn->rn_p->rn_r == rn && (rn->rn_flags & RNF_ROOT) == 0) { if (printer != NULL) (*printer)(arg, SUBTREE_CLOSE); rn = rn->rn_p; } if (printer) rn_nodeprint(rn->rn_p, printer, arg, ""); /* Find the next *leaf* since next node might vanish, too */ for (rn = rn->rn_p->rn_r; rn->rn_b >= 0;) { if (printer != NULL) (*printer)(arg, SUBTREE_OPEN); rn = rn->rn_l; } return rn; } static struct radix_node * rn_walkfirst(struct radix_node *rn, rn_printer_t printer, void *arg) { /* First time through node, go left */ while (rn->rn_b >= 0) { if (printer != NULL) (*printer)(arg, SUBTREE_OPEN); rn = rn->rn_l; } return rn; } int rn_walktree( struct radix_node_head *h, int (*f)(struct radix_node *, void *), void *w) { int error; struct radix_node *base, *next, *rn; /* * This gets complicated because we may delete the node * while applying the function f to it, so we need to calculate * the successor node in advance. */ rn = rn_walkfirst(h->rnh_treetop, NULL, NULL); for (;;) { base = rn; next = rn_walknext(rn, NULL, NULL); /* Process leaves */ while ((rn = base) != NULL) { base = rn->rn_dupedkey; if (!(rn->rn_flags & RNF_ROOT) && (error = (*f)(rn, w))) return error; } rn = next; if (rn->rn_flags & RNF_ROOT) return 0; } /* NOTREACHED */ } struct radix_node * rn_search_matched(struct radix_node_head *h, int (*matcher)(struct radix_node *, void *), void *w) { bool matched; struct radix_node *base, *next, *rn; /* * This gets complicated because we may delete the node * while applying the function f to it, so we need to calculate * the successor node in advance. */ rn = rn_walkfirst(h->rnh_treetop, NULL, NULL); for (;;) { base = rn; next = rn_walknext(rn, NULL, NULL); /* Process leaves */ while ((rn = base) != NULL) { base = rn->rn_dupedkey; if (!(rn->rn_flags & RNF_ROOT)) { matched = (*matcher)(rn, w); if (matched) return rn; } } rn = next; if (rn->rn_flags & RNF_ROOT) return NULL; } /* NOTREACHED */ } struct delayinit { void **head; int off; SLIST_ENTRY(delayinit) entries; }; static SLIST_HEAD(, delayinit) delayinits = SLIST_HEAD_INITIALIZER(delayheads); static int radix_initialized; /* * Initialize a radix tree once radix is initialized. Only for bootstrap. * Assume that no concurrency protection is necessary at this stage. 
*/ void rn_delayedinit(void **head, int off) { struct delayinit *di; if (radix_initialized) return; di = kmem_alloc(sizeof(*di), KM_SLEEP); di->head = head; di->off = off; SLIST_INSERT_HEAD(&delayinits, di, entries); } int rn_inithead(void **head, int off) { struct radix_node_head *rnh; if (*head != NULL) return 1; R_Malloc(rnh, struct radix_node_head *, sizeof (*rnh)); if (rnh == NULL) return 0; *head = rnh; return rn_inithead0(rnh, off); } int rn_inithead0(struct radix_node_head *rnh, int off) { struct radix_node *t; struct radix_node *tt; struct radix_node *ttt; memset(rnh, 0, sizeof(*rnh)); t = rn_newpair(rn_zeros, off, rnh->rnh_nodes); ttt = rnh->rnh_nodes + 2; t->rn_r = ttt; t->rn_p = t; tt = t->rn_l; tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE; tt->rn_b = -1 - off; *ttt = *tt; ttt->rn_key = rn_ones; rnh->rnh_addaddr = rn_addroute; rnh->rnh_deladdr = rn_delete; rnh->rnh_matchaddr = rn_match; rnh->rnh_lookup = rn_lookup; rnh->rnh_treetop = t; return 1; } void rn_init(void) { char *cp, *cplim; struct delayinit *di; #ifdef _KERNEL struct domain *dp; if (radix_initialized) panic("radix already initialized"); radix_initialized = 1; DOMAIN_FOREACH(dp) { if (dp->dom_maxrtkey > max_keylen) max_keylen = dp->dom_maxrtkey; } #endif if (max_keylen == 0) { #ifndef _KERNEL log(LOG_ERR, "rn_init: radix functions require max_keylen be set\n"); #endif return; } R_Malloc(rn_zeros, char *, 3 * max_keylen); if (rn_zeros == NULL) panic("rn_init"); memset(rn_zeros, 0, 3 * max_keylen); rn_ones = cp = rn_zeros + max_keylen; addmask_key = cplim = rn_ones + max_keylen; while (cp < cplim) *cp++ = -1; if (rn_inithead((void *)&mask_rnhead, 0) == 0) panic("rn_init 2"); while ((di = SLIST_FIRST(&delayinits)) != NULL) { if (!rn_inithead(di->head, di->off)) panic("delayed rn_inithead failed"); SLIST_REMOVE_HEAD(&delayinits, entries); kmem_free(di, sizeof(*di)); } }
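/*
 * Illustrative sketch (not part of radix.c): the comment at the top of this
 * file defines the "index" of a mask as the first bit position holding a 0,
 * with bit 0 the highest-order bit, and calls a mask "normal" when every bit
 * past that index is also 0 (a contiguous run of one bits).  The stand-alone
 * program below shows that arithmetic on an assumed raw 4-byte mask with no
 * length prefix; rn_addmask() performs the equivalent computation on
 * length-prefixed keys using the normal_chars[] table.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static int
mask_index(const uint8_t *mask, size_t len, bool *normal)
{
	int b = 0;
	size_t i = 0;

	while (i < len && mask[i] == 0xff) {	/* whole bytes of ones */
		b += 8;
		i++;
	}
	if (i == len) {
		*normal = true;			/* all-ones mask */
		return b;
	}
	for (int j = 0x80; (j & mask[i]) != 0; j >>= 1)
		b++;				/* leading ones in the partial byte */
	/* Normal iff no one bit appears after the leading run of ones. */
	*normal = (mask[i] & (0xff >> (b & 7))) == 0;
	for (i++; i < len; i++)
		if (mask[i] != 0)
			*normal = false;
	return b;
}

int
main(void)
{
	const uint8_t m1[4] = { 0xff, 0xff, 0xe0, 0x00 };	/* contiguous /19 */
	const uint8_t m2[4] = { 0xff, 0x00, 0xff, 0x00 };	/* non-contiguous */
	bool n;
	int idx;

	idx = mask_index(m1, sizeof(m1), &n);
	printf("index %d, normal %d\n", idx, n);	/* prints: index 19, normal 1 */
	idx = mask_index(m2, sizeof(m2), &n);
	printf("index %d, normal %d\n", idx, n);	/* prints: index 8, normal 0 */
	return 0;
}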
/* $NetBSD: spl.h,v 1.10 2021/11/02 11:26:05 ryo Exp $ */ /*- * Copyright (c)2005 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this header is intended to be included by MD header. * * an assumption: makeiplcookie() is reasonably fast. * if it isn't the case for your port, it's better to have MD optimized * splxxx() functions, rather than using this header. */ #if !defined(_KERNEL) && !defined(_KMEMUSER) #error not supposed to be exposed to userland. #endif /* !defined(_KERNEL) && !defined(_KMEMUSER) */ #define _SPL_DECL(x, X) \ static __inline __always_inline int \ spl##x(void) \ { return splraiseipl(makeiplcookie(IPL_##X)); } #if defined(IPL_SOFTCLOCK) _SPL_DECL(softclock, SOFTCLOCK) #endif /* defined(IPL_SOFTCLOCK) */ #if defined(IPL_SOFTNET) _SPL_DECL(softnet, SOFTNET) #endif /* defined(IPL_SOFTNET) */ #if defined(IPL_SOFTSERIAL) _SPL_DECL(softserial, SOFTSERIAL) #endif /* defined(IPL_SOFTSERIAL) */ _SPL_DECL(vm, VM) _SPL_DECL(sched, SCHED) _SPL_DECL(high, HIGH) #undef _SPL_DECL
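/*
 * Illustrative sketch (not part of spl.h): _SPL_DECL(vm, VM) above expands
 * to roughly the inline function below, and a caller pairs its return value
 * with splx() to restore the previous interrupt priority level.  The usage
 * fragment assumes a kernel compilation unit with the MI interrupt
 * declarations (splraiseipl(), makeiplcookie(), splx()) in scope; the
 * function name example_critical_section is hypothetical.
 */
static __inline __always_inline int
splvm(void)
{
	return splraiseipl(makeiplcookie(IPL_VM));
}

/* Hypothetical caller: block interrupts at IPL_VM around a critical section. */
static void
example_critical_section(void)
{
	int s;

	s = splvm();	/* raise to IPL_VM, remember the old level */
	/* ... touch data shared with handlers running at or below IPL_VM ... */
	splx(s);	/* restore the saved level */
}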
/* $NetBSD: rfcomm_socket.c,v 1.38 2019/01/28 12:53:01 martin Exp $ */ /*- * Copyright (c) 2006 Itronix Inc. * All rights reserved. * * Written by Iain Hibbert for Itronix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of Itronix Inc. may not be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC.
BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: rfcomm_socket.c,v 1.38 2019/01/28 12:53:01 martin Exp $"); /* load symbolic names */ #ifdef BLUETOOTH_DEBUG #define PRUREQUESTS #define PRCOREQUESTS #endif #include <sys/param.h> #include <sys/domain.h> #include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/proc.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <netbt/bluetooth.h> #include <netbt/rfcomm.h> /**************************************************************************** * * RFCOMM SOCK_STREAM Sockets - serial line emulation * */ static void rfcomm_connecting(void *); static void rfcomm_connected(void *); static void rfcomm_disconnected(void *, int); static void *rfcomm_newconn(void *, struct sockaddr_bt *, struct sockaddr_bt *); static void rfcomm_complete(void *, int); static void rfcomm_linkmode(void *, int); static void rfcomm_input(void *, struct mbuf *); static const struct btproto rfcomm_proto = { rfcomm_connecting, rfcomm_connected, rfcomm_disconnected, rfcomm_newconn, rfcomm_complete, rfcomm_linkmode, rfcomm_input, }; /* sysctl variables */ int rfcomm_sendspace = 4096; int rfcomm_recvspace = 4096; static int rfcomm_attach(struct socket *so, int proto) { int error; KASSERT(so->so_pcb == NULL); if (so->so_lock == NULL) { mutex_obj_hold(bt_lock); so->so_lock = bt_lock; solock(so); } KASSERT(solocked(so)); /* * Since we have nothing to add, we attach the DLC * structure directly to our PCB pointer. 
*/ error = soreserve(so, rfcomm_sendspace, rfcomm_recvspace); if (error) return error; error = rfcomm_attach_pcb((struct rfcomm_dlc **)&so->so_pcb, &rfcomm_proto, so); if (error) return error; error = rfcomm_rcvd_pcb(so->so_pcb, sbspace(&so->so_rcv)); if (error) { rfcomm_detach_pcb((struct rfcomm_dlc **)&so->so_pcb); return error; } return 0; } static void rfcomm_detach(struct socket *so) { KASSERT(so->so_pcb != NULL); rfcomm_detach_pcb((struct rfcomm_dlc **)&so->so_pcb); KASSERT(so->so_pcb == NULL); } static int rfcomm_accept(struct socket *so, struct sockaddr *nam) { struct rfcomm_dlc *pcb = so->so_pcb; KASSERT(solocked(so)); KASSERT(nam != NULL); if (pcb == NULL) return EINVAL; return rfcomm_peeraddr_pcb(pcb, (struct sockaddr_bt *)nam); } static int rfcomm_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct rfcomm_dlc *pcb = so->so_pcb; struct sockaddr_bt *sa = (struct sockaddr_bt *)nam; KASSERT(solocked(so)); KASSERT(nam != NULL); if (pcb == NULL) return EINVAL; if (sa->bt_len != sizeof(struct sockaddr_bt)) return EINVAL; if (sa->bt_family != AF_BLUETOOTH) return EAFNOSUPPORT; return rfcomm_bind_pcb(pcb, sa); } static int rfcomm_listen(struct socket *so, struct lwp *l) { struct rfcomm_dlc *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; return rfcomm_listen_pcb(pcb); } static int rfcomm_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct rfcomm_dlc *pcb = so->so_pcb; struct sockaddr_bt *sa = (struct sockaddr_bt *)nam; KASSERT(solocked(so)); KASSERT(nam != NULL); if (pcb == NULL) return EINVAL; if (sa->bt_len != sizeof(struct sockaddr_bt)) return EINVAL; if (sa->bt_family != AF_BLUETOOTH) return EAFNOSUPPORT; soisconnecting(so); return rfcomm_connect_pcb(pcb, sa); } static int rfcomm_connect2(struct socket *so, struct socket *so2) { struct rfcomm_dlc *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; return EOPNOTSUPP; } static int rfcomm_disconnect(struct socket *so) { struct rfcomm_dlc *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; soisdisconnecting(so); return rfcomm_disconnect_pcb(pcb, so->so_linger); } static int rfcomm_shutdown(struct socket *so) { KASSERT(solocked(so)); socantsendmore(so); return 0; } static int rfcomm_abort(struct socket *so) { struct rfcomm_dlc *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; rfcomm_disconnect_pcb(pcb, 0); soisdisconnected(so); rfcomm_detach(so); return 0; } static int rfcomm_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp) { return EPASSTHROUGH; } static int rfcomm_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); return 0; } static int rfcomm_peeraddr(struct socket *so, struct sockaddr *nam) { struct rfcomm_dlc *pcb = so->so_pcb; KASSERT(solocked(so)); KASSERT(pcb != NULL); KASSERT(nam != NULL); return rfcomm_peeraddr_pcb(pcb, (struct sockaddr_bt *)nam); } static int rfcomm_sockaddr(struct socket *so, struct sockaddr *nam) { struct rfcomm_dlc *pcb = so->so_pcb; KASSERT(solocked(so)); KASSERT(pcb != NULL); KASSERT(nam != NULL); return rfcomm_sockaddr_pcb(pcb, (struct sockaddr_bt *)nam); } static int rfcomm_rcvd(struct socket *so, int flags, struct lwp *l) { struct rfcomm_dlc *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; return rfcomm_rcvd_pcb(pcb, sbspace(&so->so_rcv)); } static int rfcomm_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int rfcomm_send(struct socket *so, struct mbuf *m, 
struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct rfcomm_dlc *pcb = so->so_pcb; int err = 0; struct mbuf *m0; KASSERT(solocked(so)); KASSERT(m != NULL); if (control) /* no use for that */ m_freem(control); if (pcb == NULL) { err = EINVAL; goto release; } m0 = m_copypacket(m, M_DONTWAIT); if (m0 == NULL) { err = ENOMEM; goto release; } sbappendstream(&so->so_snd, m); return rfcomm_send_pcb(pcb, m0); release: m_freem(m); return err; } static int rfcomm_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int rfcomm_purgeif(struct socket *so, struct ifnet *ifp) { return EOPNOTSUPP; } /* * rfcomm_ctloutput(req, socket, sockopt) * */ int rfcomm_ctloutput(int req, struct socket *so, struct sockopt *sopt) { struct rfcomm_dlc *pcb = so->so_pcb; int err = 0; DPRINTFN(2, "%s\n", prcorequests[req]); if (pcb == NULL) return EINVAL; if (sopt->sopt_level != BTPROTO_RFCOMM) return ENOPROTOOPT; switch(req) { case PRCO_GETOPT: err = rfcomm_getopt(pcb, sopt); break; case PRCO_SETOPT: err = rfcomm_setopt(pcb, sopt); break; default: err = ENOPROTOOPT; break; } return err; } /********************************************************************** * * RFCOMM callbacks */ static void rfcomm_connecting(void *arg) { /* struct socket *so = arg; */ KASSERT(arg != NULL); DPRINTF("Connecting\n"); } static void rfcomm_connected(void *arg) { struct socket *so = arg; KASSERT(so != NULL); DPRINTF("Connected\n"); soisconnected(so); } static void rfcomm_disconnected(void *arg, int err) { struct socket *so = arg; KASSERT(so != NULL); DPRINTF("Disconnected\n"); so->so_error = err; soisdisconnected(so); } static void * rfcomm_newconn(void *arg, struct sockaddr_bt *laddr, struct sockaddr_bt *raddr) { struct socket *so = arg; DPRINTF("New Connection\n"); so = sonewconn(so, false); if (so == NULL) return NULL; soisconnecting(so); return so->so_pcb; } /* * rfcomm_complete(rfcomm_dlc, length) * * length bytes are sent and may be removed from socket buffer */ static void rfcomm_complete(void *arg, int length) { struct socket *so = arg; sbdrop(&so->so_snd, length); sowwakeup(so); } /* * rfcomm_linkmode(rfcomm_dlc, new) * * link mode change notification. */ static void rfcomm_linkmode(void *arg, int new) { struct socket *so = arg; struct sockopt sopt; int mode; DPRINTF("auth %s, encrypt %s, secure %s\n", (new & RFCOMM_LM_AUTH ? "on" : "off"), (new & RFCOMM_LM_ENCRYPT ? "on" : "off"), (new & RFCOMM_LM_SECURE ? 
"on" : "off")); sockopt_init(&sopt, BTPROTO_RFCOMM, SO_RFCOMM_LM, 0); (void)rfcomm_getopt(so->so_pcb, &sopt); (void)sockopt_getint(&sopt, &mode); sockopt_destroy(&sopt); if (((mode & RFCOMM_LM_AUTH) && !(new & RFCOMM_LM_AUTH)) || ((mode & RFCOMM_LM_ENCRYPT) && !(new & RFCOMM_LM_ENCRYPT)) || ((mode & RFCOMM_LM_SECURE) && !(new & RFCOMM_LM_SECURE))) rfcomm_disconnect_pcb(so->so_pcb, 0); } /* * rfcomm_input(rfcomm_dlc, mbuf) */ static void rfcomm_input(void *arg, struct mbuf *m) { struct socket *so = arg; KASSERT(so != NULL); if (m->m_pkthdr.len > sbspace(&so->so_rcv)) { printf("%s: %d bytes dropped (socket buffer full)\n", __func__, m->m_pkthdr.len); m_freem(m); return; } DPRINTFN(10, "received %d bytes\n", m->m_pkthdr.len); sbappendstream(&so->so_rcv, m); sorwakeup(so); } PR_WRAP_USRREQS(rfcomm) #define rfcomm_attach rfcomm_attach_wrapper #define rfcomm_detach rfcomm_detach_wrapper #define rfcomm_accept rfcomm_accept_wrapper #define rfcomm_bind rfcomm_bind_wrapper #define rfcomm_listen rfcomm_listen_wrapper #define rfcomm_connect rfcomm_connect_wrapper #define rfcomm_connect2 rfcomm_connect2_wrapper #define rfcomm_disconnect rfcomm_disconnect_wrapper #define rfcomm_shutdown rfcomm_shutdown_wrapper #define rfcomm_abort rfcomm_abort_wrapper #define rfcomm_ioctl rfcomm_ioctl_wrapper #define rfcomm_stat rfcomm_stat_wrapper #define rfcomm_peeraddr rfcomm_peeraddr_wrapper #define rfcomm_sockaddr rfcomm_sockaddr_wrapper #define rfcomm_rcvd rfcomm_rcvd_wrapper #define rfcomm_recvoob rfcomm_recvoob_wrapper #define rfcomm_send rfcomm_send_wrapper #define rfcomm_sendoob rfcomm_sendoob_wrapper #define rfcomm_purgeif rfcomm_purgeif_wrapper const struct pr_usrreqs rfcomm_usrreqs = { .pr_attach = rfcomm_attach, .pr_detach = rfcomm_detach, .pr_accept = rfcomm_accept, .pr_bind = rfcomm_bind, .pr_listen = rfcomm_listen, .pr_connect = rfcomm_connect, .pr_connect2 = rfcomm_connect2, .pr_disconnect = rfcomm_disconnect, .pr_shutdown = rfcomm_shutdown, .pr_abort = rfcomm_abort, .pr_ioctl = rfcomm_ioctl, .pr_stat = rfcomm_stat, .pr_peeraddr = rfcomm_peeraddr, .pr_sockaddr = rfcomm_sockaddr, .pr_rcvd = rfcomm_rcvd, .pr_recvoob = rfcomm_recvoob, .pr_send = rfcomm_send, .pr_sendoob = rfcomm_sendoob, .pr_purgeif = rfcomm_purgeif, };
/* $NetBSD: subr_kcpuset.c,v 1.20 2023/09/23 18:21:11 ad Exp $ */ /*- * Copyright (c) 2011, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Kernel CPU set implementation. * * Interface can be used by kernel subsystems as a unified dynamic CPU * bitset implementation handling many CPUs. Facility also supports early * use by MD code on boot, as it fixups bitsets on further boot. * * TODO: * - Handle "reverse" bitset on fixup/grow. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_kcpuset.c,v 1.20 2023/09/23 18:21:11 ad Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/atomic.h> #include <sys/intr.h> #include <sys/sched.h> #include <sys/kcpuset.h> #include <sys/kmem.h> /* Number of CPUs to support. */ #define KC_MAXCPUS roundup2(MAXCPUS, 32) /* * Structure of dynamic CPU set in the kernel. */ struct kcpuset { uint32_t bits[0]; }; typedef struct kcpuset_impl { /* Reference count. */ u_int kc_refcnt; /* Next to free, if non-NULL (used when multiple references). */ struct kcpuset * kc_next; /* Actual variable-sized field of bits. */ struct kcpuset kc_field; } kcpuset_impl_t; #define KC_BITS_OFF (offsetof(struct kcpuset_impl, kc_field)) #define KC_GETSTRUCT(b) ((kcpuset_impl_t *)((char *)(b) - KC_BITS_OFF)) #define KC_GETCSTRUCT(b) ((const kcpuset_impl_t *)((const char *)(b) - KC_BITS_OFF)) /* Sizes of a single bitset. */ #define KC_SHIFT 5 #define KC_MASK 31 /* An array of noted early kcpuset creations and data. */ #define KC_SAVE_NITEMS 8 /* Structures for early boot mechanism (must be statically initialised). */ static kcpuset_t ** kc_noted_early[KC_SAVE_NITEMS]; static uint32_t kc_bits_early[KC_SAVE_NITEMS]; static int kc_last_idx = 0; static bool kc_initialised = false; #define KC_BITSIZE_EARLY sizeof(kc_bits_early[0]) #define KC_NFIELDS_EARLY 1 /* * The size of whole bitset fields and amount of fields. * The whole size must statically initialise for early case. */ static size_t kc_bitsize __read_mostly = KC_BITSIZE_EARLY; static size_t kc_nfields __read_mostly = KC_NFIELDS_EARLY; static size_t kc_memsize __read_mostly; static kcpuset_t * kcpuset_create_raw(bool); /* * kcpuset_sysinit: initialize the subsystem, transfer early boot cases * to dynamically allocated sets. */ void kcpuset_sysinit(void) { kcpuset_t *kc_dynamic[KC_SAVE_NITEMS], *kcp; int i, s; /* Set a kcpuset_t sizes. */ kc_nfields = (KC_MAXCPUS >> KC_SHIFT); kc_bitsize = sizeof(uint32_t) * kc_nfields; kc_memsize = sizeof(kcpuset_impl_t) + kc_bitsize; KASSERT(kc_nfields != 0); KASSERT(kc_bitsize != 0); /* First, pre-allocate kcpuset entries. */ for (i = 0; i < kc_last_idx; i++) { kcp = kcpuset_create_raw(true); kc_dynamic[i] = kcp; } /* * Prepare to convert all early noted kcpuset uses to dynamic sets. * All processors, except the one we are currently running (primary), * must not be spinned yet. Since MD facilities can use kcpuset, * raise the IPL to high. */ KASSERT(mp_online == false); s = splhigh(); for (i = 0; i < kc_last_idx; i++) { /* * Transfer the bits from early static storage to the kcpuset. 
*/ KASSERT(kc_bitsize >= KC_BITSIZE_EARLY); memcpy(kc_dynamic[i], &kc_bits_early[i], KC_BITSIZE_EARLY); /* * Store the new pointer, pointing to the allocated kcpuset. * Note: we are not in an interrupt context and it is the only * CPU running - thus store is safe (e.g. no need for pointer * variable to be volatile). */ *kc_noted_early[i] = kc_dynamic[i]; } kc_initialised = true; kc_last_idx = 0; splx(s); } /* * kcpuset_early_ptr: note an early boot use by saving the pointer and * returning a pointer to a static, temporary bit field. */ static kcpuset_t * kcpuset_early_ptr(kcpuset_t **kcptr) { kcpuset_t *kcp; int s; s = splhigh(); if (kc_last_idx < KC_SAVE_NITEMS) { /* * Save the pointer, return pointer to static early field. * Need to zero it out. */ kc_noted_early[kc_last_idx] = kcptr; kcp = (kcpuset_t *)&kc_bits_early[kc_last_idx]; kc_last_idx++; memset(kcp, 0, KC_BITSIZE_EARLY); KASSERT(kc_bitsize == KC_BITSIZE_EARLY); } else { panic("kcpuset(9): all early-use entries exhausted; " "increase KC_SAVE_NITEMS\n"); } splx(s); return kcp; } /* * Routines to create or destroy the CPU set. * Early boot case is handled. */ static kcpuset_t * kcpuset_create_raw(bool zero) { kcpuset_impl_t *kc; kc = kmem_alloc(kc_memsize, KM_SLEEP); kc->kc_refcnt = 1; kc->kc_next = NULL; if (zero) { memset(&kc->kc_field, 0, kc_bitsize); } /* Note: return pointer to the actual field of bits. */ KASSERT((uint8_t *)kc + KC_BITS_OFF == (uint8_t *)&kc->kc_field); return &kc->kc_field; } void kcpuset_create(kcpuset_t **retkcp, bool zero) { if (__predict_false(!kc_initialised)) { /* Early boot use - special case. */ *retkcp = kcpuset_early_ptr(retkcp); return; } *retkcp = kcpuset_create_raw(zero); } void kcpuset_clone(kcpuset_t **retkcp, const kcpuset_t *kcp) { kcpuset_create(retkcp, false); memcpy(*retkcp, kcp, kc_bitsize); } void kcpuset_destroy(kcpuset_t *kcp) { const size_t size = kc_memsize; kcpuset_impl_t *kc; KASSERT(kc_initialised); KASSERT(kcp != NULL); do { kc = KC_GETSTRUCT(kcp); kcp = kc->kc_next; kmem_free(kc, size); } while (kcp); } /* * Routines to reference/unreference the CPU set. * Note: early boot case is not supported by these routines. */ void kcpuset_use(kcpuset_t *kcp) { kcpuset_impl_t *kc = KC_GETSTRUCT(kcp); KASSERT(kc_initialised); atomic_inc_uint(&kc->kc_refcnt); } void kcpuset_unuse(kcpuset_t *kcp, kcpuset_t **lst) { kcpuset_impl_t *kc = KC_GETSTRUCT(kcp); KASSERT(kc_initialised); KASSERT(kc->kc_refcnt > 0); membar_release(); if (atomic_dec_uint_nv(&kc->kc_refcnt) != 0) { return; } membar_acquire(); KASSERT(kc->kc_next == NULL); if (lst == NULL) { kcpuset_destroy(kcp); return; } kc->kc_next = *lst; *lst = kcp; } /* * Routines to transfer the CPU set from / to userspace. * Note: early boot case is not supported by these routines. 
*/ int kcpuset_copyin(const cpuset_t *ucp, kcpuset_t *kcp, size_t len) { kcpuset_impl_t *kc __diagused = KC_GETSTRUCT(kcp); KASSERT(kc_initialised); KASSERT(kc->kc_refcnt > 0); KASSERT(kc->kc_next == NULL); if (len > kc_bitsize) { /* XXX */ return EINVAL; } return copyin(ucp, kcp, len); } int kcpuset_copyout(kcpuset_t *kcp, cpuset_t *ucp, size_t len) { kcpuset_impl_t *kc __diagused = KC_GETSTRUCT(kcp); KASSERT(kc_initialised); KASSERT(kc->kc_refcnt > 0); KASSERT(kc->kc_next == NULL); if (len > kc_bitsize) { /* XXX */ return EINVAL; } return copyout(kcp, ucp, len); } void kcpuset_export_u32(const kcpuset_t *kcp, uint32_t *bitfield, size_t len) { size_t rlen = MIN(kc_bitsize, len); KASSERT(kcp != NULL); memcpy(bitfield, kcp->bits, rlen); } /* * Routines to change bit field - zero, fill, copy, set, unset, etc. */ void kcpuset_zero(kcpuset_t *kcp) { KASSERT(!kc_initialised || KC_GETSTRUCT(kcp)->kc_refcnt > 0); KASSERT(!kc_initialised || KC_GETSTRUCT(kcp)->kc_next == NULL); memset(kcp, 0, kc_bitsize); } void kcpuset_fill(kcpuset_t *kcp) { KASSERT(!kc_initialised || KC_GETSTRUCT(kcp)->kc_refcnt > 0); KASSERT(!kc_initialised || KC_GETSTRUCT(kcp)->kc_next == NULL); memset(kcp, ~0, kc_bitsize); } void kcpuset_copy(kcpuset_t *dkcp, const kcpuset_t *skcp) { KASSERT(!kc_initialised || KC_GETSTRUCT(dkcp)->kc_refcnt > 0); KASSERT(!kc_initialised || KC_GETSTRUCT(dkcp)->kc_next == NULL); memcpy(dkcp, skcp, kc_bitsize); } void kcpuset_set(kcpuset_t *kcp, cpuid_t i) { const size_t j = i >> KC_SHIFT; KASSERT(!kc_initialised || KC_GETSTRUCT(kcp)->kc_next == NULL); KASSERT(j < kc_nfields); kcp->bits[j] |= __BIT(i & KC_MASK); } void kcpuset_clear(kcpuset_t *kcp, cpuid_t i) { const size_t j = i >> KC_SHIFT; KASSERT(!kc_initialised || KC_GETCSTRUCT(kcp)->kc_next == NULL); KASSERT(j < kc_nfields); kcp->bits[j] &= ~(__BIT(i & KC_MASK)); } bool kcpuset_isset(const kcpuset_t *kcp, cpuid_t i) { const size_t j = i >> KC_SHIFT; KASSERT(kcp != NULL); KASSERT(!kc_initialised || KC_GETCSTRUCT(kcp)->kc_refcnt > 0); KASSERT(!kc_initialised || KC_GETCSTRUCT(kcp)->kc_next == NULL); KASSERT(j < kc_nfields); return ((__BIT(i & KC_MASK)) & kcp->bits[j]) != 0; } bool kcpuset_isotherset(const kcpuset_t *kcp, cpuid_t i) { const size_t j2 = i >> KC_SHIFT; const uint32_t mask = ~(__BIT(i & KC_MASK)); for (size_t j = 0; j < kc_nfields; j++) { const uint32_t bits = kcp->bits[j]; if (bits && (j != j2 || (bits & mask) != 0)) { return true; } } return false; } bool kcpuset_iszero(const kcpuset_t *kcp) { for (size_t j = 0; j < kc_nfields; j++) { if (kcp->bits[j] != 0) { return false; } } return true; } bool kcpuset_match(const kcpuset_t *kcp1, const kcpuset_t *kcp2) { return memcmp(kcp1, kcp2, kc_bitsize) == 0; } bool kcpuset_intersecting_p(const kcpuset_t *kcp1, const kcpuset_t *kcp2) { for (size_t j = 0; j < kc_nfields; j++) { if (kcp1->bits[j] & kcp2->bits[j]) return true; } return false; } cpuid_t kcpuset_ffs(const kcpuset_t *kcp) { for (size_t j = 0; j < kc_nfields; j++) { if (kcp->bits[j]) return 32 * j + ffs(kcp->bits[j]); } return 0; } cpuid_t kcpuset_ffs_intersecting(const kcpuset_t *kcp1, const kcpuset_t *kcp2) { for (size_t j = 0; j < kc_nfields; j++) { uint32_t bits = kcp1->bits[j] & kcp2->bits[j]; if (bits) return 32 * j + ffs(bits); } return 0; } void kcpuset_merge(kcpuset_t *kcp1, const kcpuset_t *kcp2) { for (size_t j = 0; j < kc_nfields; j++) { kcp1->bits[j] |= kcp2->bits[j]; } } void kcpuset_intersect(kcpuset_t *kcp1, const kcpuset_t *kcp2) { for (size_t j = 0; j < kc_nfields; j++) { kcp1->bits[j] &= kcp2->bits[j]; } } 
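/*
 * Illustrative sketch (not part of subr_kcpuset.c): kcpuset_set(),
 * kcpuset_clear() and kcpuset_isset() above all locate a CPU with the same
 * arithmetic -- the word index is the CPU id shifted right by KC_SHIFT (5,
 * since each word holds 32 bits) and the bit within that word is selected
 * with (id & KC_MASK).  The stand-alone program below reproduces that
 * mapping with plain uint32_t words.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define SHIFT	5	/* log2 of bits per word */
#define MASK	31	/* bit position within a word */

static void
bits_set(uint32_t *bits, unsigned cpu)
{
	bits[cpu >> SHIFT] |= 1u << (cpu & MASK);
}

static bool
bits_isset(const uint32_t *bits, unsigned cpu)
{
	return (bits[cpu >> SHIFT] & (1u << (cpu & MASK))) != 0;
}

int
main(void)
{
	uint32_t bits[4] = { 0 };	/* room for 128 CPUs */

	bits_set(bits, 3);		/* word 0, bit 3 */
	bits_set(bits, 70);		/* word 2, bit 6 */
	printf("%d %d %d\n", bits_isset(bits, 3), bits_isset(bits, 70),
	    bits_isset(bits, 71));	/* prints: 1 1 0 */
	return 0;
}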
void kcpuset_remove(kcpuset_t *kcp1, const kcpuset_t *kcp2) { for (size_t j = 0; j < kc_nfields; j++) { kcp1->bits[j] &= ~kcp2->bits[j]; } } int kcpuset_countset(const kcpuset_t *kcp) { int count = 0; for (size_t j = 0; j < kc_nfields; j++) { count += popcount32(kcp->bits[j]); } return count; } /* * Routines to set/clear the flags atomically. */ void kcpuset_atomic_set(kcpuset_t *kcp, cpuid_t i) { const size_t j = i >> KC_SHIFT; KASSERT(j < kc_nfields); atomic_or_32(&kcp->bits[j], __BIT(i & KC_MASK)); } void kcpuset_atomic_clear(kcpuset_t *kcp, cpuid_t i) { const size_t j = i >> KC_SHIFT; KASSERT(j < kc_nfields); atomic_and_32(&kcp->bits[j], ~(__BIT(i & KC_MASK))); } void kcpuset_atomicly_intersect(kcpuset_t *kcp1, const kcpuset_t *kcp2) { for (size_t j = 0; j < kc_nfields; j++) { if (kcp2->bits[j]) atomic_and_32(&kcp1->bits[j], kcp2->bits[j]); } } void kcpuset_atomicly_merge(kcpuset_t *kcp1, const kcpuset_t *kcp2) { for (size_t j = 0; j < kc_nfields; j++) { if (kcp2->bits[j]) atomic_or_32(&kcp1->bits[j], kcp2->bits[j]); } } void kcpuset_atomicly_remove(kcpuset_t *kcp1, const kcpuset_t *kcp2) { for (size_t j = 0; j < kc_nfields; j++) { if (kcp2->bits[j]) atomic_and_32(&kcp1->bits[j], ~kcp2->bits[j]); } }
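/*
 * Illustrative sketch (not part of subr_kcpuset.c): typical kernel-side use
 * of the API implemented above -- allocate a zeroed set, mark a few CPUs,
 * query it, and release it.  This is a hypothetical fragment for a kernel
 * context after kcpuset_sysinit() has run; it is not a stand-alone program,
 * and the function name kcpuset_example is invented for the example.
 */
#include <sys/kcpuset.h>

static void
kcpuset_example(void)
{
	kcpuset_t *kcp;

	kcpuset_create(&kcp, true);	/* zeroed set; sleeps for memory */
	kcpuset_set(kcp, 0);
	kcpuset_set(kcp, 2);

	if (kcpuset_isset(kcp, 2))
		printf("CPU 2 is in the set (%d CPUs total)\n",
		    kcpuset_countset(kcp));

	kcpuset_destroy(kcp);		/* sole reference: frees the memory */
}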
/* $NetBSD: null_vfsops.c,v 1.101 2023/02/06 10:32:58 hannken Exp $ */ /* * Copyright (c) 1999 National Aeronautics & Space Administration * All rights reserved. * * This software was written by William Studenmund of the * Numerical Aerospace Simulation Facility, NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the National Aeronautics & Space Administration * nor the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB- * UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp * from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92 * @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95 */ /* * Null file-system: VFS operations. * * See null_vnops.c for a description. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: null_vfsops.c,v 1.101 2023/02/06 10:32:58 hannken Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/module.h> #include <miscfs/nullfs/null.h> #include <miscfs/genfs/layer_extern.h> MODULE(MODULE_CLASS_VFS, null, "layerfs"); VFS_PROTOS(nullfs); int nullfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct vnode *lowerrootvp, *vp; struct null_args *args = data; struct null_mount *nmp; struct layer_mount *lmp; struct pathbuf *pb; struct nameidata nd; int error; if (args == NULL) return EINVAL; if (*data_len < sizeof(*args)) return EINVAL; if (mp->mnt_flag & MNT_GETARGS) { lmp = MOUNTTOLAYERMOUNT(mp); if (lmp == NULL) return EIO; args->la.target = NULL; *data_len = sizeof(*args); return 0; } /* Update is not supported. */ if (mp->mnt_flag & MNT_UPDATE) return EOPNOTSUPP; /* Find the lower vnode and lock it. */ error = pathbuf_copyin(args->la.target, &pb); if (error) { return error; } NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, pb); if ((error = namei(&nd)) != 0) { pathbuf_destroy(pb); return error; } lowerrootvp = nd.ni_vp; pathbuf_destroy(pb); /* Create the mount point. */ nmp = kmem_zalloc(sizeof(struct null_mount), KM_SLEEP); mp->mnt_data = nmp; mp->mnt_iflag |= lowerrootvp->v_mount->mnt_iflag & IMNT_MPSAFE; mp->mnt_iflag |= lowerrootvp->v_mount->mnt_iflag & IMNT_SHRLOOKUP; /* * Make sure that the mount point is sufficiently initialized * that the node create call will work. */ vfs_getnewfsid(mp); error = vfs_set_lowermount(mp, lowerrootvp->v_mount); if (error) { vput(lowerrootvp); kmem_free(nmp, sizeof(struct null_mount)); return error; } nmp->nullm_size = sizeof(struct null_node); nmp->nullm_tag = VT_NULL; nmp->nullm_bypass = layer_bypass; nmp->nullm_vnodeop_p = null_vnodeop_p; /* Setup a null node for root vnode. */ VOP_UNLOCK(lowerrootvp); error = layer_node_create(mp, lowerrootvp, &vp); if (error) { vrele(lowerrootvp); kmem_free(nmp, sizeof(struct null_mount)); return error; } /* * Keep a held reference to the root vnode. It will be released on * umount. Note: nullfs is MP-safe. 
*/ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_ROOT; nmp->nullm_rootvp = vp; VOP_UNLOCK(vp); error = set_statvfs_info(path, UIO_USERSPACE, args->la.target, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, curlwp); if (error) return error; if (mp->mnt_lower->mnt_flag & MNT_LOCAL) mp->mnt_flag |= MNT_LOCAL; return 0; } int nullfs_unmount(struct mount *mp, int mntflags) { struct null_mount *nmp = MOUNTTONULLMOUNT(mp); struct vnode *null_rootvp = nmp->nullm_rootvp; int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if (vrefcnt(null_rootvp) > 1 && (mntflags & MNT_FORCE) == 0) return EBUSY; if ((error = vflush(mp, null_rootvp, flags)) != 0) return error; /* Eliminate all activity and release the vnode. */ vgone(null_rootvp); /* Finally, destroy the mount point structures. */ kmem_free(mp->mnt_data, sizeof(struct null_mount)); mp->mnt_data = NULL; return 0; } extern const struct vnodeopv_desc null_vnodeop_opv_desc; const struct vnodeopv_desc * const nullfs_vnodeopv_descs[] = { &null_vnodeop_opv_desc, NULL, }; struct vfsops nullfs_vfsops = { .vfs_name = MOUNT_NULL, .vfs_min_mount_data = sizeof (struct null_args), .vfs_mount = nullfs_mount, .vfs_start = layerfs_start, .vfs_unmount = nullfs_unmount, .vfs_root = layerfs_root, .vfs_quotactl = layerfs_quotactl, .vfs_statvfs = layerfs_statvfs, .vfs_sync = layerfs_sync, .vfs_loadvnode = layerfs_loadvnode, .vfs_vget = layerfs_vget, .vfs_fhtovp = layerfs_fhtovp, .vfs_vptofh = layerfs_vptofh, .vfs_init = layerfs_init, .vfs_done = layerfs_done, .vfs_snapshot = layerfs_snapshot, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = layerfs_suspendctl, .vfs_renamelock_enter = layerfs_renamelock_enter, .vfs_renamelock_exit = layerfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = nullfs_vnodeopv_descs }; SYSCTL_SETUP(nullfs_sysctl_setup, "nullfs sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "null", SYSCTL_DESCR("Loopback file system"), NULL, 0, NULL, 0, CTL_VFS, 9, CTL_EOL); /* * XXX the "9" above could be dynamic, thereby eliminating * one more instance of the "number to vfs" mapping problem, * but "9" is the order as taken from sys/mount.h */ } static int null_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&nullfs_vfsops); if (error != 0) break; break; case MODULE_CMD_FINI: error = vfs_detach(&nullfs_vfsops); if (error != 0) break; break; default: error = ENOTTY; break; } return error; }
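nullfs_mount() expects a struct null_args whose la.target names the lower directory to be re-exposed at the mount point; everything else is delegated to the generic layerfs code. A hedged user-space sketch of what mount_null(8) effectively does with NetBSD's five-argument mount(2) follows; the paths are examples only:

#include <sys/param.h>
#include <sys/mount.h>
#include <miscfs/nullfs/null.h>	/* struct null_args */
#include <err.h>
#include <string.h>

int
main(void)
{
	struct null_args args;

	memset(&args, 0, sizeof(args));
	args.la.target = "/usr/src";	/* lower layer to loop back */

	/* MOUNT_NULL is "null"; data_len must cover sizeof(args). */
	if (mount(MOUNT_NULL, "/mnt/src", 0, &args, sizeof(args)) == -1)
		err(1, "mount");
	return 0;
}

The same effect from the shell should be achievable with mount -t null /usr/src /mnt/src, which is the usual front end for this code path.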
/* $NetBSD: 
wsmux.c,v 1.66 2022/03/28 12:38:58 riastradh Exp $ */ /* * Copyright (c) 1998, 2005 The NetBSD Foundation, Inc. * All rights reserved. * * Author: Lennart Augustsson <lennart@augustsson.net> * Carlstedt Research & Technology * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * wscons mux device. * * The mux device is a collection of real mice and keyboards and acts as * a merge point for all the events from the different real devices. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: wsmux.c,v 1.66 2022/03/28 12:38:58 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_modular.h" #endif #include "wsdisplay.h" #include "wsmux.h" #include "wskbd.h" #include "wsmouse.h" #include <sys/param.h> #include <sys/conf.h> #include <sys/ioctl.h> #include <sys/poll.h> #include <sys/fcntl.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/proc.h> #include <sys/queue.h> #include <sys/syslog.h> #include <sys/systm.h> #include <sys/tty.h> #include <sys/signalvar.h> #include <sys/device.h> #include <sys/device_impl.h> /* XXX autoconf abuse */ #include "opt_wsdisplay_compat.h" #include <dev/wscons/wsconsio.h> #include <dev/wscons/wsksymdef.h> #include <dev/wscons/wseventvar.h> #include <dev/wscons/wscons_callbacks.h> #include <dev/wscons/wsmuxvar.h> #include "ioconf.h" #ifdef WSMUX_DEBUG #define DPRINTF(x) if (wsmuxdebug) printf x #define DPRINTFN(n,x) if (wsmuxdebug > (n)) printf x int wsmuxdebug = 0; #else #define DPRINTF(x) #define DPRINTFN(n,x) #endif /* * The wsmux pseudo device is used to multiplex events from several wsmouse, * wskbd, and/or wsmux devices together. * The devices connected together form a tree with muxes in the interior * and real devices (mouse and kbd) at the leaves. The special case of * a tree with one node (mux or other) is supported as well. * Only the device at the root of the tree can be opened (if a non-root * device is opened the subtree rooted at that point is severed from the * containing tree). When the root is opened it allocates a wseventvar * struct which all the nodes in the tree will send their events too. * An ioctl() performed on the root is propagated to all the nodes. * There are also ioctl() operations to add and remove nodes from a tree. 
*/ static int wsmux_mux_open(struct wsevsrc *, struct wseventvar *); static int wsmux_mux_close(struct wsevsrc *); static void wsmux_do_open(struct wsmux_softc *, struct wseventvar *); static void wsmux_do_close(struct wsmux_softc *); #if NWSDISPLAY > 0 static int wsmux_evsrc_set_display(device_t, struct wsevsrc *); #else #define wsmux_evsrc_set_display NULL #endif static int wsmux_do_displayioctl(device_t dev, u_long cmd, void *data, int flag, struct lwp *l); static int wsmux_do_ioctl(device_t, u_long, void *,int,struct lwp *); static int wsmux_add_mux(int, struct wsmux_softc *); #define WSMUXDEV(n) ((n) & 0x7f) #define WSMUXCTL(n) ((n) & 0x80) dev_type_open(wsmuxopen); dev_type_close(wsmuxclose); dev_type_read(wsmuxread); dev_type_ioctl(wsmuxioctl); dev_type_poll(wsmuxpoll); dev_type_kqfilter(wsmuxkqfilter); const struct cdevsw wsmux_cdevsw = { .d_open = wsmuxopen, .d_close = wsmuxclose, .d_read = wsmuxread, .d_write = nowrite, .d_ioctl = wsmuxioctl, .d_stop = nostop, .d_tty = notty, .d_poll = wsmuxpoll, .d_mmap = nommap, .d_kqfilter = wsmuxkqfilter, .d_discard = nodiscard, .d_flag = D_OTHER }; struct wssrcops wsmux_srcops = { WSMUX_MUX, wsmux_mux_open, wsmux_mux_close, wsmux_do_ioctl, wsmux_do_displayioctl, wsmux_evsrc_set_display }; /* From upper level */ void wsmuxattach(int n) { } /* Keep track of all muxes that have been allocated */ static struct wsmux_softc **wsmuxdevs = NULL; static int nwsmux = 0; /* Return mux n, create if necessary */ struct wsmux_softc * wsmux_getmux(int n) { struct wsmux_softc *sc; n = WSMUXDEV(n); /* limit range */ /* Make sure there is room for mux n in the table */ if (n >= nwsmux) { void *new; new = realloc(wsmuxdevs, (n + 1) * sizeof(*wsmuxdevs), M_DEVBUF, M_ZERO | M_WAITOK); wsmuxdevs = new; nwsmux = n + 1; } sc = wsmuxdevs[n]; if (sc == NULL) { sc = wsmux_create("wsmux", n); wsmuxdevs[n] = sc; } return (sc); } /* * open() of the pseudo device from device table. */ int wsmuxopen(dev_t dev, int flags, int mode, struct lwp *l) { struct wsmux_softc *sc; struct wseventvar *evar; int minr, unit; minr = minor(dev); unit = WSMUXDEV(minr); sc = wsmux_getmux(unit); if (sc == NULL) return (ENXIO); DPRINTF(("wsmuxopen: %s: sc=%p l=%p\n", device_xname(sc->sc_base.me_dv), sc, l)); if (WSMUXCTL(minr)) { /* This is the control device which does not allow reads. */ if (flags & FREAD) return (EINVAL); return (0); } if ((flags & (FREAD | FWRITE)) == FWRITE) /* Allow write only open */ return (0); if (sc->sc_base.me_parent != NULL) { /* Grab the mux out of the greedy hands of the parent mux. */ DPRINTF(("wsmuxopen: detach\n")); wsmux_detach_sc(&sc->sc_base); } if (sc->sc_base.me_evp != NULL) /* Already open. */ return (EBUSY); evar = &sc->sc_base.me_evar; wsevent_init(evar, l->l_proc); #ifdef WSDISPLAY_COMPAT_RAWKBD sc->sc_rawkbd = 0; #endif wsmux_do_open(sc, evar); return (0); } /* * Open of a mux via the parent mux. */ int wsmux_mux_open(struct wsevsrc *me, struct wseventvar *evar) { struct wsmux_softc *sc = (struct wsmux_softc *)me; #ifdef DIAGNOSTIC if (sc->sc_base.me_evp != NULL) { printf("wsmux_mux_open: busy\n"); return (EBUSY); } if (sc->sc_base.me_parent == NULL) { printf("wsmux_mux_open: no parent\n"); return (EINVAL); } #endif wsmux_do_open(sc, evar); return (0); } /* Common part of opening a mux. */ void wsmux_do_open(struct wsmux_softc *sc, struct wseventvar *evar) { struct wsevsrc *me; sc->sc_base.me_evp = evar; /* remember event variable, mark as open */ /* Open all children. 
*/ TAILQ_FOREACH(me, &sc->sc_cld, me_next) { DPRINTF(("wsmuxopen: %s: m=%p dev=%s\n", device_xname(sc->sc_base.me_dv), me, device_xname(me->me_dv))); #ifdef DIAGNOSTIC if (me->me_evp != NULL) { printf("wsmuxopen: dev already in use\n"); continue; } if (me->me_parent != sc) { printf("wsmux_do_open: bad child=%p\n", me); continue; } { int error = wsevsrc_open(me, evar); if (error) { DPRINTF(("wsmuxopen: open failed %d\n", error)); } } #else /* ignore errors, failing children will not be marked open */ (void)wsevsrc_open(me, evar); #endif } } /* * close() of the pseudo device from device table. */ int wsmuxclose(dev_t dev, int flags, int mode, struct lwp *l) { int minr = minor(dev); struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)]; struct wseventvar *evar = sc->sc_base.me_evp; if (WSMUXCTL(minr)) /* control device */ return (0); if (evar == NULL) /* Not open for read */ return (0); wsmux_do_close(sc); sc->sc_base.me_evp = NULL; wsevent_fini(evar); return (0); } /* * Close of a mux via the parent mux. */ int wsmux_mux_close(struct wsevsrc *me) { me->me_evp = NULL; wsmux_do_close((struct wsmux_softc *)me); return (0); } /* Common part of closing a mux. */ void wsmux_do_close(struct wsmux_softc *sc) { struct wsevsrc *me; DPRINTF(("wsmuxclose: %s: sc=%p\n", device_xname(sc->sc_base.me_dv), sc)); /* Close all the children. */ TAILQ_FOREACH(me, &sc->sc_cld, me_next) { DPRINTF(("wsmuxclose %s: m=%p dev=%s\n", device_xname(sc->sc_base.me_dv), me, device_xname(me->me_dv))); #ifdef DIAGNOSTIC if (me->me_parent != sc) { printf("wsmuxclose: bad child=%p\n", me); continue; } #endif (void)wsevsrc_close(me); me->me_evp = NULL; } } /* * read() of the pseudo device from device table. */ int wsmuxread(dev_t dev, struct uio *uio, int flags) { int minr = minor(dev); struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)]; struct wseventvar *evar; int error; if (WSMUXCTL(minr)) { /* control device */ return (EINVAL); } evar = sc->sc_base.me_evp; if (evar == NULL) { #ifdef DIAGNOSTIC /* XXX can we get here? */ printf("wsmuxread: not open\n"); #endif return (EINVAL); } DPRINTFN(5,("wsmuxread: %s event read evar=%p\n", device_xname(sc->sc_base.me_dv), evar)); error = wsevent_read(evar, uio, flags); DPRINTFN(5,("wsmuxread: %s event read ==> error=%d\n", device_xname(sc->sc_base.me_dv), error)); return (error); } /* * ioctl of the pseudo device from device table. */ int wsmuxioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { int u = WSMUXDEV(minor(dev)); return wsmux_do_ioctl(wsmuxdevs[u]->sc_base.me_dv, cmd, data, flag, l); } /* * ioctl of a mux via the parent mux, continuation of wsmuxioctl(). */ int wsmux_do_ioctl(device_t dv, u_long cmd, void *data, int flag, struct lwp *lwp) { struct wsmux_softc *sc = device_private(dv); struct wsevsrc *me; int error, ok; int s, n; struct wseventvar *evar; struct wscons_event event; struct wsmux_device_list *l; DPRINTF(("wsmux_do_ioctl: %s: enter sc=%p, cmd=%08lx\n", device_xname(sc->sc_base.me_dv), sc, cmd)); switch (cmd) { #if defined(COMPAT_50) || defined(MODULAR) case WSMUXIO_OINJECTEVENT: #endif /* defined(COMPAT_50) || defined(MODULAR) */ case WSMUXIO_INJECTEVENT: /* Inject an event, e.g., from moused. */ DPRINTF(("%s: inject\n", device_xname(sc->sc_base.me_dv))); evar = sc->sc_base.me_evp; if (evar == NULL) { /* No event sink, so ignore it. 
*/ DPRINTF(("wsmux_do_ioctl: event ignored\n")); return (0); } s = spltty(); event.type = ((struct wscons_event *)data)->type; event.value = ((struct wscons_event *)data)->value; error = wsevent_inject(evar, &event, 1); splx(s); return error; case WSMUXIO_ADD_DEVICE: #define d ((struct wsmux_device *)data) DPRINTF(("%s: add type=%d, no=%d\n", device_xname(sc->sc_base.me_dv), d->type, d->idx)); switch (d->type) { #if NWSMOUSE > 0 case WSMUX_MOUSE: return (wsmouse_add_mux(d->idx, sc)); #endif #if NWSKBD > 0 case WSMUX_KBD: return (wskbd_add_mux(d->idx, sc)); #endif case WSMUX_MUX: return (wsmux_add_mux(d->idx, sc)); case WSMUX_BELL: return (wsbell_add_mux(d->idx, sc)); default: return (EINVAL); } case WSMUXIO_REMOVE_DEVICE: DPRINTF(("%s: rem type=%d, no=%d\n", device_xname(sc->sc_base.me_dv), d->type, d->idx)); /* Locate the device */ TAILQ_FOREACH(me, &sc->sc_cld, me_next) { if (me->me_ops->type == d->type && device_unit(me->me_dv) == d->idx) { DPRINTF(("wsmux_do_ioctl: detach\n")); wsmux_detach_sc(me); return (0); } } return (EINVAL); #undef d case WSMUXIO_LIST_DEVICES: DPRINTF(("%s: list\n", device_xname(sc->sc_base.me_dv))); l = (struct wsmux_device_list *)data; n = 0; TAILQ_FOREACH(me, &sc->sc_cld, me_next) { if (n >= WSMUX_MAXDEV) break; l->devices[n].type = me->me_ops->type; l->devices[n].idx = device_unit(me->me_dv); n++; } l->ndevices = n; return (0); #ifdef WSDISPLAY_COMPAT_RAWKBD case WSKBDIO_SETMODE: sc->sc_rawkbd = *(int *)data; DPRINTF(("wsmux_do_ioctl: save rawkbd = %d\n", sc->sc_rawkbd)); break; #endif case WSKBDIO_SETVERSION: case WSMOUSEIO_SETVERSION: case WSDISPLAYIO_SETVERSION: DPRINTF(("%s: WSxxxIO_SETVERSION\n", device_xname(sc->sc_base.me_dv))); evar = sc->sc_base.me_evp; if (evar == NULL) return (EINVAL); return wsevent_setversion(evar, *(int *)data); case FIONBIO: DPRINTF(("%s: FIONBIO\n", device_xname(sc->sc_base.me_dv))); return (0); case FIOASYNC: DPRINTF(("%s: FIOASYNC\n", device_xname(sc->sc_base.me_dv))); evar = sc->sc_base.me_evp; if (evar == NULL) return (EINVAL); evar->async = *(int *)data != 0; return (0); case FIOSETOWN: DPRINTF(("%s: FIOSETOWN\n", device_xname(sc->sc_base.me_dv))); evar = sc->sc_base.me_evp; if (evar == NULL) return (EINVAL); if (-*(int *)data != evar->io->p_pgid && *(int *)data != evar->io->p_pid) return (EPERM); return (0); case TIOCSPGRP: DPRINTF(("%s: TIOCSPGRP\n", device_xname(sc->sc_base.me_dv))); evar = sc->sc_base.me_evp; if (evar == NULL) return (EINVAL); if (*(int *)data != evar->io->p_pgid) return (EPERM); return (0); default: DPRINTF(("%s: unknown\n", device_xname(sc->sc_base.me_dv))); break; } if (sc->sc_base.me_evp == NULL #if NWSDISPLAY > 0 && sc->sc_base.me_dispdv == NULL #endif ) return (EACCES); /* Return 0 if any of the ioctl() succeeds, otherwise the last error */ error = 0; ok = 0; TAILQ_FOREACH(me, &sc->sc_cld, me_next) { #ifdef DIAGNOSTIC /* XXX check evp? */ if (me->me_parent != sc) { printf("wsmux_do_ioctl: bad child %p\n", me); continue; } #endif error = wsevsrc_ioctl(me, cmd, data, flag, lwp); DPRINTF(("wsmux_do_ioctl: %s: me=%p dev=%s ==> %d\n", device_xname(sc->sc_base.me_dv), me, device_xname(me->me_dv), error)); if (!error) ok = 1; } if (ok) { error = 0; if (cmd == WSKBDIO_SETENCODING) { sc->sc_kbd_layout = *((kbd_t *)data); } } return (error); } /* * poll() of the pseudo device from device table. 
*/ int wsmuxpoll(dev_t dev, int events, struct lwp *l) { int minr = minor(dev); struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)]; if (WSMUXCTL(minr)) { /* control device */ return (0); } if (sc->sc_base.me_evp == NULL) { #ifdef DIAGNOSTIC printf("wsmuxpoll: not open\n"); #endif return (POLLHUP); } return (wsevent_poll(sc->sc_base.me_evp, events, l)); } /* * kqfilter() of the pseudo device from device table. */ int wsmuxkqfilter(dev_t dev, struct knote *kn) { int minr = minor(dev); struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)]; if (WSMUXCTL(minr)) { /* control device */ return (1); } if (sc->sc_base.me_evp == NULL) { #ifdef DIAGNOSTIC printf("wsmuxkqfilter: not open\n"); #endif return (1); } return (wsevent_kqfilter(sc->sc_base.me_evp, kn)); } /* * Add mux unit as a child to muxsc. */ int wsmux_add_mux(int unit, struct wsmux_softc *muxsc) { struct wsmux_softc *sc, *m; sc = wsmux_getmux(unit); if (sc == NULL) return (ENXIO); DPRINTF(("wsmux_add_mux: %s(%p) to %s(%p)\n", device_xname(sc->sc_base.me_dv), sc, device_xname(muxsc->sc_base.me_dv), muxsc)); if (sc->sc_base.me_parent != NULL || sc->sc_base.me_evp != NULL) return (EBUSY); /* The mux we are adding must not be an ancestor of itself. */ for (m = muxsc; m != NULL ; m = m->sc_base.me_parent) if (m == sc) return (EINVAL); return (wsmux_attach_sc(muxsc, &sc->sc_base)); } /* Create a new mux softc. */ struct wsmux_softc * wsmux_create(const char *name, int unit) { struct wsmux_softc *sc; /* XXX This is wrong -- should use autoconfiguration framework */ DPRINTF(("wsmux_create: allocating\n")); sc = malloc(sizeof *sc, M_DEVBUF, M_WAITOK|M_ZERO); sc->sc_base.me_dv = malloc(sizeof(struct device), M_DEVBUF, M_WAITOK|M_ZERO); TAILQ_INIT(&sc->sc_cld); snprintf(sc->sc_base.me_dv->dv_xname, sizeof sc->sc_base.me_dv->dv_xname, "%s%d", name, unit); sc->sc_base.me_dv->dv_private = sc; sc->sc_base.me_dv->dv_unit = unit; sc->sc_base.me_ops = &wsmux_srcops; sc->sc_kbd_layout = KB_NONE; return (sc); } /* Attach me as a child to sc. */ int wsmux_attach_sc(struct wsmux_softc *sc, struct wsevsrc *me) { int error; if (sc == NULL) return (EINVAL); DPRINTF(("wsmux_attach_sc: %s(%p): type=%d\n", device_xname(sc->sc_base.me_dv), sc, me->me_ops->type)); #ifdef DIAGNOSTIC if (me->me_parent != NULL) { printf("wsmux_attach_sc: busy\n"); return (EBUSY); } #endif me->me_parent = sc; TAILQ_INSERT_TAIL(&sc->sc_cld, me, me_next); error = 0; #if NWSDISPLAY > 0 if (sc->sc_base.me_dispdv != NULL) { /* This is a display mux, so attach the new device to it. */ DPRINTF(("wsmux_attach_sc: %s: set display %p\n", device_xname(sc->sc_base.me_dv), sc->sc_base.me_dispdv)); if (me->me_ops->dsetdisplay != NULL) { error = wsevsrc_set_display(me, &sc->sc_base); /* Ignore that the console already has a display. 
*/ if (error == EBUSY) error = 0; if (!error) { #ifdef WSDISPLAY_COMPAT_RAWKBD DPRINTF(("wsmux_attach_sc: %s set rawkbd=%d\n", device_xname(me->me_dv), sc->sc_rawkbd)); (void)wsevsrc_ioctl(me, WSKBDIO_SETMODE, &sc->sc_rawkbd, 0, 0); #endif if (sc->sc_kbd_layout != KB_NONE) (void)wsevsrc_ioctl(me, WSKBDIO_SETENCODING, &sc->sc_kbd_layout, FWRITE, 0); } } } #endif if (sc->sc_base.me_evp != NULL) { /* Mux is open, so open the new subdevice */ DPRINTF(("wsmux_attach_sc: %s: calling open of %s\n", device_xname(sc->sc_base.me_dv), device_xname(me->me_dv))); error = wsevsrc_open(me, sc->sc_base.me_evp); } else { DPRINTF(("wsmux_attach_sc: %s not open\n", device_xname(sc->sc_base.me_dv))); } if (error) { me->me_parent = NULL; TAILQ_REMOVE(&sc->sc_cld, me, me_next); } DPRINTF(("wsmux_attach_sc: %s(%p) done, error=%d\n", device_xname(sc->sc_base.me_dv), sc, error)); return (error); } /* Remove me from the parent. */ void wsmux_detach_sc(struct wsevsrc *me) { struct wsmux_softc *sc = me->me_parent; DPRINTF(("wsmux_detach_sc: %s(%p) parent=%p\n", device_xname(me->me_dv), me, sc)); #ifdef DIAGNOSTIC if (sc == NULL) { printf("wsmux_detach_sc: %s has no parent\n", device_xname(me->me_dv)); return; } #endif #if NWSDISPLAY > 0 if (sc->sc_base.me_dispdv != NULL) { if (me->me_ops->dsetdisplay != NULL) /* ignore error, there's nothing we can do */ (void)wsevsrc_set_display(me, NULL); } else #endif if (me->me_evp != NULL) { DPRINTF(("wsmux_detach_sc: close\n")); /* mux device is open, so close multiplexee */ (void)wsevsrc_close(me); } TAILQ_REMOVE(&sc->sc_cld, me, me_next); me->me_parent = NULL; DPRINTF(("wsmux_detach_sc: done sc=%p\n", sc)); } /* * Display ioctl() of a mux via the parent mux. */ int wsmux_do_displayioctl(device_t dv, u_long cmd, void *data, int flag, struct lwp *l) { struct wsmux_softc *sc = device_private(dv); struct wsevsrc *me; int error, ok; DPRINTF(("wsmux_displayioctl: %s: sc=%p, cmd=%08lx\n", device_xname(sc->sc_base.me_dv), sc, cmd)); #ifdef WSDISPLAY_COMPAT_RAWKBD if (cmd == WSKBDIO_SETMODE) { sc->sc_rawkbd = *(int *)data; DPRINTF(("wsmux_displayioctl: rawkbd = %d\n", sc->sc_rawkbd)); } #endif /* * Return 0 if any of the ioctl() succeeds, otherwise the last error. * Return EPASSTHROUGH if no mux component accepts the ioctl. */ error = EPASSTHROUGH; ok = 0; TAILQ_FOREACH(me, &sc->sc_cld, me_next) { DPRINTF(("wsmux_displayioctl: me=%p\n", me)); #ifdef DIAGNOSTIC if (me->me_parent != sc) { printf("wsmux_displayioctl: bad child %p\n", me); continue; } #endif if (me->me_ops->ddispioctl != NULL) { error = wsevsrc_display_ioctl(me, cmd, data, flag, l); DPRINTF(("wsmux_displayioctl: me=%p dev=%s ==> %d\n", me, device_xname(me->me_dv), error)); if (!error) ok = 1; } } if (ok) error = 0; return (error); } #if NWSDISPLAY > 0 /* * Set display of a mux via the parent mux. */ int wsmux_evsrc_set_display(device_t dv, struct wsevsrc *ame) { struct wsmux_softc *muxsc = (struct wsmux_softc *)ame; struct wsmux_softc *sc = device_private(dv); device_t displaydv = muxsc ? muxsc->sc_base.me_dispdv : NULL; DPRINTF(("wsmux_set_display: %s: displaydv=%p\n", device_xname(sc->sc_base.me_dv), displaydv)); if (displaydv != NULL) { if (sc->sc_base.me_dispdv != NULL) return (EBUSY); } else { if (sc->sc_base.me_dispdv == NULL) return (ENXIO); } return wsmux_set_display(sc, displaydv); } int wsmux_set_display(struct wsmux_softc *sc, device_t displaydv) { device_t odisplaydv; struct wsevsrc *me; struct wsmux_softc *nsc = displaydv ? 
sc : NULL; int error, ok; odisplaydv = sc->sc_base.me_dispdv; sc->sc_base.me_dispdv = displaydv; if (displaydv) aprint_verbose_dev(sc->sc_base.me_dv, "connecting to %s\n", device_xname(displaydv)); ok = 0; error = 0; TAILQ_FOREACH(me, &sc->sc_cld,me_next) { #ifdef DIAGNOSTIC if (me->me_parent != sc) { printf("wsmux_set_display: bad child parent %p\n", me); continue; } #endif if (me->me_ops->dsetdisplay != NULL) { error = wsevsrc_set_display(me, &nsc->sc_base); DPRINTF(("wsmux_set_display: m=%p dev=%s error=%d\n", me, device_xname(me->me_dv), error)); if (!error) { ok = 1; #ifdef WSDISPLAY_COMPAT_RAWKBD DPRINTF(("wsmux_set_display: %s set rawkbd=%d\n", device_xname(me->me_dv), sc->sc_rawkbd)); (void)wsevsrc_ioctl(me, WSKBDIO_SETMODE, &sc->sc_rawkbd, 0, 0); #endif } } } if (ok) error = 0; if (displaydv == NULL) aprint_verbose("%s: disconnecting from %s\n", device_xname(sc->sc_base.me_dv), device_xname(odisplaydv)); return (error); } #endif /* NWSDISPLAY > 0 */
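Because only the root of a mux tree can be opened for reading, reconfiguration goes through the control minor (the WSMUXCTL bit), which rejects FREAD opens and accepts ioctls such as WSMUXIO_ADD_DEVICE and WSMUXIO_REMOVE_DEVICE. A hedged user-space sketch of attaching keyboard unit 1 to a mux, roughly what wsmuxctl(8) does; the /dev path is an assumption:

#include <sys/ioctl.h>
#include <dev/wscons/wsconsio.h>	/* struct wsmux_device, WSMUXIO_* */
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	struct wsmux_device d;
	int fd;

	/* Control node path assumed; the control device refuses FREAD opens. */
	fd = open("/dev/wsmuxctl0", O_WRONLY);
	if (fd == -1)
		err(1, "open");

	d.type = WSMUX_KBD;	/* attach wskbd1 as a child of this mux */
	d.idx = 1;
	if (ioctl(fd, WSMUXIO_ADD_DEVICE, &d) == -1)
		err(1, "WSMUXIO_ADD_DEVICE");

	close(fd);
	return 0;
}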
/* $NetBSD: kern_todr.c,v 1.47 2021/04/03 12:06:53 simonb Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and Ralph Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah Hdr: clock.c 1.18 91/01/21 * * @(#)clock.c 8.1 (Berkeley) 6/10/93 */ #include "opt_todr.h" #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_todr.c,v 1.47 2021/04/03 12:06:53 simonb Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/device.h> #include <sys/timetc.h> #include <sys/intr.h> #include <sys/rndsource.h> #include <sys/mutex.h> #include <dev/clock_subr.h> /* hmm.. this should probably move to sys */ static int todr_gettime(todr_chip_handle_t, struct timeval *); static int todr_settime(todr_chip_handle_t, struct timeval *); static kmutex_t todr_mutex; static todr_chip_handle_t todr_handle; static bool todr_initialized; /* The minimum reasonable RTC date before preposterousness */ #define PREPOSTEROUS_YEARS (2021 - POSIX_BASE_YEAR) /* * todr_init: * Initialize TOD clock data. */ void todr_init(void) { mutex_init(&todr_mutex, MUTEX_DEFAULT, IPL_NONE); todr_initialized = true; } /* * todr_lock: * Acquire the TODR lock. */ void todr_lock(void) { mutex_enter(&todr_mutex); } /* * todr_unlock: * Release the TODR lock. */ void todr_unlock(void) { mutex_exit(&todr_mutex); } /* * todr_lock_owned: * Return true if the current thread owns the TODR lock. * This is to be used by diagnostic assertions only. */ bool todr_lock_owned(void) { return mutex_owned(&todr_mutex) ? true : false; } /* * todr_attach: * Attach the clock device to todr_handle. */ void todr_attach(todr_chip_handle_t todr) { /* * todr_init() is called very early in main(), but this is * here to catch a case where todr_attach() is called before * main(). */ KASSERT(todr_initialized); todr_lock(); if (todr_handle) { todr_unlock(); printf("todr_attach: TOD already configured\n"); return; } todr_handle = todr; todr_unlock(); } static bool timeset = false; /* * todr_set_systime: * Set up the system's time. 
The "base" argument is a best-guess * close-enough value to use if the TOD clock is unavailable or * contains garbage. Must be called with the TODR lock held. */ void todr_set_systime(time_t base) { bool badbase = false; bool waszero = (base == 0); bool goodtime = false; bool badrtc = false; struct timespec ts; struct timeval tv; KASSERT(todr_lock_owned()); rnd_add_data(NULL, &base, sizeof(base), 0); if (base < 5 * SECS_PER_COMMON_YEAR) { struct clock_ymdhms basedate; /* * If base is 0, assume filesystem time is just unknown * instead of preposterous. Don't bark. */ if (base != 0) printf("WARNING: preposterous time in file system\n"); /* not going to use it anyway, if the chip is readable */ basedate.dt_year = 2010; basedate.dt_mon = 1; basedate.dt_day = 1; basedate.dt_hour = 12; basedate.dt_min = 0; basedate.dt_sec = 0; base = clock_ymdhms_to_secs(&basedate); badbase = true; } /* * Some ports need to be supplied base in order to fabricate a time_t. */ if (todr_handle) todr_handle->base_time = base; memset(&tv, 0, sizeof(tv)); if ((todr_handle == NULL) || (todr_gettime(todr_handle, &tv) != 0) || (tv.tv_sec < (PREPOSTEROUS_YEARS * SECS_PER_COMMON_YEAR))) { if (todr_handle != NULL) printf("WARNING: preposterous TOD clock time\n"); else printf("WARNING: no TOD clock present\n"); badrtc = true; } else { time_t deltat = tv.tv_sec - base; if (deltat < 0) deltat = -deltat; if (!badbase && deltat >= 2 * SECS_PER_DAY) { if (tv.tv_sec < base) { /* * The clock should never go backwards * relative to filesystem time. If it * does by more than the threshold, * believe the filesystem. */ printf("WARNING: clock lost %" PRId64 " days\n", deltat / SECS_PER_DAY); badrtc = true; } else { aprint_verbose("WARNING: clock gained %" PRId64 " days\n", deltat / SECS_PER_DAY); goodtime = true; } } else { goodtime = true; } rnd_add_data(NULL, &tv, sizeof(tv), 0); } /* if the rtc time is bad, use the filesystem time */ if (badrtc) { if (badbase) { printf("WARNING: using default initial time\n"); } else { printf("WARNING: using filesystem time\n"); } tv.tv_sec = base; tv.tv_usec = 0; } timeset = true; ts.tv_sec = tv.tv_sec; ts.tv_nsec = tv.tv_usec * 1000; tc_setclock(&ts); if (waszero || goodtime) return; printf("WARNING: CHECK AND RESET THE DATE!\n"); } /* * todr_save_systime: * Save the current system time back to the TOD clock. * Must be called with the TODR lock held. */ void todr_save_systime(void) { struct timeval tv; KASSERT(todr_lock_owned()); /* * We might have been called by boot() due to a crash early * on. Don't reset the clock chip if we don't know what time * it is. */ if (!timeset) return; getmicrotime(&tv); if (tv.tv_sec == 0) return; if (todr_handle) if (todr_settime(todr_handle, &tv) != 0) printf("Cannot set TOD clock time\n"); } /* * inittodr: * Legacy wrapper around todr_set_systime(). */ void inittodr(time_t base) { todr_lock(); todr_set_systime(base); todr_unlock(); } /* * resettodr: * Legacy wrapper around todr_save_systime(). */ void resettodr(void) { /* * If we're shutting down, we don't want to get stuck in case * someone was already fiddling with the TOD clock. 
*/ if (shutting_down) { if (mutex_tryenter(&todr_mutex) == 0) { printf("WARNING: Cannot set TOD clock time (busy)\n"); return; } } else { todr_lock(); } todr_save_systime(); todr_unlock(); } #ifdef TODR_DEBUG static void todr_debug(const char *prefix, int rv, struct clock_ymdhms *dt, struct timeval *tvp) { struct timeval tv_val; struct clock_ymdhms dt_val; if (dt == NULL) { clock_secs_to_ymdhms(tvp->tv_sec, &dt_val); dt = &dt_val; } if (tvp == NULL) { tvp = &tv_val; tvp->tv_sec = clock_ymdhms_to_secs(dt); tvp->tv_usec = 0; } printf("%s: rv = %d\n", prefix, rv); printf("%s: rtc_offset = %d\n", prefix, rtc_offset); printf("%s: %4u/%02u/%02u %02u:%02u:%02u, (wday %d) (epoch %u.%06u)\n", prefix, (unsigned)dt->dt_year, dt->dt_mon, dt->dt_day, dt->dt_hour, dt->dt_min, dt->dt_sec, dt->dt_wday, (unsigned)tvp->tv_sec, (unsigned)tvp->tv_usec); } #else /* !TODR_DEBUG */ #define todr_debug(prefix, rv, dt, tvp) #endif /* TODR_DEBUG */ static int todr_wenable(todr_chip_handle_t todr, int onoff) { if (todr->todr_setwen != NULL) return todr->todr_setwen(todr, onoff); return 0; } #define ENABLE_TODR_WRITES() \ do { \ if ((rv = todr_wenable(tch, 1)) != 0) { \ printf("%s: cannot enable TODR writes\n", __func__); \ return rv; \ } \ } while (/*CONSTCOND*/0) #define DISABLE_TODR_WRITES() \ do { \ if (todr_wenable(tch, 0) != 0) \ printf("%s: WARNING: cannot disable TODR writes\n", \ __func__); \ } while (/*CONSTCOND*/0) static int todr_gettime(todr_chip_handle_t tch, struct timeval *tvp) { int rv; /* * Write-enable is used even when reading the TODR because * writing to registers may be required in order to do so. */ if (tch->todr_gettime) { ENABLE_TODR_WRITES(); rv = tch->todr_gettime(tch, tvp); DISABLE_TODR_WRITES(); /* * Some unconverted ports have their own references to * rtc_offset. A converted port must not do that. */ if (rv == 0) tvp->tv_sec += rtc_offset * 60; todr_debug("TODR-GET-SECS", rv, NULL, tvp); return rv; } else if (tch->todr_gettime_ymdhms) { struct clock_ymdhms dt = { 0 }; ENABLE_TODR_WRITES(); rv = tch->todr_gettime_ymdhms(tch, &dt); DISABLE_TODR_WRITES(); todr_debug("TODR-GET-YMDHMS", rv, &dt, NULL); if (rv) return rv; /* * Simple sanity checks. Note that this includes a * value for clocks that can return a leap second. * Note that we don't support double leap seconds, * since this was apparently an error/misunderstanding * on the part of the ISO C committee, and can never * actually occur. If your clock issues us a double * leap second, it must be broken. Ultimately, you'd * have to be trying to read time at precisely that * instant to even notice, so even broken clocks will * work the vast majority of the time. In such a case * it is recommended correction be applied in the * clock driver. */ if (dt.dt_mon < 1 || dt.dt_mon > 12 || dt.dt_day < 1 || dt.dt_day > 31 || dt.dt_hour > 23 || dt.dt_min > 59 || dt.dt_sec > 60) { return EINVAL; } tvp->tv_sec = clock_ymdhms_to_secs(&dt) + rtc_offset * 60; tvp->tv_usec = 0; return tvp->tv_sec < 0 ? 
EINVAL : 0; } return ENXIO; } static int todr_settime(todr_chip_handle_t tch, struct timeval *tvp) { int rv; if (tch->todr_settime) { struct timeval copy = *tvp; copy.tv_sec -= rtc_offset * 60; ENABLE_TODR_WRITES(); rv = tch->todr_settime(tch, &copy); DISABLE_TODR_WRITES(); todr_debug("TODR-SET-SECS", rv, NULL, tvp); return rv; } else if (tch->todr_settime_ymdhms) { struct clock_ymdhms dt; time_t sec = tvp->tv_sec - rtc_offset * 60; if (tvp->tv_usec >= 500000) sec++; clock_secs_to_ymdhms(sec, &dt); ENABLE_TODR_WRITES(); rv = tch->todr_settime_ymdhms(tch, &dt); DISABLE_TODR_WRITES(); todr_debug("TODR-SET-YMDHMS", rv, &dt, NULL); return rv; } return ENXIO; }
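Two details of the TODR glue are easy to miss: todr_gettime()/todr_settime() convert between the chip's notion of time and UTC by adding or subtracting rtc_offset minutes, and todr_set_systime() only believes an RTC reading that is neither preposterous nor more than two days behind the filesystem time. A compact user-space restatement of that acceptance test, assuming SECS_PER_DAY is 86400 as in the kernel's clock headers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SECS_PER_DAY	86400	/* assumed to match sys/clock.h */

/*
 * Mirror of the check in todr_set_systime(): tolerate gains of any size,
 * tolerate small drift in either direction, but refuse a reading that is
 * two or more days behind a trusted base time.
 */
static bool
rtc_plausible(int64_t rtc_secs, int64_t base_secs, bool badbase)
{
	int64_t delta = rtc_secs - base_secs;

	if (delta < 0)
		delta = -delta;
	if (badbase || delta < 2 * SECS_PER_DAY)
		return true;
	return rtc_secs >= base_secs;	/* lost >= 2 days: believe the filesystem */
}

int
main(void)
{
	/* Example: an RTC three days behind the filesystem time is rejected. */
	printf("%d\n", rtc_plausible(1000 * (int64_t)SECS_PER_DAY,
	    1003 * (int64_t)SECS_PER_DAY, false));
	return 0;
}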
/* $NetBSD: uvm_fault.c,v 1.237 2024/03/15 07:09:37 andvar Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: Id: uvm_fault.c,v 1.1.2.23 1998/02/06 05:29:05 chs Exp */ /* * uvm_fault.c: fault handler */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_fault.c,v 1.237 2024/03/15 07:09:37 andvar Exp $"); #include "opt_uvmhist.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/atomic.h> #include <sys/kernel.h> #include <sys/mman.h> #include <uvm/uvm.h> #include <uvm/uvm_pdpolicy.h> #include <uvm/uvm_rndsource.h> /* * * a word on page faults: * * types of page faults we handle: * * CASE 1: upper layer faults CASE 2: lower layer faults * * CASE 1A CASE 1B CASE 2A CASE 2B * read/write1 write>1 read/write +-cow_write/zero * | | | | * +--|--+ +--|--+ +-----+ + | + | +-----+ * amap | V | | ---------> new | | | | ^ | * +-----+ +-----+ +-----+ + | + | +--|--+ * | | | * +-----+ +-----+ +--|--+ | +--|--+ * uobj | d/c | | d/c | | V | +----+ | * +-----+ +-----+ +-----+ +-----+ * * d/c = don't care * * case [0]: layerless fault * no amap or uobj is present. this is an error. * * case [1]: upper layer fault [anon active] * 1A: [read] or [write with anon->an_ref == 1] * I/O takes place in upper level anon and uobj is not touched. * 1B: [write with anon->an_ref > 1] * new anon is alloc'd and data is copied off ["COW"] * * case [2]: lower layer fault [uobj] * 2A: [read on non-NULL uobj] or [write to non-copy_on_write area] * I/O takes place directly in object. * 2B: [write to copy_on_write] or [read on NULL uobj] * data is "promoted" from uobj to a new anon. * if uobj is null, then we zero fill. * * we follow the standard UVM locking protocol ordering: * * MAPS => AMAP => UOBJ => ANON => PAGE QUEUES (PQ) * we hold a PG_BUSY page if we unlock for I/O * * * the code is structured as follows: * * - init the "IN" params in the ufi structure * ReFault: (ERESTART returned to the loop in uvm_fault_internal) * - do lookups [locks maps], check protection, handle needs_copy * - check for case 0 fault (error) * - establish "range" of fault * - if we have an amap lock it and extract the anons * - if sequential advice deactivate pages behind us * - at the same time check pmap for unmapped areas and anon for pages * that we could map in (and do map it if found) * - check object for resident pages that we could map in * - if (case 2) goto Case2 * - >>> handle case 1 * - ensure source anon is resident in RAM * - if case 1B alloc new anon and copy from source * - map the correct page in * Case2: * - >>> handle case 2 * - ensure source page is resident (if uobj) * - if case 2B alloc new anon and copy from source (could be zero * fill if uobj == NULL) * - map the correct page in * - done! * * note on paging: * if we have to do I/O we place a PG_BUSY page in the correct object, * unlock everything, and do the I/O. when I/O is done we must reverify * the state of the world before assuming that our data structures are * valid. [because mappings could change while the map is unlocked] * * alternative 1: unbusy the page in question and restart the page fault * from the top (ReFault). 
this is easy but does not take advantage * of the information that we already have from our previous lookup, * although it is possible that the "hints" in the vm_map will help here. * * alternative 2: the system already keeps track of a "version" number of * a map. [i.e. every time you write-lock a map (e.g. to change a * mapping) you bump the version number up by one...] so, we can save * the version number of the map before we release the lock and start I/O. * then when I/O is done we can relock and check the version numbers * to see if anything changed. this might save us some over 1 because * we don't have to unbusy the page and may be less compares(?). * * alternative 3: put in backpointers or a way to "hold" part of a map * in place while I/O is in progress. this could be complex to * implement (especially with structures like amap that can be referenced * by multiple map entries, and figuring out what should wait could be * complex as well...). * * we use alternative 2. given that we are multi-threaded now we may want * to reconsider the choice. */ /* * local data structures */ struct uvm_advice { int advice; int nback; int nforw; }; /* * page range array: * note: index in array must match "advice" value * XXX: borrowed numbers from freebsd. do they work well for us? */ static const struct uvm_advice uvmadvice[] = { { UVM_ADV_NORMAL, 3, 4 }, { UVM_ADV_RANDOM, 0, 0 }, { UVM_ADV_SEQUENTIAL, 8, 7}, }; #define UVM_MAXRANGE 16 /* must be MAX() of nback+nforw+1 */ /* * private prototypes */ /* * inline functions */ /* * uvmfault_anonflush: try and deactivate pages in specified anons * * => does not have to deactivate page if it is busy */ static inline void uvmfault_anonflush(struct vm_anon **anons, int n) { int lcv; struct vm_page *pg; for (lcv = 0; lcv < n; lcv++) { if (anons[lcv] == NULL) continue; KASSERT(rw_lock_held(anons[lcv]->an_lock)); pg = anons[lcv]->an_page; if (pg && (pg->flags & PG_BUSY) == 0) { uvm_pagelock(pg); uvm_pagedeactivate(pg); uvm_pageunlock(pg); } } } /* * normal functions */ /* * uvmfault_amapcopy: clear "needs_copy" in a map. * * => called with VM data structures unlocked (usually, see below) * => we get a write lock on the maps and clear needs_copy for a VA * => if we are out of RAM we sleep (waiting for more) */ static void uvmfault_amapcopy(struct uvm_faultinfo *ufi) { for (;;) { /* * no mapping? give up. */ if (uvmfault_lookup(ufi, true) == false) return; /* * copy if needed. */ if (UVM_ET_ISNEEDSCOPY(ufi->entry)) amap_copy(ufi->map, ufi->entry, AMAP_COPY_NOWAIT, ufi->orig_rvaddr, ufi->orig_rvaddr + 1); /* * didn't work? must be out of RAM. unlock and sleep. */ if (UVM_ET_ISNEEDSCOPY(ufi->entry)) { uvmfault_unlockmaps(ufi, true); uvm_wait("fltamapcopy"); continue; } /* * got it! unlock and return. */ uvmfault_unlockmaps(ufi, true); return; } /*NOTREACHED*/ } /* * uvmfault_anonget: get data in an anon into a non-busy, non-released * page in that anon. * * => Map, amap and thus anon should be locked by caller. * => If we fail, we unlock everything and error is returned. * => If we are successful, return with everything still locked. * => We do not move the page on the queues [gets moved later]. If we * allocate a new page [we_own], it gets put on the queues. Either way, * the result is that the page is on the queues at return time * => For pages which are on loan from a uvm_object (and thus are not owned * by the anon): if successful, return with the owning object locked. * The caller must unlock this object when it unlocks everything else. 
*/ int uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap, struct vm_anon *anon) { struct vm_page *pg; krw_t lock_type; int error __unused; /* used for VMSWAP */ UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(rw_lock_held(anon->an_lock)); KASSERT(anon->an_lock == amap->am_lock); /* Increment the counters.*/ cpu_count(CPU_COUNT_FLTANGET, 1); if (anon->an_page) { curlwp->l_ru.ru_minflt++; } else { curlwp->l_ru.ru_majflt++; } error = 0; /* * Loop until we get the anon data, or fail. */ for (;;) { bool we_own, locked; /* * Note: 'we_own' will become true if we set PG_BUSY on a page. */ we_own = false; pg = anon->an_page; /* * If there is a resident page and it is loaned, then anon * may not own it. Call out to uvm_anon_lockloanpg() to * identify and lock the real owner of the page. */ if (pg && pg->loan_count) pg = uvm_anon_lockloanpg(anon); /* * Is page resident? Make sure it is not busy/released. */ lock_type = rw_lock_op(anon->an_lock); if (pg) { /* * at this point, if the page has a uobject [meaning * we have it on loan], then that uobject is locked * by us! if the page is busy, we drop all the * locks (including uobject) and try again. */ if ((pg->flags & PG_BUSY) == 0) { UVMHIST_LOG(maphist, "<- OK",0,0,0,0); return 0; } cpu_count(CPU_COUNT_FLTPGWAIT, 1); /* * The last unlock must be an atomic unlock and wait * on the owner of page. */ if (pg->uobject) { /* Owner of page is UVM object. */ uvmfault_unlockall(ufi, amap, NULL); UVMHIST_LOG(maphist, " unlock+wait on uobj",0, 0,0,0); uvm_pagewait(pg, pg->uobject->vmobjlock, "anonget1"); } else { /* Owner of page is anon. */ uvmfault_unlockall(ufi, NULL, NULL); UVMHIST_LOG(maphist, " unlock+wait on anon",0, 0,0,0); uvm_pagewait(pg, anon->an_lock, "anonget2"); } } else { #if defined(VMSWAP) /* * No page, therefore allocate one. A write lock is * required for this. If the caller didn't supply * one, fail now and have them retry. */ if (lock_type == RW_READER) { return ENOLCK; } pg = uvm_pagealloc(NULL, ufi != NULL ? ufi->orig_rvaddr : 0, anon, ufi != NULL ? UVM_FLAG_COLORMATCH : 0); if (pg == NULL) { /* Out of memory. Wait a little. */ uvmfault_unlockall(ufi, amap, NULL); cpu_count(CPU_COUNT_FLTNORAM, 1); UVMHIST_LOG(maphist, " noram -- UVM_WAIT",0, 0,0,0); if (!uvm_reclaimable()) { return ENOMEM; } uvm_wait("flt_noram1"); } else { /* PG_BUSY bit is set. */ we_own = true; uvmfault_unlockall(ufi, amap, NULL); /* * Pass a PG_BUSY+PG_FAKE clean page into * the uvm_swap_get() function with all data * structures unlocked. Note that it is OK * to read an_swslot here, because we hold * PG_BUSY on the page. */ cpu_count(CPU_COUNT_PAGEINS, 1); error = uvm_swap_get(pg, anon->an_swslot, PGO_SYNCIO); /* * We clean up after the I/O below in the * 'we_own' case. */ } #else panic("%s: no page", __func__); #endif /* defined(VMSWAP) */ } /* * Re-lock the map and anon. */ locked = uvmfault_relock(ufi); if (locked || we_own) { rw_enter(anon->an_lock, lock_type); } /* * If we own the page (i.e. we set PG_BUSY), then we need * to clean up after the I/O. There are three cases to * consider: * * 1) Page was released during I/O: free anon and ReFault. * 2) I/O not OK. Free the page and cause the fault to fail. * 3) I/O OK! Activate the page and sync with the non-we_own * case (i.e. drop anon lock if not locked). */ if (we_own) { KASSERT(lock_type == RW_WRITER); #if defined(VMSWAP) if (error) { /* * Remove the swap slot from the anon and * mark the anon as having no real slot. 
* Do not free the swap slot, thus preventing * it from being used again. */ if (anon->an_swslot > 0) { uvm_swap_markbad(anon->an_swslot, 1); } anon->an_swslot = SWSLOT_BAD; if ((pg->flags & PG_RELEASED) != 0) { goto released; } /* * Note: page was never !PG_BUSY, so it * cannot be mapped and thus no need to * pmap_page_protect() it. */ uvm_pagefree(pg); if (locked) { uvmfault_unlockall(ufi, NULL, NULL); } rw_exit(anon->an_lock); UVMHIST_LOG(maphist, "<- ERROR", 0,0,0,0); return error; } if ((pg->flags & PG_RELEASED) != 0) { released: KASSERT(anon->an_ref == 0); /* * Released while we had unlocked amap. */ if (locked) { uvmfault_unlockall(ufi, NULL, NULL); } uvm_anon_release(anon); if (error) { UVMHIST_LOG(maphist, "<- ERROR/RELEASED", 0,0,0,0); return error; } UVMHIST_LOG(maphist, "<- RELEASED", 0,0,0,0); return ERESTART; } /* * We have successfully read the page, activate it. */ uvm_pagelock(pg); uvm_pageactivate(pg); uvm_pagewakeup(pg); uvm_pageunlock(pg); pg->flags &= ~(PG_BUSY|PG_FAKE); uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN); UVM_PAGE_OWN(pg, NULL); #else panic("%s: we_own", __func__); #endif /* defined(VMSWAP) */ } /* * We were not able to re-lock the map - restart the fault. */ if (!locked) { if (we_own) { rw_exit(anon->an_lock); } UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0); return ERESTART; } /* * Verify that no one has touched the amap and moved * the anon on us. */ if (ufi != NULL && amap_lookup(&ufi->entry->aref, ufi->orig_rvaddr - ufi->entry->start) != anon) { uvmfault_unlockall(ufi, amap, NULL); UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0); return ERESTART; } /* * Retry.. */ cpu_count(CPU_COUNT_FLTANRETRY, 1); continue; } /*NOTREACHED*/ } /* * uvmfault_promote: promote data to a new anon. used for 1B and 2B. * * 1. allocate an anon and a page. * 2. fill its contents. * 3. put it into amap. * * => if we fail (result != 0) we unlock everything. * => on success, return a new locked anon via 'nanon'. * (*nanon)->an_page will be a resident, locked, dirty page. * => it's caller's responsibility to put the promoted nanon->an_page to the * page queue. */ static int uvmfault_promote(struct uvm_faultinfo *ufi, struct vm_anon *oanon, struct vm_page *uobjpage, struct vm_anon **nanon, /* OUT: allocated anon */ struct vm_anon **spare) { struct vm_amap *amap = ufi->entry->aref.ar_amap; struct uvm_object *uobj; struct vm_anon *anon; struct vm_page *pg; struct vm_page *opg; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); if (oanon) { /* anon COW */ opg = oanon->an_page; KASSERT(opg != NULL); KASSERT(opg->uobject == NULL || opg->loan_count > 0); } else if (uobjpage != PGO_DONTCARE) { /* object-backed COW */ opg = uobjpage; KASSERT(rw_lock_held(opg->uobject->vmobjlock)); } else { /* ZFOD */ opg = NULL; } if (opg != NULL) { uobj = opg->uobject; } else { uobj = NULL; } KASSERT(amap != NULL); KASSERT(uobjpage != NULL); KASSERT(rw_write_held(amap->am_lock)); KASSERT(oanon == NULL || amap->am_lock == oanon->an_lock); KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock)); if (*spare != NULL) { anon = *spare; *spare = NULL; } else { anon = uvm_analloc(); } if (anon) { /* * The new anon is locked. * * if opg == NULL, we want a zero'd, dirty page, * so have uvm_pagealloc() do that for us. */ KASSERT(anon->an_lock == NULL); anon->an_lock = amap->am_lock; pg = uvm_pagealloc(NULL, ufi->orig_rvaddr, anon, UVM_FLAG_COLORMATCH | (opg == NULL ? UVM_PGA_ZERO : 0)); if (pg == NULL) { anon->an_lock = NULL; } } else { pg = NULL; } /* * out of memory resources? 
*/ if (pg == NULL) { /* save anon for the next try. */ if (anon != NULL) { *spare = anon; } /* unlock and fail ... */ uvmfault_unlockall(ufi, amap, uobj); if (!uvm_reclaimable()) { UVMHIST_LOG(maphist, "out of VM", 0,0,0,0); cpu_count(CPU_COUNT_FLTNOANON, 1); error = ENOMEM; goto done; } UVMHIST_LOG(maphist, "out of RAM, waiting for more", 0,0,0,0); cpu_count(CPU_COUNT_FLTNORAM, 1); uvm_wait("flt_noram5"); error = ERESTART; goto done; } /* * copy the page [pg now dirty] * * Remove the pmap entry now for the old page at this address * so that no thread can modify the new page while any thread * might still see the old page. */ if (opg) { pmap_remove(vm_map_pmap(ufi->orig_map), ufi->orig_rvaddr, ufi->orig_rvaddr + PAGE_SIZE); pmap_update(vm_map_pmap(ufi->orig_map)); uvm_pagecopy(opg, pg); } KASSERT(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_DIRTY); amap_add(&ufi->entry->aref, ufi->orig_rvaddr - ufi->entry->start, anon, oanon != NULL); /* * from this point on am_lock won't be dropped until the page is * entered, so it's safe to unbusy the page up front. * * uvm_fault_{upper,lower}_done will activate or enqueue the page. */ pg = anon->an_page; pg->flags &= ~(PG_BUSY|PG_FAKE); UVM_PAGE_OWN(pg, NULL); *nanon = anon; error = 0; done: return error; } /* * Update statistics after fault resolution. * - maxrss */ void uvmfault_update_stats(struct uvm_faultinfo *ufi) { struct vm_map *map; struct vmspace *vm; struct proc *p; vsize_t res; map = ufi->orig_map; p = curproc; KASSERT(p != NULL); vm = p->p_vmspace; if (&vm->vm_map != map) return; res = pmap_resident_count(map->pmap); if (vm->vm_rssmax < res) vm->vm_rssmax = res; } /* * F A U L T - m a i n e n t r y p o i n t */ /* * uvm_fault: page fault handler * * => called from MD code to resolve a page fault * => VM data structures usually should be unlocked. however, it is * possible to call here with the main map locked if the caller * gets a write lock, sets it recursive, and then calls us (c.f. * uvm_map_pageable). this should be avoided because it keeps * the map locked off during I/O. * => MUST NEVER BE CALLED IN INTERRUPT CONTEXT */ #define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \ ~VM_PROT_WRITE : VM_PROT_ALL) /* fault_flag values passed from uvm_fault_wire to uvm_fault_internal */ #define UVM_FAULT_WIRE (1 << 0) #define UVM_FAULT_MAXPROT (1 << 1) struct uvm_faultctx { /* * the following members are set up by uvm_fault_check() and * read-only after that. * * note that narrow is used by uvm_fault_check() to change * the behaviour after ERESTART. * * most of them might change after RESTART if the underlying * map entry has been changed behind us. an exception is * wire_paging, which does never change. */ vm_prot_t access_type; vaddr_t startva; int npages; int centeridx; bool narrow; /* work on a single requested page only */ bool wire_mapping; /* request a PMAP_WIRED mapping (UVM_FAULT_WIRE or VM_MAPENT_ISWIRED) */ bool wire_paging; /* request uvm_pagewire (true for UVM_FAULT_WIRE) */ bool cow_now; /* VM_PROT_WRITE is actually requested (ie. should break COW and page loaning) */ /* * enter_prot is set up by uvm_fault_check() and clamped * (ie. drop the VM_PROT_WRITE bit) in various places in case * of !cow_now. */ vm_prot_t enter_prot; /* prot at which we want to enter pages in */ /* * the following member is for uvmfault_promote() and ERESTART. */ struct vm_anon *anon_spare; /* * the following is actually a uvm_fault_lower() internal. * it's here merely for debugging. * (or due to the mechanical separation of the function?) 
*/ bool promote; /* * type of lock to acquire on objects in both layers. */ krw_t lower_lock_type; krw_t upper_lock_type; }; static inline int uvm_fault_check( struct uvm_faultinfo *, struct uvm_faultctx *, struct vm_anon ***, bool); static int uvm_fault_upper( struct uvm_faultinfo *, struct uvm_faultctx *, struct vm_anon **); static inline int uvm_fault_upper_lookup( struct uvm_faultinfo *, const struct uvm_faultctx *, struct vm_anon **, struct vm_page **); static inline void uvm_fault_upper_neighbor( struct uvm_faultinfo *, const struct uvm_faultctx *, vaddr_t, struct vm_page *, bool); static inline int uvm_fault_upper_loan( struct uvm_faultinfo *, struct uvm_faultctx *, struct vm_anon *, struct uvm_object **); static inline int uvm_fault_upper_promote( struct uvm_faultinfo *, struct uvm_faultctx *, struct uvm_object *, struct vm_anon *); static inline int uvm_fault_upper_direct( struct uvm_faultinfo *, struct uvm_faultctx *, struct uvm_object *, struct vm_anon *); static int uvm_fault_upper_enter( struct uvm_faultinfo *, const struct uvm_faultctx *, struct uvm_object *, struct vm_anon *, struct vm_page *, struct vm_anon *); static inline void uvm_fault_upper_done( struct uvm_faultinfo *, const struct uvm_faultctx *, struct vm_anon *, struct vm_page *); static int uvm_fault_lower( struct uvm_faultinfo *, struct uvm_faultctx *, struct vm_page **); static inline void uvm_fault_lower_lookup( struct uvm_faultinfo *, const struct uvm_faultctx *, struct vm_page **); static inline void uvm_fault_lower_neighbor( struct uvm_faultinfo *, const struct uvm_faultctx *, vaddr_t, struct vm_page *); static inline int uvm_fault_lower_io( struct uvm_faultinfo *, struct uvm_faultctx *, struct uvm_object **, struct vm_page **); static inline int uvm_fault_lower_direct( struct uvm_faultinfo *, struct uvm_faultctx *, struct uvm_object *, struct vm_page *); static inline int uvm_fault_lower_direct_loan( struct uvm_faultinfo *, struct uvm_faultctx *, struct uvm_object *, struct vm_page **, struct vm_page **); static inline int uvm_fault_lower_promote( struct uvm_faultinfo *, struct uvm_faultctx *, struct uvm_object *, struct vm_page *); static int uvm_fault_lower_enter( struct uvm_faultinfo *, const struct uvm_faultctx *, struct uvm_object *, struct vm_anon *, struct vm_page *); static inline void uvm_fault_lower_done( struct uvm_faultinfo *, const struct uvm_faultctx *, struct uvm_object *, struct vm_page *); int uvm_fault_internal(struct vm_map *orig_map, vaddr_t vaddr, vm_prot_t access_type, int fault_flag) { struct uvm_faultinfo ufi; struct uvm_faultctx flt = { .access_type = access_type, /* don't look for neighborhood * pages on "wire" fault */ .narrow = (fault_flag & UVM_FAULT_WIRE) != 0, /* "wire" fault causes wiring of both mapping and paging */ .wire_mapping = (fault_flag & UVM_FAULT_WIRE) != 0, .wire_paging = (fault_flag & UVM_FAULT_WIRE) != 0, /* * default lock type to acquire on upper & lower layer * objects: reader. this can be upgraded at any point * during the fault from read -> write and uvm_faultctx * changed to match, but is never downgraded write -> read. 
*/ #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */ .upper_lock_type = RW_WRITER, .lower_lock_type = RW_WRITER, #else .upper_lock_type = RW_READER, .lower_lock_type = RW_READER, #endif }; const bool maxprot = (fault_flag & UVM_FAULT_MAXPROT) != 0; struct vm_anon *anons_store[UVM_MAXRANGE], **anons; struct vm_page *pages_store[UVM_MAXRANGE], **pages; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, "(map=%#jx, vaddr=%#jx, at=%jd, ff=%jd)", (uintptr_t)orig_map, vaddr, access_type, fault_flag); /* Don't count anything until user interaction is possible */ kpreempt_disable(); if (__predict_true(start_init_exec)) { struct cpu_info *ci = curcpu(); CPU_COUNT(CPU_COUNT_NFAULT, 1); /* Don't flood RNG subsystem with samples. */ if (++(ci->ci_faultrng) == 503) { ci->ci_faultrng = 0; rnd_add_uint32(&uvm_fault_rndsource, sizeof(vaddr_t) == sizeof(uint32_t) ? (uint32_t)vaddr : sizeof(vaddr_t) == sizeof(uint64_t) ? (uint32_t)vaddr : (uint32_t)ci->ci_counts[CPU_COUNT_NFAULT]); } } kpreempt_enable(); /* * init the IN parameters in the ufi */ ufi.orig_map = orig_map; ufi.orig_rvaddr = trunc_page(vaddr); ufi.orig_size = PAGE_SIZE; /* can't get any smaller than this */ error = ERESTART; while (error == ERESTART) { /* ReFault: */ anons = anons_store; pages = pages_store; error = uvm_fault_check(&ufi, &flt, &anons, maxprot); if (error != 0) continue; error = uvm_fault_upper_lookup(&ufi, &flt, anons, pages); if (error != 0) continue; if (pages[flt.centeridx] == PGO_DONTCARE) error = uvm_fault_upper(&ufi, &flt, anons); else { struct uvm_object * const uobj = ufi.entry->object.uvm_obj; if (uobj && uobj->pgops->pgo_fault != NULL) { /* * invoke "special" fault routine. */ rw_enter(uobj->vmobjlock, RW_WRITER); /* locked: maps(read), amap(if there), uobj */ error = uobj->pgops->pgo_fault(&ufi, flt.startva, pages, flt.npages, flt.centeridx, flt.access_type, PGO_LOCKED|PGO_SYNCIO); /* * locked: nothing, pgo_fault has unlocked * everything */ /* * object fault routine responsible for * pmap_update(). */ /* * Wake up the pagedaemon if the fault method * failed for lack of memory but some can be * reclaimed. */ if (error == ENOMEM && uvm_reclaimable()) { uvm_wait("pgo_fault"); error = ERESTART; } } else { error = uvm_fault_lower(&ufi, &flt, pages); } } } if (flt.anon_spare != NULL) { flt.anon_spare->an_ref--; KASSERT(flt.anon_spare->an_ref == 0); KASSERT(flt.anon_spare->an_lock == NULL); uvm_anfree(flt.anon_spare); } return error; } /* * uvm_fault_check: check prot, handle needs-copy, etc. * * 1. lookup entry. * 2. check protection. * 3. adjust fault condition (mainly for simulated fault). * 4. handle needs-copy (lazy amap copy). * 5. establish range of interest for neighbor fault (aka pre-fault). * 6. look up anons (if amap exists). * 7. flush pages (if MADV_SEQUENTIAL) * * => called with nothing locked. * => if we fail (result != 0) we unlock everything. * => initialize/adjust many members of flt. 
*/ static int uvm_fault_check( struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct vm_anon ***ranons, bool maxprot) { struct vm_amap *amap; struct uvm_object *uobj; vm_prot_t check_prot; int nback, nforw; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* * lookup and lock the maps */ if (uvmfault_lookup(ufi, false) == false) { UVMHIST_LOG(maphist, "<- no mapping @ %#jx", ufi->orig_rvaddr, 0,0,0); return EFAULT; } /* locked: maps(read) */ #ifdef DIAGNOSTIC if ((ufi->map->flags & VM_MAP_PAGEABLE) == 0) { printf("Page fault on non-pageable map:\n"); printf("ufi->map = %p\n", ufi->map); printf("ufi->orig_map = %p\n", ufi->orig_map); printf("ufi->orig_rvaddr = %#lx\n", (u_long) ufi->orig_rvaddr); panic("uvm_fault: (ufi->map->flags & VM_MAP_PAGEABLE) == 0"); } #endif /* * check protection */ check_prot = maxprot ? ufi->entry->max_protection : ufi->entry->protection; if ((check_prot & flt->access_type) != flt->access_type) { UVMHIST_LOG(maphist, "<- protection failure (prot=%#jx, access=%#jx)", ufi->entry->protection, flt->access_type, 0, 0); uvmfault_unlockmaps(ufi, false); return EFAULT; } /* * "enter_prot" is the protection we want to enter the page in at. * for certain pages (e.g. copy-on-write pages) this protection can * be more strict than ufi->entry->protection. "wired" means either * the entry is wired or we are fault-wiring the pg. */ flt->enter_prot = ufi->entry->protection; if (VM_MAPENT_ISWIRED(ufi->entry)) { flt->wire_mapping = true; flt->wire_paging = true; flt->narrow = true; } if (flt->wire_mapping) { flt->access_type = flt->enter_prot; /* full access for wired */ flt->cow_now = (check_prot & VM_PROT_WRITE) != 0; } else { flt->cow_now = (flt->access_type & VM_PROT_WRITE) != 0; } if (flt->wire_paging) { /* wiring pages requires a write lock. */ flt->upper_lock_type = RW_WRITER; flt->lower_lock_type = RW_WRITER; } flt->promote = false; /* * handle "needs_copy" case. if we need to copy the amap we will * have to drop our readlock and relock it with a write lock. (we * need a write lock to change anything in a map entry [e.g. * needs_copy]). */ if (UVM_ET_ISNEEDSCOPY(ufi->entry)) { if (flt->cow_now || (ufi->entry->object.uvm_obj == NULL)) { KASSERT(!maxprot); /* need to clear */ UVMHIST_LOG(maphist, " need to clear needs_copy and refault",0,0,0,0); uvmfault_unlockmaps(ufi, false); uvmfault_amapcopy(ufi); cpu_count(CPU_COUNT_FLTAMCOPY, 1); return ERESTART; } else { /* * ensure that we pmap_enter page R/O since * needs_copy is still true */ flt->enter_prot &= ~VM_PROT_WRITE; } } /* * identify the players */ amap = ufi->entry->aref.ar_amap; /* upper layer */ uobj = ufi->entry->object.uvm_obj; /* lower layer */ /* * check for a case 0 fault. if nothing backing the entry then * error now. */ if (amap == NULL && uobj == NULL) { uvmfault_unlockmaps(ufi, false); UVMHIST_LOG(maphist,"<- no backing store, no overlay",0,0,0,0); return EFAULT; } /* * for a case 2B fault waste no time on adjacent pages because * they are likely already entered. */ if (uobj != NULL && amap != NULL && (flt->access_type & VM_PROT_WRITE) != 0) { /* wide fault (!narrow) */ flt->narrow = true; } /* * establish range of interest based on advice from mapper * and then clip to fit map entry. note that we only want * to do this the first time through the fault. if we * ReFault we will disable this by setting "narrow" to true. 
*/ if (flt->narrow == false) { /* wide fault (!narrow) */ KASSERT(uvmadvice[ufi->entry->advice].advice == ufi->entry->advice); nback = MIN(uvmadvice[ufi->entry->advice].nback, (ufi->orig_rvaddr - ufi->entry->start) >> PAGE_SHIFT); flt->startva = ufi->orig_rvaddr - (nback << PAGE_SHIFT); /* * note: "-1" because we don't want to count the * faulting page as forw */ nforw = MIN(uvmadvice[ufi->entry->advice].nforw, ((ufi->entry->end - ufi->orig_rvaddr) >> PAGE_SHIFT) - 1); flt->npages = nback + nforw + 1; flt->centeridx = nback; flt->narrow = true; /* ensure only once per-fault */ } else { /* narrow fault! */ nback = nforw = 0; flt->startva = ufi->orig_rvaddr; flt->npages = 1; flt->centeridx = 0; } /* offset from entry's start to pgs' start */ const voff_t eoff = flt->startva - ufi->entry->start; /* locked: maps(read) */ UVMHIST_LOG(maphist, " narrow=%jd, back=%jd, forw=%jd, startva=%#jx", flt->narrow, nback, nforw, flt->startva); UVMHIST_LOG(maphist, " entry=%#jx, amap=%#jx, obj=%#jx", (uintptr_t)ufi->entry, (uintptr_t)amap, (uintptr_t)uobj, 0); /* * guess at the most suitable lock types to acquire. * if we've got an amap then lock it and extract current anons. */ if (amap) { if ((amap_flags(amap) & AMAP_SHARED) == 0) { /* * the amap isn't shared. get a writer lock to * avoid the cost of upgrading the lock later if * needed. * * XXX nice for PostgreSQL, but consider threads. */ flt->upper_lock_type = RW_WRITER; } else if ((flt->access_type & VM_PROT_WRITE) != 0) { /* * assume we're about to COW. */ flt->upper_lock_type = RW_WRITER; } amap_lock(amap, flt->upper_lock_type); amap_lookups(&ufi->entry->aref, eoff, *ranons, flt->npages); } else { if ((flt->access_type & VM_PROT_WRITE) != 0) { /* * we are about to dirty the object and that * requires a write lock. */ flt->lower_lock_type = RW_WRITER; } *ranons = NULL; /* to be safe */ } /* locked: maps(read), amap(if there) */ KASSERT(amap == NULL || rw_lock_op(amap->am_lock) == flt->upper_lock_type); /* * for MADV_SEQUENTIAL mappings we want to deactivate the back pages * now and then forget about them (for the rest of the fault). */ if (ufi->entry->advice == MADV_SEQUENTIAL && nback != 0) { UVMHIST_LOG(maphist, " MADV_SEQUENTIAL: flushing backpages", 0,0,0,0); /* flush back-page anons? */ if (amap) uvmfault_anonflush(*ranons, nback); /* * flush object? change lock type to RW_WRITER, to avoid * excessive competition between read/write locks if many * threads doing "sequential access". */ if (uobj) { voff_t uoff; flt->lower_lock_type = RW_WRITER; uoff = ufi->entry->offset + eoff; rw_enter(uobj->vmobjlock, RW_WRITER); (void) (uobj->pgops->pgo_put)(uobj, uoff, uoff + (nback << PAGE_SHIFT), PGO_DEACTIVATE); } /* now forget about the backpages */ if (amap) *ranons += nback; flt->startva += (nback << PAGE_SHIFT); flt->npages -= nback; flt->centeridx = 0; } /* * => startva is fixed * => npages is fixed */ KASSERT(flt->startva <= ufi->orig_rvaddr); KASSERT(ufi->orig_rvaddr + ufi->orig_size <= flt->startva + (flt->npages << PAGE_SHIFT)); return 0; } /* * uvm_fault_upper_upgrade: upgrade upper lock, reader -> writer */ static inline int uvm_fault_upper_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct vm_amap *amap, struct uvm_object *uobj) { UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(amap != NULL); KASSERT(flt->upper_lock_type == rw_lock_op(amap->am_lock)); /* * fast path. */ if (__predict_true(flt->upper_lock_type == RW_WRITER)) { return 0; } /* * otherwise try for the upgrade. 
if we don't get it, unlock * everything, restart the fault and next time around get a writer * lock. */ flt->upper_lock_type = RW_WRITER; if (__predict_false(!rw_tryupgrade(amap->am_lock))) { uvmfault_unlockall(ufi, amap, uobj); cpu_count(CPU_COUNT_FLTNOUP, 1); UVMHIST_LOG(maphist, " !upgrade upper", 0, 0,0,0); return ERESTART; } cpu_count(CPU_COUNT_FLTUP, 1); KASSERT(flt->upper_lock_type == rw_lock_op(amap->am_lock)); return 0; } /* * uvm_fault_upper_lookup: look up existing h/w mapping and amap. * * iterate range of interest: * 1. check if h/w mapping exists. if yes, we don't care * 2. check if anon exists. if not, page is lower. * 3. if anon exists, enter h/w mapping for neighbors. * * => called with amap locked (if exists). */ static int uvm_fault_upper_lookup( struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt, struct vm_anon **anons, struct vm_page **pages) { struct vm_amap *amap = ufi->entry->aref.ar_amap; int lcv; vaddr_t currva; bool shadowed __unused; bool entered; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* locked: maps(read), amap(if there) */ KASSERT(amap == NULL || rw_lock_op(amap->am_lock) == flt->upper_lock_type); /* * map in the backpages and frontpages we found in the amap in hopes * of preventing future faults. we also init the pages[] array as * we go. */ currva = flt->startva; shadowed = false; entered = false; for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) { /* * unmapped or center page. check if any anon at this level. */ if (amap == NULL || anons[lcv] == NULL) { pages[lcv] = NULL; continue; } /* * check for present page and map if possible. */ pages[lcv] = PGO_DONTCARE; if (lcv == flt->centeridx) { /* save center for later! */ shadowed = true; continue; } struct vm_anon *anon = anons[lcv]; struct vm_page *pg = anon->an_page; KASSERT(anon->an_lock == amap->am_lock); /* * ignore loaned and busy pages. * don't play with VAs that are already mapped. */ if (pg && pg->loan_count == 0 && (pg->flags & PG_BUSY) == 0 && !pmap_extract(ufi->orig_map->pmap, currva, NULL)) { uvm_fault_upper_neighbor(ufi, flt, currva, pg, anon->an_ref > 1); entered = true; } } if (entered) { pmap_update(ufi->orig_map->pmap); } /* locked: maps(read), amap(if there) */ KASSERT(amap == NULL || rw_lock_op(amap->am_lock) == flt->upper_lock_type); /* (shadowed == true) if there is an anon at the faulting address */ UVMHIST_LOG(maphist, " shadowed=%jd, will_get=%jd", shadowed, (ufi->entry->object.uvm_obj && shadowed != false),0,0); return 0; } /* * uvm_fault_upper_neighbor: enter single upper neighbor page. * * => called with amap and anon locked. */ static void uvm_fault_upper_neighbor( struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt, vaddr_t currva, struct vm_page *pg, bool readonly) { UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* locked: amap, anon */ KASSERT(pg->uobject == NULL); KASSERT(pg->uanon != NULL); KASSERT(rw_lock_op(pg->uanon->an_lock) == flt->upper_lock_type); KASSERT(uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN); /* * there wasn't a direct fault on the page, so avoid the cost of * activating it. */ if (!uvmpdpol_pageisqueued_p(pg) && pg->wire_count == 0) { uvm_pagelock(pg); uvm_pageenqueue(pg); uvm_pageunlock(pg); } UVMHIST_LOG(maphist, " MAPPING: n anon: pm=%#jx, va=%#jx, pg=%#jx", (uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0); cpu_count(CPU_COUNT_FLTNAMAP, 1); /* * Since this page isn't the page that's actually faulting, * ignore pmap_enter() failures; it's not critical that we * enter these right now. 
*/ (void) pmap_enter(ufi->orig_map->pmap, currva, VM_PAGE_TO_PHYS(pg), readonly ? (flt->enter_prot & ~VM_PROT_WRITE) : flt->enter_prot, PMAP_CANFAIL | (flt->wire_mapping ? PMAP_WIRED : 0)); } /* * uvm_fault_upper: handle upper fault. * * 1. acquire anon lock. * 2. get anon. let uvmfault_anonget do the dirty work. * 3. handle loan. * 4. dispatch direct or promote handlers. */ static int uvm_fault_upper( struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct vm_anon **anons) { struct vm_amap * const amap = ufi->entry->aref.ar_amap; struct vm_anon * const anon = anons[flt->centeridx]; struct uvm_object *uobj; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* locked: maps(read), amap, anon */ KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type); KASSERT(anon->an_lock == amap->am_lock); /* * handle case 1: fault on an anon in our amap */ UVMHIST_LOG(maphist, " case 1 fault: anon=%#jx", (uintptr_t)anon, 0, 0, 0); /* * no matter if we have case 1A or case 1B we are going to need to * have the anon's memory resident. ensure that now. */ /* * let uvmfault_anonget do the dirty work. * if it fails (!OK) it will unlock everything for us. * if it succeeds, locks are still valid and locked. * also, if it is OK, then the anon's page is on the queues. * if the page is on loan from a uvm_object, then anonget will * lock that object for us if it does not fail. */ retry: error = uvmfault_anonget(ufi, amap, anon); switch (error) { case 0: break; case ERESTART: return ERESTART; case EAGAIN: kpause("fltagain1", false, hz/2, NULL); return ERESTART; case ENOLCK: /* it needs a write lock: retry */ error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL); if (error != 0) { return error; } KASSERT(rw_write_held(amap->am_lock)); goto retry; default: return error; } /* * uobj is non null if the page is on loan from an object (i.e. uobj) */ uobj = anon->an_page->uobject; /* locked by anonget if !NULL */ /* locked: maps(read), amap, anon, uobj(if one) */ KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type); KASSERT(anon->an_lock == amap->am_lock); KASSERT(uobj == NULL || rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type); /* * special handling for loaned pages */ if (anon->an_page->loan_count) { error = uvm_fault_upper_loan(ufi, flt, anon, &uobj); if (error != 0) return error; } /* * if we are case 1B then we will need to allocate a new blank * anon to transfer the data into. note that we have a lock * on anon, so no one can busy or release the page until we are done. * also note that the ref count can't drop to zero here because * it is > 1 and we are only dropping one ref. * * in the (hopefully very rare) case that we are out of RAM we * will unlock, wait for more RAM, and refault. * * if we are out of anon VM we kill the process (XXX: could wait?). */ if (flt->cow_now && anon->an_ref > 1) { flt->promote = true; error = uvm_fault_upper_promote(ufi, flt, uobj, anon); } else { error = uvm_fault_upper_direct(ufi, flt, uobj, anon); } return error; } /* * uvm_fault_upper_loan: handle loaned upper page. * * 1. if not cow'ing now, simply adjust flt->enter_prot. * 2. if cow'ing now, and if ref count is 1, break loan. */ static int uvm_fault_upper_loan( struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct vm_anon *anon, struct uvm_object **ruobj) { struct vm_amap * const amap = ufi->entry->aref.ar_amap; int error = 0; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); if (!flt->cow_now) { /* * for read faults on loaned pages we just cap the * protection at read-only. 
*/ flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE; } else { /* * note that we can't allow writes into a loaned page! * * if we have a write fault on a loaned page in an * anon then we need to look at the anon's ref count. * if it is greater than one then we are going to do * a normal copy-on-write fault into a new anon (this * is not a problem). however, if the reference count * is one (a case where we would normally allow a * write directly to the page) then we need to kill * the loan before we continue. */ /* >1 case is already ok */ if (anon->an_ref == 1) { /* breaking loan requires a write lock. */ error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL); if (error != 0) { return error; } KASSERT(rw_write_held(amap->am_lock)); error = uvm_loanbreak_anon(anon, *ruobj); if (error != 0) { uvmfault_unlockall(ufi, amap, *ruobj); uvm_wait("flt_noram2"); return ERESTART; } /* if we were a loan receiver uobj is gone */ if (*ruobj) *ruobj = NULL; } } return error; } /* * uvm_fault_upper_promote: promote upper page. * * 1. call uvmfault_promote. * 2. enqueue page. * 3. deref. * 4. pass page to uvm_fault_upper_enter. */ static int uvm_fault_upper_promote( struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct uvm_object *uobj, struct vm_anon *anon) { struct vm_amap * const amap = ufi->entry->aref.ar_amap; struct vm_anon * const oanon = anon; struct vm_page *pg; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); UVMHIST_LOG(maphist, " case 1B: COW fault",0,0,0,0); /* promoting requires a write lock. */ error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL); if (error != 0) { return error; } KASSERT(rw_write_held(amap->am_lock)); cpu_count(CPU_COUNT_FLT_ACOW, 1); error = uvmfault_promote(ufi, oanon, PGO_DONTCARE, &anon, &flt->anon_spare); switch (error) { case 0: break; case ERESTART: return ERESTART; default: return error; } pg = anon->an_page; KASSERT(anon->an_lock == oanon->an_lock); KASSERT((pg->flags & (PG_BUSY | PG_FAKE)) == 0); /* deref: can not drop to zero here by defn! */ KASSERT(oanon->an_ref > 1); oanon->an_ref--; /* * note: oanon is still locked, as is the new anon. we * need to check for this later when we unlock oanon; if * oanon != anon, we'll have to unlock anon, too. */ return uvm_fault_upper_enter(ufi, flt, uobj, anon, pg, oanon); } /* * uvm_fault_upper_direct: handle direct fault. */ static int uvm_fault_upper_direct( struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct uvm_object *uobj, struct vm_anon *anon) { struct vm_anon * const oanon = anon; struct vm_page *pg; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); cpu_count(CPU_COUNT_FLT_ANON, 1); pg = anon->an_page; if (anon->an_ref > 1) /* disallow writes to ref > 1 anons */ flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE; return uvm_fault_upper_enter(ufi, flt, uobj, anon, pg, oanon); } /* * uvm_fault_upper_enter: enter h/w mapping of upper page. 
*/ static int uvm_fault_upper_enter( struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt, struct uvm_object *uobj, struct vm_anon *anon, struct vm_page *pg, struct vm_anon *oanon) { struct pmap *pmap = ufi->orig_map->pmap; vaddr_t va = ufi->orig_rvaddr; struct vm_amap * const amap = ufi->entry->aref.ar_amap; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* locked: maps(read), amap, oanon, anon(if different from oanon) */ KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type); KASSERT(anon->an_lock == amap->am_lock); KASSERT(oanon->an_lock == amap->am_lock); KASSERT(uobj == NULL || rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type); KASSERT(uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN); /* * now map the page in. */ UVMHIST_LOG(maphist, " MAPPING: anon: pm=%#jx, va=%#jx, pg=%#jx, promote=%jd", (uintptr_t)pmap, va, (uintptr_t)pg, flt->promote); if (pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), flt->enter_prot, flt->access_type | PMAP_CANFAIL | (flt->wire_mapping ? PMAP_WIRED : 0)) != 0) { /* * If pmap_enter() fails, it must not leave behind an existing * pmap entry. In particular, a now-stale entry for a different * page would leave the pmap inconsistent with the vm_map. * This is not to imply that pmap_enter() should remove an * existing mapping in such a situation (since that could create * different problems, eg. if the existing mapping is wired), * but rather that the pmap should be designed such that it * never needs to fail when the new mapping is replacing an * existing mapping and the new page has no existing mappings. * * XXX This can't be asserted safely any more because many * LWPs and/or many processes could simultaneously fault on * the same VA and some might succeed. */ /* KASSERT(!pmap_extract(pmap, va, NULL)); */ /* * ensure that the page is queued in the case that * we just promoted. */ uvm_pagelock(pg); uvm_pageenqueue(pg); uvm_pageunlock(pg); /* * No need to undo what we did; we can simply think of * this as the pmap throwing away the mapping information. * * We do, however, have to go through the ReFault path, * as the map may change while we're asleep. */ uvmfault_unlockall(ufi, amap, uobj); if (!uvm_reclaimable()) { UVMHIST_LOG(maphist, "<- failed. out of VM",0,0,0,0); /* XXX instrumentation */ return ENOMEM; } /* XXX instrumentation */ uvm_wait("flt_pmfail1"); return ERESTART; } uvm_fault_upper_done(ufi, flt, anon, pg); /* * done case 1! finish up by unlocking everything and returning success */ pmap_update(pmap); uvmfault_unlockall(ufi, amap, uobj); return 0; } /* * uvm_fault_upper_done: queue upper center page. */ static void uvm_fault_upper_done( struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt, struct vm_anon *anon, struct vm_page *pg) { const bool wire_paging = flt->wire_paging; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* * ... update the page queues. */ if (wire_paging) { uvm_pagelock(pg); uvm_pagewire(pg); uvm_pageunlock(pg); /* * since the now-wired page cannot be paged out, * release its swap resources for others to use. * and since an anon with no swap cannot be clean, * mark it dirty now. */ uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); uvm_anon_dropswap(anon); } else if (uvmpdpol_pageactivate_p(pg)) { /* * avoid re-activating the page unless needed, * to avoid false sharing on multiprocessor. 
*/ uvm_pagelock(pg); uvm_pageactivate(pg); uvm_pageunlock(pg); } } /* * uvm_fault_lower_upgrade: upgrade lower lock, reader -> writer */ static inline int uvm_fault_lower_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct vm_amap *amap, struct uvm_object *uobj, struct vm_page *uobjpage) { UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(uobj != NULL); KASSERT(flt->lower_lock_type == rw_lock_op(uobj->vmobjlock)); /* * fast path. */ if (__predict_true(flt->lower_lock_type == RW_WRITER)) { return 0; } /* * otherwise try for the upgrade. if we don't get it, unlock * everything, restart the fault and next time around get a writer * lock. */ flt->lower_lock_type = RW_WRITER; if (__predict_false(!rw_tryupgrade(uobj->vmobjlock))) { uvmfault_unlockall(ufi, amap, uobj); cpu_count(CPU_COUNT_FLTNOUP, 1); UVMHIST_LOG(maphist, " !upgrade lower", 0, 0,0,0); return ERESTART; } cpu_count(CPU_COUNT_FLTUP, 1); KASSERT(flt->lower_lock_type == rw_lock_op(uobj->vmobjlock)); return 0; } /* * uvm_fault_lower: handle lower fault. * * 1. check uobj * 1.1. if null, ZFOD. * 1.2. if not null, look up unmapped neighbor pages. * 2. for center page, check if promote. * 2.1. ZFOD always needs promotion. * 2.2. other uobjs, when entry is marked COW (usually MAP_PRIVATE vnode). * 3. if uobj is not ZFOD and page is not found, do i/o. * 4. dispatch either direct / promote fault. */ static int uvm_fault_lower( struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct vm_page **pages) { struct vm_amap *amap __diagused = ufi->entry->aref.ar_amap; struct uvm_object *uobj = ufi->entry->object.uvm_obj; struct vm_page *uobjpage; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* * now, if the desired page is not shadowed by the amap and we have * a backing object that does not have a special fault routine, then * we ask (with pgo_get) the object for resident pages that we care * about and attempt to map them in. we do not let pgo_get block * (PGO_LOCKED). */ if (uobj == NULL) { /* zero fill; don't care neighbor pages */ uobjpage = NULL; } else { uvm_fault_lower_lookup(ufi, flt, pages); uobjpage = pages[flt->centeridx]; } /* * note that at this point we are done with any front or back pages. * we are now going to focus on the center page (i.e. the one we've * faulted on). if we have faulted on the upper (anon) layer * [i.e. case 1], then the anon we want is anons[centeridx] (we have * not touched it yet). if we have faulted on the bottom (uobj) * layer [i.e. case 2] and the page was both present and available, * then we've got a pointer to it as "uobjpage" and we've already * made it BUSY. */ /* * locked: * maps(read), amap(if there), uobj(if !null), uobjpage(if !null) */ KASSERT(amap == NULL || rw_lock_op(amap->am_lock) == flt->upper_lock_type); KASSERT(uobj == NULL || rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type); /* * note that uobjpage can not be PGO_DONTCARE at this point. we now * set uobjpage to PGO_DONTCARE if we are doing a zero fill. if we * have a backing object, check and see if we are going to promote * the data up to an anon during the fault. */ if (uobj == NULL) { uobjpage = PGO_DONTCARE; flt->promote = true; /* always need anon here */ } else { KASSERT(uobjpage != PGO_DONTCARE); flt->promote = flt->cow_now && UVM_ET_ISCOPYONWRITE(ufi->entry); } UVMHIST_LOG(maphist, " case 2 fault: promote=%jd, zfill=%jd", flt->promote, (uobj == NULL), 0,0); /* * if uobjpage is not null then we do not need to do I/O to get the * uobjpage. 
* * if uobjpage is null, then we need to unlock and ask the pager to * get the data for us. once we have the data, we need to reverify * the state the world. we are currently not holding any resources. */ if (uobjpage) { /* update rusage counters */ curlwp->l_ru.ru_minflt++; } else { error = uvm_fault_lower_io(ufi, flt, &uobj, &uobjpage); if (error != 0) return error; } /* * locked: * maps(read), amap(if !null), uobj(if !null), uobjpage(if uobj) */ KASSERT(amap == NULL || rw_lock_op(amap->am_lock) == flt->upper_lock_type); KASSERT(uobj == NULL || rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type); /* * notes: * - at this point uobjpage can not be NULL * - at this point uobjpage can not be PG_RELEASED (since we checked * for it above) * - at this point uobjpage could be waited on (handle later) * - uobjpage can be from a different object if tmpfs (vnode vs UAO) */ KASSERT(uobjpage != NULL); KASSERT(uobj == NULL || uobjpage->uobject->vmobjlock == uobj->vmobjlock); KASSERT(uobj == NULL || !UVM_OBJ_IS_CLEAN(uobjpage->uobject) || uvm_pagegetdirty(uobjpage) == UVM_PAGE_STATUS_CLEAN); if (!flt->promote) { error = uvm_fault_lower_direct(ufi, flt, uobj, uobjpage); } else { error = uvm_fault_lower_promote(ufi, flt, uobj, uobjpage); } return error; } /* * uvm_fault_lower_lookup: look up on-memory uobj pages. * * 1. get on-memory pages. * 2. if failed, give up (get only center page later). * 3. if succeeded, enter h/w mapping of neighbor pages. */ static void uvm_fault_lower_lookup( struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt, struct vm_page **pages) { struct uvm_object *uobj = ufi->entry->object.uvm_obj; int lcv, gotpages; vaddr_t currva; bool entered; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); rw_enter(uobj->vmobjlock, flt->lower_lock_type); /* * Locked: maps(read), amap(if there), uobj */ cpu_count(CPU_COUNT_FLTLGET, 1); gotpages = flt->npages; (void) uobj->pgops->pgo_get(uobj, ufi->entry->offset + flt->startva - ufi->entry->start, pages, &gotpages, flt->centeridx, flt->access_type & MASK(ufi->entry), ufi->entry->advice, PGO_LOCKED); KASSERT(rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type); /* * check for pages to map, if we got any */ if (gotpages == 0) { pages[flt->centeridx] = NULL; return; } entered = false; currva = flt->startva; for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) { struct vm_page *curpg; curpg = pages[lcv]; if (curpg == NULL || curpg == PGO_DONTCARE) { continue; } /* * in the case of tmpfs, the pages might be from a different * uvm_object. just make sure that they have the same lock. */ KASSERT(curpg->uobject->vmobjlock == uobj->vmobjlock); KASSERT((curpg->flags & PG_BUSY) == 0); /* * leave the centre page for later. don't screw with * existing mappings (needless & expensive). */ if (lcv == flt->centeridx) { UVMHIST_LOG(maphist, " got uobjpage (%#jx) " "with locked get", (uintptr_t)curpg, 0, 0, 0); } else if (!pmap_extract(ufi->orig_map->pmap, currva, NULL)) { uvm_fault_lower_neighbor(ufi, flt, currva, curpg); entered = true; } } if (entered) { pmap_update(ufi->orig_map->pmap); } } /* * uvm_fault_lower_neighbor: enter h/w mapping of lower neighbor page. 
*/ static void uvm_fault_lower_neighbor( struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt, vaddr_t currva, struct vm_page *pg) { const bool readonly = uvm_pagereadonly_p(pg) || pg->loan_count > 0; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* locked: maps(read), amap(if there), uobj */ /* * calling pgo_get with PGO_LOCKED returns us pages which * are neither busy nor released, so we don't need to check * for this. we can just directly enter the pages. * * there wasn't a direct fault on the page, so avoid the cost of * activating it. */ if (!uvmpdpol_pageisqueued_p(pg) && pg->wire_count == 0) { uvm_pagelock(pg); uvm_pageenqueue(pg); uvm_pageunlock(pg); } UVMHIST_LOG(maphist, " MAPPING: n obj: pm=%#jx, va=%#jx, pg=%#jx", (uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0); cpu_count(CPU_COUNT_FLTNOMAP, 1); /* * Since this page isn't the page that's actually faulting, * ignore pmap_enter() failures; it's not critical that we * enter these right now. * NOTE: page can't be waited on or PG_RELEASED because we've * held the lock the whole time we've had the handle. */ KASSERT((pg->flags & PG_PAGEOUT) == 0); KASSERT((pg->flags & PG_RELEASED) == 0); KASSERT(!UVM_OBJ_IS_CLEAN(pg->uobject) || uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN); KASSERT((pg->flags & PG_BUSY) == 0); KASSERT(rw_lock_op(pg->uobject->vmobjlock) == flt->lower_lock_type); const vm_prot_t mapprot = readonly ? (flt->enter_prot & ~VM_PROT_WRITE) : flt->enter_prot & MASK(ufi->entry); const u_int mapflags = PMAP_CANFAIL | (flt->wire_mapping ? (mapprot | PMAP_WIRED) : 0); (void) pmap_enter(ufi->orig_map->pmap, currva, VM_PAGE_TO_PHYS(pg), mapprot, mapflags); } /* * uvm_fault_lower_io: get lower page from backing store. * * 1. unlock everything, because i/o will block. * 2. call pgo_get. * 3. if failed, recover. * 4. if succeeded, relock everything and verify things. */ static int uvm_fault_lower_io( struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct uvm_object **ruobj, struct vm_page **ruobjpage) { struct vm_amap * const amap = ufi->entry->aref.ar_amap; struct uvm_object *uobj = *ruobj; struct vm_page *pg; bool locked; int gotpages; int error; voff_t uoff; vm_prot_t access_type; int advice; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* grab everything we need from the entry before we unlock */ uoff = (ufi->orig_rvaddr - ufi->entry->start) + ufi->entry->offset; access_type = flt->access_type & MASK(ufi->entry); advice = ufi->entry->advice; /* Locked: maps(read), amap(if there), uobj */ KASSERT(rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type); /* Upgrade to a write lock if needed. */ error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj, NULL); if (error != 0) { return error; } uvmfault_unlockall(ufi, amap, NULL); /* update rusage counters */ curlwp->l_ru.ru_majflt++; /* Locked: uobj(write) */ KASSERT(rw_write_held(uobj->vmobjlock)); cpu_count(CPU_COUNT_FLTGET, 1); gotpages = 1; pg = NULL; error = uobj->pgops->pgo_get(uobj, uoff, &pg, &gotpages, 0, access_type, advice, PGO_SYNCIO); /* locked: pg(if no error) */ /* * recover from I/O */ if (error) { if (error == EAGAIN) { UVMHIST_LOG(maphist, " pgo_get says TRY AGAIN!",0,0,0,0); kpause("fltagain2", false, hz/2, NULL); return ERESTART; } #if 0 KASSERT(error != ERESTART); #else /* XXXUEBS don't re-fault? */ if (error == ERESTART) error = EIO; #endif UVMHIST_LOG(maphist, "<- pgo_get failed (code %jd)", error, 0,0,0); return error; } /* * re-verify the state of the world by first trying to relock * the maps. always relock the object. 
*/ locked = uvmfault_relock(ufi); if (locked && amap) amap_lock(amap, flt->upper_lock_type); /* might be changed */ uobj = pg->uobject; rw_enter(uobj->vmobjlock, flt->lower_lock_type); KASSERT((pg->flags & PG_BUSY) != 0); KASSERT(flt->lower_lock_type == RW_WRITER); uvm_pagelock(pg); uvm_pageactivate(pg); uvm_pageunlock(pg); /* locked(locked): maps(read), amap(if !null), uobj, pg */ /* locked(!locked): uobj, pg */ /* * verify that the page has not be released and re-verify * that amap slot is still free. if there is a problem, * we unlock and clean up. */ if ((pg->flags & PG_RELEASED) != 0 || (locked && amap && amap_lookup(&ufi->entry->aref, ufi->orig_rvaddr - ufi->entry->start))) { if (locked) uvmfault_unlockall(ufi, amap, NULL); locked = false; } /* * unbusy/release the page. */ if ((pg->flags & PG_RELEASED) == 0) { pg->flags &= ~PG_BUSY; uvm_pagelock(pg); uvm_pagewakeup(pg); uvm_pageunlock(pg); UVM_PAGE_OWN(pg, NULL); } else { cpu_count(CPU_COUNT_FLTPGRELE, 1); uvm_pagefree(pg); } /* * didn't get the lock? retry. */ if (locked == false) { UVMHIST_LOG(maphist, " wasn't able to relock after fault: retry", 0,0,0,0); rw_exit(uobj->vmobjlock); return ERESTART; } /* * we have the data in pg. we are holding object lock (so the page * can't be released on us). */ /* locked: maps(read), amap(if !null), uobj */ *ruobj = uobj; *ruobjpage = pg; return 0; } /* * uvm_fault_lower_direct: fault lower center page * * 1. adjust flt->enter_prot. * 2. if page is loaned, resolve. */ int uvm_fault_lower_direct( struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct uvm_object *uobj, struct vm_page *uobjpage) { struct vm_page *pg; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* * we are not promoting. if the mapping is COW ensure that we * don't give more access than we should (e.g. when doing a read * fault on a COPYONWRITE mapping we want to map the COW page in * R/O even though the entry protection could be R/W). * * set "pg" to the page we want to map in (uobjpage, usually) */ cpu_count(CPU_COUNT_FLT_OBJ, 1); if (UVM_ET_ISCOPYONWRITE(ufi->entry) || UVM_OBJ_NEEDS_WRITEFAULT(uobjpage->uobject)) flt->enter_prot &= ~VM_PROT_WRITE; pg = uobjpage; /* map in the actual object */ KASSERT(uobjpage != PGO_DONTCARE); /* * we are faulting directly on the page. be careful * about writing to loaned pages... */ if (uobjpage->loan_count) { uvm_fault_lower_direct_loan(ufi, flt, uobj, &pg, &uobjpage); } KASSERT(pg == uobjpage); KASSERT((pg->flags & PG_BUSY) == 0); return uvm_fault_lower_enter(ufi, flt, uobj, NULL, pg); } /* * uvm_fault_lower_direct_loan: resolve loaned page. * * 1. if not cow'ing, adjust flt->enter_prot. * 2. if cow'ing, break loan. */ static int uvm_fault_lower_direct_loan( struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct uvm_object *uobj, struct vm_page **rpg, struct vm_page **ruobjpage) { struct vm_amap * const amap = ufi->entry->aref.ar_amap; struct vm_page *pg; struct vm_page *uobjpage = *ruobjpage; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); if (!flt->cow_now) { /* read fault: cap the protection at readonly */ /* cap! */ flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE; } else { /* * write fault: must break the loan here. to do this * we need a write lock on the object. 
*/ error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj, uobjpage); if (error != 0) { return error; } KASSERT(rw_write_held(uobj->vmobjlock)); pg = uvm_loanbreak(uobjpage); if (pg == NULL) { uvmfault_unlockall(ufi, amap, uobj); UVMHIST_LOG(maphist, " out of RAM breaking loan, waiting", 0,0,0,0); cpu_count(CPU_COUNT_FLTNORAM, 1); uvm_wait("flt_noram4"); return ERESTART; } *rpg = pg; *ruobjpage = pg; /* * drop ownership of page while still holding object lock, * which won't be dropped until the page is entered. */ uvm_pagelock(pg); uvm_pagewakeup(pg); uvm_pageunlock(pg); pg->flags &= ~PG_BUSY; UVM_PAGE_OWN(pg, NULL); } return 0; } /* * uvm_fault_lower_promote: promote lower page. * * 1. call uvmfault_promote. * 2. fill in data. * 3. if not ZFOD, dispose old page. */ int uvm_fault_lower_promote( struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, struct uvm_object *uobj, struct vm_page *uobjpage) { struct vm_amap * const amap = ufi->entry->aref.ar_amap; struct vm_anon *anon; struct vm_page *pg; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(amap != NULL); /* promoting requires a write lock. */ error = uvm_fault_upper_upgrade(ufi, flt, amap, uobj); if (error != 0) { return error; } KASSERT(rw_write_held(amap->am_lock)); KASSERT(uobj == NULL || rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type); /* * If we are going to promote the data to an anon we * allocate a blank anon here and plug it into our amap. */ error = uvmfault_promote(ufi, NULL, uobjpage, &anon, &flt->anon_spare); switch (error) { case 0: break; case ERESTART: return ERESTART; default: return error; } pg = anon->an_page; /* * Fill in the data. */ if (uobjpage != PGO_DONTCARE) { cpu_count(CPU_COUNT_FLT_PRCOPY, 1); /* * promote to shared amap? make sure all sharing * procs see it */ if ((amap_flags(amap) & AMAP_SHARED) != 0) { pmap_page_protect(uobjpage, VM_PROT_NONE); /* * XXX: PAGE MIGHT BE WIRED! */ } UVMHIST_LOG(maphist, " promote uobjpage %#jx to anon/page %#jx/%#jx", (uintptr_t)uobjpage, (uintptr_t)anon, (uintptr_t)pg, 0); } else { cpu_count(CPU_COUNT_FLT_PRZERO, 1); /* * Page is zero'd and marked dirty by * uvmfault_promote(). */ UVMHIST_LOG(maphist," zero fill anon/page %#jx/%#jx", (uintptr_t)anon, (uintptr_t)pg, 0, 0); } return uvm_fault_lower_enter(ufi, flt, uobj, anon, pg); } /* * uvm_fault_lower_enter: enter h/w mapping of lower page or anon page promoted * from the lower page. */ int uvm_fault_lower_enter( struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt, struct uvm_object *uobj, struct vm_anon *anon, struct vm_page *pg) { struct vm_amap * const amap = ufi->entry->aref.ar_amap; const bool readonly = uvm_pagereadonly_p(pg); int error; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* * Locked: * * maps(read), amap(if !null), uobj(if !null), * anon(if !null), pg(if anon), unlock_uobj(if !null) * * anon must be write locked (promotion). uobj can be either. * * Note: pg is either the uobjpage or the new page in the new anon. */ KASSERT(amap == NULL || rw_lock_op(amap->am_lock) == flt->upper_lock_type); KASSERT(uobj == NULL || rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type); KASSERT(anon == NULL || anon->an_lock == amap->am_lock); /* * note that pg can't be PG_RELEASED or PG_BUSY since we did * not drop the object lock since the last time we checked. */ KASSERT((pg->flags & PG_RELEASED) == 0); KASSERT((pg->flags & PG_BUSY) == 0); /* * all resources are present. we can now map it in and free our * resources. 
*/ UVMHIST_LOG(maphist, " MAPPING: case2: pm=%#jx, va=%#jx, pg=%#jx, promote=%jd", (uintptr_t)ufi->orig_map->pmap, ufi->orig_rvaddr, (uintptr_t)pg, flt->promote); KASSERTMSG((flt->access_type & VM_PROT_WRITE) == 0 || !readonly, "promote=%u cow_now=%u access_type=%x enter_prot=%x cow=%u " "entry=%p map=%p orig_rvaddr=%p pg=%p", flt->promote, flt->cow_now, flt->access_type, flt->enter_prot, UVM_ET_ISCOPYONWRITE(ufi->entry), ufi->entry, ufi->orig_map, (void *)ufi->orig_rvaddr, pg); KASSERT((flt->access_type & VM_PROT_WRITE) == 0 || !readonly); if (pmap_enter(ufi->orig_map->pmap, ufi->orig_rvaddr, VM_PAGE_TO_PHYS(pg), readonly ? flt->enter_prot & ~VM_PROT_WRITE : flt->enter_prot, flt->access_type | PMAP_CANFAIL | (flt->wire_mapping ? PMAP_WIRED : 0)) != 0) { /* * No need to undo what we did; we can simply think of * this as the pmap throwing away the mapping information. * * We do, however, have to go through the ReFault path, * as the map may change while we're asleep. */ /* * ensure that the page is queued in the case that * we just promoted the page. */ if (anon != NULL) { uvm_pagelock(pg); uvm_pageenqueue(pg); uvm_pagewakeup(pg); uvm_pageunlock(pg); } uvmfault_unlockall(ufi, amap, uobj); if (!uvm_reclaimable()) { UVMHIST_LOG(maphist, "<- failed. out of VM",0,0,0,0); /* XXX instrumentation */ error = ENOMEM; return error; } /* XXX instrumentation */ uvm_wait("flt_pmfail2"); return ERESTART; } uvm_fault_lower_done(ufi, flt, uobj, pg); pmap_update(ufi->orig_map->pmap); uvmfault_unlockall(ufi, amap, uobj); UVMHIST_LOG(maphist, "<- done (SUCCESS!)",0,0,0,0); return 0; } /* * uvm_fault_lower_done: queue lower center page. */ void uvm_fault_lower_done( struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt, struct uvm_object *uobj, struct vm_page *pg) { UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); if (flt->wire_paging) { uvm_pagelock(pg); uvm_pagewire(pg); uvm_pageunlock(pg); if (pg->flags & PG_AOBJ) { /* * since the now-wired page cannot be paged out, * release its swap resources for others to use. * since an aobj page with no swap cannot be clean, * mark it dirty now. * * use pg->uobject here. if the page is from a * tmpfs vnode, the pages are backed by its UAO and * not the vnode. */ KASSERT(uobj != NULL); KASSERT(uobj->vmobjlock == pg->uobject->vmobjlock); uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); uao_dropswap(pg->uobject, pg->offset >> PAGE_SHIFT); } } else if (uvmpdpol_pageactivate_p(pg)) { /* * avoid re-activating the page unless needed, * to avoid false sharing on multiprocessor. */ uvm_pagelock(pg); uvm_pageactivate(pg); uvm_pageunlock(pg); } } /* * uvm_fault_wire: wire down a range of virtual addresses in a map. * * => map may be read-locked by caller, but MUST NOT be write-locked. * => if map is read-locked, any operations which may cause map to * be write-locked in uvm_fault() must be taken care of by * the caller. See uvm_map_pageable(). */ int uvm_fault_wire(struct vm_map *map, vaddr_t start, vaddr_t end, vm_prot_t access_type, int maxprot) { vaddr_t va; int error; /* * now fault it in a page at a time. if the fault fails then we have * to undo what we have done. note that in uvm_fault VM_PROT_NONE * is replaced with the max protection if fault_type is VM_FAULT_WIRE. */ /* * XXX work around overflowing a vaddr_t. this prevents us from * wiring the last page in the address space, though. */ if (start > end) { return EFAULT; } for (va = start; va < end; va += PAGE_SIZE) { error = uvm_fault_internal(map, va, access_type, (maxprot ? 
UVM_FAULT_MAXPROT : 0) | UVM_FAULT_WIRE); if (error) { if (va != start) { uvm_fault_unwire(map, start, va); } return error; } } return 0; } /* * uvm_fault_unwire(): unwire range of virtual space. */ void uvm_fault_unwire(struct vm_map *map, vaddr_t start, vaddr_t end) { vm_map_lock_read(map); uvm_fault_unwire_locked(map, start, end); vm_map_unlock_read(map); } /* * uvm_fault_unwire_locked(): the guts of uvm_fault_unwire(). * * => map must be at least read-locked. */ void uvm_fault_unwire_locked(struct vm_map *map, vaddr_t start, vaddr_t end) { struct vm_map_entry *entry, *oentry; pmap_t pmap = vm_map_pmap(map); vaddr_t va; paddr_t pa; struct vm_page *pg; /* * we assume that the area we are unwiring has actually been wired * in the first place. this means that we should be able to extract * the PAs from the pmap. we also lock out the page daemon so that * we can call uvm_pageunwire. */ /* * find the beginning map entry for the region. */ KASSERT(start >= vm_map_min(map)); KASSERT(end <= vm_map_max(map)); if (uvm_map_lookup_entry(map, start, &entry) == false) panic("uvm_fault_unwire_locked: address not in map"); oentry = NULL; for (va = start; va < end; va += PAGE_SIZE) { /* * find the map entry for the current address. */ KASSERT(va >= entry->start); while (va >= entry->end) { KASSERT(entry->next != &map->header); KASSERT(entry->next->start <= entry->end); entry = entry->next; } /* * lock it. */ if (entry != oentry) { if (oentry != NULL) { uvm_map_unlock_entry(oentry); } uvm_map_lock_entry(entry, RW_WRITER); oentry = entry; } /* * if the entry is no longer wired, tell the pmap. */ if (!pmap_extract(pmap, va, &pa)) continue; if (VM_MAPENT_ISWIRED(entry) == 0) pmap_unwire(pmap, va); pg = PHYS_TO_VM_PAGE(pa); if (pg) { uvm_pagelock(pg); uvm_pageunwire(pg); uvm_pageunlock(pg); } } if (oentry != NULL) { uvm_map_unlock_entry(entry); } }
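/*
 * Illustrative sketch, not part of the original uvm_fault.c: a minimal
 * example of how a caller might use the uvm_fault_wire()/uvm_fault_unwire()
 * interfaces defined above to wire down a page-aligned range and release
 * it again.  The access type chosen here (VM_PROT_READ|VM_PROT_WRITE) and
 * the helper name are assumptions made for illustration only; real callers
 * (e.g. uvm_map_pageable()) must also observe the map-locking rules noted
 * in the comment above uvm_fault_wire().
 */
#if 0	/* example only */
static int
example_wire_range(struct vm_map *map, vaddr_t start, vaddr_t end)
{
	int error;

	/* Fault in and wire every page in [start, end). */
	error = uvm_fault_wire(map, start, end,
	    VM_PROT_READ | VM_PROT_WRITE, 0 /* don't force max protection */);
	if (error) {
		/* uvm_fault_wire() already unwired any partially wired pages. */
		return error;
	}

	/* ... the range is now resident and wired; use it ... */

	/* Drop the wiring; this read-locks the map internally. */
	uvm_fault_unwire(map, start, end);
	return 0;
}
#endif	/* example only */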
/* $NetBSD: igmp.c,v 1.70 2020/05/15 06:34:34 maxv Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Internet Group Management Protocol (IGMP) routines. * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * * MULTICAST Revision: 1.3 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: igmp.c,v 1.70 2020/05/15 06:34:34 maxv Exp $"); #ifdef _KERNEL_OPT #include "opt_mrouting.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <sys/cprng.h> #include <sys/sysctl.h> #include <net/if.h> #include <net/net_stats.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/igmp.h> #include <netinet/igmp_var.h> /* * Per-interface router version information. */ typedef struct router_info { LIST_ENTRY(router_info) rti_link; ifnet_t * rti_ifp; int rti_type; /* type of router on this interface */ int rti_age; /* time since last v1 query */ } router_info_t; /* * The router-info list and the timer flag are protected by in_multilock. * * Lock order: * * softnet_lock -> * in_multilock */ static struct pool igmp_rti_pool __cacheline_aligned; static LIST_HEAD(, router_info) rti_head __cacheline_aligned; static int igmp_timers_on __cacheline_aligned; static percpu_t * igmpstat_percpu __read_mostly; #define IGMP_STATINC(x) _NET_STATINC(igmpstat_percpu, x) static void igmp_sendpkt(struct in_multi *, int); static int rti_fill(struct in_multi *); static router_info_t * rti_find(struct ifnet *); static void rti_delete(struct ifnet *); static void sysctl_net_inet_igmp_setup(struct sysctllog **); /* * rti_fill: associate router information with the given multicast group; * if there is no router information for the interface, then create it. */ static int rti_fill(struct in_multi *inm) { router_info_t *rti; KASSERT(in_multi_lock_held()); LIST_FOREACH(rti, &rti_head, rti_link) { if (rti->rti_ifp == inm->inm_ifp) { inm->inm_rti = rti; return rti->rti_type == IGMP_v1_ROUTER ? IGMP_v1_HOST_MEMBERSHIP_REPORT : IGMP_v2_HOST_MEMBERSHIP_REPORT; } } rti = pool_get(&igmp_rti_pool, PR_NOWAIT); if (rti == NULL) { return 0; } rti->rti_ifp = inm->inm_ifp; rti->rti_type = IGMP_v2_ROUTER; LIST_INSERT_HEAD(&rti_head, rti, rti_link); inm->inm_rti = rti; return IGMP_v2_HOST_MEMBERSHIP_REPORT; } /* * rti_find: lookup or create router information for the given interface. 
*/ static router_info_t * rti_find(ifnet_t *ifp) { router_info_t *rti; KASSERT(in_multi_lock_held()); LIST_FOREACH(rti, &rti_head, rti_link) { if (rti->rti_ifp == ifp) return rti; } rti = pool_get(&igmp_rti_pool, PR_NOWAIT); if (rti == NULL) { return NULL; } rti->rti_ifp = ifp; rti->rti_type = IGMP_v2_ROUTER; LIST_INSERT_HEAD(&rti_head, rti, rti_link); return rti; } /* * rti_delete: remove and free the router information entry for the * given interface. */ static void rti_delete(ifnet_t *ifp) { router_info_t *rti; KASSERT(in_multi_lock_held()); LIST_FOREACH(rti, &rti_head, rti_link) { if (rti->rti_ifp == ifp) { LIST_REMOVE(rti, rti_link); pool_put(&igmp_rti_pool, rti); break; } } } void igmp_init(void) { pool_init(&igmp_rti_pool, sizeof(router_info_t), 0, 0, 0, "igmppl", NULL, IPL_SOFTNET); igmpstat_percpu = percpu_alloc(sizeof(uint64_t) * IGMP_NSTATS); sysctl_net_inet_igmp_setup(NULL); LIST_INIT(&rti_head); } void igmp_input(struct mbuf *m, int off, int proto) { ifnet_t *ifp; struct ip *ip = mtod(m, struct ip *); struct igmp *igmp; u_int minlen, timer; struct in_multi *inm; struct in_ifaddr *ia; int ip_len, iphlen; struct psref psref; iphlen = off; IGMP_STATINC(IGMP_STAT_RCV_TOTAL); /* * Validate lengths */ minlen = iphlen + IGMP_MINLEN; ip_len = ntohs(ip->ip_len); if (ip_len < minlen) { IGMP_STATINC(IGMP_STAT_RCV_TOOSHORT); m_freem(m); return; } if (((m->m_flags & M_EXT) && (ip->ip_src.s_addr & IN_CLASSA_NET) == 0) || m->m_len < minlen) { if ((m = m_pullup(m, minlen)) == NULL) { IGMP_STATINC(IGMP_STAT_RCV_TOOSHORT); return; } ip = mtod(m, struct ip *); } /* * Validate checksum */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); /* No need to assert alignment here. */ if (in_cksum(m, ip_len - iphlen)) { IGMP_STATINC(IGMP_STAT_RCV_BADSUM); m_freem(m); return; } m->m_data -= iphlen; m->m_len += iphlen; ifp = m_get_rcvif_psref(m, &psref); if (__predict_false(ifp == NULL)) goto drop; switch (igmp->igmp_type) { case IGMP_HOST_MEMBERSHIP_QUERY: IGMP_STATINC(IGMP_STAT_RCV_QUERIES); if (ifp->if_flags & IFF_LOOPBACK) break; if (igmp->igmp_code == 0) { struct in_multistep step; router_info_t *rti; if (ip->ip_dst.s_addr != INADDR_ALLHOSTS_GROUP) { IGMP_STATINC(IGMP_STAT_RCV_BADQUERIES); goto drop; } in_multi_lock(RW_WRITER); rti = rti_find(ifp); if (rti == NULL) { in_multi_unlock(); break; } rti->rti_type = IGMP_v1_ROUTER; rti->rti_age = 0; /* * Start the timers in all of our membership records * for the interface on which the query arrived, * except those that are already running and those * that belong to a "local" group (224.0.0.X). */ inm = in_first_multi(&step); while (inm != NULL) { if (inm->inm_ifp == ifp && inm->inm_timer == 0 && !IN_LOCAL_GROUP(inm->inm_addr.s_addr)) { inm->inm_state = IGMP_DELAYING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ); igmp_timers_on = true; } inm = in_next_multi(&step); } in_multi_unlock(); } else { struct in_multistep step; if (!IN_MULTICAST(ip->ip_dst.s_addr)) { IGMP_STATINC(IGMP_STAT_RCV_BADQUERIES); goto drop; } timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; /* * Start the timers in all of our membership records * for the interface on which the query arrived, * except those that are already running and those * that belong to a "local" group (224.0.0.X). For * timers already running, check if they need to be * reset. 
*/ in_multi_lock(RW_WRITER); inm = in_first_multi(&step); while (inm != NULL) { if (inm->inm_ifp == ifp && !IN_LOCAL_GROUP(inm->inm_addr.s_addr) && (ip->ip_dst.s_addr == INADDR_ALLHOSTS_GROUP || in_hosteq(ip->ip_dst, inm->inm_addr))) { switch (inm->inm_state) { case IGMP_DELAYING_MEMBER: if (inm->inm_timer <= timer) break; /* FALLTHROUGH */ case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_DELAYING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); igmp_timers_on = true; break; case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_AWAKENING_MEMBER; break; } } inm = in_next_multi(&step); } in_multi_unlock(); } break; case IGMP_v1_HOST_MEMBERSHIP_REPORT: IGMP_STATINC(IGMP_STAT_RCV_REPORTS); if (ifp->if_flags & IFF_LOOPBACK) break; if (!IN_MULTICAST(igmp->igmp_group.s_addr) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMP_STATINC(IGMP_STAT_RCV_BADREPORTS); goto drop; } /* * KLUDGE: if the IP source address of the report has an * unspecified (i.e., zero) subnet number, as is allowed for * a booting host, replace it with the correct subnet number * so that a process-level multicast routing daemon can * determine which subnet it arrived from. This is necessary * to compensate for the lack of any way for a process to * determine the arrival interface of an incoming packet. */ if ((ip->ip_src.s_addr & IN_CLASSA_NET) == 0) { int s = pserialize_read_enter(); ia = in_get_ia_from_ifp(ifp); /* XXX */ if (ia) ip->ip_src.s_addr = ia->ia_subnet; pserialize_read_exit(s); } /* * If we belong to the group being reported, stop * our timer for that group. */ in_multi_lock(RW_WRITER); inm = in_lookup_multi(igmp->igmp_group, ifp); if (inm != NULL) { inm->inm_timer = 0; IGMP_STATINC(IGMP_STAT_RCV_OURREPORTS); switch (inm->inm_state) { case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_DELAYING_MEMBER: if (inm->inm_rti->rti_type == IGMP_v1_ROUTER) inm->inm_state = IGMP_LAZY_MEMBER; else inm->inm_state = IGMP_SLEEPING_MEMBER; break; } } in_multi_unlock(); break; case IGMP_v2_HOST_MEMBERSHIP_REPORT: { int s = pserialize_read_enter(); #ifdef MROUTING /* * Make sure we don't hear our own membership report. Fast * leave requires knowing that we are the only member of a * group. */ ia = in_get_ia_from_ifp(ifp); /* XXX */ if (ia && in_hosteq(ip->ip_src, ia->ia_addr.sin_addr)) { pserialize_read_exit(s); break; } #endif IGMP_STATINC(IGMP_STAT_RCV_REPORTS); if (ifp->if_flags & IFF_LOOPBACK) { pserialize_read_exit(s); break; } if (!IN_MULTICAST(igmp->igmp_group.s_addr) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMP_STATINC(IGMP_STAT_RCV_BADREPORTS); pserialize_read_exit(s); goto drop; } /* * KLUDGE: if the IP source address of the report has an * unspecified (i.e., zero) subnet number, as is allowed for * a booting host, replace it with the correct subnet number * so that a process-level multicast routing daemon can * determine which subnet it arrived from. This is necessary * to compensate for the lack of any way for a process to * determine the arrival interface of an incoming packet. */ if ((ip->ip_src.s_addr & IN_CLASSA_NET) == 0) { #ifndef MROUTING ia = in_get_ia_from_ifp(ifp); /* XXX */ #endif if (ia) ip->ip_src.s_addr = ia->ia_subnet; } pserialize_read_exit(s); /* * If we belong to the group being reported, stop * our timer for that group. 
*/ in_multi_lock(RW_WRITER); inm = in_lookup_multi(igmp->igmp_group, ifp); if (inm != NULL) { inm->inm_timer = 0; IGMP_STATINC(IGMP_STAT_RCV_OURREPORTS); switch (inm->inm_state) { case IGMP_DELAYING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_LAZY_MEMBER; break; case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: break; } } in_multi_unlock(); break; } } m_put_rcvif_psref(ifp, &psref); /* * Pass all valid IGMP packets up to any process(es) listening * on a raw IGMP socket. */ /* * Currently, igmp_input() is always called holding softnet_lock * by ipintr()(!NET_MPSAFE) or PR_INPUT_WRAP()(NET_MPSAFE). */ KASSERT(mutex_owned(softnet_lock)); rip_input(m, iphlen, proto); return; drop: m_put_rcvif_psref(ifp, &psref); m_freem(m); return; } int igmp_joingroup(struct in_multi *inm) { KASSERT(in_multi_lock_held()); inm->inm_state = IGMP_IDLE_MEMBER; if (!IN_LOCAL_GROUP(inm->inm_addr.s_addr) && (inm->inm_ifp->if_flags & IFF_LOOPBACK) == 0) { int report_type; report_type = rti_fill(inm); if (report_type == 0) { return ENOMEM; } igmp_sendpkt(inm, report_type); inm->inm_state = IGMP_DELAYING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ); igmp_timers_on = true; } else inm->inm_timer = 0; return 0; } void igmp_leavegroup(struct in_multi *inm) { KASSERT(in_multi_lock_held()); switch (inm->inm_state) { case IGMP_DELAYING_MEMBER: case IGMP_IDLE_MEMBER: if (!IN_LOCAL_GROUP(inm->inm_addr.s_addr) && (inm->inm_ifp->if_flags & IFF_LOOPBACK) == 0) if (inm->inm_rti->rti_type != IGMP_v1_ROUTER) igmp_sendpkt(inm, IGMP_HOST_LEAVE_MESSAGE); break; case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_SLEEPING_MEMBER: break; } } void igmp_fasttimo(void) { struct in_multi *inm; struct in_multistep step; /* * Quick check to see if any work needs to be done, in order * to minimize the overhead of fasttimo processing. */ if (!igmp_timers_on) { return; } /* XXX: Needed for ip_output(). */ SOFTNET_LOCK_UNLESS_NET_MPSAFE(); in_multi_lock(RW_WRITER); igmp_timers_on = false; inm = in_first_multi(&step); while (inm != NULL) { if (inm->inm_timer == 0) { /* do nothing */ } else if (--inm->inm_timer == 0) { if (inm->inm_state == IGMP_DELAYING_MEMBER) { if (inm->inm_rti->rti_type == IGMP_v1_ROUTER) igmp_sendpkt(inm, IGMP_v1_HOST_MEMBERSHIP_REPORT); else igmp_sendpkt(inm, IGMP_v2_HOST_MEMBERSHIP_REPORT); inm->inm_state = IGMP_IDLE_MEMBER; } } else { igmp_timers_on = true; } inm = in_next_multi(&step); } in_multi_unlock(); SOFTNET_UNLOCK_UNLESS_NET_MPSAFE(); } void igmp_slowtimo(void) { router_info_t *rti; in_multi_lock(RW_WRITER); LIST_FOREACH(rti, &rti_head, rti_link) { if (rti->rti_type == IGMP_v1_ROUTER && ++rti->rti_age >= IGMP_AGE_THRESHOLD) { rti->rti_type = IGMP_v2_ROUTER; } } in_multi_unlock(); } /* * igmp_sendpkt: construct an IGMP packet, given the multicast structure * and the type, and send the datagram. 
*/ static void igmp_sendpkt(struct in_multi *inm, int type) { struct mbuf *m; struct igmp *igmp; struct ip *ip; struct ip_moptions imo; KASSERT(in_multi_lock_held()); MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) return; KASSERT(max_linkhdr + sizeof(struct ip) + IGMP_MINLEN <= MHLEN); m->m_data += max_linkhdr; m->m_len = sizeof(struct ip) + IGMP_MINLEN; m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN; ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + IGMP_MINLEN); ip->ip_off = htons(0); ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; ip->ip_p = IPPROTO_IGMP; ip->ip_src = zeroin_addr; ip->ip_dst = inm->inm_addr; m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); imo.imo_multicast_if_index = if_get_index(inm->inm_ifp); imo.imo_multicast_ttl = 1; /* * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing demon can hear it. */ #ifdef MROUTING extern struct socket *ip_mrouter; imo.imo_multicast_loop = (ip_mrouter != NULL); #else imo.imo_multicast_loop = 0; #endif /* * Note: IP_IGMP_MCAST indicates that in_multilock is held. * The caller must still acquire softnet_lock for ip_output(). */ #ifndef NET_MPSAFE KASSERT(mutex_owned(softnet_lock)); #endif ip_output(m, NULL, NULL, IP_IGMP_MCAST, &imo, NULL); IGMP_STATINC(IGMP_STAT_SND_REPORTS); } void igmp_purgeif(ifnet_t *ifp) { in_multi_lock(RW_WRITER); rti_delete(ifp); in_multi_unlock(); } static int sysctl_net_inet_igmp_stats(SYSCTLFN_ARGS) { return NETSTAT_SYSCTL(igmpstat_percpu, IGMP_NSTATS); } static void sysctl_net_inet_igmp_setup(struct sysctllog **clog) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "igmp", SYSCTL_DESCR("Internet Group Management Protocol"), NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IGMP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("IGMP statistics"), sysctl_net_inet_igmp_stats, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IGMP, CTL_CREATE, CTL_EOL); }
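/*
 * Illustrative note, not part of the original igmp.c: a worked example of
 * the query-timer conversion done in igmp_input() above.  A v2 query's
 * igmp_code is expressed in tenths of a second, while membership timers
 * are counted in igmp_fasttimo() ticks, hence
 *
 *	timer = igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
 *
 * Assuming the usual definitions PR_FASTHZ == 5 (fast timeouts per second)
 * and IGMP_TIMER_SCALE == 10 (tenths of a second), a Max Response Time of
 * 100 (i.e. 10 seconds) becomes 100 * 5 / 10 = 50 ticks, and
 * IGMP_RANDOM_DELAY(timer) then picks a pseudo-random delay of at most that
 * many ticks so that group members do not all report at the same instant.
 */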
/* $NetBSD: bus_dma.c,v 1.90 2023/03/28 19:55:42 riastradh Exp $ */ /*- * Copyright (c) 1996, 1997, 1998, 2007, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace * Simulation Facility NASA Ames Research Center, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: bus_dma.c,v 1.90 2023/03/28 19:55:42 riastradh Exp $"); /* * The following is included because _bus_dma_uiomove is derived from * uiomove() in kern_subr.c. */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "ioapic.h" #include "isa.h" #include "opt_mpbios.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/proc.h> #include <sys/asan.h> #include <sys/msan.h> #include <sys/bus.h> #include <machine/bus_private.h> #if NIOAPIC > 0 #include <machine/i82093var.h> #endif #ifdef MPBIOS #include <machine/mpbiosvar.h> #endif #include <machine/pmap_private.h> #if NISA > 0 #include <dev/isa/isareg.h> #include <dev/isa/isavar.h> #endif #include <uvm/uvm.h> extern paddr_t avail_end; #define IDTVEC(name) __CONCAT(X,name) typedef void (vector)(void); extern vector *IDTVEC(intr)[]; #define BUSDMA_BOUNCESTATS #ifdef BUSDMA_BOUNCESTATS #define BUSDMA_EVCNT_DECL(name) \ static struct evcnt bus_dma_ev_##name = \ EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "bus_dma", #name); \ EVCNT_ATTACH_STATIC(bus_dma_ev_##name) #define STAT_INCR(name) \ bus_dma_ev_##name.ev_count++ #define STAT_DECR(name) \ bus_dma_ev_##name.ev_count-- BUSDMA_EVCNT_DECL(nbouncebufs); BUSDMA_EVCNT_DECL(loads); BUSDMA_EVCNT_DECL(bounces); #else #define STAT_INCR(x) #define STAT_DECR(x) #endif static int _bus_dmamap_create(bus_dma_tag_t, bus_size_t, int, bus_size_t, bus_size_t, int, bus_dmamap_t *); static void _bus_dmamap_destroy(bus_dma_tag_t, bus_dmamap_t); static int _bus_dmamap_load(bus_dma_tag_t, bus_dmamap_t, void *, bus_size_t, struct proc *, int); static int _bus_dmamap_load_mbuf(bus_dma_tag_t, bus_dmamap_t, struct mbuf *, int); static int _bus_dmamap_load_uio(bus_dma_tag_t, bus_dmamap_t, struct uio *, int); static int _bus_dmamap_load_raw(bus_dma_tag_t, bus_dmamap_t, bus_dma_segment_t *, int, bus_size_t, int); static void _bus_dmamap_unload(bus_dma_tag_t, bus_dmamap_t); static void _bus_dmamap_sync(bus_dma_tag_t, bus_dmamap_t, bus_addr_t, bus_size_t, int); static int _bus_dmamem_alloc(bus_dma_tag_t tag, bus_size_t size, bus_size_t alignment, bus_size_t boundary, bus_dma_segment_t *segs, int nsegs, int *rsegs, int flags); static void _bus_dmamem_free(bus_dma_tag_t tag, bus_dma_segment_t *segs, int nsegs); 
static int _bus_dmamem_map(bus_dma_tag_t tag, bus_dma_segment_t *segs, int nsegs, size_t size, void **kvap, int flags); static void _bus_dmamem_unmap(bus_dma_tag_t tag, void *kva, size_t size); static paddr_t _bus_dmamem_mmap(bus_dma_tag_t tag, bus_dma_segment_t *segs, int nsegs, off_t off, int prot, int flags); static int _bus_dmatag_subregion(bus_dma_tag_t tag, bus_addr_t min_addr, bus_addr_t max_addr, bus_dma_tag_t *newtag, int flags); static void _bus_dmatag_destroy(bus_dma_tag_t tag); static int _bus_dma_uiomove(void *, struct uio *, size_t, int); static int _bus_dma_alloc_bouncebuf(bus_dma_tag_t t, bus_dmamap_t map, bus_size_t size, int flags); static void _bus_dma_free_bouncebuf(bus_dma_tag_t t, bus_dmamap_t map); static int _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf, bus_size_t buflen, struct vmspace *vm, int flags); static int _bus_dmamap_load_busaddr(bus_dma_tag_t, bus_dmamap_t, bus_addr_t, bus_size_t); #ifndef _BUS_DMAMEM_ALLOC_RANGE static int _bus_dmamem_alloc_range(bus_dma_tag_t tag, bus_size_t size, bus_size_t alignment, bus_size_t boundary, bus_dma_segment_t *segs, int nsegs, int *rsegs, int flags, bus_addr_t low, bus_addr_t high); #define _BUS_DMAMEM_ALLOC_RANGE _bus_dmamem_alloc_range /* * Allocate physical memory from the given physical address range. * Called by DMA-safe memory allocation methods. */ static int _bus_dmamem_alloc_range(bus_dma_tag_t t, bus_size_t size, bus_size_t alignment, bus_size_t boundary, bus_dma_segment_t *segs, int nsegs, int *rsegs, int flags, bus_addr_t low, bus_addr_t high) { paddr_t curaddr, lastaddr; struct vm_page *m; struct pglist mlist; int curseg, error; bus_size_t uboundary; /* Always round the size. */ size = round_page(size); KASSERTMSG(boundary >= PAGE_SIZE || boundary == 0, "boundary=0x%"PRIxBUSSIZE, boundary); /* * Allocate pages from the VM system. * We accept boundaries < size, splitting in multiple segments * if needed. uvm_pglistalloc does not, so compute an appropriate * boundary: next power of 2 >= size */ if (boundary == 0) uboundary = 0; else { uboundary = boundary; while (uboundary < size) uboundary = uboundary << 1; } error = uvm_pglistalloc(size, low, high, alignment, uboundary, &mlist, nsegs, (flags & BUS_DMA_NOWAIT) == 0); if (error) return (error); /* * Compute the location, size, and number of segments actually * returned by the VM code. */ m = TAILQ_FIRST(&mlist); curseg = 0; lastaddr = segs[curseg].ds_addr = VM_PAGE_TO_PHYS(m); segs[curseg].ds_len = PAGE_SIZE; m = m->pageq.queue.tqe_next; for (; m != NULL; m = m->pageq.queue.tqe_next) { curaddr = VM_PAGE_TO_PHYS(m); KASSERTMSG(curaddr >= low, "curaddr=%#"PRIxPADDR " low=%#"PRIxBUSADDR" high=%#"PRIxBUSADDR, curaddr, low, high); KASSERTMSG(curaddr < high, "curaddr=%#"PRIxPADDR " low=%#"PRIxBUSADDR" high=%#"PRIxBUSADDR, curaddr, low, high); if (curaddr == (lastaddr + PAGE_SIZE) && (lastaddr & boundary) == (curaddr & boundary)) { segs[curseg].ds_len += PAGE_SIZE; } else { curseg++; KASSERTMSG(curseg < nsegs, "curseg %d size %llx", curseg, (long long)size); segs[curseg].ds_addr = curaddr; segs[curseg].ds_len = PAGE_SIZE; } lastaddr = curaddr; } *rsegs = curseg + 1; return (0); } #endif /* _BUS_DMAMEM_ALLOC_RANGE */ /* * Create a DMA map. 
*/ static int _bus_dmamap_create(bus_dma_tag_t t, bus_size_t size, int nsegments, bus_size_t maxsegsz, bus_size_t boundary, int flags, bus_dmamap_t *dmamp) { struct x86_bus_dma_cookie *cookie; bus_dmamap_t map; int error, cookieflags; void *cookiestore, *mapstore; size_t cookiesize, mapsize; /* * Allocate and initialize the DMA map. The end of the map * is a variable-sized array of segments, so we allocate enough * room for them in one shot. * * Note we don't preserve the WAITOK or NOWAIT flags. Preservation * of ALLOCNOW notifies others that we've reserved these resources, * and they are not to be freed. * * The bus_dmamap_t includes one bus_dma_segment_t, hence * the (nsegments - 1). */ error = 0; mapsize = sizeof(struct x86_bus_dmamap) + (sizeof(bus_dma_segment_t) * (nsegments - 1)); if ((mapstore = malloc(mapsize, M_DMAMAP, M_ZERO | ((flags & BUS_DMA_NOWAIT) ? M_NOWAIT : M_WAITOK))) == NULL) return (ENOMEM); map = (struct x86_bus_dmamap *)mapstore; map->_dm_size = size; map->_dm_segcnt = nsegments; map->_dm_maxmaxsegsz = maxsegsz; map->_dm_boundary = boundary; map->_dm_bounce_thresh = t->_bounce_thresh; map->_dm_flags = flags & ~(BUS_DMA_WAITOK|BUS_DMA_NOWAIT); map->dm_maxsegsz = maxsegsz; map->dm_mapsize = 0; /* no valid mappings */ map->dm_nsegs = 0; if (t->_bounce_thresh == 0 || _BUS_AVAIL_END <= t->_bounce_thresh - 1) map->_dm_bounce_thresh = 0; cookieflags = 0; if (t->_may_bounce != NULL) { error = t->_may_bounce(t, map, flags, &cookieflags); if (error != 0) goto out; } if (map->_dm_bounce_thresh != 0) cookieflags |= X86_DMA_MIGHT_NEED_BOUNCE; if ((cookieflags & X86_DMA_MIGHT_NEED_BOUNCE) == 0) { *dmamp = map; return 0; } cookiesize = sizeof(struct x86_bus_dma_cookie) + (sizeof(bus_dma_segment_t) * map->_dm_segcnt); /* * Allocate our cookie. */ if ((cookiestore = malloc(cookiesize, M_DMAMAP, M_ZERO | ((flags & BUS_DMA_NOWAIT) ? M_NOWAIT : M_WAITOK))) == NULL) { error = ENOMEM; goto out; } cookie = (struct x86_bus_dma_cookie *)cookiestore; cookie->id_flags = cookieflags; map->_dm_cookie = cookie; error = _bus_dma_alloc_bouncebuf(t, map, size, flags); out: if (error) _bus_dmamap_destroy(t, map); else *dmamp = map; return (error); } /* * Destroy a DMA map. */ static void _bus_dmamap_destroy(bus_dma_tag_t t, bus_dmamap_t map) { struct x86_bus_dma_cookie *cookie = map->_dm_cookie; /* * Free any bounce pages this map might hold. */ if (cookie != NULL) { if (cookie->id_flags & X86_DMA_HAS_BOUNCE) _bus_dma_free_bouncebuf(t, map); free(cookie, M_DMAMAP); } free(map, M_DMAMAP); } /* * Load a DMA map with a linear buffer. */ static int _bus_dmamap_load(bus_dma_tag_t t, bus_dmamap_t map, void *buf, bus_size_t buflen, struct proc *p, int flags) { struct x86_bus_dma_cookie *cookie = map->_dm_cookie; int error; struct vmspace *vm; STAT_INCR(loads); /* * Make sure that on error condition we return "no valid mappings." */ map->dm_mapsize = 0; map->dm_nsegs = 0; KASSERTMSG(map->dm_maxsegsz <= map->_dm_maxmaxsegsz, "maxsegsz=0x%"PRIxBUSSIZE", maxmaxsegsz=0x%"PRIxBUSSIZE, map->dm_maxsegsz, map->_dm_maxmaxsegsz); if (buflen > map->_dm_size) return EINVAL; if (p != NULL) { vm = p->p_vmspace; } else { vm = vmspace_kernel(); } error = _bus_dmamap_load_buffer(t, map, buf, buflen, vm, flags); if (error == 0) { if (cookie != NULL) cookie->id_flags &= ~X86_DMA_IS_BOUNCING; map->dm_mapsize = buflen; return 0; } if (cookie == NULL || (cookie->id_flags & X86_DMA_MIGHT_NEED_BOUNCE) == 0) return error; /* * First attempt failed; bounce it. */ STAT_INCR(bounces); /* * Allocate bounce pages, if necessary. 
*/ if ((cookie->id_flags & X86_DMA_HAS_BOUNCE) == 0) { error = _bus_dma_alloc_bouncebuf(t, map, buflen, flags); if (error) return (error); } /* * Cache a pointer to the caller's buffer and load the DMA map * with the bounce buffer. */ cookie->id_origbuf = buf; cookie->id_origbuflen = buflen; cookie->id_buftype = X86_DMA_BUFTYPE_LINEAR; map->dm_nsegs = 0; error = bus_dmamap_load(t, map, cookie->id_bouncebuf, buflen, p, flags); if (error) return (error); /* ...so _bus_dmamap_sync() knows we're bouncing */ cookie->id_flags |= X86_DMA_IS_BOUNCING; return (0); } static int _bus_dmamap_load_busaddr(bus_dma_tag_t t, bus_dmamap_t map, bus_addr_t addr, bus_size_t size) { bus_dma_segment_t * const segs = map->dm_segs; int nseg = map->dm_nsegs; bus_addr_t bmask = ~(map->_dm_boundary - 1); bus_addr_t lastaddr = 0xdead; /* XXX gcc */ bus_size_t sgsize; if (nseg > 0) lastaddr = segs[nseg-1].ds_addr + segs[nseg-1].ds_len; again: sgsize = size; /* * Make sure we don't cross any boundaries. */ if (map->_dm_boundary > 0) { bus_addr_t baddr; /* next boundary address */ baddr = (addr + map->_dm_boundary) & bmask; if (sgsize > (baddr - addr)) sgsize = (baddr - addr); } /* * Insert chunk into a segment, coalescing with * previous segment if possible. */ if (nseg > 0 && addr == lastaddr && segs[nseg-1].ds_len + sgsize <= map->dm_maxsegsz && (map->_dm_boundary == 0 || (segs[nseg-1].ds_addr & bmask) == (addr & bmask))) { /* coalesce */ segs[nseg-1].ds_len += sgsize; } else if (nseg >= map->_dm_segcnt) { return EFBIG; } else { /* new segment */ segs[nseg].ds_addr = addr; segs[nseg].ds_len = sgsize; nseg++; } lastaddr = addr + sgsize; if (map->_dm_bounce_thresh != 0 && lastaddr > map->_dm_bounce_thresh) return EINVAL; addr += sgsize; size -= sgsize; if (size > 0) goto again; map->dm_nsegs = nseg; return 0; } /* * Like _bus_dmamap_load(), but for mbufs. */ static int _bus_dmamap_load_mbuf(bus_dma_tag_t t, bus_dmamap_t map, struct mbuf *m0, int flags) { struct x86_bus_dma_cookie *cookie = map->_dm_cookie; int error; struct mbuf *m; /* * Make sure on error condition we return "no valid mappings." 
*/ map->dm_mapsize = 0; map->dm_nsegs = 0; KASSERTMSG(map->dm_maxsegsz <= map->_dm_maxmaxsegsz, "maxsegsz=0x%"PRIxBUSSIZE", maxmaxsegsz=0x%"PRIxBUSSIZE, map->dm_maxsegsz, map->_dm_maxmaxsegsz); KASSERTMSG(m0->m_flags & M_PKTHDR, "m0=%p m_flags=0x%x", m0, m0->m_flags); if (m0->m_pkthdr.len > map->_dm_size) return (EINVAL); error = 0; for (m = m0; m != NULL && error == 0; m = m->m_next) { int offset; int remainbytes; const struct vm_page * const *pgs; paddr_t paddr; int size; if (m->m_len == 0) continue; switch (m->m_flags & (M_EXT|M_EXT_CLUSTER|M_EXT_PAGES)) { case M_EXT|M_EXT_CLUSTER: /* XXX KDASSERT */ KASSERT(m->m_ext.ext_paddr != M_PADDR_INVALID); paddr = m->m_ext.ext_paddr + (m->m_data - m->m_ext.ext_buf); size = m->m_len; error = _bus_dmamap_load_busaddr(t, map, _BUS_PHYS_TO_BUS(paddr), size); break; case M_EXT|M_EXT_PAGES: KASSERTMSG(m->m_ext.ext_buf <= m->m_data, "m=%p m_ext.ext_buf=%p m_ext.ext_size=%zu" " m_data=%p", m, m->m_ext.ext_buf, m->m_ext.ext_size, m->m_data); KASSERTMSG((m->m_data <= m->m_ext.ext_buf + m->m_ext.ext_size), "m=%p m_ext.ext_buf=%p m_ext.ext_size=%zu" " m_data=%p", m, m->m_ext.ext_buf, m->m_ext.ext_size, m->m_data); offset = (vaddr_t)m->m_data - trunc_page((vaddr_t)m->m_ext.ext_buf); remainbytes = m->m_len; /* skip uninteresting pages */ pgs = (const struct vm_page * const *) m->m_ext.ext_pgs + (offset >> PAGE_SHIFT); offset &= PAGE_MASK; /* offset in the first page */ /* load each pages */ while (remainbytes > 0) { const struct vm_page *pg; bus_addr_t busaddr; size = MIN(remainbytes, PAGE_SIZE - offset); pg = *pgs++; KASSERT(pg); busaddr = _BUS_VM_PAGE_TO_BUS(pg) + offset; error = _bus_dmamap_load_busaddr(t, map, busaddr, size); if (error) break; offset = 0; remainbytes -= size; } break; case 0: paddr = m->m_paddr + M_BUFOFFSET(m) + (m->m_data - M_BUFADDR(m)); size = m->m_len; error = _bus_dmamap_load_busaddr(t, map, _BUS_PHYS_TO_BUS(paddr), size); break; default: error = _bus_dmamap_load_buffer(t, map, m->m_data, m->m_len, vmspace_kernel(), flags); } } if (error == 0) { map->dm_mapsize = m0->m_pkthdr.len; return 0; } map->dm_nsegs = 0; if (cookie == NULL || (cookie->id_flags & X86_DMA_MIGHT_NEED_BOUNCE) == 0) return error; /* * First attempt failed; bounce it. */ STAT_INCR(bounces); /* * Allocate bounce pages, if necessary. */ if ((cookie->id_flags & X86_DMA_HAS_BOUNCE) == 0) { error = _bus_dma_alloc_bouncebuf(t, map, m0->m_pkthdr.len, flags); if (error) return (error); } /* * Cache a pointer to the caller's buffer and load the DMA map * with the bounce buffer. */ cookie->id_origbuf = m0; cookie->id_origbuflen = m0->m_pkthdr.len; /* not really used */ cookie->id_buftype = X86_DMA_BUFTYPE_MBUF; error = bus_dmamap_load(t, map, cookie->id_bouncebuf, m0->m_pkthdr.len, NULL, flags); if (error) return (error); /* ...so _bus_dmamap_sync() knows we're bouncing */ cookie->id_flags |= X86_DMA_IS_BOUNCING; return (0); } /* * Like _bus_dmamap_load(), but for uios. */ static int _bus_dmamap_load_uio(bus_dma_tag_t t, bus_dmamap_t map, struct uio *uio, int flags) { int i, error; bus_size_t minlen, resid; struct vmspace *vm; struct iovec *iov; void *addr; struct x86_bus_dma_cookie *cookie = map->_dm_cookie; /* * Make sure that on error condition we return "no valid mappings." 
*/ map->dm_mapsize = 0; map->dm_nsegs = 0; KASSERTMSG(map->dm_maxsegsz <= map->_dm_maxmaxsegsz, "maxsegsz=0x%"PRIxBUSSIZE", maxmaxsegsz=0x%"PRIxBUSSIZE, map->dm_maxsegsz, map->_dm_maxmaxsegsz); resid = uio->uio_resid; iov = uio->uio_iov; vm = uio->uio_vmspace; error = 0; for (i = 0; i < uio->uio_iovcnt && resid != 0 && error == 0; i++) { /* * Now at the first iovec to load. Load each iovec * until we have exhausted the residual count. */ minlen = resid < iov[i].iov_len ? resid : iov[i].iov_len; addr = (void *)iov[i].iov_base; error = _bus_dmamap_load_buffer(t, map, addr, minlen, vm, flags); resid -= minlen; } if (error == 0) { map->dm_mapsize = uio->uio_resid; return 0; } map->dm_nsegs = 0; if (cookie == NULL || (cookie->id_flags & X86_DMA_MIGHT_NEED_BOUNCE) == 0) return error; STAT_INCR(bounces); /* * Allocate bounce pages, if necessary. */ if ((cookie->id_flags & X86_DMA_HAS_BOUNCE) == 0) { error = _bus_dma_alloc_bouncebuf(t, map, uio->uio_resid, flags); if (error) return (error); } /* * Cache a pointer to the caller's buffer and load the DMA map * with the bounce buffer. */ cookie->id_origbuf = uio; cookie->id_origbuflen = uio->uio_resid; cookie->id_buftype = X86_DMA_BUFTYPE_UIO; error = bus_dmamap_load(t, map, cookie->id_bouncebuf, uio->uio_resid, NULL, flags); if (error) return (error); /* ...so _bus_dmamap_sync() knows we're bouncing */ cookie->id_flags |= X86_DMA_IS_BOUNCING; return (0); } /* * Like _bus_dmamap_load(), but for raw memory allocated with * bus_dmamem_alloc(). */ static int _bus_dmamap_load_raw(bus_dma_tag_t t, bus_dmamap_t map, bus_dma_segment_t *segs, int nsegs, bus_size_t size0, int flags) { bus_size_t size; int i, error = 0; /* * Make sure that on error condition we return "no valid mappings." */ map->dm_mapsize = 0; map->dm_nsegs = 0; KASSERTMSG(map->dm_maxsegsz <= map->_dm_maxmaxsegsz, "maxsegsz=0x%"PRIxBUSSIZE", maxmaxsegsz=0x%"PRIxBUSSIZE, map->dm_maxsegsz, map->_dm_maxmaxsegsz); if (size0 > map->_dm_size) return EINVAL; for (i = 0, size = size0; i < nsegs && size > 0; i++) { bus_dma_segment_t *ds = &segs[i]; bus_size_t sgsize; sgsize = MIN(ds->ds_len, size); if (sgsize == 0) continue; error = _bus_dmamap_load_busaddr(t, map, ds->ds_addr, sgsize); if (error != 0) break; size -= sgsize; } if (error != 0) { map->dm_mapsize = 0; map->dm_nsegs = 0; return error; } /* XXX TBD bounce */ map->dm_mapsize = size0; return 0; } /* * Unload a DMA map. */ static void _bus_dmamap_unload(bus_dma_tag_t t, bus_dmamap_t map) { struct x86_bus_dma_cookie *cookie = map->_dm_cookie; /* * If we have bounce pages, free them, unless they're * reserved for our exclusive use. */ if (cookie != NULL) { cookie->id_flags &= ~X86_DMA_IS_BOUNCING; cookie->id_buftype = X86_DMA_BUFTYPE_INVALID; } map->dm_maxsegsz = map->_dm_maxmaxsegsz; map->dm_mapsize = 0; map->dm_nsegs = 0; } /* * Synchronize a DMA map. * * Reference: * * AMD64 Architecture Programmer's Manual, Volume 2: System * Programming, 24593--Rev. 3.38--November 2021, Sec. 7.4.2 Memory * Barrier Interaction with Memory Types, Table 7-3, p. 196. * https://web.archive.org/web/20220625040004/https://www.amd.com/system/files/TechDocs/24593.pdf#page=256 */ static void _bus_dmamap_sync(bus_dma_tag_t t, bus_dmamap_t map, bus_addr_t offset, bus_size_t len, int ops) { struct x86_bus_dma_cookie *cookie = map->_dm_cookie; /* * Mixing PRE and POST operations is not allowed. 
*/ if ((ops & (BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE)) != 0 && (ops & (BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE)) != 0) panic("%s: mix PRE and POST", __func__); if ((ops & (BUS_DMASYNC_PREWRITE|BUS_DMASYNC_POSTREAD)) != 0) { KASSERTMSG(offset < map->dm_mapsize, "bad offset 0x%"PRIxBUSADDR" >= 0x%"PRIxBUSSIZE, offset, map->dm_mapsize); KASSERTMSG(len <= map->dm_mapsize - offset, "bad length 0x%"PRIxBUSADDR" + 0x%"PRIxBUSSIZE " > 0x%"PRIxBUSSIZE, offset, len, map->dm_mapsize); } /* * BUS_DMASYNC_POSTREAD: The caller has been alerted to DMA * completion by reading a register or DMA descriptor, and the * caller is about to read out of the DMA memory buffer that * the device just filled. * * => LFENCE ensures that these happen in order so that the * caller, or the bounce buffer logic here, doesn't proceed * to read any stale data from cache or speculation. x86 * never reorders loads from wp/wt/wb or uc memory, but it * may execute loads from wc/wc+ memory early, e.g. with * BUS_SPACE_MAP_PREFETCHABLE. */ if (ops & BUS_DMASYNC_POSTREAD) x86_lfence(); /* * If we're not bouncing, just return; nothing to do. */ if (len == 0 || cookie == NULL || (cookie->id_flags & X86_DMA_IS_BOUNCING) == 0) goto end; switch (cookie->id_buftype) { case X86_DMA_BUFTYPE_LINEAR: /* * Nothing to do for pre-read. */ if (ops & BUS_DMASYNC_PREWRITE) { /* * Copy the caller's buffer to the bounce buffer. */ memcpy((char *)cookie->id_bouncebuf + offset, (char *)cookie->id_origbuf + offset, len); } if (ops & BUS_DMASYNC_POSTREAD) { /* * Copy the bounce buffer to the caller's buffer. */ memcpy((char *)cookie->id_origbuf + offset, (char *)cookie->id_bouncebuf + offset, len); } /* * Nothing to do for post-write. */ break; case X86_DMA_BUFTYPE_MBUF: { struct mbuf *m, *m0 = cookie->id_origbuf; bus_size_t minlen, moff; /* * Nothing to do for pre-read. */ if (ops & BUS_DMASYNC_PREWRITE) { /* * Copy the caller's buffer to the bounce buffer. */ m_copydata(m0, offset, len, (char *)cookie->id_bouncebuf + offset); } if (ops & BUS_DMASYNC_POSTREAD) { /* * Copy the bounce buffer to the caller's buffer. */ for (moff = offset, m = m0; m != NULL && len != 0; m = m->m_next) { /* Find the beginning mbuf. */ if (moff >= m->m_len) { moff -= m->m_len; continue; } /* * Now at the first mbuf to sync; nail * each one until we have exhausted the * length. */ minlen = len < m->m_len - moff ? len : m->m_len - moff; memcpy(mtod(m, char *) + moff, (char *)cookie->id_bouncebuf + offset, minlen); moff = 0; len -= minlen; offset += minlen; } } /* * Nothing to do for post-write. */ break; } case X86_DMA_BUFTYPE_UIO: { struct uio *uio; uio = (struct uio *)cookie->id_origbuf; /* * Nothing to do for pre-read. */ if (ops & BUS_DMASYNC_PREWRITE) { /* * Copy the caller's buffer to the bounce buffer. */ _bus_dma_uiomove((char *)cookie->id_bouncebuf + offset, uio, len, UIO_WRITE); } if (ops & BUS_DMASYNC_POSTREAD) { _bus_dma_uiomove((char *)cookie->id_bouncebuf + offset, uio, len, UIO_READ); } /* * Nothing to do for post-write. */ break; } case X86_DMA_BUFTYPE_RAW: panic("%s: X86_DMA_BUFTYPE_RAW", __func__); break; case X86_DMA_BUFTYPE_INVALID: panic("%s: X86_DMA_BUFTYPE_INVALID", __func__); break; default: panic("%s: unknown buffer type %d", __func__, cookie->id_buftype); break; } end: /* * BUS_DMASYNC_PREREAD: The caller may have previously been * using a DMA memory buffer, with loads and stores, and is * about to trigger DMA by writing to a register or DMA * descriptor. 
* * => SFENCE ensures that the stores happen in order, in case * the latter one is non-temporal or to wc/wc+ memory and * thus may be executed early. x86 never reorders * load;store to store;load for any memory type, so no * barrier is needed for prior loads. * * BUS_DMASYNC_PREWRITE: The caller has just written to a DMA * memory buffer, or we just wrote to to the bounce buffer, * data that the device needs to use, and the caller is about * to trigger DMA by writing to a register or DMA descriptor. * * => SFENCE ensures that these happen in order so that any * buffered stores are visible to the device before the DMA * is triggered. x86 never reorders (non-temporal) stores * to wp/wt/wb or uc memory, but it may reorder two stores * if one is to wc/wc+ memory, e.g. if the DMA descriptor is * mapped with BUS_SPACE_MAP_PREFETCHABLE. */ if (ops & (BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE)) x86_sfence(); /* * BUS_DMASYNC_POSTWRITE: The caller has been alerted to DMA * completion by reading a register or DMA descriptor, and the * caller may proceed to reuse the DMA memory buffer, with * loads and stores. * * => No barrier is needed. Since the DMA memory buffer is not * changing (we're sending data to the device, not receiving * data from the device), prefetched loads are safe. x86 * never reoreders load;store to store;load for any memory * type, so early execution of stores prior to witnessing * the DMA completion is not possible. */ } /* * Allocate memory safe for DMA. */ static int _bus_dmamem_alloc(bus_dma_tag_t t, bus_size_t size, bus_size_t alignment, bus_size_t boundary, bus_dma_segment_t *segs, int nsegs, int *rsegs, int flags) { bus_addr_t high; if (t->_bounce_alloc_hi != 0 && _BUS_AVAIL_END > t->_bounce_alloc_hi - 1) high = t->_bounce_alloc_hi - 1; else high = _BUS_AVAIL_END; return (_BUS_DMAMEM_ALLOC_RANGE(t, size, alignment, boundary, segs, nsegs, rsegs, flags, t->_bounce_alloc_lo, high)); } static int _bus_dma_alloc_bouncebuf(bus_dma_tag_t t, bus_dmamap_t map, bus_size_t size, int flags) { struct x86_bus_dma_cookie *cookie = map->_dm_cookie; int error = 0; KASSERT(cookie != NULL); cookie->id_bouncebuflen = round_page(size); error = _bus_dmamem_alloc(t, cookie->id_bouncebuflen, PAGE_SIZE, map->_dm_boundary, cookie->id_bouncesegs, map->_dm_segcnt, &cookie->id_nbouncesegs, flags); if (error) { cookie->id_bouncebuflen = 0; cookie->id_nbouncesegs = 0; return error; } error = _bus_dmamem_map(t, cookie->id_bouncesegs, cookie->id_nbouncesegs, cookie->id_bouncebuflen, (void **)&cookie->id_bouncebuf, flags); if (error) { _bus_dmamem_free(t, cookie->id_bouncesegs, cookie->id_nbouncesegs); cookie->id_bouncebuflen = 0; cookie->id_nbouncesegs = 0; } else { cookie->id_flags |= X86_DMA_HAS_BOUNCE; STAT_INCR(nbouncebufs); } return (error); } static void _bus_dma_free_bouncebuf(bus_dma_tag_t t, bus_dmamap_t map) { struct x86_bus_dma_cookie *cookie = map->_dm_cookie; KASSERT(cookie != NULL); STAT_DECR(nbouncebufs); _bus_dmamem_unmap(t, cookie->id_bouncebuf, cookie->id_bouncebuflen); _bus_dmamem_free(t, cookie->id_bouncesegs, cookie->id_nbouncesegs); cookie->id_bouncebuflen = 0; cookie->id_nbouncesegs = 0; cookie->id_flags &= ~X86_DMA_HAS_BOUNCE; } /* * This function does the same as uiomove, but takes an explicit * direction, and does not update the uio structure. 
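 *
 * UIO_READ copies from the supplied buffer into the uio's iovecs
 * (e.g. returning bounced device data to the caller); UIO_WRITE
 * copies from the iovecs into the buffer.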
*/ static int _bus_dma_uiomove(void *buf, struct uio *uio, size_t n, int direction) { struct iovec *iov; int error; struct vmspace *vm; char *cp; size_t resid, cnt; int i; iov = uio->uio_iov; vm = uio->uio_vmspace; cp = buf; resid = n; for (i = 0; i < uio->uio_iovcnt && resid > 0; i++) { iov = &uio->uio_iov[i]; if (iov->iov_len == 0) continue; cnt = MIN(resid, iov->iov_len); if (!VMSPACE_IS_KERNEL_P(vm)) { preempt_point(); } if (direction == UIO_READ) { error = copyout_vmspace(vm, cp, iov->iov_base, cnt); } else { error = copyin_vmspace(vm, iov->iov_base, cp, cnt); } if (error) return (error); cp += cnt; resid -= cnt; } return (0); } /* * Common function for freeing DMA-safe memory. May be called by * bus-specific DMA memory free functions. */ static void _bus_dmamem_free(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs) { struct vm_page *m; bus_addr_t addr; struct pglist mlist; int curseg; /* * Build a list of pages to free back to the VM system. */ TAILQ_INIT(&mlist); for (curseg = 0; curseg < nsegs; curseg++) { for (addr = segs[curseg].ds_addr; addr < (segs[curseg].ds_addr + segs[curseg].ds_len); addr += PAGE_SIZE) { m = _BUS_BUS_TO_VM_PAGE(addr); TAILQ_INSERT_TAIL(&mlist, m, pageq.queue); } } uvm_pglistfree(&mlist); } /* * Common function for mapping DMA-safe memory. May be called by * bus-specific DMA memory map functions. * This supports BUS_DMA_NOCACHE. */ static int _bus_dmamem_map(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs, size_t size, void **kvap, int flags) { vaddr_t va; bus_addr_t addr; int curseg; const uvm_flag_t kmflags = (flags & BUS_DMA_NOWAIT) != 0 ? UVM_KMF_NOWAIT : 0; u_int pmapflags = PMAP_WIRED | VM_PROT_READ | VM_PROT_WRITE; size = round_page(size); if (flags & BUS_DMA_NOCACHE) pmapflags |= PMAP_NOCACHE; va = uvm_km_alloc(kernel_map, size, 0, UVM_KMF_VAONLY | kmflags); if (va == 0) return ENOMEM; *kvap = (void *)va; for (curseg = 0; curseg < nsegs; curseg++) { for (addr = segs[curseg].ds_addr; addr < (segs[curseg].ds_addr + segs[curseg].ds_len); addr += PAGE_SIZE, va += PAGE_SIZE, size -= PAGE_SIZE) { if (size == 0) panic("_bus_dmamem_map: size botch"); _BUS_PMAP_ENTER(pmap_kernel(), va, addr, VM_PROT_READ | VM_PROT_WRITE, pmapflags); } } pmap_update(pmap_kernel()); return 0; } /* * Common function for unmapping DMA-safe memory. May be called by * bus-specific DMA memory unmapping functions. */ static void _bus_dmamem_unmap(bus_dma_tag_t t, void *kva, size_t size) { pt_entry_t *pte, opte; vaddr_t va, sva, eva; KASSERTMSG(((uintptr_t)kva & PGOFSET) == 0, "kva=%p", kva); size = round_page(size); sva = (vaddr_t)kva; eva = sva + size; /* * mark pages cacheable again. */ for (va = sva; va < eva; va += PAGE_SIZE) { pte = kvtopte(va); opte = *pte; if ((opte & PTE_PCD) != 0) pmap_pte_clearbits(pte, PTE_PCD); } pmap_remove(pmap_kernel(), (vaddr_t)kva, (vaddr_t)kva + size); pmap_update(pmap_kernel()); uvm_km_free(kernel_map, (vaddr_t)kva, size, UVM_KMF_VAONLY); } /* * Common function for mmap(2)'ing DMA-safe memory. May be called by * bus-specific DMA mmap(2)'ing functions. 
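 *
 * The return value is the page frame (in x86_btop() units) backing
 * the given offset into the segment list, or -1 if the offset lies
 * beyond the end of the segments.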
*/ static paddr_t _bus_dmamem_mmap(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs, off_t off, int prot, int flags) { int i; for (i = 0; i < nsegs; i++) { KASSERTMSG((off & PGOFSET) == 0, "off=0x%jx", (uintmax_t)off); KASSERTMSG((segs[i].ds_addr & PGOFSET) == 0, "segs[%u].ds_addr=%"PRIxBUSADDR, i, segs[i].ds_addr); KASSERTMSG((segs[i].ds_len & PGOFSET) == 0, "segs[%u].ds_len=%"PRIxBUSSIZE, i, segs[i].ds_len); if (off >= segs[i].ds_len) { off -= segs[i].ds_len; continue; } return (x86_btop(_BUS_BUS_TO_PHYS(segs[i].ds_addr + off))); } /* Page not found. */ return (-1); } /********************************************************************** * DMA utility functions **********************************************************************/ /* * Utility function to load a linear buffer. */ static int _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf, bus_size_t buflen, struct vmspace *vm, int flags) { bus_size_t sgsize; bus_addr_t curaddr; vaddr_t vaddr = (vaddr_t)buf; pmap_t pmap; if (vm != NULL) pmap = vm_map_pmap(&vm->vm_map); else pmap = pmap_kernel(); while (buflen > 0) { int error; /* * Get the bus address for this segment. */ curaddr = _BUS_VIRT_TO_BUS(pmap, vaddr); /* * Compute the segment size, and adjust counts. */ sgsize = PAGE_SIZE - ((u_long)vaddr & PGOFSET); if (buflen < sgsize) sgsize = buflen; /* * If we're beyond the bounce threshold, notify * the caller. */ if (map->_dm_bounce_thresh != 0 && curaddr + sgsize >= map->_dm_bounce_thresh) return (EINVAL); error = _bus_dmamap_load_busaddr(t, map, curaddr, sgsize); if (error) return error; vaddr += sgsize; buflen -= sgsize; } return (0); } static int _bus_dmatag_subregion(bus_dma_tag_t tag, bus_addr_t min_addr, bus_addr_t max_addr, bus_dma_tag_t *newtag, int flags) { if ((tag->_bounce_thresh != 0 && max_addr >= tag->_bounce_thresh - 1) && (tag->_bounce_alloc_hi != 0 && max_addr >= tag->_bounce_alloc_hi - 1) && (min_addr <= tag->_bounce_alloc_lo)) { *newtag = tag; /* if the tag must be freed, add a reference */ if (tag->_tag_needs_free) (tag->_tag_needs_free)++; return 0; } if ((*newtag = malloc(sizeof(struct x86_bus_dma_tag), M_DMAMAP, (flags & BUS_DMA_NOWAIT) ? 
M_NOWAIT : M_WAITOK)) == NULL) return ENOMEM; **newtag = *tag; (*newtag)->_tag_needs_free = 1; if (tag->_bounce_thresh == 0 || max_addr < tag->_bounce_thresh) (*newtag)->_bounce_thresh = max_addr; if (tag->_bounce_alloc_hi == 0 || max_addr < tag->_bounce_alloc_hi) (*newtag)->_bounce_alloc_hi = max_addr; if (min_addr > tag->_bounce_alloc_lo) (*newtag)->_bounce_alloc_lo = min_addr; return 0; } static void _bus_dmatag_destroy(bus_dma_tag_t tag) { switch (tag->_tag_needs_free) { case 0: break; /* not allocated with malloc */ case 1: free(tag, M_DMAMAP); /* last reference to tag */ break; default: (tag->_tag_needs_free)--; /* one less reference */ } } void bus_dmamap_sync(bus_dma_tag_t t, bus_dmamap_t p, bus_addr_t o, bus_size_t l, int ops) { bus_dma_tag_t it; kasan_dma_sync(p, o, l, ops); kmsan_dma_sync(p, o, l, ops); if ((t->bdt_exists & BUS_DMAMAP_OVERRIDE_SYNC) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMAP_OVERRIDE_SYNC) == 0) continue; (*it->bdt_ov->ov_dmamap_sync)(it->bdt_ctx, t, p, o, l, ops); return; } _bus_dmamap_sync(t, p, o, l, ops); } int bus_dmamap_create(bus_dma_tag_t t, bus_size_t size, int nsegments, bus_size_t maxsegsz, bus_size_t boundary, int flags, bus_dmamap_t *dmamp) { bus_dma_tag_t it; if ((t->bdt_exists & BUS_DMAMAP_OVERRIDE_CREATE) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMAP_OVERRIDE_CREATE) == 0) continue; return (*it->bdt_ov->ov_dmamap_create)(it->bdt_ctx, t, size, nsegments, maxsegsz, boundary, flags, dmamp); } return _bus_dmamap_create(t, size, nsegments, maxsegsz, boundary, flags, dmamp); } void bus_dmamap_destroy(bus_dma_tag_t t, bus_dmamap_t dmam) { bus_dma_tag_t it; if ((t->bdt_exists & BUS_DMAMAP_OVERRIDE_DESTROY) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMAP_OVERRIDE_DESTROY) == 0) continue; (*it->bdt_ov->ov_dmamap_destroy)(it->bdt_ctx, t, dmam); return; } _bus_dmamap_destroy(t, dmam); } int bus_dmamap_load(bus_dma_tag_t t, bus_dmamap_t dmam, void *buf, bus_size_t buflen, struct proc *p, int flags) { bus_dma_tag_t it; kasan_dma_load(dmam, buf, buflen, KASAN_DMA_LINEAR); kmsan_dma_load(dmam, buf, buflen, KMSAN_DMA_LINEAR); if ((t->bdt_exists & BUS_DMAMAP_OVERRIDE_LOAD) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMAP_OVERRIDE_LOAD) == 0) continue; return (*it->bdt_ov->ov_dmamap_load)(it->bdt_ctx, t, dmam, buf, buflen, p, flags); } return _bus_dmamap_load(t, dmam, buf, buflen, p, flags); } int bus_dmamap_load_mbuf(bus_dma_tag_t t, bus_dmamap_t dmam, struct mbuf *chain, int flags) { bus_dma_tag_t it; kasan_dma_load(dmam, chain, 0, KASAN_DMA_MBUF); kmsan_dma_load(dmam, chain, 0, KMSAN_DMA_MBUF); if ((t->bdt_exists & BUS_DMAMAP_OVERRIDE_LOAD_MBUF) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMAP_OVERRIDE_LOAD_MBUF) == 0) continue; return (*it->bdt_ov->ov_dmamap_load_mbuf)(it->bdt_ctx, t, dmam, chain, flags); } return _bus_dmamap_load_mbuf(t, dmam, chain, flags); } int bus_dmamap_load_uio(bus_dma_tag_t t, bus_dmamap_t dmam, struct uio *uio, int flags) { bus_dma_tag_t it; kasan_dma_load(dmam, uio, 0, KASAN_DMA_UIO); kmsan_dma_load(dmam, uio, 0, KMSAN_DMA_UIO); if ((t->bdt_exists & BUS_DMAMAP_OVERRIDE_LOAD_UIO) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & 
BUS_DMAMAP_OVERRIDE_LOAD_UIO) == 0) continue; return (*it->bdt_ov->ov_dmamap_load_uio)(it->bdt_ctx, t, dmam, uio, flags); } return _bus_dmamap_load_uio(t, dmam, uio, flags); } int bus_dmamap_load_raw(bus_dma_tag_t t, bus_dmamap_t dmam, bus_dma_segment_t *segs, int nsegs, bus_size_t size, int flags) { bus_dma_tag_t it; kasan_dma_load(dmam, NULL, 0, KASAN_DMA_RAW); kmsan_dma_load(dmam, NULL, 0, KMSAN_DMA_RAW); if ((t->bdt_exists & BUS_DMAMAP_OVERRIDE_LOAD_RAW) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMAP_OVERRIDE_LOAD_RAW) == 0) continue; return (*it->bdt_ov->ov_dmamap_load_raw)(it->bdt_ctx, t, dmam, segs, nsegs, size, flags); } return _bus_dmamap_load_raw(t, dmam, segs, nsegs, size, flags); } void bus_dmamap_unload(bus_dma_tag_t t, bus_dmamap_t dmam) { bus_dma_tag_t it; if ((t->bdt_exists & BUS_DMAMAP_OVERRIDE_UNLOAD) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMAP_OVERRIDE_UNLOAD) == 0) continue; (*it->bdt_ov->ov_dmamap_unload)(it->bdt_ctx, t, dmam); return; } _bus_dmamap_unload(t, dmam); } int bus_dmamem_alloc(bus_dma_tag_t t, bus_size_t size, bus_size_t alignment, bus_size_t boundary, bus_dma_segment_t *segs, int nsegs, int *rsegs, int flags) { bus_dma_tag_t it; if ((t->bdt_exists & BUS_DMAMEM_OVERRIDE_ALLOC) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMEM_OVERRIDE_ALLOC) == 0) continue; return (*it->bdt_ov->ov_dmamem_alloc)(it->bdt_ctx, t, size, alignment, boundary, segs, nsegs, rsegs, flags); } return _bus_dmamem_alloc(t, size, alignment, boundary, segs, nsegs, rsegs, flags); } void bus_dmamem_free(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs) { bus_dma_tag_t it; if ((t->bdt_exists & BUS_DMAMEM_OVERRIDE_FREE) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMEM_OVERRIDE_FREE) == 0) continue; (*it->bdt_ov->ov_dmamem_free)(it->bdt_ctx, t, segs, nsegs); return; } _bus_dmamem_free(t, segs, nsegs); } int bus_dmamem_map(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs, size_t size, void **kvap, int flags) { bus_dma_tag_t it; if ((t->bdt_exists & BUS_DMAMEM_OVERRIDE_MAP) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMEM_OVERRIDE_MAP) == 0) continue; return (*it->bdt_ov->ov_dmamem_map)(it->bdt_ctx, t, segs, nsegs, size, kvap, flags); } return _bus_dmamem_map(t, segs, nsegs, size, kvap, flags); } void bus_dmamem_unmap(bus_dma_tag_t t, void *kva, size_t size) { bus_dma_tag_t it; if ((t->bdt_exists & BUS_DMAMEM_OVERRIDE_UNMAP) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMEM_OVERRIDE_UNMAP) == 0) continue; (*it->bdt_ov->ov_dmamem_unmap)(it->bdt_ctx, t, kva, size); return; } _bus_dmamem_unmap(t, kva, size); } paddr_t bus_dmamem_mmap(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs, off_t off, int prot, int flags) { bus_dma_tag_t it; if ((t->bdt_exists & BUS_DMAMEM_OVERRIDE_MMAP) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMAMEM_OVERRIDE_MMAP) == 0) continue; return (*it->bdt_ov->ov_dmamem_mmap)(it->bdt_ctx, t, segs, nsegs, off, prot, flags); } return _bus_dmamem_mmap(t, segs, nsegs, off, prot, flags); } int bus_dmatag_subregion(bus_dma_tag_t t, bus_addr_t min_addr, bus_addr_t max_addr, bus_dma_tag_t *newtag, int flags) { bus_dma_tag_t it; 
if ((t->bdt_exists & BUS_DMATAG_OVERRIDE_SUBREGION) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMATAG_OVERRIDE_SUBREGION) == 0) continue; return (*it->bdt_ov->ov_dmatag_subregion)(it->bdt_ctx, t, min_addr, max_addr, newtag, flags); } return _bus_dmatag_subregion(t, min_addr, max_addr, newtag, flags); } void bus_dmatag_destroy(bus_dma_tag_t t) { bus_dma_tag_t it; if ((t->bdt_exists & BUS_DMATAG_OVERRIDE_DESTROY) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bdt_super) { if ((it->bdt_present & BUS_DMATAG_OVERRIDE_DESTROY) == 0) continue; (*it->bdt_ov->ov_dmatag_destroy)(it->bdt_ctx, t); return; } _bus_dmatag_destroy(t); } static const void * bit_to_function_pointer(const struct bus_dma_overrides *ov, uint64_t bit) { switch (bit) { case BUS_DMAMAP_OVERRIDE_CREATE: return ov->ov_dmamap_create; case BUS_DMAMAP_OVERRIDE_DESTROY: return ov->ov_dmamap_destroy; case BUS_DMAMAP_OVERRIDE_LOAD: return ov->ov_dmamap_load; case BUS_DMAMAP_OVERRIDE_LOAD_MBUF: return ov->ov_dmamap_load_mbuf; case BUS_DMAMAP_OVERRIDE_LOAD_UIO: return ov->ov_dmamap_load_uio; case BUS_DMAMAP_OVERRIDE_LOAD_RAW: return ov->ov_dmamap_load_raw; case BUS_DMAMAP_OVERRIDE_UNLOAD: return ov->ov_dmamap_unload; case BUS_DMAMAP_OVERRIDE_SYNC: return ov->ov_dmamap_sync; case BUS_DMAMEM_OVERRIDE_ALLOC: return ov->ov_dmamem_alloc; case BUS_DMAMEM_OVERRIDE_FREE: return ov->ov_dmamem_free; case BUS_DMAMEM_OVERRIDE_MAP: return ov->ov_dmamem_map; case BUS_DMAMEM_OVERRIDE_UNMAP: return ov->ov_dmamem_unmap; case BUS_DMAMEM_OVERRIDE_MMAP: return ov->ov_dmamem_mmap; case BUS_DMATAG_OVERRIDE_SUBREGION: return ov->ov_dmatag_subregion; case BUS_DMATAG_OVERRIDE_DESTROY: return ov->ov_dmatag_destroy; default: return NULL; } } void bus_dma_tag_destroy(bus_dma_tag_t bdt) { if (bdt->bdt_super != NULL) bus_dmatag_destroy(bdt->bdt_super); kmem_free(bdt, sizeof(struct x86_bus_dma_tag)); } int bus_dma_tag_create(bus_dma_tag_t obdt, const uint64_t present, const struct bus_dma_overrides *ov, void *ctx, bus_dma_tag_t *bdtp) { uint64_t bit, bits, nbits; bus_dma_tag_t bdt; const void *fp; if (ov == NULL || present == 0) return EINVAL; bdt = kmem_alloc(sizeof(struct x86_bus_dma_tag), KM_SLEEP); *bdt = *obdt; /* don't let bus_dmatag_destroy free these */ bdt->_tag_needs_free = 0; bdt->bdt_super = obdt; for (bits = present; bits != 0; bits = nbits) { nbits = bits & (bits - 1); bit = nbits ^ bits; if ((fp = bit_to_function_pointer(ov, bit)) == NULL) { #ifdef DEBUG printf("%s: missing bit %" PRIx64 "\n", __func__, bit); #endif goto einval; } } bdt->bdt_ov = ov; bdt->bdt_exists = obdt->bdt_exists | present; bdt->bdt_present = present; bdt->bdt_ctx = ctx; *bdtp = bdt; if (obdt->_tag_needs_free) obdt->_tag_needs_free++; return 0; einval: kmem_free(bdt, sizeof(struct x86_bus_dma_tag)); return EINVAL; }
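/*
 * Illustrative sketch (not part of this file): one way a hypothetical
 * platform layer could use bus_dma_tag_create() above to interpose on
 * map synchronization.  The names my_softc, my_dmamap_sync and
 * my_create_tag are invented for the example; only the override
 * plumbing shown here follows the interfaces defined above.
 */
#if 0	/* example only */
static void
my_dmamap_sync(void *ctx, bus_dma_tag_t t, bus_dmamap_t map,
    bus_addr_t offset, bus_size_t len, int ops)
{
	struct my_softc *sc = ctx;

	/* Platform-specific synchronization for this map would go here. */
	(void)sc;
}

static const struct bus_dma_overrides my_overrides = {
	.ov_dmamap_sync = my_dmamap_sync,
};

static int
my_create_tag(bus_dma_tag_t parent, struct my_softc *sc, bus_dma_tag_t *tagp)
{
	/*
	 * Claim only the dmamap_sync override; all other operations
	 * fall through to the parent tag or the built-in implementations.
	 */
	return bus_dma_tag_create(parent, BUS_DMAMAP_OVERRIDE_SYNC,
	    &my_overrides, sc, tagp);
}
#endif	/* example only */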
/* $NetBSD: subr_pool.c,v 1.290 2023/04/09 12:21:59 riastradh Exp $ */ /* * Copyright (c) 1997, 1999, 2000, 2002, 2007, 2008, 2010, 2014, 2015, 2018, * 2020, 2021 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace * Simulation Facility, NASA Ames Research Center; by Andrew Doran, and by * Maxime Villard. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.290 2023/04/09 12:21:59 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_ddb.h" #include "opt_lockdebug.h" #include "opt_pool.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/bitops.h> #include <sys/proc.h> #include <sys/errno.h> #include <sys/kernel.h> #include <sys/vmem.h> #include <sys/pool.h> #include <sys/syslog.h> #include <sys/debug.h> #include <sys/lock.h> #include <sys/lockdebug.h> #include <sys/xcall.h> #include <sys/cpu.h> #include <sys/atomic.h> #include <sys/asan.h> #include <sys/msan.h> #include <sys/fault.h> #include <uvm/uvm_extern.h> /* * Pool resource management utility. * * Memory is allocated in pages which are split into pieces according to * the pool item size. Each page is kept on one of three lists in the * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages', * for empty, full and partially-full pages respectively. The individual * pool items are on a linked list headed by `ph_itemlist' in each page * header. The memory for building the page list is either taken from * the allocated pages themselves (for small pool items) or taken from * an internal pool of page headers (`phpool'). */ /* List of all pools. Non static as needed by 'vmstat -m' */ TAILQ_HEAD(, pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head); /* Private pool for page header structures */ #define PHPOOL_MAX 8 static struct pool phpool[PHPOOL_MAX]; #define PHPOOL_FREELIST_NELEM(idx) \ (((idx) == 0) ? 
BITMAP_MIN_SIZE : BITMAP_SIZE * (1 << (idx))) #if !defined(KMSAN) && (defined(DIAGNOSTIC) || defined(KASAN)) #define POOL_REDZONE #endif #if defined(POOL_QUARANTINE) #define POOL_NOCACHE #endif #ifdef POOL_REDZONE # ifdef KASAN # define POOL_REDZONE_SIZE 8 # else # define POOL_REDZONE_SIZE 2 # endif static void pool_redzone_init(struct pool *, size_t); static void pool_redzone_fill(struct pool *, void *); static void pool_redzone_check(struct pool *, void *); static void pool_cache_redzone_check(pool_cache_t, void *); #else # define pool_redzone_init(pp, sz) __nothing # define pool_redzone_fill(pp, ptr) __nothing # define pool_redzone_check(pp, ptr) __nothing # define pool_cache_redzone_check(pc, ptr) __nothing #endif #ifdef KMSAN static inline void pool_get_kmsan(struct pool *, void *); static inline void pool_put_kmsan(struct pool *, void *); static inline void pool_cache_get_kmsan(pool_cache_t, void *); static inline void pool_cache_put_kmsan(pool_cache_t, void *); #else #define pool_get_kmsan(pp, ptr) __nothing #define pool_put_kmsan(pp, ptr) __nothing #define pool_cache_get_kmsan(pc, ptr) __nothing #define pool_cache_put_kmsan(pc, ptr) __nothing #endif #ifdef POOL_QUARANTINE static void pool_quarantine_init(struct pool *); static void pool_quarantine_flush(struct pool *); static bool pool_put_quarantine(struct pool *, void *, struct pool_pagelist *); #else #define pool_quarantine_init(a) __nothing #define pool_quarantine_flush(a) __nothing #define pool_put_quarantine(a, b, c) false #endif #ifdef POOL_NOCACHE static bool pool_cache_put_nocache(pool_cache_t, void *); #else #define pool_cache_put_nocache(a, b) false #endif #define NO_CTOR __FPTRCAST(int (*)(void *, void *, int), nullop) #define NO_DTOR __FPTRCAST(void (*)(void *, void *), nullop) #define pc_has_pser(pc) (((pc)->pc_roflags & PR_PSERIALIZE) != 0) #define pc_has_ctor(pc) ((pc)->pc_ctor != NO_CTOR) #define pc_has_dtor(pc) ((pc)->pc_dtor != NO_DTOR) #define pp_has_pser(pp) (((pp)->pr_roflags & PR_PSERIALIZE) != 0) #define pool_barrier() xc_barrier(0) /* * Pool backend allocators. * * Each pool has a backend allocator that handles allocation, deallocation, * and any additional draining that might be needed. * * We provide two standard allocators: * * pool_allocator_kmem - the default when no allocator is specified * * pool_allocator_nointr - used for pools that will not be accessed * in interrupt context. 
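 *
 * For instance, a pool that is only ever used from thread context
 * might be set up as (names invented for illustration):
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl",
 *	    &pool_allocator_nointr, IPL_NONE);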
*/ void *pool_page_alloc(struct pool *, int); void pool_page_free(struct pool *, void *); static void *pool_page_alloc_meta(struct pool *, int); static void pool_page_free_meta(struct pool *, void *); struct pool_allocator pool_allocator_kmem = { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 0 }; struct pool_allocator pool_allocator_nointr = { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 0 }; struct pool_allocator pool_allocator_meta = { .pa_alloc = pool_page_alloc_meta, .pa_free = pool_page_free_meta, .pa_pagesz = 0 }; #define POOL_ALLOCATOR_BIG_BASE 13 static struct pool_allocator pool_allocator_big[] = { { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 0), }, { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 1), }, { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 2), }, { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 3), }, { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 4), }, { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 5), }, { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 6), }, { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 7), }, { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 8), }, { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 9), }, { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 10), }, { .pa_alloc = pool_page_alloc, .pa_free = pool_page_free, .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 11), } }; static int pool_bigidx(size_t); /* # of seconds to retain page after last use */ int pool_inactive_time = 10; /* Next candidate for drainage (see pool_drain()) */ static struct pool *drainpp; /* This lock protects both pool_head and drainpp. 
*/ static kmutex_t pool_head_lock; static kcondvar_t pool_busy; /* This lock protects initialization of a potentially shared pool allocator */ static kmutex_t pool_allocator_lock; static unsigned int poolid_counter = 0; typedef uint32_t pool_item_bitmap_t; #define BITMAP_SIZE (CHAR_BIT * sizeof(pool_item_bitmap_t)) #define BITMAP_MASK (BITMAP_SIZE - 1) #define BITMAP_MIN_SIZE (CHAR_BIT * sizeof(((struct pool_item_header *)NULL)->ph_u2)) struct pool_item_header { /* Page headers */ LIST_ENTRY(pool_item_header) ph_pagelist; /* pool page list */ union { /* !PR_PHINPAGE */ struct { SPLAY_ENTRY(pool_item_header) phu_node; /* off-page page headers */ } phu_offpage; /* PR_PHINPAGE */ struct { unsigned int phu_poolid; } phu_onpage; } ph_u1; void * ph_page; /* this page's address */ uint32_t ph_time; /* last referenced */ uint16_t ph_nmissing; /* # of chunks in use */ uint16_t ph_off; /* start offset in page */ union { /* !PR_USEBMAP */ struct { LIST_HEAD(, pool_item) phu_itemlist; /* chunk list for this page */ } phu_normal; /* PR_USEBMAP */ struct { pool_item_bitmap_t phu_bitmap[1]; } phu_notouch; } ph_u2; }; #define ph_node ph_u1.phu_offpage.phu_node #define ph_poolid ph_u1.phu_onpage.phu_poolid #define ph_itemlist ph_u2.phu_normal.phu_itemlist #define ph_bitmap ph_u2.phu_notouch.phu_bitmap #define PHSIZE ALIGN(sizeof(struct pool_item_header)) CTASSERT(offsetof(struct pool_item_header, ph_u2) + BITMAP_MIN_SIZE / CHAR_BIT == sizeof(struct pool_item_header)); #if defined(DIAGNOSTIC) && !defined(KASAN) #define POOL_CHECK_MAGIC #endif struct pool_item { #ifdef POOL_CHECK_MAGIC u_int pi_magic; #endif #define PI_MAGIC 0xdeaddeadU /* Other entries use only this list entry */ LIST_ENTRY(pool_item) pi_list; }; #define POOL_NEEDS_CATCHUP(pp) \ ((pp)->pr_nitems < (pp)->pr_minitems || \ (pp)->pr_npages < (pp)->pr_minpages) #define POOL_OBJ_TO_PAGE(pp, v) \ (void *)((uintptr_t)v & pp->pr_alloc->pa_pagemask) /* * Pool cache management. * * Pool caches provide a way for constructed objects to be cached by the * pool subsystem. This can lead to performance improvements by avoiding * needless object construction/destruction; it is deferred until absolutely * necessary. * * Caches are grouped into cache groups. Each cache group references up * to PCG_NUMOBJECTS constructed objects. When a cache allocates an * object from the pool, it calls the object's constructor and places it * into a cache group. When a cache group frees an object back to the * pool, it first calls the object's destructor. This allows the object * to persist in constructed form while freed to the cache. * * The pool references each cache, so that when a pool is drained by the * pagedaemon, it can drain each individual cache as well. Each time a * cache is drained, the most idle cache group is freed to the pool in * its entirety. * * Pool caches are laid on top of pools. By layering them, we can avoid * the complexity of cache management for pools which would not benefit * from it. */ static struct pool pcg_normal_pool; static struct pool pcg_large_pool; static struct pool cache_pool; static struct pool cache_cpu_pool; static pcg_t *volatile pcg_large_cache __cacheline_aligned; static pcg_t *volatile pcg_normal_cache __cacheline_aligned; /* List of all caches. 
*/ TAILQ_HEAD(,pool_cache) pool_cache_head = TAILQ_HEAD_INITIALIZER(pool_cache_head); int pool_cache_disable; /* global disable for caching */ static const pcg_t pcg_dummy; /* zero sized: always empty, yet always full */ static bool pool_cache_put_slow(pool_cache_t, pool_cache_cpu_t *, int, void *); static bool pool_cache_get_slow(pool_cache_t, pool_cache_cpu_t *, int, void **, paddr_t *, int); static void pool_cache_cpu_init1(struct cpu_info *, pool_cache_t); static int pool_cache_invalidate_groups(pool_cache_t, pcg_t *); static void pool_cache_invalidate_cpu(pool_cache_t, u_int); static void pool_cache_transfer(pool_cache_t); static int pool_pcg_get(pcg_t *volatile *, pcg_t **); static int pool_pcg_put(pcg_t *volatile *, pcg_t *); static pcg_t * pool_pcg_trunc(pcg_t *volatile *); static int pool_catchup(struct pool *); static void pool_prime_page(struct pool *, void *, struct pool_item_header *); static void pool_update_curpage(struct pool *); static int pool_grow(struct pool *, int); static void *pool_allocator_alloc(struct pool *, int); static void pool_allocator_free(struct pool *, void *); static void pool_print_pagelist(struct pool *, struct pool_pagelist *, void (*)(const char *, ...) __printflike(1, 2)); static void pool_print1(struct pool *, const char *, void (*)(const char *, ...) __printflike(1, 2)); static int pool_chk_page(struct pool *, const char *, struct pool_item_header *); /* -------------------------------------------------------------------------- */ static inline unsigned int pr_item_bitmap_index(const struct pool *pp, const struct pool_item_header *ph, const void *v) { const char *cp = v; unsigned int idx; KASSERT(pp->pr_roflags & PR_USEBMAP); idx = (cp - (char *)ph->ph_page - ph->ph_off) / pp->pr_size; if (__predict_false(idx >= pp->pr_itemsperpage)) { panic("%s: [%s] %u >= %u", __func__, pp->pr_wchan, idx, pp->pr_itemsperpage); } return idx; } static inline void pr_item_bitmap_put(const struct pool *pp, struct pool_item_header *ph, void *obj) { unsigned int idx = pr_item_bitmap_index(pp, ph, obj); pool_item_bitmap_t *bitmap = ph->ph_bitmap + (idx / BITMAP_SIZE); pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK); if (__predict_false((*bitmap & mask) != 0)) { panic("%s: [%s] %p already freed", __func__, pp->pr_wchan, obj); } *bitmap |= mask; } static inline void * pr_item_bitmap_get(const struct pool *pp, struct pool_item_header *ph) { pool_item_bitmap_t *bitmap = ph->ph_bitmap; unsigned int idx; int i; for (i = 0; ; i++) { int bit; KASSERT((i * BITMAP_SIZE) < pp->pr_itemsperpage); bit = ffs32(bitmap[i]); if (bit) { pool_item_bitmap_t mask; bit--; idx = (i * BITMAP_SIZE) + bit; mask = 1U << bit; KASSERT((bitmap[i] & mask) != 0); bitmap[i] &= ~mask; break; } } KASSERT(idx < pp->pr_itemsperpage); return (char *)ph->ph_page + ph->ph_off + idx * pp->pr_size; } static inline void pr_item_bitmap_init(const struct pool *pp, struct pool_item_header *ph) { pool_item_bitmap_t *bitmap = ph->ph_bitmap; const int n = howmany(pp->pr_itemsperpage, BITMAP_SIZE); int i; for (i = 0; i < n; i++) { bitmap[i] = (pool_item_bitmap_t)-1; } } /* -------------------------------------------------------------------------- */ static inline void pr_item_linkedlist_put(const struct pool *pp, struct pool_item_header *ph, void *obj) { struct pool_item *pi = obj; KASSERT(!pp_has_pser(pp)); #ifdef POOL_CHECK_MAGIC pi->pi_magic = PI_MAGIC; #endif if (pp->pr_redzone) { /* * Mark the pool_item as valid. The rest is already * invalid. 
*/ kasan_mark(pi, sizeof(*pi), sizeof(*pi), 0); } LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list); } static inline void * pr_item_linkedlist_get(struct pool *pp, struct pool_item_header *ph) { struct pool_item *pi; void *v; v = pi = LIST_FIRST(&ph->ph_itemlist); if (__predict_false(v == NULL)) { mutex_exit(&pp->pr_lock); panic("%s: [%s] page empty", __func__, pp->pr_wchan); } KASSERTMSG((pp->pr_nitems > 0), "%s: [%s] nitems %u inconsistent on itemlist", __func__, pp->pr_wchan, pp->pr_nitems); #ifdef POOL_CHECK_MAGIC KASSERTMSG((pi->pi_magic == PI_MAGIC), "%s: [%s] free list modified: " "magic=%x; page %p; item addr %p", __func__, pp->pr_wchan, pi->pi_magic, ph->ph_page, pi); #endif /* * Remove from item list. */ LIST_REMOVE(pi, pi_list); return v; } /* -------------------------------------------------------------------------- */ static inline void pr_phinpage_check(struct pool *pp, struct pool_item_header *ph, void *page, void *object) { if (__predict_false((void *)ph->ph_page != page)) { panic("%s: [%s] item %p not part of pool", __func__, pp->pr_wchan, object); } if (__predict_false((char *)object < (char *)page + ph->ph_off)) { panic("%s: [%s] item %p below item space", __func__, pp->pr_wchan, object); } if (__predict_false(ph->ph_poolid != pp->pr_poolid)) { panic("%s: [%s] item %p poolid %u != %u", __func__, pp->pr_wchan, object, ph->ph_poolid, pp->pr_poolid); } } static inline void pc_phinpage_check(pool_cache_t pc, void *object) { struct pool_item_header *ph; struct pool *pp; void *page; pp = &pc->pc_pool; page = POOL_OBJ_TO_PAGE(pp, object); ph = (struct pool_item_header *)page; pr_phinpage_check(pp, ph, page, object); } /* -------------------------------------------------------------------------- */ static inline int phtree_compare(struct pool_item_header *a, struct pool_item_header *b) { /* * We consider pool_item_header with smaller ph_page bigger. This * unnatural ordering is for the benefit of pr_find_pagehead. */ if (a->ph_page < b->ph_page) return 1; else if (a->ph_page > b->ph_page) return -1; else return 0; } SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare); SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare); static inline struct pool_item_header * pr_find_pagehead_noalign(struct pool *pp, void *v) { struct pool_item_header *ph, tmp; tmp.ph_page = (void *)(uintptr_t)v; ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp); if (ph == NULL) { ph = SPLAY_ROOT(&pp->pr_phtree); if (ph != NULL && phtree_compare(&tmp, ph) >= 0) { ph = SPLAY_NEXT(phtree, &pp->pr_phtree, ph); } KASSERT(ph == NULL || phtree_compare(&tmp, ph) < 0); } return ph; } /* * Return the pool page header based on item address. 
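 *
 * For PR_PHINPAGE pools the header lives at the start of the item's
 * page, so it can be computed by masking the item address; otherwise
 * it is looked up in the pool's splay tree of off-page headers.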
*/ static inline struct pool_item_header * pr_find_pagehead(struct pool *pp, void *v) { struct pool_item_header *ph, tmp; if ((pp->pr_roflags & PR_NOALIGN) != 0) { ph = pr_find_pagehead_noalign(pp, v); } else { void *page = POOL_OBJ_TO_PAGE(pp, v); if ((pp->pr_roflags & PR_PHINPAGE) != 0) { ph = (struct pool_item_header *)page; pr_phinpage_check(pp, ph, page, v); } else { tmp.ph_page = page; ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp); } } KASSERT(ph == NULL || ((pp->pr_roflags & PR_PHINPAGE) != 0) || ((char *)ph->ph_page <= (char *)v && (char *)v < (char *)ph->ph_page + pp->pr_alloc->pa_pagesz)); return ph; } static void pr_pagelist_free(struct pool *pp, struct pool_pagelist *pq) { struct pool_item_header *ph; while ((ph = LIST_FIRST(pq)) != NULL) { LIST_REMOVE(ph, ph_pagelist); pool_allocator_free(pp, ph->ph_page); if ((pp->pr_roflags & PR_PHINPAGE) == 0) pool_put(pp->pr_phpool, ph); } } /* * Remove a page from the pool. */ static inline void pr_rmpage(struct pool *pp, struct pool_item_header *ph, struct pool_pagelist *pq) { KASSERT(mutex_owned(&pp->pr_lock)); /* * If the page was idle, decrement the idle page count. */ if (ph->ph_nmissing == 0) { KASSERT(pp->pr_nidle != 0); KASSERTMSG((pp->pr_nitems >= pp->pr_itemsperpage), "%s: [%s] nitems=%u < itemsperpage=%u", __func__, pp->pr_wchan, pp->pr_nitems, pp->pr_itemsperpage); pp->pr_nidle--; } pp->pr_nitems -= pp->pr_itemsperpage; /* * Unlink the page from the pool and queue it for release. */ LIST_REMOVE(ph, ph_pagelist); if (pp->pr_roflags & PR_PHINPAGE) { if (__predict_false(ph->ph_poolid != pp->pr_poolid)) { panic("%s: [%s] ph %p poolid %u != %u", __func__, pp->pr_wchan, ph, ph->ph_poolid, pp->pr_poolid); } } else { SPLAY_REMOVE(phtree, &pp->pr_phtree, ph); } LIST_INSERT_HEAD(pq, ph, ph_pagelist); pp->pr_npages--; pp->pr_npagefree++; pool_update_curpage(pp); } /* * Initialize all the pools listed in the "pools" link set. */ void pool_subsystem_init(void) { size_t size; int idx; mutex_init(&pool_head_lock, MUTEX_DEFAULT, IPL_NONE); mutex_init(&pool_allocator_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&pool_busy, "poolbusy"); /* * Initialize private page header pool and cache magazine pool if we * haven't done so yet. */ for (idx = 0; idx < PHPOOL_MAX; idx++) { static char phpool_names[PHPOOL_MAX][6+1+6+1]; int nelem; size_t sz; nelem = PHPOOL_FREELIST_NELEM(idx); KASSERT(nelem != 0); snprintf(phpool_names[idx], sizeof(phpool_names[idx]), "phpool-%d", nelem); sz = offsetof(struct pool_item_header, ph_bitmap[howmany(nelem, BITMAP_SIZE)]); pool_init(&phpool[idx], sz, 0, 0, 0, phpool_names[idx], &pool_allocator_meta, IPL_VM); } size = sizeof(pcg_t) + (PCG_NOBJECTS_NORMAL - 1) * sizeof(pcgpair_t); pool_init(&pcg_normal_pool, size, coherency_unit, 0, 0, "pcgnormal", &pool_allocator_meta, IPL_VM); size = sizeof(pcg_t) + (PCG_NOBJECTS_LARGE - 1) * sizeof(pcgpair_t); pool_init(&pcg_large_pool, size, coherency_unit, 0, 0, "pcglarge", &pool_allocator_meta, IPL_VM); pool_init(&cache_pool, sizeof(struct pool_cache), coherency_unit, 0, 0, "pcache", &pool_allocator_meta, IPL_NONE); pool_init(&cache_cpu_pool, sizeof(pool_cache_cpu_t), coherency_unit, 0, 0, "pcachecpu", &pool_allocator_meta, IPL_NONE); } static inline bool pool_init_is_phinpage(const struct pool *pp) { size_t pagesize; if (pp->pr_roflags & PR_PHINPAGE) { return true; } if (pp->pr_roflags & (PR_NOTOUCH | PR_NOALIGN)) { return false; } pagesize = pp->pr_alloc->pa_pagesz; /* * Threshold: the item size is below 1/16 of a page size, and below * 8 times the page header size. 
The latter ensures we go off-page * if the page header would make us waste a rather big item. */ if (pp->pr_size < MIN(pagesize / 16, PHSIZE * 8)) { return true; } /* Put the header into the page if it doesn't waste any items. */ if (pagesize / pp->pr_size == (pagesize - PHSIZE) / pp->pr_size) { return true; } return false; } static inline bool pool_init_is_usebmap(const struct pool *pp) { size_t bmapsize; if (pp->pr_roflags & PR_NOTOUCH) { return true; } /* * If we're off-page, go with a bitmap. */ if (!(pp->pr_roflags & PR_PHINPAGE)) { return true; } /* * If we're on-page, and the page header can already contain a bitmap * big enough to cover all the items of the page, go with a bitmap. */ bmapsize = roundup(PHSIZE, pp->pr_align) - offsetof(struct pool_item_header, ph_bitmap[0]); KASSERT(bmapsize % sizeof(pool_item_bitmap_t) == 0); if (pp->pr_itemsperpage <= bmapsize * CHAR_BIT) { return true; } return false; } /* * Initialize the given pool resource structure. * * We export this routine to allow other kernel parts to declare * static pools that must be initialized before kmem(9) is available. */ void pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags, const char *wchan, struct pool_allocator *palloc, int ipl) { struct pool *pp1; size_t prsize; int itemspace, slack; /* XXX ioff will be removed. */ KASSERT(ioff == 0); #ifdef DEBUG if (__predict_true(!cold)) mutex_enter(&pool_head_lock); /* * Check that the pool hasn't already been initialised and * added to the list of all pools. */ TAILQ_FOREACH(pp1, &pool_head, pr_poollist) { if (pp == pp1) panic("%s: [%s] already initialised", __func__, wchan); } if (__predict_true(!cold)) mutex_exit(&pool_head_lock); #endif if (palloc == NULL) palloc = &pool_allocator_kmem; if (!cold) mutex_enter(&pool_allocator_lock); if (palloc->pa_refcnt++ == 0) { if (palloc->pa_pagesz == 0) palloc->pa_pagesz = PAGE_SIZE; TAILQ_INIT(&palloc->pa_list); mutex_init(&palloc->pa_lock, MUTEX_DEFAULT, IPL_VM); palloc->pa_pagemask = ~(palloc->pa_pagesz - 1); palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1; } if (!cold) mutex_exit(&pool_allocator_lock); /* * PR_PSERIALIZE implies PR_NOTOUCH; freed objects must remain * valid until the the backing page is returned to the system. */ if (flags & PR_PSERIALIZE) { flags |= PR_NOTOUCH; } if (align == 0) align = ALIGN(1); prsize = size; if ((flags & PR_NOTOUCH) == 0 && prsize < sizeof(struct pool_item)) prsize = sizeof(struct pool_item); prsize = roundup(prsize, align); KASSERTMSG((prsize <= palloc->pa_pagesz), "%s: [%s] pool item size (%zu) larger than page size (%u)", __func__, wchan, prsize, palloc->pa_pagesz); /* * Initialize the pool structure. 
*/ LIST_INIT(&pp->pr_emptypages); LIST_INIT(&pp->pr_fullpages); LIST_INIT(&pp->pr_partpages); pp->pr_cache = NULL; pp->pr_curpage = NULL; pp->pr_npages = 0; pp->pr_minitems = 0; pp->pr_minpages = 0; pp->pr_maxpages = UINT_MAX; pp->pr_roflags = flags; pp->pr_flags = 0; pp->pr_size = prsize; pp->pr_reqsize = size; pp->pr_align = align; pp->pr_wchan = wchan; pp->pr_alloc = palloc; pp->pr_poolid = atomic_inc_uint_nv(&poolid_counter); pp->pr_nitems = 0; pp->pr_nout = 0; pp->pr_hardlimit = UINT_MAX; pp->pr_hardlimit_warning = NULL; pp->pr_hardlimit_ratecap.tv_sec = 0; pp->pr_hardlimit_ratecap.tv_usec = 0; pp->pr_hardlimit_warning_last.tv_sec = 0; pp->pr_hardlimit_warning_last.tv_usec = 0; pp->pr_drain_hook = NULL; pp->pr_drain_hook_arg = NULL; pp->pr_freecheck = NULL; pp->pr_redzone = false; pool_redzone_init(pp, size); pool_quarantine_init(pp); /* * Decide whether to put the page header off-page to avoid wasting too * large a part of the page or too big an item. Off-page page headers * go on a hash table, so we can match a returned item with its header * based on the page address. */ if (pool_init_is_phinpage(pp)) { /* Use the beginning of the page for the page header */ itemspace = palloc->pa_pagesz - roundup(PHSIZE, align); pp->pr_itemoffset = roundup(PHSIZE, align); pp->pr_roflags |= PR_PHINPAGE; } else { /* The page header will be taken from our page header pool */ itemspace = palloc->pa_pagesz; pp->pr_itemoffset = 0; SPLAY_INIT(&pp->pr_phtree); } pp->pr_itemsperpage = itemspace / pp->pr_size; KASSERT(pp->pr_itemsperpage != 0); /* * Decide whether to use a bitmap or a linked list to manage freed * items. */ if (pool_init_is_usebmap(pp)) { pp->pr_roflags |= PR_USEBMAP; } /* * If we're off-page, then we're using a bitmap; choose the appropriate * pool to allocate page headers, whose size varies depending on the * bitmap. If we're on-page, nothing to do. */ if (!(pp->pr_roflags & PR_PHINPAGE)) { int idx; KASSERT(pp->pr_roflags & PR_USEBMAP); for (idx = 0; pp->pr_itemsperpage > PHPOOL_FREELIST_NELEM(idx); idx++) { /* nothing */ } if (idx >= PHPOOL_MAX) { /* * if you see this panic, consider to tweak * PHPOOL_MAX and PHPOOL_FREELIST_NELEM. */ panic("%s: [%s] too large itemsperpage(%d) for " "PR_USEBMAP", __func__, pp->pr_wchan, pp->pr_itemsperpage); } pp->pr_phpool = &phpool[idx]; } else { pp->pr_phpool = NULL; } /* * Use the slack between the chunks and the page header * for "cache coloring". */ slack = itemspace - pp->pr_itemsperpage * pp->pr_size; pp->pr_maxcolor = rounddown(slack, align); pp->pr_curcolor = 0; pp->pr_nget = 0; pp->pr_nfail = 0; pp->pr_nput = 0; pp->pr_npagealloc = 0; pp->pr_npagefree = 0; pp->pr_hiwat = 0; pp->pr_nidle = 0; pp->pr_refcnt = 0; mutex_init(&pp->pr_lock, MUTEX_DEFAULT, ipl); cv_init(&pp->pr_cv, wchan); pp->pr_ipl = ipl; /* Insert into the list of all pools. */ if (!cold) mutex_enter(&pool_head_lock); TAILQ_FOREACH(pp1, &pool_head, pr_poollist) { if (strcmp(pp1->pr_wchan, pp->pr_wchan) > 0) break; } if (pp1 == NULL) TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist); else TAILQ_INSERT_BEFORE(pp1, pp, pr_poollist); if (!cold) mutex_exit(&pool_head_lock); /* Insert this into the list of pools using this allocator. */ if (!cold) mutex_enter(&palloc->pa_lock); TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list); if (!cold) mutex_exit(&palloc->pa_lock); } /* * De-commission a pool resource. 
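 *
 * The pool must be idle: every object returned (pr_nout == 0), no cache
 * attached, and only empty pages left.  The routine also waits for any
 * transient references (pr_refcnt) taken by pool_drain() or the sysctl
 * handler to drain before tearing the pool down.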
*/ void pool_destroy(struct pool *pp) { struct pool_pagelist pq; struct pool_item_header *ph; pool_quarantine_flush(pp); /* Remove from global pool list */ mutex_enter(&pool_head_lock); while (pp->pr_refcnt != 0) cv_wait(&pool_busy, &pool_head_lock); TAILQ_REMOVE(&pool_head, pp, pr_poollist); if (drainpp == pp) drainpp = NULL; mutex_exit(&pool_head_lock); /* Remove this pool from its allocator's list of pools. */ mutex_enter(&pp->pr_alloc->pa_lock); TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list); mutex_exit(&pp->pr_alloc->pa_lock); mutex_enter(&pool_allocator_lock); if (--pp->pr_alloc->pa_refcnt == 0) mutex_destroy(&pp->pr_alloc->pa_lock); mutex_exit(&pool_allocator_lock); mutex_enter(&pp->pr_lock); KASSERT(pp->pr_cache == NULL); KASSERTMSG((pp->pr_nout == 0), "%s: [%s] pool busy: still out: %u", __func__, pp->pr_wchan, pp->pr_nout); KASSERT(LIST_EMPTY(&pp->pr_fullpages)); KASSERT(LIST_EMPTY(&pp->pr_partpages)); /* Remove all pages */ LIST_INIT(&pq); while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL) pr_rmpage(pp, ph, &pq); mutex_exit(&pp->pr_lock); pr_pagelist_free(pp, &pq); cv_destroy(&pp->pr_cv); mutex_destroy(&pp->pr_lock); } void pool_set_drain_hook(struct pool *pp, void (*fn)(void *, int), void *arg) { /* XXX no locking -- must be used just after pool_init() */ KASSERTMSG((pp->pr_drain_hook == NULL), "%s: [%s] already set", __func__, pp->pr_wchan); pp->pr_drain_hook = fn; pp->pr_drain_hook_arg = arg; } static struct pool_item_header * pool_alloc_item_header(struct pool *pp, void *storage, int flags) { struct pool_item_header *ph; if ((pp->pr_roflags & PR_PHINPAGE) != 0) ph = storage; else ph = pool_get(pp->pr_phpool, flags); return ph; } /* * Grab an item from the pool. */ void * pool_get(struct pool *pp, int flags) { struct pool_item_header *ph; void *v; KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK)); KASSERTMSG((pp->pr_itemsperpage != 0), "%s: [%s] pr_itemsperpage is zero, " "pool not initialized?", __func__, pp->pr_wchan); KASSERTMSG((!(cpu_intr_p() || cpu_softintr_p()) || pp->pr_ipl != IPL_NONE || cold || panicstr != NULL), "%s: [%s] is IPL_NONE, but called from interrupt context", __func__, pp->pr_wchan); if (flags & PR_WAITOK) { ASSERT_SLEEPABLE(); } if (flags & PR_NOWAIT) { if (fault_inject()) return NULL; } mutex_enter(&pp->pr_lock); startover: /* * Check to see if we've reached the hard limit. If we have, * and we can wait, then wait until an item has been returned to * the pool. */ KASSERTMSG((pp->pr_nout <= pp->pr_hardlimit), "%s: %s: crossed hard limit", __func__, pp->pr_wchan); if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) { if (pp->pr_drain_hook != NULL) { /* * Since the drain hook is going to free things * back to the pool, unlock, call the hook, re-lock, * and check the hardlimit condition again. */ mutex_exit(&pp->pr_lock); (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags); mutex_enter(&pp->pr_lock); if (pp->pr_nout < pp->pr_hardlimit) goto startover; } if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) { /* * XXX: A warning isn't logged in this case. Should * it be? */ pp->pr_flags |= PR_WANTED; do { cv_wait(&pp->pr_cv, &pp->pr_lock); } while (pp->pr_flags & PR_WANTED); goto startover; } /* * Log a message that the hard limit has been hit. 
*/ if (pp->pr_hardlimit_warning != NULL && ratecheck(&pp->pr_hardlimit_warning_last, &pp->pr_hardlimit_ratecap)) log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning); pp->pr_nfail++; mutex_exit(&pp->pr_lock); KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0); return NULL; } /* * The convention we use is that if `curpage' is not NULL, then * it points at a non-empty bucket. In particular, `curpage' * never points at a page header which has PR_PHINPAGE set and * has no items in its bucket. */ if ((ph = pp->pr_curpage) == NULL) { int error; KASSERTMSG((pp->pr_nitems == 0), "%s: [%s] curpage NULL, inconsistent nitems %u", __func__, pp->pr_wchan, pp->pr_nitems); /* * Call the back-end page allocator for more memory. * Release the pool lock, as the back-end page allocator * may block. */ error = pool_grow(pp, flags); if (error != 0) { /* * pool_grow aborts when another thread * is allocating a new page. Retry if it * waited for it. */ if (error == ERESTART) goto startover; /* * We were unable to allocate a page or item * header, but we released the lock during * allocation, so perhaps items were freed * back to the pool. Check for this case. */ if (pp->pr_curpage != NULL) goto startover; pp->pr_nfail++; mutex_exit(&pp->pr_lock); KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0); return NULL; } /* Start the allocation process over. */ goto startover; } if (pp->pr_roflags & PR_USEBMAP) { KASSERTMSG((ph->ph_nmissing < pp->pr_itemsperpage), "%s: [%s] pool page empty", __func__, pp->pr_wchan); v = pr_item_bitmap_get(pp, ph); } else { v = pr_item_linkedlist_get(pp, ph); } pp->pr_nitems--; pp->pr_nout++; if (ph->ph_nmissing == 0) { KASSERT(pp->pr_nidle > 0); pp->pr_nidle--; /* * This page was previously empty. Move it to the list of * partially-full pages. This page is already curpage. */ LIST_REMOVE(ph, ph_pagelist); LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist); } ph->ph_nmissing++; if (ph->ph_nmissing == pp->pr_itemsperpage) { KASSERTMSG(((pp->pr_roflags & PR_USEBMAP) || LIST_EMPTY(&ph->ph_itemlist)), "%s: [%s] nmissing (%u) inconsistent", __func__, pp->pr_wchan, ph->ph_nmissing); /* * This page is now full. Move it to the full list * and select a new current page. */ LIST_REMOVE(ph, ph_pagelist); LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist); pool_update_curpage(pp); } pp->pr_nget++; /* * If we have a low water mark and we are now below that low * water mark, add more items to the pool. */ if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) { /* * XXX: Should we log a warning? Should we set up a timeout * to try again in a second or so? The latter could break * a caller's assumptions about interrupt protection, etc. */ } mutex_exit(&pp->pr_lock); KASSERT((((vaddr_t)v) & (pp->pr_align - 1)) == 0); FREECHECK_OUT(&pp->pr_freecheck, v); pool_redzone_fill(pp, v); pool_get_kmsan(pp, v); if (flags & PR_ZERO) memset(v, 0, pp->pr_reqsize); return v; } /* * Internal version of pool_put(). Pool is already locked/entered. */ static void pool_do_put(struct pool *pp, void *v, struct pool_pagelist *pq) { struct pool_item_header *ph; KASSERT(mutex_owned(&pp->pr_lock)); pool_redzone_check(pp, v); pool_put_kmsan(pp, v); FREECHECK_IN(&pp->pr_freecheck, v); LOCKDEBUG_MEM_CHECK(v, pp->pr_size); KASSERTMSG((pp->pr_nout > 0), "%s: [%s] putting with none out", __func__, pp->pr_wchan); if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) { panic("%s: [%s] page header missing", __func__, pp->pr_wchan); } /* * Return to item list. 
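 *
 * Depending on PR_USEBMAP the item is recorded either by setting its bit
 * in the page header's bitmap or by pushing it back onto the LIFO
 * ph_itemlist, mirroring the allocation path in pool_get().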
*/ if (pp->pr_roflags & PR_USEBMAP) { pr_item_bitmap_put(pp, ph, v); } else { pr_item_linkedlist_put(pp, ph, v); } KDASSERT(ph->ph_nmissing != 0); ph->ph_nmissing--; pp->pr_nput++; pp->pr_nitems++; pp->pr_nout--; /* Cancel "pool empty" condition if it exists */ if (pp->pr_curpage == NULL) pp->pr_curpage = ph; if (pp->pr_flags & PR_WANTED) { pp->pr_flags &= ~PR_WANTED; cv_broadcast(&pp->pr_cv); } /* * If this page is now empty, do one of two things: * * (1) If we have more pages than the page high water mark, * free the page back to the system. ONLY CONSIDER * FREEING BACK A PAGE IF WE HAVE MORE THAN OUR MINIMUM PAGE * CLAIM. * * (2) Otherwise, move the page to the empty page list. * * Either way, select a new current page (so we use a partially-full * page if one is available). */ if (ph->ph_nmissing == 0) { pp->pr_nidle++; if (pp->pr_nitems - pp->pr_itemsperpage >= pp->pr_minitems && pp->pr_npages > pp->pr_minpages && pp->pr_npages > pp->pr_maxpages) { pr_rmpage(pp, ph, pq); } else { LIST_REMOVE(ph, ph_pagelist); LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist); /* * Update the timestamp on the page. A page must * be idle for some period of time before it can * be reclaimed by the pagedaemon. This minimizes * ping-pong'ing for memory. * * note for 64-bit time_t: truncating to 32-bit is not * a problem for our usage. */ ph->ph_time = time_uptime; } pool_update_curpage(pp); } /* * If the page was previously completely full, move it to the * partially-full list and make it the current page. The next * allocation will get the item from this page, instead of * further fragmenting the pool. */ else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) { LIST_REMOVE(ph, ph_pagelist); LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist); pp->pr_curpage = ph; } } void pool_put(struct pool *pp, void *v) { struct pool_pagelist pq; LIST_INIT(&pq); mutex_enter(&pp->pr_lock); if (!pool_put_quarantine(pp, v, &pq)) { pool_do_put(pp, v, &pq); } mutex_exit(&pp->pr_lock); pr_pagelist_free(pp, &pq); } /* * pool_grow: grow a pool by a page. * * => called with pool locked. * => unlock and relock the pool. * => return with pool locked. */ static int pool_grow(struct pool *pp, int flags) { struct pool_item_header *ph; char *storage; /* * If there's a pool_grow in progress, wait for it to complete * and try again from the top. */ if (pp->pr_flags & PR_GROWING) { if (flags & PR_WAITOK) { do { cv_wait(&pp->pr_cv, &pp->pr_lock); } while (pp->pr_flags & PR_GROWING); return ERESTART; } else { if (pp->pr_flags & PR_GROWINGNOWAIT) { /* * This needs an unlock/relock dance so * that the other caller has a chance to * run and actually do the thing. Note * that this is effectively a busy-wait. */ mutex_exit(&pp->pr_lock); mutex_enter(&pp->pr_lock); return ERESTART; } return EWOULDBLOCK; } } pp->pr_flags |= PR_GROWING; if (flags & PR_WAITOK) mutex_exit(&pp->pr_lock); else pp->pr_flags |= PR_GROWINGNOWAIT; storage = pool_allocator_alloc(pp, flags); if (__predict_false(storage == NULL)) goto out; ph = pool_alloc_item_header(pp, storage, flags); if (__predict_false(ph == NULL)) { pool_allocator_free(pp, storage); goto out; } if (flags & PR_WAITOK) mutex_enter(&pp->pr_lock); pool_prime_page(pp, storage, ph); pp->pr_npagealloc++; KASSERT(pp->pr_flags & PR_GROWING); pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT); /* * If anyone was waiting for pool_grow, notify them that we * may have just done it. 
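 *
 * Growing is single-threaded: PR_GROWING serializes growers, PR_WAITOK
 * callers sleep on pr_cv and retry via ERESTART, and PR_NOWAIT callers
 * either busy-retry (PR_GROWINGNOWAIT) or fail with EWOULDBLOCK.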
*/ cv_broadcast(&pp->pr_cv); return 0; out: if (flags & PR_WAITOK) mutex_enter(&pp->pr_lock); KASSERT(pp->pr_flags & PR_GROWING); pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT); return ENOMEM; } void pool_prime(struct pool *pp, int n) { mutex_enter(&pp->pr_lock); pp->pr_minpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; if (pp->pr_maxpages <= pp->pr_minpages) pp->pr_maxpages = pp->pr_minpages + 1; /* XXX */ while (pp->pr_npages < pp->pr_minpages) (void) pool_grow(pp, PR_WAITOK); mutex_exit(&pp->pr_lock); } /* * Add a page worth of items to the pool. * * Note, we must be called with the pool descriptor LOCKED. */ static void pool_prime_page(struct pool *pp, void *storage, struct pool_item_header *ph) { const unsigned int align = pp->pr_align; struct pool_item *pi; void *cp = storage; int n; KASSERT(mutex_owned(&pp->pr_lock)); KASSERTMSG(((pp->pr_roflags & PR_NOALIGN) || (((uintptr_t)cp & (pp->pr_alloc->pa_pagesz - 1)) == 0)), "%s: [%s] unaligned page: %p", __func__, pp->pr_wchan, cp); /* * Insert page header. */ LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist); LIST_INIT(&ph->ph_itemlist); ph->ph_page = storage; ph->ph_nmissing = 0; ph->ph_time = time_uptime; if (pp->pr_roflags & PR_PHINPAGE) ph->ph_poolid = pp->pr_poolid; else SPLAY_INSERT(phtree, &pp->pr_phtree, ph); pp->pr_nidle++; /* * The item space starts after the on-page header, if any. */ ph->ph_off = pp->pr_itemoffset; /* * Color this page. */ ph->ph_off += pp->pr_curcolor; cp = (char *)cp + ph->ph_off; if ((pp->pr_curcolor += align) > pp->pr_maxcolor) pp->pr_curcolor = 0; KASSERT((((vaddr_t)cp) & (align - 1)) == 0); /* * Insert remaining chunks on the bucket list. */ n = pp->pr_itemsperpage; pp->pr_nitems += n; if (pp->pr_roflags & PR_USEBMAP) { pr_item_bitmap_init(pp, ph); } else { while (n--) { pi = (struct pool_item *)cp; KASSERT((((vaddr_t)pi) & (align - 1)) == 0); /* Insert on page list */ LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list); #ifdef POOL_CHECK_MAGIC pi->pi_magic = PI_MAGIC; #endif cp = (char *)cp + pp->pr_size; KASSERT((((vaddr_t)cp) & (align - 1)) == 0); } } /* * If the pool was depleted, point at the new page. */ if (pp->pr_curpage == NULL) pp->pr_curpage = ph; if (++pp->pr_npages > pp->pr_hiwat) pp->pr_hiwat = pp->pr_npages; } /* * Used by pool_get() when nitems drops below the low water mark. This * is used to catch up pr_nitems with the low water mark. * * Note 1, we never wait for memory here, we let the caller decide what to do. * * Note 2, we must be called with the pool already locked, and we return * with it locked. */ static int pool_catchup(struct pool *pp) { int error = 0; while (POOL_NEEDS_CATCHUP(pp)) { error = pool_grow(pp, PR_NOWAIT); if (error) { if (error == ERESTART) continue; break; } } return error; } static void pool_update_curpage(struct pool *pp) { pp->pr_curpage = LIST_FIRST(&pp->pr_partpages); if (pp->pr_curpage == NULL) { pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages); } KASSERTMSG((pp->pr_curpage == NULL) == (pp->pr_nitems == 0), "pp=%p curpage=%p nitems=%u", pp, pp->pr_curpage, pp->pr_nitems); } void pool_setlowat(struct pool *pp, int n) { mutex_enter(&pp->pr_lock); pp->pr_minitems = n; /* Make sure we're caught up with the newly-set low water mark. */ if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) { /* * XXX: Should we log a warning? Should we set up a timeout * to try again in a second or so? The latter could break * a caller's assumptions about interrupt protection, etc. 
*/ } mutex_exit(&pp->pr_lock); } void pool_sethiwat(struct pool *pp, int n) { mutex_enter(&pp->pr_lock); pp->pr_maxitems = n; mutex_exit(&pp->pr_lock); } void pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap) { mutex_enter(&pp->pr_lock); pp->pr_hardlimit = n; pp->pr_hardlimit_warning = warnmess; pp->pr_hardlimit_ratecap.tv_sec = ratecap; pp->pr_hardlimit_warning_last.tv_sec = 0; pp->pr_hardlimit_warning_last.tv_usec = 0; pp->pr_maxpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; mutex_exit(&pp->pr_lock); } unsigned int pool_nget(struct pool *pp) { return pp->pr_nget; } unsigned int pool_nput(struct pool *pp) { return pp->pr_nput; } /* * Release all complete pages that have not been used recently. * * Must not be called from interrupt context. */ int pool_reclaim(struct pool *pp) { struct pool_item_header *ph, *phnext; struct pool_pagelist pq; struct pool_cache *pc; uint32_t curtime; bool klock; int rv; KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); if (pp->pr_drain_hook != NULL) { /* * The drain hook must be called with the pool unlocked. */ (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT); } /* * XXXSMP Because we do not want to cause non-MPSAFE code * to block. */ if (pp->pr_ipl == IPL_SOFTNET || pp->pr_ipl == IPL_SOFTCLOCK || pp->pr_ipl == IPL_SOFTSERIAL) { KERNEL_LOCK(1, NULL); klock = true; } else klock = false; /* Reclaim items from the pool's cache (if any). */ if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) pool_cache_invalidate(pc); if (mutex_tryenter(&pp->pr_lock) == 0) { if (klock) { KERNEL_UNLOCK_ONE(NULL); } return 0; } LIST_INIT(&pq); curtime = time_uptime; for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) { phnext = LIST_NEXT(ph, ph_pagelist); /* Check our minimum page claim */ if (pp->pr_npages <= pp->pr_minpages) break; KASSERT(ph->ph_nmissing == 0); if (curtime - ph->ph_time < pool_inactive_time) continue; /* * If freeing this page would put us below the minimum free items * or the minimum pages, stop now. */ if (pp->pr_nitems - pp->pr_itemsperpage < pp->pr_minitems || pp->pr_npages - 1 < pp->pr_minpages) break; pr_rmpage(pp, ph, &pq); } mutex_exit(&pp->pr_lock); if (LIST_EMPTY(&pq)) rv = 0; else { pr_pagelist_free(pp, &pq); rv = 1; } if (klock) { KERNEL_UNLOCK_ONE(NULL); } return rv; } /* * Drain pools, one at a time. The drained pool is returned within ppp. * * Note, must never be called from interrupt context. */ bool pool_drain(struct pool **ppp) { bool reclaimed; struct pool *pp; KASSERT(!TAILQ_EMPTY(&pool_head)); pp = NULL; /* Find next pool to drain, and add a reference. */ mutex_enter(&pool_head_lock); do { if (drainpp == NULL) { drainpp = TAILQ_FIRST(&pool_head); } if (drainpp != NULL) { pp = drainpp; drainpp = TAILQ_NEXT(pp, pr_poollist); } /* * Skip completely idle pools. We depend on at least * one pool in the system being active. */ } while (pp == NULL || pp->pr_npages == 0); pp->pr_refcnt++; mutex_exit(&pool_head_lock); /* Drain the cache (if any) and pool.. */ reclaimed = pool_reclaim(pp); /* Finally, unlock the pool. */ mutex_enter(&pool_head_lock); pp->pr_refcnt--; cv_broadcast(&pool_busy); mutex_exit(&pool_head_lock); if (ppp != NULL) *ppp = pp; return reclaimed; } /* * Calculate the total number of pages consumed by pools. 
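 *
 * Totals are accumulated in bytes and converted to pages with atop();
 * pools flagged PR_RECURSIVE have the bytes of their outstanding items
 * (pr_nout * pr_size) subtracted from the count.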
*/ int pool_totalpages(void) { mutex_enter(&pool_head_lock); int pages = pool_totalpages_locked(); mutex_exit(&pool_head_lock); return pages; } int pool_totalpages_locked(void) { struct pool *pp; uint64_t total = 0; TAILQ_FOREACH(pp, &pool_head, pr_poollist) { uint64_t bytes = (uint64_t)pp->pr_npages * pp->pr_alloc->pa_pagesz; if ((pp->pr_roflags & PR_RECURSIVE) != 0) bytes -= ((uint64_t)pp->pr_nout * pp->pr_size); total += bytes; } return atop(total); } /* * Diagnostic helpers. */ void pool_printall(const char *modif, void (*pr)(const char *, ...)) { struct pool *pp; TAILQ_FOREACH(pp, &pool_head, pr_poollist) { pool_printit(pp, modif, pr); } } void pool_printit(struct pool *pp, const char *modif, void (*pr)(const char *, ...)) { if (pp == NULL) { (*pr)("Must specify a pool to print.\n"); return; } pool_print1(pp, modif, pr); } static void pool_print_pagelist(struct pool *pp, struct pool_pagelist *pl, void (*pr)(const char *, ...)) { struct pool_item_header *ph; LIST_FOREACH(ph, pl, ph_pagelist) { (*pr)("\t\tpage %p, nmissing %d, time %" PRIu32 "\n", ph->ph_page, ph->ph_nmissing, ph->ph_time); #ifdef POOL_CHECK_MAGIC struct pool_item *pi; if (!(pp->pr_roflags & PR_USEBMAP)) { LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) { if (pi->pi_magic != PI_MAGIC) { (*pr)("\t\t\titem %p, magic 0x%x\n", pi, pi->pi_magic); } } } #endif } } static void pool_print1(struct pool *pp, const char *modif, void (*pr)(const char *, ...)) { struct pool_item_header *ph; pool_cache_t pc; pcg_t *pcg; pool_cache_cpu_t *cc; uint64_t cpuhit, cpumiss, pchit, pcmiss; uint32_t nfull; int i; bool print_log = false, print_pagelist = false, print_cache = false; bool print_short = false, skip_empty = false; char c; while ((c = *modif++) != '\0') { if (c == 'l') print_log = true; if (c == 'p') print_pagelist = true; if (c == 'c') print_cache = true; if (c == 's') print_short = true; if (c == 'S') skip_empty = true; } if (skip_empty && pp->pr_nget == 0) return; if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) { (*pr)("POOLCACHE"); } else { (*pr)("POOL"); } /* Single line output. 
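 * Fields, colon-separated: wchan:addr:size:align:npages:nitems:nout:
 * nget:nput:npagealloc:npagefree:nidle.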
*/ if (print_short) { (*pr)(" %s:%p:%u:%u:%u:%u:%u:%u:%u:%u:%u:%u\n", pp->pr_wchan, pp, pp->pr_size, pp->pr_align, pp->pr_npages, pp->pr_nitems, pp->pr_nout, pp->pr_nget, pp->pr_nput, pp->pr_npagealloc, pp->pr_npagefree, pp->pr_nidle); return; } (*pr)(" %s: size %u, align %u, ioff %u, roflags 0x%08x\n", pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset, pp->pr_roflags); (*pr)("\tpool %p, alloc %p\n", pp, pp->pr_alloc); (*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n", pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages); (*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n", pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit); (*pr)("\tnget %lu, nfail %lu, nput %lu\n", pp->pr_nget, pp->pr_nfail, pp->pr_nput); (*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n", pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle); if (!print_pagelist) goto skip_pagelist; if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL) (*pr)("\n\tempty page list:\n"); pool_print_pagelist(pp, &pp->pr_emptypages, pr); if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL) (*pr)("\n\tfull page list:\n"); pool_print_pagelist(pp, &pp->pr_fullpages, pr); if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL) (*pr)("\n\tpartial-page list:\n"); pool_print_pagelist(pp, &pp->pr_partpages, pr); if (pp->pr_curpage == NULL) (*pr)("\tno current page\n"); else (*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page); skip_pagelist: if (print_log) goto skip_log; (*pr)("\n"); skip_log: #define PR_GROUPLIST(pcg) \ (*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail); \ for (i = 0; i < pcg->pcg_size; i++) { \ if (pcg->pcg_objects[i].pcgo_pa != \ POOL_PADDR_INVALID) { \ (*pr)("\t\t\t%p, 0x%llx\n", \ pcg->pcg_objects[i].pcgo_va, \ (unsigned long long) \ pcg->pcg_objects[i].pcgo_pa); \ } else { \ (*pr)("\t\t\t%p\n", \ pcg->pcg_objects[i].pcgo_va); \ } \ } if (pc != NULL) { cpuhit = 0; cpumiss = 0; pcmiss = 0; nfull = 0; for (i = 0; i < __arraycount(pc->pc_cpus); i++) { if ((cc = pc->pc_cpus[i]) == NULL) continue; cpuhit += cc->cc_hits; cpumiss += cc->cc_misses; pcmiss += cc->cc_pcmisses; nfull += cc->cc_nfull; } pchit = cpumiss - pcmiss; (*pr)("\tcpu layer hits %llu misses %llu\n", cpuhit, cpumiss); (*pr)("\tcache layer hits %llu misses %llu\n", pchit, pcmiss); (*pr)("\tcache layer full groups %u\n", nfull); if (print_cache) { (*pr)("\tfull cache groups:\n"); for (pcg = pc->pc_fullgroups; pcg != NULL; pcg = pcg->pcg_next) { PR_GROUPLIST(pcg); } } } #undef PR_GROUPLIST } static int pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph) { struct pool_item *pi; void *page; int n; if ((pp->pr_roflags & PR_NOALIGN) == 0) { page = POOL_OBJ_TO_PAGE(pp, ph); if (page != ph->ph_page && (pp->pr_roflags & PR_PHINPAGE) != 0) { if (label != NULL) printf("%s: ", label); printf("pool(%p:%s): page inconsistency: page %p;" " at page head addr %p (p %p)\n", pp, pp->pr_wchan, ph->ph_page, ph, page); return 1; } } if ((pp->pr_roflags & PR_USEBMAP) != 0) return 0; for (pi = LIST_FIRST(&ph->ph_itemlist), n = 0; pi != NULL; pi = LIST_NEXT(pi,pi_list), n++) { #ifdef POOL_CHECK_MAGIC if (pi->pi_magic != PI_MAGIC) { if (label != NULL) printf("%s: ", label); printf("pool(%s): free list modified: magic=%x;" " page %p; item ordinal %d; addr %p\n", pp->pr_wchan, pi->pi_magic, ph->ph_page, n, pi); panic("pool"); } #endif if ((pp->pr_roflags & PR_NOALIGN) != 0) { continue; } page = POOL_OBJ_TO_PAGE(pp, pi); if (page == ph->ph_page) continue; if (label != NULL) printf("%s: ", label); 
printf("pool(%p:%s): page inconsistency: page %p;" " item ordinal %d; addr %p (p %p)\n", pp, pp->pr_wchan, ph->ph_page, n, pi, page); return 1; } return 0; } int pool_chk(struct pool *pp, const char *label) { struct pool_item_header *ph; int r = 0; mutex_enter(&pp->pr_lock); LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) { r = pool_chk_page(pp, label, ph); if (r) { goto out; } } LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) { r = pool_chk_page(pp, label, ph); if (r) { goto out; } } LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) { r = pool_chk_page(pp, label, ph); if (r) { goto out; } } out: mutex_exit(&pp->pr_lock); return r; } /* * pool_cache_init: * * Initialize a pool cache. */ pool_cache_t pool_cache_init(size_t size, u_int align, u_int align_offset, u_int flags, const char *wchan, struct pool_allocator *palloc, int ipl, int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), void *arg) { pool_cache_t pc; pc = pool_get(&cache_pool, PR_WAITOK); if (pc == NULL) return NULL; pool_cache_bootstrap(pc, size, align, align_offset, flags, wchan, palloc, ipl, ctor, dtor, arg); return pc; } /* * pool_cache_bootstrap: * * Kernel-private version of pool_cache_init(). The caller * provides initial storage. */ void pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align, u_int align_offset, u_int flags, const char *wchan, struct pool_allocator *palloc, int ipl, int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), void *arg) { CPU_INFO_ITERATOR cii; pool_cache_t pc1; struct cpu_info *ci; struct pool *pp; unsigned int ppflags; pp = &pc->pc_pool; if (palloc == NULL && ipl == IPL_NONE) { if (size > PAGE_SIZE) { int bigidx = pool_bigidx(size); palloc = &pool_allocator_big[bigidx]; flags |= PR_NOALIGN; } else palloc = &pool_allocator_nointr; } ppflags = flags; if (ctor == NULL) { ctor = NO_CTOR; } if (dtor == NULL) { dtor = NO_DTOR; } else { /* * If we have a destructor, then the pool layer does not * need to worry about PR_PSERIALIZE. */ ppflags &= ~PR_PSERIALIZE; } pool_init(pp, size, align, align_offset, ppflags, wchan, palloc, ipl); pc->pc_fullgroups = NULL; pc->pc_partgroups = NULL; pc->pc_ctor = ctor; pc->pc_dtor = dtor; pc->pc_arg = arg; pc->pc_refcnt = 0; pc->pc_roflags = flags; pc->pc_freecheck = NULL; if ((flags & PR_LARGECACHE) != 0) { pc->pc_pcgsize = PCG_NOBJECTS_LARGE; pc->pc_pcgpool = &pcg_large_pool; pc->pc_pcgcache = &pcg_large_cache; } else { pc->pc_pcgsize = PCG_NOBJECTS_NORMAL; pc->pc_pcgpool = &pcg_normal_pool; pc->pc_pcgcache = &pcg_normal_cache; } /* Allocate per-CPU caches. */ memset(pc->pc_cpus, 0, sizeof(pc->pc_cpus)); pc->pc_ncpu = 0; if (ncpu < 2) { /* XXX For sparc: boot CPU is not attached yet. */ pool_cache_cpu_init1(curcpu(), pc); } else { for (CPU_INFO_FOREACH(cii, ci)) { pool_cache_cpu_init1(ci, pc); } } /* Add to list of all pools. */ if (__predict_true(!cold)) mutex_enter(&pool_head_lock); TAILQ_FOREACH(pc1, &pool_cache_head, pc_cachelist) { if (strcmp(pc1->pc_pool.pr_wchan, pc->pc_pool.pr_wchan) > 0) break; } if (pc1 == NULL) TAILQ_INSERT_TAIL(&pool_cache_head, pc, pc_cachelist); else TAILQ_INSERT_BEFORE(pc1, pc, pc_cachelist); if (__predict_true(!cold)) mutex_exit(&pool_head_lock); atomic_store_release(&pp->pr_cache, pc); } /* * pool_cache_destroy: * * Destroy a pool cache. */ void pool_cache_destroy(pool_cache_t pc) { pool_cache_bootstrap_destroy(pc); pool_put(&cache_pool, pc); } /* * pool_cache_bootstrap_destroy: * * Destroy a pool cache. 
*/ void pool_cache_bootstrap_destroy(pool_cache_t pc) { struct pool *pp = &pc->pc_pool; u_int i; /* Remove it from the global list. */ mutex_enter(&pool_head_lock); while (pc->pc_refcnt != 0) cv_wait(&pool_busy, &pool_head_lock); TAILQ_REMOVE(&pool_cache_head, pc, pc_cachelist); mutex_exit(&pool_head_lock); /* First, invalidate the entire cache. */ pool_cache_invalidate(pc); /* Disassociate it from the pool. */ mutex_enter(&pp->pr_lock); atomic_store_relaxed(&pp->pr_cache, NULL); mutex_exit(&pp->pr_lock); /* Destroy per-CPU data */ for (i = 0; i < __arraycount(pc->pc_cpus); i++) pool_cache_invalidate_cpu(pc, i); /* Finally, destroy it. */ pool_destroy(pp); } /* * pool_cache_cpu_init1: * * Called for each pool_cache whenever a new CPU is attached. */ static void pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc) { pool_cache_cpu_t *cc; int index; index = ci->ci_index; KASSERT(index < __arraycount(pc->pc_cpus)); if ((cc = pc->pc_cpus[index]) != NULL) { return; } /* * The first CPU is 'free'. This needs to be the case for * bootstrap - we may not be able to allocate yet. */ if (pc->pc_ncpu == 0) { cc = &pc->pc_cpu0; pc->pc_ncpu = 1; } else { pc->pc_ncpu++; cc = pool_get(&cache_cpu_pool, PR_WAITOK); } cc->cc_current = __UNCONST(&pcg_dummy); cc->cc_previous = __UNCONST(&pcg_dummy); cc->cc_pcgcache = pc->pc_pcgcache; cc->cc_hits = 0; cc->cc_misses = 0; cc->cc_pcmisses = 0; cc->cc_contended = 0; cc->cc_nfull = 0; cc->cc_npart = 0; pc->pc_cpus[index] = cc; } /* * pool_cache_cpu_init: * * Called whenever a new CPU is attached. */ void pool_cache_cpu_init(struct cpu_info *ci) { pool_cache_t pc; mutex_enter(&pool_head_lock); TAILQ_FOREACH(pc, &pool_cache_head, pc_cachelist) { pc->pc_refcnt++; mutex_exit(&pool_head_lock); pool_cache_cpu_init1(ci, pc); mutex_enter(&pool_head_lock); pc->pc_refcnt--; cv_broadcast(&pool_busy); } mutex_exit(&pool_head_lock); } /* * pool_cache_reclaim: * * Reclaim memory from a pool cache. */ bool pool_cache_reclaim(pool_cache_t pc) { return pool_reclaim(&pc->pc_pool); } static inline void pool_cache_pre_destruct(pool_cache_t pc) { /* * Perform a passive serialization barrier before destructing * a batch of one or more objects. */ if (__predict_false(pc_has_pser(pc))) { pool_barrier(); } } static void pool_cache_destruct_object1(pool_cache_t pc, void *object) { (*pc->pc_dtor)(pc->pc_arg, object); pool_put(&pc->pc_pool, object); } /* * pool_cache_destruct_object: * * Force destruction of an object and its release back into * the pool. */ void pool_cache_destruct_object(pool_cache_t pc, void *object) { FREECHECK_IN(&pc->pc_freecheck, object); pool_cache_pre_destruct(pc); pool_cache_destruct_object1(pc, object); } /* * pool_cache_invalidate_groups: * * Invalidate a chain of groups and destruct all objects. Return the * number of groups that were invalidated. */ static int pool_cache_invalidate_groups(pool_cache_t pc, pcg_t *pcg) { void *object; pcg_t *next; int i, n; if (pcg == NULL) { return 0; } pool_cache_pre_destruct(pc); for (n = 0; pcg != NULL; pcg = next, n++) { next = pcg->pcg_next; for (i = 0; i < pcg->pcg_avail; i++) { object = pcg->pcg_objects[i].pcgo_va; pool_cache_destruct_object1(pc, object); } if (pcg->pcg_size == PCG_NOBJECTS_LARGE) { pool_put(&pcg_large_pool, pcg); } else { KASSERT(pcg->pcg_size == PCG_NOBJECTS_NORMAL); pool_put(&pcg_normal_pool, pcg); } } return n; } /* * pool_cache_invalidate: * * Invalidate a pool cache (destruct and release all of the * cached objects). Does not reclaim objects from the pool. 
* * Note: For pool caches that provide constructed objects, there * is an assumption that another level of synchronization is occurring * between the input to the constructor and the cache invalidation. * * Invalidation is a costly process and should not be called from * interrupt context. */ void pool_cache_invalidate(pool_cache_t pc) { uint64_t where; pcg_t *pcg; int n, s; KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); if (ncpu < 2 || !mp_online) { /* * We might be called early enough in the boot process * for the CPU data structures to not be fully initialized. * In this case, transfer the content of the local CPU's * cache back into global cache as only this CPU is currently * running. */ pool_cache_transfer(pc); } else { /* * Signal all CPUs that they must transfer their local * cache back to the global pool then wait for the xcall to * complete. */ where = xc_broadcast(0, __FPTRCAST(xcfunc_t, pool_cache_transfer), pc, NULL); xc_wait(where); } /* Now dequeue and invalidate everything. */ pcg = pool_pcg_trunc(&pcg_normal_cache); (void)pool_cache_invalidate_groups(pc, pcg); pcg = pool_pcg_trunc(&pcg_large_cache); (void)pool_cache_invalidate_groups(pc, pcg); pcg = pool_pcg_trunc(&pc->pc_fullgroups); n = pool_cache_invalidate_groups(pc, pcg); s = splvm(); ((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_nfull -= n; splx(s); pcg = pool_pcg_trunc(&pc->pc_partgroups); n = pool_cache_invalidate_groups(pc, pcg); s = splvm(); ((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_npart -= n; splx(s); } /* * pool_cache_invalidate_cpu: * * Invalidate all CPU-bound cached objects in pool cache, the CPU being * identified by its associated index. * It is caller's responsibility to ensure that no operation is * taking place on this pool cache while doing this invalidation. * WARNING: as no inter-CPU locking is enforced, trying to invalidate * pool cached objects from a CPU different from the one currently running * may result in an undefined behaviour. */ static void pool_cache_invalidate_cpu(pool_cache_t pc, u_int index) { pool_cache_cpu_t *cc; pcg_t *pcg; if ((cc = pc->pc_cpus[index]) == NULL) return; if ((pcg = cc->cc_current) != &pcg_dummy) { pcg->pcg_next = NULL; pool_cache_invalidate_groups(pc, pcg); } if ((pcg = cc->cc_previous) != &pcg_dummy) { pcg->pcg_next = NULL; pool_cache_invalidate_groups(pc, pcg); } if (cc != &pc->pc_cpu0) pool_put(&cache_cpu_pool, cc); } void pool_cache_set_drain_hook(pool_cache_t pc, void (*fn)(void *, int), void *arg) { pool_set_drain_hook(&pc->pc_pool, fn, arg); } void pool_cache_setlowat(pool_cache_t pc, int n) { pool_setlowat(&pc->pc_pool, n); } void pool_cache_sethiwat(pool_cache_t pc, int n) { pool_sethiwat(&pc->pc_pool, n); } void pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap) { pool_sethardlimit(&pc->pc_pool, n, warnmess, ratecap); } void pool_cache_prime(pool_cache_t pc, int n) { pool_prime(&pc->pc_pool, n); } unsigned int pool_cache_nget(pool_cache_t pc) { return pool_nget(&pc->pc_pool); } unsigned int pool_cache_nput(pool_cache_t pc) { return pool_nput(&pc->pc_pool); } /* * pool_pcg_get: * * Get a cache group from the specified list. Return true if * contention was encountered. Must be called at IPL_VM because * of spin wait vs. kernel_lock. */ static int pool_pcg_get(pcg_t *volatile *head, pcg_t **pcgp) { int count = SPINLOCK_BACKOFF_MIN; pcg_t *o, *n; for (o = atomic_load_relaxed(head);; o = n) { if (__predict_false(o == &pcg_dummy)) { /* Wait for concurrent get to complete. 
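 * &pcg_dummy acts as a transient lock marker: the thread that wins the
 * CAS below installs it, fetches the next pointer, and then publishes
 * the new head with a store-release, so anyone observing the marker
 * simply backs off and retries.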
*/ SPINLOCK_BACKOFF(count); n = atomic_load_relaxed(head); continue; } if (__predict_false(o == NULL)) { break; } /* Lock out concurrent get/put. */ n = atomic_cas_ptr(head, o, __UNCONST(&pcg_dummy)); if (o == n) { /* Fetch pointer to next item and then unlock. */ membar_datadep_consumer(); /* alpha */ n = atomic_load_relaxed(&o->pcg_next); atomic_store_release(head, n); break; } } *pcgp = o; return count != SPINLOCK_BACKOFF_MIN; } /* * pool_pcg_trunc: * * Chop out entire list of pool cache groups. */ static pcg_t * pool_pcg_trunc(pcg_t *volatile *head) { int count = SPINLOCK_BACKOFF_MIN, s; pcg_t *o, *n; s = splvm(); for (o = atomic_load_relaxed(head);; o = n) { if (__predict_false(o == &pcg_dummy)) { /* Wait for concurrent get to complete. */ SPINLOCK_BACKOFF(count); n = atomic_load_relaxed(head); continue; } n = atomic_cas_ptr(head, o, NULL); if (o == n) { splx(s); membar_datadep_consumer(); /* alpha */ return o; } } } /* * pool_pcg_put: * * Put a pool cache group to the specified list. Return true if * contention was encountered. Must be called at IPL_VM because of * spin wait vs. kernel_lock. */ static int pool_pcg_put(pcg_t *volatile *head, pcg_t *pcg) { int count = SPINLOCK_BACKOFF_MIN; pcg_t *o, *n; for (o = atomic_load_relaxed(head);; o = n) { if (__predict_false(o == &pcg_dummy)) { /* Wait for concurrent get to complete. */ SPINLOCK_BACKOFF(count); n = atomic_load_relaxed(head); continue; } pcg->pcg_next = o; membar_release(); n = atomic_cas_ptr(head, o, pcg); if (o == n) { return count != SPINLOCK_BACKOFF_MIN; } } } static bool __noinline pool_cache_get_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, void **objectp, paddr_t *pap, int flags) { pcg_t *pcg, *cur; void *object; KASSERT(cc->cc_current->pcg_avail == 0); KASSERT(cc->cc_previous->pcg_avail == 0); cc->cc_misses++; /* * If there's a full group, release our empty group back to the * cache. Install the full group as cc_current and return. */ cc->cc_contended += pool_pcg_get(&pc->pc_fullgroups, &pcg); if (__predict_true(pcg != NULL)) { KASSERT(pcg->pcg_avail == pcg->pcg_size); if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) { KASSERT(cur->pcg_avail == 0); (void)pool_pcg_put(cc->cc_pcgcache, cur); } cc->cc_nfull--; cc->cc_current = pcg; return true; } /* * Nothing available locally or in cache. Take the slow * path: fetch a new object from the pool and construct * it. */ cc->cc_pcmisses++; splx(s); object = pool_get(&pc->pc_pool, flags); *objectp = object; if (__predict_false(object == NULL)) { KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0); return false; } if (__predict_false((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0)) { pool_put(&pc->pc_pool, object); *objectp = NULL; return false; } KASSERT((((vaddr_t)object) & (pc->pc_pool.pr_align - 1)) == 0); if (pap != NULL) { #ifdef POOL_VTOPHYS *pap = POOL_VTOPHYS(object); #else *pap = POOL_PADDR_INVALID; #endif } FREECHECK_OUT(&pc->pc_freecheck, object); return false; } /* * pool_cache_get{,_paddr}: * * Get an object from a pool cache (optionally returning * the physical address of the object). 
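 *
 * A minimal caller-side sketch (illustrative; most callers use the
 * pool_cache_get() wrapper, which passes a NULL paddr):
 *
 *	obj = pool_cache_get(pc, PR_WAITOK);
 *	... use obj ...
 *	pool_cache_put(pc, obj);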
*/ void * pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap) { pool_cache_cpu_t *cc; pcg_t *pcg; void *object; int s; KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK)); if (pc->pc_pool.pr_ipl == IPL_NONE && __predict_true(!cold) && __predict_true(panicstr == NULL)) { KASSERTMSG(!cpu_intr_p(), "%s: [%s] is IPL_NONE, but called from interrupt context", __func__, pc->pc_pool.pr_wchan); KASSERTMSG(!cpu_softintr_p(), "%s: [%s] is IPL_NONE," " but called from soft interrupt context", __func__, pc->pc_pool.pr_wchan); } if (flags & PR_WAITOK) { ASSERT_SLEEPABLE(); } if (flags & PR_NOWAIT) { if (fault_inject()) return NULL; } /* Lock out interrupts and disable preemption. */ s = splvm(); while (/* CONSTCOND */ true) { /* Try and allocate an object from the current group. */ cc = pc->pc_cpus[curcpu()->ci_index]; pcg = cc->cc_current; if (__predict_true(pcg->pcg_avail > 0)) { object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va; if (__predict_false(pap != NULL)) *pap = pcg->pcg_objects[pcg->pcg_avail].pcgo_pa; #if defined(DIAGNOSTIC) pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL; KASSERT(pcg->pcg_avail < pcg->pcg_size); KASSERT(object != NULL); #endif cc->cc_hits++; splx(s); FREECHECK_OUT(&pc->pc_freecheck, object); pool_redzone_fill(&pc->pc_pool, object); pool_cache_get_kmsan(pc, object); return object; } /* * That failed. If the previous group isn't empty, swap * it with the current group and allocate from there. */ pcg = cc->cc_previous; if (__predict_true(pcg->pcg_avail > 0)) { cc->cc_previous = cc->cc_current; cc->cc_current = pcg; continue; } /* * Can't allocate from either group: try the slow path. * If get_slow() allocated an object for us, or if * no more objects are available, it will return false. * Otherwise, we need to retry. */ if (!pool_cache_get_slow(pc, cc, s, &object, pap, flags)) { if (object != NULL) { kmsan_orig(object, pc->pc_pool.pr_size, KMSAN_TYPE_POOL, __RET_ADDR); } break; } } /* * We would like to KASSERT(object || (flags & PR_NOWAIT)), but * pool_cache_get can fail even in the PR_WAITOK case, if the * constructor fails. */ return object; } static bool __noinline pool_cache_put_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, void *object) { pcg_t *pcg, *cur; KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size); KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size); cc->cc_misses++; /* * Try to get an empty group from the cache. If there are no empty * groups in the cache then allocate one. */ (void)pool_pcg_get(cc->cc_pcgcache, &pcg); if (__predict_false(pcg == NULL)) { if (__predict_true(!pool_cache_disable)) { pcg = pool_get(pc->pc_pcgpool, PR_NOWAIT); } if (__predict_true(pcg != NULL)) { pcg->pcg_avail = 0; pcg->pcg_size = pc->pc_pcgsize; } } /* * If there's a empty group, release our full group back to the * cache. Install the empty group to the local CPU and return. */ if (pcg != NULL) { KASSERT(pcg->pcg_avail == 0); if (__predict_false(cc->cc_previous == &pcg_dummy)) { cc->cc_previous = pcg; } else { cur = cc->cc_current; if (__predict_true(cur != &pcg_dummy)) { KASSERT(cur->pcg_avail == cur->pcg_size); cc->cc_contended += pool_pcg_put(&pc->pc_fullgroups, cur); cc->cc_nfull++; } cc->cc_current = pcg; } return true; } /* * Nothing available locally or in cache, and we didn't * allocate an empty group. Take the slow path and destroy * the object here and now. 
*/ cc->cc_pcmisses++; splx(s); pool_cache_destruct_object(pc, object); return false; } /* * pool_cache_put{,_paddr}: * * Put an object back to the pool cache (optionally caching the * physical address of the object). */ void pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa) { pool_cache_cpu_t *cc; pcg_t *pcg; int s; KASSERT(object != NULL); pool_cache_put_kmsan(pc, object); pool_cache_redzone_check(pc, object); FREECHECK_IN(&pc->pc_freecheck, object); if (pc->pc_pool.pr_roflags & PR_PHINPAGE) { pc_phinpage_check(pc, object); } if (pool_cache_put_nocache(pc, object)) { return; } /* Lock out interrupts and disable preemption. */ s = splvm(); while (/* CONSTCOND */ true) { /* If the current group isn't full, release it there. */ cc = pc->pc_cpus[curcpu()->ci_index]; pcg = cc->cc_current; if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { pcg->pcg_objects[pcg->pcg_avail].pcgo_va = object; pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa; pcg->pcg_avail++; cc->cc_hits++; splx(s); return; } /* * That failed. If the previous group isn't full, swap * it with the current group and try again. */ pcg = cc->cc_previous; if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { cc->cc_previous = cc->cc_current; cc->cc_current = pcg; continue; } /* * Can't free to either group: try the slow path. * If put_slow() releases the object for us, it * will return false. Otherwise we need to retry. */ if (!pool_cache_put_slow(pc, cc, s, object)) break; } } /* * pool_cache_transfer: * * Transfer objects from the per-CPU cache to the global cache. * Run within a cross-call thread. */ static void pool_cache_transfer(pool_cache_t pc) { pool_cache_cpu_t *cc; pcg_t *prev, *cur; int s; s = splvm(); cc = pc->pc_cpus[curcpu()->ci_index]; cur = cc->cc_current; cc->cc_current = __UNCONST(&pcg_dummy); prev = cc->cc_previous; cc->cc_previous = __UNCONST(&pcg_dummy); if (cur != &pcg_dummy) { if (cur->pcg_avail == cur->pcg_size) { (void)pool_pcg_put(&pc->pc_fullgroups, cur); cc->cc_nfull++; } else if (cur->pcg_avail == 0) { (void)pool_pcg_put(pc->pc_pcgcache, cur); } else { (void)pool_pcg_put(&pc->pc_partgroups, cur); cc->cc_npart++; } } if (prev != &pcg_dummy) { if (prev->pcg_avail == prev->pcg_size) { (void)pool_pcg_put(&pc->pc_fullgroups, prev); cc->cc_nfull++; } else if (prev->pcg_avail == 0) { (void)pool_pcg_put(pc->pc_pcgcache, prev); } else { (void)pool_pcg_put(&pc->pc_partgroups, prev); cc->cc_npart++; } } splx(s); } static int pool_bigidx(size_t size) { int i; for (i = 0; i < __arraycount(pool_allocator_big); i++) { if (1 << (i + POOL_ALLOCATOR_BIG_BASE) >= size) return i; } panic("pool item size %zu too large, use a custom allocator", size); } static void * pool_allocator_alloc(struct pool *pp, int flags) { struct pool_allocator *pa = pp->pr_alloc; void *res; res = (*pa->pa_alloc)(pp, flags); if (res == NULL && (flags & PR_WAITOK) == 0) { /* * We only run the drain hook here if PR_NOWAIT. * In other cases, the hook will be run in * pool_reclaim(). */ if (pp->pr_drain_hook != NULL) { (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags); res = (*pa->pa_alloc)(pp, flags); } } return res; } static void pool_allocator_free(struct pool *pp, void *v) { struct pool_allocator *pa = pp->pr_alloc; if (pp->pr_redzone) { KASSERT(!pp_has_pser(pp)); kasan_mark(v, pa->pa_pagesz, pa->pa_pagesz, 0); } else if (__predict_false(pp_has_pser(pp))) { /* * Perform a passive serialization barrier before freeing * the pool page back to the system. 
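 *
 * Readers relying on passive serialization may still dereference
 * objects on this page after they were put back; pool_barrier() waits
 * for those readers to finish before the backing page is handed back
 * to the underlying allocator.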
*/ pool_barrier(); } (*pa->pa_free)(pp, v); } void * pool_page_alloc(struct pool *pp, int flags) { const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP; vmem_addr_t va; int ret; ret = uvm_km_kmem_alloc(kmem_va_arena, pp->pr_alloc->pa_pagesz, vflags | VM_INSTANTFIT, &va); return ret ? NULL : (void *)va; } void pool_page_free(struct pool *pp, void *v) { uvm_km_kmem_free(kmem_va_arena, (vaddr_t)v, pp->pr_alloc->pa_pagesz); } static void * pool_page_alloc_meta(struct pool *pp, int flags) { const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP; vmem_addr_t va; int ret; ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz, vflags | VM_INSTANTFIT, &va); return ret ? NULL : (void *)va; } static void pool_page_free_meta(struct pool *pp, void *v) { vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz); } #ifdef KMSAN static inline void pool_get_kmsan(struct pool *pp, void *p) { kmsan_orig(p, pp->pr_size, KMSAN_TYPE_POOL, __RET_ADDR); kmsan_mark(p, pp->pr_size, KMSAN_STATE_UNINIT); } static inline void pool_put_kmsan(struct pool *pp, void *p) { kmsan_mark(p, pp->pr_size, KMSAN_STATE_INITED); } static inline void pool_cache_get_kmsan(pool_cache_t pc, void *p) { if (__predict_false(pc_has_ctor(pc))) { return; } pool_get_kmsan(&pc->pc_pool, p); } static inline void pool_cache_put_kmsan(pool_cache_t pc, void *p) { pool_put_kmsan(&pc->pc_pool, p); } #endif #ifdef POOL_QUARANTINE static void pool_quarantine_init(struct pool *pp) { pp->pr_quar.rotor = 0; memset(&pp->pr_quar, 0, sizeof(pp->pr_quar)); } static void pool_quarantine_flush(struct pool *pp) { pool_quar_t *quar = &pp->pr_quar; struct pool_pagelist pq; size_t i; LIST_INIT(&pq); mutex_enter(&pp->pr_lock); for (i = 0; i < POOL_QUARANTINE_DEPTH; i++) { if (quar->list[i] == 0) continue; pool_do_put(pp, (void *)quar->list[i], &pq); } mutex_exit(&pp->pr_lock); pr_pagelist_free(pp, &pq); } static bool pool_put_quarantine(struct pool *pp, void *v, struct pool_pagelist *pq) { pool_quar_t *quar = &pp->pr_quar; uintptr_t old; if (pp->pr_roflags & PR_NOTOUCH) { return false; } pool_redzone_check(pp, v); old = quar->list[quar->rotor]; quar->list[quar->rotor] = (uintptr_t)v; quar->rotor = (quar->rotor + 1) % POOL_QUARANTINE_DEPTH; if (old != 0) { pool_do_put(pp, (void *)old, pq); } return true; } #endif #ifdef POOL_NOCACHE static bool pool_cache_put_nocache(pool_cache_t pc, void *p) { pool_cache_destruct_object(pc, p); return true; } #endif #ifdef POOL_REDZONE #if defined(_LP64) # define PRIME 0x9e37fffffffc0000UL #else /* defined(_LP64) */ # define PRIME 0x9e3779b1 #endif /* defined(_LP64) */ #define STATIC_BYTE 0xFE CTASSERT(POOL_REDZONE_SIZE > 1); #ifndef KASAN static inline uint8_t pool_pattern_generate(const void *p) { return (uint8_t)(((uintptr_t)p) * PRIME >> ((sizeof(uintptr_t) - sizeof(uint8_t))) * CHAR_BIT); } #endif static void pool_redzone_init(struct pool *pp, size_t requested_size) { size_t redzsz; size_t nsz; #ifdef KASAN redzsz = requested_size; kasan_add_redzone(&redzsz); redzsz -= requested_size; #else redzsz = POOL_REDZONE_SIZE; #endif if (pp->pr_roflags & PR_NOTOUCH) { pp->pr_redzone = false; return; } /* * We may have extended the requested size earlier; check if * there's naturally space in the padding for a red zone. */ if (pp->pr_size - requested_size >= redzsz) { pp->pr_reqsize_with_redzone = requested_size + redzsz; pp->pr_redzone = true; return; } /* * No space in the natural padding; check if we can extend a * bit the size of the pool. 
* * Avoid using redzone for allocations half of a page or larger. * For pagesize items, we'd waste a whole new page (could be * unmapped?), and for half pagesize items, approximately half * the space is lost (eg, 4K pages, you get one 2K allocation.) */ nsz = roundup(pp->pr_size + redzsz, pp->pr_align); if (nsz <= (pp->pr_alloc->pa_pagesz / 2)) { /* Ok, we can */ pp->pr_size = nsz; pp->pr_reqsize_with_redzone = requested_size + redzsz; pp->pr_redzone = true; } else { /* No space for a red zone... snif :'( */ pp->pr_redzone = false; aprint_debug("pool redzone disabled for '%s'\n", pp->pr_wchan); } } static void pool_redzone_fill(struct pool *pp, void *p) { if (!pp->pr_redzone) return; KASSERT(!pp_has_pser(pp)); #ifdef KASAN kasan_mark(p, pp->pr_reqsize, pp->pr_reqsize_with_redzone, KASAN_POOL_REDZONE); #else uint8_t *cp, pat; const uint8_t *ep; cp = (uint8_t *)p + pp->pr_reqsize; ep = cp + POOL_REDZONE_SIZE; /* * We really don't want the first byte of the red zone to be '\0'; * an off-by-one in a string may not be properly detected. */ pat = pool_pattern_generate(cp); *cp = (pat == '\0') ? STATIC_BYTE: pat; cp++; while (cp < ep) { *cp = pool_pattern_generate(cp); cp++; } #endif } static void pool_redzone_check(struct pool *pp, void *p) { if (!pp->pr_redzone) return; KASSERT(!pp_has_pser(pp)); #ifdef KASAN kasan_mark(p, 0, pp->pr_reqsize_with_redzone, KASAN_POOL_FREED); #else uint8_t *cp, pat, expected; const uint8_t *ep; cp = (uint8_t *)p + pp->pr_reqsize; ep = cp + POOL_REDZONE_SIZE; pat = pool_pattern_generate(cp); expected = (pat == '\0') ? STATIC_BYTE: pat; if (__predict_false(*cp != expected)) { panic("%s: [%s] 0x%02x != 0x%02x", __func__, pp->pr_wchan, *cp, expected); } cp++; while (cp < ep) { expected = pool_pattern_generate(cp); if (__predict_false(*cp != expected)) { panic("%s: [%s] 0x%02x != 0x%02x", __func__, pp->pr_wchan, *cp, expected); } cp++; } #endif } static void pool_cache_redzone_check(pool_cache_t pc, void *p) { #ifdef KASAN /* * If there is a ctor/dtor, or if the cache objects use * passive serialization, leave the data as valid. 
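 * Such objects keep live, constructed state while they sit in the cache
 * (and may still be read under pserialize), so poisoning them here
 * would corrupt data that must remain valid.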
*/ if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc) || pc_has_pser(pc))) { return; } #endif pool_redzone_check(&pc->pc_pool, p); } #endif /* POOL_REDZONE */ #if defined(DDB) static bool pool_in_page(struct pool *pp, struct pool_item_header *ph, uintptr_t addr) { return (uintptr_t)ph->ph_page <= addr && addr < (uintptr_t)ph->ph_page + pp->pr_alloc->pa_pagesz; } static bool pool_in_item(struct pool *pp, void *item, uintptr_t addr) { return (uintptr_t)item <= addr && addr < (uintptr_t)item + pp->pr_size; } static bool pool_in_cg(struct pool *pp, struct pool_cache_group *pcg, uintptr_t addr) { int i; if (pcg == NULL) { return false; } for (i = 0; i < pcg->pcg_avail; i++) { if (pool_in_item(pp, pcg->pcg_objects[i].pcgo_va, addr)) { return true; } } return false; } static bool pool_allocated(struct pool *pp, struct pool_item_header *ph, uintptr_t addr) { if ((pp->pr_roflags & PR_USEBMAP) != 0) { unsigned int idx = pr_item_bitmap_index(pp, ph, (void *)addr); pool_item_bitmap_t *bitmap = ph->ph_bitmap + (idx / BITMAP_SIZE); pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK); return (*bitmap & mask) == 0; } else { struct pool_item *pi; LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) { if (pool_in_item(pp, pi, addr)) { return false; } } return true; } } void pool_whatis(uintptr_t addr, void (*pr)(const char *, ...)) { struct pool *pp; TAILQ_FOREACH(pp, &pool_head, pr_poollist) { struct pool_item_header *ph; struct pool_cache *pc; uintptr_t item; bool allocated = true; bool incache = false; bool incpucache = false; char cpucachestr[32]; if ((pp->pr_roflags & PR_PHINPAGE) != 0) { LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) { if (pool_in_page(pp, ph, addr)) { goto found; } } LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) { if (pool_in_page(pp, ph, addr)) { allocated = pool_allocated(pp, ph, addr); goto found; } } LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) { if (pool_in_page(pp, ph, addr)) { allocated = false; goto found; } } continue; } else { ph = pr_find_pagehead_noalign(pp, (void *)addr); if (ph == NULL || !pool_in_page(pp, ph, addr)) { continue; } allocated = pool_allocated(pp, ph, addr); } found: if (allocated && (pc = atomic_load_consume(&pp->pr_cache)) != NULL) { struct pool_cache_group *pcg; int i; for (pcg = pc->pc_fullgroups; pcg != NULL; pcg = pcg->pcg_next) { if (pool_in_cg(pp, pcg, addr)) { incache = true; goto print; } } for (i = 0; i < __arraycount(pc->pc_cpus); i++) { pool_cache_cpu_t *cc; if ((cc = pc->pc_cpus[i]) == NULL) { continue; } if (pool_in_cg(pp, cc->cc_current, addr) || pool_in_cg(pp, cc->cc_previous, addr)) { struct cpu_info *ci = cpu_lookup(i); incpucache = true; snprintf(cpucachestr, sizeof(cpucachestr), "cached by CPU %u", ci->ci_index); goto print; } } } print: item = (uintptr_t)ph->ph_page + ph->ph_off; item = item + rounddown(addr - item, pp->pr_size); (*pr)("%p is %p+%zu in POOL '%s' (%s)\n", (void *)addr, item, (size_t)(addr - item), pp->pr_wchan, incpucache ? cpucachestr : incache ? "cached" : allocated ? 
"allocated" : "free"); } } #endif /* defined(DDB) */ static int pool_sysctl(SYSCTLFN_ARGS) { struct pool_sysctl data; struct pool *pp; struct pool_cache *pc; pool_cache_cpu_t *cc; int error; size_t i, written; if (oldp == NULL) { *oldlenp = 0; TAILQ_FOREACH(pp, &pool_head, pr_poollist) *oldlenp += sizeof(data); return 0; } memset(&data, 0, sizeof(data)); error = 0; written = 0; mutex_enter(&pool_head_lock); TAILQ_FOREACH(pp, &pool_head, pr_poollist) { if (written + sizeof(data) > *oldlenp) break; pp->pr_refcnt++; strlcpy(data.pr_wchan, pp->pr_wchan, sizeof(data.pr_wchan)); data.pr_pagesize = pp->pr_alloc->pa_pagesz; data.pr_flags = pp->pr_roflags | pp->pr_flags; #define COPY(field) data.field = pp->field COPY(pr_size); COPY(pr_itemsperpage); COPY(pr_nitems); COPY(pr_nout); COPY(pr_hardlimit); COPY(pr_npages); COPY(pr_minpages); COPY(pr_maxpages); COPY(pr_nget); COPY(pr_nfail); COPY(pr_nput); COPY(pr_npagealloc); COPY(pr_npagefree); COPY(pr_hiwat); COPY(pr_nidle); #undef COPY data.pr_cache_nmiss_pcpu = 0; data.pr_cache_nhit_pcpu = 0; data.pr_cache_nmiss_global = 0; data.pr_cache_nempty = 0; data.pr_cache_ncontended = 0; data.pr_cache_npartial = 0; if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) { uint32_t nfull = 0; data.pr_cache_meta_size = pc->pc_pcgsize; for (i = 0; i < pc->pc_ncpu; ++i) { cc = pc->pc_cpus[i]; if (cc == NULL) continue; data.pr_cache_ncontended += cc->cc_contended; data.pr_cache_nmiss_pcpu += cc->cc_misses; data.pr_cache_nhit_pcpu += cc->cc_hits; data.pr_cache_nmiss_global += cc->cc_pcmisses; nfull += cc->cc_nfull; /* 32-bit rollover! */ data.pr_cache_npartial += cc->cc_npart; } data.pr_cache_nfull = nfull; } else { data.pr_cache_meta_size = 0; data.pr_cache_nfull = 0; } data.pr_cache_nhit_global = data.pr_cache_nmiss_pcpu - data.pr_cache_nmiss_global; if (pp->pr_refcnt == UINT_MAX) /* XXX possible? */ continue; mutex_exit(&pool_head_lock); error = sysctl_copyout(l, &data, oldp, sizeof(data)); mutex_enter(&pool_head_lock); if (--pp->pr_refcnt == 0) cv_broadcast(&pool_busy); if (error) break; written += sizeof(data); oldp = (char *)oldp + sizeof(data); } mutex_exit(&pool_head_lock); *oldlenp = written; return error; } SYSCTL_SETUP(sysctl_pool_setup, "sysctl kern.pool setup") { const struct sysctlnode *rnode = NULL; sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "pool", SYSCTL_DESCR("Get pool statistics"), pool_sysctl, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); }
/* $NetBSD: subr_pserialize.c,v 1.24 2023/10/04 20:28:06 ad Exp $ */ /*- * Copyright (c) 2010, 2011, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Passive serialization. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_pserialize.c,v 1.24 2023/10/04 20:28:06 ad Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/evcnt.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/lwp.h> #include <sys/mutex.h> #include <sys/pserialize.h> #include <sys/xcall.h> struct pserialize { char psz_dummy; }; static kmutex_t psz_lock __cacheline_aligned; static struct evcnt psz_ev_excl __cacheline_aligned = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pserialize", "exclusive access"); EVCNT_ATTACH_STATIC(psz_ev_excl); /* * pserialize_init: * * Initialize passive serialization structures. */ void pserialize_init(void) { mutex_init(&psz_lock, MUTEX_DEFAULT, IPL_NONE); } /* * pserialize_create: * * Create and initialize a passive serialization object. */ pserialize_t pserialize_create(void) { pserialize_t psz; psz = kmem_zalloc(sizeof(*psz), KM_SLEEP); return psz; } /* * pserialize_destroy: * * Destroy a passive serialization object. */ void pserialize_destroy(pserialize_t psz) { kmem_free(psz, sizeof(*psz)); } /* * pserialize_perform: * * Perform the write side of passive serialization. 
*/ void pserialize_perform(pserialize_t psz) { KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); if (__predict_false(panicstr != NULL)) { return; } if (__predict_false(mp_online == false)) { psz_ev_excl.ev_count++; return; } /* * Broadcast a NOP to all CPUs and wait until all of them complete. */ xc_barrier(XC_HIGHPRI); mutex_enter(&psz_lock); psz_ev_excl.ev_count++; mutex_exit(&psz_lock); } int pserialize_read_enter(void) { int s; s = splsoftserial(); curcpu()->ci_psz_read_depth++; __insn_barrier(); return s; } void pserialize_read_exit(int s) { KASSERT(__predict_false(cold) || kpreempt_disabled()); __insn_barrier(); if (__predict_false(curcpu()->ci_psz_read_depth-- == 0)) panic("mismatching pserialize_read_exit()"); splx(s); } /* * pserialize_in_read_section: * * True if the caller is in a pserialize read section. To be used * only for diagnostic assertions where we want to guarantee the * condition like: * * KASSERT(pserialize_in_read_section()); */ bool pserialize_in_read_section(void) { return kpreempt_disabled() && curcpu()->ci_psz_read_depth > 0; } /* * pserialize_not_in_read_section: * * True if the caller is not in a pserialize read section. To be * used only for diagnostic assertions where we want to guarantee * the condition like: * * KASSERT(pserialize_not_in_read_section()); */ bool pserialize_not_in_read_section(void) { bool notin; long pctr; pctr = lwp_pctr(); notin = __predict_true(curcpu()->ci_psz_read_depth == 0); /* * If we had a context switch, we're definitely not in a * pserialize read section because pserialize read sections * block preemption. */ if (__predict_false(pctr != lwp_pctr())) notin = true; return notin; }
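/*
 * Illustrative usage sketch (not part of the original source): how the
 * read and write sides above are normally paired.  The "frob" structure,
 * its list, frob_lock and frob_psz are hypothetical names for the example
 * only; frob_psz = pserialize_create() and mutex_init() are assumed to
 * happen at attach time.  A real user would also publish new entries with
 * appropriate memory barriers (e.g. the PSLIST(9) primitives); that
 * detail is omitted here.
 */
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/pserialize.h>
#include <sys/queue.h>

struct frob {
	LIST_ENTRY(frob) f_entry;
	int f_key;
};

static LIST_HEAD(, frob) frob_list = LIST_HEAD_INITIALIZER(frob_list);
static kmutex_t frob_lock;		/* serializes updaters only */
static pserialize_t frob_psz;

/* Reader: lock-free lookup inside a pserialize read section. */
static bool
frob_present(int key)
{
	struct frob *f;
	bool found = false;
	int s;

	s = pserialize_read_enter();
	LIST_FOREACH(f, &frob_list, f_entry) {
		if (f->f_key == key) {
			found = true;
			break;
		}
	}
	pserialize_read_exit(s);
	return found;
}

/* Updater: unpublish, wait out readers, then free. */
static void
frob_remove(struct frob *f)
{

	mutex_enter(&frob_lock);
	LIST_REMOVE(f, f_entry);	/* no new reader can find it */
	pserialize_perform(frob_psz);	/* wait for readers already inside */
	mutex_exit(&frob_lock);
	kmem_free(f, sizeof(*f));	/* now safe to destroy */
}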
/* $NetBSD: kern_select_50.c,v 1.4 2023/07/28 18:19:00 christos Exp $ */ /*- * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christos Zoulas. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_select_50.c,v 1.4 2023/07/28 18:19:00 christos Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/event.h> #include <sys/poll.h> #include <sys/select.h> #include <sys/time.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <compat/sys/event.h> #include <compat/sys/time.h> #include <compat/common/compat_mod.h> static const struct syscall_package kern_select_50_syscalls[] = { { SYS_compat_50_kevent, 0, (sy_call_t *)compat_50_sys_kevent }, { SYS_compat_50_select, 0, (sy_call_t *)compat_50_sys_select }, { SYS_compat_50_pselect, 0, (sy_call_t *)compat_50_sys_pselect }, { SYS_compat_50_pollts, 0, (sy_call_t *)compat_50_sys_pollts }, { 0, 0, NULL } }; static int compat_50_kevent_fetch_timeout(const void *src, void *dest, size_t length) { struct timespec50 ts50; int error; KASSERT(length == sizeof(struct timespec)); error = copyin(src, &ts50, sizeof(ts50)); if (error) return error; timespec50_to_timespec(&ts50, (struct timespec *)dest); return 0; } int compat_50_sys_kevent(struct lwp *l, const struct compat_50_sys_kevent_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(struct kevent100 *) changelist; syscallarg(size_t) nchanges; syscallarg(struct kevent100 *) eventlist; syscallarg(size_t) nevents; syscallarg(struct timespec50) timeout; } */ static const struct kevent_ops compat_50_kevent_ops = { .keo_private = NULL, .keo_fetch_timeout = compat_50_kevent_fetch_timeout, .keo_fetch_changes = compat_100___kevent50_fetch_changes, .keo_put_events = compat_100___kevent50_put_events, }; return kevent1(retval, SCARG(uap, fd), (const struct kevent *)(const void *)SCARG(uap, changelist), SCARG(uap, nchanges), (struct kevent *)(void *)SCARG(uap, eventlist), SCARG(uap, nevents), (const struct timespec *)(const void *)SCARG(uap, timeout), &compat_50_kevent_ops); } int compat_50_sys_select(struct lwp *l, const struct compat_50_sys_select_args *uap, register_t *retval) { /* { syscallarg(int) nd; syscallarg(fd_set *) in; syscallarg(fd_set *) ou; syscallarg(fd_set *) ex; syscallarg(struct timeval50 *) tv; } */ struct timespec ats, *ts = NULL; struct timeval50 atv50; int error; if (SCARG(uap, tv)) { error = copyin(SCARG(uap, tv), (void *)&atv50, sizeof(atv50)); if (error) return error; if (atv50.tv_usec < 0 || atv50.tv_usec >= 1000000) return EINVAL; ats.tv_sec = atv50.tv_sec; ats.tv_nsec = atv50.tv_usec * 1000; ts = &ats; } return selcommon(retval, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou), SCARG(uap, ex), ts, NULL); } int compat_50_sys_pselect(struct lwp *l, const struct compat_50_sys_pselect_args *uap, register_t *retval) { /* { syscallarg(int) nd; syscallarg(fd_set *) in; syscallarg(fd_set *) ou; syscallarg(fd_set *) ex; syscallarg(const struct timespec50 *) ts; syscallarg(sigset_t *) mask; } */ struct timespec50 ats50; struct timespec ats, *ts = NULL; sigset_t amask, *mask = NULL; int error; if (SCARG(uap, ts)) { error = copyin(SCARG(uap, ts), &ats50, sizeof(ats50)); if (error) return error; timespec50_to_timespec(&ats50, &ats); ts = &ats; } if (SCARG(uap, mask) != NULL) { error = copyin(SCARG(uap, mask), &amask, sizeof(amask)); if (error) return error; mask = &amask; } return selcommon(retval, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou), SCARG(uap, ex), ts, mask); } int compat_50_sys_pollts(struct lwp *l, const struct compat_50_sys_pollts_args *uap, register_t *retval) { /* { syscallarg(struct pollfd *) fds; syscallarg(u_int) 
nfds; syscallarg(const struct timespec50 *) ts; syscallarg(const sigset_t *) mask; } */ struct timespec ats, *ts = NULL; struct timespec50 ats50; sigset_t amask, *mask = NULL; int error; if (SCARG(uap, ts)) { error = copyin(SCARG(uap, ts), &ats50, sizeof(ats50)); if (error) return error; timespec50_to_timespec(&ats50, &ats); ts = &ats; } if (SCARG(uap, mask)) { error = copyin(SCARG(uap, mask), &amask, sizeof(amask)); if (error) return error; mask = &amask; } return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask); } int kern_select_50_init(void) { return syscall_establish(NULL, kern_select_50_syscalls); } int kern_select_50_fini(void) { return syscall_disestablish(NULL, kern_select_50_syscalls); }
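/*
 * Illustrative sketch (not part of the original source): the shape of the
 * time conversions these compat wrappers depend on.  struct timeval50 and
 * struct timespec50 are declared in <compat/sys/time.h>; the 32-bit tv_sec
 * below is an assumption written out only to show why each wrapper copies
 * the old structure in and widens it before handing a normal struct
 * timespec to selcommon()/pollcommon(), exactly as compat_50_sys_select()
 * does above.
 */
#include <sys/types.h>
#include <sys/time.h>

struct timeval50_example {		/* hypothetical stand-in for struct timeval50 */
	int32_t	tv_sec;			/* old 32-bit seconds counter */
	long	tv_usec;
};

static inline void
example_timeval50_to_timespec(const struct timeval50_example *tv50,
    struct timespec *ts)
{
	ts->tv_sec = tv50->tv_sec;			/* implicit widening to 64-bit time_t */
	ts->tv_nsec = (long)tv50->tv_usec * 1000;	/* microseconds -> nanoseconds */
}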
/* $NetBSD: wsmouse.c,v 1.73 2023/07/30 10:45:11 riastradh Exp $ */ /*- * Copyright (c) 2006 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1996, 1997 Christopher G. Demetriou. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christopher G. Demetriou * for the NetBSD Project. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ms.c 8.1 (Berkeley) 6/11/93 */ /* * Mouse driver. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: wsmouse.c,v 1.73 2023/07/30 10:45:11 riastradh Exp $"); #include "wsmouse.h" #include "wsdisplay.h" #include "wsmux.h" #include <sys/param.h> #include <sys/conf.h> #include <sys/ioctl.h> #include <sys/poll.h> #include <sys/fcntl.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/syslog.h> #include <sys/systm.h> #include <sys/tty.h> #include <sys/signalvar.h> #include <sys/device.h> #include <sys/vnode.h> #include <sys/callout.h> #include <dev/wscons/wsconsio.h> #include <dev/wscons/wsmousevar.h> #include <dev/wscons/wseventvar.h> #include <dev/wscons/wsmuxvar.h> #include "ioconf.h" #if defined(WSMUX_DEBUG) && NWSMUX > 0 #define DPRINTF(x) if (wsmuxdebug) printf x #define DPRINTFN(n,x) if (wsmuxdebug > (n)) printf x extern int wsmuxdebug; #else #define DPRINTF(x) #define DPRINTFN(n,x) #endif #define INVALID_X INT_MAX #define INVALID_Y INT_MAX #define INVALID_Z INT_MAX #define INVALID_W INT_MAX struct wsmouse_softc { struct wsevsrc sc_base; const struct wsmouse_accessops *sc_accessops; void *sc_accesscookie; u_int sc_mb; /* mouse button state */ u_int sc_ub; /* user button state */ int sc_dx; /* delta-x */ int sc_dy; /* delta-y */ int sc_dz; /* delta-z */ int sc_dw; /* delta-w */ int sc_x; /* absolute-x */ int sc_y; /* absolute-y */ int sc_z; /* absolute-z */ int sc_w; /* absolute-w */ int sc_refcnt; u_char sc_dying; /* device is being detached */ struct wsmouse_repeat sc_repeat; int sc_repeat_button; callout_t sc_repeat_callout; unsigned int sc_repeat_delay; int sc_reverse_scroll; int sc_horiz_scroll_dist; int sc_vert_scroll_dist; }; static int wsmouse_match(device_t, cfdata_t, void *); static void wsmouse_attach(device_t, device_t, void *); static int wsmouse_detach(device_t, int); static int wsmouse_activate(device_t, enum devact); static int wsmouse_set_params(struct wsmouse_softc *, struct wsmouse_param *, size_t); static int wsmouse_get_params(struct wsmouse_softc *, struct wsmouse_param *, size_t); static int wsmouse_handle_params(struct wsmouse_softc *, struct wsmouse_parameters *, bool); static int wsmouse_do_ioctl(struct wsmouse_softc *, u_long, void *, int, struct lwp *); #if NWSMUX > 0 static int wsmouse_mux_open(struct wsevsrc *, struct wseventvar *); static int wsmouse_mux_close(struct wsevsrc *); #endif static int wsmousedoioctl(device_t, u_long, void *, int, struct lwp *); static int wsmousedoopen(struct wsmouse_softc *, struct wseventvar *); CFATTACH_DECL_NEW(wsmouse, sizeof (struct wsmouse_softc), wsmouse_match, wsmouse_attach, wsmouse_detach, wsmouse_activate); static void wsmouse_repeat(void *v); dev_type_open(wsmouseopen); dev_type_close(wsmouseclose); dev_type_read(wsmouseread); dev_type_ioctl(wsmouseioctl); dev_type_poll(wsmousepoll); dev_type_kqfilter(wsmousekqfilter); const struct cdevsw wsmouse_cdevsw = { .d_open = wsmouseopen, .d_close = wsmouseclose, .d_read = wsmouseread, .d_write = nowrite, .d_ioctl = wsmouseioctl, .d_stop = nostop, .d_tty = notty, .d_poll = wsmousepoll, .d_mmap = nommap, .d_kqfilter = wsmousekqfilter, .d_discard = nodiscard, .d_flag = D_OTHER }; #if NWSMUX > 0 struct wssrcops wsmouse_srcops = { WSMUX_MOUSE, wsmouse_mux_open, wsmouse_mux_close, wsmousedoioctl, NULL, NULL }; #endif /* * Print function (for parent devices). 
*/ int wsmousedevprint(void *aux, const char *pnp) { if (pnp) aprint_normal("wsmouse at %s", pnp); return (UNCONF); } int wsmouse_match(device_t parent, cfdata_t match, void *aux) { return (1); } void wsmouse_attach(device_t parent, device_t self, void *aux) { struct wsmouse_softc *sc = device_private(self); struct wsmousedev_attach_args *ap = aux; #if NWSMUX > 0 int mux, error; #endif sc->sc_base.me_dv = self; sc->sc_accessops = ap->accessops; sc->sc_accesscookie = ap->accesscookie; /* Initialize button repeating. */ memset(&sc->sc_repeat, 0, sizeof(sc->sc_repeat)); sc->sc_repeat_button = -1; sc->sc_repeat_delay = 0; sc->sc_reverse_scroll = 0; sc->sc_horiz_scroll_dist = WSMOUSE_DEFAULT_SCROLL_DIST; sc->sc_vert_scroll_dist = WSMOUSE_DEFAULT_SCROLL_DIST; callout_init(&sc->sc_repeat_callout, 0); callout_setfunc(&sc->sc_repeat_callout, wsmouse_repeat, sc); #if NWSMUX > 0 sc->sc_base.me_ops = &wsmouse_srcops; mux = device_cfdata(self)->wsmousedevcf_mux; if (mux >= 0) { error = wsmux_attach_sc(wsmux_getmux(mux), &sc->sc_base); if (error) aprint_error(" attach error=%d", error); else aprint_normal(" mux %d", mux); } #else if (device_cfdata(self)->wsmousedevcf_mux >= 0) aprint_normal(" (mux ignored)"); #endif aprint_naive("\n"); aprint_normal("\n"); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); } int wsmouse_activate(device_t self, enum devact act) { struct wsmouse_softc *sc = device_private(self); if (act == DVACT_DEACTIVATE) sc->sc_dying = 1; return (0); } /* * Detach a mouse. To keep track of users of the softc we keep * a reference count that's incremented while inside, e.g., read. * If the mouse is active and the reference count is > 0 (0 is the * normal state) we post an event and then wait for the process * that had the reference to wake us up again. Then we blow away the * vnode and return (which will deallocate the softc). */ int wsmouse_detach(device_t self, int flags) { struct wsmouse_softc *sc = device_private(self); struct wseventvar *evar; int maj, mn; int s; #if NWSMUX > 0 /* Tell parent mux we're leaving. */ if (sc->sc_base.me_parent != NULL) { DPRINTF(("wsmouse_detach:\n")); wsmux_detach_sc(&sc->sc_base); } #endif /* If we're open ... */ evar = sc->sc_base.me_evp; if (evar != NULL && evar->io != NULL) { s = spltty(); if (--sc->sc_refcnt >= 0) { struct wscons_event event; /* Wake everyone by generating a dummy event. */ event.type = 0; event.value = 0; if (wsevent_inject(evar, &event, 1) != 0) wsevent_wakeup(evar); /* Wait for processes to go away. */ if (tsleep(sc, PZERO, "wsmdet", hz * 60)) printf("wsmouse_detach: %s didn't detach\n", device_xname(self)); } splx(s); } /* locate the major number */ maj = cdevsw_lookup_major(&wsmouse_cdevsw); /* Nuke the vnodes for any open instances (calls close). */ mn = device_unit(self); vdevgone(maj, mn, mn, VCHR); return (0); } void wsmouse_input(device_t wsmousedev, u_int btns /* 0 is up */, int x, int y, int z, int w, u_int flags) { struct wsmouse_softc *sc = device_private(wsmousedev); struct wseventvar *evar; int mb, ub, d, nevents; /* one for each dimension (4) + a bit for each button */ struct wscons_event events[4 + sizeof(d) * 8]; KERNEL_LOCK(1, NULL); /* * Discard input if not open. 
*/ evar = sc->sc_base.me_evp; if (evar == NULL) goto out; #ifdef DIAGNOSTIC if (evar->q == NULL) { printf("wsmouse_input: evar->q=NULL\n"); goto out; } #endif #if NWSMUX > 0 DPRINTFN(5,("wsmouse_input: %s mux=%p, evar=%p\n", device_xname(sc->sc_base.me_dv), sc->sc_base.me_parent, evar)); #endif sc->sc_mb = btns; if (!(flags & WSMOUSE_INPUT_ABSOLUTE_X)) sc->sc_dx += x; if (!(flags & WSMOUSE_INPUT_ABSOLUTE_Y)) sc->sc_dy += y; if (!(flags & WSMOUSE_INPUT_ABSOLUTE_Z)) sc->sc_dz += z; if (!(flags & WSMOUSE_INPUT_ABSOLUTE_W)) sc->sc_dw += w; /* * We have at least one event (mouse button, delta-X, or * delta-Y; possibly all three, and possibly three separate * button events). Deliver these events until we are out * of changes or out of room. As events get delivered, * mark them `unchanged'. */ ub = sc->sc_ub; nevents = 0; if (flags & WSMOUSE_INPUT_ABSOLUTE_X) { if (sc->sc_x != x) { events[nevents].type = WSCONS_EVENT_MOUSE_ABSOLUTE_X; events[nevents].value = x; nevents++; } } else { if (sc->sc_dx) { events[nevents].type = WSCONS_EVENT_MOUSE_DELTA_X; events[nevents].value = sc->sc_dx; nevents++; } } if (flags & WSMOUSE_INPUT_ABSOLUTE_Y) { if (sc->sc_y != y) { events[nevents].type = WSCONS_EVENT_MOUSE_ABSOLUTE_Y; events[nevents].value = y; nevents++; } } else { if (sc->sc_dy) { events[nevents].type = WSCONS_EVENT_MOUSE_DELTA_Y; events[nevents].value = sc->sc_dy; nevents++; } } if (flags & WSMOUSE_INPUT_ABSOLUTE_Z) { if (sc->sc_z != z) { events[nevents].type = WSCONS_EVENT_MOUSE_ABSOLUTE_Z; events[nevents].value = z; nevents++; } } else { if (sc->sc_dz) { events[nevents].type = WSCONS_EVENT_MOUSE_DELTA_Z; events[nevents].value = sc->sc_dz; nevents++; } } if (flags & WSMOUSE_INPUT_ABSOLUTE_W) { if (sc->sc_w != w) { events[nevents].type = WSCONS_EVENT_MOUSE_ABSOLUTE_W; events[nevents].value = w; nevents++; } } else { if (sc->sc_dw) { events[nevents].type = WSCONS_EVENT_MOUSE_DELTA_W; events[nevents].value = sc->sc_dw; nevents++; } } mb = sc->sc_mb; while ((d = mb ^ ub) != 0) { int btnno; /* * Cancel button repeating if button status changed. */ if (sc->sc_repeat_button != -1) { KASSERT(sc->sc_repeat_button >= 0); KASSERT(sc->sc_repeat.wr_buttons & (1 << sc->sc_repeat_button)); ub &= ~(1 << sc->sc_repeat_button); sc->sc_repeat_button = -1; callout_stop(&sc->sc_repeat_callout); } /* * Mouse button change. Find the first change and drop * it into the event queue. */ btnno = ffs(d) - 1; KASSERT(btnno >= 0); if (nevents >= __arraycount(events)) { aprint_error_dev(sc->sc_base.me_dv, "Event queue full (button status mb=0x%x" " ub=0x%x)\n", mb, ub); break; } events[nevents].type = (mb & d) ? WSCONS_EVENT_MOUSE_DOWN : WSCONS_EVENT_MOUSE_UP; events[nevents].value = btnno; nevents++; ub ^= (1 << btnno); /* * Program button repeating if configured for this button. */ if ((mb & d) && (sc->sc_repeat.wr_buttons & (1 << btnno)) && sc->sc_repeat.wr_delay_first > 0) { sc->sc_repeat_button = btnno; sc->sc_repeat_delay = sc->sc_repeat.wr_delay_first; callout_schedule(&sc->sc_repeat_callout, mstohz(sc->sc_repeat_delay)); } } if (nevents == 0 || wsevent_inject(evar, events, nevents) == 0) { /* All events were correctly injected into the queue. * Synchronize the mouse's status with what the user * has received. 
*/ sc->sc_x = x; sc->sc_dx = 0; sc->sc_y = y; sc->sc_dy = 0; sc->sc_z = z; sc->sc_dz = 0; sc->sc_w = w; sc->sc_dw = 0; sc->sc_ub = ub; #if NWSMUX > 0 DPRINTFN(5,("wsmouse_input: %s wakeup evar=%p\n", device_xname(sc->sc_base.me_dv), evar)); #endif } out: KERNEL_UNLOCK_ONE(NULL); } void wsmouse_precision_scroll(device_t wsmousedev, int x, int y) { struct wsmouse_softc *sc = device_private(wsmousedev); struct wseventvar *evar; struct wscons_event events[2]; int nevents = 0; evar = sc->sc_base.me_evp; if (evar == NULL) return; if (sc->sc_reverse_scroll) { x = -x; y = -y; } x = (x * 4096) / sc->sc_horiz_scroll_dist; y = (y * 4096) / sc->sc_vert_scroll_dist; if (x != 0) { events[nevents].type = WSCONS_EVENT_HSCROLL; events[nevents].value = x; nevents++; } if (y != 0) { events[nevents].type = WSCONS_EVENT_VSCROLL; events[nevents].value = y; nevents++; } (void)wsevent_inject(evar, events, nevents); } static void wsmouse_repeat(void *v) { int oldspl; unsigned int newdelay; struct wsmouse_softc *sc; struct wscons_event events[2]; oldspl = spltty(); sc = (struct wsmouse_softc *)v; if (sc->sc_repeat_button == -1) { /* Race condition: a "button up" event came in when * this function was already called but did not do * spltty() yet. */ splx(oldspl); return; } KASSERT(sc->sc_repeat_button >= 0); KASSERT(sc->sc_repeat.wr_buttons & (1 << sc->sc_repeat_button)); newdelay = sc->sc_repeat_delay; events[0].type = WSCONS_EVENT_MOUSE_UP; events[0].value = sc->sc_repeat_button; events[1].type = WSCONS_EVENT_MOUSE_DOWN; events[1].value = sc->sc_repeat_button; if (wsevent_inject(sc->sc_base.me_evp, events, 2) == 0) { sc->sc_ub = 1 << sc->sc_repeat_button; if (newdelay - sc->sc_repeat.wr_delay_decrement < sc->sc_repeat.wr_delay_minimum) newdelay = sc->sc_repeat.wr_delay_minimum; else if (newdelay > sc->sc_repeat.wr_delay_minimum) newdelay -= sc->sc_repeat.wr_delay_decrement; KASSERT(newdelay >= sc->sc_repeat.wr_delay_minimum); KASSERT(newdelay <= sc->sc_repeat.wr_delay_first); } /* * Reprogram the repeating event. 
*/ sc->sc_repeat_delay = newdelay; callout_schedule(&sc->sc_repeat_callout, mstohz(newdelay)); splx(oldspl); } static int wsmouse_set_params(struct wsmouse_softc *sc, struct wsmouse_param *buf, size_t nparams) { size_t i = 0; for (i = 0; i < nparams; ++i) { switch (buf[i].key) { case WSMOUSECFG_REVERSE_SCROLLING: sc->sc_reverse_scroll = (buf[i].value != 0); break; case WSMOUSECFG_HORIZSCROLLDIST: sc->sc_horiz_scroll_dist = buf[i].value; break; case WSMOUSECFG_VERTSCROLLDIST: sc->sc_vert_scroll_dist = buf[i].value; break; } } return 0; } static int wsmouse_get_params(struct wsmouse_softc *sc, struct wsmouse_param *buf, size_t nparams) { size_t i = 0; for (i = 0; i < nparams; ++i) { switch (buf[i].key) { case WSMOUSECFG_REVERSE_SCROLLING: buf[i].value = sc->sc_reverse_scroll; break; case WSMOUSECFG_HORIZSCROLLDIST: buf[i].value = sc->sc_horiz_scroll_dist; break; case WSMOUSECFG_VERTSCROLLDIST: buf[i].value = sc->sc_vert_scroll_dist; break; } } return 0; } static int wsmouse_handle_params(struct wsmouse_softc *sc, struct wsmouse_parameters *upl, bool set) { size_t len; struct wsmouse_param *buf; int error = 0; if (upl->params == NULL || upl->nparams > WSMOUSECFG_MAX) return EINVAL; if (upl->nparams == 0) return 0; len = upl->nparams * sizeof(struct wsmouse_param); buf = kmem_alloc(len, KM_SLEEP); if (buf == NULL) return ENOMEM; if ((error = copyin(upl->params, buf, len)) != 0) goto error; if (set) { error = wsmouse_set_params(sc, buf, upl->nparams); if (error != 0) goto error; } else { error = wsmouse_get_params(sc, buf, upl->nparams); if (error != 0) goto error; if ((error = copyout(buf, upl->params, len)) != 0) goto error; } error: kmem_free(buf, len); return error; } int wsmouseopen(dev_t dev, int flags, int mode, struct lwp *l) { struct wsmouse_softc *sc; struct wseventvar *evar; int error; sc = device_lookup_private(&wsmouse_cd, minor(dev)); if (sc == NULL) return ENXIO; #if NWSMUX > 0 DPRINTF(("wsmouseopen: %s mux=%p p=%p\n", device_xname(sc->sc_base.me_dv), sc->sc_base.me_parent, l)); #endif if (sc->sc_dying) return (EIO); if ((flags & (FREAD | FWRITE)) == FWRITE) return (0); /* always allow open for write so ioctl() is possible. */ if (sc->sc_base.me_evp != NULL) return (EBUSY); evar = &sc->sc_base.me_evar; wsevent_init(evar, l->l_proc); sc->sc_base.me_evp = evar; error = wsmousedoopen(sc, evar); if (error) { DPRINTF(("wsmouseopen: %s open failed\n", device_xname(sc->sc_base.me_dv))); sc->sc_base.me_evp = NULL; wsevent_fini(evar); } return (error); } int wsmouseclose(dev_t dev, int flags, int mode, struct lwp *l) { struct wsmouse_softc *sc = device_lookup_private(&wsmouse_cd, minor(dev)); struct wseventvar *evar = sc->sc_base.me_evp; if (evar == NULL) /* not open for read */ return (0); sc->sc_base.me_evp = NULL; (*sc->sc_accessops->disable)(sc->sc_accesscookie); wsevent_fini(evar); return (0); } int wsmousedoopen(struct wsmouse_softc *sc, struct wseventvar *evp) { sc->sc_base.me_evp = evp; sc->sc_x = INVALID_X; sc->sc_y = INVALID_Y; sc->sc_z = INVALID_Z; sc->sc_w = INVALID_W; /* Stop button repeating when messing with the device. 
*/ if (sc->sc_repeat_button != -1) { KASSERT(sc->sc_repeat_button >= 0); sc->sc_repeat_button = -1; callout_stop(&sc->sc_repeat_callout); } /* enable the device, and punt if that's not possible */ return (*sc->sc_accessops->enable)(sc->sc_accesscookie); } int wsmouseread(dev_t dev, struct uio *uio, int flags) { struct wsmouse_softc *sc = device_lookup_private(&wsmouse_cd, minor(dev)); int error; if (sc->sc_dying) return (EIO); #ifdef DIAGNOSTIC if (sc->sc_base.me_evp == NULL) { printf("wsmouseread: evp == NULL\n"); return (EINVAL); } #endif sc->sc_refcnt++; error = wsevent_read(sc->sc_base.me_evp, uio, flags); if (--sc->sc_refcnt < 0) { wakeup(sc); error = EIO; } return (error); } int wsmouseioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { return (wsmousedoioctl(device_lookup(&wsmouse_cd, minor(dev)), cmd, data, flag, l)); } /* A wrapper around the ioctl() workhorse to make reference counting easy. */ int wsmousedoioctl(device_t dv, u_long cmd, void *data, int flag, struct lwp *l) { struct wsmouse_softc *sc = device_private(dv); int error; sc->sc_refcnt++; error = wsmouse_do_ioctl(sc, cmd, data, flag, l); if (--sc->sc_refcnt < 0) wakeup(sc); return (error); } int wsmouse_do_ioctl(struct wsmouse_softc *sc, u_long cmd, void *data, int flag, struct lwp *l) { int error; struct wsmouse_repeat *wr; if (sc->sc_dying) return (EIO); /* * Try the generic ioctls that the wsmouse interface supports. */ switch (cmd) { case FIONBIO: /* we will remove this someday (soon???) */ return (0); case FIOASYNC: if (sc->sc_base.me_evp == NULL) return (EINVAL); sc->sc_base.me_evp->async = *(int *)data != 0; return (0); case FIOSETOWN: if (sc->sc_base.me_evp == NULL) return (EINVAL); if (-*(int *)data != sc->sc_base.me_evp->io->p_pgid && *(int *)data != sc->sc_base.me_evp->io->p_pid) return (EPERM); return (0); case TIOCSPGRP: if (sc->sc_base.me_evp == NULL) return (EINVAL); if (*(int *)data != sc->sc_base.me_evp->io->p_pgid) return (EPERM); return (0); } /* * Try the wsmouse specific ioctls. */ switch (cmd) { case WSMOUSEIO_GETREPEAT: wr = (struct wsmouse_repeat *)data; memcpy(wr, &sc->sc_repeat, sizeof(sc->sc_repeat)); return 0; case WSMOUSEIO_SETREPEAT: if ((flag & FWRITE) == 0) return EACCES; /* Validate input data. */ wr = (struct wsmouse_repeat *)data; if (wr->wr_delay_first != 0 && (wr->wr_delay_first < wr->wr_delay_decrement || wr->wr_delay_first < wr->wr_delay_minimum || wr->wr_delay_first < wr->wr_delay_minimum + wr->wr_delay_decrement)) return EINVAL; /* Stop current repeating and set new data. */ sc->sc_repeat_button = -1; callout_stop(&sc->sc_repeat_callout); memcpy(&sc->sc_repeat, wr, sizeof(sc->sc_repeat)); return 0; case WSMOUSEIO_SETVERSION: return wsevent_setversion(sc->sc_base.me_evp, *(int *)data); case WSMOUSEIO_GETPARAMS: return wsmouse_handle_params(sc, (struct wsmouse_parameters *)data, false); case WSMOUSEIO_SETPARAMS: if ((flag & FWRITE) == 0) return EACCES; return wsmouse_handle_params(sc, (struct wsmouse_parameters *)data, true); } /* * Try the mouse driver for WSMOUSEIO ioctls. It returns -1 * if it didn't recognize the request. 
*/ error = (*sc->sc_accessops->ioctl)(sc->sc_accesscookie, cmd, data, flag, l); return (error); /* may be EPASSTHROUGH */ } int wsmousepoll(dev_t dev, int events, struct lwp *l) { struct wsmouse_softc *sc = device_lookup_private(&wsmouse_cd, minor(dev)); if (sc->sc_base.me_evp == NULL) return (POLLERR); return (wsevent_poll(sc->sc_base.me_evp, events, l)); } int wsmousekqfilter(dev_t dev, struct knote *kn) { struct wsmouse_softc *sc = device_lookup_private(&wsmouse_cd, minor(dev)); if (sc->sc_base.me_evp == NULL) return (1); return (wsevent_kqfilter(sc->sc_base.me_evp, kn)); } #if NWSMUX > 0 int wsmouse_mux_open(struct wsevsrc *me, struct wseventvar *evp) { struct wsmouse_softc *sc = (struct wsmouse_softc *)me; if (sc->sc_base.me_evp != NULL) return (EBUSY); return wsmousedoopen(sc, evp); } int wsmouse_mux_close(struct wsevsrc *me) { struct wsmouse_softc *sc = (struct wsmouse_softc *)me; sc->sc_base.me_evp = NULL; (*sc->sc_accessops->disable)(sc->sc_accesscookie); return (0); } int wsmouse_add_mux(int unit, struct wsmux_softc *muxsc) { struct wsmouse_softc *sc; sc = device_lookup_private(&wsmouse_cd, unit); if (sc == NULL) return ENXIO; if (sc->sc_base.me_parent != NULL || sc->sc_base.me_evp != NULL) return (EBUSY); return (wsmux_attach_sc(muxsc, &sc->sc_base)); } #endif /* NWSMUX > 0 */
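/*
 * Illustrative sketch (not part of the original source): configuring the
 * button auto-repeat machinery above from userland via WSMOUSEIO_SETREPEAT.
 * The device path is only an example.  The driver deliberately allows a
 * write-only open so that ioctl() works, and the values below satisfy the
 * checks in wsmouse_do_ioctl(): a nonzero wr_delay_first must be at least
 * wr_delay_minimum + wr_delay_decrement (all delays are in milliseconds,
 * cf. the mstohz() calls in wsmouse_input() and wsmouse_repeat()).
 */
#include <sys/ioctl.h>
#include <dev/wscons/wsconsio.h>

#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct wsmouse_repeat wr;
	int fd;

	fd = open("/dev/wsmouse0", O_WRONLY);	/* example device node */
	if (fd == -1)
		err(1, "open");

	memset(&wr, 0, sizeof(wr));
	wr.wr_buttons = 1 << 0;		/* repeat the first (left) button only */
	wr.wr_delay_first = 400;	/* 400 ms before the first repeat */
	wr.wr_delay_decrement = 50;	/* each repeat comes 50 ms sooner... */
	wr.wr_delay_minimum = 100;	/* ...but never below 100 ms */

	if (ioctl(fd, WSMOUSEIO_SETREPEAT, &wr) == -1)
		err(1, "WSMOUSEIO_SETREPEAT");
	close(fd);
	return 0;
}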
/* $NetBSD: wd.c,v 1.469 2024/02/05 21:46:06 andvar Exp $ */ /* * Copyright (c) 1998, 2001 Manuel Bouyer. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1998, 2003, 2004 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum and by Onno van der Linden. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: wd.c,v 1.469 2024/02/05 21:46:06 andvar Exp $"); #include "opt_ata.h" #include "opt_wd.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/conf.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/uio.h> #include <sys/device.h> #include <sys/disklabel.h> #include <sys/disk.h> #include <sys/syslog.h> #include <sys/proc.h> #include <sys/reboot.h> #include <sys/vnode.h> #include <sys/rndsource.h> #include <sys/intr.h> #include <sys/bus.h> #include <dev/ata/atareg.h> #include <dev/ata/atavar.h> #include <dev/ata/wdvar.h> #include <dev/ic/wdcreg.h> #include <sys/ataio.h> #include "locators.h" #include <prop/proplib.h> #define WDIORETRIES_SINGLE 4 /* number of retries for single-sector */ #define WDIORETRIES 5 /* number of retries before giving up */ #define RECOVERYTIME hz/2 /* time to wait before retrying a cmd */ #define WDUNIT(dev) DISKUNIT(dev) #define WDPART(dev) DISKPART(dev) #define WDMINOR(unit, part) DISKMINOR(unit, part) #define MAKEWDDEV(maj, unit, part) MAKEDISKDEV(maj, unit, part) #define WDLABELDEV(dev) (MAKEWDDEV(major(dev), WDUNIT(dev), RAW_PART)) #define DEBUG_FUNCS 0x08 #define DEBUG_PROBE 0x10 #define DEBUG_DETACH 0x20 #define DEBUG_XFERS 0x40 #ifdef ATADEBUG #ifndef ATADEBUG_WD_MASK #define ATADEBUG_WD_MASK 0x0 #endif int wdcdebug_wd_mask = ATADEBUG_WD_MASK; #define ATADEBUG_PRINT(args, level) \ if (wdcdebug_wd_mask & (level)) \ printf args #else #define ATADEBUG_PRINT(args, level) #endif static int wdprobe(device_t, cfdata_t, void *); static void wdattach(device_t, device_t, void *); static int wddetach(device_t, int); static void wdperror(const struct wd_softc *, struct ata_xfer *); static void wdminphys(struct buf *); static int wd_firstopen(device_t, dev_t, int, int); static int wd_lastclose(device_t); static bool wd_suspend(device_t, const pmf_qual_t *); static int wd_standby(struct wd_softc *, int); CFATTACH_DECL3_NEW(wd, sizeof(struct wd_softc), wdprobe, wdattach, wddetach, NULL, NULL, NULL, DVF_DETACH_SHUTDOWN); extern struct cfdriver wd_cd; static dev_type_open(wdopen); static dev_type_close(wdclose); static dev_type_read(wdread); static dev_type_write(wdwrite); static dev_type_ioctl(wdioctl); static dev_type_strategy(wdstrategy); static dev_type_dump(wddump); static dev_type_size(wdsize); static dev_type_discard(wddiscard); const struct bdevsw wd_bdevsw = { .d_open = wdopen, .d_close = wdclose, .d_strategy = wdstrategy, .d_ioctl = wdioctl, .d_dump = wddump, .d_psize = wdsize, .d_discard = wddiscard, .d_cfdriver = &wd_cd, .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK }; const struct cdevsw wd_cdevsw = { .d_open = wdopen, .d_close = wdclose, .d_read = wdread, .d_write = wdwrite, .d_ioctl = wdioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = wddiscard, .d_cfdriver = &wd_cd, .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK }; /* #define WD_DUMP_NOT_TRUSTED if you just want to watch */ static int wddoingadump = 0; static int wddumprecalibrated = 0; /* * Glue necessary to hook WDCIOCCOMMAND into physio */ struct wd_ioctl { LIST_ENTRY(wd_ioctl) wi_list; struct buf wi_bp; struct uio wi_uio; struct iovec wi_iov; atareq_t wi_atareq; struct wd_softc *wi_softc; }; static struct wd_ioctl *wi_find(struct buf *); static void wi_free(struct wd_ioctl *); static struct wd_ioctl *wi_get(struct wd_softc *); static void wdioctlstrategy(struct buf *); 
static void wdrestart(void *); static void wdstart1(struct wd_softc *, struct buf *, struct ata_xfer *); static int wd_diskstart(device_t, struct buf *); static int wd_dumpblocks(device_t, void *, daddr_t, int); static void wd_iosize(device_t, int *); static int wd_discard(device_t, off_t, off_t); static void wdbioretry(void *); static void wdbiorequeue(void *); static void wddone(device_t, struct ata_xfer *); static int wd_get_params(struct wd_softc *, struct ataparams *); static void wd_set_geometry(struct wd_softc *); static int wd_flushcache(struct wd_softc *, int); static int wd_trim(struct wd_softc *, daddr_t, long); static bool wd_shutdown(device_t, int); static int wd_getcache(struct wd_softc *, int *); static int wd_setcache(struct wd_softc *, int); static void wd_sysctl_attach(struct wd_softc *); static void wd_sysctl_detach(struct wd_softc *); static const struct dkdriver wddkdriver = { .d_open = wdopen, .d_close = wdclose, .d_strategy = wdstrategy, .d_minphys = wdminphys, .d_diskstart = wd_diskstart, .d_dumpblocks = wd_dumpblocks, .d_iosize = wd_iosize, .d_firstopen = wd_firstopen, .d_lastclose = wd_lastclose, .d_discard = wd_discard }; #ifdef HAS_BAD144_HANDLING static void bad144intern(struct wd_softc *); #endif #define WD_QUIRK_SPLIT_MOD15_WRITE 0x0001 /* must split certain writes */ #define WD_QUIRK_FMT "\20\1SPLIT_MOD15_WRITE" /* * Quirk table for IDE drives. Put more-specific matches first, since * a simple globing routine is used for matching. */ static const struct wd_quirk { const char *wdq_match; /* inquiry pattern to match */ int wdq_quirks; /* drive quirks */ } wd_quirk_table[] = { /* * Some Seagate S-ATA drives have a PHY which can get confused * with the way data is packetized by some S-ATA controllers. * * The work-around is to split in two any write transfer whose * sector count % 15 == 1 (assuming 512 byte sectors). * * XXX This is an incomplete list. There are at least a couple * XXX more model numbers. If you have trouble with such transfers * XXX (8K is the most common) on Seagate S-ATA drives, please * XXX notify thorpej@NetBSD.org. * * The ST360015AS has not yet been confirmed to have this * issue, however, it is the only other drive in the * Seagate Barracuda Serial ATA V family. * */ { "ST3120023AS", WD_QUIRK_SPLIT_MOD15_WRITE }, { "ST380023AS", WD_QUIRK_SPLIT_MOD15_WRITE }, { "ST360015AS", WD_QUIRK_SPLIT_MOD15_WRITE }, { NULL, 0 } }; static const struct wd_quirk * wd_lookup_quirks(const char *name) { const struct wd_quirk *wdq; const char *estr; for (wdq = wd_quirk_table; wdq->wdq_match != NULL; wdq++) { /* * We only want exact matches (which include matches * against globbing characters). 
*/ if (pmatch(name, wdq->wdq_match, &estr) == 2) return (wdq); } return (NULL); } static int wdprobe(device_t parent, cfdata_t match, void *aux) { struct ata_device *adev = aux; if (adev == NULL) return 0; if (adev->adev_bustype->bustype_type != SCSIPI_BUSTYPE_ATA) return 0; if (match->cf_loc[ATA_HLCF_DRIVE] != ATA_HLCF_DRIVE_DEFAULT && match->cf_loc[ATA_HLCF_DRIVE] != adev->adev_drv_data->drive) return 0; return 1; } static void wdattach(device_t parent, device_t self, void *aux) { struct wd_softc *wd = device_private(self); struct dk_softc *dksc = &wd->sc_dksc; struct ata_device *adev= aux; int i, blank; char tbuf[41],pbuf[9], c, *p, *q; const struct wd_quirk *wdq; int dtype = DKTYPE_UNKNOWN; dksc->sc_dev = self; ATADEBUG_PRINT(("wdattach\n"), DEBUG_FUNCS | DEBUG_PROBE); mutex_init(&wd->sc_lock, MUTEX_DEFAULT, IPL_BIO); #ifdef WD_SOFTBADSECT SLIST_INIT(&wd->sc_bslist); cv_init(&wd->sc_bslist_cv, "wdbadsect"); #endif wd->atabus = adev->adev_bustype; wd->inflight = 0; wd->drvp = adev->adev_drv_data; wd->drvp->drv_openings = 1; wd->drvp->drv_done = wddone; wd->drvp->drv_softc = dksc->sc_dev; /* done in atabusconfig_thread() but too late */ SLIST_INIT(&wd->sc_retry_list); SLIST_INIT(&wd->sc_requeue_list); callout_init(&wd->sc_retry_callout, 0); /* XXX MPSAFE */ callout_init(&wd->sc_requeue_callout, 0); /* XXX MPSAFE */ callout_init(&wd->sc_restart_diskqueue, 0); /* XXX MPSAFE */ aprint_naive("\n"); aprint_normal("\n"); /* read our drive info */ if (wd_get_params(wd, &wd->sc_params) != 0) { aprint_error_dev(self, "IDENTIFY failed\n"); goto out; } for (blank = 0, p = wd->sc_params.atap_model, q = tbuf, i = 0; i < sizeof(wd->sc_params.atap_model); i++) { c = *p++; if (c == '\0') break; if (c != ' ') { if (blank) { *q++ = ' '; blank = 0; } *q++ = c; } else blank = 1; } *q++ = '\0'; wd->sc_typename = kmem_asprintf("%s", tbuf); aprint_normal_dev(self, "<%s>\n", wd->sc_typename); wdq = wd_lookup_quirks(tbuf); if (wdq != NULL) wd->sc_quirks = wdq->wdq_quirks; if (wd->sc_quirks != 0) { char sbuf[sizeof(WD_QUIRK_FMT) + 64]; snprintb(sbuf, sizeof(sbuf), WD_QUIRK_FMT, wd->sc_quirks); aprint_normal_dev(self, "quirks %s\n", sbuf); if (wd->sc_quirks & WD_QUIRK_SPLIT_MOD15_WRITE) { aprint_error_dev(self, "drive corrupts write transfers with certain controllers, consider replacing\n"); } } if ((wd->sc_params.atap_multi & 0xff) > 1) { wd->drvp->multi = wd->sc_params.atap_multi & 0xff; } else { wd->drvp->multi = 1; } aprint_verbose_dev(self, "drive supports %d-sector PIO transfers,", wd->drvp->multi); /* 48-bit LBA addressing */ if ((wd->sc_params.atap_cmd2_en & ATA_CMD2_LBA48) != 0) wd->sc_flags |= WDF_LBA48; /* Prior to ATA-4, LBA was optional. */ if ((wd->sc_params.atap_capabilities1 & WDC_CAP_LBA) != 0) wd->sc_flags |= WDF_LBA; #if 0 /* ATA-4 requires LBA. */ if (wd->sc_params.atap_ataversion != 0xffff && wd->sc_params.atap_ataversion >= WDC_VER_ATA4) wd->sc_flags |= WDF_LBA; #endif if ((wd->sc_flags & WDF_LBA48) != 0) { aprint_verbose(" LBA48 addressing\n"); wd->sc_capacity = ((uint64_t) wd->sc_params.atap_max_lba[3] << 48) | ((uint64_t) wd->sc_params.atap_max_lba[2] << 32) | ((uint64_t) wd->sc_params.atap_max_lba[1] << 16) | ((uint64_t) wd->sc_params.atap_max_lba[0] << 0); wd->sc_capacity28 = (wd->sc_params.atap_capacity[1] << 16) | wd->sc_params.atap_capacity[0]; /* * Force LBA48 addressing for invalid numbers. 
*/ if (wd->sc_capacity28 > 0xfffffff) wd->sc_capacity28 = 0xfffffff; } else if ((wd->sc_flags & WDF_LBA) != 0) { aprint_verbose(" LBA addressing\n"); wd->sc_capacity28 = (wd->sc_params.atap_capacity[1] << 16) | wd->sc_params.atap_capacity[0]; /* * Limit capacity to LBA28 numbers to avoid overflow. */ if (wd->sc_capacity28 > 0xfffffff) wd->sc_capacity28 = 0xfffffff; wd->sc_capacity = wd->sc_capacity28; } else { aprint_verbose(" chs addressing\n"); wd->sc_capacity = wd->sc_params.atap_cylinders * wd->sc_params.atap_heads * wd->sc_params.atap_sectors; /* * LBA28 size is ignored for CHS addressing. Use a reasonable * value for debugging. The CHS values may be artificial and * are mostly ignored. */ if (wd->sc_capacity < 0xfffffff) wd->sc_capacity28 = wd->sc_capacity; else wd->sc_capacity28 = 0xfffffff; } if ((wd->sc_params.atap_secsz & ATA_SECSZ_VALID_MASK) == ATA_SECSZ_VALID && ((wd->sc_params.atap_secsz & ATA_SECSZ_LLS) != 0)) { wd->sc_blksize = 2ULL * ((uint32_t)((wd->sc_params.atap_lls_secsz[1] << 16) | wd->sc_params.atap_lls_secsz[0])); } else { wd->sc_blksize = 512; } wd->sc_sectoralign.dsa_firstaligned = 0; wd->sc_sectoralign.dsa_alignment = 1; if ((wd->sc_params.atap_secsz & ATA_SECSZ_VALID_MASK) == ATA_SECSZ_VALID && ((wd->sc_params.atap_secsz & ATA_SECSZ_LPS) != 0)) { wd->sc_sectoralign.dsa_alignment = 1 << (wd->sc_params.atap_secsz & ATA_SECSZ_LPS_SZMSK); if ((wd->sc_params.atap_logical_align & ATA_LA_VALID_MASK) == ATA_LA_VALID) { wd->sc_sectoralign.dsa_firstaligned = (wd->sc_sectoralign.dsa_alignment - (wd->sc_params.atap_logical_align & ATA_LA_MASK)); } } wd->sc_capacity512 = (wd->sc_capacity * wd->sc_blksize) / DEV_BSIZE; format_bytes(pbuf, sizeof(pbuf), wd->sc_capacity * wd->sc_blksize); aprint_normal_dev(self, "%s, %d cyl, %d head, %d sec, " "%d bytes/sect x %llu sectors", pbuf, (wd->sc_flags & WDF_LBA) ? (int)(wd->sc_capacity / (wd->sc_params.atap_heads * wd->sc_params.atap_sectors)) : wd->sc_params.atap_cylinders, wd->sc_params.atap_heads, wd->sc_params.atap_sectors, wd->sc_blksize, (unsigned long long)wd->sc_capacity); if (wd->sc_sectoralign.dsa_alignment != 1) { aprint_normal(" (%d bytes/physsect", wd->sc_sectoralign.dsa_alignment * wd->sc_blksize); if (wd->sc_sectoralign.dsa_firstaligned != 0) { aprint_normal("; first aligned sector: %jd", (intmax_t)wd->sc_sectoralign.dsa_firstaligned); } aprint_normal(")"); } aprint_normal("\n"); ATADEBUG_PRINT(("%s: atap_dmatiming_mimi=%d, atap_dmatiming_recom=%d\n", device_xname(self), wd->sc_params.atap_dmatiming_mimi, wd->sc_params.atap_dmatiming_recom), DEBUG_PROBE); if (wd->sc_blksize <= 0 || !powerof2(wd->sc_blksize) || wd->sc_blksize < DEV_BSIZE || wd->sc_blksize > MAXPHYS) { aprint_normal_dev(self, "WARNING: block size %u " "might not actually work\n", wd->sc_blksize); } if (strcmp(wd->sc_params.atap_model, "ST506") == 0) dtype = DKTYPE_ST506; else dtype = DKTYPE_ESDI; out: /* * Initialize and attach the disk structure. */ dk_init(dksc, self, dtype); disk_init(&dksc->sc_dkdev, dksc->sc_xname, &wddkdriver); /* Attach dk and disk subsystems */ dk_attach(dksc); disk_attach(&dksc->sc_dkdev); wd_set_geometry(wd); bufq_alloc(&dksc->sc_bufq, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK); /* reference to label structure, used by ata code */ wd->drvp->lp = dksc->sc_dkdev.dk_label; /* Discover wedges on this disk. 
*/ dkwedge_discover(&dksc->sc_dkdev); if (!pmf_device_register1(self, wd_suspend, NULL, wd_shutdown)) aprint_error_dev(self, "couldn't establish power handler\n"); wd_sysctl_attach(wd); } static bool wd_suspend(device_t dv, const pmf_qual_t *qual) { struct wd_softc *sc = device_private(dv); /* the adapter needs to be enabled */ if (sc->atabus->ata_addref(sc->drvp)) return true; /* no need to complain */ wd_flushcache(sc, AT_WAIT); wd_standby(sc, AT_WAIT); sc->atabus->ata_delref(sc->drvp); return true; } static int wddetach(device_t self, int flags) { struct wd_softc *wd = device_private(self); struct dk_softc *dksc = &wd->sc_dksc; int bmaj, cmaj, i, mn, rc; if ((rc = disk_begindetach(&dksc->sc_dkdev, wd_lastclose, self, flags)) != 0) return rc; /* locate the major number */ bmaj = bdevsw_lookup_major(&wd_bdevsw); cmaj = cdevsw_lookup_major(&wd_cdevsw); /* Nuke the vnodes for any open instances. */ for (i = 0; i < MAXPARTITIONS; i++) { mn = WDMINOR(device_unit(self), i); vdevgone(bmaj, mn, mn, VBLK); vdevgone(cmaj, mn, mn, VCHR); } dk_drain(dksc); /* Kill off any pending commands. */ mutex_enter(&wd->sc_lock); wd->atabus->ata_killpending(wd->drvp); callout_halt(&wd->sc_retry_callout, &wd->sc_lock); callout_destroy(&wd->sc_retry_callout); callout_halt(&wd->sc_requeue_callout, &wd->sc_lock); callout_destroy(&wd->sc_requeue_callout); callout_halt(&wd->sc_restart_diskqueue, &wd->sc_lock); callout_destroy(&wd->sc_restart_diskqueue); mutex_exit(&wd->sc_lock); bufq_free(dksc->sc_bufq); /* Delete all of our wedges. */ dkwedge_delall(&dksc->sc_dkdev); if (flags & DETACH_POWEROFF) wd_standby(wd, AT_POLL); /* Detach from the disk list. */ disk_detach(&dksc->sc_dkdev); disk_destroy(&dksc->sc_dkdev); dk_detach(dksc); #ifdef WD_SOFTBADSECT /* Clean out the bad sector list */ while (!SLIST_EMPTY(&wd->sc_bslist)) { struct disk_badsectors *dbs = SLIST_FIRST(&wd->sc_bslist); SLIST_REMOVE_HEAD(&wd->sc_bslist, dbs_next); kmem_free(dbs, sizeof(*dbs)); } wd->sc_bscount = 0; #endif if (wd->sc_typename != NULL) { kmem_free(wd->sc_typename, strlen(wd->sc_typename) + 1); wd->sc_typename = NULL; } pmf_device_deregister(self); wd_sysctl_detach(wd); #ifdef WD_SOFTBADSECT KASSERT(SLIST_EMPTY(&wd->sc_bslist)); cv_destroy(&wd->sc_bslist_cv); #endif mutex_destroy(&wd->sc_lock); wd->drvp->drive_type = ATA_DRIVET_NONE; /* no drive any more here */ wd->drvp->drive_flags = 0; return (0); } /* * Read/write routine for a buffer. Validates the arguments and schedules the * transfer. Does not wait for the transfer to complete. */ static void wdstrategy(struct buf *bp) { struct wd_softc *wd = device_lookup_private(&wd_cd, WDUNIT(bp->b_dev)); struct dk_softc *dksc = &wd->sc_dksc; ATADEBUG_PRINT(("wdstrategy (%s)\n", dksc->sc_xname), DEBUG_XFERS); /* If device invalidated (e.g. media change, door open, * device detachment), then error. */ if ((wd->sc_flags & WDF_LOADED) == 0 || !device_is_enabled(dksc->sc_dev)) goto err; #ifdef WD_SOFTBADSECT /* * If the transfer about to be attempted contains only a block that * is known to be bad then return an error for the transfer without * even attempting to start a transfer up under the premis that we * will just end up doing more retries for a transfer that will end * up failing again. 
*/ if (__predict_false(!SLIST_EMPTY(&wd->sc_bslist))) { struct disklabel *lp = dksc->sc_dkdev.dk_label; struct disk_badsectors *dbs; daddr_t blkno, maxblk; /* convert the block number to absolute */ if (lp->d_secsize >= DEV_BSIZE) blkno = bp->b_blkno / (lp->d_secsize / DEV_BSIZE); else blkno = bp->b_blkno * (DEV_BSIZE / lp->d_secsize); if (WDPART(bp->b_dev) != RAW_PART) blkno += lp->d_partitions[WDPART(bp->b_dev)].p_offset; maxblk = blkno + (bp->b_bcount / wd->sc_blksize) - 1; mutex_enter(&wd->sc_lock); SLIST_FOREACH(dbs, &wd->sc_bslist, dbs_next) if ((dbs->dbs_min <= bp->b_rawblkno && bp->b_rawblkno <= dbs->dbs_max) || (dbs->dbs_min <= maxblk && maxblk <= dbs->dbs_max)){ mutex_exit(&wd->sc_lock); goto err; } mutex_exit(&wd->sc_lock); } #endif dk_strategy(dksc, bp); return; err: bp->b_error = EIO; bp->b_resid = bp->b_bcount; biodone(bp); } static void wdstart1(struct wd_softc *wd, struct buf *bp, struct ata_xfer *xfer) { struct dk_softc *dksc = &wd->sc_dksc; const uint32_t secsize = dksc->sc_dkdev.dk_geom.dg_secsize; KASSERT(bp == xfer->c_bio.bp || xfer->c_bio.bp == NULL); KASSERT((xfer->c_flags & (C_WAITACT|C_FREE)) == 0); KASSERT(mutex_owned(&wd->sc_lock)); /* Reset state, so that retries don't use stale info */ if (__predict_false(xfer->c_retries > 0)) { xfer->c_flags = 0; memset(&xfer->c_bio, 0, sizeof(xfer->c_bio)); } xfer->c_bio.blkno = bp->b_rawblkno; xfer->c_bio.bcount = bp->b_bcount; xfer->c_bio.databuf = bp->b_data; xfer->c_bio.blkdone = 0; xfer->c_bio.bp = bp; /* Adjust blkno and bcount if xfer has been already partially done */ if (__predict_false(xfer->c_skip > 0)) { KASSERT(xfer->c_skip < xfer->c_bio.bcount); KASSERT((xfer->c_skip % secsize) == 0); xfer->c_bio.bcount -= xfer->c_skip; xfer->c_bio.blkno += xfer->c_skip / secsize; } #ifdef WD_CHAOS_MONKEY /* * Override blkno to be over device capacity to trigger error, * but only if it's read, to avoid trashing disk contents should * the command be clipped, or otherwise misinterpreted, by the * driver or controller. */ if (BUF_ISREAD(bp) && xfer->c_retries == 0 && wd->drv_chaos_freq > 0 && (++wd->drv_chaos_cnt % wd->drv_chaos_freq) == 0) { device_printf(dksc->sc_dev, "%s: chaos xfer %"PRIxPTR"\n", __func__, (intptr_t)xfer & PAGE_MASK); xfer->c_bio.blkno = 7777777 + wd->sc_capacity; xfer->c_flags |= C_CHAOS; } #endif /* * If we're retrying, retry in single-sector mode. This will give us * the sector number of the problem, and will eventually allow the * transfer to succeed. If FUA is requested, we can't actually * do this, as ATA_SINGLE is usually executed as PIO transfer by drivers * which support it, and that isn't compatible with NCQ/FUA. */ if (xfer->c_retries >= WDIORETRIES_SINGLE && (bp->b_flags & B_MEDIA_FUA) == 0) xfer->c_bio.flags = ATA_SINGLE; else xfer->c_bio.flags = 0; /* * request LBA48 transfers when supported by the controller * and needed by transfer offset or size. */ if (wd->sc_flags & WDF_LBA48 && (((xfer->c_bio.blkno + xfer->c_bio.bcount / secsize) > wd->sc_capacity28) || ((xfer->c_bio.bcount / secsize) > 128))) xfer->c_bio.flags |= ATA_LBA48; /* * If NCQ was negotiated, always use it for the first several attempts. * Since device cancels all outstanding requests on error, downgrade * to non-NCQ on retry, so that the retried transfer would not cause * cascade failure for the other transfers if it fails again. * If FUA was requested, we can't downgrade, as that would violate * the semantics - FUA would not be honored. In that case, continue * retrying with NCQ. 
*/ if (WD_USE_NCQ(wd) && (xfer->c_retries < WDIORETRIES_SINGLE || (bp->b_flags & B_MEDIA_FUA) != 0)) { xfer->c_bio.flags |= ATA_LBA48; xfer->c_flags |= C_NCQ; if (WD_USE_NCQ_PRIO(wd) && BIO_GETPRIO(bp) == BPRIO_TIMECRITICAL) xfer->c_bio.flags |= ATA_PRIO_HIGH; } if (wd->sc_flags & WDF_LBA) xfer->c_bio.flags |= ATA_LBA; if (bp->b_flags & B_READ) { xfer->c_bio.flags |= ATA_READ; } else { /* it's a write */ wd->sc_flags |= WDF_DIRTY; } if (bp->b_flags & B_MEDIA_FUA) { /* If not using NCQ, the command WRITE DMA FUA EXT is LBA48 */ KASSERT((wd->sc_flags & WDF_LBA48) != 0); if ((xfer->c_flags & C_NCQ) == 0) xfer->c_bio.flags |= ATA_LBA48; xfer->c_bio.flags |= ATA_FUA; } if (xfer->c_retries == 0) wd->inflight++; mutex_exit(&wd->sc_lock); /* Queue the xfer */ wd->atabus->ata_bio(wd->drvp, xfer); mutex_enter(&wd->sc_lock); } static int wd_diskstart(device_t dev, struct buf *bp) { struct wd_softc *wd = device_private(dev); #ifdef ATADEBUG struct dk_softc *dksc = &wd->sc_dksc; #endif struct ata_xfer *xfer; struct ata_channel *chp; unsigned openings; int ticks; mutex_enter(&wd->sc_lock); chp = wd->drvp->chnl_softc; ata_channel_lock(chp); openings = ata_queue_openings(chp); ata_channel_unlock(chp); openings = uimin(openings, wd->drvp->drv_openings); if (wd->inflight >= openings) { /* * pretend we run out of memory when the queue is full, * so that the operation is retried after a minimal * delay. */ xfer = NULL; ticks = 1; } else { /* * If there is no available memory, retry later. This * happens very rarely and only under memory pressure, * so wait relatively long before retry. */ xfer = ata_get_xfer(chp, false); ticks = hz/2; } if (xfer == NULL) { ATADEBUG_PRINT(("wd_diskstart %s no xfer\n", dksc->sc_xname), DEBUG_XFERS); /* * The disk queue is pushed automatically when an I/O * operation finishes or another one is queued. We * need this extra timeout because an ATA channel * might be shared by more than one disk queue and * all queues need to be restarted when another slot * becomes available. */ if (!callout_pending(&wd->sc_restart_diskqueue)) { callout_reset(&wd->sc_restart_diskqueue, ticks, wdrestart, dev); } mutex_exit(&wd->sc_lock); return EAGAIN; } wdstart1(wd, bp, xfer); mutex_exit(&wd->sc_lock); return 0; } /* * Queue a drive for I/O. */ static void wdrestart(void *x) { device_t self = x; struct wd_softc *wd = device_private(self); struct dk_softc *dksc = &wd->sc_dksc; ATADEBUG_PRINT(("wdstart %s\n", dksc->sc_xname), DEBUG_XFERS); if (!device_is_active(dksc->sc_dev)) return; dk_start(dksc, NULL); } static void wddone(device_t self, struct ata_xfer *xfer) { struct wd_softc *wd = device_private(self); struct dk_softc *dksc = &wd->sc_dksc; const char *errmsg; int do_perror = 0; struct buf *bp; ATADEBUG_PRINT(("wddone %s\n", dksc->sc_xname), DEBUG_XFERS); if (__predict_false(wddoingadump)) { /* just drop it to the floor */ ata_free_xfer(wd->drvp->chnl_softc, xfer); return; } bp = xfer->c_bio.bp; KASSERT(bp != NULL); bp->b_resid = xfer->c_bio.bcount; switch (xfer->c_bio.error) { case ERR_DMA: errmsg = "DMA error"; goto retry; case ERR_DF: errmsg = "device fault"; goto retry; case TIMEOUT: errmsg = "device timeout"; goto retry; case REQUEUE: errmsg = "requeue"; goto retry2; case ERR_RESET: errmsg = "channel reset"; goto retry2; case ERROR: /* Don't care about media change bits */ if (xfer->c_bio.r_error != 0 && (xfer->c_bio.r_error & ~(WDCE_MC | WDCE_MCR)) == 0) goto noerror; errmsg = "error"; do_perror = 1; retry: /* Just reset and retry. Can we do more ? 
*/ if ((xfer->c_flags & C_RECOVERED) == 0) { int wflags = (xfer->c_flags & C_POLL) ? AT_POLL : 0; ata_channel_lock(wd->drvp->chnl_softc); ata_thread_run(wd->drvp->chnl_softc, wflags, ATACH_TH_DRIVE_RESET, wd->drvp->drive); ata_channel_unlock(wd->drvp->chnl_softc); } retry2: mutex_enter(&wd->sc_lock); diskerr(bp, "wd", errmsg, LOG_PRINTF, xfer->c_bio.blkdone, dksc->sc_dkdev.dk_label); if (xfer->c_retries < WDIORETRIES) printf(", xfer %"PRIxPTR", retry %d", (intptr_t)xfer & PAGE_MASK, xfer->c_retries); printf("\n"); if (do_perror) wdperror(wd, xfer); if (xfer->c_retries < WDIORETRIES) { xfer->c_retries++; /* Rerun ASAP if just requeued */ if (xfer->c_bio.error == REQUEUE) { SLIST_INSERT_HEAD(&wd->sc_requeue_list, xfer, c_retrychain); callout_reset(&wd->sc_requeue_callout, 1, wdbiorequeue, wd); } else { SLIST_INSERT_HEAD(&wd->sc_retry_list, xfer, c_retrychain); callout_reset(&wd->sc_retry_callout, RECOVERYTIME, wdbioretry, wd); } mutex_exit(&wd->sc_lock); return; } mutex_exit(&wd->sc_lock); #ifdef WD_SOFTBADSECT /* * Not all errors indicate a failed block but those that do, * put the block on the bad-block list for the device. Only * do this for reads because the drive should do it for writes, * itself, according to Manuel. */ if ((bp->b_flags & B_READ) && ((wd->drvp->ata_vers >= 4 && xfer->c_bio.r_error & 64) || (wd->drvp->ata_vers < 4 && xfer->c_bio.r_error & 192))) { struct disk_badsectors *dbs; dbs = kmem_zalloc(sizeof *dbs, KM_NOSLEEP); if (dbs == NULL) { aprint_error_dev(dksc->sc_dev, "failed to add bad block to list\n"); goto out; } dbs->dbs_min = bp->b_rawblkno; dbs->dbs_max = dbs->dbs_min + (bp->b_bcount /wd->sc_blksize) - 1; microtime(&dbs->dbs_failedat); mutex_enter(&wd->sc_lock); SLIST_INSERT_HEAD(&wd->sc_bslist, dbs, dbs_next); wd->sc_bscount++; mutex_exit(&wd->sc_lock); } out: #endif bp->b_error = EIO; break; case NOERROR: #ifdef WD_CHAOS_MONKEY /* * For example Parallels AHCI emulation doesn't actually * return error for the invalid I/O, so just re-run * the request and do not panic. */ if (__predict_false(xfer->c_flags & C_CHAOS)) { xfer->c_bio.error = REQUEUE; errmsg = "chaos noerror"; goto retry2; } #endif noerror: if ((xfer->c_bio.flags & ATA_CORR) || xfer->c_retries > 0) device_printf(dksc->sc_dev, "soft error (corrected) xfer %"PRIxPTR"\n", (intptr_t)xfer & PAGE_MASK); break; case ERR_NODEV: bp->b_error = EIO; break; } if (__predict_false(bp->b_error != 0) && bp->b_resid == 0) { /* * the disk or controller sometimes report a complete * xfer, when there has been an error. 
This is wrong, * assume nothing got transferred in this case */ bp->b_resid = bp->b_bcount; } ata_free_xfer(wd->drvp->chnl_softc, xfer); mutex_enter(&wd->sc_lock); wd->inflight--; mutex_exit(&wd->sc_lock); dk_done(dksc, bp); dk_start(dksc, NULL); } static void wdbioretry(void *v) { struct wd_softc *wd = v; struct ata_xfer *xfer; ATADEBUG_PRINT(("%s %s\n", __func__, wd->sc_dksc.sc_xname), DEBUG_XFERS); mutex_enter(&wd->sc_lock); while ((xfer = SLIST_FIRST(&wd->sc_retry_list))) { SLIST_REMOVE_HEAD(&wd->sc_retry_list, c_retrychain); wdstart1(wd, xfer->c_bio.bp, xfer); } mutex_exit(&wd->sc_lock); } static void wdbiorequeue(void *v) { struct wd_softc *wd = v; struct ata_xfer *xfer; ATADEBUG_PRINT(("%s %s\n", __func__, wd->sc_dksc.sc_xname), DEBUG_XFERS); mutex_enter(&wd->sc_lock); while ((xfer = SLIST_FIRST(&wd->sc_requeue_list))) { SLIST_REMOVE_HEAD(&wd->sc_requeue_list, c_retrychain); wdstart1(wd, xfer->c_bio.bp, xfer); } mutex_exit(&wd->sc_lock); } static void wdminphys(struct buf *bp) { const struct wd_softc * const wd = device_lookup_private(&wd_cd, WDUNIT(bp->b_dev)); int maxsectors; /* * The limit is actually 65536 for LBA48 and 256 for non-LBA48, * but that requires to set the count for the ATA command * to 0, which is somewhat error prone, so better stay safe. */ if (wd->sc_flags & WDF_LBA48) maxsectors = 65535; else maxsectors = 128; if (bp->b_bcount > (wd->sc_blksize * maxsectors)) bp->b_bcount = (wd->sc_blksize * maxsectors); minphys(bp); } static void wd_iosize(device_t dev, int *count) { struct buf B; int bmaj; bmaj = bdevsw_lookup_major(&wd_bdevsw); B.b_dev = MAKEWDDEV(bmaj,device_unit(dev),RAW_PART); B.b_bcount = *count; wdminphys(&B); *count = B.b_bcount; } static int wdread(dev_t dev, struct uio *uio, int flags) { ATADEBUG_PRINT(("wdread\n"), DEBUG_XFERS); return (physio(wdstrategy, NULL, dev, B_READ, wdminphys, uio)); } static int wdwrite(dev_t dev, struct uio *uio, int flags) { ATADEBUG_PRINT(("wdwrite\n"), DEBUG_XFERS); return (physio(wdstrategy, NULL, dev, B_WRITE, wdminphys, uio)); } static int wdopen(dev_t dev, int flag, int fmt, struct lwp *l) { struct wd_softc *wd; struct dk_softc *dksc; int unit, part, error; ATADEBUG_PRINT(("wdopen\n"), DEBUG_FUNCS); unit = WDUNIT(dev); wd = device_lookup_private(&wd_cd, unit); if (wd == NULL) return (ENXIO); dksc = &wd->sc_dksc; if (! device_is_active(dksc->sc_dev)) return (ENODEV); part = WDPART(dev); if (wd->sc_capacity == 0) return (ENODEV); /* * If any partition is open, but the disk has been invalidated, * disallow further opens. */ if ((wd->sc_flags & (WDF_OPEN | WDF_LOADED)) == WDF_OPEN) { if (part != RAW_PART || fmt != S_IFCHR) return EIO; } error = dk_open(dksc, dev, flag, fmt, l); return error; } /* * Serialized by caller */ static int wd_firstopen(device_t self, dev_t dev, int flag, int fmt) { struct wd_softc *wd = device_private(self); struct dk_softc *dksc = &wd->sc_dksc; int error; error = wd->atabus->ata_addref(wd->drvp); if (error) return error; if ((wd->sc_flags & WDF_LOADED) == 0) { int param_error; /* Load the physical device parameters. */ param_error = wd_get_params(wd, &wd->sc_params); if (param_error != 0) { aprint_error_dev(dksc->sc_dev, "IDENTIFY failed\n"); error = EIO; goto bad; } wd_set_geometry(wd); wd->sc_flags |= WDF_LOADED; } wd->sc_flags |= WDF_OPEN; return 0; bad: wd->atabus->ata_delref(wd->drvp); return error; } /* * Caller must hold wd->sc_dk.dk_openlock. 
*/ static int wd_lastclose(device_t self) { struct wd_softc *wd = device_private(self); KASSERTMSG(bufq_peek(wd->sc_dksc.sc_bufq) == NULL, "bufq not empty"); if (wd->sc_flags & WDF_DIRTY) wd_flushcache(wd, AT_WAIT); wd->atabus->ata_delref(wd->drvp); wd->sc_flags &= ~WDF_OPEN; return 0; } static int wdclose(dev_t dev, int flag, int fmt, struct lwp *l) { struct wd_softc *wd; struct dk_softc *dksc; int unit; unit = WDUNIT(dev); wd = device_lookup_private(&wd_cd, unit); dksc = &wd->sc_dksc; return dk_close(dksc, dev, flag, fmt, l); } void wdperror(const struct wd_softc *wd, struct ata_xfer *xfer) { static const char *const errstr0_3[] = {"address mark not found", "track 0 not found", "aborted command", "media change requested", "id not found", "media changed", "uncorrectable data error", "bad block detected"}; static const char *const errstr4_5[] = { "obsolete (address mark not found)", "no media/write protected", "aborted command", "media change requested", "id not found", "media changed", "uncorrectable data error", "interface CRC error"}; const char *const *errstr; int i; const char *sep = ""; const struct dk_softc *dksc = &wd->sc_dksc; const char *devname = dksc->sc_xname; struct ata_drive_datas *drvp = wd->drvp; int errno = xfer->c_bio.r_error; if (drvp->ata_vers >= 4) errstr = errstr4_5; else errstr = errstr0_3; printf("%s: (", devname); if (errno == 0) printf("error not notified"); for (i = 0; i < 8; i++) { if (errno & (1 << i)) { printf("%s%s", sep, errstr[i]); sep = ", "; } } printf(")\n"); } int wdioctl(dev_t dev, u_long cmd, void *addr, int flag, struct lwp *l) { struct wd_softc *wd = device_lookup_private(&wd_cd, WDUNIT(dev)); struct dk_softc *dksc = &wd->sc_dksc; ATADEBUG_PRINT(("wdioctl\n"), DEBUG_FUNCS); if ((wd->sc_flags & WDF_LOADED) == 0) return EIO; switch (cmd) { #ifdef HAS_BAD144_HANDLING case DIOCSBAD: if ((flag & FWRITE) == 0) return EBADF; dksc->sc_dkdev.dk_cpulabel->bad = *(struct dkbad *)addr; dksc->sc_dkdev.dk_label->d_flags |= D_BADSECT; bad144intern(wd); return 0; #endif #ifdef WD_SOFTBADSECT case DIOCBSLIST: { uint32_t count, missing, skip; struct disk_badsecinfo dbsi; struct disk_badsectors *dbs, dbsbuf; size_t available; uint8_t *laddr; int error; dbsi = *(struct disk_badsecinfo *)addr; missing = wd->sc_bscount; count = 0; available = dbsi.dbsi_bufsize; skip = dbsi.dbsi_skip; laddr = (uint8_t *)dbsi.dbsi_buffer; /* * We start this loop with the expectation that all of the * entries will be missed and decrement this counter each * time we either skip over one (already copied out) or * we actually copy it back to user space. The structs * holding the bad sector information are copied directly * back to user space whilst the summary is returned via * the struct passed in via the ioctl. 
*/ error = 0; mutex_enter(&wd->sc_lock); wd->sc_bslist_inuse++; SLIST_FOREACH(dbs, &wd->sc_bslist, dbs_next) { if (skip > 0) { missing--; skip--; continue; } if (available < sizeof(*dbs)) break; available -= sizeof(*dbs); memset(&dbsbuf, 0, sizeof(dbsbuf)); dbsbuf.dbs_min = dbs->dbs_min; dbsbuf.dbs_max = dbs->dbs_max; dbsbuf.dbs_failedat = dbs->dbs_failedat; mutex_exit(&wd->sc_lock); error = copyout(&dbsbuf, laddr, sizeof(dbsbuf)); mutex_enter(&wd->sc_lock); if (error) break; laddr += sizeof(*dbs); missing--; count++; } if (--wd->sc_bslist_inuse == 0) cv_broadcast(&wd->sc_bslist_cv); mutex_exit(&wd->sc_lock); dbsi.dbsi_left = missing; dbsi.dbsi_copied = count; *(struct disk_badsecinfo *)addr = dbsi; /* * If we copied anything out, ignore error and return * success -- can't back it out. */ return count ? 0 : error; } case DIOCBSFLUSH: { int error; /* Clean out the bad sector list */ mutex_enter(&wd->sc_lock); while (wd->sc_bslist_inuse) { error = cv_wait_sig(&wd->sc_bslist_cv, &wd->sc_lock); if (error) { mutex_exit(&wd->sc_lock); return error; } } while (!SLIST_EMPTY(&wd->sc_bslist)) { struct disk_badsectors *dbs = SLIST_FIRST(&wd->sc_bslist); SLIST_REMOVE_HEAD(&wd->sc_bslist, dbs_next); mutex_exit(&wd->sc_lock); kmem_free(dbs, sizeof(*dbs)); mutex_enter(&wd->sc_lock); } mutex_exit(&wd->sc_lock); wd->sc_bscount = 0; return 0; } #endif #ifdef notyet case DIOCWFORMAT: if ((flag & FWRITE) == 0) return EBADF; { register struct format_op *fop; struct iovec aiov; struct uio auio; int error1; fop = (struct format_op *)addr; aiov.iov_base = fop->df_buf; aiov.iov_len = fop->df_count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = fop->df_count; auio.uio_offset = fop->df_startblk * wd->sc_dk.dk_label->d_secsize; auio.uio_vmspace = l->l_proc->p_vmspace; error1 = physio(wdformat, NULL, dev, B_WRITE, wdminphys, &auio); fop->df_count -= auio.uio_resid; fop->df_reg[0] = wdc->sc_status; fop->df_reg[1] = wdc->sc_error; return error1; } #endif case DIOCGCACHE: return wd_getcache(wd, (int *)addr); case DIOCSCACHE: return wd_setcache(wd, *(int *)addr); case DIOCCACHESYNC: return wd_flushcache(wd, AT_WAIT); case ATAIOCCOMMAND: /* * Make sure this command is (relatively) safe first */ if ((((atareq_t *) addr)->flags & ATACMD_READ) == 0 && (flag & FWRITE) == 0) return (EBADF); { struct wd_ioctl *wi; atareq_t *atareq = (atareq_t *) addr; int error1; wi = wi_get(wd); wi->wi_atareq = *atareq; if (atareq->datalen && atareq->flags & (ATACMD_READ | ATACMD_WRITE)) { void *tbuf; if (atareq->datalen < DEV_BSIZE && atareq->command == WDCC_IDENTIFY) { tbuf = kmem_zalloc(DEV_BSIZE, KM_SLEEP); wi->wi_iov.iov_base = tbuf; wi->wi_iov.iov_len = DEV_BSIZE; UIO_SETUP_SYSSPACE(&wi->wi_uio); } else { tbuf = NULL; wi->wi_iov.iov_base = atareq->databuf; wi->wi_iov.iov_len = atareq->datalen; wi->wi_uio.uio_vmspace = l->l_proc->p_vmspace; } wi->wi_uio.uio_iov = &wi->wi_iov; wi->wi_uio.uio_iovcnt = 1; wi->wi_uio.uio_resid = atareq->datalen; wi->wi_uio.uio_offset = 0; wi->wi_uio.uio_rw = (atareq->flags & ATACMD_READ) ? B_READ : B_WRITE; error1 = physio(wdioctlstrategy, &wi->wi_bp, dev, (atareq->flags & ATACMD_READ) ? 
B_READ : B_WRITE, wdminphys, &wi->wi_uio); if (tbuf != NULL && error1 == 0) { error1 = copyout(tbuf, atareq->databuf, atareq->datalen); kmem_free(tbuf, DEV_BSIZE); } } else { /* No need to call physio if we don't have any user data */ wi->wi_bp.b_flags = 0; wi->wi_bp.b_data = 0; wi->wi_bp.b_bcount = 0; wi->wi_bp.b_dev = dev; wi->wi_bp.b_proc = l->l_proc; wdioctlstrategy(&wi->wi_bp); error1 = wi->wi_bp.b_error; } *atareq = wi->wi_atareq; wi_free(wi); return(error1); } case DIOCGSECTORALIGN: { struct disk_sectoralign *dsa = addr; int part = WDPART(dev); *dsa = wd->sc_sectoralign; if (part != RAW_PART) { struct disklabel *lp = dksc->sc_dkdev.dk_label; daddr_t offset = lp->d_partitions[part].p_offset; uint32_t r = offset % dsa->dsa_alignment; if (r < dsa->dsa_firstaligned) dsa->dsa_firstaligned = dsa->dsa_firstaligned - r; else dsa->dsa_firstaligned = (dsa->dsa_firstaligned + dsa->dsa_alignment) - r; } return 0; } default: return dk_ioctl(dksc, dev, cmd, addr, flag, l); } #ifdef DIAGNOSTIC panic("wdioctl: impossible"); #endif } static int wd_discard(device_t dev, off_t pos, off_t len) { struct wd_softc *wd = device_private(dev); daddr_t bno; long size, done; long maxatonce, amount; int result; if (!(wd->sc_params.atap_ata_major & WDC_VER_ATA7) || !(wd->sc_params.support_dsm & ATA_SUPPORT_DSM_TRIM)) { /* not supported; ignore request */ ATADEBUG_PRINT(("wddiscard (unsupported)\n"), DEBUG_FUNCS); return 0; } maxatonce = 0xffff; /*wd->sc_params.max_dsm_blocks*/ ATADEBUG_PRINT(("wddiscard\n"), DEBUG_FUNCS); if ((wd->sc_flags & WDF_LOADED) == 0) return EIO; /* round the start up and the end down */ bno = (pos + wd->sc_blksize - 1) / wd->sc_blksize; size = ((pos + len) / wd->sc_blksize) - bno; done = 0; while (done < size) { amount = size - done; if (amount > maxatonce) { amount = maxatonce; } result = wd_trim(wd, bno + done, amount); if (result) { return result; } done += amount; } return 0; } static int wddiscard(dev_t dev, off_t pos, off_t len) { struct wd_softc *wd; struct dk_softc *dksc; int unit; unit = WDUNIT(dev); wd = device_lookup_private(&wd_cd, unit); dksc = &wd->sc_dksc; return dk_discard(dksc, dev, pos, len); } #ifdef B_FORMAT int wdformat(struct buf *bp) { bp->b_flags |= B_FORMAT; return wdstrategy(bp); } #endif int wdsize(dev_t dev) { struct wd_softc *wd; struct dk_softc *dksc; int unit; ATADEBUG_PRINT(("wdsize\n"), DEBUG_FUNCS); unit = WDUNIT(dev); wd = device_lookup_private(&wd_cd, unit); if (wd == NULL) return (-1); dksc = &wd->sc_dksc; if (!device_is_active(dksc->sc_dev)) return (-1); return dk_size(dksc, dev); } /* * Dump core after a system crash. */ static int wddump(dev_t dev, daddr_t blkno, void *va, size_t size) { struct wd_softc *wd; struct dk_softc *dksc; int unit; /* Check if recursive dump; if so, punt. */ if (wddoingadump) return EFAULT; wddoingadump = 1; unit = WDUNIT(dev); wd = device_lookup_private(&wd_cd, unit); if (wd == NULL) return (ENXIO); dksc = &wd->sc_dksc; return dk_dump(dksc, dev, blkno, va, size, 0); } static int wd_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk) { struct wd_softc *wd = device_private(dev); struct dk_softc *dksc = &wd->sc_dksc; struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; struct ata_xfer *xfer = &wd->dump_xfer; int err; /* Recalibrate, if first dump transfer. 
*/ if (wddumprecalibrated == 0) { wddumprecalibrated = 1; ata_channel_lock(wd->drvp->chnl_softc); /* This will directly execute the reset due to AT_POLL */ ata_thread_run(wd->drvp->chnl_softc, AT_POLL, ATACH_TH_DRIVE_RESET, wd->drvp->drive); wd->drvp->state = RESET; ata_channel_unlock(wd->drvp->chnl_softc); } memset(xfer, 0, sizeof(*xfer)); xfer->c_flags |= C_PRIVATE_ALLOC | C_SKIP_QUEUE; xfer->c_bio.blkno = blkno; xfer->c_bio.flags = ATA_POLL; if (wd->sc_flags & WDF_LBA48 && (xfer->c_bio.blkno + nblk) > wd->sc_capacity28) xfer->c_bio.flags |= ATA_LBA48; if (wd->sc_flags & WDF_LBA) xfer->c_bio.flags |= ATA_LBA; xfer->c_bio.bcount = nblk * dg->dg_secsize; xfer->c_bio.databuf = va; #ifndef WD_DUMP_NOT_TRUSTED /* This will poll until the bio is complete */ wd->atabus->ata_bio(wd->drvp, xfer); switch(err = xfer->c_bio.error) { case TIMEOUT: printf("wddump: device timed out"); err = EIO; break; case ERR_DF: printf("wddump: drive fault"); err = EIO; break; case ERR_DMA: printf("wddump: DMA error"); err = EIO; break; case ERROR: printf("wddump: "); wdperror(wd, xfer); err = EIO; break; case NOERROR: err = 0; break; default: panic("wddump: unknown error type %x", err); } if (err != 0) { printf("\n"); return err; } #else /* WD_DUMP_NOT_TRUSTED */ /* Let's just talk about this first... */ printf("wd%d: dump addr 0x%x, cylin %d, head %d, sector %d\n", unit, va, cylin, head, sector); delay(500 * 1000); /* half a second */ #endif wddoingadump = 0; return 0; } #ifdef HAS_BAD144_HANDLING /* * Internalize the bad sector table. */ void bad144intern(struct wd_softc *wd) { struct dk_softc *dksc = &wd->sc_dksc; struct dkbad *bt = &dksc->sc_dkdev.dk_cpulabel->bad; struct disklabel *lp = dksc->sc_dkdev.dk_label; int i = 0; ATADEBUG_PRINT(("bad144intern\n"), DEBUG_XFERS); for (; i < NBT_BAD; i++) { if (bt->bt_bad[i].bt_cyl == 0xffff) break; wd->drvp->badsect[i] = bt->bt_bad[i].bt_cyl * lp->d_secpercyl + (bt->bt_bad[i].bt_trksec >> 8) * lp->d_nsectors + (bt->bt_bad[i].bt_trksec & 0xff); } for (; i < NBT_BAD+1; i++) wd->drvp->badsect[i] = -1; } #endif static void wd_set_geometry(struct wd_softc *wd) { struct dk_softc *dksc = &wd->sc_dksc; struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; memset(dg, 0, sizeof(*dg)); dg->dg_secperunit = wd->sc_capacity; dg->dg_secsize = wd->sc_blksize; dg->dg_nsectors = wd->sc_params.atap_sectors; dg->dg_ntracks = wd->sc_params.atap_heads; if ((wd->sc_flags & WDF_LBA) == 0) dg->dg_ncylinders = wd->sc_params.atap_cylinders; disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, wd->sc_typename); } int wd_get_params(struct wd_softc *wd, struct ataparams *params) { int retry = 0; struct ata_channel *chp = wd->drvp->chnl_softc; const int flags = AT_WAIT; again: switch (wd->atabus->ata_get_params(wd->drvp, flags, params)) { case CMD_AGAIN: return 1; case CMD_ERR: if (retry == 0) { retry++; ata_channel_lock(chp); (*wd->atabus->ata_reset_drive)(wd->drvp, flags, NULL); ata_channel_unlock(chp); goto again; } if (wd->drvp->drive_type != ATA_DRIVET_OLD) return 1; /* * We `know' there's a drive here; just assume it's old. * This geometry is only used to read the MBR and print a * (false) attach message. 
*/ strncpy(params->atap_model, "ST506", sizeof params->atap_model); params->atap_config = ATA_CFG_FIXED; params->atap_cylinders = 1024; params->atap_heads = 8; params->atap_sectors = 17; params->atap_multi = 1; params->atap_capabilities1 = params->atap_capabilities2 = 0; wd->drvp->ata_vers = -1; /* Mark it as pre-ATA */ /* FALLTHROUGH */ case CMD_OK: return 0; default: panic("wd_get_params: bad return code from ata_get_params"); /* NOTREACHED */ } } int wd_getcache(struct wd_softc *wd, int *bitsp) { struct ataparams params; if (wd_get_params(wd, &params) != 0) return EIO; if (params.atap_cmd_set1 == 0x0000 || params.atap_cmd_set1 == 0xffff || (params.atap_cmd_set1 & WDC_CMD1_CACHE) == 0) { *bitsp = 0; return 0; } *bitsp = DKCACHE_WCHANGE | DKCACHE_READ; if (params.atap_cmd1_en & WDC_CMD1_CACHE) *bitsp |= DKCACHE_WRITE; if (WD_USE_NCQ(wd) || (wd->drvp->drive_flags & ATA_DRIVE_WFUA)) *bitsp |= DKCACHE_FUA; return 0; } static int wd_check_error(const struct dk_softc *dksc, const struct ata_xfer *xfer, const char *func) { static const char at_errbits[] = "\20\10ERROR\11TIMEOU\12DF"; int flags = xfer->c_ata_c.flags; if ((flags & AT_ERROR) != 0 && xfer->c_ata_c.r_error == WDCE_ABRT) { /* command not supported */ aprint_debug_dev(dksc->sc_dev, "%s: not supported\n", func); return ENODEV; } if (flags & (AT_ERROR | AT_TIMEOU | AT_DF)) { char sbuf[sizeof(at_errbits) + 64]; snprintb(sbuf, sizeof(sbuf), at_errbits, flags); aprint_error_dev(dksc->sc_dev, "%s: status=%s\n", func, sbuf); return EIO; } return 0; } int wd_setcache(struct wd_softc *wd, int bits) { struct dk_softc *dksc = &wd->sc_dksc; struct ataparams params; struct ata_xfer *xfer; int error; if (wd_get_params(wd, &params) != 0) return EIO; if (params.atap_cmd_set1 == 0x0000 || params.atap_cmd_set1 == 0xffff || (params.atap_cmd_set1 & WDC_CMD1_CACHE) == 0) return EOPNOTSUPP; if ((bits & DKCACHE_READ) == 0 || (bits & DKCACHE_SAVE) != 0) return EOPNOTSUPP; xfer = ata_get_xfer(wd->drvp->chnl_softc, true); xfer->c_ata_c.r_command = SET_FEATURES; xfer->c_ata_c.r_st_bmask = 0; xfer->c_ata_c.r_st_pmask = 0; xfer->c_ata_c.timeout = 30000; /* 30s timeout */ xfer->c_ata_c.flags = AT_WAIT; if (bits & DKCACHE_WRITE) xfer->c_ata_c.r_features = WDSF_WRITE_CACHE_EN; else xfer->c_ata_c.r_features = WDSF_WRITE_CACHE_DS; wd->atabus->ata_exec_command(wd->drvp, xfer); ata_wait_cmd(wd->drvp->chnl_softc, xfer); error = wd_check_error(dksc, xfer, __func__); ata_free_xfer(wd->drvp->chnl_softc, xfer); return error; } static int wd_standby(struct wd_softc *wd, int flags) { struct dk_softc *dksc = &wd->sc_dksc; struct ata_xfer *xfer; int error; aprint_debug_dev(dksc->sc_dev, "standby immediate\n"); xfer = ata_get_xfer(wd->drvp->chnl_softc, true); xfer->c_ata_c.r_command = WDCC_STANDBY_IMMED; xfer->c_ata_c.r_st_bmask = WDCS_DRDY; xfer->c_ata_c.r_st_pmask = WDCS_DRDY; xfer->c_ata_c.flags = flags; xfer->c_ata_c.timeout = 30000; /* 30s timeout */ wd->atabus->ata_exec_command(wd->drvp, xfer); ata_wait_cmd(wd->drvp->chnl_softc, xfer); error = wd_check_error(dksc, xfer, __func__); ata_free_xfer(wd->drvp->chnl_softc, xfer); return error; } int wd_flushcache(struct wd_softc *wd, int flags) { struct dk_softc *dksc = &wd->sc_dksc; struct ata_xfer *xfer; int error; /* * WDCC_FLUSHCACHE is here since ATA-4, but some drives report * only ATA-2 and still support it. 
*/ if (wd->drvp->ata_vers < 4 && ((wd->sc_params.atap_cmd_set2 & WDC_CMD2_FC) == 0 || wd->sc_params.atap_cmd_set2 == 0xffff)) return ENODEV; xfer = ata_get_xfer(wd->drvp->chnl_softc, true); if ((wd->sc_params.atap_cmd2_en & ATA_CMD2_LBA48) != 0 && (wd->sc_params.atap_cmd2_en & ATA_CMD2_FCE) != 0) { xfer->c_ata_c.r_command = WDCC_FLUSHCACHE_EXT; flags |= AT_LBA48; } else xfer->c_ata_c.r_command = WDCC_FLUSHCACHE; xfer->c_ata_c.r_st_bmask = WDCS_DRDY; xfer->c_ata_c.r_st_pmask = WDCS_DRDY; xfer->c_ata_c.flags = flags | AT_READREG; xfer->c_ata_c.timeout = 300000; /* 5m timeout */ wd->atabus->ata_exec_command(wd->drvp, xfer); ata_wait_cmd(wd->drvp->chnl_softc, xfer); error = wd_check_error(dksc, xfer, __func__); wd->sc_flags &= ~WDF_DIRTY; ata_free_xfer(wd->drvp->chnl_softc, xfer); return error; } /* * Execute TRIM command, assumes sleep context. */ static int wd_trim(struct wd_softc *wd, daddr_t bno, long size) { struct dk_softc *dksc = &wd->sc_dksc; struct ata_xfer *xfer; int error; unsigned char *req; xfer = ata_get_xfer(wd->drvp->chnl_softc, true); req = kmem_zalloc(512, KM_SLEEP); req[0] = bno & 0xff; req[1] = (bno >> 8) & 0xff; req[2] = (bno >> 16) & 0xff; req[3] = (bno >> 24) & 0xff; req[4] = (bno >> 32) & 0xff; req[5] = (bno >> 40) & 0xff; req[6] = size & 0xff; req[7] = (size >> 8) & 0xff; /* * XXX We could possibly use NCQ TRIM, which supports executing * this command concurrently. It would need some investigation, some * early or not so early disk firmware caused data loss with NCQ TRIM. * atastart() et.al would need to be adjusted to allow and support * running several non-I/O ATA commands in parallel. */ xfer->c_ata_c.r_command = ATA_DATA_SET_MANAGEMENT; xfer->c_ata_c.r_count = 1; xfer->c_ata_c.r_features = ATA_SUPPORT_DSM_TRIM; xfer->c_ata_c.r_st_bmask = WDCS_DRDY; xfer->c_ata_c.r_st_pmask = WDCS_DRDY; xfer->c_ata_c.timeout = 30000; /* 30s timeout */ xfer->c_ata_c.data = req; xfer->c_ata_c.bcount = 512; xfer->c_ata_c.flags |= AT_WRITE | AT_WAIT; wd->atabus->ata_exec_command(wd->drvp, xfer); ata_wait_cmd(wd->drvp->chnl_softc, xfer); kmem_free(req, 512); error = wd_check_error(dksc, xfer, __func__); ata_free_xfer(wd->drvp->chnl_softc, xfer); return error; } bool wd_shutdown(device_t dev, int how) { struct wd_softc *wd = device_private(dev); /* the adapter needs to be enabled */ if (wd->atabus->ata_addref(wd->drvp)) return true; /* no need to complain */ wd_flushcache(wd, AT_POLL); if ((how & RB_POWERDOWN) == RB_POWERDOWN) wd_standby(wd, AT_POLL); return true; } /* * Allocate space for a ioctl queue structure. Mostly taken from * scsipi_ioctl.c */ struct wd_ioctl * wi_get(struct wd_softc *wd) { struct wd_ioctl *wi; wi = kmem_zalloc(sizeof(struct wd_ioctl), KM_SLEEP); wi->wi_softc = wd; buf_init(&wi->wi_bp); return (wi); } /* * Free an ioctl structure and remove it from our list */ void wi_free(struct wd_ioctl *wi) { buf_destroy(&wi->wi_bp); kmem_free(wi, sizeof(*wi)); } /* * Find a wd_ioctl structure based on the struct buf. 
*/ struct wd_ioctl * wi_find(struct buf *bp) { return container_of(bp, struct wd_ioctl, wi_bp); } static uint wi_sector_size(const struct wd_ioctl * const wi) { switch (wi->wi_atareq.command) { case WDCC_READ: case WDCC_WRITE: case WDCC_READMULTI: case WDCC_WRITEMULTI: case WDCC_READDMA: case WDCC_WRITEDMA: case WDCC_READ_EXT: case WDCC_WRITE_EXT: case WDCC_READMULTI_EXT: case WDCC_WRITEMULTI_EXT: case WDCC_READDMA_EXT: case WDCC_WRITEDMA_EXT: case WDCC_READ_FPDMA_QUEUED: case WDCC_WRITE_FPDMA_QUEUED: return wi->wi_softc->sc_blksize; default: return 512; } } /* * Ioctl pseudo strategy routine * * This is mostly stolen from scsipi_ioctl.c:scsistrategy(). What * happens here is: * * - wdioctl() queues a wd_ioctl structure. * * - wdioctl() calls physio/wdioctlstrategy based on whether or not * user space I/O is required. If physio() is called, physio() eventually * calls wdioctlstrategy(). * * - In either case, wdioctlstrategy() calls wd->atabus->ata_exec_command() * to perform the actual command * * The reason for the use of the pseudo strategy routine is because * when doing I/O to/from user space, physio _really_ wants to be in * the loop. We could put the entire buffer into the ioctl request * structure, but that won't scale if we want to do things like download * microcode. */ void wdioctlstrategy(struct buf *bp) { struct wd_ioctl *wi; struct ata_xfer *xfer; int error = 0; wi = wi_find(bp); if (wi == NULL) { printf("wdioctlstrategy: " "No matching ioctl request found in queue\n"); error = EINVAL; goto out2; } xfer = ata_get_xfer(wi->wi_softc->drvp->chnl_softc, true); /* * Abort if physio broke up the transfer */ if (bp->b_bcount != wi->wi_atareq.datalen) { printf("physio split wd ioctl request... cannot proceed\n"); error = EIO; goto out; } /* * Abort if we didn't get a buffer size that was a multiple of * our sector size (or overflows CHS/LBA28 sector count) */ if ((bp->b_bcount % wi_sector_size(wi)) != 0 || (bp->b_bcount / wi_sector_size(wi)) >= (1 << NBBY)) { error = EINVAL; goto out; } /* * Make sure a timeout was supplied in the ioctl request */ if (wi->wi_atareq.timeout == 0) { error = EINVAL; goto out; } if (wi->wi_atareq.flags & ATACMD_READ) xfer->c_ata_c.flags |= AT_READ; else if (wi->wi_atareq.flags & ATACMD_WRITE) xfer->c_ata_c.flags |= AT_WRITE; if (wi->wi_atareq.flags & ATACMD_READREG) xfer->c_ata_c.flags |= AT_READREG; if ((wi->wi_atareq.flags & ATACMD_LBA) != 0) xfer->c_ata_c.flags |= AT_LBA; xfer->c_ata_c.flags |= AT_WAIT; xfer->c_ata_c.timeout = wi->wi_atareq.timeout; xfer->c_ata_c.r_command = wi->wi_atareq.command; xfer->c_ata_c.r_lba = ((wi->wi_atareq.head & 0x0f) << 24) | (wi->wi_atareq.cylinder << 8) | wi->wi_atareq.sec_num; xfer->c_ata_c.r_count = wi->wi_atareq.sec_count; xfer->c_ata_c.r_features = wi->wi_atareq.features; xfer->c_ata_c.r_st_bmask = WDCS_DRDY; xfer->c_ata_c.r_st_pmask = WDCS_DRDY; xfer->c_ata_c.data = wi->wi_bp.b_data; xfer->c_ata_c.bcount = wi->wi_bp.b_bcount; wi->wi_softc->atabus->ata_exec_command(wi->wi_softc->drvp, xfer); ata_wait_cmd(wi->wi_softc->drvp->chnl_softc, xfer); if (xfer->c_ata_c.flags & (AT_ERROR | AT_TIMEOU | AT_DF)) { if (xfer->c_ata_c.flags & AT_ERROR) { wi->wi_atareq.retsts = ATACMD_ERROR; wi->wi_atareq.error = xfer->c_ata_c.r_error; } else if (xfer->c_ata_c.flags & AT_DF) wi->wi_atareq.retsts = ATACMD_DF; else wi->wi_atareq.retsts = ATACMD_TIMEOUT; } else { wi->wi_atareq.retsts = ATACMD_OK; if (wi->wi_atareq.flags & ATACMD_READREG) { wi->wi_atareq.command = xfer->c_ata_c.r_status; wi->wi_atareq.features = xfer->c_ata_c.r_error; 
wi->wi_atareq.sec_count = xfer->c_ata_c.r_count; wi->wi_atareq.sec_num = xfer->c_ata_c.r_lba & 0xff; wi->wi_atareq.head = (xfer->c_ata_c.r_device & 0xf0) | ((xfer->c_ata_c.r_lba >> 24) & 0x0f); wi->wi_atareq.cylinder = (xfer->c_ata_c.r_lba >> 8) & 0xffff; wi->wi_atareq.error = xfer->c_ata_c.r_error; } } out: ata_free_xfer(wi->wi_softc->drvp->chnl_softc, xfer); out2: bp->b_error = error; if (error) bp->b_resid = bp->b_bcount; biodone(bp); } static void wd_sysctl_attach(struct wd_softc *wd) { struct dk_softc *dksc = &wd->sc_dksc; const struct sysctlnode *node; int error; /* sysctl set-up */ if (sysctl_createv(&wd->nodelog, 0, NULL, &node, 0, CTLTYPE_NODE, dksc->sc_xname, SYSCTL_DESCR("wd driver settings"), NULL, 0, NULL, 0, CTL_HW, CTL_CREATE, CTL_EOL) != 0) { aprint_error_dev(dksc->sc_dev, "could not create %s.%s sysctl node\n", "hw", dksc->sc_xname); return; } wd->drv_ncq = true; if ((error = sysctl_createv(&wd->nodelog, 0, NULL, NULL, CTLFLAG_READWRITE, CTLTYPE_BOOL, "use_ncq", SYSCTL_DESCR("use NCQ if supported"), NULL, 0, &wd->drv_ncq, 0, CTL_HW, node->sysctl_num, CTL_CREATE, CTL_EOL)) != 0) { aprint_error_dev(dksc->sc_dev, "could not create %s.%s.use_ncq sysctl - error %d\n", "hw", dksc->sc_xname, error); return; } wd->drv_ncq_prio = false; if ((error = sysctl_createv(&wd->nodelog, 0, NULL, NULL, CTLFLAG_READWRITE, CTLTYPE_BOOL, "use_ncq_prio", SYSCTL_DESCR("use NCQ PRIORITY if supported"), NULL, 0, &wd->drv_ncq_prio, 0, CTL_HW, node->sysctl_num, CTL_CREATE, CTL_EOL)) != 0) { aprint_error_dev(dksc->sc_dev, "could not create %s.%s.use_ncq_prio sysctl - error %d\n", "hw", dksc->sc_xname, error); return; } #ifdef WD_CHAOS_MONKEY wd->drv_chaos_freq = 0; if ((error = sysctl_createv(&wd->nodelog, 0, NULL, NULL, CTLFLAG_READWRITE, CTLTYPE_INT, "chaos_freq", SYSCTL_DESCR("simulated bio read error rate"), NULL, 0, &wd->drv_chaos_freq, 0, CTL_HW, node->sysctl_num, CTL_CREATE, CTL_EOL)) != 0) { aprint_error_dev(dksc->sc_dev, "could not create %s.%s.chaos_freq sysctl - error %d\n", "hw", dksc->sc_xname, error); return; } wd->drv_chaos_cnt = 0; if ((error = sysctl_createv(&wd->nodelog, 0, NULL, NULL, CTLFLAG_READONLY, CTLTYPE_INT, "chaos_cnt", SYSCTL_DESCR("number of processed bio reads"), NULL, 0, &wd->drv_chaos_cnt, 0, CTL_HW, node->sysctl_num, CTL_CREATE, CTL_EOL)) != 0) { aprint_error_dev(dksc->sc_dev, "could not create %s.%s.chaos_cnt sysctl - error %d\n", "hw", dksc->sc_xname, error); return; } #endif } static void wd_sysctl_detach(struct wd_softc *wd) { sysctl_teardown(&wd->nodelog); } #ifdef ATADEBUG int wddebug(void); int wddebug(void) { struct wd_softc *wd; struct dk_softc *dksc; int unit; for (unit = 0; unit <= 3; unit++) { wd = device_lookup_private(&wd_cd, unit); if (wd == NULL) continue; dksc = &wd->sc_dksc; printf("%s fl %x bufq %p:\n", dksc->sc_xname, wd->sc_flags, bufq_peek(dksc->sc_bufq)); atachannel_debug(wd->drvp->chnl_softc); } return 0; } #endif /* ATADEBUG */
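/*
 * Illustrative sketch, not part of the driver: one way a userland tool
 * might exercise the ATAIOCCOMMAND path handled by wdioctl() and
 * wdioctlstrategy() above -- issue an ATA IDENTIFY DEVICE and read back
 * its 512-byte parameter page.  The atareq_t fields, the ATACMD_* flags
 * and the ATACMD_OK status come from <sys/ataio.h> as used in this file;
 * the device path /dev/rwd0d (raw partition letter varies by port) and
 * the numeric opcode 0xec (WDCC_IDENTIFY) are assumptions for the
 * example.  Note that wdioctlstrategy() rejects a zero timeout and a
 * datalen that is not a multiple of the sector size.
 */
#if 0	/* example only; never compiled into the kernel */
#include <sys/ataio.h>
#include <sys/ioctl.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	atareq_t req;
	unsigned char buf[512];
	int fd;

	/* Path is an assumption; any wd(4) raw partition would do. */
	if ((fd = open("/dev/rwd0d", O_RDONLY)) == -1)
		err(1, "open");

	memset(&req, 0, sizeof(req));
	req.flags = ATACMD_READ;	/* data-in command, read-only open is enough */
	req.command = 0xec;		/* IDENTIFY DEVICE (WDCC_IDENTIFY) */
	req.databuf = (char *)buf;
	req.datalen = sizeof(buf);	/* one 512-byte sector */
	req.timeout = 1000;		/* driver passes this through as milliseconds */

	if (ioctl(fd, ATAIOCCOMMAND, &req) == -1)
		err(1, "ATAIOCCOMMAND");
	if (req.retsts != ATACMD_OK)
		errx(1, "command failed, retsts %d", req.retsts);

	printf("got %zu bytes of IDENTIFY data\n", sizeof(buf));
	close(fd);
	return 0;
}
#endif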
/* $NetBSD: mm.c,v 1.24 2019/02/05 11:33:13 mrg Exp $ */ /*- * Copyright (c) 2002, 2008, 2010 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christos Zoulas, Joerg Sonnenberger and Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Special /dev/{mem,kmem,zero,null} memory devices. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: mm.c,v 1.24 2019/02/05 11:33:13 mrg Exp $"); #include "opt_compat_netbsd.h" #include <sys/param.h> #include <sys/conf.h> #include <sys/ioctl.h> #include <sys/mman.h> #include <sys/uio.h> #include <sys/termios.h> #include <dev/mm.h> #include <uvm/uvm_extern.h> static void * dev_zero_page __read_mostly; static kmutex_t dev_mem_lock __cacheline_aligned; static vaddr_t dev_mem_addr __read_mostly; static dev_type_open(mm_open); static dev_type_read(mm_readwrite); static dev_type_ioctl(mm_ioctl); static dev_type_mmap(mm_mmap); static dev_type_ioctl(mm_ioctl); const struct cdevsw mem_cdevsw = { .d_open = mm_open, .d_close = nullclose, .d_read = mm_readwrite, .d_write = mm_readwrite, .d_ioctl = mm_ioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = mm_mmap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_MPSAFE }; #ifdef pmax /* XXX */ const struct cdevsw mem_ultrix_cdevsw = { .d_open = nullopen, .d_close = nullclose, .d_read = mm_readwrite, .d_write = mm_readwrite, .d_ioctl = mm_ioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = mm_mmap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_MPSAFE }; #endif static int mm_open(dev_t dev, int flag, int mode, struct lwp *l) { #ifdef __HAVE_MM_MD_OPEN int error; if ((error = mm_md_open(dev, flag, mode, l)) != 0) return error; #endif l->l_proc->p_flag |= PK_KMEM; return 0; } /* * mm_init: initialize memory device driver. */ void mm_init(void) { vaddr_t pg; mutex_init(&dev_mem_lock, MUTEX_DEFAULT, IPL_NONE); /* Read-only zero-page. */ pg = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); KASSERT(pg != 0); pmap_protect(pmap_kernel(), pg, pg + PAGE_SIZE, VM_PROT_READ); pmap_update(pmap_kernel()); dev_zero_page = (void *)pg; #ifndef __HAVE_MM_MD_CACHE_ALIASING /* KVA for mappings during I/O. */ dev_mem_addr = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY|UVM_KMF_WAITVA); KASSERT(dev_mem_addr != 0); #else dev_mem_addr = 0; #endif } /* * dev_mem_getva: get a special virtual address. If architecture requires, * allocate VA according to PA, which avoids cache-aliasing issues. Use a * constant, general mapping address otherwise. */ static inline vaddr_t dev_mem_getva(paddr_t pa, int color) { #ifdef __HAVE_MM_MD_CACHE_ALIASING return uvm_km_alloc(kernel_map, PAGE_SIZE, color & uvmexp.colormask, UVM_KMF_VAONLY | UVM_KMF_WAITVA | UVM_KMF_COLORMATCH); #else return dev_mem_addr; #endif } static inline void dev_mem_relva(paddr_t pa, vaddr_t va) { #ifdef __HAVE_MM_MD_CACHE_ALIASING uvm_km_free(kernel_map, va, PAGE_SIZE, UVM_KMF_VAONLY); #else KASSERT(dev_mem_addr == va); #endif } /* * dev_kmem_readwrite: helper for DEV_MEM (/dev/mem) case of R/W. */ static int dev_mem_readwrite(struct uio *uio, struct iovec *iov) { paddr_t paddr; vaddr_t vaddr; vm_prot_t prot; size_t len, offset; bool have_direct; int error; int color = 0; /* Check for wrap around. */ if ((uintptr_t)uio->uio_offset != uio->uio_offset) { return EFAULT; } paddr = uio->uio_offset & ~PAGE_MASK; prot = (uio->uio_rw == UIO_WRITE) ? VM_PROT_WRITE : VM_PROT_READ; error = mm_md_physacc(paddr, prot); if (error) { return error; } offset = uio->uio_offset & PAGE_MASK; len = MIN(uio->uio_resid, PAGE_SIZE - offset); #ifdef __HAVE_MM_MD_CACHE_ALIASING have_direct = mm_md_page_color(paddr, &color); #else have_direct = true; color = 0; #endif #ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS /* Is physical address directly mapped? Return VA. 
*/ if (have_direct) have_direct = mm_md_direct_mapped_phys(paddr, &vaddr); #else vaddr = 0; have_direct = false; #endif if (!have_direct) { /* Get a special virtual address. */ const vaddr_t va = dev_mem_getva(paddr, color); /* Map selected KVA to physical address. */ mutex_enter(&dev_mem_lock); pmap_kenter_pa(va, paddr, prot, 0); pmap_update(pmap_kernel()); /* Perform I/O. */ vaddr = va + offset; error = uiomove((void *)vaddr, len, uio); /* Unmap, flush before unlock. */ pmap_kremove(va, PAGE_SIZE); pmap_update(pmap_kernel()); mutex_exit(&dev_mem_lock); /* "Release" the virtual address. */ dev_mem_relva(paddr, va); } else { /* Direct map, just perform I/O. */ vaddr += offset; error = uiomove((void *)vaddr, len, uio); } return error; } /* * dev_kmem_readwrite: helper for DEV_KMEM (/dev/kmem) case of R/W. */ static int dev_kmem_readwrite(struct uio *uio, struct iovec *iov) { void *addr; size_t len, offset; vm_prot_t prot; int error; bool md_kva; /* Check for wrap around. */ addr = (void *)(intptr_t)uio->uio_offset; if ((uintptr_t)addr != uio->uio_offset) { return EFAULT; } /* * Handle non-page aligned offset. * Otherwise, we operate in page-by-page basis. */ offset = uio->uio_offset & PAGE_MASK; len = MIN(uio->uio_resid, PAGE_SIZE - offset); prot = (uio->uio_rw == UIO_WRITE) ? VM_PROT_WRITE : VM_PROT_READ; md_kva = false; #ifdef __HAVE_MM_MD_DIRECT_MAPPED_IO paddr_t paddr; /* MD case: is this is a directly mapped address? */ if (mm_md_direct_mapped_io(addr, &paddr)) { /* If so, validate physical address. */ error = mm_md_physacc(paddr, prot); if (error) { return error; } md_kva = true; } #endif if (!md_kva) { bool checked = false; #ifdef __HAVE_MM_MD_KERNACC /* MD check for the address. */ error = mm_md_kernacc(addr, prot, &checked); if (error) { return error; } #endif /* UVM check for the address (unless MD indicated to not). */ if (!checked && !uvm_kernacc(addr, len, prot)) { return EFAULT; } } error = uiomove(addr, len, uio); return error; } /* * dev_zero_readwrite: helper for DEV_ZERO (/dev/null) case of R/W. */ static inline int dev_zero_readwrite(struct uio *uio, struct iovec *iov) { size_t len; /* Nothing to do for the write case. */ if (uio->uio_rw == UIO_WRITE) { uio->uio_resid = 0; return 0; } /* * Read in page-by-page basis, caller will continue. * Cut appropriately for a single/last-iteration cases. */ len = MIN(iov->iov_len, PAGE_SIZE); return uiomove(dev_zero_page, len, uio); } /* * mm_readwrite: general memory R/W function. */ static int mm_readwrite(dev_t dev, struct uio *uio, int flags) { struct iovec *iov; int error; #ifdef __HAVE_MM_MD_READWRITE /* If defined - there are extra MD cases. */ switch (minor(dev)) { case DEV_MEM: case DEV_KMEM: case DEV_NULL: case DEV_ZERO: #if defined(COMPAT_16) && defined(__arm) case _DEV_ZERO_oARM: #endif break; default: return mm_md_readwrite(dev, uio); } #endif error = 0; while (uio->uio_resid > 0 && error == 0) { iov = uio->uio_iov; if (iov->iov_len == 0) { /* Processed; next I/O vector. */ uio->uio_iov++; uio->uio_iovcnt--; KASSERT(uio->uio_iovcnt >= 0); continue; } /* Helper functions will process in page-by-page basis. */ switch (minor(dev)) { case DEV_MEM: error = dev_mem_readwrite(uio, iov); break; case DEV_KMEM: error = dev_kmem_readwrite(uio, iov); break; case DEV_NULL: if (uio->uio_rw == UIO_WRITE) { uio->uio_resid = 0; } /* Break directly out of the loop. 
*/ return 0; case DEV_FULL: if (uio->uio_rw == UIO_WRITE) { return ENOSPC; } #if defined(COMPAT_16) && defined(__arm) /* FALLTHROUGH */ case _DEV_ZERO_oARM: #endif /* FALLTHROUGH */ case DEV_ZERO: error = dev_zero_readwrite(uio, iov); break; default: error = ENXIO; break; } } return error; } /* * mm_mmap: general mmap() handler. */ static paddr_t mm_mmap(dev_t dev, off_t off, int acc) { vm_prot_t prot; #ifdef __HAVE_MM_MD_MMAP /* If defined - there are extra mmap() MD cases. */ switch (minor(dev)) { case DEV_MEM: case DEV_KMEM: case DEV_NULL: #if defined(COMPAT_16) && defined(__arm) case _DEV_ZERO_oARM: #endif case DEV_ZERO: break; default: return mm_md_mmap(dev, off, acc); } #endif /* * /dev/null does not make sense, /dev/kmem is volatile and * /dev/zero is handled in mmap already. */ if (minor(dev) != DEV_MEM) { return -1; } prot = 0; if (acc & PROT_EXEC) prot |= VM_PROT_EXECUTE; if (acc & PROT_READ) prot |= VM_PROT_READ; if (acc & PROT_WRITE) prot |= VM_PROT_WRITE; /* Validate the physical address. */ if (mm_md_physacc(off, prot) != 0) { return -1; } return off >> PGSHIFT; } static int mm_ioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { switch (cmd) { case FIONBIO: /* We never block anyway. */ return 0; case FIOSETOWN: case FIOGETOWN: case TIOCGPGRP: case TIOCSPGRP: case TIOCGETA: return ENOTTY; case FIOASYNC: if ((*(int *)data) == 0) { return 0; } /* FALLTHROUGH */ default: return EOPNOTSUPP; } }
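
The two entry points above are easiest to see from the consumer side. The following userland sketch is not part of mm.c; it only illustrates, under the assumption of a page-aligned physical address 0x1000 and sufficient privilege/securelevel, how a read() is served one page per iteration by dev_mem_readwrite() and how mm_mmap() expects the mmap() offset to itself be a physical address (it returns off >> PGSHIFT).

/* Hypothetical userland sketch: exercising /dev/mem via pread() and mmap().
 * The physical address used here is purely illustrative. */
#include <sys/mman.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	const off_t pa = 0x1000;	/* hypothetical page-aligned physical address */
	unsigned char buf[256];
	void *p;
	int fd;

	if ((fd = open("/dev/mem", O_RDONLY)) == -1)
		err(EXIT_FAILURE, "open /dev/mem");

	/* Served by dev_mem_readwrite(): I/O proceeds at most a page at a time. */
	if (pread(fd, buf, sizeof(buf), pa) == -1)
		err(EXIT_FAILURE, "pread");

	/*
	 * Served by mm_mmap(): it returns off >> PGSHIFT, so the offset
	 * handed to mmap() must itself be a physical address.
	 */
	p = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, fd, pa);
	if (p == MAP_FAILED)
		err(EXIT_FAILURE, "mmap");
	printf("byte at pa 0x%jx: 0x%02x\n", (uintmax_t)pa,
	    ((unsigned char *)p)[0]);

	munmap(p, getpagesize());
	close(fd);
	return EXIT_SUCCESS;
}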
/* $NetBSD: random.c,v 1.10 2021/12/28 13:22:43 riastradh Exp $ */ /*- * Copyright (c) 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * /dev/random, /dev/urandom -- stateless version * * For short reads from /dev/urandom, up to 256 bytes, read from a * per-CPU NIST Hash_DRBG instance that is reseeded as soon as the * system has enough entropy. * * For all other reads, instantiate a fresh NIST Hash_DRBG from * the global entropy pool, and draw from it. * * Each read is independent; there is no per-open state. * Concurrent reads from the same open run in parallel. * * Reading from /dev/random may block until entropy is available. * Either device may return short reads if interrupted.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: random.c,v 1.10 2021/12/28 13:22:43 riastradh Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/atomic.h> #include <sys/conf.h> #include <sys/cprng.h> #include <sys/entropy.h> #include <sys/errno.h> #include <sys/event.h> #include <sys/fcntl.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/lwp.h> #include <sys/poll.h> #include <sys/random.h> #include <sys/rnd.h> #include <sys/rndsource.h> #include <sys/signalvar.h> #include <sys/systm.h> #include <sys/vnode.h> /* IO_NDELAY */ #include "ioconf.h" static dev_type_open(random_open); static dev_type_close(random_close); static dev_type_ioctl(random_ioctl); static dev_type_poll(random_poll); static dev_type_kqfilter(random_kqfilter); static dev_type_read(random_read); static dev_type_write(random_write); const struct cdevsw rnd_cdevsw = { .d_open = random_open, .d_close = random_close, .d_read = random_read, .d_write = random_write, .d_ioctl = random_ioctl, .d_stop = nostop, .d_tty = notty, .d_poll = random_poll, .d_mmap = nommap, .d_kqfilter = random_kqfilter, .d_discard = nodiscard, .d_flag = D_OTHER|D_MPSAFE, }; #define RANDOM_BUFSIZE 512 /* XXX pulled from arse */ /* Entropy source for writes to /dev/random and /dev/urandom */ static krndsource_t user_rndsource; void rndattach(int num) { rnd_attach_source(&user_rndsource, "/dev/random", RND_TYPE_UNKNOWN, RND_FLAG_COLLECT_VALUE); } static int random_open(dev_t dev, int flags, int fmt, struct lwp *l) { /* Validate minor. */ switch (minor(dev)) { case RND_DEV_RANDOM: case RND_DEV_URANDOM: break; default: return ENXIO; } return 0; } static int random_close(dev_t dev, int flags, int fmt, struct lwp *l) { /* Success! */ return 0; } static int random_ioctl(dev_t dev, unsigned long cmd, void *data, int flag, struct lwp *l) { /* * No non-blocking/async options; otherwise defer to * entropy_ioctl. */ switch (cmd) { case FIONBIO: case FIOASYNC: return 0; default: return entropy_ioctl(cmd, data); } } static int random_poll(dev_t dev, int events, struct lwp *l) { /* /dev/random may block; /dev/urandom is always ready. */ switch (minor(dev)) { case RND_DEV_RANDOM: return entropy_poll(events); case RND_DEV_URANDOM: return events & (POLLIN|POLLRDNORM | POLLOUT|POLLWRNORM); default: return 0; } } static int random_kqfilter(dev_t dev, struct knote *kn) { /* Validate the event filter. */ switch (kn->kn_filter) { case EVFILT_READ: case EVFILT_WRITE: break; default: return EINVAL; } /* /dev/random may block; /dev/urandom never does. */ switch (minor(dev)) { case RND_DEV_RANDOM: if (kn->kn_filter == EVFILT_READ) return entropy_kqfilter(kn); /* FALLTHROUGH */ case RND_DEV_URANDOM: kn->kn_fop = &seltrue_filtops; return 0; default: return ENXIO; } } /* * random_read(dev, uio, flags) * * Generate data from a PRNG seeded from the entropy pool. * * - If /dev/random, block until we have full entropy, or fail * with EWOULDBLOCK, and if `depleting' entropy, return at most * the entropy pool's capacity at once. * * - If /dev/urandom, generate data from whatever is in the * entropy pool now. * * On interrupt, return a short read, but not shorter than 256 * bytes (actually, no shorter than RANDOM_BUFSIZE bytes, which is * 512 for hysterical raisins). */ static int random_read(dev_t dev, struct uio *uio, int flags) { int gflags; /* Set the appropriate GRND_* mode. 
*/ switch (minor(dev)) { case RND_DEV_RANDOM: gflags = GRND_RANDOM; break; case RND_DEV_URANDOM: gflags = GRND_INSECURE; break; default: return ENXIO; } /* * Set GRND_NONBLOCK if the user requested IO_NDELAY (i.e., the * file was opened with O_NONBLOCK). */ if (flags & IO_NDELAY) gflags |= GRND_NONBLOCK; /* Defer to getrandom. */ return dogetrandom(uio, gflags); } /* * random_write(dev, uio, flags) * * Enter data from uio into the entropy pool. * * Assume privileged users provide full entropy, and unprivileged * users provide no entropy. If you have a nonuniform source of * data with n bytes of min-entropy, hash it with an XOF like * SHAKE128 into exactly n bytes first. */ static int random_write(dev_t dev, struct uio *uio, int flags) { kauth_cred_t cred = kauth_cred_get(); uint8_t *buf; bool privileged = false, any = false; int error = 0; /* Verify user's authorization to affect the entropy pool. */ error = kauth_authorize_device(cred, KAUTH_DEVICE_RND_ADDDATA, NULL, NULL, NULL, NULL); if (error) return error; /* * Check whether user is privileged. If so, assume user * furnishes full-entropy data; if not, accept user's data but * assume it has zero entropy when we do accounting. If you * want to specify less entropy, use ioctl(RNDADDDATA). */ if (kauth_authorize_device(cred, KAUTH_DEVICE_RND_ADDDATA_ESTIMATE, NULL, NULL, NULL, NULL) == 0) privileged = true; /* Get a buffer for transfers. */ buf = kmem_alloc(RANDOM_BUFSIZE, KM_SLEEP); /* Consume data. */ while (uio->uio_resid) { size_t n = MIN(uio->uio_resid, RANDOM_BUFSIZE); /* Transfer n bytes in and enter them into the pool. */ error = uiomove(buf, n, uio); if (error) break; rnd_add_data(&user_rndsource, buf, n, privileged ? n*NBBY : 0); any = true; /* Now's a good time to yield if needed. */ preempt_point(); /* Check for interruption. */ if (__predict_false(curlwp->l_flag & LW_PENDSIG) && sigispending(curlwp, 0)) { error = EINTR; break; } } /* Zero the buffer and free it. */ explicit_memset(buf, 0, RANDOM_BUFSIZE); kmem_free(buf, RANDOM_BUFSIZE); /* If we added anything, consolidate entropy now. */ if (any) entropy_consolidate(); return error; }
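
The read-side semantics implemented by random_read() can be summarized from userland: /dev/urandom (GRND_INSECURE) never blocks, while /dev/random opened with O_NONBLOCK (GRND_RANDOM | GRND_NONBLOCK) fails with EWOULDBLOCK/EAGAIN until the entropy pool is full. The program below is only an illustrative sketch, not part of random.c.

/* Hypothetical userland sketch of the /dev/random vs /dev/urandom semantics. */
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	unsigned char buf[32];
	int fd;

	/* GRND_INSECURE path: always returns data, never blocks. */
	if ((fd = open("/dev/urandom", O_RDONLY)) == -1)
		err(EXIT_FAILURE, "open /dev/urandom");
	if (read(fd, buf, sizeof(buf)) == -1)
		err(EXIT_FAILURE, "read /dev/urandom");
	close(fd);

	/* GRND_RANDOM|GRND_NONBLOCK path: may fail until full entropy. */
	if ((fd = open("/dev/random", O_RDONLY | O_NONBLOCK)) == -1)
		err(EXIT_FAILURE, "open /dev/random");
	if (read(fd, buf, sizeof(buf)) == -1 && errno == EAGAIN)
		printf("entropy pool not yet full; try again later\n");
	close(fd);
	return EXIT_SUCCESS;
}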
/* $NetBSD: msdosfs_vfsops.c,v 1.138 2022/04/16 07:58:21 hannken Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software.
* * October 1992 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: msdosfs_vfsops.c,v 1.138 2022/04/16 07:58:21 hannken Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/namei.h> #include <sys/proc.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> /* XXX */ /* defines v_rdev */ #include <sys/mount.h> #include <sys/buf.h> #include <sys/file.h> #include <sys/device.h> #include <sys/disklabel.h> #include <sys/disk.h> #include <sys/ioctl.h> #include <sys/malloc.h> #include <sys/dirent.h> #include <sys/stat.h> #include <sys/conf.h> #include <sys/kauth.h> #include <sys/module.h> #include <fs/msdosfs/bpb.h> #include <fs/msdosfs/bootsect.h> #include <fs/msdosfs/direntry.h> #include <fs/msdosfs/denode.h> #include <fs/msdosfs/msdosfsmount.h> #include <fs/msdosfs/fat.h> MODULE(MODULE_CLASS_VFS, msdos, NULL); #ifdef MSDOSFS_DEBUG #define DPRINTF(fmt, ...) uprintf("%s(): " fmt "\n", __func__, ##__VA_ARGS__) #else #define DPRINTF(fmt, ...) #endif #define GEMDOSFS_BSIZE 512 #define MSDOSFS_NAMEMAX(pmp) \ (pmp)->pm_flags & MSDOSFSMNT_LONGNAME ? WIN_MAXLEN : 12 int msdosfs_mountfs(struct vnode *, struct mount *, struct lwp *, struct msdosfs_args *); static int update_mp(struct mount *, struct msdosfs_args *); MALLOC_JUSTDEFINE(M_MSDOSFSMNT, "MSDOSFS mount", "MSDOS FS mount structure"); MALLOC_JUSTDEFINE(M_MSDOSFSFAT, "MSDOSFS FAT", "MSDOS FS FAT table"); MALLOC_JUSTDEFINE(M_MSDOSFSTMP, "MSDOSFS temp", "MSDOS FS temp. structures"); extern const struct vnodeopv_desc msdosfs_vnodeop_opv_desc; const struct vnodeopv_desc * const msdosfs_vnodeopv_descs[] = { &msdosfs_vnodeop_opv_desc, NULL, }; struct vfsops msdosfs_vfsops = { .vfs_name = MOUNT_MSDOS, .vfs_min_mount_data = sizeof (struct msdosfs_args), .vfs_mount = msdosfs_mount, .vfs_start = msdosfs_start, .vfs_unmount = msdosfs_unmount, .vfs_root = msdosfs_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = msdosfs_statvfs, .vfs_sync = msdosfs_sync, .vfs_vget = msdosfs_vget, .vfs_loadvnode = msdosfs_loadvnode, .vfs_fhtovp = msdosfs_fhtovp, .vfs_vptofh = msdosfs_vptofh, .vfs_init = msdosfs_init, .vfs_reinit = msdosfs_reinit, .vfs_done = msdosfs_done, .vfs_mountroot = msdosfs_mountroot, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = msdosfs_vnodeopv_descs }; SYSCTL_SETUP(msdosfs_sysctl_setup, "msdosfs sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "msdosfs", SYSCTL_DESCR("MS-DOS file system"), NULL, 0, NULL, 0, CTL_VFS, 4, CTL_EOL); /* * XXX the "4" above could be dynamic, thereby eliminating one * more instance of the "number to vfs" mapping problem, but * "4" is the order as taken from sys/mount.h */ } static int msdos_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&msdosfs_vfsops); if (error != 0) break; break; case MODULE_CMD_FINI: error = vfs_detach(&msdosfs_vfsops); if (error != 0) break; break; default: error = ENOTTY; break; } return (error); } static int update_mp(struct mount *mp, struct msdosfs_args *argp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error; pmp->pm_gid = argp->gid; pmp->pm_uid = argp->uid; pmp->pm_mask = argp->mask & ALLPERMS; pmp->pm_dirmask = argp->dirmask & 
ALLPERMS; pmp->pm_gmtoff = argp->gmtoff; pmp->pm_flags |= argp->flags & MSDOSFSMNT_MNTOPT; /* * GEMDOS knows nothing about win95 long filenames */ if (pmp->pm_flags & MSDOSFSMNT_GEMDOSFS) pmp->pm_flags |= MSDOSFSMNT_NOWIN95; if (pmp->pm_flags & MSDOSFSMNT_NOWIN95) pmp->pm_flags |= MSDOSFSMNT_SHORTNAME; else if (!(pmp->pm_flags & (MSDOSFSMNT_SHORTNAME | MSDOSFSMNT_LONGNAME))) { struct vnode *rtvp; /* * Try to divine whether to support Win'95 long filenames */ if (FAT32(pmp)) pmp->pm_flags |= MSDOSFSMNT_LONGNAME; else { error = msdosfs_root(mp, LK_EXCLUSIVE, &rtvp); if (error != 0) return error; pmp->pm_flags |= msdosfs_findwin95(VTODE(rtvp)) ? MSDOSFSMNT_LONGNAME : MSDOSFSMNT_SHORTNAME; vput(rtvp); } } mp->mnt_stat.f_namemax = MSDOSFS_NAMEMAX(pmp); return 0; } int msdosfs_mountroot(void) { struct mount *mp; struct lwp *l = curlwp; /* XXX */ int error; struct msdosfs_args args; if (device_class(root_device) != DV_DISK) return (ENODEV); if ((error = vfs_rootmountalloc(MOUNT_MSDOS, "root_device", &mp))) { vrele(rootvp); return (error); } args.flags = MSDOSFSMNT_VERSIONED; args.uid = 0; args.gid = 0; args.mask = 0777; args.version = MSDOSFSMNT_VERSION; args.dirmask = 0777; if ((error = msdosfs_mountfs(rootvp, mp, l, &args)) != 0) { vfs_unbusy(mp); vfs_rele(mp); return (error); } if ((error = update_mp(mp, &args)) != 0) { (void)msdosfs_unmount(mp, 0); vfs_unbusy(mp); vfs_rele(mp); vrele(rootvp); return (error); } mountlist_append(mp); (void)msdosfs_statvfs(mp, &mp->mnt_stat); vfs_unbusy(mp); return (0); } /* * mp - path - addr in user space of mount point (ie /usr or whatever) * data - addr in user space of mount params including the name of the block * special file to treat as a filesystem. */ int msdosfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; struct vnode *devvp; /* vnode for blk device to mount */ struct msdosfs_args *args = data; /* holds data from mount request */ /* msdosfs specific mount control block */ struct msdosfsmount *pmp = NULL; int error, flags; mode_t accessmode; if (args == NULL) return EINVAL; if (*data_len < sizeof *args) return EINVAL; if (mp->mnt_flag & MNT_GETARGS) { pmp = VFSTOMSDOSFS(mp); if (pmp == NULL) return EIO; args->fspec = NULL; args->uid = pmp->pm_uid; args->gid = pmp->pm_gid; args->mask = pmp->pm_mask; args->flags = pmp->pm_flags; args->version = MSDOSFSMNT_VERSION; args->dirmask = pmp->pm_dirmask; args->gmtoff = pmp->pm_gmtoff; *data_len = sizeof *args; return 0; } /* * If not versioned (i.e. using old mount_msdos(8)), fill in * the additional structure items with suitable defaults. */ if ((args->flags & MSDOSFSMNT_VERSIONED) == 0) { args->version = 1; args->dirmask = args->mask; } /* * Reset GMT offset for pre-v3 mount structure args. */ if (args->version < 3) args->gmtoff = 0; /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { pmp = VFSTOMSDOSFS(mp); error = 0; if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) && (mp->mnt_flag & MNT_RDONLY)) { flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, NULLVP, flags); } if (!error && (mp->mnt_flag & MNT_RELOAD)) /* not yet implemented */ error = EOPNOTSUPP; if (error) { DPRINTF("vflush %d", error); return (error); } if ((pmp->pm_flags & MSDOSFSMNT_RONLY) && (mp->mnt_iflag & IMNT_WANTRDWR)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. 
* * Permission to update a mount is checked higher, so * here we presume updating the mount is okay (for * example, as far as securelevel goes) which leaves us * with the normal check. */ devvp = pmp->pm_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_DEVICE, mp, devvp, KAUTH_ARG(VREAD | VWRITE)); VOP_UNLOCK(devvp); DPRINTF("KAUTH_REQ_SYSTEM_MOUNT_DEVICE %d", error); if (error) return (error); pmp->pm_flags &= ~MSDOSFSMNT_RONLY; } if (args->fspec == NULL) { DPRINTF("missing fspec"); return EINVAL; } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ error = namei_simple_user(args->fspec, NSM_FOLLOW_NOEMULROOT, &devvp); if (error != 0) { DPRINTF("namei %d", error); return (error); } if (devvp->v_type != VBLK) { DPRINTF("not block"); vrele(devvp); return (ENOTBLK); } if (bdevsw_lookup(devvp->v_rdev) == NULL) { DPRINTF("no block switch"); vrele(devvp); return (ENXIO); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_DEVICE, mp, devvp, KAUTH_ARG(accessmode)); VOP_UNLOCK(devvp); if (error) { DPRINTF("KAUTH_REQ_SYSTEM_MOUNT_DEVICE %d", error); vrele(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { int xflags; if (mp->mnt_flag & MNT_RDONLY) xflags = FREAD; else xflags = FREAD|FWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_OPEN(devvp, xflags, FSCRED); VOP_UNLOCK(devvp); if (error) { DPRINTF("VOP_OPEN %d", error); goto fail; } error = msdosfs_mountfs(devvp, mp, l, args); if (error) { DPRINTF("msdosfs_mountfs %d", error); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_CLOSE(devvp, xflags, NOCRED); VOP_UNLOCK(devvp); goto fail; } #ifdef MSDOSFS_DEBUG /* only needed for the printf below */ pmp = VFSTOMSDOSFS(mp); #endif } else { vrele(devvp); if (devvp != pmp->pm_devvp) { DPRINTF("devvp %p pmp %p", devvp, pmp->pm_devvp); return (EINVAL); /* needs translation */ } } if ((error = update_mp(mp, args)) != 0) { msdosfs_unmount(mp, MNT_FORCE); DPRINTF("update_mp %d", error); return error; } #ifdef MSDOSFS_DEBUG printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap); #endif return set_statvfs_info(path, UIO_USERSPACE, args->fspec, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); fail: vrele(devvp); return (error); } int msdosfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l, struct msdosfs_args *argp) { struct msdosfsmount *pmp; struct buf *bp; dev_t dev = devvp->v_rdev; union bootsector *bsp; struct byte_bpb33 *b33; struct byte_bpb50 *b50; struct byte_bpb710 *b710; uint8_t SecPerClust; int ronly, error, BlkPerSec; uint64_t psize; unsigned secsize; u_long fatbytes, fatblocksecs; /* Flush out any old buffers remaining from a previous use. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = vinvalbuf(devvp, V_SAVE, l->l_cred, l, 0, 0); VOP_UNLOCK(devvp); if (error) return (error); ronly = (mp->mnt_flag & MNT_RDONLY) != 0; bp = NULL; /* both used in error_exit */ pmp = NULL; error = getdisksize(devvp, &psize, &secsize); if (error) { if (argp->flags & MSDOSFSMNT_GEMDOSFS) goto error_exit; /* ok, so it failed. 
we most likely don't need the info */ secsize = DEV_BSIZE; psize = 0; error = 0; } if (secsize < DEV_BSIZE) { DPRINTF("Invalid block secsize (%d < DEV_BSIZE)", secsize); error = EINVAL; goto error_exit; } if (argp->flags & MSDOSFSMNT_GEMDOSFS) { if (secsize != GEMDOSFS_BSIZE) { DPRINTF("Invalid block secsize %d for GEMDOS", secsize); error = EINVAL; goto error_exit; } } /* * Read the boot sector of the filesystem, and then check the * boot signature. If not a dos boot sector then error out. */ if (secsize < sizeof(*b50)) { DPRINTF("50 bootsec %u\n", secsize); error = EINVAL; goto error_exit; } if ((error = bread(devvp, 0, secsize, 0, &bp)) != 0) goto error_exit; bsp = (union bootsector *)bp->b_data; b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB; b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB; b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB; #if 0 /* * Some FAT partition, for example Raspberry Pi Pico's * USB mass storage, does not have exptected BOOTSIGs. * According to FreeBSD's comment, some PC-9800/9821 * FAT floppy disks have similar problems. */ if (!(argp->flags & MSDOSFSMNT_GEMDOSFS)) { if (bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) { DPRINTF("bootsig0 %d bootsig1 %d", bsp->bs50.bsBootSectSig0, bsp->bs50.bsBootSectSig1); error = EINVAL; goto error_exit; } } #endif pmp = malloc(sizeof(*pmp), M_MSDOSFSMNT, M_WAITOK|M_ZERO); pmp->pm_mountp = mp; /* * Compute several useful quantities from the bpb in the * bootsector. Copy in the dos 5 variant of the bpb then fix up * the fields that are different between dos 5 and dos 3.3. */ SecPerClust = b50->bpbSecPerClust; pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec); pmp->pm_ResSectors = getushort(b50->bpbResSectors); pmp->pm_FATs = b50->bpbFATs; pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts); pmp->pm_Sectors = getushort(b50->bpbSectors); pmp->pm_FATsecs = getushort(b50->bpbFATsecs); pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack); pmp->pm_Heads = getushort(b50->bpbHeads); pmp->pm_Media = b50->bpbMedia; if (pmp->pm_Sectors == 0) { pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs); pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors); } else { if (secsize < sizeof(*b33)) { DPRINTF("33 bootsec %u\n", secsize); error = EINVAL; goto error_exit; } pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs); pmp->pm_HugeSectors = pmp->pm_Sectors; } /* * Sanity checks, from the FAT specification: * - sectors per cluster: >= 1, power of 2 * - logical sector size: >= 1, power of 2 * - cluster size: <= max FS block size * - number of sectors: >= 1 */ if ((SecPerClust == 0) || !powerof2(SecPerClust) || (pmp->pm_BytesPerSec == 0) || !powerof2(pmp->pm_BytesPerSec) || (SecPerClust * pmp->pm_BytesPerSec > MAXBSIZE) || (pmp->pm_HugeSectors == 0)) { DPRINTF("consistency checks"); error = EINVAL; goto error_exit; } if (!(argp->flags & MSDOSFSMNT_GEMDOSFS) && (pmp->pm_SecPerTrack > 63)) { DPRINTF("SecPerTrack %d", pmp->pm_SecPerTrack); error = EINVAL; goto error_exit; } if (pmp->pm_RootDirEnts == 0) { if (secsize < sizeof(*b710)) { DPRINTF("710 bootsec %u\n", secsize); error = EINVAL; goto error_exit; } unsigned short FSVers = getushort(b710->bpbFSVers); unsigned short ExtFlags = getushort(b710->bpbExtFlags); /* * Some say that bsBootSectSig[23] must be zero, but * Windows does not require this and some digital cameras * do not set these to zero. Therefore, do not insist. 
*/ if (pmp->pm_Sectors || pmp->pm_FATsecs || FSVers) { DPRINTF("Sectors %d FATsecs %lu FSVers %d", pmp->pm_Sectors, pmp->pm_FATsecs, FSVers); error = EINVAL; goto error_exit; } pmp->pm_fatmask = FAT32_MASK; pmp->pm_fatmult = 4; pmp->pm_fatdiv = 1; pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs); /* Mirroring is enabled if the FATMIRROR bit is not set. */ if ((ExtFlags & FATMIRROR) == 0) pmp->pm_flags |= MSDOSFS_FATMIRROR; else pmp->pm_curfat = ExtFlags & FATNUM; } else pmp->pm_flags |= MSDOSFS_FATMIRROR; if (argp->flags & MSDOSFSMNT_GEMDOSFS) { if (FAT32(pmp)) { /* GEMDOS doesn't know FAT32. */ DPRINTF("FAT32 for GEMDOS"); error = EINVAL; goto error_exit; } /* * Check a few values (could do some more): * - logical sector size: >= block size * - number of sectors: <= size of partition */ if ((pmp->pm_BytesPerSec < GEMDOSFS_BSIZE) || (pmp->pm_HugeSectors * (pmp->pm_BytesPerSec / GEMDOSFS_BSIZE) > psize)) { DPRINTF("consistency checks for GEMDOS"); error = EINVAL; goto error_exit; } /* * XXX - Many parts of the msdosfs driver seem to assume that * the number of bytes per logical sector (BytesPerSec) will * always be the same as the number of bytes per disk block * Let's pretend it is. */ BlkPerSec = pmp->pm_BytesPerSec / GEMDOSFS_BSIZE; pmp->pm_BytesPerSec = GEMDOSFS_BSIZE; pmp->pm_HugeSectors *= BlkPerSec; pmp->pm_HiddenSects *= BlkPerSec; pmp->pm_ResSectors *= BlkPerSec; pmp->pm_Sectors *= BlkPerSec; pmp->pm_FATsecs *= BlkPerSec; SecPerClust *= BlkPerSec; } /* Check that fs has nonzero FAT size */ if (pmp->pm_FATsecs == 0) { DPRINTF("FATsecs is 0"); error = EINVAL; goto error_exit; } pmp->pm_fatblk = pmp->pm_ResSectors; if (FAT32(pmp)) { if (secsize < sizeof(*b710)) { DPRINTF("710 bootsec %u\n", secsize); error = EINVAL; goto error_exit; } pmp->pm_rootdirblk = getulong(b710->bpbRootClust); pmp->pm_firstcluster = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_fsinfo = getushort(b710->bpbFSInfo); } else { pmp->pm_rootdirblk = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_rootdirsize = (pmp->pm_RootDirEnts * sizeof(struct direntry) + pmp->pm_BytesPerSec - 1) / pmp->pm_BytesPerSec;/* in sectors */ pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize; } pmp->pm_nmbrofclusters = (pmp->pm_HugeSectors - pmp->pm_firstcluster) / SecPerClust; pmp->pm_maxcluster = pmp->pm_nmbrofclusters + 1; pmp->pm_fatsize = pmp->pm_FATsecs * pmp->pm_BytesPerSec; if (argp->flags & MSDOSFSMNT_GEMDOSFS) { if (pmp->pm_nmbrofclusters <= (0xff0 - 2)) { pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } else if (pmp->pm_fatmask == 0) { if (pmp->pm_maxcluster <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) { /* * This will usually be a floppy disk. This size makes * sure that one FAT entry will not be split across * multiple blocks. 
*/ pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } /* validate cluster count against FAT */ if ((pmp->pm_maxcluster & pmp->pm_fatmask) != pmp->pm_maxcluster) { DPRINTF("maxcluster %lu outside of mask %#lx\n", pmp->pm_maxcluster, pmp->pm_fatmask); error = EINVAL; goto error_exit; } /* validate FAT size */ fatbytes = (pmp->pm_maxcluster+1) * pmp->pm_fatmult / pmp->pm_fatdiv; fatblocksecs = howmany(fatbytes, pmp->pm_BytesPerSec); if (pmp->pm_FATsecs < fatblocksecs) { DPRINTF("FATsecs %lu < real %lu\n", pmp->pm_FATsecs, fatblocksecs); error = EINVAL; goto error_exit; } if (FAT12(pmp)) { /* * limit block size to what is needed to read a FAT block * to not exceed MAXBSIZE */ pmp->pm_fatblocksec = uimin(3, fatblocksecs); pmp->pm_fatblocksize = pmp->pm_fatblocksec * pmp->pm_BytesPerSec; } else { pmp->pm_fatblocksize = MAXBSIZE; pmp->pm_fatblocksec = pmp->pm_fatblocksize / pmp->pm_BytesPerSec; } pmp->pm_bnshift = ffs(pmp->pm_BytesPerSec) - 1; /* * Compute mask and shift value for isolating cluster relative byte * offsets and cluster numbers from a file offset. */ pmp->pm_bpcluster = SecPerClust * pmp->pm_BytesPerSec; pmp->pm_crbomask = pmp->pm_bpcluster - 1; pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1; /* * Check for valid cluster size * must be a power of 2 */ if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) { DPRINTF("bpcluster %lu cnshift %lu", pmp->pm_bpcluster, pmp->pm_cnshift); error = EINVAL; goto error_exit; } /* * Cluster size must be within limit of MAXBSIZE. * Many FAT filesystems will not have clusters larger than * 32KiB due to limits in Windows versions before Vista. */ if (pmp->pm_bpcluster > MAXBSIZE) { DPRINTF("bpcluster %lu > MAXBSIZE %d", pmp->pm_bpcluster, MAXBSIZE); error = EINVAL; goto error_exit; } /* * Release the bootsector buffer. */ brelse(bp, BC_AGE); bp = NULL; /* * Check FSInfo. */ if (pmp->pm_fsinfo) { struct fsinfo *fp; const int rdsz = roundup(sizeof(*fp), pmp->pm_BytesPerSec); /* * XXX If the fsinfo block is stored on media with * 2KB or larger sectors, is the fsinfo structure * padded at the end or in the middle? */ if ((error = bread(devvp, de_bn2kb(pmp, pmp->pm_fsinfo), rdsz, 0, &bp)) != 0) goto error_exit; fp = (struct fsinfo *)bp->b_data; if (!memcmp(fp->fsisig1, "RRaA", 4) && !memcmp(fp->fsisig2, "rrAa", 4) && !memcmp(fp->fsisig3, "\0\0\125\252", 4) && !memcmp(fp->fsisig4, "\0\0\125\252", 4)) pmp->pm_nxtfree = getulong(fp->fsinxtfree); else pmp->pm_fsinfo = 0; brelse(bp, 0); bp = NULL; } /* * Check and validate (or perhaps invalidate?) the fsinfo structure? * XXX */ if (pmp->pm_fsinfo) { if ((pmp->pm_nxtfree == 0xffffffffUL) || (pmp->pm_nxtfree > pmp->pm_maxcluster)) pmp->pm_fsinfo = 0; } /* * Allocate memory for the bitmap of allocated clusters, and then * fill it in. */ pmp->pm_inusemap = malloc(((pmp->pm_maxcluster + N_INUSEBITS) / N_INUSEBITS) * sizeof(*pmp->pm_inusemap), M_MSDOSFSFAT, M_WAITOK); /* * fillinusemap() needs pm_devvp. */ pmp->pm_dev = dev; pmp->pm_devvp = devvp; /* * Have the inuse map filled in. */ if ((error = msdosfs_fillinusemap(pmp)) != 0) { DPRINTF("fillinusemap %d", error); goto error_exit; } /* * If they want FAT updates to be synchronous then let them suffer * the performance degradation in exchange for the on disk copy of * the FAT being correct just about all the time. I suppose this * would be a good thing to turn on if the kernel is still flakey. 
*/ if (mp->mnt_flag & MNT_SYNCHRONOUS) pmp->pm_flags |= MSDOSFSMNT_WAITONFAT; /* * Finish up. */ if (ronly) pmp->pm_flags |= MSDOSFSMNT_RONLY; else pmp->pm_fmod = 1; mp->mnt_data = pmp; mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_MSDOS); mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; mp->mnt_stat.f_namemax = MSDOSFS_NAMEMAX(pmp); mp->mnt_flag |= MNT_LOCAL; mp->mnt_iflag |= IMNT_SHRLOOKUP; mp->mnt_dev_bshift = pmp->pm_bnshift; mp->mnt_fs_bshift = pmp->pm_cnshift; /* * If we ever do quotas for DOS filesystems this would be a place * to fill in the info in the msdosfsmount structure. You dolt, * quotas on dos filesystems make no sense because files have no * owners on dos filesystems. of course there is some empty space * in the directory entry where we could put uid's and gid's. */ spec_node_setmountedfs(devvp, mp); return (0); error_exit: if (bp) brelse(bp, BC_AGE); if (pmp) { if (pmp->pm_inusemap) free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; } return (error); } int msdosfs_start(struct mount *mp, int flags) { return (0); } /* * Unmount the filesystem described by mp. */ int msdosfs_unmount(struct mount *mp, int mntflags) { struct msdosfsmount *pmp; int error, flags; flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if ((error = vflush(mp, NULLVP, flags)) != 0) return (error); pmp = VFSTOMSDOSFS(mp); if (pmp->pm_devvp->v_type != VBAD) spec_node_setmountedfs(pmp->pm_devvp, NULL); #ifdef MSDOSFS_DEBUG { struct vnode *vp = pmp->pm_devvp; printf("msdosfs_umount(): just before calling VOP_CLOSE()\n"); printf("flag %08x, usecount %d, writecount %d, holdcnt %d\n", vp->v_vflag | vp->v_iflag | vp->v_uflag, vrefcnt(vp), vp->v_writecount, vp->v_holdcnt); printf("mount %p, op %p\n", vp->v_mount, vp->v_op); printf("cleanblkhd %p, dirtyblkhd %p, numoutput %d, type %d\n", vp->v_cleanblkhd.lh_first, vp->v_dirtyblkhd.lh_first, vp->v_numoutput, vp->v_type); printf("union %p, tag %d, data[0] %08x, data[1] %08x\n", vp->v_socket, vp->v_tag, ((u_int *)vp->v_data)[0], ((u_int *)vp->v_data)[1]); } #endif vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_CLOSE(pmp->pm_devvp, pmp->pm_flags & MSDOSFSMNT_RONLY ? FREAD : FREAD|FWRITE, NOCRED); vput(pmp->pm_devvp); msdosfs_fh_destroy(pmp); free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; mp->mnt_flag &= ~MNT_LOCAL; return (0); } int msdosfs_root(struct mount *mp, int lktype, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp); #endif if ((error = msdosfs_deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, vpp)) != 0) return error; error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULL; return error; } return 0; } int msdosfs_statvfs(struct mount *mp, struct statvfs *sbp) { struct msdosfsmount *pmp; pmp = VFSTOMSDOSFS(mp); sbp->f_bsize = pmp->pm_bpcluster; sbp->f_frsize = sbp->f_bsize; sbp->f_iosize = pmp->pm_bpcluster; sbp->f_blocks = pmp->pm_nmbrofclusters; sbp->f_bfree = pmp->pm_freeclustercount; sbp->f_bavail = pmp->pm_freeclustercount; sbp->f_bresvd = 0; sbp->f_files = pmp->pm_RootDirEnts; /* XXX */ sbp->f_ffree = 0; /* what to put in here? */ sbp->f_favail = 0; /* what to put in here? 
*/ sbp->f_fresvd = 0; copy_statvfs_info(sbp, mp); return (0); } struct msdosfs_sync_ctx { int waitfor; }; static bool msdosfs_sync_selector(void *cl, struct vnode *vp) { struct msdosfs_sync_ctx *c = cl; struct denode *dep; KASSERT(mutex_owned(vp->v_interlock)); dep = VTODE(vp); if (c->waitfor == MNT_LAZY || vp->v_type == VNON || dep == NULL || (((dep->de_flag & (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0) && (LIST_EMPTY(&vp->v_dirtyblkhd) && (vp->v_iflag & VI_ONWORKLST) == 0))) return false; return true; } int msdosfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) { struct vnode *vp; struct vnode_iterator *marker; struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error, allerror = 0; struct msdosfs_sync_ctx ctx; /* * If we ever switch to not updating all of the FATs all the time, * this would be the place to update them from the first one. */ if (pmp->pm_fmod != 0) { if (pmp->pm_flags & MSDOSFSMNT_RONLY) panic("msdosfs_sync: rofs mod"); else { /* update FATs here */ } } /* * Write back each (modified) denode. */ vfs_vnode_iterator_init(mp, &marker); ctx.waitfor = waitfor; while ((vp = vfs_vnode_iterator_next(marker, msdosfs_sync_selector, &ctx))) { error = vn_lock(vp, LK_EXCLUSIVE); if (error) { vrele(vp); continue; } if ((error = VOP_FSYNC(vp, cred, waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0)) != 0) allerror = error; vput(vp); } vfs_vnode_iterator_destroy(marker); /* * Force stale file system control information to be flushed. */ vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_FSYNC(pmp->pm_devvp, cred, waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0)) != 0) allerror = error; VOP_UNLOCK(pmp->pm_devvp); return (allerror); } int msdosfs_fhtovp(struct mount *mp, struct fid *fhp, int lktype, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct defid defh; uint32_t gen; int error; if (fhp->fid_len != sizeof(struct defid)) { DPRINTF("fid_len %d %zd", fhp->fid_len, sizeof(struct defid)); return EINVAL; } memcpy(&defh, fhp, sizeof(defh)); error = msdosfs_fh_lookup(pmp, defh.defid_dirclust, defh.defid_dirofs, &gen); if (error == 0 && gen != defh.defid_gen) error = ESTALE; if (error) { *vpp = NULLVP; return error; } error = msdosfs_deget(pmp, defh.defid_dirclust, defh.defid_dirofs, vpp); if (error) { DPRINTF("deget %d", error); *vpp = NULLVP; return error; } error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULLVP; return error; } return 0; } int msdosfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) { struct msdosfsmount *pmp = VFSTOMSDOSFS(vp->v_mount); struct denode *dep; struct defid defh; int error; if (*fh_size < sizeof(struct defid)) { *fh_size = sizeof(struct defid); return E2BIG; } *fh_size = sizeof(struct defid); dep = VTODE(vp); memset(&defh, 0, sizeof(defh)); defh.defid_len = sizeof(struct defid); defh.defid_dirclust = dep->de_dirclust; defh.defid_dirofs = dep->de_diroffset; error = msdosfs_fh_enter(pmp, dep->de_dirclust, dep->de_diroffset, &defh.defid_gen); if (error == 0) memcpy(fhp, &defh, sizeof(defh)); return error; } int msdosfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { return (EOPNOTSUPP); }
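
The geometry arithmetic in msdosfs_mountfs() above is compact; the standalone sketch below (not kernel code, using illustrative FAT16 numbers: 512-byte sectors, 8 sectors per cluster, 20000 clusters) redoes it to show how the BPB values turn into the pm_bpcluster, pm_crbomask, pm_cnshift and pm_bnshift fields, and how the minimum FAT size is derived from the cluster count.

/* Standalone sketch of the msdosfs mount-time geometry arithmetic. */
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define HOWMANY(x, y)	(((x) + ((y) - 1)) / (y))

int
main(void)
{
	unsigned bytespersec = 512, secperclust = 8;	/* sample BPB values */
	unsigned long maxcluster = 20000;		/* sample cluster count */
	unsigned fatmult = 2, fatdiv = 1;		/* FAT16: 2 bytes/entry */

	unsigned long bpcluster = (unsigned long)secperclust * bytespersec;
	unsigned long crbomask = bpcluster - 1;		/* offset within cluster */
	int cnshift = ffs((int)bpcluster) - 1;		/* file offset -> cluster */
	int bnshift = ffs((int)bytespersec) - 1;	/* byte offset -> sector */

	/* Minimum FAT size implied by the cluster count. */
	unsigned long fatbytes = (maxcluster + 1) * fatmult / fatdiv;
	unsigned long fatblocksecs = HOWMANY(fatbytes, bytespersec);

	printf("bpcluster %lu crbomask %#lx cnshift %d bnshift %d\n",
	    bpcluster, crbomask, cnshift, bnshift);
	printf("FAT must cover at least %lu bytes = %lu sectors\n",
	    fatbytes, fatblocksecs);
	return 0;
}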
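
A related sketch (again illustrative, not kernel code) shows what pm_fatmult and pm_fatdiv encode: the FAT byte offset of a cluster's entry is cluster * fatmult / fatdiv, i.e. 1.5 bytes per FAT12 entry, 2 per FAT16 and 4 per FAT32. This is also why FAT12 FATs are read in blocks of at most 3 sectors: 3 * 512 bytes is a whole number of 1.5-byte entries, so no entry straddles a block boundary.

/* Standalone sketch of FAT entry addressing via fatmult/fatdiv. */
#include <stdio.h>

static unsigned long
fat_entry_byteoff(unsigned long n, unsigned fatmult, unsigned fatdiv)
{
	/* Byte offset of cluster n's entry within the FAT. */
	return n * fatmult / fatdiv;
}

int
main(void)
{
	unsigned long n = 1000;	/* sample cluster number */

	printf("FAT12 entry %lu at byte %lu\n", n, fat_entry_byteoff(n, 3, 2));
	printf("FAT16 entry %lu at byte %lu\n", n, fat_entry_byteoff(n, 2, 1));
	printf("FAT32 entry %lu at byte %lu\n", n, fat_entry_byteoff(n, 4, 1));
	return 0;
}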
/* $NetBSD: subr_prf.c,v 1.203 2023/08/29 21:23:14 andvar Exp $ */ /*- * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_prf.c 8.4 (Berkeley) 5/4/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_prf.c,v 1.203 2023/08/29 21:23:14 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_ddb.h" #include "opt_kgdb.h" #include "opt_dump.h" #include "opt_rnd_printf.h" #endif #include <sys/param.h> #include <sys/stdint.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/device.h> #include <sys/reboot.h> #include <sys/msgbuf.h> #include <sys/proc.h> #include <sys/ioctl.h> #include <sys/vnode.h> #include <sys/file.h> #include <sys/tty.h> #include <sys/tprintf.h> #include <sys/spldebug.h> #include <sys/syslog.h> #include <sys/kprintf.h> #include <sys/atomic.h> #include <sys/kernel.h> #include <sys/cpu.h> #include <sys/rndsource.h> #include <sys/kmem.h> #include <dev/cons.h> #include <net/if.h> static kmutex_t kprintf_mtx; static bool kprintf_inited = false; #ifdef KGDB #include <sys/kgdb.h> #endif #ifdef DDB #include <ddb/ddbvar.h> /* db_panic */ #include <ddb/db_output.h> /* db_printf, db_putchar prototypes */ #endif /* * defines */ #define KLOG_PRI 0x80000000 /* * local prototypes */ static void putchar(int, int, struct tty *); static void kprintf_internal(const char *, int, void *, char *, ...); /* * globals */ const char *panicstr; /* arg to first call to panic (used as a flag to indicate that panic has already been called.
*/ struct cpu_info *paniccpu; /* cpu that first panicked */ long panicstart, panicend; /* position in the msgbuf of the start and end of the formatted panicstr. */ int doing_shutdown; /* set to indicate shutdown in progress */ #ifdef RND_PRINTF static krndsource_t rnd_printf_source; #endif #ifndef DUMP_ON_PANIC #define DUMP_ON_PANIC 1 #endif int dumponpanic = DUMP_ON_PANIC; /* * v_putc: routine to putc on virtual console * * the v_putc pointer can be used to redirect the console cnputc elsewhere * [e.g. to a "virtual console"]. */ void (*v_putc)(int) = cnputc; /* start with cnputc (normal cons) */ void (*v_flush)(void) = cnflush; /* start with cnflush (normal cons) */ const char hexdigits[] = "0123456789abcdef"; const char HEXDIGITS[] = "0123456789ABCDEF"; /* * functions */ /* * Locking is inited fairly early in MI bootstrap. Before that * prints are done unlocked. But that doesn't really matter, * since nothing can preempt us before interrupts are enabled. */ void kprintf_init(void) { KASSERT(!kprintf_inited); /* not foolproof, but ... */ KASSERT(cold); mutex_init(&kprintf_mtx, MUTEX_DEFAULT, IPL_HIGH); #ifdef RND_PRINTF rnd_attach_source(&rnd_printf_source, "printf", RND_TYPE_UNKNOWN, RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE); #endif kprintf_inited = true; } void kprintf_lock(void) { if (__predict_true(kprintf_inited)) mutex_enter(&kprintf_mtx); } void kprintf_unlock(void) { if (__predict_true(kprintf_inited)) { /* assert kprintf wasn't somehow inited while we were in */ KASSERT(mutex_owned(&kprintf_mtx)); mutex_exit(&kprintf_mtx); } } /* * twiddle: spin a little propellor on the console. */ void twiddle(void) { static const char twiddle_chars[] = "|/-\\"; static int pos; kprintf_lock(); putchar(twiddle_chars[pos++ & 3], TOCONS|NOTSTAMP, NULL); putchar('\b', TOCONS|NOTSTAMP, NULL); kprintf_unlock(); } /* * panic: handle an unresolvable fatal error * * prints "panic: <message>" and reboots. if called twice (i.e. recursive * call) we avoid trying to dump and just reboot (to avoid recursive panics). */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vpanic(fmt, ap); va_end(ap); } void vpanic(const char *fmt, va_list ap) { CPU_INFO_ITERATOR cii; struct cpu_info *ci, *oci; int bootopt; static char scratchstr[384]; /* stores panic message */ spldebug_stop(); if (lwp0.l_cpu && curlwp) { /* * Disable preemption. If already panicking on another CPU, sit * here and spin until the system is rebooted. Allow the CPU that * first panicked to panic again. */ kpreempt_disable(); ci = curcpu(); oci = atomic_cas_ptr((void *)&paniccpu, NULL, ci); if (oci != NULL && oci != ci) { /* Give interrupts a chance to try and prevent deadlock. */ for (;;) { #ifndef _RUMPKERNEL /* XXXpooka: temporary build fix, see kern/40505 */ DELAY(10); #endif /* _RUMPKERNEL */ } } /* * Convert the current thread to a bound thread and prevent all * CPUs from scheduling unbound jobs. Do so without taking any * locks. 
*/ curlwp->l_pflag |= LP_BOUND; for (CPU_INFO_FOREACH(cii, ci)) { ci->ci_schedstate.spc_flags |= SPCF_OFFLINE; } } bootopt = RB_AUTOBOOT | RB_NOSYNC; if (!doing_shutdown) { if (dumponpanic) bootopt |= RB_DUMP; } else printf("Skipping crash dump on recursive panic\n"); doing_shutdown = 1; if (logenabled(msgbufp)) panicstart = msgbufp->msg_bufx; kprintf_lock(); kprintf_internal("panic: ", TOLOG|TOCONS, NULL, NULL); if (panicstr == NULL) { /* first time in panic - store fmt first for precaution */ panicstr = fmt; vsnprintf(scratchstr, sizeof(scratchstr), fmt, ap); kprintf_internal("%s", TOLOG|TOCONS, NULL, NULL, scratchstr); panicstr = scratchstr; } else { kprintf(fmt, TOLOG|TOCONS, NULL, NULL, ap); } kprintf_internal("\n", TOLOG|TOCONS, NULL, NULL); kprintf_unlock(); if (logenabled(msgbufp)) panicend = msgbufp->msg_bufx; #ifdef KGDB kgdb_panic(); #endif #ifdef DDB db_panic(); #endif kern_reboot(bootopt, NULL); } /* * kernel logging functions: log, logpri, addlog */ /* * log: write to the log buffer * * => will not sleep [so safe to call from interrupt] * => will log to console if /dev/klog isn't open */ void log(int level, const char *fmt, ...) { va_list ap; kprintf_lock(); klogpri(level); /* log the level first */ va_start(ap, fmt); kprintf(fmt, TOLOG, NULL, NULL, ap); va_end(ap); if (!log_open) { va_start(ap, fmt); kprintf(fmt, TOCONS, NULL, NULL, ap); va_end(ap); } kprintf_unlock(); logwakeup(); /* wake up anyone waiting for log msgs */ } /* * vlog: write to the log buffer [already have va_list] */ void vlog(int level, const char *fmt, va_list ap) { va_list cap; va_copy(cap, ap); kprintf_lock(); klogpri(level); /* log the level first */ kprintf(fmt, TOLOG, NULL, NULL, ap); if (!log_open) kprintf(fmt, TOCONS, NULL, NULL, cap); kprintf_unlock(); va_end(cap); logwakeup(); /* wake up anyone waiting for log msgs */ } /* * logpri: log the priority level to the klog */ void logpri(int level) { kprintf_lock(); klogpri(level); kprintf_unlock(); } /* * Note: we must be in the mutex here! */ void klogpri(int level) { KASSERT((level & KLOG_PRI) == 0); putchar(level | KLOG_PRI, TOLOG, NULL); } /* * addlog: add info to previous log message */ void addlog(const char *fmt, ...) { va_list ap; kprintf_lock(); va_start(ap, fmt); kprintf(fmt, TOLOG, NULL, NULL, ap); va_end(ap); if (!log_open) { va_start(ap, fmt); kprintf(fmt, TOCONS, NULL, NULL, ap); va_end(ap); } kprintf_unlock(); logwakeup(); } static void putone(int c, int flags, struct tty *tp) { struct tty *ctp; int s; bool do_ps = !cold; ctp = NULL; /* XXX gcc i386 -Os */ /* * Ensure whatever constty points to can't go away while we're * trying to use it. 
*/ if (__predict_true(do_ps)) s = pserialize_read_enter(); if (panicstr) atomic_store_relaxed(&constty, NULL); if ((flags & TOCONS) && (ctp = atomic_load_consume(&constty)) != NULL && tp == NULL) { tp = ctp; flags |= TOTTY; } if ((flags & TOTTY) && tp && tputchar(c, flags, tp) < 0 && (flags & TOCONS)) atomic_cas_ptr(&constty, tp, NULL); if ((flags & TOLOG) && c != '\0' && c != '\r' && c != 0177) logputchar(c); if ((flags & TOCONS) && ctp == NULL && c != '\0') (*v_putc)(c); if (__predict_true(do_ps)) pserialize_read_exit(s); } static void putlogpri(int level) { char *p; char snbuf[KPRINTF_BUFSIZE]; putone('<', TOLOG, NULL); snprintf(snbuf, sizeof(snbuf), "%d", level); for (p = snbuf ; *p ; p++) putone(*p, TOLOG, NULL); putone('>', TOLOG, NULL); } #ifndef KLOG_NOTIMESTAMP static int needtstamp = 1; int log_ts_prec = 7; static void addtstamp(int flags, struct tty *tp) { char buf[64]; struct timespec ts; int n, prec; long fsec; prec = log_ts_prec; if (prec < 0) { prec = 0; log_ts_prec = prec; } else if (prec > 9) { prec = 9; log_ts_prec = prec; } getnanouptime(&ts); for (n = prec, fsec = ts.tv_nsec; n < 8; n++) fsec /= 10; if (n < 9) fsec = (fsec / 10) + ((fsec % 10) >= 5); n = snprintf(buf, sizeof(buf), "[% 4jd.%.*ld] ", (intmax_t)ts.tv_sec, prec, fsec); for (int i = 0; i < n; i++) putone(buf[i], flags, tp); } #endif /* * putchar: print a single character on console or user terminal. * * => if console, then the last MSGBUFS chars are saved in msgbuf * for inspection later (e.g. dmesg/syslog) * => we must already be in the mutex! */ static void putchar(int c, int flags, struct tty *tp) { if (c & KLOG_PRI) { putlogpri(c & ~KLOG_PRI); return; } #ifndef KLOG_NOTIMESTAMP if (c != '\0' && c != '\n' && needtstamp && (flags & NOTSTAMP) == 0) { addtstamp(flags, tp); needtstamp = 0; } if (c == '\n') needtstamp = 1; #endif putone(c, flags, tp); #ifdef DDB if (flags & TODDB) { db_putchar(c); return; } #endif #ifdef RND_PRINTF if (__predict_true(kprintf_inited)) { unsigned char ch = c; rnd_add_data_intr(&rnd_printf_source, &ch, 1, 0); } #endif } /* * tablefull: warn that a system table is full */ void tablefull(const char *tab, const char *hint) { if (hint) log(LOG_ERR, "%s: table is full - %s\n", tab, hint); else log(LOG_ERR, "%s: table is full\n", tab); } /* * uprintf: print to the controlling tty of the current process * * => we may block if the tty queue is full * => no message is printed if the queue doesn't clear in a reasonable * time */ void uprintf(const char *fmt, ...) { struct proc *p = curproc; va_list ap; /* mutex_enter(&proc_lock); XXXSMP */ if (p->p_lflag & PL_CONTROLT && p->p_session->s_ttyvp) { /* No mutex needed; going to process TTY. */ va_start(ap, fmt); kprintf(fmt, TOTTY, p->p_session->s_ttyp, NULL, ap); va_end(ap); } /* mutex_exit(&proc_lock); XXXSMP */ } void uprintf_locked(const char *fmt, ...) { struct proc *p = curproc; va_list ap; if (p->p_lflag & PL_CONTROLT && p->p_session->s_ttyvp) { /* No mutex needed; going to process TTY. 
*/ va_start(ap, fmt); kprintf(fmt, TOTTY, p->p_session->s_ttyp, NULL, ap); va_end(ap); } } /* * tprintf functions: used to send messages to a specific process * * usage: * get a tpr_t handle on a process "p" by using "tprintf_open(p)" * use the handle when calling "tprintf" * when done, do a "tprintf_close" to drop the handle */ /* * tprintf_open: get a tprintf handle on a process "p" * * => returns NULL if process can't be printed to */ tpr_t tprintf_open(struct proc *p) { tpr_t cookie; cookie = NULL; mutex_enter(&proc_lock); if (p->p_lflag & PL_CONTROLT && p->p_session->s_ttyvp) { proc_sesshold(p->p_session); cookie = (tpr_t)p->p_session; } mutex_exit(&proc_lock); return cookie; } /* * tprintf_close: dispose of a tprintf handle obtained with tprintf_open */ void tprintf_close(tpr_t sess) { if (sess) { mutex_enter(&proc_lock); /* Releases proc_lock. */ proc_sessrele((struct session *)sess); } } /* * tprintf: given tprintf handle to a process [obtained with tprintf_open], * send a message to the controlling tty for that process. * * => also sends message to /dev/klog */ void tprintf(tpr_t tpr, const char *fmt, ...) { struct session *sess = (struct session *)tpr; struct tty *tp = NULL; int flags = TOLOG; va_list ap; /* mutex_enter(&proc_lock); XXXSMP */ if (sess && sess->s_ttyvp && ttycheckoutq(sess->s_ttyp)) { flags |= TOTTY; tp = sess->s_ttyp; } kprintf_lock(); klogpri(LOG_INFO); va_start(ap, fmt); kprintf(fmt, flags, tp, NULL, ap); va_end(ap); kprintf_unlock(); /* mutex_exit(&proc_lock); XXXSMP */ logwakeup(); } /* * ttyprintf: send a message to a specific tty * * => should be used only by tty driver or anything that knows the * underlying tty will not be revoked(2)'d away. [otherwise, * use tprintf] */ void ttyprintf(struct tty *tp, const char *fmt, ...) { va_list ap; /* No mutex needed; going to process TTY. */ va_start(ap, fmt); kprintf(fmt, TOTTY, tp, NULL, ap); va_end(ap); } #ifdef DDB /* * db_printf: printf for DDB (via db_putchar) */ void db_printf(const char *fmt, ...) { va_list ap; /* No mutex needed; DDB pauses all processors. */ va_start(ap, fmt); kprintf(fmt, TODDB, NULL, NULL, ap); va_end(ap); if (db_tee_msgbuf) { va_start(ap, fmt); kprintf(fmt, TOLOG, NULL, NULL, ap); va_end(ap); } } void db_vprintf(const char *fmt, va_list ap) { va_list cap; va_copy(cap, ap); /* No mutex needed; DDB pauses all processors. */ kprintf(fmt, TODDB, NULL, NULL, ap); if (db_tee_msgbuf) kprintf(fmt, TOLOG, NULL, NULL, cap); va_end(cap); } #endif /* DDB */ static void kprintf_internal(const char *fmt, int oflags, void *vp, char *sbuf, ...) { va_list ap; va_start(ap, sbuf); (void)kprintf(fmt, oflags, vp, sbuf, ap); va_end(ap); } /* * Device autoconfiguration printf routines. These change their * behavior based on the AB_* flags in boothowto. If AB_SILENT * is set, messages never go to the console (but they still always * go to the log). AB_VERBOSE overrides AB_SILENT. */ /* * aprint_normal: Send to console unless AB_QUIET. Always goes * to the log. */ static void aprint_normal_internal(const char *prefix, const char *fmt, va_list ap) { int flags = TOLOG; if ((boothowto & (AB_SILENT|AB_QUIET)) == 0 || (boothowto & AB_VERBOSE) != 0) flags |= TOCONS; kprintf_lock(); if (prefix) kprintf_internal("%s: ", flags, NULL, NULL, prefix); kprintf(fmt, flags, NULL, NULL, ap); kprintf_unlock(); if (!panicstr) logwakeup(); } void aprint_normal(const char *fmt, ...) { va_list ap; va_start(ap, fmt); aprint_normal_internal(NULL, fmt, ap); va_end(ap); } void aprint_normal_dev(device_t dv, const char *fmt, ...) 
{ va_list ap; KASSERT(dv != NULL); va_start(ap, fmt); aprint_normal_internal(device_xname(dv), fmt, ap); va_end(ap); } void aprint_normal_ifnet(struct ifnet *ifp, const char *fmt, ...) { va_list ap; KASSERT(ifp != NULL); va_start(ap, fmt); aprint_normal_internal(ifp->if_xname, fmt, ap); va_end(ap); } /* * aprint_error: Send to console unless AB_QUIET. Always goes * to the log. Also counts the number of times called so other * parts of the kernel can report the number of errors during a * given phase of system startup. */ static int aprint_error_count; int aprint_get_error_count(void) { int count; kprintf_lock(); count = aprint_error_count; aprint_error_count = 0; kprintf_unlock(); return (count); } static void aprint_error_internal(const char *prefix, const char *fmt, va_list ap) { int flags = TOLOG; if ((boothowto & (AB_SILENT|AB_QUIET)) == 0 || (boothowto & AB_VERBOSE) != 0) flags |= TOCONS; kprintf_lock(); aprint_error_count++; if (prefix) kprintf_internal("%s: ", flags, NULL, NULL, prefix); kprintf_internal("autoconfiguration error: ", TOLOG, NULL, NULL); kprintf(fmt, flags, NULL, NULL, ap); kprintf_unlock(); if (!panicstr) logwakeup(); } void aprint_error(const char *fmt, ...) { va_list ap; va_start(ap, fmt); aprint_error_internal(NULL, fmt, ap); va_end(ap); } void aprint_error_dev(device_t dv, const char *fmt, ...) { va_list ap; KASSERT(dv != NULL); va_start(ap, fmt); aprint_error_internal(device_xname(dv), fmt, ap); va_end(ap); } void aprint_error_ifnet(struct ifnet *ifp, const char *fmt, ...) { va_list ap; KASSERT(ifp != NULL); va_start(ap, fmt); aprint_error_internal(ifp->if_xname, fmt, ap); va_end(ap); } /* * aprint_naive: Send to console only if AB_QUIET. Never goes * to the log. */ static void aprint_naive_internal(const char *prefix, const char *fmt, va_list ap) { if ((boothowto & (AB_QUIET|AB_SILENT|AB_VERBOSE)) != AB_QUIET) return; kprintf_lock(); if (prefix) kprintf_internal("%s: ", TOCONS, NULL, NULL, prefix); kprintf(fmt, TOCONS, NULL, NULL, ap); kprintf_unlock(); } void aprint_naive(const char *fmt, ...) { va_list ap; va_start(ap, fmt); aprint_naive_internal(NULL, fmt, ap); va_end(ap); } void aprint_naive_dev(device_t dv, const char *fmt, ...) { va_list ap; KASSERT(dv != NULL); va_start(ap, fmt); aprint_naive_internal(device_xname(dv), fmt, ap); va_end(ap); } void aprint_naive_ifnet(struct ifnet *ifp, const char *fmt, ...) { va_list ap; KASSERT(ifp != NULL); va_start(ap, fmt); aprint_naive_internal(ifp->if_xname, fmt, ap); va_end(ap); } /* * aprint_verbose: Send to console only if AB_VERBOSE. Always * goes to the log. */ static void aprint_verbose_internal(const char *prefix, const char *fmt, va_list ap) { int flags = TOLOG; if (boothowto & AB_VERBOSE) flags |= TOCONS; kprintf_lock(); if (prefix) kprintf_internal("%s: ", flags, NULL, NULL, prefix); kprintf(fmt, flags, NULL, NULL, ap); kprintf_unlock(); if (!panicstr) logwakeup(); } void aprint_verbose(const char *fmt, ...) { va_list ap; va_start(ap, fmt); aprint_verbose_internal(NULL, fmt, ap); va_end(ap); } void aprint_verbose_dev(device_t dv, const char *fmt, ...) { va_list ap; KASSERT(dv != NULL); va_start(ap, fmt); aprint_verbose_internal(device_xname(dv), fmt, ap); va_end(ap); } void aprint_verbose_ifnet(struct ifnet *ifp, const char *fmt, ...) { va_list ap; KASSERT(ifp != NULL); va_start(ap, fmt); aprint_verbose_internal(ifp->if_xname, fmt, ap); va_end(ap); } /* * aprint_debug: Send to console and log only if AB_DEBUG. 
*/ static void aprint_debug_internal(const char *prefix, const char *fmt, va_list ap) { if ((boothowto & AB_DEBUG) == 0) return; kprintf_lock(); if (prefix) kprintf_internal("%s: ", TOCONS | TOLOG, NULL, NULL, prefix); kprintf(fmt, TOCONS | TOLOG, NULL, NULL, ap); kprintf_unlock(); } void aprint_debug(const char *fmt, ...) { va_list ap; va_start(ap, fmt); aprint_debug_internal(NULL, fmt, ap); va_end(ap); } void aprint_debug_dev(device_t dv, const char *fmt, ...) { va_list ap; KASSERT(dv != NULL); va_start(ap, fmt); aprint_debug_internal(device_xname(dv), fmt, ap); va_end(ap); } void aprint_debug_ifnet(struct ifnet *ifp, const char *fmt, ...) { va_list ap; KASSERT(ifp != NULL); va_start(ap, fmt); aprint_debug_internal(ifp->if_xname, fmt, ap); va_end(ap); } void vprintf_flags(int flags, const char *fmt, va_list ap) { kprintf_lock(); kprintf(fmt, flags, NULL, NULL, ap); kprintf_unlock(); } void printf_flags(int flags, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vprintf_flags(flags, fmt, ap); va_end(ap); } void printf_tolog(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vprintf_flags(TOLOG, fmt, ap); va_end(ap); } /* * printf_nolog: Like printf(), but does not send message to the log. */ void printf_nolog(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vprintf_flags(TOCONS, fmt, ap); va_end(ap); } /* * printf_nostamp: Like printf(), but does not prepend a timestamp. */ void printf_nostamp(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vprintf_flags(TOCONS|NOTSTAMP, fmt, ap); va_end(ap); } /* * normal kernel printf functions: printf, vprintf, snprintf, vsnprintf */ /* * printf: print a message to the console and the log */ void printf(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vprintf_flags(TOCONS | TOLOG, fmt, ap); va_end(ap); } /* * vprintf: print a message to the console and the log [already have * va_list] */ void vprintf(const char *fmt, va_list ap) { vprintf_flags(TOCONS | TOLOG, fmt, ap); if (!panicstr) logwakeup(); } /* * snprintf: print a message to a buffer */ int snprintf(char *bf, size_t size, const char *fmt, ...) { int retval; va_list ap; va_start(ap, fmt); retval = vsnprintf(bf, size, fmt, ap); va_end(ap); return retval; } /* * vsnprintf: print a message to a buffer [already have va_list] */ int vsnprintf(char *bf, size_t size, const char *fmt, va_list ap) { int retval; char *p; p = bf + size; retval = kprintf(fmt, TOBUFONLY, &p, bf, ap); if (bf && size > 0) { /* nul terminate */ if (size <= (size_t)retval) bf[size - 1] = '\0'; else bf[retval] = '\0'; } return retval; } int vasprintf(char **bf, const char *fmt, va_list ap) { int retval; va_list cap; va_copy(cap, ap); retval = kprintf(fmt, TOBUFONLY, NULL, NULL, cap) + 1; va_end(cap); *bf = kmem_alloc(retval, KM_SLEEP); return vsnprintf(*bf, retval, fmt, ap); } /* * kprintf: scaled down version of printf(3). * * this version based on vfprintf() from libc which was derived from * software contributed to Berkeley by Chris Torek. * * NOTE: The kprintf mutex must be held if we're going TOBUF or TOCONS! */ /* * macros for converting digits to letters and vice versa */ #define to_digit(c) ((c) - '0') #define is_digit(c) ((unsigned)to_digit(c) <= 9) #define to_char(n) ((n) + '0') /* * flags used during conversion. 
*/ #define ALT 0x001 /* alternate form */ #define HEXPREFIX 0x002 /* add 0x or 0X prefix */ #define LADJUST 0x004 /* left adjustment */ #define LONGDBL 0x008 /* long double; unimplemented */ #define LONGINT 0x010 /* long integer */ #define QUADINT 0x020 /* quad integer */ #define SHORTINT 0x040 /* short integer */ #define MAXINT 0x080 /* intmax_t */ #define PTRINT 0x100 /* intptr_t */ #define SIZEINT 0x200 /* size_t */ #define ZEROPAD 0x400 /* zero (as opposed to blank) pad */ #define FPT 0x800 /* Floating point number */ /* * To extend shorts properly, we need both signed and unsigned * argument extraction methods. */ #define SARG() \ (flags&MAXINT ? va_arg(ap, intmax_t) : \ flags&PTRINT ? va_arg(ap, intptr_t) : \ flags&SIZEINT ? va_arg(ap, ssize_t) : /* XXX */ \ flags&QUADINT ? va_arg(ap, quad_t) : \ flags&LONGINT ? va_arg(ap, long) : \ flags&SHORTINT ? (long)(short)va_arg(ap, int) : \ (long)va_arg(ap, int)) #define UARG() \ (flags&MAXINT ? va_arg(ap, uintmax_t) : \ flags&PTRINT ? va_arg(ap, uintptr_t) : \ flags&SIZEINT ? va_arg(ap, size_t) : \ flags&QUADINT ? va_arg(ap, u_quad_t) : \ flags&LONGINT ? va_arg(ap, u_long) : \ flags&SHORTINT ? (u_long)(u_short)va_arg(ap, int) : \ (u_long)va_arg(ap, u_int)) #define KPRINTF_PUTCHAR(C) { \ if (oflags == TOBUFONLY) { \ if (sbuf && ((vp == NULL) || (sbuf < tailp))) \ *sbuf++ = (C); \ } else { \ putchar((C), oflags, vp); \ } \ } void device_printf(device_t dev, const char *fmt, ...) { va_list ap; kprintf_lock(); kprintf_internal("%s: ", TOCONS|TOLOG, NULL, NULL, device_xname(dev)); va_start(ap, fmt); kprintf(fmt, TOCONS|TOLOG, NULL, NULL, ap); va_end(ap); kprintf_unlock(); } /* * Guts of kernel printf. Note, we already expect to be in a mutex! */ int kprintf(const char *fmt0, int oflags, void *vp, char *sbuf, va_list ap) { const char *fmt; /* format string */ int ch; /* character from fmt */ int n; /* handy integer (short term usage) */ char *cp; /* handy char pointer (short term usage) */ int flags; /* flags as above */ int ret; /* return value accumulator */ int width; /* width from format (%8d), or 0 */ int prec; /* precision from format (%.3d), or -1 */ char sign; /* sign prefix (' ', '+', '-', or \0) */ u_quad_t _uquad; /* integer arguments %[diouxX] */ enum { OCT, DEC, HEX } base;/* base for [diouxX] conversion */ int dprec; /* a copy of prec if [diouxX], 0 otherwise */ int realsz; /* field size expanded by dprec */ int size; /* size of converted field or string */ const char *xdigs; /* digits for [xX] conversion */ char bf[KPRINTF_BUFSIZE]; /* space for %c, %[diouxX] */ char *tailp; /* tail pointer for snprintf */ if (oflags == TOBUFONLY && (vp != NULL)) tailp = *(char **)vp; else tailp = NULL; cp = NULL; /* XXX: shutup gcc */ size = 0; /* XXX: shutup gcc */ fmt = fmt0; ret = 0; xdigs = NULL; /* XXX: shut up gcc warning */ /* * Scan the format for conversions (`%' character). */ for (;;) { for (; *fmt != '%' && *fmt; fmt++) { ret++; KPRINTF_PUTCHAR(*fmt); } if (*fmt == 0) goto done; fmt++; /* skip over '%' */ flags = 0; dprec = 0; width = 0; prec = -1; sign = '\0'; rflag: ch = *fmt++; reswitch: switch (ch) { case ' ': /* * ``If the space and + flags both appear, the space * flag will be ignored.'' * -- ANSI X3J11 */ if (!sign) sign = ' '; goto rflag; case '#': flags |= ALT; goto rflag; case '*': /* * ``A negative field width argument is taken as a * - flag followed by a positive field width.'' * -- ANSI X3J11 * They don't exclude field widths read from args. 
*/ if ((width = va_arg(ap, int)) >= 0) goto rflag; width = -width; /* FALLTHROUGH */ case '-': flags |= LADJUST; goto rflag; case '+': sign = '+'; goto rflag; case '.': if ((ch = *fmt++) == '*') { n = va_arg(ap, int); prec = n < 0 ? -1 : n; goto rflag; } n = 0; while (is_digit(ch)) { n = 10 * n + to_digit(ch); ch = *fmt++; } prec = n < 0 ? -1 : n; goto reswitch; case '0': /* * ``Note that 0 is taken as a flag, not as the * beginning of a field width.'' * -- ANSI X3J11 */ flags |= ZEROPAD; goto rflag; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': n = 0; do { n = 10 * n + to_digit(ch); ch = *fmt++; } while (is_digit(ch)); width = n; goto reswitch; case 'h': flags |= SHORTINT; goto rflag; case 'j': flags |= MAXINT; goto rflag; case 'l': if (*fmt == 'l') { fmt++; flags |= QUADINT; } else { flags |= LONGINT; } goto rflag; case 'q': flags |= QUADINT; goto rflag; case 't': flags |= PTRINT; goto rflag; case 'z': flags |= SIZEINT; goto rflag; case 'c': *(cp = bf) = va_arg(ap, int); size = 1; sign = '\0'; break; case 'D': flags |= LONGINT; /*FALLTHROUGH*/ case 'd': case 'i': _uquad = SARG(); if ((quad_t)_uquad < 0) { _uquad = -_uquad; sign = '-'; } base = DEC; goto number; case 'n': /* no %n support in the kernel, consume and skip */ if (flags & MAXINT) (void)va_arg(ap, intmax_t *); else if (flags & PTRINT) (void)va_arg(ap, intptr_t *); else if (flags & SIZEINT) (void)va_arg(ap, ssize_t *); else if (flags & QUADINT) (void)va_arg(ap, quad_t *); else if (flags & LONGINT) (void)va_arg(ap, long *); else if (flags & SHORTINT) (void)va_arg(ap, short *); else (void)va_arg(ap, int *); continue; /* no output */ case 'O': flags |= LONGINT; /*FALLTHROUGH*/ case 'o': _uquad = UARG(); base = OCT; goto nosign; case 'p': /* * ``The argument shall be a pointer to void. The * value of the pointer is converted to a sequence * of printable characters, in an implementation- * defined manner.'' * -- ANSI X3J11 */ /* NOSTRICT */ _uquad = (u_long)va_arg(ap, void *); base = HEX; xdigs = hexdigits; flags |= HEXPREFIX; ch = 'x'; goto nosign; case 's': if ((cp = va_arg(ap, char *)) == NULL) /*XXXUNCONST*/ cp = __UNCONST("(null)"); if (prec >= 0) { /* * can't use strlen; can only look for the * NUL in the first `prec' characters, and * strlen() will go further. */ char *p = memchr(cp, 0, prec); if (p != NULL) { size = p - cp; if (size > prec) size = prec; } else size = prec; } else size = strlen(cp); sign = '\0'; break; case 'U': flags |= LONGINT; /*FALLTHROUGH*/ case 'u': _uquad = UARG(); base = DEC; goto nosign; case 'X': xdigs = HEXDIGITS; goto hex; case 'x': xdigs = hexdigits; hex: _uquad = UARG(); base = HEX; /* leading 0x/X only if non-zero */ if (flags & ALT && _uquad != 0) flags |= HEXPREFIX; /* unsigned conversions */ nosign: sign = '\0'; /* * ``... diouXx conversions ... if a precision is * specified, the 0 flag will be ignored.'' * -- ANSI X3J11 */ number: if ((dprec = prec) >= 0) flags &= ~ZEROPAD; /* * ``The result of converting a zero value with an * explicit precision of zero is no characters.'' * -- ANSI X3J11 */ cp = bf + KPRINTF_BUFSIZE; if (_uquad != 0 || prec != 0) { /* * Unsigned mod is hard, and unsigned mod * by a constant is easier than that by * a variable; hence this switch. 
*/ switch (base) { case OCT: do { *--cp = to_char(_uquad & 7); _uquad >>= 3; } while (_uquad); /* handle octal leading 0 */ if (flags & ALT && *cp != '0') *--cp = '0'; break; case DEC: /* many numbers are 1 digit */ while (_uquad >= 10) { *--cp = to_char(_uquad % 10); _uquad /= 10; } *--cp = to_char(_uquad); break; case HEX: do { *--cp = xdigs[_uquad & 15]; _uquad >>= 4; } while (_uquad); break; default: /*XXXUNCONST*/ cp = __UNCONST("bug in kprintf: bad base"); size = strlen(cp); goto skipsize; } } size = bf + KPRINTF_BUFSIZE - cp; skipsize: break; default: /* "%?" prints ?, unless ? is NUL */ if (ch == '\0') goto done; /* pretend it was %c with argument ch */ cp = bf; *cp = ch; size = 1; sign = '\0'; break; } /* * All reasonable formats wind up here. At this point, `cp' * points to a string which (if not flags&LADJUST) should be * padded out to `width' places. If flags&ZEROPAD, it should * first be prefixed by any sign or other prefix; otherwise, * it should be blank padded before the prefix is emitted. * After any left-hand padding and prefixing, emit zeroes * required by a decimal [diouxX] precision, then print the * string proper, then emit zeroes required by any leftover * floating precision; finally, if LADJUST, pad with blanks. * * Compute actual size, so we know how much to pad. * size excludes decimal prec; realsz includes it. */ realsz = dprec > size ? dprec : size; if (sign) realsz++; else if (flags & HEXPREFIX) realsz+= 2; /* adjust ret */ ret += width > realsz ? width : realsz; /* right-adjusting blank padding */ if ((flags & (LADJUST|ZEROPAD)) == 0) { n = width - realsz; while (n-- > 0) KPRINTF_PUTCHAR(' '); } /* prefix */ if (sign) { KPRINTF_PUTCHAR(sign); } else if (flags & HEXPREFIX) { KPRINTF_PUTCHAR('0'); KPRINTF_PUTCHAR(ch); } /* right-adjusting zero padding */ if ((flags & (LADJUST|ZEROPAD)) == ZEROPAD) { n = width - realsz; while (n-- > 0) KPRINTF_PUTCHAR('0'); } /* leading zeroes from decimal precision */ n = dprec - size; while (n-- > 0) KPRINTF_PUTCHAR('0'); /* the string or number proper */ for (; size--; cp++) KPRINTF_PUTCHAR(*cp); /* left-adjusting padding (always blank) */ if (flags & LADJUST) { n = width - realsz; while (n-- > 0) KPRINTF_PUTCHAR(' '); } } done: if ((oflags == TOBUFONLY) && (vp != NULL)) *(char **)vp = sbuf; (*v_flush)(); #ifdef RND_PRINTF if (__predict_true(kprintf_inited)) rnd_add_data_intr(&rnd_printf_source, NULL, 0, 0); #endif return ret; }
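/*
 * Illustrative userland sketch (not part of subr_prf.c): the same
 * measure-then-allocate pattern the kernel vasprintf() above uses,
 * expressed with the standard C vsnprintf().  A first pass with a NULL
 * buffer learns the formatted length, then the buffer is allocated and
 * the string is formatted for real.  The my_* names are made up for
 * this example; unlike the kernel version, whose KM_SLEEP allocation
 * cannot fail, the userland malloc() is checked.
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static int
my_vasprintf(char **bf, const char *fmt, va_list ap)
{
	va_list cap;
	int len;

	va_copy(cap, ap);
	len = vsnprintf(NULL, 0, fmt, cap) + 1;	/* measuring pass */
	va_end(cap);

	*bf = malloc(len);
	if (*bf == NULL)
		return -1;
	return vsnprintf(*bf, len, fmt, ap);	/* formatting pass */
}

static int
my_asprintf(char **bf, const char *fmt, ...)
{
	va_list ap;
	int rv;

	va_start(ap, fmt);
	rv = my_vasprintf(bf, fmt, ap);
	va_end(ap);
	return rv;
}

int
main(void)
{
	char *msg;

	if (my_asprintf(&msg, "pid %d: %s", 42, "hello") >= 0) {
		printf("%s\n", msg);	/* prints "pid 42: hello" */
		free(msg);
	}
	return 0;
}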
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 /* $NetBSD: bufq_impl.h,v 1.10 2016/11/16 00:46:46 pgoyette Exp $ */ /* NetBSD: bufq.h,v 1.3 2005/03/31 11:28:53 yamt Exp */ /* NetBSD: buf.h,v 1.75 2004/09/18 16:40:11 yamt Exp */ /*- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 */ #if !defined(_KERNEL) #error not supposed to be exposed to userland. #endif struct bufq_strat; /* * Device driver buffer queue. */ struct bufq_state { void (*bq_put)(struct bufq_state *, struct buf *); struct buf *(*bq_get)(struct bufq_state *, int); struct buf *(*bq_cancel)(struct bufq_state *, struct buf *); void (*bq_fini)(struct bufq_state *); void *bq_private; int bq_flags; /* Flags from bufq_alloc() */ struct bufq_strat *bq_strat; }; static __inline void *bufq_private(const struct bufq_state *) __unused; static __inline bool buf_inorder(const struct buf *, const struct buf *, int) __unused; #include <sys/null.h> /* for NULL */ static __inline void * bufq_private(const struct bufq_state *bufq) { return bufq->bq_private; } /* * Check if two buf's are in ascending order. * * this function consider a NULL buf is after any non-NULL buf. * * this function returns false if two are "same". */ static __inline bool buf_inorder(const struct buf *bp, const struct buf *bq, int sortby) { KASSERT(bp != NULL || bq != NULL); if (bp == NULL || bq == NULL) return (bq == NULL); if (sortby == BUFQ_SORT_CYLINDER) { if (bp->b_cylinder != bq->b_cylinder) return bp->b_cylinder < bq->b_cylinder; else return bp->b_rawblkno < bq->b_rawblkno; } else return bp->b_rawblkno < bq->b_rawblkno; } struct bufq_strat { const char *bs_name; void (*bs_initfn)(struct bufq_state *); int bs_prio; int bs_refcnt; SLIST_ENTRY(bufq_strat) bs_next; }; #define BUFQ_DEFINE(name, prio, initfn) \ static struct bufq_strat bufq_strat_##name = { \ .bs_name = #name, \ .bs_prio = prio, \ .bs_initfn = initfn, \ .bs_refcnt = 0 \ }; int bufq_register(struct bufq_strat *); int bufq_unregister(struct bufq_strat *);
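/*
 * Illustrative sketch of a trivial first-come-first-served disk sort
 * strategy wired into the hooks declared in struct bufq_state above and
 * announced with BUFQ_DEFINE().  This is not the in-tree fcfs strategy,
 * only an example of the interface shape; it assumes the usual NetBSD
 * struct buf driver-queue entry b_actq and kmem(9) allocation.
 */
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/queue.h>

struct bufq_fifo {
	TAILQ_HEAD(, buf) bf_head;
};

static void
bufq_fifo_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_fifo *fifo = bufq_private(bufq);

	TAILQ_INSERT_TAIL(&fifo->bf_head, bp, b_actq);
}

static struct buf *
bufq_fifo_get(struct bufq_state *bufq, int remove)
{
	struct bufq_fifo *fifo = bufq_private(bufq);
	struct buf *bp = TAILQ_FIRST(&fifo->bf_head);

	if (bp != NULL && remove)
		TAILQ_REMOVE(&fifo->bf_head, bp, b_actq);
	return bp;
}

static struct buf *
bufq_fifo_cancel(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_fifo *fifo = bufq_private(bufq);
	struct buf *bq;

	TAILQ_FOREACH(bq, &fifo->bf_head, b_actq) {
		if (bq == bp) {
			TAILQ_REMOVE(&fifo->bf_head, bp, b_actq);
			return bp;
		}
	}
	return NULL;
}

static void
bufq_fifo_fini(struct bufq_state *bufq)
{
	kmem_free(bufq->bq_private, sizeof(struct bufq_fifo));
}

static void
bufq_fifo_init(struct bufq_state *bufq)
{
	struct bufq_fifo *fifo;

	fifo = kmem_zalloc(sizeof(*fifo), KM_SLEEP);
	TAILQ_INIT(&fifo->bf_head);
	bufq->bq_put = bufq_fifo_put;
	bufq->bq_get = bufq_fifo_get;
	bufq->bq_cancel = bufq_fifo_cancel;
	bufq->bq_fini = bufq_fifo_fini;
	bufq->bq_private = fifo;
}

/* A loadable strategy would pass &bufq_strat_fifo to bufq_register(). */
BUFQ_DEFINE(fifo, 10, bufq_fifo_init);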
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 /* $NetBSD: in6_print.c,v 1.1 2014/12/02 19:36:58 christos Exp $ */ /*- * Copyright (c) 2014 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include <sys/cdefs.h> #include <sys/types.h> #ifdef _KERNEL __KERNEL_RCSID(0, "$NetBSD: in6_print.c,v 1.1 2014/12/02 19:36:58 christos Exp $"); #include <sys/systm.h> #else __RCSID("$NetBSD: in6_print.c,v 1.1 2014/12/02 19:36:58 christos Exp $"); #include <stdio.h> #define s6_addr32 __u6_addr.__u6_addr32 static const uint8_t hexdigits[] = "0123456789abcdef"; #endif #include <netinet/in.h> int in6_print(char *buf, size_t len, const struct in6_addr *ia6) { int i; char *bp; char *cp, *ecp; const uint16_t *a; const uint8_t *d; int dcolon = 0; if (IN6_IS_ADDR_V4MAPPED(ia6)) { char buf4[INET_ADDRSTRLEN]; struct in_addr ia = { .s_addr = ia6->s6_addr32[3] }; in_print(buf4, sizeof(buf4), &ia); return snprintf(buf, len, "::ffff:%s", buf4); } #define ADDC(c) do { \ if (cp >= ecp) {\ cp++; \ } else \ *cp++ = (char)(c); \ } while (/*CONSTCOND*/0) #define ADDX(v) do { \ uint8_t n = hexdigits[(v)]; \ ADDC(n); \ if (cp == bp && n == '0') \ cp--; \ } while (/*CONSTCOND*/0) cp = buf; ecp = buf + len; a = (const uint16_t *)ia6; for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) ADDC(':'); a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0) { if (i == 0) ADDC(':'); ADDC(':'); dcolon = 1; } else { ADDC('0'); ADDC(':'); } a++; continue; } d = (const u_char *)a; bp = cp + 1; ADDX((u_int)*d >> 4); ADDX(*d & 0xf); d++; ADDX((u_int)*d >> 4); ADDX(*d & 0xf); ADDC(':'); a++; } if (cp > buf) --cp; if (ecp > buf) { if (cp < ecp) *cp = '\0'; else *--ecp = '\0'; } return (int)(cp - buf); } int sin6_print(char *buf, size_t len, const void *v) { const struct sockaddr_in6 *sin6 = v; const struct in6_addr *ia6 = &sin6->sin6_addr; char abuf[INET6_ADDRSTRLEN]; if (!sin6->sin6_port) return in6_print(buf, len, ia6); in6_print(abuf, sizeof(abuf), ia6); return snprintf(buf, len, "[%s]:%hu", abuf, ntohs(sin6->sin6_port)); }
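/*
 * Userland usage sketch for the printers above (the file compiles for
 * userland as well as the kernel).  It assumes the program is linked
 * against that userland build, so the prototypes are simply repeated
 * here rather than taken from an installed header.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int in6_print(char *, size_t, const struct in6_addr *);
int sin6_print(char *, size_t, const void *);

int
main(void)
{
	char abuf[INET6_ADDRSTRLEN];
	char sbuf[INET6_ADDRSTRLEN + 8];	/* room for "[...]:port" */
	struct sockaddr_in6 sin6;

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_addr.s6_addr[15] = 1;		/* ::1 */
	sin6.sin6_port = htons(22);

	in6_print(abuf, sizeof(abuf), &sin6.sin6_addr);
	printf("address: %s\n", abuf);		/* "::1" */

	sin6_print(sbuf, sizeof(sbuf), &sin6);
	printf("socket:  %s\n", sbuf);		/* "[::1]:22" */
	return 0;
}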
2 1 1 1 1 1 6 1 5 1 1 2 2 2 3 1 2 2 2 2 32 32 3 18 18 18 1 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 /* $NetBSD: ptyfs_vfsops.c,v 1.58 2020/03/16 21:20:10 pgoyette Exp $ */ /* * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Pseudo-tty Filesystem */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ptyfs_vfsops.c,v 1.58 2020/03/16 21:20:10 pgoyette Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/conf.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/stat.h> #include <sys/dirent.h> #include <sys/malloc.h> #include <sys/syslog.h> #include <sys/select.h> #include <sys/filedesc.h> #include <sys/tty.h> #include <sys/pty.h> #include <sys/kauth.h> #include <sys/module.h> #include <fs/ptyfs/ptyfs.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> MODULE(MODULE_CLASS_VFS, ptyfs, NULL); MALLOC_JUSTDEFINE(M_PTYFSMNT, "ptyfs mount", "ptyfs mount structures"); MALLOC_JUSTDEFINE(M_PTYFSTMP, "ptyfs temp", "ptyfs temporary structures"); VFS_PROTOS(ptyfs); static int ptyfs__allocvp(struct mount *, struct lwp *, struct vnode **, dev_t, char); static int ptyfs__makename(struct mount *, struct lwp *, char *, size_t, dev_t, char); static void ptyfs__getvattr(struct mount *, struct lwp *, struct vattr *); static int ptyfs__getmp(struct lwp *, struct mount **); /* * ptm glue: When we mount, we make ptm point to us. */ struct ptm_pty *ptyfs_save_ptm; static int ptyfs_count; static TAILQ_HEAD(, ptyfsmount) ptyfs_head; struct ptm_pty ptm_ptyfspty = { ptyfs__allocvp, ptyfs__makename, ptyfs__getvattr, ptyfs__getmp, }; static int ptyfs__getmp(struct lwp *l, struct mount **mpp) { struct cwdinfo *cwdi = l->l_proc->p_cwdi; struct mount *mp; struct ptyfsmount *pmnt; TAILQ_FOREACH(pmnt, &ptyfs_head, pmnt_le) { mp = pmnt->pmnt_mp; if (cwdi->cwdi_rdir == NULL) goto ok; if (vn_isunder(mp->mnt_vnodecovered, cwdi->cwdi_rdir, l)) goto ok; } *mpp = NULL; return EOPNOTSUPP; ok: *mpp = mp; return 0; } static const char * ptyfs__getpath(struct lwp *l, const struct mount *mp) { #define MAXBUF (sizeof(mp->mnt_stat.f_mntonname) + 32) struct cwdinfo *cwdi = l->l_proc->p_cwdi; char *buf; const char *rv; size_t len; char *bp; int error; rv = mp->mnt_stat.f_mntonname; if (cwdi->cwdi_rdir == NULL) return rv; buf = malloc(MAXBUF, M_TEMP, M_WAITOK); bp = buf + MAXBUF; *--bp = '\0'; error = getcwd_common(mp->mnt_vnodecovered, cwdi->cwdi_rdir, &bp, buf, MAXBUF / 2, 0, l); if (error) { /* Mount point is out of rdir */ rv = NULL; goto out; } len = strlen(bp); if (len < sizeof(mp->mnt_stat.f_mntonname)) /* XXX */ rv += strlen(rv) - len; out: free(buf, M_TEMP); return rv; } static int ptyfs__makename(struct mount *mp, struct lwp *l, char *tbuf, size_t bufsiz, dev_t dev, char ms) { size_t len; const char *np; int pty = minor(dev); switch (ms) { case 'p': /* We don't provide access to the master, should we? */ len = snprintf(tbuf, bufsiz, "/dev/null"); break; case 't': /* * We support traditional ptys, so we can get here, * if pty had been opened before PTYFS was mounted, * or was opened through /dev/ptyXX devices. * Return it only outside chroot for more security . 
*/ if (l->l_proc->p_cwdi->cwdi_rdir == NULL && ptyfs_save_ptm != NULL && ptyfs_next_active(mp, pty) != pty) return (*ptyfs_save_ptm->makename)(mp, l, tbuf, bufsiz, dev, ms); np = ptyfs__getpath(l, mp); if (np == NULL) return EOPNOTSUPP; len = snprintf(tbuf, bufsiz, "%s/%llu", np, (unsigned long long)minor(dev)); break; default: return EINVAL; } return len >= bufsiz ? ENOSPC : 0; } static int /*ARGSUSED*/ ptyfs__allocvp(struct mount *mp, struct lwp *l, struct vnode **vpp, dev_t dev, char ms) { int error; ptyfstype type; switch (ms) { case 'p': type = PTYFSptc; break; case 't': type = PTYFSpts; break; default: return EINVAL; } error = ptyfs_allocvp(mp, vpp, type, minor(dev)); if (error) return error; error = vn_lock(*vpp, LK_EXCLUSIVE); if (error) { vrele(*vpp); *vpp = NULL; return error; } if (type == PTYFSptc) ptyfs_set_active(mp, minor(dev)); return 0; } static void ptyfs__getvattr(struct mount *mp, struct lwp *l, struct vattr *vattr) { struct ptyfsmount *pmnt = VFSTOPTY(mp); vattr_null(vattr); /* get real uid */ vattr->va_uid = kauth_cred_getuid(l->l_cred); vattr->va_gid = pmnt->pmnt_gid; vattr->va_mode = pmnt->pmnt_mode; } void ptyfs_init(void) { TAILQ_INIT(&ptyfs_head); malloc_type_attach(M_PTYFSMNT); malloc_type_attach(M_PTYFSTMP); ptyfs_hashinit(); } void ptyfs_reinit(void) { } void ptyfs_done(void) { ptyfs_hashdone(); malloc_type_detach(M_PTYFSTMP); malloc_type_detach(M_PTYFSMNT); } #define OSIZE sizeof(struct { int f; gid_t g; mode_t m; }) /* * Mount the Pseudo tty params filesystem */ int ptyfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; int error = 0; struct ptyfsmount *pmnt; struct ptyfs_args *args = data; if (args == NULL) return EINVAL; if (*data_len != sizeof *args) { if (*data_len != OSIZE || args->version >= PTYFS_ARGSVERSION) return EINVAL; } if (UIO_MX & (UIO_MX - 1)) { log(LOG_ERR, "ptyfs: invalid directory entry size"); return EINVAL; } if (mp->mnt_flag & MNT_GETARGS) { pmnt = VFSTOPTY(mp); if (pmnt == NULL) return EIO; args->mode = pmnt->pmnt_mode; args->gid = pmnt->pmnt_gid; if (args->version >= PTYFS_ARGSVERSION) { args->flags = pmnt->pmnt_flags; *data_len = sizeof *args; } else { *data_len = OSIZE; } return 0; } #if 0 /* Don't allow more than one mount */ if (ptyfs_count) return EBUSY; #endif if (mp->mnt_flag & MNT_UPDATE) return EOPNOTSUPP; if (args->version > PTYFS_ARGSVERSION) return EINVAL; pmnt = malloc(sizeof(struct ptyfsmount), M_PTYFSMNT, M_WAITOK); mp->mnt_data = pmnt; mutex_init(&pmnt->pmnt_lock, MUTEX_DEFAULT, IPL_NONE); pmnt->pmnt_gid = args->gid; pmnt->pmnt_mode = args->mode; if (args->version >= PTYFS_ARGSVERSION) pmnt->pmnt_flags = args->flags; else pmnt->pmnt_flags = 0; pmnt->pmnt_bitmap_size = 0; pmnt->pmnt_bitmap = NULL; mp->mnt_flag |= MNT_LOCAL; vfs_getnewfsid(mp); if ((error = set_statvfs_info(path, UIO_USERSPACE, "ptyfs", UIO_SYSSPACE, mp->mnt_op->vfs_name, mp, l)) != 0) { free(pmnt, M_PTYFSMNT); return error; } pmnt->pmnt_mp = mp; TAILQ_INSERT_TAIL(&ptyfs_head, pmnt, pmnt_le); if (ptyfs_count++ == 0) { /* Point pty access to us */ ptyfs_save_ptm = pty_sethandler(&ptm_ptyfspty); } return 0; } /*ARGSUSED*/ int ptyfs_start(struct mount *mp, int flags) { return 0; } /*ARGSUSED*/ int ptyfs_unmount(struct mount *mp, int mntflags) { int error; int flags = 0; struct ptyfsmount *pmnt; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if ((error = vflush(mp, 0, flags)) != 0) return error; ptyfs_count--; if (ptyfs_count == 0) { /* Restore where pty access was pointing */ 
(void)pty_sethandler(ptyfs_save_ptm); ptyfs_save_ptm = NULL; } TAILQ_FOREACH(pmnt, &ptyfs_head, pmnt_le) { if (pmnt->pmnt_mp == mp) { TAILQ_REMOVE(&ptyfs_head, pmnt, pmnt_le); break; } } /* * Finally, throw away the ptyfsmount structure */ if (pmnt->pmnt_bitmap_size > 0) kmem_free(pmnt->pmnt_bitmap, pmnt->pmnt_bitmap_size); mutex_destroy(&pmnt->pmnt_lock); free(mp->mnt_data, M_PTYFSMNT); mp->mnt_data = NULL; return 0; } int ptyfs_root(struct mount *mp, int lktype, struct vnode **vpp) { int error; /* setup "." */ error = ptyfs_allocvp(mp, vpp, PTYFSroot, 0); if (error) return error; error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULL; return error; } return 0; } /*ARGSUSED*/ int ptyfs_sync(struct mount *mp, int waitfor, kauth_cred_t uc) { return 0; } /* * Initialize this vnode / ptynode pair. * Only for the slave side of a pty, caller assures * no other thread will try to load this node. */ int ptyfs_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { struct ptyfskey pkey; struct ptyfsnode *ptyfs; KASSERT(key_len == sizeof(pkey)); memcpy(&pkey, key, key_len); ptyfs = ptyfs_get_node(pkey.ptk_type, pkey.ptk_pty); KASSERT(memcmp(&ptyfs->ptyfs_key, &pkey, sizeof(pkey)) == 0); switch (pkey.ptk_type) { case PTYFSroot: /* /pts = dr-xr-xr-x */ vp->v_type = VDIR; vp->v_vflag = VV_ROOT; break; case PTYFSpts: /* /pts/N = cxxxxxxxxx */ case PTYFSptc: /* controlling side = cxxxxxxxxx */ vp->v_type = VCHR; spec_node_init(vp, PTYFS_MAKEDEV(ptyfs)); break; default: panic("ptyfs_loadvnode"); } vp->v_tag = VT_PTYFS; vp->v_op = ptyfs_vnodeop_p; vp->v_data = ptyfs; uvm_vnp_setsize(vp, 0); *new_key = &ptyfs->ptyfs_key; return 0; } /* * Kernfs flat namespace lookup. * Currently unsupported. */ /*ARGSUSED*/ int ptyfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { return EOPNOTSUPP; } extern const struct vnodeopv_desc ptyfs_vnodeop_opv_desc; const struct vnodeopv_desc * const ptyfs_vnodeopv_descs[] = { &ptyfs_vnodeop_opv_desc, NULL, }; struct vfsops ptyfs_vfsops = { .vfs_name = MOUNT_PTYFS, .vfs_min_mount_data = sizeof (struct ptyfs_args), .vfs_mount = ptyfs_mount, .vfs_start = ptyfs_start, .vfs_unmount = ptyfs_unmount, .vfs_root = ptyfs_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = genfs_statvfs, .vfs_sync = ptyfs_sync, .vfs_vget = ptyfs_vget, .vfs_loadvnode = ptyfs_loadvnode, .vfs_fhtovp = (void *)eopnotsupp, .vfs_vptofh = (void *)eopnotsupp, .vfs_init = ptyfs_init, .vfs_reinit = ptyfs_reinit, .vfs_done = ptyfs_done, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = (void *)eopnotsupp, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = ptyfs_vnodeopv_descs }; SYSCTL_SETUP(ptyfs_sysctl_setup, "ptyfs sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ptyfs", SYSCTL_DESCR("Pty file system"), NULL, 0, NULL, 0, CTL_VFS, 23, CTL_EOL); /* * XXX the "23" above could be dynamic, thereby eliminating * one more instance of the "number to vfs" mapping problem, * but "23" is the order as taken from sys/mount.h */ } static int ptyfs_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&ptyfs_vfsops); if (error != 0) break; break; case MODULE_CMD_FINI: error = vfs_detach(&ptyfs_vfsops); if (error != 0) break; break; default: error = ENOTTY; break; } return (error); }
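/*
 * Userland usage sketch: mounting ptyfs directly with mount(2).  The
 * argument fields (version/gid/mode/flags) are inferred from the
 * ptyfs_mount() handling above, the header path and MOUNT_PTYFS
 * constant follow the usual NetBSD layout, and the mount point and gid
 * are only examples; in practice mount_ptyfs(8) does this for you.
 */
#include <sys/mount.h>
#include <fs/ptyfs/ptyfs.h>
#include <err.h>

int
main(void)
{
	struct ptyfs_args args = {
		.version = PTYFS_ARGSVERSION,
		.gid = 4,		/* e.g. the "tty" group */
		.mode = 0620,		/* rw for owner, w for group */
		.flags = 0,
	};

	if (mount(MOUNT_PTYFS, "/dev/pts", 0, &args, sizeof(args)) == -1)
		err(1, "mount ptyfs on /dev/pts");
	return 0;
}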
1 3 8 1823 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 /* $NetBSD: kern_stub.c,v 1.50 2020/08/01 02:04:55 riastradh Exp $ */ /*- * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_xxx.c 8.3 (Berkeley) 3/29/95 */ /* * Stubs for system calls and facilities not included in the system. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_stub.c,v 1.50 2020/08/01 02:04:55 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_ktrace.h" #include "opt_sysv.h" #include "opt_modular.h" #endif #include <sys/param.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/fstypes.h> #include <sys/signalvar.h> #include <sys/syscall.h> #include <sys/ktrace.h> #include <sys/intr.h> #include <sys/cpu.h> #include <sys/module.h> #include <sys/bus.h> #include <sys/userconf.h> bool default_bus_space_is_equal(bus_space_tag_t, bus_space_tag_t); bool default_bus_space_handle_is_equal(bus_space_tag_t, bus_space_handle_t, bus_space_handle_t); /* * SYSV Semaphores, Shared Memory, Message Queues */ #ifndef MODULAR #ifndef SYSVMSG __strong_alias(msgctl1,enosys); #endif #ifndef SYSVSHM __strong_alias(shmctl1,enosys); #endif #ifndef SYSVSEM __strong_alias(semctl1,enosys); #endif #endif /* * ktrace stubs. ktruser() goes to enosys as we want to fail the syscall, * but not kill the process: utrace() is a debugging feature. 
*/ #ifndef KTRACE __strong_alias(ktr_csw,nullop); /* Probes */ __strong_alias(ktr_emul,nullop); __strong_alias(ktr_geniov,nullop); __strong_alias(ktr_genio,nullop); __strong_alias(ktr_mibio,nullop); __strong_alias(ktr_namei,nullop); __strong_alias(ktr_namei2,nullop); __strong_alias(ktr_psig,nullop); __strong_alias(ktr_syscall,nullop); __strong_alias(ktr_sysret,nullop); __strong_alias(ktr_kuser,nullop); __strong_alias(ktr_mib,nullop); __strong_alias(ktr_execarg,nullop); __strong_alias(ktr_execenv,nullop); __strong_alias(ktr_execfd,nullop); __strong_alias(sys_fktrace,sys_nosys); /* Syscalls */ __strong_alias(sys_ktrace,sys_nosys); __strong_alias(sys_utrace,sys_nosys); int ktrace_on; /* Misc */ __strong_alias(ktruser,enosys); __strong_alias(ktr_point,nullop); #endif /* KTRACE */ __weak_alias(device_register, voidop); __weak_alias(device_register_post_config, voidop); __weak_alias(spldebug_start, voidop); __weak_alias(spldebug_stop, voidop); __weak_alias(machdep_init,nullop); __weak_alias(pci_chipset_tag_create, eopnotsupp); __weak_alias(pci_chipset_tag_destroy, voidop); __weak_alias(bus_space_reserve, eopnotsupp); __weak_alias(bus_space_reserve_subregion, eopnotsupp); __weak_alias(bus_space_release, voidop); __weak_alias(bus_space_reservation_map, eopnotsupp); __weak_alias(bus_space_reservation_unmap, voidop); __weak_alias(bus_dma_tag_create, eopnotsupp); __weak_alias(bus_dma_tag_destroy, voidop); __weak_alias(bus_space_tag_create, eopnotsupp); __weak_alias(bus_space_tag_destroy, voidop); __strict_weak_alias(bus_space_is_equal, default_bus_space_is_equal); __strict_weak_alias(bus_space_handle_is_equal, default_bus_space_handle_is_equal); __weak_alias(userconf_bootinfo, voidop); __weak_alias(userconf_init, voidop); __weak_alias(userconf_prompt, voidop); __weak_alias(kobj_renamespace, nullop); __weak_alias(interrupt_get_count, nullop); __weak_alias(interrupt_get_assigned, voidop); __weak_alias(interrupt_get_available, voidop); __weak_alias(interrupt_get_devname, voidop); __weak_alias(interrupt_construct_intrids, nullret); __weak_alias(interrupt_destruct_intrids, voidop); __weak_alias(interrupt_distribute, eopnotsupp); __weak_alias(interrupt_distribute_handler, eopnotsupp); /* * Scheduler activations system calls. These need to remain until libc's * major version is bumped. */ __strong_alias(sys_sa_register,sys_nosys); __strong_alias(sys_sa_stacks,sys_nosys); __strong_alias(sys_sa_enable,sys_nosys); __strong_alias(sys_sa_setconcurrency,sys_nosys); __strong_alias(sys_sa_yield,sys_nosys); __strong_alias(sys_sa_preempt,sys_nosys); __strong_alias(sys_sa_unblockyield,sys_nosys); /* * Stubs for compat_netbsd32. */ __strong_alias(dosa_register,sys_nosys); __strong_alias(sa_stacks1,sys_nosys); /* * Stubs for drivers. See sys/conf.h. */ __strong_alias(devenodev,enodev); __strong_alias(deveopnotsupp,eopnotsupp); __strong_alias(devnullop,nullop); __strong_alias(ttyenodev,enodev); __strong_alias(ttyvenodev,voidop); __strong_alias(ttyvnullop,nullop); /* * Stubs for architectures that do not support kernel preemption. 
*/ #ifndef __HAVE_PREEMPTION bool cpu_kpreempt_enter(uintptr_t where, int s) { return false; } void cpu_kpreempt_exit(uintptr_t where) { } bool cpu_kpreempt_disabled(void) { return true; } #else # ifndef MULTIPROCESSOR # error __HAVE_PREEMPTION requires MULTIPROCESSOR # endif #endif /* !__HAVE_PREEMPTION */ int sys_nosys(struct lwp *l, const void *v, register_t *retval) { mutex_enter(&proc_lock); psignal(l->l_proc, SIGSYS); mutex_exit(&proc_lock); return ENOSYS; } /* * Unsupported device function (e.g. writing to read-only device). */ int enodev(void) { return (ENODEV); } /* * Unconfigured device function; driver not configured. */ int enxio(void) { return (ENXIO); } /* * Unsupported ioctl function. */ int enoioctl(void) { return (ENOTTY); } /* * Unsupported system function. * This is used for an otherwise-reasonable operation * that is not supported by the current system binary. */ int enosys(void) { return (ENOSYS); } /* * Return error for operation not supported * on a specific object or file type. */ int eopnotsupp(void) { return (EOPNOTSUPP); } /* * Generic null operation, void return value. */ void voidop(void) { } /* * Generic null operation, always returns success. */ int nullop(void *v) { return (0); } /* * Generic null operation, always returns null. */ void * nullret(void) { return (NULL); } bool default_bus_space_handle_is_equal(bus_space_tag_t t, bus_space_handle_t h1, bus_space_handle_t h2) { return memcmp(&h1, &h2, sizeof(h1)) == 0; } bool default_bus_space_is_equal(bus_space_tag_t t1, bus_space_tag_t t2) { return memcmp(&t1, &t2, sizeof(t1)) == 0; } /* Stubs for architectures with no kernel FPU access. */ __weak_alias(kthread_fpu_enter_md, voidop); __weak_alias(kthread_fpu_exit_md, voidop);
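/*
 * Illustrative standalone sketch (not part of kern_stub.c): the
 * __strong_alias()/__weak_alias() stubs above let one tiny body such as
 * enosys() or nullop() stand in for many facilities left out of the
 * kernel build.  A similar effect can be shown in userland with the
 * GCC/Clang "weak" and "alias" attributes on ELF platforms (the kernel
 * macros themselves are defined in <sys/cdefs.h>); the demo_* names
 * below are invented for the example.
 */
#include <errno.h>
#include <stdio.h>

/* The one real stub body, analogous to enosys(). */
int demo_enosys(void) { return ENOSYS; }

/*
 * A weak alias: it resolves to demo_enosys() unless another object file
 * provides a real demo_msgctl1(), much like the SYSVMSG stub above.
 */
int demo_msgctl1(void) __attribute__((weak, alias("demo_enosys")));

int
main(void)
{
	printf("demo_msgctl1() -> %d (ENOSYS is %d)\n",
	    demo_msgctl1(), ENOSYS);
	return 0;
}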
/* $NetBSD: bus_private.h,v 1.16 2022/01/22 15:10:32 skrll Exp $ */ /* NetBSD: bus.h,v 1.8 2005/03/09 19:04:46 matt Exp */ /*- * Copyright (c) 1996, 1997, 1998, 2001 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1996 Charles M. Hannum. All rights reserved. * Copyright (c) 1996 Christopher G. Demetriou. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christopher G. Demetriou * for the NetBSD Project. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #if !defined(_X86_BUS_PRIVATE_H_) #define _X86_BUS_PRIVATE_H_ /* * Cookie used for bounce buffers. A pointer to one of these is stashed in * the DMA map. */ struct x86_bus_dma_cookie { int id_flags; /* flags; see below */ /* * Information about the original buffer used during * DMA map syncs. Note that id_origbuflen is only used * for X86_DMA_BUFTYPE_LINEAR. */ void *id_origbuf; /* pointer to orig buffer if bouncing */ bus_size_t id_origbuflen; /* ...and size */ int id_buftype; /* type of buffer */ void *id_bouncebuf; /* pointer to the bounce buffer */ bus_size_t id_bouncebuflen; /* ...and size */ int id_nbouncesegs; /* number of valid bounce segs */ bus_dma_segment_t id_bouncesegs[0]; /* array of bounce buffer physical memory segments */ }; /* id_flags */ #define X86_DMA_MIGHT_NEED_BOUNCE 0x01 /* may need bounce buffers */ #define X86_DMA_HAS_BOUNCE 0x02 /* has bounce buffers */ #define X86_DMA_IS_BOUNCING 0x04 /* is bouncing current xfer */ /* id_buftype */ #define X86_DMA_BUFTYPE_INVALID 0 #define X86_DMA_BUFTYPE_LINEAR 1 #define X86_DMA_BUFTYPE_MBUF 2 #define X86_DMA_BUFTYPE_UIO 3 #define X86_DMA_BUFTYPE_RAW 4 /* * default address translation macros, which are appropriate where * paddr_t == bus_addr_t. */ #if !defined(_BUS_PHYS_TO_BUS) #define _BUS_PHYS_TO_BUS(pa) ((bus_addr_t)(pa)) #endif /* !defined(_BUS_PHYS_TO_BUS) */ #if !defined(_BUS_BUS_TO_PHYS) #define _BUS_BUS_TO_PHYS(ba) ((paddr_t)(ba)) #endif /* !defined(_BUS_BUS_TO_PHYS) */ #if !defined(_BUS_VM_PAGE_TO_BUS) #define _BUS_VM_PAGE_TO_BUS(pg) _BUS_PHYS_TO_BUS(VM_PAGE_TO_PHYS(pg)) #endif /* !defined(_BUS_VM_PAGE_TO_BUS) */ #if !defined(_BUS_BUS_TO_VM_PAGE) #define _BUS_BUS_TO_VM_PAGE(ba) PHYS_TO_VM_PAGE(ba) #endif /* !defined(_BUS_BUS_TO_VM_PAGE) */ #if !defined(_BUS_PMAP_ENTER) #define _BUS_PMAP_ENTER(pmap, va, ba, prot, flags) \ pmap_enter(pmap, va, ba, prot, flags) #endif /* _BUS_PMAP_ENTER */ #if !defined(_BUS_VIRT_TO_BUS) #include <uvm/uvm_extern.h> static __inline bus_addr_t _bus_virt_to_bus(struct pmap *, vaddr_t); #define _BUS_VIRT_TO_BUS(pm, va) _bus_virt_to_bus((pm), (va)) static __inline bus_addr_t _bus_virt_to_bus(struct pmap *pm, vaddr_t va) { paddr_t pa; if (!pmap_extract(pm, va, &pa)) { panic("_bus_virt_to_bus"); } return _BUS_PHYS_TO_BUS(pa); } #endif /* !defined(_BUS_VIRT_TO_BUS) */ /* * by default, the end address of RAM visible on bus is the same as the * largest physical address. */ #ifndef _BUS_AVAIL_END #define _BUS_AVAIL_END (avail_end - 1) #endif struct x86_bus_dma_tag { bus_dma_tag_t bdt_super; /* bdt_present: bitmap indicating overrides present (1) in *this* tag, * bdt_exists: bitmap indicating overrides present (1) in *this* tag * or in an ancestor's tag (follow bdt_super to ancestors) */ uint64_t bdt_present; uint64_t bdt_exists; const struct bus_dma_overrides *bdt_ov; void *bdt_ctx; /* * The `bounce threshold' is checked while we are loading * the DMA map. If the physical address of the segment * exceeds the threshold, an error will be returned. 
The * caller can then take whatever action is necessary to * bounce the transfer. If this value is 0, it will be * ignored. */ int _tag_needs_free; bus_addr_t _bounce_thresh; bus_addr_t _bounce_alloc_lo; bus_addr_t _bounce_alloc_hi; int (*_may_bounce)(bus_dma_tag_t, bus_dmamap_t, int, int *); }; #endif /* !defined(_X86_BUS_PRIVATE_H_) */
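/*
 * Illustrative standalone sketch (not part of bus_private.h): how the
 * bdt_present/bdt_exists bitmaps described above can be combined.  A
 * tag's "exists" mask is its own overrides plus everything inherited
 * through the bdt_super chain, so a lookup can test one word before
 * walking up to an ancestor.  All names here (demo_tag, demo_tag_link,
 * demo_tag_resolve) are invented for the example.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_tag {
	struct demo_tag *super;		/* parent tag, like bdt_super */
	uint64_t present;		/* overrides defined in this tag */
	uint64_t exists;		/* present | ancestors' present */
};

/* Recompute "exists" the way the structure comment above describes. */
static void
demo_tag_link(struct demo_tag *t, struct demo_tag *super)
{
	t->super = super;
	t->exists = t->present | (super != NULL ? super->exists : 0);
}

/* Find the nearest tag that actually defines the single override bit. */
static struct demo_tag *
demo_tag_resolve(struct demo_tag *t, uint64_t bit)
{
	if ((t->exists & bit) == 0)
		return NULL;		/* nobody in the chain overrides it */
	while ((t->present & bit) == 0)
		t = t->super;
	return t;
}

int
main(void)
{
	struct demo_tag root = { .present = 0x1 }, child = { .present = 0x4 };

	demo_tag_link(&root, NULL);
	demo_tag_link(&child, &root);
	printf("bit 0x1 resolved in %s\n",
	    demo_tag_resolve(&child, 0x1) == &root ? "root" : "child");
	printf("bit 0x2 resolved: %s\n",
	    demo_tag_resolve(&child, 0x2) == NULL ? "nowhere" : "somewhere");
	return 0;
}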
/* $NetBSD: portalgo.c,v 1.15 2022/11/04 09:01:53 ozaki-r Exp $ */ /* * Copyright 2011 Vlad Balan * * Written by Vlad Balan for the NetBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * see: * RFC 6056 Recommendations for Transport-Protocol Port Randomization */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: portalgo.c,v 1.15 2022/11/04 09:01:53 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #endif #include <sys/param.h> #include <sys/errno.h> #include <sys/kauth.h> #include <sys/uidinfo.h> #include <sys/md5.h> #include <sys/cprng.h> #include <sys/bitops.h> #include <net/if.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/in_var.h> #include <netinet/ip_var.h> #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_pcb.h> #endif #include <netinet/tcp_vtw.h> #include "portalgo.h" #define NPROTO 2 #define PORTALGO_TCP 0 #define PORTALGO_UDP 1 #define NAF 2 #define PORTALGO_IPV4 0 #define PORTALGO_IPV6 1 #define NRANGES 2 #define PORTALGO_LOWPORT 0 #define PORTALGO_HIGHPORT 1 #if PORTALGO_DEBUG static bool portalgo_debug = true; #define DPRINTF if (portalgo_debug) printf #else #define DPRINTF while (/*CONSTCOND*/0) printf #endif #ifndef PORTALGO_INET4_DEFAULT #define PORTALGO_INET4_DEFAULT PORTALGO_BSD #endif #ifndef PORTALGO_INET6_DEFAULT #define PORTALGO_INET6_DEFAULT PORTALGO_BSD #endif typedef __BITMAP_TYPE(, uint32_t, 0x10000) bitmap; #ifdef INET static int inet4_portalgo = PORTALGO_INET4_DEFAULT; static bitmap inet4_reserve; #endif #ifdef INET6 static int inet6_portalgo = PORTALGO_INET6_DEFAULT; static bitmap inet6_reserve; #endif typedef struct { const char *name; int (*func)(int, uint16_t *, struct inpcb *, kauth_cred_t); } portalgo_algorithm_t; static int algo_bsd(int, uint16_t *, struct inpcb *, kauth_cred_t); static int algo_random_start(int, uint16_t *, struct inpcb *, kauth_cred_t); static int algo_random_pick(int, uint16_t *, 
struct inpcb *, kauth_cred_t); static int algo_hash(int, uint16_t *, struct inpcb *, kauth_cred_t); static int algo_doublehash(int, uint16_t *, struct inpcb *, kauth_cred_t); static int algo_randinc(int, uint16_t *, struct inpcb *, kauth_cred_t); static const portalgo_algorithm_t algos[] = { { .name = "bsd", .func = algo_bsd }, { .name = "random_start", .func = algo_random_start }, { .name = "random_pick", .func = algo_random_pick }, { .name = "hash", .func = algo_hash }, { .name = "doublehash", .func = algo_doublehash }, { .name = "randinc", .func = algo_randinc } }; #define NALGOS __arraycount(algos) static uint16_t portalgo_next_ephemeral[NPROTO][NAF][NRANGES][NALGOS]; /* * Access the pcb and copy the values of the last port and the ends of * the port range. */ static int pcb_getports(struct inpcb *inp, uint16_t *lastport, uint16_t *mymin, uint16_t *mymax, uint16_t **pnext_ephemeral, int algo) { struct inpcbtable * const table = inp->inp_table; struct socket *so; int portalgo_proto; int portalgo_af; int portalgo_range; so = inp->inp_socket; switch (so->so_type) { case SOCK_DGRAM: /* UDP or DCCP */ case SOCK_CONN_DGRAM: portalgo_proto = PORTALGO_UDP; break; case SOCK_STREAM: /* TCP or SCTP */ portalgo_proto = PORTALGO_TCP; break; default: return EPFNOSUPPORT; } switch (inp->inp_af) { #ifdef INET case AF_INET: { portalgo_af = PORTALGO_IPV4; if (inp->inp_flags & INP_LOWPORT) { *mymin = lowportmin; *mymax = lowportmax; *lastport = table->inpt_lastlow; portalgo_range = PORTALGO_LOWPORT; } else { *mymin = anonportmin; *mymax = anonportmax; *lastport = table->inpt_lastport; portalgo_range = PORTALGO_HIGHPORT; } break; } #endif #ifdef INET6 case AF_INET6: { portalgo_af = PORTALGO_IPV6; if (inp->inp_flags & IN6P_LOWPORT) { *mymin = ip6_lowportmin; *mymax = ip6_lowportmax; *lastport = table->inpt_lastlow; portalgo_range = PORTALGO_LOWPORT; } else { *mymin = ip6_anonportmin; *mymax = ip6_anonportmax; *lastport = table->inpt_lastport; portalgo_range = PORTALGO_HIGHPORT; } break; } #endif default: return EAFNOSUPPORT; } if (*mymin > *mymax) { /* sanity check */ u_int16_t swp; swp = *mymin; *mymin = *mymax; *mymax = swp; } DPRINTF("%s mymin:%d mymax:%d lastport:%d\n", __func__, *mymin, *mymax, *lastport); *pnext_ephemeral = &portalgo_next_ephemeral[portalgo_proto] [portalgo_af][portalgo_range][algo]; DPRINTF("%s portalgo_proto:%d portalgo_af:%d portalgo_range:%d\n", __func__, portalgo_proto, portalgo_af, portalgo_range); return 0; } /* * Check whether the port picked by the port randomizer is available * and whether KAUTH approves of our choice. This part of the code * shamelessly copied from in_pcb.c. */ static bool check_suitable_port(uint16_t port, struct inpcb *inp, kauth_cred_t cred) { struct inpcbtable * const table = inp->inp_table; #ifdef INET vestigial_inpcb_t vestigial; #endif int error; #ifdef INET6 struct socket *so; int wild = 0; #endif DPRINTF("%s called for argument %d\n", __func__, port); switch (inp->inp_af) { #ifdef INET case AF_INET: { /* IPv4 */ struct inpcb *pcb; struct sockaddr_in sin; if (__BITMAP_ISSET(port, &inet4_reserve)) return false; sin.sin_addr = in4p_laddr(inp); pcb = inpcb_lookup_local(table, sin.sin_addr, htons(port), 1, &vestigial); DPRINTF("%s inpcb_lookup_local returned %p and " "vestigial.valid %d\n", __func__, pcb, vestigial.valid); if ((!pcb) && (!vestigial.valid)) { enum kauth_network_req req; /* We have a free port. Check with the secmodel. 
*/ if (inp->inp_flags & INP_LOWPORT) { #ifndef IPNOPRIVPORTS req = KAUTH_REQ_NETWORK_BIND_PRIVPORT; #else req = KAUTH_REQ_NETWORK_BIND_PORT; #endif } else req = KAUTH_REQ_NETWORK_BIND_PORT; sin.sin_port = port; error = kauth_authorize_network(cred, KAUTH_NETWORK_BIND, req, inp->inp_socket, &sin, NULL); DPRINTF("%s kauth_authorize_network returned %d\n", __func__, error); if (error == 0) { DPRINTF("%s port approved\n", __func__); return true; /* KAUTH agrees */ } } break; } #endif #ifdef INET6 case AF_INET6: { /* IPv6 */ struct sockaddr_in6 sin6; void *t; if (__BITMAP_ISSET(port, &inet6_reserve)) return false; sin6.sin6_addr = in6p_laddr(inp); so = inp->inp_socket; /* XXX: this is redundant when called from in6pcb_bind */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 && ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || (so->so_options & SO_ACCEPTCONN) == 0)) wild = 1; #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { t = inpcb_lookup_local(table, *(struct in_addr *)&sin6.sin6_addr.s6_addr32[3], htons(port), wild, &vestigial); if (!t && vestigial.valid) { DPRINTF("%s inpcb_lookup_local returned " "a result\n", __func__); return false; } } else #endif { t = in6pcb_lookup_local(table, &sin6.sin6_addr, htons(port), wild, &vestigial); if (!t && vestigial.valid) { DPRINTF("%s in6pcb_lookup_local returned " "a result\n", __func__); return false; } } if (t == NULL) { enum kauth_network_req req; /* We have a free port. Check with the secmodel. */ if (inp->inp_flags & IN6P_LOWPORT) { #ifndef IPNOPRIVPORTS req = KAUTH_REQ_NETWORK_BIND_PRIVPORT; #else req = KAUTH_REQ_NETWORK_BIND_PORT; #endif } else { req = KAUTH_REQ_NETWORK_BIND_PORT; } sin6.sin6_port = port; error = kauth_authorize_network(cred, KAUTH_NETWORK_BIND, req, so, &sin6, NULL); if (error) { /* Secmodel says no. Keep looking. */ DPRINTF("%s secmodel says no\n", __func__); return false; } DPRINTF("%s port approved\n", __func__); return true; } break; } #endif default: DPRINTF("%s unknown address family\n", __func__); return false; } return false; } /* This is the default BSD algorithm, as described in RFC 6056 */ static int algo_bsd(int algo, uint16_t *port, struct inpcb *inp, kauth_cred_t cred) { uint16_t count; uint16_t mymin, mymax, lastport; uint16_t *next_ephemeral; int error; DPRINTF("%s called\n", __func__); error = pcb_getports(inp, &lastport, &mymin, &mymax, &next_ephemeral, algo); if (error) return error; count = mymax - mymin + 1; do { uint16_t myport = *next_ephemeral; if (myport < mymin || mymax < myport) myport = mymax; *next_ephemeral = myport - 1; if (check_suitable_port(myport, inp, cred)) { *port = myport; DPRINTF("%s returning port %d\n", __func__, *port); return 0; } count--; } while (count > 0); DPRINTF("%s returning EAGAIN\n", __func__); return EAGAIN; } /* * The straightforward algorithm that increments the port number * by a random amount. 
*/ static int algo_random_start(int algo, uint16_t *port, struct inpcb *inp, kauth_cred_t cred) { uint16_t count, num_ephemeral; uint16_t mymin, mymax, lastport; uint16_t *next_ephemeral; int error; DPRINTF("%s called\n", __func__); error = pcb_getports(inp, &lastport, &mymin, &mymax, &next_ephemeral, algo); if (error) return error; num_ephemeral = mymax - mymin + 1; DPRINTF("num_ephemeral: %u\n", num_ephemeral); *next_ephemeral = mymin + (cprng_fast32() % num_ephemeral); DPRINTF("next_ephemeral initially: %u\n", *next_ephemeral); count = num_ephemeral; do { if (check_suitable_port(*next_ephemeral, inp, cred)) { *port = *next_ephemeral; DPRINTF("%s returning port %d\n", __func__, *port); return 0; } if (*next_ephemeral == mymax) { *next_ephemeral = mymin; } else (*next_ephemeral)++; count--; DPRINTF("next_ephemeral: %u count: %u\n", *next_ephemeral, count); } while (count > 0); DPRINTF("%s returning EINVAL\n", __func__); return EINVAL; } /* * Since there is no state kept on the ports tried, we might actually * give up before exhausting the free ports. */ static int algo_random_pick(int algo, uint16_t *port, struct inpcb *inp, kauth_cred_t cred) { uint16_t count, num_ephemeral; uint16_t mymin, mymax, lastport; uint16_t *next_ephemeral; int error; DPRINTF("%s called\n", __func__); error = pcb_getports(inp, &lastport, &mymin, &mymax, &next_ephemeral, algo); if (error) return error; num_ephemeral = mymax - mymin + 1; DPRINTF("num_ephemeral: %u\n", num_ephemeral); *next_ephemeral = mymin + (cprng_fast32() % num_ephemeral); DPRINTF("next_ephemeral initially: %u\n", *next_ephemeral); count = num_ephemeral; do { if (check_suitable_port(*next_ephemeral, inp, cred)) { *port = *next_ephemeral; DPRINTF("%s returning port %d\n", __func__, *port); return 0; } *next_ephemeral = mymin + (cprng_fast32() % num_ephemeral); count--; DPRINTF("next_ephemeral: %u count: %u\n", *next_ephemeral, count); } while (count > 0); DPRINTF("%s returning EINVAL\n", __func__); return EINVAL; } /* This is the implementation from FreeBSD, with tweaks */ static uint16_t Fhash(const struct inpcb *inp) { MD5_CTX f_ctx; uint32_t Ff[4]; uint32_t secret_f[4]; uint32_t offset; uint16_t soffset[2]; cprng_fast(secret_f, sizeof(secret_f)); MD5Init(&f_ctx); switch (inp->inp_af) { #ifdef INET case AF_INET: { MD5Update(&f_ctx, (const u_char *)&const_in4p_laddr(inp), sizeof(const_in4p_laddr(inp))); MD5Update(&f_ctx, (const u_char *)&const_in4p_faddr(inp), sizeof(const_in4p_faddr(inp))); MD5Update(&f_ctx, (const u_char *)&inp->inp_fport, sizeof(inp->inp_fport)); break; } #endif #ifdef INET6 case AF_INET6: { MD5Update(&f_ctx, (const u_char *)&const_in6p_laddr(inp), sizeof(const_in6p_laddr(inp))); MD5Update(&f_ctx, (const u_char *)&const_in6p_faddr(inp), sizeof(const_in6p_faddr(inp))); MD5Update(&f_ctx, (const u_char *)&inp->inp_fport, sizeof(inp->inp_fport)); break; } #endif default: break; } MD5Update(&f_ctx, (const u_char *)secret_f, sizeof(secret_f)); MD5Final((u_char *)&Ff, &f_ctx); offset = (Ff[0] ^ Ff[1]) ^ (Ff[2] ^ Ff[3]); memcpy(&soffset, &offset, sizeof(soffset)); return soffset[0] ^ soffset[1]; } /* * Checks whether the tuple is complete. If not, marks the pcb for * late binding. 
*/ static bool iscompletetuple(struct inpcb *inp) { switch (inp->inp_af) { #ifdef INET case AF_INET: { if (inp->inp_fport == 0 || in_nullhost(in4p_faddr(inp))) { DPRINTF("%s fport or faddr missing, delaying port " "to connect/send\n", __func__); inp->inp_bindportonsend = true; return false; } else { inp->inp_bindportonsend = false; } break; } #endif #ifdef INET6 case AF_INET6: { if (inp->inp_fport == 0 || memcmp(&in6p_faddr(inp), &in6addr_any, sizeof(in6p_faddr(inp))) == 0) { DPRINTF("%s fport or faddr missing, delaying port " "to connect/send\n", __func__); inp->inp_bindportonsend = true; return false; } else { inp->inp_bindportonsend = false; } break; } #endif default: DPRINTF("%s incorrect address family\n", __func__); return false; } return true; } static int algo_hash(int algo, uint16_t *port, struct inpcb *inp, kauth_cred_t cred) { uint16_t count, num_ephemeral; uint16_t mymin, mymax, lastport; uint16_t *next_ephemeral; uint16_t offset, myport; int error; DPRINTF("%s called\n", __func__); error = pcb_getports(inp, &lastport, &mymin, &mymax, &next_ephemeral, algo); if (error) return error; if (!iscompletetuple(inp)) { *port = 0; return 0; } /* Ephemeral port selection function */ num_ephemeral = mymax - mymin + 1; DPRINTF("num_ephemeral: %d\n", num_ephemeral); offset = Fhash(inp); count = num_ephemeral; do { myport = mymin + (*next_ephemeral + offset) % num_ephemeral; (*next_ephemeral)++; if (check_suitable_port(myport, inp, cred)) { *port = myport; DPRINTF("%s returning port %d\n", __func__, *port); return 0; } count--; } while (count > 0); DPRINTF("%s returning EINVAL\n", __func__); return EINVAL; } static int algo_doublehash(int algo, uint16_t *port, struct inpcb *inp, kauth_cred_t cred) { uint16_t count, num_ephemeral; uint16_t mymin, mymax, lastport; uint16_t *next_ephemeral; uint16_t offset, myport; static uint16_t dhtable[8]; size_t idx; int error; DPRINTF("%s called\n", __func__); error = pcb_getports(inp, &lastport, &mymin, &mymax, &next_ephemeral, algo); if (error) return error; if (!iscompletetuple(inp)) { *port = 0; return 0; } /* first time initialization */ if (dhtable[0] == 0) for (size_t i = 0; i < __arraycount(dhtable); i++) dhtable[i] = cprng_fast32() & 0xffff; /* Ephemeral port selection function */ num_ephemeral = mymax - mymin + 1; offset = Fhash(inp); idx = Fhash(inp) % __arraycount(dhtable); /* G */ count = num_ephemeral; do { myport = mymin + (offset + dhtable[idx]) % num_ephemeral; dhtable[idx]++; if (check_suitable_port(myport, inp, cred)) { *port = myport; DPRINTF("%s returning port %d\n", __func__, *port); return 0; } count--; } while (count > 0); DPRINTF("%s returning EINVAL\n", __func__); return EINVAL; } static int algo_randinc(int algo, uint16_t *port, struct inpcb *inp, kauth_cred_t cred) { static const uint16_t N = 500; /* Determines the trade-off */ uint16_t count, num_ephemeral; uint16_t mymin, mymax, lastport; uint16_t *next_ephemeral; uint16_t myport; int error; DPRINTF("%s called\n", __func__); error = pcb_getports(inp, &lastport, &mymin, &mymax, &next_ephemeral, algo); if (error) return error; if (*next_ephemeral == 0) *next_ephemeral = cprng_fast32() & 0xffff; /* Ephemeral port selection function */ num_ephemeral = mymax - mymin + 1; count = num_ephemeral; do { *next_ephemeral = *next_ephemeral + (cprng_fast32() % N) + 1; myport = mymin + (*next_ephemeral % num_ephemeral); if (check_suitable_port(myport, inp, cred)) { *port = myport; DPRINTF("%s returning port %d\n", __func__, *port); return 0; } count--; } while (count > 0); return EINVAL; } 
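/*
 * Illustrative standalone sketch (not part of portalgo.c): the
 * hash-offset idea used by algo_hash() above (RFC 6056, Algorithm 3).
 * The port is min + (next + F(connection tuple)) % N, and only "next"
 * advances between attempts, so unrelated tuples walk the range in
 * unrelated orders.  A toy hash stands in for the MD5-based Fhash();
 * all names below are invented for the example, and the range is
 * assumed to be smaller than the full 16-bit space.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t
toy_tuple_hash(uint32_t laddr, uint32_t faddr, uint16_t fport)
{
	uint32_t h = laddr * 2654435761u ^ faddr * 40503u ^ fport;

	return (uint16_t)(h ^ (h >> 16));
}

static uint16_t
toy_pick_port(uint16_t min, uint16_t max, uint16_t *next,
    uint32_t laddr, uint32_t faddr, uint16_t fport)
{
	uint16_t n = max - min + 1;
	uint16_t off = toy_tuple_hash(laddr, faddr, fport);
	uint16_t port = min + (uint16_t)(*next + off) % n;

	(*next)++;		/* only the shared counter advances */
	return port;
}

int
main(void)
{
	uint16_t next = 0;

	for (int i = 0; i < 3; i++)
		printf("picked %u\n",
		    toy_pick_port(49152, 65535, &next,
			0x0a000001, 0xc0a80001, 80));
	return 0;
}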
/* The generic function called in order to pick a port. */ int portalgo_randport(uint16_t *port, struct inpcb *inp, kauth_cred_t cred) { int algo, error; uint16_t lport; int default_algo; DPRINTF("%s called\n", __func__); if (inp->inp_portalgo == PORTALGO_DEFAULT) { switch (inp->inp_af) { #ifdef INET case AF_INET: default_algo = inet4_portalgo; break; #endif #ifdef INET6 case AF_INET6: default_algo = inet6_portalgo; break; #endif default: return EINVAL; } if (default_algo == PORTALGO_DEFAULT) algo = PORTALGO_BSD; else algo = default_algo; } else /* socket specifies the algorithm */ algo = inp->inp_portalgo; KASSERT(algo >= 0); KASSERT(algo < NALGOS); switch (inp->inp_af) { #ifdef INET case AF_INET: { char buf[INET_ADDRSTRLEN]; DPRINTF("local addr: %s\n", IN_PRINT(buf, &in4p_laddr(inp))); DPRINTF("local port: %d\n", inp->inp_lport); DPRINTF("foreign addr: %s\n", IN_PRINT(buf, &in4p_faddr(inp))); DPRINTF("foreign port: %d\n", inp->inp_fport); break; } #endif #ifdef INET6 case AF_INET6: { char buf[INET6_ADDRSTRLEN]; DPRINTF("local addr: %s\n", IN6_PRINT(buf, &in6p_laddr(inp))); DPRINTF("local port: %d\n", inp->inp_lport); DPRINTF("foreign addr: %s\n", IN6_PRINT(buf, &in6p_faddr(inp))); DPRINTF("foreign port: %d\n", inp->inp_fport); break; } #endif default: break; } DPRINTF("%s portalgo = %d\n", __func__, algo); error = (*algos[algo].func)(algo, &lport, inp, cred); if (error == 0) { *port = lport; } else if (error != EAGAIN) { uint16_t lastport, mymin, mymax, *pnext_ephemeral; error = pcb_getports(inp, &lastport, &mymin, &mymax, &pnext_ephemeral, algo); if (error) return error; *port = lastport - 1; } return error; } /* Sets the algorithm to be used globally */ static int portalgo_algo_name_select(const char *name, int *algo) { size_t ai; DPRINTF("%s called\n", __func__); for (ai = 0; ai < NALGOS; ai++) if (strcmp(algos[ai].name, name) == 0) { DPRINTF("%s: found idx %zu\n", __func__, ai); *algo = ai; return 0; } return EINVAL; } /* Sets the algorithm to be used by the pcb inp. */ int portalgo_algo_index_select(struct inpcb *inp, int algo) { DPRINTF("%s called with algo %d for pcb %p\n", __func__, algo, inp); if ((algo < 0 || algo >= NALGOS) && (algo != PORTALGO_DEFAULT)) return EINVAL; inp->inp_portalgo = algo; return 0; } /* * The sysctl hook that is supposed to check that we are picking one * of the valid algorithms. 
*/ static int sysctl_portalgo_selected(SYSCTLFN_ARGS, int *algo) { struct sysctlnode node; int error; char newalgo[PORTALGO_MAXLEN]; DPRINTF("%s called\n", __func__); strlcpy(newalgo, algos[*algo].name, sizeof(newalgo)); node = *rnode; node.sysctl_data = newalgo; node.sysctl_size = sizeof(newalgo); error = sysctl_lookup(SYSCTLFN_CALL(&node)); DPRINTF("newalgo: %s\n", newalgo); if (error || newp == NULL || strncmp(newalgo, algos[*algo].name, sizeof(newalgo)) == 0) return error; #ifdef KAUTH_NETWORK_SOCKET_PORT_RANDOMIZE if (l != NULL && (error = kauth_authorize_system(l->l_cred, KAUTH_NETWORK_SOCKET, KAUTH_NETWORK_SOCKET_PORT_RANDOMIZE, newname, NULL, NULL)) != 0) return error; #endif mutex_enter(softnet_lock); error = portalgo_algo_name_select(newalgo, algo); mutex_exit(softnet_lock); return error; } static int sysctl_portalgo_reserve(SYSCTLFN_ARGS, bitmap *bt) { struct sysctlnode node; int error; DPRINTF("%s called\n", __func__); node = *rnode; node.sysctl_data = bt; node.sysctl_size = sizeof(*bt); error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; #ifdef KAUTH_NETWORK_SOCKET_PORT_RESERVE if (l != NULL && (error = kauth_authorize_system(l->l_cred, KAUTH_NETWORK_SOCKET, KAUTH_NETWORK_SOCKET_PORT_RESERVE, bt, NULL, NULL)) != 0) return error; #endif return error; } #ifdef INET /* * The sysctl hook that is supposed to check that we are picking one * of the valid algorithms. */ int sysctl_portalgo_selected4(SYSCTLFN_ARGS) { return sysctl_portalgo_selected(SYSCTLFN_CALL(rnode), &inet4_portalgo); } int sysctl_portalgo_reserve4(SYSCTLFN_ARGS) { return sysctl_portalgo_reserve(SYSCTLFN_CALL(rnode), &inet4_reserve); } #endif #ifdef INET6 int sysctl_portalgo_selected6(SYSCTLFN_ARGS) { return sysctl_portalgo_selected(SYSCTLFN_CALL(rnode), &inet6_portalgo); } int sysctl_portalgo_reserve6(SYSCTLFN_ARGS) { return sysctl_portalgo_reserve(SYSCTLFN_CALL(rnode), &inet6_reserve); } #endif /* * The sysctl hook that returns the available * algorithms. */ int sysctl_portalgo_available(SYSCTLFN_ARGS) { size_t ai, len = 0; struct sysctlnode node; char availalgo[NALGOS * PORTALGO_MAXLEN]; DPRINTF("%s called\n", __func__); availalgo[0] = '\0'; for (ai = 0; ai < NALGOS; ai++) { len = strlcat(availalgo, algos[ai].name, sizeof(availalgo)); if (ai < NALGOS - 1) strlcat(availalgo, " ", sizeof(availalgo)); } DPRINTF("available algos: %s\n", availalgo); node = *rnode; node.sysctl_data = availalgo; node.sysctl_size = len; return sysctl_lookup(SYSCTLFN_CALL(&node)); }
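/*
 * Illustrative standalone sketch (not part of portalgo.c): the
 * inet4_reserve/inet6_reserve bitmaps above hold one bit per 16-bit
 * port, so check_suitable_port() can reject a reserved port with a
 * single __BITMAP_ISSET() test.  Plain C stands in for <sys/bitops.h>;
 * the demo_* names below are invented for the example.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PORT_BITMAP_WORDS	(0x10000 / 32)	/* 65536 ports, 32 bits/word */

static uint32_t demo_reserve[PORT_BITMAP_WORDS];

static void
demo_reserve_port(uint16_t port)
{
	demo_reserve[port / 32] |= 1u << (port % 32);
}

static bool
demo_port_is_reserved(uint16_t port)
{
	return (demo_reserve[port / 32] & (1u << (port % 32))) != 0;
}

int
main(void)
{
	demo_reserve_port(6000);	/* e.g. keep X11's port out of the pool */
	printf("6000 reserved: %d\n", demo_port_is_reserved(6000));
	printf("6001 reserved: %d\n", demo_port_is_reserved(6001));
	return 0;
}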
/* $NetBSD: subr_disk.c,v 1.137 2023/05/09 12:04:04 riastradh Exp $ */ /*- * Copyright (c) 1996, 1997, 1999, 2000, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.137 2023/05/09 12:04:04 riastradh Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/buf.h> #include <sys/fcntl.h> #include <sys/syslog.h> #include <sys/disklabel.h> #include <sys/disk.h> #include <sys/sysctl.h> #include <lib/libkern/libkern.h> /* * Disk error is the preface to plaintive error messages * about failing disk transfers. 
It prints messages of the form hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) * if the offset of the error in the transfer and a disk label * are both available. blkdone should be -1 if the position of the error * is unknown; the disklabel pointer may be null from drivers that have not * been converted to use them. The message is printed with printf * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. * The message should be completed (with at least a newline) with printf * or addlog, respectively. There is no trailing space. */ #ifndef PRIdaddr #define PRIdaddr PRId64 #endif void diskerr(const struct buf *bp, const char *dname, const char *what, int pri, int blkdone, const struct disklabel *lp) { int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev); void (*pr)(const char *, ...) __printflike(1, 2); char partname = 'a' + part; daddr_t sn; if (/*CONSTCOND*/0) /* Compiler will error this if the format is wrong... */ printf("%" PRIdaddr, bp->b_blkno); if (pri != LOG_PRINTF) { static const char fmt[] = ""; log(pri, fmt); pr = addlog; } else pr = printf; (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, bp->b_flags & B_READ ? "read" : "writ"); sn = bp->b_blkno; if (bp->b_bcount <= DEV_BSIZE) (*pr)("%" PRIdaddr, sn); else { if (blkdone >= 0) { sn += blkdone; (*pr)("%" PRIdaddr " of ", sn); } (*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno, bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); } if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { sn += lp->d_partitions[part].p_offset; (*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "", dname, unit, sn, sn / lp->d_secpercyl); sn %= lp->d_secpercyl; (*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")", sn / lp->d_nsectors, sn % lp->d_nsectors); } } /* * Searches the iostatlist for the disk corresponding to the * name provided. */ struct disk * disk_find(const char *name) { struct io_stats *stat; stat = iostat_find(name); if ((stat != NULL) && (stat->io_type == IOSTAT_DISK)) return stat->io_parent; return (NULL); } void disk_init(struct disk *diskp, const char *name, const struct dkdriver *driver) { u_int blocksize = DEV_BSIZE; /* * Initialize the wedge-related locks and other fields. */ mutex_init(&diskp->dk_rawlock, MUTEX_DEFAULT, IPL_NONE); mutex_init(&diskp->dk_openlock, MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&diskp->dk_wedges); diskp->dk_nwedges = 0; diskp->dk_labelsector = LABELSECTOR; diskp->dk_blkshift = DK_BSIZE2BLKSHIFT(blocksize); diskp->dk_byteshift = DK_BSIZE2BYTESHIFT(blocksize); diskp->dk_name = name; diskp->dk_driver = driver; } /* * Rename a disk. */ void disk_rename(struct disk *diskp, const char *name) { diskp->dk_name = name; iostat_rename(diskp->dk_stats, diskp->dk_name); } /* * Attach a disk. */ void disk_attach(struct disk *diskp) { /* * Allocate and initialize the disklabel structures. */ diskp->dk_label = kmem_zalloc(sizeof(struct disklabel), KM_SLEEP); diskp->dk_cpulabel = kmem_zalloc(sizeof(struct cpu_disklabel), KM_SLEEP); /* * Set up the stats collection. */ diskp->dk_stats = iostat_alloc(IOSTAT_DISK, diskp, diskp->dk_name); } int disk_begindetach(struct disk *dk, int (*lastclose)(device_t), device_t self, int flags) { int rc; rc = 0; mutex_enter(&dk->dk_openlock); if (dk->dk_openmask == 0) ; /* nothing to do */ else if ((flags & DETACH_FORCE) == 0) rc = EBUSY; else if (lastclose != NULL) rc = (*lastclose)(self); mutex_exit(&dk->dk_openlock); return rc; } /* * Detach a disk. */ void disk_detach(struct disk *diskp) { /* * Remove from the drivelist. 
*/ iostat_free(diskp->dk_stats); /* * Release the disk-info dictionary. */ if (diskp->dk_info) { prop_object_release(diskp->dk_info); diskp->dk_info = NULL; } /* * Free the space used by the disklabel structures. */ kmem_free(diskp->dk_label, sizeof(*diskp->dk_label)); kmem_free(diskp->dk_cpulabel, sizeof(*diskp->dk_cpulabel)); } void disk_destroy(struct disk *diskp) { mutex_destroy(&diskp->dk_openlock); mutex_destroy(&diskp->dk_rawlock); } /* * Mark the disk as having work queued for metrics collection. */ void disk_wait(struct disk *diskp) { iostat_wait(diskp->dk_stats); } /* * Mark the disk as busy for metrics collection. */ void disk_busy(struct disk *diskp) { iostat_busy(diskp->dk_stats); } /* * Finished disk operations, gather metrics. */ void disk_unbusy(struct disk *diskp, long bcount, int read) { iostat_unbusy(diskp->dk_stats, bcount, read); } /* * Return true if disk has an I/O operation in flight. */ bool disk_isbusy(struct disk *diskp) { return iostat_isbusy(diskp->dk_stats); } /* * Bounds checking against the media size, used for the raw partition. * secsize, mediasize and b_blkno must all be the same units. * Possibly this has to be DEV_BSIZE (512). */ int bounds_check_with_mediasize(struct buf *bp, int secsize, uint64_t mediasize) { int64_t sz; if (bp->b_blkno < 0) { /* Reject negative offsets immediately. */ bp->b_error = EINVAL; return 0; } sz = howmany((int64_t)bp->b_bcount, secsize); /* * bp->b_bcount is a 32-bit value, and we rejected a negative * bp->b_blkno already, so "bp->b_blkno + sz" cannot overflow. */ if (bp->b_blkno + sz > mediasize) { sz = mediasize - bp->b_blkno; if (sz == 0) { /* If exactly at end of disk, return EOF. */ bp->b_resid = bp->b_bcount; return 0; } if (sz < 0) { /* If past end of disk, return EINVAL. */ bp->b_error = EINVAL; return 0; } /* Otherwise, truncate request. */ bp->b_bcount = sz * secsize; } return 1; } /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct disk *dk, struct buf *bp, int wlabel) { struct disklabel *lp = dk->dk_label; struct partition *p = lp->d_partitions + DISKPART(bp->b_dev); uint64_t p_size, p_offset, labelsector; int64_t sz; if (bp->b_blkno < 0) { /* Reject negative offsets immediately. */ bp->b_error = EINVAL; return -1; } /* Protect against division by zero. XXX: Should never happen?!?! */ if ((lp->d_secsize / DEV_BSIZE) == 0 || lp->d_secpercyl == 0) { bp->b_error = EINVAL; return -1; } p_size = (uint64_t)p->p_size << dk->dk_blkshift; p_offset = (uint64_t)p->p_offset << dk->dk_blkshift; #if RAW_PART == 3 labelsector = lp->d_partitions[2].p_offset; #else labelsector = lp->d_partitions[RAW_PART].p_offset; #endif labelsector = (labelsector + dk->dk_labelsector) << dk->dk_blkshift; sz = howmany((int64_t)bp->b_bcount, DEV_BSIZE); /* * bp->b_bcount is a 32-bit value, and we rejected a negative * bp->b_blkno already, so "bp->b_blkno + sz" cannot overflow. */ if (bp->b_blkno + sz > p_size) { sz = p_size - bp->b_blkno; if (sz == 0) { /* If exactly at end of disk, return EOF. */ bp->b_resid = bp->b_bcount; return 0; } if (sz < 0) { /* If past end of disk, return EINVAL. */ bp->b_error = EINVAL; return -1; } /* Otherwise, truncate request. */ bp->b_bcount = sz << DEV_BSHIFT; } /* Overwriting disk label? 
*/ if (bp->b_blkno + p_offset <= labelsector && bp->b_blkno + p_offset + sz > labelsector && (bp->b_flags & B_READ) == 0 && !wlabel) { bp->b_error = EROFS; return -1; } /* calculate cylinder for disksort to order transfers with */ bp->b_cylinder = (bp->b_blkno + p->p_offset) / (lp->d_secsize / DEV_BSIZE) / lp->d_secpercyl; return 1; } int disk_read_sectors(void (*strat)(struct buf *), const struct disklabel *lp, struct buf *bp, unsigned int sector, int count) { if ((lp->d_secsize / DEV_BSIZE) == 0 || lp->d_secpercyl == 0) return EINVAL; bp->b_blkno = btodb((off_t)sector * lp->d_secsize); bp->b_bcount = count * lp->d_secsize; bp->b_flags = (bp->b_flags & ~B_WRITE) | B_READ; bp->b_oflags &= ~BO_DONE; bp->b_cylinder = sector / lp->d_secpercyl; (*strat)(bp); return biowait(bp); } const char * convertdisklabel(struct disklabel *lp, void (*strat)(struct buf *), struct buf *bp, uint32_t secperunit) { struct partition rp, *altp, *p; int geom_ok; const char *str; memset(&rp, 0, sizeof(rp)); rp.p_size = secperunit; rp.p_fstype = FS_UNUSED; /* If we can seek to d_secperunit - 1, believe the disk geometry. */ if (secperunit != 0 && disk_read_sectors(strat, lp, bp, secperunit - 1, 1) == 0) geom_ok = 1; else geom_ok = 0; #if 0 printf("%s: secperunit (%" PRIu32 ") %s\n", __func__, secperunit, geom_ok ? "ok" : "not ok"); #endif p = &lp->d_partitions[RAW_PART]; if (RAW_PART == 'c' - 'a') altp = &lp->d_partitions['d' - 'a']; else altp = &lp->d_partitions['c' - 'a']; if (lp->d_npartitions > RAW_PART && p->p_offset == 0 && p->p_size != 0) return NULL; /* already a raw partition */ else if (lp->d_npartitions > MAX('c', 'd') - 'a' && altp->p_offset == 0 && altp->p_size != 0) { /* alternate partition ('c' or 'd') is suitable for raw slot, * swap with 'd' or 'c'. */ rp = *p; *p = *altp; *altp = rp; return NULL; } else if (lp->d_npartitions <= RAW_PART && lp->d_npartitions > 'c' - 'a') { /* No raw partition is present, but the alternate is present. * Copy alternate to raw partition. */ lp->d_npartitions = RAW_PART + 1; *p = *altp; return NULL; } else if (!geom_ok) str = "no raw partition and disk reports bad geometry"; else if (lp->d_npartitions <= RAW_PART) { memset(&lp->d_partitions[lp->d_npartitions], 0, sizeof(struct partition) * (RAW_PART - lp->d_npartitions)); *p = rp; lp->d_npartitions = RAW_PART + 1; return NULL; } else if (lp->d_npartitions < MAXPARTITIONS) { memmove(p + 1, p, sizeof(struct partition) * (lp->d_npartitions - RAW_PART)); *p = rp; lp->d_npartitions++; return NULL; } else str = "no raw partition and partition table is full"; #ifdef DIAGNOSTIC printf("Bad partition: %s\n", str); printf("type = %u, subtype = %u, typename = %s\n", lp->d_type, lp->d_subtype, lp->d_typename); printf("secsize = %u, nsectors = %u, ntracks = %u\n", lp->d_secsize, lp->d_nsectors, lp->d_ntracks); printf("ncylinders = %u, secpercyl = %u, secperunit = %u\n", lp->d_ncylinders, lp->d_secpercyl, lp->d_secperunit); printf("npartitions = %u\n", lp->d_npartitions); for (size_t i = 0; i < MIN(lp->d_npartitions, MAXPARTITIONS); i++) { p = &lp->d_partitions[i]; printf("\t%c: offset = %u size = %u fstype = %u\n", (char)(i + 'a'), p->p_offset, p->p_size, p->p_fstype); } #endif return str; } /* * disk_ioctl -- * Generic disk ioctl handling. 
*/ int disk_ioctl(struct disk *dk, dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { struct dkwedge_info *dkw; struct partinfo *pi; struct partition *dp; #ifdef __HAVE_OLD_DISKLABEL struct disklabel newlabel; #endif switch (cmd) { case DIOCGDISKINFO: { prop_dictionary_t disk_info; int error; mutex_enter(&dk->dk_openlock); if ((disk_info = dk->dk_info) == NULL) { error = ENOTSUP; } else { prop_object_retain(disk_info); error = 0; } mutex_exit(&dk->dk_openlock); if (error) return error; error = prop_dictionary_copyout_ioctl(data, cmd, disk_info); prop_object_release(disk_info); return error; } case DIOCGSECTORSIZE: *(u_int *)data = dk->dk_geom.dg_secsize; return 0; case DIOCGMEDIASIZE: *(off_t *)data = (off_t)dk->dk_geom.dg_secsize * dk->dk_geom.dg_secperunit; return 0; default: break; } if (dev == NODEV) return EPASSTHROUGH; /* The following should be moved to dk_ioctl */ switch (cmd) { case DIOCGDINFO: if (dk->dk_label == NULL) return EBUSY; memcpy(data, dk->dk_label, sizeof (*dk->dk_label)); return 0; #ifdef __HAVE_OLD_DISKLABEL case ODIOCGDINFO: if (dk->dk_label == NULL) return EBUSY; memcpy(&newlabel, dk->dk_label, sizeof(newlabel)); if (newlabel.d_npartitions > OLDMAXPARTITIONS) return ENOTTY; memcpy(data, &newlabel, sizeof(struct olddisklabel)); return 0; #endif case DIOCGPARTINFO: pi = data; memset(pi, 0, sizeof(*pi)); pi->pi_secsize = dk->dk_geom.dg_secsize; pi->pi_bsize = MAX(BLKDEV_IOSIZE, pi->pi_secsize); if (DISKPART(dev) == RAW_PART) { pi->pi_size = dk->dk_geom.dg_secperunit; return 0; } if (dk->dk_label == NULL) return EBUSY; dp = &dk->dk_label->d_partitions[DISKPART(dev)]; pi->pi_offset = dp->p_offset; pi->pi_size = dp->p_size; pi->pi_fstype = dp->p_fstype; pi->pi_frag = dp->p_frag; pi->pi_fsize = dp->p_fsize; pi->pi_cpg = dp->p_cpg; /* * dholland 20130616: XXX this logic should not be * here. It is here because the old buffer cache * demands that all accesses to the same blocks need * to be the same size; but it only works for FFS and * nowadays I think it'll fail silently if the size * info in the disklabel is wrong. (Or missing.) The * buffer cache needs to be smarter; or failing that * we need a reliable way here to get the right block * size; or a reliable way to guarantee that (a) the * fs is not mounted when we get here and (b) any * buffers generated here will get purged when the fs * does get mounted. */ if (dp->p_fstype == FS_BSDFFS && dp->p_frag != 0 && dp->p_fsize != 0) pi->pi_bsize = dp->p_frag * dp->p_fsize; return 0; case DIOCAWEDGE: if ((flag & FWRITE) == 0) return EBADF; dkw = data; strlcpy(dkw->dkw_parent, dk->dk_name, sizeof(dkw->dkw_parent)); return dkwedge_add(dkw); case DIOCDWEDGE: if ((flag & FWRITE) == 0) return EBADF; dkw = data; strlcpy(dkw->dkw_parent, dk->dk_name, sizeof(dkw->dkw_parent)); return dkwedge_del(dkw); case DIOCLWEDGES: return dkwedge_list(dk, data, l); case DIOCMWEDGES: if ((flag & FWRITE) == 0) return EBADF; dkwedge_discover(dk); return 0; case DIOCRMWEDGES: if ((flag & FWRITE) == 0) return EBADF; dkwedge_delidle(dk); return 0; default: return EPASSTHROUGH; } } /* * disk_set_info -- * Canonicalize dk->dk_geom and set some parameters. * * If disk_set_info can happen concurrently with disk_ioctl in a * driver, the driver must serialize calls to disk_set_info with * dk_openlock. 
*/ void disk_set_info(device_t dev, struct disk *dk, const char *type) { struct disk_geom *dg = &dk->dk_geom; if (dg->dg_secsize == 0) { #ifdef DIAGNOSTIC printf("%s: fixing 0 sector size\n", dk->dk_name); #endif dg->dg_secsize = DEV_BSIZE; } dk->dk_blkshift = DK_BSIZE2BLKSHIFT(dg->dg_secsize); dk->dk_byteshift = DK_BSIZE2BYTESHIFT(dg->dg_secsize); if (dg->dg_secperunit == 0) { #ifdef DIAGNOSTIC if (dg->dg_ncylinders == 0) { printf("%s: secperunit and ncylinders are zero\n", dk->dk_name); } if (dg->dg_nsectors == 0 || dg->dg_ntracks == 0) { printf("%s: secperunit and (sectors or tracks) " "are zero\n", dk->dk_name); } #endif dg->dg_secperunit = (int64_t) dg->dg_nsectors * dg->dg_ntracks * dg->dg_ncylinders; } if (dg->dg_ncylinders == 0) { if (dg->dg_ntracks && dg->dg_nsectors) dg->dg_ncylinders = dg->dg_secperunit / (dg->dg_ntracks * dg->dg_nsectors); } prop_dictionary_t disk_info, odisk_info, geom; disk_info = prop_dictionary_create(); geom = prop_dictionary_create(); prop_dictionary_set_uint64(geom, "sectors-per-unit", dg->dg_secperunit); prop_dictionary_set_uint32(geom, "sector-size", dg->dg_secsize); if (dg->dg_nsectors) prop_dictionary_set_uint16(geom, "sectors-per-track", dg->dg_nsectors); if (dg->dg_ntracks) prop_dictionary_set_uint16(geom, "tracks-per-cylinder", dg->dg_ntracks); if (dg->dg_ncylinders) prop_dictionary_set_uint64(geom, "cylinders-per-unit", dg->dg_ncylinders); prop_dictionary_set(disk_info, "geometry", geom); if (type) prop_dictionary_set_string_nocopy(disk_info, "type", type); prop_object_release(geom); odisk_info = dk->dk_info; dk->dk_info = disk_info; if (dev) prop_dictionary_set(device_properties(dev), "disk-info", disk_info); /* * Don't release disk_info here; we keep a reference to it. * disk_detach() will release it when we go away. */ if (odisk_info) prop_object_release(odisk_info); } int disklabel_dev_unit(dev_t dev) { return DISKUNIT(dev); }
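/*
 * Illustrative standalone sketch (not part of subr_disk.c): the bounds
 * check performed by bounds_check_with_mediasize() above.  Reject
 * negative offsets, signal EOF exactly at the end of the media, reject
 * transfers that start past it, and truncate those that merely run over.
 * Names are invented for the example; blkno and nblks are in units of
 * the sector size, and nblks is assumed non-negative.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

enum demo_result { DEMO_OK, DEMO_EOF, DEMO_EINVAL, DEMO_TRUNCATED };

static enum demo_result
demo_bounds_check(int64_t blkno, int64_t *nblks, uint64_t mediasize)
{
	if (blkno < 0)
		return DEMO_EINVAL;		/* negative offset */
	if ((uint64_t)blkno + (uint64_t)*nblks > mediasize) {
		int64_t left = (int64_t)mediasize - blkno;

		if (left == 0)
			return DEMO_EOF;	/* exactly at end of disk */
		if (left < 0)
			return DEMO_EINVAL;	/* starts past end of disk */
		*nblks = left;			/* otherwise truncate */
		return DEMO_TRUNCATED;
	}
	return DEMO_OK;
}

int
main(void)
{
	int64_t n = 8;

	/* 996 + 8 runs past a 1000-sector disk, so it is truncated to 4. */
	printf("result %d, nblks %" PRId64 "\n",
	    demo_bounds_check(996, &n, 1000), n);
	return 0;
}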
/* $NetBSD: vfs_syscalls_20.c,v 1.46 2020/06/28 14:37:53 christos Exp $ */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_20.c,v 1.46 2020/06/28 14:37:53 christos Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/uio.h> #include <sys/dirent.h> #include <sys/sysctl.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <sys/kauth.h> #include <sys/vfs_syscalls.h> #include <compat/common/compat_mod.h> #include <compat/sys/mount.h> #include <compat/sys/statvfs.h> static const struct syscall_package vfs_syscalls_20_syscalls[] = { { SYS_compat_20_fhstatfs, 0, (sy_call_t *)compat_20_sys_fhstatfs }, { SYS_compat_20_fstatfs, 0, (sy_call_t *)compat_20_sys_fstatfs }, { SYS_compat_20_getfsstat, 0, (sy_call_t *)compat_20_sys_getfsstat }, { SYS_compat_20_statfs, 0, (sy_call_t *)compat_20_sys_statfs }, { 0, 0, NULL } }; /* * Get filesystem statistics. */ /* ARGSUSED */ int compat_20_sys_statfs(struct lwp *l, const struct compat_20_sys_statfs_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(struct statfs12 *) buf; } */ struct mount *mp; struct statvfs *sbuf; int error; struct vnode *vp; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_TRYEMULROOT, &vp); if (error != 0) return error; mp = vp->v_mount; sbuf = STATVFSBUF_GET(); if ((error = dostatvfs(mp, sbuf, l, 0, 1)) != 0) goto done; error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0); done: vrele(vp); STATVFSBUF_PUT(sbuf); return error; } /* * Get filesystem statistics. */ /* ARGSUSED */ int compat_20_sys_fstatfs(struct lwp *l, const struct compat_20_sys_fstatfs_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(struct statfs12 *) buf; } */ struct file *fp; struct mount *mp; struct statvfs *sbuf; int error; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return (error); mp = fp->f_vnode->v_mount; sbuf = STATVFSBUF_GET(); if ((error = dostatvfs(mp, sbuf, l, 0, 1)) != 0) goto out; error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0); out: fd_putfile(SCARG(uap, fd)); STATVFSBUF_PUT(sbuf); return error; } /* * Get statistics on all filesystems. 
*/ int compat_20_sys_getfsstat(struct lwp *l, const struct compat_20_sys_getfsstat_args *uap, register_t *retval) { /* { syscallarg(struct statfs12 *) buf; syscallarg(long) bufsize; syscallarg(int) flags; } */ return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize), SCARG(uap, flags), statvfs_to_statfs12_copy, sizeof(struct statfs12), retval); } int compat_20_sys_fhstatfs(struct lwp *l, const struct compat_20_sys_fhstatfs_args *uap, register_t *retval) { /* { syscallarg(const struct compat_30_fhandle *) fhp; syscallarg(struct statfs12 *) buf; } */ struct statvfs *sbuf; struct compat_30_fhandle fh; struct mount *mp; struct vnode *vp; int error; /* * Must be super user */ if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL))) return (error); if ((error = copyin(SCARG(uap, fhp), &fh, sizeof(fh))) != 0) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, (struct fid*)&fh.fh_fid, LK_EXCLUSIVE, &vp); if (error != 0) return (error); mp = vp->v_mount; VOP_UNLOCK(vp); sbuf = STATVFSBUF_GET(); if ((error = VFS_STATVFS(mp, sbuf)) != 0) goto out; error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0); out: vrele(vp); STATVFSBUF_PUT(sbuf); return error; } int vfs_syscalls_20_init(void) { return syscall_establish(NULL, vfs_syscalls_20_syscalls); } int vfs_syscalls_20_fini(void) { return syscall_disestablish(NULL, vfs_syscalls_20_syscalls); }
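/*
 * User-space sketch, not part of this file: the probing pattern that the
 * compat_20 getfsstat/statfs entry points above ultimately serve.  Modern
 * code uses getvfsstat(2); a NULL buffer asks only for the count of mounted
 * file systems, which is how callers size their array before the real call.
 * Error handling is abbreviated; this is illustration, not the compat code.
 */
#include <sys/types.h>
#include <sys/statvfs.h>
#include <stdio.h>
#include <stdlib.h>

static void
list_mounts_example(void)
{
	int n = getvfsstat(NULL, 0, ST_WAIT);	/* count only */
	if (n <= 0)
		return;

	struct statvfs *fs = calloc(n, sizeof(*fs));
	if (fs == NULL)
		return;

	n = getvfsstat(fs, n * sizeof(*fs), ST_WAIT);
	for (int i = 0; i < n; i++)
		printf("%s on %s (%s)\n", fs[i].f_mntfromname,
		    fs[i].f_mntonname, fs[i].f_fstypename);
	free(fs);
}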
/* $NetBSD: ufs_inode.c,v 1.112 2020/09/05 16:30:13 riastradh Exp $ */ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)ufs_inode.c 8.9 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.112 2020/09/05 16:30:13 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" #include "opt_quota.h" #include "opt_wapbl.h" #include "opt_uvmhist.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/kernel.h> #include <sys/namei.h> #include <sys/kauth.h> #include <sys/wapbl.h> #include <sys/kmem.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_wapbl.h> #ifdef UFS_DIRHASH #include <ufs/ufs/dirhash.h> #endif #ifdef UFS_EXTATTR #include <ufs/ufs/extattr.h> #endif #ifdef UVMHIST #include <uvm/uvm.h> #endif #include <uvm/uvm_page.h> #include <uvm/uvm_stat.h> /* * Last reference to an inode. If necessary, write or delete it. */ int ufs_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; struct bool *a_recycle; } */ *ap = v; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct mount *mp = vp->v_mount; mode_t mode; int allerror = 0, error; bool wapbl_locked = false; UFS_WAPBL_JUNLOCK_ASSERT(mp); /* * Ignore inodes related to stale file handles. */ if (ip->i_mode == 0) goto out; if (ip->i_nlink <= 0 && (mp->mnt_flag & MNT_RDONLY) == 0) { #ifdef UFS_EXTATTR ufs_extattr_vnode_inactive(vp, curlwp); #endif /* * All file blocks must be freed before we can let the vnode * be reclaimed, so can't postpone full truncating any further. */ ufs_truncate_all(vp); #if defined(QUOTA) || defined(QUOTA2) error = UFS_WAPBL_BEGIN(mp); if (error) { allerror = error; } else { wapbl_locked = true; (void)chkiq(ip, -1, NOCRED, 0); } #endif DIP_ASSIGN(ip, rdev, 0); mode = ip->i_mode; ip->i_mode = 0; ip->i_omode = mode; DIP_ASSIGN(ip, mode, 0); ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * Defer final inode free and update to ufs_reclaim(). */ } if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) { if (! wapbl_locked) { error = UFS_WAPBL_BEGIN(mp); if (error) { allerror = error; goto out; } wapbl_locked = true; } UFS_UPDATE(vp, NULL, NULL, 0); } out: if (wapbl_locked) UFS_WAPBL_END(mp); /* * If we are done with the inode, reclaim it * so that it can be reused immediately. */ *ap->a_recycle = (ip->i_mode == 0); if (ip->i_mode == 0 && (DIP(ip, size) != 0 || DIP(ip, blocks) != 0)) { printf("%s: unlinked ino %" PRId64 " on \"%s\" has" " non zero size %" PRIx64 " or blocks %" PRIx64 " with allerror %d\n", __func__, ip->i_number, mp->mnt_stat.f_mntonname, DIP(ip, size), DIP(ip, blocks), allerror); panic("%s: dirty filesystem?", __func__); } return (allerror); } /* * Reclaim an inode so that it can be used for other purposes. */ int ufs_reclaim(struct vnode *vp) { struct inode *ip = VTOI(vp); if (!UFS_WAPBL_BEGIN(vp->v_mount)) { UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE); UFS_WAPBL_END(vp->v_mount); } UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE); if (ip->i_devvp) { vrele(ip->i_devvp); ip->i_devvp = 0; } #if defined(QUOTA) || defined(QUOTA2) ufsquota_free(ip); #endif #ifdef UFS_DIRHASH if (ip->i_dirhash != NULL) ufsdirhash_free(ip); #endif return (0); } /* * allocate a range of blocks in a file. * after this function returns, any page entirely contained within the range * will map to invalid data and thus must be overwritten before it is made * accessible to others. 
*/ int ufs_balloc_range(struct vnode *vp, off_t off, off_t len, kauth_cred_t cred, int flags) { off_t neweof; /* file size after the operation */ off_t neweob; /* offset next to the last block after the operation */ off_t pagestart; /* starting offset of range covered by pgs */ off_t eob; /* offset next to allocated blocks */ struct uvm_object *uobj; int i, delta, error, npages; int bshift = vp->v_mount->mnt_fs_bshift; int bsize = 1 << bshift; int ppb = MAX(bsize >> PAGE_SHIFT, 1); struct vm_page **pgs; size_t pgssize; UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist); UVMHIST_LOG(ubchist, "vp %#jx off 0x%jx len 0x%jx u_size 0x%jx", (uintptr_t)vp, off, len, vp->v_size); neweof = MAX(vp->v_size, off + len); GOP_SIZE(vp, neweof, &neweob, 0); error = 0; uobj = &vp->v_uobj; /* * read or create pages covering the range of the allocation and * keep them locked until the new block is allocated, so there * will be no window where the old contents of the new block are * visible to racing threads. */ pagestart = trunc_page(off) & ~(bsize - 1); npages = MIN(ppb, (round_page(neweob) - pagestart) >> PAGE_SHIFT); pgssize = npages * sizeof(struct vm_page *); pgs = kmem_zalloc(pgssize, KM_SLEEP); /* * adjust off to be block-aligned. */ delta = off & (bsize - 1); off -= delta; len += delta; genfs_node_wrlock(vp); rw_enter(uobj->vmobjlock, RW_WRITER); error = VOP_GETPAGES(vp, pagestart, pgs, &npages, 0, VM_PROT_WRITE, 0, PGO_SYNCIO | PGO_PASTEOF | PGO_NOBLOCKALLOC | PGO_NOTIMESTAMP | PGO_GLOCKHELD); if (error) { genfs_node_unlock(vp); goto out; } /* * now allocate the range. */ error = GOP_ALLOC(vp, off, len, flags, cred); genfs_node_unlock(vp); /* * if the allocation succeeded, mark all the pages dirty * and clear PG_RDONLY on any pages that are now fully backed * by disk blocks. if the allocation failed, we do not invalidate * the pages since they might have already existed and been dirty, * in which case we need to keep them around. if we created the pages, * they will be clean and read-only, and leaving such pages * in the cache won't cause any problems. */ GOP_SIZE(vp, off + len, &eob, 0); rw_enter(uobj->vmobjlock, RW_WRITER); for (i = 0; i < npages; i++) { KASSERT((pgs[i]->flags & PG_RELEASED) == 0); if (!error) { if (off <= pagestart + (i << PAGE_SHIFT) && pagestart + ((i + 1) << PAGE_SHIFT) <= eob) { pgs[i]->flags &= ~PG_RDONLY; } uvm_pagemarkdirty(pgs[i], UVM_PAGE_STATUS_DIRTY); } uvm_pagelock(pgs[i]); uvm_pageactivate(pgs[i]); uvm_pageunlock(pgs[i]); } uvm_page_unbusy(pgs, npages); rw_exit(uobj->vmobjlock); out: kmem_free(pgs, pgssize); return error; } int ufs_truncate_retry(struct vnode *vp, int ioflag, uint64_t newsize, kauth_cred_t cred) { struct inode *ip = VTOI(vp); struct mount *mp = vp->v_mount; int error = 0; UFS_WAPBL_JUNLOCK_ASSERT(mp); /* * Truncate might temporarily fail, loop until done. */ do { error = UFS_WAPBL_BEGIN(mp); if (error) goto out; error = UFS_TRUNCATE(vp, newsize, ioflag, cred); UFS_WAPBL_END(mp); if (error != 0 && error != EAGAIN) goto out; } while (ip->i_size != newsize); out: return error; } /* truncate all the data of the inode including extended attributes */ int ufs_truncate_all(struct vnode *vp) { struct inode *ip = VTOI(vp); off_t isize = ip->i_size; if (ip->i_ump->um_fstype == UFS2) isize += ip->i_ffs2_extsize; if (isize == 0) return 0; return ufs_truncate_retry(vp, IO_NORMAL | IO_EXT, 0, NOCRED); }
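/*
 * Illustration only, not from ufs_inode.c: the block/page alignment
 * arithmetic that ufs_balloc_range() applies before calling VOP_GETPAGES
 * and GOP_ALLOC, reproduced as a standalone user-land program.  The
 * PAGE_SIZE_EX macro and sample offsets are made-up values standing in
 * for the kernel's trunc_page() and mount parameters.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE_EX		4096u
#define trunc_page_ex(x)	((x) & ~((uint64_t)PAGE_SIZE_EX - 1))

int
main(void)
{
	uint64_t off = 70000, len = 3000;	/* requested byte range */
	unsigned bshift = 13;			/* 8 KiB file system blocks */
	uint64_t bsize = 1ULL << bshift;

	/* Page run starts at a block-aligned page boundary (as in the code). */
	uint64_t pagestart = trunc_page_ex(off) & ~(bsize - 1);

	/* Then the byte range itself is widened back to a block boundary. */
	uint64_t delta = off & (bsize - 1);
	off -= delta;
	len += delta;

	printf("pagestart=%llu off=%llu len=%llu\n",
	    (unsigned long long)pagestart, (unsigned long long)off,
	    (unsigned long long)len);
	return 0;
}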
/* $NetBSD: chacha_sse2.c,v 1.3 2023/08/07 01:07:36 rin Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/types.h> #include <sys/endian.h> #include <crypto/arch/x86/immintrin.h> #include "chacha_sse2.h" static inline __m128i rol32(__m128i x, uint8_t n) { return _mm_slli_epi32(x, n) | _mm_srli_epi32(x, 32 - n); } static inline void chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3, unsigned nr) { __m128i r0, r1, r2, r3; __m128i c0, c1, c2, c3; r0 = *p0; r1 = *p1; r2 = *p2; r3 = *p3; for (; nr > 0; nr -= 2) { r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 16); r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 12); r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 8); r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 7); c0 = r0; c1 = _mm_shuffle_epi32(r1, 0x39); c2 = _mm_shuffle_epi32(r2, 0x4e); c3 = _mm_shuffle_epi32(r3, 0x93); c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 16); c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 12); c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 8); c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 7); r0 = c0; r1 = _mm_shuffle_epi32(c1, 0x93); r2 = _mm_shuffle_epi32(c2, 0x4e); r3 = _mm_shuffle_epi32(c3, 0x39); } *p0 = r0; *p1 = r1; *p2 = r2; *p3 = r3; } void chacha_core_sse2(uint8_t out[restrict static 64], const uint8_t in[static 16], const uint8_t k[static 32], const uint8_t c[static 16], unsigned nr) { __m128i in0, in1, in2, in3; __m128i r0, r1, r2, r3; r0 = in0 = _mm_loadu_si128((const __m128i *)c); r1 = in1 = _mm_loadu_si128((const __m128i *)k); r2 = in2 = _mm_loadu_si128((const __m128i *)k + 1); r3 = in3 = _mm_loadu_si128((const __m128i *)in); chacha_permute(&r0, &r1, &r2, &r3, nr); _mm_storeu_si128((__m128i *)out + 0, _mm_add_epi32(r0, in0)); _mm_storeu_si128((__m128i *)out + 1, _mm_add_epi32(r1, in1)); _mm_storeu_si128((__m128i *)out + 2, _mm_add_epi32(r2, in2)); _mm_storeu_si128((__m128i *)out + 3, _mm_add_epi32(r3, in3)); } void hchacha_sse2(uint8_t out[restrict static 32], const uint8_t in[static 16], const uint8_t k[static 32], const uint8_t c[static 16], unsigned nr) { __m128i r0, r1, r2, r3; r0 = _mm_loadu_si128((const __m128i *)c); r1 = _mm_loadu_si128((const __m128i *)k); r2 = _mm_loadu_si128((const __m128i *)k + 1); r3 = _mm_loadu_si128((const __m128i *)in); chacha_permute(&r0, &r1, &r2, &r3, nr); _mm_storeu_si128((__m128i *)out + 0, r0); _mm_storeu_si128((__m128i *)out + 1, r3); } #define CHACHA_QUARTERROUND(a, b, c, d) do \ { \ (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 16); \ (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 12); \ (a) = _mm_add_epi32((a), (b)); (d) ^= a; (d) = rol32((d), 8); \ (c) = _mm_add_epi32((c), (d)); (b) ^= c; (b) = rol32((b), 7); \ } while (/*CONSTCOND*/0) static inline __m128i load1_epi32(const void *p) { return (__m128i)_mm_load1_ps(p); } static inline __m128i loadu_epi32(const void *p) { return _mm_loadu_si128(p); } static inline void storeu_epi32(void *p, __m128i v) { return _mm_storeu_si128(p, v); } static inline __m128i unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d) { __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (a[0], b[0], ...) */ __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (c[0], d[0], ...) 
*/ /* (lo[0]=a[0], lo[1]=b[0], hi[0]=c[0], hi[1]=d[0]) */ return (__m128i)_mm_movelh_ps(lo, hi); } static inline __m128i unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d) { __m128 lo = (__m128)_mm_unpacklo_epi32(a, b); /* (..., a[1], b[1]) */ __m128 hi = (__m128)_mm_unpacklo_epi32(c, d); /* (..., c[1], d[1]) */ /* (lo[2]=a[1], lo[3]=b[1], hi[2]=c[1], hi[3]=d[1]) */ return (__m128i)_mm_movehl_ps(hi, lo); } static inline __m128i unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d) { __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (a[2], b[2], ...) */ __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (c[2], d[2], ...) */ /* (lo[0]=a[2], lo[1]=b[2], hi[0]=c[2], hi[1]=d[2]) */ return (__m128i)_mm_movelh_ps(lo, hi); } static inline __m128i unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d) { __m128 lo = (__m128)_mm_unpackhi_epi32(a, b); /* (..., a[3], b[3]) */ __m128 hi = (__m128)_mm_unpackhi_epi32(c, d); /* (..., c[3], d[3]) */ /* (lo[2]=a[3], lo[3]=b[3], hi[2]=c[3], hi[3]=d[3]) */ return (__m128i)_mm_movehl_ps(hi, lo); } void chacha_stream_sse2(uint8_t *restrict s, size_t n, uint32_t blkno, const uint8_t nonce[static 12], const uint8_t k[static 32], unsigned nr) { __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15; __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15; unsigned r; if (n < 256) goto out; x0 = load1_epi32(chacha_const32 + 0); x1 = load1_epi32(chacha_const32 + 4); x2 = load1_epi32(chacha_const32 + 8); x3 = load1_epi32(chacha_const32 + 12); x4 = load1_epi32(k + 0); x5 = load1_epi32(k + 4); x6 = load1_epi32(k + 8); x7 = load1_epi32(k + 12); x8 = load1_epi32(k + 16); x9 = load1_epi32(k + 20); x10 = load1_epi32(k + 24); x11 = load1_epi32(k + 28); /* x12 set in the loop */ x13 = load1_epi32(nonce + 0); x14 = load1_epi32(nonce + 4); x15 = load1_epi32(nonce + 8); for (; n >= 256; s += 256, n -= 256, blkno += 4) { x12 = _mm_add_epi32(_mm_set1_epi32(blkno), _mm_set_epi32(3,2,1,0)); y0 = x0; y1 = x1; y2 = x2; y3 = x3; y4 = x4; y5 = x5; y6 = x6; y7 = x7; y8 = x8; y9 = x9; y10 = x10; y11 = x11; y12 = x12; y13 = x13; y14 = x14; y15 = x15; for (r = nr; r > 0; r -= 2) { CHACHA_QUARTERROUND( y0, y4, y8,y12); CHACHA_QUARTERROUND( y1, y5, y9,y13); CHACHA_QUARTERROUND( y2, y6,y10,y14); CHACHA_QUARTERROUND( y3, y7,y11,y15); CHACHA_QUARTERROUND( y0, y5,y10,y15); CHACHA_QUARTERROUND( y1, y6,y11,y12); CHACHA_QUARTERROUND( y2, y7, y8,y13); CHACHA_QUARTERROUND( y3, y4, y9,y14); } y0 = _mm_add_epi32(y0, x0); y1 = _mm_add_epi32(y1, x1); y2 = _mm_add_epi32(y2, x2); y3 = _mm_add_epi32(y3, x3); y4 = _mm_add_epi32(y4, x4); y5 = _mm_add_epi32(y5, x5); y6 = _mm_add_epi32(y6, x6); y7 = _mm_add_epi32(y7, x7); y8 = _mm_add_epi32(y8, x8); y9 = _mm_add_epi32(y9, x9); y10 = _mm_add_epi32(y10, x10); y11 = _mm_add_epi32(y11, x11); y12 = _mm_add_epi32(y12, x12); y13 = _mm_add_epi32(y13, x13); y14 = _mm_add_epi32(y14, x14); y15 = _mm_add_epi32(y15, x15); z0 = unpack0_epi32(y0, y1, y2, y3); z1 = unpack0_epi32(y4, y5, y6, y7); z2 = unpack0_epi32(y8, y9, y10, y11); z3 = unpack0_epi32(y12, y13, y14, y15); z4 = unpack1_epi32(y0, y1, y2, y3); z5 = unpack1_epi32(y4, y5, y6, y7); z6 = unpack1_epi32(y8, y9, y10, y11); z7 = unpack1_epi32(y12, y13, y14, y15); z8 = unpack2_epi32(y0, y1, y2, y3); z9 = unpack2_epi32(y4, y5, y6, y7); z10 = unpack2_epi32(y8, y9, y10, y11); z11 = unpack2_epi32(y12, y13, y14, y15); z12 = unpack3_epi32(y0, y1, y2, y3); z13 = unpack3_epi32(y4, y5, y6, y7); z14 = unpack3_epi32(y8, y9, y10, y11); z15 = 
unpack3_epi32(y12, y13, y14, y15); storeu_epi32(s + 16*0, z0); storeu_epi32(s + 16*1, z1); storeu_epi32(s + 16*2, z2); storeu_epi32(s + 16*3, z3); storeu_epi32(s + 16*4, z4); storeu_epi32(s + 16*5, z5); storeu_epi32(s + 16*6, z6); storeu_epi32(s + 16*7, z7); storeu_epi32(s + 16*8, z8); storeu_epi32(s + 16*9, z9); storeu_epi32(s + 16*10, z10); storeu_epi32(s + 16*11, z11); storeu_epi32(s + 16*12, z12); storeu_epi32(s + 16*13, z13); storeu_epi32(s + 16*14, z14); storeu_epi32(s + 16*15, z15); } out: if (n) { const __m128i blkno_inc = _mm_set_epi32(0,0,0,1); __m128i in0, in1, in2, in3; __m128i r0, r1, r2, r3; in0 = _mm_loadu_si128((const __m128i *)chacha_const32); in1 = _mm_loadu_si128((const __m128i *)k); in2 = _mm_loadu_si128((const __m128i *)k + 1); in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4), le32dec(nonce), blkno); for (; n; s += 64, n -= 64) { r0 = in0; r1 = in1; r2 = in2; r3 = in3; chacha_permute(&r0, &r1, &r2, &r3, nr); r0 = _mm_add_epi32(r0, in0); r1 = _mm_add_epi32(r1, in1); r2 = _mm_add_epi32(r2, in2); r3 = _mm_add_epi32(r3, in3); if (n < 64) { uint8_t buf[64] __aligned(16); _mm_storeu_si128((__m128i *)buf + 0, r0); _mm_storeu_si128((__m128i *)buf + 1, r1); _mm_storeu_si128((__m128i *)buf + 2, r2); _mm_storeu_si128((__m128i *)buf + 3, r3); memcpy(s, buf, n); break; } _mm_storeu_si128((__m128i *)s + 0, r0); _mm_storeu_si128((__m128i *)s + 1, r1); _mm_storeu_si128((__m128i *)s + 2, r2); _mm_storeu_si128((__m128i *)s + 3, r3); in3 = _mm_add_epi32(in3, blkno_inc); } } } void chacha_stream_xor_sse2(uint8_t *s, const uint8_t *p, size_t n, uint32_t blkno, const uint8_t nonce[static 12], const uint8_t k[static 32], unsigned nr) { __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15; __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15; unsigned r; if (n < 256) goto out; x0 = load1_epi32(chacha_const32 + 0); x1 = load1_epi32(chacha_const32 + 4); x2 = load1_epi32(chacha_const32 + 8); x3 = load1_epi32(chacha_const32 + 12); x4 = load1_epi32(k + 0); x5 = load1_epi32(k + 4); x6 = load1_epi32(k + 8); x7 = load1_epi32(k + 12); x8 = load1_epi32(k + 16); x9 = load1_epi32(k + 20); x10 = load1_epi32(k + 24); x11 = load1_epi32(k + 28); /* x12 set in the loop */ x13 = load1_epi32(nonce + 0); x14 = load1_epi32(nonce + 4); x15 = load1_epi32(nonce + 8); for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) { x12 = _mm_add_epi32(_mm_set1_epi32(blkno), _mm_set_epi32(3,2,1,0)); y0 = x0; y1 = x1; y2 = x2; y3 = x3; y4 = x4; y5 = x5; y6 = x6; y7 = x7; y8 = x8; y9 = x9; y10 = x10; y11 = x11; y12 = x12; y13 = x13; y14 = x14; y15 = x15; for (r = nr; r > 0; r -= 2) { CHACHA_QUARTERROUND( y0, y4, y8,y12); CHACHA_QUARTERROUND( y1, y5, y9,y13); CHACHA_QUARTERROUND( y2, y6,y10,y14); CHACHA_QUARTERROUND( y3, y7,y11,y15); CHACHA_QUARTERROUND( y0, y5,y10,y15); CHACHA_QUARTERROUND( y1, y6,y11,y12); CHACHA_QUARTERROUND( y2, y7, y8,y13); CHACHA_QUARTERROUND( y3, y4, y9,y14); } y0 = _mm_add_epi32(y0, x0); y1 = _mm_add_epi32(y1, x1); y2 = _mm_add_epi32(y2, x2); y3 = _mm_add_epi32(y3, x3); y4 = _mm_add_epi32(y4, x4); y5 = _mm_add_epi32(y5, x5); y6 = _mm_add_epi32(y6, x6); y7 = _mm_add_epi32(y7, x7); y8 = _mm_add_epi32(y8, x8); y9 = _mm_add_epi32(y9, x9); y10 = _mm_add_epi32(y10, x10); y11 = _mm_add_epi32(y11, x11); y12 = _mm_add_epi32(y12, x12); y13 = _mm_add_epi32(y13, x13); y14 = _mm_add_epi32(y14, x14); y15 = _mm_add_epi32(y15, x15); z0 = unpack0_epi32(y0, y1, y2, y3); z1 = unpack0_epi32(y4, y5, y6, y7); z2 = unpack0_epi32(y8, 
y9, y10, y11); z3 = unpack0_epi32(y12, y13, y14, y15); z4 = unpack1_epi32(y0, y1, y2, y3); z5 = unpack1_epi32(y4, y5, y6, y7); z6 = unpack1_epi32(y8, y9, y10, y11); z7 = unpack1_epi32(y12, y13, y14, y15); z8 = unpack2_epi32(y0, y1, y2, y3); z9 = unpack2_epi32(y4, y5, y6, y7); z10 = unpack2_epi32(y8, y9, y10, y11); z11 = unpack2_epi32(y12, y13, y14, y15); z12 = unpack3_epi32(y0, y1, y2, y3); z13 = unpack3_epi32(y4, y5, y6, y7); z14 = unpack3_epi32(y8, y9, y10, y11); z15 = unpack3_epi32(y12, y13, y14, y15); storeu_epi32(s + 16*0, loadu_epi32(p + 16*0) ^ z0); storeu_epi32(s + 16*1, loadu_epi32(p + 16*1) ^ z1); storeu_epi32(s + 16*2, loadu_epi32(p + 16*2) ^ z2); storeu_epi32(s + 16*3, loadu_epi32(p + 16*3) ^ z3); storeu_epi32(s + 16*4, loadu_epi32(p + 16*4) ^ z4); storeu_epi32(s + 16*5, loadu_epi32(p + 16*5) ^ z5); storeu_epi32(s + 16*6, loadu_epi32(p + 16*6) ^ z6); storeu_epi32(s + 16*7, loadu_epi32(p + 16*7) ^ z7); storeu_epi32(s + 16*8, loadu_epi32(p + 16*8) ^ z8); storeu_epi32(s + 16*9, loadu_epi32(p + 16*9) ^ z9); storeu_epi32(s + 16*10, loadu_epi32(p + 16*10) ^ z10); storeu_epi32(s + 16*11, loadu_epi32(p + 16*11) ^ z11); storeu_epi32(s + 16*12, loadu_epi32(p + 16*12) ^ z12); storeu_epi32(s + 16*13, loadu_epi32(p + 16*13) ^ z13); storeu_epi32(s + 16*14, loadu_epi32(p + 16*14) ^ z14); storeu_epi32(s + 16*15, loadu_epi32(p + 16*15) ^ z15); } out: if (n) { const __m128i blkno_inc = _mm_set_epi32(0,0,0,1); __m128i in0, in1, in2, in3; __m128i r0, r1, r2, r3; in0 = _mm_loadu_si128((const __m128i *)chacha_const32); in1 = _mm_loadu_si128((const __m128i *)k); in2 = _mm_loadu_si128((const __m128i *)k + 1); in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4), le32dec(nonce), blkno); for (; n; s += 64, p += 64, n -= 64) { r0 = in0; r1 = in1; r2 = in2; r3 = in3; chacha_permute(&r0, &r1, &r2, &r3, nr); r0 = _mm_add_epi32(r0, in0); r1 = _mm_add_epi32(r1, in1); r2 = _mm_add_epi32(r2, in2); r3 = _mm_add_epi32(r3, in3); if (n < 64) { uint8_t buf[64] __aligned(16); unsigned i; _mm_storeu_si128((__m128i *)buf + 0, r0); _mm_storeu_si128((__m128i *)buf + 1, r1); _mm_storeu_si128((__m128i *)buf + 2, r2); _mm_storeu_si128((__m128i *)buf + 3, r3); for (i = 0; i < n - n%4; i += 4) le32enc(s + i, le32dec(p + i) ^ le32dec(buf + i)); for (; i < n; i++) s[i] = p[i] ^ buf[i]; break; } r0 ^= _mm_loadu_si128((const __m128i *)p + 0); r1 ^= _mm_loadu_si128((const __m128i *)p + 1); r2 ^= _mm_loadu_si128((const __m128i *)p + 2); r3 ^= _mm_loadu_si128((const __m128i *)p + 3); _mm_storeu_si128((__m128i *)s + 0, r0); _mm_storeu_si128((__m128i *)s + 1, r1); _mm_storeu_si128((__m128i *)s + 2, r2); _mm_storeu_si128((__m128i *)s + 3, r3); in3 = _mm_add_epi32(in3, blkno_inc); } } } void xchacha_stream_sse2(uint8_t *restrict s, size_t nbytes, uint32_t blkno, const uint8_t nonce[static 24], const uint8_t k[static 32], unsigned nr) { uint8_t subkey[32]; uint8_t subnonce[12]; hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); memset(subnonce, 0, 4); memcpy(subnonce + 4, nonce + 16, 8); chacha_stream_sse2(s, nbytes, blkno, subnonce, subkey, nr); } void xchacha_stream_xor_sse2(uint8_t *restrict c, const uint8_t *p, size_t nbytes, uint32_t blkno, const uint8_t nonce[static 24], const uint8_t k[static 32], unsigned nr) { uint8_t subkey[32]; uint8_t subnonce[12]; hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); memset(subnonce, 0, 4); memcpy(subnonce + 4, nonce + 16, 8); chacha_stream_xor_sse2(c, p, nbytes, blkno, subnonce, subkey, nr); }
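/*
 * Portable reference sketch, not part of chacha_sse2.c: the scalar ChaCha
 * quarter-round and double-round that the SSE2 routines above compute, four
 * independent 64-byte blocks at a time, via CHACHA_QUARTERROUND and the
 * shuffles in chacha_permute().  This is only a readability aid under the
 * standard ChaCha definition, not the kernel's own scalar fallback.
 */
#include <stdint.h>

static inline uint32_t
rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

static inline void
chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = rotl32(*d, 16);
	*c += *d; *b ^= *c; *b = rotl32(*b, 12);
	*a += *b; *d ^= *a; *d = rotl32(*d, 8);
	*c += *d; *b ^= *c; *b = rotl32(*b, 7);
}

/*
 * One double round over the 16-word ChaCha state: four column rounds
 * followed by four diagonal rounds.  The SSE2 code realizes the diagonal
 * rounds by rotating the rows with _mm_shuffle_epi32 instead of indexing.
 */
static void
chacha_double_round(uint32_t x[16])
{
	chacha_qr(&x[0], &x[4], &x[8],  &x[12]);
	chacha_qr(&x[1], &x[5], &x[9],  &x[13]);
	chacha_qr(&x[2], &x[6], &x[10], &x[14]);
	chacha_qr(&x[3], &x[7], &x[11], &x[15]);
	chacha_qr(&x[0], &x[5], &x[10], &x[15]);
	chacha_qr(&x[1], &x[6], &x[11], &x[12]);
	chacha_qr(&x[2], &x[7], &x[8],  &x[13]);
	chacha_qr(&x[3], &x[4], &x[9],  &x[14]);
}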
/* $NetBSD: vfs_xattr.c,v 1.39 2023/03/24 12:22:52 bouyer Exp $ */ /*- * Copyright (c) 2005, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * VFS extended attribute support. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_xattr.c,v 1.39 2023/03/24 12:22:52 bouyer Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/uio.h> #include <sys/extattr.h> #include <sys/xattr.h> #include <sys/sysctl.h> #include <sys/syscallargs.h> #include <sys/kauth.h> #include <sys/ktrace.h> #include <miscfs/genfs/genfs.h> static void ktr_xattr_name(const char *str) { ktrkuser("xattr-name", (void *)__UNCONST(str), strlen(str)); } static void ktr_xattr_val(const void *data, size_t cnt) { ktruser("xattr-val", __UNCONST(data), cnt, 0); } /* * Credential check based on process requesting service, and per-attribute * permissions. * * NOTE: Vnode must be locked. */ int extattr_check_cred(struct vnode *vp, int attrspace, kauth_cred_t cred, int access) { if (cred == NOCRED) return 0; return kauth_authorize_vnode(cred, kauth_extattr_action(access), vp, NULL, genfs_can_extattr(vp, cred, access, attrspace)); } /* * Default vfs_extattrctl routine for file systems that do not support * it. */ /*ARGSUSED*/ int vfs_stdextattrctl(struct mount *mp, int cmt, struct vnode *vp, int attrnamespace, const char *attrname) { if (vp != NULL) VOP_UNLOCK(vp); return EOPNOTSUPP; } /* * Push extended attribute configuration information into the file * system. * * NOTE: Not all file systems that support extended attributes will * require the use of this system call. 
*/ int sys_extattrctl(struct lwp *l, const struct sys_extattrctl_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) cmd; syscallarg(const char *) filename; syscallarg(int) attrnamespace; syscallarg(const char *) attrname; } */ struct vnode *path_vp, *file_vp; struct pathbuf *file_pb; struct nameidata file_nd; char attrname[EXTATTR_MAXNAMELEN]; int error; if (SCARG(uap, attrname) != NULL) { error = copyinstr(SCARG(uap, attrname), attrname, sizeof(attrname), NULL); if (error) return error; } error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_NOEMULROOT, &path_vp); if (error) return error; file_vp = NULL; if (SCARG(uap, filename) != NULL) { error = pathbuf_copyin(SCARG(uap, filename), &file_pb); if (error) { vrele(path_vp); return error; } NDINIT(&file_nd, LOOKUP, FOLLOW | LOCKLEAF, file_pb); error = namei(&file_nd); if (error) { pathbuf_destroy(file_pb); vrele(path_vp); return error; } file_vp = file_nd.ni_vp; pathbuf_destroy(file_pb); } error = VFS_EXTATTRCTL(path_vp->v_mount, SCARG(uap, cmd), file_vp, SCARG(uap, attrnamespace), SCARG(uap, attrname) != NULL ? attrname : NULL); if (file_vp != NULL) vrele(file_vp); vrele(path_vp); return error; } /***************************************************************************** * Internal routines to manipulate file system extended attributes: * - set * - get * - delete * - list *****************************************************************************/ /* * extattr_set_vp: * * Set a named extended attribute on a file or directory. */ static int extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, const void *data, size_t nbytes, struct lwp *l, register_t *retval, int flag) { struct uio auio; struct iovec aiov; ssize_t cnt; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (flag) { size_t attrlen; error = VOP_GETEXTATTR(vp, attrnamespace, attrname, NULL, &attrlen, l->l_cred); switch (error) { case ENODATA: case ENOATTR: if (flag & XATTR_REPLACE) goto done; break; case 0: if (flag & XATTR_CREATE) { error = EEXIST; goto done; } break; default: goto done; break; } } aiov.iov_base = __UNCONST(data); /* XXXUNCONST kills const */ aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_WRITE; KASSERT(l == curlwp); auio.uio_vmspace = l->l_proc->p_vmspace; cnt = nbytes; ktr_xattr_name(attrname); ktr_xattr_val(data, nbytes); error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, l->l_cred); cnt -= auio.uio_resid; retval[0] = cnt; done: VOP_UNLOCK(vp); return error; } /* * extattr_get_vp: * * Get a named extended attribute on a file or directory. */ static int extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct lwp *l, register_t *retval) { struct uio auio, *auiop; struct iovec aiov; ssize_t cnt; size_t size, *sizep; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * Slightly unusual semantics: if the user provides a NULL data * pointer, they don't want to receive the data, just the maximum * read length. 
*/ auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; KASSERT(l == curlwp); auio.uio_vmspace = l->l_proc->p_vmspace; auiop = &auio; cnt = nbytes; } else sizep = &size; ktr_xattr_name(attrname); error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, l->l_cred); if (auiop != NULL) { cnt -= auio.uio_resid; retval[0] = cnt; ktr_xattr_val(data, cnt); } else retval[0] = size; done: VOP_UNLOCK(vp); return error; } /* * extattr_delete_vp: * * Delete a named extended attribute on a file or directory. */ static int extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct lwp *l) { int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); ktr_xattr_name(attrname); error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, l->l_cred); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, l->l_cred); VOP_UNLOCK(vp); return error; } /* * extattr_list_vp: * * Retrieve a list of extended attributes on a file or directory. */ static int extattr_list_vp(struct vnode *vp, int attrnamespace, void *data, size_t nbytes, int flag, struct lwp *l, register_t *retval) { struct uio auio, *auiop; size_t size, *sizep; struct iovec aiov; ssize_t cnt; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > INT_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; KASSERT(l == curlwp); auio.uio_vmspace = l->l_proc->p_vmspace; auiop = &auio; cnt = nbytes; } else sizep = &size; error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep, flag, l->l_cred); if (auiop != NULL) { cnt -= auio.uio_resid; retval[0] = cnt; ktruser("xattr-list", data, cnt, 0); } else retval[0] = size; done: VOP_UNLOCK(vp); return error; } /***************************************************************************** * BSD <sys/extattr.h> API for file system extended attributes *****************************************************************************/ int sys_extattr_set_fd(struct lwp *l, const struct sys_extattr_set_fd_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) attrnamespace; syscallarg(const char *) attrname; syscallarg(const void *) data; syscallarg(size_t) nbytes; } */ struct file *fp; struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, sizeof(attrname), NULL); if (error) return error; error = fd_getvnode(SCARG(uap, fd), &fp); if (error) return error; vp = fp->f_vnode; error = extattr_set_vp(vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, data), SCARG(uap, nbytes), l, retval, 0); fd_putfile(SCARG(uap, fd)); return error; } int sys_extattr_set_file(struct lwp *l, const struct sys_extattr_set_file_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) attrnamespace; syscallarg(const char *) attrname; syscallarg(const void *) data; syscallarg(size_t) nbytes; } */ struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, sizeof(attrname), NULL); if (error) return error; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_NOEMULROOT, &vp); if (error) return error; error = extattr_set_vp(vp, SCARG(uap, 
attrnamespace), attrname, SCARG(uap, data), SCARG(uap, nbytes), l, retval, 0); vrele(vp); return error; } int sys_extattr_set_link(struct lwp *l, const struct sys_extattr_set_link_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) attrnamespace; syscallarg(const char *) attrname; syscallarg(const void *) data; syscallarg(size_t) nbytes; } */ struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, sizeof(attrname), NULL); if (error) return error; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_NOEMULROOT, &vp); if (error) return error; error = extattr_set_vp(vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, data), SCARG(uap, nbytes), l, retval, 0); vrele(vp); return error; } int sys_extattr_get_fd(struct lwp *l, const struct sys_extattr_get_fd_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) attrnamespace; syscallarg(const char *) attrname; syscallarg(void *) data; syscallarg(size_t) nbytes; } */ struct file *fp; struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, sizeof(attrname), NULL); if (error) return error; error = fd_getvnode(SCARG(uap, fd), &fp); if (error) return error; vp = fp->f_vnode; error = extattr_get_vp(vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, data), SCARG(uap, nbytes), l, retval); fd_putfile(SCARG(uap, fd)); return error; } int sys_extattr_get_file(struct lwp *l, const struct sys_extattr_get_file_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) attrnamespace; syscallarg(const char *) attrname; syscallarg(void *) data; syscallarg(size_t) nbytes; } */ struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, sizeof(attrname), NULL); if (error) return error; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_NOEMULROOT, &vp); if (error) return error; error = extattr_get_vp(vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, data), SCARG(uap, nbytes), l, retval); vrele(vp); return error; } int sys_extattr_get_link(struct lwp *l, const struct sys_extattr_get_link_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) attrnamespace; syscallarg(const char *) attrname; syscallarg(void *) data; syscallarg(size_t) nbytes; } */ struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, sizeof(attrname), NULL); if (error) return error; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_NOEMULROOT, &vp); if (error) return error; error = extattr_get_vp(vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, data), SCARG(uap, nbytes), l, retval); vrele(vp); return error; } int sys_extattr_delete_fd(struct lwp *l, const struct sys_extattr_delete_fd_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) attrnamespace; syscallarg(const char *) attrname; } */ struct file *fp; struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, sizeof(attrname), NULL); if (error) return error; error = fd_getvnode(SCARG(uap, fd), &fp); if (error) return error; vp = fp->f_vnode; error = extattr_delete_vp(vp, SCARG(uap, attrnamespace), attrname, l); fd_putfile(SCARG(uap, fd)); return error; } int sys_extattr_delete_file(struct lwp *l, const struct sys_extattr_delete_file_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) 
attrnamespace; syscallarg(const char *) attrname; } */ struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, sizeof(attrname), NULL); if (error) return error; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_NOEMULROOT, &vp); if (error) return error; error = extattr_delete_vp(vp, SCARG(uap, attrnamespace), attrname, l); vrele(vp); return error; } int sys_extattr_delete_link(struct lwp *l, const struct sys_extattr_delete_link_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) attrnamespace; syscallarg(const char *) attrname; } */ struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, sizeof(attrname), NULL); if (error) return error; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_NOEMULROOT, &vp); if (error) return error; error = extattr_delete_vp(vp, SCARG(uap, attrnamespace), attrname, l); vrele(vp); return error; } int sys_extattr_list_fd(struct lwp *l, const struct sys_extattr_list_fd_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) attrnamespace; syscallarg(void *) data; syscallarg(size_t) nbytes; } */ struct file *fp; struct vnode *vp; int error; error = fd_getvnode(SCARG(uap, fd), &fp); if (error) return error; vp = fp->f_vnode; error = extattr_list_vp(vp, SCARG(uap, attrnamespace), SCARG(uap, data), SCARG(uap, nbytes), EXTATTR_LIST_LENPREFIX, l, retval); fd_putfile(SCARG(uap, fd)); return error; } int sys_extattr_list_file(struct lwp *l, const struct sys_extattr_list_file_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) attrnamespace; syscallarg(void *) data; syscallarg(size_t) nbytes; } */ struct vnode *vp; int error; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_NOEMULROOT, &vp); if (error) return error; error = extattr_list_vp(vp, SCARG(uap, attrnamespace), SCARG(uap, data), SCARG(uap, nbytes), EXTATTR_LIST_LENPREFIX, l, retval); vrele(vp); return error; } int sys_extattr_list_link(struct lwp *l, const struct sys_extattr_list_link_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) attrnamespace; syscallarg(void *) data; syscallarg(size_t) nbytes; } */ struct vnode *vp; int error; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_NOEMULROOT, &vp); if (error) return error; error = extattr_list_vp(vp, SCARG(uap, attrnamespace), SCARG(uap, data), SCARG(uap, nbytes), EXTATTR_LIST_LENPREFIX, l, retval); vrele(vp); return error; } /***************************************************************************** * Linux-compatible <sys/xattr.h> API for file system extended attributes *****************************************************************************/ #define MATCH_NS(ns, key) (strncmp(ns, key, sizeof(ns) - 1) == 0) static int xattr_native(const char *key) { if (MATCH_NS("system.", key)) return EXTATTR_NAMESPACE_SYSTEM; else if (MATCH_NS("user.", key)) return EXTATTR_NAMESPACE_USER; else if (MATCH_NS("security.", key)) return EXTATTR_NAMESPACE_SYSTEM; else if (MATCH_NS("trusted.", key)) return EXTATTR_NAMESPACE_SYSTEM; else return EXTATTR_NAMESPACE_USER; } #undef MATCH_NS #define XATTR_ERRNO(e) ((e) == EOPNOTSUPP ? 
ENOTSUP : (e)) int sys_setxattr(struct lwp *l, const struct sys_setxattr_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const char *) name; syscallarg(void *) value; syscallarg(size_t) size; syscallarg(int) flags; } */ struct vnode *vp; char attrname[XATTR_NAME_MAX]; int attrnamespace; register_t attrlen; int error; error = copyinstr(SCARG(uap, name), attrname, sizeof(attrname), NULL); if (error) goto out; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_NOEMULROOT, &vp); if (error) goto out; attrnamespace = xattr_native(attrname); error = extattr_set_vp(vp, attrnamespace, attrname, SCARG(uap, value), SCARG(uap, size), l, &attrlen, SCARG(uap, flags)); vrele(vp); out: *retval = (error == 0 ? 0 : -1); return XATTR_ERRNO(error); } int sys_lsetxattr(struct lwp *l, const struct sys_lsetxattr_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const char *) name; syscallarg(void *) value; syscallarg(size_t) size; syscallarg(int) flags; } */ struct vnode *vp; char attrname[XATTR_NAME_MAX]; int attrnamespace; register_t attrlen; int error; error = copyinstr(SCARG(uap, name), attrname, sizeof(attrname), NULL); if (error) goto out; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_NOEMULROOT, &vp); if (error) goto out; attrnamespace = xattr_native(attrname); error = extattr_set_vp(vp, attrnamespace, attrname, SCARG(uap, value), SCARG(uap, size), l, &attrlen, SCARG(uap, flags)); vrele(vp); out: *retval = (error == 0 ? 0 : -1); return XATTR_ERRNO(error); } int sys_fsetxattr(struct lwp *l, const struct sys_fsetxattr_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) name; syscallarg(void *) value; syscallarg(size_t) size; syscallarg(int) flags; } */ struct file *fp; struct vnode *vp; char attrname[XATTR_NAME_MAX]; int attrnamespace; register_t attrlen; int error; error = copyinstr(SCARG(uap, name), attrname, sizeof(attrname), NULL); if (error) goto out; error = fd_getvnode(SCARG(uap, fd), &fp); if (error) goto out; vp = fp->f_vnode; attrnamespace = xattr_native(attrname); error = extattr_set_vp(vp, attrnamespace, attrname, SCARG(uap, value), SCARG(uap, size), l, &attrlen, SCARG(uap, flags)); fd_putfile(SCARG(uap, fd)); out: *retval = (error == 0 ? 
0 : -1); return XATTR_ERRNO(error); } int sys_getxattr(struct lwp *l, const struct sys_getxattr_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const char *) name; syscallarg(void *) value; syscallarg(size_t) size; } */ struct vnode *vp; char attrname[XATTR_NAME_MAX]; int attrnamespace; int error; error = copyinstr(SCARG(uap, name), attrname, sizeof(attrname), NULL); if (error) return error; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_NOEMULROOT, &vp); if (error) return error; attrnamespace = xattr_native(attrname); error = extattr_get_vp(vp, attrnamespace, attrname, SCARG(uap, value), SCARG(uap, size), l, retval); vrele(vp); return XATTR_ERRNO(error); } int sys_lgetxattr(struct lwp *l, const struct sys_lgetxattr_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const char *) name; syscallarg(void *) value; syscallarg(size_t) size; } */ struct vnode *vp; char attrname[XATTR_NAME_MAX]; int attrnamespace; int error; error = copyinstr(SCARG(uap, name), attrname, sizeof(attrname), NULL); if (error) return error; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_NOEMULROOT, &vp); if (error) return error; attrnamespace = xattr_native(attrname); error = extattr_get_vp(vp, attrnamespace, attrname, SCARG(uap, value), SCARG(uap, size), l, retval); vrele(vp); return XATTR_ERRNO(error); } int sys_fgetxattr(struct lwp *l, const struct sys_fgetxattr_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) name; syscallarg(void *) value; syscallarg(size_t) size; } */ struct file *fp; struct vnode *vp; char attrname[XATTR_NAME_MAX]; int attrnamespace; int error; error = copyinstr(SCARG(uap, name), attrname, sizeof(attrname), NULL); if (error) return error; error = fd_getvnode(SCARG(uap, fd), &fp); if (error) return error; vp = fp->f_vnode; attrnamespace = xattr_native(attrname); error = extattr_get_vp(vp, attrnamespace, attrname, SCARG(uap, value), SCARG(uap, size), l, retval); fd_putfile(SCARG(uap, fd)); return XATTR_ERRNO(error); } int sys_listxattr(struct lwp *l, const struct sys_listxattr_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(char *) list; syscallarg(size_t) size; } */ struct vnode *vp; char *list; size_t size; register_t listsize_usr, listsize_sys; int error; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_NOEMULROOT, &vp); if (error) return error; list = SCARG(uap, list); size = SCARG(uap, size); error = extattr_list_vp(vp, EXTATTR_NAMESPACE_USER, list, size, 0, l, &listsize_usr); if (error) goto out; if (list) list += listsize_usr; if (size) size -= listsize_usr; error = extattr_list_vp(vp, EXTATTR_NAMESPACE_SYSTEM, list, size, 0, l, &listsize_sys); switch (error) { case EPERM: error = 0; /* Ignore and just skip system EA */ listsize_sys = 0; break; case 0: break; default: goto out; break; } *retval = listsize_usr + listsize_sys; out: vrele(vp); return XATTR_ERRNO(error); } int sys_llistxattr(struct lwp *l, const struct sys_llistxattr_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(char *) list; syscallarg(size_t) size; } */ struct vnode *vp; char *list; size_t size; register_t listsize_usr, listsize_sys; int error; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_NOEMULROOT, &vp); if (error) return error; list = SCARG(uap, list); size = SCARG(uap, size); error = extattr_list_vp(vp, EXTATTR_NAMESPACE_USER, list, size, 0, l, &listsize_usr); if (error) goto out; if (list) list += listsize_usr; if (size) size 
-= listsize_usr; error = extattr_list_vp(vp, EXTATTR_NAMESPACE_SYSTEM, list, size, 0, l, &listsize_sys); switch (error) { case EPERM: error = 0; /* Ignore and just skip system EA */ listsize_sys = 0; break; case 0: break; default: goto out; break; } *retval = listsize_usr + listsize_sys; out: vrele(vp); return XATTR_ERRNO(error); } int sys_flistxattr(struct lwp *l, const struct sys_flistxattr_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(char *) list; syscallarg(size_t) size; } */ struct file *fp; struct vnode *vp; char *list; size_t size; register_t listsize_usr, listsize_sys; int error; error = fd_getvnode(SCARG(uap, fd), &fp); if (error) return error; vp = fp->f_vnode; list = SCARG(uap, list); size = SCARG(uap, size); error = extattr_list_vp(vp, EXTATTR_NAMESPACE_USER, list, size, 0, l, &listsize_usr); if (error) goto out; if (list) list += listsize_usr; if (size) size -= listsize_usr; error = extattr_list_vp(vp, EXTATTR_NAMESPACE_SYSTEM, list, size, 0, l, &listsize_sys); switch (error) { case EPERM: error = 0; /* Ignore and just skip system EA */ listsize_sys = 0; break; case 0: break; default: goto out; break; } *retval = listsize_usr + listsize_sys; out: fd_putfile(SCARG(uap, fd)); return XATTR_ERRNO(error); } int sys_removexattr(struct lwp *l, const struct sys_removexattr_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const char *) name; } */ struct vnode *vp; char attrname[XATTR_NAME_MAX]; int attrnamespace; int error; error = copyinstr(SCARG(uap, name), attrname, sizeof(attrname), NULL); if (error) return error; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_NOEMULROOT, &vp); if (error) return error; attrnamespace = xattr_native(attrname); error = extattr_delete_vp(vp, attrnamespace, attrname, l); vrele(vp); return XATTR_ERRNO(error); } int sys_lremovexattr(struct lwp *l, const struct sys_lremovexattr_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const char *) name; } */ struct vnode *vp; char attrname[XATTR_NAME_MAX]; int attrnamespace; int error; error = copyinstr(SCARG(uap, name), attrname, sizeof(attrname), NULL); if (error) return error; error = namei_simple_user(SCARG(uap, path), NSM_NOFOLLOW_NOEMULROOT, &vp); if (error) return error; attrnamespace = xattr_native(attrname); error = extattr_delete_vp(vp, attrnamespace, attrname, l); vrele(vp); return XATTR_ERRNO(error); } int sys_fremovexattr(struct lwp *l, const struct sys_fremovexattr_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) name; } */ struct file *fp; struct vnode *vp; char attrname[XATTR_NAME_MAX]; int attrnamespace; int error; error = copyinstr(SCARG(uap, name), attrname, sizeof(attrname), NULL); if (error) return error; error = fd_getvnode(SCARG(uap, fd), &fp); if (error) return error; vp = fp->f_vnode; attrnamespace = xattr_native(attrname); error = extattr_delete_vp(vp, attrnamespace, attrname, l); fd_putfile(SCARG(uap, fd)); return XATTR_ERRNO(error); }
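/*
 * Illustrative note (not part of the original source): the sys_*xattr()
 * wrappers above map Linux <sys/xattr.h> attribute names onto the native
 * extattr namespaces ("user.*" -> EXTATTR_NAMESPACE_USER; "system.*",
 * "security.*" and "trusted.*" -> EXTATTR_NAMESPACE_SYSTEM) and translate
 * EOPNOTSUPP to ENOTSUP on the way out, while listxattr() concatenates the
 * user and system namespaces and silently skips an EPERM from the latter.
 * A minimal userland sketch of that interface, assuming the standard
 * setxattr(2)/getxattr(2)/listxattr(2) prototypes and a hypothetical
 * file /tmp/f:
 */
#if 0	/* example only -- never compiled as part of the kernel */
#include <sys/xattr.h>
#include <stdio.h>

int
main(void)
{
	char buf[64], names[256];
	ssize_t len;

	/* Reaches sys_setxattr(); "user." selects EXTATTR_NAMESPACE_USER. */
	if (setxattr("/tmp/f", "user.comment", "hello", 5, 0) == -1)
		perror("setxattr");

	/* Reaches sys_getxattr(); returns the attribute length on success. */
	len = getxattr("/tmp/f", "user.comment", buf, sizeof(buf));
	if (len > 0)
		printf("user.comment = %.*s\n", (int)len, buf);

	/* Reaches sys_listxattr(); user + system names, NUL-separated. */
	len = listxattr("/tmp/f", names, sizeof(names));
	if (len > 0)
		printf("%zd bytes of attribute names\n", len);

	return 0;
}
#endif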
/* $NetBSD: sco_socket.c,v 1.38 2019/01/28 12:53:01 martin Exp $ */ /*- * Copyright (c) 2006 Itronix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of Itronix Inc. may not be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sco_socket.c,v 1.38 2019/01/28 12:53:01 martin Exp $"); /* load symbolic names */ #ifdef BLUETOOTH_DEBUG #define PRUREQUESTS #define PRCOREQUESTS #endif #include <sys/param.h> #include <sys/domain.h> #include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/proc.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <netbt/bluetooth.h> #include <netbt/hci.h> #include <netbt/sco.h> /******************************************************************************* * * SCO SOCK_SEQPACKET sockets - low latency audio data */ static void sco_connecting(void *); static void sco_connected(void *); static void sco_disconnected(void *, int); static void *sco_newconn(void *, struct sockaddr_bt *, struct sockaddr_bt *); static void sco_complete(void *, int); static void sco_linkmode(void *, int); static void sco_input(void *, struct mbuf *); static const struct btproto sco_proto = { sco_connecting, sco_connected, sco_disconnected, sco_newconn, sco_complete, sco_linkmode, sco_input, }; int sco_sendspace = 4096; int sco_recvspace = 4096; static int sco_attach(struct socket *so, int proto) { int error; KASSERT(so->so_pcb == NULL); if (so->so_lock == NULL) { mutex_obj_hold(bt_lock); so->so_lock = bt_lock; solock(so); } KASSERT(solocked(so)); error = soreserve(so, sco_sendspace, sco_recvspace); if (error) { return error; } return sco_attach_pcb((struct sco_pcb **)&so->so_pcb, &sco_proto, so); } static void sco_detach(struct socket *so) { KASSERT(so->so_pcb != NULL); sco_detach_pcb((struct sco_pcb **)&so->so_pcb); KASSERT(so->so_pcb == NULL); } static int sco_accept(struct socket *so, struct sockaddr *nam) { struct sco_pcb *pcb = so->so_pcb; KASSERT(solocked(so)); KASSERT(nam != NULL); if (pcb == NULL) return EINVAL; return sco_peeraddr_pcb(pcb, (struct sockaddr_bt *)nam); } static int sco_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct sco_pcb *pcb = so->so_pcb; struct sockaddr_bt *sa = (struct sockaddr_bt *)nam; KASSERT(solocked(so)); KASSERT(nam != NULL); if (pcb == NULL) return EINVAL; if (sa->bt_len != sizeof(struct sockaddr_bt)) return EINVAL; if (sa->bt_family != AF_BLUETOOTH) return EAFNOSUPPORT; return sco_bind_pcb(pcb, sa); } static int sco_listen(struct socket *so, struct lwp *l) { struct sco_pcb *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; return sco_listen_pcb(pcb); } static int sco_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct sco_pcb *pcb = so->so_pcb; struct sockaddr_bt *sa = (struct sockaddr_bt *)nam; KASSERT(solocked(so)); KASSERT(nam != NULL); if (pcb == NULL) return EINVAL; if (sa->bt_len != sizeof(struct sockaddr_bt)) return EINVAL; if (sa->bt_family != AF_BLUETOOTH) return EAFNOSUPPORT; soisconnecting(so); return sco_connect_pcb(pcb, sa); } static int sco_connect2(struct socket *so, struct socket *so2) { struct sco_pcb *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; return EOPNOTSUPP; } static int sco_disconnect(struct socket *so) { struct sco_pcb *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; soisdisconnecting(so); return sco_disconnect_pcb(pcb, so->so_linger); } static int sco_shutdown(struct socket *so) { KASSERT(solocked(so)); socantsendmore(so); return 0; } static int sco_abort(struct socket *so) { struct sco_pcb *pcb = so->so_pcb; KASSERT(solocked(so)); if (pcb == NULL) return EINVAL; sco_disconnect_pcb(pcb, 0); soisdisconnected(so); 
sco_detach(so); return 0; } static int sco_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp) { return EOPNOTSUPP; } static int sco_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); return 0; } static int sco_peeraddr(struct socket *so, struct sockaddr *nam) { struct sco_pcb *pcb = (struct sco_pcb *)so->so_pcb; KASSERT(solocked(so)); KASSERT(pcb != NULL); KASSERT(nam != NULL); return sco_peeraddr_pcb(pcb, (struct sockaddr_bt *)nam); } static int sco_sockaddr(struct socket *so, struct sockaddr *nam) { struct sco_pcb *pcb = (struct sco_pcb *)so->so_pcb; KASSERT(solocked(so)); KASSERT(pcb != NULL); KASSERT(nam != NULL); return sco_sockaddr_pcb(pcb, (struct sockaddr_bt *)nam); } static int sco_rcvd(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int sco_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int sco_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct sco_pcb *pcb = so->so_pcb; int err = 0; struct mbuf *m0; KASSERT(solocked(so)); KASSERT(m != NULL); if (control) /* no use for that */ m_freem(control); if (pcb == NULL) { err = EINVAL; goto release; } if (m->m_pkthdr.len == 0) goto release; if (m->m_pkthdr.len > pcb->sp_mtu) { err = EMSGSIZE; goto release; } m0 = m_copypacket(m, M_DONTWAIT); if (m0 == NULL) { err = ENOMEM; goto release; } sbappendrecord(&so->so_snd, m); return sco_send_pcb(pcb, m0); release: m_freem(m); return err; } static int sco_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int sco_purgeif(struct socket *so, struct ifnet *ifp) { return EOPNOTSUPP; } /* * get/set socket options */ int sco_ctloutput(int req, struct socket *so, struct sockopt *sopt) { struct sco_pcb *pcb = (struct sco_pcb *)so->so_pcb; int err = 0; DPRINTFN(2, "req %s\n", prcorequests[req]); if (pcb == NULL) return EINVAL; if (sopt->sopt_level != BTPROTO_SCO) return ENOPROTOOPT; switch(req) { case PRCO_GETOPT: err = sco_getopt(pcb, sopt); break; case PRCO_SETOPT: err = sco_setopt(pcb, sopt); break; default: err = ENOPROTOOPT; break; } return err; } /***************************************************************************** * * SCO Protocol socket callbacks * */ static void sco_connecting(void *arg) { struct socket *so = arg; DPRINTF("Connecting\n"); soisconnecting(so); } static void sco_connected(void *arg) { struct socket *so = arg; DPRINTF("Connected\n"); soisconnected(so); } static void sco_disconnected(void *arg, int err) { struct socket *so = arg; DPRINTF("Disconnected (%d)\n", err); so->so_error = err; soisdisconnected(so); } static void * sco_newconn(void *arg, struct sockaddr_bt *laddr, struct sockaddr_bt *raddr) { struct socket *so = arg; DPRINTF("New Connection\n"); so = sonewconn(so, false); if (so == NULL) return NULL; soisconnecting(so); return so->so_pcb; } static void sco_complete(void *arg, int num) { struct socket *so = arg; while (num-- > 0) sbdroprecord(&so->so_snd); sowwakeup(so); } static void sco_linkmode(void *arg, int mode) { } static void sco_input(void *arg, struct mbuf *m) { struct socket *so = arg; /* * since this data is time sensitive, if the buffer * is full we just dump data until the latest one * will fit. 
*/ while (m->m_pkthdr.len > sbspace(&so->so_rcv)) sbdroprecord(&so->so_rcv); DPRINTFN(10, "received %d bytes\n", m->m_pkthdr.len); sbappendrecord(&so->so_rcv, m); sorwakeup(so); } PR_WRAP_USRREQS(sco) #define sco_attach sco_attach_wrapper #define sco_detach sco_detach_wrapper #define sco_accept sco_accept_wrapper #define sco_bind sco_bind_wrapper #define sco_listen sco_listen_wrapper #define sco_connect sco_connect_wrapper #define sco_connect2 sco_connect2_wrapper #define sco_disconnect sco_disconnect_wrapper #define sco_shutdown sco_shutdown_wrapper #define sco_abort sco_abort_wrapper #define sco_ioctl sco_ioctl_wrapper #define sco_stat sco_stat_wrapper #define sco_peeraddr sco_peeraddr_wrapper #define sco_sockaddr sco_sockaddr_wrapper #define sco_rcvd sco_rcvd_wrapper #define sco_recvoob sco_recvoob_wrapper #define sco_send sco_send_wrapper #define sco_sendoob sco_sendoob_wrapper #define sco_purgeif sco_purgeif_wrapper const struct pr_usrreqs sco_usrreqs = { .pr_attach = sco_attach, .pr_detach = sco_detach, .pr_accept = sco_accept, .pr_bind = sco_bind, .pr_listen = sco_listen, .pr_connect = sco_connect, .pr_connect2 = sco_connect2, .pr_disconnect = sco_disconnect, .pr_shutdown = sco_shutdown, .pr_abort = sco_abort, .pr_ioctl = sco_ioctl, .pr_stat = sco_stat, .pr_peeraddr = sco_peeraddr, .pr_sockaddr = sco_sockaddr, .pr_rcvd = sco_rcvd, .pr_recvoob = sco_recvoob, .pr_send = sco_send, .pr_sendoob = sco_sendoob, .pr_purgeif = sco_purgeif, };
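/*
 * Illustrative note (not part of the original source): the sco_usrreqs
 * table above is reached through the ordinary socket system calls.  A
 * minimal userland sketch, assuming the <netbt/bluetooth.h> sockaddr_bt
 * layout and leaving the remote device address unfilled:
 */
#if 0	/* example only -- never compiled as part of the kernel */
#include <sys/socket.h>
#include <netbt/bluetooth.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_bt sa;
	int s;

	/* SOCK_SEQPACKET/BTPROTO_SCO ends up in sco_attach() above. */
	s = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_SCO);
	if (s == -1)
		return 1;

	memset(&sa, 0, sizeof(sa));
	sa.bt_len = sizeof(sa);		/* sco_connect() checks this */
	sa.bt_family = AF_BLUETOOTH;	/* and this */
	/* sa.bt_bdaddr would be set to the remote device address here. */

	/* connect(2) dispatches to sco_connect() -> sco_connect_pcb(). */
	(void)connect(s, (struct sockaddr *)&sa, sizeof(sa));
	close(s);
	return 0;
}
#endif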
/* $NetBSD: sched_4bsd.c,v 1.46 2022/10/26 23:24:09 riastradh Exp $ */ /* * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2019, 2020 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and * Daniel Sieger. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.46 2022/10/26 23:24:09 riastradh Exp $"); #include "opt_ddb.h" #include "opt_lockdebug.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/cpu.h> #include <sys/proc.h> #include <sys/kernel.h> #include <sys/resourcevar.h> #include <sys/sched.h> #include <sys/sysctl.h> #include <sys/lockdebug.h> #include <sys/intr.h> #include <sys/atomic.h> static void updatepri(struct lwp *); static void resetpriority(struct lwp *); /* Number of hardclock ticks per sched_tick() */ u_int sched_rrticks __read_mostly; /* * Force switch among equal priority processes every 100ms. * Called from hardclock every hz/10 == sched_rrticks hardclock ticks. 
*/ /* ARGSUSED */ void sched_tick(struct cpu_info *ci) { struct schedstate_percpu *spc = &ci->ci_schedstate; pri_t pri = PRI_NONE; lwp_t *l; spc->spc_ticks = sched_rrticks; if (CURCPU_IDLE_P()) { spc_lock(ci); sched_resched_cpu(ci, MAXPRI_KTHREAD, true); /* spc now unlocked */ return; } l = ci->ci_onproc; if (l == NULL) { return; } /* * Can only be spc_lwplock or a turnstile lock at this point * (if we interrupted priority inheritance trylock dance). */ KASSERT(l->l_mutex != spc->spc_mutex); switch (l->l_class) { case SCHED_FIFO: /* No timeslicing for FIFO jobs. */ break; case SCHED_RR: /* Force it into mi_switch() to look for other jobs to run. */ pri = MAXPRI_KERNEL_RT; break; default: if (spc->spc_flags & SPCF_SHOULDYIELD) { /* * Process is stuck in kernel somewhere, probably * due to buggy or inefficient code. Force a * kernel preemption. */ pri = MAXPRI_KERNEL_RT; } else if (spc->spc_flags & SPCF_SEENRR) { /* * The process has already been through a roundrobin * without switching and may be hogging the CPU. * Indicate that the process should yield. */ pri = MAXPRI_KTHREAD; spc->spc_flags |= SPCF_SHOULDYIELD; } else if ((spc->spc_flags & SPCF_1STCLASS) == 0) { /* * For SMT or asymmetric systems push a little * harder: if this is not a 1st class CPU, try to * find a better one to run this LWP. */ pri = MAXPRI_KTHREAD; spc->spc_flags |= SPCF_SHOULDYIELD; } else { spc->spc_flags |= SPCF_SEENRR; } break; } if (pri != PRI_NONE) { spc_lock(ci); sched_resched_cpu(ci, pri, true); /* spc now unlocked */ } } /* * Why PRIO_MAX - 2? From setpriority(2): * * prio is a value in the range -20 to 20. The default priority is * 0; lower priorities cause more favorable scheduling. A value of * 19 or 20 will schedule a process only when nothing at priority <= * 0 is runnable. * * This gives estcpu influence over 18 priority levels, and leaves nice * with 40 levels. One way to think about it is that nice has 20 levels * either side of estcpu's 18. */ #define ESTCPU_SHIFT 11 #define ESTCPU_MAX ((PRIO_MAX - 2) << ESTCPU_SHIFT) #define ESTCPU_ACCUM (1 << (ESTCPU_SHIFT - 1)) #define ESTCPULIM(e) uimin((e), ESTCPU_MAX) /* * The main parameter used by this algorithm is 'l_estcpu'. It is an estimate * of the recent CPU utilization of the thread. * * l_estcpu is: * - increased each time the hardclock ticks and the thread is found to * be executing, in sched_schedclock() called from hardclock() * - decreased (filtered) on each sched tick, in sched_pstats_hook() * If the lwp is sleeping for more than a second, we don't touch l_estcpu: it * will be updated in sched_setrunnable() when the lwp wakes up, in burst mode * (ie, we decrease it n times). * * Note that hardclock updates l_estcpu and l_cpticks independently. * * ----------------------------------------------------------------------------- * * Here we describe how l_estcpu is decreased. * * Constants for digital decay (filter): * 90% of l_estcpu usage in (5 * loadavg) seconds * * We wish to decay away 90% of l_estcpu in (5 * loadavg) seconds. That is, we * want to compute a value of decay such that the following loop: * for (i = 0; i < (5 * loadavg); i++) * l_estcpu *= decay; * will result in * l_estcpu *= 0.1; * for all values of loadavg. 
* * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * And finally, the corresponding value of decay we're using is: * decay = (2 * loadavg) / (2 * loadavg + 1) * * ----------------------------------------------------------------------------- * * Now, let's prove that the value of decay stated above will always fulfill * the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given [factor ** (5 * loadavg) =~ .1], prove [factor == b/(b+1)]. * 2) Given [b/(b+1) ** power =~ .1], prove [power == (5 * loadavg)]. * * Facts: * * For x real: exp(x) = 0! + x**1/1! + x**2/2! + ... * Therefore, for x close to zero, exp(x) =~ 1 + x. * In turn, for b large enough, exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * * * For b large enough, (b-1)/b =~ b/(b+1). * * * For x belonging to [-1;1[, ln(1-x) = - x - x**2/2 - x**3/3 - ... * Therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * * * ln(0.1) =~ -2.30 * * Proof of (1): * factor ** (5 * loadavg) =~ 0.1 * => ln(factor) =~ -2.30 / (5 * loadavg) * => factor =~ exp(-1 / ((5 / 2.30) * loadavg)) * =~ exp(-1 / (2 * loadavg)) * =~ exp(-1 / b) * =~ (b - 1) / b * =~ b / (b + 1) * =~ (2 * loadavg) / ((2 * loadavg) + 1) * * Proof of (2): * (b / (b + 1)) ** power =~ .1 * => power * ln(b / (b + 1)) =~ -2.30 * => power * (-1 / (b + 1)) =~ -2.30 * => power =~ 2.30 * (b + 1) * => power =~ 4.60 * loadavg + 2.30 * => power =~ 5 * loadavg * * Conclusion: decay = (2 * loadavg) / (2 * loadavg + 1) */ /* See calculations above */ #define loadfactor(loadavg) (2 * (loadavg)) static fixpt_t decay_cpu(fixpt_t loadfac, fixpt_t estcpu) { if (estcpu == 0) { return 0; } #if !defined(_LP64) /* avoid 64bit arithmetics. */ #define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1)) if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) { return estcpu * loadfac / (loadfac + FSCALE); } #endif return (uint64_t)estcpu * loadfac / (loadfac + FSCALE); } static fixpt_t decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n) { /* * For all load averages >= 1 and max l_estcpu of (255 << ESTCPU_SHIFT), * if we slept for at least seven times the loadfactor, we will decay * l_estcpu to less than (1 << ESTCPU_SHIFT), and therefore we can * return zero directly. * * Note that our ESTCPU_MAX is actually much smaller than * (255 << ESTCPU_SHIFT). */ if ((n << FSHIFT) >= 7 * loadfac) { return 0; } while (estcpu != 0 && n > 1) { estcpu = decay_cpu(loadfac, estcpu); n--; } return estcpu; } /* * sched_pstats_hook: * * Periodically called from sched_pstats(); used to recalculate priorities. */ void sched_pstats_hook(struct lwp *l, int batch) { fixpt_t loadfac; /* * If the LWP has slept an entire second, stop recalculating * its priority until it wakes up. */ KASSERT(lwp_locked(l, NULL)); if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP || l->l_stat == LSSUSPENDED) { if (l->l_slptime > 1) { return; } } loadfac = loadfactor(averunnable.ldavg[0]); l->l_estcpu = decay_cpu(loadfac, l->l_estcpu); resetpriority(l); } /* * Recalculate the priority of an LWP after it has slept for a while. 
*/ static void updatepri(struct lwp *l) { fixpt_t loadfac; KASSERT(lwp_locked(l, NULL)); KASSERT(l->l_slptime > 1); loadfac = loadfactor(averunnable.ldavg[0]); l->l_slptime--; /* the first time was done in sched_pstats */ l->l_estcpu = decay_cpu_batch(loadfac, l->l_estcpu, l->l_slptime); resetpriority(l); } void sched_rqinit(void) { } void sched_setrunnable(struct lwp *l) { if (l->l_slptime > 1) updatepri(l); } void sched_nice(struct proc *p, int n) { struct lwp *l; KASSERT(mutex_owned(p->p_lock)); p->p_nice = n; LIST_FOREACH(l, &p->p_lwps, l_sibling) { lwp_lock(l); resetpriority(l); lwp_unlock(l); } } /* * Recompute the priority of an LWP. Arrange to reschedule if * the resulting priority is better than that of the current LWP. */ static void resetpriority(struct lwp *l) { pri_t pri; struct proc *p = l->l_proc; KASSERT(lwp_locked(l, NULL)); if (l->l_class != SCHED_OTHER) return; /* See comments above ESTCPU_SHIFT definition. */ pri = (PRI_KERNEL - 1) - (l->l_estcpu >> ESTCPU_SHIFT) - p->p_nice; pri = imax(pri, 0); if (pri != l->l_priority) lwp_changepri(l, pri); } /* * We adjust the priority of the current LWP. The priority of a LWP * gets worse as it accumulates CPU time. The CPU usage estimator (l_estcpu) * is increased here. The formula for computing priorities will compute a * different value each time l_estcpu increases. This can cause a switch, * but unless the priority crosses a PPQ boundary the actual queue will not * change. The CPU usage estimator ramps up quite quickly when the process * is running (linearly), and decays away exponentially, at a rate which is * proportionally slower when the system is busy. The basic principle is * that the system will 90% forget that the process used a lot of CPU time * in (5 * loadavg) seconds. This causes the system to favor processes which * haven't run much recently, and to round-robin among other processes. */ void sched_schedclock(struct lwp *l) { if (l->l_class != SCHED_OTHER) return; KASSERT(!CURCPU_IDLE_P()); l->l_estcpu = ESTCPULIM(l->l_estcpu + ESTCPU_ACCUM); lwp_lock(l); resetpriority(l); lwp_unlock(l); } /* * sched_proc_fork: * * Inherit the parent's scheduler history. */ void sched_proc_fork(struct proc *parent, struct proc *child) { lwp_t *pl; KASSERT(mutex_owned(parent->p_lock)); pl = LIST_FIRST(&parent->p_lwps); child->p_estcpu_inherited = pl->l_estcpu; child->p_forktime = sched_pstats_ticks; } /* * sched_proc_exit: * * Chargeback parents for the sins of their children. */ void sched_proc_exit(struct proc *parent, struct proc *child) { fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); fixpt_t estcpu; lwp_t *pl, *cl; /* XXX Only if parent != init?? */ mutex_enter(parent->p_lock); pl = LIST_FIRST(&parent->p_lwps); cl = LIST_FIRST(&child->p_lwps); estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited, sched_pstats_ticks - child->p_forktime); if (cl->l_estcpu > estcpu) { lwp_lock(pl); pl->l_estcpu = ESTCPULIM(pl->l_estcpu + cl->l_estcpu - estcpu); lwp_unlock(pl); } mutex_exit(parent->p_lock); } void sched_wakeup(struct lwp *l) { } void sched_slept(struct lwp *l) { } void sched_lwp_fork(struct lwp *l1, struct lwp *l2) { l2->l_estcpu = l1->l_estcpu; } void sched_lwp_collect(struct lwp *t) { lwp_t *l; /* Absorb estcpu value of collected LWP. */ l = curlwp; lwp_lock(l); l->l_estcpu += t->l_estcpu; lwp_unlock(l); } void sched_oncpu(lwp_t *l) { } void sched_newts(lwp_t *l) { } /* * Sysctl nodes and initialization. 
*/ static int sysctl_sched_rtts(SYSCTLFN_ARGS) { struct sysctlnode node; int rttsms = hztoms(sched_rrticks); node = *rnode; node.sysctl_data = &rttsms; return sysctl_lookup(SYSCTLFN_CALL(&node)); } SYSCTL_SETUP(sysctl_sched_4bsd_setup, "sysctl sched setup") { const struct sysctlnode *node = NULL; sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "sched", SYSCTL_DESCR("Scheduler options"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); if (node == NULL) return; sched_rrticks = hz / 10; sysctl_createv(NULL, 0, &node, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "name", NULL, NULL, 0, __UNCONST("4.4BSD"), 0, CTL_CREATE, CTL_EOL); sysctl_createv(NULL, 0, &node, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "rtts", SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"), sysctl_sched_rtts, 0, NULL, 0, CTL_CREATE, CTL_EOL); }
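/*
 * Illustrative note (not part of the original source): the digital-decay
 * comment above argues that decay = (2 * loadavg) / (2 * loadavg + 1)
 * satisfies decay ** (5 * loadavg) =~ 0.1, i.e. ~90% of l_estcpu is
 * forgotten within 5 * loadavg seconds.  A standalone floating-point
 * sketch (userland, not kernel fixed-point) that checks the claim for a
 * few load averages; the result hovers around 0.1 as expected:
 */
#if 0	/* example only -- never compiled as part of the kernel */
#include <math.h>
#include <stdio.h>

int
main(void)
{
	double loadavg, decay;

	for (loadavg = 1.0; loadavg <= 16.0; loadavg *= 2) {
		decay = (2.0 * loadavg) / (2.0 * loadavg + 1.0);
		/* Apply the filter 5 * loadavg times, as in the proof. */
		printf("loadavg %5.1f: decay^(5*loadavg) = %.4f\n",
		    loadavg, pow(decay, 5.0 * loadavg));
	}
	return 0;
}
#endif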
/* $NetBSD: tmpfs_fifoops.c,v 1.15 2021/07/19 01:30:25 dholland Exp $ */ /* * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * tmpfs vnode interface for named pipes. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tmpfs_fifoops.c,v 1.15 2021/07/19 01:30:25 dholland Exp $"); #include <sys/param.h> #include <sys/vnode.h> #include <fs/tmpfs/tmpfs.h> #include <fs/tmpfs/tmpfs_fifoops.h> /* * vnode operations vector used for fifos stored in a tmpfs file system.
*/ int (**tmpfs_fifoop_p)(void *); const struct vnodeopv_entry_desc tmpfs_fifoop_entries[] = { { &vop_default_desc, vn_default_error }, GENFS_FIFOOP_ENTRIES, { &vop_close_desc, tmpfs_fifo_close }, { &vop_access_desc, tmpfs_access }, { &vop_accessx_desc, genfs_accessx }, { &vop_getattr_desc, tmpfs_getattr }, { &vop_setattr_desc, tmpfs_setattr }, { &vop_read_desc, tmpfs_fifo_read }, { &vop_write_desc, tmpfs_fifo_write }, { &vop_fcntl_desc, genfs_fcntl }, { &vop_fsync_desc, vn_fifo_bypass }, { &vop_inactive_desc, tmpfs_inactive }, { &vop_reclaim_desc, tmpfs_reclaim }, { &vop_lock_desc, genfs_lock }, { &vop_unlock_desc, genfs_unlock }, { &vop_strategy_desc, vn_fifo_bypass }, { &vop_print_desc, tmpfs_print }, { &vop_islocked_desc, genfs_islocked }, { &vop_bwrite_desc, genfs_nullop }, { NULL, NULL } }; const struct vnodeopv_desc tmpfs_fifoop_opv_desc = { &tmpfs_fifoop_p, tmpfs_fifoop_entries }; int tmpfs_fifo_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap __unused = v; return VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), v); } int tmpfs_fifo_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp = ap->a_vp; tmpfs_update(vp, TMPFS_UPDATE_ATIME); return VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), v); } int tmpfs_fifo_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp = ap->a_vp; tmpfs_update(vp, TMPFS_UPDATE_MTIME); return VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), v); }
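/*
 * Illustrative note (not part of the original source): tmpfs_fifo_read()
 * and tmpfs_fifo_write() above only stamp the node (TMPFS_UPDATE_ATIME /
 * TMPFS_UPDATE_MTIME) and then bypass to the generic fifo code via
 * VOCALL.  A userland sketch that exercises exactly those two paths on a
 * hypothetical tmpfs-backed path /tmp/example_fifo:
 */
#if 0	/* example only -- never compiled as part of the kernel */
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/tmp/example_fifo";	/* hypothetical path */
	struct stat st;
	char c;
	int rfd, wfd;

	if (mkfifo(path, 0600) == -1)
		return 1;

	/* Open both ends non-blocking so the example is self-contained. */
	rfd = open(path, O_RDONLY | O_NONBLOCK);
	wfd = open(path, O_WRONLY | O_NONBLOCK);

	(void)write(wfd, "x", 1);	/* tmpfs_fifo_write(): mtime updated */
	(void)read(rfd, &c, 1);		/* tmpfs_fifo_read(): atime updated */

	if (stat(path, &st) == 0)
		printf("atime %lld mtime %lld\n",
		    (long long)st.st_atime, (long long)st.st_mtime);

	close(rfd);
	close(wfd);
	(void)unlink(path);
	return 0;
}
#endif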
/* $NetBSD: secmodel_suser.c,v 1.58 2024/03/01 22:01:03 andvar Exp $ */ /*- * Copyright (c) 2006 Elad Efrat <elad@NetBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file contains kauth(9) listeners needed to implement the traditional * NetBSD superuser access restrictions. * * There are two main resources a request can be issued to: user-owned and * system owned. For the first, traditional Unix access checks are done, as * well as superuser checks. If needed, the request context is examined before * a decision is made. For the latter, usually only superuser checks are done * as normal users are not allowed to access system resources.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: secmodel_suser.c,v 1.58 2024/03/01 22:01:03 andvar Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/kauth.h> #include <sys/mutex.h> #include <sys/mount.h> #include <sys/socketvar.h> #include <sys/sysctl.h> #include <sys/vnode.h> #include <sys/proc.h> #include <sys/module.h> #include <secmodel/secmodel.h> #include <secmodel/suser/suser.h> MODULE(MODULE_CLASS_SECMODEL, suser, NULL); static kauth_listener_t l_generic, l_system, l_process, l_network, l_machdep, l_device, l_vnode; static secmodel_t suser_sm; SYSCTL_SETUP(sysctl_security_suser_setup, "secmodel_user sysctl") { const struct sysctlnode *rnode; sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "models", NULL, NULL, 0, NULL, 0, CTL_SECURITY, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "suser", NULL, NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "name", NULL, NULL, 0, __UNCONST(SECMODEL_SUSER_NAME), 0, CTL_CREATE, CTL_EOL); } void secmodel_suser_init(void) { } void secmodel_suser_start(void) { l_generic = kauth_listen_scope(KAUTH_SCOPE_GENERIC, secmodel_suser_generic_cb, NULL); l_system = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, secmodel_suser_system_cb, NULL); l_process = kauth_listen_scope(KAUTH_SCOPE_PROCESS, secmodel_suser_process_cb, NULL); l_network = kauth_listen_scope(KAUTH_SCOPE_NETWORK, secmodel_suser_network_cb, NULL); l_machdep = kauth_listen_scope(KAUTH_SCOPE_MACHDEP, secmodel_suser_machdep_cb, NULL); l_device = kauth_listen_scope(KAUTH_SCOPE_DEVICE, secmodel_suser_device_cb, NULL); l_vnode = kauth_listen_scope(KAUTH_SCOPE_VNODE, secmodel_suser_vnode_cb, NULL); } void secmodel_suser_stop(void) { kauth_unlisten_scope(l_generic); kauth_unlisten_scope(l_system); kauth_unlisten_scope(l_process); kauth_unlisten_scope(l_network); kauth_unlisten_scope(l_machdep); kauth_unlisten_scope(l_device); kauth_unlisten_scope(l_vnode); } static bool suser_isroot(kauth_cred_t cred) { return kauth_cred_geteuid(cred) == 0; } static int suser_eval(const char *what, void *arg, void *ret) { int error = 0; if (strcasecmp(what, "is-root") == 0) { kauth_cred_t cred = arg; bool *bp = ret; *bp = suser_isroot(cred); } else { error = ENOENT; } return error; } static int suser_modcmd(modcmd_t cmd, void *arg) { int error = 0; switch (cmd) { case MODULE_CMD_INIT: error = secmodel_register(&suser_sm, SECMODEL_SUSER_ID, SECMODEL_SUSER_NAME, NULL, suser_eval, NULL); if (error != 0) printf("suser_modcmd::init: secmodel_register " "returned %d\n", error); secmodel_suser_init(); secmodel_suser_start(); break; case MODULE_CMD_FINI: secmodel_suser_stop(); error = secmodel_deregister(suser_sm); if (error != 0) printf("suser_modcmd::fini: secmodel_deregister " "returned %d\n", error); break; case MODULE_CMD_AUTOUNLOAD: error = EPERM; break; default: error = ENOTTY; break; } return (error); } /* * kauth(9) listener * * Security model: Traditional NetBSD * Scope: Generic * Responsibility: Superuser access */ int secmodel_suser_generic_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { bool isroot; int result; isroot = suser_isroot(cred); result = KAUTH_RESULT_DEFER; switch (action) { case KAUTH_GENERIC_ISSUSER: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } return (result); } /* * kauth(9) listener * * Security model: Traditional NetBSD * Scope: System * Responsibility: Superuser 
access */ int secmodel_suser_system_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { bool isroot; int result; enum kauth_system_req req; isroot = suser_isroot(cred); result = KAUTH_RESULT_DEFER; req = (enum kauth_system_req)(uintptr_t)arg0; switch (action) { case KAUTH_SYSTEM_CPU: switch (req) { case KAUTH_REQ_SYSTEM_CPU_SETSTATE: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_SYSTEM_DEVMAPPER: if (isroot) result = KAUTH_RESULT_ALLOW; break; case KAUTH_SYSTEM_FS_QUOTA: switch (req) { case KAUTH_REQ_SYSTEM_FS_QUOTA_GET: case KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF: case KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE: case KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_SYSTEM_SYSVIPC: switch (req) { case KAUTH_REQ_SYSTEM_SYSVIPC_BYPASS: case KAUTH_REQ_SYSTEM_SYSVIPC_SHM_LOCK: case KAUTH_REQ_SYSTEM_SYSVIPC_SHM_UNLOCK: case KAUTH_REQ_SYSTEM_SYSVIPC_MSGQ_OVERSIZE: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_SYSTEM_MOUNT: switch (req) { case KAUTH_REQ_SYSTEM_MOUNT_DEVICE: case KAUTH_REQ_SYSTEM_MOUNT_GET: case KAUTH_REQ_SYSTEM_MOUNT_NEW: case KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT: case KAUTH_REQ_SYSTEM_MOUNT_UPDATE: case KAUTH_REQ_SYSTEM_MOUNT_UMAP: if (isroot) { result = KAUTH_RESULT_ALLOW; break; } break; default: break; } break; case KAUTH_SYSTEM_MQUEUE: if (isroot) result = KAUTH_RESULT_ALLOW; break; case KAUTH_SYSTEM_PSET: switch (req) { case KAUTH_REQ_SYSTEM_PSET_ASSIGN: case KAUTH_REQ_SYSTEM_PSET_BIND: case KAUTH_REQ_SYSTEM_PSET_CREATE: case KAUTH_REQ_SYSTEM_PSET_DESTROY: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_SYSTEM_TIME: switch (req) { case KAUTH_REQ_SYSTEM_TIME_ADJTIME: case KAUTH_REQ_SYSTEM_TIME_NTPADJTIME: case KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS: case KAUTH_REQ_SYSTEM_TIME_SYSTEM: case KAUTH_REQ_SYSTEM_TIME_RTCOFFSET: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_SYSTEM_SEMAPHORE: if (isroot) result = KAUTH_RESULT_ALLOW; break; case KAUTH_SYSTEM_SYSCTL: switch (req) { case KAUTH_REQ_SYSTEM_SYSCTL_ADD: case KAUTH_REQ_SYSTEM_SYSCTL_DELETE: case KAUTH_REQ_SYSTEM_SYSCTL_DESC: case KAUTH_REQ_SYSTEM_SYSCTL_MODIFY: case KAUTH_REQ_SYSTEM_SYSCTL_PRVT: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_SYSTEM_SWAPCTL: case KAUTH_SYSTEM_ACCOUNTING: case KAUTH_SYSTEM_REBOOT: case KAUTH_SYSTEM_CHROOT: case KAUTH_SYSTEM_FILEHANDLE: case KAUTH_SYSTEM_MKNOD: case KAUTH_SYSTEM_SETIDCORE: case KAUTH_SYSTEM_MODULE: case KAUTH_SYSTEM_FS_RESERVEDSPACE: case KAUTH_SYSTEM_MAP_VA_ZERO: case KAUTH_SYSTEM_FS_EXTATTR: case KAUTH_SYSTEM_FS_SNAPSHOT: if (isroot) result = KAUTH_RESULT_ALLOW; break; case KAUTH_SYSTEM_DEBUG: break; case KAUTH_SYSTEM_CHSYSFLAGS: /* Deprecated. 
*/ if (isroot) result = KAUTH_RESULT_ALLOW; break; case KAUTH_SYSTEM_VERIEXEC: switch (req) { case KAUTH_REQ_SYSTEM_VERIEXEC_ACCESS: case KAUTH_REQ_SYSTEM_VERIEXEC_MODIFY: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_SYSTEM_LFS: switch (req) { case KAUTH_REQ_SYSTEM_LFS_MARKV: case KAUTH_REQ_SYSTEM_LFS_BMAPV: case KAUTH_REQ_SYSTEM_LFS_SEGCLEAN: case KAUTH_REQ_SYSTEM_LFS_SEGWAIT: case KAUTH_REQ_SYSTEM_LFS_FCNTL: if (isroot) result = KAUTH_RESULT_ALLOW; default: break; } break; case KAUTH_SYSTEM_INTR: switch (req) { case KAUTH_REQ_SYSTEM_INTR_AFFINITY: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_SYSTEM_KERNADDR: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } return (result); } /* * kauth(9) listener * * Security model: Traditional NetBSD * Scope: Process * Responsibility: Superuser access */ int secmodel_suser_process_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { bool isroot; int result; isroot = suser_isroot(cred); result = KAUTH_RESULT_DEFER; switch (action) { case KAUTH_PROCESS_SIGNAL: case KAUTH_PROCESS_KTRACE: case KAUTH_PROCESS_PROCFS: case KAUTH_PROCESS_PTRACE: case KAUTH_PROCESS_SCHEDULER_GETPARAM: case KAUTH_PROCESS_SCHEDULER_SETPARAM: case KAUTH_PROCESS_SCHEDULER_GETAFFINITY: case KAUTH_PROCESS_SCHEDULER_SETAFFINITY: case KAUTH_PROCESS_SETID: case KAUTH_PROCESS_KEVENT_FILTER: case KAUTH_PROCESS_NICE: case KAUTH_PROCESS_FORK: case KAUTH_PROCESS_CORENAME: case KAUTH_PROCESS_STOPFLAG: if (isroot) result = KAUTH_RESULT_ALLOW; break; case KAUTH_PROCESS_CANSEE: { unsigned long req; req = (unsigned long)arg1; switch (req) { case KAUTH_REQ_PROCESS_CANSEE_ARGS: case KAUTH_REQ_PROCESS_CANSEE_ENTRY: case KAUTH_REQ_PROCESS_CANSEE_OPENFILES: case KAUTH_REQ_PROCESS_CANSEE_EPROC: case KAUTH_REQ_PROCESS_CANSEE_KPTR: if (isroot) { result = KAUTH_RESULT_ALLOW; break; } break; case KAUTH_REQ_PROCESS_CANSEE_ENV: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; } case KAUTH_PROCESS_RLIMIT: { enum kauth_process_req req; req = (enum kauth_process_req)(uintptr_t)arg1; switch (req) { case KAUTH_REQ_PROCESS_RLIMIT_SET: case KAUTH_REQ_PROCESS_RLIMIT_GET: case KAUTH_REQ_PROCESS_RLIMIT_BYPASS: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; } default: break; } return (result); } /* * kauth(9) listener * * Security model: Traditional NetBSD * Scope: Network * Responsibility: Superuser access */ int secmodel_suser_network_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { bool isroot; int result; enum kauth_network_req req; isroot = suser_isroot(cred); result = KAUTH_RESULT_DEFER; req = (enum kauth_network_req)(uintptr_t)arg0; switch (action) { case KAUTH_NETWORK_ALTQ: switch (req) { case KAUTH_REQ_NETWORK_ALTQ_AFMAP: case KAUTH_REQ_NETWORK_ALTQ_BLUE: case KAUTH_REQ_NETWORK_ALTQ_CBQ: case KAUTH_REQ_NETWORK_ALTQ_CDNR: case KAUTH_REQ_NETWORK_ALTQ_CONF: case KAUTH_REQ_NETWORK_ALTQ_FIFOQ: case KAUTH_REQ_NETWORK_ALTQ_HFSC: case KAUTH_REQ_NETWORK_ALTQ_JOBS: case KAUTH_REQ_NETWORK_ALTQ_PRIQ: case KAUTH_REQ_NETWORK_ALTQ_RED: case KAUTH_REQ_NETWORK_ALTQ_RIO: case KAUTH_REQ_NETWORK_ALTQ_WFQ: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_BIND: switch (req) { case KAUTH_REQ_NETWORK_BIND_PORT: case KAUTH_REQ_NETWORK_BIND_PRIVPORT: case KAUTH_REQ_NETWORK_BIND_ANYADDR: if (isroot) result = KAUTH_RESULT_ALLOW; break; 
default: break; } break; case KAUTH_NETWORK_FIREWALL: switch (req) { case KAUTH_REQ_NETWORK_FIREWALL_FW: case KAUTH_REQ_NETWORK_FIREWALL_NAT: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_FORWSRCRT: case KAUTH_NETWORK_ROUTE: if (isroot) result = KAUTH_RESULT_ALLOW; break; case KAUTH_NETWORK_INTERFACE: switch (req) { case KAUTH_REQ_NETWORK_INTERFACE_GET: case KAUTH_REQ_NETWORK_INTERFACE_SET: case KAUTH_REQ_NETWORK_INTERFACE_GETPRIV: case KAUTH_REQ_NETWORK_INTERFACE_SETPRIV: case KAUTH_REQ_NETWORK_INTERFACE_FIRMWARE: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_INTERFACE_BRIDGE: switch (req) { case KAUTH_REQ_NETWORK_INTERFACE_BRIDGE_GETPRIV: case KAUTH_REQ_NETWORK_INTERFACE_BRIDGE_SETPRIV: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_INTERFACE_PPP: switch (req) { case KAUTH_REQ_NETWORK_INTERFACE_PPP_ADD: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_INTERFACE_PVC: switch (req) { case KAUTH_REQ_NETWORK_INTERFACE_PVC_ADD: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_INTERFACE_SLIP: switch (req) { case KAUTH_REQ_NETWORK_INTERFACE_SLIP_ADD: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_INTERFACE_TUN: switch (req) { case KAUTH_REQ_NETWORK_INTERFACE_TUN_ADD: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_IPV6: switch (req) { case KAUTH_REQ_NETWORK_IPV6_HOPBYHOP: case KAUTH_REQ_NETWORK_IPV6_JOIN_MULTICAST: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_NFS: switch (req) { case KAUTH_REQ_NETWORK_NFS_EXPORT: case KAUTH_REQ_NETWORK_NFS_SVC: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_SMB: switch (req) { case KAUTH_REQ_NETWORK_SMB_SHARE_ACCESS: case KAUTH_REQ_NETWORK_SMB_SHARE_CREATE: case KAUTH_REQ_NETWORK_SMB_VC_ACCESS: case KAUTH_REQ_NETWORK_SMB_VC_CREATE: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_INTERFACE_WG: switch (req) { case KAUTH_REQ_NETWORK_INTERFACE_WG_GETPRIV: case KAUTH_REQ_NETWORK_INTERFACE_WG_SETPRIV: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; case KAUTH_NETWORK_SOCKET: switch (req) { case KAUTH_REQ_NETWORK_SOCKET_DROP: case KAUTH_REQ_NETWORK_SOCKET_OPEN: case KAUTH_REQ_NETWORK_SOCKET_RAWSOCK: case KAUTH_REQ_NETWORK_SOCKET_SETPRIV: if (isroot) result = KAUTH_RESULT_ALLOW; break; case KAUTH_REQ_NETWORK_SOCKET_CANSEE: if (isroot) { result = KAUTH_RESULT_ALLOW; break; } break; default: break; } break; case KAUTH_NETWORK_IPSEC: switch (req) { case KAUTH_REQ_NETWORK_IPSEC_BYPASS: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; default: break; } return (result); } /* * kauth(9) listener * * Security model: Traditional NetBSD * Scope: Machdep * Responsibility: Superuser access */ int secmodel_suser_machdep_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { bool isroot; int result; isroot = suser_isroot(cred); result = KAUTH_RESULT_DEFER; switch (action) { case KAUTH_MACHDEP_CPU_UCODE_APPLY: case KAUTH_MACHDEP_IOPERM_GET: case KAUTH_MACHDEP_LDT_GET: case KAUTH_MACHDEP_LDT_SET: case KAUTH_MACHDEP_MTRR_GET: case KAUTH_MACHDEP_CACHEFLUSH: case KAUTH_MACHDEP_IOPERM_SET: case KAUTH_MACHDEP_IOPL: case KAUTH_MACHDEP_MTRR_SET: case 
KAUTH_MACHDEP_NVRAM: case KAUTH_MACHDEP_UNMANAGEDMEM: case KAUTH_MACHDEP_PXG: if (isroot) result = KAUTH_RESULT_ALLOW; break; case KAUTH_MACHDEP_SVS_DISABLE: /* Deprecated. */ if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } return (result); } /* * kauth(9) listener * * Security model: Traditional NetBSD * Scope: Device * Responsibility: Superuser access */ int secmodel_suser_device_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { bool isroot; int result; isroot = suser_isroot(cred); result = KAUTH_RESULT_DEFER; switch (action) { case KAUTH_DEVICE_BLUETOOTH_SETPRIV: case KAUTH_DEVICE_BLUETOOTH_SEND: case KAUTH_DEVICE_BLUETOOTH_RECV: case KAUTH_DEVICE_TTY_OPEN: case KAUTH_DEVICE_TTY_PRIVSET: case KAUTH_DEVICE_TTY_STI: case KAUTH_DEVICE_TTY_VIRTUAL: case KAUTH_DEVICE_RND_ADDDATA: case KAUTH_DEVICE_RND_ADDDATA_ESTIMATE: case KAUTH_DEVICE_RND_GETPRIV: case KAUTH_DEVICE_RND_SETPRIV: case KAUTH_DEVICE_WSCONS_KEYBOARD_BELL: case KAUTH_DEVICE_WSCONS_KEYBOARD_KEYREPEAT: case KAUTH_DEVICE_NVMM_CTL: if (isroot) result = KAUTH_RESULT_ALLOW; break; case KAUTH_DEVICE_BLUETOOTH_BCSP: case KAUTH_DEVICE_BLUETOOTH_BTUART: { enum kauth_device_req req; req = (enum kauth_device_req)(uintptr_t)arg0; switch (req) { case KAUTH_REQ_DEVICE_BLUETOOTH_BCSP_ADD: case KAUTH_REQ_DEVICE_BLUETOOTH_BTUART_ADD: if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } break; } case KAUTH_DEVICE_GPIO_PINSET: /* * root can access gpio pins, secmodel_securelevel can veto * this decision. */ if (isroot) result = KAUTH_RESULT_ALLOW; break; default: break; } return (result); } int secmodel_suser_vnode_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { bool isroot; int result; isroot = suser_isroot(cred); result = KAUTH_RESULT_DEFER; if (isroot) { /* Superuser can execute only if the file's executable. */ if ((action & KAUTH_VNODE_EXECUTE) == 0 || (action & KAUTH_VNODE_IS_EXEC)) result = KAUTH_RESULT_ALLOW; } return (result); }
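/*
 * [Editorial sketch, not part of the original secmodel_suser source.]
 * The callbacks above only take effect once they are registered with the
 * kauth(9) scopes they serve; in the real secmodel this happens in its
 * start routine.  The fragment below shows the general registration
 * pattern using kauth_listen_scope(9).  The function and variable names
 * (suser_listeners_attach, l_system, ...) are hypothetical.
 */
#include <sys/kauth.h>

static kauth_listener_t l_system, l_process, l_network;
static kauth_listener_t l_machdep, l_device, l_vnode;

static void
suser_listeners_attach(void)
{
	/* One listener per scope; the cookie argument is unused here. */
	l_system = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
	    secmodel_suser_system_cb, NULL);
	l_process = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    secmodel_suser_process_cb, NULL);
	l_network = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
	    secmodel_suser_network_cb, NULL);
	l_machdep = kauth_listen_scope(KAUTH_SCOPE_MACHDEP,
	    secmodel_suser_machdep_cb, NULL);
	l_device = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
	    secmodel_suser_device_cb, NULL);
	l_vnode = kauth_listen_scope(KAUTH_SCOPE_VNODE,
	    secmodel_suser_vnode_cb, NULL);
}

static void
suser_listeners_detach(void)
{
	/* Unregister the listeners; the order is not significant. */
	kauth_unlisten_scope(l_vnode);
	kauth_unlisten_scope(l_device);
	kauth_unlisten_scope(l_machdep);
	kauth_unlisten_scope(l_network);
	kauth_unlisten_scope(l_process);
	kauth_unlisten_scope(l_system);
}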
/* $NetBSD: tmpfs_vnops.c,v 1.150 2022/06/01 08:42:38 hannken Exp $ */ /* * Copyright (c) 2005, 2006, 2007, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * tmpfs vnode interface. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tmpfs_vnops.c,v 1.150 2022/06/01 08:42:38 hannken Exp $"); #include <sys/param.h> #include <sys/dirent.h> #include <sys/fcntl.h> #include <sys/event.h> #include <sys/malloc.h> #include <sys/namei.h> #include <sys/stat.h> #include <sys/uio.h> #include <sys/unistd.h> #include <sys/vnode.h> #include <sys/lockf.h> #include <sys/kauth.h> #include <sys/atomic.h> #include <uvm/uvm_object.h> #include <miscfs/fifofs/fifo.h> #include <miscfs/genfs/genfs.h> #include <fs/tmpfs/tmpfs_vnops.h> #include <fs/tmpfs/tmpfs.h> /* * vnode operations vector used for files stored in a tmpfs file system. */ int (**tmpfs_vnodeop_p)(void *); const struct vnodeopv_entry_desc tmpfs_vnodeop_entries[] = { { &vop_default_desc, vn_default_error }, { &vop_parsepath_desc, genfs_parsepath }, { &vop_lookup_desc, tmpfs_lookup }, { &vop_create_desc, tmpfs_create }, { &vop_mknod_desc, tmpfs_mknod }, { &vop_open_desc, tmpfs_open }, { &vop_close_desc, tmpfs_close }, { &vop_access_desc, tmpfs_access }, { &vop_accessx_desc, genfs_accessx }, { &vop_getattr_desc, tmpfs_getattr }, { &vop_setattr_desc, tmpfs_setattr }, { &vop_read_desc, tmpfs_read }, { &vop_write_desc, tmpfs_write }, { &vop_fallocate_desc, genfs_eopnotsupp }, { &vop_fdiscard_desc, genfs_eopnotsupp }, { &vop_ioctl_desc, genfs_enoioctl }, { &vop_fcntl_desc, genfs_fcntl }, { &vop_poll_desc, genfs_poll }, { &vop_kqfilter_desc, genfs_kqfilter }, { &vop_revoke_desc, genfs_revoke }, { &vop_mmap_desc, genfs_mmap }, { &vop_fsync_desc, tmpfs_fsync }, { &vop_seek_desc, genfs_seek }, { &vop_remove_desc, tmpfs_remove }, { &vop_link_desc, tmpfs_link }, { &vop_rename_desc, tmpfs_rename }, { &vop_mkdir_desc, tmpfs_mkdir }, { &vop_rmdir_desc, tmpfs_rmdir }, { &vop_symlink_desc, tmpfs_symlink }, { &vop_readdir_desc, tmpfs_readdir }, { &vop_readlink_desc, tmpfs_readlink }, { &vop_abortop_desc, genfs_abortop }, { &vop_inactive_desc, tmpfs_inactive }, { &vop_reclaim_desc, tmpfs_reclaim }, { &vop_lock_desc, genfs_lock }, { &vop_unlock_desc, genfs_unlock }, { &vop_bmap_desc, genfs_eopnotsupp }, { &vop_strategy_desc, genfs_eopnotsupp }, { &vop_print_desc, tmpfs_print }, { &vop_pathconf_desc, tmpfs_pathconf }, { &vop_islocked_desc, genfs_islocked }, { &vop_advlock_desc, tmpfs_advlock }, { &vop_bwrite_desc, genfs_nullop }, { &vop_getpages_desc, tmpfs_getpages }, { &vop_putpages_desc, tmpfs_putpages }, { &vop_whiteout_desc, tmpfs_whiteout }, { NULL, NULL } }; const struct vnodeopv_desc tmpfs_vnodeop_opv_desc = { &tmpfs_vnodeop_p, tmpfs_vnodeop_entries }; /* * tmpfs_lookup: path name traversal routine. * * Arguments: dvp (directory being searched), vpp (result), * cnp (component name - path). * * => Caller holds a reference and lock on dvp. 
* => We return looked-up vnode (vpp) locked, with a reference held. */ int tmpfs_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap = v; vnode_t *dvp = ap->a_dvp, **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; const bool lastcn = (cnp->cn_flags & ISLASTCN) != 0; tmpfs_node_t *dnode, *tnode; tmpfs_dirent_t *de; int cachefound, iswhiteout; int error; KASSERT(VOP_ISLOCKED(dvp)); dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULL; /* Check accessibility of directory. */ error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred); if (error) { goto out; } /* * If requesting the last path component on a read-only file system * with a write operation, deny it. */ if (lastcn && (dvp->v_mount->mnt_flag & MNT_RDONLY) != 0 && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto out; } /* * Avoid doing a linear scan of the directory if the requested * directory/name couple is already in the cache. */ cachefound = cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_nameiop, cnp->cn_flags, &iswhiteout, vpp); if (iswhiteout) { cnp->cn_flags |= ISWHITEOUT; } if (cachefound && *vpp == NULLVP) { /* Negative cache hit. */ error = ENOENT; goto out; } else if (cachefound) { error = 0; goto out; } /* * Treat an unlinked directory as empty (no "." or "..") */ if (dnode->tn_links == 0) { KASSERT(dnode->tn_size == 0); error = ENOENT; goto out; } if (cnp->cn_flags & ISDOTDOT) { tmpfs_node_t *pnode; /* * Lookup of ".." case. */ if (lastcn && cnp->cn_nameiop == RENAME) { error = EINVAL; goto out; } KASSERT(dnode->tn_type == VDIR); pnode = dnode->tn_spec.tn_dir.tn_parent; if (pnode == NULL) { error = ENOENT; goto done; } error = vcache_get(dvp->v_mount, &pnode, sizeof(pnode), vpp); goto done; } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { /* * Lookup of "." case. */ if (lastcn && cnp->cn_nameiop == RENAME) { error = EISDIR; goto out; } vref(dvp); *vpp = dvp; error = 0; goto done; } /* * Other lookup cases: perform directory scan. */ de = tmpfs_dir_lookup(dnode, cnp); if (de == NULL || de->td_node == TMPFS_NODE_WHITEOUT) { /* * The entry was not found in the directory. This is valid * if we are creating or renaming an entry and are working * on the last component of the path name. */ if (lastcn && (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred); if (error) { goto out; } error = EJUSTRETURN; } else { error = ENOENT; } if (de) { KASSERT(de->td_node == TMPFS_NODE_WHITEOUT); cnp->cn_flags |= ISWHITEOUT; } goto done; } tnode = de->td_node; /* * If it is not the last path component and found a non-directory * or non-link entry (which may itself be pointing to a directory), * raise an error. */ if (!lastcn && tnode->tn_type != VDIR && tnode->tn_type != VLNK) { error = ENOTDIR; goto out; } /* Check the permissions. */ if (lastcn && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred); if (error) goto out; if ((dnode->tn_mode & S_ISTXT) != 0) { error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_DELETE, tnode->tn_vnode, dnode->tn_vnode, genfs_can_sticky(dvp, cnp->cn_cred, dnode->tn_uid, tnode->tn_uid)); if (error) { error = EPERM; goto out; } } } /* Get a vnode for the matching entry. */ error = vcache_get(dvp->v_mount, &tnode, sizeof(tnode), vpp); done: /* * Cache the result, unless request was for creation (as it does * not improve the performance). 
*/ if (cnp->cn_nameiop != CREATE) { cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags); } out: KASSERT(VOP_ISLOCKED(dvp)); return error; } int tmpfs_create(void *v) { struct vop_create_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; vnode_t *dvp = ap->a_dvp, **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; KASSERT(VOP_ISLOCKED(dvp)); KASSERT(vap->va_type == VREG || vap->va_type == VSOCK); return tmpfs_construct_node(dvp, vpp, vap, cnp, NULL); } int tmpfs_mknod(void *v) { struct vop_mknod_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; vnode_t *dvp = ap->a_dvp, **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; enum vtype vt = vap->va_type; if (vt != VBLK && vt != VCHR && vt != VFIFO) { *vpp = NULL; return EINVAL; } return tmpfs_construct_node(dvp, vpp, vap, cnp, NULL); } int tmpfs_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp = ap->a_vp; mode_t mode = ap->a_mode; tmpfs_node_t *node; KASSERT(VOP_ISLOCKED(vp)); node = VP_TO_TMPFS_NODE(vp); /* If the file is marked append-only, deny write requests. */ if ((node->tn_flags & APPEND) != 0 && (mode & (FWRITE | O_APPEND)) == FWRITE) { return EPERM; } return 0; } int tmpfs_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp __diagused = ap->a_vp; KASSERT(VOP_ISLOCKED(vp)); return 0; } int tmpfs_access(void *v) { struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp = ap->a_vp; accmode_t accmode = ap->a_accmode; kauth_cred_t cred = ap->a_cred; tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); const bool writing = (accmode & VWRITE) != 0; KASSERT(VOP_ISLOCKED(vp)); /* Possible? */ switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (writing && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0) { return EROFS; } break; case VBLK: case VCHR: case VSOCK: case VFIFO: break; default: return EINVAL; } if (writing && (node->tn_flags & IMMUTABLE) != 0) { return EPERM; } return kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(accmode, vp->v_type, node->tn_mode), vp, NULL, genfs_can_access(vp, cred, node->tn_uid, node->tn_gid, node->tn_mode, NULL, accmode)); } int tmpfs_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp = ap->a_vp; struct vattr *vap = ap->a_vap; tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); vattr_null(vap); vap->va_type = vp->v_type; vap->va_mode = node->tn_mode; vap->va_nlink = node->tn_links; vap->va_uid = node->tn_uid; vap->va_gid = node->tn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0]; vap->va_fileid = node->tn_id; vap->va_size = node->tn_size; vap->va_blocksize = PAGE_SIZE; vap->va_gen = TMPFS_NODE_GEN(node); vap->va_flags = node->tn_flags; vap->va_rdev = (vp->v_type == VBLK || vp->v_type == VCHR) ? 
node->tn_spec.tn_dev.tn_rdev : VNOVAL; vap->va_bytes = round_page(node->tn_size); vap->va_filerev = VNOVAL; vap->va_vaflags = 0; vap->va_spare = VNOVAL; /* XXX */ mutex_enter(&node->tn_timelock); tmpfs_update_locked(vp, 0); vap->va_atime = node->tn_atime; vap->va_mtime = node->tn_mtime; vap->va_ctime = node->tn_ctime; vap->va_birthtime = node->tn_birthtime; mutex_exit(&node->tn_timelock); return 0; } int tmpfs_setattr(void *v) { struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp = ap->a_vp; struct vattr *vap = ap->a_vap; kauth_cred_t cred = ap->a_cred; lwp_t *l = curlwp; int error = 0; KASSERT(VOP_ISLOCKED(vp)); /* Abort if any unsettable attribute is given. */ if (vap->va_type != VNON || vap->va_nlink != VNOVAL || vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL || vap->va_blocksize != VNOVAL || vap->va_ctime.tv_sec != VNOVAL || vap->va_gen != VNOVAL || vap->va_rdev != VNOVAL || vap->va_bytes != VNOVAL) { return EINVAL; } if (error == 0 && vap->va_flags != VNOVAL) error = tmpfs_chflags(vp, vap->va_flags, cred, l); if (error == 0 && vap->va_size != VNOVAL) error = tmpfs_chsize(vp, vap->va_size, cred, l); if (error == 0 && (vap->va_uid != VNOVAL || vap->va_gid != VNOVAL)) error = tmpfs_chown(vp, vap->va_uid, vap->va_gid, cred, l); if (error == 0 && vap->va_mode != VNOVAL) error = tmpfs_chmod(vp, vap->va_mode, cred, l); const bool chsometime = vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL; if (error == 0 && chsometime) { error = tmpfs_chtimes(vp, &vap->va_atime, &vap->va_mtime, &vap->va_birthtime, vap->va_vaflags, cred, l); } return error; } int tmpfs_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp = ap->a_vp; struct uio *uio = ap->a_uio; const int ioflag = ap->a_ioflag; tmpfs_node_t *node; struct uvm_object *uobj; int error; KASSERT(VOP_ISLOCKED(vp)); if (vp->v_type == VDIR) { return EISDIR; } if (uio->uio_offset < 0 || vp->v_type != VREG) { return EINVAL; } /* Note: reading zero bytes should not update atime. 
*/ if (uio->uio_resid == 0) { return 0; } node = VP_TO_TMPFS_NODE(vp); uobj = node->tn_spec.tn_reg.tn_aobj; error = 0; while (error == 0 && uio->uio_resid > 0) { vsize_t len; if (node->tn_size <= uio->uio_offset) { break; } len = MIN(node->tn_size - uio->uio_offset, uio->uio_resid); if (len == 0) { break; } error = ubc_uiomove(uobj, uio, len, IO_ADV_DECODE(ioflag), UBC_READ | UBC_PARTIALOK | UBC_VNODE_FLAGS(vp)); } if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) tmpfs_update(vp, TMPFS_UPDATE_ATIME); return error; } int tmpfs_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp = ap->a_vp; struct uio *uio = ap->a_uio; const int ioflag = ap->a_ioflag; tmpfs_node_t *node; struct uvm_object *uobj; off_t oldsize; int error, ubc_flags; KASSERT(VOP_ISLOCKED(vp)); node = VP_TO_TMPFS_NODE(vp); oldsize = node->tn_size; if ((vp->v_mount->mnt_flag & MNT_RDONLY) != 0) { error = EROFS; goto out; } if (uio->uio_offset < 0 || vp->v_type != VREG) { error = EINVAL; goto out; } if (uio->uio_resid == 0) { error = 0; goto out; } if (ioflag & IO_APPEND) { uio->uio_offset = node->tn_size; } if (uio->uio_offset + uio->uio_resid > node->tn_size) { error = tmpfs_reg_resize(vp, uio->uio_offset + uio->uio_resid); if (error) goto out; } /* * If we're extending the file and have data to write that would * not leave an un-zeroed hole, we can avoid fault processing and * zeroing of pages on allocation. * * Don't do this if the file is mapped and we need to touch an * existing page, because writing a mapping of the file into itself * could cause a deadlock on PG_BUSY. * * New pages will not become visible until finished here (because * of PG_BUSY and the vnode lock). */ ubc_flags = UBC_WRITE | UBC_VNODE_FLAGS(vp); #if 0 /* * XXX disable use of UBC_FAULTBUSY for now, this check is insufficient * because it does not zero uninitialized parts of pages in all of * the cases where zeroing is needed. */ if (uio->uio_offset >= oldsize && ((uio->uio_offset & (PAGE_SIZE - 1)) == 0 || ((vp->v_vflag & VV_MAPPED) == 0 && trunc_page(uio->uio_offset) == trunc_page(oldsize)))) { ubc_flags |= UBC_FAULTBUSY; } #endif uobj = node->tn_spec.tn_reg.tn_aobj; error = 0; while (error == 0 && uio->uio_resid > 0) { vsize_t len; len = MIN(node->tn_size - uio->uio_offset, uio->uio_resid); if (len == 0) { break; } error = ubc_uiomove(uobj, uio, len, IO_ADV_DECODE(ioflag), ubc_flags); } if (error) { (void)tmpfs_reg_resize(vp, oldsize); } tmpfs_update(vp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME); out: if (error) { KASSERT(oldsize == node->tn_size); } else { KASSERT(uio->uio_resid == 0); } return error; } int tmpfs_fsync(void *v) { struct vop_fsync_args /* { struct vnode *a_vp; kauth_cred_t a_cred; int a_flags; off_t a_offlo; off_t a_offhi; struct lwp *a_l; } */ *ap = v; vnode_t *vp __diagused = ap->a_vp; /* Nothing to do. Should be up to date. */ KASSERT(VOP_ISLOCKED(vp)); return 0; } /* * tmpfs_remove: unlink a file. * * => Both directory (dvp) and file (vp) are locked. * => We unlock and drop the reference on both. 
*/ int tmpfs_remove(void *v) { struct vop_remove_v3_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; nlink_t ctx_vp_new_nlink; } */ *ap = v; vnode_t *dvp = ap->a_dvp, *vp = ap->a_vp; tmpfs_node_t *dnode, *node; tmpfs_dirent_t *de; int error, tflags; KASSERT(VOP_ISLOCKED(dvp)); KASSERT(VOP_ISLOCKED(vp)); if (vp->v_type == VDIR) { error = EPERM; goto out; } dnode = VP_TO_TMPFS_DIR(dvp); node = VP_TO_TMPFS_NODE(vp); /* * Files marked as immutable or append-only cannot be deleted. * Likewise, files residing on directories marked as append-only * cannot be deleted. */ if (node->tn_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } if (dnode->tn_flags & APPEND) { error = EPERM; goto out; } /* Lookup the directory entry (check the cached hint first). */ de = tmpfs_dir_cached(node); if (de == NULL) { struct componentname *cnp = ap->a_cnp; de = tmpfs_dir_lookup(dnode, cnp); } KASSERT(de && de->td_node == node); /* * Remove the entry from the directory (drops the link count) and * destroy it or replace with a whiteout. * * Note: the inode referred by it will not be destroyed until the * vnode is reclaimed/recycled. */ tmpfs_dir_detach(dnode, de); if (ap->a_cnp->cn_flags & DOWHITEOUT) tmpfs_dir_attach(dnode, de, TMPFS_NODE_WHITEOUT); else tmpfs_free_dirent(VFS_TO_TMPFS(vp->v_mount), de); tflags = TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME; if (node->tn_links > 0) { /* We removed a hard link. */ tflags |= TMPFS_UPDATE_CTIME; } ap->ctx_vp_new_nlink = node->tn_links; tmpfs_update(dvp, tflags); error = 0; out: /* Drop the reference and unlock the node. */ if (dvp == vp) { vrele(vp); } else { vput(vp); } return error; } /* * tmpfs_link: create a hard link. */ int tmpfs_link(void *v) { struct vop_link_v2_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; vnode_t *dvp = ap->a_dvp; vnode_t *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; tmpfs_node_t *dnode, *node; tmpfs_dirent_t *de; int error; KASSERT(dvp != vp); KASSERT(VOP_ISLOCKED(dvp)); KASSERT(vp->v_type != VDIR); KASSERT(dvp->v_mount == vp->v_mount); dnode = VP_TO_TMPFS_DIR(dvp); node = VP_TO_TMPFS_NODE(vp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* Check for maximum number of links limit. */ if (node->tn_links == LINK_MAX) { error = EMLINK; goto out; } KASSERT(node->tn_links < LINK_MAX); /* We cannot create links of files marked immutable or append-only. */ if (node->tn_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_ADD_LINK, vp, dvp, 0); if (error) goto out; /* Allocate a new directory entry to represent the inode. */ error = tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), cnp->cn_nameptr, cnp->cn_namelen, &de); if (error) { goto out; } /* * Insert the entry into the directory. * It will increase the inode link count. */ tmpfs_dir_attach(dnode, de, node); tmpfs_update(dvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME); /* Update the timestamps. 
*/ tmpfs_update(vp, TMPFS_UPDATE_CTIME); error = 0; out: VOP_UNLOCK(vp); return error; } int tmpfs_mkdir(void *v) { struct vop_mkdir_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; KASSERT(vap->va_type == VDIR); return tmpfs_construct_node(dvp, vpp, vap, cnp, NULL); } int tmpfs_rmdir(void *v) { struct vop_rmdir_v2_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; vnode_t *dvp = ap->a_dvp; vnode_t *vp = ap->a_vp; tmpfs_mount_t *tmp = VFS_TO_TMPFS(dvp->v_mount); tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp); tmpfs_node_t *node = VP_TO_TMPFS_DIR(vp); tmpfs_dirent_t *de; int error = 0; KASSERT(VOP_ISLOCKED(dvp)); KASSERT(VOP_ISLOCKED(vp)); /* * Directories with more than two entries ('.' and '..') cannot be * removed. There may be whiteout entries, which we will destroy. */ if (node->tn_size > 0) { /* * If never had whiteout entries, the directory is certainly * not empty. Otherwise, scan for any non-whiteout entry. */ if ((node->tn_gen & TMPFS_WHITEOUT_BIT) == 0) { error = ENOTEMPTY; goto out; } TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) { if (de->td_node != TMPFS_NODE_WHITEOUT) { error = ENOTEMPTY; goto out; } } KASSERT(error == 0); } KASSERT(node->tn_spec.tn_dir.tn_parent == dnode); /* Lookup the directory entry (check the cached hint first). */ de = tmpfs_dir_cached(node); if (de == NULL) { struct componentname *cnp = ap->a_cnp; de = tmpfs_dir_lookup(dnode, cnp); } KASSERT(de && de->td_node == node); /* Check flags to see if we are allowed to remove the directory. */ if (dnode->tn_flags & APPEND || node->tn_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } /* Decrement the link count for the virtual '.' entry. */ node->tn_links--; /* Detach the directory entry from the directory. */ tmpfs_dir_detach(dnode, de); /* Purge the cache for parent. */ cache_purge(dvp); /* * Destroy the directory entry or replace it with a whiteout. * * Note: the inode referred by it will not be destroyed until the * vnode is reclaimed. */ if (ap->a_cnp->cn_flags & DOWHITEOUT) tmpfs_dir_attach(dnode, de, TMPFS_NODE_WHITEOUT); else tmpfs_free_dirent(tmp, de); /* Destroy the whiteout entries from the node. */ while ((de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir)) != NULL) { KASSERT(de->td_node == TMPFS_NODE_WHITEOUT); tmpfs_dir_detach(node, de); tmpfs_free_dirent(tmp, de); } tmpfs_update(dvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME); KASSERT(node->tn_size == 0); KASSERT(node->tn_links == 0); out: /* Release the node. 
*/ KASSERT(dvp != vp); vput(vp); return error; } int tmpfs_symlink(void *v) { struct vop_symlink_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap = v; vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; char *target = ap->a_target; KASSERT(vap->va_type == VLNK); return tmpfs_construct_node(dvp, vpp, vap, cnp, target); } int tmpfs_readdir(void *v) { struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; int *a_eofflag; off_t **a_cookies; int *ncookies; } */ *ap = v; vnode_t *vp = ap->a_vp; struct uio *uio = ap->a_uio; int *eofflag = ap->a_eofflag; off_t **cookies = ap->a_cookies; int *ncookies = ap->a_ncookies; off_t startoff, cnt; tmpfs_node_t *node; int error; KASSERT(VOP_ISLOCKED(vp)); /* This operation only makes sense on directory nodes. */ if (vp->v_type != VDIR) { return ENOTDIR; } node = VP_TO_TMPFS_DIR(vp); startoff = uio->uio_offset; cnt = 0; /* * Retrieve the directory entries, unless it is being destroyed. */ if (node->tn_links) { error = tmpfs_dir_getdents(node, uio, &cnt); } else { error = 0; } if (eofflag != NULL) { *eofflag = !error && uio->uio_offset == TMPFS_DIRSEQ_EOF; } if (error || cookies == NULL || ncookies == NULL) { return error; } /* Update NFS-related variables, if any. */ tmpfs_dirent_t *de = NULL; off_t i, off = startoff; *cookies = malloc(cnt * sizeof(off_t), M_TEMP, M_WAITOK); *ncookies = cnt; for (i = 0; i < cnt; i++) { KASSERT(off != TMPFS_DIRSEQ_EOF); if (off != TMPFS_DIRSEQ_DOT) { if (off == TMPFS_DIRSEQ_DOTDOT) { de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir); } else if (de != NULL) { de = TAILQ_NEXT(de, td_entries); } else { de = tmpfs_dir_lookupbyseq(node, off); KASSERT(de != NULL); de = TAILQ_NEXT(de, td_entries); } if (de == NULL) { off = TMPFS_DIRSEQ_EOF; } else { off = tmpfs_dir_getseq(node, de); } } else { off = TMPFS_DIRSEQ_DOTDOT; } (*cookies)[i] = off; } KASSERT(uio->uio_offset == off); return error; } int tmpfs_readlink(void *v) { struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; } */ *ap = v; vnode_t *vp = ap->a_vp; struct uio *uio = ap->a_uio; tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); int error; KASSERT(VOP_ISLOCKED(vp)); KASSERT(uio->uio_offset == 0); KASSERT(vp->v_type == VLNK); /* Note: readlink(2) returns the path without NUL terminator. */ if (node->tn_size > 0) { error = uiomove(node->tn_spec.tn_lnk.tn_link, MIN(node->tn_size, uio->uio_resid), uio); } else { error = 0; } tmpfs_update(vp, TMPFS_UPDATE_ATIME); return error; } int tmpfs_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; bool *a_recycle; } */ *ap = v; vnode_t *vp = ap->a_vp; tmpfs_node_t *node; int error = 0; KASSERT(VOP_ISLOCKED(vp)); node = VP_TO_TMPFS_NODE(vp); if (node->tn_links == 0) { /* * Mark node as dead by setting its generation to zero. */ atomic_and_32(&node->tn_gen, ~TMPFS_NODE_GEN_MASK); /* * If the file has been deleted, truncate it, otherwise VFS * will quite rightly try to write back dirty data, which in * the case of tmpfs/UAO means needless page deactivations. 
*/ if (vp->v_type == VREG) { error = tmpfs_reg_resize(vp, 0); } *ap->a_recycle = true; } else { tmpfs_update(vp, 0); *ap->a_recycle = false; } return error; } int tmpfs_reclaim(void *v) { struct vop_reclaim_v2_args /* { struct vnode *a_vp; } */ *ap = v; vnode_t *vp = ap->a_vp; tmpfs_mount_t *tmp = VFS_TO_TMPFS(vp->v_mount); tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); /* Unlock vnode. We still have exclusive access to it. */ VOP_UNLOCK(vp); /* Disassociate inode from vnode. */ node->tn_vnode = NULL; vp->v_data = NULL; /* If inode is not referenced, i.e. no links, then destroy it. */ if (node->tn_links == 0) tmpfs_free_node(tmp, node); return 0; } int tmpfs_pathconf(void *v) { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; register_t *retval = ap->a_retval; switch (ap->a_name) { case _PC_LINK_MAX: *retval = LINK_MAX; return 0; case _PC_NAME_MAX: *retval = TMPFS_MAXNAMLEN; return 0; case _PC_PATH_MAX: *retval = PATH_MAX; return 0; case _PC_PIPE_BUF: *retval = PIPE_BUF; return 0; case _PC_CHOWN_RESTRICTED: *retval = 1; return 0; case _PC_NO_TRUNC: *retval = 1; return 0; case _PC_SYNC_IO: *retval = 1; return 0; case _PC_FILESIZEBITS: *retval = sizeof(off_t) * CHAR_BIT; return 0; default: return genfs_pathconf(ap); } } int tmpfs_advlock(void *v) { struct vop_advlock_args /* { struct vnode *a_vp; void * a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap = v; vnode_t *vp = ap->a_vp; tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); return lf_advlock(v, &node->tn_lockf, node->tn_size); } int tmpfs_getpages(void *v) { struct vop_getpages_args /* { struct vnode *a_vp; voff_t a_offset; struct vm_page **a_m; int *a_count; int a_centeridx; vm_prot_t a_access_type; int a_advice; int a_flags; } */ * const ap = v; vnode_t *vp = ap->a_vp; const voff_t offset = ap->a_offset; struct vm_page **pgs = ap->a_m; const int centeridx = ap->a_centeridx; const vm_prot_t access_type = ap->a_access_type; const int advice = ap->a_advice; const int flags = ap->a_flags; int error, iflag, npages = *ap->a_count; tmpfs_node_t *node; struct uvm_object *uobj; KASSERT(vp->v_type == VREG); KASSERT(rw_lock_held(vp->v_uobj.vmobjlock)); /* * Currently, PGO_PASTEOF is not supported. */ if (vp->v_size <= offset + (centeridx << PAGE_SHIFT)) { if ((flags & PGO_LOCKED) == 0) rw_exit(vp->v_uobj.vmobjlock); return EINVAL; } if (vp->v_size < offset + (npages << PAGE_SHIFT)) { npages = (round_page(vp->v_size) - offset) >> PAGE_SHIFT; } /* * Check for reclaimed vnode. v_interlock is not held here, but * VI_DEADCHECK is set with vmobjlock held. */ iflag = atomic_load_relaxed(&vp->v_iflag); if (__predict_false((iflag & VI_DEADCHECK) != 0)) { mutex_enter(vp->v_interlock); error = vdead_check(vp, VDEAD_NOWAIT); mutex_exit(vp->v_interlock); if (error) { if ((flags & PGO_LOCKED) == 0) rw_exit(vp->v_uobj.vmobjlock); return error; } } node = VP_TO_TMPFS_NODE(vp); uobj = node->tn_spec.tn_reg.tn_aobj; /* * Update timestamp lazily. The update will be made real when * a synchronous update is next made -- or by tmpfs_getattr, * tmpfs_putpages, and tmpfs_inactive. */ if ((flags & PGO_NOTIMESTAMP) == 0) { u_int tflags = 0; if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) tflags |= TMPFS_UPDATE_ATIME; if ((access_type & VM_PROT_WRITE) != 0) { tflags |= TMPFS_UPDATE_MTIME; if (vp->v_mount->mnt_flag & MNT_RELATIME) tflags |= TMPFS_UPDATE_ATIME; } tmpfs_update_lazily(vp, tflags); } /* Invoke the pager. The vnode vmobjlock is shared with the UAO. 
*/ KASSERT(vp->v_uobj.vmobjlock == uobj->vmobjlock); error = (*uobj->pgops->pgo_get)(uobj, offset, pgs, &npages, centeridx, access_type, advice, flags); #if defined(DEBUG) if (!error && pgs) { KASSERT(pgs[centeridx] != NULL); } #endif return error; } int tmpfs_putpages(void *v) { struct vop_putpages_args /* { struct vnode *a_vp; voff_t a_offlo; voff_t a_offhi; int a_flags; } */ * const ap = v; vnode_t *vp = ap->a_vp; const voff_t offlo = ap->a_offlo; const voff_t offhi = ap->a_offhi; const int flags = ap->a_flags; tmpfs_node_t *node; struct uvm_object *uobj; int error; KASSERT(rw_write_held(vp->v_uobj.vmobjlock)); if (vp->v_type != VREG) { rw_exit(vp->v_uobj.vmobjlock); return 0; } node = VP_TO_TMPFS_NODE(vp); uobj = node->tn_spec.tn_reg.tn_aobj; KASSERT(vp->v_uobj.vmobjlock == uobj->vmobjlock); error = (*uobj->pgops->pgo_put)(uobj, offlo, offhi, flags); /* XXX mtime */ /* Process deferred updates. */ tmpfs_update(vp, 0); return error; } int tmpfs_whiteout(void *v) { struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap = v; vnode_t *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; const int flags = ap->a_flags; tmpfs_mount_t *tmp = VFS_TO_TMPFS(dvp->v_mount); tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp); tmpfs_dirent_t *de; int error; switch (flags) { case LOOKUP: break; case CREATE: error = tmpfs_alloc_dirent(tmp, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error) return error; tmpfs_dir_attach(dnode, de, TMPFS_NODE_WHITEOUT); break; case DELETE: cnp->cn_flags &= ~DOWHITEOUT; /* when in doubt, cargo cult */ de = tmpfs_dir_lookup(dnode, cnp); if (de == NULL) return ENOENT; tmpfs_dir_detach(dnode, de); tmpfs_free_dirent(tmp, de); break; } tmpfs_update(dvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME); return 0; } int tmpfs_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; vnode_t *vp = ap->a_vp; tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); printf("tag VT_TMPFS, tmpfs_node %p, flags 0x%x, links %d\n" "\tmode 0%o, owner %d, group %d, size %" PRIdMAX, node, node->tn_flags, node->tn_links, node->tn_mode, node->tn_uid, node->tn_gid, (uintmax_t)node->tn_size); if (vp->v_type == VFIFO) { VOCALL(fifo_vnodeop_p, VOFFSET(vop_print), v); } printf("\n"); return 0; }
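/*
 * [Editorial sketch, not part of the original tmpfs_vnops.c.]
 * The operations vector defined above is only reachable once it is
 * published through the file system's vfsops (done in tmpfs_vfsops.c).
 * The fragment below sketches the shape of that glue; the names
 * example_tmpfs_opv_descs and example_tmpfs_vfsops are hypothetical,
 * and the struct vfsops initializer is abbreviated to the two fields
 * that matter here.
 */
#include <sys/mount.h>
#include <sys/vnode.h>

extern int (**tmpfs_vnodeop_p)(void *);
extern const struct vnodeopv_desc tmpfs_vnodeop_opv_desc;

/* Every opv_desc the file system provides, NULL-terminated. */
static const struct vnodeopv_desc * const example_tmpfs_opv_descs[] = {
	&tmpfs_vnodeop_opv_desc,
	NULL,
};

/*
 * vfs_attach() walks vfs_opv_descs and fills in tmpfs_vnodeop_p, after
 * which VOP_LOOKUP(), VOP_READ(), etc. on a tmpfs vnode dispatch through
 * that vector to the tmpfs_* handlers defined in this file.
 */
static struct vfsops example_tmpfs_vfsops = {
	.vfs_name	= MOUNT_TMPFS,
	.vfs_opv_descs	= example_tmpfs_opv_descs,
	/* vfs_mount, vfs_unmount, vfs_statvfs, ... omitted for brevity */
};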
/* $NetBSD: cd.c,v 1.354 2022/06/26 21:00:28 andvar Exp $ */ /*- * Copyright (c) 1998, 2001, 2003, 2004, 2005, 2008 The NetBSD Foundation, * Inc. All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * MMC framework implemented and contributed to the NetBSD Foundation by * Reinoud Zandijk. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Originally written by Julian Elischer (julian@tfs.com) * for TRW Financial Systems for use under the MACH(2.5) operating system. * * TRW Financial Systems, in accordance with their agreement with Carnegie * Mellon University, makes this software available to CMU to distribute * or use in any manner that they see fit as long as this message is kept with * the software. For this reason TFS also grants any other persons or * organisations permission to use or modify this software. * * TFS supplies this software to be publicly redistributed * on the understanding that TFS is not responsible for the correct * functioning of this software in any circumstances. * * Ported to run under 386BSD by Julian Elischer (julian@tfs.com) Sept 1992 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: cd.c,v 1.354 2022/06/26 21:00:28 andvar Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/uio.h> #include <sys/malloc.h> #include <sys/errno.h> #include <sys/device.h> #include <sys/disklabel.h> #include <sys/disk.h> #include <sys/cdio.h> #include <sys/dvdio.h> #include <sys/scsiio.h> #include <sys/proc.h> #include <sys/conf.h> #include <sys/vnode.h> #include <sys/rndsource.h> #include <dev/scsipi/scsi_spc.h> #include <dev/scsipi/scsipi_all.h> #include <dev/scsipi/scsipi_cd.h> #include <dev/scsipi/scsipi_disk.h> /* rw_big and start_stop come */ #include <dev/scsipi/scsi_all.h> /* from there */ #include <dev/scsipi/scsi_disk.h> /* rw comes from there */ #include <dev/scsipi/scsipiconf.h> #include <dev/scsipi/scsipi_base.h> #include <dev/scsipi/cdvar.h> #include <prop/proplib.h> #define CDUNIT(z) DISKUNIT(z) #define CDPART(z) DISKPART(z) #define CDMINOR(unit, part) DISKMINOR(unit, part) #define MAKECDDEV(maj, unit, part) MAKEDISKDEV(maj, unit, part) #define MAXTRACK 99 #define CD_BLOCK_OFFSET 150 #define CD_FRAMES 75 #define CD_SECS 60 #define CD_TOC_FORM 0 /* formatted TOC, exposed to userland */ #define CD_TOC_MSINFO 1 /* multi-session info */ #define CD_TOC_RAW 2 /* raw TOC as on disc, unprocessed */ #define CD_TOC_PMA 3 /* PMA, used as intermediate (rare use) */ #define CD_TOC_ATIP 4 /* pressed space of recordable */ #define CD_TOC_CDTEXT 5 /* special CD-TEXT, rarely used */ #define P5LEN 0x32 #define MS5LEN (P5LEN + 8 + 2) struct cd_formatted_toc { struct ioc_toc_header header; struct cd_toc_entry entries[MAXTRACK+1]; /* One extra for the */ /* leadout */ }; struct cdbounce { 
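/* Bookkeeping for a bounced read, used when the label's sector size and the device's sector size disagree: the request is reissued as one or two device-sized reads, `head' bytes are dropped from the front of the first buffer, and `lcount'/`rcount' bytes are copied back to the original buf from the first and second buffer. */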
struct buf *obp; /* original buf */ struct buf *lbp; /* first buffer */ struct buf *rbp; /* second buffer */ int lerr; /* error returned for first buffer */ int rerr; /* error returned for second buffer */ int head; /* bytes skipped at the start */ int lcount; /* bytes copied to first buffer */ int rcount; /* bytes copied to second buffer */ }; static void cdstart(struct scsipi_periph *); static void cdrestart(void *); static void cdminphys(struct buf *); static void cddone(struct scsipi_xfer *, int); static int cd_interpret_sense(struct scsipi_xfer *); static int cd_diskstart(device_t, struct buf *); static void cd_iosize(device_t, int *); static int cd_lastclose(device_t); static int cd_firstopen(device_t, dev_t, int, int); static void cd_label(device_t, struct disklabel *); static u_long cd_size(struct cd_softc *, int); static int cd_play(struct cd_softc *, int, int); static int cd_play_tracks(struct cd_softc *, struct cd_formatted_toc *, int, int, int, int); static int cd_play_msf(struct cd_softc *, int, int, int, int, int, int); static int cd_pause(struct cd_softc *, int); static int cd_reset(struct cd_softc *); static int cd_read_subchannel(struct cd_softc *, int, int, int, struct cd_sub_channel_info *, int, int); static int cd_read_toc(struct cd_softc *, int, int, int, struct cd_formatted_toc *, int, int, int); static int cd_get_parms(struct cd_softc *, int); static int cd_load_toc(struct cd_softc *, int, struct cd_formatted_toc *, int); static int cdreadmsaddr(struct cd_softc *, struct cd_formatted_toc *,int *); static int cdcachesync(struct scsipi_periph *periph, int flags); static int dvd_auth(struct cd_softc *, dvd_authinfo *); static int dvd_read_physical(struct cd_softc *, dvd_struct *); static int dvd_read_copyright(struct cd_softc *, dvd_struct *); static int dvd_read_disckey(struct cd_softc *, dvd_struct *); static int dvd_read_bca(struct cd_softc *, dvd_struct *); static int dvd_read_manufact(struct cd_softc *, dvd_struct *); static int dvd_read_struct(struct cd_softc *, dvd_struct *); static int cd_mode_sense(struct cd_softc *, u_int8_t, void *, size_t, int, int, int *); static int cd_mode_select(struct cd_softc *, u_int8_t, void *, size_t, int, int); static int cd_setchan(struct cd_softc *, int, int, int, int, int); static int cd_getvol(struct cd_softc *, struct ioc_vol *, int); static int cd_setvol(struct cd_softc *, const struct ioc_vol *, int); static int cd_set_pa_immed(struct cd_softc *, int); static int cd_load_unload(struct cd_softc *, struct ioc_load_unload *); static int cd_setblksize(struct cd_softc *); static int cdmatch(device_t, cfdata_t, void *); static void cdattach(device_t, device_t, void *); static int cddetach(device_t, int); static int mmc_getdiscinfo(struct scsipi_periph *, struct mmc_discinfo *); static int mmc_gettrackinfo(struct scsipi_periph *, struct mmc_trackinfo *); static int mmc_do_op(struct scsipi_periph *, struct mmc_op *); static int mmc_setup_writeparams(struct scsipi_periph *, struct mmc_writeparams *); static void cd_set_geometry(struct cd_softc *); CFATTACH_DECL3_NEW(cd, sizeof(struct cd_softc), cdmatch, cdattach, cddetach, NULL, NULL, NULL, DVF_DETACH_SHUTDOWN); extern struct cfdriver cd_cd; static const struct scsipi_inquiry_pattern cd_patterns[] = { {T_CDROM, T_REMOV, "", "", ""}, {T_WORM, T_REMOV, "", "", ""}, #if 0 {T_CDROM, T_REMOV, /* more luns */ "PIONEER ", "CD-ROM DRM-600 ", ""}, #endif {T_DIRECT, T_REMOV, "NEC CD-ROM DRIVE:260", "", ""}, }; static dev_type_open(cdopen); static dev_type_close(cdclose); static 
dev_type_read(cdread); static dev_type_write(cdwrite); static dev_type_ioctl(cdioctl); static dev_type_strategy(cdstrategy); static dev_type_dump(cddump); static dev_type_size(cdsize); const struct bdevsw cd_bdevsw = { .d_open = cdopen, .d_close = cdclose, .d_strategy = cdstrategy, .d_ioctl = cdioctl, .d_dump = cddump, .d_psize = cdsize, .d_discard = nodiscard, .d_flag = D_DISK | D_MPSAFE }; const struct cdevsw cd_cdevsw = { .d_open = cdopen, .d_close = cdclose, .d_read = cdread, .d_write = cdwrite, .d_ioctl = cdioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_DISK | D_MPSAFE }; static const struct dkdriver cddkdriver = { .d_open = cdopen, .d_close = cdclose, .d_strategy = cdstrategy, .d_minphys = cdminphys, .d_diskstart = cd_diskstart, .d_firstopen = cd_firstopen, .d_lastclose = cd_lastclose, .d_label = cd_label, }; static const struct scsipi_periphsw cd_switch = { cd_interpret_sense, /* use our error handler first */ cdstart, /* we have a queue, which is started by this */ NULL, /* we do not have an async handler */ cddone, /* deal with stats at interrupt time */ }; /* * The routine called by the low level scsi routine when it discovers * A device suitable for this driver */ static int cdmatch(device_t parent, cfdata_t match, void *aux) { struct scsipibus_attach_args *sa = aux; int priority; (void)scsipi_inqmatch(&sa->sa_inqbuf, cd_patterns, sizeof(cd_patterns) / sizeof(cd_patterns[0]), sizeof(cd_patterns[0]), &priority); return (priority); } static void cdattach(device_t parent, device_t self, void *aux) { struct cd_softc *cd = device_private(self); struct dk_softc *dksc = &cd->sc_dksc; struct scsipibus_attach_args *sa = aux; struct scsipi_periph *periph = sa->sa_periph; int dtype; SC_DEBUG(periph, SCSIPI_DB2, ("cdattach: ")); switch (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(sa->sa_periph))) { case SCSIPI_BUSTYPE_SCSI: dtype = DKTYPE_SCSI; if (periph->periph_version == 0) cd->flags |= CDF_ANCIENT; break; case SCSIPI_BUSTYPE_ATAPI: dtype = DKTYPE_ATAPI; break; default: dtype = DKTYPE_UNKNOWN; break; } /* * Initialize and attach the disk structure. */ dk_init(dksc, self, dtype); disk_init(&dksc->sc_dkdev, dksc->sc_xname, &cddkdriver); dk_attach(dksc); disk_attach(&dksc->sc_dkdev); bufq_alloc(&dksc->sc_bufq, "disksort", BUFQ_SORT_RAWBLOCK); callout_init(&cd->sc_callout, 0); /* * Store information needed to contact our base driver */ cd->sc_periph = periph; periph->periph_dev = dksc->sc_dev; periph->periph_switch = &cd_switch; /* * Increase our openings to the maximum-per-periph * supported by the adapter. This will either be * clamped down or grown by the adapter if necessary. 
*/ periph->periph_openings = SCSIPI_CHAN_MAX_PERIPH(periph->periph_channel); periph->periph_flags |= PERIPH_GROW_OPENINGS; aprint_naive("\n"); aprint_normal("\n"); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); } static int cddetach(device_t self, int flags) { struct cd_softc *cd = device_private(self); struct dk_softc *dksc = &cd->sc_dksc; struct scsipi_periph *periph = cd->sc_periph; struct scsipi_channel *chan = periph->periph_channel; int bmaj, cmaj, i, mn, rc; if ((rc = disk_begindetach(&dksc->sc_dkdev, cd_lastclose, self, flags)) != 0) return rc; /* locate the major number */ bmaj = bdevsw_lookup_major(&cd_bdevsw); cmaj = cdevsw_lookup_major(&cd_cdevsw); /* Nuke the vnodes for any open instances */ for (i = 0; i < MAXPARTITIONS; i++) { mn = CDMINOR(device_unit(self), i); vdevgone(bmaj, mn, mn, VBLK); vdevgone(cmaj, mn, mn, VCHR); } /* kill any pending restart */ callout_halt(&cd->sc_callout, NULL); dk_drain(dksc); /* Kill off any pending commands. */ mutex_enter(chan_mtx(chan)); scsipi_kill_pending(cd->sc_periph); mutex_exit(chan_mtx(chan)); bufq_free(dksc->sc_bufq); /* Detach from the disk list. */ disk_detach(&dksc->sc_dkdev); disk_destroy(&dksc->sc_dkdev); dk_detach(dksc); callout_destroy(&cd->sc_callout); pmf_device_deregister(self); return (0); } /* * Serialized by caller */ static int cd_firstopen(device_t self, dev_t dev, int flag, int fmt) { struct cd_softc *cd = device_private(self); struct scsipi_periph *periph = cd->sc_periph; struct scsipi_adapter *adapt = periph->periph_channel->chan_adapter; int error, silent; int part; part = CDPART(dev); error = scsipi_adapter_addref(adapt); if (error) return error; if ((part == RAW_PART && fmt == S_IFCHR) || (flag & FSILENT)) silent = XS_CTL_SILENT; else silent = 0; /* make cdclose() silent */ cd->flags |= CDF_EJECTED; /* Check that it is still responding and ok. */ error = scsipi_test_unit_ready(periph, XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_MEDIA_CHANGE | XS_CTL_SILENT); /* * Start the pack spinning if necessary. Always allow the * raw partition to be opened, for raw IOCTLs. Data transfers * will check for SDEV_MEDIA_LOADED. */ if (error == EIO) { error = scsipi_start(periph, SSS_START, silent); if (error == EINVAL) error = EIO; } if (error) { if (part == RAW_PART) goto out; goto bad; } /* Lock the pack in. */ error = scsipi_prevent(periph, SPAMR_PREVENT_DT, XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_MEDIA_CHANGE); SC_DEBUG(periph, SCSIPI_DB1, ("cdopen: scsipi_prevent, error=%d\n", error)); if (error) { if (part == RAW_PART) goto out; goto bad; } if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) { int param_error; /* Load the physical device parameters. */ param_error = cd_get_parms(cd, 0); if (param_error == CDGP_RESULT_OFFLINE) { error = ENXIO; goto bad2; } periph->periph_flags |= PERIPH_MEDIA_LOADED; SC_DEBUG(periph, SCSIPI_DB3, ("Params loaded ")); cd_set_geometry(cd); /* make cdclose() loud again */ cd->flags &= ~CDF_EJECTED; } periph->periph_flags |= PERIPH_OPEN; out: return 0; bad2: scsipi_prevent(periph, SPAMR_ALLOW, XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_MEDIA_CHANGE | XS_CTL_SILENT); bad: scsipi_adapter_delref(adapt); return error; } /* * open the device. Make sure the partition info is a up-to-date as can be. 
*/ static int cdopen(dev_t dev, int flag, int fmt, struct lwp *l) { struct cd_softc *cd; struct dk_softc *dksc; struct scsipi_periph *periph; int unit, part; int error; unit = CDUNIT(dev); cd = device_lookup_private(&cd_cd, unit); if (cd == NULL) return (ENXIO); dksc = &cd->sc_dksc; periph = cd->sc_periph; part = CDPART(dev); SC_DEBUG(periph, SCSIPI_DB1, ("cdopen: dev=0x%"PRIu64" (unit %"PRIu32" (of %d), partition %d)\n", dev, unit, cd_cd.cd_ndevs, CDPART(dev))); /* * If any partition is open, but the disk has been invalidated, * disallow further opens of non-raw partition */ if ((periph->periph_flags & (PERIPH_OPEN | PERIPH_MEDIA_LOADED)) == PERIPH_OPEN) { if (part != RAW_PART || fmt != S_IFCHR) return EIO; } error = dk_open(dksc, dev, flag, fmt, l); SC_DEBUG(periph, SCSIPI_DB3, ("open complete\n")); return error; } /* * Serialized by caller */ static int cd_lastclose(device_t self) { struct cd_softc *cd = device_private(self); struct scsipi_periph *periph = cd->sc_periph; struct scsipi_adapter *adapt = periph->periph_channel->chan_adapter; int silent; if ((cd->flags & CDF_EJECTED) != 0 || (periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) silent = XS_CTL_SILENT; else silent = 0; cdcachesync(periph, silent); scsipi_wait_drain(periph); scsipi_prevent(periph, SPAMR_ALLOW, XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY | XS_CTL_SILENT); periph->periph_flags &= ~PERIPH_OPEN; scsipi_wait_drain(periph); scsipi_adapter_delref(adapt); return 0; } /* * close the device.. only called if we are the LAST * occurrence of an open device */ static int cdclose(dev_t dev, int flag, int fmt, struct lwp *l) { struct cd_softc *cd; struct dk_softc *dksc; int unit; unit = CDUNIT(dev); cd = device_lookup_private(&cd_cd, unit); dksc = &cd->sc_dksc; return dk_close(dksc, dev, flag, fmt, l); } static void cd_bounce_buffer_done(struct buf *bp) { struct cdbounce *bounce = bp->b_private; struct buf *obp = bounce->obp; if (bp == bounce->lbp) { if ((bounce->lerr = bp->b_error) == 0) memcpy(obp->b_data, (char *)bp->b_data + bounce->head, bounce->lcount); bounce->lbp = NULL; } if (bp == bounce->rbp) { if ((bounce->rerr = bp->b_error) == 0) memcpy((char *)obp->b_data + bounce->lcount, bp->b_data, bounce->rcount); bounce->rbp = NULL; } free(bp->b_data, M_DEVBUF); putiobuf(bp); if (bounce->lbp != NULL || bounce->rbp != NULL) return; obp->b_error = bounce->rerr; if (bounce->lerr) obp->b_error = bounce->lerr; obp->b_resid = 0; if (obp->b_error) obp->b_resid = obp->b_bcount; free(bounce, M_DEVBUF); biodone(obp); } static int cd_make_bounce_buffer(struct cd_softc *cd, struct buf *bp, daddr_t blkno, int count, struct buf **nbpp, void *priv) { struct buf *nbp; /* We don't support bouncing writes */ if ((bp->b_flags & B_READ) == 0) return EACCES; /* XXX */ nbp = getiobuf(NULL, false); if (nbp == NULL) return ENOMEM; nbp->b_data = malloc(count, M_DEVBUF, M_NOWAIT); if (nbp->b_data == NULL) { putiobuf(nbp); return ENOMEM; } /* Set up the IOP to the bounce buffer */ nbp->b_error = 0; nbp->b_dev = bp->b_dev; nbp->b_proc = bp->b_proc; nbp->b_bcount = count; nbp->b_bufsize = count; nbp->b_blkno = blkno; nbp->b_flags = bp->b_flags | B_READ; nbp->b_oflags = bp->b_oflags; nbp->b_cflags = bp->b_cflags; nbp->b_iodone = cd_bounce_buffer_done; nbp->b_private = priv; BIO_COPYPRIO(nbp, bp); *nbpp = nbp; return 0; } static int cd_make_bounce(struct cd_softc *cd, struct buf *bp, struct cdbounce **bouncep) { struct dk_softc *dksc = &cd->sc_dksc; unsigned secsize = dksc->sc_dkdev.dk_geom.dg_secsize; struct cdbounce *bounce; int bps, nblks, 
skip, total, count; daddr_t blkno; struct buf *lbp, *rbp; int error; bounce = malloc(sizeof(struct cdbounce), M_DEVBUF, M_NOWAIT|M_ZERO); if (bounce == NULL) return ENOMEM; bps = howmany(secsize, DEV_BSIZE); nblks = howmany(bp->b_bcount, DEV_BSIZE); skip = bp->b_blkno % bps; blkno = bp->b_blkno - skip; total = roundup(nblks + skip, bps) * DEV_BSIZE; count = total; cd_iosize(dksc->sc_dev, &count); bounce->head = skip * DEV_BSIZE; bounce->lcount = imin(count - bounce->head, bp->b_bcount); bounce->rcount = bp->b_bcount - bounce->lcount; error = cd_make_bounce_buffer(cd, bp, blkno, count, &lbp, bounce); if (error) goto bad; blkno += howmany(count, DEV_BSIZE); count = total - count; if (count > 0) { error = cd_make_bounce_buffer(cd, bp, blkno, count, &rbp, bounce); if (error) { free(lbp->b_data, M_DEVBUF); putiobuf(lbp); goto bad; } } else rbp = NULL; bounce->obp = bp; bounce->lbp = lbp; bounce->rbp = rbp; *bouncep = bounce; return 0; bad: free(bounce, M_DEVBUF); return error; } /* * Actually translate the requested transfer into one the physical driver can * understand. The transfer is described by a buf and will include only one * physical transfer. */ static void cdstrategy(struct buf *bp) { struct cd_softc *cd = device_lookup_private(&cd_cd,CDUNIT(bp->b_dev)); struct dk_softc *dksc = &cd->sc_dksc; struct scsipi_periph *periph = cd->sc_periph; int error; SC_DEBUG(cd->sc_periph, SCSIPI_DB2, ("cdstrategy ")); SC_DEBUG(cd->sc_periph, SCSIPI_DB1, ("%d bytes @ blk %" PRId64 "\n", bp->b_bcount, bp->b_blkno)); /* * If the device has been made invalid, error out * maybe the media changed */ if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) { if (periph->periph_flags & PERIPH_OPEN) error = EIO; else error = ENODEV; goto bad; } /* * If label and device don't agree in sector size use a bounce buffer */ if (dksc->sc_dkdev.dk_label->d_secsize != dksc->sc_dkdev.dk_geom.dg_secsize) { struct cdbounce *bounce = NULL; error = cd_make_bounce(cd, bp, &bounce); if (error) goto bad; dk_strategy(dksc, bounce->lbp); if (bounce->rbp != NULL) dk_strategy(dksc, bounce->rbp); return; } dk_strategy(dksc, bp); return; bad: bp->b_error = error; bp->b_resid = bp->b_bcount; biodone(bp); } /* * Issue single I/O command * * Called from dk_start and implicitly from dk_strategy */ static int cd_diskstart(device_t dev, struct buf *bp) { struct cd_softc *cd = device_private(dev); struct scsipi_periph *periph = cd->sc_periph; struct scsipi_channel *chan = periph->periph_channel; struct scsipi_rw_10 cmd_big; struct scsi_rw_6 cmd_small; struct scsipi_generic *cmdp; struct scsipi_xfer *xs; int error, flags, nblks, cmdlen; SC_DEBUG(periph, SCSIPI_DB2, ("cdstart ")); mutex_enter(chan_mtx(chan)); if (periph->periph_active >= periph->periph_openings) { error = EAGAIN; goto out; } /* * there is excess capacity, but a special waits * It'll need the adapter as soon as we clear out of the * way and let it run (user level wait). */ if (periph->periph_flags & PERIPH_WAITING) { periph->periph_flags &= ~PERIPH_WAITING; cv_broadcast(periph_cv_periph(periph)); error = EAGAIN; goto out; } /* * If the device has become invalid, abort all the * reads and writes until all files have been closed and * re-opened */ if (__predict_false( (periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)) { error = EIO; goto out; } nblks = howmany(bp->b_bcount, cd->params.blksize); /* * Fill out the scsi command. If the transfer will * fit in a "small" cdb, use it. 
*/ if (((bp->b_rawblkno & 0x1fffff) == bp->b_rawblkno) && ((nblks & 0xff) == nblks) && !(periph->periph_quirks & PQUIRK_ONLYBIG)) { /* * We can fit in a small cdb. */ memset(&cmd_small, 0, sizeof(cmd_small)); cmd_small.opcode = (bp->b_flags & B_READ) ? SCSI_READ_6_COMMAND : SCSI_WRITE_6_COMMAND; _lto3b(bp->b_rawblkno, cmd_small.addr); cmd_small.length = nblks & 0xff; cmdlen = sizeof(cmd_small); cmdp = (struct scsipi_generic *)&cmd_small; } else { /* * Need a large cdb. */ memset(&cmd_big, 0, sizeof(cmd_big)); cmd_big.opcode = (bp->b_flags & B_READ) ? READ_10 : WRITE_10; _lto4b(bp->b_rawblkno, cmd_big.addr); _lto2b(nblks, cmd_big.length); cmdlen = sizeof(cmd_big); cmdp = (struct scsipi_generic *)&cmd_big; } /* * Figure out what flags to use. */ flags = XS_CTL_NOSLEEP|XS_CTL_ASYNC|XS_CTL_SIMPLE_TAG; if (bp->b_flags & B_READ) flags |= XS_CTL_DATA_IN; else flags |= XS_CTL_DATA_OUT; /* * Call the routine that chats with the adapter. * Note: we cannot sleep as we may be an interrupt */ xs = scsipi_make_xs_locked(periph, cmdp, cmdlen, (u_char *)bp->b_data, bp->b_bcount, CDRETRIES, 30000, bp, flags); if (__predict_false(xs == NULL)) { /* * out of memory. Keep this buffer in the queue, and * retry later. */ callout_reset(&cd->sc_callout, hz / 2, cdrestart, cd); error = EAGAIN; goto out; } error = scsipi_execute_xs(xs); /* with a scsipi_xfer preallocated, scsipi_command can't fail */ KASSERT(error == 0); out: mutex_exit(chan_mtx(chan)); return error; } /* * Recover I/O request after memory shortage * * Called from callout */ static void cdrestart(void *v) { struct cd_softc *cd = v; struct dk_softc *dksc = &cd->sc_dksc; dk_start(dksc, NULL); } /* * Recover I/O request after memory shortage * * Called from scsipi midlayer when resources have been freed * with channel lock held */ static void cdstart(struct scsipi_periph *periph) { struct cd_softc *cd = device_private(periph->periph_dev); struct dk_softc *dksc = &cd->sc_dksc; struct scsipi_channel *chan = periph->periph_channel; /* * release channel lock as dk_start may need to acquire * other locks * * cdstart is called from scsipi_put_xs and all its callers * release the lock afterwards. So releasing it here * doesn't matter. */ mutex_exit(chan_mtx(chan)); dk_start(dksc, NULL); mutex_enter(chan_mtx(chan)); } static void cddone(struct scsipi_xfer *xs, int error) { struct cd_softc *cd = device_private(xs->xs_periph->periph_dev); struct dk_softc *dksc = &cd->sc_dksc; struct buf *bp = xs->bp; if (bp) { bp->b_error = error; bp->b_resid = xs->resid; if (error) { /* on a read/write error bp->b_resid is zero, so fix */ bp->b_resid = bp->b_bcount; } dk_done(dksc, bp); /* dk_start is called from scsipi_complete */ } } static int cd_interpret_sense(struct scsipi_xfer *xs) { struct scsipi_periph *periph = xs->xs_periph; struct scsi_sense_data *sense = &xs->sense.scsi_sense; int retval = EJUSTRETURN; /* * If it isn't an extended or extended/deferred error, let * the generic code handle it. 
*/ if (SSD_RCODE(sense->response_code) != SSD_RCODE_CURRENT && SSD_RCODE(sense->response_code) != SSD_RCODE_DEFERRED) return (retval); /* * If we got a "Unit not ready" (SKEY_NOT_READY) and "Logical Unit * Is In The Process of Becoming Ready" (Sense code 0x04,0x01), then * wait a bit for the drive to spin up */ if ((SSD_SENSE_KEY(sense->flags) == SKEY_NOT_READY) && (sense->asc == 0x04) && (sense->ascq == 0x01)) { /* * Sleep for 5 seconds to wait for the drive to spin up */ SC_DEBUG(periph, SCSIPI_DB1, ("Waiting 5 sec for CD " "spinup\n")); if (!callout_pending(&periph->periph_callout)) scsipi_periph_freeze(periph, 1); callout_reset(&periph->periph_callout, 5 * hz, scsipi_periph_timed_thaw, periph); retval = ERESTART; } /* * If we got a "Unit not ready" (SKEY_NOT_READY) and "Logical Unit Not * Ready, Operation In Progress" (Sense code 0x04, 0x07), * then wait for the specified time */ if ((SSD_SENSE_KEY(sense->flags) == SKEY_NOT_READY) && (sense->asc == 0x04) && (sense->ascq == 0x07)) { /* * we could listen to the delay; but it looks like the skey * data is not always returned. */ /* cd_delay = _2btol(sense->sks.sks_bytes); */ /* wait for a half second and get going again */ if (!callout_pending(&periph->periph_callout)) scsipi_periph_freeze(periph, 1); callout_reset(&periph->periph_callout, hz/2, scsipi_periph_timed_thaw, periph); retval = ERESTART; } /* * If we got a "Unit not ready" (SKEY_NOT_READY) and "Long write in * progress" (Sense code 0x04, 0x08), then wait for the specified * time */ if ((SSD_SENSE_KEY(sense->flags) == SKEY_NOT_READY) && (sense->asc == 0x04) && (sense->ascq == 0x08)) { /* * long write in process; we could listen to the delay; but it * looks like the skey data is not always returned. */ /* cd_delay = _2btol(sense->sks.sks_bytes); */ /* wait for a half second and get going again */ if (!callout_pending(&periph->periph_callout)) scsipi_periph_freeze(periph, 1); callout_reset(&periph->periph_callout, hz/2, scsipi_periph_timed_thaw, periph); retval = ERESTART; } return (retval); } static void cdminphys(struct buf *bp) { struct cd_softc *cd = device_lookup_private(&cd_cd, CDUNIT(bp->b_dev)); struct dk_softc *dksc = &cd->sc_dksc; long xmax; /* * If the device is ancient, we want to make sure that * the transfer fits into a 6-byte cdb. * * XXX Note that the SCSI-I spec says that 256-block transfers * are allowed in a 6-byte read/write, and are specified * by setting the "length" to 0. However, we're conservative * here, allowing only 255-block transfers in case an * ancient device gets confused by length == 0. A length of 0 * in a 10-byte read/write actually means 0 blocks. 
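* For example, with the usual 2048-byte CD sector size this caps a single transfer at 0xff * 2048 = 522240 bytes (510 KiB).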
*/ if (cd->flags & CDF_ANCIENT) { xmax = dksc->sc_dkdev.dk_geom.dg_secsize * 0xff; if (bp->b_bcount > xmax) bp->b_bcount = xmax; } scsipi_adapter_minphys(cd->sc_periph->periph_channel, bp); } static void cd_iosize(device_t dev, int *count) { struct buf B; int bmaj; bmaj = bdevsw_lookup_major(&cd_bdevsw); B.b_dev = MAKECDDEV(bmaj,device_unit(dev),RAW_PART); B.b_bcount = *count; cdminphys(&B); *count = B.b_bcount; } static int cdread(dev_t dev, struct uio *uio, int ioflag) { return (physio(cdstrategy, NULL, dev, B_READ, cdminphys, uio)); } static int cdwrite(dev_t dev, struct uio *uio, int ioflag) { return (physio(cdstrategy, NULL, dev, B_WRITE, cdminphys, uio)); } #if 0 /* XXX Not used */ /* * conversion between minute-seconde-frame and logical block address * addresses format */ static void lba2msf(u_long lba, u_char *m, u_char *s, u_char *f) { u_long tmp; tmp = lba + CD_BLOCK_OFFSET; /* offset of first logical frame */ tmp &= 0xffffff; /* negative lbas use only 24 bits */ *m = tmp / (CD_SECS * CD_FRAMES); tmp %= (CD_SECS * CD_FRAMES); *s = tmp / CD_FRAMES; *f = tmp % CD_FRAMES; } #endif /* XXX Not used */ /* * Convert an hour:minute:second:frame address to a logical block address. In * theory the number of secs/minute and number of frames/second could be * configured differently in the device as could the block offset but in * practice these values are rock solid and most drives don't even allow * theses values to be changed. */ static uint32_t hmsf2lba(uint8_t h, uint8_t m, uint8_t s, uint8_t f) { return (((((uint32_t) h * 60 + m) * CD_SECS) + s) * CD_FRAMES + f) - CD_BLOCK_OFFSET; } static int cdreadmsaddr(struct cd_softc *cd, struct cd_formatted_toc *toc, int *addr) { struct scsipi_periph *periph = cd->sc_periph; int error; struct cd_toc_entry *cte; error = cd_read_toc(cd, CD_TOC_FORM, 0, 0, toc, sizeof(struct ioc_toc_header) + sizeof(struct cd_toc_entry), 0, 0x40 /* control word for "get MS info" */); if (error) return (error); cte = &toc->entries[0]; if (periph->periph_quirks & PQUIRK_LITTLETOC) { cte->addr.lba = le32toh(cte->addr.lba); toc->header.len = le16toh(toc->header.len); } else { cte->addr.lba = be32toh(cte->addr.lba); toc->header.len = be16toh(toc->header.len); } *addr = (toc->header.len >= 10 && cte->track > 1) ? cte->addr.lba : 0; return 0; } /* synchronise caches code from cd.c, move to scsipi_ioctl.c ? */ static int cdcachesync(struct scsipi_periph *periph, int flags) { struct scsi_synchronize_cache_10 cmd; /* * Issue a SYNCHRONIZE CACHE. MMC devices have to issue with address 0 * and length 0 as it can't synchronise parts of the disc per spec. * We ignore ILLEGAL REQUEST in the event that the command is not * supported by the device, and poll for completion so that we know * that the cache has actually been flushed. * * XXX should we handle the PQUIRK_NOSYNCCACHE ? 
*/ memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SCSI_SYNCHRONIZE_CACHE_10; return (scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0, CDRETRIES, 30000, NULL, flags | XS_CTL_IGNORE_ILLEGAL_REQUEST)); } static int do_cdioreadentries(struct cd_softc *cd, struct ioc_read_toc_entry *te, struct cd_formatted_toc *toc) { /* READ TOC format 0 command, entries */ struct ioc_toc_header *th; struct cd_toc_entry *cte; u_int len = te->data_len; int ntracks; int error; th = &toc->header; if (len > sizeof(toc->entries) || len < sizeof(toc->entries[0])) return (EINVAL); error = cd_read_toc(cd, CD_TOC_FORM, te->address_format, te->starting_track, toc, sizeof(toc->header) + len, 0, 0); if (error) return (error); if (te->address_format == CD_LBA_FORMAT) for (ntracks = th->ending_track - th->starting_track + 1; ntracks >= 0; ntracks--) { cte = &toc->entries[ntracks]; cte->addr_type = CD_LBA_FORMAT; if (cd->sc_periph->periph_quirks & PQUIRK_LITTLETOC) cte->addr.lba = le32toh(cte->addr.lba); else cte->addr.lba = be32toh(cte->addr.lba); } if (cd->sc_periph->periph_quirks & PQUIRK_LITTLETOC) th->len = le16toh(th->len); else th->len = be16toh(th->len); return 0; } /* * Perform special action on behalf of the user. * Knows about the internals of this device */ static int cdioctl(dev_t dev, u_long cmd, void *addr, int flag, struct lwp *l) { struct cd_softc *cd = device_lookup_private(&cd_cd, CDUNIT(dev)); struct dk_softc *dksc = &cd->sc_dksc; struct scsipi_periph *periph = cd->sc_periph; struct cd_formatted_toc toc; int part = CDPART(dev); int error; SC_DEBUG(cd->sc_periph, SCSIPI_DB2, ("cdioctl 0x%lx ", cmd)); /* * If the device is not valid, some IOCTLs can still be * handled on the raw partition. Check this here. */ if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0 && part != RAW_PART) return (EIO); switch (cmd) { case DIOCTUR: { /* test unit ready */ error = scsipi_test_unit_ready(cd->sc_periph, XS_CTL_SILENT); *((int*)addr) = (error == 0); if (error == ENODEV || error == EIO || error == 0) return 0; return error; } case CDIOCPLAYTRACKS: { /* PLAY_MSF command */ struct ioc_play_track *args = addr; if ((error = cd_set_pa_immed(cd, 0)) != 0) return (error); return (cd_play_tracks(cd, &toc, args->start_track, args->start_index, args->end_track, args->end_index)); } case CDIOCPLAYMSF: { /* PLAY_MSF command */ struct ioc_play_msf *args = addr; if ((error = cd_set_pa_immed(cd, 0)) != 0) return (error); return (cd_play_msf(cd, args->start_m, args->start_s, args->start_f, args->end_m, args->end_s, args->end_f)); } case CDIOCPLAYBLOCKS: { /* PLAY command */ struct ioc_play_blocks *args = addr; if ((error = cd_set_pa_immed(cd, 0)) != 0) return (error); return (cd_play(cd, args->blk, args->len)); } case CDIOCREADSUBCHANNEL: { /* READ_SUBCHANNEL command */ struct ioc_read_subchannel *args = addr; struct cd_sub_channel_info data; u_int len = args->data_len; if (len > sizeof(data) || len < sizeof(struct cd_sub_channel_header)) return (EINVAL); error = cd_read_subchannel(cd, args->address_format, args->data_format, args->track, &data, len, 0); if (error) return (error); len = uimin(len, _2btol(data.header.data_len) + sizeof(struct cd_sub_channel_header)); return (copyout(&data, args->data, len)); } case CDIOCREADSUBCHANNEL_BUF: { /* As CDIOCREADSUBCHANNEL, but without a 2nd buffer area */ struct ioc_read_subchannel_buf *args = addr; if (args->req.data_len != sizeof args->info) return EINVAL; return cd_read_subchannel(cd, args->req.address_format, args->req.data_format, args->req.track, &args->info, sizeof(args->info), 
0); } case CDIOREADTOCHEADER: { /* READ TOC format 0 command, static header */ if ((error = cd_read_toc(cd, CD_TOC_FORM, 0, 0, &toc, sizeof(toc.header), 0, 0)) != 0) return (error); if (cd->sc_periph->periph_quirks & PQUIRK_LITTLETOC) toc.header.len = le16toh(toc.header.len); else toc.header.len = be16toh(toc.header.len); memcpy(addr, &toc.header, sizeof(toc.header)); return (0); } case CDIOREADTOCENTRYS: { struct ioc_read_toc_entry *te = addr; error = do_cdioreadentries(cd, te, &toc); if (error != 0) return error; return copyout(toc.entries, te->data, uimin(te->data_len, toc.header.len - (sizeof(toc.header.starting_track) + sizeof(toc.header.ending_track)))); } case CDIOREADTOCENTRIES_BUF: { struct ioc_read_toc_entry_buf *te = addr; error = do_cdioreadentries(cd, &te->req, &toc); if (error != 0) return error; memcpy(te->entry, toc.entries, uimin(te->req.data_len, toc.header.len - (sizeof(toc.header.starting_track) + sizeof(toc.header.ending_track)))); return 0; } case CDIOREADMSADDR: { /* READ TOC format 0 command, length of first track only */ int sessno = *(int*)addr; if (sessno != 0) return (EINVAL); return (cdreadmsaddr(cd, &toc, addr)); } case CDIOCSETPATCH: { struct ioc_patch *arg = addr; return (cd_setchan(cd, arg->patch[0], arg->patch[1], arg->patch[2], arg->patch[3], 0)); } case CDIOCGETVOL: { /* MODE SENSE command (AUDIO page) */ struct ioc_vol *arg = addr; return (cd_getvol(cd, arg, 0)); } case CDIOCSETVOL: { /* MODE SENSE/MODE SELECT commands (AUDIO page) */ struct ioc_vol *arg = addr; return (cd_setvol(cd, arg, 0)); } case CDIOCSETMONO: /* MODE SENSE/MODE SELECT commands (AUDIO page) */ return (cd_setchan(cd, BOTH_CHANNEL, BOTH_CHANNEL, MUTE_CHANNEL, MUTE_CHANNEL, 0)); case CDIOCSETSTEREO: /* MODE SENSE/MODE SELECT commands (AUDIO page) */ return (cd_setchan(cd, LEFT_CHANNEL, RIGHT_CHANNEL, MUTE_CHANNEL, MUTE_CHANNEL, 0)); case CDIOCSETMUTE: /* MODE SENSE/MODE SELECT commands (AUDIO page) */ return (cd_setchan(cd, MUTE_CHANNEL, MUTE_CHANNEL, MUTE_CHANNEL, MUTE_CHANNEL, 0)); case CDIOCSETLEFT: /* MODE SENSE/MODE SELECT commands (AUDIO page) */ return (cd_setchan(cd, LEFT_CHANNEL, LEFT_CHANNEL, MUTE_CHANNEL, MUTE_CHANNEL, 0)); case CDIOCSETRIGHT: /* MODE SENSE/MODE SELECT commands (AUDIO page) */ return (cd_setchan(cd, RIGHT_CHANNEL, RIGHT_CHANNEL, MUTE_CHANNEL, MUTE_CHANNEL, 0)); case CDIOCRESUME: /* PAUSE command */ return (cd_pause(cd, PA_RESUME)); case CDIOCPAUSE: /* PAUSE command */ return (cd_pause(cd, PA_PAUSE)); case CDIOCSTART: return (scsipi_start(periph, SSS_START, 0)); case CDIOCSTOP: return (scsipi_start(periph, SSS_STOP, 0)); case CDIOCCLOSE: return (scsipi_start(periph, SSS_START|SSS_LOEJ, XS_CTL_IGNORE_NOT_READY | XS_CTL_IGNORE_MEDIA_CHANGE)); case DIOCEJECT: if (*(int *)addr == 0) { int pmask = __BIT(part); /* * Don't force eject: check that we are the only * partition open. If so, unlock it. */ if (DK_BUSY(dksc, pmask) == 0) { error = scsipi_prevent(periph, SPAMR_ALLOW, XS_CTL_IGNORE_NOT_READY); if (error) return (error); } else { return (EBUSY); } } /* FALLTHROUGH */ case CDIOCEJECT: /* FALLTHROUGH */ case ODIOCEJECT: error = scsipi_start(periph, SSS_STOP|SSS_LOEJ, 0); if (error == 0) /* Make sure cdclose() will do silent operations */ cd->flags |= CDF_EJECTED; return error; case DIOCCACHESYNC: /* SYNCHRONISE CACHES command */ return (cdcachesync(periph, 0)); case CDIOCALLOW: return (scsipi_prevent(periph, SPAMR_ALLOW, 0)); case CDIOCPREVENT: return (scsipi_prevent(periph, SPAMR_PREVENT_DT, 0)); case DIOCLOCK: return (scsipi_prevent(periph, (*(int *)addr) ? 
SPAMR_PREVENT_DT : SPAMR_ALLOW, 0)); case CDIOCSETDEBUG: cd->sc_periph->periph_dbflags |= (SCSIPI_DB1 | SCSIPI_DB2); return (0); case CDIOCCLRDEBUG: cd->sc_periph->periph_dbflags &= ~(SCSIPI_DB1 | SCSIPI_DB2); return (0); case CDIOCRESET: case SCIOCRESET: return (cd_reset(cd)); case CDIOCLOADUNLOAD: /* LOAD_UNLOAD command */ return (cd_load_unload(cd, addr)); case DVD_AUTH: /* GPCMD_REPORT_KEY or GPCMD_SEND_KEY command */ return (dvd_auth(cd, addr)); case DVD_READ_STRUCT: /* GPCMD_READ_DVD_STRUCTURE command */ return (dvd_read_struct(cd, addr)); case MMCGETDISCINFO: /* * GET_CONFIGURATION, READ_DISCINFO, READ_TRACKINFO, * (READ_TOCf2, READ_CD_CAPACITY and GET_CONFIGURATION) commands */ return mmc_getdiscinfo(periph, (struct mmc_discinfo *) addr); case MMCGETTRACKINFO: /* READ TOCf2, READ_CD_CAPACITY and READ_TRACKINFO commands */ return mmc_gettrackinfo(periph, (struct mmc_trackinfo *) addr); case MMCOP: /* * CLOSE TRACK/SESSION, RESERVE_TRACK, REPAIR_TRACK, * SYNCHRONISE_CACHE commands */ return mmc_do_op(periph, (struct mmc_op *) addr); case MMCSETUPWRITEPARAMS : /* MODE SENSE page 5, MODE_SELECT page 5 commands */ return mmc_setup_writeparams(periph, (struct mmc_writeparams *) addr); default: error = dk_ioctl(dksc, dev, cmd, addr, flag, l); if (error == ENOTTY) error = scsipi_do_ioctl(periph, dev, cmd, addr, flag, l); return (error); } #ifdef DIAGNOSTIC panic("cdioctl: impossible"); #endif } static void cd_label(device_t self, struct disklabel *lp) { struct cd_softc *cd = device_private(self); struct scsipi_periph *periph = cd->sc_periph; struct cd_formatted_toc toc; int lastsession = 0; strncpy(lp->d_typename, "optical media", 16); lp->d_rpm = 300; lp->d_flags |= D_REMOVABLE; if ((periph->periph_flags & PERIPH_MEDIA_LOADED) != 0) { lp->d_flags |= D_SCSI_MMC; (void) cdreadmsaddr(cd, &toc, &lastsession); } lp->d_partitions[0].p_offset = 0; lp->d_partitions[0].p_size = lp->d_secperunit; lp->d_partitions[0].p_cdsession = lastsession; lp->d_partitions[0].p_fstype = FS_ISO9660; lp->d_partitions[RAW_PART].p_offset = 0; lp->d_partitions[RAW_PART].p_size = lp->d_secperunit; lp->d_partitions[RAW_PART].p_fstype = FS_UDF; } /* * Reading a disc's total capacity is apparently a very difficult issue for the * SCSI standardisation group. Every disc type seems to have its own * (re)invented size request method and modifiers. The failsafe way of * determining the total (max) capacity i.e. not the recorded capacity but the * total maximum capacity is to request the info on the last track and * calculate the last lba. * * For ROM drives, we go for the CD recorded capacity. For recordable devices * we count. 
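* In practice this means issuing READ_CD_CAPACITY for the block size and last recorded lba first, and, when the device also implements READ_DISCINFO and READ_TRACKINFO, taking the last track's start + size - 1 as the last lba instead.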
*/ static int read_cd_capacity(struct scsipi_periph *periph, uint32_t *blksize, u_long *last_lba) { struct scsipi_read_cd_capacity cap_cmd; /* * XXX: see PR 48550 and PR 48754: * the ahcisata(4) driver can not deal with unaligned * data, so align this "a bit" */ struct scsipi_read_cd_cap_data cap __aligned(2); struct scsipi_read_discinfo di_cmd; struct scsipi_read_discinfo_data di __aligned(2); struct scsipi_read_trackinfo ti_cmd; struct scsipi_read_trackinfo_data ti __aligned(2); uint32_t track_start, track_size; int error, flags, msb, lsb, last_track; /* if the device doesn't grok capacity, return the dummies */ if (periph->periph_quirks & PQUIRK_NOCAPACITY) return 0; /* first try read CD capacity for blksize and last recorded lba */ /* issue the cd capacity request */ flags = XS_CTL_DATA_IN; memset(&cap_cmd, 0, sizeof(cap_cmd)); memset(&cap, 0, sizeof(cap)); cap_cmd.opcode = READ_CD_CAPACITY; error = scsipi_command(periph, (void *) &cap_cmd, sizeof(cap_cmd), (void *) &cap, sizeof(cap), CDRETRIES, 30000, NULL, flags); if (error) return error; /* retrieve values and sanity check them */ *blksize = _4btol(cap.length); *last_lba = _4btol(cap.addr); /* blksize is 2048 for CD, but some drives give gibberish */ if ((*blksize < 512) || ((*blksize & 511) != 0) || (*blksize > 16*1024)) { if (*blksize > 16*1024) aprint_error("read_cd_capacity: extra large block " "size %u found - limiting to 2kByte\n", *blksize); *blksize = 2048; /* some drives lie ! */ } /* recordables have READ_DISCINFO implemented */ flags = XS_CTL_DATA_IN | XS_CTL_SILENT; memset(&di_cmd, 0, sizeof(di_cmd)); di_cmd.opcode = READ_DISCINFO; _lto2b(READ_DISCINFO_BIGSIZE, di_cmd.data_len); error = scsipi_command(periph, (void *) &di_cmd, sizeof(di_cmd), (void *) &di, READ_DISCINFO_BIGSIZE, CDRETRIES, 30000, NULL, flags); if (error == 0) { msb = di.last_track_last_session_msb; lsb = di.last_track_last_session_lsb; last_track = (msb << 8) | lsb; /* request info on last track */ memset(&ti_cmd, 0, sizeof(ti_cmd)); ti_cmd.opcode = READ_TRACKINFO; ti_cmd.addr_type = 1; /* on tracknr */ _lto4b(last_track, ti_cmd.address); /* tracknr */ _lto2b(sizeof(ti), ti_cmd.data_len); error = scsipi_command(periph, (void *) &ti_cmd, sizeof(ti_cmd), (void *) &ti, sizeof(ti), CDRETRIES, 30000, NULL, flags); if (error == 0) { track_start = _4btol(ti.track_start); track_size = _4btol(ti.track_size); /* overwrite only with a sane value */ if (track_start + track_size >= 100) *last_lba = (u_long) track_start + track_size -1; } } /* sanity check for lba_size */ if (*last_lba < 100) *last_lba = 400000-1; return 0; } /* * Find out from the device what its capacity is */ static u_long cd_size(struct cd_softc *cd, int flags) { uint32_t blksize = 2048; u_long last_lba = 0, size; int error; error = read_cd_capacity(cd->sc_periph, &blksize, &last_lba); if (error) goto error; if (blksize != 2048) { if (cd_setblksize(cd) == 0) { blksize = 2048; error = read_cd_capacity(cd->sc_periph, &blksize, &last_lba); if (error) goto error; } } size = last_lba + 1; cd->params.blksize = blksize; cd->params.disksize = size; cd->params.disksize512 = ((u_int64_t)cd->params.disksize * blksize) / DEV_BSIZE; SC_DEBUG(cd->sc_periph, SCSIPI_DB2, ("cd_size: %u %lu\n", blksize, size)); return size; error: /* something went wrong */ cd->params.blksize = 2048; cd->params.disksize = 0; cd->params.disksize512 = 0; SC_DEBUG(cd->sc_periph, SCSIPI_DB2, ("cd_size: failed\n")); return 0; } /* * Get scsi driver to send a "start playing" command */ static int cd_play(struct cd_softc *cd, int blkno, 
int nblks) { struct scsipi_play cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = PLAY; _lto4b(blkno, cmd.blk_addr); _lto2b(nblks, cmd.xfer_len); return (scsipi_command(cd->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0, CDRETRIES, 30000, NULL, 0)); } /* * Get scsi driver to send a "start playing" command */ static int cd_play_tracks(struct cd_softc *cd, struct cd_formatted_toc *toc, int strack, int sindex, int etrack, int eindex) { int error; if (!etrack) return (EIO); if (strack > etrack) return (EINVAL); error = cd_load_toc(cd, CD_TOC_FORM, toc, 0); if (error) return (error); if (++etrack > (toc->header.ending_track+1)) etrack = toc->header.ending_track+1; strack -= toc->header.starting_track; etrack -= toc->header.starting_track; if (strack < 0) return (EINVAL); return (cd_play_msf(cd, toc->entries[strack].addr.msf.minute, toc->entries[strack].addr.msf.second, toc->entries[strack].addr.msf.frame, toc->entries[etrack].addr.msf.minute, toc->entries[etrack].addr.msf.second, toc->entries[etrack].addr.msf.frame)); } /* * Get scsi driver to send a "play msf" command */ static int cd_play_msf(struct cd_softc *cd, int startm, int starts, int startf, int endm, int ends, int endf) { struct scsipi_play_msf cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = PLAY_MSF; cmd.start_m = startm; cmd.start_s = starts; cmd.start_f = startf; cmd.end_m = endm; cmd.end_s = ends; cmd.end_f = endf; return (scsipi_command(cd->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0, CDRETRIES, 30000, NULL, 0)); } /* * Get scsi driver to send a "start up" command */ static int cd_pause(struct cd_softc *cd, int go) { struct scsipi_pause cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = PAUSE; cmd.resume = go & 0xff; return (scsipi_command(cd->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0, CDRETRIES, 30000, NULL, 0)); } /* * Get scsi driver to send a "RESET" command */ static int cd_reset(struct cd_softc *cd) { return (scsipi_command(cd->sc_periph, 0, 0, 0, 0, CDRETRIES, 30000, NULL, XS_CTL_RESET)); } /* * Read subchannel */ static int cd_read_subchannel(struct cd_softc *cd, int mode, int format, int track, struct cd_sub_channel_info *data, int len, int flags) { struct scsipi_read_subchannel cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = READ_SUBCHANNEL; if (mode == CD_MSF_FORMAT) cmd.byte2 |= CD_MSF; cmd.byte3 = SRS_SUBQ; cmd.subchan_format = format; cmd.track = track; _lto2b(len, cmd.data_len); return (scsipi_command(cd->sc_periph, (void *)&cmd, sizeof(struct scsipi_read_subchannel), (void *)data, len, CDRETRIES, 30000, NULL, flags | XS_CTL_DATA_IN | XS_CTL_SILENT)); } /* * Read table of contents */ static int cd_read_toc(struct cd_softc *cd, int respf, int mode, int start, struct cd_formatted_toc *toc, int len, int flags, int control) { struct scsipi_read_toc cmd; int ntoc; memset(&cmd, 0, sizeof(cmd)); #if 0 if (len != sizeof(struct ioc_toc_header)) ntoc = ((len) - sizeof(struct ioc_toc_header)) / sizeof(struct cd_toc_entry); else #endif ntoc = len; cmd.opcode = READ_TOC; if (mode == CD_MSF_FORMAT) cmd.addr_mode |= CD_MSF; cmd.resp_format = respf; cmd.from_track = start; _lto2b(ntoc, cmd.data_len); cmd.control = control; return (scsipi_command(cd->sc_periph, (void *)&cmd, sizeof(cmd), (void *)toc, len, CDRETRIES, 30000, NULL, flags | XS_CTL_DATA_IN)); } static int cd_load_toc(struct cd_softc *cd, int respf, struct cd_formatted_toc *toc, int flags) { int ntracks, len, error; if ((error = cd_read_toc(cd, respf, 0, 0, toc, sizeof(toc->header), flags, 0)) != 0) return (error); ntracks = toc->header.ending_track - toc->header.starting_track + 
1; len = (ntracks + 1) * sizeof(struct cd_toc_entry) + sizeof(toc->header); if ((error = cd_read_toc(cd, respf, CD_MSF_FORMAT, 0, toc, len, flags, 0)) != 0) return (error); return (0); } /* * Get the scsi driver to send a full inquiry to the device and use the * results to fill out the disk parameter structure. */ static int cd_get_parms(struct cd_softc *cd, int flags) { /* * give a number of sectors so that sec * trks * cyls * is <= disk_size */ if (cd_size(cd, flags) == 0) return (ENXIO); return (0); } static int cdsize(dev_t dev) { /* CD-ROMs are read-only. */ return (-1); } static int cddump(dev_t dev, daddr_t blkno, void *va, size_t size) { /* Not implemented. */ return (ENXIO); } #define dvd_copy_key(dst, src) memcpy((dst), (src), sizeof(dvd_key)) #define dvd_copy_challenge(dst, src) memcpy((dst), (src), sizeof(dvd_challenge)) static int dvd_auth(struct cd_softc *cd, dvd_authinfo *a) { struct scsipi_generic cmd; u_int8_t bf[20]; int error; memset(cmd.bytes, 0, 15); memset(bf, 0, sizeof(bf)); switch (a->type) { case DVD_LU_SEND_AGID: cmd.opcode = GPCMD_REPORT_KEY; cmd.bytes[8] = 8; cmd.bytes[9] = 0 | (0 << 6); error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 8, CDRETRIES, 30000, NULL, XS_CTL_DATA_IN); if (error) return (error); a->lsa.agid = bf[7] >> 6; return (0); case DVD_LU_SEND_CHALLENGE: cmd.opcode = GPCMD_REPORT_KEY; cmd.bytes[8] = 16; cmd.bytes[9] = 1 | (a->lsc.agid << 6); error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 16, CDRETRIES, 30000, NULL, XS_CTL_DATA_IN); if (error) return (error); dvd_copy_challenge(a->lsc.chal, &bf[4]); return (0); case DVD_LU_SEND_KEY1: cmd.opcode = GPCMD_REPORT_KEY; cmd.bytes[8] = 12; cmd.bytes[9] = 2 | (a->lsk.agid << 6); error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 12, CDRETRIES, 30000, NULL, XS_CTL_DATA_IN); if (error) return (error); dvd_copy_key(a->lsk.key, &bf[4]); return (0); case DVD_LU_SEND_TITLE_KEY: cmd.opcode = GPCMD_REPORT_KEY; _lto4b(a->lstk.lba, &cmd.bytes[1]); cmd.bytes[8] = 12; cmd.bytes[9] = 4 | (a->lstk.agid << 6); error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 12, CDRETRIES, 30000, NULL, XS_CTL_DATA_IN); if (error) return (error); a->lstk.cpm = (bf[4] >> 7) & 1; a->lstk.cp_sec = (bf[4] >> 6) & 1; a->lstk.cgms = (bf[4] >> 4) & 3; dvd_copy_key(a->lstk.title_key, &bf[5]); return (0); case DVD_LU_SEND_ASF: cmd.opcode = GPCMD_REPORT_KEY; cmd.bytes[8] = 8; cmd.bytes[9] = 5 | (a->lsasf.agid << 6); error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 8, CDRETRIES, 30000, NULL, XS_CTL_DATA_IN); if (error) return (error); a->lsasf.asf = bf[7] & 1; return (0); case DVD_HOST_SEND_CHALLENGE: cmd.opcode = GPCMD_SEND_KEY; cmd.bytes[8] = 16; cmd.bytes[9] = 1 | (a->hsc.agid << 6); bf[1] = 14; dvd_copy_challenge(&bf[4], a->hsc.chal); error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 16, CDRETRIES, 30000, NULL, XS_CTL_DATA_OUT); if (error) return (error); a->type = DVD_LU_SEND_KEY1; return (0); case DVD_HOST_SEND_KEY2: cmd.opcode = GPCMD_SEND_KEY; cmd.bytes[8] = 12; cmd.bytes[9] = 3 | (a->hsk.agid << 6); bf[1] = 10; dvd_copy_key(&bf[4], a->hsk.key); error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 12, CDRETRIES, 30000, NULL, XS_CTL_DATA_OUT); if (error) { a->type = DVD_AUTH_FAILURE; return (error); } a->type = DVD_AUTH_ESTABLISHED; return (0); case DVD_INVALIDATE_AGID: cmd.opcode = GPCMD_REPORT_KEY; cmd.bytes[9] = 0x3f | (a->lsa.agid << 6); error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 16, CDRETRIES, 30000, NULL, 0); if (error) return (error); return (0); case DVD_LU_SEND_RPC_STATE: cmd.opcode = GPCMD_REPORT_KEY; 
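/* REPORT KEY with key format 8 (RPC state): the 8-byte reply is decoded below into type, vra, ucca, region_mask and rpc_scheme. */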
cmd.bytes[8] = 8; cmd.bytes[9] = 8 | (0 << 6); error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 8, CDRETRIES, 30000, NULL, XS_CTL_DATA_IN); if (error) return (error); a->lrpcs.type = (bf[4] >> 6) & 3; a->lrpcs.vra = (bf[4] >> 3) & 7; a->lrpcs.ucca = (bf[4]) & 7; a->lrpcs.region_mask = bf[5]; a->lrpcs.rpc_scheme = bf[6]; return (0); case DVD_HOST_SEND_RPC_STATE: cmd.opcode = GPCMD_SEND_KEY; cmd.bytes[8] = 8; cmd.bytes[9] = 6 | (0 << 6); bf[1] = 6; bf[4] = a->hrpcs.pdrc; error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 8, CDRETRIES, 30000, NULL, XS_CTL_DATA_OUT); if (error) return (error); return (0); default: return (ENOTTY); } } static int dvd_read_physical(struct cd_softc *cd, dvd_struct *s) { struct scsipi_generic cmd; u_int8_t bf[4 + 4 * 20], *bufp; int error; struct dvd_layer *layer; int i; memset(cmd.bytes, 0, 15); memset(bf, 0, sizeof(bf)); cmd.opcode = GPCMD_READ_DVD_STRUCTURE; cmd.bytes[6] = s->type; _lto2b(sizeof(bf), &cmd.bytes[7]); cmd.bytes[5] = s->physical.layer_num; error = scsipi_command(cd->sc_periph, &cmd, 12, bf, sizeof(bf), CDRETRIES, 30000, NULL, XS_CTL_DATA_IN); if (error) return (error); for (i = 0, bufp = &bf[4], layer = &s->physical.layer[0]; i < 4; i++, bufp += 20, layer++) { memset(layer, 0, sizeof(*layer)); layer->book_version = bufp[0] & 0xf; layer->book_type = bufp[0] >> 4; layer->min_rate = bufp[1] & 0xf; layer->disc_size = bufp[1] >> 4; layer->layer_type = bufp[2] & 0xf; layer->track_path = (bufp[2] >> 4) & 1; layer->nlayers = (bufp[2] >> 5) & 3; layer->track_density = bufp[3] & 0xf; layer->linear_density = bufp[3] >> 4; layer->start_sector = _4btol(&bufp[4]); layer->end_sector = _4btol(&bufp[8]); layer->end_sector_l0 = _4btol(&bufp[12]); layer->bca = bufp[16] >> 7; } return (0); } static int dvd_read_copyright(struct cd_softc *cd, dvd_struct *s) { struct scsipi_generic cmd; u_int8_t bf[8]; int error; memset(cmd.bytes, 0, 15); memset(bf, 0, sizeof(bf)); cmd.opcode = GPCMD_READ_DVD_STRUCTURE; cmd.bytes[6] = s->type; _lto2b(sizeof(bf), &cmd.bytes[7]); cmd.bytes[5] = s->copyright.layer_num; error = scsipi_command(cd->sc_periph, &cmd, 12, bf, sizeof(bf), CDRETRIES, 30000, NULL, XS_CTL_DATA_IN); if (error) return (error); s->copyright.cpst = bf[4]; s->copyright.rmi = bf[5]; return (0); } static int dvd_read_disckey(struct cd_softc *cd, dvd_struct *s) { struct scsipi_generic cmd; u_int8_t *bf; int error; bf = malloc(4 + 2048, M_TEMP, M_WAITOK|M_ZERO); if (bf == NULL) return EIO; memset(cmd.bytes, 0, 15); cmd.opcode = GPCMD_READ_DVD_STRUCTURE; cmd.bytes[6] = s->type; _lto2b(4 + 2048, &cmd.bytes[7]); cmd.bytes[9] = s->disckey.agid << 6; error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 4 + 2048, CDRETRIES, 30000, NULL, XS_CTL_DATA_IN); if (error == 0) memcpy(s->disckey.value, &bf[4], 2048); free(bf, M_TEMP); return error; } static int dvd_read_bca(struct cd_softc *cd, dvd_struct *s) { struct scsipi_generic cmd; u_int8_t bf[4 + 188]; int error; memset(cmd.bytes, 0, 15); memset(bf, 0, sizeof(bf)); cmd.opcode = GPCMD_READ_DVD_STRUCTURE; cmd.bytes[6] = s->type; _lto2b(sizeof(bf), &cmd.bytes[7]); error = scsipi_command(cd->sc_periph, &cmd, 12, bf, sizeof(bf), CDRETRIES, 30000, NULL, XS_CTL_DATA_IN); if (error) return (error); s->bca.len = _2btol(&bf[0]); if (s->bca.len < 12 || s->bca.len > 188) return (EIO); memcpy(s->bca.value, &bf[4], s->bca.len); return (0); } static int dvd_read_manufact(struct cd_softc *cd, dvd_struct *s) { struct scsipi_generic cmd; u_int8_t *bf; int error; bf = malloc(4 + 2048, M_TEMP, M_WAITOK|M_ZERO); if (bf == NULL) return (EIO); 
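/* READ DVD STRUCTURE, manufacturing information: the reply is a 4-byte header whose first two bytes give the data length, followed by up to 2048 bytes of data that are copied out below after a sanity check on the length. */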
memset(cmd.bytes, 0, 15); cmd.opcode = GPCMD_READ_DVD_STRUCTURE; cmd.bytes[6] = s->type; _lto2b(4 + 2048, &cmd.bytes[7]); error = scsipi_command(cd->sc_periph, &cmd, 12, bf, 4 + 2048, CDRETRIES, 30000, NULL, XS_CTL_DATA_IN); if (error == 0) { s->manufact.len = _2btol(&bf[0]); if (s->manufact.len >= 0 && s->manufact.len <= 2048) memcpy(s->manufact.value, &bf[4], s->manufact.len); else error = EIO; } free(bf, M_TEMP); return error; } static int dvd_read_struct(struct cd_softc *cd, dvd_struct *s) { switch (s->type) { case DVD_STRUCT_PHYSICAL: return (dvd_read_physical(cd, s)); case DVD_STRUCT_COPYRIGHT: return (dvd_read_copyright(cd, s)); case DVD_STRUCT_DISCKEY: return (dvd_read_disckey(cd, s)); case DVD_STRUCT_BCA: return (dvd_read_bca(cd, s)); case DVD_STRUCT_MANUFACT: return (dvd_read_manufact(cd, s)); default: return (EINVAL); } } static int cd_mode_sense(struct cd_softc *cd, u_int8_t byte2, void *sense, size_t size, int page, int flags, int *big) { if (cd->sc_periph->periph_quirks & PQUIRK_ONLYBIG) { *big = 1; return scsipi_mode_sense_big(cd->sc_periph, byte2, page, sense, size + sizeof(struct scsi_mode_parameter_header_10), flags, CDRETRIES, 20000); } else { *big = 0; return scsipi_mode_sense(cd->sc_periph, byte2, page, sense, size + sizeof(struct scsi_mode_parameter_header_6), flags, CDRETRIES, 20000); } } static int cd_mode_select(struct cd_softc *cd, u_int8_t byte2, void *sense, size_t size, int flags, int big) { if (big) { struct scsi_mode_parameter_header_10 *header = sense; _lto2b(0, header->data_length); return scsipi_mode_select_big(cd->sc_periph, byte2, sense, size + sizeof(struct scsi_mode_parameter_header_10), flags, CDRETRIES, 20000); } else { struct scsi_mode_parameter_header_6 *header = sense; header->data_length = 0; return scsipi_mode_select(cd->sc_periph, byte2, sense, size + sizeof(struct scsi_mode_parameter_header_6), flags, CDRETRIES, 20000); } } static int cd_set_pa_immed(struct cd_softc *cd, int flags) { struct { union { struct scsi_mode_parameter_header_6 small; struct scsi_mode_parameter_header_10 big; } header; struct cd_audio_page page; } data; int error; uint8_t oflags; int big, byte2; struct cd_audio_page *page; byte2 = SMS_DBD; try_again: if ((error = cd_mode_sense(cd, byte2, &data, sizeof(data.page), AUDIO_PAGE, flags, &big)) != 0) { if (byte2 == SMS_DBD) { /* Device may not understand DBD; retry without */ byte2 = 0; goto try_again; } return (error); } if (big) page = (void *)((u_long)&data.header.big + sizeof data.header.big + _2btol(data.header.big.blk_desc_len)); else page = (void *)((u_long)&data.header.small + sizeof data.header.small + data.header.small.blk_desc_len); oflags = page->flags; page->flags &= ~CD_PA_SOTC; page->flags |= CD_PA_IMMED; if (oflags == page->flags) return (0); return (cd_mode_select(cd, SMS_PF, &data, sizeof(struct scsi_mode_page_header) + page->pg_length, flags, big)); } static int cd_setchan(struct cd_softc *cd, int p0, int p1, int p2, int p3, int flags) { struct { union { struct scsi_mode_parameter_header_6 small; struct scsi_mode_parameter_header_10 big; } header; struct cd_audio_page page; } data; int error; int big, byte2; struct cd_audio_page *page; byte2 = SMS_DBD; try_again: if ((error = cd_mode_sense(cd, byte2, &data, sizeof(data.page), AUDIO_PAGE, flags, &big)) != 0) { if (byte2 == SMS_DBD) { /* Device may not understand DBD; retry without */ byte2 = 0; goto try_again; } return (error); } if (big) page = (void *)((u_long)&data.header.big + sizeof data.header.big + _2btol(data.header.big.blk_desc_len)); else page = 
(void *)((u_long)&data.header.small + sizeof data.header.small + data.header.small.blk_desc_len); page->port[0].channels = p0; page->port[1].channels = p1; page->port[2].channels = p2; page->port[3].channels = p3; return (cd_mode_select(cd, SMS_PF, &data, sizeof(struct scsi_mode_page_header) + page->pg_length, flags, big)); } static int cd_getvol(struct cd_softc *cd, struct ioc_vol *arg, int flags) { struct { union { struct scsi_mode_parameter_header_6 small; struct scsi_mode_parameter_header_10 big; } header; struct cd_audio_page page; } data; int error; int big, byte2; struct cd_audio_page *page; byte2 = SMS_DBD; try_again: if ((error = cd_mode_sense(cd, byte2, &data, sizeof(data.page), AUDIO_PAGE, flags, &big)) != 0) { if (byte2 == SMS_DBD) { /* Device may not understand DBD; retry without */ byte2 = 0; goto try_again; } return (error); } if (big) page = (void *)((u_long)&data.header.big + sizeof data.header.big + _2btol(data.header.big.blk_desc_len)); else page = (void *)((u_long)&data.header.small + sizeof data.header.small + data.header.small.blk_desc_len); arg->vol[0] = page->port[0].volume; arg->vol[1] = page->port[1].volume; arg->vol[2] = page->port[2].volume; arg->vol[3] = page->port[3].volume; return (0); } static int cd_setvol(struct cd_softc *cd, const struct ioc_vol *arg, int flags) { struct { union { struct scsi_mode_parameter_header_6 small; struct scsi_mode_parameter_header_10 big; } header; struct cd_audio_page page; } data, mask; int error; int big, byte2; struct cd_audio_page *page, *page2; byte2 = SMS_DBD; try_again: if ((error = cd_mode_sense(cd, byte2, &data, sizeof(data.page), AUDIO_PAGE, flags, &big)) != 0) { if (byte2 == SMS_DBD) { /* Device may not understand DBD; retry without */ byte2 = 0; goto try_again; } return (error); } if ((error = cd_mode_sense(cd, byte2, &mask, sizeof(mask.page), AUDIO_PAGE|SMS_PCTRL_CHANGEABLE, flags, &big)) != 0) return (error); if (big) { page = (void *)((u_long)&data.header.big + sizeof data.header.big + _2btol(data.header.big.blk_desc_len)); page2 = (void *)((u_long)&mask.header.big + sizeof mask.header.big + _2btol(mask.header.big.blk_desc_len)); } else { page = (void *)((u_long)&data.header.small + sizeof data.header.small + data.header.small.blk_desc_len); page2 = (void *)((u_long)&mask.header.small + sizeof mask.header.small + mask.header.small.blk_desc_len); } page->port[0].volume = arg->vol[0] & page2->port[0].volume; page->port[1].volume = arg->vol[1] & page2->port[1].volume; page->port[2].volume = arg->vol[2] & page2->port[2].volume; page->port[3].volume = arg->vol[3] & page2->port[3].volume; page->port[0].channels = CHANNEL_0; page->port[1].channels = CHANNEL_1; return (cd_mode_select(cd, SMS_PF, &data, sizeof(struct scsi_mode_page_header) + page->pg_length, flags, big)); } static int cd_load_unload(struct cd_softc *cd, struct ioc_load_unload *args) { struct scsipi_load_unload cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = LOAD_UNLOAD; cmd.options = args->options; /* ioctl uses MMC values */ cmd.slot = args->slot; return (scsipi_command(cd->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0, CDRETRIES, 200000, NULL, 0)); } static int cd_setblksize(struct cd_softc *cd) { struct { union { struct scsi_mode_parameter_header_6 small; struct scsi_mode_parameter_header_10 big; } header; struct scsi_general_block_descriptor blk_desc; } data; int error; int big, bsize; struct scsi_general_block_descriptor *bdesc; if ((error = cd_mode_sense(cd, 0, &data, sizeof(data.blk_desc), 0, 0, &big)) != 0) return (error); if (big) { bdesc = (void 
*)(&data.header.big + 1); bsize = _2btol(data.header.big.blk_desc_len); } else { bdesc = (void *)(&data.header.small + 1); bsize = data.header.small.blk_desc_len; } if (bsize == 0) { printf("cd_setblksize: trying to change bsize, but no blk_desc\n"); return (EINVAL); } if (_3btol(bdesc->blklen) == 2048) { printf("cd_setblksize: trying to change bsize, but blk_desc is correct\n"); return (EINVAL); } _lto3b(2048, bdesc->blklen); return (cd_mode_select(cd, SMS_PF, &data, sizeof(data.blk_desc), 0, big)); } static int mmc_profile2class(uint16_t mmc_profile) { switch (mmc_profile) { case 0x01 : /* SCSI discs */ case 0x02 : /* this can't happen really, cd.c wouldn't have matched */ return MMC_CLASS_DISC; case 0x03 : /* Magneto Optical with sector erase */ case 0x04 : /* Magneto Optical write once */ case 0x05 : /* Advance Storage Magneto Optical */ return MMC_CLASS_MO; case 0x00 : /* Unknown MMC profile, can also be CD-ROM */ case 0x08 : /* CD-ROM */ case 0x09 : /* CD-R */ case 0x0a : /* CD-RW */ return MMC_CLASS_CD; case 0x10 : /* DVD-ROM */ case 0x11 : /* DVD-R */ case 0x12 : /* DVD-RAM */ case 0x13 : /* DVD-RW restricted overwrite */ case 0x14 : /* DVD-RW sequential */ case 0x1a : /* DVD+RW */ case 0x1b : /* DVD+R */ case 0x2a : /* DVD+RW Dual layer */ case 0x2b : /* DVD+R Dual layer */ case 0x50 : /* HD DVD-ROM */ case 0x51 : /* HD DVD-R */ case 0x52 : /* HD DVD-RW; DVD-RAM like */ return MMC_CLASS_DVD; case 0x40 : /* BD-ROM */ case 0x41 : /* BD-R Sequential recording (SRM) */ case 0x42 : /* BD-R Random Recording (RRM) */ case 0x43 : /* BD-RE */ return MMC_CLASS_BD; } return MMC_CLASS_UNKN; } /* * Drive/media combination is reflected in a series of features that can * either be current or dormant. We try to make sense out of them to create a * set of easy to use flags that abstract the device/media capabilities. */ static void mmc_process_feature(struct mmc_discinfo *mmc_discinfo, uint16_t feature, int cur, uint8_t *rpos) { uint32_t blockingnr; uint64_t flags; if (cur == 1) { flags = mmc_discinfo->mmc_cur; } else { flags = mmc_discinfo->mmc_cap; } switch (feature) { case 0x0010 : /* random readable feature */ blockingnr = rpos[5] | (rpos[4] << 8); if (blockingnr > 1) flags |= MMC_CAP_PACKET; /* RW error page */ break; case 0x0020 : /* random writable feature */ flags |= MMC_CAP_RECORDABLE; flags |= MMC_CAP_REWRITABLE; blockingnr = rpos[9] | (rpos[8] << 8); if (blockingnr > 1) flags |= MMC_CAP_PACKET; break; case 0x0021 : /* incremental streaming write feature */ flags |= MMC_CAP_RECORDABLE; flags |= MMC_CAP_SEQUENTIAL; if (cur) mmc_discinfo->link_block_penalty = rpos[4]; if (rpos[2] & 1) flags |= MMC_CAP_ZEROLINKBLK; break; case 0x0022 : /* (obsolete) erase support feature */ flags |= MMC_CAP_RECORDABLE; flags |= MMC_CAP_ERASABLE; break; case 0x0023 : /* formatting media support feature */ flags |= MMC_CAP_RECORDABLE; flags |= MMC_CAP_FORMATTABLE; break; case 0x0024 : /* hardware assised defect management feature */ flags |= MMC_CAP_HW_DEFECTFREE; break; case 0x0025 : /* write once */ flags |= MMC_CAP_RECORDABLE; break; case 0x0026 : /* restricted overwrite feature */ flags |= MMC_CAP_RECORDABLE; flags |= MMC_CAP_REWRITABLE; flags |= MMC_CAP_STRICTOVERWRITE; break; case 0x0028 : /* MRW formatted media support feature */ flags |= MMC_CAP_MRW; break; case 0x002b : /* DVD+R read (and opt. 
write) support */ flags |= MMC_CAP_SEQUENTIAL; if (rpos[0] & 1) /* write support */ flags |= MMC_CAP_RECORDABLE; break; case 0x002c : /* rigid restricted overwrite feature */ flags |= MMC_CAP_RECORDABLE; flags |= MMC_CAP_REWRITABLE; flags |= MMC_CAP_STRICTOVERWRITE; if (rpos[0] & 1) /* blank bit */ flags |= MMC_CAP_BLANKABLE; break; case 0x002d : /* track at once recording feature */ flags |= MMC_CAP_RECORDABLE; flags |= MMC_CAP_SEQUENTIAL; break; case 0x002f : /* DVD-R/-RW write feature */ flags |= MMC_CAP_RECORDABLE; if (rpos[0] & 2) /* DVD-RW bit */ flags |= MMC_CAP_BLANKABLE; break; case 0x0038 : /* BD-R SRM with pseudo overwrite */ flags |= MMC_CAP_PSEUDOOVERWRITE; break; default : /* ignore */ break; } if (cur == 1) { mmc_discinfo->mmc_cur = flags; } else { mmc_discinfo->mmc_cap = flags; } } static int mmc_getdiscinfo_cdrom(struct scsipi_periph *periph, struct mmc_discinfo *mmc_discinfo) { struct scsipi_read_toc gtoc_cmd; struct scsipi_toc_header *toc_hdr; struct scsipi_toc_msinfo *toc_msinfo; const uint32_t buffer_size = 1024; uint32_t req_size; uint8_t *buffer; int error, flags; buffer = malloc(buffer_size, M_TEMP, M_WAITOK); /* * Fabricate mmc_discinfo for CD-ROM. Some values are really `dont * care' but others might be of interest to programs. */ mmc_discinfo->disc_state = MMC_STATE_FULL; mmc_discinfo->last_session_state = MMC_STATE_FULL; mmc_discinfo->bg_format_state = MMC_BGFSTATE_COMPLETED; mmc_discinfo->link_block_penalty = 7; /* not relevant */ /* get number of sessions and first tracknr in last session */ flags = XS_CTL_DATA_IN; memset(&gtoc_cmd, 0, sizeof(gtoc_cmd)); gtoc_cmd.opcode = READ_TOC; gtoc_cmd.addr_mode = CD_MSF; /* not relevant */ gtoc_cmd.resp_format = CD_TOC_MSINFO; /* multisession info */ gtoc_cmd.from_track = 0; /* reserved, must be 0 */ req_size = sizeof(*toc_hdr) + sizeof(*toc_msinfo); _lto2b(req_size, gtoc_cmd.data_len); error = scsipi_command(periph, (void *)&gtoc_cmd, sizeof(gtoc_cmd), (void *)buffer, req_size, CDRETRIES, 30000, NULL, flags); if (error) goto out; toc_hdr = (struct scsipi_toc_header *) buffer; toc_msinfo = (struct scsipi_toc_msinfo *) (buffer + 4); mmc_discinfo->num_sessions = toc_hdr->last - toc_hdr->first + 1; mmc_discinfo->first_track = toc_hdr->first; mmc_discinfo->first_track_last_session = toc_msinfo->tracknr; /* get last track of last session */ flags = XS_CTL_DATA_IN; gtoc_cmd.resp_format = CD_TOC_FORM; /* formatted toc */ req_size = sizeof(*toc_hdr); _lto2b(req_size, gtoc_cmd.data_len); error = scsipi_command(periph, (void *)&gtoc_cmd, sizeof(gtoc_cmd), (void *)buffer, req_size, CDRETRIES, 30000, NULL, flags); if (error) goto out; toc_hdr = (struct scsipi_toc_header *) buffer; mmc_discinfo->last_track_last_session = toc_hdr->last; mmc_discinfo->num_tracks = toc_hdr->last - toc_hdr->first + 1; /* TODO how to handle disc_barcode and disc_id */ /* done */ out: free(buffer, M_TEMP); return error; } static int mmc_getdiscinfo_dvdrom(struct scsipi_periph *periph, struct mmc_discinfo *mmc_discinfo) { struct scsipi_read_toc gtoc_cmd; struct scsipi_toc_header toc_hdr; uint32_t req_size; int error, flags; /* * Fabricate mmc_discinfo for DVD-ROM. Some values are really `dont * care' but others might be of interest to programs. 
*/ mmc_discinfo->disc_state = MMC_STATE_FULL; mmc_discinfo->last_session_state = MMC_STATE_FULL; mmc_discinfo->bg_format_state = MMC_BGFSTATE_COMPLETED; mmc_discinfo->link_block_penalty = 16; /* not relevant */ /* get number of sessions and first tracknr in last session */ flags = XS_CTL_DATA_IN; memset(&gtoc_cmd, 0, sizeof(gtoc_cmd)); gtoc_cmd.opcode = READ_TOC; gtoc_cmd.addr_mode = 0; /* LBA */ gtoc_cmd.resp_format = CD_TOC_FORM; /* multisession info */ gtoc_cmd.from_track = 1; /* first track */ req_size = sizeof(toc_hdr); _lto2b(req_size, gtoc_cmd.data_len); error = scsipi_command(periph, (void *)&gtoc_cmd, sizeof(gtoc_cmd), (void *)&toc_hdr, req_size, CDRETRIES, 30000, NULL, flags); if (error) return error; /* DVD-ROM squashes the track/session space */ mmc_discinfo->num_sessions = toc_hdr.last - toc_hdr.first + 1; mmc_discinfo->num_tracks = mmc_discinfo->num_sessions; mmc_discinfo->first_track = toc_hdr.first; mmc_discinfo->first_track_last_session = toc_hdr.last; mmc_discinfo->last_track_last_session = toc_hdr.last; /* TODO how to handle disc_barcode and disc_id */ /* done */ return 0; } static int mmc_getdiscinfo(struct scsipi_periph *periph, struct mmc_discinfo *mmc_discinfo) { struct scsipi_get_configuration gc_cmd; struct scsipi_get_conf_data *gc; struct scsipi_get_conf_feature *gcf; struct scsipi_read_discinfo di_cmd; struct scsipi_read_discinfo_data di __aligned(2); const uint32_t buffer_size = 0x200; /* XXX RPZ USB3 SCSI size issue */ uint32_t pos; u_long last_lba = 0; uint8_t *buffer, *fpos; int feature, last_feature, features_len, feature_cur, feature_len; int lsb, msb, error, flags; buffer = malloc(buffer_size, M_TEMP, M_WAITOK); /* initialise structure */ memset(mmc_discinfo, 0, sizeof(struct mmc_discinfo)); mmc_discinfo->mmc_profile = 0x00; /* unknown */ mmc_discinfo->mmc_class = MMC_CLASS_UNKN; mmc_discinfo->mmc_cur = 0; mmc_discinfo->mmc_cap = 0; mmc_discinfo->link_block_penalty = 0; /* determine mmc profile and class */ flags = XS_CTL_DATA_IN; memset(&gc_cmd, 0, sizeof(gc_cmd)); gc_cmd.opcode = GET_CONFIGURATION; _lto2b(GET_CONF_NO_FEATURES_LEN, gc_cmd.data_len); gc = (struct scsipi_get_conf_data *) buffer; error = scsipi_command(periph, (void *)&gc_cmd, sizeof(gc_cmd), (void *) gc, GET_CONF_NO_FEATURES_LEN, CDRETRIES, 30000, NULL, flags); if (error) goto out; mmc_discinfo->mmc_profile = _2btol(gc->mmc_profile); mmc_discinfo->mmc_class = mmc_profile2class(mmc_discinfo->mmc_profile); /* assume 2048 sector size unless told otherwise */ mmc_discinfo->sector_size = 2048; error = read_cd_capacity(periph, &mmc_discinfo->sector_size, &last_lba); if (error) goto out; mmc_discinfo->last_possible_lba = (uint32_t) last_lba; /* Read in all features to determine device capabilities */ last_feature = feature = 0; do { /* determine mmc profile and class */ flags = XS_CTL_DATA_IN; memset(&gc_cmd, 0, sizeof(gc_cmd)); gc_cmd.opcode = GET_CONFIGURATION; _lto2b(last_feature, gc_cmd.start_at_feature); _lto2b(buffer_size, gc_cmd.data_len); memset(gc, 0, buffer_size); error = scsipi_command(periph, (void *)&gc_cmd, sizeof(gc_cmd), (void *) gc, buffer_size, CDRETRIES, 30000, NULL, flags); if (error) { /* ieeek... break out of loop... 
i dunno what to do */ break; } features_len = _4btol(gc->data_len); if (features_len < 4 || features_len > buffer_size) break; pos = 0; fpos = &gc->feature_desc[0]; while (pos < features_len - 4) { gcf = (struct scsipi_get_conf_feature *) fpos; feature = _2btol(gcf->featurecode); feature_cur = gcf->flags & 1; feature_len = gcf->additional_length; mmc_process_feature(mmc_discinfo, feature, feature_cur, gcf->feature_dependent); last_feature = MAX(last_feature, feature); #ifdef DIAGNOSTIC /* assert((feature_len & 3) == 0); */ if ((feature_len & 3) != 0) { printf("feature %d having length %d\n", feature, feature_len); } #endif pos += 4 + feature_len; fpos += 4 + feature_len; } /* unlikely to ever grow past our 1kb buffer */ } while (features_len >= 0xffff); /* * Fixup CD-RW drives that are on crack. * * Some drives report the capability to incrementally write * sequentially on CD-R(W) media... nice, but this should not be * active for a fixed packet formatted CD-RW media. Other report the * ability of HW_DEFECTFREE even when the media is NOT MRW * formatted.... */ if (mmc_discinfo->mmc_profile == 0x0a) { if ((mmc_discinfo->mmc_cur & MMC_CAP_SEQUENTIAL) == 0) mmc_discinfo->mmc_cur |= MMC_CAP_STRICTOVERWRITE; if (mmc_discinfo->mmc_cur & MMC_CAP_STRICTOVERWRITE) mmc_discinfo->mmc_cur &= ~MMC_CAP_SEQUENTIAL; if (mmc_discinfo->mmc_cur & MMC_CAP_MRW) { mmc_discinfo->mmc_cur &= ~MMC_CAP_SEQUENTIAL; mmc_discinfo->mmc_cur &= ~MMC_CAP_STRICTOVERWRITE; } else { mmc_discinfo->mmc_cur &= ~MMC_CAP_HW_DEFECTFREE; } } if (mmc_discinfo->mmc_profile == 0x09) { mmc_discinfo->mmc_cur &= ~MMC_CAP_REWRITABLE; } #ifdef DEBUG printf("CD mmc %d, mmc_cur 0x%"PRIx64", mmc_cap 0x%"PRIx64"\n", mmc_discinfo->mmc_profile, mmc_discinfo->mmc_cur, mmc_discinfo->mmc_cap); #endif /* read in disc state and number of sessions and tracks */ flags = XS_CTL_DATA_IN | XS_CTL_SILENT; memset(&di_cmd, 0, sizeof(di_cmd)); di_cmd.opcode = READ_DISCINFO; di_cmd.data_len[1] = READ_DISCINFO_BIGSIZE; error = scsipi_command(periph, (void *)&di_cmd, sizeof(di_cmd), (void *)&di, READ_DISCINFO_BIGSIZE, CDRETRIES, 30000, NULL, flags); if (error) { /* discinfo call failed, emulate for cd-rom/dvd-rom */ if (mmc_discinfo->mmc_profile == 0x08) /* CD-ROM */ error = mmc_getdiscinfo_cdrom(periph, mmc_discinfo); else if (mmc_discinfo->mmc_profile == 0x10) /* DVD-ROM */ error = mmc_getdiscinfo_dvdrom(periph, mmc_discinfo); else /* CD/DVD drive is violating specs */ error = EIO; goto out; } /* call went OK */ mmc_discinfo->disc_state = di.disc_state & 3; mmc_discinfo->last_session_state = (di.disc_state >> 2) & 3; mmc_discinfo->bg_format_state = (di.disc_state2 & 3); lsb = di.num_sessions_lsb; msb = di.num_sessions_msb; mmc_discinfo->num_sessions = lsb | (msb << 8); mmc_discinfo->first_track = di.first_track; lsb = di.first_track_last_session_lsb; msb = di.first_track_last_session_msb; mmc_discinfo->first_track_last_session = lsb | (msb << 8); lsb = di.last_track_last_session_lsb; msb = di.last_track_last_session_msb; mmc_discinfo->last_track_last_session = lsb | (msb << 8); mmc_discinfo->num_tracks = mmc_discinfo->last_track_last_session - mmc_discinfo->first_track + 1; /* set misc. 
flags and parameters from this disc info */ if (di.disc_state & 16) mmc_discinfo->mmc_cur |= MMC_CAP_BLANKABLE; if (di.disc_state2 & 128) { mmc_discinfo->disc_id = _4btol(di.discid); mmc_discinfo->disc_flags |= MMC_DFLAGS_DISCIDVALID; } if (di.disc_state2 & 64) { mmc_discinfo->disc_barcode = _8btol(di.disc_bar_code); mmc_discinfo->disc_flags |= MMC_DFLAGS_BARCODEVALID; } if (di.disc_state2 & 32) mmc_discinfo->disc_flags |= MMC_DFLAGS_UNRESTRICTED; if (di.disc_state2 & 16) { mmc_discinfo->application_code = di.application_code; mmc_discinfo->disc_flags |= MMC_DFLAGS_APPCODEVALID; } /* done */ out: free(buffer, M_TEMP); return error; } static int mmc_gettrackinfo_cdrom(struct scsipi_periph *periph, struct mmc_trackinfo *trackinfo) { struct scsipi_read_toc gtoc_cmd; struct scsipi_toc_header *toc_hdr; struct scsipi_toc_rawtoc *rawtoc; uint32_t track_start, track_size; uint32_t last_recorded, next_writable; uint32_t lba, next_track_start, lead_out; const uint32_t buffer_size = 4 * 1024; /* worst case TOC estimate */ uint8_t *buffer; uint8_t track_sessionnr, sessionnr, adr, tno, point; uint8_t control, tmin, tsec, tframe, pmin, psec, pframe; int size, req_size; int error, flags; buffer = malloc(buffer_size, M_TEMP, M_WAITOK); /* * Emulate read trackinfo for CD-ROM using the raw-TOC. * * Not all information is present and this presents a problem. Track * starts are known for each track but other values are deducted. * * For a complete overview of `magic' values used here, see the * SCSI/ATAPI MMC documentation. Note that the `magic' values have no * names, they are specified as numbers. */ /* get raw toc to process, first header to check size */ flags = XS_CTL_DATA_IN | XS_CTL_SILENT; memset(&gtoc_cmd, 0, sizeof(gtoc_cmd)); gtoc_cmd.opcode = READ_TOC; gtoc_cmd.addr_mode = CD_MSF; /* not relevant */ gtoc_cmd.resp_format = CD_TOC_RAW; /* raw toc */ gtoc_cmd.from_track = 1; /* first session */ req_size = sizeof(*toc_hdr); _lto2b(req_size, gtoc_cmd.data_len); error = scsipi_command(periph, (void *)&gtoc_cmd, sizeof(gtoc_cmd), (void *)buffer, req_size, CDRETRIES, 30000, NULL, flags); if (error) goto out; toc_hdr = (struct scsipi_toc_header *) buffer; if (_2btol(toc_hdr->length) > buffer_size - 2) { #ifdef DIAGNOSTIC printf("increase buffersize in mmc_readtrackinfo_cdrom\n"); #endif error = ENOBUFS; goto out; } /* read in complete raw toc */ req_size = _2btol(toc_hdr->length); req_size = 2*((req_size + 1) / 2); /* for ATAPI */ _lto2b(req_size, gtoc_cmd.data_len); error = scsipi_command(periph, (void *)&gtoc_cmd, sizeof(gtoc_cmd), (void *)buffer, req_size, CDRETRIES, 30000, NULL, flags); if (error) goto out; toc_hdr = (struct scsipi_toc_header *) buffer; rawtoc = (struct scsipi_toc_rawtoc *) (buffer + 4); track_start = 0; track_size = 0; last_recorded = 0; next_writable = 0; flags = 0; next_track_start = 0; track_sessionnr = MAXTRACK; /* by definition */ lead_out = 0; size = req_size - sizeof(struct scsipi_toc_header) + 1; while (size > 0) { /* get track start and session end */ tno = rawtoc->tno; sessionnr = rawtoc->sessionnr; adr = rawtoc->adrcontrol >> 4; control = rawtoc->adrcontrol & 0xf; point = rawtoc->point; tmin = rawtoc->min; tsec = rawtoc->sec; tframe = rawtoc->frame; pmin = rawtoc->pmin; psec = rawtoc->psec; pframe = rawtoc->pframe; if (tno == 0 && sessionnr && adr == 1) { lba = hmsf2lba(0, pmin, psec, pframe); if (point == trackinfo->tracknr) { track_start = lba; track_sessionnr = sessionnr; } if (point == trackinfo->tracknr + 1) { /* estimate size */ track_size = lba - track_start; 
next_track_start = lba; } if (point == 0xa2) { lead_out = lba; } if (point <= 0x63) { /* CD's ok, DVD are glued */ /* last_tracknr = point; */ } if (sessionnr == track_sessionnr) { last_recorded = lead_out; } } if (tno == 0 && sessionnr && adr == 5) { lba = hmsf2lba(0, tmin, tsec, tframe); if (sessionnr == track_sessionnr) { next_writable = lba; } } if ((control & (3<<2)) == 4) /* 01xxb */ flags |= MMC_TRACKINFO_DATA; if ((control & (1<<2)) == 0) { /* x0xxb */ flags |= MMC_TRACKINFO_AUDIO; if (control & 1) /* xxx1b */ flags |= MMC_TRACKINFO_PRE_EMPH; } rawtoc++; size -= sizeof(struct scsipi_toc_rawtoc); } /* process found values; some voodoo */ /* if no tracksize tracknr is the last of the disc */ if ((track_size == 0) && last_recorded) { track_size = last_recorded - track_start; } /* if last_recorded < tracksize, tracksize is overestimated */ if (last_recorded) { if (last_recorded - track_start <= track_size) { track_size = last_recorded - track_start; flags |= MMC_TRACKINFO_LRA_VALID; } } /* check if its a the last track of the sector */ if (next_writable) { if (next_track_start > next_writable) flags |= MMC_TRACKINFO_NWA_VALID; } /* no flag set -> no values */ if ((flags & MMC_TRACKINFO_LRA_VALID) == 0) last_recorded = 0; if ((flags & MMC_TRACKINFO_NWA_VALID) == 0) next_writable = 0; /* fill in */ /* trackinfo->tracknr preserved */ trackinfo->sessionnr = track_sessionnr; trackinfo->track_mode = 7; /* data, incremental */ trackinfo->data_mode = 8; /* 2048 bytes mode1 */ trackinfo->flags = flags; trackinfo->track_start = track_start; trackinfo->next_writable = next_writable; trackinfo->free_blocks = 0; trackinfo->packet_size = 1; trackinfo->track_size = track_size; trackinfo->last_recorded = last_recorded; out: free(buffer, M_TEMP); return error; } static int mmc_gettrackinfo_dvdrom(struct scsipi_periph *periph, struct mmc_trackinfo *trackinfo) { struct scsipi_read_toc gtoc_cmd; struct scsipi_toc_header *toc_hdr; struct scsipi_toc_formatted *toc; uint32_t tracknr, track_start, track_size; uint32_t lba, lead_out; const uint32_t buffer_size = 4 * 1024; /* worst case TOC estimate */ uint8_t *buffer; uint8_t control, last_tracknr; int size, req_size; int error, flags; buffer = malloc(buffer_size, M_TEMP, M_WAITOK); /* * Emulate read trackinfo for DVD-ROM. We can't use the raw-TOC as the * CD-ROM emulation uses since the specification tells us that no such * thing is defined for DVD's. The reason for this is due to the large * number of tracks and that would clash with the `magic' values. This * suxs. * * Not all information is present and this presents a problem. * Track starts are known for each track but other values are * deducted. 
*/ /* get formatted toc to process, first header to check size */ flags = XS_CTL_DATA_IN | XS_CTL_SILENT; memset(&gtoc_cmd, 0, sizeof(gtoc_cmd)); gtoc_cmd.opcode = READ_TOC; gtoc_cmd.addr_mode = 0; /* lba's please */ gtoc_cmd.resp_format = CD_TOC_FORM; /* formatted toc */ gtoc_cmd.from_track = 1; /* first track */ req_size = sizeof(*toc_hdr); _lto2b(req_size, gtoc_cmd.data_len); error = scsipi_command(periph, (void *)&gtoc_cmd, sizeof(gtoc_cmd), (void *)buffer, req_size, CDRETRIES, 30000, NULL, flags); if (error) goto out; toc_hdr = (struct scsipi_toc_header *) buffer; if (_2btol(toc_hdr->length) > buffer_size - 2) { #ifdef DIAGNOSTIC printf("increase buffersize in mmc_readtrackinfo_dvdrom\n"); #endif error = ENOBUFS; goto out; } /* read in complete formatted toc */ req_size = _2btol(toc_hdr->length); _lto2b(req_size, gtoc_cmd.data_len); error = scsipi_command(periph, (void *)&gtoc_cmd, sizeof(gtoc_cmd), (void *)buffer, req_size, CDRETRIES, 30000, NULL, flags); if (error) goto out; toc_hdr = (struct scsipi_toc_header *) buffer; toc = (struct scsipi_toc_formatted *) (buffer + 4); /* as in read disc info, all sessions are converted to tracks */ /* track 1.. -> offsets, sizes can be (roughly) estimated (16 ECC) */ /* last track -> we got the size from the lead-out */ tracknr = 0; last_tracknr = toc_hdr->last; track_start = 0; track_size = 0; lead_out = 0; flags = 0; size = req_size - sizeof(struct scsipi_toc_header) + 1; while (size > 0) { /* remember, DVD-ROM: tracknr == sessionnr */ lba = _4btol(toc->msf_lba); tracknr = toc->tracknr; control = toc->adrcontrol & 0xf; if (trackinfo->tracknr == tracknr) { track_start = lba; } if (trackinfo->tracknr == tracknr+1) { track_size = lba - track_start; track_size -= 16; /* link block ? */ } if (tracknr == 0xAA) { lead_out = lba; } if ((control & (3<<2)) == 4) /* 01xxb */ flags |= MMC_TRACKINFO_DATA; if ((control & (1<<2)) == 0) { /* x0xxb */ flags |= MMC_TRACKINFO_AUDIO; if (control & (1<<3)) /* 10xxb */ flags |= MMC_TRACKINFO_AUDIO_4CHAN; if (control & 1) /* xxx1b */ flags |= MMC_TRACKINFO_PRE_EMPH; } toc++; size -= sizeof(struct scsipi_toc_formatted); } if (trackinfo->tracknr == last_tracknr) { track_size = lead_out - track_start; } /* fill in */ /* trackinfo->tracknr preserved */ trackinfo->sessionnr = trackinfo->tracknr; trackinfo->track_mode = 0; /* unknown */ trackinfo->data_mode = 8; /* 2048 bytes mode1 */ trackinfo->flags = flags; trackinfo->track_start = track_start; trackinfo->next_writable = 0; trackinfo->free_blocks = 0; trackinfo->packet_size = 16; /* standard length 16 blocks ECC */ trackinfo->track_size = track_size; trackinfo->last_recorded = 0; out: free(buffer, M_TEMP); return error; } static int mmc_gettrackinfo(struct scsipi_periph *periph, struct mmc_trackinfo *trackinfo) { struct scsipi_read_trackinfo ti_cmd; struct scsipi_read_trackinfo_data ti __aligned(2); struct scsipi_get_configuration gc_cmd; struct scsipi_get_conf_data gc __aligned(2); int error, flags; int mmc_profile; /* set up SCSI call with track number from trackinfo.tracknr */ flags = XS_CTL_DATA_IN | XS_CTL_SILENT; memset(&ti_cmd, 0, sizeof(ti_cmd)); ti_cmd.opcode = READ_TRACKINFO; ti_cmd.addr_type = READ_TRACKINFO_ADDR_TRACK; ti_cmd.data_len[1] = READ_TRACKINFO_RETURNSIZE; /* trackinfo.tracknr contains number of tracks to query */ _lto4b(trackinfo->tracknr, ti_cmd.address); error = scsipi_command(periph, (void *)&ti_cmd, sizeof(ti_cmd), (void *)&ti, READ_TRACKINFO_RETURNSIZE, CDRETRIES, 30000, NULL, flags); if (error) { /* trackinfo call failed, emulate for 
cd-rom/dvd-rom */ /* first determine mmc profile */ flags = XS_CTL_DATA_IN; memset(&gc_cmd, 0, sizeof(gc_cmd)); gc_cmd.opcode = GET_CONFIGURATION; _lto2b(GET_CONF_NO_FEATURES_LEN, gc_cmd.data_len); error = scsipi_command(periph, (void *)&gc_cmd, sizeof(gc_cmd), (void *)&gc, GET_CONF_NO_FEATURES_LEN, CDRETRIES, 30000, NULL, flags); if (error) return error; mmc_profile = _2btol(gc.mmc_profile); /* choose emulation */ if (mmc_profile == 0x08) /* CD-ROM */ return mmc_gettrackinfo_cdrom(periph, trackinfo); if (mmc_profile == 0x10) /* DVD-ROM */ return mmc_gettrackinfo_dvdrom(periph, trackinfo); /* CD/DVD drive is violating specs */ return EIO; } /* (re)initialise structure */ memset(trackinfo, 0, sizeof(struct mmc_trackinfo)); /* account for short returns screwing up track and session msb */ if ((ti.data_len[1] | (ti.data_len[0] << 8)) <= 32) { ti.track_msb = 0; ti.session_msb = 0; } trackinfo->tracknr = ti.track_lsb | (ti.track_msb << 8); trackinfo->sessionnr = ti.session_lsb | (ti.session_msb << 8); trackinfo->track_mode = ti.track_info_1 & 0xf; trackinfo->data_mode = ti.track_info_2 & 0xf; flags = 0; if (ti.track_info_1 & 0x10) flags |= MMC_TRACKINFO_COPY; if (ti.track_info_1 & 0x20) flags |= MMC_TRACKINFO_DAMAGED; if (ti.track_info_2 & 0x10) flags |= MMC_TRACKINFO_FIXED_PACKET; if (ti.track_info_2 & 0x20) flags |= MMC_TRACKINFO_INCREMENTAL; if (ti.track_info_2 & 0x40) flags |= MMC_TRACKINFO_BLANK; if (ti.track_info_2 & 0x80) flags |= MMC_TRACKINFO_RESERVED; if (ti.data_valid & 0x01) flags |= MMC_TRACKINFO_NWA_VALID; if (ti.data_valid & 0x02) flags |= MMC_TRACKINFO_LRA_VALID; if ((trackinfo->track_mode & (3<<2)) == 4) /* 01xxb */ flags |= MMC_TRACKINFO_DATA; if ((trackinfo->track_mode & (1<<2)) == 0) { /* x0xxb */ flags |= MMC_TRACKINFO_AUDIO; if (trackinfo->track_mode & (1<<3)) /* 10xxb */ flags |= MMC_TRACKINFO_AUDIO_4CHAN; if (trackinfo->track_mode & 1) /* xxx1b */ flags |= MMC_TRACKINFO_PRE_EMPH; } trackinfo->flags = flags; trackinfo->track_start = _4btol(ti.track_start); trackinfo->next_writable = _4btol(ti.next_writable); trackinfo->free_blocks = _4btol(ti.free_blocks); trackinfo->packet_size = _4btol(ti.packet_size); trackinfo->track_size = _4btol(ti.track_size); trackinfo->last_recorded = _4btol(ti.last_recorded); return 0; } static int mmc_doclose(struct scsipi_periph *periph, int param, int func) { struct scsipi_close_tracksession close_cmd; int error, flags; /* set up SCSI call with track number */ flags = XS_CTL_DATA_OUT; memset(&close_cmd, 0, sizeof(close_cmd)); close_cmd.opcode = CLOSE_TRACKSESSION; close_cmd.function = func; _lto2b(param, close_cmd.tracksessionnr); error = scsipi_command(periph, (void *) &close_cmd, sizeof(close_cmd), NULL, 0, CDRETRIES, 120000, NULL, flags); return error; } static int mmc_do_closetrack(struct scsipi_periph *periph, struct mmc_op *mmc_op) { int mmc_profile = mmc_op->mmc_profile; switch (mmc_profile) { case 0x12 : /* DVD-RAM */ case 0x1a : /* DVD+RW */ case 0x2a : /* DVD+RW Dual layer */ case 0x42 : /* BD-R Random Recording (RRM) */ case 0x43 : /* BD-RE */ case 0x52 : /* HD DVD-RW ; DVD-RAM like */ return EINVAL; } return mmc_doclose(periph, mmc_op->tracknr, 1); } static int mmc_do_close_or_finalise(struct scsipi_periph *periph, struct mmc_op *mmc_op) { uint8_t blob[MS5LEN], *page5; int mmc_profile = mmc_op->mmc_profile; int func, close, flags; int error; close = (mmc_op->operation == MMC_OP_CLOSESESSION); switch (mmc_profile) { case 0x09 : /* CD-R */ case 0x0a : /* CD-RW */ /* Special case : need to update MS field in mode page 5 */ 
memset(blob, 0, sizeof(blob)); page5 = blob+8; flags = XS_CTL_DATA_IN; error = scsipi_mode_sense_big(periph, SMS_PF, 5, (void *)blob, sizeof(blob), flags, CDRETRIES, 20000); if (error) return error; /* set multi session field when closing a session only */ page5[3] &= 63; if (close) page5[3] |= 3 << 6; flags = XS_CTL_DATA_OUT; error = scsipi_mode_select_big(periph, SMS_PF, (void *)blob, sizeof(blob), flags, CDRETRIES, 20000); if (error) return error; /* and use function 2 */ func = 2; break; case 0x11 : /* DVD-R (DL) */ case 0x13 : /* DVD-RW restricted overwrite */ case 0x14 : /* DVD-RW sequential */ func = close ? 2 : 3; break; case 0x1b : /* DVD+R */ case 0x2b : /* DVD+R Dual layer */ case 0x51 : /* HD DVD-R */ case 0x41 : /* BD-R Sequential recording (SRM) */ func = close ? 2 : 6; break; case 0x12 : /* DVD-RAM */ case 0x1a : /* DVD+RW */ case 0x2a : /* DVD+RW Dual layer */ case 0x42 : /* BD-R Random Recording (RRM) */ case 0x43 : /* BD-RE */ case 0x52 : /* HD DVD-RW; DVD-RAM like */ return EINVAL; default: printf("MMC close/finalise passed wrong device type! (%d)\n", mmc_profile); return EINVAL; } return mmc_doclose(periph, mmc_op->sessionnr, func); } static int mmc_do_reserve_track(struct scsipi_periph *periph, struct mmc_op *mmc_op) { struct scsipi_reserve_track reserve_cmd; uint32_t extent; int error, flags; /* TODO make mmc safeguards? */ extent = mmc_op->extent; /* TODO min/max support? */ /* set up SCSI call with requested space */ flags = XS_CTL_DATA_OUT; memset(&reserve_cmd, 0, sizeof(reserve_cmd)); reserve_cmd.opcode = RESERVE_TRACK; _lto4b(extent, reserve_cmd.reservation_size); error = scsipi_command(periph, (void *) &reserve_cmd, sizeof(reserve_cmd), NULL, 0, CDRETRIES, 30000, NULL, flags); return error; } static int mmc_do_reserve_track_nwa(struct scsipi_periph *periph, struct mmc_op *mmc_op) { /* XXX assumes that NWA given is valid */ switch (mmc_op->mmc_profile) { case 0x09 : /* CD-R */ /* XXX unknown boundary checks XXX */ if (mmc_op->extent <= 152) return EINVAL; /* CD-R takes 152 sectors to close track */ mmc_op->extent -= 152; return mmc_do_reserve_track(periph, mmc_op); case 0x11 : /* DVD-R (DL) */ case 0x1b : /* DVD+R */ case 0x2b : /* DVD+R Dual layer */ if (mmc_op->extent % 16) return EINVAL; /* upto one ECC block of 16 sectors lost */ mmc_op->extent -= 16; return mmc_do_reserve_track(periph, mmc_op); case 0x41 : /* BD-R Sequential recording (SRM) */ case 0x51 : /* HD DVD-R */ if (mmc_op->extent % 32) return EINVAL; /* one ECC block of 32 sectors lost (AFAIK) */ mmc_op->extent -= 32; return mmc_do_reserve_track(periph, mmc_op); } /* unknown behaviour or invalid disc type */ return EINVAL; } static int mmc_do_repair_track(struct scsipi_periph *periph, struct mmc_op *mmc_op) { struct scsipi_repair_track repair_cmd; int error, flags; /* TODO make mmc safeguards? 
*/ /* set up SCSI call with track number */ flags = XS_CTL_DATA_OUT; memset(&repair_cmd, 0, sizeof(repair_cmd)); repair_cmd.opcode = REPAIR_TRACK; _lto2b(mmc_op->tracknr, repair_cmd.tracknr); error = scsipi_command(periph, (void *) &repair_cmd, sizeof(repair_cmd), NULL, 0, CDRETRIES, 30000, NULL, flags); return error; } static int mmc_do_op(struct scsipi_periph *periph, struct mmc_op *mmc_op) { /* guard operation value */ if (mmc_op->operation < 1 || mmc_op->operation > MMC_OP_MAX) return EINVAL; /* synchronise cache is special since it doesn't rely on mmc_profile */ if (mmc_op->operation == MMC_OP_SYNCHRONISECACHE) return cdcachesync(periph, 0); /* zero mmc_profile means unknown disc so operations are not defined */ if (mmc_op->mmc_profile == 0) { #ifdef DEBUG printf("mmc_do_op called with mmc_profile = 0\n"); #endif return EINVAL; } /* do the operations */ switch (mmc_op->operation) { case MMC_OP_CLOSETRACK : return mmc_do_closetrack(periph, mmc_op); case MMC_OP_CLOSESESSION : case MMC_OP_FINALISEDISC : return mmc_do_close_or_finalise(periph, mmc_op); case MMC_OP_RESERVETRACK : return mmc_do_reserve_track(periph, mmc_op); case MMC_OP_RESERVETRACK_NWA : return mmc_do_reserve_track_nwa(periph, mmc_op); case MMC_OP_REPAIRTRACK : return mmc_do_repair_track(periph, mmc_op); case MMC_OP_UNCLOSELASTSESSION : /* TODO unclose last session support */ return EINVAL; default : printf("mmc_do_op: unhandled operation %d\n", mmc_op->operation); } return EINVAL; } static int mmc_setup_writeparams(struct scsipi_periph *periph, struct mmc_writeparams *mmc_writeparams) { struct mmc_trackinfo trackinfo; uint8_t blob[MS5LEN]; uint8_t *page5; int flags, error; int track_mode, data_mode; /* setup mode page 5 for CD only */ if (mmc_writeparams->mmc_class != MMC_CLASS_CD) return 0; memset(blob, 0, sizeof(blob)); page5 = blob+8; /* read mode page 5 (with header) */ flags = XS_CTL_DATA_IN; error = scsipi_mode_sense_big(periph, SMS_PF, 5, (void *)blob, sizeof(blob), flags, CDRETRIES, 20000); if (error) return error; /* set page length for reassurance */ page5[1] = P5LEN; /* page length */ /* write type packet/incremental */ page5[2] &= 0xf0; /* set specified mode parameters */ track_mode = mmc_writeparams->track_mode; data_mode = mmc_writeparams->data_mode; if (track_mode <= 0 || track_mode > 15) return EINVAL; if (data_mode < 1 || data_mode > 2) return EINVAL; /* if a tracknr is passed, setup according to the track */ if (mmc_writeparams->tracknr > 0) { trackinfo.tracknr = mmc_writeparams->tracknr; error = mmc_gettrackinfo(periph, &trackinfo); if (error) return error; if ((trackinfo.flags & MMC_TRACKINFO_BLANK) == 0) { track_mode = trackinfo.track_mode; data_mode = trackinfo.data_mode; } mmc_writeparams->blockingnr = trackinfo.packet_size; } /* copy track mode and data mode from trackinfo */ page5[3] &= 16; /* keep only `Copy' bit */ page5[3] |= (3 << 6) | track_mode; page5[4] &= 0xf0; /* wipe data block type */ if (data_mode == 1) { /* select ISO mode 1 (CD only) */ page5[4] |= 8; /* select session format normal disc (CD only) */ page5[8] = 0; } else { /* select ISO mode 2; XA form 1 (CD only) */ page5[4] |= 10; /* select session format CD-ROM XA disc (CD only) */ page5[8] = 0x20; } if (mmc_writeparams->mmc_cur & MMC_CAP_SEQUENTIAL) { if (mmc_writeparams->mmc_cur & MMC_CAP_ZEROLINKBLK) { /* set BUFE buffer underrun protection */ page5[2] |= 1<<6; } /* allow for multi session */ page5[3] |= 3 << 6; } else { /* select fixed packets */ page5[3] |= 1<<5; _lto4b(mmc_writeparams->blockingnr, &(page5[10])); } /* write 
out updated mode page 5 (with header) */ flags = XS_CTL_DATA_OUT; error = scsipi_mode_select_big(periph, SMS_PF, (void *)blob, sizeof(blob), flags, CDRETRIES, 20000); if (error) return error; return 0; } static void cd_set_geometry(struct cd_softc *cd) { struct dk_softc *dksc = &cd->sc_dksc; struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; memset(dg, 0, sizeof(*dg)); dg->dg_secperunit = cd->params.disksize; dg->dg_secsize = cd->params.blksize; dg->dg_nsectors = 100; dg->dg_ntracks = 1; disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL); }
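/*
 * Editorial note, not part of the original source: the cd.c code above leans
 * heavily on the scsipi byte-order helpers (_lto2b, _2btol, _lto3b, _lto4b,
 * _4btol, ...) to pack and parse the big-endian multi-byte fields used in
 * SCSI command blocks and MMC response data (allocation lengths, LBAs,
 * reservation sizes).  What follows is a minimal, self-contained userland
 * sketch of that packing scheme, assuming the usual "local integer <->
 * big-endian byte array" semantics; the demo_* names are illustrative only,
 * and the kernel's own implementations in sys/dev/scsipi are what the driver
 * actually uses.
 */
#include <stdint.h>
#include <stdio.h>

/* Store a local value into two big-endian bytes (cf. _lto2b). */
static void
demo_lto2b(uint32_t val, uint8_t *bytes)
{
	bytes[0] = (val >> 8) & 0xff;
	bytes[1] = val & 0xff;
}

/* Read two big-endian bytes back into a local integer (cf. _2btol). */
static uint32_t
demo_2btol(const uint8_t *bytes)
{
	return ((uint32_t)bytes[0] << 8) | bytes[1];
}

int
main(void)
{
	uint8_t data_len[2];

	/*
	 * Example: an allocation length of the kind set up for READ TOC in
	 * mmc_getdiscinfo_cdrom (header plus multisession info).
	 */
	demo_lto2b(4 + 8, data_len);
	printf("allocation length bytes: %02x %02x -> %u\n",
	    data_len[0], data_len[1], demo_2btol(data_len));
	return 0;
}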
/* $NetBSD: sysv_shm.c,v 1.142 2024/03/02 08:59:47 mlelstv Exp $ */ /*- * Copyright (c) 1999, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, and by Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1994 Adam Glass and Charles M. Hannum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Adam Glass and Charles M.
* Hannum. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sysv_shm.c,v 1.142 2024/03/02 08:59:47 mlelstv Exp $"); #ifdef _KERNEL_OPT #include "opt_sysv.h" #endif #include <sys/param.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/shm.h> #include <sys/mutex.h> #include <sys/mman.h> #include <sys/stat.h> #include <sys/sysctl.h> #include <sys/mount.h> /* XXX for <sys/syscallargs.h> */ #include <sys/syscallargs.h> #include <sys/queue.h> #include <sys/kauth.h> #include <uvm/uvm_extern.h> #include <uvm/uvm_object.h> struct shmmap_entry { SLIST_ENTRY(shmmap_entry) next; vaddr_t va; int shmid; }; int shm_nused __cacheline_aligned; struct shmid_ds * shmsegs __read_mostly; static kmutex_t shm_lock __cacheline_aligned; static kcondvar_t * shm_cv __cacheline_aligned; static int shm_last_free __cacheline_aligned; static size_t shm_committed __cacheline_aligned; static int shm_use_phys __read_mostly; static kcondvar_t shm_realloc_cv; static bool shm_realloc_state; static u_int shm_realloc_disable; struct shmmap_state { unsigned int nitems; unsigned int nrefs; SLIST_HEAD(, shmmap_entry) entries; }; extern int kern_has_sysvshm; SYSCTL_SETUP_PROTO(sysctl_ipc_shm_setup); #ifdef SHMDEBUG #define SHMPRINTF(a) printf a #else #define SHMPRINTF(a) #endif static int shmrealloc(int); /* * Find the shared memory segment permission by the index. Only used by * compat_linux to implement SHM_STAT. */ int shm_find_segment_perm_by_index(int index, struct ipc_perm *perm) { struct shmid_ds *shmseg; mutex_enter(&shm_lock); if (index < 0 || index >= shminfo.shmmni) { mutex_exit(&shm_lock); return EINVAL; } shmseg = &shmsegs[index]; memcpy(perm, &shmseg->shm_perm, sizeof(*perm)); mutex_exit(&shm_lock); return 0; } /* * Find the shared memory segment by the identifier. * => must be called with shm_lock held; */ static struct shmid_ds * shm_find_segment_by_shmid(int shmid) { int segnum; struct shmid_ds *shmseg; KASSERT(mutex_owned(&shm_lock)); segnum = IPCID_TO_IX(shmid); if (segnum < 0 || segnum >= shminfo.shmmni) return NULL; shmseg = &shmsegs[segnum]; if ((shmseg->shm_perm.mode & SHMSEG_ALLOCATED) == 0) return NULL; if ((shmseg->shm_perm.mode & (SHMSEG_REMOVED|SHMSEG_RMLINGER)) == SHMSEG_REMOVED) return NULL; if (shmseg->shm_perm._seq != IPCID_TO_SEQ(shmid)) return NULL; return shmseg; } /* * Free memory segment. 
* => must be called with shm_lock held; */ static void shm_free_segment(int segnum) { struct shmid_ds *shmseg; size_t size; bool wanted; KASSERT(mutex_owned(&shm_lock)); shmseg = &shmsegs[segnum]; SHMPRINTF(("shm freeing key 0x%lx seq 0x%x\n", shmseg->shm_perm._key, shmseg->shm_perm._seq)); size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET; wanted = (shmseg->shm_perm.mode & SHMSEG_WANTED); shmseg->_shm_internal = NULL; shm_committed -= btoc(size); shm_nused--; shmseg->shm_perm.mode = SHMSEG_FREE; shm_last_free = segnum; if (wanted == true) cv_broadcast(&shm_cv[segnum]); } /* * Delete entry from the shm map. * => must be called with shm_lock held; */ static struct uvm_object * shm_delete_mapping(struct shmmap_state *shmmap_s, struct shmmap_entry *shmmap_se) { struct uvm_object *uobj = NULL; struct shmid_ds *shmseg; int segnum; KASSERT(mutex_owned(&shm_lock)); segnum = IPCID_TO_IX(shmmap_se->shmid); shmseg = &shmsegs[segnum]; SLIST_REMOVE(&shmmap_s->entries, shmmap_se, shmmap_entry, next); shmmap_s->nitems--; shmseg->shm_dtime = time_second; if ((--shmseg->shm_nattch <= 0) && (shmseg->shm_perm.mode & SHMSEG_REMOVED)) { uobj = shmseg->_shm_internal; shm_free_segment(segnum); } return uobj; } /* * Get a non-shared shm map for that vmspace. Note, that memory * allocation might be performed with lock held. */ static struct shmmap_state * shmmap_getprivate(struct proc *p) { struct shmmap_state *oshmmap_s, *shmmap_s; struct shmmap_entry *oshmmap_se, *shmmap_se; KASSERT(mutex_owned(&shm_lock)); /* 1. A shm map with refcnt = 1, used by ourselves, thus return */ oshmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; if (oshmmap_s && oshmmap_s->nrefs == 1) return oshmmap_s; /* 2. No shm map preset - create a fresh one */ shmmap_s = kmem_zalloc(sizeof(struct shmmap_state), KM_SLEEP); shmmap_s->nrefs = 1; SLIST_INIT(&shmmap_s->entries); p->p_vmspace->vm_shm = (void *)shmmap_s; if (oshmmap_s == NULL) return shmmap_s; SHMPRINTF(("shmmap_getprivate: vm %p split (%d entries), was used by %d\n", p->p_vmspace, oshmmap_s->nitems, oshmmap_s->nrefs)); /* 3. A shared shm map, copy to a fresh one and adjust refcounts */ SLIST_FOREACH(oshmmap_se, &oshmmap_s->entries, next) { shmmap_se = kmem_alloc(sizeof(struct shmmap_entry), KM_SLEEP); shmmap_se->va = oshmmap_se->va; shmmap_se->shmid = oshmmap_se->shmid; SLIST_INSERT_HEAD(&shmmap_s->entries, shmmap_se, next); } shmmap_s->nitems = oshmmap_s->nitems; oshmmap_s->nrefs--; return shmmap_s; } /* * Lock/unlock the memory. * => must be called with shm_lock held; */ static int shm_memlock(struct shmid_ds *shmseg, int shmid, int cmd) { size_t size; int error; KASSERT(mutex_owned(&shm_lock)); size = round_page(shmseg->shm_segsz); if (cmd == SHM_LOCK && (shmseg->shm_perm.mode & SHMSEG_WIRED) == 0) { /* Wire the object and map, then tag it */ error = uvm_obj_wirepages(shmseg->_shm_internal, 0, size, NULL); if (error) return EIO; shmseg->shm_perm.mode |= SHMSEG_WIRED; } else if (cmd == SHM_UNLOCK && (shmseg->shm_perm.mode & SHMSEG_WIRED) != 0) { /* Unwire the object, then untag it */ uvm_obj_unwirepages(shmseg->_shm_internal, 0, size); shmseg->shm_perm.mode &= ~SHMSEG_WIRED; } return 0; } /* * Unmap shared memory. 
*/ int sys_shmdt(struct lwp *l, const struct sys_shmdt_args *uap, register_t *retval) { /* { syscallarg(const void *) shmaddr; } */ struct proc *p = l->l_proc; struct shmmap_state *shmmap_s1, *shmmap_s; struct shmmap_entry *shmmap_se; struct uvm_object *uobj; struct shmid_ds *shmseg; size_t size; mutex_enter(&shm_lock); /* In case of reallocation, we will wait for completion */ while (__predict_false(shm_realloc_state)) cv_wait(&shm_realloc_cv, &shm_lock); shmmap_s1 = (struct shmmap_state *)p->p_vmspace->vm_shm; if (shmmap_s1 == NULL) { mutex_exit(&shm_lock); return EINVAL; } /* Find the map entry */ SLIST_FOREACH(shmmap_se, &shmmap_s1->entries, next) if (shmmap_se->va == (vaddr_t)SCARG(uap, shmaddr)) break; if (shmmap_se == NULL) { mutex_exit(&shm_lock); return EINVAL; } shmmap_s = shmmap_getprivate(p); if (shmmap_s != shmmap_s1) { /* Map has been copied, lookup entry in new map */ SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next) if (shmmap_se->va == (vaddr_t)SCARG(uap, shmaddr)) break; if (shmmap_se == NULL) { mutex_exit(&shm_lock); return EINVAL; } } SHMPRINTF(("shmdt: vm %p: remove %d @%lx\n", p->p_vmspace, shmmap_se->shmid, shmmap_se->va)); /* Delete the entry from shm map */ uobj = shm_delete_mapping(shmmap_s, shmmap_se); shmseg = &shmsegs[IPCID_TO_IX(shmmap_se->shmid)]; size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET; mutex_exit(&shm_lock); uvm_deallocate(&p->p_vmspace->vm_map, shmmap_se->va, size); if (uobj != NULL) { uao_detach(uobj); } kmem_free(shmmap_se, sizeof(struct shmmap_entry)); return 0; } /* * Map shared memory. */ int sys_shmat(struct lwp *l, const struct sys_shmat_args *uap, register_t *retval) { /* { syscallarg(int) shmid; syscallarg(const void *) shmaddr; syscallarg(int) shmflg; } */ int error, flags = 0; struct proc *p = l->l_proc; kauth_cred_t cred = l->l_cred; struct shmid_ds *shmseg; struct shmmap_state *shmmap_s; struct shmmap_entry *shmmap_se; struct uvm_object *uobj; struct vmspace *vm; vaddr_t attach_va; vm_prot_t prot; vsize_t size; /* Allocate a new map entry and set it */ shmmap_se = kmem_alloc(sizeof(struct shmmap_entry), KM_SLEEP); shmmap_se->shmid = SCARG(uap, shmid); mutex_enter(&shm_lock); /* In case of reallocation, we will wait for completion */ while (__predict_false(shm_realloc_state)) cv_wait(&shm_realloc_cv, &shm_lock); shmseg = shm_find_segment_by_shmid(SCARG(uap, shmid)); if (shmseg == NULL) { error = EINVAL; goto err; } error = ipcperm(cred, &shmseg->shm_perm, (SCARG(uap, shmflg) & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); if (error) goto err; vm = p->p_vmspace; shmmap_s = (struct shmmap_state *)vm->vm_shm; if (shmmap_s && shmmap_s->nitems >= shminfo.shmseg) { error = EMFILE; goto err; } size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET; prot = VM_PROT_READ; if ((SCARG(uap, shmflg) & SHM_RDONLY) == 0) prot |= VM_PROT_WRITE; if (SCARG(uap, shmaddr)) { flags |= UVM_FLAG_FIXED; if (SCARG(uap, shmflg) & SHM_RND) attach_va = (vaddr_t)SCARG(uap, shmaddr) & ~(SHMLBA-1); else if (((vaddr_t)SCARG(uap, shmaddr) & (SHMLBA-1)) == 0) attach_va = (vaddr_t)SCARG(uap, shmaddr); else { error = EINVAL; goto err; } } else { /* This is just a hint to uvm_map() about where to put it. */ attach_va = p->p_emul->e_vm_default_addr(p, (vaddr_t)vm->vm_daddr, size, p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN); } /* * Create a map entry, add it to the list and increase the counters. 
*/ shmmap_s = shmmap_getprivate(p); SLIST_INSERT_HEAD(&shmmap_s->entries, shmmap_se, next); shmmap_s->nitems++; shmseg->shm_lpid = p->p_pid; shmseg->shm_nattch++; /* * Map the segment into the address space. */ uobj = shmseg->_shm_internal; uao_reference(uobj); error = uvm_map(&vm->vm_map, &attach_va, size, uobj, 0, 0, UVM_MAPFLAG(prot, prot, UVM_INH_SHARE, UVM_ADV_RANDOM, flags)); if (error) goto err_detach; /* Set the new address, and update the time */ shmmap_se->va = attach_va; shmseg->shm_atime = time_second; retval[0] = attach_va; SHMPRINTF(("shmat: vm %p: add %d @%lx\n", p->p_vmspace, shmmap_se->shmid, attach_va)); err: mutex_exit(&shm_lock); if (error && shmmap_se) { kmem_free(shmmap_se, sizeof(struct shmmap_entry)); } return error; err_detach: uao_detach(uobj); uobj = shm_delete_mapping(shmmap_s, shmmap_se); mutex_exit(&shm_lock); if (uobj != NULL) { uao_detach(uobj); } kmem_free(shmmap_se, sizeof(struct shmmap_entry)); return error; } /* * Shared memory control operations. */ int sys___shmctl50(struct lwp *l, const struct sys___shmctl50_args *uap, register_t *retval) { /* { syscallarg(int) shmid; syscallarg(int) cmd; syscallarg(struct shmid_ds *) buf; } */ struct shmid_ds shmbuf; int cmd, error; cmd = SCARG(uap, cmd); if (cmd == IPC_SET) { error = copyin(SCARG(uap, buf), &shmbuf, sizeof(shmbuf)); if (error) return error; } error = shmctl1(l, SCARG(uap, shmid), cmd, (cmd == IPC_SET || cmd == IPC_STAT) ? &shmbuf : NULL); if (error == 0 && cmd == IPC_STAT) error = copyout(&shmbuf, SCARG(uap, buf), sizeof(shmbuf)); return error; } int shmctl1(struct lwp *l, int shmid, int cmd, struct shmid_ds *shmbuf) { struct uvm_object *uobj = NULL; kauth_cred_t cred = l->l_cred; struct shmid_ds *shmseg; int error = 0; mutex_enter(&shm_lock); /* In case of reallocation, we will wait for completion */ while (__predict_false(shm_realloc_state)) cv_wait(&shm_realloc_cv, &shm_lock); shmseg = shm_find_segment_by_shmid(shmid); if (shmseg == NULL) { mutex_exit(&shm_lock); return EINVAL; } switch (cmd) { case IPC_STAT: if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_R)) != 0) break; memset(shmbuf, 0, sizeof *shmbuf); shmbuf->shm_perm = shmseg->shm_perm; shmbuf->shm_perm.mode &= 0777; shmbuf->shm_segsz = shmseg->shm_segsz; shmbuf->shm_lpid = shmseg->shm_lpid; shmbuf->shm_cpid = shmseg->shm_cpid; shmbuf->shm_nattch = shmseg->shm_nattch; shmbuf->shm_atime = shmseg->shm_atime; shmbuf->shm_dtime = shmseg->shm_dtime; shmbuf->shm_ctime = shmseg->shm_ctime; break; case IPC_SET: if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_M)) != 0) break; shmseg->shm_perm.uid = shmbuf->shm_perm.uid; shmseg->shm_perm.gid = shmbuf->shm_perm.gid; shmseg->shm_perm.mode = (shmseg->shm_perm.mode & ~ACCESSPERMS) | (shmbuf->shm_perm.mode & ACCESSPERMS); shmseg->shm_ctime = time_second; break; case IPC_RMID: if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_M)) != 0) break; shmseg->shm_perm._key = IPC_PRIVATE; shmseg->shm_perm.mode |= SHMSEG_REMOVED; if (shmseg->shm_nattch <= 0) { uobj = shmseg->_shm_internal; shm_free_segment(IPCID_TO_IX(shmid)); } break; case SHM_LOCK: case SHM_UNLOCK: if ((error = kauth_authorize_system(cred, KAUTH_SYSTEM_SYSVIPC, (cmd == SHM_LOCK) ? KAUTH_REQ_SYSTEM_SYSVIPC_SHM_LOCK : KAUTH_REQ_SYSTEM_SYSVIPC_SHM_UNLOCK, NULL, NULL, NULL)) != 0) break; error = shm_memlock(shmseg, shmid, cmd); break; default: error = EINVAL; } mutex_exit(&shm_lock); if (uobj != NULL) uao_detach(uobj); return error; } /* * Try to take an already existing segment. 
* => must be called with shm_lock held; * => called from one place, thus, inline; */ static inline int shmget_existing(struct lwp *l, const struct sys_shmget_args *uap, int mode, register_t *retval) { struct shmid_ds *shmseg; kauth_cred_t cred = l->l_cred; int segnum, error; again: KASSERT(mutex_owned(&shm_lock)); /* Find segment by key */ for (segnum = 0; segnum < shminfo.shmmni; segnum++) if ((shmsegs[segnum].shm_perm.mode & SHMSEG_ALLOCATED) && shmsegs[segnum].shm_perm._key == SCARG(uap, key)) break; if (segnum == shminfo.shmmni) { /* Not found */ return -1; } shmseg = &shmsegs[segnum]; if (shmseg->shm_perm.mode & SHMSEG_REMOVED) { /* * This segment is in the process of being allocated. Wait * until it's done, and look the key up again (in case the * allocation failed or it was freed). */ shmseg->shm_perm.mode |= SHMSEG_WANTED; error = cv_wait_sig(&shm_cv[segnum], &shm_lock); if (error) return error; goto again; } /* * First check the flags, to generate a useful error when a * segment already exists. */ if ((SCARG(uap, shmflg) & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) return EEXIST; /* Check the permission and segment size. */ error = ipcperm(cred, &shmseg->shm_perm, mode); if (error) return error; if (SCARG(uap, size) && SCARG(uap, size) > shmseg->shm_segsz) return EINVAL; *retval = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); return 0; } int sys_shmget(struct lwp *l, const struct sys_shmget_args *uap, register_t *retval) { /* { syscallarg(key_t) key; syscallarg(size_t) size; syscallarg(int) shmflg; } */ struct shmid_ds *shmseg; kauth_cred_t cred = l->l_cred; key_t key = SCARG(uap, key); size_t size; int error, mode, segnum; bool lockmem; mode = SCARG(uap, shmflg) & ACCESSPERMS; if (SCARG(uap, shmflg) & _SHM_RMLINGER) mode |= SHMSEG_RMLINGER; SHMPRINTF(("shmget: key 0x%lx size 0x%zx shmflg 0x%x mode 0x%x\n", SCARG(uap, key), SCARG(uap, size), SCARG(uap, shmflg), mode)); mutex_enter(&shm_lock); /* In case of reallocation, we will wait for completion */ while (__predict_false(shm_realloc_state)) cv_wait(&shm_realloc_cv, &shm_lock); if (key != IPC_PRIVATE) { error = shmget_existing(l, uap, mode, retval); if (error != -1) { mutex_exit(&shm_lock); return error; } if ((SCARG(uap, shmflg) & IPC_CREAT) == 0) { mutex_exit(&shm_lock); return ENOENT; } } error = 0; /* * Check the for the limits. */ size = SCARG(uap, size); if (size < shminfo.shmmin || size > shminfo.shmmax) { mutex_exit(&shm_lock); return EINVAL; } if (shm_nused >= shminfo.shmmni) { mutex_exit(&shm_lock); return ENOSPC; } size = round_page(size); if (shm_committed + btoc(size) > shminfo.shmall) { mutex_exit(&shm_lock); return ENOMEM; } /* Find the first available segment */ if (shm_last_free < 0) { for (segnum = 0; segnum < shminfo.shmmni; segnum++) if (shmsegs[segnum].shm_perm.mode & SHMSEG_FREE) break; KASSERT(segnum < shminfo.shmmni); } else { segnum = shm_last_free; shm_last_free = -1; } /* * Initialize the segment. * We will drop the lock while allocating the memory, thus mark the * segment present, but removed, that no other thread could take it. * Also, disable reallocation, while lock is dropped. 
*/ shmseg = &shmsegs[segnum]; shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED; shm_committed += btoc(size); shm_nused++; lockmem = shm_use_phys; shm_realloc_disable++; mutex_exit(&shm_lock); /* Allocate the memory object and lock it if needed */ shmseg->_shm_internal = uao_create(size, 0); if (lockmem) { /* Wire the pages and tag it */ error = uvm_obj_wirepages(shmseg->_shm_internal, 0, size, NULL); if (error) { uao_detach(shmseg->_shm_internal); mutex_enter(&shm_lock); shm_free_segment(segnum); shm_realloc_disable--; mutex_exit(&shm_lock); return error; } } /* * Please note, while segment is marked, there are no need to hold the * lock, while setting it (except shm_perm.mode). */ shmseg->shm_perm._key = SCARG(uap, key); shmseg->shm_perm._seq = (shmseg->shm_perm._seq + 1) & 0x7fff; *retval = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); shmseg->shm_perm.cuid = shmseg->shm_perm.uid = kauth_cred_geteuid(cred); shmseg->shm_perm.cgid = shmseg->shm_perm.gid = kauth_cred_getegid(cred); shmseg->shm_segsz = SCARG(uap, size); shmseg->shm_cpid = l->l_proc->p_pid; shmseg->shm_lpid = shmseg->shm_nattch = 0; shmseg->shm_atime = shmseg->shm_dtime = 0; shmseg->shm_ctime = time_second; /* * Segment is initialized. * Enter the lock, mark as allocated, and notify waiters (if any). * Also, unmark the state of reallocation. */ mutex_enter(&shm_lock); shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) | (mode & (ACCESSPERMS | SHMSEG_RMLINGER)) | SHMSEG_ALLOCATED | (lockmem ? SHMSEG_WIRED : 0); if (shmseg->shm_perm.mode & SHMSEG_WANTED) { shmseg->shm_perm.mode &= ~SHMSEG_WANTED; cv_broadcast(&shm_cv[segnum]); } shm_realloc_disable--; cv_broadcast(&shm_realloc_cv); mutex_exit(&shm_lock); return error; } void shmfork(struct vmspace *vm1, struct vmspace *vm2) { struct shmmap_state *shmmap_s; struct shmmap_entry *shmmap_se; SHMPRINTF(("shmfork %p->%p\n", vm1, vm2)); mutex_enter(&shm_lock); vm2->vm_shm = vm1->vm_shm; if (vm1->vm_shm) { shmmap_s = (struct shmmap_state *)vm1->vm_shm; SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next) shmsegs[IPCID_TO_IX(shmmap_se->shmid)].shm_nattch++; shmmap_s->nrefs++; } mutex_exit(&shm_lock); } void shmexit(struct vmspace *vm) { struct shmmap_state *shmmap_s; struct shmmap_entry *shmmap_se; mutex_enter(&shm_lock); shmmap_s = (struct shmmap_state *)vm->vm_shm; if (shmmap_s == NULL) { mutex_exit(&shm_lock); return; } vm->vm_shm = NULL; if (--shmmap_s->nrefs > 0) { SHMPRINTF(("shmexit: vm %p drop ref (%d entries), refs = %d\n", vm, shmmap_s->nitems, shmmap_s->nrefs)); SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next) { shmsegs[IPCID_TO_IX(shmmap_se->shmid)].shm_nattch--; } mutex_exit(&shm_lock); return; } SHMPRINTF(("shmexit: vm %p cleanup (%d entries)\n", vm, shmmap_s->nitems)); if (shmmap_s->nitems == 0) { mutex_exit(&shm_lock); kmem_free(shmmap_s, sizeof(struct shmmap_state)); return; } /* * Delete the entry from shm map. */ for (;;) { struct shmid_ds *shmseg; struct uvm_object *uobj; size_t sz; shmmap_se = SLIST_FIRST(&shmmap_s->entries); KASSERT(shmmap_se != NULL); shmseg = &shmsegs[IPCID_TO_IX(shmmap_se->shmid)]; sz = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET; /* shm_delete_mapping() removes from the list. 
*/ uobj = shm_delete_mapping(shmmap_s, shmmap_se); mutex_exit(&shm_lock); uvm_deallocate(&vm->vm_map, shmmap_se->va, sz); if (uobj != NULL) { uao_detach(uobj); } kmem_free(shmmap_se, sizeof(struct shmmap_entry)); if (SLIST_EMPTY(&shmmap_s->entries)) { break; } mutex_enter(&shm_lock); KASSERT(!SLIST_EMPTY(&shmmap_s->entries)); } kmem_free(shmmap_s, sizeof(struct shmmap_state)); } static int shmrealloc(int newshmni) { vaddr_t v; struct shmid_ds *oldshmsegs, *newshmsegs; kcondvar_t *newshm_cv, *oldshm_cv; size_t sz; int i, lsegid, oldshmni; if (newshmni < 1) return EINVAL; /* Allocate new memory area */ sz = ALIGN(newshmni * sizeof(struct shmid_ds)) + ALIGN(newshmni * sizeof(kcondvar_t)); sz = round_page(sz); v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); if (v == 0) return ENOMEM; mutex_enter(&shm_lock); while (shm_realloc_state || shm_realloc_disable) cv_wait(&shm_realloc_cv, &shm_lock); /* * Get the number of last segment. Fail we are trying to * reallocate less memory than we use. */ lsegid = 0; for (i = 0; i < shminfo.shmmni; i++) if ((shmsegs[i].shm_perm.mode & SHMSEG_FREE) == 0) lsegid = i; if (lsegid >= newshmni) { mutex_exit(&shm_lock); uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED); return EBUSY; } shm_realloc_state = true; newshmsegs = (void *)v; newshm_cv = (void *)((uintptr_t)newshmsegs + ALIGN(newshmni * sizeof(struct shmid_ds))); /* Copy all memory to the new area */ for (i = 0; i < shm_nused; i++) { cv_init(&newshm_cv[i], "shmwait"); (void)memcpy(&newshmsegs[i], &shmsegs[i], sizeof(newshmsegs[0])); } /* Mark as free all new segments, if there is any */ for (; i < newshmni; i++) { cv_init(&newshm_cv[i], "shmwait"); newshmsegs[i].shm_perm.mode = SHMSEG_FREE; newshmsegs[i].shm_perm._seq = 0; } oldshmsegs = shmsegs; oldshmni = shminfo.shmmni; shminfo.shmmni = newshmni; shmsegs = newshmsegs; shm_cv = newshm_cv; /* Reallocation completed - notify all waiters, if any */ shm_realloc_state = false; cv_broadcast(&shm_realloc_cv); mutex_exit(&shm_lock); /* Release now unused resources. 
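 * The new shmsegs/shm_cv arrays were published above while still holding
 * shm_lock, so the old ones can no longer be reached by other threads.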
*/ oldshm_cv = (void *)((uintptr_t)oldshmsegs + ALIGN(oldshmni * sizeof(struct shmid_ds))); for (i = 0; i < oldshmni; i++) cv_destroy(&oldshm_cv[i]); sz = ALIGN(oldshmni * sizeof(struct shmid_ds)) + ALIGN(oldshmni * sizeof(kcondvar_t)); sz = round_page(sz); uvm_km_free(kernel_map, (vaddr_t)oldshmsegs, sz, UVM_KMF_WIRED); return 0; } int shminit(void) { vaddr_t v; size_t sz; int i; mutex_init(&shm_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&shm_realloc_cv, "shmrealc"); /* Allocate the wired memory for our structures */ sz = ALIGN(shminfo.shmmni * sizeof(struct shmid_ds)) + ALIGN(shminfo.shmmni * sizeof(kcondvar_t)); sz = round_page(sz); v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); if (v == 0) { printf("sysv_shm: cannot allocate memory"); return ENOMEM; } shmsegs = (void *)v; shm_cv = (void *)((uintptr_t)shmsegs + ALIGN(shminfo.shmmni * sizeof(struct shmid_ds))); if (shminfo.shmmax == 0) shminfo.shmall = uimax(physmem / 4, 1024); else shminfo.shmall = shminfo.shmmax / PAGE_SIZE; shminfo.shmmax = (uint64_t)shminfo.shmall * PAGE_SIZE; for (i = 0; i < shminfo.shmmni; i++) { cv_init(&shm_cv[i], "shmwait"); shmsegs[i].shm_perm.mode = SHMSEG_FREE; shmsegs[i].shm_perm._seq = 0; } shm_last_free = 0; shm_nused = 0; shm_committed = 0; shm_realloc_disable = 0; shm_realloc_state = false; kern_has_sysvshm = 1; /* Load the callback function pointers for the uvm subsystem */ uvm_shmexit = shmexit; uvm_shmfork = shmfork; return 0; } int shmfini(void) { size_t sz; int i; vaddr_t v = (vaddr_t)shmsegs; mutex_enter(&shm_lock); if (shm_nused) { mutex_exit(&shm_lock); return 1; } /* Clear the callback function pointers for the uvm subsystem */ uvm_shmexit = NULL; uvm_shmfork = NULL; /* Destroy all condvars */ for (i = 0; i < shminfo.shmmni; i++) cv_destroy(&shm_cv[i]); cv_destroy(&shm_realloc_cv); /* Free the allocated/wired memory */ sz = ALIGN(shminfo.shmmni * sizeof(struct shmid_ds)) + ALIGN(shminfo.shmmni * sizeof(kcondvar_t)); sz = round_page(sz); uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED); /* Release and destroy our mutex */ mutex_exit(&shm_lock); mutex_destroy(&shm_lock); kern_has_sysvshm = 0; return 0; } static int sysctl_ipc_shmmni(SYSCTLFN_ARGS) { int newsize, error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = shminfo.shmmni; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; sysctl_unlock(); error = shmrealloc(newsize); sysctl_relock(); return error; } static int sysctl_ipc_shmmaxpgs(SYSCTLFN_ARGS) { uint32_t newsize; int error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = shminfo.shmall; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (newsize < 1) return EINVAL; shminfo.shmall = newsize; shminfo.shmmax = (uint64_t)shminfo.shmall * PAGE_SIZE; return 0; } static int sysctl_ipc_shmmax(SYSCTLFN_ARGS) { uint64_t newsize; int error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = shminfo.shmmax; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (newsize < PAGE_SIZE) return EINVAL; shminfo.shmmax = round_page(newsize); shminfo.shmall = shminfo.shmmax / PAGE_SIZE; return 0; } SYSCTL_SETUP(sysctl_ipc_shm_setup, "sysctl kern.ipc subtree setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ipc", SYSCTL_DESCR("SysV IPC options"), NULL, 0, NULL, 0, CTL_KERN, KERN_SYSVIPC, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, 
CTLTYPE_QUAD, "shmmax", SYSCTL_DESCR("Max shared memory segment size in bytes"), sysctl_ipc_shmmax, 0, &shminfo.shmmax, 0, CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMAX, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "shmmni", SYSCTL_DESCR("Max number of shared memory identifiers"), sysctl_ipc_shmmni, 0, &shminfo.shmmni, 0, CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMNI, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "shmseg", SYSCTL_DESCR("Max shared memory segments per process"), NULL, 0, &shminfo.shmseg, 0, CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMSEG, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "shmmaxpgs", SYSCTL_DESCR("Max amount of shared memory in pages"), sysctl_ipc_shmmaxpgs, 0, &shminfo.shmall, 0, CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMAXPGS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "shm_use_phys", SYSCTL_DESCR("Enable/disable locking of shared memory in " "physical memory"), NULL, 0, &shm_use_phys, 0, CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMUSEPHYS, CTL_EOL); }
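/*
 * Illustrative userland sketch (not part of the kernel sources above): roughly
 * how the System V shared memory interface implemented by sys_shmget() and the
 * related handlers, and tuned through the kern.ipc.* sysctl nodes, is typically
 * exercised.  The key, size and permission values are arbitrary examples.
 */
#include <sys/ipc.h>
#include <sys/shm.h>
#include <string.h>

static int
shm_example(void)
{
	int shmid;
	void *p;

	/* Create (or find) a 64 KiB segment for an example key. */
	shmid = shmget((key_t)0x1234, 64 * 1024, IPC_CREAT | 0600);
	if (shmid == -1)
		return -1;

	/* Attach it, touch the memory, detach, then mark it for removal. */
	p = shmat(shmid, NULL, 0);
	if (p == (void *)-1)
		return -1;
	memset(p, 0, 64 * 1024);
	(void)shmdt(p);
	(void)shmctl(shmid, IPC_RMID, NULL);
	return 0;
}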
/* $NetBSD: uipc_mbuf.c,v 1.252 2023/11/27 02:50:27 ozaki-r Exp $ */ /* * Copyright (c) 1999, 2001, 2018 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, and Maxime Villard. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uipc_mbuf.c,v 1.252 2023/11/27 02:50:27 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_mbuftrace.h" #include "opt_nmbclusters.h" #include "opt_ddb.h" #include "ether.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/proc.h> #include <sys/mbuf.h> #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/percpu.h> #include <sys/pool.h> #include <sys/socket.h> #include <sys/sysctl.h> #include <net/if.h> pool_cache_t mb_cache; /* mbuf cache */ static pool_cache_t mcl_cache; /* mbuf cluster cache */ struct mbstat mbstat; int max_linkhdr; int max_protohdr; int max_hdr; int max_datalen; static void mb_drain(void *, int); static int mb_ctor(void *, void *, int); static void sysctl_kern_mbuf_setup(void); static struct sysctllog *mbuf_sysctllog; static struct mbuf *m_copy_internal(struct mbuf *, int, int, int, bool); static struct mbuf *m_split_internal(struct mbuf *, int, int, bool); static int m_copyback_internal(struct mbuf **, int, int, const void *, int, int); /* Flags for m_copyback_internal. 
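 * CB_EXTEND and CB_COW are mutually exclusive (m_copyback_internal() asserts
 * this): m_copyback() passes CB_COPYBACK|CB_EXTEND, m_copyback_cow() passes
 * CB_COPYBACK|CB_COW, and m_makewritable() passes CB_PRESERVE|CB_COW.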
*/ #define CB_COPYBACK 0x0001 /* copyback from cp */ #define CB_PRESERVE 0x0002 /* preserve original data */ #define CB_COW 0x0004 /* do copy-on-write */ #define CB_EXTEND 0x0008 /* extend chain */ static const char mclpool_warnmsg[] = "WARNING: mclpool limit reached; increase kern.mbuf.nmbclusters"; MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); static percpu_t *mbstat_percpu; #ifdef MBUFTRACE struct mownerhead mowners = LIST_HEAD_INITIALIZER(mowners); struct mowner unknown_mowners[] = { MOWNER_INIT("unknown", "free"), MOWNER_INIT("unknown", "data"), MOWNER_INIT("unknown", "header"), MOWNER_INIT("unknown", "soname"), MOWNER_INIT("unknown", "soopts"), MOWNER_INIT("unknown", "ftable"), MOWNER_INIT("unknown", "control"), MOWNER_INIT("unknown", "oobdata"), }; struct mowner revoked_mowner = MOWNER_INIT("revoked", ""); #endif #define MEXT_ISEMBEDDED(m) ((m)->m_ext_ref == (m)) #define MCLADDREFERENCE(o, n) \ do { \ KASSERT(((o)->m_flags & M_EXT) != 0); \ KASSERT(((n)->m_flags & M_EXT) == 0); \ KASSERT((o)->m_ext.ext_refcnt >= 1); \ (n)->m_flags |= ((o)->m_flags & M_EXTCOPYFLAGS); \ atomic_inc_uint(&(o)->m_ext.ext_refcnt); \ (n)->m_ext_ref = (o)->m_ext_ref; \ mowner_ref((n), (n)->m_flags); \ } while (/* CONSTCOND */ 0) static int nmbclusters_limit(void) { #if defined(PMAP_MAP_POOLPAGE) /* direct mapping, doesn't use space in kmem_arena */ vsize_t max_size = physmem / 4; #else vsize_t max_size = MIN(physmem / 4, nkmempages / 4); #endif max_size = max_size * PAGE_SIZE / MCLBYTES; #ifdef NMBCLUSTERS_MAX max_size = MIN(max_size, NMBCLUSTERS_MAX); #endif return max_size; } /* * Initialize the mbuf allocator. */ void mbinit(void) { CTASSERT(sizeof(struct _m_ext) <= MHLEN); CTASSERT(sizeof(struct mbuf) == MSIZE); sysctl_kern_mbuf_setup(); mb_cache = pool_cache_init(msize, 0, 0, 0, "mbpl", NULL, IPL_VM, mb_ctor, NULL, NULL); KASSERT(mb_cache != NULL); mcl_cache = pool_cache_init(mclbytes, COHERENCY_UNIT, 0, 0, "mclpl", NULL, IPL_VM, NULL, NULL, NULL); KASSERT(mcl_cache != NULL); pool_cache_set_drain_hook(mb_cache, mb_drain, NULL); pool_cache_set_drain_hook(mcl_cache, mb_drain, NULL); /* * Set an arbitrary default limit on the number of mbuf clusters. */ #ifdef NMBCLUSTERS nmbclusters = MIN(NMBCLUSTERS, nmbclusters_limit()); #else nmbclusters = MAX(1024, (vsize_t)physmem * PAGE_SIZE / MCLBYTES / 16); nmbclusters = MIN(nmbclusters, nmbclusters_limit()); #endif /* * Set the hard limit on the mclpool to the number of * mbuf clusters the kernel is to support. Log the limit * reached message max once a minute. */ pool_cache_sethardlimit(mcl_cache, nmbclusters, mclpool_warnmsg, 60); mbstat_percpu = percpu_alloc(sizeof(struct mbstat_cpu)); /* * Set a low water mark for both mbufs and clusters. This should * help ensure that they can be allocated in a memory starvation * situation. This is important for e.g. diskless systems which * must allocate mbufs in order for the pagedaemon to clean pages. */ pool_cache_setlowat(mb_cache, mblowat); pool_cache_setlowat(mcl_cache, mcllowat); #ifdef MBUFTRACE { /* * Attach the unknown mowners. 
*/ int i; MOWNER_ATTACH(&revoked_mowner); for (i = sizeof(unknown_mowners)/sizeof(unknown_mowners[0]); i-- > 0; ) MOWNER_ATTACH(&unknown_mowners[i]); } #endif } static void mb_drain(void *arg, int flags) { struct domain *dp; const struct protosw *pr; struct ifnet *ifp; int s; KERNEL_LOCK(1, NULL); s = splvm(); DOMAIN_FOREACH(dp) { for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain) (*pr->pr_drain)(); } /* XXX we cannot use psref in H/W interrupt */ if (!cpu_intr_p()) { int bound = curlwp_bind(); IFNET_READER_FOREACH(ifp) { struct psref psref; if_acquire(ifp, &psref); if (ifp->if_drain) (*ifp->if_drain)(ifp); if_release(ifp, &psref); } curlwp_bindx(bound); } splx(s); mbstat.m_drain++; KERNEL_UNLOCK_ONE(NULL); } /* * sysctl helper routine for the kern.mbuf subtree. * nmbclusters, mblowat and mcllowat need range * checking and pool tweaking after being reset. */ static int sysctl_kern_mbuf(SYSCTLFN_ARGS) { int error, newval; struct sysctlnode node; node = *rnode; node.sysctl_data = &newval; switch (rnode->sysctl_num) { case MBUF_NMBCLUSTERS: case MBUF_MBLOWAT: case MBUF_MCLLOWAT: newval = *(int*)rnode->sysctl_data; break; case MBUF_NMBCLUSTERS_LIMIT: newval = nmbclusters_limit(); break; default: return EOPNOTSUPP; } error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (newval < 0) return EINVAL; switch (node.sysctl_num) { case MBUF_NMBCLUSTERS: if (newval < nmbclusters) return EINVAL; if (newval > nmbclusters_limit()) return EINVAL; nmbclusters = newval; pool_cache_sethardlimit(mcl_cache, nmbclusters, mclpool_warnmsg, 60); break; case MBUF_MBLOWAT: mblowat = newval; pool_cache_setlowat(mb_cache, mblowat); break; case MBUF_MCLLOWAT: mcllowat = newval; pool_cache_setlowat(mcl_cache, mcllowat); break; } return 0; } #ifdef MBUFTRACE static void mowner_convert_to_user_cb(void *v1, void *v2, struct cpu_info *ci) { struct mowner_counter *mc = v1; struct mowner_user *mo_user = v2; int i; for (i = 0; i < MOWNER_COUNTER_NCOUNTERS; i++) { mo_user->mo_counter[i] += mc->mc_counter[i]; } } static void mowner_convert_to_user(struct mowner *mo, struct mowner_user *mo_user) { memset(mo_user, 0, sizeof(*mo_user)); CTASSERT(sizeof(mo_user->mo_name) == sizeof(mo->mo_name)); CTASSERT(sizeof(mo_user->mo_descr) == sizeof(mo->mo_descr)); memcpy(mo_user->mo_name, mo->mo_name, sizeof(mo->mo_name)); memcpy(mo_user->mo_descr, mo->mo_descr, sizeof(mo->mo_descr)); percpu_foreach(mo->mo_counters, mowner_convert_to_user_cb, mo_user); } static int sysctl_kern_mbuf_mowners(SYSCTLFN_ARGS) { struct mowner *mo; size_t len = 0; int error = 0; if (namelen != 0) return EINVAL; if (newp != NULL) return EPERM; LIST_FOREACH(mo, &mowners, mo_link) { struct mowner_user mo_user; mowner_convert_to_user(mo, &mo_user); if (oldp != NULL) { if (*oldlenp - len < sizeof(mo_user)) { error = ENOMEM; break; } error = copyout(&mo_user, (char *)oldp + len, sizeof(mo_user)); if (error) break; } len += sizeof(mo_user); } if (error == 0) *oldlenp = len; return error; } #endif /* MBUFTRACE */ void mbstat_type_add(int type, int diff) { struct mbstat_cpu *mb; int s; s = splvm(); mb = percpu_getref(mbstat_percpu); mb->m_mtypes[type] += diff; percpu_putref(mbstat_percpu); splx(s); } static void mbstat_convert_to_user_cb(void *v1, void *v2, struct cpu_info *ci) { struct mbstat_cpu *mbsc = v1; struct mbstat *mbs = v2; int i; for (i = 0; i < __arraycount(mbs->m_mtypes); i++) { mbs->m_mtypes[i] += mbsc->m_mtypes[i]; } } static void mbstat_convert_to_user(struct mbstat *mbs) { memset(mbs, 0, 
sizeof(*mbs)); mbs->m_drain = mbstat.m_drain; percpu_foreach(mbstat_percpu, mbstat_convert_to_user_cb, mbs); } static int sysctl_kern_mbuf_stats(SYSCTLFN_ARGS) { struct sysctlnode node; struct mbstat mbs; mbstat_convert_to_user(&mbs); node = *rnode; node.sysctl_data = &mbs; node.sysctl_size = sizeof(mbs); return sysctl_lookup(SYSCTLFN_CALL(&node)); } static void sysctl_kern_mbuf_setup(void) { KASSERT(mbuf_sysctllog == NULL); sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "mbuf", SYSCTL_DESCR("mbuf control variables"), NULL, 0, NULL, 0, CTL_KERN, KERN_MBUF, CTL_EOL); sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "msize", SYSCTL_DESCR("mbuf base size"), NULL, msize, NULL, 0, CTL_KERN, KERN_MBUF, MBUF_MSIZE, CTL_EOL); sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "mclbytes", SYSCTL_DESCR("mbuf cluster size"), NULL, mclbytes, NULL, 0, CTL_KERN, KERN_MBUF, MBUF_MCLBYTES, CTL_EOL); sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nmbclusters", SYSCTL_DESCR("Limit on the number of mbuf clusters"), sysctl_kern_mbuf, 0, &nmbclusters, 0, CTL_KERN, KERN_MBUF, MBUF_NMBCLUSTERS, CTL_EOL); sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "mblowat", SYSCTL_DESCR("mbuf low water mark"), sysctl_kern_mbuf, 0, &mblowat, 0, CTL_KERN, KERN_MBUF, MBUF_MBLOWAT, CTL_EOL); sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "mcllowat", SYSCTL_DESCR("mbuf cluster low water mark"), sysctl_kern_mbuf, 0, &mcllowat, 0, CTL_KERN, KERN_MBUF, MBUF_MCLLOWAT, CTL_EOL); sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("mbuf allocation statistics"), sysctl_kern_mbuf_stats, 0, NULL, 0, CTL_KERN, KERN_MBUF, MBUF_STATS, CTL_EOL); #ifdef MBUFTRACE sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "mowners", SYSCTL_DESCR("Information about mbuf owners"), sysctl_kern_mbuf_mowners, 0, NULL, 0, CTL_KERN, KERN_MBUF, MBUF_MOWNERS, CTL_EOL); #endif sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, "nmbclusters_limit", SYSCTL_DESCR("Limit of nmbclusters"), sysctl_kern_mbuf, 0, NULL, 0, CTL_KERN, KERN_MBUF, MBUF_NMBCLUSTERS_LIMIT, CTL_EOL); } static int mb_ctor(void *arg, void *object, int flags) { struct mbuf *m = object; #ifdef POOL_VTOPHYS m->m_paddr = POOL_VTOPHYS(m); #else m->m_paddr = M_PADDR_INVALID; #endif return 0; } /* * Add mbuf to the end of a chain */ struct mbuf * m_add(struct mbuf *c, struct mbuf *m) { struct mbuf *n; if (c == NULL) return m; for (n = c; n->m_next != NULL; n = n->m_next) continue; n->m_next = m; return c; } struct mbuf * m_get(int how, int type) { struct mbuf *m; KASSERT(type != MT_FREE); m = pool_cache_get(mb_cache, how == M_WAIT ? 
PR_WAITOK|PR_LIMITFAIL : PR_NOWAIT); if (m == NULL) return NULL; KASSERTMSG(((vaddr_t)m->m_dat & PAGE_MASK) + MLEN <= PAGE_SIZE, "m=%p m->m_dat=%p" " MLEN=%u PAGE_MASK=0x%x PAGE_SIZE=%u", m, m->m_dat, (unsigned)MLEN, (unsigned)PAGE_MASK, (unsigned)PAGE_SIZE); mbstat_type_add(type, 1); mowner_init(m, type); m->m_ext_ref = m; /* default */ m->m_type = type; m->m_len = 0; m->m_next = NULL; m->m_nextpkt = NULL; /* default */ m->m_data = m->m_dat; m->m_flags = 0; /* default */ return m; } struct mbuf * m_gethdr(int how, int type) { struct mbuf *m; m = m_get(how, type); if (m == NULL) return NULL; m->m_data = m->m_pktdat; m->m_flags = M_PKTHDR; m_reset_rcvif(m); m->m_pkthdr.len = 0; m->m_pkthdr.csum_flags = 0; m->m_pkthdr.csum_data = 0; m->m_pkthdr.segsz = 0; m->m_pkthdr.ether_vtag = 0; m->m_pkthdr.pkthdr_flags = 0; SLIST_INIT(&m->m_pkthdr.tags); m->m_pkthdr.pattr_class = NULL; m->m_pkthdr.pattr_af = AF_UNSPEC; m->m_pkthdr.pattr_hdr = NULL; return m; } struct mbuf * m_get_n(int how, int type, size_t alignbytes, size_t nbytes) { struct mbuf *m; if (alignbytes > MCLBYTES || nbytes > MCLBYTES - alignbytes) return NULL; if ((m = m_get(how, type)) == NULL) return NULL; if (nbytes + alignbytes > MLEN) { m_clget(m, how); if ((m->m_flags & M_EXT) == 0) { m_free(m); return NULL; } } m->m_len = alignbytes + nbytes; m_adj(m, alignbytes); return m; } struct mbuf * m_gethdr_n(int how, int type, size_t alignbytes, size_t nbytes) { struct mbuf *m; if (nbytes > MCLBYTES || nbytes > MCLBYTES - alignbytes) return NULL; if ((m = m_gethdr(how, type)) == NULL) return NULL; if (alignbytes + nbytes > MHLEN) { m_clget(m, how); if ((m->m_flags & M_EXT) == 0) { m_free(m); return NULL; } } m->m_len = m->m_pkthdr.len = alignbytes + nbytes; m_adj(m, alignbytes); return m; } void m_clget(struct mbuf *m, int how) { m->m_ext_storage.ext_buf = (char *)pool_cache_get_paddr(mcl_cache, how == M_WAIT ? (PR_WAITOK|PR_LIMITFAIL) : PR_NOWAIT, &m->m_ext_storage.ext_paddr); if (m->m_ext_storage.ext_buf == NULL) return; KASSERTMSG((((vaddr_t)m->m_ext_storage.ext_buf & PAGE_MASK) + mclbytes <= PAGE_SIZE), "m=%p m->m_ext_storage.ext_buf=%p" " mclbytes=%u PAGE_MASK=0x%x PAGE_SIZE=%u", m, m->m_dat, (unsigned)mclbytes, (unsigned)PAGE_MASK, (unsigned)PAGE_SIZE); MCLINITREFERENCE(m); m->m_data = m->m_ext.ext_buf; m->m_flags = (m->m_flags & ~M_EXTCOPYFLAGS) | M_EXT|M_EXT_CLUSTER|M_EXT_RW; m->m_ext.ext_size = MCLBYTES; m->m_ext.ext_free = NULL; m->m_ext.ext_arg = NULL; /* ext_paddr initialized above */ mowner_ref(m, M_EXT|M_EXT_CLUSTER); } struct mbuf * m_getcl(int how, int type, int flags) { struct mbuf *mp; if ((flags & M_PKTHDR) != 0) mp = m_gethdr(how, type); else mp = m_get(how, type); if (mp == NULL) return NULL; MCLGET(mp, how); if ((mp->m_flags & M_EXT) != 0) return mp; m_free(mp); return NULL; } /* * Utility function for M_PREPEND. Do *NOT* use it directly. */ struct mbuf * m_prepend(struct mbuf *m, int len, int how) { struct mbuf *mn; if (__predict_false(len > MHLEN)) { panic("%s: len > MHLEN", __func__); } KASSERT(len != M_COPYALL); mn = m_get(how, m->m_type); if (mn == NULL) { m_freem(m); return NULL; } if (m->m_flags & M_PKTHDR) { m_move_pkthdr(mn, m); } else { MCLAIM(mn, m->m_owner); } mn->m_next = m; m = mn; if (m->m_flags & M_PKTHDR) { if (len < MHLEN) m_align(m, len); } else { if (len < MLEN) m_align(m, len); } m->m_len = len; return m; } struct mbuf * m_copym(struct mbuf *m, int off, int len, int wait) { /* Shallow copy on M_EXT. 
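 * Cluster (M_EXT) data is shared by reference via MCLADDREFERENCE(); use
 * m_dup() below when a writable deep copy is needed.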
*/ return m_copy_internal(m, off, len, wait, false); } struct mbuf * m_dup(struct mbuf *m, int off, int len, int wait) { /* Deep copy. */ return m_copy_internal(m, off, len, wait, true); } static inline int m_copylen(int len, int copylen) { return (len == M_COPYALL) ? copylen : uimin(len, copylen); } static struct mbuf * m_copy_internal(struct mbuf *m, int off0, int len, int wait, bool deep) { struct mbuf *m0 __diagused = m; int len0 __diagused = len; struct mbuf *n, **np; int off = off0; struct mbuf *top; int copyhdr = 0; if (off < 0 || (len != M_COPYALL && len < 0)) panic("%s: off %d, len %d", __func__, off, len); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = 1; while (off > 0) { if (m == NULL) panic("%s: m == NULL, off %d", __func__, off); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } np = &top; top = NULL; while (len == M_COPYALL || len > 0) { if (m == NULL) { if (len != M_COPYALL) panic("%s: m == NULL, len %d [!COPYALL]", __func__, len); break; } n = m_get(wait, m->m_type); *np = n; if (n == NULL) goto nospace; MCLAIM(n, m->m_owner); if (copyhdr) { m_copy_pkthdr(n, m); if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; copyhdr = 0; } n->m_len = m_copylen(len, m->m_len - off); if (m->m_flags & M_EXT) { if (!deep) { n->m_data = m->m_data + off; MCLADDREFERENCE(m, n); } else { /* * We don't care if MCLGET fails. n->m_len is * recomputed and handles that. */ MCLGET(n, wait); n->m_len = 0; n->m_len = M_TRAILINGSPACE(n); n->m_len = m_copylen(len, n->m_len); n->m_len = uimin(n->m_len, m->m_len - off); memcpy(mtod(n, void *), mtod(m, char *) + off, (unsigned)n->m_len); } } else { memcpy(mtod(n, void *), mtod(m, char *) + off, (unsigned)n->m_len); } if (len != M_COPYALL) len -= n->m_len; off += n->m_len; KASSERTMSG(off <= m->m_len, "m=%p m->m_len=%d off=%d len=%d m0=%p off0=%d len0=%d", m, m->m_len, off, len, m0, off0, len0); if (off == m->m_len) { m = m->m_next; off = 0; } np = &n->m_next; } return top; nospace: m_freem(top); return NULL; } /* * Copy an entire packet, including header (which must be present). * An optimization of the common case 'm_copym(m, 0, M_COPYALL, how)'. 
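 * As with m_copym(), external (cluster) storage is shared by reference
 * rather than copied.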
*/ struct mbuf * m_copypacket(struct mbuf *m, int how) { struct mbuf *top, *n, *o; if (__predict_false((m->m_flags & M_PKTHDR) == 0)) { panic("%s: no header (m = %p)", __func__, m); } n = m_get(how, m->m_type); top = n; if (!n) goto nospace; MCLAIM(n, m->m_owner); m_copy_pkthdr(n, m); n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; MCLADDREFERENCE(m, n); } else { memcpy(mtod(n, char *), mtod(m, char *), n->m_len); } m = m->m_next; while (m) { o = m_get(how, m->m_type); if (!o) goto nospace; MCLAIM(o, m->m_owner); n->m_next = o; n = n->m_next; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; MCLADDREFERENCE(m, n); } else { memcpy(mtod(n, char *), mtod(m, char *), n->m_len); } m = m->m_next; } return top; nospace: m_freem(top); return NULL; } void m_copydata(struct mbuf *m, int off, int len, void *cp) { unsigned int count; struct mbuf *m0 = m; int len0 = len; int off0 = off; void *cp0 = cp; KASSERT(len != M_COPYALL); if (off < 0 || len < 0) panic("m_copydata: off %d, len %d", off, len); while (off > 0) { if (m == NULL) panic("m_copydata(%p,%d,%d,%p): m=NULL, off=%d (%d)", m0, len0, off0, cp0, off, off0 - off); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { if (m == NULL) panic("m_copydata(%p,%d,%d,%p): " "m=NULL, off=%d (%d), len=%d (%d)", m0, len0, off0, cp0, off, off0 - off, len, len0 - len); count = uimin(m->m_len - off, len); memcpy(cp, mtod(m, char *) + off, count); len -= count; cp = (char *)cp + count; off = 0; m = m->m_next; } } /* * Concatenate mbuf chain n to m. * n might be copied into m (when n->m_len is small), therefore data portion of * n could be copied into an mbuf of different mbuf type. * Any m_pkthdr is not updated. */ void m_cat(struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; while (n) { if (M_READONLY(m) || n->m_len > M_TRAILINGSPACE(m)) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ memcpy(mtod(m, char *) + m->m_len, mtod(n, void *), (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); } } void m_adj(struct mbuf *mp, int req_len) { int len = req_len; struct mbuf *m; int count; if ((m = mp) == NULL) return; if (len >= 0) { /* * Trim from head. */ while (m != NULL && len > 0) { if (m->m_len <= len) { len -= m->m_len; m->m_len = 0; m = m->m_next; } else { m->m_len -= len; m->m_data += len; len = 0; } } if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= (req_len - len); } else { /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ len = -len; count = 0; for (;;) { count += m->m_len; if (m->m_next == NULL) break; m = m->m_next; } if (m->m_len >= len) { m->m_len -= len; if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= len; return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ m = mp; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len = count; for (; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; break; } count -= m->m_len; } if (m) { while (m->m_next) (m = m->m_next)->m_len = 0; } } } /* * m_ensure_contig: rearrange an mbuf chain that given length of bytes * would be contiguous and in the data area of an mbuf (therefore, mtod() * would work for a structure of given length). 
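 * For example, m_ensure_contig(&m, sizeof(struct ip)) arranges that
 * mtod(m, struct ip *) may be dereferenced (struct ip is only an
 * illustration here).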
* * => On success, returns true and the resulting mbuf chain; false otherwise. * => The mbuf chain may change, but is always preserved valid. */ bool m_ensure_contig(struct mbuf **m0, int len) { struct mbuf *n = *m0, *m; size_t count, space; KASSERT(len != M_COPYALL); /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, * otherwise allocate a new mbuf to prepend to the chain. */ if ((n->m_flags & M_EXT) == 0 && n->m_data + len < &n->m_dat[MLEN] && n->m_next) { if (n->m_len >= len) { return true; } m = n; n = n->m_next; len -= m->m_len; } else { if (len > MHLEN) { return false; } m = m_get(M_DONTWAIT, n->m_type); if (m == NULL) { return false; } MCLAIM(m, n->m_owner); if (n->m_flags & M_PKTHDR) { m_move_pkthdr(m, n); } } space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len); memcpy(mtod(m, char *) + m->m_len, mtod(n, void *), (unsigned)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); m->m_next = n; *m0 = m; return len <= 0; } /* * m_pullup: same as m_ensure_contig(), but destroys mbuf chain on error. */ struct mbuf * m_pullup(struct mbuf *n, int len) { struct mbuf *m = n; KASSERT(len != M_COPYALL); if (!m_ensure_contig(&m, len)) { KASSERT(m != NULL); m_freem(m); m = NULL; } return m; } /* * ensure that [off, off + len) is contiguous on the mbuf chain "m". * packet chain before "off" is kept untouched. * if offp == NULL, the target will start at <retval, 0> on resulting chain. * if offp != NULL, the target will start at <retval, *offp> on resulting chain. * * on error return (NULL return value), original "m" will be freed. * * XXX M_TRAILINGSPACE/M_LEADINGSPACE on shared cluster (sharedcluster) */ struct mbuf * m_pulldown(struct mbuf *m, int off, int len, int *offp) { struct mbuf *n, *o; int hlen, tlen, olen; int sharedcluster; /* Check invalid arguments. */ if (m == NULL) panic("%s: m == NULL", __func__); if (len > MCLBYTES) { m_freem(m); return NULL; } n = m; while (n != NULL && off > 0) { if (n->m_len > off) break; off -= n->m_len; n = n->m_next; } /* Be sure to point non-empty mbuf. */ while (n != NULL && n->m_len == 0) n = n->m_next; if (!n) { m_freem(m); return NULL; /* mbuf chain too short */ } sharedcluster = M_READONLY(n); /* * The target data is on <n, off>. If we got enough data on the mbuf * "n", we're done. */ #ifdef __NO_STRICT_ALIGNMENT if ((off == 0 || offp) && len <= n->m_len - off && !sharedcluster) #else if ((off == 0 || offp) && len <= n->m_len - off && !sharedcluster && ALIGNED_POINTER((mtod(n, char *) + off), uint32_t)) #endif goto ok; /* * When (len <= n->m_len - off) and (off != 0), it is a special case. * Len bytes from <n, off> sit in single mbuf, but the caller does * not like the starting position (off). * * Chop the current mbuf into two pieces, set off to 0. */ if (len <= n->m_len - off) { struct mbuf *mlast; o = m_dup(n, off, n->m_len - off, M_DONTWAIT); if (o == NULL) { m_freem(m); return NULL; /* ENOBUFS */ } KASSERTMSG(o->m_len >= len, "o=%p o->m_len=%d len=%d", o, o->m_len, len); for (mlast = o; mlast->m_next != NULL; mlast = mlast->m_next) ; n->m_len = off; mlast->m_next = n->m_next; n->m_next = o; n = o; off = 0; goto ok; } /* * We need to take hlen from <n, off> and tlen from <n->m_next, 0>, * and construct contiguous mbuf with m_len == len. * * Note that hlen + tlen == len, and tlen > 0. 
*/ hlen = n->m_len - off; tlen = len - hlen; /* * Ensure that we have enough trailing data on mbuf chain. If not, * we can do nothing about the chain. */ olen = 0; for (o = n->m_next; o != NULL; o = o->m_next) olen += o->m_len; if (hlen + olen < len) { m_freem(m); return NULL; /* mbuf chain too short */ } /* * Easy cases first. We need to use m_copydata() to get data from * <n->m_next, 0>. */ if ((off == 0 || offp) && M_TRAILINGSPACE(n) >= tlen && !sharedcluster) { m_copydata(n->m_next, 0, tlen, mtod(n, char *) + n->m_len); n->m_len += tlen; m_adj(n->m_next, tlen); goto ok; } if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen && #ifndef __NO_STRICT_ALIGNMENT ALIGNED_POINTER((n->m_next->m_data - hlen), uint32_t) && #endif !sharedcluster && n->m_next->m_len >= tlen) { n->m_next->m_data -= hlen; n->m_next->m_len += hlen; memcpy(mtod(n->m_next, void *), mtod(n, char *) + off, hlen); n->m_len -= hlen; n = n->m_next; off = 0; goto ok; } /* * Now, we need to do the hard way. Don't copy as there's no room * on both ends. */ o = m_get(M_DONTWAIT, m->m_type); if (o && len > MLEN) { MCLGET(o, M_DONTWAIT); if ((o->m_flags & M_EXT) == 0) { m_free(o); o = NULL; } } if (!o) { m_freem(m); return NULL; /* ENOBUFS */ } /* get hlen from <n, off> into <o, 0> */ o->m_len = hlen; memcpy(mtod(o, void *), mtod(n, char *) + off, hlen); n->m_len -= hlen; /* get tlen from <n->m_next, 0> into <o, hlen> */ m_copydata(n->m_next, 0, tlen, mtod(o, char *) + o->m_len); o->m_len += tlen; m_adj(n->m_next, tlen); o->m_next = n->m_next; n->m_next = o; n = o; off = 0; ok: if (offp) *offp = off; return n; } /* * Like m_pullup(), except a new mbuf is always allocated, and we allow * the amount of empty space before the data in the new mbuf to be specified * (in the event that the caller expects to prepend later). 
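 * The request must fit in an ordinary header mbuf: if len + dstoff exceeds
 * MHLEN, the chain is freed and NULL is returned.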
*/ struct mbuf * m_copyup(struct mbuf *n, int len, int dstoff) { struct mbuf *m; int count, space; KASSERT(len != M_COPYALL); if (len > ((int)MHLEN - dstoff)) goto bad; m = m_get(M_DONTWAIT, n->m_type); if (m == NULL) goto bad; MCLAIM(m, n->m_owner); if (n->m_flags & M_PKTHDR) { m_move_pkthdr(m, n); } m->m_data += dstoff; space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = uimin(uimin(uimax(len, max_protohdr), space), n->m_len); memcpy(mtod(m, char *) + m->m_len, mtod(n, void *), (unsigned)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return m; bad: m_freem(n); return NULL; } struct mbuf * m_split(struct mbuf *m0, int len, int wait) { return m_split_internal(m0, len, wait, true); } static struct mbuf * m_split_internal(struct mbuf *m0, int len0, int wait, bool copyhdr) { struct mbuf *m, *n; unsigned len = len0, remain, len_save; KASSERT(len0 != M_COPYALL); for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; if (m == NULL) return NULL; remain = m->m_len - len; if (copyhdr && (m0->m_flags & M_PKTHDR)) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return NULL; MCLAIM(n, m0->m_owner); m_copy_rcvif(n, m0); n->m_pkthdr.len = m0->m_pkthdr.len - len0; len_save = m0->m_pkthdr.len; m0->m_pkthdr.len = len0; if ((m->m_flags & M_EXT) == 0 && remain > MHLEN) { /* m can't be the lead packet */ m_align(n, 0); n->m_len = 0; n->m_next = m_split(m, len, wait); if (n->m_next == NULL) { (void)m_free(n); m0->m_pkthdr.len = len_save; return NULL; } return n; } } else if (remain == 0) { n = m->m_next; m->m_next = NULL; return n; } else { n = m_get(wait, m->m_type); if (n == NULL) return NULL; MCLAIM(n, m->m_owner); } if (m->m_flags & M_EXT) { n->m_data = m->m_data + len; MCLADDREFERENCE(m, n); } else { m_align(n, remain); memcpy(mtod(n, void *), mtod(m, char *) + len, remain); } n->m_len = remain; m->m_len = len; n->m_next = m->m_next; m->m_next = NULL; return n; } /* * Routine to copy from device local memory into mbufs. */ struct mbuf * m_devget(char *buf, int totlen, int off, struct ifnet *ifp) { struct mbuf *m; struct mbuf *top = NULL, **mp = &top; char *cp, *epkt; int len; cp = buf; epkt = cp + totlen; if (off) { /* * If 'off' is non-zero, packet is trailer-encapsulated, * so we have to skip the type and length fields. */ cp += off + 2 * sizeof(uint16_t); totlen -= 2 * sizeof(uint16_t); } m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return NULL; m_set_rcvif(m, ifp); m->m_pkthdr.len = totlen; m->m_len = MHLEN; while (totlen > 0) { if (top) { m = m_get(M_DONTWAIT, MT_DATA); if (m == NULL) { m_freem(top); return NULL; } m->m_len = MLEN; } len = uimin(totlen, epkt - cp); if (len >= MINCLSIZE) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m_freem(top); return NULL; } m->m_len = len = uimin(len, MCLBYTES); } else { /* * Place initial small packet/header at end of mbuf. */ if (len < m->m_len) { if (top == 0 && len + max_linkhdr <= m->m_len) m->m_data += max_linkhdr; m->m_len = len; } else len = m->m_len; } memcpy(mtod(m, void *), cp, (size_t)len); cp += len; *mp = m; mp = &m->m_next; totlen -= len; if (cp == epkt) cp = buf; } return top; } /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf * chain if necessary. 
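 * When the chain is extended, any gap between the old end of data and "off"
 * is zero-filled.  Allocation uses M_DONTWAIT, so on failure the write may
 * be silently incomplete.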
*/ void m_copyback(struct mbuf *m0, int off, int len, const void *cp) { #if defined(DEBUG) struct mbuf *origm = m0; int error; #endif if (m0 == NULL) return; #if defined(DEBUG) error = #endif m_copyback_internal(&m0, off, len, cp, CB_COPYBACK|CB_EXTEND, M_DONTWAIT); #if defined(DEBUG) if (error != 0 || (m0 != NULL && origm != m0)) panic("m_copyback"); #endif } struct mbuf * m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how) { int error; /* don't support chain expansion */ KASSERT(len != M_COPYALL); KDASSERT(off + len <= m_length(m0)); error = m_copyback_internal(&m0, off, len, cp, CB_COPYBACK|CB_COW, how); if (error) { /* * no way to recover from partial success. * just free the chain. */ m_freem(m0); return NULL; } return m0; } int m_makewritable(struct mbuf **mp, int off, int len, int how) { int error; #if defined(DEBUG) int origlen = m_length(*mp); #endif error = m_copyback_internal(mp, off, len, NULL, CB_PRESERVE|CB_COW, how); if (error) return error; #if defined(DEBUG) int reslen = 0; for (struct mbuf *n = *mp; n; n = n->m_next) reslen += n->m_len; if (origlen != reslen) panic("m_makewritable: length changed"); if (((*mp)->m_flags & M_PKTHDR) != 0 && reslen != (*mp)->m_pkthdr.len) panic("m_makewritable: inconsist"); #endif return 0; } static int m_copyback_internal(struct mbuf **mp0, int off, int len, const void *vp, int flags, int how) { int mlen; struct mbuf *m, *n; struct mbuf **mp; int totlen = 0; const char *cp = vp; KASSERT(mp0 != NULL); KASSERT(*mp0 != NULL); KASSERT((flags & CB_PRESERVE) == 0 || cp == NULL); KASSERT((flags & CB_COPYBACK) == 0 || cp != NULL); if (len == M_COPYALL) len = m_length(*mp0) - off; /* * we don't bother to update "totlen" in the case of CB_COW, * assuming that CB_EXTEND and CB_COW are exclusive. */ KASSERT((~flags & (CB_EXTEND|CB_COW)) != 0); mp = mp0; m = *mp; while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; if (m->m_next == NULL) { int tspace; extend: if ((flags & CB_EXTEND) == 0) goto out; /* * try to make some space at the end of "m". */ mlen = m->m_len; if (off + len >= MINCLSIZE && (m->m_flags & M_EXT) == 0 && m->m_len == 0) { MCLGET(m, how); } tspace = M_TRAILINGSPACE(m); if (tspace > 0) { tspace = uimin(tspace, off + len); KASSERT(tspace > 0); memset(mtod(m, char *) + m->m_len, 0, uimin(off, tspace)); m->m_len += tspace; off += mlen; totlen -= mlen; continue; } /* * need to allocate an mbuf. */ if (off + len >= MINCLSIZE) { n = m_getcl(how, m->m_type, 0); } else { n = m_get(how, m->m_type); } if (n == NULL) { goto out; } n->m_len = uimin(M_TRAILINGSPACE(n), off + len); memset(mtod(n, char *), 0, uimin(n->m_len, off)); m->m_next = n; } mp = &m->m_next; m = m->m_next; } while (len > 0) { mlen = m->m_len - off; if (mlen != 0 && M_READONLY(m)) { /* * This mbuf is read-only. Allocate a new writable * mbuf and try again. */ char *datap; int eatlen; KASSERT((flags & CB_COW) != 0); /* * if we're going to write into the middle of * a mbuf, split it first. */ if (off > 0) { n = m_split_internal(m, off, how, false); if (n == NULL) goto enobufs; m->m_next = n; mp = &m->m_next; m = n; off = 0; continue; } /* * XXX TODO coalesce into the trailingspace of * the previous mbuf when possible. */ /* * allocate a new mbuf. copy packet header if needed. */ n = m_get(how, m->m_type); if (n == NULL) goto enobufs; MCLAIM(n, m->m_owner); if (off == 0 && (m->m_flags & M_PKTHDR) != 0) { m_move_pkthdr(n, m); n->m_len = MHLEN; } else { if (len >= MINCLSIZE) MCLGET(n, M_DONTWAIT); n->m_len = (n->m_flags & M_EXT) ? 
MCLBYTES : MLEN; } if (n->m_len > len) n->m_len = len; /* * free the region which has been overwritten. * copying data from old mbufs if requested. */ if (flags & CB_PRESERVE) datap = mtod(n, char *); else datap = NULL; eatlen = n->m_len; while (m != NULL && M_READONLY(m) && n->m_type == m->m_type && eatlen > 0) { mlen = uimin(eatlen, m->m_len); if (datap) { m_copydata(m, 0, mlen, datap); datap += mlen; } m->m_data += mlen; m->m_len -= mlen; eatlen -= mlen; if (m->m_len == 0) *mp = m = m_free(m); } if (eatlen > 0) n->m_len -= eatlen; n->m_next = m; *mp = m = n; continue; } mlen = uimin(mlen, len); if (flags & CB_COPYBACK) { memcpy(mtod(m, char *) + off, cp, (unsigned)mlen); cp += mlen; } len -= mlen; mlen += off; off = 0; totlen += mlen; if (len == 0) break; if (m->m_next == NULL) { goto extend; } mp = &m->m_next; m = m->m_next; } out: if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) { KASSERT((flags & CB_EXTEND) != 0); m->m_pkthdr.len = totlen; } return 0; enobufs: return ENOBUFS; } /* * Compress the mbuf chain. Return the new mbuf chain on success, NULL on * failure. The first mbuf is preserved, and on success the pointer returned * is the same as the one passed. */ struct mbuf * m_defrag(struct mbuf *m, int how) { struct mbuf *m0, *mn, *n; int sz; KASSERT((m->m_flags & M_PKTHDR) != 0); if (m->m_next == NULL) return m; /* Defrag to single mbuf if at all possible */ if ((m->m_flags & M_EXT) == 0 && m->m_pkthdr.len <= MCLBYTES) { if (m->m_pkthdr.len <= MHLEN) { if (M_TRAILINGSPACE(m) < (m->m_pkthdr.len - m->m_len)) { KASSERTMSG(M_LEADINGSPACE(m) + M_TRAILINGSPACE(m) >= (m->m_pkthdr.len - m->m_len), "too small leading %d trailing %d ro? %d" " pkthdr.len %d mlen %d", (int)M_LEADINGSPACE(m), (int)M_TRAILINGSPACE(m), M_READONLY(m), m->m_pkthdr.len, m->m_len); memmove(m->m_pktdat, m->m_data, m->m_len); m->m_data = m->m_pktdat; KASSERT(M_TRAILINGSPACE(m) >= (m->m_pkthdr.len - m->m_len)); } } else { /* Must copy data before adding cluster */ m0 = m_get(how, MT_DATA); if (m0 == NULL) return NULL; KASSERTMSG(m->m_len <= MHLEN, "m=%p m->m_len=%d MHLEN=%u", m, m->m_len, (unsigned)MHLEN); m_copydata(m, 0, m->m_len, mtod(m0, void *)); MCLGET(m, how); if ((m->m_flags & M_EXT) == 0) { m_free(m0); return NULL; } memcpy(m->m_data, mtod(m0, void *), m->m_len); m_free(m0); } KASSERTMSG(M_TRAILINGSPACE(m) >= (m->m_pkthdr.len - m->m_len), "m=%p M_TRAILINGSPACE(m)=%zd m->m_pkthdr.len=%d" " m->m_len=%d", m, M_TRAILINGSPACE(m), m->m_pkthdr.len, m->m_len); m_copydata(m->m_next, 0, m->m_pkthdr.len - m->m_len, mtod(m, char *) + m->m_len); m->m_len = m->m_pkthdr.len; m_freem(m->m_next); m->m_next = NULL; return m; } m0 = m_get(how, MT_DATA); if (m0 == NULL) return NULL; mn = m0; sz = m->m_pkthdr.len - m->m_len; KASSERT(sz >= 0); do { if (sz > MLEN) { MCLGET(mn, how); if ((mn->m_flags & M_EXT) == 0) { m_freem(m0); return NULL; } } mn->m_len = MIN(sz, MCLBYTES); m_copydata(m, m->m_pkthdr.len - sz, mn->m_len, mtod(mn, void *)); sz -= mn->m_len; if (sz > 0) { /* need more mbufs */ n = m_get(how, MT_DATA); if (n == NULL) { m_freem(m0); return NULL; } mn->m_next = n; mn = n; } } while (sz > 0); m_freem(m->m_next); m->m_next = m0; return m; } void m_remove_pkthdr(struct mbuf *m) { KASSERT(m->m_flags & M_PKTHDR); m_tag_delete_chain(m); m->m_flags &= ~M_PKTHDR; memset(&m->m_pkthdr, 0, sizeof(m->m_pkthdr)); } void m_copy_pkthdr(struct mbuf *to, struct mbuf *from) { KASSERT((to->m_flags & M_EXT) == 0); KASSERT((to->m_flags & M_PKTHDR) == 0 || SLIST_FIRST(&to->m_pkthdr.tags) == NULL); KASSERT((from->m_flags & 
M_PKTHDR) != 0); to->m_pkthdr = from->m_pkthdr; to->m_flags = from->m_flags & M_COPYFLAGS; to->m_data = to->m_pktdat; SLIST_INIT(&to->m_pkthdr.tags); m_tag_copy_chain(to, from); } void m_move_pkthdr(struct mbuf *to, struct mbuf *from) { KASSERT((to->m_flags & M_EXT) == 0); KASSERT((to->m_flags & M_PKTHDR) == 0 || SLIST_FIRST(&to->m_pkthdr.tags) == NULL); KASSERT((from->m_flags & M_PKTHDR) != 0); to->m_pkthdr = from->m_pkthdr; to->m_flags = from->m_flags & M_COPYFLAGS; to->m_data = to->m_pktdat; from->m_flags &= ~M_PKTHDR; } /* * Set the m_data pointer of a newly-allocated mbuf to place an object of the * specified size at the end of the mbuf, longword aligned. */ void m_align(struct mbuf *m, int len) { int buflen, adjust; KASSERT(len != M_COPYALL); KASSERTMSG(M_LEADINGSPACE(m) == 0, "m=%p M_LEADINGSPACE(m)=%zd", m, M_LEADINGSPACE(m)); buflen = M_BUFSIZE(m); KASSERTMSG(len <= buflen, "m=%p len=%d buflen=%d", m, len, buflen); adjust = buflen - len; m->m_data += adjust &~ (sizeof(long)-1); } /* * Apply function f to the data in an mbuf chain starting "off" bytes from the * beginning, continuing for "len" bytes. */ int m_apply(struct mbuf *m, int off, int len, int (*f)(void *, void *, unsigned int), void *arg) { unsigned int count; int rval; KASSERT(len != M_COPYALL); KASSERT(len >= 0); KASSERT(off >= 0); while (off > 0) { KASSERT(m != NULL); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL); count = uimin(m->m_len - off, len); rval = (*f)(arg, mtod(m, char *) + off, count); if (rval) return rval; len -= count; off = 0; m = m->m_next; } return 0; } /* * Return a pointer to mbuf/offset of location in mbuf chain. */ struct mbuf * m_getptr(struct mbuf *m, int loc, int *off) { while (loc >= 0) { /* Normal end of search */ if (m->m_len > loc) { *off = loc; return m; } loc -= m->m_len; if (m->m_next == NULL) { if (loc == 0) { /* Point at the end of valid data */ *off = m->m_len; return m; } return NULL; } else { m = m->m_next; } } return NULL; } /* * Release a reference to the mbuf external storage. * * => free the mbuf m itself as well. */ static void m_ext_free(struct mbuf *m) { const bool embedded = MEXT_ISEMBEDDED(m); bool dofree = true; u_int refcnt; KASSERT((m->m_flags & M_EXT) != 0); KASSERT(MEXT_ISEMBEDDED(m->m_ext_ref)); KASSERT((m->m_ext_ref->m_flags & M_EXT) != 0); KASSERT((m->m_flags & M_EXT_CLUSTER) == (m->m_ext_ref->m_flags & M_EXT_CLUSTER)); if (__predict_false(m->m_type == MT_FREE)) { panic("mbuf %p already freed", m); } if (__predict_true(m->m_ext.ext_refcnt == 1)) { refcnt = m->m_ext.ext_refcnt = 0; } else { membar_release(); refcnt = atomic_dec_uint_nv(&m->m_ext.ext_refcnt); } if (refcnt > 0) { if (embedded) { /* * other mbuf's m_ext_ref still points to us. */ dofree = false; } else { m->m_ext_ref = m; } } else { /* * dropping the last reference */ membar_acquire(); if (!embedded) { m->m_ext.ext_refcnt++; /* XXX */ m_ext_free(m->m_ext_ref); m->m_ext_ref = m; } else if ((m->m_flags & M_EXT_CLUSTER) != 0) { pool_cache_put_paddr(mcl_cache, m->m_ext.ext_buf, m->m_ext.ext_paddr); } else if (m->m_ext.ext_free) { (*m->m_ext.ext_free)(m, m->m_ext.ext_buf, m->m_ext.ext_size, m->m_ext.ext_arg); /* * 'm' is already freed by the ext_free callback. */ dofree = false; } else { free(m->m_ext.ext_buf, 0); } } if (dofree) { m->m_type = MT_FREE; m->m_data = NULL; pool_cache_put(mb_cache, m); } } /* * Free a single mbuf and associated external storage. Return the * successor, if any. 
*/ struct mbuf * m_free(struct mbuf *m) { struct mbuf *n; mowner_revoke(m, 1, m->m_flags); mbstat_type_add(m->m_type, -1); if (m->m_flags & M_PKTHDR) m_tag_delete_chain(m); n = m->m_next; if (m->m_flags & M_EXT) { m_ext_free(m); } else { if (__predict_false(m->m_type == MT_FREE)) { panic("mbuf %p already freed", m); } m->m_type = MT_FREE; m->m_data = NULL; pool_cache_put(mb_cache, m); } return n; } void m_freem(struct mbuf *m) { if (m == NULL) return; do { m = m_free(m); } while (m); } #if defined(DDB) void m_print(const struct mbuf *m, const char *modif, void (*pr)(const char *, ...)) { char ch; bool opt_c = false; bool opt_d = false; #if NETHER > 0 bool opt_v = false; const struct mbuf *m0 = NULL; #endif int no = 0; char buf[512]; while ((ch = *(modif++)) != '\0') { switch (ch) { case 'c': opt_c = true; break; case 'd': opt_d = true; break; #if NETHER > 0 case 'v': opt_v = true; m0 = m; break; #endif default: break; } } nextchain: (*pr)("MBUF(%d) %p\n", no, m); snprintb(buf, sizeof(buf), M_FLAGS_BITS, (u_int)m->m_flags); (*pr)(" data=%p, len=%d, type=%d, flags=%s\n", m->m_data, m->m_len, m->m_type, buf); if (opt_d) { int i; unsigned char *p = m->m_data; (*pr)(" data:"); for (i = 0; i < m->m_len; i++) { if (i % 16 == 0) (*pr)("\n"); (*pr)(" %02x", p[i]); } (*pr)("\n"); } (*pr)(" owner=%p, next=%p, nextpkt=%p\n", m->m_owner, m->m_next, m->m_nextpkt); (*pr)(" leadingspace=%u, trailingspace=%u, readonly=%u\n", (int)M_LEADINGSPACE(m), (int)M_TRAILINGSPACE(m), (int)M_READONLY(m)); if ((m->m_flags & M_PKTHDR) != 0) { snprintb(buf, sizeof(buf), M_CSUM_BITS, m->m_pkthdr.csum_flags); (*pr)(" pktlen=%d, rcvif=%p, csum_flags=%s, csum_data=0x%" PRIx32 ", segsz=%u\n", m->m_pkthdr.len, m_get_rcvif_NOMPSAFE(m), buf, m->m_pkthdr.csum_data, m->m_pkthdr.segsz); } if ((m->m_flags & M_EXT)) { (*pr)(" ext_refcnt=%u, ext_buf=%p, ext_size=%zd, " "ext_free=%p, ext_arg=%p\n", m->m_ext.ext_refcnt, m->m_ext.ext_buf, m->m_ext.ext_size, m->m_ext.ext_free, m->m_ext.ext_arg); } if ((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0) { vaddr_t sva = (vaddr_t)m->m_ext.ext_buf; vaddr_t eva = sva + m->m_ext.ext_size; int n = (round_page(eva) - trunc_page(sva)) >> PAGE_SHIFT; int i; (*pr)(" pages:"); for (i = 0; i < n; i ++) { (*pr)(" %p", m->m_ext.ext_pgs[i]); } (*pr)("\n"); } if (opt_c) { m = m->m_next; if (m != NULL) { no++; goto nextchain; } } #if NETHER > 0 if (opt_v && m0) m_examine(m0, AF_ETHER, modif, pr); #endif } #endif /* defined(DDB) */ #if defined(MBUFTRACE) void mowner_init_owner(struct mowner *mo, const char *name, const char *descr) { memset(mo, 0, sizeof(*mo)); strlcpy(mo->mo_name, name, sizeof(mo->mo_name)); strlcpy(mo->mo_descr, descr, sizeof(mo->mo_descr)); } void mowner_attach(struct mowner *mo) { KASSERT(mo->mo_counters == NULL); mo->mo_counters = percpu_alloc(sizeof(struct mowner_counter)); /* XXX lock */ LIST_INSERT_HEAD(&mowners, mo, mo_link); } void mowner_detach(struct mowner *mo) { KASSERT(mo->mo_counters != NULL); /* XXX lock */ LIST_REMOVE(mo, mo_link); percpu_free(mo->mo_counters, sizeof(struct mowner_counter)); mo->mo_counters = NULL; } void mowner_init(struct mbuf *m, int type) { struct mowner_counter *mc; struct mowner *mo; int s; m->m_owner = mo = &unknown_mowners[type]; s = splvm(); mc = percpu_getref(mo->mo_counters); mc->mc_counter[MOWNER_COUNTER_CLAIMS]++; percpu_putref(mo->mo_counters); splx(s); } void mowner_ref(struct mbuf *m, int flags) { struct mowner *mo = m->m_owner; struct mowner_counter *mc; int s; s = splvm(); mc = percpu_getref(mo->mo_counters); if ((flags & M_EXT) != 0) 
mc->mc_counter[MOWNER_COUNTER_EXT_CLAIMS]++; if ((flags & M_EXT_CLUSTER) != 0) mc->mc_counter[MOWNER_COUNTER_CLUSTER_CLAIMS]++; percpu_putref(mo->mo_counters); splx(s); } void mowner_revoke(struct mbuf *m, bool all, int flags) { struct mowner *mo = m->m_owner; struct mowner_counter *mc; int s; s = splvm(); mc = percpu_getref(mo->mo_counters); if ((flags & M_EXT) != 0) mc->mc_counter[MOWNER_COUNTER_EXT_RELEASES]++; if ((flags & M_EXT_CLUSTER) != 0) mc->mc_counter[MOWNER_COUNTER_CLUSTER_RELEASES]++; if (all) mc->mc_counter[MOWNER_COUNTER_RELEASES]++; percpu_putref(mo->mo_counters); splx(s); if (all) m->m_owner = &revoked_mowner; } static void mowner_claim(struct mbuf *m, struct mowner *mo) { struct mowner_counter *mc; int flags = m->m_flags; int s; s = splvm(); mc = percpu_getref(mo->mo_counters); mc->mc_counter[MOWNER_COUNTER_CLAIMS]++; if ((flags & M_EXT) != 0) mc->mc_counter[MOWNER_COUNTER_EXT_CLAIMS]++; if ((flags & M_EXT_CLUSTER) != 0) mc->mc_counter[MOWNER_COUNTER_CLUSTER_CLAIMS]++; percpu_putref(mo->mo_counters); splx(s); m->m_owner = mo; } void m_claim(struct mbuf *m, struct mowner *mo) { if (m->m_owner == mo || mo == NULL) return; mowner_revoke(m, true, m->m_flags); mowner_claim(m, mo); } void m_claimm(struct mbuf *m, struct mowner *mo) { for (; m != NULL; m = m->m_next) m_claim(m, mo); } #endif /* defined(MBUFTRACE) */ #ifdef DIAGNOSTIC /* * Verify that the mbuf chain is not malformed. Used only for diagnostic. * Panics on error. */ void m_verify_packet(struct mbuf *m) { struct mbuf *n = m; char *low, *high, *dat; int totlen = 0, len; if (__predict_false((m->m_flags & M_PKTHDR) == 0)) { panic("%s: mbuf doesn't have M_PKTHDR", __func__); } while (n != NULL) { if (__predict_false(n->m_type == MT_FREE)) { panic("%s: mbuf already freed (n = %p)", __func__, n); } #if 0 /* * This ought to be a rule of the mbuf API. Unfortunately, * many places don't respect that rule. 
*/ if (__predict_false((n != m) && (n->m_flags & M_PKTHDR) != 0)) { panic("%s: M_PKTHDR set on secondary mbuf", __func__); } #endif if (__predict_false(n->m_nextpkt != NULL)) { panic("%s: m_nextpkt not null (m_nextpkt = %p)", __func__, n->m_nextpkt); } dat = n->m_data; len = n->m_len; if (__predict_false(len < 0)) { panic("%s: incorrect length (len = %d)", __func__, len); } low = M_BUFADDR(n); high = low + M_BUFSIZE(n); if (__predict_false((dat < low) || (dat + len > high))) { panic("%s: m_data not in packet" "(dat = %p, len = %d, low = %p, high = %p)", __func__, dat, len, low, high); } totlen += len; n = n->m_next; } if (__predict_false(totlen != m->m_pkthdr.len)) { panic("%s: inconsistent mbuf length (%d != %d)", __func__, totlen, m->m_pkthdr.len); } } #endif struct m_tag * m_tag_get(int type, int len, int wait) { struct m_tag *t; if (len < 0) return NULL; t = malloc(len + sizeof(struct m_tag), M_PACKET_TAGS, wait); if (t == NULL) return NULL; t->m_tag_id = type; t->m_tag_len = len; return t; } void m_tag_free(struct m_tag *t) { free(t, M_PACKET_TAGS); } void m_tag_prepend(struct mbuf *m, struct m_tag *t) { KASSERT((m->m_flags & M_PKTHDR) != 0); SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } void m_tag_unlink(struct mbuf *m, struct m_tag *t) { KASSERT((m->m_flags & M_PKTHDR) != 0); SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } void m_tag_delete(struct mbuf *m, struct m_tag *t) { m_tag_unlink(m, t); m_tag_free(t); } void m_tag_delete_chain(struct mbuf *m) { struct m_tag *p, *q; KASSERT((m->m_flags & M_PKTHDR) != 0); p = SLIST_FIRST(&m->m_pkthdr.tags); if (p == NULL) return; while ((q = SLIST_NEXT(p, m_tag_link)) != NULL) m_tag_delete(m, q); m_tag_delete(m, p); } struct m_tag * m_tag_find(const struct mbuf *m, int type) { struct m_tag *p; KASSERT((m->m_flags & M_PKTHDR) != 0); p = SLIST_FIRST(&m->m_pkthdr.tags); while (p != NULL) { if (p->m_tag_id == type) return p; p = SLIST_NEXT(p, m_tag_link); } return NULL; } struct m_tag * m_tag_copy(struct m_tag *t) { struct m_tag *p; p = m_tag_get(t->m_tag_id, t->m_tag_len, M_NOWAIT); if (p == NULL) return NULL; memcpy(p + 1, t + 1, t->m_tag_len); return p; } /* * Copy two tag chains. The destination mbuf (to) loses any attached * tags even if the operation fails. This should not be a problem, as * m_tag_copy_chain() is typically called with a newly-allocated * destination mbuf. */ int m_tag_copy_chain(struct mbuf *to, struct mbuf *from) { struct m_tag *p, *t, *tprev = NULL; KASSERT((from->m_flags & M_PKTHDR) != 0); m_tag_delete_chain(to); SLIST_FOREACH(p, &from->m_pkthdr.tags, m_tag_link) { t = m_tag_copy(p); if (t == NULL) { m_tag_delete_chain(to); return 0; } if (tprev == NULL) SLIST_INSERT_HEAD(&to->m_pkthdr.tags, t, m_tag_link); else SLIST_INSERT_AFTER(tprev, t, m_tag_link); tprev = t; } return 1; }
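/*
 * Minimal sketch of how a caller might use the packet tag interface
 * above: a producer allocates a tag with m_tag_get(), stores its payload
 * directly after the struct m_tag header, and prepends it to a packet
 * header mbuf; a consumer later retrieves it with m_tag_find().
 * EXAMPLE_TAG_COOKIE and the helper names are hypothetical, chosen only
 * for this sketch.
 */
#if 0
#define EXAMPLE_TAG_COOKIE	0x7001		/* hypothetical tag type */

static int
example_tag_attach(struct mbuf *m, uint32_t cookie)
{
	struct m_tag *t;

	KASSERT(m->m_flags & M_PKTHDR);

	t = m_tag_get(EXAMPLE_TAG_COOKIE, sizeof(cookie), M_NOWAIT);
	if (t == NULL)
		return ENOMEM;
	/* The payload lives immediately after the m_tag header. */
	memcpy(t + 1, &cookie, sizeof(cookie));
	m_tag_prepend(m, t);
	return 0;
}

static bool
example_tag_consume(struct mbuf *m, uint32_t *cookiep)
{
	struct m_tag *t;

	t = m_tag_find(m, EXAMPLE_TAG_COOKIE);
	if (t == NULL)
		return false;
	memcpy(cookiep, t + 1, sizeof(*cookiep));
	m_tag_delete(m, t);		/* unlink and free once consumed */
	return true;
}
#endif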
/* $NetBSD: agp.c,v 1.88 2022/05/22 11:27:35 andvar Exp $ */ /*- * Copyright (c) 2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: src/sys/pci/agp.c,v 1.12 2001/05/19 01:28:07 alfred Exp $ */ /* * Copyright (c) 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Frank van der Linden for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: agp.c,v 1.88 2022/05/22 11:27:35 andvar Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/kernel.h> #include <sys/device.h> #include <sys/conf.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/agpio.h> #include <sys/proc.h> #include <sys/mutex.h> #include <dev/pci/pcireg.h> #include <dev/pci/pcivar.h> #include <dev/pci/agpvar.h> #include <dev/pci/agpreg.h> #include <dev/pci/pcidevs.h> #include <sys/bus.h> MALLOC_DEFINE(M_AGP, "AGP", "AGP memory"); /* Helper functions for implementing chipset mini drivers. */ /* XXXfvdl get rid of this one. 
*/ extern struct cfdriver agp_cd; static int agp_info_user(struct agp_softc *, agp_info *); static int agp_setup_user(struct agp_softc *, agp_setup *); static int agp_allocate_user(struct agp_softc *, agp_allocate *); static int agp_deallocate_user(struct agp_softc *, int); static int agp_bind_user(struct agp_softc *, agp_bind *); static int agp_unbind_user(struct agp_softc *, agp_unbind *); static int agp_generic_enable_v2(struct agp_softc *, const struct pci_attach_args *, int, u_int32_t); static int agp_generic_enable_v3(struct agp_softc *, const struct pci_attach_args *, int, u_int32_t); static int agpdev_match(const struct pci_attach_args *); static bool agp_resume(device_t, const pmf_qual_t *); #include "agp_ali.h" #include "agp_amd.h" #include "agp_i810.h" #include "agp_intel.h" #include "agp_sis.h" #include "agp_via.h" #include "agp_amd64.h" const struct agp_product { uint32_t ap_vendor; uint32_t ap_product; int (*ap_match)(const struct pci_attach_args *); int (*ap_attach)(device_t, device_t, void *); } agp_products[] = { #if NAGP_AMD64 > 0 { PCI_VENDOR_ALI, PCI_PRODUCT_ALI_M1689, agp_amd64_match, agp_amd64_attach }, #endif #if NAGP_ALI > 0 { PCI_VENDOR_ALI, -1, NULL, agp_ali_attach }, #endif #if NAGP_AMD64 > 0 { PCI_VENDOR_AMD, PCI_PRODUCT_AMD_AGP8151_DEV, agp_amd64_match, agp_amd64_attach }, #endif #if NAGP_AMD > 0 { PCI_VENDOR_AMD, -1, agp_amd_match, agp_amd_attach }, #endif #if NAGP_I810 > 0 { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82810_MCH, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82810_DC100_MCH, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82810E_MCH, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82815_FULL_HUB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82840_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82830MP_IO_1, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82845G_DRAM, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82855GM_MCH, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82865_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82915G_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82915GM_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82945P_MCH, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82945GM_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82945GME_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82965Q_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82965PM_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82965G_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82Q35_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82G33_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82Q33_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82G35_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82946GZ_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82GM45_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82IGD_E_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82Q45_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82G45_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82G41_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_E7221_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, 
PCI_PRODUCT_INTEL_82965GME_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_82B43_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_IRONLAKE_D_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_IRONLAKE_M_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_IRONLAKE_MA_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_IRONLAKE_MC2_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_PINEVIEW_HB, NULL, agp_i810_attach }, { PCI_VENDOR_INTEL, PCI_PRODUCT_INTEL_PINEVIEW_M_HB, NULL, agp_i810_attach }, #endif #if NAGP_INTEL > 0 { PCI_VENDOR_INTEL, -1, NULL, agp_intel_attach }, #endif #if NAGP_AMD64 > 0 { PCI_VENDOR_NVIDIA, PCI_PRODUCT_NVIDIA_NFORCE3_PCHB, agp_amd64_match, agp_amd64_attach }, { PCI_VENDOR_NVIDIA, PCI_PRODUCT_NVIDIA_NFORCE3_250_PCHB, agp_amd64_match, agp_amd64_attach }, #endif #if NAGP_AMD64 > 0 { PCI_VENDOR_SIS, PCI_PRODUCT_SIS_755, agp_amd64_match, agp_amd64_attach }, { PCI_VENDOR_SIS, PCI_PRODUCT_SIS_760, agp_amd64_match, agp_amd64_attach }, #endif #if NAGP_SIS > 0 { PCI_VENDOR_SIS, -1, NULL, agp_sis_attach }, #endif #if NAGP_AMD64 > 0 { PCI_VENDOR_VIATECH, PCI_PRODUCT_VIATECH_K8M800_0, agp_amd64_match, agp_amd64_attach }, { PCI_VENDOR_VIATECH, PCI_PRODUCT_VIATECH_K8T890_0, agp_amd64_match, agp_amd64_attach }, { PCI_VENDOR_VIATECH, PCI_PRODUCT_VIATECH_K8HTB_0, agp_amd64_match, agp_amd64_attach }, { PCI_VENDOR_VIATECH, PCI_PRODUCT_VIATECH_K8HTB, agp_amd64_match, agp_amd64_attach }, #endif #if NAGP_VIA > 0 { PCI_VENDOR_VIATECH, -1, NULL, agp_via_attach }, #endif { 0, 0, NULL, NULL }, }; static const struct agp_product * agp_lookup(const struct pci_attach_args *pa) { const struct agp_product *ap; /* First find the vendor. */ for (ap = agp_products; ap->ap_attach != NULL; ap++) { if (PCI_VENDOR(pa->pa_id) == ap->ap_vendor) break; } if (ap->ap_attach == NULL) return (NULL); /* Now find the product within the vendor's domain. */ for (; ap->ap_attach != NULL; ap++) { if (PCI_VENDOR(pa->pa_id) != ap->ap_vendor) { /* Ran out of this vendor's section of the table. */ return (NULL); } if (ap->ap_product == PCI_PRODUCT(pa->pa_id)) { /* Exact match. */ break; } if (ap->ap_product == (uint32_t) -1) { /* Wildcard match. */ break; } } if (ap->ap_attach == NULL) return (NULL); /* Now let the product-specific driver filter the match. */ if (ap->ap_match != NULL && (*ap->ap_match)(pa) == 0) return (NULL); return (ap); } static int agpmatch(device_t parent, cfdata_t match, void *aux) { struct agpbus_attach_args *apa = aux; struct pci_attach_args *pa = &apa->apa_pci_args; if (agp_lookup(pa) == NULL) return (0); return (1); } static const u_int agp_max[][2] = { {0, 0}, {32, 4}, {64, 28}, {128, 96}, {256, 204}, {512, 440}, {1024, 942}, {2048, 1920}, {4096, 3932} }; #define agp_max_size (sizeof(agp_max) / sizeof(agp_max[0])) static void agpattach(device_t parent, device_t self, void *aux) { struct agpbus_attach_args *apa = aux; struct pci_attach_args *pa = &apa->apa_pci_args; struct agp_softc *sc = device_private(self); const struct agp_product *ap; int ret; u_int memsize, i; ap = agp_lookup(pa); KASSERT(ap != NULL); aprint_naive(": AGP controller\n"); sc->as_dev = self; sc->as_dmat = pa->pa_dmat; sc->as_pc = pa->pa_pc; sc->as_tag = pa->pa_tag; sc->as_id = pa->pa_id; /* * Work out an upper bound for agp memory allocation. This * uses a heuristic table from the Linux driver. 
*/ memsize = physmem >> (20 - PAGE_SHIFT); /* memsize is in MB */ for (i = 0; i < agp_max_size; i++) { if (memsize <= agp_max[i][0]) break; } if (i == agp_max_size) i = agp_max_size - 1; sc->as_maxmem = agp_max[i][1] << 20U; /* * The mutex is used to prevent re-entry to * agp_generic_bind_memory() since that function can sleep. */ mutex_init(&sc->as_mtx, MUTEX_DEFAULT, IPL_NONE); TAILQ_INIT(&sc->as_memory); ret = (*ap->ap_attach)(parent, self, pa); if (ret == 0) aprint_normal(": aperture at 0x%lx, size 0x%lx\n", (unsigned long)sc->as_apaddr, (unsigned long)AGP_GET_APERTURE(sc)); else sc->as_chipc = NULL; if (!pmf_device_register(self, NULL, agp_resume)) aprint_error_dev(self, "couldn't establish power handler\n"); } CFATTACH_DECL_NEW(agp, sizeof(struct agp_softc), agpmatch, agpattach, NULL, NULL); int agp_map_aperture(struct pci_attach_args *pa, struct agp_softc *sc, int reg) { /* * Find the aperture. Don't map it (yet), this would * eat KVA. */ if (pci_mapreg_info(pa->pa_pc, pa->pa_tag, reg, PCI_MAPREG_TYPE_MEM, &sc->as_apaddr, &sc->as_apsize, &sc->as_apflags) != 0) return ENXIO; sc->as_apt = pa->pa_memt; return 0; } struct agp_gatt * agp_alloc_gatt(struct agp_softc *sc) { u_int32_t apsize = AGP_GET_APERTURE(sc); u_int32_t entries = apsize >> AGP_PAGE_SHIFT; struct agp_gatt *gatt; void *virtual; int dummyseg; gatt = malloc(sizeof(struct agp_gatt), M_AGP, M_WAITOK); gatt->ag_entries = entries; if (agp_alloc_dmamem(sc->as_dmat, entries * sizeof(u_int32_t), 0, &gatt->ag_dmamap, &virtual, &gatt->ag_physical, &gatt->ag_dmaseg, 1, &dummyseg) != 0) { free(gatt, M_AGP); return NULL; } gatt->ag_virtual = (uint32_t *)virtual; gatt->ag_size = entries * sizeof(u_int32_t); memset(gatt->ag_virtual, 0, gatt->ag_size); agp_flush_cache(); return gatt; } void agp_free_gatt(struct agp_softc *sc, struct agp_gatt *gatt) { agp_free_dmamem(sc->as_dmat, gatt->ag_size, gatt->ag_dmamap, (void *)gatt->ag_virtual, &gatt->ag_dmaseg, 1); free(gatt, M_AGP); } int agp_generic_detach(struct agp_softc *sc) { mutex_destroy(&sc->as_mtx); agp_flush_cache(); return 0; } static int agpdev_match(const struct pci_attach_args *pa) { if (PCI_CLASS(pa->pa_class) == PCI_CLASS_DISPLAY && PCI_SUBCLASS(pa->pa_class) == PCI_SUBCLASS_DISPLAY_VGA) if (pci_get_capability(pa->pa_pc, pa->pa_tag, PCI_CAP_AGP, NULL, NULL)) return 1; return 0; } int agp_generic_enable(struct agp_softc *sc, u_int32_t mode) { struct pci_attach_args pa; pcireg_t tstatus, mstatus; int capoff; if (pci_find_device(&pa, agpdev_match) == 0 || pci_get_capability(pa.pa_pc, pa.pa_tag, PCI_CAP_AGP, &capoff, NULL) == 0) { aprint_error_dev(sc->as_dev, "can't find display\n"); return ENXIO; } tstatus = pci_conf_read(sc->as_pc, sc->as_tag, sc->as_capoff + PCI_AGP_STATUS); mstatus = pci_conf_read(pa.pa_pc, pa.pa_tag, capoff + PCI_AGP_STATUS); if (AGP_MODE_GET_MODE_3(mode) && AGP_MODE_GET_MODE_3(tstatus) && AGP_MODE_GET_MODE_3(mstatus)) return agp_generic_enable_v3(sc, &pa, capoff, mode); else return agp_generic_enable_v2(sc, &pa, capoff, mode); } static int agp_generic_enable_v2(struct agp_softc *sc, const struct pci_attach_args *pa, int capoff, u_int32_t mode) { pcireg_t tstatus, mstatus; pcireg_t command; int rq, sba, fw, rate; tstatus = pci_conf_read(sc->as_pc, sc->as_tag, sc->as_capoff + PCI_AGP_STATUS); mstatus = pci_conf_read(pa->pa_pc, pa->pa_tag, capoff + PCI_AGP_STATUS); /* Set RQ to the min of mode, tstatus and mstatus */ rq = AGP_MODE_GET_RQ(mode); if (AGP_MODE_GET_RQ(tstatus) < rq) rq = AGP_MODE_GET_RQ(tstatus); if (AGP_MODE_GET_RQ(mstatus) < rq) rq = 
AGP_MODE_GET_RQ(mstatus); /* Set SBA if all three can deal with SBA */ sba = (AGP_MODE_GET_SBA(tstatus) & AGP_MODE_GET_SBA(mstatus) & AGP_MODE_GET_SBA(mode)); /* Similar for FW */ fw = (AGP_MODE_GET_FW(tstatus) & AGP_MODE_GET_FW(mstatus) & AGP_MODE_GET_FW(mode)); /* Figure out the max rate */ rate = (AGP_MODE_GET_RATE(tstatus) & AGP_MODE_GET_RATE(mstatus) & AGP_MODE_GET_RATE(mode)); if (rate & AGP_MODE_V2_RATE_4x) rate = AGP_MODE_V2_RATE_4x; else if (rate & AGP_MODE_V2_RATE_2x) rate = AGP_MODE_V2_RATE_2x; else rate = AGP_MODE_V2_RATE_1x; /* Construct the new mode word and tell the hardware */ command = AGP_MODE_SET_RQ(0, rq); command = AGP_MODE_SET_SBA(command, sba); command = AGP_MODE_SET_FW(command, fw); command = AGP_MODE_SET_RATE(command, rate); command = AGP_MODE_SET_AGP(command, 1); pci_conf_write(sc->as_pc, sc->as_tag, sc->as_capoff + PCI_AGP_COMMAND, command); pci_conf_write(pa->pa_pc, pa->pa_tag, capoff + PCI_AGP_COMMAND, command); return 0; } static int agp_generic_enable_v3(struct agp_softc *sc, const struct pci_attach_args *pa, int capoff, u_int32_t mode) { pcireg_t tstatus, mstatus; pcireg_t command; int rq, sba, fw, rate, arqsz, cal; tstatus = pci_conf_read(sc->as_pc, sc->as_tag, sc->as_capoff + PCI_AGP_STATUS); mstatus = pci_conf_read(pa->pa_pc, pa->pa_tag, capoff + PCI_AGP_STATUS); /* Set RQ to the min of mode, tstatus and mstatus */ rq = AGP_MODE_GET_RQ(mode); if (AGP_MODE_GET_RQ(tstatus) < rq) rq = AGP_MODE_GET_RQ(tstatus); if (AGP_MODE_GET_RQ(mstatus) < rq) rq = AGP_MODE_GET_RQ(mstatus); /* * ARQSZ - Set the value to the maximum one. * Don't allow the mode register to override values. */ arqsz = AGP_MODE_GET_ARQSZ(mode); if (AGP_MODE_GET_ARQSZ(tstatus) > arqsz) arqsz = AGP_MODE_GET_ARQSZ(tstatus); if (AGP_MODE_GET_ARQSZ(mstatus) > arqsz) arqsz = AGP_MODE_GET_ARQSZ(mstatus); /* Calibration cycle - don't allow override by mode register */ cal = AGP_MODE_GET_CAL(tstatus); if (AGP_MODE_GET_CAL(mstatus) < cal) cal = AGP_MODE_GET_CAL(mstatus); /* SBA must be supported for AGP v3. */ sba = 1; /* Set FW if all three support it. 
*/ fw = (AGP_MODE_GET_FW(tstatus) & AGP_MODE_GET_FW(mstatus) & AGP_MODE_GET_FW(mode)); /* Figure out the max rate */ rate = (AGP_MODE_GET_RATE(tstatus) & AGP_MODE_GET_RATE(mstatus) & AGP_MODE_GET_RATE(mode)); if (rate & AGP_MODE_V3_RATE_8x) rate = AGP_MODE_V3_RATE_8x; else rate = AGP_MODE_V3_RATE_4x; /* Construct the new mode word and tell the hardware */ command = AGP_MODE_SET_RQ(0, rq); command = AGP_MODE_SET_ARQSZ(command, arqsz); command = AGP_MODE_SET_CAL(command, cal); command = AGP_MODE_SET_SBA(command, sba); command = AGP_MODE_SET_FW(command, fw); command = AGP_MODE_SET_RATE(command, rate); command = AGP_MODE_SET_AGP(command, 1); pci_conf_write(sc->as_pc, sc->as_tag, sc->as_capoff + PCI_AGP_COMMAND, command); pci_conf_write(pa->pa_pc, pa->pa_tag, capoff + PCI_AGP_COMMAND, command); return 0; } struct agp_memory * agp_generic_alloc_memory(struct agp_softc *sc, int type, vsize_t size) { struct agp_memory *mem; if ((size & (AGP_PAGE_SIZE - 1)) != 0) return 0; if (sc->as_allocated + size > sc->as_maxmem) return 0; if (type != 0) { printf("agp_generic_alloc_memory: unsupported type %d\n", type); return 0; } mem = malloc(sizeof *mem, M_AGP, M_WAITOK); if (mem == NULL) return NULL; if (bus_dmamap_create(sc->as_dmat, size, size / PAGE_SIZE + 1, size, 0, BUS_DMA_NOWAIT, &mem->am_dmamap) != 0) { free(mem, M_AGP); return NULL; } mem->am_id = sc->as_nextid++; mem->am_size = size; mem->am_type = 0; mem->am_physical = 0; mem->am_offset = 0; mem->am_is_bound = 0; TAILQ_INSERT_TAIL(&sc->as_memory, mem, am_link); sc->as_allocated += size; return mem; } int agp_generic_free_memory(struct agp_softc *sc, struct agp_memory *mem) { if (mem->am_is_bound) return EBUSY; sc->as_allocated -= mem->am_size; TAILQ_REMOVE(&sc->as_memory, mem, am_link); bus_dmamap_destroy(sc->as_dmat, mem->am_dmamap); free(mem, M_AGP); return 0; } int agp_generic_bind_memory(struct agp_softc *sc, struct agp_memory *mem, off_t offset) { return agp_generic_bind_memory_bounded(sc, mem, offset, 0, AGP_GET_APERTURE(sc)); } int agp_generic_bind_memory_bounded(struct agp_softc *sc, struct agp_memory *mem, off_t offset, off_t start, off_t end) { off_t i, k; bus_size_t done, j; int error; bus_dma_segment_t *segs, *seg; bus_addr_t pa; int contigpages, nseg; mutex_enter(&sc->as_mtx); if (mem->am_is_bound) { aprint_error_dev(sc->as_dev, "memory already bound\n"); mutex_exit(&sc->as_mtx); return EINVAL; } if (offset < start || (offset & (AGP_PAGE_SIZE - 1)) != 0 || offset > end || mem->am_size > (end - offset)) { aprint_error_dev(sc->as_dev, "binding memory at bad offset %#lx\n", (unsigned long) offset); mutex_exit(&sc->as_mtx); return EINVAL; } /* * XXXfvdl * The memory here needs to be directly accessible from the * AGP video card, so it should be allocated using bus_dma. * However, it need not be contiguous, since individual pages * are translated using the GATT. * * Using a large chunk of contiguous memory may get in the way * of other subsystems that may need one, so we try to be friendly * and ask for allocation in chunks of a minimum of 8 pages * of contiguous memory on average, falling back to 4, 2 and 1 * if really needed. Larger chunks are preferred, since allocating * a bus_dma_segment per page would be overkill. 
*/ for (contigpages = 8; contigpages > 0; contigpages >>= 1) { nseg = (mem->am_size / (contigpages * PAGE_SIZE)) + 1; segs = malloc(nseg * sizeof *segs, M_AGP, M_WAITOK); if (segs == NULL) { mutex_exit(&sc->as_mtx); return ENOMEM; } if (bus_dmamem_alloc(sc->as_dmat, mem->am_size, PAGE_SIZE, 0, segs, nseg, &mem->am_nseg, contigpages > 1 ? BUS_DMA_NOWAIT : BUS_DMA_WAITOK) != 0) { free(segs, M_AGP); continue; } if (bus_dmamem_map(sc->as_dmat, segs, mem->am_nseg, mem->am_size, &mem->am_virtual, BUS_DMA_WAITOK) != 0) { bus_dmamem_free(sc->as_dmat, segs, mem->am_nseg); free(segs, M_AGP); continue; } if (bus_dmamap_load(sc->as_dmat, mem->am_dmamap, mem->am_virtual, mem->am_size, NULL, BUS_DMA_WAITOK) != 0) { bus_dmamem_unmap(sc->as_dmat, mem->am_virtual, mem->am_size); bus_dmamem_free(sc->as_dmat, segs, mem->am_nseg); free(segs, M_AGP); continue; } mem->am_dmaseg = segs; break; } if (contigpages == 0) { mutex_exit(&sc->as_mtx); return ENOMEM; } /* * Bind the individual pages and flush the chipset's * TLB. */ done = 0; for (i = 0; i < mem->am_dmamap->dm_nsegs; i++) { seg = &mem->am_dmamap->dm_segs[i]; /* * Install entries in the GATT, making sure that if * AGP_PAGE_SIZE < PAGE_SIZE and mem->am_size is not * aligned to PAGE_SIZE, we don't modify too many GATT * entries. */ for (j = 0; j < seg->ds_len && (done + j) < mem->am_size; j += AGP_PAGE_SIZE) { pa = seg->ds_addr + j; AGP_DPF(("binding offset %#lx to pa %#lx\n", (unsigned long)(offset + done + j), (unsigned long)pa)); error = AGP_BIND_PAGE(sc, offset + done + j, pa); if (error) { /* * Bail out. Reverse all the mappings * and unwire the pages. */ for (k = 0; k < done + j; k += AGP_PAGE_SIZE) AGP_UNBIND_PAGE(sc, offset + k); bus_dmamap_unload(sc->as_dmat, mem->am_dmamap); bus_dmamem_unmap(sc->as_dmat, mem->am_virtual, mem->am_size); bus_dmamem_free(sc->as_dmat, mem->am_dmaseg, mem->am_nseg); free(mem->am_dmaseg, M_AGP); mutex_exit(&sc->as_mtx); return error; } } done += seg->ds_len; } /* * Flush the CPU cache since we are providing a new mapping * for these pages. */ agp_flush_cache(); /* * Make sure the chipset gets the new mappings. */ AGP_FLUSH_TLB(sc); mem->am_offset = offset; mem->am_is_bound = 1; mutex_exit(&sc->as_mtx); return 0; } int agp_generic_unbind_memory(struct agp_softc *sc, struct agp_memory *mem) { int i; mutex_enter(&sc->as_mtx); if (!mem->am_is_bound) { aprint_error_dev(sc->as_dev, "memory is not bound\n"); mutex_exit(&sc->as_mtx); return EINVAL; } /* * Unbind the individual pages and flush the chipset's * TLB. Unwire the pages so they can be swapped. 
*/ for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE) AGP_UNBIND_PAGE(sc, mem->am_offset + i); agp_flush_cache(); AGP_FLUSH_TLB(sc); bus_dmamap_unload(sc->as_dmat, mem->am_dmamap); bus_dmamem_unmap(sc->as_dmat, mem->am_virtual, mem->am_size); bus_dmamem_free(sc->as_dmat, mem->am_dmaseg, mem->am_nseg); free(mem->am_dmaseg, M_AGP); mem->am_offset = 0; mem->am_is_bound = 0; mutex_exit(&sc->as_mtx); return 0; } /* Helper functions for implementing user/kernel api */ static int agp_acquire_helper(struct agp_softc *sc, enum agp_acquire_state state) { if (sc->as_state != AGP_ACQUIRE_FREE) return EBUSY; sc->as_state = state; return 0; } static int agp_release_helper(struct agp_softc *sc, enum agp_acquire_state state) { if (sc->as_state == AGP_ACQUIRE_FREE) return 0; if (sc->as_state != state) return EBUSY; sc->as_state = AGP_ACQUIRE_FREE; return 0; } static struct agp_memory * agp_find_memory(struct agp_softc *sc, int id) { struct agp_memory *mem; AGP_DPF(("searching for memory block %d\n", id)); TAILQ_FOREACH(mem, &sc->as_memory, am_link) { AGP_DPF(("considering memory block %d\n", mem->am_id)); if (mem->am_id == id) return mem; } return 0; } /* Implementation of the userland ioctl api */ static int agp_info_user(struct agp_softc *sc, agp_info *info) { memset(info, 0, sizeof *info); info->bridge_id = sc->as_id; if (sc->as_capoff != 0) info->agp_mode = pci_conf_read(sc->as_pc, sc->as_tag, sc->as_capoff + PCI_AGP_STATUS); else info->agp_mode = 0; /* i810 doesn't have real AGP */ info->aper_base = sc->as_apaddr; info->aper_size = AGP_GET_APERTURE(sc) >> 20; info->pg_total = info->pg_system = sc->as_maxmem >> AGP_PAGE_SHIFT; info->pg_used = sc->as_allocated >> AGP_PAGE_SHIFT; return 0; } static int agp_setup_user(struct agp_softc *sc, agp_setup *setup) { return AGP_ENABLE(sc, setup->agp_mode); } static int agp_allocate_user(struct agp_softc *sc, agp_allocate *alloc) { struct agp_memory *mem; mem = AGP_ALLOC_MEMORY(sc, alloc->type, alloc->pg_count << AGP_PAGE_SHIFT); if (mem) { alloc->key = mem->am_id; alloc->physical = mem->am_physical; return 0; } else { return ENOMEM; } } static int agp_deallocate_user(struct agp_softc *sc, int id) { struct agp_memory *mem = agp_find_memory(sc, id); if (mem) { AGP_FREE_MEMORY(sc, mem); return 0; } else { return ENOENT; } } static int agp_bind_user(struct agp_softc *sc, agp_bind *bind) { struct agp_memory *mem = agp_find_memory(sc, bind->key); if (!mem) return ENOENT; return AGP_BIND_MEMORY(sc, mem, bind->pg_start << AGP_PAGE_SHIFT); } static int agp_unbind_user(struct agp_softc *sc, agp_unbind *unbind) { struct agp_memory *mem = agp_find_memory(sc, unbind->key); if (!mem) return ENOENT; return AGP_UNBIND_MEMORY(sc, mem); } static int agpopen(dev_t dev, int oflags, int devtype, struct lwp *l) { struct agp_softc *sc = device_lookup_private(&agp_cd, AGPUNIT(dev)); if (sc == NULL) return ENXIO; if (sc->as_chipc == NULL) return ENXIO; if (!sc->as_isopen) sc->as_isopen = 1; else return EBUSY; return 0; } static int agpclose(dev_t dev, int fflag, int devtype, struct lwp *l) { struct agp_softc *sc = device_lookup_private(&agp_cd, AGPUNIT(dev)); struct agp_memory *mem; if (sc == NULL) return ENODEV; /* * Clear the GATT and force release on last close */ if (sc->as_state == AGP_ACQUIRE_USER) { while ((mem = TAILQ_FIRST(&sc->as_memory))) { if (mem->am_is_bound) { printf("agpclose: mem %d is bound\n", mem->am_id); AGP_UNBIND_MEMORY(sc, mem); } /* * XXX it is not documented, but if the protocol allows * allocate->acquire->bind, it would be possible that * memory ranges are 
allocated by the kernel here, * which we shouldn't free. We'd have to keep track of * the memory range's owner. * The kernel API is unsed yet, so we get away with * freeing all. */ AGP_FREE_MEMORY(sc, mem); } agp_release_helper(sc, AGP_ACQUIRE_USER); } sc->as_isopen = 0; return 0; } static int agpioctl(dev_t dev, u_long cmd, void *data, int fflag, struct lwp *l) { struct agp_softc *sc = device_lookup_private(&agp_cd, AGPUNIT(dev)); if (sc == NULL) return ENODEV; if ((fflag & FWRITE) == 0 && cmd != AGPIOC_INFO) return EPERM; switch (cmd) { case AGPIOC_INFO: return agp_info_user(sc, (agp_info *) data); case AGPIOC_ACQUIRE: return agp_acquire_helper(sc, AGP_ACQUIRE_USER); case AGPIOC_RELEASE: return agp_release_helper(sc, AGP_ACQUIRE_USER); case AGPIOC_SETUP: return agp_setup_user(sc, (agp_setup *)data); #ifdef __x86_64__ { /* * Handle paddr_t change from 32 bit for non PAE kernels * to 64 bit. */ #define AGPIOC_OALLOCATE _IOWR(AGPIOC_BASE, 6, agp_oallocate) typedef struct _agp_oallocate { int key; /* tag of allocation */ size_t pg_count; /* number of pages */ uint32_t type; /* 0 == normal, other devspec */ u_long physical; /* device specific (some devices * need a phys address of the * actual page behind the gatt * table) */ } agp_oallocate; case AGPIOC_OALLOCATE: { int ret; agp_allocate aga; agp_oallocate *oaga = data; aga.type = oaga->type; aga.pg_count = oaga->pg_count; if ((ret = agp_allocate_user(sc, &aga)) == 0) { oaga->key = aga.key; oaga->physical = (u_long)aga.physical; } return ret; } } #endif case AGPIOC_ALLOCATE: return agp_allocate_user(sc, (agp_allocate *)data); case AGPIOC_DEALLOCATE: return agp_deallocate_user(sc, *(int *) data); case AGPIOC_BIND: return agp_bind_user(sc, (agp_bind *)data); case AGPIOC_UNBIND: return agp_unbind_user(sc, (agp_unbind *)data); } return EINVAL; } static paddr_t agpmmap(dev_t dev, off_t offset, int prot) { struct agp_softc *sc = device_lookup_private(&agp_cd, AGPUNIT(dev)); if (sc == NULL) return ENODEV; if (offset > AGP_GET_APERTURE(sc)) return -1; return (bus_space_mmap(sc->as_apt, sc->as_apaddr, offset, prot, BUS_SPACE_MAP_LINEAR)); } const struct cdevsw agp_cdevsw = { .d_open = agpopen, .d_close = agpclose, .d_read = noread, .d_write = nowrite, .d_ioctl = agpioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = agpmmap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER }; /* Implementation of the kernel api */ void * agp_find_device(int unit) { return device_lookup_private(&agp_cd, unit); } enum agp_acquire_state agp_state(void *devcookie) { struct agp_softc *sc = devcookie; return sc->as_state; } void agp_get_info(void *devcookie, struct agp_info *info) { struct agp_softc *sc = devcookie; info->ai_mode = pci_conf_read(sc->as_pc, sc->as_tag, sc->as_capoff + PCI_AGP_STATUS); info->ai_aperture_base = sc->as_apaddr; info->ai_aperture_size = sc->as_apsize; /* XXXfvdl inconsistent */ info->ai_memory_allowed = sc->as_maxmem; info->ai_memory_used = sc->as_allocated; info->ai_devid = sc->as_id; } int agp_acquire(void *dev) { return agp_acquire_helper(dev, AGP_ACQUIRE_KERNEL); } int agp_release(void *dev) { return agp_release_helper(dev, AGP_ACQUIRE_KERNEL); } int agp_enable(void *dev, u_int32_t mode) { struct agp_softc *sc = dev; return AGP_ENABLE(sc, mode); } void * agp_alloc_memory(void *dev, int type, vsize_t bytes) { struct agp_softc *sc = dev; return (void *)AGP_ALLOC_MEMORY(sc, type, bytes); } void agp_free_memory(void *dev, void *handle) { struct agp_softc *sc = dev; struct agp_memory *mem = handle; 
AGP_FREE_MEMORY(sc, mem); } int agp_bind_memory(void *dev, void *handle, off_t offset) { struct agp_softc *sc = dev; struct agp_memory *mem = handle; return AGP_BIND_MEMORY(sc, mem, offset); } int agp_unbind_memory(void *dev, void *handle) { struct agp_softc *sc = dev; struct agp_memory *mem = handle; return AGP_UNBIND_MEMORY(sc, mem); } void agp_memory_info(void *dev, void *handle, struct agp_memory_info *mi) { struct agp_memory *mem = handle; mi->ami_size = mem->am_size; mi->ami_physical = mem->am_physical; mi->ami_offset = mem->am_offset; mi->ami_is_bound = mem->am_is_bound; } int agp_alloc_dmamem(bus_dma_tag_t tag, size_t size, int flags, bus_dmamap_t *mapp, void **vaddr, bus_addr_t *baddr, bus_dma_segment_t *seg, int nseg, int *rseg) { int error, level = 0; if ((error = bus_dmamem_alloc(tag, size, PAGE_SIZE, 0, seg, nseg, rseg, BUS_DMA_NOWAIT)) != 0) goto out; level++; if ((error = bus_dmamem_map(tag, seg, *rseg, size, vaddr, BUS_DMA_NOWAIT | flags)) != 0) goto out; level++; if ((error = bus_dmamap_create(tag, size, *rseg, size, 0, BUS_DMA_NOWAIT, mapp)) != 0) goto out; level++; if ((error = bus_dmamap_load(tag, *mapp, *vaddr, size, NULL, BUS_DMA_NOWAIT)) != 0) goto out; *baddr = (*mapp)->dm_segs[0].ds_addr; return 0; out: switch (level) { case 3: bus_dmamap_destroy(tag, *mapp); /* FALLTHROUGH */ case 2: bus_dmamem_unmap(tag, *vaddr, size); /* FALLTHROUGH */ case 1: bus_dmamem_free(tag, seg, *rseg); break; default: break; } return error; } void agp_free_dmamem(bus_dma_tag_t tag, size_t size, bus_dmamap_t map, void *vaddr, bus_dma_segment_t *seg, int nseg) { bus_dmamap_unload(tag, map); bus_dmamap_destroy(tag, map); bus_dmamem_unmap(tag, vaddr, size); bus_dmamem_free(tag, seg, nseg); } static bool agp_resume(device_t dv, const pmf_qual_t *qual) { agp_flush_cache(); return true; }
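/*
 * Minimal sketch of how an in-kernel consumer might drive the kernel
 * API above: look up the AGP bridge, acquire it, enable the bus, then
 * allocate and bind aperture memory.  The unit number, allocation size
 * and bind offset are arbitrary example values, and error handling
 * (including agp_release() on failure) is abbreviated.
 */
#if 0
static int
example_agp_setup(void)
{
	struct agp_info info;
	void *agp, *handle;
	int error;

	agp = agp_find_device(0);		/* AGP unit 0 */
	if (agp == NULL)
		return ENXIO;
	if ((error = agp_acquire(agp)) != 0)
		return error;
	agp_get_info(agp, &info);
	if ((error = agp_enable(agp, info.ai_mode)) != 0)
		return error;

	/* One aperture page of normal (type 0) memory, bound at offset 0. */
	handle = agp_alloc_memory(agp, 0, AGP_PAGE_SIZE);
	if (handle == NULL)
		return ENOMEM;
	return agp_bind_memory(agp, handle, 0);
}
#endif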
/* $NetBSD: usb_subr.c,v 1.279 2024/05/04 12:45:13 mlelstv Exp $ */ /* $FreeBSD: src/sys/dev/usb/usb_subr.c,v 1.18 1999/11/17 22:33:47 n_hibma Exp $ */ /* * Copyright (c) 1998, 2004 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: usb_subr.c,v 1.279 2024/05/04 12:45:13 mlelstv Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_usb.h" #include "opt_usbverbose.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/device.h> #include <sys/select.h> #include <sys/proc.h> #include <sys/bus.h> #include <sys/module.h> #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> #include <dev/usb/usbdi_util.h> #include <dev/usb/usbdivar.h> #include <dev/usb/usbdevs.h> #include <dev/usb/usb_quirks.h> #include <dev/usb/usb_verbose.h> #include <dev/usb/usbhist.h> #include "locators.h" #define DPRINTF(FMT,A,B,C,D) USBHIST_LOG(usbdebug,FMT,A,B,C,D) #define DPRINTFN(N,FMT,A,B,C,D) USBHIST_LOGN(usbdebug,N,FMT,A,B,C,D) Static void usbd_devinfo(struct usbd_device *, int, char *, size_t); Static int usbd_getnewaddr(struct usbd_bus *); Static int usbd_print(void *, const char *); Static int usbd_ifprint(void *, const char *); Static void usbd_free_iface_data(struct usbd_device *, int); uint32_t usb_cookie_no = 0; Static const char * const usbd_error_strs[] = { "NORMAL_COMPLETION", "IN_PROGRESS", "PENDING_REQUESTS", "NOT_STARTED", "INVAL", "NOMEM", "CANCELLED", "BAD_ADDRESS", "IN_USE", "NO_ADDR", "SET_ADDR_FAILED", "NO_POWER", "TOO_DEEP", "IOERROR", "NOT_CONFIGURED", "TIMEOUT", "SHORT_XFER", "STALLED", "INTERRUPTED", "XXX", }; DEV_VERBOSE_DEFINE(usb); const char * usbd_errstr(usbd_status err) { static char buffer[5]; if (err < USBD_ERROR_MAX) { return usbd_error_strs[err]; } else { snprintf(buffer, sizeof(buffer), "%d", err); return buffer; } } static void usbd_trim_spaces(char *p) { char *q, *e; q = e = p; while (*q == ' ') /* skip leading spaces */ q++; while ((*p = *q++)) /* copy string */ if (*p++ != ' ') /* remember last non-space */ e = p; *e = '\0'; /* kill trailing spaces */ } static void usbd_get_device_string(struct usbd_device *ud, uByte index, char **buf) { char *b; usbd_status err; b = kmem_alloc(USB_MAX_ENCODED_STRING_LEN, KM_SLEEP); err = usbd_get_string0(ud, index, b, true); if (err != USBD_NORMAL_COMPLETION) { kmem_free(b, USB_MAX_ENCODED_STRING_LEN); b = NULL; } else { usbd_trim_spaces(b); } *buf = b; } void usbd_get_device_strings(struct usbd_device *ud) { usb_device_descriptor_t *udd = &ud->ud_ddesc; usbd_get_device_string(ud, udd->iManufacturer, &ud->ud_vendor); usbd_get_device_string(ud, udd->iProduct, &ud->ud_product); usbd_get_device_string(ud, udd->iSerialNumber, &ud->ud_serial); } void usbd_devinfo_vp(struct usbd_device *dev, char *v, size_t vl, char *p, size_t pl, int usedev, int useencoded) { usb_device_descriptor_t *udd = &dev->ud_ddesc; if (dev == NULL) return; v[0] = p[0] = '\0'; if (usedev) { if (usbd_get_string0(dev, udd->iManufacturer, v, useencoded) == USBD_NORMAL_COMPLETION) usbd_trim_spaces(v); if (usbd_get_string0(dev, udd->iProduct, p, useencoded) == USBD_NORMAL_COMPLETION) usbd_trim_spaces(p); } else { if (dev->ud_vendor) { strlcpy(v, dev->ud_vendor, vl); } if (dev->ud_product) { strlcpy(p, dev->ud_product, pl); } } if (v[0] == '\0') usb_findvendor(v, vl, UGETW(udd->idVendor)); if (p[0] == '\0') usb_findproduct(p, pl, UGETW(udd->idVendor), UGETW(udd->idProduct)); } int usbd_printBCD(char *cp, size_t l, int bcd) { return snprintf(cp, l, "%x.%02x", bcd >> 8, bcd & 0xff); } Static void usbd_devinfo(struct usbd_device *dev, int showclass, char *cp, size_t l) { usb_device_descriptor_t *udd = &dev->ud_ddesc; char *vendor, *product; int bcdDevice, bcdUSB; char *ep; vendor = 
kmem_alloc(USB_MAX_ENCODED_STRING_LEN * 2, KM_SLEEP); product = &vendor[USB_MAX_ENCODED_STRING_LEN]; ep = cp + l; usbd_devinfo_vp(dev, vendor, USB_MAX_ENCODED_STRING_LEN, product, USB_MAX_ENCODED_STRING_LEN, 0, 1); cp += snprintf(cp, ep - cp, "%s (0x%04x) %s (0x%04x)", vendor, UGETW(udd->idVendor), product, UGETW(udd->idProduct)); if (showclass) cp += snprintf(cp, ep - cp, ", class %d/%d", udd->bDeviceClass, udd->bDeviceSubClass); bcdUSB = UGETW(udd->bcdUSB); bcdDevice = UGETW(udd->bcdDevice); cp += snprintf(cp, ep - cp, ", rev "); cp += usbd_printBCD(cp, ep - cp, bcdUSB); *cp++ = '/'; cp += usbd_printBCD(cp, ep - cp, bcdDevice); cp += snprintf(cp, ep - cp, ", addr %d", dev->ud_addr); *cp = 0; kmem_free(vendor, USB_MAX_ENCODED_STRING_LEN * 2); } char * usbd_devinfo_alloc(struct usbd_device *dev, int showclass) { char *devinfop; devinfop = kmem_alloc(DEVINFOSIZE, KM_SLEEP); usbd_devinfo(dev, showclass, devinfop, DEVINFOSIZE); return devinfop; } void usbd_devinfo_free(char *devinfop) { kmem_free(devinfop, DEVINFOSIZE); } /* Delay for a certain number of ms */ void usb_delay_ms_locked(struct usbd_bus *bus, u_int ms, kmutex_t *lock) { /* Wait at least two clock ticks so we know the time has passed. */ if (bus->ub_usepolling || cold) delay((ms+1) * 1000); else kpause("usbdly", false, (ms*hz+999)/1000 + 1, lock); } void usb_delay_ms(struct usbd_bus *bus, u_int ms) { usb_delay_ms_locked(bus, ms, NULL); } /* Delay given a device handle. */ void usbd_delay_ms_locked(struct usbd_device *dev, u_int ms, kmutex_t *lock) { usb_delay_ms_locked(dev->ud_bus, ms, lock); } /* Delay given a device handle. */ void usbd_delay_ms(struct usbd_device *dev, u_int ms) { usb_delay_ms_locked(dev->ud_bus, ms, NULL); } usbd_status usbd_reset_port(struct usbd_device *dev, int port, usb_port_status_t *ps) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "port %jd", port, 0, 0, 0); usb_device_request_t req; usbd_status err; int n; req.bmRequestType = UT_WRITE_CLASS_OTHER; req.bRequest = UR_SET_FEATURE; USETW(req.wValue, UHF_PORT_RESET); USETW(req.wIndex, port); USETW(req.wLength, 0); err = usbd_do_request(dev, &req, 0); DPRINTFN(1, "port %jd reset done, error=%jd", port, err, 0, 0); if (err) return err; n = 10; do { /* Wait for device to recover from reset. */ usbd_delay_ms(dev, USB_PORT_RESET_DELAY); err = usbd_get_port_status(dev, port, ps); if (err) { DPRINTF("get status failed %jd", err, 0, 0, 0); return err; } /* If the device disappeared, just give up. */ if (!(UGETW(ps->wPortStatus) & UPS_CURRENT_CONNECT_STATUS)) return USBD_NORMAL_COMPLETION; } while ((UGETW(ps->wPortChange) & UPS_C_PORT_RESET) == 0 && --n > 0); if (n == 0) return USBD_TIMEOUT; err = usbd_clear_port_feature(dev, port, UHF_C_PORT_RESET); #ifdef USB_DEBUG if (err) DPRINTF("clear port feature failed %jd", err, 0, 0, 0); #endif /* Wait for the device to recover from reset. 
*/ usbd_delay_ms(dev, USB_PORT_RESET_RECOVERY); return err; } usb_interface_descriptor_t * usbd_find_idesc(usb_config_descriptor_t *cd, int ifaceidx, int altidx) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "iface/alt idx %jd/%jd", ifaceidx, altidx, 0, 0); char *p = (char *)cd; char *end = p + UGETW(cd->wTotalLength); usb_descriptor_t *desc; usb_interface_descriptor_t *idesc; int curidx, lastidx, curaidx = 0; for (curidx = lastidx = -1; end - p >= sizeof(*desc);) { desc = (usb_descriptor_t *)p; DPRINTFN(4, "idx=%jd(%jd) altidx=%jd(%jd)", ifaceidx, curidx, altidx, curaidx); DPRINTFN(4, "len=%jd type=%jd", desc->bLength, desc->bDescriptorType, 0, 0); if (desc->bLength < USB_DESCRIPTOR_SIZE) break; if (desc->bLength > end - p) break; p += desc->bLength; if (desc->bDescriptorType != UDESC_INTERFACE) continue; if (desc->bLength < USB_INTERFACE_DESCRIPTOR_SIZE) break; idesc = (usb_interface_descriptor_t *)desc; if (idesc->bInterfaceNumber != lastidx) { lastidx = idesc->bInterfaceNumber; curidx++; curaidx = 0; } else { curaidx++; } if (ifaceidx == curidx && altidx == curaidx) return idesc; } return NULL; } usb_endpoint_descriptor_t * usbd_find_edesc(usb_config_descriptor_t *cd, int ifaceidx, int altidx, int endptidx) { char *p = (char *)cd; char *end = p + UGETW(cd->wTotalLength); usb_interface_descriptor_t *idesc; usb_endpoint_descriptor_t *edesc; usb_descriptor_t *desc; int curidx; idesc = usbd_find_idesc(cd, ifaceidx, altidx); if (idesc == NULL) return NULL; if (endptidx >= idesc->bNumEndpoints) /* quick exit */ return NULL; curidx = -1; for (p = (char *)idesc + idesc->bLength; end - p >= sizeof(*edesc);) { desc = (usb_descriptor_t *)p; if (desc->bLength < USB_DESCRIPTOR_SIZE) break; if (desc->bLength > end - p) break; p += desc->bLength; if (desc->bDescriptorType == UDESC_INTERFACE) break; if (desc->bDescriptorType != UDESC_ENDPOINT) continue; if (desc->bLength < USB_ENDPOINT_DESCRIPTOR_SIZE) break; edesc = (usb_endpoint_descriptor_t *)desc; curidx++; if (curidx == endptidx) return edesc; } return NULL; } static void usbd_iface_init(struct usbd_device *dev, int ifaceidx) { struct usbd_interface *ifc = &dev->ud_ifaces[ifaceidx]; memset(ifc, 0, sizeof(*ifc)); ifc->ui_dev = dev; ifc->ui_idesc = NULL; ifc->ui_index = 0; ifc->ui_altindex = 0; ifc->ui_endpoints = NULL; ifc->ui_busy = 0; } static void usbd_iface_fini(struct usbd_device *dev, int ifaceidx) { struct usbd_interface *ifc __diagused = &dev->ud_ifaces[ifaceidx]; KASSERT(ifc->ui_dev == dev); KASSERT(ifc->ui_idesc == NULL); KASSERT(ifc->ui_index == 0); KASSERT(ifc->ui_altindex == 0); KASSERT(ifc->ui_endpoints == NULL); KASSERTMSG(ifc->ui_busy == 0, "%"PRId64, ifc->ui_busy); } /* * usbd_iface_lock/locked/unlock, usbd_iface_piperef/pipeunref * * We lock the interface while we are setting it, and we acquire a * reference to the interface for each pipe opened on it. * * Setting the interface while pipes are open is forbidden, and * opening pipes while the interface is being set is forbidden. 
*/ bool usbd_iface_locked(struct usbd_interface *iface) { bool locked; mutex_enter(iface->ui_dev->ud_bus->ub_lock); locked = (iface->ui_busy == -1); mutex_exit(iface->ui_dev->ud_bus->ub_lock); return locked; } static void usbd_iface_exlock(struct usbd_interface *iface) { mutex_enter(iface->ui_dev->ud_bus->ub_lock); KASSERTMSG(iface->ui_busy == 0, "interface is not idle," " busy=%"PRId64, iface->ui_busy); iface->ui_busy = -1; mutex_exit(iface->ui_dev->ud_bus->ub_lock); } usbd_status usbd_iface_lock(struct usbd_interface *iface) { usbd_status err; mutex_enter(iface->ui_dev->ud_bus->ub_lock); KASSERTMSG(iface->ui_busy != -1, "interface is locked"); KASSERTMSG(iface->ui_busy >= 0, "%"PRId64, iface->ui_busy); if (iface->ui_busy) { err = USBD_IN_USE; } else { iface->ui_busy = -1; err = 0; } mutex_exit(iface->ui_dev->ud_bus->ub_lock); return err; } void usbd_iface_unlock(struct usbd_interface *iface) { mutex_enter(iface->ui_dev->ud_bus->ub_lock); KASSERTMSG(iface->ui_busy == -1, "interface is not locked," " busy=%"PRId64, iface->ui_busy); iface->ui_busy = 0; mutex_exit(iface->ui_dev->ud_bus->ub_lock); } usbd_status usbd_iface_piperef(struct usbd_interface *iface) { usbd_status err; mutex_enter(iface->ui_dev->ud_bus->ub_lock); KASSERTMSG(iface->ui_busy >= -1, "%"PRId64, iface->ui_busy); if (iface->ui_busy == -1) { err = USBD_IN_USE; } else { iface->ui_busy++; err = 0; } mutex_exit(iface->ui_dev->ud_bus->ub_lock); return err; } void usbd_iface_pipeunref(struct usbd_interface *iface) { mutex_enter(iface->ui_dev->ud_bus->ub_lock); KASSERTMSG(iface->ui_busy != -1, "interface is locked"); KASSERTMSG(iface->ui_busy != 0, "interface not in use"); KASSERTMSG(iface->ui_busy >= 1, "%"PRId64, iface->ui_busy); iface->ui_busy--; mutex_exit(iface->ui_dev->ud_bus->ub_lock); } usbd_status usbd_fill_iface_data(struct usbd_device *dev, int ifaceidx, int altidx) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "ifaceidx=%jd altidx=%jd", ifaceidx, altidx, 0, 0); struct usbd_interface *ifc = &dev->ud_ifaces[ifaceidx]; usb_descriptor_t *desc; usb_interface_descriptor_t *idesc; usb_endpoint_descriptor_t *ed; struct usbd_endpoint *endpoints; char *p, *end; int endpt, nendpt; KASSERT(ifc->ui_dev == dev); KASSERT(usbd_iface_locked(ifc)); idesc = usbd_find_idesc(dev->ud_cdesc, ifaceidx, altidx); if (idesc == NULL) return USBD_INVAL; nendpt = idesc->bNumEndpoints; DPRINTFN(4, "found idesc nendpt=%jd", nendpt, 0, 0, 0); if (nendpt != 0) { endpoints = kmem_alloc(nendpt * sizeof(struct usbd_endpoint), KM_SLEEP); } else endpoints = NULL; p = (char *)idesc + idesc->bLength; end = (char *)dev->ud_cdesc + UGETW(dev->ud_cdesc->wTotalLength); KASSERTMSG((char *)dev->ud_cdesc <= (char *)idesc, "cdesc=%p idesc=%p", dev->ud_cdesc, idesc); KASSERTMSG((char *)idesc < end, "idesc=%p end=%p", idesc, end); for (endpt = 0; endpt < nendpt; endpt++) { DPRINTFN(10, "endpt=%jd", endpt, 0, 0, 0); for (; end - p >= sizeof(*desc); p += desc->bLength) { desc = (usb_descriptor_t *)p; DPRINTFN(10, "p=%#jx end=%#jx len=%jd type=%jd", (uintptr_t)p, (uintptr_t)end, desc->bLength, desc->bDescriptorType); if (desc->bLength < sizeof(*desc)) { printf("%s: bad descriptor: too short\n", __func__); goto bad; } else if (desc->bLength > end - p) { printf("%s: bad descriptor: too long\n", __func__); goto bad; } else if (desc->bDescriptorType == UDESC_INTERFACE) { printf("%s: bad descriptor: iface desc\n", __func__); goto bad; } if (desc->bLength >= USB_ENDPOINT_DESCRIPTOR_SIZE && desc->bDescriptorType == UDESC_ENDPOINT) { ed = (usb_endpoint_descriptor_t *)p; goto 
found; } } printf("%s: no desc found\n", __func__); goto bad; found: endpoints[endpt].ue_edesc = ed; if (dev->ud_speed == USB_SPEED_HIGH) { u_int mps; /* Control and bulk endpoints have max packet limits. */ switch (UE_GET_XFERTYPE(ed->bmAttributes)) { case UE_CONTROL: mps = USB_2_MAX_CTRL_PACKET; goto check; case UE_BULK: mps = USB_2_MAX_BULK_PACKET; check: if (UGETW(ed->wMaxPacketSize) != mps) { USETW(ed->wMaxPacketSize, mps); #ifdef DIAGNOSTIC printf("usbd_fill_iface_data: bad max " "packet size\n"); #endif } break; default: break; } } endpoints[endpt].ue_refcnt = 0; endpoints[endpt].ue_toggle = 0; KASSERTMSG(end - p >= ed->bLength, "p=%p end=%p length=%u", p, end, ed->bLength); p += ed->bLength; } #undef ed /* Success! Free the old endpoints and commit the changes. */ if (ifc->ui_endpoints) { kmem_free(ifc->ui_endpoints, (sizeof(ifc->ui_endpoints[0]) * ifc->ui_idesc->bNumEndpoints)); } ifc->ui_idesc = idesc; ifc->ui_index = ifaceidx; ifc->ui_altindex = altidx; ifc->ui_endpoints = endpoints; return USBD_NORMAL_COMPLETION; bad: if (endpoints) kmem_free(endpoints, nendpt * sizeof(struct usbd_endpoint)); return USBD_INVAL; } Static void usbd_free_iface_data(struct usbd_device *dev, int ifcno) { struct usbd_interface *ifc = &dev->ud_ifaces[ifcno]; KASSERT(ifc->ui_dev == dev); KASSERT(ifc->ui_idesc != NULL); KASSERT(usbd_iface_locked(ifc)); if (ifc->ui_endpoints) { int nendpt = ifc->ui_idesc->bNumEndpoints; size_t sz = nendpt * sizeof(struct usbd_endpoint); kmem_free(ifc->ui_endpoints, sz); ifc->ui_endpoints = NULL; } ifc->ui_altindex = 0; ifc->ui_index = 0; ifc->ui_idesc = NULL; } usbd_status usbd_set_config_no(struct usbd_device *dev, int no, int msg) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "%jd", no, 0, 0, 0); usb_config_descriptor_t cd; usbd_status err; int index; if (no == USB_UNCONFIG_NO) return usbd_set_config_index(dev, USB_UNCONFIG_INDEX, msg); /* Figure out what config index to use. */ for (index = 0; index < dev->ud_ddesc.bNumConfigurations; index++) { err = usbd_get_config_desc(dev, index, &cd); if (err) return err; if (cd.bConfigurationValue == no) return usbd_set_config_index(dev, index, msg); } return USBD_INVAL; } usbd_status usbd_set_config_index(struct usbd_device *dev, int index, int msg) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev=%#jx index=%jd", (uintptr_t)dev, index, 0, 0); usb_config_descriptor_t cd, *cdp; usb_bos_descriptor_t *bdp = NULL; usbd_status err; int i, ifcidx, nifc, len, selfpowered, power; if (index >= dev->ud_ddesc.bNumConfigurations && index != USB_UNCONFIG_INDEX) { /* panic? */ printf("usbd_set_config_index: illegal index\n"); return USBD_INVAL; } /* XXX check that all interfaces are idle */ if (dev->ud_config != USB_UNCONFIG_NO) { DPRINTF("free old config", 0, 0, 0, 0); /* Free all configuration data structures. */ nifc = dev->ud_cdesc->bNumInterface; for (ifcidx = 0; ifcidx < nifc; ifcidx++) { usbd_iface_exlock(&dev->ud_ifaces[ifcidx]); usbd_free_iface_data(dev, ifcidx); usbd_iface_unlock(&dev->ud_ifaces[ifcidx]); usbd_iface_fini(dev, ifcidx); } kmem_free(dev->ud_ifaces, nifc * sizeof(struct usbd_interface)); kmem_free(dev->ud_cdesc, UGETW(dev->ud_cdesc->wTotalLength)); if (dev->ud_bdesc != NULL) kmem_free(dev->ud_bdesc, UGETW(dev->ud_bdesc->wTotalLength)); dev->ud_ifaces = NULL; dev->ud_cdesc = NULL; dev->ud_bdesc = NULL; dev->ud_config = USB_UNCONFIG_NO; } if (index == USB_UNCONFIG_INDEX) { /* We are unconfiguring the device, so leave unallocated. 
*/ DPRINTF("set config 0", 0, 0, 0, 0); err = usbd_set_config(dev, USB_UNCONFIG_NO); if (err) { DPRINTF("setting config=0 failed, err = %jd", err, 0, 0, 0); } return err; } /* Get the short descriptor. */ err = usbd_get_config_desc(dev, index, &cd); if (err) { DPRINTF("get_config_desc=%jd", err, 0, 0, 0); return err; } len = UGETW(cd.wTotalLength); if (len < USB_CONFIG_DESCRIPTOR_SIZE) { DPRINTF("empty short descriptor", 0, 0, 0, 0); return USBD_INVAL; } cdp = kmem_alloc(len, KM_SLEEP); /* Get the full descriptor. Try a few times for slow devices. */ for (i = 0; i < 3; i++) { err = usbd_get_desc(dev, UDESC_CONFIG, index, len, cdp); if (!err) break; usbd_delay_ms(dev, 200); } if (err) { DPRINTF("get_desc=%jd", err, 0, 0, 0); goto bad; } if (cdp->bDescriptorType != UDESC_CONFIG) { DPRINTF("bad desc %jd", cdp->bDescriptorType, 0, 0, 0); err = USBD_INVAL; goto bad; } if (UGETW(cdp->wTotalLength) != UGETW(cd.wTotalLength)) { DPRINTF("bad len %jd", UGETW(cdp->wTotalLength), 0, 0, 0); err = USBD_INVAL; goto bad; } if (USB_IS_SS(dev->ud_speed)) { usb_bos_descriptor_t bd; /* get short bos desc */ err = usbd_get_bos_desc(dev, index, &bd); if (!err) { int blen = UGETW(bd.wTotalLength); if (blen < USB_BOS_DESCRIPTOR_SIZE) { DPRINTF("empty bos descriptor", 0, 0, 0, 0); err = USBD_INVAL; goto bad; } bdp = kmem_alloc(blen, KM_SLEEP); /* Get the full desc */ for (i = 0; i < 3; i++) { err = usbd_get_desc(dev, UDESC_BOS, index, blen, bdp); if (!err) break; usbd_delay_ms(dev, 200); } if (err || bdp->bDescriptorType != UDESC_BOS || UGETW(bdp->wTotalLength) != UGETW(bd.wTotalLength)) { DPRINTF("error %jd or bad desc %jd", err, bdp->bDescriptorType, 0, 0); kmem_free(bdp, blen); bdp = NULL; } } } dev->ud_bdesc = bdp; /* * Figure out if the device is self or bus powered. */ #if 0 /* XXX various devices don't report the power state correctly */ selfpowered = 0; err = usbd_get_device_status(dev, &ds); if (!err && (UGETW(ds.wStatus) & UDS_SELF_POWERED)) selfpowered = 1; #endif /* * Use the power state in the configuration we are going * to set. This doesn't necessarily reflect the actual * power state of the device; the driver can control this * by choosing the appropriate configuration. */ selfpowered = !!(cdp->bmAttributes & UC_SELF_POWERED); DPRINTF("addr %jd cno=%jd attr=0x%02jx, selfpowered=%jd", dev->ud_addr, cdp->bConfigurationValue, cdp->bmAttributes, selfpowered); DPRINTF("max power=%jd", cdp->bMaxPower * 2, 0, 0, 0); /* Check if we have enough power. */ #if 0 /* this is a no-op, see above */ if ((cdp->bmAttributes & UC_SELF_POWERED) && !selfpowered) { if (msg) printf("%s: device addr %d (config %d): " "can't set self powered configuration\n", device_xname(dev->ud_bus->bdev), dev->ud_addr, cdp->bConfigurationValue); err = USBD_NO_POWER; goto bad; } #endif #ifdef USB_DEBUG if (dev->ud_powersrc == NULL) { DPRINTF("No power source?", 0, 0, 0, 0); err = USBD_IOERROR; goto bad; } #endif power = cdp->bMaxPower * 2; if (power > dev->ud_powersrc->up_power) { DPRINTF("power exceeded %jd %jd", power, dev->ud_powersrc->up_power, 0, 0); /* XXX print nicer message. */ if (msg) printf("%s: device addr %d (config %d) exceeds power " "budget, %d mA > %d mA\n", device_xname(dev->ud_bus->ub_usbctl), dev->ud_addr, cdp->bConfigurationValue, power, dev->ud_powersrc->up_power); err = USBD_NO_POWER; goto bad; } dev->ud_power = power; dev->ud_selfpowered = selfpowered; /* Set the actual configuration value. 
*/ DPRINTF("set config %jd", cdp->bConfigurationValue, 0, 0, 0); err = usbd_set_config(dev, cdp->bConfigurationValue); if (err) { DPRINTF("setting config=%jd failed, error=%jd", cdp->bConfigurationValue, err, 0, 0); goto bad; } KASSERTMSG(dev->ud_ifaces == NULL, "ud_ifaces=%p", dev->ud_ifaces); /* Allocate and fill interface data. */ nifc = cdp->bNumInterface; if (nifc == 0) { DPRINTF("no interfaces", 0, 0, 0, 0); err = USBD_INVAL; goto bad; } dev->ud_ifaces = kmem_alloc(nifc * sizeof(struct usbd_interface), KM_SLEEP); DPRINTFN(5, "dev=%#jx cdesc=%#jx", (uintptr_t)dev, (uintptr_t)cdp, 0, 0); dev->ud_cdesc = cdp; dev->ud_config = cdp->bConfigurationValue; for (ifcidx = 0; ifcidx < nifc; ifcidx++) { usbd_iface_init(dev, ifcidx); usbd_iface_exlock(&dev->ud_ifaces[ifcidx]); err = usbd_fill_iface_data(dev, ifcidx, 0); usbd_iface_unlock(&dev->ud_ifaces[ifcidx]); if (err) { while (--ifcidx >= 0) { usbd_iface_exlock(&dev->ud_ifaces[ifcidx]); usbd_free_iface_data(dev, ifcidx); usbd_iface_unlock(&dev->ud_ifaces[ifcidx]); usbd_iface_fini(dev, ifcidx); } kmem_free(dev->ud_ifaces, nifc * sizeof(struct usbd_interface)); dev->ud_ifaces = NULL; goto bad; } } return USBD_NORMAL_COMPLETION; bad: /* XXX Use usbd_set_config() to reset the config? */ /* XXX Should we forbid USB_UNCONFIG_NO from bConfigurationValue? */ dev->ud_config = USB_UNCONFIG_NO; KASSERT(dev->ud_ifaces == NULL); kmem_free(cdp, len); dev->ud_cdesc = NULL; if (bdp != NULL) { kmem_free(bdp, UGETW(bdp->wTotalLength)); dev->ud_bdesc = NULL; } return err; } /* XXX add function for alternate settings */ usbd_status usbd_setup_pipe(struct usbd_device *dev, struct usbd_interface *iface, struct usbd_endpoint *ep, int ival, struct usbd_pipe **pipe) { return usbd_setup_pipe_flags(dev, iface, ep, ival, pipe, 0); } usbd_status usbd_setup_pipe_flags(struct usbd_device *dev, struct usbd_interface *iface, struct usbd_endpoint *ep, int ival, struct usbd_pipe **pipe, uint8_t flags) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev=%#jx addr=%jd iface=%#jx ep=%#jx", (uintptr_t)dev, dev->ud_addr, (uintptr_t)iface, (uintptr_t)ep); struct usbd_pipe *p = NULL; bool ep_acquired = false; usbd_status err; /* Block exclusive use of the endpoint by later pipes. 
*/ err = usbd_endpoint_acquire(dev, ep, flags & USBD_EXCLUSIVE_USE); if (err) goto out; ep_acquired = true; p = kmem_alloc(dev->ud_bus->ub_pipesize, KM_SLEEP); DPRINTFN(1, "pipe=%#jx", (uintptr_t)p, 0, 0, 0); p->up_dev = dev; p->up_iface = iface; p->up_endpoint = ep; p->up_intrxfer = NULL; p->up_running = 0; p->up_aborting = 0; p->up_serialise = true; p->up_repeat = 0; p->up_interval = ival; p->up_flags = flags; SIMPLEQ_INIT(&p->up_queue); p->up_callingxfer = NULL; cv_init(&p->up_callingcv, "usbpipecb"); p->up_abortlwp = NULL; err = dev->ud_bus->ub_methods->ubm_open(p); if (err) { DPRINTF("endpoint=%#jx failed, error=%jd", (uintptr_t)ep->ue_edesc->bEndpointAddress, err, 0, 0); goto out; } KASSERT(p->up_methods->upm_start || p->up_serialise == false); usb_init_task(&p->up_async_task, usbd_clear_endpoint_stall_task, p, USB_TASKQ_MPSAFE); DPRINTFN(1, "pipe=%#jx", (uintptr_t)p, 0, 0, 0); *pipe = p; p = NULL; /* handed off to caller */ ep_acquired = false; /* handed off to pipe */ err = USBD_NORMAL_COMPLETION; out: if (p) { KASSERT(p->up_abortlwp == NULL); KASSERT(p->up_callingxfer == NULL); cv_destroy(&p->up_callingcv); kmem_free(p, dev->ud_bus->ub_pipesize); } if (ep_acquired) usbd_endpoint_release(dev, ep); return err; } usbd_status usbd_endpoint_acquire(struct usbd_device *dev, struct usbd_endpoint *ep, int flags) { usbd_status err; mutex_enter(dev->ud_bus->ub_lock); if (ep->ue_refcnt == INT_MAX) { err = USBD_IN_USE; /* XXX rule out or switch to 64-bit */ } else if ((flags & USBD_EXCLUSIVE_USE) && ep->ue_refcnt) { err = USBD_IN_USE; } else { ep->ue_refcnt++; err = 0; } mutex_exit(dev->ud_bus->ub_lock); return err; } void usbd_endpoint_release(struct usbd_device *dev, struct usbd_endpoint *ep) { mutex_enter(dev->ud_bus->ub_lock); KASSERT(ep->ue_refcnt); ep->ue_refcnt--; mutex_exit(dev->ud_bus->ub_lock); } /* Abort and close the device control pipe. 
*/ void usbd_kill_pipe(struct usbd_pipe *pipe) { usbd_abort_pipe(pipe); usbd_close_pipe(pipe); } int usbd_getnewaddr(struct usbd_bus *bus) { int addr; for (addr = 1; addr < USB_MAX_DEVICES; addr++) { size_t dindex = usb_addr2dindex(addr); if (bus->ub_devices[dindex] == NULL) return addr; } return -1; } usbd_status usbd_attach_roothub(device_t parent, struct usbd_device *dev) { struct usb_attach_arg uaa; usb_device_descriptor_t *dd = &dev->ud_ddesc; device_t dv; uaa.uaa_device = dev; uaa.uaa_usegeneric = 0; uaa.uaa_port = 0; uaa.uaa_vendor = UGETW(dd->idVendor); uaa.uaa_product = UGETW(dd->idProduct); uaa.uaa_release = UGETW(dd->bcdDevice); uaa.uaa_class = dd->bDeviceClass; uaa.uaa_subclass = dd->bDeviceSubClass; uaa.uaa_proto = dd->bDeviceProtocol; KERNEL_LOCK(1, curlwp); dv = config_found(parent, &uaa, NULL, CFARGS(.iattr = "usbroothubif")); KERNEL_UNLOCK_ONE(curlwp); if (dv) { dev->ud_subdevs = kmem_alloc(sizeof(dv), KM_SLEEP); dev->ud_subdevs[0] = dv; dev->ud_subdevlen = 1; } return USBD_NORMAL_COMPLETION; } static void usbd_properties(device_t dv, struct usbd_device *dev) { usb_device_descriptor_t *dd = &dev->ud_ddesc; prop_dictionary_t dict = device_properties(dv); int class, subclass, release, proto, vendor, product; class = dd->bDeviceClass; subclass = dd->bDeviceSubClass; release = UGETW(dd->bcdDevice); proto = dd->bDeviceProtocol; vendor = UGETW(dd->idVendor); product = UGETW(dd->idProduct); prop_dictionary_set_uint8(dict, "address", dev->ud_addr); if (dev->ud_myhub) { struct usbd_device *hdev = dev->ud_myhub; struct usbd_hub *hub = hdev->ud_hub; int p; KASSERT(hub != NULL); prop_dictionary_set_uint8(dict, "hub-address", hdev->ud_addr); for (p=1; p <= hub->uh_hubdesc.bNbrPorts; ++p) { if (hub->uh_ports[p-1].up_dev == dev) { prop_dictionary_set_uint8(dict, "hub-port", p); break; } } } prop_dictionary_set_uint8(dict, "class", class); prop_dictionary_set_uint8(dict, "subclass", subclass); prop_dictionary_set_uint16(dict, "release", release); prop_dictionary_set_uint8(dict, "proto", proto); prop_dictionary_set_uint16(dict, "vendor-id", vendor); prop_dictionary_set_uint16(dict, "product-id", product); if (dev->ud_vendor) { prop_dictionary_set_string(dict, "vendor-string", dev->ud_vendor); } if (dev->ud_product) { prop_dictionary_set_string(dict, "product-string", dev->ud_product); } if (dev->ud_serial) { prop_dictionary_set_string(dict, "serialnumber", dev->ud_serial); } } static usbd_status usbd_attachwholedevice(device_t parent, struct usbd_device *dev, int port, int usegeneric) { struct usb_attach_arg uaa; usb_device_descriptor_t *dd = &dev->ud_ddesc; device_t dv; int dlocs[USBDEVIFCF_NLOCS]; KASSERT(usb_in_event_thread(parent)); uaa.uaa_device = dev; uaa.uaa_usegeneric = usegeneric; uaa.uaa_port = port; uaa.uaa_vendor = UGETW(dd->idVendor); uaa.uaa_product = UGETW(dd->idProduct); uaa.uaa_release = UGETW(dd->bcdDevice); uaa.uaa_class = dd->bDeviceClass; uaa.uaa_subclass = dd->bDeviceSubClass; uaa.uaa_proto = dd->bDeviceProtocol; dlocs[USBDEVIFCF_PORT] = uaa.uaa_port; dlocs[USBDEVIFCF_VENDOR] = uaa.uaa_vendor; dlocs[USBDEVIFCF_PRODUCT] = uaa.uaa_product; dlocs[USBDEVIFCF_RELEASE] = uaa.uaa_release; /* the rest is historical ballast */ dlocs[USBDEVIFCF_CONFIGURATION] = -1; dlocs[USBDEVIFCF_INTERFACE] = -1; config_pending_incr(parent); KERNEL_LOCK(1, curlwp); dv = config_found(parent, &uaa, usbd_print, CFARGS(.submatch = config_stdsubmatch, .iattr = "usbdevif", .locators = dlocs)); KERNEL_UNLOCK_ONE(curlwp); if (dv) { dev->ud_subdevs = kmem_alloc(sizeof(dv), KM_SLEEP); 
dev->ud_subdevs[0] = dv; dev->ud_subdevlen = 1; dev->ud_nifaces_claimed = 1; /* XXX */ usbd_properties(dv, dev); } config_pending_decr(parent); return USBD_NORMAL_COMPLETION; } static usbd_status usbd_attachinterfaces(device_t parent, struct usbd_device *dev, int port, const int *locators) { USBHIST_FUNC(); USBHIST_CALLED(usbdebug); struct usbif_attach_arg uiaa; int ilocs[USBIFIFCF_NLOCS]; usb_device_descriptor_t *dd = &dev->ud_ddesc; int nifaces; struct usbd_interface **ifaces; int i, j, loc; device_t dv; KASSERT(usb_in_event_thread(parent)); nifaces = dev->ud_cdesc->bNumInterface; ifaces = kmem_zalloc(nifaces * sizeof(*ifaces), KM_SLEEP); for (i = 0; i < nifaces; i++) { if (!dev->ud_subdevs[i]) { ifaces[i] = &dev->ud_ifaces[i]; } DPRINTF("interface %jd %#jx", i, (uintptr_t)ifaces[i], 0, 0); } uiaa.uiaa_device = dev; uiaa.uiaa_port = port; uiaa.uiaa_vendor = UGETW(dd->idVendor); uiaa.uiaa_product = UGETW(dd->idProduct); uiaa.uiaa_release = UGETW(dd->bcdDevice); uiaa.uiaa_configno = dev->ud_cdesc->bConfigurationValue; uiaa.uiaa_ifaces = ifaces; uiaa.uiaa_nifaces = nifaces; ilocs[USBIFIFCF_PORT] = uiaa.uiaa_port; ilocs[USBIFIFCF_VENDOR] = uiaa.uiaa_vendor; ilocs[USBIFIFCF_PRODUCT] = uiaa.uiaa_product; ilocs[USBIFIFCF_RELEASE] = uiaa.uiaa_release; ilocs[USBIFIFCF_CONFIGURATION] = uiaa.uiaa_configno; for (i = 0; i < nifaces; i++) { if (!ifaces[i]) { DPRINTF("interface %jd claimed", i, 0, 0, 0); continue; /* interface already claimed */ } uiaa.uiaa_iface = ifaces[i]; uiaa.uiaa_class = ifaces[i]->ui_idesc->bInterfaceClass; uiaa.uiaa_subclass = ifaces[i]->ui_idesc->bInterfaceSubClass; uiaa.uiaa_proto = ifaces[i]->ui_idesc->bInterfaceProtocol; uiaa.uiaa_ifaceno = ifaces[i]->ui_idesc->bInterfaceNumber; DPRINTF("searching for interface %jd...", i, 0, 0, 0); DPRINTF("class %jx subclass %jx proto %jx ifaceno %jd", uiaa.uiaa_class, uiaa.uiaa_subclass, uiaa.uiaa_proto, uiaa.uiaa_ifaceno); ilocs[USBIFIFCF_INTERFACE] = uiaa.uiaa_ifaceno; if (locators != NULL) { loc = locators[USBIFIFCF_CONFIGURATION]; if (loc != USBIFIFCF_CONFIGURATION_DEFAULT && loc != uiaa.uiaa_configno) continue; loc = locators[USBIFIFCF_INTERFACE]; if (loc != USBIFIFCF_INTERFACE_DEFAULT && loc != uiaa.uiaa_ifaceno) continue; } KERNEL_LOCK(1, curlwp); dv = config_found(parent, &uiaa, usbd_ifprint, CFARGS(.submatch = config_stdsubmatch, .iattr = "usbifif", .locators = ilocs)); KERNEL_UNLOCK_ONE(curlwp); if (!dv) continue; usbd_properties(dv, dev); /* claim */ ifaces[i] = NULL; /* account for ifaces claimed by the driver behind our back */ for (j = 0; j < nifaces; j++) { if (!ifaces[j] && !dev->ud_subdevs[j]) { DPRINTF("interface %jd claimed behind our back", j, 0, 0, 0); dev->ud_subdevs[j] = dv; dev->ud_nifaces_claimed++; } } } kmem_free(ifaces, nifaces * sizeof(*ifaces)); return USBD_NORMAL_COMPLETION; } usbd_status usbd_probe_and_attach(device_t parent, struct usbd_device *dev, int port, int addr) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "trying device specific drivers", 0, 0, 0, 0); usb_device_descriptor_t *dd = &dev->ud_ddesc; int confi, nifaces; usbd_status err; KASSERT(usb_in_event_thread(parent)); /* First try with device specific drivers. 
*/ err = usbd_attachwholedevice(parent, dev, port, 0); if (dev->ud_nifaces_claimed || err) return err; DPRINTF("no device specific driver found", 0, 0, 0, 0); DPRINTF("looping over %jd configurations", dd->bNumConfigurations, 0, 0, 0); for (confi = 0; confi < dd->bNumConfigurations; confi++) { DPRINTFN(1, "trying config idx=%jd", confi, 0, 0, 0); err = usbd_set_config_index(dev, confi, 1); if (err) { DPRINTF("port %jd, set config at addr %jd failed, " "error=%jd", port, addr, err, 0); printf("%s: port %d, set config at addr %d failed\n", device_xname(parent), port, addr); return err; } nifaces = dev->ud_cdesc->bNumInterface; dev->ud_subdevs = kmem_zalloc(nifaces * sizeof(device_t), KM_SLEEP); dev->ud_subdevlen = nifaces; err = usbd_attachinterfaces(parent, dev, port, NULL); if (dev->ud_subdevs && dev->ud_nifaces_claimed == 0) { kmem_free(dev->ud_subdevs, dev->ud_subdevlen * sizeof(device_t)); dev->ud_subdevs = 0; dev->ud_subdevlen = 0; } if (dev->ud_nifaces_claimed || err) return err; } /* No interfaces were attached in any of the configurations. */ if (dd->bNumConfigurations > 1) /* don't change if only 1 config */ usbd_set_config_index(dev, 0, 0); DPRINTF("no interface drivers found", 0, 0, 0, 0); /* Finally try the generic driver. */ err = usbd_attachwholedevice(parent, dev, port, 1); /* * The generic attach failed, but leave the device as it is. * We just did not find any drivers, that's all. The device is * fully operational and not harming anyone. */ DPRINTF("generic attach failed", 0, 0, 0, 0); return USBD_NORMAL_COMPLETION; } /** * Called from uhub_rescan(). usbd_new_device() for the target dev must be * called before calling this. */ usbd_status usbd_reattach_device(device_t parent, struct usbd_device *dev, int port, const int *locators) { int i, loc; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "uhub%jd port=%jd", device_unit(parent), port, 0, 0); KASSERT(usb_in_event_thread(parent)); if (locators != NULL) { loc = locators[USBIFIFCF_PORT]; if (loc != USBIFIFCF_PORT_DEFAULT && loc != port) return USBD_NORMAL_COMPLETION; loc = locators[USBIFIFCF_VENDOR]; if (loc != USBIFIFCF_VENDOR_DEFAULT && loc != UGETW(dev->ud_ddesc.idVendor)) return USBD_NORMAL_COMPLETION; loc = locators[USBIFIFCF_PRODUCT]; if (loc != USBIFIFCF_PRODUCT_DEFAULT && loc != UGETW(dev->ud_ddesc.idProduct)) return USBD_NORMAL_COMPLETION; loc = locators[USBIFIFCF_RELEASE]; if (loc != USBIFIFCF_RELEASE_DEFAULT && loc != UGETW(dev->ud_ddesc.bcdDevice)) return USBD_NORMAL_COMPLETION; } if (dev->ud_subdevlen == 0) { /* XXX: check USBIFIFCF_CONFIGURATION and * USBIFIFCF_INTERFACE too */ return usbd_probe_and_attach(parent, dev, port, dev->ud_addr); } else if (dev->ud_subdevlen != dev->ud_cdesc->bNumInterface) { /* device-specific or generic driver is already attached. */ return USBD_NORMAL_COMPLETION; } /* Does the device have unconfigured interfaces? */ for (i = 0; i < dev->ud_subdevlen; i++) { if (dev->ud_subdevs[i] == NULL) { break; } } if (i >= dev->ud_subdevlen) return USBD_NORMAL_COMPLETION; return usbd_attachinterfaces(parent, dev, port, locators); } /* * Called when a new device has been put in the powered state, * but not yet in the addressed state. * Get initial descriptor, set the address, get full descriptor, * and attach a driver. 
*/ usbd_status usbd_new_device(device_t parent, struct usbd_bus *bus, int depth, int speed, int port, struct usbd_port *up) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "bus=%#jx port=%jd depth=%jd speed=%jd", (uintptr_t)bus, port, depth, speed); struct usbd_device *dev, *adev; struct usbd_device *hub; usb_device_descriptor_t *dd; usb_port_status_t ps; usbd_status err; int addr; int i; int p; KASSERT(usb_in_event_thread(parent)); if (bus->ub_methods->ubm_newdev != NULL) return (bus->ub_methods->ubm_newdev)(parent, bus, depth, speed, port, up); addr = usbd_getnewaddr(bus); if (addr < 0) { printf("%s: No free USB addresses, new device ignored.\n", device_xname(bus->ub_usbctl)); return USBD_NO_ADDR; } dev = kmem_zalloc(sizeof(*dev), KM_SLEEP); dev->ud_bus = bus; /* Set up default endpoint handle. */ dev->ud_ep0.ue_edesc = &dev->ud_ep0desc; /* Set up default endpoint descriptor. */ dev->ud_ep0desc.bLength = USB_ENDPOINT_DESCRIPTOR_SIZE; dev->ud_ep0desc.bDescriptorType = UDESC_ENDPOINT; dev->ud_ep0desc.bEndpointAddress = USB_CONTROL_ENDPOINT; dev->ud_ep0desc.bmAttributes = UE_CONTROL; /* * temporary, will be fixed after first descriptor fetch * (which uses 64 bytes so it shouldn't be less), * highspeed devices must support 64 byte packets anyway */ if (speed == USB_SPEED_HIGH || speed == USB_SPEED_FULL) USETW(dev->ud_ep0desc.wMaxPacketSize, 64); else USETW(dev->ud_ep0desc.wMaxPacketSize, USB_MAX_IPACKET); dev->ud_ep0desc.bInterval = 0; /* doesn't matter, just don't leave it uninitialized */ dev->ud_ep0.ue_toggle = 0; dev->ud_quirks = &usbd_no_quirk; dev->ud_addr = USB_START_ADDR; dev->ud_ddesc.bMaxPacketSize = 0; dev->ud_depth = depth; dev->ud_powersrc = up; dev->ud_myhub = up->up_parent; up->up_dev = dev; /* Locate port on upstream high speed hub */ for (adev = dev, hub = up->up_parent; hub != NULL && hub->ud_speed != USB_SPEED_HIGH; adev = hub, hub = hub->ud_myhub) ; if (hub) { for (p = 1; p <= hub->ud_hub->uh_hubdesc.bNbrPorts; p++) { if (hub->ud_hub->uh_ports[p - 1].up_dev == adev) { dev->ud_myhsport = &hub->ud_hub->uh_ports[p - 1]; goto found; } } panic("usbd_new_device: cannot find HS port"); found: DPRINTFN(1, "high speed port %jd", p, 0, 0, 0); } else { dev->ud_myhsport = NULL; } dev->ud_speed = speed; dev->ud_langid = USBD_NOLANG; dev->ud_cookie.cookie = ++usb_cookie_no; /* Establish the default pipe. */ err = usbd_setup_pipe_flags(dev, 0, &dev->ud_ep0, USBD_DEFAULT_INTERVAL, &dev->ud_pipe0, USBD_MPSAFE); if (err) { usbd_remove_device(dev, up); return err; } dd = &dev->ud_ddesc; /* Try a few times in case the device is slow (i.e. outside specs.) */ for (i = 0; i < 10; i++) { /* Get the first 8 bytes of the device descriptor. */ err = usbd_get_initial_ddesc(dev, dd); if (!err) break; /* * The root hub can never fail to give the initial descriptor, * but assert it just in case. */ KASSERT(up->up_parent); usbd_delay_ms(dev, 200); if ((i & 3) == 3) usbd_reset_port(up->up_parent, port, &ps); } if (err) { DPRINTF("addr=%jd, getting first desc failed: %jd", addr, err, 0, 0); usbd_remove_device(dev, up); return err; } /* Windows resets the port here, do likewise */ if (up->up_parent) usbd_reset_port(up->up_parent, port, &ps); if (speed == USB_SPEED_HIGH) { /* Max packet size must be 64 (sec 5.5.3). */ if (dd->bMaxPacketSize != USB_2_MAX_CTRL_PACKET) { #ifdef DIAGNOSTIC printf("usbd_new_device: addr=%d bad max packet " "size=%d. 
adjusting to %d.\n", addr, dd->bMaxPacketSize, USB_2_MAX_CTRL_PACKET); #endif dd->bMaxPacketSize = USB_2_MAX_CTRL_PACKET; } } DPRINTF("adding unit addr=%jd, rev=%02jx, class=%jd, subclass=%jd", addr, UGETW(dd->bcdUSB), dd->bDeviceClass, dd->bDeviceSubClass); DPRINTF("protocol=%jd, maxpacket=%jd, len=%jd, speed=%jd", dd->bDeviceProtocol, dd->bMaxPacketSize, dd->bLength, dev->ud_speed); if (dd->bDescriptorType != UDESC_DEVICE) { /* Illegal device descriptor */ DPRINTF("illegal descriptor %jd", dd->bDescriptorType, 0, 0, 0); usbd_remove_device(dev, up); return USBD_INVAL; } if (dd->bLength < USB_DEVICE_DESCRIPTOR_SIZE) { DPRINTF("bad length %jd", dd->bLength, 0, 0, 0); usbd_remove_device(dev, up); return USBD_INVAL; } USETW(dev->ud_ep0desc.wMaxPacketSize, dd->bMaxPacketSize); /* Re-establish the default pipe with the new MPS. */ usbd_kill_pipe(dev->ud_pipe0); dev->ud_pipe0 = NULL; err = usbd_setup_pipe_flags(dev, 0, &dev->ud_ep0, USBD_DEFAULT_INTERVAL, &dev->ud_pipe0, USBD_MPSAFE); if (err) { DPRINTF("setup default pipe failed err %jd", err, 0, 0, 0); usbd_remove_device(dev, up); return err; } /* Set the address */ DPRINTFN(5, "setting device address=%jd", addr, 0, 0, 0); err = usbd_set_address(dev, addr); if (err) { DPRINTF("set address %jd failed, err = %jd", addr, err, 0, 0); err = USBD_SET_ADDR_FAILED; usbd_remove_device(dev, up); return err; } /* Allow device time to set new address */ usbd_delay_ms(dev, USB_SET_ADDRESS_SETTLE); dev->ud_addr = addr; /* new device address now */ bus->ub_devices[usb_addr2dindex(addr)] = dev; /* Re-establish the default pipe with the new address. */ usbd_kill_pipe(dev->ud_pipe0); dev->ud_pipe0 = NULL; err = usbd_setup_pipe_flags(dev, 0, &dev->ud_ep0, USBD_DEFAULT_INTERVAL, &dev->ud_pipe0, USBD_MPSAFE); if (err) { DPRINTF("setup default pipe failed, err = %jd", err, 0, 0, 0); usbd_remove_device(dev, up); return err; } err = usbd_reload_device_desc(dev); if (err) { DPRINTF("addr=%jd, getting full desc failed, err = %jd", addr, err, 0, 0); usbd_remove_device(dev, up); return err; } /* Assume 100mA bus powered for now. Changed when configured. */ dev->ud_power = USB_MIN_POWER; dev->ud_selfpowered = 0; DPRINTF("new dev (addr %jd), dev=%#jx, parent=%#jx", addr, (uintptr_t)dev, (uintptr_t)parent, 0); usbd_get_device_strings(dev); usbd_add_dev_event(USB_EVENT_DEVICE_ATTACH, dev); if (port == 0) { /* root hub */ KASSERT(addr == 1); usbd_attach_roothub(parent, dev); return USBD_NORMAL_COMPLETION; } err = usbd_probe_and_attach(parent, dev, port, addr); if (err) { usbd_remove_device(dev, up); return err; } return USBD_NORMAL_COMPLETION; } usbd_status usbd_reload_device_desc(struct usbd_device *dev) { USBHIST_FUNC(); USBHIST_CALLED(usbdebug); usb_device_descriptor_t *udd = &dev->ud_ddesc; usbd_status err; /* Get the full device descriptor. 
*/ err = usbd_get_device_desc(dev, udd); if (err) return err; if (udd->bDescriptorType != UDESC_DEVICE) return USBD_INVAL; if (udd->bLength < USB_DEVICE_DESCRIPTOR_SIZE) return USBD_INVAL; DPRINTFN(15, "bLength %5ju", udd->bLength, 0, 0, 0); DPRINTFN(15, "bDescriptorType %5ju", udd->bDescriptorType, 0, 0, 0); DPRINTFN(15, "bcdUSB %2jx.%02jx", udd->bcdUSB[1], udd->bcdUSB[0], 0, 0); DPRINTFN(15, "bDeviceClass %5ju", udd->bDeviceClass, 0, 0, 0); DPRINTFN(15, "bDeviceSubClass %5ju", udd->bDeviceSubClass, 0, 0, 0); DPRINTFN(15, "bDeviceProtocol %5ju", udd->bDeviceProtocol, 0, 0, 0); DPRINTFN(15, "bMaxPacketSize0 %5ju", udd->bMaxPacketSize, 0, 0, 0); DPRINTFN(15, "idVendor 0x%02jx 0x%02jx", udd->idVendor[0], udd->idVendor[1], 0, 0); DPRINTFN(15, "idProduct 0x%02jx 0x%02jx", udd->idProduct[0], udd->idProduct[1], 0, 0); DPRINTFN(15, "bcdDevice %2jx.%02jx", udd->bcdDevice[1], udd->bcdDevice[0], 0, 0); DPRINTFN(15, "iManufacturer %5ju", udd->iManufacturer, 0, 0, 0); DPRINTFN(15, "iProduct %5ju", udd->iProduct, 0, 0, 0); DPRINTFN(15, "iSerial %5ju", udd->iSerialNumber, 0, 0, 0); DPRINTFN(15, "bNumConfigurations %5ju", udd->bNumConfigurations, 0, 0, 0); /* Figure out what's wrong with this device. */ dev->ud_quirks = usbd_find_quirk(udd); return USBD_NORMAL_COMPLETION; } void usbd_remove_device(struct usbd_device *dev, struct usbd_port *up) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx up %#jx", (uintptr_t)dev, (uintptr_t)up, 0, 0); if (dev->ud_pipe0 != NULL) usbd_kill_pipe(dev->ud_pipe0); up->up_dev = NULL; dev->ud_bus->ub_devices[usb_addr2dindex(dev->ud_addr)] = NULL; if (dev->ud_vendor != NULL) { kmem_free(dev->ud_vendor, USB_MAX_ENCODED_STRING_LEN); } if (dev->ud_product != NULL) { kmem_free(dev->ud_product, USB_MAX_ENCODED_STRING_LEN); } if (dev->ud_serial != NULL) { kmem_free(dev->ud_serial, USB_MAX_ENCODED_STRING_LEN); } kmem_free(dev, sizeof(*dev)); } int usbd_print(void *aux, const char *pnp) { struct usb_attach_arg *uaa = aux; if (pnp) { #define USB_DEVINFO 1024 char *devinfo; if (!uaa->uaa_usegeneric) return QUIET; devinfo = kmem_alloc(USB_DEVINFO, KM_SLEEP); usbd_devinfo(uaa->uaa_device, 1, devinfo, USB_DEVINFO); aprint_normal("%s, %s", devinfo, pnp); kmem_free(devinfo, USB_DEVINFO); } aprint_normal(" port %d", uaa->uaa_port); #if 0 /* * It gets very crowded with these locators on the attach line. * They are not really needed since they are printed in the clear * by each driver. */ if (uaa->uaa_vendor != UHUB_UNK_VENDOR) aprint_normal(" vendor 0x%04x", uaa->uaa_vendor); if (uaa->uaa_product != UHUB_UNK_PRODUCT) aprint_normal(" product 0x%04x", uaa->uaa_product); if (uaa->uaa_release != UHUB_UNK_RELEASE) aprint_normal(" release 0x%04x", uaa->uaa_release); #endif return UNCONF; } int usbd_ifprint(void *aux, const char *pnp) { struct usbif_attach_arg *uiaa = aux; if (pnp) return QUIET; aprint_normal(" port %d", uiaa->uiaa_port); aprint_normal(" configuration %d", uiaa->uiaa_configno); aprint_normal(" interface %d", uiaa->uiaa_ifaceno); #if 0 /* * It gets very crowded with these locators on the attach line. * They are not really needed since they are printed in the clear * by each driver. 
*/ if (uaa->uaa_vendor != UHUB_UNK_VENDOR) aprint_normal(" vendor 0x%04x", uaa->uaa_vendor); if (uaa->uaa_product != UHUB_UNK_PRODUCT) aprint_normal(" product 0x%04x", uaa->uaa_product); if (uaa->uaa_release != UHUB_UNK_RELEASE) aprint_normal(" release 0x%04x", uaa->uaa_release); #endif return UNCONF; } void usbd_fill_deviceinfo(struct usbd_device *dev, struct usb_device_info *di, int usedev) { struct usbd_port *p; int i, j, err; di->udi_bus = device_unit(dev->ud_bus->ub_usbctl); di->udi_addr = dev->ud_addr; di->udi_cookie = dev->ud_cookie; usbd_devinfo_vp(dev, di->udi_vendor, sizeof(di->udi_vendor), di->udi_product, sizeof(di->udi_product), usedev, 1); usbd_printBCD(di->udi_release, sizeof(di->udi_release), UGETW(dev->ud_ddesc.bcdDevice)); if (usedev) { usbd_status uerr = usbd_get_string(dev, dev->ud_ddesc.iSerialNumber, di->udi_serial); if (uerr != USBD_NORMAL_COMPLETION) { di->udi_serial[0] = '\0'; } else { usbd_trim_spaces(di->udi_serial); } } else { di->udi_serial[0] = '\0'; if (dev->ud_serial) { strlcpy(di->udi_serial, dev->ud_serial, sizeof(di->udi_serial)); } } di->udi_vendorNo = UGETW(dev->ud_ddesc.idVendor); di->udi_productNo = UGETW(dev->ud_ddesc.idProduct); di->udi_releaseNo = UGETW(dev->ud_ddesc.bcdDevice); di->udi_class = dev->ud_ddesc.bDeviceClass; di->udi_subclass = dev->ud_ddesc.bDeviceSubClass; di->udi_protocol = dev->ud_ddesc.bDeviceProtocol; di->udi_config = dev->ud_config; di->udi_power = dev->ud_selfpowered ? 0 : dev->ud_power; di->udi_speed = dev->ud_speed; if (dev->ud_subdevlen > 0) { for (i = 0, j = 0; i < dev->ud_subdevlen && j < USB_MAX_DEVNAMES; i++) { if (!dev->ud_subdevs[i]) continue; strncpy(di->udi_devnames[j], device_xname(dev->ud_subdevs[i]), USB_MAX_DEVNAMELEN); di->udi_devnames[j][USB_MAX_DEVNAMELEN-1] = '\0'; j++; } } else { j = 0; } for (/* j is set */; j < USB_MAX_DEVNAMES; j++) di->udi_devnames[j][0] = 0; /* empty */ if (!dev->ud_hub) { di->udi_nports = 0; return; } const int nports = dev->ud_hub->uh_hubdesc.bNbrPorts; for (i = 1; i <= __arraycount(di->udi_ports) && i <= nports; i++) { p = &dev->ud_hub->uh_ports[i - 1]; if (p->up_dev) err = p->up_dev->ud_addr; else { const int s = UGETW(p->up_status.wPortStatus); const bool sshub_p = USB_IS_SS(dev->ud_speed); if (s & UPS_PORT_ENABLED) err = USB_PORT_ENABLED; else if (s & UPS_SUSPEND) err = USB_PORT_SUSPENDED; /* * Note: UPS_PORT_POWER_SS is available only * on 3.x, and UPS_PORT_POWER is available * only on 2.0 or 1.1. 
*/ else if (sshub_p && (s & UPS_PORT_POWER_SS)) err = USB_PORT_POWERED; else if (!sshub_p && (s & UPS_PORT_POWER)) err = USB_PORT_POWERED; else err = USB_PORT_DISABLED; } di->udi_ports[i - 1] = err; } di->udi_nports = nports; } void usb_free_device(struct usbd_device *dev) { int ifcidx, nifc; if (dev->ud_pipe0 != NULL) usbd_kill_pipe(dev->ud_pipe0); if (dev->ud_ifaces != NULL) { nifc = dev->ud_cdesc->bNumInterface; for (ifcidx = 0; ifcidx < nifc; ifcidx++) { usbd_iface_exlock(&dev->ud_ifaces[ifcidx]); usbd_free_iface_data(dev, ifcidx); usbd_iface_unlock(&dev->ud_ifaces[ifcidx]); usbd_iface_fini(dev, ifcidx); } kmem_free(dev->ud_ifaces, nifc * sizeof(struct usbd_interface)); } if (dev->ud_cdesc != NULL) kmem_free(dev->ud_cdesc, UGETW(dev->ud_cdesc->wTotalLength)); if (dev->ud_bdesc != NULL) kmem_free(dev->ud_bdesc, UGETW(dev->ud_bdesc->wTotalLength)); if (dev->ud_subdevlen > 0) { kmem_free(dev->ud_subdevs, dev->ud_subdevlen * sizeof(device_t)); dev->ud_subdevlen = 0; } if (dev->ud_vendor) { kmem_free(dev->ud_vendor, USB_MAX_ENCODED_STRING_LEN); } if (dev->ud_product) { kmem_free(dev->ud_product, USB_MAX_ENCODED_STRING_LEN); } if (dev->ud_serial) { kmem_free(dev->ud_serial, USB_MAX_ENCODED_STRING_LEN); } kmem_free(dev, sizeof(*dev)); } /* * The general mechanism for detaching drivers works as follows: Each * driver is responsible for maintaining a reference count on the * number of outstanding references to its softc (e.g. from * processing hanging in a read or write). The detach method of the * driver decrements this counter and flags in the softc that the * driver is dying and then wakes any sleepers. It then sleeps on the * softc. Each place that can sleep must maintain the reference * count. When the reference count drops to -1 (0 is the normal value * of the reference count) then a wakeup on the softc is performed * signaling to the detach waiter that all references are gone. */ /* * Called from process context when we discover that a port has * been disconnected. */ int usb_disconnect_port(struct usbd_port *up, device_t parent, int flags) { struct usbd_device *dev = up->up_dev; device_t subdev; char subdevname[16]; const char *hubname = device_xname(parent); int i, rc; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "up=%#jx dev=%#jx port=%jd", (uintptr_t)up, (uintptr_t)dev, up->up_portno, 0); if (dev == NULL) { return 0; } usbd_suspend_pipe(dev->ud_pipe0); if (dev->ud_subdevlen > 0) { DPRINTFN(3, "disconnect subdevs", 0, 0, 0, 0); for (i = 0; i < dev->ud_subdevlen; i++) { if ((subdev = dev->ud_subdevs[i]) == NULL) continue; strlcpy(subdevname, device_xname(subdev), sizeof(subdevname)); KERNEL_LOCK(1, curlwp); rc = config_detach(subdev, flags); KERNEL_UNLOCK_ONE(curlwp); if (rc != 0) return rc; printf("%s: at %s", subdevname, hubname); if (up->up_portno != 0) printf(" port %d", up->up_portno); printf(" (addr %d) disconnected\n", dev->ud_addr); } KASSERT(!dev->ud_nifaces_claimed); } mutex_enter(dev->ud_bus->ub_lock); dev->ud_bus->ub_devices[usb_addr2dindex(dev->ud_addr)] = NULL; up->up_dev = NULL; mutex_exit(dev->ud_bus->ub_lock); usbd_add_dev_event(USB_EVENT_DEVICE_DETACH, dev); usb_free_device(dev); return 0; }
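/*
 * Editor's note: the block comment above usb_disconnect_port() describes
 * the reference-count protocol USB drivers use so that detach can wait for
 * outstanding softc references.  The following is a minimal, hypothetical
 * sketch of that protocol; it is NOT part of usb_subr.c, and the xxx_*
 * names and softc fields are illustrative assumptions only.  It uses the
 * standard primitives from <sys/mutex.h>, <sys/condvar.h>, <sys/device.h>
 * and <sys/errno.h>, and is wrapped in #if 0 so it is never compiled.
 */
#if 0
struct xxx_softc {
	kmutex_t	sc_lock;
	kcondvar_t	sc_cv;
	int		sc_refcnt;	/* 0 when idle; -1 once all refs are gone */
	bool		sc_dying;	/* set by detach to refuse new work */
};

/*
 * Each code path that can sleep takes a reference across the work and
 * wakes the detach waiter when the count drops below zero.
 */
static int
xxx_do_io(struct xxx_softc *sc)
{

	mutex_enter(&sc->sc_lock);
	if (sc->sc_dying) {
		mutex_exit(&sc->sc_lock);
		return EIO;
	}
	sc->sc_refcnt++;
	/*
	 * ... do the work; any sleep here must use cv_wait() so that
	 * sc_lock is released while sleeping ...
	 */
	if (--sc->sc_refcnt < 0)
		cv_broadcast(&sc->sc_cv);	/* last reference: wake detach */
	mutex_exit(&sc->sc_lock);
	return 0;
}

/*
 * Detach marks the softc dying, drops the implicit "attached" reference
 * (0 -> -1 when nothing else holds one), and waits until the count
 * reaches -1.  A real driver would also wake any threads blocked in
 * xxx_do_io() before waiting.
 */
static int
xxx_detach(device_t self, int flags)
{
	struct xxx_softc *sc = device_private(self);

	mutex_enter(&sc->sc_lock);
	sc->sc_dying = true;
	sc->sc_refcnt--;
	while (sc->sc_refcnt >= 0)
		cv_wait(&sc->sc_cv, &sc->sc_lock);
	mutex_exit(&sc->sc_lock);

	/* No references remain; safe to tear down. */
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_lock);
	return 0;
}
#endif	/* example only */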
/*	$NetBSD: virtio_pci.c,v 1.44 2023/11/19 19:49:44 thorpej Exp $	*/

/*
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * Copyright (c) 2012 Stefan Fritsch.
 * Copyright (c) 2010 Minoura Makoto.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: virtio_pci.c,v 1.44 2023/11/19 19:49:44 thorpej Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kmem.h> #include <sys/module.h> #include <sys/endian.h> #include <sys/interrupt.h> #include <sys/syslog.h> #include <sys/device.h> #include <dev/pci/pcidevs.h> #include <dev/pci/pcireg.h> #include <dev/pci/pcivar.h> #include <dev/pci/virtioreg.h> /* XXX: move to non-pci */ #include <dev/pci/virtio_pcireg.h> #define VIRTIO_PRIVATE #include <dev/pci/virtiovar.h> /* XXX: move to non-pci */ #if defined(__alpha__) || defined(__sparc64__) /* * XXX VIRTIO_F_ACCESS_PLATFORM is required for standard PCI DMA * XXX to work on these platforms, at least by Qemu. * XXX * XXX Generalize this later. */ #define __NEED_VIRTIO_F_ACCESS_PLATFORM #endif /* __alpha__ || __sparc64__ */ #define VIRTIO_PCI_LOG(_sc, _use_log, _fmt, _args...) \ do { \ if ((_use_log)) { \ log(LOG_DEBUG, "%s: " _fmt, \ device_xname((_sc)->sc_dev), \ ##_args); \ } else { \ aprint_error_dev((_sc)->sc_dev, \ _fmt, ##_args); \ } \ } while(0) static int virtio_pci_match(device_t, cfdata_t, void *); static void virtio_pci_attach(device_t, device_t, void *); static int virtio_pci_rescan(device_t, const char *, const int *); static int virtio_pci_detach(device_t, int); #define NMAPREG ((PCI_MAPREG_END - PCI_MAPREG_START) / \ sizeof(pcireg_t)) struct virtio_pci_softc { struct virtio_softc sc_sc; bool sc_intr_pervq; /* IO space */ bus_space_tag_t sc_iot; bus_space_handle_t sc_ioh; bus_size_t sc_iosize; bus_size_t sc_mapped_iosize; /* BARs */ bus_space_tag_t sc_bars_iot[NMAPREG]; bus_space_handle_t sc_bars_ioh[NMAPREG]; bus_size_t sc_bars_iosize[NMAPREG]; /* notify space */ bus_space_tag_t sc_notify_iot; bus_space_handle_t sc_notify_ioh; bus_size_t sc_notify_iosize; uint32_t sc_notify_off_multiplier; /* isr space */ bus_space_tag_t sc_isr_iot; bus_space_handle_t sc_isr_ioh; bus_size_t sc_isr_iosize; /* generic */ struct pci_attach_args sc_pa; pci_intr_handle_t *sc_ihp; void **sc_ihs; int sc_ihs_num; int sc_devcfg_offset; /* for 0.9 */ }; static int virtio_pci_attach_09(device_t, void *); static void virtio_pci_kick_09(struct virtio_softc *, uint16_t); static uint16_t virtio_pci_read_queue_size_09(struct virtio_softc *, uint16_t); static void virtio_pci_setup_queue_09(struct virtio_softc *, uint16_t, uint64_t); static void virtio_pci_set_status_09(struct virtio_softc *, int); static void virtio_pci_negotiate_features_09(struct virtio_softc *, uint64_t); static int virtio_pci_attach_10(device_t, void *); static void virtio_pci_kick_10(struct virtio_softc *, uint16_t); static uint16_t virtio_pci_read_queue_size_10(struct virtio_softc *, uint16_t); static void virtio_pci_setup_queue_10(struct virtio_softc *, uint16_t, uint64_t); static void virtio_pci_set_status_10(struct virtio_softc *, int); static void virtio_pci_negotiate_features_10(struct virtio_softc *, uint64_t); static int virtio_pci_find_cap(struct virtio_pci_softc *psc, int cfg_type, void *buf, int buflen); static int virtio_pci_alloc_interrupts(struct virtio_softc *); static void virtio_pci_free_interrupts(struct virtio_softc *); static int virtio_pci_adjust_config_region(struct virtio_pci_softc *psc); static int virtio_pci_intr(void *arg); static int virtio_pci_msix_queue_intr(void *); static int virtio_pci_msix_config_intr(void *); static int virtio_pci_setup_interrupts_09(struct virtio_softc *, int); static int virtio_pci_setup_interrupts_10(struct virtio_softc *, int); static int 
virtio_pci_establish_msix_interrupts(struct virtio_softc *, struct pci_attach_args *); static int virtio_pci_establish_intx_interrupt(struct virtio_softc *, struct pci_attach_args *); static bool virtio_pci_msix_enabled(struct virtio_pci_softc *); #define VIRTIO_MSIX_CONFIG_VECTOR_INDEX 0 #define VIRTIO_MSIX_QUEUE_VECTOR_INDEX 1 /* * For big-endian aarch64/armv7 on QEMU (and most real HW), only CPU cores * are running in big-endian mode, with all peripheral being configured to * little-endian mode. Their default bus_space(9) functions forcibly swap * byte-order. This guarantees that PIO'ed data from pci(4), e.g., are * correctly handled by bus_space(9), while DMA'ed ones should be swapped * by hand, in violation of virtio(4) specifications. */ #if (defined(__aarch64__) || defined(__arm__)) && BYTE_ORDER == BIG_ENDIAN # define READ_ENDIAN_09 BIG_ENDIAN # define READ_ENDIAN_10 BIG_ENDIAN # define STRUCT_ENDIAN_09 BIG_ENDIAN # define STRUCT_ENDIAN_10 LITTLE_ENDIAN #elif BYTE_ORDER == BIG_ENDIAN # define READ_ENDIAN_09 LITTLE_ENDIAN # define READ_ENDIAN_10 BIG_ENDIAN # define STRUCT_ENDIAN_09 BIG_ENDIAN # define STRUCT_ENDIAN_10 LITTLE_ENDIAN #else /* little endian */ # define READ_ENDIAN_09 LITTLE_ENDIAN # define READ_ENDIAN_10 LITTLE_ENDIAN # define STRUCT_ENDIAN_09 LITTLE_ENDIAN # define STRUCT_ENDIAN_10 LITTLE_ENDIAN #endif CFATTACH_DECL3_NEW(virtio_pci, sizeof(struct virtio_pci_softc), virtio_pci_match, virtio_pci_attach, virtio_pci_detach, NULL, virtio_pci_rescan, NULL, DVF_DETACH_SHUTDOWN); static const struct virtio_ops virtio_pci_ops_09 = { .kick = virtio_pci_kick_09, .read_queue_size = virtio_pci_read_queue_size_09, .setup_queue = virtio_pci_setup_queue_09, .set_status = virtio_pci_set_status_09, .neg_features = virtio_pci_negotiate_features_09, .alloc_interrupts = virtio_pci_alloc_interrupts, .free_interrupts = virtio_pci_free_interrupts, .setup_interrupts = virtio_pci_setup_interrupts_09, }; static const struct virtio_ops virtio_pci_ops_10 = { .kick = virtio_pci_kick_10, .read_queue_size = virtio_pci_read_queue_size_10, .setup_queue = virtio_pci_setup_queue_10, .set_status = virtio_pci_set_status_10, .neg_features = virtio_pci_negotiate_features_10, .alloc_interrupts = virtio_pci_alloc_interrupts, .free_interrupts = virtio_pci_free_interrupts, .setup_interrupts = virtio_pci_setup_interrupts_10, }; static int virtio_pci_match(device_t parent, cfdata_t match, void *aux) { struct pci_attach_args *pa; pa = (struct pci_attach_args *)aux; switch (PCI_VENDOR(pa->pa_id)) { case PCI_VENDOR_QUMRANET: /* Transitional devices MUST have a PCI Revision ID of 0. */ if (((PCI_PRODUCT_QUMRANET_VIRTIO_1000 <= PCI_PRODUCT(pa->pa_id)) && (PCI_PRODUCT(pa->pa_id) <= PCI_PRODUCT_QUMRANET_VIRTIO_103F)) && PCI_REVISION(pa->pa_class) == 0) return 1; /* * Non-transitional devices SHOULD have a PCI Revision * ID of 1 or higher. Drivers MUST match any PCI * Revision ID value. 
*/ if (((PCI_PRODUCT_QUMRANET_VIRTIO_1040 <= PCI_PRODUCT(pa->pa_id)) && (PCI_PRODUCT(pa->pa_id) <= PCI_PRODUCT_QUMRANET_VIRTIO_107F)) && /* XXX: TODO */ PCI_REVISION(pa->pa_class) == 1) return 1; break; } return 0; } static void virtio_pci_attach(device_t parent, device_t self, void *aux) { struct virtio_pci_softc * const psc = device_private(self); struct virtio_softc * const sc = &psc->sc_sc; struct pci_attach_args *pa = (struct pci_attach_args *)aux; pci_chipset_tag_t pc = pa->pa_pc; pcitag_t tag = pa->pa_tag; int revision; int ret; pcireg_t id; pcireg_t csr; revision = PCI_REVISION(pa->pa_class); switch (revision) { case 0: /* subsystem ID shows what I am */ id = PCI_SUBSYS_ID(pci_conf_read(pc, tag, PCI_SUBSYS_ID_REG)); break; case 1: /* pci product number shows what I am */ id = PCI_PRODUCT(pa->pa_id) - PCI_PRODUCT_QUMRANET_VIRTIO_1040; break; default: aprint_normal(": unknown revision 0x%02x; giving up\n", revision); return; } aprint_normal("\n"); aprint_naive("\n"); virtio_print_device_type(self, id, revision); csr = pci_conf_read(pc, tag, PCI_COMMAND_STATUS_REG); csr |= PCI_COMMAND_MASTER_ENABLE | PCI_COMMAND_IO_ENABLE; pci_conf_write(pc, tag, PCI_COMMAND_STATUS_REG, csr); sc->sc_dev = self; psc->sc_pa = *pa; psc->sc_iot = pa->pa_iot; sc->sc_dmat = pa->pa_dmat; if (pci_dma64_available(pa)) sc->sc_dmat = pa->pa_dmat64; /* attach is dependent on revision */ ret = 0; if (revision == 1) { /* try to attach 1.0 */ ret = virtio_pci_attach_10(self, aux); } if (ret == 0 && revision == 0) { /* * revision 0 means 0.9 only or both 0.9 and 1.0. The * latter are so-called "Transitional Devices". For * those devices, we want to use the 1.0 interface if * possible. * * XXX Currently only on platforms that require 1.0 * XXX features, such as VIRTIO_F_ACCESS_PLATFORM. */ #ifdef __NEED_VIRTIO_F_ACCESS_PLATFORM /* First, try to attach 1.0 */ ret = virtio_pci_attach_10(self, aux); if (ret != 0) { aprint_error_dev(self, "VirtIO 1.0 error = %d, falling back to 0.9\n", ret); /* Fall back to 0.9. */ ret = virtio_pci_attach_09(self, aux); } #else ret = virtio_pci_attach_09(self, aux); #endif /* __NEED_VIRTIO_F_ACCESS_PLATFORM */ } if (ret) { aprint_error_dev(self, "cannot attach (%d)\n", ret); return; } KASSERT(sc->sc_ops); /* preset config region */ psc->sc_devcfg_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI; if (virtio_pci_adjust_config_region(psc)) return; /* generic */ virtio_device_reset(sc); virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_ACK); virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER); sc->sc_childdevid = id; sc->sc_child = NULL; virtio_pci_rescan(self, NULL, NULL); return; } /* ARGSUSED */ static int virtio_pci_rescan(device_t self, const char *ifattr, const int *locs) { struct virtio_pci_softc * const psc = device_private(self); struct virtio_softc * const sc = &psc->sc_sc; struct virtio_attach_args va; if (sc->sc_child) /* Child already attached? 
*/ return 0; memset(&va, 0, sizeof(va)); va.sc_childdevid = sc->sc_childdevid; config_found(self, &va, NULL, CFARGS_NONE); if (virtio_attach_failed(sc)) return 0; return 0; } static int virtio_pci_detach(device_t self, int flags) { struct virtio_pci_softc * const psc = device_private(self); struct virtio_softc * const sc = &psc->sc_sc; int r; r = config_detach_children(self, flags); if (r != 0) return r; /* Check that child never attached, or detached properly */ KASSERT(sc->sc_child == NULL); KASSERT(sc->sc_vqs == NULL); KASSERT(psc->sc_ihs_num == 0); if (psc->sc_iosize) bus_space_unmap(psc->sc_iot, psc->sc_ioh, psc->sc_mapped_iosize); psc->sc_iosize = 0; return 0; } static int virtio_pci_attach_09(device_t self, void *aux) //struct virtio_pci_softc *psc, struct pci_attach_args *pa) { struct virtio_pci_softc * const psc = device_private(self); struct pci_attach_args *pa = (struct pci_attach_args *)aux; struct virtio_softc * const sc = &psc->sc_sc; // pci_chipset_tag_t pc = pa->pa_pc; // pcitag_t tag = pa->pa_tag; /* complete IO region */ if (pci_mapreg_map(pa, PCI_MAPREG_START, PCI_MAPREG_TYPE_IO, 0, &psc->sc_iot, &psc->sc_ioh, NULL, &psc->sc_iosize)) { aprint_error_dev(self, "can't map i/o space\n"); return EIO; } psc->sc_mapped_iosize = psc->sc_iosize; /* queue space */ if (bus_space_subregion(psc->sc_iot, psc->sc_ioh, VIRTIO_CONFIG_QUEUE_NOTIFY, 2, &psc->sc_notify_ioh)) { aprint_error_dev(self, "can't map notify i/o space\n"); return EIO; } psc->sc_notify_iosize = 2; psc->sc_notify_iot = psc->sc_iot; /* ISR space */ if (bus_space_subregion(psc->sc_iot, psc->sc_ioh, VIRTIO_CONFIG_ISR_STATUS, 1, &psc->sc_isr_ioh)) { aprint_error_dev(self, "can't map isr i/o space\n"); return EIO; } psc->sc_isr_iosize = 1; psc->sc_isr_iot = psc->sc_iot; /* set our version 0.9 ops */ sc->sc_ops = &virtio_pci_ops_09; sc->sc_bus_endian = READ_ENDIAN_09; sc->sc_struct_endian = STRUCT_ENDIAN_09; return 0; } static int virtio_pci_attach_10(device_t self, void *aux) { struct virtio_pci_softc * const psc = device_private(self); struct pci_attach_args *pa = (struct pci_attach_args *)aux; struct virtio_softc * const sc = &psc->sc_sc; pci_chipset_tag_t pc = pa->pa_pc; pcitag_t tag = pa->pa_tag; struct virtio_pci_cap common, isr, device; struct virtio_pci_notify_cap notify; int have_device_cfg = 0; bus_size_t bars[NMAPREG] = { 0 }; int bars_idx[NMAPREG] = { 0 }; struct virtio_pci_cap *caps[] = { &common, &isr, &device, &notify.cap }; int i, j, ret = 0; if (virtio_pci_find_cap(psc, VIRTIO_PCI_CAP_COMMON_CFG, &common, sizeof(common))) return ENODEV; if (virtio_pci_find_cap(psc, VIRTIO_PCI_CAP_NOTIFY_CFG, &notify, sizeof(notify))) return ENODEV; if (virtio_pci_find_cap(psc, VIRTIO_PCI_CAP_ISR_CFG, &isr, sizeof(isr))) return ENODEV; if (virtio_pci_find_cap(psc, VIRTIO_PCI_CAP_DEVICE_CFG, &device, sizeof(device))) memset(&device, 0, sizeof(device)); else have_device_cfg = 1; /* Figure out which bars we need to map */ for (i = 0; i < __arraycount(caps); i++) { int bar = caps[i]->bar; bus_size_t len = caps[i]->offset + caps[i]->length; if (caps[i]->length == 0) continue; if (bars[bar] < len) bars[bar] = len; } for (i = j = 0; i < __arraycount(bars); i++) { int reg; pcireg_t type; if (bars[i] == 0) continue; reg = PCI_BAR(i); type = pci_mapreg_type(pc, tag, reg); if (pci_mapreg_map(pa, reg, type, 0, &psc->sc_bars_iot[j], &psc->sc_bars_ioh[j], NULL, &psc->sc_bars_iosize[j])) { aprint_error_dev(self, "can't map bar %u \n", i); ret = EIO; goto err; } aprint_debug_dev(self, "bar[%d]: iot %p, size 0x%" PRIxBUSSIZE "\n", j, 
psc->sc_bars_iot[j], psc->sc_bars_iosize[j]); bars_idx[i] = j; j++; } i = bars_idx[notify.cap.bar]; if (bus_space_subregion(psc->sc_bars_iot[i], psc->sc_bars_ioh[i], notify.cap.offset, notify.cap.length, &psc->sc_notify_ioh)) { aprint_error_dev(self, "can't map notify i/o space\n"); ret = EIO; goto err; } psc->sc_notify_iosize = notify.cap.length; psc->sc_notify_iot = psc->sc_bars_iot[i]; psc->sc_notify_off_multiplier = le32toh(notify.notify_off_multiplier); if (have_device_cfg) { i = bars_idx[device.bar]; if (bus_space_subregion(psc->sc_bars_iot[i], psc->sc_bars_ioh[i], device.offset, device.length, &sc->sc_devcfg_ioh)) { aprint_error_dev(self, "can't map devcfg i/o space\n"); ret = EIO; goto err; } aprint_debug_dev(self, "device.offset = 0x%x, device.length = 0x%x\n", device.offset, device.length); sc->sc_devcfg_iosize = device.length; sc->sc_devcfg_iot = psc->sc_bars_iot[i]; } i = bars_idx[isr.bar]; if (bus_space_subregion(psc->sc_bars_iot[i], psc->sc_bars_ioh[i], isr.offset, isr.length, &psc->sc_isr_ioh)) { aprint_error_dev(self, "can't map isr i/o space\n"); ret = EIO; goto err; } psc->sc_isr_iosize = isr.length; psc->sc_isr_iot = psc->sc_bars_iot[i]; i = bars_idx[common.bar]; if (bus_space_subregion(psc->sc_bars_iot[i], psc->sc_bars_ioh[i], common.offset, common.length, &psc->sc_ioh)) { aprint_error_dev(self, "can't map common i/o space\n"); ret = EIO; goto err; } psc->sc_iosize = common.length; psc->sc_iot = psc->sc_bars_iot[i]; psc->sc_mapped_iosize = psc->sc_bars_iosize[i]; psc->sc_sc.sc_version_1 = 1; /* set our version 1.0 ops */ sc->sc_ops = &virtio_pci_ops_10; sc->sc_bus_endian = READ_ENDIAN_10; sc->sc_struct_endian = STRUCT_ENDIAN_10; return 0; err: /* undo our pci_mapreg_map()s */ for (i = 0; i < __arraycount(bars); i++) { if (psc->sc_bars_iosize[i] == 0) continue; bus_space_unmap(psc->sc_bars_iot[i], psc->sc_bars_ioh[i], psc->sc_bars_iosize[i]); } return ret; } /* v1.0 attach helper */ static int virtio_pci_find_cap(struct virtio_pci_softc *psc, int cfg_type, void *buf, int buflen) { device_t self = psc->sc_sc.sc_dev; pci_chipset_tag_t pc = psc->sc_pa.pa_pc; pcitag_t tag = psc->sc_pa.pa_tag; unsigned int offset, i, len; union { pcireg_t reg[8]; struct virtio_pci_cap vcap; } *v = buf; if (buflen < sizeof(struct virtio_pci_cap)) return ERANGE; if (!pci_get_capability(pc, tag, PCI_CAP_VENDSPEC, &offset, &v->reg[0])) return ENOENT; do { for (i = 0; i < 4; i++) v->reg[i] = le32toh(pci_conf_read(pc, tag, offset + i * 4)); if (v->vcap.cfg_type == cfg_type) break; offset = v->vcap.cap_next; } while (offset != 0); if (offset == 0) return ENOENT; if (v->vcap.cap_len > sizeof(struct virtio_pci_cap)) { len = roundup(v->vcap.cap_len, sizeof(pcireg_t)); if (len > buflen) { aprint_error_dev(self, "%s cap too large\n", __func__); return ERANGE; } for (i = 4; i < len / sizeof(pcireg_t); i++) v->reg[i] = le32toh(pci_conf_read(pc, tag, offset + i * 4)); } /* endian fixup */ v->vcap.offset = le32toh(v->vcap.offset); v->vcap.length = le32toh(v->vcap.length); return 0; } /* ------------------------------------- * Version 0.9 support * -------------------------------------*/ static void virtio_pci_kick_09(struct virtio_softc *sc, uint16_t idx) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; bus_space_write_2(psc->sc_notify_iot, psc->sc_notify_ioh, 0, idx); } /* only applicable for v 0.9 but also called for 1.0 */ static int virtio_pci_adjust_config_region(struct virtio_pci_softc *psc) { struct virtio_softc * const sc = &psc->sc_sc; device_t self = sc->sc_dev; if 
(psc->sc_sc.sc_version_1) return 0; sc->sc_devcfg_iosize = psc->sc_iosize - psc->sc_devcfg_offset; sc->sc_devcfg_iot = psc->sc_iot; if (bus_space_subregion(psc->sc_iot, psc->sc_ioh, psc->sc_devcfg_offset, sc->sc_devcfg_iosize, &sc->sc_devcfg_ioh)) { aprint_error_dev(self, "can't map config i/o space\n"); return EIO; } return 0; } static uint16_t virtio_pci_read_queue_size_09(struct virtio_softc *sc, uint16_t idx) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; bus_space_write_2(psc->sc_iot, psc->sc_ioh, VIRTIO_CONFIG_QUEUE_SELECT, idx); return bus_space_read_2(psc->sc_iot, psc->sc_ioh, VIRTIO_CONFIG_QUEUE_SIZE); } static void virtio_pci_setup_queue_09(struct virtio_softc *sc, uint16_t idx, uint64_t addr) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; bus_space_write_2(psc->sc_iot, psc->sc_ioh, VIRTIO_CONFIG_QUEUE_SELECT, idx); bus_space_write_4(psc->sc_iot, psc->sc_ioh, VIRTIO_CONFIG_QUEUE_ADDRESS, addr / VIRTIO_PAGE_SIZE); if (psc->sc_ihs_num > 1) { int vec = VIRTIO_MSIX_QUEUE_VECTOR_INDEX; if (psc->sc_intr_pervq) vec += idx; bus_space_write_2(psc->sc_iot, psc->sc_ioh, VIRTIO_CONFIG_MSI_QUEUE_VECTOR, vec); } } static void virtio_pci_set_status_09(struct virtio_softc *sc, int status) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; int old = 0; if (status != 0) { old = bus_space_read_1(psc->sc_iot, psc->sc_ioh, VIRTIO_CONFIG_DEVICE_STATUS); } bus_space_write_1(psc->sc_iot, psc->sc_ioh, VIRTIO_CONFIG_DEVICE_STATUS, status|old); } static void virtio_pci_negotiate_features_09(struct virtio_softc *sc, uint64_t guest_features) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; uint32_t r; r = bus_space_read_4(psc->sc_iot, psc->sc_ioh, VIRTIO_CONFIG_DEVICE_FEATURES); r &= guest_features; bus_space_write_4(psc->sc_iot, psc->sc_ioh, VIRTIO_CONFIG_GUEST_FEATURES, r); sc->sc_active_features = r; } /* ------------------------------------- * Version 1.0 support * -------------------------------------*/ static void virtio_pci_kick_10(struct virtio_softc *sc, uint16_t idx) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; unsigned offset = sc->sc_vqs[idx].vq_notify_off * psc->sc_notify_off_multiplier; bus_space_write_2(psc->sc_notify_iot, psc->sc_notify_ioh, offset, idx); } static uint16_t virtio_pci_read_queue_size_10(struct virtio_softc *sc, uint16_t idx) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; bus_space_tag_t iot = psc->sc_iot; bus_space_handle_t ioh = psc->sc_ioh; bus_space_write_2(iot, ioh, VIRTIO_CONFIG1_QUEUE_SELECT, idx); return bus_space_read_2(iot, ioh, VIRTIO_CONFIG1_QUEUE_SIZE); } /* * By definition little endian only in v1.0. NB: "MAY" in the text * below refers to "independently" (i.e. the order of accesses) not * "32-bit" (which is restricted by the earlier "MUST"). * * 4.1.3.1 Driver Requirements: PCI Device Layout * * For device configuration access, the driver MUST use ... 32-bit * wide and aligned accesses for ... 64-bit wide fields. For 64-bit * fields, the driver MAY access each of the high and low 32-bit parts * of the field independently. 
*/ static __inline void virtio_pci_bus_space_write_8(bus_space_tag_t iot, bus_space_handle_t ioh, bus_size_t offset, uint64_t value) { #if _QUAD_HIGHWORD bus_space_write_4(iot, ioh, offset, BUS_ADDR_LO32(value)); bus_space_write_4(iot, ioh, offset + 4, BUS_ADDR_HI32(value)); #else bus_space_write_4(iot, ioh, offset, BUS_ADDR_HI32(value)); bus_space_write_4(iot, ioh, offset + 4, BUS_ADDR_LO32(value)); #endif } static void virtio_pci_setup_queue_10(struct virtio_softc *sc, uint16_t idx, uint64_t addr) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; struct virtqueue *vq = &sc->sc_vqs[idx]; bus_space_tag_t iot = psc->sc_iot; bus_space_handle_t ioh = psc->sc_ioh; KASSERT(vq->vq_index == idx); bus_space_write_2(iot, ioh, VIRTIO_CONFIG1_QUEUE_SELECT, vq->vq_index); if (addr == 0) { bus_space_write_2(iot, ioh, VIRTIO_CONFIG1_QUEUE_ENABLE, 0); virtio_pci_bus_space_write_8(iot, ioh, VIRTIO_CONFIG1_QUEUE_DESC, 0); virtio_pci_bus_space_write_8(iot, ioh, VIRTIO_CONFIG1_QUEUE_AVAIL, 0); virtio_pci_bus_space_write_8(iot, ioh, VIRTIO_CONFIG1_QUEUE_USED, 0); } else { virtio_pci_bus_space_write_8(iot, ioh, VIRTIO_CONFIG1_QUEUE_DESC, addr); virtio_pci_bus_space_write_8(iot, ioh, VIRTIO_CONFIG1_QUEUE_AVAIL, addr + vq->vq_availoffset); virtio_pci_bus_space_write_8(iot, ioh, VIRTIO_CONFIG1_QUEUE_USED, addr + vq->vq_usedoffset); bus_space_write_2(iot, ioh, VIRTIO_CONFIG1_QUEUE_ENABLE, 1); vq->vq_notify_off = bus_space_read_2(iot, ioh, VIRTIO_CONFIG1_QUEUE_NOTIFY_OFF); } if (psc->sc_ihs_num > 1) { int vec = VIRTIO_MSIX_QUEUE_VECTOR_INDEX; if (psc->sc_intr_pervq) vec += idx; bus_space_write_2(iot, ioh, VIRTIO_CONFIG1_QUEUE_MSIX_VECTOR, vec); } } static void virtio_pci_set_status_10(struct virtio_softc *sc, int status) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; bus_space_tag_t iot = psc->sc_iot; bus_space_handle_t ioh = psc->sc_ioh; int old = 0; if (status) old = bus_space_read_1(iot, ioh, VIRTIO_CONFIG1_DEVICE_STATUS); bus_space_write_1(iot, ioh, VIRTIO_CONFIG1_DEVICE_STATUS, status | old); } void virtio_pci_negotiate_features_10(struct virtio_softc *sc, uint64_t guest_features) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; device_t self = sc->sc_dev; bus_space_tag_t iot = psc->sc_iot; bus_space_handle_t ioh = psc->sc_ioh; uint64_t host, negotiated, device_status; guest_features |= VIRTIO_F_VERSION_1; #ifdef __NEED_VIRTIO_F_ACCESS_PLATFORM /* XXX This could use some work. 
*/ guest_features |= VIRTIO_F_ACCESS_PLATFORM; #endif /* __NEED_VIRTIO_F_ACCESS_PLATFORM */ /* notify on empty is 0.9 only */ guest_features &= ~VIRTIO_F_NOTIFY_ON_EMPTY; sc->sc_active_features = 0; bus_space_write_4(iot, ioh, VIRTIO_CONFIG1_DEVICE_FEATURE_SELECT, 0); host = bus_space_read_4(iot, ioh, VIRTIO_CONFIG1_DEVICE_FEATURE); bus_space_write_4(iot, ioh, VIRTIO_CONFIG1_DEVICE_FEATURE_SELECT, 1); host |= (uint64_t) bus_space_read_4(iot, ioh, VIRTIO_CONFIG1_DEVICE_FEATURE) << 32; negotiated = host & guest_features; bus_space_write_4(iot, ioh, VIRTIO_CONFIG1_DRIVER_FEATURE_SELECT, 0); bus_space_write_4(iot, ioh, VIRTIO_CONFIG1_DRIVER_FEATURE, negotiated & 0xffffffff); bus_space_write_4(iot, ioh, VIRTIO_CONFIG1_DRIVER_FEATURE_SELECT, 1); bus_space_write_4(iot, ioh, VIRTIO_CONFIG1_DRIVER_FEATURE, negotiated >> 32); virtio_pci_set_status_10(sc, VIRTIO_CONFIG_DEVICE_STATUS_FEATURES_OK); device_status = bus_space_read_1(iot, ioh, VIRTIO_CONFIG1_DEVICE_STATUS); if ((device_status & VIRTIO_CONFIG_DEVICE_STATUS_FEATURES_OK) == 0) { aprint_error_dev(self, "feature negotiation failed\n"); bus_space_write_1(iot, ioh, VIRTIO_CONFIG1_DEVICE_STATUS, VIRTIO_CONFIG_DEVICE_STATUS_FAILED); return; } if ((negotiated & VIRTIO_F_VERSION_1) == 0) { aprint_error_dev(self, "host rejected version 1\n"); bus_space_write_1(iot, ioh, VIRTIO_CONFIG1_DEVICE_STATUS, VIRTIO_CONFIG_DEVICE_STATUS_FAILED); return; } sc->sc_active_features = negotiated; return; } /* ------------------------------------- * Generic PCI interrupt code * -------------------------------------*/ static int virtio_pci_setup_interrupts_10(struct virtio_softc *sc, int reinit) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; bus_space_tag_t iot = psc->sc_iot; bus_space_handle_t ioh = psc->sc_ioh; int vector, ret, qid; if (!virtio_pci_msix_enabled(psc)) return 0; vector = VIRTIO_MSIX_CONFIG_VECTOR_INDEX; bus_space_write_2(iot, ioh, VIRTIO_CONFIG1_CONFIG_MSIX_VECTOR, vector); ret = bus_space_read_2(iot, ioh, VIRTIO_CONFIG1_CONFIG_MSIX_VECTOR); if (ret != vector) { VIRTIO_PCI_LOG(sc, reinit, "can't set config msix vector\n"); return -1; } for (qid = 0; qid < sc->sc_nvqs; qid++) { vector = VIRTIO_MSIX_QUEUE_VECTOR_INDEX; if (psc->sc_intr_pervq) vector += qid; bus_space_write_2(iot, ioh, VIRTIO_CONFIG1_QUEUE_SELECT, qid); bus_space_write_2(iot, ioh, VIRTIO_CONFIG1_QUEUE_MSIX_VECTOR, vector); ret = bus_space_read_2(iot, ioh, VIRTIO_CONFIG1_QUEUE_MSIX_VECTOR); if (ret != vector) { VIRTIO_PCI_LOG(sc, reinit, "can't set queue %d " "msix vector\n", qid); return -1; } } return 0; } static int virtio_pci_setup_interrupts_09(struct virtio_softc *sc, int reinit) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; int offset, vector, ret, qid; if (!virtio_pci_msix_enabled(psc)) return 0; offset = VIRTIO_CONFIG_MSI_CONFIG_VECTOR; vector = VIRTIO_MSIX_CONFIG_VECTOR_INDEX; bus_space_write_2(psc->sc_iot, psc->sc_ioh, offset, vector); ret = bus_space_read_2(psc->sc_iot, psc->sc_ioh, offset); if (ret != vector) { aprint_debug_dev(sc->sc_dev, "%s: expected=%d, actual=%d\n", __func__, vector, ret); VIRTIO_PCI_LOG(sc, reinit, "can't set config msix vector\n"); return -1; } for (qid = 0; qid < sc->sc_nvqs; qid++) { offset = VIRTIO_CONFIG_QUEUE_SELECT; bus_space_write_2(psc->sc_iot, psc->sc_ioh, offset, qid); offset = VIRTIO_CONFIG_MSI_QUEUE_VECTOR; vector = VIRTIO_MSIX_QUEUE_VECTOR_INDEX; if (psc->sc_intr_pervq) vector += qid; bus_space_write_2(psc->sc_iot, psc->sc_ioh, offset, vector); ret = bus_space_read_2(psc->sc_iot, 
psc->sc_ioh, offset); if (ret != vector) { aprint_debug_dev(sc->sc_dev, "%s[qid=%d]:" " expected=%d, actual=%d\n", __func__, qid, vector, ret); VIRTIO_PCI_LOG(sc, reinit, "can't set queue %d " "msix vector\n", qid); return -1; } } return 0; } static int virtio_pci_establish_msix_interrupts(struct virtio_softc *sc, struct pci_attach_args *pa) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; device_t self = sc->sc_dev; pci_chipset_tag_t pc = pa->pa_pc; struct virtqueue *vq; char intrbuf[PCI_INTRSTR_LEN]; char intr_xname[INTRDEVNAMEBUF]; char const *intrstr; int idx, qid, n; idx = VIRTIO_MSIX_CONFIG_VECTOR_INDEX; if (sc->sc_flags & VIRTIO_F_INTR_MPSAFE) pci_intr_setattr(pc, &psc->sc_ihp[idx], PCI_INTR_MPSAFE, true); snprintf(intr_xname, sizeof(intr_xname), "%s config", device_xname(sc->sc_dev)); psc->sc_ihs[idx] = pci_intr_establish_xname(pc, psc->sc_ihp[idx], sc->sc_ipl, virtio_pci_msix_config_intr, sc, intr_xname); if (psc->sc_ihs[idx] == NULL) { aprint_error_dev(self, "couldn't establish MSI-X for config\n"); goto error; } idx = VIRTIO_MSIX_QUEUE_VECTOR_INDEX; if (psc->sc_intr_pervq) { for (qid = 0; qid < sc->sc_nvqs; qid++) { n = idx + qid; vq = &sc->sc_vqs[qid]; snprintf(intr_xname, sizeof(intr_xname), "%s vq#%d", device_xname(sc->sc_dev), qid); if (sc->sc_flags & VIRTIO_F_INTR_MPSAFE) { pci_intr_setattr(pc, &psc->sc_ihp[n], PCI_INTR_MPSAFE, true); } psc->sc_ihs[n] = pci_intr_establish_xname(pc, psc->sc_ihp[n], sc->sc_ipl, vq->vq_intrhand, vq->vq_intrhand_arg, intr_xname); if (psc->sc_ihs[n] == NULL) { aprint_error_dev(self, "couldn't establish MSI-X for a vq\n"); goto error; } } } else { if (sc->sc_flags & VIRTIO_F_INTR_MPSAFE) pci_intr_setattr(pc, &psc->sc_ihp[idx], PCI_INTR_MPSAFE, true); snprintf(intr_xname, sizeof(intr_xname), "%s queues", device_xname(sc->sc_dev)); psc->sc_ihs[idx] = pci_intr_establish_xname(pc, psc->sc_ihp[idx], sc->sc_ipl, virtio_pci_msix_queue_intr, sc, intr_xname); if (psc->sc_ihs[idx] == NULL) { aprint_error_dev(self, "couldn't establish MSI-X for queues\n"); goto error; } } idx = VIRTIO_MSIX_CONFIG_VECTOR_INDEX; intrstr = pci_intr_string(pc, psc->sc_ihp[idx], intrbuf, sizeof(intrbuf)); aprint_normal_dev(self, "config interrupting at %s\n", intrstr); idx = VIRTIO_MSIX_QUEUE_VECTOR_INDEX; if (psc->sc_intr_pervq) { kcpuset_t *affinity; int affinity_to, r; kcpuset_create(&affinity, false); for (qid = 0; qid < sc->sc_nvqs; qid++) { n = idx + qid; affinity_to = (qid / 2) % ncpu; intrstr = pci_intr_string(pc, psc->sc_ihp[n], intrbuf, sizeof(intrbuf)); kcpuset_zero(affinity); kcpuset_set(affinity, affinity_to); r = interrupt_distribute(psc->sc_ihs[n], affinity, NULL); if (r == 0) { aprint_normal_dev(self, "for vq #%d interrupting at %s affinity to %u\n", qid, intrstr, affinity_to); } else { aprint_normal_dev(self, "for vq #%d interrupting at %s\n", qid, intrstr); } } kcpuset_destroy(affinity); } else { intrstr = pci_intr_string(pc, psc->sc_ihp[idx], intrbuf, sizeof(intrbuf)); aprint_normal_dev(self, "queues interrupting at %s\n", intrstr); } return 0; error: idx = VIRTIO_MSIX_CONFIG_VECTOR_INDEX; if (psc->sc_ihs[idx] != NULL) pci_intr_disestablish(psc->sc_pa.pa_pc, psc->sc_ihs[idx]); idx = VIRTIO_MSIX_QUEUE_VECTOR_INDEX; if (psc->sc_intr_pervq) { for (qid = 0; qid < sc->sc_nvqs; qid++) { n = idx + qid; if (psc->sc_ihs[n] == NULL) continue; pci_intr_disestablish(psc->sc_pa.pa_pc, psc->sc_ihs[n]); } } else { if (psc->sc_ihs[idx] != NULL) pci_intr_disestablish(psc->sc_pa.pa_pc, psc->sc_ihs[idx]); } return -1; } static int 
virtio_pci_establish_intx_interrupt(struct virtio_softc *sc, struct pci_attach_args *pa) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; device_t self = sc->sc_dev; pci_chipset_tag_t pc = pa->pa_pc; char intrbuf[PCI_INTRSTR_LEN]; char const *intrstr; if (sc->sc_flags & VIRTIO_F_INTR_MPSAFE) pci_intr_setattr(pc, &psc->sc_ihp[0], PCI_INTR_MPSAFE, true); psc->sc_ihs[0] = pci_intr_establish_xname(pc, psc->sc_ihp[0], sc->sc_ipl, virtio_pci_intr, sc, device_xname(sc->sc_dev)); if (psc->sc_ihs[0] == NULL) { aprint_error_dev(self, "couldn't establish INTx\n"); return -1; } intrstr = pci_intr_string(pc, psc->sc_ihp[0], intrbuf, sizeof(intrbuf)); aprint_normal_dev(self, "interrupting at %s\n", intrstr); return 0; } static int virtio_pci_alloc_interrupts(struct virtio_softc *sc) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; device_t self = sc->sc_dev; pci_chipset_tag_t pc = psc->sc_pa.pa_pc; pcitag_t tag = psc->sc_pa.pa_tag; int error; int nmsix; int off; int counts[PCI_INTR_TYPE_SIZE]; pci_intr_type_t max_type; pcireg_t ctl; nmsix = pci_msix_count(psc->sc_pa.pa_pc, psc->sc_pa.pa_tag); aprint_debug_dev(self, "pci_msix_count=%d\n", nmsix); /* We need at least two: one for config and the other for queues */ if ((sc->sc_flags & VIRTIO_F_INTR_MSIX) == 0 || nmsix < 2) { /* Try INTx only */ max_type = PCI_INTR_TYPE_INTX; counts[PCI_INTR_TYPE_INTX] = 1; } else { /* Try MSI-X first and INTx second */ if (ISSET(sc->sc_flags, VIRTIO_F_INTR_PERVQ) && sc->sc_nvqs + VIRTIO_MSIX_QUEUE_VECTOR_INDEX <= nmsix) { nmsix = sc->sc_nvqs + VIRTIO_MSIX_QUEUE_VECTOR_INDEX; } else { nmsix = 2; } max_type = PCI_INTR_TYPE_MSIX; counts[PCI_INTR_TYPE_MSIX] = nmsix; counts[PCI_INTR_TYPE_MSI] = 0; counts[PCI_INTR_TYPE_INTX] = 1; } retry: error = pci_intr_alloc(&psc->sc_pa, &psc->sc_ihp, counts, max_type); if (error != 0) { aprint_error_dev(self, "couldn't map interrupt\n"); return -1; } if (pci_intr_type(pc, psc->sc_ihp[0]) == PCI_INTR_TYPE_MSIX) { psc->sc_intr_pervq = nmsix > 2 ? 
true : false; psc->sc_ihs = kmem_zalloc(sizeof(*psc->sc_ihs) * nmsix, KM_SLEEP); error = virtio_pci_establish_msix_interrupts(sc, &psc->sc_pa); if (error != 0) { kmem_free(psc->sc_ihs, sizeof(*psc->sc_ihs) * nmsix); pci_intr_release(pc, psc->sc_ihp, nmsix); /* Retry INTx */ max_type = PCI_INTR_TYPE_INTX; counts[PCI_INTR_TYPE_INTX] = 1; goto retry; } psc->sc_ihs_num = nmsix; psc->sc_devcfg_offset = VIRTIO_CONFIG_DEVICE_CONFIG_MSI; virtio_pci_adjust_config_region(psc); } else if (pci_intr_type(pc, psc->sc_ihp[0]) == PCI_INTR_TYPE_INTX) { psc->sc_intr_pervq = false; psc->sc_ihs = kmem_zalloc(sizeof(*psc->sc_ihs) * 1, KM_SLEEP); error = virtio_pci_establish_intx_interrupt(sc, &psc->sc_pa); if (error != 0) { kmem_free(psc->sc_ihs, sizeof(*psc->sc_ihs) * 1); pci_intr_release(pc, psc->sc_ihp, 1); return -1; } psc->sc_ihs_num = 1; psc->sc_devcfg_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI; virtio_pci_adjust_config_region(psc); error = pci_get_capability(pc, tag, PCI_CAP_MSIX, &off, NULL); if (error != 0) { ctl = pci_conf_read(pc, tag, off + PCI_MSIX_CTL); ctl &= ~PCI_MSIX_CTL_ENABLE; pci_conf_write(pc, tag, off + PCI_MSIX_CTL, ctl); } } if (!psc->sc_intr_pervq) CLR(sc->sc_flags, VIRTIO_F_INTR_PERVQ); return 0; } static void virtio_pci_free_interrupts(struct virtio_softc *sc) { struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; for (int i = 0; i < psc->sc_ihs_num; i++) { if (psc->sc_ihs[i] == NULL) continue; pci_intr_disestablish(psc->sc_pa.pa_pc, psc->sc_ihs[i]); psc->sc_ihs[i] = NULL; } if (psc->sc_ihs_num > 0) pci_intr_release(psc->sc_pa.pa_pc, psc->sc_ihp, psc->sc_ihs_num); if (psc->sc_ihs != NULL) { kmem_free(psc->sc_ihs, sizeof(*psc->sc_ihs) * psc->sc_ihs_num); psc->sc_ihs = NULL; } psc->sc_ihs_num = 0; } static bool virtio_pci_msix_enabled(struct virtio_pci_softc *psc) { pci_chipset_tag_t pc = psc->sc_pa.pa_pc; if (pci_intr_type(pc, psc->sc_ihp[0]) == PCI_INTR_TYPE_MSIX) return true; return false; } /* * Interrupt handler. */ static int virtio_pci_intr(void *arg) { struct virtio_softc *sc = arg; struct virtio_pci_softc * const psc = (struct virtio_pci_softc *)sc; int isr, r = 0; /* check and ack the interrupt */ isr = bus_space_read_1(psc->sc_isr_iot, psc->sc_isr_ioh, 0); if (isr == 0) return 0; if ((isr & VIRTIO_CONFIG_ISR_CONFIG_CHANGE) && (sc->sc_config_change != NULL)) r = (sc->sc_config_change)(sc); if (sc->sc_intrhand != NULL) { if (sc->sc_soft_ih != NULL) softint_schedule(sc->sc_soft_ih); else r |= (sc->sc_intrhand)(sc); } return r; } static int virtio_pci_msix_queue_intr(void *arg) { struct virtio_softc *sc = arg; int r = 0; if (sc->sc_intrhand != NULL) { if (sc->sc_soft_ih != NULL) softint_schedule(sc->sc_soft_ih); else r |= (sc->sc_intrhand)(sc); } return r; } static int virtio_pci_msix_config_intr(void *arg) { struct virtio_softc *sc = arg; int r = 0; if (sc->sc_config_change != NULL) r = (sc->sc_config_change)(sc); return r; } MODULE(MODULE_CLASS_DRIVER, virtio_pci, "pci,virtio"); #ifdef _MODULE #include "ioconf.c" #endif static int virtio_pci_modcmd(modcmd_t cmd, void *opaque) { int error = 0; #ifdef _MODULE switch (cmd) { case MODULE_CMD_INIT: error = config_init_component(cfdriver_ioconf_virtio_pci, cfattach_ioconf_virtio_pci, cfdata_ioconf_virtio_pci); break; case MODULE_CMD_FINI: error = config_fini_component(cfdriver_ioconf_virtio_pci, cfattach_ioconf_virtio_pci, cfdata_ioconf_virtio_pci); break; default: error = ENOTTY; break; } #endif return error; }
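/*
 * Illustrative sketch (not part of the NetBSD sources above): the virtio 1.0
 * feature negotiation in virtio_pci_negotiate_features_10() reads the 64-bit
 * device feature word through a 32-bit select/value register pair, masks it
 * with the driver's feature set, and writes the result back the same way.
 * The demo_* accessors below are hypothetical stand-ins for the bus_space
 * reads and writes; only the splitting/recombining logic is the point.
 */
#include <stdint.h>
#include <stdio.h>

/* hypothetical device-side feature word for the demonstration */
static uint64_t demo_device_features = 0x0000000100000043ULL;
static uint64_t demo_driver_features;

/* stand-in for the DEVICE_FEATURE_SELECT/DEVICE_FEATURE register pair */
static uint32_t demo_read_device_feature(unsigned select)
{
	return (uint32_t)(demo_device_features >> (select ? 32 : 0));
}

/* stand-in for the DRIVER_FEATURE_SELECT/DRIVER_FEATURE register pair */
static void demo_write_driver_feature(unsigned select, uint32_t value)
{
	uint64_t shifted = (uint64_t)value << (select ? 32 : 0);
	uint64_t mask = 0xffffffffULL << (select ? 32 : 0);

	demo_driver_features = (demo_driver_features & ~mask) | shifted;
}

int main(void)
{
	uint64_t guest = 0x0000000100000041ULL;	/* what the driver wants */
	uint64_t host, negotiated;

	/* read the 64-bit device features in two 32-bit halves */
	host = demo_read_device_feature(0);
	host |= (uint64_t)demo_read_device_feature(1) << 32;

	/* keep only the bits both sides support */
	negotiated = host & guest;

	/* write the accepted set back, again as two 32-bit halves */
	demo_write_driver_feature(0, (uint32_t)(negotiated & 0xffffffff));
	demo_write_driver_feature(1, (uint32_t)(negotiated >> 32));

	printf("host %#llx guest %#llx -> negotiated %#llx\n",
	    (unsigned long long)host, (unsigned long long)guest,
	    (unsigned long long)demo_driver_features);
	return 0;
}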
/* $NetBSD: netbsd32_exec_aout.c,v 1.31 2021/01/19 03:20:13 simonb Exp $ */ /* from: NetBSD: exec_aout.c,v 1.15 1996/09/26 23:34:46 cgd Exp */ /* * Copyright (c) 1998, 2001 Matthew R. Green. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1993, 1994 Christopher G. Demetriou * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christopher G. Demetriou. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: netbsd32_exec_aout.c,v 1.31 2021/01/19 03:20:13 simonb Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/exec.h> #include <sys/exec_aout.h> #include <sys/resourcevar.h> #include <sys/signal.h> #include <sys/signalvar.h> #include <compat/netbsd32/netbsd32.h> #ifndef EXEC_AOUT #define EXEC_AOUT #endif #include <compat/netbsd32/netbsd32_exec.h> #include <machine/frame.h> #include <machine/netbsd32_machdep.h> #ifdef COMPAT_NOMID static int netbsd32_exec_aout_nomid(struct lwp *, struct exec_package *); #endif /* * exec_netbsd32_makecmds(): Check if it's an netbsd32 a.out format * executable. * * Given a lwp pointer and an exec package pointer, see if the referent * of the epp is in netbsd32 a.out format. Check 'standard' magic * numbers for this architecture. * * This function, in the former case, or the hook, in the latter, is * responsible for creating a set of vmcmds which can be used to build * the process's vm space and inserting them into the exec package. 
*/ int exec_netbsd32_makecmds(struct lwp *l, struct exec_package *epp) { netbsd32_u_long midmag, magic; u_short mid; int error; struct netbsd32_exec *execp = epp->ep_hdr; if (epp->ep_hdrvalid < sizeof(struct netbsd32_exec)) return ENOEXEC; midmag = (netbsd32_u_long)ntohl(execp->a_midmag); mid = (midmag >> 16) & 0x3ff; magic = midmag & 0xffff; midmag = mid << 16 | magic; /* this is already needed by setup_stack() */ epp->ep_flags |= EXEC_32; switch (midmag) { case (NETBSD32_MID_MACHINE << 16) | ZMAGIC: error = netbsd32_exec_aout_prep_zmagic(l, epp); break; case (NETBSD32_MID_MACHINE << 16) | NMAGIC: error = netbsd32_exec_aout_prep_nmagic(l, epp); break; case (NETBSD32_MID_MACHINE << 16) | OMAGIC: error = netbsd32_exec_aout_prep_omagic(l, epp); break; default: #ifdef COMPAT_NOMID error = netbsd32_exec_aout_nomid(l, epp); #else error = ENOEXEC; #endif break; } if (error) { kill_vmcmds(&epp->ep_vmcmds); epp->ep_flags &= ~EXEC_32; } else epp->ep_flags &= ~EXEC_TOPDOWN_VM; return error; } /* * netbsd32_exec_aout_prep_zmagic(): Prepare a 'native' ZMAGIC binary's * exec package * * First, set of the various offsets/lengths in the exec package. * * Then, mark the text image busy (so it can be demand paged) or error * out if this is not possible. Finally, set up vmcmds for the * text, data, bss, and stack segments. */ int netbsd32_exec_aout_prep_zmagic(struct lwp *l, struct exec_package *epp) { struct netbsd32_exec *execp = epp->ep_hdr; int error; epp->ep_taddr = AOUT_LDPGSZ; epp->ep_tsize = execp->a_text; epp->ep_daddr = epp->ep_taddr + execp->a_text; epp->ep_dsize = execp->a_data + execp->a_bss; epp->ep_entry = execp->a_entry; epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS); epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32; error = vn_marktext(epp->ep_vp); if (error) return error; /* set up command for text segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_text, epp->ep_taddr, epp->ep_vp, 0, VM_PROT_READ|VM_PROT_EXECUTE); /* set up command for data segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_data, epp->ep_daddr, epp->ep_vp, execp->a_text, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* set up command for bss segment */ if (execp->a_bss > 0) NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss, epp->ep_daddr + execp->a_data, NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); return (*epp->ep_esch->es_setup_stack)(l, epp); } /* * netbsd32_exec_aout_prep_nmagic(): Prepare a 'native' NMAGIC binary's * exec package */ int netbsd32_exec_aout_prep_nmagic(struct lwp *l, struct exec_package *epp) { struct netbsd32_exec *execp = epp->ep_hdr; long bsize, baddr; epp->ep_taddr = AOUT_LDPGSZ; epp->ep_tsize = execp->a_text; epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ); epp->ep_dsize = execp->a_data + execp->a_bss; epp->ep_entry = execp->a_entry; epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS); epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32; /* set up command for text segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text, epp->ep_taddr, epp->ep_vp, sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_EXECUTE); /* set up command for data segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data, epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* set up command for bss segment */ baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE); bsize = epp->ep_daddr + epp->ep_dsize - baddr; if (bsize > 0) NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr, 
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); return (*epp->ep_esch->es_setup_stack)(l, epp); } /* * netbsd32_exec_aout_prep_omagic(): Prepare a 'native' OMAGIC binary's * exec package */ int netbsd32_exec_aout_prep_omagic(struct lwp *l, struct exec_package *epp) { struct netbsd32_exec *execp = epp->ep_hdr; long dsize, bsize, baddr; epp->ep_taddr = AOUT_LDPGSZ; epp->ep_tsize = execp->a_text; epp->ep_daddr = epp->ep_taddr + execp->a_text; epp->ep_dsize = execp->a_data + execp->a_bss; epp->ep_entry = execp->a_entry; epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS); epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32; /* set up command for text and data segments */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp, sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* set up command for bss segment */ baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE); bsize = epp->ep_daddr + epp->ep_dsize - baddr; if (bsize > 0) NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr, NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* * Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize); * obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are * computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize' * respectively to page boundaries. * Compensate `ep_dsize' for the amount of data covered by the last * text page. */ dsize = epp->ep_dsize + execp->a_text - roundup(execp->a_text, PAGE_SIZE); epp->ep_dsize = (dsize > 0) ? dsize : 0; return (*epp->ep_esch->es_setup_stack)(l, epp); } #ifdef COMPAT_NOMID /* * netbsd32_exec_aout_prep_oldzmagic(): * Prepare the vmcmds to build a vmspace for an old ZMAGIC * binary. [386BSD/BSDI/4.4BSD/NetBSD0.8] * * Cloned from exec_aout_prep_zmagic() in kern/exec_aout.c; a more verbose * description of operation is there. * There were copies of this in the mac68k, hp300, and i386 ports. */ static int netbsd32_exec_aout_prep_oldzmagic(struct lwp *l, struct exec_package *epp) { struct netbsd32_exec *execp = epp->ep_hdr; int error; epp->ep_taddr = 0; epp->ep_tsize = execp->a_text; epp->ep_daddr = epp->ep_taddr + execp->a_text; epp->ep_dsize = execp->a_data + execp->a_bss; epp->ep_entry = execp->a_entry; epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS); epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32; error = vn_marktext(epp->ep_vp); if (error) return error; /* set up command for text segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_text, epp->ep_taddr, epp->ep_vp, PAGE_SIZE, /* XXX CLBYTES? */ VM_PROT_READ|VM_PROT_EXECUTE); /* set up command for data segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_data, epp->ep_daddr, epp->ep_vp, execp->a_text + PAGE_SIZE, /* XXX CLBYTES? */ VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* set up command for bss segment */ if (execp->a_bss) NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss, epp->ep_daddr + execp->a_data, NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); return (*epp->ep_esch->es_setup_stack)(l, epp); } /* * netbsd32_exec_aout_prep_oldnmagic(): * Prepare the vmcmds to build a vmspace for an old NMAGIC * binary. [BSDI] * * Cloned from exec_aout_prep_nmagic() in kern/exec_aout.c; with text starting * at 0. * XXX: There must be a better way to share this code. 
*/ static int netbsd32_exec_aout_prep_oldnmagic(struct lwp *l, struct exec_package *epp) { struct netbsd32_exec *execp = epp->ep_hdr; long bsize, baddr; epp->ep_taddr = 0; epp->ep_tsize = execp->a_text; epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ); epp->ep_dsize = execp->a_data + execp->a_bss; epp->ep_entry = execp->a_entry; epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS); epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32; /* set up command for text segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text, epp->ep_taddr, epp->ep_vp, sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_EXECUTE); /* set up command for data segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data, epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* set up command for bss segment */ baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE); bsize = epp->ep_daddr + epp->ep_dsize - baddr; if (bsize > 0) NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr, NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); return (*epp->ep_esch->es_setup_stack)(l, epp); } /* * netbsd32_exec_aout_prep_oldomagic(): * Prepare the vmcmds to build a vmspace for an old OMAGIC * binary. [BSDI] * * Cloned from exec_aout_prep_omagic() in kern/exec_aout.c; with text starting * at 0. * XXX: There must be a better way to share this code. */ static int netbsd32_exec_aout_prep_oldomagic(struct lwp *l, struct exec_package *epp) { struct netbsd32_exec *execp = epp->ep_hdr; long dsize, bsize, baddr; epp->ep_taddr = 0; epp->ep_tsize = execp->a_text; epp->ep_daddr = epp->ep_taddr + execp->a_text; epp->ep_dsize = execp->a_data + execp->a_bss; epp->ep_entry = execp->a_entry; /* set up command for text and data segments */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp, sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* set up command for bss segment */ baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE); bsize = epp->ep_daddr + epp->ep_dsize - baddr; if (bsize > 0) NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr, NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* * Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize); * obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are * computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize' * respectively to page boundaries. * Compensate `ep_dsize' for the amount of data covered by the last * text page. */ dsize = epp->ep_dsize + execp->a_text - roundup(execp->a_text, PAGE_SIZE); epp->ep_dsize = (dsize > 0) ? 
dsize : 0; return (*epp->ep_esch->es_setup_stack)(l, epp); } static int netbsd32_exec_aout_nomid(struct lwp *l, struct exec_package *epp) { int error; u_long midmag, magic; u_short mid; struct exec *execp = epp->ep_hdr; /* check on validity of epp->ep_hdr performed by exec_out_makecmds */ midmag = ntohl(execp->a_midmag); mid = (midmag >> 16) & 0xffff; magic = midmag & 0xffff; if (magic == 0) { magic = (execp->a_midmag & 0xffff); mid = MID_ZERO; } midmag = mid << 16 | magic; switch (midmag) { case (MID_ZERO << 16) | ZMAGIC: /* * 386BSD's ZMAGIC format: */ return netbsd32_exec_aout_prep_oldzmagic(l, epp); break; case (MID_ZERO << 16) | QMAGIC: /* * BSDI's QMAGIC format: * same as new ZMAGIC format, but with different magic number */ return netbsd32_exec_aout_prep_zmagic(l, epp); break; case (MID_ZERO << 16) | NMAGIC: /* * BSDI's NMAGIC format: * same as NMAGIC format, but with different magic number * and with text starting at 0. */ return netbsd32_exec_aout_prep_oldnmagic(l, epp); case (MID_ZERO << 16) | OMAGIC: /* * BSDI's OMAGIC format: * same as OMAGIC format, but with different magic number * and with text starting at 0. */ return netbsd32_exec_aout_prep_oldomagic(l, epp); default: return ENOEXEC; } return error; } #endif
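/*
 * Illustrative sketch (not part of the sources above): how
 * exec_netbsd32_makecmds() splits the big-endian a_midmag word into a
 * machine ID and a magic number before dispatching on ZMAGIC/NMAGIC/OMAGIC.
 * The DEMO_* magic constants are the traditional a.out values; the machine
 * ID and the rest of the program are a hypothetical stand-alone example.
 */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define DEMO_ZMAGIC	0413	/* demand-paged */
#define DEMO_NMAGIC	0410	/* pure, shared text */
#define DEMO_OMAGIC	0407	/* old impure format */

int main(void)
{
	/* a_midmag as it would appear in the on-disk (big-endian) header */
	uint32_t a_midmag = htonl((137u << 16) | DEMO_ZMAGIC);
	uint32_t midmag = ntohl(a_midmag);

	/* mid is limited to 10 bits, magic to 16, as in the kernel code */
	uint16_t mid = (midmag >> 16) & 0x3ff;
	uint16_t magic = midmag & 0xffff;

	printf("mid=%u magic=%#o ", mid, magic);
	switch (magic) {
	case DEMO_ZMAGIC:
		printf("(ZMAGIC: demand-paged text)\n");
		break;
	case DEMO_NMAGIC:
		printf("(NMAGIC: shared text)\n");
		break;
	case DEMO_OMAGIC:
		printf("(OMAGIC: contiguous text+data)\n");
		break;
	default:
		printf("(unknown format)\n");
		break;
	}
	return 0;
}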
/* $NetBSD: uvm_device.c,v 1.80 2022/07/07 13:27:02 riastradh Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: Id: uvm_device.c,v 1.1.2.9 1998/02/06 05:11:47 chs Exp */ /* * uvm_device.c: the device pager.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_device.c,v 1.80 2022/07/07 13:27:02 riastradh Exp $"); #include "opt_uvmhist.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/conf.h> #include <sys/proc.h> #include <sys/kmem.h> #include <uvm/uvm.h> #include <uvm/uvm_device.h> #include <uvm/uvm_pmap.h> /* * private global data structure * * we keep a list of active device objects in the system. */ LIST_HEAD(udv_list_struct, uvm_device); static struct udv_list_struct udv_list; static kmutex_t udv_lock __cacheline_aligned; /* * functions */ static void udv_init(void); static void udv_reference(struct uvm_object *); static void udv_detach(struct uvm_object *); static int udv_fault(struct uvm_faultinfo *, vaddr_t, struct vm_page **, int, int, vm_prot_t, int); /* * master pager structure */ const struct uvm_pagerops uvm_deviceops = { .pgo_init = udv_init, .pgo_reference = udv_reference, .pgo_detach = udv_detach, .pgo_fault = udv_fault, }; /* * the ops! */ /* * udv_init * * init pager private data structures. */ static void udv_init(void) { LIST_INIT(&udv_list); mutex_init(&udv_lock, MUTEX_DEFAULT, IPL_NONE); } /* * udv_attach * * get a VM object that is associated with a device. allocate a new * one if needed. * * => caller must _not_ already be holding the lock on the uvm_object. * => in fact, nothing should be locked so that we can sleep here. */ struct uvm_object * udv_attach(dev_t device, vm_prot_t accessprot, voff_t off, /* used only for access check */ vsize_t size /* used only for access check */) { struct uvm_device *udv, *lcv; const struct cdevsw *cdev; dev_mmap_t *mapfn; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, "(device=%#jx)", device,0,0,0); KASSERT(size > 0); /* * before we do anything, ensure this device supports mmap */ cdev = cdevsw_lookup(device); if (cdev == NULL) { return NULL; } mapfn = cdev->d_mmap; if (mapfn == NULL || mapfn == nommap) { return NULL; } /* * Negative offsets on the object are not allowed, unless the * device has affirmatively set D_NEGOFFSAFE. */ if ((cdev->d_flag & D_NEGOFFSAFE) == 0 && off != UVM_UNKNOWN_OFFSET) { if (off < 0) return NULL; #if SIZE_MAX > UINT32_MAX /* XXX -Wtype-limits */ if (size > __type_max(voff_t)) return NULL; #endif if (off > __type_max(voff_t) - size) return NULL; } /* * Check that the specified range of the device allows the * desired protection. * * XXX assumes VM_PROT_* == PROT_* * XXX clobbers off and size, but nothing else here needs them. */ do { KASSERTMSG((off % PAGE_SIZE) == 0, "off=%jd", (intmax_t)off); KASSERTMSG(size >= PAGE_SIZE, "size=%"PRIuVSIZE, size); if (cdev_mmap(device, off, accessprot) == -1) return NULL; KASSERT(off <= __type_max(voff_t) - PAGE_SIZE || (cdev->d_flag & D_NEGOFFSAFE) != 0); if (__predict_false(off > __type_max(voff_t) - PAGE_SIZE)) { /* * off += PAGE_SIZE, with two's-complement * wraparound, or * * off += PAGE_SIZE - 2*(VOFF_MAX + 1). */ CTASSERT(MIN_PAGE_SIZE >= 2); off -= __type_max(voff_t); off += PAGE_SIZE - 2; off -= __type_max(voff_t); } else { off += PAGE_SIZE; } size -= PAGE_SIZE; } while (size != 0); /* * keep looping until we get it */ for (;;) { /* * first, attempt to find it on the main list */ mutex_enter(&udv_lock); LIST_FOREACH(lcv, &udv_list, u_list) { if (device == lcv->u_device) break; } /* * got it on main list. put a hold on it and unlock udv_lock. */ if (lcv) { /* * if someone else has a hold on it, sleep and start * over again. 
*/ if (lcv->u_flags & UVM_DEVICE_HOLD) { lcv->u_flags |= UVM_DEVICE_WANTED; UVM_UNLOCK_AND_WAIT(lcv, &udv_lock, false, "udv_attach",0); continue; } /* we are now holding it */ lcv->u_flags |= UVM_DEVICE_HOLD; mutex_exit(&udv_lock); /* * bump reference count, unhold, return. */ rw_enter(lcv->u_obj.vmobjlock, RW_WRITER); lcv->u_obj.uo_refs++; rw_exit(lcv->u_obj.vmobjlock); mutex_enter(&udv_lock); if (lcv->u_flags & UVM_DEVICE_WANTED) wakeup(lcv); lcv->u_flags &= ~(UVM_DEVICE_WANTED|UVM_DEVICE_HOLD); mutex_exit(&udv_lock); return &lcv->u_obj; } /* * Did not find it on main list. Need to allocate a new one. */ mutex_exit(&udv_lock); /* Note: both calls may allocate memory and sleep. */ udv = kmem_alloc(sizeof(*udv), KM_SLEEP); uvm_obj_init(&udv->u_obj, &uvm_deviceops, true, 1); mutex_enter(&udv_lock); /* * now we have to double check to make sure no one added it * to the list while we were sleeping... */ LIST_FOREACH(lcv, &udv_list, u_list) { if (device == lcv->u_device) break; } /* * did we lose a race to someone else? * free our memory and retry. */ if (lcv) { mutex_exit(&udv_lock); uvm_obj_destroy(&udv->u_obj, true); kmem_free(udv, sizeof(*udv)); continue; } /* * we have it! init the data structures, add to list * and return. */ udv->u_flags = 0; udv->u_device = device; LIST_INSERT_HEAD(&udv_list, udv, u_list); mutex_exit(&udv_lock); return &udv->u_obj; } /*NOTREACHED*/ } /* * udv_reference * * add a reference to a VM object. Note that the reference count must * already be one (the passed in reference) so there is no chance of the * udv being released or locked out here. * * => caller must call with object unlocked. */ static void udv_reference(struct uvm_object *uobj) { UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); rw_enter(uobj->vmobjlock, RW_WRITER); uobj->uo_refs++; UVMHIST_LOG(maphist, "<- done (uobj=%#jx, ref = %jd)", (uintptr_t)uobj, uobj->uo_refs,0,0); rw_exit(uobj->vmobjlock); } /* * udv_detach * * remove a reference to a VM object. * * => caller must call with object unlocked and map locked. */ static void udv_detach(struct uvm_object *uobj) { struct uvm_device *udv = (struct uvm_device *)uobj; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* * loop until done */ again: rw_enter(uobj->vmobjlock, RW_WRITER); if (uobj->uo_refs > 1) { uobj->uo_refs--; rw_exit(uobj->vmobjlock); UVMHIST_LOG(maphist," <- done, uobj=%#jx, ref=%jd", (uintptr_t)uobj,uobj->uo_refs,0,0); return; } /* * is it being held? if so, wait until others are done. */ mutex_enter(&udv_lock); if (udv->u_flags & UVM_DEVICE_HOLD) { udv->u_flags |= UVM_DEVICE_WANTED; rw_exit(uobj->vmobjlock); UVM_UNLOCK_AND_WAIT(udv, &udv_lock, false, "udv_detach",0); goto again; } /* * got it! nuke it now. */ LIST_REMOVE(udv, u_list); if (udv->u_flags & UVM_DEVICE_WANTED) wakeup(udv); mutex_exit(&udv_lock); rw_exit(uobj->vmobjlock); uvm_obj_destroy(uobj, true); kmem_free(udv, sizeof(*udv)); UVMHIST_LOG(maphist," <- done, freed uobj=%#jx", (uintptr_t)uobj, 0, 0, 0); } /* * udv_fault: non-standard fault routine for device "pages" * * => rather than having a "get" function, we have a fault routine * since we don't return vm_pages we need full control over the * pmap_enter map in * => all the usual fault data structured are locked by the caller * (i.e. maps(read), amap (if any), uobj) * => on return, we unlock all fault data structures * => flags: PGO_ALLPAGES: get all of the pages * PGO_LOCKED: fault data structures are locked * XXX: currently PGO_LOCKED is always required ... 
consider removing * it as a flag * => NOTE: vaddr is the VA of pps[0] in ufi->entry, _NOT_ pps[centeridx] */ static int udv_fault(struct uvm_faultinfo *ufi, vaddr_t vaddr, struct vm_page **pps, int npages, int centeridx, vm_prot_t access_type, int flags) { struct vm_map_entry *entry = ufi->entry; struct uvm_object *uobj = entry->object.uvm_obj; struct uvm_device *udv = (struct uvm_device *)uobj; vaddr_t curr_va; off_t curr_offset; paddr_t paddr, mdpgno; u_int mmapflags; int lcv, retval; dev_t device; vm_prot_t mapprot; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); UVMHIST_LOG(maphist," flags=%#jx", flags,0,0,0); /* * we do not allow device mappings to be mapped copy-on-write * so we kill any attempt to do so here. */ if (UVM_ET_ISCOPYONWRITE(entry)) { UVMHIST_LOG(maphist, "<- failed -- COW entry (etype=%#jx)", entry->etype, 0,0,0); uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj); return EIO; } /* * get device map function. */ device = udv->u_device; if (cdevsw_lookup(device) == NULL) { /* XXX This should not happen */ uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj); return EIO; } /* * now we must determine the offset in udv to use and the VA to * use for pmap_enter. note that we always use orig_map's pmap * for pmap_enter (even if we have a submap). since virtual * addresses in a submap must match the main map, this is ok. */ /* udv offset = (offset from start of entry) + entry's offset */ curr_offset = entry->offset + (vaddr - entry->start); /* pmap va = vaddr (virtual address of pps[0]) */ curr_va = vaddr; /* * loop over the page range entering in as needed */ retval = 0; for (lcv = 0 ; lcv < npages ; lcv++, curr_offset += PAGE_SIZE, curr_va += PAGE_SIZE) { if ((flags & PGO_ALLPAGES) == 0 && lcv != centeridx) continue; if (pps[lcv] == PGO_DONTCARE) continue; mdpgno = cdev_mmap(device, curr_offset, access_type); if (mdpgno == -1) { retval = EIO; break; } paddr = pmap_phys_address(mdpgno); mmapflags = pmap_mmap_flags(mdpgno); mapprot = ufi->entry->protection; UVMHIST_LOG(maphist, " MAPPING: device: pm=%#jx, va=%#jx, pa=%#jx, at=%jd", (uintptr_t)ufi->orig_map->pmap, curr_va, paddr, mapprot); if (pmap_enter(ufi->orig_map->pmap, curr_va, paddr, mapprot, PMAP_CANFAIL | mapprot | mmapflags) != 0) { /* * pmap_enter() didn't have the resource to * enter this mapping. Unlock everything, * wait for the pagedaemon to free up some * pages, and then tell uvm_fault() to start * the fault again. * * XXX Needs some rethinking for the PGO_ALLPAGES * XXX case. */ pmap_update(ufi->orig_map->pmap); /* sync what we have so far */ uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj); return ENOMEM; } } pmap_update(ufi->orig_map->pmap); uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj); return retval; }
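/*
 * [Editorial sketch -- not part of uvm_device.c]
 *
 * For context: udv_attach() and udv_fault() above defer all policy to the
 * driver's d_mmap entry point via cdev_mmap(), which returns a
 * machine-dependent page cookie (later fed to pmap_phys_address()) or -1
 * to reject the page.  A hypothetical character driver's mmap routine
 * could look roughly like the sketch below; "mydev", MYDEV_REG_BASE and
 * MYDEV_REG_SIZE are invented names, and real drivers typically return a
 * cookie obtained from bus_space_mmap() rather than computing one by hand.
 */
#if 0	/* illustrative example only */
paddr_t
mydev_mmap(dev_t dev, off_t off, int prot)
{

	/* refuse writes and out-of-range offsets */
	if ((prot & VM_PROT_WRITE) != 0)
		return (paddr_t)-1;
	if (off < 0 || off >= MYDEV_REG_SIZE)
		return (paddr_t)-1;

	/* hand back the (machine-dependent) cookie for the page backing 'off' */
	return atop(MYDEV_REG_BASE + off);
}
#endif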
/*	$NetBSD: if43_20.c,v 1.5 2019/12/12 02:15:42 pgoyette Exp $	*/

/*-
 * Copyright (c) 2018 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Paul Goyette
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if43_20.c,v 1.5 2019/12/12 02:15:42 pgoyette Exp $");

#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/mbuf.h>		/* for MLEN */
#include <sys/protosw.h>
#include <sys/compat_stub.h>

#include <net/if.h>
#include <net/bpf.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <net/if_gre.h>
#include <net/if_tap.h>
#include <net80211/ieee80211_ioctl.h>

#include <compat/common/compat_mod.h>

static int
if43_cvtcmd_20(u_long ncmd)
{

	switch (ncmd) {
	case OSIOCG80211STATS:
	case OSIOCG80211ZSTATS:
		return 0;
	default:
		return EINVAL;
	}
}

void
if43_20_init(void)
{

	MODULE_HOOK_SET(if43_cvtcmd_20_hook, if43_cvtcmd_20);
}

void
if43_20_fini(void)
{

	MODULE_HOOK_UNSET(if43_cvtcmd_20_hook);
}
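/*
 * [Editorial sketch -- not part of if43_20.c]
 *
 * The hook installed by if43_20_init() is how the always-built if_43
 * compat code asks "is this one of the 802.11 ioctls handled by
 * compat_20?" without linking against this module.  The consumer side is
 * assumed to look roughly like the fragment below; the exact argument
 * order of MODULE_HOOK_CALL() is quoted from memory and the surrounding
 * variable names are invented.
 */
#if 0	/* illustrative example only */
	int rv;

	/*
	 * rv becomes 0 if the loaded compat_20 module claims 'cmd',
	 * otherwise the supplied default (EINVAL) is used.
	 */
	MODULE_HOOK_CALL(if43_cvtcmd_20_hook, (cmd), EINVAL, rv);
	if (rv == 0)
		/* treat 'cmd' as a known pre-2.0 802.11 statistics ioctl */;
#endif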
/*	$NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.14 (Berkeley) 6/15/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $"); #include "veriexec.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/buf.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/vnode_impl.h> #include <sys/ioctl.h> #include <sys/tty.h> #include <sys/poll.h> #include <sys/kauth.h> #include <sys/syslog.h> #include <sys/fstrans.h> #include <sys/atomic.h> #include <sys/filedesc.h> #include <sys/wapbl.h> #include <sys/mman.h> #include <miscfs/specfs/specdev.h> #include <miscfs/fifofs/fifo.h> #include <uvm/uvm_extern.h> #include <uvm/uvm_readahead.h> #include <uvm/uvm_device.h> #ifdef UNION #include <fs/union/union.h> #endif #ifndef COMPAT_ZERODEV #define COMPAT_ZERODEV(dev) (0) #endif int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *); #include <sys/verified_exec.h> static int vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, int flags); static int vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, int flags); static int vn_closefile(file_t *fp); static int vn_poll(file_t *fp, int events); static int vn_fcntl(file_t *fp, u_int com, void *data); static int vn_statfile(file_t *fp, struct stat *sb); static int vn_ioctl(file_t *fp, u_long com, void *data); static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *, struct uvm_object **, int *); static int vn_seek(struct file *, off_t, int, off_t *, int); static int vn_advlock(struct file *, void *, int, struct flock *, int); static int vn_fpathconf(struct file *, int, register_t *); static int vn_posix_fadvise(struct file *, off_t, off_t, int); static int vn_truncate(file_t *, off_t); const struct fileops vnops = { .fo_name = "vn", .fo_read = vn_read, .fo_write = vn_write, .fo_ioctl = vn_ioctl, .fo_fcntl = vn_fcntl, .fo_poll = vn_poll, .fo_stat = vn_statfile, .fo_close = vn_closefile, .fo_kqfilter = vn_kqfilter, .fo_restart = fnullop_restart, .fo_mmap = vn_mmap, .fo_seek = vn_seek, .fo_advlock = vn_advlock, .fo_fpathconf = vn_fpathconf, .fo_posix_fadvise = vn_posix_fadvise, .fo_truncate = vn_truncate, }; /* * Common code for vnode open operations. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * at_dvp is the directory for openat(), if any. * pb is the path. * nmode is additional namei flags, restricted to TRYEMULROOT and NOCHROOT. * fmode is the open flags, converted from O_* to F* * cmode is the creation file permissions. * * XXX shouldn't cmode be mode_t? * * On success produces either a locked vnode in *ret_vp, or NULL in * *ret_vp and a file descriptor number in *ret_fd. * * The caller may pass NULL for ret_fd (and ret_domove), in which case * EOPNOTSUPP will be produced in the cases that would otherwise return * a file descriptor. * * Note that callers that want no-follow behavior should pass * O_NOFOLLOW in fmode. 
Neither FOLLOW nor NOFOLLOW in nmode is * honored. */ int vn_open(struct vnode *at_dvp, struct pathbuf *pb, int nmode, int fmode, int cmode, struct vnode **ret_vp, bool *ret_domove, int *ret_fd) { struct nameidata nd; struct vnode *vp = NULL; struct lwp *l = curlwp; kauth_cred_t cred = l->l_cred; struct vattr va; int error; const char *pathstring; KASSERT((nmode & (TRYEMULROOT | NOCHROOT)) == nmode); KASSERT(ret_vp != NULL); KASSERT((ret_domove == NULL) == (ret_fd == NULL)); if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY)) return EINVAL; NDINIT(&nd, LOOKUP, nmode, pb); if (at_dvp != NULL) NDAT(&nd, at_dvp); nd.ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT; if (fmode & O_CREAT) { nd.ni_cnd.cn_nameiop = CREATE; nd.ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF; if ((fmode & O_EXCL) == 0 && ((fmode & O_NOFOLLOW) == 0)) nd.ni_cnd.cn_flags |= FOLLOW; if ((fmode & O_EXCL) == 0) nd.ni_cnd.cn_flags |= NONEXCLHACK; } else { nd.ni_cnd.cn_nameiop = LOOKUP; nd.ni_cnd.cn_flags |= LOCKLEAF; if ((fmode & O_NOFOLLOW) == 0) nd.ni_cnd.cn_flags |= FOLLOW; } pathstring = pathbuf_stringcopy_get(nd.ni_pathbuf); if (pathstring == NULL) { return ENOMEM; } /* * When this "interface" was exposed to do_open() it used * to initialize l_dupfd to -newfd-1 (thus passing in the * new file handle number to use)... but nothing in the * kernel uses that value. So just send 0. */ l->l_dupfd = 0; error = namei(&nd); if (error) goto out; vp = nd.ni_vp; #if NVERIEXEC > 0 error = veriexec_openchk(l, nd.ni_vp, pathstring, fmode); if (error) { /* We have to release the locks ourselves */ /* * 20210604 dholland passing NONEXCLHACK means we can * get ni_dvp == NULL back if ni_vp exists, and we should * treat that like the non-O_CREAT case. */ if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) { if (vp == NULL) { vput(nd.ni_dvp); } else { VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); nd.ni_dvp = NULL; vput(vp); vp = NULL; } } else { vput(vp); vp = NULL; } goto out; } #endif /* NVERIEXEC > 0 */ /* * 20210604 dholland ditto */ if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) { if (nd.ni_vp == NULL) { vattr_null(&va); va.va_type = VREG; va.va_mode = cmode; if (fmode & O_EXCL) va.va_vaflags |= VA_EXCLUSIVE; error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &va); if (error) { vput(nd.ni_dvp); goto out; } fmode &= ~O_TRUNC; vp = nd.ni_vp; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vput(nd.ni_dvp); } else { VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); nd.ni_dvp = NULL; vp = nd.ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } fmode &= ~O_CREAT; } } else if ((fmode & O_CREAT) != 0) { /* * 20210606 dholland passing NONEXCLHACK means this * case exists; it is the same as the following one * but also needs to do things in the second (exists) * half of the following block. (Besides handle * ni_dvp, anyway.) 
*/ vp = nd.ni_vp; KASSERT((fmode & O_EXCL) == 0); fmode &= ~O_CREAT; } else { vp = nd.ni_vp; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } if (nd.ni_vp->v_type == VLNK) { error = EFTYPE; goto bad; } if ((fmode & O_CREAT) == 0) { error = vn_openchk(vp, cred, fmode); if (error != 0) goto bad; } if (fmode & O_TRUNC) { vattr_null(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, cred); if (error != 0) goto bad; } if ((error = VOP_OPEN(vp, fmode, cred)) != 0) goto bad; if (fmode & FWRITE) { mutex_enter(vp->v_interlock); vp->v_writecount++; mutex_exit(vp->v_interlock); } bad: if (error) { vput(vp); vp = NULL; } out: pathbuf_stringcopy_put(nd.ni_pathbuf, pathstring); switch (error) { case EDUPFD: case EMOVEFD: /* if the caller isn't prepared to handle fds, fail for them */ if (ret_fd == NULL) { error = EOPNOTSUPP; break; } *ret_vp = NULL; *ret_domove = error == EMOVEFD; *ret_fd = l->l_dupfd; error = 0; break; case 0: KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); *ret_vp = vp; break; } l->l_dupfd = 0; return error; } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. */ int vn_writechk(struct vnode *vp) { /* * If the vnode is in use as a process's text, * we can't allow writing. */ if (vp->v_iflag & VI_TEXT) return ETXTBSY; return 0; } int vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags) { int permbits = 0; int error; if (vp->v_type == VNON || vp->v_type == VBAD) return ENXIO; if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR) return ENOTDIR; if ((fflags & O_REGULAR) != 0 && vp->v_type != VREG) return EFTYPE; if ((fflags & FREAD) != 0) { permbits = VREAD; } if ((fflags & FEXEC) != 0) { permbits |= VEXEC; } if ((fflags & (FWRITE | O_TRUNC)) != 0) { permbits |= VWRITE; if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error != 0) goto bad; } error = VOP_ACCESS(vp, permbits, cred); bad: return error; } /* * Mark a vnode as having executable mappings. */ void vn_markexec(struct vnode *vp) { if ((vp->v_iflag & VI_EXECMAP) != 0) { /* Safe unlocked, as long as caller holds a reference. */ return; } rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); mutex_enter(vp->v_interlock); if ((vp->v_iflag & VI_EXECMAP) == 0) { cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages); vp->v_iflag |= VI_EXECMAP; } mutex_exit(vp->v_interlock); rw_exit(vp->v_uobj.vmobjlock); } /* * Mark a vnode as being the text of a process. * Fail if the vnode is currently writable. */ int vn_marktext(struct vnode *vp) { if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) { /* Safe unlocked, as long as caller holds a reference. */ return 0; } rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); mutex_enter(vp->v_interlock); if (vp->v_writecount != 0) { KASSERT((vp->v_iflag & VI_TEXT) == 0); mutex_exit(vp->v_interlock); rw_exit(vp->v_uobj.vmobjlock); return ETXTBSY; } if ((vp->v_iflag & VI_EXECMAP) == 0) { cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages); } vp->v_iflag |= (VI_TEXT | VI_EXECMAP); mutex_exit(vp->v_interlock); rw_exit(vp->v_uobj.vmobjlock); return 0; } /* * Vnode close call * * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked node. 
*/ int vn_close(struct vnode *vp, int flags, kauth_cred_t cred) { int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (flags & FWRITE) { mutex_enter(vp->v_interlock); KASSERT(vp->v_writecount > 0); vp->v_writecount--; mutex_exit(vp->v_interlock); } error = VOP_CLOSE(vp, flags, cred); vput(vp); return error; } static int enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag) { struct lwp *l = curlwp; off_t testoff; if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG) return 0; KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); if (ioflag & IO_APPEND) testoff = vp->v_size; else testoff = uio->uio_offset; if (testoff + uio->uio_resid > l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { mutex_enter(&proc_lock); psignal(l->l_proc, SIGXFSZ); mutex_exit(&proc_lock); return EFBIG; } return 0; } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid, struct lwp *l) { struct uio auio; struct iovec aiov; int error; if ((ioflg & IO_NODELOCKED) == 0) { if (rw == UIO_READ) { vn_lock(vp, LK_SHARED | LK_RETRY); } else /* UIO_WRITE */ { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_rw = rw; if (segflg == UIO_SYSSPACE) { UIO_SETUP_SYSSPACE(&auio); } else { auio.uio_vmspace = l->l_proc->p_vmspace; } if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0) goto out; if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else { error = VOP_WRITE(vp, &auio, ioflg, cred); } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; out: if ((ioflg & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp); } return error; } int vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done, struct lwp *l, off_t **cookies, int *ncookies) { struct vnode *vp = fp->f_vnode; struct iovec aiov; struct uio auio; int error, eofflag; /* Limit the size on any kernel buffers used by VOP_READDIR */ count = uimin(MAXBSIZE, count); unionread: if (vp->v_type != VDIR) return EINVAL; aiov.iov_base = bf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; if (segflg == UIO_SYSSPACE) { UIO_SETUP_SYSSPACE(&auio); } else { KASSERT(l == curlwp); auio.uio_vmspace = l->l_proc->p_vmspace; } auio.uio_resid = count; vn_lock(vp, LK_SHARED | LK_RETRY); mutex_enter(&fp->f_lock); auio.uio_offset = fp->f_offset; mutex_exit(&fp->f_lock); error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies, ncookies); mutex_enter(&fp->f_lock); fp->f_offset = auio.uio_offset; mutex_exit(&fp->f_lock); VOP_UNLOCK(vp); if (error) return error; if (count == auio.uio_resid && vn_union_readdir_hook) { struct vnode *ovp = vp; error = (*vn_union_readdir_hook)(&vp, fp, l); if (error) return error; if (vp != ovp) goto unionread; } if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; vref(vp); mutex_enter(&fp->f_lock); fp->f_vnode = vp; fp->f_offset = 0; mutex_exit(&fp->f_lock); vrele(tvp); goto unionread; } *done = count - auio.uio_resid; return error; } /* * File table vnode read routine. 
 */
static int
vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice);
	fflag = fp->f_flag;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC))
		ioflag |= IO_SYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	if (offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) != 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_enter(&fp->f_lock);
	uio->uio_offset = *offset;
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_exit(&fp->f_lock);
	count = uio->uio_resid;
	error = VOP_READ(vp, uio, ioflag, cred);
	if (flags & FOF_UPDATE_OFFSET)
		*offset += count - uio->uio_resid;
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode write routine.
 */
static int
vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
	fflag = fp->f_flag;
	if (vp->v_type == VREG && (fflag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fflag & FFSYNC ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	else if (fflag & FDSYNC)
		ioflag |= IO_DSYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;

	if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0)
		goto out;

	error = VOP_WRITE(vp, uio, ioflag, cred);

	if (flags & FOF_UPDATE_OFFSET) {
		if (ioflag & IO_APPEND) {
			/*
			 * SUSv3 describes behaviour for count = 0 as following:
			 * "Before any action ... is taken, and if nbyte is zero
			 * and the file is a regular file, the write() function
			 * ... in the absence of errors ... shall return zero
			 * and have no other results."
			 */
			if (count)
				*offset = uio->uio_offset;
		} else
			*offset += count - uio->uio_resid;
	}

 out:
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode stat routine.
*/ static int vn_statfile(file_t *fp, struct stat *sb) { struct vnode *vp = fp->f_vnode; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = vn_stat(vp, sb); VOP_UNLOCK(vp); return error; } int vn_stat(struct vnode *vp, struct stat *sb) { struct vattr va; int error; mode_t mode; memset(&va, 0, sizeof(va)); error = VOP_GETATTR(vp, &va, kauth_cred_get()); if (error) return error; /* * Copy from vattr table */ memset(sb, 0, sizeof(*sb)); sb->st_dev = va.va_fsid; sb->st_ino = va.va_fileid; mode = va.va_mode; switch (vp->v_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: return EBADF; } sb->st_mode = mode; sb->st_nlink = va.va_nlink; sb->st_uid = va.va_uid; sb->st_gid = va.va_gid; sb->st_rdev = va.va_rdev; sb->st_size = va.va_size; sb->st_atimespec = va.va_atime; sb->st_mtimespec = va.va_mtime; sb->st_ctimespec = va.va_ctime; sb->st_birthtimespec = va.va_birthtime; sb->st_blksize = va.va_blocksize; sb->st_flags = va.va_flags; sb->st_gen = 0; sb->st_blocks = va.va_bytes / S_BLKSIZE; return 0; } /* * File table vnode fcntl routine. */ static int vn_fcntl(file_t *fp, u_int com, void *data) { struct vnode *vp = fp->f_vnode; int error; error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get()); return error; } /* * File table vnode ioctl routine. */ static int vn_ioctl(file_t *fp, u_long com, void *data) { struct vnode *vp = fp->f_vnode, *ovp; struct vattr vattr; int error; switch (vp->v_type) { case VREG: case VDIR: if (com == FIONREAD) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, kauth_cred_get()); if (error == 0) { if (vp->v_type == VDIR) mutex_enter(&fp->f_lock); *(int *)data = vattr.va_size - fp->f_offset; if (vp->v_type == VDIR) mutex_exit(&fp->f_lock); } VOP_UNLOCK(vp); if (error) return error; return 0; } if ((com == FIONWRITE) || (com == FIONSPACE)) { /* * Files don't have send queues, so there never * are any bytes in them, nor is there any * open space in them. */ *(int *)data = 0; return 0; } if (com == FIOGETBMAP) { daddr_t *block; if (*(daddr_t *)data < 0) return EINVAL; block = (daddr_t *)data; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_BMAP(vp, *block, NULL, block, NULL); VOP_UNLOCK(vp); return error; } if (com == OFIOGETBMAP) { daddr_t ibn, obn; if (*(int32_t *)data < 0) return EINVAL; ibn = (daddr_t)*(int32_t *)data; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_BMAP(vp, ibn, NULL, &obn, NULL); VOP_UNLOCK(vp); *(int32_t *)data = (int32_t)obn; return error; } if (com == FIONBIO || com == FIOASYNC) /* XXX */ return 0; /* XXX */ /* FALLTHROUGH */ case VFIFO: case VCHR: case VBLK: error = VOP_IOCTL(vp, com, data, fp->f_flag, kauth_cred_get()); if (error == 0 && com == TIOCSCTTY) { vref(vp); mutex_enter(&proc_lock); ovp = curproc->p_session->s_ttyvp; curproc->p_session->s_ttyvp = vp; mutex_exit(&proc_lock); if (ovp != NULL) vrele(ovp); } return error; default: return EPASSTHROUGH; } } /* * File table vnode poll routine. */ static int vn_poll(file_t *fp, int events) { return VOP_POLL(fp->f_vnode, events); } /* * File table vnode kqfilter routine. 
*/ int vn_kqfilter(file_t *fp, struct knote *kn) { return VOP_KQFILTER(fp->f_vnode, kn); } static int vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp, int *advicep, struct uvm_object **uobjp, int *maxprotp) { struct uvm_object *uobj; struct vnode *vp; struct vattr va; struct lwp *l; vm_prot_t maxprot; off_t off; int error, flags; bool needwritemap; l = curlwp; off = *offp; flags = *flagsp; maxprot = VM_PROT_EXECUTE; KASSERT(size > 0); vp = fp->f_vnode; if (vp->v_type != VREG && vp->v_type != VCHR && vp->v_type != VBLK) { /* only REG/CHR/BLK support mmap */ return ENODEV; } if (vp->v_type != VCHR && off < 0) { return EINVAL; } #if SIZE_MAX > UINT32_MAX /* XXX -Wtype-limits */ if (vp->v_type != VCHR && size > __type_max(off_t)) { return EOVERFLOW; } #endif if (vp->v_type != VCHR && off > __type_max(off_t) - size) { /* no offset wrapping */ return EOVERFLOW; } /* special case: catch SunOS style /dev/zero */ if (vp->v_type == VCHR && (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) { *uobjp = NULL; *maxprotp = VM_PROT_ALL; return 0; } /* * Old programs may not select a specific sharing type, so * default to an appropriate one. * * XXX: how does MAP_ANON fit in the picture? */ if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) { #if defined(DEBUG) struct proc *p = l->l_proc; printf("WARNING: defaulted mmap() share type to " "%s (pid %d command %s)\n", vp->v_type == VCHR ? "MAP_SHARED" : "MAP_PRIVATE", p->p_pid, p->p_comm); #endif if (vp->v_type == VCHR) flags |= MAP_SHARED; /* for a device */ else flags |= MAP_PRIVATE; /* for a file */ } /* * MAP_PRIVATE device mappings don't make sense (and aren't * supported anyway). However, some programs rely on this, * so just change it to MAP_SHARED. */ if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) { flags = (flags & ~MAP_PRIVATE) | MAP_SHARED; } /* * now check protection */ /* check read access */ if (fp->f_flag & FREAD) maxprot |= VM_PROT_READ; else if (prot & PROT_READ) { return EACCES; } /* check write access, shared case first */ if (flags & MAP_SHARED) { /* * if the file is writable, only add PROT_WRITE to * maxprot if the file is not immutable, append-only. * otherwise, if we have asked for PROT_WRITE, return * EPERM. */ if (fp->f_flag & FWRITE) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, l->l_cred); VOP_UNLOCK(vp); if (error) { return error; } if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) maxprot |= VM_PROT_WRITE; else if (prot & PROT_WRITE) { return EPERM; } } else if (prot & PROT_WRITE) { return EACCES; } } else { /* MAP_PRIVATE mappings can always write to */ maxprot |= VM_PROT_WRITE; } /* * Don't allow mmap for EXEC if the file system * is mounted NOEXEC. */ if ((prot & PROT_EXEC) != 0 && (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) { return EACCES; } if (vp->v_type != VCHR) { error = VOP_MMAP(vp, prot, curlwp->l_cred); if (error) { return error; } vref(vp); uobj = &vp->v_uobj; /* * If the vnode is being mapped with PROT_EXEC, * then mark it as text. */ if (prot & PROT_EXEC) { vn_markexec(vp); } } else { int i = maxprot; /* * XXX Some devices don't like to be mapped with * XXX PROT_EXEC or PROT_WRITE, but we don't really * XXX have a better way of handling this, right now */ do { uobj = udv_attach(vp->v_rdev, (flags & MAP_SHARED) ? i : (i & ~VM_PROT_WRITE), off, size); i--; } while ((uobj == NULL) && (i > 0)); if (uobj == NULL) { return EINVAL; } *advicep = UVM_ADV_RANDOM; } /* * Set vnode flags to indicate the new kinds of mapping. 
* We take the vnode lock in exclusive mode here to serialize * with direct I/O. * * Safe to check for these flag values without a lock, as * long as a reference to the vnode is held. */ needwritemap = (vp->v_iflag & VI_WRMAP) == 0 && (flags & MAP_SHARED) != 0 && (maxprot & VM_PROT_WRITE) != 0; if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_MAPPED; if (needwritemap) { rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); mutex_enter(vp->v_interlock); vp->v_iflag |= VI_WRMAP; mutex_exit(vp->v_interlock); rw_exit(vp->v_uobj.vmobjlock); } VOP_UNLOCK(vp); } #if NVERIEXEC > 0 /* * Check if the file can be executed indirectly. * * XXX: This gives false warnings about "Incorrect access type" * XXX: if the mapping is not executable. Harmless, but will be * XXX: fixed as part of other changes. */ if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT, NULL)) { /* * Don't allow executable mappings if we can't * indirectly execute the file. */ if (prot & VM_PROT_EXECUTE) { return EPERM; } /* * Strip the executable bit from 'maxprot' to make sure * it can't be made executable later. */ maxprot &= ~VM_PROT_EXECUTE; } #endif /* NVERIEXEC > 0 */ *uobjp = uobj; *maxprotp = maxprot; *flagsp = flags; return 0; } static int vn_seek(struct file *fp, off_t delta, int whence, off_t *newoffp, int flags) { const off_t OFF_MIN = __type_min(off_t); const off_t OFF_MAX = __type_max(off_t); kauth_cred_t cred = fp->f_cred; off_t oldoff, newoff; struct vnode *vp = fp->f_vnode; struct vattr vattr; int error; if (vp->v_type == VFIFO) return ESPIPE; if (flags & FOF_UPDATE_OFFSET) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); else vn_lock(vp, LK_SHARED | LK_RETRY); /* Compute the old and new offsets. */ if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0) mutex_enter(&fp->f_lock); oldoff = fp->f_offset; if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0) mutex_exit(&fp->f_lock); switch (whence) { case SEEK_CUR: if (delta > 0) { if (oldoff > 0 && delta > OFF_MAX - oldoff) { newoff = OFF_MAX; break; } } else { if (oldoff < 0 && delta < OFF_MIN - oldoff) { newoff = OFF_MIN; break; } } newoff = oldoff + delta; break; case SEEK_END: error = VOP_GETATTR(vp, &vattr, cred); if (error) goto out; if (vattr.va_size > OFF_MAX || delta > OFF_MAX - (off_t)vattr.va_size) { newoff = OFF_MAX; break; } newoff = delta + vattr.va_size; break; case SEEK_SET: newoff = delta; break; default: error = EINVAL; goto out; } /* Pass the proposed change to the file system to audit. */ error = VOP_SEEK(vp, oldoff, newoff, cred); if (error) goto out; /* Success! 
*/ if (newoffp) *newoffp = newoff; if (flags & FOF_UPDATE_OFFSET) fp->f_offset = newoff; error = 0; out: VOP_UNLOCK(vp); return error; } static int vn_advlock(struct file *fp, void *id, int op, struct flock *fl, int flags) { struct vnode *const vp = fp->f_vnode; if (fl->l_whence == SEEK_CUR) { vn_lock(vp, LK_SHARED | LK_RETRY); fl->l_start += fp->f_offset; VOP_UNLOCK(vp); } return VOP_ADVLOCK(vp, id, op, fl, flags); } static int vn_fpathconf(struct file *fp, int name, register_t *retval) { struct vnode *const vp = fp->f_vnode; int error; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_PATHCONF(vp, name, retval); VOP_UNLOCK(vp); return error; } static int vn_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice) { const off_t OFF_MAX = __type_max(off_t); struct vnode *vp = fp->f_vnode; off_t endoffset; int error; if (offset < 0) { return EINVAL; } if (len == 0) { endoffset = OFF_MAX; } else if (len > 0 && (OFF_MAX - offset) >= len) { endoffset = offset + len; } else { return EINVAL; } CTASSERT(POSIX_FADV_NORMAL == UVM_ADV_NORMAL); CTASSERT(POSIX_FADV_RANDOM == UVM_ADV_RANDOM); CTASSERT(POSIX_FADV_SEQUENTIAL == UVM_ADV_SEQUENTIAL); switch (advice) { case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: if (vp->v_type != VREG && vp->v_type != VBLK) return 0; break; } switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: case POSIX_FADV_SEQUENTIAL: /* * We ignore offset and size. Must lock the file to * do this, as f_advice is sub-word sized. */ mutex_enter(&fp->f_lock); fp->f_advice = (u_char)advice; mutex_exit(&fp->f_lock); error = 0; break; case POSIX_FADV_WILLNEED: error = uvm_readahead(&vp->v_uobj, offset, endoffset - offset); break; case POSIX_FADV_DONTNEED: /* * Align the region to page boundaries as VOP_PUTPAGES expects * by shrinking it. We shrink instead of expand because we * do not want to deactivate cache outside of the requested * region. It means that if the specified region is smaller * than PAGE_SIZE, we do nothing. */ if (offset <= trunc_page(OFF_MAX) && round_page(offset) < trunc_page(endoffset)) { rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); error = VOP_PUTPAGES(vp, round_page(offset), trunc_page(endoffset), PGO_DEACTIVATE | PGO_CLEANIT); } else { error = 0; } break; case POSIX_FADV_NOREUSE: /* Not implemented yet. */ error = 0; break; default: error = EINVAL; break; } return error; } static int vn_truncate(file_t *fp, off_t length) { struct vattr vattr; struct vnode *vp; int error = 0; if (length < 0) return EINVAL; if ((fp->f_flag & FWRITE) == 0) return EINVAL; vp = fp->f_vnode; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) error = EISDIR; else if ((error = vn_writechk(vp)) == 0) { vattr_null(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, fp->f_cred); } VOP_UNLOCK(vp); return error; } /* * Check that the vnode is still valid, and if so * acquire requested lock. */ int vn_lock(struct vnode *vp, int flags) { struct lwp *l; int error; KASSERT(vrefcnt(vp) > 0); KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY| LK_UPGRADE|LK_DOWNGRADE)) == 0); KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock)); #ifdef DIAGNOSTIC if (wapbl_vphaswapbl(vp)) WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp)); #endif /* Get a more useful report for lockstat. 
*/ l = curlwp; KASSERT(l->l_rwcallsite == 0); l->l_rwcallsite = (uintptr_t)__builtin_return_address(0); error = VOP_LOCK(vp, flags); l->l_rwcallsite = 0; switch (flags & (LK_RETRY | LK_NOWAIT)) { case 0: KASSERT(error == 0 || error == ENOENT); break; case LK_RETRY: KASSERT(error == 0); break; case LK_NOWAIT: KASSERT(error == 0 || error == EBUSY || error == ENOENT); break; case LK_RETRY | LK_NOWAIT: KASSERT(error == 0 || error == EBUSY); break; } return error; } /* * File table vnode close routine. */ static int vn_closefile(file_t *fp) { return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred); } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing a "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, size_t *buflen, void *bf, struct lwp *l) { struct uio auio; struct iovec aiov; int error; aiov.iov_len = *buflen; aiov.iov_base = bf; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_offset = 0; auio.uio_resid = *buflen; UIO_SETUP_SYSSPACE(&auio); if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NOCRED); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp); if (error == 0) *buflen = *buflen - auio.uio_resid; return error; } /* * XXX Failure mode if partially written? */ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, size_t buflen, const void *bf, struct lwp *l) { struct uio auio; struct iovec aiov; int error; aiov.iov_len = buflen; aiov.iov_base = __UNCONST(bf); /* XXXUNCONST kills const */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_offset = 0; auio.uio_resid = buflen; UIO_SETUP_SYSSPACE(&auio); if ((ioflg & IO_NODELOCKED) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NOCRED); if ((ioflg & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp); } return error; } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct lwp *l) { int error; if ((ioflg & IO_NODELOCKED) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NOCRED); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NOCRED); if ((ioflg & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp); } return error; } int vn_fifo_bypass(void *v) { struct vop_generic_args *ap = v; return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v); } /* * Open block device by device number */ int vn_bdev_open(dev_t dev, struct vnode **vpp, struct lwp *l) { int error; if ((error = bdevvp(dev, vpp)) != 0) return error; vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_OPEN(*vpp, FREAD | FWRITE, l->l_cred)) != 0) { vput(*vpp); return error; } mutex_enter((*vpp)->v_interlock); (*vpp)->v_writecount++; mutex_exit((*vpp)->v_interlock); VOP_UNLOCK(*vpp); return 0; } /* * Lookup the provided name in the filesystem. If the file exists, * is a valid block device, and isn't being used by anyone else, * set *vpp to the file's vnode. 
*/ int vn_bdev_openpath(struct pathbuf *pb, struct vnode **vpp, struct lwp *l) { struct vnode *vp; dev_t dev; enum vtype vt; int error; error = vn_open(NULL, pb, 0, FREAD | FWRITE, 0, &vp, NULL, NULL); if (error != 0) return error; dev = vp->v_rdev; vt = vp->v_type; VOP_UNLOCK(vp); (void) vn_close(vp, FREAD | FWRITE, l->l_cred); if (vt != VBLK) return ENOTBLK; return vn_bdev_open(dev, vpp, l); } static long vn_knote_to_interest(const struct knote *kn) { switch (kn->kn_filter) { case EVFILT_READ: /* * Writing to the file or changing its attributes can * set the file size, which impacts the readability * filter. * * (No need to set NOTE_EXTEND here; it's only ever * send with other hints; see vnode_if.c.) */ return NOTE_WRITE | NOTE_ATTRIB; case EVFILT_VNODE: return kn->kn_sfflags; case EVFILT_WRITE: default: return 0; } } void vn_knote_attach(struct vnode *vp, struct knote *kn) { struct vnode_klist *vk = vp->v_klist; long interest = 0; /* * In the case of layered / stacked file systems, knotes * should only ever be associated with the base vnode. */ KASSERT(kn->kn_hook == vp); KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist); /* * We maintain a bitmask of the kevents that there is interest in, * to minimize the impact of having watchers. It's silly to have * to traverse vn_klist every time a read or write happens simply * because there is someone interested in knowing when the file * is deleted, for example. */ mutex_enter(vp->v_interlock); SLIST_INSERT_HEAD(&vk->vk_klist, kn, kn_selnext); SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) { interest |= vn_knote_to_interest(kn); } vk->vk_interest = interest; mutex_exit(vp->v_interlock); } void vn_knote_detach(struct vnode *vp, struct knote *kn) { struct vnode_klist *vk = vp->v_klist; long interest = 0; /* See above. */ KASSERT(kn->kn_hook == vp); KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist); /* * We special case removing the head of the list, because: * * 1. It's extremely likely that we're detaching the only * knote. * * 2. We're already traversing the whole list, so we don't * want to use the generic SLIST_REMOVE() which would * traverse it *again*. */ mutex_enter(vp->v_interlock); if (__predict_true(kn == SLIST_FIRST(&vk->vk_klist))) { SLIST_REMOVE_HEAD(&vk->vk_klist, kn_selnext); SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) { interest |= vn_knote_to_interest(kn); } vk->vk_interest = interest; } else { struct knote *thiskn, *nextkn, *prevkn = NULL; SLIST_FOREACH_SAFE(thiskn, &vk->vk_klist, kn_selnext, nextkn) { if (thiskn == kn) { KASSERT(kn != NULL); KASSERT(prevkn != NULL); SLIST_REMOVE_AFTER(prevkn, kn_selnext); kn = NULL; } else { interest |= vn_knote_to_interest(thiskn); prevkn = thiskn; } } vk->vk_interest = interest; } mutex_exit(vp->v_interlock); }
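/*
 * [Editorial sketch -- not part of vfs_vnops.c]
 *
 * vn_rdwr() above is the usual convenience wrapper for kernel file I/O
 * once a vnode is in hand.  A minimal read of the first 512 bytes of an
 * unlocked vnode 'vp' could look like the fragment below (buffer and
 * error handling kept to a sketch); passing 0 for ioflg lets vn_rdwr()
 * take and release the vnode lock itself.
 */
#if 0	/* illustrative example only */
	char buf[512];
	size_t resid;
	int error;

	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0 /* offset */,
	    UIO_SYSSPACE, 0 /* ioflg */, curlwp->l_cred, &resid, curlwp);
	if (error == 0 && resid != 0) {
		/* short read: only sizeof(buf) - resid bytes are valid */
	}
#endif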
/*	$NetBSD: uipc_syscalls_50.c,v 1.12 2022/09/28 15:32:09 msaitoh Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Christos Zoulas.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>

#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/msg.h>
#include <sys/sysctl.h>
#include <sys/syscallargs.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/compat_stub.h>

#include <net/if.h>

#include <compat/net/if.h>
#include <compat/sys/time.h>
#include <compat/sys/socket.h>
#include <compat/sys/sockio.h>

#include <compat/common/compat_mod.h>

/*ARGSUSED*/
static int
compat_ifdatareq(struct lwp *l, u_long cmd, void *data)
{
	struct if_data ifi;
	struct ifdatareq50 *ifdr = data;
	struct ifnet *ifp;
	int error;

	/* Validate arguments. */
	switch (cmd) {
	case OSIOCGIFDATA:
	case OSIOCZIFDATA:
		break;
	default:
		return ENOSYS;
	}

	ifp = ifunit(ifdr->ifdr_name);
	if (ifp == NULL)
		return ENXIO;

	/* Do work. */
	switch (cmd) {
	case OSIOCGIFDATA:
		if_export_if_data(ifp, &ifi, false);
		ifdatan2o(&ifdr->ifdr_data, &ifi);
		return 0;

	case OSIOCZIFDATA:
		if (l != NULL) {
			error = kauth_authorize_network(l->l_cred,
			    KAUTH_NETWORK_INTERFACE,
			    KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp,
			    (void *)cmd, NULL);
			if (error != 0)
				return error;
		}
		if_export_if_data(ifp, &ifi, true);
		ifdatan2o(&ifdr->ifdr_data, &ifi);
		/* XXX if_lastchange? */
		return 0;

	default:
		/* Impossible due to above validation, but makes gcc happy. */
		return ENOSYS;
	}
}

void
uipc_syscalls_50_init(void)
{

	MODULE_HOOK_SET(uipc_syscalls_50_hook, compat_ifdatareq);
}

void
uipc_syscalls_50_fini(void)
{

	MODULE_HOOK_UNSET(uipc_syscalls_50_hook);
}
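/*
 * [Editorial sketch -- not part of uipc_syscalls_50.c]
 *
 * compat_ifdatareq() serves binaries built before the if_data ABI change,
 * which still issue the old OSIOCGIFDATA/OSIOCZIFDATA numbers.  From
 * userland such a request is assumed to look roughly like the fragment
 * below (socket setup and error handling omitted; the interface name is
 * a placeholder):
 */
#if 0	/* illustrative example only */
	struct ifdatareq50 ifdr;

	memset(&ifdr, 0, sizeof(ifdr));
	strlcpy(ifdr.ifdr_name, "wm0", sizeof(ifdr.ifdr_name));
	if (ioctl(s, OSIOCGIFDATA, &ifdr) == 0)
		/* ifdr.ifdr_data now holds the old-format if_data */;
#endif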
/*	$NetBSD: in_offload.c,v 1.14 2020/03/27 16:34:58 jdolecek Exp $	*/

/*
 * Copyright (c)2005, 2006 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in_offload.c,v 1.14 2020/03/27 16:34:58 jdolecek Exp $");

#include <sys/param.h>
#include <sys/mbuf.h>

#include <net/if.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/in_offload.h>

/*
 * Handle M_CSUM_TSOv4 in software.  Split the TCP payload in chunks of
 * size MSS, and return an mbuf chain consisting of them.
*/ struct mbuf * tcp4_segment(struct mbuf *m, int off) { int mss; int iphlen, thlen; int hlen, len; struct ip *ip; struct tcphdr *th; uint16_t ipid, phsum; uint32_t tcpseq; struct mbuf *hdr = NULL; struct mbuf *m0 = NULL; struct mbuf *prev = NULL; struct mbuf *n, *t; int nsegs; KASSERT((m->m_flags & M_PKTHDR) != 0); KASSERT((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) != 0); m->m_pkthdr.csum_flags = 0; len = m->m_pkthdr.len; KASSERT(len >= off + sizeof(*ip) + sizeof(*th)); hlen = off + sizeof(*ip); if (m->m_len < hlen) { m = m_pullup(m, hlen); if (m == NULL) goto quit; } ip = (void *)(mtod(m, char *) + off); iphlen = ip->ip_hl * 4; KASSERT(ip->ip_v == IPVERSION); KASSERT(iphlen >= sizeof(*ip)); KASSERT(ip->ip_p == IPPROTO_TCP); ipid = ntohs(ip->ip_id); hlen = off + iphlen + sizeof(*th); if (m->m_len < hlen) { m = m_pullup(m, hlen); if (m == NULL) goto quit; } th = (void *)(mtod(m, char *) + off + iphlen); tcpseq = ntohl(th->th_seq); thlen = th->th_off * 4; hlen = off + iphlen + thlen; mss = m->m_pkthdr.segsz; KASSERT(mss != 0); KASSERT(len > hlen); t = m_split(m, hlen, M_NOWAIT); if (t == NULL) goto quit; hdr = m; m = t; len -= hlen; KASSERT(len % mss == 0); ip = (void *)(mtod(hdr, char *) + off); ip->ip_len = htons(iphlen + thlen + mss); phsum = in_cksum_phdr(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((uint16_t)(thlen + mss) + IPPROTO_TCP)); for (nsegs = len / mss; nsegs > 0; nsegs--) { if (nsegs > 1) { n = m_dup(hdr, 0, hlen, M_NOWAIT); if (n == NULL) goto quit; } else n = hdr; KASSERT(n->m_len == hlen); /* XXX */ if (nsegs > 1) { t = m_split(m, mss, M_NOWAIT); if (t == NULL) { m_freem(n); goto quit; } } else t = m; m_cat(n, m); m = t; KASSERT(n->m_len >= hlen); /* XXX */ if (m0 == NULL) m0 = n; if (prev != NULL) prev->m_nextpkt = n; n->m_pkthdr.len = hlen + mss; n->m_nextpkt = NULL; /* XXX */ ip = (void *)(mtod(n, char *) + off); ip->ip_id = htons(ipid); ip->ip_sum = 0; ip->ip_sum = in4_cksum(n, 0, off, iphlen); th = (void *)(mtod(n, char *) + off + iphlen); th->th_seq = htonl(tcpseq); th->th_sum = phsum; th->th_sum = in4_cksum(n, 0, off + iphlen, thlen + mss); tcpseq += mss; ipid++; prev = n; } return m0; quit: if (hdr != NULL) m_freem(hdr); if (m != NULL) m_freem(m); for (m = m0; m != NULL; m = n) { n = m->m_nextpkt; m_freem(m); } return NULL; } int ip_tso_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa, struct rtentry *rt) { struct mbuf *n; int error = 0; m = tcp4_segment(m, 0); if (m == NULL) return ENOMEM; do { n = m->m_nextpkt; if (error == 0) error = ip_if_output(ifp, m, sa, rt); else m_freem(m); m = n; } while (m != NULL); return error; } /* * Compute now in software the IP and TCP/UDP checksums. Cancel the * hardware offloading. */ void in_undefer_cksum(struct mbuf *mh, size_t hdrlen, int csum_flags) { const size_t iphdrlen = M_CSUM_DATA_IPv4_IPHL(mh->m_pkthdr.csum_data); uint16_t csum; uint16_t ip_len; uint16_t *csump; struct mbuf *m = mh; KASSERT(mh->m_flags & M_PKTHDR); KASSERT(mh->m_pkthdr.len > hdrlen); KASSERT((mh->m_pkthdr.csum_flags & csum_flags) == csum_flags); /* * Deal with prepended frame header as done by e.g. ether_output(). * If first mbuf in chain has just the header, use second mbuf * for the actual checksum. in4_csum() expects the passed mbuf * to have the whole (struct ip) area contiguous. 
*/ if (m->m_len <= hdrlen) { hdrlen -= m->m_len; m = m->m_next; KASSERT(m != NULL); } if (__predict_true(hdrlen + sizeof(struct ip) <= m->m_len)) { struct ip *ip = (struct ip *)(mtod(m, uint8_t *) + hdrlen); ip_len = ip->ip_len; csump = &ip->ip_sum; } else { const size_t ip_len_offset = hdrlen + offsetof(struct ip, ip_len); m_copydata(m, ip_len_offset, sizeof(ip_len), &ip_len); csump = NULL; } ip_len = ntohs(ip_len); if (csum_flags & M_CSUM_IPv4) { csum = in4_cksum(m, 0, hdrlen, iphdrlen); if (csump != NULL) { *csump = csum; } else { const size_t offset = hdrlen + offsetof(struct ip, ip_sum); m_copyback(m, offset, sizeof(uint16_t), &csum); } } if (csum_flags & (M_CSUM_UDPv4|M_CSUM_TCPv4)) { size_t l4offset = hdrlen + iphdrlen; csum = in4_cksum(m, 0, l4offset, ip_len - iphdrlen); if (csum == 0 && (csum_flags & M_CSUM_UDPv4) != 0) csum = 0xffff; l4offset += M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data); if (__predict_true(l4offset + sizeof(uint16_t) <= m->m_len)) { *(uint16_t *)(mtod(m, char *) + l4offset) = csum; } else { m_copyback(m, l4offset, sizeof(csum), (void *)&csum); } } mh->m_pkthdr.csum_flags ^= csum_flags; } /* * Compute now in software the TCP/UDP checksum. Cancel the hardware * offloading. */ void in_undefer_cksum_tcpudp(struct mbuf *m) { struct ip *ip; uint16_t csum, offset; KASSERT((m->m_flags & M_PKTHDR) != 0); KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) != 0); KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) == 0); ip = mtod(m, struct ip *); offset = ip->ip_hl << 2; csum = in4_cksum(m, 0, offset, ntohs(ip->ip_len) - offset); if (csum == 0 && (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) != 0) csum = 0xffff; offset += M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data); if ((offset + sizeof(uint16_t)) <= m->m_len) { *(uint16_t *)(mtod(m, char *) + offset) = csum; } else { m_copyback(m, offset, sizeof(csum), (void *)&csum); } }
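/*
 * Illustrative sketch, not part of in_offload.c: one way an IPv4 output
 * path could use the helpers above when the chosen interface lacks the
 * matching hardware offload.  example_ip_transmit() is a hypothetical
 * function; the capability tests against if_capenable and
 * if_csum_flags_tx are assumptions shown only to make the calling
 * convention concrete (headers: <net/if.h>, <netinet/in_offload.h>).
 */
static int
example_ip_transmit(struct ifnet *ifp, struct mbuf *m,
    const struct sockaddr *dst, struct rtentry *rt)
{
	/* Large send requested but no hardware TSOv4: segment in software. */
	if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) != 0 &&
	    (ifp->if_capenable & IFCAP_TSOv4) == 0)
		return ip_tso_output(ifp, m, dst, rt);

	/* Deferred TCP/UDP checksum the hardware cannot finish: do it now. */
	if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4 | M_CSUM_UDPv4)) != 0 &&
	    (ifp->if_csum_flags_tx & (M_CSUM_TCPv4 | M_CSUM_UDPv4)) == 0) {
		in_undefer_cksum_tcpudp(m);
		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4 | M_CSUM_UDPv4);
	}

	return ip_if_output(ifp, m, dst, rt);
}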
/* $NetBSD: kern_lock.c,v 1.188 2024/01/14 11:46:05 andvar Exp $ */ /*- * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_lock.c,v 1.188 2024/01/14 11:46:05 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_lockdebug.h" #endif #include <sys/param.h> #include <sys/proc.h> #include <sys/lock.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/lockdebug.h> #include <sys/cpu.h> #include <sys/syslog.h> #include <sys/atomic.h> #include <sys/lwp.h> #include <sys/pserialize.h> #if defined(DIAGNOSTIC) && !defined(LOCKDEBUG) #include <sys/ksyms.h> #endif #include <machine/lock.h> #include <dev/lockstat.h> #define RETURN_ADDRESS (uintptr_t)__builtin_return_address(0) bool kernel_lock_dodebug; __cpu_simple_lock_t kernel_lock[CACHE_LINE_SIZE / sizeof(__cpu_simple_lock_t)] __cacheline_aligned; void assert_sleepable(void) { const char *reason; long pctr; bool idle; if (__predict_false(panicstr != NULL)) { return; } LOCKDEBUG_BARRIER(kernel_lock, 1); /* * Avoid disabling/re-enabling preemption here since this * routine may be called in delicate situations. */ do { pctr = lwp_pctr(); idle = CURCPU_IDLE_P(); } while (__predict_false(pctr != lwp_pctr())); reason = NULL; if (__predict_false(idle) && !cold) { reason = "idle"; goto panic; } if (__predict_false(cpu_intr_p())) { reason = "interrupt"; goto panic; } if (__predict_false(cpu_softintr_p())) { reason = "softint"; goto panic; } if (__predict_false(!pserialize_not_in_read_section())) { reason = "pserialize"; goto panic; } return; panic: panic("%s: %s caller=%p", __func__, reason, (void *)RETURN_ADDRESS); } /* * Functions for manipulating the kernel_lock. We put them here * so that they show up in profiles. */ #define _KERNEL_LOCK_ABORT(msg) \ LOCKDEBUG_ABORT(__func__, __LINE__, kernel_lock, &_kernel_lock_ops, msg) #ifdef LOCKDEBUG #define _KERNEL_LOCK_ASSERT(cond) \ do { \ if (!(cond)) \ _KERNEL_LOCK_ABORT("assertion failed: " #cond); \ } while (/* CONSTCOND */ 0) #else #define _KERNEL_LOCK_ASSERT(cond) /* nothing */ #endif static void _kernel_lock_dump(const volatile void *, lockop_printer_t); lockops_t _kernel_lock_ops = { .lo_name = "Kernel lock", .lo_type = LOCKOPS_SPIN, .lo_dump = _kernel_lock_dump, }; #ifdef LOCKDEBUG #ifdef DDB #include <ddb/ddb.h> #endif static void kernel_lock_trace_ipi(void *cookie) { printf("%s[%d %s]: hogging kernel lock\n", cpu_name(curcpu()), curlwp->l_lid, curlwp->l_name ? curlwp->l_name : curproc->p_comm); #ifdef DDB db_stacktrace(); #endif } #endif /* * Initialize the kernel lock. */ void kernel_lock_init(void) { __cpu_simple_lock_init(kernel_lock); kernel_lock_dodebug = LOCKDEBUG_ALLOC(kernel_lock, &_kernel_lock_ops, RETURN_ADDRESS); } CTASSERT(CACHE_LINE_SIZE >= sizeof(__cpu_simple_lock_t)); /* * Print debugging information about the kernel lock. */ static void _kernel_lock_dump(const volatile void *junk, lockop_printer_t pr) { struct cpu_info *ci = curcpu(); (void)junk; pr("curcpu holds : %18d wanted by: %#018lx\n", ci->ci_biglock_count, (long)ci->ci_biglock_wanted); } /* * Acquire 'nlocks' holds on the kernel lock. * * Although it may not look it, this is one of the most central, intricate * routines in the kernel, and tons of code elsewhere depends on its exact * behaviour. If you change something in here, expect it to bite you in the * rear. 
*/ void _kernel_lock(int nlocks) { struct cpu_info *ci; LOCKSTAT_TIMER(spintime); LOCKSTAT_FLAG(lsflag); struct lwp *owant; #ifdef LOCKDEBUG static struct cpu_info *kernel_lock_holder; u_int spins = 0; u_int starttime = getticks(); #endif int s; struct lwp *l = curlwp; _KERNEL_LOCK_ASSERT(nlocks > 0); s = splvm(); ci = curcpu(); if (ci->ci_biglock_count != 0) { _KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock)); ci->ci_biglock_count += nlocks; l->l_blcnt += nlocks; splx(s); return; } _KERNEL_LOCK_ASSERT(l->l_blcnt == 0); LOCKDEBUG_WANTLOCK(kernel_lock_dodebug, kernel_lock, RETURN_ADDRESS, 0); if (__predict_true(__cpu_simple_lock_try(kernel_lock))) { #ifdef LOCKDEBUG kernel_lock_holder = curcpu(); #endif ci->ci_biglock_count = nlocks; l->l_blcnt = nlocks; LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL, RETURN_ADDRESS, 0); splx(s); return; } /* * To remove the ordering constraint between adaptive mutexes * and kernel_lock we must make it appear as if this thread is * blocking. For non-interlocked mutex release, a store fence * is required to ensure that the result of any mutex_exit() * by the current LWP becomes visible on the bus before the set * of ci->ci_biglock_wanted becomes visible. * * This membar_producer matches the membar_consumer in * mutex_vector_enter. * * That way, if l has just released a mutex, mutex_vector_enter * can't see this store ci->ci_biglock_wanted := l until it * will also see the mutex_exit store mtx->mtx_owner := 0 which * clears the has-waiters bit. */ membar_producer(); owant = ci->ci_biglock_wanted; atomic_store_relaxed(&ci->ci_biglock_wanted, l); #if defined(DIAGNOSTIC) && !defined(LOCKDEBUG) l->l_ld_wanted = __builtin_return_address(0); #endif /* * Spin until we acquire the lock. Once we have it, record the * time spent with lockstat. */ LOCKSTAT_ENTER(lsflag); LOCKSTAT_START_TIMER(lsflag, spintime); do { splx(s); while (__SIMPLELOCK_LOCKED_P(kernel_lock)) { #ifdef LOCKDEBUG if (SPINLOCK_SPINOUT(spins) && start_init_exec && (getticks() - starttime) > 10*hz) { ipi_msg_t msg = { .func = kernel_lock_trace_ipi, }; kpreempt_disable(); ipi_unicast(&msg, kernel_lock_holder); ipi_wait(&msg); kpreempt_enable(); _KERNEL_LOCK_ABORT("spinout"); } #endif SPINLOCK_BACKOFF_HOOK; SPINLOCK_SPIN_HOOK; } s = splvm(); } while (!__cpu_simple_lock_try(kernel_lock)); ci->ci_biglock_count = nlocks; l->l_blcnt = nlocks; LOCKSTAT_STOP_TIMER(lsflag, spintime); LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL, RETURN_ADDRESS, 0); if (owant == NULL) { LOCKSTAT_EVENT_RA(lsflag, kernel_lock, LB_KERNEL_LOCK | LB_SPIN, 1, spintime, RETURN_ADDRESS); } LOCKSTAT_EXIT(lsflag); splx(s); /* * Now that we have kernel_lock, reset ci_biglock_wanted. This * store must be visible on other CPUs before a mutex_exit() on * this CPU can test the has-waiters bit. * * This membar_enter matches the membar_enter in * mutex_vector_enter. (Yes, not membar_exit -- the legacy * naming is confusing, but store-before-load usually pairs * with store-before-load, in the extremely rare cases where it * is used at all.) * * That way, mutex_vector_enter can't see this store * ci->ci_biglock_wanted := owant until it has set the * has-waiters bit. */ (void)atomic_swap_ptr(&ci->ci_biglock_wanted, owant); #ifndef __HAVE_ATOMIC_AS_MEMBAR membar_enter(); #endif #ifdef LOCKDEBUG kernel_lock_holder = curcpu(); #endif } /* * Release 'nlocks' holds on the kernel lock. If 'nlocks' is zero, release * all holds. 
*/ void _kernel_unlock(int nlocks, int *countp) { struct cpu_info *ci; u_int olocks; int s; struct lwp *l = curlwp; _KERNEL_LOCK_ASSERT(nlocks < 2); olocks = l->l_blcnt; if (olocks == 0) { _KERNEL_LOCK_ASSERT(nlocks <= 0); if (countp != NULL) *countp = 0; return; } _KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock)); if (nlocks == 0) nlocks = olocks; else if (nlocks == -1) { nlocks = 1; _KERNEL_LOCK_ASSERT(olocks == 1); } s = splvm(); ci = curcpu(); _KERNEL_LOCK_ASSERT(ci->ci_biglock_count >= l->l_blcnt); if (ci->ci_biglock_count == nlocks) { LOCKDEBUG_UNLOCKED(kernel_lock_dodebug, kernel_lock, RETURN_ADDRESS, 0); ci->ci_biglock_count = 0; __cpu_simple_unlock(kernel_lock); l->l_blcnt -= nlocks; splx(s); if (l->l_dopreempt) kpreempt(0); } else { ci->ci_biglock_count -= nlocks; l->l_blcnt -= nlocks; splx(s); } if (countp != NULL) *countp = olocks; } bool _kernel_locked_p(void) { return __SIMPLELOCK_LOCKED_P(kernel_lock); }
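/*
 * Illustrative sketch, not part of kern_lock.c: the normal consumer
 * pattern.  Code takes and releases holds through the KERNEL_LOCK(),
 * KERNEL_UNLOCK_ONE() and KERNEL_UNLOCK_ALL() macros from
 * <sys/systm.h> rather than calling _kernel_lock()/_kernel_unlock()
 * directly; example_biglocked_work() is a hypothetical routine used
 * only to show how the hold counts pair up.
 */
static void
example_biglocked_work(void)
{
	int nlocks;

	KERNEL_LOCK(1, NULL);			/* take one hold */
	KASSERT(_kernel_locked_p());

	/*
	 * Drop every hold before blocking for a long time, remember
	 * how many there were, and restore them afterwards.
	 */
	KERNEL_UNLOCK_ALL(NULL, &nlocks);
	/* ... sleep or run without the big lock ... */
	if (nlocks != 0)
		KERNEL_LOCK(nlocks, NULL);

	KERNEL_UNLOCK_ONE(NULL);		/* drop the hold taken above */
}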
/* $NetBSD: netbsd32_machdep.c,v 1.141 2022/08/20 23:49:31 riastradh Exp $ */ /* * Copyright (c) 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Frank van der Linden for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: netbsd32_machdep.c,v 1.141 2022/08/20 23:49:31 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_compat_netbsd32.h" #include "opt_execfmt.h" #include "opt_user_ldt.h" #include "opt_mtrr.h" #endif #include <sys/param.h> #include <sys/exec.h> #include <sys/exec_aout.h> #include <sys/kmem.h> #include <sys/kmem.h> #include <sys/proc.h> #include <sys/signalvar.h> #include <sys/systm.h> #include <sys/core.h> #include <sys/mount.h> #include <sys/buf.h> #include <sys/vnode.h> #include <sys/ras.h> #include <sys/ptrace.h> #include <sys/kauth.h> #include <sys/compat_stub.h> #include <x86/fpu.h> #include <x86/dbregs.h> #include <machine/frame.h> #include <machine/reg.h> #include <machine/vmparam.h> #ifdef MTRR #include <machine/mtrr.h> #endif #include <machine/netbsd32_machdep.h> #include <machine/sysarch.h> #include <machine/userret.h> #include <machine/gdt.h> #include <machine/pmap_private.h> #include <compat/netbsd32/netbsd32.h> #include <compat/netbsd32/netbsd32_exec.h> #include <compat/netbsd32/netbsd32_syscallargs.h> #include <compat/sys/signal.h> #include <compat/sys/signalvar.h> /* Provide a the name of the architecture we're emulating */ const char machine32[] = "i386"; const char machine_arch32[] = "i386"; static int netbsd32_process_doxmmregs(struct lwp *, struct lwp *, void *, bool); static int netbsd32_process_xmmregio(struct lwp *, struct lwp *, struct uio *); #ifdef USER_LDT static int x86_64_get_ldt32(struct lwp *, void *, register_t *); static int x86_64_set_ldt32(struct lwp *, void *, register_t *); #else #define x86_64_get_ldt32(x, y, z) ENOSYS #define x86_64_set_ldt32(x, y, z) ENOSYS #endif #ifdef MTRR static int x86_64_get_mtrr32(struct lwp *, void *, register_t *); static int x86_64_set_mtrr32(struct lwp *, void *, register_t *); #else #define x86_64_get_mtrr32(x, y, z) ENOSYS #define x86_64_set_mtrr32(x, y, z) ENOSYS #endif int check_sigcontext32(struct lwp *, const struct netbsd32_sigcontext *); void netbsd32_buildcontext(struct lwp *, struct trapframe *, void *, sig_t, int); #ifdef EXEC_AOUT /* * There is no native a.out -- this function is required * for i386 a.out emulation (COMPAT_NETBSD32+EXEC_AOUT). */ int cpu_exec_aout_makecmds(struct lwp *p, struct exec_package *e) { return ENOEXEC; } #endif void netbsd32_setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack) { struct pcb *pcb; struct trapframe *tf; struct proc *p = l->l_proc; pcb = lwp_getpcb(l); #if defined(USER_LDT) pmap_ldt_cleanup(l); #endif netbsd32_adjust_limits(p); fpu_clear(l, pack->ep_osversion >= 699002600 ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__); x86_dbregs_clear(l); kpreempt_disable(); pcb->pcb_flags = PCB_COMPAT32; p->p_flag |= PK_32; l->l_md.md_flags = MDL_COMPAT32; /* force iret not sysret */ cpu_segregs32_zero(l); cpu_fsgs_reload(l, LSEL(LUDATA32_SEL, SEL_UPL), LSEL(LUDATA32_SEL, SEL_UPL)); kpreempt_enable(); tf = l->l_md.md_regs; tf->tf_ds = LSEL(LUDATA32_SEL, SEL_UPL); tf->tf_es = LSEL(LUDATA32_SEL, SEL_UPL); tf->tf_rdi = 0; tf->tf_rsi = 0; tf->tf_rbp = 0; tf->tf_rbx = (uint32_t)p->p_psstrp; tf->tf_rdx = 0; tf->tf_rcx = 0; tf->tf_rax = 0; tf->tf_rip = pack->ep_entry; tf->tf_cs = LSEL(LUCODE32_SEL, SEL_UPL); tf->tf_rflags = PSL_USERSET; tf->tf_rsp = stack; tf->tf_ss = LSEL(LUDATA32_SEL, SEL_UPL); } void netbsd32_buildcontext(struct lwp *l, struct trapframe *tf, void *fp, sig_t catcher, int onstack) { /* * Build context to run handler in. 
*/ tf->tf_ds = GSEL(GUDATA32_SEL, SEL_UPL); tf->tf_es = GSEL(GUDATA32_SEL, SEL_UPL); #if 0 tf->tf_fs = GSEL(GUDATA32_SEL, SEL_UPL); tf->tf_gs = GSEL(GUDATA32_SEL, SEL_UPL); #endif /* Ensure FP state is sane. */ fpu_sigreset(l); tf->tf_rip = (uint64_t)catcher; tf->tf_cs = GSEL(GUCODE32_SEL, SEL_UPL); tf->tf_rflags &= ~PSL_CLEARSIG; tf->tf_rsp = (uint64_t)fp; tf->tf_ss = GSEL(GUDATA32_SEL, SEL_UPL); /* Remember that we're now on the signal stack. */ if (onstack) l->l_sigstk.ss_flags |= SS_ONSTACK; if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS32) { /* * process has given an invalid address for the * handler. Stop it, but do not do it before so * we can return the right info to userland (or in core dump) */ sigexit(l, SIGILL); /* NOTREACHED */ } } void netbsd32_sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask) { struct lwp *l = curlwp; struct proc *p = l->l_proc; struct sigacts *ps = p->p_sigacts; int onstack, error; int sig = ksi->ksi_signo; struct netbsd32_sigframe_siginfo *fp, frame; const struct sigaction *sa = &SIGACTION(p, sig); sig_t catcher = sa->sa_handler; struct trapframe *tf = l->l_md.md_regs; stack_t * const ss = &l->l_sigstk; /* Do we need to jump onto the signal stack? */ onstack = (ss->ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 && (sa->sa_flags & SA_ONSTACK) != 0; /* Allocate space for the signal handler context. */ if (onstack) fp = (struct netbsd32_sigframe_siginfo *) ((char *)ss->ss_sp + ss->ss_size); else fp = (struct netbsd32_sigframe_siginfo *)tf->tf_rsp; fp--; /* Build stack frame for signal trampoline. */ switch (ps->sa_sigdesc[sig].sd_vers) { case __SIGTRAMP_SIGCODE_VERSION: /* handled by sendsig_sigcontext */ case __SIGTRAMP_SIGCONTEXT_VERSION: /* handled by sendsig_sigcontext */ default: /* unknown version */ printf("nsendsig: bad version %d\n", ps->sa_sigdesc[sig].sd_vers); sigexit(l, SIGILL); case __SIGTRAMP_SIGINFO_VERSION: break; } memset(&frame, 0, sizeof(frame)); frame.sf_ra = (uint32_t)(uintptr_t)ps->sa_sigdesc[sig].sd_tramp; frame.sf_signum = sig; frame.sf_sip = (uint32_t)(uintptr_t)&fp->sf_si; frame.sf_ucp = (uint32_t)(uintptr_t)&fp->sf_uc; netbsd32_si_to_si32(&frame.sf_si, (const siginfo_t *)&ksi->ksi_info); frame.sf_uc.uc_flags = _UC_SIGMASK; frame.sf_uc.uc_sigmask = *mask; frame.sf_uc.uc_link = (uint32_t)(uintptr_t)l->l_ctxlink; frame.sf_uc.uc_flags |= (ss->ss_flags & SS_ONSTACK) ? _UC_SETSTACK : _UC_CLRSTACK; sendsig_reset(l, sig); mutex_exit(p->p_lock); cpu_getmcontext32(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags); error = copyout(&frame, fp, sizeof(frame)); mutex_enter(p->p_lock); if (error != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ sigexit(l, SIGILL); /* NOTREACHED */ } netbsd32_buildcontext(l, tf, fp, catcher, onstack); } /* * Dump the machine specific segment at the start of a core dump. */ struct md_core32 { struct reg32 intreg; struct fpreg32 freg; }; int cpu_coredump32(struct lwp *l, struct coredump_iostate *iocookie, struct core32 *chdr) { struct md_core32 md_core; struct coreseg cseg; int error; if (iocookie == NULL) { CORE_SETMAGIC(*chdr, COREMAGIC, MID_I386, 0); chdr->c_hdrsize = ALIGN32(sizeof(*chdr)); chdr->c_seghdrsize = ALIGN32(sizeof(cseg)); chdr->c_cpusize = sizeof(md_core); chdr->c_nseg++; return 0; } /* Save integer registers. */ error = netbsd32_process_read_regs(l, &md_core.intreg); if (error) return error; /* Save floating point registers. 
*/ error = netbsd32_process_read_fpregs(l, &md_core.freg, NULL); if (error) return error; CORE_SETMAGIC(cseg, CORESEGMAGIC, MID_I386, CORE_CPU); cseg.c_addr = 0; cseg.c_size = chdr->c_cpusize; MODULE_HOOK_CALL(coredump_write_hook, (iocookie, UIO_SYSSPACE, &cseg, chdr->c_seghdrsize), ENOSYS, error); if (error) return error; MODULE_HOOK_CALL(coredump_write_hook, (iocookie, UIO_SYSSPACE, &md_core, sizeof(md_core)), ENOSYS, error); return error; } int netbsd32_ptrace_translate_request(int req) { switch (req) { case 0 ... PT_FIRSTMACH - 1: return req; case PT32_STEP: return PT_STEP; case PT32_GETREGS: return PT_GETREGS; case PT32_SETREGS: return PT_SETREGS; case PT32_GETFPREGS: return PT_GETFPREGS; case PT32_SETFPREGS: return PT_SETFPREGS; case PT32_GETXMMREGS: return PT_GETXMMREGS; case PT32_SETXMMREGS: return PT_SETXMMREGS; case PT32_GETDBREGS: return PT_GETDBREGS; case PT32_SETDBREGS: return PT_SETDBREGS; case PT32_SETSTEP: return PT_SETSTEP; case PT32_CLEARSTEP: return PT_CLEARSTEP; case PT32_GETXSTATE: return PT_GETXSTATE; case PT32_SETXSTATE: return PT_SETXSTATE; default: return -1; } } int netbsd32_process_read_regs(struct lwp *l, struct reg32 *regs) { struct trapframe *tf = l->l_md.md_regs; /* XXX avoid sign extension problems with unknown upper bits? */ regs->r_gs = tf->tf_gs & 0xffff; regs->r_fs = tf->tf_fs & 0xffff; regs->r_es = tf->tf_es & 0xffff; regs->r_ds = tf->tf_ds & 0xffff; regs->r_eflags = tf->tf_rflags; regs->r_edi = tf->tf_rdi & 0xffffffff; regs->r_esi = tf->tf_rsi & 0xffffffff; regs->r_ebp = tf->tf_rbp & 0xffffffff; regs->r_ebx = tf->tf_rbx & 0xffffffff; regs->r_edx = tf->tf_rdx & 0xffffffff; regs->r_ecx = tf->tf_rcx & 0xffffffff; regs->r_eax = tf->tf_rax & 0xffffffff; regs->r_eip = tf->tf_rip & 0xffffffff; regs->r_cs = tf->tf_cs & 0xffff; regs->r_esp = tf->tf_rsp & 0xffffffff; regs->r_ss = tf->tf_ss & 0xffff; return 0; } int netbsd32_process_read_fpregs(struct lwp *l, struct fpreg32 *regs, size_t *sz) { __CTASSERT(sizeof(*regs) == sizeof(struct save87)); process_read_fpregs_s87(l, (struct save87 *)regs); return 0; } int netbsd32_process_read_dbregs(struct lwp *l, struct dbreg32 *regs, size_t *sz) { struct dbreg regs64; x86_dbregs_read(l, &regs64); memset(regs, 0, sizeof(*regs)); regs->dr[0] = regs64.dr[0] & 0xffffffff; regs->dr[1] = regs64.dr[1] & 0xffffffff; regs->dr[2] = regs64.dr[2] & 0xffffffff; regs->dr[3] = regs64.dr[3] & 0xffffffff; regs->dr[6] = regs64.dr[6] & 0xffffffff; regs->dr[7] = regs64.dr[7] & 0xffffffff; return 0; } int netbsd32_process_write_regs(struct lwp *l, const struct reg32 *regs) { struct trapframe *tf; struct pcb *pcb; tf = l->l_md.md_regs; pcb = lwp_getpcb(l); /* * Check for security violations. 
*/ if (((regs->r_eflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0) return EINVAL; if (!VALID_USER_CSEL32(regs->r_cs)) return EINVAL; if (regs->r_fs != 0 && !VALID_USER_DSEL32(regs->r_fs) && !(VALID_USER_FSEL32(regs->r_fs) && pcb->pcb_fs != 0)) return EINVAL; if (regs->r_gs != 0 && !VALID_USER_DSEL32(regs->r_gs) && !(VALID_USER_GSEL32(regs->r_gs) && pcb->pcb_gs != 0)) return EINVAL; if (regs->r_es != 0 && !VALID_USER_DSEL32(regs->r_es)) return EINVAL; if (!VALID_USER_DSEL32(regs->r_ds) || !VALID_USER_DSEL32(regs->r_ss)) return EINVAL; if ((u_int)regs->r_eip >= VM_MAXUSER_ADDRESS32) return EINVAL; tf->tf_rax = regs->r_eax; tf->tf_rcx = regs->r_ecx; tf->tf_rdx = regs->r_edx; tf->tf_rbx = regs->r_ebx; tf->tf_rsp = regs->r_esp; tf->tf_rbp = regs->r_ebp; tf->tf_rsi = regs->r_esi; tf->tf_rdi = regs->r_edi; tf->tf_rip = regs->r_eip; tf->tf_rflags = regs->r_eflags; tf->tf_cs = regs->r_cs & 0xFFFF; tf->tf_ss = regs->r_ss & 0xFFFF; tf->tf_ds = regs->r_ds & 0xFFFF; tf->tf_es = regs->r_es & 0xFFFF; tf->tf_fs = regs->r_fs & 0xFFFF; tf->tf_gs = regs->r_gs & 0xFFFF; return 0; } int netbsd32_process_write_fpregs(struct lwp *l, const struct fpreg32 *regs, size_t sz) { __CTASSERT(sizeof(*regs) == sizeof(struct save87)); process_write_fpregs_s87(l, (const struct save87 *)regs); return 0; } int netbsd32_process_write_dbregs(struct lwp *l, const struct dbreg32 *regs, size_t sz) { size_t i; struct dbreg regs64; /* Check that DR0-DR3 contain user-space address */ for (i = 0; i < X86_DBREGS; i++) { if ((u_int)regs->dr[i] >= VM_MAXUSER_ADDRESS32) return EINVAL; } if (regs->dr[7] & X86_DR7_GENERAL_DETECT_ENABLE) { return EINVAL; } memset(&regs64, 0, sizeof(regs64)); regs64.dr[0] = (u_int)regs->dr[0]; regs64.dr[1] = (u_int)regs->dr[1]; regs64.dr[2] = (u_int)regs->dr[2]; regs64.dr[3] = (u_int)regs->dr[3]; regs64.dr[6] = (u_int)regs->dr[6]; regs64.dr[7] = (u_int)regs->dr[7]; x86_dbregs_write(l, &regs64); return 0; } static int netbsd32_process_doxmmregs(struct lwp *curl, struct lwp *l, void *addr, bool write) /* curl: tracer */ /* l: traced */ { struct uio uio; struct iovec iov; struct vmspace *vm; int error; if ((curl->l_proc->p_flag & PK_32) == 0 || (l->l_proc->p_flag & PK_32) == 0) return EINVAL; if (!process_machdep_validfpu(l->l_proc)) return EINVAL; error = proc_vmspace_getref(curl->l_proc, &vm); if (error) return error; iov.iov_base = addr; iov.iov_len = sizeof(struct xmmregs32); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = sizeof(struct xmmregs32); uio.uio_rw = write ? 
UIO_WRITE : UIO_READ; uio.uio_vmspace = vm; error = netbsd32_process_xmmregio(curl, l, &uio); uvmspace_free(vm); return error; } static int netbsd32_process_xmmregio(struct lwp *curl, struct lwp *l, struct uio *uio) /* curl: tracer */ /* l: traced */ { struct xmmregs32 regs; int error; char *kv; size_t kl; kl = sizeof(regs); kv = (char *)&regs; if (uio->uio_offset < 0 || uio->uio_offset > (off_t)kl) return EINVAL; kv += uio->uio_offset; kl -= uio->uio_offset; if (kl > uio->uio_resid) kl = uio->uio_resid; process_read_fpregs_xmm(l, &regs.fxstate); error = uiomove(kv, kl, uio); if (error == 0 && uio->uio_rw == UIO_WRITE) { if (l->l_proc->p_stat != SSTOP) error = EBUSY; else process_write_fpregs_xmm(l, &regs.fxstate); } uio->uio_offset = 0; return error; } int netbsd32_sysarch(struct lwp *l, const struct netbsd32_sysarch_args *uap, register_t *retval) { /* { syscallarg(int) op; syscallarg(netbsd32_voidp) parms; } */ int error; switch (SCARG(uap, op)) { case X86_IOPL: error = x86_iopl(l, NETBSD32PTR64(SCARG(uap, parms)), retval); break; case X86_GET_LDT: error = x86_64_get_ldt32(l, NETBSD32PTR64(SCARG(uap, parms)), retval); break; case X86_SET_LDT: error = x86_64_set_ldt32(l, NETBSD32PTR64(SCARG(uap, parms)), retval); break; case X86_GET_MTRR: error = x86_64_get_mtrr32(l, NETBSD32PTR64(SCARG(uap, parms)), retval); break; case X86_SET_MTRR: error = x86_64_set_mtrr32(l, NETBSD32PTR64(SCARG(uap, parms)), retval); break; default: error = EINVAL; break; } return error; } #ifdef USER_LDT static int x86_64_set_ldt32(struct lwp *l, void *args, register_t *retval) { struct x86_set_ldt_args32 ua32; struct x86_set_ldt_args ua; union descriptor *descv; int error; if ((error = copyin(args, &ua32, sizeof(ua32))) != 0) return error; ua.start = ua32.start; ua.num = ua32.num; if (ua.num < 0 || ua.num > MAX_USERLDT_SLOTS) return EINVAL; const size_t alloc_size = sizeof(*descv) * ua.num; descv = kmem_alloc(alloc_size, KM_SLEEP); error = copyin((void *)(uintptr_t)ua32.desc, descv, sizeof(*descv) * ua.num); if (error == 0) error = x86_set_ldt1(l, &ua, descv); *retval = ua.start; kmem_free(descv, alloc_size); return error; } static int x86_64_get_ldt32(struct lwp *l, void *args, register_t *retval) { struct x86_get_ldt_args32 ua32; struct x86_get_ldt_args ua; union descriptor *cp; int error; if ((error = copyin(args, &ua32, sizeof(ua32))) != 0) return error; ua.start = ua32.start; ua.num = ua32.num; if (ua.num < 0 || ua.num > MAX_USERLDT_SLOTS) return EINVAL; const size_t alloc_size = ua.num * sizeof(union descriptor); cp = kmem_alloc(alloc_size, KM_SLEEP); error = x86_get_ldt1(l, &ua, cp); *retval = ua.num; if (error == 0) error = copyout(cp, (void *)(uintptr_t)ua32.desc, ua.num * sizeof(*cp)); kmem_free(cp, alloc_size); return error; } #endif #ifdef MTRR static int x86_64_get_mtrr32(struct lwp *l, void *args, register_t *retval) { struct x86_64_get_mtrr_args32 args32; int error, i; int32_t n; struct mtrr32 *m32p, m32; struct mtrr *m64p, *mp; size_t size; m64p = NULL; if (mtrr_funcs == NULL) return ENOSYS; error = kauth_authorize_machdep(l->l_cred, KAUTH_MACHDEP_MTRR_GET, NULL, NULL, NULL, NULL); if (error) return error; error = copyin(args, &args32, sizeof(args32)); if (error != 0) return error; if (args32.mtrrp == 0) { n = (MTRR_I686_NFIXED_SOFT + MTRR_I686_NVAR_MAX); return copyout(&n, (void *)(uintptr_t)args32.n, sizeof(n)); } error = copyin((void *)(uintptr_t)args32.n, &n, sizeof(n)); if (error != 0) return error; if (n <= 0 || n > (MTRR_I686_NFIXED_SOFT + MTRR_I686_NVAR_MAX)) return EINVAL; size = n * 
sizeof(struct mtrr); m64p = kmem_zalloc(size, KM_SLEEP); error = mtrr_get(m64p, &n, l->l_proc, 0); if (error != 0) goto fail; m32p = (struct mtrr32 *)(uintptr_t)args32.mtrrp; mp = m64p; for (i = 0; i < n; i++) { m32.base = mp->base; m32.len = mp->len; m32.type = mp->type; m32.flags = mp->flags; m32.owner = mp->owner; error = copyout(&m32, m32p, sizeof(m32)); if (error != 0) break; mp++; m32p++; } fail: if (m64p != NULL) kmem_free(m64p, size); if (error != 0) n = 0; copyout(&n, (void *)(uintptr_t)args32.n, sizeof(n)); return error; } static int x86_64_set_mtrr32(struct lwp *l, void *args, register_t *retval) { struct x86_64_set_mtrr_args32 args32; struct mtrr32 *m32p, m32; struct mtrr *m64p, *mp; int error, i; int32_t n; size_t size; m64p = NULL; if (mtrr_funcs == NULL) return ENOSYS; error = kauth_authorize_machdep(l->l_cred, KAUTH_MACHDEP_MTRR_SET, NULL, NULL, NULL, NULL); if (error) return error; error = copyin(args, &args32, sizeof(args32)); if (error != 0) return error; error = copyin((void *)(uintptr_t)args32.n, &n, sizeof(n)); if (error != 0) return error; if (n <= 0 || n > (MTRR_I686_NFIXED_SOFT + MTRR_I686_NVAR_MAX)) { error = EINVAL; goto fail; } size = n * sizeof(struct mtrr); m64p = kmem_zalloc(size, KM_SLEEP); m32p = (struct mtrr32 *)(uintptr_t)args32.mtrrp; mp = m64p; for (i = 0; i < n; i++) { error = copyin(m32p, &m32, sizeof(m32)); if (error != 0) goto fail; mp->base = m32.base; mp->len = m32.len; mp->type = m32.type; mp->flags = m32.flags; mp->owner = m32.owner; m32p++; mp++; } error = mtrr_set(m64p, &n, l->l_proc, 0); fail: if (m64p != NULL) kmem_free(m64p, size); if (error != 0) n = 0; copyout(&n, (void *)(uintptr_t)args32.n, sizeof(n)); return error; } #endif int cpu_setmcontext32(struct lwp *l, const mcontext32_t *mcp, unsigned int flags) { struct trapframe *tf = l->l_md.md_regs; const __greg32_t *gr = mcp->__gregs; struct proc *p = l->l_proc; int error; /* Restore register context, if any. */ if ((flags & _UC_CPU) != 0) { /* * Check for security violations. */ error = cpu_mcontext32_validate(l, mcp); if (error != 0) return error; cpu_fsgs_reload(l, gr[_REG32_FS], gr[_REG32_GS]); tf->tf_es = gr[_REG32_ES] & 0xFFFF; tf->tf_ds = gr[_REG32_DS] & 0xFFFF; /* Only change the user-alterable part of eflags */ tf->tf_rflags &= ~PSL_USER; tf->tf_rflags |= (gr[_REG32_EFL] & PSL_USER); tf->tf_rdi = gr[_REG32_EDI]; tf->tf_rsi = gr[_REG32_ESI]; tf->tf_rbp = gr[_REG32_EBP]; tf->tf_rbx = gr[_REG32_EBX]; tf->tf_rdx = gr[_REG32_EDX]; tf->tf_rcx = gr[_REG32_ECX]; tf->tf_rax = gr[_REG32_EAX]; tf->tf_rip = gr[_REG32_EIP]; tf->tf_cs = gr[_REG32_CS] & 0xFFFF; tf->tf_rsp = gr[_REG32_UESP]; tf->tf_ss = gr[_REG32_SS] & 0xFFFF; } if ((flags & _UC_TLSBASE) != 0) lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase); /* Restore floating point register context, if any. */ if ((flags & _UC_FPU) != 0) { /* Assume fxsave context */ process_write_fpregs_xmm(l, (const struct fxsave *) &mcp->__fpregs.__fp_reg_set.__fp_xmm_state); } mutex_enter(p->p_lock); if (flags & _UC_SETSTACK) l->l_sigstk.ss_flags |= SS_ONSTACK; if (flags & _UC_CLRSTACK) l->l_sigstk.ss_flags &= ~SS_ONSTACK; mutex_exit(p->p_lock); return 0; } void cpu_getmcontext32(struct lwp *l, mcontext32_t *mcp, unsigned int *flags) { const struct trapframe *tf = l->l_md.md_regs; __greg32_t *gr = mcp->__gregs; __greg32_t ras_eip; /* Save register context. 
*/ gr[_REG32_GS] = tf->tf_gs & 0xFFFF; gr[_REG32_FS] = tf->tf_fs & 0xFFFF; gr[_REG32_ES] = tf->tf_es & 0xFFFF; gr[_REG32_DS] = tf->tf_ds & 0xFFFF; gr[_REG32_EFL] = tf->tf_rflags; gr[_REG32_EDI] = tf->tf_rdi; gr[_REG32_ESI] = tf->tf_rsi; gr[_REG32_EBP] = tf->tf_rbp; gr[_REG32_EBX] = tf->tf_rbx; gr[_REG32_EDX] = tf->tf_rdx; gr[_REG32_ECX] = tf->tf_rcx; gr[_REG32_EAX] = tf->tf_rax; gr[_REG32_EIP] = tf->tf_rip; gr[_REG32_CS] = tf->tf_cs & 0xFFFF; gr[_REG32_ESP] = tf->tf_rsp; gr[_REG32_UESP] = tf->tf_rsp; gr[_REG32_SS] = tf->tf_ss & 0xFFFF; gr[_REG32_TRAPNO] = tf->tf_trapno; gr[_REG32_ERR] = tf->tf_err; if ((ras_eip = (__greg32_t)(uintptr_t)ras_lookup(l->l_proc, (void *) (uintptr_t)gr[_REG32_EIP])) != (__greg32_t)-1) gr[_REG32_EIP] = ras_eip; *flags |= _UC_CPU; mcp->_mc_tlsbase = (uint32_t)(uintptr_t)l->l_private; *flags |= _UC_TLSBASE; /* Save floating point register context. */ process_read_fpregs_xmm(l, (struct fxsave *) &mcp->__fpregs.__fp_reg_set.__fp_xmm_state); memset(&mcp->__fpregs.__fp_pad, 0, sizeof(mcp->__fpregs.__fp_pad)); *flags |= _UC_FXSAVE | _UC_FPU; } void startlwp32(void *arg) { ucontext32_t *uc = arg; lwp_t *l = curlwp; int error __diagused; error = cpu_setmcontext32(l, &uc->uc_mcontext, uc->uc_flags); KASSERT(error == 0); /* Note: we are freeing ucontext_t, not ucontext32_t. */ kmem_free(uc, sizeof(ucontext_t)); userret(l); } int check_sigcontext32(struct lwp *l, const struct netbsd32_sigcontext *scp) { struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap; struct trapframe *tf; struct pcb *pcb; tf = l->l_md.md_regs; pcb = lwp_getpcb(curlwp); if (((scp->sc_eflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0) return EINVAL; if (__predict_false(pmap->pm_ldt != NULL)) { /* Allow unfamiliar segment register values (USER_LDT). */ if (!USERMODE(scp->sc_cs)) return EINVAL; } else { if (!VALID_USER_CSEL32(scp->sc_cs)) return EINVAL; if (scp->sc_fs != 0 && !VALID_USER_DSEL32(scp->sc_fs) && !(VALID_USER_FSEL32(scp->sc_fs) && pcb->pcb_fs != 0)) return EINVAL; if (scp->sc_gs != 0 && !VALID_USER_DSEL32(scp->sc_gs) && !(VALID_USER_GSEL32(scp->sc_gs) && pcb->pcb_gs != 0)) return EINVAL; if (scp->sc_es != 0 && !VALID_USER_DSEL32(scp->sc_es)) return EINVAL; if (!VALID_USER_DSEL32(scp->sc_ds) || !VALID_USER_DSEL32(scp->sc_ss)) return EINVAL; } if (scp->sc_eip >= VM_MAXUSER_ADDRESS32) return EINVAL; return 0; } int cpu_mcontext32_validate(struct lwp *l, const mcontext32_t *mcp) { struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap; const __greg32_t *gr; struct trapframe *tf; struct pcb *pcb; gr = mcp->__gregs; tf = l->l_md.md_regs; pcb = lwp_getpcb(l); if (((gr[_REG32_EFL] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0) return EINVAL; if (__predict_false(pmap->pm_ldt != NULL)) { /* Allow unfamiliar segment register values (USER_LDT). 
*/ if (!USERMODE(gr[_REG32_CS])) return EINVAL; } else { if (!VALID_USER_CSEL32(gr[_REG32_CS])) return EINVAL; if (gr[_REG32_FS] != 0 && !VALID_USER_DSEL32(gr[_REG32_FS]) && !(VALID_USER_FSEL32(gr[_REG32_FS]) && pcb->pcb_fs != 0)) return EINVAL; if (gr[_REG32_GS] != 0 && !VALID_USER_DSEL32(gr[_REG32_GS]) && !(VALID_USER_GSEL32(gr[_REG32_GS]) && pcb->pcb_gs != 0)) return EINVAL; if (gr[_REG32_ES] != 0 && !VALID_USER_DSEL32(gr[_REG32_ES])) return EINVAL; if (!VALID_USER_DSEL32(gr[_REG32_DS]) || !VALID_USER_DSEL32(gr[_REG32_SS])) return EINVAL; } if (gr[_REG32_EIP] >= VM_MAXUSER_ADDRESS32) return EINVAL; return 0; } static int cpu_mcontext32from64_validate(struct lwp *l, const struct reg *regp) { mcontext32_t mc; __greg32_t *gr32 = mc.__gregs; const __greg_t *gr = regp->regs; memset(&mc, 0, sizeof(mc)); gr32[_REG32_EFL] = gr[_REG_RFLAGS]; gr32[_REG32_EIP] = gr[_REG_RIP]; gr32[_REG32_CS] = gr[_REG_CS]; gr32[_REG32_DS] = gr[_REG_DS]; gr32[_REG32_ES] = gr[_REG_ES]; gr32[_REG32_FS] = gr[_REG_FS]; gr32[_REG32_GS] = gr[_REG_GS]; gr32[_REG32_SS] = gr[_REG_SS]; return cpu_mcontext32_validate(l, &mc); } vaddr_t netbsd32_vm_default_addr(struct proc *p, vaddr_t base, vsize_t sz, int topdown) { if (topdown) return VM_DEFAULT_ADDRESS32_TOPDOWN(base, sz); else return VM_DEFAULT_ADDRESS32_BOTTOMUP(base, sz); } static const char * netbsd32_machine32(void) { return machine32; } void netbsd32_machdep_md_init(void) { MODULE_HOOK_SET(netbsd32_machine32_hook, netbsd32_machine32); MODULE_HOOK_SET(netbsd32_reg_validate_hook, cpu_mcontext32from64_validate); MODULE_HOOK_SET(netbsd32_process_doxmmregs_hook, netbsd32_process_doxmmregs); } void netbsd32_machdep_md_fini(void) { MODULE_HOOK_UNSET(netbsd32_machine32_hook); MODULE_HOOK_UNSET(netbsd32_reg_validate_hook); MODULE_HOOK_UNSET(netbsd32_process_doxmmregs_hook); }
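/*
 * Illustrative sketch, not part of netbsd32_machdep.c: how MI code can
 * consume one of the hooks installed by netbsd32_machdep_md_init()
 * above.  MODULE_HOOK_CALL() (from <sys/compat_stub.h>) stores the
 * hook's return value in the last argument, or the supplied default
 * when the compat module is not loaded.  example_machine_name() and
 * the use of the native "machine" string as the fallback are
 * assumptions for illustration only.
 */
static const char *
example_machine_name(void)
{
	const char *name;

	MODULE_HOOK_CALL(netbsd32_machine32_hook, (), machine, name);
	return name;
}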
/* $NetBSD: sys_pset.c,v 1.24 2020/05/23 23:42:43 ad Exp $ */ /* * Copyright (c) 2008, Mindaugas Rasiukevicius <rmind at NetBSD org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Implementation of the Processor Sets. * * Locking * The array of the processor-set structures and its members are protected * by the global cpu_lock. Note that in scheduler, the very l_psid value * might be used without lock held. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_pset.c,v 1.24 2020/05/23 23:42:43 ad Exp $"); #include <sys/param.h> #include <sys/cpu.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/lwp.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/pset.h> #include <sys/sched.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/types.h> static pset_info_t ** psets; static u_int psets_max; static u_int psets_count; static kauth_listener_t psets_listener; static int psets_realloc(int); static int psid_validate(psetid_t, bool); static int kern_pset_create(psetid_t *); static int kern_pset_destroy(psetid_t); static int psets_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { psetid_t id; enum kauth_system_req req; int result; result = KAUTH_RESULT_DEFER; req = (enum kauth_system_req)(uintptr_t)arg0; id = (psetid_t)(uintptr_t)arg1; if (action != KAUTH_SYSTEM_PSET) return result; if ((req == KAUTH_REQ_SYSTEM_PSET_ASSIGN) || (req == KAUTH_REQ_SYSTEM_PSET_BIND)) { if (id == PS_QUERY) result = KAUTH_RESULT_ALLOW; } return result; } /* * Initialization of the processor-sets. */ void psets_init(void) { psets_max = uimax(maxcpus, 32); psets = kmem_zalloc(psets_max * sizeof(void *), KM_SLEEP); psets_count = 0; psets_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, psets_listener_cb, NULL); } /* * Reallocate the array of the processor-set structures. */ static int psets_realloc(int new_psets_max) { pset_info_t **new_psets, **old_psets; const u_int newsize = new_psets_max * sizeof(void *); u_int i, oldsize; if (new_psets_max < 1) return EINVAL; new_psets = kmem_zalloc(newsize, KM_SLEEP); mutex_enter(&cpu_lock); old_psets = psets; oldsize = psets_max * sizeof(void *); /* Check if we can lower the size of the array */ if (new_psets_max < psets_max) { for (i = new_psets_max; i < psets_max; i++) { if (psets[i] == NULL) continue; mutex_exit(&cpu_lock); kmem_free(new_psets, newsize); return EBUSY; } } /* Copy all pointers to the new array */ memcpy(new_psets, psets, newsize); psets_max = new_psets_max; psets = new_psets; mutex_exit(&cpu_lock); kmem_free(old_psets, oldsize); return 0; } /* * Validate processor-set ID. */ static int psid_validate(psetid_t psid, bool chkps) { KASSERT(mutex_owned(&cpu_lock)); if (chkps && (psid == PS_NONE || psid == PS_QUERY || psid == PS_MYID)) return 0; if (psid <= 0 || psid > psets_max) return EINVAL; if (psets[psid - 1] == NULL) return EINVAL; return 0; } /* * Create a processor-set. 
*/ static int kern_pset_create(psetid_t *psid) { pset_info_t *pi; u_int i; if (psets_count == psets_max) return ENOMEM; pi = kmem_zalloc(sizeof(pset_info_t), KM_SLEEP); mutex_enter(&cpu_lock); if (psets_count == psets_max) { mutex_exit(&cpu_lock); kmem_free(pi, sizeof(pset_info_t)); return ENOMEM; } /* Find a free entry in the array */ for (i = 0; i < psets_max; i++) if (psets[i] == NULL) break; KASSERT(i != psets_max); psets[i] = pi; psets_count++; mutex_exit(&cpu_lock); *psid = i + 1; return 0; } /* * Destroy a processor-set. */ static int kern_pset_destroy(psetid_t psid) { struct cpu_info *ci; struct lwp *l; CPU_INFO_ITERATOR cii; int error; mutex_enter(&cpu_lock); if (psid == PS_MYID) { /* Use caller's processor-set ID */ psid = curlwp->l_psid; } error = psid_validate(psid, false); if (error) { mutex_exit(&cpu_lock); return error; } /* Release the processor-set from all CPUs */ for (CPU_INFO_FOREACH(cii, ci)) { struct schedstate_percpu *spc; spc = &ci->ci_schedstate; if (spc->spc_psid != psid) continue; spc->spc_psid = PS_NONE; } /* Unmark the processor-set ID from each thread */ mutex_enter(&proc_lock); LIST_FOREACH(l, &alllwp, l_list) { /* Safe to check and set without lock held */ if (l->l_psid != psid) continue; l->l_psid = PS_NONE; } mutex_exit(&proc_lock); /* Destroy the processor-set */ kmem_free(psets[psid - 1], sizeof(pset_info_t)); psets[psid - 1] = NULL; psets_count--; mutex_exit(&cpu_lock); return 0; } /* * General system calls for the processor-sets. */ int sys_pset_create(struct lwp *l, const struct sys_pset_create_args *uap, register_t *retval) { /* { syscallarg(psetid_t) *psid; } */ psetid_t psid; int error; /* Available only for super-user */ if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET, KAUTH_REQ_SYSTEM_PSET_CREATE, NULL, NULL, NULL)) return EPERM; error = kern_pset_create(&psid); if (error) return error; error = copyout(&psid, SCARG(uap, psid), sizeof(psetid_t)); if (error) (void)kern_pset_destroy(psid); return error; } int sys_pset_destroy(struct lwp *l, const struct sys_pset_destroy_args *uap, register_t *retval) { /* { syscallarg(psetid_t) psid; } */ /* Available only for super-user */ if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET, KAUTH_REQ_SYSTEM_PSET_DESTROY, KAUTH_ARG(SCARG(uap, psid)), NULL, NULL)) return EPERM; return kern_pset_destroy(SCARG(uap, psid)); } int sys_pset_assign(struct lwp *l, const struct sys_pset_assign_args *uap, register_t *retval) { /* { syscallarg(psetid_t) psid; syscallarg(cpuid_t) cpuid; syscallarg(psetid_t) *opsid; } */ struct cpu_info *ici, *ci = NULL; struct schedstate_percpu *spc = NULL; struct lwp *t; psetid_t psid = SCARG(uap, psid), opsid = 0; CPU_INFO_ITERATOR cii; int error = 0, nnone = 0; /* Available only for super-user, except the case of PS_QUERY */ if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET, KAUTH_REQ_SYSTEM_PSET_ASSIGN, KAUTH_ARG(SCARG(uap, psid)), NULL, NULL)) return EPERM; /* Find the target CPU */ mutex_enter(&cpu_lock); for (CPU_INFO_FOREACH(cii, ici)) { struct schedstate_percpu *ispc; ispc = &ici->ci_schedstate; if (cpu_index(ici) == SCARG(uap, cpuid)) { ci = ici; spc = ispc; } nnone += (ispc->spc_psid == PS_NONE); } if (ci == NULL) { mutex_exit(&cpu_lock); return EINVAL; } error = psid_validate(psid, true); if (error) { mutex_exit(&cpu_lock); return error; } opsid = spc->spc_psid; switch (psid) { case PS_QUERY: break; case PS_MYID: psid = curlwp->l_psid; /* FALLTHROUGH */ default: /* * Just finish if old and new processor-sets are * the same. 
*/ if (spc->spc_psid == psid) break; /* * Ensure at least one CPU stays in the default set, * and that specified CPU is not offline. */ if (psid != PS_NONE && ((spc->spc_flags & SPCF_OFFLINE) || (nnone == 1 && spc->spc_psid == PS_NONE))) { mutex_exit(&cpu_lock); return EBUSY; } mutex_enter(&proc_lock); /* * Ensure that none of the threads are using affinity mask * with this target CPU in it. */ LIST_FOREACH(t, &alllwp, l_list) { if (t->l_affinity == NULL) { continue; } lwp_lock(t); if (t->l_affinity == NULL) { lwp_unlock(t); continue; } if (kcpuset_isset(t->l_affinity, cpu_index(ci))) { lwp_unlock(t); mutex_exit(&proc_lock); mutex_exit(&cpu_lock); return EPERM; } lwp_unlock(t); } /* * Set the processor-set ID. * Migrate out any threads running on this CPU. */ spc->spc_psid = psid; LIST_FOREACH(t, &alllwp, l_list) { struct cpu_info *tci; if (t->l_cpu != ci) continue; if (t->l_pflag & (LP_BOUND | LP_INTR)) continue; lwp_lock(t); tci = sched_takecpu(t); KASSERT(tci != ci); lwp_migrate(t, tci); } mutex_exit(&proc_lock); break; } mutex_exit(&cpu_lock); if (SCARG(uap, opsid) != NULL) error = copyout(&opsid, SCARG(uap, opsid), sizeof(psetid_t)); return error; } int sys__pset_bind(struct lwp *l, const struct sys__pset_bind_args *uap, register_t *retval) { /* { syscallarg(idtype_t) idtype; syscallarg(id_t) first_id; syscallarg(id_t) second_id; syscallarg(psetid_t) psid; syscallarg(psetid_t) *opsid; } */ struct cpu_info *ci; struct proc *p; struct lwp *t; id_t id1, id2; pid_t pid = 0; lwpid_t lid = 0; psetid_t psid, opsid; int error = 0, lcnt; psid = SCARG(uap, psid); /* Available only for super-user, except the case of PS_QUERY */ if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET, KAUTH_REQ_SYSTEM_PSET_BIND, KAUTH_ARG(SCARG(uap, psid)), NULL, NULL)) return EPERM; mutex_enter(&cpu_lock); error = psid_validate(psid, true); if (error) { mutex_exit(&cpu_lock); return error; } if (psid == PS_MYID) psid = curlwp->l_psid; /* * Get PID and LID from the ID. */ p = l->l_proc; id1 = SCARG(uap, first_id); id2 = SCARG(uap, second_id); mutex_enter(&proc_lock); switch (SCARG(uap, idtype)) { case P_PID: /* * Process: * First ID - PID; * Second ID - ignored; */ pid = (id1 == P_MYID) ? p->p_pid : id1; lid = 0; break; case P_LWPID: /* * Thread (LWP): * First ID - LID; * Second ID - PID; */ if (id1 == P_MYID) { pid = p->p_pid; lid = l->l_lid; break; } lid = id1; pid = (id2 == P_MYID) ? p->p_pid : id2; break; default: error = EINVAL; goto error; } /* Find the process */ p = proc_find(pid); if (p == NULL) { error = ESRCH; goto error; } /* Disallow modification of the system processes */ if (p->p_flag & PK_SYSTEM) { error = EPERM; goto error; } /* Find the LWP(s) */ lcnt = 0; ci = NULL; mutex_enter(p->p_lock); LIST_FOREACH(t, &p->p_lwps, l_sibling) { if (lid && lid != t->l_lid) continue; /* * Bind the thread to the processor-set, * take some CPU and migrate. */ lwp_lock(t); opsid = t->l_psid; t->l_psid = psid; ci = sched_takecpu(t); /* Unlocks LWP */ lwp_migrate(t, ci); lcnt++; } mutex_exit(p->p_lock); if (lcnt == 0) { error = ESRCH; } error: mutex_exit(&proc_lock); mutex_exit(&cpu_lock); if (error == 0 && SCARG(uap, opsid)) error = copyout(&opsid, SCARG(uap, opsid), sizeof(psetid_t)); return error; } /* * Sysctl nodes and initialization. 
*/ static int sysctl_psets_max(SYSCTLFN_ARGS) { struct sysctlnode node; int error, newsize; node = *rnode; node.sysctl_data = &newsize; newsize = psets_max; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (newsize <= 0) return EINVAL; sysctl_unlock(); error = psets_realloc(newsize); sysctl_relock(); return error; } static int sysctl_psets_list(SYSCTLFN_ARGS) { const size_t bufsz = 1024; char *buf, tbuf[16]; int i, error; size_t len; sysctl_unlock(); buf = kmem_alloc(bufsz, KM_SLEEP); snprintf(buf, bufsz, "%d:1", PS_NONE); /* XXX */ mutex_enter(&cpu_lock); for (i = 0; i < psets_max; i++) { if (psets[i] == NULL) continue; snprintf(tbuf, sizeof(tbuf), ",%d:2", i + 1); /* XXX */ strlcat(buf, tbuf, bufsz); } mutex_exit(&cpu_lock); len = strlen(buf) + 1; error = 0; if (oldp != NULL) error = copyout(buf, oldp, uimin(len, *oldlenp)); *oldlenp = len; kmem_free(buf, bufsz); sysctl_relock(); return error; } SYSCTL_SETUP(sysctl_pset_setup, "sysctl kern.pset subtree setup") { const struct sysctlnode *node = NULL; sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "pset", SYSCTL_DESCR("Processor-set options"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); if (node == NULL) return; sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "psets_max", SYSCTL_DESCR("Maximal count of the processor-sets"), sysctl_psets_max, 0, &psets_max, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "list", SYSCTL_DESCR("List of active sets"), sysctl_psets_list, 0, NULL, 0, CTL_CREATE, CTL_EOL); }
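/*
 * Illustrative userland sketch, not part of sys_pset.c: exercising the
 * system calls above through the pset(3) wrappers.  Creating,
 * assigning and binding require sufficient privilege, and CPU index 1
 * is an arbitrary choice, so treat this as a sketch of the call
 * sequence rather than a ready-to-ship program.
 */
#include <sys/pset.h>
#include <sys/idtype.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
	psetid_t psid;

	if (pset_create(&psid) == -1)
		err(1, "pset_create");

	/* Move CPU 1 out of the default set into the new one. */
	if (pset_assign(psid, 1, NULL) == -1)
		err(1, "pset_assign");

	/* Run this process only on CPUs that belong to the new set. */
	if (pset_bind(psid, P_PID, getpid(), NULL) == -1)
		err(1, "pset_bind");

	/* ... do the pinned work ... */

	if (pset_destroy(psid) == -1)
		err(1, "pset_destroy");
	return 0;
}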
/* $NetBSD: kern_ntptime.c,v 1.64 2022/10/26 23:23:52 riastradh Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- *********************************************************************** * * * Copyright (c) David L. Mills 1993-2001 * * * * Permission to use, copy, modify, and distribute this software and * * its documentation for any purpose and without fee is hereby * * granted, provided that the above copyright notice appears in all * * copies and that both the copyright notice and this permission * * notice appear in supporting documentation, and that the name * * University of Delaware not be used in advertising or publicity * * pertaining to distribution of the software without specific, * * written prior permission. The University of Delaware makes no * * representations about the suitability this software for any * * purpose. It is provided "as is" without express or implied * * warranty. * * * **********************************************************************/ /* * Adapted from the original sources for FreeBSD and timecounters by: * Poul-Henning Kamp <phk@FreeBSD.org>. * * The 32bit version of the "LP" macros seems a bit past its "sell by" * date so I have retained only the 64bit version and included it directly * in this file. * * Only minor changes done to interface with the timecounters over in * sys/kern/kern_clock.c. Some of the comments below may be (even more) * confusing and/or plain wrong in that context.
*/ #include <sys/cdefs.h> /* __FBSDID("$FreeBSD: src/sys/kern/kern_ntptime.c,v 1.59 2005/05/28 14:34:41 rwatson Exp $"); */ __KERNEL_RCSID(0, "$NetBSD: kern_ntptime.c,v 1.64 2022/10/26 23:23:52 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_ntp.h" #endif #include <sys/param.h> #include <sys/resourcevar.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/sysctl.h> #include <sys/timex.h> #include <sys/vnode.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <sys/cpu.h> #include <compat/sys/timex.h> /* * Single-precision macros for 64-bit machines */ typedef int64_t l_fp; #define L_ADD(v, u) ((v) += (u)) #define L_SUB(v, u) ((v) -= (u)) #define L_ADDHI(v, a) ((v) += (int64_t)(a) << 32) #define L_NEG(v) ((v) = -(v)) #define L_RSHIFT(v, n) \ do { \ if ((v) < 0) \ (v) = -(-(v) >> (n)); \ else \ (v) = (v) >> (n); \ } while (0) #define L_MPY(v, a) ((v) *= (a)) #define L_CLR(v) ((v) = 0) #define L_ISNEG(v) ((v) < 0) #define L_LINT(v, a) ((v) = (int64_t)((uint64_t)(a) << 32)) #define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32) #ifdef NTP /* * Generic NTP kernel interface * * These routines constitute the Network Time Protocol (NTP) interfaces * for user and daemon application programs. The ntp_gettime() routine * provides the time, maximum error (synch distance) and estimated error * (dispersion) to client user application programs. The ntp_adjtime() * routine is used by the NTP daemon to adjust the system clock to an * externally derived time. The time offset and related variables set by * this routine are used by other routines in this module to adjust the * phase and frequency of the clock discipline loop which controls the * system clock. * * When the kernel time is reckoned directly in nanoseconds (NTP_NANO * defined), the time at each tick interrupt is derived directly from * the kernel time variable. When the kernel time is reckoned in * microseconds, (NTP_NANO undefined), the time is derived from the * kernel time variable together with a variable representing the * leftover nanoseconds at the last tick interrupt. In either case, the * current nanosecond time is reckoned from these values plus an * interpolated value derived by the clock routines in another * architecture-specific module. The interpolation can use either a * dedicated counter or a processor cycle counter (PCC) implemented in * some architectures. * * Note that all routines must run at priority splclock or higher. */ /* * Phase/frequency-lock loop (PLL/FLL) definitions * * The nanosecond clock discipline uses two variable types, time * variables and frequency variables. Both types are represented as 64- * bit fixed-point quantities with the decimal point between two 32-bit * halves. On a 32-bit machine, each half is represented as a single * word and mathematical operations are done using multiple-precision * arithmetic. On a 64-bit machine, ordinary computer arithmetic is * used. * * A time variable is a signed 64-bit fixed-point number in ns and * fraction. It represents the remaining time offset to be amortized * over succeeding tick interrupts. The maximum time offset is about * 0.5 s and the resolution is about 2.3e-10 ns. 
* * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |s s s| ns | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | fraction | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * A frequency variable is a signed 64-bit fixed-point number in ns/s * and fraction. It represents the ns and fraction to be added to the * kernel time variable at each second. The maximum frequency offset is * about +-500000 ns/s and the resolution is about 2.3e-10 ns/s. * * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |s s s s s s s s s s s s s| ns/s | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | fraction | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ /* * The following variables establish the state of the PLL/FLL and the * residual time and frequency offset of the local clock. */ #define SHIFT_PLL 4 /* PLL loop gain (shift) */ #define SHIFT_FLL 2 /* FLL loop gain (shift) */ static int time_state = TIME_OK; /* clock state */ static int time_status = STA_UNSYNC; /* clock status bits */ static long time_tai; /* TAI offset (s) */ static long time_monitor; /* last time offset scaled (ns) */ static long time_constant; /* poll interval (shift) (s) */ static long time_precision = 1; /* clock precision (ns) */ static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */ static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */ static time_t time_reftime; /* time at last adjustment (s) */ static l_fp time_offset; /* time offset (ns) */ static l_fp time_freq; /* frequency offset (ns/s) */ #endif /* NTP */ static l_fp time_adj; /* tick adjust (ns/s) */ int64_t time_adjtime; /* correction from adjtime(2) (usec) */ #ifdef NTP #ifdef PPS_SYNC /* * The following variables are used when a pulse-per-second (PPS) signal * is available and connected via a modem control lead. They establish * the engineering parameters of the clock discipline loop when * controlled by the PPS signal. 
*/ #define PPS_FAVG 2 /* min freq avg interval (s) (shift) */ #define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */ #define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */ #define PPS_PAVG 4 /* phase avg interval (s) (shift) */ #define PPS_VALID 120 /* PPS signal watchdog max (s) */ #define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */ #define PPS_POPCORN 2 /* popcorn spike threshold (shift) */ static struct timespec pps_tf[3]; /* phase median filter */ static l_fp pps_freq; /* scaled frequency offset (ns/s) */ static long pps_fcount; /* frequency accumulator */ static long pps_jitter; /* nominal jitter (ns) */ static long pps_stabil; /* nominal stability (scaled ns/s) */ static long pps_lastsec; /* time at last calibration (s) */ static int pps_valid; /* signal watchdog counter */ static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */ static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */ static int pps_intcnt; /* wander counter */ /* * PPS signal quality monitors */ static long pps_calcnt; /* calibration intervals */ static long pps_jitcnt; /* jitter limit exceeded */ static long pps_stbcnt; /* stability limit exceeded */ static long pps_errcnt; /* calibration errors */ #endif /* PPS_SYNC */ /* * End of phase/frequency-lock loop (PLL/FLL) definitions */ static void hardupdate(long offset); /* * ntp_gettime() - NTP user application interface */ void ntp_gettime(struct ntptimeval *ntv) { memset(ntv, 0, sizeof(*ntv)); mutex_spin_enter(&timecounter_lock); nanotime(&ntv->time); ntv->maxerror = time_maxerror; ntv->esterror = time_esterror; ntv->tai = time_tai; ntv->time_state = time_state; mutex_spin_exit(&timecounter_lock); } /* ARGSUSED */ /* * ntp_adjtime() - NTP daemon application interface */ int sys_ntp_adjtime(struct lwp *l, const struct sys_ntp_adjtime_args *uap, register_t *retval) { /* { syscallarg(struct timex *) tp; } */ struct timex ntv; int error; error = copyin((void *)SCARG(uap, tp), (void *)&ntv, sizeof(ntv)); if (error != 0) return (error); if (ntv.modes != 0 && (error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_NTPADJTIME, NULL, NULL, NULL)) != 0) return (error); ntp_adjtime1(&ntv); error = copyout((void *)&ntv, (void *)SCARG(uap, tp), sizeof(ntv)); if (!error) *retval = ntp_timestatus(); return error; } void ntp_adjtime1(struct timex *ntv) { long freq; int modes; /* * Update selected clock variables - only the superuser can * change anything. Note that there is no error checking here on * the assumption the superuser should know what it is doing. * Note that either the time constant or TAI offset are loaded * from the ntv.constant member, depending on the mode bits. If * the STA_PLL bit in the status word is cleared, the state and * status words are reset to the initial values at boot. 
*/ mutex_spin_enter(&timecounter_lock); modes = ntv->modes; if (modes != 0) /* We need to save the system time during shutdown */ time_adjusted |= 2; if (modes & MOD_MAXERROR) time_maxerror = ntv->maxerror; if (modes & MOD_ESTERROR) time_esterror = ntv->esterror; if (modes & MOD_STATUS) { if (time_status & STA_PLL && !(ntv->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; #ifdef PPS_SYNC pps_shift = PPS_FAVG; #endif /* PPS_SYNC */ } time_status &= STA_RONLY; time_status |= ntv->status & ~STA_RONLY; } if (modes & MOD_TIMECONST) { if (ntv->constant < 0) time_constant = 0; else if (ntv->constant > MAXTC) time_constant = MAXTC; else time_constant = ntv->constant; } if (modes & MOD_TAI) { if (ntv->constant > 0) /* XXX zero & negative numbers ? */ time_tai = ntv->constant; } #ifdef PPS_SYNC if (modes & MOD_PPSMAX) { if (ntv->shift < PPS_FAVG) pps_shiftmax = PPS_FAVG; else if (ntv->shift > PPS_FAVGMAX) pps_shiftmax = PPS_FAVGMAX; else pps_shiftmax = ntv->shift; } #endif /* PPS_SYNC */ if (modes & MOD_NANO) time_status |= STA_NANO; if (modes & MOD_MICRO) time_status &= ~STA_NANO; if (modes & MOD_CLKB) time_status |= STA_CLK; if (modes & MOD_CLKA) time_status &= ~STA_CLK; if (modes & MOD_FREQUENCY) { freq = MIN(INT32_MAX, MAX(INT32_MIN, ntv->freq)); freq = (freq * (int64_t)1000) >> 16; if (freq > MAXFREQ) L_LINT(time_freq, MAXFREQ); else if (freq < -MAXFREQ) L_LINT(time_freq, -MAXFREQ); else { /* * ntv.freq is [PPM * 2^16] = [us/s * 2^16] * time_freq is [ns/s * 2^32] */ time_freq = ntv->freq * 1000LL * 65536LL; } #ifdef PPS_SYNC pps_freq = time_freq; #endif /* PPS_SYNC */ } if (modes & MOD_OFFSET) { if (time_status & STA_NANO) { hardupdate(ntv->offset); } else { long offset = ntv->offset; offset = MIN(offset, MAXPHASE/1000); offset = MAX(offset, -MAXPHASE/1000); hardupdate(offset * 1000); } } /* * Retrieve all clock variables. Note that the TAI offset is * returned only by ntp_gettime(); */ if (time_status & STA_NANO) ntv->offset = L_GINT(time_offset); else ntv->offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */ if (time_freq < 0) ntv->freq = L_GINT(-((-time_freq / 1000LL) << 16)); else ntv->freq = L_GINT((time_freq / 1000LL) << 16); ntv->maxerror = time_maxerror; ntv->esterror = time_esterror; ntv->status = time_status; ntv->constant = time_constant; if (time_status & STA_NANO) ntv->precision = time_precision; else ntv->precision = time_precision / 1000; ntv->tolerance = MAXFREQ * SCALE_PPM; #ifdef PPS_SYNC ntv->shift = pps_shift; ntv->ppsfreq = L_GINT((pps_freq / 1000LL) << 16); if (time_status & STA_NANO) ntv->jitter = pps_jitter; else ntv->jitter = pps_jitter / 1000; ntv->stabil = pps_stabil; ntv->calcnt = pps_calcnt; ntv->errcnt = pps_errcnt; ntv->jitcnt = pps_jitcnt; ntv->stbcnt = pps_stbcnt; #endif /* PPS_SYNC */ mutex_spin_exit(&timecounter_lock); } #endif /* NTP */ /* * second_overflow() - called after ntp_tick_adjust() * * This routine is ordinarily called immediately following the above * routine ntp_tick_adjust(). While these two routines are normally * combined, they are separated here only for the purposes of * simulation. */ void ntp_update_second(int64_t *adjustment, time_t *newsec) { int tickrate; l_fp ftemp; /* 32/64-bit temporary */ KASSERT(mutex_owned(&timecounter_lock)); #ifdef NTP /* * On rollover of the second both the nanosecond and microsecond * clocks are updated and the state machine cranked as * necessary. The phase adjustment to be used for the next * second is calculated and the maximum error is increased by * the tolerance. 
*/ time_maxerror += MAXFREQ / 1000; /* * Leap second processing. If in leap-insert state at * the end of the day, the system clock is set back one * second; if in leap-delete state, the system clock is * set ahead one second. The nano_time() routine or * external clock driver will insure that reported time * is always monotonic. */ switch (time_state) { /* * No warning. */ case TIME_OK: if (time_status & STA_INS) time_state = TIME_INS; else if (time_status & STA_DEL) time_state = TIME_DEL; break; /* * Insert second 23:59:60 following second * 23:59:59. */ case TIME_INS: if (!(time_status & STA_INS)) time_state = TIME_OK; else if ((*newsec) % 86400 == 0) { (*newsec)--; time_state = TIME_OOP; time_tai++; } break; /* * Delete second 23:59:59. */ case TIME_DEL: if (!(time_status & STA_DEL)) time_state = TIME_OK; else if (((*newsec) + 1) % 86400 == 0) { (*newsec)++; time_tai--; time_state = TIME_WAIT; } break; /* * Insert second in progress. */ case TIME_OOP: time_state = TIME_WAIT; break; /* * Wait for status bits to clear. */ case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; } /* * Compute the total time adjustment for the next second * in ns. The offset is reduced by a factor depending on * whether the PPS signal is operating. Note that the * value is in effect scaled by the clock frequency, * since the adjustment is added at each tick interrupt. */ ftemp = time_offset; #ifdef PPS_SYNC /* XXX even if PPS signal dies we should finish adjustment ? */ if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) L_RSHIFT(ftemp, pps_shift); else L_RSHIFT(ftemp, SHIFT_PLL + time_constant); #else L_RSHIFT(ftemp, SHIFT_PLL + time_constant); #endif /* PPS_SYNC */ time_adj = ftemp; L_SUB(time_offset, ftemp); L_ADD(time_adj, time_freq); #ifdef PPS_SYNC if (pps_valid > 0) pps_valid--; else time_status &= ~STA_PPSSIGNAL; #endif /* PPS_SYNC */ #else /* !NTP */ L_CLR(time_adj); #endif /* !NTP */ /* * Apply any correction from adjtime(2). If more than one second * off we slew at a rate of 5ms/s (5000 PPM) else 500us/s (500PPM) * until the last second is slewed the final < 500 usecs. */ if (time_adjtime != 0) { if (time_adjtime > 1000000) tickrate = 5000; else if (time_adjtime < -1000000) tickrate = -5000; else if (time_adjtime > 500) tickrate = 500; else if (time_adjtime < -500) tickrate = -500; else tickrate = time_adjtime; time_adjtime -= tickrate; L_LINT(ftemp, tickrate * 1000); L_ADD(time_adj, ftemp); } *adjustment = time_adj; } /* * ntp_init() - initialize variables and structures * * This routine must be called after the kernel variables hz and tick * are set or changed and before the next tick interrupt. In this * particular implementation, these values are assumed set elsewhere in * the kernel. The design allows the clock frequency and tick interval * to be changed while the system is running. So, this routine should * probably be integrated with the code that does that. */ void ntp_init(void) { /* * The following variables are initialized only at startup. Only * those structures not cleared by the compiler need to be * initialized, and these only in the simulator. In the actual * kernel, any nonzero values here will quickly evaporate. 
*/ L_CLR(time_adj); #ifdef NTP L_CLR(time_offset); L_CLR(time_freq); #ifdef PPS_SYNC pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0; pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0; pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0; pps_fcount = 0; L_CLR(pps_freq); #endif /* PPS_SYNC */ #endif } #ifdef NTP /* * hardupdate() - local clock update * * This routine is called by ntp_adjtime() to update the local clock * phase and frequency. The implementation is of an adaptive-parameter, * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new * time and frequency offset estimates for each call. If the kernel PPS * discipline code is configured (PPS_SYNC), the PPS signal itself * determines the new time offset, instead of the calling argument. * Presumably, calls to ntp_adjtime() occur only when the caller * believes the local clock is valid within some bound (+-128 ms with * NTP). If the caller's time is far different than the PPS time, an * argument will ensue, and it's not clear who will lose. * * For uncompensated quartz crystal oscillators and nominal update * intervals less than 256 s, operation should be in phase-lock mode, * where the loop is disciplined to phase. For update intervals greater * than 1024 s, operation should be in frequency-lock mode, where the * loop is disciplined to frequency. Between 256 s and 1024 s, the mode * is selected by the STA_MODE status bit. * * Note: splclock() is in effect. */ void hardupdate(long offset) { long mtemp; l_fp ftemp; KASSERT(mutex_owned(&timecounter_lock)); /* * Select how the phase is to be controlled and from which * source. If the PPS signal is present and enabled to * discipline the time, the PPS offset is used; otherwise, the * argument offset is used. */ if (!(time_status & STA_PLL)) return; if (!(time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)) { if (offset > MAXPHASE) time_monitor = MAXPHASE; else if (offset < -MAXPHASE) time_monitor = -MAXPHASE; else time_monitor = offset; L_LINT(time_offset, time_monitor); } /* * Select how the frequency is to be controlled and in which * mode (PLL or FLL). If the PPS signal is present and enabled * to discipline the frequency, the PPS frequency is used; * otherwise, the argument offset is used to compute it. */ if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) { time_reftime = time_second; return; } if (time_status & STA_FREQHOLD || time_reftime == 0) time_reftime = time_second; mtemp = time_second - time_reftime; L_LINT(ftemp, time_monitor); L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1); L_MPY(ftemp, mtemp); L_ADD(time_freq, ftemp); time_status &= ~STA_MODE; if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { L_LINT(ftemp, (time_monitor << 4) / mtemp); L_RSHIFT(ftemp, SHIFT_FLL + 4); L_ADD(time_freq, ftemp); time_status |= STA_MODE; } time_reftime = time_second; if (L_GINT(time_freq) > MAXFREQ) L_LINT(time_freq, MAXFREQ); else if (L_GINT(time_freq) < -MAXFREQ) L_LINT(time_freq, -MAXFREQ); } #ifdef PPS_SYNC /* * hardpps() - discipline CPU clock oscillator to external PPS signal * * This routine is called at each PPS interrupt in order to discipline * the CPU clock oscillator to the PPS signal. It measures the PPS phase * and leaves it in a handy spot for the hardclock() routine. It * integrates successive PPS phase differences and calculates the * frequency offset. This is used in hardclock() to discipline the CPU * clock oscillator so that intrinsic frequency error is cancelled out. 
* The code requires the caller to capture the time and hardware counter * value at the on-time PPS signal transition. * * Note that, on some Unix systems, this routine runs at an interrupt * priority level higher than the timer interrupt routine hardclock(). * Therefore, the variables used are distinct from the hardclock() * variables, except for certain exceptions: The PPS frequency pps_freq * and phase pps_offset variables are determined by this routine and * updated atomically. The time_tolerance variable can be considered a * constant, since it is infrequently changed, and then only when the * PPS signal is disabled. The watchdog counter pps_valid is updated * once per second by hardclock() and is atomically cleared in this * routine. */ void hardpps(struct timespec *tsp, /* time at PPS */ long nsec /* hardware counter at PPS */) { long u_sec, u_nsec, v_nsec; /* temps */ l_fp ftemp; KASSERT(mutex_owned(&timecounter_lock)); /* * The signal is first processed by a range gate and frequency * discriminator. The range gate rejects noise spikes outside * the range +-500 us. The frequency discriminator rejects input * signals with apparent frequency outside the range 1 +-500 * PPM. If two hits occur in the same second, we ignore the * later hit; if not and a hit occurs outside the range gate, * keep the later hit for later comparison, but do not process * it. */ time_status |= STA_PPSSIGNAL | STA_PPSJITTER; time_status &= ~(STA_PPSWANDER | STA_PPSERROR); pps_valid = PPS_VALID; u_sec = tsp->tv_sec; u_nsec = tsp->tv_nsec; if (u_nsec >= (NANOSECOND >> 1)) { u_nsec -= NANOSECOND; u_sec++; } v_nsec = u_nsec - pps_tf[0].tv_nsec; if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND - MAXFREQ) return; pps_tf[2] = pps_tf[1]; pps_tf[1] = pps_tf[0]; pps_tf[0].tv_sec = u_sec; pps_tf[0].tv_nsec = u_nsec; /* * Compute the difference between the current and previous * counter values. If the difference exceeds 0.5 s, assume it * has wrapped around, so correct 1.0 s. If the result exceeds * the tick interval, the sample point has crossed a tick * boundary during the last second, so correct the tick. Very * intricate. */ u_nsec = nsec; if (u_nsec > (NANOSECOND >> 1)) u_nsec -= NANOSECOND; else if (u_nsec < -(NANOSECOND >> 1)) u_nsec += NANOSECOND; pps_fcount += u_nsec; if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ) return; time_status &= ~STA_PPSJITTER; /* * A three-stage median filter is used to help denoise the PPS * time. The median sample becomes the time offset estimate; the * difference between the other two samples becomes the time * dispersion (jitter) estimate. */ if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) { if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) { v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */ u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec; } else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) { v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */ u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec; } else { v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */ u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec; } } else { if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) { v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */ u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec; } else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) { v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */ u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec; } else { v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */ u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec; } } /* * Nominal jitter is due to PPS signal noise and interrupt * latency. If it exceeds the popcorn threshold, the sample is * discarded. 
otherwise, if so enabled, the time offset is * updated. We can tolerate a modest loss of data here without * much degrading time accuracy. */ if (u_nsec > (pps_jitter << PPS_POPCORN)) { time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { time_monitor = -v_nsec; L_LINT(time_offset, time_monitor); } pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG; u_sec = pps_tf[0].tv_sec - pps_lastsec; if (u_sec < (1 << pps_shift)) return; /* * At the end of the calibration interval the difference between * the first and last counter values becomes the scaled * frequency. It will later be divided by the length of the * interval to determine the frequency update. If the frequency * exceeds a sanity threshold, or if the actual calibration * interval is not equal to the expected length, the data are * discarded. We can tolerate a modest loss of data here without * much degrading frequency accuracy. */ pps_calcnt++; v_nsec = -pps_fcount; pps_lastsec = pps_tf[0].tv_sec; pps_fcount = 0; u_nsec = MAXFREQ << pps_shift; if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 << pps_shift)) { time_status |= STA_PPSERROR; pps_errcnt++; return; } /* * Here the raw frequency offset and wander (stability) is * calculated. If the wander is less than the wander threshold * for four consecutive averaging intervals, the interval is * doubled; if it is greater than the threshold for four * consecutive intervals, the interval is halved. The scaled * frequency offset is converted to frequency offset. The * stability metric is calculated as the average of recent * frequency changes, but is used only for performance * monitoring. */ L_LINT(ftemp, v_nsec); L_RSHIFT(ftemp, pps_shift); L_SUB(ftemp, pps_freq); u_nsec = L_GINT(ftemp); if (u_nsec > PPS_MAXWANDER) { L_LINT(ftemp, PPS_MAXWANDER); pps_intcnt--; time_status |= STA_PPSWANDER; pps_stbcnt++; } else if (u_nsec < -PPS_MAXWANDER) { L_LINT(ftemp, -PPS_MAXWANDER); pps_intcnt--; time_status |= STA_PPSWANDER; pps_stbcnt++; } else { pps_intcnt++; } if (pps_intcnt >= 4) { pps_intcnt = 4; if (pps_shift < pps_shiftmax) { pps_shift++; pps_intcnt = 0; } } else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) { pps_intcnt = -4; if (pps_shift > PPS_FAVG) { pps_shift--; pps_intcnt = 0; } } if (u_nsec < 0) u_nsec = -u_nsec; pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG; /* * The PPS frequency is recalculated and clamped to the maximum * MAXFREQ. If enabled, the system clock frequency is updated as * well. */ L_ADD(pps_freq, ftemp); u_nsec = L_GINT(pps_freq); if (u_nsec > MAXFREQ) L_LINT(pps_freq, MAXFREQ); else if (u_nsec < -MAXFREQ) L_LINT(pps_freq, -MAXFREQ); if (time_status & STA_PPSFREQ) time_freq = pps_freq; } #endif /* PPS_SYNC */ #endif /* NTP */ #ifdef NTP int ntp_timestatus(void) { int rv; /* * Status word error decode. If any of these conditions * occur, an error is returned, instead of the status * word. Most applications will care only about the fact * the system clock may not be trusted, not about the * details. 
* * Hardware or software error */ mutex_spin_enter(&timecounter_lock); if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) || /* * PPS signal lost when either time or frequency * synchronization requested */ (time_status & (STA_PPSFREQ | STA_PPSTIME) && !(time_status & STA_PPSSIGNAL)) || /* * PPS jitter exceeded when time synchronization * requested */ (time_status & STA_PPSTIME && time_status & STA_PPSJITTER) || /* * PPS wander exceeded or calibration error when * frequency synchronization requested */ (time_status & STA_PPSFREQ && time_status & (STA_PPSWANDER | STA_PPSERROR))) rv = TIME_ERROR; else rv = time_state; mutex_spin_exit(&timecounter_lock); return rv; } /*ARGSUSED*/ /* * ntp_gettime() - NTP user application interface */ int sys___ntp_gettime50(struct lwp *l, const struct sys___ntp_gettime50_args *uap, register_t *retval) { /* { syscallarg(struct ntptimeval *) ntvp; } */ struct ntptimeval ntv; int error = 0; if (SCARG(uap, ntvp)) { ntp_gettime(&ntv); error = copyout((void *)&ntv, (void *)SCARG(uap, ntvp), sizeof(ntv)); } if (!error) { *retval = ntp_timestatus(); } return(error); } /* * return information about kernel precision timekeeping */ static int sysctl_kern_ntptime(SYSCTLFN_ARGS) { struct sysctlnode node; struct ntptimeval ntv; ntp_gettime(&ntv); node = *rnode; node.sysctl_data = &ntv; node.sysctl_size = sizeof(ntv); return (sysctl_lookup(SYSCTLFN_CALL(&node))); } SYSCTL_SETUP(sysctl_kern_ntptime_setup, "sysctl kern.ntptime node setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "ntptime", SYSCTL_DESCR("Kernel clock values for NTP"), sysctl_kern_ntptime, 0, NULL, sizeof(struct ntptimeval), CTL_KERN, KERN_NTPTIME, CTL_EOL); } #endif /* !NTP */
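/*
 * A minimal stand-alone sketch (not part of kern_ntptime.c): it copies the
 * L_LINT/L_GINT definitions from above to show the fixed-point scaling used
 * by the MOD_FREQUENCY branch of ntp_adjtime1() -- a struct timex "freq"
 * value in PPM scaled by 2^16 becomes ns/s scaled by 2^32 after multiplying
 * by 1000 * 65536.  Only the two macros are taken verbatim from this file;
 * everything else in the fragment is generic demo code.
 */
#if 0	/* illustrative only -- never built as part of the kernel */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

typedef int64_t l_fp;
#define L_LINT(v, a)	((v) = (int64_t)((uint64_t)(a) << 32))
#define L_GINT(v)	((v) < 0 ? -(-(v) >> 32) : (v) >> 32)

int
main(void)
{
	l_fp freq;
	int64_t ppm16 = 100LL << 16;	/* +100 PPM in struct timex "freq" units */

	/* Same conversion as the MOD_FREQUENCY case in ntp_adjtime1(). */
	freq = ppm16 * 1000LL * 65536LL;		/* now ns/s * 2^32 */
	printf("%" PRId64 " ns/s\n", L_GINT(freq));	/* prints 100000 */

	/* Round trip through the fixed-point representation. */
	L_LINT(freq, 123);
	printf("%" PRId64 "\n", L_GINT(freq));		/* prints 123 */
	return 0;
}
#endif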
/* $NetBSD: prop_dictionary.c,v 1.46 2023/06/14 00:35:18 rin Exp $ */ /*- * Copyright (c) 2006, 2007, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "prop_object_impl.h" #include <prop/prop_array.h> #include <prop/prop_dictionary.h> #include <prop/prop_string.h> #include <sys/rbtree.h> #if !defined(_KERNEL) && !defined(_STANDALONE) #include <errno.h> #endif /* * We implement these like arrays, but we keep them sorted by key. * This allows us to binary-search as well as keep externalized output * sane-looking for human eyes. */ #define EXPAND_STEP 16 /* * prop_dictionary_keysym_t is allocated with space at the end to hold the * key. This must be a regular object so that we can maintain sane iterator * semantics -- we don't want to require that the caller release the result * of prop_object_iterator_next(). * * We'd like to have some small'ish keysym objects for up-to-16 characters * in a key, some for up-to-32 characters in a key, and then a final bucket * for up-to-128 characters in a key (not including NUL). Keys longer than * 128 characters are not allowed. */ struct _prop_dictionary_keysym { struct _prop_object pdk_obj; size_t pdk_size; struct rb_node pdk_link; char pdk_key[1]; /* actually variable length */ }; /* pdk_key[1] takes care of the NUL */ #define PDK_SIZE_16 (sizeof(struct _prop_dictionary_keysym) + 16) #define PDK_SIZE_32 (sizeof(struct _prop_dictionary_keysym) + 32) #define PDK_SIZE_128 (sizeof(struct _prop_dictionary_keysym) + 128) #define PDK_MAXKEY 128 _PROP_POOL_INIT(_prop_dictionary_keysym16_pool, PDK_SIZE_16, "pdict16") _PROP_POOL_INIT(_prop_dictionary_keysym32_pool, PDK_SIZE_32, "pdict32") _PROP_POOL_INIT(_prop_dictionary_keysym128_pool, PDK_SIZE_128, "pdict128") struct _prop_dict_entry { prop_dictionary_keysym_t pde_key; prop_object_t pde_objref; }; struct _prop_dictionary { struct _prop_object pd_obj; _PROP_RWLOCK_DECL(pd_rwlock) struct _prop_dict_entry *pd_array; unsigned int pd_capacity; unsigned int pd_count; int pd_flags; uint32_t pd_version; }; #define PD_F_IMMUTABLE 0x01 /* dictionary is immutable */ _PROP_POOL_INIT(_prop_dictionary_pool, sizeof(struct _prop_dictionary), "propdict") _PROP_MALLOC_DEFINE(M_PROP_DICT, "prop dictionary", "property dictionary container object") static _prop_object_free_rv_t _prop_dictionary_free(prop_stack_t, prop_object_t *); static void _prop_dictionary_emergency_free(prop_object_t); static bool _prop_dictionary_externalize( struct _prop_object_externalize_context *, void *); static _prop_object_equals_rv_t _prop_dictionary_equals(prop_object_t, prop_object_t, void **, void **, prop_object_t *, prop_object_t *); static void _prop_dictionary_equals_finish(prop_object_t, prop_object_t); static prop_object_iterator_t _prop_dictionary_iterator_locked(prop_dictionary_t); static prop_object_t _prop_dictionary_iterator_next_object_locked(void *); static prop_object_t _prop_dictionary_get_keysym(prop_dictionary_t, prop_dictionary_keysym_t, bool); static 
prop_object_t _prop_dictionary_get(prop_dictionary_t, const char *, bool); static void _prop_dictionary_lock(void); static void _prop_dictionary_unlock(void); static const struct _prop_object_type _prop_object_type_dictionary = { .pot_type = PROP_TYPE_DICTIONARY, .pot_free = _prop_dictionary_free, .pot_emergency_free = _prop_dictionary_emergency_free, .pot_extern = _prop_dictionary_externalize, .pot_equals = _prop_dictionary_equals, .pot_equals_finish = _prop_dictionary_equals_finish, .pot_lock = _prop_dictionary_lock, .pot_unlock = _prop_dictionary_unlock, }; static _prop_object_free_rv_t _prop_dict_keysym_free(prop_stack_t, prop_object_t *); static bool _prop_dict_keysym_externalize( struct _prop_object_externalize_context *, void *); static _prop_object_equals_rv_t _prop_dict_keysym_equals(prop_object_t, prop_object_t, void **, void **, prop_object_t *, prop_object_t *); static const struct _prop_object_type _prop_object_type_dict_keysym = { .pot_type = PROP_TYPE_DICT_KEYSYM, .pot_free = _prop_dict_keysym_free, .pot_extern = _prop_dict_keysym_externalize, .pot_equals = _prop_dict_keysym_equals, }; #define prop_object_is_dictionary(x) \ ((x) != NULL && (x)->pd_obj.po_type == &_prop_object_type_dictionary) #define prop_object_is_dictionary_keysym(x) \ ((x) != NULL && (x)->pdk_obj.po_type == &_prop_object_type_dict_keysym) #define prop_dictionary_is_immutable(x) \ (((x)->pd_flags & PD_F_IMMUTABLE) != 0) struct _prop_dictionary_iterator { struct _prop_object_iterator pdi_base; unsigned int pdi_index; }; /* * Dictionary key symbols are immutable, and we are likely to have many * duplicated key symbols. So, to save memory, we unique'ify key symbols * so we only have to have one copy of each string. */ static int /*ARGSUSED*/ _prop_dict_keysym_rb_compare_nodes(void *ctx _PROP_ARG_UNUSED, const void *n1, const void *n2) { const struct _prop_dictionary_keysym *pdk1 = n1; const struct _prop_dictionary_keysym *pdk2 = n2; return strcmp(pdk1->pdk_key, pdk2->pdk_key); } static int /*ARGSUSED*/ _prop_dict_keysym_rb_compare_key(void *ctx _PROP_ARG_UNUSED, const void *n, const void *v) { const struct _prop_dictionary_keysym *pdk = n; const char *cp = v; return strcmp(pdk->pdk_key, cp); } static const rb_tree_ops_t _prop_dict_keysym_rb_tree_ops = { .rbto_compare_nodes = _prop_dict_keysym_rb_compare_nodes, .rbto_compare_key = _prop_dict_keysym_rb_compare_key, .rbto_node_offset = offsetof(struct _prop_dictionary_keysym, pdk_link), .rbto_context = NULL }; static struct rb_tree _prop_dict_keysym_tree; _PROP_ONCE_DECL(_prop_dict_init_once) _PROP_MUTEX_DECL_STATIC(_prop_dict_keysym_tree_mutex) static int _prop_dict_init(void) { _PROP_MUTEX_INIT(_prop_dict_keysym_tree_mutex); rb_tree_init(&_prop_dict_keysym_tree, &_prop_dict_keysym_rb_tree_ops); return 0; } static void _prop_dict_keysym_put(prop_dictionary_keysym_t pdk) { if (pdk->pdk_size <= PDK_SIZE_16) _PROP_POOL_PUT(_prop_dictionary_keysym16_pool, pdk); else if (pdk->pdk_size <= PDK_SIZE_32) _PROP_POOL_PUT(_prop_dictionary_keysym32_pool, pdk); else { _PROP_ASSERT(pdk->pdk_size <= PDK_SIZE_128); _PROP_POOL_PUT(_prop_dictionary_keysym128_pool, pdk); } } /* ARGSUSED */ static _prop_object_free_rv_t _prop_dict_keysym_free(prop_stack_t stack, prop_object_t *obj) { prop_dictionary_keysym_t pdk = *obj; rb_tree_remove_node(&_prop_dict_keysym_tree, pdk); _prop_dict_keysym_put(pdk); return _PROP_OBJECT_FREE_DONE; } static bool _prop_dict_keysym_externalize(struct _prop_object_externalize_context *ctx, void *v) { prop_dictionary_keysym_t pdk = v; /* We externalize 
these as strings, and they're never empty. */ _PROP_ASSERT(pdk->pdk_key[0] != '\0'); if (_prop_object_externalize_start_tag(ctx, "string") == false || _prop_object_externalize_append_encoded_cstring(ctx, pdk->pdk_key) == false || _prop_object_externalize_end_tag(ctx, "string") == false) return (false); return (true); } /* ARGSUSED */ static _prop_object_equals_rv_t _prop_dict_keysym_equals(prop_object_t v1, prop_object_t v2, void **stored_pointer1, void **stored_pointer2, prop_object_t *next_obj1, prop_object_t *next_obj2) { prop_dictionary_keysym_t pdk1 = v1; prop_dictionary_keysym_t pdk2 = v2; /* * There is only ever one copy of a keysym at any given time, * so we can reduce this to a simple pointer equality check. */ if (pdk1 == pdk2) return _PROP_OBJECT_EQUALS_TRUE; else return _PROP_OBJECT_EQUALS_FALSE; } static prop_dictionary_keysym_t _prop_dict_keysym_alloc(const char *key) { prop_dictionary_keysym_t opdk, pdk, rpdk; size_t size; _PROP_ONCE_RUN(_prop_dict_init_once, _prop_dict_init); /* * Check to see if this already exists in the tree. If it does, * we just retain it and return it. */ _PROP_MUTEX_LOCK(_prop_dict_keysym_tree_mutex); opdk = rb_tree_find_node(&_prop_dict_keysym_tree, key); if (opdk != NULL) { prop_object_retain(opdk); _PROP_MUTEX_UNLOCK(_prop_dict_keysym_tree_mutex); return (opdk); } _PROP_MUTEX_UNLOCK(_prop_dict_keysym_tree_mutex); /* * Not in the tree. Create it now. */ size = sizeof(*pdk) + strlen(key) /* pdk_key[1] covers the NUL */; if (size <= PDK_SIZE_16) pdk = _PROP_POOL_GET(_prop_dictionary_keysym16_pool); else if (size <= PDK_SIZE_32) pdk = _PROP_POOL_GET(_prop_dictionary_keysym32_pool); else if (size <= PDK_SIZE_128) pdk = _PROP_POOL_GET(_prop_dictionary_keysym128_pool); else pdk = NULL; /* key too long */ if (pdk == NULL) return (NULL); _prop_object_init(&pdk->pdk_obj, &_prop_object_type_dict_keysym); strcpy(pdk->pdk_key, key); pdk->pdk_size = size; /* * We dropped the mutex when we allocated the new object, so * we have to check again if it is in the tree. */ _PROP_MUTEX_LOCK(_prop_dict_keysym_tree_mutex); opdk = rb_tree_find_node(&_prop_dict_keysym_tree, key); if (opdk != NULL) { prop_object_retain(opdk); _PROP_MUTEX_UNLOCK(_prop_dict_keysym_tree_mutex); _prop_dict_keysym_put(pdk); return (opdk); } rpdk = rb_tree_insert_node(&_prop_dict_keysym_tree, pdk); _PROP_ASSERT(rpdk == pdk); _PROP_MUTEX_UNLOCK(_prop_dict_keysym_tree_mutex); return (rpdk); } static _prop_object_free_rv_t _prop_dictionary_free(prop_stack_t stack, prop_object_t *obj) { prop_dictionary_t pd = *obj; prop_dictionary_keysym_t pdk; prop_object_t po; _PROP_ASSERT(pd->pd_count <= pd->pd_capacity); _PROP_ASSERT((pd->pd_capacity == 0 && pd->pd_array == NULL) || (pd->pd_capacity != 0 && pd->pd_array != NULL)); /* The empty dictorinary is easy, handle that first. */ if (pd->pd_count == 0) { if (pd->pd_array != NULL) _PROP_FREE(pd->pd_array, M_PROP_DICT); _PROP_RWLOCK_DESTROY(pd->pd_rwlock); _PROP_POOL_PUT(_prop_dictionary_pool, pd); return (_PROP_OBJECT_FREE_DONE); } po = pd->pd_array[pd->pd_count - 1].pde_objref; _PROP_ASSERT(po != NULL); if (stack == NULL) { /* * If we are in emergency release mode, * just let caller recurse down. */ *obj = po; return (_PROP_OBJECT_FREE_FAILED); } /* Otherwise, try to push the current object on the stack. */ if (!_prop_stack_push(stack, pd, NULL, NULL, NULL)) { /* Push failed, entering emergency release mode. */ return (_PROP_OBJECT_FREE_FAILED); } /* Object pushed on stack, caller will release it. 
*/ --pd->pd_count; pdk = pd->pd_array[pd->pd_count].pde_key; _PROP_ASSERT(pdk != NULL); prop_object_release(pdk); *obj = po; return (_PROP_OBJECT_FREE_RECURSE); } static void _prop_dictionary_lock(void) { /* XXX: once necessary or paranoia? */ _PROP_ONCE_RUN(_prop_dict_init_once, _prop_dict_init); _PROP_MUTEX_LOCK(_prop_dict_keysym_tree_mutex); } static void _prop_dictionary_unlock(void) { _PROP_MUTEX_UNLOCK(_prop_dict_keysym_tree_mutex); } static void _prop_dictionary_emergency_free(prop_object_t obj) { prop_dictionary_t pd = obj; prop_dictionary_keysym_t pdk; _PROP_ASSERT(pd->pd_count != 0); --pd->pd_count; pdk = pd->pd_array[pd->pd_count].pde_key; _PROP_ASSERT(pdk != NULL); prop_object_release(pdk); } static bool _prop_dictionary_externalize(struct _prop_object_externalize_context *ctx, void *v) { prop_dictionary_t pd = v; prop_dictionary_keysym_t pdk; struct _prop_object *po; prop_object_iterator_t pi; unsigned int i; bool rv = false; _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); if (pd->pd_count == 0) { _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (_prop_object_externalize_empty_tag(ctx, "dict")); } if (_prop_object_externalize_start_tag(ctx, "dict") == false || _prop_object_externalize_append_char(ctx, '\n') == false) goto out; pi = _prop_dictionary_iterator_locked(pd); if (pi == NULL) goto out; ctx->poec_depth++; _PROP_ASSERT(ctx->poec_depth != 0); while ((pdk = _prop_dictionary_iterator_next_object_locked(pi)) != NULL) { po = _prop_dictionary_get_keysym(pd, pdk, true); if (po == NULL || _prop_object_externalize_start_tag(ctx, "key") == false || _prop_object_externalize_append_encoded_cstring(ctx, pdk->pdk_key) == false || _prop_object_externalize_end_tag(ctx, "key") == false || (*po->po_type->pot_extern)(ctx, po) == false) { prop_object_iterator_release(pi); goto out; } } prop_object_iterator_release(pi); ctx->poec_depth--; for (i = 0; i < ctx->poec_depth; i++) { if (_prop_object_externalize_append_char(ctx, '\t') == false) goto out; } if (_prop_object_externalize_end_tag(ctx, "dict") == false) goto out; rv = true; out: _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (rv); } /* ARGSUSED */ static _prop_object_equals_rv_t _prop_dictionary_equals(prop_object_t v1, prop_object_t v2, void **stored_pointer1, void **stored_pointer2, prop_object_t *next_obj1, prop_object_t *next_obj2) { prop_dictionary_t dict1 = v1; prop_dictionary_t dict2 = v2; uintptr_t idx; _prop_object_equals_rv_t rv = _PROP_OBJECT_EQUALS_FALSE; if (dict1 == dict2) return (_PROP_OBJECT_EQUALS_TRUE); _PROP_ASSERT(*stored_pointer1 == *stored_pointer2); idx = (uintptr_t)*stored_pointer1; if (idx == 0) { if ((uintptr_t)dict1 < (uintptr_t)dict2) { _PROP_RWLOCK_RDLOCK(dict1->pd_rwlock); _PROP_RWLOCK_RDLOCK(dict2->pd_rwlock); } else { _PROP_RWLOCK_RDLOCK(dict2->pd_rwlock); _PROP_RWLOCK_RDLOCK(dict1->pd_rwlock); } } if (dict1->pd_count != dict2->pd_count) goto out; if (idx == dict1->pd_count) { rv = _PROP_OBJECT_EQUALS_TRUE; goto out; } _PROP_ASSERT(idx < dict1->pd_count); *stored_pointer1 = (void *)(idx + 1); *stored_pointer2 = (void *)(idx + 1); *next_obj1 = dict1->pd_array[idx].pde_objref; *next_obj2 = dict2->pd_array[idx].pde_objref; if (!prop_dictionary_keysym_equals(dict1->pd_array[idx].pde_key, dict2->pd_array[idx].pde_key)) goto out; return (_PROP_OBJECT_EQUALS_RECURSE); out: _PROP_RWLOCK_UNLOCK(dict1->pd_rwlock); _PROP_RWLOCK_UNLOCK(dict2->pd_rwlock); return (rv); } static void _prop_dictionary_equals_finish(prop_object_t v1, prop_object_t v2) { _PROP_RWLOCK_UNLOCK(((prop_dictionary_t)v1)->pd_rwlock); 
_PROP_RWLOCK_UNLOCK(((prop_dictionary_t)v2)->pd_rwlock); } static prop_dictionary_t _prop_dictionary_alloc(unsigned int capacity) { prop_dictionary_t pd; struct _prop_dict_entry *array; if (capacity != 0) { array = _PROP_CALLOC(capacity * sizeof(*array), M_PROP_DICT); if (array == NULL) return (NULL); } else array = NULL; pd = _PROP_POOL_GET(_prop_dictionary_pool); if (pd != NULL) { _prop_object_init(&pd->pd_obj, &_prop_object_type_dictionary); _PROP_RWLOCK_INIT(pd->pd_rwlock); pd->pd_array = array; pd->pd_capacity = capacity; pd->pd_count = 0; pd->pd_flags = 0; pd->pd_version = 0; } else if (array != NULL) _PROP_FREE(array, M_PROP_DICT); return (pd); } static bool _prop_dictionary_expand(prop_dictionary_t pd, unsigned int capacity) { struct _prop_dict_entry *array, *oarray; /* * Dictionary must be WRITE-LOCKED. */ oarray = pd->pd_array; array = _PROP_CALLOC(capacity * sizeof(*array), M_PROP_DICT); if (array == NULL) return (false); if (oarray != NULL) memcpy(array, oarray, pd->pd_capacity * sizeof(*array)); pd->pd_array = array; pd->pd_capacity = capacity; if (oarray != NULL) _PROP_FREE(oarray, M_PROP_DICT); return (true); } static prop_object_t _prop_dictionary_iterator_next_object_locked(void *v) { struct _prop_dictionary_iterator *pdi = v; prop_dictionary_t pd = pdi->pdi_base.pi_obj; prop_dictionary_keysym_t pdk = NULL; _PROP_ASSERT(prop_object_is_dictionary(pd)); if (pd->pd_version != pdi->pdi_base.pi_version) goto out; /* dictionary changed during iteration */ _PROP_ASSERT(pdi->pdi_index <= pd->pd_count); if (pdi->pdi_index == pd->pd_count) goto out; /* we've iterated all objects */ pdk = pd->pd_array[pdi->pdi_index].pde_key; pdi->pdi_index++; out: return (pdk); } static prop_object_t _prop_dictionary_iterator_next_object(void *v) { struct _prop_dictionary_iterator *pdi = v; prop_dictionary_t pd _PROP_ARG_UNUSED = pdi->pdi_base.pi_obj; prop_dictionary_keysym_t pdk; _PROP_ASSERT(prop_object_is_dictionary(pd)); _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); pdk = _prop_dictionary_iterator_next_object_locked(pdi); _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (pdk); } static void _prop_dictionary_iterator_reset_locked(void *v) { struct _prop_dictionary_iterator *pdi = v; prop_dictionary_t pd = pdi->pdi_base.pi_obj; _PROP_ASSERT(prop_object_is_dictionary(pd)); pdi->pdi_index = 0; pdi->pdi_base.pi_version = pd->pd_version; } static void _prop_dictionary_iterator_reset(void *v) { struct _prop_dictionary_iterator *pdi = v; prop_dictionary_t pd _PROP_ARG_UNUSED = pdi->pdi_base.pi_obj; _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); _prop_dictionary_iterator_reset_locked(pdi); _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); } /* * prop_dictionary_create -- * Create a dictionary. */ prop_dictionary_t prop_dictionary_create(void) { return (_prop_dictionary_alloc(0)); } /* * prop_dictionary_create_with_capacity -- * Create a dictionary with the capacity to store N objects. */ prop_dictionary_t prop_dictionary_create_with_capacity(unsigned int capacity) { return (_prop_dictionary_alloc(capacity)); } /* * prop_dictionary_copy -- * Copy a dictionary. The new dictionary has an initial capacity equal * to the number of objects stored int the original dictionary. The new * dictionary contains references to the original dictionary's objects, * not copies of those objects (i.e. a shallow copy). */ prop_dictionary_t prop_dictionary_copy(prop_dictionary_t opd) { prop_dictionary_t pd; prop_dictionary_keysym_t pdk; prop_object_t po; unsigned int idx; if (! 
prop_object_is_dictionary(opd)) return (NULL); _PROP_RWLOCK_RDLOCK(opd->pd_rwlock); pd = _prop_dictionary_alloc(opd->pd_count); if (pd != NULL) { for (idx = 0; idx < opd->pd_count; idx++) { pdk = opd->pd_array[idx].pde_key; po = opd->pd_array[idx].pde_objref; prop_object_retain(pdk); prop_object_retain(po); pd->pd_array[idx].pde_key = pdk; pd->pd_array[idx].pde_objref = po; } pd->pd_count = opd->pd_count; pd->pd_flags = opd->pd_flags; } _PROP_RWLOCK_UNLOCK(opd->pd_rwlock); return (pd); } /* * prop_dictionary_copy_mutable -- * Like prop_dictionary_copy(), but the resulting dictionary is * mutable. */ prop_dictionary_t prop_dictionary_copy_mutable(prop_dictionary_t opd) { prop_dictionary_t pd; if (! prop_object_is_dictionary(opd)) return (NULL); pd = prop_dictionary_copy(opd); if (pd != NULL) pd->pd_flags &= ~PD_F_IMMUTABLE; return (pd); } /* * prop_dictionary_make_immutable -- * Set the immutable flag on that dictionary. */ void prop_dictionary_make_immutable(prop_dictionary_t pd) { _PROP_RWLOCK_WRLOCK(pd->pd_rwlock); if (prop_dictionary_is_immutable(pd) == false) pd->pd_flags |= PD_F_IMMUTABLE; _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); } /* * prop_dictionary_count -- * Return the number of objects stored in the dictionary. */ unsigned int prop_dictionary_count(prop_dictionary_t pd) { unsigned int rv; if (! prop_object_is_dictionary(pd)) return (0); _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); rv = pd->pd_count; _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (rv); } /* * prop_dictionary_ensure_capacity -- * Ensure that the dictionary has the capacity to store the specified * total number of objects (including the objects already stored in * the dictionary). */ bool prop_dictionary_ensure_capacity(prop_dictionary_t pd, unsigned int capacity) { bool rv; if (! prop_object_is_dictionary(pd)) return (false); _PROP_RWLOCK_WRLOCK(pd->pd_rwlock); if (capacity > pd->pd_capacity) rv = _prop_dictionary_expand(pd, capacity); else rv = true; _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (rv); } static prop_object_iterator_t _prop_dictionary_iterator_locked(prop_dictionary_t pd) { struct _prop_dictionary_iterator *pdi; if (! prop_object_is_dictionary(pd)) return (NULL); pdi = _PROP_CALLOC(sizeof(*pdi), M_TEMP); if (pdi == NULL) return (NULL); pdi->pdi_base.pi_next_object = _prop_dictionary_iterator_next_object; pdi->pdi_base.pi_reset = _prop_dictionary_iterator_reset; prop_object_retain(pd); pdi->pdi_base.pi_obj = pd; _prop_dictionary_iterator_reset_locked(pdi); return (&pdi->pdi_base); } /* * prop_dictionary_iterator -- * Return an iterator for the dictionary. The dictionary is retained by * the iterator. */ prop_object_iterator_t prop_dictionary_iterator(prop_dictionary_t pd) { prop_object_iterator_t pi; _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); pi = _prop_dictionary_iterator_locked(pd); _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (pi); } /* * prop_dictionary_all_keys -- * Return an array containing a snapshot of all of the keys * in the dictionary. */ prop_array_t prop_dictionary_all_keys(prop_dictionary_t pd) { prop_array_t array; unsigned int idx; bool rv = true; if (! prop_object_is_dictionary(pd)) return (NULL); /* There is no pressing need to lock the dictionary for this. 
*/ array = prop_array_create_with_capacity(pd->pd_count); _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); for (idx = 0; idx < pd->pd_count; idx++) { rv = prop_array_add(array, pd->pd_array[idx].pde_key); if (rv == false) break; } _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); if (rv == false) { prop_object_release(array); array = NULL; } return (array); } static struct _prop_dict_entry * _prop_dict_lookup(prop_dictionary_t pd, const char *key, unsigned int *idxp) { struct _prop_dict_entry *pde; unsigned int base, idx, distance; int res; /* * Dictionary must be READ-LOCKED or WRITE-LOCKED. */ for (idx = 0, base = 0, distance = pd->pd_count; distance != 0; distance >>= 1) { idx = base + (distance >> 1); pde = &pd->pd_array[idx]; _PROP_ASSERT(pde->pde_key != NULL); res = strcmp(key, pde->pde_key->pdk_key); if (res == 0) { if (idxp != NULL) *idxp = idx; return (pde); } if (res > 0) { /* key > pdk_key: move right */ base = idx + 1; distance--; } /* else move left */ } /* idx points to the slot we looked at last. */ if (idxp != NULL) *idxp = idx; return (NULL); } static prop_object_t _prop_dictionary_get(prop_dictionary_t pd, const char *key, bool locked) { const struct _prop_dict_entry *pde; prop_object_t po = NULL; if (! prop_object_is_dictionary(pd)) return (NULL); if (!locked) { _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); } pde = _prop_dict_lookup(pd, key, NULL); if (pde != NULL) { _PROP_ASSERT(pde->pde_objref != NULL); po = pde->pde_objref; } if (!locked) { _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); } return (po); } /* * prop_dictionary_get -- * Return the object stored with specified key. */ prop_object_t prop_dictionary_get(prop_dictionary_t pd, const char *key) { prop_object_t po = NULL; if (! prop_object_is_dictionary(pd)) return (NULL); _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); po = _prop_dictionary_get(pd, key, true); _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (po); } static prop_object_t _prop_dictionary_get_keysym(prop_dictionary_t pd, prop_dictionary_keysym_t pdk, bool locked) { if (! (prop_object_is_dictionary(pd) && prop_object_is_dictionary_keysym(pdk))) return (NULL); return (_prop_dictionary_get(pd, pdk->pdk_key, locked)); } /* * prop_dictionary_get_keysym -- * Return the object stored at the location encoded by the keysym. */ prop_object_t prop_dictionary_get_keysym(prop_dictionary_t pd, prop_dictionary_keysym_t pdk) { return (_prop_dictionary_get_keysym(pd, pdk, false)); } /* * prop_dictionary_set -- * Store a reference to an object at with the specified key. * If the key already exist, the original object is released. */ bool prop_dictionary_set(prop_dictionary_t pd, const char *key, prop_object_t po) { struct _prop_dict_entry *pde; prop_dictionary_keysym_t pdk; unsigned int idx; bool rv = false; if (! prop_object_is_dictionary(pd)) return (false); _PROP_ASSERT(pd->pd_count <= pd->pd_capacity); if (prop_dictionary_is_immutable(pd)) return (false); _PROP_RWLOCK_WRLOCK(pd->pd_rwlock); pde = _prop_dict_lookup(pd, key, &idx); if (pde != NULL) { prop_object_t opo = pde->pde_objref; prop_object_retain(po); pde->pde_objref = po; prop_object_release(opo); rv = true; goto out; } pdk = _prop_dict_keysym_alloc(key); if (pdk == NULL) goto out; if (pd->pd_count == pd->pd_capacity && _prop_dictionary_expand(pd, pd->pd_capacity + EXPAND_STEP) == false) { prop_object_release(pdk); goto out; } /* At this point, the store will succeed. 
*/ prop_object_retain(po); if (pd->pd_count == 0) { pd->pd_array[0].pde_key = pdk; pd->pd_array[0].pde_objref = po; pd->pd_count++; pd->pd_version++; rv = true; goto out; } pde = &pd->pd_array[idx]; _PROP_ASSERT(pde->pde_key != NULL); if (strcmp(key, pde->pde_key->pdk_key) < 0) { /* * key < pdk_key: insert to the left. This is the same as * inserting to the right, except we decrement the current * index first. * * Because we're unsigned, we have to special case 0 * (grumble). */ if (idx == 0) { memmove(&pd->pd_array[1], &pd->pd_array[0], pd->pd_count * sizeof(*pde)); pd->pd_array[0].pde_key = pdk; pd->pd_array[0].pde_objref = po; pd->pd_count++; pd->pd_version++; rv = true; goto out; } idx--; } memmove(&pd->pd_array[idx + 2], &pd->pd_array[idx + 1], (pd->pd_count - (idx + 1)) * sizeof(*pde)); pd->pd_array[idx + 1].pde_key = pdk; pd->pd_array[idx + 1].pde_objref = po; pd->pd_count++; pd->pd_version++; rv = true; out: _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (rv); } /* * prop_dictionary_set_keysym -- * Replace the object in the dictionary at the location encoded by * the keysym. */ bool prop_dictionary_set_keysym(prop_dictionary_t pd, prop_dictionary_keysym_t pdk, prop_object_t po) { if (! (prop_object_is_dictionary(pd) && prop_object_is_dictionary_keysym(pdk))) return (false); return (prop_dictionary_set(pd, pdk->pdk_key, po)); } static void _prop_dictionary_remove(prop_dictionary_t pd, struct _prop_dict_entry *pde, unsigned int idx) { prop_dictionary_keysym_t pdk = pde->pde_key; prop_object_t po = pde->pde_objref; /* * Dictionary must be WRITE-LOCKED. */ _PROP_ASSERT(pd->pd_count != 0); _PROP_ASSERT(idx < pd->pd_count); _PROP_ASSERT(pde == &pd->pd_array[idx]); idx++; memmove(&pd->pd_array[idx - 1], &pd->pd_array[idx], (pd->pd_count - idx) * sizeof(*pde)); pd->pd_count--; pd->pd_version++; prop_object_release(pdk); prop_object_release(po); } /* * prop_dictionary_remove -- * Remove the reference to an object with the specified key from * the dictionary. */ void prop_dictionary_remove(prop_dictionary_t pd, const char *key) { struct _prop_dict_entry *pde; unsigned int idx; if (! prop_object_is_dictionary(pd)) return; _PROP_RWLOCK_WRLOCK(pd->pd_rwlock); /* XXX Should this be a _PROP_ASSERT()? */ if (prop_dictionary_is_immutable(pd)) goto out; pde = _prop_dict_lookup(pd, key, &idx); /* XXX Should this be a _PROP_ASSERT()? */ if (pde == NULL) goto out; _prop_dictionary_remove(pd, pde, idx); out: _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); } /* * prop_dictionary_remove_keysym -- * Remove a reference to an object stored in the dictionary at the * location encoded by the keysym. */ void prop_dictionary_remove_keysym(prop_dictionary_t pd, prop_dictionary_keysym_t pdk) { if (! (prop_object_is_dictionary(pd) && prop_object_is_dictionary_keysym(pdk))) return; prop_dictionary_remove(pd, pdk->pdk_key); } /* * prop_dictionary_equals -- * Return true if the two dictionaries are equivalent. Note we do a * by-value comparison of the objects in the dictionary. */ bool prop_dictionary_equals(prop_dictionary_t dict1, prop_dictionary_t dict2) { if (!prop_object_is_dictionary(dict1) || !prop_object_is_dictionary(dict2)) return (false); return (prop_object_equals(dict1, dict2)); } /* * prop_dictionary_keysym_value -- * Return a reference to the keysym's value. */ const char * prop_dictionary_keysym_value(prop_dictionary_keysym_t pdk) { if (! 
prop_object_is_dictionary_keysym(pdk)) return (NULL); return (pdk->pdk_key); } _PROP_DEPRECATED(prop_dictionary_keysym_cstring_nocopy, "this program uses prop_dictionary_keysym_cstring_nocopy(), " "which is deprecated; use prop_dictionary_keysym_value() instead.") const char * prop_dictionary_keysym_cstring_nocopy(prop_dictionary_keysym_t pdk) { if (! prop_object_is_dictionary_keysym(pdk)) return (NULL); return (pdk->pdk_key); } /* * prop_dictionary_keysym_equals -- * Return true if the two dictionary key symbols are equivalent. * Note: We do not compare the object references. */ bool prop_dictionary_keysym_equals(prop_dictionary_keysym_t pdk1, prop_dictionary_keysym_t pdk2) { if (!prop_object_is_dictionary_keysym(pdk1) || !prop_object_is_dictionary_keysym(pdk2)) return (false); return (prop_object_equals(pdk1, pdk2)); } /* * prop_dictionary_externalize -- * Externalize a dictionary, returning a NUL-terminated buffer * containing the XML-style representation. The buffer is allocated * with the M_TEMP memory type. */ char * prop_dictionary_externalize(prop_dictionary_t pd) { struct _prop_object_externalize_context *ctx; char *cp; ctx = _prop_object_externalize_context_alloc(); if (ctx == NULL) return (NULL); if (_prop_object_externalize_header(ctx) == false || (*pd->pd_obj.po_type->pot_extern)(ctx, pd) == false || _prop_object_externalize_footer(ctx) == false) { /* We are responsible for releasing the buffer. */ _PROP_FREE(ctx->poec_buf, M_TEMP); _prop_object_externalize_context_free(ctx); return (NULL); } cp = ctx->poec_buf; _prop_object_externalize_context_free(ctx); return (cp); } /* * _prop_dictionary_internalize -- * Parse a <dict>...</dict> and return the object created from the * external representation. * * Internal state in via rec_data is the storage area for the last processed * key. * _prop_dictionary_internalize_body is the upper half of the parse loop. * It is responsible for parsing the key directly and storing it in the area * referenced by rec_data. * _prop_dictionary_internalize_cont is the lower half and called with the value * associated with the key. */ static bool _prop_dictionary_internalize_body(prop_stack_t, prop_object_t *, struct _prop_object_internalize_context *, char *); bool _prop_dictionary_internalize(prop_stack_t stack, prop_object_t *obj, struct _prop_object_internalize_context *ctx) { prop_dictionary_t dict; char *tmpkey; /* We don't currently understand any attributes. */ if (ctx->poic_tagattr != NULL) return (true); dict = prop_dictionary_create(); if (dict == NULL) return (true); if (ctx->poic_is_empty_element) { *obj = dict; return (true); } tmpkey = _PROP_MALLOC(PDK_MAXKEY + 1, M_TEMP); if (tmpkey == NULL) { prop_object_release(dict); return (true); } *obj = dict; /* * Opening tag is found, storage for key allocated and * now continue to the first element. */ return _prop_dictionary_internalize_body(stack, obj, ctx, tmpkey); } static bool _prop_dictionary_internalize_continue(prop_stack_t stack, prop_object_t *obj, struct _prop_object_internalize_context *ctx, void *data, prop_object_t child) { prop_dictionary_t dict = *obj; char *tmpkey = data; _PROP_ASSERT(tmpkey != NULL); if (child == NULL || prop_dictionary_set(dict, tmpkey, child) == false) { _PROP_FREE(tmpkey, M_TEMP); if (child != NULL) prop_object_release(child); prop_object_release(dict); *obj = NULL; return (true); } prop_object_release(child); /* * key, value was added, now continue looking for the next key * or the closing tag. 
*/ return _prop_dictionary_internalize_body(stack, obj, ctx, tmpkey); } static bool _prop_dictionary_internalize_body(prop_stack_t stack, prop_object_t *obj, struct _prop_object_internalize_context *ctx, char *tmpkey) { prop_dictionary_t dict = *obj; size_t keylen; /* Fetch the next tag. */ if (_prop_object_internalize_find_tag(ctx, NULL, _PROP_TAG_TYPE_EITHER) == false) goto bad; /* Check to see if this is the end of the dictionary. */ if (_PROP_TAG_MATCH(ctx, "dict") && ctx->poic_tag_type == _PROP_TAG_TYPE_END) { _PROP_FREE(tmpkey, M_TEMP); return (true); } /* Ok, it must be a non-empty key start tag. */ if (!_PROP_TAG_MATCH(ctx, "key") || ctx->poic_tag_type != _PROP_TAG_TYPE_START || ctx->poic_is_empty_element) goto bad; if (_prop_object_internalize_decode_string(ctx, tmpkey, PDK_MAXKEY, &keylen, &ctx->poic_cp) == false) goto bad; _PROP_ASSERT(keylen <= PDK_MAXKEY); tmpkey[keylen] = '\0'; if (_prop_object_internalize_find_tag(ctx, "key", _PROP_TAG_TYPE_END) == false) goto bad; /* ..and now the beginning of the value. */ if (_prop_object_internalize_find_tag(ctx, NULL, _PROP_TAG_TYPE_START) == false) goto bad; /* * Key is found, now wait for value to be parsed. */ if (_prop_stack_push(stack, *obj, _prop_dictionary_internalize_continue, tmpkey, NULL)) return (false); bad: _PROP_FREE(tmpkey, M_TEMP); prop_object_release(dict); *obj = NULL; return (true); } /* * prop_dictionary_internalize -- * Create a dictionary by parsing the NUL-terminated XML-style * representation. */ prop_dictionary_t prop_dictionary_internalize(const char *xml) { return _prop_generic_internalize(xml, "dict"); } #if !defined(_KERNEL) && !defined(_STANDALONE) /* * prop_dictionary_externalize_to_file -- * Externalize a dictionary to the specified file. */ bool prop_dictionary_externalize_to_file(prop_dictionary_t dict, const char *fname) { char *xml; bool rv; int save_errno = 0; /* XXXGCC -Wuninitialized [mips, ...] */ xml = prop_dictionary_externalize(dict); if (xml == NULL) return (false); rv = _prop_object_externalize_write_file(fname, xml, strlen(xml)); if (rv == false) save_errno = errno; _PROP_FREE(xml, M_TEMP); if (rv == false) errno = save_errno; return (rv); } /* * prop_dictionary_internalize_from_file -- * Internalize a dictionary from a file. */ prop_dictionary_t prop_dictionary_internalize_from_file(const char *fname) { struct _prop_object_internalize_mapped_file *mf; prop_dictionary_t dict; mf = _prop_object_internalize_map_file(fname); if (mf == NULL) return (NULL); dict = prop_dictionary_internalize(mf->poimf_xml); _prop_object_internalize_unmap_file(mf); return (dict); } #endif /* !_KERNEL && !_STANDALONE */
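/*
 * Illustrative usage sketch -- not part of the original source.  It shows
 * how the dictionary API implemented above is typically driven by a
 * caller.  The value object comes from the companion proplib string
 * routines (prop_string_create_copy() is assumed available from
 * <prop/proplib.h>); those live outside this file.
 */
#if 0	/* example only */
#include <prop/proplib.h>

static char *
example_externalize_dict(void)
{
	prop_dictionary_t dict;
	prop_string_t str;
	char *xml = NULL;

	dict = prop_dictionary_create();
	if (dict == NULL)
		return NULL;

	str = prop_string_create_copy("example-value");
	if (str != NULL) {
		/* The dictionary retains its own reference to the value. */
		(void)prop_dictionary_set(dict, "example-key", str);
		prop_object_release(str);
	}

	/* Keys are kept sorted, so _prop_dict_lookup() is a binary search. */
	if (prop_dictionary_get(dict, "example-key") != NULL)
		xml = prop_dictionary_externalize(dict);

	prop_object_release(dict);
	return xml;	/* caller frees the XML buffer when done */
}
#endif	/* example only */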
3 1 1 1 1 1 1 3 3 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 /* $NetBSD: uipc_accf.c,v 1.13 2014/02/25 18:30:11 pooka Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software developed for The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2000 Paycounter, Inc. * Copyright (c) 2005 Robert N. M. Watson * Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uipc_accf.c,v 1.13 2014/02/25 18:30:11 pooka Exp $"); #define ACCEPT_FILTER_MOD #include <sys/param.h> #include <sys/systm.h> #include <sys/domain.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/kmem.h> #include <sys/mbuf.h> #include <sys/rwlock.h> #include <sys/protosw.h> #include <sys/sysctl.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/queue.h> #include <sys/once.h> #include <sys/atomic.h> #include <sys/module.h> static krwlock_t accept_filter_lock; static LIST_HEAD(, accept_filter) accept_filtlsthd = LIST_HEAD_INITIALIZER(&accept_filtlsthd); /* * Names of Accept filter sysctl objects */ static struct sysctllog *ctllog; static void sysctl_net_inet_accf_setup(void) { sysctl_createv(&ctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL); sysctl_createv(&ctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "accf", SYSCTL_DESCR("Accept filters"), NULL, 0, NULL, 0, CTL_NET, PF_INET, SO_ACCEPTFILTER, CTL_EOL); } int accept_filt_add(struct accept_filter *filt) { struct accept_filter *p; accept_filter_init(); rw_enter(&accept_filter_lock, RW_WRITER); LIST_FOREACH(p, &accept_filtlsthd, accf_next) { if (strcmp(p->accf_name, filt->accf_name) == 0) { rw_exit(&accept_filter_lock); return EEXIST; } } LIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next); rw_exit(&accept_filter_lock); return 0; } int accept_filt_del(struct accept_filter *p) { rw_enter(&accept_filter_lock, RW_WRITER); if (p->accf_refcnt != 0) { rw_exit(&accept_filter_lock); return EBUSY; } LIST_REMOVE(p, accf_next); rw_exit(&accept_filter_lock); return 0; } struct accept_filter * accept_filt_get(char *name) { struct accept_filter *p; char buf[32]; u_int gen; do { rw_enter(&accept_filter_lock, RW_READER); LIST_FOREACH(p, &accept_filtlsthd, accf_next) { if (strcmp(p->accf_name, name) == 0) { atomic_inc_uint(&p->accf_refcnt); break; } } rw_exit(&accept_filter_lock); if (p != NULL) { break; } /* Try to autoload a module to satisfy the request. */ strcpy(buf, "accf_"); strlcat(buf, name, sizeof(buf)); gen = module_gen; (void)module_autoload(buf, MODULE_CLASS_ANY); } while (gen != module_gen); return p; } /* * Accept filter initialization routine. * This should be called only once. */ static int accept_filter_init0(void) { rw_init(&accept_filter_lock); sysctl_net_inet_accf_setup(); return 0; } /* * Initialization routine: This can also be replaced with * accept_filt_generic_mod_event for attaching new accept filter. 
*/ void accept_filter_init(void) { static ONCE_DECL(accept_filter_init_once); RUN_ONCE(&accept_filter_init_once, accept_filter_init0); } int accept_filt_getopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg afa; int error; KASSERT(solocked(so)); if ((so->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto out; } if ((so->so_options & SO_ACCEPTFILTER) == 0) { error = EINVAL; goto out; } memset(&afa, 0, sizeof(afa)); strcpy(afa.af_name, so->so_accf->so_accept_filter->accf_name); if (so->so_accf->so_accept_filter_str != NULL) strcpy(afa.af_arg, so->so_accf->so_accept_filter_str); error = sockopt_set(sopt, &afa, sizeof(afa)); out: return error; } /* * Simple delete case, with socket locked. */ int accept_filt_clear(struct socket *so) { struct accept_filter_arg afa; struct accept_filter *afp; struct socket *so2, *next; struct so_accf *af; KASSERT(solocked(so)); if ((so->so_options & SO_ACCEPTCONN) == 0) { return EINVAL; } if (so->so_accf != NULL) { /* Break in-flight processing. */ for (so2 = TAILQ_FIRST(&so->so_q0); so2 != NULL; so2 = next) { next = TAILQ_NEXT(so2, so_qe); if (so2->so_upcall == NULL) { continue; } so2->so_upcall = NULL; so2->so_upcallarg = NULL; so2->so_options &= ~SO_ACCEPTFILTER; so2->so_rcv.sb_flags &= ~SB_UPCALL; soisconnected(so2); } af = so->so_accf; afp = af->so_accept_filter; if (afp != NULL && afp->accf_destroy != NULL) { (*afp->accf_destroy)(so); } if (af->so_accept_filter_str != NULL) { kmem_free(af->so_accept_filter_str, sizeof(afa.af_name)); } kmem_free(af, sizeof(*af)); so->so_accf = NULL; atomic_dec_uint(&afp->accf_refcnt); } so->so_options &= ~SO_ACCEPTFILTER; return 0; } /* * setsockopt() for accept filters. Called with the socket unlocked, * will always return it locked. */ int accept_filt_setopt(struct socket *so, const struct sockopt *sopt) { struct accept_filter_arg afa; struct accept_filter *afp; struct so_accf *newaf; int error; accept_filter_init(); if (sopt == NULL || sopt->sopt_size == 0) { solock(so); return accept_filt_clear(so); } /* * Pre-allocate any memory we may need later to avoid blocking at * untimely moments. This does not optimize for invalid arguments. */ error = sockopt_get(sopt, &afa, sizeof(afa)); if (error) { solock(so); return error; } afa.af_name[sizeof(afa.af_name)-1] = '\0'; afa.af_arg[sizeof(afa.af_arg)-1] = '\0'; afp = accept_filt_get(afa.af_name); if (afp == NULL) { solock(so); return ENOENT; } /* * Allocate the new accept filter instance storage. We may * have to free it again later if we fail to attach it. If * attached properly, 'newaf' is NULLed to avoid a free() * while in use. */ newaf = kmem_zalloc(sizeof(*newaf), KM_SLEEP); if (afp->accf_create != NULL && afa.af_name[0] != '\0') { /* * FreeBSD did a variable-size allocation here * with the actual string length from afa.af_name * but it is so short, why bother tracking it? * XXX as others have noted, this is an API mistake; * XXX accept_filter_arg should have a mandatory namelen. * XXX (but it's a bit too late to fix that now) */ newaf->so_accept_filter_str = kmem_alloc(sizeof(afa.af_name), KM_SLEEP); strcpy(newaf->so_accept_filter_str, afa.af_name); } /* * Require a listen socket; don't try to replace an existing filter * without first removing it. */ solock(so); if ((so->so_options & SO_ACCEPTCONN) == 0 || so->so_accf != NULL) { error = EINVAL; goto out; } /* * Invoke the accf_create() method of the filter if required. The * socket lock is held over this call, so create methods for filters * shouldn't block. 
*/ if (afp->accf_create != NULL) { newaf->so_accept_filter_arg = (*afp->accf_create)(so, afa.af_arg); if (newaf->so_accept_filter_arg == NULL) { error = EINVAL; goto out; } } newaf->so_accept_filter = afp; so->so_accf = newaf; so->so_options |= SO_ACCEPTFILTER; newaf = NULL; out: if (newaf != NULL) { if (newaf->so_accept_filter_str != NULL) kmem_free(newaf->so_accept_filter_str, sizeof(afa.af_name)); kmem_free(newaf, sizeof(*newaf)); atomic_dec_uint(&afp->accf_refcnt); } return error; }
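/*
 * Illustrative userland sketch -- not part of the original source.  It
 * shows the setsockopt(2) call serviced by accept_filt_setopt() above.
 * The filter name "dataready" is an assumption (any registered accept
 * filter name works); accept_filt_get() will try to autoload a module
 * named "accf_<name>" if the filter is not already present.
 */
#if 0	/* example only */
#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>

static int
example_set_accept_filter(int listen_fd)
{
	struct accept_filter_arg afa;

	memset(&afa, 0, sizeof(afa));
	strlcpy(afa.af_name, "dataready", sizeof(afa.af_name));
	/* afa.af_arg, if set, is handed to the filter's accf_create(). */

	/* The socket must already be listening (SO_ACCEPTCONN set). */
	return setsockopt(listen_fd, SOL_SOCKET, SO_ACCEPTFILTER,
	    &afa, sizeof(afa));
}
#endif	/* example only */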
5 9 9 5 9 23 23 22 3 1 2 2 1 4 5 4 3 2 2 11 11 7 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 /* $NetBSD: kern_ras.c,v 1.42 2022/08/08 22:31:45 riastradh Exp $ */ /*- * Copyright (c) 2002, 2006, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Gregory McGarry, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_ras.c,v 1.42 2022/08/08 22:31:45 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/proc.h> #include <sys/ras.h> #include <sys/xcall.h> #include <sys/syscallargs.h> #include <uvm/uvm_extern.h> #define MAX_RAS_PER_PROC 16 u_int ras_per_proc = MAX_RAS_PER_PROC; #ifdef DEBUG int ras_debug = 0; #define DPRINTF(x) if (ras_debug) printf x #else #define DPRINTF(x) /* nothing */ #endif /* * Force all CPUs through cpu_switchto(), waiting until complete. * Context switching will drain the write buffer on the calling * CPU. */ static void ras_sync(void) { /* No need to sync if exiting or single threaded. */ if (curproc->p_nlwps > 1 && ncpu > 1) { xc_barrier(0); } } /* * Check the specified address to see if it is within the * sequence. 
If it is found, we return the restart address, * otherwise we return -1. If we do perform a restart, we * mark the sequence as hit. * * No locking required: we disable preemption and ras_sync() * guarantees that individual entries are valid while we still * have visibility of them. */ void * ras_lookup(struct proc *p, void *addr) { struct ras *rp; void *startaddr; lwp_t *l; startaddr = (void *)-1; l = curlwp; KPREEMPT_DISABLE(l); for (rp = p->p_raslist; rp != NULL; rp = rp->ras_next) { if (addr > rp->ras_startaddr && addr < rp->ras_endaddr) { startaddr = rp->ras_startaddr; DPRINTF(("RAS hit: p=%p %p\n", p, addr)); break; } } KPREEMPT_ENABLE(l); return startaddr; } /* * During a fork, we copy all of the sequences from parent p1 to * the child p2. * * No locking required as the parent must be paused. */ int ras_fork(struct proc *p1, struct proc *p2) { struct ras *rp, *nrp; for (rp = p1->p_raslist; rp != NULL; rp = rp->ras_next) { nrp = kmem_alloc(sizeof(*nrp), KM_SLEEP); nrp->ras_startaddr = rp->ras_startaddr; nrp->ras_endaddr = rp->ras_endaddr; nrp->ras_next = p2->p_raslist; p2->p_raslist = nrp; } DPRINTF(("ras_fork: p1=%p, p2=%p\n", p1, p2)); return 0; } /* * Nuke all sequences for this process. */ int ras_purgeall(void) { struct ras *rp, *nrp; proc_t *p; p = curproc; if (p->p_raslist == NULL) return 0; mutex_enter(&p->p_auxlock); if ((rp = p->p_raslist) != NULL) { p->p_raslist = NULL; ras_sync(); for(; rp != NULL; rp = nrp) { nrp = rp->ras_next; kmem_free(rp, sizeof(*rp)); } } mutex_exit(&p->p_auxlock); return 0; } #if defined(__HAVE_RAS) /* * Install the new sequence. If it already exists, return * an error. */ static int ras_install(void *addr, size_t len) { struct ras *rp; struct ras *newrp; void *endaddr; int nras, error; proc_t *p; if (len == 0) return EINVAL; if ((uintptr_t)addr < VM_MIN_ADDRESS || (uintptr_t)addr > VM_MAXUSER_ADDRESS) return EINVAL; if (len > VM_MAXUSER_ADDRESS - (uintptr_t)addr) return EINVAL; endaddr = (char *)addr + len; newrp = kmem_alloc(sizeof(*newrp), KM_SLEEP); newrp->ras_startaddr = addr; newrp->ras_endaddr = endaddr; error = 0; nras = 0; p = curproc; mutex_enter(&p->p_auxlock); for (rp = p->p_raslist; rp != NULL; rp = rp->ras_next) { if (++nras >= ras_per_proc) { error = EINVAL; break; } if (addr < rp->ras_endaddr && endaddr > rp->ras_startaddr) { error = EEXIST; break; } } if (rp == NULL) { newrp->ras_next = p->p_raslist; p->p_raslist = newrp; ras_sync(); mutex_exit(&p->p_auxlock); } else { mutex_exit(&p->p_auxlock); kmem_free(newrp, sizeof(*newrp)); } return error; } /* * Nuke the specified sequence. Both address and len must * match, otherwise we return an error. */ static int ras_purge(void *addr, size_t len) { struct ras *rp, **link; proc_t *p; p = curproc; mutex_enter(&p->p_auxlock); link = &p->p_raslist; for (rp = *link; rp != NULL; link = &rp->ras_next, rp = *link) { if (addr == rp->ras_startaddr && (char *)rp->ras_endaddr - (char *)rp->ras_startaddr == len) break; } if (rp != NULL) { *link = rp->ras_next; ras_sync(); mutex_exit(&p->p_auxlock); kmem_free(rp, sizeof(*rp)); return 0; } else { mutex_exit(&p->p_auxlock); return ESRCH; } } #endif /* defined(__HAVE_RAS) */ /*ARGSUSED*/ int sys_rasctl(struct lwp *l, const struct sys_rasctl_args *uap, register_t *retval) { #if defined(__HAVE_RAS) /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(int) op; } */ void *addr; size_t len; int op; int error; /* * first, extract syscall args from the uap. 
*/ addr = (void *)SCARG(uap, addr); len = (size_t)SCARG(uap, len); op = SCARG(uap, op); DPRINTF(("sys_rasctl: p=%p addr=%p, len=%ld, op=0x%x\n", curproc, addr, (long)len, op)); switch (op) { case RAS_INSTALL: error = ras_install(addr, len); break; case RAS_PURGE: error = ras_purge(addr, len); break; case RAS_PURGE_ALL: error = ras_purgeall(); break; default: error = EINVAL; break; } return (error); #else return (EOPNOTSUPP); #endif }
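/*
 * Illustrative userland sketch -- not part of the original source.  It
 * shows how a process installs a restartable atomic sequence through
 * rasctl(2), the syscall handled by sys_rasctl() above.  The start/end
 * labels here are hypothetical; in practice they bracket a short
 * sequence (typically a load-modify-store) emitted in assembly.
 */
#if 0	/* example only */
#include <sys/types.h>
#include <sys/ras.h>

extern char example_ras_start[], example_ras_end[];	/* asm labels */

static int
example_install_ras(void)
{
	size_t len = (size_t)(example_ras_end - example_ras_start);

	/* ras_install() rejects zero-length, out-of-range or overlapping
	 * sequences, and at most ras_per_proc sequences per process. */
	return rasctl(example_ras_start, len, RAS_INSTALL);
}
#endif	/* example only */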
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 /* $NetBSD: uipc_syscalls_30.c,v 1.4 2019/01/27 02:08:39 pgoyette Exp $ */ /* written by Pavel Cahyna, 2006. Public domain. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uipc_syscalls_30.c,v 1.4 2019/01/27 02:08:39 pgoyette Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif /* * System call interface to the socket abstraction. */ #include <sys/param.h> #include <sys/kernel.h> #include <sys/msg.h> #include <sys/sysctl.h> #include <sys/mount.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <sys/errno.h> #include <compat/common/compat_mod.h> static const struct syscall_package uipc_syscalls_30_syscalls[] = { { SYS_compat_30_socket, 0, (sy_call_t *)compat_30_sys_socket }, { 0, 0, NULL} }; int compat_30_sys_socket(struct lwp *l, const struct compat_30_sys_socket_args *uap, register_t *retval) { int error; error = sys___socket30(l, (const void *)uap, retval); if (error == EAFNOSUPPORT) error = EPROTONOSUPPORT; return (error); } int uipc_syscalls_30_init(void) { return syscall_establish(NULL, uipc_syscalls_30_syscalls); } int uipc_syscalls_30_fini(void) { return syscall_disestablish(NULL, uipc_syscalls_30_syscalls); }
9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 /* $NetBSD: vnd_50.c,v 1.5 2019/12/12 02:15:42 pgoyette Exp $ */ /*- * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vn.c 1.13 94/04/02$ * * @(#)vn.c 8.9 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vnd_50.c,v 1.5 2019/12/12 02:15:42 pgoyette Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/errno.h> #include <sys/malloc.h> #include <sys/ioctl.h> #include <sys/device.h> #include <sys/disk.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/file.h> #include <sys/uio.h> #include <sys/conf.h> #include <sys/compat_stub.h> #include <net/zlib.h> #include <dev/vndvar.h> #include <compat/common/compat_mod.h> static int compat_50_vndioctl(u_long, struct lwp *, void *, int, struct vattr *, int (*)(struct lwp *, void *, int, struct vattr *)); static int compat_50_vndioctl(u_long cmd, struct lwp *l, void *data, int unit, struct vattr *vattr_p, int (*get)(struct lwp *, void *, int, struct vattr *)) { struct vnd_user50 *vnu = data; int error; if (cmd != VNDIOCGET50) return EPASSTHROUGH; error = (*get)(l, data, unit, vattr_p); if (error != 0) return error; vnu->vnu_dev = vattr_p->va_fsid; vnu->vnu_ino = vattr_p->va_fileid; return 0; } void vnd_50_init(void) { MODULE_HOOK_SET(compat_vndioctl_50_hook, compat_50_vndioctl); } void vnd_50_fini(void) { MODULE_HOOK_UNSET(compat_vndioctl_50_hook); }
20 20 2 4 18 4 3 4 22 1 1 21 6 16 9 32 6 37 37 18 18 18 16 5 39 4 1 3 3 1 3 17 11 29 3 1 9 15 1 10 21 15 8 13 13 13 13 13 13 11 11 11 11 11 2 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 /* $NetBSD: subr_time.c,v 1.38 2023/07/08 20:02:10 riastradh Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 * @(#)kern_time.c 8.4 (Berkeley) 5/26/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_time.c,v 1.38 2023/07/08 20:02:10 riastradh Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/kauth.h> #include <sys/lwp.h> #include <sys/timex.h> #include <sys/time.h> #include <sys/timetc.h> #include <sys/intr.h> /* * Compute number of hz until specified time. Used to compute second * argument to callout_reset() from an absolute time. */ int tvhzto(const struct timeval *tvp) { struct timeval now, tv; tv = *tvp; /* Don't modify original tvp. */ getmicrotime(&now); timersub(&tv, &now, &tv); return tvtohz(&tv); } /* * Compute number of ticks in the specified amount of time. */ int tvtohz(const struct timeval *tv) { unsigned long ticks; long sec, usec; /* * If the number of usecs in the whole seconds part of the time * difference fits in a long, then the total number of usecs will * fit in an unsigned long. Compute the total and convert it to * ticks, rounding up and adding 1 to allow for the current tick * to expire. Rounding also depends on unsigned long arithmetic * to avoid overflow. * * Otherwise, if the number of ticks in the whole seconds part of * the time difference fits in a long, then convert the parts to * ticks separately and add, using similar rounding methods and * overflow avoidance. This method would work in the previous * case, but it is slightly slower and assumes that hz is integral. * * Otherwise, round the time difference down to the maximum * representable value. * * If ints are 32-bit, then the maximum value for any timeout in * 10ms ticks is 248 days. */ sec = tv->tv_sec; usec = tv->tv_usec; KASSERT(usec >= 0); KASSERT(usec < 1000000); /* catch overflows in conversion time_t->int */ if (tv->tv_sec > INT_MAX) return INT_MAX; if (tv->tv_sec < 0) return 0; if (sec < 0 || (sec == 0 && usec == 0)) { /* * Would expire now or in the past. Return 0 ticks. * This is different from the legacy tvhzto() interface, * and callers need to check for it. */ ticks = 0; } else if (sec <= (LONG_MAX / 1000000)) ticks = (((sec * 1000000) + (unsigned long)usec + (tick - 1)) / tick) + 1; else if (sec <= (LONG_MAX / hz)) ticks = (sec * hz) + (((unsigned long)usec + (tick - 1)) / tick) + 1; else ticks = LONG_MAX; if (ticks > INT_MAX) ticks = INT_MAX; return ((int)ticks); } int tshzto(const struct timespec *tsp) { struct timespec now, ts; ts = *tsp; /* Don't modify original tsp. */ getnanotime(&now); timespecsub(&ts, &now, &ts); return tstohz(&ts); } int tshztoup(const struct timespec *tsp) { struct timespec now, ts; ts = *tsp; /* Don't modify original tsp. */ getnanouptime(&now); timespecsub(&ts, &now, &ts); return tstohz(&ts); } /* * Compute number of ticks in the specified amount of time. */ int tstohz(const struct timespec *ts) { struct timeval tv; /* * usec has great enough resolution for hz, so convert to a * timeval and use tvtohz() above. 
*/ TIMESPEC_TO_TIMEVAL(&tv, ts); return tvtohz(&tv); } /* * Check that a proposed value to load into the .it_value or * .it_interval part of an interval timer is acceptable, and * fix it to have at least minimal value (i.e. if it is less * than the resolution of the clock, round it up.). We don't * timeout the 0,0 value because this means to disable the * timer or the interval. */ int itimerfix(struct timeval *tv) { if (tv->tv_usec < 0 || tv->tv_usec >= 1000000) return EINVAL; if (tv->tv_sec < 0) return ETIMEDOUT; if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick) tv->tv_usec = tick; return 0; } int itimespecfix(struct timespec *ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return EINVAL; if (ts->tv_sec < 0) return ETIMEDOUT; if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000) ts->tv_nsec = tick * 1000; return 0; } int inittimeleft(struct timespec *ts, struct timespec *sleepts) { if (itimespecfix(ts)) { return -1; } KASSERT(ts->tv_sec >= 0); getnanouptime(sleepts); return 0; } int gettimeleft(struct timespec *ts, struct timespec *sleepts) { struct timespec now, sleptts; KASSERT(ts->tv_sec >= 0); /* * Reduce ts by elapsed time based on monotonic time scale. */ getnanouptime(&now); KASSERT(timespeccmp(sleepts, &now, <=)); timespecsub(&now, sleepts, &sleptts); *sleepts = now; if (timespeccmp(ts, &sleptts, <=)) { /* timed out */ timespecclear(ts); return 0; } timespecsub(ts, &sleptts, ts); return tstohz(ts); } void clock_timeleft(clockid_t clockid, struct timespec *ts, struct timespec *sleepts) { struct timespec sleptts; clock_gettime1(clockid, &sleptts); timespecadd(ts, sleepts, ts); timespecsub(ts, &sleptts, ts); *sleepts = sleptts; } int clock_gettime1(clockid_t clock_id, struct timespec *ts) { int error; struct proc *p; #define CPUCLOCK_ID_MASK (~(CLOCK_THREAD_CPUTIME_ID|CLOCK_PROCESS_CPUTIME_ID)) if (clock_id & CLOCK_PROCESS_CPUTIME_ID) { pid_t pid = clock_id & CPUCLOCK_ID_MASK; struct timeval cputime; mutex_enter(&proc_lock); p = pid == 0 ? curproc : proc_find(pid); if (p == NULL) { mutex_exit(&proc_lock); return ESRCH; } mutex_enter(p->p_lock); calcru(p, /*usertime*/NULL, /*systime*/NULL, /*intrtime*/NULL, &cputime); mutex_exit(p->p_lock); mutex_exit(&proc_lock); // XXX: Perhaps create a special kauth type error = kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_PTRACE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL); if (error) return error; TIMEVAL_TO_TIMESPEC(&cputime, ts); return 0; } else if (clock_id & CLOCK_THREAD_CPUTIME_ID) { struct lwp *l; lwpid_t lid = clock_id & CPUCLOCK_ID_MASK; struct bintime tm = {0, 0}; p = curproc; mutex_enter(p->p_lock); l = lid == 0 ? curlwp : lwp_find(p, lid); if (l == NULL) { mutex_exit(p->p_lock); return ESRCH; } addrulwp(l, &tm); mutex_exit(p->p_lock); bintime2timespec(&tm, ts); return 0; } switch (clock_id) { case CLOCK_REALTIME: nanotime(ts); break; case CLOCK_MONOTONIC: nanouptime(ts); break; default: return EINVAL; } return 0; } /* * Calculate delta and convert from struct timespec to the ticks. 
*/ int ts2timo(clockid_t clock_id, int flags, struct timespec *ts, int *timo, struct timespec *start) { int error; struct timespec tsd; if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000L) return EINVAL; if ((flags & TIMER_ABSTIME) != 0 || start != NULL) { error = clock_gettime1(clock_id, &tsd); if (error != 0) return error; if (start != NULL) *start = tsd; } if ((flags & TIMER_ABSTIME) != 0) { if (!timespecsubok(ts, &tsd)) return EINVAL; timespecsub(ts, &tsd, ts); } error = itimespecfix(ts); if (error != 0) return error; if (ts->tv_sec == 0 && ts->tv_nsec == 0) return ETIMEDOUT; *timo = tstohz(ts); KASSERT(*timo > 0); return 0; } bool timespecaddok(const struct timespec *tsp, const struct timespec *usp) { enum { TIME_MIN = __type_min(time_t), TIME_MAX = __type_max(time_t) }; time_t a = tsp->tv_sec; time_t b = usp->tv_sec; bool carry; /* * Caller is responsible for guaranteeing valid timespec * inputs. Any user-controlled inputs must be validated or * adjusted. */ KASSERT(tsp->tv_nsec >= 0); KASSERT(usp->tv_nsec >= 0); KASSERT(tsp->tv_nsec < 1000000000L); KASSERT(usp->tv_nsec < 1000000000L); CTASSERT(1000000000L <= __type_max(long) - 1000000000L); /* * Fail if a + b + carry overflows TIME_MAX, or if a + b * overflows TIME_MIN because timespecadd adds the carry after * computing a + b. * * Break it into two mutually exclusive and exhaustive cases: * I. a >= 0 * II. a < 0 */ carry = (tsp->tv_nsec + usp->tv_nsec >= 1000000000L); if (a >= 0) { /* * Case I: a >= 0. If b < 0, then b + 1 <= 0, so * * a + b + 1 <= a + 0 <= TIME_MAX, * * and * * a + b >= 0 + b = b >= TIME_MIN, * * so this can't overflow. * * If b >= 0, then a + b + carry >= a + b >= 0, so * negative results and thus results below TIME_MIN are * impossible; we need only avoid * * a + b + carry > TIME_MAX, * * which we will do by rejecting if * * b > TIME_MAX - a - carry, * * which in turn is incidentally always false if b < 0 * so we don't need extra logic to discriminate on the * b >= 0 and b < 0 cases. * * Since 0 <= a <= TIME_MAX, we know * * 0 <= TIME_MAX - a <= TIME_MAX, * * and hence * * -1 <= TIME_MAX - a - 1 < TIME_MAX. * * So we can compute TIME_MAX - a - carry (i.e., either * TIME_MAX - a or TIME_MAX - a - 1) safely without * overflow. */ if (b > TIME_MAX - a - carry) return false; } else { /* * Case II: a < 0. If b >= 0, then since a + 1 <= 0, * we have * * a + b + 1 <= b <= TIME_MAX, * * and * * a + b >= a >= TIME_MIN, * * so this can't overflow. * * If b < 0, then the intermediate a + b is negative * and the outcome a + b + 1 is nonpositive, so we need * only avoid * * a + b < TIME_MIN, * * which we will do by rejecting if * * a < TIME_MIN - b. * * (Reminder: The carry is added afterward in * timespecadd, so to avoid overflow it is not enough * to merely reject a + b + carry < TIME_MIN.) * * It is safe to compute the difference TIME_MIN - b * because b is negative, so the result lies in * (TIME_MIN, 0]. */ if (b < 0 && a < TIME_MIN - b) return false; } return true; } bool timespecsubok(const struct timespec *tsp, const struct timespec *usp) { enum { TIME_MIN = __type_min(time_t), TIME_MAX = __type_max(time_t) }; time_t a = tsp->tv_sec, b = usp->tv_sec; bool borrow; /* * Caller is responsible for guaranteeing valid timespec * inputs. Any user-controlled inputs must be validated or * adjusted. 
*/ KASSERT(tsp->tv_nsec >= 0); KASSERT(usp->tv_nsec >= 0); KASSERT(tsp->tv_nsec < 1000000000L); KASSERT(usp->tv_nsec < 1000000000L); CTASSERT(1000000000L <= __type_max(long) - 1000000000L); /* * Fail if a - b - borrow overflows TIME_MIN, or if a - b * overflows TIME_MAX because timespecsub subtracts the borrow * after computing a - b. * * Break it into two mutually exclusive and exhaustive cases: * I. a < 0 * II. a >= 0 */ borrow = (tsp->tv_nsec - usp->tv_nsec < 0); if (a < 0) { /* * Case I: a < 0. If b < 0, then -b - 1 >= 0, so * * a - b - 1 >= a + 0 >= TIME_MIN, * * and, since a <= -1, provided that TIME_MIN <= * -TIME_MAX - 1 so that TIME_MAX <= -TIME_MIN - 1 (in * fact, equality holds, under the assumption of * two's-complement arithmetic), * * a - b <= -1 - b = -b - 1 <= TIME_MAX, * * so this can't overflow. */ CTASSERT(TIME_MIN <= -TIME_MAX - 1); /* * If b >= 0, then a - b - borrow <= a - b < 0, so * positive results and thus results above TIME_MAX are * impossible; we need only avoid * * a - b - borrow < TIME_MIN, * * which we will do by rejecting if * * a < TIME_MIN + b + borrow. * * The right-hand side is safe to evaluate for any * values of b and borrow as long as TIME_MIN + * TIME_MAX + 1 <= TIME_MAX, i.e., TIME_MIN <= -1. * (Note: If time_t were unsigned, this would fail!) * * Note: Unlike Case I in timespecaddok, this criterion * does not work for b < 0, nor can the roles of a and * b in the inequality be reversed (e.g., -b < TIME_MIN * - a + borrow) without extra cases like checking for * b = TEST_MIN. */ CTASSERT(TIME_MIN < -1); if (b >= 0 && a < TIME_MIN + b + borrow) return false; } else { /* * Case II: a >= 0. If b >= 0, then * * a - b <= a <= TIME_MAX, * * and, provided TIME_MIN <= -TIME_MAX - 1 (in fact, * equality holds, under the assumption of * two's-complement arithmetic) * * a - b - 1 >= -b - 1 >= -TIME_MAX - 1 >= TIME_MIN, * * so this can't overflow. */ CTASSERT(TIME_MIN <= -TIME_MAX - 1); /* * If b < 0, then a - b >= a >= 0, so negative results * and thus results below TIME_MIN are impossible; we * need only avoid * * a - b > TIME_MAX, * * which we will do by rejecting if * * a > TIME_MAX + b. * * (Reminder: The borrow is subtracted afterward in * timespecsub, so to avoid overflow it is not enough * to merely reject a - b - borrow > TIME_MAX.) * * It is safe to compute the sum TIME_MAX + b because b * is negative, so the result lies in [0, TIME_MAX). */ if (b < 0 && a > TIME_MAX + b) return false; } return true; }
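/*
 * Illustrative sketch -- not part of the original source.  It shows the
 * intended pairing of timespecaddok() with the timespecadd() macro from
 * <sys/time.h>: the overflow check runs first, and only then is the
 * addition (including the tv_nsec carry) actually performed.
 */
#if 0	/* example only */
static bool
example_checked_timespecadd(const struct timespec *a,
    const struct timespec *b, struct timespec *sum)
{
	if (!timespecaddok(a, b))
		return false;		/* a + b would overflow time_t */
	timespecadd(a, b, sum);		/* tv_nsec carry folded in here */
	return true;
}
#endif	/* example only */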
1 1 5 1 1 3 1 2 1 1 13 13 3 9 1 11 11 11 11 11 191 186 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 /* $NetBSD: procfs_vfsops.c,v 1.114 2024/01/17 10:21:01 hannken Exp $ */ /* * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_vfsops.c 8.7 (Berkeley) 5/10/95 */ /* * Copyright (c) 1993 Jan-Simon Pendry * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)procfs_vfsops.c 8.7 (Berkeley) 5/10/95 */ /* * procfs VFS interface */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: procfs_vfsops.c,v 1.114 2024/01/17 10:21:01 hannken Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/atomic.h> #include <sys/buf.h> #include <sys/dirent.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/fstrans.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/module.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/signalvar.h> #include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/vnode.h> #include <miscfs/genfs/genfs.h> #include <miscfs/procfs/procfs.h> #include <uvm/uvm_extern.h> /* for PAGE_SIZE */ MODULE(MODULE_CLASS_VFS, procfs, "ptrace_common"); VFS_PROTOS(procfs); #define PROCFS_HASHSIZE 256 #define PROCFS_EXEC_HOOK ((void *)1) #define PROCFS_EXIT_HOOK ((void *)2) static kauth_listener_t procfs_listener; static void *procfs_exechook; static void *procfs_exithook; LIST_HEAD(hashhead, pfsnode); static u_long procfs_hashmask; static struct hashhead *procfs_hashtab; static kmutex_t procfs_hashlock; static struct hashhead * procfs_hashhead(pid_t pid) { return &procfs_hashtab[pid & procfs_hashmask]; } void procfs_hashrem(struct pfsnode *pfs) { mutex_enter(&procfs_hashlock); LIST_REMOVE(pfs, pfs_hash); mutex_exit(&procfs_hashlock); } /* * VFS Operations. * * mount system call */ /* ARGSUSED */ int procfs_mount( struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; struct procfsmount *pmnt; struct procfs_args *args = data; int error; if (args == NULL) return EINVAL; if (UIO_MX & (UIO_MX-1)) { log(LOG_ERR, "procfs: invalid directory entry size"); return (EINVAL); } if (mp->mnt_flag & MNT_GETARGS) { if (*data_len < sizeof *args) return EINVAL; pmnt = VFSTOPROC(mp); if (pmnt == NULL) return EIO; args->version = PROCFS_ARGSVERSION; args->flags = pmnt->pmnt_flags; *data_len = sizeof *args; return 0; } if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); if (*data_len >= sizeof *args && args->version != PROCFS_ARGSVERSION) return EINVAL; pmnt = kmem_zalloc(sizeof(struct procfsmount), KM_SLEEP); mp->mnt_stat.f_namemax = PROCFS_MAXNAMLEN; mp->mnt_flag |= MNT_LOCAL; mp->mnt_data = pmnt; vfs_getnewfsid(mp); error = set_statvfs_info(path, UIO_USERSPACE, "procfs", UIO_SYSSPACE, mp->mnt_op->vfs_name, mp, l); if (*data_len >= sizeof *args) pmnt->pmnt_flags = args->flags; else pmnt->pmnt_flags = 0; mp->mnt_iflag |= IMNT_MPSAFE | IMNT_SHRLOOKUP; return error; } /* * unmount system call */ int procfs_unmount(struct mount *mp, int mntflags) { int error; int flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if ((error = vflush(mp, 0, flags)) != 0) return (error); kmem_free(mp->mnt_data, sizeof(struct procfsmount)); mp->mnt_data = NULL; return 0; } int procfs_root(struct mount *mp, int lktype, struct vnode **vpp) { int error; error = procfs_allocvp(mp, vpp, 0, PFSroot, -1); if (error == 0) { error = vn_lock(*vpp, lktype); if (error != 0) { vrele(*vpp); *vpp = NULL; } } return error; } /* ARGSUSED */ int procfs_start(struct mount *mp, int flags) { return (0); } /* * Get file system statistics. 
*/ int procfs_statvfs(struct mount *mp, struct statvfs *sbp) { genfs_statvfs(mp, sbp); sbp->f_bsize = PAGE_SIZE; sbp->f_frsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_files = maxproc; /* approx */ sbp->f_ffree = maxproc - atomic_load_relaxed(&nprocs); /* approx */ sbp->f_favail = maxproc - atomic_load_relaxed(&nprocs); /* approx */ return (0); } /*ARGSUSED*/ int procfs_sync( struct mount *mp, int waitfor, kauth_cred_t uc) { return (0); } /*ARGSUSED*/ int procfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { return (EOPNOTSUPP); } int procfs_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { int error; struct pfskey pfskey; struct pfsnode *pfs; KASSERT(key_len == sizeof(pfskey)); memcpy(&pfskey, key, key_len); pfs = kmem_alloc(sizeof(*pfs), KM_SLEEP); pfs->pfs_pid = pfskey.pk_pid; pfs->pfs_type = pfskey.pk_type; pfs->pfs_fd = pfskey.pk_fd; pfs->pfs_vnode = vp; pfs->pfs_mount = mp; pfs->pfs_flags = 0; pfs->pfs_fileno = PROCFS_FILENO(pfs->pfs_pid, pfs->pfs_type, pfs->pfs_fd); vp->v_tag = VT_PROCFS; vp->v_op = procfs_vnodeop_p; vp->v_data = pfs; switch (pfs->pfs_type) { case PFSroot: /* /proc = dr-xr-xr-x */ vp->v_vflag |= VV_ROOT; /*FALLTHROUGH*/ case PFSproc: /* /proc/N = dr-xr-xr-x */ pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; vp->v_type = VDIR; break; case PFStask: /* /proc/N/task = dr-xr-xr-x */ if (pfs->pfs_fd == -1) { pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP| S_IROTH|S_IXOTH; vp->v_type = VDIR; break; } /*FALLTHROUGH*/ case PFScurproc: /* /proc/curproc = lr-xr-xr-x */ case PFSself: /* /proc/self = lr-xr-xr-x */ case PFScwd: /* /proc/N/cwd = lr-xr-xr-x */ case PFSchroot: /* /proc/N/chroot = lr-xr-xr-x */ case PFSexe: /* /proc/N/exe = lr-xr-xr-x */ pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; vp->v_type = VLNK; break; case PFSfd: if (pfs->pfs_fd == -1) { /* /proc/N/fd = dr-x------ */ pfs->pfs_mode = S_IRUSR|S_IXUSR; vp->v_type = VDIR; } else { /* /proc/N/fd/M = [ps-]rw------- */ file_t *fp; vnode_t *vxp; struct proc *p; mutex_enter(&proc_lock); p = procfs_proc_find(mp, pfs->pfs_pid); mutex_exit(&proc_lock); if (p == NULL) { error = ENOENT; goto bad; } KASSERT(rw_read_held(&p->p_reflock)); if ((fp = fd_getfile2(p, pfs->pfs_fd)) == NULL) { error = EBADF; goto bad; } pfs->pfs_mode = S_IRUSR|S_IWUSR; switch (fp->f_type) { case DTYPE_VNODE: vxp = fp->f_vnode; /* * We make symlinks for directories * to avoid cycles. 
*/ if (vxp->v_type == VDIR || procfs_proc_is_linux_compat()) goto symlink; vp->v_type = vxp->v_type; break; case DTYPE_PIPE: vp->v_type = VFIFO; break; case DTYPE_SOCKET: vp->v_type = VSOCK; break; case DTYPE_KQUEUE: case DTYPE_MISC: case DTYPE_SEM: symlink: pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP| S_IXGRP|S_IROTH|S_IXOTH; vp->v_type = VLNK; break; default: error = EOPNOTSUPP; closef(fp); goto bad; } closef(fp); } break; case PFSfile: /* /proc/N/file = -rw------- */ case PFSmem: /* /proc/N/mem = -rw------- */ case PFSregs: /* /proc/N/regs = -rw------- */ case PFSfpregs: /* /proc/N/fpregs = -rw------- */ pfs->pfs_mode = S_IRUSR|S_IWUSR; vp->v_type = VREG; break; case PFSnote: /* /proc/N/note = --w------ */ case PFSnotepg: /* /proc/N/notepg = --w------ */ pfs->pfs_mode = S_IWUSR; vp->v_type = VREG; break; case PFSmap: /* /proc/N/map = -r-------- */ case PFSmaps: /* /proc/N/maps = -r-------- */ case PFSauxv: /* /proc/N/auxv = -r-------- */ case PFSenviron: /* /proc/N/environ = -r-------- */ pfs->pfs_mode = S_IRUSR; vp->v_type = VREG; break; case PFSstatus: /* /proc/N/status = -r--r--r-- */ case PFSstat: /* /proc/N/stat = -r--r--r-- */ case PFScmdline: /* /proc/N/cmdline = -r--r--r-- */ case PFSemul: /* /proc/N/emul = -r--r--r-- */ case PFSmeminfo: /* /proc/meminfo = -r--r--r-- */ case PFScpustat: /* /proc/stat = -r--r--r-- */ case PFSdevices: /* /proc/devices = -r--r--r-- */ case PFScpuinfo: /* /proc/cpuinfo = -r--r--r-- */ case PFSuptime: /* /proc/uptime = -r--r--r-- */ case PFSmounts: /* /proc/mounts = -r--r--r-- */ case PFSloadavg: /* /proc/loadavg = -r--r--r-- */ case PFSstatm: /* /proc/N/statm = -r--r--r-- */ case PFSversion: /* /proc/version = -r--r--r-- */ case PFSlimit: /* /proc/limit = -r--r--r-- */ pfs->pfs_mode = S_IRUSR|S_IRGRP|S_IROTH; vp->v_type = VREG; break; #ifdef __HAVE_PROCFS_MACHDEP PROCFS_MACHDEP_NODETYPE_CASES procfs_machdep_allocvp(vp); break; #endif default: panic("procfs_allocvp"); } mutex_enter(&procfs_hashlock); LIST_INSERT_HEAD(procfs_hashhead(pfs->pfs_pid), pfs, pfs_hash); mutex_exit(&procfs_hashlock); uvm_vnp_setsize(vp, 0); *new_key = &pfs->pfs_key; return 0; bad: vp->v_tag =VT_NON; vp->v_type = VNON; vp->v_op = NULL; vp->v_data = NULL; kmem_free(pfs, sizeof(*pfs)); return error; } void procfs_init(void) { } void procfs_reinit(void) { } void procfs_done(void) { } extern const struct vnodeopv_desc procfs_vnodeop_opv_desc; const struct vnodeopv_desc * const procfs_vnodeopv_descs[] = { &procfs_vnodeop_opv_desc, NULL, }; struct vfsops procfs_vfsops = { .vfs_name = MOUNT_PROCFS, .vfs_min_mount_data = sizeof (struct procfs_args), .vfs_mount = procfs_mount, .vfs_start = procfs_start, .vfs_unmount = procfs_unmount, .vfs_root = procfs_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = procfs_statvfs, .vfs_sync = procfs_sync, .vfs_vget = procfs_vget, .vfs_loadvnode = procfs_loadvnode, .vfs_fhtovp = (void *)eopnotsupp, .vfs_vptofh = (void *)eopnotsupp, .vfs_init = procfs_init, .vfs_reinit = procfs_reinit, .vfs_done = procfs_done, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = procfs_vnodeopv_descs }; static void procfs_exechook_cb(struct proc *p, void *arg) { struct hashhead *head; struct pfsnode *pfs; struct mount *mp; struct pfskey key; struct vnode *vp; int error; if (arg == PROCFS_EXEC_HOOK && !(p->p_flag & PK_SUGID)) return; head = procfs_hashhead(p->p_pid); 
again: mutex_enter(&procfs_hashlock); LIST_FOREACH(pfs, head, pfs_hash) { if (pfs->pfs_pid != p->p_pid) continue; mp = pfs->pfs_mount; key = pfs->pfs_key; vfs_ref(mp); mutex_exit(&procfs_hashlock); error = vcache_get(mp, &key, sizeof(key), &vp); vfs_rele(mp); if (error != 0) goto again; if (vrecycle(vp)) goto again; do { error = vfs_suspend(mp, 0); } while (error == EINTR || error == ERESTART); vgone(vp); if (error == 0) vfs_resume(mp); goto again; } mutex_exit(&procfs_hashlock); } static int procfs_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { struct proc *p; struct pfsnode *pfs; int result; result = KAUTH_RESULT_DEFER; p = arg0; pfs = arg1; if (action != KAUTH_PROCESS_PROCFS) return result; switch (pfs->pfs_type) { case PFSregs: case PFSfpregs: case PFSmem: if (kauth_cred_getuid(cred) != kauth_cred_getuid(p->p_cred) || ISSET(p->p_flag, PK_SUGID)) break; /*FALLTHROUGH*/ default: result = KAUTH_RESULT_ALLOW; break; } return result; } SYSCTL_SETUP(procfs_sysctl_setup, "procfs sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "procfs", SYSCTL_DESCR("Process file system"), NULL, 0, NULL, 0, CTL_VFS, 12, CTL_EOL); /* * XXX the "12" above could be dynamic, thereby eliminating * one more instance of the "number to vfs" mapping problem, * but "12" is the order as taken from sys/mount.h */ } static int procfs_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&procfs_vfsops); if (error != 0) break; procfs_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS, procfs_listener_cb, NULL); procfs_exechook = exechook_establish(procfs_exechook_cb, PROCFS_EXEC_HOOK); procfs_exithook = exithook_establish(procfs_exechook_cb, PROCFS_EXIT_HOOK); mutex_init(&procfs_hashlock, MUTEX_DEFAULT, IPL_NONE); procfs_hashtab = hashinit(PROCFS_HASHSIZE, HASH_LIST, true, &procfs_hashmask); break; case MODULE_CMD_FINI: error = vfs_detach(&procfs_vfsops); if (error != 0) break; kauth_unlisten_scope(procfs_listener); exechook_disestablish(procfs_exechook); exithook_disestablish(procfs_exithook); mutex_destroy(&procfs_hashlock); hashdone(procfs_hashtab, HASH_LIST, procfs_hashmask); break; default: error = ENOTTY; break; } return (error); }
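Tying the mount path together: procfs_mount() above rejects a full-sized argument block whose version is not PROCFS_ARGSVERSION, and otherwise records args->flags in the per-mount procfsmount. The fragment below is a hedged userland sketch of such a caller, loosely modelled on mount_procfs(8); the include of <miscfs/procfs/procfs.h> from userland and the choice of mount point are assumptions, and error handling is minimal.

#include <sys/types.h>
#include <sys/mount.h>

#include <err.h>
#include <string.h>

#include <miscfs/procfs/procfs.h>	/* struct procfs_args, PROCFS_ARGSVERSION (assumed visible) */

int
main(void)
{
	struct procfs_args args;

	memset(&args, 0, sizeof(args));
	args.version = PROCFS_ARGSVERSION;	/* checked by procfs_mount() */
	args.flags = 0;				/* becomes pmnt_flags */

	/* NetBSD mount(2): type, mount point, flags, data, data length. */
	if (mount(MOUNT_PROCFS, "/proc", 0, &args, sizeof(args)) == -1)
		err(1, "mount procfs on /proc");
	return 0;
}

With data_len equal to sizeof(struct procfs_args) and a matching version, procfs_mount() copies args->flags into pmnt_flags; a shorter argument block is accepted too and simply leaves pmnt_flags at zero.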
/* $NetBSD: sys_module.c,v 1.30 2022/05/24 06:20:05 andvar Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * System calls relating to loadable modules. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_module.c,v 1.30 2022/05/24 06:20:05 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_modular.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/namei.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/kobj.h> #include <sys/module.h> #include <sys/syscall.h> #include <sys/syscallargs.h> #include <sys/compat_stub.h> /* * Arbitrary limit to avoid DoS for excessive memory allocation.
*/ #define MAXPROPSLEN 4096 int handle_modctl_load(const char *ml_filename, int ml_flags, const char *ml_props, size_t ml_propslen) { char *path; char *props; int error; prop_dictionary_t dict; size_t propslen = 0; if ((ml_props != NULL && ml_propslen == 0) || (ml_props == NULL && ml_propslen > 0)) { return EINVAL; } path = PNBUF_GET(); error = copyinstr(ml_filename, path, MAXPATHLEN, NULL); if (error != 0) goto out1; if (ml_props != NULL) { if (ml_propslen > MAXPROPSLEN) { error = ENOMEM; goto out1; } propslen = ml_propslen + 1; props = kmem_alloc(propslen, KM_SLEEP); error = copyinstr(ml_props, props, propslen, NULL); if (error != 0) goto out2; dict = prop_dictionary_internalize(props); if (dict == NULL) { error = EINVAL; goto out2; } } else { dict = NULL; props = NULL; } error = module_load(path, ml_flags, dict, MODULE_CLASS_ANY); if (dict != NULL) { prop_object_release(dict); } out2: if (props != NULL) { kmem_free(props, propslen); } out1: PNBUF_PUT(path); return error; } static int handle_modctl_stat(struct iovec *iov, void *arg) { int ms_cnt; modstat_t *ms, *mso; size_t ms_len; char *req, *reqo; size_t req_len; char *out_p; size_t out_s; modinfo_t *mi; module_t *mod; vaddr_t addr; size_t size; size_t used; int off; int error; bool stataddr; /* If not privileged, don't expose kernel addresses. */ error = kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE, curproc, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_KPTR), NULL, NULL); stataddr = (error == 0); kernconfig_lock(); ms_cnt = 0; req_len = 1; /* * Count up the number of modstat_t needed, and total size of * require_module lists on both active and built-in lists */ TAILQ_FOREACH(mod, &module_list, mod_chain) { ms_cnt++; mi = mod->mod_info; if (mi->mi_required != NULL) { req_len += strlen(mi->mi_required) + 1; } } TAILQ_FOREACH(mod, &module_builtins, mod_chain) { ms_cnt++; mi = mod->mod_info; if (mi->mi_required != NULL) { req_len += strlen(mi->mi_required) + 1; } } /* Allocate internal buffers to hold all the output data */ ms_len = ms_cnt * sizeof(modstat_t); ms = kmem_zalloc(ms_len, KM_SLEEP); req = kmem_zalloc(req_len, KM_SLEEP); mso = ms; reqo = req++; off = 1; /* * Load data into our internal buffers for both active and * built-in module lists */ TAILQ_FOREACH(mod, &module_list, mod_chain) { mi = mod->mod_info; strlcpy(ms->ms_name, mi->mi_name, sizeof(ms->ms_name)); if (mi->mi_required != NULL) { ms->ms_reqoffset = off; used = strlcpy(req, mi->mi_required, req_len - off); KASSERTMSG(used < req_len - off, "reqlist grew!"); off += used + 1; req += used + 1; } else ms->ms_reqoffset = 0; if (mod->mod_kobj != NULL && stataddr) { kobj_stat(mod->mod_kobj, &addr, &size); ms->ms_addr = addr; ms->ms_size = size; } ms->ms_class = mi->mi_class; ms->ms_refcnt = mod->mod_refcnt; ms->ms_source = mod->mod_source; ms->ms_flags = mod->mod_flags; ms++; } TAILQ_FOREACH(mod, &module_builtins, mod_chain) { mi = mod->mod_info; strlcpy(ms->ms_name, mi->mi_name, sizeof(ms->ms_name)); if (mi->mi_required != NULL) { ms->ms_reqoffset = off; used = strlcpy(req, mi->mi_required, req_len - off); KASSERTMSG(used < req_len - off, "reqlist grew!"); off += used + 1; req += used + 1; } else ms->ms_reqoffset = 0; if (mod->mod_kobj != NULL && stataddr) { kobj_stat(mod->mod_kobj, &addr, &size); ms->ms_addr = addr; ms->ms_size = size; } ms->ms_class = mi->mi_class; ms->ms_refcnt = -1; KASSERT(mod->mod_source == MODULE_SOURCE_KERNEL); ms->ms_source = mod->mod_source; ms++; } kernconfig_unlock(); /* * Now copyout our internal buffers back to userland */ out_p = 
iov->iov_base; out_s = iov->iov_len; size = sizeof(ms_cnt); /* Copy out the count of modstat_t */ if (out_s) { size = uimin(sizeof(ms_cnt), out_s); error = copyout(&ms_cnt, out_p, size); out_p += size; out_s -= size; } /* Copy out the modstat_t array */ if (out_s && error == 0) { size = uimin(ms_len, out_s); error = copyout(mso, out_p, size); out_p += size; out_s -= size; } /* Copy out the "required" strings */ if (out_s && error == 0) { size = uimin(req_len, out_s); error = copyout(reqo, out_p, size); out_p += size; out_s -= size; } kmem_free(mso, ms_len); kmem_free(reqo, req_len); /* Finally, update the userland copy of the iovec's length */ if (error == 0) { iov->iov_len = ms_len + req_len + sizeof(ms_cnt); error = copyout(iov, arg, sizeof(*iov)); } return error; } int sys_modctl(struct lwp *l, const struct sys_modctl_args *uap, register_t *retval) { /* { syscallarg(int) cmd; syscallarg(void *) arg; } */ char buf[MAXMODNAME]; struct iovec iov; modctl_load_t ml; int error; void *arg; #ifdef MODULAR uintptr_t loadtype; #endif arg = SCARG(uap, arg); switch (SCARG(uap, cmd)) { case MODCTL_LOAD: error = copyin(arg, &ml, sizeof(ml)); if (error != 0) break; error = handle_modctl_load(ml.ml_filename, ml.ml_flags, ml.ml_props, ml.ml_propslen); break; case MODCTL_UNLOAD: error = copyinstr(arg, buf, sizeof(buf), NULL); if (error == 0) { error = module_unload(buf); } break; case MODCTL_STAT: error = copyin(arg, &iov, sizeof(iov)); if (error != 0) { break; } error = handle_modctl_stat(&iov, arg); break; case MODCTL_EXISTS: #ifndef MODULAR error = ENOSYS; #else loadtype = (uintptr_t)arg; switch (loadtype) { /* 0 = modload, 1 = autoload */ case 0: /* FALLTHROUGH */ case 1: error = kauth_authorize_system(kauth_cred_get(), KAUTH_SYSTEM_MODULE, 0, (void *)(uintptr_t)MODCTL_LOAD, (void *)loadtype, NULL); break; default: error = EINVAL; break; } #endif break; default: (void)module_autoload("compat_80", MODULE_CLASS_EXEC); MODULE_HOOK_CALL(compat_modstat_80_hook, (SCARG(uap, cmd), &iov, arg), enosys(), error); if (error == ENOSYS) error = EINVAL; break; } return error; }
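handle_modctl_stat() above defines a simple wire format: an int count, then the modstat_t array, then the concatenated "required" strings, with the total size written back through the caller's iovec even when the buffer held only part of the data. The sketch below is a hedged userland consumer of that protocol; the grow-and-retry loop is modelled on modstat(8) but is not a verbatim copy, and only ms_name is printed to keep it independent of the other field types.

#include <sys/module.h>
#include <sys/uio.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	struct iovec iov;
	size_t len;
	int i, count;
	modstat_t m;

	/* Start with a guess; retry with the size the kernel reports. */
	for (len = 8192;;) {
		iov.iov_base = malloc(len);
		if (iov.iov_base == NULL)
			err(1, "malloc");
		iov.iov_len = len;
		if (modctl(MODCTL_STAT, &iov) != 0)
			err(1, "modctl(MODCTL_STAT)");
		if (len >= iov.iov_len)
			break;
		len = iov.iov_len;
		free(iov.iov_base);
	}

	/* Wire format: int count, then count modstat_t records, then strings. */
	memcpy(&count, iov.iov_base, sizeof(count));
	for (i = 0; i < count; i++) {
		memcpy(&m, (const char *)iov.iov_base + sizeof(count) +
		    (size_t)i * sizeof(m), sizeof(m));
		printf("%s\n", m.ms_name);
	}

	free(iov.iov_base);
	return 0;
}

Because each copyout in handle_modctl_stat() is truncated with uimin() while the full size is still reported in iov_len, a caller that guessed too small sees len < iov.iov_len and can retry with a larger buffer.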
/* $NetBSD: scsipi_base.c,v 1.189 2022/04/09 23:38:32 riastradh Exp $ */ /*- * Copyright (c) 1998, 1999, 2000, 2002, 2003, 2004 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum; by Jason R. Thorpe of the Numerical Aerospace * Simulation Facility, NASA Ames Research Center.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: scsipi_base.c,v 1.189 2022/04/09 23:38:32 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_scsi.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/buf.h> #include <sys/uio.h> #include <sys/malloc.h> #include <sys/pool.h> #include <sys/errno.h> #include <sys/device.h> #include <sys/proc.h> #include <sys/kthread.h> #include <sys/hash.h> #include <sys/atomic.h> #include <dev/scsipi/scsi_sdt.h> #include <dev/scsipi/scsi_spc.h> #include <dev/scsipi/scsipi_all.h> #include <dev/scsipi/scsipi_disk.h> #include <dev/scsipi/scsipiconf.h> #include <dev/scsipi/scsipi_base.h> #include <dev/scsipi/scsi_all.h> #include <dev/scsipi/scsi_message.h> #include <machine/param.h> SDT_PROVIDER_DEFINE(scsi); SDT_PROBE_DEFINE3(scsi, base, tag, get, "struct scsipi_xfer *"/*xs*/, "uint8_t"/*tag*/, "uint8_t"/*type*/); SDT_PROBE_DEFINE3(scsi, base, tag, put, "struct scsipi_xfer *"/*xs*/, "uint8_t"/*tag*/, "uint8_t"/*type*/); SDT_PROBE_DEFINE3(scsi, base, adapter, request__start, "struct scsipi_channel *"/*chan*/, "scsipi_adapter_req_t"/*req*/, "void *"/*arg*/); SDT_PROBE_DEFINE3(scsi, base, adapter, request__done, "struct scsipi_channel *"/*chan*/, "scsipi_adapter_req_t"/*req*/, "void *"/*arg*/); SDT_PROBE_DEFINE1(scsi, base, queue, batch__start, "struct scsipi_channel *"/*chan*/); SDT_PROBE_DEFINE2(scsi, base, queue, run, "struct scsipi_channel *"/*chan*/, "struct scsipi_xfer *"/*xs*/); SDT_PROBE_DEFINE1(scsi, base, queue, batch__done, "struct scsipi_channel *"/*chan*/); SDT_PROBE_DEFINE1(scsi, base, xfer, execute, "struct scsipi_xfer *"/*xs*/); SDT_PROBE_DEFINE1(scsi, base, xfer, enqueue, "struct scsipi_xfer *"/*xs*/); SDT_PROBE_DEFINE1(scsi, base, xfer, done, "struct scsipi_xfer *"/*xs*/); SDT_PROBE_DEFINE1(scsi, base, xfer, redone, "struct scsipi_xfer *"/*xs*/); SDT_PROBE_DEFINE1(scsi, base, xfer, complete, "struct scsipi_xfer *"/*xs*/); SDT_PROBE_DEFINE1(scsi, base, xfer, restart, "struct scsipi_xfer *"/*xs*/); SDT_PROBE_DEFINE1(scsi, base, xfer, free, "struct scsipi_xfer *"/*xs*/); static int scsipi_complete(struct scsipi_xfer *); static void scsipi_request_sense(struct scsipi_xfer *); static int scsipi_enqueue(struct scsipi_xfer *); static void 
scsipi_run_queue(struct scsipi_channel *chan); static void scsipi_completion_thread(void *); static void scsipi_get_tag(struct scsipi_xfer *); static void scsipi_put_tag(struct scsipi_xfer *); static int scsipi_get_resource(struct scsipi_channel *); static void scsipi_put_resource(struct scsipi_channel *); static void scsipi_async_event_max_openings(struct scsipi_channel *, struct scsipi_max_openings *); static void scsipi_async_event_channel_reset(struct scsipi_channel *); static void scsipi_channel_freeze_locked(struct scsipi_channel *, int); static void scsipi_adapter_lock(struct scsipi_adapter *adapt); static void scsipi_adapter_unlock(struct scsipi_adapter *adapt); static void scsipi_update_timeouts(struct scsipi_xfer *xs); static struct pool scsipi_xfer_pool; int scsipi_xs_count = 0; /* * scsipi_init: * * Called when a scsibus or atapibus is attached to the system * to initialize shared data structures. */ void scsipi_init(void) { static int scsipi_init_done; if (scsipi_init_done) return; scsipi_init_done = 1; /* Initialize the scsipi_xfer pool. */ pool_init(&scsipi_xfer_pool, sizeof(struct scsipi_xfer), 0, 0, 0, "scxspl", NULL, IPL_BIO); pool_prime(&scsipi_xfer_pool, 1); scsipi_ioctl_init(); } /* * scsipi_channel_init: * * Initialize a scsipi_channel when it is attached. */ int scsipi_channel_init(struct scsipi_channel *chan) { struct scsipi_adapter *adapt = chan->chan_adapter; int i; /* Initialize shared data. */ scsipi_init(); /* Initialize the queues. */ TAILQ_INIT(&chan->chan_queue); TAILQ_INIT(&chan->chan_complete); for (i = 0; i < SCSIPI_CHAN_PERIPH_BUCKETS; i++) LIST_INIT(&chan->chan_periphtab[i]); /* * Create the asynchronous completion thread. */ if (kthread_create(PRI_NONE, 0, NULL, scsipi_completion_thread, chan, &chan->chan_thread, "%s", chan->chan_name)) { aprint_error_dev(adapt->adapt_dev, "unable to create completion thread for " "channel %d\n", chan->chan_channel); panic("scsipi_channel_init"); } return 0; } /* * scsipi_channel_shutdown: * * Shutdown a scsipi_channel. */ void scsipi_channel_shutdown(struct scsipi_channel *chan) { mutex_enter(chan_mtx(chan)); /* * Shut down the completion thread. */ chan->chan_tflags |= SCSIPI_CHANT_SHUTDOWN; cv_broadcast(chan_cv_complete(chan)); /* * Now wait for the thread to exit. */ while (chan->chan_thread != NULL) cv_wait(chan_cv_thread(chan), chan_mtx(chan)); mutex_exit(chan_mtx(chan)); } static uint32_t scsipi_chan_periph_hash(uint64_t t, uint64_t l) { uint32_t hash; hash = hash32_buf(&t, sizeof(t), HASH32_BUF_INIT); hash = hash32_buf(&l, sizeof(l), hash); return hash & SCSIPI_CHAN_PERIPH_HASHMASK; } /* * scsipi_insert_periph: * * Insert a periph into the channel. */ void scsipi_insert_periph(struct scsipi_channel *chan, struct scsipi_periph *periph) { uint32_t hash; hash = scsipi_chan_periph_hash(periph->periph_target, periph->periph_lun); mutex_enter(chan_mtx(chan)); LIST_INSERT_HEAD(&chan->chan_periphtab[hash], periph, periph_hash); mutex_exit(chan_mtx(chan)); } /* * scsipi_remove_periph: * * Remove a periph from the channel. */ void scsipi_remove_periph(struct scsipi_channel *chan, struct scsipi_periph *periph) { LIST_REMOVE(periph, periph_hash); } /* * scsipi_lookup_periph: * * Lookup a periph on the specified channel. 
*/ static struct scsipi_periph * scsipi_lookup_periph_internal(struct scsipi_channel *chan, int target, int lun, bool lock) { struct scsipi_periph *periph; uint32_t hash; if (target >= chan->chan_ntargets || lun >= chan->chan_nluns) return NULL; hash = scsipi_chan_periph_hash(target, lun); if (lock) mutex_enter(chan_mtx(chan)); LIST_FOREACH(periph, &chan->chan_periphtab[hash], periph_hash) { if (periph->periph_target == target && periph->periph_lun == lun) break; } if (lock) mutex_exit(chan_mtx(chan)); return periph; } struct scsipi_periph * scsipi_lookup_periph_locked(struct scsipi_channel *chan, int target, int lun) { return scsipi_lookup_periph_internal(chan, target, lun, false); } struct scsipi_periph * scsipi_lookup_periph(struct scsipi_channel *chan, int target, int lun) { return scsipi_lookup_periph_internal(chan, target, lun, true); } /* * scsipi_get_resource: * * Allocate a single xfer `resource' from the channel. * * NOTE: Must be called with channel lock held */ static int scsipi_get_resource(struct scsipi_channel *chan) { struct scsipi_adapter *adapt = chan->chan_adapter; if (chan->chan_flags & SCSIPI_CHAN_OPENINGS) { if (chan->chan_openings > 0) { chan->chan_openings--; return 1; } return 0; } if (adapt->adapt_openings > 0) { adapt->adapt_openings--; return 1; } return 0; } /* * scsipi_grow_resources: * * Attempt to grow resources for a channel. If this succeeds, * we allocate one for our caller. * * NOTE: Must be called with channel lock held */ static inline int scsipi_grow_resources(struct scsipi_channel *chan) { if (chan->chan_flags & SCSIPI_CHAN_CANGROW) { if ((chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) { mutex_exit(chan_mtx(chan)); scsipi_adapter_request(chan, ADAPTER_REQ_GROW_RESOURCES, NULL); mutex_enter(chan_mtx(chan)); return scsipi_get_resource(chan); } /* * ask the channel thread to do it. It'll have to thaw the * queue */ scsipi_channel_freeze_locked(chan, 1); chan->chan_tflags |= SCSIPI_CHANT_GROWRES; cv_broadcast(chan_cv_complete(chan)); return 0; } return 0; } /* * scsipi_put_resource: * * Free a single xfer `resource' to the channel. * * NOTE: Must be called with channel lock held */ static void scsipi_put_resource(struct scsipi_channel *chan) { struct scsipi_adapter *adapt = chan->chan_adapter; if (chan->chan_flags & SCSIPI_CHAN_OPENINGS) chan->chan_openings++; else adapt->adapt_openings++; } /* * scsipi_get_tag: * * Get a tag ID for the specified xfer. * * NOTE: Must be called with channel lock held */ static void scsipi_get_tag(struct scsipi_xfer *xs) { struct scsipi_periph *periph = xs->xs_periph; int bit, tag; u_int word; KASSERT(mutex_owned(chan_mtx(periph->periph_channel))); bit = 0; /* XXX gcc */ for (word = 0; word < PERIPH_NTAGWORDS; word++) { bit = ffs(periph->periph_freetags[word]); if (bit != 0) break; } #ifdef DIAGNOSTIC if (word == PERIPH_NTAGWORDS) { scsipi_printaddr(periph); printf("no free tags\n"); panic("scsipi_get_tag"); } #endif bit -= 1; periph->periph_freetags[word] &= ~(1U << bit); tag = (word << 5) | bit; /* XXX Should eventually disallow this completely. */ if (tag >= periph->periph_openings) { scsipi_printaddr(periph); printf("WARNING: tag %d greater than available openings %d\n", tag, periph->periph_openings); } xs->xs_tag_id = tag; SDT_PROBE3(scsi, base, tag, get, xs, xs->xs_tag_id, xs->xs_tag_type); } /* * scsipi_put_tag: * * Put the tag ID for the specified xfer back into the pool. 
* * NOTE: Must be called with channel lock held */ static void scsipi_put_tag(struct scsipi_xfer *xs) { struct scsipi_periph *periph = xs->xs_periph; int word, bit; KASSERT(mutex_owned(chan_mtx(periph->periph_channel))); SDT_PROBE3(scsi, base, tag, put, xs, xs->xs_tag_id, xs->xs_tag_type); word = xs->xs_tag_id >> 5; bit = xs->xs_tag_id & 0x1f; periph->periph_freetags[word] |= (1U << bit); } /* * scsipi_get_xs: * * Allocate an xfer descriptor and associate it with the * specified peripheral. If the peripheral has no more * available command openings, we either block waiting for * one to become available, or fail. * * When this routine is called with the channel lock held * the flags must include XS_CTL_NOSLEEP. */ struct scsipi_xfer * scsipi_get_xs(struct scsipi_periph *periph, int flags) { struct scsipi_xfer *xs; bool lock = (flags & XS_CTL_NOSLEEP) == 0; SC_DEBUG(periph, SCSIPI_DB3, ("scsipi_get_xs\n")); KASSERT(!cold); #ifdef DIAGNOSTIC /* * URGENT commands can never be ASYNC. */ if ((flags & (XS_CTL_URGENT|XS_CTL_ASYNC)) == (XS_CTL_URGENT|XS_CTL_ASYNC)) { scsipi_printaddr(periph); printf("URGENT and ASYNC\n"); panic("scsipi_get_xs"); } #endif /* * Wait for a command opening to become available. Rules: * * - All xfers must wait for an available opening. * Exception: URGENT xfers can proceed when * active == openings, because we use the opening * of the command we're recovering for. * - if the periph has sense pending, only URGENT & REQSENSE * xfers may proceed. * * - If the periph is recovering, only URGENT xfers may * proceed. * * - If the periph is currently executing a recovery * command, URGENT commands must block, because only * one recovery command can execute at a time. */ if (lock) mutex_enter(chan_mtx(periph->periph_channel)); for (;;) { if (flags & XS_CTL_URGENT) { if (periph->periph_active > periph->periph_openings) goto wait_for_opening; if (periph->periph_flags & PERIPH_SENSE) { if ((flags & XS_CTL_REQSENSE) == 0) goto wait_for_opening; } else { if ((periph->periph_flags & PERIPH_RECOVERY_ACTIVE) != 0) goto wait_for_opening; periph->periph_flags |= PERIPH_RECOVERY_ACTIVE; } break; } if (periph->periph_active >= periph->periph_openings || (periph->periph_flags & PERIPH_RECOVERING) != 0) goto wait_for_opening; periph->periph_active++; KASSERT(mutex_owned(chan_mtx(periph->periph_channel))); break; wait_for_opening: if (flags & XS_CTL_NOSLEEP) { KASSERT(!lock); return NULL; } KASSERT(lock); SC_DEBUG(periph, SCSIPI_DB3, ("sleeping\n")); periph->periph_flags |= PERIPH_WAITING; cv_wait(periph_cv_periph(periph), chan_mtx(periph->periph_channel)); } if (lock) mutex_exit(chan_mtx(periph->periph_channel)); SC_DEBUG(periph, SCSIPI_DB3, ("calling pool_get\n")); xs = pool_get(&scsipi_xfer_pool, ((flags & XS_CTL_NOSLEEP) != 0 ? PR_NOWAIT : PR_WAITOK)); if (xs == NULL) { if (lock) mutex_enter(chan_mtx(periph->periph_channel)); if (flags & XS_CTL_URGENT) { if ((flags & XS_CTL_REQSENSE) == 0) periph->periph_flags &= ~PERIPH_RECOVERY_ACTIVE; } else periph->periph_active--; if (lock) mutex_exit(chan_mtx(periph->periph_channel)); scsipi_printaddr(periph); printf("unable to allocate %sscsipi_xfer\n", (flags & XS_CTL_URGENT) ? 
"URGENT " : ""); } SC_DEBUG(periph, SCSIPI_DB3, ("returning\n")); if (xs != NULL) { memset(xs, 0, sizeof(*xs)); callout_init(&xs->xs_callout, 0); xs->xs_periph = periph; xs->xs_control = flags; xs->xs_status = 0; if ((flags & XS_CTL_NOSLEEP) == 0) mutex_enter(chan_mtx(periph->periph_channel)); TAILQ_INSERT_TAIL(&periph->periph_xferq, xs, device_q); KASSERT(mutex_owned(chan_mtx(periph->periph_channel))); if ((flags & XS_CTL_NOSLEEP) == 0) mutex_exit(chan_mtx(periph->periph_channel)); } return xs; } /* * scsipi_put_xs: * * Release an xfer descriptor, decreasing the outstanding command * count for the peripheral. If there is a thread waiting for * an opening, wake it up. If not, kick any queued I/O the * peripheral may have. * * NOTE: Must be called with channel lock held */ void scsipi_put_xs(struct scsipi_xfer *xs) { struct scsipi_periph *periph = xs->xs_periph; int flags = xs->xs_control; SDT_PROBE1(scsi, base, xfer, free, xs); SC_DEBUG(periph, SCSIPI_DB3, ("scsipi_free_xs\n")); KASSERT(mutex_owned(chan_mtx(periph->periph_channel))); TAILQ_REMOVE(&periph->periph_xferq, xs, device_q); callout_destroy(&xs->xs_callout); pool_put(&scsipi_xfer_pool, xs); #ifdef DIAGNOSTIC if ((periph->periph_flags & PERIPH_RECOVERY_ACTIVE) != 0 && periph->periph_active == 0) { scsipi_printaddr(periph); printf("recovery without a command to recovery for\n"); panic("scsipi_put_xs"); } #endif if (flags & XS_CTL_URGENT) { if ((flags & XS_CTL_REQSENSE) == 0) periph->periph_flags &= ~PERIPH_RECOVERY_ACTIVE; } else periph->periph_active--; if (periph->periph_active == 0 && (periph->periph_flags & PERIPH_WAITDRAIN) != 0) { periph->periph_flags &= ~PERIPH_WAITDRAIN; cv_broadcast(periph_cv_active(periph)); } if (periph->periph_flags & PERIPH_WAITING) { periph->periph_flags &= ~PERIPH_WAITING; cv_broadcast(periph_cv_periph(periph)); } else { if (periph->periph_switch->psw_start != NULL && device_is_active(periph->periph_dev)) { SC_DEBUG(periph, SCSIPI_DB2, ("calling private start()\n")); (*periph->periph_switch->psw_start)(periph); } } } /* * scsipi_channel_freeze: * * Freeze a channel's xfer queue. */ void scsipi_channel_freeze(struct scsipi_channel *chan, int count) { bool lock = chan_running(chan) > 0; if (lock) mutex_enter(chan_mtx(chan)); chan->chan_qfreeze += count; if (lock) mutex_exit(chan_mtx(chan)); } static void scsipi_channel_freeze_locked(struct scsipi_channel *chan, int count) { chan->chan_qfreeze += count; } /* * scsipi_channel_thaw: * * Thaw a channel's xfer queue. */ void scsipi_channel_thaw(struct scsipi_channel *chan, int count) { bool lock = chan_running(chan) > 0; if (lock) mutex_enter(chan_mtx(chan)); chan->chan_qfreeze -= count; /* * Don't let the freeze count go negative. * * Presumably the adapter driver could keep track of this, * but it might just be easier to do this here so as to allow * multiple callers, including those outside the adapter driver. */ if (chan->chan_qfreeze < 0) { chan->chan_qfreeze = 0; } if (lock) mutex_exit(chan_mtx(chan)); /* * until the channel is running */ if (!lock) return; /* * Kick the channel's queue here. Note, we may be running in * interrupt context (softclock or HBA's interrupt), so the adapter * driver had better not sleep. */ if (chan->chan_qfreeze == 0) scsipi_run_queue(chan); } /* * scsipi_channel_timed_thaw: * * Thaw a channel after some time has expired. This will also * run the channel's queue if the freeze count has reached 0. 
*/ void scsipi_channel_timed_thaw(void *arg) { struct scsipi_channel *chan = arg; scsipi_channel_thaw(chan, 1); } /* * scsipi_periph_freeze: * * Freeze a device's xfer queue. */ void scsipi_periph_freeze_locked(struct scsipi_periph *periph, int count) { periph->periph_qfreeze += count; } /* * scsipi_periph_thaw: * * Thaw a device's xfer queue. */ void scsipi_periph_thaw_locked(struct scsipi_periph *periph, int count) { periph->periph_qfreeze -= count; #ifdef DIAGNOSTIC if (periph->periph_qfreeze < 0) { static const char pc[] = "periph freeze count < 0"; scsipi_printaddr(periph); printf("%s\n", pc); panic(pc); } #endif if (periph->periph_qfreeze == 0 && (periph->periph_flags & PERIPH_WAITING) != 0) cv_broadcast(periph_cv_periph(periph)); } void scsipi_periph_freeze(struct scsipi_periph *periph, int count) { mutex_enter(chan_mtx(periph->periph_channel)); scsipi_periph_freeze_locked(periph, count); mutex_exit(chan_mtx(periph->periph_channel)); } void scsipi_periph_thaw(struct scsipi_periph *periph, int count) { mutex_enter(chan_mtx(periph->periph_channel)); scsipi_periph_thaw_locked(periph, count); mutex_exit(chan_mtx(periph->periph_channel)); } /* * scsipi_periph_timed_thaw: * * Thaw a device after some time has expired. */ void scsipi_periph_timed_thaw(void *arg) { struct scsipi_periph *periph = arg; struct scsipi_channel *chan = periph->periph_channel; callout_stop(&periph->periph_callout); mutex_enter(chan_mtx(chan)); scsipi_periph_thaw_locked(periph, 1); if ((periph->periph_channel->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) { /* * Kick the channel's queue here. Note, we're running in * interrupt context (softclock), so the adapter driver * had better not sleep. */ mutex_exit(chan_mtx(chan)); scsipi_run_queue(periph->periph_channel); } else { /* * Tell the completion thread to kick the channel's queue here. */ periph->periph_channel->chan_tflags |= SCSIPI_CHANT_KICK; cv_broadcast(chan_cv_complete(chan)); mutex_exit(chan_mtx(chan)); } } /* * scsipi_wait_drain: * * Wait for a periph's pending xfers to drain. */ void scsipi_wait_drain(struct scsipi_periph *periph) { struct scsipi_channel *chan = periph->periph_channel; mutex_enter(chan_mtx(chan)); while (periph->periph_active != 0) { periph->periph_flags |= PERIPH_WAITDRAIN; cv_wait(periph_cv_active(periph), chan_mtx(chan)); } mutex_exit(chan_mtx(chan)); } /* * scsipi_kill_pending: * * Kill off all pending xfers for a periph. * * NOTE: Must be called with channel lock held */ void scsipi_kill_pending(struct scsipi_periph *periph) { struct scsipi_channel *chan = periph->periph_channel; (*chan->chan_bustype->bustype_kill_pending)(periph); while (periph->periph_active != 0) { periph->periph_flags |= PERIPH_WAITDRAIN; cv_wait(periph_cv_active(periph), chan_mtx(chan)); } } /* * scsipi_print_cdb: * prints a command descriptor block (for debug purpose, error messages, * SCSIVERBOSE, ...) 
*/ void scsipi_print_cdb(struct scsipi_generic *cmd) { int i, j; printf("0x%02x", cmd->opcode); switch (CDB_GROUPID(cmd->opcode)) { case CDB_GROUPID_0: j = CDB_GROUP0; break; case CDB_GROUPID_1: j = CDB_GROUP1; break; case CDB_GROUPID_2: j = CDB_GROUP2; break; case CDB_GROUPID_3: j = CDB_GROUP3; break; case CDB_GROUPID_4: j = CDB_GROUP4; break; case CDB_GROUPID_5: j = CDB_GROUP5; break; case CDB_GROUPID_6: j = CDB_GROUP6; break; case CDB_GROUPID_7: j = CDB_GROUP7; break; default: j = 0; } if (j == 0) j = sizeof (cmd->bytes); for (i = 0; i < j-1; i++) /* already done the opcode */ printf(" %02x", cmd->bytes[i]); } /* * scsipi_interpret_sense: * * Look at the returned sense and act on the error, determining * the unix error number to pass back. (0 = report no error) * * NOTE: If we return ERESTART, we are expected to have * thawed the device! * * THIS IS THE DEFAULT ERROR HANDLER FOR SCSI DEVICES. */ int scsipi_interpret_sense(struct scsipi_xfer *xs) { struct scsi_sense_data *sense; struct scsipi_periph *periph = xs->xs_periph; u_int8_t key; int error; u_int32_t info; static const char *error_mes[] = { "soft error (corrected)", "not ready", "medium error", "non-media hardware failure", "illegal request", "unit attention", "readonly device", "no data found", "vendor unique", "copy aborted", "command aborted", "search returned equal", "volume overflow", "verify miscompare", "unknown error key" }; sense = &xs->sense.scsi_sense; #ifdef SCSIPI_DEBUG if (periph->periph_flags & SCSIPI_DB1) { int count, len; scsipi_printaddr(periph); printf(" sense debug information:\n"); printf("\tcode 0x%x valid %d\n", SSD_RCODE(sense->response_code), sense->response_code & SSD_RCODE_VALID ? 1 : 0); printf("\tseg 0x%x key 0x%x ili 0x%x eom 0x%x fmark 0x%x\n", sense->segment, SSD_SENSE_KEY(sense->flags), sense->flags & SSD_ILI ? 1 : 0, sense->flags & SSD_EOM ? 1 : 0, sense->flags & SSD_FILEMARK ? 1 : 0); printf("\ninfo: 0x%x 0x%x 0x%x 0x%x followed by %d " "extra bytes\n", sense->info[0], sense->info[1], sense->info[2], sense->info[3], sense->extra_len); len = SSD_ADD_BYTES_LIM(sense); printf("\textra (up to %d bytes): ", len); for (count = 0; count < len; count++) printf("0x%x ", sense->csi[count]); printf("\n"); } #endif /* * If the periph has its own error handler, call it first. * If it returns a legit error value, return that, otherwise * it wants us to continue with normal error processing. */ if (periph->periph_switch->psw_error != NULL) { SC_DEBUG(periph, SCSIPI_DB2, ("calling private err_handler()\n")); error = (*periph->periph_switch->psw_error)(xs); if (error != EJUSTRETURN) return error; } /* otherwise use the default */ switch (SSD_RCODE(sense->response_code)) { /* * Old SCSI-1 and SASI devices respond with * codes other than 70. */ case 0x00: /* no error (command completed OK) */ return 0; case 0x04: /* drive not ready after it was selected */ if ((periph->periph_flags & PERIPH_REMOVABLE) != 0) periph->periph_flags &= ~PERIPH_MEDIA_LOADED; if ((xs->xs_control & XS_CTL_IGNORE_NOT_READY) != 0) return 0; /* XXX - display some sort of error here? 
*/ return EIO; case 0x20: /* invalid command */ if ((xs->xs_control & XS_CTL_IGNORE_ILLEGAL_REQUEST) != 0) return 0; return EINVAL; case 0x25: /* invalid LUN (Adaptec ACB-4000) */ return EACCES; /* * If it's code 70, use the extended stuff and * interpret the key */ case 0x71: /* delayed error */ scsipi_printaddr(periph); key = SSD_SENSE_KEY(sense->flags); printf(" DEFERRED ERROR, key = 0x%x\n", key); /* FALLTHROUGH */ case 0x70: if ((sense->response_code & SSD_RCODE_VALID) != 0) info = _4btol(sense->info); else info = 0; key = SSD_SENSE_KEY(sense->flags); switch (key) { case SKEY_NO_SENSE: case SKEY_RECOVERED_ERROR: if (xs->resid == xs->datalen && xs->datalen) { /* * Why is this here? */ xs->resid = 0; /* not short read */ } error = 0; break; case SKEY_EQUAL: error = 0; break; case SKEY_NOT_READY: if ((periph->periph_flags & PERIPH_REMOVABLE) != 0) periph->periph_flags &= ~PERIPH_MEDIA_LOADED; if ((xs->xs_control & XS_CTL_IGNORE_NOT_READY) != 0) return 0; if (sense->asc == 0x3A) { error = ENODEV; /* Medium not present */ if (xs->xs_control & XS_CTL_SILENT_NODEV) return error; } else error = EIO; if ((xs->xs_control & XS_CTL_SILENT) != 0) return error; break; case SKEY_ILLEGAL_REQUEST: if ((xs->xs_control & XS_CTL_IGNORE_ILLEGAL_REQUEST) != 0) return 0; /* * Handle the case where a device reports * Logical Unit Not Supported during discovery. */ if ((xs->xs_control & XS_CTL_DISCOVERY) != 0 && sense->asc == 0x25 && sense->ascq == 0x00) return EINVAL; if ((xs->xs_control & XS_CTL_SILENT) != 0) return EIO; error = EINVAL; break; case SKEY_UNIT_ATTENTION: if (sense->asc == 0x29 && sense->ascq == 0x00) { /* device or bus reset */ return ERESTART; } if ((periph->periph_flags & PERIPH_REMOVABLE) != 0) periph->periph_flags &= ~PERIPH_MEDIA_LOADED; if ((xs->xs_control & XS_CTL_IGNORE_MEDIA_CHANGE) != 0 || /* XXX Should reupload any transient state. 
*/ (periph->periph_flags & PERIPH_REMOVABLE) == 0) { return ERESTART; } if ((xs->xs_control & XS_CTL_SILENT) != 0) return EIO; error = EIO; break; case SKEY_DATA_PROTECT: error = EROFS; break; case SKEY_BLANK_CHECK: error = 0; break; case SKEY_ABORTED_COMMAND: if (xs->xs_retries != 0) { xs->xs_retries--; error = ERESTART; } else error = EIO; break; case SKEY_VOLUME_OVERFLOW: error = ENOSPC; break; default: error = EIO; break; } /* Print verbose decode if appropriate and possible */ if ((key == 0) || ((xs->xs_control & XS_CTL_SILENT) != 0) || (scsipi_print_sense(xs, 0) != 0)) return error; /* Print brief(er) sense information */ scsipi_printaddr(periph); printf("%s", error_mes[key - 1]); if ((sense->response_code & SSD_RCODE_VALID) != 0) { switch (key) { case SKEY_NOT_READY: case SKEY_ILLEGAL_REQUEST: case SKEY_UNIT_ATTENTION: case SKEY_DATA_PROTECT: break; case SKEY_BLANK_CHECK: printf(", requested size: %d (decimal)", info); break; case SKEY_ABORTED_COMMAND: if (xs->xs_retries) printf(", retrying"); printf(", cmd 0x%x, info 0x%x", xs->cmd->opcode, info); break; default: printf(", info = %d (decimal)", info); } } if (sense->extra_len != 0) { int n; printf(", data ="); for (n = 0; n < sense->extra_len; n++) printf(" %02x", sense->csi[n]); } printf("\n"); return error; /* * Some other code, just report it */ default: #if defined(SCSIDEBUG) || defined(DEBUG) { static const char *uc = "undecodable sense error"; int i; u_int8_t *cptr = (u_int8_t *) sense; scsipi_printaddr(periph); if (xs->cmd == &xs->cmdstore) { printf("%s for opcode 0x%x, data=", uc, xs->cmdstore.opcode); } else { printf("%s, data=", uc); } for (i = 0; i < sizeof (sense); i++) printf(" 0x%02x", *(cptr++) & 0xff); printf("\n"); } #else scsipi_printaddr(periph); printf("Sense Error Code 0x%x", SSD_RCODE(sense->response_code)); if ((sense->response_code & SSD_RCODE_VALID) != 0) { struct scsi_sense_data_unextended *usense = (struct scsi_sense_data_unextended *)sense; printf(" at block no. %d (decimal)", _3btol(usense->block)); } printf("\n"); #endif return EIO; } } /* * scsipi_test_unit_ready: * * Issue a `test unit ready' request. */ int scsipi_test_unit_ready(struct scsipi_periph *periph, int flags) { struct scsi_test_unit_ready cmd; int retries; /* some ATAPI drives don't support TEST UNIT READY. Sigh */ if (periph->periph_quirks & PQUIRK_NOTUR) return 0; if (flags & XS_CTL_DISCOVERY) retries = 0; else retries = SCSIPIRETRIES; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SCSI_TEST_UNIT_READY; return scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0, retries, 10000, NULL, flags); } static const struct scsipi_inquiry3_pattern { const char vendor[8]; const char product[16]; const char revision[4]; } scsipi_inquiry3_quirk[] = { { "ES-6600 ", "", "" }, }; static int scsipi_inquiry3_ok(const struct scsipi_inquiry_data *ib) { for (size_t i = 0; i < __arraycount(scsipi_inquiry3_quirk); i++) { const struct scsipi_inquiry3_pattern *q = &scsipi_inquiry3_quirk[i]; #define MATCH(field) \ (q->field[0] ? memcmp(ib->field, q->field, sizeof(ib->field)) == 0 : 1) if (MATCH(vendor) && MATCH(product) && MATCH(revision)) return 0; } return 1; } /* * scsipi_inquire: * * Ask the device about itself. */ int scsipi_inquire(struct scsipi_periph *periph, struct scsipi_inquiry_data *inqbuf, int flags) { struct scsipi_inquiry cmd; int error; int retries; if (flags & XS_CTL_DISCOVERY) retries = 0; else retries = SCSIPIRETRIES; /* * If we request more data than the device can provide, it SHOULD just * return a short response. 
However, some devices error with an * ILLEGAL REQUEST sense code, and yet others have even more special * failure modes (such as the GL641USB flash adapter, which goes loony * and sends corrupted CRCs). To work around this, and to bring our * behavior more in line with other OSes, we do a shorter inquiry, * covering all the SCSI-2 information, first, and then request more * data iff the "additional length" field indicates there is more. * - mycroft, 2003/10/16 */ memset(&cmd, 0, sizeof(cmd)); cmd.opcode = INQUIRY; cmd.length = SCSIPI_INQUIRY_LENGTH_SCSI2; error = scsipi_command(periph, (void *)&cmd, sizeof(cmd), (void *)inqbuf, SCSIPI_INQUIRY_LENGTH_SCSI2, retries, 10000, NULL, flags | XS_CTL_DATA_IN); if (!error && inqbuf->additional_length > SCSIPI_INQUIRY_LENGTH_SCSI2 - 4) { if (scsipi_inquiry3_ok(inqbuf)) { #if 0 printf("inquire: addlen=%d, retrying\n", inqbuf->additional_length); #endif cmd.length = SCSIPI_INQUIRY_LENGTH_SCSI3; error = scsipi_command(periph, (void *)&cmd, sizeof(cmd), (void *)inqbuf, SCSIPI_INQUIRY_LENGTH_SCSI3, retries, 10000, NULL, flags | XS_CTL_DATA_IN); #if 0 printf("inquire: error=%d\n", error); #endif } } #ifdef SCSI_OLD_NOINQUIRY /* * Kludge for the Adaptec ACB-4000 SCSI->MFM translator. * This board doesn't support the INQUIRY command at all. */ if (error == EINVAL || error == EACCES) { /* * Conjure up an INQUIRY response. */ inqbuf->device = (error == EINVAL ? SID_QUAL_LU_PRESENT : SID_QUAL_LU_NOTPRESENT) | T_DIRECT; inqbuf->dev_qual2 = 0; inqbuf->version = 0; inqbuf->response_format = SID_FORMAT_SCSI1; inqbuf->additional_length = SCSIPI_INQUIRY_LENGTH_SCSI2 - 4; inqbuf->flags1 = inqbuf->flags2 = inqbuf->flags3 = 0; memcpy(inqbuf->vendor, "ADAPTEC ACB-4000 ", 28); error = 0; } /* * Kludge for the Emulex MT-02 SCSI->QIC translator. * This board gives an empty response to an INQUIRY command. */ else if (error == 0 && inqbuf->device == (SID_QUAL_LU_PRESENT | T_DIRECT) && inqbuf->dev_qual2 == 0 && inqbuf->version == 0 && inqbuf->response_format == SID_FORMAT_SCSI1) { /* * Fill out the INQUIRY response. */ inqbuf->device = (SID_QUAL_LU_PRESENT | T_SEQUENTIAL); inqbuf->dev_qual2 = SID_REMOVABLE; inqbuf->additional_length = SCSIPI_INQUIRY_LENGTH_SCSI2 - 4; inqbuf->flags1 = inqbuf->flags2 = inqbuf->flags3 = 0; memcpy(inqbuf->vendor, "EMULEX MT-02 QIC ", 28); } #endif /* SCSI_OLD_NOINQUIRY */ return error; } /* * scsipi_prevent: * * Prevent or allow the user to remove the media */ int scsipi_prevent(struct scsipi_periph *periph, int type, int flags) { struct scsi_prevent_allow_medium_removal cmd; if (periph->periph_quirks & PQUIRK_NODOORLOCK) return 0; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SCSI_PREVENT_ALLOW_MEDIUM_REMOVAL; cmd.how = type; return (scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0, SCSIPIRETRIES, 5000, NULL, flags)); } /* * scsipi_start: * * Send a START UNIT. */ int scsipi_start(struct scsipi_periph *periph, int type, int flags) { struct scsipi_start_stop cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = START_STOP; cmd.byte2 = 0x00; cmd.how = type; return scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0, SCSIPIRETRIES, (type & SSS_START) ? 
60000 : 10000, NULL, flags); } /* * scsipi_mode_sense, scsipi_mode_sense_big: * get a sense page from a device */ int scsipi_mode_sense(struct scsipi_periph *periph, int byte2, int page, struct scsi_mode_parameter_header_6 *data, int len, int flags, int retries, int timeout) { struct scsi_mode_sense_6 cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SCSI_MODE_SENSE_6; cmd.byte2 = byte2; cmd.page = page; cmd.length = len & 0xff; return scsipi_command(periph, (void *)&cmd, sizeof(cmd), (void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_IN); } int scsipi_mode_sense_big(struct scsipi_periph *periph, int byte2, int page, struct scsi_mode_parameter_header_10 *data, int len, int flags, int retries, int timeout) { struct scsi_mode_sense_10 cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SCSI_MODE_SENSE_10; cmd.byte2 = byte2; cmd.page = page; _lto2b(len, cmd.length); return scsipi_command(periph, (void *)&cmd, sizeof(cmd), (void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_IN); } int scsipi_mode_select(struct scsipi_periph *periph, int byte2, struct scsi_mode_parameter_header_6 *data, int len, int flags, int retries, int timeout) { struct scsi_mode_select_6 cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SCSI_MODE_SELECT_6; cmd.byte2 = byte2; cmd.length = len & 0xff; return scsipi_command(periph, (void *)&cmd, sizeof(cmd), (void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_OUT); } int scsipi_mode_select_big(struct scsipi_periph *periph, int byte2, struct scsi_mode_parameter_header_10 *data, int len, int flags, int retries, int timeout) { struct scsi_mode_select_10 cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SCSI_MODE_SELECT_10; cmd.byte2 = byte2; _lto2b(len, cmd.length); return scsipi_command(periph, (void *)&cmd, sizeof(cmd), (void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_OUT); } /* * scsipi_get_opcodeinfo: * * query the device for supported commands and their timeout * building a timeout lookup table if timeout information is available. */ void scsipi_get_opcodeinfo(struct scsipi_periph *periph) { u_int8_t *data; int len = 16*1024; int rc; struct scsi_repsuppopcode cmd; /* refrain from asking for supported opcodes */ if (periph->periph_quirks & PQUIRK_NOREPSUPPOPC || periph->periph_type == T_PROCESSOR || /* spec. */ periph->periph_type == T_CDROM) /* spec. */ return; scsipi_free_opcodeinfo(periph); /* * query REPORT SUPPORTED OPERATION CODES * if OK * enumerate all codes * if timeout exists insert maximum into opcode table */ data = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO); memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SCSI_MAINTENANCE_IN; cmd.svcaction = RSOC_REPORT_SUPPORTED_OPCODES; cmd.repoption = RSOC_RCTD|RSOC_ALL; _lto4b(len, cmd.alloclen); rc = scsipi_command(periph, (void *)&cmd, sizeof(cmd), (void *)data, len, 0, 1000, NULL, XS_CTL_DATA_IN|XS_CTL_SILENT); if (rc == 0) { int count; int dlen = _4btol(data); u_int8_t *c = data + 4; SC_DEBUG(periph, SCSIPI_DB3, ("supported opcode timeout-values loaded\n")); SC_DEBUG(periph, SCSIPI_DB3, ("CMD LEN SA spec nom. 
time cmd timeout\n")); struct scsipi_opcodes *tot = malloc(sizeof(struct scsipi_opcodes), M_DEVBUF, M_WAITOK|M_ZERO); count = 0; while (tot != NULL && dlen >= (int)sizeof(struct scsi_repsupopcode_all_commands_descriptor)) { struct scsi_repsupopcode_all_commands_descriptor *acd = (struct scsi_repsupopcode_all_commands_descriptor *)c; #ifdef SCSIPI_DEBUG int cdblen = _2btol((const u_int8_t *)&acd->cdblen); #endif dlen -= sizeof(struct scsi_repsupopcode_all_commands_descriptor); c += sizeof(struct scsi_repsupopcode_all_commands_descriptor); SC_DEBUG(periph, SCSIPI_DB3, ("0x%02x(%2d) ", acd->opcode, cdblen)); tot->opcode_info[acd->opcode].ti_flags = SCSIPI_TI_VALID; if (acd->flags & RSOC_ACD_SERVACTV) { SC_DEBUGN(periph, SCSIPI_DB3, ("0x%02x%02x ", acd->serviceaction[0], acd->serviceaction[1])); } else { SC_DEBUGN(periph, SCSIPI_DB3, (" ")); } if (acd->flags & RSOC_ACD_CTDP && dlen >= (int)sizeof(struct scsi_repsupopcode_timeouts_descriptor)) { struct scsi_repsupopcode_timeouts_descriptor *td = (struct scsi_repsupopcode_timeouts_descriptor *)c; long nomto = _4btol(td->nom_process_timeout); long cmdto = _4btol(td->cmd_process_timeout); long t = (cmdto > nomto) ? cmdto : nomto; dlen -= sizeof(struct scsi_repsupopcode_timeouts_descriptor); c += sizeof(struct scsi_repsupopcode_timeouts_descriptor); SC_DEBUGN(periph, SCSIPI_DB3, ("0x%02x %10ld %10ld", td->cmd_specific, nomto, cmdto)); if (t > tot->opcode_info[acd->opcode].ti_timeout) { tot->opcode_info[acd->opcode].ti_timeout = t; ++count; } } SC_DEBUGN(periph, SCSIPI_DB3,("\n")); } if (count > 0) { periph->periph_opcs = tot; } else { free(tot, M_DEVBUF); SC_DEBUG(periph, SCSIPI_DB3, ("no usable timeout values available\n")); } } else { SC_DEBUG(periph, SCSIPI_DB3, ("SCSI_MAINTENANCE_IN" "[RSOC_REPORT_SUPPORTED_OPCODES] failed error=%d" " - no device provided timeout " "values available\n", rc)); } free(data, M_DEVBUF); } /* * scsipi_update_timeouts: * Override timeout value if device/config provided * timeouts are available. */ static void scsipi_update_timeouts(struct scsipi_xfer *xs) { struct scsipi_opcodes *opcs; u_int8_t cmd; int timeout; struct scsipi_opinfo *oi; if (xs->timeout <= 0) { return; } opcs = xs->xs_periph->periph_opcs; if (opcs == NULL) { return; } cmd = xs->cmd->opcode; oi = &opcs->opcode_info[cmd]; timeout = 1000 * (int)oi->ti_timeout; if (timeout > xs->timeout && timeout < 86400000) { /* * pick up device configured timeouts if they * are longer than the requested ones but less * than a day */ #ifdef SCSIPI_DEBUG if ((oi->ti_flags & SCSIPI_TI_LOGGED) == 0) { SC_DEBUG(xs->xs_periph, SCSIPI_DB3, ("Overriding command 0x%02x " "timeout of %d with %d ms\n", cmd, xs->timeout, timeout)); oi->ti_flags |= SCSIPI_TI_LOGGED; } #endif xs->timeout = timeout; } } /* * scsipi_free_opcodeinfo: * * free the opcode information table */ void scsipi_free_opcodeinfo(struct scsipi_periph *periph) { if (periph->periph_opcs != NULL) { free(periph->periph_opcs, M_DEVBUF); } periph->periph_opcs = NULL; } /* * scsipi_done: * * This routine is called by an adapter's interrupt handler when * an xfer is completed. */ void scsipi_done(struct scsipi_xfer *xs) { struct scsipi_periph *periph = xs->xs_periph; struct scsipi_channel *chan = periph->periph_channel; int freezecnt; SC_DEBUG(periph, SCSIPI_DB2, ("scsipi_done\n")); #ifdef SCSIPI_DEBUG if (periph->periph_dbflags & SCSIPI_DB1) show_scsipi_cmd(xs); #endif mutex_enter(chan_mtx(chan)); SDT_PROBE1(scsi, base, xfer, done, xs); /* * The resource this command was using is now free. 
*/ if (xs->xs_status & XS_STS_DONE) { /* XXX in certain circumstances, such as a device * being detached, a xs that has already been * scsipi_done()'d by the main thread will be done'd * again by scsibusdetach(). Putting the xs on the * chan_complete queue causes list corruption and * everyone dies. This prevents that, but perhaps * there should be better coordination somewhere such * that this won't ever happen (and can be turned into * a KASSERT(). */ SDT_PROBE1(scsi, base, xfer, redone, xs); mutex_exit(chan_mtx(chan)); goto out; } scsipi_put_resource(chan); xs->xs_periph->periph_sent--; /* * If the command was tagged, free the tag. */ if (XS_CTL_TAGTYPE(xs) != 0) scsipi_put_tag(xs); else periph->periph_flags &= ~PERIPH_UNTAG; /* Mark the command as `done'. */ xs->xs_status |= XS_STS_DONE; #ifdef DIAGNOSTIC if ((xs->xs_control & (XS_CTL_ASYNC|XS_CTL_POLL)) == (XS_CTL_ASYNC|XS_CTL_POLL)) panic("scsipi_done: ASYNC and POLL"); #endif /* * If the xfer had an error of any sort, freeze the * periph's queue. Freeze it again if we were requested * to do so in the xfer. */ freezecnt = 0; if (xs->error != XS_NOERROR) freezecnt++; if (xs->xs_control & XS_CTL_FREEZE_PERIPH) freezecnt++; if (freezecnt != 0) scsipi_periph_freeze_locked(periph, freezecnt); /* * record the xfer with a pending sense, in case a SCSI reset is * received before the thread is waked up. */ if (xs->error == XS_BUSY && xs->status == SCSI_CHECK) { periph->periph_flags |= PERIPH_SENSE; periph->periph_xscheck = xs; } /* * If this was an xfer that was not to complete asynchronously, * let the requesting thread perform error checking/handling * in its context. */ if ((xs->xs_control & XS_CTL_ASYNC) == 0) { /* * If it's a polling job, just return, to unwind the * call graph. We don't need to restart the queue, * because polling jobs are treated specially, and * are really only used during crash dumps anyway * (XXX or during boot-time autoconfiguration of * ATAPI devices). */ if (xs->xs_control & XS_CTL_POLL) { mutex_exit(chan_mtx(chan)); return; } cv_broadcast(xs_cv(xs)); mutex_exit(chan_mtx(chan)); goto out; } /* * Catch the extremely common case of I/O completing * without error; no use in taking a context switch * if we can handle it in interrupt context. */ if (xs->error == XS_NOERROR) { mutex_exit(chan_mtx(chan)); (void) scsipi_complete(xs); goto out; } /* * There is an error on this xfer. Put it on the channel's * completion queue, and wake up the completion thread. */ TAILQ_INSERT_TAIL(&chan->chan_complete, xs, channel_q); cv_broadcast(chan_cv_complete(chan)); mutex_exit(chan_mtx(chan)); out: /* * If there are more xfers on the channel's queue, attempt to * run them. */ scsipi_run_queue(chan); } /* * scsipi_complete: * * Completion of a scsipi_xfer. This is the guts of scsipi_done(). * * NOTE: This routine MUST be called with valid thread context * except for the case where the following two conditions are * true: * * xs->error == XS_NOERROR * XS_CTL_ASYNC is set in xs->xs_control * * The semantics of this routine can be tricky, so here is an * explanation: * * 0 Xfer completed successfully. * * ERESTART Xfer had an error, but was restarted. * * anything else Xfer had an error, return value is Unix * errno. * * If the return value is anything but ERESTART: * * - If XS_CTL_ASYNC is set, `xs' has been freed back to * the pool. * - If there is a buf associated with the xfer, * it has been biodone()'d. 
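 *
 *	Illustrative caller pattern (a sketch; it mirrors what
 *	scsipi_execute_xs() does further down for synchronous xfers):
 *
 *		error = scsipi_complete(xs);
 *		if (error == ERESTART)
 *			goto restarted;
 *
 *	that is, ERESTART means the xfer was re-enqueued and must not
 *	be treated as a final result by the caller.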
*/ static int scsipi_complete(struct scsipi_xfer *xs) { struct scsipi_periph *periph = xs->xs_periph; struct scsipi_channel *chan = periph->periph_channel; int error; SDT_PROBE1(scsi, base, xfer, complete, xs); #ifdef DIAGNOSTIC if ((xs->xs_control & XS_CTL_ASYNC) != 0 && xs->bp == NULL) panic("scsipi_complete: XS_CTL_ASYNC but no buf"); #endif /* * If command terminated with a CHECK CONDITION, we need to issue a * REQUEST_SENSE command. Once the REQUEST_SENSE has been processed * we'll have the real status. * Must be processed with channel lock held to avoid missing * a SCSI bus reset for this command. */ mutex_enter(chan_mtx(chan)); if (xs->error == XS_BUSY && xs->status == SCSI_CHECK) { /* request sense for a request sense ? */ if (xs->xs_control & XS_CTL_REQSENSE) { scsipi_printaddr(periph); printf("request sense for a request sense ?\n"); /* XXX maybe we should reset the device ? */ /* we've been frozen because xs->error != XS_NOERROR */ scsipi_periph_thaw_locked(periph, 1); mutex_exit(chan_mtx(chan)); if (xs->resid < xs->datalen) { printf("we read %d bytes of sense anyway:\n", xs->datalen - xs->resid); scsipi_print_sense_data((void *)xs->data, 0); } return EINVAL; } mutex_exit(chan_mtx(chan)); // XXX allows other commands to queue or run scsipi_request_sense(xs); } else mutex_exit(chan_mtx(chan)); /* * If it's a user level request, bypass all usual completion * processing, let the user work it out.. */ if ((xs->xs_control & XS_CTL_USERCMD) != 0) { SC_DEBUG(periph, SCSIPI_DB3, ("calling user done()\n")); mutex_enter(chan_mtx(chan)); if (xs->error != XS_NOERROR) scsipi_periph_thaw_locked(periph, 1); mutex_exit(chan_mtx(chan)); scsipi_user_done(xs); SC_DEBUG(periph, SCSIPI_DB3, ("returned from user done()\n ")); return 0; } switch (xs->error) { case XS_NOERROR: error = 0; break; case XS_SENSE: case XS_SHORTSENSE: error = (*chan->chan_bustype->bustype_interpret_sense)(xs); break; case XS_RESOURCE_SHORTAGE: /* * XXX Should freeze channel's queue. */ scsipi_printaddr(periph); printf("adapter resource shortage\n"); /* FALLTHROUGH */ case XS_BUSY: if (xs->error == XS_BUSY && xs->status == SCSI_QUEUE_FULL) { struct scsipi_max_openings mo; /* * We set the openings to active - 1, assuming that * the command that got us here is the first one that * can't fit into the device's queue. If that's not * the case, I guess we'll find out soon enough. */ mo.mo_target = periph->periph_target; mo.mo_lun = periph->periph_lun; if (periph->periph_active < periph->periph_openings) mo.mo_openings = periph->periph_active - 1; else mo.mo_openings = periph->periph_openings - 1; #ifdef DIAGNOSTIC if (mo.mo_openings < 0) { scsipi_printaddr(periph); printf("QUEUE FULL resulted in < 0 openings\n"); panic("scsipi_done"); } #endif if (mo.mo_openings == 0) { scsipi_printaddr(periph); printf("QUEUE FULL resulted in 0 openings\n"); mo.mo_openings = 1; } scsipi_async_event(chan, ASYNC_EVENT_MAX_OPENINGS, &mo); error = ERESTART; } else if (xs->xs_retries != 0) { xs->xs_retries--; /* * Wait one second, and try again. 
*/ mutex_enter(chan_mtx(chan)); if ((xs->xs_control & XS_CTL_POLL) || (chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) { /* XXX: quite extreme */ kpause("xsbusy", false, hz, chan_mtx(chan)); } else if (!callout_pending(&periph->periph_callout)) { scsipi_periph_freeze_locked(periph, 1); callout_reset(&periph->periph_callout, hz, scsipi_periph_timed_thaw, periph); } mutex_exit(chan_mtx(chan)); error = ERESTART; } else error = EBUSY; break; case XS_REQUEUE: error = ERESTART; break; case XS_SELTIMEOUT: case XS_TIMEOUT: /* * If the device hasn't gone away, honor retry counts. * * Note that if we're in the middle of probing it, * it won't be found because it isn't here yet so * we won't honor the retry count in that case. */ if (scsipi_lookup_periph(chan, periph->periph_target, periph->periph_lun) && xs->xs_retries != 0) { xs->xs_retries--; error = ERESTART; } else error = EIO; break; case XS_RESET: if (xs->xs_control & XS_CTL_REQSENSE) { /* * request sense interrupted by reset: signal it * with EINTR return code. */ error = EINTR; } else { if (xs->xs_retries != 0) { xs->xs_retries--; error = ERESTART; } else error = EIO; } break; case XS_DRIVER_STUFFUP: scsipi_printaddr(periph); printf("generic HBA error\n"); error = EIO; break; default: scsipi_printaddr(periph); printf("invalid return code from adapter: %d\n", xs->error); error = EIO; break; } mutex_enter(chan_mtx(chan)); if (error == ERESTART) { SDT_PROBE1(scsi, base, xfer, restart, xs); /* * If we get here, the periph has been thawed and frozen * again if we had to issue recovery commands. Alternatively, * it may have been frozen again and in a timed thaw. In * any case, we thaw the periph once we re-enqueue the * command. Once the periph is fully thawed, it will begin * operation again. */ xs->error = XS_NOERROR; xs->status = SCSI_OK; xs->xs_status &= ~XS_STS_DONE; xs->xs_requeuecnt++; error = scsipi_enqueue(xs); if (error == 0) { scsipi_periph_thaw_locked(periph, 1); mutex_exit(chan_mtx(chan)); return ERESTART; } } /* * scsipi_done() freezes the queue if not XS_NOERROR. * Thaw it here. */ if (xs->error != XS_NOERROR) scsipi_periph_thaw_locked(periph, 1); mutex_exit(chan_mtx(chan)); if (periph->periph_switch->psw_done) periph->periph_switch->psw_done(xs, error); mutex_enter(chan_mtx(chan)); if (xs->xs_control & XS_CTL_ASYNC) scsipi_put_xs(xs); mutex_exit(chan_mtx(chan)); return error; } /* * Issue a request sense for the given scsipi_xfer. Called when the xfer * returns with a CHECK_CONDITION status. Must be called in valid thread * context. */ static void scsipi_request_sense(struct scsipi_xfer *xs) { struct scsipi_periph *periph = xs->xs_periph; int flags, error; struct scsi_request_sense cmd; periph->periph_flags |= PERIPH_SENSE; /* if command was polling, request sense will too */ flags = xs->xs_control & XS_CTL_POLL; /* Polling commands can't sleep */ if (flags) flags |= XS_CTL_NOSLEEP; flags |= XS_CTL_REQSENSE | XS_CTL_URGENT | XS_CTL_DATA_IN | XS_CTL_THAW_PERIPH | XS_CTL_FREEZE_PERIPH; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SCSI_REQUEST_SENSE; cmd.length = sizeof(struct scsi_sense_data); error = scsipi_command(periph, (void *)&cmd, sizeof(cmd), (void *)&xs->sense.scsi_sense, sizeof(struct scsi_sense_data), 0, 1000, NULL, flags); periph->periph_flags &= ~PERIPH_SENSE; periph->periph_xscheck = NULL; switch (error) { case 0: /* we have a valid sense */ xs->error = XS_SENSE; return; case EINTR: /* REQUEST_SENSE interrupted by bus reset. 
*/ xs->error = XS_RESET; return; case EIO: /* request sense couldn't be performed */ /* * XXX this isn't quite right but we don't have anything * better for now */ xs->error = XS_DRIVER_STUFFUP; return; default: /* Notify that request sense failed. */ xs->error = XS_DRIVER_STUFFUP; scsipi_printaddr(periph); printf("request sense failed with error %d\n", error); return; } } /* * scsipi_enqueue: * * Enqueue an xfer on a channel. */ static int scsipi_enqueue(struct scsipi_xfer *xs) { struct scsipi_channel *chan = xs->xs_periph->periph_channel; struct scsipi_xfer *qxs; SDT_PROBE1(scsi, base, xfer, enqueue, xs); /* * If the xfer is to be polled, and there are already jobs on * the queue, we can't proceed. */ KASSERT(mutex_owned(chan_mtx(chan))); if ((xs->xs_control & XS_CTL_POLL) != 0 && TAILQ_FIRST(&chan->chan_queue) != NULL) { xs->error = XS_DRIVER_STUFFUP; return EAGAIN; } /* * If we have an URGENT xfer, it's an error recovery command * and it should just go on the head of the channel's queue. */ if (xs->xs_control & XS_CTL_URGENT) { TAILQ_INSERT_HEAD(&chan->chan_queue, xs, channel_q); goto out; } /* * If this xfer has already been on the queue before, we * need to reinsert it in the correct order. That order is: * * Immediately before the first xfer for this periph * with a requeuecnt less than xs->xs_requeuecnt. * * Failing that, at the end of the queue. (We'll end up * there naturally.) */ if (xs->xs_requeuecnt != 0) { for (qxs = TAILQ_FIRST(&chan->chan_queue); qxs != NULL; qxs = TAILQ_NEXT(qxs, channel_q)) { if (qxs->xs_periph == xs->xs_periph && qxs->xs_requeuecnt < xs->xs_requeuecnt) break; } if (qxs != NULL) { TAILQ_INSERT_AFTER(&chan->chan_queue, qxs, xs, channel_q); goto out; } } TAILQ_INSERT_TAIL(&chan->chan_queue, xs, channel_q); out: if (xs->xs_control & XS_CTL_THAW_PERIPH) scsipi_periph_thaw_locked(xs->xs_periph, 1); return 0; } /* * scsipi_run_queue: * * Start as many xfers as possible running on the channel. */ static void scsipi_run_queue(struct scsipi_channel *chan) { struct scsipi_xfer *xs; struct scsipi_periph *periph; SDT_PROBE1(scsi, base, queue, batch__start, chan); for (;;) { mutex_enter(chan_mtx(chan)); /* * If the channel is frozen, we can't do any work right * now. */ if (chan->chan_qfreeze != 0) { mutex_exit(chan_mtx(chan)); break; } /* * Look for work to do, and make sure we can do it. */ for (xs = TAILQ_FIRST(&chan->chan_queue); xs != NULL; xs = TAILQ_NEXT(xs, channel_q)) { periph = xs->xs_periph; if ((periph->periph_sent >= periph->periph_openings) || periph->periph_qfreeze != 0 || (periph->periph_flags & PERIPH_UNTAG) != 0) continue; if ((periph->periph_flags & (PERIPH_RECOVERING | PERIPH_SENSE)) != 0 && (xs->xs_control & XS_CTL_URGENT) == 0) continue; /* * We can issue this xfer! */ goto got_one; } /* * Can't find any work to do right now. */ mutex_exit(chan_mtx(chan)); break; got_one: /* * Have an xfer to run. Allocate a resource from * the adapter to run it. If we can't allocate that * resource, we don't dequeue the xfer. */ if (scsipi_get_resource(chan) == 0) { /* * Adapter is out of resources. If the adapter * supports it, attempt to grow them. */ if (scsipi_grow_resources(chan) == 0) { /* * Wasn't able to grow resources, * nothing more we can do. */ if (xs->xs_control & XS_CTL_POLL) { scsipi_printaddr(xs->xs_periph); printf("polling command but no " "adapter resources"); /* We'll panic shortly... */ } mutex_exit(chan_mtx(chan)); /* * XXX: We should be able to note that * XXX: that resources are needed here! 
*/ break; } /* * scsipi_grow_resources() allocated the resource * for us. */ } /* * We have a resource to run this xfer, do it! */ TAILQ_REMOVE(&chan->chan_queue, xs, channel_q); /* * If the command is to be tagged, allocate a tag ID * for it. */ if (XS_CTL_TAGTYPE(xs) != 0) scsipi_get_tag(xs); else periph->periph_flags |= PERIPH_UNTAG; periph->periph_sent++; mutex_exit(chan_mtx(chan)); SDT_PROBE2(scsi, base, queue, run, chan, xs); scsipi_adapter_request(chan, ADAPTER_REQ_RUN_XFER, xs); } SDT_PROBE1(scsi, base, queue, batch__done, chan); } /* * scsipi_execute_xs: * * Begin execution of an xfer, waiting for it to complete, if necessary. */ int scsipi_execute_xs(struct scsipi_xfer *xs) { struct scsipi_periph *periph = xs->xs_periph; struct scsipi_channel *chan = periph->periph_channel; int oasync, async, poll, error; KASSERT(!cold); scsipi_update_timeouts(xs); (chan->chan_bustype->bustype_cmd)(xs); xs->xs_status &= ~XS_STS_DONE; xs->error = XS_NOERROR; xs->resid = xs->datalen; xs->status = SCSI_OK; SDT_PROBE1(scsi, base, xfer, execute, xs); #ifdef SCSIPI_DEBUG if (xs->xs_periph->periph_dbflags & SCSIPI_DB3) { printf("scsipi_execute_xs: "); show_scsipi_xs(xs); printf("\n"); } #endif /* * Deal with command tagging: * * - If the device's current operating mode doesn't * include tagged queueing, clear the tag mask. * * - If the device's current operating mode *does* * include tagged queueing, set the tag_type in * the xfer to the appropriate byte for the tag * message. */ if ((PERIPH_XFER_MODE(periph) & PERIPH_CAP_TQING) == 0 || (xs->xs_control & XS_CTL_REQSENSE)) { xs->xs_control &= ~XS_CTL_TAGMASK; xs->xs_tag_type = 0; } else { /* * If the request doesn't specify a tag, give Head * tags to URGENT operations and Simple tags to * everything else. */ if (XS_CTL_TAGTYPE(xs) == 0) { if (xs->xs_control & XS_CTL_URGENT) xs->xs_control |= XS_CTL_HEAD_TAG; else xs->xs_control |= XS_CTL_SIMPLE_TAG; } switch (XS_CTL_TAGTYPE(xs)) { case XS_CTL_ORDERED_TAG: xs->xs_tag_type = MSG_ORDERED_Q_TAG; break; case XS_CTL_SIMPLE_TAG: xs->xs_tag_type = MSG_SIMPLE_Q_TAG; break; case XS_CTL_HEAD_TAG: xs->xs_tag_type = MSG_HEAD_OF_Q_TAG; break; default: scsipi_printaddr(periph); printf("invalid tag mask 0x%08x\n", XS_CTL_TAGTYPE(xs)); panic("scsipi_execute_xs"); } } /* If the adapter wants us to poll, poll. */ if (chan->chan_adapter->adapt_flags & SCSIPI_ADAPT_POLL_ONLY) xs->xs_control |= XS_CTL_POLL; /* * If we don't yet have a completion thread, or we are to poll for * completion, clear the ASYNC flag. */ oasync = (xs->xs_control & XS_CTL_ASYNC); if (chan->chan_thread == NULL || (xs->xs_control & XS_CTL_POLL) != 0) xs->xs_control &= ~XS_CTL_ASYNC; async = (xs->xs_control & XS_CTL_ASYNC); poll = (xs->xs_control & XS_CTL_POLL); #ifdef DIAGNOSTIC if (oasync != 0 && xs->bp == NULL) panic("scsipi_execute_xs: XS_CTL_ASYNC but no buf"); #endif /* * Enqueue the transfer. If we're not polling for completion, this * should ALWAYS return `no error'. */ error = scsipi_enqueue(xs); if (error) { if (poll == 0) { scsipi_printaddr(periph); printf("not polling, but enqueue failed with %d\n", error); panic("scsipi_execute_xs"); } scsipi_printaddr(periph); printf("should have flushed queue?\n"); goto free_xs; } mutex_exit(chan_mtx(chan)); restarted: scsipi_run_queue(chan); mutex_enter(chan_mtx(chan)); /* * The xfer is enqueued, and possibly running. If it's to be * completed asynchronously, just return now. */ if (async) return 0; /* * Not an asynchronous command; wait for it to complete. 
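 *
 *	(Illustrative note, not in the original: this is the path taken
 *	by plain scsipi_command() users such as scsipi_test_unit_ready()
 *	and scsipi_inquire() above, which are normally issued without
 *	XS_CTL_ASYNC and therefore sleep here until scsipi_done() marks
 *	the xfer done.)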
*/ while ((xs->xs_status & XS_STS_DONE) == 0) { if (poll) { scsipi_printaddr(periph); printf("polling command not done\n"); panic("scsipi_execute_xs"); } cv_wait(xs_cv(xs), chan_mtx(chan)); } /* * Command is complete. scsipi_done() has awakened us to perform * the error handling. */ mutex_exit(chan_mtx(chan)); error = scsipi_complete(xs); if (error == ERESTART) goto restarted; /* * If it was meant to run async and we cleared async ourselves, * don't return an error here. It has already been handled */ if (oasync) error = 0; /* * Command completed successfully or fatal error occurred. Fall * into.... */ mutex_enter(chan_mtx(chan)); free_xs: scsipi_put_xs(xs); mutex_exit(chan_mtx(chan)); /* * Kick the queue, keep it running in case it stopped for some * reason. */ scsipi_run_queue(chan); mutex_enter(chan_mtx(chan)); return error; } /* * scsipi_completion_thread: * * This is the completion thread. We wait for errors on * asynchronous xfers, and perform the error handling * function, restarting the command, if necessary. */ static void scsipi_completion_thread(void *arg) { struct scsipi_channel *chan = arg; struct scsipi_xfer *xs; if (chan->chan_init_cb) (*chan->chan_init_cb)(chan, chan->chan_init_cb_arg); mutex_enter(chan_mtx(chan)); chan->chan_flags |= SCSIPI_CHAN_TACTIVE; for (;;) { xs = TAILQ_FIRST(&chan->chan_complete); if (xs == NULL && chan->chan_tflags == 0) { /* nothing to do; wait */ cv_wait(chan_cv_complete(chan), chan_mtx(chan)); continue; } if (chan->chan_tflags & SCSIPI_CHANT_CALLBACK) { /* call chan_callback from thread context */ chan->chan_tflags &= ~SCSIPI_CHANT_CALLBACK; chan->chan_callback(chan, chan->chan_callback_arg); continue; } if (chan->chan_tflags & SCSIPI_CHANT_GROWRES) { /* attempt to get more openings for this channel */ chan->chan_tflags &= ~SCSIPI_CHANT_GROWRES; mutex_exit(chan_mtx(chan)); scsipi_adapter_request(chan, ADAPTER_REQ_GROW_RESOURCES, NULL); scsipi_channel_thaw(chan, 1); if (chan->chan_tflags & SCSIPI_CHANT_GROWRES) kpause("scsizzz", FALSE, hz/10, NULL); mutex_enter(chan_mtx(chan)); continue; } if (chan->chan_tflags & SCSIPI_CHANT_KICK) { /* explicitly run the queues for this channel */ chan->chan_tflags &= ~SCSIPI_CHANT_KICK; mutex_exit(chan_mtx(chan)); scsipi_run_queue(chan); mutex_enter(chan_mtx(chan)); continue; } if (chan->chan_tflags & SCSIPI_CHANT_SHUTDOWN) { break; } if (xs) { TAILQ_REMOVE(&chan->chan_complete, xs, channel_q); mutex_exit(chan_mtx(chan)); /* * Have an xfer with an error; process it. */ (void) scsipi_complete(xs); /* * Kick the queue; keep it running if it was stopped * for some reason. */ scsipi_run_queue(chan); mutex_enter(chan_mtx(chan)); } } chan->chan_thread = NULL; /* In case parent is waiting for us to exit. 
*/ cv_broadcast(chan_cv_thread(chan)); mutex_exit(chan_mtx(chan)); kthread_exit(0); } /* * scsipi_thread_call_callback: * * request to call a callback from the completion thread */ int scsipi_thread_call_callback(struct scsipi_channel *chan, void (*callback)(struct scsipi_channel *, void *), void *arg) { mutex_enter(chan_mtx(chan)); if ((chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) { /* kernel thread doesn't exist yet */ mutex_exit(chan_mtx(chan)); return ESRCH; } if (chan->chan_tflags & SCSIPI_CHANT_CALLBACK) { mutex_exit(chan_mtx(chan)); return EBUSY; } scsipi_channel_freeze(chan, 1); chan->chan_callback = callback; chan->chan_callback_arg = arg; chan->chan_tflags |= SCSIPI_CHANT_CALLBACK; cv_broadcast(chan_cv_complete(chan)); mutex_exit(chan_mtx(chan)); return 0; } /* * scsipi_async_event: * * Handle an asynchronous event from an adapter. */ void scsipi_async_event(struct scsipi_channel *chan, scsipi_async_event_t event, void *arg) { bool lock = chan_running(chan) > 0; if (lock) mutex_enter(chan_mtx(chan)); switch (event) { case ASYNC_EVENT_MAX_OPENINGS: scsipi_async_event_max_openings(chan, (struct scsipi_max_openings *)arg); break; case ASYNC_EVENT_XFER_MODE: if (chan->chan_bustype->bustype_async_event_xfer_mode) { chan->chan_bustype->bustype_async_event_xfer_mode( chan, arg); } break; case ASYNC_EVENT_RESET: scsipi_async_event_channel_reset(chan); break; } if (lock) mutex_exit(chan_mtx(chan)); } /* * scsipi_async_event_max_openings: * * Update the maximum number of outstanding commands a * device may have. */ static void scsipi_async_event_max_openings(struct scsipi_channel *chan, struct scsipi_max_openings *mo) { struct scsipi_periph *periph; int minlun, maxlun; if (mo->mo_lun == -1) { /* * Wildcarded; apply it to all LUNs. */ minlun = 0; maxlun = chan->chan_nluns - 1; } else minlun = maxlun = mo->mo_lun; /* XXX This could really suck with a large LUN space. */ for (; minlun <= maxlun; minlun++) { periph = scsipi_lookup_periph_locked(chan, mo->mo_target, minlun); if (periph == NULL) continue; if (mo->mo_openings < periph->periph_openings) periph->periph_openings = mo->mo_openings; else if (mo->mo_openings > periph->periph_openings && (periph->periph_flags & PERIPH_GROW_OPENINGS) != 0) periph->periph_openings = mo->mo_openings; } } /* * scsipi_set_xfer_mode: * * Set the xfer mode for the specified I_T Nexus. */ void scsipi_set_xfer_mode(struct scsipi_channel *chan, int target, int immed) { struct scsipi_xfer_mode xm; struct scsipi_periph *itperiph; int lun; /* * Go to the minimal xfer mode. */ xm.xm_target = target; xm.xm_mode = 0; xm.xm_period = 0; /* ignored */ xm.xm_offset = 0; /* ignored */ /* * Find the first LUN we know about on this I_T Nexus. */ for (itperiph = NULL, lun = 0; lun < chan->chan_nluns; lun++) { itperiph = scsipi_lookup_periph(chan, target, lun); if (itperiph != NULL) break; } if (itperiph != NULL) { xm.xm_mode = itperiph->periph_cap; /* * Now issue the request to the adapter. */ scsipi_adapter_request(chan, ADAPTER_REQ_SET_XFER_MODE, &xm); /* * If we want this to happen immediately, issue a dummy * command, since most adapters can't really negotiate unless * they're executing a job. 
*/ if (immed != 0) { (void) scsipi_test_unit_ready(itperiph, XS_CTL_DISCOVERY | XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY | XS_CTL_IGNORE_MEDIA_CHANGE); } } } /* * scsipi_channel_reset: * * handle scsi bus reset * called with channel lock held */ static void scsipi_async_event_channel_reset(struct scsipi_channel *chan) { struct scsipi_xfer *xs, *xs_next; struct scsipi_periph *periph; int target, lun; /* * Channel has been reset. Also mark as reset pending REQUEST_SENSE * commands; as the sense is not available any more. * can't call scsipi_done() from here, as the command has not been * sent to the adapter yet (this would corrupt accounting). */ for (xs = TAILQ_FIRST(&chan->chan_queue); xs != NULL; xs = xs_next) { xs_next = TAILQ_NEXT(xs, channel_q); if (xs->xs_control & XS_CTL_REQSENSE) { TAILQ_REMOVE(&chan->chan_queue, xs, channel_q); xs->error = XS_RESET; if ((xs->xs_control & XS_CTL_ASYNC) != 0) TAILQ_INSERT_TAIL(&chan->chan_complete, xs, channel_q); } } cv_broadcast(chan_cv_complete(chan)); /* Catch xs with pending sense which may not have a REQSENSE xs yet */ for (target = 0; target < chan->chan_ntargets; target++) { if (target == chan->chan_id) continue; for (lun = 0; lun < chan->chan_nluns; lun++) { periph = scsipi_lookup_periph_locked(chan, target, lun); if (periph) { xs = periph->periph_xscheck; if (xs) xs->error = XS_RESET; } } } } /* * scsipi_target_detach: * * detach all periph associated with a I_T * must be called from valid thread context */ int scsipi_target_detach(struct scsipi_channel *chan, int target, int lun, int flags) { struct scsipi_periph *periph; device_t tdev; int ctarget, mintarget, maxtarget; int clun, minlun, maxlun; int error = 0; if (target == -1) { mintarget = 0; maxtarget = chan->chan_ntargets; } else { if (target == chan->chan_id) return EINVAL; if (target < 0 || target >= chan->chan_ntargets) return EINVAL; mintarget = target; maxtarget = target + 1; } if (lun == -1) { minlun = 0; maxlun = chan->chan_nluns; } else { if (lun < 0 || lun >= chan->chan_nluns) return EINVAL; minlun = lun; maxlun = lun + 1; } /* for config_detach */ KERNEL_LOCK(1, curlwp); mutex_enter(chan_mtx(chan)); for (ctarget = mintarget; ctarget < maxtarget; ctarget++) { if (ctarget == chan->chan_id) continue; for (clun = minlun; clun < maxlun; clun++) { periph = scsipi_lookup_periph_locked(chan, ctarget, clun); if (periph == NULL) continue; tdev = periph->periph_dev; mutex_exit(chan_mtx(chan)); error = config_detach(tdev, flags); if (error) goto out; mutex_enter(chan_mtx(chan)); KASSERT(scsipi_lookup_periph_locked(chan, ctarget, clun) == NULL); } } mutex_exit(chan_mtx(chan)); out: KERNEL_UNLOCK_ONE(curlwp); return error; } /* * scsipi_adapter_addref: * * Add a reference to the adapter pointed to by the provided * link, enabling the adapter if necessary. */ int scsipi_adapter_addref(struct scsipi_adapter *adapt) { int error = 0; if (atomic_inc_uint_nv(&adapt->adapt_refcnt) == 1 && adapt->adapt_enable != NULL) { scsipi_adapter_lock(adapt); error = scsipi_adapter_enable(adapt, 1); scsipi_adapter_unlock(adapt); if (error) atomic_dec_uint(&adapt->adapt_refcnt); } return error; } /* * scsipi_adapter_delref: * * Delete a reference to the adapter pointed to by the provided * link, disabling the adapter if possible. 
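 *
 *	Illustrative pairing (a sketch, not from the original source):
 *	code that temporarily needs the adapter enabled would bracket
 *	its use roughly as
 *
 *		if ((error = scsipi_adapter_addref(adapt)) != 0)
 *			return error;
 *		... talk to the device ...
 *		scsipi_adapter_delref(adapt);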
*/ void scsipi_adapter_delref(struct scsipi_adapter *adapt) { membar_release(); if (atomic_dec_uint_nv(&adapt->adapt_refcnt) == 0 && adapt->adapt_enable != NULL) { membar_acquire(); scsipi_adapter_lock(adapt); (void) scsipi_adapter_enable(adapt, 0); scsipi_adapter_unlock(adapt); } } static struct scsipi_syncparam { int ss_factor; int ss_period; /* ns * 100 */ } scsipi_syncparams[] = { { 0x08, 625 }, /* FAST-160 (Ultra320) */ { 0x09, 1250 }, /* FAST-80 (Ultra160) */ { 0x0a, 2500 }, /* FAST-40 40MHz (Ultra2) */ { 0x0b, 3030 }, /* FAST-40 33MHz (Ultra2) */ { 0x0c, 5000 }, /* FAST-20 (Ultra) */ }; static const int scsipi_nsyncparams = sizeof(scsipi_syncparams) / sizeof(scsipi_syncparams[0]); int scsipi_sync_period_to_factor(int period /* ns * 100 */) { int i; for (i = 0; i < scsipi_nsyncparams; i++) { if (period <= scsipi_syncparams[i].ss_period) return scsipi_syncparams[i].ss_factor; } return (period / 100) / 4; } int scsipi_sync_factor_to_period(int factor) { int i; for (i = 0; i < scsipi_nsyncparams; i++) { if (factor == scsipi_syncparams[i].ss_factor) return scsipi_syncparams[i].ss_period; } return (factor * 4) * 100; } int scsipi_sync_factor_to_freq(int factor) { int i; for (i = 0; i < scsipi_nsyncparams; i++) { if (factor == scsipi_syncparams[i].ss_factor) return 100000000 / scsipi_syncparams[i].ss_period; } return 10000000 / ((factor * 4) * 10); } static inline void scsipi_adapter_lock(struct scsipi_adapter *adapt) { if ((adapt->adapt_flags & SCSIPI_ADAPT_MPSAFE) == 0) KERNEL_LOCK(1, NULL); } static inline void scsipi_adapter_unlock(struct scsipi_adapter *adapt) { if ((adapt->adapt_flags & SCSIPI_ADAPT_MPSAFE) == 0) KERNEL_UNLOCK_ONE(NULL); } void scsipi_adapter_minphys(struct scsipi_channel *chan, struct buf *bp) { struct scsipi_adapter *adapt = chan->chan_adapter; scsipi_adapter_lock(adapt); (adapt->adapt_minphys)(bp); scsipi_adapter_unlock(chan->chan_adapter); } void scsipi_adapter_request(struct scsipi_channel *chan, scsipi_adapter_req_t req, void *arg) { struct scsipi_adapter *adapt = chan->chan_adapter; scsipi_adapter_lock(adapt); SDT_PROBE3(scsi, base, adapter, request__start, chan, req, arg); (adapt->adapt_request)(chan, req, arg); SDT_PROBE3(scsi, base, adapter, request__done, chan, req, arg); scsipi_adapter_unlock(adapt); } int scsipi_adapter_ioctl(struct scsipi_channel *chan, u_long cmd, void *data, int flag, struct proc *p) { struct scsipi_adapter *adapt = chan->chan_adapter; int error; if (adapt->adapt_ioctl == NULL) return ENOTTY; scsipi_adapter_lock(adapt); error = (adapt->adapt_ioctl)(chan, cmd, data, flag, p); scsipi_adapter_unlock(adapt); return error; } int scsipi_adapter_enable(struct scsipi_adapter *adapt, int enable) { int error; scsipi_adapter_lock(adapt); error = (adapt->adapt_enable)(adapt->adapt_dev, enable); scsipi_adapter_unlock(adapt); return error; } #ifdef SCSIPI_DEBUG /* * Given a scsipi_xfer, dump the request, in all its glory */ void show_scsipi_xs(struct scsipi_xfer *xs) { printf("xs(%p): ", xs); printf("xs_control(0x%08x)", xs->xs_control); printf("xs_status(0x%08x)", xs->xs_status); printf("periph(%p)", xs->xs_periph); printf("retr(0x%x)", xs->xs_retries); printf("timo(0x%x)", xs->timeout); printf("cmd(%p)", xs->cmd); printf("len(0x%x)", xs->cmdlen); printf("data(%p)", xs->data); printf("len(0x%x)", xs->datalen); printf("res(0x%x)", xs->resid); printf("err(0x%x)", xs->error); printf("bp(%p)", xs->bp); show_scsipi_cmd(xs); } void show_scsipi_cmd(struct scsipi_xfer *xs) { u_char *b = (u_char *) xs->cmd; int i = 0; scsipi_printaddr(xs->xs_periph); 
printf(" command: "); if ((xs->xs_control & XS_CTL_RESET) == 0) { while (i < xs->cmdlen) { if (i) printf(","); printf("0x%x", b[i++]); } printf("-[%d bytes]\n", xs->datalen); if (xs->datalen) show_mem(xs->data, uimin(64, xs->datalen)); } else printf("-RESET-\n"); } void show_mem(u_char *address, int num) { int x; printf("------------------------------"); for (x = 0; x < num; x++) { if ((x % 16) == 0) printf("\n%03d: ", x); printf("%02x ", *address++); } printf("\n------------------------------\n"); } #endif /* SCSIPI_DEBUG */
/* $NetBSD: ip_encap.c,v 1.77 2022/12/07 08:33:02 knakahara Exp $ */ /* $KAME: ip_encap.c,v 1.73 2001/10/02 08:30:58 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * My grandfather said that there's a devil inside tunnelling technology... * * We have surprisingly many protocols that want packets with IP protocol * #4 or #41. Here's a list of protocols that want protocol #41: * RFC1933 configured tunnel * RFC1933 automatic tunnel * RFC2401 IPsec tunnel * RFC2473 IPv6 generic packet tunnelling * RFC2529 6over4 tunnel * RFC3056 6to4 tunnel * isatap tunnel * mobile-ip6 (uses RFC2473) * Here's a list of protocol that want protocol #4: * RFC1853 IPv4-in-IPv4 tunnelling * RFC2003 IPv4 encapsulation within IPv4 * RFC2344 reverse tunnelling for mobile-ip4 * RFC2401 IPsec tunnel * Well, what can I say. They impose different en/decapsulation mechanism * from each other, so they need separate protocol handler. The only one * we can easily determine by protocol # is IPsec, which always has * AH/ESP/IPComp header right after outer IP header. * * So, clearly good old protosw does not work for protocol #4 and #41. * The code will let you match protocol via src/dst address pair. */ /* XXX is M_NETADDR correct?
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.77 2022/12/07 08:33:02 knakahara Exp $"); #ifdef _KERNEL_OPT #include "opt_mrouting.h" #include "opt_inet.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/socket.h> #include <sys/socketvar.h> /* for softnet_lock */ #include <sys/sockio.h> #include <sys/mbuf.h> #include <sys/errno.h> #include <sys/queue.h> #include <sys/kmem.h> #include <sys/mutex.h> #include <sys/condvar.h> #include <sys/psref.h> #include <sys/pslist.h> #include <sys/thmap.h> #include <net/if.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/ip_encap.h> #ifdef MROUTING #include <netinet/ip_mroute.h> #endif /* MROUTING */ #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/ip6protosw.h> /* for struct ip6ctlparam */ #include <netinet6/in6_var.h> #include <netinet6/in6_pcb.h> #include <netinet/icmp6.h> #endif #ifdef NET_MPSAFE #define ENCAP_MPSAFE 1 #endif enum direction { INBOUND, OUTBOUND }; #ifdef INET static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction, struct psref *); #endif #ifdef INET6 static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction, struct psref *); #endif static int encap_add(struct encaptab *); static int encap_remove(struct encaptab *); static void encap_afcheck(int, const struct sockaddr *, const struct sockaddr *); static void encap_key_init(struct encap_key *, const struct sockaddr *, const struct sockaddr *); static void encap_key_inc(struct encap_key *); /* * In encap[46]_lookup(), ep->func can sleep(e.g. rtalloc1) while walking * encap_table. So, it cannot use pserialize_read_enter() */ static struct { struct pslist_head list; pserialize_t psz; struct psref_class *elem_class; /* for the element of et_list */ } encaptab __cacheline_aligned = { .list = PSLIST_INITIALIZER, }; #define encap_table encaptab.list static struct { kmutex_t lock; kcondvar_t cv; struct lwp *busy; } encap_whole __cacheline_aligned; static thmap_t *encap_map[2]; /* 0 for AF_INET, 1 for AF_INET6 */ static bool encap_initialized = false; /* * must be done before other encap interfaces initialization. */ void encapinit(void) { if (encap_initialized) return; encaptab.psz = pserialize_create(); encaptab.elem_class = psref_class_create("encapelem", IPL_SOFTNET); mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&encap_whole.cv, "ip_encap cv"); encap_whole.busy = NULL; encap_initialized = true; } void encap_init(void) { static int initialized = 0; if (initialized) return; initialized++; #if 0 /* * we cannot use LIST_INIT() here, since drivers may want to call * encap_attach(), on driver attach. encap_init() will be called * on AF_INET{,6} initialization, which happens after driver * initialization - using LIST_INIT() here can nuke encap_attach() * from drivers. 
*/ PSLIST_INIT(&encap_table); #endif encap_map[0] = thmap_create(0, NULL, THMAP_NOCOPY); #ifdef INET6 encap_map[1] = thmap_create(0, NULL, THMAP_NOCOPY); #endif } #ifdef INET static struct encaptab * encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir, struct psref *match_psref) { struct ip *ip; struct ip_pack4 pack; struct encaptab *ep, *match; int prio, matchprio; int s; thmap_t *emap = encap_map[0]; struct encap_key key; KASSERT(m->m_len >= sizeof(*ip)); ip = mtod(m, struct ip *); memset(&pack, 0, sizeof(pack)); pack.p.sp_len = sizeof(pack); pack.mine.sin_family = pack.yours.sin_family = AF_INET; pack.mine.sin_len = pack.yours.sin_len = sizeof(struct sockaddr_in); if (dir == INBOUND) { pack.mine.sin_addr = ip->ip_dst; pack.yours.sin_addr = ip->ip_src; } else { pack.mine.sin_addr = ip->ip_src; pack.yours.sin_addr = ip->ip_dst; } match = NULL; matchprio = 0; s = pserialize_read_enter(); encap_key_init(&key, sintosa(&pack.mine), sintosa(&pack.yours)); while ((ep = thmap_get(emap, &key, sizeof(key))) != NULL) { struct psref elem_psref; KASSERT(ep->af == AF_INET); if (ep->proto >= 0 && ep->proto != proto) { encap_key_inc(&key); continue; } psref_acquire(&elem_psref, &ep->psref, encaptab.elem_class); if (ep->func) { pserialize_read_exit(s); prio = (*ep->func)(m, off, proto, ep->arg); s = pserialize_read_enter(); } else { prio = pack.mine.sin_len + pack.yours.sin_len; } if (prio <= 0) { psref_release(&elem_psref, &ep->psref, encaptab.elem_class); encap_key_inc(&key); continue; } if (prio > matchprio) { /* release last matched ep */ if (match != NULL) psref_release(match_psref, &match->psref, encaptab.elem_class); psref_copy(match_psref, &elem_psref, encaptab.elem_class); matchprio = prio; match = ep; } psref_release(&elem_psref, &ep->psref, encaptab.elem_class); encap_key_inc(&key); } PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { struct psref elem_psref; if (ep->af != AF_INET) continue; if (ep->proto >= 0 && ep->proto != proto) continue; psref_acquire(&elem_psref, &ep->psref, encaptab.elem_class); pserialize_read_exit(s); /* ep->func is sleepable. e.g. rtalloc1 */ prio = (*ep->func)(m, off, proto, ep->arg); s = pserialize_read_enter(); /* * We prioritize the matches by using bit length of the * matches. user-supplied matching function * should return the bit length of the matches (for example, * if both src/dst are matched for IPv4, 64 should be returned). * 0 or negative return value means "it did not match". * * We need to loop through all the possible candidates * to get the best match - the search takes O(n) for * n attachments (i.e. interfaces). 
*/ if (prio <= 0) { psref_release(&elem_psref, &ep->psref, encaptab.elem_class); continue; } if (prio > matchprio) { /* release last matched ep */ if (match != NULL) psref_release(match_psref, &match->psref, encaptab.elem_class); psref_copy(match_psref, &elem_psref, encaptab.elem_class); matchprio = prio; match = ep; } KASSERTMSG((match == NULL) || psref_held(&match->psref, encaptab.elem_class), "current match = %p, but not hold its psref", match); psref_release(&elem_psref, &ep->psref, encaptab.elem_class); } pserialize_read_exit(s); return match; } void encap4_input(struct mbuf *m, int off, int proto) { const struct encapsw *esw; struct encaptab *match; struct psref match_psref; match = encap4_lookup(m, off, proto, INBOUND, &match_psref); if (match) { /* found a match, "match" has the best one */ esw = match->esw; if (esw && esw->encapsw4.pr_input) { (*esw->encapsw4.pr_input)(m, off, proto, match->arg); psref_release(&match_psref, &match->psref, encaptab.elem_class); } else { psref_release(&match_psref, &match->psref, encaptab.elem_class); m_freem(m); } return; } /* last resort: inject to raw socket */ SOFTNET_LOCK_IF_NET_MPSAFE(); rip_input(m, off, proto); SOFTNET_UNLOCK_IF_NET_MPSAFE(); } #endif #ifdef INET6 static struct encaptab * encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir, struct psref *match_psref) { struct ip6_hdr *ip6; struct ip_pack6 pack; int prio, matchprio; int s; struct encaptab *ep, *match; thmap_t *emap = encap_map[1]; struct encap_key key; KASSERT(m->m_len >= sizeof(*ip6)); ip6 = mtod(m, struct ip6_hdr *); memset(&pack, 0, sizeof(pack)); pack.p.sp_len = sizeof(pack); pack.mine.sin6_family = pack.yours.sin6_family = AF_INET6; pack.mine.sin6_len = pack.yours.sin6_len = sizeof(struct sockaddr_in6); if (dir == INBOUND) { pack.mine.sin6_addr = ip6->ip6_dst; pack.yours.sin6_addr = ip6->ip6_src; } else { pack.mine.sin6_addr = ip6->ip6_src; pack.yours.sin6_addr = ip6->ip6_dst; } match = NULL; matchprio = 0; s = pserialize_read_enter(); encap_key_init(&key, sin6tosa(&pack.mine), sin6tosa(&pack.yours)); while ((ep = thmap_get(emap, &key, sizeof(key))) != NULL) { struct psref elem_psref; KASSERT(ep->af == AF_INET6); if (ep->proto >= 0 && ep->proto != proto) { encap_key_inc(&key); continue; } psref_acquire(&elem_psref, &ep->psref, encaptab.elem_class); if (ep->func) { pserialize_read_exit(s); prio = (*ep->func)(m, off, proto, ep->arg); s = pserialize_read_enter(); } else { prio = pack.mine.sin6_len + pack.yours.sin6_len; } if (prio <= 0) { psref_release(&elem_psref, &ep->psref, encaptab.elem_class); encap_key_inc(&key); continue; } if (prio > matchprio) { /* release last matched ep */ if (match != NULL) psref_release(match_psref, &match->psref, encaptab.elem_class); psref_copy(match_psref, &elem_psref, encaptab.elem_class); matchprio = prio; match = ep; } psref_release(&elem_psref, &ep->psref, encaptab.elem_class); encap_key_inc(&key); } PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { struct psref elem_psref; if (ep->af != AF_INET6) continue; if (ep->proto >= 0 && ep->proto != proto) continue; psref_acquire(&elem_psref, &ep->psref, encaptab.elem_class); pserialize_read_exit(s); /* ep->func is sleepable. e.g. 
rtalloc1 */ prio = (*ep->func)(m, off, proto, ep->arg); s = pserialize_read_enter(); /* see encap4_lookup() for issues here */ if (prio <= 0) { psref_release(&elem_psref, &ep->psref, encaptab.elem_class); continue; } if (prio > matchprio) { /* release last matched ep */ if (match != NULL) psref_release(match_psref, &match->psref, encaptab.elem_class); psref_copy(match_psref, &elem_psref, encaptab.elem_class); matchprio = prio; match = ep; } KASSERTMSG((match == NULL) || psref_held(&match->psref, encaptab.elem_class), "current match = %p, but not hold its psref", match); psref_release(&elem_psref, &ep->psref, encaptab.elem_class); } pserialize_read_exit(s); return match; } int encap6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; const struct encapsw *esw; struct encaptab *match; struct psref match_psref; int rv; match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref); if (match) { /* found a match */ esw = match->esw; if (esw && esw->encapsw6.pr_input) { int ret; ret = (*esw->encapsw6.pr_input)(mp, offp, proto, match->arg); psref_release(&match_psref, &match->psref, encaptab.elem_class); return ret; } else { psref_release(&match_psref, &match->psref, encaptab.elem_class); m_freem(m); return IPPROTO_DONE; } } /* last resort: inject to raw socket */ SOFTNET_LOCK_IF_NET_MPSAFE(); rv = rip6_input(mp, offp, proto); SOFTNET_UNLOCK_IF_NET_MPSAFE(); return rv; } #endif static int encap_add(struct encaptab *ep) { KASSERT(encap_lock_held()); PSLIST_WRITER_INSERT_HEAD(&encap_table, ep, chain); return 0; } static int encap_remove(struct encaptab *ep) { int error = 0; KASSERT(encap_lock_held()); PSLIST_WRITER_REMOVE(ep, chain); return error; } static void encap_afcheck(int af, const struct sockaddr *sp, const struct sockaddr *dp) { KASSERT(sp != NULL && dp != NULL); KASSERT(sp->sa_len == dp->sa_len); KASSERT(af == sp->sa_family && af == dp->sa_family); socklen_t len __diagused = sockaddr_getsize_by_family(af); KASSERT(len != 0 && len == sp->sa_len && len == dp->sa_len); } const struct encaptab * encap_attach_func(int af, int proto, encap_priofunc_t *func, const struct encapsw *esw, void *arg) { struct encaptab *ep; int error; #ifndef ENCAP_MPSAFE int s; s = splsoftnet(); #endif ASSERT_SLEEPABLE(); /* sanity check on args */ KASSERT(func != NULL); KASSERT(af == AF_INET #ifdef INET6 || af == AF_INET6 #endif ); ep = kmem_alloc(sizeof(*ep), KM_SLEEP); memset(ep, 0, sizeof(*ep)); ep->af = af; ep->proto = proto; ep->func = func; ep->esw = esw; ep->arg = arg; psref_target_init(&ep->psref, encaptab.elem_class); error = encap_add(ep); if (error) goto gc; error = 0; #ifndef ENCAP_MPSAFE splx(s); #endif return ep; gc: kmem_free(ep, sizeof(*ep)); #ifndef ENCAP_MPSAFE splx(s); #endif return NULL; } static void encap_key_init(struct encap_key *key, const struct sockaddr *local, const struct sockaddr *remote) { memset(key, 0, sizeof(*key)); sockaddr_copy(&key->local_sa, sizeof(key->local_u), local); sockaddr_copy(&key->remote_sa, sizeof(key->remote_u), remote); } static void encap_key_inc(struct encap_key *key) { (key->seq)++; } static void encap_key_dec(struct encap_key *key) { (key->seq)--; } static void encap_key_copy(struct encap_key *dst, const struct encap_key *src) { memset(dst, 0, sizeof(*dst)); *dst = *src; } /* * src is always my side, and dst is always remote side. * Return value will be necessary as input (cookie) for encap_detach(). 
*/ const struct encaptab * encap_attach_addr(int af, int proto, const struct sockaddr *src, const struct sockaddr *dst, encap_priofunc_t *func, const struct encapsw *esw, void *arg) { struct encaptab *ep; size_t l; thmap_t *emap; void *retep; struct ip_pack4 *pack4; #ifdef INET6 struct ip_pack6 *pack6; #endif ASSERT_SLEEPABLE(); encap_afcheck(af, src, dst); switch (af) { case AF_INET: l = sizeof(*pack4); emap = encap_map[0]; break; #ifdef INET6 case AF_INET6: l = sizeof(*pack6); emap = encap_map[1]; break; #endif default: return NULL; } ep = kmem_zalloc(sizeof(*ep), KM_SLEEP); ep->addrpack = kmem_zalloc(l, KM_SLEEP); ep->addrpack->sa_len = l & 0xff; ep->af = af; ep->proto = proto; ep->flag = IP_ENCAP_ADDR_ENABLE; switch (af) { case AF_INET: pack4 = (struct ip_pack4 *)ep->addrpack; ep->src = (struct sockaddr *)&pack4->mine; ep->dst = (struct sockaddr *)&pack4->yours; break; #ifdef INET6 case AF_INET6: pack6 = (struct ip_pack6 *)ep->addrpack; ep->src = (struct sockaddr *)&pack6->mine; ep->dst = (struct sockaddr *)&pack6->yours; break; #endif } memcpy(ep->src, src, src->sa_len); memcpy(ep->dst, dst, dst->sa_len); ep->esw = esw; ep->arg = arg; ep->func = func; psref_target_init(&ep->psref, encaptab.elem_class); encap_key_init(&ep->key, src, dst); while ((retep = thmap_put(emap, &ep->key, sizeof(ep->key), ep)) != ep) encap_key_inc(&ep->key); return ep; } /* XXX encap4_ctlinput() is necessary if we set DF=1 on outer IPv4 header */ #ifdef INET6 void * encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) { void *d = d0; struct ip6_hdr *ip6; struct mbuf *m; int off; struct ip6ctlparam *ip6cp = NULL; int nxt; int s; struct encaptab *ep; const struct encapsw *esw; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return NULL; if ((unsigned)cmd >= PRC_NCMDS) return NULL; if (cmd == PRC_HOSTDEAD) d = NULL; else if (cmd == PRC_MSGSIZE) ; /* special code is present, see below */ else if (inet6ctlerrmap[cmd] == 0) return NULL; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; nxt = ip6cp->ip6c_nxt; if (ip6 && cmd == PRC_MSGSIZE) { int valid = 0; struct encaptab *match; struct psref elem_psref; /* * Check to see if we have a valid encap configuration. */ match = encap6_lookup(m, off, nxt, OUTBOUND, &elem_psref); if (match) { valid++; psref_release(&elem_psref, &match->psref, encaptab.elem_class); } /* * Depending on the value of "valid" and routing table * size (mtudisc_{hi,lo}wat), we will: * - recalcurate the new MTU and create the * corresponding routing entry, or * - ignore the MTU change notification. */ icmp6_mtudisc_update((struct ip6ctlparam *)d, valid); } } else { m = NULL; ip6 = NULL; nxt = -1; } /* inform all listeners */ s = pserialize_read_enter(); PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { struct psref elem_psref; if (ep->af != AF_INET6) continue; if (ep->proto >= 0 && ep->proto != nxt) continue; /* should optimize by looking at address pairs */ /* XXX need to pass ep->arg or ep itself to listeners */ psref_acquire(&elem_psref, &ep->psref, encaptab.elem_class); esw = ep->esw; if (esw && esw->encapsw6.pr_ctlinput) { pserialize_read_exit(s); /* pr_ctlinput is sleepable. e.g. 
rtcache_free */ (*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg); s = pserialize_read_enter(); } psref_release(&elem_psref, &ep->psref, encaptab.elem_class); } pserialize_read_exit(s); rip6_ctlinput(cmd, sa, d0); return NULL; } #endif static int encap_detach_addr(const struct encaptab *ep) { thmap_t *emap; struct encaptab *retep; struct encaptab *target; void *thgc; struct encap_key key; KASSERT(encap_lock_held()); KASSERT(ep->flag & IP_ENCAP_ADDR_ENABLE); switch (ep->af) { case AF_INET: emap = encap_map[0]; break; #ifdef INET6 case AF_INET6: emap = encap_map[1]; break; #endif default: return EINVAL; } retep = thmap_del(emap, &ep->key, sizeof(ep->key)); if (retep != ep) { return ENOENT; } target = retep; /* * To keep continuity, decrement seq after detached encaptab. */ encap_key_copy(&key, &ep->key); encap_key_inc(&key); while ((retep = thmap_del(emap, &key, sizeof(key))) != NULL) { void *pp; encap_key_dec(&retep->key); pp = thmap_put(emap, &retep->key, sizeof(retep->key), retep); KASSERT(retep == pp); encap_key_inc(&key); } thgc = thmap_stage_gc(emap); pserialize_perform(encaptab.psz); thmap_gc(emap, thgc); psref_target_destroy(&target->psref, encaptab.elem_class); kmem_free(target->addrpack, target->addrpack->sa_len); kmem_free(target, sizeof(*target)); return 0; } int encap_detach(const struct encaptab *cookie) { const struct encaptab *ep = cookie; struct encaptab *p; int error; KASSERT(encap_lock_held()); if (ep->flag & IP_ENCAP_ADDR_ENABLE) return encap_detach_addr(ep); PSLIST_WRITER_FOREACH(p, &encap_table, struct encaptab, chain) { if (p == ep) { error = encap_remove(p); if (error) return error; else break; } } if (p == NULL) return ENOENT; pserialize_perform(encaptab.psz); psref_target_destroy(&p->psref, encaptab.elem_class); kmem_free(p, sizeof(*p)); return 0; } int encap_lock_enter(void) { int error; mutex_enter(&encap_whole.lock); while (encap_whole.busy != NULL) { error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock); if (error) { mutex_exit(&encap_whole.lock); return error; } } KASSERT(encap_whole.busy == NULL); encap_whole.busy = curlwp; mutex_exit(&encap_whole.lock); return 0; } void encap_lock_exit(void) { mutex_enter(&encap_whole.lock); KASSERT(encap_whole.busy == curlwp); encap_whole.busy = NULL; cv_broadcast(&encap_whole.cv); mutex_exit(&encap_whole.lock); } bool encap_lock_held(void) { return (encap_whole.busy == curlwp); }
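/*
 * Illustrative sketch (not part of ip_encap.c): how a hypothetical tunnel
 * driver might register an IPv4-in-IPv4 decapsulation handler with the
 * framework above.  Every "example_" identifier is invented for
 * illustration; real consumers of this API are drivers such as gif(4).
 * Disabled with #if 0 because it is only a sketch of the calling
 * convention, not tested code.
 */
#if 0
#include <netinet/ip_encap.h>

static void
example_input(struct mbuf *m, int off, int proto, void *arg)
{
	/* "arg" is the cookie passed at attach time (e.g. the softc). */
}

/*
 * Optional priority callback.  Per the comment in encap4_lookup(), it
 * returns the number of matched address bits (64 when both IPv4 src and
 * dst match), or 0/negative for "no match".
 */
static int
example_prio(struct mbuf *m, int off, int proto, void *arg)
{
	/* compare the outer IP header against the tunnel endpoints ... */
	return 32 + 32;		/* assume both endpoints matched */
}

static const struct encapsw example_encapsw = {
	.encapsw4 = {
		.pr_input = example_input,
	},
};

static int
example_attach(const struct sockaddr *src, const struct sockaddr *dst,
    void *softc, const struct encaptab **cookiep)
{
	const struct encaptab *ep;
	int error;

	/* encap_detach() asserts encap_lock_held(), so serialize here. */
	error = encap_lock_enter();
	if (error)
		return error;
	ep = encap_attach_addr(AF_INET, IPPROTO_IPV4, src, dst,
	    example_prio, &example_encapsw, softc);
	encap_lock_exit();
	if (ep == NULL)
		return EINVAL;
	*cookiep = ep;	/* keep the cookie for a later encap_detach() */
	return 0;
}
#endif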
/* $NetBSD: subr_workqueue.c,v 1.48 2024/03/01 04:32:38 mrg Exp $ */

/*-
 * Copyright (c)2002, 2005, 2006, 2007 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_workqueue.c,v 1.48 2024/03/01 04:32:38 mrg Exp $"); #include <sys/param.h> #include <sys/condvar.h> #include <sys/cpu.h> #include <sys/kmem.h> #include <sys/kthread.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/queue.h> #include <sys/sdt.h> #include <sys/systm.h> #include <sys/workqueue.h> typedef struct work_impl { SIMPLEQ_ENTRY(work_impl) wk_entry; } work_impl_t; SIMPLEQ_HEAD(workqhead, work_impl); struct workqueue_queue { kmutex_t q_mutex; kcondvar_t q_cv; struct workqhead q_queue_pending; uint64_t q_gen; lwp_t *q_worker; }; struct workqueue { void (*wq_func)(struct work *, void *); void *wq_arg; int wq_flags; char wq_name[MAXCOMLEN]; pri_t wq_prio; void *wq_ptr; }; #define WQ_SIZE (roundup2(sizeof(struct workqueue), coherency_unit)) #define WQ_QUEUE_SIZE (roundup2(sizeof(struct workqueue_queue), coherency_unit)) #define POISON 0xaabbccdd SDT_PROBE_DEFINE7(sdt, kernel, workqueue, create, "struct workqueue *"/*wq*/, "const char *"/*name*/, "void (*)(struct work *, void *)"/*func*/, "void *"/*arg*/, "pri_t"/*prio*/, "int"/*ipl*/, "int"/*flags*/); SDT_PROBE_DEFINE1(sdt, kernel, workqueue, destroy, "struct workqueue *"/*wq*/); SDT_PROBE_DEFINE3(sdt, kernel, workqueue, enqueue, "struct workqueue *"/*wq*/, "struct work *"/*wk*/, "struct cpu_info *"/*ci*/); SDT_PROBE_DEFINE4(sdt, kernel, workqueue, entry, "struct workqueue *"/*wq*/, "struct work *"/*wk*/, "void (*)(struct work *, void *)"/*func*/, "void *"/*arg*/); SDT_PROBE_DEFINE4(sdt, kernel, workqueue, return, "struct workqueue *"/*wq*/, "struct work *"/*wk*/, "void (*)(struct work *, void *)"/*func*/, "void *"/*arg*/); SDT_PROBE_DEFINE2(sdt, kernel, workqueue, wait__start, "struct workqueue *"/*wq*/, "struct work *"/*wk*/); SDT_PROBE_DEFINE2(sdt, kernel, workqueue, wait__self, "struct workqueue *"/*wq*/, "struct work *"/*wk*/); SDT_PROBE_DEFINE2(sdt, kernel, workqueue, wait__hit, "struct workqueue *"/*wq*/, "struct work *"/*wk*/); SDT_PROBE_DEFINE2(sdt, kernel, workqueue, wait__done, "struct workqueue *"/*wq*/, "struct work *"/*wk*/); SDT_PROBE_DEFINE1(sdt, kernel, workqueue, exit__start, "struct workqueue *"/*wq*/); SDT_PROBE_DEFINE1(sdt, kernel, workqueue, exit__done, "struct workqueue *"/*wq*/); static size_t workqueue_size(int flags) { return WQ_SIZE + ((flags & WQ_PERCPU) != 0 ? ncpu : 1) * WQ_QUEUE_SIZE + coherency_unit; } static struct workqueue_queue * workqueue_queue_lookup(struct workqueue *wq, struct cpu_info *ci) { u_int idx = 0; if (wq->wq_flags & WQ_PERCPU) { idx = ci ? 
cpu_index(ci) : cpu_index(curcpu()); } return (void *)((uintptr_t)(wq) + WQ_SIZE + (idx * WQ_QUEUE_SIZE)); } static void workqueue_runlist(struct workqueue *wq, struct workqhead *list) { work_impl_t *wk; work_impl_t *next; struct lwp *l = curlwp; KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d", l, l->l_nopreempt); for (wk = SIMPLEQ_FIRST(list); wk != NULL; wk = next) { next = SIMPLEQ_NEXT(wk, wk_entry); SDT_PROBE4(sdt, kernel, workqueue, entry, wq, wk, wq->wq_func, wq->wq_arg); (*wq->wq_func)((void *)wk, wq->wq_arg); SDT_PROBE4(sdt, kernel, workqueue, return, wq, wk, wq->wq_func, wq->wq_arg); KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d func %p", l, l->l_nopreempt, wq->wq_func); } } static void workqueue_worker(void *cookie) { struct workqueue *wq = cookie; struct workqueue_queue *q; int s, fpu = wq->wq_flags & WQ_FPU; /* find the workqueue of this kthread */ q = workqueue_queue_lookup(wq, curlwp->l_cpu); if (fpu) s = kthread_fpu_enter(); mutex_enter(&q->q_mutex); for (;;) { struct workqhead tmp; SIMPLEQ_INIT(&tmp); while (SIMPLEQ_EMPTY(&q->q_queue_pending)) cv_wait(&q->q_cv, &q->q_mutex); SIMPLEQ_CONCAT(&tmp, &q->q_queue_pending); SIMPLEQ_INIT(&q->q_queue_pending); /* * Mark the queue as actively running a batch of work * by setting the generation number odd. */ q->q_gen |= 1; mutex_exit(&q->q_mutex); workqueue_runlist(wq, &tmp); /* * Notify workqueue_wait that we have completed a batch * of work by incrementing the generation number. */ mutex_enter(&q->q_mutex); KASSERTMSG(q->q_gen & 1, "q=%p gen=%"PRIu64, q, q->q_gen); q->q_gen++; cv_broadcast(&q->q_cv); } mutex_exit(&q->q_mutex); if (fpu) kthread_fpu_exit(s); } static void workqueue_init(struct workqueue *wq, const char *name, void (*callback_func)(struct work *, void *), void *callback_arg, pri_t prio, int ipl) { KASSERT(sizeof(wq->wq_name) > strlen(name)); strncpy(wq->wq_name, name, sizeof(wq->wq_name)); wq->wq_prio = prio; wq->wq_func = callback_func; wq->wq_arg = callback_arg; } static int workqueue_initqueue(struct workqueue *wq, struct workqueue_queue *q, int ipl, struct cpu_info *ci) { int error, ktf; KASSERT(q->q_worker == NULL); mutex_init(&q->q_mutex, MUTEX_DEFAULT, ipl); cv_init(&q->q_cv, wq->wq_name); SIMPLEQ_INIT(&q->q_queue_pending); q->q_gen = 0; ktf = ((wq->wq_flags & WQ_MPSAFE) != 0 ? KTHREAD_MPSAFE : 0); if (wq->wq_prio < PRI_KERNEL) ktf |= KTHREAD_TS; if (ci) { error = kthread_create(wq->wq_prio, ktf, ci, workqueue_worker, wq, &q->q_worker, "%s/%u", wq->wq_name, ci->ci_index); } else { error = kthread_create(wq->wq_prio, ktf, ci, workqueue_worker, wq, &q->q_worker, "%s", wq->wq_name); } if (error != 0) { mutex_destroy(&q->q_mutex); cv_destroy(&q->q_cv); KASSERT(q->q_worker == NULL); } return error; } struct workqueue_exitargs { work_impl_t wqe_wk; struct workqueue_queue *wqe_q; }; static void workqueue_exit(struct work *wk, void *arg) { struct workqueue_exitargs *wqe = (void *)wk; struct workqueue_queue *q = wqe->wqe_q; /* * only competition at this point is workqueue_finiqueue. 
*/ KASSERT(q->q_worker == curlwp); KASSERT(SIMPLEQ_EMPTY(&q->q_queue_pending)); mutex_enter(&q->q_mutex); q->q_worker = NULL; cv_broadcast(&q->q_cv); mutex_exit(&q->q_mutex); kthread_exit(0); } static void workqueue_finiqueue(struct workqueue *wq, struct workqueue_queue *q) { struct workqueue_exitargs wqe; KASSERT(wq->wq_func == workqueue_exit); wqe.wqe_q = q; KASSERT(SIMPLEQ_EMPTY(&q->q_queue_pending)); KASSERT(q->q_worker != NULL); mutex_enter(&q->q_mutex); SIMPLEQ_INSERT_TAIL(&q->q_queue_pending, &wqe.wqe_wk, wk_entry); cv_broadcast(&q->q_cv); while (q->q_worker != NULL) { cv_wait(&q->q_cv, &q->q_mutex); } mutex_exit(&q->q_mutex); mutex_destroy(&q->q_mutex); cv_destroy(&q->q_cv); } /* --- */ int workqueue_create(struct workqueue **wqp, const char *name, void (*callback_func)(struct work *, void *), void *callback_arg, pri_t prio, int ipl, int flags) { struct workqueue *wq; struct workqueue_queue *q; void *ptr; int error = 0; CTASSERT(sizeof(work_impl_t) <= sizeof(struct work)); ptr = kmem_zalloc(workqueue_size(flags), KM_SLEEP); wq = (void *)roundup2((uintptr_t)ptr, coherency_unit); wq->wq_ptr = ptr; wq->wq_flags = flags; workqueue_init(wq, name, callback_func, callback_arg, prio, ipl); if (flags & WQ_PERCPU) { struct cpu_info *ci; CPU_INFO_ITERATOR cii; /* create the work-queue for each CPU */ for (CPU_INFO_FOREACH(cii, ci)) { q = workqueue_queue_lookup(wq, ci); error = workqueue_initqueue(wq, q, ipl, ci); if (error) { break; } } } else { /* initialize a work-queue */ q = workqueue_queue_lookup(wq, NULL); error = workqueue_initqueue(wq, q, ipl, NULL); } if (error != 0) { workqueue_destroy(wq); } else { *wqp = wq; } return error; } static bool workqueue_q_wait(struct workqueue *wq, struct workqueue_queue *q, work_impl_t *wk_target) { work_impl_t *wk; bool found = false; uint64_t gen; mutex_enter(&q->q_mutex); /* * Avoid a deadlock scenario. We can't guarantee that * wk_target has completed at this point, but we can't wait for * it either, so do nothing. * * XXX Are there use-cases that require this semantics? */ if (q->q_worker == curlwp) { SDT_PROBE2(sdt, kernel, workqueue, wait__self, wq, wk_target); goto out; } /* * Wait until the target is no longer pending. If we find it * on this queue, the caller can stop looking in other queues. * If we don't find it in this queue, however, we can't skip * waiting -- it may be hidden in the running queue which we * have no access to. */ again: SIMPLEQ_FOREACH(wk, &q->q_queue_pending, wk_entry) { if (wk == wk_target) { SDT_PROBE2(sdt, kernel, workqueue, wait__hit, wq, wk); found = true; cv_wait(&q->q_cv, &q->q_mutex); goto again; } } /* * The target may be in the batch of work currently running, * but we can't touch that queue. So if there's anything * running, wait until the generation changes. */ gen = q->q_gen; if (gen & 1) { do cv_wait(&q->q_cv, &q->q_mutex); while (gen == q->q_gen); } out: mutex_exit(&q->q_mutex); return found; } /* * Wait for a specified work to finish. The caller must ensure that no new * work will be enqueued before calling workqueue_wait. Note that if the * workqueue is WQ_PERCPU, the caller can enqueue a new work to another queue * other than the waiting queue. 
*/ void workqueue_wait(struct workqueue *wq, struct work *wk) { struct workqueue_queue *q; bool found; ASSERT_SLEEPABLE(); SDT_PROBE2(sdt, kernel, workqueue, wait__start, wq, wk); if (ISSET(wq->wq_flags, WQ_PERCPU)) { struct cpu_info *ci; CPU_INFO_ITERATOR cii; for (CPU_INFO_FOREACH(cii, ci)) { q = workqueue_queue_lookup(wq, ci); found = workqueue_q_wait(wq, q, (work_impl_t *)wk); if (found) break; } } else { q = workqueue_queue_lookup(wq, NULL); (void)workqueue_q_wait(wq, q, (work_impl_t *)wk); } SDT_PROBE2(sdt, kernel, workqueue, wait__done, wq, wk); } void workqueue_destroy(struct workqueue *wq) { struct workqueue_queue *q; struct cpu_info *ci; CPU_INFO_ITERATOR cii; ASSERT_SLEEPABLE(); SDT_PROBE1(sdt, kernel, workqueue, exit__start, wq); wq->wq_func = workqueue_exit; for (CPU_INFO_FOREACH(cii, ci)) { q = workqueue_queue_lookup(wq, ci); if (q->q_worker != NULL) { workqueue_finiqueue(wq, q); } } SDT_PROBE1(sdt, kernel, workqueue, exit__done, wq); kmem_free(wq->wq_ptr, workqueue_size(wq->wq_flags)); } #ifdef DEBUG static void workqueue_check_duplication(struct workqueue_queue *q, work_impl_t *wk) { work_impl_t *_wk; SIMPLEQ_FOREACH(_wk, &q->q_queue_pending, wk_entry) { if (_wk == wk) panic("%s: tried to enqueue a queued work", __func__); } } #endif void workqueue_enqueue(struct workqueue *wq, struct work *wk0, struct cpu_info *ci) { struct workqueue_queue *q; work_impl_t *wk = (void *)wk0; SDT_PROBE3(sdt, kernel, workqueue, enqueue, wq, wk0, ci); KASSERT(wq->wq_flags & WQ_PERCPU || ci == NULL); q = workqueue_queue_lookup(wq, ci); mutex_enter(&q->q_mutex); #ifdef DEBUG workqueue_check_duplication(q, wk); #endif SIMPLEQ_INSERT_TAIL(&q->q_queue_pending, wk, wk_entry); cv_broadcast(&q->q_cv); mutex_exit(&q->q_mutex); }
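/*
 * Illustrative sketch (not part of subr_workqueue.c): minimal use of the
 * workqueue(9) API defined above, modelled on callers such as nd6_init()
 * below.  The "example_" names are invented; WQ_PERCPU would instead
 * create one queue (and worker kthread) per CPU, and WQ_FPU would wrap
 * the worker in kthread_fpu_enter()/kthread_fpu_exit().  Disabled with
 * #if 0 because it is only a sketch.
 */
#if 0
#include <sys/workqueue.h>

static struct workqueue *example_wq;
static struct work example_wk;

static void
example_work(struct work *wk, void *arg)
{
	/* runs in the worker kthread at the priority given at create time */
}

static void
example_init(void)
{
	int error;

	error = workqueue_create(&example_wq, "example",
	    example_work, NULL, PRI_SOFTNET, IPL_SOFTNET, WQ_MPSAFE);
	if (error)
		panic("example_init: workqueue_create failed (%d)", error);
}

static void
example_kick(void)
{
	/*
	 * A given struct work must not be enqueued again while it is
	 * still pending (workqueue_check_duplication() panics on that
	 * under DEBUG).
	 */
	workqueue_enqueue(example_wq, &example_wk, NULL);
}

static void
example_fini(void)
{
	/*
	 * Stop enqueueing first; workqueue_wait() then blocks until
	 * example_wk is neither pending nor running.
	 */
	workqueue_wait(example_wq, &example_wk);
	workqueue_destroy(example_wq);
}
#endif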
/* $NetBSD: nd6.c,v 1.282 2024/04/11 07:34:37 knakahara Exp $ */
/* $KAME: nd6.c,v 1.279 2002/06/08 11:16:51 itojun Exp $ */

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: nd6.c,v 1.282 2024/04/11 07:34:37 knakahara Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_net_mpsafe.h" #endif #include "bridge.h" #include "carp.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/kmem.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sockio.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/syslog.h> #include <sys/queue.h> #include <sys/cprng.h> #include <sys/workqueue.h> #include <sys/compat_stub.h> #include <net/if.h> #include <net/if_dl.h> #include <net/if_llatbl.h> #include <net/if_types.h> #include <net/nd.h> #include <net/route.h> #include <net/if_ether.h> #include <net/if_arc.h> #include <netinet/in.h> #include <netinet6/in6_var.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #include <netinet6/nd6.h> #include <netinet6/in6_ifattach.h> #include <netinet/icmp6.h> #include <netinet6/icmp6_private.h> #include <compat/netinet6/in6_var.h> #include <compat/netinet6/nd6.h> #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ /* timer values */ int nd6_prune = 1; /* walk list every 1 seconds */ int nd6_useloopback = 1; /* use loopback interface for local traffic */ /* preventing too many loops in ND option parsing */ int nd6_maxndopt = 10; /* max # of ND options allowed */ #ifdef ND6_DEBUG int nd6_debug = 1; #else int nd6_debug = 0; #endif krwlock_t nd6_lock __cacheline_aligned; int nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL; static void nd6_slowtimo(void *); static void nd6_free(struct llentry *, int); static bool nd6_nud_enabled(struct ifnet *); static unsigned int nd6_llinfo_reachable(struct ifnet *); static unsigned int nd6_llinfo_retrans(struct ifnet *); static union l3addr *nd6_llinfo_holdsrc(struct llentry *, union l3addr *); static void nd6_llinfo_output(struct ifnet *, const union l3addr *, const union l3addr *, const uint8_t *, const union l3addr *); static void nd6_llinfo_missed(struct ifnet *, const union l3addr *, int16_t, struct mbuf *); static void nd6_timer(void *); static void nd6_timer_work(struct work *, void *); static struct nd_opt_hdr *nd6_option(union nd_opts *); static callout_t nd6_slowtimo_ch; static callout_t nd6_timer_ch; static struct workqueue *nd6_timer_wq; static struct work nd6_timer_wk; struct nd_domain nd6_nd_domain = { .nd_family = AF_INET6, .nd_delay = 5, /* delay first probe time 5 second */ .nd_mmaxtries = 3, /* maximum unicast query */ .nd_umaxtries = 3, /* maximum multicast query */ .nd_retransmultiple = BACKOFF_MULTIPLE, .nd_maxretrans = MAX_RETRANS_TIMER, .nd_maxnudhint = 0, /* max # of subsequent upper layer hints */ .nd_maxqueuelen = 1, /* max # of packets in unresolved ND entries */ .nd_nud_enabled = nd6_nud_enabled, .nd_reachable = nd6_llinfo_reachable, .nd_retrans = nd6_llinfo_retrans, .nd_holdsrc = nd6_llinfo_holdsrc, .nd_output = nd6_llinfo_output, .nd_missed = nd6_llinfo_missed, .nd_free = nd6_free, }; MALLOC_DEFINE(M_IP6NDP, "NDP", "IPv6 Neighbour Discovery"); void nd6_init(void) { int error; nd_attach_domain(&nd6_nd_domain); nd6_nbr_init(); rw_init(&nd6_lock); callout_init(&nd6_slowtimo_ch, CALLOUT_MPSAFE); callout_init(&nd6_timer_ch, CALLOUT_MPSAFE); error = workqueue_create(&nd6_timer_wq, "nd6_timer", nd6_timer_work, NULL, PRI_SOFTNET, IPL_SOFTNET, WQ_MPSAFE); if (error) panic("%s: 
workqueue_create failed (%d)\n", __func__, error); /* start timer */ callout_reset(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, NULL); callout_reset(&nd6_timer_ch, hz, nd6_timer, NULL); } struct nd_kifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_kifinfo *nd; nd = kmem_zalloc(sizeof(*nd), KM_SLEEP); nd->chlim = IPV6_DEFHLIM; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; nd->flags = ND6_IFF_PERFORMNUD; /* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL. * A bridge interface should not have ND6_IFF_AUTO_LINKLOCAL * because one of its members should. */ if ((ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) || (ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_AUTO_LINKLOCAL; return nd; } void nd6_ifdetach(struct ifnet *ifp, struct in6_ifextra *ext) { /* Ensure all IPv6 addresses are purged before calling nd6_purge */ if_purgeaddrs(ifp, AF_INET6, in6_purgeaddr); nd6_purge(ifp, ext); kmem_free(ext->nd_ifinfo, sizeof(struct nd_kifinfo)); } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { memset(ndopts, 0, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. */ static struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; KASSERT(ndopts != NULL); KASSERT(ndopts->nd_opts_last != NULL); if (ndopts->nd_opts_search == NULL) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((void *)&nd_opt->nd_opt_len >= (void *)ndopts->nd_opts_last) { memset(ndopts, 0, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ memset(ndopts, 0, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((char *)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ memset(ndopts, 0, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use, for ND routines that do not need * multiple options of the same type. */ int nd6_options(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; KASSERT(ndopts != NULL); KASSERT(ndopts->nd_opts_last != NULL); if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. */ ICMP6_STATINC(ICMP6_STAT_ND_BADOPT); memset(ndopts, 0, sizeof(*ndopts)); return -1; } if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log(LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type); /* XXX bark? 
*/ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; default: /* * Unknown options must be silently ignored, * to accommodate future extension to the protocol. */ nd6log(LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type); } skip1: i++; if (i > nd6_maxndopt) { ICMP6_STATINC(ICMP6_STAT_ND_TOOMANYOPT); nd6log(LOG_INFO, "too many loop in nd opt\n"); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * Gets source address of the first packet in hold queue * and stores it in @src. * Returns pointer to @src (if hold queue is not empty) or NULL. */ static struct in6_addr * nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src) { struct ip6_hdr *hip6; if (ln == NULL || ln->ln_hold == NULL) return NULL; /* * assuming every packet in ln_hold has the same IP header */ hip6 = mtod(ln->ln_hold, struct ip6_hdr *); /* XXX pullup? */ if (sizeof(*hip6) < ln->ln_hold->m_len) *src = hip6->ip6_src; else src = NULL; return src; } static union l3addr * nd6_llinfo_holdsrc(struct llentry *ln, union l3addr *src) { if (nd6_llinfo_get_holdsrc(ln, &src->addr6) == NULL) return NULL; return src; } static void nd6_llinfo_output(struct ifnet *ifp, const union l3addr *daddr, const union l3addr *taddr, __unused const uint8_t *tlladdr, const union l3addr *hsrc) { nd6_ns_output(ifp, daddr != NULL ? &daddr->addr6 : NULL, taddr != NULL ? &taddr->addr6 : NULL, hsrc != NULL ? &hsrc->addr6 : NULL, NULL); } static bool nd6_nud_enabled(struct ifnet *ifp) { struct nd_kifinfo *ndi = ND_IFINFO(ifp); return ndi->flags & ND6_IFF_PERFORMNUD; } static unsigned int nd6_llinfo_reachable(struct ifnet *ifp) { struct nd_kifinfo *ndi = ND_IFINFO(ifp); return ndi->reachable; } static unsigned int nd6_llinfo_retrans(struct ifnet *ifp) { struct nd_kifinfo *ndi = ND_IFINFO(ifp); return ndi->retrans; } static void nd6_llinfo_missed(struct ifnet *ifp, const union l3addr *taddr, int16_t type, struct mbuf *m) { struct in6_addr mdaddr6 = zeroin6_addr; struct sockaddr_in6 dsin6, tsin6; struct sockaddr *sa; if (m != NULL) { if (type == ND_LLINFO_PROBE) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); /* XXX pullup? */ if (sizeof(*ip6) < m->m_len) mdaddr6 = ip6->ip6_src; m_freem(m); } else icmp6_error2(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, ifp, &mdaddr6); } if (!IN6_IS_ADDR_UNSPECIFIED(&mdaddr6)) { sockaddr_in6_init(&dsin6, &mdaddr6, 0, 0, 0); sa = sin6tosa(&dsin6); } else sa = NULL; sockaddr_in6_init(&tsin6, &taddr->addr6, 0, 0, 0); rt_clonedmsg(RTM_MISS, sa, sin6tosa(&tsin6), NULL, ifp); } /* * ND6 timer routine to expire default route list and prefix list */ static void nd6_timer_work(struct work *wk, void *arg) { struct in6_ifaddr *ia6, *nia6; int s, bound; struct psref psref; callout_reset(&nd6_timer_ch, nd6_prune * hz, nd6_timer, NULL); SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE(); /* expire interface addresses */ bound = curlwp_bind(); s = pserialize_read_enter(); for (ia6 = IN6_ADDRLIST_READER_FIRST(); ia6; ia6 = nia6) { nia6 = IN6_ADDRLIST_READER_NEXT(ia6); ia6_acquire(ia6, &psref); pserialize_read_exit(s); /* check address lifetime */ if (IFA6_IS_INVALID(ia6)) { struct ifnet *ifp; ifp = ia6->ia_ifa.ifa_ifp; IFNET_LOCK(ifp); /* * Need to take the lock first to prevent if_detach * from running in6_purgeaddr concurrently. 
*/ if (!if_is_deactivated(ifp)) { ia6_release(ia6, &psref); in6_purgeaddr(&ia6->ia_ifa); } else { /* * ifp is being destroyed, ia6 will be destroyed * by if_detach. */ ia6_release(ia6, &psref); } ia6 = NULL; IFNET_UNLOCK(ifp); } else if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; if ((oldflags & IN6_IFF_DEPRECATED) == 0) { ia6->ia6_flags |= IN6_IFF_DEPRECATED; rt_addrmsg(RTM_NEWADDR, (struct ifaddr *)ia6); } } else { /* * A new RA might have made a deprecated address * preferred. */ if (ia6->ia6_flags & IN6_IFF_DEPRECATED) { ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; rt_addrmsg(RTM_NEWADDR, (struct ifaddr *)ia6); } } s = pserialize_read_enter(); ia6_release(ia6, &psref); } pserialize_read_exit(s); curlwp_bindx(bound); SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } static void nd6_timer(void *ignored_arg) { workqueue_enqueue(nd6_timer_wq, &nd6_timer_wk, NULL); } /* * Nuke neighbor cache/prefix/default router management table, right before * ifp goes away. */ void nd6_purge(struct ifnet *ifp, struct in6_ifextra *ext) { /* * During detach, the ND info might be already removed, but * then is explitly passed as argument. * Otherwise get it from ifp->if_afdata. */ if (ext == NULL) ext = ifp->if_afdata[AF_INET6]; if (ext == NULL) return; /* * We may not need to nuke the neighbor cache entries here * because the neighbor cache is kept in if_afdata[AF_INET6]. * nd6_purge() is invoked by in6_ifdetach() which is called * from if_detach() where everything gets purged. However * in6_ifdetach is directly called from vlan(4), so we still * need to purge entries here. */ if (ext->lltable != NULL) lltable_purge_entries(ext->lltable); } struct llentry * nd6_lookup(const struct in6_addr *addr6, const struct ifnet *ifp, bool wlock) { struct sockaddr_in6 sin6; struct llentry *ln; sockaddr_in6_init(&sin6, addr6, 0, 0, 0); IF_AFDATA_RLOCK(ifp); ln = lla_lookup(LLTABLE6(ifp), wlock ? LLE_EXCLUSIVE : 0, sin6tosa(&sin6)); IF_AFDATA_RUNLOCK(ifp); return ln; } struct llentry * nd6_create(const struct in6_addr *addr6, const struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; struct rtentry *rt; sockaddr_in6_init(&sin6, addr6, 0, 0, 0); rt = rtalloc1(sin6tosa(&sin6), 0); IF_AFDATA_WLOCK(ifp); ln = lla_create(LLTABLE6(ifp), LLE_EXCLUSIVE, sin6tosa(&sin6), rt); IF_AFDATA_WUNLOCK(ifp); if (rt != NULL) rt_unref(rt); if (ln != NULL) ln->ln_state = ND_LLINFO_NOSTATE; return ln; } /* * Test whether a given IPv6 address is a neighbor or not, ignoring * the actual neighbor cache. The neighbor cache is ignored in order * to not reenter the routing code from within itself. */ static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct ifaddr *dstaddr; int s; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return 0; /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return 0; if (sin6_copy.sin6_scope_id == zone) return 1; else return 0; } /* * If the address is assigned on the node of the other side of * a p2p interface, the address should be a neighbor. 
*/ s = pserialize_read_enter(); dstaddr = ifa_ifwithdstaddr(sin6tocsa(addr)); if (dstaddr != NULL) { if (dstaddr->ifa_ifp == ifp) { pserialize_read_exit(s); return 1; } } pserialize_read_exit(s); return 0; } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct llentry *ln; struct rtentry *rt; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return 0; /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return 0; if (sin6_copy.sin6_scope_id == zone) return 1; else return 0; } if (nd6_is_new_addr_neighbor(addr, ifp)) return 1; /* * Even if the address matches none of our addresses, it might be * in the neighbor cache or a connected route. */ ln = nd6_lookup(&addr->sin6_addr, ifp, false); if (ln != NULL) { LLE_RUNLOCK(ln); return 1; } rt = rtalloc1(sin6tocsa(addr), 0); if (rt == NULL) return 0; if ((rt->rt_flags & RTF_CONNECTED) && (rt->rt_ifp == ifp #if NBRIDGE > 0 || rt->rt_ifp->if_bridge == ifp->if_bridge #endif #if NCARP > 0 || (ifp->if_type == IFT_CARP && rt->rt_ifp == ifp->if_carpdev) || (rt->rt_ifp->if_type == IFT_CARP && rt->rt_ifp->if_carpdev == ifp)|| (ifp->if_type == IFT_CARP && rt->rt_ifp->if_type == IFT_CARP && rt->rt_ifp->if_carpdev == ifp->if_carpdev) #endif )) { rt_unref(rt); return 1; } rt_unref(rt); return 0; } /* * Free an nd6 llinfo entry. * Since the function would cause significant changes in the kernel, DO NOT * make it global, unless you have a strong reason for the change, and are sure * that the change is safe. */ static void nd6_free(struct llentry *ln, int gc) { struct ifnet *ifp; KASSERT(ln != NULL); LLE_WLOCK_ASSERT(ln); /* * If the reason for the deletion is just garbage collection, * and the neighbor is an active router, do not delete it. * Instead, reset the GC timer using the router's lifetime. * XXX: the check for ln_state should be redundant, * but we intentionally keep it just in case. */ if (!ip6_forwarding && ln->ln_router && ln->ln_state == ND_LLINFO_STALE && gc) { nd_set_timer(ln, ND_TIMER_EXPIRE); LLE_WUNLOCK(ln); return; } ifp = ln->lle_tbl->llt_ifp; if (ln->la_flags & LLE_VALID || gc) { struct sockaddr_in6 sin6; const char *lladdr; sockaddr_in6_init(&sin6, &ln->r_l3addr.addr6, 0, 0, 0); lladdr = ln->la_flags & LLE_VALID ? (const char *)&ln->ll_addr : NULL; rt_clonedmsg(RTM_DELETE, NULL, sin6tosa(&sin6), lladdr, ifp); } /* * Save to unlock. We still hold an extra reference and will not * free(9) in llentry_free() if someone else holds one as well. */ LLE_WUNLOCK(ln); IF_AFDATA_LOCK(ifp); LLE_WLOCK(ln); lltable_free_entry(LLTABLE6(ifp), ln); IF_AFDATA_UNLOCK(ifp); } /* * Upper-layer reachability hint for Neighbor Unreachability Detection. * * XXX cost-effective methods? 
*/ void nd6_nud_hint(struct rtentry *rt) { struct llentry *ln; struct ifnet *ifp; if (rt == NULL) return; ifp = rt->rt_ifp; ln = nd6_lookup(&(satocsin6(rt_getkey(rt)))->sin6_addr, ifp, true); nd_nud_hint(ln); } struct gc_args { int gc_entries; const struct in6_addr *skip_in6; }; static int nd6_purge_entry(struct lltable *llt, struct llentry *ln, void *farg) { struct gc_args *args = farg; int *n = &args->gc_entries; const struct in6_addr *skip_in6 = args->skip_in6; if (*n <= 0) return 0; if (ND_IS_LLINFO_PERMANENT(ln)) return 0; if (IN6_ARE_ADDR_EQUAL(&ln->r_l3addr.addr6, skip_in6)) return 0; LLE_WLOCK(ln); if (ln->ln_state > ND_LLINFO_INCOMPLETE) ln->ln_state = ND_LLINFO_STALE; else ln->ln_state = ND_LLINFO_PURGE; nd_set_timer(ln, ND_TIMER_IMMEDIATE); LLE_WUNLOCK(ln); (*n)--; return 0; } static void nd6_gc_neighbors(struct lltable *llt, const struct in6_addr *in6) { if (ip6_neighborgcthresh >= 0 && lltable_get_entry_count(llt) >= ip6_neighborgcthresh) { struct gc_args gc_args = {10, in6}; /* * XXX entries that are "less recently used" should be * freed first. */ lltable_foreach_lle(llt, nd6_purge_entry, &gc_args); } } void nd6_rtrequest(int req, struct rtentry *rt, const struct rt_addrinfo *info) { struct sockaddr *gate = rt->rt_gateway; struct ifnet *ifp = rt->rt_ifp; uint8_t namelen = strlen(ifp->if_xname), addrlen = ifp->if_addrlen; struct ifaddr *ifa; RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); if (req == RTM_LLINFO_UPD) { int rc; struct in6_addr *in6; struct in6_addr in6_all; int anycast; if ((ifa = info->rti_ifa) == NULL) return; in6 = &ifatoia6(ifa)->ia_addr.sin6_addr; anycast = ifatoia6(ifa)->ia6_flags & IN6_IFF_ANYCAST; in6_all = in6addr_linklocal_allnodes; if ((rc = in6_setscope(&in6_all, ifa->ifa_ifp, NULL)) != 0) { log(LOG_ERR, "%s: failed to set scope %s " "(errno=%d)\n", __func__, if_name(ifp), rc); return; } /* XXX don't set Override for proxy addresses */ nd6_na_output(ifa->ifa_ifp, &in6_all, in6, (anycast ? 0 : ND_NA_FLAG_OVERRIDE) #if 0 | (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0) #endif , 1, NULL); return; } if ((rt->rt_flags & RTF_GATEWAY) != 0) { if (req != RTM_ADD) return; /* * linklayers with particular MTU limitation. */ switch(ifp->if_type) { #if NARCNET > 0 case IFT_ARCNET: if (rt->rt_rmx.rmx_mtu > ARC_PHDS_MAXMTU) /* RFC2497 */ rt->rt_rmx.rmx_mtu = ARC_PHDS_MAXMTU; break; #endif } return; } if (nd6_need_cache(ifp) == 0 && (rt->rt_flags & RTF_HOST) == 0) { RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); /* * This is probably an interface direct route for a link * which does not need neighbor caches (e.g. fe80::%lo0/64). * We do not need special treatment below for such a route. * Moreover, the RTF_LLINFO flag which would be set below * would annoy the ndp(8) command. */ return; } switch (req) { case RTM_ADD: { struct psref psref; RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); /* * There is no backward compatibility :) * * if ((rt->rt_flags & RTF_HOST) == 0 && * SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) * rt->rt_flags |= RTF_CLONING; */ /* XXX should move to route.c? */ if (rt->rt_flags & (RTF_CONNECTED | RTF_LOCAL)) { union { struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_storage ss; } u; /* * Case 1: This route should come from a route to * interface (RTF_CLONING case) or the route should be * treated as on-link but is currently not * (RTF_LLINFO && ln == NULL case). 
*/ if (sockaddr_dl_init(&u.sdl, sizeof(u.ss), ifp->if_index, ifp->if_type, NULL, namelen, NULL, addrlen) == NULL) { printf("%s.%d: sockaddr_dl_init(, %zu, ) " "failed on %s\n", __func__, __LINE__, sizeof(u.ss), if_name(ifp)); } rt_setgate(rt, &u.sa); gate = rt->rt_gateway; RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); if (gate == NULL) { log(LOG_ERR, "%s: rt_setgate failed on %s\n", __func__, if_name(ifp)); break; } RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); if ((rt->rt_flags & RTF_CONNECTED) != 0) break; } RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); /* * In IPv4 code, we try to annonuce new RTF_ANNOUNCE entry here. * We don't do that here since llinfo is not ready yet. * * There are also couple of other things to be discussed: * - unsolicited NA code needs improvement beforehand * - RFC2461 says we MAY send multicast unsolicited NA * (7.2.6 paragraph 4), however, it also says that we * SHOULD provide a mechanism to prevent multicast NA storm. * we don't have anything like it right now. * note that the mechanism needs a mutual agreement * between proxies, which means that we need to implement * a new protocol, or a new kludge. * - from RFC2461 6.2.4, host MUST NOT send an unsolicited NA. * we need to check ip6forwarding before sending it. * (or should we allow proxy ND configuration only for * routers? there's no mention about proxy ND from hosts) */ #if 0 /* XXX it does not work */ if (rt->rt_flags & RTF_ANNOUNCE) nd6_na_output(ifp, &satocsin6(rt_getkey(rt))->sin6_addr, &satocsin6(rt_getkey(rt))->sin6_addr, ip6_forwarding ? ND_NA_FLAG_ROUTER : 0, 1, NULL); #endif if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) == 0) { RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); /* * Address resolution isn't necessary for a point to * point link, so we can skip this test for a p2p link. */ if (gate->sa_family != AF_LINK || gate->sa_len < sockaddr_dl_measure(namelen, addrlen)) { log(LOG_DEBUG, "nd6_rtrequest: bad gateway value: %s\n", if_name(ifp)); break; } satosdl(gate)->sdl_type = ifp->if_type; satosdl(gate)->sdl_index = ifp->if_index; RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); } RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); /* * When called from rt_ifa_addlocal, we cannot depend on that * the address (rt_getkey(rt)) exits in the address list of the * interface. So check RTF_LOCAL instead. */ if (rt->rt_flags & RTF_LOCAL) { if (nd6_useloopback) rt->rt_ifp = lo0ifp; /* XXX */ break; } /* * check if rt_getkey(rt) is an address assigned * to the interface. */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr_psref(ifp, &satocsin6(rt_getkey(rt))->sin6_addr, &psref); if (ifa != NULL) { if (nd6_useloopback) { rt->rt_ifp = lo0ifp; /* XXX */ /* * Make sure rt_ifa be equal to the ifaddr * corresponding to the address. * We need this because when we refer * rt_ifa->ia6_flags in ip6_input, we assume * that the rt_ifa points to the address instead * of the loopback address. 
*/ if (!ISSET(info->rti_flags, RTF_DONTCHANGEIFA) && ifa != rt->rt_ifa) rt_replace_ifa(rt, ifa); } } else if (rt->rt_flags & RTF_ANNOUNCE) { /* join solicited node multicast for proxy ND */ if (ifp->if_flags & IFF_MULTICAST) { struct in6_addr llsol; int error; llsol = satocsin6(rt_getkey(rt))->sin6_addr; llsol.s6_addr32[0] = htonl(0xff020000); llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; if (in6_setscope(&llsol, ifp, NULL)) goto out; if (!in6_addmulti(&llsol, ifp, &error, 0)) { char ip6buf[INET6_ADDRSTRLEN]; nd6log(LOG_ERR, "%s: failed to join " "%s (errno=%d)\n", if_name(ifp), IN6_PRINT(ip6buf, &llsol), error); } } } out: ifa_release(ifa, &psref); /* * If we have too many cache entries, initiate immediate * purging for some entries. */ if (rt->rt_ifp != NULL) nd6_gc_neighbors(LLTABLE6(rt->rt_ifp), NULL); break; } case RTM_DELETE: /* leave from solicited node multicast for proxy ND */ if ((rt->rt_flags & RTF_ANNOUNCE) != 0 && (ifp->if_flags & IFF_MULTICAST) != 0) { struct in6_addr llsol; llsol = satocsin6(rt_getkey(rt))->sin6_addr; llsol.s6_addr32[0] = htonl(0xff020000); llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; if (in6_setscope(&llsol, ifp, NULL) == 0) in6_lookup_and_delete_multi(&llsol, ifp); } break; } } static void nd6_setifflags(struct ifnet *ifp, uint32_t flags) { struct nd_kifinfo *ndi = ND_IFINFO(ifp); struct ifaddr *ifa; struct in6_ifaddr *ia; int s; if (ndi->flags & ND6_IFF_IFDISABLED && !(flags & ND6_IFF_IFDISABLED)) { /* * If the interface is marked as ND6_IFF_IFDISABLED and * has a link-local address with IN6_IFF_DUPLICATED, * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, section 5.4.5. */ bool duplicated_linklocal = false; s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) { duplicated_linklocal = true; break; } } pserialize_read_exit(s); if (duplicated_linklocal) { flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "%s: Cannot enable an interface" " with a link-local address marked" " duplicate.\n", if_name(ifp)); } else { ndi->flags &= ~ND6_IFF_IFDISABLED; if (ifp->if_flags & IFF_UP) in6_if_up(ifp); } } else if (!(ndi->flags & ND6_IFF_IFDISABLED) && (flags & ND6_IFF_IFDISABLED)) { struct psref psref; int bound = curlwp_bind(); /* Mark all IPv6 addresses as tentative. */ ndi->flags |= ND6_IFF_IFDISABLED; s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa_acquire(ifa, &psref); pserialize_read_exit(s); nd6_dad_stop(ifa); ia = (struct in6_ifaddr *)ifa; ia->ia6_flags |= IN6_IFF_TENTATIVE; s = pserialize_read_enter(); ifa_release(ifa, &psref); } pserialize_read_exit(s); curlwp_bindx(bound); } if (flags & ND6_IFF_AUTO_LINKLOCAL) { if (!(ndi->flags & ND6_IFF_AUTO_LINKLOCAL)) { /* auto_linklocal 0->1 transition */ ndi->flags |= ND6_IFF_AUTO_LINKLOCAL; in6_ifattach(ifp, NULL); } else if (!(flags & ND6_IFF_IFDISABLED) && ifp->if_flags & IFF_UP) { /* * When the IF already has * ND6_IFF_AUTO_LINKLOCAL, no link-local * address is assigned, and IFF_UP, try to * assign one. 
*/ bool haslinklocal = 0; s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family !=AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))){ haslinklocal = true; break; } } pserialize_read_exit(s); if (!haslinklocal) in6_ifattach(ifp, NULL); } } ndi->flags = flags; } int nd6_ioctl(u_long cmd, void *data, struct ifnet *ifp) { #ifdef OSIOCGIFINFO_IN6_90 struct in6_ndireq90 *ondi = (struct in6_ndireq90 *)data; struct in6_ndifreq90 *ndif = (struct in6_ndifreq90 *)data; #define OND ondi->ndi #endif struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct nd_kifinfo *ifndi = ND_IFINFO(ifp); int error = 0; #define ND ndi->ndi switch (cmd) { #ifdef OSIOCSRTRFLUSH_IN6 case OSIOCGDRLST_IN6: /* FALLTHROUGH */ case OSIOCGPRLST_IN6: /* FALLTHROUGH */ case OSIOCSNDFLUSH_IN6: /* FALLTHROUGH */ case OSIOCSPFXFLUSH_IN6: /* FALLTHROUGH */ case OSIOCSRTRFLUSH_IN6: /* FALLTHROUGH */ break; case OSIOCGDEFIFACE_IN6: ndif->ifindex = 0; break; case OSIOCSDEFIFACE_IN6: error = ENOTSUP; break; #endif #ifdef OSIOCGIFINFO_IN6 case OSIOCGIFINFO_IN6: /* FALLTHROUGH */ #endif #ifdef OSIOCGIFINFO_IN6_90 case OSIOCGIFINFO_IN6_90: memset(&OND, 0, sizeof(OND)); OND.initialized = 1; OND.chlim = ifndi->chlim; OND.basereachable = ifndi->basereachable; OND.retrans = ifndi->retrans; OND.flags = ifndi->flags; break; case OSIOCSIFINFO_IN6_90: /* Allow userland to set Neighbor Unreachability Detection * timers. */ if (OND.chlim != 0) ifndi->chlim = OND.chlim; if (OND.basereachable != 0 && OND.basereachable != ifndi->basereachable) { ifndi->basereachable = OND.basereachable; ifndi->reachable = ND_COMPUTE_RTIME(OND.basereachable); } if (OND.retrans != 0) ifndi->retrans = OND.retrans; /* Retain the old behaviour .... */ /* FALLTHROUGH */ case OSIOCSIFINFO_FLAGS_90: nd6_setifflags(ifp, OND.flags); break; #undef OND #endif case SIOCGIFINFO_IN6: ND.chlim = ifndi->chlim; ND.basereachable = ifndi->basereachable; ND.retrans = ifndi->retrans; ND.flags = ifndi->flags; break; case SIOCSIFINFO_IN6: /* Allow userland to set Neighbor Unreachability Detection * timers. */ if (ND.chlim != 0) ifndi->chlim = ND.chlim; if (ND.basereachable != 0 && ND.basereachable != ifndi->basereachable) { ifndi->basereachable = ND.basereachable; ifndi->reachable = ND_COMPUTE_RTIME(ND.basereachable); } if (ND.retrans != 0) ifndi->retrans = ND.retrans; break; case SIOCSIFINFO_FLAGS: nd6_setifflags(ifp, ND.flags); break; #undef ND case SIOCGNBRINFO_IN6: { struct llentry *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return error; ln = nd6_lookup(&nb_addr, ifp, false); if (ln == NULL) { error = EINVAL; break; } nbi->state = ln->ln_state; nbi->asked = ln->ln_asked; nbi->isrouter = ln->ln_router; nbi->expire = ln->ln_expire ? time_mono_to_wall(ln->ln_expire) : 0; LLE_RUNLOCK(ln); break; } } return error; } void nd6_llinfo_release_pkts(struct llentry *ln, struct ifnet *ifp) { struct mbuf *m_hold, *m_hold_next; struct sockaddr_in6 sin6; LLE_WLOCK_ASSERT(ln); sockaddr_in6_init(&sin6, &ln->r_l3addr.addr6, 0, 0, 0); m_hold = ln->la_hold, ln->la_hold = NULL, ln->la_numheld = 0; LLE_ADDREF(ln); LLE_WUNLOCK(ln); for (; m_hold != NULL; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* * we assume ifp is not a p2p here, so * just set the 2nd argument as the * 1st one. 
*/ ip6_if_output(ifp, ifp, m_hold, &sin6, NULL); } LLE_WLOCK(ln); LLE_REMREF(ln); } /* * Create neighbor cache entry and cache link-layer address, * on reception of inbound ND6 packets. (RS/RA/NS/redirect) */ void nd6_cache_lladdr( struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, /* ICMP6 type */ int code /* type dependent information */ ) { struct llentry *ln = NULL; int is_newentry; int do_update; int olladdr; int llchange; int newstate = 0; KASSERT(ifp != NULL); KASSERT(from != NULL); /* nothing must be updated for unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return; /* * Validation about ifp->if_addrlen and lladdrlen must be done in * the caller. * * XXX If the link does not have link-layer adderss, what should * we do? (ifp->if_addrlen == 0) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). */ ln = nd6_lookup(from, ifp, true); if (ln == NULL) { #if 0 /* nothing must be done if there's no lladdr */ if (!lladdr || !lladdrlen) return NULL; #endif ln = nd6_create(from, ifp); is_newentry = 1; } else { /* do nothing if static ndp is set */ if (ln->la_flags & LLE_STATIC) { LLE_WUNLOCK(ln); return; } is_newentry = 0; } if (ln == NULL) return; olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0; if (olladdr && lladdr) { llchange = memcmp(lladdr, &ln->ll_addr, ifp->if_addrlen); } else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y -- (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ if (lladdr) { /* (3-5) and (7) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? */ memcpy(&ln->ll_addr, lladdr, ifp->if_addrlen); ln->la_flags |= LLE_VALID; } if (!is_newentry) { if ((!olladdr && lladdr) || /* (3) */ (olladdr && lladdr && llchange)) { /* (5) */ do_update = 1; newstate = ND_LLINFO_STALE; } else /* (1-2,4) */ do_update = 0; } else { do_update = 1; if (lladdr == NULL) /* (6) */ newstate = ND_LLINFO_NOSTATE; else /* (7) */ newstate = ND_LLINFO_STALE; } if (do_update) { /* * Update the state of the neighbor cache. */ ln->ln_state = newstate; if (ln->ln_state == ND_LLINFO_STALE) { /* * XXX: since nd6_output() below will cause * state tansition to DELAY and reset the timer, * we must set the timer now, although it is actually * meaningless. */ nd_set_timer(ln, ND_TIMER_GC); nd6_llinfo_release_pkts(ln, ifp); } else if (ln->ln_state == ND_LLINFO_INCOMPLETE) { /* probe right away */ nd_set_timer(ln, ND_TIMER_IMMEDIATE); } } /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exist, set IsRouter. This means (1-5). * - If it is old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in (1-5) and (7) cases we must set IsRouter. * A question arises for (1) case. (1) case has no lladdr in the * neighbor cache, this is similar to (6). * This case is rare but we figured that we MUST NOT set IsRouter. * * newentry olladdr lladdr llchange NS RS RA redir * D R * 0 n n -- (1) c ? s * 0 y n -- (2) c s s * 0 n y -- (3) c s s * 0 y y n (4) c s s * 0 y y y (5) c s s * 1 -- n -- (6) c c c s * 1 -- y -- (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * New entry must have is_router flag cleared. 
*/ if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln->ln_router = 1; else if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_ROUTER_SOLICIT: /* * is_router flag must always be cleared. */ ln->ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. */ if ((!is_newentry && (olladdr || lladdr)) || /* (2-5) */ (is_newentry && lladdr)) { /* (7) */ ln->ln_router = 1; } break; } if (do_update && lladdr != NULL) { struct sockaddr_in6 sin6; sockaddr_in6_init(&sin6, from, 0, 0, 0); rt_clonedmsg(is_newentry ? RTM_ADD : RTM_CHANGE, NULL, sin6tosa(&sin6), lladdr, ifp); } if (ln != NULL) LLE_WUNLOCK(ln); /* * If we have too many cache entries, initiate immediate * purging for some entries. */ if (is_newentry) nd6_gc_neighbors(LLTABLE6(ifp), &ln->r_l3addr.addr6); } static void nd6_slowtimo(void *ignored_arg) { struct nd_kifinfo *ndi; struct ifnet *ifp; struct psref psref; int s; SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE(); callout_reset(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, NULL); s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { ndi = ND_IFINFO(ifp); if (ndi->basereachable && /* already initialized */ (ndi->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { if_acquire(ifp, &psref); pserialize_read_exit(s); /* * Since reachable time rarely changes by router * advertisements, we SHOULD insure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ ndi->recalctm = nd6_recalc_reachtm_interval; ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable); s = pserialize_read_enter(); if_release(ifp, &psref); } } pserialize_read_exit(s); SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } /* * Return 0 if a neighbor cache is found. Return EWOULDBLOCK if a cache is not * found and trying to resolve a neighbor; in this case the mbuf is queued in * the list. Otherwise return errno after freeing the mbuf. */ int nd6_resolve(struct ifnet *ifp, const struct rtentry *rt, struct mbuf *m, const struct sockaddr *_dst, uint8_t *lldst, size_t dstsize) { struct llentry *ln = NULL; bool created = false; const struct sockaddr_in6 *dst = satocsin6(_dst); int error; struct nd_kifinfo *ndi = ND_IFINFO(ifp); /* discard the packet if IPv6 operation is disabled on the interface */ if (ndi->flags & ND6_IFF_IFDISABLED) { m_freem(m); return ENETDOWN; /* better error? */ } /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address(i.e. not a multicast). */ /* Look up the neighbor cache for the nexthop */ ln = nd6_lookup(&dst->sin6_addr, ifp, false); if (ln != NULL && (ln->la_flags & LLE_VALID) != 0 && ln->ln_state == ND_LLINFO_REACHABLE) { /* Fast path */ memcpy(lldst, &ln->ll_addr, MIN(dstsize, ifp->if_addrlen)); LLE_RUNLOCK(ln); return 0; } if (ln != NULL) LLE_RUNLOCK(ln); /* Slow path */ ln = nd6_lookup(&dst->sin6_addr, ifp, true); if (ln == NULL && nd6_is_addr_neighbor(dst, ifp)) { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. 
*/ ln = nd6_create(&dst->sin6_addr, ifp); if (ln == NULL) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "%s: can't allocate llinfo for %s " "(ln=%p, rt=%p)\n", __func__, IN6_PRINT(ip6buf, &dst->sin6_addr), ln, rt); m_freem(m); return ENOBUFS; } created = true; } if (ln == NULL) { m_freem(m); return ENETDOWN; /* better error? */ } error = nd_resolve(ln, rt, m, lldst, dstsize); if (created) nd6_gc_neighbors(LLTABLE6(ifp), &dst->sin6_addr); return error; } int nd6_need_cache(struct ifnet *ifp) { /* * XXX: we currently do not make neighbor cache on any interface * other than ARCnet, Ethernet, and GIF. * * RFC2893 says: * - unidirectional tunnels needs no ND */ switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_IEEE1394: case IFT_CARP: case IFT_GIF: /* XXX need more cases? */ case IFT_IPSEC: case IFT_PPP: case IFT_TUNNEL: return 1; default: return 0; } } int nd6_sysctl( int name, void *oldp, /* syscall arg, need copyout */ size_t *oldlenp, void *newp, /* syscall arg, need copyin */ size_t newlen ) { int error; if (newp) return EPERM; switch (name) { /* call the nd6 compat_90 hook to validate the nd6-related names */ case OICMPV6CTL_ND6_DRLIST: /* FALLTHROUGH */ case OICMPV6CTL_ND6_PRLIST: MODULE_HOOK_CALL(net_inet6_nd_90_hook, (name), ENOPROTOOPT, error); if (error == 0) *oldlenp = 0; return error; case ICMPV6CTL_ND6_MAXQLEN: return 0; default: return ENOPROTOOPT; } }
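/*
 * Illustrative sketch, not part of the original nd6.c above: how a
 * solicited-node multicast group address (RFC 4291, section 2.7.1) is
 * derived from a unicast IPv6 address.  It mirrors the "llsol"
 * construction in nd6_rtrequest(): the prefix ff02::1:ff00:0/104 plus
 * the low-order 24 bits of the unicast address.  The helper name is
 * hypothetical and the snippet is a standalone userland-style example,
 * kept under #if 0 so it is never compiled here; the kernel code
 * additionally embeds the link zone with in6_setscope() before joining
 * the group.
 */
#if 0
#include <string.h>
#include <netinet/in.h>

static void
solicited_node_mcast(const struct in6_addr *unicast, struct in6_addr *mcast)
{

	memset(mcast, 0, sizeof(*mcast));
	mcast->s6_addr[0] = 0xff;	/* ff02::/16: link-local scope multicast */
	mcast->s6_addr[1] = 0x02;
	mcast->s6_addr[11] = 0x01;	/* ff02::1:ff00:0/104 prefix */
	mcast->s6_addr[12] = 0xff;
	/* copy the low-order 24 bits of the unicast address */
	mcast->s6_addr[13] = unicast->s6_addr[13];
	mcast->s6_addr[14] = unicast->s6_addr[14];
	mcast->s6_addr[15] = unicast->s6_addr[15];
}
#endif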
/* $NetBSD: ipsec.c,v 1.178 2023/01/27 09:33:43 ozaki-r Exp $ */ /* $FreeBSD: ipsec.c,v 1.2.2.2 2003/07/01 01:38:13 sam Exp $ */ /* $KAME: ipsec.c,v 1.103 2001/05/24 07:14:18 sakane Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ipsec.c,v 1.178 2023/01/27 09:33:43 ozaki-r Exp $"); /* * IPsec controller part.
*/ #if defined(_KERNEL_OPT) #include "opt_inet.h" #include "opt_ipsec.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/errno.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/sysctl.h> #include <sys/proc.h> #include <sys/kauth.h> #include <sys/cpu.h> #include <sys/kmem.h> #include <sys/pserialize.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/in_var.h> #include <netinet/udp.h> #include <netinet/udp_var.h> #include <netinet/tcp.h> #include <netinet/udp.h> #include <netinet/ip_icmp.h> #include <netinet/ip_private.h> #include <netinet/ip6.h> #ifdef INET6 #include <netinet6/ip6_var.h> #endif #include <netinet/in_pcb.h> #include <netinet/in_offload.h> #ifdef INET6 #include <netinet6/in6_pcb.h> #include <netinet/icmp6.h> #endif #include <netipsec/ipsec.h> #include <netipsec/ipsec_var.h> #include <netipsec/ipsec_private.h> #ifdef INET6 #include <netipsec/ipsec6.h> #endif #include <netipsec/ah_var.h> #include <netipsec/esp_var.h> #include <netipsec/ipcomp.h> /*XXX*/ #include <netipsec/ipcomp_var.h> #include <netipsec/key.h> #include <netipsec/keydb.h> #include <netipsec/key_debug.h> #include <netipsec/xform.h> int ipsec_used = 0; int ipsec_enabled = 1; #ifdef IPSEC_DEBUG int ipsec_debug = 1; /* * When set to 1, IPsec will send packets with the same sequence number. * This allows to verify if the other side has proper replay attacks detection. */ int ipsec_replay = 0; /* * When set 1, IPsec will send packets with corrupted HMAC. * This allows to verify if the other side properly detects modified packets. */ int ipsec_integrity = 0; #else int ipsec_debug = 0; #endif percpu_t *ipsecstat_percpu; int ip4_ah_offsetmask = 0; /* maybe IP_DF? */ int ip4_ipsec_dfbit = 2; /* DF bit on encap. 0: clear 1: set 2: copy */ int ip4_esp_trans_deflev = IPSEC_LEVEL_USE; int ip4_esp_net_deflev = IPSEC_LEVEL_USE; int ip4_ah_trans_deflev = IPSEC_LEVEL_USE; int ip4_ah_net_deflev = IPSEC_LEVEL_USE; struct secpolicy ip4_def_policy; int ip4_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ u_int ipsec_spdgen = 1; /* SPD generation # */ static struct secpolicy ipsec_dummy_sp __read_mostly = { .state = IPSEC_SPSTATE_ALIVE, /* If ENTRUST, the dummy SP never be used. See ipsec_getpolicybysock. 
*/ .policy = IPSEC_POLICY_ENTRUST, }; static struct secpolicy *ipsec_checkpcbcache(struct mbuf *, struct inpcbpolicy *, int); static int ipsec_fillpcbcache(struct inpcbpolicy *, struct mbuf *, struct secpolicy *, int); static int ipsec_invalpcbcache(struct inpcbpolicy *, int); /* * Crypto support requirements: * * 1 require hardware support * -1 require software support * 0 take anything */ int crypto_support = 0; static struct secpolicy *ipsec_getpolicybysock(struct mbuf *, u_int, struct inpcb *, int *); #ifdef INET6 int ip6_esp_trans_deflev = IPSEC_LEVEL_USE; int ip6_esp_net_deflev = IPSEC_LEVEL_USE; int ip6_ah_trans_deflev = IPSEC_LEVEL_USE; int ip6_ah_net_deflev = IPSEC_LEVEL_USE; struct secpolicy ip6_def_policy; int ip6_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ #endif static int ipsec_setspidx_inpcb(struct mbuf *, struct inpcb *); static int ipsec_setspidx(struct mbuf *, struct secpolicyindex *, int, int); static void ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *, int); static int ipsec4_setspidx_ipaddr(struct mbuf *, struct secpolicyindex *); #ifdef INET6 static void ipsec6_get_ulp(struct mbuf *m, struct secpolicyindex *, int); static int ipsec6_setspidx_ipaddr(struct mbuf *, struct secpolicyindex *); #endif static void ipsec_delpcbpolicy(struct inpcbpolicy *); static void ipsec_destroy_policy(struct secpolicy *); static int ipsec_sp_reject(const struct secpolicy *, const struct mbuf *); static void vshiftl(unsigned char *, int, int); static size_t ipsec_sp_hdrsiz(const struct secpolicy *, const struct mbuf *); /* * Try to validate and use cached policy on a PCB. */ static struct secpolicy * ipsec_checkpcbcache(struct mbuf *m, struct inpcbpolicy *pcbsp, int dir) { struct secpolicyindex spidx; struct secpolicy *sp = NULL; int s; KASSERT(IPSEC_DIR_IS_VALID(dir)); KASSERT(pcbsp != NULL); KASSERT(dir < __arraycount(pcbsp->sp_cache)); KASSERT(inp_locked(pcbsp->sp_inp)); /* * Checking the generation and sp->state and taking a reference to an SP * must be in a critical section of pserialize. See key_unlink_sp. */ s = pserialize_read_enter(); /* SPD table change invalidate all the caches. */ if (ipsec_spdgen != pcbsp->sp_cache[dir].cachegen) { ipsec_invalpcbcache(pcbsp, dir); goto out; } sp = pcbsp->sp_cache[dir].cachesp; if (sp == NULL) goto out; if (sp->state != IPSEC_SPSTATE_ALIVE) { sp = NULL; ipsec_invalpcbcache(pcbsp, dir); goto out; } if ((pcbsp->sp_cacheflags & IPSEC_PCBSP_CONNECTED) == 0) { /* NB: assume ipsec_setspidx never sleep */ if (ipsec_setspidx(m, &spidx, dir, 1) != 0) { sp = NULL; goto out; } /* * We have to make an exact match here since the cached rule * might have lower priority than a rule that would otherwise * have matched the packet. */ if (memcmp(&pcbsp->sp_cache[dir].cacheidx, &spidx, sizeof(spidx))) { sp = NULL; goto out; } } else { /* * The pcb is connected, and the L4 code is sure that: * - outgoing side uses inp_[lf]addr * - incoming side looks up policy after inpcb lookup * and address pair is know to be stable. We do not need * to generate spidx again, nor check the address match again. * * For IPv4/v6 SOCK_STREAM sockets, this assumptions holds * and there are calls to ipsec_pcbconn() from inpcb_connect(). 
*/ } key_sp_touch(sp); KEY_SP_REF(sp); KEYDEBUG_PRINTF(KEYDEBUG_IPSEC_STAMP, "DP cause refcnt++:%d SP:%p\n", key_sp_refcnt(sp), pcbsp->sp_cache[dir].cachesp); out: pserialize_read_exit(s); return sp; } static int ipsec_fillpcbcache(struct inpcbpolicy *pcbsp, struct mbuf *m, struct secpolicy *sp, int dir) { KASSERT(IPSEC_DIR_IS_INOROUT(dir)); KASSERT(dir < __arraycount(pcbsp->sp_cache)); KASSERT(inp_locked(pcbsp->sp_inp)); pcbsp->sp_cache[dir].cachesp = NULL; pcbsp->sp_cache[dir].cachehint = IPSEC_PCBHINT_UNKNOWN; if (ipsec_setspidx(m, &pcbsp->sp_cache[dir].cacheidx, dir, 1) != 0) { return EINVAL; } pcbsp->sp_cache[dir].cachesp = sp; if (pcbsp->sp_cache[dir].cachesp) { /* * If the PCB is connected, we can remember a hint to * possibly short-circuit IPsec processing in other places. */ if (pcbsp->sp_cacheflags & IPSEC_PCBSP_CONNECTED) { switch (pcbsp->sp_cache[dir].cachesp->policy) { case IPSEC_POLICY_NONE: case IPSEC_POLICY_BYPASS: pcbsp->sp_cache[dir].cachehint = IPSEC_PCBHINT_NO; break; default: pcbsp->sp_cache[dir].cachehint = IPSEC_PCBHINT_YES; } } } pcbsp->sp_cache[dir].cachegen = ipsec_spdgen; return 0; } static int ipsec_invalpcbcache(struct inpcbpolicy *pcbsp, int dir) { int i; KASSERT(inp_locked(pcbsp->sp_inp)); for (i = IPSEC_DIR_INBOUND; i <= IPSEC_DIR_OUTBOUND; i++) { if (dir != IPSEC_DIR_ANY && i != dir) continue; pcbsp->sp_cache[i].cachesp = NULL; pcbsp->sp_cache[i].cachehint = IPSEC_PCBHINT_UNKNOWN; pcbsp->sp_cache[i].cachegen = 0; memset(&pcbsp->sp_cache[i].cacheidx, 0, sizeof(pcbsp->sp_cache[i].cacheidx)); } return 0; } void ipsec_pcbconn(struct inpcbpolicy *pcbsp) { KASSERT(inp_locked(pcbsp->sp_inp)); pcbsp->sp_cacheflags |= IPSEC_PCBSP_CONNECTED; ipsec_invalpcbcache(pcbsp, IPSEC_DIR_ANY); } void ipsec_pcbdisconn(struct inpcbpolicy *pcbsp) { KASSERT(inp_locked(pcbsp->sp_inp)); pcbsp->sp_cacheflags &= ~IPSEC_PCBSP_CONNECTED; ipsec_invalpcbcache(pcbsp, IPSEC_DIR_ANY); } void ipsec_invalpcbcacheall(void) { if (ipsec_spdgen == UINT_MAX) ipsec_spdgen = 1; else ipsec_spdgen++; } /* * Return a held reference to the default SP. */ static struct secpolicy * key_get_default_sp(int af, const char *where, int tag) { struct secpolicy *sp; KEYDEBUG_PRINTF(KEYDEBUG_IPSEC_STAMP, "DP from %s:%u\n", where, tag); switch(af) { case AF_INET: sp = &ip4_def_policy; break; #ifdef INET6 case AF_INET6: sp = &ip6_def_policy; break; #endif default: KEYDEBUG_PRINTF(KEYDEBUG_IPSEC_STAMP, "unexpected protocol family %u\n", af); return NULL; } if (sp->policy != IPSEC_POLICY_DISCARD && sp->policy != IPSEC_POLICY_NONE) { IPSECLOG(LOG_INFO, "fixed system default policy: %d->%d\n", sp->policy, IPSEC_POLICY_NONE); sp->policy = IPSEC_POLICY_NONE; } KEY_SP_REF(sp); KEYDEBUG_PRINTF(KEYDEBUG_IPSEC_STAMP, "DP returns SP:%p (%u)\n", sp, key_sp_refcnt(sp)); return sp; } #define KEY_GET_DEFAULT_SP(af) \ key_get_default_sp((af), __func__, __LINE__) /* * For OUTBOUND packet having a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: NULL: no appropriate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occurred. * others: a pointer to SP * * NOTE: IPv6 mapped address concern is implemented here. 
*/ static struct secpolicy * ipsec_getpolicybysock(struct mbuf *m, u_int dir, struct inpcb *inp, int *error) { struct inpcbpolicy *pcbsp = NULL; struct secpolicy *currsp = NULL; /* policy on socket */ struct secpolicy *sp; int af; KASSERT(m != NULL); KASSERT(inp != NULL); KASSERT(error != NULL); KASSERTMSG(IPSEC_DIR_IS_INOROUT(dir), "invalid direction %u", dir); KASSERT(inp->inp_socket != NULL); KASSERT(inp_locked(inp)); /* XXX FIXME inpcb vs socket*/ af = inp->inp_af; KASSERTMSG(af == AF_INET || af == AF_INET6, "unexpected protocol family %u", af); KASSERT(inp->inp_sp != NULL); /* If we have a cached entry, and if it is still valid, use it. */ IPSEC_STATINC(IPSEC_STAT_SPDCACHELOOKUP); currsp = ipsec_checkpcbcache(m, inp->inp_sp, dir); if (currsp) { *error = 0; return currsp; } IPSEC_STATINC(IPSEC_STAT_SPDCACHEMISS); switch (af) { case AF_INET: #if defined(INET6) case AF_INET6: #endif *error = ipsec_setspidx_inpcb(m, inp); pcbsp = inp->inp_sp; break; default: *error = EPFNOSUPPORT; break; } if (*error) return NULL; KASSERT(pcbsp != NULL); switch (dir) { case IPSEC_DIR_INBOUND: currsp = pcbsp->sp_in; break; case IPSEC_DIR_OUTBOUND: currsp = pcbsp->sp_out; break; } KASSERT(currsp != NULL); if (pcbsp->priv) { /* when privileged socket */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_IPSEC: KEY_SP_REF(currsp); sp = currsp; break; case IPSEC_POLICY_ENTRUST: /* look for a policy in SPD */ if (key_havesp(dir)) sp = KEY_LOOKUP_SP_BYSPIDX(&currsp->spidx, dir); else sp = NULL; if (sp == NULL) /* no SP found */ sp = KEY_GET_DEFAULT_SP(af); break; default: IPSECLOG(LOG_ERR, "Invalid policy for PCB %d\n", currsp->policy); *error = EINVAL; return NULL; } } else { /* unpriv, SPD has policy */ if (key_havesp(dir)) sp = KEY_LOOKUP_SP_BYSPIDX(&currsp->spidx, dir); else sp = NULL; if (sp == NULL) { /* no SP found */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: IPSECLOG(LOG_ERR, "Illegal policy for " "non-priviliged defined %d\n", currsp->policy); *error = EINVAL; return NULL; case IPSEC_POLICY_ENTRUST: sp = KEY_GET_DEFAULT_SP(af); break; case IPSEC_POLICY_IPSEC: KEY_SP_REF(currsp); sp = currsp; break; default: IPSECLOG(LOG_ERR, "Invalid policy for " "PCB %d\n", currsp->policy); *error = EINVAL; return NULL; } } } KASSERTMSG(sp != NULL, "null SP (priv %u policy %u", pcbsp->priv, currsp->policy); KEYDEBUG_PRINTF(KEYDEBUG_IPSEC_STAMP, "DP (priv %u policy %u) allocates SP:%p (refcnt %u)\n", pcbsp->priv, currsp->policy, sp, key_sp_refcnt(sp)); ipsec_fillpcbcache(pcbsp, m, sp, dir); return sp; } /* * For FORWARDING packet or OUTBOUND without a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: positive: a pointer to the entry for security policy leaf matched. * NULL: no appropriate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occurred. */ static struct secpolicy * ipsec_getpolicybyaddr(struct mbuf *m, u_int dir, int flag, int *error) { struct secpolicyindex spidx; struct secpolicy *sp; KASSERT(m != NULL); KASSERT(error != NULL); KASSERTMSG(IPSEC_DIR_IS_INOROUT(dir), "invalid direction %u", dir); sp = NULL; /* Make an index to look for a policy. 
*/ *error = ipsec_setspidx(m, &spidx, dir, 1); if (*error != 0) { IPSECLOG(LOG_DEBUG, "setpidx failed, dir %u flag %u\n", dir, flag); memset(&spidx, 0, sizeof(spidx)); return NULL; } spidx.dir = dir; if (key_havesp(dir)) { sp = KEY_LOOKUP_SP_BYSPIDX(&spidx, dir); } if (sp == NULL) { /* no SP found, use system default */ sp = KEY_GET_DEFAULT_SP(spidx.dst.sa.sa_family); } KASSERT(sp != NULL); return sp; } static struct secpolicy * ipsec_checkpolicy(struct mbuf *m, u_int dir, u_int flag, int *error, struct inpcb *inp) { struct secpolicy *sp; *error = 0; if (inp == NULL) { sp = ipsec_getpolicybyaddr(m, dir, flag, error); } else { KASSERT(inp->inp_socket != NULL); sp = ipsec_getpolicybysock(m, dir, inp, error); } if (sp == NULL) { KASSERTMSG(*error != 0, "getpolicy failed w/o error"); IPSEC_STATINC(IPSEC_STAT_OUT_INVAL); return NULL; } KASSERTMSG(*error == 0, "sp w/ error set to %u", *error); switch (sp->policy) { case IPSEC_POLICY_ENTRUST: default: printf("%s: invalid policy %u\n", __func__, sp->policy); /* fall thru... */ case IPSEC_POLICY_DISCARD: IPSEC_STATINC(IPSEC_STAT_OUT_POLVIO); *error = -EINVAL; /* packet is discarded by caller */ break; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: KEY_SP_UNREF(&sp); sp = NULL; /* NB: force NULL result */ break; case IPSEC_POLICY_IPSEC: KASSERT(sp->req != NULL); break; } if (*error != 0) { KEY_SP_UNREF(&sp); sp = NULL; IPSECLOG(LOG_DEBUG, "done, error %d\n", *error); } return sp; } int ipsec4_output(struct mbuf *m, struct inpcb *inp, int flags, u_long *mtu, bool *natt_frag, bool *done, bool *count_drop) { struct secpolicy *sp = NULL; u_long _mtu = 0; int error; /* * Check the security policy (SP) for the packet and, if required, * do IPsec-related processing. There are two cases here; the first * time a packet is sent through it will be untagged and handled by * ipsec_checkpolicy(). If the packet is resubmitted to ip_output * (e.g. after AH, ESP, etc. processing), there will be a tag to * bypass the lookup and related policy checking. */ if (ipsec_outdone(m)) { return 0; } if (inp && ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND)) { return 0; } sp = ipsec_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags, &error, inp); /* * There are four return cases: * sp != NULL apply IPsec policy * sp == NULL, error == 0 no IPsec handling needed * sp == NULL, error == -EINVAL discard packet w/o error * sp == NULL, error != 0 discard packet, report error */ if (sp == NULL) { if (error) { /* * Hack: -EINVAL is used to signal that a packet * should be silently discarded. This is typically * because we asked key management for an SA and * it was delayed (e.g. kicked up to IKE). */ if (error == -EINVAL) error = 0; m_freem(m); *done = true; *count_drop = true; return error; } /* No IPsec processing for this packet. */ return 0; } /* * Do delayed checksums now because we send before * this is done in the normal processing path. */ if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { in_undefer_cksum_tcpudp(m); m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4); } error = ipsec4_process_packet(m, sp->req, &_mtu); if (error == 0 && _mtu != 0) { /* * NAT-T ESP fragmentation: do not do IPSec processing * now, we will do it on each fragmented packet. */ *mtu = _mtu; *natt_frag = true; KEY_SP_UNREF(&sp); return 0; } /* * Preserve KAME behaviour: ENOENT can be returned * when an SA acquire is in progress. Don't propagate * this to user-level; it confuses applications. * * XXX this will go away when the SADB is redone. 
*/ if (error == ENOENT) error = 0; KEY_SP_UNREF(&sp); *done = true; return error; } int ipsec_ip_input_checkpolicy(struct mbuf *m, bool forward) { struct secpolicy *sp; int error; error = ipsec_in_reject(m, NULL); if (error) { return EINVAL; } if (!forward || !(m->m_flags & M_CANFASTFWD)) { return 0; } /* * Peek at the outbound SP for this packet to determine if * it is a Fast Forward candidate. */ sp = ipsec_checkpolicy(m, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &error, NULL); if (sp != NULL) { m->m_flags &= ~M_CANFASTFWD; KEY_SP_UNREF(&sp); } return 0; } /* * If the packet is routed over IPsec tunnel, tell the originator the * tunnel MTU. * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz * * XXX: Quick hack!!! * * XXX: And what if the MTU goes negative? */ void ipsec_mtu(struct mbuf *m, int *destmtu) { struct secpolicy *sp; size_t ipsechdr; int error; sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &error); if (sp == NULL) { return; } /* Count IPsec header size. */ ipsechdr = ipsec_sp_hdrsiz(sp, m); /* * Find the correct route for outer IP header, compute tunnel MTU. */ if (sp->req) { struct secasvar *sav; sav = ipsec_lookup_sa(sp->req, m); if (sav != NULL) { struct route *ro; struct rtentry *rt; ro = &sav->sah->sa_route; rt = rtcache_validate(ro); if (rt && rt->rt_ifp) { *destmtu = rt->rt_rmx.rmx_mtu ? rt->rt_rmx.rmx_mtu : rt->rt_ifp->if_mtu; *destmtu -= ipsechdr; } rtcache_unref(rt, ro); KEY_SA_UNREF(&sav); } } KEY_SP_UNREF(&sp); } static int ipsec_setspidx_inpcb(struct mbuf *m, struct inpcb *inp) { int error; KASSERT(inp != NULL); KASSERT(inp->inp_sp != NULL); KASSERT(inp->inp_sp->sp_out != NULL); KASSERT(inp->inp_sp->sp_in != NULL); error = ipsec_setspidx(m, &inp->inp_sp->sp_in->spidx, IPSEC_DIR_INBOUND, 1); if (error == 0) { inp->inp_sp->sp_out->spidx = inp->inp_sp->sp_in->spidx; inp->inp_sp->sp_out->spidx.dir = IPSEC_DIR_OUTBOUND; } else { memset(&inp->inp_sp->sp_in->spidx, 0, sizeof(inp->inp_sp->sp_in->spidx)); memset(&inp->inp_sp->sp_out->spidx, 0, sizeof(inp->inp_sp->sp_out->spidx)); } return error; } /* * configure security policy index (src/dst/proto/sport/dport) * by looking at the content of mbuf. * the caller is responsible for error recovery (like clearing up spidx). 
*/ static int ipsec_setspidx(struct mbuf *m, struct secpolicyindex *spidx, int dir, int needport) { struct ip *ip = NULL; struct ip ipbuf; u_int v; int error; KASSERT(m != NULL); M_VERIFY_PACKET(m); if (m->m_pkthdr.len < sizeof(struct ip)) { KEYDEBUG_PRINTF(KEYDEBUG_IPSEC_DUMP, "pkthdr.len(%d) < sizeof(struct ip), ignored.\n", m->m_pkthdr.len); return EINVAL; } memset(spidx, 0, sizeof(*spidx)); spidx->dir = dir; if (m->m_len >= sizeof(*ip)) { ip = mtod(m, struct ip *); } else { m_copydata(m, 0, sizeof(ipbuf), &ipbuf); ip = &ipbuf; } v = ip->ip_v; switch (v) { case 4: error = ipsec4_setspidx_ipaddr(m, spidx); if (error) return error; ipsec4_get_ulp(m, spidx, needport); return 0; #ifdef INET6 case 6: if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) { KEYDEBUG_PRINTF(KEYDEBUG_IPSEC_DUMP, "pkthdr.len(%d) < sizeof(struct ip6_hdr), " "ignored.\n", m->m_pkthdr.len); return EINVAL; } error = ipsec6_setspidx_ipaddr(m, spidx); if (error) return error; ipsec6_get_ulp(m, spidx, needport); return 0; #endif default: KEYDEBUG_PRINTF(KEYDEBUG_IPSEC_DUMP, "unknown IP version %u, ignored.\n", v); return EINVAL; } } static void ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport) { u_int8_t nxt; int off; KASSERT(m != NULL); KASSERTMSG(m->m_pkthdr.len >= sizeof(struct ip), "packet too short"); /* NB: ip_input() flips it into host endian XXX need more checking */ if (m->m_len >= sizeof(struct ip)) { struct ip *ip = mtod(m, struct ip *); if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) goto done; off = ip->ip_hl << 2; nxt = ip->ip_p; } else { struct ip ih; m_copydata(m, 0, sizeof(struct ip), &ih); if (ih.ip_off & htons(IP_MF | IP_OFFMASK)) goto done; off = ih.ip_hl << 2; nxt = ih.ip_p; } while (off < m->m_pkthdr.len) { struct ip6_ext ip6e; struct tcphdr th; struct udphdr uh; struct icmp icmph; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) goto done_proto; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) goto done; m_copydata(m, off, sizeof(th), &th); spidx->src.sin.sin_port = th.th_sport; spidx->dst.sin.sin_port = th.th_dport; return; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) goto done_proto; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) goto done; m_copydata(m, off, sizeof(uh), &uh); spidx->src.sin.sin_port = uh.uh_sport; spidx->dst.sin.sin_port = uh.uh_dport; return; case IPPROTO_AH: if (off + sizeof(ip6e) > m->m_pkthdr.len) goto done; /* XXX sigh, this works but is totally bogus */ m_copydata(m, off, sizeof(ip6e), &ip6e); off += (ip6e.ip6e_len + 2) << 2; nxt = ip6e.ip6e_nxt; break; case IPPROTO_ICMP: spidx->ul_proto = nxt; if (off + sizeof(struct icmp) > m->m_pkthdr.len) goto done; m_copydata(m, off, sizeof(icmph), &icmph); ((struct sockaddr_in *)&spidx->src)->sin_port = htons((uint16_t)icmph.icmp_type); ((struct sockaddr_in *)&spidx->dst)->sin_port = htons((uint16_t)icmph.icmp_code); return; default: /* XXX intermediate headers??? 
*/ spidx->ul_proto = nxt; goto done_proto; } } done: spidx->ul_proto = IPSEC_ULPROTO_ANY; done_proto: spidx->src.sin.sin_port = IPSEC_PORT_ANY; spidx->dst.sin.sin_port = IPSEC_PORT_ANY; } static int ipsec4_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx) { static const struct sockaddr_in template = { sizeof(struct sockaddr_in), AF_INET, 0, { 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 } }; spidx->src.sin = template; spidx->dst.sin = template; if (m->m_len < sizeof(struct ip)) { m_copydata(m, offsetof(struct ip, ip_src), sizeof(struct in_addr), &spidx->src.sin.sin_addr); m_copydata(m, offsetof(struct ip, ip_dst), sizeof(struct in_addr), &spidx->dst.sin.sin_addr); } else { struct ip *ip = mtod(m, struct ip *); spidx->src.sin.sin_addr = ip->ip_src; spidx->dst.sin.sin_addr = ip->ip_dst; } spidx->prefs = sizeof(struct in_addr) << 3; spidx->prefd = sizeof(struct in_addr) << 3; return 0; } #ifdef INET6 static void ipsec6_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport) { int off, nxt; struct tcphdr th; struct udphdr uh; struct icmp6_hdr icmph; KASSERT(m != NULL); if (KEYDEBUG_ON(KEYDEBUG_IPSEC_DUMP)) { kdebug_mbuf(__func__, m); } /* set default */ spidx->ul_proto = IPSEC_ULPROTO_ANY; ((struct sockaddr_in6 *)&spidx->src)->sin6_port = IPSEC_PORT_ANY; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = IPSEC_PORT_ANY; nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off < 0 || m->m_pkthdr.len < off) return; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(th), &th); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = th.th_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = th.th_dport; break; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(uh), &uh); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = uh.uh_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = uh.uh_dport; break; case IPPROTO_ICMPV6: spidx->ul_proto = nxt; if (off + sizeof(struct icmp6_hdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(icmph), &icmph); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = htons((uint16_t)icmph.icmp6_type); ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = htons((uint16_t)icmph.icmp6_code); break; default: /* XXX intermediate headers??? 
*/ spidx->ul_proto = nxt; break; } } static int ipsec6_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx) { struct ip6_hdr *ip6 = NULL; struct ip6_hdr ip6buf; struct sockaddr_in6 *sin6; if (m->m_len >= sizeof(*ip6)) { ip6 = mtod(m, struct ip6_hdr *); } else { m_copydata(m, 0, sizeof(ip6buf), &ip6buf); ip6 = &ip6buf; } sin6 = (struct sockaddr_in6 *)&spidx->src; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); memcpy(&sin6->sin6_addr, &ip6->ip6_src, sizeof(ip6->ip6_src)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); } spidx->prefs = sizeof(struct in6_addr) << 3; sin6 = (struct sockaddr_in6 *)&spidx->dst; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); memcpy(&sin6->sin6_addr, &ip6->ip6_dst, sizeof(ip6->ip6_dst)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); } spidx->prefd = sizeof(struct in6_addr) << 3; return 0; } #endif static void ipsec_delpcbpolicy(struct inpcbpolicy *p) { kmem_intr_free(p, sizeof(*p)); } int ipsec_init_pcbpolicy(struct socket *so, struct inpcbpolicy **policy) { struct inpcbpolicy *new; KASSERT(so != NULL); KASSERT(policy != NULL); new = kmem_intr_zalloc(sizeof(*new), KM_NOSLEEP); if (new == NULL) { IPSECLOG(LOG_DEBUG, "No more memory.\n"); return ENOBUFS; } if (IPSEC_PRIVILEGED_SO(so)) new->priv = 1; else new->priv = 0; /* * Set dummy SPs. Actual SPs will be allocated later if needed. */ new->sp_in = &ipsec_dummy_sp; new->sp_out = &ipsec_dummy_sp; *policy = new; return 0; } static void ipsec_destroy_policy(struct secpolicy *sp) { if (sp == &ipsec_dummy_sp) { ; /* It's dummy. No need to free it. */ } else { /* * We cannot destroy here because it can be called in * softint. So mark the SP as DEAD and let the timer * destroy it. See key_timehandler_spd. */ sp->state = IPSEC_SPSTATE_DEAD; } } int ipsec_set_policy(struct inpcb *inp, const void *request, size_t len, kauth_cred_t cred) { const struct sadb_x_policy *xpl; struct secpolicy *newsp, *oldsp; struct secpolicy **policy; int error; KASSERT(!cpu_softintr_p()); KASSERT(inp != NULL); KASSERT(inp_locked(inp)); KASSERT(request != NULL); if (len < sizeof(*xpl)) return EINVAL; xpl = (const struct sadb_x_policy *)request; KASSERT(inp->inp_sp != NULL); /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: policy = &inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: policy = &inp->inp_sp->sp_out; break; default: IPSECLOG(LOG_ERR, "invalid direction=%u\n", xpl->sadb_x_policy_dir); return EINVAL; } /* sanity check. */ if (policy == NULL || *policy == NULL) return EINVAL; if (KEYDEBUG_ON(KEYDEBUG_IPSEC_DUMP)) { kdebug_sadb_xpolicy("set passed policy", request); } /* check policy type */ /* ipsec_set_policy() accepts IPSEC, ENTRUST and BYPASS. 
*/ if (xpl->sadb_x_policy_type == IPSEC_POLICY_DISCARD || xpl->sadb_x_policy_type == IPSEC_POLICY_NONE) return EINVAL; /* check privileged socket */ if (xpl->sadb_x_policy_type == IPSEC_POLICY_BYPASS) { error = kauth_authorize_network(cred, KAUTH_NETWORK_IPSEC, KAUTH_REQ_NETWORK_IPSEC_BYPASS, NULL, NULL, NULL); if (error) return error; } /* allocation new SP entry */ if ((newsp = key_msg2sp(xpl, len, &error)) == NULL) return error; key_init_sp(newsp); newsp->created = time_uptime; /* Insert the global list for SPs for sockets */ key_socksplist_add(newsp); /* clear old SP and set new SP */ oldsp = *policy; *policy = newsp; ipsec_destroy_policy(oldsp); if (KEYDEBUG_ON(KEYDEBUG_IPSEC_DUMP)) { printf("%s: new policy\n", __func__); kdebug_secpolicy(newsp); } return 0; } int ipsec_get_policy(struct inpcb *inp, const void *request, size_t len, struct mbuf **mp) { const struct sadb_x_policy *xpl; struct secpolicy *policy; /* sanity check. */ if (inp == NULL || request == NULL || mp == NULL) return EINVAL; KASSERT(inp->inp_sp != NULL); if (len < sizeof(*xpl)) return EINVAL; xpl = (const struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: policy = inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: policy = inp->inp_sp->sp_out; break; default: IPSECLOG(LOG_ERR, "invalid direction=%u\n", xpl->sadb_x_policy_dir); return EINVAL; } if (policy == NULL) return EINVAL; *mp = key_sp2msg(policy, M_NOWAIT); if (!*mp) { IPSECLOG(LOG_DEBUG, "No more memory.\n"); return ENOBUFS; } if (KEYDEBUG_ON(KEYDEBUG_IPSEC_DUMP)) { kdebug_mbuf(__func__, *mp); } return 0; } int ipsec_delete_pcbpolicy(struct inpcb *inp) { KASSERT(inp != NULL); if (inp->inp_sp == NULL) return 0; if (inp->inp_sp->sp_in != NULL) ipsec_destroy_policy(inp->inp_sp->sp_in); if (inp->inp_sp->sp_out != NULL) ipsec_destroy_policy(inp->inp_sp->sp_out); ipsec_invalpcbcache(inp->inp_sp, IPSEC_DIR_ANY); ipsec_delpcbpolicy(inp->inp_sp); inp->inp_sp = NULL; return 0; } /* * Return the current level (either IPSEC_LEVEL_USE or IPSEC_LEVEL_REQUIRE). */ u_int ipsec_get_reqlevel(const struct ipsecrequest *isr) { u_int level = 0; u_int esp_trans_deflev, esp_net_deflev; u_int ah_trans_deflev, ah_net_deflev; KASSERT(isr != NULL); KASSERT(isr->sp != NULL); KASSERTMSG( isr->sp->spidx.src.sa.sa_family == isr->sp->spidx.dst.sa.sa_family, "af family mismatch, src %u, dst %u", isr->sp->spidx.src.sa.sa_family, isr->sp->spidx.dst.sa.sa_family); /* XXX note that we have ipseclog() expanded here - code sync issue */ #define IPSEC_CHECK_DEFAULT(lev) \ (((lev) != IPSEC_LEVEL_USE && (lev) != IPSEC_LEVEL_REQUIRE \ && (lev) != IPSEC_LEVEL_UNIQUE) ? \ (ipsec_debug ? 
log(LOG_INFO, "fixed system default level " #lev \ ":%d->%d\n", (lev), IPSEC_LEVEL_REQUIRE) : (void)0), \ (lev) = IPSEC_LEVEL_REQUIRE, (lev) \ : (lev)) /* set default level */ switch (((struct sockaddr *)&isr->sp->spidx.src)->sa_family) { #ifdef INET case AF_INET: esp_trans_deflev = IPSEC_CHECK_DEFAULT(ip4_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(ip4_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(ip4_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(ip4_ah_net_deflev); break; #endif #ifdef INET6 case AF_INET6: esp_trans_deflev = IPSEC_CHECK_DEFAULT(ip6_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(ip6_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(ip6_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(ip6_ah_net_deflev); break; #endif default: panic("%s: unknown af %u", __func__, isr->sp->spidx.src.sa.sa_family); } #undef IPSEC_CHECK_DEFAULT /* set level */ switch (isr->level) { case IPSEC_LEVEL_DEFAULT: switch (isr->saidx.proto) { case IPPROTO_ESP: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = esp_net_deflev; else level = esp_trans_deflev; break; case IPPROTO_AH: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = ah_net_deflev; else level = ah_trans_deflev; break; case IPPROTO_IPCOMP: /* * we don't really care, as IPcomp document says that * we shouldn't compress small packets */ level = IPSEC_LEVEL_USE; break; default: panic("%s: Illegal protocol defined %u", __func__, isr->saidx.proto); } break; case IPSEC_LEVEL_USE: case IPSEC_LEVEL_REQUIRE: level = isr->level; break; case IPSEC_LEVEL_UNIQUE: level = IPSEC_LEVEL_REQUIRE; break; default: panic("%s: Illegal IPsec level %u", __func__, isr->level); } return level; } /* * Check security policy requirements against the actual packet contents. * * If the SP requires an IPsec packet, and the packet was neither AH nor ESP, * then kick it. */ static int ipsec_sp_reject(const struct secpolicy *sp, const struct mbuf *m) { struct ipsecrequest *isr; if (KEYDEBUG_ON(KEYDEBUG_IPSEC_DATA)) { printf("%s: using SP\n", __func__); kdebug_secpolicy(sp); } /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: return 1; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return 0; } KASSERTMSG(sp->policy == IPSEC_POLICY_IPSEC, "invalid policy %u", sp->policy); /* XXX should compare policy against ipsec header history */ for (isr = sp->req; isr != NULL; isr = isr->next) { if (ipsec_get_reqlevel(isr) != IPSEC_LEVEL_REQUIRE) continue; switch (isr->saidx.proto) { case IPPROTO_ESP: if ((m->m_flags & M_DECRYPTED) == 0) { KEYDEBUG_PRINTF(KEYDEBUG_IPSEC_DUMP, "ESP m_flags:%x\n", m->m_flags); return 1; } break; case IPPROTO_AH: if ((m->m_flags & M_AUTHIPHDR) == 0) { KEYDEBUG_PRINTF(KEYDEBUG_IPSEC_DUMP, "AH m_flags:%x\n", m->m_flags); return 1; } break; case IPPROTO_IPCOMP: /* * We don't really care, as IPcomp document * says that we shouldn't compress small * packets, IPComp policy should always be * treated as being in "use" level. */ break; } } return 0; } /* * Check security policy requirements. */ int ipsec_in_reject(struct mbuf *m, struct inpcb *inp) { struct secpolicy *sp; int error; int result; KASSERT(m != NULL); if (inp == NULL) sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, IPSEC_DIR_INBOUND, inp, &error); if (sp != NULL) { result = ipsec_sp_reject(sp, m); if (result) IPSEC_STATINC(IPSEC_STAT_IN_POLVIO); KEY_SP_UNREF(&sp); } else { result = 0; } return result; } /* * Compute the byte size to be occupied by the IPsec header. 
If it is * tunneled, it includes the size of outer IP header. */ static size_t ipsec_sp_hdrsiz(const struct secpolicy *sp, const struct mbuf *m) { struct ipsecrequest *isr; size_t siz; if (KEYDEBUG_ON(KEYDEBUG_IPSEC_DATA)) { printf("%s: using SP\n", __func__); kdebug_secpolicy(sp); } switch (sp->policy) { case IPSEC_POLICY_DISCARD: case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return 0; } KASSERTMSG(sp->policy == IPSEC_POLICY_IPSEC, "invalid policy %u", sp->policy); siz = 0; for (isr = sp->req; isr != NULL; isr = isr->next) { size_t clen = 0; struct secasvar *sav; switch (isr->saidx.proto) { case IPPROTO_ESP: sav = ipsec_lookup_sa(isr, m); if (sav != NULL) { clen = esp_hdrsiz(sav); KEY_SA_UNREF(&sav); } else clen = esp_hdrsiz(NULL); break; case IPPROTO_AH: sav = ipsec_lookup_sa(isr, m); if (sav != NULL) { clen = ah_hdrsiz(sav); KEY_SA_UNREF(&sav); } else clen = ah_hdrsiz(NULL); break; case IPPROTO_IPCOMP: clen = sizeof(struct ipcomp); break; } if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { switch (isr->saidx.dst.sa.sa_family) { case AF_INET: clen += sizeof(struct ip); break; #ifdef INET6 case AF_INET6: clen += sizeof(struct ip6_hdr); break; #endif default: IPSECLOG(LOG_ERR, "unknown AF %d in " "IPsec tunnel SA\n", ((const struct sockaddr *)&isr->saidx.dst) ->sa_family); break; } } siz += clen; } return siz; } size_t ipsec_hdrsiz(struct mbuf *m, u_int dir, struct inpcb *inp) { struct secpolicy *sp; int error; size_t size; KASSERT(m != NULL); KASSERTMSG(inp == NULL || inp->inp_socket != NULL, "socket w/o inpcb"); if (inp == NULL) sp = ipsec_getpolicybyaddr(m, dir, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, dir, inp, &error); if (sp != NULL) { size = ipsec_sp_hdrsiz(sp, m); KEYDEBUG_PRINTF(KEYDEBUG_IPSEC_DATA, "size:%zu.\n", size); KEY_SP_UNREF(&sp); } else { size = 0; } return size; } /* * Check the variable replay window. * ipsec_chkreplay() performs replay check before ICV verification. * ipsec_updatereplay() updates replay bitmap. This must be called after * ICV verification (it also performs replay check, which is usually done * beforehand). * 0 (zero) is returned if packet disallowed, 1 if packet permitted. * * based on RFC 2401. */ int ipsec_chkreplay(u_int32_t seq, const struct secasvar *sav) { const struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ KASSERT(sav != NULL); KASSERT(sav->replay != NULL); replay = sav->replay; if (replay->wsize == 0) return 1; /* no need to check replay. */ /* constant */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* sequence number of 0 is invalid */ if (seq == 0) return 0; /* first time is always okay */ if (replay->count == 0) return 1; if (seq > replay->lastseq) { /* larger sequences are okay */ return 1; } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* over range to check, i.e. too old or wrapped */ if (diff >= wsizeb) return 0; fr = frlast - diff / 8; /* this packet already seen ? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return 0; /* out of order but good */ return 1; } } /* * check replay counter whether to update or not. * OUT: 0: OK * 1: NG */ int ipsec_updatereplay(u_int32_t seq, const struct secasvar *sav) { struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ KASSERT(sav != NULL); KASSERT(sav->replay != NULL); replay = sav->replay; if (replay->wsize == 0) goto ok; /* no need to check replay. 
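A window size of zero means anti-replay is disabled for this SA, just as in ipsec_chkreplay() above.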
*/ /* constant */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* sequence number of 0 is invalid */ if (seq == 0) return 1; /* first time */ if (replay->count == 0) { replay->lastseq = seq; memset(replay->bitmap, 0, replay->wsize); (replay->bitmap)[frlast] = 1; goto ok; } if (seq > replay->lastseq) { /* seq is larger than lastseq. */ diff = seq - replay->lastseq; /* new larger sequence number */ if (diff < wsizeb) { /* In window */ /* set bit for this packet */ vshiftl(replay->bitmap, diff, replay->wsize); (replay->bitmap)[frlast] |= 1; } else { /* this packet has a "way larger" */ memset(replay->bitmap, 0, replay->wsize); (replay->bitmap)[frlast] = 1; } replay->lastseq = seq; /* larger is good */ } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* over range to check, i.e. too old or wrapped */ if (diff >= wsizeb) return 1; fr = frlast - diff / 8; /* this packet already seen ? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return 1; /* mark as seen */ (replay->bitmap)[fr] |= (1 << (diff % 8)); /* out of order but good */ } ok: if (replay->count == ~0) { char buf[IPSEC_LOGSASTRLEN]; /* set overflow flag */ replay->overflow++; /* don't increment, no more packets accepted */ if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) return 1; IPSECLOG(LOG_WARNING, "replay counter made %d cycle. %s\n", replay->overflow, ipsec_logsastr(sav, buf, sizeof(buf))); } replay->count++; return 0; } /* * shift variable length buffer to left. * IN: bitmap: pointer to the buffer * nbit: the number of to shift. * wsize: buffer size (bytes). */ static void vshiftl(unsigned char *bitmap, int nbit, int wsize) { int s, j, i; unsigned char over; for (j = 0; j < nbit; j += 8) { s = (nbit - j < 8) ? (nbit - j): 8; bitmap[0] <<= s; for (i = 1; i < wsize; i++) { over = (bitmap[i] >> (8 - s)); bitmap[i] <<= s; bitmap[i-1] |= over; } } return; } /* Return a printable string for the address. 
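The caller supplies the buffer; AF_INET and AF_INET6 addresses are printed in numeric form, and any other family yields a constant placeholder string.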
*/ const char * ipsec_address(const union sockaddr_union *sa, char *buf, size_t size) { switch (sa->sa.sa_family) { case AF_INET: in_print(buf, size, &sa->sin.sin_addr); return buf; #if INET6 case AF_INET6: in6_print(buf, size, &sa->sin6.sin6_addr); return buf; #endif default: return "(unknown address family)"; } } const char * ipsec_logsastr(const struct secasvar *sav, char *buf, size_t size) { const struct secasindex *saidx = &sav->sah->saidx; char sbuf[IPSEC_ADDRSTRLEN], dbuf[IPSEC_ADDRSTRLEN]; KASSERTMSG(saidx->src.sa.sa_family == saidx->dst.sa.sa_family, "af family mismatch, src %u, dst %u", saidx->src.sa.sa_family, saidx->dst.sa.sa_family); snprintf(buf, size, "SA(SPI=%u src=%s dst=%s)", (u_int32_t)ntohl(sav->spi), ipsec_address(&saidx->src, sbuf, sizeof(sbuf)), ipsec_address(&saidx->dst, dbuf, sizeof(dbuf))); return buf; } #ifdef INET6 struct secpolicy * ipsec6_check_policy(struct mbuf *m, struct inpcb *inp, int flags, int *needipsecp, int *errorp) { struct secpolicy *sp = NULL; int error = 0; int needipsec = 0; if (ipsec_outdone(m)) { goto skippolicycheck; } if (inp && ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND)) { goto skippolicycheck; } sp = ipsec_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags, &error, inp); /* * There are four return cases: * sp != NULL apply IPsec policy * sp == NULL, error == 0 no IPsec handling needed * sp == NULL, error == -EINVAL discard packet w/o error * sp == NULL, error != 0 discard packet, report error */ if (sp == NULL) { needipsec = 0; } else { needipsec = 1; } skippolicycheck: *errorp = error; *needipsecp = needipsec; return sp; } /* * calculate UDP checksum for UDP encapsulated ESP for IPv6. * * RFC2460(Internet Protocol, Version 6 Specification) says: * * IPv6 receivers MUST discard UDP packets with a zero checksum. * * There is more relaxed specification RFC6935(IPv6 and UDP Checksums for * Tunneled Packets). The document allows zero checksum. It's too * late to publish, there are a lot of interoperability problems... */ void ipsec6_udp_cksum(struct mbuf *m) { struct ip6_hdr *ip6; uint16_t plen, uh_sum; int off; /* must called after m_pullup() */ KASSERT(m->m_len >= sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); KASSERT(ip6->ip6_nxt == IPPROTO_UDP); /* ip6->ip6_plen can not be updated before ip6_output() */ plen = m->m_pkthdr.len - sizeof(*ip6); KASSERT(plen >= sizeof(struct udphdr)); uh_sum = in6_cksum(m, IPPROTO_UDP, sizeof(*ip6), plen); if (uh_sum == 0) uh_sum = 0xffff; off = sizeof(*ip6) + offsetof(struct udphdr, uh_sum); m_copyback(m, off, sizeof(uh_sum), (void *)&uh_sum); } #endif /* INET6 */ /* * ----------------------------------------------------------------------------- */ /* XXX this stuff doesn't belong here... */ static struct xformsw *xforms = NULL; /* * Register a transform; typically at system startup. */ void xform_register(struct xformsw *xsp) { xsp->xf_next = xforms; xforms = xsp; } /* * Initialize transform support in an sav. */ int xform_init(struct secasvar *sav, int xftype) { struct xformsw *xsp; if (sav->tdb_xform != NULL) /* previously initialized */ return 0; for (xsp = xforms; xsp; xsp = xsp->xf_next) if (xsp->xf_type == xftype) return (*xsp->xf_init)(sav, xsp); IPSECLOG(LOG_DEBUG, "no match for xform type %d\n", xftype); return EINVAL; } /* * XXXJRT This should be done as a protosw init call. 
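* Until that happens, ipsec_attach() is invoked once at boot to initialize output processing, allocate the per-cpu statistics, create the sysctl nodes, and attach the individual transforms (AH, ESP, IPcomp, IPIP encapsulation via ipe4, and TCP signatures when configured).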
*/ void ipsec_attach(void) { ipsec_output_init(); ipsecstat_percpu = percpu_alloc(sizeof(uint64_t) * IPSEC_NSTATS); sysctl_net_inet_ipsec_setup(NULL); #ifdef INET6 sysctl_net_inet6_ipsec6_setup(NULL); #endif ah_attach(); esp_attach(); ipcomp_attach(); ipe4_attach(); #ifdef TCP_SIGNATURE tcpsignature_attach(); #endif }
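/*
 * Illustrative sketch (an assumption for exposition, not code from this
 * file): a transform module built on xform_register()/xform_init() above
 * describes itself with a struct xformsw and registers it at attach time,
 * after which xform_init() can bind it to an SA by matching xf_type.
 * Only xf_type, xf_init, xf_next and sav->tdb_xform are visible in this
 * file; the xf_name field and the XF_ESP constant are assumptions here.
 *
 *	static int
 *	example_init(struct secasvar *sav, const struct xformsw *xsp)
 *	{
 *		sav->tdb_xform = xsp;	// remember who handles this SA
 *		return 0;
 *	}
 *
 *	static struct xformsw example_xformsw = {
 *		.xf_type = XF_ESP,	// hypothetical: reuse the ESP slot
 *		.xf_name = "example",
 *		.xf_init = example_init,
 *	};
 *
 *	void
 *	example_attach(void)
 *	{
 *		xform_register(&example_xformsw);
 *	}
 */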
/* $NetBSD: st.c,v 1.243 2022/02/23 21:54:41 andvar Exp $ */ /*- * Copyright (c) 1998, 2004 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Originally written by Julian Elischer (julian@tfs.com) * for TRW Financial Systems for use under the MACH(2.5) operating system. * * TRW Financial Systems, in accordance with their agreement with Carnegie * Mellon University, makes this software available to CMU to distribute * or use in any manner that they see fit as long as this message is kept with * the software. For this reason TFS also grants any other persons or * organisations permission to use or modify this software. * * TFS supplies this software to be publicly redistributed * on the understanding that TFS is not responsible for the correct * functioning of this software in any circumstances. * * Ported to run under 386BSD by Julian Elischer (julian@tfs.com) Sept 1992 * major changes by Julian Elischer (julian@jules.dialix.oz.au) May 1993 * * A lot of rewhacking done by mjacob (mjacob@nas.nasa.gov). 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: st.c,v 1.243 2022/02/23 21:54:41 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_scsi.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/fcntl.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/malloc.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/proc.h> #include <sys/mtio.h> #include <sys/device.h> #include <sys/conf.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/iostat.h> #include <sys/sysctl.h> #include <dev/scsipi/scsi_spc.h> #include <dev/scsipi/scsipi_all.h> #include <dev/scsipi/scsi_all.h> #include <dev/scsipi/scsi_tape.h> #include <dev/scsipi/scsipiconf.h> #include <dev/scsipi/scsipi_base.h> #include <dev/scsipi/stvar.h> /* Defines for device specific stuff */ #define DEF_FIXED_BSIZE 512 #define STMODE(z) ( minor(z) & 0x03) #define STDSTY(z) ((minor(z) >> 2) & 0x03) #define STUNIT(z) ((minor(z) >> 4) ) #define STNMINOR 16 #define NORMAL_MODE 0 #define NOREW_MODE 1 #define EJECT_MODE 2 #define CTRL_MODE 3 #ifndef ST_MOUNT_DELAY #define ST_MOUNT_DELAY 0 #endif static dev_type_open(stopen); static dev_type_close(stclose); static dev_type_read(stread); static dev_type_write(stwrite); static dev_type_ioctl(stioctl); static dev_type_strategy(ststrategy); static dev_type_dump(stdump); const struct bdevsw st_bdevsw = { .d_open = stopen, .d_close = stclose, .d_strategy = ststrategy, .d_ioctl = stioctl, .d_dump = stdump, .d_psize = nosize, .d_discard = nodiscard, .d_flag = D_TAPE | D_MPSAFE }; const struct cdevsw st_cdevsw = { .d_open = stopen, .d_close = stclose, .d_read = stread, .d_write = stwrite, .d_ioctl = stioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_TAPE | D_MPSAFE }; /* * Define various devices that we know mis-behave in some way, * and note how they are bad, so we can correct for them */ static const struct st_quirk_inquiry_pattern st_quirk_patterns[] = { {{T_SEQUENTIAL, T_REMOV, " ", " ", " "}, {0, 0, { {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */ {ST_Q_FORCE_BLKSIZE, 512, QIC_24}, /* minor 4-7 */ {ST_Q_FORCE_BLKSIZE, 0, HALFINCH_1600}, /* minor 8-11 */ {ST_Q_FORCE_BLKSIZE, 0, HALFINCH_6250} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "TANDBERG", " TDC 3600 ", ""}, {0, 12, { {0, 0, 0}, /* minor 0-3 */ {ST_Q_FORCE_BLKSIZE, 0, QIC_525}, /* minor 4-7 */ {0, 0, QIC_150}, /* minor 8-11 */ {0, 0, QIC_120} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "TANDBERG", " TDC 3800 ", ""}, {0, 0, { {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */ {0, 0, QIC_525}, /* minor 4-7 */ {0, 0, QIC_150}, /* minor 8-11 */ {0, 0, QIC_120} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "TANDBERG", " SLR5 4/8GB ", ""}, {0, 0, { {ST_Q_FORCE_BLKSIZE, 1024, 0}, /* minor 0-3 */ {0, 0, 0}, /* minor 4-7 */ {0, 0, 0}, /* minor 8-11 */ {0, 0, 0} /* minor 12-15 */ }}}, /* * lacking a manual for the 4200, it's not clear what the * specific density codes should be- the device is a 2.5GB * capable QIC drive, those density codes aren't readily * available. The 'default' will just have to do. */ {{T_SEQUENTIAL, T_REMOV, "TANDBERG", " TDC 4200 ", ""}, {0, 0, { {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */ {0, 0, QIC_525}, /* minor 4-7 */ {0, 0, QIC_150}, /* minor 8-11 */ {0, 0, QIC_120} /* minor 12-15 */ }}}, /* * At least -005 and -007 need this. I'll assume they all do unless I * hear otherwise. 
- mycroft, 31MAR1994 */ {{T_SEQUENTIAL, T_REMOV, "ARCHIVE ", "VIPER 2525 25462", ""}, {0, 0, { {ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */ {ST_Q_SENSE_HELP, 0, QIC_525}, /* minor 4-7 */ {0, 0, QIC_150}, /* minor 8-11 */ {0, 0, QIC_120} /* minor 12-15 */ }}}, /* * One user reports that this works for his tape drive. It probably * needs more work. - mycroft, 09APR1994 */ {{T_SEQUENTIAL, T_REMOV, "SANKYO ", "CP525 ", ""}, {0, 0, { {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */ {ST_Q_FORCE_BLKSIZE, 512, QIC_525}, /* minor 4-7 */ {0, 0, QIC_150}, /* minor 8-11 */ {0, 0, QIC_120} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "ANRITSU ", "DMT780 ", ""}, {0, 0, { {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */ {ST_Q_FORCE_BLKSIZE, 512, QIC_525}, /* minor 4-7 */ {0, 0, QIC_150}, /* minor 8-11 */ {0, 0, QIC_120} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "ARCHIVE ", "VIPER 150 21247", ""}, {ST_Q_ERASE_NOIMM, 12, { {ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */ {0, 0, QIC_150}, /* minor 4-7 */ {0, 0, QIC_120}, /* minor 8-11 */ {0, 0, QIC_24} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "ARCHIVE ", "VIPER 150 21531", ""}, {ST_Q_ERASE_NOIMM, 12, { {ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */ {0, 0, QIC_150}, /* minor 4-7 */ {0, 0, QIC_120}, /* minor 8-11 */ {0, 0, QIC_24} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "WANGTEK ", "5099ES SCSI", ""}, {0, 0, { {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */ {0, 0, QIC_11}, /* minor 4-7 */ {0, 0, QIC_24}, /* minor 8-11 */ {0, 0, QIC_24} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "WANGTEK ", "5150ES SCSI", ""}, {0, 0, { {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */ {0, 0, QIC_24}, /* minor 4-7 */ {0, 0, QIC_120}, /* minor 8-11 */ {0, 0, QIC_150} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "WANGTEK ", "5525ES SCSI REV7", ""}, {0, 0, { {0, 0, 0}, /* minor 0-3 */ {ST_Q_BLKSIZE, 0, QIC_525}, /* minor 4-7 */ {0, 0, QIC_150}, /* minor 8-11 */ {0, 0, QIC_120} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "WangDAT ", "Model 1300 ", ""}, {0, 0, { {0, 0, 0}, /* minor 0-3 */ {ST_Q_FORCE_BLKSIZE, 512, DDS}, /* minor 4-7 */ {ST_Q_FORCE_BLKSIZE, 1024, DDS}, /* minor 8-11 */ {ST_Q_FORCE_BLKSIZE, 0, DDS} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "EXABYTE ", "EXB-8200 ", "263H"}, {0, 5, { {0, 0, 0}, /* minor 0-3 */ {0, 0, 0}, /* minor 4-7 */ {0, 0, 0}, /* minor 8-11 */ {0, 0, 0} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "STK", "9490", ""}, {ST_Q_FORCE_BLKSIZE, 0, { {0, 0, 0}, /* minor 0-3 */ {0, 0, 0}, /* minor 4-7 */ {0, 0, 0}, /* minor 8-11 */ {0, 0, 0} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "STK", "SD-3", ""}, {ST_Q_FORCE_BLKSIZE, 0, { {0, 0, 0}, /* minor 0-3 */ {0, 0, 0}, /* minor 4-7 */ {0, 0, 0}, /* minor 8-11 */ {0, 0, 0} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "IBM", "03590", ""}, {ST_Q_IGNORE_LOADS, 0, { {0, 0, 0}, /* minor 0-3 */ {0, 0, 0}, /* minor 4-7 */ {0, 0, 0}, /* minor 8-11 */ {0, 0, 0} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "HP ", "T4000s ", ""}, {ST_Q_UNIMODAL, 0, { {0, 0, QIC_3095}, /* minor 0-3 */ {0, 0, QIC_3095}, /* minor 4-7 */ {0, 0, QIC_3095}, /* minor 8-11 */ {0, 0, QIC_3095}, /* minor 12-15 */ }}}, #if 0 {{T_SEQUENTIAL, T_REMOV, "EXABYTE ", "EXB-8200 ", ""}, {0, 12, { {0, 0, 0}, /* minor 0-3 */ {0, 0, 0}, /* minor 4-7 */ {0, 0, 0}, /* minor 8-11 */ {0, 0, 0} /* minor 12-15 */ }}}, #endif {{T_SEQUENTIAL, T_REMOV, "TEAC ", "MT-2ST/N50 ", ""}, {ST_Q_IGNORE_LOADS, 0, { {0, 0, 0}, /* minor 0-3 */ {0, 0, 0}, /* minor 4-7 */ {0, 0, 0}, /* minor 8-11 */ {0, 0, 0} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, 
"OnStream", "ADR50 Drive", ""}, {ST_Q_UNIMODAL, 0, { {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */ {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 4-7 */ {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 8-11 */ {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "OnStream DI-30", "", "1.0"}, {ST_Q_NOFILEMARKS, 0, { {0, 0, 0}, /* minor 0-3 */ {0, 0, 0}, /* minor 4-7 */ {0, 0, 0}, /* minor 8-11 */ {0, 0, 0} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "NCR H621", "0-STD-03-46F880 ", ""}, {ST_Q_NOPREVENT, 0, { {0, 0, 0}, /* minor 0-3 */ {0, 0, 0}, /* minor 4-7 */ {0, 0, 0}, /* minor 8-11 */ {0, 0, 0} /* minor 12-15 */ }}}, {{T_SEQUENTIAL, T_REMOV, "Seagate STT3401A", "hp0atxa", ""}, {0, 0, { {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */ {ST_Q_FORCE_BLKSIZE, 1024, 0}, /* minor 4-7 */ {ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 8-11 */ {ST_Q_FORCE_BLKSIZE, 512, 0} /* minor 12-15 */ }}}, }; #define NOEJECT 0 #define EJECT 1 static void st_identify_drive(struct st_softc *, struct scsipi_inquiry_pattern *); static void st_loadquirks(struct st_softc *); static int st_mount_tape(dev_t, int); static void st_unmount(struct st_softc *, boolean); static int st_decide_mode(struct st_softc *, boolean); static void ststart(struct scsipi_periph *); static int ststart1(struct scsipi_periph *, struct buf *, int *); static void strestart(void *); static void stdone(struct scsipi_xfer *, int); static int st_read(struct st_softc *, char *, int, int); static int st_space(struct st_softc *, int, u_int, int); static int st_write_filemarks(struct st_softc *, int, int); static int st_check_eod(struct st_softc *, boolean, int *, int); static int st_load(struct st_softc *, u_int, int); static int st_rewind(struct st_softc *, u_int, int); static int st_interpret_sense(struct scsipi_xfer *); static int st_touch_tape(struct st_softc *); static int st_erase(struct st_softc *, int full, int flags); static void st_updatefilepos(struct st_softc *); static int st_rdpos(struct st_softc *, int, uint32_t *); static int st_setpos(struct st_softc *, int, uint32_t *); static const struct scsipi_periphsw st_switch = { st_interpret_sense, ststart, NULL, stdone }; #if defined(ST_ENABLE_EARLYWARN) #define ST_INIT_FLAGS ST_EARLYWARN #else #define ST_INIT_FLAGS 0 #endif /* * The routine called by the low level scsi routine when it discovers * A device suitable for this driver */ void stattach(device_t parent, device_t self, void *aux) { struct st_softc *st = device_private(self); struct scsipibus_attach_args *sa = aux; struct scsipi_periph *periph = sa->sa_periph; SC_DEBUG(periph, SCSIPI_DB2, ("stattach: ")); st->sc_dev = self; /* Store information needed to contact our base driver */ st->sc_periph = periph; periph->periph_dev = st->sc_dev; periph->periph_switch = &st_switch; /* Set initial flags */ st->flags = ST_INIT_FLAGS; /* Set up the buf queues for this device */ bufq_alloc(&st->buf_queue, "fcfs", 0); bufq_alloc(&st->buf_defer, "fcfs", 0); callout_init(&st->sc_callout, 0); mutex_init(&st->sc_iolock, MUTEX_DEFAULT, IPL_VM); /* * Check if the drive is a known criminal and take * Any steps needed to bring it into line */ st_identify_drive(st, &sa->sa_inqbuf); aprint_naive("\n"); aprint_normal("\n"); /* Use the subdriver to request information regarding the drive. */ aprint_normal_dev(self, "%s", st->quirkdata ? 
"quirks apply, " : ""); if (scsipi_test_unit_ready(periph, XS_CTL_DISCOVERY | XS_CTL_SILENT | XS_CTL_IGNORE_MEDIA_CHANGE) || st->ops(st, ST_OPS_MODESENSE, XS_CTL_DISCOVERY | XS_CTL_SILENT | XS_CTL_IGNORE_MEDIA_CHANGE)) aprint_normal("drive empty\n"); else { aprint_normal("density code %d, ", st->media_density); if (st->media_blksize > 0) aprint_normal("%d-byte", st->media_blksize); else aprint_normal("variable"); aprint_normal(" blocks, write-%s\n", (st->flags & ST_READONLY) ? "protected" : "enabled"); } st->stats = iostat_alloc(IOSTAT_TAPE, parent, device_xname(st->sc_dev)); rnd_attach_source(&st->rnd_source, device_xname(st->sc_dev), RND_TYPE_TAPE, RND_FLAG_DEFAULT); } int stdetach(device_t self, int flags) { struct st_softc *st = device_private(self); struct scsipi_periph *periph = st->sc_periph; struct scsipi_channel *chan = periph->periph_channel; int bmaj, cmaj, mn; /* locate the major number */ bmaj = bdevsw_lookup_major(&st_bdevsw); cmaj = cdevsw_lookup_major(&st_cdevsw); /* kill any pending restart */ callout_halt(&st->sc_callout, NULL); mutex_enter(chan_mtx(chan)); /* Kill off any queued buffers. */ bufq_drain(st->buf_defer); bufq_drain(st->buf_queue); /* Kill off any pending commands. */ scsipi_kill_pending(st->sc_periph); mutex_exit(chan_mtx(chan)); bufq_free(st->buf_defer); bufq_free(st->buf_queue); mutex_destroy(&st->sc_iolock); /* Nuke the vnodes for any open instances */ mn = STUNIT(device_unit(self)); vdevgone(bmaj, mn, mn+STNMINOR-1, VBLK); vdevgone(cmaj, mn, mn+STNMINOR-1, VCHR); iostat_free(st->stats); /* Unhook the entropy source. */ rnd_detach_source(&st->rnd_source); return 0; } /* * Use the inquiry routine in 'scsi_base' to get drive info so we can * Further tailor our behaviour. */ static void st_identify_drive(struct st_softc *st, struct scsipi_inquiry_pattern *inqbuf) { const struct st_quirk_inquiry_pattern *finger; int priority; finger = scsipi_inqmatch(inqbuf, st_quirk_patterns, sizeof(st_quirk_patterns) / sizeof(st_quirk_patterns[0]), sizeof(st_quirk_patterns[0]), &priority); if (priority != 0) { st->quirkdata = &finger->quirkdata; st->drive_quirks = finger->quirkdata.quirks; st->quirks = finger->quirkdata.quirks; /* start value */ st->page_0_size = finger->quirkdata.page_0_size; KASSERT(st->page_0_size <= MAX_PAGE_0_SIZE); st_loadquirks(st); } } /* * initialise the subdevices to the default (QUIRK) state. * this will remove any setting made by the system operator or previous * operations. */ static void st_loadquirks(struct st_softc *st) { const struct modes *mode; struct modes *mode2; int i; mode = st->quirkdata->modes; mode2 = st->modes; for (i = 0; i < 4; i++) { memset(mode2, 0, sizeof(struct modes)); st->modeflags[i] &= ~(BLKSIZE_SET_BY_QUIRK | DENSITY_SET_BY_QUIRK | BLKSIZE_SET_BY_USER | DENSITY_SET_BY_USER); if ((mode->quirks | st->drive_quirks) & ST_Q_FORCE_BLKSIZE) { mode2->blksize = mode->blksize; st->modeflags[i] |= BLKSIZE_SET_BY_QUIRK; } if (mode->density) { mode2->density = mode->density; st->modeflags[i] |= DENSITY_SET_BY_QUIRK; } mode2->quirks |= mode->quirks; mode++; mode2++; } } /* open the device. 
*/ static int stopen(dev_t dev, int flags, int mode, struct lwp *l) { u_int stmode, dsty; int error, sflags, unit, tries, ntries; struct st_softc *st; struct scsipi_periph *periph; struct scsipi_adapter *adapt; unit = STUNIT(dev); st = device_lookup_private(&st_cd, unit); if (st == NULL) return ENXIO; stmode = STMODE(dev); dsty = STDSTY(dev); periph = st->sc_periph; adapt = periph->periph_channel->chan_adapter; SC_DEBUG(periph, SCSIPI_DB1, ("open: dev=0x%"PRIx64" (unit %d (of %d))\n", dev, unit, st_cd.cd_ndevs)); /* Only allow one at a time */ if (periph->periph_flags & PERIPH_OPEN) { aprint_error_dev(st->sc_dev, "already open\n"); return EBUSY; } if ((error = scsipi_adapter_addref(adapt)) != 0) return error; /* clear any latched errors. */ st->mt_resid = 0; st->mt_erreg = 0; st->asc = 0; st->ascq = 0; /* * Catch any unit attention errors. Be silent about this * unless we're already mounted. We ignore media change * if we're in control mode or not mounted yet. */ if ((st->flags & ST_MOUNTED) == 0 || stmode == CTRL_MODE) { #ifdef SCSIDEBUG sflags = XS_CTL_IGNORE_MEDIA_CHANGE; #else sflags = XS_CTL_SILENT|XS_CTL_IGNORE_MEDIA_CHANGE; #endif } else sflags = 0; /* * If we're already mounted or we aren't configured for * a mount delay, only try a test unit ready once. Otherwise, * try up to ST_MOUNT_DELAY times with a rest interval of * one second between each try. */ if ((st->flags & ST_MOUNTED) || ST_MOUNT_DELAY == 0) ntries = 1; else ntries = ST_MOUNT_DELAY; for (error = tries = 0; tries < ntries; tries++) { int slpintr, oflags; /* * If we had no error, or we're opening the control mode * device, we jump out right away. */ error = scsipi_test_unit_ready(periph, sflags); if (error == 0 || stmode == CTRL_MODE) break; /* * We had an error. * * If we're already mounted or we aren't configured for * a mount delay, or the error isn't a NOT READY error, * skip to the error exit now. */ if ((st->flags & ST_MOUNTED) || ST_MOUNT_DELAY == 0 || (st->mt_key != SKEY_NOT_READY)) { device_printf(st->sc_dev, "mount error (sense key=%d) - " "terminating mount session\n", st->mt_key); /* * the following should not trigger unless * something serious happened while the device * was open (PREVENT MEDIUM REMOVAL in effect) */ if (st->flags & ST_WRITTEN && st->mt_key == SKEY_UNIT_ATTENTION) { /* * device / media state may have changed * refrain from writing missing file marks * onto potentially newly inserted/formatted * media (e. g. emergency EJECT/RESET/etc.) */ st->flags &= ~(ST_WRITTEN|ST_FM_WRITTEN); device_printf(st->sc_dev, "CAUTION: file marks/data may be missing" " - ASC = 0x%02x, ASCQ = 0x%02x\n", st->asc, st->ascq); } goto bad; } /* clear any latched errors. */ st->mt_resid = 0; st->mt_erreg = 0; st->asc = 0; st->ascq = 0; /* * Fake that we have the device open so * we block other apps from getting in. */ oflags = periph->periph_flags; periph->periph_flags |= PERIPH_OPEN; slpintr = kpause("stload", true, hz, NULL); periph->periph_flags = oflags; /* restore flags */ if (slpintr != 0 && slpintr != EWOULDBLOCK) { device_printf(st->sc_dev, "load interrupted\n"); goto bad; } } /* * If the mode is 3 (e.g. minor = 3,7,11,15) then the device has * been opened to set defaults and perform other, usually non-I/O * related, operations. In this case, do a quick check to see * whether the unit actually had a tape loaded (this will be known * as to whether or not we got a NOT READY for the above * unit attention). If a tape is there, go do a mount sequence. 
*/ if (stmode == CTRL_MODE && st->mt_key != SKEY_NO_SENSE && st->mt_key != SKEY_UNIT_ATTENTION) { periph->periph_flags |= PERIPH_OPEN; return 0; } /* * If we get this far and had an error set, that means we failed * to pass the 'test unit ready' test for the non-controlmode device, * so we bounce the open. */ if (error) return error; /* Else, we're now committed to saying we're open. */ periph->periph_flags |= PERIPH_OPEN; /* unit attn are now errors */ /* * If it's a different mode, or if the media has been * invalidated, unmount the tape from the previous * session but continue with open processing */ if (st->last_dsty != dsty || (periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) st_unmount(st, NOEJECT); /* * If we are not mounted, then we should start a new * mount session. */ if (!(st->flags & ST_MOUNTED)) { if ((error = st_mount_tape(dev, flags)) != 0) goto bad; st->last_dsty = dsty; } if (!(st->quirks & ST_Q_NOPREVENT)) { scsipi_prevent(periph, SPAMR_PREVENT_DT, XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY); } SC_DEBUG(periph, SCSIPI_DB2, ("open complete\n")); return 0; bad: st_unmount(st, NOEJECT); scsipi_adapter_delref(adapt); periph->periph_flags &= ~PERIPH_OPEN; return error; } static int stclose(dev_t dev, int flags, int mode, struct lwp *l) { int stxx, error = 0; struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev)); struct scsipi_periph *periph = st->sc_periph; struct scsipi_adapter *adapt = periph->periph_channel->chan_adapter; SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("closing\n")); /* * Make sure that a tape opened in write-only mode will have * file marks written on it when closed, even if not written to. * * This is for SUN compatibility. Actually, the Sun way of * things is to: * * only write filemarks if there are fmks to be written and * - open for write (possibly read/write) * - the last operation was a write * or: * - opened for wronly * - no data was written (including filemarks) */ stxx = st->flags & (ST_WRITTEN | ST_FM_WRITTEN); if ((flags & FWRITE) != 0) { int nm = 0; #ifdef ST_SUNCOMPAT /* * on request only * original compat code has not been working * since ~1998 */ if ((flags & O_ACCMODE) == FWRITE && (stxx == 0)) { st->flags |= ST_WRITTEN; SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("SUN compatibility: write FM(s) at close\n")); } #endif error = st_check_eod(st, FALSE, &nm, 0); SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("wrote %d FM(s) at close error=%d\n", nm, error)); } /* Allow robots to eject tape if needed. */ if (!(st->quirks & ST_Q_NOPREVENT)) { scsipi_prevent(periph, SPAMR_ALLOW, XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY); } switch (STMODE(dev)) { case NORMAL_MODE: st_unmount(st, NOEJECT); break; case NOREW_MODE: case CTRL_MODE: /* * Leave mounted unless media seems to have been removed. * * Otherwise, if we're to terminate a tape with more than one * filemark [ and because we're not rewinding here ], backspace * one filemark so that later appends will see an unbroken * sequence of: * * file - FMK - file - FMK ... file - FMK FMK (EOM) */ if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) { st_unmount(st, NOEJECT); } else if (error == 0) { /* * ST_WRITTEN was preserved from above. * * All we need to know here is: * * Were we writing this tape and was the last * operation a write? * * Are there supposed to be 2FM at EOD? * * If both statements are true, then we backspace * one filemark. 
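* Backspacing leaves the head positioned between the two trailing filemarks, so a later append overwrites the second one and the single-FMK-between-files layout sketched above is preserved.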
*/ stxx &= ~ST_FM_WRITTEN; stxx |= (st->flags & ST_2FM_AT_EOD); if ((flags & FWRITE) != 0 && (stxx == (ST_2FM_AT_EOD|ST_WRITTEN))) { error = st_space(st, -1, SP_FILEMARKS, 0); SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("st_space(-1) error=%d\n", error)); } else { SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("no backspacing - flags = 0x%x, stxx=0x%x, st->flags=0x%x\n", flags, stxx, st->flags)); } } else { SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("error %d from st_check_eod\n", error)); } break; case EJECT_MODE: st_unmount(st, EJECT); break; } KASSERTMSG((st->flags & ST_WRITTEN) == 0, "pending ST_WRITTEN flag NOT cleared (flags=0x%x)", st->flags); scsipi_wait_drain(periph); scsipi_adapter_delref(adapt); periph->periph_flags &= ~PERIPH_OPEN; return error; } /* * Start a new mount session. * Copy in all the default parameters from the selected device mode. * and try guess any that seem to be defaulted. */ static int st_mount_tape(dev_t dev, int flags) { int unit; u_int dsty; struct st_softc *st; struct scsipi_periph *periph; int error = 0; unit = STUNIT(dev); dsty = STDSTY(dev); st = device_lookup_private(&st_cd, unit); periph = st->sc_periph; if (st->flags & ST_MOUNTED) return 0; SC_DEBUG(periph, SCSIPI_DB1, ("mounting\n ")); st->flags |= ST_NEW_MOUNT; st->quirks = st->drive_quirks | st->modes[dsty].quirks; /* * If the media is new, then make sure we give it a chance to * to do a 'load' instruction. (We assume it is new.) */ if ((error = st_load(st, LD_LOAD, XS_CTL_SILENT)) != 0) return error; /* * Throw another dummy instruction to catch * 'Unit attention' errors. Many drives give * these after doing a Load instruction (with * the MEDIUM MAY HAVE CHANGED asc/ascq). */ scsipi_test_unit_ready(periph, XS_CTL_SILENT); /* XXX */ /* * Some devices can't tell you much until they have been * asked to look at the media. This quirk does this. */ if (st->quirks & ST_Q_SENSE_HELP) if ((error = st_touch_tape(st)) != 0) return error; /* * Load the physical device parameters * loads: blkmin, blkmax */ if ((error = st->ops(st, ST_OPS_RBL, 0)) != 0) return error; /* * Load the media dependent parameters * includes: media_blksize,media_density,numblks * As we have a tape in, it should be reflected here. * If not you may need the "quirk" above. */ if ((error = st->ops(st, ST_OPS_MODESENSE, 0)) != 0) return error; /* * If we have gained a permanent density from somewhere, * then use it in preference to the one supplied by * default by the driver. */ if (st->modeflags[dsty] & (DENSITY_SET_BY_QUIRK | DENSITY_SET_BY_USER)) st->density = st->modes[dsty].density; else st->density = st->media_density; /* * If we have gained a permanent blocksize * then use it in preference to the one supplied by * default by the driver. */ st->flags &= ~ST_FIXEDBLOCKS; if (st->modeflags[dsty] & (BLKSIZE_SET_BY_QUIRK | BLKSIZE_SET_BY_USER)) { st->blksize = st->modes[dsty].blksize; if (st->blksize) st->flags |= ST_FIXEDBLOCKS; } else { if ((error = st_decide_mode(st, FALSE)) != 0) return error; } if ((error = st->ops(st, ST_OPS_MODESELECT, 0)) != 0) { /* ATAPI will return ENODEV for this, and this may be OK */ if (error != ENODEV) { aprint_error_dev(st->sc_dev, "cannot set selected mode\n"); return error; } } st->flags &= ~ST_NEW_MOUNT; st->flags |= ST_MOUNTED; periph->periph_flags |= PERIPH_MEDIA_LOADED; /* move earlier? */ st->blkno = st->fileno = (daddr_t) 0; return 0; } /* * End the present mount session. * Rewind, and optionally eject the tape. 
* Reset various flags to indicate that all new * operations require another mount operation */ static void st_unmount(struct st_softc *st, boolean eject) { struct scsipi_periph *periph = st->sc_periph; int nmarks; if ((st->flags & ST_MOUNTED) == 0) return; SC_DEBUG(periph, SCSIPI_DB1, ("unmounting\n")); st_check_eod(st, FALSE, &nmarks, XS_CTL_IGNORE_NOT_READY); st_rewind(st, 0, XS_CTL_IGNORE_NOT_READY); /* * Section 9.3.3 of the SCSI specs states that a device shall return * the density value specified in the last successful MODE SELECT * after an unload operation, in case it is not able to * automatically determine the density of the new medium. * * So we instruct the device to use the default density, which will * prevent the use of stale density values (in particular, * in st_touch_tape(). */ st->density = 0; if (st->ops(st, ST_OPS_MODESELECT, 0) != 0) { aprint_error_dev(st->sc_dev, "WARNING: cannot revert to default density\n"); } if (eject) { if (!(st->quirks & ST_Q_NOPREVENT)) { scsipi_prevent(periph, SPAMR_ALLOW, XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY); } st_load(st, LD_UNLOAD, XS_CTL_IGNORE_NOT_READY); st->blkno = st->fileno = (daddr_t) -1; } else { st->blkno = st->fileno = (daddr_t) 0; } st->flags &= ~(ST_MOUNTED | ST_NEW_MOUNT); periph->periph_flags &= ~PERIPH_MEDIA_LOADED; } /* * Given all we know about the device, media, mode, 'quirks' and * initial operation, make a decision as to how we should be set * to run (regarding blocking and EOD marks) */ int st_decide_mode(struct st_softc *st, boolean first_read) { SC_DEBUG(st->sc_periph, SCSIPI_DB2, ("starting block mode decision\n")); /* * If the drive can only handle fixed-length blocks and only at * one size, perhaps we should just do that. */ if (st->blkmin && (st->blkmin == st->blkmax)) { st->flags |= ST_FIXEDBLOCKS; st->blksize = st->blkmin; SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("blkmin == blkmax of %d\n", st->blkmin)); goto done; } /* * If the tape density mandates (or even suggests) use of fixed * or variable-length blocks, comply. */ switch (st->density) { case HALFINCH_800: case HALFINCH_1600: case HALFINCH_6250: case DDS: st->flags &= ~ST_FIXEDBLOCKS; st->blksize = 0; SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("density specified variable\n")); goto done; case QIC_11: case QIC_24: case QIC_120: case QIC_150: case QIC_525: case QIC_1320: case QIC_3095: case QIC_3220: st->flags |= ST_FIXEDBLOCKS; if (st->media_blksize > 0) st->blksize = st->media_blksize; else st->blksize = DEF_FIXED_BSIZE; SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("density specified fixed\n")); goto done; } /* * If we're about to read the tape, perhaps we should choose * fixed or variable-length blocks and block size according to * what the drive found on the tape. */ if (first_read && (!(st->quirks & ST_Q_BLKSIZE) || (st->media_blksize == 0) || (st->media_blksize == DEF_FIXED_BSIZE) || (st->media_blksize == 1024))) { if (st->media_blksize > 0) st->flags |= ST_FIXEDBLOCKS; else st->flags &= ~ST_FIXEDBLOCKS; st->blksize = st->media_blksize; SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("Used media_blksize of %d\n", st->media_blksize)); goto done; } /* * We're getting no hints from any direction. Choose variable- * length blocks arbitrarily. */ st->flags &= ~ST_FIXEDBLOCKS; st->blksize = 0; SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("Give up and default to variable mode\n")); done: /* * Decide whether or not to write two file marks to signify end- * of-data. Make the decision as a function of density. 
If * the decision is not to use a second file mark, the SCSI BLANK * CHECK condition code will be recognized as end-of-data when * first read. * (I think this should be a by-product of fixed/variable..julian) */ switch (st->density) { /* case 8 mm: What is the SCSI density code for 8 mm, anyway? */ case QIC_11: case QIC_24: case QIC_120: case QIC_150: case QIC_525: case QIC_1320: case QIC_3095: case QIC_3220: st->flags &= ~ST_2FM_AT_EOD; break; default: st->flags |= ST_2FM_AT_EOD; } return 0; } /* * Actually translate the requested transfer into * one the physical driver can understand * The transfer is described by a buf and will include * only one physical transfer. */ static void ststrategy(struct buf *bp) { struct st_softc *st = device_lookup_private(&st_cd, STUNIT(bp->b_dev)); struct scsipi_periph *periph = st->sc_periph; struct scsipi_channel *chan = periph->periph_channel; SC_DEBUG(periph, SCSIPI_DB1, ("ststrategy %d bytes @ blk %" PRId64 "\n", bp->b_bcount, bp->b_blkno)); /* If it's a null transfer, return immediately */ if (bp->b_bcount == 0) goto abort; /* If offset is negative, error */ if (bp->b_blkno < 0) { SC_DEBUG(periph, SCSIPI_DB3, ("EINVAL: ststrategy negative blockcount %" PRId64 "\n", bp->b_blkno)); bp->b_error = EINVAL; goto abort; } /* Odd sized request on fixed drives are verboten */ if (st->flags & ST_FIXEDBLOCKS) { if (bp->b_bcount % st->blksize) { aprint_error_dev(st->sc_dev, "bad request, must be multiple of %d\n", st->blksize); bp->b_error = EIO; goto abort; } } /* as are out-of-range requests on variable drives. */ else if (bp->b_bcount < st->blkmin || (st->blkmax && bp->b_bcount > st->blkmax)) { aprint_error_dev(st->sc_dev, "bad request, must be between %d and %d\n", st->blkmin, st->blkmax); bp->b_error = EIO; goto abort; } mutex_enter(chan_mtx(chan)); /* * Place it in the queue of activities for this tape * at the end (a bit silly because we only have on user.. * (but it could fork())) */ bufq_put(st->buf_queue, bp); /* * Tell the device to get going on the transfer if it's * not doing anything, otherwise just wait for completion * (All a bit silly if we're only allowing 1 open but..) */ ststart(periph); mutex_exit(chan_mtx(chan)); return; abort: /* * Reset the residue because we didn't do anything, * and send the buffer back as done. */ bp->b_resid = bp->b_bcount; biodone(bp); return; } /* * ststart looks to see if there is a buf waiting for the device * and that the device is not already busy. If the device is busy, * the request is deferred and retried on the next attempt. * If both are true, ststart creates a scsi command to perform * the transfer required. * * The transfer request will call scsipi_done on completion, * which will in turn call this routine again so that the next * queued transfer is performed. The bufs are queued by the * strategy routine (ststrategy) * * This routine is also called after other non-queued requests * have been made of the scsi driver, to ensure that the queue * continues to be drained. 
* ststart() is called with channel lock held */ static int ststart1(struct scsipi_periph *periph, struct buf *bp, int *errnop) { struct st_softc *st = device_private(periph->periph_dev); struct scsipi_channel *chan = periph->periph_channel; struct scsi_rw_tape cmd; struct scsipi_xfer *xs; int flags, error, complete = 1; SC_DEBUG(periph, SCSIPI_DB2, ("ststart1 ")); mutex_enter(chan_mtx(chan)); if (periph->periph_active >= periph->periph_openings) { error = EAGAIN; goto out; } /* if a special awaits, let it proceed first */ if (periph->periph_flags & PERIPH_WAITING) { periph->periph_flags &= ~PERIPH_WAITING; cv_broadcast(periph_cv_periph(periph)); error = EAGAIN; goto out; } /* * If the device has been unmounted by the user * then throw away all requests until done. */ if (__predict_false((st->flags & ST_MOUNTED) == 0 || (periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)) { error = EIO; goto out; } /* * only FIXEDBLOCK devices have pending I/O or space operations. */ if (st->flags & ST_FIXEDBLOCKS) { /* * If we are at a filemark but have not reported it yet * then we should report it now */ if (st->flags & ST_AT_FILEMARK) { if ((bp->b_flags & B_READ) == B_WRITE) { /* * Handling of ST_AT_FILEMARK in * st_space will fill in the right file * mark count. * Back up over filemark */ if (st_space(st, 0, SP_FILEMARKS, 0)) { error = EIO; goto out; } } else { error = 0; st->flags &= ~ST_AT_FILEMARK; goto out; } } } /* * If we are at EOM but have not reported it * yet then we should report it now. */ if (st->flags & (ST_EOM_PENDING|ST_EIO_PENDING)) { error = 0; if (st->flags & ST_EIO_PENDING) error = EIO; st->flags &= ~(ST_EOM_PENDING|ST_EIO_PENDING); goto out; } /* Fill out the scsi command */ memset(&cmd, 0, sizeof(cmd)); flags = XS_CTL_NOSLEEP | XS_CTL_ASYNC; if ((bp->b_flags & B_READ) == B_WRITE) { cmd.opcode = WRITE; st->flags &= ~ST_FM_WRITTEN; flags |= XS_CTL_DATA_OUT; } else { cmd.opcode = READ; flags |= XS_CTL_DATA_IN; } /* * Handle "fixed-block-mode" tape drives by using the * block count instead of the length. */ if (st->flags & ST_FIXEDBLOCKS) { cmd.byte2 |= SRW_FIXED; _lto3b(bp->b_bcount / st->blksize, cmd.len); } else _lto3b(bp->b_bcount, cmd.len); /* Clear 'position updated' indicator */ st->flags &= ~ST_POSUPDATED; /* go ask the adapter to do all this for us */ xs = scsipi_make_xs_locked(periph, (struct scsipi_generic *)&cmd, sizeof(cmd), (u_char *)bp->b_data, bp->b_bcount, 0, ST_IO_TIME, bp, flags); if (__predict_false(xs == NULL)) { /* * out of memory. Keep this buffer in the queue, and * retry later. 
*/ callout_reset(&st->sc_callout, hz / 2, strestart, periph); error = EAGAIN; goto out; } error = scsipi_execute_xs(xs); /* with a scsipi_xfer preallocated, scsipi_command can't fail */ KASSERT(error == 0); if (error == 0) complete = 0; out: mutex_exit(chan_mtx(chan)); *errnop = error; return complete; } static void ststart(struct scsipi_periph *periph) { struct st_softc *st = device_private(periph->periph_dev); struct scsipi_channel *chan = periph->periph_channel; struct buf *bp; int error, complete; SC_DEBUG(periph, SCSIPI_DB2, ("ststart ")); mutex_exit(chan_mtx(chan)); mutex_enter(&st->sc_iolock); while ((bp = bufq_get(st->buf_defer)) != NULL || (bp = bufq_get(st->buf_queue)) != NULL) { iostat_busy(st->stats); mutex_exit(&st->sc_iolock); complete = ststart1(periph, bp, &error); mutex_enter(&st->sc_iolock); if (complete) { iostat_unbusy(st->stats, 0, ((bp->b_flags & B_READ) == B_READ)); if (error == EAGAIN) { bufq_put(st->buf_defer, bp); break; } } mutex_exit(&st->sc_iolock); if (complete) { bp->b_error = error; bp->b_resid = bp->b_bcount; biodone(bp); } mutex_enter(&st->sc_iolock); } mutex_exit(&st->sc_iolock); mutex_enter(chan_mtx(chan)); } static void strestart(void *v) { struct scsipi_periph *periph = (struct scsipi_periph *)v; struct scsipi_channel *chan = periph->periph_channel; mutex_enter(chan_mtx(chan)); ststart((struct scsipi_periph *)v); mutex_exit(chan_mtx(chan)); } static void stdone(struct scsipi_xfer *xs, int error) { struct st_softc *st = device_private(xs->xs_periph->periph_dev); struct buf *bp = xs->bp; if (bp) { bp->b_error = error; bp->b_resid = xs->resid; /* * buggy device ? A SDLT320 can report an info * field of 0x3de8000 on a Media Error/Write Error * for this CBD: 0x0a 00 00 80 00 00 */ if (bp->b_resid > bp->b_bcount || bp->b_resid < 0) bp->b_resid = bp->b_bcount; mutex_enter(&st->sc_iolock); if ((bp->b_flags & B_READ) == B_WRITE) st->flags |= ST_WRITTEN; else st->flags &= ~ST_WRITTEN; iostat_unbusy(st->stats, bp->b_bcount, ((bp->b_flags & B_READ) == B_READ)); if ((st->flags & ST_POSUPDATED) == 0) { if (error) { st->fileno = st->blkno = -1; } else if (st->blkno != -1) { if (st->flags & ST_FIXEDBLOCKS) st->blkno += (bp->b_bcount / st->blksize); else st->blkno++; } } mutex_exit(&st->sc_iolock); rnd_add_uint32(&st->rnd_source, bp->b_blkno); biodone(bp); } } static int stread(dev_t dev, struct uio *uio, int iomode) { struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev)); int r = physio(ststrategy, NULL, dev, B_READ, st->sc_periph->periph_channel->chan_adapter->adapt_minphys, uio); SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[stread: result=%d]\n", r)); return r; } static int stwrite(dev_t dev, struct uio *uio, int iomode) { struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev)); int r = physio(ststrategy, NULL, dev, B_WRITE, st->sc_periph->periph_channel->chan_adapter->adapt_minphys, uio); SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[stwrite: result=%d]\n", r)); return r; } /* * Perform special action on behalf of the user; * knows about the internals of this device */ static int stioctl(dev_t dev, u_long cmd, void *arg, int flag, struct lwp *l) { int error = 0; int unit; int number, nmarks, dsty; int flags; struct st_softc *st; int hold_blksize; uint8_t hold_density; struct mtop *mt = (struct mtop *) arg; /* Find the device that the user is talking about */ flags = 0; /* give error messages, act on errors etc. 
*/ unit = STUNIT(dev); dsty = STDSTY(dev); st = device_lookup_private(&st_cd, unit); hold_blksize = st->blksize; hold_density = st->density; switch ((u_int)cmd) { case MTIOCGET: { struct mtget *g = (struct mtget *) arg; /* * (to get the current state of READONLY) */ error = st->ops(st, ST_OPS_MODESENSE, XS_CTL_SILENT); if (error) { /* * Ignore the error if in control mode; * this is mandated by st(4). */ if (STMODE(dev) != CTRL_MODE) break; error = 0; } SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[ioctl: get status]\n")); memset(g, 0, sizeof(struct mtget)); g->mt_type = MT_ISAR; /* Ultrix compat *//*? */ g->mt_blksiz = st->blksize; g->mt_density = st->density; g->mt_mblksiz[0] = st->modes[0].blksize; g->mt_mblksiz[1] = st->modes[1].blksize; g->mt_mblksiz[2] = st->modes[2].blksize; g->mt_mblksiz[3] = st->modes[3].blksize; g->mt_mdensity[0] = st->modes[0].density; g->mt_mdensity[1] = st->modes[1].density; g->mt_mdensity[2] = st->modes[2].density; g->mt_mdensity[3] = st->modes[3].density; g->mt_fileno = st->fileno; g->mt_blkno = st->blkno; if (st->flags & ST_READONLY) g->mt_dsreg |= MT_DS_RDONLY; if (st->flags & ST_MOUNTED) g->mt_dsreg |= MT_DS_MOUNTED; g->mt_resid = st->mt_resid; g->mt_erreg = st->mt_erreg; /* * clear latched errors. */ st->mt_resid = 0; st->mt_erreg = 0; st->asc = 0; st->ascq = 0; break; } case MTIOCTOP: { SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[ioctl: op=0x%x count=0x%x]\n", mt->mt_op, mt->mt_count)); /* compat: in U*x it is a short */ number = mt->mt_count; switch ((short) (mt->mt_op)) { case MTWEOF: /* write an end-of-file record */ error = st_write_filemarks(st, number, flags); break; case MTBSF: /* backward space file */ number = -number; /* FALLTHROUGH */ case MTFSF: /* forward space file */ error = st_check_eod(st, FALSE, &nmarks, flags); if (!error) error = st_space(st, number - nmarks, SP_FILEMARKS, flags); break; case MTBSR: /* backward space record */ number = -number; /* FALLTHROUGH */ case MTFSR: /* forward space record */ error = st_check_eod(st, true, &nmarks, flags); if (!error) error = st_space(st, number, SP_BLKS, flags); break; case MTREW: /* rewind */ error = st_rewind(st, 0, flags); break; case MTOFFL: /* rewind and put the drive offline */ st_unmount(st, EJECT); break; case MTNOP: /* no operation, sets status only */ break; case MTRETEN: /* retension the tape */ error = st_load(st, LD_RETENSION, flags); if (!error) error = st_load(st, LD_LOAD, flags); break; case MTEOM: /* forward space to end of media */ error = st_check_eod(st, FALSE, &nmarks, flags); if (!error) error = st_space(st, 1, SP_EOM, flags); break; case MTCACHE: /* enable controller cache */ st->flags &= ~ST_DONTBUFFER; goto try_new_value; case MTNOCACHE: /* disable controller cache */ st->flags |= ST_DONTBUFFER; goto try_new_value; case MTERASE: /* erase volume */ error = st_erase(st, number, flags); break; case MTSETBSIZ: /* Set block size for device */ #ifdef NOTYET if (!(st->flags & ST_NEW_MOUNT)) { uprintf("re-mount tape before changing " "blocksize"); error = EINVAL; break; } #endif if (number == 0) st->flags &= ~ST_FIXEDBLOCKS; else { if ((st->blkmin || st->blkmax) && (number < st->blkmin || number > st->blkmax)) { error = EINVAL; break; } st->flags |= ST_FIXEDBLOCKS; } st->blksize = number; st->flags |= ST_BLOCK_SET; /*XXX */ goto try_new_value; case MTSETDNSTY: /* Set density for device and mode */ /* * Any number >= 0 and <= 0xff is legal. Numbers * above 0x80 are 'vendor unique'. 
*/ if (number < 0 || number > 255) { error = EINVAL; break; } else st->density = number; goto try_new_value; case MTCMPRESS: error = st->ops(st, (number == 0) ? ST_OPS_CMPRSS_OFF : ST_OPS_CMPRSS_ON, XS_CTL_SILENT); break; case MTEWARN: if (number) st->flags |= ST_EARLYWARN; else st->flags &= ~ST_EARLYWARN; break; default: error = EINVAL; } break; } case MTIOCIEOT: case MTIOCEEOT: break; case MTIOCRDSPOS: error = st_rdpos(st, 0, (uint32_t *)arg); break; case MTIOCRDHPOS: error = st_rdpos(st, 1, (uint32_t *)arg); break; case MTIOCSLOCATE: error = st_setpos(st, 0, (uint32_t *)arg); break; case MTIOCHLOCATE: error = st_setpos(st, 1, (uint32_t *)arg); break; default: error = scsipi_do_ioctl(st->sc_periph, dev, cmd, arg, flag, l); break; } return error; try_new_value: /* * Check that the mode being asked for is aggreeable to the * drive. If not, put it back the way it was. * * If in control mode, we can make (persistent) mode changes * even if no medium is loaded (see st(4)). */ if ((STMODE(dev) != CTRL_MODE || (st->flags & ST_MOUNTED) != 0) && (error = st->ops(st, ST_OPS_MODESELECT, 0)) != 0) { /* put it back as it was */ aprint_error_dev(st->sc_dev, "cannot set selected mode\n"); st->density = hold_density; st->blksize = hold_blksize; if (st->blksize) st->flags |= ST_FIXEDBLOCKS; else st->flags &= ~ST_FIXEDBLOCKS; return error; } /* * As the drive liked it, if we are setting a new default, * set it into the structures as such. * * The means for deciding this are not finalised yet- but * if the device was opened in Control Mode, the values * are persistent now across mounts. */ if (STMODE(dev) == CTRL_MODE) { switch ((short) (mt->mt_op)) { case MTSETBSIZ: st->modes[dsty].blksize = st->blksize; st->modeflags[dsty] |= BLKSIZE_SET_BY_USER; break; case MTSETDNSTY: st->modes[dsty].density = st->density; st->modeflags[dsty] |= DENSITY_SET_BY_USER; break; } } return 0; } /* Do a synchronous read. */ static int st_read(struct st_softc *st, char *bf, int size, int flags) { struct scsi_rw_tape cmd; /* If it's a null transfer, return immediately */ if (size == 0) return 0; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = READ; if (st->flags & ST_FIXEDBLOCKS) { cmd.byte2 |= SRW_FIXED; _lto3b(size / (st->blksize ? st->blksize : DEF_FIXED_BSIZE), cmd.len); } else _lto3b(size, cmd.len); return scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), (void *)bf, size, 0, ST_IO_TIME, NULL, flags | XS_CTL_DATA_IN); } /* issue an erase command */ static int st_erase(struct st_softc *st, int full, int flags) { int tmo; struct scsi_erase cmd; /* * Full erase means set LONG bit in erase command, which asks * the drive to erase the entire unit. Without this bit, we're * asking the drive to write an erase gap. */ memset(&cmd, 0, sizeof(cmd)); cmd.opcode = ERASE; if (full) { cmd.byte2 = SE_LONG; tmo = ST_SPC_TIME; } else tmo = ST_IO_TIME; /* * XXX We always do this asynchronously, for now, unless the device * has the ST_Q_ERASE_NOIMM quirk. How long should we wait if we * want to (eventually) to it synchronously? 
*/ if ((st->quirks & ST_Q_ERASE_NOIMM) == 0) cmd.byte2 |= SE_IMMED; return scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0, ST_RETRIES, tmo, NULL, flags); } /* skip N blocks/filemarks/seq filemarks/eom */ static int st_space(struct st_softc *st, int number, u_int what, int flags) { struct scsi_space cmd; int error; switch (what) { case SP_BLKS: if (st->flags & ST_PER_ACTION) { if (number > 0) { st->flags &= ~ST_PER_ACTION; return EIO; } else if (number < 0) { if (st->flags & ST_AT_FILEMARK) { /* * Handling of ST_AT_FILEMARK * in st_space will fill in the * right file mark count. */ error = st_space(st, 0, SP_FILEMARKS, flags); if (error) return error; } if (st->flags & ST_BLANK_READ) { st->flags &= ~ST_BLANK_READ; return EIO; } st->flags &= ~(ST_EIO_PENDING|ST_EOM_PENDING); } } break; case SP_FILEMARKS: if (st->flags & ST_EIO_PENDING) { if (number > 0) { /* pretend we just discovered the error */ st->flags &= ~ST_EIO_PENDING; return EIO; } else if (number < 0) { /* back away from the error */ st->flags &= ~ST_EIO_PENDING; } } if (st->flags & ST_AT_FILEMARK) { st->flags &= ~ST_AT_FILEMARK; number--; } if ((st->flags & ST_BLANK_READ) && (number < 0)) { /* back away from unwritten tape */ st->flags &= ~ST_BLANK_READ; number++; /* XXX dubious */ } break; case SP_EOM: if (st->flags & ST_EOM_PENDING) { /* we're already there */ st->flags &= ~ST_EOM_PENDING; return 0; } if (st->flags & ST_EIO_PENDING) { /* pretend we just discovered the error */ st->flags &= ~ST_EIO_PENDING; return EIO; } if (st->flags & ST_AT_FILEMARK) st->flags &= ~ST_AT_FILEMARK; break; } if (number == 0) return 0; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = SPACE; cmd.byte2 = what; _lto3b(number, cmd.number); st->flags &= ~ST_POSUPDATED; st->last_ctl_resid = 0; error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0, 0, ST_SPC_TIME, NULL, flags); if (error == 0 && (st->flags & ST_POSUPDATED) == 0) { number = number - st->last_ctl_resid; if (what == SP_BLKS) { if (st->blkno != -1) st->blkno += number; } else if (what == SP_FILEMARKS) { if (st->fileno != -1) { st->fileno += number; if (number > 0) st->blkno = 0; else if (number < 0) st->blkno = -1; } } else if (what == SP_EOM) { st_updatefilepos(st); } } return error; } /* * write N filemarks */ static int st_write_filemarks(struct st_softc *st, int number, int flags) { int error; struct scsi_write_filemarks cmd; /* * It's hard to write a negative number of file marks. * Don't try. 
*/ if (number < 0) { SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("EINVAL: st_write_filemarks not writing %d file marks\n", number)); return EINVAL; } switch (number) { case 0: /* really a command to sync the drive's buffers */ break; case 1: if (st->flags & ST_FM_WRITTEN) /* already have one down */ st->flags &= ~ST_WRITTEN; else st->flags |= ST_FM_WRITTEN; st->flags &= ~ST_PER_ACTION; break; default: st->flags &= ~(ST_PER_ACTION | ST_WRITTEN); } memset(&cmd, 0, sizeof(cmd)); cmd.opcode = WRITE_FILEMARKS; if (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(st->sc_periph)) == SCSIPI_BUSTYPE_ATAPI) cmd.byte2 = SR_IMMED; /* * The ATAPI Onstream DI-30 doesn't support writing filemarks, but * WRITE_FILEMARKS is still used to flush the buffer */ if ((st->quirks & ST_Q_NOFILEMARKS) == 0) _lto3b(number, cmd.number); /* XXX WE NEED TO BE ABLE TO GET A RESIDIUAL XXX */ error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0, 0, ST_IO_TIME * 4, NULL, flags); if (error == 0 && st->fileno != -1) st->fileno += number; return error; } /* * Make sure the right number of file marks is on tape if the * tape has been written. If the position argument is true, * leave the tape positioned where it was originally. * * nmarks returns the number of marks to skip (or, if position * true, which were skipped) to get back original position. */ static int st_check_eod(struct st_softc *st, boolean position, int *nmarks, int flags) { int error; switch (st->flags & (ST_WRITTEN | ST_FM_WRITTEN | ST_2FM_AT_EOD)) { default: *nmarks = 0; return 0; case ST_WRITTEN: case ST_WRITTEN | ST_FM_WRITTEN | ST_2FM_AT_EOD: *nmarks = 1; break; case ST_WRITTEN | ST_2FM_AT_EOD: *nmarks = 2; } error = st_write_filemarks(st, *nmarks, flags); if (position && !error) error = st_space(st, -*nmarks, SP_FILEMARKS, flags); return error; } /* load/unload/retension */ static int st_load(struct st_softc *st, u_int type, int flags) { int error; struct scsi_load cmd; if (type != LD_LOAD) { int nmarks; error = st_check_eod(st, FALSE, &nmarks, flags); if (error) { aprint_error_dev(st->sc_dev, "failed to write closing filemarks at " "unload, errno=%d\n", error); return error; } } if (st->quirks & ST_Q_IGNORE_LOADS) { if (type == LD_LOAD) /* * If we ignore loads, at least we should try a rewind. */ return st_rewind(st, 0, flags); /* otherwise, we should do what's asked of us */ } memset(&cmd, 0, sizeof(cmd)); cmd.opcode = LOAD; if (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(st->sc_periph)) == SCSIPI_BUSTYPE_ATAPI) cmd.byte2 = SR_IMMED; cmd.how = type; error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0, ST_RETRIES, ST_SPC_TIME, NULL, flags); if (error) { aprint_error_dev(st->sc_dev, "error %d in st_load (op %d)\n", error, type); } return error; } /* Rewind the device */ static int st_rewind(struct st_softc *st, u_int immediate, int flags) { struct scsi_rewind cmd; int error; int nmarks; int timeout; error = st_check_eod(st, FALSE, &nmarks, flags); if (error) { aprint_error_dev(st->sc_dev, "failed to write closing filemarks at " "rewind, errno=%d\n", error); return error; } st->flags &= ~ST_PER_ACTION; /* If requestor asked for immediate response, set a short timeout */ timeout = immediate ? 
ST_CTL_TIME : ST_SPC_TIME; /* ATAPI tapes always need immediate to be set */ if (scsipi_periph_bustype(st->sc_periph) == SCSIPI_BUSTYPE_ATAPI) immediate = SR_IMMED; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = REWIND; cmd.byte2 = immediate; error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0, ST_RETRIES, timeout, NULL, flags); if (error) { aprint_error_dev(st->sc_dev, "error %d trying to rewind\n", error); /* lost position */ st->fileno = st->blkno = -1; } else st->fileno = st->blkno = 0; return error; } static void st_updatefilepos(struct st_softc *st) { int error; uint8_t posdata[32]; struct scsi_tape_read_position cmd; memset(&cmd, 0, sizeof(cmd)); memset(&posdata, 0, sizeof(posdata)); cmd.opcode = READ_POSITION; cmd.byte1 = 6; /* service action: LONG FORM */ error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), (void *)&posdata, sizeof(posdata), ST_RETRIES, ST_CTL_TIME, NULL, XS_CTL_SILENT | XS_CTL_DATA_IN); if (error == 0) { #ifdef SCSIPI_DEBUG if (st->sc_periph->periph_dbflags & SCSIPI_DB3) { int hard; printf("posdata: "); for (hard = 0; hard < sizeof(posdata); hard++) printf("%02x ", posdata[hard] & 0xff); printf("\n"); } #endif if (posdata[0] & 0xC) { /* Block|Mark Position Unknown */ SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("st_updatefilepos block/mark position unknown (0x%02x)\n", posdata[0])); } else { st->fileno = _8btol(&posdata[16]); st->blkno = 0; SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("st_updatefilepos file position %"PRId64"\n", st->fileno)); return; } } else { SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("st_updatefilepos READ POSITION(LONG_FORM) failed (error=%d)\n", error)); } st->fileno = -1; st->blkno = -1; } static int st_rdpos(struct st_softc *st, int hard, uint32_t *blkptr) { int error; uint8_t posdata[20]; struct scsi_tape_read_position cmd; /* * We try and flush any buffered writes here if we were writing * and we're trying to get hardware block position. It eats * up performance substantially, but I'm wary of drive firmware. * * I think that *logical* block position is probably okay- * but hardware block position might have to wait for data * to hit media to be valid. Caveat Emptor. */ if (hard && (st->flags & ST_WRITTEN)) { /* First flush any pending writes... */ error = st_write_filemarks(st, 0, XS_CTL_SILENT); /* * The latter case is for 'write protected' tapes * which are too stupid to recognize a zero count * for writing filemarks as a no-op. */ if (error != 0 && error != EACCES && error != EROFS) return error; } memset(&cmd, 0, sizeof(cmd)); memset(&posdata, 0, sizeof(posdata)); cmd.opcode = READ_POSITION; if (hard) cmd.byte1 = 1; error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), (void *)&posdata, sizeof(posdata), ST_RETRIES, ST_CTL_TIME, NULL, XS_CTL_SILENT | XS_CTL_DATA_IN); if (error == 0) { #if 0 printf("posdata:"); for (hard = 0; hard < sizeof(posdata); hard++) printf("%02x ", posdata[hard] & 0xff); printf("\n"); #endif if (posdata[0] & 0x4) { /* Block Position Unknown */ SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("EINVAL: strdpos block position unknown\n")); error = EINVAL; } else *blkptr = _4btol(&posdata[4]); } return error; } static int st_setpos(struct st_softc *st, int hard, uint32_t *blkptr) { int error; struct scsi_tape_locate cmd; /* * We used to try and flush any buffered writes here. * Now we push this onto user applications to either * flush the pending writes themselves (via a zero count * WRITE FILEMARKS command) or they can trust their tape * drive to do this correctly for them. 
* * There are very ugly performance limitations otherwise. */ memset(&cmd, 0, sizeof(cmd)); cmd.opcode = LOCATE; if (hard) cmd.byte2 = 1 << 2; _lto4b(*blkptr, cmd.blkaddr); error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0, ST_RETRIES, ST_SPC_TIME, NULL, 0); /* * Note file && block number position now unknown (if * these things ever start being maintained in this driver) */ st->fileno = st->blkno = -1; return error; } /* * Look at the returned sense and act on the error and determine * the unix error number to pass back..., 0 (== report no error), * -1 = retry the operation, -2 continue error processing. */ static int st_interpret_sense(struct scsipi_xfer *xs) { struct scsipi_periph *periph = xs->xs_periph; struct scsi_sense_data *sense = &xs->sense.scsi_sense; struct buf *bp = xs->bp; struct st_softc *st = device_private(periph->periph_dev); int retval = EJUSTRETURN; int doprint = ((xs->xs_control & XS_CTL_SILENT) == 0); uint8_t key; int32_t info; /* * If it isn't a extended or extended/deferred error, let * the generic code handle it. */ if (SSD_RCODE(sense->response_code) != SSD_RCODE_CURRENT && SSD_RCODE(sense->response_code) != SSD_RCODE_DEFERRED) return retval; if (sense->response_code & SSD_RCODE_VALID) info = _4btol(sense->info); else info = (st->flags & ST_FIXEDBLOCKS) ? xs->datalen / st->blksize : xs->datalen; key = SSD_SENSE_KEY(sense->flags); st->mt_erreg = key; st->asc = sense->asc; st->ascq = sense->ascq; st->mt_resid = (short) info; if (key == SKEY_NOT_READY && st->asc == 0x4 && st->ascq == 0x1) { /* Not Ready, Logical Unit Is in Process Of Becoming Ready */ if (!callout_pending(&periph->periph_callout)) scsipi_periph_freeze(periph, 1); callout_reset(&periph->periph_callout, hz, scsipi_periph_timed_thaw, periph); return ERESTART; } /* If the device is not open yet, let generic handle */ if ((periph->periph_flags & PERIPH_OPEN) == 0) return retval; xs->resid = info; if (st->flags & ST_FIXEDBLOCKS) { if (bp) { xs->resid *= st->blksize; st->last_io_resid = xs->resid; } else st->last_ctl_resid = xs->resid; if (key == SKEY_VOLUME_OVERFLOW) { st->flags |= ST_EIO_PENDING; if (bp) bp->b_resid = xs->resid; } else if (sense->flags & SSD_EOM) { if ((st->flags & ST_EARLYWARN) == 0) st->flags |= ST_EIO_PENDING; st->flags |= ST_EOM_PENDING; if (bp) { #if 0 bp->b_resid = xs->resid; #else /* * Grotesque as it seems, the few times * I've actually seen a non-zero resid, * the tape drive actually lied and had * written all the data! */ bp->b_resid = 0; #endif } } if (sense->flags & SSD_FILEMARK) { st->flags |= ST_AT_FILEMARK; if (bp) bp->b_resid = xs->resid; if (st->fileno != (daddr_t) -1) { st->fileno++; st->blkno = 0; st->flags |= ST_POSUPDATED; } } if (sense->flags & SSD_ILI) { st->flags |= ST_EIO_PENDING; if (bp) bp->b_resid = xs->resid; if (sense->response_code & SSD_RCODE_VALID && (xs->xs_control & XS_CTL_SILENT) == 0) aprint_error_dev(st->sc_dev, "block wrong size, %d blocks residual\n", info); /* * This quirk code helps the drive read * the first tape block, regardless of * format. That is required for these * drives to return proper MODE SENSE * information. 
*/ if ((st->quirks & ST_Q_SENSE_HELP) && (periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) st->blksize -= 512; else if ((st->flags & ST_POSUPDATED) == 0) { if (st->blkno != (daddr_t) -1) { st->blkno += (xs->datalen / st->blksize); st->flags |= ST_POSUPDATED; } } } /* * If data wanted and no data was transferred, do it immediately */ if (xs->datalen && xs->resid >= xs->datalen) { if (st->flags & ST_EIO_PENDING) return EIO; if (st->flags & ST_AT_FILEMARK) { if (bp) bp->b_resid = xs->resid; return 0; } } } else { /* must be variable mode */ if (bp) st->last_io_resid = xs->resid; else st->last_ctl_resid = xs->resid; if (sense->flags & SSD_EOM) { /* * The current semantics of this * driver requires EOM detection * to return EIO unless early * warning detection is enabled * for variable mode (this is always * on for fixed block mode). */ if (st->flags & ST_EARLYWARN) { st->flags |= ST_EOM_PENDING; retval = 0; } else { retval = EIO; /* * If we return an error we can't claim to * have transferred all data. */ if (xs->resid == 0) xs->resid = xs->datalen; } /* * If it's an unadorned EOM detection, * suppress printing an error. */ if (key == SKEY_NO_SENSE) { doprint = 0; } } else if (sense->flags & SSD_FILEMARK) { retval = 0; if (st->fileno != (daddr_t) -1) { st->fileno++; st->blkno = 0; st->flags |= ST_POSUPDATED; } } else if (sense->flags & SSD_ILI) { if (info < 0) { /* * The tape record was bigger than the read * we issued. */ if ((xs->xs_control & XS_CTL_SILENT) == 0) { aprint_error_dev(st->sc_dev, "%d-byte tape record too big" " for %d-byte user buffer\n", xs->datalen - info, xs->datalen); } retval = EIO; } else { retval = 0; if (st->blkno != (daddr_t) -1) { st->blkno++; st->flags |= ST_POSUPDATED; } } } if (bp) bp->b_resid = xs->resid; } #ifndef SCSIPI_DEBUG if (retval == 0 && key == SKEY_NO_SENSE) doprint = 0; #endif if (key == SKEY_BLANK_CHECK) { /* * This quirk code helps the drive read the * first tape block, regardless of format. That * is required for these drives to return proper * MODE SENSE information. */ if ((st->quirks & ST_Q_SENSE_HELP) && (periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) { /* still starting */ st->blksize -= 512; } else if (!(st->flags & (ST_2FM_AT_EOD | ST_BLANK_READ))) { st->flags |= ST_BLANK_READ; xs->resid = xs->datalen; if (bp) { bp->b_resid = xs->resid; /* return an EOF */ } retval = 0; /* lost position */ st->fileno = st->blkno = -1; } } /* * If generic sense processing will continue, we should not * print sense info here. */ if (retval == EJUSTRETURN) doprint = 0; if (doprint) { /* Print verbose sense info if possible */ if (scsipi_print_sense(xs, 0) != 0) return retval; /* Print less-verbose sense info */ scsipi_printaddr(periph); printf("Sense Key 0x%02x", key); if ((sense->response_code & SSD_RCODE_VALID) != 0) { switch (key) { case SKEY_NOT_READY: case SKEY_ILLEGAL_REQUEST: case SKEY_UNIT_ATTENTION: case SKEY_DATA_PROTECT: break; case SKEY_VOLUME_OVERFLOW: case SKEY_BLANK_CHECK: printf(", requested size: %d (decimal)", info); break; case SKEY_ABORTED_COMMAND: if (xs->xs_retries) printf(", retrying"); printf(", cmd 0x%x, info 0x%x", xs->cmd->opcode, info); break; default: printf(", info = %d (decimal)", info); } } if (sense->extra_len != 0) { int n; printf(", data ="); for (n = 0; n < sense->extra_len; n++) printf(" %02x", sense->csi[n]); } printf("\n"); } return retval; } /* * The quirk here is that the drive returns some value to st_mode_sense * incorrectly until the tape has actually passed by the head. 
* * The method is to set the drive to large fixed-block state (user-specified * density and 1024-byte blocks), then read and rewind to get it to sense the * tape. If that doesn't work, try 512-byte fixed blocks. If that doesn't * work, as a last resort, try variable- length blocks. The result will be * the ability to do an accurate st_mode_sense. * * We know we can do a rewind because we just did a load, which implies rewind. * Rewind seems preferable to space backward if we have a virgin tape. * * The rest of the code for this quirk is in ILI processing and BLANK CHECK * error processing, both part of st_interpret_sense. */ static int st_touch_tape(struct st_softc *st) { char *bf; int readsize; int error; bf = malloc(1024, M_TEMP, M_WAITOK); if ((error = st->ops(st, ST_OPS_MODESENSE, 0)) != 0) goto bad; /* * If the block size is already known from the * sense data, use it. Else start probing at 1024. */ if (st->media_blksize > 0) st->blksize = st->media_blksize; else st->blksize = 1024; do { switch (st->blksize) { case 512: case 1024: readsize = st->blksize; st->flags |= ST_FIXEDBLOCKS; break; default: readsize = 1; st->flags &= ~ST_FIXEDBLOCKS; } if ((error = st->ops(st, ST_OPS_MODESELECT, XS_CTL_SILENT)) != 0) { /* * The device did not agree with the proposed * block size. If we exhausted our options, * return failure, else try another. */ if (readsize == 1) goto bad; st->blksize -= 512; continue; } st_read(st, bf, readsize, XS_CTL_SILENT); /* XXX */ if ((error = st_rewind(st, 0, 0)) != 0) { bad: free(bf, M_TEMP); return error; } } while (readsize != 1 && readsize > st->blksize); free(bf, M_TEMP); return 0; } static int stdump(dev_t dev, daddr_t blkno, void *va, size_t size) { /* Not implemented. */ return ENXIO; } /* * Send a filled out parameter structure to the drive to * set it into the desire modes etc. */ int st_mode_select(struct st_softc *st, int flags) { u_int select_len; struct select { struct scsi_mode_parameter_header_6 header; struct scsi_general_block_descriptor blk_desc; u_char sense_data[MAX_PAGE_0_SIZE]; } select; struct scsipi_periph *periph = st->sc_periph; select_len = sizeof(select.header) + sizeof(select.blk_desc) + st->page_0_size; /* * This quirk deals with drives that have only one valid mode * and think this gives them license to reject all mode selects, * even if the selected mode is the one that is supported. */ if (st->quirks & ST_Q_UNIMODAL) { SC_DEBUG(periph, SCSIPI_DB3, ("not setting density 0x%x blksize 0x%x\n", st->density, st->blksize)); return 0; } /* Set up for a mode select */ memset(&select, 0, sizeof(select)); select.header.blk_desc_len = sizeof(struct scsi_general_block_descriptor); select.header.dev_spec &= ~SMH_DSP_BUFF_MODE; select.blk_desc.density = st->density; if (st->flags & ST_DONTBUFFER) select.header.dev_spec |= SMH_DSP_BUFF_MODE_OFF; else select.header.dev_spec |= SMH_DSP_BUFF_MODE_ON; if (st->flags & ST_FIXEDBLOCKS) _lto3b(st->blksize, select.blk_desc.blklen); if (st->page_0_size) memcpy(select.sense_data, st->sense_data, st->page_0_size); /* do the command */ return scsipi_mode_select(periph, 0, &select.header, select_len, flags, ST_RETRIES, ST_CTL_TIME); }
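/*
 * Illustrative sketch, not part of st(4): the code below restates, in plain
 * user-space C, two of the denser pieces of logic above -- the fixed-vs-
 * variable block decision made by st_decide_mode(), and the way the
 * READ/WRITE CDB length field is filled in ststart1()/st_read().  All ex_*
 * names are hypothetical, the density switch is folded into an enum so no
 * SCSI density codes are asserted here, and DEF_FIXED_BSIZE is assumed to
 * be 512 bytes as in the driver.
 */
#include <stdbool.h>
#include <stddef.h>

#define EX_DEF_FIXED_BSIZE 512  /* assumed to match DEF_FIXED_BSIZE */

enum ex_density_hint {
    EX_DENSITY_NO_HINT,            /* density code says nothing */
    EX_DENSITY_SUGGESTS_VARIABLE,  /* half-inch reel, DDS */
    EX_DENSITY_SUGGESTS_FIXED      /* QIC cartridge formats */
};

struct ex_tape {
    int blkmin, blkmax;            /* limits from READ BLOCK LIMITS */
    int media_blksize;             /* block size seen by MODE SENSE */
    bool blksize_quirk;            /* analogue of ST_Q_BLKSIZE */
    enum ex_density_hint hint;
};

/*
 * Mirror of st_decide_mode(): returns true for fixed-block mode and stores
 * the block size to use (0 means variable-length records).
 */
static bool
ex_decide_mode(const struct ex_tape *t, bool first_read, int *blksize)
{
    /* 1. The drive handles exactly one fixed size: just use it. */
    if (t->blkmin != 0 && t->blkmin == t->blkmax) {
        *blksize = t->blkmin;
        return true;
    }
    /* 2. Let the density mandate (or suggest) a mode. */
    if (t->hint == EX_DENSITY_SUGGESTS_VARIABLE) {
        *blksize = 0;
        return false;
    }
    if (t->hint == EX_DENSITY_SUGGESTS_FIXED) {
        *blksize = t->media_blksize > 0 ?
            t->media_blksize : EX_DEF_FIXED_BSIZE;
        return true;
    }
    /* 3. About to read: trust what the drive found on the tape. */
    if (first_read &&
        (!t->blksize_quirk ||
         t->media_blksize == 0 ||
         t->media_blksize == EX_DEF_FIXED_BSIZE ||
         t->media_blksize == 1024)) {
        *blksize = t->media_blksize;
        return t->media_blksize > 0;
    }
    /* 4. No hints from any direction: default to variable mode. */
    *blksize = 0;
    return false;
}

/*
 * The READ/WRITE CDBs built above carry a 24-bit big-endian length field:
 * a block count when SRW_FIXED is set, a byte count otherwise.  ex_lto3b()
 * mimics the effect of the driver's _lto3b() helper.
 */
static void
ex_lto3b(unsigned long v, unsigned char *b)
{
    b[0] = (v >> 16) & 0xff;
    b[1] = (v >> 8) & 0xff;
    b[2] = v & 0xff;
}

static void
ex_fill_rw_len(unsigned char len3[3], bool fixed, int blksize, size_t bcount)
{
    if (fixed)
        ex_lto3b(bcount / (size_t)blksize, len3);  /* block count */
    else
        ex_lto3b(bcount, len3);                    /* byte count */
}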
/* $NetBSD: kern_module_vfs.c,v 1.18 2021/06/29 22:40:53 dholland Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software developed for The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Kernel module file system interaction.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_module_vfs.c,v 1.18 2021/06/29 22:40:53 dholland Exp $"); #define _MODULE_INTERNAL #include <sys/param.h> #include <sys/fcntl.h> #include <sys/kmem.h> #include <sys/kobj.h> #include <sys/module.h> #include <sys/namei.h> #include <sys/pool.h> #include <sys/stat.h> #include <sys/vnode.h> #include <prop/proplib.h> static int module_load_plist_vfs(const char *, const bool, prop_dictionary_t *); void module_load_vfs_init(void) { module_load_vfs_vec = module_load_vfs; aprint_normal("kern.module.path=%s\n", module_base); } int module_load_vfs(const char *name, int flags, bool autoload, module_t *mod, prop_dictionary_t *filedictp) { char *path; bool nochroot; int error; prop_bool_t noload; prop_dictionary_t moduledict; nochroot = false; error = 0; path = NULL; moduledict = NULL; if (filedictp) *filedictp = NULL; path = PNBUF_GET(); if (!autoload) { if (strchr(name, '/') != NULL) { nochroot = false; snprintf(path, MAXPATHLEN, "%s", name); module_print("Loading module from %s", path); error = kobj_load_vfs(&mod->mod_kobj, path, nochroot); } else error = ENOENT; } if (autoload || (error == ENOENT)) { if (strchr(name, '/') == NULL) { nochroot = true; snprintf(path, MAXPATHLEN, "%s/%s/%s.kmod", module_base, name, name); module_print("Loading module from %s", path); error = kobj_load_vfs(&mod->mod_kobj, path, nochroot); } else error = ENOENT; } if (error != 0) { PNBUF_PUT(path); module_print("Cannot %sload kernel object `%s'" " error=%d", autoload ? "auto" : "", name, error); return error; } /* * Load and process <module>.plist if it exists. */ if ((!ISSET(flags, MODCTL_NO_PROP) && filedictp) || autoload) { error = module_load_plist_vfs(path, nochroot, &moduledict); if (error != 0) { module_print("plist load returned error %d for `%s'", error, path); if (error != ENOENT) goto fail; } else if (autoload) { noload = prop_dictionary_get(moduledict, "noautoload"); if (noload != NULL && prop_bool_true(noload)) { module_error("autoloading is disallowed for %s", path); prop_object_release(moduledict); error = EPERM; goto fail; } } if (error == 0) { /* can get here if error == ENOENT */ if (!ISSET(flags, MODCTL_NO_PROP) && filedictp) *filedictp = moduledict; else prop_object_release(moduledict); } } PNBUF_PUT(path); return 0; fail: kobj_unload(mod->mod_kobj); PNBUF_PUT(path); return error; } /* * module_load_plist_vfs: * * Load a plist located in the file system into memory. */ static int module_load_plist_vfs(const char *modpath, const bool nochroot, prop_dictionary_t *filedictp) { struct pathbuf *pb; struct vnode *vp; struct stat sb; void *base; char *proppath; const size_t plistsize = 8192; size_t resid; int error, pathlen; KASSERT(filedictp != NULL); base = NULL; proppath = PNBUF_GET(); strlcpy(proppath, modpath, MAXPATHLEN); pathlen = strlen(proppath); if ((pathlen >= 6) && (strcmp(&proppath[pathlen - 5], ".kmod") == 0)) { strcpy(&proppath[pathlen - 5], ".plist"); } else if (pathlen < MAXPATHLEN - 6) { strcat(proppath, ".plist"); } else { error = ENOENT; goto out1; } /* XXX this makes an unnecessary extra copy of the path */ pb = pathbuf_create(proppath); if (pb == NULL) { error = ENOMEM; goto out1; } module_print("Loading plist from %s", proppath); error = vn_open(NULL, pb, (nochroot ? 
NOCHROOT : 0), FREAD, 0, &vp, NULL, NULL); if (error != 0) { goto out2; } error = vn_stat(vp, &sb); if (error != 0) { goto out3; } if (sb.st_size >= (plistsize - 1)) { /* leave space for term \0 */ error = EFBIG; goto out3; } base = kmem_alloc(plistsize, KM_SLEEP); error = vn_rdwr(UIO_READ, vp, base, sb.st_size, 0, UIO_SYSSPACE, IO_NODELOCKED, curlwp->l_cred, &resid, curlwp); *((uint8_t *)base + sb.st_size) = '\0'; if (error == 0 && resid != 0) { error = EFBIG; } if (error != 0) { kmem_free(base, plistsize); base = NULL; goto out3; } *filedictp = prop_dictionary_internalize(base); if (*filedictp == NULL) { error = EINVAL; } kmem_free(base, plistsize); base = NULL; KASSERT(error == 0); out3: VOP_UNLOCK(vp); vn_close(vp, FREAD, kauth_cred_get()); out2: pathbuf_destroy(pb); out1: PNBUF_PUT(proppath); return error; }
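/*
 * Illustrative sketch, not part of the kernel: a user-space restatement of
 * the path handling above -- module_load_vfs() builds
 * "<module_base>/<name>/<name>.kmod" for a bare module name, and
 * module_load_plist_vfs() swaps a trailing ".kmod" for ".plist".  The ex_*
 * names, the buffer size and the example base directory are hypothetical.
 */
#include <stdio.h>
#include <string.h>

#define EX_MAXPATHLEN 1024  /* stand-in for MAXPATHLEN */

/* "tun" -> "<base>/tun/tun.kmod"; a name containing '/' is used verbatim. */
static void
ex_module_path(char *buf, size_t len, const char *base, const char *name)
{
    if (strchr(name, '/') != NULL)
        snprintf(buf, len, "%s", name);
    else
        snprintf(buf, len, "%s/%s/%s.kmod", base, name, name);
}

/* "<path>.kmod" -> "<path>.plist"; any other path just gets ".plist". */
static int
ex_plist_path(char *buf, size_t len, const char *modpath)
{
    size_t n = strlen(modpath);

    if (n >= 5 && strcmp(modpath + n - 5, ".kmod") == 0) {
        if (n + 2 > len)  /* result is one byte longer, plus NUL */
            return -1;
        memcpy(buf, modpath, n - 5);
        strcpy(buf + n - 5, ".plist");
        return 0;
    }
    if (n + sizeof(".plist") > len)
        return -1;
    snprintf(buf, len, "%s.plist", modpath);
    return 0;
}

int
main(void)
{
    char kmod[EX_MAXPATHLEN], plist[EX_MAXPATHLEN];

    ex_module_path(kmod, sizeof(kmod), "/stand/amd64/10.0/modules", "tun");
    if (ex_plist_path(plist, sizeof(plist), kmod) == 0)
        printf("%s\n%s\n", kmod, plist);  /* .../tun/tun.kmod, .../tun/tun.plist */
    return 0;
}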
/* $NetBSD: if_tun.c,v 1.175 2024/03/09 13:55:27 riastradh Exp $ */ /* * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has its * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/poll mode of * operation though. */ /* * tun - tunnel software network interface. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: if_tun.c,v 1.175 2024/03/09 13:55:27 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #endif #include <sys/param.h> #include <sys/buf.h> #include <sys/conf.h> #include <sys/cpu.h> #include <sys/device.h> #include <sys/file.h> #include <sys/ioctl.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/lwp.h> #include <sys/mbuf.h> #include <sys/module.h> #include <sys/mutex.h> #include <sys/poll.h> #include <sys/select.h> #include <sys/signalvar.h> #include <sys/socket.h> #include <net/bpf.h> #include <net/if.h> #include <net/if_types.h> #include <net/route.h> #ifdef INET #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/if_inarp.h> #endif #include <net/if_tun.h> #include "ioconf.h" #define TUNDEBUG if (tundebug) printf int tundebug = 0; extern int ifqmaxlen; static LIST_HEAD(, tun_softc) tun_softc_list; static LIST_HEAD(, tun_softc) tunz_softc_list; static kmutex_t tun_softc_lock; static int tun_ioctl(struct ifnet *, u_long, void *); static int tun_output(struct ifnet *, struct mbuf *, const struct sockaddr *, const struct rtentry *rt); static int tun_clone_create(struct if_clone *, int); static int tun_clone_destroy(struct ifnet *); static struct if_clone tun_cloner = IF_CLONE_INITIALIZER("tun", tun_clone_create, tun_clone_destroy); static void tunattach0(struct tun_softc *); static void tun_enable(struct tun_softc *, const struct ifaddr *); static void tun_i_softintr(void *); static void tun_o_softintr(void *); #ifdef ALTQ static void tunstart(struct ifnet *); #endif static struct tun_softc *tun_find_unit(dev_t); static struct tun_softc *tun_find_zunit(int); static dev_type_open(tunopen); static dev_type_close(tunclose); static dev_type_read(tunread);
static dev_type_write(tunwrite); static dev_type_ioctl(tunioctl); static dev_type_poll(tunpoll); static dev_type_kqfilter(tunkqfilter); const struct cdevsw tun_cdevsw = { .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_stop = nostop, .d_tty = notty, .d_poll = tunpoll, .d_mmap = nommap, .d_kqfilter = tunkqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE }; #ifdef _MODULE devmajor_t tun_bmajor = -1, tun_cmajor = -1; #endif void tunattach(int unused) { /* * Nothing to do here, initialization is handled by the * module initialization code in tuninit() below). */ } static void tuninit(void) { mutex_init(&tun_softc_lock, MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&tun_softc_list); LIST_INIT(&tunz_softc_list); if_clone_attach(&tun_cloner); #ifdef _MODULE devsw_attach("tun", NULL, &tun_bmajor, &tun_cdevsw, &tun_cmajor); #endif } static int tundetach(void) { if_clone_detach(&tun_cloner); #ifdef _MODULE devsw_detach(NULL, &tun_cdevsw); #endif if (!LIST_EMPTY(&tun_softc_list) || !LIST_EMPTY(&tunz_softc_list)) { #ifdef _MODULE devsw_attach("tun", NULL, &tun_bmajor, &tun_cdevsw, &tun_cmajor); #endif if_clone_attach(&tun_cloner); return EBUSY; } mutex_destroy(&tun_softc_lock); return 0; } /* * Find driver instance from dev_t. * Returns with tp locked (if found). */ static struct tun_softc * tun_find_unit(dev_t dev) { struct tun_softc *tp; int unit = minor(dev); mutex_enter(&tun_softc_lock); LIST_FOREACH(tp, &tun_softc_list, tun_list) if (unit == tp->tun_unit) break; if (tp) mutex_enter(&tp->tun_lock); mutex_exit(&tun_softc_lock); return tp; } /* * Find zombie driver instance by unit number. * Remove tp from list and return it unlocked (if found). */ static struct tun_softc * tun_find_zunit(int unit) { struct tun_softc *tp; mutex_enter(&tun_softc_lock); LIST_FOREACH(tp, &tunz_softc_list, tun_list) if (unit == tp->tun_unit) break; if (tp) LIST_REMOVE(tp, tun_list); mutex_exit(&tun_softc_lock); KASSERTMSG(!tp || (tp->tun_flags & (TUN_INITED|TUN_OPEN)) == TUN_OPEN, "tun%d: inconsistent flags: %x", unit, tp->tun_flags); return tp; } static void tun_init(struct tun_softc *tp, int unit) { tp->tun_unit = unit; mutex_init(&tp->tun_lock, MUTEX_DEFAULT, IPL_SOFTNET); cv_init(&tp->tun_cv, "tunread"); selinit(&tp->tun_rsel); selinit(&tp->tun_wsel); tp->tun_osih = softint_establish(SOFTINT_CLOCK, tun_o_softintr, tp); tp->tun_isih = softint_establish(SOFTINT_CLOCK, tun_i_softintr, tp); } static void tun_fini(struct tun_softc *tp) { softint_disestablish(tp->tun_isih); softint_disestablish(tp->tun_osih); seldestroy(&tp->tun_wsel); seldestroy(&tp->tun_rsel); mutex_destroy(&tp->tun_lock); cv_destroy(&tp->tun_cv); } static struct tun_softc * tun_alloc(int unit) { struct tun_softc *tp; tp = kmem_zalloc(sizeof(*tp), KM_SLEEP); tun_init(tp, unit); return tp; } static void tun_recycle(struct tun_softc *tp) { memset(&tp->tun_if, 0, sizeof(struct ifnet)); /* XXX ??? 
*/ } static void tun_free(struct tun_softc *tp) { tun_fini(tp); kmem_free(tp, sizeof(*tp)); } static int tun_clone_create(struct if_clone *ifc, int unit) { struct tun_softc *tp; if ((tp = tun_find_zunit(unit)) == NULL) { tp = tun_alloc(unit); } else { tun_recycle(tp); } if_initname(&tp->tun_if, ifc->ifc_name, unit); tunattach0(tp); tp->tun_flags |= TUN_INITED; mutex_enter(&tun_softc_lock); LIST_INSERT_HEAD(&tun_softc_list, tp, tun_list); mutex_exit(&tun_softc_lock); return 0; } static void tunattach0(struct tun_softc *tp) { struct ifnet *ifp; ifp = &tp->tun_if; ifp->if_softc = tp; ifp->if_mtu = TUNMTU; ifp->if_ioctl = tun_ioctl; ifp->if_output = tun_output; #ifdef ALTQ ifp->if_start = tunstart; #endif ifp->if_flags = IFF_POINTOPOINT; ifp->if_type = IFT_TUNNEL; ifp->if_snd.ifq_maxlen = ifqmaxlen; ifp->if_dlt = DLT_NULL; IFQ_SET_READY(&ifp->if_snd); if_attach(ifp); ifp->if_link_state = LINK_STATE_DOWN; if_alloc_sadl(ifp); bpf_attach(ifp, DLT_NULL, sizeof(uint32_t)); } static int tun_clone_destroy(struct ifnet *ifp) { struct tun_softc *tp = (void *)ifp; bool zombie = false; IF_PURGE(&ifp->if_snd); ifp->if_flags &= ~IFF_RUNNING; mutex_enter(&tun_softc_lock); mutex_enter(&tp->tun_lock); LIST_REMOVE(tp, tun_list); if (tp->tun_flags & TUN_OPEN) { /* Hang on to storage until last close. */ tp->tun_flags &= ~TUN_INITED; LIST_INSERT_HEAD(&tunz_softc_list, tp, tun_list); zombie = true; } mutex_exit(&tun_softc_lock); cv_broadcast(&tp->tun_cv); if (tp->tun_flags & TUN_ASYNC && tp->tun_pgid) fownsignal(tp->tun_pgid, SIGIO, POLL_HUP, 0, NULL); selnotify(&tp->tun_rsel, 0, NOTE_SUBMIT); mutex_exit(&tp->tun_lock); bpf_detach(ifp); if_detach(ifp); if (!zombie) { tun_free(tp); } return 0; } /* * tunnel open - must be superuser & the device must be * configured in */ static int tunopen(dev_t dev, int flag, int mode, struct lwp *l) { struct ifnet *ifp; struct tun_softc *tp; int error; error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_INTERFACE_TUN, KAUTH_REQ_NETWORK_INTERFACE_TUN_ADD, NULL, NULL, NULL); if (error) return error; tp = tun_find_unit(dev); if (tp == NULL) { (void)tun_clone_create(&tun_cloner, minor(dev)); tp = tun_find_unit(dev); if (tp == NULL) { return ENXIO; } } if (tp->tun_flags & TUN_OPEN) { mutex_exit(&tp->tun_lock); return EBUSY; } ifp = &tp->tun_if; tp->tun_flags |= TUN_OPEN; TUNDEBUG("%s: open\n", ifp->if_xname); if_link_state_change(ifp, LINK_STATE_UP); mutex_exit(&tp->tun_lock); return error; } /* * tunclose - close the device - mark i/f down & delete * routing info */ int tunclose(dev_t dev, int flag, int mode, struct lwp *l) { struct tun_softc *tp; struct ifnet *ifp; if ((tp = tun_find_zunit(minor(dev))) != NULL) { /* interface was "destroyed" before the close */ tun_free(tp); return 0; } if ((tp = tun_find_unit(dev)) == NULL) goto out_nolock; ifp = &tp->tun_if; tp->tun_flags &= ~TUN_OPEN; tp->tun_pgid = 0; selnotify(&tp->tun_rsel, 0, NOTE_SUBMIT); TUNDEBUG ("%s: closed\n", ifp->if_xname); mutex_exit(&tp->tun_lock); /* * junk all pending output */ IFQ_PURGE(&ifp->if_snd); if (ifp->if_flags & IFF_UP) { if_down(ifp); if (ifp->if_flags & IFF_RUNNING) { /* find internet addresses and delete routes */ struct ifaddr *ifa; IFADDR_READER_FOREACH(ifa, ifp) { #if defined(INET) || defined(INET6) if (ifa->ifa_addr->sa_family == AF_INET || ifa->ifa_addr->sa_family == AF_INET6) { rtinit(ifa, (int)RTM_DELETE, tp->tun_flags & TUN_DSTADDR ? 
RTF_HOST : 0); } #endif } } } if_link_state_change(ifp, LINK_STATE_DOWN); out_nolock: return 0; } static void tun_enable(struct tun_softc *tp, const struct ifaddr *ifa) { struct ifnet *ifp = &tp->tun_if; TUNDEBUG("%s: %s\n", __func__, ifp->if_xname); mutex_enter(&tp->tun_lock); tp->tun_flags &= ~(TUN_IASET|TUN_DSTADDR); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = satosin(ifa->ifa_addr); if (sin && sin->sin_addr.s_addr) tp->tun_flags |= TUN_IASET; if (ifp->if_flags & IFF_POINTOPOINT) { sin = satosin(ifa->ifa_dstaddr); if (sin && sin->sin_addr.s_addr) tp->tun_flags |= TUN_DSTADDR; } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin; sin = satosin6(ifa->ifa_addr); if (!IN6_IS_ADDR_UNSPECIFIED(&sin->sin6_addr)) tp->tun_flags |= TUN_IASET; if (ifp->if_flags & IFF_POINTOPOINT) { sin = satosin6(ifa->ifa_dstaddr); if (sin && !IN6_IS_ADDR_UNSPECIFIED(&sin->sin6_addr)) tp->tun_flags |= TUN_DSTADDR; } else tp->tun_flags &= ~TUN_DSTADDR; break; } #endif /* INET6 */ default: break; } ifp->if_flags |= IFF_UP | IFF_RUNNING; mutex_exit(&tp->tun_lock); } /* * Process an ioctl request. */ static int tun_ioctl(struct ifnet *ifp, u_long cmd, void *data) { struct tun_softc *tp = (struct tun_softc *)(ifp->if_softc); struct ifreq *ifr = (struct ifreq *)data; struct ifaddr *ifa = (struct ifaddr *)data; int error = 0; switch (cmd) { case SIOCINITIFADDR: tun_enable(tp, ifa); ifa->ifa_rtrequest = p2p_rtrequest; TUNDEBUG("%s: address set\n", ifp->if_xname); break; case SIOCSIFBRDADDR: TUNDEBUG("%s: broadcast address set\n", ifp->if_xname); break; case SIOCSIFMTU: if (ifr->ifr_mtu > TUNMTU || ifr->ifr_mtu < 576) { error = EINVAL; break; } TUNDEBUG("%s: interface mtu set\n", ifp->if_xname); if ((error = ifioctl_common(ifp, cmd, data)) == ENETRESET) error = 0; break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == NULL) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifreq_getaddr(cmd, ifr)->sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; default: error = ifioctl_common(ifp, cmd, data); } return error; } /* * tun_output - queue packets from higher level ready to put out. */ static int tun_output(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, const struct rtentry *rt) { struct tun_softc *tp = ifp->if_softc; int error; #if defined(INET) || defined(INET6) int mlen; uint32_t *af; #endif mutex_enter(&tp->tun_lock); TUNDEBUG ("%s: tun_output\n", ifp->if_xname); if ((tp->tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG ("%s: not ready 0%o\n", ifp->if_xname, tp->tun_flags); error = EHOSTDOWN; mutex_exit(&tp->tun_lock); goto out; } // XXXrmind mutex_exit(&tp->tun_lock); /* * if the queueing discipline needs packet classification, * do it before prepending link headers. 
*/ IFQ_CLASSIFY(&ifp->if_snd, m0, dst->sa_family); bpf_mtap_af(ifp, dst->sa_family, m0, BPF_D_OUT); if ((error = pfil_run_hooks(ifp->if_pfil, &m0, ifp, PFIL_OUT)) != 0) goto out; if (m0 == NULL) goto out; switch(dst->sa_family) { #ifdef INET6 case AF_INET6: #endif #ifdef INET case AF_INET: #endif #if defined(INET) || defined(INET6) if (tp->tun_flags & TUN_PREPADDR) { /* Simple link-layer header */ M_PREPEND(m0, dst->sa_len, M_DONTWAIT); if (m0 == NULL) { IF_DROP(&ifp->if_snd); error = ENOBUFS; goto out; } memcpy(mtod(m0, char *), dst, dst->sa_len); } else if (tp->tun_flags & TUN_IFHEAD) { /* Prepend the address family */ M_PREPEND(m0, sizeof(*af), M_DONTWAIT); if (m0 == NULL) { IF_DROP(&ifp->if_snd); error = ENOBUFS; goto out; } af = mtod(m0,uint32_t *); *af = htonl(dst->sa_family); } else { #ifdef INET if (dst->sa_family != AF_INET) #endif { error = EAFNOSUPPORT; goto out; } } /* FALLTHROUGH */ case AF_UNSPEC: mlen = m0->m_pkthdr.len; IFQ_ENQUEUE(&ifp->if_snd, m0, error); if (error) { if_statinc(ifp, if_collisions); error = EAFNOSUPPORT; m0 = NULL; goto out; } if_statadd2(ifp, if_opackets, 1, if_obytes, mlen); break; #endif default: error = EAFNOSUPPORT; goto out; } mutex_enter(&tp->tun_lock); cv_broadcast(&tp->tun_cv); if (tp->tun_flags & TUN_ASYNC && tp->tun_pgid) softint_schedule(tp->tun_isih); selnotify(&tp->tun_rsel, 0, NOTE_SUBMIT); mutex_exit(&tp->tun_lock); out: if (error && m0) m_freem(m0); return error; } static void tun_i_softintr(void *cookie) { struct tun_softc *tp = cookie; if (tp->tun_flags & TUN_ASYNC && tp->tun_pgid) fownsignal(tp->tun_pgid, SIGIO, POLL_IN, POLLIN|POLLRDNORM, NULL); } static void tun_o_softintr(void *cookie) { struct tun_softc *tp = cookie; if (tp->tun_flags & TUN_ASYNC && tp->tun_pgid) fownsignal(tp->tun_pgid, SIGIO, POLL_OUT, POLLOUT|POLLWRNORM, NULL); } /* * the cdevsw interface is now pretty minimal. 
*/ int tunioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { struct tun_softc *tp; int error = 0; tp = tun_find_unit(dev); /* interface was "destroyed" already */ if (tp == NULL) { return ENXIO; } switch (cmd) { case TUNSDEBUG: tundebug = *(int *)data; break; case TUNGDEBUG: *(int *)data = tundebug; break; case TUNSIFMODE: switch (*(int *)data & (IFF_POINTOPOINT|IFF_BROADCAST)) { case IFF_POINTOPOINT: case IFF_BROADCAST: if (tp->tun_if.if_flags & IFF_UP) { error = EBUSY; goto out; } tp->tun_if.if_flags &= ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); tp->tun_if.if_flags |= *(int *)data; break; default: error = EINVAL; goto out; } break; case TUNSLMODE: if (*(int *)data) { tp->tun_flags |= TUN_PREPADDR; tp->tun_flags &= ~TUN_IFHEAD; } else tp->tun_flags &= ~TUN_PREPADDR; break; case TUNSIFHEAD: if (*(int *)data) { tp->tun_flags |= TUN_IFHEAD; tp->tun_flags &= ~TUN_PREPADDR; } else tp->tun_flags &= ~TUN_IFHEAD; break; case TUNGIFHEAD: *(int *)data = (tp->tun_flags & TUN_IFHEAD); break; case FIONBIO: if (*(int *)data) tp->tun_flags |= TUN_NBIO; else tp->tun_flags &= ~TUN_NBIO; break; case FIOASYNC: if (*(int *)data) tp->tun_flags |= TUN_ASYNC; else tp->tun_flags &= ~TUN_ASYNC; break; case FIONREAD: if (tp->tun_if.if_snd.ifq_head) *(int *)data = tp->tun_if.if_snd.ifq_head->m_pkthdr.len; else *(int *)data = 0; break; case TIOCSPGRP: case FIOSETOWN: error = fsetown(&tp->tun_pgid, cmd, data); break; case TIOCGPGRP: case FIOGETOWN: error = fgetown(tp->tun_pgid, cmd, data); break; default: error = ENOTTY; } out: mutex_exit(&tp->tun_lock); return error; } /* * The cdevsw read interface - reads a packet at a time, or at * least as much of a packet as can be read. */ int tunread(dev_t dev, struct uio *uio, int ioflag) { struct tun_softc *tp; struct ifnet *ifp; struct mbuf *m, *m0; int error = 0, len; tp = tun_find_unit(dev); /* interface was "destroyed" already */ if (tp == NULL) { return ENXIO; } ifp = &tp->tun_if; TUNDEBUG ("%s: read\n", ifp->if_xname); if ((tp->tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG ("%s: not ready 0%o\n", ifp->if_xname, tp->tun_flags); error = EHOSTDOWN; goto out; } do { IFQ_DEQUEUE(&ifp->if_snd, m0); if (m0 == 0) { if (tp->tun_flags & TUN_NBIO) { error = EWOULDBLOCK; goto out; } if (cv_wait_sig(&tp->tun_cv, &tp->tun_lock)) { error = EINTR; goto out; } } } while (m0 == 0); mutex_exit(&tp->tun_lock); /* Copy the mbuf chain */ while (m0 && uio->uio_resid > 0 && error == 0) { len = uimin(uio->uio_resid, m0->m_len); if (len != 0) error = uiomove(mtod(m0, void *), len, uio); m0 = m = m_free(m0); } if (m0) { TUNDEBUG("Dropping mbuf\n"); m_freem(m0); } if (error) if_statinc(ifp, if_ierrors); return error; out: mutex_exit(&tp->tun_lock); return error; } /* * the cdevsw write interface - an atomic write is a packet - or else! */ int tunwrite(dev_t dev, struct uio *uio, int ioflag) { struct tun_softc *tp; struct ifnet *ifp; struct mbuf *top, **mp, *m; pktqueue_t *pktq; struct sockaddr dst; int error = 0, tlen, mlen; uint32_t family; tp = tun_find_unit(dev); if (tp == NULL) { /* Interface was "destroyed" already. */ return ENXIO; } /* Unlock until we've got the data */ mutex_exit(&tp->tun_lock); ifp = &tp->tun_if; TUNDEBUG("%s: tunwrite\n", ifp->if_xname); if (tp->tun_flags & TUN_PREPADDR) { if (uio->uio_resid < sizeof(dst)) { error = EIO; goto out0; } error = uiomove((void *)&dst, sizeof(dst), uio); if (error) goto out0; if (dst.sa_len > sizeof(dst)) { /* Duh.. 
*/ int n = dst.sa_len - sizeof(dst); while (n--) { char discard; error = uiomove(&discard, 1, uio); if (error) { goto out0; } } } } else if (tp->tun_flags & TUN_IFHEAD) { if (uio->uio_resid < sizeof(family)){ error = EIO; goto out0; } error = uiomove((void *)&family, sizeof(family), uio); if (error) goto out0; dst.sa_family = ntohl(family); } else { #ifdef INET dst.sa_family = AF_INET; #endif } if (uio->uio_resid == 0 || uio->uio_resid > TUNMTU) { TUNDEBUG("%s: len=%lu!\n", ifp->if_xname, (unsigned long)uio->uio_resid); error = EIO; goto out0; } switch (dst.sa_family) { #ifdef INET case AF_INET: pktq = ip_pktq; break; #endif #ifdef INET6 case AF_INET6: pktq = ip6_pktq; break; #endif default: error = EAFNOSUPPORT; goto out0; } tlen = uio->uio_resid; /* get a header mbuf */ MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; goto out0; } mlen = MHLEN; top = NULL; mp = &top; while (error == 0 && uio->uio_resid > 0) { m->m_len = uimin(mlen, uio->uio_resid); error = uiomove(mtod(m, void *), m->m_len, uio); *mp = m; mp = &m->m_next; if (error == 0 && uio->uio_resid > 0) { MGET(m, M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } mlen = MLEN; } } if (error) { if (top != NULL) m_freem(top); if_statinc(ifp, if_ierrors); goto out0; } top->m_pkthdr.len = tlen; m_set_rcvif(top, ifp); bpf_mtap_af(ifp, dst.sa_family, top, BPF_D_IN); if ((error = pfil_run_hooks(ifp->if_pfil, &top, ifp, PFIL_IN)) != 0) goto out0; if (top == NULL) goto out0; mutex_enter(&tp->tun_lock); if ((tp->tun_flags & TUN_INITED) == 0) { /* Interface was destroyed */ error = ENXIO; goto out; } kpreempt_disable(); if (__predict_false(!pktq_enqueue(pktq, top, 0))) { kpreempt_enable(); if_statinc(ifp, if_collisions); mutex_exit(&tp->tun_lock); error = ENOBUFS; m_freem(top); goto out0; } kpreempt_enable(); if_statadd2(ifp, if_ipackets, 1, if_ibytes, tlen); out: mutex_exit(&tp->tun_lock); out0: return error; } #ifdef ALTQ /* * Start packet transmission on the interface. * when the interface queue is rate-limited by ALTQ or TBR, * if_start is needed to drain packets from the queue in order * to notify readers when outgoing packets become ready. */ static void tunstart(struct ifnet *ifp) { struct tun_softc *tp = ifp->if_softc; if (!ALTQ_IS_ENABLED(&ifp->if_snd) && !TBR_IS_ENABLED(&ifp->if_snd)) return; mutex_enter(&tp->tun_lock); if (!IF_IS_EMPTY(&ifp->if_snd)) { cv_broadcast(&tp->tun_cv); if (tp->tun_flags & TUN_ASYNC && tp->tun_pgid) softint_schedule(tp->tun_osih); selnotify(&tp->tun_rsel, 0, NOTE_SUBMIT); } mutex_exit(&tp->tun_lock); } #endif /* ALTQ */ /* * tunpoll - the poll interface, this is only useful on reads * really. The write detect always returns true, write never blocks * anyway, it either accepts the packet or drops it. */ int tunpoll(dev_t dev, int events, struct lwp *l) { struct tun_softc *tp; struct ifnet *ifp; int revents = 0; tp = tun_find_unit(dev); if (tp == NULL) { /* Interface was "destroyed" already. 
*/ return 0; } ifp = &tp->tun_if; TUNDEBUG("%s: tunpoll\n", ifp->if_xname); if (events & (POLLIN | POLLRDNORM)) { if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TUNDEBUG("%s: tunpoll q=%d\n", ifp->if_xname, ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); } else { TUNDEBUG("%s: tunpoll waiting\n", ifp->if_xname); selrecord(l, &tp->tun_rsel); } } if (events & (POLLOUT | POLLWRNORM)) revents |= events & (POLLOUT | POLLWRNORM); mutex_exit(&tp->tun_lock); return revents; } static void filt_tunrdetach(struct knote *kn) { struct tun_softc *tp = kn->kn_hook; mutex_enter(&tp->tun_lock); selremove_knote(&tp->tun_rsel, kn); mutex_exit(&tp->tun_lock); } static int filt_tunread(struct knote *kn, long hint) { struct tun_softc *tp = kn->kn_hook; struct ifnet *ifp = &tp->tun_if; struct mbuf *m; int ready; if (hint & NOTE_SUBMIT) KASSERT(mutex_owned(&tp->tun_lock)); else mutex_enter(&tp->tun_lock); IF_POLL(&ifp->if_snd, m); ready = (m != NULL); for (kn->kn_data = 0; m != NULL; m = m->m_next) kn->kn_data += m->m_len; if (hint & NOTE_SUBMIT) KASSERT(mutex_owned(&tp->tun_lock)); else mutex_exit(&tp->tun_lock); return ready; } static const struct filterops tunread_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_tunrdetach, .f_event = filt_tunread, }; int tunkqfilter(dev_t dev, struct knote *kn) { struct tun_softc *tp; int rv = 0; tp = tun_find_unit(dev); if (tp == NULL) goto out_nolock; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &tunread_filtops; kn->kn_hook = tp; selrecord_knote(&tp->tun_rsel, kn); break; case EVFILT_WRITE: kn->kn_fop = &seltrue_filtops; break; default: rv = EINVAL; goto out; } out: mutex_exit(&tp->tun_lock); out_nolock: return rv; } /* * Module infrastructure */ #include "if_module.h" IF_MODULE(MODULE_CLASS_DRIVER, tun, NULL)
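The tun read/write paths above support three framing modes: raw packets, a prepended sockaddr (TUN_PREPADDR), and a prepended 4-byte address family (TUN_IFHEAD). What follows is a minimal userland sketch, illustrative only and not part of the driver, showing how a process might consume the TUN_IFHEAD framing; the device node /dev/tun0 and the buffer size are assumptions, and error handling is reduced to the bare minimum.

/* Illustrative only: read packets carrying the 4-byte AF header (TUN_IFHEAD). */
#include <sys/ioctl.h>
#include <net/if_tun.h>

#include <arpa/inet.h>

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char buf[2048];		/* assumed to exceed the interface MTU */
	uint32_t af;
	ssize_t n;
	int fd, on = 1;

	if ((fd = open("/dev/tun0", O_RDWR)) == -1)	/* assumed device node */
		return 1;
	/* TUNSIFHEAD: the driver prepends htonl(address family) to each packet. */
	if (ioctl(fd, TUNSIFHEAD, &on) == -1)
		return 1;
	/* tunread() hands back one queued packet per read(), AF header first. */
	while ((n = read(fd, buf, sizeof(buf))) > (ssize_t)sizeof(af)) {
		memcpy(&af, buf, sizeof(af));
		printf("packet: af=%u payload=%zd bytes\n",
		    ntohl(af), n - (ssize_t)sizeof(af));
	}
	close(fd);
	return 0;
}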
/* $NetBSD: vfs_init.c,v 1.64 2023/09/23 18:21:11 ad Exp $ */ /*- * Copyright (c) 1998, 2000, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed * to Berkeley by John Heidemann of the UCLA Ficus project. * * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_init.c 8.5 (Berkeley) 5/11/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_init.c,v 1.64 2023/09/23 18:21:11 ad Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/buf.h> #include <sys/dirhash.h> #include <sys/errno.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/module.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/sdt.h> #include <sys/stat.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/ucred.h> #include <sys/vnode.h> #include <sys/vnode_impl.h> #include <miscfs/deadfs/deadfs.h> #include <miscfs/fifofs/fifo.h> #include <miscfs/specfs/specdev.h> /* * Sigh, such primitive tools are these... */ #if 0 #define DODEBUG(A) A #else #define DODEBUG(A) #endif SDT_PROVIDER_DEFINE(vfs); /* * These vnodeopv_descs are listed here because they are not * associated with any particular file system, and thus cannot * be initialized by vfs_attach(). */ const struct vnodeopv_desc * const vfs_special_vnodeopv_descs[] = { &dead_vnodeop_opv_desc, &fifo_vnodeop_opv_desc, &spec_vnodeop_opv_desc, NULL, }; struct vfs_list_head vfs_list = /* vfs list */ LIST_HEAD_INITIALIZER(vfs_list); static kauth_listener_t mount_listener; /* * This code doesn't work if the defn is **vnodop_defns with cc. 
* The problem is because of the compiler sometimes putting in an * extra level of indirection for arrays. It's an interesting * "feature" of C. */ typedef int (*PFI)(void *); /* * A miscellaneous routine. * A generic "default" routine that just returns an error. */ /*ARGSUSED*/ int vn_default_error(void *v) { return (EOPNOTSUPP); } static struct sysctllog *vfs_sysctllog; /* * Top level filesystem related information gathering. */ static void sysctl_vfs_setup(void) { sysctl_createv(&vfs_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "generic", SYSCTL_DESCR("Non-specific vfs related information"), NULL, 0, NULL, 0, CTL_VFS, VFS_GENERIC, CTL_EOL); sysctl_createv(&vfs_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "fstypes", SYSCTL_DESCR("List of file systems present"), sysctl_vfs_generic_fstypes, 0, NULL, 0, CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL); sysctl_createv(&vfs_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "magiclinks", SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"), NULL, 0, &vfs_magiclinks, 0, CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL); sysctl_createv(&vfs_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "timestamp_precision", SYSCTL_DESCR("File timestamp precision"), NULL, 0, &vfs_timestamp_precision, 0, CTL_VFS, VFS_GENERIC, VFS_TIMESTAMP_PRECISION, CTL_EOL); } /* * vfs_init.c * * Allocate and fill in operations vectors. * * An undocumented feature of this approach to defining operations is that * there can be multiple entries in vfs_opv_descs for the same operations * vector. This allows third parties to extend the set of operations * supported by another layer in a binary compatibile way. For example, * assume that NFS needed to be modified to support Ficus. NFS has an entry * (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_entensions) * listing those new operations Ficus adds to NFS, all without modifying the * NFS code. (Of couse, the OTW NFS protocol still needs to be munged, but * that is a(whole)nother story.) This is a feature. */ /* * Init the vector, if it needs it. * Also handle backwards compatibility. */ static void vfs_opv_init_explicit(const struct vnodeopv_desc *vfs_opv_desc) { int (**opv_desc_vector)(void *); const struct vnodeopv_entry_desc *opve_descp; opv_desc_vector = *(vfs_opv_desc->opv_desc_vector_p); for (opve_descp = vfs_opv_desc->opv_desc_ops; opve_descp->opve_op; opve_descp++) { /* * Sanity check: is this operation listed * in the list of operations? We check this * by seeing if its offset is zero. Since * the default routine should always be listed * first, it should be the only one with a zero * offset. Any other operation with a zero * offset is probably not listed in * vfs_op_descs, and so is probably an error. * * A panic here means the layer programmer * has committed the all-too common bug * of adding a new operation to the layer's * list of vnode operations but * not adding the operation to the system-wide * list of supported operations. */ if (opve_descp->opve_op->vdesc_offset == 0 && opve_descp->opve_op->vdesc_offset != VOFFSET(vop_default)) { printf("operation %s not listed in %s.\n", opve_descp->opve_op->vdesc_name, "vfs_op_descs"); panic ("vfs_opv_init: bad operation"); } /* * Fill in this entry. 
*/ opv_desc_vector[opve_descp->opve_op->vdesc_offset] = opve_descp->opve_impl; } } static void vfs_opv_init_default(const struct vnodeopv_desc *vfs_opv_desc) { int j; int (**opv_desc_vector)(void *); opv_desc_vector = *(vfs_opv_desc->opv_desc_vector_p); /* * Force every operations vector to have a default routine. */ if (opv_desc_vector[VOFFSET(vop_default)] == NULL) panic("vfs_opv_init: operation vector without default routine."); for (j = 0; j < VNODE_OPS_COUNT; j++) if (opv_desc_vector[j] == NULL) opv_desc_vector[j] = opv_desc_vector[VOFFSET(vop_default)]; } void vfs_opv_init(const struct vnodeopv_desc * const *vopvdpp) { int (**opv_desc_vector)(void *); int i; /* * Allocate the vectors. */ for (i = 0; vopvdpp[i] != NULL; i++) { opv_desc_vector = kmem_alloc(VNODE_OPS_COUNT * sizeof(PFI), KM_SLEEP); memset(opv_desc_vector, 0, VNODE_OPS_COUNT * sizeof(PFI)); *(vopvdpp[i]->opv_desc_vector_p) = opv_desc_vector; DODEBUG(printf("vector at %p allocated\n", opv_desc_vector_p)); } /* * ...and fill them in. */ for (i = 0; vopvdpp[i] != NULL; i++) vfs_opv_init_explicit(vopvdpp[i]); /* * Finally, go back and replace unfilled routines * with their default. */ for (i = 0; vopvdpp[i] != NULL; i++) vfs_opv_init_default(vopvdpp[i]); } void vfs_opv_free(const struct vnodeopv_desc * const *vopvdpp) { int i; /* * Free the vectors allocated in vfs_opv_init(). */ for (i = 0; vopvdpp[i] != NULL; i++) { kmem_free(*(vopvdpp[i]->opv_desc_vector_p), VNODE_OPS_COUNT * sizeof(PFI)); *(vopvdpp[i]->opv_desc_vector_p) = NULL; } } #ifdef DEBUG static void vfs_op_check(void) { int i; DODEBUG(printf("Vnode_interface_init.\n")); /* * Check offset of each op. */ for (i = 0; vfs_op_descs[i]; i++) { if (vfs_op_descs[i]->vdesc_offset != i) panic("vfs_op_check: vfs_op_desc[] offset mismatch"); } if (i != VNODE_OPS_COUNT) { panic("vfs_op_check: vnode ops count mismatch (%d != %d)", i, VNODE_OPS_COUNT); } DODEBUG(printf ("vfs_opv_numops=%d\n", VNODE_OPS_COUNT)); } #endif /* DEBUG */ /* * Common routine to check if an unprivileged mount is allowed. * * We export just this part (i.e., without the access control) so that if a * secmodel wants to implement finer grained user mounts it can do so without * copying too much code. More elaborate policies (i.e., specific users allowed * to also create devices and/or introduce set-id binaries, or export * file-systems) will require a different implementation. * * This routine is intended to be called from listener context, and as such * does not take credentials as an argument. */ int usermount_common_policy(struct mount *mp, u_long flags) { /* No exporting if unprivileged. */ if (flags & MNT_EXPORTED) return EPERM; /* Must have 'nosuid' and 'nodev'. */ if ((flags & MNT_NODEV) == 0 || (flags & MNT_NOSUID) == 0) return EPERM; /* Retain 'noexec'. 
*/ if ((mp->mnt_flag & MNT_NOEXEC) && (flags & MNT_NOEXEC) == 0) return EPERM; return 0; } static int mount_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; enum kauth_system_req req; result = KAUTH_RESULT_DEFER; req = (enum kauth_system_req)(uintptr_t)(uintptr_t)arg0; if (action != KAUTH_SYSTEM_MOUNT) return result; if (req == KAUTH_REQ_SYSTEM_MOUNT_GET) result = KAUTH_RESULT_ALLOW; else if (req == KAUTH_REQ_SYSTEM_MOUNT_DEVICE) { vnode_t *devvp = arg2; accmode_t accmode = (accmode_t)(unsigned long)arg3; int error; error = VOP_ACCESS(devvp, accmode, cred); if (!error) result = KAUTH_RESULT_ALLOW; } return result; } /* * Initialize the vnode structures and initialize each file system type. */ void vfsinit(void) { /* * Attach sysctl nodes */ sysctl_vfs_setup(); /* * Initialize the vnode table */ vntblinit(); /* * Initialize the vnode name cache */ nchinit(); #ifdef DEBUG /* * Check the list of vnode operations. */ vfs_op_check(); #endif /* * Initialize the special vnode operations. */ vfs_opv_init(vfs_special_vnodeopv_descs); /* * Initialise generic dirhash. */ dirhash_init(); /* * Initialise VFS hooks. */ vfs_hooks_init(); mount_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, mount_listener_cb, NULL); /* * Establish each file system which was statically * included in the kernel. */ module_init_class(MODULE_CLASS_VFS); /* * Initialize EVFILT_FS for kqueue. */ vfs_evfilt_fs_init(); } /* * Drop a reference to a file system type. */ void vfs_delref(struct vfsops *vfs) { mutex_enter(&vfs_list_lock); vfs->vfs_refcount--; mutex_exit(&vfs_list_lock); } /* * Establish a file system and initialize it. */ int vfs_attach(struct vfsops *vfs) { struct vfsops *v; int error = 0; mutex_enter(&vfs_list_lock); /* * Make sure this file system doesn't already exist. */ LIST_FOREACH(v, &vfs_list, vfs_list) { if (strcmp(vfs->vfs_name, v->vfs_name) == 0) { error = EEXIST; goto out; } } /* * Initialize the vnode operations for this file system. */ vfs_opv_init(vfs->vfs_opv_descs); /* * Now initialize the file system itself. */ (*vfs->vfs_init)(); /* * ...and link it into the kernel's list. */ LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list); /* * Sanity: make sure the reference count is 0. */ vfs->vfs_refcount = 0; out: mutex_exit(&vfs_list_lock); return (error); } /* * Remove a file system from the kernel. */ int vfs_detach(struct vfsops *vfs) { struct vfsops *v; int error = 0; mutex_enter(&vfs_list_lock); /* * Make sure no one is using the filesystem. */ if (vfs->vfs_refcount != 0) { error = EBUSY; goto out; } /* * ...and remove it from the kernel's list. */ LIST_FOREACH(v, &vfs_list, vfs_list) { if (v == vfs) { LIST_REMOVE(v, vfs_list); break; } } if (v == NULL) { error = ESRCH; goto out; } /* * Now run the file system-specific cleanups. */ (*vfs->vfs_done)(); /* * Free the vnode operations vector. */ vfs_opv_free(vfs->vfs_opv_descs); out: mutex_exit(&vfs_list_lock); return (error); } void vfs_reinit(void) { struct vfsops *vfs; mutex_enter(&vfs_list_lock); LIST_FOREACH(vfs, &vfs_list, vfs_list) { if (vfs->vfs_reinit) { vfs->vfs_refcount++; mutex_exit(&vfs_list_lock); (*vfs->vfs_reinit)(); mutex_enter(&vfs_list_lock); vfs->vfs_refcount--; } } mutex_exit(&vfs_list_lock); }
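vfs_attach() and vfs_detach() above are the registration points a file system calls, normally from its module hooks; that is also how module_init_class(MODULE_CLASS_VFS) in vfsinit() establishes the file systems statically linked into the kernel. Below is a minimal sketch of that pattern, illustrative only: "examplefs" and its vfsops are hypothetical placeholders, not an existing file system, and a real implementation would fill in the complete struct vfsops and its vnodeopv descriptors.

/* Illustrative only: hypothetical "examplefs" registering with the VFS. */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/module.h>
#include <sys/mount.h>

extern struct vfsops examplefs_vfsops;	/* assumed to be fully defined elsewhere */

MODULE(MODULE_CLASS_VFS, examplefs, NULL);

static int
examplefs_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		/*
		 * vfs_attach() initializes the operations vectors, calls the
		 * file system's vfs_init routine, and links the vfsops into
		 * vfs_list.
		 */
		return vfs_attach(&examplefs_vfsops);
	case MODULE_CMD_FINI:
		/* vfs_detach() refuses with EBUSY while vfs_refcount != 0. */
		return vfs_detach(&examplefs_vfsops);
	default:
		return ENOTTY;
	}
}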
/* $NetBSD: com.c,v 1.384 2023/04/11 13:01:41 riastradh Exp $ */ /*- * Copyright (c) 1998, 1999, 2004, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)com.c 7.5 (Berkeley) 5/16/91 */ /* * COM driver, uses National Semiconductor NS16450/NS16550AF UART * Supports automatic hardware flow control on StarTech ST16C650A UART * * Lock order: * ttylock (IPL_VM) * -> sc->sc_lock (IPL_HIGH) * -> timecounter_lock (IPL_HIGH) */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: com.c,v 1.384 2023/04/11 13:01:41 riastradh Exp $"); #include "opt_com.h" #include "opt_ddb.h" #include "opt_kgdb.h" #include "opt_lockdebug.h" #include "opt_multiprocessor.h" #include "opt_ntp.h" /* The COM16650 option was renamed to COM_16650. */ #ifdef COM16650 #error Obsolete COM16650 option; use COM_16650 instead. #endif /* * Override cnmagic(9) macro before including <sys/systm.h>. * We need to know if cn_check_magic triggered debugger, so set a flag. * Callers of cn_check_magic must declare int cn_trapped = 0; * XXX: this is *ugly*! 
*/ #define cn_trap() \ do { \ console_debugger(); \ cn_trapped = 1; \ (void)cn_trapped; \ } while (/* CONSTCOND */ 0) #include <sys/param.h> #include <sys/systm.h> #include <sys/ioctl.h> #include <sys/select.h> #include <sys/poll.h> #include <sys/tty.h> #include <sys/proc.h> #include <sys/conf.h> #include <sys/file.h> #include <sys/uio.h> #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/device.h> #include <sys/malloc.h> #include <sys/timepps.h> #include <sys/vnode.h> #include <sys/kauth.h> #include <sys/intr.h> #ifdef RND_COM #include <sys/rndsource.h> #endif #include <sys/bus.h> #include <ddb/db_active.h> #include <dev/ic/comreg.h> #include <dev/ic/comvar.h> #include <dev/ic/ns16550reg.h> #include <dev/ic/st16650reg.h> #include <dev/ic/hayespreg.h> #define com_lcr com_cfcr #include <dev/cons.h> #include "ioconf.h" #define CSR_READ_1(r, o) \ (r)->cr_read((r), (r)->cr_map[o]) #define CSR_WRITE_1(r, o, v) \ (r)->cr_write((r), (r)->cr_map[o], (v)) #define CSR_WRITE_MULTI(r, o, p, n) \ (r)->cr_write_multi((r), (r)->cr_map[o], (p), (n)) /* * XXX COM_TYPE_AU1x00 specific */ #define CSR_WRITE_2(r, o, v) \ bus_space_write_2((r)->cr_iot, (r)->cr_ioh, (r)->cr_map[o], v) #define CSR_READ_2(r, o) \ bus_space_read_2((r)->cr_iot, (r)->cr_ioh, (r)->cr_map[o]) static void com_enable_debugport(struct com_softc *); void com_config(struct com_softc *); void com_shutdown(struct com_softc *); int comspeed(long, long, int); static u_char cflag2lcr(tcflag_t); int comparam(struct tty *, struct termios *); void comstart(struct tty *); int comhwiflow(struct tty *, int); void com_loadchannelregs(struct com_softc *); void com_hwiflow(struct com_softc *); void com_break(struct com_softc *, int); void com_modem(struct com_softc *, int); void tiocm_to_com(struct com_softc *, u_long, int); int com_to_tiocm(struct com_softc *); void com_iflush(struct com_softc *); int com_common_getc(dev_t, struct com_regs *); static void com_common_putc(dev_t, struct com_regs *, int, int); int cominit(struct com_regs *, int, int, int, tcflag_t); static int comcnreattach(void); int comcngetc(dev_t); void comcnputc(dev_t, int); void comcnpollc(dev_t, int); void comsoft(void *); static inline void com_rxsoft(struct com_softc *, struct tty *); static inline void com_txsoft(struct com_softc *, struct tty *); static inline void com_stsoft(struct com_softc *, struct tty *); static inline void com_schedrx(struct com_softc *); void comdiag(void *); dev_type_open(comopen); dev_type_close(comclose); dev_type_read(comread); dev_type_write(comwrite); dev_type_ioctl(comioctl); dev_type_stop(comstop); dev_type_tty(comtty); dev_type_poll(compoll); static struct comcons_info comcons_info; /* * Following are all routines needed for COM to act as console */ static struct consdev comcons = { .cn_getc = comcngetc, .cn_putc = comcnputc, .cn_pollc = comcnpollc, .cn_dev = NODEV, .cn_pri = CN_NORMAL }; const struct cdevsw com_cdevsw = { .d_open = comopen, .d_close = comclose, .d_read = comread, .d_write = comwrite, .d_ioctl = comioctl, .d_stop = comstop, .d_tty = comtty, .d_poll = compoll, .d_mmap = nommap, .d_kqfilter = ttykqfilter, .d_discard = nodiscard, .d_flag = D_TTY }; /* * Make this an option variable one can patch. * But be warned: this must be a power of 2! */ u_int com_rbuf_size = COM_RING_SIZE; /* Stop input when 3/4 of the ring is full; restart when only 1/4 is full. 
*/ u_int com_rbuf_hiwat = (COM_RING_SIZE * 1) / 4; u_int com_rbuf_lowat = (COM_RING_SIZE * 3) / 4; static int comconsattached; static struct cnm_state com_cnm_state; #ifdef KGDB #include <sys/kgdb.h> static struct com_regs comkgdbregs; static int com_kgdb_attached; int com_kgdb_getc(void *); void com_kgdb_putc(void *, int); #endif /* KGDB */ /* initializer for typical 16550-ish hardware */ static const bus_size_t com_std_map[COM_REGMAP_NENTRIES] = { [COM_REG_RXDATA] = com_data, [COM_REG_TXDATA] = com_data, [COM_REG_DLBL] = com_dlbl, [COM_REG_DLBH] = com_dlbh, [COM_REG_IER] = com_ier, [COM_REG_IIR] = com_iir, [COM_REG_FIFO] = com_fifo, [COM_REG_TCR] = com_fifo, [COM_REG_EFR] = com_efr, [COM_REG_TLR] = com_efr, [COM_REG_LCR] = com_lcr, [COM_REG_MCR] = com_mcr, [COM_REG_LSR] = com_lsr, [COM_REG_MSR] = com_msr, [COM_REG_USR] = com_usr, [COM_REG_TFL] = com_tfl, [COM_REG_RFL] = com_rfl, [COM_REG_HALT] = com_halt, [COM_REG_MDR1] = com_mdr1, }; #define COMDIALOUT_MASK TTDIALOUT_MASK #define COMUNIT(x) TTUNIT(x) #define COMDIALOUT(x) TTDIALOUT(x) #define COM_ISALIVE(sc) ((sc)->enabled != 0 && \ device_is_active((sc)->sc_dev)) #define BR BUS_SPACE_BARRIER_READ #define BW BUS_SPACE_BARRIER_WRITE #define COM_BARRIER(r, f) \ bus_space_barrier((r)->cr_iot, (r)->cr_ioh, 0, (r)->cr_nports, (f)) /* * com_read_1 -- * Default register read callback using single byte accesses. */ static uint8_t com_read_1(struct com_regs *regs, u_int reg) { return bus_space_read_1(regs->cr_iot, regs->cr_ioh, reg); } /* * com_write_1 -- * Default register write callback using single byte accesses. */ static void com_write_1(struct com_regs *regs, u_int reg, uint8_t val) { bus_space_write_1(regs->cr_iot, regs->cr_ioh, reg, val); } /* * com_write_multi_1 -- * Default register multi write callback using single byte accesses. */ static void com_write_multi_1(struct com_regs *regs, u_int reg, const uint8_t *datap, bus_size_t count) { bus_space_write_multi_1(regs->cr_iot, regs->cr_ioh, reg, datap, count); } /* * com_read_4 -- * Default register read callback using dword accesses. */ static uint8_t com_read_4(struct com_regs *regs, u_int reg) { return bus_space_read_4(regs->cr_iot, regs->cr_ioh, reg) & 0xff; } /* * com_write_4 -- * Default register write callback using dword accesses. */ static void com_write_4(struct com_regs *regs, u_int reg, uint8_t val) { bus_space_write_4(regs->cr_iot, regs->cr_ioh, reg, val); } /* * com_write_multi_4 -- * Default register multi write callback using dword accesses. */ static void com_write_multi_4(struct com_regs *regs, u_int reg, const uint8_t *datap, bus_size_t count) { while (count-- > 0) { bus_space_write_4(regs->cr_iot, regs->cr_ioh, reg, *datap++); } } /* * com_init_regs -- * Driver front-ends use this to initialize our register map * in the standard fashion. They may then tailor the map to * their own particular requirements. */ void com_init_regs(struct com_regs *regs, bus_space_tag_t st, bus_space_handle_t sh, bus_addr_t addr) { memset(regs, 0, sizeof(*regs)); regs->cr_iot = st; regs->cr_ioh = sh; regs->cr_iobase = addr; regs->cr_nports = COM_NPORTS; regs->cr_read = com_read_1; regs->cr_write = com_write_1; regs->cr_write_multi = com_write_multi_1; memcpy(regs->cr_map, com_std_map, sizeof(regs->cr_map)); } /* * com_init_regs_stride -- * Convenience function for front-ends that have a stride between * registers. 
*/ void com_init_regs_stride(struct com_regs *regs, bus_space_tag_t st, bus_space_handle_t sh, bus_addr_t addr, u_int regshift) { com_init_regs(regs, st, sh, addr); for (size_t i = 0; i < __arraycount(regs->cr_map); i++) { regs->cr_map[i] <<= regshift; } regs->cr_nports <<= regshift; } /* * com_init_regs_stride_width -- * Convenience function for front-ends that have a stride between * registers and specific I/O width requirements. */ void com_init_regs_stride_width(struct com_regs *regs, bus_space_tag_t st, bus_space_handle_t sh, bus_addr_t addr, u_int regshift, u_int width) { com_init_regs(regs, st, sh, addr); for (size_t i = 0; i < __arraycount(regs->cr_map); i++) { regs->cr_map[i] <<= regshift; } regs->cr_nports <<= regshift; switch (width) { case 1: /* Already set by com_init_regs */ break; case 4: regs->cr_read = com_read_4; regs->cr_write = com_write_4; regs->cr_write_multi = com_write_multi_4; break; default: panic("com: unsupported I/O width %d", width); } } /*ARGSUSED*/ int comspeed(long speed, long frequency, int type) { #define divrnd(n, q) (((n)*2/(q)+1)/2) /* divide and round off */ int x, err; int divisor = 16; if ((type == COM_TYPE_OMAP) && (speed > 230400)) { divisor = 13; } if (speed == 0) return (0); if (speed < 0) return (-1); x = divrnd(frequency / divisor, speed); if (x <= 0) return (-1); err = divrnd(((quad_t)frequency) * 1000 / divisor, speed * x) - 1000; if (err < 0) err = -err; if (err > COM_TOLERANCE) return (-1); return (x); #undef divrnd } #ifdef COM_DEBUG int com_debug = 0; void comstatus(struct com_softc *, const char *); void comstatus(struct com_softc *sc, const char *str) { struct tty *tp = sc->sc_tty; aprint_normal_dev(sc->sc_dev, "%s %cclocal %cdcd %cts_carr_on %cdtr %ctx_stopped\n", str, ISSET(tp->t_cflag, CLOCAL) ? '+' : '-', ISSET(sc->sc_msr, MSR_DCD) ? '+' : '-', ISSET(tp->t_state, TS_CARR_ON) ? '+' : '-', ISSET(sc->sc_mcr, MCR_DTR) ? '+' : '-', sc->sc_tx_stopped ? '+' : '-'); aprint_normal_dev(sc->sc_dev, "%s %ccrtscts %ccts %cts_ttstop %crts rx_flags=0x%x\n", str, ISSET(tp->t_cflag, CRTSCTS) ? '+' : '-', ISSET(sc->sc_msr, MSR_CTS) ? '+' : '-', ISSET(tp->t_state, TS_TTSTOP) ? '+' : '-', ISSET(sc->sc_mcr, MCR_RTS) ? '+' : '-', sc->sc_rx_flags); } #endif int com_probe_subr(struct com_regs *regs) { /* force access to id reg */ CSR_WRITE_1(regs, COM_REG_LCR, LCR_8BITS); CSR_WRITE_1(regs, COM_REG_IIR, 0); if ((CSR_READ_1(regs, COM_REG_LCR) != LCR_8BITS) || (CSR_READ_1(regs, COM_REG_IIR) & 0x38)) return (0); return (1); } int comprobe1(bus_space_tag_t iot, bus_space_handle_t ioh) { struct com_regs regs; com_init_regs(&regs, iot, ioh, 0/*XXX*/); return com_probe_subr(&regs); } /* * No locking in this routine; it is only called during attach, * or with the port already locked. */ static void com_enable_debugport(struct com_softc *sc) { /* Turn on line break interrupt, set carrier. 
*/ sc->sc_ier = IER_ERLS; if (sc->sc_type == COM_TYPE_PXA2x0) sc->sc_ier |= IER_EUART | IER_ERXTOUT; if (sc->sc_type == COM_TYPE_INGENIC || sc->sc_type == COM_TYPE_TEGRA) sc->sc_ier |= IER_ERXTOUT; CSR_WRITE_1(&sc->sc_regs, COM_REG_IER, sc->sc_ier); SET(sc->sc_mcr, MCR_DTR | MCR_RTS); CSR_WRITE_1(&sc->sc_regs, COM_REG_MCR, sc->sc_mcr); } static void com_intr_poll(void *arg) { struct com_softc * const sc = arg; comintr(sc); callout_schedule(&sc->sc_poll_callout, sc->sc_poll_ticks); } void com_attach_subr(struct com_softc *sc) { struct com_regs *regsp = &sc->sc_regs; struct tty *tp; uint32_t cpr; uint8_t lcr; const char *fifo_msg = NULL; prop_dictionary_t dict; bool is_console = true; bool force_console = false; aprint_naive("\n"); dict = device_properties(sc->sc_dev); prop_dictionary_get_bool(dict, "is_console", &is_console); prop_dictionary_get_bool(dict, "force_console", &force_console); callout_init(&sc->sc_diag_callout, 0); callout_init(&sc->sc_poll_callout, 0); callout_setfunc(&sc->sc_poll_callout, com_intr_poll, sc); mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_HIGH); #if defined(COM_16650) sc->sc_type = COM_TYPE_16650; #elif defined(COM_16750) sc->sc_type = COM_TYPE_16750; #elif defined(COM_HAYESP) sc->sc_type = COM_TYPE_HAYESP; #elif defined(COM_PXA2X0) sc->sc_type = COM_TYPE_PXA2x0; #endif /* Disable interrupts before configuring the device. */ if (sc->sc_type == COM_TYPE_PXA2x0) sc->sc_ier = IER_EUART; else sc->sc_ier = 0; CSR_WRITE_1(regsp, COM_REG_IER, sc->sc_ier); if ((bus_space_is_equal(regsp->cr_iot, comcons_info.regs.cr_iot) && regsp->cr_iobase == comcons_info.regs.cr_iobase) || force_console) { comconsattached = 1; if (force_console) memcpy(regsp, &comcons_info.regs, sizeof(*regsp)); if (cn_tab == NULL && comcnreattach() != 0) { printf("can't re-init serial console @%lx\n", (u_long)comcons_info.regs.cr_iobase); } switch (sc->sc_type) { case COM_TYPE_16750: case COM_TYPE_DW_APB: /* Use in comintr(). */ sc->sc_lcr = cflag2lcr(comcons_info.cflag); break; } /* Make sure the console is always "hardwired". 
*/ delay(10000); /* wait for output to finish */ if (is_console) { SET(sc->sc_hwflags, COM_HW_CONSOLE); } SET(sc->sc_swflags, TIOCFLAG_SOFTCAR); } /* Probe for FIFO */ switch (sc->sc_type) { case COM_TYPE_HAYESP: goto fifodone; case COM_TYPE_AU1x00: sc->sc_fifolen = 16; fifo_msg = "Au1X00 UART"; SET(sc->sc_hwflags, COM_HW_FIFO); goto fifodelay; case COM_TYPE_16550_NOERS: sc->sc_fifolen = 16; fifo_msg = "ns16650, no ERS"; SET(sc->sc_hwflags, COM_HW_FIFO); goto fifodelay; case COM_TYPE_OMAP: sc->sc_fifolen = 64; fifo_msg = "OMAP UART"; SET(sc->sc_hwflags, COM_HW_FIFO); goto fifodelay; case COM_TYPE_INGENIC: sc->sc_fifolen = 16; fifo_msg = "Ingenic UART"; SET(sc->sc_hwflags, COM_HW_FIFO); SET(sc->sc_hwflags, COM_HW_NOIEN); goto fifodelay; case COM_TYPE_TEGRA: sc->sc_fifolen = 8; fifo_msg = "Tegra UART"; SET(sc->sc_hwflags, COM_HW_FIFO); CSR_WRITE_1(regsp, COM_REG_FIFO, FIFO_ENABLE | FIFO_RCV_RST | FIFO_XMT_RST | FIFO_TRIGGER_1); goto fifodelay; case COM_TYPE_BCMAUXUART: sc->sc_fifolen = 1; fifo_msg = "BCM AUX UART"; SET(sc->sc_hwflags, COM_HW_FIFO); CSR_WRITE_1(regsp, COM_REG_FIFO, FIFO_ENABLE | FIFO_RCV_RST | FIFO_XMT_RST | FIFO_TRIGGER_1); goto fifodelay; case COM_TYPE_DW_APB: if (!prop_dictionary_get_uint(dict, "fifolen", &sc->sc_fifolen)) { cpr = bus_space_read_4(sc->sc_regs.cr_iot, sc->sc_regs.cr_ioh, DW_APB_UART_CPR); sc->sc_fifolen = __SHIFTOUT(cpr, UART_CPR_FIFO_MODE) * 16; } if (sc->sc_fifolen == 0) { sc->sc_fifolen = 1; fifo_msg = "DesignWare APB UART, no fifo"; CSR_WRITE_1(regsp, COM_REG_FIFO, 0); } else { fifo_msg = "DesignWare APB UART"; SET(sc->sc_hwflags, COM_HW_FIFO); CSR_WRITE_1(regsp, COM_REG_FIFO, FIFO_ENABLE | FIFO_RCV_RST | FIFO_XMT_RST | FIFO_TRIGGER_1); } goto fifodelay; } sc->sc_fifolen = 1; /* look for a NS 16550AF UART with FIFOs */ if (sc->sc_type == COM_TYPE_INGENIC) { CSR_WRITE_1(regsp, COM_REG_FIFO, FIFO_ENABLE | FIFO_RCV_RST | FIFO_XMT_RST | FIFO_TRIGGER_14 | FIFO_UART_ON); } else CSR_WRITE_1(regsp, COM_REG_FIFO, FIFO_ENABLE | FIFO_RCV_RST | FIFO_XMT_RST | FIFO_TRIGGER_14); delay(100); if (ISSET(CSR_READ_1(regsp, COM_REG_IIR), IIR_FIFO_MASK) == IIR_FIFO_MASK) if (ISSET(CSR_READ_1(regsp, COM_REG_FIFO), FIFO_TRIGGER_14) == FIFO_TRIGGER_14) { SET(sc->sc_hwflags, COM_HW_FIFO); fifo_msg = "ns16550a"; sc->sc_fifolen = 16; /* * IIR changes into the EFR if LCR is set to LCR_EERS * on 16650s. We also know IIR != 0 at this point. * Write 0 into the EFR, and read it. If the result * is 0, we have a 16650. * * Older 16650s were broken; the test to detect them * is taken from the Linux driver. Apparently * setting DLAB enable gives access to the EFR on * these chips. */ if (sc->sc_type == COM_TYPE_16650) { lcr = CSR_READ_1(regsp, COM_REG_LCR); CSR_WRITE_1(regsp, COM_REG_LCR, LCR_EERS); CSR_WRITE_1(regsp, COM_REG_EFR, 0); if (CSR_READ_1(regsp, COM_REG_EFR) == 0) { CSR_WRITE_1(regsp, COM_REG_LCR, lcr | LCR_DLAB); if (CSR_READ_1(regsp, COM_REG_EFR) == 0) { CLR(sc->sc_hwflags, COM_HW_FIFO); sc->sc_fifolen = 0; } else { SET(sc->sc_hwflags, COM_HW_FLOW); sc->sc_fifolen = 32; } } else sc->sc_fifolen = 16; CSR_WRITE_1(regsp, COM_REG_LCR, lcr); if (sc->sc_fifolen == 0) fifo_msg = "st16650, broken fifo"; else if (sc->sc_fifolen == 32) fifo_msg = "st16650a"; else fifo_msg = "ns16550a"; } /* * TL16C750 can enable 64byte FIFO, only when DLAB * is 1. However, some 16750 may always enable. For * example, restrictions according to DLAB in a data * sheet for SC16C750 were not described. * Please enable 'options COM_16650', supposing you * use SC16C750. 
Probably 32 bytes of FIFO and HW FLOW * should become effective. */ if (sc->sc_type == COM_TYPE_16750) { uint8_t iir1, iir2; uint8_t fcr = FIFO_ENABLE | FIFO_TRIGGER_14; lcr = CSR_READ_1(regsp, COM_REG_LCR); CSR_WRITE_1(regsp, COM_REG_LCR, lcr & ~LCR_DLAB); CSR_WRITE_1(regsp, COM_REG_FIFO, fcr | FIFO_64B_ENABLE); iir1 = CSR_READ_1(regsp, COM_REG_IIR); CSR_WRITE_1(regsp, COM_REG_FIFO, fcr); CSR_WRITE_1(regsp, COM_REG_LCR, lcr | LCR_DLAB); CSR_WRITE_1(regsp, COM_REG_FIFO, fcr | FIFO_64B_ENABLE); iir2 = CSR_READ_1(regsp, COM_REG_IIR); CSR_WRITE_1(regsp, COM_REG_LCR, lcr); if (!ISSET(iir1, IIR_64B_FIFO) && ISSET(iir2, IIR_64B_FIFO)) { /* It is TL16C750. */ sc->sc_fifolen = 64; SET(sc->sc_hwflags, COM_HW_AFE); } else CSR_WRITE_1(regsp, COM_REG_FIFO, fcr); if (sc->sc_fifolen == 64) fifo_msg = "tl16c750"; else fifo_msg = "ns16750"; } } else fifo_msg = "ns16550, broken fifo"; else fifo_msg = "ns8250 or ns16450, no fifo"; CSR_WRITE_1(regsp, COM_REG_FIFO, 0); fifodelay: /* * Some chips will clear down both Tx and Rx FIFOs when zero is * written to com_fifo. If this chip is the console, writing zero * results in some of the chip/FIFO description being lost, so delay * printing it until now. */ delay(10); if (ISSET(sc->sc_hwflags, COM_HW_FIFO)) { aprint_normal(": %s, %d-byte FIFO\n", fifo_msg, sc->sc_fifolen); } else { aprint_normal(": %s\n", fifo_msg); } if (ISSET(sc->sc_hwflags, COM_HW_TXFIFO_DISABLE)) { sc->sc_fifolen = 1; aprint_normal_dev(sc->sc_dev, "txfifo disabled\n"); } fifodone: tp = tty_alloc(); tp->t_oproc = comstart; tp->t_param = comparam; tp->t_hwiflow = comhwiflow; tp->t_softc = sc; sc->sc_tty = tp; sc->sc_rbuf = malloc(com_rbuf_size << 1, M_DEVBUF, M_WAITOK); sc->sc_rbput = sc->sc_rbget = sc->sc_rbuf; sc->sc_rbavail = com_rbuf_size; sc->sc_ebuf = sc->sc_rbuf + (com_rbuf_size << 1); tty_attach(tp); if (!ISSET(sc->sc_hwflags, COM_HW_NOIEN)) SET(sc->sc_mcr, MCR_IENABLE); if (ISSET(sc->sc_hwflags, COM_HW_CONSOLE)) { int maj; /* locate the major number */ maj = cdevsw_lookup_major(&com_cdevsw); tp->t_dev = cn_tab->cn_dev = makedev(maj, device_unit(sc->sc_dev)); aprint_normal_dev(sc->sc_dev, "console\n"); } #ifdef KGDB /* * Allow kgdb to "take over" this port. If this is * not the console and is the kgdb device, it has * exclusive use. If it's the console _and_ the * kgdb device, it doesn't. */ if (bus_space_is_equal(regsp->cr_iot, comkgdbregs.cr_iot) && regsp->cr_iobase == comkgdbregs.cr_iobase) { if (!ISSET(sc->sc_hwflags, COM_HW_CONSOLE)) { com_kgdb_attached = 1; SET(sc->sc_hwflags, COM_HW_KGDB); } aprint_normal_dev(sc->sc_dev, "kgdb\n"); } #endif sc->sc_si = softint_establish(SOFTINT_SERIAL, comsoft, sc); #ifdef RND_COM rnd_attach_source(&sc->rnd_source, device_xname(sc->sc_dev), RND_TYPE_TTY, RND_FLAG_DEFAULT); #endif /* if there are no enable/disable functions, assume the device is always enabled */ if (!sc->enable) sc->enabled = 1; com_config(sc); SET(sc->sc_hwflags, COM_HW_DEV_OK); if (sc->sc_poll_ticks != 0) callout_schedule(&sc->sc_poll_callout, sc->sc_poll_ticks); } void com_config(struct com_softc *sc) { struct com_regs *regsp = &sc->sc_regs; /* Disable interrupts before configuring the device. */ if (sc->sc_type == COM_TYPE_PXA2x0) sc->sc_ier = IER_EUART; else sc->sc_ier = 0; CSR_WRITE_1(regsp, COM_REG_IER, sc->sc_ier); (void) CSR_READ_1(regsp, COM_REG_IIR); /* Look for a Hayes ESP board. 
*/ if (sc->sc_type == COM_TYPE_HAYESP) { /* Set 16550 compatibility mode */ bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD1, HAYESP_SETMODE); bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD2, HAYESP_MODE_FIFO|HAYESP_MODE_RTS| HAYESP_MODE_SCALE); /* Set RTS/CTS flow control */ bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD1, HAYESP_SETFLOWTYPE); bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD2, HAYESP_FLOW_RTS); bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD2, HAYESP_FLOW_CTS); /* Set flow control levels */ bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD1, HAYESP_SETRXFLOW); bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD2, HAYESP_HIBYTE(HAYESP_RXHIWMARK)); bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD2, HAYESP_LOBYTE(HAYESP_RXHIWMARK)); bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD2, HAYESP_HIBYTE(HAYESP_RXLOWMARK)); bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD2, HAYESP_LOBYTE(HAYESP_RXLOWMARK)); } if (ISSET(sc->sc_hwflags, COM_HW_CONSOLE|COM_HW_KGDB)) com_enable_debugport(sc); } int com_detach(device_t self, int flags) { struct com_softc *sc = device_private(self); int maj, mn; if (ISSET(sc->sc_hwflags, COM_HW_KGDB)) return EBUSY; if (ISSET(sc->sc_hwflags, COM_HW_CONSOLE) && (flags & DETACH_SHUTDOWN) != 0) return EBUSY; if (sc->disable != NULL && sc->enabled != 0) { (*sc->disable)(sc); sc->enabled = 0; } if (ISSET(sc->sc_hwflags, COM_HW_CONSOLE)) { comconsattached = 0; cn_tab = NULL; } /* locate the major number */ maj = cdevsw_lookup_major(&com_cdevsw); /* Nuke the vnodes for any open instances. */ mn = device_unit(self); vdevgone(maj, mn, mn, VCHR); mn |= COMDIALOUT_MASK; vdevgone(maj, mn, mn, VCHR); if (sc->sc_rbuf == NULL) { /* * Ring buffer allocation failed in the com_attach_subr, * only the tty is allocated, and nothing else. */ tty_free(sc->sc_tty); return 0; } /* Free the receive buffer. */ free(sc->sc_rbuf, M_DEVBUF); /* Detach and free the tty. */ tty_detach(sc->sc_tty); tty_free(sc->sc_tty); /* Unhook the soft interrupt handler. */ softint_disestablish(sc->sc_si); #ifdef RND_COM /* Unhook the entropy source. */ rnd_detach_source(&sc->rnd_source); #endif callout_destroy(&sc->sc_diag_callout); /* Destroy the lock. */ mutex_destroy(&sc->sc_lock); return (0); } void com_shutdown(struct com_softc *sc) { struct tty *tp = sc->sc_tty; mutex_spin_enter(&sc->sc_lock); /* If we were asserting flow control, then deassert it. */ SET(sc->sc_rx_flags, RX_IBUF_BLOCKED); com_hwiflow(sc); /* Clear any break condition set with TIOCSBRK. */ com_break(sc, 0); /* * Hang up if necessary. Record when we hung up, so if we * immediately open the port again, we will wait a bit until * the other side has had time to notice that we hung up. */ if (ISSET(tp->t_cflag, HUPCL)) { com_modem(sc, 0); microuptime(&sc->sc_hup_pending); sc->sc_hup_pending.tv_sec++; } /* Turn off interrupts. 
*/ if (ISSET(sc->sc_hwflags, COM_HW_CONSOLE)) { sc->sc_ier = IER_ERLS; /* interrupt on line break */ if ((sc->sc_type == COM_TYPE_PXA2x0) || (sc->sc_type == COM_TYPE_INGENIC) || (sc->sc_type == COM_TYPE_TEGRA)) sc->sc_ier |= IER_ERXTOUT; } else sc->sc_ier = 0; if (sc->sc_type == COM_TYPE_PXA2x0) sc->sc_ier |= IER_EUART; CSR_WRITE_1(&sc->sc_regs, COM_REG_IER, sc->sc_ier); mutex_spin_exit(&sc->sc_lock); if (sc->disable) { #ifdef DIAGNOSTIC if (!sc->enabled) panic("com_shutdown: not enabled?"); #endif (*sc->disable)(sc); sc->enabled = 0; } } int comopen(dev_t dev, int flag, int mode, struct lwp *l) { struct com_softc *sc; struct tty *tp; int s; int error; sc = device_lookup_private(&com_cd, COMUNIT(dev)); if (sc == NULL || !ISSET(sc->sc_hwflags, COM_HW_DEV_OK) || sc->sc_rbuf == NULL) return (ENXIO); if (!device_is_active(sc->sc_dev)) return (ENXIO); #ifdef KGDB /* * If this is the kgdb port, no other use is permitted. */ if (ISSET(sc->sc_hwflags, COM_HW_KGDB)) return (EBUSY); #endif tp = sc->sc_tty; /* * If the device is exclusively for kernel use, deny userland * open. */ if (ISSET(tp->t_state, TS_KERN_ONLY)) return (EBUSY); if (kauth_authorize_device_tty(l->l_cred, KAUTH_DEVICE_TTY_OPEN, tp)) return (EBUSY); s = spltty(); /* * Do the following iff this is a first open. */ if (!ISSET(tp->t_state, TS_ISOPEN) && tp->t_wopen == 0) { struct termios t; struct timeval now, diff; tp->t_dev = dev; if (sc->enable) { if ((*sc->enable)(sc)) { splx(s); aprint_error_dev(sc->sc_dev, "device enable failed\n"); return (EIO); } mutex_spin_enter(&sc->sc_lock); sc->enabled = 1; com_config(sc); } else { mutex_spin_enter(&sc->sc_lock); } if (timerisset(&sc->sc_hup_pending)) { microuptime(&now); while (timercmp(&now, &sc->sc_hup_pending, <)) { timersub(&sc->sc_hup_pending, &now, &diff); const int ms = diff.tv_sec * 1000 + diff.tv_usec / 1000; kpause(ttclos, false, uimax(mstohz(ms), 1), &sc->sc_lock); microuptime(&now); } timerclear(&sc->sc_hup_pending); } /* Turn on interrupts. */ sc->sc_ier = IER_ERXRDY | IER_ERLS; if (!ISSET(tp->t_cflag, CLOCAL)) sc->sc_ier |= IER_EMSC; if (sc->sc_type == COM_TYPE_PXA2x0) sc->sc_ier |= IER_EUART | IER_ERXTOUT; else if (sc->sc_type == COM_TYPE_INGENIC || sc->sc_type == COM_TYPE_TEGRA) sc->sc_ier |= IER_ERXTOUT; CSR_WRITE_1(&sc->sc_regs, COM_REG_IER, sc->sc_ier); /* Fetch the current modem control status, needed later. */ sc->sc_msr = CSR_READ_1(&sc->sc_regs, COM_REG_MSR); /* Clear PPS capture state on first open. */ mutex_spin_enter(&timecounter_lock); memset(&sc->sc_pps_state, 0, sizeof(sc->sc_pps_state)); sc->sc_pps_state.ppscap = PPS_CAPTUREASSERT | PPS_CAPTURECLEAR; pps_init(&sc->sc_pps_state); mutex_spin_exit(&timecounter_lock); mutex_spin_exit(&sc->sc_lock); /* * Initialize the termios status to the defaults. Add in the * sticky bits from TIOCSFLAGS. */ if (ISSET(sc->sc_hwflags, COM_HW_CONSOLE)) { t.c_ospeed = comcons_info.rate; t.c_cflag = comcons_info.cflag; } else { t.c_ospeed = TTYDEF_SPEED; t.c_cflag = TTYDEF_CFLAG; } t.c_ispeed = t.c_ospeed; if (ISSET(sc->sc_swflags, TIOCFLAG_CLOCAL)) SET(t.c_cflag, CLOCAL); if (ISSET(sc->sc_swflags, TIOCFLAG_CRTSCTS)) SET(t.c_cflag, CRTSCTS); if (ISSET(sc->sc_swflags, TIOCFLAG_MDMBUF)) SET(t.c_cflag, MDMBUF); /* Make sure comparam() will do something. */ tp->t_ospeed = 0; (void) comparam(tp, &t); tp->t_iflag = TTYDEF_IFLAG; tp->t_oflag = TTYDEF_OFLAG; tp->t_lflag = TTYDEF_LFLAG; ttychars(tp); ttsetwater(tp); mutex_spin_enter(&sc->sc_lock); /* * Turn on DTR. 
We must always do this, even if carrier is not * present, because otherwise we'd have to use TIOCSDTR * immediately after setting CLOCAL, which applications do not * expect. We always assert DTR while the device is open * unless explicitly requested to deassert it. */ com_modem(sc, 1); /* Clear the input ring, and unblock. */ sc->sc_rbput = sc->sc_rbget = sc->sc_rbuf; sc->sc_rbavail = com_rbuf_size; com_iflush(sc); CLR(sc->sc_rx_flags, RX_ANY_BLOCK); com_hwiflow(sc); #ifdef COM_DEBUG if (com_debug) comstatus(sc, "comopen "); #endif mutex_spin_exit(&sc->sc_lock); } splx(s); error = ttyopen(tp, COMDIALOUT(dev), ISSET(flag, O_NONBLOCK)); if (error) goto bad; error = (*tp->t_linesw->l_open)(dev, tp); if (error) goto bad; return (0); bad: if (!ISSET(tp->t_state, TS_ISOPEN) && tp->t_wopen == 0) { /* * We failed to open the device, and nobody else had it opened. * Clean up the state as appropriate. */ com_shutdown(sc); } return (error); } int comclose(dev_t dev, int flag, int mode, struct lwp *l) { struct com_softc *sc = device_lookup_private(&com_cd, COMUNIT(dev)); struct tty *tp = sc->sc_tty; /* XXX This is for cons.c. */ if (!ISSET(tp->t_state, TS_ISOPEN)) return (0); /* * If the device is exclusively for kernel use, deny userland * close. */ if (ISSET(tp->t_state, TS_KERN_ONLY)) return (0); (*tp->t_linesw->l_close)(tp, flag); ttyclose(tp); if (COM_ISALIVE(sc) == 0) return (0); if (!ISSET(tp->t_state, TS_ISOPEN) && tp->t_wopen == 0) { /* * Although we got a last close, the device may still be in * use; e.g. if this was the dialout node, and there are still * processes waiting for carrier on the non-dialout node. */ com_shutdown(sc); } return (0); } int comread(dev_t dev, struct uio *uio, int flag) { struct com_softc *sc = device_lookup_private(&com_cd, COMUNIT(dev)); struct tty *tp = sc->sc_tty; if (COM_ISALIVE(sc) == 0) return (EIO); return ((*tp->t_linesw->l_read)(tp, uio, flag)); } int comwrite(dev_t dev, struct uio *uio, int flag) { struct com_softc *sc = device_lookup_private(&com_cd, COMUNIT(dev)); struct tty *tp = sc->sc_tty; if (COM_ISALIVE(sc) == 0) return (EIO); return ((*tp->t_linesw->l_write)(tp, uio, flag)); } int compoll(dev_t dev, int events, struct lwp *l) { struct com_softc *sc = device_lookup_private(&com_cd, COMUNIT(dev)); struct tty *tp = sc->sc_tty; if (COM_ISALIVE(sc) == 0) return (POLLHUP); return ((*tp->t_linesw->l_poll)(tp, events, l)); } struct tty * comtty(dev_t dev) { struct com_softc *sc = device_lookup_private(&com_cd, COMUNIT(dev)); struct tty *tp = sc->sc_tty; return (tp); } int comioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { struct com_softc *sc; struct tty *tp; int error; sc = device_lookup_private(&com_cd, COMUNIT(dev)); if (sc == NULL) return ENXIO; if (COM_ISALIVE(sc) == 0) return (EIO); tp = sc->sc_tty; error = (*tp->t_linesw->l_ioctl)(tp, cmd, data, flag, l); if (error != EPASSTHROUGH) return (error); error = ttioctl(tp, cmd, data, flag, l); if (error != EPASSTHROUGH) return (error); error = 0; switch (cmd) { case TIOCSFLAGS: error = kauth_authorize_device_tty(l->l_cred, KAUTH_DEVICE_TTY_PRIVSET, tp); break; default: /* nothing */ break; } if (error) { return error; } mutex_spin_enter(&sc->sc_lock); switch (cmd) { case TIOCSBRK: com_break(sc, 1); break; case TIOCCBRK: com_break(sc, 0); break; case TIOCSDTR: com_modem(sc, 1); break; case TIOCCDTR: com_modem(sc, 0); break; case TIOCGFLAGS: *(int *)data = sc->sc_swflags; break; case TIOCSFLAGS: sc->sc_swflags = *(int *)data; break; case TIOCMSET: case TIOCMBIS: case TIOCMBIC: 
tiocm_to_com(sc, cmd, *(int *)data); break; case TIOCMGET: *(int *)data = com_to_tiocm(sc); break; case PPS_IOC_CREATE: case PPS_IOC_DESTROY: case PPS_IOC_GETPARAMS: case PPS_IOC_SETPARAMS: case PPS_IOC_GETCAP: case PPS_IOC_FETCH: #ifdef PPS_SYNC case PPS_IOC_KCBIND: #endif mutex_spin_enter(&timecounter_lock); error = pps_ioctl(cmd, data, &sc->sc_pps_state); mutex_spin_exit(&timecounter_lock); break; case TIOCDCDTIMESTAMP: /* XXX old, overloaded API used by xntpd v3 */ mutex_spin_enter(&timecounter_lock); #ifndef PPS_TRAILING_EDGE TIMESPEC_TO_TIMEVAL((struct timeval *)data, &sc->sc_pps_state.ppsinfo.assert_timestamp); #else TIMESPEC_TO_TIMEVAL((struct timeval *)data, &sc->sc_pps_state.ppsinfo.clear_timestamp); #endif mutex_spin_exit(&timecounter_lock); break; default: error = EPASSTHROUGH; break; } mutex_spin_exit(&sc->sc_lock); #ifdef COM_DEBUG if (com_debug) comstatus(sc, "comioctl "); #endif return (error); } static inline void com_schedrx(struct com_softc *sc) { sc->sc_rx_ready = 1; /* Wake up the poller. */ softint_schedule(sc->sc_si); } void com_break(struct com_softc *sc, int onoff) { if (onoff) SET(sc->sc_lcr, LCR_SBREAK); else CLR(sc->sc_lcr, LCR_SBREAK); if (!sc->sc_heldchange) { if (sc->sc_tx_busy) { sc->sc_heldtbc = sc->sc_tbc; sc->sc_tbc = 0; sc->sc_heldchange = 1; } else com_loadchannelregs(sc); } } void com_modem(struct com_softc *sc, int onoff) { if (sc->sc_mcr_dtr == 0) return; if (onoff) SET(sc->sc_mcr, sc->sc_mcr_dtr); else CLR(sc->sc_mcr, sc->sc_mcr_dtr); if (!sc->sc_heldchange) { if (sc->sc_tx_busy) { sc->sc_heldtbc = sc->sc_tbc; sc->sc_tbc = 0; sc->sc_heldchange = 1; } else com_loadchannelregs(sc); } } void tiocm_to_com(struct com_softc *sc, u_long how, int ttybits) { u_char combits; combits = 0; if (ISSET(ttybits, TIOCM_DTR)) SET(combits, MCR_DTR); if (ISSET(ttybits, TIOCM_RTS)) SET(combits, MCR_RTS); switch (how) { case TIOCMBIC: CLR(sc->sc_mcr, combits); break; case TIOCMBIS: SET(sc->sc_mcr, combits); break; case TIOCMSET: CLR(sc->sc_mcr, MCR_DTR | MCR_RTS); SET(sc->sc_mcr, combits); break; } if (!sc->sc_heldchange) { if (sc->sc_tx_busy) { sc->sc_heldtbc = sc->sc_tbc; sc->sc_tbc = 0; sc->sc_heldchange = 1; } else com_loadchannelregs(sc); } } int com_to_tiocm(struct com_softc *sc) { u_char combits; int ttybits = 0; combits = sc->sc_mcr; if (ISSET(combits, MCR_DTR)) SET(ttybits, TIOCM_DTR); if (ISSET(combits, MCR_RTS)) SET(ttybits, TIOCM_RTS); combits = sc->sc_msr; if (sc->sc_type == COM_TYPE_INGENIC) { SET(ttybits, TIOCM_CD); } else { if (ISSET(combits, MSR_DCD)) SET(ttybits, TIOCM_CD); } if (ISSET(combits, MSR_CTS)) SET(ttybits, TIOCM_CTS); if (ISSET(combits, MSR_DSR)) SET(ttybits, TIOCM_DSR); if (ISSET(combits, MSR_RI | MSR_TERI)) SET(ttybits, TIOCM_RI); if (ISSET(sc->sc_ier, IER_ERXRDY | IER_ETXRDY | IER_ERLS | IER_EMSC)) SET(ttybits, TIOCM_LE); return (ttybits); } static u_char cflag2lcr(tcflag_t cflag) { u_char lcr = 0; switch (ISSET(cflag, CSIZE)) { case CS5: SET(lcr, LCR_5BITS); break; case CS6: SET(lcr, LCR_6BITS); break; case CS7: SET(lcr, LCR_7BITS); break; case CS8: SET(lcr, LCR_8BITS); break; } if (ISSET(cflag, PARENB)) { SET(lcr, LCR_PENAB); if (!ISSET(cflag, PARODD)) SET(lcr, LCR_PEVEN); } if (ISSET(cflag, CSTOPB)) SET(lcr, LCR_STOPB); return (lcr); } int comparam(struct tty *tp, struct termios *t) { struct com_softc *sc = device_lookup_private(&com_cd, COMUNIT(tp->t_dev)); int ospeed; u_char lcr; if (COM_ISALIVE(sc) == 0) return (EIO); if (sc->sc_type == COM_TYPE_HAYESP) { int prescaler, speed; /* * Calculate UART clock prescaler. 
It should be in * range of 0 .. 3. */ for (prescaler = 0, speed = t->c_ospeed; prescaler < 4; prescaler++, speed /= 2) if ((ospeed = comspeed(speed, sc->sc_frequency, sc->sc_type)) > 0) break; if (prescaler == 4) return (EINVAL); sc->sc_prescaler = prescaler; } else ospeed = comspeed(t->c_ospeed, sc->sc_frequency, sc->sc_type); /* Check requested parameters. */ if (ospeed < 0) return (EINVAL); if (t->c_ispeed && t->c_ispeed != t->c_ospeed) return (EINVAL); /* * For the console, always force CLOCAL and !HUPCL, so that the port * is always active. */ if (ISSET(sc->sc_swflags, TIOCFLAG_SOFTCAR) || ISSET(sc->sc_hwflags, COM_HW_CONSOLE)) { SET(t->c_cflag, CLOCAL); CLR(t->c_cflag, HUPCL); } /* * If there were no changes, don't do anything. This avoids dropping * input and improves performance when all we did was frob things like * VMIN and VTIME. */ if (tp->t_ospeed == t->c_ospeed && tp->t_cflag == t->c_cflag) return (0); lcr = ISSET(sc->sc_lcr, LCR_SBREAK) | cflag2lcr(t->c_cflag); mutex_spin_enter(&sc->sc_lock); sc->sc_lcr = lcr; /* * If we're not in a mode that assumes a connection is present, then * ignore carrier changes. */ if (ISSET(t->c_cflag, CLOCAL | MDMBUF)) sc->sc_msr_dcd = 0; else sc->sc_msr_dcd = MSR_DCD; /* * Set the flow control pins depending on the current flow control * mode. */ if (ISSET(t->c_cflag, CRTSCTS)) { sc->sc_mcr_dtr = MCR_DTR; sc->sc_mcr_rts = MCR_RTS; sc->sc_msr_cts = MSR_CTS; if (ISSET(sc->sc_hwflags, COM_HW_AFE)) { SET(sc->sc_mcr, MCR_AFE); } else { sc->sc_efr = EFR_AUTORTS | EFR_AUTOCTS; } } else if (ISSET(t->c_cflag, MDMBUF)) { /* * For DTR/DCD flow control, make sure we don't toggle DTR for * carrier detection. */ sc->sc_mcr_dtr = 0; sc->sc_mcr_rts = MCR_DTR; sc->sc_msr_cts = MSR_DCD; if (ISSET(sc->sc_hwflags, COM_HW_AFE)) { CLR(sc->sc_mcr, MCR_AFE); } else { sc->sc_efr = 0; } } else { /* * If no flow control, then always set RTS. This will make * the other side happy if it mistakenly thinks we're doing * RTS/CTS flow control. */ sc->sc_mcr_dtr = MCR_DTR | MCR_RTS; sc->sc_mcr_rts = 0; sc->sc_msr_cts = 0; if (ISSET(sc->sc_hwflags, COM_HW_AFE)) { CLR(sc->sc_mcr, MCR_AFE); } else { sc->sc_efr = 0; } if (ISSET(sc->sc_mcr, MCR_DTR)) SET(sc->sc_mcr, MCR_RTS); else CLR(sc->sc_mcr, MCR_RTS); } sc->sc_msr_mask = sc->sc_msr_cts | sc->sc_msr_dcd; if (t->c_ospeed == 0 && tp->t_ospeed != 0) CLR(sc->sc_mcr, sc->sc_mcr_dtr); else if (t->c_ospeed != 0 && tp->t_ospeed == 0) SET(sc->sc_mcr, sc->sc_mcr_dtr); sc->sc_dlbl = ospeed; sc->sc_dlbh = ospeed >> 8; /* * Set the FIFO threshold based on the receive speed. * * * If it's a low speed, it's probably a mouse or some other * interactive device, so set the threshold low. * * If it's a high speed, trim the trigger level down to prevent * overflows. * * Otherwise set it a bit higher. */ if (sc->sc_type == COM_TYPE_HAYESP) { sc->sc_fifo = FIFO_DMA_MODE | FIFO_ENABLE | FIFO_TRIGGER_8; } else if (sc->sc_type == COM_TYPE_TEGRA) { sc->sc_fifo = FIFO_ENABLE | FIFO_TRIGGER_1; } else if (ISSET(sc->sc_hwflags, COM_HW_FIFO)) { if (t->c_ospeed <= 1200) sc->sc_fifo = FIFO_ENABLE | FIFO_TRIGGER_1; else if (t->c_ospeed <= 38400) sc->sc_fifo = FIFO_ENABLE | FIFO_TRIGGER_8; else sc->sc_fifo = FIFO_ENABLE | FIFO_TRIGGER_4; } else { sc->sc_fifo = 0; } if (sc->sc_type == COM_TYPE_INGENIC) sc->sc_fifo |= FIFO_UART_ON; /* And copy to tty. 
*/ tp->t_ispeed = t->c_ospeed; tp->t_ospeed = t->c_ospeed; tp->t_cflag = t->c_cflag; if (!sc->sc_heldchange) { if (sc->sc_tx_busy) { sc->sc_heldtbc = sc->sc_tbc; sc->sc_tbc = 0; sc->sc_heldchange = 1; } else com_loadchannelregs(sc); } if (!ISSET(t->c_cflag, CHWFLOW)) { /* Disable the high water mark. */ sc->sc_r_hiwat = 0; sc->sc_r_lowat = 0; if (ISSET(sc->sc_rx_flags, RX_TTY_OVERFLOWED)) { CLR(sc->sc_rx_flags, RX_TTY_OVERFLOWED); com_schedrx(sc); } if (ISSET(sc->sc_rx_flags, RX_TTY_BLOCKED|RX_IBUF_BLOCKED)) { CLR(sc->sc_rx_flags, RX_TTY_BLOCKED|RX_IBUF_BLOCKED); com_hwiflow(sc); } } else { sc->sc_r_hiwat = com_rbuf_hiwat; sc->sc_r_lowat = com_rbuf_lowat; } mutex_spin_exit(&sc->sc_lock); /* * Update the tty layer's idea of the carrier bit, in case we changed * CLOCAL or MDMBUF. We don't hang up here; we only do that by * explicit request. */ if (sc->sc_type == COM_TYPE_INGENIC) { /* no DCD here */ (void) (*tp->t_linesw->l_modem)(tp, 1); } else (void) (*tp->t_linesw->l_modem)(tp, ISSET(sc->sc_msr, MSR_DCD)); #ifdef COM_DEBUG if (com_debug) comstatus(sc, "comparam "); #endif if (!ISSET(t->c_cflag, CHWFLOW)) { if (sc->sc_tx_stopped) { sc->sc_tx_stopped = 0; comstart(tp); } } return (0); } void com_iflush(struct com_softc *sc) { struct com_regs *regsp = &sc->sc_regs; uint8_t fifo; #ifdef DIAGNOSTIC int reg; #endif int timo; #ifdef DIAGNOSTIC reg = 0xffff; #endif timo = 50000; /* flush any pending I/O */ while (ISSET(CSR_READ_1(regsp, COM_REG_LSR), LSR_RXRDY) && --timo) #ifdef DIAGNOSTIC reg = #else (void) #endif CSR_READ_1(regsp, COM_REG_RXDATA); #ifdef DIAGNOSTIC if (!timo) aprint_error_dev(sc->sc_dev, "com_iflush timeout %02x\n", reg); #endif switch (sc->sc_type) { case COM_TYPE_16750: case COM_TYPE_DW_APB: /* * Reset all Rx/Tx FIFO, preserve current FIFO length. * This should prevent triggering busy interrupt while * manipulating divisors. */ fifo = CSR_READ_1(regsp, COM_REG_FIFO) & (FIFO_TRIGGER_1 | FIFO_TRIGGER_4 | FIFO_TRIGGER_8 | FIFO_TRIGGER_14); CSR_WRITE_1(regsp, COM_REG_FIFO, fifo | FIFO_ENABLE | FIFO_RCV_RST | FIFO_XMT_RST); delay(100); break; } } void com_loadchannelregs(struct com_softc *sc) { struct com_regs *regsp = &sc->sc_regs; /* XXXXX necessary? */ com_iflush(sc); if (sc->sc_type == COM_TYPE_PXA2x0) CSR_WRITE_1(regsp, COM_REG_IER, IER_EUART); else CSR_WRITE_1(regsp, COM_REG_IER, 0); if (sc->sc_type == COM_TYPE_OMAP) { /* disable before changing settings */ CSR_WRITE_1(regsp, COM_REG_MDR1, MDR1_MODE_DISABLE); } if (ISSET(sc->sc_hwflags, COM_HW_FLOW)) { KASSERT(sc->sc_type != COM_TYPE_AU1x00); KASSERT(sc->sc_type != COM_TYPE_16550_NOERS); /* no EFR on alchemy */ CSR_WRITE_1(regsp, COM_REG_LCR, LCR_EERS); CSR_WRITE_1(regsp, COM_REG_EFR, sc->sc_efr); } if (sc->sc_type == COM_TYPE_AU1x00) { /* alchemy has single separate 16-bit clock divisor register */ CSR_WRITE_2(regsp, COM_REG_DLBL, sc->sc_dlbl + (sc->sc_dlbh << 8)); } else { CSR_WRITE_1(regsp, COM_REG_LCR, sc->sc_lcr | LCR_DLAB); CSR_WRITE_1(regsp, COM_REG_DLBL, sc->sc_dlbl); CSR_WRITE_1(regsp, COM_REG_DLBH, sc->sc_dlbh); } CSR_WRITE_1(regsp, COM_REG_LCR, sc->sc_lcr); CSR_WRITE_1(regsp, COM_REG_MCR, sc->sc_mcr_active = sc->sc_mcr); CSR_WRITE_1(regsp, COM_REG_FIFO, sc->sc_fifo); if (sc->sc_type == COM_TYPE_HAYESP) { bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD1, HAYESP_SETPRESCALER); bus_space_write_1(regsp->cr_iot, sc->sc_hayespioh, HAYESP_CMD2, sc->sc_prescaler); } if (sc->sc_type == COM_TYPE_OMAP) { /* setup the fifos. 
the FCR value is not used as long as SCR[6] and SCR[7] are 0, which they are at reset and we never touch the SCR register */ uint8_t rx_fifo_trig = 40; uint8_t tx_fifo_trig = 60; uint8_t rx_start = 8; uint8_t rx_halt = 60; uint8_t tlr_value = ((rx_fifo_trig>>2) << 4) | (tx_fifo_trig>>2); uint8_t tcr_value = ((rx_start>>2) << 4) | (rx_halt>>2); /* enable access to TCR & TLR */ CSR_WRITE_1(regsp, COM_REG_MCR, sc->sc_mcr | MCR_TCR_TLR); /* write tcr and tlr values */ CSR_WRITE_1(regsp, COM_REG_TLR, tlr_value); CSR_WRITE_1(regsp, COM_REG_TCR, tcr_value); /* disable access to TCR & TLR */ CSR_WRITE_1(regsp, COM_REG_MCR, sc->sc_mcr); /* enable again, but mode is based on speed */ if (sc->sc_tty->t_termios.c_ospeed > 230400) { CSR_WRITE_1(regsp, COM_REG_MDR1, MDR1_MODE_UART_13X); } else { CSR_WRITE_1(regsp, COM_REG_MDR1, MDR1_MODE_UART_16X); } } CSR_WRITE_1(regsp, COM_REG_IER, sc->sc_ier); } int comhwiflow(struct tty *tp, int block) { struct com_softc *sc = device_lookup_private(&com_cd, COMUNIT(tp->t_dev)); if (COM_ISALIVE(sc) == 0) return (0); if (sc->sc_mcr_rts == 0) return (0); mutex_spin_enter(&sc->sc_lock); if (block) { if (!ISSET(sc->sc_rx_flags, RX_TTY_BLOCKED)) { SET(sc->sc_rx_flags, RX_TTY_BLOCKED); com_hwiflow(sc); } } else { if (ISSET(sc->sc_rx_flags, RX_TTY_OVERFLOWED)) { CLR(sc->sc_rx_flags, RX_TTY_OVERFLOWED); com_schedrx(sc); } if (ISSET(sc->sc_rx_flags, RX_TTY_BLOCKED)) { CLR(sc->sc_rx_flags, RX_TTY_BLOCKED); com_hwiflow(sc); } } mutex_spin_exit(&sc->sc_lock); return (1); } /* * (un)block input via hw flowcontrol */ void com_hwiflow(struct com_softc *sc) { struct com_regs *regsp= &sc->sc_regs; if (sc->sc_mcr_rts == 0) return; if (ISSET(sc->sc_rx_flags, RX_ANY_BLOCK)) { CLR(sc->sc_mcr, sc->sc_mcr_rts); CLR(sc->sc_mcr_active, sc->sc_mcr_rts); } else { SET(sc->sc_mcr, sc->sc_mcr_rts); SET(sc->sc_mcr_active, sc->sc_mcr_rts); } CSR_WRITE_1(regsp, COM_REG_MCR, sc->sc_mcr_active); } void comstart(struct tty *tp) { struct com_softc *sc = device_lookup_private(&com_cd, COMUNIT(tp->t_dev)); struct com_regs *regsp = &sc->sc_regs; if (COM_ISALIVE(sc) == 0) return; if (ISSET(tp->t_state, TS_BUSY | TS_TIMEOUT | TS_TTSTOP)) return; if (sc->sc_tx_stopped) return; if (!ttypull(tp)) return; /* Grab the first contiguous region of buffer space. */ { u_char *tba; int tbc; tba = tp->t_outq.c_cf; tbc = ndqb(&tp->t_outq, 0); mutex_spin_enter(&sc->sc_lock); sc->sc_tba = tba; sc->sc_tbc = tbc; } SET(tp->t_state, TS_BUSY); sc->sc_tx_busy = 1; /* Enable transmit completion interrupts if necessary. */ if (!ISSET(sc->sc_ier, IER_ETXRDY)) { SET(sc->sc_ier, IER_ETXRDY); CSR_WRITE_1(regsp, COM_REG_IER, sc->sc_ier); } /* Output the first chunk of the contiguous buffer. */ if (!ISSET(sc->sc_hwflags, COM_HW_NO_TXPRELOAD)) { u_int n; n = sc->sc_tbc; if (n > sc->sc_fifolen) n = sc->sc_fifolen; CSR_WRITE_MULTI(regsp, COM_REG_TXDATA, sc->sc_tba, n); sc->sc_tbc -= n; sc->sc_tba += n; } mutex_spin_exit(&sc->sc_lock); } /* * Stop output on a line. */ void comstop(struct tty *tp, int flag) { struct com_softc *sc = device_lookup_private(&com_cd, COMUNIT(tp->t_dev)); mutex_spin_enter(&sc->sc_lock); if (ISSET(tp->t_state, TS_BUSY)) { /* Stop transmitting at the next chunk. 
*/ sc->sc_tbc = 0; sc->sc_heldtbc = 0; if (!ISSET(tp->t_state, TS_TTSTOP)) SET(tp->t_state, TS_FLUSH); } mutex_spin_exit(&sc->sc_lock); } void comdiag(void *arg) { struct com_softc *sc = arg; int overflows, floods; mutex_spin_enter(&sc->sc_lock); overflows = sc->sc_overflows; sc->sc_overflows = 0; floods = sc->sc_floods; sc->sc_floods = 0; sc->sc_errors = 0; mutex_spin_exit(&sc->sc_lock); log(LOG_WARNING, "%s: %d silo overflow%s, %d ibuf flood%s\n", device_xname(sc->sc_dev), overflows, overflows == 1 ? "" : "s", floods, floods == 1 ? "" : "s"); } static inline void com_rxsoft(struct com_softc *sc, struct tty *tp) { int (*rint)(int, struct tty *) = tp->t_linesw->l_rint; u_char *get, *end; u_int cc, scc; u_char lsr; int code; end = sc->sc_ebuf; get = sc->sc_rbget; scc = cc = com_rbuf_size - sc->sc_rbavail; if (cc == com_rbuf_size) { sc->sc_floods++; if (sc->sc_errors++ == 0) callout_reset(&sc->sc_diag_callout, 60 * hz, comdiag, sc); } /* If not yet open, drop the entire buffer content here */ if (!ISSET(tp->t_state, TS_ISOPEN)) { get += cc << 1; if (get >= end) get -= com_rbuf_size << 1; cc = 0; } while (cc) { code = get[0]; lsr = get[1]; if (ISSET(lsr, LSR_OE | LSR_BI | LSR_FE | LSR_PE)) { if (ISSET(lsr, LSR_OE)) { sc->sc_overflows++; if (sc->sc_errors++ == 0) callout_reset(&sc->sc_diag_callout, 60 * hz, comdiag, sc); } if (ISSET(lsr, LSR_BI | LSR_FE)) SET(code, TTY_FE); if (ISSET(lsr, LSR_PE)) SET(code, TTY_PE); } if ((*rint)(code, tp) == -1) { /* * The line discipline's buffer is out of space. */ if (!ISSET(sc->sc_rx_flags, RX_TTY_BLOCKED)) { /* * We're either not using flow control, or the * line discipline didn't tell us to block for * some reason. Either way, we have no way to * know when there's more space available, so * just drop the rest of the data. */ get += cc << 1; if (get >= end) get -= com_rbuf_size << 1; cc = 0; } else { /* * Don't schedule any more receive processing * until the line discipline tells us there's * space available (through comhwiflow()). * Leave the rest of the data in the input * buffer. */ SET(sc->sc_rx_flags, RX_TTY_OVERFLOWED); } break; } get += 2; if (get >= end) get = sc->sc_rbuf; cc--; } if (cc != scc) { sc->sc_rbget = get; mutex_spin_enter(&sc->sc_lock); cc = sc->sc_rbavail += scc - cc; /* Buffers should be ok again, release possible block. */ if (cc >= sc->sc_r_lowat) { if (ISSET(sc->sc_rx_flags, RX_IBUF_OVERFLOWED)) { CLR(sc->sc_rx_flags, RX_IBUF_OVERFLOWED); SET(sc->sc_ier, IER_ERXRDY); if (sc->sc_type == COM_TYPE_PXA2x0) SET(sc->sc_ier, IER_ERXTOUT); if (sc->sc_type == COM_TYPE_INGENIC || sc->sc_type == COM_TYPE_TEGRA) SET(sc->sc_ier, IER_ERXTOUT); CSR_WRITE_1(&sc->sc_regs, COM_REG_IER, sc->sc_ier); } if (ISSET(sc->sc_rx_flags, RX_IBUF_BLOCKED)) { CLR(sc->sc_rx_flags, RX_IBUF_BLOCKED); com_hwiflow(sc); } } mutex_spin_exit(&sc->sc_lock); } } static inline void com_txsoft(struct com_softc *sc, struct tty *tp) { CLR(tp->t_state, TS_BUSY); if (ISSET(tp->t_state, TS_FLUSH)) CLR(tp->t_state, TS_FLUSH); else ndflush(&tp->t_outq, (int)(sc->sc_tba - tp->t_outq.c_cf)); (*tp->t_linesw->l_start)(tp); } static inline void com_stsoft(struct com_softc *sc, struct tty *tp) { u_char msr, delta; mutex_spin_enter(&sc->sc_lock); msr = sc->sc_msr; delta = sc->sc_msr_delta; sc->sc_msr_delta = 0; mutex_spin_exit(&sc->sc_lock); if (ISSET(delta, sc->sc_msr_dcd)) { /* * Inform the tty layer that carrier detect changed. */ (void) (*tp->t_linesw->l_modem)(tp, ISSET(msr, MSR_DCD)); } if (ISSET(delta, sc->sc_msr_cts)) { /* Block or unblock output according to flow control. 
*/ if (ISSET(msr, sc->sc_msr_cts)) { sc->sc_tx_stopped = 0; (*tp->t_linesw->l_start)(tp); } else { sc->sc_tx_stopped = 1; } } #ifdef COM_DEBUG if (com_debug) comstatus(sc, "com_stsoft"); #endif } void comsoft(void *arg) { struct com_softc *sc = arg; struct tty *tp; if (COM_ISALIVE(sc) == 0) return; tp = sc->sc_tty; if (sc->sc_rx_ready) { sc->sc_rx_ready = 0; com_rxsoft(sc, tp); } if (sc->sc_st_check) { sc->sc_st_check = 0; com_stsoft(sc, tp); } if (sc->sc_tx_done) { sc->sc_tx_done = 0; com_txsoft(sc, tp); } } int comintr(void *arg) { struct com_softc *sc = arg; struct com_regs *regsp = &sc->sc_regs; u_char *put, *end; u_int cc; u_char lsr, iir; if (COM_ISALIVE(sc) == 0) return (0); KASSERT(regsp != NULL); mutex_spin_enter(&sc->sc_lock); iir = CSR_READ_1(regsp, COM_REG_IIR); /* Handle ns16750-specific busy interrupt. */ if (sc->sc_type == COM_TYPE_16750 && (iir & IIR_BUSY) == IIR_BUSY) { for (int timeout = 10000; (CSR_READ_1(regsp, COM_REG_USR) & 0x1) != 0; timeout--) if (timeout <= 0) { aprint_error_dev(sc->sc_dev, "timeout while waiting for BUSY interrupt " "acknowledge\n"); mutex_spin_exit(&sc->sc_lock); return (0); } CSR_WRITE_1(regsp, COM_REG_LCR, sc->sc_lcr); iir = CSR_READ_1(regsp, COM_REG_IIR); } /* DesignWare APB UART BUSY interrupt */ if (sc->sc_type == COM_TYPE_DW_APB && (iir & IIR_BUSY) == IIR_BUSY) { if (ISSET(sc->sc_hwflags, COM_HW_CONSOLE)) { (void)CSR_READ_1(regsp, COM_REG_USR); } else if ((CSR_READ_1(regsp, COM_REG_USR) & 0x1) != 0) { CSR_WRITE_1(regsp, COM_REG_HALT, HALT_CHCFG_EN); CSR_WRITE_1(regsp, COM_REG_LCR, sc->sc_lcr | LCR_DLAB); CSR_WRITE_1(regsp, COM_REG_DLBL, sc->sc_dlbl); CSR_WRITE_1(regsp, COM_REG_DLBH, sc->sc_dlbh); CSR_WRITE_1(regsp, COM_REG_LCR, sc->sc_lcr); CSR_WRITE_1(regsp, COM_REG_HALT, HALT_CHCFG_EN | HALT_CHCFG_UD); for (int timeout = 10000000; (CSR_READ_1(regsp, COM_REG_HALT) & HALT_CHCFG_UD) != 0; timeout--) { if (timeout <= 0) { aprint_error_dev(sc->sc_dev, "timeout while waiting for HALT " "update acknowledge 0x%x 0x%x\n", CSR_READ_1(regsp, COM_REG_HALT), CSR_READ_1(regsp, COM_REG_USR)); break; } } CSR_WRITE_1(regsp, COM_REG_HALT, 0); (void)CSR_READ_1(regsp, COM_REG_USR); } else { CSR_WRITE_1(regsp, COM_REG_LCR, sc->sc_lcr | LCR_DLAB); CSR_WRITE_1(regsp, COM_REG_DLBL, sc->sc_dlbl); CSR_WRITE_1(regsp, COM_REG_DLBH, sc->sc_dlbh); CSR_WRITE_1(regsp, COM_REG_LCR, sc->sc_lcr); } } end = sc->sc_ebuf; put = sc->sc_rbput; cc = sc->sc_rbavail; if (ISSET(iir, IIR_NOPEND)) { if (ISSET(sc->sc_hwflags, COM_HW_BROKEN_ETXRDY)) goto do_tx; mutex_spin_exit(&sc->sc_lock); return (0); } again: do { u_char msr, delta; lsr = CSR_READ_1(regsp, COM_REG_LSR); if (ISSET(lsr, LSR_BI)) { int cn_trapped = 0; /* see above: cn_trap() */ cn_check_magic(sc->sc_tty->t_dev, CNC_BREAK, com_cnm_state); if (cn_trapped) continue; #if defined(KGDB) && !defined(DDB) if (ISSET(sc->sc_hwflags, COM_HW_KGDB)) { kgdb_connect(1); continue; } #endif } if (sc->sc_type == COM_TYPE_BCMAUXUART && ISSET(iir, IIR_RXRDY)) lsr |= LSR_RXRDY; if (ISSET(lsr, LSR_RCV_MASK) && !ISSET(sc->sc_rx_flags, RX_IBUF_OVERFLOWED)) { while (cc > 0) { int cn_trapped = 0; put[0] = CSR_READ_1(regsp, COM_REG_RXDATA); put[1] = lsr; cn_check_magic(sc->sc_tty->t_dev, put[0], com_cnm_state); if (cn_trapped) goto next; put += 2; if (put >= end) put = sc->sc_rbuf; cc--; next: lsr = CSR_READ_1(regsp, COM_REG_LSR); if (!ISSET(lsr, LSR_RCV_MASK)) break; } /* * Current string of incoming characters ended because * no more data was available or we ran out of space. * Schedule a receive event if any data was received. 
* If we're out of space, turn off receive interrupts. */ sc->sc_rbput = put; sc->sc_rbavail = cc; if (!ISSET(sc->sc_rx_flags, RX_TTY_OVERFLOWED)) sc->sc_rx_ready = 1; /* * See if we are in danger of overflowing a buffer. If * so, use hardware flow control to ease the pressure. */ if (!ISSET(sc->sc_rx_flags, RX_IBUF_BLOCKED) && cc < sc->sc_r_hiwat) { SET(sc->sc_rx_flags, RX_IBUF_BLOCKED); com_hwiflow(sc); } /* * If we're out of space, disable receive interrupts * until the queue has drained a bit. */ if (!cc) { SET(sc->sc_rx_flags, RX_IBUF_OVERFLOWED); switch (sc->sc_type) { case COM_TYPE_PXA2x0: CLR(sc->sc_ier, IER_ERXRDY|IER_ERXTOUT); break; case COM_TYPE_INGENIC: case COM_TYPE_TEGRA: CLR(sc->sc_ier, IER_ERXRDY | IER_ERXTOUT); break; default: CLR(sc->sc_ier, IER_ERXRDY); break; } CSR_WRITE_1(regsp, COM_REG_IER, sc->sc_ier); } } else { if ((iir & (IIR_RXRDY|IIR_TXRDY)) == IIR_RXRDY) { (void) CSR_READ_1(regsp, COM_REG_RXDATA); continue; } } msr = CSR_READ_1(regsp, COM_REG_MSR); delta = msr ^ sc->sc_msr; sc->sc_msr = msr; if ((sc->sc_pps_state.ppsparam.mode & PPS_CAPTUREBOTH) && (delta & MSR_DCD)) { mutex_spin_enter(&timecounter_lock); pps_capture(&sc->sc_pps_state); pps_event(&sc->sc_pps_state, (msr & MSR_DCD) ? PPS_CAPTUREASSERT : PPS_CAPTURECLEAR); mutex_spin_exit(&timecounter_lock); } /* * Process normal status changes */ if (ISSET(delta, sc->sc_msr_mask)) { SET(sc->sc_msr_delta, delta); /* * Stop output immediately if we lose the output * flow control signal or carrier detect. */ if (ISSET(~msr, sc->sc_msr_mask)) { sc->sc_tbc = 0; sc->sc_heldtbc = 0; #ifdef COM_DEBUG if (com_debug) comstatus(sc, "comintr "); #endif } sc->sc_st_check = 1; } } while (!ISSET((iir = CSR_READ_1(regsp, COM_REG_IIR)), IIR_NOPEND) && /* * Since some device (e.g., ST16C1550) doesn't clear IIR_TXRDY * by IIR read, so we can't do this way: `process all interrupts, * then do TX if possible'. */ (iir & IIR_IMASK) != IIR_TXRDY); do_tx: /* * Read LSR again, since there may be an interrupt between * the last LSR read and IIR read above. */ lsr = CSR_READ_1(regsp, COM_REG_LSR); /* * See if data can be transmitted as well. * Schedule tx done event if no data left * and tty was marked busy. */ if (ISSET(lsr, LSR_TXRDY)) { /* * If we've delayed a parameter change, do it now, and restart * output. */ if (sc->sc_heldchange) { com_loadchannelregs(sc); sc->sc_heldchange = 0; sc->sc_tbc = sc->sc_heldtbc; sc->sc_heldtbc = 0; } /* Output the next chunk of the contiguous buffer, if any. */ if (sc->sc_tbc > 0) { u_int n; n = sc->sc_tbc; if (n > sc->sc_fifolen) n = sc->sc_fifolen; CSR_WRITE_MULTI(regsp, COM_REG_TXDATA, sc->sc_tba, n); sc->sc_tbc -= n; sc->sc_tba += n; } else { /* Disable transmit completion interrupts if necessary. */ if (ISSET(sc->sc_ier, IER_ETXRDY)) { CLR(sc->sc_ier, IER_ETXRDY); CSR_WRITE_1(regsp, COM_REG_IER, sc->sc_ier); } if (sc->sc_tx_busy) { sc->sc_tx_busy = 0; sc->sc_tx_done = 1; } } } if (!ISSET((iir = CSR_READ_1(regsp, COM_REG_IIR)), IIR_NOPEND)) goto again; mutex_spin_exit(&sc->sc_lock); /* Wake up the poller. */ if ((sc->sc_rx_ready | sc->sc_st_check | sc->sc_tx_done) != 0) softint_schedule(sc->sc_si); #ifdef RND_COM rnd_add_uint32(&sc->rnd_source, iir | lsr); #endif return (1); } /* * The following functions are polled getc and putc routines, shared * by the console and kgdb glue. * * The read-ahead code is so that you can detect pending in-band * cn_magic in polled mode while doing output rather than having to * wait until the kernel decides it needs input. 
*/ #define MAX_READAHEAD 20 static int com_readahead[MAX_READAHEAD]; static int com_readaheadcount = 0; int com_common_getc(dev_t dev, struct com_regs *regsp) { int s = splserial(); u_char stat, c; /* got a character from reading things earlier */ if (com_readaheadcount > 0) { int i; c = com_readahead[0]; for (i = 1; i < com_readaheadcount; i++) { com_readahead[i-1] = com_readahead[i]; } com_readaheadcount--; splx(s); return (c); } /* don't block until a character becomes available */ if (!ISSET(stat = CSR_READ_1(regsp, COM_REG_LSR), LSR_RXRDY)) { splx(s); return -1; } c = CSR_READ_1(regsp, COM_REG_RXDATA); stat = CSR_READ_1(regsp, COM_REG_IIR); { int cn_trapped = 0; /* required by cn_trap, see above */ if (!db_active) cn_check_magic(dev, c, com_cnm_state); } splx(s); return (c); } static void com_common_putc(dev_t dev, struct com_regs *regsp, int c, int with_readahead) { int s = splserial(); int cin, stat, timo; if (with_readahead && com_readaheadcount < MAX_READAHEAD && ISSET(stat = CSR_READ_1(regsp, COM_REG_LSR), LSR_RXRDY)) { int cn_trapped = 0; cin = CSR_READ_1(regsp, COM_REG_RXDATA); stat = CSR_READ_1(regsp, COM_REG_IIR); cn_check_magic(dev, cin, com_cnm_state); com_readahead[com_readaheadcount++] = cin; } /* wait for any pending transmission to finish */ timo = 150000; while (!ISSET(CSR_READ_1(regsp, COM_REG_LSR), LSR_TXRDY) && --timo) continue; CSR_WRITE_1(regsp, COM_REG_TXDATA, c); COM_BARRIER(regsp, BR | BW); splx(s); } /* * Initialize UART for use as console or KGDB line. */ int cominit(struct com_regs *regsp, int rate, int frequency, int type, tcflag_t cflag) { if (bus_space_map(regsp->cr_iot, regsp->cr_iobase, regsp->cr_nports, 0, &regsp->cr_ioh)) return (ENOMEM); /* ??? */ if (type == COM_TYPE_OMAP) { /* disable before changing settings */ CSR_WRITE_1(regsp, COM_REG_MDR1, MDR1_MODE_DISABLE); } rate = comspeed(rate, frequency, type); if (rate != -1) { if (type == COM_TYPE_AU1x00) { /* no EFR on alchemy */ CSR_WRITE_2(regsp, COM_REG_DLBL, rate); } else { if ((type != COM_TYPE_16550_NOERS) && (type != COM_TYPE_INGENIC)) { CSR_WRITE_1(regsp, COM_REG_LCR, LCR_EERS); CSR_WRITE_1(regsp, COM_REG_EFR, 0); } CSR_WRITE_1(regsp, COM_REG_LCR, LCR_DLAB); CSR_WRITE_1(regsp, COM_REG_DLBL, rate & 0xff); CSR_WRITE_1(regsp, COM_REG_DLBH, rate >> 8); } } CSR_WRITE_1(regsp, COM_REG_LCR, cflag2lcr(cflag)); CSR_WRITE_1(regsp, COM_REG_MCR, MCR_DTR | MCR_RTS); if (type == COM_TYPE_INGENIC) { CSR_WRITE_1(regsp, COM_REG_FIFO, FIFO_ENABLE | FIFO_RCV_RST | FIFO_XMT_RST | FIFO_TRIGGER_1 | FIFO_UART_ON); } else { CSR_WRITE_1(regsp, COM_REG_FIFO, FIFO_ENABLE | FIFO_RCV_RST | FIFO_XMT_RST | FIFO_TRIGGER_1); } if (type == COM_TYPE_OMAP) { /* setup the fifos. 
the FCR value is not used as long as SCR[6] and SCR[7] are 0, which they are at reset and we never touch the SCR register */ uint8_t rx_fifo_trig = 40; uint8_t tx_fifo_trig = 60; uint8_t rx_start = 8; uint8_t rx_halt = 60; uint8_t tlr_value = ((rx_fifo_trig>>2) << 4) | (tx_fifo_trig>>2); uint8_t tcr_value = ((rx_start>>2) << 4) | (rx_halt>>2); /* enable access to TCR & TLR */ CSR_WRITE_1(regsp, COM_REG_MCR, MCR_DTR | MCR_RTS | MCR_TCR_TLR); /* write tcr and tlr values */ CSR_WRITE_1(regsp, COM_REG_TLR, tlr_value); CSR_WRITE_1(regsp, COM_REG_TCR, tcr_value); /* disable access to TCR & TLR */ CSR_WRITE_1(regsp, COM_REG_MCR, MCR_DTR | MCR_RTS); /* enable again, but mode is based on speed */ if (rate > 230400) { CSR_WRITE_1(regsp, COM_REG_MDR1, MDR1_MODE_UART_13X); } else { CSR_WRITE_1(regsp, COM_REG_MDR1, MDR1_MODE_UART_16X); } } if (type == COM_TYPE_PXA2x0) CSR_WRITE_1(regsp, COM_REG_IER, IER_EUART); else CSR_WRITE_1(regsp, COM_REG_IER, 0); return (0); } int comcnattach1(struct com_regs *regsp, int rate, int frequency, int type, tcflag_t cflag) { int res; comcons_info.regs = *regsp; res = cominit(&comcons_info.regs, rate, frequency, type, cflag); if (res) return (res); cn_tab = &comcons; cn_init_magic(&com_cnm_state); cn_set_magic("\047\001"); /* default magic is BREAK */ comcons_info.frequency = frequency; comcons_info.type = type; comcons_info.rate = rate; comcons_info.cflag = cflag; return (0); } int comcnattach(bus_space_tag_t iot, bus_addr_t iobase, int rate, int frequency, int type, tcflag_t cflag) { struct com_regs regs; /*XXX*/ bus_space_handle_t dummy_bsh; memset(&dummy_bsh, 0, sizeof(dummy_bsh)); /* * dummy_bsh required because com_init_regs() wants it. A * real bus_space_handle will be filled in by cominit() later. * XXXJRT Detangle this mess eventually, plz. */ com_init_regs(&regs, iot, dummy_bsh/*XXX*/, iobase); return comcnattach1(&regs, rate, frequency, type, cflag); } static int comcnreattach(void) { return comcnattach1(&comcons_info.regs, comcons_info.rate, comcons_info.frequency, comcons_info.type, comcons_info.cflag); } int comcngetc(dev_t dev) { return (com_common_getc(dev, &comcons_info.regs)); } /* * Console kernel output character routine. 
*/ void comcnputc(dev_t dev, int c) { com_common_putc(dev, &comcons_info.regs, c, cold); } void comcnpollc(dev_t dev, int on) { com_readaheadcount = 0; } #ifdef KGDB int com_kgdb_attach1(struct com_regs *regsp, int rate, int frequency, int type, tcflag_t cflag) { int res; if (bus_space_is_equal(regsp->cr_iot, comcons_info.regs.cr_iot) && regsp->cr_iobase == comcons_info.regs.cr_iobase) { #if !defined(DDB) return (EBUSY); /* cannot share with console */ #else comkgdbregs = *regsp; comkgdbregs.cr_ioh = comcons_info.regs.cr_ioh; #endif } else { comkgdbregs = *regsp; res = cominit(&comkgdbregs, rate, frequency, type, cflag); if (res) return (res); /* * XXXfvdl this shouldn't be needed, but the cn_magic goo * expects this to be initialized */ cn_init_magic(&com_cnm_state); cn_set_magic("\047\001"); } kgdb_attach(com_kgdb_getc, com_kgdb_putc, NULL); kgdb_dev = 123; /* unneeded, only to satisfy some tests */ return (0); } int com_kgdb_attach(bus_space_tag_t iot, bus_addr_t iobase, int rate, int frequency, int type, tcflag_t cflag) { struct com_regs regs; com_init_regs(&regs, iot, (bus_space_handle_t)0/*XXX*/, iobase); return com_kgdb_attach1(&regs, rate, frequency, type, cflag); } /* ARGSUSED */ int com_kgdb_getc(void *arg) { return (com_common_getc(NODEV, &comkgdbregs)); } /* ARGSUSED */ void com_kgdb_putc(void *arg, int c) { com_common_putc(NODEV, &comkgdbregs, c, 0); } #endif /* KGDB */ /* * helper function to identify the com ports used by * console or KGDB (and not yet autoconf attached) */ int com_is_console(bus_space_tag_t iot, bus_addr_t iobase, bus_space_handle_t *ioh) { bus_space_handle_t help; if (!comconsattached && bus_space_is_equal(iot, comcons_info.regs.cr_iot) && iobase == comcons_info.regs.cr_iobase) help = comcons_info.regs.cr_ioh; #ifdef KGDB else if (!com_kgdb_attached && bus_space_is_equal(iot, comkgdbregs.cr_iot) && iobase == comkgdbregs.cr_iobase) help = comkgdbregs.cr_ioh; #endif else return (0); if (ioh) *ioh = help; return (1); } /* * this routine exists to serve as a shutdown hook for systems that * have firmware which doesn't interact properly with a com device in * FIFO mode. */ bool com_cleanup(device_t self, int how) { struct com_softc *sc = device_private(self); if (ISSET(sc->sc_hwflags, COM_HW_FIFO)) CSR_WRITE_1(&sc->sc_regs, COM_REG_FIFO, 0); return true; } bool com_suspend(device_t self, const pmf_qual_t *qual) { struct com_softc *sc = device_private(self); CSR_WRITE_1(&sc->sc_regs, COM_REG_IER, 0); (void)CSR_READ_1(&sc->sc_regs, COM_REG_IIR); return true; } bool com_resume(device_t self, const pmf_qual_t *qual) { struct com_softc *sc = device_private(self); mutex_spin_enter(&sc->sc_lock); com_loadchannelregs(sc); mutex_spin_exit(&sc->sc_lock); return true; }
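/*
 * Editor's aside (illustrative sketch, not part of the driver): the divisor
 * that comparam() splits into sc_dlbl/sc_dlbh follows the standard 16550
 * relation divisor = frequency / (16 * baud).  The helper below is a
 * hypothetical standalone version of that arithmetic; the real comspeed()
 * used above also applies rounding and type-specific checks not shown here.
 */
#include <stdio.h>

static int
example_divisor(long frequency, long baud)
{
	if (baud <= 0 || frequency <= 0)
		return -1;
	return (int)(frequency / (16 * baud));	/* truncating sketch */
}

int
main(void)
{
	/* 1.8432 MHz reference clock at 115200 baud -> divisor 1 */
	printf("divisor = %d\n", example_divisor(1843200, 115200));
	/* same clock at 9600 baud -> divisor 12 */
	printf("divisor = %d\n", example_divisor(1843200, 9600));
	return 0;
}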
/* $NetBSD: procfs_subr.c,v 1.117 2024/01/17 10:20:12 hannken Exp $ */ /*- * Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95 */ /* * Copyright (c) 1994 Christopher G. Demetriou. All rights reserved. * Copyright (c) 1993 Jan-Simon Pendry * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: procfs_subr.c,v 1.117 2024/01/17 10:20:12 hannken Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/fstrans.h> #include <sys/vnode.h> #include <sys/stat.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/kauth.h> #include <sys/sysctl.h> #include <miscfs/procfs/procfs.h> /* * Allocate a pfsnode/vnode pair. The vnode is referenced. * The pid, type, and file descriptor uniquely identify a pfsnode. */ int procfs_allocvp(struct mount *mp, struct vnode **vpp, pid_t pid, pfstype type, int fd) { struct pfskey key; memset(&key, 0, sizeof(key)); key.pk_type = type; key.pk_pid = pid; key.pk_fd = fd; return vcache_get(mp, &key, sizeof(key), vpp); } int procfs_rw(void *v) { struct vop_read_args *ap = v; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *curl; struct lwp *l; struct pfsnode *pfs = VTOPFS(vp); struct proc *p; int error; if (uio->uio_offset < 0) return EINVAL; if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH)) != 0) return error; curl = curlwp; /* * Do not allow init to be modified while in secure mode; it * could be duped into changing the security level. */ #define M2K(m) ((m) == UIO_READ ? KAUTH_REQ_PROCESS_PROCFS_READ : \ KAUTH_REQ_PROCESS_PROCFS_WRITE) mutex_enter(p->p_lock); error = kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_PROCFS, p, pfs, KAUTH_ARG(M2K(uio->uio_rw)), NULL); mutex_exit(p->p_lock); if (error) { procfs_proc_unlock(p); return (error); } #undef M2K mutex_enter(p->p_lock); LIST_FOREACH(l, &p->p_lwps, l_sibling) { if (l->l_stat != LSZOMB) break; } /* Process is exiting if no-LWPS or all LWPs are LSZOMB */ if (l == NULL) { mutex_exit(p->p_lock); procfs_proc_unlock(p); return ESRCH; } lwp_addref(l); mutex_exit(p->p_lock); switch (pfs->pfs_type) { case PFSnote: case PFSnotepg: error = procfs_donote(curl, p, pfs, uio); break; case PFSregs: error = procfs_doregs(curl, l, pfs, uio); break; case PFSfpregs: error = procfs_dofpregs(curl, l, pfs, uio); break; case PFSstatus: error = procfs_dostatus(curl, l, pfs, uio); break; case PFSstat: error = procfs_do_pid_stat(curl, l, pfs, uio); break; case PFSlimit: error = procfs_dolimit(curl, p, pfs, uio); break; case PFSmap: error = procfs_domap(curl, p, pfs, uio, 0); break; case PFSmaps: error = procfs_domap(curl, p, pfs, uio, 1); break; case PFSmem: error = procfs_domem(curl, l, pfs, uio); break; case PFScmdline: error = procfs_doprocargs(curl, p, pfs, uio, KERN_PROC_ARGV); break; case PFSenviron: error = procfs_doprocargs(curl, p, pfs, uio, KERN_PROC_ENV); break; case PFSmeminfo: error = procfs_domeminfo(curl, p, pfs, uio); break; case PFSdevices: error = procfs_dodevices(curl, p, pfs, uio); break; case PFScpuinfo: error = procfs_docpuinfo(curl, p, pfs, uio); break; case PFScpustat: error = procfs_docpustat(curl, p, pfs, uio); break; case PFSloadavg: error = procfs_doloadavg(curl, p, pfs, uio); break; case PFSstatm: error = procfs_do_pid_statm(curl, l, pfs, uio); break; case PFSfd: error = procfs_dofd(curl, p, pfs, uio); break; case PFSuptime: error = procfs_douptime(curl, p, pfs, uio); break; case PFSmounts: error = procfs_domounts(curl, p, pfs, uio); break; case PFSemul: error = procfs_doemul(curl, p, pfs, uio); break; case PFSversion: error = procfs_doversion(curl, p, pfs, uio); break; case PFSauxv: error = procfs_doauxv(curl, p, pfs, uio); break; #ifdef __HAVE_PROCFS_MACHDEP 
PROCFS_MACHDEP_NODETYPE_CASES error = procfs_machdep_rw(curl, l, pfs, uio); break; #endif default: error = EOPNOTSUPP; break; } /* * Release the references that we acquired earlier. */ lwp_delref(l); procfs_proc_unlock(p); return (error); } /* * Get a string from userland into (bf). Strip a trailing * nl character (to allow easy access from the shell). * The buffer should be *buflenp + 1 chars long. vfs_getuserstr * will automatically add a nul char at the end. * * Returns 0 on success or the following errors * * EINVAL: file offset is non-zero. * EMSGSIZE: message is longer than kernel buffer * EFAULT: user i/o buffer is not addressable */ int vfs_getuserstr(struct uio *uio, char *bf, int *buflenp) { size_t xlen; int error; if (uio->uio_offset != 0) return (EINVAL); xlen = *buflenp; /* must be able to read the whole string in one go */ if (xlen < uio->uio_resid) return (EMSGSIZE); xlen = uio->uio_resid; if ((error = uiomove(bf, xlen, uio)) != 0) return (error); /* allow multiple writes without seeks */ uio->uio_offset = 0; /* cleanup string and remove trailing newline */ bf[xlen] = '\0'; xlen = strlen(bf); if (xlen > 0 && bf[xlen-1] == '\n') bf[--xlen] = '\0'; *buflenp = xlen; return (0); } const vfs_namemap_t * vfs_findname(const vfs_namemap_t *nm, const char *bf, int buflen) { for (; nm->nm_name; nm++) if (memcmp(bf, nm->nm_name, buflen+1) == 0) return (nm); return (0); } bool procfs_use_linux_compat(struct mount *mp) { const int flags = VFSTOPROC(mp)->pmnt_flags; return (flags & PROCFSMNT_LINUXCOMPAT) ? true : false; } struct proc * procfs_proc_find(struct mount *mp, pid_t pid) { KASSERT(mutex_owned(&proc_lock)); return procfs_use_linux_compat(mp) ? proc_find_lwpid(pid) : proc_find(pid); } int procfs_proc_lock(struct mount *mp, int pid, struct proc **bunghole, int notfound) { struct proc *tp; int error = 0; mutex_enter(&proc_lock); if (pid == 0) tp = &proc0; else if ((tp = procfs_proc_find(mp, pid)) == NULL) error = notfound; if (tp != NULL && !rw_tryenter(&tp->p_reflock, RW_READER)) error = EBUSY; mutex_exit(&proc_lock); *bunghole = tp; return error; } void procfs_proc_unlock(struct proc *p) { rw_exit(&p->p_reflock); } int procfs_doemul(struct lwp *curl, struct proc *p, struct pfsnode *pfs, struct uio *uio) { const char *ename = p->p_emul->e_name; return uiomove_frombuf(__UNCONST(ename), strlen(ename), uio); }
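/*
 * Editor's aside (hypothetical sketch, not kernel code): the
 * vfs_getuserstr()/vfs_findname() pair above implements a small
 * "write a command word" protocol: copy in the user's string, strip a
 * trailing newline, then match it against a name table.  The fragment
 * below mirrors that flow in plain userland C; struct name_entry,
 * cmd_table and handle_command() are made up for illustration only.
 */
#include <stdio.h>
#include <string.h>

struct name_entry {
	const char *name;
	int value;
};

static const struct name_entry cmd_table[] = {
	{ "attach", 1 },
	{ "detach", 2 },
	{ NULL, 0 },
};

static int
handle_command(char *buf)
{
	size_t len = strlen(buf);

	/* strip a trailing newline, as vfs_getuserstr() does */
	if (len > 0 && buf[len - 1] == '\n')
		buf[--len] = '\0';

	/* linear table search, as vfs_findname() does */
	for (const struct name_entry *ne = cmd_table; ne->name != NULL; ne++)
		if (strcmp(buf, ne->name) == 0)
			return ne->value;
	return -1;	/* no match: a real caller would return an errno */
}

int
main(void)
{
	char cmd[] = "attach\n";

	printf("command code = %d\n", handle_command(cmd));	/* prints 1 */
	return 0;
}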
/* $NetBSD: bt_proto.c,v 1.17 2023/08/07 13:31:54 riastradh Exp $ */ /*- * Copyright (c) 2005 Iain Hibbert. * Copyright (c) 2006 Itronix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of Itronix Inc. may not be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: bt_proto.c,v 1.17 2023/08/07 13:31:54 riastradh Exp $"); #include <sys/param.h> #include <sys/domain.h> #include <sys/kernel.h> #include <sys/module.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/systm.h> #include <net/route.h> #include <netbt/bluetooth.h> #include <netbt/hci.h> #include <netbt/l2cap.h> #include <netbt/rfcomm.h> #include <netbt/sco.h> DOMAIN_DEFINE(btdomain); /* forward declare and add to link set */ static void bt_init(void); PR_WRAP_CTLOUTPUT(hci_ctloutput) PR_WRAP_CTLOUTPUT(sco_ctloutput) PR_WRAP_CTLOUTPUT(l2cap_ctloutput) PR_WRAP_CTLOUTPUT(rfcomm_ctloutput) #define hci_ctloutput hci_ctloutput_wrapper #define sco_ctloutput sco_ctloutput_wrapper #define l2cap_ctloutput l2cap_ctloutput_wrapper #define rfcomm_ctloutput rfcomm_ctloutput_wrapper static const struct protosw btsw[] = { { /* raw HCI commands */ .pr_type = SOCK_RAW, .pr_domain = &btdomain, .pr_protocol = BTPROTO_HCI, .pr_flags = (PR_ADDR | PR_ATOMIC), .pr_init = hci_init, .pr_ctloutput = hci_ctloutput, .pr_usrreqs = &hci_usrreqs, }, { /* HCI SCO data (audio) */ .pr_type = SOCK_SEQPACKET, .pr_domain = &btdomain, .pr_protocol = BTPROTO_SCO, .pr_flags = (PR_CONNREQUIRED | PR_ATOMIC | PR_LISTEN), .pr_ctloutput = sco_ctloutput, .pr_usrreqs = &sco_usrreqs, }, { /* L2CAP Connection Oriented */ .pr_type = SOCK_SEQPACKET, .pr_domain = &btdomain, .pr_protocol = BTPROTO_L2CAP, .pr_flags = (PR_CONNREQUIRED | PR_ATOMIC | PR_LISTEN), .pr_ctloutput = l2cap_ctloutput, .pr_usrreqs = &l2cap_usrreqs, .pr_init = l2cap_init, }, { /* RFCOMM */ .pr_type = SOCK_STREAM, .pr_domain = &btdomain, .pr_protocol = BTPROTO_RFCOMM, .pr_flags = (PR_CONNREQUIRED | PR_LISTEN | PR_WANTRCVD), .pr_ctloutput = rfcomm_ctloutput, .pr_usrreqs = &rfcomm_usrreqs, .pr_init = rfcomm_init, }, }; struct domain btdomain = { .dom_family = AF_BLUETOOTH, .dom_name = "bluetooth", .dom_init = bt_init, .dom_protosw = btsw, .dom_protoswNPROTOSW = &btsw[__arraycount(btsw)], }; kmutex_t *bt_lock; static void bt_init(void) { } MODULE(MODULE_CLASS_DRIVER, netbt, NULL); static int netbt_modcmd(modcmd_t cmd, void *aux) { switch (cmd) { case MODULE_CMD_INIT: bt_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); return 0; case MODULE_CMD_FINI: return EBUSY; /* XXX */ default: return ENOTTY; } }
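/*
 * Editor's aside (illustrative sketch): the btsw[] protosw table above is
 * what socket(2) consults for AF_BLUETOOTH; a userland program selects an
 * entry by its (type, protocol) pair.  The program below reaches the raw
 * HCI entry.  It assumes the libbluetooth header <bluetooth.h> provides the
 * BTPROTO_* constants; error handling is kept minimal.
 */
#include <sys/socket.h>
#include <bluetooth.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* matches the { SOCK_RAW, BTPROTO_HCI } entry in btsw[] */
	int fd = socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_HCI);

	if (fd == -1) {
		perror("socket");
		return 1;
	}
	printf("raw HCI socket opened\n");
	close(fd);
	return 0;
}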
/* $NetBSD: icmp6.c,v 1.256 2024/02/24 21:41:13 mlelstv Exp $ */ /* $KAME: icmp6.c,v 1.217 2001/06/20 15:03:29 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: icmp6.c,v 1.256 2024/02/24 21:41:13 mlelstv Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_inet.h" #include "opt_ipsec.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kmem.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/domain.h> #include <sys/sysctl.h> #include <net/if.h> #include <net/route.h> #include <net/if_dl.h> #include <net/if_types.h> #include <net/nd.h> #include <netinet/in.h> #include <netinet/in_pcb.h> #include <netinet/in_var.h> #include <netinet/ip6.h> #include <netinet/wqinput.h> #include <netinet6/ip6_var.h> #include <netinet6/ip6_private.h> #include <netinet/icmp6.h> #include <netinet6/icmp6_private.h> #include <netinet6/mld6_var.h> #include <netinet6/in6_pcb.h> #include <netinet6/in6_ifattach.h> #include <netinet6/ip6protosw.h> #include <netinet6/nd6.h> #include <netinet6/scope6_var.h> #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/ipsec6.h> #include <netipsec/key.h> #endif #include "faith.h" #if defined(NFAITH) && 0 < NFAITH #include <net/if_faith.h> #endif /* Ensure that non packed structures are the desired size. */ __CTASSERT(sizeof(struct icmp6_hdr) == 8); __CTASSERT(sizeof(struct icmp6_nodeinfo) == 16); __CTASSERT(sizeof(struct icmp6_namelookup) == 20); __CTASSERT(sizeof(struct icmp6_router_renum) == 16); __CTASSERT(sizeof(struct nd_router_solicit) == 8); __CTASSERT(sizeof(struct nd_router_advert) == 16); __CTASSERT(sizeof(struct nd_neighbor_solicit) == 24); __CTASSERT(sizeof(struct nd_neighbor_advert) == 24); __CTASSERT(sizeof(struct nd_redirect) == 40); __CTASSERT(sizeof(struct nd_opt_hdr) == 2); __CTASSERT(sizeof(struct nd_opt_route_info) == 8); __CTASSERT(sizeof(struct nd_opt_prefix_info) == 32); __CTASSERT(sizeof(struct nd_opt_rd_hdr) == 8); __CTASSERT(sizeof(struct nd_opt_mtu) == 8); __CTASSERT(sizeof(struct nd_opt_nonce) == 2 + ND_OPT_NONCE_LEN); __CTASSERT(sizeof(struct nd_opt_rdnss) == 8); __CTASSERT(sizeof(struct nd_opt_dnssl) == 8); __CTASSERT(sizeof(struct mld_hdr) == 24); __CTASSERT(sizeof(struct ni_reply_fqdn) == 8); __CTASSERT(sizeof(struct rr_pco_match) == 24); __CTASSERT(sizeof(struct rr_pco_use) == 32); __CTASSERT(sizeof(struct rr_result) == 24); extern struct domain inet6domain; percpu_t *icmp6stat_percpu; extern struct inpcbtable raw6cbtable; extern int icmp6errppslim; static int icmp6errpps_count = 0; static struct timeval icmp6errppslim_last; extern int icmp6_nodeinfo; bool icmp6_dynamic_rt_msg = false; /* * List of callbacks to notify when Path MTU changes are made. */ struct icmp6_mtudisc_callback { LIST_ENTRY(icmp6_mtudisc_callback) mc_list; void (*mc_func)(struct in6_addr *); }; LIST_HEAD(, icmp6_mtudisc_callback) icmp6_mtudisc_callbacks = LIST_HEAD_INITIALIZER(&icmp6_mtudisc_callbacks); static struct rttimer_queue *icmp6_mtudisc_timeout_q = NULL; extern int pmtu_expire; /* XXX do these values make any sense? */ static int icmp6_mtudisc_hiwat = 1280; static int icmp6_mtudisc_lowat = 256; /* * keep track of # of redirect routes. 
*/ static struct rttimer_queue *icmp6_redirect_timeout_q = NULL; /* XXX experimental, turned off */ static int icmp6_redirect_hiwat = -1; static int icmp6_redirect_lowat = -1; /* Protect mtudisc and redirect stuffs */ static kmutex_t icmp6_mtx __cacheline_aligned; static bool icmp6_reflect_pmtu = false; static void icmp6_errcount(u_int, int, int); static int icmp6_rip6_input(struct mbuf **, int); static void icmp6_reflect(struct mbuf *, size_t); static int icmp6_ratelimit(const struct in6_addr *, const int, const int); static const char *icmp6_redirect_diag(char *, size_t, struct in6_addr *, struct in6_addr *, struct in6_addr *); static void icmp6_redirect_input(struct mbuf *, int); static struct mbuf *ni6_input(struct mbuf *, int); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs(struct icmp6_nodeinfo *, struct ifnet **, char *, struct psref *); static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int); static int icmp6_notify_error(struct mbuf *, int, int, int); static struct rtentry *icmp6_mtudisc_clone(struct sockaddr *); static void icmp6_mtudisc_timeout(struct rtentry *, struct rttimer *); static void icmp6_redirect_timeout(struct rtentry *, struct rttimer *); static void sysctl_net_inet6_icmp6_setup(struct sysctllog **); /* workqueue-based pr_input */ static struct wqinput *icmp6_wqinput; static void _icmp6_input(struct mbuf *m, int off, int proto); void icmp6_init(void) { sysctl_net_inet6_icmp6_setup(NULL); mld_init(); mutex_init(&icmp6_mtx, MUTEX_DEFAULT, IPL_NONE); mutex_enter(&icmp6_mtx); icmp6_mtudisc_timeout_q = rt_timer_queue_create(pmtu_expire); icmp6_redirect_timeout_q = rt_timer_queue_create(icmp6_redirtimeout); mutex_exit(&icmp6_mtx); icmp6stat_percpu = percpu_alloc(sizeof(uint64_t) * ICMP6_NSTATS); icmp6_wqinput = wqinput_create("icmp6", _icmp6_input); } static void icmp6_errcount(u_int base, int type, int code) { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: ICMP6_STATINC(base + ICMP6_ERRSTAT_DST_UNREACH_NOROUTE); return; case ICMP6_DST_UNREACH_ADMIN: ICMP6_STATINC(base + ICMP6_ERRSTAT_DST_UNREACH_ADMIN); return; case ICMP6_DST_UNREACH_BEYONDSCOPE: ICMP6_STATINC(base + ICMP6_ERRSTAT_DST_UNREACH_BEYONDSCOPE); return; case ICMP6_DST_UNREACH_ADDR: ICMP6_STATINC(base + ICMP6_ERRSTAT_DST_UNREACH_ADDR); return; case ICMP6_DST_UNREACH_NOPORT: ICMP6_STATINC(base + ICMP6_ERRSTAT_DST_UNREACH_NOPORT); return; } break; case ICMP6_PACKET_TOO_BIG: ICMP6_STATINC(base + ICMP6_ERRSTAT_PACKET_TOO_BIG); return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: ICMP6_STATINC(base + ICMP6_ERRSTAT_TIME_EXCEED_TRANSIT); return; case ICMP6_TIME_EXCEED_REASSEMBLY: ICMP6_STATINC(base + ICMP6_ERRSTAT_TIME_EXCEED_REASSEMBLY); return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: ICMP6_STATINC(base + ICMP6_ERRSTAT_PARAMPROB_HEADER); return; case ICMP6_PARAMPROB_NEXTHEADER: ICMP6_STATINC(base + ICMP6_ERRSTAT_PARAMPROB_NEXTHEADER); return; case ICMP6_PARAMPROB_OPTION: ICMP6_STATINC(base + ICMP6_ERRSTAT_PARAMPROB_OPTION); return; } break; case ND_REDIRECT: ICMP6_STATINC(base + ICMP6_ERRSTAT_REDIRECT); return; } ICMP6_STATINC(base + ICMP6_ERRSTAT_UNKNOWN); } /* * Register a Path MTU Discovery callback. 
*/ void icmp6_mtudisc_callback_register(void (*func)(struct in6_addr *)) { struct icmp6_mtudisc_callback *mc, *new; new = kmem_alloc(sizeof(*mc), KM_SLEEP); mutex_enter(&icmp6_mtx); for (mc = LIST_FIRST(&icmp6_mtudisc_callbacks); mc != NULL; mc = LIST_NEXT(mc, mc_list)) { if (mc->mc_func == func) { mutex_exit(&icmp6_mtx); kmem_free(new, sizeof(*mc)); return; } } new->mc_func = func; LIST_INSERT_HEAD(&icmp6_mtudisc_callbacks, new, mc_list); mutex_exit(&icmp6_mtx); } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. */ void icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp, struct in6_addr *src) { struct ip6_hdr *ip6; KASSERT(ifp != NULL); if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) goto out; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) goto out; *src = ip6->ip6_src; icmp6_error(m, type, code, param); return; out: m_freem(m); } /* * Generate an error packet of type error in response to bad IP6 packet. */ void icmp6_error(struct mbuf *m, int type, int code, int param) { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; ICMP6_STATINC(ICMP6_STAT_ERROR); /* count per-type-code statistics */ icmp6_errcount(ICMP6_STAT_OUTERRHIST, type, code); if (m->m_flags & M_DECRYPTED) { ICMP6_STATINC(ICMP6_STAT_CANTERROR); goto freeit; } if (M_UNWRITABLE(m, sizeof(struct ip6_hdr)) && (m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) return; oip6 = mtod(m, struct ip6_hdr *); /* * If the destination address of the erroneous packet is a multicast * address, or the packet was sent using link-layer multicast, * we should basically suppress sending an error (RFC 2463, Section * 2.4). * We have two exceptions (the item e.2 in that section): * - the Packet Too Big message can be sent for path MTU discovery. * - the Parameter Problem Message that can be allowed an icmp6 error * in the option type field. This check has been done in * ip6_unknown_opt(), so we can just check the type and code. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* * RFC 2463, 2.4 (e.5): source address check. * XXX: the case of anycast source? */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, sizeof(*icp)); if (icp == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); return; } if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ ICMP6_STATINC(ICMP6_STAT_CANTERROR); goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */ /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { ICMP6_STATINC(ICMP6_STAT_TOOFREQ); goto freeit; } /* * OK, ICMP6 can be generated. 
*/ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_DONTWAIT); if (m && M_UNWRITABLE(m, preplen)) m = m_pullup(m, preplen); if (m == NULL) { nd6log(LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; in6_clearscope(&oip6->ip6_src); in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); /* * icmp6_reflect() is designed to be in the input path. * icmp6_error() can be called from both input and output path, * and if we are in output path rcvif could contain bogus value. * clear m->m_pkthdr.rcvif for safety, we should have enough scope * information in ip header (nip6). */ m_reset_rcvif(m); ICMP6_STATINC(ICMP6_STAT_OUTHIST + type); /* header order: IPv6 - ICMPv6 */ icmp6_reflect(m, sizeof(struct ip6_hdr)); return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ static void _icmp6_input(struct mbuf *m, int off, int proto) { struct mbuf *n; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int icmp6len = m->m_pkthdr.len - off; int code, sum; struct ifnet *rcvif; struct psref psref; char ip6buf[INET6_ADDRSTRLEN], ip6buf2[INET6_ADDRSTRLEN]; rcvif = m_get_rcvif_psref(m, &psref); if (__predict_false(rcvif == NULL)) goto freeit; #define ICMP6_MAXLEN (sizeof(*nip6) + sizeof(*nicmp6) + 4) KASSERT(ICMP6_MAXLEN < MCLBYTES); icmp6_ifstat_inc(rcvif, ifs6_in_msg); /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ if (icmp6len < sizeof(struct icmp6_hdr)) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); icmp6_ifstat_inc(rcvif, ifs6_in_error); goto freeit; } if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); icmp6_ifstat_inc(rcvif, ifs6_in_error); goto freeit; } } ip6 = mtod(m, struct ip6_hdr *); IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); icmp6_ifstat_inc(rcvif, ifs6_in_error); goto freeit; } /* * Enforce alignment requirements that are violated in * some cases, see kern/50766 for details. */ if (ACCESSIBLE_POINTER(icmp6, struct ip6_hdr) == 0) { m = m_copyup(m, off + sizeof(struct icmp6_hdr), 0); if (m == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); icmp6_ifstat_inc(rcvif, ifs6_in_error); goto freeit; } ip6 = mtod(m, struct ip6_hdr *); icmp6 = (struct icmp6_hdr *)(mtod(m, char *) + off); } KASSERT(ACCESSIBLE_POINTER(icmp6, struct ip6_hdr)); /* * calculate the checksum */ if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log(LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, IN6_PRINT(ip6buf, &ip6->ip6_src)); ICMP6_STATINC(ICMP6_STAT_CHECKSUM); icmp6_ifstat_inc(rcvif, ifs6_in_error); goto freeit; } #if defined(NFAITH) && 0 < NFAITH if (faithprefix(&ip6->ip6_dst)) { /* * Deliver very specific ICMP6 type only. * This is important to deliver TOOBIG. Otherwise PMTUD * will not work. 
*/ switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: case ICMP6_PACKET_TOO_BIG: case ICMP6_TIME_EXCEEDED: break; default: goto freeit; } } #endif code = icmp6->icmp6_code; ICMP6_STATINC(ICMP6_STAT_INHIST + icmp6->icmp6_type); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(rcvif, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(rcvif, ifs6_in_adminprohib); code = PRC_UNREACH_PROTOCOL; /* is this a good code? */ break; case ICMP6_DST_UNREACH_ADDR: code = PRC_HOSTDEAD; break; case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(rcvif, ifs6_in_pkttoobig); /* * MTU is checked in icmp6_mtudisc. */ code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. */ goto deliver; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(rcvif, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: code = PRC_TIMXCEED_INTRANS; break; case ICMP6_TIME_EXCEED_REASSEMBLY: code = PRC_TIMXCEED_REASS; break; default: goto badcode; } goto deliver; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(rcvif, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(rcvif, ifs6_in_echo); if (code != 0) goto badcode; /* * Copy mbuf to send to two data paths: userland socket(s), * and to the querier (echo reply). * m: a copy for socket, n: a copy for querier * * If the first mbuf is shared, or the first mbuf is too short, * copy the first part of the data into a fresh mbuf. * Otherwise, we will wrongly overwrite both copies. */ if ((n = m_copypacket(m, M_DONTWAIT)) == NULL) { /* Give up local */ n = m; m = NULL; } else if (M_UNWRITABLE(n, off + sizeof(struct icmp6_hdr))) { struct mbuf *n0 = n; /* * Prepare an internal mbuf. m_pullup() doesn't * always copy the length we specified. */ if ((n = m_dup(n0, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* Give up local */ n = m; m = NULL; } m_freem(n0); } IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off, sizeof(*nicmp6)); if (nicmp6 == NULL) goto freeit; nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; if (n) { uint64_t *icmp6s = ICMP6_STAT_GETREF(); icmp6s[ICMP6_STAT_REFLECT]++; icmp6s[ICMP6_STAT_OUTHIST + ICMP6_ECHO_REPLY]++; ICMP6_STAT_PUTREF(); icmp6_reflect(n, off); } if (!m) goto freeit; break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(rcvif, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: if (icmp6len < sizeof(struct mld_hdr)) goto badlen; if (icmp6->icmp6_type == MLD_LISTENER_QUERY) /* XXX: ugly... */ icmp6_ifstat_inc(rcvif, ifs6_in_mldquery); else icmp6_ifstat_inc(rcvif, ifs6_in_mldreport); if ((n = m_copypacket(m, M_DONTWAIT)) == NULL) { /* give up local */ mld_input(m, off); m = NULL; goto freeit; } mld_input(n, off); /* m stays. */ break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(rcvif, ifs6_in_mlddone); if (icmp6len < sizeof(struct mld_hdr)) /* necessary? */ goto badlen; break; /* nothing to be done in kernel */ case MLD_MTRACE_RESP: case MLD_MTRACE: /* XXX: these two are experimental. not officially defined. 
*/ /* XXX: per-interface statistics? */ break; /* just pass it to applications */ case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; if (!icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; if (mode == FQDN) { n = m_copypacket(m, M_DONTWAIT); if (n) n = ni6_input(n, off); } else { u_char *p; int maxhlen; if ((icmp6_nodeinfo & 5) != 5) break; if (code != 0) goto badcode; MGETHDR(n, M_DONTWAIT, m->m_type); if (n && ICMP6_MAXLEN > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n == NULL) { /* Give up remote */ break; } m_reset_rcvif(n); n->m_len = 0; maxhlen = M_TRAILINGSPACE(n) - ICMP6_MAXLEN; if (maxhlen < 0) { m_free(n); break; } if (maxhlen > hostnamelen) maxhlen = hostnamelen; /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); memcpy(nip6, ip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); memcpy(nicmp6, icmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); memset(p, 0, 4); memcpy(p + 4, hostname, maxhlen); /* meaningless TTL */ m_copy_pkthdr(n, m); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } if (n) { uint64_t *icmp6s = ICMP6_STAT_GETREF(); icmp6s[ICMP6_STAT_REFLECT]++; icmp6s[ICMP6_STAT_OUTHIST + ICMP6_WRUREPLY]++; ICMP6_STAT_PUTREF(); icmp6_reflect(n, sizeof(struct ip6_hdr)); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(rcvif, ifs6_in_routersolicit); /* FALLTHROUGH */ case ND_ROUTER_ADVERT: if (icmp6->icmp6_type == ND_ROUTER_ADVERT) icmp6_ifstat_inc(rcvif, ifs6_in_routeradvert); if (code != 0) goto badcode; if ((icmp6->icmp6_type == ND_ROUTER_SOLICIT && icmp6len < sizeof(struct nd_router_solicit)) || (icmp6->icmp6_type == ND_ROUTER_ADVERT && icmp6len < sizeof(struct nd_router_advert))) goto badlen; if ((n = m_copypacket(m, M_DONTWAIT)) == NULL) { /* give up local */ nd6_rtr_cache(m, off, icmp6len, icmp6->icmp6_type); m = NULL; goto freeit; } nd6_rtr_cache(n, off, icmp6len, icmp6->icmp6_type); /* m stays. */ break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(rcvif, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if ((n = m_copypacket(m, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ns_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ns_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(rcvif, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if ((n = m_copypacket(m, M_DONTWAIT)) == NULL) { /* give up local */ nd6_na_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_na_input(n, off, icmp6len); /* m stays. */ break; case ND_REDIRECT: icmp6_ifstat_inc(rcvif, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if ((n = m_copypacket(m, M_DONTWAIT)) == NULL) { /* give up local */ icmp6_redirect_input(m, off); m = NULL; goto freeit; } icmp6_redirect_input(n, off); /* m stays. 
*/ break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log(LOG_DEBUG, "unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, IN6_PRINT(ip6buf, &ip6->ip6_src), IN6_PRINT(ip6buf2, &ip6->ip6_dst), rcvif ? rcvif->if_index : 0); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(m, off, icmp6len, code)) { /* In this case, m should've been freed. */ m_put_rcvif_psref(rcvif, &psref); return; } break; badcode: ICMP6_STATINC(ICMP6_STAT_BADCODE); break; badlen: ICMP6_STATINC(ICMP6_STAT_BADLEN); break; } m_put_rcvif_psref(rcvif, &psref); /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, off); return; freeit: m_put_rcvif_psref(rcvif, &psref); m_freem(m); return; } int icmp6_input(struct mbuf **mp, int *offp, int proto) { wqinput_input(icmp6_wqinput, *mp, *offp, proto); return IPPROTO_DONE; } static int icmp6_notify_error(struct mbuf *m, int off, int icmp6len, int code) { struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); goto freeit; } IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); return (-1); } eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void *(*ctlfunc)(int, const struct sockaddr *, void *); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ifnet *rcvif; int s; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); return (-1); } if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* Ignore the option. */ IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); return (-1); } eoff += (rth->ip6r_len + 1) << 3; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); return (-1); } /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. */ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. 
*/ goto notify; } } notify: IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); return (-1); } /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope zone as * the addresses of the icmp packet. But there is no other * way to determine the zone. */ eip6 = (struct ip6_hdr *)(icmp6 + 1); rcvif = m_get_rcvif(m, &s); if (__predict_false(rcvif == NULL)) goto freeit; sockaddr_in6_init(&icmp6dst, (finaldst == NULL) ? &eip6->ip6_dst : finaldst, 0, 0, 0); if (in6_setscope(&icmp6dst.sin6_addr, rcvif, NULL)) { m_put_rcvif(rcvif, &s); goto freeit; } sockaddr_in6_init(&icmp6src, &eip6->ip6_src, 0, 0, 0); if (in6_setscope(&icmp6src.sin6_addr, rcvif, NULL)) { m_put_rcvif(rcvif, &s); goto freeit; } m_put_rcvif(rcvif, &s); icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)&notifymtu; } ctlfunc = inet6sw[ip6_protox[nxt]].pr_ctlinput; if (ctlfunc) { (void)(*ctlfunc)(code, sin6tosa(&icmp6dst), &ip6cp); } } return (0); freeit: m_freem(m); return (-1); } void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { unsigned long rtcount; struct icmp6_mtudisc_callback *mc; struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct rtentry *rt = NULL; struct sockaddr_in6 sin6; struct ifnet *rcvif; int s; /* * The MTU should not be less than the minimal IPv6 MTU except for the * hack in ip6_output/ip6_setpmtu where we always include a frag header. * In that one case, the MTU might be less than 1280. */ if (__predict_false(mtu < IPV6_MMTU - sizeof(struct ip6_frag))) { /* is the mtu even sane? */ if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) return; if (!validated) return; mtu = IPV6_MMTU - sizeof(struct ip6_frag); } /* * allow non-validated cases if memory is plenty, to make traffic * from non-connected pcb happy. */ mutex_enter(&icmp6_mtx); rtcount = rt_timer_count(icmp6_mtudisc_timeout_q); if (validated) { if (0 <= icmp6_mtudisc_hiwat && rtcount > icmp6_mtudisc_hiwat) { mutex_exit(&icmp6_mtx); return; } else if (0 <= icmp6_mtudisc_lowat && rtcount > icmp6_mtudisc_lowat) { /* * XXX nuke a victim, install the new one. 
*/ } } else { if (0 <= icmp6_mtudisc_lowat && rtcount > icmp6_mtudisc_lowat) { mutex_exit(&icmp6_mtx); return; } } mutex_exit(&icmp6_mtx); memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = PF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = *dst; rcvif = m_get_rcvif(m, &s); if (__predict_false(rcvif == NULL)) return; if (in6_setscope(&sin6.sin6_addr, rcvif, NULL)) { m_put_rcvif(rcvif, &s); return; } m_put_rcvif(rcvif, &s); rt = icmp6_mtudisc_clone(sin6tosa(&sin6)); if (rt && (rt->rt_flags & RTF_HOST) && !(rt->rt_rmx.rmx_locks & RTV_MTU) && (rt->rt_rmx.rmx_mtu > mtu || rt->rt_rmx.rmx_mtu == 0)) { if (mtu < rt->rt_ifp->if_mtu) { ICMP6_STATINC(ICMP6_STAT_PMTUCHG); rt->rt_rmx.rmx_mtu = mtu; } } if (rt) { rt_unref(rt); } /* * Notify protocols that the MTU for this destination * has changed. */ mutex_enter(&icmp6_mtx); for (mc = LIST_FIRST(&icmp6_mtudisc_callbacks); mc != NULL; mc = LIST_NEXT(mc, mc_list)) (*mc->mc_func)(&sin6.sin6_addr); mutex_exit(&icmp6_mtx); } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ static struct mbuf * ni6_input(struct mbuf *m, int off) { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct sockaddr_in6 sin6; /* ip6_dst */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct ifnet *rcvif; int s, ss; struct ifaddr *ifa; struct psref psref; ip6 = mtod(m, struct ip6_hdr *); IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ return NULL; } KASSERT((m->m_flags & M_PKTHDR) != 0); /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [icmp-name-lookups-07, Section 4.] */ sockaddr_in6_init(&sin6, &ip6->ip6_dst, 0, 0, 0); /* XXX scopeid */ ss = pserialize_read_enter(); ifa = ifa_ifwithaddr(sin6tosa(&sin6)); if (ifa != NULL) { ; /* unicast/anycast, fine */ } else if (IN6_IS_ADDR_MC_LINKLOCAL(&sin6.sin6_addr)) { ; /* link-local multicast, fine */ } else { pserialize_read_exit(ss); goto bad; } pserialize_read_exit(ss); /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. 
*/ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(sin6.sin6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? * * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ /* m_pulldown instead of copy? */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (void *)&in6_subj); rcvif = m_get_rcvif(m, &s); if (__predict_false(rcvif == NULL)) goto bad; if (in6_setscope(&in6_subj, rcvif, NULL)) { m_put_rcvif(rcvif, &s); goto bad; } m_put_rcvif(rcvif, &s); subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ n = ni6_nametodns(hostname, hostnamelen, 0); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, off + sizeof(struct icmp6_nodeinfo), subjlen); if (subj == NULL) goto bad; if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((icmp6_nodeinfo & 1) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((icmp6_nodeinfo & 2) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, &ifp, subj, &psref); replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t)); if (replylen > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ goto bad; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* allocate an mbuf to reply. 
*/ MGETHDR(n, M_DONTWAIT, m->m_type); if (n == NULL) { goto bad; } m_move_pkthdr(n, m); if (replylen > MHLEN) { if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... */ goto bad; } MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { goto bad; } } n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, void *), mtod(n, void *), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((void *)ni6, (void *)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); memcpy(nni6 + 1, &v, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, char *) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in variable "hostname"? */ n->m_next = ni6_nametodns(hostname, hostnamelen, oldfqdn); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); if_put(ifp, &psref); ifp = NULL; /* update mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: panic("%s: impossible", __func__); break; } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return n; bad: if_put(ifp, &psref); m_freem(m); if (n) m_freem(n); return NULL; } #define isupper(x) ('A' <= (x) && (x) <= 'Z') #define isalpha(x) (('A' <= (x) && (x) <= 'Z') || ('a' <= (x) && (x) <= 'z')) #define isalnum(x) (isalpha(x) || ('0' <= (x) && (x) <= '9')) #define tolower(x) (isupper(x) ? (x) + 'a' - 'A' : (x)) /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. * * old - return pascal string if non-zero */ static struct mbuf * ni6_nametodns(const char *name, int namelen, int old) { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* because MAXHOSTNAMELEN is usually 256, we use cluster mbuf */ MGET(m, M_DONTWAIT, MT_DATA); if (m && len > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) goto fail; } if (!m) goto fail; m->m_next = NULL; if (old) { m->m_len = len; *mtod(m, char *) = namelen; memcpy(mtod(m, char *) + 1, name, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". 
*/ i = 0; for (p = name; p < name + namelen; p++) { if (*p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". */ if (i <= 0 || i >= 64) goto fail; *cp++ = i; if (!isalpha(p[0]) || !isalnum(p[i - 1])) goto fail; while (i > 0) { if (!isalnum(*p) && *p != '-') goto fail; if (isupper(*p)) { *cp++ = tolower(*p); p++; } else *cp++ = *p++; i--; } p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(const char *a, int alen, const char *b, int blen) { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && memcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (memcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(struct icmp6_nodeinfo *ni6, struct ifnet **ifpp, char *subj, struct psref *psref) { struct ifnet *ifp; struct in6_ifaddr *ia6; struct ifaddr *ifa; struct sockaddr_in6 *subj_ip6 = NULL; /* XXX pedant */ int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; int s; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return 0; subj_ip6 = (struct sockaddr_in6 *)subj; break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. */ return 0; } } s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { addrsofif = 0; IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(&subj_ip6->sin6_addr, &ia6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? 
*/ switch (in6_addrscope(&ia6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ia6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ addrsofif++; /* count the address */ } if (iffound) { if_acquire(ifp, psref); pserialize_read_exit(s); *ifpp = ifp; return addrsofif; } addrs += addrsofif; } pserialize_read_exit(s); return addrs; } static int ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { struct ifnet *ifp; struct in6_ifaddr *ia6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; int s; if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return 0; /* needless to copy */ s = pserialize_read_enter(); ifp = ifp0 ? ifp0 : IFNET_READER_FIRST(); again: for (; ifp; ifp = IFNET_READER_NEXT(ifp)) { IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if ((ia6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * prefererred address should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ia6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ia6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ia6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; goto out; } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. * * TTL must be 2^31 > TTL >= 0. 
*/ if (ia6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ia6->ia6_lifetime.ia6t_expire > time_uptime) ltime = ia6->ia6_lifetime.ia6t_expire - time_uptime; else ltime = 0; } if (ltime > 0x7fffffff) ltime = 0x7fffffff; ltime = htonl(ltime); memcpy(cp, &ltime, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ia6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); in6_clearscope((struct in6_addr *)cp); /* XXX */ cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } out: pserialize_read_exit(s); return copied; } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *inp; struct inpcb *last = NULL; struct sockaddr_in6 rip6src; struct icmp6_hdr *icmp6; struct mbuf *n, *opts = NULL; IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { /* m is already reclaimed */ return IPPROTO_DONE; } /* * XXX: the address may have embedded scope zone ID, which should be * hidden from applications. */ sockaddr_in6_init(&rip6src, &ip6->ip6_src, 0, 0, 0); if (sa6_recoverscope(&rip6src)) { m_freem(m); return IPPROTO_DONE; } TAILQ_FOREACH(inp, &raw6cbtable.inpt_queue, inp_queue) { if (inp->inp_af != AF_INET6) continue; if (in6p_ip6(inp).ip6_nxt != IPPROTO_ICMPV6) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) && !IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)) && !IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &ip6->ip6_src)) continue; if (in6p_icmp6filt(inp) && ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p_icmp6filt(inp))) continue; if (last == NULL) { ; } #ifdef IPSEC else if (ipsec_used && ipsec_in_reject(m, last)) { /* do not inject data into pcb */ } #endif else if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) { if (last->inp_flags & IN6P_CONTROLOPTS || SOOPT_TIMESTAMP(last->inp_socket->so_options)) ip6_savecontrol(last, &opts, ip6, n); /* strip intermediate headers */ m_adj(n, off); if (sbappendaddr(&last->inp_socket->so_rcv, sin6tosa(&rip6src), n, opts) == 0) { soroverflow(last->inp_socket); m_freem(n); if (opts) m_freem(opts); } else { sorwakeup(last->inp_socket); } opts = NULL; } last = inp; } #ifdef IPSEC if (ipsec_used && last && ipsec_in_reject(m, last)) { m_freem(m); IP6_STATDEC(IP6_STAT_DELIVERED); /* do not inject data into pcb */ } else #endif if (last) { if (last->inp_flags & IN6P_CONTROLOPTS || SOOPT_TIMESTAMP(last->inp_socket->so_options)) ip6_savecontrol(last, &opts, ip6, m); /* strip intermediate headers */ m_adj(m, off); if (sbappendaddr(&last->inp_socket->so_rcv, sin6tosa(&rip6src), m, opts) == 0) { soroverflow(last->inp_socket); m_freem(m); if (opts) m_freem(opts); } else { sorwakeup(last->inp_socket); } } else { m_freem(m); IP6_STATDEC(IP6_STAT_DELIVERED); } return IPPROTO_DONE; } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. * * Note: RFC 1885 required that an echo reply should be truncated if it * did not fit in with (return) path MTU, and KAME code supported the * behavior. 
However, as a clarification after the RFC, this limitation * was removed in a revised version of the spec, RFC 2463. We had kept the * old behavior, with a (non-default) ifdef block, while the new version of * the spec was an internet-draft status, and even after the new RFC was * published. But it would rather make sense to clean the obsoleted part * up, and to make the code simpler at this stage. */ static void icmp6_reflect(struct mbuf *m, size_t off) { struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; const struct in6_ifaddr *ia; const struct ip6aux *ip6a; int plen; int type, code; struct ifnet *outif = NULL; struct in6_addr origdst; struct ifnet *rcvif; int s; bool ip6_src_filled = false; int flags; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log(LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ CTASSERT(sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) <= MHLEN); if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (void *)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } memcpy(mtod(m, void *), (void *)&nip6, sizeof(nip6)); } else { size_t l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ origdst = ip6->ip6_dst; /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. * The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. * Note that ip6_getdstifaddr() may fail if we are in an error handling * procedure of an outgoing packet of our own, in which case we need * to search in the ifaddr list. */ if (IN6_IS_ADDR_MULTICAST(&origdst)) { ; } else if ((ip6a = ip6_getdstifaddr(m)) != NULL) { if ((ip6a->ip6a_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)) == 0) { ip6->ip6_src = ip6a->ip6a_src; ip6_src_filled = true; } } else { union { struct sockaddr_in6 sin6; struct sockaddr sa; } u; int _s; struct ifaddr *ifa; sockaddr_in6_init(&u.sin6, &origdst, 0, 0, 0); _s = pserialize_read_enter(); ifa = ifa_ifwithaddr(&u.sa); if (ifa != NULL) { ia = ifatoia6(ifa); if ((ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)) == 0) { ip6->ip6_src = ia->ia_addr.sin6_addr; ip6_src_filled = true; } } pserialize_read_exit(_s); } if (!ip6_src_filled) { int e; struct sockaddr_in6 sin6; struct route ro; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. 
*/ /* zone ID should be embedded */ sockaddr_in6_init(&sin6, &ip6->ip6_dst, 0, 0, 0); memset(&ro, 0, sizeof(ro)); e = in6_selectsrc(&sin6, NULL, NULL, &ro, NULL, NULL, NULL, &ip6->ip6_src); rtcache_free(&ro); if (e != 0) { char ip6buf[INET6_ADDRSTRLEN]; nd6log(LOG_DEBUG, "source can't be determined: " "dst=%s, error=%d\n", IN6_PRINT(ip6buf, &sin6.sin6_addr), e); goto bad; } } ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; rcvif = m_get_rcvif(m, &s); if (rcvif) { /* XXX: This may not be the outgoing interface */ ip6->ip6_hlim = ND_IFINFO(rcvif)->chlim; } else { ip6->ip6_hlim = ip6_defhlim; } m_put_rcvif(rcvif, &s); m->m_pkthdr.csum_flags = 0; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); /* * Note for icmp6_reflect_pmtu == false * To avoid a "too big" situation at an intermediate router * and the path MTU discovery process, specify the IPV6_MINMTU flag. * Note that only echo and node information replies are affected, * since the length of ICMP6 errors is limited to the minimum MTU. */ flags = icmp6_reflect_pmtu ? 0 : IPV6_MINMTU; if (ip6_output(m, NULL, NULL, flags, NULL, NULL, &outif) != 0 && outif) icmp6_ifstat_inc(outif, ifs6_out_error); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } static const char * icmp6_redirect_diag(char *buf, size_t buflen, struct in6_addr *src6, struct in6_addr *dst6, struct in6_addr *tgt6) { char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; char ip6buft[INET6_ADDRSTRLEN]; snprintf(buf, buflen, "(src=%s dst=%s tgt=%s)", IN6_PRINT(ip6bufs, src6), IN6_PRINT(ip6bufd, dst6), IN6_PRINT(ip6buft, tgt6)); return buf; } static void icmp6_redirect_input(struct mbuf *m, int off) { struct ifnet *ifp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; int icmp6len = m->m_pkthdr.len - off; char *lladdr = NULL; int lladdrlen = 0; struct rtentry *rt = NULL; int is_router; int is_onlink; struct in6_addr src6 = ip6->ip6_src; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; struct psref psref; char ip6buf[INET6_ADDRSTRLEN]; char diagbuf[256]; ifp = m_get_rcvif_psref(m, &psref); if (ifp == NULL) goto freeit; /* XXX if we are router, we don't update route by icmp6 redirect */ if (ip6_forwarding) goto freeit; if (!icmp6_rediraccept) goto freeit; IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { ICMP6_STATINC(ICMP6_STAT_TOOSHORT); m_put_rcvif_psref(ifp, &psref); return; } redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, ifp, NULL) || in6_setscope(&reddst6, ifp, NULL)) { goto freeit; } /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log(LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", IN6_PRINT(ip6buf, &src6)); goto bad; } if (ip6->ip6_hlim != 255) { nd6log(LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", IN6_PRINT(ip6buf, &src6), ip6->ip6_hlim); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct sockaddr_in6 sin6; struct in6_addr *gw6; sockaddr_in6_init(&sin6, &reddst6, 0, 0, 0); rt = rtalloc1(sin6tosa(&sin6), 0); if (rt) { if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) { nd6log(LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(diagbuf, 
sizeof(diagbuf), &src6, &reddst6, &redtgt6)); rt_unref(rt); goto bad; } gw6 = &(((struct sockaddr_in6 *)rt->rt_gateway)->sin6_addr); if (memcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) { nd6log(LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): %s\n", IN6_PRINT(ip6buf, gw6), icmp6_redirect_diag(diagbuf, sizeof(diagbuf), &src6, &reddst6, &redtgt6)); rt_unref(rt); goto bad; } } else { nd6log(LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(diagbuf, sizeof(diagbuf), &src6, &reddst6, &redtgt6)); goto bad; } rt_unref(rt); rt = NULL; } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log(LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(diagbuf, sizeof(diagbuf), &src6, &reddst6, &redtgt6)); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (memcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log(LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(diagbuf, sizeof(diagbuf), &src6, &reddst6, &redtgt6)); goto bad; } /* validation passed */ icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log(LOG_INFO, "invalid ND option, rejected: %s\n", icmp6_redirect_diag(diagbuf, sizeof(diagbuf), &src6, &reddst6, &redtgt6)); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log(LOG_INFO, "lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", IN6_PRINT(ip6buf, &redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(diagbuf, sizeof(diagbuf), &src6, &reddst6, &redtgt6)); goto bad; } /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); m_put_rcvif_psref(ifp, &psref); ifp = NULL; if (!is_onlink) { /* better router case. perform rtredirect. */ /* perform rtredirect */ struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; unsigned long rtcount; struct rtentry *newrt = NULL; /* * do not install redirect route, if the number of entries * is too much (> hiwat). note that, the node (= host) will * work just fine even if we do not install redirect route * (there will be additional hops, though). */ mutex_enter(&icmp6_mtx); rtcount = rt_timer_count(icmp6_redirect_timeout_q); if (0 <= ip6_maxdynroutes && rtcount >= ip6_maxdynroutes) { mutex_exit(&icmp6_mtx); goto freeit; } if (0 <= icmp6_redirect_hiwat && rtcount > icmp6_redirect_hiwat) { mutex_exit(&icmp6_mtx); goto freeit; } else if (0 <= icmp6_redirect_lowat && rtcount > icmp6_redirect_lowat) { /* * XXX nuke a victim, install the new one. 
*/ } memset(&sdst, 0, sizeof(sdst)); memset(&sgw, 0, sizeof(sgw)); memset(&ssrc, 0, sizeof(ssrc)); sdst.sin6_family = sgw.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = sgw.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rtredirect(sin6tosa(&sdst), sin6tosa(&sgw), NULL, RTF_GATEWAY | RTF_HOST, sin6tosa(&ssrc), &newrt); if (newrt) { (void)rt_timer_add(newrt, icmp6_redirect_timeout, icmp6_redirect_timeout_q); rt_unref(newrt); } mutex_exit(&icmp6_mtx); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; sockaddr_in6_init(&sdst, &reddst6, 0, 0, 0); pfctlinput(PRC_REDIRECT_HOST, sin6tosa(&sdst)); #if defined(IPSEC) if (ipsec_used) key_sa_routechange(sin6tosa(&sdst)); #endif } freeit: if (ifp != NULL) m_put_rcvif_psref(ifp, &psref); m_freem(m); return; bad: m_put_rcvif_psref(ifp, &psref); ICMP6_STATINC(ICMP6_STAT_BADREDIRECT); m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *nexthop; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; size_t maxlen; u_char *p; struct sockaddr_in6 src_sa; icmp6_errcount(ICMP6_STAT_OUTERRHIST, ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!ip6_forwarding) goto fail; /* sanity check */ KASSERT(m0 != NULL); KASSERT(rt != NULL); ifp = rt->rt_ifp; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); sockaddr_in6_init(&src_sa, &sip6->ip6_src, 0, 0, 0); if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m && IPV6_MMTU >= MHLEN) { #if IPV6_MMTU >= MCLBYTES MEXTMALLOC(m, IPV6_MMTU, M_NOWAIT); #else MCLGET(m, M_DONTWAIT); #endif } if (!m) goto fail; m_reset_rcvif(m); m->m_len = 0; maxlen = M_TRAILINGSPACE(m); maxlen = uimin(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct nd_redirect) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; int s = pserialize_read_enter(); if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) { pserialize_read_exit(s); goto fail; } ifp_ll6 = &ia->ia_addr.sin6_addr; pserialize_read_exit(s); } /* get ip6 linklocal address for the router. 
*/ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)rt->rt_gateway; nexthop = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(nexthop)) nexthop = NULL; } else nexthop = NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (rt->rt_flags & RTF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!nexthop) goto fail; bcopy(nexthop, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ nexthop = &sip6->ip6_dst; bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); { /* target lladdr option */ struct llentry *ln = NULL; int len, pad; struct nd_opt_hdr *nd_opt; char *lladdr; ln = nd6_lookup(nexthop, ifp, false); if (ln == NULL) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ pad = len - (sizeof(*nd_opt) + ifp->if_addrlen); /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) { LLE_RUNLOCK(ln); goto nolladdropt; } if (ln->la_flags & LLE_VALID) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); memcpy(lladdr, &ln->ll_addr, ifp->if_addrlen); memset(lladdr + ifp->if_addrlen, 0, pad); p += len; } LLE_RUNLOCK(ln); } nolladdropt: m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; if (len < sizeof(*nd_opt_rh)) { goto noredhdropt; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. */ if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m_adj(m0, (len - sizeof(*nd_opt_rh)) - m0->m_pkthdr.len); } else { /* * enough room, truncate if not aligned. * we don't pad here for simplicity. 
*/ int extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* truncate */ m_adj(m0, -extra); } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; memset(nd_opt_rh, 0, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m->m_pkthdr.len += m0->m_pkthdr.len; m_cat(m, m0); m0 = NULL; } noredhdropt: if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); /* send the packet to outside... */ if (ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL) != 0) icmp6_ifstat_inc(ifp, ifs6_out_error); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_redirect); ICMP6_STATINC(ICMP6_STAT_OUTHIST + ND_REDIRECT); return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. */ int icmp6_ctloutput(int op, struct socket *so, struct sockopt *sopt) { int error = 0; struct inpcb *inp = sotoinpcb(so); if (sopt->sopt_level != IPPROTO_ICMPV6) return rip6_ctloutput(op, so, sopt); switch (op) { case PRCO_SETOPT: switch (sopt->sopt_name) { case ICMP6_FILTER: { struct icmp6_filter fil; error = sockopt_get(sopt, &fil, sizeof(fil)); if (error) break; memcpy(in6p_icmp6filt(inp), &fil, sizeof(struct icmp6_filter)); error = 0; break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (sopt->sopt_name) { case ICMP6_FILTER: { if (in6p_icmp6filt(inp) == NULL) { error = EINVAL; break; } error = sockopt_set(sopt, in6p_icmp6filt(inp), sizeof(struct icmp6_filter)); break; } default: error = ENOPROTOOPT; break; } break; } return error; } /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? 
*/ static int icmp6_ratelimit( const struct in6_addr *dst, /* not used at this moment */ const int type, /* not used at this moment */ const int code) /* not used at this moment */ { int ret; ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&icmp6errppslim_last, &icmp6errpps_count, icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } static struct rtentry * icmp6_mtudisc_clone(struct sockaddr *dst) { struct rtentry *rt; int error; rt = rtalloc1(dst, 1); if (rt == NULL) return NULL; /* If we didn't get a host route, allocate one */ if ((rt->rt_flags & RTF_HOST) == 0) { struct rtentry *nrt; error = rtrequest(RTM_ADD, dst, rt->rt_gateway, NULL, RTF_GATEWAY | RTF_HOST | RTF_DYNAMIC, &nrt); if (error) { rt_unref(rt); return NULL; } nrt->rt_rmx = rt->rt_rmx; rt_newmsg_dynamic(RTM_ADD, nrt); rt_unref(rt); rt = nrt; } mutex_enter(&icmp6_mtx); error = rt_timer_add(rt, icmp6_mtudisc_timeout, icmp6_mtudisc_timeout_q); mutex_exit(&icmp6_mtx); if (error) { rt_unref(rt); return NULL; } return rt; /* caller need to call rtfree() */ } static void icmp6_mtudisc_timeout(struct rtentry *rt, struct rttimer *r) { struct rtentry *retrt; KASSERT(rt != NULL); rt_assert_referenced(rt); if ((rt->rt_flags & (RTF_DYNAMIC | RTF_HOST)) == (RTF_DYNAMIC | RTF_HOST)) { rtrequest(RTM_DELETE, rt_getkey(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, &retrt); rt_newmsg_dynamic(RTM_DELETE, retrt); rt_unref(rt); rt_free(retrt); } else { if (!(rt->rt_rmx.rmx_locks & RTV_MTU)) rt->rt_rmx.rmx_mtu = 0; } } static void icmp6_redirect_timeout(struct rtentry *rt, struct rttimer *r) { struct rtentry *retrt; KASSERT(rt != NULL); rt_assert_referenced(rt); if ((rt->rt_flags & (RTF_GATEWAY | RTF_DYNAMIC | RTF_HOST)) == (RTF_GATEWAY | RTF_DYNAMIC | RTF_HOST)) { rtrequest(RTM_DELETE, rt_getkey(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, &retrt); rt_newmsg_dynamic(RTM_DELETE, retrt); rt_unref(rt); rt_free(retrt); } } static int sysctl_net_inet6_icmp6_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(icmp6stat_percpu, ICMP6_NSTATS)); } static int sysctl_net_inet6_icmp6_redirtimeout(SYSCTLFN_ARGS) { int error, tmp; struct sysctlnode node; mutex_enter(&icmp6_mtx); node = *rnode; node.sysctl_data = &tmp; tmp = icmp6_redirtimeout; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) goto out; if (tmp < 0) { error = EINVAL; goto out; } icmp6_redirtimeout = tmp; if (icmp6_redirect_timeout_q != NULL) { if (icmp6_redirtimeout == 0) { rt_timer_queue_destroy(icmp6_redirect_timeout_q); } else { rt_timer_queue_change(icmp6_redirect_timeout_q, icmp6_redirtimeout); } } else if (icmp6_redirtimeout > 0) { icmp6_redirect_timeout_q = rt_timer_queue_create(icmp6_redirtimeout); } error = 0; out: mutex_exit(&icmp6_mtx); return error; } static void sysctl_net_inet6_icmp6_setup(struct sysctllog **clog) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet6", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET6, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "icmp6", SYSCTL_DESCR("ICMPv6 related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("ICMPv6 transmission statistics"), sysctl_net_inet6_icmp6_stats, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_STATS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "rediraccept", SYSCTL_DESCR("Accept and process redirect messages"), NULL, 0, 
&icmp6_rediraccept, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_REDIRACCEPT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "redirtimeout", SYSCTL_DESCR("Redirect generated route lifetime"), sysctl_net_inet6_icmp6_redirtimeout, 0, &icmp6_redirtimeout, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_REDIRTIMEOUT, CTL_EOL); #if 0 /* obsoleted */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "errratelimit", NULL, NULL, 0, &icmp6_errratelimit, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_ERRRATELIMIT, CTL_EOL); #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd6_prune", SYSCTL_DESCR("Neighbor discovery prune interval"), NULL, 0, &nd6_prune, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_ND6_PRUNE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd6_delay", SYSCTL_DESCR("First probe delay time"), NULL, 0, &nd6_nd_domain.nd_delay, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_ND6_DELAY, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd6_mmaxtries", SYSCTL_DESCR("Number of multicast discovery attempts"), NULL, 0, &nd6_nd_domain.nd_mmaxtries, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_ND6_MMAXTRIES, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd6_umaxtries", SYSCTL_DESCR("Number of unicast discovery attempts"), NULL, 0, &nd6_nd_domain.nd_umaxtries, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_ND6_UMAXTRIES, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd6_maxnudhint", SYSCTL_DESCR("Maximum neighbor unreachable hint count"), NULL, 0, &nd6_nd_domain.nd_maxnudhint, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_ND6_MAXNUDHINT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxqueuelen", SYSCTL_DESCR("max packet queue len for a unresolved ND"), NULL, 1, &nd6_nd_domain.nd_maxqueuelen, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_ND6_MAXQLEN, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd6_useloopback", SYSCTL_DESCR("Use loopback interface for local traffic"), NULL, 0, &nd6_useloopback, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_ND6_USELOOPBACK, CTL_EOL); #if 0 /* obsoleted */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd6_proxyall", NULL, NULL, 0, &nd6_proxyall, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_ND6_PROXYALL, CTL_EOL); #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nodeinfo", SYSCTL_DESCR("Respond to node information requests"), NULL, 0, &icmp6_nodeinfo, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_NODEINFO, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "errppslimit", SYSCTL_DESCR("Maximum ICMP errors sent per second"), NULL, 0, &icmp6errppslim, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_ERRPPSLIMIT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "mtudisc_hiwat", SYSCTL_DESCR("Low mark on MTU Discovery route timers"), NULL, 0, &icmp6_mtudisc_hiwat, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_MTUDISC_HIWAT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "mtudisc_lowat", 
SYSCTL_DESCR("Low mark on MTU Discovery route timers"), NULL, 0, &icmp6_mtudisc_lowat, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_MTUDISC_LOWAT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd6_debug", SYSCTL_DESCR("Enable neighbor discovery debug output"), NULL, 0, &nd6_debug, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_ND6_DEBUG, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "reflect_pmtu", SYSCTL_DESCR("Use path MTU Discovery for icmpv6 reflect"), NULL, 0, &icmp6_reflect_pmtu, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_REFLECT_PMTU, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "dynamic_rt_msg", SYSCTL_DESCR("Send routing message for RTF_DYNAMIC"), NULL, 0, &icmp6_dynamic_rt_msg, 0, CTL_NET, PF_INET6, IPPROTO_ICMPV6, ICMPV6CTL_DYNAMIC_RT_MSG, CTL_EOL); } void icmp6_statinc(u_int stat) { KASSERT(stat < ICMP6_NSTATS); ICMP6_STATINC(stat); }
/* $NetBSD: ugen.c,v 1.177 2024/03/29 19:30:09 thorpej Exp $ */ /* * Copyright (c) 1998, 2004 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology. * * Copyright (c) 2006 BBN Technologies Corp. All rights reserved. * Effort sponsored in part by the Defense Advanced Research Projects * Agency (DARPA) and the Department of the Interior National Business * Center under agreement number NBCHC050166. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ugen.c,v 1.177 2024/03/29 19:30:09 thorpej Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_usb.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/device.h> #include <sys/ioctl.h> #include <sys/conf.h> #include <sys/tty.h> #include <sys/file.h> #include <sys/select.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/poll.h> #include <sys/compat_stub.h> #include <sys/module.h> #include <sys/rbtree.h> #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> #include <dev/usb/usbdi_util.h> #include <dev/usb/usbhist.h> #include "ioconf.h" #ifdef USB_DEBUG #ifndef UGEN_DEBUG #define ugendebug 0 #else #ifndef UGEN_DEBUG_DEFAULT #define UGEN_DEBUG_DEFAULT 0 #endif int ugendebug = UGEN_DEBUG_DEFAULT; SYSCTL_SETUP(sysctl_hw_ugen_setup, "sysctl hw.ugen setup") { int err; const struct sysctlnode *rnode; const struct sysctlnode *cnode; err = sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ugen", SYSCTL_DESCR("ugen global controls"), NULL, 0, NULL, 0, CTL_HW, CTL_CREATE, CTL_EOL); if (err) goto fail; /* control debugging printfs */ err = sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "debug", SYSCTL_DESCR("Enable debugging output"), NULL, 0, &ugendebug, sizeof(ugendebug), CTL_CREATE, CTL_EOL); if (err) goto fail; return; fail: aprint_error("%s: sysctl_createv failed (err = %d)\n", __func__, err); } #endif /* UGEN_DEBUG */ #endif /* USB_DEBUG */ #define DPRINTF(FMT,A,B,C,D) USBHIST_LOGN(ugendebug,1,FMT,A,B,C,D) #define DPRINTFN(N,FMT,A,B,C,D) USBHIST_LOGN(ugendebug,N,FMT,A,B,C,D) #define UGENHIST_FUNC() USBHIST_FUNC() #define UGENHIST_CALLED(name) USBHIST_CALLED(ugendebug) #define UGENHIST_CALLARGS(FMT,A,B,C,D) \ USBHIST_CALLARGS(ugendebug,FMT,A,B,C,D) #define UGENHIST_CALLARGSN(N,FMT,A,B,C,D) \ USBHIST_CALLARGSN(ugendebug,N,FMT,A,B,C,D) #define UGEN_CHUNK 128 /* chunk size for read */ #define UGEN_IBSIZE 1020 /* buffer size */ #define UGEN_BBSIZE 1024 #define UGEN_NISOREQS 4 /* number of outstanding xfer requests */ #define UGEN_NISORFRMS 8 /* number of transactions per req */ #define UGEN_NISOFRAMES (UGEN_NISORFRMS * UGEN_NISOREQS) #define UGEN_BULK_RA_WB_BUFSIZE 16384 /* default buffer size */ #define UGEN_BULK_RA_WB_BUFMAX (1 << 20) /* maximum allowed buffer */ struct isoreq { struct ugen_endpoint *sce; struct usbd_xfer *xfer; void *dmabuf; uint16_t sizes[UGEN_NISORFRMS]; }; struct ugen_endpoint { struct ugen_softc *sc; usb_endpoint_descriptor_t *edesc; struct usbd_interface *iface; int state; #define UGEN_SHORT_OK 0x04 /* short xfers are OK */ #define UGEN_BULK_RA 0x08 /* in bulk read-ahead mode */ #define UGEN_BULK_WB 0x10 /* in bulk write-behind mode */ #define UGEN_RA_WB_STOP 0x20 /* RA/WB xfer is stopped (buffer full/empty) */ struct usbd_pipe *pipeh; struct clist q; u_char *ibuf; /* start of buffer (circular for isoc) */ u_char *fill; /* location for input (isoc) */ u_char *limit; /* end of circular buffer (isoc) */ u_char *cur; /* current read location (isoc) */ uint32_t timeout; uint32_t ra_wb_bufsize; /* requested size for RA/WB buffer */ uint32_t ra_wb_reqsize; /* requested xfer length for RA/WB */ uint32_t ra_wb_used; /* how much is in buffer */ uint32_t ra_wb_xferlen; /* current xfer length for RA/WB */ struct usbd_xfer *ra_wb_xfer; struct isoreq isoreqs[UGEN_NISOREQS]; /* Keep these last; we don't overwrite them in ugen_set_config() */ #define UGEN_ENDPOINT_NONZERO_CRUFT 
offsetof(struct ugen_endpoint, rsel) struct selinfo rsel; kcondvar_t cv; }; struct ugen_softc { device_t sc_dev; /* base device */ struct usbd_device *sc_udev; struct rb_node sc_node; unsigned sc_unit; kmutex_t sc_lock; kcondvar_t sc_detach_cv; char sc_is_open[USB_MAX_ENDPOINTS]; struct ugen_endpoint sc_endpoints[USB_MAX_ENDPOINTS][2]; #define OUT 0 #define IN 1 int sc_refcnt; char sc_buffer[UGEN_BBSIZE]; u_char sc_dying; u_char sc_attached; }; static struct { kmutex_t lock; rb_tree_t tree; } ugenif __cacheline_aligned; static int compare_ugen(void *cookie, const void *vsca, const void *vscb) { const struct ugen_softc *sca = vsca; const struct ugen_softc *scb = vscb; if (sca->sc_unit < scb->sc_unit) return -1; if (sca->sc_unit > scb->sc_unit) return +1; return 0; } static int compare_ugen_key(void *cookie, const void *vsc, const void *vk) { const struct ugen_softc *sc = vsc; const unsigned *k = vk; if (sc->sc_unit < *k) return -1; if (sc->sc_unit > *k) return +1; return 0; } static const rb_tree_ops_t ugenif_tree_ops = { .rbto_compare_nodes = compare_ugen, .rbto_compare_key = compare_ugen_key, .rbto_node_offset = offsetof(struct ugen_softc, sc_node), }; static void ugenif_get_unit(struct ugen_softc *sc) { struct ugen_softc *sc0; unsigned i; mutex_enter(&ugenif.lock); for (i = 0, sc0 = RB_TREE_MIN(&ugenif.tree); sc0 != NULL && i == sc0->sc_unit; i++, sc0 = RB_TREE_NEXT(&ugenif.tree, sc0)) KASSERT(i < UINT_MAX); KASSERT(rb_tree_find_node(&ugenif.tree, &i) == NULL); sc->sc_unit = i; sc0 = rb_tree_insert_node(&ugenif.tree, sc); KASSERT(sc0 == sc); KASSERT(rb_tree_find_node(&ugenif.tree, &i) == sc); mutex_exit(&ugenif.lock); prop_dictionary_set_uint(device_properties(sc->sc_dev), "ugen-unit", sc->sc_unit); } static void ugenif_put_unit(struct ugen_softc *sc) { prop_dictionary_remove(device_properties(sc->sc_dev), "ugen-unit"); mutex_enter(&ugenif.lock); KASSERT(rb_tree_find_node(&ugenif.tree, &sc->sc_unit) == sc); rb_tree_remove_node(&ugenif.tree, sc); sc->sc_unit = -1; mutex_exit(&ugenif.lock); } static struct ugen_softc * ugenif_acquire(unsigned unit) { struct ugen_softc *sc; mutex_enter(&ugenif.lock); sc = rb_tree_find_node(&ugenif.tree, &unit); if (sc == NULL) goto out; mutex_enter(&sc->sc_lock); if (sc->sc_dying) { mutex_exit(&sc->sc_lock); sc = NULL; goto out; } KASSERT(sc->sc_refcnt < INT_MAX); sc->sc_refcnt++; mutex_exit(&sc->sc_lock); out: mutex_exit(&ugenif.lock); return sc; } static void ugenif_release(struct ugen_softc *sc) { mutex_enter(&sc->sc_lock); if (--sc->sc_refcnt < 0) cv_broadcast(&sc->sc_detach_cv); mutex_exit(&sc->sc_lock); } static dev_type_open(ugenopen); static dev_type_close(ugenclose); static dev_type_read(ugenread); static dev_type_write(ugenwrite); static dev_type_ioctl(ugenioctl); static dev_type_poll(ugenpoll); static dev_type_kqfilter(ugenkqfilter); const struct cdevsw ugen_cdevsw = { .d_open = ugenopen, .d_close = ugenclose, .d_read = ugenread, .d_write = ugenwrite, .d_ioctl = ugenioctl, .d_stop = nostop, .d_tty = notty, .d_poll = ugenpoll, .d_mmap = nommap, .d_kqfilter = ugenkqfilter, .d_discard = nodiscard, .d_flag = D_OTHER, }; Static void ugenintr(struct usbd_xfer *, void *, usbd_status); Static void ugen_isoc_rintr(struct usbd_xfer *, void *, usbd_status); Static void ugen_bulkra_intr(struct usbd_xfer *, void *, usbd_status); Static void ugen_bulkwb_intr(struct usbd_xfer *, void *, usbd_status); Static int ugen_do_read(struct ugen_softc *, int, struct uio *, int); Static int ugen_do_write(struct ugen_softc *, int, struct uio *, int); Static int 
ugen_do_ioctl(struct ugen_softc *, int, u_long, void *, int, struct lwp *); Static int ugen_set_config(struct ugen_softc *, int, int); Static usb_config_descriptor_t *ugen_get_cdesc(struct ugen_softc *, int, int *); Static usbd_status ugen_set_interface(struct ugen_softc *, int, int); Static int ugen_get_alt_index(struct ugen_softc *, int); Static void ugen_clear_endpoints(struct ugen_softc *); #define UGENUNIT(n) ((minor(n) >> 4) & 0xf) #define UGENENDPOINT(n) (minor(n) & 0xf) #define UGENDEV(u, e) (makedev(0, ((u) << 4) | (e))) static int ugenif_match(device_t, cfdata_t, void *); static void ugenif_attach(device_t, device_t, void *); static int ugen_match(device_t, cfdata_t, void *); static void ugen_attach(device_t, device_t, void *); static int ugen_detach(device_t, int); static int ugen_activate(device_t, enum devact); CFATTACH_DECL_NEW(ugen, sizeof(struct ugen_softc), ugen_match, ugen_attach, ugen_detach, ugen_activate); CFATTACH_DECL_NEW(ugenif, sizeof(struct ugen_softc), ugenif_match, ugenif_attach, ugen_detach, ugen_activate); /* toggle to control attach priority. -1 means "let autoconf decide" */ int ugen_override = -1; static int ugen_match(device_t parent, cfdata_t match, void *aux) { struct usb_attach_arg *uaa = aux; int override; if (ugen_override != -1) override = ugen_override; else override = match->cf_flags & 1; if (override) return UMATCH_HIGHEST; else if (uaa->uaa_usegeneric) return UMATCH_GENERIC; else return UMATCH_NONE; } static int ugenif_match(device_t parent, cfdata_t match, void *aux) { /* * Like ugen(4), ugenif(4) also has an override flag. It has the * opposite effect, however, causing us to match with GENERIC * priority rather than HIGHEST. */ return (match->cf_flags & 1) ? UMATCH_GENERIC : UMATCH_HIGHEST; } static void ugen_attach(device_t parent, device_t self, void *aux) { struct usb_attach_arg *uaa = aux; struct usbif_attach_arg uiaa; memset(&uiaa, 0, sizeof(uiaa)); uiaa.uiaa_port = uaa->uaa_port; uiaa.uiaa_vendor = uaa->uaa_vendor; uiaa.uiaa_product = uaa->uaa_product; uiaa.uiaa_release = uaa->uaa_release; uiaa.uiaa_device = uaa->uaa_device; uiaa.uiaa_configno = -1; uiaa.uiaa_ifaceno = -1; ugenif_attach(parent, self, &uiaa); } static void ugenif_attach(device_t parent, device_t self, void *aux) { struct ugen_softc *sc = device_private(self); struct usbif_attach_arg *uiaa = aux; struct usbd_device *udev; char *devinfop; usbd_status err; int i, dir, conf; aprint_naive("\n"); aprint_normal("\n"); mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_SOFTUSB); cv_init(&sc->sc_detach_cv, "ugendet"); devinfop = usbd_devinfo_alloc(uiaa->uiaa_device, 0); aprint_normal_dev(self, "%s\n", devinfop); usbd_devinfo_free(devinfop); sc->sc_dev = self; sc->sc_udev = udev = uiaa->uiaa_device; for (i = 0; i < USB_MAX_ENDPOINTS; i++) { for (dir = OUT; dir <= IN; dir++) { struct ugen_endpoint *sce; sce = &sc->sc_endpoints[i][dir]; selinit(&sce->rsel); cv_init(&sce->cv, "ugensce"); } } if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); if (uiaa->uiaa_ifaceno < 0) { /* * If we attach the whole device, * set configuration index 0, the default one. */ err = usbd_set_config_index(udev, 0, 0); if (err) { aprint_error_dev(self, "setting configuration index 0 failed\n"); return; } } /* Get current configuration */ conf = usbd_get_config_descriptor(udev)->bConfigurationValue; /* Set up all the local state for this configuration. 
*/ err = ugen_set_config(sc, conf, uiaa->uiaa_ifaceno < 0); if (err) { aprint_error_dev(self, "setting configuration %d failed\n", conf); return; } ugenif_get_unit(sc); usbd_add_drv_event(USB_EVENT_DRIVER_ATTACH, sc->sc_udev, sc->sc_dev); sc->sc_attached = 1; } Static void ugen_clear_endpoints(struct ugen_softc *sc) { /* Clear out the old info, but leave the selinfo and cv initialised. */ for (int i = 0; i < USB_MAX_ENDPOINTS; i++) { for (int dir = OUT; dir <= IN; dir++) { struct ugen_endpoint *sce = &sc->sc_endpoints[i][dir]; memset(sce, 0, UGEN_ENDPOINT_NONZERO_CRUFT); } } } Static int ugen_set_config(struct ugen_softc *sc, int configno, int chkopen) { struct usbd_device *dev = sc->sc_udev; usb_config_descriptor_t *cdesc; struct usbd_interface *iface; usb_endpoint_descriptor_t *ed; struct ugen_endpoint *sce; uint8_t niface, nendpt; int ifaceno, endptno, endpt; usbd_status err; int dir; UGENHIST_FUNC(); UGENHIST_CALLARGSN(1, "ugen%jd: to configno %jd, sc=%jx", device_unit(sc->sc_dev), configno, (uintptr_t)sc, 0); KASSERT(KERNEL_LOCKED_P()); /* sc_is_open */ if (chkopen) { /* * We start at 1, not 0, because we don't care whether the * control endpoint is open or not. It is always present. */ for (endptno = 1; endptno < USB_MAX_ENDPOINTS; endptno++) if (sc->sc_is_open[endptno]) { DPRINTFN(1, "ugen%jd - endpoint %d is open", device_unit(sc->sc_dev), endptno, 0, 0); return USBD_IN_USE; } /* Prevent opening while we're setting the config. */ for (endptno = 1; endptno < USB_MAX_ENDPOINTS; endptno++) { KASSERT(!sc->sc_is_open[endptno]); sc->sc_is_open[endptno] = 1; } } /* Avoid setting the current value. */ cdesc = usbd_get_config_descriptor(dev); if (!cdesc || cdesc->bConfigurationValue != configno) { err = usbd_set_config_no(dev, configno, 1); if (err) goto out; } ugen_clear_endpoints(sc); err = usbd_interface_count(dev, &niface); if (err) goto out; for (ifaceno = 0; ifaceno < niface; ifaceno++) { DPRINTFN(1, "ifaceno %jd", ifaceno, 0, 0, 0); err = usbd_device2interface_handle(dev, ifaceno, &iface); if (err) goto out; err = usbd_endpoint_count(iface, &nendpt); if (err) goto out; for (endptno = 0; endptno < nendpt; endptno++) { ed = usbd_interface2endpoint_descriptor(iface, endptno); KASSERT(ed != NULL); endpt = ed->bEndpointAddress; dir = UE_GET_DIR(endpt) == UE_DIR_IN ? IN : OUT; sce = &sc->sc_endpoints[UE_GET_ADDR(endpt)][dir]; DPRINTFN(1, "endptno %jd, endpt=0x%02jx (%jd,%jd)", endptno, endpt, UE_GET_ADDR(endpt), UE_GET_DIR(endpt)); sce->sc = sc; sce->edesc = ed; sce->iface = iface; } } err = USBD_NORMAL_COMPLETION; out: if (chkopen) { /* * Allow open again now that we're done trying to set * the config. */ for (endptno = 1; endptno < USB_MAX_ENDPOINTS; endptno++) { KASSERT(sc->sc_is_open[endptno]); sc->sc_is_open[endptno] = 0; } } return err; } static int ugenopen(dev_t dev, int flag, int mode, struct lwp *l) { struct ugen_softc *sc; int unit = UGENUNIT(dev); int endpt = UGENENDPOINT(dev); usb_endpoint_descriptor_t *edesc; struct ugen_endpoint *sce; int dir, isize; usbd_status err; struct usbd_xfer *xfer; int i, j; int error; int opened = 0; UGENHIST_FUNC(); UGENHIST_CALLARGS("flag=%jd, mode=%jd, unit=%jd endpt=%jd", flag, mode, unit, endpt); KASSERT(KERNEL_LOCKED_P()); /* sc_is_open */ if ((sc = ugenif_acquire(unit)) == NULL) return ENXIO; /* The control endpoint allows multiple opens. 
*/ if (endpt == USB_CONTROL_ENDPOINT) { opened = sc->sc_is_open[USB_CONTROL_ENDPOINT] = 1; error = 0; goto out; } if (sc->sc_is_open[endpt]) { error = EBUSY; goto out; } opened = sc->sc_is_open[endpt] = 1; /* Make sure there are pipes for all directions. */ for (dir = OUT; dir <= IN; dir++) { if (flag & (dir == OUT ? FWRITE : FREAD)) { sce = &sc->sc_endpoints[endpt][dir]; if (sce->edesc == NULL) { error = ENXIO; goto out; } } } /* Actually open the pipes. */ /* XXX Should back out properly if it fails. */ for (dir = OUT; dir <= IN; dir++) { if (!(flag & (dir == OUT ? FWRITE : FREAD))) continue; sce = &sc->sc_endpoints[endpt][dir]; sce->state = 0; sce->timeout = USBD_NO_TIMEOUT; DPRINTFN(5, "sc=%jx, endpt=%jd, dir=%jd, sce=%jp", (uintptr_t)sc, endpt, dir, (uintptr_t)sce); edesc = sce->edesc; switch (edesc->bmAttributes & UE_XFERTYPE) { case UE_INTERRUPT: if (dir == OUT) { err = usbd_open_pipe(sce->iface, edesc->bEndpointAddress, 0, &sce->pipeh); if (err) { error = EIO; goto out; } break; } isize = UGETW(edesc->wMaxPacketSize); if (isize == 0) { /* shouldn't happen */ error = EINVAL; goto out; } sce->ibuf = kmem_alloc(isize, KM_SLEEP); DPRINTFN(5, "intr endpt=%d, isize=%d", endpt, isize, 0, 0); if (clalloc(&sce->q, UGEN_IBSIZE, 0) == -1) { kmem_free(sce->ibuf, isize); sce->ibuf = NULL; error = ENOMEM; goto out; } err = usbd_open_pipe_intr(sce->iface, edesc->bEndpointAddress, USBD_SHORT_XFER_OK, &sce->pipeh, sce, sce->ibuf, isize, ugenintr, USBD_DEFAULT_INTERVAL); if (err) { clfree(&sce->q); kmem_free(sce->ibuf, isize); sce->ibuf = NULL; error = EIO; goto out; } DPRINTFN(5, "interrupt open done", 0, 0, 0, 0); break; case UE_BULK: err = usbd_open_pipe(sce->iface, edesc->bEndpointAddress, 0, &sce->pipeh); if (err) { error = EIO; goto out; } sce->ra_wb_bufsize = UGEN_BULK_RA_WB_BUFSIZE; /* * Use request size for non-RA/WB transfers * as the default. 
*/ sce->ra_wb_reqsize = UGEN_BBSIZE; break; case UE_ISOCHRONOUS: if (dir == OUT) { error = EINVAL; goto out; } isize = UGETW(edesc->wMaxPacketSize); if (isize == 0) { /* shouldn't happen */ error = EINVAL; goto out; } sce->ibuf = kmem_alloc(isize * UGEN_NISOFRAMES, KM_SLEEP); sce->cur = sce->fill = sce->ibuf; sce->limit = sce->ibuf + isize * UGEN_NISOFRAMES; DPRINTFN(5, "isoc endpt=%d, isize=%d", endpt, isize, 0, 0); err = usbd_open_pipe(sce->iface, edesc->bEndpointAddress, 0, &sce->pipeh); if (err) { kmem_free(sce->ibuf, isize * UGEN_NISOFRAMES); sce->ibuf = NULL; error = EIO; goto out; } for (i = 0; i < UGEN_NISOREQS; ++i) { sce->isoreqs[i].sce = sce; err = usbd_create_xfer(sce->pipeh, isize * UGEN_NISORFRMS, 0, UGEN_NISORFRMS, &xfer); if (err) goto bad; sce->isoreqs[i].xfer = xfer; sce->isoreqs[i].dmabuf = usbd_get_buffer(xfer); for (j = 0; j < UGEN_NISORFRMS; ++j) sce->isoreqs[i].sizes[j] = isize; usbd_setup_isoc_xfer(xfer, &sce->isoreqs[i], sce->isoreqs[i].sizes, UGEN_NISORFRMS, 0, ugen_isoc_rintr); (void)usbd_transfer(xfer); } DPRINTFN(5, "isoc open done", 0, 0, 0, 0); break; bad: while (--i >= 0) { /* implicit buffer free */ usbd_destroy_xfer(sce->isoreqs[i].xfer); sce->isoreqs[i].xfer = NULL; } usbd_close_pipe(sce->pipeh); sce->pipeh = NULL; kmem_free(sce->ibuf, isize * UGEN_NISOFRAMES); sce->ibuf = NULL; error = ENOMEM; goto out; case UE_CONTROL: sce->timeout = USBD_DEFAULT_TIMEOUT; error = EINVAL; goto out; } } error = 0; out: if (error && opened) sc->sc_is_open[endpt] = 0; ugenif_release(sc); return error; } static void ugen_do_close(struct ugen_softc *sc, int flag, int endpt) { struct ugen_endpoint *sce; int dir; int i; UGENHIST_FUNC(); UGENHIST_CALLARGS("flag=%jd endpt=%jd", flag, endpt, 0, 0); KASSERT(KERNEL_LOCKED_P()); /* sc_is_open */ if (!sc->sc_is_open[endpt]) goto out; if (endpt == USB_CONTROL_ENDPOINT) { DPRINTFN(5, "close control", 0, 0, 0, 0); goto out; } for (dir = OUT; dir <= IN; dir++) { if (!(flag & (dir == OUT ? 
FWRITE : FREAD))) continue; sce = &sc->sc_endpoints[endpt][dir]; if (sce->pipeh == NULL) continue; DPRINTFN(5, "endpt=%jd dir=%jd sce=%jx", endpt, dir, (uintptr_t)sce, 0); usbd_abort_pipe(sce->pipeh); int isize = UGETW(sce->edesc->wMaxPacketSize); int msize = 0; switch (sce->edesc->bmAttributes & UE_XFERTYPE) { case UE_INTERRUPT: ndflush(&sce->q, sce->q.c_cc); clfree(&sce->q); msize = isize; break; case UE_ISOCHRONOUS: for (i = 0; i < UGEN_NISOREQS; ++i) { usbd_destroy_xfer(sce->isoreqs[i].xfer); sce->isoreqs[i].xfer = NULL; } msize = isize * UGEN_NISOFRAMES; break; case UE_BULK: if (sce->state & (UGEN_BULK_RA | UGEN_BULK_WB)) { usbd_destroy_xfer(sce->ra_wb_xfer); sce->ra_wb_xfer = NULL; msize = sce->ra_wb_bufsize; } break; default: break; } usbd_close_pipe(sce->pipeh); sce->pipeh = NULL; if (sce->ibuf != NULL) { kmem_free(sce->ibuf, msize); sce->ibuf = NULL; } } out: sc->sc_is_open[endpt] = 0; for (dir = OUT; dir <= IN; dir++) { sce = &sc->sc_endpoints[endpt][dir]; KASSERT(sce->pipeh == NULL); KASSERT(sce->ibuf == NULL); KASSERT(sce->ra_wb_xfer == NULL); for (i = 0; i < UGEN_NISOREQS; i++) KASSERT(sce->isoreqs[i].xfer == NULL); } } static int ugenclose(dev_t dev, int flag, int mode, struct lwp *l) { int endpt = UGENENDPOINT(dev); struct ugen_softc *sc; UGENHIST_FUNC(); UGENHIST_CALLARGS("flag=%jd, mode=%jd, unit=%jd, endpt=%jd", flag, mode, UGENUNIT(dev), endpt); KASSERT(KERNEL_LOCKED_P()); /* ugen_do_close */ if ((sc = ugenif_acquire(UGENUNIT(dev))) == NULL) return ENXIO; KASSERT(sc->sc_is_open[endpt]); ugen_do_close(sc, flag, endpt); KASSERT(!sc->sc_is_open[endpt]); ugenif_release(sc); return 0; } Static int ugen_do_read(struct ugen_softc *sc, int endpt, struct uio *uio, int flag) { struct ugen_endpoint *sce = &sc->sc_endpoints[endpt][IN]; uint32_t n, tn; struct usbd_xfer *xfer; usbd_status err; int error = 0; UGENHIST_FUNC(); UGENHIST_CALLARGS("ugen%d: %jd", device_unit(sc->sc_dev), endpt, 0, 0); if (endpt == USB_CONTROL_ENDPOINT) return ENODEV; KASSERT(sce->edesc); KASSERT(sce->pipeh); switch (sce->edesc->bmAttributes & UE_XFERTYPE) { case UE_INTERRUPT: /* Block until activity occurred. */ mutex_enter(&sc->sc_lock); while (sce->q.c_cc == 0) { if (flag & IO_NDELAY) { mutex_exit(&sc->sc_lock); return EWOULDBLOCK; } DPRINTFN(5, "sleep on %jx", (uintptr_t)sce, 0, 0, 0); /* "ugenri" */ error = cv_timedwait_sig(&sce->cv, &sc->sc_lock, mstohz(sce->timeout)); DPRINTFN(5, "woke, error=%jd", error, 0, 0, 0); if (sc->sc_dying) error = EIO; if (error) break; } mutex_exit(&sc->sc_lock); /* Transfer as many chunks as possible. */ while (sce->q.c_cc > 0 && uio->uio_resid > 0 && !error) { n = uimin(sce->q.c_cc, uio->uio_resid); if (n > sizeof(sc->sc_buffer)) n = sizeof(sc->sc_buffer); /* Remove a small chunk from the input queue. */ q_to_b(&sce->q, sc->sc_buffer, n); DPRINTFN(5, "got %jd chars", n, 0, 0, 0); /* Copy the data to the user process. 
*/ error = uiomove(sc->sc_buffer, n, uio); if (error) break; } break; case UE_BULK: if (sce->state & UGEN_BULK_RA) { DPRINTFN(5, "BULK_RA req: %zd used: %d", uio->uio_resid, sce->ra_wb_used, 0, 0); xfer = sce->ra_wb_xfer; mutex_enter(&sc->sc_lock); if (sce->ra_wb_used == 0 && flag & IO_NDELAY) { mutex_exit(&sc->sc_lock); return EWOULDBLOCK; } while (uio->uio_resid > 0 && !error) { while (sce->ra_wb_used == 0) { DPRINTFN(5, "sleep on %jx", (uintptr_t)sce, 0, 0, 0); /* "ugenrb" */ error = cv_timedwait_sig(&sce->cv, &sc->sc_lock, mstohz(sce->timeout)); DPRINTFN(5, "woke, error=%jd", error, 0, 0, 0); if (sc->sc_dying) error = EIO; if (error) break; } /* Copy data to the process. */ while (uio->uio_resid > 0 && sce->ra_wb_used > 0) { n = uimin(uio->uio_resid, sce->ra_wb_used); n = uimin(n, sce->limit - sce->cur); error = uiomove(sce->cur, n, uio); if (error) break; sce->cur += n; sce->ra_wb_used -= n; if (sce->cur == sce->limit) sce->cur = sce->ibuf; } /* * If the transfers stopped because the * buffer was full, restart them. */ if (sce->state & UGEN_RA_WB_STOP && sce->ra_wb_used < sce->limit - sce->ibuf) { n = (sce->limit - sce->ibuf) - sce->ra_wb_used; usbd_setup_xfer(xfer, sce, NULL, uimin(n, sce->ra_wb_xferlen), 0, USBD_NO_TIMEOUT, ugen_bulkra_intr); sce->state &= ~UGEN_RA_WB_STOP; err = usbd_transfer(xfer); if (err != USBD_IN_PROGRESS) /* * The transfer has not been * queued. Setting STOP * will make us try * again at the next read. */ sce->state |= UGEN_RA_WB_STOP; } } mutex_exit(&sc->sc_lock); break; } error = usbd_create_xfer(sce->pipeh, UGEN_BBSIZE, 0, 0, &xfer); if (error) return error; while ((n = uimin(UGEN_BBSIZE, uio->uio_resid)) != 0) { DPRINTFN(1, "start transfer %jd bytes", n, 0, 0, 0); tn = n; err = usbd_bulk_transfer(xfer, sce->pipeh, sce->state & UGEN_SHORT_OK ? USBD_SHORT_XFER_OK : 0, sce->timeout, sc->sc_buffer, &tn); if (err) { if (err == USBD_INTERRUPTED) error = EINTR; else if (err == USBD_TIMEOUT) error = ETIMEDOUT; else error = EIO; break; } DPRINTFN(1, "got %jd bytes", tn, 0, 0, 0); error = uiomove(sc->sc_buffer, tn, uio); if (error || tn < n) break; } usbd_destroy_xfer(xfer); break; case UE_ISOCHRONOUS: mutex_enter(&sc->sc_lock); while (sce->cur == sce->fill) { if (flag & IO_NDELAY) { mutex_exit(&sc->sc_lock); return EWOULDBLOCK; } /* "ugenri" */ DPRINTFN(5, "sleep on %jx", (uintptr_t)sce, 0, 0, 0); error = cv_timedwait_sig(&sce->cv, &sc->sc_lock, mstohz(sce->timeout)); DPRINTFN(5, "woke, error=%jd", error, 0, 0, 0); if (sc->sc_dying) error = EIO; if (error) break; } while (sce->cur != sce->fill && uio->uio_resid > 0 && !error) { if(sce->fill > sce->cur) n = uimin(sce->fill - sce->cur, uio->uio_resid); else n = uimin(sce->limit - sce->cur, uio->uio_resid); DPRINTFN(5, "isoc got %jd chars", n, 0, 0, 0); /* Copy the data to the user process. 
*/ error = uiomove(sce->cur, n, uio); if (error) break; sce->cur += n; if (sce->cur >= sce->limit) sce->cur = sce->ibuf; } mutex_exit(&sc->sc_lock); break; default: return ENXIO; } return error; } static int ugenread(dev_t dev, struct uio *uio, int flag) { int endpt = UGENENDPOINT(dev); struct ugen_softc *sc; int error; if ((sc = ugenif_acquire(UGENUNIT(dev))) == NULL) return ENXIO; error = ugen_do_read(sc, endpt, uio, flag); ugenif_release(sc); return error; } Static int ugen_do_write(struct ugen_softc *sc, int endpt, struct uio *uio, int flag) { struct ugen_endpoint *sce = &sc->sc_endpoints[endpt][OUT]; uint32_t n; int error = 0; uint32_t tn; char *dbuf; struct usbd_xfer *xfer; usbd_status err; UGENHIST_FUNC(); UGENHIST_CALLARGSN(5, "ugen%jd: %jd", device_unit(sc->sc_dev), endpt, 0, 0); if (endpt == USB_CONTROL_ENDPOINT) return ENODEV; KASSERT(sce->edesc); KASSERT(sce->pipeh); switch (sce->edesc->bmAttributes & UE_XFERTYPE) { case UE_BULK: if (sce->state & UGEN_BULK_WB) { DPRINTFN(5, "BULK_WB req: %jd used: %jd", uio->uio_resid, sce->ra_wb_used, 0, 0); xfer = sce->ra_wb_xfer; mutex_enter(&sc->sc_lock); if (sce->ra_wb_used == sce->limit - sce->ibuf && flag & IO_NDELAY) { mutex_exit(&sc->sc_lock); return EWOULDBLOCK; } while (uio->uio_resid > 0 && !error) { while (sce->ra_wb_used == sce->limit - sce->ibuf) { DPRINTFN(5, "sleep on %#jx", (uintptr_t)sce, 0, 0, 0); /* "ugenwb" */ error = cv_timedwait_sig(&sce->cv, &sc->sc_lock, mstohz(sce->timeout)); DPRINTFN(5, "woke, error=%d", error, 0, 0, 0); if (sc->sc_dying) error = EIO; if (error) break; } /* Copy data from the process. */ while (uio->uio_resid > 0 && sce->ra_wb_used < sce->limit - sce->ibuf) { n = uimin(uio->uio_resid, (sce->limit - sce->ibuf) - sce->ra_wb_used); n = uimin(n, sce->limit - sce->fill); error = uiomove(sce->fill, n, uio); if (error) break; sce->fill += n; sce->ra_wb_used += n; if (sce->fill == sce->limit) sce->fill = sce->ibuf; } /* * If the transfers stopped because the * buffer was empty, restart them. */ if (sce->state & UGEN_RA_WB_STOP && sce->ra_wb_used > 0) { dbuf = (char *)usbd_get_buffer(xfer); n = uimin(sce->ra_wb_used, sce->ra_wb_xferlen); tn = uimin(n, sce->limit - sce->cur); memcpy(dbuf, sce->cur, tn); dbuf += tn; if (n - tn > 0) memcpy(dbuf, sce->ibuf, n - tn); usbd_setup_xfer(xfer, sce, NULL, n, 0, USBD_NO_TIMEOUT, ugen_bulkwb_intr); sce->state &= ~UGEN_RA_WB_STOP; err = usbd_transfer(xfer); if (err != USBD_IN_PROGRESS) /* * The transfer has not been * queued. Setting STOP * will make us try again * at the next read. 
*/ sce->state |= UGEN_RA_WB_STOP; } } mutex_exit(&sc->sc_lock); break; } error = usbd_create_xfer(sce->pipeh, UGEN_BBSIZE, 0, 0, &xfer); if (error) return error; while ((n = uimin(UGEN_BBSIZE, uio->uio_resid)) != 0) { error = uiomove(sc->sc_buffer, n, uio); if (error) break; DPRINTFN(1, "transfer %jd bytes", n, 0, 0, 0); err = usbd_bulk_transfer(xfer, sce->pipeh, 0, sce->timeout, sc->sc_buffer, &n); if (err) { if (err == USBD_INTERRUPTED) error = EINTR; else if (err == USBD_TIMEOUT) error = ETIMEDOUT; else error = EIO; break; } } usbd_destroy_xfer(xfer); break; case UE_INTERRUPT: error = usbd_create_xfer(sce->pipeh, UGETW(sce->edesc->wMaxPacketSize), 0, 0, &xfer); if (error) return error; while ((n = uimin(UGETW(sce->edesc->wMaxPacketSize), uio->uio_resid)) != 0) { error = uiomove(sc->sc_buffer, n, uio); if (error) break; DPRINTFN(1, "transfer %jd bytes", n, 0, 0, 0); err = usbd_intr_transfer(xfer, sce->pipeh, 0, sce->timeout, sc->sc_buffer, &n); if (err) { if (err == USBD_INTERRUPTED) error = EINTR; else if (err == USBD_TIMEOUT) error = ETIMEDOUT; else error = EIO; break; } } usbd_destroy_xfer(xfer); break; default: return ENXIO; } return error; } static int ugenwrite(dev_t dev, struct uio *uio, int flag) { int endpt = UGENENDPOINT(dev); struct ugen_softc *sc; int error; if ((sc = ugenif_acquire(UGENUNIT(dev))) == NULL) return ENXIO; error = ugen_do_write(sc, endpt, uio, flag); ugenif_release(sc); return error; } static int ugen_activate(device_t self, enum devact act) { struct ugen_softc *sc = device_private(self); switch (act) { case DVACT_DEACTIVATE: sc->sc_dying = 1; return 0; default: return EOPNOTSUPP; } } static int ugen_detach(device_t self, int flags) { struct ugen_softc *sc = device_private(self); struct ugen_endpoint *sce; int i, dir; int maj, mn; UGENHIST_FUNC(); UGENHIST_CALLARGS("sc=%ju flags=%ju", (uintptr_t)sc, flags, 0, 0); KASSERT(KERNEL_LOCKED_P()); /* sc_is_open */ /* * Fail if we're not forced to detach and userland has any * endpoints open. */ if ((flags & DETACH_FORCE) == 0) { for (i = 0; i < USB_MAX_ENDPOINTS; i++) { if (sc->sc_is_open[i]) return EBUSY; } } /* Prevent new users. Prevent suspend/resume. */ sc->sc_dying = 1; pmf_device_deregister(self); /* * If we never finished attaching, skip nixing endpoints and * users because there aren't any. */ if (!sc->sc_attached) goto out; /* Abort all pipes. */ for (i = 0; i < USB_MAX_ENDPOINTS; i++) { for (dir = OUT; dir <= IN; dir++) { sce = &sc->sc_endpoints[i][dir]; if (sce->pipeh) usbd_abort_pipe(sce->pipeh); } } /* * Wait for users to drain. Before this point there can be no * more I/O operations started because we set sc_dying; after * this, there can be no more I/O operations in progress, so it * will be safe to free things. */ mutex_enter(&sc->sc_lock); if (--sc->sc_refcnt >= 0) { /* Wake everyone */ for (i = 0; i < USB_MAX_ENDPOINTS; i++) { for (dir = OUT; dir <= IN; dir++) cv_broadcast(&sc->sc_endpoints[i][dir].cv); } /* Wait for processes to go away. */ do { cv_wait(&sc->sc_detach_cv, &sc->sc_lock); } while (sc->sc_refcnt >= 0); } mutex_exit(&sc->sc_lock); /* locate the major number */ maj = cdevsw_lookup_major(&ugen_cdevsw); /* * Nuke the vnodes for any open instances (calls ugenclose, but * with no effect because we already set sc_dying). */ mn = sc->sc_unit * USB_MAX_ENDPOINTS; vdevgone(maj, mn, mn + USB_MAX_ENDPOINTS - 1, VCHR); /* Actually close any lingering pipes. 
*/ for (i = 0; i < USB_MAX_ENDPOINTS; i++) ugen_do_close(sc, FREAD|FWRITE, i); usbd_add_drv_event(USB_EVENT_DRIVER_DETACH, sc->sc_udev, sc->sc_dev); ugenif_put_unit(sc); out: for (i = 0; i < USB_MAX_ENDPOINTS; i++) { for (dir = OUT; dir <= IN; dir++) { sce = &sc->sc_endpoints[i][dir]; seldestroy(&sce->rsel); cv_destroy(&sce->cv); } } cv_destroy(&sc->sc_detach_cv); mutex_destroy(&sc->sc_lock); return 0; } Static void ugenintr(struct usbd_xfer *xfer, void *addr, usbd_status status) { struct ugen_endpoint *sce = addr; struct ugen_softc *sc = sce->sc; uint32_t count; u_char *ibuf; UGENHIST_FUNC(); UGENHIST_CALLARGS("xfer %jx status %d", (uintptr_t)xfer, status, 0, 0); if (status == USBD_CANCELLED) return; if (status != USBD_NORMAL_COMPLETION) { DPRINTF("status=%jd", status, 0, 0, 0); if (status == USBD_STALLED) usbd_clear_endpoint_stall_async(sce->pipeh); return; } usbd_get_xfer_status(xfer, NULL, NULL, &count, NULL); ibuf = sce->ibuf; DPRINTFN(5, "xfer=%#jx status=%d count=%d", (uintptr_t)xfer, status, count, 0); DPRINTFN(5, " data = %02x %02x %02x", ibuf[0], ibuf[1], ibuf[2], 0); mutex_enter(&sc->sc_lock); (void)b_to_q(ibuf, count, &sce->q); cv_signal(&sce->cv); mutex_exit(&sc->sc_lock); selnotify(&sce->rsel, 0, 0); } Static void ugen_isoc_rintr(struct usbd_xfer *xfer, void *addr, usbd_status status) { struct isoreq *req = addr; struct ugen_endpoint *sce = req->sce; struct ugen_softc *sc = sce->sc; uint32_t count, n; int i, isize; UGENHIST_FUNC(); UGENHIST_CALLARGS("xfer=%jx status=%jd", (uintptr_t)xfer, status, 0, 0); /* Return if we are aborting. */ if (status == USBD_CANCELLED) return; usbd_get_xfer_status(xfer, NULL, NULL, &count, NULL); DPRINTFN(5, "xfer %ld, count=%d", (long)(req - sce->isoreqs), count, 0, 0); mutex_enter(&sc->sc_lock); /* throw away oldest input if the buffer is full */ if (sce->fill < sce->cur && sce->cur <= sce->fill + count) { sce->cur += count; if (sce->cur >= sce->limit) sce->cur = sce->ibuf + (sce->limit - sce->cur); DPRINTFN(5, "throwing away %jd bytes", count, 0, 0, 0); } isize = UGETW(sce->edesc->wMaxPacketSize); for (i = 0; i < UGEN_NISORFRMS; i++) { uint32_t actlen = req->sizes[i]; char const *tbuf = (char const *)req->dmabuf + isize * i; /* copy data to buffer */ while (actlen > 0) { n = uimin(actlen, sce->limit - sce->fill); memcpy(sce->fill, tbuf, n); tbuf += n; actlen -= n; sce->fill += n; if (sce->fill == sce->limit) sce->fill = sce->ibuf; } /* setup size for next transfer */ req->sizes[i] = isize; } usbd_setup_isoc_xfer(xfer, req, req->sizes, UGEN_NISORFRMS, 0, ugen_isoc_rintr); (void)usbd_transfer(xfer); cv_signal(&sce->cv); mutex_exit(&sc->sc_lock); selnotify(&sce->rsel, 0, 0); } Static void ugen_bulkra_intr(struct usbd_xfer *xfer, void *addr, usbd_status status) { struct ugen_endpoint *sce = addr; struct ugen_softc *sc = sce->sc; uint32_t count, n; char const *tbuf; usbd_status err; UGENHIST_FUNC(); UGENHIST_CALLARGS("xfer=%jx status=%jd", (uintptr_t)xfer, status, 0, 0); /* Return if we are aborting. */ if (status == USBD_CANCELLED) return; if (status != USBD_NORMAL_COMPLETION) { DPRINTF("status=%jd", status, 0, 0, 0); sce->state |= UGEN_RA_WB_STOP; if (status == USBD_STALLED) usbd_clear_endpoint_stall_async(sce->pipeh); return; } usbd_get_xfer_status(xfer, NULL, NULL, &count, NULL); mutex_enter(&sc->sc_lock); /* Keep track of how much is in the buffer. */ sce->ra_wb_used += count; /* Copy data to buffer. 
*/ tbuf = (char const *)usbd_get_buffer(sce->ra_wb_xfer); n = uimin(count, sce->limit - sce->fill); memcpy(sce->fill, tbuf, n); tbuf += n; count -= n; sce->fill += n; if (sce->fill == sce->limit) sce->fill = sce->ibuf; if (count > 0) { memcpy(sce->fill, tbuf, count); sce->fill += count; } /* Set up the next request if necessary. */ n = (sce->limit - sce->ibuf) - sce->ra_wb_used; if (n > 0) { usbd_setup_xfer(xfer, sce, NULL, uimin(n, sce->ra_wb_xferlen), 0, USBD_NO_TIMEOUT, ugen_bulkra_intr); err = usbd_transfer(xfer); if (err != USBD_IN_PROGRESS) { printf("error=%d", err); /* * The transfer has not been queued. Setting STOP * will make us try again at the next read. */ sce->state |= UGEN_RA_WB_STOP; } } else sce->state |= UGEN_RA_WB_STOP; cv_signal(&sce->cv); mutex_exit(&sc->sc_lock); selnotify(&sce->rsel, 0, 0); } Static void ugen_bulkwb_intr(struct usbd_xfer *xfer, void *addr, usbd_status status) { struct ugen_endpoint *sce = addr; struct ugen_softc *sc = sce->sc; uint32_t count, n; char *tbuf; usbd_status err; UGENHIST_FUNC(); UGENHIST_CALLARGS("xfer=%jx status=%jd", (uintptr_t)xfer, status, 0, 0); /* Return if we are aborting. */ if (status == USBD_CANCELLED) return; if (status != USBD_NORMAL_COMPLETION) { DPRINTF("status=%jd", status, 0, 0, 0); sce->state |= UGEN_RA_WB_STOP; if (status == USBD_STALLED) usbd_clear_endpoint_stall_async(sce->pipeh); return; } usbd_get_xfer_status(xfer, NULL, NULL, &count, NULL); mutex_enter(&sc->sc_lock); /* Keep track of how much is in the buffer. */ sce->ra_wb_used -= count; /* Update buffer pointers. */ sce->cur += count; if (sce->cur >= sce->limit) sce->cur = sce->ibuf + (sce->cur - sce->limit); /* Set up next request if necessary. */ if (sce->ra_wb_used > 0) { /* copy data from buffer */ tbuf = (char *)usbd_get_buffer(sce->ra_wb_xfer); count = uimin(sce->ra_wb_used, sce->ra_wb_xferlen); n = uimin(count, sce->limit - sce->cur); memcpy(tbuf, sce->cur, n); tbuf += n; if (count - n > 0) memcpy(tbuf, sce->ibuf, count - n); usbd_setup_xfer(xfer, sce, NULL, count, 0, USBD_NO_TIMEOUT, ugen_bulkwb_intr); err = usbd_transfer(xfer); if (err != USBD_IN_PROGRESS) { printf("error=%d", err); /* * The transfer has not been queued. Setting STOP * will make us try again at the next write. */ sce->state |= UGEN_RA_WB_STOP; } } else sce->state |= UGEN_RA_WB_STOP; cv_signal(&sce->cv); mutex_exit(&sc->sc_lock); selnotify(&sce->rsel, 0, 0); } Static usbd_status ugen_set_interface(struct ugen_softc *sc, int ifaceidx, int altno) { struct usbd_interface *iface; usb_endpoint_descriptor_t *ed; usbd_status err; struct ugen_endpoint *sce; uint8_t niface, nendpt, endptno, endpt; int dir; UGENHIST_FUNC(); UGENHIST_CALLARGSN(15, "ifaceidx=%jd altno=%jd", ifaceidx, altno, 0, 0); err = usbd_interface_count(sc->sc_udev, &niface); if (err) return err; if (ifaceidx < 0 || ifaceidx >= niface) return USBD_INVAL; err = usbd_device2interface_handle(sc->sc_udev, ifaceidx, &iface); if (err) return err; err = usbd_endpoint_count(iface, &nendpt); if (err) return err; /* change setting */ err = usbd_set_interface(iface, altno); if (err) return err; err = usbd_endpoint_count(iface, &nendpt); if (err) return err; ugen_clear_endpoints(sc); for (endptno = 0; endptno < nendpt; endptno++) { ed = usbd_interface2endpoint_descriptor(iface, endptno); KASSERT(ed != NULL); endpt = ed->bEndpointAddress; dir = UE_GET_DIR(endpt) == UE_DIR_IN ? 
IN : OUT; sce = &sc->sc_endpoints[UE_GET_ADDR(endpt)][dir]; sce->sc = sc; sce->edesc = ed; sce->iface = iface; } return 0; } /* Retrieve a complete descriptor for a certain device and index. */ Static usb_config_descriptor_t * ugen_get_cdesc(struct ugen_softc *sc, int index, int *lenp) { usb_config_descriptor_t *cdesc = NULL, *tdesc, cdescr; int len = 0; usbd_status err; UGENHIST_FUNC(); UGENHIST_CALLARGS("index=%jd", index, 0, 0, 0); switch (index) { case USB_CURRENT_CONFIG_INDEX: tdesc = usbd_get_config_descriptor(sc->sc_udev); if (tdesc == NULL) break; len = UGETW(tdesc->wTotalLength); cdesc = kmem_alloc(len, KM_SLEEP); memcpy(cdesc, tdesc, len); break; default: err = usbd_get_config_desc(sc->sc_udev, index, &cdescr); if (err) break; len = UGETW(cdescr.wTotalLength); cdesc = kmem_alloc(len, KM_SLEEP); err = usbd_get_config_desc_full(sc->sc_udev, index, cdesc, len); if (err) { kmem_free(cdesc, len); cdesc = NULL; } break; } DPRINTFN(5, "req len=%jd cdesc=%jx", len, (uintptr_t)cdesc, 0, 0); if (cdesc && lenp) *lenp = len; return cdesc; } Static int ugen_get_alt_index(struct ugen_softc *sc, int ifaceidx) { struct usbd_interface *iface; usbd_status err; err = usbd_device2interface_handle(sc->sc_udev, ifaceidx, &iface); if (err) return -1; return usbd_get_interface_altindex(iface); } Static int ugen_do_ioctl(struct ugen_softc *sc, int endpt, u_long cmd, void *addr, int flag, struct lwp *l) { struct ugen_endpoint *sce; usbd_status err; struct usbd_interface *iface; struct usb_config_desc *cd; usb_config_descriptor_t *cdesc; struct usb_interface_desc *id; usb_interface_descriptor_t *idesc; struct usb_endpoint_desc *ed; usb_endpoint_descriptor_t *edesc; struct usb_alt_interface *ai; struct usb_string_desc *si; uint8_t conf, alt; int cdesclen; int error; int dir; UGENHIST_FUNC(); UGENHIST_CALLARGS("ugen%d: endpt=%ju cmd=%08jx flag=%jx", device_unit(sc->sc_dev), endpt, cmd, flag); KASSERT(KERNEL_LOCKED_P()); /* ugen_set_config */ switch (cmd) { case FIONBIO: /* All handled in the upper FS layer. */ return 0; case USB_SET_SHORT_XFER: if (endpt == USB_CONTROL_ENDPOINT) return EINVAL; /* This flag only affects read */ sce = &sc->sc_endpoints[endpt][IN]; if (sce == NULL || sce->pipeh == NULL) return EINVAL; if (*(int *)addr) sce->state |= UGEN_SHORT_OK; else sce->state &= ~UGEN_SHORT_OK; DPRINTFN(5, "pipe=%jx short xfer=%ju", (uintptr_t)sce->pipeh, sce->state & UGEN_SHORT_OK, 0, 0); return 0; case USB_SET_TIMEOUT: for (dir = OUT; dir <= IN; dir++) { sce = &sc->sc_endpoints[endpt][dir]; if (sce == NULL) return EINVAL; sce->timeout = *(int *)addr; DPRINTFN(5, "pipe=%jx timeout[dir=%ju] timeout=%ju", (uintptr_t)sce->pipeh, dir, sce->timeout, 0); } return 0; case USB_SET_BULK_RA: if (endpt == USB_CONTROL_ENDPOINT) return EINVAL; sce = &sc->sc_endpoints[endpt][IN]; if (sce == NULL || sce->pipeh == NULL) return EINVAL; edesc = sce->edesc; if ((edesc->bmAttributes & UE_XFERTYPE) != UE_BULK) return EINVAL; if (*(int *)addr) { /* Only turn RA on if it's currently off. 
*/ if (sce->state & UGEN_BULK_RA) return 0; KASSERT(sce->ra_wb_xfer == NULL); KASSERT(sce->ibuf == NULL); if (sce->ra_wb_bufsize == 0 || sce->ra_wb_reqsize == 0) /* shouldn't happen */ return EINVAL; error = usbd_create_xfer(sce->pipeh, sce->ra_wb_reqsize, 0, 0, &sce->ra_wb_xfer); if (error) return error; sce->ra_wb_xferlen = sce->ra_wb_reqsize; sce->ibuf = kmem_alloc(sce->ra_wb_bufsize, KM_SLEEP); sce->fill = sce->cur = sce->ibuf; sce->limit = sce->ibuf + sce->ra_wb_bufsize; sce->ra_wb_used = 0; sce->state |= UGEN_BULK_RA; sce->state &= ~UGEN_RA_WB_STOP; /* Now start reading. */ usbd_setup_xfer(sce->ra_wb_xfer, sce, NULL, uimin(sce->ra_wb_xferlen, sce->ra_wb_bufsize), 0, USBD_NO_TIMEOUT, ugen_bulkra_intr); err = usbd_transfer(sce->ra_wb_xfer); if (err != USBD_IN_PROGRESS) { sce->state &= ~UGEN_BULK_RA; kmem_free(sce->ibuf, sce->ra_wb_bufsize); sce->ibuf = NULL; usbd_destroy_xfer(sce->ra_wb_xfer); sce->ra_wb_xfer = NULL; return EIO; } } else { /* Only turn RA off if it's currently on. */ if (!(sce->state & UGEN_BULK_RA)) return 0; sce->state &= ~UGEN_BULK_RA; usbd_abort_pipe(sce->pipeh); usbd_destroy_xfer(sce->ra_wb_xfer); sce->ra_wb_xfer = NULL; /* * XXX Discard whatever's in the buffer, but we * should keep it around and drain the buffer * instead. */ kmem_free(sce->ibuf, sce->ra_wb_bufsize); sce->ibuf = NULL; } return 0; case USB_SET_BULK_WB: if (endpt == USB_CONTROL_ENDPOINT) return EINVAL; sce = &sc->sc_endpoints[endpt][OUT]; if (sce == NULL || sce->pipeh == NULL) return EINVAL; edesc = sce->edesc; if ((edesc->bmAttributes & UE_XFERTYPE) != UE_BULK) return EINVAL; if (*(int *)addr) { /* Only turn WB on if it's currently off. */ if (sce->state & UGEN_BULK_WB) return 0; KASSERT(sce->ra_wb_xfer == NULL); KASSERT(sce->ibuf == NULL); if (sce->ra_wb_bufsize == 0 || sce->ra_wb_reqsize == 0) /* shouldn't happen */ return EINVAL; error = usbd_create_xfer(sce->pipeh, sce->ra_wb_reqsize, 0, 0, &sce->ra_wb_xfer); /* XXX check error??? */ sce->ra_wb_xferlen = sce->ra_wb_reqsize; sce->ibuf = kmem_alloc(sce->ra_wb_bufsize, KM_SLEEP); sce->fill = sce->cur = sce->ibuf; sce->limit = sce->ibuf + sce->ra_wb_bufsize; sce->ra_wb_used = 0; sce->state |= UGEN_BULK_WB | UGEN_RA_WB_STOP; } else { /* Only turn WB off if it's currently on. */ if (!(sce->state & UGEN_BULK_WB)) return 0; sce->state &= ~UGEN_BULK_WB; /* * XXX Discard whatever's in the buffer, but we * should keep it around and keep writing to * drain the buffer instead. */ usbd_abort_pipe(sce->pipeh); usbd_destroy_xfer(sce->ra_wb_xfer); sce->ra_wb_xfer = NULL; kmem_free(sce->ibuf, sce->ra_wb_bufsize); sce->ibuf = NULL; } return 0; case USB_SET_BULK_RA_OPT: case USB_SET_BULK_WB_OPT: { struct usb_bulk_ra_wb_opt *opt; if (endpt == USB_CONTROL_ENDPOINT) return EINVAL; opt = (struct usb_bulk_ra_wb_opt *)addr; if (cmd == USB_SET_BULK_RA_OPT) sce = &sc->sc_endpoints[endpt][IN]; else sce = &sc->sc_endpoints[endpt][OUT]; if (sce == NULL || sce->pipeh == NULL) return EINVAL; if (opt->ra_wb_buffer_size < 1 || opt->ra_wb_buffer_size > UGEN_BULK_RA_WB_BUFMAX || opt->ra_wb_request_size < 1 || opt->ra_wb_request_size > opt->ra_wb_buffer_size) return EINVAL; /* * XXX These changes do not take effect until the * next time RA/WB mode is enabled but they ought to * take effect immediately. 
*/ sce->ra_wb_bufsize = opt->ra_wb_buffer_size; sce->ra_wb_reqsize = opt->ra_wb_request_size; return 0; } default: break; } if (endpt != USB_CONTROL_ENDPOINT) return EINVAL; switch (cmd) { #ifdef UGEN_DEBUG case USB_SETDEBUG: ugendebug = *(int *)addr; break; #endif case USB_GET_CONFIG: err = usbd_get_config(sc->sc_udev, &conf); if (err) return EIO; *(int *)addr = conf; break; case USB_SET_CONFIG: if (!(flag & FWRITE)) return EPERM; err = ugen_set_config(sc, *(int *)addr, 1); switch (err) { case USBD_NORMAL_COMPLETION: break; case USBD_IN_USE: return EBUSY; default: return EIO; } break; case USB_GET_ALTINTERFACE: ai = (struct usb_alt_interface *)addr; err = usbd_device2interface_handle(sc->sc_udev, ai->uai_interface_index, &iface); if (err) return EINVAL; idesc = usbd_get_interface_descriptor(iface); if (idesc == NULL) return EIO; ai->uai_alt_no = idesc->bAlternateSetting; break; case USB_SET_ALTINTERFACE: if (!(flag & FWRITE)) return EPERM; ai = (struct usb_alt_interface *)addr; err = usbd_device2interface_handle(sc->sc_udev, ai->uai_interface_index, &iface); if (err) return EINVAL; err = ugen_set_interface(sc, ai->uai_interface_index, ai->uai_alt_no); if (err) return EINVAL; break; case USB_GET_NO_ALT: ai = (struct usb_alt_interface *)addr; cdesc = ugen_get_cdesc(sc, ai->uai_config_index, &cdesclen); if (cdesc == NULL) return EINVAL; idesc = usbd_find_idesc(cdesc, ai->uai_interface_index, 0); if (idesc == NULL) { kmem_free(cdesc, cdesclen); return EINVAL; } ai->uai_alt_no = usbd_get_no_alts(cdesc, idesc->bInterfaceNumber); kmem_free(cdesc, cdesclen); break; case USB_GET_DEVICE_DESC: *(usb_device_descriptor_t *)addr = *usbd_get_device_descriptor(sc->sc_udev); break; case USB_GET_CONFIG_DESC: cd = (struct usb_config_desc *)addr; cdesc = ugen_get_cdesc(sc, cd->ucd_config_index, &cdesclen); if (cdesc == NULL) return EINVAL; cd->ucd_desc = *cdesc; kmem_free(cdesc, cdesclen); break; case USB_GET_INTERFACE_DESC: id = (struct usb_interface_desc *)addr; cdesc = ugen_get_cdesc(sc, id->uid_config_index, &cdesclen); if (cdesc == NULL) return EINVAL; if (id->uid_config_index == USB_CURRENT_CONFIG_INDEX && id->uid_alt_index == USB_CURRENT_ALT_INDEX) alt = ugen_get_alt_index(sc, id->uid_interface_index); else alt = id->uid_alt_index; idesc = usbd_find_idesc(cdesc, id->uid_interface_index, alt); if (idesc == NULL) { kmem_free(cdesc, cdesclen); return EINVAL; } id->uid_desc = *idesc; kmem_free(cdesc, cdesclen); break; case USB_GET_ENDPOINT_DESC: ed = (struct usb_endpoint_desc *)addr; cdesc = ugen_get_cdesc(sc, ed->ued_config_index, &cdesclen); if (cdesc == NULL) return EINVAL; if (ed->ued_config_index == USB_CURRENT_CONFIG_INDEX && ed->ued_alt_index == USB_CURRENT_ALT_INDEX) alt = ugen_get_alt_index(sc, ed->ued_interface_index); else alt = ed->ued_alt_index; edesc = usbd_find_edesc(cdesc, ed->ued_interface_index, alt, ed->ued_endpoint_index); if (edesc == NULL) { kmem_free(cdesc, cdesclen); return EINVAL; } ed->ued_desc = *edesc; kmem_free(cdesc, cdesclen); break; case USB_GET_FULL_DESC: { int len; struct iovec iov; struct uio uio; struct usb_full_desc *fd = (struct usb_full_desc *)addr; cdesc = ugen_get_cdesc(sc, fd->ufd_config_index, &cdesclen); if (cdesc == NULL) return EINVAL; len = cdesclen; if (len > fd->ufd_size) len = fd->ufd_size; iov.iov_base = (void *)fd->ufd_data; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_resid = len; uio.uio_offset = 0; uio.uio_rw = UIO_READ; uio.uio_vmspace = l->l_proc->p_vmspace; error = uiomove((void *)cdesc, len, &uio); kmem_free(cdesc, cdesclen); 
return error; } case USB_GET_STRING_DESC: { int len; si = (struct usb_string_desc *)addr; err = usbd_get_string_desc(sc->sc_udev, si->usd_string_index, si->usd_language_id, &si->usd_desc, &len); if (err) return EINVAL; break; } case USB_DO_REQUEST: { struct usb_ctl_request *ur = (void *)addr; int len = UGETW(ur->ucr_request.wLength); struct iovec iov; struct uio uio; void *ptr = 0; usbd_status xerr; error = 0; if (!(flag & FWRITE)) return EPERM; /* Avoid requests that would damage the bus integrity. */ if ((ur->ucr_request.bmRequestType == UT_WRITE_DEVICE && ur->ucr_request.bRequest == UR_SET_ADDRESS) || (ur->ucr_request.bmRequestType == UT_WRITE_DEVICE && ur->ucr_request.bRequest == UR_SET_CONFIG) || (ur->ucr_request.bmRequestType == UT_WRITE_INTERFACE && ur->ucr_request.bRequest == UR_SET_INTERFACE)) return EINVAL; if (len < 0 || len > 32767) return EINVAL; if (len != 0) { iov.iov_base = (void *)ur->ucr_data; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_resid = len; uio.uio_offset = 0; uio.uio_rw = ur->ucr_request.bmRequestType & UT_READ ? UIO_READ : UIO_WRITE; uio.uio_vmspace = l->l_proc->p_vmspace; ptr = kmem_alloc(len, KM_SLEEP); if (uio.uio_rw == UIO_WRITE) { error = uiomove(ptr, len, &uio); if (error) goto ret; } } sce = &sc->sc_endpoints[endpt][IN]; xerr = usbd_do_request_flags(sc->sc_udev, &ur->ucr_request, ptr, ur->ucr_flags, &ur->ucr_actlen, sce->timeout); if (xerr) { error = EIO; goto ret; } if (len != 0) { if (uio.uio_rw == UIO_READ) { size_t alen = uimin(len, ur->ucr_actlen); error = uiomove(ptr, alen, &uio); if (error) goto ret; } } ret: if (ptr) kmem_free(ptr, len); return error; } case USB_GET_DEVICEINFO: usbd_fill_deviceinfo(sc->sc_udev, (struct usb_device_info *)addr, 0); break; case USB_GET_DEVICEINFO_30: { int ret; MODULE_HOOK_CALL(usb_subr_fill_30_hook, (sc->sc_udev, (struct usb_device_info30 *)addr, 0, usbd_devinfo_vp, usbd_printBCD), enosys(), ret); if (ret == 0) return 0; return EINVAL; } default: return EINVAL; } return 0; } static int ugenioctl(dev_t dev, u_long cmd, void *addr, int flag, struct lwp *l) { int endpt = UGENENDPOINT(dev); struct ugen_softc *sc; int error; if ((sc = ugenif_acquire(UGENUNIT(dev))) == 0) return ENXIO; error = ugen_do_ioctl(sc, endpt, cmd, addr, flag, l); ugenif_release(sc); return error; } static int ugenpoll(dev_t dev, int events, struct lwp *l) { struct ugen_softc *sc; struct ugen_endpoint *sce_in, *sce_out; int revents = 0; if ((sc = ugenif_acquire(UGENUNIT(dev))) == NULL) return POLLHUP; if (UGENENDPOINT(dev) == USB_CONTROL_ENDPOINT) { revents |= POLLERR; goto out; } sce_in = &sc->sc_endpoints[UGENENDPOINT(dev)][IN]; sce_out = &sc->sc_endpoints[UGENENDPOINT(dev)][OUT]; KASSERT(sce_in->edesc || sce_out->edesc); KASSERT(sce_in->pipeh || sce_out->pipeh); mutex_enter(&sc->sc_lock); if (sce_in && sce_in->pipeh && (events & (POLLIN | POLLRDNORM))) switch (sce_in->edesc->bmAttributes & UE_XFERTYPE) { case UE_INTERRUPT: if (sce_in->q.c_cc > 0) revents |= events & (POLLIN | POLLRDNORM); else selrecord(l, &sce_in->rsel); break; case UE_ISOCHRONOUS: if (sce_in->cur != sce_in->fill) revents |= events & (POLLIN | POLLRDNORM); else selrecord(l, &sce_in->rsel); break; case UE_BULK: if (sce_in->state & UGEN_BULK_RA) { if (sce_in->ra_wb_used > 0) revents |= events & (POLLIN | POLLRDNORM); else selrecord(l, &sce_in->rsel); break; } /* * We have no easy way of determining if a read will * yield any data or a write will happen. * Pretend they will. 
*/ revents |= events & (POLLIN | POLLRDNORM); break; default: break; } if (sce_out && sce_out->pipeh && (events & (POLLOUT | POLLWRNORM))) switch (sce_out->edesc->bmAttributes & UE_XFERTYPE) { case UE_INTERRUPT: case UE_ISOCHRONOUS: /* XXX unimplemented */ break; case UE_BULK: if (sce_out->state & UGEN_BULK_WB) { if (sce_out->ra_wb_used < sce_out->limit - sce_out->ibuf) revents |= events & (POLLOUT | POLLWRNORM); else selrecord(l, &sce_out->rsel); break; } /* * We have no easy way of determining if a read will * yield any data or a write will happen. * Pretend they will. */ revents |= events & (POLLOUT | POLLWRNORM); break; default: break; } mutex_exit(&sc->sc_lock); out: ugenif_release(sc); return revents; } static void filt_ugenrdetach(struct knote *kn) { struct ugen_endpoint *sce = kn->kn_hook; struct ugen_softc *sc = sce->sc; mutex_enter(&sc->sc_lock); selremove_knote(&sce->rsel, kn); mutex_exit(&sc->sc_lock); } static int filt_ugenread_intr(struct knote *kn, long hint) { struct ugen_endpoint *sce = kn->kn_hook; struct ugen_softc *sc = sce->sc; int ret; mutex_enter(&sc->sc_lock); if (sc->sc_dying) { ret = 0; } else { kn->kn_data = sce->q.c_cc; ret = kn->kn_data > 0; } mutex_exit(&sc->sc_lock); return ret; } static int filt_ugenread_isoc(struct knote *kn, long hint) { struct ugen_endpoint *sce = kn->kn_hook; struct ugen_softc *sc = sce->sc; int ret; mutex_enter(&sc->sc_lock); if (sc->sc_dying) { ret = 0; } else if (sce->cur == sce->fill) { ret = 0; } else if (sce->cur < sce->fill) { kn->kn_data = sce->fill - sce->cur; ret = 1; } else { kn->kn_data = (sce->limit - sce->cur) + (sce->fill - sce->ibuf); ret = 1; } mutex_exit(&sc->sc_lock); return ret; } static int filt_ugenread_bulk(struct knote *kn, long hint) { struct ugen_endpoint *sce = kn->kn_hook; struct ugen_softc *sc = sce->sc; int ret; mutex_enter(&sc->sc_lock); if (sc->sc_dying) { ret = 0; } else if (!(sce->state & UGEN_BULK_RA)) { /* * We have no easy way of determining if a read will * yield any data or a write will happen. * So, emulate "seltrue". */ ret = filt_seltrue(kn, hint); } else if (sce->ra_wb_used == 0) { ret = 0; } else { kn->kn_data = sce->ra_wb_used; ret = 1; } mutex_exit(&sc->sc_lock); return ret; } static int filt_ugenwrite_bulk(struct knote *kn, long hint) { struct ugen_endpoint *sce = kn->kn_hook; struct ugen_softc *sc = sce->sc; int ret; mutex_enter(&sc->sc_lock); if (sc->sc_dying) { ret = 0; } else if (!(sce->state & UGEN_BULK_WB)) { /* * We have no easy way of determining if a read will * yield any data or a write will happen. * So, emulate "seltrue". 
*/ ret = filt_seltrue(kn, hint); } else if (sce->ra_wb_used == sce->limit - sce->ibuf) { ret = 0; } else { kn->kn_data = (sce->limit - sce->ibuf) - sce->ra_wb_used; ret = 1; } mutex_exit(&sc->sc_lock); return ret; } static const struct filterops ugenread_intr_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_ugenrdetach, .f_event = filt_ugenread_intr, }; static const struct filterops ugenread_isoc_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_ugenrdetach, .f_event = filt_ugenread_isoc, }; static const struct filterops ugenread_bulk_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_ugenrdetach, .f_event = filt_ugenread_bulk, }; static const struct filterops ugenwrite_bulk_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_ugenrdetach, .f_event = filt_ugenwrite_bulk, }; static int ugenkqfilter(dev_t dev, struct knote *kn) { struct ugen_softc *sc; struct ugen_endpoint *sce; struct selinfo *sip; int error; if ((sc = ugenif_acquire(UGENUNIT(dev))) == NULL) return ENXIO; if (UGENENDPOINT(dev) == USB_CONTROL_ENDPOINT) { error = ENODEV; goto out; } switch (kn->kn_filter) { case EVFILT_READ: sce = &sc->sc_endpoints[UGENENDPOINT(dev)][IN]; if (sce == NULL) { error = EINVAL; goto out; } sip = &sce->rsel; switch (sce->edesc->bmAttributes & UE_XFERTYPE) { case UE_INTERRUPT: kn->kn_fop = &ugenread_intr_filtops; break; case UE_ISOCHRONOUS: kn->kn_fop = &ugenread_isoc_filtops; break; case UE_BULK: kn->kn_fop = &ugenread_bulk_filtops; break; default: error = EINVAL; goto out; } break; case EVFILT_WRITE: sce = &sc->sc_endpoints[UGENENDPOINT(dev)][OUT]; if (sce == NULL) { error = EINVAL; goto out; } sip = &sce->rsel; switch (sce->edesc->bmAttributes & UE_XFERTYPE) { case UE_INTERRUPT: case UE_ISOCHRONOUS: /* XXX poll doesn't support this */ error = EINVAL; goto out; case UE_BULK: kn->kn_fop = &ugenwrite_bulk_filtops; break; default: error = EINVAL; goto out; } break; default: error = EINVAL; goto out; } kn->kn_hook = sce; mutex_enter(&sc->sc_lock); selrecord_knote(sip, kn); mutex_exit(&sc->sc_lock); error = 0; out: ugenif_release(sc); return error; } MODULE(MODULE_CLASS_DRIVER, ugen, NULL); static int ugen_modcmd(modcmd_t cmd, void *aux) { switch (cmd) { case MODULE_CMD_INIT: mutex_init(&ugenif.lock, MUTEX_DEFAULT, IPL_NONE); rb_tree_init(&ugenif.tree, &ugenif_tree_ops); return 0; default: return ENOTTY; } }
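/*
 * Illustrative sketch, not part of ugen(4) itself: how a userland program
 * might use the bulk read-ahead (RA) path implemented above.  The ioctl
 * names, the usb_bulk_ra_wb_opt fields, and the rule that RA/WB options
 * only take effect the next time the mode is enabled are taken from
 * ugen_do_ioctl(); the device node name and the <dev/usb/usb.h> include
 * are assumptions based on the usual ugen(4) conventions.
 */
#if 0	/* example only; never compiled with the driver */
#include <sys/ioctl.h>
#include <dev/usb/usb.h>
#include <fcntl.h>
#include <unistd.h>

static ssize_t
example_bulk_ra_read(void *buf, size_t len)
{
	struct usb_bulk_ra_wb_opt opt = {
		.ra_wb_buffer_size = 65536,	/* must not exceed UGEN_BULK_RA_WB_BUFMAX */
		.ra_wb_request_size = 4096,	/* must not exceed the buffer size */
	};
	int on = 1, timeout_ms = 1000;
	ssize_t n = -1;
	int fd;

	/* Endpoint 1, IN direction; node name assumed per ugen(4). */
	if ((fd = open("/dev/ugen0.01", O_RDONLY)) == -1)
		return -1;
	/* Size the RA buffer first: the options only apply once RA is enabled. */
	if (ioctl(fd, USB_SET_BULK_RA_OPT, &opt) == 0 &&
	    ioctl(fd, USB_SET_TIMEOUT, &timeout_ms) == 0 &&
	    ioctl(fd, USB_SET_BULK_RA, &on) == 0)
		n = read(fd, buf, len);	/* drains the driver's read-ahead buffer */
	close(fd);
	return n;
}
#endif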
/*- * Copyright (c) 2013-2020 The NetBSD Foundation, Inc. * All rights reserved. * * This material is based upon work partially supported by The * NetBSD Foundation under a contract with Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * NPF configuration loading mechanism. * * The main operations on the configuration are the following: * 1) Read access, primarily from the npf_packet_handler() function. * 2) Write access on a particular set, mainly rule or table updates. * 3) Deletion of the configuration after the reload operation. * * Synchronization * * For the (1) case, EBR is used to allow concurrent access to * the configuration set (ruleset, etc). It guarantees that the * configuration will not be destroyed while accessing it. * * For the cases (2) and (3), mutual exclusion (npf_t::config_lock) * is used with, when necessary, the writer-side barrier of EBR. 
*/ #ifdef _KERNEL #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: npf_conf.c,v 1.18 2022/02/13 19:20:11 riastradh Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/atomic.h> #include <sys/kmem.h> #include <sys/mutex.h> #endif #include "npf_impl.h" #include "npf_conn.h" void npf_config_init(npf_t *npf) { npf_config_t *nc; mutex_init(&npf->config_lock, MUTEX_DEFAULT, IPL_SOFTNET); nc = npf_config_create(); /* * Load an empty configuration. */ nc->ruleset = npf_ruleset_create(0); nc->nat_ruleset = npf_ruleset_create(0); nc->rule_procs = npf_rprocset_create(); nc->tableset = npf_tableset_create(0); nc->default_pass = true; npf_config_load(npf, nc, NULL, true); KASSERT(npf->config != NULL); } npf_config_t * npf_config_create(void) { return kmem_zalloc(sizeof(npf_config_t), KM_SLEEP); } void npf_config_destroy(npf_config_t *nc) { /* * Note: the rulesets must be destroyed first, in order to drop * any references to the tableset. */ if (nc->ruleset) { npf_ruleset_destroy(nc->ruleset); } if (nc->nat_ruleset) { npf_ruleset_destroy(nc->nat_ruleset); } if (nc->rule_procs) { npf_rprocset_destroy(nc->rule_procs); } if (nc->tableset) { npf_tableset_destroy(nc->tableset); } kmem_free(nc, sizeof(npf_config_t)); } void npf_config_fini(npf_t *npf) { npf_conndb_t *cd = npf_conndb_create(); /* Flush the connections. */ mutex_enter(&npf->config_lock); npf_conn_tracking(npf, false); npf_ebr_full_sync(npf->ebr); npf_conn_load(npf, cd, false); npf_ifmap_flush(npf); mutex_exit(&npf->config_lock); npf_config_destroy(npf->config); mutex_destroy(&npf->config_lock); } /* * npf_config_load: the main routine performing configuration load. * Performs the necessary synchronization and destroys the old config. */ void npf_config_load(npf_t *npf, npf_config_t *nc, npf_conndb_t *conns, bool flush) { const bool load = conns != NULL; npf_config_t *onc; nc->default_pass = flush; /* * Acquire the lock and perform the first phase: * - Scan and use existing dynamic tables, reload only static. * - Scan and use matching NAT policies to preserve the connections. */ mutex_enter(&npf->config_lock); if ((onc = atomic_load_relaxed(&npf->config)) != NULL) { npf_ruleset_reload(npf, nc->ruleset, onc->ruleset, load); npf_tableset_reload(npf, nc->tableset, onc->tableset); npf_ruleset_reload(npf, nc->nat_ruleset, onc->nat_ruleset, load); } /* * Set the new config and release the lock. */ atomic_store_release(&npf->config, nc); if (onc == NULL) { /* Initial load, done. */ npf_ifmap_flush(npf); npf_conn_load(npf, conns, !flush); mutex_exit(&npf->config_lock); goto done; } /* * If we are going to flush the connections or load the new ones, * then disable the connection tracking for the grace period. */ if (flush || conns) { npf_conn_tracking(npf, false); } /* Synchronise: drain all references. */ npf_ebr_full_sync(npf->ebr); if (flush) { npf_portmap_flush(npf->portmap); npf_ifmap_flush(npf); } /* * G/C the existing connections and, if passed, load the new ones. * If not flushing - enable the connection tracking. */ npf_conn_load(npf, conns, !flush); mutex_exit(&npf->config_lock); /* Finally, it is safe to destroy the old config. */ npf_config_destroy(onc); done: /* Sync all interface address tables (can be done asynchronously). */ npf_ifaddr_syncall(npf); } /* * Writer-side exclusive locking. 
*/ npf_config_t * npf_config_enter(npf_t *npf) { mutex_enter(&npf->config_lock); return npf->config; } void npf_config_exit(npf_t *npf) { mutex_exit(&npf->config_lock); } bool npf_config_locked_p(npf_t *npf) { return mutex_owned(&npf->config_lock); } void npf_config_sync(npf_t *npf) { KASSERT(npf_config_locked_p(npf)); npf_ebr_full_sync(npf->ebr); } /* * Reader-side synchronization routines. */ int npf_config_read_enter(npf_t *npf) { /* Note: issues an acquire fence. */ return npf_ebr_enter(npf->ebr); } void npf_config_read_exit(npf_t *npf, int s) { /* Note: issues a release fence. */ npf_ebr_exit(npf->ebr, s); } /* * Accessors. */ npf_ruleset_t * npf_config_ruleset(npf_t *npf) { npf_config_t *config = atomic_load_consume(&npf->config); KASSERT(npf_config_locked_p(npf) || npf_ebr_incrit_p(npf->ebr)); return config->ruleset; } npf_ruleset_t * npf_config_natset(npf_t *npf) { npf_config_t *config = atomic_load_consume(&npf->config); KASSERT(npf_config_locked_p(npf) || npf_ebr_incrit_p(npf->ebr)); return config->nat_ruleset; } npf_tableset_t * npf_config_tableset(npf_t *npf) { npf_config_t *config = atomic_load_consume(&npf->config); KASSERT(npf_config_locked_p(npf) || npf_ebr_incrit_p(npf->ebr)); return config->tableset; } bool npf_default_pass(npf_t *npf) { npf_config_t *config = atomic_load_consume(&npf->config); KASSERT(npf_config_locked_p(npf) || npf_ebr_incrit_p(npf->ebr)); return config->default_pass; }
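/*
 * Illustrative sketch, not part of npf_conf.c: the reader-side pattern the
 * synchronization comment at the top of this file describes.  A packet-path
 * consumer brackets every use of the configuration with
 * npf_config_read_enter()/npf_config_read_exit() so that EBR keeps the
 * config alive for the duration; the actual work on the ruleset is left as
 * a placeholder here.
 */
#if 0	/* example only */
static void
example_config_reader(npf_t *npf)
{
	npf_ruleset_t *rlset;
	int s;

	s = npf_config_read_enter(npf);		/* EBR critical section begins */
	rlset = npf_config_ruleset(npf);	/* safe: config cannot be destroyed here */
	/* ... inspect the packet against rlset ... */
	npf_config_read_exit(npf, s);		/* after this the old config may be freed */
}
#endif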
/* $NetBSD: nist_hash_drbg.c,v 1.3 2019/09/19 18:29:55 riastradh Exp $ */ /*- * Copyright (c) 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * This file implements Hash_DRBG, a `deterministic random bit * generator' (more commonly known in lay terms and in the cryptography * literature as a pseudorandom bit generator or pseudorandom number * generator), described in * * Elaine Barker and John Kelsey, `Recommendation for Random * Number Generation Using Deterministic Random Bit Generators', * NIST SP800-90A, June 2015. * * This code is meant to work in userland or in kernel. For a test * program, compile with -DNIST_HASH_DRBG_MAIN to define a `main' * function; for verbose debugging output, compile with * -DNIST_HASH_DRBG_DEBUG, mainly useful if you need to change * something and have to diagnose what's wrong with the known-answer * tests. 
*/ #ifdef _KERNEL #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: nist_hash_drbg.c,v 1.3 2019/09/19 18:29:55 riastradh Exp $"); #endif #include <sys/param.h> #include <sys/types.h> #include <sys/sha2.h> #ifdef _KERNEL #include <sys/systm.h> /* memcpy */ #include <lib/libkern/libkern.h> /* KASSERT */ #define ASSERT KASSERT #else #include <assert.h> #include <stdbool.h> #include <stdio.h> #include <string.h> #define ASSERT assert #define CTASSERT __CTASSERT #endif #include "nist_hash_drbg.h" #define secret /* must not use in variable-time operations; should zero */ #define arraycount(A) (sizeof(A)/sizeof(A[0])) CTASSERT(0 < NIST_HASH_DRBG_RESEED_INTERVAL); CTASSERT(NIST_HASH_DRBG_RESEED_INTERVAL <= INT_MAX); CTASSERT(NIST_HASH_DRBG_RESEED_INTERVAL <= ~(~0ull << 48)); /* Instantiation: SHA-256 */ #define HASH_LENGTH SHA256_DIGEST_LENGTH #define HASH_CTX SHA256_CTX #define hash_init SHA256_Init #define hash_update SHA256_Update #define hash_final SHA256_Final #define SEEDLEN_BYTES NIST_HASH_DRBG_SEEDLEN_BYTES struct hvec { const void *hv_base; size_t hv_len; }; static void hashgen(secret uint8_t *, size_t, const secret uint8_t[SEEDLEN_BYTES]); static void add8(secret uint8_t *, size_t, const secret uint8_t *, size_t); static void hash_df(secret void *, size_t, const struct hvec *, size_t); static void hash_df_block(secret void *, uint8_t, uint8_t[4], const struct hvec *, size_t); /* 10.1.1 Hash_DRBG */ int nist_hash_drbg_destroy(struct nist_hash_drbg *D) { explicit_memset(D, 0, sizeof(*D)); D->reseed_counter = UINT_MAX; /* paranoia: make generate fail */ /* Always return zero for hysterical raisins. (XXX) */ return 0; } /* 10.1.1.2 Instantiation of Hash_DRBG */ int nist_hash_drbg_instantiate(secret struct nist_hash_drbg *D, const secret void *entropy, size_t entropylen, const void *nonce, size_t noncelen, const void *personalization, size_t personalizationlen) { /* * 1. seed_material = entropy_input || nonce || personalization_string */ const struct hvec seed_material[] = { { .hv_base = entropy, .hv_len = entropylen }, { .hv_base = nonce, .hv_len = noncelen }, { .hv_base = personalization, .hv_len = personalizationlen }, }; /* * 2. seed = Hash_df(seed_material, seedlen) * 3. V = seed */ CTASSERT(sizeof D->V == SEEDLEN_BYTES); hash_df(D->V, sizeof D->V, seed_material, arraycount(seed_material)); /* 4. C = Hash_df((0x00 || V), seedlen) */ const struct hvec hv[] = { { .hv_base = (const uint8_t[]) {0x00}, .hv_len = 1 }, { .hv_base = D->V, .hv_len = sizeof D->V }, }; CTASSERT(sizeof D->C == SEEDLEN_BYTES); hash_df(D->C, sizeof D->C, hv, arraycount(hv)); /* 5. reseed_counter = 1 */ D->reseed_counter = 1; /* Always return zero for hysterical raisins. (XXX) */ return 0; } /* 10.1.1.3 Reseeding a Hash_DRBG Instantiation */ int nist_hash_drbg_reseed(secret struct nist_hash_drbg *D, const secret void *entropy, size_t entropylen, const void *additional, size_t additionallen) { /* 1. seed_material = 0x01 || V || entropy_input || additional_input */ const struct hvec seed_material[] = { { .hv_base = (const uint8_t[]) {0x01}, .hv_len = 1 }, { .hv_base = D->V, .hv_len = sizeof D->V }, { .hv_base = entropy, .hv_len = entropylen }, { .hv_base = additional, .hv_len = additionallen }, }; uint8_t seed[SEEDLEN_BYTES]; /* * 2. seed = Hash_df(seed_material, seedlen) * 3. V = seed */ CTASSERT(sizeof D->V == SEEDLEN_BYTES); hash_df(seed, sizeof seed, seed_material, arraycount(seed_material)); memcpy(D->V, seed, sizeof D->V); /* 3. 
C = Hash_df((0x00 || V), seedlen) */ const struct hvec hv[] = { { .hv_base = (const uint8_t[]) {0x00}, .hv_len = 1 }, { .hv_base = D->V, .hv_len = sizeof D->V }, }; CTASSERT(sizeof D->C == SEEDLEN_BYTES); hash_df(D->C, sizeof D->C, hv, arraycount(hv)); /* 5. reseed_counter = 1 */ D->reseed_counter = 1; /* Always return zero for hysterical raisins. (XXX) */ return 0; } /* 10.1.1.4 Generating Pseudorandom Bits Using Hash_DRBG */ int nist_hash_drbg_generate(secret struct nist_hash_drbg *D, secret void *output, size_t outputlen, const void *additional, size_t additionallen) { secret HASH_CTX ctx; secret uint8_t H[HASH_LENGTH]; uint8_t reseed_counter[4]; ASSERT(outputlen <= NIST_HASH_DRBG_MAX_REQUEST_BYTES); /* * 1. If reseed_counter > reseed_interval, then return an * indication that a reseed is required. */ if (D->reseed_counter > NIST_HASH_DRBG_RESEED_INTERVAL) return 1; /* 2. If (additional_input != Null), then do: */ if (additionallen) { /* 2.1 w = Hash(0x02 || V || additional_input) */ secret uint8_t w[HASH_LENGTH]; hash_init(&ctx); hash_update(&ctx, (const uint8_t[]) {0x02}, 1); hash_update(&ctx, D->V, sizeof D->V); hash_update(&ctx, additional, additionallen); hash_final(w, &ctx); /* 2.2 V = (V + w) mod 2^seedlen */ add8(D->V, sizeof D->V, w, sizeof w); explicit_memset(w, 0, sizeof w); } /* 3. (returned_bits) = Hashgen(requested_number_of_bits, V) */ hashgen(output, outputlen, D->V); /* 4. H = Hash(0x03 || V) */ hash_init(&ctx); hash_update(&ctx, (const uint8_t[]) {0x03}, 1); hash_update(&ctx, D->V, sizeof D->V); hash_final(H, &ctx); /* 5. V = (V + H + C + reseed_counter) mod 2^seedlen */ be32enc(reseed_counter, D->reseed_counter); add8(D->V, sizeof D->V, H, sizeof H); add8(D->V, sizeof D->V, D->C, sizeof D->C); add8(D->V, sizeof D->V, reseed_counter, sizeof reseed_counter); /* 6. reseed_counter = reseed_counter + 1 */ D->reseed_counter++; explicit_memset(&ctx, 0, sizeof ctx); explicit_memset(H, 0, sizeof H); /* 7. Return SUCCESS, ... */ return 0; } /* * p := H(V) || H(V + 1) || H(V + 2) || ... */ static void hashgen(secret uint8_t *p, size_t n, const secret uint8_t V[SEEDLEN_BYTES]) { secret uint8_t data[SEEDLEN_BYTES]; secret HASH_CTX ctx; /* Save a copy so that we can increment it. */ memcpy(data, V, SEEDLEN_BYTES); /* Generate block by block into p directly. */ while (HASH_LENGTH <= n) { hash_init(&ctx); hash_update(&ctx, data, SEEDLEN_BYTES); hash_final(p, &ctx); p += HASH_LENGTH; n -= HASH_LENGTH; add8(data, sizeof data, (const uint8_t[]) {1}, 1); } /* * If any partial block requested, generate a full block and * copy the part we need. 
*/ if (n) { secret uint8_t t[HASH_LENGTH]; hash_init(&ctx); hash_update(&ctx, data, SEEDLEN_BYTES); hash_final(t, &ctx); memcpy(p, t, n); explicit_memset(t, 0, sizeof t); } explicit_memset(data, 0, sizeof data); explicit_memset(&ctx, 0, sizeof ctx); } /* * s := s + a (big-endian, radix-2^8) */ static void add8(secret uint8_t *s, size_t slen, const secret uint8_t *a, size_t alen) { const size_t smax = slen - 1, amax = alen - 1; size_t i; secret unsigned c = 0; /* 2^8 c + s_i := s_i + a_i + c */ for (i = 0; i < MIN(slen, alen); i++) { c += s[smax - i] + a[amax - i]; s[smax - i] = c & 0xff; c >>= 8; } /* 2^8 c + s_i := s_i + c */ for (; i < slen; i++) { c += s[smax - i]; s[smax - i] = c & 0xff; c >>= 8; } explicit_memset(&c, 0, sizeof c); } /* 10.4.1 Derivation Function Using a Hash Function (Hash_df) */ static void hash_df(void *h, size_t hlen, const struct hvec *input, size_t inputlen) { uint8_t *p = h; size_t n = hlen; uint8_t counter = 1; uint8_t hbits[4]; ASSERT(hlen <= 255*HASH_LENGTH); ASSERT(hlen <= UINT32_MAX/8); be32enc(hbits, 8*hlen); while (HASH_LENGTH <= n) { hash_df_block(p, counter++, hbits, input, inputlen); p += HASH_LENGTH; n -= HASH_LENGTH; } if (n) { secret uint8_t t[HASH_LENGTH]; hash_df_block(t, counter, hbits, input, inputlen); memcpy(p, t, n); explicit_memset(t, 0, sizeof t); } } static void hash_df_block(secret void *h, uint8_t counter, uint8_t hbits[4], const struct hvec *input, size_t inputlen) { secret HASH_CTX ctx; size_t i; /* * Hash_df Process, step 4.1: * Hash(counter || no_of_bits_to_return || input_string) */ hash_init(&ctx); hash_update(&ctx, &counter, 1); hash_update(&ctx, hbits, 4); for (i = 0; i < inputlen; i++) { if (input[i].hv_len) hash_update(&ctx, input[i].hv_base, input[i].hv_len); } hash_final(h, &ctx); explicit_memset(&ctx, 0, sizeof ctx); } /* * Known-answer test vectors for Hash_DRBG with SHA-256 */ /* Hash_DRBG.PDF, p. 190 */ static const uint8_t kat_entropy[3][SEEDLEN_BYTES] = { [0] = { 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f, 0x10,0x11,0x12,0x13, 0x14,0x15,0x16,0x17, 0x18,0x19,0x1a,0x1b, 0x1c,0x1d,0x1e,0x1f, 0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f, 0x30,0x31,0x32,0x33, 0x34,0x35,0x36, }, [1] = { /* for reseed1 */ 0x80,0x81,0x82,0x83, 0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b, 0x8c,0x8d,0x8e,0x8f, 0x90,0x91,0x92,0x93, 0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b, 0x9c,0x9d,0x9e,0x9f, 0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, 0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6, }, [2] = { /* for reseed2 */ 0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, 0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf, 0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef, 0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6, }, }; static const uint8_t kat_nonce[] = { 0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, }; static const struct hvec kat_zero = { .hv_base = 0, .hv_len = 0 }; static const struct hvec kat_personalization = { .hv_len = 55, .hv_base = (const void *)(const uint8_t[]) { /* p. 
208 */ 0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, 0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f, 0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f, 0x70,0x71,0x72,0x73, 0x74,0x75,0x76, }, }; static const struct hvec *const kat_no_additional[] = { [0] = &kat_zero, [1] = &kat_zero, }; static const struct hvec *const kat_additional[] = { [0] = &(const struct hvec) { .hv_len = 55, .hv_base = (const void *)(const uint8_t[]) { 0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f, 0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x7f, 0x80,0x81,0x82,0x83, 0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b, 0x8c,0x8d,0x8e,0x8f, 0x90,0x91,0x92,0x93, 0x94,0x95,0x96, }, }, [1] = &(const struct hvec) { .hv_len = 55, .hv_base = (const void *)(const uint8_t[]) { 0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, 0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf, 0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, 0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6, }, }, }; static const struct { const struct hvec *personalization; const struct hvec *const *additional; bool reseed; uint8_t C[SEEDLEN_BYTES]; uint8_t V[3][SEEDLEN_BYTES]; uint8_t rnd_val[2][64]; } kat[] = { [0] = { /* Hash_DRBG.pdf, p. 190 */ .personalization = &kat_zero, .additional = kat_no_additional, .reseed = false, .C = { /* p. 193 */ 0xe1,0x5d,0xe4,0xa8, 0xe3,0xb1,0x41,0x9b, 0x61,0xd5,0x34,0xf1, 0x5d,0xbd,0x31,0xee, 0x19,0xec,0x59,0x5f, 0x8b,0x98,0x11,0x1a, 0x94,0xf5,0x22,0x37, 0xad,0x5d,0x66,0xf0, 0xcf,0xaa,0xfd,0xdc, 0x90,0x19,0x59,0x02, 0xe9,0x79,0xf7,0x9b, 0x65,0x35,0x7f,0xea, 0x85,0x99,0x8e,0x4e, 0x37,0xd2,0xc1, }, .V = { [0] = { /* p. 192 */ 0xab,0x41,0xcd,0xe4, 0x37,0xab,0x8b,0x09, 0x1c,0xa7,0xc5,0x75, 0x5d,0x10,0xf0,0x11, 0x0c,0x1d,0xbd,0x46, 0x2f,0x22,0x6c,0xfd, 0xab,0xfb,0xb0,0x4a, 0x8b,0xcd,0xef,0x95, 0x16,0x7d,0x84,0xaf, 0x64,0x12,0x8c,0x0d, 0x71,0xf4,0xd5,0xb8, 0xc0,0xed,0xfb,0xbe, 0x3d,0xf4,0x04,0x48, 0xd2,0xd8,0xe1, }, [1] = { /* p. 195 */ 0x8c,0x9f,0xb2,0x8d, 0x1b,0x5c,0xcc,0xa4, 0x7e,0x7c,0xfa,0x66, 0xba,0xce,0x21,0xff, 0x26,0x0a,0x16,0xa5, 0xba,0xba,0x7f,0x14, 0x4e,0x75,0x79,0x36, 0x8e,0x99,0x55,0xbe, 0xfb,0xe7,0x00,0xee, 0xf8,0x72,0x77,0x6b, 0x17,0xae,0xff,0xd5, 0x3d,0x76,0xf4,0xe3, 0xbe,0x65,0xe8,0xc9, 0x4b,0x70,0x8f, }, [2] = { /* p. 197 */ 0x6d,0xfd,0x97,0x35, 0xff,0x0e,0x0e,0x3f, 0xe0,0x52,0x2f,0x58, 0x18,0x8b,0x53,0xed, 0x3f,0xf6,0x70,0x05, 0x46,0x52,0x90,0x44, 0xb6,0x2b,0xe1,0x7d, 0x1b,0x1c,0x21,0xd0, 0x91,0xb0,0x89,0xb1, 0x77,0x47,0x95,0xdb, 0x14,0x22,0xa8,0x6c, 0x95,0x46,0x34,0x80, 0x76,0xb4,0xb6,0x21, 0xc7,0x2f,0x91, }, }, .rnd_val = { [0] = { 0x77,0xe0,0x5a,0x0e, 0x7d,0xc7,0x8a,0xb5, 0xd8,0x93,0x4d,0x5e, 0x93,0xe8,0x2c,0x06, 0xa0,0x7c,0x04,0xce, 0xe6,0xc9,0xc5,0x30, 0x45,0xee,0xb4,0x85, 0x87,0x27,0x77,0xcf, 0x3b,0x3e,0x35,0xc4, 0x74,0xf9,0x76,0xb8, 0x94,0xbf,0x30,0x1a, 0x86,0xfa,0x65,0x1f, 0x46,0x39,0x70,0xe8, 0x9d,0x4a,0x05,0x34, 0xb2,0xec,0xad,0x29, 0xec,0x04,0x4e,0x7e, }, { 0x5f,0xf4,0xba,0x49, 0x3c,0x40,0xcf,0xff, 0x3b,0x01,0xe4,0x72, 0xc5,0x75,0x66,0x8c, 0xce,0x38,0x80,0xb9, 0x29,0x0b,0x05,0xbf, 0xed,0xe5,0xec,0x96, 0xed,0x5e,0x9b,0x28, 0x98,0x50,0x8b,0x09, 0xbc,0x80,0x0e,0xee, 0x09,0x9a,0x3c,0x90, 0x60,0x2a,0xbd,0x4b, 0x1d,0x4f,0x34,0x3d, 0x49,0x7c,0x60,0x55, 0xc8,0x7b,0xb9,0x56, 0xd5,0x3b,0xf3,0x51, }, }, }, [1] = { /* Hash_DRBG.pdf, p. 
198 */ .personalization = &kat_zero, .additional = kat_additional, .reseed = false, .C = { /* p. 201 */ 0xe1,0x5d,0xe4,0xa8, 0xe3,0xb1,0x41,0x9b, 0x61,0xd5,0x34,0xf1, 0x5d,0xbd,0x31,0xee, 0x19,0xec,0x59,0x5f, 0x8b,0x98,0x11,0x1a, 0x94,0xf5,0x22,0x37, 0xad,0x5d,0x66,0xf0, 0xcf,0xaa,0xfd,0xdc, 0x90,0x19,0x59,0x02, 0xe9,0x79,0xf7,0x9b, 0x65,0x35,0x7f,0xea, 0x85,0x99,0x8e,0x4e, 0x37,0xd2,0xc1, }, .V = { [0] = { /* p. 200 */ 0xab,0x41,0xcd,0xe4, 0x37,0xab,0x8b,0x09, 0x1c,0xa7,0xc5,0x75, 0x5d,0x10,0xf0,0x11, 0x0c,0x1d,0xbd,0x46, 0x2f,0x22,0x6c,0xfd, 0xab,0xfb,0xb0,0x4a, 0x8b,0xcd,0xef,0x95, 0x16,0x7d,0x84,0xaf, 0x64,0x12,0x8c,0x0d, 0x71,0xf4,0xd5,0xb8, 0xc0,0xed,0xfb,0xbe, 0x3d,0xf4,0x04,0x48, 0xd2,0xd8,0xe1, }, [1] = { /* p. 204 */ 0x8c,0x9f,0xb2,0x8d, 0x1b,0x5c,0xcc,0xa4, 0x7e,0x7c,0xfa,0x66, 0xba,0xce,0x21,0xff, 0x26,0x0a,0x16,0xa5, 0xba,0xba,0x7f,0x1f, 0xd3,0x3b,0x30,0x79, 0x8f,0xb2,0x9a,0x0f, 0xba,0x66,0x65,0x02, 0x7d,0x7f,0x10,0x58, 0x71,0xbf,0xb4,0x40, 0xdf,0xbe,0xde,0x81, 0xd0,0x4d,0x22,0xdf, 0xf7,0x89,0xe1, }, [2] = { /* p. 207 */ 0x6d,0xfd,0x97,0x35, 0xff,0x0e,0x0e,0x3f, 0xe0,0x52,0x2f,0x58, 0x18,0x8b,0x53,0xed, 0x3f,0xf6,0x70,0x05, 0x46,0x52,0x90,0xe1, 0x7c,0x5a,0xd8,0x2d, 0xa9,0x2a,0x05,0x01, 0xaa,0x66,0x3a,0xa6, 0x9f,0xa5,0xa0,0xb0, 0x81,0x2b,0x4b,0x4f, 0xaf,0xf3,0xfe,0xce, 0x79,0xcc,0xf6,0xaa, 0xde,0xc1,0xd0, }, }, .rnd_val = { [0] = { /* p. 203 */ 0x51,0x07,0x24,0xb9, 0x3a,0xe9,0xa1,0x82, 0x70,0xe4,0x84,0x73, 0x71,0x1d,0x88,0x24, 0x63,0x1b,0xaa,0x7f, 0x1d,0x9a,0xc9,0x28, 0x4e,0x7e,0xc8,0xf3, 0x63,0x7f,0x7a,0x74, 0x3b,0x36,0x44,0xeb, 0x96,0xc9,0x86,0x27, 0xc8,0xfd,0x40,0x5a, 0x7a,0x46,0x03,0xf3, 0x8c,0xff,0x7c,0x89, 0xe9,0xc1,0x33,0xf5, 0x85,0x1f,0x40,0xe9, 0x20,0x30,0xfe,0xa2, }, [1] = { /* p. 206 */ 0x62,0x53,0xda,0x3a, 0xae,0x8b,0x88,0xa3, 0xb7,0x46,0xe4,0xc8, 0xb2,0x63,0x5c,0x54, 0x0f,0x6e,0x9e,0xa7, 0x15,0x7e,0xe6,0x9d, 0xd7,0x1e,0xfb,0x2e, 0x8f,0xf7,0xbb,0xe1, 0xe3,0x33,0x68,0x88, 0x38,0xdd,0x7d,0xe4, 0x9c,0xc8,0x89,0x90, 0x30,0x9c,0x96,0xcd, 0xb2,0xab,0x92,0x95, 0x74,0x36,0xbf,0x83, 0xd1,0xbd,0x83,0x08, 0x19,0xc7,0x48,0xca, }, }, }, [2] = { /* Hash_DRBG.pdf, p. 208 */ .personalization = &kat_personalization, .additional = kat_no_additional, .reseed = false, .C = { /* p. 211 */ 0x44,0x74,0x8a,0x78, 0xb1,0x6e,0x75,0x55, 0x9f,0x88,0x1d,0x51, 0xc1,0x5d,0xfe,0x6c, 0x52,0xcf,0xb0,0xbb, 0x71,0x62,0x01,0x69, 0xc7,0x93,0x34,0x27, 0x67,0xe7,0xf8,0x87, 0x5f,0x42,0xcb,0x6a, 0x20,0xc8,0x9d,0x7c, 0x6e,0xf3,0xdc,0x61, 0x0d,0x8f,0xf2,0x03, 0xd6,0x76,0x6c,0xed, 0x19,0x19,0xd0, }, .V = { [0] = { /* p. 210 */ 0xa3,0xe9,0x4e,0x39, 0x26,0xfd,0xa1,0x69, 0xc3,0x03,0xd6,0x64, 0x38,0x39,0x05,0xe0, 0xd7,0x99,0x62,0xd1, 0x65,0x44,0x6d,0x63, 0xbd,0xa6,0x54,0xd1, 0x32,0xf7,0x2d,0xb4, 0x71,0x56,0x4b,0x45, 0x6f,0xf2,0xee,0xc8, 0x36,0x42,0x2a,0xcc, 0x5a,0x02,0x99,0x35, 0xa7,0x99,0x29,0x90, 0x94,0xa1,0xca, }, [1] = { /* p. 213 */ 0xe8,0x5d,0xd8,0xb1, 0xd8,0x6c,0x16,0xbf, 0x62,0x8b,0xf3,0xb5, 0xf9,0x97,0x04,0x4d, 0x2a,0x69,0x13,0x8c, 0xd6,0xa6,0x6e,0xe7, 0x36,0xdb,0xaa,0x3b, 0xf1,0xd0,0x28,0x3b, 0x71,0x7b,0x33,0x6e, 0xb3,0xae,0x5b,0xdd, 0x04,0x17,0x2e,0xa2, 0x6e,0x5a,0x48,0xf3, 0xb3,0xfb,0xab,0xf8, 0x2f,0x76,0x79, }, [2] = { /* p. 215 */ 0x2c,0xd2,0x63,0x2a, 0x89,0xda,0x8c,0x15, 0x02,0x14,0x11,0x07, 0xba,0xf5,0x02,0xb9, 0x7d,0x38,0xc4,0x48, 0x48,0x08,0x71,0x0a, 0x66,0xf8,0x40,0x11, 0xd7,0x02,0x8d,0x14, 0xd3,0x15,0x5a,0x73, 0x79,0xad,0xd5,0x3c, 0xc8,0xea,0x84,0xd0, 0xfc,0x64,0x1d,0xfc, 0x62,0x9e,0x06,0x19, 0x1f,0x5f,0x6d, }, }, .rnd_val = { [0] = { /* p. 
213 */ 0x4a,0x62,0x66,0x4f, 0x26,0x6e,0xe5,0x37, 0xb9,0x0d,0x64,0xb0, 0x5e,0x1d,0x81,0x3d, 0x28,0xb1,0x59,0xa9, 0x79,0xf1,0x50,0x9d, 0xde,0x31,0xb7,0x1d, 0xa4,0x3d,0x54,0x6e, 0xe8,0xe7,0x86,0x78, 0x20,0x2d,0xc2,0x37, 0xad,0x4a,0xfe,0x7d, 0xf3,0x10,0xc9,0xa4, 0x13,0xe3,0x8a,0xaf, 0x41,0x7d,0x2d,0x22, 0x5a,0xa3,0x65,0xec, 0x4a,0x7d,0x29,0x96, }, [1] = { /* p. 215 */ 0x59,0x58,0x3d,0x3c, 0x0a,0xc3,0x71,0x30, 0xc4,0x78,0x9a,0x83, 0x11,0xb8,0xca,0x8f, 0x98,0x5e,0xf1,0xe8, 0xf9,0x4d,0x95,0x4e, 0x32,0xe3,0x44,0xa6, 0x21,0xc2,0x4b,0x2f, 0x37,0x1d,0xa9,0xba, 0x3c,0x33,0x15,0x3f, 0x09,0xe5,0x51,0x45, 0xe7,0x62,0x92,0x6b, 0x73,0xac,0x14,0x7a, 0x1e,0x86,0x31,0xd1, 0xcc,0xd0,0x85,0x67, 0xcf,0x67,0x7c,0x72, }, }, }, [3] = { /* Hash_DRBG.pdf, p. 215 */ .personalization = &kat_personalization, .additional = kat_additional, .reseed = false, .C = { /* p. 220 */ 0x44,0x74,0x8a,0x78, 0xb1,0x6e,0x75,0x55, 0x9f,0x88,0x1d,0x51, 0xc1,0x5d,0xfe,0x6c, 0x52,0xcf,0xb0,0xbb, 0x71,0x62,0x01,0x69, 0xc7,0x93,0x34,0x27, 0x67,0xe7,0xf8,0x87, 0x5f,0x42,0xcb,0x6a, 0x20,0xc8,0x9d,0x7c, 0x6e,0xf3,0xdc,0x61, 0x0d,0x8f,0xf2,0x03, 0xd6,0x76,0x6c,0xed, 0x19,0x19,0xd0, }, .V = { [0] = { /* p. 218 */ 0xa3,0xe9,0x4e,0x39, 0x26,0xfd,0xa1,0x69, 0xc3,0x03,0xd6,0x64, 0x38,0x39,0x05,0xe0, 0xd7,0x99,0x62,0xd1, 0x65,0x44,0x6d,0x63, 0xbd,0xa6,0x54,0xd1, 0x32,0xf7,0x2d,0xb4, 0x71,0x56,0x4b,0x45, 0x6f,0xf2,0xee,0xc8, 0x36,0x42,0x2a,0xcc, 0x5a,0x02,0x99,0x35, 0xa7,0x99,0x29,0x90, 0x94,0xa1,0xca, }, [1] = { /* p. 222 */ 0xe8,0x5d,0xd8,0xb1, 0xd8,0x6c,0x16,0xbf, 0x62,0x8b,0xf3,0xb5, 0xf9,0x97,0x04,0x4d, 0x2a,0x69,0x13,0x8c, 0xd6,0xa6,0x6f,0x8c, 0xa8,0x7b,0x87,0x43, 0x50,0x20,0x2e,0x1d, 0x8a,0xb0,0xb5,0xad, 0x47,0xac,0xc2,0x75, 0x40,0x28,0x9f,0xe3, 0xa8,0xe3,0x1f,0x7b, 0x56,0x58,0xdd,0xd1, 0x96,0x94,0x89, }, [2] = { /* p. 225 */ 0x2c,0xd2,0x63,0x2a, 0x89,0xda,0x8c,0x15, 0x02,0x14,0x11,0x07, 0xba,0xf5,0x02,0xb9, 0x7d,0x38,0xc4,0x48, 0x48,0x08,0x71,0xb2, 0x77,0xae,0xc7,0xff, 0x8d,0xa2,0x3c,0x71, 0xef,0xf5,0x9d,0xc2, 0x4e,0x5e,0x4c,0x7f, 0x58,0x47,0xb0,0xc1, 0x2f,0x6a,0x59,0x9e, 0x6b,0x2e,0xda,0xc0, 0x30,0x6b,0xcd, }, }, .rnd_val = { /* p. 222 */ [0] = { 0xe0,0xb9,0x7c,0x82, 0x12,0x68,0xfd,0x3b, 0xb2,0xca,0xbf,0xd1, 0xf9,0x54,0x84,0x78, 0xae,0x8a,0x60,0x41, 0x7f,0x7b,0x09,0x4a, 0x26,0x13,0x95,0x46, 0x06,0x2b,0x52,0x1c, 0xfd,0x33,0xe4,0xe3, 0x9b,0x9d,0xcd,0x0a, 0x3d,0xa1,0x52,0x09, 0xc7,0x2a,0xdb,0xe5, 0x8c,0x20,0xab,0x34, 0x07,0x02,0x69,0x51, 0x29,0x7a,0xd2,0x54, 0x30,0x75,0x53,0xa5, }, [1] = { /* p. 225 */ 0xc1,0xac,0xd3,0xad, 0xa4,0xc8,0xc4,0x95, 0xbf,0x17,0x9d,0xb5, 0x98,0x22,0xc3,0x51, 0xbc,0x47,0x9a,0xbe, 0x4e,0xb2,0x8f,0x84, 0x39,0x57,0xb1,0x1e, 0x3c,0x2b,0xc0,0x48, 0x83,0x96,0x42,0x97, 0x97,0x5b,0xd7,0x2d, 0x10,0x24,0xab,0xcf, 0x6f,0x66,0x15,0xd7, 0xf5,0xb4,0xfd,0x1e, 0x40,0xa6,0x4e,0xeb, 0x45,0xba,0x21,0x81, 0xb8,0x39,0x37,0xed, }, }, }, [4] = { /* Hash_DRBG.pdf, p. 225 */ .personalization = &kat_zero, .additional = kat_no_additional, .reseed = true, .C = { /* p. 229 */ 0xe1,0x5d,0xe4,0xa8, 0xe3,0xb1,0x41,0x9b, 0x61,0xd5,0x34,0xf1, 0x5d,0xbd,0x31,0xee, 0x19,0xec,0x59,0x5f, 0x8b,0x98,0x11,0x1a, 0x94,0xf5,0x22,0x37, 0xad,0x5d,0x66,0xf0, 0xcf,0xaa,0xfd,0xdc, 0x90,0x19,0x59,0x02, 0xe9,0x79,0xf7,0x9b, 0x65,0x35,0x7f,0xea, 0x85,0x99,0x8e,0x4e, 0x37,0xd2,0xc1, }, .V = { [0] = { /* p. 
227 */ 0xab,0x41,0xcd,0xe4, 0x37,0xab,0x8b,0x09, 0x1c,0xa7,0xc5,0x75, 0x5d,0x10,0xf0,0x11, 0x0c,0x1d,0xbd,0x46, 0x2f,0x22,0x6c,0xfd, 0xab,0xfb,0xb0,0x4a, 0x8b,0xcd,0xef,0x95, 0x16,0x7d,0x84,0xaf, 0x64,0x12,0x8c,0x0d, 0x71,0xf4,0xd5,0xb8, 0xc0,0xed,0xfb,0xbe, 0x3d,0xf4,0x04,0x48, 0xd2,0xd8,0xe1, }, [1] = { /* p. 234 */ 0x23,0x97,0x6c,0x61, 0x63,0xd7,0xe2,0x4a, 0x1a,0x03,0x8f,0x2b, 0x2b,0x64,0x67,0x97, 0x50,0xca,0x9e,0xd8, 0xd1,0x40,0x69,0x8d, 0x64,0x22,0x39,0x7b, 0x02,0x96,0x9e,0x6e, 0xcd,0xd2,0x9d,0xac, 0xc5,0x76,0x7e,0x2c, 0xc2,0xd0,0xa1,0x56, 0xc8,0x7a,0xd0,0xb3, 0x57,0x89,0x05,0x07, 0xe0,0x37,0x77, }, [2] = { /* p. 239 */ 0x92,0xfb,0x0e,0x48, 0x0e,0x86,0x99,0x13, 0xc7,0xad,0x45,0xc7, 0xe3,0xfd,0x46,0x10, 0x17,0xe5,0xa6,0xb7, 0x70,0xf3,0x3b,0x31, 0x3c,0x38,0x83,0xf1, 0xcc,0x56,0x71,0x89, 0x45,0x21,0xf5,0xed, 0xe6,0x2e,0xaa,0xb0, 0x83,0xb1,0x41,0xa7, 0x5b,0x5c,0xc0,0x22, 0x60,0x5a,0x8a,0x3d, 0xc7,0x1b,0xa7, }, }, .rnd_val = { [0] = { /* p. 234 */ 0x92,0x27,0x55,0x23, 0xc7,0x0e,0x56,0x7b, 0xcf,0x9b,0x35,0xec, 0x50,0xb9,0x33,0xf8, 0x12,0x61,0x6d,0xf5, 0x86,0xb7,0xf7,0x2e, 0xe1,0xbc,0x77,0x35, 0xa5,0xc2,0x65,0x43, 0x73,0xcb,0xbc,0x72, 0x31,0x6d,0xff,0x84, 0x20,0xa3,0x3b,0xf0, 0x2b,0x97,0xac,0x8d, 0x19,0x52,0x58,0x3f, 0x27,0x0a,0xcd,0x70, 0x05,0xcc,0x02,0x7f, 0x4c,0xf1,0x18,0x7e, }, [1] = { /* p. 239 */ 0x68,0x1a,0x46,0xb2, 0xaa,0x86,0x94,0xa0, 0xfe,0x4d,0xee,0xa7, 0x20,0x92,0x7a,0x84, 0xea,0xaa,0x98,0x5e, 0x59,0xc1,0x9f,0x8b, 0xe0,0x98,0x4d,0x8c, 0xbe,0xf8,0xc6,0x9b, 0x75,0x41,0x67,0x64, 0x19,0x46,0xe0,0x40, 0xee,0x20,0x43,0xe1, 0xcc,0xb2,0x9d,0xcf, 0x06,0x3c,0x0a,0x50, 0x83,0x0e,0x42,0x8e, 0x6d,0xca,0x26,0x2e, 0xcd,0x77,0xc5,0x42, }, }, }, [5] = { /* Hash_DRBG.pdf, p. 239 */ .personalization = &kat_zero, .additional = kat_additional, .reseed = true, .C = { /* p. 243 */ 0xe1,0x5d,0xe4,0xa8, 0xe3,0xb1,0x41,0x9b, 0x61,0xd5,0x34,0xf1, 0x5d,0xbd,0x31,0xee, 0x19,0xec,0x59,0x5f, 0x8b,0x98,0x11,0x1a, 0x94,0xf5,0x22,0x37, 0xad,0x5d,0x66,0xf0, 0xcf,0xaa,0xfd,0xdc, 0x90,0x19,0x59,0x02, 0xe9,0x79,0xf7,0x9b, 0x65,0x35,0x7f,0xea, 0x85,0x99,0x8e,0x4e, 0x37,0xd2,0xc1, }, .V = { [0] = { /* p. 242 */ 0xab,0x41,0xcd,0xe4, 0x37,0xab,0x8b,0x09, 0x1c,0xa7,0xc5,0x75, 0x5d,0x10,0xf0,0x11, 0x0c,0x1d,0xbd,0x46, 0x2f,0x22,0x6c,0xfd, 0xab,0xfb,0xb0,0x4a, 0x8b,0xcd,0xef,0x95, 0x16,0x7d,0x84,0xaf, 0x64,0x12,0x8c,0x0d, 0x71,0xf4,0xd5,0xb8, 0xc0,0xed,0xfb,0xbe, 0x3d,0xf4,0x04,0x48, 0xd2,0xd8,0xe1, }, [1] = { /* p. 249 */ 0xb3,0x74,0x95,0x46, 0x81,0xcf,0xc9,0x5b, 0x8d,0xb8,0x39,0x52, 0x8c,0x71,0x08,0x83, 0x5e,0xb4,0xf3,0x0a, 0xd9,0x1c,0xbe,0x9e, 0xa0,0xd5,0x45,0xcc, 0xfd,0x18,0x13,0x2a, 0xf1,0xd3,0x76,0x8f, 0x47,0x02,0x77,0x2b, 0x69,0x15,0x9f,0x2c, 0xc0,0x7f,0x48,0x74, 0x1e,0xb5,0xb2,0xb1, 0x22,0x11,0x25, }, [2] = { /* p. 254 */ 0xbf,0xe3,0xd6,0x81, 0xa2,0x0f,0xbe,0x39, 0x03,0x8f,0x4d,0x66, 0x77,0x7c,0x1b,0xe5, 0x79,0xee,0xb4,0x85, 0x7b,0x42,0xf2,0x1c, 0x3f,0x59,0x8b,0x59, 0x62,0xb7,0xaa,0x48, 0x0e,0xa5,0x65,0xfe, 0xea,0xbd,0xfb,0xd6, 0xa7,0xec,0xcb,0x96, 0x02,0xc1,0x4b,0xfa, 0x30,0xf0,0xf9,0x81, 0x90,0x0c,0xd0, }, }, .rnd_val = { [0] = { /* p. 249 */ 0x11,0x60,0x1b,0x72, 0xca,0x60,0x89,0x73, 0x6b,0x20,0x47,0x44, 0xb2,0x9d,0xa1,0xaa, 0xaf,0xba,0xca,0xa5, 0x28,0x8f,0x06,0xbe, 0x48,0x45,0x69,0xcc, 0xed,0xbe,0xce,0x03, 0xe8,0x22,0xea,0xa5, 0xb1,0x4f,0x0e,0x04, 0x94,0x8c,0x05,0xcd, 0x3c,0xc2,0xe2,0x88, 0x9a,0x89,0xfa,0x03, 0xd6,0x5d,0x4d,0x74, 0xac,0x50,0xff,0x6b, 0xd8,0x56,0xe5,0x79, }, [1] = { /* p. 
255 */ 0x05,0x5b,0xc1,0x28, 0xcc,0x2d,0x0e,0x25, 0x0f,0x47,0xe4,0xe4, 0xf5,0x82,0x37,0x5d, 0xe3,0xee,0x5e,0x9f, 0xe8,0x31,0x68,0x74, 0x97,0xe5,0xaf,0x1e, 0x7c,0xb6,0x9e,0xfd, 0xeb,0xd2,0xfd,0x31, 0xc7,0xce,0x2b,0xba, 0x0d,0xbc,0x6c,0x74, 0xc8,0xa2,0x0a,0x7d, 0x72,0xf6,0x0e,0x6d, 0x9f,0x63,0xed,0x50, 0x9e,0x96,0x3e,0x54, 0xa6,0x9e,0x90,0x48, }, }, }, [6] = { /* Hash_DRBG.pdf, p. 255 */ .personalization = &kat_personalization, .additional = kat_no_additional, .reseed = true, .C = { /* p. 259 */ 0x44,0x74,0x8a,0x78, 0xb1,0x6e,0x75,0x55, 0x9f,0x88,0x1d,0x51, 0xc1,0x5d,0xfe,0x6c, 0x52,0xcf,0xb0,0xbb, 0x71,0x62,0x01,0x69, 0xc7,0x93,0x34,0x27, 0x67,0xe7,0xf8,0x87, 0x5f,0x42,0xcb,0x6a, 0x20,0xc8,0x9d,0x7c, 0x6e,0xf3,0xdc,0x61, 0x0d,0x8f,0xf2,0x03, 0xd6,0x76,0x6c,0xed, 0x19,0x19,0xd0, }, .V = { [0] = { /* p. 257 */ 0xa3,0xe9,0x4e,0x39, 0x26,0xfd,0xa1,0x69, 0xc3,0x03,0xd6,0x64, 0x38,0x39,0x05,0xe0, 0xd7,0x99,0x62,0xd1, 0x65,0x44,0x6d,0x63, 0xbd,0xa6,0x54,0xd1, 0x32,0xf7,0x2d,0xb4, 0x71,0x56,0x4b,0x45, 0x6f,0xf2,0xee,0xc8, 0x36,0x42,0x2a,0xcc, 0x5a,0x02,0x99,0x35, 0xa7,0x99,0x29,0x90, 0x94,0xa1,0xca, }, [1] = { /* p. 264 */ 0xaa,0x11,0x1b,0x0e, 0xd5,0x6c,0xf4,0xa6, 0xcc,0xe4,0xad,0xe7, 0xf1,0x1b,0x06,0x10, 0x45,0xbf,0x10,0x92, 0xcb,0xb3,0x8f,0xf3, 0x23,0x95,0xea,0x62, 0xd2,0x6b,0x27,0xc8, 0x86,0x89,0x45,0xc5, 0x93,0xba,0x70,0xc3, 0x84,0xad,0xad,0x45, 0x77,0x1c,0x93,0xb0, 0x9c,0x27,0x69,0x07, 0x52,0xd1,0xd8, }, [2] = { /* p. 269 */ 0x5f,0x0f,0xd4,0x0c, 0x8c,0x82,0xef,0x41, 0x03,0x14,0xb8,0x30, 0xc2,0x0f,0xcc,0xea, 0x71,0x59,0x18,0x9a, 0xea,0x13,0xe8,0x48, 0x75,0x68,0x68,0x18, 0xcd,0x4f,0x12,0xb9, 0xde,0xa8,0x82,0x58, 0x16,0xa4,0x13,0xa2, 0x95,0x72,0x5e,0xb3, 0x3e,0x33,0xb9,0xad, 0xfe,0xe0,0xb1,0xc2, 0x34,0x0a,0xe0, }, }, .rnd_val = { [0] = { /* p. 264 */ 0x7a,0x33,0xd3,0x90, 0x33,0xf8,0x60,0x58, 0x9f,0x37,0x5e,0x73, 0x35,0x30,0x75,0x52, 0x96,0x58,0xbb,0xed, 0x99,0xc8,0xa0,0xef, 0x5e,0x28,0xb3,0x51, 0xb2,0xdf,0x33,0x58, 0xb3,0xd8,0x9b,0xac, 0x72,0x25,0xdf,0x9e, 0x3b,0xcd,0x08,0x36, 0xb9,0x9b,0x5d,0xbf, 0x36,0x3a,0x17,0x0c, 0x7b,0xb9,0xbe,0x41, 0xa4,0xaa,0x97,0x44, 0x5e,0xce,0xe4,0x1e, }, [1] = { /* p. 269 */ 0x04,0x1a,0xbd,0x94, 0x07,0x9a,0x05,0x71, 0x88,0x5f,0x16,0x65, 0x94,0x4e,0x0e,0x7f, 0x1b,0xfa,0xcd,0xea, 0xea,0xe9,0xd4,0x4e, 0xed,0xc1,0x1f,0xad, 0xd8,0x4c,0x34,0xc7, 0xca,0xa7,0x3d,0x09, 0xa0,0x19,0x31,0x93, 0xfa,0x40,0xa1,0x9f, 0x64,0x4f,0x04,0x8d, 0x2a,0x54,0x17,0x04, 0x25,0x53,0xdf,0x52, 0x51,0x74,0x1b,0x40, 0xea,0xcf,0xeb,0x98, }, }, }, [7] = { /* Hash_DRBG.pdf, p. 269 */ .personalization = &kat_personalization, .additional = kat_additional, .reseed = true, .C = { /* p. 274 */ 0x44,0x74,0x8a,0x78, 0xb1,0x6e,0x75,0x55, 0x9f,0x88,0x1d,0x51, 0xc1,0x5d,0xfe,0x6c, 0x52,0xcf,0xb0,0xbb, 0x71,0x62,0x01,0x69, 0xc7,0x93,0x34,0x27, 0x67,0xe7,0xf8,0x87, 0x5f,0x42,0xcb,0x6a, 0x20,0xc8,0x9d,0x7c, 0x6e,0xf3,0xdc,0x61, 0x0d,0x8f,0xf2,0x03, 0xd6,0x76,0x6c,0xed, 0x19,0x19,0xd0, }, .V = { [0] = { /* p. 272 */ 0xa3,0xe9,0x4e,0x39, 0x26,0xfd,0xa1,0x69, 0xc3,0x03,0xd6,0x64, 0x38,0x39,0x05,0xe0, 0xd7,0x99,0x62,0xd1, 0x65,0x44,0x6d,0x63, 0xbd,0xa6,0x54,0xd1, 0x32,0xf7,0x2d,0xb4, 0x71,0x56,0x4b,0x45, 0x6f,0xf2,0xee,0xc8, 0x36,0x42,0x2a,0xcc, 0x5a,0x02,0x99,0x35, 0xa7,0x99,0x29,0x90, 0x94,0xa1,0xca, }, [1] = { /* p. 
279 */ 0xaa,0xf6,0xb9,0x9b, 0x7f,0x84,0xb0,0x36, 0xe1,0xcc,0xbc,0x9d, 0x57,0x3a,0x36,0xb8, 0xbd,0xd4,0x7c,0x35, 0x8b,0xb5,0xf3,0xc1, 0xd6,0xe7,0x90,0x3a, 0xaa,0x29,0xf1,0xc8, 0x7a,0xe6,0x66,0xb8, 0x86,0x93,0xbe,0xf4, 0x6c,0x51,0xc2,0x4c, 0x47,0xbe,0xfe,0x4b, 0x35,0x75,0x4d,0xcb, 0xfa,0x1e,0x7d, }, [2] = { /* p. 285 */ 0x0c,0x75,0x77,0x4d, 0x61,0x02,0x69,0xad, 0x5b,0xb4,0xab,0xbb, 0x14,0x83,0x23,0xc9, 0x78,0x9f,0x8f,0x76, 0x25,0xcc,0x34,0x33, 0x7c,0x03,0x47,0x2d, 0x9a,0x0c,0x4f,0xac, 0x30,0xbe,0xd2,0xdd, 0xde,0x64,0xb8,0x7a, 0x2c,0x70,0x67,0x52, 0xc2,0x1a,0xc0,0x11, 0x27,0x43,0x59,0x2c, 0x4f,0xdf,0x67, }, }, .rnd_val = { /* p. 279 */ [0] = { 0x88,0x97,0x32,0x97, 0x5b,0x36,0xe8,0xe2, 0xe7,0xb7,0x40,0x50, 0xae,0xa1,0x71,0x39, 0xda,0x2b,0x86,0x34, 0xdc,0xe2,0x13,0x3b, 0x06,0x34,0x74,0x3f, 0x47,0x75,0x57,0xab, 0x7b,0x84,0x4e,0xd3, 0xf2,0xa4,0x6c,0xc6, 0x3e,0xb2,0x32,0x86, 0x46,0x4c,0x51,0xd5, 0xd7,0x69,0x71,0xc4, 0x7b,0xc5,0xb5,0x5f, 0xed,0x72,0xa8,0x04, 0x3c,0xbf,0x66,0x4f, }, [1] = { 0xbf,0x49,0xb8,0x89, 0xba,0x98,0x4d,0x34, 0x63,0x87,0xe8,0x64, 0x7e,0x98,0xbb,0x99, 0xcd,0x41,0xa3,0x2f, 0xbe,0xc1,0xfc,0xb3, 0xb6,0xa1,0xb7,0xd9, 0x93,0x2b,0xa7,0xe1, 0x1e,0xe6,0xbb,0xd9, 0x24,0x40,0x5a,0x2c, 0x7f,0xca,0x89,0x0a, 0x5e,0x9a,0x8d,0xea, 0x66,0xac,0x0c,0xac, 0xa0,0xca,0x7b,0xc1, 0x8d,0x74,0xfb,0xc0, 0x2a,0x11,0xe4,0x53, }, }, }, }; #ifdef NIST_HASH_DRBG_DEBUG #define DPRINTF(fmt, ...) \ printf("%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__) #define DUSE(v) #else #define DPRINTF(fmt, ...) #define DUSE(v) (void)(v) #endif #define CHECK(i, name, actual, expected, n) do \ { \ CTASSERT(sizeof(actual) == (n)); \ ok &= check(i, name, actual, expected, (n)); \ } while (0) static bool check(unsigned katno, const char *name, const uint8_t *actual, const uint8_t *expected, size_t n) { bool ok = true; size_t i; DUSE(katno); DUSE(name); for (i = 0; i < n; i++) { if (actual[i] != expected[i]) { DPRINTF("KAT %u %s[%zu] = %02x, expected %02x\n", katno, name, i, actual[i], expected[i]); ok = false; } } return ok; } #ifdef NIST_HASH_DRBG_MAIN int main(void) { int ret; ret = nist_hash_drbg_initialize(); fflush(stdout); return ret || ferror(stdout); } #endif int nist_hash_drbg_initialize(void) { const unsigned truncation[] = { 0, 1, 32, 63 }; bool ok = true; size_t i, j; for (i = 0; i < arraycount(kat); i++) { for (j = 0; j < arraycount(truncation); j++) { const unsigned trunc = truncation[j]; struct nist_hash_drbg drbg, *D = &drbg; uint8_t rnd_val[64]; unsigned reseed_counter; nist_hash_drbg_instantiate(D, kat_entropy[0], sizeof kat_entropy[0], kat_nonce, sizeof kat_nonce, kat[i].personalization->hv_base, kat[i].personalization->hv_len); reseed_counter = 1; CHECK(i, "C", D->C, kat[i].C, SEEDLEN_BYTES); CHECK(i, "V[0]", D->V, kat[i].V[0], SEEDLEN_BYTES); if (D->reseed_counter != reseed_counter) { DPRINTF("bad reseed counter: %u, expected %u", D->reseed_counter, reseed_counter); ok = false; } if (kat[i].reseed) { nist_hash_drbg_reseed(D, kat_entropy[1], sizeof kat_entropy[1], kat[i].additional[0]->hv_base, kat[i].additional[0]->hv_len); } nist_hash_drbg_generate(D, rnd_val, sizeof(rnd_val) - trunc, kat[i].reseed ? 0 : kat[i].additional[0]->hv_base, kat[i].reseed ? 
0 : kat[i].additional[0]->hv_len); reseed_counter++; CHECK(i, "V[1]", D->V, kat[i].V[1], SEEDLEN_BYTES); ASSERT(sizeof(kat[i].rnd_val[0]) - trunc <= sizeof rnd_val); check(i, "rnd_val[0]", rnd_val, kat[i].rnd_val[0], sizeof(kat[i].rnd_val[0]) - trunc); if (D->reseed_counter != reseed_counter) { DPRINTF("bad reseed counter: %u, expected %u", D->reseed_counter, reseed_counter); ok = false; } if (kat[i].reseed) { nist_hash_drbg_reseed(D, kat_entropy[2], sizeof kat_entropy[2], kat[i].additional[1]->hv_base, kat[i].additional[1]->hv_len); reseed_counter = 1; } nist_hash_drbg_generate(D, rnd_val, sizeof(rnd_val) - trunc, kat[i].reseed ? 0 : kat[i].additional[1]->hv_base, kat[i].reseed ? 0 : kat[i].additional[1]->hv_len); reseed_counter++; CHECK(i, "V[2]", D->V, kat[i].V[2], SEEDLEN_BYTES); ASSERT(sizeof(kat[i].rnd_val[1]) - trunc <= sizeof rnd_val); check(i, "rnd_val[1]", rnd_val, kat[i].rnd_val[1], sizeof(kat[i].rnd_val[1]) - trunc); if (D->reseed_counter != reseed_counter) { DPRINTF("bad reseed counter: %u, expected %u", D->reseed_counter, reseed_counter); ok = false; } nist_hash_drbg_destroy(D); } } if (!ok) return 1; return 0; }
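
/*
 * Illustrative usage sketch, not part of nist_hash_drbg.c above: a
 * minimal userland caller for the Hash_DRBG API this file implements,
 * roughly along the lines of the test-program build described in the
 * header comment.  example_hash_drbg_usage() is a hypothetical helper,
 * and the all-zero entropy and nonce buffers are placeholders only; a
 * real caller must supply genuine entropy.
 */
#include <err.h>
#include <stdint.h>
#include <stdio.h>

#include "nist_hash_drbg.h"

int
example_hash_drbg_usage(void)
{
	struct nist_hash_drbg drbg;
	uint8_t entropy[NIST_HASH_DRBG_SEEDLEN_BYTES] = {0};	/* placeholder */
	uint8_t nonce[8] = {0};					/* placeholder */
	uint8_t out[32];
	unsigned i;

	/* Run the built-in known-answer tests first. */
	if (nist_hash_drbg_initialize() != 0)
		errx(1, "Hash_DRBG known-answer tests failed");

	/* seed_material = entropy_input || nonce || personalization (none) */
	nist_hash_drbg_instantiate(&drbg, entropy, sizeof entropy,
	    nonce, sizeof nonce, NULL, 0);

	/* A nonzero return means a reseed is required before generating. */
	if (nist_hash_drbg_generate(&drbg, out, sizeof out, NULL, 0) != 0)
		errx(1, "reseed required");

	for (i = 0; i < sizeof out; i++)
		printf("%02x", out[i]);
	printf("\n");

	/* Zeroes the state; always returns 0. */
	return nist_hash_drbg_destroy(&drbg);
}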
/* $NetBSD: kern_time_50.c,v 1.38 2024/01/19 18:39:15 christos Exp $ */

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Christos Zoulas.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_time_50.c,v 1.38 2024/01/19 18:39:15 christos Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_aio.h" #include "opt_ntp.h" #include "opt_mqueue.h" #endif #include <sys/param.h> #include <sys/conf.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/socketvar.h> #include <sys/vnode.h> #include <sys/proc.h> #include <sys/uio.h> #include <sys/dirent.h> #include <sys/kauth.h> #include <sys/time.h> #include <sys/timex.h> #include <sys/clockctl.h> #include <sys/aio.h> #include <sys/poll.h> #include <sys/syscall.h> #include <sys/syscallargs.h> #include <sys/syscallvar.h> #include <sys/sysctl.h> #include <sys/resource.h> #include <sys/compat_stub.h> #include <compat/common/compat_util.h> #include <compat/common/compat_mod.h> #include <compat/sys/time.h> #include <compat/sys/timex.h> #include <compat/sys/resource.h> #include <compat/sys/clockctl.h> struct timeval50 boottime50; static const struct syscall_package kern_time_50_syscalls[] = { { SYS_compat_50_clock_gettime, 0, (sy_call_t *)compat_50_sys_clock_gettime }, { SYS_compat_50_clock_settime, 0, (sy_call_t *)compat_50_sys_clock_settime }, { SYS_compat_50_clock_getres, 0, (sy_call_t *)compat_50_sys_clock_getres}, { SYS_compat_50_nanosleep, 0, (sy_call_t *)compat_50_sys_nanosleep }, { SYS_compat_50_gettimeofday, 0, (sy_call_t *)compat_50_sys_gettimeofday }, { SYS_compat_50_settimeofday, 0, (sy_call_t *)compat_50_sys_settimeofday }, { SYS_compat_50_adjtime, 0, (sy_call_t *)compat_50_sys_adjtime }, { SYS_compat_50_setitimer, 0, (sy_call_t *)compat_50_sys_setitimer }, { SYS_compat_50_getitimer, 0, (sy_call_t *)compat_50_sys_getitimer }, { SYS_compat_50_aio_suspend, 0, (sy_call_t *)compat_50_sys_aio_suspend }, { SYS_compat_50_mq_timedsend, 0, (sy_call_t *)compat_50_sys_mq_timedsend }, { SYS_compat_50_mq_timedreceive, 0, (sy_call_t *)compat_50_sys_mq_timedreceive }, { SYS_compat_50_getrusage, 0, (sy_call_t *)compat_50_sys_getrusage }, { SYS_compat_50_timer_settime, 0, (sy_call_t *)compat_50_sys_timer_settime }, { SYS_compat_50_timer_gettime, 0, (sy_call_t *)compat_50_sys_timer_gettime }, { SYS_compat_50___ntp_gettime30, 0, (sy_call_t *)compat_50_sys___ntp_gettime30 }, { 0, 0, NULL } }; int compat_50_sys_clock_gettime(struct lwp *l, const struct compat_50_sys_clock_gettime_args *uap, register_t *retval) { /* { syscallarg(clockid_t) clock_id; syscallarg(struct timespec50 *) tp; } */ int error; struct timespec ats; struct timespec50 ats50; error = clock_gettime1(SCARG(uap, clock_id), &ats); if (error != 0) return error; timespec_to_timespec50(&ats, &ats50); return copyout(&ats50, SCARG(uap, tp), sizeof(ats50)); } /* ARGSUSED */ int compat_50_sys_clock_settime(struct lwp *l, const struct compat_50_sys_clock_settime_args *uap, register_t *retval) { /* { syscallarg(clockid_t) clock_id; syscallarg(const struct timespec50 *) tp; } */ int error; struct timespec 
ats; struct timespec50 ats50; error = copyin(SCARG(uap, tp), &ats50, sizeof(ats50)); if (error) return error; timespec50_to_timespec(&ats50, &ats); return clock_settime1(l->l_proc, SCARG(uap, clock_id), &ats, true); } int compat_50_sys_clock_getres(struct lwp *l, const struct compat_50_sys_clock_getres_args *uap, register_t *retval) { /* { syscallarg(clockid_t) clock_id; syscallarg(struct timespec50 *) tp; } */ struct timespec50 ats50; struct timespec ats; int error; error = clock_getres1(SCARG(uap, clock_id), &ats); if (error != 0) return error; if (SCARG(uap, tp)) { timespec_to_timespec50(&ats, &ats50); error = copyout(&ats50, SCARG(uap, tp), sizeof(ats50)); } return error; } /* ARGSUSED */ int compat_50_sys_nanosleep(struct lwp *l, const struct compat_50_sys_nanosleep_args *uap, register_t *retval) { /* { syscallarg(struct timespec50 *) rqtp; syscallarg(struct timespec50 *) rmtp; } */ struct timespec rmt, rqt; struct timespec50 rmt50, rqt50; int error, error1; error = copyin(SCARG(uap, rqtp), &rqt50, sizeof(rqt50)); if (error) return error; timespec50_to_timespec(&rqt50, &rqt); error = nanosleep1(l, CLOCK_MONOTONIC, 0, &rqt, SCARG(uap, rmtp) ? &rmt : NULL); if (SCARG(uap, rmtp) == NULL || (error != 0 && error != EINTR)) return error; timespec_to_timespec50(&rmt, &rmt50); error1 = copyout(&rmt50, SCARG(uap, rmtp), sizeof(*SCARG(uap, rmtp))); return error1 ? error1 : error; } /* ARGSUSED */ int compat_50_sys_gettimeofday(struct lwp *l, const struct compat_50_sys_gettimeofday_args *uap, register_t *retval) { /* { syscallarg(struct timeval50 *) tp; syscallarg(void *) tzp; really "struct timezone *"; } */ struct timeval atv; struct timeval50 atv50; int error = 0; struct timezone tzfake; if (SCARG(uap, tp)) { microtime(&atv); timeval_to_timeval50(&atv, &atv50); error = copyout(&atv50, SCARG(uap, tp), sizeof(*SCARG(uap, tp))); if (error) return error; } if (SCARG(uap, tzp)) { /* * NetBSD has no kernel notion of time zone, so we just * fake up a timezone struct and return it if demanded. */ memset(&tzfake, 0, sizeof(tzfake)); tzfake.tz_minuteswest = 0; tzfake.tz_dsttime = 0; error = copyout(&tzfake, SCARG(uap, tzp), sizeof(tzfake)); } return error; } /* ARGSUSED */ int compat_50_sys_settimeofday(struct lwp *l, const struct compat_50_sys_settimeofday_args *uap, register_t *retval) { /* { syscallarg(const struct timeval50 *) tv; syscallarg(const void *) tzp; really "const struct timezone *"; } */ struct timeval50 atv50; struct timeval atv; int error = copyin(SCARG(uap, tv), &atv50, sizeof(atv50)); if (error) return error; timeval50_to_timeval(&atv50, &atv); return settimeofday1(&atv, false, SCARG(uap, tzp), l, true); } /* ARGSUSED */ int compat_50_sys_adjtime(struct lwp *l, const struct compat_50_sys_adjtime_args *uap, register_t *retval) { /* { syscallarg(const struct timeval50 *) delta; syscallarg(struct timeval50 *) olddelta; } */ int error; struct timeval50 delta50, olddelta50; struct timeval delta, olddelta; if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_ADJTIME, NULL, NULL, NULL)) != 0) return error; if (SCARG(uap, delta)) { error = copyin(SCARG(uap, delta), &delta50, sizeof(*SCARG(uap, delta))); if (error) return (error); timeval50_to_timeval(&delta50, &delta); } adjtime1(SCARG(uap, delta) ? &delta : NULL, SCARG(uap, olddelta) ? 
&olddelta : NULL, l->l_proc); if (SCARG(uap, olddelta)) { timeval_to_timeval50(&olddelta, &olddelta50); error = copyout(&olddelta50, SCARG(uap, olddelta), sizeof(*SCARG(uap, olddelta))); } return error; } /* BSD routine to set/arm an interval timer. */ /* ARGSUSED */ int compat_50_sys_getitimer(struct lwp *l, const struct compat_50_sys_getitimer_args *uap, register_t *retval) { /* { syscallarg(int) which; syscallarg(struct itimerval50 *) itv; } */ struct proc *p = l->l_proc; struct itimerval aitv; struct itimerval50 aitv50; int error; error = dogetitimer(p, SCARG(uap, which), &aitv); if (error) return error; itimerval_to_itimerval50(&aitv, &aitv50); return copyout(&aitv50, SCARG(uap, itv), sizeof(*SCARG(uap, itv))); } int compat_50_sys_setitimer(struct lwp *l, const struct compat_50_sys_setitimer_args *uap, register_t *retval) { /* { syscallarg(int) which; syscallarg(const struct itimerval50 *) itv; syscallarg(struct itimerval50 *) oitv; } */ struct proc *p = l->l_proc; int which = SCARG(uap, which); struct compat_50_sys_getitimer_args getargs; const struct itimerval50 *itvp; struct itimerval50 aitv50; struct itimerval aitv; int error; itvp = SCARG(uap, itv); if (itvp && (error = copyin(itvp, &aitv50, sizeof(aitv50))) != 0) return (error); itimerval50_to_itimerval(&aitv50, &aitv); if (SCARG(uap, oitv) != NULL) { SCARG(&getargs, which) = which; SCARG(&getargs, itv) = SCARG(uap, oitv); if ((error = compat_50_sys_getitimer(l, &getargs, retval)) != 0) return (error); } if (itvp == 0) return (0); return dosetitimer(p, which, &aitv); } int compat_50_sys_aio_suspend(struct lwp *l, const struct compat_50_sys_aio_suspend_args *uap, register_t *retval) { /* { syscallarg(const struct aiocb *const[]) list; syscallarg(int) nent; syscallarg(const struct timespec50 *) timeout; } */ #ifdef AIO struct aiocb **list; struct timespec ts; struct timespec50 ts50; int error, nent; nent = SCARG(uap, nent); if (nent <= 0 || nent > aio_listio_max) return EAGAIN; if (SCARG(uap, timeout)) { /* Convert timespec to ticks */ error = copyin(SCARG(uap, timeout), &ts50, sizeof(*SCARG(uap, timeout))); if (error) return error; timespec50_to_timespec(&ts50, &ts); } list = kmem_alloc(nent * sizeof(*list), KM_SLEEP); error = copyin(SCARG(uap, list), list, nent * sizeof(*list)); if (error) goto out; error = aio_suspend1(l, list, nent, SCARG(uap, timeout) ? 
&ts : NULL); out: kmem_free(list, nent * sizeof(*list)); return error; #else return ENOSYS; #endif } int compat_50_sys_mq_timedsend(struct lwp *l, const struct compat_50_sys_mq_timedsend_args *uap, register_t *retval) { /* { syscallarg(mqd_t) mqdes; syscallarg(const char *) msg_ptr; syscallarg(size_t) msg_len; syscallarg(unsigned) msg_prio; syscallarg(const struct timespec50 *) abs_timeout; } */ #ifdef MQUEUE struct timespec50 ts50; struct timespec ts, *tsp; int error; /* Get and convert time value */ if (SCARG(uap, abs_timeout)) { error = copyin(SCARG(uap, abs_timeout), &ts50, sizeof(ts50)); if (error) return error; timespec50_to_timespec(&ts50, &ts); tsp = &ts; } else { tsp = NULL; } return mq_send1(SCARG(uap, mqdes), SCARG(uap, msg_ptr), SCARG(uap, msg_len), SCARG(uap, msg_prio), tsp); #else return ENOSYS; #endif } int compat_50_sys_mq_timedreceive(struct lwp *l, const struct compat_50_sys_mq_timedreceive_args *uap, register_t *retval) { /* { syscallarg(mqd_t) mqdes; syscallarg(char *) msg_ptr; syscallarg(size_t) msg_len; syscallarg(unsigned *) msg_prio; syscallarg(const struct timespec50 *) abs_timeout; } */ #ifdef MQUEUE struct timespec ts, *tsp; struct timespec50 ts50; ssize_t mlen; int error; /* Get and convert time value */ if (SCARG(uap, abs_timeout)) { error = copyin(SCARG(uap, abs_timeout), &ts50, sizeof(ts50)); if (error) return error; timespec50_to_timespec(&ts50, &ts); tsp = &ts; } else { tsp = NULL; } error = mq_recv1(SCARG(uap, mqdes), SCARG(uap, msg_ptr), SCARG(uap, msg_len), SCARG(uap, msg_prio), tsp, &mlen); if (error == 0) *retval = mlen; return error; #else return ENOSYS; #endif } int compat_50_sys_getrusage(struct lwp *l, const struct compat_50_sys_getrusage_args *uap, register_t *retval) { /* { syscallarg(int) who; syscallarg(struct rusage50 *) rusage; } */ int error; struct rusage ru; struct rusage50 ru50; struct proc *p = l->l_proc; error = getrusage1(p, SCARG(uap, who), &ru); if (error != 0) return error; rusage_to_rusage50(&ru, &ru50); return copyout(&ru50, SCARG(uap, rusage), sizeof(ru50)); } /* Return the time remaining until a POSIX timer fires. 
*/ int compat_50_sys_timer_gettime(struct lwp *l, const struct compat_50_sys_timer_gettime_args *uap, register_t *retval) { /* { syscallarg(timer_t) timerid; syscallarg(struct itimerspec50 *) value; } */ struct itimerspec its; struct itimerspec50 its50; int error; if ((error = dotimer_gettime(SCARG(uap, timerid), l->l_proc, &its)) != 0) return error; itimerspec_to_itimerspec50(&its, &its50); return copyout(&its50, SCARG(uap, value), sizeof(its50)); } /* Set and arm a POSIX realtime timer */ int compat_50_sys_timer_settime(struct lwp *l, const struct compat_50_sys_timer_settime_args *uap, register_t *retval) { /* { syscallarg(timer_t) timerid; syscallarg(int) flags; syscallarg(const struct itimerspec50 *) value; syscallarg(struct itimerspec50 *) ovalue; } */ int error; struct itimerspec value, ovalue, *ovp = NULL; struct itimerspec50 value50, ovalue50; if ((error = copyin(SCARG(uap, value), &value50, sizeof(value50))) != 0) return error; itimerspec50_to_itimerspec(&value50, &value); if (SCARG(uap, ovalue)) ovp = &ovalue; if ((error = dotimer_settime(SCARG(uap, timerid), &value, ovp, SCARG(uap, flags), l->l_proc)) != 0) return error; if (ovp) { itimerspec_to_itimerspec50(&ovalue, &ovalue50); return copyout(&ovalue50, SCARG(uap, ovalue), sizeof(ovalue50)); } return 0; } /* * ntp_gettime() - NTP user application interface */ int compat_50_sys___ntp_gettime30(struct lwp *l, const struct compat_50_sys___ntp_gettime30_args *uap, register_t *retval) { if (vec_ntp_gettime == NULL) return ENOSYS; /* No NTP available in kernel */ /* { syscallarg(struct ntptimeval *) ntvp; } */ struct ntptimeval ntv; struct ntptimeval50 ntv50; int error; if (SCARG(uap, ntvp)) { (*vec_ntp_gettime)(&ntv); memset(&ntv50, 0, sizeof(ntv50)); timespec_to_timespec50(&ntv.time, &ntv50.time); ntv50.maxerror = ntv.maxerror; ntv50.esterror = ntv.esterror; ntv50.tai = ntv.tai; ntv50.time_state = ntv.time_state; error = copyout(&ntv50, SCARG(uap, ntvp), sizeof(ntv50)); if (error) return error; } *retval = (*vec_ntp_timestatus)(); return 0; } SYSCTL_SETUP(compat_sysctl_time, "Old system boottime") { struct timeval tv; getmicroboottime(&tv); timeval_to_timeval50(&tv, &boottime50); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "oboottime", SYSCTL_DESCR("System boot time"), NULL, 0, &boottime50, sizeof(boottime50), CTL_KERN, KERN_OBOOTTIME, CTL_EOL); } int kern_time_50_init(void) { int error; error = syscall_establish(NULL, kern_time_50_syscalls); return error; } int kern_time_50_fini(void) { int error; error = syscall_disestablish(NULL, kern_time_50_syscalls); return error; }
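
/*
 * For reference, a sketch of the conversion pattern the compat_50
 * syscalls above rely on.  The real definitions live in
 * <compat/sys/time.h>; the names and the exact layout shown here
 * (notably the 32-bit tv_sec of the pre-6.0 ABI) are assumptions for
 * illustration, not a copy of that header.
 */
#include <stdint.h>
#include <time.h>

struct example_timespec50 {		/* hypothetical stand-in */
	int32_t	tv_sec;			/* 32-bit seconds (old ABI) */
	long	tv_nsec;
};

static inline void
example_timespec50_to_timespec(const struct example_timespec50 *ts50,
    struct timespec *ts)
{
	/* Widen the 32-bit seconds count to the native time_t. */
	ts->tv_sec = ts50->tv_sec;
	ts->tv_nsec = ts50->tv_nsec;
}

static inline void
example_timespec_to_timespec50(const struct timespec *ts,
    struct example_timespec50 *ts50)
{
	/* Truncation past 2038 is inherent in the old 32-bit ABI. */
	ts50->tv_sec = (int32_t)ts->tv_sec;
	ts50->tv_nsec = ts->tv_nsec;
}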
/* $NetBSD: in_proto.c,v 1.131 2022/09/03 02:53:18 thorpej Exp $ */

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_proto.c 8.2 (Berkeley) 2/9/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in_proto.c,v 1.131 2022/09/03 02:53:18 thorpej Exp $"); #ifdef _KERNEL_OPT #include "opt_mrouting.h" #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_pim.h" #include "opt_gateway.h" #include "opt_dccp.h" #include "opt_sctp.h" #include "opt_compat_netbsd.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/socket.h> #include <sys/protosw.h> #include <sys/domain.h> #include <sys/mbuf.h> #include <net/if.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/ip_icmp.h> #include <netinet/in_ifattach.h> #include <netinet/in_pcb.h> #include <netinet/in_proto.h> #ifdef INET6 #ifndef INET #include <netinet/in.h> #endif #include <netinet/ip6.h> #endif #include <netinet/igmp_var.h> #ifdef PIM #include <netinet/pim_var.h> #endif #include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_debug.h> #include <netinet/udp.h> #include <netinet/udp_var.h> #include <netinet/ip_encap.h> #ifdef DCCP #include <netinet/dccp.h> #include <netinet/dccp_var.h> #endif #ifdef SCTP #include <netinet/sctp.h> #include <netinet/sctp_var.h> #endif /* * TCP/IP protocol family: IP, ICMP, UDP, TCP. 
*/ #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/key.h> #endif /* IPSEC */ #include "carp.h" #if NCARP > 0 #include <netinet/ip_carp.h> #endif #include "pfsync.h" #if NPFSYNC > 0 #include <net/pfvar.h> #include <net/if_pfsync.h> #endif DOMAIN_DEFINE(inetdomain); /* forward declare and add to link set */ /* Wrappers to acquire kernel_lock. */ PR_WRAP_CTLINPUT(rip_ctlinput) PR_WRAP_CTLINPUT(udp_ctlinput) PR_WRAP_CTLINPUT(tcp_ctlinput) #define rip_ctlinput rip_ctlinput_wrapper #define udp_ctlinput udp_ctlinput_wrapper #define tcp_ctlinput tcp_ctlinput_wrapper PR_WRAP_CTLOUTPUT(rip_ctloutput) PR_WRAP_CTLOUTPUT(udp_ctloutput) PR_WRAP_CTLOUTPUT(tcp_ctloutput) #define rip_ctloutput rip_ctloutput_wrapper #define udp_ctloutput udp_ctloutput_wrapper #define tcp_ctloutput tcp_ctloutput_wrapper #ifdef DCCP PR_WRAP_CTLINPUT(dccp_ctlinput) PR_WRAP_CTLOUTPUT(dccp_ctloutput) #define dccp_ctlinput dccp_ctlinput_wrapper #define dccp_ctloutput dccp_ctloutput_wrapper #endif #ifdef SCTP PR_WRAP_CTLINPUT(sctp_ctlinput) PR_WRAP_CTLOUTPUT(sctp_ctloutput) #define sctp_ctlinput sctp_ctlinput_wrapper #define sctp_ctloutput sctp_ctloutput_wrapper #endif #ifdef NET_MPSAFE PR_WRAP_INPUT(udp_input) PR_WRAP_INPUT(tcp_input) #ifdef DCCP PR_WRAP_INPUT(dccp_input) #endif #ifdef SCTP PR_WRAP_INPUT(sctp_input) #endif PR_WRAP_INPUT(rip_input) #if NPFSYNC > 0 PR_WRAP_INPUT(pfsync_input) #endif PR_WRAP_INPUT(igmp_input) #ifdef PIM PR_WRAP_INPUT(pim_input) #endif #define udp_input udp_input_wrapper #define tcp_input tcp_input_wrapper #define dccp_input dccp_input_wrapper #define sctp_input sctp_input_wrapper #define rip_input rip_input_wrapper #define pfsync_input pfsync_input_wrapper #define igmp_input igmp_input_wrapper #define pim_input pim_input_wrapper #endif #if defined(IPSEC) #ifdef IPSEC_RUMPKERNEL /* * .pr_input = ipsec4_common_input won't be resolved on loading * the ipsec shared library. We need a wrapper anyway. 
*/ static void ipsec4_common_input_wrapper(struct mbuf *m, int off, int proto) { if (ipsec_enabled) { ipsec4_common_input(m, off, proto); } else { m_freem(m); } } #define ipsec4_common_input ipsec4_common_input_wrapper /* The ctlinput functions may not be loaded */ #define IPSEC_WRAP_CTLINPUT(name) \ static void * \ name##_wrapper(int a, const struct sockaddr *b, void *c)\ { \ void *rv; \ KERNEL_LOCK(1, NULL); \ if (ipsec_enabled) \ rv = name(a, b, c); \ else \ rv = NULL; \ KERNEL_UNLOCK_ONE(NULL); \ return rv; \ } IPSEC_WRAP_CTLINPUT(ah4_ctlinput) IPSEC_WRAP_CTLINPUT(esp4_ctlinput) #else /* !IPSEC_RUMPKERNEL */ PR_WRAP_CTLINPUT(ah4_ctlinput) PR_WRAP_CTLINPUT(esp4_ctlinput) #endif /* !IPSEC_RUMPKERNEL */ #define ah4_ctlinput ah4_ctlinput_wrapper #define esp4_ctlinput esp4_ctlinput_wrapper #endif /* IPSEC */ const struct protosw inetsw[] = { { .pr_domain = &inetdomain, .pr_init = ip_init, .pr_fasttimo = ip_fasttimo, .pr_slowtimo = ip_slowtimo, .pr_drain = ip_drainstub, }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ICMP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp_input, .pr_ctlinput = rip_ctlinput, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, .pr_init = icmp_init, }, { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_PURGEIF, .pr_input = udp_input, .pr_ctlinput = udp_ctlinput, .pr_ctloutput = udp_ctloutput, .pr_usrreqs = &udp_usrreqs, .pr_init = udp_init, }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LISTEN|PR_ABRTACPTDIS|PR_PURGEIF, .pr_input = tcp_input, .pr_ctlinput = tcp_ctlinput, .pr_ctloutput = tcp_ctloutput, .pr_usrreqs = &tcp_usrreqs, .pr_init = tcp_init, .pr_fasttimo = tcp_fasttimo, .pr_drain = tcp_drainstub, }, #ifdef DCCP { .pr_type = SOCK_CONN_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_DCCP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ATOMIC|PR_LISTEN|PR_ABRTACPTDIS, .pr_input = dccp_input, .pr_ctlinput = dccp_ctlinput, .pr_ctloutput = dccp_ctloutput, .pr_usrreqs = &dccp_usrreqs, .pr_init = dccp_init, }, #endif #ifdef SCTP { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_ADDR_OPT|PR_WANTRCVD, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_usrreqs = &sctp_usrreqs, .pr_init = sctp_init, .pr_drain = sctp_drain }, { .pr_type = SOCK_SEQPACKET, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_ADDR_OPT|PR_WANTRCVD, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_usrreqs = &sctp_usrreqs, .pr_drain = sctp_drain }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_CONNREQUIRED|PR_ADDR_OPT|PR_WANTRCVD|PR_LISTEN, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_usrreqs = &sctp_usrreqs, .pr_drain = sctp_drain }, #endif /* SCTP */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR|PR_PURGEIF, .pr_input = rip_input, .pr_ctlinput = rip_ctlinput, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, }, #ifdef GATEWAY { .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IP, .pr_slowtimo = ipflow_slowtimo, .pr_init = ipflow_poolinit, }, #endif /* GATEWAY */ #ifdef IPSEC { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_AH, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = 
ipsec4_common_input, .pr_ctlinput = ah4_ctlinput, }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ESP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec4_common_input, .pr_ctlinput = esp4_ctlinput, }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPCOMP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec4_common_input, }, #endif /* IPSEC */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctlinput = rip_ctlinput, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, .pr_init = encap_init, }, #ifdef INET6 { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctlinput = rip_ctlinput, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, .pr_init = encap_init, }, #endif /* INET6 */ #if NCARP > 0 { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = carp_proto_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, .pr_init = carp_init, }, #endif /* NCARP > 0 */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_L2TP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctlinput = rip_ctlinput, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, /*XXX*/ .pr_init = encap_init, }, #if NPFSYNC > 0 { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PFSYNC, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pfsync_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, }, #endif /* NPFSYNC > 0 */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IGMP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = igmp_input, .pr_ctloutput = rip_ctloutput, .pr_ctlinput = rip_ctlinput, .pr_usrreqs = &rip_usrreqs, .pr_fasttimo = igmp_fasttimo, .pr_slowtimo = igmp_slowtimo, .pr_init = igmp_init, }, #ifdef PIM { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim_input, .pr_ctloutput = rip_ctloutput, .pr_ctlinput = rip_ctlinput, .pr_usrreqs = &rip_usrreqs, }, #endif /* PIM */ /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = rip_input, .pr_ctloutput = rip_ctloutput, .pr_ctlinput = rip_ctlinput, .pr_usrreqs = &rip_usrreqs, .pr_init = rip_init, }, }; const struct sockaddr_in in_any = { .sin_len = sizeof(struct sockaddr_in) , .sin_family = AF_INET , .sin_port = 0 , .sin_addr = {.s_addr = 0 /* INADDR_ANY */} }; struct domain inetdomain = { .dom_family = PF_INET, .dom_name = "internet", .dom_init = NULL, .dom_externalize = NULL, .dom_dispose = NULL, .dom_protosw = inetsw, .dom_protoswNPROTOSW = &inetsw[__arraycount(inetsw)], .dom_rtattach = rt_inithead, .dom_rtoffset = 32, .dom_maxrtkey = sizeof(struct ip_pack4), .dom_if_up = in_if_up, .dom_if_down = in_if_down, .dom_ifattach = in_domifattach, .dom_ifdetach = in_domifdetach, .dom_if_link_state_change = in_if_link_state_change, .dom_link = { NULL }, .dom_mowner = MOWNER_INIT("",""), .dom_sa_cmpofs = offsetof(struct sockaddr_in, sin_addr), .dom_sa_cmplen = sizeof(struct in_addr), .dom_sa_any = (const struct sockaddr *)&in_any, .dom_sockaddr_const_addr = sockaddr_in_const_addr, .dom_sockaddr_addr = sockaddr_in_addr, }; u_char ip_protox[IPPROTO_MAX]; static void 
sockaddr_in_addrlen(const struct sockaddr *sa, socklen_t *slenp)
{
	socklen_t slen;

	if (slenp == NULL)
		return;

	slen = sockaddr_getlen(sa);
	*slenp = (socklen_t)MIN(sizeof(struct in_addr),
	    slen - MIN(slen, offsetof(struct sockaddr_in, sin_addr)));
}

const void *
sockaddr_in_const_addr(const struct sockaddr *sa, socklen_t *slenp)
{
	const struct sockaddr_in *sin;

	sockaddr_in_addrlen(sa, slenp);
	sin = (const struct sockaddr_in *)sa;
	return &sin->sin_addr;
}

void *
sockaddr_in_addr(struct sockaddr *sa, socklen_t *slenp)
{
	struct sockaddr_in *sin;

	sockaddr_in_addrlen(sa, slenp);
	sin = (struct sockaddr_in *)sa;
	return &sin->sin_addr;
}

int
sockaddr_in_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2)
{
	uint_fast8_t len;
	const uint_fast8_t addrofs = offsetof(struct sockaddr_in, sin_addr),
	    addrend = addrofs + sizeof(struct in_addr);
	int rc;
	const struct sockaddr_in *sin1, *sin2;

	sin1 = satocsin(sa1);
	sin2 = satocsin(sa2);

	len = MIN(addrend, MIN(sin1->sin_len, sin2->sin_len));

	if (len > addrofs &&
	    (rc = memcmp(&sin1->sin_addr, &sin2->sin_addr,
	    len - addrofs)) != 0)
		return rc;

	return sin1->sin_len - sin2->sin_len;
}
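/*
 * Illustrative, standalone user-space sketch (not part of in_proto.c):
 * the inetsw[] table above is demultiplexed per protocol number through
 * ip_protox[], declared just above.  The sketch below shows the
 * conventional BSD way such a table is filled: every slot defaults to
 * the raw-protocol entry, then known protocol numbers are pointed at
 * their own entries.  All ex_* names and the simplified proto_entry
 * type are invented for the example.
 */
#include <stdio.h>

#define EX_IPPROTO_MAX	256
#define EX_IPPROTO_RAW	255

struct proto_entry {			/* simplified stand-in for struct protosw */
	int pr_protocol;		/* 0 means wildcard / not demuxed */
	const char *pr_name;
};

static const struct proto_entry ex_inetsw[] = {
	{ 0,		  "ip (wildcard)" },
	{ 1,		  "icmp" },
	{ 17,		  "udp" },
	{ 6,		  "tcp" },
	{ EX_IPPROTO_RAW, "raw" },
};

static unsigned char ex_protox[EX_IPPROTO_MAX];

static void
ex_build_protox(void)
{
	size_t i, rawslot = 0;

	/* Find the raw entry; unknown protocols fall back to it. */
	for (i = 0; i < sizeof(ex_inetsw) / sizeof(ex_inetsw[0]); i++)
		if (ex_inetsw[i].pr_protocol == EX_IPPROTO_RAW)
			rawslot = i;
	for (i = 0; i < EX_IPPROTO_MAX; i++)
		ex_protox[i] = (unsigned char)rawslot;

	/* Point each known protocol number at its own slot. */
	for (i = 0; i < sizeof(ex_inetsw) / sizeof(ex_inetsw[0]); i++)
		if (ex_inetsw[i].pr_protocol != 0 &&
		    ex_inetsw[i].pr_protocol != EX_IPPROTO_RAW)
			ex_protox[ex_inetsw[i].pr_protocol] = (unsigned char)i;
}

int
main(void)
{
	ex_build_protox();
	printf("proto 6  -> %s\n", ex_inetsw[ex_protox[6]].pr_name);	/* tcp */
	printf("proto 99 -> %s\n", ex_inetsw[ex_protox[99]].pr_name);	/* raw */
	return 0;
}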
/* $NetBSD: in6_ifattach.c,v 1.122 2024/04/11 07:34:37 knakahara Exp $ */
/* $KAME: in6_ifattach.c,v 1.124 2001/07/18 08:32:51 jinmei Exp $ */

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in6_ifattach.c,v 1.122 2024/04/11 07:34:37 knakahara Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kmem.h> #include <sys/socket.h> #include <sys/sockio.h> #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/md5.h> #include <sys/socketvar.h> #include <sys/cprng.h> #include <net/if.h> #include <net/if_dl.h> #include <net/if_types.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/ip6.h> #include <netinet6/in6_ifattach.h> #include <netinet6/ip6_var.h> #include <netinet6/nd6.h> #include <netinet6/ip6_mroute.h> #include <netinet6/scope6_var.h> int ip6_auto_linklocal = 1; /* enable by default */ #if 0 static int get_hostid_ifid(struct ifnet *, struct in6_addr *); #endif static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *); static int in6_ifattach_loopback(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 #define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (/*CONSTCOND*/ 0) #define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) #define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) #define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) #define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) #define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) #define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) #if 0 /* * Generate a last-resort interface identifier from hostid. * works only for certain architectures (like sparc). * also, using hostid itself may constitute a privacy threat, much worse * than MAC addresses (hostids are used for software licensing). * maybe we should use MD5(hostid) instead. * * in6 - upper 64bits are preserved */ static int get_hostid_ifid(struct ifnet *ifp, struct in6_addr *in6) { int off, len; static const uint8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static const uint8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; if (!hostid) return -1; /* get up to 8 bytes from the hostid field - should we get */ len = (sizeof(hostid) > 8) ? 8 : sizeof(hostid); off = sizeof(*in6) - len; memcpy(&in6->s6_addr[off], &hostid, len); /* make sure we do not return anything bogus */ if (memcmp(&in6->s6_addr[8], allzero, sizeof(allzero))) return -1; if (memcmp(&in6->s6_addr[8], allone, sizeof(allone))) return -1; /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } #endif /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. 
* The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. */ static int get_rand_ifid(struct in6_addr *in6) /* upper 64bits are preserved */ { MD5_CTX ctxt; u_int8_t digest[16]; #if 0 /* we need at least several letters as seed for ifid */ if (hostnamelen < 3) return -1; #endif /* generate 8 bytes of pseudo-random value. */ memset(&ctxt, 0, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, (u_char *)hostname, hostnamelen); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ memcpy(&in6->s6_addr[8], digest, 8); /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } /* * Get interface identifier for the specified interface. * * in6 - upper 64bits are preserved */ int in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) { struct ifaddr *ifa; const struct sockaddr_dl *sdl = NULL; const char *addr = NULL; /* XXX gcc 4.8 -Werror=maybe-uninitialized */ size_t addrlen = 0; /* XXX gcc 4.8 -Werror=maybe-uninitialized */ static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; int s; s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { const struct sockaddr_dl *tsdl; if (ifa->ifa_addr->sa_family != AF_LINK) continue; tsdl = satocsdl(ifa->ifa_addr); if (tsdl == NULL || tsdl->sdl_alen == 0) continue; if (sdl == NULL || ifa == ifp->if_dl || ifa == ifp->if_hwdl) { sdl = tsdl; addr = CLLADDR(sdl); addrlen = sdl->sdl_alen; } if (ifa == ifp->if_hwdl) break; } pserialize_read_exit(s); if (sdl == NULL) return -1; switch (ifp->if_type) { case IFT_IEEE1394: case IFT_IEEE80211: /* IEEE1394 uses 16byte length address starting with EUI64 */ if (addrlen > 8) addrlen = 8; break; default: break; } /* get EUI64 */ switch (ifp->if_type) { /* IEEE802/EUI64 cases - what others? */ case IFT_ETHER: case IFT_ATM: case IFT_IEEE1394: case IFT_IEEE80211: /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) return -1; /* * check for invalid MAC address - on bsdi, we see it a lot * since wildboar configures all-zero MAC on pccard before * card insertion. */ if (memcmp(addr, allzero, addrlen) == 0) return -1; if (memcmp(addr, allone, addrlen) == 0) return -1; /* make EUI64 address */ if (addrlen == 8) memcpy(&in6->s6_addr[8], addr, 8); else if (addrlen == 6) { in6->s6_addr[8] = addr[0]; in6->s6_addr[9] = addr[1]; in6->s6_addr[10] = addr[2]; in6->s6_addr[11] = 0xff; in6->s6_addr[12] = 0xfe; in6->s6_addr[13] = addr[3]; in6->s6_addr[14] = addr[4]; in6->s6_addr[15] = addr[5]; } break; case IFT_ARCNET: if (addrlen != 1) return -1; if (!addr[0]) return -1; memset(&in6->s6_addr[8], 0, 8); in6->s6_addr[15] = addr[0]; /* * due to insufficient bitwidth, we mark it local. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ break; case IFT_GIF: case IFT_IPSEC: #ifdef IFT_STF case IFT_STF: #endif /* * RFC2893 says: "SHOULD use IPv4 address as ifid source". * however, IPv4 address is not very suitable as unique * identifier source (can be renumbered). * we don't do this. 
*/ return -1; default: return -1; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) return -1; /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); /* * sanity check: ifid must not be all zero, avoid conflict with * subnet router anycast */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && memcmp(&in6->s6_addr[9], allzero, 7) == 0) { return -1; } return 0; } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. * * altifp - secondary EUI64 source */ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { struct ifnet *ifp; int s; /* first, try to get it from the interface itself */ if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log(LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0)); goto success; } /* try secondary EUI64 source. this basically is for ATM PVC */ if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log(LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp)); goto success; } /* next, try to get it from some other hardware interface */ s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* * to borrow ifid from other interface, ifid needs to be * globally unique */ if (IFID_UNIVERSAL(in6)) { nd6log(LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp)); pserialize_read_exit(s); goto success; } } pserialize_read_exit(s); #if 0 /* get from hostid - only for certain architectures */ if (get_hostid_ifid(ifp, in6) == 0) { nd6log(LOG_DEBUG, "%s: interface identifier generated by hostid\n", if_name(ifp0)); goto success; } #endif /* last resort: get from random number source */ if (get_rand_ifid(in6) == 0) { nd6log(LOG_DEBUG, "%s: interface identifier generated by random number\n", if_name(ifp0)); goto success; } printf("%s: failed to get interface identifier\n", if_name(ifp0)); return -1; success: nd6log(LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], in6->s6_addr[14], in6->s6_addr[15]); return 0; } /* * altifp - secondary EUI64 source */ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { struct in6_aliasreq ifra; int error; /* * configure link-local address. */ memset(&ifra, 0, sizeof(ifra)); /* * in6_update_ifa() does not use ifra_name, but we accurately set it * for safety. */ strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); ifra.ifra_addr.sin6_family = AF_INET6; ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); } else { if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log(LOG_ERR, "%s: no ifid available\n", if_name(ifp)); return -1; } } if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) return -1; sockaddr_in6_init(&ifra.ifra_prefixmask, &in6mask64, 0, 0, 0); /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. 
We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, IN6_IFAUPDATE_DADDELAY)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCINITIFADDR ioctl. I believe the * notification is rather confusing in this case, so just * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) nd6log(LOG_NOTICE, "failed to configure a link-local address on %s " "(errno=%d)\n", if_name(ifp), error); return -1; } return 0; } /* * ifp - must be IFT_LOOP */ static int in6_ifattach_loopback(struct ifnet *ifp) { struct in6_aliasreq ifra; int error; memset(&ifra, 0, sizeof(ifra)); /* * in6_update_ifa() does not use ifra_name, but we accurately set it * for safety. */ strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); sockaddr_in6_init(&ifra.ifra_prefixmask, &in6mask128, 0, 0, 0); /* * Always initialize ia_dstaddr (= broadcast address) to loopback * address. Follows IPv4 practice - see in_ifinit(). */ sockaddr_in6_init(&ifra.ifra_dstaddr, &in6addr_loopback, 0, 0, 0); sockaddr_in6_init(&ifra.ifra_addr, &in6addr_loopback, 0, 0, 0); /* the loopback address should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* we don't need to perform DAD on loopback interfaces. */ ifra.ifra_flags |= IN6_IFF_NODAD; /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, 0)) != 0) { nd6log(LOG_ERR, "failed to configure " "the loopback address on %s (errno=%d)\n", if_name(ifp), error); return -1; } return 0; } /* * compute NI group address, based on the current hostname setting. * see draft-ietf-ipngwg-icmp-name-lookup-* (04 and later). * * when ifp == NULL, the caller is responsible for filling scopeid. */ int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct sockaddr_in6 *sa6) { const char *p; u_int8_t *q; MD5_CTX ctxt; u_int8_t digest[16]; u_int8_t l; u_int8_t n[64]; /* a single label must not exceed 63 chars */ if (!namelen || !name) return -1; p = name; while (p && *p && *p != '.' && p - name < namelen) p++; if (p - name > sizeof(n) - 1) return -1; /* label too long */ l = p - name; strncpy((char *)n, name, l); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') *q = *q - 'A' + 'a'; } /* generate 8 bytes of pseudo-random value. */ memset(&ctxt, 0, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, &l, sizeof(l)); MD5Update(&ctxt, n, l); MD5Final(digest, &ctxt); memset(sa6, 0, sizeof(*sa6)); sa6->sin6_family = AF_INET6; sa6->sin6_len = sizeof(*sa6); sa6->sin6_addr.s6_addr16[0] = htons(0xff02); sa6->sin6_addr.s6_addr8[11] = 2; memcpy(&sa6->sin6_addr.s6_addr32[3], digest, sizeof(sa6->sin6_addr.s6_addr32[3])); if (in6_setscope(&sa6->sin6_addr, ifp, NULL)) return -1; /* XXX: should not fail */ return 0; } /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. 
* XXX multiple link-local address case * * altifp - secondary EUI64 source */ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_addr in6; KASSERT(IFNET_LOCKED(ifp)); /* some of the interfaces are inherently not IPv6 capable */ switch (ifp->if_type) { case IFT_BRIDGE: case IFT_L2TP: case IFT_IEEE8023ADLAG: #ifdef IFT_PFLOG case IFT_PFLOG: #endif #ifdef IFT_PFSYNC case IFT_PFSYNC: #endif ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; return; } /* * if link mtu is too small, don't try to configure IPv6. * remember there could be some link-layer that has special * fragmentation logic. */ if (ifp->if_mtu < IPV6_MMTU) { nd6log(LOG_INFO, "%s has too small MTU, IPv6 not enabled\n", if_name(ifp)); return; } /* * quirks based on interface type */ switch (ifp->if_type) { #ifdef IFT_STF case IFT_STF: /* * 6to4 interface is a very special kind of beast. * no multicast, no linklocal. RFC2529 specifies how to make * linklocals for 6to4 interface, but there's no use and * it is rather harmful to have one. */ ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; return; #endif case IFT_CARP: return; default: break; } /* * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { nd6log(LOG_INFO, "%s is not multicast capable, IPv6 not enabled\n", if_name(ifp)); return; } /* * assign loopback address for loopback interface. * XXX multiple loopback interface case. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { in6 = in6addr_loopback; /* These are safe and atomic thanks to IFNET_LOCK */ if (in6ifa_ifpwithaddr(ifp, &in6) == NULL) { if (in6_ifattach_loopback(ifp) != 0) return; } } /* * assign a link-local address, if there's none. */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) { int bound = curlwp_bind(); struct psref psref; ia = in6ifa_ifpforlinklocal_psref(ifp, 0, &psref); if (ia == NULL && in6_ifattach_linklocal(ifp, altifp) != 0) { printf("%s: cannot assign link-local address\n", ifp->if_xname); } ia6_release(ia, &psref); curlwp_bindx(bound); } } /* * NOTE: in6_ifdetach() does not support loopback if at this moment. * We don't need this function in bsdi, because interfaces are never removed * from the ifnet list in bsdi. */ void in6_ifdetach(struct ifnet *ifp) { /* nuke any of IPv6 addresses we have */ if_purgeaddrs(ifp, AF_INET6, in6_purgeaddr); in6_purge_multi(ifp); /* remove ip6_mrouter stuff */ ip6_mrouter_detach(ifp); /* remove neighbor management table */ nd6_purge(ifp, NULL); }
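/*
 * Illustrative, standalone user-space sketch (not part of in6_ifattach.c):
 * for a 6-byte Ethernet address, in6_get_hw_ifid() above builds the IPv6
 * interface identifier by inserting 0xff/0xfe in the middle and flipping
 * the universal/local bit (EUI64_UBIT, 0x02), as in EUI64_TO_IFID.  The
 * function name ex_mac48_to_ifid is invented for the example.
 */
#include <stdio.h>
#include <stdint.h>

/* Derive the low 8 bytes of an IPv6 interface identifier from a MAC-48. */
static void
ex_mac48_to_ifid(const uint8_t mac[6], uint8_t ifid[8])
{
	ifid[0] = mac[0];
	ifid[1] = mac[1];
	ifid[2] = mac[2];
	ifid[3] = 0xff;		/* EUI-64 padding inserted in the middle */
	ifid[4] = 0xfe;
	ifid[5] = mac[3];
	ifid[6] = mac[4];
	ifid[7] = mac[5];
	ifid[0] ^= 0x02;	/* flip the universal/local bit */
}

int
main(void)
{
	const uint8_t mac[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	uint8_t ifid[8];
	int i;

	ex_mac48_to_ifid(mac, ifid);
	printf("fe80::");
	for (i = 0; i < 8; i += 2)
		printf("%02x%02x%s", ifid[i], ifid[i + 1], i < 6 ? ":" : "\n");
	return 0;
}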
/* $NetBSD: uvm_user.c,v 1.14 2011/02/02 15:13:34 chuck Exp $ */

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * from: Id: uvm_user.c,v 1.1.2.1 1997/08/14 19:10:41 chuck Exp
 */

/*
 * uvm_user.c: high level uvm_allocate/uvm_deallocate interface into vm.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_user.c,v 1.14 2011/02/02 15:13:34 chuck Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <uvm/uvm.h>

/*
 * uvm_deallocate: deallocate memory (unmap)
 */

void
uvm_deallocate(struct vm_map *map, vaddr_t start, vsize_t size)
{

	if (size == 0)
		return;

	uvm_unmap(map, trunc_page(start), round_page(start + size));
}
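/*
 * Illustrative, standalone user-space sketch (not part of uvm_user.c):
 * uvm_deallocate() widens its range to whole pages before unmapping.
 * The ex_trunc_page/ex_round_page macros below assume the conventional
 * power-of-two page-size masking; the EX_* names are invented for the
 * example.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SIZE	4096UL			/* assumed page size */
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)

/* Conventional page-rounding macros. */
#define ex_trunc_page(x)	((x) & ~EX_PAGE_MASK)
#define ex_round_page(x)	(((x) + EX_PAGE_MASK) & ~EX_PAGE_MASK)

int
main(void)
{
	uintptr_t start = 0x10123, size = 0x2000;

	/* The unmapped range is [trunc_page(start), round_page(start+size)). */
	printf("unmap 0x%lx .. 0x%lx\n",
	    (unsigned long)ex_trunc_page(start),
	    (unsigned long)ex_round_page(start + size));
	return 0;
}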
/*-
 * Copyright (c) 2019 Mindaugas Rasiukevicius <rmind at noxt eu>
 * Copyright (c) 2013 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NPF network interface handling.
 *
 * NPF uses its own interface IDs (npf-if-id).  These IDs start from 1.
 * Zero is reserved to indicate "no interface" case or an interface of
 * no interest (i.e. not registered).
 *
 * This module provides an interface to primarily handle the following:
 *
 * - Bind a symbolic interface name to NPF interface ID.
 * - Associate NPF interface ID when the network interface is attached.
 *
 * When NPF configuration is (re)loaded, each referenced network interface
 * name is registered with a unique ID.  If the network interface is already
 * attached, then the ID is associated with it immediately; otherwise, IDs
 * are associated/disassociated on interface events which are monitored
 * using pfil(9) hooks.
 *
 * To avoid race conditions when an active NPF configuration is updated or
 * interfaces are detached/attached, the interface names are never removed
 * and therefore IDs are never re-assigned.  The only point when interface
 * names and IDs are cleared is when the configuration is flushed.
 *
 * A linear counter is used for IDs.
*/ #ifdef _KERNEL #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: npf_if.c,v 1.13 2020/05/30 14:16:56 rmind Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/kmem.h> #include <net/if.h> #endif #include "npf_impl.h" typedef struct npf_ifmap { char ifname[IFNAMSIZ + 1]; } npf_ifmap_t; #define NPF_IFMAP_NOID (0U) #define NPF_IFMAP_SLOT2ID(npf, slot) ((npf)->ifmap_off + (slot) + 1) #define NPF_IFMAP_ID2SLOT(npf, id) \ ((id) - atomic_load_relaxed(&(npf)->ifmap_off) - 1) void npf_ifmap_init(npf_t *npf, const npf_ifops_t *ifops) { const size_t nbytes = sizeof(npf_ifmap_t) * NPF_MAX_IFMAP; KASSERT(ifops != NULL); ifops->flush(npf, (void *)(uintptr_t)0); mutex_init(&npf->ifmap_lock, MUTEX_DEFAULT, IPL_SOFTNET); npf->ifmap = kmem_zalloc(nbytes, KM_SLEEP); npf->ifmap_cnt = 0; npf->ifmap_off = 0; npf->ifops = ifops; } void npf_ifmap_fini(npf_t *npf) { const size_t nbytes = sizeof(npf_ifmap_t) * NPF_MAX_IFMAP; mutex_destroy(&npf->ifmap_lock); kmem_free(npf->ifmap, nbytes); } static unsigned npf_ifmap_lookup(npf_t *npf, const char *ifname) { KASSERT(mutex_owned(&npf->ifmap_lock)); for (unsigned i = 0; i < npf->ifmap_cnt; i++) { npf_ifmap_t *ifmap = &npf->ifmap[i]; if (strcmp(ifmap->ifname, ifname) == 0) { return NPF_IFMAP_SLOT2ID(npf, i); } } return NPF_IFMAP_NOID; } /* * npf_ifmap_register: register an interface name; return an assigned * NPF network ID on success (non-zero). * * This routine is mostly called on NPF configuration (re)load for the * interfaces names referenced by the rules. */ unsigned npf_ifmap_register(npf_t *npf, const char *ifname) { npf_ifmap_t *ifmap; unsigned id, i; ifnet_t *ifp; mutex_enter(&npf->ifmap_lock); if ((id = npf_ifmap_lookup(npf, ifname)) != NPF_IFMAP_NOID) { goto out; } if (npf->ifmap_cnt == NPF_MAX_IFMAP) { printf("npf_ifmap_new: out of slots; bump NPF_MAX_IFMAP\n"); id = NPF_IFMAP_NOID; goto out; } KASSERT(npf->ifmap_cnt < NPF_MAX_IFMAP); /* Allocate a new slot and convert and assign an ID. */ i = npf->ifmap_cnt++; ifmap = &npf->ifmap[i]; strlcpy(ifmap->ifname, ifname, IFNAMSIZ); id = NPF_IFMAP_SLOT2ID(npf, i); if ((ifp = npf->ifops->lookup(npf, ifname)) != NULL) { npf->ifops->setmeta(npf, ifp, (void *)(uintptr_t)id); } out: mutex_exit(&npf->ifmap_lock); return id; } void npf_ifmap_flush(npf_t *npf) { mutex_enter(&npf->ifmap_lock); npf->ifops->flush(npf, (void *)(uintptr_t)NPF_IFMAP_NOID); for (unsigned i = 0; i < npf->ifmap_cnt; i++) { npf->ifmap[i].ifname[0] = '\0'; } npf->ifmap_cnt = 0; /* * Reset the ID counter if reaching the overflow; this is not * realistic, but we maintain correctness. */ if (npf->ifmap_off < (UINT_MAX - NPF_MAX_IFMAP)) { npf->ifmap_off += NPF_MAX_IFMAP; } else { npf->ifmap_off = 0; } mutex_exit(&npf->ifmap_lock); } /* * npf_ifmap_getid: get the ID for the given network interface. * * => This routine is typically called from the packet handler when * matching whether the packet is on particular network interface. * * => This routine is lock-free; if the NPF configuration is flushed * while the packet is in-flight, the ID will not match because we * keep the IDs linear. */ unsigned npf_ifmap_getid(npf_t *npf, const ifnet_t *ifp) { const unsigned id = (uintptr_t)npf->ifops->getmeta(npf, ifp); return id; } /* * npf_ifmap_copylogname: this function is toxic; it can return garbage * as we don't lock, but it is only used temporarily and only for logging. 
*/ void npf_ifmap_copylogname(npf_t *npf, unsigned id, char *buf, size_t len) { const unsigned i = NPF_IFMAP_ID2SLOT(npf, id); membar_consumer(); if (id != NPF_IFMAP_NOID && i < NPF_MAX_IFMAP) { /* * Lock-free access is safe as there is an extra byte * with a permanent NUL terminator at the end. */ const npf_ifmap_t *ifmap = &npf->ifmap[i]; strlcpy(buf, ifmap->ifname, MIN(len, IFNAMSIZ)); } else { strlcpy(buf, "???", len); } } void npf_ifmap_copyname(npf_t *npf, unsigned id, char *buf, size_t len) { mutex_enter(&npf->ifmap_lock); npf_ifmap_copylogname(npf, id, buf, len); mutex_exit(&npf->ifmap_lock); } __dso_public void npfk_ifmap_attach(npf_t *npf, ifnet_t *ifp) { const npf_ifops_t *ifops = npf->ifops; unsigned id; mutex_enter(&npf->ifmap_lock); id = npf_ifmap_lookup(npf, ifops->getname(npf, ifp)); ifops->setmeta(npf, ifp, (void *)(uintptr_t)id); mutex_exit(&npf->ifmap_lock); } __dso_public void npfk_ifmap_detach(npf_t *npf, ifnet_t *ifp) { /* Diagnostic. */ mutex_enter(&npf->ifmap_lock); npf->ifops->setmeta(npf, ifp, (void *)(uintptr_t)NPF_IFMAP_NOID); mutex_exit(&npf->ifmap_lock); }
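/*
 * Illustrative, standalone user-space sketch (not part of npf_if.c):
 * the NPF_IFMAP_SLOT2ID/NPF_IFMAP_ID2SLOT arithmetic above, and why an
 * ID issued before npf_ifmap_flush() no longer matches afterwards: the
 * flush advances ifmap_off by NPF_MAX_IFMAP, so a stale ID converts to
 * a slot outside the live range.  All ex_* names and the EX_MAX_IFMAP
 * constant are simplified stand-ins invented for the example.
 */
#include <stdio.h>

#define EX_MAX_IFMAP	64U

static unsigned ex_ifmap_off;	/* plain variable standing in for npf->ifmap_off */

static unsigned ex_slot2id(unsigned slot) { return ex_ifmap_off + slot + 1; }
static unsigned ex_id2slot(unsigned id)   { return id - ex_ifmap_off - 1; }

int
main(void)
{
	unsigned id = ex_slot2id(3);		/* register: slot 3 -> ID 4 */

	printf("slot 3 -> id %u -> slot %u\n", id, ex_id2slot(id));

	ex_ifmap_off += EX_MAX_IFMAP;		/* what a flush does */
	printf("after flush, stale id %u -> slot %u (>= %u, no match)\n",
	    id, ex_id2slot(id), EX_MAX_IFMAP);
	return 0;
}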
/* $NetBSD: kern_sysctl.c,v 1.270 2023/09/09 16:01:09 christos Exp $ */

/*-
 * Copyright (c) 2003, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Brown.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.9 (Berkeley) 5/20/95 */ /* * sysctl system call. 
*/ #define __COMPAT_SYSCTL #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_sysctl.c,v 1.270 2023/09/09 16:01:09 christos Exp $"); #ifdef _KERNEL_OPT #include "opt_defcorename.h" #endif #include "ksyms.h" #include <sys/param.h> #include <sys/types.h> #include <sys/buf.h> #include <sys/cprng.h> #include <sys/kauth.h> #include <sys/ksyms.h> #include <sys/ktrace.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/once.h> #include <sys/rndsource.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <crypto/blake2/blake2s.h> #define MAXDESCLEN 1024 MALLOC_DEFINE(M_SYSCTLNODE, "sysctlnode", "sysctl node structures"); MALLOC_DEFINE(M_SYSCTLDATA, "sysctldata", "misc sysctl data"); static int sysctl_mmap(SYSCTLFN_PROTO); static int sysctl_alloc(struct sysctlnode *, int); static int sysctl_realloc(struct sysctlnode *); static int sysctl_cvt_in(struct lwp *, int *, const void *, size_t, struct sysctlnode *); static int sysctl_cvt_out(struct lwp *, int, const struct sysctlnode *, void *, size_t, size_t *); static int sysctl_log_add(struct sysctllog **, const struct sysctlnode *); static int sysctl_log_realloc(struct sysctllog *); typedef void sysctl_setup_func(struct sysctllog **); #ifdef SYSCTL_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif struct sysctllog { const struct sysctlnode *log_root; int *log_num; int log_size, log_left; }; /* * the "root" of the new sysctl tree */ struct sysctlnode sysctl_root = { .sysctl_flags = SYSCTL_VERSION| CTLFLAG_ROOT|CTLFLAG_READWRITE| CTLTYPE_NODE, .sysctl_num = 0, .sysctl_size = sizeof(struct sysctlnode), .sysctl_name = "(root)", }; /* * link set of functions that add nodes at boot time (see also * sysctl_buildtree()) */ __link_set_decl(sysctl_funcs, sysctl_setup_func); /* * The `sysctl_treelock' is intended to serialize access to the sysctl * tree. XXX This has serious problems; allocating memory and * copying data out with the lock held is insane. */ krwlock_t sysctl_treelock; kmutex_t sysctl_file_marker_lock; /* * Attributes stored in the kernel. */ char hostname[MAXHOSTNAMELEN]; int hostnamelen; char domainname[MAXHOSTNAMELEN]; int domainnamelen; long hostid; #ifndef DEFCORENAME #define DEFCORENAME "%n.core" #endif char defcorename[MAXPATHLEN] = DEFCORENAME; /* * ******************************************************************** * Section 0: Some simple glue * ******************************************************************** * By wrapping copyin(), copyout(), and copyinstr() like this, we can * stop caring about who's calling us and simplify some code a bunch. 
* ******************************************************************** */ int sysctl_copyin(struct lwp *l, const void *uaddr, void *kaddr, size_t len) { int error; if (l != NULL) { error = copyin(uaddr, kaddr, len); ktrmibio(-1, UIO_WRITE, uaddr, len, error); } else { error = kcopy(uaddr, kaddr, len); } return error; } int sysctl_copyout(struct lwp *l, const void *kaddr, void *uaddr, size_t len) { int error; if (l != NULL) { error = copyout(kaddr, uaddr, len); ktrmibio(-1, UIO_READ, uaddr, len, error); } else { error = kcopy(kaddr, uaddr, len); } return error; } int sysctl_copyinstr(struct lwp *l, const void *uaddr, void *kaddr, size_t len, size_t *done) { int error; if (l != NULL) { error = copyinstr(uaddr, kaddr, len, done); ktrmibio(-1, UIO_WRITE, uaddr, len, error); } else { error = copystr(uaddr, kaddr, len, done); } return error; } /* * ******************************************************************** * Initialize sysctl subsystem. * ******************************************************************** */ void sysctl_init(void) { sysctl_setup_func *const *sysctl_setup; rw_init(&sysctl_treelock); /* * dynamic mib numbers start here */ sysctl_root.sysctl_num = CREATE_BASE; sysctl_basenode_init(); __link_set_foreach(sysctl_setup, sysctl_funcs) { (**sysctl_setup)(NULL); } mutex_init(&sysctl_file_marker_lock, MUTEX_DEFAULT, IPL_NONE); } /* * Setting this means no more permanent nodes can be added, * trees that claim to be readonly at the root now are, and if * the main tree is readonly, *everything* is. * * Also starts up the PRNG used for the "random" sysctl: it's * better to start it later than sooner. * * Call this at the end of kernel init. */ void sysctl_finalize(void) { sysctl_root.sysctl_flags |= CTLFLAG_PERMANENT; } /* * ******************************************************************** * The main native sysctl system call itself. 
* ******************************************************************** */ int sys___sysctl(struct lwp *l, const struct sys___sysctl_args *uap, register_t *retval) { /* { syscallarg(const int *) name; syscallarg(u_int) namelen; syscallarg(void *) old; syscallarg(size_t *) oldlenp; syscallarg(const void *) new; syscallarg(size_t) newlen; } */ int error, nerror, name[CTL_MAXNAME]; size_t oldlen, savelen, *oldlenp; /* * get oldlen */ oldlen = 0; oldlenp = SCARG(uap, oldlenp); if (oldlenp != NULL) { error = copyin(oldlenp, &oldlen, sizeof(oldlen)); if (error) return (error); } savelen = oldlen; /* * top-level sysctl names may or may not be non-terminal, but * we don't care */ if (SCARG(uap, namelen) > CTL_MAXNAME || SCARG(uap, namelen) < 1) return (EINVAL); error = copyin(SCARG(uap, name), &name, SCARG(uap, namelen) * sizeof(int)); if (error) return (error); ktrmib(name, SCARG(uap, namelen)); sysctl_lock(SCARG(uap, newv) != NULL); /* * do sysctl work (NULL means main built-in default tree) */ error = sysctl_dispatch(&name[0], SCARG(uap, namelen), SCARG(uap, oldv), &oldlen, SCARG(uap, newv), SCARG(uap, newlen), &name[0], l, NULL); /* * release the sysctl lock */ sysctl_unlock(); /* * set caller's oldlen to new value even in the face of an * error (if this gets an error and they didn't have one, they * get this one) */ if (oldlenp) { nerror = copyout(&oldlen, oldlenp, sizeof(oldlen)); if (error == 0) error = nerror; } /* * if the only problem is that we weren't given enough space, * that's an ENOMEM error */ if (error == 0 && SCARG(uap, oldv) != NULL && savelen < oldlen) error = ENOMEM; return (error); } /* * ******************************************************************** * Section 1: How the tree is used * ******************************************************************** * Implementations of sysctl for emulations should typically need only * these three functions in this order: lock the tree, dispatch * request into it, unlock the tree. * ******************************************************************** */ void sysctl_lock(bool write) { if (write) { rw_enter(&sysctl_treelock, RW_WRITER); curlwp->l_pflag |= LP_SYSCTLWRITE; } else { rw_enter(&sysctl_treelock, RW_READER); curlwp->l_pflag &= ~LP_SYSCTLWRITE; } } void sysctl_relock(void) { if ((curlwp->l_pflag & LP_SYSCTLWRITE) != 0) { rw_enter(&sysctl_treelock, RW_WRITER); } else { rw_enter(&sysctl_treelock, RW_READER); } } /* * ******************************************************************** * the main sysctl dispatch routine. scans the given tree and picks a * function to call based on what it finds. * ******************************************************************** */ int sysctl_dispatch(SYSCTLFN_ARGS) { int error; sysctlfn fn; int ni; KASSERT(rw_lock_held(&sysctl_treelock)); if (rnode && SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) { printf("sysctl_dispatch: rnode %p wrong version\n", rnode); error = EINVAL; goto out; } fn = NULL; error = sysctl_locate(l, name, namelen, &rnode, &ni); if (rnode->sysctl_func != NULL) { /* * the node we ended up at has a function, so call it. it can * hand off to query or create if it wants to. */ fn = rnode->sysctl_func; } else if (error == 0) { /* * we found the node they were looking for, so do a lookup. */ fn = (sysctlfn)sysctl_lookup; /* XXX may write to rnode */ } else if (error == ENOENT && (ni + 1) == namelen && name[ni] < 0) { /* * prospective parent node found, but the terminal node was * not. generic operations associate with the parent. 
*/ switch (name[ni]) { case CTL_QUERY: fn = sysctl_query; break; case CTL_CREATE: #if NKSYMS > 0 case CTL_CREATESYM: #endif /* NKSYMS > 0 */ if (newp == NULL) { error = EINVAL; break; } KASSERT(rw_write_held(&sysctl_treelock)); fn = (sysctlfn)sysctl_create; /* we own the rnode */ break; case CTL_DESTROY: if (newp == NULL) { error = EINVAL; break; } KASSERT(rw_write_held(&sysctl_treelock)); fn = (sysctlfn)sysctl_destroy; /* we own the rnode */ break; case CTL_MMAP: fn = (sysctlfn)sysctl_mmap; /* we own the rnode */ break; case CTL_DESCRIBE: fn = sysctl_describe; break; default: error = EOPNOTSUPP; break; } } /* * after all of that, maybe we found someone who knows how to * get us what we want? */ if (fn != NULL) error = (*fn)(name + ni, namelen - ni, oldp, oldlenp, newp, newlen, name, l, rnode); else if (error == 0) error = EOPNOTSUPP; out: return (error); } /* * ******************************************************************** * Releases the tree lock. * ******************************************************************** */ void sysctl_unlock(void) { rw_exit(&sysctl_treelock); } /* * ******************************************************************** * Section 2: The main tree interfaces * ******************************************************************** * This is how sysctl_dispatch() does its work, and you can too, by * calling these routines from helpers (though typically only * sysctl_lookup() will be used). The tree MUST BE LOCKED when these * are called. * ******************************************************************** */ /* * sysctl_locate -- Finds the node matching the given mib under the * given tree (via rv). If no tree is given, we fall back to the * native tree. The current process (via l) is used for access * control on the tree (some nodes may be traversable only by root) and * on return, nip will show how many numbers in the mib were consumed. */ int sysctl_locate(struct lwp *l, const int *name, u_int namelen, const struct sysctlnode **rnode, int *nip) { const struct sysctlnode *node, *pnode; int tn, si, ni, error, alias; KASSERT(rw_lock_held(&sysctl_treelock)); /* * basic checks and setup */ if (*rnode == NULL) *rnode = &sysctl_root; if (nip) *nip = 0; if (namelen == 0) return (0); /* * search starts from "root" */ pnode = *rnode; if (SYSCTL_VERS(pnode->sysctl_flags) != SYSCTL_VERSION) { printf("sysctl_locate: pnode %p wrong version\n", pnode); return (EINVAL); } node = pnode->sysctl_child; error = 0; /* * scan for node to which new node should be attached */ for (ni = 0; ni < namelen; ni++) { /* * walked off bottom of tree */ if (node == NULL) { if (SYSCTL_TYPE(pnode->sysctl_flags) == CTLTYPE_NODE) error = ENOENT; else error = ENOTDIR; break; } /* * can anyone traverse this node or only root? */ if (l != NULL && (pnode->sysctl_flags & CTLFLAG_PRIVATE) && (error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_PRVT, NULL, NULL, NULL)) != 0) return (error); /* * find a child node with the right number */ tn = name[ni]; alias = 0; si = 0; /* * Note: ANYNUMBER only matches positive integers. * Since ANYNUMBER is only permitted on single-node * sub-trees (eg proc), check before the loop and skip * it if we can. 
*/ if ((node[si].sysctl_flags & CTLFLAG_ANYNUMBER) && (tn >= 0)) goto foundit; for (; si < pnode->sysctl_clen; si++) { if (node[si].sysctl_num == tn) { if (node[si].sysctl_flags & CTLFLAG_ALIAS) { if (alias++ == 4) break; else { tn = node[si].sysctl_alias; si = -1; } } else goto foundit; } } /* * if we ran off the end, it obviously doesn't exist */ error = ENOENT; break; /* * so far so good, move on down the line */ foundit: pnode = &node[si]; if (SYSCTL_TYPE(pnode->sysctl_flags) == CTLTYPE_NODE) node = node[si].sysctl_child; else node = NULL; } *rnode = pnode; if (nip) *nip = ni; return (error); } /* * sysctl_query -- The auto-discovery engine. Copies out the structs * describing nodes under the given node and handles overlay trees. */ int sysctl_query(SYSCTLFN_ARGS) { int error, ni, elim, v; size_t out, left, t; const struct sysctlnode *enode, *onode; struct sysctlnode qnode; KASSERT(rw_lock_held(&sysctl_treelock)); if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) { printf("sysctl_query: rnode %p wrong version\n", rnode); return (EINVAL); } if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE) return (ENOTDIR); if (namelen != 1 || name[0] != CTL_QUERY) return (EINVAL); error = 0; out = 0; left = *oldlenp; elim = 0; enode = NULL; /* * translate the given request to a current node */ error = sysctl_cvt_in(l, &v, newp, newlen, &qnode); if (error) return (error); /* * if the request specifies a version, check it */ if (qnode.sysctl_ver != 0) { enode = rnode; if (qnode.sysctl_ver != enode->sysctl_ver && qnode.sysctl_ver != sysctl_rootof(enode)->sysctl_ver) return (EINVAL); } /* * process has overlay tree */ if (l && l->l_proc->p_emul->e_sysctlovly) { enode = l->l_proc->p_emul->e_sysctlovly; elim = (name - oname); error = sysctl_locate(l, oname, elim, &enode, NULL); if (error == 0) { /* ah, found parent in overlay */ elim = enode->sysctl_clen; enode = enode->sysctl_child; } else { error = 0; elim = 0; enode = NULL; } } for (ni = 0; ni < rnode->sysctl_clen; ni++) { onode = &rnode->sysctl_child[ni]; if (enode && enode->sysctl_num == onode->sysctl_num) { if (SYSCTL_TYPE(enode->sysctl_flags) != CTLTYPE_NODE) onode = enode; if (--elim > 0) enode++; else enode = NULL; } error = sysctl_cvt_out(l, v, onode, oldp, left, &t); if (error) return (error); if (oldp != NULL) oldp = (char*)oldp + t; out += t; left -= MIN(left, t); } /* * overlay trees *MUST* be entirely consumed */ KASSERT(enode == NULL); *oldlenp = out; return (error); } /* * sysctl_create -- Adds a node (the description of which is taken * from newp) to the tree, returning a copy of it in the space pointed * to by oldp. In the event that the requested slot is already taken * (either by name or by number), the offending node is returned * instead. Yes, this is complex, but we want to make sure everything * is proper. 
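 * (requests arrive here with a name vector ending in CTL_CREATE;
 * in-kernel callers normally use the sysctl_createv() wrapper from
 * Section 3 below rather than calling this directly)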
*/ #ifdef SYSCTL_DEBUG_CREATE int _sysctl_create(SYSCTLFN_ARGS); int _sysctl_create(SYSCTLFN_ARGS) #else int sysctl_create(SYSCTLFN_ARGS) #endif { struct sysctlnode nnode, *node, *pnode; int error, ni, at, nm, type, nsz, sz, flags, anum, v; void *own; KASSERT(rw_write_held(&sysctl_treelock)); error = 0; own = NULL; anum = -1; if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) { printf("sysctl_create: rnode %p wrong version\n", rnode); return (EINVAL); } if (namelen != 1 || (name[namelen - 1] != CTL_CREATE #if NKSYMS > 0 && name[namelen - 1] != CTL_CREATESYM #endif /* NKSYMS > 0 */ )) return (EINVAL); /* * processes can only add nodes at securelevel 0, must be * root, and can't add nodes to a parent that's not writeable */ if (l != NULL) { #ifndef SYSCTL_DISALLOW_CREATE error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_ADD, NULL, NULL, NULL); if (error) return (error); if (!(rnode->sysctl_flags & CTLFLAG_READWRITE)) #endif /* SYSCTL_DISALLOW_CREATE */ return (EPERM); } /* * nothing can add a node if: * we've finished initial set up of this tree and * (the tree itself is not writeable or * the entire sysctl system is not writeable) */ if ((sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_PERMANENT) && (!(sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_READWRITE) || !(sysctl_root.sysctl_flags & CTLFLAG_READWRITE))) return (EPERM); /* * it must be a "node", not a "int" or something */ if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE) return (ENOTDIR); if (rnode->sysctl_flags & CTLFLAG_ALIAS) { printf("sysctl_create: attempt to add node to aliased " "node %p\n", rnode); return (EINVAL); } pnode = __UNCONST(rnode); /* we are adding children to this node */ if (newp == NULL) return (EINVAL); error = sysctl_cvt_in(l, &v, newp, newlen, &nnode); if (error) return (error); /* * nodes passed in don't *have* parents */ if (nnode.sysctl_parent != NULL) return (EINVAL); /* * if we are indeed adding it, it should be a "good" name and * number */ nm = nnode.sysctl_num; #if NKSYMS > 0 if (nm == CTL_CREATESYM) nm = CTL_CREATE; #endif /* NKSYMS > 0 */ if (nm < 0 && nm != CTL_CREATE) return (EINVAL); /* * the name can't start with a digit */ if (nnode.sysctl_name[0] >= '0' && nnode.sysctl_name[0] <= '9') return (EINVAL); /* * the name must be only alphanumerics or - or _, longer than * 0 bytes and less than SYSCTL_NAMELEN */ nsz = 0; while (nsz < SYSCTL_NAMELEN && nnode.sysctl_name[nsz] != '\0') { if ((nnode.sysctl_name[nsz] >= '0' && nnode.sysctl_name[nsz] <= '9') || (nnode.sysctl_name[nsz] >= 'A' && nnode.sysctl_name[nsz] <= 'Z') || (nnode.sysctl_name[nsz] >= 'a' && nnode.sysctl_name[nsz] <= 'z') || nnode.sysctl_name[nsz] == '-' || nnode.sysctl_name[nsz] == '_') nsz++; else return (EINVAL); } if (nsz == 0 || nsz == SYSCTL_NAMELEN) return (EINVAL); /* * various checks revolve around size vs type, etc */ type = SYSCTL_TYPE(nnode.sysctl_flags); flags = SYSCTL_FLAGS(nnode.sysctl_flags); sz = nnode.sysctl_size; /* * find out if there's a collision, and if so, let the caller * know what they collided with */ node = pnode->sysctl_child; at = 0; if (node) { if ((flags | node->sysctl_flags) & CTLFLAG_ANYNUMBER) /* No siblings for a CTLFLAG_ANYNUMBER node */ return EINVAL; for (ni = 0; ni < pnode->sysctl_clen; ni++) { if (nm == node[ni].sysctl_num || strcmp(nnode.sysctl_name, node[ni].sysctl_name) == 0) { /* * ignore error here, since we * are already fixed on EEXIST */ (void)sysctl_cvt_out(l, v, &node[ni], oldp, *oldlenp, oldlenp); return (EEXIST); } if (nm > 
node[ni].sysctl_num) at++; } } /* * use sysctl_ver to add to the tree iff it hasn't changed */ if (nnode.sysctl_ver != 0) { /* * a specified value must match either the parent * node's version or the root node's version */ if (nnode.sysctl_ver != sysctl_rootof(rnode)->sysctl_ver && nnode.sysctl_ver != rnode->sysctl_ver) { return (EINVAL); } } /* * only the kernel can assign functions to entries */ if (l != NULL && nnode.sysctl_func != NULL) return (EPERM); /* * only the kernel can create permanent entries, and only then * before the kernel is finished setting itself up */ if (l != NULL && (flags & ~SYSCTL_USERFLAGS)) return (EPERM); if ((flags & CTLFLAG_PERMANENT) & (sysctl_root.sysctl_flags & CTLFLAG_PERMANENT)) return (EPERM); if ((flags & (CTLFLAG_OWNDATA | CTLFLAG_IMMEDIATE)) == (CTLFLAG_OWNDATA | CTLFLAG_IMMEDIATE)) return (EINVAL); if ((flags & CTLFLAG_IMMEDIATE) && type != CTLTYPE_INT && type != CTLTYPE_QUAD && type != CTLTYPE_BOOL) return (EINVAL); /* * check size, or set it if unset and we can figure it out. * kernel created nodes are allowed to have a function instead * of a size (or a data pointer). */ switch (type) { case CTLTYPE_NODE: /* * only *i* can assert the size of a node */ if (flags & CTLFLAG_ALIAS) { anum = nnode.sysctl_alias; if (anum < 0) return (EINVAL); nnode.sysctl_alias = 0; } if (sz != 0 || nnode.sysctl_data != NULL) return (EINVAL); if (nnode.sysctl_csize != 0 || nnode.sysctl_clen != 0 || nnode.sysctl_child != 0) return (EINVAL); if (flags & CTLFLAG_OWNDATA) return (EINVAL); sz = sizeof(struct sysctlnode); break; case CTLTYPE_INT: /* * since an int is an int, if the size is not given or * is wrong, we can "int-uit" it. */ if (sz != 0 && sz != sizeof(int)) return (EINVAL); sz = sizeof(int); break; case CTLTYPE_STRING: /* * strings are a little more tricky */ if (sz == 0) { if (l == NULL) { if (nnode.sysctl_func == NULL) { if (nnode.sysctl_data == NULL) return (EINVAL); else sz = strlen(nnode.sysctl_data) + 1; } } else if (nnode.sysctl_data == NULL && flags & CTLFLAG_OWNDATA) { return (EINVAL); } else { char *vp, *e; size_t s; /* * we want a rough idea of what the * size is now */ vp = malloc(PAGE_SIZE, M_SYSCTLDATA, M_WAITOK); if (vp == NULL) return (ENOMEM); e = nnode.sysctl_data; do { error = copyinstr(e, vp, PAGE_SIZE, &s); if (error) { if (error != ENAMETOOLONG) { free(vp, M_SYSCTLDATA); return (error); } e += PAGE_SIZE; if ((e - 32 * PAGE_SIZE) > (char*)nnode.sysctl_data) { free(vp, M_SYSCTLDATA); return (ERANGE); } } } while (error != 0); sz = s + (e - (char*)nnode.sysctl_data); free(vp, M_SYSCTLDATA); } } break; case CTLTYPE_QUAD: if (sz != 0 && sz != sizeof(u_quad_t)) return (EINVAL); sz = sizeof(u_quad_t); break; case CTLTYPE_BOOL: /* * since an bool is an bool, if the size is not given or * is wrong, we can "intuit" it. */ if (sz != 0 && sz != sizeof(bool)) return (EINVAL); sz = sizeof(bool); break; case CTLTYPE_STRUCT: if (sz == 0) { if (l != NULL || nnode.sysctl_func == NULL) return (EINVAL); if (flags & CTLFLAG_OWNDATA) return (EINVAL); } break; default: return (EINVAL); } /* * at this point, if sz is zero, we *must* have a * function to go with it and we can't own it. 
*/ /* * l ptr own * 0 0 0 -> EINVAL (if no func) * 0 0 1 -> own * 0 1 0 -> kptr * 0 1 1 -> kptr * 1 0 0 -> EINVAL * 1 0 1 -> own * 1 1 0 -> kptr, no own (fault on lookup) * 1 1 1 -> uptr, own */ if (type != CTLTYPE_NODE) { if (sz != 0) { if (flags & CTLFLAG_OWNDATA) { own = malloc(sz, M_SYSCTLDATA, M_WAITOK); if (own == NULL) return ENOMEM; if (nnode.sysctl_data == NULL) memset(own, 0, sz); else { error = sysctl_copyin(l, nnode.sysctl_data, own, sz); if (error != 0) { free(own, M_SYSCTLDATA); return (error); } } } else if ((nnode.sysctl_data != NULL) && !(flags & CTLFLAG_IMMEDIATE)) { #if NKSYMS > 0 if (name[namelen - 1] == CTL_CREATESYM) { char symname[128]; /* XXX enough? */ u_long symaddr; size_t symlen; error = sysctl_copyinstr(l, nnode.sysctl_data, symname, sizeof(symname), &symlen); if (error) return (error); error = ksyms_getval(NULL, symname, &symaddr, KSYMS_EXTERN); if (error) return (error); /* EINVAL? */ nnode.sysctl_data = (void*)symaddr; } #endif /* NKSYMS > 0 */ /* * Ideally, we'd like to verify here * that this address is acceptable, * but... * * - it might be valid now, only to * become invalid later * * - it might be invalid only for the * moment and valid later * * - or something else. * * Since we can't get a good answer, * we'll just accept the address as * given, and fault on individual * lookups. */ } } else if (nnode.sysctl_func == NULL) return (EINVAL); } /* * a process can't assign a function to a node, and the kernel * can't create a node that has no function or data. * (XXX somewhat redundant check) */ if (l != NULL || nnode.sysctl_func == NULL) { if (type != CTLTYPE_NODE && !(flags & CTLFLAG_IMMEDIATE) && nnode.sysctl_data == NULL && own == NULL) return (EINVAL); } #ifdef SYSCTL_DISALLOW_KWRITE /* * a process can't create a writable node unless it refers to * new data. */ if (l != NULL && own == NULL && type != CTLTYPE_NODE && (flags & CTLFLAG_READWRITE) != CTLFLAG_READONLY && !(flags & CTLFLAG_IMMEDIATE)) return (EPERM); #endif /* SYSCTL_DISALLOW_KWRITE */ /* * make sure there's somewhere to put the new stuff. */ if (pnode->sysctl_child == NULL) { if (flags & CTLFLAG_ANYNUMBER) error = sysctl_alloc(pnode, 1); else error = sysctl_alloc(pnode, 0); if (error) { if (own != NULL) free(own, M_SYSCTLDATA); return (error); } } node = pnode->sysctl_child; /* * no collisions, so pick a good dynamic number if we need to. 
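 * (CTL_CREATE means the caller left the number up to us: we take the
 * next value of the counter kept in the root node and bump it past any
 * sibling it would collide with)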
*/ if (nm == CTL_CREATE) { nm = ++sysctl_root.sysctl_num; for (ni = 0; ni < pnode->sysctl_clen; ni++) { if (nm == node[ni].sysctl_num) { nm++; ni = -1; } else if (nm > node[ni].sysctl_num) at = ni + 1; } } /* * oops...ran out of space */ if (pnode->sysctl_clen == pnode->sysctl_csize) { error = sysctl_realloc(pnode); if (error) { if (own != NULL) free(own, M_SYSCTLDATA); return (error); } node = pnode->sysctl_child; } /* * insert new node data */ if (at < pnode->sysctl_clen) { int t; /* * move the nodes that should come after the new one */ memmove(&node[at + 1], &node[at], (pnode->sysctl_clen - at) * sizeof(struct sysctlnode)); memset(&node[at], 0, sizeof(struct sysctlnode)); node[at].sysctl_parent = pnode; /* * and...reparent any children of any moved nodes */ for (ni = at; ni <= pnode->sysctl_clen; ni++) if (node[ni].sysctl_child != NULL) for (t = 0; t < node[ni].sysctl_csize; t++) node[ni].sysctl_child[t].sysctl_parent = &node[ni]; } node = &node[at]; pnode->sysctl_clen++; strlcpy(node->sysctl_name, nnode.sysctl_name, sizeof(node->sysctl_name)); node->sysctl_num = nm; node->sysctl_size = sz; node->sysctl_flags = SYSCTL_VERSION|type|flags; /* XXX other trees */ node->sysctl_csize = 0; node->sysctl_clen = 0; if (own) { node->sysctl_data = own; node->sysctl_flags |= CTLFLAG_OWNDATA; } else if (flags & CTLFLAG_ALIAS) { node->sysctl_alias = anum; } else if (flags & CTLFLAG_IMMEDIATE) { switch (type) { case CTLTYPE_BOOL: node->sysctl_bdata = nnode.sysctl_bdata; break; case CTLTYPE_INT: node->sysctl_idata = nnode.sysctl_idata; break; case CTLTYPE_QUAD: node->sysctl_qdata = nnode.sysctl_qdata; break; } } else { node->sysctl_data = nnode.sysctl_data; node->sysctl_flags &= ~CTLFLAG_OWNDATA; } node->sysctl_func = nnode.sysctl_func; node->sysctl_child = NULL; /* node->sysctl_parent should already be done */ /* * update "version" on path to "root" */ for (; rnode->sysctl_parent != NULL; rnode = rnode->sysctl_parent) ; pnode = node; for (nm = rnode->sysctl_ver + 1; pnode != NULL; pnode = pnode->sysctl_parent) pnode->sysctl_ver = nm; /* If this fails, the node is already added - the user won't know! */ error = sysctl_cvt_out(l, v, node, oldp, *oldlenp, oldlenp); return (error); } /* * ******************************************************************** * A wrapper around sysctl_create() that prints the thing we're trying * to add. * ******************************************************************** */ #ifdef SYSCTL_DEBUG_CREATE int sysctl_create(SYSCTLFN_ARGS) { const struct sysctlnode *node; int k, v, rc, ni, nl = namelen + (name - oname); struct sysctlnode nnode; if (newp == NULL) return EINVAL; int error = sysctl_cvt_in(l, &v, newp, newlen, &nnode); if (error) return error; node = &nnode; printf("namelen %d (", nl); for (ni = 0; ni < nl - 1; ni++) printf(" %d", oname[ni]); printf(" %d )\t[%s]\tflags %08x (%08x %d %zu)\n", k = node->sysctl_num, node->sysctl_name, node->sysctl_flags, SYSCTL_FLAGS(node->sysctl_flags), SYSCTL_TYPE(node->sysctl_flags), node->sysctl_size); node = rnode; rc = _sysctl_create(SYSCTLFN_CALL(rnode)); printf("sysctl_create("); for (ni = 0; ni < nl - 1; ni++) printf(" %d", oname[ni]); printf(" %d ) returned %d\n", k, rc); return (rc); } #endif /* SYSCTL_DEBUG_CREATE */ /* * sysctl_destroy -- Removes a node (as described by newp) from the * given tree, returning (if successful) a copy of the dead node in * oldp. Since we're removing stuff, there's not much to check. 
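 * (the victim is matched primarily by number; if the request also
 * carries a name or a version, those must match too)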
*/ int sysctl_destroy(SYSCTLFN_ARGS) { struct sysctlnode *node, *pnode, onode, nnode; int ni, error, v; KASSERT(rw_write_held(&sysctl_treelock)); if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) { printf("sysctl_destroy: rnode %p wrong version\n", rnode); return (EINVAL); } error = 0; if (namelen != 1 || name[namelen - 1] != CTL_DESTROY) return (EINVAL); /* * processes can only destroy nodes at securelevel 0, must be * root, and can't remove nodes from a parent that's not * writeable */ if (l != NULL) { #ifndef SYSCTL_DISALLOW_CREATE error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_DELETE, NULL, NULL, NULL); if (error) return (error); if (!(rnode->sysctl_flags & CTLFLAG_READWRITE)) #endif /* SYSCTL_DISALLOW_CREATE */ return (EPERM); } /* * nothing can remove a node if: * the node is permanent (checked later) or * the tree itself is not writeable or * the entire sysctl system is not writeable * * note that we ignore whether setup is complete or not, * because these rules always apply. */ if (!(sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_READWRITE) || !(sysctl_root.sysctl_flags & CTLFLAG_READWRITE)) return (EPERM); if (newp == NULL) return (EINVAL); error = sysctl_cvt_in(l, &v, newp, newlen, &nnode); if (error) return (error); memset(&onode, 0, sizeof(struct sysctlnode)); node = rnode->sysctl_child; for (ni = 0; ni < rnode->sysctl_clen; ni++) { if (nnode.sysctl_num == node[ni].sysctl_num) { /* * if name specified, must match */ if (nnode.sysctl_name[0] != '\0' && strcmp(nnode.sysctl_name, node[ni].sysctl_name)) continue; /* * if version specified, must match */ if (nnode.sysctl_ver != 0 && nnode.sysctl_ver != node[ni].sysctl_ver) continue; /* * this must be the one */ break; } } if (ni == rnode->sysctl_clen) return (ENOENT); node = &node[ni]; pnode = node->sysctl_parent; /* * if the kernel says permanent, it is, so there. nyah. */ if (SYSCTL_FLAGS(node->sysctl_flags) & CTLFLAG_PERMANENT) return (EPERM); /* * can't delete non-empty nodes */ if (SYSCTL_TYPE(node->sysctl_flags) == CTLTYPE_NODE && node->sysctl_clen != 0) return (ENOTEMPTY); /* * if the node "owns" data, release it now */ if (node->sysctl_flags & CTLFLAG_OWNDATA) { if (node->sysctl_data != NULL) free(node->sysctl_data, M_SYSCTLDATA); node->sysctl_data = NULL; } if (node->sysctl_flags & CTLFLAG_OWNDESC) { if (node->sysctl_desc != NULL) /*XXXUNCONST*/ free(__UNCONST(node->sysctl_desc), M_SYSCTLDATA); node->sysctl_desc = NULL; } /* * if the node to be removed is not the last one on the list, * move the remaining nodes up, and reparent any grandchildren */ onode = *node; if (ni < pnode->sysctl_clen - 1) { int t; memmove(&pnode->sysctl_child[ni], &pnode->sysctl_child[ni + 1], (pnode->sysctl_clen - ni - 1) * sizeof(struct sysctlnode)); for (; ni < pnode->sysctl_clen - 1; ni++) if (SYSCTL_TYPE(pnode->sysctl_child[ni].sysctl_flags) == CTLTYPE_NODE) for (t = 0; t < pnode->sysctl_child[ni].sysctl_clen; t++) pnode->sysctl_child[ni].sysctl_child[t]. 
sysctl_parent = &pnode->sysctl_child[ni]; ni = pnode->sysctl_clen - 1; node = &pnode->sysctl_child[ni]; } /* * reset the space we just vacated */ memset(node, 0, sizeof(struct sysctlnode)); node->sysctl_parent = pnode; pnode->sysctl_clen--; /* * if this parent just lost its last child, nuke the creche */ if (pnode->sysctl_clen == 0) { free(pnode->sysctl_child, M_SYSCTLNODE); pnode->sysctl_csize = 0; pnode->sysctl_child = NULL; } /* * update "version" on path to "root" */ for (; rnode->sysctl_parent != NULL; rnode = rnode->sysctl_parent) ; for (ni = rnode->sysctl_ver + 1; pnode != NULL; pnode = pnode->sysctl_parent) pnode->sysctl_ver = ni; error = sysctl_cvt_out(l, v, &onode, oldp, *oldlenp, oldlenp); return (error); } /* * sysctl_lookup -- Handles copyin/copyout of new and old values. * Partial reads are globally allowed. Only root can write to things * unless the node says otherwise. */ int sysctl_lookup(SYSCTLFN_ARGS) { int error, rw; size_t sz, len; void *d; KASSERT(rw_lock_held(&sysctl_treelock)); if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) { printf("%s: rnode %p wrong version\n", __func__, rnode); return EINVAL; } if (newlen == 0) newp = NULL; error = 0; /* * you can't "look up" a node. you can "query" it, but you * can't "look it up". */ if (SYSCTL_TYPE(rnode->sysctl_flags) == CTLTYPE_NODE || namelen != 0) { DPRINTF(("%s: can't lookup a node\n", __func__)); return EINVAL; } /* * some nodes are private, so only root can look into them. */ if (l != NULL && (rnode->sysctl_flags & CTLFLAG_PRIVATE) && (error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_PRVT, NULL, NULL, NULL)) != 0) { DPRINTF(("%s: private node\n", __func__)); return error; } /* * if a node wants to be writable according to different rules * other than "only root can write to stuff unless a flag is * set", then it needs its own function which should have been * called and not us. */ if (l != NULL && newp != NULL && !(rnode->sysctl_flags & CTLFLAG_ANYWRITE) && (error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_MODIFY, NULL, NULL, NULL)) != 0) { DPRINTF(("%s: can't modify\n", __func__)); return error; } /* * is this node supposedly writable? */ rw = (rnode->sysctl_flags & CTLFLAG_READWRITE) ? 1 : 0; /* * it appears not to be writable at this time, so if someone * tried to write to it, we must tell them to go away */ if (!rw && newp != NULL) { DPRINTF(("%s: not writable\n", __func__)); return EPERM; } /* * step one, copy out the stuff we have presently */ if (rnode->sysctl_flags & CTLFLAG_IMMEDIATE) { /* * note that we discard const here because we are * modifying the contents of the node (which is okay * because it's ours) * * It also doesn't matter which field of the union we pick. */ d = __UNCONST(&rnode->sysctl_qdata); } else d = rnode->sysctl_data; if (SYSCTL_TYPE(rnode->sysctl_flags) == CTLTYPE_STRING) sz = strlen(d) + 1; /* XXX@@@ possible fault here */ else sz = rnode->sysctl_size; if (oldp != NULL) { error = sysctl_copyout(l, d, oldp, MIN(sz, *oldlenp)); if (error) { DPRINTF(("%s: bad copyout %d\n", __func__, error)); return error; } } *oldlenp = sz; /* * are we done? */ if (newp == NULL) return 0; /* * hmm...not done. must now "copy in" new value. re-adjust * sz to maximum value (strings are "weird"). */ sz = rnode->sysctl_size; switch (SYSCTL_TYPE(rnode->sysctl_flags)) { case CTLTYPE_BOOL: { bool tmp; /* * these data must be *exactly* the same size coming * in. bool may only be true or false. 
*/ if (newlen != sz) { DPRINTF(("%s: bad size %zu != %zu\n", __func__, newlen, sz)); return EINVAL; } error = sysctl_copyin(l, newp, &tmp, sz); if (error) break; if (tmp != true && tmp != false) { DPRINTF(("%s: tmp %d\n", __func__, tmp)); return EINVAL; } *(bool *)d = tmp; break; } case CTLTYPE_INT: case CTLTYPE_QUAD: case CTLTYPE_STRUCT: /* * these data must be *exactly* the same size coming * in. */ if (newlen != sz) goto bad_size; error = sysctl_copyin(l, newp, d, sz); rnd_add_data(NULL, d, sz, 0); break; case CTLTYPE_STRING: { /* * strings, on the other hand, can be shorter, and we * let userland be sloppy about the trailing nul. */ char *newbuf; /* * too much new string? */ if (newlen > sz) goto bad_size; /* * temporary copy of new inbound string */ len = MIN(sz, newlen); newbuf = malloc(len, M_SYSCTLDATA, M_WAITOK); if (newbuf == NULL) { DPRINTF(("%s: oomem %zu\n", __func__, len)); return ENOMEM; } error = sysctl_copyin(l, newp, newbuf, len); if (error) { free(newbuf, M_SYSCTLDATA); DPRINTF(("%s: copyin %d\n", __func__, error)); return error; } /* * did they NUL terminate it, or do we have space * left to do it ourselves? */ if (newbuf[len - 1] != '\0' && len == sz) { free(newbuf, M_SYSCTLDATA); DPRINTF(("%s: string too long\n", __func__)); return EINVAL; } /* * looks good, so pop it into place and zero the rest. */ if (len > 0) { memcpy(d, newbuf, len); rnd_add_data(NULL, d, len, 0); } if (sz != len) memset((char*)d + len, 0, sz - len); free(newbuf, M_SYSCTLDATA); break; } default: DPRINTF(("%s: bad type\n", __func__)); return EINVAL; } if (error) { DPRINTF(("%s: copyin %d\n", __func__, error)); } return error; bad_size: DPRINTF(("%s: bad size %zu > %zu\n", __func__, newlen, sz)); return EINVAL; } /* * sysctl_mmap -- Dispatches sysctl mmap requests to those nodes that * purport to handle it. This interface isn't fully fleshed out yet, * unfortunately. */ static int sysctl_mmap(SYSCTLFN_ARGS) { const struct sysctlnode *node; struct sysctlnode nnode; int error; int sysctl_num; if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) { printf("sysctl_mmap: rnode %p wrong version\n", rnode); return (EINVAL); } /* * let's just pretend that didn't happen, m'kay? */ if (l == NULL) return (EPERM); /* * is this a sysctlnode description of an mmap request? */ if (newp == NULL || newlen != sizeof(struct sysctlnode)) return (EINVAL); error = sysctl_copyin(l, newp, &nnode, sizeof(nnode)); if (error) return (error); /* * does the node they asked for exist? */ if (namelen != 1) return (EOPNOTSUPP); node = rnode; sysctl_num = nnode.sysctl_num; error = sysctl_locate(l, &sysctl_num, 1, &node, NULL); if (error) return (error); /* * does this node that we have found purport to handle mmap? */ if (node->sysctl_func == NULL || !(node->sysctl_flags & CTLFLAG_MMAP)) return (EOPNOTSUPP); /* * well...okay, they asked for it. */ return ((*node->sysctl_func)(SYSCTLFN_CALL(node))); } int sysctl_describe(SYSCTLFN_ARGS) { struct sysctldesc *d; void *bf; size_t sz, left, tot; int i, error, v = -1; struct sysctlnode *node; struct sysctlnode dnode; if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) { printf("sysctl_query: rnode %p wrong version\n", rnode); return (EINVAL); } if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE) return (ENOTDIR); if (namelen != 1 || name[0] != CTL_DESCRIBE) return (EINVAL); /* * get ready... 
*/ error = 0; d = bf = malloc(MAXDESCLEN, M_TEMP, M_WAITOK); if (bf == NULL) return ENOMEM; tot = 0; node = rnode->sysctl_child; left = *oldlenp; /* * no request -> all descriptions at this level * request with desc unset -> just this node * request with desc set -> set descr for this node */ if (newp != NULL) { error = sysctl_cvt_in(l, &v, newp, newlen, &dnode); if (error) goto out; if (dnode.sysctl_desc != NULL) { /* * processes cannot set descriptions above * securelevel 0. and must be root. blah * blah blah. a couple more checks are made * once we find the node we want. */ if (l != NULL) { #ifndef SYSCTL_DISALLOW_CREATE error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_DESC, NULL, NULL, NULL); if (error) goto out; #else /* SYSCTL_DISALLOW_CREATE */ error = EPERM; goto out; #endif /* SYSCTL_DISALLOW_CREATE */ } /* * find node and try to set the description on it */ for (i = 0; i < rnode->sysctl_clen; i++) if (node[i].sysctl_num == dnode.sysctl_num) break; if (i == rnode->sysctl_clen) { error = ENOENT; goto out; } node = &node[i]; /* * did the caller specify a node version? */ if (dnode.sysctl_ver != 0 && dnode.sysctl_ver != node->sysctl_ver) { error = EINVAL; goto out; } /* * okay...some rules: * (1) if setup is done and the tree is * read-only or the whole system is * read-only * (2) no one can set a description on a * permanent node (it must be set when * using createv) * (3) processes cannot *change* a description * (4) processes *can*, however, set a * description on a read-only node so that * one can be created and then described * in two steps * anything else come to mind? */ if ((sysctl_root.sysctl_flags & CTLFLAG_PERMANENT) && (!(sysctl_rootof(node)->sysctl_flags & CTLFLAG_READWRITE) || !(sysctl_root.sysctl_flags & CTLFLAG_READWRITE))) { error = EPERM; goto out; } if (node->sysctl_flags & CTLFLAG_PERMANENT) { error = EPERM; goto out; } if (l != NULL && node->sysctl_desc != NULL) { error = EPERM; goto out; } /* * right, let's go ahead. the first step is * making the description into something the * node can "own", if need be. */ if (l != NULL || dnode.sysctl_flags & CTLFLAG_OWNDESC) { char *nd, *k; k = malloc(MAXDESCLEN, M_TEMP, M_WAITOK); if (k == NULL) { error = ENOMEM; goto out; } error = sysctl_copyinstr(l, dnode.sysctl_desc, k, MAXDESCLEN, &sz); if (error) { free(k, M_TEMP); goto out; } nd = malloc(sz, M_SYSCTLDATA, M_WAITOK); if (nd == NULL) { free(k, M_TEMP); error = ENOMEM; goto out; } memcpy(nd, k, sz); dnode.sysctl_flags |= CTLFLAG_OWNDESC; dnode.sysctl_desc = nd; free(k, M_TEMP); } /* * now "release" the old description and * attach the new one. ta-da. */ if ((node->sysctl_flags & CTLFLAG_OWNDESC) && node->sysctl_desc != NULL) /*XXXUNCONST*/ free(__UNCONST(node->sysctl_desc), M_SYSCTLDATA); node->sysctl_desc = dnode.sysctl_desc; node->sysctl_flags |= (dnode.sysctl_flags & CTLFLAG_OWNDESC); /* * now we "fall out" and into the loop which * will copy the new description back out for * those interested parties */ } } /* * scan for one description or just retrieve all descriptions */ for (i = 0; i < rnode->sysctl_clen; i++) { /* * did they ask for the description of only one node? */ if (v != -1 && node[i].sysctl_num != dnode.sysctl_num) continue; /* * don't describe "private" nodes to non-suser users */ if ((node[i].sysctl_flags & CTLFLAG_PRIVATE) && (l != NULL) && !(kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_PRVT, NULL, NULL, NULL))) continue; /* * is this description "valid"? 
*/ memset(bf, 0, MAXDESCLEN); if (node[i].sysctl_desc == NULL) sz = 1; else if (copystr(node[i].sysctl_desc, &d->descr_str[0], MAXDESCLEN - sizeof(*d), &sz) != 0) { /* * erase possible partial description */ memset(bf, 0, MAXDESCLEN); sz = 1; } /* * we've got it, stuff it into the caller's buffer */ d->descr_num = node[i].sysctl_num; d->descr_ver = node[i].sysctl_ver; d->descr_len = sz; /* includes trailing nul */ sz = (char *)NEXT_DESCR(d) - (char *)d; if (oldp != NULL && left >= sz) { error = sysctl_copyout(l, d, oldp, sz); if (error) goto out; left -= sz; oldp = (void *)__sysc_desc_adv(oldp, d->descr_len); } tot += sz; /* * if we get this far with v not "unset", they asked * for a specific node and we found it */ if (v != -1) break; } /* * did we find it after all? */ if (v != -1 && tot == 0) error = ENOENT; else *oldlenp = tot; out: free(bf, M_TEMP); return (error); } /* * ******************************************************************** * Section 3: Create and destroy from inside the kernel * ******************************************************************** * sysctl_createv() and sysctl_destroyv() are simpler-to-use * interfaces for the kernel to fling new entries into the mib and rip * them out later. In the case of sysctl_createv(), the returned copy * of the node (see sysctl_create()) will be translated back into a * pointer to the actual node. * * Note that sysctl_createv() will return 0 if the create request * matches an existing node (ala mkdir -p), and that sysctl_destroyv() * will return 0 if the node to be destroyed already does not exist * (aka rm -f) or if it is a parent of other nodes. * * This allows two (or more) different subsystems to assert sub-tree * existence before populating their own nodes, and to remove their * own nodes without orphaning the others when they are done. * ******************************************************************** */ #undef sysctl_createv int sysctl_createv(struct sysctllog **log, int cflags, const struct sysctlnode **rnode, const struct sysctlnode **cnode, int flags, int type, const char *namep, const char *descr, sysctlfn func, u_quad_t qv, void *newp, size_t newlen, ...) { va_list ap; int error, ni, namelen, name[CTL_MAXNAME]; const struct sysctlnode *root, *pnode; struct sysctlnode nnode, onode, *dnode; size_t sz; const struct sysctlnode *snode __diagused; /* * where are we putting this? */ if (rnode != NULL && *rnode == NULL) { printf("sysctl_createv: rnode NULL\n"); return (EINVAL); } root = rnode ? *rnode : NULL; if (cnode != NULL) *cnode = NULL; if (cflags != 0) return (EINVAL); /* * what is it? */ flags = SYSCTL_VERSION|SYSCTL_TYPE(type)|SYSCTL_FLAGS(flags); if (log != NULL) flags &= ~CTLFLAG_PERMANENT; /* * where do we put it? */ va_start(ap, newlen); namelen = 0; error = 0; ni = -1; do { if (++ni == CTL_MAXNAME) { error = ENAMETOOLONG; break; } name[ni] = va_arg(ap, int); /* * sorry, this is not supported from here */ if (name[ni] == CTL_CREATESYM) { error = EINVAL; break; } } while (name[ni] != CTL_EOL && name[ni] != CTL_CREATE); va_end(ap); if (error) return error; namelen = ni + (name[ni] == CTL_CREATE ? 
1 : 0); /* * what's it called */ if (strlcpy(nnode.sysctl_name, namep, sizeof(nnode.sysctl_name)) >= sizeof(nnode.sysctl_name)) return (ENAMETOOLONG); /* * cons up the description of the new node */ nnode.sysctl_num = name[namelen - 1]; name[namelen - 1] = CTL_CREATE; nnode.sysctl_size = newlen; nnode.sysctl_flags = flags; if (type == CTLTYPE_NODE) { nnode.sysctl_csize = 0; nnode.sysctl_clen = 0; nnode.sysctl_child = NULL; if (flags & CTLFLAG_ALIAS) nnode.sysctl_alias = qv; } else if (flags & CTLFLAG_IMMEDIATE) { switch (type) { case CTLTYPE_BOOL: nnode.sysctl_bdata = qv; break; case CTLTYPE_INT: nnode.sysctl_idata = qv; break; case CTLTYPE_QUAD: nnode.sysctl_qdata = qv; break; default: return (EINVAL); } } else { nnode.sysctl_data = newp; } nnode.sysctl_func = func; nnode.sysctl_parent = NULL; nnode.sysctl_ver = 0; /* * initialize lock state -- we need locks if the main tree has * been marked as complete, but since we could be called from * either there, or from a device driver (say, at device * insertion), or from a module (at module load time, say), we * don't really want to "wait"... */ sysctl_lock(true); /* * locate the prospective parent of the new node, and if we * find it, add the new node. */ sz = sizeof(onode); pnode = root; error = sysctl_locate(NULL, &name[0], namelen - 1, &pnode, &ni); if (error) { /* * XXX: If you are seeing this printf in early bringup * stages, perhaps your setfault is not functioning and * thus kcopy() is mis-behaving. */ printf("sysctl_createv: sysctl_locate(%s) returned %d\n", nnode.sysctl_name, error); sysctl_unlock(); return (error); } error = sysctl_create(&name[ni], namelen - ni, &onode, &sz, &nnode, sizeof(nnode), &name[0], NULL, pnode); /* * unfortunately the node we wanted to create is already * there. if the node that's already there is a reasonable * facsimile of the node we wanted to create, just pretend * (for the caller's benefit) that we managed to create the * node they wanted. */ if (error == EEXIST) { /* name is the same as requested... */ if (strcmp(nnode.sysctl_name, onode.sysctl_name) == 0 && /* they want the same function... */ nnode.sysctl_func == onode.sysctl_func && /* number is the same as requested, or... */ (nnode.sysctl_num == onode.sysctl_num || /* they didn't pick a number... */ nnode.sysctl_num == CTL_CREATE)) { /* * collision here from trying to create * something that already existed; let's give * our customers a hand and tell them they got * what they wanted. */ #ifdef SYSCTL_DEBUG_CREATE printf("cleared\n"); #endif /* SYSCTL_DEBUG_CREATE */ error = 0; } } if (error == 0 && (cnode != NULL || log != NULL || descr != NULL)) { /* * sysctl_create() gave us back a copy of the node, * but we need to know where it actually is... */ pnode = root; error = sysctl_locate(NULL, &name[0], namelen - 1, &pnode, &ni); snode = pnode; /* * manual scan of last layer so that aliased nodes * aren't followed. */ if (error == 0) { for (ni = 0; ni < pnode->sysctl_clen; ni++) if (pnode->sysctl_child[ni].sysctl_num == onode.sysctl_num) break; if (ni < pnode->sysctl_clen) pnode = &pnode->sysctl_child[ni]; else error = ENOENT; } /* * not expecting an error here, but... 
*/ if (error == 0) { KASSERTMSG(pnode->sysctl_parent == snode, "sysctl parent mis-match pnode %s, snode %s", pnode->sysctl_name, snode->sysctl_name); if (log != NULL) sysctl_log_add(log, pnode); if (cnode != NULL) *cnode = pnode; if (descr != NULL) { /* * allow first caller to *set* a * description actually to set it * * discard const here so we can attach * the description */ dnode = __UNCONST(pnode); if (pnode->sysctl_desc != NULL) /* skip it...we've got one */; else if (flags & CTLFLAG_OWNDESC) { size_t l = strlen(descr) + 1; char *d = malloc(l, M_SYSCTLDATA, M_WAITOK); if (d != NULL) { memcpy(d, descr, l); dnode->sysctl_desc = d; dnode->sysctl_flags |= CTLFLAG_OWNDESC; } } else dnode->sysctl_desc = descr; } } else { printf("sysctl_create succeeded but node not found?!\n"); /* * confusing, but the create said it * succeeded, so... */ error = 0; } } /* * now it should be safe to release the lock state. note that * the pointer to the newly created node being passed back may * not be "good" for very long. */ sysctl_unlock(); if (error != 0) { printf("sysctl_createv: sysctl_create(%s) returned %d\n", nnode.sysctl_name, error); #if 0 if (error != ENOENT) sysctl_dump(&onode); #endif } return (error); } int sysctl_destroyv(struct sysctlnode *rnode, ...) { va_list ap; int error, name[CTL_MAXNAME], namelen, ni; const struct sysctlnode *pnode, *node; struct sysctlnode dnode, *onode; size_t sz; va_start(ap, rnode); namelen = 0; ni = 0; do { if (ni == CTL_MAXNAME) { va_end(ap); return (ENAMETOOLONG); } name[ni] = va_arg(ap, int); } while (name[ni++] != CTL_EOL); namelen = ni - 1; va_end(ap); /* * i can't imagine why we'd be destroying a node when the tree * wasn't complete, but who knows? */ sysctl_lock(true); /* * where is it? */ node = rnode; error = sysctl_locate(NULL, &name[0], namelen - 1, &node, &ni); if (error) { /* they want it gone and it's not there, so... */ sysctl_unlock(); return (error == ENOENT ? 0 : error); } /* * set up the deletion */ pnode = node; node = &dnode; memset(&dnode, 0, sizeof(dnode)); dnode.sysctl_flags = SYSCTL_VERSION; dnode.sysctl_num = name[namelen - 1]; /* * we found it, now let's nuke it */ name[namelen - 1] = CTL_DESTROY; sz = 0; error = sysctl_destroy(&name[namelen - 1], 1, NULL, &sz, node, sizeof(*node), &name[0], NULL, pnode); if (error == ENOTEMPTY) { /* * think of trying to delete "foo" when "foo.bar" * (which someone else put there) is still in * existence */ error = 0; /* * dunno who put the description there, but if this * node can ever be removed, we need to make sure the * string doesn't go out of context. that means we * need to find the node that's still there (don't use * sysctl_locate() because that follows aliasing). */ node = pnode->sysctl_child; for (ni = 0; ni < pnode->sysctl_clen; ni++) if (node[ni].sysctl_num == dnode.sysctl_num) break; node = (ni < pnode->sysctl_clen) ? &node[ni] : NULL; /* * if we found it, and this node has a description, * and this node can be released, and it doesn't * already own its own description...sigh. :) */ if (node != NULL && node->sysctl_desc != NULL && !(node->sysctl_flags & CTLFLAG_PERMANENT) && !(node->sysctl_flags & CTLFLAG_OWNDESC)) { char *d; sz = strlen(node->sysctl_desc) + 1; d = malloc(sz, M_SYSCTLDATA, M_WAITOK); if (d != NULL) { /* * discard const so that we can * re-attach the description */ memcpy(d, node->sysctl_desc, sz); onode = __UNCONST(node); onode->sysctl_desc = d; onode->sysctl_flags |= CTLFLAG_OWNDESC; } else { /* * XXX drop the description? be * afraid? don't care? 
*/ } } } sysctl_unlock(); return (error); } /* * ******************************************************************** * Deletes an entire n-ary tree. Not recommended unless you know why * you're doing it. Personally, I don't know why you'd even think * about it. * ******************************************************************** */ void sysctl_free(struct sysctlnode *rnode) { struct sysctlnode *node, *pnode; rw_enter(&sysctl_treelock, RW_WRITER); if (rnode == NULL) rnode = &sysctl_root; if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) { printf("sysctl_free: rnode %p wrong version\n", rnode); rw_exit(&sysctl_treelock); return; } pnode = rnode; node = pnode->sysctl_child; do { while (node != NULL && pnode->sysctl_csize > 0) { while (node < &pnode->sysctl_child[pnode->sysctl_clen] && (SYSCTL_TYPE(node->sysctl_flags) != CTLTYPE_NODE || node->sysctl_csize == 0)) { if (SYSCTL_FLAGS(node->sysctl_flags) & CTLFLAG_OWNDATA) { if (node->sysctl_data != NULL) { free(node->sysctl_data, M_SYSCTLDATA); node->sysctl_data = NULL; } } if (SYSCTL_FLAGS(node->sysctl_flags) & CTLFLAG_OWNDESC) { if (node->sysctl_desc != NULL) { /*XXXUNCONST*/ free(__UNCONST(node->sysctl_desc), M_SYSCTLDATA); node->sysctl_desc = NULL; } } node++; } if (node < &pnode->sysctl_child[pnode->sysctl_clen]) { pnode = node; node = node->sysctl_child; } else break; } if (pnode->sysctl_child != NULL) free(pnode->sysctl_child, M_SYSCTLNODE); pnode->sysctl_clen = 0; pnode->sysctl_csize = 0; pnode->sysctl_child = NULL; node = pnode; pnode = node->sysctl_parent; } while (pnode != NULL && node != rnode); rw_exit(&sysctl_treelock); } void sysctl_log_print(const struct sysctllog *slog) { int i, len; printf("root %p left %d size %d content", (const void *)slog->log_root, slog->log_left, slog->log_size); for (len = 0, i = slog->log_left; i < slog->log_size; i++) { switch (len) { case 0: len = -1; printf(" version %d", slog->log_num[i]); break; case -1: len = -2; printf(" type %d", slog->log_num[i]); break; case -2: len = slog->log_num[i]; printf(" len %d:", slog->log_num[i]); if (len <= 0) len = -1; break; default: len--; printf(" %d", slog->log_num[i]); break; } } printf(" end\n"); } int sysctl_log_add(struct sysctllog **logp, const struct sysctlnode *node) { const int size0 = 16; int name[CTL_MAXNAME], namelen, i; const struct sysctlnode *pnode; struct sysctllog *log; if (node->sysctl_flags & CTLFLAG_PERMANENT) return (0); if (logp == NULL) return (0); if (*logp == NULL) { log = malloc(sizeof(struct sysctllog), M_SYSCTLDATA, M_WAITOK); if (log == NULL) { /* XXX print error message? */ return (-1); } log->log_num = malloc(size0 * sizeof(int), M_SYSCTLDATA, M_WAITOK); if (log->log_num == NULL) { /* XXX print error message? */ free(log, M_SYSCTLDATA); return (-1); } memset(log->log_num, 0, size0 * sizeof(int)); log->log_root = NULL; log->log_size = size0; log->log_left = size0; *logp = log; } else log = *logp; /* * check that the root is proper. it's okay to record the * address of the root of a tree. it's the only thing that's * guaranteed not to shift around as nodes come and go. */ if (log->log_root == NULL) log->log_root = sysctl_rootof(node); else if (log->log_root != sysctl_rootof(node)) { printf("sysctl: log %p root mismatch (%p)\n", log->log_root, sysctl_rootof(node)); return (-1); } /* * we will copy out name in reverse order */ for (pnode = node, namelen = 0; pnode != NULL && !(pnode->sysctl_flags & CTLFLAG_ROOT); pnode = pnode->sysctl_parent) name[namelen++] = pnode->sysctl_num; /* * do we have space? 
*/ if (log->log_left < (namelen + 3)) sysctl_log_realloc(log); if (log->log_left < (namelen + 3)) return (-1); /* * stuff name in, then namelen, then node type, and finally, * the version for non-node nodes. */ for (i = 0; i < namelen && i < CTL_MAXNAME; i++) log->log_num[--log->log_left] = name[i]; log->log_num[--log->log_left] = namelen; log->log_num[--log->log_left] = SYSCTL_TYPE(node->sysctl_flags); if (log->log_num[log->log_left] != CTLTYPE_NODE) log->log_num[--log->log_left] = node->sysctl_ver; else log->log_num[--log->log_left] = 0; return (0); } void sysctl_teardown(struct sysctllog **logp) { const struct sysctlnode *rnode; struct sysctlnode node; struct sysctllog *log; uint namelen; int *name, t, v, error, ni; size_t sz; if (logp == NULL || *logp == NULL) return; log = *logp; rw_enter(&sysctl_treelock, RW_WRITER); memset(&node, 0, sizeof(node)); while (log->log_left < log->log_size) { KASSERT(log->log_left + 3 < log->log_size); KASSERT(log->log_left + log->log_num[log->log_left + 2] <= log->log_size); v = log->log_num[log->log_left++]; t = log->log_num[log->log_left++]; namelen = log->log_num[log->log_left++]; name = &log->log_num[log->log_left]; node.sysctl_num = name[namelen - 1]; node.sysctl_flags = SYSCTL_VERSION|t; node.sysctl_ver = v; rnode = log->log_root; error = sysctl_locate(NULL, &name[0], namelen, &rnode, &ni); if (error == 0) { name[namelen - 1] = CTL_DESTROY; rnode = rnode->sysctl_parent; sz = 0; (void)sysctl_destroy(&name[namelen - 1], 1, NULL, &sz, &node, sizeof(node), &name[0], NULL, rnode); } log->log_left += namelen; } KASSERT(log->log_size == log->log_left); free(log->log_num, M_SYSCTLDATA); free(log, M_SYSCTLDATA); *logp = NULL; rw_exit(&sysctl_treelock); } /* * ******************************************************************** * old_sysctl -- A routine to bridge old-style internal calls to the * new infrastructure. * ******************************************************************** */ int old_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen, struct lwp *l) { int error; size_t oldlen = 0; size_t savelen; if (oldlenp) { oldlen = *oldlenp; } savelen = oldlen; sysctl_lock(newp != NULL); error = sysctl_dispatch(name, namelen, oldp, &oldlen, newp, newlen, name, l, NULL); sysctl_unlock(); if (error == 0 && oldp != NULL && savelen < oldlen) error = ENOMEM; if (oldlenp) { *oldlenp = oldlen; } return (error); } /* * ******************************************************************** * Section 4: Generic helper routines * ******************************************************************** * "helper" routines that can do more finely grained access control, * construct structures from disparate information, create the * appearance of more nodes and sub-trees, etc. for example, if * CTL_PROC wanted a helper function, it could respond to a CTL_QUERY * with a dynamically created list of nodes that represented the * currently running processes at that instant. 
* ******************************************************************** */ /* * first, a few generic helpers that provide: * * sysctl_needfunc() a readonly interface that emits a warning * sysctl_notavail() returns EOPNOTSUPP (generic error) * sysctl_null() an empty return buffer with no error */ int sysctl_needfunc(SYSCTLFN_ARGS) { int error; printf("!!SYSCTL_NEEDFUNC!!\n"); if (newp != NULL || namelen != 0) return (EOPNOTSUPP); error = 0; if (oldp != NULL) error = sysctl_copyout(l, rnode->sysctl_data, oldp, MIN(rnode->sysctl_size, *oldlenp)); *oldlenp = rnode->sysctl_size; return (error); } int sysctl_notavail(SYSCTLFN_ARGS) { if (namelen == 1 && name[0] == CTL_QUERY) return (sysctl_query(SYSCTLFN_CALL(rnode))); return (EOPNOTSUPP); } int sysctl_null(SYSCTLFN_ARGS) { *oldlenp = 0; return (0); } u_int sysctl_map_flags(const u_int *map, u_int word) { u_int rv; for (rv = 0; *map != 0; map += 2) if ((word & map[0]) != 0) rv |= map[1]; return rv; } /* * ******************************************************************** * Section 5: The machinery that makes it all go * ******************************************************************** * Memory "manglement" routines. Not much to this, eh? * ******************************************************************** */ static int sysctl_alloc(struct sysctlnode *p, int x) { int i; struct sysctlnode *n; assert(p->sysctl_child == NULL); if (x == 1) n = malloc(sizeof(struct sysctlnode), M_SYSCTLNODE, M_WAITOK); else n = malloc(SYSCTL_DEFSIZE * sizeof(struct sysctlnode), M_SYSCTLNODE, M_WAITOK); if (n == NULL) return (ENOMEM); if (x == 1) { memset(n, 0, sizeof(struct sysctlnode)); p->sysctl_csize = 1; } else { memset(n, 0, SYSCTL_DEFSIZE * sizeof(struct sysctlnode)); p->sysctl_csize = SYSCTL_DEFSIZE; } p->sysctl_clen = 0; for (i = 0; i < p->sysctl_csize; i++) n[i].sysctl_parent = p; p->sysctl_child = n; return (0); } static int sysctl_realloc(struct sysctlnode *p) { int i, j, olen; struct sysctlnode *n; assert(p->sysctl_csize == p->sysctl_clen); /* * how many do we have...how many should we make? 
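 * (we simply double the child array each time it fills up)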
*/ olen = p->sysctl_clen; n = malloc(2 * olen * sizeof(struct sysctlnode), M_SYSCTLNODE, M_WAITOK); if (n == NULL) return (ENOMEM); /* * move old children over...initialize new children */ memcpy(n, p->sysctl_child, olen * sizeof(struct sysctlnode)); memset(&n[olen], 0, olen * sizeof(struct sysctlnode)); p->sysctl_csize = 2 * olen; /* * reattach moved (and new) children to parent; if a moved * child node has children, reattach the parent pointers of * grandchildren */ for (i = 0; i < p->sysctl_csize; i++) { n[i].sysctl_parent = p; if (n[i].sysctl_child != NULL) { for (j = 0; j < n[i].sysctl_csize; j++) n[i].sysctl_child[j].sysctl_parent = &n[i]; } } /* * get out with the old and in with the new */ free(p->sysctl_child, M_SYSCTLNODE); p->sysctl_child = n; return (0); } static int sysctl_log_realloc(struct sysctllog *log) { int *n, s, d; s = log->log_size * 2; d = log->log_size; n = malloc(s * sizeof(int), M_SYSCTLDATA, M_WAITOK); if (n == NULL) return (-1); memset(n, 0, s * sizeof(int)); memcpy(&n[d], log->log_num, d * sizeof(int)); free(log->log_num, M_SYSCTLDATA); log->log_num = n; if (d) log->log_left += d; else log->log_left = s; log->log_size = s; return (0); } /* * ******************************************************************** * Section 6: Conversion between API versions wrt the sysctlnode * ******************************************************************** */ static int sysctl_cvt_in(struct lwp *l, int *vp, const void *i, size_t sz, struct sysctlnode *node) { int error, flags; if (i == NULL || sz < sizeof(flags)) return (EINVAL); error = sysctl_copyin(l, i, &flags, sizeof(flags)); if (error) return (error); #if (SYSCTL_VERSION != SYSCTL_VERS_1) #error sysctl_cvt_in: no support for SYSCTL_VERSION #endif /* (SYSCTL_VERSION != SYSCTL_VERS_1) */ if (sz == sizeof(*node) && SYSCTL_VERS(flags) == SYSCTL_VERSION) { error = sysctl_copyin(l, i, node, sizeof(*node)); if (error) return (error); *vp = SYSCTL_VERSION; return (0); } return (EINVAL); } static int sysctl_cvt_out(struct lwp *l, int v, const struct sysctlnode *i, void *ovp, size_t left, size_t *szp) { size_t sz = sizeof(*i); const void *src = i; int error; switch (v) { case SYSCTL_VERS_0: return (EINVAL); #if (SYSCTL_VERSION != SYSCTL_VERS_1) #error sysctl_cvt_out: no support for SYSCTL_VERSION #endif /* (SYSCTL_VERSION != SYSCTL_VERS_1) */ case SYSCTL_VERSION: /* nothing more to do here */ break; } if (ovp != NULL && left >= sz) { error = sysctl_copyout(l, src, ovp, sz); if (error) return (error); } if (szp != NULL) *szp = sz; return (0); } static uint8_t address_key[32]; /* key used in address hashing */ static ONCE_DECL(random_inithook); static int random_address_init(void) { cprng_strong(kern_cprng, address_key, sizeof(address_key), 0); return 0; } void hash_value_ensure_initialized(void) { RUN_ONCE(&random_inithook, random_address_init); } void hash_value(void *d, size_t ds, const void *s, size_t ss) { blake2s(d, ds, address_key, sizeof(address_key), s, ss); }
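/*
 * ********************************************************************
 * Illustrative sketch (not part of the kernel sources, hence disabled):
 * how a subsystem or module might typically use the Section 3
 * interfaces above.  The "kern.example" subtree, the variable names
 * and both functions are hypothetical; only sysctl_createv(),
 * sysctl_teardown() and SYSCTL_DESCR() are real interfaces.
 * ********************************************************************
 */
#if 0
static int example_value;
static struct sysctllog *example_log;

static void
example_sysctl_setup(void)
{
	const struct sysctlnode *node = NULL;

	/*
	 * assert the parent node ("mkdir -p" semantics: this also
	 * succeeds if some other subsystem created it first).  note
	 * that CTLFLAG_PERMANENT is cleared automatically because a
	 * log is supplied.
	 */
	sysctl_createv(&example_log, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "example",
	    SYSCTL_DESCR("example subtree"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);
	if (node == NULL)
		return;

	/* a read-write integer leaf backed directly by example_value */
	sysctl_createv(&example_log, 0, &node, NULL,
	    CTLFLAG_READWRITE,
	    CTLTYPE_INT, "value",
	    SYSCTL_DESCR("example tunable"),
	    NULL, 0, &example_value, 0,
	    CTL_CREATE, CTL_EOL);
}

static void
example_sysctl_detach(void)
{
	/* tear down everything recorded in example_log */
	sysctl_teardown(&example_log);
}
#endif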
/* $NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $ */ /*- * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95 */ /* * System calls on descriptors. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/namei.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/kmem.h> #include <sys/pool.h> #include <sys/syslog.h> #include <sys/unistd.h> #include <sys/resourcevar.h> #include <sys/conf.h> #include <sys/event.h> #include <sys/kauth.h> #include <sys/atomic.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <uvm/uvm_readahead.h> /* * Duplicate a file descriptor. */ int sys_dup(struct lwp *l, const struct sys_dup_args *uap, register_t *retval) { /* { syscallarg(int) fd; } */ int error, newfd, oldfd; file_t *fp; oldfd = SCARG(uap, fd); if ((fp = fd_getfile(oldfd)) == NULL) { return EBADF; } error = fd_dup(fp, 0, &newfd, false); fd_putfile(oldfd); *retval = newfd; return error; } /* * Duplicate a file descriptor to a particular value. */ int dodup(struct lwp *l, int from, int to, int flags, register_t *retval) { int error; file_t *fp; if ((fp = fd_getfile(from)) == NULL) return EBADF; mutex_enter(&fp->f_lock); fp->f_count++; mutex_exit(&fp->f_lock); fd_putfile(from); if ((u_int)to >= curproc->p_rlimit[RLIMIT_NOFILE].rlim_cur || (u_int)to >= maxfiles) error = EBADF; else if (from == to) error = 0; else error = fd_dup2(fp, to, flags); closef(fp); *retval = to; return error; } int sys_dup3(struct lwp *l, const struct sys_dup3_args *uap, register_t *retval) { /* { syscallarg(int) from; syscallarg(int) to; syscallarg(int) flags; } */ return dodup(l, SCARG(uap, from), SCARG(uap, to), SCARG(uap, flags), retval); } int sys_dup2(struct lwp *l, const struct sys_dup2_args *uap, register_t *retval) { /* { syscallarg(int) from; syscallarg(int) to; } */ return dodup(l, SCARG(uap, from), SCARG(uap, to), 0, retval); } /* * fcntl call which is being passed to the file's fs. */ static int fcntl_forfs(int fd, file_t *fp, int cmd, void *arg) { int error; u_int size; void *data, *memp; #define STK_PARAMS 128 char stkbuf[STK_PARAMS]; if ((fp->f_flag & (FREAD | FWRITE)) == 0) return (EBADF); /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. */ size = (size_t)F_PARAM_LEN(cmd); if (size > F_PARAM_MAX) return (EINVAL); memp = NULL; if (size > sizeof(stkbuf)) { memp = kmem_alloc(size, KM_SLEEP); data = memp; } else data = stkbuf; if (cmd & F_FSIN) { if (size) { error = copyin(arg, data, size); if (error) { if (memp) kmem_free(memp, size); return (error); } } else *(void **)data = arg; } else if ((cmd & F_FSOUT) != 0 && size != 0) { /* * Zero the buffer so the user always * gets back something deterministic. */ memset(data, 0, size); } else if (cmd & F_FSVOID) *(void **)data = arg; error = (*fp->f_ops->fo_fcntl)(fp, cmd, data); /* * Copy any data to user, size was * already set and checked above. 
*/ if (error == 0 && (cmd & F_FSOUT) && size) error = copyout(data, arg, size); if (memp) kmem_free(memp, size); return (error); } int do_fcntl_lock(int fd, int cmd, struct flock *fl) { struct file *fp = NULL; proc_t *p; int (*fo_advlock)(struct file *, void *, int, struct flock *, int); int error, flg; if ((fp = fd_getfile(fd)) == NULL) { error = EBADF; goto out; } if ((fo_advlock = fp->f_ops->fo_advlock) == NULL) { error = EINVAL; goto out; } flg = F_POSIX; p = curproc; switch (cmd) { case F_SETLKW: flg |= F_WAIT; /* Fall into F_SETLK */ /* FALLTHROUGH */ case F_SETLK: switch (fl->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } if ((p->p_flag & PK_ADVLOCK) == 0) { mutex_enter(p->p_lock); p->p_flag |= PK_ADVLOCK; mutex_exit(p->p_lock); } error = (*fo_advlock)(fp, p, F_SETLK, fl, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } if ((p->p_flag & PK_ADVLOCK) == 0) { mutex_enter(p->p_lock); p->p_flag |= PK_ADVLOCK; mutex_exit(p->p_lock); } error = (*fo_advlock)(fp, p, F_SETLK, fl, flg); break; case F_UNLCK: error = (*fo_advlock)(fp, p, F_UNLCK, fl, F_POSIX); break; default: error = EINVAL; break; } break; case F_GETLK: if (fl->l_type != F_RDLCK && fl->l_type != F_WRLCK && fl->l_type != F_UNLCK) { error = EINVAL; break; } error = (*fo_advlock)(fp, p, F_GETLK, fl, F_POSIX); break; default: error = EINVAL; break; } out: if (fp) fd_putfile(fd); return error; } /* * The file control system call. */ int sys_fcntl(struct lwp *l, const struct sys_fcntl_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) cmd; syscallarg(void *) arg; } */ int fd, i, tmp, error, cmd, newmin; filedesc_t *fdp; fdtab_t *dt; file_t *fp; char *kpath; struct flock fl; bool cloexec = false; fd = SCARG(uap, fd); cmd = SCARG(uap, cmd); fdp = l->l_fd; error = 0; switch (cmd) { case F_CLOSEM: if (fd < 0) return EBADF; while ((i = fdp->fd_lastfile) >= fd) { if (fd_getfile(i) == NULL) { /* Another thread has updated. */ continue; } fd_close(i); } return 0; case F_MAXFD: *retval = fdp->fd_lastfile; return 0; case F_SETLKW: case F_SETLK: case F_GETLK: error = copyin(SCARG(uap, arg), &fl, sizeof(fl)); if (error) return error; error = do_fcntl_lock(fd, cmd, &fl); if (cmd == F_GETLK && error == 0) error = copyout(&fl, SCARG(uap, arg), sizeof(fl)); return error; default: /* Handled below */ break; } if ((fp = fd_getfile(fd)) == NULL) return EBADF; if ((cmd & F_FSCTL)) { error = fcntl_forfs(fd, fp, cmd, SCARG(uap, arg)); fd_putfile(fd); return error; } switch (cmd) { case F_DUPFD_CLOEXEC: cloexec = true; /*FALLTHROUGH*/ case F_DUPFD: newmin = (long)SCARG(uap, arg); if ((u_int)newmin >= l->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur || (u_int)newmin >= maxfiles) { fd_putfile(fd); return EINVAL; } error = fd_dup(fp, newmin, &i, cloexec); *retval = i; break; case F_GETFD: dt = atomic_load_consume(&fdp->fd_dt); *retval = dt->dt_ff[fd]->ff_exclose; break; case F_SETFD: fd_set_exclose(l, fd, ((long)SCARG(uap, arg) & FD_CLOEXEC) != 0); break; case F_GETNOSIGPIPE: *retval = (fp->f_flag & FNOSIGPIPE) != 0; break; case F_SETNOSIGPIPE: if (SCARG(uap, arg)) atomic_or_uint(&fp->f_flag, FNOSIGPIPE); else atomic_and_uint(&fp->f_flag, ~FNOSIGPIPE); *retval = 0; break; case F_GETFL: *retval = OFLAGS(fp->f_flag); break; case F_SETFL: /* XXX not guaranteed to be atomic. 
*/ tmp = FFLAGS((long)SCARG(uap, arg)) & FCNTLFLAGS; error = (*fp->f_ops->fo_fcntl)(fp, F_SETFL, &tmp); if (error) break; i = tmp ^ fp->f_flag; if (i & FNONBLOCK) { int flgs = tmp & FNONBLOCK; error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, &flgs); if (error) { (*fp->f_ops->fo_fcntl)(fp, F_SETFL, &fp->f_flag); break; } } if (i & FASYNC) { int flgs = tmp & FASYNC; error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, &flgs); if (error) { if (i & FNONBLOCK) { tmp = fp->f_flag & FNONBLOCK; (void)(*fp->f_ops->fo_ioctl)(fp, FIONBIO, &tmp); } (*fp->f_ops->fo_fcntl)(fp, F_SETFL, &fp->f_flag); break; } } fp->f_flag = (fp->f_flag & ~FCNTLFLAGS) | tmp; break; case F_GETOWN: error = (*fp->f_ops->fo_ioctl)(fp, FIOGETOWN, &tmp); *retval = tmp; break; case F_SETOWN: tmp = (int)(uintptr_t) SCARG(uap, arg); error = (*fp->f_ops->fo_ioctl)(fp, FIOSETOWN, &tmp); break; case F_GETPATH: kpath = PNBUF_GET(); /* vnodes need extra context, so are handled separately */ if (fp->f_type == DTYPE_VNODE) error = vnode_to_path(kpath, MAXPATHLEN, fp->f_vnode, l, l->l_proc); else error = (*fp->f_ops->fo_fcntl)(fp, F_GETPATH, kpath); if (error == 0) error = copyoutstr(kpath, SCARG(uap, arg), MAXPATHLEN, NULL); PNBUF_PUT(kpath); break; case F_ADD_SEALS: tmp = (int)(uintptr_t) SCARG(uap, arg); error = (*fp->f_ops->fo_fcntl)(fp, F_ADD_SEALS, &tmp); break; case F_GET_SEALS: error = (*fp->f_ops->fo_fcntl)(fp, F_GET_SEALS, &tmp); *retval = tmp; break; default: error = EINVAL; } fd_putfile(fd); return (error); } /* * Close a file descriptor. */ int sys_close(struct lwp *l, const struct sys_close_args *uap, register_t *retval) { /* { syscallarg(int) fd; } */ int error; int fd = SCARG(uap, fd); if (fd_getfile(fd) == NULL) { return EBADF; } error = fd_close(fd); if (error == ERESTART) { #ifdef DIAGNOSTIC printf("%s[%d]: close(%d) returned ERESTART\n", l->l_proc->p_comm, (int)l->l_proc->p_pid, fd); #endif error = EINTR; } return error; } /* * Return status information about a file descriptor. * Common function for compat code. */ int do_sys_fstat(int fd, struct stat *sb) { file_t *fp; int error; if ((fp = fd_getfile(fd)) == NULL) { return EBADF; } error = (*fp->f_ops->fo_stat)(fp, sb); fd_putfile(fd); return error; } /* * Return status information about a file descriptor. */ int sys___fstat50(struct lwp *l, const struct sys___fstat50_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(struct stat *) sb; } */ struct stat sb; int error; error = do_sys_fstat(SCARG(uap, fd), &sb); if (error == 0) { error = copyout(&sb, SCARG(uap, sb), sizeof(sb)); } return error; } /* * Return pathconf information about a file descriptor. */ int sys_fpathconf(struct lwp *l, const struct sys_fpathconf_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) name; } */ int fd, name, error; file_t *fp; fd = SCARG(uap, fd); name = SCARG(uap, name); error = 0; if ((fp = fd_getfile(fd)) == NULL) return EBADF; if (fp->f_ops->fo_fpathconf == NULL) error = EOPNOTSUPP; else error = (*fp->f_ops->fo_fpathconf)(fp, name, retval); fd_putfile(fd); return error; } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). 
*/ /* ARGSUSED */ int sys_flock(struct lwp *l, const struct sys_flock_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) how; } */ int fd, how, error; struct file *fp = NULL; int (*fo_advlock)(struct file *, void *, int, struct flock *, int); struct flock lf; fd = SCARG(uap, fd); how = SCARG(uap, how); if ((fp = fd_getfile(fd)) == NULL) { error = EBADF; goto out; } if ((fo_advlock = fp->f_ops->fo_advlock) == NULL) { KASSERT((atomic_load_relaxed(&fp->f_flag) & FHASLOCK) == 0); error = EOPNOTSUPP; goto out; } lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; switch (how & ~LOCK_NB) { case LOCK_UN: lf.l_type = F_UNLCK; atomic_and_uint(&fp->f_flag, ~FHASLOCK); error = (*fo_advlock)(fp, fp, F_UNLCK, &lf, F_FLOCK); goto out; case LOCK_EX: lf.l_type = F_WRLCK; break; case LOCK_SH: lf.l_type = F_RDLCK; break; default: error = EINVAL; goto out; } atomic_or_uint(&fp->f_flag, FHASLOCK); if (how & LOCK_NB) { error = (*fo_advlock)(fp, fp, F_SETLK, &lf, F_FLOCK); } else { error = (*fo_advlock)(fp, fp, F_SETLK, &lf, F_FLOCK|F_WAIT); } out: if (fp) fd_putfile(fd); return error; } int do_posix_fadvise(int fd, off_t offset, off_t len, int advice) { file_t *fp; int error; if ((fp = fd_getfile(fd)) == NULL) return EBADF; if (fp->f_ops->fo_posix_fadvise == NULL) { error = EOPNOTSUPP; } else { error = (*fp->f_ops->fo_posix_fadvise)(fp, offset, len, advice); } fd_putfile(fd); return error; } int sys___posix_fadvise50(struct lwp *l, const struct sys___posix_fadvise50_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) offset; syscallarg(off_t) len; syscallarg(int) advice; } */ *retval = do_posix_fadvise(SCARG(uap, fd), SCARG(uap, offset), SCARG(uap, len), SCARG(uap, advice)); return 0; } int sys_pipe(struct lwp *l, const void *v, register_t *retval) { int fd[2], error; if ((error = pipe1(l, fd, 0)) != 0) return error; retval[0] = fd[0]; retval[1] = fd[1]; return 0; } int sys_pipe2(struct lwp *l, const struct sys_pipe2_args *uap, register_t *retval) { /* { syscallarg(int[2]) fildes; syscallarg(int) flags; } */ int fd[2], error; if ((error = pipe1(l, fd, SCARG(uap, flags))) != 0) return error; if ((error = copyout(fd, SCARG(uap, fildes), sizeof(fd))) != 0) return error; retval[0] = 0; return 0; }
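/*
 * Illustrative userland sketch (an addition for this document, not part
 * of sys_descrip.c): a minimal program exercising several of the
 * descriptor operations implemented above -- the sys_dup2() path, the
 * F_GETFL/F_SETFL cases of sys_fcntl(), and the F_SETLKW record lock
 * serviced by do_fcntl_lock().  The file name and choice of flags are
 * assumptions made for the example.
 */
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct flock fl;
	int fd, flags;

	fd = open("/tmp/example.log", O_RDWR | O_CREAT, 0644);
	if (fd == -1)
		err(1, "open");

	/* dup2(): make stdout refer to the same open file. */
	if (dup2(fd, STDOUT_FILENO) == -1)
		err(1, "dup2");

	/* F_GETFL/F_SETFL: read-modify-write of the descriptor flags. */
	if ((flags = fcntl(fd, F_GETFL, 0)) == -1)
		err(1, "F_GETFL");
	if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1)
		err(1, "F_SETFL");

	/* F_SETLKW: wait for a POSIX write lock on the whole file. */
	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;		/* zero length means "to end of file" */
	if (fcntl(fd, F_SETLKW, &fl) == -1)
		err(1, "F_SETLKW");

	printf("this line is written to /tmp/example.log\n");
	close(fd);
	return 0;
}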
/* $NetBSD: uipc_syscalls_43.c,v 1.51 2019/01/27 02:08:39 pgoyette Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uipc_syscalls_43.c,v 1.51 2019/01/27 02:08:39 pgoyette Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/syslog.h> #include <sys/unistd.h> #include <sys/resourcevar.h> #include <sys/mbuf.h> /* for MLEN */ #include <sys/protosw.h> #include <sys/mount.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <net/if.h> #include <net/bpf.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <net/if_gre.h> #include <net/if_tap.h> #include <net80211/ieee80211_ioctl.h> #include <netinet6/in6_var.h> #include <netinet6/nd6.h> #include <compat/sys/socket.h> #include <compat/sys/sockio.h> #include <compat/common/compat_util.h> #include <compat/common/compat_mod.h> #include <uvm/uvm_extern.h> /* * Following 4.3 syscalls were not versioned, even through they should * have been: * connect(2), bind(2), sendto(2) */ static struct syscall_package uipc_syscalls_43_syscalls[] = { { SYS_compat_43_oaccept, 0, (sy_call_t *)compat_43_sys_accept }, { SYS_compat_43_ogetpeername, 0, (sy_call_t *)compat_43_sys_getpeername }, { SYS_compat_43_ogetsockname, 0, (sy_call_t *)compat_43_sys_getsockname }, { SYS_compat_43_orecv, 0, (sy_call_t *)compat_43_sys_recv }, { SYS_compat_43_orecvfrom, 0, (sy_call_t *)compat_43_sys_recvfrom }, { SYS_compat_43_orecvmsg, 0, (sy_call_t *)compat_43_sys_recvmsg }, { SYS_compat_43_osend, 0, (sy_call_t *)compat_43_sys_send }, { SYS_compat_43_osendmsg, 0, (sy_call_t *)compat_43_sys_sendmsg }, { 0, 0, NULL } }; static int compat_43_sa_put(void *); int compat_43_sys_accept(struct lwp *l, const struct compat_43_sys_accept_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(void *) name; syscallarg(int *) anamelen; } */ int error; if ((error = sys_accept(l, (const struct sys_accept_args *)uap, retval)) != 0) return error; if (SCARG(uap, name) && (error = compat_43_sa_put(SCARG(uap, name)))) return (error); return 0; } int compat_43_sys_getpeername(struct lwp *l, const struct compat_43_sys_getpeername_args *uap, register_t *retval) { /* { syscallarg(int) fdes; syscallarg(void *) asa; syscallarg(int *) alen; } */ int error; if ((error = sys_getpeername(l, (const struct sys_getpeername_args *)uap, retval)) != 0) return error; if ((error = compat_43_sa_put(SCARG(uap, asa)))) return (error); return 0; } int compat_43_sys_getsockname(struct lwp *l, const struct compat_43_sys_getsockname_args *uap, register_t *retval) { /* { syscallarg(int) fdes; syscallarg(void *) asa; syscallarg(int *) alen; } */ int error; if ((error = sys_getsockname(l, (const struct sys_getsockname_args *)uap, retval)) != 0) return error; if ((error = compat_43_sa_put(SCARG(uap, asa)))) return (error); return 0; } int compat_43_sys_recv(struct lwp *l, const struct compat_43_sys_recv_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(void *) buf; syscallarg(int) len; syscallarg(int) flags; } */ struct sys_recvfrom_args bra; SCARG(&bra, s) = SCARG(uap, s); SCARG(&bra, buf) = SCARG(uap, buf); SCARG(&bra, len) = (size_t) SCARG(uap, len); SCARG(&bra, flags) = SCARG(uap, flags); SCARG(&bra, from) = NULL; SCARG(&bra, 
fromlenaddr) = NULL; return (sys_recvfrom(l, &bra, retval)); } int compat_43_sys_recvfrom(struct lwp *l, const struct compat_43_sys_recvfrom_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(void *) buf; syscallarg(size_t) len; syscallarg(int) flags; syscallarg(void *) from; syscallarg(int *) fromlenaddr; } */ int error; if ((error = sys_recvfrom(l, (const struct sys_recvfrom_args *)uap, retval))) return (error); if (SCARG(uap, from) && (error = compat_43_sa_put(SCARG(uap, from)))) return (error); return (0); } /* * Old recvmsg. Arrange necessary structures, calls generic code and * adjusts results accordingly. */ int compat_43_sys_recvmsg(struct lwp *l, const struct compat_43_sys_recvmsg_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(struct omsghdr *) msg; syscallarg(int) flags; } */ struct omsghdr omsg; struct msghdr msg; struct mbuf *from, *control; int error; error = copyin(SCARG(uap, msg), &omsg, sizeof (struct omsghdr)); if (error) return (error); if (omsg.msg_accrights == NULL) omsg.msg_accrightslen = 0; /* it was this way in 4.4BSD */ if (omsg.msg_accrightslen > MLEN) return EINVAL; msg.msg_name = omsg.msg_name; msg.msg_namelen = omsg.msg_namelen; msg.msg_iovlen = omsg.msg_iovlen; msg.msg_iov = omsg.msg_iov; msg.msg_flags = (SCARG(uap, flags) & MSG_USERFLAGS) | MSG_IOVUSRSPACE; error = do_sys_recvmsg(l, SCARG(uap, s), &msg, &from, omsg.msg_accrights != NULL ? &control : NULL, retval); if (error != 0) return error; /* * If there is any control information and it's SCM_RIGHTS, * pass it back to the program. * XXX: maybe there can be more than one chunk of control data? */ if (omsg.msg_accrights && control != NULL) { struct cmsghdr *cmsg = mtod(control, struct cmsghdr *); if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS && cmsg->cmsg_len < omsg.msg_accrightslen && copyout(CMSG_DATA(cmsg), omsg.msg_accrights, cmsg->cmsg_len) == 0) { omsg.msg_accrightslen = cmsg->cmsg_len; free_control_mbuf(l, control, control->m_next); } else { omsg.msg_accrightslen = 0; free_control_mbuf(l, control, control); } } else omsg.msg_accrightslen = 0; if (from != NULL) /* convert from sockaddr sa_family to osockaddr one here */ mtod(from, struct osockaddr *)->sa_family = mtod(from, struct sockaddr *)->sa_family; error = copyout_sockname((struct sockaddr *)omsg.msg_name, &omsg.msg_namelen, 0, from); if (from != NULL) m_free(from); if (error != 0) error = copyout(&omsg, SCARG(uap, msg), sizeof(omsg)); return error; } int compat_43_sys_send(struct lwp *l, const struct compat_43_sys_send_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(void *) buf; syscallarg(int) len; syscallarg(int) flags; } */ struct sys_sendto_args bsa; SCARG(&bsa, s) = SCARG(uap, s); SCARG(&bsa, buf) = SCARG(uap, buf); SCARG(&bsa, len) = SCARG(uap, len); SCARG(&bsa, flags) = SCARG(uap, flags); SCARG(&bsa, to) = NULL; SCARG(&bsa, tolen) = 0; return (sys_sendto(l, &bsa, retval)); } int compat43_set_accrights(struct msghdr *msg, void *accrights, int accrightslen) { struct cmsghdr *cmsg; int error; struct mbuf *ctl; u_int clen; if (accrights == NULL || accrightslen == 0) { msg->msg_control = NULL; msg->msg_controllen = 0; return 0; } clen = CMSG_SPACE(accrightslen); /* it was (almost) this way in 4.4BSD */ if (accrightslen < 0 || clen > MLEN) return EINVAL; ctl = m_get(M_WAIT, MT_CONTROL); ctl->m_len = clen; cmsg = mtod(ctl, struct cmsghdr *); cmsg->cmsg_len = CMSG_SPACE(accrightslen); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; error = 
copyin(accrights, CMSG_DATA(cmsg), accrightslen); if (error) { m_free(ctl); return error; } msg->msg_control = ctl; msg->msg_controllen = clen; msg->msg_flags |= MSG_CONTROLMBUF; return 0; } /* * Old sendmsg. Arrange necessary structures, call generic code and * adjust the results accordingly for old code. */ int compat_43_sys_sendmsg(struct lwp *l, const struct compat_43_sys_sendmsg_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(void *) msg; syscallarg(int) flags; } */ struct omsghdr omsg; struct msghdr msg; int error; struct mbuf *nam; struct osockaddr *osa; struct sockaddr *sa; error = copyin(SCARG(uap, msg), &omsg, sizeof (struct omsghdr)); if (error != 0) return (error); msg.msg_iovlen = omsg.msg_iovlen; msg.msg_iov = omsg.msg_iov; error = sockargs(&nam, omsg.msg_name, omsg.msg_namelen, UIO_USERSPACE, MT_SONAME); if (error != 0) return (error); sa = mtod(nam, struct sockaddr *); osa = mtod(nam, struct osockaddr *); sa->sa_family = osa->sa_family; sa->sa_len = omsg.msg_namelen; msg.msg_flags = MSG_IOVUSRSPACE | MSG_NAMEMBUF; msg.msg_name = nam; msg.msg_namelen = omsg.msg_namelen; error = compat43_set_accrights(&msg, omsg.msg_accrights, omsg.msg_accrightslen); if (error != 0) goto bad; return do_sys_sendmsg(l, SCARG(uap, s), &msg, SCARG(uap, flags), retval); bad: if (nam != NULL) m_free(nam); return (error); } static int compat_43_sa_put(void *from) { struct osockaddr *osa = (struct osockaddr *) from; struct sockaddr sa; struct osockaddr *kosa; int error, len; /* * Only read/write the sockaddr family and length, the rest is * not changed. */ len = sizeof(sa.sa_len) + sizeof(sa.sa_family); error = copyin((void *) osa, (void *) &sa, len); if (error) return (error); /* Note: we convert from sockaddr sa_family to osockaddr one here */ kosa = (struct osockaddr *) &sa; kosa->sa_family = sa.sa_family; error = copyout(kosa, osa, len); if (error) return (error); return (0); } int uipc_syscalls_43_init(void) { return syscall_establish(NULL, uipc_syscalls_43_syscalls); } int uipc_syscalls_43_fini(void) { return syscall_disestablish(NULL, uipc_syscalls_43_syscalls); }
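/*
 * For reference, a sketch of the 4.3BSD-era structures this compat code
 * converts between.  The field names are the ones dereferenced above
 * (msg_name, msg_namelen, msg_iov, msg_iovlen, msg_accrights,
 * msg_accrightslen, and the 16-bit sa_family with no sa_len); the exact
 * types are quoted from memory and should be checked against the
 * compat/sys headers included above before relying on them.
 */
#include <sys/uio.h>			/* struct iovec */

struct example_osockaddr {		/* old sockaddr: no sa_len field */
	unsigned short	sa_family;	/* address family, 16 bits */
	char		sa_data[14];	/* protocol-specific address */
};

struct example_omsghdr {		/* old msghdr: access rights, no msg_flags */
	void		*msg_name;	/* optional peer address */
	int		 msg_namelen;	/* size of msg_name */
	struct iovec	*msg_iov;	/* scatter/gather array */
	int		 msg_iovlen;	/* number of iovec entries */
	void		*msg_accrights;	/* passed descriptors (SCM_RIGHTS) */
	int		 msg_accrightslen;
};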
/* $NetBSD: vfs_lookup.c,v 1.234 2023/05/01 05:12:44 mlelstv Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_lookup.c 8.10 (Berkeley) 5/27/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.234 2023/05/01 05:12:44 mlelstv Exp $"); #ifdef _KERNEL_OPT #include "opt_magiclinks.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/syslimits.h> #include <sys/time.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/vnode_impl.h> #include <sys/fstrans.h> #include <sys/mount.h> #include <sys/errno.h> #include <sys/filedesc.h> #include <sys/hash.h> #include <sys/proc.h> #include <sys/syslog.h> #include <sys/kauth.h> #include <sys/ktrace.h> #include <sys/dirent.h> #ifndef MAGICLINKS #define MAGICLINKS 0 #endif int vfs_magiclinks = MAGICLINKS; __CTASSERT(MAXNAMLEN == NAME_MAX); /* * Substitute replacement text for 'magic' strings in symlinks. * Returns 0 if successful, and returns non-zero if an error * occurs. 
(Currently, the only possible error is running out * of temporary pathname space.) * * Looks for "@<string>" and "@<string>/", where <string> is a * recognized 'magic' string. Replaces the "@<string>" with the * appropriate replacement text. (Note that in some cases the * replacement text may have zero length.) * * This would have been table driven, but the variance in * replacement strings (and replacement string lengths) made * that impractical. */ #define VNL(x) \ (sizeof(x) - 1) #define VO '{' #define VC '}' #define MATCH(str) \ ((termchar == '/' && i + VNL(str) == *len) || \ (i + VNL(str) < *len && \ cp[i + VNL(str)] == termchar)) && \ !strncmp((str), &cp[i], VNL(str)) #define SUBSTITUTE(m, s, sl) \ if ((newlen + (sl)) >= MAXPATHLEN) \ return 1; \ i += VNL(m); \ if (termchar != '/') \ i++; \ (void)memcpy(&tmp[newlen], (s), (sl)); \ newlen += (sl); \ change = 1; \ termchar = '/'; static int symlink_magic(struct proc *p, char *cp, size_t *len) { char *tmp; size_t change, i, newlen, slen; char termchar = '/'; char idtmp[11]; /* enough for 32 bit *unsigned* integer */ tmp = PNBUF_GET(); for (change = i = newlen = 0; i < *len; ) { if (cp[i] != '@') { tmp[newlen++] = cp[i++]; continue; } i++; /* Check for @{var} syntax. */ if (cp[i] == VO) { termchar = VC; i++; } /* * The following checks should be ordered according * to frequency of use. */ if (MATCH("machine_arch")) { slen = strlen(PROC_MACHINE_ARCH(p)); SUBSTITUTE("machine_arch", PROC_MACHINE_ARCH(p), slen); } else if (MATCH("machine")) { slen = VNL(MACHINE); SUBSTITUTE("machine", MACHINE, slen); } else if (MATCH("hostname")) { SUBSTITUTE("hostname", hostname, hostnamelen); } else if (MATCH("osrelease")) { slen = strlen(osrelease); SUBSTITUTE("osrelease", osrelease, slen); } else if (MATCH("emul")) { slen = strlen(p->p_emul->e_name); SUBSTITUTE("emul", p->p_emul->e_name, slen); } else if (MATCH("kernel_ident")) { slen = strlen(kernel_ident); SUBSTITUTE("kernel_ident", kernel_ident, slen); } else if (MATCH("domainname")) { SUBSTITUTE("domainname", domainname, domainnamelen); } else if (MATCH("ostype")) { slen = strlen(ostype); SUBSTITUTE("ostype", ostype, slen); } else if (MATCH("uid")) { slen = snprintf(idtmp, sizeof(idtmp), "%u", kauth_cred_geteuid(kauth_cred_get())); SUBSTITUTE("uid", idtmp, slen); } else if (MATCH("ruid")) { slen = snprintf(idtmp, sizeof(idtmp), "%u", kauth_cred_getuid(kauth_cred_get())); SUBSTITUTE("ruid", idtmp, slen); } else if (MATCH("gid")) { slen = snprintf(idtmp, sizeof(idtmp), "%u", kauth_cred_getegid(kauth_cred_get())); SUBSTITUTE("gid", idtmp, slen); } else if (MATCH("rgid")) { slen = snprintf(idtmp, sizeof(idtmp), "%u", kauth_cred_getgid(kauth_cred_get())); SUBSTITUTE("rgid", idtmp, slen); } else { tmp[newlen++] = '@'; if (termchar == VC) tmp[newlen++] = VO; } } if (change) { (void)memcpy(cp, tmp, newlen); *len = newlen; } PNBUF_PUT(tmp); return 0; } #undef VNL #undef VO #undef VC #undef MATCH #undef SUBSTITUTE //////////////////////////////////////////////////////////// /* * Determine the namei hash (for the namecache) for name. * If *ep != NULL, hash from name to ep-1. * If *ep == NULL, hash from name until the first NUL or '/', and * return the location of this termination character in *ep. * * This function returns an equivalent hash to the MI hash32_strn(). * The latter isn't used because in the *ep == NULL case, determining * the length of the string to the first NUL or `/' and then calling * hash32_strn() involves unnecessary double-handling of the data. 
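 *
 * For example, with *ep == NULL and name = "usr/bin", the loop folds in
 * 'u', 's' and 'r' (hash = hash * 33 + byte, starting from
 * HASH32_STR_INIT), stops at the '/', stores the address of that '/' in
 * *ep, and returns hash + (hash >> 5).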
*/ uint32_t namei_hash(const char *name, const char **ep) { uint32_t hash; hash = HASH32_STR_INIT; if (*ep != NULL) { for (; name < *ep; name++) hash = hash * 33 + *(const uint8_t *)name; } else { for (; *name != '\0' && *name != '/'; name++) hash = hash * 33 + *(const uint8_t *)name; *ep = name; } return (hash + (hash >> 5)); } //////////////////////////////////////////////////////////// /* * Sealed abstraction for pathnames. * * System-call-layer level code that is going to call namei should * first create a pathbuf and adjust all the bells and whistles on it * as needed by context. */ struct pathbuf { char *pb_path; char *pb_pathcopy; unsigned pb_pathcopyuses; }; static struct pathbuf * pathbuf_create_raw(void) { struct pathbuf *pb; pb = kmem_alloc(sizeof(*pb), KM_SLEEP); pb->pb_path = PNBUF_GET(); if (pb->pb_path == NULL) { kmem_free(pb, sizeof(*pb)); return NULL; } pb->pb_pathcopy = NULL; pb->pb_pathcopyuses = 0; return pb; } void pathbuf_destroy(struct pathbuf *pb) { KASSERT(pb->pb_pathcopyuses == 0); KASSERT(pb->pb_pathcopy == NULL); PNBUF_PUT(pb->pb_path); kmem_free(pb, sizeof(*pb)); } struct pathbuf * pathbuf_assimilate(char *pnbuf) { struct pathbuf *pb; pb = kmem_alloc(sizeof(*pb), KM_SLEEP); pb->pb_path = pnbuf; pb->pb_pathcopy = NULL; pb->pb_pathcopyuses = 0; return pb; } struct pathbuf * pathbuf_create(const char *path) { struct pathbuf *pb; int error; pb = pathbuf_create_raw(); if (pb == NULL) { return NULL; } error = copystr(path, pb->pb_path, PATH_MAX, NULL); if (error != 0) { KASSERT(!"kernel path too long in pathbuf_create"); /* make sure it's null-terminated, just in case */ pb->pb_path[PATH_MAX-1] = '\0'; } return pb; } int pathbuf_copyin(const char *userpath, struct pathbuf **ret) { struct pathbuf *pb; int error; pb = pathbuf_create_raw(); if (pb == NULL) { return ENOMEM; } error = copyinstr(userpath, pb->pb_path, PATH_MAX, NULL); if (error) { pathbuf_destroy(pb); return error; } *ret = pb; return 0; } /* * XXX should not exist: * 1. whether a pointer is kernel or user should be statically checkable. * 2. copyin should be handled by the upper part of the syscall layer, * not in here. */ int pathbuf_maybe_copyin(const char *path, enum uio_seg seg, struct pathbuf **ret) { if (seg == UIO_USERSPACE) { return pathbuf_copyin(path, ret); } else { *ret = pathbuf_create(path); if (*ret == NULL) { return ENOMEM; } return 0; } } /* * Get a copy of the path buffer as it currently exists. If this is * called after namei starts the results may be arbitrary. */ void pathbuf_copystring(const struct pathbuf *pb, char *buf, size_t maxlen) { strlcpy(buf, pb->pb_path, maxlen); } /* * These two functions allow access to a saved copy of the original * path string. The first copy should be gotten before namei is * called. Each copy that is gotten should be put back. */ const char * pathbuf_stringcopy_get(struct pathbuf *pb) { if (pb->pb_pathcopyuses == 0) { pb->pb_pathcopy = PNBUF_GET(); strcpy(pb->pb_pathcopy, pb->pb_path); } pb->pb_pathcopyuses++; return pb->pb_pathcopy; } void pathbuf_stringcopy_put(struct pathbuf *pb, const char *str) { KASSERT(str == pb->pb_pathcopy); KASSERT(pb->pb_pathcopyuses > 0); pb->pb_pathcopyuses--; if (pb->pb_pathcopyuses == 0) { PNBUF_PUT(pb->pb_pathcopy); pb->pb_pathcopy = NULL; } } //////////////////////////////////////////////////////////// /* * namei: convert a pathname into a pointer to a (maybe-locked) vnode, * and maybe also its parent directory vnode, and assorted other guff. * See namei(9) for the interface documentation. 
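 *
 * A typical system-call-layer caller looks roughly like the sketch
 * below (an illustration added for this document, not quoted from any
 * particular call site; the lookup operation and flags vary by caller):
 *
 *	struct pathbuf *pb;
 *	struct nameidata nd;
 *	struct vnode *vp;
 *	int error;
 *
 *	error = pathbuf_copyin(SCARG(uap, path), &pb);
 *	if (error)
 *		return error;
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pb);
 *	if ((error = namei(&nd)) != 0) {
 *		pathbuf_destroy(pb);
 *		return error;
 *	}
 *	vp = nd.ni_vp;
 *	... use vp ...
 *	vput(vp);		(returned locked because of LOCKLEAF)
 *	pathbuf_destroy(pb);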
* * * The FOLLOW flag is set when symbolic links are to be followed * when they occur at the end of the name translation process. * Symbolic links are always followed for all other pathname * components other than the last. * * The segflg defines whether the name is to be copied from user * space or kernel space. * * Overall outline of namei: * * copy in name * get starting directory * while (!done && !error) { * call lookup to search path. * if symbolic link, massage name in buffer and continue * } */ /* * Search a pathname. * This is a very central and rather complicated routine. * * The pathname is pointed to by ni_ptr and is of length ni_pathlen. * The starting directory is passed in. The pathname is descended * until done, or a symbolic link is encountered. The variable ni_more * is clear if the path is completed; it is set to one if a symbolic * link needing interpretation is encountered. * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it, the parent directory is returned * locked. Otherwise the parent directory is not returned. If the target * of the pathname exists and LOCKLEAF is or'ed into the flag the target * is returned locked, otherwise it is returned unlocked. When creating * or renaming and LOCKPARENT is specified, the target may not be ".". * When deleting and LOCKPARENT is specified, the target may be ".". * * Overall outline of lookup: * * dirloop: * identify next component of name at ndp->ni_ptr * handle degenerate case where name is null string * if .. and crossing mount points and on mounted filesys, find parent * call VOP_LOOKUP routine for next component name * directory vnode returned in ni_dvp, locked. * component vnode returned in ni_vp (if it exists), locked. * if result vnode is mounted on and crossing mount points, * find mounted on vnode * if more components of name, do next level at dirloop * return the answer in ni_vp, locked if LOCKLEAF set * if LOCKPARENT set, return locked parent in ni_dvp */ /* * Internal state for a namei operation. * * cnp is always equal to &ndp->ni_cnp. */ struct namei_state { struct nameidata *ndp; struct componentname *cnp; int docache; /* == 0 do not cache last component */ int rdonly; /* lookup read-only flag bit */ int slashes; unsigned attempt_retry:1; /* true if error allows emul retry */ unsigned root_referenced:1; /* true if ndp->ni_rootdir and ndp->ni_erootdir were referenced */ }; /* * Initialize the namei working state. */ static void namei_init(struct namei_state *state, struct nameidata *ndp) { state->ndp = ndp; state->cnp = &ndp->ni_cnd; state->docache = 0; state->rdonly = 0; state->slashes = 0; state->root_referenced = 0; KASSERTMSG((state->cnp->cn_cred != NULL), "namei: bad cred/proc"); KASSERTMSG(((state->cnp->cn_nameiop & (~OPMASK)) == 0), "namei: nameiop contaminated with flags: %08"PRIx32, state->cnp->cn_nameiop); KASSERTMSG(((state->cnp->cn_flags & OPMASK) == 0), "name: flags contaminated with nameiops: %08"PRIx32, state->cnp->cn_flags); /* * The buffer for name translation shall be the one inside the * pathbuf. */ state->ndp->ni_pnbuf = state->ndp->ni_pathbuf->pb_path; } /* * Clean up the working namei state, leaving things ready for return * from namei. 
*/ static void namei_cleanup(struct namei_state *state) { KASSERT(state->cnp == &state->ndp->ni_cnd); if (state->root_referenced) { if (state->ndp->ni_rootdir != NULL) vrele(state->ndp->ni_rootdir); if (state->ndp->ni_erootdir != NULL) vrele(state->ndp->ni_erootdir); } } ////////////////////////////// /* * Get the directory context. * Initializes the rootdir and erootdir state and returns a reference * to the starting dir. */ static struct vnode * namei_getstartdir(struct namei_state *state) { struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; struct cwdinfo *cwdi; /* pointer to cwd state */ struct lwp *self = curlwp; /* thread doing namei() */ struct vnode *rootdir, *erootdir, *curdir, *startdir; if (state->root_referenced) { if (state->ndp->ni_rootdir != NULL) vrele(state->ndp->ni_rootdir); if (state->ndp->ni_erootdir != NULL) vrele(state->ndp->ni_erootdir); state->root_referenced = 0; } cwdi = self->l_proc->p_cwdi; rw_enter(&cwdi->cwdi_lock, RW_READER); /* root dir */ if (cwdi->cwdi_rdir == NULL || (cnp->cn_flags & NOCHROOT)) { rootdir = rootvnode; } else { rootdir = cwdi->cwdi_rdir; } /* emulation root dir, if any */ if ((cnp->cn_flags & TRYEMULROOT) == 0) { /* if we don't want it, don't fetch it */ erootdir = NULL; } else if (cnp->cn_flags & EMULROOTSET) { /* explicitly set emulroot; "/../" doesn't override this */ erootdir = ndp->ni_erootdir; } else if (!strncmp(ndp->ni_pnbuf, "/../", 4)) { /* explicit reference to real rootdir */ erootdir = NULL; } else { /* may be null */ erootdir = cwdi->cwdi_edir; } /* current dir */ curdir = cwdi->cwdi_cdir; if (ndp->ni_pnbuf[0] != '/') { if (ndp->ni_atdir != NULL) { startdir = ndp->ni_atdir; } else { startdir = curdir; } erootdir = NULL; } else if (cnp->cn_flags & TRYEMULROOT && erootdir != NULL) { startdir = erootdir; } else { startdir = rootdir; erootdir = NULL; } state->ndp->ni_rootdir = rootdir; state->ndp->ni_erootdir = erootdir; /* * Get a reference to the start dir so we can safely unlock cwdi. * * Must hold references to rootdir and erootdir while we're running. * A multithreaded process may chroot during namei. */ if (startdir != NULL) vref(startdir); if (state->ndp->ni_rootdir != NULL) vref(state->ndp->ni_rootdir); if (state->ndp->ni_erootdir != NULL) vref(state->ndp->ni_erootdir); state->root_referenced = 1; rw_exit(&cwdi->cwdi_lock); return startdir; } /* * Get the directory context for the nfsd case, in parallel to * getstartdir. Initializes the rootdir and erootdir state and * returns a reference to the passed-in starting dir. */ static struct vnode * namei_getstartdir_for_nfsd(struct namei_state *state) { KASSERT(state->ndp->ni_atdir != NULL); /* always use the real root, and never set an emulation root */ if (rootvnode == NULL) { return NULL; } state->ndp->ni_rootdir = rootvnode; state->ndp->ni_erootdir = NULL; vref(state->ndp->ni_atdir); KASSERT(! state->root_referenced); vref(state->ndp->ni_rootdir); state->root_referenced = 1; return state->ndp->ni_atdir; } /* * Ktrace the namei operation. */ static void namei_ktrace(struct namei_state *state) { struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; struct lwp *self = curlwp; /* thread doing namei() */ const char *emul_path; if (ktrpoint(KTR_NAMEI)) { if (ndp->ni_erootdir != NULL) { /* * To make any sense, the trace entry need to have the * text of the emulation path prepended. 
* Usually we can get this from the current process, * but when called from emul_find_interp() it is only * in the exec_package - so we get it passed in ni_next * (this is a hack). */ if (cnp->cn_flags & EMULROOTSET) emul_path = ndp->ni_next; else emul_path = self->l_proc->p_emul->e_path; ktrnamei2(emul_path, strlen(emul_path), ndp->ni_pnbuf, ndp->ni_pathlen); } else ktrnamei(ndp->ni_pnbuf, ndp->ni_pathlen); } } /* * Start up namei. Find the root dir and cwd, establish the starting * directory for lookup, and lock it. Also calls ktrace when * appropriate. */ static int namei_start(struct namei_state *state, int isnfsd, struct vnode **startdir_ret) { struct nameidata *ndp = state->ndp; struct vnode *startdir; /* length includes null terminator (was originally from copyinstr) */ ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1; /* * POSIX.1 requirement: "" is not a valid file name. */ if (ndp->ni_pathlen == 1) { ndp->ni_erootdir = NULL; return ENOENT; } ndp->ni_loopcnt = 0; /* Get starting directory, set up root, and ktrace. */ if (isnfsd) { startdir = namei_getstartdir_for_nfsd(state); /* no ktrace */ } else { startdir = namei_getstartdir(state); namei_ktrace(state); } if (startdir == NULL) { return ENOENT; } /* NDAT may feed us with a non directory namei_getstartdir */ if (startdir->v_type != VDIR) { vrele(startdir); return ENOTDIR; } *startdir_ret = startdir; return 0; } /* * Check for being at a symlink that we're going to follow. */ static inline int namei_atsymlink(struct namei_state *state, struct vnode *foundobj) { return (foundobj->v_type == VLNK) && (state->cnp->cn_flags & (FOLLOW|REQUIREDIR)); } /* * Follow a symlink. * * Updates searchdir. inhibitmagic causes magic symlinks to not be * interpreted; this is used by nfsd. * * Unlocks foundobj on success (ugh) */ static inline int namei_follow(struct namei_state *state, int inhibitmagic, struct vnode *searchdir, struct vnode *foundobj, struct vnode **newsearchdir_ret) { struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; struct lwp *self = curlwp; /* thread doing namei() */ struct iovec aiov; /* uio for reading symbolic links */ struct uio auio; char *cp; /* pointer into pathname argument */ size_t linklen; int error; if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { return ELOOP; } vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) { error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred); if (error != 0) { VOP_UNLOCK(foundobj); return error; } } /* FUTURE: fix this to not use a second buffer */ cp = PNBUF_GET(); aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_resid = MAXPATHLEN; UIO_SETUP_SYSSPACE(&auio); error = VOP_READLINK(foundobj, &auio, cnp->cn_cred); VOP_UNLOCK(foundobj); if (error) { PNBUF_PUT(cp); return error; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { PNBUF_PUT(cp); return ENOENT; } /* * Do symlink substitution, if appropriate, and * check length for potential overflow. * * Inhibit symlink substitution for nfsd. * XXX: This is how it was before; is that a bug or a feature? 
*/ if ((!inhibitmagic && vfs_magiclinks && symlink_magic(self->l_proc, cp, &linklen)) || (linklen + ndp->ni_pathlen >= MAXPATHLEN)) { PNBUF_PUT(cp); return ENAMETOOLONG; } if (ndp->ni_pathlen > 1) { /* includes a null-terminator */ memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen); } else { cp[linklen] = '\0'; } ndp->ni_pathlen += linklen; memcpy(ndp->ni_pnbuf, cp, ndp->ni_pathlen); PNBUF_PUT(cp); /* we're now starting from the beginning of the buffer again */ cnp->cn_nameptr = ndp->ni_pnbuf; /* * Check if root directory should replace current directory. */ if (ndp->ni_pnbuf[0] == '/') { vrele(searchdir); /* Keep absolute symbolic links inside emulation root */ searchdir = ndp->ni_erootdir; if (searchdir == NULL || (ndp->ni_pnbuf[1] == '.' && ndp->ni_pnbuf[2] == '.' && ndp->ni_pnbuf[3] == '/')) { ndp->ni_erootdir = NULL; searchdir = ndp->ni_rootdir; } vref(searchdir); while (cnp->cn_nameptr[0] == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } } *newsearchdir_ret = searchdir; return 0; } ////////////////////////////// /* * Inspect the leading path component and update the state accordingly. */ static int lookup_parsepath(struct namei_state *state, struct vnode *searchdir) { const char *cp; /* pointer into pathname argument */ int error; struct componentname *cnp = state->cnp; struct nameidata *ndp = state->ndp; KASSERT(cnp == &ndp->ni_cnd); /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. * * At this point, our only vnode state is that the search dir * is held. */ error = VOP_PARSEPATH(searchdir, cnp->cn_nameptr, &cnp->cn_namelen); if (error) { return error; } cp = cnp->cn_nameptr + cnp->cn_namelen; if (cnp->cn_namelen > KERNEL_NAME_MAX) { return ENAMETOOLONG; } #ifdef NAMEI_DIAGNOSTIC { char c = *cp; *(char *)cp = '\0'; printf("{%s}: ", cnp->cn_nameptr); *(char *)cp = c; } #endif /* NAMEI_DIAGNOSTIC */ ndp->ni_pathlen -= cnp->cn_namelen; ndp->ni_next = cp; /* * If this component is followed by a slash, then move the pointer to * the next component forward, and remember that this component must be * a directory. */ if (*cp == '/') { do { cp++; } while (*cp == '/'); state->slashes = cp - ndp->ni_next; ndp->ni_pathlen -= state->slashes; ndp->ni_next = cp; cnp->cn_flags |= REQUIREDIR; } else { state->slashes = 0; cnp->cn_flags &= ~REQUIREDIR; } /* * We do special processing on the last component, whether or not it's * a directory. Cache all intervening lookups, but not the final one. */ if (*cp == '\0') { if (state->docache) cnp->cn_flags |= MAKEENTRY; else cnp->cn_flags &= ~MAKEENTRY; cnp->cn_flags |= ISLASTCN; } else { cnp->cn_flags |= MAKEENTRY; cnp->cn_flags &= ~ISLASTCN; } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') cnp->cn_flags |= ISDOTDOT; else cnp->cn_flags &= ~ISDOTDOT; return 0; } /* * Take care of crossing a mounted-on vnode. On error, foundobj_ret will be * vrele'd, but searchdir is left alone. */ static int lookup_crossmount(struct namei_state *state, struct vnode **searchdir_ret, struct vnode **foundobj_ret, bool *searchdir_locked) { struct componentname *cnp = state->cnp; struct vnode *foundobj, *vp; struct vnode *searchdir; struct mount *mp; int error, lktype; searchdir = *searchdir_ret; foundobj = *foundobj_ret; error = 0; KASSERT((cnp->cn_flags & NOCROSSMOUNT) == 0); /* First, unlock searchdir (oof). 
*/ if (*searchdir_locked) { KASSERT(searchdir != NULL); lktype = VOP_ISLOCKED(searchdir); VOP_UNLOCK(searchdir); *searchdir_locked = false; } else { lktype = LK_NONE; } /* * Do an unlocked check to see if the vnode has been mounted on; if * so find the root of the mounted file system. */ while (foundobj->v_type == VDIR && (mp = foundobj->v_mountedhere) != NULL && (cnp->cn_flags & NOCROSSMOUNT) == 0) { /* * Try the namecache first. If that doesn't work, do * it the hard way. */ if (cache_lookup_mount(foundobj, &vp)) { vrele(foundobj); foundobj = vp; } else { /* First get the vnodes mount stable. */ while ((mp = foundobj->v_mountedhere) != NULL) { fstrans_start(mp); if (fstrans_held(mp) && mp == foundobj->v_mountedhere) { break; } fstrans_done(mp); } if (mp == NULL) { break; } /* * Now get a reference on the root vnode. * XXX Future - maybe allow only VDIR here. */ error = VFS_ROOT(mp, LK_NONE, &vp); /* * If successful, enter it into the cache while * holding the mount busy (competing with unmount). */ if (error == 0) { cache_enter_mount(foundobj, vp); } /* Finally, drop references to foundobj & mountpoint. */ vrele(foundobj); fstrans_done(mp); if (error) { foundobj = NULL; break; } foundobj = vp; } /* * Avoid locking vnodes from two filesystems because * it's prone to deadlock, e.g. when using puffs. * Also, it isn't a good idea to propagate slowness of * a filesystem up to the root directory. For now, * only handle the common case, where foundobj is * VDIR. * * In this case set searchdir to null to avoid using * it again. It is not correct to set searchdir == * foundobj here as that will confuse the caller. * (See PR 40740.) */ if (searchdir == NULL) { /* already been here once; do nothing further */ } else if (foundobj->v_type == VDIR) { vrele(searchdir); *searchdir_ret = searchdir = NULL; lktype = LK_NONE; } } /* If searchdir is still around, re-lock it. */ if (error == 0 && lktype != LK_NONE) { vn_lock(searchdir, lktype | LK_RETRY); *searchdir_locked = true; } *foundobj_ret = foundobj; return error; } /* * Determine the desired locking mode for the directory of a lookup. */ static int lookup_lktype(struct vnode *searchdir, struct componentname *cnp) { /* * If the file system supports VOP_LOOKUP() with a shared lock, and * we are not making any modifications (nameiop LOOKUP) or this is * not the last component then get a shared lock. Where we can't do * fast-forwarded lookups (for example with layered file systems) * then this is the fallback for reducing lock contention. */ if ((searchdir->v_mount->mnt_iflag & IMNT_SHRLOOKUP) != 0 && (cnp->cn_nameiop == LOOKUP || (cnp->cn_flags & ISLASTCN) == 0)) { return LK_SHARED; } else { return LK_EXCLUSIVE; } } /* * Call VOP_LOOKUP for a single lookup; return a new search directory * (used when crossing mountpoints up or searching union mounts down) and * the found object, which for create operations may be NULL on success. * * Note that the new search directory may be null, which means the * searchdir was unlocked and released. This happens in the common case * when crossing a mount point downwards, in order to avoid coupling * locks between different file system volumes. Importantly, this can * happen even if the call fails. (XXX: this is gross and should be * tidied somehow.) 
*/ static int lookup_once(struct namei_state *state, struct vnode *searchdir, struct vnode **newsearchdir_ret, struct vnode **foundobj_ret, bool *newsearchdir_locked_ret) { struct vnode *tmpvn; /* scratch vnode */ struct vnode *foundobj; /* result */ struct lwp *l = curlwp; bool searchdir_locked = false; int error, lktype; struct componentname *cnp = state->cnp; struct nameidata *ndp = state->ndp; KASSERT(cnp == &ndp->ni_cnd); *newsearchdir_ret = searchdir; /* * Handle "..": two special cases. * 1. If at root directory (e.g. after chroot) * or at absolute root directory * then ignore it so can't get out. * 1a. If at the root of the emulation filesystem go to the real * root. So "/../<path>" is always absolute. * 1b. If we have somehow gotten out of a jail, warn * and also ignore it so we can't get farther out. * 2. If this vnode is the root of a mounted * filesystem, then replace it with the * vnode which was mounted on so we take the * .. in the other file system. */ if (cnp->cn_flags & ISDOTDOT) { struct proc *p = l->l_proc; for (;;) { if (searchdir == ndp->ni_rootdir || searchdir == rootvnode) { foundobj = searchdir; vref(foundobj); *foundobj_ret = foundobj; if (cnp->cn_flags & LOCKPARENT) { lktype = lookup_lktype(searchdir, cnp); vn_lock(searchdir, lktype | LK_RETRY); searchdir_locked = true; } error = 0; goto done; } if (ndp->ni_rootdir != rootvnode) { int retval; retval = vn_isunder(searchdir, ndp->ni_rootdir, l); if (!retval) { /* Oops! We got out of jail! */ log(LOG_WARNING, "chrooted pid %d uid %d (%s) " "detected outside of its chroot\n", p->p_pid, kauth_cred_geteuid(l->l_cred), p->p_comm); /* Put us at the jail root. */ vrele(searchdir); searchdir = NULL; foundobj = ndp->ni_rootdir; vref(foundobj); vref(foundobj); *newsearchdir_ret = foundobj; *foundobj_ret = foundobj; error = 0; goto done; } } if ((searchdir->v_vflag & VV_ROOT) == 0 || (cnp->cn_flags & NOCROSSMOUNT)) break; tmpvn = searchdir; searchdir = searchdir->v_mount->mnt_vnodecovered; vref(searchdir); vrele(tmpvn); *newsearchdir_ret = searchdir; } } lktype = lookup_lktype(searchdir, cnp); /* * We now have a segment name to search for, and a directory to search. * Our vnode state here is that "searchdir" is held. */ unionlookup: foundobj = NULL; if (!searchdir_locked) { vn_lock(searchdir, lktype | LK_RETRY); searchdir_locked = true; } error = VOP_LOOKUP(searchdir, &foundobj, cnp); if (error != 0) { KASSERTMSG((foundobj == NULL), "leaf `%s' should be empty but is %p", cnp->cn_nameptr, foundobj); #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); #endif /* NAMEI_DIAGNOSTIC */ /* * If ENOLCK, the file system needs us to retry the lookup * with an exclusive lock. It's likely nothing was found in * cache and/or modifications need to be made. */ if (error == ENOLCK) { KASSERT(VOP_ISLOCKED(searchdir) == LK_SHARED); KASSERT(searchdir_locked); if (vn_lock(searchdir, LK_UPGRADE | LK_NOWAIT)) { VOP_UNLOCK(searchdir); searchdir_locked = false; } lktype = LK_EXCLUSIVE; goto unionlookup; } if ((error == ENOENT) && (searchdir->v_vflag & VV_ROOT) && (searchdir->v_mount->mnt_flag & MNT_UNION)) { tmpvn = searchdir; searchdir = searchdir->v_mount->mnt_vnodecovered; vref(searchdir); vput(tmpvn); searchdir_locked = false; *newsearchdir_ret = searchdir; goto unionlookup; } if (error != EJUSTRETURN) goto done; /* * If this was not the last component, or there were trailing * slashes, and we are not going to create a directory, * then the name must exist. 
*/ if ((cnp->cn_flags & (REQUIREDIR | CREATEDIR)) == REQUIREDIR) { error = ENOENT; goto done; } /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (state->rdonly) { error = EROFS; goto done; } /* * We return success and a NULL foundobj to indicate * that the entry doesn't currently exist, leaving a * pointer to the (normally, locked) directory vnode * as searchdir. */ *foundobj_ret = NULL; error = 0; goto done; } #ifdef NAMEI_DIAGNOSTIC printf("found\n"); #endif /* NAMEI_DIAGNOSTIC */ /* Unlock, unless the caller needs the parent locked. */ if (searchdir != NULL) { KASSERT(searchdir_locked); if ((cnp->cn_flags & (ISLASTCN | LOCKPARENT)) != (ISLASTCN | LOCKPARENT)) { VOP_UNLOCK(searchdir); searchdir_locked = false; } } else { KASSERT(!searchdir_locked); } *foundobj_ret = foundobj; error = 0; done: *newsearchdir_locked_ret = searchdir_locked; return error; } /* * Parse out the first path name component that we need to to consider. * * While doing this, attempt to use the name cache to fast-forward through * as many "easy" to find components of the path as possible. * * We use the namecache's node locks to form a chain, and avoid as many * vnode references and locks as possible. In the ideal case, only the * final vnode will have its reference count adjusted and lock taken. */ static int lookup_fastforward(struct namei_state *state, struct vnode **searchdir_ret, struct vnode **foundobj_ret) { struct componentname *cnp = state->cnp; struct nameidata *ndp = state->ndp; krwlock_t *plock; struct vnode *foundobj, *searchdir; int error, error2; size_t oldpathlen; const char *oldnameptr; bool terminal; /* * Eat as many path name components as possible before giving up and * letting lookup_once() handle it. Remember the starting point in * case we can't get vnode references and need to roll back. */ plock = NULL; searchdir = *searchdir_ret; oldnameptr = cnp->cn_nameptr; oldpathlen = ndp->ni_pathlen; terminal = false; for (;;) { foundobj = NULL; /* * Get the next component name. There should be no slashes * here, and we shouldn't have looped around if we were * done. */ KASSERT(cnp->cn_nameptr[0] != '/'); KASSERT(cnp->cn_nameptr[0] != '\0'); if ((error = lookup_parsepath(state, searchdir)) != 0) { break; } /* * Can't deal with DOTDOT lookups if NOCROSSMOUNT or the * lookup is chrooted. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { if ((searchdir->v_vflag & VV_ROOT) != 0 && (cnp->cn_flags & NOCROSSMOUNT)) { error = EOPNOTSUPP; break; } if (ndp->ni_rootdir != rootvnode) { error = EOPNOTSUPP; break; } } /* * Can't deal with last component when modifying; this needs * searchdir locked and VOP_LOOKUP() called (which can and * does modify state, despite the name). NB: this case means * terminal is never set true when LOCKPARENT. */ if ((cnp->cn_flags & ISLASTCN) != 0) { if (cnp->cn_nameiop != LOOKUP || (cnp->cn_flags & LOCKPARENT) != 0) { error = EOPNOTSUPP; break; } } /* * Good, now look for it in cache. cache_lookup_linked() * will fail if there's nothing there, or if there's no * ownership info for the directory, or if the user doesn't * have permission to look up files in this directory. */ if (!cache_lookup_linked(searchdir, cnp->cn_nameptr, cnp->cn_namelen, &foundobj, &plock, cnp->cn_cred)) { error = EOPNOTSUPP; break; } KASSERT(plock != NULL); KASSERT(rw_lock_held(plock)); /* * Scored a hit. Negative is good too (ENOENT). If there's * a '-o union' mount here, punt and let lookup_once() deal * with it. 
*/ if (foundobj == NULL) { if ((searchdir->v_vflag & VV_ROOT) != 0 && (searchdir->v_mount->mnt_flag & MNT_UNION) != 0) { error = EOPNOTSUPP; } else { error = ENOENT; terminal = ((cnp->cn_flags & ISLASTCN) != 0); } break; } /* * Stop and get a hold on the vnode if we've encountered * something other than a directory. */ if (foundobj->v_type != VDIR) { error = vcache_tryvget(foundobj); if (error != 0) { foundobj = NULL; error = EOPNOTSUPP; } else { terminal = (foundobj->v_type != VLNK && (cnp->cn_flags & ISLASTCN) != 0); } break; } /* * Try to cross mountpoints, bearing in mind that they can * be stacked. If at any point we can't go further, stop * and try to get a reference on the vnode. If we are able * to get a ref then lookup_crossmount() will take care of * it, otherwise we'll fall through to lookup_once(). */ if (foundobj->v_mountedhere != NULL) { while (foundobj->v_mountedhere != NULL && (cnp->cn_flags & NOCROSSMOUNT) == 0 && cache_cross_mount(&foundobj, &plock)) { KASSERT(foundobj != NULL); KASSERT(foundobj->v_type == VDIR); } if (foundobj->v_mountedhere != NULL) { error = vcache_tryvget(foundobj); if (error != 0) { foundobj = NULL; error = EOPNOTSUPP; } break; } else { searchdir = NULL; } } /* * Time to stop if we found the last component & traversed * all mounts. */ if ((cnp->cn_flags & ISLASTCN) != 0) { error = vcache_tryvget(foundobj); if (error != 0) { foundobj = NULL; error = EOPNOTSUPP; } else { terminal = (foundobj->v_type != VLNK); } break; } /* * Otherwise, we're still in business. Set the found VDIR * vnode as the search dir for the next component and * continue on to it. */ cnp->cn_nameptr = ndp->ni_next; searchdir = foundobj; } if (terminal) { /* * If we exited the loop above having successfully located * the last component with a zero error code, and it's not a * symbolic link, then the parent directory is not needed. * Release reference to the starting parent and make the * terminal parent disappear into thin air. */ KASSERT(plock != NULL); rw_exit(plock); vrele(*searchdir_ret); *searchdir_ret = NULL; } else if (searchdir != *searchdir_ret) { /* * Otherwise we need to return the parent. If we ended up * with a new search dir, ref it before dropping the * namecache's lock. The lock prevents both searchdir and * foundobj from disappearing. If we can't ref the new * searchdir, we have a bit of a problem. Roll back the * fastforward to the beginning and let lookup_once() take * care of it. */ if (searchdir == NULL) { /* * It's possible for searchdir to be NULL in the * case of a root vnode being reclaimed while * trying to cross a mount. */ error2 = EOPNOTSUPP; } else { error2 = vcache_tryvget(searchdir); } KASSERT(plock != NULL); rw_exit(plock); if (__predict_true(error2 == 0)) { /* Returning new searchdir, and maybe new foundobj. */ vrele(*searchdir_ret); *searchdir_ret = searchdir; } else { /* Returning nothing. */ if (foundobj != NULL) { vrele(foundobj); foundobj = NULL; } cnp->cn_nameptr = oldnameptr; ndp->ni_pathlen = oldpathlen; error = lookup_parsepath(state, *searchdir_ret); if (error == 0) { error = EOPNOTSUPP; } } } else if (plock != NULL) { /* Drop any namecache lock still held. */ rw_exit(plock); } KASSERT(error == 0 ? foundobj != NULL : foundobj == NULL); *foundobj_ret = foundobj; return error; } ////////////////////////////// /* * Do a complete path search from a single root directory. * (This is called up to twice if TRYEMULROOT is in effect.)
*/ static int namei_oneroot(struct namei_state *state, int neverfollow, int inhibitmagic, int isnfsd) { struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; struct vnode *searchdir, *foundobj; bool searchdir_locked = false; int error; error = namei_start(state, isnfsd, &searchdir); if (error) { ndp->ni_dvp = NULL; ndp->ni_vp = NULL; return error; } KASSERT(searchdir->v_type == VDIR); /* * Setup: break out flag bits into variables. */ state->docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE) state->docache = 0; state->rdonly = cnp->cn_flags & RDONLY; /* * Keep going until we run out of path components. */ cnp->cn_nameptr = ndp->ni_pnbuf; /* drop leading slashes (already used them to choose startdir) */ while (cnp->cn_nameptr[0] == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } /* was it just "/"? */ if (cnp->cn_nameptr[0] == '\0') { foundobj = searchdir; searchdir = NULL; cnp->cn_flags |= ISLASTCN; /* bleh */ goto skiploop; } for (;;) { KASSERT(searchdir != NULL); KASSERT(!searchdir_locked); /* * Parse out the first path name component that we need to * to consider. While doing this, attempt to use the name * cache to fast-forward through as many "easy" to find * components of the path as possible. */ error = lookup_fastforward(state, &searchdir, &foundobj); /* * If we didn't get a good answer from the namecache, then * go directly to the file system. */ if (error == EOPNOTSUPP) { error = lookup_once(state, searchdir, &searchdir, &foundobj, &searchdir_locked); } /* * If the vnode we found is mounted on, then cross the mount * and get the root vnode in foundobj. If this encounters * an error, it will dispose of foundobj, but searchdir is * untouched. */ if (error == 0 && foundobj != NULL && foundobj->v_type == VDIR && foundobj->v_mountedhere != NULL && (cnp->cn_flags & NOCROSSMOUNT) == 0) { error = lookup_crossmount(state, &searchdir, &foundobj, &searchdir_locked); } if (error) { if (searchdir != NULL) { if (searchdir_locked) { searchdir_locked = false; vput(searchdir); } else { vrele(searchdir); } } ndp->ni_dvp = NULL; ndp->ni_vp = NULL; /* * Note that if we're doing TRYEMULROOT we can * retry with the normal root. Where this is * currently set matches previous practice, * but the previous practice didn't make much * sense and somebody should sit down and * figure out which cases should cause retry * and which shouldn't. XXX. */ state->attempt_retry = 1; return (error); } if (foundobj == NULL) { /* * Success with no object returned means we're * creating something and it isn't already * there. Break out of the main loop now so * the code below doesn't have to test for * foundobj == NULL. */ /* lookup_once can't have dropped the searchdir */ KASSERT(searchdir != NULL || (cnp->cn_flags & ISLASTCN) != 0); break; } /* * Check for symbolic link. If we've reached one, * follow it, unless we aren't supposed to. Back up * over any slashes that we skipped, as we will need * them again. */ if (namei_atsymlink(state, foundobj)) { /* Don't need searchdir locked any more. */ if (searchdir_locked) { searchdir_locked = false; VOP_UNLOCK(searchdir); } ndp->ni_pathlen += state->slashes; ndp->ni_next -= state->slashes; if (neverfollow) { error = EINVAL; } else if (searchdir == NULL) { /* * dholland 20160410: lookup_once only * drops searchdir if it crossed a * mount point. Therefore, if we get * here it means we crossed a mount * point to a mounted filesystem whose * root vnode is a symlink. 
In theory * we could continue at this point by * using the pre-crossing searchdir * (e.g. just take out an extra * reference on it before calling * lookup_once so we still have it), * but this will make an ugly mess and * it should never happen in practice * as only badly broken filesystems * have non-directory root vnodes. (I * have seen this sort of thing with * NFS occasionally but even then it * means something's badly wrong.) */ error = ENOTDIR; } else { /* * dholland 20110410: if we're at a * union mount it might make sense to * use the top of the union stack here * rather than the layer we found the * symlink in. (FUTURE) */ error = namei_follow(state, inhibitmagic, searchdir, foundobj, &searchdir); } if (error) { KASSERT(searchdir != foundobj); if (searchdir != NULL) { vrele(searchdir); } vrele(foundobj); ndp->ni_dvp = NULL; ndp->ni_vp = NULL; return error; } vrele(foundobj); foundobj = NULL; /* * If we followed a symlink to `/' and there * are no more components after the symlink, * we're done with the loop and what we found * is the searchdir. */ if (cnp->cn_nameptr[0] == '\0') { KASSERT(searchdir != NULL); foundobj = searchdir; searchdir = NULL; cnp->cn_flags |= ISLASTCN; break; } continue; } /* * Not a symbolic link. * * Check for directory, if the component was * followed by a series of slashes. */ if ((foundobj->v_type != VDIR) && (cnp->cn_flags & REQUIREDIR)) { KASSERT(foundobj != searchdir); if (searchdir) { if (searchdir_locked) { searchdir_locked = false; vput(searchdir); } else { vrele(searchdir); } } else { KASSERT(!searchdir_locked); } vrele(foundobj); ndp->ni_dvp = NULL; ndp->ni_vp = NULL; state->attempt_retry = 1; return ENOTDIR; } /* * Stop if we've reached the last component. */ if (cnp->cn_flags & ISLASTCN) { break; } /* * Continue with the next component. */ cnp->cn_nameptr = ndp->ni_next; if (searchdir != NULL) { if (searchdir_locked) { searchdir_locked = false; vput(searchdir); } else { vrele(searchdir); } } searchdir = foundobj; foundobj = NULL; } KASSERT((cnp->cn_flags & LOCKPARENT) == 0 || searchdir == NULL || VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); skiploop: if (foundobj != NULL) { if (foundobj == ndp->ni_erootdir) { /* * We are about to return the emulation root. * This isn't a good idea because code might * repeatedly lookup ".." until the file * matches that returned for "/" and loop * forever. So convert it to the real root. */ if (searchdir != NULL) { if (searchdir_locked) { vput(searchdir); searchdir_locked = false; } else { vrele(searchdir); } searchdir = NULL; } vrele(foundobj); foundobj = ndp->ni_rootdir; vref(foundobj); } /* * If the caller requested the parent node (i.e. it's * a CREATE, DELETE, or RENAME), and we don't have one * (because this is the root directory, or we crossed * a mount point), then we must fail. * * 20210604 dholland when NONEXCLHACK is set (open * with O_CREAT but not O_EXCL) skip this logic. Since * we have a foundobj, open will not be creating, so * it doesn't actually need or use the searchdir, so * it's ok to return it even if it's on a different * volume, and it's also ok to return NULL; by setting * NONEXCLHACK the open code promises to cope with * those cases correctly. (That is, it should do what * it would do anyway, that is, just release the * searchdir, except not crash if it's null.) This is * needed because otherwise opening mountpoints with * O_CREAT but not O_EXCL fails... which is a silly * thing to do but ought to work. 
(This whole issue * came to light because 3rd party code wanted to open * certain procfs nodes with O_CREAT for some 3rd * party reason, and it failed.) * * Note that NONEXCLHACK is properly a different * nameiop (it is partway between LOOKUP and CREATE) * but it was stuffed in as a flag instead to make the * resulting patch less invasive for pullup. Blah. */ if (cnp->cn_nameiop != LOOKUP && (searchdir == NULL || searchdir->v_mount != foundobj->v_mount) && (cnp->cn_flags & NONEXCLHACK) == 0) { if (searchdir) { if (searchdir_locked) { vput(searchdir); searchdir_locked = false; } else { vrele(searchdir); } searchdir = NULL; } vrele(foundobj); foundobj = NULL; ndp->ni_dvp = NULL; ndp->ni_vp = NULL; state->attempt_retry = 1; switch (cnp->cn_nameiop) { case CREATE: return EEXIST; case DELETE: case RENAME: return EBUSY; default: break; } panic("Invalid nameiop\n"); } /* * Disallow directory write attempts on read-only lookups. * Prefers EEXIST over EROFS for the CREATE case. */ if (state->rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { if (searchdir) { if (searchdir_locked) { vput(searchdir); searchdir_locked = false; } else { vrele(searchdir); } searchdir = NULL; } vrele(foundobj); foundobj = NULL; ndp->ni_dvp = NULL; ndp->ni_vp = NULL; state->attempt_retry = 1; return EROFS; } /* Lock the leaf node if requested. */ if ((cnp->cn_flags & (LOCKLEAF | LOCKPARENT)) == LOCKPARENT && searchdir == foundobj) { /* * Note: if LOCKPARENT but not LOCKLEAF is * set, and searchdir == foundobj, this code * necessarily unlocks the parent as well as * the leaf. That is, just because you specify * LOCKPARENT doesn't mean you necessarily get * a locked parent vnode. The code in * vfs_syscalls.c, and possibly elsewhere, * that uses this combination "knows" this, so * it can't be safely changed. Feh. XXX */ KASSERT(searchdir_locked); VOP_UNLOCK(searchdir); searchdir_locked = false; } else if ((cnp->cn_flags & LOCKLEAF) != 0 && (searchdir != foundobj || (cnp->cn_flags & LOCKPARENT) == 0)) { const int lktype = (cnp->cn_flags & LOCKSHARED) != 0 ? LK_SHARED : LK_EXCLUSIVE; vn_lock(foundobj, lktype | LK_RETRY); } } /* * Done. */ /* * If LOCKPARENT is not set, the parent directory isn't returned. */ if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) { vrele(searchdir); searchdir = NULL; } ndp->ni_dvp = searchdir; ndp->ni_vp = foundobj; return 0; } /* * Do namei; wrapper layer that handles TRYEMULROOT. */ static int namei_tryemulroot(struct namei_state *state, int neverfollow, int inhibitmagic, int isnfsd) { int error; struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; const char *savepath = NULL; KASSERT(cnp == &ndp->ni_cnd); if (cnp->cn_flags & TRYEMULROOT) { savepath = pathbuf_stringcopy_get(ndp->ni_pathbuf); } emul_retry: state->attempt_retry = 0; error = namei_oneroot(state, neverfollow, inhibitmagic, isnfsd); if (error) { /* * Once namei has started up, the existence of ni_erootdir * tells us whether we're working from an emulation root. * The TRYEMULROOT flag isn't necessarily authoritative. */ if (ndp->ni_erootdir != NULL && state->attempt_retry) { /* Retry the whole thing using the normal root */ cnp->cn_flags &= ~TRYEMULROOT; state->attempt_retry = 0; /* kinda gross */ strcpy(ndp->ni_pathbuf->pb_path, savepath); pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath); savepath = NULL; goto emul_retry; } } if (savepath != NULL) { pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath); } return error; } /* * External interface. 
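 *
 * Illustrative sketch (not from the original source; error handling
 * abbreviated): callers normally drive namei() through a pathbuf and
 * NDINIT(), the same pattern used by nameiat_simple_kernel() below:
 *
 *	struct pathbuf *pb = pathbuf_create(path);
 *	struct nameidata nd;
 *
 *	if (pb == NULL)
 *		return ENOMEM;
 *	NDINIT(&nd, LOOKUP, FOLLOW, pb);
 *	error = namei(&nd);
 *	if (error == 0) {
 *		...use nd.ni_vp (referenced, and unlocked here since
 *		LOCKLEAF was not requested), then vrele() it...
 *	}
 *	pathbuf_destroy(pb);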
*/ int namei(struct nameidata *ndp) { struct namei_state state; int error; namei_init(&state, ndp); error = namei_tryemulroot(&state, 0/*!neverfollow*/, 0/*!inhibitmagic*/, 0/*isnfsd*/); namei_cleanup(&state); if (error) { /* make sure no stray refs leak out */ KASSERT(ndp->ni_dvp == NULL); KASSERT(ndp->ni_vp == NULL); } return error; } //////////////////////////////////////////////////////////// /* * External interface used by nfsd. This is basically different from * namei only in that it has the ability to pass in the "current * directory", and uses an extra flag "neverfollow" for which there's * no physical flag defined in namei.h. (There used to be a cut&paste * copy of about half of namei in nfsd to allow these minor * adjustments to exist.) * * XXX: the namei interface should be adjusted so nfsd can just use * ordinary namei(). */ int lookup_for_nfsd(struct nameidata *ndp, struct vnode *forcecwd, int neverfollow) { struct namei_state state; int error; KASSERT(ndp->ni_atdir == NULL); ndp->ni_atdir = forcecwd; namei_init(&state, ndp); error = namei_tryemulroot(&state, neverfollow, 1/*inhibitmagic*/, 1/*isnfsd*/); namei_cleanup(&state); if (error) { /* make sure no stray refs leak out */ KASSERT(ndp->ni_dvp == NULL); KASSERT(ndp->ni_vp == NULL); } return error; } /* * A second external interface used by nfsd. This turns out to be a * single lookup used by the WebNFS code (ha!) to get "index.html" or * equivalent when asked for a directory. It should eventually evolve * into some kind of namei_once() call; for the time being it's kind * of a mess. XXX. * * dholland 20110109: I don't think it works, and I don't think it * worked before I started hacking and slashing either, and I doubt * anyone will ever notice. */ /* * Internals. This calls lookup_once() after setting up the assorted * pieces of state the way they ought to be. */ static int do_lookup_for_nfsd_index(struct namei_state *state) { int error; struct componentname *cnp = state->cnp; struct nameidata *ndp = state->ndp; struct vnode *startdir; struct vnode *foundobj; bool startdir_locked; const char *cp; /* pointer into pathname argument */ KASSERT(cnp == &ndp->ni_cnd); startdir = state->ndp->ni_atdir; cnp->cn_nameptr = ndp->ni_pnbuf; state->docache = 1; state->rdonly = cnp->cn_flags & RDONLY; ndp->ni_dvp = NULL; error = VOP_PARSEPATH(startdir, cnp->cn_nameptr, &cnp->cn_namelen); if (error) { return error; } cp = cnp->cn_nameptr + cnp->cn_namelen; KASSERT(cnp->cn_namelen <= KERNEL_NAME_MAX); ndp->ni_pathlen -= cnp->cn_namelen; ndp->ni_next = cp; state->slashes = 0; cnp->cn_flags &= ~REQUIREDIR; cnp->cn_flags |= MAKEENTRY|ISLASTCN; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') cnp->cn_flags |= ISDOTDOT; else cnp->cn_flags &= ~ISDOTDOT; /* * Because lookup_once can change the startdir, we need our * own reference to it to avoid consuming the caller's. */ vref(startdir); error = lookup_once(state, startdir, &startdir, &foundobj, &startdir_locked); KASSERT((cnp->cn_flags & LOCKPARENT) == 0); if (startdir_locked) { VOP_UNLOCK(startdir); startdir_locked = false; } /* * If the vnode we found is mounted on, then cross the mount and get * the root vnode in foundobj. If this encounters an error, it will * dispose of foundobj, but searchdir is untouched. 
*/ if (error == 0 && foundobj != NULL && foundobj->v_type == VDIR && foundobj->v_mountedhere != NULL && (cnp->cn_flags & NOCROSSMOUNT) == 0) { error = lookup_crossmount(state, &startdir, &foundobj, &startdir_locked); } /* Now toss startdir and see if we have an error. */ if (startdir != NULL) vrele(startdir); if (error) foundobj = NULL; else if (foundobj != NULL && (cnp->cn_flags & LOCKLEAF) != 0) vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); ndp->ni_vp = foundobj; return (error); } /* * External interface. The partitioning between this function and the * above isn't very clear - the above function exists mostly so code * that uses "state->" can be shuffled around without having to change * it to "state.". */ int lookup_for_nfsd_index(struct nameidata *ndp, struct vnode *startdir) { struct namei_state state; int error; KASSERT(ndp->ni_atdir == NULL); ndp->ni_atdir = startdir; /* * Note: the name sent in here (is not|should not be) allowed * to contain a slash. */ if (strlen(ndp->ni_pathbuf->pb_path) > KERNEL_NAME_MAX) { return ENAMETOOLONG; } if (strchr(ndp->ni_pathbuf->pb_path, '/')) { return EINVAL; } ndp->ni_pathlen = strlen(ndp->ni_pathbuf->pb_path) + 1; ndp->ni_pnbuf = NULL; ndp->ni_cnd.cn_nameptr = NULL; namei_init(&state, ndp); error = do_lookup_for_nfsd_index(&state); namei_cleanup(&state); return error; } //////////////////////////////////////////////////////////// /* * Reacquire a path name component. * dvp is locked on entry and exit. * *vpp is locked on exit unless it's NULL. */ int relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int dummy) { int rdonly; /* lookup read-only flag bit */ int error = 0; #ifdef DEBUG size_t newlen; /* DEBUG: check name len */ const char *cp; /* DEBUG: check name ptr */ #endif /* DEBUG */ (void)dummy; /* * Setup: break out flag bits into variables. */ rdonly = cnp->cn_flags & RDONLY; /* * Search a new directory. * * The cn_hash value is for use by vfs_cache. * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ #ifdef DEBUG #if 0 cp = NULL; newhash = namei_hash(cnp->cn_nameptr, &cp); if ((uint32_t)newhash != (uint32_t)cnp->cn_hash) panic("relookup: bad hash"); #endif error = VOP_PARSEPATH(dvp, cnp->cn_nameptr, &newlen); if (error) { panic("relookup: parsepath failed with error %d", error); } if (cnp->cn_namelen != newlen) panic("relookup: bad len"); cp = cnp->cn_nameptr + cnp->cn_namelen; while (*cp == '/') cp++; if (*cp != 0) panic("relookup: not last component"); #endif /* DEBUG */ /* * Check for degenerate name (e.g. / or "") * which is a way of talking about a directory, * e.g. like "/." or ".". */ if (cnp->cn_nameptr[0] == '\0') panic("relookup: null name"); if (cnp->cn_flags & ISDOTDOT) panic("relookup: lookup on dot-dot"); /* * We now have a segment name to search for, and a directory to search. */ *vpp = NULL; error = VOP_LOOKUP(dvp, vpp, cnp); if ((error) != 0) { KASSERTMSG((*vpp == NULL), "leaf `%s' should be empty but is %p", cnp->cn_nameptr, *vpp); if (error != EJUSTRETURN) goto bad; } /* * Check for symbolic link */ KASSERTMSG((*vpp == NULL || (*vpp)->v_type != VLNK || (cnp->cn_flags & FOLLOW) == 0), "relookup: symlink found"); /* * Check for read-only lookups. */ if (rdonly && cnp->cn_nameiop != LOOKUP) { error = EROFS; if (*vpp) { vrele(*vpp); } goto bad; } /* * Lock result. 
*/ if (*vpp && *vpp != dvp) { error = vn_lock(*vpp, LK_EXCLUSIVE); if (error != 0) { vrele(*vpp); goto bad; } } return (0); bad: *vpp = NULL; return (error); } /* * namei_simple - simple forms of namei. * * These are wrappers to allow the simple case callers of namei to be * left alone while everything else changes under them. */ /* Flags */ struct namei_simple_flags_type { int dummy; }; static const struct namei_simple_flags_type ns_nn, ns_nt, ns_fn, ns_ft; const namei_simple_flags_t NSM_NOFOLLOW_NOEMULROOT = &ns_nn; const namei_simple_flags_t NSM_NOFOLLOW_TRYEMULROOT = &ns_nt; const namei_simple_flags_t NSM_FOLLOW_NOEMULROOT = &ns_fn; const namei_simple_flags_t NSM_FOLLOW_TRYEMULROOT = &ns_ft; static int namei_simple_convert_flags(namei_simple_flags_t sflags) { if (sflags == NSM_NOFOLLOW_NOEMULROOT) return NOFOLLOW | 0; if (sflags == NSM_NOFOLLOW_TRYEMULROOT) return NOFOLLOW | TRYEMULROOT; if (sflags == NSM_FOLLOW_NOEMULROOT) return FOLLOW | 0; if (sflags == NSM_FOLLOW_TRYEMULROOT) return FOLLOW | TRYEMULROOT; panic("namei_simple_convert_flags: bogus sflags\n"); return 0; } int namei_simple_kernel(const char *path, namei_simple_flags_t sflags, struct vnode **vp_ret) { return nameiat_simple_kernel(NULL, path, sflags, vp_ret); } int nameiat_simple_kernel(struct vnode *dvp, const char *path, namei_simple_flags_t sflags, struct vnode **vp_ret) { struct nameidata nd; struct pathbuf *pb; int err; pb = pathbuf_create(path); if (pb == NULL) { return ENOMEM; } NDINIT(&nd, LOOKUP, namei_simple_convert_flags(sflags), pb); if (dvp != NULL) NDAT(&nd, dvp); err = namei(&nd); if (err != 0) { pathbuf_destroy(pb); return err; } *vp_ret = nd.ni_vp; pathbuf_destroy(pb); return 0; } int namei_simple_user(const char *path, namei_simple_flags_t sflags, struct vnode **vp_ret) { return nameiat_simple_user(NULL, path, sflags, vp_ret); } int nameiat_simple_user(struct vnode *dvp, const char *path, namei_simple_flags_t sflags, struct vnode **vp_ret) { struct pathbuf *pb; struct nameidata nd; int err; err = pathbuf_copyin(path, &pb); if (err) { return err; } NDINIT(&nd, LOOKUP, namei_simple_convert_flags(sflags), pb); if (dvp != NULL) NDAT(&nd, dvp); err = namei(&nd); if (err != 0) { pathbuf_destroy(pb); return err; } *vp_ret = nd.ni_vp; pathbuf_destroy(pb); return 0; }
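/*
 * Illustrative sketch, not part of the original source: a minimal
 * in-kernel path lookup using the namei_simple interface defined
 * above. "example_resolve" and its argument are hypothetical names;
 * the block is disabled with #if 0 because it exists purely as an
 * example.
 */
#if 0
static int
example_resolve(const char *path)
{
	struct vnode *vp;
	int error;

	/* Follow symlinks, do not consult an emulation root. */
	error = namei_simple_kernel(path, NSM_FOLLOW_NOEMULROOT, &vp);
	if (error != 0)
		return error;

	/* ... use the referenced, unlocked vnode here ... */

	vrele(vp);	/* drop the reference acquired by the lookup */
	return 0;
}
#endif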
/* $NetBSD: ufs_lookup.c,v 1.158 2023/08/10 20:49:20 mrg Exp $ */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_lookup.c 8.9 (Berkeley) 8/11/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.158 2023/08/10 20:49:20 mrg Exp $"); #ifdef _KERNEL_OPT #include "opt_ffs.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/buf.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/kernel.h> #include <sys/kauth.h> #include <sys/wapbl.h> #include <sys/proc.h> #include <sys/kmem.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> #ifdef UFS_DIRHASH #include <ufs/ufs/dirhash.h> #endif #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_bswap.h> #include <ufs/ufs/ufs_wapbl.h> #include <miscfs/genfs/genfs.h> #ifdef DIAGNOSTIC int dirchk = 1; #else int dirchk = 0; #endif #if BYTE_ORDER == LITTLE_ENDIAN # define ENDIANSWAP(needswap) ((needswap) == 0) #else # define ENDIANSWAP(needswap) ((needswap) != 0) #endif #define NAMLEN(fsfmt, needswap, dp) \ ((fsfmt) && ENDIANSWAP(needswap) ? (dp)->d_type : (dp)->d_namlen) static void ufs_dirswap(struct direct *dirp) { uint8_t tmp = dirp->d_namlen; dirp->d_namlen = dirp->d_type; dirp->d_type = tmp; } struct slotinfo { enum { NONE, /* need to search a slot for our new entry */ COMPACT, /* a compaction can make a slot in the current DIRBLKSIZ block */ FOUND, /* found a slot (or no need to search) */ } status; doff_t offset; /* offset of area with free space. 
a special value -1 for invalid */ int size; /* size of area at slotoffset */ int freespace; /* accumulated amount of space free in the current DIRBLKSIZ block */ int needed; /* size of the entry we're seeking */ }; static void calc_count(struct ufs_lookup_results *results, int dirblksiz, doff_t prevoff) { if ((results->ulr_offset & (dirblksiz - 1)) == 0) results->ulr_count = 0; else results->ulr_count = results->ulr_offset - prevoff; } static void slot_init(struct slotinfo *slot) { slot->status = FOUND; slot->offset = -1; slot->freespace = slot->size = slot->needed = 0; } #ifdef UFS_DIRHASH static doff_t slot_findfree(struct slotinfo *slot, struct inode *dp) { if (slot->status == FOUND) return dp->i_size; slot->offset = ufsdirhash_findfree(dp, slot->needed, &slot->size); if (slot->offset < 0) return dp->i_size; slot->status = COMPACT; doff_t enduseful = ufsdirhash_enduseful(dp); if (enduseful < 0) return dp->i_size; return enduseful; } #endif static void slot_white(struct slotinfo *slot, uint16_t reclen, struct ufs_lookup_results *results) { slot->status = FOUND; slot->offset = results->ulr_offset; slot->size = reclen; results->ulr_reclen = slot->size; } static void slot_update(struct slotinfo *slot, int size, uint16_t reclen, doff_t offset) { if (size >= slot->needed) { slot->status = FOUND; slot->offset = offset; slot->size = reclen; } else if (slot->status == NONE) { slot->freespace += size; if (slot->offset == -1) slot->offset = offset; if (slot->freespace >= slot->needed) { slot->status = COMPACT; slot->size = offset + reclen - slot->offset; } } } /* * Return an indication of where the new directory entry should be put. * If we didn't find a slot, then set results->ulr_count to 0 indicating * that the new slot belongs at the end of the directory. If we found a slot, * then the new entry can be put in the range from results->ulr_offset to * results->ulr_offset + results->ulr_count. */ static int slot_estimate(const struct slotinfo *slot, int dirblksiz, int nameiop, doff_t prevoff, doff_t enduseful, const struct inode *ip, struct ufs_lookup_results *results) { if (slot->status == NONE) { results->ulr_offset = roundup(ip->i_size, dirblksiz); results->ulr_count = 0; enduseful = results->ulr_offset; } else if (nameiop == DELETE) { results->ulr_offset = slot->offset; calc_count(results, dirblksiz, prevoff); } else { results->ulr_offset = slot->offset; results->ulr_count = slot->size; if (enduseful < slot->offset + slot->size) enduseful = slot->offset + slot->size; } results->ulr_endoff = roundup(enduseful, dirblksiz); #if 0 /* commented out by dbj. none of the on disk fields changed */ ip->i_flag |= IN_CHANGE | IN_UPDATE; #endif return EJUSTRETURN; } /* * Check if we can delete inode tdp in directory vdp with inode ip and creds. */ static int ufs_can_delete(struct vnode *tdp, struct vnode *vdp, struct inode *ip, kauth_cred_t cred) { int error; #ifdef UFS_ACL /* * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt * * 3.16.2.1. ACE4_DELETE vs. ACE4_DELETE_CHILD */ /* * XXX: Is this check required? */ error = VOP_ACCESS(vdp, VEXEC, cred); if (error) goto out; #if 0 /* Moved to ufs_remove, ufs_rmdir because they hold the lock */ error = VOP_ACCESSX(tdp, VDELETE, cred); if (error == 0) return (0); #endif error = VOP_ACCESSX(vdp, VDELETE_CHILD, cred); if (error == 0) return (0); error = VOP_ACCESSX(vdp, VEXPLICIT_DENY | VDELETE_CHILD, cred); if (error) goto out; #endif /* !UFS_ACL */ /* * Write access to directory required to delete files. 
*/ error = VOP_ACCESS(vdp, VWRITE, cred); if (error) goto out; if (!(ip->i_mode & ISVTX)) return 0; /* * If directory is "sticky", then user must own * the directory, or the file in it, else she * may not delete it (unless she's root). This * implements append-only directories. */ error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, tdp, vdp, genfs_can_sticky(vdp, cred, ip->i_uid, VTOI(tdp)->i_uid)); if (error) { error = EPERM; // Why override? goto out; } return 0; out: vrele(tdp); return error; } static int ufs_getino(struct vnode *vdp, struct inode *ip, ino_t foundino, struct vnode **tdp, bool same) { if (ip->i_number == foundino) { if (same) return EISDIR; vref(vdp); *tdp = vdp; return 0; } return vcache_get(vdp->v_mount, &foundino, sizeof(foundino), tdp); } /* * Convert a component of a pathname into a pointer to a locked inode. * This is a very central and rather complicated routine. * If the file system is not maintained in a strict tree hierarchy, * this can result in a deadlock situation (see comments in code below). * * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending * on whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it and the target of the pathname * exists, lookup returns both the target and its parent directory locked. * When creating or renaming and LOCKPARENT is specified, the target may * not be ".". When deleting and LOCKPARENT is specified, the target may * be "."., but the caller must check to ensure it does an vrele and vput * instead of two vputs. * * Overall outline of ufs_lookup: * * check accessibility of directory * look for name in cache, if found, then if at end of path * and deleting or creating, drop it, else return name * search for name in directory, to found or notfound * notfound: * if creating, return locked directory, leaving info on available slots * else return error * found: * if at end of path and deleting, return information to allow delete * if at end of path and rewriting (RENAME and LOCKPARENT), lock target * inode and return info to allow rewrite * if not at end, add name to cache; if at end and neither creating * nor deleting, add name to cache */ int ufs_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap = v; struct vnode *vdp = ap->a_dvp; /* vnode for directory being searched */ struct inode *dp = VTOI(vdp); /* inode for directory being searched */ struct buf *bp; /* a buffer of directory entries */ struct direct *ep; /* the current directory entry */ int entryoffsetinblock; /* offset of ep in bp's buffer */ struct slotinfo slot; int numdirpasses; /* strategy for directory search */ doff_t endsearch; /* offset to end directory search */ doff_t prevoff; /* previous value of ulr_offset */ struct vnode *tdp; /* returned by vcache_get */ doff_t enduseful; /* pointer past last used dir slot. used for directory truncation. 
*/ u_long bmask; /* block offset mask */ int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; kauth_cred_t cred = cnp->cn_cred; int flags; int nameiop = cnp->cn_nameiop; struct ufsmount *ump = dp->i_ump; const int needswap = UFS_MPNEEDSWAP(ump); int dirblksiz = ump->um_dirblksiz; ino_t foundino; struct ufs_lookup_results *results; int iswhiteout; /* temp result from cache_lookup() */ const int fsfmt = FSFMT(vdp); uint16_t reclen; flags = cnp->cn_flags; bp = NULL; *vpp = NULL; endsearch = 0; /* silence compiler warning */ /* * Check accessibility of directory. */ if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0) return (error); if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) && (nameiop == DELETE || nameiop == RENAME)) return (EROFS); /* * We now have a segment name to search for, and a directory to search. * * Before tediously performing a linear scan of the directory, * check the name cache to see if the directory/name pair * we are looking for is known already. */ if (cache_lookup(vdp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_nameiop, cnp->cn_flags, &iswhiteout, vpp)) { if (iswhiteout) { cnp->cn_flags |= ISWHITEOUT; } return *vpp == NULLVP ? ENOENT : 0; } /* May need to restart the lookup with an exclusive lock. */ if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) { return ENOLCK; } /* * Produce the auxiliary lookup results into i_crap. Increment * its serial number so elsewhere we can tell if we're using * stale results. This should not be done this way. XXX. */ results = &dp->i_crap; dp->i_crapcounter++; if (iswhiteout) { /* * The namecache set iswhiteout without finding a * cache entry. As of this writing (20121014), this * can happen if there was a whiteout entry that has * been invalidated by the lookup. It is not clear if * it is correct to set ISWHITEOUT in this case or * not; however, doing so retains the prior behavior, * so we'll go with that until some clearer answer * appears. XXX */ cnp->cn_flags |= ISWHITEOUT; } /* * Suppress search for slots unless creating * file and at end of pathname, in which case * we watch for a place to put the new file in * case it doesn't already exist. */ slot_init(&slot); if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) { slot.status = NONE; slot.needed = UFS_DIRECTSIZ(cnp->cn_namelen); } /* * If there is cached information on a previous search of * this directory, pick up where we last left off. * We cache only lookups as these are the most common * and have the greatest payoff. Caching CREATE has little * benefit as it usually must search the entire directory * to determine that the entry does not exist. Caching the * location of the last DELETE or RENAME has not reduced * profiling time and hence has been removed in the interest * of simplicity. */ bmask = vdp->v_mount->mnt_stat.f_iosize - 1; #ifdef UFS_DIRHASH /* * Use dirhash for fast operations on large directories. The logic * to determine whether to hash the directory is contained within * ufsdirhash_build(); a zero return means that it decided to hash * this directory and it successfully built up the hash table. */ if (ufsdirhash_build(dp) == 0) { /* Look for a free slot if needed. */ enduseful = slot_findfree(&slot, dp); /* Look up the component. */ numdirpasses = 1; entryoffsetinblock = 0; /* silence compiler warning */ switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen, &results->ulr_offset, &bp, nameiop == DELETE ? 
&prevoff : NULL)) { case 0: ep = (void *)((char *)bp->b_data + (results->ulr_offset & bmask)); reclen = ufs_rw16(ep->d_reclen, needswap); goto foundentry; case ENOENT: results->ulr_offset = roundup(dp->i_size, dirblksiz); goto notfound; default: /* Something failed; just do a linear search. */ break; } } #endif /* UFS_DIRHASH */ if (nameiop != LOOKUP || results->ulr_diroff == 0 || results->ulr_diroff >= dp->i_size) { entryoffsetinblock = 0; results->ulr_offset = 0; numdirpasses = 1; } else { results->ulr_offset = results->ulr_diroff; entryoffsetinblock = results->ulr_offset & bmask; if (entryoffsetinblock != 0 && (error = ufs_blkatoff(vdp, (off_t)results->ulr_offset, NULL, &bp, false))) goto out; numdirpasses = 2; namecache_count_2passes(); } prevoff = results->ulr_offset; endsearch = roundup(dp->i_size, dirblksiz); enduseful = 0; searchloop: while (results->ulr_offset < endsearch) { preempt_point(); /* * If necessary, get the next directory block. */ if ((results->ulr_offset & bmask) == 0) { if (bp != NULL) brelse(bp, 0); error = ufs_blkatoff(vdp, (off_t)results->ulr_offset, NULL, &bp, false); if (error) goto out; entryoffsetinblock = 0; } /* * If still looking for a slot, and at a DIRBLKSIZ * boundary, have to start looking for free space again. */ if (slot.status == NONE && (entryoffsetinblock & (dirblksiz - 1)) == 0) { slot.offset = -1; slot.freespace = 0; } /* * Get pointer to next entry. * Full validation checks are slow, so we only check * enough to insure forward progress through the * directory. Complete checks can be run by patching * "dirchk" to be true. */ KASSERT(bp != NULL); ep = (void *)((char *)bp->b_data + entryoffsetinblock); const char *msg; reclen = ufs_rw16(ep->d_reclen, needswap); if ((reclen == 0 && (msg = "null entry")) || (dirchk && (msg = ufs_dirbadentry(vdp, ep, entryoffsetinblock)))) { ufs_dirbad(dp, results->ulr_offset, msg); reclen = dirblksiz - (entryoffsetinblock & (dirblksiz - 1)); goto next; } /* * If an appropriate sized slot has not yet been found, * check to see if one is available. Also accumulate space * in the current block so that we can determine if * compaction is viable. */ if (slot.status != FOUND) { int size = reclen; if (ep->d_ino != 0) size -= UFS_DIRSIZ(fsfmt, ep, needswap); if (size > 0) slot_update(&slot, size, reclen, results->ulr_offset); } if (ep->d_ino == 0) goto next; /* * Check for a name match. */ const uint16_t namlen = NAMLEN(fsfmt, needswap, ep); if (namlen != cnp->cn_namelen || memcmp(cnp->cn_nameptr, ep->d_name, (size_t)namlen)) goto next; #ifdef UFS_DIRHASH foundentry: #endif /* * Save directory entry's inode number and * reclen, and release directory buffer. */ if (!fsfmt && ep->d_type == DT_WHT) { slot_white(&slot, reclen, results); /* * This is used to set results->ulr_endoff, which may * be used by ufs_direnter() as a length to truncate * the directory to. Therefore, it must point past the * end of the last non-empty directory entry. We don't * know where that is in this case, so we effectively * disable shrinking by using the existing size of the * directory. * * Note that we wouldn't expect to shrink the * directory while rewriting an existing entry anyway. 
*/ enduseful = endsearch; cnp->cn_flags |= ISWHITEOUT; numdirpasses--; goto notfound; } foundino = ufs_rw32(ep->d_ino, needswap); results->ulr_reclen = reclen; goto found; next: prevoff = results->ulr_offset; results->ulr_offset += reclen; entryoffsetinblock += reclen; if (ep->d_ino) enduseful = results->ulr_offset; } notfound: /* * If we started in the middle of the directory and failed * to find our target, we must check the beginning as well. */ if (numdirpasses == 2) { numdirpasses--; results->ulr_offset = 0; endsearch = results->ulr_diroff; goto searchloop; } if (bp != NULL) brelse(bp, 0); /* * If creating, and at end of pathname and current * directory has not been removed, then can consider * allowing file to be created. */ if ((nameiop == CREATE || nameiop == RENAME || (nameiop == DELETE && (cnp->cn_flags & DOWHITEOUT) && (cnp->cn_flags & ISWHITEOUT))) && (flags & ISLASTCN) && dp->i_nlink != 0) { /* * Access for write is interpreted as allowing * creation of files in the directory. */ if (flags & WILLBEDIR) error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred); else error = VOP_ACCESS(vdp, VWRITE, cred); if (error) goto out; error = slot_estimate(&slot, dirblksiz, nameiop, prevoff, enduseful, dp, results); /* * We return with the directory locked, so that * the parameters we set up above will still be * valid if we actually decide to do a direnter(). * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory inode in ndp->ni_dvp. * * NB - if the directory is unlocked, then this * information cannot be used. */ goto out; } /* * Insert name into cache (as non-existent) if appropriate. */ if (nameiop != CREATE) { cache_enter(vdp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags); } error = ENOENT; goto out; found: if (numdirpasses == 2) namecache_count_pass2(); /* * Check that directory length properly reflects presence * of this entry. */ const uint64_t newisize = results->ulr_offset + UFS_DIRSIZ(fsfmt, ep, needswap); if (newisize > dp->i_size) { ufs_dirbad(dp, results->ulr_offset, "i_size too small"); dp->i_size = newisize; DIP_ASSIGN(dp, size, dp->i_size); dp->i_flag |= IN_CHANGE | IN_UPDATE; UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP); } brelse(bp, 0); /* * Found component in pathname. * If the final component of path name, save information * in the cache as to where the entry was found. */ if ((flags & ISLASTCN) && nameiop == LOOKUP) results->ulr_diroff = results->ulr_offset & ~(dirblksiz - 1); /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. * Lock the inode, being careful with ".". */ if (nameiop == DELETE && (flags & ISLASTCN)) { /* * Return pointer to current entry in results->ulr_offset, * and distance past previous entry (if there * is a previous entry in this block) in results->ulr_count. * Save directory inode pointer in ndp->ni_dvp for dirremove(). */ calc_count(results, dirblksiz, prevoff); if ((error = ufs_getino(vdp, dp, foundino, &tdp, false)) != 0) goto out; if ((error = ufs_can_delete(tdp, vdp, dp, cred)) != 0) goto out; *vpp = tdp; goto out; } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. 
*/ if (nameiop == RENAME && (flags & ISLASTCN)) { if (flags & WILLBEDIR) error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred); else error = VOP_ACCESS(vdp, VWRITE, cred); if (error) goto out; /* * Careful about locking second inode. * This can only occur if the target is ".". */ if ((error = ufs_getino(vdp, dp, foundino, &tdp, true)) != 0) goto out; *vpp = tdp; goto out; } if ((error = ufs_getino(vdp, dp, foundino, &tdp, false)) != 0) goto out; *vpp = tdp; /* * Insert name into cache if appropriate. */ cache_enter(vdp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags); error = 0; out: return error; } void ufs_dirbad(struct inode *ip, doff_t offset, const char *how) { struct mount *mp = ITOV(ip)->v_mount; void (*p)(const char *, ...) __printflike(1, 2) = (mp->mnt_flag & MNT_RDONLY) == 0 ? panic : printf; (*p)("%s: bad dir ino %ju at offset %d: %s\n", mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number, offset, how); } /* * Do consistency checking on a directory entry: * record length must be multiple of 4 * entry must fit in rest of its DIRBLKSIZ block * record must be large enough to contain entry * name is not longer than FFS_MAXNAMLEN * name must be as long as advertised, and null terminated */ const char * ufs_dirbadentry(const struct vnode *dp, const struct direct *ep, int entryoffsetinblock) { const struct ufsmount *ump = VFSTOUFS(dp->v_mount); const int needswap = UFS_MPNEEDSWAP(ump); const int dirblksiz = ump->um_dirblksiz; const int maxsize = dirblksiz - (entryoffsetinblock & (dirblksiz - 1)); const int fsfmt = FSFMT(dp); const uint8_t namlen = NAMLEN(fsfmt, needswap, ep); const uint16_t reclen = ufs_rw16(ep->d_reclen, needswap); const int dirsiz = (int)UFS_DIRSIZ(fsfmt, ep, needswap); const char *name = ep->d_name; const char *str; #ifdef DIAGNOSTIC static char buf[512]; #endif if ((reclen & 0x3) != 0) str = "not rounded"; else if (reclen > maxsize) str = "too big"; else if (reclen < dirsiz) str = "too small"; #if FFS_MAXNAMLEN < 255 else if (namlen > FFS_MAXNAMLEN) str = "long name"; #endif else str = NULL; if (str) { #ifdef DIAGNOSTIC snprintf(buf, sizeof(buf), "Bad dir (%s), reclen=%#x, " "namlen=%d, dirsiz=%d <= reclen=%d <= maxsize=%d, " "flags=%#x, entryoffsetinblock=%d, dirblksiz=%d", str, reclen, namlen, dirsiz, reclen, maxsize, dp->v_mount->mnt_flag, entryoffsetinblock, dirblksiz); str = buf; #endif return str; } if (ep->d_ino == 0) return NULL; for (uint8_t i = 0; i < namlen; i++) if (name[i] == '\0') { str = "NUL in name"; #ifdef DIAGNOSTIC snprintf(buf, sizeof(buf), "%s [%s] i=%d, namlen=%d", str, name, i, namlen); str = buf; #endif return str; } if (name[namlen]) { str = "missing NUL in name"; #ifdef DIAGNOSTIC snprintf(buf, sizeof(buf), "%s [%*.*s] namlen=%d", str, namlen, namlen, name, namlen); str = buf; #endif return str; } return NULL; } /* * Construct a new directory entry after a call to namei, using the * name in the componentname argument cnp. The argument ip is the * inode to which the new directory entry will refer. 
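 *
 * Worked example (illustrative, not from the original source): for a
 * five character component name, the result has d_ino set to the
 * inode number, d_namlen set to 5, and d_name holding the five name
 * bytes followed by NUL padding (UFS_NAMEPAD), which keeps directory
 * records 4-byte aligned as ufs_dirbadentry() expects. d_type is
 * derived from the inode mode with IFTODT(), except on old-format
 * (FSFMT) file systems, where it is left zero.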
*/ void ufs_makedirentry(struct inode *ip, struct componentname *cnp, struct direct *newdirp) { size_t namelen = cnp->cn_namelen; newdirp->d_ino = ip->i_number; newdirp->d_namlen = namelen; memcpy(newdirp->d_name, cnp->cn_nameptr, namelen); /* NUL terminate and zero out padding */ memset(&newdirp->d_name[namelen], 0, UFS_NAMEPAD(namelen)); if (FSFMT(ITOV(ip))) newdirp->d_type = 0; else newdirp->d_type = IFTODT(ip->i_mode); } static int ufs_dirgrow(struct vnode *dvp, const struct ufs_lookup_results *ulr, struct vnode *tvp, struct direct *dirp, struct componentname *cnp, struct buf *newdirbp) { const kauth_cred_t cr = cnp->cn_cred; const struct ufsmount *ump = VFSTOUFS(dvp->v_mount); const int needswap = UFS_MPNEEDSWAP(ump); const int dirblksiz = ump->um_dirblksiz; const int fsfmt = FSFMT(dvp); const u_int newentrysize = UFS_DIRSIZ(0, dirp, 0); struct inode *dp = VTOI(dvp); int error, ret, blkoff; struct timespec ts; struct buf *bp; /* * If ulr_count is 0, then namei could find no * space in the directory. Here, ulr_offset will * be on a directory block boundary and we will write the * new entry into a fresh block. */ if (ulr->ulr_offset & (dirblksiz - 1)) panic("%s: newblk", __func__); if ((error = UFS_BALLOC(dvp, (off_t)ulr->ulr_offset, dirblksiz, cr, B_CLRBUF | B_SYNC, &bp)) != 0) { return error; } dp->i_size = ulr->ulr_offset + dirblksiz; DIP_ASSIGN(dp, size, dp->i_size); dp->i_flag |= IN_CHANGE | IN_UPDATE; uvm_vnp_setsize(dvp, dp->i_size); dirp->d_reclen = ufs_rw16(dirblksiz, needswap); dirp->d_ino = ufs_rw32(dirp->d_ino, needswap); if (fsfmt && ENDIANSWAP(needswap)) ufs_dirswap(dirp); blkoff = ulr->ulr_offset & (ump->um_mountp->mnt_stat.f_iosize - 1); memcpy((char *)bp->b_data + blkoff, dirp, newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) { ufsdirhash_newblk(dp, ulr->ulr_offset); ufsdirhash_add(dp, dirp, ulr->ulr_offset); ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff, ulr->ulr_offset); } #endif error = VOP_BWRITE(bp->b_vp, bp); vfs_timestamp(&ts); ret = UFS_UPDATE(dvp, &ts, &ts, UPDATE_DIROP); if (error == 0) return ret; return error; } static int #if __GNUC_PREREQ__(5, 3) /* This gets miscompiled by gcc 5.3 PR/51094 */ __attribute__((__optimize__("no-tree-vrp"))) #endif ufs_dircompact(struct vnode *dvp, const struct ufs_lookup_results *ulr, struct vnode *tvp, struct direct *dirp, struct componentname *cnp, struct buf *newdirbp) { const struct ufsmount *ump = VFSTOUFS(dvp->v_mount); const int needswap = UFS_MPNEEDSWAP(ump); const int fsfmt = FSFMT(dvp); const u_int newentrysize = UFS_DIRSIZ(0, dirp, 0); struct inode *dp = VTOI(dvp); struct buf *bp; u_int dsize; struct direct *ep, *nep; int error, loc, spacefree; char *dirbuf; uint16_t reclen; UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount); /* * If ulr_count is non-zero, then namei found space for the new * entry in the range ulr_offset to ulr_offset + ulr_count * in the directory. To use this space, we may have to compact * the entries located there, by copying them together towards the * beginning of the block, leaving the free space in one usable * chunk at the end. */ /* * Increase size of directory if entry eats into new space. * This should never push the size past a new multiple of * DIRBLKSIZ. * * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. 
*/ if (ulr->ulr_offset + ulr->ulr_count > dp->i_size) { #ifdef DIAGNOSTIC printf("%s: reached 4.2-only block, not supposed to happen\n", __func__); #endif dp->i_size = ulr->ulr_offset + ulr->ulr_count; DIP_ASSIGN(dp, size, dp->i_size); dp->i_flag |= IN_CHANGE | IN_UPDATE; UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); } /* * Get the block containing the space for the new directory entry. */ error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp, true); if (error) return error; /* * Find space for the new entry. In the simple case, the entry at * offset base will have the space. If it does not, then namei * arranged that compacting the region ulr_offset to * ulr_offset + ulr_count would yield the space. */ ep = (void *)dirbuf; dsize = (ep->d_ino != 0) ? UFS_DIRSIZ(fsfmt, ep, needswap) : 0; reclen = ufs_rw16(ep->d_reclen, needswap); spacefree = reclen - dsize; for (loc = reclen; loc < ulr->ulr_count; ) { nep = (void *)(dirbuf + loc); /* Trim the existing slot (NB: dsize may be zero). */ ep->d_reclen = ufs_rw16(dsize, needswap); ep = (void *)((char *)ep + dsize); reclen = ufs_rw16(nep->d_reclen, needswap); loc += reclen; if (nep->d_ino == 0) { /* * A mid-block unused entry. Such entries are * never created by the kernel, but fsck_ffs * can create them (and it doesn't fix them). * * Add up the free space, and initialise the * relocated entry since we don't memcpy it. */ spacefree += reclen; ep->d_ino = 0; dsize = 0; continue; } dsize = UFS_DIRSIZ(fsfmt, nep, needswap); spacefree += reclen - dsize; #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_move(dp, nep, ulr->ulr_offset + ((char *)nep - dirbuf), ulr->ulr_offset + ((char *)ep - dirbuf)); #endif memcpy(ep, nep, dsize); } /* * Here, `ep' points to a directory entry containing `dsize' in-use * bytes followed by `spacefree' unused bytes. If ep->d_ino == 0, * then the entry is completely unused (dsize == 0). The value * of ep->d_reclen is always indeterminate. * * Update the pointer fields in the previous entry (if any), * copy in the new entry, and write out the block. */ if (ep->d_ino == 0 || (ufs_rw32(ep->d_ino, needswap) == UFS_WINO && memcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) { if (spacefree + dsize < newentrysize) panic("%s: too big", __func__); dirp->d_reclen = spacefree + dsize; } else { if (spacefree < newentrysize) panic("%s: nospace", __func__); dirp->d_reclen = spacefree; ep->d_reclen = ufs_rw16(dsize, needswap); ep = (void *)((char *)ep + dsize); } dirp->d_reclen = ufs_rw16(dirp->d_reclen, needswap); dirp->d_ino = ufs_rw32(dirp->d_ino, needswap); if (fsfmt && ENDIANSWAP(needswap)) ufs_dirswap(dirp); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL && (ep->d_ino == 0 || dirp->d_reclen == spacefree)) ufsdirhash_add(dp, dirp, ulr->ulr_offset + ((char *)ep - dirbuf)); #endif memcpy(ep, dirp, newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) { const int dirblkmsk = ump->um_dirblksiz - 1; ufsdirhash_checkblock(dp, dirbuf - (ulr->ulr_offset & dirblkmsk), ulr->ulr_offset & ~dirblkmsk); } #endif error = VOP_BWRITE(bp->b_vp, bp); dp->i_flag |= IN_CHANGE | IN_UPDATE; /* * If all went well, and the directory can be shortened, proceed * with the truncation. Note that we have to unlock the inode for * the entry that we just entered, as the truncation may need to * lock other inodes which can lead to deadlock if we also hold a * lock on the newly entered node. 
*/ if (error == 0 && ulr->ulr_endoff && ulr->ulr_endoff < dp->i_size) { const kauth_cred_t cr = cnp->cn_cred; #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_dirtrunc(dp, ulr->ulr_endoff); #endif (void) UFS_TRUNCATE(dvp, (off_t)ulr->ulr_endoff, IO_SYNC, cr); } UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); return error; } /* * Write a directory entry after a call to namei, using the parameters * that ufs_lookup left in nameidata and in the ufs_lookup_results. * * DVP is the directory to be updated. It must be locked. * ULR is the ufs_lookup_results structure from the final lookup step. * TVP is not used. (XXX: why is it here? remove it) * DIRP is the new directory entry contents. * CNP is the componentname from the final lookup step. * NEWDIRBP is not used and (XXX) should be removed. The previous * comment here said it was used by the now-removed softupdates code. * * The link count of the target inode is *not* incremented; the * caller does that. * * If ulr->ulr_count is 0, ufs_lookup did not find space to insert the * directory entry. ulr_offset, which is the place to put the entry, * should be on a block boundary (and should be at the end of the * directory AFAIK) and a fresh block is allocated to put the new * directory entry in. * * If ulr->ulr_count is not zero, ufs_lookup found a slot to insert * the entry into. This slot ranges from ulr_offset to ulr_offset + * ulr_count. However, this slot may already be partially populated * requiring compaction. See notes below. * * Furthermore, if ulr_count is not zero and ulr_endoff is not the * same as i_size, the directory is truncated to size ulr_endoff. */ int ufs_direnter(struct vnode *dvp, const struct ufs_lookup_results *ulr, struct vnode *tvp, struct direct *dirp, struct componentname *cnp, struct buf *newdirbp) { if (ulr->ulr_count == 0) return ufs_dirgrow(dvp, ulr, tvp, dirp, cnp, newdirbp); else return ufs_dircompact(dvp, ulr, tvp, dirp, cnp, newdirbp); } /* * Remove a directory entry after a call to namei, using the * parameters that ufs_lookup left in nameidata and in the * ufs_lookup_results. * * DVP is the directory to be updated. It must be locked. * ULR is the ufs_lookup_results structure from the final lookup step. * IP, if not null, is the inode being unlinked. * FLAGS may contain DOWHITEOUT. * ISRMDIR is not used and (XXX) should be removed. * * If FLAGS contains DOWHITEOUT the entry is replaced with a whiteout * instead of being cleared. * * ulr->ulr_offset contains the position of the directory entry * to be removed. * * ulr->ulr_reclen contains the size of the directory entry to be * removed. * * ulr->ulr_count contains the size of the *previous* directory * entry. This allows finding it, for free space management. If * ulr_count is 0, the target entry is at the beginning of the * directory. (Does this ever happen? The first entry should be ".", * which should only be removed at rmdir time. Does rmdir come here * to clear out the "." and ".." entries? Perhaps, but I doubt it.) * * The space is marked free by adding it to the record length (not * name length) of the preceding entry. If the first entry becomes * free, it is marked free by setting the inode number to 0. * * The link count of IP is decremented. Note that this is not the * inverse behavior of ufs_direnter, which does not adjust link * counts. Sigh. 
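 *
 * To make that bookkeeping concrete, with illustrative numbers: suppose
 * the entry being removed starts at ulr_offset = 72 in its DIRBLKSIZ
 * block with ulr_reclen = 24, and the previous entry in the same block
 * starts at offset 48 with d_reclen = 24 (so ulr_count = 24).  The code
 * below reads the block at 72 - 24 = 48 and grows that previous entry's
 * d_reclen from 24 to 24 + 24 = 48, absorbing the removed entry's bytes
 * as trailing free space.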
*/ int ufs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr, struct inode *ip, int flags, int isrmdir) { struct inode *dp = VTOI(dvp); struct direct *ep; struct buf *bp; int error; const int needswap = UFS_MPNEEDSWAP(dp->i_ump); uint16_t reclen; UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount); if (flags & DOWHITEOUT) { /* * Whiteout entry: set d_ino to UFS_WINO. */ error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &ep, &bp, true); if (error) return (error); ep->d_ino = ufs_rw32(UFS_WINO, needswap); ep->d_type = DT_WHT; goto out; } if ((error = ufs_blkatoff(dvp, (off_t)(ulr->ulr_offset - ulr->ulr_count), &ep, &bp, true)) != 0) return (error); reclen = ufs_rw16(ep->d_reclen, needswap); #ifdef UFS_DIRHASH /* * Remove the dirhash entry. This is complicated by the fact * that `ep' is the previous entry when ulr_count != 0. */ if (dp->i_dirhash != NULL) ufsdirhash_remove(dp, (ulr->ulr_count == 0) ? ep : (void *)((char *)ep + reclen), ulr->ulr_offset); #endif if (ulr->ulr_count == 0) { /* * First entry in block: set d_ino to zero. */ ep->d_ino = 0; } else { /* * Collapse new free space into previous entry. */ ep->d_reclen = ufs_rw16(reclen + ulr->ulr_reclen, needswap); } #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) { int dirblksiz = ip->i_ump->um_dirblksiz; ufsdirhash_checkblock(dp, (char *)ep - ((ulr->ulr_offset - ulr->ulr_count) & (dirblksiz - 1)), ulr->ulr_offset & ~(dirblksiz - 1)); } #endif out: if (ip) { ip->i_nlink--; DIP_ASSIGN(ip, nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(ITOV(ip), NULL, NULL, 0); } /* * XXX did it ever occur to anyone that it might be a good * idea to restore ip->i_nlink if this fails? Or something? * Currently on error return from this function the state of * ip->i_nlink depends on what happened, and callers * definitely do not take this into account. */ error = VOP_BWRITE(bp->b_vp, bp); dp->i_flag |= IN_CHANGE | IN_UPDATE; /* * If the last named reference to a snapshot goes away, * drop its snapshot reference so that it will be reclaimed * when last open reference goes away. */ if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 && ip->i_nlink == 0) UFS_SNAPGONE(ITOV(ip)); UFS_WAPBL_UPDATE(dvp, NULL, NULL, 0); return (error); } /* * Rewrite an existing directory entry to point at the inode supplied. * * DP is the directory to update. * OFFSET is the position of the entry in question. It may come * from ulr_offset of a ufs_lookup_results. * OIP is the old inode the directory previously pointed to. * NEWINUM is the number of the new inode. * NEWTYPE is the new value for the type field of the directory entry. * (This is ignored if the fs doesn't support that.) * ISRMDIR is not used and (XXX) should be removed. * IFLAGS are added to DP's inode flags. * * The link count of OIP is decremented. Note that the link count of * the new inode is *not* incremented. Yay for symmetry. 
*/ int ufs_dirrewrite(struct inode *dp, off_t offset, struct inode *oip, ino_t newinum, int newtype, int isrmdir, int iflags) { struct buf *bp; struct direct *ep; struct vnode *vdp = ITOV(dp); int error; error = ufs_blkatoff(vdp, offset, &ep, &bp, true); if (error) return (error); ep->d_ino = ufs_rw32(newinum, UFS_MPNEEDSWAP(dp->i_ump)); if (!FSFMT(vdp)) ep->d_type = newtype; oip->i_nlink--; DIP_ASSIGN(oip, nlink, oip->i_nlink); oip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(ITOV(oip), NULL, NULL, UPDATE_DIROP); error = VOP_BWRITE(bp->b_vp, bp); dp->i_flag |= iflags; /* * If the last named reference to a snapshot goes away, * drop its snapshot reference so that it will be reclaimed * when last open reference goes away. */ if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_nlink == 0) UFS_SNAPGONE(ITOV(oip)); UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP); return (error); } /* * Check if a directory is empty or not. * Inode supplied must be locked. * * Using a struct dirtemplate here is not precisely * what we want, but better than using a struct direct. * * NB: does not handle corrupted directories. */ int ufs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred) { doff_t off; struct direct dbuf; struct direct *dp = &dbuf; int error; size_t count; const int needswap = UFS_IPNEEDSWAP(ip); const int fsfmt = FSFMT(ITOV(ip)); #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) for (off = 0; off < ip->i_size; off += ufs_rw16(dp->d_reclen, needswap)) { error = ufs_bufio(UIO_READ, ITOV(ip), dp, MINDIRSIZ, off, IO_NODELOCKED, cred, &count, NULL); /* * Since we read MINDIRSIZ, residual must * be 0 unless we're at end of file. */ if (error || count != 0) return (0); /* avoid infinite loops */ if (dp->d_reclen == 0) return (0); /* skip empty entries */ ino_t ino = ufs_rw32(dp->d_ino, needswap); if (ino == 0 || ino == UFS_WINO) continue; /* accept only "." and ".." */ const uint8_t namlen = NAMLEN(fsfmt, needswap, dp); if (namlen > 2) return (0); if (dp->d_name[0] != '.') return (0); /* * At this point namlen must be 1 or 2. * 1 implies ".", 2 implies ".." if second * char is also "." */ if (namlen == 1 && ino == ip->i_number) continue; if (dp->d_name[1] == '.' && ino == parentino) continue; return (0); } return (1); } #define UFS_DIRRABLKS 0 int ufs_dirrablks = UFS_DIRRABLKS; /* * ufs_blkatoff: Return buffer with the contents of block "offset" from * the beginning of directory "vp". If "res" is non-NULL, fill it in with * a pointer to the remaining space in the directory. If the caller intends * to modify the buffer returned, "modify" must be true. */ int ufs_blkatoff(struct vnode *vp, off_t offset, void *v, struct buf **bpp, bool modify) { char **res = v; struct inode *ip __diagused; struct buf *bp; daddr_t lbn; const int dirrablks = ufs_dirrablks; daddr_t *blks; int *blksizes; int run, error; struct mount *mp = vp->v_mount; const int bshift = mp->mnt_fs_bshift; const int bsize = 1 << bshift; off_t eof; blks = kmem_alloc((1 + dirrablks) * sizeof(daddr_t), KM_SLEEP); blksizes = kmem_alloc((1 + dirrablks) * sizeof(int), KM_SLEEP); ip = VTOI(vp); KASSERT(vp->v_size == ip->i_size); GOP_SIZE(vp, vp->v_size, &eof, 0); lbn = offset >> bshift; for (run = 0; run <= dirrablks;) { const off_t curoff = lbn << bshift; const int size = MIN(eof - curoff, bsize); if (size == 0) { break; } KASSERT(curoff < eof); blks[run] = lbn; blksizes[run] = size; lbn++; run++; if (size != bsize) { break; } } KASSERT(run >= 1); error = breadn(vp, blks[0], blksizes[0], &blks[1], &blksizes[1], run - 1, (modify ? 
B_MODIFY : 0), &bp); if (error != 0) { *bpp = NULL; goto out; } if (res) { *res = (char *)bp->b_data + (offset & (bsize - 1)); } *bpp = bp; out: kmem_free(blks, (1 + dirrablks) * sizeof(daddr_t)); kmem_free(blksizes, (1 + dirrablks) * sizeof(int)); return error; }
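/*
 * The checks in ufs_dirbadentry() above follow from the on-disk record
 * layout: an 8-byte fixed header (d_ino, d_reclen, d_type, d_namlen), a
 * NUL-terminated name, and a record length rounded up to a multiple of 4.
 * The sketch below is a simplified user-space illustration of that size
 * arithmetic for the modern directory format only; it is not the kernel's
 * UFS_DIRSIZ macro, which also handles the old format and byte swapping.
 */
#include <stdint.h>
#include <stdio.h>

/* Smallest legal record length for a name of namlen bytes. */
static uint16_t
dir_minreclen(uint8_t namlen)
{

	/* 8-byte header + name + NUL, rounded up to a 4-byte boundary. */
	return (uint16_t)((8 + namlen + 1 + 3) & ~3u);
}

int
main(void)
{

	printf("namlen 3 -> %u\n", dir_minreclen(3));	/* 12 */
	printf("namlen 6 -> %u\n", dir_minreclen(6));	/* 16 */
	return 0;
}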
1 2 2 2 2 2 2 1 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 /* $NetBSD: pms.c,v 1.41 2023/09/05 05:55:12 mrg Exp $ */ /*- * Copyright (c) 2004 Kentaro Kurahone. * Copyright (c) 2004 Ales Krenek. * Copyright (c) 1994 Charles M. Hannum. * Copyright (c) 1992, 1993 Erik Forsberg. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * THIS SOFTWARE IS PROVIDED BY ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN * NO EVENT SHALL I BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: pms.c,v 1.41 2023/09/05 05:55:12 mrg Exp $"); #include "opt_pms.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/device.h> #include <sys/ioctl.h> #include <sys/kernel.h> #include <sys/kthread.h> #include <sys/bus.h> #include <dev/pckbport/pckbportvar.h> #ifdef PMS_SYNAPTICS_TOUCHPAD #include <dev/pckbport/synapticsvar.h> #endif #ifdef PMS_ELANTECH_TOUCHPAD #include <dev/pckbport/elantechvar.h> #endif #ifdef PMS_ALPS_TOUCHPAD #include <dev/pckbport/alpsvar.h> #endif #include <dev/pckbport/pmsreg.h> #include <dev/pckbport/pmsvar.h> #include <dev/wscons/wsconsio.h> #include <dev/wscons/wsmousevar.h> #ifdef PMSDEBUG int pmsdebug = 1; #define DPRINTF(x) if (pmsdebug) printf x #else #define DPRINTF(x) #endif static const enum pms_type tries[] = { PMS_SCROLL5, PMS_SCROLL3, PMS_STANDARD, PMS_UNKNOWN }; static const struct pms_protocol pms_protocols[] = { { { 0, 0, 0 }, 0, "unknown protocol" }, { { 0, 0, 0 }, 0, "no scroll wheel (3 buttons)" }, { { 200, 100, 80 }, 3, "scroll wheel (3 buttons)" }, { { 200, 200, 80 }, 4, "scroll wheel (5 buttons)" }, { { 0, 0, 0 }, 0, "synaptics" }, { { 0, 0, 0 }, 0, "elantech" } }; static int pmsprobe(device_t, cfdata_t, void *); static void pmsattach(device_t, device_t, void *); static void pmsinput(void *, int); CFATTACH_DECL_NEW(pms, sizeof(struct pms_softc), pmsprobe, pmsattach, NULL, NULL); static int pms_protocol(pckbport_tag_t, pckbport_slot_t); static void do_enable(struct pms_softc *); static void do_disable(struct pms_softc *); static void pms_reset_thread(void*); static int pms_enable(void *); static int pms_ioctl(void *, u_long, void *, int, struct lwp *); static void pms_disable(void *); static bool pms_suspend(device_t, const pmf_qual_t *); static bool pms_resume(device_t, const pmf_qual_t *); static const struct wsmouse_accessops pms_accessops = { .enable = pms_enable, .ioctl = pms_ioctl, .disable = pms_disable, }; static int pms_protocol(pckbport_tag_t tag, pckbport_slot_t slot) { u_char cmd[2], resp[1]; int i, j, res; const struct pms_protocol *p; for (j = 0; j < sizeof(tries) / sizeof(tries[0]); ++j) { p = &pms_protocols[tries[j]]; if (!p->rates[0]) break; cmd[0] = PMS_SET_SAMPLE; for (i = 0; i < 3; i++) { cmd[1] = p->rates[i]; res = pckbport_enqueue_cmd(tag, slot, cmd, 2, 0, 1, 0); if (res) return PMS_STANDARD; } cmd[0] = PMS_SEND_DEV_ID; res = pckbport_enqueue_cmd(tag, slot, cmd, 1, 1, 1, resp); if (res) return PMS_UNKNOWN; if (resp[0] == p->response) { DPRINTF(("pms_protocol: found mouse protocol %d\n", tries[j])); return tries[j]; } } DPRINTF(("pms_protocol: standard PS/2 protocol (no scroll wheel)\n")); return PMS_STANDARD; } int pmsprobe(device_t parent, cfdata_t match, void *aux) { struct pckbport_attach_args *pa = aux; u_char cmd[1], resp[2]; int res; if (pa->pa_slot != PCKBPORT_AUX_SLOT) return 0; /* Flush any garbage. 
*/ pckbport_flush(pa->pa_tag, pa->pa_slot); /* reset the device */ cmd[0] = PMS_RESET; res = pckbport_poll_cmd(pa->pa_tag, pa->pa_slot, cmd, 1, 2, resp, 1); if (res) { aprint_debug("pmsprobe: reset error %d\n", res); return 0; } if (resp[0] != PMS_RSTDONE) { printf("pmsprobe: reset response 0x%x\n", resp[0]); return 0; } /* get type number (0 = mouse) */ if (resp[1] != 0) { aprint_debug("pmsprobe: type 0x%x\n", resp[1]); return 0; } return 10; } static void pmsattach(device_t parent, device_t self, void *aux) { struct pms_softc *sc = device_private(self); struct pckbport_attach_args *pa = aux; struct wsmousedev_attach_args a; u_char cmd[2], resp[2]; int res; sc->sc_dev = self; sc->sc_kbctag = pa->pa_tag; sc->sc_kbcslot = pa->pa_slot; aprint_naive("\n"); aprint_normal("\n"); /* Flush any garbage. */ pckbport_flush(pa->pa_tag, pa->pa_slot); /* reset the device */ cmd[0] = PMS_RESET; res = pckbport_poll_cmd(pa->pa_tag, pa->pa_slot, cmd, 1, 2, resp, 1); if (res || resp[0] != PMS_RSTDONE || resp[1] != 0) { aprint_debug("pmsattach: reset error\n"); return; } sc->inputstate = 0; sc->buttons = 0; sc->protocol = PMS_UNKNOWN; #ifdef PMS_SYNAPTICS_TOUCHPAD /* Probe for synaptics touchpad. */ if (pms_synaptics_probe_init(sc) == 0) { sc->protocol = PMS_SYNAPTICS; } else #endif #ifdef PMS_ELANTECH_TOUCHPAD if (pms_elantech_probe_init(sc) == 0) { sc->protocol = PMS_ELANTECH; } else #endif #ifdef PMS_ALPS_TOUCHPAD if (pms_alps_probe_init(sc) == 0) { sc->protocol = PMS_ALPS; } else #endif /* Install generic handler. */ pckbport_set_inputhandler(sc->sc_kbctag, sc->sc_kbcslot, pmsinput, sc, device_xname(sc->sc_dev)); a.accessops = &pms_accessops; a.accesscookie = sc; /* * Attach the wsmouse, saving a handle to it. * Note that we don't need to check this pointer against NULL * here or in pmsintr, because if this fails pms_enable() will * never be called, so pmsinput() will never be called. 
*/ sc->sc_wsmousedev = config_found(self, &a, wsmousedevprint, CFARGS_NONE); /* no interrupts until enabled */ cmd[0] = PMS_DEV_DISABLE; res = pckbport_poll_cmd(pa->pa_tag, pa->pa_slot, cmd, 1, 0, NULL, 0); if (res) aprint_error("pmsattach: disable error\n"); pckbport_slot_enable(sc->sc_kbctag, sc->sc_kbcslot, 0); kthread_create(PRI_NONE, 0, NULL, pms_reset_thread, sc, &sc->sc_event_thread, "%s", device_xname(sc->sc_dev)); if (!pmf_device_register(self, pms_suspend, pms_resume)) aprint_error_dev(self, "couldn't establish power handler\n"); } static void do_enable(struct pms_softc *sc) { u_char cmd[2]; int res; sc->inputstate = 0; sc->buttons = 0; pckbport_slot_enable(sc->sc_kbctag, sc->sc_kbcslot, 1); #ifdef PMS_SYNAPTICS_TOUCHPAD if (sc->protocol == PMS_SYNAPTICS) pms_synaptics_enable(sc); #endif #ifdef PMS_ELANTECH_TOUCHPAD if (sc->protocol == PMS_ELANTECH) pms_elantech_enable(sc); #endif #ifdef PMS_ALPS_TOUCHPAD if (sc->protocol == PMS_ALPS) pms_alps_enable(sc); #endif cmd[0] = PMS_DEV_ENABLE; res = pckbport_enqueue_cmd(sc->sc_kbctag, sc->sc_kbcslot, cmd, 1, 0, 1, 0); if (res) aprint_error("pms_enable: command error %d\n", res); if (sc->protocol == PMS_UNKNOWN) sc->protocol = pms_protocol(sc->sc_kbctag, sc->sc_kbcslot); DPRINTF(("pms_enable: using %s protocol\n", pms_protocols[sc->protocol].name)); #if 0 { u_char scmd[2]; scmd[0] = PMS_SET_RES; scmd[1] = 3; /* 8 counts/mm */ res = pckbport_enqueue_cmd(sc->sc_kbctag, sc->sc_kbcslot, scmd, 2, 0, 1, 0); if (res) printf("pms_enable: setup error1 (%d)\n", res); scmd[0] = PMS_SET_SCALE21; res = pckbport_enqueue_cmd(sc->sc_kbctag, sc->sc_kbcslot, scmd, 1, 0, 1, 0); if (res) printf("pms_enable: setup error2 (%d)\n", res); scmd[0] = PMS_SET_SAMPLE; scmd[1] = 100; /* 100 samples/sec */ res = pckbport_enqueue_cmd(sc->sc_kbctag, sc->sc_kbcslot, scmd, 2, 0, 1, 0); if (res) printf("pms_enable: setup error3 (%d)\n", res); } #endif } static void do_disable(struct pms_softc *sc) { u_char cmd[1]; int res; cmd[0] = PMS_DEV_DISABLE; res = pckbport_enqueue_cmd(sc->sc_kbctag, sc->sc_kbcslot, cmd, 1, 0, 1, 0); if (res) aprint_error("pms_disable: command error\n"); pckbport_slot_enable(sc->sc_kbctag, sc->sc_kbcslot, 0); } static int pms_enable(void *v) { struct pms_softc *sc = v; int s; if (sc->sc_enabled) return EBUSY; do_enable(sc); s = spltty(); sc->sc_enabled = 1; splx(s); return 0; } static void pms_disable(void *v) { struct pms_softc *sc = v; int s; do_disable(sc); s = spltty(); sc->sc_enabled = 0; splx(s); } static bool pms_suspend(device_t dv, const pmf_qual_t *qual) { struct pms_softc *sc = device_private(dv); if (sc->sc_enabled) do_disable(sc); return true; } static bool pms_resume(device_t dv, const pmf_qual_t *qual) { struct pms_softc *sc = device_private(dv); #ifdef PMS_SYNAPTICS_TOUCHPAD if (sc->protocol == PMS_SYNAPTICS) { pms_synaptics_resume(sc); if (sc->sc_enabled) { do_enable(sc); } } else #endif #ifdef PMS_ELANTECH_TOUCHPAD if (sc->protocol == PMS_ELANTECH) { pms_elantech_resume(sc); if (sc->sc_enabled) { do_enable(sc); } } else #endif #ifdef PMS_ALPS_TOUCHPAD if (sc->protocol == PMS_ALPS) { pms_alps_resume(sc); if (sc->sc_enabled) { do_enable(sc); } } else #endif if (sc->sc_enabled) { /* recheck protocol & init mouse */ sc->protocol = PMS_UNKNOWN; do_enable(sc); /* only if we were suspended */ } return true; } static int pms_ioctl(void *v, u_long cmd, void *data, int flag, struct lwp *l) { struct pms_softc *sc = v; u_char kbcmd[2]; int i; switch (cmd) { case WSMOUSEIO_GTYPE: *(u_int *)data = WSMOUSE_TYPE_PS2; break; case WSMOUSEIO_SRES: i 
= (*(u_int *)data - 12) / 25; if (i < 0) i = 0; if (i > 3) i = 3; kbcmd[0] = PMS_SET_RES; kbcmd[1] = i; i = pckbport_enqueue_cmd(sc->sc_kbctag, sc->sc_kbcslot, kbcmd, 2, 0, 1, 0); if (i) printf("pms_ioctl: SET_RES command error\n"); break; default: return EPASSTHROUGH; } return 0; } static void pms_reset_thread(void *arg) { struct pms_softc *sc = arg; u_char cmd[1], resp[2]; int res; int save_protocol; for (;;) { tsleep(&sc->sc_enabled, PWAIT, "pmsreset", 0); #ifdef PMSDEBUG if (pmsdebug) #endif #if defined(PMSDEBUG) || defined(DIAGNOSTIC) aprint_debug_dev(sc->sc_dev, "resetting mouse interface\n"); #endif save_protocol = sc->protocol; pms_disable(sc); cmd[0] = PMS_RESET; res = pckbport_enqueue_cmd(sc->sc_kbctag, sc->sc_kbcslot, cmd, 1, 2, 1, resp); if (res) { DPRINTF(("%s: reset error %d\n", device_xname(sc->sc_dev), res)); } /* For the synaptics and elantech case, leave the protocol alone. */ if (sc->protocol != PMS_SYNAPTICS && sc->protocol != PMS_ELANTECH && sc->protocol != PMS_ALPS) sc->protocol = PMS_UNKNOWN; pms_enable(sc); if (sc->protocol != save_protocol) { #if defined(PMSDEBUG) || defined(DIAGNOSTIC) aprint_verbose_dev(sc->sc_dev, "protocol change, sleeping and retrying\n"); #endif pms_disable(sc); cmd[0] = PMS_RESET; res = pckbport_enqueue_cmd(sc->sc_kbctag, sc->sc_kbcslot, cmd, 1, 2, 1, resp); if (res) { DPRINTF(("%s: reset error %d\n", device_xname(sc->sc_dev), res)); } tsleep(pms_reset_thread, PWAIT, "pmsreset", hz); cmd[0] = PMS_RESET; res = pckbport_enqueue_cmd(sc->sc_kbctag, sc->sc_kbcslot, cmd, 1, 2, 1, resp); if (res) { DPRINTF(("%s: reset error %d\n", device_xname(sc->sc_dev), res)); } sc->protocol = PMS_UNKNOWN; /* reprobe protocol */ pms_enable(sc); #if defined(PMSDEBUG) || defined(DIAGNOSTIC) if (sc->protocol != save_protocol) { printf("%s: protocol changed.\n", device_xname(sc->sc_dev)); } #endif } } } /* Masks for the first byte of a packet */ #define PMS_LBUTMASK 0x01 #define PMS_RBUTMASK 0x02 #define PMS_MBUTMASK 0x04 #define PMS_4BUTMASK 0x10 #define PMS_5BUTMASK 0x20 static void pmsinput(void *vsc, int data) { struct pms_softc *sc = vsc; u_int changed; int dx, dy, dz = 0; int newbuttons = 0; if (!sc->sc_enabled) { /* Interrupts are not expected. Discard the byte. */ return; } getmicrouptime(&sc->current); if (sc->inputstate > 0) { struct timeval diff; timersub(&sc->current, &sc->last, &diff); /* * Empirically, the delay should be about 1700us on a standard * PS/2 port. I have seen delays as large as 4500us (rarely) * in regular use. When using a confused mouse, I generally * see delays at least as large as 30,000us. -seebs * * The thinkpad trackball returns at 22-23ms. So we use * >= 40ms. In the future, I'll implement adaptable timeout * by increasing the timeout if the mouse reset happens * too frequently -christos */ if (diff.tv_sec > 0 || diff.tv_usec >= 40000) { DPRINTF(("pms_input: unusual delay (%ld.%06ld s), " "scheduling reset\n", (long)diff.tv_sec, (long)diff.tv_usec)); sc->inputstate = 0; sc->sc_enabled = 0; wakeup(&sc->sc_enabled); return; } } sc->last = sc->current; if (sc->inputstate == 0) { /* * Some devices (seen on trackballs anytime, and on * some mice shortly after reset) output garbage bytes * between packets. Just ignore them. 
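 *
 * (In the first byte of a well-formed packet, bits 6 and 7 are the X and
 * Y overflow flags and are normally clear, which is why a byte with
 * either bit set is discarded below rather than taken as the start of a
 * packet.)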
*/ if ((data & 0xc0) != 0) return; /* not in sync yet, discard input */ } if (sc->inputstate >= sizeof(sc->packet)) panic("inputstate should never be %d", sc->inputstate); sc->packet[sc->inputstate++] = data & 0xff; switch (sc->inputstate) { case 0: /* no useful processing can be done yet */ break; case 1: /* * Why should we test for bit 0x8 and insist on it here? * The old (psm.c and psm_intelli.c) drivers didn't do * it, and there are devices where it does harm (that's * why it is not used if using PMS_STANDARD protocol). * Anyway, it does not to cause any harm to accept packets * without this bit. */ #if 0 if (sc->protocol == PMS_STANDARD) break; if (!(sc->packet[0] & 0x8)) { DPRINTF(("pmsinput: 0x8 not set in first byte " "[0x%02x], resetting\n", sc->packet[0])); sc->inputstate = 0; sc->sc_enabled = 0; wakeup(&sc->sc_enabled); return; } #endif break; case 2: break; case 4: /* Case 4 is a superset of case 3. This is *not* an accident. */ if (sc->protocol == PMS_SCROLL3) { dz = sc->packet[3]; if (dz >= 128) dz -= 256; if (dz == -128) dz = -127; } else if (sc->protocol == PMS_SCROLL5) { dz = sc->packet[3] & 0xf; if (dz >= 8) dz -= 16; if (sc->packet[3] & PMS_4BUTMASK) newbuttons |= 0x8; if (sc->packet[3] & PMS_5BUTMASK) newbuttons |= 0x10; } else { DPRINTF(("pmsinput: why am I looking at this byte?\n")); dz = 0; } /* FALLTHROUGH */ case 3: /* * This is only an endpoint for scroll protocols with 4 * bytes, or the standard protocol with 3. */ if (sc->protocol != PMS_STANDARD && sc->inputstate == 3) break; newbuttons |= ((sc->packet[0] & PMS_LBUTMASK) ? 0x1 : 0) | ((sc->packet[0] & PMS_MBUTMASK) ? 0x2 : 0) | ((sc->packet[0] & PMS_RBUTMASK) ? 0x4 : 0); dx = sc->packet[1]; if (dx >= 128) dx -= 256; if (dx == -128) dx = -127; dy = sc->packet[2]; if (dy >= 128) dy -= 256; if (dy == -128) dy = -127; sc->inputstate = 0; changed = (sc->buttons ^ newbuttons); sc->buttons = newbuttons; #ifdef PMSDEBUG if (sc->protocol == PMS_STANDARD) { DPRINTF(("pms: packet: 0x%02x%02x%02x\n", sc->packet[0], sc->packet[1], sc->packet[2])); } else { DPRINTF(("pms: packet: 0x%02x%02x%02x%02x\n", sc->packet[0], sc->packet[1], sc->packet[2], sc->packet[3])); } #endif if (dx || dy || dz || changed) { #ifdef PMSDEBUG DPRINTF(("pms: x %+03d y %+03d z %+03d " "buttons 0x%02x\n", dx, dy, dz, sc->buttons)); #endif wsmouse_input(sc->sc_wsmousedev, sc->buttons, dx, dy, dz, 0, WSMOUSE_INPUT_DELTA); } memset(sc->packet, 0, 4); break; /* If we get here, we have problems. */ default: printf("pmsinput: very confused. resetting.\n"); sc->inputstate = 0; sc->sc_enabled = 0; wakeup(&sc->sc_enabled); return; } } /* * Touchpad special command sequence used by Synaptics and others. * Sends 0xE6 0xE8 rr 0xE8 ss 0xE8 tt 0xE8 uu where (rr*64)+(ss*16)+(tt*4)+uu */ int pms_sliced_command(pckbport_tag_t tag, pckbport_slot_t slot, u_char scmd) { u_char cmd[2]; int i, err, ret = 0; cmd[0] = PMS_SET_SCALE11; ret = pckbport_poll_cmd(tag, slot, cmd, 1, 0, NULL, 0); /* * Need to send 4 Set Resolution commands, with the argument * encoded in the bottom most 2 bits. */ for (i = 6; i >= 0; i -= 2) { cmd[0] = PMS_SET_RES; cmd[1] = (scmd >> i) & 3; err = pckbport_poll_cmd(tag, slot, cmd, 2, 0, NULL, 0); if (ret == 0 && err != 0) { ret = err; } } return ret; }
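/*
 * pms_sliced_command() above encodes the command byte two bits at a time,
 * most significant pair first, as the arguments of four Set Resolution
 * commands.  A stand-alone user-space illustration of that split (the
 * helper name is made up for the example):
 */
#include <stdio.h>

/* Break scmd into rr, ss, tt, uu with (rr*64)+(ss*16)+(tt*4)+uu == scmd. */
static void
split_sliced(unsigned char scmd, unsigned char out[4])
{
	int i, n;

	for (i = 6, n = 0; i >= 0; i -= 2, n++)
		out[n] = (scmd >> i) & 3;
}

int
main(void)
{
	unsigned char a[4];

	split_sliced(0xc8, a);		/* 0xc8 == 200 -> 3 0 2 0 */
	printf("%u %u %u %u = %u\n", a[0], a[1], a[2], a[3],
	    a[0] * 64 + a[1] * 16 + a[2] * 4 + a[3]);
	return 0;
}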
2 2 2 2 1 1 2 2 2 1 5 2 4 4 1 4 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 /* $NetBSD: process_machdep.c,v 1.50 2023/11/20 03:05:48 simonb Exp $ */ /* * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * This file may seem a bit stylized, but that so that it's easier to port. * Functions to be implemented here are: * * process_read_regs(proc, regs) * Get the current user-visible register set from the process * and copy it into the regs structure (<machine/reg.h>). * The process is stopped at the time read_regs is called. 
* * process_write_regs(proc, regs) * Update the current register set from the passed in regs * structure. Take care to avoid clobbering special CPU * registers or privileged bits in the PSL. * The process is stopped at the time write_regs is called. * * process_read_fpregs(proc, regs, sz) * Get the current user-visible register set from the process * and copy it into the regs structure (<machine/reg.h>). * The process is stopped at the time read_fpregs is called. * * process_write_fpregs(proc, regs, sz) * Update the current register set from the passed in regs * structure. Take care to avoid clobbering special CPU * registers or privileged bits in the PSL. * The process is stopped at the time write_fpregs is called. * * process_read_dbregs(proc, regs, sz) * Get the current user-visible register set from the process * and copy it into the regs structure (<machine/reg.h>). * The process is stopped at the time read_dbregs is called. * * process_write_dbregs(proc, regs, sz) * Update the current register set from the passed in regs * structure. Take care to avoid clobbering special CPU * registers or privileged bits in the PSL. * The process is stopped at the time write_dbregs is called. * * process_sstep(proc) * Arrange for the process to trap after executing a single instruction. * * process_set_pc(proc) * Set the process's program counter. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: process_machdep.c,v 1.50 2023/11/20 03:05:48 simonb Exp $"); #ifdef _KERNEL_OPT #include "opt_xen.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/ptrace.h> #include <sys/compat_stub.h> #include <uvm/uvm_extern.h> #include <compat/netbsd32/netbsd32.h> #include <machine/psl.h> #include <machine/reg.h> #include <machine/segments.h> #include <x86/dbregs.h> #include <x86/fpu.h> struct netbsd32_process_doxmmregs_hook_t netbsd32_process_doxmmregs_hook; static inline struct trapframe *process_frame(struct lwp *); static inline struct trapframe * process_frame(struct lwp *l) { return l->l_md.md_regs; } int process_read_regs(struct lwp *l, struct reg *regp) { struct trapframe *tf = process_frame(l); long *regs = regp->regs; const bool pk32 = (l->l_proc->p_flag & PK_32) != 0; regs[_REG_RDI] = tf->tf_rdi; regs[_REG_RSI] = tf->tf_rsi; regs[_REG_RDX] = tf->tf_rdx; regs[_REG_R10] = tf->tf_r10; regs[_REG_R8] = tf->tf_r8; regs[_REG_R9] = tf->tf_r9; /* argX not touched */ regs[_REG_RCX] = tf->tf_rcx; regs[_REG_R11] = tf->tf_r11; regs[_REG_R12] = tf->tf_r12; regs[_REG_R13] = tf->tf_r13; regs[_REG_R14] = tf->tf_r14; regs[_REG_R15] = tf->tf_r15; regs[_REG_RBP] = tf->tf_rbp; regs[_REG_RBX] = tf->tf_rbx; regs[_REG_RAX] = tf->tf_rax; if (pk32) { regs[_REG_GS] = tf->tf_gs & 0xffff; regs[_REG_FS] = tf->tf_fs & 0xffff; regs[_REG_ES] = tf->tf_es & 0xffff; regs[_REG_DS] = tf->tf_ds & 0xffff; regs[_REG_CS] = tf->tf_cs & 0xffff; regs[_REG_SS] = tf->tf_ss & 0xffff; } else { regs[_REG_GS] = 0; regs[_REG_FS] = 0; regs[_REG_ES] = GSEL(GUDATA_SEL, SEL_UPL); regs[_REG_DS] = GSEL(GUDATA_SEL, SEL_UPL); regs[_REG_CS] = LSEL(LUCODE_SEL, SEL_UPL); regs[_REG_SS] = LSEL(LUDATA_SEL, SEL_UPL); } regs[_REG_TRAPNO] = tf->tf_trapno; regs[_REG_ERR] = tf->tf_err; regs[_REG_RIP] = tf->tf_rip; regs[_REG_RFLAGS] = tf->tf_rflags; regs[_REG_RSP] = tf->tf_rsp; return 0; } int process_read_fpregs(struct lwp *l, struct fpreg *regs, size_t *sz) { process_read_fpregs_xmm(l, &regs->fxstate); return 0; } int process_read_dbregs(struct lwp *l, struct dbreg *regs, size_t *sz) { 
x86_dbregs_read(l, regs); return 0; } int process_write_regs(struct lwp *l, const struct reg *regp) { struct trapframe *tf = process_frame(l); int error; const long *regs = regp->regs; const bool pk32 = (l->l_proc->p_flag & PK_32) != 0; /* * Check for security violations. Note that struct regs is compatible * with the __gregs array in mcontext_t. */ if (pk32) { MODULE_HOOK_CALL(netbsd32_reg_validate_hook, (l, regp), EINVAL, error); } else { error = cpu_mcontext_validate(l, (const mcontext_t *)regs); } if (error != 0) return error; tf->tf_rdi = regs[_REG_RDI]; tf->tf_rsi = regs[_REG_RSI]; tf->tf_rdx = regs[_REG_RDX]; tf->tf_r10 = regs[_REG_R10]; tf->tf_r8 = regs[_REG_R8]; tf->tf_r9 = regs[_REG_R9]; /* argX not touched */ tf->tf_rcx = regs[_REG_RCX]; tf->tf_r11 = regs[_REG_R11]; tf->tf_r12 = regs[_REG_R12]; tf->tf_r13 = regs[_REG_R13]; tf->tf_r14 = regs[_REG_R14]; tf->tf_r15 = regs[_REG_R15]; tf->tf_rbp = regs[_REG_RBP]; tf->tf_rbx = regs[_REG_RBX]; tf->tf_rax = regs[_REG_RAX]; if (pk32) { tf->tf_gs = regs[_REG_GS] & 0xffff; tf->tf_fs = regs[_REG_FS] & 0xffff; tf->tf_es = regs[_REG_ES] & 0xffff; tf->tf_ds = regs[_REG_DS] & 0xffff; tf->tf_cs = regs[_REG_CS] & 0xffff; tf->tf_ss = regs[_REG_SS] & 0xffff; } else { tf->tf_gs = 0; tf->tf_fs = 0; tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL); tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL); } /* trapno, err not touched */ tf->tf_rip = regs[_REG_RIP]; tf->tf_rflags = regs[_REG_RFLAGS]; tf->tf_rsp = regs[_REG_RSP]; return 0; } int process_write_fpregs(struct lwp *l, const struct fpreg *regs, size_t sz) { process_write_fpregs_xmm(l, &regs->fxstate); return 0; } int process_write_dbregs(struct lwp *l, const struct dbreg *regs, size_t sz) { int error; /* * Check for security violations. */ error = x86_dbregs_validate(regs); if (error != 0) return error; x86_dbregs_write(l, regs); return 0; } int process_sstep(struct lwp *l, int sstep) { struct trapframe *tf = process_frame(l); if (sstep) tf->tf_rflags |= PSL_T; else tf->tf_rflags &= ~PSL_T; return 0; } int process_set_pc(struct lwp *l, void *addr) { struct trapframe *tf = process_frame(l); const bool pk32 = (l->l_proc->p_flag & PK_32) != 0; const uint64_t rip = (uint64_t)addr; if (rip >= (pk32 ? VM_MAXUSER_ADDRESS32 : VM_MAXUSER_ADDRESS)) return EINVAL; tf->tf_rip = rip; return 0; } #ifdef __HAVE_PTRACE_MACHDEP static int process_machdep_read_xstate(struct lwp *l, struct xstate *regs) { return process_read_xstate(l, regs); } static int process_machdep_write_xstate(struct lwp *l, const struct xstate *regs) { int error; /* * Check for security violations. */ error = process_verify_xstate(regs); if (error != 0) return error; return process_write_xstate(l, regs); } int ptrace_machdep_dorequest( struct lwp *l, struct lwp **lt, int req, void *addr, int data ) { struct uio uio; struct iovec iov; struct vmspace *vm; int error; bool write = false; switch (req) { case PT_SETXSTATE: write = true; /* FALLTHROUGH */ case PT_GETXSTATE: /* write = false done above. 
*/ if ((error = ptrace_update_lwp((*lt)->l_proc, lt, data)) != 0) return error; if (!process_machdep_validfpu((*lt)->l_proc)) return EINVAL; if (__predict_false(l->l_proc->p_flag & PK_32)) { struct netbsd32_iovec user_iov; if ((error = copyin(addr, &user_iov, sizeof(user_iov))) != 0) return error; iov.iov_base = NETBSD32PTR64(user_iov.iov_base); iov.iov_len = user_iov.iov_len; } else { struct iovec user_iov; if ((error = copyin(addr, &user_iov, sizeof(user_iov))) != 0) return error; iov.iov_base = user_iov.iov_base; iov.iov_len = user_iov.iov_len; } error = proc_vmspace_getref(l->l_proc, &vm); if (error) return error; if (iov.iov_len > sizeof(struct xstate)) iov.iov_len = sizeof(struct xstate); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = iov.iov_len; uio.uio_rw = write ? UIO_WRITE : UIO_READ; uio.uio_vmspace = vm; error = process_machdep_doxstate(l, *lt, &uio); uvmspace_free(vm); return error; case PT_SETXMMREGS: /* only for COMPAT_NETBSD32 */ write = true; /* FALLTHROUGH */ case PT_GETXMMREGS: /* only for COMPAT_NETBSD32 */ /* write = false done above. */ if ((error = ptrace_update_lwp((*lt)->l_proc, lt, data)) != 0) return error; MODULE_HOOK_CALL(netbsd32_process_doxmmregs_hook, (l, *lt, addr, write), EINVAL, error); return error; } #ifdef DIAGNOSTIC panic("ptrace_machdep: impossible"); #endif return 0; } /* * The following functions are used by both ptrace(2) and procfs. */ int process_machdep_doxstate(struct lwp *curl, struct lwp *l, struct uio *uio) /* curl: tracer */ /* l: traced */ { int error; struct xstate r; /* XXX FIXME big stack object */ char *kv; ssize_t kl; memset(&r, 0, sizeof(r)); kl = MIN(uio->uio_iov->iov_len, sizeof(r)); kv = (char *) &r; kv += uio->uio_offset; kl -= uio->uio_offset; if (kl > uio->uio_resid) kl = uio->uio_resid; if (kl < 0) error = EINVAL; else error = process_machdep_read_xstate(l, &r); if (error == 0) error = uiomove(kv, kl, uio); if (error == 0 && uio->uio_rw == UIO_WRITE) error = process_machdep_write_xstate(l, &r); uio->uio_offset = 0; return error; } int process_machdep_validfpu(struct proc *p) { if (p->p_flag & PK_SYSTEM) return 0; return 1; } #endif /* __HAVE_PTRACE_MACHDEP */
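/*
 * From the tracer's side, the PT_GETXSTATE path above expects addr to
 * point at a struct iovec describing the caller's buffer and data to
 * select the LWP; the kernel truncates iov_len to sizeof(struct xstate).
 * A minimal user-space sketch under those assumptions (error handling
 * omitted; it is assumed here that <sys/ptrace.h> exposes the
 * machine-dependent PT_GETXSTATE request and that data == 0 selects a
 * default LWP):
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/uio.h>

int
read_xstate(pid_t pid, void *buf, size_t len)
{
	struct iovec iov;

	iov.iov_base = buf;
	iov.iov_len = len;	/* capped in the kernel at sizeof(struct xstate) */

	return ptrace(PT_GETXSTATE, pid, &iov, 0);
}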
280 133 158 158 158 221 221 220 31 220 159 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 /* $NetBSD: kern_rwlock_obj.c,v 1.13 2023/10/02 21:03:55 ad Exp $ */ /*- * Copyright (c) 2008, 2009, 2019, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_rwlock_obj.c,v 1.13 2023/10/02 21:03:55 ad Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/kmem.h> #include <sys/rwlock.h> /* Mutex cache */ #define RW_OBJ_MAGIC 0x85d3c85d struct krwobj { krwlock_t ro_lock; u_int ro_magic; u_int ro_refcnt; uint8_t mo_pad[COHERENCY_UNIT - sizeof(krwlock_t) - sizeof(u_int) * 2]; }; /* * rw_obj_alloc: * * Allocate a single lock object, waiting for memory if needed. */ krwlock_t * rw_obj_alloc(void) { struct krwobj *ro; ro = kmem_intr_alloc(sizeof(*ro), KM_SLEEP); KASSERT(ALIGNED_POINTER(ro, coherency_unit)); _rw_init(&ro->ro_lock, (uintptr_t)__builtin_return_address(0)); ro->ro_magic = RW_OBJ_MAGIC; ro->ro_refcnt = 1; return (krwlock_t *)ro; } /* * rw_obj_tryalloc: * * Allocate a single lock object, but fail if no memory is available. */ krwlock_t * rw_obj_tryalloc(void) { struct krwobj *ro; ro = kmem_intr_alloc(sizeof(*ro), KM_NOSLEEP); KASSERT(ALIGNED_POINTER(ro, coherency_unit)); if (__predict_true(ro != NULL)) { _rw_init(&ro->ro_lock, (uintptr_t)__builtin_return_address(0)); ro->ro_magic = RW_OBJ_MAGIC; ro->ro_refcnt = 1; } return (krwlock_t *)ro; } /* * rw_obj_hold: * * Add a single reference to a lock object. A reference to the object * must already be held, and must be held across this call. 
*/ void rw_obj_hold(krwlock_t *lock) { struct krwobj *ro = (struct krwobj *)lock; KASSERT(ro->ro_magic == RW_OBJ_MAGIC); KASSERT(ro->ro_refcnt > 0); atomic_inc_uint(&ro->ro_refcnt); } /* * rw_obj_free: * * Drop a reference from a lock object. If the last reference is being * dropped, free the object and return true. Otherwise, return false. */ bool rw_obj_free(krwlock_t *lock) { struct krwobj *ro = (struct krwobj *)lock; KASSERT(ro->ro_magic == RW_OBJ_MAGIC); KASSERT(ro->ro_refcnt > 0); membar_release(); if (atomic_dec_uint_nv(&ro->ro_refcnt) > 0) { return false; } membar_acquire(); rw_destroy(&ro->ro_lock); kmem_intr_free(ro, sizeof(*ro)); return true; } /* * rw_obj_refcnt: * * Return the reference count for a lock object. */ u_int rw_obj_refcnt(krwlock_t *lock) { struct krwobj *ro = (struct krwobj *)lock; return ro->ro_refcnt; }
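/*
 * Intended life cycle of these reference-counted lock objects, as a sketch
 * in kernel context (the callers below are hypothetical, not part of this
 * file): the allocator holds the first reference, each additional long-term
 * user takes its own with rw_obj_hold(), and every holder eventually calls
 * rw_obj_free(); only the final call destroys the lock and frees the memory.
 */
#include <sys/rwlock.h>

static krwlock_t *shared_lock;

static void
obj_attach(void)
{

	shared_lock = rw_obj_alloc();	/* refcnt == 1, creator's reference */
	rw_obj_hold(shared_lock);	/* second long-term holder, refcnt == 2 */
}

static void
obj_detach(void)
{

	/* Returns true only when the last reference is dropped. */
	if (rw_obj_free(shared_lock))
		shared_lock = NULL;
}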
874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 /* $NetBSD: vfs_lockf.c,v 1.81 2023/09/23 18:21:11 ad Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Scooter Morris at Genentech Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_lockf.c 8.4 (Berkeley) 10/26/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_lockf.c,v 1.81 2023/09/23 18:21:11 ad Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/kmem.h> #include <sys/fcntl.h> #include <sys/lockf.h> #include <sys/atomic.h> #include <sys/kauth.h> #include <sys/uidinfo.h> /* * The lockf structure is a kernel structure which contains the information * associated with a byte range lock. The lockf structures are linked into * the vnode structure. Locks are sorted by the starting byte of the lock for * efficiency. * * lf_next is used for two purposes, depending on whether the lock is * being held, or is in conflict with an existing lock. If this lock * is held, it indicates the next lock on the same vnode. * For pending locks, if lock->lf_next is non-NULL, then lock->lf_block * must be queued on the lf_blkhd TAILQ of lock->lf_next. 
*/ TAILQ_HEAD(locklist, lockf); struct lockf { kcondvar_t lf_cv; /* Signalling */ short lf_flags; /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */ short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ off_t lf_start; /* The byte # of the start of the lock */ off_t lf_end; /* The byte # of the end of the lock (-1=EOF)*/ void *lf_id; /* process or file description holding lock */ struct lockf **lf_head; /* Back pointer to the head of lockf list */ struct lockf *lf_next; /* Next lock on this vnode, or blocking lock */ struct locklist lf_blkhd; /* List of requests blocked on this lock */ TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */ struct uidinfo *lf_uip; /* Cached pointer to uidinfo */ }; /* Maximum length of sleep chains to traverse to try and detect deadlock. */ #define MAXDEPTH 50 static kmutex_t lockf_lock __cacheline_aligned; static char lockstr[] = "lockf"; /* * This variable controls the maximum number of processes that will * be checked in doing deadlock detection. */ int maxlockdepth = MAXDEPTH; #ifdef LOCKF_DEBUG int lockf_debug = 0; #endif #define SELF 0x1 #define OTHERS 0x2 /* * XXX TODO * Misc cleanups: "void *id" should be visible in the API as a * "struct proc *". * (This requires rototilling all VFS's which support advisory locking). */ /* * If there's a lot of lock contention on a single vnode, locking * schemes which allow for more paralleism would be needed. Given how * infrequently byte-range locks are actually used in typical BSD * code, a more complex approach probably isn't worth it. */ /* * We enforce a limit on locks by uid, so that a single user cannot * run the kernel out of memory. For now, the limit is pretty coarse. * There is no limit on root. * * Splitting a lock will always succeed, regardless of current allocations. * If you're slightly above the limit, we still have to permit an allocation * so that the unlock can succeed. If the unlocking causes too many splits, * however, you're totally cutoff. */ #define MAXLOCKSPERUID (2 * maxfiles) #ifdef LOCKF_DEBUG /* * Print out a lock. */ static void lf_print(const char *tag, struct lockf *lock) { printf("%s: lock %p for ", tag, lock); if (lock->lf_flags & F_POSIX) printf("proc %d", ((struct proc *)lock->lf_id)->p_pid); else printf("file %p", (struct file *)lock->lf_id); printf(" %s, start %jd, end %jd", lock->lf_type == F_RDLCK ? "shared" : lock->lf_type == F_WRLCK ? "exclusive" : lock->lf_type == F_UNLCK ? "unlock" : "unknown", (intmax_t)lock->lf_start, (intmax_t)lock->lf_end); if (TAILQ_FIRST(&lock->lf_blkhd)) printf(" block %p\n", TAILQ_FIRST(&lock->lf_blkhd)); else printf("\n"); } static void lf_printlist(const char *tag, struct lockf *lock) { struct lockf *lf, *blk; printf("%s: Lock list:\n", tag); for (lf = *lock->lf_head; lf; lf = lf->lf_next) { printf("\tlock %p for ", lf); if (lf->lf_flags & F_POSIX) printf("proc %d", ((struct proc *)lf->lf_id)->p_pid); else printf("file %p", (struct file *)lf->lf_id); printf(", %s, start %jd, end %jd", lf->lf_type == F_RDLCK ? "shared" : lf->lf_type == F_WRLCK ? "exclusive" : lf->lf_type == F_UNLCK ? "unlock" : "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end); TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { if (blk->lf_flags & F_POSIX) printf("; proc %d", ((struct proc *)blk->lf_id)->p_pid); else printf("; file %p", (struct file *)blk->lf_id); printf(", %s, start %jd, end %jd", blk->lf_type == F_RDLCK ? "shared" : blk->lf_type == F_WRLCK ? "exclusive" : blk->lf_type == F_UNLCK ? 
"unlock" : "unknown", (intmax_t)blk->lf_start, (intmax_t)blk->lf_end); if (TAILQ_FIRST(&blk->lf_blkhd)) panic("lf_printlist: bad list"); } printf("\n"); } } #endif /* LOCKF_DEBUG */ /* * 3 options for allowfail. * 0 - always allocate. 1 - cutoff at limit. 2 - cutoff at double limit. */ static struct lockf * lf_alloc(int allowfail) { struct uidinfo *uip; struct lockf *lock; u_long lcnt; const uid_t uid = kauth_cred_geteuid(kauth_cred_get()); uip = uid_find(uid); lcnt = atomic_inc_ulong_nv(&uip->ui_lockcnt); if (uid && allowfail && lcnt > (allowfail == 1 ? MAXLOCKSPERUID : (MAXLOCKSPERUID * 2))) { atomic_dec_ulong(&uip->ui_lockcnt); return NULL; } lock = kmem_alloc(sizeof(*lock), KM_SLEEP); lock->lf_uip = uip; cv_init(&lock->lf_cv, lockstr); return lock; } static void lf_free(struct lockf *lock) { atomic_dec_ulong(&lock->lf_uip->ui_lockcnt); cv_destroy(&lock->lf_cv); kmem_free(lock, sizeof(*lock)); } /* * Walk the list of locks for an inode to * find an overlapping lock (if any). * * NOTE: this returns only the FIRST overlapping lock. There * may be more than one. */ static int lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, struct lockf ***prev, struct lockf **overlap) { off_t start, end; *overlap = lf; if (lf == NULL) return 0; #ifdef LOCKF_DEBUG if (lockf_debug & 2) lf_print("lf_findoverlap: looking for overlap in", lock); #endif /* LOCKF_DEBUG */ start = lock->lf_start; end = lock->lf_end; while (lf != NULL) { if (((type == SELF) && lf->lf_id != lock->lf_id) || ((type == OTHERS) && lf->lf_id == lock->lf_id)) { *prev = &lf->lf_next; *overlap = lf = lf->lf_next; continue; } #ifdef LOCKF_DEBUG if (lockf_debug & 2) lf_print("\tchecking", lf); #endif /* LOCKF_DEBUG */ /* * OK, check for overlap * * Six cases: * 0) no overlap * 1) overlap == lock * 2) overlap contains lock * 3) lock contains overlap * 4) overlap starts before lock * 5) overlap ends after lock */ if ((lf->lf_end != -1 && start > lf->lf_end) || (end != -1 && lf->lf_start > end)) { /* Case 0 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("no overlap\n"); #endif /* LOCKF_DEBUG */ if ((type & SELF) && end != -1 && lf->lf_start > end) return 0; *prev = &lf->lf_next; *overlap = lf = lf->lf_next; continue; } if ((lf->lf_start == start) && (lf->lf_end == end)) { /* Case 1 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("overlap == lock\n"); #endif /* LOCKF_DEBUG */ return 1; } if ((lf->lf_start <= start) && (end != -1) && ((lf->lf_end >= end) || (lf->lf_end == -1))) { /* Case 2 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("overlap contains lock\n"); #endif /* LOCKF_DEBUG */ return 2; } if (start <= lf->lf_start && (end == -1 || (lf->lf_end != -1 && end >= lf->lf_end))) { /* Case 3 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("lock contains overlap\n"); #endif /* LOCKF_DEBUG */ return 3; } if ((lf->lf_start < start) && ((lf->lf_end >= start) || (lf->lf_end == -1))) { /* Case 4 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("overlap starts before lock\n"); #endif /* LOCKF_DEBUG */ return 4; } if ((lf->lf_start > start) && (end != -1) && ((lf->lf_end > end) || (lf->lf_end == -1))) { /* Case 5 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("overlap ends after lock\n"); #endif /* LOCKF_DEBUG */ return 5; } panic("lf_findoverlap: default"); } return 0; } /* * Split a lock and a contained region into * two or three locks as necessary. 
*/ static void lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **sparelock) { struct lockf *splitlock; #ifdef LOCKF_DEBUG if (lockf_debug & 2) { lf_print("lf_split", lock1); lf_print("splitting from", lock2); } #endif /* LOCKF_DEBUG */ /* * Check to see if splitting into only two pieces. */ if (lock1->lf_start == lock2->lf_start) { lock1->lf_start = lock2->lf_end + 1; lock2->lf_next = lock1; return; } if (lock1->lf_end == lock2->lf_end) { lock1->lf_end = lock2->lf_start - 1; lock2->lf_next = lock1->lf_next; lock1->lf_next = lock2; return; } /* * Make a new lock consisting of the last part of * the encompassing lock */ splitlock = *sparelock; *sparelock = NULL; cv_destroy(&splitlock->lf_cv); memcpy(splitlock, lock1, sizeof(*splitlock)); cv_init(&splitlock->lf_cv, lockstr); splitlock->lf_start = lock2->lf_end + 1; TAILQ_INIT(&splitlock->lf_blkhd); lock1->lf_end = lock2->lf_start - 1; /* * OK, now link it in */ splitlock->lf_next = lock1->lf_next; lock2->lf_next = splitlock; lock1->lf_next = lock2; } /* * Wakeup a blocklist */ static void lf_wakelock(struct lockf *listhead) { struct lockf *wakelock; while ((wakelock = TAILQ_FIRST(&listhead->lf_blkhd))) { KASSERT(wakelock->lf_next == listhead); TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); wakelock->lf_next = NULL; #ifdef LOCKF_DEBUG if (lockf_debug & 2) lf_print("lf_wakelock: awakening", wakelock); #endif cv_broadcast(&wakelock->lf_cv); } } /* * Remove a byte-range lock on an inode. * * Generally, find the lock (or an overlap to that lock) * and remove it (or shrink it), then wakeup anyone we can. */ static int lf_clearlock(struct lockf *unlock, struct lockf **sparelock) { struct lockf **head = unlock->lf_head; struct lockf *lf = *head; struct lockf *overlap, **prev; int ovcase; if (lf == NULL) return 0; #ifdef LOCKF_DEBUG if (unlock->lf_type != F_UNLCK) panic("lf_clearlock: bad type"); if (lockf_debug & 1) lf_print("lf_clearlock", unlock); #endif /* LOCKF_DEBUG */ prev = head; while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != 0) { /* * Wakeup the list of locks to be retried. */ lf_wakelock(overlap); switch (ovcase) { case 1: /* overlap == lock */ *prev = overlap->lf_next; lf_free(overlap); break; case 2: /* overlap contains lock: split it */ if (overlap->lf_start == unlock->lf_start) { overlap->lf_start = unlock->lf_end + 1; break; } lf_split(overlap, unlock, sparelock); overlap->lf_next = unlock->lf_next; break; case 3: /* lock contains overlap */ *prev = overlap->lf_next; lf = overlap->lf_next; lf_free(overlap); continue; case 4: /* overlap starts before lock */ overlap->lf_end = unlock->lf_start - 1; prev = &overlap->lf_next; lf = overlap->lf_next; continue; case 5: /* overlap ends after lock */ overlap->lf_start = unlock->lf_end + 1; break; } break; } #ifdef LOCKF_DEBUG if (lockf_debug & 1) lf_printlist("lf_clearlock", unlock); #endif /* LOCKF_DEBUG */ return 0; } /* * Walk the list of locks for an inode and * return the first blocking lock. */ static struct lockf * lf_getblock(struct lockf *lock) { struct lockf **prev, *overlap, *lf = *(lock->lf_head); prev = lock->lf_head; while (lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != 0) { /* * We've found an overlap, see if it blocks us */ if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) return overlap; /* * Nope, point to the next one on the list and * see if it blocks us */ lf = overlap->lf_next; } return NULL; } /* * Set a byte-range lock. 
*/ static int lf_setlock(struct lockf *lock, struct lockf **sparelock, kmutex_t *interlock) { struct lockf *block; struct lockf **head = lock->lf_head; struct lockf **prev, *overlap, *ltmp; int ovcase, needtolink, error; #ifdef LOCKF_DEBUG if (lockf_debug & 1) lf_print("lf_setlock", lock); #endif /* LOCKF_DEBUG */ /* * Scan lock list for this file looking for locks that would block us. */ while ((block = lf_getblock(lock)) != NULL) { /* * Free the structure and return if nonblocking. */ if ((lock->lf_flags & F_WAIT) == 0) { lf_free(lock); return EAGAIN; } /* * We are blocked. Since flock style locks cover * the whole file, there is no chance for deadlock. * For byte-range locks we must check for deadlock. * * Deadlock detection is done by looking through the * wait channels to see if there are any cycles that * involve us. MAXDEPTH is set just to make sure we * do not go off into neverneverland. */ if ((lock->lf_flags & F_POSIX) && (block->lf_flags & F_POSIX)) { struct lwp *wlwp; volatile const struct lockf *waitblock; int i = 0; struct proc *p; p = (struct proc *)block->lf_id; KASSERT(p != NULL); while (i++ < maxlockdepth) { mutex_enter(p->p_lock); if (p->p_nlwps > 1) { mutex_exit(p->p_lock); break; } wlwp = LIST_FIRST(&p->p_lwps); lwp_lock(wlwp); if (wlwp->l_wchan == NULL || wlwp->l_wmesg != lockstr) { lwp_unlock(wlwp); mutex_exit(p->p_lock); break; } waitblock = wlwp->l_wchan; lwp_unlock(wlwp); mutex_exit(p->p_lock); /* Get the owner of the blocking lock */ waitblock = waitblock->lf_next; if ((waitblock->lf_flags & F_POSIX) == 0) break; p = (struct proc *)waitblock->lf_id; if (p == curproc) { lf_free(lock); return EDEADLK; } } /* * If we're still following a dependency chain * after maxlockdepth iterations, assume we're in * a cycle to be safe. */ if (i >= maxlockdepth) { lf_free(lock); return EDEADLK; } } /* * For flock type locks, we must first remove * any shared locks that we hold before we sleep * waiting for an exclusive lock. */ if ((lock->lf_flags & F_FLOCK) && lock->lf_type == F_WRLCK) { lock->lf_type = F_UNLCK; (void) lf_clearlock(lock, NULL); lock->lf_type = F_WRLCK; } /* * Add our lock to the blocked list and sleep until we're free. * Remember who blocked us (for deadlock detection). */ lock->lf_next = block; TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); #ifdef LOCKF_DEBUG if (lockf_debug & 1) { lf_print("lf_setlock: blocking on", block); lf_printlist("lf_setlock", block); } #endif /* LOCKF_DEBUG */ error = cv_wait_sig(&lock->lf_cv, interlock); /* * We may have been awoken by a signal (in * which case we must remove ourselves from the * blocked list) and/or by another process * releasing a lock (in which case we have already * been removed from the blocked list and our * lf_next field set to NULL). */ if (lock->lf_next != NULL) { TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block); lock->lf_next = NULL; } if (error) { lf_free(lock); return error; } } /* * No blocks!! Add the lock. Note that we will * downgrade or upgrade any overlapping locks this * process already owns. * * Skip over locks owned by other processes. * Handle any locks that overlap and are owned by ourselves. 
*/ prev = head; block = *head; needtolink = 1; for (;;) { ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap); if (ovcase) block = overlap->lf_next; /* * Six cases: * 0) no overlap * 1) overlap == lock * 2) overlap contains lock * 3) lock contains overlap * 4) overlap starts before lock * 5) overlap ends after lock */ switch (ovcase) { case 0: /* no overlap */ if (needtolink) { *prev = lock; lock->lf_next = overlap; } break; case 1: /* overlap == lock */ /* * If downgrading lock, others may be * able to acquire it. */ if (lock->lf_type == F_RDLCK && overlap->lf_type == F_WRLCK) lf_wakelock(overlap); overlap->lf_type = lock->lf_type; lf_free(lock); lock = overlap; /* for debug output below */ break; case 2: /* overlap contains lock */ /* * Check for common starting point and different types. */ if (overlap->lf_type == lock->lf_type) { lf_free(lock); lock = overlap; /* for debug output below */ break; } if (overlap->lf_start == lock->lf_start) { *prev = lock; lock->lf_next = overlap; overlap->lf_start = lock->lf_end + 1; } else lf_split(overlap, lock, sparelock); lf_wakelock(overlap); break; case 3: /* lock contains overlap */ /* * If downgrading lock, others may be able to * acquire it, otherwise take the list. */ if (lock->lf_type == F_RDLCK && overlap->lf_type == F_WRLCK) { lf_wakelock(overlap); } else { while ((ltmp = TAILQ_FIRST(&overlap->lf_blkhd))) { KASSERT(ltmp->lf_next == overlap); TAILQ_REMOVE(&overlap->lf_blkhd, ltmp, lf_block); ltmp->lf_next = lock; TAILQ_INSERT_TAIL(&lock->lf_blkhd, ltmp, lf_block); } } /* * Add the new lock if necessary and delete the overlap. */ if (needtolink) { *prev = lock; lock->lf_next = overlap->lf_next; prev = &lock->lf_next; needtolink = 0; } else *prev = overlap->lf_next; lf_free(overlap); continue; case 4: /* overlap starts before lock */ /* * Add lock after overlap on the list. */ lock->lf_next = overlap->lf_next; overlap->lf_next = lock; overlap->lf_end = lock->lf_start - 1; prev = &lock->lf_next; lf_wakelock(overlap); needtolink = 0; continue; case 5: /* overlap ends after lock */ /* * Add the new lock before overlap. */ if (needtolink) { *prev = lock; lock->lf_next = overlap; } overlap->lf_start = lock->lf_end + 1; lf_wakelock(overlap); break; } break; } #ifdef LOCKF_DEBUG if (lockf_debug & 1) { lf_print("lf_setlock: got the lock", lock); lf_printlist("lf_setlock", lock); } #endif /* LOCKF_DEBUG */ return 0; } /* * Check whether there is a blocking lock, * and if so return its process identifier. */ static int lf_getlock(struct lockf *lock, struct flock *fl) { struct lockf *block; #ifdef LOCKF_DEBUG if (lockf_debug & 1) lf_print("lf_getlock", lock); #endif /* LOCKF_DEBUG */ if ((block = lf_getblock(lock)) != NULL) { fl->l_type = block->lf_type; fl->l_whence = SEEK_SET; fl->l_start = block->lf_start; if (block->lf_end == -1) fl->l_len = 0; else fl->l_len = block->lf_end - block->lf_start + 1; if (block->lf_flags & F_POSIX) fl->l_pid = ((struct proc *)block->lf_id)->p_pid; else fl->l_pid = -1; } else { fl->l_type = F_UNLCK; } return 0; } /* * Do an advisory lock operation. */ int lf_advlock(struct vop_advlock_args *ap, struct lockf **head, off_t size) { struct flock *fl = ap->a_fl; struct lockf *lock = NULL; struct lockf *sparelock; kmutex_t *interlock = &lockf_lock; off_t start, end; int error = 0; KASSERTMSG(size >= 0, "size=%jd", (intmax_t)size); /* * Convert the flock structure into a start and end. 
*/ switch (fl->l_whence) { case SEEK_SET: case SEEK_CUR: /* * Caller is responsible for adding any necessary offset * when SEEK_CUR is used. */ start = fl->l_start; break; case SEEK_END: if (fl->l_start > __type_max(off_t) - size) return EINVAL; start = size + fl->l_start; break; default: return EINVAL; } if (fl->l_len == 0) end = -1; else { if (fl->l_len >= 0) { if (start >= 0 && fl->l_len - 1 > __type_max(off_t) - start) return EINVAL; end = start + (fl->l_len - 1); } else { /* lockf() allows -ve lengths */ if (start < 0) return EINVAL; end = start - 1; start += fl->l_len; } } if (start < 0) return EINVAL; /* * Allocate locks before acquiring the interlock. We need two * locks in the worst case. */ switch (ap->a_op) { case F_SETLK: case F_UNLCK: /* * XXX For F_UNLCK case, we can re-use the lock. */ if ((ap->a_flags & F_FLOCK) == 0) { /* * Byte-range lock might need one more lock. */ sparelock = lf_alloc(0); if (sparelock == NULL) { error = ENOMEM; goto quit; } break; } /* FALLTHROUGH */ case F_GETLK: sparelock = NULL; break; default: return EINVAL; } switch (ap->a_op) { case F_SETLK: lock = lf_alloc(1); break; case F_UNLCK: if (start == 0 || end == -1) { /* never split */ lock = lf_alloc(0); } else { /* might split */ lock = lf_alloc(2); } break; case F_GETLK: lock = lf_alloc(0); break; } if (lock == NULL) { error = ENOMEM; goto quit; } mutex_enter(interlock); /* * Avoid the common case of unlocking when inode has no locks. */ if (*head == (struct lockf *)0) { if (ap->a_op != F_SETLK) { fl->l_type = F_UNLCK; error = 0; goto quit_unlock; } } /* * Create the lockf structure. */ lock->lf_start = start; lock->lf_end = end; lock->lf_head = head; lock->lf_type = fl->l_type; lock->lf_next = (struct lockf *)0; TAILQ_INIT(&lock->lf_blkhd); lock->lf_flags = ap->a_flags; if (lock->lf_flags & F_POSIX) { KASSERT(curproc == (struct proc *)ap->a_id); } lock->lf_id = ap->a_id; /* * Do the requested operation. */ switch (ap->a_op) { case F_SETLK: error = lf_setlock(lock, &sparelock, interlock); lock = NULL; /* lf_setlock freed it */ break; case F_UNLCK: error = lf_clearlock(lock, &sparelock); break; case F_GETLK: error = lf_getlock(lock, fl); break; default: break; /* NOTREACHED */ } quit_unlock: mutex_exit(interlock); quit: if (lock) lf_free(lock); if (sparelock) lf_free(sparelock); return error; } /* * Initialize subsystem. XXX We use a global lock. This could be the * vnode interlock, but the deadlock detection code may need to inspect * locks belonging to other files. */ void lf_init(void) { mutex_init(&lockf_lock, MUTEX_DEFAULT, IPL_NONE); }
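/*
 * Illustrative userland sketch (not part of vfs_lockf.c): the lf_advlock()
 * entry point above is what ultimately services fcntl(2) byte-range locks.
 * The program below takes an exclusive lock on bytes 0..99 of a scratch
 * file (F_SETLKW blocks in the lf_setlock() path until any conflicting
 * lock is released) and then unlocks the range, which runs lf_clearlock().
 * The file name "demo.dat" is only an example.
 */
#include <sys/types.h>

#include <err.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct flock fl;
	int fd;

	if ((fd = open("demo.dat", O_RDWR | O_CREAT, 0644)) == -1)
		err(EXIT_FAILURE, "open");

	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;		/* exclusive lock */
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;			/* lock bytes 0..99 */
	fl.l_len = 100;

	/* Blocks until the range is free; serviced by lf_setlock(). */
	if (fcntl(fd, F_SETLKW, &fl) == -1)
		err(EXIT_FAILURE, "fcntl(F_SETLKW)");

	/* ... operate on the locked range ... */

	fl.l_type = F_UNLCK;		/* release; serviced by lf_clearlock() */
	if (fcntl(fd, F_SETLK, &fl) == -1)
		err(EXIT_FAILURE, "fcntl(F_UNLCK)");

	close(fd);
	return EXIT_SUCCESS;
}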
/*	$NetBSD: secmodel_extensions_vfs.c,v 1.1 2023/04/22 13:54:19 riastradh Exp $	*/

/*-
 * Copyright (c) 2011 Elad Efrat <elad@NetBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: secmodel_extensions_vfs.c,v 1.1 2023/04/22 13:54:19 riastradh Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/kauth.h> #include <sys/vnode.h> #include <secmodel/secmodel.h> #include <secmodel/extensions/extensions.h> #include <secmodel/extensions/extensions_impl.h> static int dovfsusermount; static int hardlink_check_uid; static int hardlink_check_gid; static kauth_listener_t l_system, l_vnode; static int secmodel_extensions_system_cb(kauth_cred_t, kauth_action_t, void *, void *, void *, void *, void *); static int secmodel_extensions_vnode_cb(kauth_cred_t, kauth_action_t, void *, void *, void *, void *, void *); void secmodel_extensions_vfs_start(void) { l_system = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, secmodel_extensions_system_cb, NULL); l_vnode = kauth_listen_scope(KAUTH_SCOPE_VNODE, secmodel_extensions_vnode_cb, NULL); } void secmodel_extensions_vfs_stop(void) { kauth_unlisten_scope(l_system); kauth_unlisten_scope(l_vnode); } void secmodel_extensions_vfs_sysctl(struct sysctllog **clog, const struct sysctlnode *rnode) { sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "usermount", SYSCTL_DESCR("Whether unprivileged users may mount " "filesystems"), sysctl_extensions_user_handler, 0, &dovfsusermount, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "hardlink_check_uid", SYSCTL_DESCR("Whether unprivileged users can hardlink "\ "to files they don't own"), sysctl_extensions_user_handler, 0, &hardlink_check_uid, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "hardlink_check_gid", SYSCTL_DESCR("Whether unprivileged users can hardlink "\ "to files that are not in their " \ "group membership"), sysctl_extensions_user_handler, 0, &hardlink_check_gid, 0, CTL_CREATE, CTL_EOL); /* Compatibility: vfs.generic.usermount */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "generic", SYSCTL_DESCR("Non-specific vfs related information"), NULL, 0, NULL, 0, CTL_VFS, VFS_GENERIC, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "usermount", SYSCTL_DESCR("Whether unprivileged users may mount " "filesystems"), sysctl_extensions_user_handler, 0, &dovfsusermount, 0, CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL); } static int secmodel_extensions_system_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { vnode_t *vp; struct vattr va; struct mount *mp; u_long flags; int result; enum kauth_system_req req; int error; req = (enum kauth_system_req)(uintptr_t)arg0; result = KAUTH_RESULT_DEFER; switch (action) { case KAUTH_SYSTEM_MOUNT: if (dovfsusermount == 0) break; switch (req) { case KAUTH_REQ_SYSTEM_MOUNT_NEW: vp = (vnode_t *)arg1; mp = vp->v_mount; flags = (u_long)arg2; /* * Ensure that the user owns the directory onto which * the mount is attempted. */ vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, cred); VOP_UNLOCK(vp); if (error) break; if (va.va_uid != kauth_cred_geteuid(cred)) break; error = usermount_common_policy(mp, flags); if (error) break; result = KAUTH_RESULT_ALLOW; break; case KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT: mp = arg1; /* Must own the mount. 
*/ if (mp->mnt_stat.f_owner == kauth_cred_geteuid(cred)) result = KAUTH_RESULT_ALLOW; break; case KAUTH_REQ_SYSTEM_MOUNT_UPDATE: mp = arg1; flags = (u_long)arg2; /* Must own the mount. */ if (mp->mnt_stat.f_owner == kauth_cred_geteuid(cred) && usermount_common_policy(mp, flags) == 0) result = KAUTH_RESULT_ALLOW; break; default: break; } break; default: break; } return (result); } static int secmodel_extensions_vnode_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int error; bool isroot; struct vattr va; if ((action & KAUTH_VNODE_ADD_LINK) == 0) return KAUTH_RESULT_DEFER; error = VOP_GETATTR((vnode_t *)arg0, &va, cred); if (error) goto checkroot; if (hardlink_check_uid && kauth_cred_geteuid(cred) != va.va_uid) goto checkroot; if (hardlink_check_gid && kauth_cred_groupmember(cred, va.va_gid) != 0) goto checkroot; return KAUTH_RESULT_DEFER; checkroot: error = secmodel_eval("org.netbsd.secmodel.suser", "is-root", cred, &isroot); if (error || !isroot) return KAUTH_RESULT_DENY; return KAUTH_RESULT_DEFER; }
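/*
 * Illustrative userland sketch (not part of secmodel_extensions_vfs.c):
 * the compatibility node vfs.generic.usermount created above can be read
 * from userland with sysctlbyname(3), for example to check whether
 * unprivileged mounts are currently allowed.
 */
#include <sys/param.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int usermount;
	size_t len = sizeof(usermount);

	if (sysctlbyname("vfs.generic.usermount", &usermount, &len,
	    NULL, 0) == -1)
		err(EXIT_FAILURE, "sysctlbyname");
	printf("vfs.generic.usermount = %d\n", usermount);
	return EXIT_SUCCESS;
}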
/*	$NetBSD: rtsock_70.c,v 1.8 2019/12/12 02:15:42 pgoyette Exp $	*/

/*
 * Copyright (c) 2016 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Roy Marples.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtsock_70.c,v 1.8 2019/12/12 02:15:42 pgoyette Exp $");

#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif

#include <sys/mbuf.h>
#include <sys/compat_stub.h>

#include <net/if.h>
#include <net/route.h>

#include <compat/net/if.h>
#include <compat/net/route.h>
#include <compat/net/route_70.h>

void
compat_70_rt_newaddrmsg1(int cmd, struct ifaddr *ifa)
{
	struct rt_addrinfo info;
	const struct sockaddr *sa;
	struct mbuf *m;
	struct ifnet *ifp;
	struct ifa_msghdr70 ifam;
	int ncmd;

	KASSERT(ifa != NULL);
	ifp = ifa->ifa_ifp;

	switch (cmd) {
	case RTM_NEWADDR:
		ncmd = RTM_ONEWADDR;
		break;
	case RTM_DELADDR:
		ncmd = RTM_ODELADDR;
		break;
	case RTM_CHGADDR:
		ncmd = RTM_OCHGADDR;
		break;
	default:
		panic("%s: called with wrong command", __func__);
	}

	memset(&info, 0, sizeof(info));
	info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
	KASSERT(ifp->if_dl != NULL);
	info.rti_info[RTAX_IFP] = ifp->if_dl->ifa_addr;
	info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;

	memset(&ifam, 0, sizeof(ifam));
	ifam.ifam_index = ifp->if_index;
	ifam.ifam_metric = ifa->ifa_metric;
	ifam.ifam_flags = ifa->ifa_flags;

	m = rt_msg1(ncmd, &info, &ifam, sizeof(ifam));
	if (m == NULL)
		return;

	mtod(m, struct ifa_msghdr70 *)->ifam_addrs = info.rti_addrs;
	route_enqueue(m, sa ?
sa->sa_family : 0); } int compat_70_iflist_addr(struct rt_walkarg *w, struct ifaddr *ifa, struct rt_addrinfo *info) { int len, error; if ((error = rt_msg3(RTM_ONEWADDR, info, 0, w, &len))) return error; if (w->w_where && w->w_tmem && w->w_needed <= 0) { struct ifa_msghdr70 *ifam; ifam = (struct ifa_msghdr70 *)w->w_tmem; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_metric = ifa->ifa_metric; ifam->ifam_addrs = info->rti_addrs; if ((error = copyout(w->w_tmem, w->w_where, len)) == 0) w->w_where = (char *)w->w_where + len; } return error; } void rtsock_70_init(void) { MODULE_HOOK_SET(rtsock_newaddr_70_hook, compat_70_rt_newaddrmsg1); MODULE_HOOK_SET(rtsock_iflist_70_hook, compat_70_iflist_addr); } void rtsock_70_fini(void) { MODULE_HOOK_UNSET(rtsock_newaddr_70_hook); MODULE_HOOK_UNSET(rtsock_iflist_70_hook); }
/*	$NetBSD: sysv_sem_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $	*/

/*-
 * Copyright (c) 1999 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sysv_sem_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $");

#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/proc.h>
#include <sys/sem.h>

#ifndef SYSVSEM
#define SYSVSEM
#endif

#include <sys/syscallargs.h>

#include <compat/sys/sem.h>

int
compat_50_sys_____semctl13(struct lwp *l,
    const struct compat_50_sys_____semctl13_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) semid;
		syscallarg(int) semnum;
		syscallarg(int) cmd;
		syscallarg(union __semun *) arg;
	} */
	union __semun arg;
	struct semid_ds sembuf;
	struct semid_ds13 osembuf;
	int cmd, error;
	void *pass_arg;

	cmd = SCARG(uap, cmd);

	pass_arg = get_semctl_arg(cmd, &sembuf, &arg);

	if (pass_arg != NULL) {
		error = copyin(SCARG(uap, arg), &arg, sizeof(arg));
		if (error)
			return (error);
		if (cmd == IPC_SET) {
			error = copyin(arg.buf, &osembuf, sizeof(osembuf));
			if (error)
				return (error);
			__semid_ds13_to_native(&osembuf, &sembuf);
		}
	}

	error = semctl1(l, SCARG(uap, semid), SCARG(uap, semnum), cmd,
	    pass_arg, retval);

	if (error == 0 && cmd == IPC_STAT) {
		__native_to_semid_ds13(&sembuf, &osembuf);
		error = copyout(&osembuf, arg.buf, sizeof(osembuf));
	}

	return (error);
}
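/*
 * Illustrative userland sketch (not part of sysv_sem_50.c): the compat
 * syscall above sits behind the ordinary semctl(2)/semop(2) interface.
 * This example creates a one-semaphore set, bumps it with semop(2), reads
 * the value back with GETVAL, and removes the set again.
 */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct sembuf sop;
	int semid, val;

	if ((semid = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600)) == -1)
		err(EXIT_FAILURE, "semget");

	sop.sem_num = 0;
	sop.sem_op = 1;			/* increment semaphore 0 */
	sop.sem_flg = 0;
	if (semop(semid, &sop, 1) == -1)
		err(EXIT_FAILURE, "semop");

	if ((val = semctl(semid, 0, GETVAL)) == -1)
		err(EXIT_FAILURE, "semctl(GETVAL)");
	printf("semaphore value: %d\n", val);

	if (semctl(semid, 0, IPC_RMID) == -1)
		err(EXIT_FAILURE, "semctl(IPC_RMID)");
	return EXIT_SUCCESS;
}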
/*	$NetBSD: if_43.c,v 1.27 2023/03/30 17:48:10 riastradh Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1989, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: if_43.c,v 1.27 2023/03/30 17:48:10 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/syslog.h> #include <sys/unistd.h> #include <sys/resourcevar.h> #include <sys/mbuf.h> /* for MLEN */ #include <sys/protosw.h> #include <sys/compat_stub.h> #include <sys/syscallargs.h> #include <net/if.h> #include <net/bpf.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <net/if_gre.h> #include <net/if_tap.h> #include <net80211/ieee80211_ioctl.h> #include <netinet6/in6_var.h> #include <netinet6/nd6.h> #include <compat/net/if.h> #include <compat/sys/socket.h> #include <compat/sys/sockio.h> #include <compat/common/compat_util.h> #include <compat/common/compat_mod.h> #include <uvm/uvm_extern.h> #if defined(COMPAT_43) /* * Use a wrapper so that the compat_cvtcmd() can return a u_long */ static int do_compat_cvtcmd(u_long *ncmd, u_long ocmd) { *ncmd = compat_cvtcmd(ocmd); return 0; } u_long compat_cvtcmd(u_long cmd) { u_long ncmd; if (IOCPARM_LEN(cmd) != sizeof(struct oifreq)) return cmd; switch (cmd) { case OSIOCSIFADDR: return SIOCSIFADDR; case OOSIOCGIFADDR: return SIOCGIFADDR; case OSIOCSIFDSTADDR: return SIOCSIFDSTADDR; case OOSIOCGIFDSTADDR: return SIOCGIFDSTADDR; case OSIOCSIFFLAGS: return SIOCSIFFLAGS; case OSIOCGIFFLAGS: return SIOCGIFFLAGS; case OOSIOCGIFBRDADDR: return SIOCGIFBRDADDR; case OSIOCSIFBRDADDR: return SIOCSIFBRDADDR; case OOSIOCGIFCONF: return SIOCGIFCONF; case OOSIOCGIFNETMASK: return SIOCGIFNETMASK; case OSIOCSIFNETMASK: return SIOCSIFNETMASK; case OSIOCGIFCONF: return SIOCGIFCONF; case OSIOCADDMULTI: return SIOCADDMULTI; case OSIOCDELMULTI: return SIOCDELMULTI; case SIOCSIFMEDIA_43: return SIOCSIFMEDIA_80; case OSIOCGIFMTU: return SIOCGIFMTU; case OSIOCGIFDATA: return SIOCGIFDATA; case OSIOCZIFDATA: return SIOCZIFDATA; case OBIOCGETIF: return BIOCGETIF; case OBIOCSETIF: return BIOCSETIF; case OTAPGIFNAME: return TAPGIFNAME; default: /* * XXX: the following code should be removed and the * needing treatment ioctls should move to the switch * above. 
*/ ncmd = ((cmd) & ~(IOCPARM_MASK << IOCPARM_SHIFT)) | (sizeof(struct ifreq) << IOCPARM_SHIFT); switch (ncmd) { case BIOCGETIF: case BIOCSETIF: case GREDSOCK: case GREGADDRD: case GREGADDRS: case GREGPROTO: case GRESADDRD: case GRESADDRS: case GRESPROTO: case GRESSOCK: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCDIFADDR: case SIOCDIFADDR_IN6: case SIOCDIFPHYADDR: case SIOCG80211NWID: case SIOCG80211STATS: case SIOCG80211ZSTATS: case SIOCGIFADDR: case SIOCGIFADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCGIFALIFETIME_IN6: case SIOCGIFBRDADDR: case SIOCGIFDLT: case SIOCGIFDSTADDR: case SIOCGIFDSTADDR_IN6: case SIOCGIFFLAGS: case SIOCGIFGENERIC: case SIOCGIFMETRIC: case SIOCGIFMTU: case SIOCGIFNETMASK: case SIOCGIFNETMASK_IN6: case SIOCGIFPDSTADDR: case SIOCGIFPDSTADDR_IN6: case SIOCGIFPSRCADDR: case SIOCGIFPSRCADDR_IN6: case SIOCGIFSTAT_ICMP6: case SIOCGIFSTAT_IN6: case SIOCGVH: case SIOCIFCREATE: case SIOCIFDESTROY: case SIOCS80211NWID: case SIOCSIFADDR: case SIOCSIFADDR_IN6: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFDSTADDR_IN6: case SIOCSIFFLAGS: case SIOCSIFGENERIC: case SIOCSIFMEDIA: case SIOCSIFMETRIC: case SIOCSIFMTU: case SIOCSIFNETMASK: case SIOCSIFNETMASK_IN6: case SIOCSVH: case TAPGIFNAME: return ncmd; default: { int rv; MODULE_HOOK_CALL(if43_cvtcmd_20_hook, (ncmd), enosys(), rv); if (rv == 0) return ncmd; return cmd; } } } } int compat_ifioctl(struct socket *so, u_long ocmd, u_long cmd, void *data, struct lwp *l) { int error; struct ifreq *ifr = (struct ifreq *)data; struct ifreq ifrb; struct oifreq *oifr = NULL; struct ifnet *ifp; struct sockaddr *sa; struct psref psref; int bound = curlwp_bind(); ifp = if_get(ifr->ifr_name, &psref); if (ifp == NULL) { curlwp_bindx(bound); return ENXIO; } /* * If we have not been converted, make sure that we are. * (because the upper layer handles old socket calls, but * not oifreq calls. */ if (cmd == ocmd) { cmd = compat_cvtcmd(ocmd); } if (cmd != ocmd) { oifr = data; ifr = &ifrb; IFREQO2N_43(oifr, ifr); } switch (ocmd) { enum { maxlen = sizeof(oifr->ifr_ifru) }; CTASSERT(maxlen == 16); socklen_t famlen; case OSIOCSIFADDR: case OSIOCSIFDSTADDR: case OSIOCSIFBRDADDR: case OSIOCSIFNETMASK: sa = &ifr->ifr_addr; #if BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < maxlen) { sa->sa_family = sa->sa_len; sa->sa_len = maxlen; } #else if (sa->sa_len == 0) sa->sa_len = maxlen; #endif famlen = sockaddr_getsize_by_family(sa->sa_family); if (famlen > sa->sa_len) { curlwp_bindx(bound); return EAFNOSUPPORT; } break; } error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so, cmd, ifr, ifp); if_put(ifp, &psref); curlwp_bindx(bound); switch (ocmd) { case OOSIOCGIFADDR: case OOSIOCGIFDSTADDR: case OOSIOCGIFBRDADDR: case OOSIOCGIFNETMASK: *(u_int16_t *)&ifr->ifr_addr = ((struct sockaddr *)&ifr->ifr_addr)->sa_family; break; } if (cmd != ocmd) IFREQN2O_43(oifr, ifr); return error; } int if_43_init(void) { MODULE_HOOK_SET(if_cvtcmd_43_hook, do_compat_cvtcmd); MODULE_HOOK_SET(if_ifioctl_43_hook, compat_ifioctl); return 0; } int if_43_fini(void) { MODULE_HOOK_UNSET(if_cvtcmd_43_hook); MODULE_HOOK_UNSET(if_ifioctl_43_hook); return 0; } #endif /* defined(COMPAT_43) */
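/*
 * Illustrative userland sketch (not part of if_43.c): compat_ifioctl()
 * above rewrites old-style interface ioctls into the current struct ifreq
 * form.  A present-day caller issues the same request directly, as below;
 * the interface name "lo0" is just an example.
 */
#include <sys/ioctl.h>
#include <sys/socket.h>

#include <net/if.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
		err(EXIT_FAILURE, "socket");

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
	if (ioctl(s, SIOCGIFFLAGS, &ifr) == -1)
		err(EXIT_FAILURE, "SIOCGIFFLAGS");
	printf("lo0 flags: 0x%hx\n", (unsigned short)ifr.ifr_flags);

	close(s);
	return EXIT_SUCCESS;
}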
/*	$NetBSD: ntfs_vfsops.c,v 1.111 2024/02/04 00:16:59 christos Exp $	*/

/*-
 * Copyright (c) 1998, 1999 Semen Ustimenko
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	Id: ntfs_vfsops.c,v 1.7 1999/05/31 11:28:30 phk Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ntfs_vfsops.c,v 1.111 2024/02/04 00:16:59 christos Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/buf.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/device.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/module.h>

#include <uvm/uvm_extern.h>

#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>

#include <fs/ntfs/ntfs.h>
#include <fs/ntfs/ntfs_inode.h>
#include <fs/ntfs/ntfs_subr.h>
#include <fs/ntfs/ntfs_vfsops.h>
#include <fs/ntfs/ntfs_ihash.h>
#include <fs/ntfs/ntfsmount.h>

MODULE(MODULE_CLASS_VFS, ntfs, NULL);

MALLOC_JUSTDEFINE(M_NTFSMNT, "NTFS mount", "NTFS mount structure");
MALLOC_JUSTDEFINE(M_NTFSNTNODE,"NTFS ntnode", "NTFS ntnode information");
MALLOC_JUSTDEFINE(M_NTFSDIR,"NTFS dir", "NTFS dir buffer");

static int	ntfs_superblock_validate(struct ntfsmount *);
static int	ntfs_mount(struct mount *, const char *, void *, size_t *);
static int	ntfs_root(struct mount *, int, struct vnode **);
static int	ntfs_start(struct mount *, int);
static int	ntfs_statvfs(struct mount *, struct statvfs *);
static int	ntfs_sync(struct mount *, int, kauth_cred_t);
static int	ntfs_unmount(struct mount *, int);
static int	ntfs_vget(struct mount *mp, ino_t ino, int, struct vnode **vpp);
static int	ntfs_loadvnode(struct mount *, struct vnode *,
		    const void *, size_t, const void **);
static int	ntfs_mountfs(struct vnode *, struct mount *,
		    struct ntfs_args *, struct lwp *);
static int	ntfs_vptofh(struct vnode *, struct fid *, size_t *);

static void	ntfs_init(void);
static void	ntfs_reinit(void);
static void	ntfs_done(void);

static int	ntfs_fhtovp(struct mount *, struct fid *, int,
		    struct vnode **);
static int	ntfs_mountroot(void);

static const struct genfs_ops ntfs_genfsops = {
	.gop_write = genfs_compat_gop_write,
};

static struct sysctllog
*ntfs_sysctl_log; static int ntfs_mountroot(void) { struct mount *mp; struct lwp *l = curlwp; /* XXX */ int error; struct ntfs_args args; if (device_class(root_device) != DV_DISK) return (ENODEV); if ((error = vfs_rootmountalloc(MOUNT_NTFS, "root_device", &mp))) { vrele(rootvp); return (error); } args.flag = 0; args.uid = 0; args.gid = 0; args.mode = S_IRWXU|S_IRWXG|S_IRWXO; if ((error = ntfs_mountfs(rootvp, mp, &args, l)) != 0) { vfs_unbusy(mp); vfs_rele(mp); return (error); } mountlist_append(mp); (void)ntfs_statvfs(mp, &mp->mnt_stat); vfs_unbusy(mp); return (0); } static void ntfs_init(void) { malloc_type_attach(M_NTFSMNT); malloc_type_attach(M_NTFSNTNODE); malloc_type_attach(M_NTFSDIR); malloc_type_attach(M_NTFSNTVATTR); malloc_type_attach(M_NTFSRDATA); malloc_type_attach(M_NTFSDECOMP); malloc_type_attach(M_NTFSRUN); ntfs_nthashinit(); ntfs_toupper_init(); } static void ntfs_reinit(void) { ntfs_nthashreinit(); } static void ntfs_done(void) { ntfs_nthashdone(); malloc_type_detach(M_NTFSMNT); malloc_type_detach(M_NTFSNTNODE); malloc_type_detach(M_NTFSDIR); malloc_type_detach(M_NTFSNTVATTR); malloc_type_detach(M_NTFSRDATA); malloc_type_detach(M_NTFSDECOMP); malloc_type_detach(M_NTFSRUN); } static int ntfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; int err = 0, flags; struct vnode *devvp; struct ntfs_args *args = data; if (args == NULL) return EINVAL; if (*data_len < sizeof *args) return EINVAL; if (mp->mnt_flag & MNT_GETARGS) { struct ntfsmount *ntmp = VFSTONTFS(mp); if (ntmp == NULL) return EIO; args->fspec = NULL; args->uid = ntmp->ntm_uid; args->gid = ntmp->ntm_gid; args->mode = ntmp->ntm_mode; args->flag = ntmp->ntm_flag; *data_len = sizeof *args; return 0; } /* *** * Mounting non-root file system or updating a file system *** */ /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { printf("ntfs_mount(): MNT_UPDATE not supported\n"); return (EINVAL); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ err = namei_simple_user(args->fspec, NSM_FOLLOW_NOEMULROOT, &devvp); if (err) return (err); if (devvp->v_type != VBLK) { err = ENOTBLK; goto fail; } if (bdevsw_lookup(devvp->v_rdev) == NULL) { err = ENXIO; goto fail; } if (mp->mnt_flag & MNT_UPDATE) { #if 0 /* ******************** * UPDATE ******************** */ if (devvp != ntmp->um_devvp) { err = EINVAL; /* needs translation */ goto fail; } /* * Update device name only on success */ err = set_statvfs_info(NULL, UIO_USERSPACE, args->fspec, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, p); if (err) goto fail; vrele(devvp); #endif } else { /* ******************** * NEW MOUNT ******************** */ /* * Since this is a new mount, we want the names for * the device and the mount point copied in. If an * error occurs, the mountpoint is discarded by the * upper level code. 
*/ /* Save "last mounted on" info for mount point (NULL pad)*/ err = set_statvfs_info(path, UIO_USERSPACE, args->fspec, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); if (err) goto fail; if (mp->mnt_flag & MNT_RDONLY) flags = FREAD; else flags = FREAD|FWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); err = VOP_OPEN(devvp, flags, FSCRED); VOP_UNLOCK(devvp); if (err) goto fail; err = ntfs_mountfs(devvp, mp, args, l); if (err) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); (void)VOP_CLOSE(devvp, flags, NOCRED); VOP_UNLOCK(devvp); goto fail; } } /* * Initialize FS stat information in mount struct; uses both * mp->mnt_stat.f_mntonname and mp->mnt_stat.f_mntfromname * * This code is common to root and non-root mounts */ (void)VFS_STATVFS(mp, &mp->mnt_stat); return (err); fail: vrele(devvp); return (err); } static int ntfs_superblock_validate(struct ntfsmount *ntmp) { /* Sanity checks. XXX: More checks are probably needed. */ if (strncmp(ntmp->ntm_bootfile.bf_sysid, NTFS_BBID, NTFS_BBIDLEN)) { dprintf(("ntfs_superblock_validate: invalid boot block\n")); return EINVAL; } if (ntmp->ntm_bps == 0) { dprintf(("ntfs_superblock_validate: invalid bytes per sector\n")); return EINVAL; } if (ntmp->ntm_spc == 0) { dprintf(("ntfs_superblock_validate: invalid sectors per cluster\n")); return EINVAL; } return 0; } /* * Common code for mount and mountroot */ int ntfs_mountfs(struct vnode *devvp, struct mount *mp, struct ntfs_args *argsp, struct lwp *l) { struct buf *bp; struct ntfsmount *ntmp; dev_t dev = devvp->v_rdev; int error, i; struct vnode *vp; struct vnode_iterator *marker; ntmp = NULL; /* * Flush out any old buffers remaining from a previous use. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = vinvalbuf(devvp, V_SAVE, l->l_cred, l, 0, 0); VOP_UNLOCK(devvp); if (error) return (error); bp = NULL; error = bread(devvp, BBLOCK, BBSIZE, 0, &bp); if (error) goto out; ntmp = malloc(sizeof(*ntmp), M_NTFSMNT, M_WAITOK|M_ZERO); memcpy(&ntmp->ntm_bootfile, bp->b_data, sizeof(struct bootfile)); brelse(bp, 0); bp = NULL; if ((error = ntfs_superblock_validate(ntmp))) goto out; { int8_t cpr = ntmp->ntm_mftrecsz; if (cpr > 0) ntmp->ntm_bpmftrec = ntmp->ntm_spc * cpr; else ntmp->ntm_bpmftrec = (1 << (-cpr)) / ntmp->ntm_bps; } dprintf(("ntfs_mountfs(): bps: %d, spc: %d, media: %x, mftrecsz: %d (%d sects)\n", ntmp->ntm_bps, ntmp->ntm_spc, ntmp->ntm_bootfile.bf_media, ntmp->ntm_mftrecsz, ntmp->ntm_bpmftrec)); dprintf(("ntfs_mountfs(): mftcn: 0x%x|0x%x\n", (u_int32_t)ntmp->ntm_mftcn, (u_int32_t)ntmp->ntm_mftmirrcn)); ntmp->ntm_mountp = mp; ntmp->ntm_dev = dev; ntmp->ntm_devvp = devvp; ntmp->ntm_uid = argsp->uid; ntmp->ntm_gid = argsp->gid; ntmp->ntm_mode = argsp->mode & (S_IRWXU|S_IRWXG|S_IRWXO); ntmp->ntm_flag = argsp->flag; mp->mnt_data = ntmp; /* set file name encode/decode hooks XXX utf-8 only for now */ ntmp->ntm_wget = ntfs_utf8_wget; ntmp->ntm_wput = ntfs_utf8_wput; ntmp->ntm_wcmp = ntfs_utf8_wcmp; dprintf(("ntfs_mountfs(): case-%s,%s uid: %d, gid: %d, mode: %o\n", (ntmp->ntm_flag & NTFS_MFLAG_CASEINS)?"insens.":"sens.", (ntmp->ntm_flag & NTFS_MFLAG_ALLNAMES)?" allnames,":"", ntmp->ntm_uid, ntmp->ntm_gid, ntmp->ntm_mode)); /* * We read in some system nodes to do not allow * reclaim them and to have everytime access to them. 
*/ { int pi[3] = { NTFS_MFTINO, NTFS_ROOTINO, NTFS_BITMAPINO }; for (i = 0; i < 3; i++) { error = VFS_VGET(mp, pi[i], LK_EXCLUSIVE, &(ntmp->ntm_sysvn[pi[i]])); if (error) goto out1; ntmp->ntm_sysvn[pi[i]]->v_vflag |= VV_SYSTEM; vref(ntmp->ntm_sysvn[pi[i]]); vput(ntmp->ntm_sysvn[pi[i]]); } } /* read the Unicode lowercase --> uppercase translation table, * if necessary */ if ((error = ntfs_toupper_use(mp, ntmp))) goto out1; /* * Scan $BitMap and count free clusters */ error = ntfs_calccfree(ntmp, &ntmp->ntm_cfree); if (error) goto out1; /* * Read and translate to internal format attribute * definition file. */ { int num,j; struct attrdef ad; /* Open $AttrDef */ error = VFS_VGET(mp, NTFS_ATTRDEFINO, LK_EXCLUSIVE, &vp); if (error) goto out1; /* Count valid entries */ for (num = 0; ; num++) { error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, num * sizeof(ad), sizeof(ad), &ad, NULL); if (error) goto out1; if (ad.ad_name[0] == 0) break; } /* Alloc memory for attribute definitions */ ntmp->ntm_ad = (struct ntvattrdef *) malloc( num * sizeof(struct ntvattrdef), M_NTFSMNT, M_WAITOK); ntmp->ntm_adnum = num; /* Read them and translate */ for (i = 0; i < num; i++) { error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, i * sizeof(ad), sizeof(ad), &ad, NULL); if (error) goto out1; j = 0; do { ntmp->ntm_ad[i].ad_name[j] = ad.ad_name[j]; } while(ad.ad_name[j++]); ntmp->ntm_ad[i].ad_namelen = j - 1; ntmp->ntm_ad[i].ad_type = ad.ad_type; } vput(vp); } mp->mnt_stat.f_fsidx.__fsid_val[0] = dev; mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_NTFS); mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; mp->mnt_stat.f_namemax = NTFS_MAXFILENAME; mp->mnt_flag |= MNT_LOCAL; spec_node_setmountedfs(devvp, mp); return (0); out1: for (i = 0; i < NTFS_SYSNODESNUM; i++) if (ntmp->ntm_sysvn[i]) vrele(ntmp->ntm_sysvn[i]); vfs_vnode_iterator_init(mp, &marker); while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) { if (vrecycle(vp)) continue; panic("%s: cannot recycle vnode %p", __func__, vp); } vfs_vnode_iterator_destroy(marker); out: spec_node_setmountedfs(devvp, NULL); if (bp) brelse(bp, 0); if (error) { if (ntmp) { if (ntmp->ntm_ad) free(ntmp->ntm_ad, M_NTFSMNT); free(ntmp, M_NTFSMNT); } } return (error); } static int ntfs_start(struct mount *mp, int flags) { return (0); } static int ntfs_unmount(struct mount *mp, int mntflags) { struct lwp *l = curlwp; struct ntfsmount *ntmp; int error, ronly = 0, flags, i; dprintf(("ntfs_unmount: unmounting...\n")); ntmp = VFSTONTFS(mp); flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; dprintf(("ntfs_unmount: vflushing...\n")); error = vflush(mp, NULLVP, flags | SKIPSYSTEM); if (error) { dprintf(("ntfs_unmount: vflush failed: %d\n",error)); return (error); } /* Check if only system vnodes are rest */ for (i = 0; i < NTFS_SYSNODESNUM; i++) if ((ntmp->ntm_sysvn[i]) && (vrefcnt(ntmp->ntm_sysvn[i]) > 1)) return (EBUSY); /* Dereference all system vnodes */ for (i = 0; i < NTFS_SYSNODESNUM; i++) if (ntmp->ntm_sysvn[i]) vrele(ntmp->ntm_sysvn[i]); /* vflush system vnodes */ error = vflush(mp, NULLVP, flags); if (error) { panic("ntfs_unmount: vflush failed(sysnodes): %d\n",error); } /* Check if the type of device node isn't VBAD before * touching v_specinfo. If the device vnode is revoked, the * field is NULL and touching it causes null pointer derefercence. 
*/ if (ntmp->ntm_devvp->v_type != VBAD) spec_node_setmountedfs(ntmp->ntm_devvp, NULL); error = vinvalbuf(ntmp->ntm_devvp, V_SAVE, NOCRED, l, 0, 0); KASSERT(error == 0); /* lock the device vnode before calling VOP_CLOSE() */ vn_lock(ntmp->ntm_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_CLOSE(ntmp->ntm_devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED); KASSERT(error == 0); VOP_UNLOCK(ntmp->ntm_devvp); vrele(ntmp->ntm_devvp); /* free the toupper table, if this has been last mounted ntfs volume */ ntfs_toupper_unuse(); dprintf(("ntfs_umount: freeing memory...\n")); mp->mnt_data = NULL; mp->mnt_flag &= ~MNT_LOCAL; free(ntmp->ntm_ad, M_NTFSMNT); free(ntmp, M_NTFSMNT); return (0); } static int ntfs_root(struct mount *mp, int lktype, struct vnode **vpp) { struct vnode *nvp; int error = 0; dprintf(("ntfs_root(): sysvn: %p\n", VFSTONTFS(mp)->ntm_sysvn[NTFS_ROOTINO])); error = VFS_VGET(mp, (ino_t)NTFS_ROOTINO, lktype, &nvp); if (error) { printf("ntfs_root: VFS_VGET failed: %d\n", error); return (error); } *vpp = nvp; return (0); } int ntfs_calccfree(struct ntfsmount *ntmp, cn_t *cfreep) { struct vnode *vp; u_int8_t *tmp; int j, error; cn_t cfree = 0; size_t bmsize, i; vp = ntmp->ntm_sysvn[NTFS_BITMAPINO]; bmsize = VTOF(vp)->f_size; tmp = (u_int8_t *) malloc(bmsize, M_TEMP, M_WAITOK); error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, 0, bmsize, tmp, NULL); if (error) goto out; for (i = 0; i < bmsize; i++) for (j = 0; j < 8; j++) if (~tmp[i] & (1 << j)) cfree++; *cfreep = cfree; out: free(tmp, M_TEMP); return(error); } static int ntfs_statvfs(struct mount *mp, struct statvfs *sbp) { struct ntfsmount *ntmp = VFSTONTFS(mp); u_int64_t mftallocated; dprintf(("ntfs_statvfs():\n")); mftallocated = VTOF(ntmp->ntm_sysvn[NTFS_MFTINO])->f_allocated; sbp->f_bsize = ntmp->ntm_bps; sbp->f_frsize = sbp->f_bsize; /* XXX */ sbp->f_iosize = ntmp->ntm_bps * ntmp->ntm_spc; sbp->f_blocks = ntmp->ntm_bootfile.bf_spv; sbp->f_bfree = sbp->f_bavail = ntfs_cntobn(ntmp->ntm_cfree); sbp->f_ffree = sbp->f_favail = sbp->f_bfree / ntmp->ntm_bpmftrec; sbp->f_files = mftallocated / ntfs_bntob(ntmp->ntm_bpmftrec) + sbp->f_ffree; sbp->f_fresvd = sbp->f_bresvd = 0; /* XXX */ sbp->f_flag = mp->mnt_flag; copy_statvfs_info(sbp, mp); return (0); } static int ntfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) { /*dprintf(("ntfs_sync():\n"));*/ return (0); } /*ARGSUSED*/ static int ntfs_fhtovp(struct mount *mp, struct fid *fhp, int lktype, struct vnode **vpp) { struct ntfid ntfh; int error; if (fhp->fid_len != sizeof(struct ntfid)) return EINVAL; memcpy(&ntfh, fhp, sizeof(ntfh)); ddprintf(("ntfs_fhtovp(): %s: %llu\n", mp->mnt_stat.f_mntonname, (unsigned long long)ntfh.ntfid_ino)); error = ntfs_vgetex(mp, ntfh.ntfid_ino, ntfh.ntfid_attr, "", lktype, vpp); if (error != 0) { *vpp = NULLVP; return (error); } /* XXX as unlink/rmdir/mkdir/creat are not currently possible * with NTFS, we don't need to check anything else for now */ return (0); } static int ntfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) { struct ntnode *ntp; struct ntfid ntfh; struct fnode *fn; if (*fh_size < sizeof(struct ntfid)) { *fh_size = sizeof(struct ntfid); return E2BIG; } *fh_size = sizeof(struct ntfid); ddprintf(("ntfs_fhtovp(): %s: %p\n", vp->v_mount->mnt_stat.f_mntonname, vp)); fn = VTOF(vp); ntp = VTONT(vp); memset(&ntfh, 0, sizeof(ntfh)); ntfh.ntfid_len = sizeof(struct ntfid); ntfh.ntfid_ino = ntp->i_number; ntfh.ntfid_attr = fn->f_attrtype; #ifdef notyet ntfh.ntfid_gen = ntp->i_gen; #endif memcpy(fhp, &ntfh, sizeof(ntfh)); return (0); } 
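/*
 * Illustrative sketch (not part of the original source): ntfs_vptofh()
 * and ntfs_fhtovp() above are the two halves of NFS file handle
 * translation.  A hypothetical round trip through them, written only
 * against the signatures visible in this file, is shown below; it
 * assumes the usual convention that fhtovp returns a referenced vnode,
 * locked according to the requested lock type.
 */
#if 0
static int
ntfs_example_fid_roundtrip(struct vnode *vp, struct vnode **rvpp)
{
	struct ntfid ntfh;
	size_t fh_size = sizeof(ntfh);
	int error;

	/* Encode the vnode into an NTFS file handle... */
	error = ntfs_vptofh(vp, (struct fid *)&ntfh, &fh_size);
	if (error)
		return error;
	/* ...and translate the handle back into a (locked) vnode. */
	error = ntfs_fhtovp(vp->v_mount, (struct fid *)&ntfh,
	    LK_EXCLUSIVE, rvpp);
	if (error == 0)
		vput(*rvpp);	/* example only: drop lock and reference */
	return error;
}
#endif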
static int ntfs_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { int error; struct ntvattr *vap; struct ntkey small_key, *ntkey; struct ntfsmount *ntmp; struct ntnode *ip; struct fnode *fp = NULL; enum vtype f_type = VBAD; if (key_len <= sizeof(small_key)) ntkey = &small_key; else ntkey = kmem_alloc(key_len, KM_SLEEP); memcpy(ntkey, key, key_len); dprintf(("ntfs_loadvnode: ino: %llu, attr: 0x%x:%s", (unsigned long long)ntkey->k_ino, ntkey->k_attrtype, ntkey->k_attrname)); ntmp = VFSTONTFS(mp); /* Get ntnode */ error = ntfs_ntlookup(ntmp, ntkey->k_ino, &ip); if (error) { printf("ntfs_loadvnode: ntfs_ntget failed\n"); goto out; } /* It may be not initialized fully, so force load it */ if (!(ip->i_flag & IN_LOADED)) { error = ntfs_loadntnode(ntmp, ip); if (error) { printf("ntfs_loadvnode: CAN'T LOAD ATTRIBUTES FOR INO:" " %llu\n", (unsigned long long)ip->i_number); ntfs_ntput(ip); goto out; } } /* Setup fnode */ fp = kmem_zalloc(sizeof(*fp), KM_SLEEP); dprintf(("%s: allocating fnode: %p\n", __func__, fp)); error = ntfs_ntvattrget(ntmp, ip, NTFS_A_NAME, NULL, 0, &vap); if (error) { printf("%s: attr %x for ino %" PRId64 ": error %d\n", __func__, NTFS_A_NAME, ip->i_number, error); ntfs_ntput(ip); goto out; } fp->f_fflag = vap->va_a_name->n_flag; fp->f_pnumber = vap->va_a_name->n_pnumber; fp->f_times = vap->va_a_name->n_times; ntfs_ntvattrrele(vap); if ((ip->i_frflag & NTFS_FRFLAG_DIR) && (ntkey->k_attrtype == NTFS_A_DATA && strcmp(ntkey->k_attrname, "") == 0)) { f_type = VDIR; } else { f_type = VREG; error = ntfs_ntvattrget(ntmp, ip, ntkey->k_attrtype, ntkey->k_attrname, 0, &vap); if (error == 0) { fp->f_size = vap->va_datalen; fp->f_allocated = vap->va_allocated; ntfs_ntvattrrele(vap); } else if (ntkey->k_attrtype == NTFS_A_DATA && strcmp(ntkey->k_attrname, "") == 0 && error == ENOENT) { fp->f_size = 0; fp->f_allocated = 0; error = 0; } else { printf("%s: attr %x for ino %" PRId64 ": error %d\n", __func__, ntkey->k_attrtype, ip->i_number, error); ntfs_ntput(ip); goto out; } } if (key_len <= sizeof(fp->f_smallkey)) fp->f_key = &fp->f_smallkey; else fp->f_key = kmem_alloc(key_len, KM_SLEEP); fp->f_ip = ip; fp->f_ino = ip->i_number; strcpy(fp->f_attrname, ntkey->k_attrname); fp->f_attrtype = ntkey->k_attrtype; fp->f_vp = vp; vp->v_data = fp; vp->v_tag = VT_NTFS; vp->v_type = f_type; vp->v_op = ntfs_vnodeop_p; ntfs_ntref(ip); vref(ip->i_devvp); genfs_node_init(vp, &ntfs_genfsops); if (ip->i_number == NTFS_ROOTINO) vp->v_vflag |= VV_ROOT; uvm_vnp_setsize(vp, fp->f_size); ntfs_ntput(ip); *new_key = fp->f_key; fp = NULL; out: if (ntkey != &small_key) kmem_free(ntkey, key_len); if (fp) kmem_free(fp, sizeof(*fp)); return error; } static int ntfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { return ntfs_vgetex(mp, ino, NTFS_A_DATA, "", lktype, vpp); } int ntfs_vgetex(struct mount *mp, ino_t ino, u_int32_t attrtype, const char *attrname, u_long lkflags, struct vnode **vpp) { const int attrlen = strlen(attrname); int error; struct ntkey small_key, *ntkey; if (NTKEY_SIZE(attrlen) <= sizeof(small_key)) ntkey = &small_key; else ntkey = malloc(NTKEY_SIZE(attrlen), M_TEMP, M_WAITOK); ntkey->k_ino = ino; ntkey->k_attrtype = attrtype; strcpy(ntkey->k_attrname, attrname); error = vcache_get(mp, ntkey, NTKEY_SIZE(attrlen), vpp); if (error) goto out; if ((lkflags & (LK_SHARED | LK_EXCLUSIVE)) != 0) { error = vn_lock(*vpp, lkflags); if (error) { vrele(*vpp); *vpp = NULL; } } out: if (ntkey != &small_key) free(ntkey, M_TEMP); return error; } 
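/*
 * Illustrative sketch (not part of the original source): ntfs_vgetex()
 * above keys the vnode cache on (inode number, attribute type,
 * attribute name), so each named $DATA stream of a file gets a vnode
 * of its own.  A hypothetical caller wanting the stream "Example" of
 * inode "ino" locked exclusively could do the following; the stream
 * name and surrounding variables are examples only.
 */
#if 0
	struct vnode *svp;
	int error;

	error = ntfs_vgetex(mp, ino, NTFS_A_DATA, "Example",
	    LK_EXCLUSIVE, &svp);
	if (error == 0) {
		/* ... read the stream via ntfs_readattr() or VOPs ... */
		vput(svp);	/* drops both the lock and the reference */
	}
#endif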
extern const struct vnodeopv_desc ntfs_vnodeop_opv_desc; const struct vnodeopv_desc * const ntfs_vnodeopv_descs[] = { &ntfs_vnodeop_opv_desc, NULL, }; struct vfsops ntfs_vfsops = { .vfs_name = MOUNT_NTFS, .vfs_min_mount_data = sizeof (struct ntfs_args), .vfs_mount = ntfs_mount, .vfs_start = ntfs_start, .vfs_unmount = ntfs_unmount, .vfs_root = ntfs_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = ntfs_statvfs, .vfs_sync = ntfs_sync, .vfs_vget = ntfs_vget, .vfs_loadvnode = ntfs_loadvnode, .vfs_fhtovp = ntfs_fhtovp, .vfs_vptofh = ntfs_vptofh, .vfs_init = ntfs_init, .vfs_reinit = ntfs_reinit, .vfs_done = ntfs_done, .vfs_mountroot = ntfs_mountroot, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = ntfs_vnodeopv_descs }; static int ntfs_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&ntfs_vfsops); if (error != 0) break; sysctl_createv(&ntfs_sysctl_log, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ntfs", SYSCTL_DESCR("NTFS file system"), NULL, 0, NULL, 0, CTL_VFS, 20, CTL_EOL); /* * XXX the "20" above could be dynamic, thereby eliminating * one more instance of the "number to vfs" mapping problem, * but "20" is the order as taken from sys/mount.h */ break; case MODULE_CMD_FINI: error = vfs_detach(&ntfs_vfsops); if (error != 0) break; sysctl_teardown(&ntfs_sysctl_log); break; default: error = ENOTTY; break; } return (error); }
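/*
 * Illustrative sketch (not part of the original source): ntfs_mount()
 * above is reached from userland through mount(2).  The initializer
 * below only mirrors the struct ntfs_args fields actually used in this
 * file (fspec, uid, gid, mode, flag); the device, mount point and id
 * values are made-up examples for a read-only mount.
 */
#if 0
	/* Userland fragment; needs <sys/mount.h>, <err.h>, <stdlib.h>. */
	struct ntfs_args args = {
		.fspec = "/dev/wd0e",		/* example device */
		.uid = 1000,
		.gid = 100,
		.mode = 0755,
		.flag = 0,
	};

	if (mount(MOUNT_NTFS, "/mnt/windows", MNT_RDONLY,
	    &args, sizeof(args)) == -1)
		err(EXIT_FAILURE, "mount");
#endif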
/* $NetBSD: chacha_sse2_impl.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(1, "$NetBSD: chacha_sse2_impl.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $"); #include "chacha_sse2.h" #ifdef _KERNEL #include <x86/cpu.h> #include <x86/fpu.h> #else #include <sys/sysctl.h> #include <cpuid.h> #include <stddef.h> #define fpu_kern_enter() ((void)0) #define fpu_kern_leave() ((void)0) #endif static void chacha_core_sse2_impl(uint8_t out[restrict static 64], const uint8_t in[static 16], const uint8_t k[static 32], const uint8_t c[static 16], unsigned nr) { fpu_kern_enter(); chacha_core_sse2(out, in, k, c, nr); fpu_kern_leave(); } static void hchacha_sse2_impl(uint8_t out[restrict static 32], const uint8_t in[static 16], const uint8_t k[static 32], const uint8_t c[static 16], unsigned nr) { fpu_kern_enter(); hchacha_sse2(out, in, k, c, nr); fpu_kern_leave(); } static void chacha_stream_sse2_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno, const uint8_t nonce[static 12], const uint8_t key[static 32], unsigned nr) { fpu_kern_enter(); chacha_stream_sse2(s, nbytes, blkno, nonce, key, nr); fpu_kern_leave(); } static void chacha_stream_xor_sse2_impl(uint8_t *c, const uint8_t *p, size_t nbytes, uint32_t blkno, const uint8_t nonce[static 12], const uint8_t key[static 32], unsigned nr) { fpu_kern_enter(); chacha_stream_xor_sse2(c, p, nbytes, blkno, nonce, key, nr); fpu_kern_leave(); } static void xchacha_stream_sse2_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno, const uint8_t nonce[static 24], const uint8_t key[static 32], unsigned nr) { fpu_kern_enter(); xchacha_stream_sse2(s, nbytes, blkno, nonce, key, nr); fpu_kern_leave(); } static void xchacha_stream_xor_sse2_impl(uint8_t *c, const uint8_t *p, size_t nbytes, uint32_t blkno, const uint8_t nonce[static 24], const uint8_t key[static 32], unsigned nr) { fpu_kern_enter(); xchacha_stream_xor_sse2(c, p, nbytes, blkno, nonce, key, nr); fpu_kern_leave(); } static int chacha_probe_sse2(void) { /* Verify that the CPU supports SSE and SSE2. */ #ifdef _KERNEL if (!i386_has_sse) return -1; if (!i386_has_sse2) return -1; #else unsigned eax, ebx, ecx, edx; if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return -1; if ((edx & bit_SSE) == 0) return -1; if ((edx & bit_SSE2) == 0) return -1; #endif return 0; } const struct chacha_impl chacha_sse2_impl = { .ci_name = "x86 SSE2 ChaCha", .ci_probe = chacha_probe_sse2, .ci_chacha_core = chacha_core_sse2_impl, .ci_hchacha = hchacha_sse2_impl, .ci_chacha_stream = chacha_stream_sse2_impl, .ci_chacha_stream_xor = chacha_stream_xor_sse2_impl, .ci_xchacha_stream = xchacha_stream_sse2_impl, .ci_xchacha_stream_xor = xchacha_stream_xor_sse2_impl, };
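/*
 * Illustrative sketch (not part of the original source): callers are
 * expected to honour ci_probe before going through the wrappers above,
 * which bracket the SSE2 code with fpu_kern_enter()/fpu_kern_leave().
 * The 16-byte constant below is the standard ChaCha "expand 32-byte k"
 * sigma; the key and input buffers are zeroed placeholders and 20 is
 * the usual ChaCha20 round count.
 */
#if 0
	static const uint8_t c[16] = "expand 32-byte k"; /* NUL dropped */
	uint8_t block[64], in[16] = {0}, key[32] = {0};

	if (chacha_sse2_impl.ci_probe() == 0)
		chacha_sse2_impl.ci_chacha_core(block, in, key, c, 20);
#endif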
/* $NetBSD: layer_vnops.c,v 1.72 2021/10/20 03:08:18 
thorpej Exp $ */ /* * Copyright (c) 1999 National Aeronautics & Space Administration * All rights reserved. * * This software was written by William Studenmund of the * Numerical Aerospace Simulation Facility, NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the National Aeronautics & Space Administration * nor the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB- * UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * John Heidemann of the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)null_vnops.c 8.6 (Berkeley) 5/27/95 * * Ancestors: * @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92 * Id: lofs_vnops.c,v 1.11 1992/05/30 10:05:43 jsp Exp jsp * ...and... * @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project */ /* * Generic layer vnode operations. * * The layer.h, layer_extern.h, layer_vfs.c, and layer_vnops.c files provide * the core implementation of stacked file-systems. * * The layerfs duplicates a portion of the file system name space under * a new name. In this respect, it is similar to the loopback file system. * It differs from the loopback fs in two respects: it is implemented using * a stackable layers technique, and it is "layerfs-nodes" stack above all * lower-layer vnodes, not just over directory vnodes. * * OPERATION OF LAYERFS * * The layerfs is the minimum file system layer, bypassing all possible * operations to the lower layer for processing there. The majority of its * activity centers on the bypass routine, through which nearly all vnode * operations pass. * * The bypass routine accepts arbitrary vnode operations for handling by * the lower layer. It begins by examining vnode operation arguments and * replacing any layered nodes by their lower-layer equivalents. It then * invokes an operation on the lower layer. Finally, it replaces the * layered nodes in the arguments and, if a vnode is returned by the * operation, stacks a layered node on top of the returned vnode. * * The bypass routine in this file, layer_bypass(), is suitable for use * by many different layered filesystems. It can be used by multiple * filesystems simultaneously. Alternatively, a layered fs may provide * its own bypass routine, in which case layer_bypass() should be used as * a model. For instance, the main functionality provided by umapfs, the user * identity mapping file system, is handled by a custom bypass routine. * * Typically a layered fs registers its selected bypass routine as the * default vnode operation in its vnodeopv_entry_desc table. Additionally * the filesystem must store the bypass entry point in the layerm_bypass * field of struct layer_mount. All other layer routines in this file will * use the layerm_bypass() routine. * * Although the bypass routine handles most operations outright, a number * of operations are special cased and handled by the layerfs. For instance, * layer_getattr() must change the fsid being returned. While layer_lock() * and layer_unlock() must handle any locking for the current vnode as well * as pass the lock request down. layer_inactive() and layer_reclaim() are * not bypassed so that they can handle freeing layerfs-specific data. Also, * certain vnode operations (create, mknod, remove, link, rename, mkdir, * rmdir, and symlink) change the locking state within the operation. Ideally * these operations should not change the lock state, but should be changed * to let the caller of the function unlock them. Otherwise, all intermediate * vnode layers (such as union, umapfs, etc) must catch these functions to do * the necessary locking at their layer. * * INSTANTIATING VNODE STACKS * * Mounting associates "layerfs-nodes" stack and lower layer, in effect * stacking two VFSes. The initial mount creates a single vnode stack for * the root of the new layerfs. All other vnode stacks are created as a * result of vnode operations on this or other layerfs vnode stacks. * * New vnode stacks come into existence as a result of an operation which * returns a vnode. 
The bypass routine stacks a layerfs-node above the new * vnode before returning it to the caller. * * For example, imagine mounting a null layer with: * * "mount_null /usr/include /dev/layer/null" * * Changing directory to /dev/layer/null will assign the root layerfs-node, * which was created when the null layer was mounted). Now consider opening * "sys". A layer_lookup() would be performed on the root layerfs-node. * This operation would bypass through to the lower layer which would return * a vnode representing the UFS "sys". Then, layer_bypass() builds a * layerfs-node aliasing the UFS "sys" and returns this to the caller. * Later operations on the layerfs-node "sys" will repeat this process when * constructing other vnode stacks. * * INVOKING OPERATIONS ON LOWER LAYERS * * There are two techniques to invoke operations on a lower layer when the * operation cannot be completely bypassed. Each method is appropriate in * different situations. In both cases, it is the responsibility of the * aliasing layer to make the operation arguments "correct" for the lower * layer by mapping any vnode arguments to the lower layer. * * The first approach is to call the aliasing layer's bypass routine. This * method is most suitable when you wish to invoke the operation currently * being handled on the lower layer. It has the advantage that the bypass * routine already must do argument mapping. An example of this is * layer_getattr(). * * A second approach is to directly invoke vnode operations on the lower * layer with the VOP_OPERATIONNAME interface. The advantage of this method * is that it is easy to invoke arbitrary operations on the lower layer. * The disadvantage is that vnode's arguments must be manually mapped. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: layer_vnops.c,v 1.72 2021/10/20 03:08:18 thorpej Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/time.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/kmem.h> #include <sys/buf.h> #include <sys/kauth.h> #include <sys/fcntl.h> #include <sys/fstrans.h> #include <miscfs/genfs/layer.h> #include <miscfs/genfs/layer_extern.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> /* * This is the 08-June-99 bypass routine, based on the 10-Apr-92 bypass * routine by John Heidemann. * The new element for this version is that the whole nullfs * system gained the concept of locks on the lower node. * The 10-Apr-92 version was optimized for speed, throwing away some * safety checks. It should still always work, but it's not as * robust to programmer errors. * * In general, we map all vnodes going down and unmap them on the way back. * * Also, some BSD vnode operations have the side effect of vrele'ing * their arguments. With stacking, the reference counts are held * by the upper node, not the lower one, so we must handle these * side-effects here. This is not of concern in Sun-derived systems * since there are no such side-effects. * * New for the 08-June-99 version: we also handle operations which unlock * the passed-in node (typically they vput the node). * * This makes the following assumptions: * - only one returned vpp * - no INOUT vpp's (Sun's vop_open has one of these) * - the vnode operation vector of the first vnode should be used * to determine what implementation of the op should be invoked * - all mapped vnodes are of our vnode-type (NEEDSWORK: * problems on rmdir'ing mount points and renaming?) 
*/ int layer_bypass(void *v) { struct vop_generic_args /* { struct vnodeop_desc *a_desc; <other random data follows, presumably> } */ *ap = v; int (**our_vnodeop_p)(void *); struct vnode **this_vp_p; int error; struct vnode *old_vps[VDESC_MAX_VPS], *vp0; struct vnode **vps_p[VDESC_MAX_VPS]; struct vnode ***vppp; struct mount *mp; struct vnodeop_desc *descp = ap->a_desc; int reles, i, flags; #ifdef DIAGNOSTIC /* * We require at least one vp. */ if (descp->vdesc_vp_offsets == NULL || descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET) panic("%s: no vp's in map.\n", __func__); #endif vps_p[0] = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap); vp0 = *vps_p[0]; mp = vp0->v_mount; flags = MOUNTTOLAYERMOUNT(mp)->layerm_flags; our_vnodeop_p = vp0->v_op; if (flags & LAYERFS_MBYPASSDEBUG) printf("%s: %s\n", __func__, descp->vdesc_name); /* * Map the vnodes going in. * Later, we'll invoke the operation based on * the first mapped vnode's operation vector. */ reles = descp->vdesc_flags; for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) break; /* bail out at end of list */ vps_p[i] = this_vp_p = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[i], ap); /* * We're not guaranteed that any but the first vnode * are of our type. Check for and don't map any * that aren't. (We must always map first vp or vclean fails.) */ if (i && (*this_vp_p == NULL || (*this_vp_p)->v_op != our_vnodeop_p)) { old_vps[i] = NULL; } else { old_vps[i] = *this_vp_p; *(vps_p[i]) = LAYERVPTOLOWERVP(*this_vp_p); /* * XXX - Several operations have the side effect * of vrele'ing their vp's. We must account for * that. (This should go away in the future.) */ if (reles & VDESC_VP0_WILLRELE) vref(*this_vp_p); } } /* * Call the operation on the lower layer * with the modified argument structure. */ error = VCALL(*vps_p[0], descp->vdesc_offset, ap); /* * Maintain the illusion of call-by-value * by restoring vnodes in the argument structure * to their original value. */ reles = descp->vdesc_flags; for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) break; /* bail out at end of list */ if (old_vps[i]) { *(vps_p[i]) = old_vps[i]; if (reles & VDESC_VP0_WILLRELE) vrele(*(vps_p[i])); } } /* * Map the possible out-going vpp * (Assumes that the lower layer always returns * a VREF'ed vpp unless it gets an error.) */ if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !error) { vppp = VOPARG_OFFSETTO(struct vnode***, descp->vdesc_vpp_offset, ap); /* * Only vop_lookup, vop_create, vop_makedir, vop_mknod * and vop_symlink return vpp's. vop_lookup doesn't call bypass * as a lookup on "." would generate a locking error. * So all the calls which get us here have a unlocked vpp. :-) */ error = layer_node_create(mp, **vppp, *vppp); if (error) { vrele(**vppp); **vppp = NULL; } } return error; } /* * We have to carry on the locking protocol on the layer vnodes * as we progress through the tree. We also have to enforce read-only * if this layer is mounted read-only. 
*/ int layer_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnodeop_desc *a_desc; struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap = v; struct componentname *cnp = ap->a_cnp; struct vnode *dvp, *lvp, *ldvp; int error, flags = cnp->cn_flags; dvp = ap->a_dvp; if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { *ap->a_vpp = NULL; return EROFS; } ldvp = LAYERVPTOLOWERVP(dvp); ap->a_dvp = ldvp; error = VCALL(ldvp, ap->a_desc->vdesc_offset, ap); lvp = *ap->a_vpp; *ap->a_vpp = NULL; if (error == EJUSTRETURN && (flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME)) error = EROFS; /* * We must do the same locking and unlocking at this layer as * is done in the layers below us. */ if (ldvp == lvp) { /* * Got the same object back, because we looked up ".", * or ".." in the root node of a mount point. * So we make another reference to dvp and return it. */ vref(dvp); *ap->a_vpp = dvp; vrele(lvp); } else if (lvp != NULL) { /* Note: dvp and ldvp are both locked. */ KASSERT(error != ENOLCK); error = layer_node_create(dvp->v_mount, lvp, ap->a_vpp); if (error) { vrele(lvp); } } return error; } /* * Setattr call. Disallow write attempts if the layer is mounted read-only. */ int layer_setattr(void *v) { struct vop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; struct lwp *a_l; } */ *ap = v; struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && (vp->v_mount->mnt_flag & MNT_RDONLY)) return EROFS; if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return EISDIR; case VCHR: case VBLK: case VSOCK: case VFIFO: return 0; case VREG: case VLNK: default: /* * Disallow write attempts if the filesystem is * mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; } } return LAYERFS_DO_BYPASS(vp, ap); } /* * We handle getattr only to change the fsid. */ int layer_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; struct lwp *a_l; } */ *ap = v; struct vnode *vp = ap->a_vp; int error; error = LAYERFS_DO_BYPASS(vp, ap); if (error) { return error; } /* Requires that arguments be restored. */ ap->a_vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0]; return 0; } int layer_access(void *v) { struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; kauth_cred_t a_cred; struct lwp *a_l; } */ *ap = v; struct vnode *vp = ap->a_vp; accmode_t accmode = ap->a_accmode; /* * Disallow write attempts on read-only layers; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; break; default: break; } } return LAYERFS_DO_BYPASS(vp, ap); } /* * We must handle open to be able to catch MNT_NODEV and friends * and increment the lower v_writecount. 
*/ int layer_open(void *v) { struct vop_open_args /* { const struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct vnode *lvp = LAYERVPTOLOWERVP(vp); int error; if (((lvp->v_type == VBLK) || (lvp->v_type == VCHR)) && (vp->v_mount->mnt_flag & MNT_NODEV)) return ENXIO; error = LAYERFS_DO_BYPASS(vp, ap); if (error == 0 && (ap->a_mode & FWRITE)) { mutex_enter(lvp->v_interlock); lvp->v_writecount++; mutex_exit(lvp->v_interlock); } return error; } /* * We must handle close to decrement the lower v_writecount. */ int layer_close(void *v) { struct vop_close_args /* { const struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct vnode *lvp = LAYERVPTOLOWERVP(vp); if ((ap->a_fflag & FWRITE)) { mutex_enter(lvp->v_interlock); KASSERT(lvp->v_writecount > 0); lvp->v_writecount--; mutex_exit(lvp->v_interlock); } return LAYERFS_DO_BYPASS(vp, ap); } /* * If vinvalbuf is calling us, it's a "shallow fsync" -- don't bother * syncing the underlying vnodes, since they'll be fsync'ed when * reclaimed; otherwise, pass it through to the underlying layer. * * XXX Do we still need to worry about shallow fsync? */ int layer_fsync(void *v) { struct vop_fsync_args /* { struct vnode *a_vp; kauth_cred_t a_cred; int a_flags; off_t offlo; off_t offhi; struct lwp *a_l; } */ *ap = v; int error; if (ap->a_flags & FSYNC_RECLAIM) { return 0; } if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR) { error = spec_fsync(v); if (error) return error; } return LAYERFS_DO_BYPASS(ap->a_vp, ap); } int layer_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; bool *a_recycle; } */ *ap = v; struct vnode *vp = ap->a_vp; /* * If we did a remove, don't cache the node. */ *ap->a_recycle = ((VTOLAYER(vp)->layer_flags & LAYERFS_REMOVED) != 0); /* * Do nothing (and _don't_ bypass). * Wait to vrele lowervp until reclaim, * so that until then our layer_node is in the * cache and reusable. * * NEEDSWORK: Someday, consider inactive'ing * the lowervp and then trying to reactivate it * with capabilities (v_id) * like they do in the name lookup cache code. * That's too much work for now. 
*/ return 0; } int layer_remove(void *v) { struct vop_remove_v3_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; nlink_t ctx_vp_new_nlink; } */ *ap = v; struct vnode *vp = ap->a_vp; int error; vref(vp); error = LAYERFS_DO_BYPASS(vp, ap); if (error == 0) { VTOLAYER(vp)->layer_flags |= LAYERFS_REMOVED; } vrele(vp); return error; } int layer_rename(void *v) { struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap = v; struct vnode *fdvp = ap->a_fdvp, *tvp; int error; tvp = ap->a_tvp; if (tvp) { if (tvp->v_mount != fdvp->v_mount) tvp = NULL; else vref(tvp); } error = LAYERFS_DO_BYPASS(fdvp, ap); if (tvp) { if (error == 0) VTOLAYER(tvp)->layer_flags |= LAYERFS_REMOVED; vrele(tvp); } return error; } int layer_rmdir(void *v) { struct vop_rmdir_v2_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; int error; struct vnode *vp = ap->a_vp; vref(vp); error = LAYERFS_DO_BYPASS(vp, ap); if (error == 0) { VTOLAYER(vp)->layer_flags |= LAYERFS_REMOVED; } vrele(vp); return error; } int layer_revoke(void *v) { struct vop_revoke_args /* { struct vnode *a_vp; int a_flags; } */ *ap = v; struct vnode *vp = ap->a_vp; struct vnode *lvp = LAYERVPTOLOWERVP(vp); int error; /* * We will most likely end up in vclean which uses the usecount * to determine if a vnode is active. Take an extra reference on * the lower vnode so it will always close and inactivate. */ vref(lvp); error = LAYERFS_DO_BYPASS(vp, ap); vrele(lvp); return error; } int layer_reclaim(void *v) { struct vop_reclaim_v2_args /* { struct vnode *a_vp; struct lwp *a_l; } */ *ap = v; struct vnode *vp = ap->a_vp; struct layer_mount *lmp = MOUNTTOLAYERMOUNT(vp->v_mount); struct layer_node *xp = VTOLAYER(vp); struct vnode *lowervp = xp->layer_lowervp; VOP_UNLOCK(vp); /* * Note: in vop_reclaim, the node's struct lock has been * decommissioned, so we have to be careful about calling * VOP's on ourself. We must be careful as VXLOCK is set. */ if (vp == lmp->layerm_rootvp) { /* * Oops! We no longer have a root node. The most likely reason is * that someone forcibly unmounted the underlying fs. * * Now getting the root vnode will fail. We're dead. :-( */ lmp->layerm_rootvp = NULL; } mutex_enter(vp->v_interlock); KASSERT(vp->v_interlock == lowervp->v_interlock); lowervp->v_writecount -= vp->v_writecount; mutex_exit(vp->v_interlock); /* After this assignment, this node will not be re-used. */ xp->layer_lowervp = NULL; kmem_free(vp->v_data, lmp->layerm_size); vp->v_data = NULL; vrele(lowervp); return 0; } /* * We just feed the returned vnode up to the caller - there's no need * to build a layer node on top of the node on which we're going to do * i/o. 
:-) */ int layer_bmap(void *v) { struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; } */ *ap = v; struct vnode *vp; vp = LAYERVPTOLOWERVP(ap->a_vp); ap->a_vp = vp; return VCALL(vp, ap->a_desc->vdesc_offset, ap); } int layer_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; printf ("\ttag VT_LAYERFS, vp=%p, lowervp=%p\n", vp, LAYERVPTOLOWERVP(vp)); return 0; } int layer_getpages(void *v) { struct vop_getpages_args /* { struct vnode *a_vp; voff_t a_offset; struct vm_page **a_m; int *a_count; int a_centeridx; vm_prot_t a_access_type; int a_advice; int a_flags; } */ *ap = v; struct vnode *vp = ap->a_vp; struct mount *mp = vp->v_mount; int error; krw_t op; KASSERT(rw_lock_held(vp->v_uobj.vmobjlock)); if (ap->a_flags & PGO_LOCKED) { return EBUSY; } ap->a_vp = LAYERVPTOLOWERVP(vp); KASSERT(vp->v_uobj.vmobjlock == ap->a_vp->v_uobj.vmobjlock); /* Just pass the request on to the underlying layer. */ op = rw_lock_op(vp->v_uobj.vmobjlock); rw_exit(vp->v_uobj.vmobjlock); fstrans_start(mp); rw_enter(vp->v_uobj.vmobjlock, op); if (mp == vp->v_mount) { /* Will release the lock. */ error = VCALL(ap->a_vp, VOFFSET(vop_getpages), ap); } else { rw_exit(vp->v_uobj.vmobjlock); error = ENOENT; } fstrans_done(mp); return error; } int layer_putpages(void *v) { struct vop_putpages_args /* { struct vnode *a_vp; voff_t a_offlo; voff_t a_offhi; int a_flags; } */ *ap = v; struct vnode *vp = ap->a_vp; KASSERT(rw_write_held(vp->v_uobj.vmobjlock)); ap->a_vp = LAYERVPTOLOWERVP(vp); KASSERT(vp->v_uobj.vmobjlock == ap->a_vp->v_uobj.vmobjlock); if (ap->a_flags & PGO_RECLAIM) { rw_exit(vp->v_uobj.vmobjlock); return 0; } /* Just pass the request on to the underlying layer. */ return VCALL(ap->a_vp, VOFFSET(vop_putpages), ap); }
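/*
 * Illustrative sketch (not part of the original source): in the spirit
 * of the comments at the top of this file, a minimal layered file
 * system can install layer_bypass() as its default operation and only
 * override the handful of ops that need special handling.  "examplefs"
 * and its tables are hypothetical names; the entry descriptors are the
 * standard vop_*_desc objects.
 */
#if 0
int (**examplefs_vnodeop_p)(void *);

const struct vnodeopv_entry_desc examplefs_vnodeop_entries[] = {
	{ &vop_default_desc,	layer_bypass },		/* everything else */
	{ &vop_lookup_desc,	layer_lookup },
	{ &vop_getattr_desc,	layer_getattr },
	{ &vop_inactive_desc,	layer_inactive },
	{ &vop_reclaim_desc,	layer_reclaim },
	{ NULL, NULL }
};

const struct vnodeopv_desc examplefs_vnodeop_opv_desc =
	{ &examplefs_vnodeop_p, examplefs_vnodeop_entries };
#endif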
/* $NetBSD: mbuf.h,v 1.239 2024/01/22 21:15:02 jdolecek Exp $ */ /* * Copyright (c) 1996, 1997, 1999, 2001, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center and Matt Thomas of 3am Software Foundry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ #ifdef _KERNEL_OPT #include "opt_mbuftrace.h" #endif #ifndef M_WAITOK #include <sys/malloc.h> #endif #include <sys/pool.h> #include <sys/queue.h> #if defined(_KERNEL) #include <sys/percpu_types.h> #include <sys/socket.h> /* for AF_UNSPEC */ #include <sys/psref.h> #endif /* defined(_KERNEL) */ /* For offsetof() */ #if defined(_KERNEL) || defined(_STANDALONE) #include <sys/systm.h> #else #include <stddef.h> #endif #include <uvm/uvm_param.h> /* for MIN_PAGE_SIZE */ #include <net/if.h> /* * Mbufs are of a single size, MSIZE (machine/param.h), which * includes overhead. An mbuf may add a single "mbuf cluster" of size * MCLBYTES (also in machine/param.h), which has no additional overhead * and is used instead of the internal data area; this is done when * at least MINCLSIZE of data must be stored. */ /* Packet tags structure */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ uint16_t m_tag_id; /* Tag ID */ uint16_t m_tag_len; /* Length of data */ }; /* mbuf ownership structure */ struct mowner { char mo_name[16]; /* owner name (fxp0) */ char mo_descr[16]; /* owner description (input) */ LIST_ENTRY(mowner) mo_link; /* */ struct percpu *mo_counters; }; #define MOWNER_INIT(x, y) { .mo_name = x, .mo_descr = y } enum mowner_counter_index { MOWNER_COUNTER_CLAIMS, /* # of small mbuf claimed */ MOWNER_COUNTER_RELEASES, /* # of small mbuf released */ MOWNER_COUNTER_CLUSTER_CLAIMS, /* # of cluster mbuf claimed */ MOWNER_COUNTER_CLUSTER_RELEASES,/* # of cluster mbuf released */ MOWNER_COUNTER_EXT_CLAIMS, /* # of M_EXT mbuf claimed */ MOWNER_COUNTER_EXT_RELEASES, /* # of M_EXT mbuf released */ MOWNER_COUNTER_NCOUNTERS, }; #if defined(_KERNEL) struct mowner_counter { u_long mc_counter[MOWNER_COUNTER_NCOUNTERS]; }; #endif /* userland-exported version of struct mowner */ struct mowner_user { char mo_name[16]; /* owner name (fxp0) */ char mo_descr[16]; /* owner description (input) */ LIST_ENTRY(mowner) mo_link; /* unused padding; for compatibility */ u_long mo_counter[MOWNER_COUNTER_NCOUNTERS]; /* counters */ }; /* * Macros for type conversion * mtod(m,t) - convert mbuf pointer to data pointer of correct type */ #define mtod(m, t) ((t)((m)->m_data)) /* header at beginning of each mbuf */ struct m_hdr { struct mbuf *mh_next; /* next buffer in chain */ struct mbuf *mh_nextpkt; /* next chain in queue/record */ char *mh_data; /* location of data */ struct mowner *mh_owner; /* mbuf owner */ int mh_len; /* amount of data in this mbuf */ int mh_flags; /* flags; see below */ paddr_t mh_paddr; /* physical address of mbuf */ short mh_type; /* type of data in this mbuf */ }; /* * record/packet header in first mbuf of chain; valid if M_PKTHDR set * * A note about csum_data: * * o For the out-bound direction, the low 16 bits indicates the offset after * the L4 header where the final L4 checksum value is to be stored and the * high 16 bits is the length of the L3 header (the start of the data to * be checksummed). * * o For the in-bound direction, it is only valid if the M_CSUM_DATA flag is * set. In this case, an L4 checksum has been calculated by hardware and * is stored in csum_data, but it is up to software to perform final * verification. * * Note for in-bound TCP/UDP checksums: we expect the csum_data to NOT * be bit-wise inverted (the final step in the calculation of an IP * checksum) -- this is so we can accumulate the checksum for fragmented * packets during reassembly. 
* * Size ILP32: 40 * LP64: 56 */ struct pkthdr { union { void *ctx; /* for M_GETCTX/M_SETCTX */ if_index_t index; /* rcv interface index */ } _rcvif; #define rcvif_index _rcvif.index SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ int len; /* total packet length */ int csum_flags; /* checksum flags */ uint32_t csum_data; /* checksum data */ u_int segsz; /* segment size */ uint16_t ether_vtag; /* ethernet 802.1p+q vlan tag */ uint16_t pkthdr_flags; /* flags for pkthdr, see blow */ #define PKTHDR_FLAG_IPSEC_SKIP_PFIL 0x0001 /* skip pfil_run_hooks() after ipsec decrypt */ /* * Following three fields are open-coded struct altq_pktattr * to rearrange struct pkthdr fields flexibly. */ int pattr_af; /* ALTQ: address family */ void *pattr_class; /* ALTQ: sched class set by classifier */ void *pattr_hdr; /* ALTQ: saved header position in mbuf */ }; /* Checksumming flags (csum_flags). */ #define M_CSUM_TCPv4 0x00000001 /* TCP header/payload */ #define M_CSUM_UDPv4 0x00000002 /* UDP header/payload */ #define M_CSUM_TCP_UDP_BAD 0x00000004 /* TCP/UDP checksum bad */ #define M_CSUM_DATA 0x00000008 /* consult csum_data */ #define M_CSUM_TCPv6 0x00000010 /* IPv6 TCP header/payload */ #define M_CSUM_UDPv6 0x00000020 /* IPv6 UDP header/payload */ #define M_CSUM_IPv4 0x00000040 /* IPv4 header */ #define M_CSUM_IPv4_BAD 0x00000080 /* IPv4 header checksum bad */ #define M_CSUM_TSOv4 0x00000100 /* TCPv4 segmentation offload */ #define M_CSUM_TSOv6 0x00000200 /* TCPv6 segmentation offload */ /* Checksum-assist quirks: keep separate from jump-table bits. */ #define M_CSUM_BLANK 0x40000000 /* csum is missing */ #define M_CSUM_NO_PSEUDOHDR 0x80000000 /* Rx csum_data does not include * the UDP/TCP pseudo-hdr, and * is not yet 1s-complemented. */ #define M_CSUM_BITS \ "\20\1TCPv4\2UDPv4\3TCP_UDP_BAD\4DATA\5TCPv6\6UDPv6\7IPv4\10IPv4_BAD" \ "\11TSOv4\12TSOv6\37BLANK\40NO_PSEUDOHDR" /* * Macros for manipulating csum_data on outgoing packets. These are * used to pass information down from the L4/L3 to the L2. * * _IPHL: Length of the IPv{4/6} header, plus the options; in other * words the offset of the UDP/TCP header in the packet. * _OFFSET: Offset of the checksum field in the UDP/TCP header. */ #define M_CSUM_DATA_IPv4_IPHL(x) ((x) >> 16) #define M_CSUM_DATA_IPv4_OFFSET(x) ((x) & 0xffff) #define M_CSUM_DATA_IPv6_IPHL(x) ((x) >> 16) #define M_CSUM_DATA_IPv6_OFFSET(x) ((x) & 0xffff) #define M_CSUM_DATA_IPv6_SET(x, v) (x) = ((x) & 0xffff) | ((v) << 16) /* * Max # of pages we can attach to m_ext. This is carefully chosen * to be able to handle SOSEND_LOAN_CHUNK with our minimum sized page. */ #ifdef MIN_PAGE_SIZE #define M_EXT_MAXPAGES ((65536 / MIN_PAGE_SIZE) + 1) #endif /* * Description of external storage mapped into mbuf, valid if M_EXT set. */ struct _m_ext_storage { unsigned int ext_refcnt; char *ext_buf; /* start of buffer */ void (*ext_free) /* free routine if not the usual */ (struct mbuf *, void *, size_t, void *); void *ext_arg; /* argument for ext_free */ size_t ext_size; /* size of buffer, for ext_free */ union { /* M_EXT_CLUSTER: physical address */ paddr_t extun_paddr; #ifdef M_EXT_MAXPAGES /* M_EXT_PAGES: pages */ struct vm_page *extun_pgs[M_EXT_MAXPAGES]; #endif } ext_un; #define ext_paddr ext_un.extun_paddr #define ext_pgs ext_un.extun_pgs }; struct _m_ext { struct mbuf *ext_ref; struct _m_ext_storage ext_storage; }; #define M_PADDR_INVALID POOL_PADDR_INVALID /* * Definition of "struct mbuf". * Don't change this without understanding how MHLEN/MLEN are defined. 
*/ #define MBUF_DEFINE(name, mhlen, mlen) \ struct name { \ struct m_hdr m_hdr; \ union { \ struct { \ struct pkthdr MH_pkthdr; \ union { \ struct _m_ext MH_ext; \ char MH_databuf[(mhlen)]; \ } MH_dat; \ } MH; \ char M_databuf[(mlen)]; \ } M_dat; \ } #define m_next m_hdr.mh_next #define m_len m_hdr.mh_len #define m_data m_hdr.mh_data #define m_owner m_hdr.mh_owner #define m_type m_hdr.mh_type #define m_flags m_hdr.mh_flags #define m_nextpkt m_hdr.mh_nextpkt #define m_paddr m_hdr.mh_paddr #define m_pkthdr M_dat.MH.MH_pkthdr #define m_ext_storage M_dat.MH.MH_dat.MH_ext.ext_storage #define m_ext_ref M_dat.MH.MH_dat.MH_ext.ext_ref #define m_ext m_ext_ref->m_ext_storage #define m_pktdat M_dat.MH.MH_dat.MH_databuf #define m_dat M_dat.M_databuf /* * Dummy mbuf structure to calculate the right values for MLEN/MHLEN, taking * into account inter-structure padding. */ MBUF_DEFINE(_mbuf_dummy, 1, 1); /* normal data len */ #define MLEN ((int)(MSIZE - offsetof(struct _mbuf_dummy, m_dat))) /* data len w/pkthdr */ #define MHLEN ((int)(MSIZE - offsetof(struct _mbuf_dummy, m_pktdat))) #define MINCLSIZE (MHLEN+MLEN+1) /* smallest amount to put in cluster */ /* * The *real* struct mbuf */ MBUF_DEFINE(mbuf, MHLEN, MLEN); /* mbuf flags */ #define M_EXT 0x00000001 /* has associated external storage */ #define M_PKTHDR 0x00000002 /* start of record */ #define M_EOR 0x00000004 /* end of record */ #define M_PROTO1 0x00000008 /* protocol-specific */ /* mbuf pkthdr flags, also in m_flags */ #define M_AUTHIPHDR 0x00000010 /* authenticated (IPsec) */ #define M_DECRYPTED 0x00000020 /* decrypted (IPsec) */ #define M_LOOP 0x00000040 /* received on loopback */ #define M_BCAST 0x00000100 /* send/received as L2 broadcast */ #define M_MCAST 0x00000200 /* send/received as L2 multicast */ #define M_CANFASTFWD 0x00000400 /* packet can be fast-forwarded */ #define M_ANYCAST6 0x00000800 /* received as IPv6 anycast */ #define M_LINK0 0x00001000 /* link layer specific flag */ #define M_LINK1 0x00002000 /* link layer specific flag */ #define M_LINK2 0x00004000 /* link layer specific flag */ #define M_LINK3 0x00008000 /* link layer specific flag */ #define M_LINK4 0x00010000 /* link layer specific flag */ #define M_LINK5 0x00020000 /* link layer specific flag */ #define M_LINK6 0x00040000 /* link layer specific flag */ #define M_LINK7 0x00080000 /* link layer specific flag */ #define M_VLANTAG 0x00100000 /* ether_vtag is valid */ /* additional flags for M_EXT mbufs */ #define M_EXT_FLAGS 0xff000000 #define M_EXT_CLUSTER 0x01000000 /* ext is a cluster */ #define M_EXT_PAGES 0x02000000 /* ext_pgs is valid */ #define M_EXT_ROMAP 0x04000000 /* ext mapping is r-o at MMU */ #define M_EXT_RW 0x08000000 /* ext storage is writable */ /* for source-level compatibility */ #define M_NOTIFICATION M_PROTO1 #define M_FLAGS_BITS \ "\20\1EXT\2PKTHDR\3EOR\4PROTO1\5AUTHIPHDR\6DECRYPTED\7LOOP\10NONE" \ "\11BCAST\12MCAST\13CANFASTFWD\14ANYCAST6\15LINK0\16LINK1\17LINK2\20LINK3" \ "\21LINK4\22LINK5\23LINK6\24LINK7" \ "\25VLANTAG" \ "\31EXT_CLUSTER\32EXT_PAGES\33EXT_ROMAP\34EXT_RW" /* flags copied when copying m_pkthdr */ #define M_COPYFLAGS (M_PKTHDR|M_EOR|M_BCAST|M_MCAST|M_CANFASTFWD| \ M_ANYCAST6|M_LINK0|M_LINK1|M_LINK2|M_AUTHIPHDR|M_DECRYPTED|M_LOOP| \ M_VLANTAG) /* flag copied when shallow-copying external storage */ #define M_EXTCOPYFLAGS (M_EXT|M_EXT_FLAGS) /* mbuf types */ #define MT_FREE 0 /* should be on free list */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER 2 /* packet header */ #define MT_SONAME 3 /* socket name */ 
#define MT_SOOPTS 4 /* socket options */ #define MT_FTABLE 5 /* fragment reassembly header */ #define MT_CONTROL 6 /* extra-data protocol message */ #define MT_OOBDATA 7 /* expedited data */ #ifdef MBUFTYPES const char * const mbuftypes[] = { "mbfree", "mbdata", "mbheader", "mbsoname", "mbsopts", "mbftable", "mbcontrol", "mboobdata", }; #else extern const char * const mbuftypes[]; #endif /* flags to m_get/MGET */ #define M_DONTWAIT M_NOWAIT #define M_WAIT M_WAITOK #ifdef MBUFTRACE /* Mbuf allocation tracing. */ void mowner_init_owner(struct mowner *, const char *, const char *); void mowner_init(struct mbuf *, int); void mowner_ref(struct mbuf *, int); void m_claim(struct mbuf *, struct mowner *); void mowner_revoke(struct mbuf *, bool, int); void mowner_attach(struct mowner *); void mowner_detach(struct mowner *); void m_claimm(struct mbuf *, struct mowner *); #else #define mowner_init_owner(mo, n, d) __nothing #define mowner_init(m, type) __nothing #define mowner_ref(m, flags) __nothing #define mowner_revoke(m, all, flags) __nothing #define m_claim(m, mowner) __nothing #define mowner_attach(mo) __nothing #define mowner_detach(mo) __nothing #define m_claimm(m, mo) __nothing #endif #define MCLAIM(m, mo) m_claim((m), (mo)) #define MOWNER_ATTACH(mo) mowner_attach(mo) #define MOWNER_DETACH(mo) mowner_detach(mo) /* * mbuf allocation/deallocation macros: * * MGET(struct mbuf *m, int how, int type) * allocates an mbuf and initializes it to contain internal data. * * MGETHDR(struct mbuf *m, int how, int type) * allocates an mbuf and initializes it to contain a packet header * and internal data. * * If 'how' is M_WAIT, these macros (and the corresponding functions) * are guaranteed to return successfully. */ #define MGET(m, how, type) m = m_get((how), (type)) #define MGETHDR(m, how, type) m = m_gethdr((how), (type)) #if defined(_KERNEL) #define MCLINITREFERENCE(m) \ do { \ KASSERT(((m)->m_flags & M_EXT) == 0); \ (m)->m_ext_ref = (m); \ (m)->m_ext.ext_refcnt = 1; \ } while (/* CONSTCOND */ 0) /* * Macros for mbuf external storage. * * MCLGET allocates and adds an mbuf cluster to a normal mbuf; * the flag M_EXT is set upon success. * * MEXTMALLOC allocates external storage and adds it to * a normal mbuf; the flag M_EXT is set upon success. * * MEXTADD adds pre-allocated external storage to * a normal mbuf; the flag M_EXT is set upon success. */ #define MCLGET(m, how) m_clget((m), (how)) #define MEXTMALLOC(m, size, how) \ do { \ (m)->m_ext_storage.ext_buf = malloc((size), 0, (how)); \ if ((m)->m_ext_storage.ext_buf != NULL) { \ MCLINITREFERENCE(m); \ (m)->m_data = (m)->m_ext.ext_buf; \ (m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) | \ M_EXT|M_EXT_RW; \ (m)->m_ext.ext_size = (size); \ (m)->m_ext.ext_free = NULL; \ (m)->m_ext.ext_arg = NULL; \ mowner_ref((m), M_EXT); \ } \ } while (/* CONSTCOND */ 0) #define MEXTADD(m, buf, size, type, free, arg) \ do { \ MCLINITREFERENCE(m); \ (m)->m_data = (m)->m_ext.ext_buf = (char *)(buf); \ (m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) | M_EXT; \ (m)->m_ext.ext_size = (size); \ (m)->m_ext.ext_free = (free); \ (m)->m_ext.ext_arg = (arg); \ mowner_ref((m), M_EXT); \ } while (/* CONSTCOND */ 0) #define M_BUFADDR(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ ((m)->m_flags & M_PKTHDR) ? (m)->m_pktdat : (m)->m_dat) #define M_BUFSIZE(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \ ((m)->m_flags & M_PKTHDR) ? MHLEN : MLEN) #define MRESETDATA(m) (m)->m_data = M_BUFADDR(m) /* * Compute the offset of the beginning of the data buffer of a non-ext * mbuf. 
*/ #define M_BUFOFFSET(m) \ (((m)->m_flags & M_PKTHDR) ? \ offsetof(struct mbuf, m_pktdat) : offsetof(struct mbuf, m_dat)) /* * Determine if an mbuf's data area is read-only. This is true * if external storage is read-only mapped, or not marked as R/W, * or referenced by more than one mbuf. */ #define M_READONLY(m) \ (((m)->m_flags & M_EXT) != 0 && \ (((m)->m_flags & (M_EXT_ROMAP|M_EXT_RW)) != M_EXT_RW || \ (m)->m_ext.ext_refcnt > 1)) #define M_UNWRITABLE(__m, __len) \ ((__m)->m_len < (__len) || M_READONLY((__m))) /* * Determine if an mbuf's data area is read-only at the MMU. */ #define M_ROMAP(m) \ (((m)->m_flags & (M_EXT|M_EXT_ROMAP)) == (M_EXT|M_EXT_ROMAP)) /* * Compute the amount of space available before the current start of * data in an mbuf. */ #define M_LEADINGSPACE(m) \ (M_READONLY((m)) ? 0 : ((m)->m_data - M_BUFADDR(m))) /* * Compute the amount of space available * after the end of data in an mbuf. */ #define _M_TRAILINGSPACE(m) \ ((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf + (m)->m_ext.ext_size - \ ((m)->m_data + (m)->m_len) : \ &(m)->m_dat[MLEN] - ((m)->m_data + (m)->m_len)) #define M_TRAILINGSPACE(m) \ (M_READONLY((m)) ? 0 : _M_TRAILINGSPACE((m))) /* * Arrange to prepend space of size plen to mbuf m. * If a new mbuf must be allocated, how specifies whether to wait. * If how is M_DONTWAIT and allocation fails, the original mbuf chain * is freed and m is set to NULL. */ #define M_PREPEND(m, plen, how) \ do { \ if (M_LEADINGSPACE(m) >= (plen)) { \ (m)->m_data -= (plen); \ (m)->m_len += (plen); \ } else \ (m) = m_prepend((m), (plen), (how)); \ if ((m) && (m)->m_flags & M_PKTHDR) \ (m)->m_pkthdr.len += (plen); \ } while (/* CONSTCOND */ 0) /* change mbuf to new type */ #define MCHTYPE(m, t) \ do { \ KASSERT((t) != MT_FREE); \ mbstat_type_add((m)->m_type, -1); \ mbstat_type_add(t, 1); \ (m)->m_type = t; \ } while (/* CONSTCOND */ 0) #ifdef DIAGNOSTIC #define M_VERIFY_PACKET(m) m_verify_packet(m) #else #define M_VERIFY_PACKET(m) __nothing #endif /* The "copy all" special length. */ #define M_COPYALL -1 /* * Allow drivers and/or protocols to store private context information. */ #define M_GETCTX(m, t) ((t)(m)->m_pkthdr._rcvif.ctx) #define M_SETCTX(m, c) ((void)((m)->m_pkthdr._rcvif.ctx = (void *)(c))) #define M_CLEARCTX(m) M_SETCTX((m), NULL) /* * M_REGION_GET ensures that the "len"-sized region of type "typ" starting * from "off" within "m" is located in a single mbuf, contiguously. * * The pointer to the region will be returned to pointer variable "val". */ #define M_REGION_GET(val, typ, m, off, len) \ do { \ struct mbuf *_t; \ int _tmp; \ if ((m)->m_len >= (off) + (len)) \ (val) = (typ)(mtod((m), char *) + (off)); \ else { \ _t = m_pulldown((m), (off), (len), &_tmp); \ if (_t) { \ if (_t->m_len < _tmp + (len)) \ panic("m_pulldown malfunction"); \ (val) = (typ)(mtod(_t, char *) + _tmp); \ } else { \ (val) = (typ)NULL; \ (m) = NULL; \ } \ } \ } while (/*CONSTCOND*/ 0) #endif /* defined(_KERNEL) */ /* * Simple mbuf queueing system * * this is basically a SIMPLEQ adapted to mbuf use (ie using * m_nextpkt instead of field.sqe_next). 
 *
 * m_next is ignored, so queueing chains of mbufs is possible
 */
#define MBUFQ_HEAD(name)					\
struct name {							\
	struct mbuf *mq_first;					\
	struct mbuf **mq_last;					\
}

#define MBUFQ_INIT(q)		do {				\
	(q)->mq_first = NULL;					\
	(q)->mq_last = &(q)->mq_first;				\
} while (/*CONSTCOND*/0)

#define MBUFQ_ENQUEUE(q, m)	do {				\
	(m)->m_nextpkt = NULL;					\
	*(q)->mq_last = (m);					\
	(q)->mq_last = &(m)->m_nextpkt;				\
} while (/*CONSTCOND*/0)

#define MBUFQ_PREPEND(q, m)	do {				\
	if (((m)->m_nextpkt = (q)->mq_first) == NULL)		\
		(q)->mq_last = &(m)->m_nextpkt;			\
	(q)->mq_first = (m);					\
} while (/*CONSTCOND*/0)

#define MBUFQ_DEQUEUE(q, m)	do {				\
	if (((m) = (q)->mq_first) != NULL) {			\
		if (((q)->mq_first = (m)->m_nextpkt) == NULL)	\
			(q)->mq_last = &(q)->mq_first;		\
		else						\
			(m)->m_nextpkt = NULL;			\
	}							\
} while (/*CONSTCOND*/0)

#define MBUFQ_DRAIN(q)		do {				\
	struct mbuf *__m0;					\
	while ((__m0 = (q)->mq_first) != NULL) {		\
		(q)->mq_first = __m0->m_nextpkt;		\
		m_freem(__m0);					\
	}							\
	(q)->mq_last = &(q)->mq_first;				\
} while (/*CONSTCOND*/0)

#define MBUFQ_FIRST(q)		((q)->mq_first)
#define MBUFQ_NEXT(m)		((m)->m_nextpkt)
#define MBUFQ_LAST(q)		(*(q)->mq_last)

/*
 * Mbuf statistics.
 * For statistics related to mbuf and cluster allocations, see also the
 * pool headers (mb_cache and mcl_cache).
 */
struct mbstat {
	u_long	_m_spare;	/* formerly m_mbufs */
	u_long	_m_spare1;	/* formerly m_clusters */
	u_long	_m_spare2;	/* spare field */
	u_long	_m_spare3;	/* formerly m_clfree - free clusters */
	u_long	m_drops;	/* times failed to find space */
	u_long	m_wait;		/* times waited for space */
	u_long	m_drain;	/* times drained protocols for space */
	u_short	m_mtypes[256];	/* type specific mbuf allocations */
};

struct mbstat_cpu {
	u_int	m_mtypes[256];	/* type specific mbuf allocations */
};

/*
 * Mbuf sysctl variables.
 */
#define MBUF_MSIZE		1	/* int: mbuf base size */
#define MBUF_MCLBYTES		2	/* int: mbuf cluster size */
#define MBUF_NMBCLUSTERS	3	/* int: limit on the # of clusters */
#define MBUF_MBLOWAT		4	/* int: mbuf low water mark */
#define MBUF_MCLLOWAT		5	/* int: mbuf cluster low water mark */
#define MBUF_STATS		6	/* struct: mbstat */
#define MBUF_MOWNERS		7	/* struct: m_owner[] */
#define MBUF_NMBCLUSTERS_LIMIT	8	/* int: limit of nmbclusters */

#ifdef _KERNEL
extern struct mbstat mbstat;
extern int nmbclusters;		/* limit on the # of clusters */
extern int mblowat;		/* mbuf low water mark */
extern int mcllowat;		/* mbuf cluster low water mark */
extern int max_linkhdr;		/* largest link-level header */
extern int max_protohdr;	/* largest protocol header */
extern int max_hdr;		/* largest link+protocol header */
extern int max_datalen;		/* MHLEN - max_hdr */
extern const int msize;		/* mbuf base size */
extern const int mclbytes;	/* mbuf cluster size */
extern pool_cache_t mb_cache;
#ifdef MBUFTRACE
LIST_HEAD(mownerhead, mowner);
extern struct mownerhead mowners;
extern struct mowner unknown_mowners[];
extern struct mowner revoked_mowner;
#endif

MALLOC_DECLARE(M_MBUF);
MALLOC_DECLARE(M_SONAME);

struct mbuf *m_copym(struct mbuf *, int, int, int);
struct mbuf *m_copypacket(struct mbuf *, int);
struct mbuf *m_devget(char *, int, int, struct ifnet *);
struct mbuf *m_dup(struct mbuf *, int, int, int);
struct mbuf *m_get(int, int);
struct mbuf *m_gethdr(int, int);
struct mbuf *m_get_n(int, int, size_t, size_t);
struct mbuf *m_gethdr_n(int, int, size_t, size_t);
struct mbuf *m_prepend(struct mbuf *,int, int);
struct mbuf *m_pulldown(struct mbuf *, int, int, int *);
struct mbuf *m_pullup(struct mbuf *, int);
struct mbuf *m_copyup(struct mbuf *, int, int);
struct mbuf
*m_split(struct mbuf *,int, int); struct mbuf *m_getptr(struct mbuf *, int, int *); void m_adj(struct mbuf *, int); struct mbuf *m_defrag(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, unsigned int), void *); void m_cat(struct mbuf *,struct mbuf *); void m_clget(struct mbuf *, int); void m_copyback(struct mbuf *, int, int, const void *); struct mbuf *m_copyback_cow(struct mbuf *, int, int, const void *, int); int m_makewritable(struct mbuf **, int, int, int); struct mbuf *m_getcl(int, int, int); void m_copydata(struct mbuf *, int, int, void *); void m_verify_packet(struct mbuf *); struct mbuf *m_free(struct mbuf *); void m_freem(struct mbuf *); void mbinit(void); void m_remove_pkthdr(struct mbuf *); void m_copy_pkthdr(struct mbuf *, struct mbuf *); void m_move_pkthdr(struct mbuf *, struct mbuf *); void m_align(struct mbuf *, int); bool m_ensure_contig(struct mbuf **, int); struct mbuf *m_add(struct mbuf *, struct mbuf *); /* Inline routines. */ static __inline u_int m_length(const struct mbuf *) __unused; /* Statistics */ void mbstat_type_add(int, int); /* Packet tag routines */ struct m_tag *m_tag_get(int, int, int); void m_tag_free(struct m_tag *); void m_tag_prepend(struct mbuf *, struct m_tag *); void m_tag_unlink(struct mbuf *, struct m_tag *); void m_tag_delete(struct mbuf *, struct m_tag *); void m_tag_delete_chain(struct mbuf *); struct m_tag *m_tag_find(const struct mbuf *, int); struct m_tag *m_tag_copy(struct m_tag *); int m_tag_copy_chain(struct mbuf *, struct mbuf *); /* Packet tag types */ #define PACKET_TAG_NONE 0 /* Nothing */ #define PACKET_TAG_SO 4 /* sending socket pointer */ #define PACKET_TAG_NPF 10 /* packet filter */ #define PACKET_TAG_PF 11 /* packet filter */ #define PACKET_TAG_ALTQ_QID 12 /* ALTQ queue id */ #define PACKET_TAG_IPSEC_OUT_DONE 18 #define PACKET_TAG_IPSEC_NAT_T_PORTS 25 /* two uint16_t */ #define PACKET_TAG_INET6 26 /* IPv6 info */ #define PACKET_TAG_TUNNEL_INFO 28 /* tunnel identification and * protocol callback, for loop * detection/recovery */ #define PACKET_TAG_MPLS 29 /* Indicate it's for MPLS */ #define PACKET_TAG_SRCROUTE 30 /* IPv4 source routing */ #define PACKET_TAG_ETHERNET_SRC 31 /* Ethernet source address */ /* * Return the number of bytes in the mbuf chain, m. */ static __inline u_int m_length(const struct mbuf *m) { const struct mbuf *m0; u_int pktlen; if ((m->m_flags & M_PKTHDR) != 0) return m->m_pkthdr.len; pktlen = 0; for (m0 = m; m0 != NULL; m0 = m0->m_next) pktlen += m0->m_len; return pktlen; } static __inline void m_set_rcvif(struct mbuf *m, const struct ifnet *ifp) { KASSERT(m->m_flags & M_PKTHDR); m->m_pkthdr.rcvif_index = ifp->if_index; } static __inline void m_reset_rcvif(struct mbuf *m) { KASSERT(m->m_flags & M_PKTHDR); /* A caller may expect whole _rcvif union is zeroed */ /* m->m_pkthdr.rcvif_index = 0; */ m->m_pkthdr._rcvif.ctx = NULL; } static __inline void m_copy_rcvif(struct mbuf *m, const struct mbuf *n) { KASSERT(m->m_flags & M_PKTHDR); KASSERT(n->m_flags & M_PKTHDR); m->m_pkthdr.rcvif_index = n->m_pkthdr.rcvif_index; } #define M_GET_ALIGNED_HDR(m, type, linkhdr) \ m_get_aligned_hdr((m), __alignof(type) - 1, sizeof(type), (linkhdr)) static __inline int m_get_aligned_hdr(struct mbuf **m, int mask, size_t hlen, bool linkhdr) { #ifndef __NO_STRICT_ALIGNMENT if (((uintptr_t)mtod(*m, void *) & mask) != 0) *m = m_copyup(*m, hlen, linkhdr ? 
		    (max_linkhdr + mask) & ~mask : 0);
	else
#endif
	if (__predict_false((size_t)(*m)->m_len < hlen))
		*m = m_pullup(*m, hlen);

	return *m == NULL;
}

void	m_print(const struct mbuf *, const char *,
    void (*)(const char *, ...) __printflike(1, 2));

/* from uipc_mbufdebug.c */
void	m_examine(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));

/* parsers for m_examine() */
void	m_examine_ether(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));
void	m_examine_pppoe(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));
void	m_examine_ppp(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));
void	m_examine_arp(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));
void	m_examine_ip(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));
void	m_examine_icmp(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));
void	m_examine_ip6(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));
void	m_examine_icmp6(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));
void	m_examine_tcp(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));
void	m_examine_udp(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));
void	m_examine_hex(const struct mbuf *, int, const char *,
    void (*)(const char *, ...) __printflike(1, 2));

/*
 * Get the rcvif of an mbuf.
 *
 * The caller must call m_put_rcvif after using rcvif if the returned rcvif
 * isn't NULL. If the returned rcvif is NULL, the caller doesn't need to call
 * m_put_rcvif (although calling it is safe).
 *
 * The caller must not block or sleep while using rcvif. The API ensures a
 * returned rcvif isn't freed until m_put_rcvif is called.
 */
static __inline struct ifnet *
m_get_rcvif(const struct mbuf *m, int *s)
{
	struct ifnet *ifp;

	KASSERT(m->m_flags & M_PKTHDR);

	*s = pserialize_read_enter();
	ifp = if_byindex(m->m_pkthdr.rcvif_index);
	if (__predict_false(ifp == NULL))
		pserialize_read_exit(*s);

	return ifp;
}

static __inline void
m_put_rcvif(struct ifnet *ifp, int *s)
{

	if (ifp == NULL)
		return;

	pserialize_read_exit(*s);
}

/*
 * Get the rcvif of an mbuf.
 *
 * The caller must call m_put_rcvif_psref after using the rcvif. The API
 * ensures that the returned rcvif isn't freed until m_put_rcvif_psref is
 * called.
 */
static __inline struct ifnet *
m_get_rcvif_psref(const struct mbuf *m, struct psref *psref)
{

	KASSERT(m->m_flags & M_PKTHDR);

	return if_get_byindex(m->m_pkthdr.rcvif_index, psref);
}

static __inline void
m_put_rcvif_psref(struct ifnet *ifp, struct psref *psref)
{

	if (ifp == NULL)
		return;

	if_put(ifp, psref);
}

/*
 * Get the rcvif of an mbuf.
 *
 * This is NOT an MP-safe API and must not be used where MP-safety is
 * required.
 */
static __inline struct ifnet *
m_get_rcvif_NOMPSAFE(const struct mbuf *m)
{

	KASSERT(m->m_flags & M_PKTHDR);

	return if_byindex(m->m_pkthdr.rcvif_index);
}

#endif /* _KERNEL */
#endif /* !_SYS_MBUF_H_ */
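/*
 * Editor's illustrative sketch (not part of sys/mbuf.h): a minimal example
 * of the allocation and queueing interfaces declared above, assuming an
 * ordinary kernel context.  The names example_softc, sc_sendq and
 * example_stage_packet are hypothetical.  The sketch allocates a
 * packet-header mbuf, attaches a cluster when the payload does not fit in
 * the internal buffer, and stages the result on an MBUFQ (which the owner
 * would have set up with MBUFQ_INIT at attach time).
 */
#ifdef notyet	/* example only, never compiled */
struct example_softc {
	MBUFQ_HEAD(example_q) sc_sendq;	/* hypothetical per-driver queue */
};

static int
example_stage_packet(struct example_softc *sc, const void *buf, size_t len)
{
	struct mbuf *m;

	m = m_gethdr(M_DONTWAIT, MT_DATA);	/* same as MGETHDR() */
	if (m == NULL)
		return ENOBUFS;

	if (len > MHLEN) {
		MCLGET(m, M_DONTWAIT);		/* add an external cluster */
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			return ENOBUFS;
		}
	}

	memcpy(mtod(m, void *), buf, len);
	m->m_len = m->m_pkthdr.len = len;

	MBUFQ_ENQUEUE(&sc->sc_sendq, m);
	return 0;
}
#endif /* notyet */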
/*	$NetBSD: fdesc_vfsops.c,v 1.96 2020/04/13 19:23:18 ad Exp $	*/

/*
 * Copyright (c) 1992, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software donated to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * @(#)fdesc_vfsops.c 8.10 (Berkeley) 5/14/95 * * #Id: fdesc_vfsops.c,v 1.9 1993/04/06 15:28:33 jsp Exp # */ /* * /dev/fd Filesystem */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: fdesc_vfsops.c,v 1.96 2020/04/13 19:23:18 ad Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/time.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/filedesc.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/dirent.h> #include <sys/namei.h> #include <sys/kauth.h> #include <sys/module.h> #include <miscfs/genfs/genfs.h> #include <miscfs/fdesc/fdesc.h> MODULE(MODULE_CLASS_VFS, fdesc, NULL); VFS_PROTOS(fdesc); /* * Mount the per-process file descriptors (/dev/fd) */ int fdesc_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; int error = 0, ix; struct vnode *rvp; if (mp->mnt_flag & MNT_GETARGS) { *data_len = 0; return 0; } /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); ix = FD_ROOT; error = vcache_get(mp, &ix, sizeof(ix), &rvp); if (error) return error; mp->mnt_stat.f_namemax = FDESC_MAXNAMLEN; mp->mnt_flag |= MNT_LOCAL; mp->mnt_data = rvp; vfs_getnewfsid(mp); error = set_statvfs_info(path, UIO_USERSPACE, "fdesc", UIO_SYSSPACE, mp->mnt_op->vfs_name, mp, l); return error; } int fdesc_start(struct mount *mp, int flags) { return (0); } int fdesc_unmount(struct mount *mp, int mntflags) { int error; int flags = 0; struct vnode *rtvp = mp->mnt_data; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if (vrefcnt(rtvp) > 1 && (mntflags & MNT_FORCE) == 0) return (EBUSY); if ((error = vflush(mp, rtvp, flags)) != 0) return (error); /* * Blow it away for future re-use */ vgone(rtvp); mp->mnt_data = NULL; return (0); } int fdesc_root(struct mount *mp, int lktype, struct vnode **vpp) { struct vnode *vp; /* * Return locked reference to root. */ vp = mp->mnt_data; vref(vp); vn_lock(vp, lktype | LK_RETRY); *vpp = vp; return (0); } /*ARGSUSED*/ int fdesc_sync(struct mount *mp, int waitfor, kauth_cred_t uc) { return (0); } /* * Fdesc flat namespace lookup. * Currently unsupported. 
*/ int fdesc_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { return (EOPNOTSUPP); } int fdesc_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { int ix; struct fdescnode *fd; KASSERT(key_len == sizeof(ix)); memcpy(&ix, key, key_len); fd = kmem_alloc(sizeof(struct fdescnode), KM_SLEEP); fd->fd_fd = -1; fd->fd_link = NULL; fd->fd_ix = ix; fd->fd_vnode = vp; vp->v_tag = VT_FDESC; vp->v_op = fdesc_vnodeop_p; vp->v_data = fd; switch (ix) { case FD_ROOT: fd->fd_type = Froot; vp->v_type = VDIR; vp->v_vflag |= VV_ROOT; break; case FD_DEVFD: fd->fd_type = Fdevfd; vp->v_type = VDIR; break; case FD_CTTY: fd->fd_type = Fctty; vp->v_type = VCHR; break; case FD_STDIN: fd->fd_type = Flink; fd->fd_link = "fd/0"; vp->v_type = VLNK; break; case FD_STDOUT: fd->fd_type = Flink; fd->fd_link = "fd/1"; vp->v_type = VLNK; break; case FD_STDERR: fd->fd_type = Flink; fd->fd_link = "fd/2"; vp->v_type = VLNK; break; default: KASSERT(ix >= FD_DESC); fd->fd_type = Fdesc; fd->fd_fd = ix - FD_DESC; vp->v_type = VNON; break; } uvm_vnp_setsize(vp, 0); *new_key = &fd->fd_ix; return 0; } extern const struct vnodeopv_desc fdesc_vnodeop_opv_desc; const struct vnodeopv_desc * const fdesc_vnodeopv_descs[] = { &fdesc_vnodeop_opv_desc, NULL, }; struct vfsops fdesc_vfsops = { .vfs_name = MOUNT_FDESC, .vfs_min_mount_data = 0, .vfs_mount = fdesc_mount, .vfs_start = fdesc_start, .vfs_unmount = fdesc_unmount, .vfs_root = fdesc_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = genfs_statvfs, .vfs_sync = fdesc_sync, .vfs_vget = fdesc_vget, .vfs_loadvnode = fdesc_loadvnode, .vfs_fhtovp = (void *)eopnotsupp, .vfs_vptofh = (void *)eopnotsupp, .vfs_init = fdesc_init, .vfs_done = fdesc_done, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = fdesc_vnodeopv_descs }; SYSCTL_SETUP(fdesc_sysctl_setup, "fdesc sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "fdesc", SYSCTL_DESCR("File-descriptor file system"), NULL, 0, NULL, 0, CTL_VFS, 7, CTL_EOL); /* * XXX the "7" above could be dynamic, thereby eliminating one * more instance of the "number to vfs" mapping problem, but * "7" is the order as taken from sys/mount.h */ } static int fdesc_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&fdesc_vfsops); if (error != 0) break; break; case MODULE_CMD_FINI: error = vfs_detach(&fdesc_vfsops); if (error != 0) break; break; default: error = ENOTTY; break; } return (error); }
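/*
 * Editor's illustrative sketch (not part of fdesc_vfsops.c): a minimal
 * userland program that mounts the file system implemented above on
 * /dev/fd, which is how fdesc_mount() is normally reached (the stock
 * mount_fdesc(8) utility does essentially the same thing).  This assumes
 * the five-argument NetBSD mount(2) prototype and the MOUNT_FDESC string
 * from <sys/mount.h>; fdesc ignores its mount-argument pointer, so NULL
 * and a zero length are passed.
 */
#ifdef notyet	/* example only, never compiled with the kernel */
#include <sys/param.h>
#include <sys/mount.h>
#include <err.h>
#include <stdlib.h>

int
main(void)
{
	/* MOUNT_FDESC is "fdesc"; the target is conventionally /dev/fd. */
	if (mount(MOUNT_FDESC, "/dev/fd", 0, NULL, 0) == -1)
		err(EXIT_FAILURE, "mount fdesc on /dev/fd");
	return EXIT_SUCCESS;
}
#endif /* notyet */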
/*	$NetBSD: mly.c,v 1.56 2021/09/03 22:33:17 andvar Exp $	*/

/*-
 * Copyright (c) 2001 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran, Thor Lancelot Simon, and Eric Haszlakiewicz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2000, 2001 Michael Smith
 * Copyright (c) 2000 BSDi
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from FreeBSD: mly.c,v 1.8 2001/07/14 00:12:22 msmith Exp
 */

/*
 * Driver for the Mylex AcceleRAID and eXtremeRAID family with v6 firmware.
* * TODO: * * o Make mly->mly_btl a hash, then MLY_BTL_RESCAN becomes a SIMPLEQ. * o Handle FC and multiple LUNs. * o Fix mmbox usage. * o Fix transfer speed fudge. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: mly.c,v 1.56 2021/09/03 22:33:17 andvar Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/device.h> #include <sys/kernel.h> #include <sys/queue.h> #include <sys/buf.h> #include <sys/endian.h> #include <sys/conf.h> #include <sys/malloc.h> #include <sys/ioctl.h> #include <sys/scsiio.h> #include <sys/kthread.h> #include <sys/kauth.h> #include <sys/bus.h> #include <dev/scsipi/scsi_all.h> #include <dev/scsipi/scsipi_all.h> #include <dev/scsipi/scsiconf.h> #include <dev/pci/pcireg.h> #include <dev/pci/pcivar.h> #include <dev/pci/pcidevs.h> #include <dev/pci/mlyreg.h> #include <dev/pci/mlyio.h> #include <dev/pci/mlyvar.h> #include <dev/pci/mly_tables.h> static void mly_attach(device_t, device_t, void *); static int mly_match(device_t, cfdata_t, void *); static const struct mly_ident *mly_find_ident(struct pci_attach_args *); static int mly_fwhandshake(struct mly_softc *); static int mly_flush(struct mly_softc *); static int mly_intr(void *); static void mly_shutdown(void *); static int mly_alloc_ccbs(struct mly_softc *); static void mly_check_event(struct mly_softc *); static void mly_complete_event(struct mly_softc *, struct mly_ccb *); static void mly_complete_rescan(struct mly_softc *, struct mly_ccb *); static int mly_dmamem_alloc(struct mly_softc *, int, bus_dmamap_t *, void **, bus_addr_t *, bus_dma_segment_t *); static void mly_dmamem_free(struct mly_softc *, int, bus_dmamap_t, void *, bus_dma_segment_t *); static int mly_enable_mmbox(struct mly_softc *); static void mly_fetch_event(struct mly_softc *); static int mly_get_controllerinfo(struct mly_softc *); static int mly_get_eventstatus(struct mly_softc *); static int mly_ioctl(struct mly_softc *, struct mly_cmd_ioctl *, void **, size_t, void *, size_t *); static void mly_padstr(char *, const char *, int); static void mly_process_event(struct mly_softc *, struct mly_event *); static void mly_release_ccbs(struct mly_softc *); static int mly_scan_btl(struct mly_softc *, int, int); static void mly_scan_channel(struct mly_softc *, int); static void mly_thread(void *); static int mly_ccb_alloc(struct mly_softc *, struct mly_ccb **); static void mly_ccb_complete(struct mly_softc *, struct mly_ccb *); static void mly_ccb_enqueue(struct mly_softc *, struct mly_ccb *); static void mly_ccb_free(struct mly_softc *, struct mly_ccb *); static int mly_ccb_map(struct mly_softc *, struct mly_ccb *); static int mly_ccb_poll(struct mly_softc *, struct mly_ccb *, int); static int mly_ccb_submit(struct mly_softc *, struct mly_ccb *); static void mly_ccb_unmap(struct mly_softc *, struct mly_ccb *); static int mly_ccb_wait(struct mly_softc *, struct mly_ccb *, int); static void mly_get_xfer_mode(struct mly_softc *, int, struct scsipi_xfer_mode *); static void mly_scsipi_complete(struct mly_softc *, struct mly_ccb *); static int mly_scsipi_ioctl(struct scsipi_channel *, u_long, void *, int, struct proc *); static void mly_scsipi_minphys(struct buf *); static void mly_scsipi_request(struct scsipi_channel *, scsipi_adapter_req_t, void *); static int mly_user_command(struct mly_softc *, struct mly_user_command *); static int mly_user_health(struct mly_softc *, struct mly_user_health *); extern struct cfdriver mly_cd; CFATTACH_DECL_NEW(mly, sizeof(struct mly_softc), mly_match, mly_attach, NULL, NULL); dev_type_open(mlyopen); 
dev_type_close(mlyclose); dev_type_ioctl(mlyioctl); const struct cdevsw mly_cdevsw = { .d_open = mlyopen, .d_close = mlyclose, .d_read = noread, .d_write = nowrite, .d_ioctl = mlyioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER }; static struct mly_ident { u_short vendor; u_short product; u_short subvendor; u_short subproduct; int hwif; const char *desc; } const mly_ident[] = { { PCI_VENDOR_MYLEX, PCI_PRODUCT_MYLEX_EXTREMERAID, PCI_VENDOR_MYLEX, 0x0040, MLY_HWIF_STRONGARM, "eXtremeRAID 2000" }, { PCI_VENDOR_MYLEX, PCI_PRODUCT_MYLEX_EXTREMERAID, PCI_VENDOR_MYLEX, 0x0030, MLY_HWIF_STRONGARM, "eXtremeRAID 3000" }, { PCI_VENDOR_MYLEX, PCI_PRODUCT_MYLEX_ACCELERAID, PCI_VENDOR_MYLEX, 0x0050, MLY_HWIF_I960RX, "AcceleRAID 352" }, { PCI_VENDOR_MYLEX, PCI_PRODUCT_MYLEX_ACCELERAID, PCI_VENDOR_MYLEX, 0x0052, MLY_HWIF_I960RX, "AcceleRAID 170" }, { PCI_VENDOR_MYLEX, PCI_PRODUCT_MYLEX_ACCELERAID, PCI_VENDOR_MYLEX, 0x0054, MLY_HWIF_I960RX, "AcceleRAID 160" }, }; static void *mly_sdh; /* * Try to find a `mly_ident' entry corresponding to this board. */ static const struct mly_ident * mly_find_ident(struct pci_attach_args *pa) { const struct mly_ident *mpi, *maxmpi; pcireg_t reg; mpi = mly_ident; maxmpi = mpi + sizeof(mly_ident) / sizeof(mly_ident[0]); if (PCI_CLASS(pa->pa_class) == PCI_CLASS_I2O) return (NULL); for (; mpi < maxmpi; mpi++) { if (PCI_VENDOR(pa->pa_id) != mpi->vendor || PCI_PRODUCT(pa->pa_id) != mpi->product) continue; if (mpi->subvendor == 0x0000) return (mpi); reg = pci_conf_read(pa->pa_pc, pa->pa_tag, PCI_SUBSYS_ID_REG); if (PCI_VENDOR(reg) == mpi->subvendor && PCI_PRODUCT(reg) == mpi->subproduct) return (mpi); } return (NULL); } /* * Match a supported board. */ static int mly_match(device_t parent, cfdata_t cfdata, void *aux) { return (mly_find_ident(aux) != NULL); } /* * Attach a supported board. */ static void mly_attach(device_t parent, device_t self, void *aux) { struct pci_attach_args *pa; struct mly_softc *mly; struct mly_ioctl_getcontrollerinfo *mi; const struct mly_ident *ident; pci_chipset_tag_t pc; pci_intr_handle_t ih; bus_space_handle_t memh, ioh; bus_space_tag_t memt, iot; pcireg_t reg; const char *intrstr; int ior, memr, i, rv, state; struct scsipi_adapter *adapt; struct scsipi_channel *chan; char intrbuf[PCI_INTRSTR_LEN]; mly = device_private(self); mly->mly_dv = self; pa = aux; pc = pa->pa_pc; ident = mly_find_ident(pa); state = 0; mly->mly_dmat = pa->pa_dmat; mly->mly_hwif = ident->hwif; printf(": Mylex %s\n", ident->desc); /* * Map the PCI register window. */ memr = -1; ior = -1; for (i = 0x10; i <= 0x14; i += 4) { reg = pci_conf_read(pa->pa_pc, pa->pa_tag, i); if (PCI_MAPREG_TYPE(reg) == PCI_MAPREG_TYPE_IO) { if (ior == -1 && PCI_MAPREG_IO_SIZE(reg) != 0) ior = i; } else { if (memr == -1 && PCI_MAPREG_MEM_SIZE(reg) != 0) memr = i; } } if (memr != -1) if (pci_mapreg_map(pa, memr, PCI_MAPREG_TYPE_MEM, 0, &memt, &memh, NULL, NULL)) memr = -1; if (ior != -1) if (pci_mapreg_map(pa, ior, PCI_MAPREG_TYPE_IO, 0, &iot, &ioh, NULL, NULL)) ior = -1; if (memr != -1) { mly->mly_iot = memt; mly->mly_ioh = memh; } else if (ior != -1) { mly->mly_iot = iot; mly->mly_ioh = ioh; } else { aprint_error_dev(self, "can't map i/o or memory space\n"); return; } /* * Enable the device. */ reg = pci_conf_read(pa->pa_pc, pa->pa_tag, PCI_COMMAND_STATUS_REG); pci_conf_write(pa->pa_pc, pa->pa_tag, PCI_COMMAND_STATUS_REG, reg | PCI_COMMAND_MASTER_ENABLE); /* * Map and establish the interrupt. 
*/ if (pci_intr_map(pa, &ih)) { aprint_error_dev(self, "can't map interrupt\n"); return; } intrstr = pci_intr_string(pc, ih, intrbuf, sizeof(intrbuf)); mly->mly_ih = pci_intr_establish_xname(pc, ih, IPL_BIO, mly_intr, mly, device_xname(self)); if (mly->mly_ih == NULL) { aprint_error_dev(self, "can't establish interrupt"); if (intrstr != NULL) aprint_error(" at %s", intrstr); aprint_error("\n"); return; } if (intrstr != NULL) aprint_normal_dev(self, "interrupting at %s\n", intrstr); /* * Take care of interface-specific tasks. */ switch (mly->mly_hwif) { case MLY_HWIF_I960RX: mly->mly_doorbell_true = 0x00; mly->mly_cmd_mailbox = MLY_I960RX_COMMAND_MAILBOX; mly->mly_status_mailbox = MLY_I960RX_STATUS_MAILBOX; mly->mly_idbr = MLY_I960RX_IDBR; mly->mly_odbr = MLY_I960RX_ODBR; mly->mly_error_status = MLY_I960RX_ERROR_STATUS; mly->mly_interrupt_status = MLY_I960RX_INTERRUPT_STATUS; mly->mly_interrupt_mask = MLY_I960RX_INTERRUPT_MASK; break; case MLY_HWIF_STRONGARM: mly->mly_doorbell_true = 0xff; mly->mly_cmd_mailbox = MLY_STRONGARM_COMMAND_MAILBOX; mly->mly_status_mailbox = MLY_STRONGARM_STATUS_MAILBOX; mly->mly_idbr = MLY_STRONGARM_IDBR; mly->mly_odbr = MLY_STRONGARM_ODBR; mly->mly_error_status = MLY_STRONGARM_ERROR_STATUS; mly->mly_interrupt_status = MLY_STRONGARM_INTERRUPT_STATUS; mly->mly_interrupt_mask = MLY_STRONGARM_INTERRUPT_MASK; break; } /* * Allocate and map the scatter/gather lists. */ rv = mly_dmamem_alloc(mly, MLY_SGL_SIZE * MLY_MAX_CCBS, &mly->mly_sg_dmamap, (void **)&mly->mly_sg, &mly->mly_sg_busaddr, &mly->mly_sg_seg); if (rv) { printf("%s: unable to allocate S/G maps\n", device_xname(self)); goto bad; } state++; /* * Allocate and map the memory mailbox. */ rv = mly_dmamem_alloc(mly, sizeof(struct mly_mmbox), &mly->mly_mmbox_dmamap, (void **)&mly->mly_mmbox, &mly->mly_mmbox_busaddr, &mly->mly_mmbox_seg); if (rv) { aprint_error_dev(self, "unable to allocate mailboxes\n"); goto bad; } state++; /* * Initialise per-controller queues. */ SLIST_INIT(&mly->mly_ccb_free); SIMPLEQ_INIT(&mly->mly_ccb_queue); /* * Disable interrupts before we start talking to the controller. */ mly_outb(mly, mly->mly_interrupt_mask, MLY_INTERRUPT_MASK_DISABLE); /* * Wait for the controller to come ready, handshaking with the * firmware if required. This is typically only necessary on * platforms where the controller BIOS does not run. */ if (mly_fwhandshake(mly)) { aprint_error_dev(self, "unable to bring controller online\n"); goto bad; } /* * Allocate initial command buffers, obtain controller feature * information, and then reallocate command buffers, since we'll * know how many we want. */ if (mly_alloc_ccbs(mly)) { aprint_error_dev(self, "unable to allocate CCBs\n"); goto bad; } state++; if (mly_get_controllerinfo(mly)) { aprint_error_dev(self, "unable to retrieve controller info\n"); goto bad; } mly_release_ccbs(mly); if (mly_alloc_ccbs(mly)) { aprint_error_dev(self, "unable to allocate CCBs\n"); state--; goto bad; } /* * Get the current event counter for health purposes, populate the * initial health status buffer. */ if (mly_get_eventstatus(mly)) { aprint_error_dev(self, "unable to retrieve event status\n"); goto bad; } /* * Enable memory-mailbox mode. */ if (mly_enable_mmbox(mly)) { aprint_error_dev(self, "unable to enable memory mailbox\n"); goto bad; } /* * Print a little information about the controller. 
*/ mi = mly->mly_controllerinfo; printf("%s: %d physical channel%s, firmware %d.%02d-%d-%02d " "(%02d%02d%02d%02d), %dMB RAM\n", device_xname(self), mi->physical_channels_present, (mi->physical_channels_present) > 1 ? "s" : "", mi->fw_major, mi->fw_minor, mi->fw_turn, mi->fw_build, mi->fw_century, mi->fw_year, mi->fw_month, mi->fw_day, le16toh(mi->memory_size)); /* * Register our `shutdownhook'. */ if (mly_sdh == NULL) shutdownhook_establish(mly_shutdown, NULL); /* * Clear any previous BTL information. For each bus that scsipi * wants to scan, we'll receive the SCBUSIOLLSCAN ioctl and retrieve * all BTL info at that point. */ memset(&mly->mly_btl, 0, sizeof(mly->mly_btl)); mly->mly_nchans = mly->mly_controllerinfo->physical_channels_present + mly->mly_controllerinfo->virtual_channels_present; /* * Attach to scsipi. */ adapt = &mly->mly_adapt; memset(adapt, 0, sizeof(*adapt)); adapt->adapt_dev = self; adapt->adapt_nchannels = mly->mly_nchans; adapt->adapt_openings = mly->mly_ncmds - MLY_CCBS_RESV; adapt->adapt_max_periph = mly->mly_ncmds - MLY_CCBS_RESV; adapt->adapt_request = mly_scsipi_request; adapt->adapt_minphys = mly_scsipi_minphys; adapt->adapt_ioctl = mly_scsipi_ioctl; for (i = 0; i < mly->mly_nchans; i++) { chan = &mly->mly_chans[i]; memset(chan, 0, sizeof(*chan)); chan->chan_adapter = adapt; chan->chan_bustype = &scsi_bustype; chan->chan_channel = i; chan->chan_ntargets = MLY_MAX_TARGETS; chan->chan_nluns = MLY_MAX_LUNS; chan->chan_id = mly->mly_controllerparam->initiator_id; chan->chan_flags = SCSIPI_CHAN_NOSETTLE; config_found(self, chan, scsiprint, CFARGS_NONE); } /* * Now enable interrupts... */ mly_outb(mly, mly->mly_interrupt_mask, MLY_INTERRUPT_MASK_ENABLE); /* * Finally, create our monitoring thread. */ mly->mly_state |= MLY_STATE_INITOK; rv = kthread_create(PRI_NONE, 0, NULL, mly_thread, mly, &mly->mly_thread, "%s", device_xname(self)); if (rv != 0) aprint_error_dev(self, "unable to create thread (%d)\n", rv); return; bad: if (state > 2) mly_release_ccbs(mly); if (state > 1) mly_dmamem_free(mly, sizeof(struct mly_mmbox), mly->mly_mmbox_dmamap, (void *)mly->mly_mmbox, &mly->mly_mmbox_seg); if (state > 0) mly_dmamem_free(mly, MLY_SGL_SIZE * MLY_MAX_CCBS, mly->mly_sg_dmamap, (void *)mly->mly_sg, &mly->mly_sg_seg); } /* * Scan all possible devices on the specified channel. */ static void mly_scan_channel(struct mly_softc *mly, int bus) { int s, target; for (target = 0; target < MLY_MAX_TARGETS; target++) { s = splbio(); if (!mly_scan_btl(mly, bus, target)) { tsleep(&mly->mly_btl[bus][target], PRIBIO, "mlyscan", 0); } splx(s); } } /* * Shut down all configured `mly' devices. */ static void mly_shutdown(void *cookie) { struct mly_softc *mly; int i; for (i = 0; i < mly_cd.cd_ndevs; i++) { if ((mly = device_lookup_private(&mly_cd, i)) == NULL) continue; if (mly_flush(mly)) aprint_error_dev(mly->mly_dv, "unable to flush cache\n"); } } /* * Fill in the mly_controllerinfo and mly_controllerparam fields in the * softc. */ static int mly_get_controllerinfo(struct mly_softc *mly) { struct mly_cmd_ioctl mci; int rv; /* * Build the getcontrollerinfo ioctl and send it. */ memset(&mci, 0, sizeof(mci)); mci.sub_ioctl = MDACIOCTL_GETCONTROLLERINFO; rv = mly_ioctl(mly, &mci, (void **)&mly->mly_controllerinfo, sizeof(*mly->mly_controllerinfo), NULL, NULL); if (rv != 0) return (rv); /* * Build the getcontrollerparameter ioctl and send it. 
*/ memset(&mci, 0, sizeof(mci)); mci.sub_ioctl = MDACIOCTL_GETCONTROLLERPARAMETER; rv = mly_ioctl(mly, &mci, (void **)&mly->mly_controllerparam, sizeof(*mly->mly_controllerparam), NULL, NULL); return (rv); } /* * Rescan a device, possibly as a consequence of getting an event which * suggests that it may have changed. Must be called with interrupts * blocked. */ static int mly_scan_btl(struct mly_softc *mly, int bus, int target) { struct mly_ccb *mc; struct mly_cmd_ioctl *mci; int rv; if (target == mly->mly_controllerparam->initiator_id) { mly->mly_btl[bus][target].mb_flags = MLY_BTL_PROTECTED; return (EIO); } /* Don't re-scan if a scan is already in progress. */ if ((mly->mly_btl[bus][target].mb_flags & MLY_BTL_SCANNING) != 0) return (EBUSY); /* Get a command. */ if ((rv = mly_ccb_alloc(mly, &mc)) != 0) return (rv); /* Set up the data buffer. */ mc->mc_data = malloc(sizeof(union mly_devinfo), M_DEVBUF, M_NOWAIT|M_ZERO); mc->mc_flags |= MLY_CCB_DATAIN; mc->mc_complete = mly_complete_rescan; /* * Build the ioctl. */ mci = (struct mly_cmd_ioctl *)&mc->mc_packet->ioctl; mci->opcode = MDACMD_IOCTL; mci->timeout = 30 | MLY_TIMEOUT_SECONDS; memset(&mci->param, 0, sizeof(mci->param)); if (MLY_BUS_IS_VIRTUAL(mly, bus)) { mc->mc_length = sizeof(struct mly_ioctl_getlogdevinfovalid); mci->data_size = htole32(mc->mc_length); mci->sub_ioctl = MDACIOCTL_GETLOGDEVINFOVALID; _lto3l(MLY_LOGADDR(0, MLY_LOGDEV_ID(mly, bus, target)), mci->addr); } else { mc->mc_length = sizeof(struct mly_ioctl_getphysdevinfovalid); mci->data_size = htole32(mc->mc_length); mci->sub_ioctl = MDACIOCTL_GETPHYSDEVINFOVALID; _lto3l(MLY_PHYADDR(0, bus, target, 0), mci->addr); } /* * Dispatch the command. */ if ((rv = mly_ccb_map(mly, mc)) != 0) { free(mc->mc_data, M_DEVBUF); mly_ccb_free(mly, mc); return(rv); } mly->mly_btl[bus][target].mb_flags |= MLY_BTL_SCANNING; mly_ccb_enqueue(mly, mc); return (0); } /* * Handle the completion of a rescan operation. */ static void mly_complete_rescan(struct mly_softc *mly, struct mly_ccb *mc) { struct mly_ioctl_getlogdevinfovalid *ldi; struct mly_ioctl_getphysdevinfovalid *pdi; struct mly_cmd_ioctl *mci; struct mly_btl btl, *btlp; struct scsipi_xfer_mode xm; int bus, target, rescan; u_int tmp; mly_ccb_unmap(mly, mc); /* * Recover the bus and target from the command. We need these even * in the case where we don't have a useful response. */ mci = (struct mly_cmd_ioctl *)&mc->mc_packet->ioctl; tmp = _3ltol(mci->addr); rescan = 0; if (mci->sub_ioctl == MDACIOCTL_GETLOGDEVINFOVALID) { bus = MLY_LOGDEV_BUS(mly, MLY_LOGADDR_DEV(tmp)); target = MLY_LOGDEV_TARGET(mly, MLY_LOGADDR_DEV(tmp)); } else { bus = MLY_PHYADDR_CHANNEL(tmp); target = MLY_PHYADDR_TARGET(tmp); } btlp = &mly->mly_btl[bus][target]; /* The default result is 'no device'. */ memset(&btl, 0, sizeof(btl)); btl.mb_flags = MLY_BTL_PROTECTED; /* If the rescan completed OK, we have possibly-new BTL data. 
*/ if (mc->mc_status != 0) goto out; if (mc->mc_length == sizeof(*ldi)) { ldi = (struct mly_ioctl_getlogdevinfovalid *)mc->mc_data; tmp = le32toh(ldi->logical_device_number); if (MLY_LOGDEV_BUS(mly, tmp) != bus || MLY_LOGDEV_TARGET(mly, tmp) != target) { #ifdef MLYDEBUG printf("%s: WARNING: BTL rescan (logical) for %d:%d " "returned data for %d:%d instead\n", device_xname(mly->mly_dv), bus, target, MLY_LOGDEV_BUS(mly, tmp), MLY_LOGDEV_TARGET(mly, tmp)); #endif goto out; } btl.mb_flags = MLY_BTL_LOGICAL | MLY_BTL_TQING; btl.mb_type = ldi->raid_level; btl.mb_state = ldi->state; } else if (mc->mc_length == sizeof(*pdi)) { pdi = (struct mly_ioctl_getphysdevinfovalid *)mc->mc_data; if (pdi->channel != bus || pdi->target != target) { #ifdef MLYDEBUG printf("%s: WARNING: BTL rescan (physical) for %d:%d " " returned data for %d:%d instead\n", device_xname(mly->mly_dv), bus, target, pdi->channel, pdi->target); #endif goto out; } btl.mb_flags = MLY_BTL_PHYSICAL; btl.mb_type = MLY_DEVICE_TYPE_PHYSICAL; btl.mb_state = pdi->state; btl.mb_speed = pdi->speed; btl.mb_width = pdi->width; if (pdi->state != MLY_DEVICE_STATE_UNCONFIGURED) btl.mb_flags |= MLY_BTL_PROTECTED; if (pdi->command_tags != 0) btl.mb_flags |= MLY_BTL_TQING; } else { printf("%s: BTL rescan result invalid\n", device_xname(mly->mly_dv)); goto out; } /* Decide whether we need to rescan the device. */ if (btl.mb_flags != btlp->mb_flags || btl.mb_speed != btlp->mb_speed || btl.mb_width != btlp->mb_width) rescan = 1; out: *btlp = btl; if (rescan && (btl.mb_flags & MLY_BTL_PROTECTED) == 0) { xm.xm_target = target; mly_get_xfer_mode(mly, bus, &xm); /* XXX SCSI mid-layer rescan goes here. */ } /* Wake anybody waiting on the device to be rescanned. */ wakeup(btlp); free(mc->mc_data, M_DEVBUF); mly_ccb_free(mly, mc); } /* * Get the current health status and set the 'next event' counter to suit. */ static int mly_get_eventstatus(struct mly_softc *mly) { struct mly_cmd_ioctl mci; struct mly_health_status *mh; int rv; /* Build the gethealthstatus ioctl and send it. */ memset(&mci, 0, sizeof(mci)); mh = NULL; mci.sub_ioctl = MDACIOCTL_GETHEALTHSTATUS; rv = mly_ioctl(mly, &mci, (void *)&mh, sizeof(*mh), NULL, NULL); if (rv) return (rv); /* Get the event counter. */ mly->mly_event_change = le32toh(mh->change_counter); mly->mly_event_waiting = le32toh(mh->next_event); mly->mly_event_counter = le32toh(mh->next_event); /* Save the health status into the memory mailbox */ memcpy(&mly->mly_mmbox->mmm_health.status, mh, sizeof(*mh)); bus_dmamap_sync(mly->mly_dmat, mly->mly_mmbox_dmamap, offsetof(struct mly_mmbox, mmm_health), sizeof(mly->mly_mmbox->mmm_health), BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD); free(mh, M_DEVBUF); return (0); } /* * Enable memory mailbox mode. */ static int mly_enable_mmbox(struct mly_softc *mly) { struct mly_cmd_ioctl mci; u_int8_t *sp; u_int64_t tmp; int rv; /* Build the ioctl and send it. */ memset(&mci, 0, sizeof(mci)); mci.sub_ioctl = MDACIOCTL_SETMEMORYMAILBOX; /* Set buffer addresses. */ tmp = mly->mly_mmbox_busaddr + offsetof(struct mly_mmbox, mmm_command); mci.param.setmemorymailbox.command_mailbox_physaddr = htole64(tmp); tmp = mly->mly_mmbox_busaddr + offsetof(struct mly_mmbox, mmm_status); mci.param.setmemorymailbox.status_mailbox_physaddr = htole64(tmp); tmp = mly->mly_mmbox_busaddr + offsetof(struct mly_mmbox, mmm_health); mci.param.setmemorymailbox.health_buffer_physaddr = htole64(tmp); /* Set buffer sizes - abuse of data_size field is revolting. 
*/ sp = (u_int8_t *)&mci.data_size; sp[0] = (sizeof(union mly_cmd_packet) * MLY_MMBOX_COMMANDS) >> 10; sp[1] = (sizeof(union mly_status_packet) * MLY_MMBOX_STATUS) >> 10; mci.param.setmemorymailbox.health_buffer_size = sizeof(union mly_health_region) >> 10; rv = mly_ioctl(mly, &mci, NULL, 0, NULL, NULL); if (rv) return (rv); mly->mly_state |= MLY_STATE_MMBOX_ACTIVE; return (0); } /* * Flush all pending I/O from the controller. */ static int mly_flush(struct mly_softc *mly) { struct mly_cmd_ioctl mci; /* Build the ioctl */ memset(&mci, 0, sizeof(mci)); mci.sub_ioctl = MDACIOCTL_FLUSHDEVICEDATA; mci.param.deviceoperation.operation_device = MLY_OPDEVICE_PHYSICAL_CONTROLLER; /* Pass it off to the controller */ return (mly_ioctl(mly, &mci, NULL, 0, NULL, NULL)); } /* * Perform an ioctl command. * * If (data) is not NULL, the command requires data transfer to the * controller. If (*data) is NULL the command requires data transfer from * the controller, and we will allocate a buffer for it. */ static int mly_ioctl(struct mly_softc *mly, struct mly_cmd_ioctl *ioctl, void **data, size_t datasize, void *sense_buffer, size_t *sense_length) { struct mly_ccb *mc; struct mly_cmd_ioctl *mci; u_int8_t status; int rv; mc = NULL; if ((rv = mly_ccb_alloc(mly, &mc)) != 0) goto bad; /* * Copy the ioctl structure, but save some important fields and then * fixup. */ mci = &mc->mc_packet->ioctl; ioctl->sense_buffer_address = htole64(mci->sense_buffer_address); ioctl->maximum_sense_size = mci->maximum_sense_size; *mci = *ioctl; mci->opcode = MDACMD_IOCTL; mci->timeout = 30 | MLY_TIMEOUT_SECONDS; /* Handle the data buffer. */ if (data != NULL) { if (*data == NULL) { /* Allocate data buffer */ mc->mc_data = malloc(datasize, M_DEVBUF, M_NOWAIT); mc->mc_flags |= MLY_CCB_DATAIN; } else { mc->mc_data = *data; mc->mc_flags |= MLY_CCB_DATAOUT; } mc->mc_length = datasize; mc->mc_packet->generic.data_size = htole32(datasize); } /* Run the command. */ if (datasize > 0) if ((rv = mly_ccb_map(mly, mc)) != 0) goto bad; rv = mly_ccb_poll(mly, mc, 30000); if (datasize > 0) mly_ccb_unmap(mly, mc); if (rv != 0) goto bad; /* Clean up and return any data. */ status = mc->mc_status; if (status != 0) printf("mly_ioctl: command status %d\n", status); if (mc->mc_sense > 0 && sense_buffer != NULL) { memcpy(sense_buffer, mc->mc_packet, mc->mc_sense); *sense_length = mc->mc_sense; goto bad; } /* Should we return a data pointer? */ if (data != NULL && *data == NULL) *data = mc->mc_data; /* Command completed OK. */ rv = (status != 0 ? EIO : 0); bad: if (mc != NULL) { /* Do we need to free a data buffer we allocated? */ if (rv != 0 && mc->mc_data != NULL && (data == NULL || *data == NULL)) free(mc->mc_data, M_DEVBUF); mly_ccb_free(mly, mc); } return (rv); } /* * Check for event(s) outstanding in the controller. */ static void mly_check_event(struct mly_softc *mly) { bus_dmamap_sync(mly->mly_dmat, mly->mly_mmbox_dmamap, offsetof(struct mly_mmbox, mmm_health), sizeof(mly->mly_mmbox->mmm_health), BUS_DMASYNC_POSTWRITE | BUS_DMASYNC_POSTREAD); /* * The controller may have updated the health status information, so * check for it here. Note that the counters are all in host * memory, so this check is very cheap. 
Also note that we depend on * checking on completion */ if (le32toh(mly->mly_mmbox->mmm_health.status.change_counter) != mly->mly_event_change) { mly->mly_event_change = le32toh(mly->mly_mmbox->mmm_health.status.change_counter); mly->mly_event_waiting = le32toh(mly->mly_mmbox->mmm_health.status.next_event); /* Wake up anyone that might be interested in this. */ wakeup(&mly->mly_event_change); } bus_dmamap_sync(mly->mly_dmat, mly->mly_mmbox_dmamap, offsetof(struct mly_mmbox, mmm_health), sizeof(mly->mly_mmbox->mmm_health), BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD); if (mly->mly_event_counter != mly->mly_event_waiting) mly_fetch_event(mly); } /* * Fetch one event from the controller. If we fail due to resource * starvation, we'll be retried the next time a command completes. */ static void mly_fetch_event(struct mly_softc *mly) { struct mly_ccb *mc; struct mly_cmd_ioctl *mci; int s; u_int32_t event; /* Get a command. */ if (mly_ccb_alloc(mly, &mc)) return; /* Set up the data buffer. */ mc->mc_data = malloc(sizeof(struct mly_event), M_DEVBUF, M_NOWAIT|M_ZERO); mc->mc_length = sizeof(struct mly_event); mc->mc_flags |= MLY_CCB_DATAIN; mc->mc_complete = mly_complete_event; /* * Get an event number to fetch. It's possible that we've raced * with another context for the last event, in which case there will * be no more events. */ s = splbio(); if (mly->mly_event_counter == mly->mly_event_waiting) { splx(s); free(mc->mc_data, M_DEVBUF); mly_ccb_free(mly, mc); return; } event = mly->mly_event_counter++; splx(s); /* * Build the ioctl. * * At this point we are committed to sending this request, as it * will be the only one constructed for this particular event * number. */ mci = (struct mly_cmd_ioctl *)&mc->mc_packet->ioctl; mci->opcode = MDACMD_IOCTL; mci->data_size = htole32(sizeof(struct mly_event)); _lto3l(MLY_PHYADDR(0, 0, (event >> 16) & 0xff, (event >> 24) & 0xff), mci->addr); mci->timeout = 30 | MLY_TIMEOUT_SECONDS; mci->sub_ioctl = MDACIOCTL_GETEVENT; mci->param.getevent.sequence_number_low = htole16(event & 0xffff); /* * Submit the command. */ if (mly_ccb_map(mly, mc) != 0) goto bad; mly_ccb_enqueue(mly, mc); return; bad: printf("%s: couldn't fetch event %u\n", device_xname(mly->mly_dv), event); free(mc->mc_data, M_DEVBUF); mly_ccb_free(mly, mc); } /* * Handle the completion of an event poll. */ static void mly_complete_event(struct mly_softc *mly, struct mly_ccb *mc) { struct mly_event *me; me = (struct mly_event *)mc->mc_data; mly_ccb_unmap(mly, mc); mly_ccb_free(mly, mc); /* If the event was successfully fetched, process it. */ if (mc->mc_status == SCSI_OK) mly_process_event(mly, me); else aprint_error_dev(mly->mly_dv, "unable to fetch event; status = 0x%x\n", mc->mc_status); free(me, M_DEVBUF); /* Check for another event. */ mly_check_event(mly); } /* * Process a controller event. Called with interrupts blocked (i.e., at * interrupt time). */ static void mly_process_event(struct mly_softc *mly, struct mly_event *me) { struct scsi_sense_data *ssd; int bus, target, event, class, action; const char *fp, *tp; ssd = (struct scsi_sense_data *)&me->sense[0]; /* * Errors can be reported using vendor-unique sense data. In this * case, the event code will be 0x1c (Request sense data present), * the sense key will be 0x09 (vendor specific), the MSB of the ASC * will be set, and the actual event code will be a 16-bit value * comprised of the ASCQ (low byte) and low seven bits of the ASC * (low seven bits of the high byte). 
*/ if (le32toh(me->code) == 0x1c && SSD_SENSE_KEY(ssd->flags) == SKEY_VENDOR_SPECIFIC && (ssd->asc & 0x80) != 0) { event = ((int)(ssd->asc & ~0x80) << 8) + ssd->ascq; } else event = le32toh(me->code); /* Look up event, get codes. */ fp = mly_describe_code(mly_table_event, event); /* Quiet event? */ class = fp[0]; #ifdef notyet if (isupper(class) && bootverbose) class = tolower(class); #endif /* Get action code, text string. */ action = fp[1]; tp = fp + 3; /* * Print some information about the event. * * This code uses a table derived from the corresponding portion of * the Linux driver, and thus the parser is very similar. */ switch (class) { case 'p': /* * Error on physical drive. */ printf("%s: physical device %d:%d %s\n", device_xname(mly->mly_dv), me->channel, me->target, tp); if (action == 'r') mly->mly_btl[me->channel][me->target].mb_flags |= MLY_BTL_RESCAN; break; case 'l': case 'm': /* * Error on logical unit, or message about logical unit. */ bus = MLY_LOGDEV_BUS(mly, me->lun); target = MLY_LOGDEV_TARGET(mly, me->lun); printf("%s: logical device %d:%d %s\n", device_xname(mly->mly_dv), bus, target, tp); if (action == 'r') mly->mly_btl[bus][target].mb_flags |= MLY_BTL_RESCAN; break; case 's': /* * Report of sense data. */ if ((SSD_SENSE_KEY(ssd->flags) == SKEY_NO_SENSE || SSD_SENSE_KEY(ssd->flags) == SKEY_NOT_READY) && ssd->asc == 0x04 && (ssd->ascq == 0x01 || ssd->ascq == 0x02)) { /* Ignore NO_SENSE or NOT_READY in one case */ break; } /* * XXX Should translate this if SCSIVERBOSE. */ printf("%s: physical device %d:%d %s\n", device_xname(mly->mly_dv), me->channel, me->target, tp); printf("%s: sense key %d asc %02x ascq %02x\n", device_xname(mly->mly_dv), SSD_SENSE_KEY(ssd->flags), ssd->asc, ssd->ascq); printf("%s: info %x%x%x%x csi %x%x%x%x\n", device_xname(mly->mly_dv), ssd->info[0], ssd->info[1], ssd->info[2], ssd->info[3], ssd->csi[0], ssd->csi[1], ssd->csi[2], ssd->csi[3]); if (action == 'r') mly->mly_btl[me->channel][me->target].mb_flags |= MLY_BTL_RESCAN; break; case 'e': printf("%s: ", device_xname(mly->mly_dv)); printf(tp, me->target, me->lun); break; case 'c': printf("%s: controller %s\n", device_xname(mly->mly_dv), tp); break; case '?': printf("%s: %s - %d\n", device_xname(mly->mly_dv), tp, event); break; default: /* Probably a 'noisy' event being ignored. */ break; } } /* * Perform periodic activities. */ static void mly_thread(void *cookie) { struct mly_softc *mly; struct mly_btl *btl; int s, bus, target, done; mly = (struct mly_softc *)cookie; for (;;) { /* Check for new events. */ mly_check_event(mly); /* Re-scan up to 1 device. */ s = splbio(); done = 0; for (bus = 0; bus < mly->mly_nchans && !done; bus++) { for (target = 0; target < MLY_MAX_TARGETS; target++) { /* Perform device rescan? */ btl = &mly->mly_btl[bus][target]; if ((btl->mb_flags & MLY_BTL_RESCAN) != 0) { btl->mb_flags ^= MLY_BTL_RESCAN; mly_scan_btl(mly, bus, target); done = 1; break; } } } splx(s); /* Sleep for N seconds. */ tsleep(mly_thread, PWAIT, "mlyzzz", hz * MLY_PERIODIC_INTERVAL); } } /* * Submit a command to the controller and poll on completion. Return * non-zero on timeout. */ static int mly_ccb_poll(struct mly_softc *mly, struct mly_ccb *mc, int timo) { int rv; if ((rv = mly_ccb_submit(mly, mc)) != 0) return (rv); for (timo *= 10; timo != 0; timo--) { if ((mc->mc_flags & MLY_CCB_COMPLETE) != 0) break; mly_intr(mly); DELAY(100); } return (timo == 0); } /* * Submit a command to the controller and sleep on completion. Return * non-zero on timeout. 
*/ static int mly_ccb_wait(struct mly_softc *mly, struct mly_ccb *mc, int timo) { int rv, s; mly_ccb_enqueue(mly, mc); s = splbio(); if ((mc->mc_flags & MLY_CCB_COMPLETE) != 0) { splx(s); return (0); } rv = tsleep(mc, PRIBIO, "mlywccb", timo * hz / 1000); splx(s); return (rv); } /* * If a CCB is specified, enqueue it. Pull CCBs off the software queue in * the order that they were enqueued and try to submit their command blocks * to the controller for execution. */ void mly_ccb_enqueue(struct mly_softc *mly, struct mly_ccb *mc) { int s; s = splbio(); if (mc != NULL) SIMPLEQ_INSERT_TAIL(&mly->mly_ccb_queue, mc, mc_link.simpleq); while ((mc = SIMPLEQ_FIRST(&mly->mly_ccb_queue)) != NULL) { if (mly_ccb_submit(mly, mc)) break; SIMPLEQ_REMOVE_HEAD(&mly->mly_ccb_queue, mc_link.simpleq); } splx(s); } /* * Deliver a command to the controller. */ static int mly_ccb_submit(struct mly_softc *mly, struct mly_ccb *mc) { union mly_cmd_packet *pkt; int s, off; mc->mc_packet->generic.command_id = htole16(mc->mc_slot); bus_dmamap_sync(mly->mly_dmat, mly->mly_pkt_dmamap, mc->mc_packetphys - mly->mly_pkt_busaddr, sizeof(union mly_cmd_packet), BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); s = splbio(); /* * Do we have to use the hardware mailbox? */ if ((mly->mly_state & MLY_STATE_MMBOX_ACTIVE) == 0) { /* * Check to see if the controller is ready for us. */ if (mly_idbr_true(mly, MLY_HM_CMDSENT)) { splx(s); return (EBUSY); } /* * It's ready, send the command. */ mly_outl(mly, mly->mly_cmd_mailbox, (u_int64_t)mc->mc_packetphys & 0xffffffff); mly_outl(mly, mly->mly_cmd_mailbox + 4, (u_int64_t)mc->mc_packetphys >> 32); mly_outb(mly, mly->mly_idbr, MLY_HM_CMDSENT); } else { pkt = &mly->mly_mmbox->mmm_command[mly->mly_mmbox_cmd_idx]; off = (char *)pkt - (char *)mly->mly_mmbox; bus_dmamap_sync(mly->mly_dmat, mly->mly_mmbox_dmamap, off, sizeof(mly->mly_mmbox->mmm_command[0]), BUS_DMASYNC_POSTWRITE | BUS_DMASYNC_POSTREAD); /* Check to see if the next index is free yet. */ if (pkt->mmbox.flag != 0) { splx(s); return (EBUSY); } /* Copy in new command */ memcpy(pkt->mmbox.data, mc->mc_packet->mmbox.data, sizeof(pkt->mmbox.data)); /* Copy flag last. */ pkt->mmbox.flag = mc->mc_packet->mmbox.flag; bus_dmamap_sync(mly->mly_dmat, mly->mly_mmbox_dmamap, off, sizeof(mly->mly_mmbox->mmm_command[0]), BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD); /* Signal controller and update index. */ mly_outb(mly, mly->mly_idbr, MLY_AM_CMDSENT); mly->mly_mmbox_cmd_idx = (mly->mly_mmbox_cmd_idx + 1) % MLY_MMBOX_COMMANDS; } splx(s); return (0); } /* * Pick up completed commands from the controller and handle accordingly. */ int mly_intr(void *cookie) { struct mly_ccb *mc; union mly_status_packet *sp; u_int16_t slot; int forus, off; struct mly_softc *mly; mly = cookie; forus = 0; /* * Pick up hardware-mailbox commands. */ if (mly_odbr_true(mly, MLY_HM_STSREADY)) { slot = mly_inw(mly, mly->mly_status_mailbox); if (slot < MLY_SLOT_MAX) { mc = mly->mly_ccbs + (slot - MLY_SLOT_START); mc->mc_status = mly_inb(mly, mly->mly_status_mailbox + 2); mc->mc_sense = mly_inb(mly, mly->mly_status_mailbox + 3); mc->mc_resid = mly_inl(mly, mly->mly_status_mailbox + 4); mly_ccb_complete(mly, mc); } else { /* Slot 0xffff may mean "extremely bogus command". */ printf("%s: got HM completion for illegal slot %u\n", device_xname(mly->mly_dv), slot); } /* Unconditionally acknowledge status. */ mly_outb(mly, mly->mly_odbr, MLY_HM_STSREADY); mly_outb(mly, mly->mly_idbr, MLY_HM_STSACK); forus = 1; } /* * Pick up memory-mailbox commands. 
*/ if (mly_odbr_true(mly, MLY_AM_STSREADY)) { for (;;) { sp = &mly->mly_mmbox->mmm_status[mly->mly_mmbox_sts_idx]; off = (char *)sp - (char *)mly->mly_mmbox; bus_dmamap_sync(mly->mly_dmat, mly->mly_mmbox_dmamap, off, sizeof(mly->mly_mmbox->mmm_command[0]), BUS_DMASYNC_POSTWRITE | BUS_DMASYNC_POSTREAD); /* Check for more status. */ if (sp->mmbox.flag == 0) break; /* Get slot number. */ slot = le16toh(sp->status.command_id); if (slot < MLY_SLOT_MAX) { mc = mly->mly_ccbs + (slot - MLY_SLOT_START); mc->mc_status = sp->status.status; mc->mc_sense = sp->status.sense_length; mc->mc_resid = le32toh(sp->status.residue); mly_ccb_complete(mly, mc); } else { /* * Slot 0xffff may mean "extremely bogus * command". */ printf("%s: got AM completion for illegal " "slot %u at %d\n", device_xname(mly->mly_dv), slot, mly->mly_mmbox_sts_idx); } /* Clear and move to next index. */ sp->mmbox.flag = 0; mly->mly_mmbox_sts_idx = (mly->mly_mmbox_sts_idx + 1) % MLY_MMBOX_STATUS; } /* Acknowledge that we have collected status value(s). */ mly_outb(mly, mly->mly_odbr, MLY_AM_STSREADY); forus = 1; } /* * Run the queue. */ if (forus && ! SIMPLEQ_EMPTY(&mly->mly_ccb_queue)) mly_ccb_enqueue(mly, NULL); return (forus); } /* * Process completed commands */ static void mly_ccb_complete(struct mly_softc *mly, struct mly_ccb *mc) { void (*complete)(struct mly_softc *, struct mly_ccb *); bus_dmamap_sync(mly->mly_dmat, mly->mly_pkt_dmamap, mc->mc_packetphys - mly->mly_pkt_busaddr, sizeof(union mly_cmd_packet), BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); complete = mc->mc_complete; mc->mc_flags |= MLY_CCB_COMPLETE; /* * Call completion handler or wake up sleeping consumer. */ if (complete != NULL) (*complete)(mly, mc); else wakeup(mc); } /* * Allocate a command. */ int mly_ccb_alloc(struct mly_softc *mly, struct mly_ccb **mcp) { struct mly_ccb *mc; int s; s = splbio(); mc = SLIST_FIRST(&mly->mly_ccb_free); if (mc != NULL) SLIST_REMOVE_HEAD(&mly->mly_ccb_free, mc_link.slist); splx(s); *mcp = mc; return (mc == NULL ? EAGAIN : 0); } /* * Release a command back to the freelist. */ void mly_ccb_free(struct mly_softc *mly, struct mly_ccb *mc) { int s; /* * Fill in parts of the command that may cause confusion if a * consumer doesn't when we are later allocated. */ mc->mc_data = NULL; mc->mc_flags = 0; mc->mc_complete = NULL; mc->mc_private = NULL; mc->mc_packet->generic.command_control = 0; /* * By default, we set up to overwrite the command packet with sense * information. */ mc->mc_packet->generic.sense_buffer_address = htole64(mc->mc_packetphys); mc->mc_packet->generic.maximum_sense_size = sizeof(union mly_cmd_packet); s = splbio(); SLIST_INSERT_HEAD(&mly->mly_ccb_free, mc, mc_link.slist); splx(s); } /* * Allocate and initialize command and packet structures. * * If the controller supports fewer than MLY_MAX_CCBS commands, limit our * allocation to that number. If we don't yet know how many commands the * controller supports, allocate a very small set (suitable for initialization * purposes only). */ static int mly_alloc_ccbs(struct mly_softc *mly) { struct mly_ccb *mc; int i, rv; if (mly->mly_controllerinfo == NULL) mly->mly_ncmds = MLY_CCBS_RESV; else { i = le16toh(mly->mly_controllerinfo->maximum_parallel_commands); mly->mly_ncmds = uimin(MLY_MAX_CCBS, i); } /* * Allocate enough space for all the command packets in one chunk * and map them permanently into controller-visible space. 
*/ rv = mly_dmamem_alloc(mly, mly->mly_ncmds * sizeof(union mly_cmd_packet), &mly->mly_pkt_dmamap, (void **)&mly->mly_pkt, &mly->mly_pkt_busaddr, &mly->mly_pkt_seg); if (rv) return (rv); mly->mly_ccbs = malloc(sizeof(struct mly_ccb) * mly->mly_ncmds, M_DEVBUF, M_WAITOK|M_ZERO); for (i = 0; i < mly->mly_ncmds; i++) { mc = mly->mly_ccbs + i; mc->mc_slot = MLY_SLOT_START + i; mc->mc_packet = mly->mly_pkt + i; mc->mc_packetphys = mly->mly_pkt_busaddr + (i * sizeof(union mly_cmd_packet)); rv = bus_dmamap_create(mly->mly_dmat, MLY_MAX_XFER, MLY_MAX_SEGS, MLY_MAX_XFER, 0, BUS_DMA_NOWAIT | BUS_DMA_ALLOCNOW, &mc->mc_datamap); if (rv) { mly_release_ccbs(mly); return (rv); } mly_ccb_free(mly, mc); } return (0); } /* * Free all the storage held by commands. * * Must be called with all commands on the free list. */ static void mly_release_ccbs(struct mly_softc *mly) { struct mly_ccb *mc; /* Throw away command buffer DMA maps. */ while (mly_ccb_alloc(mly, &mc) == 0) bus_dmamap_destroy(mly->mly_dmat, mc->mc_datamap); /* Release CCB storage. */ free(mly->mly_ccbs, M_DEVBUF); /* Release the packet storage. */ mly_dmamem_free(mly, mly->mly_ncmds * sizeof(union mly_cmd_packet), mly->mly_pkt_dmamap, (void *)mly->mly_pkt, &mly->mly_pkt_seg); } /* * Map a command into controller-visible space. */ static int mly_ccb_map(struct mly_softc *mly, struct mly_ccb *mc) { struct mly_cmd_generic *gen; struct mly_sg_entry *sg; bus_dma_segment_t *ds; int flg, nseg, rv; #ifdef DIAGNOSTIC /* Don't map more than once. */ if ((mc->mc_flags & MLY_CCB_MAPPED) != 0) panic("mly_ccb_map: already mapped"); mc->mc_flags |= MLY_CCB_MAPPED; /* Does the command have a data buffer? */ if (mc->mc_data == NULL) panic("mly_ccb_map: no data buffer"); #endif rv = bus_dmamap_load(mly->mly_dmat, mc->mc_datamap, mc->mc_data, mc->mc_length, NULL, BUS_DMA_NOWAIT | BUS_DMA_STREAMING | ((mc->mc_flags & MLY_CCB_DATAIN) != 0 ? BUS_DMA_READ : BUS_DMA_WRITE)); if (rv != 0) return (rv); gen = &mc->mc_packet->generic; /* * Can we use the transfer structure directly? */ if ((nseg = mc->mc_datamap->dm_nsegs) <= 2) { mc->mc_sgoff = -1; sg = &gen->transfer.direct.sg[0]; } else { mc->mc_sgoff = (mc->mc_slot - MLY_SLOT_START) * MLY_MAX_SEGS; sg = mly->mly_sg + mc->mc_sgoff; gen->command_control |= MLY_CMDCTL_EXTENDED_SG_TABLE; gen->transfer.indirect.entries[0] = htole16(nseg); gen->transfer.indirect.table_physaddr[0] = htole64(mly->mly_sg_busaddr + (mc->mc_sgoff * sizeof(struct mly_sg_entry))); } /* * Fill the S/G table. */ for (ds = mc->mc_datamap->dm_segs; nseg != 0; nseg--, sg++, ds++) { sg->physaddr = htole64(ds->ds_addr); sg->length = htole64(ds->ds_len); } /* * Sync up the data map. */ if ((mc->mc_flags & MLY_CCB_DATAIN) != 0) flg = BUS_DMASYNC_PREREAD; else /* if ((mc->mc_flags & MLY_CCB_DATAOUT) != 0) */ { gen->command_control |= MLY_CMDCTL_DATA_DIRECTION; flg = BUS_DMASYNC_PREWRITE; } bus_dmamap_sync(mly->mly_dmat, mc->mc_datamap, 0, mc->mc_length, flg); /* * Sync up the chained S/G table, if we're using one. */ if (mc->mc_sgoff == -1) return (0); bus_dmamap_sync(mly->mly_dmat, mly->mly_sg_dmamap, mc->mc_sgoff, MLY_SGL_SIZE, BUS_DMASYNC_PREWRITE); return (0); } /* * Unmap a command from controller-visible space. 
*/ static void mly_ccb_unmap(struct mly_softc *mly, struct mly_ccb *mc) { int flg; #ifdef DIAGNOSTIC if ((mc->mc_flags & MLY_CCB_MAPPED) == 0) panic("mly_ccb_unmap: not mapped"); mc->mc_flags &= ~MLY_CCB_MAPPED; #endif if ((mc->mc_flags & MLY_CCB_DATAIN) != 0) flg = BUS_DMASYNC_POSTREAD; else /* if ((mc->mc_flags & MLY_CCB_DATAOUT) != 0) */ flg = BUS_DMASYNC_POSTWRITE; bus_dmamap_sync(mly->mly_dmat, mc->mc_datamap, 0, mc->mc_length, flg); bus_dmamap_unload(mly->mly_dmat, mc->mc_datamap); if (mc->mc_sgoff == -1) return; bus_dmamap_sync(mly->mly_dmat, mly->mly_sg_dmamap, mc->mc_sgoff, MLY_SGL_SIZE, BUS_DMASYNC_POSTWRITE); } /* * Adjust the size of each I/O before it passes to the SCSI layer. */ static void mly_scsipi_minphys(struct buf *bp) { if (bp->b_bcount > MLY_MAX_XFER) bp->b_bcount = MLY_MAX_XFER; minphys(bp); } /* * Start a SCSI command. */ static void mly_scsipi_request(struct scsipi_channel *chan, scsipi_adapter_req_t req, void *arg) { struct mly_ccb *mc; struct mly_cmd_scsi_small *ss; struct scsipi_xfer *xs; struct scsipi_periph *periph; struct mly_softc *mly; struct mly_btl *btl; int s, tmp; mly = device_private(chan->chan_adapter->adapt_dev); switch (req) { case ADAPTER_REQ_RUN_XFER: xs = arg; periph = xs->xs_periph; btl = &mly->mly_btl[chan->chan_channel][periph->periph_target]; s = splbio(); tmp = btl->mb_flags; splx(s); /* * Check for I/O attempt to a protected or non-existent * device. */ if ((tmp & MLY_BTL_PROTECTED) != 0) { xs->error = XS_SELTIMEOUT; scsipi_done(xs); break; } #ifdef DIAGNOSTIC /* XXX Increase if/when we support large SCSI commands. */ if (xs->cmdlen > MLY_CMD_SCSI_SMALL_CDB) { printf("%s: cmd too large\n", device_xname(mly->mly_dv)); xs->error = XS_DRIVER_STUFFUP; scsipi_done(xs); break; } #endif if (mly_ccb_alloc(mly, &mc)) { xs->error = XS_RESOURCE_SHORTAGE; scsipi_done(xs); break; } /* Build the command. */ mc->mc_data = xs->data; mc->mc_length = xs->datalen; mc->mc_complete = mly_scsipi_complete; mc->mc_private = xs; /* Build the packet for the controller. */ ss = &mc->mc_packet->scsi_small; ss->opcode = MDACMD_SCSI; #ifdef notdef /* * XXX FreeBSD does this, but it doesn't fix anything, * XXX and appears potentially harmful. */ ss->command_control |= MLY_CMDCTL_DISABLE_DISCONNECT; #endif ss->data_size = htole32(xs->datalen); _lto3l(MLY_PHYADDR(0, chan->chan_channel, periph->periph_target, periph->periph_lun), ss->addr); if (xs->timeout < 60 * 1000) ss->timeout = xs->timeout / 1000 | MLY_TIMEOUT_SECONDS; else if (xs->timeout < 60 * 60 * 1000) ss->timeout = xs->timeout / (60 * 1000) | MLY_TIMEOUT_MINUTES; else ss->timeout = xs->timeout / (60 * 60 * 1000) | MLY_TIMEOUT_HOURS; ss->maximum_sense_size = sizeof(xs->sense); ss->cdb_length = xs->cmdlen; memcpy(ss->cdb, xs->cmd, xs->cmdlen); if (mc->mc_length != 0) { if ((xs->xs_control & XS_CTL_DATA_OUT) != 0) mc->mc_flags |= MLY_CCB_DATAOUT; else /* if ((xs->xs_control & XS_CTL_DATA_IN) != 0) */ mc->mc_flags |= MLY_CCB_DATAIN; if (mly_ccb_map(mly, mc) != 0) { xs->error = XS_DRIVER_STUFFUP; mly_ccb_free(mly, mc); scsipi_done(xs); break; } } /* * Give the command to the controller. */ if ((xs->xs_control & XS_CTL_POLL) != 0) { if (mly_ccb_poll(mly, mc, xs->timeout + 5000)) { xs->error = XS_REQUEUE; if (mc->mc_length != 0) mly_ccb_unmap(mly, mc); mly_ccb_free(mly, mc); scsipi_done(xs); } } else mly_ccb_enqueue(mly, mc); break; case ADAPTER_REQ_GROW_RESOURCES: /* * Not supported. 
*/ break; case ADAPTER_REQ_SET_XFER_MODE: /* * We can't change the transfer mode, but at least let * scsipi know what the adapter has negotiated. */ mly_get_xfer_mode(mly, chan->chan_channel, arg); break; } } /* * Handle completion of a SCSI command. */ static void mly_scsipi_complete(struct mly_softc *mly, struct mly_ccb *mc) { struct scsipi_xfer *xs; struct scsipi_channel *chan; struct scsipi_inquiry_data *inq; struct mly_btl *btl; int target, sl, s; const char *p; xs = mc->mc_private; xs->status = mc->mc_status; /* * XXX The `resid' value as returned by the controller appears to be * bogus, so we always set it to zero. Is it perhaps the transfer * count? */ xs->resid = 0; /* mc->mc_resid; */ if (mc->mc_length != 0) mly_ccb_unmap(mly, mc); switch (mc->mc_status) { case SCSI_OK: /* * In order to report logical device type and status, we * overwrite the result of the INQUIRY command to logical * devices. */ if (xs->cmd->opcode == INQUIRY) { chan = xs->xs_periph->periph_channel; target = xs->xs_periph->periph_target; btl = &mly->mly_btl[chan->chan_channel][target]; s = splbio(); if ((btl->mb_flags & MLY_BTL_LOGICAL) != 0) { inq = (struct scsipi_inquiry_data *)xs->data; mly_padstr(inq->vendor, "MYLEX", 8); p = mly_describe_code(mly_table_device_type, btl->mb_type); mly_padstr(inq->product, p, 16); p = mly_describe_code(mly_table_device_state, btl->mb_state); mly_padstr(inq->revision, p, 4); } splx(s); } xs->error = XS_NOERROR; break; case SCSI_CHECK: sl = mc->mc_sense; if (sl > sizeof(xs->sense.scsi_sense)) sl = sizeof(xs->sense.scsi_sense); memcpy(&xs->sense.scsi_sense, mc->mc_packet, sl); xs->error = XS_SENSE; break; case SCSI_BUSY: case SCSI_QUEUE_FULL: xs->error = XS_BUSY; break; default: printf("%s: unknown SCSI status 0x%x\n", device_xname(mly->mly_dv), xs->status); xs->error = XS_DRIVER_STUFFUP; break; } mly_ccb_free(mly, mc); scsipi_done(xs); } /* * Notify scsipi about a target's transfer mode. */ static void mly_get_xfer_mode(struct mly_softc *mly, int bus, struct scsipi_xfer_mode *xm) { struct mly_btl *btl; int s; btl = &mly->mly_btl[bus][xm->xm_target]; xm->xm_mode = 0; s = splbio(); if ((btl->mb_flags & MLY_BTL_PHYSICAL) != 0) { if (btl->mb_speed == 0) { xm->xm_period = 0; xm->xm_offset = 0; } else { xm->xm_period = 12; /* XXX */ xm->xm_offset = 8; /* XXX */ xm->xm_mode |= PERIPH_CAP_SYNC; /* XXX */ } switch (btl->mb_width) { case 32: xm->xm_mode = PERIPH_CAP_WIDE32; break; case 16: xm->xm_mode = PERIPH_CAP_WIDE16; break; default: xm->xm_mode = 0; break; } } else /* ((btl->mb_flags & MLY_BTL_LOGICAL) != 0) */ { xm->xm_mode = PERIPH_CAP_WIDE16 | PERIPH_CAP_SYNC; xm->xm_period = 12; xm->xm_offset = 8; } if ((btl->mb_flags & MLY_BTL_TQING) != 0) xm->xm_mode |= PERIPH_CAP_TQING; splx(s); scsipi_async_event(&mly->mly_chans[bus], ASYNC_EVENT_XFER_MODE, xm); } /* * ioctl hook; used here only to initiate low-level rescans. */ static int mly_scsipi_ioctl(struct scsipi_channel *chan, u_long cmd, void *data, int flag, struct proc *p) { struct mly_softc *mly; int rv; mly = device_private(chan->chan_adapter->adapt_dev); switch (cmd) { case SCBUSIOLLSCAN: mly_scan_channel(mly, chan->chan_channel); rv = 0; break; default: rv = ENOTTY; break; } return (rv); } /* * Handshake with the firmware while the card is being initialized. */ static int mly_fwhandshake(struct mly_softc *mly) { u_int8_t error; int spinup; spinup = 0; /* Set HM_STSACK and let the firmware initialize. */ mly_outb(mly, mly->mly_idbr, MLY_HM_STSACK); DELAY(1000); /* too short? 
*/ /* If HM_STSACK is still true, the controller is initializing. */ if (!mly_idbr_true(mly, MLY_HM_STSACK)) return (0); printf("%s: controller initialization started\n", device_xname(mly->mly_dv)); /* * Spin waiting for initialization to finish, or for a message to be * delivered. */ while (mly_idbr_true(mly, MLY_HM_STSACK)) { /* Check for a message */ if (!mly_error_valid(mly)) continue; error = mly_inb(mly, mly->mly_error_status) & ~MLY_MSG_EMPTY; (void)mly_inb(mly, mly->mly_cmd_mailbox); (void)mly_inb(mly, mly->mly_cmd_mailbox + 1); switch (error) { case MLY_MSG_SPINUP: if (!spinup) { printf("%s: drive spinup in progress\n", device_xname(mly->mly_dv)); spinup = 1; } break; case MLY_MSG_RACE_RECOVERY_FAIL: printf("%s: mirror race recovery failed - \n", device_xname(mly->mly_dv)); printf("%s: one or more drives offline\n", device_xname(mly->mly_dv)); break; case MLY_MSG_RACE_IN_PROGRESS: printf("%s: mirror race recovery in progress\n", device_xname(mly->mly_dv)); break; case MLY_MSG_RACE_ON_CRITICAL: printf("%s: mirror race recovery on critical drive\n", device_xname(mly->mly_dv)); break; case MLY_MSG_PARITY_ERROR: printf("%s: FATAL MEMORY PARITY ERROR\n", device_xname(mly->mly_dv)); return (ENXIO); default: printf("%s: unknown initialization code 0x%x\n", device_xname(mly->mly_dv), error); break; } } return (0); } /* * Space-fill a character string */ static void mly_padstr(char *dst, const char *src, int len) { while (len-- > 0) { if (*src != '\0') *dst++ = *src++; else *dst++ = ' '; } } /* * Allocate DMA safe memory. */ static int mly_dmamem_alloc(struct mly_softc *mly, int size, bus_dmamap_t *dmamap, void **kva, bus_addr_t *paddr, bus_dma_segment_t *seg) { int rseg, rv, state; state = 0; if ((rv = bus_dmamem_alloc(mly->mly_dmat, size, PAGE_SIZE, 0, seg, 1, &rseg, BUS_DMA_NOWAIT)) != 0) { aprint_error_dev(mly->mly_dv, "dmamem_alloc = %d\n", rv); goto bad; } state++; if ((rv = bus_dmamem_map(mly->mly_dmat, seg, 1, size, kva, BUS_DMA_NOWAIT | BUS_DMA_COHERENT)) != 0) { aprint_error_dev(mly->mly_dv, "dmamem_map = %d\n", rv); goto bad; } state++; if ((rv = bus_dmamap_create(mly->mly_dmat, size, size, 1, 0, BUS_DMA_NOWAIT, dmamap)) != 0) { aprint_error_dev(mly->mly_dv, "dmamap_create = %d\n", rv); goto bad; } state++; if ((rv = bus_dmamap_load(mly->mly_dmat, *dmamap, *kva, size, NULL, BUS_DMA_NOWAIT)) != 0) { aprint_error_dev(mly->mly_dv, "dmamap_load = %d\n", rv); goto bad; } *paddr = (*dmamap)->dm_segs[0].ds_addr; memset(*kva, 0, size); return (0); bad: if (state > 2) bus_dmamap_destroy(mly->mly_dmat, *dmamap); if (state > 1) bus_dmamem_unmap(mly->mly_dmat, *kva, size); if (state > 0) bus_dmamem_free(mly->mly_dmat, seg, 1); return (rv); } /* * Free DMA safe memory. */ static void mly_dmamem_free(struct mly_softc *mly, int size, bus_dmamap_t dmamap, void *kva, bus_dma_segment_t *seg) { bus_dmamap_unload(mly->mly_dmat, dmamap); bus_dmamap_destroy(mly->mly_dmat, dmamap); bus_dmamem_unmap(mly->mly_dmat, kva, size); bus_dmamem_free(mly->mly_dmat, seg, 1); } /* * Accept an open operation on the control device. */ int mlyopen(dev_t dev, int flag, int mode, struct lwp *l) { struct mly_softc *mly; if ((mly = device_lookup_private(&mly_cd, minor(dev))) == NULL) return (ENXIO); if ((mly->mly_state & MLY_STATE_INITOK) == 0) return (ENXIO); if ((mly->mly_state & MLY_STATE_OPEN) != 0) return (EBUSY); mly->mly_state |= MLY_STATE_OPEN; return (0); } /* * Accept the last close on the control device. 
*/ int mlyclose(dev_t dev, int flag, int mode, struct lwp *l) { struct mly_softc *mly; mly = device_lookup_private(&mly_cd, minor(dev)); mly->mly_state &= ~MLY_STATE_OPEN; return (0); } /* * Handle control operations. */ int mlyioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { struct mly_softc *mly; int rv; mly = device_lookup_private(&mly_cd, minor(dev)); switch (cmd) { case MLYIO_COMMAND: rv = kauth_authorize_device_passthru(l->l_cred, dev, KAUTH_REQ_DEVICE_RAWIO_PASSTHRU_ALL, data); if (rv) break; rv = mly_user_command(mly, (void *)data); break; case MLYIO_HEALTH: rv = mly_user_health(mly, (void *)data); break; default: rv = ENOTTY; break; } return (rv); } /* * Execute a command passed in from userspace. * * The control structure contains the actual command for the controller, as * well as the user-space data pointer and data size, and an optional sense * buffer size/pointer. On completion, the data size is adjusted to the * command residual, and the sense buffer size to the size of the returned * sense data. */ static int mly_user_command(struct mly_softc *mly, struct mly_user_command *uc) { struct mly_ccb *mc; int rv, mapped; if ((rv = mly_ccb_alloc(mly, &mc)) != 0) return (rv); mapped = 0; mc->mc_data = NULL; /* * Handle data size/direction. */ if ((mc->mc_length = abs(uc->DataTransferLength)) != 0) { if (mc->mc_length > MAXPHYS) { rv = EINVAL; goto out; } mc->mc_data = malloc(mc->mc_length, M_DEVBUF, M_WAITOK); if (mc->mc_data == NULL) { rv = ENOMEM; goto out; } if (uc->DataTransferLength > 0) { mc->mc_flags |= MLY_CCB_DATAIN; memset(mc->mc_data, 0, mc->mc_length); } if (uc->DataTransferLength < 0) { mc->mc_flags |= MLY_CCB_DATAOUT; rv = copyin(uc->DataTransferBuffer, mc->mc_data, mc->mc_length); if (rv != 0) goto out; } if ((rv = mly_ccb_map(mly, mc)) != 0) goto out; mapped = 1; } /* Copy in the command and execute it. */ memcpy(mc->mc_packet, &uc->CommandMailbox, sizeof(uc->CommandMailbox)); if ((rv = mly_ccb_wait(mly, mc, 60000)) != 0) goto out; /* Return the data to userspace. */ if (uc->DataTransferLength > 0) { rv = copyout(mc->mc_data, uc->DataTransferBuffer, mc->mc_length); if (rv != 0) goto out; } /* Return the sense buffer to userspace. */ if (uc->RequestSenseLength > 0 && mc->mc_sense > 0) { rv = copyout(mc->mc_packet, uc->RequestSenseBuffer, uimin(uc->RequestSenseLength, mc->mc_sense)); if (rv != 0) goto out; } /* Return command results to userspace (caller will copy out). */ uc->DataTransferLength = mc->mc_resid; uc->RequestSenseLength = uimin(uc->RequestSenseLength, mc->mc_sense); uc->CommandStatus = mc->mc_status; rv = 0; out: if (mapped) mly_ccb_unmap(mly, mc); if (mc->mc_data != NULL) free(mc->mc_data, M_DEVBUF); mly_ccb_free(mly, mc); return (rv); } /* * Return health status to userspace. If the health change index in the * user structure does not match that currently exported by the controller, * we return the current status immediately. Otherwise, we block until * either interrupted or new status is delivered. */ static int mly_user_health(struct mly_softc *mly, struct mly_user_health *uh) { struct mly_health_status mh; int rv, s; /* Fetch the current health status from userspace. 
*/ rv = copyin(uh->HealthStatusBuffer, &mh, sizeof(mh)); if (rv != 0) return (rv); /* spin waiting for a status update */ s = splbio(); if (mly->mly_event_change == mh.change_counter) rv = tsleep(&mly->mly_event_change, PRIBIO | PCATCH, "mlyhealth", 0); splx(s); if (rv == 0) { /* * Copy the controller's health status buffer out (there is * a race here if it changes again). */ rv = copyout(&mly->mly_mmbox->mmm_health.status, uh->HealthStatusBuffer, sizeof(uh->HealthStatusBuffer)); } return (rv); }
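/*
 * Editor's illustration (not part of the driver source): a minimal sketch of
 * how a privileged userland program might drive the MLYIO_COMMAND ioctl
 * implemented by mlyioctl()/mly_user_command() above.  The field names and
 * the sign convention for DataTransferLength (positive = transfer from the
 * controller into the buffer, negative = transfer to the controller, with
 * the residual count written back on completion) are taken from the code
 * above.  The header path <dev/pci/mlyio.h> and the device node name
 * "/dev/mly0" are assumptions made for the sake of the example, and the
 * CommandMailbox contents are firmware-specific and deliberately left
 * unfilled.
 */
#include <sys/ioctl.h>

#include <dev/pci/mlyio.h>	/* assumed location of struct mly_user_command */

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct mly_user_command uc;
	char buf[128];
	int fd;

	/* Requires the device raw I/O passthru privilege (see mlyioctl()). */
	if ((fd = open("/dev/mly0", O_RDWR)) == -1)
		err(1, "open /dev/mly0");

	memset(&uc, 0, sizeof(uc));
	memset(buf, 0, sizeof(buf));

	/*
	 * A real caller fills uc.CommandMailbox with a controller command
	 * packet here; the packet layout is controller-specific and omitted.
	 */
	uc.DataTransferBuffer = buf;
	uc.DataTransferLength = (int)sizeof(buf);	/* positive: data in */
	uc.RequestSenseLength = 0;			/* no sense data wanted */

	if (ioctl(fd, MLYIO_COMMAND, &uc) == -1)
		err(1, "MLYIO_COMMAND");

	/* On return the length holds the residual, per mly_user_command(). */
	printf("status 0x%x, residual %d\n", (unsigned)uc.CommandStatus,
	    (int)uc.DataTransferLength);

	close(fd);
	return 0;
}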
/* $NetBSD: coda_vnops.c,v 1.118 2022/03/27 16:24:58 christos Exp $ */ /* * * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. * * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) coda/coda_vnops.c,v 1.1.1.1 1998/08/29 21:26:46 rvb Exp $ */ /* * Mach Operating System * Copyright (c) 1990 Carnegie-Mellon University * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda file system at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: coda_vnops.c,v 1.118 2022/03/27 16:24:58 christos Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/errno.h> #include <sys/acct.h> #include <sys/file.h> #include <sys/uio.h> #include <sys/namei.h> #include <sys/ioctl.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/select.h> #include <sys/vnode.h> #include <sys/kauth.h> #include <sys/dirent.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <coda/coda.h> #include <coda/cnode.h> #include <coda/coda_vnops.h> #include <coda/coda_venus.h> #include <coda/coda_opstats.h> #include <coda/coda_subr.h> #include <coda/coda_namecache.h> #include <coda/coda_pioctl.h> /* * These flags select various performance enhancements. */ int coda_attr_cache = 1; /* Set to cache attributes in the kernel */ int coda_symlink_cache = 1; /* Set to cache symbolic link information */ int coda_access_cache = 1; /* Set to handle some access checks directly */ /* structure to keep track of vfs calls */ struct coda_op_stats coda_vnodeopstats[CODA_VNODEOPS_SIZE]; #define MARK_ENTRY(op) (coda_vnodeopstats[op].entries++) #define MARK_INT_SAT(op) (coda_vnodeopstats[op].sat_intrn++) #define MARK_INT_FAIL(op) (coda_vnodeopstats[op].unsat_intrn++) #define MARK_INT_GEN(op) (coda_vnodeopstats[op].gen_intrn++) /* What we are delaying for in printf */ static int coda_lockdebug = 0; #define ENTRY if(coda_vnop_print_entry) myprintf(("Entered %s\n",__func__)) /* Definition of the vnode operation vector */ const struct vnodeopv_entry_desc coda_vnodeop_entries[] = { { &vop_default_desc, coda_vop_error }, { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ { &vop_lookup_desc, coda_lookup }, /* lookup */ { &vop_create_desc, coda_create }, /* create */ { &vop_mknod_desc, coda_vop_error }, /* mknod */ { &vop_open_desc, coda_open }, /* open */ { &vop_close_desc, coda_close }, /* close */ { &vop_access_desc, coda_access }, /* access */ { &vop_accessx_desc, genfs_accessx }, /* access */ { &vop_getattr_desc, coda_getattr }, /* getattr */ { &vop_setattr_desc, coda_setattr }, /* setattr */ { &vop_read_desc, coda_read }, /* read */ { &vop_write_desc, coda_write }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_ioctl_desc, coda_ioctl }, /* ioctl */ { &vop_mmap_desc, genfs_mmap }, /* mmap */ { &vop_fsync_desc, coda_fsync }, /* fsync */ { &vop_remove_desc, coda_remove }, /* remove */ { &vop_link_desc, coda_link }, /* link */ { &vop_rename_desc, coda_rename }, /* rename */ { 
&vop_mkdir_desc, coda_mkdir }, /* mkdir */ { &vop_rmdir_desc, coda_rmdir }, /* rmdir */ { &vop_symlink_desc, coda_symlink }, /* symlink */ { &vop_readdir_desc, coda_readdir }, /* readdir */ { &vop_readlink_desc, coda_readlink }, /* readlink */ { &vop_abortop_desc, coda_abortop }, /* abortop */ { &vop_inactive_desc, coda_inactive }, /* inactive */ { &vop_reclaim_desc, coda_reclaim }, /* reclaim */ { &vop_lock_desc, coda_lock }, /* lock */ { &vop_unlock_desc, coda_unlock }, /* unlock */ { &vop_bmap_desc, coda_bmap }, /* bmap */ { &vop_strategy_desc, coda_strategy }, /* strategy */ { &vop_print_desc, coda_vop_error }, /* print */ { &vop_islocked_desc, coda_islocked }, /* islocked */ { &vop_pathconf_desc, coda_pathconf }, /* pathconf */ { &vop_advlock_desc, coda_vop_nop }, /* advlock */ { &vop_bwrite_desc, coda_vop_error }, /* bwrite */ { &vop_seek_desc, genfs_seek }, /* seek */ { &vop_poll_desc, genfs_poll }, /* poll */ { &vop_getpages_desc, coda_getpages }, /* getpages */ { &vop_putpages_desc, coda_putpages }, /* putpages */ { NULL, NULL } }; static void coda_print_vattr(struct vattr *); int (**coda_vnodeop_p)(void *); const struct vnodeopv_desc coda_vnodeop_opv_desc = { &coda_vnodeop_p, coda_vnodeop_entries }; /* Definitions of NetBSD vnodeop interfaces */ /* * A generic error routine. Return EIO without looking at arguments. */ int coda_vop_error(void *anon) { struct vnodeop_desc **desc = (struct vnodeop_desc **)anon; if (codadebug) { myprintf(("%s: Vnode operation %s called (error).\n", __func__, (*desc)->vdesc_name)); } return EIO; } /* A generic do-nothing. */ int coda_vop_nop(void *anon) { struct vnodeop_desc **desc = (struct vnodeop_desc **)anon; if (codadebug) { myprintf(("Vnode operation %s called, but unsupported\n", (*desc)->vdesc_name)); } return (0); } int coda_vnodeopstats_init(void) { int i; for(i=0;i<CODA_VNODEOPS_SIZE;i++) { coda_vnodeopstats[i].opcode = i; coda_vnodeopstats[i].entries = 0; coda_vnodeopstats[i].sat_intrn = 0; coda_vnodeopstats[i].unsat_intrn = 0; coda_vnodeopstats[i].gen_intrn = 0; } return 0; } /* * XXX The entire relationship between VOP_OPEN and having a container * file (via venus_open) needs to be reexamined. In particular, it's * valid to open/mmap/close and then reference. Instead of doing * VOP_OPEN when getpages needs a container, we should do the * venus_open part, and record that the vnode has opened the container * for getpages, and do the matching logical close on coda_inactive. * Further, coda_rdwr needs a container file, and sometimes needs to * do the equivalent of open (core dumps). */ /* * coda_open calls Venus to return the device and inode of the * container file, and then obtains a vnode for that file. The * container vnode is stored in the coda vnode, and a reference is * added for each open file. */ int coda_open(void *v) { /* * NetBSD can pass the O_EXCL flag in mode, even though the check * has already happened. Venus defensively assumes that if open * is passed the EXCL, it must be a bug. We strip the flag here. */ /* true args */ struct vop_open_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int flag = ap->a_mode & (~O_EXCL); kauth_cred_t cred = ap->a_cred; /* locals */ int error; dev_t dev; /* container file device, inode, vnode */ ino_t inode; vnode_t *container_vp; MARK_ENTRY(CODA_OPEN_STATS); KASSERT(VOP_ISLOCKED(vp)); /* Check for open of control file. 
*/ if (IS_CTL_VP(vp)) { /* if (WRITABLE(flag)) */ if (flag & (FWRITE | O_TRUNC | O_CREAT | O_EXCL)) { MARK_INT_FAIL(CODA_OPEN_STATS); return(EACCES); } MARK_INT_SAT(CODA_OPEN_STATS); return(0); } error = venus_open(vtomi(vp), &cp->c_fid, flag, cred, curlwp, &dev, &inode); if (error) return (error); if (!error) { CODADEBUG(CODA_OPEN, myprintf(( "%s: dev 0x%llx inode %llu result %d\n", __func__, (unsigned long long)dev, (unsigned long long)inode, error));) } /* * Obtain locked and referenced container vnode from container * device/inode. */ error = coda_grab_vnode(vp, dev, inode, &container_vp); if (error) return (error); /* Save the vnode pointer for the container file. */ if (cp->c_ovp == NULL) { cp->c_ovp = container_vp; } else { if (cp->c_ovp != container_vp) /* * Perhaps venus returned a different container, or * something else went wrong. */ panic("%s: cp->c_ovp != container_vp", __func__); } cp->c_ocount++; /* Flush the attribute cache if writing the file. */ if (flag & FWRITE) { cp->c_owrite++; cp->c_flags &= ~C_VATTR; } /* * Save the <device, inode> pair for the container file to speed * up subsequent reads while closed (mmap, program execution). * This is perhaps safe because venus will invalidate the node * before changing the container file mapping. */ cp->c_device = dev; cp->c_inode = inode; /* Open the container file. */ error = VOP_OPEN(container_vp, flag, cred); /* * Drop the lock on the container, after we have done VOP_OPEN * (which requires a locked vnode). */ VOP_UNLOCK(container_vp); return(error); } /* * Close the cache file used for I/O and notify Venus. */ int coda_close(void *v) { /* true args */ struct vop_close_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int flag = ap->a_fflag; kauth_cred_t cred = ap->a_cred; /* locals */ int error; MARK_ENTRY(CODA_CLOSE_STATS); /* Check for close of control file. */ if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_CLOSE_STATS); return(0); } /* * XXX The IS_UNMOUNTING part of this is very suspect. */ if (IS_UNMOUNTING(cp)) { if (cp->c_ovp) { #ifdef CODA_VERBOSE printf("%s: destroying container %d, ufs vp %p of vp %p/cp %p\n", __func__, vrefcnt(vp), cp->c_ovp, vp, cp); #endif #ifdef hmm vgone(cp->c_ovp); #else vn_lock(cp->c_ovp, LK_EXCLUSIVE | LK_RETRY); VOP_CLOSE(cp->c_ovp, flag, cred); /* Do errors matter here? */ vput(cp->c_ovp); #endif } else { #ifdef CODA_VERBOSE printf("%s: NO container vp %p/cp %p\n", __func__, vp, cp); #endif } return ENODEV; } /* Lock the container node, and VOP_CLOSE it. */ vn_lock(cp->c_ovp, LK_EXCLUSIVE | LK_RETRY); VOP_CLOSE(cp->c_ovp, flag, cred); /* Do errors matter here? */ /* * Drop the lock we just obtained, and vrele the container vnode. * Decrement reference counts, and clear container vnode pointer on * last close. */ vput(cp->c_ovp); if (flag & FWRITE) --cp->c_owrite; if (--cp->c_ocount == 0) cp->c_ovp = NULL; error = venus_close(vtomi(vp), &cp->c_fid, flag, cred, curlwp); CODADEBUG(CODA_CLOSE, myprintf(("%s: result %d\n", __func__, error)); ) return(error); } int coda_read(void *v) { struct vop_read_args *ap = v; ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_READ, ap->a_ioflag, ap->a_cred, curlwp)); } int coda_write(void *v) { struct vop_write_args *ap = v; ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_WRITE, ap->a_ioflag, ap->a_cred, curlwp)); } int coda_rdwr(vnode_t *vp, struct uio *uiop, enum uio_rw rw, int ioflag, kauth_cred_t cred, struct lwp *l) { /* upcall decl */ /* NOTE: container file operation!!! 
*/ /* locals */ struct cnode *cp = VTOC(vp); vnode_t *cfvp = cp->c_ovp; struct proc *p = l->l_proc; int opened_internally = 0; int error = 0; MARK_ENTRY(CODA_RDWR_STATS); CODADEBUG(CODA_RDWR, myprintf(("coda_rdwr(%d, %p, %lu, %lld)\n", rw, uiop->uio_iov->iov_base, (unsigned long) uiop->uio_resid, (long long) uiop->uio_offset)); ) /* Check for rdwr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_RDWR_STATS); return(EINVAL); } /* Redirect the request to UFS. */ /* * If file is not already open this must be a page * {read,write} request. Iget the cache file's inode * pointer if we still have its <device, inode> pair. * Otherwise, we must do an internal open to derive the * pair. * XXX Integrate this into a coherent strategy for container * file acquisition. */ if (cfvp == NULL) { /* * If we're dumping core, do the internal open. Otherwise * venus won't have the correct size of the core when * it's completely written. */ if (cp->c_inode != 0 && !(p && (p->p_acflag & ACORE))) { #ifdef CODA_VERBOSE printf("%s: grabbing container vnode, losing reference\n", __func__); #endif /* Get locked and refed vnode. */ error = coda_grab_vnode(vp, cp->c_device, cp->c_inode, &cfvp); if (error) { MARK_INT_FAIL(CODA_RDWR_STATS); return(error); } /* * Drop lock. * XXX Where is reference released. */ VOP_UNLOCK(cfvp); } else { #ifdef CODA_VERBOSE printf("%s: internal VOP_OPEN\n", __func__); #endif opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, (rw == UIO_READ ? FREAD : FWRITE), cred); #ifdef CODA_VERBOSE printf("%s: Internally Opening %p\n", __func__, vp); #endif if (error) { MARK_INT_FAIL(CODA_RDWR_STATS); return(error); } cfvp = cp->c_ovp; } } /* Have UFS handle the call. */ CODADEBUG(CODA_RDWR, myprintf(("%s: fid = %s, refcnt = %d\n", __func__, coda_f2s(&cp->c_fid), vrefcnt(CTOV(cp)))); ) if (rw == UIO_READ) { error = VOP_READ(cfvp, uiop, ioflag, cred); } else { error = VOP_WRITE(cfvp, uiop, ioflag, cred); } if (error) MARK_INT_FAIL(CODA_RDWR_STATS); else MARK_INT_SAT(CODA_RDWR_STATS); /* Do an internal close if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, (rw == UIO_READ ? FREAD : FWRITE), cred); } /* Invalidate cached attributes if writing. */ if (rw == UIO_WRITE) cp->c_flags &= ~C_VATTR; return(error); } int coda_ioctl(void *v) { /* true args */ struct vop_ioctl_args *ap = v; vnode_t *vp = ap->a_vp; int com = ap->a_command; void *data = ap->a_data; int flag = ap->a_fflag; kauth_cred_t cred = ap->a_cred; /* locals */ int error; vnode_t *tvp; struct PioctlData *iap = (struct PioctlData *)data; namei_simple_flags_t sflags; MARK_ENTRY(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("in coda_ioctl on %s\n", iap->path));) /* Don't check for operation on a dying object, for ctlvp it shouldn't matter */ /* Must be control object to succeed. */ if (!IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("%s error: vp != ctlvp", __func__));) return (EOPNOTSUPP); } /* Look up the pathname. */ /* Should we use the name cache here? It would get it from lookupname sooner or later anyway, right? */ sflags = iap->follow ? 
NSM_FOLLOW_NOEMULROOT : NSM_NOFOLLOW_NOEMULROOT; error = namei_simple_user(iap->path, sflags, &tvp); if (error) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("%s error: lookup returns %d\n", __func__, error));) return(error); } /* * Make sure this is a coda style cnode, but it may be a * different vfsp */ /* XXX: this totally violates the comment about vtagtype in vnode.h */ if (tvp->v_tag != VT_CODA) { vrele(tvp); MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("%s error: %s not a coda object\n", __func__, iap->path));) return(EINVAL); } if (iap->vi.in_size > VC_MAXDATASIZE || iap->vi.out_size > VC_MAXDATASIZE) { vrele(tvp); return(EINVAL); } error = venus_ioctl(vtomi(tvp), &((VTOC(tvp))->c_fid), com, flag, data, cred, curlwp); if (error) MARK_INT_FAIL(CODA_IOCTL_STATS); else CODADEBUG(CODA_IOCTL, myprintf(("Ioctl returns %d \n", error)); ) vrele(tvp); return(error); } /* * To reduce the cost of a user-level venus;we cache attributes in * the kernel. Each cnode has storage allocated for an attribute. If * c_vattr is valid, return a reference to it. Otherwise, get the * attributes from venus and store them in the cnode. There is some * question if this method is a security leak. But I think that in * order to make this call, the user must have done a lookup and * opened the file, and therefore should already have access. */ int coda_getattr(void *v) { /* true args */ struct vop_getattr_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vattr *vap = ap->a_vap; kauth_cred_t cred = ap->a_cred; /* locals */ int error; MARK_ENTRY(CODA_GETATTR_STATS); /* Check for getattr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_GETATTR_STATS); return(ENOENT); } /* Check to see if the attributes have already been cached */ if (VALID_VATTR(cp)) { CODADEBUG(CODA_GETATTR, { myprintf(("%s: attr cache hit: %s\n", __func__, coda_f2s(&cp->c_fid)));}) CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) coda_print_vattr(&cp->c_vattr); ) *vap = cp->c_vattr; MARK_INT_SAT(CODA_GETATTR_STATS); return(0); } error = venus_getattr(vtomi(vp), &cp->c_fid, cred, curlwp, vap); if (!error) { CODADEBUG(CODA_GETATTR, myprintf(("%s miss %s: result %d\n", __func__, coda_f2s(&cp->c_fid), error)); ) CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) coda_print_vattr(vap); ) /* If not open for write, store attributes in cnode */ if ((cp->c_owrite == 0) && (coda_attr_cache)) { cp->c_vattr = *vap; cp->c_flags |= C_VATTR; } } return(error); } int coda_setattr(void *v) { /* true args */ struct vop_setattr_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vattr *vap = ap->a_vap; kauth_cred_t cred = ap->a_cred; /* locals */ int error; MARK_ENTRY(CODA_SETATTR_STATS); /* Check for setattr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_SETATTR_STATS); return(ENOENT); } if (codadebug & CODADBGMSK(CODA_SETATTR)) { coda_print_vattr(vap); } error = venus_setattr(vtomi(vp), &cp->c_fid, vap, cred, curlwp); if (!error) cp->c_flags &= ~C_VATTR; CODADEBUG(CODA_SETATTR, myprintf(("setattr %d\n", error)); ) return(error); } int coda_access(void *v) { /* true args */ struct vop_access_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); accmode_t accmode = ap->a_accmode; kauth_cred_t cred = ap->a_cred; /* locals */ int error; MARK_ENTRY(CODA_ACCESS_STATS); KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0); /* Check for access of control object. Only read access is allowed on it. 
*/ if (IS_CTL_VP(vp)) { /* bogus hack - all will be marked as successes */ MARK_INT_SAT(CODA_ACCESS_STATS); return(((accmode & VREAD) && !(accmode & (VWRITE | VEXEC))) ? 0 : EACCES); } /* * if the file is a directory, and we are checking exec (eg lookup) * access, and the file is in the namecache, then the user must have * lookup access to it. */ if (coda_access_cache) { if ((vp->v_type == VDIR) && (accmode & VEXEC)) { if (coda_nc_lookup(cp, ".", 1, cred)) { MARK_INT_SAT(CODA_ACCESS_STATS); return(0); /* it was in the cache */ } } } error = venus_access(vtomi(vp), &cp->c_fid, accmode, cred, curlwp); return(error); } /* * CODA abort op, called after namei() when a CREATE/DELETE isn't actually * done. If a buffer has been saved in anticipation of a coda_create or * a coda_remove, delete it. */ /* ARGSUSED */ int coda_abortop(void *v) { /* true args */ struct vop_abortop_args /* { vnode_t *a_dvp; struct componentname *a_cnp; } */ *ap = v; (void)ap; /* upcall decl */ /* locals */ return (0); } int coda_readlink(void *v) { /* true args */ struct vop_readlink_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct uio *uiop = ap->a_uio; kauth_cred_t cred = ap->a_cred; /* locals */ struct lwp *l = curlwp; int error; char *str; int len; MARK_ENTRY(CODA_READLINK_STATS); /* Check for readlink of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READLINK_STATS); return(ENOENT); } if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { /* symlink was cached */ uiop->uio_rw = UIO_READ; error = uiomove(cp->c_symlink, (int)cp->c_symlen, uiop); if (error) MARK_INT_FAIL(CODA_READLINK_STATS); else MARK_INT_SAT(CODA_READLINK_STATS); return(error); } error = venus_readlink(vtomi(vp), &cp->c_fid, cred, l, &str, &len); if (!error) { uiop->uio_rw = UIO_READ; error = uiomove(str, len, uiop); if (coda_symlink_cache) { cp->c_symlink = str; cp->c_symlen = len; cp->c_flags |= C_SYMLINK; } else CODA_FREE(str, len); } CODADEBUG(CODA_READLINK, myprintf(("in readlink result %d\n",error));) return(error); } int coda_fsync(void *v) { /* true args */ struct vop_fsync_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); kauth_cred_t cred = ap->a_cred; /* locals */ vnode_t *convp = cp->c_ovp; int error; MARK_ENTRY(CODA_FSYNC_STATS); /* Check for fsync on an unmounting object */ /* The NetBSD kernel, in its infinite wisdom, can try to fsync * after an unmount has been initiated. This is a Bad Thing, * which we have to avoid. Not a legitimate failure for stats. */ if (IS_UNMOUNTING(cp)) { return(ENODEV); } /* Check for fsync of control object or unitialized cnode. */ if (IS_CTL_VP(vp) || vp->v_type == VNON) { MARK_INT_SAT(CODA_FSYNC_STATS); return(0); } if (convp) VOP_FSYNC(convp, cred, MNT_WAIT, 0, 0); /* * We can expect fsync on any vnode at all if venus is pruging it. * Venus can't very well answer the fsync request, now can it? * Hopefully, it won't have to, because hopefully, venus preserves * the (possibly untrue) invariant that it never purges an open * vnode. Hopefully. */ if (cp->c_flags & C_PURGING) { return(0); } error = venus_fsync(vtomi(vp), &cp->c_fid, cred, curlwp); CODADEBUG(CODA_FSYNC, myprintf(("in fsync result %d\n",error)); ) return(error); } /* * vp is locked on entry, and we must unlock it. * XXX This routine is suspect and probably needs rewriting. 
*/ int coda_inactive(void *v) { /* true args */ struct vop_inactive_v2_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); kauth_cred_t cred __unused = NULL; /* We don't need to send inactive to venus - DCS */ MARK_ENTRY(CODA_INACTIVE_STATS); if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_INACTIVE_STATS); return 0; } CODADEBUG(CODA_INACTIVE, myprintf(("in inactive, %s, vfsp %p\n", coda_f2s(&cp->c_fid), vp->v_mount));) if (vp->v_mount->mnt_data == NULL) { myprintf(("Help! vfsp->vfs_data was NULL, but vnode %p wasn't dying\n", vp)); panic("badness in coda_inactive"); } #ifdef CODA_VERBOSE /* Sanity checks that perhaps should be panic. */ if (vrefcnt(vp) > 1) printf("%s: %p usecount %d\n", __func__, vp, vrefcnt(vp)); if (cp->c_ovp != NULL) printf("%s: %p ovp != NULL\n", __func__, vp); #endif /* XXX Do we need to VOP_CLOSE container vnodes? */ if (!IS_UNMOUNTING(cp)) *ap->a_recycle = true; MARK_INT_SAT(CODA_INACTIVE_STATS); return(0); } /* * Coda does not use the normal namecache, but a private version. * Consider how to use the standard facility instead. */ int coda_lookup(void *v) { /* true args */ struct vop_lookup_v2_args *ap = v; /* (locked) vnode of dir in which to do lookup */ vnode_t *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); /* output variable for result */ vnode_t **vpp = ap->a_vpp; /* name to lookup */ struct componentname *cnp = ap->a_cnp; kauth_cred_t cred = cnp->cn_cred; struct lwp *l = curlwp; /* locals */ struct cnode *cp; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; CodaFid VFid; int vtype; int error = 0; MARK_ENTRY(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("%s: %s in %s\n", __func__, nm, coda_f2s(&dcp->c_fid)));) /* * XXX componentname flags in MODMASK are not handled at all */ /* * The overall strategy is to switch on the lookup type and get a * result vnode that is vref'd but not locked. */ /* Check for lookup of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = coda_ctlvp; vref(*vpp); MARK_INT_SAT(CODA_LOOKUP_STATS); goto exit; } /* Avoid trying to hand venus an unreasonably long name. */ if (len+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("%s: name too long:, %s (%s)\n", __func__, coda_f2s(&dcp->c_fid), nm));) *vpp = (vnode_t *)0; error = EINVAL; goto exit; } /* * Try to resolve the lookup in the minicache. If that fails, ask * venus to do the lookup. XXX The interaction between vnode * locking and any locking that coda does is not clear. */ cp = coda_nc_lookup(dcp, nm, len, cred); if (cp) { *vpp = CTOV(cp); vref(*vpp); CODADEBUG(CODA_LOOKUP, myprintf(("lookup result %d vpp %p\n",error,*vpp));) } else { /* The name wasn't cached, so ask Venus. */ error = venus_lookup(vtomi(dvp), &dcp->c_fid, nm, len, cred, l, &VFid, &vtype); if (error) { MARK_INT_FAIL(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("%s: lookup error on %s (%s)%d\n", __func__, coda_f2s(&dcp->c_fid), nm, error));) *vpp = (vnode_t *)0; } else { MARK_INT_SAT(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("%s: %s type %o result %d\n", __func__, coda_f2s(&VFid), vtype, error)); ) cp = make_coda_node(&VFid, dvp->v_mount, vtype); *vpp = CTOV(cp); /* vpp is now vrefed. */ /* * Unless this vnode is marked CODA_NOCACHE, enter it into * the coda name cache to avoid a future venus round-trip. * XXX Interaction with componentname NOCACHE is unclear. 
*/ if (!(vtype & CODA_NOCACHE)) coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); } } exit: /* * If we are creating, and this was the last name to be looked up, * and the error was ENOENT, then make the leaf NULL and return * success. * XXX Check against new lookup rules. */ if (((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && (cnp->cn_flags & ISLASTCN) && (error == ENOENT)) { error = EJUSTRETURN; *ap->a_vpp = NULL; } return(error); } /*ARGSUSED*/ int coda_create(void *v) { /* true args */ struct vop_create_v3_args *ap = v; vnode_t *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct vattr *va = ap->a_vap; int exclusive = 1; int mode = ap->a_vap->va_mode; vnode_t **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; kauth_cred_t cred = cnp->cn_cred; struct lwp *l = curlwp; /* locals */ int error; struct cnode *cp; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; CodaFid VFid; struct vattr attr; MARK_ENTRY(CODA_CREATE_STATS); /* All creates are exclusive XXX */ /* I'm assuming the 'mode' argument is the file mode bits XXX */ /* Check for create of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = (vnode_t *)0; MARK_INT_FAIL(CODA_CREATE_STATS); return(EACCES); } error = venus_create(vtomi(dvp), &dcp->c_fid, nm, len, exclusive, mode, va, cred, l, &VFid, &attr); if (!error) { /* * XXX Violation of venus/kernel invariants is a difficult case, * but venus should not be able to cause a panic. */ /* If this is an exclusive create, panic if the file already exists. */ /* Venus should have detected the file and reported EEXIST. */ if ((exclusive == 1) && (coda_find(&VFid) != NULL)) panic("cnode existed for newly created file!"); cp = make_coda_node(&VFid, dvp->v_mount, attr.va_type); *vpp = CTOV(cp); /* XXX vnodeops doesn't say this argument can be changed. */ /* Update va to reflect the new attributes. */ (*va) = attr; /* Update the attribute cache and mark it as valid */ if (coda_attr_cache) { VTOC(*vpp)->c_vattr = attr; VTOC(*vpp)->c_flags |= C_VATTR; } /* Invalidate parent's attr cache (modification time has changed). */ VTOC(dvp)->c_flags &= ~C_VATTR; /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); CODADEBUG(CODA_CREATE, myprintf(("%s: %s, result %d\n", __func__, coda_f2s(&VFid), error)); ) } else { *vpp = (vnode_t *)0; CODADEBUG(CODA_CREATE, myprintf(("%s: create error %d\n", __func__, error));) } if (!error) { #ifdef CODA_VERBOSE if ((cnp->cn_flags & LOCKLEAF) == 0) /* This should not happen; flags are for lookup only. */ printf("%s: LOCKLEAF not set!\n", __func__); #endif } return(error); } int coda_remove(void *v) { /* true args */ struct vop_remove_v3_args *ap = v; vnode_t *dvp = ap->a_dvp; struct cnode *cp = VTOC(dvp); vnode_t *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; kauth_cred_t cred = cnp->cn_cred; struct lwp *l = curlwp; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *tp; MARK_ENTRY(CODA_REMOVE_STATS); CODADEBUG(CODA_REMOVE, myprintf(("%s: %s in %s\n", __func__, nm, coda_f2s(&cp->c_fid)));) /* Remove the file's entry from the CODA Name Cache */ /* We're being conservative here, it might be that this person * doesn't really have sufficient access to delete the file * but we feel zapping the entry won't really hurt anyone -- dcs */ /* I'm gonna go out on a limb here. If a file and a hardlink to it * exist, and one is removed, the link count on the other will be * off by 1. 
We could either invalidate the attrs if cached, or * fix them. I'll try to fix them. DCS 11/8/94 */ tp = coda_nc_lookup(VTOC(dvp), nm, len, cred); if (tp) { if (VALID_VATTR(tp)) { /* If attrs are cached */ if (tp->c_vattr.va_nlink > 1) { /* If it's a hard link */ tp->c_vattr.va_nlink--; } } coda_nc_zapfile(VTOC(dvp), nm, len); /* No need to flush it if it doesn't exist! */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; /* Check for remove of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_REMOVE_STATS); return(ENOENT); } error = venus_remove(vtomi(dvp), &cp->c_fid, nm, len, cred, l); CODADEBUG(CODA_REMOVE, myprintf(("in remove result %d\n",error)); ) /* * Unlock and release child (avoiding double if "."). */ if (dvp == vp) { vrele(vp); } else { vput(vp); } return(error); } /* * dvp is the directory where the link is to go, and is locked. * vp is the object to be linked to, and is unlocked. * At exit, we must unlock dvp, and vput dvp. */ int coda_link(void *v) { /* true args */ struct vop_link_v2_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); vnode_t *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; kauth_cred_t cred = cnp->cn_cred; struct lwp *l = curlwp; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; MARK_ENTRY(CODA_LINK_STATS); if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("%s: vp fid: %s\n", __func__, coda_f2s(&cp->c_fid))); myprintf(("%s: dvp fid: %s)\n", __func__, coda_f2s(&dcp->c_fid))); } if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("%s: vp fid: %s\n", __func__, coda_f2s(&cp->c_fid))); myprintf(("%s: dvp fid: %s\n", __func__, coda_f2s(&dcp->c_fid))); } /* Check for link to/from control object. */ if (IS_CTL_NAME(dvp, nm, len) || IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_LINK_STATS); return(EACCES); } /* If linking . to a name, error out earlier. */ if (vp == dvp) { #ifdef CODA_VERBOSE printf("%s coda_link vp==dvp\n", __func__); #endif error = EISDIR; goto exit; } /* XXX Why does venus_link need the vnode to be locked?*/ if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0) { #ifdef CODA_VERBOSE printf("%s: couldn't lock vnode %p\n", __func__, vp); #endif error = EFAULT; /* XXX better value */ goto exit; } error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_ADD_LINK, vp, dvp, 0); if (error) goto exit; error = venus_link(vtomi(vp), &cp->c_fid, &dcp->c_fid, nm, len, cred, l); VOP_UNLOCK(vp); /* Invalidate parent's attr cache (the modification time has changed). */ VTOC(dvp)->c_flags &= ~C_VATTR; /* Invalidate child's attr cache (XXX why). */ VTOC(vp)->c_flags &= ~C_VATTR; CODADEBUG(CODA_LINK, myprintf(("in link result %d\n",error)); ) exit: return(error); } int coda_rename(void *v) { /* true args */ struct vop_rename_args *ap = v; vnode_t *odvp = ap->a_fdvp; struct cnode *odcp = VTOC(odvp); struct componentname *fcnp = ap->a_fcnp; vnode_t *ndvp = ap->a_tdvp; struct cnode *ndcp = VTOC(ndvp); struct componentname *tcnp = ap->a_tcnp; kauth_cred_t cred = fcnp->cn_cred; struct lwp *l = curlwp; /* true args */ int error; const char *fnm = fcnp->cn_nameptr; int flen = fcnp->cn_namelen; const char *tnm = tcnp->cn_nameptr; int tlen = tcnp->cn_namelen; MARK_ENTRY(CODA_RENAME_STATS); /* Hmmm. The vnodes are already looked up. Perhaps they are locked? This could be Bad. 
XXX */ #ifdef OLD_DIAGNOSTIC if ((fcnp->cn_cred != tcnp->cn_cred) || (fcnp->cn_lwp != tcnp->cn_lwp)) { panic("%s: component names don't agree", __func__); } #endif /* Check for rename involving control object. */ if (IS_CTL_NAME(odvp, fnm, flen) || IS_CTL_NAME(ndvp, tnm, tlen)) { MARK_INT_FAIL(CODA_RENAME_STATS); return(EACCES); } /* Problem with moving directories -- need to flush entry for .. */ if (odvp != ndvp) { struct cnode *ovcp = coda_nc_lookup(VTOC(odvp), fnm, flen, cred); if (ovcp) { vnode_t *ovp = CTOV(ovcp); if ((ovp) && (ovp->v_type == VDIR)) /* If it's a directory */ coda_nc_zapfile(VTOC(ovp),"..", 2); } } /* Remove the entries for both source and target files */ coda_nc_zapfile(VTOC(odvp), fnm, flen); coda_nc_zapfile(VTOC(ndvp), tnm, tlen); /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(odvp)->c_flags &= ~C_VATTR; VTOC(ndvp)->c_flags &= ~C_VATTR; if (flen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } if (tlen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } error = venus_rename(vtomi(odvp), &odcp->c_fid, &ndcp->c_fid, fnm, flen, tnm, tlen, cred, l); exit: CODADEBUG(CODA_RENAME, myprintf(("in rename result %d\n",error));) /* XXX - do we need to call cache pureg on the moved vnode? */ cache_purge(ap->a_fvp); /* It seems to be incumbent on us to drop locks on all four vnodes */ /* From-vnodes are not locked, only ref'd. To-vnodes are locked. */ vrele(ap->a_fvp); vrele(odvp); if (ap->a_tvp) { if (ap->a_tvp == ndvp) { vrele(ap->a_tvp); } else { vput(ap->a_tvp); } } vput(ndvp); return(error); } int coda_mkdir(void *v) { /* true args */ struct vop_mkdir_v3_args *ap = v; vnode_t *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; struct vattr *va = ap->a_vap; vnode_t **vpp = ap->a_vpp; kauth_cred_t cred = cnp->cn_cred; struct lwp *l = curlwp; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; CodaFid VFid; struct vattr ova; MARK_ENTRY(CODA_MKDIR_STATS); /* Check for mkdir of target object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = (vnode_t *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } if (len+1 > CODA_MAXNAMLEN) { *vpp = (vnode_t *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } error = venus_mkdir(vtomi(dvp), &dcp->c_fid, nm, len, va, cred, l, &VFid, &ova); if (!error) { if (coda_find(&VFid) != NULL) panic("cnode existed for newly created directory!"); cp = make_coda_node(&VFid, dvp->v_mount, va->va_type); *vpp = CTOV(cp); /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); /* as a side effect, enter "." and ".." 
for the directory */ coda_nc_enter(VTOC(*vpp), ".", 1, cred, VTOC(*vpp)); coda_nc_enter(VTOC(*vpp), "..", 2, cred, VTOC(dvp)); if (coda_attr_cache) { VTOC(*vpp)->c_vattr = ova; /* update the attr cache */ VTOC(*vpp)->c_flags |= C_VATTR; /* Valid attributes in cnode */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; CODADEBUG( CODA_MKDIR, myprintf(("%s: %s result %d\n", __func__, coda_f2s(&VFid), error)); ) } else { *vpp = (vnode_t *)0; CODADEBUG(CODA_MKDIR, myprintf(("%s error %d\n", __func__, error));) } return(error); } int coda_rmdir(void *v) { /* true args */ struct vop_rmdir_v2_args *ap = v; vnode_t *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); vnode_t *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; kauth_cred_t cred = cnp->cn_cred; struct lwp *l = curlwp; /* true args */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; MARK_ENTRY(CODA_RMDIR_STATS); /* Check for rmdir of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_RMDIR_STATS); return(ENOENT); } /* Can't remove . in self. */ if (dvp == vp) { #ifdef CODA_VERBOSE printf("%s: dvp == vp\n", __func__); #endif error = EINVAL; goto exit; } /* * The caller may not have adequate permissions, and the venus * operation may fail, but it doesn't hurt from a correctness * viewpoint to invalidate cache entries. * XXX Why isn't this done after the venus_rmdir call? */ /* Look up child in name cache (by name, from parent). */ cp = coda_nc_lookup(dcp, nm, len, cred); /* If found, remove all children of the child (., ..). */ if (cp) coda_nc_zapParentfid(&(cp->c_fid), NOT_DOWNCALL); /* Remove child's own entry. */ coda_nc_zapfile(dcp, nm, len); /* Invalidate parent's attr cache (the modification time has changed). */ dcp->c_flags &= ~C_VATTR; error = venus_rmdir(vtomi(dvp), &dcp->c_fid, nm, len, cred, l); CODADEBUG(CODA_RMDIR, myprintf(("in rmdir result %d\n", error)); ) exit: /* unlock and release child */ if (dvp == vp) { vrele(vp); } else { vput(vp); } return(error); } int coda_symlink(void *v) { /* true args */ struct vop_symlink_v3_args *ap = v; vnode_t *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); /* a_vpp is used in place below */ struct componentname *cnp = ap->a_cnp; struct vattr *tva = ap->a_vap; char *path = ap->a_target; kauth_cred_t cred = cnp->cn_cred; struct lwp *l = curlwp; /* locals */ int error; u_long saved_cn_flags; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; int plen = strlen(path); /* * Here's the strategy for the moment: perform the symlink, then * do a lookup to grab the resulting vnode. I know this requires * two communications with Venus for a new symbolic link, but * that's the way the ball bounces. I don't yet want to change * the way the Mach symlink works. When Mach support is * deprecated, we should change symlink so that the common case * returns the resultant vnode in a vpp argument. */ MARK_ENTRY(CODA_SYMLINK_STATS); /* Check for symlink of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_SYMLINK_STATS); error = EACCES; goto exit; } if (plen+1 > CODA_MAXPATHLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); error = EINVAL; goto exit; } if (len+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); error = EINVAL; goto exit; } error = venus_symlink(vtomi(dvp), &dcp->c_fid, path, plen, nm, len, tva, cred, l); /* Invalidate the parent's attr cache (modification time has changed). 
*/ dcp->c_flags &= ~C_VATTR; if (!error) { /* * VOP_SYMLINK is not defined to pay attention to cnp->cn_flags; * these are defined only for VOP_LOOKUP. We desire to reuse * cnp for a VOP_LOOKUP operation, and must be sure to not pass * stray flags passed to us. Such stray flags can occur because * sys_symlink makes a namei call and then reuses the * componentname structure. */ /* * XXX Arguably we should create our own componentname structure * and not reuse the one that was passed in. */ saved_cn_flags = cnp->cn_flags; cnp->cn_flags &= ~(MODMASK | OPMASK); cnp->cn_flags |= LOOKUP; error = VOP_LOOKUP(dvp, ap->a_vpp, cnp); cnp->cn_flags = saved_cn_flags; } exit: CODADEBUG(CODA_SYMLINK, myprintf(("in symlink result %d\n",error)); ) return(error); } /* * Read directory entries. */ int coda_readdir(void *v) { /* true args */ struct vop_readdir_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct uio *uiop = ap->a_uio; kauth_cred_t cred = ap->a_cred; int *eofflag = ap->a_eofflag; /* upcall decl */ /* locals */ size_t initial_resid = uiop->uio_resid; int error = 0; int opened_internally = 0; int ncookies; char *buf; struct vnode *cvp; struct dirent *dirp; MARK_ENTRY(CODA_READDIR_STATS); CODADEBUG(CODA_READDIR, myprintf(("%s: (%p, %lu, %lld)\n", __func__, uiop->uio_iov->iov_base, (unsigned long) uiop->uio_resid, (long long) uiop->uio_offset)); ) /* Check for readdir of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READDIR_STATS); return ENOENT; } /* If directory is not already open do an "internal open" on it. */ if (cp->c_ovp == NULL) { opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, FREAD, cred); #ifdef CODA_VERBOSE printf("%s: Internally Opening %p\n", __func__, vp); #endif if (error) return error; KASSERT(cp->c_ovp != NULL); } cvp = cp->c_ovp; CODADEBUG(CODA_READDIR, myprintf(("%s: fid = %s, refcnt = %d\n", __func__, coda_f2s(&cp->c_fid), vrefcnt(cvp))); ) if (ap->a_ncookies) { ncookies = ap->a_uio->uio_resid / _DIRENT_RECLEN(dirp, 1); *ap->a_ncookies = 0; *ap->a_cookies = malloc(ncookies * sizeof (off_t), M_TEMP, M_WAITOK); } buf = kmem_alloc(CODA_DIRBLKSIZ, KM_SLEEP); dirp = kmem_alloc(sizeof(*dirp), KM_SLEEP); vn_lock(cvp, LK_EXCLUSIVE | LK_RETRY); while (error == 0) { size_t resid = 0; char *dp, *ep; if (!ALIGNED_POINTER(uiop->uio_offset, uint32_t)) { error = EINVAL; break; } error = vn_rdwr(UIO_READ, cvp, buf, CODA_DIRBLKSIZ, uiop->uio_offset, UIO_SYSSPACE, IO_NODELOCKED, cred, &resid, curlwp); if (error || resid == CODA_DIRBLKSIZ) break; for (dp = buf, ep = dp + CODA_DIRBLKSIZ - resid; dp < ep; ) { off_t off; struct venus_dirent *vd = (struct venus_dirent *)dp; if (!ALIGNED_POINTER(vd, uint32_t) || !ALIGNED_POINTER(vd->d_reclen, uint32_t) || vd->d_reclen == 0) { error = EINVAL; break; } if (dp + vd->d_reclen > ep) { error = ENAMETOOLONG; break; } if (vd->d_namlen == 0) { uiop->uio_offset += vd->d_reclen; dp += vd->d_reclen; continue; } dirp->d_fileno = vd->d_fileno; dirp->d_type = vd->d_type; dirp->d_namlen = vd->d_namlen; dirp->d_reclen = _DIRENT_SIZE(dirp); strlcpy(dirp->d_name, vd->d_name, dirp->d_namlen + 1); if (uiop->uio_resid < dirp->d_reclen) { error = ENAMETOOLONG; break; } off = uiop->uio_offset; error = uiomove(dirp, dirp->d_reclen, uiop); uiop->uio_offset = off; if (error) break; uiop->uio_offset += vd->d_reclen; dp += vd->d_reclen; if (ap->a_ncookies) (*ap->a_cookies)[(*ap->a_ncookies)++] = uiop->uio_offset; } } VOP_UNLOCK(cvp); kmem_free(dirp, sizeof(*dirp)); kmem_free(buf, CODA_DIRBLKSIZ); if (eofflag && error == 
0) *eofflag = 1; if (uiop->uio_resid < initial_resid && error == ENAMETOOLONG) error = 0; if (ap->a_ncookies && error) { free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } if (error) MARK_INT_FAIL(CODA_READDIR_STATS); else MARK_INT_SAT(CODA_READDIR_STATS); /* Do an "internal close" if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, FREAD, cred); } return error; } /* * Convert from file system blocks to device blocks */ int coda_bmap(void *v) { /* XXX on the global proc */ /* true args */ struct vop_bmap_args *ap = v; vnode_t *vp __unused = ap->a_vp; /* file's vnode */ daddr_t bn __unused = ap->a_bn; /* fs block number */ vnode_t **vpp = ap->a_vpp; /* RETURN vp of device */ daddr_t *bnp __unused = ap->a_bnp; /* RETURN device block number */ struct lwp *l __unused = curlwp; /* upcall decl */ /* locals */ *vpp = (vnode_t *)0; myprintf(("coda_bmap called!\n")); return(EINVAL); } /* * I don't think the following two things are used anywhere, so I've * commented them out * * struct buf *async_bufhead; * int async_daemon_count; */ int coda_strategy(void *v) { /* true args */ struct vop_strategy_args *ap = v; struct buf *bp __unused = ap->a_bp; struct lwp *l __unused = curlwp; /* upcall decl */ /* locals */ myprintf(("coda_strategy called! ")); return(EINVAL); } int coda_reclaim(void *v) { /* true args */ struct vop_reclaim_v2_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ VOP_UNLOCK(vp); /* * Forced unmount/flush will let vnodes with non zero use be destroyed! */ ENTRY; if (IS_UNMOUNTING(cp)) { #ifdef DEBUG if (VTOC(vp)->c_ovp) { if (IS_UNMOUNTING(cp)) printf("%s: c_ovp not void: vp %p, cp %p\n", __func__, vp, cp); } #endif } else { #ifdef OLD_DIAGNOSTIC if (vrefcnt(vp) != 0) print("%s: pushing active %p\n", __func__, vp); if (VTOC(vp)->c_ovp) { panic("%s: c_ovp not void", __func__); } #endif } /* If an array has been allocated to hold the symlink, deallocate it */ if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { if (cp->c_symlink == NULL) panic("%s: null symlink pointer in cnode", __func__); CODA_FREE(cp->c_symlink, cp->c_symlen); cp->c_flags &= ~C_SYMLINK; cp->c_symlen = 0; } mutex_enter(vp->v_interlock); mutex_enter(&cp->c_lock); SET_VTOC(vp) = NULL; mutex_exit(&cp->c_lock); mutex_exit(vp->v_interlock); mutex_destroy(&cp->c_lock); kmem_free(cp, sizeof(*cp)); return (0); } int coda_lock(void *v) { /* true args */ struct vop_lock_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ ENTRY; if (coda_lockdebug) { myprintf(("Attempting lock on %s\n", coda_f2s(&cp->c_fid))); } return genfs_lock(v); } int coda_unlock(void *v) { /* true args */ struct vop_unlock_args *ap = v; vnode_t *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ ENTRY; if (coda_lockdebug) { myprintf(("Attempting unlock on %s\n", coda_f2s(&cp->c_fid))); } return genfs_unlock(v); } int coda_islocked(void *v) { /* true args */ ENTRY; return genfs_islocked(v); } int coda_pathconf(void *v) { struct vop_pathconf_args *ap = v; switch (ap->a_name) { default: return EINVAL; } /* NOTREACHED */ } /* * Given a device and inode, obtain a locked vnode. One reference is * obtained and passed back to the caller. */ int coda_grab_vnode(vnode_t *uvp, dev_t dev, ino_t ino, vnode_t **vpp) { int error; struct mount *mp; /* Obtain mount point structure from device. 
*/ if (!(mp = devtomp(dev))) { myprintf(("%s: devtomp(0x%llx) returns NULL\n", __func__, (unsigned long long)dev)); return(ENXIO); } /* * Obtain vnode from mount point and inode. */ error = VFS_VGET(mp, ino, LK_EXCLUSIVE, vpp); if (error) { myprintf(("%s: iget/vget(0x%llx, %llu) returns %p, err %d\n", __func__, (unsigned long long)dev, (unsigned long long)ino, *vpp, error)); return(ENOENT); } /* share the underlying vnode lock with the coda vnode */ vshareilock(*vpp, uvp); KASSERT(VOP_ISLOCKED(*vpp)); return(0); } static void coda_print_vattr(struct vattr *attr) { const char *typestr; switch (attr->va_type) { case VNON: typestr = "VNON"; break; case VREG: typestr = "VREG"; break; case VDIR: typestr = "VDIR"; break; case VBLK: typestr = "VBLK"; break; case VCHR: typestr = "VCHR"; break; case VLNK: typestr = "VLNK"; break; case VSOCK: typestr = "VSCK"; break; case VFIFO: typestr = "VFFO"; break; case VBAD: typestr = "VBAD"; break; default: typestr = "????"; break; } myprintf(("attr: type %s mode %d uid %d gid %d fsid %d rdev %d\n", typestr, (int)attr->va_mode, (int)attr->va_uid, (int)attr->va_gid, (int)attr->va_fsid, (int)attr->va_rdev)); myprintf((" fileid %d nlink %d size %d blocksize %d bytes %d\n", (int)attr->va_fileid, (int)attr->va_nlink, (int)attr->va_size, (int)attr->va_blocksize,(int)attr->va_bytes)); myprintf((" gen %ld flags %ld vaflags %d\n", attr->va_gen, attr->va_flags, attr->va_vaflags)); myprintf((" atime sec %d nsec %d\n", (int)attr->va_atime.tv_sec, (int)attr->va_atime.tv_nsec)); myprintf((" mtime sec %d nsec %d\n", (int)attr->va_mtime.tv_sec, (int)attr->va_mtime.tv_nsec)); myprintf((" ctime sec %d nsec %d\n", (int)attr->va_ctime.tv_sec, (int)attr->va_ctime.tv_nsec)); } /* * Return a vnode for the given fid. * If no cnode exists for this fid create one and put it * in a table hashed by coda_f2i(). If the cnode for * this fid is already in the table return it (ref count is * incremented by coda_find. The cnode will be flushed from the * table when coda_inactive calls coda_unsave. */ struct cnode * make_coda_node(CodaFid *fid, struct mount *fvsp, short type) { int error __diagused; struct vnode *vp; struct cnode *cp; error = vcache_get(fvsp, fid, sizeof(CodaFid), &vp); KASSERT(error == 0); mutex_enter(vp->v_interlock); cp = VTOC(vp); KASSERT(cp != NULL); mutex_enter(&cp->c_lock); mutex_exit(vp->v_interlock); if (vp->v_type != type) { if (vp->v_type == VCHR || vp->v_type == VBLK) spec_node_destroy(vp); vp->v_type = type; if (type == VCHR || type == VBLK) spec_node_init(vp, NODEV); uvm_vnp_setsize(vp, 0); } mutex_exit(&cp->c_lock); return cp; } /* * coda_getpages may be called on a vnode which has not been opened, * e.g. to fault in pages to execute a program. In that case, we must * open the file to get the container. The vnode may or may not be * locked, and we must leave it in the same state. */ int coda_getpages(void *v) { struct vop_getpages_args /* { vnode_t *a_vp; voff_t a_offset; struct vm_page **a_m; int *a_count; int a_centeridx; vm_prot_t a_access_type; int a_advice; int a_flags; } */ *ap = v; vnode_t *vp = ap->a_vp, *cvp; struct cnode *cp = VTOC(vp); struct lwp *l = curlwp; kauth_cred_t cred = l->l_cred; int error, cerror; int waslocked; /* 1 if vnode lock was held on entry */ int didopen = 0; /* 1 if we opened container file */ krw_t op; /* * Handle a case that uvm_fault doesn't quite use yet. * See layer_vnops.c. for inspiration. */ if (ap->a_flags & PGO_LOCKED) { return EBUSY; } KASSERT(rw_lock_held(vp->v_uobj.vmobjlock)); /* Check for control object. 
*/ if (IS_CTL_VP(vp)) { #ifdef CODA_VERBOSE printf("%s: control object %p\n", __func__, vp); #endif return(EINVAL); } /* * XXX It's really not ok to be releasing the lock we get, * because we could be overlapping with another call to * getpages and drop a lock they are relying on. We need to * figure out whether getpages ever is called holding the * lock, and if we should serialize getpages calls by some * mechanism. */ /* XXX VOP_ISLOCKED() may not be used for lock decisions. */ op = rw_lock_op(vp->v_uobj.vmobjlock); waslocked = VOP_ISLOCKED(vp); /* Get container file if not already present. */ cvp = cp->c_ovp; if (cvp == NULL) { /* * VOP_OPEN requires a locked vnode. We must avoid * locking the vnode if it is already locked, and * leave it in the same state on exit. */ if (waslocked == 0) { rw_exit(vp->v_uobj.vmobjlock); cerror = vn_lock(vp, LK_EXCLUSIVE); if (cerror) { #ifdef CODA_VERBOSE printf("%s: can't lock vnode %p\n", __func__, vp); #endif return cerror; } #ifdef CODA_VERBOSE printf("%s: locked vnode %p\n", __func__, vp); #endif } /* * Open file (causes upcall to venus). * XXX Perhaps we should not fully open the file, but * simply obtain a container file. */ /* XXX Is it ok to do this while holding the mutex? */ cerror = VOP_OPEN(vp, FREAD, cred); if (cerror) { #ifdef CODA_VERBOSE printf("%s: cannot open vnode %p => %d\n", __func__, vp, cerror); #endif if (waslocked == 0) VOP_UNLOCK(vp); return cerror; } #ifdef CODA_VERBOSE printf("%s: opened vnode %p\n", __func__, vp); #endif cvp = cp->c_ovp; didopen = 1; if (waslocked == 0) rw_enter(vp->v_uobj.vmobjlock, op); } KASSERT(cvp != NULL); /* Munge the arg structure to refer to the container vnode. */ KASSERT(cvp->v_uobj.vmobjlock == vp->v_uobj.vmobjlock); ap->a_vp = cp->c_ovp; /* Finally, call getpages on it. */ error = VCALL(ap->a_vp, VOFFSET(vop_getpages), ap); /* If we opened the vnode, we must close it. */ if (didopen) { /* * VOP_CLOSE requires a locked vnode, but we are still * holding the lock (or riding a caller's lock). */ cerror = VOP_CLOSE(vp, FREAD, cred); #ifdef CODA_VERBOSE if (cerror != 0) /* XXX How should we handle this? */ printf("%s: closed vnode %p -> %d\n", __func__, vp, cerror); #endif /* If we obtained a lock, drop it. */ if (waslocked == 0) VOP_UNLOCK(vp); } return error; } /* * The protocol requires v_interlock to be held by the caller. */ int coda_putpages(void *v) { struct vop_putpages_args /* { vnode_t *a_vp; voff_t a_offlo; voff_t a_offhi; int a_flags; } */ *ap = v; vnode_t *vp = ap->a_vp, *cvp; struct cnode *cp = VTOC(vp); int error; KASSERT(rw_write_held(vp->v_uobj.vmobjlock)); /* Check for control object. */ if (IS_CTL_VP(vp)) { rw_exit(vp->v_uobj.vmobjlock); #ifdef CODA_VERBOSE printf("%s: control object %p\n", __func__, vp); #endif return 0; } /* * If container object is not present, then there are no pages * to put; just return without error. This happens all the * time, apparently during discard of a closed vnode (which * trivially can't have dirty pages). */ cvp = cp->c_ovp; if (cvp == NULL) { rw_exit(vp->v_uobj.vmobjlock); return 0; } /* Munge the arg structure to refer to the container vnode. */ KASSERT(cvp->v_uobj.vmobjlock == vp->v_uobj.vmobjlock); ap->a_vp = cvp; /* Finally, call putpages on it. */ error = VCALL(ap->a_vp, VOFFSET(vop_putpages), ap); return error; }
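/*
 * Editor's sketch (not part of coda_vnops.c): the coda_readdir() loop above
 * walks a CODA_DIRBLKSIZ block of variable-length venus_dirent records read
 * from the container vnode, checking alignment and d_reclen before copying
 * each entry out with uiomove().  The stand-alone userland program below
 * illustrates the same kind of bounds-checked record walk over a toy record
 * format.  struct toy_dirent, walk_dirblock() and the sample buffer are
 * invented for illustration only; they are not the kernel structures or
 * functions, and the kernel loop performs additional alignment checks that
 * are omitted here.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_dirent {			/* simplified stand-in for a directory record */
	uint32_t d_fileno;		/* file serial number */
	uint16_t d_reclen;		/* total record length, including name */
	uint8_t  d_namlen;		/* length of d_name, not counting NUL */
	char     d_name[1];		/* actually d_namlen bytes plus NUL */
};

static int
walk_dirblock(const char *buf, size_t len)
{
	const char *dp = buf, *ep = buf + len;

	while (dp < ep) {
		const struct toy_dirent *d = (const void *)dp;

		/*
		 * Same flavour of sanity checks as the kernel loop:
		 * a record length of zero or one that runs past the end
		 * of the block means the block is corrupt.
		 */
		if (d->d_reclen == 0 || dp + d->d_reclen > ep)
			return -1;
		if (d->d_namlen != 0)
			printf("ino %u name %.*s\n",
			    (unsigned)d->d_fileno, (int)d->d_namlen,
			    d->d_name);
		dp += d->d_reclen;	/* advance to the next record */
	}
	return 0;
}

int
main(void)
{
	/* Build a single record by hand, purely for demonstration. */
	union {
		char bytes[64];
		struct toy_dirent align;	/* force suitable alignment */
	} blk;
	struct toy_dirent *d = (struct toy_dirent *)blk.bytes;

	memset(blk.bytes, 0, sizeof(blk.bytes));
	d->d_fileno = 2;
	d->d_namlen = 3;
	memcpy(d->d_name, "foo", 4);		/* name plus NUL */
	d->d_reclen = sizeof(blk.bytes);	/* one record fills the block */

	return walk_dirblock(blk.bytes, sizeof(blk.bytes)) == 0 ? 0 : 1;
}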
/* $NetBSD: kern_rwlock.c,v 1.76 2023/10/15 10:28:48 riastradh Exp $ */ /*- * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020,
2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe and Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Kernel reader/writer lock implementation, modeled after those * found in Solaris, a description of which can be found in: * * Solaris Internals: Core Kernel Architecture, Jim Mauro and * Richard McDougall. * * The NetBSD implementation differs from that described in the book, in * that the locks are partially adaptive. Lock waiters spin wait while a * lock is write held and the holder is still running on a CPU. The method * of choosing which threads to awaken when a lock is released also differs, * mainly to take account of the partially adaptive behaviour. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.76 2023/10/15 10:28:48 riastradh Exp $"); #include "opt_lockdebug.h" #define __RWLOCK_PRIVATE #include <sys/param.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/lock.h> #include <sys/lockdebug.h> #include <sys/proc.h> #include <sys/pserialize.h> #include <sys/rwlock.h> #include <sys/sched.h> #include <sys/sleepq.h> #include <sys/syncobj.h> #include <sys/systm.h> #include <dev/lockstat.h> #include <machine/rwlock.h> /* * LOCKDEBUG */ #define RW_DEBUG_P(rw) (((rw)->rw_owner & RW_NODEBUG) == 0) #define RW_WANTLOCK(rw, op) \ LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw), \ (uintptr_t)__builtin_return_address(0), op == RW_READER); #define RW_LOCKED(rw, op) \ LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL, \ (uintptr_t)__builtin_return_address(0), op == RW_READER); #define RW_UNLOCKED(rw, op) \ LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw), \ (uintptr_t)__builtin_return_address(0), op == RW_READER); /* * DIAGNOSTIC */ #if defined(DIAGNOSTIC) #define RW_ASSERT(rw, cond) \ do { \ if (__predict_false(!(cond))) \ rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\ } while (/* CONSTCOND */ 0) #else #define RW_ASSERT(rw, cond) /* nothing */ #endif /* DIAGNOSTIC */ /* * For platforms that do not provide stubs, or for the LOCKDEBUG case. 
*/ #ifdef LOCKDEBUG #undef __HAVE_RW_STUBS #endif #ifndef __HAVE_RW_STUBS __strong_alias(rw_enter,rw_vector_enter); __strong_alias(rw_exit,rw_vector_exit); __strong_alias(rw_tryenter,rw_vector_tryenter); #endif static void rw_abort(const char *, size_t, krwlock_t *, const char *); static void rw_dump(const volatile void *, lockop_printer_t); static lwp_t *rw_owner(wchan_t); lockops_t rwlock_lockops = { .lo_name = "Reader / writer lock", .lo_type = LOCKOPS_SLEEP, .lo_dump = rw_dump, }; /* * Give rwlock holders an extra-high priority boost on-blocking due to * direct handoff. XXX To be revisited. */ syncobj_t rw_syncobj = { .sobj_name = "rwlock", .sobj_flag = SOBJ_SLEEPQ_SORTED, .sobj_boostpri = PRI_KTHREAD, .sobj_unsleep = turnstile_unsleep, .sobj_changepri = turnstile_changepri, .sobj_lendpri = sleepq_lendpri, .sobj_owner = rw_owner, }; /* * rw_cas: * * Do an atomic compare-and-swap on the lock word. */ static inline uintptr_t rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n) { return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner, (void *)o, (void *)n); } /* * rw_swap: * * Do an atomic swap of the lock word. This is used only when it's * known that the lock word is set up such that it can't be changed * behind us (assert this), so there's no point considering the result. */ static inline void rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n) { n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner, (void *)n); RW_ASSERT(rw, n == o); RW_ASSERT(rw, (o & RW_HAS_WAITERS) != 0); } /* * rw_dump: * * Dump the contents of a rwlock structure. */ static void rw_dump(const volatile void *cookie, lockop_printer_t pr) { const volatile krwlock_t *rw = cookie; pr("owner/count : %#018lx flags : %#018x\n", (long)RW_OWNER(rw), (int)RW_FLAGS(rw)); } /* * rw_abort: * * Dump information about an error and panic the system. This * generates a lot of machine code in the DIAGNOSTIC case, so * we ask the compiler to not inline it. */ static void __noinline rw_abort(const char *func, size_t line, krwlock_t *rw, const char *msg) { if (__predict_false(panicstr != NULL)) return; LOCKDEBUG_ABORT(func, line, rw, &rwlock_lockops, msg); } /* * rw_init: * * Initialize a rwlock for use. */ void _rw_init(krwlock_t *rw, uintptr_t return_address) { #ifdef LOCKDEBUG /* XXX only because the assembly stubs can't handle RW_NODEBUG */ if (LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address)) rw->rw_owner = 0; else rw->rw_owner = RW_NODEBUG; #else rw->rw_owner = 0; #endif } void rw_init(krwlock_t *rw) { _rw_init(rw, (uintptr_t)__builtin_return_address(0)); } /* * rw_destroy: * * Tear down a rwlock. */ void rw_destroy(krwlock_t *rw) { RW_ASSERT(rw, (rw->rw_owner & ~RW_NODEBUG) == 0); LOCKDEBUG_FREE((rw->rw_owner & RW_NODEBUG) == 0, rw); } /* * rw_oncpu: * * Return true if an rwlock owner is running on a CPU in the system. * If the target is waiting on the kernel big lock, then we must * release it. This is necessary to avoid deadlock. */ static bool rw_oncpu(uintptr_t owner) { #ifdef MULTIPROCESSOR struct cpu_info *ci; lwp_t *l; KASSERT(kpreempt_disabled()); if ((owner & (RW_WRITE_LOCKED|RW_HAS_WAITERS)) != RW_WRITE_LOCKED) { return false; } /* * See lwp_dtor() why dereference of the LWP pointer is safe. * We must have kernel preemption disabled for that. */ l = (lwp_t *)(owner & RW_THREAD); ci = l->l_cpu; if (ci && ci->ci_curlwp == l) { /* Target is running; do we need to block? */ return (ci->ci_biglock_wanted != l); } #endif /* Not running. It may be safe to block now. 
*/ return false; } /* * rw_vector_enter: * * Acquire a rwlock. */ void rw_vector_enter(krwlock_t *rw, const krw_t op) { uintptr_t owner, incr, need_wait, set_wait, curthread, next; turnstile_t *ts; int queue; lwp_t *l; LOCKSTAT_TIMER(slptime); LOCKSTAT_TIMER(slpcnt); LOCKSTAT_TIMER(spintime); LOCKSTAT_COUNTER(spincnt); LOCKSTAT_FLAG(lsflag); l = curlwp; curthread = (uintptr_t)l; RW_ASSERT(rw, !cpu_intr_p()); RW_ASSERT(rw, curthread != 0); RW_WANTLOCK(rw, op); if (__predict_true(panicstr == NULL)) { KDASSERT(pserialize_not_in_read_section()); LOCKDEBUG_BARRIER(&kernel_lock, 1); } /* * We play a slight trick here. If we're a reader, we want * increment the read count. If we're a writer, we want to * set the owner field and the WRITE_LOCKED bit. * * In the latter case, we expect those bits to be zero, * therefore we can use an add operation to set them, which * means an add operation for both cases. */ if (__predict_true(op == RW_READER)) { incr = RW_READ_INCR; set_wait = RW_HAS_WAITERS; need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED; queue = TS_READER_Q; } else { RW_ASSERT(rw, op == RW_WRITER); incr = curthread | RW_WRITE_LOCKED; set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED; need_wait = RW_WRITE_LOCKED | RW_THREAD; queue = TS_WRITER_Q; } LOCKSTAT_ENTER(lsflag); KPREEMPT_DISABLE(curlwp); for (owner = rw->rw_owner;;) { /* * Read the lock owner field. If the need-to-wait * indicator is clear, then try to acquire the lock. */ if ((owner & need_wait) == 0) { next = rw_cas(rw, owner, (owner + incr) & ~RW_WRITE_WANTED); if (__predict_true(next == owner)) { /* Got it! */ membar_acquire(); break; } /* * Didn't get it -- spin around again (we'll * probably sleep on the next iteration). */ owner = next; continue; } if (__predict_false(RW_OWNER(rw) == curthread)) { rw_abort(__func__, __LINE__, rw, "locking against myself"); } /* * If the lock owner is running on another CPU, and * there are no existing waiters, then spin. */ if (rw_oncpu(owner)) { LOCKSTAT_START_TIMER(lsflag, spintime); u_int count = SPINLOCK_BACKOFF_MIN; do { KPREEMPT_ENABLE(curlwp); SPINLOCK_BACKOFF(count); KPREEMPT_DISABLE(curlwp); owner = rw->rw_owner; } while (rw_oncpu(owner)); LOCKSTAT_STOP_TIMER(lsflag, spintime); LOCKSTAT_COUNT(spincnt, 1); if ((owner & need_wait) == 0) continue; } /* * Grab the turnstile chain lock. Once we have that, we * can adjust the waiter bits and sleep queue. */ ts = turnstile_lookup(rw); /* * Mark the rwlock as having waiters. If the set fails, * then we may not need to sleep and should spin again. * Reload rw_owner because turnstile_lookup() may have * spun on the turnstile chain lock. */ owner = rw->rw_owner; if ((owner & need_wait) == 0 || rw_oncpu(owner)) { turnstile_exit(rw); continue; } next = rw_cas(rw, owner, owner | set_wait); /* XXX membar? */ if (__predict_false(next != owner)) { turnstile_exit(rw); owner = next; continue; } LOCKSTAT_START_TIMER(lsflag, slptime); turnstile_block(ts, queue, rw, &rw_syncobj); LOCKSTAT_STOP_TIMER(lsflag, slptime); LOCKSTAT_COUNT(slpcnt, 1); /* * No need for a memory barrier because of context switch. * If not handed the lock, then spin again. */ if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread) break; owner = rw->rw_owner; } KPREEMPT_ENABLE(curlwp); LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK | (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime, (l->l_rwcallsite != 0 ? l->l_rwcallsite : (uintptr_t)__builtin_return_address(0))); LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime, (l->l_rwcallsite != 0 ? 
l->l_rwcallsite : (uintptr_t)__builtin_return_address(0))); LOCKSTAT_EXIT(lsflag); RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) || (op == RW_READER && RW_COUNT(rw) != 0)); RW_LOCKED(rw, op); } /* * rw_vector_exit: * * Release a rwlock. */ void rw_vector_exit(krwlock_t *rw) { uintptr_t curthread, owner, decr, newown, next; turnstile_t *ts; int rcnt, wcnt; lwp_t *l; l = curlwp; curthread = (uintptr_t)l; RW_ASSERT(rw, curthread != 0); /* * Again, we use a trick. Since we used an add operation to * set the required lock bits, we can use a subtract to clear * them, which makes the read-release and write-release path * the same. */ owner = rw->rw_owner; if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) { RW_UNLOCKED(rw, RW_WRITER); RW_ASSERT(rw, RW_OWNER(rw) == curthread); decr = curthread | RW_WRITE_LOCKED; } else { RW_UNLOCKED(rw, RW_READER); RW_ASSERT(rw, RW_COUNT(rw) != 0); decr = RW_READ_INCR; } /* * Compute what we expect the new value of the lock to be. Only * proceed to do direct handoff if there are waiters, and if the * lock would become unowned. */ membar_release(); for (;;) { newown = (owner - decr); if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS) break; next = rw_cas(rw, owner, newown); if (__predict_true(next == owner)) return; owner = next; } /* * Grab the turnstile chain lock. This gets the interlock * on the sleep queue. Once we have that, we can adjust the * waiter bits. */ ts = turnstile_lookup(rw); owner = rw->rw_owner; RW_ASSERT(rw, ts != NULL); RW_ASSERT(rw, (owner & RW_HAS_WAITERS) != 0); wcnt = TS_WAITERS(ts, TS_WRITER_Q); rcnt = TS_WAITERS(ts, TS_READER_Q); /* * Give the lock away. * * If we are releasing a write lock, then prefer to wake all * outstanding readers. Otherwise, wake one writer if there * are outstanding readers, or all writers if there are no * pending readers. If waking one specific writer, the writer * is handed the lock here. If waking multiple writers, we * set WRITE_WANTED to block out new readers, and let them * do the work of acquiring the lock in rw_vector_enter(). */ if (rcnt == 0 || decr == RW_READ_INCR) { RW_ASSERT(rw, wcnt != 0); RW_ASSERT(rw, (owner & RW_WRITE_WANTED) != 0); if (rcnt != 0) { /* Give the lock to the longest waiting writer. */ l = TS_FIRST(ts, TS_WRITER_Q); newown = (uintptr_t)l | (owner & RW_NODEBUG); newown |= RW_WRITE_LOCKED | RW_HAS_WAITERS; if (wcnt > 1) newown |= RW_WRITE_WANTED; rw_swap(rw, owner, newown); turnstile_wakeup(ts, TS_WRITER_Q, 1, l); } else { /* Wake all writers and let them fight it out. */ newown = owner & RW_NODEBUG; newown |= RW_WRITE_WANTED; rw_swap(rw, owner, newown); turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL); } } else { RW_ASSERT(rw, rcnt != 0); /* * Give the lock to all blocked readers. If there * is a writer waiting, new readers that arrive * after the release will be blocked out. */ newown = owner & RW_NODEBUG; newown += rcnt << RW_READ_COUNT_SHIFT; if (wcnt != 0) newown |= RW_HAS_WAITERS | RW_WRITE_WANTED; /* Wake up all sleeping readers. */ rw_swap(rw, owner, newown); turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL); } } /* * rw_vector_tryenter: * * Try to acquire a rwlock. 
*/ int rw_vector_tryenter(krwlock_t *rw, const krw_t op) { uintptr_t curthread, owner, incr, need_wait, next; lwp_t *l; l = curlwp; curthread = (uintptr_t)l; RW_ASSERT(rw, curthread != 0); if (op == RW_READER) { incr = RW_READ_INCR; need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED; } else { RW_ASSERT(rw, op == RW_WRITER); incr = curthread | RW_WRITE_LOCKED; need_wait = RW_WRITE_LOCKED | RW_THREAD; } for (owner = rw->rw_owner;; owner = next) { if (__predict_false((owner & need_wait) != 0)) return 0; next = rw_cas(rw, owner, owner + incr); if (__predict_true(next == owner)) { /* Got it! */ break; } } RW_WANTLOCK(rw, op); RW_LOCKED(rw, op); RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) || (op == RW_READER && RW_COUNT(rw) != 0)); membar_acquire(); return 1; } /* * rw_downgrade: * * Downgrade a write lock to a read lock. */ void rw_downgrade(krwlock_t *rw) { uintptr_t owner, newown, next, curthread __diagused; turnstile_t *ts; int rcnt, wcnt; lwp_t *l; l = curlwp; curthread = (uintptr_t)l; RW_ASSERT(rw, curthread != 0); RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0); RW_ASSERT(rw, RW_OWNER(rw) == curthread); RW_UNLOCKED(rw, RW_WRITER); membar_release(); for (owner = rw->rw_owner;; owner = next) { /* * If there are no waiters we can do this the easy way. Try * swapping us down to one read hold. If it fails, the lock * condition has changed and we most likely now have * waiters. */ if ((owner & RW_HAS_WAITERS) == 0) { newown = (owner & RW_NODEBUG); next = rw_cas(rw, owner, newown + RW_READ_INCR); if (__predict_true(next == owner)) { RW_LOCKED(rw, RW_READER); RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0); RW_ASSERT(rw, RW_COUNT(rw) != 0); return; } continue; } /* * Grab the turnstile chain lock. This gets the interlock * on the sleep queue. Once we have that, we can adjust the * waiter bits. */ ts = turnstile_lookup(rw); RW_ASSERT(rw, ts != NULL); rcnt = TS_WAITERS(ts, TS_READER_Q); wcnt = TS_WAITERS(ts, TS_WRITER_Q); if (rcnt == 0) { /* * If there are no readers, just preserve the * waiters bits, swap us down to one read hold and * return. */ RW_ASSERT(rw, wcnt != 0); RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0); RW_ASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0); newown = owner & RW_NODEBUG; newown |= RW_READ_INCR | RW_HAS_WAITERS | RW_WRITE_WANTED; next = rw_cas(rw, owner, newown); turnstile_exit(rw); if (__predict_true(next == owner)) break; } else { /* * Give the lock to all blocked readers. We may * retain one read hold if downgrading. If there is * a writer waiting, new readers will be blocked * out. */ newown = owner & RW_NODEBUG; newown += (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR; if (wcnt != 0) newown |= RW_HAS_WAITERS | RW_WRITE_WANTED; next = rw_cas(rw, owner, newown); if (__predict_true(next == owner)) { /* Wake up all sleeping readers. */ turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL); break; } turnstile_exit(rw); } } RW_WANTLOCK(rw, RW_READER); RW_LOCKED(rw, RW_READER); RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0); RW_ASSERT(rw, RW_COUNT(rw) != 0); } /* * rw_tryupgrade: * * Try to upgrade a read lock to a write lock. We must be the only * reader. 
*/ int rw_tryupgrade(krwlock_t *rw) { uintptr_t owner, curthread, newown, next; struct lwp *l; l = curlwp; curthread = (uintptr_t)l; RW_ASSERT(rw, curthread != 0); RW_ASSERT(rw, rw_read_held(rw)); for (owner = RW_READ_INCR;; owner = next) { newown = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD); next = rw_cas(rw, owner, newown); if (__predict_true(next == owner)) { membar_acquire(); break; } RW_ASSERT(rw, (next & RW_WRITE_LOCKED) == 0); if (__predict_false((next & RW_THREAD) != RW_READ_INCR)) { RW_ASSERT(rw, (next & RW_THREAD) != 0); return 0; } } RW_UNLOCKED(rw, RW_READER); RW_WANTLOCK(rw, RW_WRITER); RW_LOCKED(rw, RW_WRITER); RW_ASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED); RW_ASSERT(rw, RW_OWNER(rw) == curthread); return 1; } /* * rw_read_held: * * Returns true if the rwlock is held for reading. Must only be * used for diagnostic assertions, and never be used to make * decisions about how to use a rwlock. */ int rw_read_held(krwlock_t *rw) { uintptr_t owner; if (rw == NULL) return 0; owner = rw->rw_owner; return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0; } /* * rw_write_held: * * Returns true if the rwlock is held for writing. Must only be * used for diagnostic assertions, and never be used to make * decisions about how to use a rwlock. */ int rw_write_held(krwlock_t *rw) { if (rw == NULL) return 0; return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) == (RW_WRITE_LOCKED | (uintptr_t)curlwp); } /* * rw_lock_held: * * Returns true if the rwlock is held for reading or writing. Must * only be used for diagnostic assertions, and never be used to make * decisions about how to use a rwlock. */ int rw_lock_held(krwlock_t *rw) { if (rw == NULL) return 0; return (rw->rw_owner & RW_THREAD) != 0; } /* * rw_lock_op: * * For a rwlock that is known to be held by the caller, return * RW_READER or RW_WRITER to describe the hold type. */ krw_t rw_lock_op(krwlock_t *rw) { RW_ASSERT(rw, rw_lock_held(rw)); return (rw->rw_owner & RW_WRITE_LOCKED) != 0 ? RW_WRITER : RW_READER; } /* * rw_owner: * * Return the current owner of an RW lock, but only if it is write * held. Used for priority inheritance. */ static lwp_t * rw_owner(wchan_t obj) { krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */ uintptr_t owner = rw->rw_owner; if ((owner & RW_WRITE_LOCKED) == 0) return NULL; return (void *)(owner & RW_THREAD); }
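/*
 * Example (editorial, not part of kern_rwlock.c): a minimal sketch of how
 * the lock implemented above is consumed, assuming <sys/rwlock.h> and the
 * standard rwlock(9) entry points rw_init(), rw_enter(), rw_exit(),
 * rw_tryupgrade() and rw_downgrade(), whose slow paths appear above.  The
 * example_cache structure and functions are hypothetical.  Note that
 * rw_tryupgrade() fails unless the caller is the only reader, so the
 * fallback is to drop the read hold and reacquire as a writer.
 */
#include <sys/rwlock.h>

struct example_cache {
	krwlock_t	ec_lock;	/* guards ec_nitems */
	int		ec_nitems;
};

static void
example_cache_init(struct example_cache *ec)
{

	rw_init(&ec->ec_lock);
	ec->ec_nitems = 0;
}

static void
example_cache_prime(struct example_cache *ec)
{

	rw_enter(&ec->ec_lock, RW_READER);
	if (ec->ec_nitems == 0) {
		/* Need to modify: try to upgrade the read hold in place. */
		if (!rw_tryupgrade(&ec->ec_lock)) {
			/* Other readers exist; restart as a writer. */
			rw_exit(&ec->ec_lock);
			rw_enter(&ec->ec_lock, RW_WRITER);
		}
		if (ec->ec_nitems == 0)		/* re-check after reacquiring */
			ec->ec_nitems = 1;
		/* Hand the lock back to readers without fully releasing it. */
		rw_downgrade(&ec->ec_lock);
	}
	rw_exit(&ec->ec_lock);
}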
/*	$NetBSD: if_stats.c,v 1.4 2021/06/29 21:19:58 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_stats.c,v 1.4 2021/06/29 21:19:58 riastradh Exp $");

#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/xcall.h>

#include <net/if.h>

#define	IF_STATS_SIZE	(sizeof(uint64_t) * IF_NSTATS)

/*
 * if_stats_init --
 *	Initialize statistics storage for a network interface.
 */
void
if_stats_init(ifnet_t * const ifp)
{
	ifp->if_stats = percpu_alloc(IF_STATS_SIZE);
}

/*
 * if_stats_fini --
 *	Tear down statistics storage for a network interface.
 */
void
if_stats_fini(ifnet_t * const ifp)
{
	percpu_t *pc = ifp->if_stats;

	ifp->if_stats = NULL;
	if (pc) {
		percpu_free(pc, IF_STATS_SIZE);
	}
}

struct if_stats_to_if_data_ctx {
	struct if_data * const ifi;
	const bool zero_stats;
};

static void
if_stats_to_if_data_cb(void *v1, void *v2, struct cpu_info *ci)
{
	const uint64_t * const local_counters = v1;
	struct if_stats_to_if_data_ctx *ctx = v2;

	int s = splnet();

	if (ctx->ifi) {
		ctx->ifi->ifi_ipackets += local_counters[if_ipackets];
		ctx->ifi->ifi_ierrors += local_counters[if_ierrors];
		ctx->ifi->ifi_opackets += local_counters[if_opackets];
		ctx->ifi->ifi_oerrors += local_counters[if_oerrors];
		ctx->ifi->ifi_collisions += local_counters[if_collisions];
		ctx->ifi->ifi_ibytes += local_counters[if_ibytes];
		ctx->ifi->ifi_obytes += local_counters[if_obytes];
		ctx->ifi->ifi_imcasts += local_counters[if_imcasts];
		ctx->ifi->ifi_omcasts += local_counters[if_omcasts];
		ctx->ifi->ifi_iqdrops += local_counters[if_iqdrops];
		ctx->ifi->ifi_noproto += local_counters[if_noproto];
	}

	if (ctx->zero_stats) {
		memset(v1, 0, IF_STATS_SIZE);
	}

	splx(s);
}

/*
 * if_stats_to_if_data --
 *	Collect the interface statistics and place them into the
 *	legacy if_data structure for reporting to user space.
 *	Optionally zeros the stats after collection.
 */
void
if_stats_to_if_data(ifnet_t * const ifp, struct if_data * const ifi,
    const bool zero_stats)
{
	struct if_stats_to_if_data_ctx ctx = {
		.ifi = ifi,
		.zero_stats = zero_stats,
	};

	memset(ifi, 0, sizeof(*ifi));

	percpu_foreach_xcall(ifp->if_stats, XC_HIGHPRI_IPL(IPL_SOFTNET),
	    if_stats_to_if_data_cb, &ctx);
}
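/*
 * Example (editorial, not part of if_stats.c): a minimal sketch of how a
 * driver consumes the per-CPU storage initialized above, assuming this
 * file's includes.  The example_* functions are hypothetical;
 * if_statadd2() is the same accessor used by if_loop.c later in this
 * document, and if_stats_to_if_data() is defined above.  Counters are
 * bumped on the local CPU with no lock on the hot path and are only
 * folded into the legacy struct if_data when a report is requested.
 */
static void
example_rxeof(struct ifnet *ifp, struct mbuf *m)
{

	/* Cheap per-CPU update; no global lock or atomic op is needed. */
	if_statadd2(ifp, if_ipackets, 1, if_ibytes, m->m_pkthdr.len);
	m_freem(m);		/* hand-off to the stack elided for brevity */
}

static void
example_report(struct ifnet *ifp)
{
	struct if_data ifd;

	/* Cross-call every CPU and sum its counters; false == don't zero. */
	if_stats_to_if_data(ifp, &ifd, false);
	printf("%s: %llu packets received\n", ifp->if_xname,
	    (unsigned long long)ifd.ifi_ipackets);
}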
/*	$NetBSD: uvm_aobj.c,v 1.157 2023/02/24 11:03:13 riastradh Exp $	*/

/*
 * Copyright (c) 1998 Chuck Silvers, Charles D. Cranor and
 * Washington University.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: Id: uvm_aobj.c,v 1.1.2.5 1998/02/06 05:14:38 chs Exp */ /* * uvm_aobj.c: anonymous memory uvm_object pager * * author: Chuck Silvers <chuq@chuq.com> * started: Jan-1998 * * - design mostly from Chuck Cranor */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_aobj.c,v 1.157 2023/02/24 11:03:13 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_uvmhist.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/pool.h> #include <sys/atomic.h> #include <uvm/uvm.h> #include <uvm/uvm_page_array.h> /* * An anonymous UVM object (aobj) manages anonymous-memory. In addition to * keeping the list of resident pages, it may also keep a list of allocated * swap blocks. Depending on the size of the object, this list is either * stored in an array (small objects) or in a hash table (large objects). * * Lock order * * uao_list_lock -> * uvm_object::vmobjlock */ /* * Note: for hash tables, we break the address space of the aobj into blocks * of UAO_SWHASH_CLUSTER_SIZE pages, which shall be a power of two. */ #define UAO_SWHASH_CLUSTER_SHIFT 4 #define UAO_SWHASH_CLUSTER_SIZE (1 << UAO_SWHASH_CLUSTER_SHIFT) /* Get the "tag" for this page index. */ #define UAO_SWHASH_ELT_TAG(idx) ((idx) >> UAO_SWHASH_CLUSTER_SHIFT) #define UAO_SWHASH_ELT_PAGESLOT_IDX(idx) \ ((idx) & (UAO_SWHASH_CLUSTER_SIZE - 1)) /* Given an ELT and a page index, find the swap slot. */ #define UAO_SWHASH_ELT_PAGESLOT(elt, idx) \ ((elt)->slots[UAO_SWHASH_ELT_PAGESLOT_IDX(idx)]) /* Given an ELT, return its pageidx base. */ #define UAO_SWHASH_ELT_PAGEIDX_BASE(ELT) \ ((elt)->tag << UAO_SWHASH_CLUSTER_SHIFT) /* The hash function. */ #define UAO_SWHASH_HASH(aobj, idx) \ (&(aobj)->u_swhash[(((idx) >> UAO_SWHASH_CLUSTER_SHIFT) \ & (aobj)->u_swhashmask)]) /* * The threshold which determines whether we will use an array or a * hash table to store the list of allocated swap blocks. */ #define UAO_SWHASH_THRESHOLD (UAO_SWHASH_CLUSTER_SIZE * 4) #define UAO_USES_SWHASH(aobj) \ ((aobj)->u_pages > UAO_SWHASH_THRESHOLD) /* The number of buckets in a hash, with an upper bound. */ #define UAO_SWHASH_MAXBUCKETS 256 #define UAO_SWHASH_BUCKETS(aobj) \ (MIN((aobj)->u_pages >> UAO_SWHASH_CLUSTER_SHIFT, UAO_SWHASH_MAXBUCKETS)) /* * uao_swhash_elt: when a hash table is being used, this structure defines * the format of an entry in the bucket list. */ struct uao_swhash_elt { LIST_ENTRY(uao_swhash_elt) list; /* the hash list */ voff_t tag; /* our 'tag' */ int count; /* our number of active slots */ int slots[UAO_SWHASH_CLUSTER_SIZE]; /* the slots */ }; /* * uao_swhash: the swap hash table structure */ LIST_HEAD(uao_swhash, uao_swhash_elt); /* * uao_swhash_elt_pool: pool of uao_swhash_elt structures. * Note: pages for this pool must not come from a pageable kernel map. 
*/ static struct pool uao_swhash_elt_pool __cacheline_aligned; /* * uvm_aobj: the actual anon-backed uvm_object * * => the uvm_object is at the top of the structure, this allows * (struct uvm_aobj *) == (struct uvm_object *) * => only one of u_swslots and u_swhash is used in any given aobj */ struct uvm_aobj { struct uvm_object u_obj; /* has: lock, pgops, #pages, #refs */ pgoff_t u_pages; /* number of pages in entire object */ int u_flags; /* the flags (see uvm_aobj.h) */ int *u_swslots; /* array of offset->swapslot mappings */ /* * hashtable of offset->swapslot mappings * (u_swhash is an array of bucket heads) */ struct uao_swhash *u_swhash; u_long u_swhashmask; /* mask for hashtable */ LIST_ENTRY(uvm_aobj) u_list; /* global list of aobjs */ int u_freelist; /* freelist to allocate pages from */ }; static void uao_free(struct uvm_aobj *); static int uao_get(struct uvm_object *, voff_t, struct vm_page **, int *, int, vm_prot_t, int, int); static int uao_put(struct uvm_object *, voff_t, voff_t, int); #if defined(VMSWAP) static struct uao_swhash_elt *uao_find_swhash_elt (struct uvm_aobj *, int, bool); static bool uao_pagein(struct uvm_aobj *, int, int); static bool uao_pagein_page(struct uvm_aobj *, int); #endif /* defined(VMSWAP) */ static struct vm_page *uao_pagealloc(struct uvm_object *, voff_t, int); /* * aobj_pager * * note that some functions (e.g. put) are handled elsewhere */ const struct uvm_pagerops aobj_pager = { .pgo_reference = uao_reference, .pgo_detach = uao_detach, .pgo_get = uao_get, .pgo_put = uao_put, }; /* * uao_list: global list of active aobjs, locked by uao_list_lock */ static LIST_HEAD(aobjlist, uvm_aobj) uao_list __cacheline_aligned; static kmutex_t uao_list_lock __cacheline_aligned; /* * hash table/array related functions */ #if defined(VMSWAP) /* * uao_find_swhash_elt: find (or create) a hash table entry for a page * offset. * * => the object should be locked by the caller */ static struct uao_swhash_elt * uao_find_swhash_elt(struct uvm_aobj *aobj, int pageidx, bool create) { struct uao_swhash *swhash; struct uao_swhash_elt *elt; voff_t page_tag; swhash = UAO_SWHASH_HASH(aobj, pageidx); page_tag = UAO_SWHASH_ELT_TAG(pageidx); /* * now search the bucket for the requested tag */ LIST_FOREACH(elt, swhash, list) { if (elt->tag == page_tag) { return elt; } } if (!create) { return NULL; } /* * allocate a new entry for the bucket and init/insert it in */ elt = pool_get(&uao_swhash_elt_pool, PR_NOWAIT); if (elt == NULL) { return NULL; } LIST_INSERT_HEAD(swhash, elt, list); elt->tag = page_tag; elt->count = 0; memset(elt->slots, 0, sizeof(elt->slots)); return elt; } /* * uao_find_swslot: find the swap slot number for an aobj/pageidx * * => object must be locked by caller */ int uao_find_swslot(struct uvm_object *uobj, int pageidx) { struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; struct uao_swhash_elt *elt; KASSERT(UVM_OBJ_IS_AOBJ(uobj)); /* * if noswap flag is set, then we never return a slot */ if (aobj->u_flags & UAO_FLAG_NOSWAP) return 0; /* * if hashing, look in hash table. */ if (UAO_USES_SWHASH(aobj)) { elt = uao_find_swhash_elt(aobj, pageidx, false); return elt ? UAO_SWHASH_ELT_PAGESLOT(elt, pageidx) : 0; } /* * otherwise, look in the array */ return aobj->u_swslots[pageidx]; } /* * uao_set_swslot: set the swap slot for a page in an aobj. 
* * => setting a slot to zero frees the slot * => object must be locked by caller * => we return the old slot number, or -1 if we failed to allocate * memory to record the new slot number */ int uao_set_swslot(struct uvm_object *uobj, int pageidx, int slot) { struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; struct uao_swhash_elt *elt; int oldslot; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(pdhist, "aobj %#jx pageidx %jd slot %jd", (uintptr_t)aobj, pageidx, slot, 0); KASSERT(rw_write_held(uobj->vmobjlock) || uobj->uo_refs == 0); KASSERT(UVM_OBJ_IS_AOBJ(uobj)); /* * if noswap flag is set, then we can't set a non-zero slot. */ if (aobj->u_flags & UAO_FLAG_NOSWAP) { KASSERTMSG(slot == 0, "uao_set_swslot: no swap object"); return 0; } /* * are we using a hash table? if so, add it in the hash. */ if (UAO_USES_SWHASH(aobj)) { /* * Avoid allocating an entry just to free it again if * the page had not swap slot in the first place, and * we are freeing. */ elt = uao_find_swhash_elt(aobj, pageidx, slot != 0); if (elt == NULL) { return slot ? -1 : 0; } oldslot = UAO_SWHASH_ELT_PAGESLOT(elt, pageidx); UAO_SWHASH_ELT_PAGESLOT(elt, pageidx) = slot; /* * now adjust the elt's reference counter and free it if we've * dropped it to zero. */ if (slot) { if (oldslot == 0) elt->count++; } else { if (oldslot) elt->count--; if (elt->count == 0) { LIST_REMOVE(elt, list); pool_put(&uao_swhash_elt_pool, elt); } } } else { /* we are using an array */ oldslot = aobj->u_swslots[pageidx]; aobj->u_swslots[pageidx] = slot; } return oldslot; } #endif /* defined(VMSWAP) */ /* * end of hash/array functions */ /* * uao_free: free all resources held by an aobj, and then free the aobj * * => the aobj should be dead */ static void uao_free(struct uvm_aobj *aobj) { struct uvm_object *uobj = &aobj->u_obj; KASSERT(UVM_OBJ_IS_AOBJ(uobj)); KASSERT(rw_write_held(uobj->vmobjlock)); uao_dropswap_range(uobj, 0, 0); rw_exit(uobj->vmobjlock); #if defined(VMSWAP) if (UAO_USES_SWHASH(aobj)) { /* * free the hash table itself. */ hashdone(aobj->u_swhash, HASH_LIST, aobj->u_swhashmask); } else { /* * free the array itself. */ kmem_free(aobj->u_swslots, aobj->u_pages * sizeof(int)); } #endif /* defined(VMSWAP) */ /* * finally free the aobj itself */ uvm_obj_destroy(uobj, true); kmem_free(aobj, sizeof(struct uvm_aobj)); } /* * pager functions */ /* * uao_create: create an aobj of the given size and return its uvm_object. * * => for normal use, flags are always zero * => for the kernel object, the flags are: * UAO_FLAG_KERNOBJ - allocate the kernel object (can only happen once) * UAO_FLAG_KERNSWAP - enable swapping of kernel object (" ") */ struct uvm_object * uao_create(voff_t size, int flags) { static struct uvm_aobj kernel_object_store; static krwlock_t bootstrap_kernel_object_lock; static int kobj_alloced __diagused = 0; pgoff_t pages = round_page((uint64_t)size) >> PAGE_SHIFT; struct uvm_aobj *aobj; int refs; /* * Allocate a new aobj, unless kernel object is requested. 
*/ if (flags & UAO_FLAG_KERNOBJ) { KASSERT(!kobj_alloced); aobj = &kernel_object_store; aobj->u_pages = pages; aobj->u_flags = UAO_FLAG_NOSWAP; refs = UVM_OBJ_KERN; kobj_alloced = UAO_FLAG_KERNOBJ; } else if (flags & UAO_FLAG_KERNSWAP) { KASSERT(kobj_alloced == UAO_FLAG_KERNOBJ); aobj = &kernel_object_store; kobj_alloced = UAO_FLAG_KERNSWAP; refs = 0xdeadbeaf; /* XXX: gcc */ } else { aobj = kmem_alloc(sizeof(struct uvm_aobj), KM_SLEEP); aobj->u_pages = pages; aobj->u_flags = 0; refs = 1; } /* * no freelist by default */ aobj->u_freelist = VM_NFREELIST; /* * allocate hash/array if necessary * * note: in the KERNSWAP case no need to worry about locking since * we are still booting we should be the only thread around. */ const int kernswap = (flags & UAO_FLAG_KERNSWAP) != 0; if (flags == 0 || kernswap) { #if defined(VMSWAP) /* allocate hash table or array depending on object size */ if (UAO_USES_SWHASH(aobj)) { aobj->u_swhash = hashinit(UAO_SWHASH_BUCKETS(aobj), HASH_LIST, true, &aobj->u_swhashmask); } else { aobj->u_swslots = kmem_zalloc(pages * sizeof(int), KM_SLEEP); } #endif /* defined(VMSWAP) */ /* * Replace kernel_object's temporary static lock with * a regular rw_obj. We cannot use uvm_obj_setlock() * because that would try to free the old lock. */ if (kernswap) { aobj->u_obj.vmobjlock = rw_obj_alloc(); rw_destroy(&bootstrap_kernel_object_lock); } if (flags) { aobj->u_flags &= ~UAO_FLAG_NOSWAP; /* clear noswap */ return &aobj->u_obj; } } /* * Initialise UVM object. */ const bool kernobj = (flags & UAO_FLAG_KERNOBJ) != 0; uvm_obj_init(&aobj->u_obj, &aobj_pager, !kernobj, refs); if (__predict_false(kernobj)) { /* Use a temporary static lock for kernel_object. */ rw_init(&bootstrap_kernel_object_lock); uvm_obj_setlock(&aobj->u_obj, &bootstrap_kernel_object_lock); } /* * now that aobj is ready, add it to the global list */ mutex_enter(&uao_list_lock); LIST_INSERT_HEAD(&uao_list, aobj, u_list); mutex_exit(&uao_list_lock); return(&aobj->u_obj); } /* * uao_set_pgfl: allocate pages only from the specified freelist. * * => must be called before any pages are allocated for the object. * => reset by setting it to VM_NFREELIST, meaning any freelist. */ void uao_set_pgfl(struct uvm_object *uobj, int freelist) { struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; KASSERTMSG((0 <= freelist), "invalid freelist %d", freelist); KASSERTMSG((freelist <= VM_NFREELIST), "invalid freelist %d", freelist); aobj->u_freelist = freelist; } /* * uao_pagealloc: allocate a page for aobj. */ static inline struct vm_page * uao_pagealloc(struct uvm_object *uobj, voff_t offset, int flags) { struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; if (__predict_true(aobj->u_freelist == VM_NFREELIST)) return uvm_pagealloc(uobj, offset, NULL, flags); else return uvm_pagealloc_strat(uobj, offset, NULL, flags, UVM_PGA_STRAT_ONLY, aobj->u_freelist); } /* * uao_init: set up aobj pager subsystem * * => called at boot time from uvm_pager_init() */ void uao_init(void) { static int uao_initialized; if (uao_initialized) return; uao_initialized = true; LIST_INIT(&uao_list); mutex_init(&uao_list_lock, MUTEX_DEFAULT, IPL_NONE); pool_init(&uao_swhash_elt_pool, sizeof(struct uao_swhash_elt), 0, 0, 0, "uaoeltpl", NULL, IPL_VM); } /* * uao_reference: hold a reference to an anonymous UVM object. */ void uao_reference(struct uvm_object *uobj) { /* Kernel object is persistent. */ if (UVM_OBJ_IS_KERN_OBJECT(uobj)) { return; } atomic_inc_uint(&uobj->uo_refs); } /* * uao_detach: drop a reference to an anonymous UVM object. 
*/ void uao_detach(struct uvm_object *uobj) { struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; struct uvm_page_array a; struct vm_page *pg; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); /* * Detaching from kernel object is a NOP. */ if (UVM_OBJ_IS_KERN_OBJECT(uobj)) return; /* * Drop the reference. If it was the last one, destroy the object. */ KASSERT(uobj->uo_refs > 0); UVMHIST_LOG(maphist," (uobj=%#jx) ref=%jd", (uintptr_t)uobj, uobj->uo_refs, 0, 0); membar_release(); if (atomic_dec_uint_nv(&uobj->uo_refs) > 0) { UVMHIST_LOG(maphist, "<- done (rc>0)", 0,0,0,0); return; } membar_acquire(); /* * Remove the aobj from the global list. */ mutex_enter(&uao_list_lock); LIST_REMOVE(aobj, u_list); mutex_exit(&uao_list_lock); /* * Free all the pages left in the aobj. For each page, when the * page is no longer busy (and thus after any disk I/O that it is * involved in is complete), release any swap resources and free * the page itself. */ uvm_page_array_init(&a, uobj, 0); rw_enter(uobj->vmobjlock, RW_WRITER); while ((pg = uvm_page_array_fill_and_peek(&a, 0, 0)) != NULL) { uvm_page_array_advance(&a); pmap_page_protect(pg, VM_PROT_NONE); if (pg->flags & PG_BUSY) { uvm_pagewait(pg, uobj->vmobjlock, "uao_det"); uvm_page_array_clear(&a); rw_enter(uobj->vmobjlock, RW_WRITER); continue; } uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT); uvm_pagefree(pg); } uvm_page_array_fini(&a); /* * Finally, free the anonymous UVM object itself. */ uao_free(aobj); } /* * uao_put: flush pages out of a uvm object * * => object should be locked by caller. we may _unlock_ the object * if (and only if) we need to clean a page (PGO_CLEANIT). * XXXJRT Currently, however, we don't. In the case of cleaning * XXXJRT a page, we simply just deactivate it. Should probably * XXXJRT handle this better, in the future (although "flushing" * XXXJRT anonymous memory isn't terribly important). * => if PGO_CLEANIT is not set, then we will neither unlock the object * or block. * => if PGO_ALLPAGE is set, then all pages in the object are valid targets * for flushing. * => we return 0 unless we encountered some sort of I/O error * XXXJRT currently never happens, as we never directly initiate * XXXJRT I/O */ static int uao_put(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) { struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; struct uvm_page_array a; struct vm_page *pg; voff_t curoff; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(UVM_OBJ_IS_AOBJ(uobj)); KASSERT(rw_write_held(uobj->vmobjlock)); if (flags & PGO_ALLPAGES) { start = 0; stop = aobj->u_pages << PAGE_SHIFT; } else { start = trunc_page(start); if (stop == 0) { stop = aobj->u_pages << PAGE_SHIFT; } else { stop = round_page(stop); } if (stop > (uint64_t)(aobj->u_pages << PAGE_SHIFT)) { printf("uao_put: strange, got an out of range " "flush %#jx > %#jx (fixed)\n", (uintmax_t)stop, (uintmax_t)(aobj->u_pages << PAGE_SHIFT)); stop = aobj->u_pages << PAGE_SHIFT; } } UVMHIST_LOG(maphist, " flush start=%#jx, stop=%#jx, flags=%#jx", start, stop, flags, 0); /* * Don't need to do any work here if we're not freeing * or deactivating pages. */ if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) { rw_exit(uobj->vmobjlock); return 0; } /* locked: uobj */ uvm_page_array_init(&a, uobj, 0); curoff = start; while ((pg = uvm_page_array_fill_and_peek(&a, curoff, 0)) != NULL) { if (pg->offset >= stop) { break; } /* * wait and try again if the page is busy. 
*/ if (pg->flags & PG_BUSY) { uvm_pagewait(pg, uobj->vmobjlock, "uao_put"); uvm_page_array_clear(&a); rw_enter(uobj->vmobjlock, RW_WRITER); continue; } uvm_page_array_advance(&a); curoff = pg->offset + PAGE_SIZE; switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) { /* * XXX In these first 3 cases, we always just * XXX deactivate the page. We may want to * XXX handle the different cases more specifically * XXX in the future. */ case PGO_CLEANIT|PGO_FREE: case PGO_CLEANIT|PGO_DEACTIVATE: case PGO_DEACTIVATE: deactivate_it: uvm_pagelock(pg); uvm_pagedeactivate(pg); uvm_pageunlock(pg); break; case PGO_FREE: /* * If there are multiple references to * the object, just deactivate the page. */ if (uobj->uo_refs > 1) goto deactivate_it; /* * free the swap slot and the page. */ pmap_page_protect(pg, VM_PROT_NONE); /* * freeing swapslot here is not strictly necessary. * however, leaving it here doesn't save much * because we need to update swap accounting anyway. */ uao_dropswap(uobj, pg->offset >> PAGE_SHIFT); uvm_pagefree(pg); break; default: panic("%s: impossible", __func__); } } rw_exit(uobj->vmobjlock); uvm_page_array_fini(&a); return 0; } /* * uao_get: fetch me a page * * we have three cases: * 1: page is resident -> just return the page. * 2: page is zero-fill -> allocate a new page and zero it. * 3: page is swapped out -> fetch the page from swap. * * case 1 can be handled with PGO_LOCKED, cases 2 and 3 cannot. * so, if the "center" page hits case 2/3 then we will need to return EBUSY. * * => prefer map unlocked (not required) * => object must be locked! we will _unlock_ it before starting any I/O. * => flags: PGO_LOCKED: fault data structures are locked * => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx] * => NOTE: caller must check for released pages!! */ static int uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags) { voff_t current_offset; struct vm_page *ptmp; int lcv, gotpages, maxpages, swslot, pageidx; bool overwrite = ((flags & PGO_OVERWRITE) != 0); struct uvm_page_array a; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(pdhist, "aobj=%#jx offset=%jd, flags=%#jx", (uintptr_t)uobj, offset, flags,0); /* * the object must be locked. it can only be a read lock when * processing a read fault with PGO_LOCKED. */ KASSERT(UVM_OBJ_IS_AOBJ(uobj)); KASSERT(rw_lock_held(uobj->vmobjlock)); KASSERT(rw_write_held(uobj->vmobjlock) || ((flags & PGO_LOCKED) != 0 && (access_type & VM_PROT_WRITE) == 0)); /* * get number of pages */ maxpages = *npagesp; /* * step 1: handled the case where fault data structures are locked. */ if (flags & PGO_LOCKED) { /* * step 1a: get pages that are already resident. only do * this if the data structures are locked (i.e. the first * time through). */ uvm_page_array_init(&a, uobj, 0); gotpages = 0; /* # of pages we got so far */ for (lcv = 0; lcv < maxpages; lcv++) { ptmp = uvm_page_array_fill_and_peek(&a, offset + (lcv << PAGE_SHIFT), maxpages); if (ptmp == NULL) { break; } KASSERT(ptmp->offset >= offset); lcv = (ptmp->offset - offset) >> PAGE_SHIFT; if (lcv >= maxpages) { break; } uvm_page_array_advance(&a); /* * to be useful must get a non-busy page */ if ((ptmp->flags & PG_BUSY) != 0) { continue; } /* * useful page: plug it in our result array */ KASSERT(uvm_pagegetdirty(ptmp) != UVM_PAGE_STATUS_CLEAN); pps[lcv] = ptmp; gotpages++; } uvm_page_array_fini(&a); /* * step 1b: now we've either done everything needed or we * to unlock and do some waiting or I/O. 
*/ UVMHIST_LOG(pdhist, "<- done (done=%jd)", (pps[centeridx] != NULL), 0,0,0); *npagesp = gotpages; return pps[centeridx] != NULL ? 0 : EBUSY; } /* * step 2: get non-resident or busy pages. * object is locked. data structures are unlocked. */ if ((flags & PGO_SYNCIO) == 0) { goto done; } uvm_page_array_init(&a, uobj, 0); for (lcv = 0, current_offset = offset ; lcv < maxpages ;) { /* * we have yet to locate the current page (pps[lcv]). we * first look for a page that is already at the current offset. * if we find a page, we check to see if it is busy or * released. if that is the case, then we sleep on the page * until it is no longer busy or released and repeat the lookup. * if the page we found is neither busy nor released, then we * busy it (so we own it) and plug it into pps[lcv]. we are * ready to move on to the next page. */ ptmp = uvm_page_array_fill_and_peek(&a, current_offset, maxpages - lcv); if (ptmp != NULL && ptmp->offset == current_offset) { /* page is there, see if we need to wait on it */ if ((ptmp->flags & PG_BUSY) != 0) { UVMHIST_LOG(pdhist, "sleeping, ptmp->flags %#jx\n", ptmp->flags,0,0,0); uvm_pagewait(ptmp, uobj->vmobjlock, "uao_get"); rw_enter(uobj->vmobjlock, RW_WRITER); uvm_page_array_clear(&a); continue; } /* * if we get here then the page is resident and * unbusy. we busy it now (so we own it). if * overwriting, mark the page dirty up front as * it will be zapped via an unmanaged mapping. */ KASSERT(uvm_pagegetdirty(ptmp) != UVM_PAGE_STATUS_CLEAN); if (overwrite) { uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_DIRTY); } /* we own it, caller must un-busy */ ptmp->flags |= PG_BUSY; UVM_PAGE_OWN(ptmp, "uao_get2"); pps[lcv++] = ptmp; current_offset += PAGE_SIZE; uvm_page_array_advance(&a); continue; } else { KASSERT(ptmp == NULL || ptmp->offset > current_offset); } /* * not resident. allocate a new busy/fake/clean page in the * object. if it's in swap we need to do I/O to fill in the * data, otherwise the page needs to be cleared: if it's not * destined to be overwritten, then zero it here and now. */ pageidx = current_offset >> PAGE_SHIFT; swslot = uao_find_swslot(uobj, pageidx); ptmp = uao_pagealloc(uobj, current_offset, swslot != 0 || overwrite ? 0 : UVM_PGA_ZERO); /* out of RAM? */ if (ptmp == NULL) { rw_exit(uobj->vmobjlock); UVMHIST_LOG(pdhist, "sleeping, ptmp == NULL",0,0,0,0); uvm_wait("uao_getpage"); rw_enter(uobj->vmobjlock, RW_WRITER); uvm_page_array_clear(&a); continue; } /* * if swslot == 0, page hasn't existed before and is zeroed. * otherwise we have a "fake/busy/clean" page that we just * allocated. do the needed "i/o", reading from swap. */ if (swslot != 0) { #if defined(VMSWAP) int error; UVMHIST_LOG(pdhist, "pagein from swslot %jd", swslot, 0,0,0); /* * page in the swapped-out page. * unlock object for i/o, relock when done. */ uvm_page_array_clear(&a); rw_exit(uobj->vmobjlock); error = uvm_swap_get(ptmp, swslot, PGO_SYNCIO); rw_enter(uobj->vmobjlock, RW_WRITER); /* * I/O done. check for errors. */ if (error != 0) { UVMHIST_LOG(pdhist, "<- done (error=%jd)", error,0,0,0); /* * remove the swap slot from the aobj * and mark the aobj as having no real slot. * don't free the swap slot, thus preventing * it from being used again. 
*/ swslot = uao_set_swslot(uobj, pageidx, SWSLOT_BAD); if (swslot > 0) { uvm_swap_markbad(swslot, 1); } uvm_pagefree(ptmp); rw_exit(uobj->vmobjlock); UVMHIST_LOG(pdhist, "<- done (error)", error,lcv,0,0); if (lcv != 0) { uvm_page_unbusy(pps, lcv); } memset(pps, 0, maxpages * sizeof(pps[0])); uvm_page_array_fini(&a); return error; } #else /* defined(VMSWAP) */ panic("%s: pagein", __func__); #endif /* defined(VMSWAP) */ } /* * note that we will allow the page being writably-mapped * (!PG_RDONLY) regardless of access_type. if overwrite, * the page can be modified through an unmanaged mapping * so mark it dirty up front. */ if (overwrite) { uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_DIRTY); } else { uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_UNKNOWN); } /* * we got the page! clear the fake flag (indicates valid * data now in page) and plug into our result array. note * that page is still busy. * * it is the callers job to: * => check if the page is released * => unbusy the page * => activate the page */ KASSERT(uvm_pagegetdirty(ptmp) != UVM_PAGE_STATUS_CLEAN); KASSERT((ptmp->flags & PG_FAKE) != 0); KASSERT(ptmp->offset == current_offset); ptmp->flags &= ~PG_FAKE; pps[lcv++] = ptmp; current_offset += PAGE_SIZE; } uvm_page_array_fini(&a); /* * finally, unlock object and return. */ done: rw_exit(uobj->vmobjlock); UVMHIST_LOG(pdhist, "<- done (OK)",0,0,0,0); return 0; } #if defined(VMSWAP) /* * uao_dropswap: release any swap resources from this aobj page. * * => aobj must be locked or have a reference count of 0. */ void uao_dropswap(struct uvm_object *uobj, int pageidx) { int slot; KASSERT(UVM_OBJ_IS_AOBJ(uobj)); slot = uao_set_swslot(uobj, pageidx, 0); if (slot) { uvm_swap_free(slot, 1); } } /* * page in every page in every aobj that is paged-out to a range of swslots. * * => nothing should be locked. * => returns true if pagein was aborted due to lack of memory. */ bool uao_swap_off(int startslot, int endslot) { struct uvm_aobj *aobj; /* * Walk the list of all anonymous UVM objects. Grab the first. */ mutex_enter(&uao_list_lock); if ((aobj = LIST_FIRST(&uao_list)) == NULL) { mutex_exit(&uao_list_lock); return false; } uao_reference(&aobj->u_obj); do { struct uvm_aobj *nextaobj; bool rv; /* * Prefetch the next object and immediately hold a reference * on it, so neither the current nor the next entry could * disappear while we are iterating. */ if ((nextaobj = LIST_NEXT(aobj, u_list)) != NULL) { uao_reference(&nextaobj->u_obj); } mutex_exit(&uao_list_lock); /* * Page in all pages in the swap slot range. */ rw_enter(aobj->u_obj.vmobjlock, RW_WRITER); rv = uao_pagein(aobj, startslot, endslot); rw_exit(aobj->u_obj.vmobjlock); /* Drop the reference of the current object. */ uao_detach(&aobj->u_obj); if (rv) { if (nextaobj) { uao_detach(&nextaobj->u_obj); } return rv; } aobj = nextaobj; mutex_enter(&uao_list_lock); } while (aobj); mutex_exit(&uao_list_lock); return false; } /* * page in any pages from aobj in the given range. * * => aobj must be locked and is returned locked. * => returns true if pagein was aborted due to lack of memory. */ static bool uao_pagein(struct uvm_aobj *aobj, int startslot, int endslot) { bool rv; if (UAO_USES_SWHASH(aobj)) { struct uao_swhash_elt *elt; int buck; restart: for (buck = aobj->u_swhashmask; buck >= 0; buck--) { for (elt = LIST_FIRST(&aobj->u_swhash[buck]); elt != NULL; elt = LIST_NEXT(elt, list)) { int i; for (i = 0; i < UAO_SWHASH_CLUSTER_SIZE; i++) { int slot = elt->slots[i]; /* * if the slot isn't in range, skip it. 
*/ if (slot < startslot || slot >= endslot) { continue; } /* * process the page, * the start over on this object * since the swhash elt * may have been freed. */ rv = uao_pagein_page(aobj, UAO_SWHASH_ELT_PAGEIDX_BASE(elt) + i); if (rv) { return rv; } goto restart; } } } } else { int i; for (i = 0; i < aobj->u_pages; i++) { int slot = aobj->u_swslots[i]; /* * if the slot isn't in range, skip it */ if (slot < startslot || slot >= endslot) { continue; } /* * process the page. */ rv = uao_pagein_page(aobj, i); if (rv) { return rv; } } } return false; } /* * uao_pagein_page: page in a single page from an anonymous UVM object. * * => Returns true if pagein was aborted due to lack of memory. * => Object must be locked and is returned locked. */ static bool uao_pagein_page(struct uvm_aobj *aobj, int pageidx) { struct uvm_object *uobj = &aobj->u_obj; struct vm_page *pg; int rv, npages; pg = NULL; npages = 1; KASSERT(rw_write_held(uobj->vmobjlock)); rv = uao_get(uobj, (voff_t)pageidx << PAGE_SHIFT, &pg, &npages, 0, VM_PROT_READ | VM_PROT_WRITE, 0, PGO_SYNCIO); /* * relock and finish up. */ rw_enter(uobj->vmobjlock, RW_WRITER); switch (rv) { case 0: break; case EIO: case ERESTART: /* * nothing more to do on errors. * ERESTART can only mean that the anon was freed, * so again there's nothing to do. */ return false; default: return true; } /* * ok, we've got the page now. * mark it as dirty, clear its swslot and un-busy it. */ uao_dropswap(&aobj->u_obj, pageidx); /* * make sure it's on a page queue. */ uvm_pagelock(pg); uvm_pageenqueue(pg); uvm_pagewakeup(pg); uvm_pageunlock(pg); pg->flags &= ~(PG_BUSY|PG_FAKE); uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); UVM_PAGE_OWN(pg, NULL); return false; } /* * uao_dropswap_range: drop swapslots in the range. * * => aobj must be locked and is returned locked. * => start is inclusive. end is exclusive. */ void uao_dropswap_range(struct uvm_object *uobj, voff_t start, voff_t end) { struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; int swpgonlydelta = 0; KASSERT(UVM_OBJ_IS_AOBJ(uobj)); KASSERT(rw_write_held(uobj->vmobjlock)); if (end == 0) { end = INT64_MAX; } if (UAO_USES_SWHASH(aobj)) { int i, hashbuckets = aobj->u_swhashmask + 1; voff_t taghi; voff_t taglo; taglo = UAO_SWHASH_ELT_TAG(start); taghi = UAO_SWHASH_ELT_TAG(end); for (i = 0; i < hashbuckets; i++) { struct uao_swhash_elt *elt, *next; for (elt = LIST_FIRST(&aobj->u_swhash[i]); elt != NULL; elt = next) { int startidx, endidx; int j; next = LIST_NEXT(elt, list); if (elt->tag < taglo || taghi < elt->tag) { continue; } if (elt->tag == taglo) { startidx = UAO_SWHASH_ELT_PAGESLOT_IDX(start); } else { startidx = 0; } if (elt->tag == taghi) { endidx = UAO_SWHASH_ELT_PAGESLOT_IDX(end); } else { endidx = UAO_SWHASH_CLUSTER_SIZE; } for (j = startidx; j < endidx; j++) { int slot = elt->slots[j]; KASSERT(uvm_pagelookup(&aobj->u_obj, (UAO_SWHASH_ELT_PAGEIDX_BASE(elt) + j) << PAGE_SHIFT) == NULL); if (slot > 0) { uvm_swap_free(slot, 1); swpgonlydelta++; KASSERT(elt->count > 0); elt->slots[j] = 0; elt->count--; } } if (elt->count == 0) { LIST_REMOVE(elt, list); pool_put(&uao_swhash_elt_pool, elt); } } } } else { int i; if (aobj->u_pages < end) { end = aobj->u_pages; } for (i = start; i < end; i++) { int slot = aobj->u_swslots[i]; if (slot > 0) { uvm_swap_free(slot, 1); swpgonlydelta++; } } } /* * adjust the counter of pages only in swap for all * the swap slots we've freed. 
*/ if (swpgonlydelta > 0) { KASSERT(uvmexp.swpgonly >= swpgonlydelta); atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta); } } #endif /* defined(VMSWAP) */
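/*
 * Example (editorial, not part of uvm_aobj.c): a minimal sketch of the
 * aobj lifecycle as seen by a consumer, using only the entry points
 * defined above.  The wrapper name is hypothetical.  uao_create() returns
 * the object holding one reference; uao_reference() and uao_detach()
 * adjust the count, and the final uao_detach() releases the pages and any
 * swap slots and frees the object itself via uao_free().
 */
static struct uvm_object *
example_anon_object(voff_t nbytes)
{
	struct uvm_object *uobj;

	/* Normal (non-kernel-object) use always passes flags == 0. */
	uobj = uao_create(nbytes, 0);

	/* A second holder would take its own reference... */
	uao_reference(uobj);
	/* ...and drop it again when it is done with the object. */
	uao_detach(uobj);

	return uobj;		/* the creation reference still belongs to us */
}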
/*	$NetBSD: if_loop.c,v 1.118 2022/09/04 23:34:51 thorpej Exp $	*/

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_loop.c 8.2 (Berkeley) 1/9/95 */ /* * Loopback interface driver for protocol testing and timing. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: if_loop.c,v 1.118 2022/09/04 23:34:51 thorpej Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_atalk.h" #include "opt_mbuftrace.h" #include "opt_mpls.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/time.h> #include <sys/device.h> #include <sys/module.h> #include <sys/cpu.h> #include <net/if.h> #include <net/if_types.h> #include <net/route.h> #ifdef INET #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/in_offload.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #endif #ifdef INET6 #ifndef INET #include <netinet/in.h> #endif #include <netinet6/in6_var.h> #include <netinet6/in6_offload.h> #include <netinet/ip6.h> #endif #ifdef MPLS #include <netmpls/mpls.h> #include <netmpls/mpls_var.h> #endif #ifdef NETATALK #include <netatalk/at.h> #include <netatalk/at_var.h> #endif #include <net/bpf.h> #if defined(LARGE_LOMTU) #define LOMTU (131072 + MHLEN + MLEN) #define LOMTU_MAX LOMTU #else #define LOMTU (32768 + MHLEN + MLEN) #define LOMTU_MAX (65536 + MHLEN + MLEN) #endif #ifdef ALTQ static void lostart(struct ifnet *); #endif static int loop_clone_create(struct if_clone *, int); static int loop_clone_destroy(struct ifnet *); static void loop_rtrequest(int, struct rtentry *, const struct rt_addrinfo *); static struct if_clone loop_cloner = IF_CLONE_INITIALIZER("lo", loop_clone_create, loop_clone_destroy); void loopattach(int n) { #ifndef _MODULE loop_clone_create(&loop_cloner, 0); /* lo0 always exists */ #endif } void loopinit(void) { if (lo0ifp != NULL) /* can happen in rump kernel */ return; #ifdef _MODULE loop_clone_create(&loop_cloner, 0); /* lo0 always exists */ #endif if_clone_attach(&loop_cloner); } static int loopdetach(void) { /* no detach for now; we don't allow lo0 to be deleted */ return EBUSY; } static int loop_clone_create(struct if_clone *ifc, int unit) { struct ifnet *ifp; ifp = if_alloc(IFT_LOOP); if_initname(ifp, ifc->ifc_name, unit); ifp->if_mtu = LOMTU; ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; #ifdef NET_MPSAFE ifp->if_extflags = IFEF_MPSAFE; #endif ifp->if_ioctl = loioctl; ifp->if_output = looutput; #ifdef ALTQ ifp->if_start = lostart; #endif ifp->if_type = IFT_LOOP; ifp->if_hdrlen = 0; ifp->if_addrlen = 0; ifp->if_dlt = DLT_NULL; IFQ_SET_READY(&ifp->if_snd); if (unit == 0) lo0ifp = ifp; if_initialize(ifp); ifp->if_link_state = LINK_STATE_UP; if_alloc_sadl(ifp); bpf_attach(ifp, DLT_NULL, sizeof(u_int)); #ifdef MBUFTRACE ifp->if_mowner = malloc(sizeof(struct mowner), M_DEVBUF, M_WAITOK | M_ZERO); strlcpy(ifp->if_mowner->mo_name, ifp->if_xname, sizeof(ifp->if_mowner->mo_name)); MOWNER_ATTACH(ifp->if_mowner); #endif ifp->if_flags |= IFF_RUNNING; if_register(ifp); return (0); } static int loop_clone_destroy(struct ifnet *ifp) { if (ifp == lo0ifp) return (EPERM); ifp->if_flags &= ~IFF_RUNNING; #ifdef MBUFTRACE MOWNER_DETACH(ifp->if_mowner); free(ifp->if_mowner, M_DEVBUF); #endif bpf_detach(ifp); if_detach(ifp); if_free(ifp); return (0); } int looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, const struct rtentry *rt) { pktqueue_t *pktq = NULL; int s; int csum_flags; int error = 0; size_t pktlen; MCLAIM(m, ifp->if_mowner); KERNEL_LOCK_UNLESS_NET_MPSAFE(); if ((m->m_flags & M_PKTHDR) == 0) panic("looutput: no header mbuf"); if (ifp->if_flags & IFF_LOOPBACK) bpf_mtap_af(ifp, 
dst->sa_family, m, BPF_D_OUT); m_set_rcvif(m, ifp); if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m); error = (rt->rt_flags & RTF_BLACKHOLE ? 0 : rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); goto out; } pktlen = m->m_pkthdr.len; if_statadd2(ifp, if_opackets, 1, if_obytes, pktlen); #ifdef ALTQ /* * ALTQ on the loopback interface is just for debugging. It's * used only for loopback interfaces, not for a simplex interface. */ if ((ALTQ_IS_ENABLED(&ifp->if_snd) || TBR_IS_ENABLED(&ifp->if_snd)) && ifp->if_start == lostart) { /* * If the queueing discipline needs packet classification, * do it before prepending the link headers. */ IFQ_CLASSIFY(&ifp->if_snd, m, dst->sa_family); M_PREPEND(m, sizeof(uint32_t), M_DONTWAIT); if (m == NULL) { if_statinc(ifp, if_oerrors); error = ENOBUFS; goto out; } *(mtod(m, uint32_t *)) = dst->sa_family; error = if_transmit_lock(ifp, m); goto out; } #endif /* ALTQ */ m_tag_delete_chain(m); #ifdef MPLS bool is_mpls = false; if (rt != NULL && rt_gettag(rt) != NULL && rt_gettag(rt)->sa_family == AF_MPLS && (m->m_flags & (M_MCAST | M_BCAST)) == 0) { union mpls_shim msh; msh.s_addr = MPLS_GETSADDR(rt); if (msh.shim.label != MPLS_LABEL_IMPLNULL) { is_mpls = true; pktq = mpls_pktq; } } if (!is_mpls) #endif switch (dst->sa_family) { #ifdef INET case AF_INET: csum_flags = m->m_pkthdr.csum_flags; KASSERT((csum_flags & ~(M_CSUM_IPv4|M_CSUM_UDPv4)) == 0); if (csum_flags != 0 && IN_LOOPBACK_NEED_CHECKSUM(csum_flags)) { in_undefer_cksum(m, 0, csum_flags); m->m_pkthdr.csum_flags = 0; } else { /* * Do nothing. Pass M_CSUM_IPv4 and M_CSUM_UDPv4 as * they are to tell those are calculated and good. */ } pktq = ip_pktq; break; #endif #ifdef INET6 case AF_INET6: csum_flags = m->m_pkthdr.csum_flags; KASSERT((csum_flags & ~M_CSUM_UDPv6) == 0); if (csum_flags != 0 && IN6_LOOPBACK_NEED_CHECKSUM(csum_flags)) { in6_undefer_cksum(m, 0, csum_flags); m->m_pkthdr.csum_flags = 0; } else { /* * Do nothing. Pass M_CSUM_UDPv6 as * they are to tell those are calculated and good. */ } m->m_flags |= M_LOOP; pktq = ip6_pktq; break; #endif #ifdef NETATALK case AF_APPLETALK: pktq = at_pktq2; break; #endif default: printf("%s: can't handle af%d\n", ifp->if_xname, dst->sa_family); m_freem(m); error = EAFNOSUPPORT; goto out; } KASSERT(pktq != NULL); error = 0; s = splnet(); if (__predict_true(pktq_enqueue(pktq, m, 0))) { if_statadd2(ifp, if_ipackets, 1, if_ibytes, pktlen); } else { m_freem(m); if_statinc(ifp, if_oerrors); error = ENOBUFS; } splx(s); out: KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); return error; } #ifdef ALTQ static void lostart(struct ifnet *ifp) { for (;;) { pktqueue_t *pktq = NULL; struct mbuf *m; size_t pktlen; uint32_t af; int s; IFQ_DEQUEUE(&ifp->if_snd, m); if (m == NULL) return; af = *(mtod(m, uint32_t *)); m_adj(m, sizeof(uint32_t)); switch (af) { #ifdef INET case AF_INET: pktq = ip_pktq; break; #endif #ifdef INET6 case AF_INET6: m->m_flags |= M_LOOP; pktq = ip6_pktq; break; #endif #ifdef NETATALK case AF_APPLETALK: pktq = at_pktq2; break; #endif default: printf("%s: can't handle af%d\n", ifp->if_xname, af); m_freem(m); return; } pktlen = m->m_pkthdr.len; KASSERT(pktq != NULL); s = splnet(); if (__predict_false(pktq_enqueue(pktq, m, 0))) { m_freem(m); splx(s); return; } if_statadd2(ifp, if_ipackets, 1, if_ibytes, pktlen); splx(s); } } #endif /* ALTQ */ /* ARGSUSED */ static void loop_rtrequest(int cmd, struct rtentry *rt, const struct rt_addrinfo *info) { if (rt) rt->rt_rmx.rmx_mtu = lo0ifp->if_mtu; } /* * Process an ioctl request. 
*/ /* ARGSUSED */ int loioctl(struct ifnet *ifp, u_long cmd, void *data) { struct ifaddr *ifa; struct ifreq *ifr = data; int error = 0; switch (cmd) { case SIOCINITIFADDR: ifp->if_flags |= IFF_UP; ifa = (struct ifaddr *)data; if (ifa != NULL) ifa->ifa_rtrequest = loop_rtrequest; /* * Everything else is done at a higher level. */ break; case SIOCSIFMTU: if ((unsigned)ifr->ifr_mtu > LOMTU_MAX) error = EINVAL; else if ((error = ifioctl_common(ifp, cmd, data)) == ENETRESET){ error = 0; } break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == NULL) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifreq_getaddr(cmd, ifr)->sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; default: error = ifioctl_common(ifp, cmd, data); } return (error); } /* * Module infrastructure */ #include "if_module.h" IF_MODULE(MODULE_CLASS_DRIVER, loop, NULL)
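/*
 * Example (editorial, userland perspective, not part of if_loop.c): a
 * minimal sketch of the ioctl path that lands in loioctl() above.
 * SIOCSIFMTU carries a struct ifreq; loioctl() returns EINVAL for values
 * above LOMTU_MAX.  The interface name and error handling are kept
 * deliberately simple.
 */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

static void
example_set_lo0_mtu(int mtu)
{
	struct ifreq ifr;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
	ifr.ifr_mtu = mtu;
	if (ioctl(s, SIOCSIFMTU, &ifr) == -1)
		err(1, "SIOCSIFMTU");	/* EINVAL if mtu > LOMTU_MAX */
	close(s);
}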
/* $NetBSD: ccd.c,v 1.191 2024/04/12 05:04:02 pgoyette Exp $ */ /*- * Copyright (c) 1996, 1997, 1998, 1999, 2007, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: cd.c 1.6 90/11/28$ * * @(#)cd.c 8.2 (Berkeley) 11/16/93 */ /* * "Concatenated" disk driver. * * Notes on concurrency: * * => sc_dvlock serializes access to the device nodes, excluding block I/O. * * => sc_iolock serializes access to (sc_flags & CCDF_INITED), disk stats, * sc_stop, sc_bufq and b_resid from master buffers. * * => a combination of CCDF_INITED, sc_inflight, and sc_iolock is used to * serialize I/O and configuration changes. * * => the in-core disk label does not change while the device is open. * * On memory consumption: ccd fans out I/O requests and so needs to * allocate memory. If the system is desperately low on memory, we * single thread I/O. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ccd.c,v 1.191 2024/04/12 05:04:02 pgoyette Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/errno.h> #include <sys/buf.h> #include <sys/kmem.h> #include <sys/pool.h> #include <sys/module.h> #include <sys/namei.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/disklabel.h> #include <sys/device.h> #include <sys/disk.h> #include <sys/syslog.h> #include <sys/fcntl.h> #include <sys/vnode.h> #include <sys/conf.h> #include <sys/mutex.h> #include <sys/queue.h> #include <sys/kauth.h> #include <sys/kthread.h> #include <sys/bufq.h> #include <sys/sysctl.h> #include <sys/compat_stub.h> #include <uvm/uvm_extern.h> #include <dev/ccdvar.h> #include <dev/dkvar.h> #include <miscfs/specfs/specdev.h> /* for v_rdev */ #include "ioconf.h" #if defined(CCDDEBUG) && !defined(DEBUG) #define DEBUG #endif #ifdef DEBUG #define CCDB_FOLLOW 0x01 #define CCDB_INIT 0x02 #define CCDB_IO 0x04 #define CCDB_LABEL 0x08 #define CCDB_VNODE 0x10 int ccddebug = 0x00; #endif #define ccdunit(x) DISKUNIT(x) struct ccdbuf { struct buf cb_buf; /* new I/O buf */ struct buf *cb_obp; /* ptr. 
to original I/O buf */ struct ccd_softc *cb_sc; /* pointer to ccd softc */ int cb_comp; /* target component */ SIMPLEQ_ENTRY(ccdbuf) cb_q; /* fifo of component buffers */ }; /* component buffer pool */ static pool_cache_t ccd_cache; #define CCD_GETBUF(wait) pool_cache_get(ccd_cache, wait) #define CCD_PUTBUF(cbp) pool_cache_put(ccd_cache, cbp) #define CCDLABELDEV(dev) \ (MAKEDISKDEV(major((dev)), ccdunit((dev)), RAW_PART)) /* called by main() at boot time */ void ccddetach(void); /* called by biodone() at interrupt time */ static void ccdiodone(struct buf *); static void ccdinterleave(struct ccd_softc *); static int ccdinit(struct ccd_softc *, char **, struct vnode **, struct lwp *); static struct ccdbuf *ccdbuffer(struct ccd_softc *, struct buf *, daddr_t, void *, long, int); static void ccdgetdefaultlabel(struct ccd_softc *, struct disklabel *); static void ccdgetdisklabel(dev_t); static void ccdmakedisklabel(struct ccd_softc *); static int ccdstart(struct ccd_softc *, struct buf *, int); static void ccdthread(void *); static dev_type_open(ccdopen); static dev_type_close(ccdclose); static dev_type_read(ccdread); static dev_type_write(ccdwrite); static dev_type_ioctl(ccdioctl); static dev_type_strategy(ccdstrategy); static dev_type_size(ccdsize); const struct bdevsw ccd_bdevsw = { .d_open = ccdopen, .d_close = ccdclose, .d_strategy = ccdstrategy, .d_ioctl = ccdioctl, .d_dump = nodump, .d_psize = ccdsize, .d_discard = nodiscard, .d_flag = D_DISK | D_MPSAFE }; const struct cdevsw ccd_cdevsw = { .d_open = ccdopen, .d_close = ccdclose, .d_read = ccdread, .d_write = ccdwrite, .d_ioctl = ccdioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_DISK | D_MPSAFE }; static const struct dkdriver ccddkdriver = { .d_strategy = ccdstrategy, .d_minphys = minphys }; #ifdef DEBUG static void printiinfo(struct ccdiinfo *); #endif static LIST_HEAD(, ccd_softc) ccds = LIST_HEAD_INITIALIZER(ccds); static kmutex_t ccd_lock; SYSCTL_SETUP_PROTO(sysctl_kern_ccd_setup); static struct ccd_softc * ccdcreate(int unit) { struct ccd_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP); /* Initialize per-softc structures. */ snprintf(sc->sc_xname, sizeof(sc->sc_xname), "ccd%d", unit); sc->sc_unit = unit; mutex_init(&sc->sc_dvlock, MUTEX_DEFAULT, IPL_NONE); sc->sc_iolock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); cv_init(&sc->sc_stop, "ccdstop"); cv_init(&sc->sc_push, "ccdthr"); disk_init(&sc->sc_dkdev, sc->sc_xname, &ccddkdriver); return sc; } static void ccddestroy(struct ccd_softc *sc) { mutex_obj_free(sc->sc_iolock); mutex_exit(&sc->sc_dvlock); mutex_destroy(&sc->sc_dvlock); cv_destroy(&sc->sc_stop); cv_destroy(&sc->sc_push); disk_destroy(&sc->sc_dkdev); kmem_free(sc, sizeof(*sc)); } static struct ccd_softc * ccdget(int unit, int make) { struct ccd_softc *sc; if (unit < 0) { #ifdef DIAGNOSTIC panic("%s: unit %d!", __func__, unit); #endif return NULL; } mutex_enter(&ccd_lock); LIST_FOREACH(sc, &ccds, sc_link) { if (sc->sc_unit == unit) { mutex_exit(&ccd_lock); return sc; } } mutex_exit(&ccd_lock); if (!make) return NULL; if ((sc = ccdcreate(unit)) == NULL) return NULL; mutex_enter(&ccd_lock); LIST_INSERT_HEAD(&ccds, sc, sc_link); mutex_exit(&ccd_lock); return sc; } static void ccdput(struct ccd_softc *sc) { mutex_enter(&ccd_lock); LIST_REMOVE(sc, sc_link); mutex_exit(&ccd_lock); ccddestroy(sc); } /* * Called by main() during pseudo-device attachment. All we need * to do is allocate enough space for devices to be configured later. 
*/ void ccdattach(int num) { mutex_init(&ccd_lock, MUTEX_DEFAULT, IPL_NONE); /* Initialize the component buffer pool. */ ccd_cache = pool_cache_init(sizeof(struct ccdbuf), 0, 0, 0, "ccdbuf", NULL, IPL_BIO, NULL, NULL, NULL); } void ccddetach(void) { pool_cache_destroy(ccd_cache); mutex_destroy(&ccd_lock); } static int ccdinit(struct ccd_softc *cs, char **cpaths, struct vnode **vpp, struct lwp *l) { struct ccdcinfo *ci = NULL; int ix; struct ccdgeom *ccg = &cs->sc_geom; char *tmppath; int error, path_alloced; uint64_t psize, minsize; unsigned secsize, maxsecsize; struct disk_geom *dg; #ifdef DEBUG if (ccddebug & (CCDB_FOLLOW|CCDB_INIT)) printf("%s: ccdinit\n", cs->sc_xname); #endif /* Allocate space for the component info. */ cs->sc_cinfo = kmem_alloc(cs->sc_nccdisks * sizeof(*cs->sc_cinfo), KM_SLEEP); tmppath = kmem_alloc(MAXPATHLEN, KM_SLEEP); cs->sc_size = 0; /* * Verify that each component piece exists and record * relevant information about it. */ maxsecsize = 0; minsize = 0; for (ix = 0, path_alloced = 0; ix < cs->sc_nccdisks; ix++) { ci = &cs->sc_cinfo[ix]; ci->ci_vp = vpp[ix]; /* * Copy in the pathname of the component. */ memset(tmppath, 0, MAXPATHLEN); /* sanity */ error = copyinstr(cpaths[ix], tmppath, MAXPATHLEN, &ci->ci_pathlen); if (ci->ci_pathlen == 0) error = EINVAL; if (error) { #ifdef DEBUG if (ccddebug & (CCDB_FOLLOW|CCDB_INIT)) printf("%s: can't copy path, error = %d\n", cs->sc_xname, error); #endif goto out; } ci->ci_path = kmem_alloc(ci->ci_pathlen, KM_SLEEP); memcpy(ci->ci_path, tmppath, ci->ci_pathlen); path_alloced++; /* * XXX: Cache the component's dev_t. */ ci->ci_dev = vpp[ix]->v_rdev; /* * Get partition information for the component. */ error = getdisksize(vpp[ix], &psize, &secsize); if (error) { #ifdef DEBUG if (ccddebug & (CCDB_FOLLOW|CCDB_INIT)) printf("%s: %s: disksize failed, error = %d\n", cs->sc_xname, ci->ci_path, error); #endif goto out; } /* * Calculate the size, truncating to an interleave * boundary if necessary. */ maxsecsize = secsize > maxsecsize ? secsize : maxsecsize; if (cs->sc_ileave > 1) psize -= psize % cs->sc_ileave; if (psize == 0) { #ifdef DEBUG if (ccddebug & (CCDB_FOLLOW|CCDB_INIT)) printf("%s: %s: size == 0\n", cs->sc_xname, ci->ci_path); #endif error = ENODEV; goto out; } if (minsize == 0 || psize < minsize) minsize = psize; ci->ci_size = psize; cs->sc_size += psize; } /* * Don't allow the interleave to be smaller than * the biggest component sector. */ if ((cs->sc_ileave > 0) && (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) { #ifdef DEBUG if (ccddebug & (CCDB_FOLLOW|CCDB_INIT)) printf("%s: interleave must be at least %d\n", cs->sc_xname, (maxsecsize / DEV_BSIZE)); #endif error = EINVAL; goto out; } /* * If uniform interleave is desired set all sizes to that of * the smallest component. */ if (cs->sc_flags & CCDF_UNIFORM) { for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) ci->ci_size = minsize; cs->sc_size = cs->sc_nccdisks * minsize; } /* * Construct the interleave table. */ ccdinterleave(cs); /* * Create pseudo-geometry based on 1MB cylinders. It's * pretty close. 
*/ ccg->ccg_secsize = DEV_BSIZE; ccg->ccg_ntracks = 1; ccg->ccg_nsectors = 1024 * (1024 / ccg->ccg_secsize); ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors; dg = &cs->sc_dkdev.dk_geom; memset(dg, 0, sizeof(*dg)); dg->dg_secperunit = cs->sc_size; dg->dg_secsize = ccg->ccg_secsize; dg->dg_nsectors = ccg->ccg_nsectors; dg->dg_ntracks = ccg->ccg_ntracks; dg->dg_ncylinders = ccg->ccg_ncylinders; if (cs->sc_ileave > 0) aprint_normal("%s: Interleaving %d component%s " "(%d block interleave)\n", cs->sc_xname, cs->sc_nccdisks, (cs->sc_nccdisks != 0 ? "s" : ""), cs->sc_ileave); else aprint_normal("%s: Concatenating %d component%s\n", cs->sc_xname, cs->sc_nccdisks, (cs->sc_nccdisks != 0 ? "s" : "")); for (ix = 0; ix < cs->sc_nccdisks; ix++) { ci = &cs->sc_cinfo[ix]; aprint_normal("%s: %s (%ju blocks)\n", cs->sc_xname, ci->ci_path, (uintmax_t)ci->ci_size); } aprint_normal("%s: total %ju blocks\n", cs->sc_xname, cs->sc_size); /* * Create thread to handle deferred I/O. */ cs->sc_zap = false; error = kthread_create(PRI_BIO, KTHREAD_MPSAFE, NULL, ccdthread, cs, &cs->sc_thread, "%s", cs->sc_xname); if (error) { printf("ccdinit: can't create thread: %d\n", error); goto out; } /* * Only now that everything is set up can we enable the device. */ mutex_enter(cs->sc_iolock); cs->sc_flags |= CCDF_INITED; mutex_exit(cs->sc_iolock); kmem_free(tmppath, MAXPATHLEN); return (0); out: for (ix = 0; ix < path_alloced; ix++) { kmem_free(cs->sc_cinfo[ix].ci_path, cs->sc_cinfo[ix].ci_pathlen); } kmem_free(cs->sc_cinfo, cs->sc_nccdisks * sizeof(struct ccdcinfo)); kmem_free(tmppath, MAXPATHLEN); return (error); } static void ccdinterleave(struct ccd_softc *cs) { struct ccdcinfo *ci, *smallci; struct ccdiinfo *ii; daddr_t bn, lbn; int ix; u_long size; #ifdef DEBUG if (ccddebug & CCDB_INIT) printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave); #endif /* * Allocate an interleave table. * Chances are this is too big, but we don't care. */ size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo); cs->sc_itable = kmem_zalloc(size, KM_SLEEP); /* * Trivial case: no interleave (actually interleave of disk size). * Each table entry represents a single component in its entirety. */ if (cs->sc_ileave == 0) { bn = 0; ii = cs->sc_itable; for (ix = 0; ix < cs->sc_nccdisks; ix++) { /* Allocate space for ii_index. */ ii->ii_indexsz = sizeof(int); ii->ii_index = kmem_alloc(ii->ii_indexsz, KM_SLEEP); ii->ii_ndisk = 1; ii->ii_startblk = bn; ii->ii_startoff = 0; ii->ii_index[0] = ix; bn += cs->sc_cinfo[ix].ci_size; ii++; } ii->ii_ndisk = 0; #ifdef DEBUG if (ccddebug & CCDB_INIT) printiinfo(cs->sc_itable); #endif return; } /* * The following isn't fast or pretty; it doesn't have to be. */ size = 0; bn = lbn = 0; for (ii = cs->sc_itable; ; ii++) { /* Allocate space for ii_index. */ ii->ii_indexsz = sizeof(int) * cs->sc_nccdisks; ii->ii_index = kmem_alloc(ii->ii_indexsz, KM_SLEEP); /* * Locate the smallest of the remaining components */ smallci = NULL; for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) if (ci->ci_size > size && (smallci == NULL || ci->ci_size < smallci->ci_size)) smallci = ci; /* * Nobody left, all done */ if (smallci == NULL) { ii->ii_ndisk = 0; break; } /* * Record starting logical block and component offset */ ii->ii_startblk = bn / cs->sc_ileave; ii->ii_startoff = lbn; /* * Determine how many disks take part in this interleave * and record their indices. 
*/ ix = 0; for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) if (ci->ci_size >= smallci->ci_size) ii->ii_index[ix++] = ci - cs->sc_cinfo; ii->ii_ndisk = ix; bn += ix * (smallci->ci_size - size); lbn = smallci->ci_size / cs->sc_ileave; size = smallci->ci_size; } #ifdef DEBUG if (ccddebug & CCDB_INIT) printiinfo(cs->sc_itable); #endif } /* ARGSUSED */ static int ccdopen(dev_t dev, int flags, int fmt, struct lwp *l) { int unit = ccdunit(dev); struct ccd_softc *cs; struct disklabel *lp; int error = 0, part, pmask; #ifdef DEBUG if (ccddebug & CCDB_FOLLOW) printf("ccdopen(0x%"PRIx64", 0x%x)\n", dev, flags); #endif if ((cs = ccdget(unit, 1)) == NULL) return ENXIO; mutex_enter(&cs->sc_dvlock); lp = cs->sc_dkdev.dk_label; part = DISKPART(dev); pmask = (1 << part); /* * If we're initialized, check to see if there are any other * open partitions. If not, then it's safe to update * the in-core disklabel. Only read the disklabel if it is * not already valid. */ if ((cs->sc_flags & (CCDF_INITED|CCDF_VLABEL)) == CCDF_INITED && cs->sc_dkdev.dk_openmask == 0) ccdgetdisklabel(dev); /* Check that the partition exists. */ if (part != RAW_PART) { if (((cs->sc_flags & CCDF_INITED) == 0) || ((part >= lp->d_npartitions) || (lp->d_partitions[part].p_fstype == FS_UNUSED))) { error = ENXIO; goto done; } } /* Prevent our unit from being unconfigured while open. */ switch (fmt) { case S_IFCHR: cs->sc_dkdev.dk_copenmask |= pmask; break; case S_IFBLK: cs->sc_dkdev.dk_bopenmask |= pmask; break; } cs->sc_dkdev.dk_openmask = cs->sc_dkdev.dk_copenmask | cs->sc_dkdev.dk_bopenmask; done: mutex_exit(&cs->sc_dvlock); return (error); } /* ARGSUSED */ static int ccdclose(dev_t dev, int flags, int fmt, struct lwp *l) { int unit = ccdunit(dev); struct ccd_softc *cs; int part; #ifdef DEBUG if (ccddebug & CCDB_FOLLOW) printf("ccdclose(0x%"PRIx64", 0x%x)\n", dev, flags); #endif if ((cs = ccdget(unit, 0)) == NULL) return ENXIO; mutex_enter(&cs->sc_dvlock); part = DISKPART(dev); /* ...that much closer to allowing unconfiguration... */ switch (fmt) { case S_IFCHR: cs->sc_dkdev.dk_copenmask &= ~(1 << part); break; case S_IFBLK: cs->sc_dkdev.dk_bopenmask &= ~(1 << part); break; } cs->sc_dkdev.dk_openmask = cs->sc_dkdev.dk_copenmask | cs->sc_dkdev.dk_bopenmask; if (cs->sc_dkdev.dk_openmask == 0) { if ((cs->sc_flags & CCDF_KLABEL) == 0) cs->sc_flags &= ~CCDF_VLABEL; } mutex_exit(&cs->sc_dvlock); return (0); } static void ccdthread(void *cookie) { int error; struct ccd_softc *cs; struct buf *bp; cs = cookie; #ifdef DEBUG if (ccddebug & CCDB_FOLLOW) printf("ccdthread: hello\n"); #endif mutex_enter(cs->sc_iolock); while (__predict_true(!cs->sc_zap)) { bp = bufq_get(cs->sc_bufq); if (bp == NULL) { /* Nothing to do. */ cv_wait(&cs->sc_push, cs->sc_iolock); continue; } #ifdef DEBUG if (ccddebug & CCDB_FOLLOW) printf("ccdthread: dispatching I/O\n"); #endif error = ccdstart(cs, bp, PR_WAITOK); KASSERT(error == 0); mutex_enter(cs->sc_iolock); } cs->sc_thread = NULL; mutex_exit(cs->sc_iolock); #ifdef DEBUG if (ccddebug & CCDB_FOLLOW) printf("ccdthread: goodbye\n"); #endif kthread_exit(0); } static void ccdstrategy(struct buf *bp) { int unit = ccdunit(bp->b_dev); struct ccd_softc *cs; if ((cs = ccdget(unit, 0)) == NULL) return; /* Must be open or reading label. */ KASSERT(cs->sc_dkdev.dk_openmask != 0 || (cs->sc_flags & CCDF_RLABEL) != 0); mutex_enter(cs->sc_iolock); /* Synchronize with device init/uninit. 
*/ if (__predict_false((cs->sc_flags & CCDF_INITED) == 0)) { mutex_exit(cs->sc_iolock); #ifdef DEBUG if (ccddebug & CCDB_FOLLOW) printf("ccdstrategy: unit %d: not inited\n", unit); #endif bp->b_error = ENXIO; bp->b_resid = bp->b_bcount; biodone(bp); return; } if (ccdstart(cs, bp, PR_NOWAIT) != 0) { /* Defer to thread if system is low on memory. */ bufq_put(cs->sc_bufq, bp); cv_broadcast(&cs->sc_push); mutex_exit(cs->sc_iolock); } } static int ccdstart(struct ccd_softc *cs, struct buf *bp, int wait) { daddr_t blkno; int wlabel; struct disklabel *lp; long bcount, rcount; struct ccdbuf *cbp; char *addr; daddr_t bn; vnode_t *vp; SIMPLEQ_HEAD(, ccdbuf) cbufq; KASSERT(mutex_owned(cs->sc_iolock)); KASSERT(bp != NULL); disk_busy(&cs->sc_dkdev); #ifdef DEBUG if (ccddebug & CCDB_FOLLOW) printf("ccdstart(%s, %p)\n", cs->sc_xname, bp); #endif /* If it's a nil transfer, wake up the top half now. */ if (bp->b_bcount == 0) goto done; lp = cs->sc_dkdev.dk_label; /* * Do bounds checking and adjust transfer. If there's an * error, the bounds check will flag that for us. Convert * the partition relative block number to an absolute. */ blkno = bp->b_blkno; wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING); if (DISKPART(bp->b_dev) != RAW_PART) { if (bounds_check_with_label(&cs->sc_dkdev, bp, wlabel) <= 0) goto done; blkno += lp->d_partitions[DISKPART(bp->b_dev)].p_offset; } mutex_exit(cs->sc_iolock); bp->b_rawblkno = blkno; /* Allocate the component buffers. */ SIMPLEQ_INIT(&cbufq); bp->b_resid = bp->b_bcount; bn = bp->b_rawblkno; addr = bp->b_data; for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) { cbp = ccdbuffer(cs, bp, bn, addr, bcount, wait); KASSERT(cbp != NULL || wait == PR_NOWAIT); if (cbp == NULL) { while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) { SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q); CCD_PUTBUF(cbp); } mutex_enter(cs->sc_iolock); disk_unbusy(&cs->sc_dkdev, 0, 0); return ENOMEM; } SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q); rcount = cbp->cb_buf.b_bcount; bn += btodb(rcount); addr += rcount; } /* All buffers set up, now fire off the requests. */ while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) { SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q); vp = cbp->cb_buf.b_vp; if ((cbp->cb_buf.b_flags & B_READ) == 0) { mutex_enter(vp->v_interlock); vp->v_numoutput++; mutex_exit(vp->v_interlock); } (void)VOP_STRATEGY(vp, &cbp->cb_buf); } return 0; done: disk_unbusy(&cs->sc_dkdev, 0, 0); cv_broadcast(&cs->sc_stop); cv_broadcast(&cs->sc_push); mutex_exit(cs->sc_iolock); bp->b_resid = bp->b_bcount; biodone(bp); return 0; } /* * Build a component buffer header. */ static struct ccdbuf * ccdbuffer(struct ccd_softc *cs, struct buf *bp, daddr_t bn, void *addr, long bcount, int wait) { struct ccdcinfo *ci; struct ccdbuf *cbp; daddr_t cbn, cboff; u_int64_t cbc; int ccdisk; #ifdef DEBUG if (ccddebug & CCDB_IO) printf("ccdbuffer(%p, %p, %" PRId64 ", %p, %ld)\n", cs, bp, bn, addr, bcount); #endif /* * Determine which component bn falls in. 
*/ cbn = bn; cboff = 0; /* * Serially concatenated */ if (cs->sc_ileave == 0) { daddr_t sblk; sblk = 0; for (ccdisk = 0, ci = &cs->sc_cinfo[ccdisk]; cbn >= sblk + ci->ci_size; ccdisk++, ci = &cs->sc_cinfo[ccdisk]) sblk += ci->ci_size; cbn -= sblk; } /* * Interleaved */ else { struct ccdiinfo *ii; int off; cboff = cbn % cs->sc_ileave; cbn /= cs->sc_ileave; for (ii = cs->sc_itable; ii->ii_ndisk; ii++) if (ii->ii_startblk > cbn) break; ii--; off = cbn - ii->ii_startblk; if (ii->ii_ndisk == 1) { ccdisk = ii->ii_index[0]; cbn = ii->ii_startoff + off; } else { ccdisk = ii->ii_index[off % ii->ii_ndisk]; cbn = ii->ii_startoff + off / ii->ii_ndisk; } cbn *= cs->sc_ileave; ci = &cs->sc_cinfo[ccdisk]; } /* * Fill in the component buf structure. */ cbp = CCD_GETBUF(wait); if (cbp == NULL) return NULL; buf_init(&cbp->cb_buf); cbp->cb_buf.b_flags = bp->b_flags; cbp->cb_buf.b_oflags = bp->b_oflags; cbp->cb_buf.b_cflags = bp->b_cflags; cbp->cb_buf.b_iodone = ccdiodone; cbp->cb_buf.b_proc = bp->b_proc; cbp->cb_buf.b_dev = ci->ci_dev; cbp->cb_buf.b_blkno = cbn + cboff; cbp->cb_buf.b_data = addr; cbp->cb_buf.b_vp = ci->ci_vp; cbp->cb_buf.b_objlock = ci->ci_vp->v_interlock; if (cs->sc_ileave == 0) cbc = dbtob((u_int64_t)(ci->ci_size - cbn)); else cbc = dbtob((u_int64_t)(cs->sc_ileave - cboff)); cbp->cb_buf.b_bcount = cbc < bcount ? cbc : bcount; /* * context for ccdiodone */ cbp->cb_obp = bp; cbp->cb_sc = cs; cbp->cb_comp = ccdisk; BIO_COPYPRIO(&cbp->cb_buf, bp); #ifdef DEBUG if (ccddebug & CCDB_IO) printf(" dev 0x%"PRIx64"(u%lu): cbp %p bn %" PRId64 " addr %p" " bcnt %d\n", ci->ci_dev, (unsigned long) (ci-cs->sc_cinfo), cbp, cbp->cb_buf.b_blkno, cbp->cb_buf.b_data, cbp->cb_buf.b_bcount); #endif return (cbp); } /* * Called at interrupt time. * Mark the component as done and if all components are done, * take a ccd interrupt. */ static void ccdiodone(struct buf *vbp) { struct ccdbuf *cbp = (struct ccdbuf *) vbp; struct buf *bp = cbp->cb_obp; struct ccd_softc *cs = cbp->cb_sc; int count; #ifdef DEBUG if (ccddebug & CCDB_FOLLOW) printf("ccdiodone(%p)\n", cbp); if (ccddebug & CCDB_IO) { printf("ccdiodone: bp %p bcount %d resid %d\n", bp, bp->b_bcount, bp->b_resid); printf(" dev 0x%"PRIx64"(u%d), cbp %p bn %" PRId64 " addr %p" " bcnt %d\n", cbp->cb_buf.b_dev, cbp->cb_comp, cbp, cbp->cb_buf.b_blkno, cbp->cb_buf.b_data, cbp->cb_buf.b_bcount); } #endif if (cbp->cb_buf.b_error != 0) { bp->b_error = cbp->cb_buf.b_error; printf("%s: error %d on component %d\n", cs->sc_xname, bp->b_error, cbp->cb_comp); } count = cbp->cb_buf.b_bcount; buf_destroy(&cbp->cb_buf); CCD_PUTBUF(cbp); /* * If all done, "interrupt". */ mutex_enter(cs->sc_iolock); bp->b_resid -= count; if (bp->b_resid < 0) panic("ccdiodone: count"); if (bp->b_resid == 0) { /* * Request is done for better or worse, wakeup the top half. */ if (bp->b_error != 0) bp->b_resid = bp->b_bcount; disk_unbusy(&cs->sc_dkdev, (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ)); if (!disk_isbusy(&cs->sc_dkdev)) { if (bufq_peek(cs->sc_bufq) != NULL) { cv_broadcast(&cs->sc_push); } cv_broadcast(&cs->sc_stop); } mutex_exit(cs->sc_iolock); biodone(bp); } else mutex_exit(cs->sc_iolock); } /* ARGSUSED */ static int ccdread(dev_t dev, struct uio *uio, int flags) { int unit = ccdunit(dev); struct ccd_softc *cs; #ifdef DEBUG if (ccddebug & CCDB_FOLLOW) printf("ccdread(0x%"PRIx64", %p)\n", dev, uio); #endif if ((cs = ccdget(unit, 0)) == NULL) return 0; /* Unlocked advisory check, ccdstrategy check is synchronous. 
*/ if ((cs->sc_flags & CCDF_INITED) == 0) return (ENXIO); return (physio(ccdstrategy, NULL, dev, B_READ, minphys, uio)); } /* ARGSUSED */ static int ccdwrite(dev_t dev, struct uio *uio, int flags) { int unit = ccdunit(dev); struct ccd_softc *cs; #ifdef DEBUG if (ccddebug & CCDB_FOLLOW) printf("ccdwrite(0x%"PRIx64", %p)\n", dev, uio); #endif if ((cs = ccdget(unit, 0)) == NULL) return ENOENT; /* Unlocked advisory check, ccdstrategy check is synchronous. */ if ((cs->sc_flags & CCDF_INITED) == 0) return (ENXIO); return (physio(ccdstrategy, NULL, dev, B_WRITE, minphys, uio)); } int (*compat_ccd_ioctl_60)(dev_t, u_long, void *, int, struct lwp *, int (*)(dev_t, u_long, void *, int, struct lwp *)) = (void *)enosys; static int ccdioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { int unit = ccdunit(dev); int i, j, lookedup = 0, error = 0; int part, pmask, make, hook; struct ccd_softc *cs; struct ccd_ioctl *ccio = (struct ccd_ioctl *)data; kauth_cred_t uc; char **cpp; struct pathbuf *pb; struct vnode **vpp; #ifdef __HAVE_OLD_DISKLABEL struct disklabel newlabel; #endif switch (cmd) { case CCDIOCSET: make = 1; break; default: MODULE_HOOK_CALL(ccd_ioctl_60_hook, (0, cmd, NULL, 0, NULL, NULL), enosys(), hook); if (hook == 0) make = 1; else make = 0; break; } if ((cs = ccdget(unit, make)) == NULL) return ENOENT; uc = kauth_cred_get(); MODULE_HOOK_CALL(ccd_ioctl_60_hook, (dev, cmd, data, flag, l, ccdioctl), enosys(), error); if (error != ENOSYS) return error; /* Must be open for writes for these commands... */ switch (cmd) { case CCDIOCSET: case CCDIOCCLR: case DIOCSDINFO: case DIOCWDINFO: case DIOCCACHESYNC: case DIOCAWEDGE: case DIOCDWEDGE: case DIOCRMWEDGES: case DIOCMWEDGES: #ifdef __HAVE_OLD_DISKLABEL case ODIOCSDINFO: case ODIOCWDINFO: #endif case DIOCKLABEL: case DIOCWLABEL: if ((flag & FWRITE) == 0) return (EBADF); } /* Must be initialized for these... 
*/ switch (cmd) { case CCDIOCCLR: case DIOCGDINFO: case DIOCGSTRATEGY: case DIOCGCACHE: case DIOCCACHESYNC: case DIOCAWEDGE: case DIOCDWEDGE: case DIOCLWEDGES: case DIOCMWEDGES: case DIOCSDINFO: case DIOCWDINFO: case DIOCGPARTINFO: case DIOCWLABEL: case DIOCKLABEL: case DIOCGDEFLABEL: #ifdef __HAVE_OLD_DISKLABEL case ODIOCGDINFO: case ODIOCSDINFO: case ODIOCWDINFO: case ODIOCGDEFLABEL: #endif if ((cs->sc_flags & CCDF_INITED) == 0) return ENXIO; } error = disk_ioctl(&cs->sc_dkdev, dev, cmd, data, flag, l); if (error != EPASSTHROUGH) return error; switch (cmd) { case DIOCGSTRATEGY: { struct disk_strategy *dks = (void *)data; mutex_enter(cs->sc_iolock); if (cs->sc_bufq != NULL) strlcpy(dks->dks_name, bufq_getstrategyname(cs->sc_bufq), sizeof(dks->dks_name)); else error = EINVAL; mutex_exit(cs->sc_iolock); dks->dks_paramlen = 0; break; } case DIOCWDINFO: case DIOCSDINFO: #ifdef __HAVE_OLD_DISKLABEL case ODIOCWDINFO: case ODIOCSDINFO: #endif { struct disklabel *lp; #ifdef __HAVE_OLD_DISKLABEL if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) { memset(&newlabel, 0, sizeof newlabel); memcpy(&newlabel, data, sizeof (struct olddisklabel)); lp = &newlabel; } else #endif lp = (struct disklabel *)data; cs->sc_flags |= CCDF_LABELLING; error = setdisklabel(cs->sc_dkdev.dk_label, lp, 0, cs->sc_dkdev.dk_cpulabel); if (error == 0) { if (cmd == DIOCWDINFO #ifdef __HAVE_OLD_DISKLABEL || cmd == ODIOCWDINFO #endif ) error = writedisklabel(CCDLABELDEV(dev), ccdstrategy, cs->sc_dkdev.dk_label, cs->sc_dkdev.dk_cpulabel); } cs->sc_flags &= ~CCDF_LABELLING; break; } case DIOCKLABEL: if (*(int *)data != 0) cs->sc_flags |= CCDF_KLABEL; else cs->sc_flags &= ~CCDF_KLABEL; break; case DIOCWLABEL: if (*(int *)data != 0) cs->sc_flags |= CCDF_WLABEL; else cs->sc_flags &= ~CCDF_WLABEL; break; case DIOCGDEFLABEL: ccdgetdefaultlabel(cs, (struct disklabel *)data); break; #ifdef __HAVE_OLD_DISKLABEL case ODIOCGDEFLABEL: ccdgetdefaultlabel(cs, &newlabel); if (newlabel.d_npartitions > OLDMAXPARTITIONS) return ENOTTY; memcpy(data, &newlabel, sizeof (struct olddisklabel)); break; #endif default: error = ENOTTY; break; } if (error != ENOTTY) return error; mutex_enter(&cs->sc_dvlock); error = 0; switch (cmd) { case CCDIOCSET: if (cs->sc_flags & CCDF_INITED) { error = EBUSY; goto out; } /* Validate the flags. */ if ((ccio->ccio_flags & CCDF_USERMASK) != ccio->ccio_flags) { error = EINVAL; goto out; } if (ccio->ccio_ndisks > CCD_MAXNDISKS || ccio->ccio_ndisks == 0) { error = EINVAL; goto out; } /* Fill in some important bits. */ cs->sc_ileave = ccio->ccio_ileave; cs->sc_nccdisks = ccio->ccio_ndisks; cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK; /* * Allocate space for and copy in the array of * component pathnames and device numbers. 
*/ cpp = kmem_alloc(ccio->ccio_ndisks * sizeof(*cpp), KM_SLEEP); vpp = kmem_alloc(ccio->ccio_ndisks * sizeof(*vpp), KM_SLEEP); error = copyin(ccio->ccio_disks, cpp, ccio->ccio_ndisks * sizeof(*cpp)); if (error) { kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp)); kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp)); goto out; } #ifdef DEBUG if (ccddebug & CCDB_INIT) for (i = 0; i < ccio->ccio_ndisks; ++i) printf("ccdioctl: component %d: %p\n", i, cpp[i]); #endif for (i = 0; i < ccio->ccio_ndisks; ++i) { #ifdef DEBUG if (ccddebug & CCDB_INIT) printf("ccdioctl: lookedup = %d\n", lookedup); #endif error = pathbuf_copyin(cpp[i], &pb); if (error == 0) { error = vn_bdev_openpath(pb, &vpp[i], l); pathbuf_destroy(pb); } if (error != 0) { for (j = 0; j < lookedup; ++j) (void)vn_close(vpp[j], FREAD|FWRITE, uc); kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp)); kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp)); /* * No component data is allocated, * nothing is to be freed. */ cs->sc_nccdisks = 0; goto out; } ++lookedup; } /* Attach the disk. */ disk_attach(&cs->sc_dkdev); bufq_alloc(&cs->sc_bufq, "fcfs", 0); /* * Initialize the ccd. Fills in the softc for us. */ if ((error = ccdinit(cs, cpp, vpp, l)) != 0) { for (j = 0; j < lookedup; ++j) (void)vn_close(vpp[j], FREAD|FWRITE, uc); kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp)); kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp)); disk_detach(&cs->sc_dkdev); mutex_exit(&cs->sc_dvlock); bufq_free(cs->sc_bufq); return error; } /* We can free the temporary variables now. */ kmem_free(vpp, ccio->ccio_ndisks * sizeof(*vpp)); kmem_free(cpp, ccio->ccio_ndisks * sizeof(*cpp)); /* * The ccd has been successfully initialized, so * we can place it into the array. Don't try to * read the disklabel until the disk has been attached, * because space for the disklabel is allocated * in disk_attach(); */ ccio->ccio_unit = unit; ccio->ccio_size = cs->sc_size; /* Try and read the disklabel. */ ccdgetdisklabel(dev); disk_set_info(NULL, &cs->sc_dkdev, NULL); /* discover wedges */ mutex_exit(&cs->sc_dvlock); dkwedge_discover(&cs->sc_dkdev); return 0; case CCDIOCCLR: /* * Don't unconfigure if any other partitions are open * or if both the character and block flavors of this * partition are open. */ part = DISKPART(dev); pmask = (1 << part); if ((cs->sc_dkdev.dk_openmask & ~pmask) || ((cs->sc_dkdev.dk_bopenmask & pmask) && (cs->sc_dkdev.dk_copenmask & pmask))) { error = EBUSY; goto out; } /* Delete all of our wedges. */ dkwedge_delall(&cs->sc_dkdev); /* Stop new I/O, wait for in-flight I/O to complete. */ mutex_enter(cs->sc_iolock); cs->sc_flags &= ~(CCDF_INITED|CCDF_VLABEL); cs->sc_zap = true; while (disk_isbusy(&cs->sc_dkdev) || bufq_peek(cs->sc_bufq) != NULL || cs->sc_thread != NULL) { cv_broadcast(&cs->sc_push); (void)cv_timedwait(&cs->sc_stop, cs->sc_iolock, hz); } mutex_exit(cs->sc_iolock); /* * Free ccd_softc information and clear entry. */ /* Close the components and free their pathnames. */ for (i = 0; i < cs->sc_nccdisks; ++i) { /* * XXX: this close could potentially fail and * cause Bad Things. Maybe we need to force * the close to happen? */ #ifdef DEBUG if (ccddebug & CCDB_VNODE) vprint("CCDIOCCLR: vnode info", cs->sc_cinfo[i].ci_vp); #endif (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, uc); kmem_free(cs->sc_cinfo[i].ci_path, cs->sc_cinfo[i].ci_pathlen); } if (cs->sc_nccdisks != 0) { /* Free interleave index. 
*/ for (i = 0; cs->sc_itable[i].ii_ndisk; ++i) { kmem_free(cs->sc_itable[i].ii_index, cs->sc_itable[i].ii_indexsz); } /* Free component info and interleave table. */ kmem_free(cs->sc_cinfo, cs->sc_nccdisks * sizeof(struct ccdcinfo)); kmem_free(cs->sc_itable, (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo)); } aprint_normal("%s: detached\n", cs->sc_xname); /* Detach the disk. */ disk_detach(&cs->sc_dkdev); bufq_free(cs->sc_bufq); /* also releases sc_dvlock */ ccdput(cs); /* Don't break, otherwise cs is read again. */ return 0; case DIOCGCACHE: { int dkcache = 0; /* * We pass this call down to all components and report * intersection of the flags returned by the components. * If any errors out, we return error. CCD components * can not change unless the device is unconfigured, so * device feature flags will remain static. RCE/WCE can change * of course, if set directly on underlying device. */ for (error = 0, i = 0; i < cs->sc_nccdisks; i++) { error = VOP_IOCTL(cs->sc_cinfo[i].ci_vp, cmd, &j, flag, uc); if (error) break; if (i == 0) dkcache = j; else dkcache = DKCACHE_COMBINE(dkcache, j); } *((int *)data) = dkcache; break; } case DIOCCACHESYNC: /* * We pass this call down to all components and report * the first error we encounter. */ for (error = 0, i = 0; i < cs->sc_nccdisks; i++) { j = VOP_IOCTL(cs->sc_cinfo[i].ci_vp, cmd, data, flag, uc); if (j != 0 && error == 0) error = j; } break; default: error = ENOTTY; break; } out: mutex_exit(&cs->sc_dvlock); return (error); } static int ccdsize(dev_t dev) { struct ccd_softc *cs; struct disklabel *lp; int part, unit, omask, size; unit = ccdunit(dev); if ((cs = ccdget(unit, 0)) == NULL) return -1; if ((cs->sc_flags & CCDF_INITED) == 0) return (-1); part = DISKPART(dev); omask = cs->sc_dkdev.dk_openmask & (1 << part); lp = cs->sc_dkdev.dk_label; if (omask == 0 && ccdopen(dev, 0, S_IFBLK, curlwp)) return (-1); if (lp->d_partitions[part].p_fstype != FS_SWAP) size = -1; else size = lp->d_partitions[part].p_size * (lp->d_secsize / DEV_BSIZE); if (omask == 0 && ccdclose(dev, 0, S_IFBLK, curlwp)) return (-1); return (size); } static void ccdgetdefaultlabel(struct ccd_softc *cs, struct disklabel *lp) { struct ccdgeom *ccg = &cs->sc_geom; memset(lp, 0, sizeof(*lp)); if (cs->sc_size > UINT32_MAX) lp->d_secperunit = UINT32_MAX; else lp->d_secperunit = cs->sc_size; lp->d_secsize = ccg->ccg_secsize; lp->d_nsectors = ccg->ccg_nsectors; lp->d_ntracks = ccg->ccg_ntracks; lp->d_ncylinders = ccg->ccg_ncylinders; lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors; strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename)); lp->d_type = DKTYPE_CCD; strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); lp->d_rpm = 3600; lp->d_interleave = 1; lp->d_flags = 0; lp->d_partitions[RAW_PART].p_offset = 0; lp->d_partitions[RAW_PART].p_size = lp->d_secperunit; lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED; lp->d_npartitions = RAW_PART + 1; lp->d_magic = DISKMAGIC; lp->d_magic2 = DISKMAGIC; lp->d_checksum = dkcksum(cs->sc_dkdev.dk_label); } /* * Read the disklabel from the ccd. If one is not present, fake one * up. */ static void ccdgetdisklabel(dev_t dev) { int unit = ccdunit(dev); struct ccd_softc *cs; const char *errstring; struct disklabel *lp; struct cpu_disklabel *clp; if ((cs = ccdget(unit, 0)) == NULL) return; lp = cs->sc_dkdev.dk_label; clp = cs->sc_dkdev.dk_cpulabel; KASSERT(mutex_owned(&cs->sc_dvlock)); memset(clp, 0, sizeof(*clp)); ccdgetdefaultlabel(cs, lp); /* * Call the generic disklabel extraction routine. 
*/ cs->sc_flags |= CCDF_RLABEL; if ((cs->sc_flags & CCDF_NOLABEL) != 0) errstring = "CCDF_NOLABEL set; ignoring on-disk label"; else errstring = readdisklabel(CCDLABELDEV(dev), ccdstrategy, cs->sc_dkdev.dk_label, cs->sc_dkdev.dk_cpulabel); if (errstring) ccdmakedisklabel(cs); else { int i; struct partition *pp; /* * Sanity check whether the found disklabel is valid. * * This is necessary since total size of ccd may vary * when an interleave is changed even though exactly * same componets are used, and old disklabel may used * if that is found. */ if (lp->d_secperunit < UINT32_MAX ? lp->d_secperunit != cs->sc_size : lp->d_secperunit > cs->sc_size) printf("WARNING: %s: " "total sector size in disklabel (%ju) != " "the size of ccd (%ju)\n", cs->sc_xname, (uintmax_t)lp->d_secperunit, (uintmax_t)cs->sc_size); for (i = 0; i < lp->d_npartitions; i++) { pp = &lp->d_partitions[i]; if (pp->p_offset + pp->p_size > cs->sc_size) printf("WARNING: %s: end of partition `%c' " "exceeds the size of ccd (%ju)\n", cs->sc_xname, 'a' + i, (uintmax_t)cs->sc_size); } } #ifdef DEBUG /* It's actually extremely common to have unlabeled ccds. */ if (ccddebug & CCDB_LABEL) if (errstring != NULL) printf("%s: %s\n", cs->sc_xname, errstring); #endif /* In-core label now valid. */ cs->sc_flags = (cs->sc_flags | CCDF_VLABEL) & ~CCDF_RLABEL; } /* * Take care of things one might want to take care of in the event * that a disklabel isn't present. */ static void ccdmakedisklabel(struct ccd_softc *cs) { struct disklabel *lp = cs->sc_dkdev.dk_label; /* * For historical reasons, if there's no disklabel present * the raw partition must be marked FS_BSDFFS. */ lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS; strncpy(lp->d_packname, "default label", sizeof(lp->d_packname)); lp->d_checksum = dkcksum(lp); } #ifdef DEBUG static void printiinfo(struct ccdiinfo *ii) { int ix, i; for (ix = 0; ii->ii_ndisk; ix++, ii++) { printf(" itab[%d]: #dk %d sblk %" PRId64 " soff %" PRId64, ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff); for (i = 0; i < ii->ii_ndisk; i++) printf(" %d", ii->ii_index[i]); printf("\n"); } } #endif MODULE(MODULE_CLASS_DRIVER, ccd, "dk_subr,bufq_fcfs"); static int ccd_modcmd(modcmd_t cmd, void *arg) { int error = 0; #ifdef _MODULE int bmajor = -1, cmajor = -1; #endif switch (cmd) { case MODULE_CMD_INIT: #ifdef _MODULE ccdattach(0); error = devsw_attach("ccd", &ccd_bdevsw, &bmajor, &ccd_cdevsw, &cmajor); #endif break; case MODULE_CMD_FINI: #ifdef _MODULE mutex_enter(&ccd_lock); if (!LIST_EMPTY(&ccds)) { mutex_exit(&ccd_lock); error = EBUSY; } else { mutex_exit(&ccd_lock); devsw_detach(&ccd_bdevsw, &ccd_cdevsw); ccddetach(); } #endif break; case MODULE_CMD_STAT: return ENOTTY; default: return ENOTTY; } return error; } static int ccd_units_sysctl(SYSCTLFN_ARGS) { struct sysctlnode node; struct ccd_softc *sc; int error, i, nccd, *units; size_t size; nccd = 0; mutex_enter(&ccd_lock); LIST_FOREACH(sc, &ccds, sc_link) nccd++; mutex_exit(&ccd_lock); if (nccd != 0) { size = nccd * sizeof(*units); units = kmem_zalloc(size, KM_SLEEP); i = 0; mutex_enter(&ccd_lock); LIST_FOREACH(sc, &ccds, sc_link) { if (i >= nccd) break; units[i] = sc->sc_unit; } mutex_exit(&ccd_lock); } else { units = NULL; size = 0; } node = *rnode; node.sysctl_data = units; node.sysctl_size = size; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (units) kmem_free(units, size); return error; } static int ccd_info_sysctl(SYSCTLFN_ARGS) { struct sysctlnode node; struct ccddiskinfo ccd; struct ccd_softc *sc; int unit, error; if (newp == NULL || newlen != 
sizeof(int)) return EINVAL; error = sysctl_copyin(l, newp, &unit, sizeof unit); if (error) return error; newlen = 0; ccd.ccd_ndisks = ~0; mutex_enter(&ccd_lock); LIST_FOREACH(sc, &ccds, sc_link) { if (sc->sc_unit == unit) { ccd.ccd_ileave = sc->sc_ileave; ccd.ccd_size = sc->sc_size; ccd.ccd_ndisks = sc->sc_nccdisks; ccd.ccd_flags = sc->sc_flags; break; } } mutex_exit(&ccd_lock); if (ccd.ccd_ndisks == ~0) return ENOENT; node = *rnode; node.sysctl_data = &ccd; node.sysctl_size = sizeof(ccd); return sysctl_lookup(SYSCTLFN_CALL(&node)); } static int ccd_components_sysctl(SYSCTLFN_ARGS) { struct sysctlnode node; int error, unit; size_t size; char *names, *p, *ep; struct ccd_softc *sc; if (newp == NULL || newlen != sizeof(int)) return EINVAL; size = 0; error = sysctl_copyin(l, newp, &unit, sizeof unit); if (error) return error; newlen = 0; mutex_enter(&ccd_lock); LIST_FOREACH(sc, &ccds, sc_link) if (sc->sc_unit == unit) { for (size_t i = 0; i < sc->sc_nccdisks; i++) size += strlen(sc->sc_cinfo[i].ci_path) + 1; break; } mutex_exit(&ccd_lock); if (size == 0) return ENOENT; names = kmem_zalloc(size, KM_SLEEP); p = names; ep = names + size; mutex_enter(&ccd_lock); LIST_FOREACH(sc, &ccds, sc_link) if (sc->sc_unit == unit) { for (size_t i = 0; i < sc->sc_nccdisks; i++) { char *d = sc->sc_cinfo[i].ci_path; while (p < ep && (*p++ = *d++) != '\0') continue; } break; } mutex_exit(&ccd_lock); node = *rnode; node.sysctl_data = names; node.sysctl_size = ep - names; error = sysctl_lookup(SYSCTLFN_CALL(&node)); kmem_free(names, size); return error; } SYSCTL_SETUP(sysctl_kern_ccd_setup, "sysctl kern.ccd subtree setup") { const struct sysctlnode *node = NULL; sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ccd", SYSCTL_DESCR("ConCatenated Disk state"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); if (node == NULL) return; sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READONLY, CTLTYPE_STRUCT, "units", SYSCTL_DESCR("List of ccd unit numbers"), ccd_units_sysctl, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_STRUCT, "info", SYSCTL_DESCR("Information about a CCD unit"), ccd_info_sysctl, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_STRUCT, "components", SYSCTL_DESCR("Information about CCD components"), ccd_components_sysctl, 0, NULL, 0, CTL_CREATE, CTL_EOL); }
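/*
 * Editor's illustrative sketch -- not part of ccd.c or the NetBSD sources.
 * For the common case of a uniform interleave (CCDF_UNIFORM, all components
 * the same size, so the interleave table built by ccdinterleave() collapses
 * to a single entry with ii_startblk and ii_startoff both zero), the block
 * mapping performed in ccdbuffer() reduces to the arithmetic below: the
 * stripe units are dealt round-robin across the components.  The function
 * and structure names and the demo geometry are hypothetical, and this
 * models only that uniform single-entry case, not the general multi-entry
 * interleave table.
 */
#include <stdint.h>
#include <stdio.h>

struct ccd_map {
	int	component;	/* which underlying component disk */
	int64_t	blkno;		/* block number on that component */
};

static struct ccd_map
ccd_map_uniform(int64_t bn, int64_t ileave, int ndisk)
{
	struct ccd_map m;
	int64_t cboff = bn % ileave;	/* offset within the stripe unit */
	int64_t stripe = bn / ileave;	/* which stripe unit overall */

	m.component = (int)(stripe % ndisk);	/* round-robin across components */
	m.blkno = (stripe / ndisk) * ileave + cboff;
	return m;
}

int
main(void)
{
	/* e.g. 3 components with a 32-block interleave */
	const int64_t ileave = 32;
	const int ndisk = 3;
	int64_t bn;

	for (bn = 0; bn < 200; bn += 40) {
		struct ccd_map m = ccd_map_uniform(bn, ileave, ndisk);
		printf("ccd block %4lld -> component %d, block %lld\n",
		    (long long)bn, m.component, (long long)m.blkno);
	}
	return 0;
}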
/* $NetBSD: uipc_socket2.c,v 1.143 2024/01/03 18:10:42 andvar Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.143 2024/01/03 18:10:42 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_ddb.h" #include "opt_inet.h" #include "opt_mbuftrace.h" #include "opt_sb_max.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/buf.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/domain.h> #include <sys/poll.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/signalvar.h> #include <sys/kauth.h> #include <sys/pool.h> #include <sys/uidinfo.h> #ifdef DDB #include <sys/filedesc.h> #include <ddb/db_active.h> #endif /* * Primitive routines for operating on sockets and socket buffers. * * Connection life-cycle: * * Normal sequence from the active (originating) side: * * - soisconnecting() is called during processing of connect() call, * - resulting in an eventual call to soisconnected() if/when the * connection is established. * * When the connection is torn down during processing of disconnect(): * * - soisdisconnecting() is called and, * - soisdisconnected() is called when the connection to the peer * is totally severed. * * The semantics of these routines are such that connectionless protocols * can call soisconnected() and soisdisconnected() only, bypassing the * in-progress calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * * - so_q0 (0) for partial connections (i.e. connections in progress) * - so_q (1) for connections already made and awaiting user acceptance. * * As a protocol is preparing incoming connections, it creates a socket * structure queued on so_q0 by calling sonewconn(). When the connection * is established, soisconnected() is called, and transfers the * socket structure to so_q, making it available to accept(). * * If a socket is closed with sockets on either so_q0 or so_q, these * sockets are dropped. * * Locking rules and assumptions: * * o socket::so_lock can change on the fly. The low level routines used * to lock sockets are aware of this. When so_lock is acquired, the * routine locking must check to see if so_lock still points to the * lock that was acquired. If so_lock has changed in the meantime, the * now irrelevant lock that was acquired must be dropped and the lock * operation retried. Although not proven here, this is completely safe * on a multiprocessor system, even with relaxed memory ordering, given * the next two rules: * * o In order to mutate so_lock, the lock pointed to by the current value * of so_lock must be held: i.e., the socket must be held locked by the * changing thread. 
The thread must issue membar_release() to prevent * memory accesses being reordered, and can set so_lock to the desired * value. If the lock pointed to by the new value of so_lock is not * held by the changing thread, the socket must then be considered * unlocked. * * o If so_lock is mutated, and the previous lock referred to by so_lock * could still be visible to other threads in the system (e.g. via file * descriptor or protocol-internal reference), then the old lock must * remain valid until the socket and/or protocol control block has been * torn down. * * o If a socket has a non-NULL so_head value (i.e. is in the process of * connecting), then locking the socket must also lock the socket pointed * to by so_head: their lock pointers must match. * * o If a socket has connections in progress (so_q, so_q0 not empty) then * locking the socket must also lock the sockets attached to both queues. * Again, their lock pointers must match. * * o Beyond the initial lock assignment in socreate(), assigning locks to * sockets is the responsibility of the individual protocols / protocol * domains. */ static pool_cache_t socket_cache; u_long sb_max = SB_MAX;/* maximum socket buffer size */ static u_long sb_max_adj; /* adjusted sb_max */ void soisconnecting(struct socket *so) { KASSERT(solocked(so)); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; } void soisconnected(struct socket *so) { struct socket *head; head = so->so_head; KASSERT(solocked(so)); KASSERT(head == NULL || solocked2(so, head)); so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTED; if (head && so->so_onq == &head->so_q0) { if ((so->so_options & SO_ACCEPTFILTER) == 0) { /* * Re-enqueue and wake up any waiters, e.g. * processes blocking on accept(). */ soqremque(so, 0); soqinsque(head, so, 1); sorwakeup(head); cv_broadcast(&head->so_cv); } else { so->so_upcall = head->so_accf->so_accept_filter->accf_callback; so->so_upcallarg = head->so_accf->so_accept_filter_arg; so->so_rcv.sb_flags |= SB_UPCALL; so->so_options &= ~SO_ACCEPTFILTER; (*so->so_upcall)(so, so->so_upcallarg, POLLIN|POLLRDNORM, M_DONTWAIT); } } else { cv_broadcast(&so->so_cv); sorwakeup(so); sowwakeup(so); } } void soisdisconnecting(struct socket *so) { KASSERT(solocked(so)); so->so_state &= ~SS_ISCONNECTING; so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); cv_broadcast(&so->so_cv); sowwakeup(so); sorwakeup(so); } void soisdisconnected(struct socket *so) { KASSERT(solocked(so)); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); cv_broadcast(&so->so_cv); sowwakeup(so); sorwakeup(so); } void soinit2(void) { socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0, "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL); } /* * sonewconn: accept a new connection. * * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn(9) is called. If the connection is possible * (subject to space constraints, etc) then we allocate a new structure, * properly linked into the data structure of the original socket. * * => If 'soready' is true, then socket will become ready for accept() i.e. * inserted into the so_q queue, SS_ISCONNECTED set and waiters awoken. * => May be called from soft-interrupt context. * => Listening socket should be locked. * => Returns the new socket locked. 
*/ struct socket * sonewconn(struct socket *head, bool soready) { struct socket *so; int soqueue, error; KASSERT(solocked(head)); if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) { /* * Listen queue overflow. If there is an accept filter * active, pass through the oldest cxn it's handling. */ if (head->so_accf == NULL) { return NULL; } else { struct socket *so2, *next; /* Pass the oldest connection waiting in the accept filter */ for (so2 = TAILQ_FIRST(&head->so_q0); so2 != NULL; so2 = next) { next = TAILQ_NEXT(so2, so_qe); if (so2->so_upcall == NULL) { continue; } so2->so_upcall = NULL; so2->so_upcallarg = NULL; so2->so_options &= ~SO_ACCEPTFILTER; so2->so_rcv.sb_flags &= ~SB_UPCALL; soisconnected(so2); break; } /* If nothing was nudged out of the accept filter, bail * out; otherwise proceed allocating the socket. */ if (so2 == NULL) { return NULL; } } } if ((head->so_options & SO_ACCEPTFILTER) != 0) { soready = false; } soqueue = soready ? 1 : 0; if ((so = soget(false)) == NULL) { return NULL; } so->so_type = head->so_type; so->so_options = head->so_options & ~SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_proto = head->so_proto; so->so_timeo = head->so_timeo; so->so_pgid = head->so_pgid; so->so_send = head->so_send; so->so_receive = head->so_receive; so->so_uidinfo = head->so_uidinfo; so->so_egid = head->so_egid; so->so_cpid = head->so_cpid; /* * Share the lock with the listening-socket, it may get unshared * once the connection is complete. * * so_lock is stable while we hold the socket locked, so no * need for atomic_load_* here. */ mutex_obj_hold(head->so_lock); so->so_lock = head->so_lock; /* * Reserve the space for socket buffers. */ #ifdef MBUFTRACE so->so_mowner = head->so_mowner; so->so_rcv.sb_mowner = head->so_rcv.sb_mowner; so->so_snd.sb_mowner = head->so_snd.sb_mowner; #endif if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { goto out; } so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & (SB_AUTOSIZE | SB_ASYNC); so->so_snd.sb_flags |= head->so_snd.sb_flags & (SB_AUTOSIZE | SB_ASYNC); /* * Finally, perform the protocol attach. Note: a new socket * lock may be assigned at this point (if so, it will be held). */ error = (*so->so_proto->pr_usrreqs->pr_attach)(so, 0); if (error) { out: KASSERT(solocked(so)); KASSERT(so->so_accf == NULL); soput(so); /* Note: the listening socket shall stay locked. */ KASSERT(solocked(head)); return NULL; } KASSERT(solocked2(head, so)); /* * Insert into the queue. If ready, update the connection status * and wake up any waiters, e.g. processes blocking on accept(). */ soqinsque(head, so, soqueue); if (soready) { so->so_state |= SS_ISCONNECTED; sorwakeup(head); cv_broadcast(&head->so_cv); } return so; } struct socket * soget(bool waitok) { struct socket *so; so = pool_cache_get(socket_cache, (waitok ?
PR_WAITOK : PR_NOWAIT)); if (__predict_false(so == NULL)) return (NULL); memset(so, 0, sizeof(*so)); TAILQ_INIT(&so->so_q0); TAILQ_INIT(&so->so_q); cv_init(&so->so_cv, "socket"); cv_init(&so->so_rcv.sb_cv, "netio"); cv_init(&so->so_snd.sb_cv, "netio"); selinit(&so->so_rcv.sb_sel); selinit(&so->so_snd.sb_sel); so->so_rcv.sb_so = so; so->so_snd.sb_so = so; return so; } void soput(struct socket *so) { KASSERT(!cv_has_waiters(&so->so_cv)); KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv)); KASSERT(!cv_has_waiters(&so->so_snd.sb_cv)); seldestroy(&so->so_rcv.sb_sel); seldestroy(&so->so_snd.sb_sel); mutex_obj_free(so->so_lock); cv_destroy(&so->so_cv); cv_destroy(&so->so_rcv.sb_cv); cv_destroy(&so->so_snd.sb_cv); pool_cache_put(socket_cache, so); } /* * soqinsque: insert socket of a new connection into the specified * accept queue of the listening socket (head). * * q = 0: queue of partial connections * q = 1: queue of incoming connections */ void soqinsque(struct socket *head, struct socket *so, int q) { KASSERT(q == 0 || q == 1); KASSERT(solocked2(head, so)); KASSERT(so->so_onq == NULL); KASSERT(so->so_head == NULL); so->so_head = head; if (q == 0) { head->so_q0len++; so->so_onq = &head->so_q0; } else { head->so_qlen++; so->so_onq = &head->so_q; } TAILQ_INSERT_TAIL(so->so_onq, so, so_qe); } /* * soqremque: remove socket from the specified queue. * * => Returns true if socket was removed from the specified queue. * => False if socket was not removed (because it was in other queue). */ bool soqremque(struct socket *so, int q) { struct socket *head = so->so_head; KASSERT(q == 0 || q == 1); KASSERT(solocked(so)); KASSERT(so->so_onq != NULL); KASSERT(head != NULL); if (q == 0) { if (so->so_onq != &head->so_q0) return false; head->so_q0len--; } else { if (so->so_onq != &head->so_q) return false; head->so_qlen--; } KASSERT(solocked2(so, head)); TAILQ_REMOVE(so->so_onq, so, so_qe); so->so_onq = NULL; so->so_head = NULL; return true; } /* * socantsendmore: indicates that no more data will be sent on the * socket; it would normally be applied to a socket when the user * informs the system that no more data is to be sent, by the protocol * code (in case pr_shutdown()). */ void socantsendmore(struct socket *so) { KASSERT(solocked(so)); so->so_state |= SS_CANTSENDMORE; sowwakeup(so); } /* * socantrcvmore(): indicates that no more data will be received and * will normally be applied to the socket by a protocol when it detects * that the peer will send no more data. Data queued for reading in * the socket may yet be read. */ void socantrcvmore(struct socket *so) { KASSERT(solocked(so)); so->so_state |= SS_CANTRCVMORE; sorwakeup(so); } /* * soroverflow(): indicates that data was attempted to be sent * but the receiving buffer overflowed. */ void soroverflow(struct socket *so) { KASSERT(solocked(so)); so->so_rcv.sb_overflowed++; if (so->so_options & SO_RERROR) { so->so_rerror = ENOBUFS; sorwakeup(so); } } /* * Wait for data to arrive at/drain from a socket buffer. */ int sbwait(struct sockbuf *sb) { struct socket *so; kmutex_t *lock; int error; so = sb->sb_so; KASSERT(solocked(so)); sb->sb_flags |= SB_NOTIFY; lock = so->so_lock; if ((sb->sb_flags & SB_NOINTR) != 0) error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo); else error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo); if (__predict_false(lock != atomic_load_relaxed(&so->so_lock))) solockretry(so, lock); return error; } /* * Wakeup processes waiting on a socket buffer. 
* Do asynchronous notification via SIGIO * if the socket buffer has the SB_ASYNC flag set. */ void sowakeup(struct socket *so, struct sockbuf *sb, int code) { int band; KASSERT(solocked(so)); KASSERT(sb->sb_so == so); switch (code) { case POLL_IN: band = POLLIN|POLLRDNORM; break; case POLL_OUT: band = POLLOUT|POLLWRNORM; break; case POLL_HUP: band = POLLHUP; break; default: band = 0; #ifdef DIAGNOSTIC printf("bad siginfo code %d in socket notification.\n", code); #endif break; } sb->sb_flags &= ~SB_NOTIFY; selnotify(&sb->sb_sel, band, NOTE_SUBMIT); cv_broadcast(&sb->sb_cv); if (sb->sb_flags & SB_ASYNC) fownsignal(so->so_pgid, SIGIO, code, band, so); if (sb->sb_flags & SB_UPCALL) (*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT); } /* * Reset a socket's lock pointer. Wake all threads waiting on the * socket's condition variables so that they can restart their waits * using the new lock. The existing lock must be held. * * Caller must have issued membar_release before this. */ void solockreset(struct socket *so, kmutex_t *lock) { KASSERT(solocked(so)); so->so_lock = lock; cv_broadcast(&so->so_snd.sb_cv); cv_broadcast(&so->so_rcv.sb_cv); cv_broadcast(&so->so_cv); } /* * Socket buffer (struct sockbuf) utility routines. * * Each socket contains two socket buffers: one for sending data and * one for receiving data. Each buffer contains a queue of mbufs, * information about the number of mbufs and amount of data in the * queue, and other fields allowing poll() statements and notification * on data availability to be implemented. * * Data stored in a socket buffer is maintained as a list of records. * Each record is a list of mbufs chained together with the m_next * field. Records are chained together with the m_nextpkt field. The upper * level routine soreceive() expects the following conventions to be * observed when placing information in the receive buffer: * * 1. If the protocol requires each message be preceded by the sender's * name, then a record containing that name must be present before * any associated data (mbuf's must be of type MT_SONAME). * 2. If the protocol supports the exchange of ``access rights'' (really * just additional data associated with the message), and there are * ``rights'' to be received, then a record containing this data * should be present (mbuf's must be of type MT_CONTROL). * 3. If a name or rights record exists, then it must be followed by * a data record, perhaps of zero length. * * Before using a new socket structure it is first necessary to reserve * buffer space to the socket, by calling sbreserve(). This should commit * some of the available buffer space in the system buffer pool for the * socket (currently, it does nothing but enforce limits). The space * should be released by calling sbrelease() when the socket is destroyed. */ int sb_max_set(u_long new_sbmax) { int s; if (new_sbmax < (16 * 1024)) return (EINVAL); s = splsoftnet(); sb_max = new_sbmax; sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES); splx(s); return (0); } int soreserve(struct socket *so, u_long sndcc, u_long rcvcc) { KASSERT(so->so_pcb == NULL || solocked(so)); /* * there's at least one application (a configure script of screen) * which expects a fifo is writable even if it has "some" bytes * in its buffer. * so we want to make sure (hiwat - lowat) >= (some bytes). * * PIPE_BUF here is an arbitrary value chosen as (some bytes) above. * we expect it's large enough for such applications. 
*/ u_long lowat = MAX(sock_loan_thresh, MCLBYTES); u_long hiwat = lowat + PIPE_BUF; if (sndcc < hiwat) sndcc = hiwat; if (sbreserve(&so->so_snd, sndcc, so) == 0) goto bad; if (sbreserve(&so->so_rcv, rcvcc, so) == 0) goto bad2; if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; if (so->so_snd.sb_lowat == 0) so->so_snd.sb_lowat = lowat; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; return (0); bad2: sbrelease(&so->so_snd, so); bad: return (ENOBUFS); } /* * Allot mbufs to a sockbuf. * Attempt to scale mbmax so that mbcnt doesn't become limiting * if buffering efficiency is near the normal case. */ int sbreserve(struct sockbuf *sb, u_long cc, struct socket *so) { struct lwp *l = curlwp; /* XXX */ rlim_t maxcc; struct uidinfo *uidinfo; KASSERT(so->so_pcb == NULL || solocked(so)); KASSERT(sb->sb_so == so); KASSERT(sb_max_adj != 0); if (cc == 0 || cc > sb_max_adj) return (0); maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur; uidinfo = so->so_uidinfo; if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc)) return 0; sb->sb_mbmax = uimin(cc * 2, sb_max); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (1); } /* * Free mbufs held by a socket, and reserved mbuf space. We do not assert * that the socket is held locked here: see sorflush(). */ void sbrelease(struct sockbuf *sb, struct socket *so) { KASSERT(sb->sb_so == so); sbflush(sb); (void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY); sb->sb_mbmax = 0; } /* * Routines to add and remove * data from an mbuf queue. * * The routines sbappend() or sbappendrecord() are normally called to * append new mbufs to a socket buffer, after checking that adequate * space is available, comparing the function sbspace() with the amount * of data to be added. sbappendrecord() differs from sbappend() in * that data supplied is treated as the beginning of a new record. * To place a sender's address, optional access rights, and data in a * socket receive buffer, sbappendaddr() should be used. To place * access rights and data in a socket receive buffer, sbappendrights() * should be used. In either case, the new data begins a new record. * Note that unlike sbappend() and sbappendrecord(), these routines check * for the caller that there will be enough space to store the data. * Each fails if there is not enough space, or if it cannot find mbufs * to store additional information in. * * Reliable protocols may use the socket send buffer to hold data * awaiting acknowledgement. Data is normally copied from a socket * send buffer in a protocol with m_copym for output to a peer, * and then removing the data from the socket buffer with sbdrop() * or sbdroprecord() when the data is acknowledged by the peer. 
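 * (For example, a stream protocol such as TCP typically copies pending
 * data with m_copym(so->so_snd.sb_mb, off, len, M_DONTWAIT) for
 * transmission and, once those bytes are acknowledged, trims them with
 * sbdrop(&so->so_snd, acked); `off', `len' and `acked' are illustrative
 * names only.)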
*/ #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *sb, const char *where) { struct mbuf *m = sb->sb_mb; KASSERT(solocked(sb->sb_so)); while (m && m->m_nextpkt) m = m->m_nextpkt; if (m != sb->sb_lastrecord) { printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n", sb->sb_mb, sb->sb_lastrecord, m); printf("packet chain:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) printf("\t%p\n", m); panic("sblastrecordchk from %s", where); } } void sblastmbufchk(struct sockbuf *sb, const char *where) { struct mbuf *m = sb->sb_mb; struct mbuf *n; KASSERT(solocked(sb->sb_so)); while (m && m->m_nextpkt) m = m->m_nextpkt; while (m && m->m_next) m = m->m_next; if (m != sb->sb_mbtail) { printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n", sb->sb_mb, sb->sb_mbtail, m); printf("packet tree:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { printf("\t"); for (n = m; n != NULL; n = n->m_next) printf("%p ", n); printf("\n"); } panic("sblastmbufchk from %s", where); } } #endif /* SOCKBUF_DEBUG */ /* * Link a chain of records onto a socket buffer */ #define SBLINKRECORDCHAIN(sb, m0, mlast) \ do { \ if ((sb)->sb_lastrecord != NULL) \ (sb)->sb_lastrecord->m_nextpkt = (m0); \ else \ (sb)->sb_mb = (m0); \ (sb)->sb_lastrecord = (mlast); \ } while (/*CONSTCOND*/0) #define SBLINKRECORD(sb, m0) \ SBLINKRECORDCHAIN(sb, m0, m0) /* * Append mbuf chain m to the last record in the * socket buffer sb. The additional space associated * the mbuf chain is recorded in sb. Empty mbufs are * discarded and mbufs are compacted where possible. */ void sbappend(struct sockbuf *sb, struct mbuf *m) { struct mbuf *n; KASSERT(solocked(sb->sb_so)); if (m == NULL) return; #ifdef MBUFTRACE m_claimm(m, sb->sb_mowner); #endif SBLASTRECORDCHK(sb, "sbappend 1"); if ((n = sb->sb_lastrecord) != NULL) { /* * XXX Would like to simply use sb_mbtail here, but * XXX I need to verify that I won't miss an EOR that * XXX way. */ do { if (n->m_flags & M_EOR) { sbappendrecord(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * If this is the first record in the socket buffer, it's * also the last record. */ sb->sb_lastrecord = m; } sbcompress(sb, m, n); SBLASTRECORDCHK(sb, "sbappend 2"); } /* * This version of sbappend() should only be used when the caller * absolutely knows that there will never be more than one record * in the socket buffer, that is, a stream protocol (such as TCP). */ void sbappendstream(struct sockbuf *sb, struct mbuf *m) { KASSERT(solocked(sb->sb_so)); KDASSERT(m->m_nextpkt == NULL); KASSERT(sb->sb_mb == sb->sb_lastrecord); SBLASTMBUFCHK(sb, __func__); #ifdef MBUFTRACE m_claimm(m, sb->sb_mowner); #endif sbcompress(sb, m, sb->sb_mbtail); sb->sb_lastrecord = sb->sb_mb; SBLASTRECORDCHK(sb, __func__); } #ifdef SOCKBUF_DEBUG void sbcheck(struct sockbuf *sb) { struct mbuf *m, *m2; u_long len, mbcnt; KASSERT(solocked(sb->sb_so)); len = 0; mbcnt = 0; for (m = sb->sb_mb; m; m = m->m_nextpkt) { for (m2 = m; m2 != NULL; m2 = m2->m_next) { len += m2->m_len; mbcnt += MSIZE; if (m2->m_flags & M_EXT) mbcnt += m2->m_ext.ext_size; if (m2->m_nextpkt != NULL) panic("sbcheck nextpkt"); } } if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc, mbcnt, sb->sb_mbcnt); panic("sbcheck"); } } #endif /* * As above, except the mbuf chain * begins a new record. 
*/ void sbappendrecord(struct sockbuf *sb, struct mbuf *m0) { struct mbuf *m; KASSERT(solocked(sb->sb_so)); if (m0 == NULL) return; #ifdef MBUFTRACE m_claimm(m0, sb->sb_mowner); #endif /* * Put the first mbuf on the queue. * Note this permits zero length records. */ sballoc(sb, m0); SBLASTRECORDCHK(sb, "sbappendrecord 1"); SBLINKRECORD(sb, m0); m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } sbcompress(sb, m, m0); SBLASTRECORDCHK(sb, "sbappendrecord 2"); } /* * As above except that OOB data * is inserted at the beginning of the sockbuf, * but after any other OOB data. */ void sbinsertoob(struct sockbuf *sb, struct mbuf *m0) { struct mbuf *m, **mp; KASSERT(solocked(sb->sb_so)); if (m0 == NULL) return; SBLASTRECORDCHK(sb, "sbinsertoob 1"); for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) { again: switch (m->m_type) { case MT_OOBDATA: continue; /* WANT next train */ case MT_CONTROL: if ((m = m->m_next) != NULL) goto again; /* inspect THIS train further */ } break; } /* * Put the first mbuf on the queue. * Note this permits zero length records. */ sballoc(sb, m0); m0->m_nextpkt = *mp; if (*mp == NULL) { /* m0 is actually the new tail */ sb->sb_lastrecord = m0; } *mp = m0; m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } sbcompress(sb, m, m0); SBLASTRECORDCHK(sb, "sbinsertoob 2"); } /* * Append address and data, and optionally, control (ancillary) data * to the receive queue of a socket. If present, * m0 must include a packet header with total length. * Returns 0 if no space in sockbuf or insufficient mbufs. */ int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *n, *nlast; int space, len; KASSERT(solocked(sb->sb_so)); space = asa->sa_len; if (m0 != NULL) { if ((m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr"); space += m0->m_pkthdr.len; #ifdef MBUFTRACE m_claimm(m0, sb->sb_mowner); #endif } for (n = control; n; n = n->m_next) { space += n->m_len; MCLAIM(n, sb->sb_mowner); if (n->m_next == NULL) /* keep pointer to last control buf */ break; } if (space > sbspace(sb)) return (0); m = m_get(M_DONTWAIT, MT_SONAME); if (m == NULL) return (0); MCLAIM(m, sb->sb_mowner); /* * XXX avoid 'comparison always true' warning which isn't easily * avoided. */ len = asa->sa_len; if (len > MLEN) { MEXTMALLOC(m, asa->sa_len, M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return (0); } } m->m_len = asa->sa_len; memcpy(mtod(m, void *), asa, asa->sa_len); if (n) n->m_next = m0; /* concatenate data to control */ else control = m0; m->m_next = control; SBLASTRECORDCHK(sb, "sbappendaddr 1"); for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb, "sbappendaddr"); SBLASTRECORDCHK(sb, "sbappendaddr 2"); return (1); } /* * Helper for sbappendchainaddr: prepend a struct sockaddr* to * an mbuf chain. 
*/ static inline struct mbuf * m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0, const struct sockaddr *asa) { struct mbuf *m; const int salen = asa->sa_len; KASSERT(solocked(sb->sb_so)); /* only the first in each chain need be a pkthdr */ m = m_gethdr(M_DONTWAIT, MT_SONAME); if (m == NULL) return NULL; MCLAIM(m, sb->sb_mowner); #ifdef notyet if (salen > MHLEN) { MEXTMALLOC(m, salen, M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return NULL; } } #else KASSERT(salen <= MHLEN); #endif m->m_len = salen; memcpy(mtod(m, void *), asa, salen); m->m_next = m0; m->m_pkthdr.len = salen + m0->m_pkthdr.len; return m; } int sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, int sbprio) { struct mbuf *m, *n, *n0, *nlast; int error; KASSERT(solocked(sb->sb_so)); /* * XXX sbprio reserved for encoding priority of this* request: * SB_PRIO_NONE --> honour normal sb limits * SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space, * take whole chain. Intended for large requests * that should be delivered atomically (all, or none). * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow * over normal socket limits, for messages indicating * buffer overflow in earlier normal/lower-priority messages * SB_PRIO_BESTEFFORT --> ignore limits entirely. * Intended for kernel-generated messages only. * Up to generator to avoid total mbuf resource exhaustion. */ (void)sbprio; if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddrchain"); #ifdef notyet space = sbspace(sb); /* * Enforce SB_PRIO_* limits as described above. */ #endif n0 = NULL; nlast = NULL; for (m = m0; m; m = m->m_nextpkt) { struct mbuf *np; #ifdef MBUFTRACE m_claimm(m, sb->sb_mowner); #endif /* Prepend sockaddr to this record (m) of input chain m0 */ n = m_prepend_sockaddr(sb, m, asa); if (n == NULL) { error = ENOBUFS; goto bad; } /* Append record (asa+m) to end of new chain n0 */ if (n0 == NULL) { n0 = n; } else { nlast->m_nextpkt = n; } /* Keep track of last record on new chain */ nlast = n; for (np = n; np; np = np->m_next) sballoc(sb, np); } SBLASTRECORDCHK(sb, "sbappendaddrchain 1"); /* Drop the entire chain of (asa+m) records onto the socket */ SBLINKRECORDCHAIN(sb, n0, nlast); SBLASTRECORDCHK(sb, "sbappendaddrchain 2"); for (m = nlast; m->m_next; m = m->m_next) ; sb->sb_mbtail = m; SBLASTMBUFCHK(sb, "sbappendaddrchain"); return (1); bad: /* * On error, free the prepended addresses. For consistency * with sbappendaddr(), leave it to our caller to free * the input record chain passed to us as m0. 
*/ while ((n = n0) != NULL) { struct mbuf *np; /* Undo the sballoc() of this record */ for (np = n; np; np = np->m_next) sbfree(sb, np); n0 = n->m_nextpkt; /* iterate at next prepended address */ np = m_free(n); /* free prepended address (not data) */ } return error; } int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *mlast, *n; int space; KASSERT(solocked(sb->sb_so)); space = 0; if (control == NULL) panic("sbappendcontrol"); for (m = control; ; m = m->m_next) { space += m->m_len; MCLAIM(m, sb->sb_mowner); if (m->m_next == NULL) break; } n = m; /* save pointer to last control buffer */ for (m = m0; m; m = m->m_next) { MCLAIM(m, sb->sb_mowner); space += m->m_len; } if (space > sbspace(sb)) return (0); n->m_next = m0; /* concatenate data to control */ SBLASTRECORDCHK(sb, "sbappendcontrol 1"); for (m = control; m->m_next != NULL; m = m->m_next) sballoc(sb, m); sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb, "sbappendcontrol"); SBLASTRECORDCHK(sb, "sbappendcontrol 2"); return (1); } /* * Compress mbuf chain m into the socket * buffer sb following mbuf n. If n * is null, the buffer is presumed empty. */ void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) { int eor; struct mbuf *o; KASSERT(solocked(sb->sb_so)); eor = 0; while (m) { eor |= m->m_flags & M_EOR; if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) && o->m_type == m->m_type))) { if (sb->sb_lastrecord == m) sb->sb_lastrecord = m->m_next; m = m_free(m); continue; } if (n && (n->m_flags & M_EOR) == 0 && /* M_TRAILINGSPACE() checks buffer writeability */ m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { memcpy(mtod(n, char *) + n->m_len, mtod(m, void *), (unsigned)m->m_len); n->m_len += m->m_len; sb->sb_cc += m->m_len; m = m_free(m); continue; } if (n) n->m_next = m; else sb->sb_mb = m; sb->sb_mbtail = m; sballoc(sb, m); n = m; m->m_flags &= ~M_EOR; m = m->m_next; n->m_next = 0; } if (eor) { if (n) n->m_flags |= eor; else printf("semi-panic: sbcompress\n"); } SBLASTMBUFCHK(sb, __func__); } /* * Free all mbufs in a sockbuf. * Check that all resources are reclaimed. */ void sbflush(struct sockbuf *sb) { KASSERT(solocked(sb->sb_so)); KASSERT((sb->sb_flags & SB_LOCK) == 0); while (sb->sb_mbcnt) sbdrop(sb, (int)sb->sb_cc); KASSERT(sb->sb_cc == 0); KASSERT(sb->sb_mb == NULL); KASSERT(sb->sb_mbtail == NULL); KASSERT(sb->sb_lastrecord == NULL); } /* * Drop data from (the front of) a sockbuf. */ void sbdrop(struct sockbuf *sb, int len) { struct mbuf *m, *next; KASSERT(solocked(sb->sb_so)); next = (m = sb->sb_mb) ? m->m_nextpkt : NULL; while (len > 0) { if (m == NULL) { if (next == NULL) panic("sbdrop(%p,%d): cc=%lu", sb, len, sb->sb_cc); m = next; next = m->m_nextpkt; continue; } if (m->m_len > len) { m->m_len -= len; m->m_data += len; sb->sb_cc -= len; break; } len -= m->m_len; sbfree(sb, m); m = m_free(m); } while (m && m->m_len == 0) { sbfree(sb, m); m = m_free(m); } if (m) { sb->sb_mb = m; m->m_nextpkt = next; } else sb->sb_mb = next; /* * First part is an inline SB_EMPTY_FIXUP(). Second part * makes sure sb_lastrecord is up-to-date if we dropped * part of the last record. */ m = sb->sb_mb; if (m == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (m->m_nextpkt == NULL) sb->sb_lastrecord = m; } /* * Drop a record off the front of a sockbuf * and move the next record to the front. 
*/ void sbdroprecord(struct sockbuf *sb) { struct mbuf *m, *mn; KASSERT(solocked(sb->sb_so)); m = sb->sb_mb; if (m) { sb->sb_mb = m->m_nextpkt; do { sbfree(sb, m); mn = m_free(m); } while ((m = mn) != NULL); } SB_EMPTY_FIXUP(sb); } /* * Create a "control" mbuf containing the specified data * with the specified type for presentation on a socket buffer. */ struct mbuf * sbcreatecontrol1(void **p, int size, int type, int level, int flags) { struct cmsghdr *cp; struct mbuf *m; int space = CMSG_SPACE(size); if ((flags & M_DONTWAIT) && space > MCLBYTES) { printf("%s: message too large %d\n", __func__, space); return NULL; } if ((m = m_get(flags, MT_CONTROL)) == NULL) return NULL; if (space > MLEN) { if (space > MCLBYTES) MEXTMALLOC(m, space, M_WAITOK); else MCLGET(m, flags); if ((m->m_flags & M_EXT) == 0) { m_free(m); return NULL; } } cp = mtod(m, struct cmsghdr *); *p = CMSG_DATA(cp); m->m_len = space; cp->cmsg_len = CMSG_LEN(size); cp->cmsg_level = level; cp->cmsg_type = type; memset(cp + 1, 0, CMSG_LEN(0) - sizeof(*cp)); memset((uint8_t *)*p + size, 0, CMSG_ALIGN(size) - size); return m; } struct mbuf * sbcreatecontrol(void *p, int size, int type, int level) { struct mbuf *m; void *v; m = sbcreatecontrol1(&v, size, type, level, M_DONTWAIT); if (m == NULL) return NULL; memcpy(v, p, size); return m; } void solockretry(struct socket *so, kmutex_t *lock) { while (lock != atomic_load_relaxed(&so->so_lock)) { mutex_exit(lock); lock = atomic_load_consume(&so->so_lock); mutex_enter(lock); } } bool solocked(const struct socket *so) { /* * Used only for diagnostic assertions, so so_lock should be * stable at this point, hence no need for atomic_load_*. */ return mutex_owned(so->so_lock); } bool solocked2(const struct socket *so1, const struct socket *so2) { const kmutex_t *lock; /* * Used only for diagnostic assertions, so so_lock should be * stable at this point, hence no need for atomic_load_*. */ lock = so1->so_lock; if (lock != so2->so_lock) return false; return mutex_owned(lock); } /* * sosetlock: assign a default lock to a new socket. */ void sosetlock(struct socket *so) { if (so->so_lock == NULL) { kmutex_t *lock = softnet_lock; so->so_lock = lock; mutex_obj_hold(lock); mutex_enter(lock); } KASSERT(solocked(so)); } /* * Set lock on sockbuf sb; sleep if lock is already held. * Unless SB_NOINTR is set on sockbuf, sleep is interruptible. * Returns error without lock if sleep is interrupted.
*/ int sblock(struct sockbuf *sb, int wf) { struct socket *so; kmutex_t *lock; int error; KASSERT(solocked(sb->sb_so)); for (;;) { if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) { sb->sb_flags |= SB_LOCK; return 0; } if (wf != M_WAITOK) return EWOULDBLOCK; so = sb->sb_so; lock = so->so_lock; if ((sb->sb_flags & SB_NOINTR) != 0) { cv_wait(&so->so_cv, lock); error = 0; } else error = cv_wait_sig(&so->so_cv, lock); if (__predict_false(lock != atomic_load_relaxed(&so->so_lock))) solockretry(so, lock); if (error != 0) return error; } } void sbunlock(struct sockbuf *sb) { struct socket *so; so = sb->sb_so; KASSERT(solocked(so)); KASSERT((sb->sb_flags & SB_LOCK) != 0); sb->sb_flags &= ~SB_LOCK; cv_broadcast(&so->so_cv); } int sowait(struct socket *so, bool catch_p, int timo) { kmutex_t *lock; int error; KASSERT(solocked(so)); KASSERT(catch_p || timo != 0); lock = so->so_lock; if (catch_p) error = cv_timedwait_sig(&so->so_cv, lock, timo); else error = cv_timedwait(&so->so_cv, lock, timo); if (__predict_false(lock != atomic_load_relaxed(&so->so_lock))) solockretry(so, lock); return error; } #ifdef DDB /* * Currently, sofindproc() is used only from DDB. It could be used from others * by using db_mutex_enter() */ static inline int db_mutex_enter(kmutex_t *mtx) { int rv; if (!db_active) { mutex_enter(mtx); rv = 1; } else rv = mutex_tryenter(mtx); return rv; } int sofindproc(struct socket *so, int all, void (*pr)(const char *, ...)) { proc_t *p; filedesc_t *fdp; fdtab_t *dt; fdfile_t *ff; file_t *fp = NULL; int found = 0; int i, t; if (so == NULL) return 0; t = db_mutex_enter(&proc_lock); if (!t) { pr("could not acquire proc_lock mutex\n"); return 0; } PROCLIST_FOREACH(p, &allproc) { if (p->p_stat == SIDL) continue; fdp = p->p_fd; t = db_mutex_enter(&fdp->fd_lock); if (!t) { pr("could not acquire fd_lock mutex\n"); continue; } dt = atomic_load_consume(&fdp->fd_dt); for (i = 0; i < dt->dt_nfiles; i++) { ff = dt->dt_ff[i]; if (ff == NULL) continue; fp = atomic_load_consume(&ff->ff_file); if (fp == NULL) continue; t = db_mutex_enter(&fp->f_lock); if (!t) { pr("could not acquire f_lock mutex\n"); continue; } if ((struct socket *)fp->f_data != so) { mutex_exit(&fp->f_lock); continue; } found++; if (pr) pr("socket %p: owner %s(pid=%d)\n", so, p->p_comm, p->p_pid); mutex_exit(&fp->f_lock); if (all == 0) break; } mutex_exit(&fdp->fd_lock); if (all == 0 && found != 0) break; } mutex_exit(&proc_lock); return found; } void socket_print(const char *modif, void (*pr)(const char *, ...)) { file_t *fp; struct socket *so; struct sockbuf *sb_snd, *sb_rcv; struct mbuf *m_rec, *m; bool opt_v = false; bool opt_m = false; bool opt_a = false; bool opt_p = false; int nrecs, nmbufs; char ch; const char *family; while ( (ch = *(modif++)) != '\0') { switch (ch) { case 'v': opt_v = true; break; case 'm': opt_m = true; break; case 'a': opt_a = true; break; case 'p': opt_p = true; break; } } if (opt_v == false && pr) (pr)("Ignore empty sockets. 
use /v to print all.\n"); if (opt_p == true && pr) (pr)("Don't search owner process.\n"); LIST_FOREACH(fp, &filehead, f_list) { if (fp->f_type != DTYPE_SOCKET) continue; so = (struct socket *)fp->f_data; if (so == NULL) continue; if (so->so_proto->pr_domain->dom_family == AF_INET) family = "INET"; #ifdef INET6 else if (so->so_proto->pr_domain->dom_family == AF_INET6) family = "INET6"; #endif else if (so->so_proto->pr_domain->dom_family == pseudo_AF_KEY) family = "KEY"; else if (so->so_proto->pr_domain->dom_family == AF_ROUTE) family = "ROUTE"; else continue; sb_snd = &so->so_snd; sb_rcv = &so->so_rcv; if (opt_v != true && sb_snd->sb_cc == 0 && sb_rcv->sb_cc == 0) continue; pr("---SOCKET %p: type %s\n", so, family); if (opt_p != true) sofindproc(so, opt_a == true ? 1 : 0, pr); pr("Send Buffer Bytes: %d [bytes]\n", sb_snd->sb_cc); pr("Send Buffer mbufs:\n"); m_rec = m = sb_snd->sb_mb; nrecs = 0; nmbufs = 0; while (m_rec) { nrecs++; if (opt_m == true) pr(" mbuf chain %p\n", m_rec); while (m) { nmbufs++; m = m->m_next; } m_rec = m = m_rec->m_nextpkt; } pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs); pr("Recv Buffer Usage: %d [bytes]\n", sb_rcv->sb_cc); pr("Recv Buffer mbufs:\n"); m_rec = m = sb_rcv->sb_mb; nrecs = 0; nmbufs = 0; while (m_rec) { nrecs++; if (opt_m == true) pr(" mbuf chain %p\n", m_rec); while (m) { nmbufs++; m = m->m_next; } m_rec = m = m_rec->m_nextpkt; } pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs); } } #endif /* DDB */
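The CMSG_LEN/CMSG_SPACE/CMSG_DATA arithmetic used by sbcreatecontrol1() above is the same control-message layout that userland sees through <sys/socket.h>. A minimal, hypothetical userland sketch of that layout follows; it is not part of uipc_socket2.c, and the payload value, the SOL_SOCKET level and the SCM_RIGHTS type are chosen only for illustration.

#include <sys/socket.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* Align the buffer as a cmsghdr; size it with CMSG_SPACE(). */
	union {
		struct cmsghdr hdr;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct cmsghdr *cm = &u.hdr;
	int payload = 1;			/* hypothetical data */

	memset(&u, 0, sizeof(u));
	cm->cmsg_len = CMSG_LEN(sizeof(int));	/* header + data, unpadded */
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_RIGHTS;
	memcpy(CMSG_DATA(cm), &payload, sizeof(payload));

	/* CMSG_SPACE() >= CMSG_LEN(); the difference is alignment padding. */
	printf("CMSG_LEN=%zu CMSG_SPACE=%zu\n",
	    (size_t)CMSG_LEN(sizeof(int)), (size_t)CMSG_SPACE(sizeof(int)));
	return 0;
}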
/* $NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $ */ /* * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $"); #include "opt_uvmhist.h" #include "opt_compat_netbsd.h" #include "opt_ddb.h" #include "opt_vmswap.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/atomic.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/conf.h> #include <sys/cprng.h> #include <sys/proc.h> #include <sys/namei.h> #include <sys/disklabel.h> #include <sys/errno.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/file.h> #include <sys/vmem.h> #include <sys/blist.h> #include <sys/mount.h> #include <sys/pool.h> #include <sys/kmem.h> #include <sys/syscallargs.h> #include <sys/swap.h> #include <sys/kauth.h> #include <sys/sysctl.h> #include <sys/workqueue.h> #include <uvm/uvm.h> #include <miscfs/specfs/specdev.h> #include <crypto/aes/aes.h> #include <crypto/aes/aes_cbc.h> /* * uvm_swap.c: manage configuration and i/o to swap space. */ /* * swap space is managed in the following way: * * each swap partition or file is described by a "swapdev" structure. * each "swapdev" structure contains a "swapent" structure which contains * information that is passed up to the user (via system calls). * * each swap partition is assigned a "priority" (int) which controls * swap partition usage. * * the system maintains a global data structure describing all swap * partitions/files. there is a sorted LIST of "swappri" structures * which describe "swapdev"'s at that priority. this LIST is headed * by the "swap_priority" global var. each "swappri" contains a * TAILQ of "swapdev" structures at that priority. * * locking: * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl * system call and prevents the swap priority list from changing * while we are in the middle of a system call (e.g. SWAP_STATS). * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data * structures including the priority list, the swapdev structures, * and the swapmap arena. * * each swap device has the following info: * - swap device in use (could be disabled, preventing future use) * - swap enabled (allows new allocations on swap) * - map info in /dev/drum * - vnode pointer * for swap files only: * - block size * - max byte count in buffer * - buffer * * userland controls and configures swap with the swapctl(2) system call. * the sys_swapctl performs the following operations: * [1] SWAP_NSWAP: returns the number of swap devices currently configured * [2] SWAP_STATS: given a pointer to an array of swapent structures * (passed in via "arg") of a size passed in via "misc" ... we load * the current swap config into the array. 
The actual work is done * in the uvm_swap_stats() function. * [3] SWAP_ON: given a pathname in arg (could be device or file) and a * priority in "misc", start swapping on it. * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device * [5] SWAP_CTL: changes the priority of a swap device (new priority in * "misc") */ /* * swapdev: describes a single swap partition/file * * note the following should be true: * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks] * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel] */ struct swapdev { dev_t swd_dev; /* device id */ int swd_flags; /* flags:inuse/enable/fake */ int swd_priority; /* our priority */ int swd_nblks; /* blocks in this device */ char *swd_path; /* saved pathname of device */ int swd_pathlen; /* length of pathname */ int swd_npages; /* #pages we can use */ int swd_npginuse; /* #pages in use */ int swd_npgbad; /* #pages bad */ int swd_drumoffset; /* page0 offset in drum */ int swd_drumsize; /* #pages in drum */ blist_t swd_blist; /* blist for this swapdev */ struct vnode *swd_vp; /* backing vnode */ TAILQ_ENTRY(swapdev) swd_next; /* priority tailq */ int swd_bsize; /* blocksize (bytes) */ int swd_maxactive; /* max active i/o reqs */ struct bufq_state *swd_tab; /* buffer list */ int swd_active; /* number of active buffers */ volatile uint32_t *swd_encmap; /* bitmap of encrypted slots */ struct aesenc swd_enckey; /* AES key expanded for enc */ struct aesdec swd_deckey; /* AES key expanded for dec */ bool swd_encinit; /* true if keys initialized */ }; /* * swap device priority entry; the list is kept sorted on `spi_priority'. */ struct swappri { int spi_priority; /* priority */ TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev; /* tailq of swapdevs at this priority */ LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */ }; /* * The following two structures are used to keep track of data transfers * on swap devices associated with regular files. * NOTE: this code is more or less a copy of vnd.c; we use the same * structure names here to ease porting.. */ struct vndxfer { struct buf *vx_bp; /* Pointer to parent buffer */ struct swapdev *vx_sdp; int vx_error; int vx_pending; /* # of pending aux buffers */ int vx_flags; #define VX_BUSY 1 #define VX_DEAD 2 }; struct vndbuf { struct buf vb_buf; struct vndxfer *vb_xfer; }; /* * We keep a pool of vndbuf's and vndxfer structures.
*/ static struct pool vndxfer_pool, vndbuf_pool; /* * local variables */ static vmem_t *swapmap; /* controls the mapping of /dev/drum */ /* list of all active swap devices [by priority] */ LIST_HEAD(swap_priority, swappri); static struct swap_priority swap_priority; /* locks */ static kmutex_t uvm_swap_data_lock __cacheline_aligned; static krwlock_t swap_syscall_lock; bool uvm_swap_init_done = false; /* workqueue and use counter for swap to regular files */ static int sw_reg_count = 0; static struct workqueue *sw_reg_workqueue; /* tuneables */ u_int uvm_swapisfull_factor = 99; #if VMSWAP_DEFAULT_PLAINTEXT bool uvm_swap_encrypt = false; #else bool uvm_swap_encrypt = true; #endif /* * prototypes */ static struct swapdev *swapdrum_getsdp(int); static struct swapdev *swaplist_find(struct vnode *, bool); static void swaplist_insert(struct swapdev *, struct swappri *, int); static void swaplist_trim(void); static int swap_on(struct lwp *, struct swapdev *); static int swap_off(struct lwp *, struct swapdev *); static void sw_reg_strategy(struct swapdev *, struct buf *, int); static void sw_reg_biodone(struct buf *); static void sw_reg_iodone(struct work *wk, void *dummy); static void sw_reg_start(struct swapdev *); static int uvm_swap_io(struct vm_page **, int, int, int); static void uvm_swap_genkey(struct swapdev *); static void uvm_swap_encryptpage(struct swapdev *, void *, int); static void uvm_swap_decryptpage(struct swapdev *, void *, int); static size_t encmap_size(size_t npages) { struct swapdev *sdp; const size_t bytesperword = sizeof(sdp->swd_encmap[0]); const size_t bitsperword = NBBY * bytesperword; const size_t nbits = npages; /* one bit for each page */ const size_t nwords = howmany(nbits, bitsperword); const size_t nbytes = nwords * bytesperword; return nbytes; } /* * uvm_swap_init: init the swap system data structures and locks * * => called at boot time from init_main.c after the filesystems * are brought up (which happens after uvm_init()) */ void uvm_swap_init(void) { UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); /* * first, init the swap list, its counter, and its lock. * then get a handle on the vnode for /dev/drum by using * the its dev_t number ("swapdev", from MD conf.c). */ LIST_INIT(&swap_priority); uvmexp.nswapdev = 0; rw_init(&swap_syscall_lock); mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE); if (bdevvp(swapdev, &swapdev_vp)) panic("%s: can't get vnode for swap device", __func__); if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY)) panic("%s: can't lock swap device", __func__); if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED)) panic("%s: can't open swap device", __func__); VOP_UNLOCK(swapdev_vp); /* * create swap block resource map to map /dev/drum. the range * from 1 to INT_MAX allows 2 gigablocks of swap space. note * that block 0 is reserved (used to indicate an allocation * failure, or no allocation). */ swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0, VM_NOSLEEP, IPL_NONE); if (swapmap == 0) { panic("%s: vmem_create failed", __func__); } pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", NULL, IPL_BIO); pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", NULL, IPL_BIO); uvm_swap_init_done = true; UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0); } /* * swaplist functions: functions that operate on the list of swap * devices on the system. 
*/ /* * swaplist_insert: insert swap device "sdp" into the global list * * => caller must hold both swap_syscall_lock and uvm_swap_data_lock * => caller must provide a newly allocated swappri structure (we will * FREE it if we don't need it... this it to prevent allocation * blocking here while adding swap) */ static void swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority) { struct swappri *spp, *pspp; UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); KASSERT(rw_write_held(&swap_syscall_lock)); KASSERT(mutex_owned(&uvm_swap_data_lock)); /* * find entry at or after which to insert the new device. */ pspp = NULL; LIST_FOREACH(spp, &swap_priority, spi_swappri) { if (priority <= spp->spi_priority) break; pspp = spp; } /* * new priority? */ if (spp == NULL || spp->spi_priority != priority) { spp = newspp; /* use newspp! */ UVMHIST_LOG(pdhist, "created new swappri = %jd", priority, 0, 0, 0); spp->spi_priority = priority; TAILQ_INIT(&spp->spi_swapdev); if (pspp) LIST_INSERT_AFTER(pspp, spp, spi_swappri); else LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri); } else { /* we don't need a new priority structure, free it */ kmem_free(newspp, sizeof(*newspp)); } /* * priority found (or created). now insert on the priority's * tailq list and bump the total number of swapdevs. */ sdp->swd_priority = priority; TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); uvmexp.nswapdev++; } /* * swaplist_find: find and optionally remove a swap device from the * global list. * * => caller must hold both swap_syscall_lock and uvm_swap_data_lock * => we return the swapdev we found (and removed) */ static struct swapdev * swaplist_find(struct vnode *vp, bool remove) { struct swapdev *sdp; struct swappri *spp; KASSERT(rw_lock_held(&swap_syscall_lock)); KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1); KASSERT(mutex_owned(&uvm_swap_data_lock)); /* * search the lists for the requested vp */ LIST_FOREACH(spp, &swap_priority, spi_swappri) { TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { if (sdp->swd_vp == vp) { if (remove) { TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); uvmexp.nswapdev--; } return(sdp); } } } return (NULL); } /* * swaplist_trim: scan priority list for empty priority entries and kill * them. * * => caller must hold both swap_syscall_lock and uvm_swap_data_lock */ static void swaplist_trim(void) { struct swappri *spp, *nextspp; KASSERT(rw_write_held(&swap_syscall_lock)); KASSERT(mutex_owned(&uvm_swap_data_lock)); LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) { if (!TAILQ_EMPTY(&spp->spi_swapdev)) continue; LIST_REMOVE(spp, spi_swappri); kmem_free(spp, sizeof(*spp)); } } /* * swapdrum_getsdp: given a page offset in /dev/drum, convert it back * to the "swapdev" that maps that section of the drum. 
* * => each swapdev takes one big contig chunk of the drum * => caller must hold uvm_swap_data_lock */ static struct swapdev * swapdrum_getsdp(int pgno) { struct swapdev *sdp; struct swappri *spp; KASSERT(mutex_owned(&uvm_swap_data_lock)); LIST_FOREACH(spp, &swap_priority, spi_swappri) { TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { if (sdp->swd_flags & SWF_FAKE) continue; if (pgno >= sdp->swd_drumoffset && pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { return sdp; } } } return NULL; } /* * swapdrum_sdp_is: true iff the swap device for pgno is sdp * * => for use in positive assertions only; result is not stable */ static bool __debugused swapdrum_sdp_is(int pgno, struct swapdev *sdp) { bool result; mutex_enter(&uvm_swap_data_lock); result = swapdrum_getsdp(pgno) == sdp; mutex_exit(&uvm_swap_data_lock); return result; } void swapsys_lock(krw_t op) { rw_enter(&swap_syscall_lock, op); } void swapsys_unlock(void) { rw_exit(&swap_syscall_lock); } static void swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse) { se->se_dev = sdp->swd_dev; se->se_flags = sdp->swd_flags; se->se_nblks = sdp->swd_nblks; se->se_inuse = inuse; se->se_priority = sdp->swd_priority; KASSERT(sdp->swd_pathlen < sizeof(se->se_path)); strcpy(se->se_path, sdp->swd_path); } int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) = (void *)enosys; int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) = (void *)enosys; /* * sys_swapctl: main entry point for swapctl(2) system call * [with two helper functions: swap_on and swap_off] */ int sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval) { /* { syscallarg(int) cmd; syscallarg(void *) arg; syscallarg(int) misc; } */ struct vnode *vp; struct nameidata nd; struct swappri *spp; struct swapdev *sdp; #define SWAP_PATH_MAX (PATH_MAX + 1) char *userpath; size_t len = 0; int error; int priority; UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); /* * we handle the non-priv NSWAP and STATS request first. * * SWAP_NSWAP: return number of config'd swap devices * [can also be obtained with uvmexp sysctl] */ if (SCARG(uap, cmd) == SWAP_NSWAP) { const int nswapdev = uvmexp.nswapdev; UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev, 0, 0, 0); *retval = nswapdev; return 0; } userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP); /* * ensure serialized syscall access by grabbing the swap_syscall_lock */ rw_enter(&swap_syscall_lock, RW_WRITER); /* * SWAP_STATS: get stats on current # of configured swap devs * * note that the swap_priority list can't change as long * as we are holding the swap_syscall_lock. we don't want * to grab the uvm_swap_data_lock because we may fault&sleep during * copyout() and we don't want to be holding that lock then! */ switch (SCARG(uap, cmd)) { case SWAP_STATS13: error = (*uvm_swap_stats13)(uap, retval); goto out; case SWAP_STATS50: error = (*uvm_swap_stats50)(uap, retval); goto out; case SWAP_STATS: error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc), NULL, sizeof(struct swapent), retval); UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0); goto out; case SWAP_GETDUMPDEV: error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev)); goto out; default: break; } /* * all other requests require superuser privs. verify. 
*/ if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL, 0, NULL, NULL, NULL))) goto out; if (SCARG(uap, cmd) == SWAP_DUMPOFF) { /* drop the current dump device */ dumpdev = NODEV; dumpcdev = NODEV; cpu_dumpconf(); goto out; } /* * at this point we expect a path name in arg. we will * use namei() to gain a vnode reference (vref), and lock * the vnode (VOP_LOCK). * * XXX: a NULL arg means use the root vnode pointer (e.g. for * miniroot) */ if (SCARG(uap, arg) == NULL) { vp = rootvp; /* miniroot */ vref(vp); if (vn_lock(vp, LK_EXCLUSIVE)) { vrele(vp); error = EBUSY; goto out; } if (SCARG(uap, cmd) == SWAP_ON && copystr("miniroot", userpath, SWAP_PATH_MAX, &len)) panic("swapctl: miniroot copy failed"); } else { struct pathbuf *pb; /* * This used to allow copying in one extra byte * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON. * This was completely pointless because if anyone * used that extra byte namei would fail with * ENAMETOOLONG anyway, so I've removed the excess * logic. - dholland 20100215 */ error = pathbuf_copyin(SCARG(uap, arg), &pb); if (error) { goto out; } if (SCARG(uap, cmd) == SWAP_ON) { /* get a copy of the string */ pathbuf_copystring(pb, userpath, SWAP_PATH_MAX); len = strlen(userpath) + 1; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); if ((error = namei(&nd))) { pathbuf_destroy(pb); goto out; } vp = nd.ni_vp; pathbuf_destroy(pb); } /* note: "vp" is referenced and locked */ error = 0; /* assume no error */ switch(SCARG(uap, cmd)) { case SWAP_DUMPDEV: if (vp->v_type != VBLK) { error = ENOTBLK; break; } if (bdevsw_lookup(vp->v_rdev)) { dumpdev = vp->v_rdev; dumpcdev = devsw_blk2chr(dumpdev); } else dumpdev = NODEV; cpu_dumpconf(); break; case SWAP_CTL: /* * get new priority, remove old entry (if any) and then * reinsert it in the correct place. finally, prune out * any empty priority structures. */ priority = SCARG(uap, misc); spp = kmem_alloc(sizeof(*spp), KM_SLEEP); mutex_enter(&uvm_swap_data_lock); if ((sdp = swaplist_find(vp, true)) == NULL) { error = ENOENT; } else { swaplist_insert(sdp, spp, priority); swaplist_trim(); } mutex_exit(&uvm_swap_data_lock); if (error) kmem_free(spp, sizeof(*spp)); break; case SWAP_ON: /* * check for duplicates. if none found, then insert a * dummy entry on the list to prevent someone else from * trying to enable this device while we are working on * it. */ priority = SCARG(uap, misc); sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP); spp = kmem_alloc(sizeof(*spp), KM_SLEEP); sdp->swd_flags = SWF_FAKE; sdp->swd_vp = vp; sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV; bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK); mutex_enter(&uvm_swap_data_lock); if (swaplist_find(vp, false) != NULL) { error = EBUSY; mutex_exit(&uvm_swap_data_lock); bufq_free(sdp->swd_tab); kmem_free(sdp, sizeof(*sdp)); kmem_free(spp, sizeof(*spp)); break; } swaplist_insert(sdp, spp, priority); mutex_exit(&uvm_swap_data_lock); KASSERT(len > 0); sdp->swd_pathlen = len; sdp->swd_path = kmem_alloc(len, KM_SLEEP); if (copystr(userpath, sdp->swd_path, len, 0) != 0) panic("swapctl: copystr"); /* * we've now got a FAKE placeholder in the swap list. * now attempt to enable swap on it. if we fail, undo * what we've done and kill the fake entry we just inserted. 
* if swap_on is a success, it will clear the SWF_FAKE flag */ if ((error = swap_on(l, sdp)) != 0) { mutex_enter(&uvm_swap_data_lock); (void) swaplist_find(vp, true); /* kill fake entry */ swaplist_trim(); mutex_exit(&uvm_swap_data_lock); bufq_free(sdp->swd_tab); kmem_free(sdp->swd_path, sdp->swd_pathlen); kmem_free(sdp, sizeof(*sdp)); break; } break; case SWAP_OFF: mutex_enter(&uvm_swap_data_lock); if ((sdp = swaplist_find(vp, false)) == NULL) { mutex_exit(&uvm_swap_data_lock); error = ENXIO; break; } /* * If a device isn't in use or enabled, we * can't stop swapping from it (again). */ if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { mutex_exit(&uvm_swap_data_lock); error = EBUSY; break; } /* * do the real work. */ error = swap_off(l, sdp); break; default: error = EINVAL; } /* * done! release the ref gained by namei() and unlock. */ vput(vp); out: rw_exit(&swap_syscall_lock); kmem_free(userpath, SWAP_PATH_MAX); UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0); return (error); } /* * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept * away from sys_swapctl() in order to allow COMPAT_* swapctl() * emulation to use it directly without going through sys_swapctl(). * The problem with using sys_swapctl() there is that it involves * copying the swapent array to the stackgap, and this array's size * is not known at build time. Hence it would not be possible to * ensure it would fit in the stackgap in any case. */ int uvm_swap_stats(char *ptr, int misc, void (*f)(void *, const struct swapent *), size_t len, register_t *retval) { struct swappri *spp; struct swapdev *sdp; struct swapent sep; int count = 0; int error; KASSERT(len <= sizeof(sep)); if (len == 0) return ENOSYS; if (misc < 0) return EINVAL; if (misc == 0 || uvmexp.nswapdev == 0) return 0; /* Make sure userland cannot exhaust kernel memory */ if ((size_t)misc > (size_t)uvmexp.nswapdev) misc = uvmexp.nswapdev; KASSERT(rw_lock_held(&swap_syscall_lock)); LIST_FOREACH(spp, &swap_priority, spi_swappri) { TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { int inuse; if (misc-- <= 0) break; inuse = btodb((uint64_t)sdp->swd_npginuse << PAGE_SHIFT); memset(&sep, 0, sizeof(sep)); swapent_cvt(&sep, sdp, inuse); if (f) (*f)(&sep, &sep); if ((error = copyout(&sep, ptr, len)) != 0) return error; ptr += len; count++; } } *retval = count; return 0; } /* * swap_on: attempt to enable a swapdev for swapping. note that the * swapdev is already on the global list, but disabled (marked * SWF_FAKE). * * => we avoid the start of the disk (to protect disk labels) * => we also avoid the miniroot, if we are swapping to root. * => caller should leave uvm_swap_data_lock unlocked, we may lock it * if needed. */ static int swap_on(struct lwp *l, struct swapdev *sdp) { struct vnode *vp; int error, npages, nblocks, size; long addr; vmem_addr_t result; struct vattr va; dev_t dev; UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); /* * we want to enable swapping on sdp. the swd_vp contains * the vnode we want (locked and ref'd), and the swd_dev * contains the dev_t of the file, if it a block device. */ vp = sdp->swd_vp; dev = sdp->swd_dev; /* * open the swap file (mostly useful for block device files to * let device driver know what is up). * * we skip the open/close for root on swap because the root * has already been opened when root was mounted (mountroot). 
*/ if (vp != rootvp) { if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred))) return (error); } /* XXX this only works for block devices */ UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0); /* * we now need to determine the size of the swap area. for * block specials we can call the d_psize function. * for normal files, we must stat [get attrs]. * * we put the result in nblks. * for normal files, we also want the filesystem block size * (which we get with statfs). */ switch (vp->v_type) { case VBLK: if ((nblocks = bdev_size(dev)) == -1) { error = ENXIO; goto bad; } break; case VREG: if ((error = VOP_GETATTR(vp, &va, l->l_cred))) goto bad; nblocks = (int)btodb(va.va_size); sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift; /* * limit the max # of outstanding I/O requests we issue * at any one time. take it easy on NFS servers. */ if (vp->v_tag == VT_NFS) sdp->swd_maxactive = 2; /* XXX */ else sdp->swd_maxactive = 8; /* XXX */ break; default: error = ENXIO; goto bad; } /* * save nblocks in a safe place and convert to pages. */ sdp->swd_nblks = nblocks; npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT; /* * for block special files, we want to make sure that leave * the disklabel and bootblocks alone, so we arrange to skip * over them (arbitrarily choosing to skip PAGE_SIZE bytes). * note that because of this the "size" can be less than the * actual number of blocks on the device. */ if (vp->v_type == VBLK) { /* we use pages 1 to (size - 1) [inclusive] */ size = npages - 1; addr = 1; } else { /* we use pages 0 to (size - 1) [inclusive] */ size = npages; addr = 0; } /* * make sure we have enough blocks for a reasonable sized swap * area. we want at least one page. */ if (size < 1) { UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0); error = EINVAL; goto bad; } UVMHIST_LOG(pdhist, " dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0); /* * now we need to allocate an extent to manage this swap device */ sdp->swd_blist = blist_create(npages); /* mark all expect the `saved' region free. */ blist_free(sdp->swd_blist, addr, size); /* * allocate space to for swap encryption state and mark the * keys uninitialized so we generate them lazily */ sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP); sdp->swd_encinit = false; /* * if the vnode we are swapping to is the root vnode * (i.e. we are swapping to the miniroot) then we want * to make sure we don't overwrite it. do a statfs to * find its size and skip over it. */ if (vp == rootvp) { struct mount *mp; struct statvfs *sp; int rootblocks, rootpages; mp = rootvnode->v_mount; sp = &mp->mnt_stat; rootblocks = sp->f_blocks * btodb(sp->f_frsize); /* * XXX: sp->f_blocks isn't the total number of * blocks in the filesystem, it's the number of * data blocks. so, our rootblocks almost * definitely underestimates the total size * of the filesystem - how badly depends on the * details of the filesystem type. there isn't * an obvious way to deal with this cleanly * and perfectly, so for now we just pad our * rootblocks estimate with an extra 5 percent. */ rootblocks += (rootblocks >> 5) + (rootblocks >> 6) + (rootblocks >> 7); rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT; if (rootpages > size) panic("swap_on: miniroot larger than swap?"); if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) { panic("swap_on: unable to preserve miniroot"); } size -= rootpages; printf("Preserved %d pages of miniroot ", rootpages); printf("leaving %d pages of swap\n", size); } /* * add a ref to vp to reflect usage as a swap device. 
*/ vref(vp); /* * now add the new swapdev to the drum and enable. */ error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result); if (error != 0) panic("swapdrum_add"); /* * If this is the first regular swap create the workqueue. * => Protected by swap_syscall_lock. */ if (vp->v_type != VBLK) { if (sw_reg_count++ == 0) { KASSERT(sw_reg_workqueue == NULL); if (workqueue_create(&sw_reg_workqueue, "swapiod", sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0) panic("%s: workqueue_create failed", __func__); } } sdp->swd_drumoffset = (int)result; sdp->swd_drumsize = npages; sdp->swd_npages = size; mutex_enter(&uvm_swap_data_lock); sdp->swd_flags &= ~SWF_FAKE; /* going live */ sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); uvmexp.swpages += size; uvmexp.swpgavail += size; mutex_exit(&uvm_swap_data_lock); return (0); /* * failure: clean up and return error. */ bad: if (sdp->swd_blist) { blist_destroy(sdp->swd_blist); } if (vp != rootvp) { (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred); } return (error); } /* * swap_off: stop swapping on swapdev * * => swap data should be locked, we will unlock. */ static int swap_off(struct lwp *l, struct swapdev *sdp) { int npages = sdp->swd_npages; int error = 0; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(pdhist, " dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0); KASSERT(rw_write_held(&swap_syscall_lock)); KASSERT(mutex_owned(&uvm_swap_data_lock)); /* disable the swap area being removed */ sdp->swd_flags &= ~SWF_ENABLE; uvmexp.swpgavail -= npages; mutex_exit(&uvm_swap_data_lock); /* * the idea is to find all the pages that are paged out to this * device, and page them all in. in uvm, swap-backed pageable * memory can take two forms: aobjs and anons. call the * swapoff hook for each subsystem to bring in pages. */ if (uao_swap_off(sdp->swd_drumoffset, sdp->swd_drumoffset + sdp->swd_drumsize) || amap_swap_off(sdp->swd_drumoffset, sdp->swd_drumoffset + sdp->swd_drumsize)) { error = ENOMEM; } else if (sdp->swd_npginuse > sdp->swd_npgbad) { error = EBUSY; } if (error) { mutex_enter(&uvm_swap_data_lock); sdp->swd_flags |= SWF_ENABLE; uvmexp.swpgavail += npages; mutex_exit(&uvm_swap_data_lock); return error; } /* * If this is the last regular swap destroy the workqueue. * => Protected by swap_syscall_lock. */ if (sdp->swd_vp->v_type != VBLK) { KASSERT(sw_reg_count > 0); KASSERT(sw_reg_workqueue != NULL); if (--sw_reg_count == 0) { workqueue_destroy(sw_reg_workqueue); sw_reg_workqueue = NULL; } } /* * done with the vnode. * drop our ref on the vnode before calling VOP_CLOSE() * so that spec_close() can tell if this is the last close. */ vrele(sdp->swd_vp); if (sdp->swd_vp != rootvp) { (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred); } mutex_enter(&uvm_swap_data_lock); uvmexp.swpages -= npages; uvmexp.swpginuse -= sdp->swd_npgbad; if (swaplist_find(sdp->swd_vp, true) == NULL) panic("%s: swapdev not in list", __func__); swaplist_trim(); mutex_exit(&uvm_swap_data_lock); /* * free all resources! 
*/ vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize); blist_destroy(sdp->swd_blist); bufq_free(sdp->swd_tab); kmem_free(__UNVOLATILE(sdp->swd_encmap), encmap_size(sdp->swd_drumsize)); explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey); explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey); kmem_free(sdp, sizeof(*sdp)); return (0); } void uvm_swap_shutdown(struct lwp *l) { struct swapdev *sdp; struct swappri *spp; struct vnode *vp; int error; if (!uvm_swap_init_done || uvmexp.nswapdev == 0) return; printf("turning off swap..."); rw_enter(&swap_syscall_lock, RW_WRITER); mutex_enter(&uvm_swap_data_lock); again: LIST_FOREACH(spp, &swap_priority, spi_swappri) TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { if (sdp->swd_flags & SWF_FAKE) continue; if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) continue; #ifdef DEBUG printf("\nturning off swap on %s...", sdp->swd_path); #endif /* Have to lock and reference vnode for swap_off(). */ vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY); vref(vp); error = swap_off(l, sdp); vput(vp); mutex_enter(&uvm_swap_data_lock); if (error) { printf("stopping swap on %s failed " "with error %d\n", sdp->swd_path, error); TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); uvmexp.nswapdev--; swaplist_trim(); } goto again; } printf(" done\n"); mutex_exit(&uvm_swap_data_lock); rw_exit(&swap_syscall_lock); } /* * /dev/drum interface and i/o functions */ /* * swopen: allow the initial open from uvm_swap_init() and reject all others. */ static int swopen(dev_t dev, int flag, int mode, struct lwp *l) { static bool inited = false; if (!inited) { inited = true; return 0; } return ENODEV; } /* * swstrategy: perform I/O on the drum * * => we must map the i/o request from the drum to the correct swapdev. */ static void swstrategy(struct buf *bp) { struct swapdev *sdp; struct vnode *vp; int pageno, bn; UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); /* * convert block number to swapdev. note that swapdev can't * be yanked out from under us because we are holding resources * in it (i.e. the blocks we are doing I/O on). */ pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; mutex_enter(&uvm_swap_data_lock); sdp = swapdrum_getsdp(pageno); mutex_exit(&uvm_swap_data_lock); if (sdp == NULL) { bp->b_error = EINVAL; bp->b_resid = bp->b_bcount; biodone(bp); UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0); return; } /* * convert drum page number to block number on this swapdev. */ pageno -= sdp->swd_drumoffset; /* page # on swapdev */ bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd", ((bp->b_flags & B_READ) == 0) ? 1 : 0, sdp->swd_drumoffset, bn, bp->b_bcount); /* * for block devices we finish up here. * for regular files we have to do more work which we delegate * to sw_reg_strategy(). */ vp = sdp->swd_vp; /* swapdev vnode pointer */ switch (vp->v_type) { default: panic("%s: vnode type 0x%x", __func__, vp->v_type); case VBLK: /* * must convert "bp" from an I/O on /dev/drum to an I/O * on the swapdev (sdp). */ bp->b_blkno = bn; /* swapdev block number */ bp->b_dev = sdp->swd_dev; /* swapdev dev_t */ /* * if we are doing a write, we have to redirect the i/o on * drum's v_numoutput counter to the swapdevs. 
*/ if ((bp->b_flags & B_READ) == 0) { mutex_enter(bp->b_objlock); vwakeup(bp); /* kills one 'v_numoutput' on drum */ mutex_exit(bp->b_objlock); mutex_enter(vp->v_interlock); vp->v_numoutput++; /* put it on swapdev */ mutex_exit(vp->v_interlock); } /* * finally plug in swapdev vnode and start I/O */ bp->b_vp = vp; bp->b_objlock = vp->v_interlock; VOP_STRATEGY(vp, bp); return; case VREG: /* * delegate to sw_reg_strategy function. */ sw_reg_strategy(sdp, bp, bn); return; } /* NOTREACHED */ } /* * swread: the read function for the drum (just a call to physio) */ /*ARGSUSED*/ static int swread(dev_t dev, struct uio *uio, int ioflag) { UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); return (physio(swstrategy, NULL, dev, B_READ, minphys, uio)); } /* * swwrite: the write function for the drum (just a call to physio) */ /*ARGSUSED*/ static int swwrite(dev_t dev, struct uio *uio, int ioflag) { UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio)); } const struct bdevsw swap_bdevsw = { .d_open = swopen, .d_close = noclose, .d_strategy = swstrategy, .d_ioctl = noioctl, .d_dump = nodump, .d_psize = nosize, .d_discard = nodiscard, .d_flag = D_OTHER }; const struct cdevsw swap_cdevsw = { .d_open = nullopen, .d_close = nullclose, .d_read = swread, .d_write = swwrite, .d_ioctl = noioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER, }; /* * sw_reg_strategy: handle swap i/o to regular files */ static void sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) { struct vnode *vp; struct vndxfer *vnx; daddr_t nbn; char *addr; off_t byteoff; int s, off, nra, error, sz, resid; UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); /* * allocate a vndxfer head for this transfer and point it to * our buffer. */ vnx = pool_get(&vndxfer_pool, PR_WAITOK); vnx->vx_flags = VX_BUSY; vnx->vx_error = 0; vnx->vx_pending = 0; vnx->vx_bp = bp; vnx->vx_sdp = sdp; /* * setup for main loop where we read filesystem blocks into * our buffer. */ error = 0; bp->b_resid = bp->b_bcount; /* nothing transferred yet! */ addr = bp->b_data; /* current position in buffer */ byteoff = dbtob((uint64_t)bn); for (resid = bp->b_resid; resid; resid -= sz) { struct vndbuf *nbp; /* * translate byteoffset into block number. return values: * vp = vnode of underlying device * nbn = new block number (on underlying vnode dev) * nra = num blocks we can read-ahead (excludes requested * block) */ nra = 0; error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, &vp, &nbn, &nra); if (error == 0 && nbn == (daddr_t)-1) { /* * this used to just set error, but that doesn't * do the right thing. Instead, it causes random * memory errors. The panic() should remain until * this condition doesn't destabilize the system. */ #if 1 panic("%s: swap to sparse file", __func__); #else error = EIO; /* failure */ #endif } /* * punt if there was an error or a hole in the file. * we must wait for any i/o ops we have already started * to finish before returning. * * XXX we could deal with holes here but it would be * a hassle (in the write case). */ if (error) { s = splbio(); vnx->vx_error = error; /* pass error up */ goto out; } /* * compute the size ("sz") of this transfer (in bytes). 
*/ off = byteoff % sdp->swd_bsize; sz = (1 + nra) * sdp->swd_bsize - off; if (sz > resid) sz = resid; UVMHIST_LOG(pdhist, "sw_reg_strategy: " "vp %#jx/%#jx offset %#jx/%#jx", (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn); /* * now get a buf structure. note that the vb_buf is * at the front of the nbp structure so that you can * cast pointers between the two structure easily. */ nbp = pool_get(&vndbuf_pool, PR_WAITOK); buf_init(&nbp->vb_buf); nbp->vb_buf.b_flags = bp->b_flags; nbp->vb_buf.b_cflags = bp->b_cflags; nbp->vb_buf.b_oflags = bp->b_oflags; nbp->vb_buf.b_bcount = sz; nbp->vb_buf.b_bufsize = sz; nbp->vb_buf.b_error = 0; nbp->vb_buf.b_data = addr; nbp->vb_buf.b_lblkno = 0; nbp->vb_buf.b_blkno = nbn + btodb(off); nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno; nbp->vb_buf.b_iodone = sw_reg_biodone; nbp->vb_buf.b_vp = vp; nbp->vb_buf.b_objlock = vp->v_interlock; if (vp->v_type == VBLK) { nbp->vb_buf.b_dev = vp->v_rdev; } nbp->vb_xfer = vnx; /* patch it back in to vnx */ /* * Just sort by block number */ s = splbio(); if (vnx->vx_error != 0) { buf_destroy(&nbp->vb_buf); pool_put(&vndbuf_pool, nbp); goto out; } vnx->vx_pending++; /* sort it in and start I/O if we are not over our limit */ /* XXXAD locking */ bufq_put(sdp->swd_tab, &nbp->vb_buf); sw_reg_start(sdp); splx(s); /* * advance to the next I/O */ byteoff += sz; addr += sz; } s = splbio(); out: /* Arrive here at splbio */ vnx->vx_flags &= ~VX_BUSY; if (vnx->vx_pending == 0) { error = vnx->vx_error; pool_put(&vndxfer_pool, vnx); bp->b_error = error; biodone(bp); } splx(s); } /* * sw_reg_start: start an I/O request on the requested swapdev * * => reqs are sorted by b_rawblkno (above) */ static void sw_reg_start(struct swapdev *sdp) { struct buf *bp; struct vnode *vp; UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); /* recursion control */ if ((sdp->swd_flags & SWF_BUSY) != 0) return; sdp->swd_flags |= SWF_BUSY; while (sdp->swd_active < sdp->swd_maxactive) { bp = bufq_get(sdp->swd_tab); if (bp == NULL) break; sdp->swd_active++; UVMHIST_LOG(pdhist, "sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %#jx", (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno, bp->b_bcount); vp = bp->b_vp; KASSERT(bp->b_objlock == vp->v_interlock); if ((bp->b_flags & B_READ) == 0) { mutex_enter(vp->v_interlock); vp->v_numoutput++; mutex_exit(vp->v_interlock); } VOP_STRATEGY(vp, bp); } sdp->swd_flags &= ~SWF_BUSY; } /* * sw_reg_biodone: one of our i/o's has completed */ static void sw_reg_biodone(struct buf *bp) { workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL); } /* * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup * * => note that we can recover the vndbuf struct by casting the buf ptr */ static void sw_reg_iodone(struct work *wk, void *dummy) { struct vndbuf *vbp = (void *)wk; struct vndxfer *vnx = vbp->vb_xfer; struct buf *pbp = vnx->vx_bp; /* parent buffer */ struct swapdev *sdp = vnx->vx_sdp; int s, resid, error; KASSERT(&vbp->vb_buf.b_work == wk); UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(pdhist, " vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx", (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, (uintptr_t)vbp->vb_buf.b_data); UVMHIST_LOG(pdhist, " cnt=%#jx resid=%#jx", vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0); /* * protect vbp at splbio and update. */ s = splbio(); resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; pbp->b_resid -= resid; vnx->vx_pending--; if (vbp->vb_buf.b_error != 0) { /* pass error upward */ error = vbp->vb_buf.b_error ? 
vbp->vb_buf.b_error : EIO; UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0); vnx->vx_error = error; } /* * kill vbp structure */ buf_destroy(&vbp->vb_buf); pool_put(&vndbuf_pool, vbp); /* * wrap up this transaction if it has run to completion or, in * case of an error, when all auxiliary buffers have returned. */ if (vnx->vx_error != 0) { /* pass error upward */ error = vnx->vx_error; if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { pbp->b_error = error; biodone(pbp); pool_put(&vndxfer_pool, vnx); } } else if (pbp->b_resid == 0) { KASSERT(vnx->vx_pending == 0); if ((vnx->vx_flags & VX_BUSY) == 0) { UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !", (uintptr_t)pbp, vnx->vx_error, 0, 0); biodone(pbp); pool_put(&vndxfer_pool, vnx); } } /* * done! start next swapdev I/O if one is pending */ sdp->swd_active--; sw_reg_start(sdp); splx(s); } /* * uvm_swap_alloc: allocate space on swap * * => allocation is done "round robin" down the priority list, as we * allocate in a priority we "rotate" the circle queue. * => space can be freed with uvm_swap_free * => we return the page slot number in /dev/drum (0 == invalid slot) * => we lock uvm_swap_data_lock * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM */ int uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok) { struct swapdev *sdp; struct swappri *spp; UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); /* * no swap devices configured yet? definite failure. */ if (uvmexp.nswapdev < 1) return 0; /* * XXXJAK: BEGIN HACK * * blist_alloc() in subr_blist.c will panic if we try to allocate * too many slots. */ if (*nslots > BLIST_MAX_ALLOC) { if (__predict_false(lessok == false)) return 0; *nslots = BLIST_MAX_ALLOC; } /* XXXJAK: END HACK */ /* * lock data lock, convert slots into blocks, and enter loop */ mutex_enter(&uvm_swap_data_lock); ReTry: /* XXXMRG */ LIST_FOREACH(spp, &swap_priority, spi_swappri) { TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { uint64_t result; /* if it's not enabled, then we can't swap from it */ if ((sdp->swd_flags & SWF_ENABLE) == 0) continue; if (sdp->swd_npginuse + *nslots > sdp->swd_npages) continue; result = blist_alloc(sdp->swd_blist, *nslots); if (result == BLIST_NONE) { continue; } KASSERT(result < sdp->swd_drumsize); /* * successful allocation! now rotate the tailq. */ TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); sdp->swd_npginuse += *nslots; uvmexp.swpginuse += *nslots; mutex_exit(&uvm_swap_data_lock); /* done! return drum slot number */ UVMHIST_LOG(pdhist, "success! returning %jd slots starting at %jd", *nslots, result + sdp->swd_drumoffset, 0, 0); return (result + sdp->swd_drumoffset); } } /* XXXMRG: BEGIN HACK */ if (*nslots > 1 && lessok) { *nslots = 1; /* XXXMRG: ugh! blist should support this for us */ goto ReTry; } /* XXXMRG: END HACK */ mutex_exit(&uvm_swap_data_lock); return 0; } /* * uvm_swapisfull: return true if most of available swap is allocated * and in use. we don't count some small portion as it may be inaccessible * to us at any given moment, for example if there is lock contention or if * pages are busy. 
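 *
 * (Worked example, assuming the default uvm_swapisfull_factor of 99:
 * with uvmexp.swpgavail = 1000, swap is reported full once
 * uvmexp.swpgonly reaches 990, since 990 * 100 / 99 = 1000 >= 1000;
 * at 989 the scaled value is only 998 and swap is not yet full.)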
*/ bool uvm_swapisfull(void) { int swpgonly; bool rv; if (uvmexp.swpages == 0) { return true; } mutex_enter(&uvm_swap_data_lock); KASSERT(uvmexp.swpgonly <= uvmexp.swpages); swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 / uvm_swapisfull_factor); rv = (swpgonly >= uvmexp.swpgavail); mutex_exit(&uvm_swap_data_lock); return (rv); } /* * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors * * => we lock uvm_swap_data_lock */ void uvm_swap_markbad(int startslot, int nslots) { struct swapdev *sdp; UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); mutex_enter(&uvm_swap_data_lock); sdp = swapdrum_getsdp(startslot); KASSERT(sdp != NULL); /* * we just keep track of how many pages have been marked bad * in this device, to make everything add up in swap_off(). * we assume here that the range of slots will all be within * one swap device. */ KASSERT(uvmexp.swpgonly >= nslots); atomic_add_int(&uvmexp.swpgonly, -nslots); sdp->swd_npgbad += nslots; UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0); mutex_exit(&uvm_swap_data_lock); } /* * uvm_swap_free: free swap slots * * => this can be all or part of an allocation made by uvm_swap_alloc * => we lock uvm_swap_data_lock */ void uvm_swap_free(int startslot, int nslots) { struct swapdev *sdp; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots, startslot, 0, 0); /* * ignore attempts to free the "bad" slot. */ if (startslot == SWSLOT_BAD) { return; } /* * convert drum slot offset back to sdp, free the blocks * in the extent, and return. must hold pri lock to do * lookup and access the extent. */ mutex_enter(&uvm_swap_data_lock); sdp = swapdrum_getsdp(startslot); KASSERT(uvmexp.nswapdev >= 1); KASSERT(sdp != NULL); KASSERT(sdp->swd_npginuse >= nslots); blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); sdp->swd_npginuse -= nslots; uvmexp.swpginuse -= nslots; mutex_exit(&uvm_swap_data_lock); } /* * uvm_swap_put: put any number of pages into a contig place on swap * * => can be sync or async */ int uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) { int error; error = uvm_swap_io(ppsp, swslot, npages, B_WRITE | ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); return error; } /* * uvm_swap_get: get a single page from swap * * => usually a sync op (from fault) */ int uvm_swap_get(struct vm_page *page, int swslot, int flags) { int error; atomic_inc_uint(&uvmexp.nswget); KASSERT(flags & PGO_SYNCIO); if (swslot == SWSLOT_BAD) { return EIO; } error = uvm_swap_io(&page, swslot, 1, B_READ | ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); if (error == 0) { /* * this page is no longer only in swap. */ KASSERT(uvmexp.swpgonly > 0); atomic_dec_uint(&uvmexp.swpgonly); } return error; } /* * uvm_swap_io: do an i/o operation to swap */ static int uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) { daddr_t startblk; struct buf *bp; vaddr_t kva; int error, mapinflags; bool write, async, swap_encrypt; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx", startslot, npages, flags, 0); write = (flags & B_READ) == 0; async = (flags & B_ASYNC) != 0; swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt); /* * allocate a buf for the i/o. 
*/ KASSERT(curlwp != uvm.pagedaemon_lwp || write); KASSERT(curlwp != uvm.pagedaemon_lwp || async); bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp); if (bp == NULL) { uvm_aio_aiodone_pages(pps, npages, true, ENOMEM); return ENOMEM; } /* * convert starting drum slot to block number */ startblk = btodb((uint64_t)startslot << PAGE_SHIFT); /* * first, map the pages into the kernel. */ mapinflags = !write ? UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE; if (write && swap_encrypt) /* need to encrypt in-place */ mapinflags |= UVMPAGER_MAPIN_READ; kva = uvm_pagermapin(pps, npages, mapinflags); /* * encrypt writes in place if requested */ if (write) do { struct swapdev *sdp; int i; /* * Get the swapdev so we can discriminate on the * encryption state. There may or may not be an * encryption key generated; we may or may not be asked * to encrypt swap. * * 1. NO KEY, NO ENCRYPTION: Nothing to do. * * 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt, * and mark the slots encrypted. * * 3. KEY, BUT NO ENCRYPTION: The slots may already be * marked encrypted from a past life. Mark them not * encrypted. * * 4. KEY, ENCRYPTION: Encrypt and mark the slots * encrypted. */ mutex_enter(&uvm_swap_data_lock); sdp = swapdrum_getsdp(startslot); if (!sdp->swd_encinit) { if (!swap_encrypt) { mutex_exit(&uvm_swap_data_lock); break; } uvm_swap_genkey(sdp); } KASSERT(sdp->swd_encinit); mutex_exit(&uvm_swap_data_lock); for (i = 0; i < npages; i++) { int s = startslot + i; KDASSERT(swapdrum_sdp_is(s, sdp)); KASSERT(s >= sdp->swd_drumoffset); s -= sdp->swd_drumoffset; KASSERT(s < sdp->swd_drumsize); if (swap_encrypt) { uvm_swap_encryptpage(sdp, (void *)(kva + (vsize_t)i*PAGE_SIZE), s); atomic_or_32(&sdp->swd_encmap[s/32], __BIT(s%32)); } else { atomic_and_32(&sdp->swd_encmap[s/32], ~__BIT(s%32)); } } } while (0); /* * fill in the bp/sbp. we currently route our i/o through * /dev/drum's vnode [swapdev_vp]. */ bp->b_cflags = BC_BUSY | BC_NOCACHE; bp->b_flags = (flags & (B_READ|B_ASYNC)); bp->b_proc = &proc0; /* XXX */ bp->b_vnbufs.le_next = NOLIST; bp->b_data = (void *)kva; bp->b_blkno = startblk; bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT; /* * bump v_numoutput (counter of number of active outputs). */ if (write) { mutex_enter(swapdev_vp->v_interlock); swapdev_vp->v_numoutput++; mutex_exit(swapdev_vp->v_interlock); } /* * for async ops we must set up the iodone handler. */ if (async) { bp->b_iodone = uvm_aio_aiodone; UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0); if (curlwp == uvm.pagedaemon_lwp) BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); else BIO_SETPRIO(bp, BPRIO_TIMELIMITED); } else { bp->b_iodone = NULL; BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); } UVMHIST_LOG(pdhist, "about to start io: data = %#jx blkno = %#jx, bcount = %jd", (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0); /* * now we start the I/O, and if async, return. */ VOP_STRATEGY(swapdev_vp, bp); if (async) { /* * Reads are always synchronous; if this changes, we * need to add an asynchronous path for decryption. */ KASSERT(write); return 0; } /* * must be sync i/o. wait for it to finish */ error = biowait(bp); if (error) goto out; /* * decrypt reads in place if needed */ if (!write) do { struct swapdev *sdp; bool encinit; int i; /* * Get the sdp. Everything about it except the encinit * bit, saying whether the encryption key is * initialized or not, and the encrypted bit for each * page, is stable until all swap pages have been * released and the device is removed. 
*/ mutex_enter(&uvm_swap_data_lock); sdp = swapdrum_getsdp(startslot); encinit = sdp->swd_encinit; mutex_exit(&uvm_swap_data_lock); if (!encinit) /* * If there's no encryption key, there's no way * any of these slots can be encrypted, so * nothing to do here. */ break; for (i = 0; i < npages; i++) { int s = startslot + i; KDASSERT(swapdrum_sdp_is(s, sdp)); KASSERT(s >= sdp->swd_drumoffset); s -= sdp->swd_drumoffset; KASSERT(s < sdp->swd_drumsize); if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) & __BIT(s%32)) == 0) continue; uvm_swap_decryptpage(sdp, (void *)(kva + (vsize_t)i*PAGE_SIZE), s); } } while (0); out: /* * kill the pager mapping */ uvm_pagermapout(kva, npages); /* * now dispose of the buf and we're done. */ if (write) { mutex_enter(swapdev_vp->v_interlock); vwakeup(bp); mutex_exit(swapdev_vp->v_interlock); } putiobuf(bp); UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0); return (error); } /* * uvm_swap_genkey(sdp) * * Generate a key for swap encryption. */ static void uvm_swap_genkey(struct swapdev *sdp) { uint8_t key[32]; KASSERT(!sdp->swd_encinit); cprng_strong(kern_cprng, key, sizeof key, 0); aes_setenckey256(&sdp->swd_enckey, key); aes_setdeckey256(&sdp->swd_deckey, key); explicit_memset(key, 0, sizeof key); sdp->swd_encinit = true; } /* * uvm_swap_encryptpage(sdp, kva, slot) * * Encrypt one page of data at kva for the specified slot number * in the swap device. */ static void uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot) { uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16); /* iv := AES_k(le32enc(slot) || 0^96) */ le32enc(preiv, slot); aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS); /* *kva := AES-CBC_k(iv, *kva) */ aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv, AES_256_NROUNDS); explicit_memset(&iv, 0, sizeof iv); } /* * uvm_swap_decryptpage(sdp, kva, slot) * * Decrypt one page of data at kva for the specified slot number * in the swap device. */ static void uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot) { uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16); /* iv := AES_k(le32enc(slot) || 0^96) */ le32enc(preiv, slot); aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS); /* *kva := AES-CBC^{-1}_k(iv, *kva) */ aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv, AES_256_NROUNDS); explicit_memset(&iv, 0, sizeof iv); } SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt", SYSCTL_DESCR("Encrypt data when swapped out to disk"), NULL, 0, &uvm_swap_encrypt, 0, CTL_VM, CTL_CREATE, CTL_EOL); }
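/*
 * Illustrative userland sketch (not part of this file): the SWAP_NSWAP
 * and SWAP_STATS operations described above can be exercised from a
 * small program via swapctl(2), reading back the swapent structures
 * filled in by swapent_cvt().  Error handling is minimal and the
 * program itself is hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/swap.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	struct swapent *sep;
	int i, n;

	/* SWAP_NSWAP: how many swap devices are currently configured */
	n = swapctl(SWAP_NSWAP, NULL, 0);
	if (n <= 0)
		return 0;

	sep = calloc(n, sizeof(*sep));
	if (sep == NULL)
		err(1, "calloc");

	/* SWAP_STATS: load the current swap configuration into sep[] */
	n = swapctl(SWAP_STATS, sep, n);
	if (n == -1)
		err(1, "swapctl");

	for (i = 0; i < n; i++)
		printf("%s: %d blocks, %d in use, priority %d\n",
		    sep[i].se_path, sep[i].se_nblks, sep[i].se_inuse,
		    sep[i].se_priority);

	free(sep);
	return 0;
}
#endif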
1 1 1 4 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 /* $NetBSD: sysmon.c,v 1.32 2022/03/28 12:33:21 riastradh Exp $ */ /*- * Copyright (c) 2000 Zembu Labs, Inc. * All rights reserved. * * Author: Jason R. Thorpe <thorpej@zembu.com> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Zembu Labs, Inc. * 4. Neither the name of Zembu Labs nor the names of its employees may * be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY ZEMBU LABS, INC. ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WAR- * RANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DIS- * CLAIMED. IN NO EVENT SHALL ZEMBU LABS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Clearing house for system monitoring hardware. We currently * handle environmental sensors, watchdog timers, and power management. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sysmon.c,v 1.32 2022/03/28 12:33:21 riastradh Exp $"); #include <sys/param.h> #include <sys/conf.h> #include <sys/errno.h> #include <sys/fcntl.h> #include <sys/callout.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/module.h> #include <sys/mutex.h> #include <sys/device.h> #include <sys/once.h> #include <dev/sysmon/sysmonvar.h> dev_type_open(sysmonopen); dev_type_close(sysmonclose); dev_type_ioctl(sysmonioctl); dev_type_read(sysmonread); dev_type_poll(sysmonpoll); dev_type_kqfilter(sysmonkqfilter); const struct cdevsw sysmon_cdevsw = { .d_open = sysmonopen, .d_close = sysmonclose, .d_read = sysmonread, .d_write = nowrite, .d_ioctl = sysmonioctl, .d_stop = nostop, .d_tty = notty, .d_poll = sysmonpoll, .d_mmap = nommap, .d_kqfilter = sysmonkqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE }; static int sysmon_modcmd(modcmd_t, void *); static int sm_init_once(void); /* * Info about our minor "devices" */ static struct sysmon_opvec *sysmon_opvec_table[] = { NULL, NULL, NULL }; static int sysmon_refcnt[] = { 0, 0, 0 }; static const char *sysmon_mod[] = { "sysmon_envsys", "sysmon_wdog", "sysmon_power" }; static kmutex_t sysmon_minor_mtx; #ifdef _MODULE static bool sm_is_attached; #endif ONCE_DECL(once_sm); /* * sysmon_attach_minor * * Attach a minor device for wdog, power, or envsys. Manage a * reference count so we can prevent the device from being * detached if there are still users with the minor device opened. * * If the opvec argument is NULL, this is a request to detach the * minor device - make sure the refcnt is zero! */ int sysmon_attach_minor(int minor, struct sysmon_opvec *opvec) { int ret; mutex_enter(&sysmon_minor_mtx); if (opvec) { if (sysmon_opvec_table[minor] == NULL) { sysmon_refcnt[minor] = 0; sysmon_opvec_table[minor] = opvec; ret = 0; } else ret = EEXIST; } else { if (sysmon_refcnt[minor] == 0) { sysmon_opvec_table[minor] = NULL; ret = 0; } else ret = EBUSY; } mutex_exit(&sysmon_minor_mtx); return ret; } /* * sysmonopen: * * Open the system monitor device. */ int sysmonopen(dev_t dev, int flag, int mode, struct lwp *l) { int error; mutex_enter(&sysmon_minor_mtx); switch (minor(dev)) { case SYSMON_MINOR_ENVSYS: case SYSMON_MINOR_WDOG: case SYSMON_MINOR_POWER: if (sysmon_opvec_table[minor(dev)] == NULL) { mutex_exit(&sysmon_minor_mtx); error = module_autoload(sysmon_mod[minor(dev)], MODULE_CLASS_DRIVER); if (error) return error; mutex_enter(&sysmon_minor_mtx); if (sysmon_opvec_table[minor(dev)] == NULL) { error = ENODEV; break; } } error = (sysmon_opvec_table[minor(dev)]->so_open)(dev, flag, mode, l); if (error == 0) sysmon_refcnt[minor(dev)]++; break; default: error = ENODEV; } mutex_exit(&sysmon_minor_mtx); return error; } /* * sysmonclose: * * Close the system monitor device. */ int sysmonclose(dev_t dev, int flag, int mode, struct lwp *l) { int error; switch (minor(dev)) { case SYSMON_MINOR_ENVSYS: case SYSMON_MINOR_WDOG: case SYSMON_MINOR_POWER: if (sysmon_opvec_table[minor(dev)] == NULL) error = ENODEV; else { error = (sysmon_opvec_table[minor(dev)]->so_close)(dev, flag, mode, l); if (error == 0) { sysmon_refcnt[minor(dev)]--; KASSERT(sysmon_refcnt[minor(dev)] >= 0); } } break; default: error = ENODEV; } return (error); } /* * sysmonioctl: * * Perform a control request. 
*/ int sysmonioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { int error; switch (minor(dev)) { case SYSMON_MINOR_ENVSYS: case SYSMON_MINOR_WDOG: case SYSMON_MINOR_POWER: if (sysmon_opvec_table[minor(dev)] == NULL) error = ENODEV; else error = (sysmon_opvec_table[minor(dev)]->so_ioctl)(dev, cmd, data, flag, l); break; default: error = ENODEV; } return (error); } /* * sysmonread: * * Perform a read request. */ int sysmonread(dev_t dev, struct uio *uio, int flags) { int error; switch (minor(dev)) { case SYSMON_MINOR_POWER: if (sysmon_opvec_table[minor(dev)] == NULL) error = ENODEV; else error = (sysmon_opvec_table[minor(dev)]->so_read)(dev, uio, flags); break; default: error = ENODEV; } return (error); } /* * sysmonpoll: * * Poll the system monitor device. */ int sysmonpoll(dev_t dev, int events, struct lwp *l) { int rv; switch (minor(dev)) { case SYSMON_MINOR_POWER: if (sysmon_opvec_table[minor(dev)] == NULL) rv = events; else rv = (sysmon_opvec_table[minor(dev)]->so_poll)(dev, events, l); break; default: rv = events; } return (rv); } /* * sysmonkqfilter: * * Kqueue filter for the system monitor device. */ int sysmonkqfilter(dev_t dev, struct knote *kn) { int error; switch (minor(dev)) { case SYSMON_MINOR_POWER: if (sysmon_opvec_table[minor(dev)] == NULL) error = ENODEV; else error = (sysmon_opvec_table[minor(dev)]->so_filter)(dev, kn); break; default: error = 1; } return (error); } MODULE(MODULE_CLASS_DRIVER, sysmon, NULL); static int sm_init_once(void) { mutex_init(&sysmon_minor_mtx, MUTEX_DEFAULT, IPL_NONE); return 0; } int sysmon_init(void) { int error; #ifdef _MODULE devmajor_t bmajor, cmajor; #endif error = RUN_ONCE(&once_sm, sm_init_once); #ifdef _MODULE mutex_enter(&sysmon_minor_mtx); if (!sm_is_attached) { bmajor = cmajor = -1; error = devsw_attach("sysmon", NULL, &bmajor, &sysmon_cdevsw, &cmajor); sm_is_attached = (error != 0); } mutex_exit(&sysmon_minor_mtx); #endif return error; } int sysmon_fini(void) { int error = 0; if ((sysmon_opvec_table[SYSMON_MINOR_ENVSYS] != NULL) || (sysmon_opvec_table[SYSMON_MINOR_WDOG] != NULL) || (sysmon_opvec_table[SYSMON_MINOR_POWER] != NULL)) error = EBUSY; #ifdef _MODULE if (error == 0) { mutex_enter(&sysmon_minor_mtx); sm_is_attached = false; devsw_detach(NULL, &sysmon_cdevsw); mutex_exit(&sysmon_minor_mtx); } #endif return error; } static int sysmon_modcmd(modcmd_t cmd, void *arg) { int ret; switch (cmd) { case MODULE_CMD_INIT: ret = sysmon_init(); break; case MODULE_CMD_FINI: ret = sysmon_fini(); break; case MODULE_CMD_STAT: default: ret = ENOTTY; } return ret; }
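/*
 * Illustrative sketch (hypothetical client, not part of this file):
 * how a sysmon backend such as a watchdog driver could claim the
 * SYSMON_MINOR_WDOG minor with sysmon_attach_minor() and release it
 * again.  The example_* names are invented; the so_* member names
 * follow the dispatch code above, and unused entry points are left
 * unset.
 */
#if 0
static int
example_wdog_open(dev_t dev, int flag, int mode, struct lwp *l)
{
	return 0;
}

static int
example_wdog_close(dev_t dev, int flag, int mode, struct lwp *l)
{
	return 0;
}

static struct sysmon_opvec example_wdog_opvec = {
	.so_open = example_wdog_open,
	.so_close = example_wdog_close,
	/* .so_ioctl, .so_read, .so_poll, .so_filter as needed */
};

static int
example_wdog_attach(void)
{
	int error;

	/* make sure the shared sysmon device switch is set up */
	error = sysmon_init();
	if (error != 0)
		return error;

	/* claim the minor; EEXIST if another backend already owns it */
	return sysmon_attach_minor(SYSMON_MINOR_WDOG, &example_wdog_opvec);
}

static int
example_wdog_detach(void)
{
	/* a NULL opvec detaches; EBUSY while the minor is still open */
	return sysmon_attach_minor(SYSMON_MINOR_WDOG, NULL);
}
#endif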
48 11 11 49 6 34 15 48 48 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 /* $NetBSD: umap_subr.c,v 1.29 2014/11/09 18:08:07 maxv Exp $ */ /* * Copyright (c) 1999 National Aeronautics & Space Administration * All rights reserved. * * This software was written by William Studenmund of the * Numerical Aerospace Simulation Facility, NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the National Aeronautics & Space Administration * nor the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB- * UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Id: lofs_subr.c, v 1.11 1992/05/30 10:05:43 jsp Exp * @(#)umap_subr.c 8.9 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: umap_subr.c,v 1.29 2014/11/09 18:08:07 maxv Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/time.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/kauth.h> #include <miscfs/specfs/specdev.h> #include <miscfs/umapfs/umap.h> u_long umap_findid(u_long, u_long [][2], int); int umap_node_alloc(struct mount *, struct vnode *, struct vnode **); /* * umap_findid is called by various routines in umap_vnodeops.c to * find a user or group id in a map. */ u_long umap_findid(u_long id, u_long map[][2], int nentries) { int i; /* Find uid entry in map */ i = 0; while ((i<nentries) && ((map[i][0]) != id)) i++; if (i < nentries) return (map[i][1]); else return (-1); } /* * umap_reverse_findid is called by umap_getattr() in umap_vnodeops.c to * find a user or group id in a map, in reverse. */ u_long umap_reverse_findid(u_long id, u_long map[][2], int nentries) { int i; /* Find uid entry in map */ i = 0; while ((i<nentries) && ((map[i][1]) != id)) i++; if (i < nentries) return (map[i][0]); else return (-1); } /* umap_mapids maps all of the ids in a credential, both user and group. */ void umap_mapids(struct mount *v_mount, kauth_cred_t credp) { int i, unentries, gnentries; uid_t uid; gid_t gid; u_long (*usermap)[2], (*groupmap)[2]; gid_t groups[NGROUPS]; uint16_t ngroups; if (credp == NOCRED || credp == FSCRED) return; unentries = MOUNTTOUMAPMOUNT(v_mount)->info_nentries; usermap = MOUNTTOUMAPMOUNT(v_mount)->info_mapdata; gnentries = MOUNTTOUMAPMOUNT(v_mount)->info_gnentries; groupmap = MOUNTTOUMAPMOUNT(v_mount)->info_gmapdata; /* Find uid entry in map */ uid = (uid_t) umap_findid(kauth_cred_geteuid(credp), usermap, unentries); if (uid != -1) kauth_cred_seteuid(credp, uid); else kauth_cred_seteuid(credp, (uid_t)NOBODY); #if 1 /* cr_gid is the same as cr_groups[0] in 4BSD, but not in NetBSD */ /* Find gid entry in map */ gid = (gid_t) umap_findid(kauth_cred_getegid(credp), groupmap, gnentries); if (gid != -1) kauth_cred_setegid(credp, gid); else kauth_cred_setegid(credp, NULLGROUP); #endif /* Now we must map each of the set of groups in the cr_groups structure. */ ngroups = kauth_cred_ngroups(credp); for (i = 0; i < ngroups; i++) { /* XXX elad: can't we just skip cases where gid == -1? */ groups[i] = kauth_cred_group(credp, i); gid = (gid_t) umap_findid(groups[i], groupmap, gnentries); if (gid != -1) groups[i] = gid; else groups[i] = NULLGROUP; } kauth_cred_setgroups(credp, groups, ngroups, -1, UIO_SYSSPACE); }
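umap_findid() and umap_reverse_findid() are plain linear scans over a table of id pairs, returning (u_long)-1 on a miss, which umap_mapids() then replaces with NOBODY or NULLGROUP. The stand-alone program below is illustrative only (userland, not kernel code; the table contents are invented) and walks through the same lookup.

#include <stdio.h>

/* Same scan as umap_findid(): map a first-column id to the second column. */
static unsigned long
findid(unsigned long id, unsigned long map[][2], int nentries)
{
	int i;

	for (i = 0; i < nentries; i++)
		if (map[i][0] == id)
			return map[i][1];
	return (unsigned long)-1;	/* miss: caller substitutes NOBODY/NULLGROUP */
}

int
main(void)
{
	/* Invented mapping table: two {from, to} uid pairs. */
	unsigned long usermap[][2] = { { 1000, 2000 }, { 0, 32767 } };

	printf("%lu\n", findid(1000, usermap, 2));	/* 2000 */
	printf("%lu\n", findid(0, usermap, 2));		/* 32767 */
	printf("%ld\n", (long)findid(555, usermap, 2));	/* -1: not in the map */
	return 0;
}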
50 50 50 50 50 50 50 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 /* $NetBSD: kern_cfglock.c,v 1.1 2010/08/21 13:17:31 pgoyette Exp $ */ /*- * Copyright (c) 2002, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_cfglock.c,v 1.1 2010/08/21 13:17:31 pgoyette Exp $"); #include <sys/param.h> #include <sys/cpu.h> #include <sys/mutex.h> #include <sys/lwp.h> #include <sys/systm.h> static kmutex_t kernconfig_mutex; static lwp_t *kernconfig_lwp; static int kernconfig_recurse; /* * Functions for manipulating the kernel configuration lock. This * recursive lock should be used to protect all additions and removals * of kernel functionality, such as device configuration and loading * of modular kernel components. */ void kernconfig_lock_init(void) { mutex_init(&kernconfig_mutex, MUTEX_DEFAULT, IPL_NONE); kernconfig_lwp = NULL; kernconfig_recurse = 0; } void kernconfig_lock(void) { lwp_t *my_lwp; /* * It's OK to check this unlocked, since it could only be set to * curlwp by the current thread itself, and not by an interrupt * or any other LWP. */ KASSERT(!cpu_intr_p()); my_lwp = curlwp; if (kernconfig_lwp == my_lwp) { kernconfig_recurse++; KASSERT(kernconfig_recurse > 1); } else { mutex_enter(&kernconfig_mutex); kernconfig_lwp = my_lwp; kernconfig_recurse = 1; } } void kernconfig_unlock(void) { KASSERT(kernconfig_is_held()); KASSERT(kernconfig_recurse != 0); if (--kernconfig_recurse == 0) { kernconfig_lwp = NULL; mutex_exit(&kernconfig_mutex); } } bool kernconfig_is_held(void) { return mutex_owned(&kernconfig_mutex); }
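Because the lock records its owning LWP and a recursion count, kernconfig_lock() may be taken again by the thread that already holds it. The sketch below is a hypothetical caller, not part of this file, showing the intended pairing of lock and unlock across nested configuration steps.

/*
 * Illustrative only: a nested configuration path.  Each kernconfig_lock()
 * must be balanced by a kernconfig_unlock(); only the outermost unlock
 * releases kernconfig_mutex.
 */
static void
example_inner_step(void)
{

	kernconfig_lock();		/* same LWP: just bumps kernconfig_recurse */
	KASSERT(kernconfig_is_held());
	/* ...attach a device, load a module, etc... */
	kernconfig_unlock();		/* recursion count drops, mutex still held */
}

static void
example_config_operation(void)
{

	kernconfig_lock();		/* first acquisition takes kernconfig_mutex */
	example_inner_step();
	kernconfig_unlock();		/* count reaches zero, mutex released */
}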
11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 /* $NetBSD: procfs.h,v 1.84 2024/01/17 10:20:12 hannken Exp $ */ /* * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs.h 8.9 (Berkeley) 5/14/95 */ /* * Copyright (c) 1993 Jan-Simon Pendry * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs.h 8.9 (Berkeley) 5/14/95 */ /* This also pulls in __HAVE_PROCFS_MACHDEP */ #include <sys/ptrace.h> #ifdef _KERNEL #include <sys/proc.h> /* * The different types of node in a procfs filesystem */ typedef enum { PFSauxv, /* ELF Auxiliary Vector */ PFSchroot, /* the process's current root directory */ PFScmdline, /* process command line args */ PFScpuinfo, /* CPU info (if -o linux) */ PFScpustat, /* status info (if -o linux) */ PFScurproc, /* symbolic link for curproc */ PFScwd, /* the process's current working directory */ PFSdevices, /* major/device name mappings (if -o linux) */ PFSemul, /* the process's emulation */ PFSenviron, /* process environment */ PFSexe, /* symlink to the executable file */ PFSfd, /* a directory containing the processes open fd's */ PFSfile, /* the executable file */ PFSfpregs, /* the process's FP register set */ PFSloadavg, /* load average (if -o linux) */ PFSlimit, /* resource limits */ PFSmap, /* memory map */ PFSmaps, /* memory map, Linux style (if -o linux) */ PFSmem, /* the process's memory image */ PFSmeminfo, /* system memory info (if -o linux) */ PFSmounts, /* mounted filesystems (if -o linux) */ PFSnote, /* process notifier */ PFSnotepg, /* process group notifier */ PFSproc, /* a process-specific sub-directory */ PFSregs, /* the process's register set */ PFSroot, /* the filesystem root */ PFSself, /* like curproc, but this is the Linux name */ PFSstat, /* process status (if -o linux) */ PFSstatm, /* process memory info (if -o linux) */ PFSstatus, /* process status */ PFStask, /* task subdirector (if -o linux) */ PFSuptime, /* elapsed time since (if -o linux) */ PFSversion, /* kernel version (if -o linux) */ #ifdef __HAVE_PROCFS_MACHDEP PROCFS_MACHDEP_NODE_TYPES #endif PFSlast, /* track number of types */ } pfstype; /* * control data for the proc file system. 
*/ struct pfskey { pfstype pk_type; /* type of procfs node */ pid_t pk_pid; /* associated process */ int pk_fd; /* associated fd if not -1 */ }; struct pfsnode { LIST_ENTRY(pfsnode) pfs_hash; /* per pid hash list */ struct vnode *pfs_vnode; /* vnode associated with this pfsnode */ struct mount *pfs_mount; /* mount associated with this pfsnode */ struct pfskey pfs_key; #define pfs_type pfs_key.pk_type #define pfs_pid pfs_key.pk_pid #define pfs_fd pfs_key.pk_fd mode_t pfs_mode; /* mode bits for stat() */ u_long pfs_flags; /* open flags */ uint64_t pfs_fileno; /* unique file id */ }; #define PROCFS_NOTELEN 64 /* max length of a note (/proc/$pid/note) */ #define PROCFS_MAXNAMLEN 255 #endif /* _KERNEL */ struct procfs_args { int version; int flags; }; #define PROCFS_ARGSVERSION 1 #define PROCFSMNT_LINUXCOMPAT 0x01 #define PROCFSMNT_BITS "\177\20" \ "b\00linuxcompat\0" /* * Kernel stuff follows */ #ifdef _KERNEL #define CNEQ(cnp, s, len) \ ((cnp)->cn_namelen == (len) && \ (memcmp((s), (cnp)->cn_nameptr, (len)) == 0)) #define UIO_MX 32 static __inline ino_t procfs_fileno(pid_t _pid, pfstype _type, int _fd) { ino_t _ino; switch (_type) { case PFSroot: return 2; case PFScurproc: return 3; case PFSself: return 4; default: _ino = _pid + 1; if (_fd != -1) _ino = _ino << 32 | _fd; return _ino * PFSlast + _type; } } #define PROCFS_FILENO(pid, type, fd) procfs_fileno(pid, type, fd) #define PROCFS_TYPE(type) ((type) % PFSlast) struct procfsmount { int pmnt_flags; }; #define VFSTOPROC(mp) ((struct procfsmount *)(mp)->mnt_data) /* * Convert between pfsnode vnode */ #define VTOPFS(vp) ((struct pfsnode *)(vp)->v_data) #define PFSTOV(pfs) ((pfs)->pfs_vnode) typedef struct vfs_namemap vfs_namemap_t; struct vfs_namemap { const char *nm_name; int nm_val; }; int vfs_getuserstr(struct uio *, char *, int *); const vfs_namemap_t *vfs_findname(const vfs_namemap_t *, const char *, int); struct mount; struct proc *procfs_proc_find(struct mount *, pid_t); bool procfs_use_linux_compat(struct mount *); static inline bool procfs_proc_is_linux_compat(void) { const char *emulname = curlwp->l_proc->p_emul->e_name; return (strncmp(emulname, "linux", 5) == 0); } int procfs_proc_lock(struct mount *, int, struct proc **, int); void procfs_proc_unlock(struct proc *); int procfs_allocvp(struct mount *, struct vnode **, pid_t, pfstype, int); int procfs_donote(struct lwp *, struct proc *, struct pfsnode *, struct uio *); int procfs_doregs(struct lwp *, struct lwp *, struct pfsnode *, struct uio *); int procfs_dofpregs(struct lwp *, struct lwp *, struct pfsnode *, struct uio *); int procfs_domem(struct lwp *, struct lwp *, struct pfsnode *, struct uio *); int procfs_do_pid_stat(struct lwp *, struct lwp *, struct pfsnode *, struct uio *); int procfs_dostatus(struct lwp *, struct lwp *, struct pfsnode *, struct uio *); int procfs_domap(struct lwp *, struct proc *, struct pfsnode *, struct uio *, int); int procfs_doprocargs(struct lwp *, struct proc *, struct pfsnode *, struct uio *, int); int procfs_domeminfo(struct lwp *, struct proc *, struct pfsnode *, struct uio *); int procfs_dodevices(struct lwp *, struct proc *, struct pfsnode *, struct uio *); int procfs_docpuinfo(struct lwp *, struct proc *, struct pfsnode *, struct uio *); int procfs_docpustat(struct lwp *, struct proc *, struct pfsnode *, struct uio *); int procfs_doloadavg(struct lwp *, struct proc *, struct pfsnode *, struct uio *); int procfs_do_pid_statm(struct lwp *, struct lwp *, struct pfsnode *, struct uio *); int procfs_dofd(struct lwp *, struct proc *, struct pfsnode 
*, struct uio *); int procfs_douptime(struct lwp *, struct proc *, struct pfsnode *, struct uio *); int procfs_domounts(struct lwp *, struct proc *, struct pfsnode *, struct uio *); int procfs_doemul(struct lwp *, struct proc *, struct pfsnode *, struct uio *); int procfs_doversion(struct lwp *, struct proc *, struct pfsnode *, struct uio *); int procfs_doauxv(struct lwp *, struct proc *, struct pfsnode *, struct uio *); int procfs_dolimit(struct lwp *, struct proc *, struct pfsnode *, struct uio *); void procfs_hashrem(struct pfsnode *); int procfs_getfp(struct pfsnode *, struct proc *, struct file **); /* functions to check whether or not files should be displayed */ int procfs_validauxv(struct lwp *, struct mount *); int procfs_validfile(struct lwp *, struct mount *); int procfs_validfpregs(struct lwp *, struct mount *); int procfs_validregs(struct lwp *, struct mount *); int procfs_validmap(struct lwp *, struct mount *); int procfs_rw(void *); int procfs_getcpuinfstr(char *, size_t *); #define PROCFS_LOCKED 0x01 #define PROCFS_WANT 0x02 extern int (**procfs_vnodeop_p)(void *); extern struct vfsops procfs_vfsops; int procfs_root(struct mount *, int, struct vnode **); #ifdef __HAVE_PROCFS_MACHDEP struct vattr; void procfs_machdep_allocvp(struct vnode *); int procfs_machdep_rw(struct lwp *, struct lwp *, struct pfsnode *, struct uio *); int procfs_machdep_getattr(struct vnode *, struct vattr *, struct proc *); #endif #endif /* _KERNEL */
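procfs_fileno() packs the process id, node type and (for fd nodes) the descriptor number into a single inode value, with fixed numbers reserved for the root, curproc and self nodes. The snippet below is an illustrative sketch of how the resulting values relate; the pid and fd are invented and the helper is not part of the header.

/*
 * Illustrative use of the PROCFS_FILENO()/PROCFS_TYPE() encoding above.
 */
static void
example_fileno(void)
{
	ino_t root = PROCFS_FILENO(0, PFSroot, -1);	/* always 2 */
	ino_t self = PROCFS_FILENO(0, PFSself, -1);	/* always 4 */
	ino_t status = PROCFS_FILENO(1234, PFSstatus, -1);
					/* (1234 + 1) * PFSlast + PFSstatus */
	ino_t fd3 = PROCFS_FILENO(1234, PFSfd, 3);
					/* pid + 1 shifted into the high 32 bits,
					   fd 3 kept in the low bits */

	/* The node type can be recovered from a per-process fileno:
	   PROCFS_TYPE(status) == PFSstatus */
	(void)root; (void)self; (void)status; (void)fd3;
}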
1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 /* $NetBSD: uvm_amap.c,v 1.129 2023/09/10 14:54:34 ad Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * uvm_amap.c: amap operations */ /* * this file contains functions that perform operations on amaps. see * uvm_amap.h for a brief explanation of the role of amaps in uvm. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_amap.c,v 1.129 2023/09/10 14:54:34 ad Exp $"); #include "opt_uvmhist.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/pool.h> #include <sys/atomic.h> #include <uvm/uvm.h> #include <uvm/uvm_swap.h> /* * cache for allocation of vm_map structures. note that in order to * avoid an endless loop, the amap cache's allocator cannot allocate * memory from an amap (it currently goes through the kernel uobj, so * we are ok). */ static struct pool_cache uvm_amap_cache; static kmutex_t amap_list_lock __cacheline_aligned; static LIST_HEAD(, vm_amap) amap_list; /* * local functions */ static int amap_roundup_slots(int slots) { return kmem_roundup_size(slots * sizeof(int)) / sizeof(int); } #ifdef UVM_AMAP_PPREF /* * what is ppref? ppref is an _optional_ amap feature which is used * to keep track of reference counts on a per-page basis. it is enabled * when UVM_AMAP_PPREF is defined. * * when enabled, an array of ints is allocated for the pprefs. this * array is allocated only when a partial reference is added to the * map (either by unmapping part of the amap, or gaining a reference * to only a part of an amap). 
if the allocation of the array fails * (KM_NOSLEEP), then we set the array pointer to PPREF_NONE to indicate * that we tried to do ppref's but couldn't alloc the array so just * give up (after all, this is an optional feature!). * * the array is divided into page sized "chunks." for chunks of length 1, * the chunk reference count plus one is stored in that chunk's slot. * for chunks of length > 1 the first slot contains (the reference count * plus one) * -1. [the negative value indicates that the length is * greater than one.] the second slot of the chunk contains the length * of the chunk. here is an example: * * actual REFS: 2 2 2 2 3 1 1 0 0 0 4 4 0 1 1 1 * ppref: -3 4 x x 4 -2 2 -1 3 x -5 2 1 -2 3 x * <----------><-><----><-------><----><-><-------> * (x = don't care) * * this allows us to allow one int to contain the ref count for the whole * chunk. note that the "plus one" part is needed because a reference * count of zero is neither positive or negative (need a way to tell * if we've got one zero or a bunch of them). * * here are some in-line functions to help us. */ /* * pp_getreflen: get the reference and length for a specific offset * * => ppref's amap must be locked */ static inline void pp_getreflen(int *ppref, int offset, int *refp, int *lenp) { if (ppref[offset] > 0) { /* chunk size must be 1 */ *refp = ppref[offset] - 1; /* don't forget to adjust */ *lenp = 1; } else { *refp = (ppref[offset] * -1) - 1; *lenp = ppref[offset+1]; } } /* * pp_setreflen: set the reference and length for a specific offset * * => ppref's amap must be locked */ static inline void pp_setreflen(int *ppref, int offset, int ref, int len) { if (len == 0) return; if (len == 1) { ppref[offset] = ref + 1; } else { ppref[offset] = (ref + 1) * -1; ppref[offset+1] = len; } } #endif /* UVM_AMAP_PPREF */ /* * amap_alloc1: allocate an amap, but do not initialise the overlay. * * => Note: lock is not set. */ static struct vm_amap * amap_alloc1(int slots, int padslots, int flags) { const bool nowait = (flags & UVM_FLAG_NOWAIT) != 0; const km_flag_t kmflags = nowait ? KM_NOSLEEP : KM_SLEEP; struct vm_amap *amap; krwlock_t *newlock, *oldlock; int totalslots; amap = pool_cache_get(&uvm_amap_cache, nowait ? PR_NOWAIT : PR_WAITOK); if (amap == NULL) { return NULL; } KASSERT(amap->am_lock != NULL); KASSERT(amap->am_nused == 0); /* Try to privatize the lock if currently shared. */ if (rw_obj_refcnt(amap->am_lock) > 1) { newlock = rw_obj_tryalloc(); if (newlock != NULL) { oldlock = amap->am_lock; mutex_enter(&amap_list_lock); amap->am_lock = newlock; mutex_exit(&amap_list_lock); rw_obj_free(oldlock); } } totalslots = amap_roundup_slots(slots + padslots); amap->am_ref = 1; amap->am_flags = 0; #ifdef UVM_AMAP_PPREF amap->am_ppref = NULL; #endif amap->am_maxslot = totalslots; amap->am_nslot = slots; /* * Note: since allocations are likely big, we expect to reduce the * memory fragmentation by allocating them in separate blocks. 
*/ amap->am_slots = kmem_alloc(totalslots * sizeof(int), kmflags); if (amap->am_slots == NULL) goto fail1; amap->am_bckptr = kmem_alloc(totalslots * sizeof(int), kmflags); if (amap->am_bckptr == NULL) goto fail2; amap->am_anon = kmem_alloc(totalslots * sizeof(struct vm_anon *), kmflags); if (amap->am_anon == NULL) goto fail3; return amap; fail3: kmem_free(amap->am_bckptr, totalslots * sizeof(int)); fail2: kmem_free(amap->am_slots, totalslots * sizeof(int)); fail1: pool_cache_put(&uvm_amap_cache, amap); /* * XXX hack to tell the pagedaemon how many pages we need, * since we can need more than it would normally free. */ if (nowait) { extern u_int uvm_extrapages; atomic_add_int(&uvm_extrapages, ((sizeof(int) * 2 + sizeof(struct vm_anon *)) * totalslots) >> PAGE_SHIFT); } return NULL; } /* * amap_alloc: allocate an amap to manage "sz" bytes of anonymous VM * * => caller should ensure sz is a multiple of PAGE_SIZE * => reference count to new amap is set to one * => new amap is returned unlocked */ struct vm_amap * amap_alloc(vaddr_t sz, vaddr_t padsz, int waitf) { struct vm_amap *amap; int slots, padslots; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); AMAP_B2SLOT(slots, sz); AMAP_B2SLOT(padslots, padsz); amap = amap_alloc1(slots, padslots, waitf); if (amap) { memset(amap->am_anon, 0, amap->am_maxslot * sizeof(struct vm_anon *)); } UVMHIST_LOG(maphist,"<- done, amap = %#jx, sz=%jd", (uintptr_t)amap, sz, 0, 0); return(amap); } /* * amap_ctor: pool_cache constructor for new amaps * * => carefully synchronize with amap_swap_off() */ static int amap_ctor(void *arg, void *obj, int flags) { struct vm_amap *amap = obj; if ((flags & PR_NOWAIT) != 0) { amap->am_lock = rw_obj_tryalloc(); if (amap->am_lock == NULL) { return ENOMEM; } } else { amap->am_lock = rw_obj_alloc(); } amap->am_nused = 0; amap->am_flags = 0; mutex_enter(&amap_list_lock); LIST_INSERT_HEAD(&amap_list, amap, am_list); mutex_exit(&amap_list_lock); return 0; } /* * amap_ctor: pool_cache destructor for amaps * * => carefully synchronize with amap_swap_off() */ static void amap_dtor(void *arg, void *obj) { struct vm_amap *amap = obj; KASSERT(amap->am_nused == 0); mutex_enter(&amap_list_lock); LIST_REMOVE(amap, am_list); mutex_exit(&amap_list_lock); rw_obj_free(amap->am_lock); } /* * uvm_amap_init: initialize the amap system. 
*/ void uvm_amap_init(void) { mutex_init(&amap_list_lock, MUTEX_DEFAULT, IPL_NONE); pool_cache_bootstrap(&uvm_amap_cache, sizeof(struct vm_amap), COHERENCY_UNIT, 0, 0, "amappl", NULL, IPL_NONE, amap_ctor, amap_dtor, NULL); } /* * amap_free: free an amap * * => the amap must be unlocked * => the amap should have a zero reference count and be empty */ void amap_free(struct vm_amap *amap) { int slots; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(amap->am_ref == 0); KASSERT(amap->am_nused == 0); KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0); slots = amap->am_maxslot; kmem_free(amap->am_slots, slots * sizeof(*amap->am_slots)); kmem_free(amap->am_bckptr, slots * sizeof(*amap->am_bckptr)); kmem_free(amap->am_anon, slots * sizeof(*amap->am_anon)); #ifdef UVM_AMAP_PPREF if (amap->am_ppref && amap->am_ppref != PPREF_NONE) kmem_free(amap->am_ppref, slots * sizeof(*amap->am_ppref)); #endif pool_cache_put(&uvm_amap_cache, amap); UVMHIST_LOG(maphist,"<- done, freed amap = %#jx", (uintptr_t)amap, 0, 0, 0); } /* * amap_extend: extend the size of an amap (if needed) * * => called from uvm_map when we want to extend an amap to cover * a new mapping (rather than allocate a new one) * => amap should be unlocked (we will lock it) * => to safely extend an amap it should have a reference count of * one (thus it can't be shared) */ int amap_extend(struct vm_map_entry *entry, vsize_t addsize, int flags) { struct vm_amap *amap = entry->aref.ar_amap; int slotoff = entry->aref.ar_pageoff; int slotmapped, slotadd, slotneed, slotadded, slotalloc; int slotadj, slotarea, slotendoff; int oldnslots; #ifdef UVM_AMAP_PPREF int *newppref, *oldppref; #endif int i, *newsl, *newbck, *oldsl, *oldbck; struct vm_anon **newover, **oldover; const km_flag_t kmflags = (flags & AMAP_EXTEND_NOWAIT) ? KM_NOSLEEP : KM_SLEEP; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, " (entry=%#jx, addsize=%#jx, flags=%#jx)", (uintptr_t)entry, addsize, flags, 0); /* * first, determine how many slots we need in the amap. don't * forget that ar_pageoff could be non-zero: this means that * there are some unused slots before us in the amap. */ amap_lock(amap, RW_WRITER); KASSERT(amap_refs(amap) == 1); /* amap can't be shared */ AMAP_B2SLOT(slotmapped, entry->end - entry->start); /* slots mapped */ AMAP_B2SLOT(slotadd, addsize); /* slots to add */ if (flags & AMAP_EXTEND_FORWARDS) { slotneed = slotoff + slotmapped + slotadd; slotadj = 0; slotarea = 0; } else { slotneed = slotadd + slotmapped; slotadj = slotadd - slotoff; slotarea = amap->am_maxslot - slotmapped; } /* * Because this amap only has 1 ref, we know that there is * only one vm_map_entry pointing to it, and the one entry is * using slots between slotoff and slotoff + slotmapped. If * we have been using ppref then we know that only slots in * the one map entry's range can have anons, since ppref * allowed us to free any anons outside that range as other map * entries which used this amap were removed. But without ppref, * we couldn't know which slots were still needed by other map * entries, so we couldn't free any anons as we removed map * entries, and so any slot from 0 to am_nslot can have an * anon. But now that we know there is only one map entry * left and we know its range, we can free up any anons * outside that range. This is necessary because the rest of * this function assumes that there are no anons in the amap * outside of the one map entry's range. 
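 * For example (illustrative numbers only): with ar_pageoff == 4 and 8
 * mapped slots, only slots [4, 12) may still hold anons; in the no-ppref
 * case the amap_wiperange() calls just below clear [0, 4) and
 * [12, am_nslot) before the amap is extended.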
*/ slotendoff = slotoff + slotmapped; if (amap->am_ppref == PPREF_NONE) { amap_wiperange(amap, 0, slotoff); amap_wiperange(amap, slotendoff, amap->am_nslot - slotendoff); } for (i = 0; i < slotoff; i++) { KASSERT(amap->am_anon[i] == NULL); } for (i = slotendoff; i < amap->am_nslot - slotendoff; i++) { KASSERT(amap->am_anon[i] == NULL); } /* * case 1: we already have enough slots in the map and thus * only need to bump the reference counts on the slots we are * adding. */ if (flags & AMAP_EXTEND_FORWARDS) { if (amap->am_nslot >= slotneed) { #ifdef UVM_AMAP_PPREF if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { amap_pp_adjref(amap, slotoff + slotmapped, slotadd, 1); } #endif amap_unlock(amap); UVMHIST_LOG(maphist, "<- done (case 1f), amap = %#jx, sltneed=%jd", (uintptr_t)amap, slotneed, 0, 0); return 0; } } else { if (slotadj <= 0) { slotoff -= slotadd; entry->aref.ar_pageoff = slotoff; #ifdef UVM_AMAP_PPREF if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { amap_pp_adjref(amap, slotoff, slotadd, 1); } #endif amap_unlock(amap); UVMHIST_LOG(maphist, "<- done (case 1b), amap = %#jx, sltneed=%jd", (uintptr_t)amap, slotneed, 0, 0); return 0; } } /* * case 2: we pre-allocated slots for use and we just need to * bump nslot up to take account for these slots. */ if (amap->am_maxslot >= slotneed) { if (flags & AMAP_EXTEND_FORWARDS) { #ifdef UVM_AMAP_PPREF if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { if ((slotoff + slotmapped) < amap->am_nslot) amap_pp_adjref(amap, slotoff + slotmapped, (amap->am_nslot - (slotoff + slotmapped)), 1); pp_setreflen(amap->am_ppref, amap->am_nslot, 1, slotneed - amap->am_nslot); } #endif amap->am_nslot = slotneed; amap_unlock(amap); /* * no need to zero am_anon since that was done at * alloc time and we never shrink an allocation. */ UVMHIST_LOG(maphist,"<- done (case 2f), amap = %#jx, " "slotneed=%jd", (uintptr_t)amap, slotneed, 0, 0); return 0; } else { #ifdef UVM_AMAP_PPREF if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { /* * Slide up the ref counts on the pages that * are actually in use. */ memmove(amap->am_ppref + slotarea, amap->am_ppref + slotoff, slotmapped * sizeof(int)); /* * Mark the (adjusted) gap at the front as * referenced/not referenced. */ pp_setreflen(amap->am_ppref, 0, 0, slotarea - slotadd); pp_setreflen(amap->am_ppref, slotarea - slotadd, 1, slotadd); } #endif /* * Slide the anon pointers up and clear out * the space we just made. */ memmove(amap->am_anon + slotarea, amap->am_anon + slotoff, slotmapped * sizeof(struct vm_anon*)); memset(amap->am_anon + slotoff, 0, (slotarea - slotoff) * sizeof(struct vm_anon *)); /* * Slide the backpointers up, but don't bother * wiping out the old slots. */ memmove(amap->am_bckptr + slotarea, amap->am_bckptr + slotoff, slotmapped * sizeof(int)); /* * Adjust all the useful active slot numbers. */ for (i = 0; i < amap->am_nused; i++) amap->am_slots[i] += (slotarea - slotoff); /* * We just filled all the empty space in the * front of the amap by activating a few new * slots. */ amap->am_nslot = amap->am_maxslot; entry->aref.ar_pageoff = slotarea - slotadd; amap_unlock(amap); UVMHIST_LOG(maphist,"<- done (case 2b), amap = %#jx, " "slotneed=%jd", (uintptr_t)amap, slotneed, 0, 0); return 0; } } /* * Case 3: we need to allocate a new amap and copy all the amap * data over from old amap to the new one. Drop the lock before * performing allocation. * * Note: since allocations are likely big, we expect to reduce the * memory fragmentation by allocating them in separate blocks. 
*/ amap_unlock(amap); if (slotneed >= UVM_AMAP_LARGE) { return E2BIG; } slotalloc = amap_roundup_slots(slotneed); #ifdef UVM_AMAP_PPREF newppref = NULL; if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { /* Will be handled later if fails. */ newppref = kmem_alloc(slotalloc * sizeof(*newppref), kmflags); } #endif newsl = kmem_alloc(slotalloc * sizeof(*newsl), kmflags); newbck = kmem_alloc(slotalloc * sizeof(*newbck), kmflags); newover = kmem_alloc(slotalloc * sizeof(*newover), kmflags); if (newsl == NULL || newbck == NULL || newover == NULL) { #ifdef UVM_AMAP_PPREF if (newppref != NULL) { kmem_free(newppref, slotalloc * sizeof(*newppref)); } #endif if (newsl != NULL) { kmem_free(newsl, slotalloc * sizeof(*newsl)); } if (newbck != NULL) { kmem_free(newbck, slotalloc * sizeof(*newbck)); } if (newover != NULL) { kmem_free(newover, slotalloc * sizeof(*newover)); } return ENOMEM; } amap_lock(amap, RW_WRITER); KASSERT(amap->am_maxslot < slotneed); /* * Copy everything over to new allocated areas. */ slotadded = slotalloc - amap->am_nslot; if (!(flags & AMAP_EXTEND_FORWARDS)) slotarea = slotalloc - slotmapped; /* do am_slots */ oldsl = amap->am_slots; if (flags & AMAP_EXTEND_FORWARDS) memcpy(newsl, oldsl, sizeof(int) * amap->am_nused); else for (i = 0; i < amap->am_nused; i++) newsl[i] = oldsl[i] + slotarea - slotoff; amap->am_slots = newsl; /* do am_anon */ oldover = amap->am_anon; if (flags & AMAP_EXTEND_FORWARDS) { memcpy(newover, oldover, sizeof(struct vm_anon *) * amap->am_nslot); memset(newover + amap->am_nslot, 0, sizeof(struct vm_anon *) * slotadded); } else { memcpy(newover + slotarea, oldover + slotoff, sizeof(struct vm_anon *) * slotmapped); memset(newover, 0, sizeof(struct vm_anon *) * slotarea); } amap->am_anon = newover; /* do am_bckptr */ oldbck = amap->am_bckptr; if (flags & AMAP_EXTEND_FORWARDS) memcpy(newbck, oldbck, sizeof(int) * amap->am_nslot); else memcpy(newbck + slotarea, oldbck + slotoff, sizeof(int) * slotmapped); amap->am_bckptr = newbck; #ifdef UVM_AMAP_PPREF /* do ppref */ oldppref = amap->am_ppref; if (newppref) { if (flags & AMAP_EXTEND_FORWARDS) { memcpy(newppref, oldppref, sizeof(int) * amap->am_nslot); memset(newppref + amap->am_nslot, 0, sizeof(int) * slotadded); } else { memcpy(newppref + slotarea, oldppref + slotoff, sizeof(int) * slotmapped); } amap->am_ppref = newppref; if ((flags & AMAP_EXTEND_FORWARDS) && (slotoff + slotmapped) < amap->am_nslot) amap_pp_adjref(amap, slotoff + slotmapped, (amap->am_nslot - (slotoff + slotmapped)), 1); if (flags & AMAP_EXTEND_FORWARDS) pp_setreflen(newppref, amap->am_nslot, 1, slotneed - amap->am_nslot); else { pp_setreflen(newppref, 0, 0, slotalloc - slotneed); pp_setreflen(newppref, slotalloc - slotneed, 1, slotneed - slotmapped); } } else { if (amap->am_ppref) amap->am_ppref = PPREF_NONE; } #endif /* update master values */ if (flags & AMAP_EXTEND_FORWARDS) amap->am_nslot = slotneed; else { entry->aref.ar_pageoff = slotarea - slotadd; amap->am_nslot = slotalloc; } oldnslots = amap->am_maxslot; amap->am_maxslot = slotalloc; amap_unlock(amap); kmem_free(oldsl, oldnslots * sizeof(*oldsl)); kmem_free(oldbck, oldnslots * sizeof(*oldbck)); kmem_free(oldover, oldnslots * sizeof(*oldover)); #ifdef UVM_AMAP_PPREF if (oldppref && oldppref != PPREF_NONE) kmem_free(oldppref, oldnslots * sizeof(*oldppref)); #endif UVMHIST_LOG(maphist,"<- done (case 3), amap = %#jx, slotneed=%jd", (uintptr_t)amap, slotneed, 0, 0); return 0; } /* * amap_share_protect: change protection of anons in a shared amap * * for shared amaps, given the current 
data structure layout, it is * not possible for us to directly locate all maps referencing the * shared anon (to change the protection). in order to protect data * in shared maps we use pmap_page_protect(). [this is useful for IPC * mechanisms like map entry passing that may want to write-protect * all mappings of a shared amap.] we traverse am_anon or am_slots * depending on the current state of the amap. * * => entry's map and amap must be locked by the caller */ void amap_share_protect(struct vm_map_entry *entry, vm_prot_t prot) { struct vm_amap *amap = entry->aref.ar_amap; u_int slots, lcv, slot, stop; struct vm_anon *anon; KASSERT(rw_write_held(amap->am_lock)); AMAP_B2SLOT(slots, (entry->end - entry->start)); stop = entry->aref.ar_pageoff + slots; if (slots < amap->am_nused) { /* * Cheaper to traverse am_anon. */ for (lcv = entry->aref.ar_pageoff ; lcv < stop ; lcv++) { anon = amap->am_anon[lcv]; if (anon == NULL) { continue; } if (anon->an_page) { pmap_page_protect(anon->an_page, prot); } } return; } /* * Cheaper to traverse am_slots. */ for (lcv = 0 ; lcv < amap->am_nused ; lcv++) { slot = amap->am_slots[lcv]; if (slot < entry->aref.ar_pageoff || slot >= stop) { continue; } anon = amap->am_anon[slot]; if (anon->an_page) { pmap_page_protect(anon->an_page, prot); } } } /* * amap_wipeout: wipeout all anon's in an amap; then free the amap! * * => Called from amap_unref(), when reference count drops to zero. * => amap must be locked. */ void amap_wipeout(struct vm_amap *amap) { u_int lcv; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(amap=%#jx)", (uintptr_t)amap, 0,0,0); KASSERT(rw_write_held(amap->am_lock)); KASSERT(amap->am_ref == 0); if (__predict_false(amap->am_flags & AMAP_SWAPOFF)) { /* * Note: amap_swap_off() will call us again. */ amap_unlock(amap); return; } for (lcv = 0 ; lcv < amap->am_nused ; lcv++) { struct vm_anon *anon; u_int slot; slot = amap->am_slots[lcv]; anon = amap->am_anon[slot]; KASSERT(anon != NULL); KASSERT(anon->an_ref != 0); KASSERT(anon->an_lock == amap->am_lock); UVMHIST_LOG(maphist," processing anon %#jx, ref=%jd", (uintptr_t)anon, anon->an_ref, 0, 0); /* * Drop the reference. */ if (__predict_true(--anon->an_ref == 0)) { uvm_anfree(anon); } if (__predict_false((lcv & 31) == 31)) { preempt_point(); } } /* * Finally, destroy the amap. */ amap->am_nused = 0; amap_unlock(amap); amap_free(amap); UVMHIST_LOG(maphist,"<- done!", 0,0,0,0); } /* * amap_copy: ensure that a map entry's "needs_copy" flag is false * by copying the amap if necessary. * * => an entry with a null amap pointer will get a new (blank) one. * => the map that the map entry belongs to must be locked by caller. * => the amap currently attached to "entry" (if any) must be unlocked. * => if canchunk is true, then we may clip the entry into a chunk * => "startva" and "endva" are used only if canchunk is true. they are * used to limit chunking (e.g. if you have a large space that you * know you are going to need to allocate amaps for, there is no point * in allowing that to be chunked) */ void amap_copy(struct vm_map *map, struct vm_map_entry *entry, int flags, vaddr_t startva, vaddr_t endva) { const int waitf = (flags & AMAP_COPY_NOWAIT) ? 
UVM_FLAG_NOWAIT : 0; struct vm_amap *amap, *srcamap; u_int slots, lcv; krwlock_t *oldlock; vsize_t len; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, " (map=%#jx, entry=%#jx, flags=%#jx)", (uintptr_t)map, (uintptr_t)entry, flags, -2); KASSERT(map != kernel_map); /* we use nointr pool */ srcamap = entry->aref.ar_amap; len = entry->end - entry->start; /* * Is there an amap to copy? If not, create one. */ if (srcamap == NULL) { const bool canchunk = (flags & AMAP_COPY_NOCHUNK) == 0; /* * Check to see if we have a large amap that we can * chunk. We align startva/endva to chunk-sized * boundaries and then clip to them. */ if (canchunk && atop(len) >= UVM_AMAP_LARGE) { vsize_t chunksize; /* Convert slots to bytes. */ chunksize = UVM_AMAP_CHUNK << PAGE_SHIFT; startva = (startva / chunksize) * chunksize; endva = roundup(endva, chunksize); UVMHIST_LOG(maphist, " chunk amap ==> clip %#jx->%#jx to %#jx->%#jx", entry->start, entry->end, startva, endva); UVM_MAP_CLIP_START(map, entry, startva); /* Watch out for endva wrap-around! */ if (endva >= startva) { UVM_MAP_CLIP_END(map, entry, endva); } } if ((flags & AMAP_COPY_NOMERGE) == 0 && uvm_mapent_trymerge(map, entry, UVM_MERGE_COPYING)) { return; } UVMHIST_LOG(maphist, "<- done [creating new amap %#jx->%#jx]", entry->start, entry->end, 0, 0); /* * Allocate an initialised amap and install it. * Note: we must update the length after clipping. */ len = entry->end - entry->start; entry->aref.ar_pageoff = 0; entry->aref.ar_amap = amap_alloc(len, 0, waitf); if (entry->aref.ar_amap != NULL) { entry->etype &= ~UVM_ET_NEEDSCOPY; } return; } /* * First check and see if we are the only map entry referencing * he amap we currently have. If so, then just take it over instead * of copying it. Note that we are reading am_ref without lock held * as the value can only be one if we have the only reference * to the amap (via our locked map). If the value is greater than * one, then allocate amap and re-check the value. */ if (srcamap->am_ref == 1) { entry->etype &= ~UVM_ET_NEEDSCOPY; UVMHIST_LOG(maphist, "<- done [ref cnt = 1, took it over]", 0, 0, 0, 0); return; } UVMHIST_LOG(maphist," amap=%#jx, ref=%jd, must copy it", (uintptr_t)srcamap, srcamap->am_ref, 0, 0); /* * Allocate a new amap (note: not initialised, etc). */ AMAP_B2SLOT(slots, len); amap = amap_alloc1(slots, 0, waitf); if (amap == NULL) { UVMHIST_LOG(maphist, " amap_alloc1 failed", 0,0,0,0); return; } /* * Make the new amap share the source amap's lock, and then lock * both. We must do this before we set am_nused != 0, otherwise * amap_swap_off() can become interested in the amap. */ oldlock = amap->am_lock; mutex_enter(&amap_list_lock); amap->am_lock = srcamap->am_lock; mutex_exit(&amap_list_lock); rw_obj_hold(amap->am_lock); rw_obj_free(oldlock); amap_lock(srcamap, RW_WRITER); /* * Re-check the reference count with the lock held. If it has * dropped to one - we can take over the existing map. */ if (srcamap->am_ref == 1) { /* Just take over the existing amap. */ entry->etype &= ~UVM_ET_NEEDSCOPY; amap_unlock(srcamap); /* Destroy the new (unused) amap. */ amap->am_ref--; amap_free(amap); return; } /* * Copy the slots. Zero the padded part. 
*/ UVMHIST_LOG(maphist, " copying amap now",0, 0, 0, 0); for (lcv = 0 ; lcv < slots; lcv++) { amap->am_anon[lcv] = srcamap->am_anon[entry->aref.ar_pageoff + lcv]; if (amap->am_anon[lcv] == NULL) continue; KASSERT(amap->am_anon[lcv]->an_lock == srcamap->am_lock); KASSERT(amap->am_anon[lcv]->an_ref > 0); KASSERT(amap->am_nused < amap->am_maxslot); amap->am_anon[lcv]->an_ref++; amap->am_bckptr[lcv] = amap->am_nused; amap->am_slots[amap->am_nused] = lcv; amap->am_nused++; } memset(&amap->am_anon[lcv], 0, (amap->am_maxslot - lcv) * sizeof(struct vm_anon *)); /* * Drop our reference to the old amap (srcamap) and unlock. * Since the reference count on srcamap is greater than one, * (we checked above), it cannot drop to zero while it is locked. */ srcamap->am_ref--; KASSERT(srcamap->am_ref > 0); if (srcamap->am_ref == 1 && (srcamap->am_flags & AMAP_SHARED) != 0) { srcamap->am_flags &= ~AMAP_SHARED; } #ifdef UVM_AMAP_PPREF if (srcamap->am_ppref && srcamap->am_ppref != PPREF_NONE) { amap_pp_adjref(srcamap, entry->aref.ar_pageoff, len >> PAGE_SHIFT, -1); } #endif amap_unlock(srcamap); /* * Install new amap. */ entry->aref.ar_pageoff = 0; entry->aref.ar_amap = amap; entry->etype &= ~UVM_ET_NEEDSCOPY; UVMHIST_LOG(maphist, "<- done",0, 0, 0, 0); } /* * amap_cow_now: resolve all copy-on-write faults in an amap now for fork(2) * * called during fork(2) when the parent process has a wired map * entry. in that case we want to avoid write-protecting pages * in the parent's map (e.g. like what you'd do for a COW page) * so we resolve the COW here. * * => assume parent's entry was wired, thus all pages are resident. * => assume pages that are loaned out (loan_count) are already mapped * read-only in all maps, and thus no need for us to worry about them * => assume both parent and child vm_map's are locked * => caller passes child's map/entry in to us * => if we run out of memory we will unlock the amap and sleep _with_ the * parent and child vm_map's locked(!). we have to do this since * we are in the middle of a fork(2) and we can't let the parent * map change until we are done copying all the map entrys. * => XXXCDC: out of memory should cause fork to fail, but there is * currently no easy way to do this (needs fix) */ void amap_cow_now(struct vm_map *map, struct vm_map_entry *entry) { struct vm_amap *amap = entry->aref.ar_amap; struct vm_anon *anon, *nanon; struct vm_page *pg, *npg; u_int lcv, slot; /* * note that if we unlock the amap then we must ReStart the "lcv" for * loop because some other process could reorder the anon's in the * am_anon[] array on us while the lock is dropped. */ ReStart: amap_lock(amap, RW_WRITER); for (lcv = 0 ; lcv < amap->am_nused ; lcv++) { slot = amap->am_slots[lcv]; anon = amap->am_anon[slot]; KASSERT(anon->an_lock == amap->am_lock); /* * If anon has only one reference - we must have already * copied it. This can happen if we needed to sleep waiting * for memory in a previous run through this loop. The new * page might even have been paged out, since is not wired. */ if (anon->an_ref == 1) { KASSERT(anon->an_page != NULL || anon->an_swslot != 0); continue; } /* * The old page must be resident since the parent is wired. */ pg = anon->an_page; KASSERT(pg != NULL); KASSERT(pg->wire_count > 0); /* * If the page is loaned then it must already be mapped * read-only and we don't need to copy it. */ if (pg->loan_count != 0) { continue; } KASSERT(pg->uanon == anon); KASSERT(pg->uobject == NULL); /* * If the page is busy, then we have to unlock, wait for * it and then restart. 
*/ if (pg->flags & PG_BUSY) { uvm_pagewait(pg, amap->am_lock, "cownow"); goto ReStart; } /* * Perform a copy-on-write. * First - get a new anon and a page. */ nanon = uvm_analloc(); if (nanon) { nanon->an_lock = amap->am_lock; npg = uvm_pagealloc(NULL, 0, nanon, 0); } else { npg = NULL; } if (nanon == NULL || npg == NULL) { amap_unlock(amap); if (nanon) { nanon->an_lock = NULL; nanon->an_ref--; KASSERT(nanon->an_ref == 0); uvm_anfree(nanon); } uvm_wait("cownowpage"); goto ReStart; } /* * Copy the data and replace anon with the new one. * Also, setup its lock (share the with amap's lock). */ uvm_pagecopy(pg, npg); anon->an_ref--; KASSERT(anon->an_ref > 0); amap->am_anon[slot] = nanon; /* * Drop PG_BUSY on new page. Since its owner was write * locked all this time - it cannot be PG_RELEASED or * waited on. */ uvm_pagelock(npg); uvm_pageactivate(npg); uvm_pageunlock(npg); npg->flags &= ~(PG_BUSY|PG_FAKE); UVM_PAGE_OWN(npg, NULL); } amap_unlock(amap); } /* * amap_splitref: split a single reference into two separate references * * => called from uvm_map's clip routines * => origref's map should be locked * => origref->ar_amap should be unlocked (we will lock) */ void amap_splitref(struct vm_aref *origref, struct vm_aref *splitref, vaddr_t offset) { struct vm_amap *amap = origref->ar_amap; u_int leftslots; KASSERT(splitref->ar_amap == origref->ar_amap); AMAP_B2SLOT(leftslots, offset); KASSERT(leftslots != 0); amap_lock(amap, RW_WRITER); KASSERT(amap->am_nslot - origref->ar_pageoff - leftslots > 0); #ifdef UVM_AMAP_PPREF /* Establish ppref before we add a duplicate reference to the amap. */ if (amap->am_ppref == NULL) { amap_pp_establish(amap, origref->ar_pageoff); } #endif /* Note: not a share reference. */ amap->am_ref++; splitref->ar_pageoff = origref->ar_pageoff + leftslots; amap_unlock(amap); } #ifdef UVM_AMAP_PPREF /* * amap_pp_establish: add a ppref array to an amap, if possible. * * => amap should be locked by caller. */ void amap_pp_establish(struct vm_amap *amap, vaddr_t offset) { const size_t sz = amap->am_maxslot * sizeof(*amap->am_ppref); KASSERT(rw_write_held(amap->am_lock)); amap->am_ppref = kmem_zalloc(sz, KM_NOSLEEP); if (amap->am_ppref == NULL) { /* Failure - just do not use ppref. */ amap->am_ppref = PPREF_NONE; return; } pp_setreflen(amap->am_ppref, 0, 0, offset); pp_setreflen(amap->am_ppref, offset, amap->am_ref, amap->am_nslot - offset); } /* * amap_pp_adjref: adjust reference count to a part of an amap using the * per-page reference count array. * * => caller must check that ppref != PPREF_NONE before calling. * => map and amap must be locked. */ void amap_pp_adjref(struct vm_amap *amap, int curslot, vsize_t slotlen, int adjval) { int stopslot, *ppref, lcv, prevlcv; int ref, len, prevref, prevlen; KASSERT(rw_write_held(amap->am_lock)); stopslot = curslot + slotlen; ppref = amap->am_ppref; prevlcv = 0; /* * Advance to the correct place in the array, fragment if needed. */ for (lcv = 0 ; lcv < curslot ; lcv += len) { pp_getreflen(ppref, lcv, &ref, &len); if (lcv + len > curslot) { /* goes past start? */ pp_setreflen(ppref, lcv, ref, curslot - lcv); pp_setreflen(ppref, curslot, ref, len - (curslot -lcv)); len = curslot - lcv; /* new length of entry @ lcv */ } prevlcv = lcv; } if (lcv == 0) { /* * Ensure that the "prevref == ref" test below always * fails, since we are starting from the beginning of * the ppref array; that is, there is no previous chunk. 
*/ prevref = -1; prevlen = 0; } else { pp_getreflen(ppref, prevlcv, &prevref, &prevlen); } /* * Now adjust reference counts in range. Merge the first * changed entry with the last unchanged entry if possible. */ KASSERT(lcv == curslot); for (/* lcv already set */; lcv < stopslot ; lcv += len) { pp_getreflen(ppref, lcv, &ref, &len); if (lcv + len > stopslot) { /* goes past end? */ pp_setreflen(ppref, lcv, ref, stopslot - lcv); pp_setreflen(ppref, stopslot, ref, len - (stopslot - lcv)); len = stopslot - lcv; } ref += adjval; KASSERT(ref >= 0); KASSERT(ref <= amap->am_ref); if (lcv == prevlcv + prevlen && ref == prevref) { pp_setreflen(ppref, prevlcv, ref, prevlen + len); } else { pp_setreflen(ppref, lcv, ref, len); } if (ref == 0) { amap_wiperange(amap, lcv, len); } } } /* * amap_wiperange: wipe out a range of an amap. * Note: different from amap_wipeout because the amap is kept intact. * * => Both map and amap must be locked by caller. */ void amap_wiperange(struct vm_amap *amap, int slotoff, int slots) { u_int lcv, stop, slotend; bool byanon; KASSERT(rw_write_held(amap->am_lock)); /* * We can either traverse the amap by am_anon or by am_slots. * Determine which way is less expensive. */ if (slots < amap->am_nused) { byanon = true; lcv = slotoff; stop = slotoff + slots; slotend = 0; } else { byanon = false; lcv = 0; stop = amap->am_nused; slotend = slotoff + slots; } while (lcv < stop) { struct vm_anon *anon; u_int curslot, ptr, last; if (byanon) { curslot = lcv++; /* lcv advances here */ if (amap->am_anon[curslot] == NULL) continue; } else { curslot = amap->am_slots[lcv]; if (curslot < slotoff || curslot >= slotend) { lcv++; /* lcv advances here */ continue; } stop--; /* drop stop, since anon will be removed */ } anon = amap->am_anon[curslot]; KASSERT(anon->an_lock == amap->am_lock); /* * Remove anon from the amap. */ amap->am_anon[curslot] = NULL; ptr = amap->am_bckptr[curslot]; last = amap->am_nused - 1; if (ptr != last) { amap->am_slots[ptr] = amap->am_slots[last]; amap->am_bckptr[amap->am_slots[ptr]] = ptr; } amap->am_nused--; /* * Drop its reference count. */ KASSERT(anon->an_lock == amap->am_lock); if (--anon->an_ref == 0) { uvm_anfree(anon); } } } #endif #if defined(VMSWAP) /* * amap_swap_off: pagein anonymous pages in amaps and drop swap slots. * * => called with swap_syscall_lock held. * => note that we don't always traverse all anons. * eg. amaps being wiped out, released anons. * => return true if failed. */ bool amap_swap_off(int startslot, int endslot) { struct vm_amap *am; struct vm_amap *am_next; struct vm_amap marker_prev; struct vm_amap marker_next; bool rv = false; #if defined(DIAGNOSTIC) memset(&marker_prev, 0, sizeof(marker_prev)); memset(&marker_next, 0, sizeof(marker_next)); #endif /* defined(DIAGNOSTIC) */ mutex_enter(&amap_list_lock); for (am = LIST_FIRST(&amap_list); am != NULL && !rv; am = am_next) { int i; LIST_INSERT_BEFORE(am, &marker_prev, am_list); LIST_INSERT_AFTER(am, &marker_next, am_list); /* amap_list_lock prevents the lock pointer from changing. */ if (!amap_lock_try(am, RW_WRITER)) { (void)kpause("amapswpo", false, 1, &amap_list_lock); am_next = LIST_NEXT(&marker_prev, am_list); if (am_next == &marker_next) { am_next = LIST_NEXT(am_next, am_list); } else { KASSERT(LIST_NEXT(am_next, am_list) == &marker_next); } LIST_REMOVE(&marker_prev, am_list); LIST_REMOVE(&marker_next, am_list); continue; } mutex_exit(&amap_list_lock); /* If am_nused == 0, the amap could be free - careful. 
*/ for (i = 0; i < am->am_nused; i++) { int slot; int swslot; struct vm_anon *anon; slot = am->am_slots[i]; anon = am->am_anon[slot]; KASSERT(anon->an_lock == am->am_lock); swslot = anon->an_swslot; if (swslot < startslot || endslot <= swslot) { continue; } am->am_flags |= AMAP_SWAPOFF; rv = uvm_anon_pagein(am, anon); amap_lock(am, RW_WRITER); am->am_flags &= ~AMAP_SWAPOFF; if (amap_refs(am) == 0) { amap_wipeout(am); am = NULL; break; } if (rv) { break; } i = 0; } if (am) { amap_unlock(am); } mutex_enter(&amap_list_lock); KASSERT(LIST_NEXT(&marker_prev, am_list) == &marker_next || LIST_NEXT(LIST_NEXT(&marker_prev, am_list), am_list) == &marker_next); am_next = LIST_NEXT(&marker_next, am_list); LIST_REMOVE(&marker_prev, am_list); LIST_REMOVE(&marker_next, am_list); } mutex_exit(&amap_list_lock); return rv; } #endif /* defined(VMSWAP) */ /* * amap_lookup: look up a page in an amap. * * => amap should be locked by caller. */ struct vm_anon * amap_lookup(struct vm_aref *aref, vaddr_t offset) { struct vm_amap *amap = aref->ar_amap; struct vm_anon *an; u_int slot; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(rw_lock_held(amap->am_lock)); AMAP_B2SLOT(slot, offset); slot += aref->ar_pageoff; an = amap->am_anon[slot]; UVMHIST_LOG(maphist, "<- done (amap=%#jx, offset=%#jx, result=%#jx)", (uintptr_t)amap, offset, (uintptr_t)an, 0); KASSERT(slot < amap->am_nslot); KASSERT(an == NULL || an->an_ref != 0); KASSERT(an == NULL || an->an_lock == amap->am_lock); return an; } /* * amap_lookups: look up a range of pages in an amap. * * => amap should be locked by caller. */ void amap_lookups(struct vm_aref *aref, vaddr_t offset, struct vm_anon **anons, int npages) { struct vm_amap *amap = aref->ar_amap; u_int slot; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(rw_lock_held(amap->am_lock)); AMAP_B2SLOT(slot, offset); slot += aref->ar_pageoff; UVMHIST_LOG(maphist, " slot=%u, npages=%d, nslot=%d", slot, npages, amap->am_nslot, 0); KASSERT((slot + (npages - 1)) < amap->am_nslot); memcpy(anons, &amap->am_anon[slot], npages * sizeof(struct vm_anon *)); #if defined(DIAGNOSTIC) for (int i = 0; i < npages; i++) { struct vm_anon * const an = anons[i]; if (an == NULL) { continue; } KASSERT(an->an_ref != 0); KASSERT(an->an_lock == amap->am_lock); } #endif UVMHIST_LOG(maphist, "<- done", 0, 0, 0, 0); } /* * amap_add: add (or replace) a page to an amap. * * => amap should be locked by caller. * => anon must have the lock associated with this amap. */ void amap_add(struct vm_aref *aref, vaddr_t offset, struct vm_anon *anon, bool replace) { struct vm_amap *amap = aref->ar_amap; u_int slot; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(rw_write_held(amap->am_lock)); KASSERT(anon->an_lock == amap->am_lock); AMAP_B2SLOT(slot, offset); slot += aref->ar_pageoff; KASSERT(slot < amap->am_nslot); if (replace) { struct vm_anon *oanon = amap->am_anon[slot]; KASSERT(oanon != NULL); if (oanon->an_page && (amap->am_flags & AMAP_SHARED) != 0) { pmap_page_protect(oanon->an_page, VM_PROT_NONE); /* * XXX: suppose page is supposed to be wired somewhere? */ } } else { KASSERT(amap->am_anon[slot] == NULL); KASSERT(amap->am_nused < amap->am_maxslot); amap->am_bckptr[slot] = amap->am_nused; amap->am_slots[amap->am_nused] = slot; amap->am_nused++; } amap->am_anon[slot] = anon; UVMHIST_LOG(maphist, "<- done (amap=%#jx, offset=%#x, anon=%#jx, rep=%d)", (uintptr_t)amap, offset, (uintptr_t)anon, replace); } /* * amap_unadd: remove a page from an amap. * * => amap should be locked by caller. 
*/ void amap_unadd(struct vm_aref *aref, vaddr_t offset) { struct vm_amap *amap = aref->ar_amap; u_int slot, ptr, last; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(rw_write_held(amap->am_lock)); AMAP_B2SLOT(slot, offset); slot += aref->ar_pageoff; KASSERT(slot < amap->am_nslot); KASSERT(amap->am_anon[slot] != NULL); KASSERT(amap->am_anon[slot]->an_lock == amap->am_lock); amap->am_anon[slot] = NULL; ptr = amap->am_bckptr[slot]; last = amap->am_nused - 1; if (ptr != last) { /* Move the last entry to keep the slots contiguous. */ amap->am_slots[ptr] = amap->am_slots[last]; amap->am_bckptr[amap->am_slots[ptr]] = ptr; } amap->am_nused--; UVMHIST_LOG(maphist, "<- done (amap=%#jx, slot=%#jx)", (uintptr_t)amap, slot,0, 0); } /* * amap_adjref_anons: adjust the reference count(s) on amap and its anons. */ static void amap_adjref_anons(struct vm_amap *amap, vaddr_t offset, vsize_t len, int refv, bool all) { #ifdef UVM_AMAP_PPREF KASSERT(rw_write_held(amap->am_lock)); /* * We must establish the ppref array before changing am_ref * so that the ppref values match the current amap refcount. */ if (amap->am_ppref == NULL) { amap_pp_establish(amap, offset); } #endif amap->am_ref += refv; #ifdef UVM_AMAP_PPREF if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { amap_pp_adjref(amap, offset, len, refv); } #endif amap_unlock(amap); } /* * amap_ref: gain a reference to an amap. * * => amap must not be locked (we will lock). * => "offset" and "len" are in units of pages. * => Called at fork time to gain the child's reference. */ void amap_ref(struct vm_amap *amap, vaddr_t offset, vsize_t len, int flags) { UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); amap_lock(amap, RW_WRITER); if (flags & AMAP_SHARED) { amap->am_flags |= AMAP_SHARED; } amap_adjref_anons(amap, offset, len, 1, (flags & AMAP_REFALL) != 0); UVMHIST_LOG(maphist,"<- done! amap=%#jx", (uintptr_t)amap, 0, 0, 0); } /* * amap_unref: remove a reference to an amap. * * => All pmap-level references to this amap must be already removed. * => Called from uvm_unmap_detach(); entry is already removed from the map. * => We will lock amap, so it must be unlocked. */ void amap_unref(struct vm_amap *amap, vaddr_t offset, vsize_t len, bool all) { UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); amap_lock(amap, RW_WRITER); UVMHIST_LOG(maphist," amap=%#jx refs=%d, nused=%d", (uintptr_t)amap, amap->am_ref, amap->am_nused, 0); KASSERT(amap->am_ref > 0); if (amap->am_ref == 1) { /* * If the last reference - wipeout and destroy the amap. */ amap->am_ref--; amap_wipeout(amap); UVMHIST_LOG(maphist,"<- done (was last ref)!", 0, 0, 0, 0); return; } /* * Otherwise, drop the reference count(s) on anons. */ if (amap->am_ref == 2 && (amap->am_flags & AMAP_SHARED) != 0) { amap->am_flags &= ~AMAP_SHARED; } amap_adjref_anons(amap, offset, len, -1, all); UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0); }
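/*
 * Illustrative sketch (not part of uvm_amap.c): the am_slots[]/am_bckptr[]
 * bookkeeping above keeps the set of used slots dense, so loops such as
 * amap_cow_now() and amap_swap_off() only visit am_nused entries.  The
 * standalone userland model below, with hypothetical names and a fixed-size
 * array, shows the same "swap with the last entry" removal trick used by
 * amap_unadd(); it compiles on its own and is only meant to make the
 * invariant easier to see.
 */
#include <assert.h>

#define NSLOT	8

struct slotmap {
	int anon[NSLOT];	/* 0 = empty, non-zero = "anon" present */
	int slots[NSLOT];	/* dense list of used slot numbers */
	int bckptr[NSLOT];	/* slot -> index into slots[] */
	int nused;
};

static void
slotmap_add(struct slotmap *sm, int slot, int val)
{

	assert(sm->anon[slot] == 0 && sm->nused < NSLOT);
	sm->anon[slot] = val;
	sm->bckptr[slot] = sm->nused;	/* remember our index in slots[] */
	sm->slots[sm->nused++] = slot;
}

static void
slotmap_remove(struct slotmap *sm, int slot)
{
	int ptr = sm->bckptr[slot];
	int last = sm->nused - 1;

	assert(sm->anon[slot] != 0);
	sm->anon[slot] = 0;
	if (ptr != last) {
		/* Move the last dense entry into the hole, fix its back pointer. */
		sm->slots[ptr] = sm->slots[last];
		sm->bckptr[sm->slots[ptr]] = ptr;
	}
	sm->nused--;
}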
/* $NetBSD: pfil.c,v 1.42 2022/08/16 04:35:57 knakahara Exp $ */ /* * Copyright (c) 2013 Mindaugas Rasiukevicius <rmind at NetBSD org> * Copyright (c) 1996 Matthew R. Green * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: pfil.c,v 1.42 2022/08/16 04:35:57 knakahara Exp $"); #if defined(_KERNEL_OPT) #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/queue.h> #include <sys/kmem.h> #include <sys/psref.h> #include <sys/cpu.h> #include <net/if.h> #include <net/pfil.h> #define MAX_HOOKS 8 /* Func is either pfil_func_t or pfil_ifunc_t. */ typedef void (*pfil_polyfunc_t)(void); typedef struct { pfil_polyfunc_t pfil_func; void * pfil_arg; } pfil_hook_t; typedef struct { pfil_hook_t hooks[MAX_HOOKS]; u_int nhooks; struct psref_target psref; } pfil_list_t; typedef struct { pfil_list_t *active; /* lists[0] or lists[1] */ pfil_list_t lists[2]; } pfil_listset_t; CTASSERT(PFIL_IN == 1); CTASSERT(PFIL_OUT == 2); struct pfil_head { pfil_listset_t ph_in; pfil_listset_t ph_out; pfil_listset_t ph_ifaddr; pfil_listset_t ph_ifevent; int ph_type; void * ph_key; LIST_ENTRY(pfil_head) ph_list; }; static const int pfil_flag_cases[] = { PFIL_IN, PFIL_OUT }; static LIST_HEAD(, pfil_head) pfil_head_list __read_mostly = LIST_HEAD_INITIALIZER(&pfil_head_list); static kmutex_t pfil_mtx __cacheline_aligned; static struct psref_class *pfil_psref_class __read_mostly; #ifdef NET_MPSAFE static pserialize_t pfil_psz; #endif void pfil_init(void) { mutex_init(&pfil_mtx, MUTEX_DEFAULT, IPL_NONE); #ifdef NET_MPSAFE pfil_psz = pserialize_create(); #endif pfil_psref_class = psref_class_create("pfil", IPL_SOFTNET); } static inline void pfil_listset_init(pfil_listset_t *pflistset) { pflistset->active = &pflistset->lists[0]; psref_target_init(&pflistset->active->psref, pfil_psref_class); } /* * pfil_head_create: create and register a packet filter head. */ pfil_head_t * pfil_head_create(int type, void *key) { pfil_head_t *ph; if (pfil_head_get(type, key)) { return NULL; } ph = kmem_zalloc(sizeof(pfil_head_t), KM_SLEEP); ph->ph_type = type; ph->ph_key = key; pfil_listset_init(&ph->ph_in); pfil_listset_init(&ph->ph_out); pfil_listset_init(&ph->ph_ifaddr); pfil_listset_init(&ph->ph_ifevent); LIST_INSERT_HEAD(&pfil_head_list, ph, ph_list); return ph; } /* * pfil_head_destroy: remove and destroy a packet filter head. */ void pfil_head_destroy(pfil_head_t *pfh) { LIST_REMOVE(pfh, ph_list); psref_target_destroy(&pfh->ph_in.active->psref, pfil_psref_class); psref_target_destroy(&pfh->ph_out.active->psref, pfil_psref_class); psref_target_destroy(&pfh->ph_ifaddr.active->psref, pfil_psref_class); psref_target_destroy(&pfh->ph_ifevent.active->psref, pfil_psref_class); kmem_free(pfh, sizeof(pfil_head_t)); } /* * pfil_head_get: returns the packer filter head for a given key. */ pfil_head_t * pfil_head_get(int type, void *key) { pfil_head_t *ph; LIST_FOREACH(ph, &pfil_head_list, ph_list) { if (ph->ph_type == type && ph->ph_key == key) break; } return ph; } static pfil_listset_t * pfil_hook_get(int dir, pfil_head_t *ph) { switch (dir) { case PFIL_IN: return &ph->ph_in; case PFIL_OUT: return &ph->ph_out; case PFIL_IFADDR: return &ph->ph_ifaddr; case PFIL_IFNET: return &ph->ph_ifevent; } return NULL; } static int pfil_list_add(pfil_listset_t *phlistset, pfil_polyfunc_t func, void *arg, int flags) { u_int nhooks; pfil_list_t *newlist, *oldlist; pfil_hook_t *pfh; mutex_enter(&pfil_mtx); /* Check if we have a free slot. 
*/ nhooks = phlistset->active->nhooks; if (nhooks == MAX_HOOKS) { mutex_exit(&pfil_mtx); return ENOSPC; } KASSERT(nhooks < MAX_HOOKS); if (phlistset->active == &phlistset->lists[0]) { oldlist = &phlistset->lists[0]; newlist = &phlistset->lists[1]; } else{ oldlist = &phlistset->lists[1]; newlist = &phlistset->lists[0]; } /* Make sure the hook is not already added. */ for (u_int i = 0; i < nhooks; i++) { pfh = &oldlist->hooks[i]; if (pfh->pfil_func == func && pfh->pfil_arg == arg) { mutex_exit(&pfil_mtx); return EEXIST; } } /* create new pfil_list_t copied from old */ memcpy(newlist, oldlist, sizeof(pfil_list_t)); psref_target_init(&newlist->psref, pfil_psref_class); /* * Finally, add the hook. Note: for PFIL_IN we insert the hooks in * reverse order of the PFIL_OUT so that the same path is followed * in or out of the kernel. */ if (flags & PFIL_IN) { /* XXX: May want to revisit this later; */ size_t len = sizeof(pfil_hook_t) * nhooks; pfh = &newlist->hooks[0]; memmove(&newlist->hooks[1], pfh, len); } else { pfh = &newlist->hooks[nhooks]; } newlist->nhooks++; pfh->pfil_func = func; pfh->pfil_arg = arg; /* switch from oldlist to newlist */ atomic_store_release(&phlistset->active, newlist); #ifdef NET_MPSAFE pserialize_perform(pfil_psz); #endif mutex_exit(&pfil_mtx); /* Wait for all readers */ #ifdef NET_MPSAFE psref_target_destroy(&oldlist->psref, pfil_psref_class); #endif return 0; } /* * pfil_add_hook: add a function (hook) to the packet filter head. * The possible flags are: * * PFIL_IN call on incoming packets * PFIL_OUT call on outgoing packets * PFIL_ALL call on all of the above */ int pfil_add_hook(pfil_func_t func, void *arg, int flags, pfil_head_t *ph) { int error = 0; KASSERT(func != NULL); KASSERT((flags & ~PFIL_ALL) == 0); ASSERT_SLEEPABLE(); for (u_int i = 0; i < __arraycount(pfil_flag_cases); i++) { const int fcase = pfil_flag_cases[i]; pfil_listset_t *phlistset; if ((flags & fcase) == 0) { continue; } phlistset = pfil_hook_get(fcase, ph); error = pfil_list_add(phlistset, (pfil_polyfunc_t)func, arg, flags); if (error && (error != EEXIST)) break; } if (error && (error != EEXIST)) { pfil_remove_hook(func, arg, flags, ph); } return error; } /* * pfil_add_ihook: add an interface-event function (hook) to the packet * filter head. The possible flags are: * * PFIL_IFADDR call on interface reconfig (cmd is ioctl #) * PFIL_IFNET call on interface attach/detach (cmd is PFIL_IFNET_*) */ int pfil_add_ihook(pfil_ifunc_t func, void *arg, int flags, pfil_head_t *ph) { pfil_listset_t *phlistset; KASSERT(func != NULL); KASSERT(flags == PFIL_IFADDR || flags == PFIL_IFNET); ASSERT_SLEEPABLE(); phlistset = pfil_hook_get(flags, ph); return pfil_list_add(phlistset, (pfil_polyfunc_t)func, arg, flags); } /* * pfil_list_remove: remove the hook from a specified list. 
*/ static int pfil_list_remove(pfil_listset_t *phlistset, pfil_polyfunc_t func, void *arg) { u_int nhooks; pfil_list_t *oldlist, *newlist; mutex_enter(&pfil_mtx); /* create new pfil_list_t copied from old */ if (phlistset->active == &phlistset->lists[0]) { oldlist = &phlistset->lists[0]; newlist = &phlistset->lists[1]; } else{ oldlist = &phlistset->lists[1]; newlist = &phlistset->lists[0]; } memcpy(newlist, oldlist, sizeof(*newlist)); psref_target_init(&newlist->psref, pfil_psref_class); nhooks = newlist->nhooks; for (u_int i = 0; i < nhooks; i++) { pfil_hook_t *last, *pfh = &newlist->hooks[i]; if (pfh->pfil_func != func || pfh->pfil_arg != arg) { continue; } if ((last = &newlist->hooks[nhooks - 1]) != pfh) { memcpy(pfh, last, sizeof(pfil_hook_t)); } newlist->nhooks--; /* switch from oldlist to newlist */ atomic_store_release(&phlistset->active, newlist); #ifdef NET_MPSAFE pserialize_perform(pfil_psz); #endif mutex_exit(&pfil_mtx); /* Wait for all readers */ #ifdef NET_MPSAFE psref_target_destroy(&oldlist->psref, pfil_psref_class); #endif return 0; } mutex_exit(&pfil_mtx); return ENOENT; } /* * pfil_remove_hook: remove the hook from the packet filter head. */ int pfil_remove_hook(pfil_func_t func, void *arg, int flags, pfil_head_t *ph) { KASSERT((flags & ~PFIL_ALL) == 0); ASSERT_SLEEPABLE(); for (u_int i = 0; i < __arraycount(pfil_flag_cases); i++) { const int fcase = pfil_flag_cases[i]; pfil_listset_t *pflistset; if ((flags & fcase) == 0) { continue; } pflistset = pfil_hook_get(fcase, ph); (void)pfil_list_remove(pflistset, (pfil_polyfunc_t)func, arg); } return 0; } int pfil_remove_ihook(pfil_ifunc_t func, void *arg, int flags, pfil_head_t *ph) { pfil_listset_t *pflistset; KASSERT(flags == PFIL_IFADDR || flags == PFIL_IFNET); ASSERT_SLEEPABLE(); pflistset = pfil_hook_get(flags, ph); (void)pfil_list_remove(pflistset, (pfil_polyfunc_t)func, arg); return 0; } /* * pfil_run_hooks: run the specified packet filter hooks. */ int pfil_run_hooks(pfil_head_t *ph, struct mbuf **mp, ifnet_t *ifp, int dir) { struct mbuf *m = mp ? 
*mp : NULL; pfil_listset_t *phlistset; pfil_list_t *phlist; struct psref psref; int s, bound; int ret = 0; KASSERT(dir == PFIL_IN || dir == PFIL_OUT); KASSERT(!cpu_intr_p()); if (ph == NULL) { return ret; } if (__predict_false((phlistset = pfil_hook_get(dir, ph)) == NULL)) { return ret; } bound = curlwp_bind(); s = pserialize_read_enter(); phlist = atomic_load_consume(&phlistset->active); if (phlist->nhooks == 0) { pserialize_read_exit(s); curlwp_bindx(bound); return ret; } psref_acquire(&psref, &phlist->psref, pfil_psref_class); pserialize_read_exit(s); for (u_int i = 0; i < phlist->nhooks; i++) { pfil_hook_t *pfh = &phlist->hooks[i]; pfil_func_t func = (pfil_func_t)pfh->pfil_func; ret = (*func)(pfh->pfil_arg, &m, ifp, dir); if (m == NULL || ret) break; } psref_release(&psref, &phlist->psref, pfil_psref_class); curlwp_bindx(bound); if (mp) { *mp = m; } return ret; } static void pfil_run_arg(pfil_listset_t *phlistset, u_long cmd, void *arg) { pfil_list_t *phlist; struct psref psref; int s, bound; KASSERT(!cpu_intr_p()); bound = curlwp_bind(); s = pserialize_read_enter(); phlist = atomic_load_consume(&phlistset->active); psref_acquire(&psref, &phlist->psref, pfil_psref_class); pserialize_read_exit(s); for (u_int i = 0; i < phlist->nhooks; i++) { pfil_hook_t *pfh = &phlist->hooks[i]; pfil_ifunc_t func = (pfil_ifunc_t)pfh->pfil_func; (*func)(pfh->pfil_arg, cmd, arg); } psref_release(&psref, &phlist->psref, pfil_psref_class); curlwp_bindx(bound); } void pfil_run_addrhooks(pfil_head_t *ph, u_long cmd, struct ifaddr *ifa) { pfil_run_arg(&ph->ph_ifaddr, cmd, ifa); } void pfil_run_ifhooks(pfil_head_t *ph, u_long cmd, struct ifnet *ifp) { pfil_run_arg(&ph->ph_ifevent, cmd, ifp); }
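/*
 * Illustrative sketch (not part of pfil.c): how a consumer might attach a
 * hook to an existing pfil head and what the hook sees.  "example_hook",
 * "example_attach" and "example_detach" are hypothetical; the head would
 * normally come from pfil_head_get() or pfil_head_create().  Returning
 * non-zero from a hook (or consuming the mbuf and leaving *mp NULL) stops
 * pfil_run_hooks() early, as its loop above shows.
 */
static unsigned long example_count;	/* hypothetical packet counter */

static int
example_hook(void *arg, struct mbuf **mp, ifnet_t *ifp, int dir)
{

	example_count++;	/* count packets in both directions */
	return 0;		/* 0 = let the packet continue */
}

static int
example_attach(pfil_head_t *ph)
{

	/* Must be called from sleepable context; pfil_add_hook() asserts it. */
	return pfil_add_hook(example_hook, NULL, PFIL_ALL, ph);
}

static void
example_detach(pfil_head_t *ph)
{

	(void)pfil_remove_hook(example_hook, NULL, PFIL_ALL, ph);
}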
/* $NetBSD: kern_uipc_socket_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $ */ /* * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004 Robert Watson * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95 */ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vn.c 1.13 94/04/02$ * * @(#)vn.c 8.9 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_uipc_socket_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/compat_stub.h> #include <sys/socketvar.h> #include <compat/sys/time.h> #include <compat/sys/socket.h> #include <compat/common/compat_mod.h> static int uipc_socket_50_getopt1(int opt, struct socket *so, struct sockopt *sopt) { int optval, error; struct timeval50 otv; switch (opt) { case SO_OSNDTIMEO: case SO_ORCVTIMEO: optval = (opt == SO_OSNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); otv.tv_sec = optval / hz; otv.tv_usec = (optval % hz) * tick; error = sockopt_set(sopt, &otv, sizeof(otv)); break; case SO_OTIMESTAMP: error = sockopt_setint(sopt, (so->so_options & opt) ? 
1 : 0); break; default: error = EPASSTHROUGH; } return error; } static int uipc_socket_50_setopt1(int opt, struct socket *so, const struct sockopt *sopt) { int optval, error; struct timeval50 otv; struct timeval tv; switch (opt) { case SO_OSNDTIMEO: case SO_ORCVTIMEO: solock(so); error = sockopt_get(sopt, &otv, sizeof(otv)); if (error) break; timeval50_to_timeval(&otv, &tv); /* Code duplicated from sys/kern/uipc_socket.c */ if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; break; } if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) { error = EDOM; break; } optval = tv.tv_sec * hz + tv.tv_usec / tick; if (optval == 0 && tv.tv_usec != 0) optval = 1; switch (opt) { case SO_OSNDTIMEO: so->so_snd.sb_timeo = optval; break; case SO_ORCVTIMEO: so->so_rcv.sb_timeo = optval; break; } break; case SO_OTIMESTAMP: error = sockopt_getint(sopt, &optval); solock(so); if (error) break; if (optval) so->so_options |= opt; else so->so_options &= ~opt; break; default: error = EPASSTHROUGH; } return error; } static int uipc_socket_50_sbts(int opt, struct mbuf ***mp) { struct timeval50 tv50; struct timeval tv; microtime(&tv); if (opt & SO_OTIMESTAMP) { timeval_to_timeval50(&tv, &tv50); **mp = sbcreatecontrol(&tv50, sizeof(tv50), SCM_OTIMESTAMP, SOL_SOCKET); if (**mp) *mp = &(**mp)->m_next; return 0; } else return EPASSTHROUGH; } void kern_uipc_socket_50_init(void) { MODULE_HOOK_SET(uipc_socket_50_setopt1_hook, uipc_socket_50_setopt1); MODULE_HOOK_SET(uipc_socket_50_getopt1_hook, uipc_socket_50_getopt1); MODULE_HOOK_SET(uipc_socket_50_sbts_hook, uipc_socket_50_sbts); } void kern_uipc_socket_50_fini(void) { MODULE_HOOK_UNSET(uipc_socket_50_setopt1_hook); MODULE_HOOK_UNSET(uipc_socket_50_getopt1_hook); MODULE_HOOK_UNSET(uipc_socket_50_sbts_hook); }
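/*
 * Illustrative sketch (not part of kern_uipc_socket_50.c): the timeout
 * conversion performed by uipc_socket_50_setopt1() above, pulled out into
 * a standalone function with hypothetical names.  With hz = 100 and
 * tick = 10000 (microseconds per scheduler tick), a 2.5 s timeout becomes
 * 2 * 100 + 500000 / 10000 = 250 ticks; a non-zero timeout that would
 * round down to 0 ticks is bumped to 1 so it does not mean "no timeout".
 */
#include <errno.h>
#include <limits.h>

static int
example_timeval_to_ticks(long sec, long usec, int hz, int tick, int *ticksp)
{

	if (sec < 0 || usec < 0 || usec >= 1000000)
		return EDOM;
	/* Same overflow guard as the setopt path above. */
	if (sec > (INT_MAX - usec / tick) / hz)
		return EDOM;
	*ticksp = sec * hz + usec / tick;
	if (*ticksp == 0 && usec != 0)
		*ticksp = 1;
	return 0;
}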
/* $NetBSD: exec_elf32.c,v 1.143 2019/11/20 19:37:53 pgoyette Exp $ */ /* * Copyright (c) 1996 Christopher G. Demetriou * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christopher G. Demetriou * for the NetBSD Project. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: exec_elf32.c,v 1.143 2019/11/20 19:37:53 pgoyette Exp $"); #define ELFSIZE 32 #include "exec_elf.c" #include <sys/module.h> #define ELF32_AUXSIZE (ELF_AUX_ENTRIES * sizeof(Aux32Info) \ + MAXPATHLEN + ALIGN(1)) MODULE(MODULE_CLASS_EXEC, exec_elf32, NULL); static struct execsw exec_elf32_execsw[] = { { .es_hdrsz = sizeof (Elf32_Ehdr), .es_makecmds = exec_elf32_makecmds, .u = { .elf_probe_func = netbsd_elf32_probe, }, .es_emul = &emul_netbsd, .es_prio = EXECSW_PRIO_FIRST, .es_arglen = ELF32_AUXSIZE, .es_copyargs = elf32_copyargs, .es_setregs = NULL, .es_coredump = coredump_elf32, .es_setup_stack = exec_setup_stack, }, #if EXEC_ELF_NOTELESS { .es_hdrsz = sizeof (Elf32_Ehdr), .es_makecmds = exec_elf32_makecmds, .u = { .elf_probe_func = NULL, }, .es_emul = &emul_netbsd, .es_prio = EXECSW_PRIO_LAST, .es_arglen = ELF32_AUXSIZE, .es_copyargs = elf32_copyargs, .es_setregs = NULL, .es_coredump = coredump_elf32, .es_setup_stack = exec_setup_stack, }, #endif }; static int exec_elf32_modcmd(modcmd_t cmd, void *arg) { #if ARCH_ELFSIZE == 64 /* * If we are on a 64bit system, we don't want the 32bit execsw[] to be * added in the global array, because the exec_elf32 module only works * on 32bit systems. * * However, we need the exec_elf32 module, because it will make the 32bit * functions available for netbsd32 and linux32. * * Therefore, allow this module on 64bit systems, but make it dormant.
*/ (void)exec_elf32_execsw; /* unused */ switch (cmd) { case MODULE_CMD_INIT: case MODULE_CMD_FINI: return 0; default: return ENOTTY; } #else /* ARCH_ELFSIZE == 64 */ switch (cmd) { case MODULE_CMD_INIT: return exec_add(exec_elf32_execsw, __arraycount(exec_elf32_execsw)); case MODULE_CMD_FINI: return exec_remove(exec_elf32_execsw, __arraycount(exec_elf32_execsw)); default: return ENOTTY; } #endif /* ARCH_ELFSIZE == 64 */ }
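/*
 * Illustrative sketch (not part of exec_elf32.c): the modcmd pattern the
 * module above follows.  A hypothetical module "example" sets up its
 * resources on MODULE_CMD_INIT, tears them down on MODULE_CMD_FINI, and
 * returns ENOTTY for commands it does not handle, exactly as
 * exec_elf32_modcmd() does.  The module class and the bodies of the two
 * cases are placeholders.
 */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/module.h>

MODULE(MODULE_CLASS_MISC, example, NULL);

static int
example_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		/* register hooks, sysctls, execsw entries, ... */
		return 0;
	case MODULE_CMD_FINI:
		/* undo whatever MODULE_CMD_INIT registered */
		return 0;
	default:
		return ENOTTY;
	}
}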
/* $NetBSD: uvm_bio.c,v 1.128 2023/04/09 09:00:56 riastradh Exp $ */ /* * Copyright (c) 1998 Chuck Silvers. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* */ /* * uvm_bio.c: buffered i/o object mapping cache */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_bio.c,v 1.128 2023/04/09 09:00:56 riastradh Exp $"); #include "opt_uvmhist.h" #include "opt_ubc.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/kmem.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/sysctl.h> #include <sys/vnode.h> #include <sys/bitops.h> /* for ilog2() */ #include <uvm/uvm.h> #include <uvm/uvm_pdpolicy.h> #ifdef PMAP_DIRECT # define UBC_USE_PMAP_DIRECT #endif /* * local functions */ static int ubc_fault(struct uvm_faultinfo *, vaddr_t, struct vm_page **, int, int, vm_prot_t, int); static struct ubc_map *ubc_find_mapping(struct uvm_object *, voff_t); static int ubchash_stats(struct hashstat_sysctl *hs, bool fill); #ifdef UBC_USE_PMAP_DIRECT static int __noinline ubc_uiomove_direct(struct uvm_object *, struct uio *, vsize_t, int, int); static void __noinline ubc_zerorange_direct(struct uvm_object *, off_t, size_t, int); /* XXX disabled by default until the kinks are worked out. */ bool ubc_direct = false; #endif /* * local data structures */ #define UBC_HASH(uobj, offset) \ (((((u_long)(uobj)) >> 8) + (((u_long)(offset)) >> PAGE_SHIFT)) & \ ubc_object.hashmask) #define UBC_QUEUE(offset) \ (&ubc_object.inactive[(((u_long)(offset)) >> ubc_winshift) & \ (UBC_NQUEUES - 1)]) #define UBC_UMAP_ADDR(u) \ (vaddr_t)(ubc_object.kva + (((u) - ubc_object.umap) << ubc_winshift)) #define UMAP_PAGES_LOCKED 0x0001 #define UMAP_MAPPING_CACHED 0x0002 struct ubc_map { struct uvm_object * uobj; /* mapped object */ voff_t offset; /* offset into uobj */ voff_t writeoff; /* write offset */ vsize_t writelen; /* write len */ int refcount; /* refcount on mapping */ int flags; /* extra state */ int advice; LIST_ENTRY(ubc_map) hash; /* hash table */ TAILQ_ENTRY(ubc_map) inactive; /* inactive queue */ LIST_ENTRY(ubc_map) list; /* per-object list */ }; TAILQ_HEAD(ubc_inactive_head, ubc_map); static struct ubc_object { struct uvm_object uobj; /* glue for uvm_map() */ char *kva; /* where ubc_object is mapped */ struct ubc_map *umap; /* array of ubc_map's */ LIST_HEAD(, ubc_map) *hash; /* hashtable for cached ubc_map's */ u_long hashmask; /* mask for hashtable */ struct ubc_inactive_head *inactive; /* inactive queues for ubc_map's */ } ubc_object; const struct uvm_pagerops ubc_pager = { .pgo_fault = ubc_fault, /* ... rest are NULL */ }; /* Use value at least as big as maximum page size supported by architecture */ #define UBC_MAX_WINSHIFT \ ((1 << UBC_WINSHIFT) > MAX_PAGE_SIZE ? UBC_WINSHIFT : ilog2(MAX_PAGE_SIZE)) int ubc_nwins = UBC_NWINS; const int ubc_winshift = UBC_MAX_WINSHIFT; const int ubc_winsize = 1 << UBC_MAX_WINSHIFT; #if defined(PMAP_PREFER) int ubc_nqueues; #define UBC_NQUEUES ubc_nqueues #else #define UBC_NQUEUES 1 #endif #if defined(UBC_STATS) #define UBC_EVCNT_DEFINE(name) \ struct evcnt ubc_evcnt_##name = \ EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "ubc", #name); \ EVCNT_ATTACH_STATIC(ubc_evcnt_##name); #define UBC_EVCNT_INCR(name) ubc_evcnt_##name.ev_count++ #else /* defined(UBC_STATS) */ #define UBC_EVCNT_DEFINE(name) /* nothing */ #define UBC_EVCNT_INCR(name) /* nothing */ #endif /* defined(UBC_STATS) */ UBC_EVCNT_DEFINE(wincachehit) UBC_EVCNT_DEFINE(wincachemiss) UBC_EVCNT_DEFINE(faultbusy) /* * ubc_init * * init pager private data structures. */ void ubc_init(void) { /* * Make sure ubc_winshift is sane. */ KASSERT(ubc_winshift >= PAGE_SHIFT); /* * init ubc_object. * alloc and init ubc_map's. * init inactive queues. * alloc and init hashtable. 
* map in ubc_object. */ uvm_obj_init(&ubc_object.uobj, &ubc_pager, true, UVM_OBJ_KERN); ubc_object.umap = kmem_zalloc(ubc_nwins * sizeof(struct ubc_map), KM_SLEEP); if (ubc_object.umap == NULL) panic("ubc_init: failed to allocate ubc_map"); vaddr_t va = (vaddr_t)1L; #ifdef PMAP_PREFER PMAP_PREFER(0, &va, 0, 0); /* kernel is never topdown */ ubc_nqueues = va >> ubc_winshift; if (ubc_nqueues == 0) { ubc_nqueues = 1; } #endif ubc_object.inactive = kmem_alloc(UBC_NQUEUES * sizeof(struct ubc_inactive_head), KM_SLEEP); for (int i = 0; i < UBC_NQUEUES; i++) { TAILQ_INIT(&ubc_object.inactive[i]); } for (int i = 0; i < ubc_nwins; i++) { struct ubc_map *umap; umap = &ubc_object.umap[i]; TAILQ_INSERT_TAIL(&ubc_object.inactive[i & (UBC_NQUEUES - 1)], umap, inactive); } ubc_object.hash = hashinit(ubc_nwins, HASH_LIST, true, &ubc_object.hashmask); for (int i = 0; i <= ubc_object.hashmask; i++) { LIST_INIT(&ubc_object.hash[i]); } if (uvm_map(kernel_map, (vaddr_t *)&ubc_object.kva, ubc_nwins << ubc_winshift, &ubc_object.uobj, 0, (vsize_t)va, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_NONE, UVM_ADV_RANDOM, UVM_FLAG_NOMERGE)) != 0) { panic("ubc_init: failed to map ubc_object"); } hashstat_register("ubchash", ubchash_stats); } void ubchist_init(void) { UVMHIST_INIT(ubchist, 300); } /* * ubc_fault_page: helper of ubc_fault to handle a single page. * * => Caller has UVM object locked. * => Caller will perform pmap_update(). */ static inline int ubc_fault_page(const struct uvm_faultinfo *ufi, const struct ubc_map *umap, struct vm_page *pg, vm_prot_t prot, vm_prot_t access_type, vaddr_t va) { vm_prot_t mask; int error; bool rdonly; KASSERT(rw_write_held(pg->uobject->vmobjlock)); KASSERT((pg->flags & PG_FAKE) == 0); if (pg->flags & PG_RELEASED) { uvm_pagefree(pg); return 0; } if (pg->loan_count != 0) { /* * Avoid unneeded loan break, if possible. */ if ((access_type & VM_PROT_WRITE) == 0) { prot &= ~VM_PROT_WRITE; } if (prot & VM_PROT_WRITE) { struct vm_page *newpg; newpg = uvm_loanbreak(pg); if (newpg == NULL) { uvm_page_unbusy(&pg, 1); return ENOMEM; } pg = newpg; } } /* * Note that a page whose backing store is partially allocated * is marked as PG_RDONLY. * * it's a responsibility of ubc_alloc's caller to allocate backing * blocks before writing to the window. */ KASSERT((pg->flags & PG_RDONLY) == 0 || (access_type & VM_PROT_WRITE) == 0 || pg->offset < umap->writeoff || pg->offset + PAGE_SIZE > umap->writeoff + umap->writelen); rdonly = uvm_pagereadonly_p(pg); mask = rdonly ? ~VM_PROT_WRITE : VM_PROT_ALL; error = pmap_enter(ufi->orig_map->pmap, va, VM_PAGE_TO_PHYS(pg), prot & mask, PMAP_CANFAIL | (access_type & mask)); uvm_pagelock(pg); uvm_pageactivate(pg); uvm_pagewakeup(pg); uvm_pageunlock(pg); pg->flags &= ~PG_BUSY; UVM_PAGE_OWN(pg, NULL); return error; } /* * ubc_fault: fault routine for ubc mapping */ static int ubc_fault(struct uvm_faultinfo *ufi, vaddr_t ign1, struct vm_page **ign2, int ign3, int ign4, vm_prot_t access_type, int flags) { struct uvm_object *uobj; struct ubc_map *umap; vaddr_t va, eva, ubc_offset, slot_offset; struct vm_page *pgs[howmany(ubc_winsize, MIN_PAGE_SIZE)]; int i, error, npages; vm_prot_t prot; UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); /* * no need to try with PGO_LOCKED... * we don't need to have the map locked since we know that * no one will mess with it until our reference is released. 
*/ if (flags & PGO_LOCKED) { uvmfault_unlockall(ufi, NULL, &ubc_object.uobj); flags &= ~PGO_LOCKED; } va = ufi->orig_rvaddr; ubc_offset = va - (vaddr_t)ubc_object.kva; umap = &ubc_object.umap[ubc_offset >> ubc_winshift]; KASSERT(umap->refcount != 0); KASSERT((umap->flags & UMAP_PAGES_LOCKED) == 0); slot_offset = ubc_offset & (ubc_winsize - 1); /* * some platforms cannot write to individual bytes atomically, so * software has to do read/modify/write of larger quantities instead. * this means that the access_type for "write" operations * can be VM_PROT_READ, which confuses us mightily. * * deal with this by resetting access_type based on the info * that ubc_alloc() stores for us. */ access_type = umap->writelen ? VM_PROT_WRITE : VM_PROT_READ; UVMHIST_LOG(ubchist, "va %#jx ubc_offset %#jx access_type %jd", va, ubc_offset, access_type, 0); if ((access_type & VM_PROT_WRITE) != 0) { #ifndef PRIxOFF /* XXX */ #define PRIxOFF "jx" /* XXX */ #endif /* XXX */ KASSERTMSG((trunc_page(umap->writeoff) <= slot_offset), "out of range write: slot=%#"PRIxVSIZE" off=%#"PRIxOFF, slot_offset, (intmax_t)umap->writeoff); KASSERTMSG((slot_offset < umap->writeoff + umap->writelen), "out of range write: slot=%#"PRIxVADDR " off=%#"PRIxOFF" len=%#"PRIxVSIZE, slot_offset, (intmax_t)umap->writeoff, umap->writelen); } /* no umap locking needed since we have a ref on the umap */ uobj = umap->uobj; if ((access_type & VM_PROT_WRITE) == 0) { npages = (ubc_winsize - slot_offset) >> PAGE_SHIFT; } else { npages = (round_page(umap->offset + umap->writeoff + umap->writelen) - (umap->offset + slot_offset)) >> PAGE_SHIFT; flags |= PGO_PASTEOF; } again: memset(pgs, 0, sizeof (pgs)); rw_enter(uobj->vmobjlock, RW_WRITER); UVMHIST_LOG(ubchist, "slot_offset %#jx writeoff %#jx writelen %#jx ", slot_offset, umap->writeoff, umap->writelen, 0); UVMHIST_LOG(ubchist, "getpages uobj %#jx offset %#jx npages %jd", (uintptr_t)uobj, umap->offset + slot_offset, npages, 0); error = (*uobj->pgops->pgo_get)(uobj, umap->offset + slot_offset, pgs, &npages, 0, access_type, umap->advice, flags | PGO_NOBLOCKALLOC | PGO_NOTIMESTAMP); UVMHIST_LOG(ubchist, "getpages error %jd npages %jd", error, npages, 0, 0); if (error == EAGAIN) { kpause("ubc_fault", false, hz >> 2, NULL); goto again; } if (error) { return error; } /* * For virtually-indexed, virtually-tagged caches we should avoid * creating writable mappings when we do not absolutely need them, * since the "compatible alias" trick does not work on such caches. * Otherwise, we can always map the pages writable. */ #ifdef PMAP_CACHE_VIVT prot = VM_PROT_READ | access_type; #else prot = VM_PROT_READ | VM_PROT_WRITE; #endif va = ufi->orig_rvaddr; eva = ufi->orig_rvaddr + (npages << PAGE_SHIFT); UVMHIST_LOG(ubchist, "va %#jx eva %#jx", va, eva, 0, 0); /* * Note: normally all returned pages would have the same UVM object. * However, layered file-systems and e.g. tmpfs, may return pages * which belong to underlying UVM object. In such case, lock is * shared amongst the objects. */ rw_enter(uobj->vmobjlock, RW_WRITER); for (i = 0; va < eva; i++, va += PAGE_SIZE) { struct vm_page *pg; UVMHIST_LOG(ubchist, "pgs[%jd] = %#jx", i, (uintptr_t)pgs[i], 0, 0); pg = pgs[i]; if (pg == NULL || pg == PGO_DONTCARE) { continue; } KASSERT(uobj->vmobjlock == pg->uobject->vmobjlock); error = ubc_fault_page(ufi, umap, pg, prot, access_type, va); if (error) { /* * Flush (there might be pages entered), drop the lock, * and perform uvm_wait(). Note: page will re-fault. 
*/ pmap_update(ufi->orig_map->pmap); rw_exit(uobj->vmobjlock); uvm_wait("ubc_fault"); rw_enter(uobj->vmobjlock, RW_WRITER); } } /* Must make VA visible before the unlock. */ pmap_update(ufi->orig_map->pmap); rw_exit(uobj->vmobjlock); return 0; } /* * local functions */ static struct ubc_map * ubc_find_mapping(struct uvm_object *uobj, voff_t offset) { struct ubc_map *umap; LIST_FOREACH(umap, &ubc_object.hash[UBC_HASH(uobj, offset)], hash) { if (umap->uobj == uobj && umap->offset == offset) { return umap; } } return NULL; } /* * ubc interface functions */ /* * ubc_alloc: allocate a file mapping window */ static void * __noinline ubc_alloc(struct uvm_object *uobj, voff_t offset, vsize_t *lenp, int advice, int flags, struct vm_page **pgs, int *npagesp) { vaddr_t slot_offset, va; struct ubc_map *umap; voff_t umap_offset; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(ubchist, "uobj %#jx offset %#jx len %#jx", (uintptr_t)uobj, offset, *lenp, 0); KASSERT(*lenp > 0); umap_offset = (offset & ~((voff_t)ubc_winsize - 1)); slot_offset = (vaddr_t)(offset & ((voff_t)ubc_winsize - 1)); *lenp = MIN(*lenp, ubc_winsize - slot_offset); KASSERT(*lenp > 0); rw_enter(ubc_object.uobj.vmobjlock, RW_WRITER); again: /* * The UVM object is already referenced. * Lock order: UBC object -> ubc_map::uobj. */ umap = ubc_find_mapping(uobj, umap_offset); if (umap == NULL) { struct uvm_object *oobj; UBC_EVCNT_INCR(wincachemiss); umap = TAILQ_FIRST(UBC_QUEUE(offset)); if (umap == NULL) { rw_exit(ubc_object.uobj.vmobjlock); kpause("ubc_alloc", false, hz >> 2, NULL); rw_enter(ubc_object.uobj.vmobjlock, RW_WRITER); goto again; } va = UBC_UMAP_ADDR(umap); oobj = umap->uobj; /* * Remove from old hash (if any), add to new hash. */ if (oobj != NULL) { /* * Mapping must be removed before the list entry, * since there is a race with ubc_purge(). 
*/ if (umap->flags & UMAP_MAPPING_CACHED) { umap->flags &= ~UMAP_MAPPING_CACHED; rw_enter(oobj->vmobjlock, RW_WRITER); pmap_remove(pmap_kernel(), va, va + ubc_winsize); pmap_update(pmap_kernel()); rw_exit(oobj->vmobjlock); } LIST_REMOVE(umap, hash); LIST_REMOVE(umap, list); } else { KASSERT((umap->flags & UMAP_MAPPING_CACHED) == 0); } umap->uobj = uobj; umap->offset = umap_offset; LIST_INSERT_HEAD(&ubc_object.hash[UBC_HASH(uobj, umap_offset)], umap, hash); LIST_INSERT_HEAD(&uobj->uo_ubc, umap, list); } else { UBC_EVCNT_INCR(wincachehit); va = UBC_UMAP_ADDR(umap); } if (umap->refcount == 0) { TAILQ_REMOVE(UBC_QUEUE(offset), umap, inactive); } if (flags & UBC_WRITE) { KASSERTMSG(umap->writeoff == 0, "ubc_alloc: concurrent writes to uobj %p", uobj); KASSERTMSG(umap->writelen == 0, "ubc_alloc: concurrent writes to uobj %p", uobj); umap->writeoff = slot_offset; umap->writelen = *lenp; } umap->refcount++; umap->advice = advice; rw_exit(ubc_object.uobj.vmobjlock); UVMHIST_LOG(ubchist, "umap %#jx refs %jd va %#jx flags %#jx", (uintptr_t)umap, umap->refcount, (uintptr_t)va, flags); if (flags & UBC_FAULTBUSY) { int npages = (*lenp + (offset & (PAGE_SIZE - 1)) + PAGE_SIZE - 1) >> PAGE_SHIFT; int gpflags = PGO_SYNCIO|PGO_OVERWRITE|PGO_PASTEOF|PGO_NOBLOCKALLOC| PGO_NOTIMESTAMP; int i; KDASSERT(flags & UBC_WRITE); KASSERT(npages <= *npagesp); KASSERT(umap->refcount == 1); UBC_EVCNT_INCR(faultbusy); again_faultbusy: rw_enter(uobj->vmobjlock, RW_WRITER); if (umap->flags & UMAP_MAPPING_CACHED) { umap->flags &= ~UMAP_MAPPING_CACHED; pmap_remove(pmap_kernel(), va, va + ubc_winsize); } memset(pgs, 0, *npagesp * sizeof(pgs[0])); error = (*uobj->pgops->pgo_get)(uobj, trunc_page(offset), pgs, &npages, 0, VM_PROT_READ | VM_PROT_WRITE, advice, gpflags); UVMHIST_LOG(ubchist, "faultbusy getpages %jd", error, 0, 0, 0); if (error) { /* * Flush: the mapping above might have been removed. */ pmap_update(pmap_kernel()); goto out; } for (i = 0; i < npages; i++) { struct vm_page *pg = pgs[i]; KASSERT(pg->uobject == uobj); if (pg->loan_count != 0) { rw_enter(uobj->vmobjlock, RW_WRITER); if (pg->loan_count != 0) { pg = uvm_loanbreak(pg); } if (pg == NULL) { pmap_kremove(va, ubc_winsize); pmap_update(pmap_kernel()); uvm_page_unbusy(pgs, npages); rw_exit(uobj->vmobjlock); uvm_wait("ubc_alloc"); goto again_faultbusy; } rw_exit(uobj->vmobjlock); pgs[i] = pg; } pmap_kenter_pa( va + trunc_page(slot_offset) + (i << PAGE_SHIFT), VM_PAGE_TO_PHYS(pg), VM_PROT_READ | VM_PROT_WRITE, 0); } pmap_update(pmap_kernel()); umap->flags |= UMAP_PAGES_LOCKED; *npagesp = npages; } else { KASSERT((umap->flags & UMAP_PAGES_LOCKED) == 0); } out: return (void *)(va + slot_offset); } /* * ubc_release: free a file mapping window. 
*/ static void __noinline ubc_release(void *va, int flags, struct vm_page **pgs, int npages) { struct ubc_map *umap; struct uvm_object *uobj; vaddr_t umapva; bool unmapped; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(ubchist, "va %#jx", (uintptr_t)va, 0, 0, 0); umap = &ubc_object.umap[((char *)va - ubc_object.kva) >> ubc_winshift]; umapva = UBC_UMAP_ADDR(umap); uobj = umap->uobj; KASSERT(uobj != NULL); if (umap->flags & UMAP_PAGES_LOCKED) { const voff_t endoff = umap->writeoff + umap->writelen; const voff_t zerolen = round_page(endoff) - endoff; KASSERT(npages == (round_page(endoff) - trunc_page(umap->writeoff)) >> PAGE_SHIFT); KASSERT((umap->flags & UMAP_MAPPING_CACHED) == 0); if (zerolen) { memset((char *)umapva + endoff, 0, zerolen); } umap->flags &= ~UMAP_PAGES_LOCKED; rw_enter(uobj->vmobjlock, RW_WRITER); for (u_int i = 0; i < npages; i++) { struct vm_page *pg = pgs[i]; #ifdef DIAGNOSTIC paddr_t pa; bool rv; rv = pmap_extract(pmap_kernel(), umapva + umap->writeoff + (i << PAGE_SHIFT), &pa); KASSERT(rv); KASSERT(PHYS_TO_VM_PAGE(pa) == pg); #endif pg->flags &= ~PG_FAKE; KASSERTMSG(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_DIRTY, "page %p not dirty", pg); KASSERT(pg->loan_count == 0); if (uvmpdpol_pageactivate_p(pg)) { uvm_pagelock(pg); uvm_pageactivate(pg); uvm_pageunlock(pg); } } pmap_kremove(umapva, ubc_winsize); pmap_update(pmap_kernel()); uvm_page_unbusy(pgs, npages); rw_exit(uobj->vmobjlock); unmapped = true; } else { unmapped = false; } rw_enter(ubc_object.uobj.vmobjlock, RW_WRITER); umap->writeoff = 0; umap->writelen = 0; umap->refcount--; if (umap->refcount == 0) { if (flags & UBC_UNMAP) { /* * Invalidate any cached mappings if requested. * This is typically used to avoid leaving * incompatible cache aliases around indefinitely. */ rw_enter(uobj->vmobjlock, RW_WRITER); pmap_remove(pmap_kernel(), umapva, umapva + ubc_winsize); pmap_update(pmap_kernel()); rw_exit(uobj->vmobjlock); umap->flags &= ~UMAP_MAPPING_CACHED; LIST_REMOVE(umap, hash); LIST_REMOVE(umap, list); umap->uobj = NULL; TAILQ_INSERT_HEAD(UBC_QUEUE(umap->offset), umap, inactive); } else { if (!unmapped) { umap->flags |= UMAP_MAPPING_CACHED; } TAILQ_INSERT_TAIL(UBC_QUEUE(umap->offset), umap, inactive); } } UVMHIST_LOG(ubchist, "umap %#jx refs %jd", (uintptr_t)umap, umap->refcount, 0, 0); rw_exit(ubc_object.uobj.vmobjlock); } /* * ubc_uiomove: move data to/from an object. */ int ubc_uiomove(struct uvm_object *uobj, struct uio *uio, vsize_t todo, int advice, int flags) { const bool overwrite = (flags & UBC_FAULTBUSY) != 0; struct vm_page *pgs[howmany(ubc_winsize, MIN_PAGE_SIZE)]; voff_t off; int error, npages; KASSERT(todo <= uio->uio_resid); KASSERT(((flags & UBC_WRITE) != 0 && uio->uio_rw == UIO_WRITE) || ((flags & UBC_READ) != 0 && uio->uio_rw == UIO_READ)); #ifdef UBC_USE_PMAP_DIRECT /* * during direct access pages need to be held busy to prevent them * changing identity, and therefore if we read or write an object * into a mapped view of same we could deadlock while faulting. * * avoid the problem by disallowing direct access if the object * might be visible somewhere via mmap(). * * XXX concurrent reads cause thundering herd issues with PG_BUSY. * In the future enable by default for writes or if ncpu<=2, and * make the toggle override that. 
*/ if ((ubc_direct && (flags & UBC_ISMAPPED) == 0) || (flags & UBC_FAULTBUSY) != 0) { return ubc_uiomove_direct(uobj, uio, todo, advice, flags); } #endif off = uio->uio_offset; error = 0; while (todo > 0) { vsize_t bytelen = todo; void *win; npages = __arraycount(pgs); win = ubc_alloc(uobj, off, &bytelen, advice, flags, pgs, &npages); if (error == 0) { error = uiomove(win, bytelen, uio); } if (error != 0 && overwrite) { /* * if we haven't initialized the pages yet, * do it now. it's safe to use memset here * because we just mapped the pages above. */ memset(win, 0, bytelen); } ubc_release(win, flags, pgs, npages); off += bytelen; todo -= bytelen; if (error != 0 && (flags & UBC_PARTIALOK) != 0) { break; } } return error; } /* * ubc_zerorange: set a range of bytes in an object to zero. */ void ubc_zerorange(struct uvm_object *uobj, off_t off, size_t len, int flags) { struct vm_page *pgs[howmany(ubc_winsize, MIN_PAGE_SIZE)]; int npages; #ifdef UBC_USE_PMAP_DIRECT if (ubc_direct || (flags & UBC_FAULTBUSY) != 0) { ubc_zerorange_direct(uobj, off, len, flags); return; } #endif /* * XXXUBC invent kzero() and use it */ while (len) { void *win; vsize_t bytelen = len; npages = __arraycount(pgs); win = ubc_alloc(uobj, off, &bytelen, UVM_ADV_NORMAL, UBC_WRITE, pgs, &npages); memset(win, 0, bytelen); ubc_release(win, flags, pgs, npages); off += bytelen; len -= bytelen; } } #ifdef UBC_USE_PMAP_DIRECT /* Copy data using direct map */ /* * ubc_alloc_direct: allocate a file mapping window using direct map */ static int __noinline ubc_alloc_direct(struct uvm_object *uobj, voff_t offset, vsize_t *lenp, int advice, int flags, struct vm_page **pgs, int *npages) { voff_t pgoff; int error; int gpflags = flags | PGO_NOTIMESTAMP | PGO_SYNCIO; int access_type = VM_PROT_READ; UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); if (flags & UBC_WRITE) { if (flags & UBC_FAULTBUSY) gpflags |= PGO_OVERWRITE | PGO_NOBLOCKALLOC; #if 0 KASSERT(!UVM_OBJ_NEEDS_WRITEFAULT(uobj)); #endif /* * Tell genfs_getpages() we already have the journal lock, * allow allocation past current EOF. */ gpflags |= PGO_JOURNALLOCKED | PGO_PASTEOF; access_type |= VM_PROT_WRITE; } else { /* Don't need the empty blocks allocated, PG_RDONLY is okay */ gpflags |= PGO_NOBLOCKALLOC; } pgoff = (offset & PAGE_MASK); *lenp = MIN(*lenp, ubc_winsize - pgoff); again: *npages = (*lenp + pgoff + PAGE_SIZE - 1) >> PAGE_SHIFT; KASSERT((*npages * PAGE_SIZE) <= ubc_winsize); KASSERT(*lenp + pgoff <= ubc_winsize); memset(pgs, 0, *npages * sizeof(pgs[0])); rw_enter(uobj->vmobjlock, RW_WRITER); error = (*uobj->pgops->pgo_get)(uobj, trunc_page(offset), pgs, npages, 0, access_type, advice, gpflags); UVMHIST_LOG(ubchist, "alloc_direct getpages %jd", error, 0, 0, 0); if (error) { if (error == EAGAIN) { kpause("ubc_alloc_directg", false, hz >> 2, NULL); goto again; } return error; } rw_enter(uobj->vmobjlock, RW_WRITER); for (int i = 0; i < *npages; i++) { struct vm_page *pg = pgs[i]; KASSERT(pg != NULL); KASSERT(pg != PGO_DONTCARE); KASSERT((pg->flags & PG_FAKE) == 0 || (gpflags & PGO_OVERWRITE)); KASSERT(pg->uobject->vmobjlock == uobj->vmobjlock); /* Avoid breaking loan if possible, only do it on write */ if ((flags & UBC_WRITE) && pg->loan_count != 0) { pg = uvm_loanbreak(pg); if (pg == NULL) { uvm_page_unbusy(pgs, *npages); rw_exit(uobj->vmobjlock); uvm_wait("ubc_alloc_directl"); goto again; } pgs[i] = pg; } /* Page must be writable by now */ KASSERT((pg->flags & PG_RDONLY) == 0 || (flags & UBC_WRITE) == 0); /* * XXX For aobj pages. 
No managed mapping - mark the page * dirty. */ if ((flags & UBC_WRITE) != 0) { uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); } } rw_exit(uobj->vmobjlock); return 0; } static void __noinline ubc_direct_release(struct uvm_object *uobj, int flags, struct vm_page **pgs, int npages) { rw_enter(uobj->vmobjlock, RW_WRITER); for (int i = 0; i < npages; i++) { struct vm_page *pg = pgs[i]; pg->flags &= ~PG_BUSY; UVM_PAGE_OWN(pg, NULL); if (pg->flags & PG_RELEASED) { pg->flags &= ~PG_RELEASED; uvm_pagefree(pg); continue; } if (uvm_pagewanted_p(pg) || uvmpdpol_pageactivate_p(pg)) { uvm_pagelock(pg); uvm_pageactivate(pg); uvm_pagewakeup(pg); uvm_pageunlock(pg); } /* Page was changed, no longer fake and neither clean. */ if (flags & UBC_WRITE) { KASSERTMSG(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_DIRTY, "page %p not dirty", pg); pg->flags &= ~PG_FAKE; } } rw_exit(uobj->vmobjlock); } static int ubc_uiomove_process(void *win, size_t len, void *arg) { struct uio *uio = (struct uio *)arg; return uiomove(win, len, uio); } static int ubc_zerorange_process(void *win, size_t len, void *arg) { memset(win, 0, len); return 0; } static int __noinline ubc_uiomove_direct(struct uvm_object *uobj, struct uio *uio, vsize_t todo, int advice, int flags) { const bool overwrite = (flags & UBC_FAULTBUSY) != 0; voff_t off; int error, npages; struct vm_page *pgs[howmany(ubc_winsize, MIN_PAGE_SIZE)]; KASSERT(todo <= uio->uio_resid); KASSERT(((flags & UBC_WRITE) != 0 && uio->uio_rw == UIO_WRITE) || ((flags & UBC_READ) != 0 && uio->uio_rw == UIO_READ)); off = uio->uio_offset; error = 0; while (todo > 0) { vsize_t bytelen = todo; error = ubc_alloc_direct(uobj, off, &bytelen, advice, flags, pgs, &npages); if (error != 0) { /* can't do anything, failed to get the pages */ break; } if (error == 0) { error = uvm_direct_process(pgs, npages, off, bytelen, ubc_uiomove_process, uio); } if (overwrite) { voff_t endoff; /* * if we haven't initialized the pages yet due to an * error above, do it now. */ if (error != 0) { (void) uvm_direct_process(pgs, npages, off, bytelen, ubc_zerorange_process, NULL); } off += bytelen; todo -= bytelen; endoff = off & (PAGE_SIZE - 1); /* * zero out the remaining portion of the final page * (if any). */ if (todo == 0 && endoff != 0) { vsize_t zlen = PAGE_SIZE - endoff; (void) uvm_direct_process(pgs + npages - 1, 1, off, zlen, ubc_zerorange_process, NULL); } } else { off += bytelen; todo -= bytelen; } ubc_direct_release(uobj, flags, pgs, npages); if (error != 0 && ISSET(flags, UBC_PARTIALOK)) { break; } } return error; } static void __noinline ubc_zerorange_direct(struct uvm_object *uobj, off_t off, size_t todo, int flags) { int error, npages; struct vm_page *pgs[howmany(ubc_winsize, MIN_PAGE_SIZE)]; flags |= UBC_WRITE; error = 0; while (todo > 0) { vsize_t bytelen = todo; error = ubc_alloc_direct(uobj, off, &bytelen, UVM_ADV_NORMAL, flags, pgs, &npages); if (error != 0) { /* can't do anything, failed to get the pages */ break; } error = uvm_direct_process(pgs, npages, off, bytelen, ubc_zerorange_process, NULL); ubc_direct_release(uobj, flags, pgs, npages); off += bytelen; todo -= bytelen; } } #endif /* UBC_USE_PMAP_DIRECT */ /* * ubc_purge: disassociate ubc_map structures from an empty uvm_object. */ void ubc_purge(struct uvm_object *uobj) { struct ubc_map *umap; vaddr_t va; KASSERT(uobj->uo_npages == 0); /* * Safe to check without lock held, as ubc_alloc() removes * the mapping and list entry in the correct order. 
*/ if (__predict_true(LIST_EMPTY(&uobj->uo_ubc))) { return; } rw_enter(ubc_object.uobj.vmobjlock, RW_WRITER); while ((umap = LIST_FIRST(&uobj->uo_ubc)) != NULL) { KASSERT(umap->refcount == 0); for (va = 0; va < ubc_winsize; va += PAGE_SIZE) { KASSERT(!pmap_extract(pmap_kernel(), va + UBC_UMAP_ADDR(umap), NULL)); } LIST_REMOVE(umap, list); LIST_REMOVE(umap, hash); umap->flags &= ~UMAP_MAPPING_CACHED; umap->uobj = NULL; } rw_exit(ubc_object.uobj.vmobjlock); } static int ubchash_stats(struct hashstat_sysctl *hs, bool fill) { struct ubc_map *umap; uint64_t chain; strlcpy(hs->hash_name, "ubchash", sizeof(hs->hash_name)); strlcpy(hs->hash_desc, "ubc object hash", sizeof(hs->hash_desc)); if (!fill) return 0; hs->hash_size = ubc_object.hashmask + 1; for (size_t i = 0; i < hs->hash_size; i++) { chain = 0; rw_enter(ubc_object.uobj.vmobjlock, RW_READER); LIST_FOREACH(umap, &ubc_object.hash[i], hash) { chain++; } rw_exit(ubc_object.uobj.vmobjlock); if (chain > 0) { hs->hash_used++; hs->hash_items += chain; if (chain > hs->hash_maxchain) hs->hash_maxchain = chain; } preempt_point(); } return 0; }
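/*
 * Illustrative sketch (not part of the file above): one way a file system
 * read path might drive ubc_uiomove().  The helper name "example_vop_read"
 * is hypothetical; the use of vp->v_uobj and vp->v_size follows the usual
 * NetBSD vnode conventions, and error handling is reduced to a minimum.
 */
#if 0
static int
example_vop_read(struct vnode *vp, struct uio *uio)
{
	vsize_t bytelen;
	int error = 0;

	while (uio->uio_resid > 0 && uio->uio_offset < vp->v_size) {
		/* Transfer no more than what is left of the file. */
		bytelen = MIN(uio->uio_resid,
		    (vsize_t)(vp->v_size - uio->uio_offset));

		/*
		 * ubc_uiomove() walks the range in ubc_winsize windows;
		 * UBC_PARTIALOK makes it stop at the first failing window
		 * rather than continuing through the whole request.
		 */
		error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
		    UVM_ADV_SEQUENTIAL, UBC_READ | UBC_PARTIALOK);
		if (error != 0)
			break;
	}
	return error;
}
#endif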
/* $NetBSD: sysv_ipc.c,v 1.42 2022/03/27 16:23:08 christos Exp $ */ /*- * Copyright (c) 1998, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sysv_ipc.c,v 1.42 2022/03/27 16:23:08 christos Exp $"); #ifdef _KERNEL_OPT #include "opt_sysv.h" #include "opt_sysvparam.h" #include "opt_compat_netbsd.h" #endif #include <sys/syscall.h> #include <sys/syscallargs.h> #include <sys/syscallvar.h> #include <sys/param.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/ipc.h> #ifdef SYSVMSG #include <sys/msg.h> #endif #ifdef SYSVSEM #include <sys/sem.h> #endif #ifdef SYSVSHM #include <sys/shm.h> #endif #include <sys/systm.h> #include <sys/kmem.h> #include <sys/module.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/stat.h> #include <sys/sysctl.h> #include <sys/kauth.h> #include <sys/compat_stub.h> #include <compat/common/compat_sysv_mod.h> /* for sysctl routine vector */ /* * Values in support of System V compatible shared memory. XXX * (originally located in sys/conf/param.c) */ #ifdef SYSVSHM #if !defined(SHMMAX) && defined(SHMMAXPGS) #define SHMMAX SHMMAXPGS /* shminit() performs a `*= PAGE_SIZE' */ #elif !defined(SHMMAX) #define SHMMAX 0 #endif #ifndef SHMMIN #define SHMMIN 1 #endif #ifndef SHMMNI #define SHMMNI 128 /* <64k, see IPCID_TO_IX in ipc.h */ #endif #ifndef SHMSEG #define SHMSEG 128 #endif struct shminfo shminfo = { SHMMAX, SHMMIN, SHMMNI, SHMSEG, 0 }; #endif /* * Values in support of System V compatible semaphores. */ #ifdef SYSVSEM struct seminfo seminfo = { SEMMAP, /* # of entries in semaphore map */ SEMMNI, /* # of semaphore identifiers */ SEMMNS, /* # of semaphores in system */ SEMMNU, /* # of undo structures in system */ SEMMSL, /* max # of semaphores per id */ SEMOPM, /* max # of operations per semop call */ SEMUME, /* max # of undo entries per process */ SEMUSZ, /* size in bytes of undo structure */ SEMVMX, /* semaphore maximum value */ SEMAEM /* adjust on exit max value */ }; #endif /* * Values in support of System V compatible messages. 
*/ #ifdef SYSVMSG struct msginfo msginfo = { MSGMAX, /* max chars in a message */ MSGMNI, /* # of message queue identifiers */ MSGMNB, /* max chars in a queue */ MSGTQL, /* max messages in system */ MSGSSZ, /* size of a message segment */ /* (must be small power of 2 greater than 4) */ MSGSEG /* number of message segments */ }; #endif MODULE(MODULE_CLASS_EXEC, sysv_ipc, NULL); SYSCTL_SETUP_PROTO(sysctl_ipc_setup); static const struct syscall_package sysvipc_syscalls[] = { #if defined(SYSVSHM) { SYS___shmctl50, 0, (sy_call_t *)sys___shmctl50 }, { SYS_shmat, 0, (sy_call_t *)sys_shmat }, { SYS_shmdt, 0, (sy_call_t *)sys_shmdt }, { SYS_shmget, 0, (sy_call_t *)sys_shmget }, #endif /* SYSVSHM */ #if defined(SYSVSEM) { SYS_____semctl50, 0, (sy_call_t *)sys_____semctl50 }, { SYS_semget, 0, (sy_call_t *)sys_semget }, { SYS_semop, 0, (sy_call_t *)sys_semop }, { SYS_semconfig, 0, (sy_call_t *)sys_semconfig }, #endif /* SYSVSEM */ #if defined(SYSVMSG) { SYS___msgctl50, 0, (sy_call_t *)sys___msgctl50 }, { SYS_msgget, 0, (sy_call_t *)sys_msgget }, { SYS_msgsnd, 0, (sy_call_t *)sys_msgsnd }, { SYS_msgrcv, 0, (sy_call_t *)sys_msgrcv }, #endif /* SYSVMSG */ { 0, 0, NULL } }; static int sysv_ipc_modcmd(modcmd_t cmd, void *arg) { int error = 0; switch (cmd) { case MODULE_CMD_INIT: /* Set up the kauth listener */ sysvipcinit(); /* Link the system calls */ error = syscall_establish(NULL, sysvipc_syscalls); if (error) { sysvipcfini(); return error; } /* * Initialize each sub-component, including their * sysctl data */ #ifdef SYSVSHM error = shminit(); if (error != 0) return error; #endif #ifdef SYSVSEM error = seminit(); if (error != 0) { #ifdef SYSVSHM shmfini(); #endif return error; } #endif #ifdef SYSVMSG error = msginit(); if (error != 0) { #ifdef SYSVSEM semfini(); #endif #ifdef SYSVSHM shmfini(); #endif return error; } #endif break; case MODULE_CMD_FINI: /* * Make sure no subcomponents are active. Each one * tells us if it is busy, and if it was _not_ busy, * we assume it has already done its own clean-up. * So we might need to re-init any components that * are successfully fini'd if we find one that is * still busy. */ #ifdef SYSVSHM if (shmfini()) { return EBUSY; } #endif #ifdef SYSVSEM if (semfini()) { #ifdef SYSVSHM shminit(); #endif return EBUSY; } #endif #ifdef SYSVMSG if (msgfini()) { #ifdef SYSVSEM seminit(); #endif #ifdef SYSVSHM shminit(); #endif return EBUSY; } #endif /* Unlink the system calls. */ error = syscall_disestablish(NULL, sysvipc_syscalls); if (error) return error; /* Remove the kauth listener */ sysvipcfini(); break; default: return ENOTTY; } return error; } static kauth_listener_t sysvipc_listener = NULL; static int sysvipc_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { mode_t mask; struct ipc_perm *perm; int mode; enum kauth_system_req req; req = (enum kauth_system_req)(uintptr_t)arg0; if (!(action == KAUTH_SYSTEM_SYSVIPC && req == KAUTH_REQ_SYSTEM_SYSVIPC_BYPASS)) return KAUTH_RESULT_DEFER; perm = arg1; mode = (int)(uintptr_t)arg2; if (mode == IPC_M) { if (kauth_cred_geteuid(cred) == perm->uid || kauth_cred_geteuid(cred) == perm->cuid) return (KAUTH_RESULT_ALLOW); return (KAUTH_RESULT_DEFER); /* EPERM */ } mask = 0; if (kauth_cred_geteuid(cred) == perm->uid || kauth_cred_geteuid(cred) == perm->cuid) { if (mode & IPC_R) mask |= S_IRUSR; if (mode & IPC_W) mask |= S_IWUSR; return ((perm->mode & mask) == mask ? 
KAUTH_RESULT_ALLOW : KAUTH_RESULT_DEFER /* EACCES */); } if (kauth_cred_groupmember(cred, perm->gid) == 0 || kauth_cred_groupmember(cred, perm->cgid) == 0) { if (mode & IPC_R) mask |= S_IRGRP; if (mode & IPC_W) mask |= S_IWGRP; return ((perm->mode & mask) == mask ? KAUTH_RESULT_ALLOW : KAUTH_RESULT_DEFER /* EACCES */); } if (mode & IPC_R) mask |= S_IROTH; if (mode & IPC_W) mask |= S_IWOTH; return ((perm->mode & mask) == mask ? KAUTH_RESULT_ALLOW : KAUTH_RESULT_DEFER /* EACCES */); } /* * Check for ipc permission */ int ipcperm(kauth_cred_t cred, struct ipc_perm *perm, int mode) { int error; error = kauth_authorize_system(cred, KAUTH_SYSTEM_SYSVIPC, KAUTH_REQ_SYSTEM_SYSVIPC_BYPASS, perm, KAUTH_ARG(mode), NULL); if (error == 0) return (0); /* Adjust EPERM and EACCES errors until there's a better way to do this. */ if (mode != IPC_M) error = EACCES; return error; } void sysvipcfini(void) { KASSERT(sysvipc_listener != NULL); kauth_unlisten_scope(sysvipc_listener); sysvipc_listener = NULL; } void sysvipcinit(void) { KASSERT(sysvipc_listener == NULL); sysvipc_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, sysvipc_listener_cb, NULL); } static int stub_sysvipc50_sysctl(SYSCTLFN_ARGS) { return EPASSTHROUGH; } static int sysctl_kern_sysvipc(SYSCTLFN_ARGS) { void *where = oldp; size_t sz, *sizep = oldlenp; #ifdef SYSVMSG struct msg_sysctl_info *msgsi = NULL; #endif #ifdef SYSVSEM struct sem_sysctl_info *semsi = NULL; #endif #ifdef SYSVSHM struct shm_sysctl_info *shmsi = NULL; #endif size_t infosize, dssize, tsize, buflen; void *bf = NULL; char *start; int32_t nds; int i, error, ret; /* * If present, call the compat sysctl() code. If it handles the request * completely (either success or error), return. Otherwise fallthrough * to the non-compat sysctl code. */ MODULE_HOOK_CALL(sysvipc_sysctl_50_hook, (SYSCTLFN_CALL(rnode)), stub_sysvipc50_sysctl(SYSCTLFN_CALL(rnode)), error); if (error != EPASSTHROUGH) return error; if (namelen != 1) return EINVAL; start = where; buflen = *sizep; switch (*name) { case KERN_SYSVIPC_MSG_INFO: #ifdef SYSVMSG infosize = sizeof(msgsi->msginfo); nds = msginfo.msgmni; dssize = sizeof(msgsi->msgids[0]); break; #else return EINVAL; #endif case KERN_SYSVIPC_SEM_INFO: #ifdef SYSVSEM infosize = sizeof(semsi->seminfo); nds = seminfo.semmni; dssize = sizeof(semsi->semids[0]); break; #else return EINVAL; #endif case KERN_SYSVIPC_SHM_INFO: #ifdef SYSVSHM infosize = sizeof(shmsi->shminfo); nds = shminfo.shmmni; dssize = sizeof(shmsi->shmids[0]); break; #else return EINVAL; #endif default: return EINVAL; } /* * Round infosize to 64 bit boundary if requesting more than just * the info structure or getting the total data size. */ if (where == NULL || *sizep > infosize) infosize = roundup(infosize, sizeof(quad_t)); tsize = infosize + nds * dssize; /* Return just the total size required. */ if (where == NULL) { *sizep = tsize; return 0; } /* Not enough room for even the info struct. */ if (buflen < infosize) { *sizep = 0; return ENOMEM; } sz = uimin(tsize, buflen); bf = kmem_zalloc(sz, KM_SLEEP); switch (*name) { #ifdef SYSVMSG case KERN_SYSVIPC_MSG_INFO: msgsi = (struct msg_sysctl_info *)bf; msgsi->msginfo = msginfo; break; #endif #ifdef SYSVSEM case KERN_SYSVIPC_SEM_INFO: semsi = (struct sem_sysctl_info *)bf; semsi->seminfo = seminfo; break; #endif #ifdef SYSVSHM case KERN_SYSVIPC_SHM_INFO: shmsi = (struct shm_sysctl_info *)bf; shmsi->shminfo = shminfo; break; #endif } buflen -= infosize; ret = 0; if (buflen > 0) { /* Fill in the IPC data structures. 
*/ for (i = 0; i < nds; i++) { if (buflen < dssize) { ret = ENOMEM; break; } switch (*name) { #ifdef SYSVMSG case KERN_SYSVIPC_MSG_INFO: mutex_enter(&msgmutex); SYSCTL_FILL_MSG(msqs[i].msq_u, msgsi->msgids[i]); mutex_exit(&msgmutex); break; #endif #ifdef SYSVSEM case KERN_SYSVIPC_SEM_INFO: SYSCTL_FILL_SEM(sema[i], semsi->semids[i]); break; #endif #ifdef SYSVSHM case KERN_SYSVIPC_SHM_INFO: SYSCTL_FILL_SHM(shmsegs[i], shmsi->shmids[i]); break; #endif } buflen -= dssize; } } *sizep -= buflen; error = copyout(bf, start, *sizep); /* If copyout succeeded, use return code set earlier. */ if (error == 0) error = ret; if (bf) kmem_free(bf, sz); return error; } SYSCTL_SETUP(sysctl_ipc_setup, "sysctl kern.ipc subtree setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ipc", SYSCTL_DESCR("SysV IPC options"), NULL, 0, NULL, 0, CTL_KERN, KERN_SYSVIPC, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "sysvipc_info", SYSCTL_DESCR("System V style IPC information"), sysctl_kern_sysvipc, 0, NULL, 0, CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_INFO, CTL_EOL); }
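/*
 * Illustrative userland sketch (not part of the kernel file above): the
 * sysctl handler implements the usual two-pass protocol, so a consumer
 * such as ipcs(1) would first ask for the required size and then fetch
 * the data.  Userland visibility of struct shm_sysctl_info through
 * <sys/shm.h> is assumed here, and error handling is kept minimal.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int mib[4] = { CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_INFO,
	    KERN_SYSVIPC_SHM_INFO };
	struct shm_sysctl_info *shmsi;
	size_t len;

	/* First pass: oldp == NULL, the handler returns the total size. */
	if (sysctl(mib, 4, NULL, &len, NULL, 0) == -1)
		return 1;
	if ((shmsi = malloc(len)) == NULL)
		return 1;
	/* Second pass: fetch the info structure plus the per-id array. */
	if (sysctl(mib, 4, shmsi, &len, NULL, 0) == -1)
		return 1;
	printf("shmmni = %llu\n",
	    (unsigned long long)shmsi->shminfo.shmmni);
	free(shmsi);
	return 0;
}
#endif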
/* $NetBSD: rndpseudo_50.c,v 1.7 2020/04/30 03:30:10 riastradh Exp $ */ /*- * Copyright (c) 1997-2011 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Michael Graff <explorer@flame.org> and Thor Lancelot Simon. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: rndpseudo_50.c,v 1.7 2020/04/30 03:30:10 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/file.h> #include <sys/module_hook.h> #include <sys/compat_stub.h> #include <compat/sys/rnd.h> #include <compat/common/compat_mod.h> /* * Convert from rndsource_t to rndsource50_t, for the results from * RNDGETNUM50 and RNDGETNAME50. */ static void rndsource_to_rndsource50(rndsource_t *r, rndsource50_t *r50) { memset(r50, 0, sizeof(*r50)); strlcpy(r50->name, r->name, sizeof(r50->name)); r50->total = r->total; r50->type = r->type; r50->flags = r->flags; } /* * COMPAT_50 handling for rnd_ioctl. This is called from rnd_ioctl. * * It also handles the case of (COMPAT_50 && COMPAT_NETBSD32).
*/ int compat_50_rnd_ioctl(struct file *fp, u_long cmd, void *addr) { int ret = 0; switch (cmd) { case RNDGETSRCNUM50: { rndstat_t rstbuf = {.start = 0}; rndstat50_t *rst50 = (rndstat50_t *)addr; size_t count; if (rst50->count > RND_MAXSTATCOUNT50) return EINVAL; rstbuf.start = rst50->start; rstbuf.count = rst50->count; ret = (fp->f_ops->fo_ioctl)(fp, RNDGETSRCNUM, &rstbuf); if (ret != 0) return ret; for (count = 0; count < rst50->count; count++) { rndsource_to_rndsource50(&rstbuf.source[count], &rst50->source[count]); } rst50->count = rstbuf.count; break; } case RNDGETSRCNAME50: { rndstat_name_t rstnmbuf = {.name[0] = 0}; rndstat_name50_t *rstnm50; rstnm50 = (rndstat_name50_t *)addr; strlcpy(rstnmbuf.name, rstnm50->name, sizeof(rstnmbuf.name)); ret = (fp->f_ops->fo_ioctl)(fp, RNDGETSRCNAME, &rstnmbuf); if (ret != 0) return ret; rndsource_to_rndsource50(&rstnmbuf.source, &rstnm50->source); break; } default: return ENOTTY; } return ret; } void rndpseudo_50_init(void) { MODULE_HOOK_SET(rnd_ioctl_50_hook, compat_50_rnd_ioctl); } void rndpseudo_50_fini(void) { MODULE_HOOK_UNSET(rnd_ioctl_50_hook); }
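/*
 * Illustrative sketch (not part of the file above): the base rnd(4) code
 * reaches compat_50_rnd_ioctl() through the hook installed by
 * rndpseudo_50_init().  The surrounding function here is hypothetical and
 * heavily simplified; only the MODULE_HOOK_CALL() dispatch itself mirrors
 * the real mechanism, with ENOTTY assumed as the unhooked default.
 */
#if 0
static int
example_rnd_ioctl_fallback(struct file *fp, u_long cmd, void *addr)
{
	int error;

	/*
	 * Offer an ioctl the base driver does not recognize to the
	 * COMPAT_50 module; when the hook is not set, the default
	 * expression (ENOTTY here) is used instead.
	 */
	MODULE_HOOK_CALL(rnd_ioctl_50_hook, (fp, cmd, addr), ENOTTY, error);
	return error;
}
#endif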
/* $NetBSD: tty_60.c,v 1.11 2021/07/21 06:35:44 skrll Exp $ */ /*- * Copyright (c) 2012 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Alan Barrett * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tty_60.c,v 1.11 2021/07/21 06:35:44 skrll Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/types.h> #include <sys/conf.h> #include <sys/errno.h> #include <sys/systm.h> #include <sys/compat_stub.h> #include <sys/kmem.h> #include <sys/tty.h> #include <compat/common/compat_mod.h> #include <compat/sys/ttycom.h> /* convert struct ptmget to struct compat_60_ptmget */ static int ptmget_to_ptmget60(struct ptmget *pg, struct compat_60_ptmget *pg60) { memset(pg60, 0, sizeof(*pg60)); pg60->cfd = pg->cfd; pg60->sfd = pg->sfd; strlcpy(pg60->cn, pg->cn, sizeof(pg60->cn)); strlcpy(pg60->sn, pg->sn, sizeof(pg60->sn)); if (strlen(pg->cn) >= sizeof(pg60->cn) || strlen(pg->sn) >= sizeof(pg60->sn)) return E2BIG; return 0; } /* Helper for compat ioctls that use struct compat_60_ptmget. */ static int compat_60_ptmget_ioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { int ret; u_long newcmd; struct ptmget *pg; const struct cdevsw *cd = cdevsw_lookup(dev); if (cd == NULL || cd->d_ioctl == NULL) return ENXIO; switch (cmd) { case COMPAT_60_TIOCPTMGET: newcmd = TIOCPTMGET; break; case COMPAT_60_TIOCPTSNAME: newcmd = TIOCPTSNAME; break; default: return ENOTTY; } pg = kmem_alloc(sizeof(*pg), KM_SLEEP); ret = (cd->d_ioctl)(dev, newcmd, pg, flag, l); if (ret != 0) goto out; ret = ptmget_to_ptmget60(pg, data); out: kmem_free(pg, sizeof(*pg)); return ret; } /* * COMPAT_60 versions of ttioctl and ptmioctl.
*/ int compat_60_ttioctl(struct tty *tp, u_long cmd, void *data, int flag, struct lwp *l) { switch (cmd) { case COMPAT_60_TIOCPTMGET: case COMPAT_60_TIOCPTSNAME: return compat_60_ptmget_ioctl(tp->t_dev, cmd, data, flag, l); default: return EPASSTHROUGH; } } int compat_60_ptmioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { switch (cmd) { case COMPAT_60_TIOCPTMGET: return compat_60_ptmget_ioctl(dev, cmd, data, flag, l); default: return EPASSTHROUGH; } } void kern_tty_60_init(void) { MODULE_HOOK_SET(tty_ttioctl_60_hook, compat_60_ttioctl); MODULE_HOOK_SET(tty_ptmioctl_60_hook, compat_60_ptmioctl); } void kern_tty_60_fini(void) { MODULE_HOOK_UNSET(tty_ttioctl_60_hook); MODULE_HOOK_UNSET(tty_ptmioctl_60_hook); }
/* $NetBSD: exec_elf64.c,v 1.8 2019/11/20 19:37:53 pgoyette Exp $ */ /* * Copyright (c) 1996 Christopher G. Demetriou * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christopher G. Demetriou * for the NetBSD Project. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: exec_elf64.c,v 1.8 2019/11/20 19:37:53 pgoyette Exp $"); #define ELFSIZE 64 #include "exec_elf.c" #include <sys/module.h> #define ELF64_AUXSIZE (ELF_AUX_ENTRIES * sizeof(Aux64Info) \ + MAXPATHLEN + ALIGN(1)) MODULE(MODULE_CLASS_EXEC, exec_elf64, NULL); static struct execsw exec_elf64_execsw[] = { /* Native Elf64 */ { .es_hdrsz = sizeof (Elf64_Ehdr), .es_makecmds = exec_elf64_makecmds, .u = { .elf_probe_func = netbsd_elf64_probe, }, .es_emul = &emul_netbsd, .es_prio = EXECSW_PRIO_FIRST, .es_arglen = ELF64_AUXSIZE, .es_copyargs = elf64_copyargs, .es_setregs = NULL, .es_coredump = coredump_elf64, .es_setup_stack = exec_setup_stack, }, #if EXEC_ELF_NOTELESS /* Generic Elf64 -- run at NetBSD Elf64 */ { .es_hdrsz = sizeof (Elf64_Ehdr), .es_makecmds = exec_elf64_makecmds, .u = { .elf_probe_func = NULL, }, .es_emul = &emul_netbsd, .es_prio = EXECSW_PRIO_ANY, .es_arglen = ELF64_AUXSIZE, .es_copyargs = elf64_copyargs, .es_setregs = NULL, .es_coredump = coredump_elf64, .es_setup_stack = exec_setup_stack, }, #endif }; static int exec_elf64_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return exec_add(exec_elf64_execsw, __arraycount(exec_elf64_execsw)); case MODULE_CMD_FINI: return exec_remove(exec_elf64_execsw, __arraycount(exec_elf64_execsw)); default: return ENOTTY; } }
/* $NetBSD: tty_subr.c,v 1.43 2019/12/27 09:41:51 msaitoh Exp $ */ /* * Copyright (c) 1993, 1994 Theo de Raadt * All rights reserved. * * Per Lindqvist <pgd@compuram.bbt.se> supplied an almost fully working * set of true clist functions that this is very loosely based on. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tty_subr.c,v 1.43 2019/12/27 09:41:51 msaitoh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/ioctl.h> #include <sys/tty.h> #include <sys/kmem.h> /* * At compile time, choose: * There are two ways the TTY_QUOTE bit can be stored. If QBITS is * defined we allocate an array of bits -- 1/8th as much memory but * setbit(), clrbit(), and isset() take more CPU. If QBITS is * undefined, we just use an array of bytes. * * If TTY_QUOTE functionality isn't required by a line discipline, * it can free c_cq and set it to NULL. This speeds things up, * and also does not use any extra memory. This is useful for (say) * a SLIP line discipline that wants a 32K ring buffer for data * but doesn't need quoting. */ #define QBITS #ifdef QBITS #define QMEM(n) ((((n)-1)/NBBY)+1) #else #define QMEM(n) (n) #endif #ifdef QBITS static void clrbits(u_char *, unsigned int, unsigned int); #endif /* * Initialize a particular clist. Ok, they are really ring buffers, * of the specified length, with/without quoting support. */ int clalloc(struct clist *clp, int size, int quot) { clp->c_cs = kmem_zalloc(size, KM_SLEEP); if (quot) clp->c_cq = kmem_zalloc(QMEM(size), KM_SLEEP); else clp->c_cq = NULL; clp->c_cf = clp->c_cl = NULL; clp->c_ce = clp->c_cs + size; clp->c_cn = size; clp->c_cc = 0; return (0); } void clfree(struct clist *clp) { if (clp->c_cs) kmem_free(clp->c_cs, clp->c_cn); if (clp->c_cq) kmem_free(clp->c_cq, QMEM(clp->c_cn)); clp->c_cs = clp->c_cq = NULL; } /* * Get a character from a clist. */ int getc(struct clist *clp) { int c = -1; int s; s = spltty(); if (clp->c_cc == 0) goto out; c = *clp->c_cf & 0xff; if (clp->c_cq) { #ifdef QBITS if (isset(clp->c_cq, clp->c_cf - clp->c_cs) ) c |= TTY_QUOTE; #else if (*(clp->c_cf - clp->c_cs + clp->c_cq)) c |= TTY_QUOTE; #endif } *clp->c_cf = 0; /* wipe out to avoid information disclosure */ if (++clp->c_cf == clp->c_ce) clp->c_cf = clp->c_cs; if (--clp->c_cc == 0) clp->c_cf = clp->c_cl = (u_char *)0; out: splx(s); return c; } /* * Copy clist to buffer. * Return number of bytes moved. */ int q_to_b(struct clist *clp, u_char *cp, int count) { int cc; u_char *p = cp; int s; s = spltty(); /* optimize this while loop */ while (count > 0 && clp->c_cc > 0) { cc = clp->c_cl - clp->c_cf; if (clp->c_cf >= clp->c_cl) cc = clp->c_ce - clp->c_cf; if (cc > count) cc = count; memcpy(p, clp->c_cf, cc); count -= cc; p += cc; clp->c_cc -= cc; clp->c_cf += cc; if (clp->c_cf == clp->c_ce) clp->c_cf = clp->c_cs; } if (clp->c_cc == 0) clp->c_cf = clp->c_cl = (u_char *)0; splx(s); return p - cp; } /* * Return count of contiguous characters in clist. * Stop counting if flag&character is non-null. */ int ndqb(struct clist *clp, int flag) { int count = 0; int i; int cc; int s; s = spltty(); if ((cc = clp->c_cc) == 0) goto out; if (flag == 0) { count = clp->c_cl - clp->c_cf; if (count <= 0) count = clp->c_ce - clp->c_cf; goto out; } i = clp->c_cf - clp->c_cs; if (flag & TTY_QUOTE) { while (cc-- > 0 && !(clp->c_cs[i++] & (flag & ~TTY_QUOTE) || isset(clp->c_cq, i))) { count++; if (i == clp->c_cn) break; } } else { while (cc-- > 0 && !(clp->c_cs[i++] & flag)) { count++; if (i == clp->c_cn) break; } } out: splx(s); return count; } /* * Flush count bytes from clist. 
*/ void ndflush(struct clist *clp, int count) { int cc; int s; s = spltty(); if (count == clp->c_cc) { clp->c_cc = 0; clp->c_cf = clp->c_cl = (u_char *)0; goto out; } /* optimize this while loop */ while (count > 0 && clp->c_cc > 0) { cc = clp->c_cl - clp->c_cf; if (clp->c_cf >= clp->c_cl) cc = clp->c_ce - clp->c_cf; if (cc > count) cc = count; count -= cc; clp->c_cc -= cc; clp->c_cf += cc; if (clp->c_cf == clp->c_ce) clp->c_cf = clp->c_cs; } if (clp->c_cc == 0) clp->c_cf = clp->c_cl = (u_char *)0; out: splx(s); } /* * Put a character into the output queue. */ int putc(int c, struct clist *clp) { int i; int s; s = spltty(); if (clp->c_cc == clp->c_cn) goto out; if (clp->c_cc == 0) { if (!clp->c_cs) { #if defined(DIAGNOSTIC) || 1 printf("putc: required clalloc\n"); #endif if (clalloc(clp, clp->c_cn, 1)) { out: splx(s); return -1; } } clp->c_cf = clp->c_cl = clp->c_cs; } *clp->c_cl = c & 0xff; i = clp->c_cl - clp->c_cs; if (clp->c_cq) { #ifdef QBITS if (c & TTY_QUOTE) setbit(clp->c_cq, i); else clrbit(clp->c_cq, i); #else q = clp->c_cq + i; *q = (c & TTY_QUOTE) ? 1 : 0; #endif } clp->c_cc++; clp->c_cl++; if (clp->c_cl == clp->c_ce) clp->c_cl = clp->c_cs; splx(s); return 0; } #ifdef QBITS /* * optimized version of * * for (i = 0; i < len; i++) * clrbit(cp, off + len); */ static void clrbits(u_char *cp, unsigned int off, unsigned int len) { unsigned int sbi, ebi; u_char *scp, *ecp; unsigned int end; unsigned char mask; scp = cp + off / NBBY; sbi = off % NBBY; end = off + len + NBBY - 1; ecp = cp + end / NBBY - 1; ebi = end % NBBY + 1; if (scp >= ecp) { mask = ((1 << len) - 1) << sbi; *scp &= ~mask; } else { mask = (1 << sbi) - 1; *scp++ &= mask; mask = (1 << ebi) - 1; *ecp &= ~mask; while (scp < ecp) *scp++ = 0x00; } } #endif /* * Copy buffer to clist. * Return number of bytes not transferred. */ int b_to_q(const u_char *cp, int count, struct clist *clp) { int cc; const u_char *p = cp; int s; if (count <= 0) return 0; s = spltty(); if (clp->c_cc == clp->c_cn) goto out; if (clp->c_cc == 0) { if (!clp->c_cs) { #if defined(DIAGNOSTIC) || 1 printf("b_to_q: required clalloc\n"); #endif if (clalloc(clp, clp->c_cn, 1)) goto out; } clp->c_cf = clp->c_cl = clp->c_cs; } /* optimize this while loop */ while (count > 0 && clp->c_cc < clp->c_cn) { cc = clp->c_ce - clp->c_cl; if (clp->c_cf > clp->c_cl) cc = clp->c_cf - clp->c_cl; if (cc > count) cc = count; memcpy(clp->c_cl, p, cc); if (clp->c_cq) { #ifdef QBITS clrbits(clp->c_cq, clp->c_cl - clp->c_cs, cc); #else memset(clp->c_cl - clp->c_cs + clp->c_cq, 0, cc); #endif } p += cc; count -= cc; clp->c_cc += cc; clp->c_cl += cc; if (clp->c_cl == clp->c_ce) clp->c_cl = clp->c_cs; } out: splx(s); return count; } static int tty_global_cc; /* * Given a non-NULL pointer into the clist return the pointer * to the next character in the list or return NULL if no more chars. * * Callers must not allow getc's to happen between firstc's and getc's * so that the pointer becomes invalid. Note that interrupts are NOT * masked. */ u_char * nextc(struct clist *clp, u_char *cp, int *c) { if (clp->c_cf == cp) { /* * First time initialization. 
*/ tty_global_cc = clp->c_cc; } if (tty_global_cc == 0 || cp == NULL) return NULL; if (--tty_global_cc == 0) return NULL; if (++cp == clp->c_ce) cp = clp->c_cs; *c = *cp & 0xff; if (clp->c_cq) { #ifdef QBITS if (isset(clp->c_cq, cp - clp->c_cs)) *c |= TTY_QUOTE; #else if (*(clp->c_cf - clp->c_cs + clp->c_cq)) *c |= TTY_QUOTE; #endif } return cp; } /* * Given a non-NULL pointer into the clist return the pointer * to the first character in the list or return NULL if no more chars. * * Callers must not allow getc's to happen between firstc's and getc's * so that the pointer becomes invalid. Note that interrupts are NOT * masked. * * *c is set to the NEXT character */ u_char * firstc(struct clist *clp, int *c) { u_char *cp; tty_global_cc = clp->c_cc; if (tty_global_cc == 0) return NULL; cp = clp->c_cf; *c = *cp & 0xff; if (clp->c_cq) { #ifdef QBITS if (isset(clp->c_cq, cp - clp->c_cs)) *c |= TTY_QUOTE; #else if (*(cp - clp->c_cs + clp->c_cq)) *c |= TTY_QUOTE; #endif } return clp->c_cf; } /* * Remove the last character in the clist and return it. */ int unputc(struct clist *clp) { unsigned int c = -1; int s; s = spltty(); if (clp->c_cc == 0) goto out; if (clp->c_cl == clp->c_cs) clp->c_cl = clp->c_ce - 1; else --clp->c_cl; clp->c_cc--; c = *clp->c_cl & 0xff; if (clp->c_cq) { #ifdef QBITS if (isset(clp->c_cq, clp->c_cl - clp->c_cs)) c |= TTY_QUOTE; #else if (*(clp->c_cf - clp->c_cs + clp->c_cq)) c |= TTY_QUOTE; #endif } if (clp->c_cc == 0) clp->c_cf = clp->c_cl = (u_char *)0; out: splx(s); return c; } /* * Put the chars in the from queue on the end of the to queue. */ void catq(struct clist *from, struct clist *to) { int c; while ((c = getc(from)) != -1) putc(c, to); }
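/*
 * Illustrative sketch (not part of the file above): round-tripping a
 * quoted character through the clist routines.  With QBITS defined the
 * quote flags cost one bit per character, so QMEM(1024) is 128 bytes
 * rather than the 1024 bytes an unpacked byte array would need.  The
 * function name is hypothetical and kernel context (spltty, kmem) is
 * assumed.
 */
#if 0
static void
example_clist_roundtrip(void)
{
	struct clist cl;
	int c;

	clalloc(&cl, 1024, 1);		/* 1024-char ring with quote bits */
	putc('x' | TTY_QUOTE, &cl);	/* store 'x' with its quote bit */
	c = getc(&cl);			/* returns 'x' | TTY_QUOTE */
	KASSERT((c & TTY_QUOTE) != 0);
	clfree(&cl);
}
#endif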
/* $NetBSD: lfs_accessors.h,v 1.51 2022/04/24 20:32:44 rillig Exp $ */ /* from NetBSD: lfs.h,v 1.165 2015/07/24 06:59:32 dholland Exp */ /* from NetBSD: dinode.h,v 1.25 2016/01/22 23:06:10 dholland Exp */ /* from NetBSD: dir.h,v 1.25 2015/09/01 06:16:03 dholland Exp */ /*- * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Konrad E.
Schroder <perseant@hhhh.org>. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)lfs.h 8.9 (Berkeley) 5/8/95 */ /* * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. 
or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)dinode.h 8.9 (Berkeley) 3/29/95 */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)dir.h 8.5 (Berkeley) 4/27/95 */ #ifndef _UFS_LFS_LFS_ACCESSORS_H_ #define _UFS_LFS_LFS_ACCESSORS_H_ #if defined(_KERNEL_OPT) #include "opt_lfs.h" #endif #include <sys/bswap.h> #include <ufs/lfs/lfs.h> #if !defined(_KERNEL) && !defined(_STANDALONE) #include <assert.h> #include <string.h> #define KASSERT assert #else #include <sys/systm.h> #endif /* * STRUCT_LFS is used by the libsa code to get accessors that work * with struct salfs instead of struct lfs, and by the cleaner to * get accessors that work with struct clfs. */ #ifndef STRUCT_LFS #define STRUCT_LFS struct lfs #endif /* * byte order */ /* * For now at least, the bootblocks shall not be endian-independent. * We can see later if it fits in the size budget. Also disable the * byteswapping if LFS_EI is off. * * Caution: these functions "know" that bswap16/32/64 are unsigned, * and if that changes will likely break silently. */ #if defined(_STANDALONE) || (defined(_KERNEL) && !defined(LFS_EI)) #define LFS_SWAP_int16_t(fs, val) (val) #define LFS_SWAP_int32_t(fs, val) (val) #define LFS_SWAP_int64_t(fs, val) (val) #define LFS_SWAP_uint16_t(fs, val) (val) #define LFS_SWAP_uint32_t(fs, val) (val) #define LFS_SWAP_uint64_t(fs, val) (val) #else #define LFS_SWAP_int16_t(fs, val) \ ((fs)->lfs_dobyteswap ? (int16_t)bswap16(val) : (val)) #define LFS_SWAP_int32_t(fs, val) \ ((fs)->lfs_dobyteswap ? (int32_t)bswap32(val) : (val)) #define LFS_SWAP_int64_t(fs, val) \ ((fs)->lfs_dobyteswap ? (int64_t)bswap64(val) : (val)) #define LFS_SWAP_uint16_t(fs, val) \ ((fs)->lfs_dobyteswap ? bswap16(val) : (val)) #define LFS_SWAP_uint32_t(fs, val) \ ((fs)->lfs_dobyteswap ? bswap32(val) : (val)) #define LFS_SWAP_uint64_t(fs, val) \ ((fs)->lfs_dobyteswap ? bswap64(val) : (val)) #endif /* * For handling directories we will need to know if the volume is * little-endian. */ #if BYTE_ORDER == LITTLE_ENDIAN #define LFS_LITTLE_ENDIAN_ONDISK(fs) (!(fs)->lfs_dobyteswap) #else #define LFS_LITTLE_ENDIAN_ONDISK(fs) ((fs)->lfs_dobyteswap) #endif /* * Suppress spurious warnings -- we use * * type *foo = &obj->member; * * in macros to verify that obj->member has the right type. When the * object is a packed structure with misaligned members, this causes * some compiles to squeal that taking the address might lead to * undefined behaviour later on -- which is helpful in general, not * relevant in this case, because we don't do anything with foo * afterward; we only declare it to get a type check and then we * discard it. */ #ifdef __GNUC__ #if defined(__clang__) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Waddress-of-packed-member" #elif __GNUC_PREREQ__(9,0) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Waddress-of-packed-member" #endif #endif /* * directories */ #define LFS_DIRHEADERSIZE(fs) \ ((fs)->lfs_is64 ? sizeof(struct lfs_dirheader64) : sizeof(struct lfs_dirheader32)) /* * The LFS_DIRSIZ macro gives the minimum record length which will hold * the directory entry. This requires the amount of space in struct lfs_direct * without the d_name field, plus enough space for the name with a terminating * null byte (dp->d_namlen+1), rounded up to a 4 byte boundary. */ #define LFS_DIRECTSIZ(fs, namlen) \ (LFS_DIRHEADERSIZE(fs) + (((namlen)+1 + 3) &~ 3)) /* * The size of the largest possible directory entry. This is * used by ulfs_dirhash to figure the size of an array, so we * need a single constant value true for both lfs32 and lfs64. 
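 * (Illustrative arithmetic added here, not in the original text: the
 * 4-byte name padding used by LFS_DIRECTSIZ above means that, e.g., a
 * 5-character name plus its NUL terminator occupies (5+1+3) & ~3 = 8
 * bytes on top of the directory header; this constant applies the same
 * rounding to the worst-case name length with the larger 64-bit header.)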
*/ #define LFS_MAXDIRENTRYSIZE \ (sizeof(struct lfs_dirheader64) + (((LFS_MAXNAMLEN+1)+1 + 3) & ~3)) #if (BYTE_ORDER == LITTLE_ENDIAN) #define LFS_OLDDIRSIZ(oldfmt, dp, needswap) \ (((oldfmt) && !(needswap)) ? \ LFS_DIRECTSIZ((dp)->d_type) : LFS_DIRECTSIZ((dp)->d_namlen)) #else #define LFS_OLDDIRSIZ(oldfmt, dp, needswap) \ (((oldfmt) && (needswap)) ? \ LFS_DIRECTSIZ((dp)->d_type) : LFS_DIRECTSIZ((dp)->d_namlen)) #endif #define LFS_DIRSIZ(fs, dp) LFS_DIRECTSIZ(fs, lfs_dir_getnamlen(fs, dp)) /* Constants for the first argument of LFS_OLDDIRSIZ */ #define LFS_OLDDIRFMT 1 #define LFS_NEWDIRFMT 0 #define LFS_NEXTDIR(fs, dp) \ ((LFS_DIRHEADER *)((char *)(dp) + lfs_dir_getreclen(fs, dp))) static __inline char * lfs_dir_nameptr(const STRUCT_LFS *fs, LFS_DIRHEADER *dh) { if (fs->lfs_is64) { return (char *)(&dh->u_64 + 1); } else { return (char *)(&dh->u_32 + 1); } } static __inline uint64_t lfs_dir_getino(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh) { if (fs->lfs_is64) { return LFS_SWAP_uint64_t(fs, dh->u_64.dh_ino); } else { return LFS_SWAP_uint32_t(fs, dh->u_32.dh_ino); } } static __inline uint16_t lfs_dir_getreclen(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh) { if (fs->lfs_is64) { return LFS_SWAP_uint16_t(fs, dh->u_64.dh_reclen); } else { return LFS_SWAP_uint16_t(fs, dh->u_32.dh_reclen); } } static __inline uint8_t lfs_dir_gettype(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh) { if (fs->lfs_is64) { KASSERT(fs->lfs_hasolddirfmt == 0); return dh->u_64.dh_type; } else if (fs->lfs_hasolddirfmt) { return LFS_DT_UNKNOWN; } else { return dh->u_32.dh_type; } } static __inline uint8_t lfs_dir_getnamlen(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh) { if (fs->lfs_is64) { KASSERT(fs->lfs_hasolddirfmt == 0); return dh->u_64.dh_namlen; } else if (fs->lfs_hasolddirfmt && LFS_LITTLE_ENDIAN_ONDISK(fs)) { /* low-order byte of old 16-bit namlen field */ return dh->u_32.dh_type; } else { return dh->u_32.dh_namlen; } } static __inline void lfs_dir_setino(STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint64_t ino) { if (fs->lfs_is64) { dh->u_64.dh_ino = LFS_SWAP_uint64_t(fs, ino); } else { dh->u_32.dh_ino = LFS_SWAP_uint32_t(fs, ino); } } static __inline void lfs_dir_setreclen(STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint16_t reclen) { if (fs->lfs_is64) { dh->u_64.dh_reclen = LFS_SWAP_uint16_t(fs, reclen); } else { dh->u_32.dh_reclen = LFS_SWAP_uint16_t(fs, reclen); } } static __inline void lfs_dir_settype(const STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint8_t type) { if (fs->lfs_is64) { KASSERT(fs->lfs_hasolddirfmt == 0); dh->u_64.dh_type = type; } else if (fs->lfs_hasolddirfmt) { /* do nothing */ return; } else { dh->u_32.dh_type = type; } } static __inline void lfs_dir_setnamlen(const STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint8_t namlen) { if (fs->lfs_is64) { KASSERT(fs->lfs_hasolddirfmt == 0); dh->u_64.dh_namlen = namlen; } else if (fs->lfs_hasolddirfmt && LFS_LITTLE_ENDIAN_ONDISK(fs)) { /* low-order byte of old 16-bit namlen field */ dh->u_32.dh_type = namlen; } else { dh->u_32.dh_namlen = namlen; } } static __inline void lfs_copydirname(STRUCT_LFS *fs, char *dest, const char *src, unsigned namlen, unsigned reclen) { unsigned spacelen; KASSERT(reclen > LFS_DIRHEADERSIZE(fs)); spacelen = reclen - LFS_DIRHEADERSIZE(fs); /* must always be at least 1 byte as a null terminator */ KASSERT(spacelen > namlen); memcpy(dest, src, namlen); memset(dest + namlen, '\0', spacelen - namlen); } static __inline LFS_DIRHEADER * lfs_dirtemplate_dotdot(STRUCT_LFS *fs, union lfs_dirtemplate *dt) { /* XXX blah, be nice to have a way to do this w/o casts 
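 * (Descriptive note added here, not in the original: the dirtemplate
 * union stores the concrete 32-bit and 64-bit header structs, so these
 * helpers cast them to the LFS_DIRHEADER view that the accessors above
 * expect.)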
*/ if (fs->lfs_is64) { return (LFS_DIRHEADER *)&dt->u_64.dotdot_header; } else { return (LFS_DIRHEADER *)&dt->u_32.dotdot_header; } } static __inline char * lfs_dirtemplate_dotdotname(STRUCT_LFS *fs, union lfs_dirtemplate *dt) { if (fs->lfs_is64) { return dt->u_64.dotdot_name; } else { return dt->u_32.dotdot_name; } } /* * dinodes */ /* * Maximum length of a symlink that can be stored within the inode. */ #define LFS32_MAXSYMLINKLEN ((ULFS_NDADDR + ULFS_NIADDR) * sizeof(int32_t)) #define LFS64_MAXSYMLINKLEN ((ULFS_NDADDR + ULFS_NIADDR) * sizeof(int64_t)) #define LFS_MAXSYMLINKLEN(fs) \ ((fs)->lfs_is64 ? LFS64_MAXSYMLINKLEN : LFS32_MAXSYMLINKLEN) #define DINOSIZE(fs) ((fs)->lfs_is64 ? sizeof(struct lfs64_dinode) : sizeof(struct lfs32_dinode)) #define DINO_IN_BLOCK(fs, base, ix) \ ((union lfs_dinode *)((char *)(base) + DINOSIZE(fs) * (ix))) static __inline void lfs_copy_dinode(STRUCT_LFS *fs, union lfs_dinode *dst, const union lfs_dinode *src) { /* * We can do structure assignment of the structs, but not of * the whole union, as the union is the size of the (larger) * 64-bit struct and on a 32-bit fs the upper half of it might * be off the end of a buffer or otherwise invalid. */ if (fs->lfs_is64) { dst->u_64 = src->u_64; } else { dst->u_32 = src->u_32; } } #define LFS_DEF_DINO_ACCESSOR(type, type32, field) \ static __inline type \ lfs_dino_get##field(STRUCT_LFS *fs, union lfs_dinode *dip) \ { \ if (fs->lfs_is64) { \ return LFS_SWAP_##type(fs, dip->u_64.di_##field); \ } else { \ return LFS_SWAP_##type32(fs, dip->u_32.di_##field); \ } \ } \ static __inline void \ lfs_dino_set##field(STRUCT_LFS *fs, union lfs_dinode *dip, type val) \ { \ if (fs->lfs_is64) { \ type *p = &dip->u_64.di_##field; \ (void)p; \ dip->u_64.di_##field = LFS_SWAP_##type(fs, val); \ } else { \ type32 *p = &dip->u_32.di_##field; \ (void)p; \ dip->u_32.di_##field = LFS_SWAP_##type32(fs, val); \ } \ } \ LFS_DEF_DINO_ACCESSOR(uint16_t, uint16_t, mode) LFS_DEF_DINO_ACCESSOR(int16_t, int16_t, nlink) LFS_DEF_DINO_ACCESSOR(uint64_t, uint32_t, inumber) LFS_DEF_DINO_ACCESSOR(uint64_t, uint64_t, size) LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, atime) LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, atimensec) LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, mtime) LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, mtimensec) LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, ctime) LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, ctimensec) LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, flags) LFS_DEF_DINO_ACCESSOR(uint64_t, uint32_t, blocks) LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, gen) LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, uid) LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, gid) /* XXX this should be done differently (it's a fake field) */ LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, rdev) static __inline daddr_t lfs_dino_getdb(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix) { KASSERT(ix < ULFS_NDADDR); if (fs->lfs_is64) { return LFS_SWAP_int64_t(fs, dip->u_64.di_db[ix]); } else { /* note: this must sign-extend or UNWRITTEN gets trashed */ return (int32_t)LFS_SWAP_int32_t(fs, dip->u_32.di_db[ix]); } } static __inline daddr_t lfs_dino_getib(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix) { KASSERT(ix < ULFS_NIADDR); if (fs->lfs_is64) { return LFS_SWAP_int64_t(fs, dip->u_64.di_ib[ix]); } else { /* note: this must sign-extend or UNWRITTEN gets trashed */ return (int32_t)LFS_SWAP_int32_t(fs, dip->u_32.di_ib[ix]); } } static __inline void lfs_dino_setdb(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix, daddr_t val) { KASSERT(ix < ULFS_NDADDR); if (fs->lfs_is64) { dip->u_64.di_db[ix] = 
LFS_SWAP_int64_t(fs, val); } else { dip->u_32.di_db[ix] = LFS_SWAP_uint32_t(fs, val); } } static __inline void lfs_dino_setib(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix, daddr_t val) { KASSERT(ix < ULFS_NIADDR); if (fs->lfs_is64) { dip->u_64.di_ib[ix] = LFS_SWAP_int64_t(fs, val); } else { dip->u_32.di_ib[ix] = LFS_SWAP_uint32_t(fs, val); } } /* birthtime is present only in the 64-bit inode */ static __inline void lfs_dino_setbirthtime(STRUCT_LFS *fs, union lfs_dinode *dip, const struct timespec *ts) { if (fs->lfs_is64) { dip->u_64.di_birthtime = ts->tv_sec; dip->u_64.di_birthnsec = ts->tv_nsec; } else { /* drop it on the floor */ } } /* * indirect blocks */ static __inline daddr_t lfs_iblock_get(STRUCT_LFS *fs, void *block, unsigned ix) { if (fs->lfs_is64) { // XXX re-enable these asserts after reorging this file //KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int64_t)); return (daddr_t)(((int64_t *)block)[ix]); } else { //KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int32_t)); /* must sign-extend or UNWRITTEN gets trashed */ return (daddr_t)(int64_t)(((int32_t *)block)[ix]); } } static __inline void lfs_iblock_set(STRUCT_LFS *fs, void *block, unsigned ix, daddr_t val) { if (fs->lfs_is64) { //KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int64_t)); ((int64_t *)block)[ix] = val; } else { //KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int32_t)); ((int32_t *)block)[ix] = val; } } /* * "struct buf" associated definitions */ # define LFS_LOCK_BUF(bp) do { \ if (((bp)->b_flags & B_LOCKED) == 0 && bp->b_iodone == NULL) { \ mutex_enter(&lfs_lock); \ ++locked_queue_count; \ locked_queue_bytes += bp->b_bufsize; \ mutex_exit(&lfs_lock); \ } \ (bp)->b_flags |= B_LOCKED; \ } while (0) # define LFS_UNLOCK_BUF(bp) do { \ if (((bp)->b_flags & B_LOCKED) != 0 && bp->b_iodone == NULL) { \ mutex_enter(&lfs_lock); \ --locked_queue_count; \ locked_queue_bytes -= bp->b_bufsize; \ if (locked_queue_count < LFS_WAIT_BUFS && \ locked_queue_bytes < LFS_WAIT_BYTES) \ cv_broadcast(&locked_queue_cv); \ mutex_exit(&lfs_lock); \ } \ (bp)->b_flags &= ~B_LOCKED; \ } while (0) /* * "struct inode" associated definitions */ #define LFS_SET_UINO(ip, states) do { \ if (((states) & IN_ACCESSED) && !((ip)->i_state & IN_ACCESSED)) \ lfs_sb_adduinodes((ip)->i_lfs, 1); \ if (((states) & IN_CLEANING) && !((ip)->i_state & IN_CLEANING)) \ lfs_sb_adduinodes((ip)->i_lfs, 1); \ if (((states) & IN_MODIFIED) && !((ip)->i_state & IN_MODIFIED)) \ lfs_sb_adduinodes((ip)->i_lfs, 1); \ (ip)->i_state |= (states); \ } while (0) #define LFS_CLR_UINO(ip, states) do { \ if (((states) & IN_ACCESSED) && ((ip)->i_state & IN_ACCESSED)) \ lfs_sb_subuinodes((ip)->i_lfs, 1); \ if (((states) & IN_CLEANING) && ((ip)->i_state & IN_CLEANING)) \ lfs_sb_subuinodes((ip)->i_lfs, 1); \ if (((states) & IN_MODIFIED) && ((ip)->i_state & IN_MODIFIED)) \ lfs_sb_subuinodes((ip)->i_lfs, 1); \ (ip)->i_state &= ~(states); \ if (lfs_sb_getuinodes((ip)->i_lfs) < 0) { \ panic("lfs_uinodes < 0"); \ } \ } while (0) #define LFS_ITIMES(ip, acc, mod, cre) \ while ((ip)->i_state & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY)) \ lfs_itimes(ip, acc, mod, cre) /* * On-disk and in-memory checkpoint segment usage structure. */ #define SEGUPB(fs) (lfs_sb_getsepb(fs)) #define SEGTABSIZE_SU(fs) \ ((lfs_sb_getnseg(fs) + SEGUPB(fs) - 1) / lfs_sb_getsepb(fs)) #ifdef _KERNEL # define SHARE_IFLOCK(F) \ do { \ rw_enter(&(F)->lfs_iflock, RW_READER); \ } while(0) # define UNSHARE_IFLOCK(F) \ do { \ rw_exit(&(F)->lfs_iflock); \ } while(0) #else /* ! 
_KERNEL */ # define SHARE_IFLOCK(F) # define UNSHARE_IFLOCK(F) #endif /* ! _KERNEL */ /* Read in the block with a specific segment usage entry from the ifile. */ #define LFS_SEGENTRY(SP, F, IN, BP) do { \ int _e; \ SHARE_IFLOCK(F); \ VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \ if ((_e = bread((F)->lfs_ivnode, \ ((IN) / lfs_sb_getsepb(F)) + lfs_sb_getcleansz(F), \ lfs_sb_getbsize(F), 0, &(BP))) != 0) \ panic("lfs: ifile read: segentry %llu: error %d\n", \ (unsigned long long)(IN), _e); \ if (lfs_sb_getversion(F) == 1) \ (SP) = (SEGUSE *)((SEGUSE_V1 *)(BP)->b_data + \ ((IN) & (lfs_sb_getsepb(F) - 1))); \ else \ (SP) = (SEGUSE *)(BP)->b_data + ((IN) % lfs_sb_getsepb(F)); \ UNSHARE_IFLOCK(F); \ } while (0) #define LFS_WRITESEGENTRY(SP, F, IN, BP) do { \ if ((SP)->su_nbytes == 0) \ (SP)->su_flags |= SEGUSE_EMPTY; \ else \ (SP)->su_flags &= ~SEGUSE_EMPTY; \ (F)->lfs_suflags[(F)->lfs_activesb][(IN)] = (SP)->su_flags; \ LFS_BWRITE_LOG(BP); \ } while (0) /* * FINFO (file info) entries. */ /* Size of an on-disk block pointer, e.g. in an indirect block. */ /* XXX: move to a more suitable location in this file */ #define LFS_BLKPTRSIZE(fs) ((fs)->lfs_is64 ? sizeof(int64_t) : sizeof(int32_t)) /* Size of an on-disk inode number. */ /* XXX: move to a more suitable location in this file */ #define LFS_INUMSIZE(fs) ((fs)->lfs_is64 ? sizeof(int64_t) : sizeof(int32_t)) /* size of a FINFO, without the block pointers */ #define FINFOSIZE(fs) ((fs)->lfs_is64 ? sizeof(FINFO64) : sizeof(FINFO32)) /* Full size of the provided FINFO record, including its block pointers. */ #define FINFO_FULLSIZE(fs, fip) \ (FINFOSIZE(fs) + lfs_fi_getnblocks(fs, fip) * LFS_BLKPTRSIZE(fs)) #define NEXT_FINFO(fs, fip) \ ((FINFO *)((char *)(fip) + FINFO_FULLSIZE(fs, fip))) #define LFS_DEF_FI_ACCESSOR(type, type32, field) \ static __inline type \ lfs_fi_get##field(STRUCT_LFS *fs, FINFO *fip) \ { \ if (fs->lfs_is64) { \ return fip->u_64.fi_##field; \ } else { \ return fip->u_32.fi_##field; \ } \ } \ static __inline void \ lfs_fi_set##field(STRUCT_LFS *fs, FINFO *fip, type val) \ { \ if (fs->lfs_is64) { \ type *p = &fip->u_64.fi_##field; \ (void)p; \ fip->u_64.fi_##field = val; \ } else { \ type32 *p = &fip->u_32.fi_##field; \ (void)p; \ fip->u_32.fi_##field = val; \ } \ } \ LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, nblocks) LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, version) LFS_DEF_FI_ACCESSOR(uint64_t, uint32_t, ino) LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, lastlength) static __inline daddr_t lfs_fi_getblock(STRUCT_LFS *fs, FINFO *fip, unsigned idx) { void *firstblock; firstblock = (char *)fip + FINFOSIZE(fs); KASSERT(idx < lfs_fi_getnblocks(fs, fip)); if (fs->lfs_is64) { return ((int64_t *)firstblock)[idx]; } else { return ((int32_t *)firstblock)[idx]; } } static __inline void lfs_fi_setblock(STRUCT_LFS *fs, FINFO *fip, unsigned idx, daddr_t blk) { void *firstblock; firstblock = (char *)fip + FINFOSIZE(fs); KASSERT(idx < lfs_fi_getnblocks(fs, fip)); if (fs->lfs_is64) { ((int64_t *)firstblock)[idx] = blk; } else { ((int32_t *)firstblock)[idx] = blk; } } /* * inode info entries (in the segment summary) */ #define IINFOSIZE(fs) ((fs)->lfs_is64 ? 
sizeof(IINFO64) : sizeof(IINFO32)) /* iinfos scroll backward from the end of the segment summary block */ #define SEGSUM_IINFOSTART(fs, buf) \ ((IINFO *)((char *)buf + lfs_sb_getsumsize(fs) - IINFOSIZE(fs))) #define NEXTLOWER_IINFO(fs, iip) \ ((IINFO *)((char *)(iip) - IINFOSIZE(fs))) #define NTH_IINFO(fs, buf, n) \ ((IINFO *)((char *)SEGSUM_IINFOSTART(fs, buf) - (n)*IINFOSIZE(fs))) static __inline uint64_t lfs_ii_getblock(STRUCT_LFS *fs, IINFO *iip) { if (fs->lfs_is64) { return iip->u_64.ii_block; } else { return iip->u_32.ii_block; } } static __inline void lfs_ii_setblock(STRUCT_LFS *fs, IINFO *iip, uint64_t block) { if (fs->lfs_is64) { iip->u_64.ii_block = block; } else { iip->u_32.ii_block = block; } } /* * Index file inode entries. */ #define IFILE_ENTRYSIZE(fs) \ ((fs)->lfs_is64 ? sizeof(IFILE64) : sizeof(IFILE32)) /* * LFSv1 compatibility code is not allowed to touch if_atime, since it * may not be mapped! */ /* Read in the block with a specific inode from the ifile. */ #define LFS_IENTRY(IP, F, IN, BP) do { \ int _e; \ SHARE_IFLOCK(F); \ VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \ if ((_e = bread((F)->lfs_ivnode, \ (IN) / lfs_sb_getifpb(F) + lfs_sb_getcleansz(F) + lfs_sb_getsegtabsz(F), \ lfs_sb_getbsize(F), 0, &(BP))) != 0) \ panic("lfs: ifile ino %d read %d", (int)(IN), _e); \ if ((F)->lfs_is64) { \ (IP) = (IFILE *)((IFILE64 *)(BP)->b_data + \ (IN) % lfs_sb_getifpb(F)); \ } else if (lfs_sb_getversion(F) > 1) { \ (IP) = (IFILE *)((IFILE32 *)(BP)->b_data + \ (IN) % lfs_sb_getifpb(F)); \ } else { \ (IP) = (IFILE *)((IFILE_V1 *)(BP)->b_data + \ (IN) % lfs_sb_getifpb(F)); \ } \ UNSHARE_IFLOCK(F); \ } while (0) #define LFS_IENTRY_NEXT(IP, F) do { \ if ((F)->lfs_is64) { \ (IP) = (IFILE *)((IFILE64 *)(IP) + 1); \ } else if (lfs_sb_getversion(F) > 1) { \ (IP) = (IFILE *)((IFILE32 *)(IP) + 1); \ } else { \ (IP) = (IFILE *)((IFILE_V1 *)(IP) + 1); \ } \ } while (0) #define LFS_DEF_IF_ACCESSOR(type, type32, field) \ static __inline type \ lfs_if_get##field(STRUCT_LFS *fs, IFILE *ifp) \ { \ if (fs->lfs_is64) { \ return ifp->u_64.if_##field; \ } else { \ return ifp->u_32.if_##field; \ } \ } \ static __inline void \ lfs_if_set##field(STRUCT_LFS *fs, IFILE *ifp, type val) \ { \ if (fs->lfs_is64) { \ type *p = &ifp->u_64.if_##field; \ (void)p; \ ifp->u_64.if_##field = val; \ } else { \ type32 *p = &ifp->u_32.if_##field; \ (void)p; \ ifp->u_32.if_##field = val; \ } \ } \ LFS_DEF_IF_ACCESSOR(uint32_t, uint32_t, version) LFS_DEF_IF_ACCESSOR(int64_t, int32_t, daddr) LFS_DEF_IF_ACCESSOR(uint64_t, uint32_t, nextfree) LFS_DEF_IF_ACCESSOR(uint64_t, uint32_t, atime_sec) LFS_DEF_IF_ACCESSOR(uint32_t, uint32_t, atime_nsec) /* * Cleaner information structure. This resides in the ifile and is used * to pass information from the kernel to the cleaner. */ #define CLEANSIZE_SU(fs) \ ((((fs)->lfs_is64 ? 
sizeof(CLEANERINFO64) : sizeof(CLEANERINFO32)) + \ lfs_sb_getbsize(fs) - 1) >> lfs_sb_getbshift(fs)) #define LFS_DEF_CI_ACCESSOR(type, type32, field) \ static __inline type \ lfs_ci_get##field(STRUCT_LFS *fs, CLEANERINFO *cip) \ { \ if (fs->lfs_is64) { \ return cip->u_64.field; \ } else { \ return cip->u_32.field; \ } \ } \ static __inline void \ lfs_ci_set##field(STRUCT_LFS *fs, CLEANERINFO *cip, type val) \ { \ if (fs->lfs_is64) { \ type *p = &cip->u_64.field; \ (void)p; \ cip->u_64.field = val; \ } else { \ type32 *p = &cip->u_32.field; \ (void)p; \ cip->u_32.field = val; \ } \ } \ LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, clean) LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, dirty) LFS_DEF_CI_ACCESSOR(int64_t, int32_t, bfree) LFS_DEF_CI_ACCESSOR(int64_t, int32_t, avail) LFS_DEF_CI_ACCESSOR(uint64_t, uint32_t, free_head) LFS_DEF_CI_ACCESSOR(uint64_t, uint32_t, free_tail) LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, flags) static __inline void lfs_ci_shiftcleantodirty(STRUCT_LFS *fs, CLEANERINFO *cip, unsigned num) { lfs_ci_setclean(fs, cip, lfs_ci_getclean(fs, cip) - num); lfs_ci_setdirty(fs, cip, lfs_ci_getdirty(fs, cip) + num); } static __inline void lfs_ci_shiftdirtytoclean(STRUCT_LFS *fs, CLEANERINFO *cip, unsigned num) { lfs_ci_setdirty(fs, cip, lfs_ci_getdirty(fs, cip) - num); lfs_ci_setclean(fs, cip, lfs_ci_getclean(fs, cip) + num); } /* Read in the block with the cleaner info from the ifile. */ #define LFS_CLEANERINFO(CP, F, BP) do { \ int _e; \ SHARE_IFLOCK(F); \ VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \ _e = bread((F)->lfs_ivnode, \ (daddr_t)0, lfs_sb_getbsize(F), 0, &(BP)); \ if (_e) \ panic("lfs: ifile read: cleanerinfo: error %d\n", _e); \ (CP) = (CLEANERINFO *)(BP)->b_data; \ UNSHARE_IFLOCK(F); \ } while (0) /* * Synchronize the Ifile cleaner info with current avail and bfree. */ #define LFS_SYNC_CLEANERINFO(cip, fs, bp, w) do { \ mutex_enter(&lfs_lock); \ if ((w) || lfs_ci_getbfree(fs, cip) != lfs_sb_getbfree(fs) || \ lfs_ci_getavail(fs, cip) != lfs_sb_getavail(fs) - fs->lfs_ravail - \ fs->lfs_favail) { \ lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs)); \ lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs) - fs->lfs_ravail - \ fs->lfs_favail); \ if (((bp)->b_flags & B_GATHERED) == 0) { \ fs->lfs_flags |= LFS_IFDIRTY; \ } \ mutex_exit(&lfs_lock); \ (void) LFS_BWRITE_LOG(bp); /* Ifile */ \ } else { \ mutex_exit(&lfs_lock); \ brelse(bp, 0); \ } \ } while (0) /* * Get the head of the inode free list. * Always called with the segment lock held. */ #define LFS_GET_HEADFREE(FS, CIP, BP, FREEP) do { \ if (lfs_sb_getversion(FS) > 1) { \ LFS_CLEANERINFO((CIP), (FS), (BP)); \ lfs_sb_setfreehd(FS, lfs_ci_getfree_head(FS, CIP)); \ brelse(BP, 0); \ } \ *(FREEP) = lfs_sb_getfreehd(FS); \ } while (0) #define LFS_PUT_HEADFREE(FS, CIP, BP, VAL) do { \ lfs_sb_setfreehd(FS, VAL); \ if (lfs_sb_getversion(FS) > 1) { \ LFS_CLEANERINFO((CIP), (FS), (BP)); \ lfs_ci_setfree_head(FS, CIP, VAL); \ LFS_BWRITE_LOG(BP); \ mutex_enter(&lfs_lock); \ (FS)->lfs_flags |= LFS_IFDIRTY; \ mutex_exit(&lfs_lock); \ } \ } while (0) #define LFS_GET_TAILFREE(FS, CIP, BP, FREEP) do { \ LFS_CLEANERINFO((CIP), (FS), (BP)); \ *(FREEP) = lfs_ci_getfree_tail(FS, CIP); \ brelse(BP, 0); \ } while (0) #define LFS_PUT_TAILFREE(FS, CIP, BP, VAL) do { \ LFS_CLEANERINFO((CIP), (FS), (BP)); \ lfs_ci_setfree_tail(FS, CIP, VAL); \ LFS_BWRITE_LOG(BP); \ mutex_enter(&lfs_lock); \ (FS)->lfs_flags |= LFS_IFDIRTY; \ mutex_exit(&lfs_lock); \ } while (0) /* * On-disk segment summary information */ #define SEGSUM_SIZE(fs) \ (fs->lfs_is64 ? 
sizeof(SEGSUM64) : \ lfs_sb_getversion(fs) > 1 ? sizeof(SEGSUM32) : sizeof(SEGSUM_V1)) /* * The SEGSUM structure is followed by FINFO structures. Get the pointer * to the first FINFO. * * XXX this can't be a macro yet; this file needs to be resorted. */ #if 0 static __inline FINFO * segsum_finfobase(STRUCT_LFS *fs, SEGSUM *ssp) { return (FINFO *)((char *)ssp + SEGSUM_SIZE(fs)); } #else #define SEGSUM_FINFOBASE(fs, ssp) \ ((FINFO *)((char *)(ssp) + SEGSUM_SIZE(fs))); #endif #define LFS_DEF_SS_ACCESSOR(type, type32, field) \ static __inline type \ lfs_ss_get##field(STRUCT_LFS *fs, SEGSUM *ssp) \ { \ if (fs->lfs_is64) { \ return ssp->u_64.ss_##field; \ } else { \ return ssp->u_32.ss_##field; \ } \ } \ static __inline void \ lfs_ss_set##field(STRUCT_LFS *fs, SEGSUM *ssp, type val) \ { \ if (fs->lfs_is64) { \ type *p = &ssp->u_64.ss_##field; \ (void)p; \ ssp->u_64.ss_##field = val; \ } else { \ type32 *p = &ssp->u_32.ss_##field; \ (void)p; \ ssp->u_32.ss_##field = val; \ } \ } \ LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, sumsum) LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, datasum) LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, magic) LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, ident) LFS_DEF_SS_ACCESSOR(int64_t, int32_t, next) LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, nfinfo) LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, ninos) LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, flags) LFS_DEF_SS_ACCESSOR(uint64_t, uint32_t, reclino) LFS_DEF_SS_ACCESSOR(uint64_t, uint64_t, serial) LFS_DEF_SS_ACCESSOR(uint64_t, uint64_t, create) static __inline size_t lfs_ss_getsumstart(STRUCT_LFS *fs) { /* These are actually all the same. */ if (fs->lfs_is64) { return offsetof(SEGSUM64, ss_datasum); } else /* if (lfs_sb_getversion(fs) > 1) */ { return offsetof(SEGSUM32, ss_datasum); } /* else { return offsetof(SEGSUM_V1, ss_datasum); } */ /* * XXX ^^^ until this file is resorted lfs_sb_getversion isn't * defined yet. */ } static __inline uint32_t lfs_ss_getocreate(STRUCT_LFS *fs, SEGSUM *ssp) { KASSERT(fs->lfs_is64 == 0); /* XXX need to resort this file before we can do this */ //KASSERT(lfs_sb_getversion(fs) == 1); return ssp->u_v1.ss_create; } static __inline void lfs_ss_setocreate(STRUCT_LFS *fs, SEGSUM *ssp, uint32_t val) { KASSERT(fs->lfs_is64 == 0); /* XXX need to resort this file before we can do this */ //KASSERT(lfs_sb_getversion(fs) == 1); ssp->u_v1.ss_create = val; } /* * Super block. */ /* * Generate accessors for the on-disk superblock fields with cpp. 
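 * For example (illustrative expansion, not spelled out in the original),
 * LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, bfree) below generates
 * lfs_sb_getbfree(), lfs_sb_setbfree(), lfs_sb_addbfree() and
 * lfs_sb_subbfree(), each switching on fs->lfs_is64 to touch either the
 * 64-bit or the 32-bit on-disk superblock field.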
*/ #define LFS_DEF_SB_ACCESSOR_FULL(type, type32, field) \ static __inline type \ lfs_sb_get##field(STRUCT_LFS *fs) \ { \ if (fs->lfs_is64) { \ return fs->lfs_dlfs_u.u_64.dlfs_##field; \ } else { \ return fs->lfs_dlfs_u.u_32.dlfs_##field; \ } \ } \ static __inline void \ lfs_sb_set##field(STRUCT_LFS *fs, type val) \ { \ if (fs->lfs_is64) { \ fs->lfs_dlfs_u.u_64.dlfs_##field = val; \ } else { \ fs->lfs_dlfs_u.u_32.dlfs_##field = val; \ } \ } \ static __inline void \ lfs_sb_add##field(STRUCT_LFS *fs, type val) \ { \ if (fs->lfs_is64) { \ type *p64 = &fs->lfs_dlfs_u.u_64.dlfs_##field; \ *p64 += val; \ } else { \ type32 *p32 = &fs->lfs_dlfs_u.u_32.dlfs_##field; \ *p32 += val; \ } \ } \ static __inline void \ lfs_sb_sub##field(STRUCT_LFS *fs, type val) \ { \ if (fs->lfs_is64) { \ type *p64 = &fs->lfs_dlfs_u.u_64.dlfs_##field; \ *p64 -= val; \ } else { \ type32 *p32 = &fs->lfs_dlfs_u.u_32.dlfs_##field; \ *p32 -= val; \ } \ } #define LFS_DEF_SB_ACCESSOR(t, f) LFS_DEF_SB_ACCESSOR_FULL(t, t, f) #define LFS_DEF_SB_ACCESSOR_32ONLY(type, field, val64) \ static __inline type \ lfs_sb_get##field(STRUCT_LFS *fs) \ { \ if (fs->lfs_is64) { \ return val64; \ } else { \ return fs->lfs_dlfs_u.u_32.dlfs_##field; \ } \ } LFS_DEF_SB_ACCESSOR(uint32_t, version) LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, size) LFS_DEF_SB_ACCESSOR(uint32_t, ssize) LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, dsize) LFS_DEF_SB_ACCESSOR(uint32_t, bsize) LFS_DEF_SB_ACCESSOR(uint32_t, fsize) LFS_DEF_SB_ACCESSOR(uint32_t, frag) LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, freehd) LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, bfree) LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, nfiles) LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, avail) LFS_DEF_SB_ACCESSOR(int32_t, uinodes) LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, idaddr) LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, ifile, LFS_IFILE_INUM) LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, lastseg) LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, nextseg) LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, curseg) LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, offset) LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, lastpseg) LFS_DEF_SB_ACCESSOR(uint32_t, inopf) LFS_DEF_SB_ACCESSOR(uint32_t, minfree) LFS_DEF_SB_ACCESSOR(uint64_t, maxfilesize) LFS_DEF_SB_ACCESSOR(uint32_t, fsbpseg) LFS_DEF_SB_ACCESSOR(uint32_t, inopb) LFS_DEF_SB_ACCESSOR(uint32_t, ifpb) LFS_DEF_SB_ACCESSOR(uint32_t, sepb) LFS_DEF_SB_ACCESSOR(uint32_t, nindir) LFS_DEF_SB_ACCESSOR(uint32_t, nseg) LFS_DEF_SB_ACCESSOR(uint32_t, nspf) LFS_DEF_SB_ACCESSOR(uint32_t, cleansz) LFS_DEF_SB_ACCESSOR(uint32_t, segtabsz) LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, segmask, 0) LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, segshift, 0) LFS_DEF_SB_ACCESSOR(uint64_t, bmask) LFS_DEF_SB_ACCESSOR(uint32_t, bshift) LFS_DEF_SB_ACCESSOR(uint64_t, ffmask) LFS_DEF_SB_ACCESSOR(uint32_t, ffshift) LFS_DEF_SB_ACCESSOR(uint64_t, fbmask) LFS_DEF_SB_ACCESSOR(uint32_t, fbshift) LFS_DEF_SB_ACCESSOR(uint32_t, blktodb) LFS_DEF_SB_ACCESSOR(uint32_t, fsbtodb) LFS_DEF_SB_ACCESSOR(uint32_t, sushift) LFS_DEF_SB_ACCESSOR(int32_t, maxsymlinklen) LFS_DEF_SB_ACCESSOR(uint32_t, cksum) LFS_DEF_SB_ACCESSOR(uint16_t, pflags) LFS_DEF_SB_ACCESSOR(uint32_t, nclean) LFS_DEF_SB_ACCESSOR(int32_t, dmeta) LFS_DEF_SB_ACCESSOR(uint32_t, minfreeseg) LFS_DEF_SB_ACCESSOR(uint32_t, sumsize) LFS_DEF_SB_ACCESSOR(uint64_t, serial) LFS_DEF_SB_ACCESSOR(uint32_t, ibsize) LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, s0addr) LFS_DEF_SB_ACCESSOR(uint64_t, tstamp) LFS_DEF_SB_ACCESSOR(uint32_t, inodefmt) LFS_DEF_SB_ACCESSOR(uint32_t, interleave) 
LFS_DEF_SB_ACCESSOR(uint32_t, ident) LFS_DEF_SB_ACCESSOR(uint32_t, resvseg) /* special-case accessors */ /* * the v1 otstamp field lives in what's now dlfs_inopf */ #define lfs_sb_getotstamp(fs) lfs_sb_getinopf(fs) #define lfs_sb_setotstamp(fs, val) lfs_sb_setinopf(fs, val) /* * lfs_sboffs is an array */ static __inline int32_t lfs_sb_getsboff(STRUCT_LFS *fs, unsigned n) { #ifdef KASSERT /* ugh */ KASSERT(n < LFS_MAXNUMSB); #endif if (fs->lfs_is64) { return fs->lfs_dlfs_u.u_64.dlfs_sboffs[n]; } else { return fs->lfs_dlfs_u.u_32.dlfs_sboffs[n]; } } static __inline void lfs_sb_setsboff(STRUCT_LFS *fs, unsigned n, int32_t val) { #ifdef KASSERT /* ugh */ KASSERT(n < LFS_MAXNUMSB); #endif if (fs->lfs_is64) { fs->lfs_dlfs_u.u_64.dlfs_sboffs[n] = val; } else { fs->lfs_dlfs_u.u_32.dlfs_sboffs[n] = val; } } /* * lfs_fsmnt is a string */ static __inline const char * lfs_sb_getfsmnt(STRUCT_LFS *fs) { if (fs->lfs_is64) { return (const char *)fs->lfs_dlfs_u.u_64.dlfs_fsmnt; } else { return (const char *)fs->lfs_dlfs_u.u_32.dlfs_fsmnt; } } static __inline void lfs_sb_setfsmnt(STRUCT_LFS *fs, const char *str) { if (fs->lfs_is64) { (void)strncpy((char *)fs->lfs_dlfs_u.u_64.dlfs_fsmnt, str, sizeof(fs->lfs_dlfs_u.u_64.dlfs_fsmnt)); } else { (void)strncpy((char *)fs->lfs_dlfs_u.u_32.dlfs_fsmnt, str, sizeof(fs->lfs_dlfs_u.u_32.dlfs_fsmnt)); } } /* Highest addressable fsb */ #define LFS_MAX_DADDR(fs) \ ((fs)->lfs_is64 ? 0x7fffffffffffffff : 0x7fffffff) /* LFS_NINDIR is the number of indirects in a file system block. */ #define LFS_NINDIR(fs) (lfs_sb_getnindir(fs)) /* LFS_INOPB is the number of inodes in a secondary storage block. */ #define LFS_INOPB(fs) (lfs_sb_getinopb(fs)) /* LFS_INOPF is the number of inodes in a fragment. */ #define LFS_INOPF(fs) (lfs_sb_getinopf(fs)) #define lfs_blkoff(fs, loc) ((int)((loc) & lfs_sb_getbmask(fs))) #define lfs_fragoff(fs, loc) /* calculates (loc % fs->lfs_fsize) */ \ ((int)((loc) & lfs_sb_getffmask(fs))) /* XXX: lowercase these as they're no longer macros */ /* Frags to diskblocks */ static __inline uint64_t LFS_FSBTODB(STRUCT_LFS *fs, uint64_t b) { #if defined(_KERNEL) return b << (lfs_sb_getffshift(fs) - DEV_BSHIFT); #else return b << lfs_sb_getfsbtodb(fs); #endif } /* Diskblocks to frags */ static __inline uint64_t LFS_DBTOFSB(STRUCT_LFS *fs, uint64_t b) { #if defined(_KERNEL) return b >> (lfs_sb_getffshift(fs) - DEV_BSHIFT); #else return b >> lfs_sb_getfsbtodb(fs); #endif } #define lfs_lblkno(fs, loc) ((loc) >> lfs_sb_getbshift(fs)) #define lfs_lblktosize(fs, blk) ((blk) << lfs_sb_getbshift(fs)) /* Frags to bytes */ static __inline uint64_t lfs_fsbtob(STRUCT_LFS *fs, uint64_t b) { return b << lfs_sb_getffshift(fs); } /* Bytes to frags */ static __inline uint64_t lfs_btofsb(STRUCT_LFS *fs, uint64_t b) { return b >> lfs_sb_getffshift(fs); } #define lfs_numfrags(fs, loc) /* calculates (loc / fs->lfs_fsize) */ \ ((loc) >> lfs_sb_getffshift(fs)) #define lfs_blkroundup(fs, size)/* calculates roundup(size, lfs_sb_getbsize(fs)) */ \ ((off_t)(((size) + lfs_sb_getbmask(fs)) & (~lfs_sb_getbmask(fs)))) #define lfs_fragroundup(fs, size)/* calculates roundup(size, fs->lfs_fsize) */ \ ((off_t)(((size) + lfs_sb_getffmask(fs)) & (~lfs_sb_getffmask(fs)))) #define lfs_fragstoblks(fs, frags)/* calculates (frags / fs->fs_frag) */ \ ((frags) >> lfs_sb_getfbshift(fs)) #define lfs_blkstofrags(fs, blks)/* calculates (blks * fs->fs_frag) */ \ ((blks) << lfs_sb_getfbshift(fs)) #define lfs_fragnum(fs, fsb) /* calculates (fsb % fs->lfs_frag) */ \ ((fsb) & ((fs)->lfs_frag - 1)) #define 
lfs_blknum(fs, fsb) /* calculates rounddown(fsb, fs->lfs_frag) */ \ ((fsb) &~ ((fs)->lfs_frag - 1)) #define lfs_dblksize(fs, dp, lbn) \ (((lbn) >= ULFS_NDADDR || lfs_dino_getsize(fs, dp) >= ((lbn) + 1) << lfs_sb_getbshift(fs)) \ ? lfs_sb_getbsize(fs) \ : (lfs_fragroundup(fs, lfs_blkoff(fs, lfs_dino_getsize(fs, dp))))) #define lfs_segsize(fs) (lfs_sb_getversion(fs) == 1 ? \ lfs_lblktosize((fs), lfs_sb_getssize(fs)) : \ lfs_sb_getssize(fs)) /* XXX segtod produces a result in frags despite the 'd' */ #define lfs_segtod(fs, seg) (lfs_btofsb(fs, lfs_segsize(fs)) * (seg)) #define lfs_dtosn(fs, daddr) /* block address to segment number */ \ ((uint32_t)(((daddr) - lfs_sb_gets0addr(fs)) / lfs_segtod((fs), 1))) #define lfs_sntod(fs, sn) /* segment number to disk address */ \ ((daddr_t)(lfs_segtod((fs), (sn)) + lfs_sb_gets0addr(fs))) /* XXX, blah. make this appear only if struct inode is defined */ #ifdef _UFS_LFS_LFS_INODE_H_ static __inline uint32_t lfs_blksize(STRUCT_LFS *fs, struct inode *ip, uint64_t lbn) { if (lbn >= ULFS_NDADDR || lfs_dino_getsize(fs, ip->i_din) >= (lbn + 1) << lfs_sb_getbshift(fs)) { return lfs_sb_getbsize(fs); } else { return lfs_fragroundup(fs, lfs_blkoff(fs, lfs_dino_getsize(fs, ip->i_din))); } } #endif /* * union lfs_blocks */ static __inline void lfs_blocks_fromvoid(STRUCT_LFS *fs, union lfs_blocks *bp, void *p) { if (fs->lfs_is64) { bp->b64 = p; } else { bp->b32 = p; } } static __inline void lfs_blocks_fromfinfo(STRUCT_LFS *fs, union lfs_blocks *bp, FINFO *fip) { void *firstblock; firstblock = (char *)fip + FINFOSIZE(fs); if (fs->lfs_is64) { bp->b64 = (int64_t *)firstblock; } else { bp->b32 = (int32_t *)firstblock; } } static __inline daddr_t lfs_blocks_get(STRUCT_LFS *fs, union lfs_blocks *bp, unsigned idx) { if (fs->lfs_is64) { return bp->b64[idx]; } else { return bp->b32[idx]; } } static __inline void lfs_blocks_set(STRUCT_LFS *fs, union lfs_blocks *bp, unsigned idx, daddr_t val) { if (fs->lfs_is64) { bp->b64[idx] = val; } else { bp->b32[idx] = val; } } static __inline void lfs_blocks_inc(STRUCT_LFS *fs, union lfs_blocks *bp) { if (fs->lfs_is64) { bp->b64++; } else { bp->b32++; } } static __inline int lfs_blocks_eq(STRUCT_LFS *fs, union lfs_blocks *bp1, union lfs_blocks *bp2) { if (fs->lfs_is64) { return bp1->b64 == bp2->b64; } else { return bp1->b32 == bp2->b32; } } static __inline int lfs_blocks_sub(STRUCT_LFS *fs, union lfs_blocks *bp1, union lfs_blocks *bp2) { /* (remember that the pointers are typed) */ if (fs->lfs_is64) { return bp1->b64 - bp2->b64; } else { return bp1->b32 - bp2->b32; } } /* * struct segment */ /* * Macros for determining free space on the disk, with the variable metadata * of segment summaries and inode blocks taken into account. */ /* * Estimate number of clean blocks not available for writing because * they will contain metadata or overhead. This is calculated as * * E = ((C * M / D) * D + (0) * (T - D)) / T * or more simply * E = (C * M) / T * * where * C is the clean space, * D is the dirty space, * M is the dirty metadata, and * T = C + D is the total space on disk. * * This approximates the old formula of E = C * M / D when D is close to T, * but avoids falsely reporting "disk full" when the sample size (D) is small. 
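 * Worked example (added for illustration, not in the original): with
 * T = 1000 units of space of which D = 400 are dirty and C = 600 clean,
 * and M = 40 units of dirty metadata, the estimate is
 * E = C * M / T = 600 * 40 / 1000 = 24, whereas the old formula
 * C * M / D would have reported 60.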
*/ #define LFS_EST_CMETA(F) (( \ (lfs_sb_getdmeta(F) * (int64_t)lfs_sb_getnclean(F)) / \ (lfs_sb_getnseg(F)))) /* Estimate total size of the disk not including metadata */ #define LFS_EST_NONMETA(F) (lfs_sb_getdsize(F) - lfs_sb_getdmeta(F) - LFS_EST_CMETA(F)) /* Estimate number of blocks actually available for writing */ #define LFS_EST_BFREE(F) (lfs_sb_getbfree(F) > LFS_EST_CMETA(F) ? \ lfs_sb_getbfree(F) - LFS_EST_CMETA(F) : 0) /* Amount of non-meta space not available to mortal man */ #define LFS_EST_RSVD(F) ((LFS_EST_NONMETA(F) * \ (uint64_t)lfs_sb_getminfree(F)) / \ 100) /* Can credential C write BB blocks? XXX: kauth_cred_geteuid is abusive */ #define ISSPACE(F, BB, C) \ ((((C) == NOCRED || kauth_cred_geteuid(C) == 0) && \ LFS_EST_BFREE(F) >= (BB)) || \ (kauth_cred_geteuid(C) != 0 && IS_FREESPACE(F, BB))) /* Can an ordinary user write BB blocks */ #define IS_FREESPACE(F, BB) \ (LFS_EST_BFREE(F) >= (BB) + LFS_EST_RSVD(F)) /* * The minimum number of blocks to create a new inode. This is: * directory direct block (1) + ULFS_NIADDR indirect blocks + inode block (1) + * ifile direct block (1) + ULFS_NIADDR indirect blocks = 3 + 2 * ULFS_NIADDR blocks. */ #define LFS_NRESERVE(F) (lfs_btofsb((F), (2 * ULFS_NIADDR + 3) << lfs_sb_getbshift(F))) /* * Suppress spurious clang warnings */ #ifdef __GNUC__ #if defined(__clang__) #pragma clang diagnostic pop #elif __GNUC_PREREQ__(9,0) #pragma GCC diagnostic pop #endif #endif #endif /* _UFS_LFS_LFS_ACCESSORS_H_ */
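/*
 * Illustrative sketch, not part of the original sources: a minimal
 * example of how the width-independent dinode accessors defined above
 * are intended to be used.  It assumes a context in which this header
 * and its prerequisites (<ufs/lfs/lfs.h>) are already included; the
 * function name is made up for the example and the block is disabled
 * with #if 0.
 */
#if 0
static unsigned
example_count_mapped_direct_blocks(STRUCT_LFS *fs, union lfs_dinode *dip)
{
	unsigned i, n = 0;

	for (i = 0; i < ULFS_NDADDR; i++) {
		/*
		 * lfs_dino_getdb() hides the 32-bit vs. 64-bit dinode
		 * layout and sign-extends 32-bit addresses so UNWRITTEN
		 * is preserved.
		 */
		if (lfs_dino_getdb(fs, dip, i) != 0)
			n++;
	}
	return n;
}
#endif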
/* $NetBSD: kern_mod_80.c,v 1.6 2019/12/12 02:15:42 pgoyette Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * System calls relating to loadable modules. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_mod_80.c,v 1.6 2019/12/12 02:15:42 pgoyette Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_modular.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/namei.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/kobj.h> #include <sys/module.h> #include <sys/syscall.h> #include <sys/syscallargs.h> #include <sys/compat_stub.h> #include <compat/sys/module.h> #include <compat/common/compat_mod.h> static int compat_80_modstat(int cmd, struct iovec *iov, void *arg) { omodstat_t *oms, *omso; modinfo_t *mi; module_t *mod; vaddr_t addr; size_t size; size_t omslen; size_t used; int error; int omscnt; bool stataddr; const char *suffix = "..."; if (cmd != MODCTL_OSTAT) return EINVAL; error = copyin(arg, iov, sizeof(*iov)); if (error != 0) { return error; } /* If not privileged, don't expose kernel addresses.
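 * (Descriptive note added here, not in the original: the kauth(9) check
 * below only decides whether module addresses and sizes are reported;
 * unprivileged callers still receive the rest of the status records.)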
*/ error = kauth_authorize_system(kauth_cred_get(), KAUTH_SYSTEM_MODULE, 0, (void *)(uintptr_t)MODCTL_STAT, NULL, NULL); stataddr = (error == 0); kernconfig_lock(); omscnt = 0; TAILQ_FOREACH(mod, &module_list, mod_chain) { omscnt++; mi = mod->mod_info; } TAILQ_FOREACH(mod, &module_builtins, mod_chain) { omscnt++; mi = mod->mod_info; } omslen = omscnt * sizeof(omodstat_t); omso = kmem_zalloc(omslen, KM_SLEEP); oms = omso; TAILQ_FOREACH(mod, &module_list, mod_chain) { mi = mod->mod_info; strlcpy(oms->oms_name, mi->mi_name, sizeof(oms->oms_name)); if (mi->mi_required != NULL) { used = strlcpy(oms->oms_required, mi->mi_required, sizeof(oms->oms_required)); if (used >= sizeof(oms->oms_required)) { oms->oms_required[sizeof(oms->oms_required) - strlen(suffix) - 1] = '\0'; strlcat(oms->oms_required, suffix, sizeof(oms->oms_required)); } } if (mod->mod_kobj != NULL && stataddr) { kobj_stat(mod->mod_kobj, &addr, &size); oms->oms_addr = addr; oms->oms_size = size; } oms->oms_class = mi->mi_class; oms->oms_refcnt = mod->mod_refcnt; oms->oms_source = mod->mod_source; oms->oms_flags = mod->mod_flags; oms++; } TAILQ_FOREACH(mod, &module_builtins, mod_chain) { mi = mod->mod_info; strlcpy(oms->oms_name, mi->mi_name, sizeof(oms->oms_name)); if (mi->mi_required != NULL) { used = strlcpy(oms->oms_required, mi->mi_required, sizeof(oms->oms_required)); if (used >= sizeof(oms->oms_required)) { oms->oms_required[sizeof(oms->oms_required) - strlen(suffix) - 1] = '\0'; strlcat(oms->oms_required, suffix, sizeof(oms->oms_required)); } } if (mod->mod_kobj != NULL && stataddr) { kobj_stat(mod->mod_kobj, &addr, &size); oms->oms_addr = addr; oms->oms_size = size; } oms->oms_class = mi->mi_class; oms->oms_refcnt = -1; KASSERT(mod->mod_source == MODULE_SOURCE_KERNEL); oms->oms_source = mod->mod_source; oms++; } kernconfig_unlock(); error = copyout(omso, iov->iov_base, uimin(omslen, iov->iov_len)); kmem_free(omso, omslen); if (error == 0) { iov->iov_len = omslen; error = copyout(iov, arg, sizeof(*iov)); } return error; } void kern_mod_80_init(void) { MODULE_HOOK_SET(compat_modstat_80_hook, compat_80_modstat); } void kern_mod_80_fini(void) { MODULE_HOOK_UNSET(compat_modstat_80_hook); }
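/*
 * Illustrative sketch, not part of the original file: how a userland
 * consumer might drive the two-pass size negotiation implied by
 * compat_80_modstat() above (copy out at most iov_len bytes, then write
 * the full length back into iov_len).  The modctl(2) call exists in
 * NetBSD; the MODCTL_OSTAT command and the omodstat_t type are assumed
 * to be visible through the compat headers, and the function name is
 * made up for the example.  Disabled with #if 0.
 */
#if 0
#include <sys/module.h>
#include <sys/uio.h>
#include <stdlib.h>

static omodstat_t *
example_fetch_old_modstat(size_t *countp)
{
	struct iovec iov = { .iov_base = NULL, .iov_len = 0 };
	size_t alloclen;

	/* First pass: the handler stores the required length in iov_len. */
	if (modctl(MODCTL_OSTAT, &iov) != 0)
		return NULL;
	alloclen = iov.iov_len;
	if ((iov.iov_base = malloc(alloclen)) == NULL)
		return NULL;
	/* Second pass: up to iov_len bytes of omodstat_t records copy out. */
	if (modctl(MODCTL_OSTAT, &iov) != 0) {
		free(iov.iov_base);
		return NULL;
	}
	/*
	 * The handler rewrites iov_len with the current full size; only
	 * the part that fit in our buffer is valid.
	 */
	*countp = (alloclen < iov.iov_len ? alloclen : iov.iov_len) /
	    sizeof(omodstat_t);
	return iov.iov_base;
}
#endif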
/* $NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $ */ /*- * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and * Daniel Sieger. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $"); #include "opt_kstack.h" #include "opt_ddb.h" #include "opt_dtrace.h" #define __MUTEX_PRIVATE #include <sys/param.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/dtrace_bsd.h> #include <sys/evcnt.h> #include <sys/intr.h> #include <sys/kernel.h> #include <sys/lockdebug.h> #include <sys/lwpctl.h> #include <sys/proc.h> #include <sys/pserialize.h> #include <sys/resource.h> #include <sys/resourcevar.h> #include <sys/rwlock.h> #include <sys/sched.h> #include <sys/sleepq.h> #include <sys/syncobj.h> #include <sys/syscall_stats.h> #include <sys/syslog.h> #include <sys/systm.h> #include <uvm/uvm_extern.h> #include <dev/lockstat.h> int dtrace_vtime_active=0; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #ifdef DDB #include <ddb/ddb.h> #endif static void sched_unsleep(struct lwp *, bool); static void sched_changepri(struct lwp *, pri_t); static void sched_lendpri(struct lwp *, pri_t); syncobj_t sleep_syncobj = { .sobj_name = "sleep", .sobj_flag = SOBJ_SLEEPQ_SORTED, .sobj_boostpri = PRI_KERNEL, .sobj_unsleep = sleepq_unsleep, .sobj_changepri = sleepq_changepri, .sobj_lendpri = sleepq_lendpri, .sobj_owner = syncobj_noowner, }; syncobj_t sched_syncobj = { .sobj_name = "sched", .sobj_flag = SOBJ_SLEEPQ_SORTED, .sobj_boostpri = PRI_USER, .sobj_unsleep = sched_unsleep, .sobj_changepri = sched_changepri, .sobj_lendpri = sched_lendpri, .sobj_owner = syncobj_noowner, }; syncobj_t kpause_syncobj = { .sobj_name = "kpause", .sobj_flag = SOBJ_SLEEPQ_NULL, .sobj_boostpri = PRI_KERNEL, .sobj_unsleep = sleepq_unsleep, .sobj_changepri = sleepq_changepri, .sobj_lendpri = sleepq_lendpri, .sobj_owner = syncobj_noowner, }; /* "Lightning bolt": once a second sleep address. */ kcondvar_t lbolt __cacheline_aligned; u_int sched_pstats_ticks __cacheline_aligned; /* Preemption event counters. */ static struct evcnt kpreempt_ev_crit __cacheline_aligned; static struct evcnt kpreempt_ev_klock __cacheline_aligned; static struct evcnt kpreempt_ev_immed __cacheline_aligned; void synch_init(void) { cv_init(&lbolt, "lbolt"); evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL, "kpreempt", "defer: critical section"); evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL, "kpreempt", "defer: kernel_lock"); evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL, "kpreempt", "immediate"); } /* * OBSOLETE INTERFACE * * General sleep call. Suspends the current LWP until a wakeup is * performed on the specified identifier. The LWP will then be made * runnable with the specified priority. Sleeps at most timo/hz seconds (0 * means no timeout). If pri includes PCATCH flag, signals are checked * before and after sleeping, else signals are not checked. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal needs to be delivered, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). 
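 * Illustrative use (hypothetical identifiers, added for this example,
 * not from the original):
 *
 *	error = tsleep(&sc->sc_done, PCATCH, "examplewait", hz);
 *
 * sleeps on &sc->sc_done for at most one second (hz ticks), may be
 * interrupted by a signal because PCATCH is set, and is ended early by
 * a wakeup(&sc->sc_done) from another context.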
*/ int tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo) { struct lwp *l = curlwp; sleepq_t *sq; kmutex_t *mp; bool catch_p; int nlocks; KASSERT((l->l_pflag & LP_INTR) == 0); KASSERT(ident != &lbolt); //KASSERT(KERNEL_LOCKED_P()); if (sleepq_dontsleep(l)) { (void)sleepq_abort(NULL, 0); return 0; } catch_p = priority & PCATCH; sq = sleeptab_lookup(&sleeptab, ident, &mp); nlocks = sleepq_enter(sq, l, mp); sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p); return sleepq_block(timo, catch_p, &sleep_syncobj, nlocks); } int mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo, kmutex_t *mtx) { struct lwp *l = curlwp; sleepq_t *sq; kmutex_t *mp; bool catch_p; int error, nlocks; KASSERT((l->l_pflag & LP_INTR) == 0); KASSERT(ident != &lbolt); if (sleepq_dontsleep(l)) { (void)sleepq_abort(mtx, (priority & PNORELOCK) != 0); return 0; } catch_p = priority & PCATCH; sq = sleeptab_lookup(&sleeptab, ident, &mp); nlocks = sleepq_enter(sq, l, mp); sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p); mutex_exit(mtx); error = sleepq_block(timo, catch_p, &sleep_syncobj, nlocks); if ((priority & PNORELOCK) == 0) mutex_enter(mtx); return error; } /* * General sleep call for situations where a wake-up is not expected. */ int kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx) { struct lwp *l = curlwp; int error, nlocks; KASSERTMSG(timo != 0 || intr, "wmesg=%s intr=%s timo=%d mtx=%p", wmesg, intr ? "true" : "false", timo, mtx); if (sleepq_dontsleep(l)) return sleepq_abort(NULL, 0); if (mtx != NULL) mutex_exit(mtx); nlocks = sleepq_enter(NULL, l, NULL); sleepq_enqueue(NULL, l, wmesg, &kpause_syncobj, intr); error = sleepq_block(timo, intr, &kpause_syncobj, nlocks); if (mtx != NULL) mutex_enter(mtx); return error; } /* * OBSOLETE INTERFACE * * Make all LWPs sleeping on the specified identifier runnable. */ void wakeup(wchan_t ident) { sleepq_t *sq; kmutex_t *mp; if (__predict_false(cold)) return; sq = sleeptab_lookup(&sleeptab, ident, &mp); sleepq_wake(sq, ident, (u_int)-1, mp); } /* * General yield call. Puts the current LWP back on its run queue and * performs a context switch. */ void yield(void) { struct lwp *l = curlwp; int nlocks; KERNEL_UNLOCK_ALL(l, &nlocks); lwp_lock(l); KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock)); KASSERT(l->l_stat == LSONPROC); spc_lock(l->l_cpu); mi_switch(l); KERNEL_LOCK(nlocks, l); } /* * General preemption call. Puts the current LWP back on its run queue * and performs an involuntary context switch. Different from yield() * in that: * * - It's counted differently (involuntary vs. voluntary). * - Realtime threads go to the head of their runqueue vs. tail for yield(). */ void preempt(void) { struct lwp *l = curlwp; int nlocks; KERNEL_UNLOCK_ALL(l, &nlocks); lwp_lock(l); KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock)); KASSERT(l->l_stat == LSONPROC); spc_lock(l->l_cpu); l->l_pflag |= LP_PREEMPTING; mi_switch(l); KERNEL_LOCK(nlocks, l); } /* * Return true if the current LWP should yield the processor. Intended to * be used by long-running code in kernel. */ inline bool preempt_needed(void) { lwp_t *l = curlwp; int needed; KPREEMPT_DISABLE(l); needed = l->l_cpu->ci_want_resched; KPREEMPT_ENABLE(l); return (needed != 0); } /* * A breathing point for long running code in kernel. */ void preempt_point(void) { if (__predict_false(preempt_needed())) { preempt(); } } /* * Handle a request made by another agent to preempt the current LWP * in-kernel. Usually called when l_dopreempt may be non-zero. 
* * Character addresses for lockstat only. */ static char kpreempt_is_disabled; static char kernel_lock_held; static char is_softint_lwp; static char spl_is_raised; bool kpreempt(uintptr_t where) { uintptr_t failed; lwp_t *l; int s, dop, lsflag; l = curlwp; failed = 0; while ((dop = l->l_dopreempt) != 0) { if (l->l_stat != LSONPROC) { /* * About to block (or die), let it happen. * Doesn't really count as "preemption has * been blocked", since we're going to * context switch. */ atomic_swap_uint(&l->l_dopreempt, 0); return true; } KASSERT((l->l_flag & LW_IDLE) == 0); if (__predict_false(l->l_nopreempt != 0)) { /* LWP holds preemption disabled, explicitly. */ if ((dop & DOPREEMPT_COUNTED) == 0) { kpreempt_ev_crit.ev_count++; } failed = (uintptr_t)&kpreempt_is_disabled; break; } if (__predict_false((l->l_pflag & LP_INTR) != 0)) { /* Can't preempt soft interrupts yet. */ atomic_swap_uint(&l->l_dopreempt, 0); failed = (uintptr_t)&is_softint_lwp; break; } s = splsched(); if (__predict_false(l->l_blcnt != 0 || curcpu()->ci_biglock_wanted != NULL)) { /* Hold or want kernel_lock, code is not MT safe. */ splx(s); if ((dop & DOPREEMPT_COUNTED) == 0) { kpreempt_ev_klock.ev_count++; } failed = (uintptr_t)&kernel_lock_held; break; } if (__predict_false(!cpu_kpreempt_enter(where, s))) { /* * It may be that the IPL is too high. * kpreempt_enter() can schedule an * interrupt to retry later. */ splx(s); failed = (uintptr_t)&spl_is_raised; break; } /* Do it! */ if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) { kpreempt_ev_immed.ev_count++; } lwp_lock(l); l->l_pflag |= LP_PREEMPTING; spc_lock(l->l_cpu); mi_switch(l); l->l_nopreempt++; splx(s); /* Take care of any MD cleanup. */ cpu_kpreempt_exit(where); l->l_nopreempt--; } if (__predict_true(!failed)) { return false; } /* Record preemption failure for reporting via lockstat. */ atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED); lsflag = 0; LOCKSTAT_ENTER(lsflag); if (__predict_false(lsflag)) { if (where == 0) { where = (uintptr_t)__builtin_return_address(0); } /* Preemption is on, might recurse, so make it atomic. */ if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL, (void *)where) == NULL) { LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime); l->l_pfaillock = failed; } } LOCKSTAT_EXIT(lsflag); return true; } /* * Return true if preemption is explicitly disabled. */ bool kpreempt_disabled(void) { const lwp_t *l = curlwp; return l->l_nopreempt != 0 || l->l_stat == LSZOMB || (l->l_flag & LW_IDLE) != 0 || (l->l_pflag & LP_INTR) != 0 || cpu_kpreempt_disabled(); } /* * Disable kernel preemption. */ void kpreempt_disable(void) { KPREEMPT_DISABLE(curlwp); } /* * Reenable kernel preemption. */ void kpreempt_enable(void) { KPREEMPT_ENABLE(curlwp); } /* * Compute the amount of time during which the current lwp was running. * * - update l_rtime unless it's an idle lwp. */ void updatertime(lwp_t *l, const struct bintime *now) { static bool backwards = false; if (__predict_false(l->l_flag & LW_IDLE)) return; if (__predict_false(bintimecmp(now, &l->l_stime, <)) && !backwards) { char caller[128]; #ifdef DDB db_symstr(caller, sizeof(caller), (db_expr_t)(intptr_t)__builtin_return_address(0), DB_STGY_PROC); #else snprintf(caller, sizeof(caller), "%p", __builtin_return_address(0)); #endif backwards = true; printf("WARNING: lwp %ld (%s%s%s) flags 0x%x:" " timecounter went backwards" " from (%jd + 0x%016"PRIx64"/2^64) sec" " to (%jd + 0x%016"PRIx64"/2^64) sec" " in %s\n", (long)l->l_lid, l->l_proc->p_comm, l->l_name ? " " : "", l->l_name ? 
l->l_name : "", l->l_pflag, (intmax_t)l->l_stime.sec, l->l_stime.frac, (intmax_t)now->sec, now->frac, caller); } /* rtime += now - stime */ bintime_add(&l->l_rtime, now); bintime_sub(&l->l_rtime, &l->l_stime); } /* * Select next LWP from the current CPU to run.. */ static inline lwp_t * nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc) { lwp_t *newl; /* * Let sched_nextlwp() select the LWP to run the CPU next. * If no LWP is runnable, select the idle LWP. * * On arrival here LWPs on a run queue are locked by spc_mutex which * is currently held. Idle LWPs are always locked by spc_lwplock, * which may or may not be held here. On exit from this code block, * in all cases newl is locked by spc_lwplock. */ newl = sched_nextlwp(); if (newl != NULL) { sched_dequeue(newl); KASSERT(lwp_locked(newl, spc->spc_mutex)); KASSERT(newl->l_cpu == ci); newl->l_stat = LSONPROC; newl->l_pflag |= LP_RUNNING; newl->l_boostpri = PRI_NONE; spc->spc_curpriority = lwp_eprio(newl); spc->spc_flags &= ~(SPCF_SWITCHCLEAR | SPCF_IDLE); lwp_setlock(newl, spc->spc_lwplock); } else { /* * The idle LWP does not get set to LSONPROC, because * otherwise it screws up the output from top(1) etc. */ newl = ci->ci_data.cpu_idlelwp; newl->l_pflag |= LP_RUNNING; spc->spc_curpriority = PRI_IDLE; spc->spc_flags = (spc->spc_flags & ~SPCF_SWITCHCLEAR) | SPCF_IDLE; } /* * Only clear want_resched if there are no pending (slow) software * interrupts. We can do this without an atomic, because no new * LWPs can appear in the queue due to our hold on spc_mutex, and * the update to ci_want_resched will become globally visible before * the release of spc_mutex becomes globally visible. */ if (ci->ci_data.cpu_softints == 0) ci->ci_want_resched = 0; return newl; } /* * The machine independent parts of context switch. * * NOTE: l->l_cpu is not changed in this routine, because an LWP never * changes its own l_cpu (that would screw up curcpu on many ports and could * cause all kinds of other evil stuff). l_cpu is always changed by some * other actor, when it's known the LWP is not running (the LP_RUNNING flag * is checked under lock). */ void mi_switch(lwp_t *l) { struct cpu_info *ci; struct schedstate_percpu *spc; struct lwp *newl; kmutex_t *lock; int oldspl; struct bintime bt; bool returning; KASSERT(lwp_locked(l, NULL)); KASSERT(kpreempt_disabled()); KASSERT(mutex_owned(curcpu()->ci_schedstate.spc_mutex)); KASSERTMSG(l->l_blcnt == 0, "kernel_lock leaked"); kstack_check_magic(l); binuptime(&bt); KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp); KASSERT((l->l_pflag & LP_RUNNING) != 0); KASSERT(l->l_cpu == curcpu() || l->l_stat == LSRUN); ci = curcpu(); spc = &ci->ci_schedstate; returning = false; newl = NULL; /* * If we have been asked to switch to a specific LWP, then there * is no need to inspect the run queues. If a soft interrupt is * blocking, then return to the interrupted thread without adjusting * VM context or its start time: neither have been changed in order * to take the interrupt. */ if (l->l_switchto != NULL) { if ((l->l_pflag & LP_INTR) != 0) { returning = true; softint_block(l); if ((l->l_pflag & LP_TIMEINTR) != 0) updatertime(l, &bt); } newl = l->l_switchto; l->l_switchto = NULL; } #ifndef __HAVE_FAST_SOFTINTS else if (ci->ci_data.cpu_softints != 0) { /* There are pending soft interrupts, so pick one. */ newl = softint_picklwp(); newl->l_stat = LSONPROC; newl->l_pflag |= LP_RUNNING; } #endif /* !__HAVE_FAST_SOFTINTS */ /* * If on the CPU and we have gotten this far, then we must yield. 
*/ if (l->l_stat == LSONPROC && l != newl) { KASSERT(lwp_locked(l, spc->spc_lwplock)); KASSERT((l->l_flag & LW_IDLE) == 0); l->l_stat = LSRUN; lwp_setlock(l, spc->spc_mutex); sched_enqueue(l); sched_preempted(l); /* * Handle migration. Note that "migrating LWP" may * be reset here, if interrupt/preemption happens * early in idle LWP. */ if (l->l_target_cpu != NULL && (l->l_pflag & LP_BOUND) == 0) { KASSERT((l->l_pflag & LP_INTR) == 0); spc->spc_migrating = l; } } /* Pick new LWP to run. */ if (newl == NULL) { newl = nextlwp(ci, spc); } /* Items that must be updated with the CPU locked. */ if (!returning) { /* Count time spent in current system call */ SYSCALL_TIME_SLEEP(l); updatertime(l, &bt); /* Update the new LWP's start time. */ newl->l_stime = bt; /* * ci_curlwp changes when a fast soft interrupt occurs. * We use ci_onproc to keep track of which kernel or * user thread is running 'underneath' the software * interrupt. This is important for time accounting, * itimers and forcing user threads to preempt (aston). */ ci->ci_onproc = newl; } /* * Preemption related tasks. Must be done holding spc_mutex. Clear * l_dopreempt without an atomic - it's only ever set non-zero by * sched_resched_cpu() which also holds spc_mutex, and only ever * cleared by the LWP itself (us) with atomics when not under lock. */ l->l_dopreempt = 0; if (__predict_false(l->l_pfailaddr != 0)) { LOCKSTAT_FLAG(lsflag); LOCKSTAT_ENTER(lsflag); LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime); LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN, 1, l->l_pfailtime, l->l_pfailaddr); LOCKSTAT_EXIT(lsflag); l->l_pfailtime = 0; l->l_pfaillock = 0; l->l_pfailaddr = 0; } if (l != newl) { struct lwp *prevlwp; /* Release all locks, but leave the current LWP locked */ if (l->l_mutex == spc->spc_mutex) { /* * Drop spc_lwplock, if the current LWP has been moved * to the run queue (it is now locked by spc_mutex). */ mutex_spin_exit(spc->spc_lwplock); } else { /* * Otherwise, drop the spc_mutex, we are done with the * run queues. */ mutex_spin_exit(spc->spc_mutex); } /* We're down to only one lock, so do debug checks. */ LOCKDEBUG_BARRIER(l->l_mutex, 1); /* Count the context switch. */ CPU_COUNT(CPU_COUNT_NSWTCH, 1); if ((l->l_pflag & LP_PREEMPTING) != 0) { l->l_ru.ru_nivcsw++; l->l_pflag &= ~LP_PREEMPTING; } else { l->l_ru.ru_nvcsw++; } /* * Increase the count of spin-mutexes before the release * of the last lock - we must remain at IPL_SCHED after * releasing the lock. */ KASSERTMSG(ci->ci_mtx_count == -1, "%s: cpu%u: ci_mtx_count (%d) != -1 " "(block with spin-mutex held)", __func__, cpu_index(ci), ci->ci_mtx_count); oldspl = MUTEX_SPIN_OLDSPL(ci); ci->ci_mtx_count = -2; /* Update status for lwpctl, if present. */ if (l->l_lwpctl != NULL) { l->l_lwpctl->lc_curcpu = (l->l_stat == LSZOMB ? LWPCTL_CPU_EXITED : LWPCTL_CPU_NONE); } /* * If curlwp is a soft interrupt LWP, there's nobody on the * other side to unlock - we're returning into an assembly * trampoline. Unlock now. This is safe because this is a * kernel LWP and is bound to current CPU: the worst anyone * else will do to it, is to put it back onto this CPU's run * queue (and the CPU is busy here right now!). */ if (returning) { /* Keep IPL_SCHED after this; MD code will fix up. */ l->l_pflag &= ~LP_RUNNING; lwp_unlock(l); } else { /* A normal LWP: save old VM context. */ pmap_deactivate(l); } /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. 
*/ if (__predict_false(dtrace_vtime_active)) { (*dtrace_vtime_switch_func)(newl); } /* * We must ensure not to come here from inside a read section. */ KASSERT(pserialize_not_in_read_section()); /* Switch to the new LWP.. */ #ifdef MULTIPROCESSOR KASSERT(curlwp == ci->ci_curlwp); #endif KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp); prevlwp = cpu_switchto(l, newl, returning); ci = curcpu(); #ifdef MULTIPROCESSOR KASSERT(curlwp == ci->ci_curlwp); #endif KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p", l, curlwp, prevlwp); KASSERT(prevlwp != NULL); KASSERT(l->l_cpu == ci); KASSERT(ci->ci_mtx_count == -2); /* * Immediately mark the previous LWP as no longer running * and unlock (to keep lock wait times short as possible). * We'll still be at IPL_SCHED afterwards. If a zombie, * don't touch after clearing LP_RUNNING as it could be * reaped by another CPU. Issue a memory barrier to ensure * this. * * atomic_store_release matches atomic_load_acquire in * lwp_free. */ KASSERT((prevlwp->l_pflag & LP_RUNNING) != 0); lock = prevlwp->l_mutex; if (__predict_false(prevlwp->l_stat == LSZOMB)) { atomic_store_release(&prevlwp->l_pflag, prevlwp->l_pflag & ~LP_RUNNING); } else { prevlwp->l_pflag &= ~LP_RUNNING; } mutex_spin_exit(lock); /* * Switched away - we have new curlwp. * Restore VM context and IPL. */ pmap_activate(l); pcu_switchpoint(l); /* Update status for lwpctl, if present. */ if (l->l_lwpctl != NULL) { l->l_lwpctl->lc_curcpu = (int)cpu_index(ci); l->l_lwpctl->lc_pctr++; } /* * Normalize the spin mutex count and restore the previous * SPL. Note that, unless the caller disabled preemption, * we can be preempted at any time after this splx(). */ KASSERT(l->l_cpu == ci); KASSERT(ci->ci_mtx_count == -1); ci->ci_mtx_count = 0; splx(oldspl); } else { /* Nothing to do - just unlock and return. */ mutex_spin_exit(spc->spc_mutex); l->l_pflag &= ~LP_PREEMPTING; lwp_unlock(l); } KASSERT(l == curlwp); KASSERT(l->l_stat == LSONPROC || (l->l_flag & LW_IDLE) != 0); SYSCALL_TIME_WAKEUP(l); LOCKDEBUG_BARRIER(NULL, 1); } /* * setrunnable: change LWP state to be runnable, placing it on the run queue. * * Call with the process and LWP locked. Will return with the LWP unlocked. */ void setrunnable(struct lwp *l) { struct proc *p = l->l_proc; struct cpu_info *ci; kmutex_t *oldlock; KASSERT((l->l_flag & LW_IDLE) == 0); KASSERT((l->l_flag & LW_DBGSUSPEND) == 0); KASSERT(mutex_owned(p->p_lock)); KASSERT(lwp_locked(l, NULL)); KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex); switch (l->l_stat) { case LSSTOP: /* * If we're being traced (possibly because someone attached us * while we were stopped), check for a signal from the debugger. */ if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xsig != 0) signotify(l); p->p_nrlwps++; break; case LSSUSPENDED: KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock)); l->l_flag &= ~LW_WSUSPEND; p->p_nrlwps++; cv_broadcast(&p->p_lwpcv); break; case LSSLEEP: KASSERT(l->l_wchan != NULL); break; case LSIDL: KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock)); break; default: panic("setrunnable: lwp %p state was %d", l, l->l_stat); } /* * If the LWP was sleeping, start it again. */ if (l->l_wchan != NULL) { l->l_stat = LSSLEEP; /* lwp_unsleep() will release the lock. */ lwp_unsleep(l, true); return; } /* * If the LWP is still on the CPU, mark it as LSONPROC. It may be * about to call mi_switch(), in which case it will yield. */ if ((l->l_pflag & LP_RUNNING) != 0) { l->l_stat = LSONPROC; l->l_slptime = 0; lwp_unlock(l); return; } /* * Look for a CPU to run. 
* Set the LWP runnable. */ ci = sched_takecpu(l); l->l_cpu = ci; spc_lock(ci); oldlock = lwp_setlock(l, l->l_cpu->ci_schedstate.spc_mutex); sched_setrunnable(l); l->l_stat = LSRUN; l->l_slptime = 0; sched_enqueue(l); sched_resched_lwp(l, true); /* SPC & LWP now unlocked. */ mutex_spin_exit(oldlock); } /* * suspendsched: * * Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED. */ void suspendsched(void) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; struct lwp *l; struct proc *p; /* * We do this by process in order not to violate the locking rules. */ mutex_enter(&proc_lock); PROCLIST_FOREACH(p, &allproc) { mutex_enter(p->p_lock); if ((p->p_flag & PK_SYSTEM) != 0) { mutex_exit(p->p_lock); continue; } if (p->p_stat != SSTOP) { if (p->p_stat != SZOMB && p->p_stat != SDEAD) { p->p_pptr->p_nstopchild++; p->p_waited = 0; } p->p_stat = SSTOP; } LIST_FOREACH(l, &p->p_lwps, l_sibling) { if (l == curlwp) continue; lwp_lock(l); /* * Set L_WREBOOT so that the LWP will suspend itself * when it tries to return to user mode. We want to * try and get to get as many LWPs as possible to * the user / kernel boundary, so that they will * release any locks that they hold. */ l->l_flag |= (LW_WREBOOT | LW_WSUSPEND); if (l->l_stat == LSSLEEP && (l->l_flag & LW_SINTR) != 0) { /* setrunnable() will release the lock. */ setrunnable(l); continue; } lwp_unlock(l); } mutex_exit(p->p_lock); } mutex_exit(&proc_lock); /* * Kick all CPUs to make them preempt any LWPs running in user mode. * They'll trap into the kernel and suspend themselves in userret(). * * Unusually, we don't hold any other scheduler object locked, which * would keep preemption off for sched_resched_cpu(), so disable it * explicitly. */ kpreempt_disable(); for (CPU_INFO_FOREACH(cii, ci)) { spc_lock(ci); sched_resched_cpu(ci, PRI_KERNEL, true); /* spc now unlocked */ } kpreempt_enable(); } /* * sched_unsleep: * * The is called when the LWP has not been awoken normally but instead * interrupted: for example, if the sleep timed out. Because of this, * it's not a valid action for running or idle LWPs. */ static void sched_unsleep(struct lwp *l, bool cleanup) { lwp_unlock(l); panic("sched_unsleep"); } static void sched_changepri(struct lwp *l, pri_t pri) { struct schedstate_percpu *spc; struct cpu_info *ci; KASSERT(lwp_locked(l, NULL)); ci = l->l_cpu; spc = &ci->ci_schedstate; if (l->l_stat == LSRUN) { KASSERT(lwp_locked(l, spc->spc_mutex)); sched_dequeue(l); l->l_priority = pri; sched_enqueue(l); sched_resched_lwp(l, false); } else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) { /* On priority drop, only evict realtime LWPs. */ KASSERT(lwp_locked(l, spc->spc_lwplock)); l->l_priority = pri; spc_lock(ci); sched_resched_cpu(ci, spc->spc_maxpriority, true); /* spc now unlocked */ } else { l->l_priority = pri; } } static void sched_lendpri(struct lwp *l, pri_t pri) { struct schedstate_percpu *spc; struct cpu_info *ci; KASSERT(lwp_locked(l, NULL)); ci = l->l_cpu; spc = &ci->ci_schedstate; if (l->l_stat == LSRUN) { KASSERT(lwp_locked(l, spc->spc_mutex)); sched_dequeue(l); l->l_inheritedprio = pri; l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio); sched_enqueue(l); sched_resched_lwp(l, false); } else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) { /* On priority drop, only evict realtime LWPs. 
*/ KASSERT(lwp_locked(l, spc->spc_lwplock)); l->l_inheritedprio = pri; l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio); spc_lock(ci); sched_resched_cpu(ci, spc->spc_maxpriority, true); /* spc now unlocked */ } else { l->l_inheritedprio = pri; l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio); } } struct lwp * syncobj_noowner(wchan_t wchan) { return NULL; } /* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */ const fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* * Constants for averages over 1, 5 and 15 minutes when sampling at * 5 second intervals. */ static const fixpt_t cexp[ ] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* * sched_pstats: * * => Update process statistics and check CPU resource allocation. * => Call scheduler-specific hook to eventually adjust LWP priorities. * => Compute load average of a quantity on 1, 5 and 15 minute intervals. */ void sched_pstats(void) { struct loadavg *avg = &averunnable; const int clkhz = (stathz != 0 ? stathz : hz); static bool backwardslwp = false; static bool backwardsproc = false; static u_int lavg_count = 0; struct proc *p; int nrun; sched_pstats_ticks++; if (++lavg_count >= 5) { lavg_count = 0; nrun = 0; } mutex_enter(&proc_lock); PROCLIST_FOREACH(p, &allproc) { struct lwp *l; struct rlimit *rlim; time_t runtm; int sig; /* Increment sleep time (if sleeping), ignore overflow. */ mutex_enter(p->p_lock); runtm = p->p_rtime.sec; LIST_FOREACH(l, &p->p_lwps, l_sibling) { fixpt_t lpctcpu; u_int lcpticks; if (__predict_false((l->l_flag & LW_IDLE) != 0)) continue; lwp_lock(l); if (__predict_false(l->l_rtime.sec < 0) && !backwardslwp) { backwardslwp = true; printf("WARNING: lwp %ld (%s%s%s): " "negative runtime: " "(%jd + 0x%016"PRIx64"/2^64) sec\n", (long)l->l_lid, l->l_proc->p_comm, l->l_name ? " " : "", l->l_name ? l->l_name : "", (intmax_t)l->l_rtime.sec, l->l_rtime.frac); } runtm += l->l_rtime.sec; l->l_swtime++; sched_lwp_stats(l); /* For load average calculation. */ if (__predict_false(lavg_count == 0) && (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) { switch (l->l_stat) { case LSSLEEP: if (l->l_slptime > 1) { break; } /* FALLTHROUGH */ case LSRUN: case LSONPROC: case LSIDL: nrun++; } } lwp_unlock(l); l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT; if (l->l_slptime != 0) continue; lpctcpu = l->l_pctcpu; lcpticks = atomic_swap_uint(&l->l_cpticks, 0); lpctcpu += ((FSCALE - ccpu) * (lcpticks * FSCALE / clkhz)) >> FSHIFT; l->l_pctcpu = lpctcpu; } /* Calculating p_pctcpu only for ps(1) */ p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT; if (__predict_false(runtm < 0)) { if (!backwardsproc) { backwardsproc = true; printf("WARNING: pid %ld (%s): " "negative runtime; " "monotonic clock has gone backwards\n", (long)p->p_pid, p->p_comm); } mutex_exit(p->p_lock); continue; } /* * Check if the process exceeds its CPU resource allocation. * If over the hard limit, kill it with SIGKILL. * If over the soft limit, send SIGXCPU and raise * the soft limit a little. 
*/ rlim = &p->p_rlimit[RLIMIT_CPU]; sig = 0; if (__predict_false(runtm >= rlim->rlim_cur)) { if (runtm >= rlim->rlim_max) { sig = SIGKILL; log(LOG_NOTICE, "pid %d, command %s, is killed: %s\n", p->p_pid, p->p_comm, "exceeded RLIMIT_CPU"); uprintf("pid %d, command %s, is killed: %s\n", p->p_pid, p->p_comm, "exceeded RLIMIT_CPU"); } else { sig = SIGXCPU; if (rlim->rlim_cur < rlim->rlim_max) rlim->rlim_cur += 5; } } mutex_exit(p->p_lock); if (__predict_false(sig)) { KASSERT((p->p_flag & PK_SYSTEM) == 0); psignal(p, sig); } } /* Load average calculation. */ if (__predict_false(lavg_count == 0)) { int i; CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg)); for (i = 0; i < __arraycount(cexp); i++) { avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; } } /* Lightning bolt. */ cv_broadcast(&lbolt); mutex_exit(&proc_lock); }
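/*
 * Illustrative sketch, not part of the kernel sources above: a small
 * standalone userland program that re-derives the fixed-point decay
 * constants used by sched_pstats().  Applied once per second, ccpu =
 * exp(-1/20) decays l_pctcpu/p_pctcpu to exp(-3) ~= 5% after 60 ticks
 * (the "decay 95% in 60 seconds" comment above), and cexp[] holds
 * exp(-5/60), exp(-5/300) and exp(-5/900), the per-5-second factors for
 * the 1, 5 and 15 minute load averages.  FSHIFT/FSCALE below are
 * assumptions meant to mirror <sys/param.h> (FSHIFT is 11 on NetBSD,
 * so FSCALE is 2048); build with something like "cc sketch.c -lm"
 * (file name hypothetical).
 */
#include <math.h>
#include <stdio.h>

#define FSHIFT	11			/* assumed; see <sys/param.h> */
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	const double minutes[3] = { 1.0, 5.0, 15.0 };
	const double ccpu = exp(-1.0 / 20.0);
	int i;

	/* ccpu^60 == exp(-3) ~= 0.0498, i.e. ~95% of p_pctcpu gone in 60s */
	printf("ccpu    = %u/%u (residue after 60 ticks: %.3f)\n",
	    (unsigned)(ccpu * FSCALE), (unsigned)FSCALE, pow(ccpu, 60.0));

	/* per-5-second factors for the 1, 5 and 15 minute load averages */
	for (i = 0; i < 3; i++) {
		const double c = exp(-5.0 / (minutes[i] * 60.0));
		printf("cexp[%d] = %u/%u (exp(-5/%.0f))\n",
		    i, (unsigned)(c * FSCALE), (unsigned)FSCALE,
		    minutes[i] * 60.0);
	}
	return 0;
}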
/* $NetBSD: ip6_input.c,v 1.227 2022/10/28 05:18:39 ozaki-r Exp $ */ /* $KAME: ip6_input.c,v 1.188 2001/03/29 05:34:31 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ip6_input.c,v 1.227 2022/10/28 05:18:39 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_gateway.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/errno.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/proc.h> #include <sys/sysctl.h> #include <sys/cprng.h> #include <sys/percpu.h> #include <net/if.h> #include <net/if_types.h> #include <net/if_dl.h> #include <net/route.h> #include <net/pktqueue.h> #include <net/pfil.h> #include <netinet/in.h> #include <netinet/in_systm.h> #ifdef INET #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/ip_icmp.h> #endif /* INET */ #include <netinet/ip6.h> #include <netinet/portalgo.h> #include <netinet6/in6_var.h> #include <netinet6/ip6_var.h> #include <netinet6/ip6_private.h> #include <netinet6/in6_pcb.h> #include <netinet/icmp6.h> #include <netinet6/scope6_var.h> #include <netinet6/in6_ifattach.h> #include <netinet6/nd6.h> #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/ipsec6.h> #include <netipsec/key.h> #endif /* IPSEC */ #include <netinet6/ip6protosw.h> #include "faith.h" extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; pktqueue_t *ip6_pktq __read_mostly; pfil_head_t *inet6_pfil_hook; percpu_t *ip6stat_percpu; percpu_t *ip6_forward_rt_percpu __cacheline_aligned; static void ip6intr(void *); static void ip6_input(struct mbuf *, struct ifnet *); static bool ip6_badaddr(struct ip6_hdr *); static struct m_tag *ip6_setdstifaddr(struct mbuf *, const struct in6_ifaddr *); static struct m_tag *ip6_addaux(struct mbuf *); static struct m_tag *ip6_findaux(struct mbuf *); static void ip6_delaux(struct mbuf *); static int ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *, u_int32_t *); static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); static void sysctl_net_inet6_ip6_setup(struct sysctllog **); #ifdef NET_MPSAFE #define SOFTNET_LOCK() mutex_enter(softnet_lock) #define SOFTNET_UNLOCK() mutex_exit(softnet_lock) #else #define SOFTNET_LOCK() KASSERT(mutex_owned(softnet_lock)) #define SOFTNET_UNLOCK() KASSERT(mutex_owned(softnet_lock)) #endif /* Ensure that non packed structures are the desired size. 
*/ __CTASSERT(sizeof(struct ip6_hdr) == 40); __CTASSERT(sizeof(struct ip6_ext) == 2); __CTASSERT(sizeof(struct ip6_hbh) == 2); __CTASSERT(sizeof(struct ip6_dest) == 2); __CTASSERT(sizeof(struct ip6_opt) == 2); __CTASSERT(sizeof(struct ip6_opt_jumbo) == 6); __CTASSERT(sizeof(struct ip6_opt_nsap) == 4); __CTASSERT(sizeof(struct ip6_opt_tunnel) == 3); __CTASSERT(sizeof(struct ip6_opt_router) == 4); __CTASSERT(sizeof(struct ip6_rthdr) == 4); __CTASSERT(sizeof(struct ip6_rthdr0) == 8); __CTASSERT(sizeof(struct ip6_frag) == 8); /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. */ void ip6_init(void) { const struct ip6protosw *pr; int i; in6_init(); ip6_pktq = pktq_create(IFQ_MAXLEN, ip6intr, NULL); KASSERT(ip6_pktq != NULL); sysctl_net_inet6_ip6_setup(NULL); pr = (const struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip6_init"); for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; for (pr = (const struct ip6protosw *)inet6domain.dom_protosw; pr < (const struct ip6protosw *)inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) ip6_protox[pr->pr_protocol] = pr - inet6sw; scope6_init(); addrsel_policy_init(); nd6_init(); frag6_init(); #ifdef GATEWAY ip6flow_init(ip6_hashsize); #endif /* Register our Packet Filter hook. */ inet6_pfil_hook = pfil_head_create(PFIL_TYPE_AF, (void *)AF_INET6); KASSERT(inet6_pfil_hook != NULL); ip6stat_percpu = percpu_alloc(sizeof(uint64_t) * IP6_NSTATS); ip6_forward_rt_percpu = rtcache_percpu_alloc(); } /* * IP6 input interrupt handling. Just pass the packet to ip6_input. */ static void ip6intr(void *arg __unused) { struct mbuf *m; SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE(); while ((m = pktq_dequeue(ip6_pktq)) != NULL) { struct psref psref; struct ifnet *rcvif = m_get_rcvif_psref(m, &psref); if (rcvif == NULL) { IP6_STATINC(IP6_STAT_IFDROP); m_freem(m); continue; } /* * Drop the packet if IPv6 is disabled on the interface. */ if ((ND_IFINFO(rcvif)->flags & ND6_IFF_IFDISABLED)) { m_put_rcvif_psref(rcvif, &psref); IP6_STATINC(IP6_STAT_IFDROP); m_freem(m); continue; } ip6_input(m, rcvif); m_put_rcvif_psref(rcvif, &psref); } SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } static void ip6_input(struct mbuf *m, struct ifnet *rcvif) { struct ip6_hdr *ip6; int hit, off = sizeof(struct ip6_hdr), nest; u_int32_t plen; u_int32_t rtalert = ~0; int nxt, ours = 0, rh_present = 0, frg_present; struct ifnet *deliverifp = NULL; int srcrt = 0; struct rtentry *rt = NULL; union { struct sockaddr dst; struct sockaddr_in6 dst6; } u; struct route *ro; KASSERT(rcvif != NULL); /* * make sure we don't have onion peering information into m_tag. */ ip6_delaux(m); /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) IP6_STATINC(IP6_STAT_MEXT2M); else IP6_STATINC(IP6_STAT_MEXT1); } else { #define M2MMAX 32 if (m->m_next) { if (m->m_flags & M_LOOP) /*XXX*/ IP6_STATINC(IP6_STAT_M2M + lo0ifp->if_index); else if (rcvif->if_index < M2MMAX) IP6_STATINC(IP6_STAT_M2M + rcvif->if_index); else IP6_STATINC(IP6_STAT_M2M); } else IP6_STATINC(IP6_STAT_M1); #undef M2MMAX } in6_ifstat_inc(rcvif, ifs6_in_receive); IP6_STATINC(IP6_STAT_TOTAL); /* * If the IPv6 header is not aligned, slurp it up into a new * mbuf with space for link headers, in the event we forward * it. Otherwise, if it is aligned, make sure the entire base * IPv6 header is in the first mbuf of the chain. 
*/ if (M_GET_ALIGNED_HDR(&m, struct ip6_hdr, true) != 0) { /* XXXJRT new stat, please */ IP6_STATINC(IP6_STAT_TOOSMALL); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); return; } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6_STATINC(IP6_STAT_BADVERS); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } if (ip6_badaddr(ip6)) { IP6_STATINC(IP6_STAT_BADSCOPE); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } /* * Assume that we can create a fast-forward IP flow entry * based on this packet. */ m->m_flags |= M_CANFASTFWD; /* * Run through list of hooks for input packets. If there are any * filters which require that additional packets in the flow are * not fast-forwarded, they must clear the M_CANFASTFWD flag. * Note that filters must _never_ set this flag, as another filter * in the list may have previously cleared it. * * Don't call hooks if the packet has already been processed by * IPsec (encapsulated, tunnel mode). */ #if defined(IPSEC) if (!ipsec_used || !ipsec_skip_pfil(m)) #else if (1) #endif { struct in6_addr odst; int error; odst = ip6->ip6_dst; error = pfil_run_hooks(inet6_pfil_hook, &m, rcvif, PFIL_IN); if (error != 0 || m == NULL) { IP6_STATINC(IP6_STAT_PFILDROP_IN); return; } if (m->m_len < sizeof(struct ip6_hdr)) { if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6_STATINC(IP6_STAT_TOOSMALL); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); return; } } ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); } IP6_STATINC(IP6_STAT_NXTHIST + ip6->ip6_nxt); #ifdef ALTQ if (altq_input != NULL) { SOFTNET_LOCK(); if ((*altq_input)(m, AF_INET6) == 0) { SOFTNET_UNLOCK(); /* packet is dropped by traffic conditioner */ return; } SOFTNET_UNLOCK(); } #endif /* * Disambiguate address scope zones (if there is ambiguity). * We first make sure that the original source or destination address * is not in our internal form for scoped addresses. Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. */ if (__predict_false( m_makewritable(&m, 0, sizeof(struct ip6_hdr), M_DONTWAIT))) { IP6_STATINC(IP6_STAT_IDROPPED); goto bad; } ip6 = mtod(m, struct ip6_hdr *); if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { IP6_STATINC(IP6_STAT_BADSCOPE); /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, rcvif, NULL) || in6_setscope(&ip6->ip6_dst, rcvif, NULL)) { IP6_STATINC(IP6_STAT_BADSCOPE); goto bad; } ro = rtcache_percpu_getref(ip6_forward_rt_percpu); /* * Multicast check */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { bool ingroup; in6_ifstat_inc(rcvif, ifs6_in_mcast); /* * See if we belong to the destination multicast group on the * arrival interface. 
*/ ingroup = in6_multi_group(&ip6->ip6_dst, rcvif); if (ingroup) { ours = 1; } else if (!ip6_mrouter) { uint64_t *ip6s = IP6_STAT_GETREF(); ip6s[IP6_STAT_NOTMEMBER]++; ip6s[IP6_STAT_CANTFORWARD]++; IP6_STAT_PUTREF(); in6_ifstat_inc(rcvif, ifs6_in_discard); goto bad_unref; } deliverifp = rcvif; goto hbhcheck; } sockaddr_in6_init(&u.dst6, &ip6->ip6_dst, 0, 0, 0); /* * Unicast check */ rt = rtcache_lookup2(ro, &u.dst, 1, &hit); if (hit) IP6_STATINC(IP6_STAT_FORWARD_CACHEHIT); else IP6_STATINC(IP6_STAT_FORWARD_CACHEMISS); /* * Accept the packet if the forwarding interface to the destination * (according to the routing table) is the loopback interface, * unless the associated route has a gateway. * * We don't explicitly match ip6_dst against an interface here. It * is already done in rtcache_lookup2: rt->rt_ifp->if_type will be * IFT_LOOP if the packet is for us. * * Note that this approach causes to accept a packet if there is a * route to the loopback interface for the destination of the packet. * But we think it's even useful in some situations, e.g. when using * a special daemon which wants to intercept the packet. */ if (rt != NULL && (rt->rt_flags & (RTF_HOST|RTF_GATEWAY)) == RTF_HOST && rt->rt_ifp->if_type == IFT_LOOP) { struct in6_ifaddr *ia6 = (struct in6_ifaddr *)rt->rt_ifa; int addrok; if (ia6->ia6_flags & IN6_IFF_ANYCAST) m->m_flags |= M_ANYCAST6; /* * packets to a tentative, duplicated, or somehow invalid * address must not be accepted. */ if (ia6->ia6_flags & IN6_IFF_NOTREADY) addrok = 0; else if (ia6->ia6_flags & IN6_IFF_DETACHED && !IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* Allow internal traffic to DETACHED addresses */ struct sockaddr_in6 sin6; int s; memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = ip6->ip6_src; s = pserialize_read_enter(); addrok = (ifa_ifwithaddr(sin6tosa(&sin6)) != NULL); pserialize_read_exit(s); } else addrok = 1; if (addrok) { /* this address is ready */ ours = 1; deliverifp = ia6->ia_ifp; /* correct? */ goto hbhcheck; } else { /* address is not ready, so discard the packet. */ char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; nd6log(LOG_INFO, "packet to an unready address %s->%s\n", IN6_PRINT(ip6bufs, &ip6->ip6_src), IN6_PRINT(ip6bufd, &ip6->ip6_dst)); IP6_STATINC(IP6_STAT_IDROPPED); goto bad_unref; } } /* * FAITH (Firewall Aided Internet Translator) */ #if defined(NFAITH) && 0 < NFAITH if (ip6_keepfaith) { if (rt != NULL && rt->rt_ifp != NULL && rt->rt_ifp->if_type == IFT_FAITH) { /* XXX do we need more sanity checks? */ ours = 1; deliverifp = rt->rt_ifp; /* faith */ goto hbhcheck; } } #endif /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!ip6_forwarding) { IP6_STATINC(IP6_STAT_CANTFORWARD); in6_ifstat_inc(rcvif, ifs6_in_discard); goto bad_unref; } hbhcheck: /* * Record address information into m_tag, if we don't have one yet. * Note that we are unable to record it, if the address is not listed * as our interface address (e.g. multicast addresses, addresses * within FAITH prefixes and such). */ if (deliverifp && ip6_getdstifaddr(m) == NULL) { struct in6_ifaddr *ia6; int s = pserialize_read_enter(); ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst); /* Depends on ip6_setdstifaddr never sleep */ if (ia6 != NULL && ip6_setdstifaddr(m, ia6) == NULL) { /* * XXX maybe we should drop the packet here, * as we could not provide enough information * to the upper layers. 
*/ } pserialize_read_exit(s); } /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; if (ip6_hopopts_input(&plen, &rtalert, &m, &off)) { /* m already freed */ in6_ifstat_inc(rcvif, ifs6_in_discard); rtcache_unref(rt, ro); rtcache_percpu_putref(ip6_forward_rt_percpu); return; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ IP6_STATINC(IP6_STAT_BADOPTIONS); in6_ifstat_inc(rcvif, ifs6_in_discard); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (char *)&ip6->ip6_plen - (char *)ip6); rtcache_unref(rt, ro); rtcache_percpu_putref(ip6_forward_rt_percpu); return; } IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6_STATINC(IP6_STAT_TOOSHORT); rtcache_unref(rt, ro); rtcache_percpu_putref(ip6_forward_rt_percpu); return; } KASSERT(ACCESSIBLE_POINTER(hbh, struct ip6_hdr)); nxt = hbh->ip6h_nxt; /* * accept the packet if a router alert option is included * and we act as an IPv6 router. */ if (rtalert != ~0 && ip6_forwarding) ours = 1; } else nxt = ip6->ip6_nxt; /* * Check that the amount of data in the buffers is at least much as * the IPv6 header would have us expect. Trim mbufs if longer than we * expect. Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { IP6_STATINC(IP6_STAT_TOOSHORT); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad_unref; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip6_mrouter != NULL) { int error; SOFTNET_LOCK(); error = ip6_mforward(ip6, rcvif, m); SOFTNET_UNLOCK(); if (error != 0) { rtcache_unref(rt, ro); rtcache_percpu_putref(ip6_forward_rt_percpu); IP6_STATINC(IP6_STAT_CANTFORWARD); goto bad; } } if (!ours) { IP6_STATINC(IP6_STAT_CANTFORWARD); goto bad_unref; } } else if (!ours) { rtcache_unref(rt, ro); rtcache_percpu_putref(ip6_forward_rt_percpu); ip6_forward(m, srcrt, rcvif); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. 
*/ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6_STATINC(IP6_STAT_BADSCOPE); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad_unref; } #ifdef IFA_STATS if (deliverifp != NULL) { struct in6_ifaddr *ia6; int s = pserialize_read_enter(); ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst); if (ia6) ia6->ia_ifa.ifa_data.ifad_inbytes += m->m_pkthdr.len; pserialize_read_exit(s); } #endif IP6_STATINC(IP6_STAT_DELIVERED); in6_ifstat_inc(deliverifp, ifs6_in_deliver); nest = 0; if (rt != NULL) { rtcache_unref(rt, ro); rt = NULL; } rtcache_percpu_putref(ip6_forward_rt_percpu); rh_present = 0; frg_present = 0; while (nxt != IPPROTO_DONE) { if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { IP6_STATINC(IP6_STAT_TOOMANYHDR); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } M_VERIFY_PACKET(m); /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6_STATINC(IP6_STAT_TOOSHORT); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad; } if (nxt == IPPROTO_ROUTING) { if (rh_present++) { in6_ifstat_inc(rcvif, ifs6_in_hdrerr); IP6_STATINC(IP6_STAT_BADOPTIONS); goto bad; } } else if (nxt == IPPROTO_FRAGMENT) { if (frg_present++) { in6_ifstat_inc(rcvif, ifs6_in_hdrerr); IP6_STATINC(IP6_STAT_BADOPTIONS); goto bad; } } #ifdef IPSEC if (ipsec_used) { /* * Enforce IPsec policy checking if we are seeing last * header. Note that we do not visit this with * protocols with pcb layer code - like udp/tcp/raw ip. */ if ((inet6sw[ip6_protox[nxt]].pr_flags & PR_LASTHDR) != 0) { int error; error = ipsec_ip_input_checkpolicy(m, false); if (error) { IP6_STATINC(IP6_STAT_IPSECDROP_IN); goto bad; } } } #endif nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad_unref: rtcache_unref(rt, ro); rtcache_percpu_putref(ip6_forward_rt_percpu); bad: m_freem(m); return; } static bool ip6_badaddr(struct ip6_hdr *ip6) { /* Check against address spoofing/corruption. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { return true; } /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partial support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { return true; } /* * Reject packets with IPv4-compatible IPv6 addresses (RFC4291). */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { return true; } return false; } /* * set/grab in6_ifaddr correspond to IPv6 destination address. 
*/ static struct m_tag * ip6_setdstifaddr(struct mbuf *m, const struct in6_ifaddr *ia) { struct m_tag *mtag; struct ip6aux *ip6a; mtag = ip6_addaux(m); if (mtag == NULL) return NULL; ip6a = (struct ip6aux *)(mtag + 1); if (in6_setscope(&ip6a->ip6a_src, ia->ia_ifp, &ip6a->ip6a_scope_id)) { IP6_STATINC(IP6_STAT_BADSCOPE); return NULL; } ip6a->ip6a_src = ia->ia_addr.sin6_addr; ip6a->ip6a_flags = ia->ia6_flags; return mtag; } const struct ip6aux * ip6_getdstifaddr(struct mbuf *m) { struct m_tag *mtag; mtag = ip6_findaux(m); if (mtag != NULL) return (struct ip6aux *)(mtag + 1); else return NULL; } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. * * rtalertp - XXX: should be stored more smart way */ int ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct mbuf **mp, int *offp) { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; /* validation of the length of the header */ IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6_STATINC(IP6_STAT_TOOSHORT); return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { IP6_STATINC(IP6_STAT_TOOSHORT); return -1; } KASSERT(ACCESSIBLE_POINTER(hbh, struct ip6_hdr)); off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return -1; *offp = off; *mp = m; return 0; } /* * Search header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself process its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that hbh header is located right after the IPv6 header * (RFC2460 p7), opthead is pointer into data content in m, and opthead to * opthead + hbhlen is located in continuous memory region. */ static int ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { IP6_STATINC(IP6_STAT_TOOSMALL); goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_RTALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { IP6_STATINC(IP6_STAT_TOOSMALL); goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { IP6_STATINC(IP6_STAT_BADOPTIONS); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; memcpy((void *)&rtalert_val, (void *)(opt + 2), 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { IP6_STATINC(IP6_STAT_TOOSMALL); goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { IP6_STATINC(IP6_STAT_BADOPTIONS); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have non 0 payload length * must not contain a jumbo payload option. 
*/ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { IP6_STATINC(IP6_STAT_BADOPTIONS); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform memcpy(). */ memcpy(&jumboplen, opt + 2, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. * the behavior may need some debate in ipngwg - * multiple options does not make sense, however, * there's no explicit mention in specification. */ if (*plenp != 0) { IP6_STATINC(IP6_STAT_BADOPTIONS); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { IP6_STATINC(IP6_STAT_BADOPTIONS); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { IP6_STATINC(IP6_STAT_TOOSMALL); goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return (-1); optlen += 2; break; } } return (0); bad: m_freem(m); return (-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary if the IPv6 header the and option header and IPv6 header * is not continuous in order to return an ICMPv6 error. */ int ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return ((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ IP6_STATINC(IP6_STAT_BADOPTIONS); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ IP6_STATINC(IP6_STAT_BADOPTIONS); ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); } m_freem(m); /* XXX: NOTREACHED */ return (-1); } void ip6_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip6_hdr *ip6, struct mbuf *m) { struct socket *so = inp->inp_socket; #ifdef RFC2292 #define IS2292(x, y) ((inp->inp_flags & IN6P_RFC2292) ? (x) : (y)) #else #define IS2292(x, y) (y) #endif KASSERT(m->m_flags & M_PKTHDR); if (SOOPT_TIMESTAMP(so->so_options)) mp = sbsavetimestamp(so->so_options, mp); /* some OSes call this logic with IPv4 packet, for SO_TIMESTAMP */ if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) return; /* RFC 2292 sec. 
5 */ if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; memcpy(&pi6.ipi6_addr, &ip6->ip6_dst, sizeof(struct in6_addr)); in6_clearscope(&pi6.ipi6_addr); /* XXX */ pi6.ipi6_ifindex = m->m_pkthdr.rcvif_index; *mp = sbcreatecontrol(&pi6, sizeof(pi6), IS2292(IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & IN6P_HOPLIMIT) { int hlim = ip6->ip6_hlim & 0xff; *mp = sbcreatecontrol(&hlim, sizeof(hlim), IS2292(IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_TCLASS) != 0) { u_int32_t flowinfo; int tclass; flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); flowinfo >>= 20; tclass = flowinfo & 0xff; *mp = sbcreatecontrol(&tclass, sizeof(tclass), IPV6_TCLASS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to normal user. * See also RFC3542 section 8 (or RFC2292 section 6). */ if ((inp->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which fact is assured through * the IPv6 input processing. */ struct ip6_hdr *xip6 = mtod(m, struct ip6_hdr *); if (xip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen; struct mbuf *ext; ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), xip6->ip6_nxt); if (ext == NULL) { IP6_STATINC(IP6_STAT_TOOSHORT); return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); IP6_STATINC(IP6_STAT_TOOSHORT); return; } /* * XXX: We copy whole the header even if a jumbo * payload option is included, which option is to * be removed before returning in the RFC 2292. * Note: this constraint is removed in RFC3542. */ *mp = sbcreatecontrol(hbh, hbhlen, IS2292(IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; m_freem(ext); } } /* IPV6_DSTOPTS and IPV6_RTHDR socket options */ if (inp->inp_flags & (IN6P_DSTOPTS | IN6P_RTHDR)) { struct ip6_hdr *xip6 = mtod(m, struct ip6_hdr *); int nxt = xip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and stores each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ for (;;) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; struct mbuf *ext = NULL; /* * if it is not an extension header, don't try to * pull it from the chain. */ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? 
*/ break; default: goto loopend; } ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { IP6_STATINC(IP6_STAT_TOOSHORT); return; } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (elen != ext->m_len) { m_freem(ext); IP6_STATINC(IP6_STAT_TOOSHORT); return; } KASSERT(ACCESSIBLE_POINTER(ip6e, struct ip6_hdr)); switch (nxt) { case IPPROTO_DSTOPTS: if (!(inp->inp_flags & IN6P_DSTOPTS)) break; *mp = sbcreatecontrol(ip6e, elen, IS2292(IPV6_2292DSTOPTS, IPV6_DSTOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: if (!(inp->inp_flags & IN6P_RTHDR)) break; *mp = sbcreatecontrol(ip6e, elen, IS2292(IPV6_2292RTHDR, IPV6_RTHDR), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: /* * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). */ m_freem(ext); goto loopend; } /* proceed with the next header. */ off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; m_freem(ext); ext = NULL; } loopend: ; } } #undef IS2292 void ip6_notify_pmtu(struct inpcb *inp, const struct sockaddr_in6 *dst, uint32_t *mtu) { struct socket *so; struct mbuf *m_mtu; struct ip6_mtuinfo mtuctl; so = inp->inp_socket; if (mtu == NULL) return; KASSERT(so != NULL); memset(&mtuctl, 0, sizeof(mtuctl)); /* zero-clear for safety */ mtuctl.ip6m_mtu = *mtu; mtuctl.ip6m_addr = *dst; if (sa6_recoverscope(&mtuctl.ip6m_addr)) return; if ((m_mtu = sbcreatecontrol(&mtuctl, sizeof(mtuctl), IPV6_PATHMTU, IPPROTO_IPV6)) == NULL) return; if (sbappendaddr(&so->so_rcv, (const struct sockaddr *)dst, NULL, m_mtu) == 0) { soroverflow(so); m_freem(m_mtu); } else sorwakeup(so); return; } /* * pull single extension header from mbuf chain. returns single mbuf that * contains the result, or NULL on error. */ static struct mbuf * ip6_pullexthdr(struct mbuf *m, size_t off, int nxt) { struct ip6_ext ip6e; size_t elen; struct mbuf *n; if (off + sizeof(ip6e) > m->m_pkthdr.len) return NULL; m_copydata(m, off, sizeof(ip6e), (void *)&ip6e); if (nxt == IPPROTO_AH) elen = (ip6e.ip6e_len + 2) << 2; else elen = (ip6e.ip6e_len + 1) << 3; if (off + elen > m->m_pkthdr.len) return NULL; MGET(n, M_DONTWAIT, MT_DATA); if (n && elen >= MLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (!n) return NULL; n->m_len = 0; if (elen >= M_TRAILINGSPACE(n)) { m_free(n); return NULL; } m_copydata(m, off, elen, mtod(n, void *)); n->m_len = elen; return n; } /* * Get offset to the previous header followed by the header * currently processed. */ int ip6_get_prevhdr(struct mbuf *m, int off) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if (off == sizeof(struct ip6_hdr)) { return offsetof(struct ip6_hdr, ip6_nxt); } else if (off < sizeof(struct ip6_hdr)) { panic("%s: off < sizeof(struct ip6_hdr)", __func__); } else { int len, nlen, nxt; struct ip6_ext ip6e; nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); nlen = 0; while (len < off) { m_copydata(m, len, sizeof(ip6e), &ip6e); switch (nxt) { case IPPROTO_FRAGMENT: nlen = sizeof(struct ip6_frag); break; case IPPROTO_AH: nlen = (ip6e.ip6e_len + 2) << 2; break; default: nlen = (ip6e.ip6e_len + 1) << 3; break; } len += nlen; nxt = ip6e.ip6e_nxt; } return (len - nlen); } } /* * get next header offset. m will be retained. 
*/ int ip6_nexthdr(struct mbuf *m, int off, int proto, int *nxtp) { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("%s: m == NULL", __func__); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: /* do not chase beyond intermediate IPv6 headers */ if (off != 0) return -1; if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (void *)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. */ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (void *)&fh); if ((fh.ip6f_offlg & IP6F_OFF_MASK) != 0) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (void *)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; if (m->m_pkthdr.len < off) return -1; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (void *)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; if (m->m_pkthdr.len < off) return -1; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(struct mbuf *m, int off, int proto, int *nxtp) { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } for (;;) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } static struct m_tag * ip6_addaux(struct mbuf *m) { struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_INET6); if (!mtag) { mtag = m_tag_get(PACKET_TAG_INET6, sizeof(struct ip6aux), M_NOWAIT); if (mtag) { m_tag_prepend(m, mtag); memset(mtag + 1, 0, sizeof(struct ip6aux)); } } return mtag; } static struct m_tag * ip6_findaux(struct mbuf *m) { struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_INET6); return mtag; } static void ip6_delaux(struct mbuf *m) { struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_INET6); if (mtag) m_tag_delete(m, mtag); } /* * System control for IP6 */ const u_char inet6ctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, 0, 0, ENOPROTOOPT }; extern int sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS); static int sysctl_net_inet6_ip6_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(ip6stat_percpu, IP6_NSTATS)); } static void sysctl_net_inet6_ip6_setup(struct sysctllog **clog) { const struct sysctlnode *ip6_node; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet6", SYSCTL_DESCR("PF_INET6 related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET6, CTL_EOL); sysctl_createv(clog, 0, NULL, &ip6_node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ip6", SYSCTL_DESCR("IPv6 related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "forwarding", SYSCTL_DESCR("Enable forwarding of INET6 datagrams"), NULL, 0, &ip6_forwarding, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, 
IPV6CTL_FORWARDING, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "redirect", SYSCTL_DESCR("Enable sending of ICMPv6 redirect messages"), NULL, 0, &ip6_sendredirects, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_SENDREDIRECTS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "hlim", SYSCTL_DESCR("Hop limit for an INET6 datagram"), NULL, 0, &ip6_defhlim, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_DEFHLIM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxfragpackets", SYSCTL_DESCR("Maximum number of fragments to buffer " "for reassembly"), NULL, 0, &ip6_maxfragpackets, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_MAXFRAGPACKETS, CTL_EOL); pktq_sysctl_setup(ip6_pktq, clog, ip6_node, IPV6CTL_IFQ); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "keepfaith", SYSCTL_DESCR("Activate faith interface"), NULL, 0, &ip6_keepfaith, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_KEEPFAITH, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "log_interval", SYSCTL_DESCR("Minimum interval between logging " "unroutable packets"), NULL, 0, &ip6_log_interval, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_LOG_INTERVAL, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "hdrnestlimit", SYSCTL_DESCR("Maximum number of nested IPv6 headers"), NULL, 0, &ip6_hdrnestlimit, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_HDRNESTLIMIT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "dad_count", SYSCTL_DESCR("Number of Duplicate Address Detection " "probes to send"), NULL, 0, &ip6_dad_count, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_DAD_COUNT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "auto_flowlabel", SYSCTL_DESCR("Assign random IPv6 flow labels"), NULL, 0, &ip6_auto_flowlabel, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_AUTO_FLOWLABEL, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "defmcasthlim", SYSCTL_DESCR("Default multicast hop limit"), NULL, 0, &ip6_defmcasthlim, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_DEFMCASTHLIM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "kame_version", SYSCTL_DESCR("KAME Version"), NULL, 0, __UNCONST(__KAME_VERSION), 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_KAME_VERSION, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "use_deprecated", SYSCTL_DESCR("Allow use of deprecated addresses as " "source addresses"), NULL, 0, &ip6_use_deprecated, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_USE_DEPRECATED, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT #ifndef INET6_BINDV6ONLY |CTLFLAG_READWRITE, #endif CTLTYPE_INT, "v6only", SYSCTL_DESCR("Disallow PF_INET6 sockets from connecting " "to PF_INET sockets"), NULL, 0, &ip6_v6only, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_V6ONLY, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "anonportmin", SYSCTL_DESCR("Lowest ephemeral port number to assign"), sysctl_net_inet_ip_ports, 0, &ip6_anonportmin, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_ANONPORTMIN, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "anonportmax", SYSCTL_DESCR("Highest 
ephemeral port number to assign"), sysctl_net_inet_ip_ports, 0, &ip6_anonportmax, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_ANONPORTMAX, CTL_EOL); #ifndef IPNOPRIVPORTS sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "lowportmin", SYSCTL_DESCR("Lowest privileged ephemeral port number " "to assign"), sysctl_net_inet_ip_ports, 0, &ip6_lowportmin, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_LOWPORTMIN, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "lowportmax", SYSCTL_DESCR("Highest privileged ephemeral port number " "to assign"), sysctl_net_inet_ip_ports, 0, &ip6_lowportmax, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_LOWPORTMAX, CTL_EOL); #endif /* IPNOPRIVPORTS */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "auto_linklocal", SYSCTL_DESCR("Default value of per-interface flag for " "adding an IPv6 link-local address to " "interfaces when attached"), NULL, 0, &ip6_auto_linklocal, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_AUTO_LINKLOCAL, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_STRUCT, "addctlpolicy", SYSCTL_DESCR("Return the current address control" " policy"), sysctl_net_inet6_addrctlpolicy, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_ADDRCTLPOLICY, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "prefer_tempaddr", SYSCTL_DESCR("Prefer temporary address as source " "address"), NULL, 0, &ip6_prefer_tempaddr, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxfrags", SYSCTL_DESCR("Maximum fragments in reassembly queue"), NULL, 0, &ip6_maxfrags, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_MAXFRAGS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("IPv6 statistics"), sysctl_net_inet6_ip6_stats, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_STATS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "use_defaultzone", SYSCTL_DESCR("Whether to use the default scope zones"), NULL, 0, &ip6_use_defzone, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_USE_DEFAULTZONE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "mcast_pmtu", SYSCTL_DESCR("Enable pMTU discovery for multicast packet"), NULL, 0, &ip6_mcast_pmtu, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_CREATE, CTL_EOL); /* anonportalgo RFC6056 subtree */ const struct sysctlnode *portalgo_node; sysctl_createv(clog, 0, NULL, &portalgo_node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "anonportalgo", SYSCTL_DESCR("Anonymous port algorithm selection (RFC 6056)"), NULL, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &portalgo_node, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "available", SYSCTL_DESCR("available algorithms"), sysctl_portalgo_available, 0, NULL, PORTALGO_MAXLEN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &portalgo_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRING, "selected", SYSCTL_DESCR("selected algorithm"), sysctl_portalgo_selected6, 0, NULL, PORTALGO_MAXLEN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &portalgo_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRUCT, "reserve", SYSCTL_DESCR("bitmap of reserved ports"), sysctl_portalgo_reserve6, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, 
NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "neighborgcthresh", SYSCTL_DESCR("Maximum number of entries in neighbor" " cache"), NULL, 1, &ip6_neighborgcthresh, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxdynroutes", SYSCTL_DESCR("Maximum number of routes created via" " redirect"), NULL, 1, &ip6_maxdynroutes, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "param_rt_msg", SYSCTL_DESCR("How to send parameter changing" " routing message"), NULL, 0, &ip6_param_rt_msg, 0, CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_CREATE, CTL_EOL); } void ip6_statinc(u_int stat) { KASSERT(stat < IP6_NSTATS); IP6_STATINC(stat); }
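/*
 * Illustrative userland sketch, not part of the kernel source above: it
 * mirrors the extension-header length arithmetic used by ip6_nexthdr() and
 * ip6_pullexthdr(), where a generic extension header occupies
 * (ip6e_len + 1) * 8 bytes while an AH header occupies (ip6e_len + 2) * 4
 * bytes.  The sample header values are made up.
 */
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <stdio.h>

static size_t
exthdr_span(int proto, const struct ip6_ext *ip6e)
{

	if (proto == IPPROTO_AH)
		return ((size_t)ip6e->ip6e_len + 2) << 2;
	return ((size_t)ip6e->ip6e_len + 1) << 3;
}

int
main(void)
{
	/* A hop-by-hop header with ip6e_len == 0 spans 8 bytes. */
	struct ip6_ext hbh = { .ip6e_nxt = IPPROTO_TCP, .ip6e_len = 0 };
	/* An AH header with a length field of 4 spans (4 + 2) * 4 = 24 bytes. */
	struct ip6_ext ah = { .ip6e_nxt = IPPROTO_TCP, .ip6e_len = 4 };

	printf("hop-by-hop header spans %zu bytes\n",
	    exthdr_span(IPPROTO_HOPOPTS, &hbh));
	printf("AH header spans %zu bytes\n", exthdr_span(IPPROTO_AH, &ah));
	return 0;
}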
/* $NetBSD: tmpfs.h,v 1.56 2020/05/17 19:39:15 ad Exp $ */ /* * Copyright (c) 2005, 2006, 2007, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _FS_TMPFS_TMPFS_H_ #define _FS_TMPFS_TMPFS_H_ #if !defined(_KERNEL) && !defined(_KMEMUSER) #error "not supposed to be exposed to userland" #endif #include <sys/dirent.h> #include <sys/mount.h> #include <sys/pool.h> #include <sys/queue.h> #include <sys/vnode.h> /* * Internal representation of a tmpfs directory entry. * * All fields are protected by vnode lock. */ typedef struct tmpfs_dirent { TAILQ_ENTRY(tmpfs_dirent) td_entries; /* Pointer to the inode this entry refers to. */ struct tmpfs_node * td_node; /* Sequence number, see tmpfs_dir_getseq(). */ uint32_t td_seq; /* Name and its length.
*/ char * td_name; uint16_t td_namelen; } tmpfs_dirent_t; TAILQ_HEAD(tmpfs_dir, tmpfs_dirent); /* * Internal representation of a tmpfs file system node -- inode. * * This structure is split in two parts: one holds attributes common * to all file types and the other holds data that is only applicable to * a particular type. * * All fields are protected by vnode lock. The vnode association itself * is protected by vcache. */ typedef struct tmpfs_node { LIST_ENTRY(tmpfs_node) tn_entries; /* * Each inode has a corresponding vnode. It is a bi-directional * association. Whenever vnode is allocated, its v_data field is * set to the inode it reference, and tmpfs_node_t::tn_vnode is * set to point to the said vnode. * * Further attempts to allocate a vnode for this same node will * result in returning a new reference to the value stored in * tn_vnode. It may be NULL when the node is unused (that is, * no vnode has been allocated or it has been reclaimed). */ vnode_t * tn_vnode; /* Prevent node from being reclaimed. */ uint32_t tn_holdcount; /* Directory entry. Only a hint, since hard link can have multiple. */ tmpfs_dirent_t * tn_dirent_hint; /* The inode type: VBLK, VCHR, VDIR, VFIFO, VLNK, VREG or VSOCK. */ enum vtype tn_type; /* Inode identifier and generation number. */ ino_t tn_id; uint32_t tn_gen; /* The inode size. */ off_t tn_size; /* Generic node attributes. */ uid_t tn_uid; gid_t tn_gid; mode_t tn_mode; int tn_flags; nlink_t tn_links; unsigned tn_tflags; struct timespec tn_atime; struct timespec tn_mtime; struct timespec tn_ctime; struct timespec tn_birthtime; kmutex_t tn_timelock; /* Head of byte-level lock list (used by tmpfs_advlock). */ struct lockf * tn_lockf; union { /* Type case: VBLK or VCHR. */ struct { dev_t tn_rdev; } tn_dev; /* Type case: VDIR. */ struct { /* Parent directory (root inode points to itself). */ struct tmpfs_node * tn_parent; /* List of directory entries. */ struct tmpfs_dir tn_dir; /* Last given sequence number and their arena. */ uint32_t tn_next_seq; void * tn_seq_arena; /* * Pointer of the last directory entry returned * by the readdir(3) operation. */ struct tmpfs_dirent * tn_readdir_lastp; } tn_dir; /* Type case: VLNK. */ struct tn_lnk { /* The link's target. */ char * tn_link; } tn_lnk; /* Type case: VREG. */ struct tn_reg { /* Underlying UVM object to store contents. */ struct uvm_object * tn_aobj; size_t tn_aobj_pages; } tn_reg; } tn_spec; } tmpfs_node_t; #if defined(_KERNEL) VFS_PROTOS(tmpfs); LIST_HEAD(tmpfs_node_list, tmpfs_node); #define TMPFS_MAXNAMLEN 255 /* Validate maximum td_namelen length. */ CTASSERT(TMPFS_MAXNAMLEN < UINT16_MAX); /* * Reserved values for the virtual entries (the first must be 0) and EOF. * The start/end of the incremental range, see tmpfs_dir_getseq(). */ #define TMPFS_DIRSEQ_DOT 0 #define TMPFS_DIRSEQ_DOTDOT 1 #define TMPFS_DIRSEQ_EOF 2 #define TMPFS_DIRSEQ_START 3 /* inclusive */ #define TMPFS_DIRSEQ_END (1U << 30) /* exclusive */ /* Mark to indicate that the number is not set. */ #define TMPFS_DIRSEQ_NONE (1U << 31) /* Flags: time update requests. */ #define TMPFS_UPDATE_ATIME 0x01 #define TMPFS_UPDATE_MTIME 0x02 #define TMPFS_UPDATE_CTIME 0x04 /* * Bits indicating whiteout use for the directory. * We abuse tmpfs_node_t::tn_gen for that. */ #define TMPFS_WHITEOUT_BIT (1U << 31) #define TMPFS_NODE_GEN_MASK (TMPFS_WHITEOUT_BIT - 1) #define TMPFS_NODE_GEN(node) \ ((node)->tn_gen & TMPFS_NODE_GEN_MASK) /* White-out inode indicator. 
*/ #define TMPFS_NODE_WHITEOUT ((tmpfs_node_t *)-1) /* * Bit indicating this node must be reclaimed when holdcount reaches zero. * Ored into tmpfs_node_t::tn_holdcount. */ #define TMPFS_NODE_RECLAIMED (1U << 30) /* * Internal representation of a tmpfs mount point. */ typedef struct tmpfs_mount { /* Limit and number of bytes in use by the file system. */ uint64_t tm_mem_limit; uint64_t tm_bytes_used; kmutex_t tm_acc_lock; /* Pointer to the root inode. */ tmpfs_node_t * tm_root; /* Maximum number of possible nodes for this file system. */ unsigned int tm_nodes_max; /* Number of nodes currently allocated. */ unsigned int tm_nodes_cnt; /* List of inodes and the lock protecting it. */ kmutex_t tm_lock; struct tmpfs_node_list tm_nodes; } tmpfs_mount_t; /* * This structure maps a file identifier to a tmpfs node. Used by the * NFS code. */ typedef struct tmpfs_fid { uint16_t tf_len; uint16_t tf_pad; uint32_t tf_gen; ino_t tf_id; } tmpfs_fid_t; /* * Prototypes for tmpfs_subr.c. */ void tmpfs_free_node(tmpfs_mount_t *, tmpfs_node_t *); int tmpfs_construct_node(vnode_t *, vnode_t **, struct vattr *, struct componentname *, char *); int tmpfs_alloc_dirent(tmpfs_mount_t *, const char *, uint16_t, tmpfs_dirent_t **); void tmpfs_free_dirent(tmpfs_mount_t *, tmpfs_dirent_t *); void tmpfs_dir_attach(tmpfs_node_t *, tmpfs_dirent_t *, tmpfs_node_t *); void tmpfs_dir_detach(tmpfs_node_t *, tmpfs_dirent_t *); tmpfs_dirent_t *tmpfs_dir_lookup(tmpfs_node_t *, struct componentname *); tmpfs_dirent_t *tmpfs_dir_cached(tmpfs_node_t *); uint32_t tmpfs_dir_getseq(tmpfs_node_t *, tmpfs_dirent_t *); tmpfs_dirent_t *tmpfs_dir_lookupbyseq(tmpfs_node_t *, off_t); int tmpfs_dir_getdents(tmpfs_node_t *, struct uio *, off_t *); int tmpfs_reg_resize(vnode_t *, off_t); int tmpfs_chflags(vnode_t *, int, kauth_cred_t, lwp_t *); int tmpfs_chmod(vnode_t *, mode_t, kauth_cred_t, lwp_t *); int tmpfs_chown(vnode_t *, uid_t, gid_t, kauth_cred_t, lwp_t *); int tmpfs_chsize(vnode_t *, u_quad_t, kauth_cred_t, lwp_t *); int tmpfs_chtimes(vnode_t *, const struct timespec *, const struct timespec *, const struct timespec *, int, kauth_cred_t, lwp_t *); void tmpfs_update(vnode_t *, unsigned); void tmpfs_update_locked(vnode_t *, unsigned); void tmpfs_update_lazily(vnode_t *, unsigned); /* * Prototypes for tmpfs_mem.c. */ void tmpfs_mntmem_init(tmpfs_mount_t *, uint64_t); void tmpfs_mntmem_destroy(tmpfs_mount_t *); int tmpfs_mntmem_set(tmpfs_mount_t *, uint64_t); size_t tmpfs_mem_info(bool); uint64_t tmpfs_bytes_max(tmpfs_mount_t *); size_t tmpfs_pages_avail(tmpfs_mount_t *); bool tmpfs_mem_incr(tmpfs_mount_t *, size_t); void tmpfs_mem_decr(tmpfs_mount_t *, size_t); tmpfs_dirent_t *tmpfs_dirent_get(tmpfs_mount_t *); void tmpfs_dirent_put(tmpfs_mount_t *, tmpfs_dirent_t *); tmpfs_node_t * tmpfs_node_get(tmpfs_mount_t *); void tmpfs_node_put(tmpfs_mount_t *, tmpfs_node_t *); char * tmpfs_strname_alloc(tmpfs_mount_t *, size_t); void tmpfs_strname_free(tmpfs_mount_t *, char *, size_t); bool tmpfs_strname_neqlen(struct componentname *, struct componentname *); /* * Ensures that the node pointed by 'node' is a directory and that its * contents are consistent with respect to directories. */ #define TMPFS_VALIDATE_DIR(node) \ KASSERT((node)->tn_vnode == NULL || VOP_ISLOCKED((node)->tn_vnode)); \ KASSERT((node)->tn_type == VDIR); \ KASSERT((node)->tn_size % sizeof(tmpfs_dirent_t) == 0); /* * Routines to convert VFS structures to tmpfs internal ones. 
*/ static __inline tmpfs_mount_t * VFS_TO_TMPFS(struct mount *mp) { tmpfs_mount_t *tmp = mp->mnt_data; KASSERT(tmp != NULL); return tmp; } static __inline tmpfs_node_t * VP_TO_TMPFS_DIR(vnode_t *vp) { tmpfs_node_t *node = vp->v_data; KASSERT(node != NULL); TMPFS_VALIDATE_DIR(node); return node; } #endif /* defined(_KERNEL) */ static __inline tmpfs_node_t * VP_TO_TMPFS_NODE(vnode_t *vp) { tmpfs_node_t *node = vp->v_data; #ifdef KASSERT KASSERT(node != NULL); #endif return node; } #endif /* _FS_TMPFS_TMPFS_H_ */
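/*
 * Illustrative userland sketch, not part of tmpfs.h: it demonstrates the
 * generation/whiteout encoding defined above, where the top bit of
 * tmpfs_node_t::tn_gen records whiteout use and the remaining 31 bits hold
 * the generation number.  The constants repeat the header's values; the
 * sample tn_gen value is made up.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_WHITEOUT_BIT	(1U << 31)			/* TMPFS_WHITEOUT_BIT */
#define SKETCH_GEN_MASK		(SKETCH_WHITEOUT_BIT - 1)	/* TMPFS_NODE_GEN_MASK */

int
main(void)
{
	uint32_t tn_gen = SKETCH_WHITEOUT_BIT | 42;	/* whiteouts in use, generation 42 */

	printf("generation %u, whiteouts %s\n",
	    (unsigned)(tn_gen & SKETCH_GEN_MASK),
	    (tn_gen & SKETCH_WHITEOUT_BIT) ? "in use" : "not in use");
	return 0;
}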
/* $NetBSD: umap_vfsops.c,v 1.104 2022/11/04 11:20:40 hannken Exp $ */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)null_vfsops.c 1.5 (Berkeley) 7/10/92 * @(#)umap_vfsops.c 8.8 (Berkeley) 5/14/95 */ /* * Umap Layer * (See mount_umap(8) for a description of this layer.)
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: umap_vfsops.c,v 1.104 2022/11/04 11:20:40 hannken Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/proc.h> #include <sys/time.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/syslog.h> #include <sys/kauth.h> #include <sys/module.h> #include <miscfs/umapfs/umap.h> #include <miscfs/genfs/layer_extern.h> MODULE(MODULE_CLASS_VFS, umap, "layerfs"); VFS_PROTOS(umapfs); /* * Mount umap layer */ int umapfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; struct pathbuf *pb; struct nameidata nd; struct umap_args *args = data; struct vnode *lowerrootvp, *vp; struct umap_mount *amp; int error; #ifdef UMAPFS_DIAGNOSTIC int i; #endif fsid_t tfsid; if (args == NULL) return EINVAL; if (*data_len < sizeof *args) { #ifdef UMAPFS_DIAGNOSTIC printf("mount_umap: data len %d < args %d\n", (int)*data_len, (int)(sizeof *args)); #endif return EINVAL; } if (mp->mnt_flag & MNT_GETARGS) { amp = MOUNTTOUMAPMOUNT(mp); if (amp == NULL) return EIO; args->la.target = NULL; args->nentries = amp->info_nentries; args->gnentries = amp->info_gnentries; *data_len = sizeof *args; return 0; } /* only for root */ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_UMAP, NULL, NULL, NULL); if (error) return error; #ifdef UMAPFS_DIAGNOSTIC printf("umapfs_mount(mp = %p)\n", mp); #endif /* * Update is not supported */ if (mp->mnt_flag & MNT_UPDATE) return EOPNOTSUPP; /* * Find lower node */ error = pathbuf_copyin(args->umap_target, &pb); if (error) { return error; } NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, pb); if ((error = namei(&nd)) != 0) { pathbuf_destroy(pb); return error; } /* * Sanity check on lower vnode */ lowerrootvp = nd.ni_vp; pathbuf_destroy(pb); #ifdef UMAPFS_DIAGNOSTIC printf("vp = %p, check for VDIR...\n", lowerrootvp); #endif if (lowerrootvp->v_type != VDIR) { vput(lowerrootvp); return (EINVAL); } #ifdef UMAPFS_DIAGNOSTIC printf("mp = %p\n", mp); #endif amp = kmem_zalloc(sizeof(struct umap_mount), KM_SLEEP); mp->mnt_data = amp; /* * Now copy in the number of entries and maps for umap mapping. */ if (args->nentries < 0 || args->nentries > MAPFILEENTRIES || args->gnentries < 0 || args->gnentries > GMAPFILEENTRIES) { vput(lowerrootvp); return (EINVAL); } amp->info_nentries = args->nentries; amp->info_gnentries = args->gnentries; error = copyin(args->mapdata, amp->info_mapdata, 2*sizeof(u_long)*args->nentries); if (error) { vput(lowerrootvp); return (error); } #ifdef UMAPFS_DIAGNOSTIC printf("umap_mount:nentries %d\n",args->nentries); for (i = 0; i < args->nentries; i++) printf(" %ld maps to %ld\n", amp->info_mapdata[i][0], amp->info_mapdata[i][1]); #endif error = copyin(args->gmapdata, amp->info_gmapdata, 2*sizeof(u_long)*args->gnentries); if (error) { vput(lowerrootvp); return (error); } #ifdef UMAPFS_DIAGNOSTIC printf("umap_mount:gnentries %d\n",args->gnentries); for (i = 0; i < args->gnentries; i++) printf("\tgroup %ld maps to %ld\n", amp->info_gmapdata[i][0], amp->info_gmapdata[i][1]); #endif /* * Make sure the mount point's sufficiently initialized * that the node create call will work. 
*/ tfsid.__fsid_val[0] = (int32_t)args->fsid; tfsid.__fsid_val[1] = makefstype(MOUNT_UMAP); if (tfsid.__fsid_val[0] == 0) { log(LOG_WARNING, "umapfs: fsid given as 0, ignoring\n"); vfs_getnewfsid(mp); } else if (vfs_getvfs(&tfsid)) { log(LOG_WARNING, "umapfs: fsid %x already mounted\n", tfsid.__fsid_val[0]); vfs_getnewfsid(mp); } else { mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0]; mp->mnt_stat.f_fsidx.__fsid_val[1] = tfsid.__fsid_val[1]; mp->mnt_stat.f_fsid = tfsid.__fsid_val[0]; } log(LOG_DEBUG, "umapfs: using fsid %x/%x\n", mp->mnt_stat.f_fsidx.__fsid_val[0], mp->mnt_stat.f_fsidx.__fsid_val[1]); error = vfs_set_lowermount(mp, lowerrootvp->v_mount); if (error) { vput(lowerrootvp); kmem_free(amp, sizeof(struct umap_mount)); return error; } amp->umapm_size = sizeof(struct umap_node); amp->umapm_tag = VT_UMAP; amp->umapm_bypass = umap_bypass; amp->umapm_vnodeop_p = umap_vnodeop_p; /* * fix up umap node for root vnode. */ VOP_UNLOCK(lowerrootvp); error = layer_node_create(mp, lowerrootvp, &vp); /* * Make sure the node alias worked */ if (error) { vrele(lowerrootvp); kmem_free(amp, sizeof(struct umap_mount)); return error; } /* * Keep a held reference to the root vnode. * It is vrele'd in umapfs_unmount. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_ROOT; amp->umapm_rootvp = vp; VOP_UNLOCK(vp); error = set_statvfs_info(path, UIO_USERSPACE, args->umap_target, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); if (error) return error; if (mp->mnt_lower->mnt_flag & MNT_LOCAL) mp->mnt_flag |= MNT_LOCAL; #ifdef UMAPFS_DIAGNOSTIC printf("umapfs_mount: lower %s, alias at %s\n", mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); #endif return 0; } /* * Free reference to umap layer */ int umapfs_unmount(struct mount *mp, int mntflags) { struct umap_mount *amp = MOUNTTOUMAPMOUNT(mp); struct vnode *rtvp = amp->umapm_rootvp; int error; int flags = 0; #ifdef UMAPFS_DIAGNOSTIC printf("umapfs_unmount(mp = %p)\n", mp); #endif if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if (vrefcnt(rtvp) > 1 && (mntflags & MNT_FORCE) == 0) return (EBUSY); if ((error = vflush(mp, rtvp, flags)) != 0) return (error); #ifdef UMAPFS_DIAGNOSTIC vprint("alias root of lower", rtvp); #endif /* * Blow it away for future re-use */ vgone(rtvp); /* * Finally, throw away the umap_mount structure */ kmem_free(amp, sizeof(struct umap_mount)); mp->mnt_data = NULL; return 0; } extern const struct vnodeopv_desc umapfs_vnodeop_opv_desc; const struct vnodeopv_desc * const umapfs_vnodeopv_descs[] = { &umapfs_vnodeop_opv_desc, NULL, }; struct vfsops umapfs_vfsops = { .vfs_name = MOUNT_UMAP, .vfs_min_mount_data = sizeof (struct umap_args), .vfs_mount = umapfs_mount, .vfs_start = layerfs_start, .vfs_unmount = umapfs_unmount, .vfs_root = layerfs_root, .vfs_quotactl = layerfs_quotactl, .vfs_statvfs = layerfs_statvfs, .vfs_sync = layerfs_sync, .vfs_loadvnode = layerfs_loadvnode, .vfs_vget = layerfs_vget, .vfs_fhtovp = layerfs_fhtovp, .vfs_vptofh = layerfs_vptofh, .vfs_init = layerfs_init, .vfs_done = layerfs_done, .vfs_snapshot = layerfs_snapshot, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = layerfs_suspendctl, .vfs_renamelock_enter = layerfs_renamelock_enter, .vfs_renamelock_exit = layerfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = umapfs_vnodeopv_descs }; SYSCTL_SETUP(umapfs_sysctl_setup, "umapfs sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "umap", SYSCTL_DESCR("UID/GID remapping file system"), NULL, 0, NULL, 0, CTL_VFS, 10, CTL_EOL); /* * XXX the "10" above 
could be dynamic, thereby eliminating * one more instance of the "number to vfs" mapping problem, * but "10" is the order as taken from sys/mount.h */ } static int umap_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&umapfs_vfsops); if (error != 0) break; break; case MODULE_CMD_FINI: error = vfs_detach(&umapfs_vfsops); if (error != 0) break; break; default: error = ENOTTY; break; } return (error); }
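/*
 * Illustrative userland sketch, not part of umap_vfsops.c: it shows the
 * size computation umapfs_mount() uses when it copies the uid mapping table
 * in from userland -- each of the nentries rows holds two u_longs, the id
 * on the lower layer and the id it is mapped to.  The entries are made up.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long mapdata[2][2] = {
		{ 1000, 2000 },		/* lower uid 1000 appears as uid 2000 */
		{ 1001, 2001 },		/* lower uid 1001 appears as uid 2001 */
	};
	int nentries = 2;

	printf("copyin size: %zu bytes\n",
	    2 * sizeof(unsigned long) * nentries);
	printf("first entry: %lu -> %lu\n", mapdata[0][0], mapdata[0][1]);
	return 0;
}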
/* $NetBSD: cd9660_vfsops.c,v 1.103 2024/02/03 22:39:27 christos Exp $ */ /*- * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)cd9660_vfsops.c 8.18 (Berkeley) 5/22/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: cd9660_vfsops.c,v 1.103 2024/02/03 22:39:27 christos Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/proc.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <sys/mount.h> #include <sys/buf.h> #include <sys/file.h> #include <sys/disklabel.h> #include <sys/device.h> #include <sys/ioctl.h> #include <sys/cdio.h> #include <sys/errno.h> #include <sys/malloc.h> #include <sys/pool.h> #include <sys/stat.h> #include <sys/conf.h> #include <sys/dirent.h> #include <sys/kauth.h> #include <sys/module.h> #include <fs/cd9660/iso.h> #include <fs/cd9660/cd9660_extern.h> #include <fs/cd9660/iso_rrip.h> #include <fs/cd9660/cd9660_node.h> #include <fs/cd9660/cd9660_mount.h> MODULE(MODULE_CLASS_VFS, cd9660, NULL); MALLOC_JUSTDEFINE(M_ISOFSMNT, "ISOFS mount", "ISOFS mount structure"); extern const struct vnodeopv_desc cd9660_vnodeop_opv_desc; extern const struct vnodeopv_desc cd9660_specop_opv_desc; extern const struct vnodeopv_desc cd9660_fifoop_opv_desc; const struct vnodeopv_desc * const cd9660_vnodeopv_descs[] = { &cd9660_vnodeop_opv_desc, &cd9660_specop_opv_desc, &cd9660_fifoop_opv_desc, NULL, }; struct vfsops cd9660_vfsops = { .vfs_name = MOUNT_CD9660, .vfs_min_mount_data = sizeof (struct iso_args), .vfs_mount = cd9660_mount, .vfs_start = cd9660_start, .vfs_unmount = cd9660_unmount, .vfs_root = cd9660_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = cd9660_statvfs, .vfs_sync = cd9660_sync, .vfs_vget = cd9660_vget, .vfs_loadvnode = cd9660_loadvnode, .vfs_fhtovp = cd9660_fhtovp, .vfs_vptofh = cd9660_vptofh, .vfs_init = cd9660_init, .vfs_reinit = cd9660_reinit, .vfs_done = cd9660_done, .vfs_mountroot = cd9660_mountroot, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = cd9660_vnodeopv_descs }; static const struct genfs_ops cd9660_genfsops = { .gop_size = genfs_size, }; /* * Called by vfs_mountroot when iso is going to be mounted as root. * * Name is updated by mount(8) after booting. 
*/ static int iso_makemp(struct iso_mnt *isomp, struct buf *bp, int *ea_len); static int iso_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l, struct iso_args *argp); SYSCTL_SETUP(cd9660_sysctl_setup, "cd9660 sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "cd9660", SYSCTL_DESCR("ISO-9660 file system"), NULL, 0, NULL, 0, CTL_VFS, 14, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "utf8_joliet", SYSCTL_DESCR("Encode Joliet filenames to UTF-8"), NULL, 0, &cd9660_utf8_joliet, 0, CTL_VFS, 14, CD9660_UTF8_JOLIET, CTL_EOL); /* * XXX the "14" above could be dynamic, thereby eliminating * one more instance of the "number to vfs" mapping problem, * but "14" is the order as taken from sys/mount.h */ } static int cd9660_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&cd9660_vfsops); if (error != 0) break; break; case MODULE_CMD_FINI: error = vfs_detach(&cd9660_vfsops); if (error != 0) break; break; default: error = ENOTTY; break; } return (error); } /* Compat with pre uid/gid/fsize/dsize mount call */ #define OSIZE sizeof(struct { \ const char *fspec; \ struct export_args30 _pad1; \ int flags; \ }) static int iso_checkupdate(const struct vnode *devvp, const struct iso_mnt *imp, const struct iso_args *args) { if (devvp != imp->im_devvp && devvp->v_rdev != imp->im_devvp->v_rdev) return EINVAL; if (((imp->im_flags & ISOFSMNT_UID) && args->uid != imp->im_uid) || ((imp->im_flags & ISOFSMNT_GID) && args->gid != imp->im_gid) || args->fmask != imp->im_fmask || args->dmask != imp->im_dmask) return EPERM; return 0; } static void iso_copyidmask(struct iso_args *args, const struct iso_mnt *imp) { if (imp == NULL) { args->uid = args->gid = 0; args->fmask = args->dmask = S_IRWXU|S_IRWXG|S_IRWXO; return; } args->uid = imp->im_uid; args->gid = imp->im_gid; args->fmask = imp->im_fmask; args->dmask = imp->im_dmask; } int cd9660_mountroot(void) { struct mount *mp; struct lwp *l = curlwp; int error; struct iso_args args; if (device_class(root_device) != DV_DISK) return (ENODEV); if ((error = vfs_rootmountalloc(MOUNT_CD9660, "root_device", &mp)) != 0) { vrele(rootvp); return (error); } args.flags = ISOFSMNT_ROOT; iso_copyidmask(&args, NULL); if ((error = iso_mountfs(rootvp, mp, l, &args)) != 0) { vfs_unbusy(mp); vfs_rele(mp); return (error); } mountlist_append(mp); (void)cd9660_statvfs(mp, &mp->mnt_stat); vfs_unbusy(mp); return (0); } /* * VFS Operations. * * mount system call */ int cd9660_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; struct vnode *devvp; struct iso_args aa, *args = data; int error; struct iso_mnt *imp = VFSTOISOFS(mp); if (args == NULL) return EINVAL; if (*data_len != OSIZE && *data_len < sizeof(*args)) return EINVAL; if (mp->mnt_flag & MNT_GETARGS) { if (imp == NULL) return EIO; args->fspec = NULL; args->flags = imp->im_flags; if (*data_len == OSIZE) return 0; iso_copyidmask(args, imp); *data_len = sizeof(*args); return 0; } if (*data_len == OSIZE) { memcpy(&aa, args, OSIZE); args = &aa; iso_copyidmask(args, (mp->mnt_flag & MNT_UPDATE) ? imp : NULL); } if ((mp->mnt_flag & MNT_RDONLY) == 0) return EROFS; if ((mp->mnt_flag & MNT_UPDATE) && args->fspec == NULL) return EINVAL; /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. 
*/ error = namei_simple_user(args->fspec, NSM_FOLLOW_NOEMULROOT, &devvp); if (error != 0) return error; if (devvp->v_type != VBLK) { vrele(devvp); return ENOTBLK; } if (bdevsw_lookup(devvp->v_rdev) == NULL) { vrele(devvp); return ENXIO; } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_DEVICE, mp, devvp, KAUTH_ARG(VREAD)); if (error) { goto fail; } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = VOP_OPEN(devvp, FREAD, FSCRED); if (error) goto fail; VOP_UNLOCK(devvp); error = iso_mountfs(devvp, mp, l, args); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (error) { (void)VOP_CLOSE(devvp, FREAD, NOCRED); goto fail; } VOP_UNLOCK(devvp); /* reference to devvp is donated through iso_mountfs */ } else { if ((error = iso_checkupdate(devvp, imp, args)) != 0) goto fail; VOP_UNLOCK(devvp); vrele(devvp); } return set_statvfs_info(path, UIO_USERSPACE, args->fspec, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); fail: VOP_UNLOCK(devvp); vrele(devvp); return error; } /* * Make a mount point from a volume descriptor */ static int iso_makemp(struct iso_mnt *isomp, struct buf *bp, int *ea_len) { struct iso_primary_descriptor *pri; int logical_block_size; struct iso_directory_record *rootp; pri = (struct iso_primary_descriptor *)bp->b_data; logical_block_size = isonum_723 (pri->logical_block_size); if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE || (logical_block_size & (logical_block_size - 1)) != 0) return -1; rootp = (struct iso_directory_record *)pri->root_directory_record; isomp->logical_block_size = logical_block_size; isomp->volume_space_size = isonum_733 (pri->volume_space_size); memcpy(isomp->root, rootp, sizeof(isomp->root)); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); isomp->im_joliet_level = 0; isomp->im_bmask = logical_block_size - 1; isomp->im_bshift = 0; while ((1 << isomp->im_bshift) < isomp->logical_block_size) isomp->im_bshift++; if (ea_len != NULL) *ea_len = isonum_711(rootp->ext_attr_length); return 0; } /* * Common code for mount and mountroot */ static int iso_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l, struct iso_args *argp) { struct iso_mnt *isomp = (struct iso_mnt *)0; struct buf *bp = NULL, *pribp = NULL, *supbp = NULL; dev_t dev = devvp->v_rdev; int error = EINVAL; int ronly = (mp->mnt_flag & MNT_RDONLY) != 0; int iso_bsize; int iso_blknum; int joliet_level; struct iso_volume_descriptor *vdp; struct iso_supplementary_descriptor *sup; int sess = 0; int ext_attr_length; struct disklabel label; if (!ronly) return EROFS; /* Flush out any old buffers remaining from a previous use. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = vinvalbuf(devvp, V_SAVE, l->l_cred, l, 0, 0); VOP_UNLOCK(devvp); if (error != 0) return (error); /* This is the "logical sector size". The standard says this * should be 2048 or the physical sector size on the device, * whichever is greater. For now, we'll just use a constant. */ iso_bsize = ISO_DEFAULT_BLOCK_SIZE; error = VOP_IOCTL(devvp, DIOCGDINFO, &label, FREAD, FSCRED); if (!error) { /* XXX more sanity checks? 
*/ sess = label.d_partitions[DISKPART(dev)].p_cdsession; } else { /* fallback to old method */ error = VOP_IOCTL(devvp, CDIOREADMSADDR, &sess, 0, FSCRED); if (error) sess = 0; /* never mind */ } #ifdef ISO_DEBUG printf("isofs: session offset (part %"PRId32") %d\n", DISKPART(dev), sess); #endif for (iso_blknum = 16; iso_blknum < 100; iso_blknum++) { if ((error = bread(devvp, (iso_blknum+sess) * btodb(iso_bsize), iso_bsize, 0, &bp)) != 0) goto out; vdp = (struct iso_volume_descriptor *)bp->b_data; if (memcmp(vdp->id, ISO_STANDARD_ID, sizeof(vdp->id)) != 0) { error = EINVAL; goto out; } switch (isonum_711(vdp->type)) { case ISO_VD_PRIMARY: if (pribp == NULL) { pribp = bp; bp = NULL; } break; case ISO_VD_SUPPLEMENTARY: if (supbp == NULL) { supbp = bp; bp = NULL; } break; default: break; } if (isonum_711 (vdp->type) == ISO_VD_END) { brelse(bp, 0); bp = NULL; break; } if (bp != NULL) { brelse(bp, 0); bp = NULL; } } if (pribp == NULL) { error = EINVAL; goto out; } isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK); memset(isomp, 0, sizeof *isomp); if (iso_makemp(isomp, pribp, &ext_attr_length) == -1) { error = EINVAL; goto out; } isomp->volume_space_size += sess; brelse(pribp, BC_AGE); pribp = NULL; mp->mnt_data = isomp; mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_CD9660); mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; mp->mnt_stat.f_namemax = ISO_MAXNAMLEN; mp->mnt_flag |= MNT_LOCAL; mp->mnt_iflag |= IMNT_MPSAFE | IMNT_SHRLOOKUP; mp->mnt_dev_bshift = iso_bsize; mp->mnt_fs_bshift = isomp->im_bshift; isomp->im_mountp = mp; isomp->im_dev = dev; isomp->im_devvp = devvp; if (argp->flags & ISOFSMNT_UID) isomp->im_uid = argp->uid; if (argp->flags & ISOFSMNT_GID) isomp->im_gid = argp->gid; isomp->im_fmask = argp->fmask & ACCESSPERMS; isomp->im_dmask = argp->dmask & ACCESSPERMS; /* Check the Rock Ridge Extension support */ if (!(argp->flags & ISOFSMNT_NORRIP)) { struct iso_directory_record *rootp; if ((error = bread(isomp->im_devvp, (isomp->root_extent + ext_attr_length) << (isomp->im_bshift - DEV_BSHIFT), isomp->logical_block_size, 0, &bp)) != 0) goto out; rootp = (struct iso_directory_record *)bp->b_data; if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) { argp->flags |= ISOFSMNT_NORRIP; } else { argp->flags &= ~ISOFSMNT_GENS; } /* * The contents are valid, * but they will get reread as part of another vnode, so... 
*/ brelse(bp, BC_AGE); bp = NULL; } isomp->im_flags = argp->flags & (ISOFSMNT_NORRIP | ISOFSMNT_GENS | ISOFSMNT_EXTATT | ISOFSMNT_NOJOLIET | ISOFSMNT_RRCASEINS | ISOFSMNT_UID | ISOFSMNT_GID); if (isomp->im_flags & ISOFSMNT_GENS) isomp->iso_ftype = ISO_FTYPE_9660; else if (isomp->im_flags & ISOFSMNT_NORRIP) { isomp->iso_ftype = ISO_FTYPE_DEFAULT; if (argp->flags & ISOFSMNT_NOCASETRANS) isomp->im_flags |= ISOFSMNT_NOCASETRANS; } else isomp->iso_ftype = ISO_FTYPE_RRIP; /* Check the Joliet Extension support */ if ((argp->flags & ISOFSMNT_NORRIP) != 0 && (argp->flags & ISOFSMNT_NOJOLIET) == 0 && supbp != NULL) { joliet_level = 0; sup = (struct iso_supplementary_descriptor *)supbp->b_data; if ((isonum_711(sup->flags) & 1) == 0) { if (memcmp(sup->escape, "%/@", 3) == 0) joliet_level = 1; if (memcmp(sup->escape, "%/C", 3) == 0) joliet_level = 2; if (memcmp(sup->escape, "%/E", 3) == 0) joliet_level = 3; } if (joliet_level != 0) { if (iso_makemp(isomp, supbp, NULL) == -1) { error = EINVAL; goto out; } isomp->im_joliet_level = joliet_level; } } if (supbp != NULL) { brelse(supbp, 0); supbp = NULL; } spec_node_setmountedfs(devvp, mp); return 0; out: if (bp) brelse(bp, 0); if (pribp) brelse(pribp, 0); if (supbp) brelse(supbp, 0); if (isomp) { free(isomp, M_ISOFSMNT); mp->mnt_data = NULL; } return error; } /* * Make a filesystem operational. * Nothing to do at the moment. */ /* ARGSUSED */ int cd9660_start(struct mount *mp, int flags) { return 0; } /* * unmount system call */ int cd9660_unmount(struct mount *mp, int mntflags) { struct iso_mnt *isomp; int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if ((error = vflush(mp, NULLVP, flags)) != 0) return (error); isomp = VFSTOISOFS(mp); if (isomp->im_devvp->v_type != VBAD) spec_node_setmountedfs(isomp->im_devvp, NULL); vn_lock(isomp->im_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_CLOSE(isomp->im_devvp, FREAD, NOCRED); vput(isomp->im_devvp); free(isomp, M_ISOFSMNT); mp->mnt_data = NULL; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Return root of a filesystem */ int cd9660_root(struct mount *mp, int lktype, struct vnode **vpp) { struct iso_mnt *imp = VFSTOISOFS(mp); struct iso_directory_record *dp = (struct iso_directory_record *)imp->root; ino_t ino = isodirino(dp, imp); return cd9660_vget(mp, ino, lktype, vpp); } /* * Get file system statistics. 
*/ int cd9660_statvfs(struct mount *mp, struct statvfs *sbp) { struct iso_mnt *isomp; isomp = VFSTOISOFS(mp); sbp->f_bsize = isomp->logical_block_size; sbp->f_frsize = sbp->f_bsize; sbp->f_iosize = sbp->f_bsize; /* XXX */ sbp->f_blocks = isomp->volume_space_size; sbp->f_bfree = 0; /* total free blocks */ sbp->f_bavail = 0; /* blocks free for non superuser */ sbp->f_bresvd = 0; /* total reserved blocks */ sbp->f_files = 0; /* total files */ sbp->f_ffree = 0; /* free file nodes */ sbp->f_favail = 0; /* free file nodes for non superuser */ sbp->f_fresvd = 0; /* reserved file nodes */ copy_statvfs_info(sbp, mp); /* Use the first spare for flags: */ sbp->f_spare[0] = isomp->im_flags; return 0; } /* ARGSUSED */ int cd9660_sync(struct mount *mp, int waitfor, kauth_cred_t cred) { return 0; } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is in range * - call iget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the generation number matches */ struct ifid { ushort ifid_len; ushort ifid_pad; ino_t ifid_ino; #ifdef ISOFS_DBG u_long ifid_start; #endif }; /* ARGSUSED */ int cd9660_fhtovp(struct mount *mp, struct fid *fhp, int lktype, struct vnode **vpp) { struct ifid ifh; struct iso_node *ip; struct vnode *nvp; int error; if (fhp->fid_len != sizeof(ifh)) return EINVAL; memcpy(&ifh, fhp, sizeof(ifh)); #ifdef ISOFS_DBG printf("fhtovp: ino %"PRIu64", start %lu\n", ifh.ifid_ino, ifh.ifid_start); #endif if ((error = VFS_VGET(mp, ifh.ifid_ino, lktype, &nvp)) != 0) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->inode.iso_mode == 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; return (0); } int cd9660_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { int error; error = vcache_get(mp, &ino, sizeof(ino), vpp); if (error) return error; error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULL; return error; } return 0; } int cd9660_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { struct iso_mnt *imp; struct iso_node *ip; struct iso_directory_record *isodir; struct buf *bp; dev_t dev; ino_t ino; int lbn, off; int error; KASSERT(key_len == sizeof(ino)); memcpy(&ino, key, key_len); imp = VFSTOISOFS(mp); dev = imp->im_dev; ip = pool_get(&cd9660_node_pool, PR_WAITOK); memset(ip, 0, sizeof(struct iso_node)); ip->i_vnode = vp; ip->i_dev = dev; ip->i_number = ino; ip->i_mnt = imp; ip->i_devvp = imp->im_devvp; lbn = cd9660_lblkno(imp, ino); if (lbn >= imp->volume_space_size) { pool_put(&cd9660_node_pool, ip); printf("fhtovp: lbn exceed volume space %d\n", lbn); return (ESTALE); } off = cd9660_blkoff(imp, ino); if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) { pool_put(&cd9660_node_pool, ip); printf("fhtovp: crosses block boundary %d\n", off + ISO_DIRECTORY_RECORD_SIZE); return (ESTALE); } error = bread(imp->im_devvp, lbn << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, 0, &bp); if (error) { pool_put(&cd9660_node_pool, ip); printf("fhtovp: bread error %d\n",error); return (error); } isodir = (struct iso_directory_record *)((char *)bp->b_data + off); if (off + isonum_711(isodir->length) > imp->logical_block_size) { pool_put(&cd9660_node_pool, ip); brelse(bp, 0); printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n", off +isonum_711(isodir->length), off, isonum_711(isodir->length)); return (ESTALE); } #if 0 if (isonum_733(isodir->extent) + 
isonum_711(isodir->ext_attr_length) != ifhp->ifid_start) { pool_put(&cd9660_node_pool, ip); if (bp != 0) brelse(bp, 0); printf("fhtovp: file start miss %d vs %d\n", isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length), ifhp->ifid_start); return (ESTALE); } #endif ip->iso_extent = isonum_733(isodir->extent); ip->i_size = isonum_733(isodir->size); ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent; vp->v_tag = VT_ISOFS; vp->v_op = cd9660_vnodeop_p; vp->v_data = ip; genfs_node_init(vp, &cd9660_genfsops); /* * Setup time stamp, attribute */ switch (imp->iso_ftype) { default: /* ISO_FTYPE_9660 */ { struct buf *bp2; if ((imp->im_flags & ISOFSMNT_EXTATT) && (off = isonum_711(isodir->ext_attr_length))) cd9660_blkatoff(vp, (off_t)-(off << imp->im_bshift), NULL, &bp2); else bp2 = NULL; cd9660_defattr(isodir, ip, bp2); cd9660_deftstamp(isodir, ip, bp2); if (bp2) brelse(bp2, 0); break; } case ISO_FTYPE_RRIP: cd9660_rrip_analyze(isodir, ip, imp); break; } brelse(bp, 0); /* * Initialize the associated vnode */ switch (vp->v_type = IFTOVT(ip->inode.iso_mode)) { case VFIFO: vp->v_op = cd9660_fifoop_p; break; case VCHR: case VBLK: /* * if device, look at device number table for translation */ vp->v_op = cd9660_specop_p; spec_node_init(vp, ip->inode.iso_rdev); break; case VLNK: case VNON: case VSOCK: case VDIR: case VBAD: break; case VREG: uvm_vnp_setsize(vp, ip->i_size); break; } if (vp->v_type != VREG) uvm_vnp_setsize(vp, 0); if (ip->iso_extent == imp->root_extent) vp->v_vflag |= VV_ROOT; /* * XXX need generation number? */ *new_key = &ip->i_number; return 0; } /* * Vnode pointer to File handle */ /* ARGSUSED */ int cd9660_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) { struct iso_node *ip = VTOI(vp); struct ifid ifh; if (*fh_size < sizeof(struct ifid)) { *fh_size = sizeof(struct ifid); return E2BIG; } *fh_size = sizeof(struct ifid); memset(&ifh, 0, sizeof(ifh)); ifh.ifid_len = sizeof(struct ifid); ifh.ifid_ino = ip->i_number; #ifdef ISOFS_DBG ifh.ifid_start = ip->iso_start; #endif memcpy(fhp, &ifh, sizeof(ifh)); #ifdef ISOFS_DBG printf("vptofh: ino %"PRIu64", start %lu\n", ifh.ifid_ino,ifh.ifid_start); #endif return 0; }
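/*
 * Illustrative sketch (not part of the original sources): a minimal
 * userland caller for the cd9660_mount() path above, roughly what
 * mount_cd9660(8) does.  struct iso_args is filled in the way the
 * kernel code consumes it (fspec plus ISOFSMNT_* flags), and the
 * mount is requested read-only because iso_mountfs() rejects
 * anything else with EROFS.  The header path for iso_args, the
 * exact field types, and the device path /dev/cd0a are assumptions;
 * verify them against <isofs/cd9660/cd9660_mount.h> on the target
 * system.  mount_cd9660_example() is a made-up helper name.
 */
#include <sys/types.h>
#include <sys/mount.h>

#include <isofs/cd9660/cd9660_mount.h>	/* struct iso_args, ISOFSMNT_* (assumed path) */

#include <err.h>
#include <string.h>

static int
mount_cd9660_example(char *dev, const char *dir)
{
	struct iso_args args;

	memset(&args, 0, sizeof(args));
	args.fspec = dev;		/* block device, e.g. /dev/cd0a */
	args.flags = ISOFSMNT_NORRIP;	/* ignore Rock Ridge extensions */

	/* cd9660 supports read-only mounts only; see iso_mountfs(). */
	if (mount(MOUNT_CD9660, dir, MNT_RDONLY, &args, sizeof(args)) == -1)
		err(1, "mount %s on %s", dev, dir);
	return 0;
}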
/* $NetBSD: dk.c,v 1.171 2023/05/22 15:00:17 riastradh Exp $ */ /*- * Copyright (c) 2004, 2005, 2006, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: dk.c,v 1.171 2023/05/22 15:00:17 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_dkwedge.h" #endif #include <sys/param.h> #include <sys/types.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/callout.h> #include <sys/conf.h> #include <sys/device.h> #include <sys/disk.h> #include <sys/disklabel.h> #include <sys/errno.h> #include <sys/fcntl.h> #include <sys/ioctl.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/pool.h> #include <sys/proc.h> #include <sys/rwlock.h> #include <sys/stat.h> #include <sys/systm.h> #include <sys/vnode.h> #include <miscfs/specfs/specdev.h> MALLOC_DEFINE(M_DKWEDGE, "dkwedge", "Disk wedge structures"); typedef enum { DKW_STATE_LARVAL = 0, DKW_STATE_RUNNING = 1, DKW_STATE_DYING = 2, DKW_STATE_DEAD = 666 } dkwedge_state_t; /* * Lock order: * * sc->sc_dk.dk_openlock * => sc->sc_parent->dk_rawlock * => sc->sc_parent->dk_openlock * => dkwedges_lock * => sc->sc_sizelock * * Locking notes: * * W dkwedges_lock * D device reference * O sc->sc_dk.dk_openlock * P sc->sc_parent->dk_openlock * R sc->sc_parent->dk_rawlock * S sc->sc_sizelock * I sc->sc_iolock * $ stable after initialization * 1 used only by a single thread * * x&y means both x and y must be held to write (with a write lock if * one is rwlock), and either x or y must be held to read. */ struct dkwedge_softc { device_t sc_dev; /* P&W: pointer to our pseudo-device */ /* sc_dev is also stable while device is referenced */ struct cfdata sc_cfdata; /* 1: our cfdata structure */ uint8_t sc_wname[128]; /* $: wedge name (Unicode, UTF-8) */ dkwedge_state_t sc_state; /* state this wedge is in */ /* stable while device is referenced */ /* used only in assertions when stable, and in dump in ddb */ struct disk *sc_parent; /* $: parent disk */ /* P: sc_parent->dk_openmask */ /* P: sc_parent->dk_nwedges */ /* P: sc_parent->dk_wedges */ /* R: sc_parent->dk_rawopens */ /* R: sc_parent->dk_rawvp (also stable while wedge is open) */ daddr_t sc_offset; /* $: LBA offset of wedge in parent */ krwlock_t sc_sizelock; uint64_t sc_size; /* S: size of wedge in blocks */ char sc_ptype[32]; /* $: partition type */ dev_t sc_pdev; /* $: cached parent's dev_t */ /* P: link on parent's wedge list */ LIST_ENTRY(dkwedge_softc) sc_plink; struct disk sc_dk; /* our own disk structure */ /* O&R: sc_dk.dk_bopenmask */ /* O&R: sc_dk.dk_copenmask */ /* O&R: sc_dk.dk_openmask */ struct bufq_state *sc_bufq; /* $: buffer queue */ struct callout sc_restart_ch; /* I: callout to restart I/O */ kmutex_t sc_iolock; bool sc_iostop; /* I: don't schedule restart */ int sc_mode; /* O&R: parent open mode */ }; static int dkwedge_match(device_t, cfdata_t, void *); static void dkwedge_attach(device_t, device_t, void *); static int dkwedge_detach(device_t, int); static void dk_set_geometry(struct dkwedge_softc *, struct disk *); static void dkstart(struct dkwedge_softc *); static void dkiodone(struct buf *); static void dkrestart(void *); static void dkminphys(struct buf *); static int dkfirstopen(struct dkwedge_softc *, int); static void dklastclose(struct dkwedge_softc *); static int dkwedge_detach(device_t, int); static void dkwedge_delall1(struct disk *, bool); static int dkwedge_del1(struct dkwedge_info *, int); static int dk_open_parent(dev_t, int, struct vnode **); static int dk_close_parent(struct vnode *, int); static dev_type_open(dkopen); static dev_type_close(dkclose); static dev_type_cancel(dkcancel); static dev_type_read(dkread); static 
dev_type_write(dkwrite); static dev_type_ioctl(dkioctl); static dev_type_strategy(dkstrategy); static dev_type_dump(dkdump); static dev_type_size(dksize); static dev_type_discard(dkdiscard); CFDRIVER_DECL(dk, DV_DISK, NULL); CFATTACH_DECL3_NEW(dk, 0, dkwedge_match, dkwedge_attach, dkwedge_detach, NULL, NULL, NULL, DVF_DETACH_SHUTDOWN); const struct bdevsw dk_bdevsw = { .d_open = dkopen, .d_close = dkclose, .d_cancel = dkcancel, .d_strategy = dkstrategy, .d_ioctl = dkioctl, .d_dump = dkdump, .d_psize = dksize, .d_discard = dkdiscard, .d_cfdriver = &dk_cd, .d_devtounit = dev_minor_unit, .d_flag = D_DISK | D_MPSAFE }; const struct cdevsw dk_cdevsw = { .d_open = dkopen, .d_close = dkclose, .d_cancel = dkcancel, .d_read = dkread, .d_write = dkwrite, .d_ioctl = dkioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = dkdiscard, .d_cfdriver = &dk_cd, .d_devtounit = dev_minor_unit, .d_flag = D_DISK | D_MPSAFE }; static struct dkwedge_softc **dkwedges; static u_int ndkwedges; static krwlock_t dkwedges_lock; static LIST_HEAD(, dkwedge_discovery_method) dkwedge_discovery_methods; static krwlock_t dkwedge_discovery_methods_lock; /* * dkwedge_match: * * Autoconfiguration match function for pseudo-device glue. */ static int dkwedge_match(device_t parent, cfdata_t match, void *aux) { /* Pseudo-device; always present. */ return 1; } /* * dkwedge_attach: * * Autoconfiguration attach function for pseudo-device glue. */ static void dkwedge_attach(device_t parent, device_t self, void *aux) { struct dkwedge_softc *sc = aux; struct disk *pdk = sc->sc_parent; int unit = device_unit(self); KASSERTMSG(unit >= 0, "unit=%d", unit); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); mutex_enter(&pdk->dk_openlock); rw_enter(&dkwedges_lock, RW_WRITER); KASSERTMSG(unit < ndkwedges, "unit=%d ndkwedges=%u", unit, ndkwedges); KASSERTMSG(sc == dkwedges[unit], "sc=%p dkwedges[%d]=%p", sc, unit, dkwedges[unit]); KASSERTMSG(sc->sc_dev == NULL, "sc=%p sc->sc_dev=%p", sc, sc->sc_dev); sc->sc_dev = self; rw_exit(&dkwedges_lock); mutex_exit(&pdk->dk_openlock); disk_init(&sc->sc_dk, device_xname(sc->sc_dev), NULL); mutex_enter(&pdk->dk_openlock); dk_set_geometry(sc, pdk); mutex_exit(&pdk->dk_openlock); disk_attach(&sc->sc_dk); /* Disk wedge is ready for use! */ device_set_private(self, sc); sc->sc_state = DKW_STATE_RUNNING; } /* * dkwedge_compute_pdev: * * Compute the parent disk's dev_t. */ static int dkwedge_compute_pdev(const char *pname, dev_t *pdevp, enum vtype type) { const char *name, *cp; devmajor_t pmaj; int punit; char devname[16]; name = pname; switch (type) { case VBLK: pmaj = devsw_name2blk(name, devname, sizeof(devname)); break; case VCHR: pmaj = devsw_name2chr(name, devname, sizeof(devname)); break; default: pmaj = NODEVMAJOR; break; } if (pmaj == NODEVMAJOR) return ENXIO; name += strlen(devname); for (cp = name, punit = 0; *cp >= '0' && *cp <= '9'; cp++) punit = (punit * 10) + (*cp - '0'); if (cp == name) { /* Invalid parent disk name. */ return ENXIO; } *pdevp = MAKEDISKDEV(pmaj, punit, RAW_PART); return 0; } /* * dkwedge_array_expand: * * Expand the dkwedges array. * * Releases and reacquires dkwedges_lock as a writer. 
*/ static int dkwedge_array_expand(void) { const unsigned incr = 16; unsigned newcnt, oldcnt; struct dkwedge_softc **newarray = NULL, **oldarray = NULL; KASSERT(rw_write_held(&dkwedges_lock)); oldcnt = ndkwedges; oldarray = dkwedges; if (oldcnt >= INT_MAX - incr) return ENFILE; /* XXX */ newcnt = oldcnt + incr; rw_exit(&dkwedges_lock); newarray = malloc(newcnt * sizeof(*newarray), M_DKWEDGE, M_WAITOK|M_ZERO); rw_enter(&dkwedges_lock, RW_WRITER); if (ndkwedges != oldcnt || dkwedges != oldarray) { oldarray = NULL; /* already recycled */ goto out; } if (oldarray != NULL) memcpy(newarray, dkwedges, ndkwedges * sizeof(*newarray)); dkwedges = newarray; newarray = NULL; /* transferred to dkwedges */ ndkwedges = newcnt; out: rw_exit(&dkwedges_lock); if (oldarray != NULL) free(oldarray, M_DKWEDGE); if (newarray != NULL) free(newarray, M_DKWEDGE); rw_enter(&dkwedges_lock, RW_WRITER); return 0; } static void dkwedge_size_init(struct dkwedge_softc *sc, uint64_t size) { rw_init(&sc->sc_sizelock); sc->sc_size = size; } static void dkwedge_size_fini(struct dkwedge_softc *sc) { rw_destroy(&sc->sc_sizelock); } static uint64_t dkwedge_size(struct dkwedge_softc *sc) { uint64_t size; rw_enter(&sc->sc_sizelock, RW_READER); size = sc->sc_size; rw_exit(&sc->sc_sizelock); return size; } static void dkwedge_size_increase(struct dkwedge_softc *sc, uint64_t size) { KASSERT(mutex_owned(&sc->sc_parent->dk_openlock)); rw_enter(&sc->sc_sizelock, RW_WRITER); KASSERTMSG(size >= sc->sc_size, "decreasing dkwedge size from %"PRIu64" to %"PRIu64, sc->sc_size, size); sc->sc_size = size; rw_exit(&sc->sc_sizelock); } static void dk_set_geometry(struct dkwedge_softc *sc, struct disk *pdk) { struct disk *dk = &sc->sc_dk; struct disk_geom *dg = &dk->dk_geom; KASSERT(mutex_owned(&pdk->dk_openlock)); memset(dg, 0, sizeof(*dg)); dg->dg_secperunit = dkwedge_size(sc); dg->dg_secsize = DEV_BSIZE << pdk->dk_blkshift; /* fake numbers, 1 cylinder is 1 MB with default sector size */ dg->dg_nsectors = 32; dg->dg_ntracks = 64; dg->dg_ncylinders = dg->dg_secperunit / (dg->dg_nsectors * dg->dg_ntracks); disk_set_info(sc->sc_dev, dk, NULL); } /* * dkwedge_add: [exported function] * * Add a disk wedge based on the provided information. * * The incoming dkw_devname[] is ignored, instead being * filled in and returned to the caller. */ int dkwedge_add(struct dkwedge_info *dkw) { struct dkwedge_softc *sc, *lsc; struct disk *pdk; u_int unit; int error; dev_t pdev; device_t dev __diagused; dkw->dkw_parent[sizeof(dkw->dkw_parent) - 1] = '\0'; pdk = disk_find(dkw->dkw_parent); if (pdk == NULL) return ENXIO; error = dkwedge_compute_pdev(pdk->dk_name, &pdev, VBLK); if (error) return error; if (dkw->dkw_offset < 0) return EINVAL; /* * Check for an existing wedge at the same disk offset. Allow * updating a wedge if the only change is the size, and the new * size is larger than the old. 
*/ sc = NULL; mutex_enter(&pdk->dk_openlock); LIST_FOREACH(lsc, &pdk->dk_wedges, sc_plink) { if (lsc->sc_offset != dkw->dkw_offset) continue; if (strcmp(lsc->sc_wname, dkw->dkw_wname) != 0) break; if (strcmp(lsc->sc_ptype, dkw->dkw_ptype) != 0) break; if (dkwedge_size(lsc) > dkw->dkw_size) break; if (lsc->sc_dev == NULL) break; sc = lsc; device_acquire(sc->sc_dev); dkwedge_size_increase(sc, dkw->dkw_size); dk_set_geometry(sc, pdk); break; } mutex_exit(&pdk->dk_openlock); if (sc != NULL) goto announce; sc = malloc(sizeof(*sc), M_DKWEDGE, M_WAITOK|M_ZERO); sc->sc_state = DKW_STATE_LARVAL; sc->sc_parent = pdk; sc->sc_pdev = pdev; sc->sc_offset = dkw->dkw_offset; dkwedge_size_init(sc, dkw->dkw_size); memcpy(sc->sc_wname, dkw->dkw_wname, sizeof(sc->sc_wname)); sc->sc_wname[sizeof(sc->sc_wname) - 1] = '\0'; memcpy(sc->sc_ptype, dkw->dkw_ptype, sizeof(sc->sc_ptype)); sc->sc_ptype[sizeof(sc->sc_ptype) - 1] = '\0'; bufq_alloc(&sc->sc_bufq, "fcfs", 0); callout_init(&sc->sc_restart_ch, 0); callout_setfunc(&sc->sc_restart_ch, dkrestart, sc); mutex_init(&sc->sc_iolock, MUTEX_DEFAULT, IPL_BIO); /* * Wedge will be added; increment the wedge count for the parent. * Only allow this to happen if RAW_PART is the only thing open. */ mutex_enter(&pdk->dk_openlock); if (pdk->dk_openmask & ~(1 << RAW_PART)) error = EBUSY; else { /* Check for wedge overlap. */ LIST_FOREACH(lsc, &pdk->dk_wedges, sc_plink) { /* XXX arithmetic overflow */ uint64_t size = dkwedge_size(sc); uint64_t lsize = dkwedge_size(lsc); daddr_t lastblk = sc->sc_offset + size - 1; daddr_t llastblk = lsc->sc_offset + lsize - 1; if (sc->sc_offset >= lsc->sc_offset && sc->sc_offset <= llastblk) { /* Overlaps the tail of the existing wedge. */ break; } if (lastblk >= lsc->sc_offset && lastblk <= llastblk) { /* Overlaps the head of the existing wedge. */ break; } } if (lsc != NULL) { if (sc->sc_offset == lsc->sc_offset && dkwedge_size(sc) == dkwedge_size(lsc) && strcmp(sc->sc_wname, lsc->sc_wname) == 0) error = EEXIST; else error = EINVAL; } else { pdk->dk_nwedges++; LIST_INSERT_HEAD(&pdk->dk_wedges, sc, sc_plink); } } mutex_exit(&pdk->dk_openlock); if (error) { mutex_destroy(&sc->sc_iolock); bufq_free(sc->sc_bufq); dkwedge_size_fini(sc); free(sc, M_DKWEDGE); return error; } /* Fill in our cfdata for the pseudo-device glue. */ sc->sc_cfdata.cf_name = dk_cd.cd_name; sc->sc_cfdata.cf_atname = dk_ca.ca_name; /* sc->sc_cfdata.cf_unit set below */ sc->sc_cfdata.cf_fstate = FSTATE_NOTFOUND; /* use chosen cf_unit */ /* Insert the larval wedge into the array. */ rw_enter(&dkwedges_lock, RW_WRITER); for (error = 0;;) { struct dkwedge_softc **scpp; /* * Check for a duplicate wname while searching for * a slot. */ for (scpp = NULL, unit = 0; unit < ndkwedges; unit++) { if (dkwedges[unit] == NULL) { if (scpp == NULL) { scpp = &dkwedges[unit]; sc->sc_cfdata.cf_unit = unit; } } else { /* XXX Unicode. */ if (strcmp(dkwedges[unit]->sc_wname, sc->sc_wname) == 0) { error = EEXIST; break; } } } if (error) break; KASSERT(unit == ndkwedges); if (scpp == NULL) { error = dkwedge_array_expand(); if (error) break; } else { KASSERT(scpp == &dkwedges[sc->sc_cfdata.cf_unit]); *scpp = sc; break; } } rw_exit(&dkwedges_lock); if (error) { mutex_enter(&pdk->dk_openlock); pdk->dk_nwedges--; LIST_REMOVE(sc, sc_plink); mutex_exit(&pdk->dk_openlock); mutex_destroy(&sc->sc_iolock); bufq_free(sc->sc_bufq); dkwedge_size_fini(sc); free(sc, M_DKWEDGE); return error; } /* * Now that we know the unit #, attach a pseudo-device for * this wedge instance. 
This will provide us with the * device_t necessary for glue to other parts of the system. * * This should never fail, unless we're almost totally out of * memory. */ if ((dev = config_attach_pseudo_acquire(&sc->sc_cfdata, sc)) == NULL) { aprint_error("%s%u: unable to attach pseudo-device\n", sc->sc_cfdata.cf_name, sc->sc_cfdata.cf_unit); rw_enter(&dkwedges_lock, RW_WRITER); KASSERT(dkwedges[sc->sc_cfdata.cf_unit] == sc); dkwedges[sc->sc_cfdata.cf_unit] = NULL; rw_exit(&dkwedges_lock); mutex_enter(&pdk->dk_openlock); pdk->dk_nwedges--; LIST_REMOVE(sc, sc_plink); mutex_exit(&pdk->dk_openlock); mutex_destroy(&sc->sc_iolock); bufq_free(sc->sc_bufq); dkwedge_size_fini(sc); free(sc, M_DKWEDGE); return ENOMEM; } KASSERT(dev == sc->sc_dev); announce: /* Announce our arrival. */ aprint_normal( "%s at %s: \"%s\", %"PRIu64" blocks at %"PRId64", type: %s\n", device_xname(sc->sc_dev), pdk->dk_name, sc->sc_wname, /* XXX Unicode */ dkwedge_size(sc), sc->sc_offset, sc->sc_ptype[0] == '\0' ? "<unknown>" : sc->sc_ptype); /* Return the devname to the caller. */ strlcpy(dkw->dkw_devname, device_xname(sc->sc_dev), sizeof(dkw->dkw_devname)); device_release(sc->sc_dev); return 0; } /* * dkwedge_find_acquire: * * Lookup a disk wedge based on the provided information. * NOTE: We look up the wedge based on the wedge devname, * not wname. * * Return NULL if the wedge is not found, otherwise return * the wedge's softc. Assign the wedge's unit number to unitp * if unitp is not NULL. The wedge's sc_dev is referenced and * must be released by device_release or equivalent. */ static struct dkwedge_softc * dkwedge_find_acquire(struct dkwedge_info *dkw, u_int *unitp) { struct dkwedge_softc *sc = NULL; u_int unit; /* Find our softc. */ dkw->dkw_devname[sizeof(dkw->dkw_devname) - 1] = '\0'; rw_enter(&dkwedges_lock, RW_READER); for (unit = 0; unit < ndkwedges; unit++) { if ((sc = dkwedges[unit]) != NULL && sc->sc_dev != NULL && strcmp(device_xname(sc->sc_dev), dkw->dkw_devname) == 0 && strcmp(sc->sc_parent->dk_name, dkw->dkw_parent) == 0) { device_acquire(sc->sc_dev); break; } } rw_exit(&dkwedges_lock); if (sc == NULL) return NULL; if (unitp != NULL) *unitp = unit; return sc; } /* * dkwedge_del: [exported function] * * Delete a disk wedge based on the provided information. * NOTE: We look up the wedge based on the wedge devname, * not wname. */ int dkwedge_del(struct dkwedge_info *dkw) { return dkwedge_del1(dkw, 0); } int dkwedge_del1(struct dkwedge_info *dkw, int flags) { struct dkwedge_softc *sc = NULL; /* Find our softc. */ if ((sc = dkwedge_find_acquire(dkw, NULL)) == NULL) return ESRCH; return config_detach_release(sc->sc_dev, flags); } /* * dkwedge_detach: * * Autoconfiguration detach function for pseudo-device glue. */ static int dkwedge_detach(device_t self, int flags) { struct dkwedge_softc *const sc = device_private(self); const u_int unit = device_unit(self); int bmaj, cmaj, error; error = disk_begindetach(&sc->sc_dk, /*lastclose*/NULL, self, flags); if (error) return error; /* Mark the wedge as dying. */ sc->sc_state = DKW_STATE_DYING; pmf_device_deregister(self); /* Kill any pending restart. */ mutex_enter(&sc->sc_iolock); sc->sc_iostop = true; mutex_exit(&sc->sc_iolock); callout_halt(&sc->sc_restart_ch, NULL); /* Locate the wedge major numbers. */ bmaj = bdevsw_lookup_major(&dk_bdevsw); cmaj = cdevsw_lookup_major(&dk_cdevsw); /* Nuke the vnodes for any open instances. 
*/ vdevgone(bmaj, unit, unit, VBLK); vdevgone(cmaj, unit, unit, VCHR); /* * At this point, all block device opens have been closed, * synchronously flushing any buffered writes; and all * character device I/O operations have completed * synchronously, and character device opens have been closed. * * So there can be no more opens or queued buffers by now. */ KASSERT(sc->sc_dk.dk_openmask == 0); KASSERT(bufq_peek(sc->sc_bufq) == NULL); bufq_drain(sc->sc_bufq); /* Announce our departure. */ aprint_normal("%s at %s (%s) deleted\n", device_xname(sc->sc_dev), sc->sc_parent->dk_name, sc->sc_wname); /* XXX Unicode */ mutex_enter(&sc->sc_parent->dk_openlock); sc->sc_parent->dk_nwedges--; LIST_REMOVE(sc, sc_plink); mutex_exit(&sc->sc_parent->dk_openlock); /* Delete our buffer queue. */ bufq_free(sc->sc_bufq); /* Detach from the disk list. */ disk_detach(&sc->sc_dk); disk_destroy(&sc->sc_dk); /* Poof. */ rw_enter(&dkwedges_lock, RW_WRITER); KASSERT(dkwedges[unit] == sc); dkwedges[unit] = NULL; sc->sc_state = DKW_STATE_DEAD; rw_exit(&dkwedges_lock); mutex_destroy(&sc->sc_iolock); dkwedge_size_fini(sc); free(sc, M_DKWEDGE); return 0; } /* * dkwedge_delall: [exported function] * * Forcibly delete all of the wedges on the specified disk. Used * when a disk is being detached. */ void dkwedge_delall(struct disk *pdk) { dkwedge_delall1(pdk, /*idleonly*/false); } /* * dkwedge_delidle: [exported function] * * Delete all of the wedges on the specified disk if idle. Used * by ioctl(DIOCRMWEDGES). */ void dkwedge_delidle(struct disk *pdk) { dkwedge_delall1(pdk, /*idleonly*/true); } static void dkwedge_delall1(struct disk *pdk, bool idleonly) { struct dkwedge_softc *sc; int flags; flags = DETACH_QUIET; if (!idleonly) flags |= DETACH_FORCE; for (;;) { mutex_enter(&pdk->dk_rawlock); /* for sc->sc_dk.dk_openmask */ mutex_enter(&pdk->dk_openlock); LIST_FOREACH(sc, &pdk->dk_wedges, sc_plink) { /* * Wedge is not yet created. This is a race -- * it may as well have been added just after we * deleted all the wedges, so pretend it's not * here yet. */ if (sc->sc_dev == NULL) continue; if (!idleonly || sc->sc_dk.dk_openmask == 0) { device_acquire(sc->sc_dev); break; } } if (sc == NULL) { KASSERT(idleonly || pdk->dk_nwedges == 0); mutex_exit(&pdk->dk_openlock); mutex_exit(&pdk->dk_rawlock); return; } mutex_exit(&pdk->dk_openlock); mutex_exit(&pdk->dk_rawlock); (void)config_detach_release(sc->sc_dev, flags); } } /* * dkwedge_list: [exported function] * * List all of the wedges on a particular disk. 
*/ int dkwedge_list(struct disk *pdk, struct dkwedge_list *dkwl, struct lwp *l) { struct uio uio; struct iovec iov; struct dkwedge_softc *sc; struct dkwedge_info dkw; int error = 0; iov.iov_base = dkwl->dkwl_buf; iov.iov_len = dkwl->dkwl_bufsize; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = dkwl->dkwl_bufsize; uio.uio_rw = UIO_READ; KASSERT(l == curlwp); uio.uio_vmspace = l->l_proc->p_vmspace; dkwl->dkwl_ncopied = 0; mutex_enter(&pdk->dk_openlock); LIST_FOREACH(sc, &pdk->dk_wedges, sc_plink) { if (uio.uio_resid < sizeof(dkw)) break; if (sc->sc_dev == NULL) continue; strlcpy(dkw.dkw_devname, device_xname(sc->sc_dev), sizeof(dkw.dkw_devname)); memcpy(dkw.dkw_wname, sc->sc_wname, sizeof(dkw.dkw_wname)); dkw.dkw_wname[sizeof(dkw.dkw_wname) - 1] = '\0'; strlcpy(dkw.dkw_parent, sc->sc_parent->dk_name, sizeof(dkw.dkw_parent)); dkw.dkw_offset = sc->sc_offset; dkw.dkw_size = dkwedge_size(sc); strlcpy(dkw.dkw_ptype, sc->sc_ptype, sizeof(dkw.dkw_ptype)); /* * Acquire a device reference so this wedge doesn't go * away before our next iteration in LIST_FOREACH, and * then release the lock for uiomove. */ device_acquire(sc->sc_dev); mutex_exit(&pdk->dk_openlock); error = uiomove(&dkw, sizeof(dkw), &uio); mutex_enter(&pdk->dk_openlock); device_release(sc->sc_dev); if (error) break; dkwl->dkwl_ncopied++; } dkwl->dkwl_nwedges = pdk->dk_nwedges; mutex_exit(&pdk->dk_openlock); return error; } static device_t dkwedge_find_by_wname_acquire(const char *wname) { device_t dv = NULL; struct dkwedge_softc *sc; int i; rw_enter(&dkwedges_lock, RW_READER); for (i = 0; i < ndkwedges; i++) { if ((sc = dkwedges[i]) == NULL || sc->sc_dev == NULL) continue; if (strcmp(sc->sc_wname, wname) == 0) { if (dv != NULL) { printf( "WARNING: double match for wedge name %s " "(%s, %s)\n", wname, device_xname(dv), device_xname(sc->sc_dev)); continue; } device_acquire(sc->sc_dev); dv = sc->sc_dev; } } rw_exit(&dkwedges_lock); return dv; } static device_t dkwedge_find_by_parent_acquire(const char *name, size_t *i) { rw_enter(&dkwedges_lock, RW_READER); for (; *i < (size_t)ndkwedges; (*i)++) { struct dkwedge_softc *sc; if ((sc = dkwedges[*i]) == NULL || sc->sc_dev == NULL) continue; if (strcmp(sc->sc_parent->dk_name, name) != 0) continue; device_acquire(sc->sc_dev); rw_exit(&dkwedges_lock); return sc->sc_dev; } rw_exit(&dkwedges_lock); return NULL; } /* XXX unsafe */ device_t dkwedge_find_by_wname(const char *wname) { device_t dv; if ((dv = dkwedge_find_by_wname_acquire(wname)) == NULL) return NULL; device_release(dv); return dv; } /* XXX unsafe */ device_t dkwedge_find_by_parent(const char *name, size_t *i) { device_t dv; if ((dv = dkwedge_find_by_parent_acquire(name, i)) == NULL) return NULL; device_release(dv); return dv; } void dkwedge_print_wnames(void) { struct dkwedge_softc *sc; int i; rw_enter(&dkwedges_lock, RW_READER); for (i = 0; i < ndkwedges; i++) { if ((sc = dkwedges[i]) == NULL || sc->sc_dev == NULL) continue; printf(" wedge:%s", sc->sc_wname); } rw_exit(&dkwedges_lock); } /* * We need a dummy object to stuff into the dkwedge discovery method link * set to ensure that there is always at least one object in the set. */ static struct dkwedge_discovery_method dummy_discovery_method; __link_set_add_bss(dkwedge_methods, dummy_discovery_method); /* * dkwedge_init: * * Initialize the disk wedge subsystem. 
*/ void dkwedge_init(void) { __link_set_decl(dkwedge_methods, struct dkwedge_discovery_method); struct dkwedge_discovery_method * const *ddmp; struct dkwedge_discovery_method *lddm, *ddm; rw_init(&dkwedges_lock); rw_init(&dkwedge_discovery_methods_lock); if (config_cfdriver_attach(&dk_cd) != 0) panic("dkwedge: unable to attach cfdriver"); if (config_cfattach_attach(dk_cd.cd_name, &dk_ca) != 0) panic("dkwedge: unable to attach cfattach"); rw_enter(&dkwedge_discovery_methods_lock, RW_WRITER); LIST_INIT(&dkwedge_discovery_methods); __link_set_foreach(ddmp, dkwedge_methods) { ddm = *ddmp; if (ddm == &dummy_discovery_method) continue; if (LIST_EMPTY(&dkwedge_discovery_methods)) { LIST_INSERT_HEAD(&dkwedge_discovery_methods, ddm, ddm_list); continue; } LIST_FOREACH(lddm, &dkwedge_discovery_methods, ddm_list) { if (ddm->ddm_priority == lddm->ddm_priority) { aprint_error("dk-method-%s: method \"%s\" " "already exists at priority %d\n", ddm->ddm_name, lddm->ddm_name, lddm->ddm_priority); /* Not inserted. */ break; } if (ddm->ddm_priority < lddm->ddm_priority) { /* Higher priority; insert before. */ LIST_INSERT_BEFORE(lddm, ddm, ddm_list); break; } if (LIST_NEXT(lddm, ddm_list) == NULL) { /* Last one; insert after. */ KASSERT(lddm->ddm_priority < ddm->ddm_priority); LIST_INSERT_AFTER(lddm, ddm, ddm_list); break; } } } rw_exit(&dkwedge_discovery_methods_lock); } #ifdef DKWEDGE_AUTODISCOVER int dkwedge_autodiscover = 1; #else int dkwedge_autodiscover = 0; #endif /* * dkwedge_discover: [exported function] * * Discover the wedges on a newly attached disk. * Remove all unused wedges on the disk first. */ void dkwedge_discover(struct disk *pdk) { struct dkwedge_discovery_method *ddm; struct vnode *vp; int error; dev_t pdev; /* * Require people playing with wedges to enable this explicitly. */ if (dkwedge_autodiscover == 0) return; rw_enter(&dkwedge_discovery_methods_lock, RW_READER); /* * Use the character device for scanning, the block device * is busy if there are already wedges attached. */ error = dkwedge_compute_pdev(pdk->dk_name, &pdev, VCHR); if (error) { aprint_error("%s: unable to compute pdev, error = %d\n", pdk->dk_name, error); goto out; } error = cdevvp(pdev, &vp); if (error) { aprint_error("%s: unable to find vnode for pdev, error = %d\n", pdk->dk_name, error); goto out; } error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error) { aprint_error("%s: unable to lock vnode for pdev, error = %d\n", pdk->dk_name, error); vrele(vp); goto out; } error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED); if (error) { if (error != ENXIO) aprint_error("%s: unable to open device, error = %d\n", pdk->dk_name, error); vput(vp); goto out; } VOP_UNLOCK(vp); /* * Remove unused wedges */ dkwedge_delidle(pdk); /* * For each supported partition map type, look to see if * this map type exists. If so, parse it and add the * corresponding wedges. */ LIST_FOREACH(ddm, &dkwedge_discovery_methods, ddm_list) { error = (*ddm->ddm_discover)(pdk, vp); if (error == 0) { /* Successfully created wedges; we're done. */ break; } } error = vn_close(vp, FREAD, NOCRED); if (error) { aprint_error("%s: unable to close device, error = %d\n", pdk->dk_name, error); /* We'll just assume the vnode has been cleaned up. */ } out: rw_exit(&dkwedge_discovery_methods_lock); } /* * dkwedge_read: * * Read some data from the specified disk, used for * partition discovery. 
*/ int dkwedge_read(struct disk *pdk, struct vnode *vp, daddr_t blkno, void *tbuf, size_t len) { buf_t *bp; int error; bool isopen; dev_t bdev; struct vnode *bdvp; /* * The kernel cannot read from a character device vnode * as physio() only handles user memory. * * If the block device has already been opened by a wedge * use that vnode and temporarily bump the open counter. * * Otherwise try to open the block device. */ bdev = devsw_chr2blk(vp->v_rdev); mutex_enter(&pdk->dk_rawlock); if (pdk->dk_rawopens != 0) { KASSERT(pdk->dk_rawvp != NULL); isopen = true; ++pdk->dk_rawopens; bdvp = pdk->dk_rawvp; error = 0; } else { isopen = false; error = dk_open_parent(bdev, FREAD, &bdvp); } mutex_exit(&pdk->dk_rawlock); if (error) return error; bp = getiobuf(bdvp, true); bp->b_flags = B_READ; bp->b_cflags = BC_BUSY; bp->b_dev = bdev; bp->b_data = tbuf; bp->b_bufsize = bp->b_bcount = len; bp->b_blkno = blkno; bp->b_cylinder = 0; bp->b_error = 0; VOP_STRATEGY(bdvp, bp); error = biowait(bp); putiobuf(bp); mutex_enter(&pdk->dk_rawlock); if (isopen) { --pdk->dk_rawopens; } else { dk_close_parent(bdvp, FREAD); } mutex_exit(&pdk->dk_rawlock); return error; } /* * dkwedge_lookup: * * Look up a dkwedge_softc based on the provided dev_t. * * Caller must guarantee the wedge is referenced. */ static struct dkwedge_softc * dkwedge_lookup(dev_t dev) { return device_lookup_private(&dk_cd, minor(dev)); } static struct dkwedge_softc * dkwedge_lookup_acquire(dev_t dev) { device_t dv = device_lookup_acquire(&dk_cd, minor(dev)); if (dv == NULL) return NULL; return device_private(dv); } static int dk_open_parent(dev_t dev, int mode, struct vnode **vpp) { struct vnode *vp; int error; error = bdevvp(dev, &vp); if (error) return error; error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error) { vrele(vp); return error; } error = VOP_OPEN(vp, mode, NOCRED); if (error) { vput(vp); return error; } /* VOP_OPEN() doesn't do this for us. */ if (mode & FWRITE) { mutex_enter(vp->v_interlock); vp->v_writecount++; mutex_exit(vp->v_interlock); } VOP_UNLOCK(vp); *vpp = vp; return 0; } static int dk_close_parent(struct vnode *vp, int mode) { int error; error = vn_close(vp, mode, NOCRED); return error; } /* * dkopen: [devsw entry point] * * Open a wedge. */ static int dkopen(dev_t dev, int flags, int fmt, struct lwp *l) { struct dkwedge_softc *sc = dkwedge_lookup(dev); int error = 0; if (sc == NULL) return ENXIO; KASSERT(sc->sc_dev != NULL); KASSERT(sc->sc_state == DKW_STATE_RUNNING); /* * We go through a complicated little dance to only open the parent * vnode once per wedge, no matter how many times the wedge is * opened. The reason? We see one dkopen() per open call, but * only dkclose() on the last close. */ mutex_enter(&sc->sc_dk.dk_openlock); mutex_enter(&sc->sc_parent->dk_rawlock); if (sc->sc_dk.dk_openmask == 0) { error = dkfirstopen(sc, flags); if (error) goto out; } else if (flags & ~sc->sc_mode & FWRITE) { /* * The parent is already open, but the previous attempt * to open it read/write failed and fell back to * read-only. In that case, we assume the medium is * read-only and fail to open the wedge read/write. */ error = EROFS; goto out; } KASSERT(sc->sc_mode != 0); KASSERTMSG(sc->sc_mode & FREAD, "%s: sc_mode=%x", device_xname(sc->sc_dev), sc->sc_mode); KASSERTMSG((flags & FWRITE) ? 
(sc->sc_mode & FWRITE) : 1, "%s: flags=%x sc_mode=%x", device_xname(sc->sc_dev), flags, sc->sc_mode); if (fmt == S_IFCHR) sc->sc_dk.dk_copenmask |= 1; else sc->sc_dk.dk_bopenmask |= 1; sc->sc_dk.dk_openmask = sc->sc_dk.dk_copenmask | sc->sc_dk.dk_bopenmask; out: mutex_exit(&sc->sc_parent->dk_rawlock); mutex_exit(&sc->sc_dk.dk_openlock); return error; } static int dkfirstopen(struct dkwedge_softc *sc, int flags) { struct dkwedge_softc *nsc; struct vnode *vp; int mode; int error; KASSERT(mutex_owned(&sc->sc_dk.dk_openlock)); KASSERT(mutex_owned(&sc->sc_parent->dk_rawlock)); if (sc->sc_parent->dk_rawopens == 0) { KASSERT(sc->sc_parent->dk_rawvp == NULL); /* * Try open read-write. If this fails for EROFS * and wedge is read-only, retry to open read-only. */ mode = FREAD | FWRITE; error = dk_open_parent(sc->sc_pdev, mode, &vp); if (error == EROFS && (flags & FWRITE) == 0) { mode &= ~FWRITE; error = dk_open_parent(sc->sc_pdev, mode, &vp); } if (error) return error; KASSERT(vp != NULL); sc->sc_parent->dk_rawvp = vp; } else { /* * Retrieve mode from an already opened wedge. * * At this point, dk_rawopens is bounded by the number * of dkwedge devices in the system, which is limited * by autoconf device numbering to INT_MAX. Since * dk_rawopens is unsigned, this can't overflow. */ KASSERT(sc->sc_parent->dk_rawopens < UINT_MAX); KASSERT(sc->sc_parent->dk_rawvp != NULL); mode = 0; mutex_enter(&sc->sc_parent->dk_openlock); LIST_FOREACH(nsc, &sc->sc_parent->dk_wedges, sc_plink) { if (nsc == sc || nsc->sc_dk.dk_openmask == 0) continue; mode = nsc->sc_mode; break; } mutex_exit(&sc->sc_parent->dk_openlock); } sc->sc_mode = mode; sc->sc_parent->dk_rawopens++; return 0; } static void dklastclose(struct dkwedge_softc *sc) { KASSERT(mutex_owned(&sc->sc_dk.dk_openlock)); KASSERT(mutex_owned(&sc->sc_parent->dk_rawlock)); KASSERT(sc->sc_parent->dk_rawopens > 0); KASSERT(sc->sc_parent->dk_rawvp != NULL); if (--sc->sc_parent->dk_rawopens == 0) { struct vnode *const vp = sc->sc_parent->dk_rawvp; const int mode = sc->sc_mode; sc->sc_parent->dk_rawvp = NULL; sc->sc_mode = 0; dk_close_parent(vp, mode); } } /* * dkclose: [devsw entry point] * * Close a wedge. */ static int dkclose(dev_t dev, int flags, int fmt, struct lwp *l) { struct dkwedge_softc *sc = dkwedge_lookup(dev); /* * dkclose can be called even if dkopen didn't succeed, so we * have to handle the same possibility that the wedge may not * exist. */ if (sc == NULL) return ENXIO; KASSERT(sc->sc_dev != NULL); KASSERT(sc->sc_state != DKW_STATE_LARVAL); KASSERT(sc->sc_state != DKW_STATE_DEAD); mutex_enter(&sc->sc_dk.dk_openlock); mutex_enter(&sc->sc_parent->dk_rawlock); KASSERT(sc->sc_dk.dk_openmask != 0); if (fmt == S_IFCHR) sc->sc_dk.dk_copenmask &= ~1; else sc->sc_dk.dk_bopenmask &= ~1; sc->sc_dk.dk_openmask = sc->sc_dk.dk_copenmask | sc->sc_dk.dk_bopenmask; if (sc->sc_dk.dk_openmask == 0) { dklastclose(sc); } mutex_exit(&sc->sc_parent->dk_rawlock); mutex_exit(&sc->sc_dk.dk_openlock); return 0; } /* * dkcancel: [devsw entry point] * * Cancel any pending I/O operations waiting on a wedge. */ static int dkcancel(dev_t dev, int flags, int fmt, struct lwp *l) { struct dkwedge_softc *sc = dkwedge_lookup(dev); KASSERT(sc != NULL); KASSERT(sc->sc_dev != NULL); KASSERT(sc->sc_state != DKW_STATE_LARVAL); KASSERT(sc->sc_state != DKW_STATE_DEAD); /* * Disk I/O is expected to complete or fail within a reasonable * timeframe -- it's storage, not communication. 
Further, the * character and block device interface guarantees that prior * reads and writes have completed or failed by the time close * returns -- we are not to cancel them here. If the parent * device's hardware is gone, the parent driver can make them * fail. Nothing for dk(4) itself to do. */ return 0; } /* * dkstrategy: [devsw entry point] * * Perform I/O based on the wedge I/O strategy. */ static void dkstrategy(struct buf *bp) { struct dkwedge_softc *sc = dkwedge_lookup(bp->b_dev); uint64_t p_size, p_offset; KASSERT(sc != NULL); KASSERT(sc->sc_dev != NULL); KASSERT(sc->sc_state != DKW_STATE_LARVAL); KASSERT(sc->sc_state != DKW_STATE_DEAD); KASSERT(sc->sc_parent->dk_rawvp != NULL); /* If it's an empty transfer, wake up the top half now. */ if (bp->b_bcount == 0) goto done; p_offset = sc->sc_offset << sc->sc_parent->dk_blkshift; p_size = dkwedge_size(sc) << sc->sc_parent->dk_blkshift; /* Make sure it's in-range. */ if (bounds_check_with_mediasize(bp, DEV_BSIZE, p_size) <= 0) goto done; /* Translate it to the parent's raw LBA. */ bp->b_rawblkno = bp->b_blkno + p_offset; /* Place it in the queue and start I/O on the unit. */ mutex_enter(&sc->sc_iolock); disk_wait(&sc->sc_dk); bufq_put(sc->sc_bufq, bp); mutex_exit(&sc->sc_iolock); dkstart(sc); return; done: bp->b_resid = bp->b_bcount; biodone(bp); } /* * dkstart: * * Start I/O that has been enqueued on the wedge. */ static void dkstart(struct dkwedge_softc *sc) { struct vnode *vp; struct buf *bp, *nbp; mutex_enter(&sc->sc_iolock); /* Do as much work as has been enqueued. */ while ((bp = bufq_peek(sc->sc_bufq)) != NULL) { if (sc->sc_iostop) { (void) bufq_get(sc->sc_bufq); mutex_exit(&sc->sc_iolock); bp->b_error = ENXIO; bp->b_resid = bp->b_bcount; biodone(bp); mutex_enter(&sc->sc_iolock); continue; } /* fetch an I/O buf with sc_iolock dropped */ mutex_exit(&sc->sc_iolock); nbp = getiobuf(sc->sc_parent->dk_rawvp, false); mutex_enter(&sc->sc_iolock); if (nbp == NULL) { /* * No resources to run this request; leave the * buffer queued up, and schedule a timer to * restart the queue in 1/2 a second. */ if (!sc->sc_iostop) callout_schedule(&sc->sc_restart_ch, hz/2); break; } /* * fetch buf, this can fail if another thread * has already processed the queue, it can also * return a completely different buf. */ bp = bufq_get(sc->sc_bufq); if (bp == NULL) { mutex_exit(&sc->sc_iolock); putiobuf(nbp); mutex_enter(&sc->sc_iolock); continue; } /* Instrumentation. */ disk_busy(&sc->sc_dk); /* release lock for VOP_STRATEGY */ mutex_exit(&sc->sc_iolock); nbp->b_data = bp->b_data; nbp->b_flags = bp->b_flags; nbp->b_oflags = bp->b_oflags; nbp->b_cflags = bp->b_cflags; nbp->b_iodone = dkiodone; nbp->b_proc = bp->b_proc; nbp->b_blkno = bp->b_rawblkno; nbp->b_dev = sc->sc_parent->dk_rawvp->v_rdev; nbp->b_bcount = bp->b_bcount; nbp->b_private = bp; BIO_COPYPRIO(nbp, bp); vp = nbp->b_vp; if ((nbp->b_flags & B_READ) == 0) { mutex_enter(vp->v_interlock); vp->v_numoutput++; mutex_exit(vp->v_interlock); } VOP_STRATEGY(vp, nbp); mutex_enter(&sc->sc_iolock); } mutex_exit(&sc->sc_iolock); } /* * dkiodone: * * I/O to a wedge has completed; alert the top half. 
*/ static void dkiodone(struct buf *bp) { struct buf *obp = bp->b_private; struct dkwedge_softc *sc = dkwedge_lookup(obp->b_dev); KASSERT(sc != NULL); KASSERT(sc->sc_dev != NULL); if (bp->b_error != 0) obp->b_error = bp->b_error; obp->b_resid = bp->b_resid; putiobuf(bp); mutex_enter(&sc->sc_iolock); disk_unbusy(&sc->sc_dk, obp->b_bcount - obp->b_resid, obp->b_flags & B_READ); mutex_exit(&sc->sc_iolock); biodone(obp); /* Kick the queue in case there is more work we can do. */ dkstart(sc); } /* * dkrestart: * * Restart the work queue after it was stalled due to * a resource shortage. Invoked via a callout. */ static void dkrestart(void *v) { struct dkwedge_softc *sc = v; dkstart(sc); } /* * dkminphys: * * Call parent's minphys function. */ static void dkminphys(struct buf *bp) { struct dkwedge_softc *sc = dkwedge_lookup(bp->b_dev); dev_t dev; KASSERT(sc != NULL); KASSERT(sc->sc_dev != NULL); dev = bp->b_dev; bp->b_dev = sc->sc_pdev; if (sc->sc_parent->dk_driver && sc->sc_parent->dk_driver->d_minphys) (*sc->sc_parent->dk_driver->d_minphys)(bp); else minphys(bp); bp->b_dev = dev; } /* * dkread: [devsw entry point] * * Read from a wedge. */ static int dkread(dev_t dev, struct uio *uio, int flags) { struct dkwedge_softc *sc __diagused = dkwedge_lookup(dev); KASSERT(sc != NULL); KASSERT(sc->sc_dev != NULL); KASSERT(sc->sc_state != DKW_STATE_LARVAL); KASSERT(sc->sc_state != DKW_STATE_DEAD); return physio(dkstrategy, NULL, dev, B_READ, dkminphys, uio); } /* * dkwrite: [devsw entry point] * * Write to a wedge. */ static int dkwrite(dev_t dev, struct uio *uio, int flags) { struct dkwedge_softc *sc __diagused = dkwedge_lookup(dev); KASSERT(sc != NULL); KASSERT(sc->sc_dev != NULL); KASSERT(sc->sc_state != DKW_STATE_LARVAL); KASSERT(sc->sc_state != DKW_STATE_DEAD); return physio(dkstrategy, NULL, dev, B_WRITE, dkminphys, uio); } /* * dkioctl: [devsw entry point] * * Perform an ioctl request on a wedge. */ static int dkioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { struct dkwedge_softc *sc = dkwedge_lookup(dev); int error = 0; KASSERT(sc != NULL); KASSERT(sc->sc_dev != NULL); KASSERT(sc->sc_state != DKW_STATE_LARVAL); KASSERT(sc->sc_state != DKW_STATE_DEAD); KASSERT(sc->sc_parent->dk_rawvp != NULL); /* * We pass NODEV instead of our device to indicate we don't * want to handle disklabel ioctls */ error = disk_ioctl(&sc->sc_dk, NODEV, cmd, data, flag, l); if (error != EPASSTHROUGH) return error; error = 0; switch (cmd) { case DIOCGSTRATEGY: case DIOCGCACHE: case DIOCCACHESYNC: error = VOP_IOCTL(sc->sc_parent->dk_rawvp, cmd, data, flag, l != NULL ? l->l_cred : NOCRED); break; case DIOCGWEDGEINFO: { struct dkwedge_info *dkw = data; strlcpy(dkw->dkw_devname, device_xname(sc->sc_dev), sizeof(dkw->dkw_devname)); memcpy(dkw->dkw_wname, sc->sc_wname, sizeof(dkw->dkw_wname)); dkw->dkw_wname[sizeof(dkw->dkw_wname) - 1] = '\0'; strlcpy(dkw->dkw_parent, sc->sc_parent->dk_name, sizeof(dkw->dkw_parent)); dkw->dkw_offset = sc->sc_offset; dkw->dkw_size = dkwedge_size(sc); strlcpy(dkw->dkw_ptype, sc->sc_ptype, sizeof(dkw->dkw_ptype)); break; } case DIOCGSECTORALIGN: { struct disk_sectoralign *dsa = data; uint32_t r; error = VOP_IOCTL(sc->sc_parent->dk_rawvp, cmd, dsa, flag, l != NULL ? 
l->l_cred : NOCRED); if (error) break; r = sc->sc_offset % dsa->dsa_alignment; if (r < dsa->dsa_firstaligned) dsa->dsa_firstaligned = dsa->dsa_firstaligned - r; else dsa->dsa_firstaligned = (dsa->dsa_firstaligned + dsa->dsa_alignment) - r; break; } default: error = ENOTTY; } return error; } /* * dkdiscard: [devsw entry point] * * Perform a discard-range request on a wedge. */ static int dkdiscard(dev_t dev, off_t pos, off_t len) { struct dkwedge_softc *sc = dkwedge_lookup(dev); uint64_t size = dkwedge_size(sc); unsigned shift; off_t offset, maxlen; int error; KASSERT(sc != NULL); KASSERT(sc->sc_dev != NULL); KASSERT(sc->sc_state != DKW_STATE_LARVAL); KASSERT(sc->sc_state != DKW_STATE_DEAD); KASSERT(sc->sc_parent->dk_rawvp != NULL); /* XXX check bounds on size/offset up front */ shift = (sc->sc_parent->dk_blkshift + DEV_BSHIFT); KASSERT(__type_fit(off_t, size)); KASSERT(__type_fit(off_t, sc->sc_offset)); KASSERT(0 <= sc->sc_offset); KASSERT(size <= (__type_max(off_t) >> shift)); KASSERT(sc->sc_offset <= ((__type_max(off_t) >> shift) - size)); offset = ((off_t)sc->sc_offset << shift); maxlen = ((off_t)size << shift); if (len > maxlen) return EINVAL; if (pos > (maxlen - len)) return EINVAL; pos += offset; vn_lock(sc->sc_parent->dk_rawvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FDISCARD(sc->sc_parent->dk_rawvp, pos, len); VOP_UNLOCK(sc->sc_parent->dk_rawvp); return error; } /* * dksize: [devsw entry point] * * Query the size of a wedge for the purpose of performing a dump * or for swapping to. */ static int dksize(dev_t dev) { /* * Don't bother taking a reference because this is only used * either (a) while the device is open (for swap), or (b) while * any multiprocessing is quiescent (for crash dumps). */ struct dkwedge_softc *sc = dkwedge_lookup(dev); uint64_t p_size; int rv = -1; if (sc == NULL) return -1; if (sc->sc_state != DKW_STATE_RUNNING) return -1; /* Our content type is static, no need to open the device. */ p_size = dkwedge_size(sc) << sc->sc_parent->dk_blkshift; if (strcmp(sc->sc_ptype, DKW_PTYPE_SWAP) == 0) { /* Saturate if we are larger than INT_MAX. */ if (p_size > INT_MAX) rv = INT_MAX; else rv = (int)p_size; } return rv; } /* * dkdump: [devsw entry point] * * Perform a crash dump to a wedge. */ static int dkdump(dev_t dev, daddr_t blkno, void *va, size_t size) { /* * Don't bother taking a reference because this is only used * while any multiprocessing is quiescent. */ struct dkwedge_softc *sc = dkwedge_lookup(dev); const struct bdevsw *bdev; uint64_t p_size, p_offset; if (sc == NULL) return ENXIO; if (sc->sc_state != DKW_STATE_RUNNING) return ENXIO; /* Our content type is static, no need to open the device. */ if (strcmp(sc->sc_ptype, DKW_PTYPE_SWAP) != 0 && strcmp(sc->sc_ptype, DKW_PTYPE_RAID) != 0 && strcmp(sc->sc_ptype, DKW_PTYPE_CGD) != 0) return ENXIO; if (size % DEV_BSIZE != 0) return EINVAL; p_offset = sc->sc_offset << sc->sc_parent->dk_blkshift; p_size = dkwedge_size(sc) << sc->sc_parent->dk_blkshift; if (blkno < 0 || blkno + size/DEV_BSIZE > p_size) { printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > " "p_size (%" PRIu64 ")\n", __func__, blkno, size/DEV_BSIZE, p_size); return EINVAL; } bdev = bdevsw_lookup(sc->sc_pdev); return (*bdev->d_dump)(sc->sc_pdev, blkno + p_offset, va, size); } /* * config glue */ /* * dkwedge_find_partition * * Find wedge corresponding to the specified parent name * and offset/length. 
*/ static device_t dkwedge_find_partition_acquire(device_t parent, daddr_t startblk, uint64_t nblks) { struct dkwedge_softc *sc; int i; device_t wedge = NULL; rw_enter(&dkwedges_lock, RW_READER); for (i = 0; i < ndkwedges; i++) { if ((sc = dkwedges[i]) == NULL || sc->sc_dev == NULL) continue; if (strcmp(sc->sc_parent->dk_name, device_xname(parent)) == 0 && sc->sc_offset == startblk && dkwedge_size(sc) == nblks) { if (wedge) { printf("WARNING: double match for boot wedge " "(%s, %s)\n", device_xname(wedge), device_xname(sc->sc_dev)); continue; } wedge = sc->sc_dev; device_acquire(wedge); } } rw_exit(&dkwedges_lock); return wedge; } /* XXX unsafe */ device_t dkwedge_find_partition(device_t parent, daddr_t startblk, uint64_t nblks) { device_t dv; if ((dv = dkwedge_find_partition_acquire(parent, startblk, nblks)) == NULL) return NULL; device_release(dv); return dv; } const char * dkwedge_get_parent_name(dev_t dev) { /* XXX: perhaps do this in lookup? */ int bmaj = bdevsw_lookup_major(&dk_bdevsw); int cmaj = cdevsw_lookup_major(&dk_cdevsw); if (major(dev) != bmaj && major(dev) != cmaj) return NULL; struct dkwedge_softc *const sc = dkwedge_lookup_acquire(dev); if (sc == NULL) return NULL; const char *const name = sc->sc_parent->dk_name; device_release(sc->sc_dev); return name; }
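/*
 * Example (added for illustration; not part of the driver source): the
 * DIOCGSECTORALIGN adjustment performed by dkioctl() above, as a small
 * standalone sketch.  It rebases the parent disk's "first aligned sector"
 * into wedge-relative units.  The sample numbers are invented; only the
 * arithmetic mirrors the handler.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint32_t alignment = 8;         /* parent reports 8-sector alignment */
        uint32_t firstaligned = 0;      /* parent: sector 0 is aligned */
        uint64_t wedge_offset = 63;     /* wedge starts at parent sector 63 */

        uint32_t r = wedge_offset % alignment;
        if (r < firstaligned)
                firstaligned = firstaligned - r;
        else
                firstaligned = (firstaligned + alignment) - r;

        /* Prints 1: wedge sector 1 is parent sector 64, which is aligned. */
        printf("firstaligned (wedge-relative) = %u\n", firstaligned);
        return 0;
}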
/*- * Copyright (c) 2014 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * NPF network interface handling module. */ #ifdef _KERNEL #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: npf_ifaddr.c,v 1.8 2022/02/13 19:20:11 riastradh Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/kmem.h> #include <net/if.h> #include <netinet/in.h> #include <netinet6/in6_var.h> #endif #include "npf_impl.h" static npf_table_t * lookup_ifnet_table(npf_t *npf, ifnet_t *ifp) { const npf_ifops_t *ifops = npf->ifops; char tname[NPF_TABLE_MAXNAMELEN]; const char *ifname; npf_config_t *nc; npf_table_t *t; unsigned tid; /* Get the interface name and prefix it. */ ifname = ifops->getname(npf, ifp); snprintf(tname, sizeof(tname), ".ifnet-%s", ifname); KERNEL_LOCK(1, NULL); nc = npf_config_enter(npf); /* * Check whether this interface is of any interest to us. */ t = npf_tableset_getbyname(nc->tableset, tname); if (!t) { goto out; } tid = npf_table_getid(t); /* Create a new NPF table for the interface. */ t = npf_table_create(tname, tid, NPF_TABLE_IFADDR, NULL, 0); if (!t) { goto out; } return t; out: npf_config_exit(npf); KERNEL_UNLOCK_ONE(NULL); return NULL; } static void replace_ifnet_table(npf_t *npf, npf_table_t *newt) { npf_tableset_t *ts = atomic_load_relaxed(&npf->config)->tableset; npf_table_t *oldt; KASSERT(npf_config_locked_p(npf)); KERNEL_UNLOCK_ONE(NULL); /* * Finally, swap the tables and issue a sync barrier.
*/ oldt = npf_tableset_swap(ts, newt); npf_config_sync(npf); npf_config_exit(npf); /* At this point, it is safe to destroy the old table. */ npf_table_destroy(oldt); } void npf_ifaddr_sync(npf_t *npf, ifnet_t *ifp) { npf_table_t *t; struct ifaddr *ifa; /* * First, check whether this interface is of any interest to us. * * => Acquires npf-config-lock and kernel-lock on success. */ t = lookup_ifnet_table(npf, ifp); if (!t) return; /* * Populate the table with the interface addresses. * Note: currently, this list is protected by the kernel-lock. */ IFADDR_FOREACH(ifa, ifp) { struct sockaddr *sa = ifa->ifa_addr; const void *p = NULL; int alen = 0; if (sa->sa_family == AF_INET) { const struct sockaddr_in *sin4 = satosin(sa); alen = sizeof(struct in_addr); p = &sin4->sin_addr; } if (sa->sa_family == AF_INET6) { const struct sockaddr_in6 *sin6 = satosin6(sa); alen = sizeof(struct in6_addr); p = &sin6->sin6_addr; } if (alen) { npf_addr_t addr; memcpy(&addr, p, alen); npf_table_insert(t, alen, &addr, NPF_NO_NETMASK); } } /* Publish the new table. */ replace_ifnet_table(npf, t); } void npf_ifaddr_flush(npf_t *npf, ifnet_t *ifp) { npf_table_t *t; /* * Flush: just load an empty table. */ t = lookup_ifnet_table(npf, ifp); if (!t) { return; } replace_ifnet_table(npf, t); } void npf_ifaddr_syncall(npf_t *npf) { ifnet_t *ifp; KERNEL_LOCK(1, NULL); IFNET_GLOBAL_LOCK(); IFNET_WRITER_FOREACH(ifp) { npf_ifaddr_sync(npf, ifp); } IFNET_GLOBAL_UNLOCK(); KERNEL_UNLOCK_ONE(NULL); }
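/*
 * Example (added for illustration; not part of npf_ifaddr.c): the per-family
 * address extraction used by npf_ifaddr_sync() above, as a self-contained
 * userland sketch.  The kernel's satosin()/satosin6() are plain casts and are
 * written out explicitly here; extract_addr() and the sample loopback address
 * are invented for the example.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>

static int
extract_addr(const struct sockaddr *sa, void *buf, size_t buflen)
{
        const void *p = NULL;
        int alen = 0;

        if (sa->sa_family == AF_INET) {
                const struct sockaddr_in *sin4 = (const struct sockaddr_in *)sa;
                alen = sizeof(struct in_addr);
                p = &sin4->sin_addr;
        } else if (sa->sa_family == AF_INET6) {
                const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa;
                alen = sizeof(struct in6_addr);
                p = &sin6->sin6_addr;
        }
        if (alen == 0 || (size_t)alen > buflen)
                return 0;       /* unsupported family: skipped, as in the kernel loop */
        memcpy(buf, p, alen);
        return alen;            /* 4 or 16 address bytes copied */
}

int
main(void)
{
        struct sockaddr_in sin;
        unsigned char buf[16];

        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

        printf("copied %d address bytes\n",
            extract_addr((const struct sockaddr *)&sin, buf, sizeof(buf)));
        return 0;
}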
/* $NetBSD: kern_pax.c,v 1.63 2022/10/26 23:22:38 riastradh Exp $ */ /* * Copyright (c) 2015, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Maxime Villard.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2006 Elad Efrat <elad@NetBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_pax.c,v 1.63 2022/10/26 23:22:38 riastradh Exp $"); #include "opt_pax.h" #include <sys/param.h> #include <sys/proc.h> #include <sys/exec.h> #include <sys/exec_elf.h> #include <sys/pax.h> #include <sys/sysctl.h> #include <sys/kmem.h> #include <sys/mman.h> #include <sys/syslog.h> #include <sys/vnode.h> #include <sys/queue.h> #include <sys/bitops.h> #include <sys/kauth.h> #include <sys/cprng.h> #ifdef PAX_ASLR_DEBUG #define PAX_DPRINTF(_fmt, args...) \ do if (pax_aslr_debug) uprintf("%s: " _fmt "\n", __func__, ##args); \ while (/*CONSTCOND*/0) #else #define PAX_DPRINTF(_fmt, args...) 
do {} while (/*CONSTCOND*/0) #endif #ifdef PAX_ASLR #include <sys/mman.h> #include <sys/resourcevar.h> int pax_aslr_enabled = 1; int pax_aslr_global = PAX_ASLR; #ifndef PAX_ASLR_DELTA_MMAP_LSB #define PAX_ASLR_DELTA_MMAP_LSB PGSHIFT #endif #ifndef PAX_ASLR_DELTA_MMAP_LEN #define PAX_ASLR_DELTA_MMAP_LEN ((sizeof(void *) * NBBY) / 2) #endif #ifndef PAX_ASLR_DELTA_MMAP_LEN32 #define PAX_ASLR_DELTA_MMAP_LEN32 ((sizeof(uint32_t) * NBBY) / 2) #endif #ifndef PAX_ASLR_DELTA_STACK_LSB #define PAX_ASLR_DELTA_STACK_LSB PGSHIFT #endif #ifndef PAX_ASLR_DELTA_STACK_LEN #define PAX_ASLR_DELTA_STACK_LEN ((sizeof(void *) * NBBY) / 4) #endif #ifndef PAX_ASLR_DELTA_STACK_LEN32 #define PAX_ASLR_DELTA_STACK_LEN32 ((sizeof(uint32_t) * NBBY) / 4) #endif #define PAX_ASLR_MAX_STACK_WASTE 8 #ifdef PAX_ASLR_DEBUG int pax_aslr_debug; /* flag set means disable */ int pax_aslr_flags; uint32_t pax_aslr_rand; #define PAX_ASLR_STACK 0x01 #define PAX_ASLR_STACK_GAP 0x02 #define PAX_ASLR_MMAP 0x04 #define PAX_ASLR_EXEC_OFFSET 0x08 #define PAX_ASLR_RTLD_OFFSET 0x10 #define PAX_ASLR_FIXED 0x20 #endif static bool pax_aslr_elf_flags_active(uint32_t); #endif /* PAX_ASLR */ #ifdef PAX_MPROTECT static int pax_mprotect_enabled = 1; static int pax_mprotect_global = PAX_MPROTECT; static int pax_mprotect_ptrace = 1; static bool pax_mprotect_elf_flags_active(uint32_t); #endif /* PAX_MPROTECT */ #ifdef PAX_MPROTECT_DEBUG int pax_mprotect_debug; #endif #ifdef PAX_SEGVGUARD #ifndef PAX_SEGVGUARD_EXPIRY #define PAX_SEGVGUARD_EXPIRY (2 * 60) #endif #ifndef PAX_SEGVGUARD_SUSPENSION #define PAX_SEGVGUARD_SUSPENSION (10 * 60) #endif #ifndef PAX_SEGVGUARD_MAXCRASHES #define PAX_SEGVGUARD_MAXCRASHES 5 #endif static int pax_segvguard_enabled = 1; static int pax_segvguard_global = PAX_SEGVGUARD; static int pax_segvguard_expiry = PAX_SEGVGUARD_EXPIRY; static int pax_segvguard_suspension = PAX_SEGVGUARD_SUSPENSION; static int pax_segvguard_maxcrashes = PAX_SEGVGUARD_MAXCRASHES; struct pax_segvguard_uid_entry { uid_t sue_uid; size_t sue_ncrashes; time_t sue_expiry; time_t sue_suspended; LIST_ENTRY(pax_segvguard_uid_entry) sue_list; }; struct pax_segvguard_entry { LIST_HEAD(, pax_segvguard_uid_entry) segv_uids; }; static bool pax_segvguard_elf_flags_active(uint32_t); #endif /* PAX_SEGVGUARD */ SYSCTL_SETUP(sysctl_security_pax_setup, "sysctl security.pax setup") { const struct sysctlnode *rnode = NULL, *cnode; sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "pax", SYSCTL_DESCR("PaX (exploit mitigation) features."), NULL, 0, NULL, 0, CTL_SECURITY, CTL_CREATE, CTL_EOL); cnode = rnode; #ifdef PAX_MPROTECT rnode = cnode; sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "mprotect", SYSCTL_DESCR("mprotect(2) W^X restrictions."), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "enabled", SYSCTL_DESCR("Restrictions enabled."), NULL, 0, &pax_mprotect_enabled, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "global", SYSCTL_DESCR("When enabled, unless explicitly " "specified, apply restrictions to " "all processes."), NULL, 0, &pax_mprotect_global, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ptrace", SYSCTL_DESCR("When enabled, allow ptrace(2) to " "override mprotect permissions on traced " "processes"), NULL, 0, &pax_mprotect_ptrace, 0, CTL_CREATE, CTL_EOL); #ifdef PAX_MPROTECT_DEBUG sysctl_createv(clog, 
0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "debug", SYSCTL_DESCR("print mprotect changes."), NULL, 0, &pax_mprotect_debug, 0, CTL_CREATE, CTL_EOL); #endif #endif /* PAX_MPROTECT */ #ifdef PAX_SEGVGUARD rnode = cnode; sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "segvguard", SYSCTL_DESCR("PaX segvguard."), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "enabled", SYSCTL_DESCR("segvguard enabled."), NULL, 0, &pax_segvguard_enabled, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "global", SYSCTL_DESCR("segvguard all programs."), NULL, 0, &pax_segvguard_global, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "expiry_timeout", SYSCTL_DESCR("Entry expiry timeout (in seconds)."), NULL, 0, &pax_segvguard_expiry, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "suspend_timeout", SYSCTL_DESCR("Entry suspension timeout (in seconds)."), NULL, 0, &pax_segvguard_suspension, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "max_crashes", SYSCTL_DESCR("Max number of crashes before expiry."), NULL, 0, &pax_segvguard_maxcrashes, 0, CTL_CREATE, CTL_EOL); #endif /* PAX_SEGVGUARD */ #ifdef PAX_ASLR rnode = cnode; sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "aslr", SYSCTL_DESCR("Address Space Layout Randomization."), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "enabled", SYSCTL_DESCR("Restrictions enabled."), NULL, 0, &pax_aslr_enabled, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "global", SYSCTL_DESCR("When enabled, unless explicitly " "specified, apply to all processes."), NULL, 0, &pax_aslr_global, 0, CTL_CREATE, CTL_EOL); #ifdef PAX_ASLR_DEBUG sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "debug", SYSCTL_DESCR("Print ASLR selected addresses."), NULL, 0, &pax_aslr_debug, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "flags", SYSCTL_DESCR("Disable/Enable select ASLR features."), NULL, 0, &pax_aslr_flags, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "rand", SYSCTL_DESCR("Use the given fixed random value"), NULL, 0, &pax_aslr_rand, 0, CTL_CREATE, CTL_EOL); #endif sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "mmap_len", SYSCTL_DESCR("Number of bits randomized for " "mmap(2) calls."), NULL, PAX_ASLR_DELTA_MMAP_LEN, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "stack_len", SYSCTL_DESCR("Number of bits randomized for " "the stack."), NULL, PAX_ASLR_DELTA_STACK_LEN, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "exec_len", SYSCTL_DESCR("Number of bits randomized for " "the PIE exec base."), NULL, PAX_ASLR_DELTA_EXEC_LEN, NULL, 0, CTL_CREATE, CTL_EOL); #endif /* PAX_ASLR */ } /* * Initialize PaX. 
*/ void pax_init(void) { #ifdef PAX_ASLR /* Adjust maximum stack by the size we can consume for ASLR */ maxsmap = MAXSSIZ - (MAXSSIZ / PAX_ASLR_MAX_STACK_WASTE); // XXX: compat32 is not handled. #endif } void pax_set_flags(struct exec_package *epp, struct proc *p) { p->p_pax = epp->ep_pax_flags; #ifdef PAX_MPROTECT if (pax_mprotect_ptrace == 0) return; /* * If we are running under the debugger, turn off MPROTECT so * the debugger can insert/delete breakpoints */ if (p->p_slflag & PSL_TRACED) p->p_pax &= ~P_PAX_MPROTECT; #endif } void pax_setup_elf_flags(struct exec_package *epp, uint32_t elf_flags) { uint32_t flags = 0; #ifdef PAX_ASLR if (pax_aslr_elf_flags_active(elf_flags)) { flags |= P_PAX_ASLR; } #endif #ifdef PAX_MPROTECT if (pax_mprotect_elf_flags_active(elf_flags)) { flags |= P_PAX_MPROTECT; } #endif #ifdef PAX_SEGVGUARD if (pax_segvguard_elf_flags_active(elf_flags)) { flags |= P_PAX_GUARD; } #endif epp->ep_pax_flags = flags; } #if defined(PAX_MPROTECT) || defined(PAX_SEGVGUARD) || defined(PAX_ASLR) static inline bool pax_flags_active(uint32_t flags, uint32_t opt) { if (!(flags & opt)) return false; return true; } #endif /* PAX_MPROTECT || PAX_SEGVGUARD || PAX_ASLR */ #ifdef PAX_MPROTECT static bool pax_mprotect_elf_flags_active(uint32_t flags) { if (!pax_mprotect_enabled) return false; if (pax_mprotect_global && (flags & ELF_NOTE_PAX_NOMPROTECT) != 0) { /* Mprotect explicitly disabled */ return false; } if (!pax_mprotect_global && (flags & ELF_NOTE_PAX_MPROTECT) == 0) { /* Mprotect not requested */ return false; } return true; } vm_prot_t pax_mprotect_maxprotect( #ifdef PAX_MPROTECT_DEBUG const char *file, size_t line, #endif struct lwp *l, vm_prot_t active, vm_prot_t extra, vm_prot_t maxprot) { uint32_t flags; flags = l->l_proc->p_pax; if (!pax_flags_active(flags, P_PAX_MPROTECT)) return maxprot; return (active|extra) & maxprot; } int pax_mprotect_validate( #ifdef PAX_MPROTECT_DEBUG const char *file, size_t line, #endif struct lwp *l, vm_prot_t prot) { uint32_t flags; flags = l->l_proc->p_pax; if (!pax_flags_active(flags, P_PAX_MPROTECT)) return 0; if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) { #ifdef PAX_MPROTECT_DEBUG struct proc *p = l->l_proc; if (pax_mprotect_debug) printf("%s: %s,%zu: %d.%d (%s): WX rejected\n", __func__, file, line, p->p_pid, l->l_lid, p->p_comm); #endif return EACCES; } return 0; } /* * Bypass MPROTECT for traced processes */ int pax_mprotect_prot(struct lwp *l) { uint32_t flags; flags = l->l_proc->p_pax; if (!pax_flags_active(flags, P_PAX_MPROTECT)) return 0; if (pax_mprotect_ptrace < 2) return 0; return UVM_EXTRACT_PROT_ALL; } #endif /* PAX_MPROTECT */ #ifdef PAX_ASLR static bool pax_aslr_elf_flags_active(uint32_t flags) { if (!pax_aslr_enabled) return false; if (pax_aslr_global && (flags & ELF_NOTE_PAX_NOASLR) != 0) { /* ASLR explicitly disabled */ return false; } if (!pax_aslr_global && (flags & ELF_NOTE_PAX_ASLR) == 0) { /* ASLR not requested */ return false; } return true; } static bool pax_aslr_epp_active(struct exec_package *epp) { if (__predict_false((epp->ep_flags & (EXEC_32|EXEC_TOPDOWN_VM)) == 0)) return false; return pax_flags_active(epp->ep_pax_flags, P_PAX_ASLR); } static bool pax_aslr_active(struct lwp *l) { return pax_flags_active(l->l_proc->p_pax, P_PAX_ASLR); } void pax_aslr_init_vm(struct lwp *l, struct vmspace *vm, struct exec_package *ep) { if (!pax_aslr_active(l)) return; if (__predict_false((ep->ep_flags & (EXEC_32|EXEC_TOPDOWN_VM)) == 0)) return; #ifdef PAX_ASLR_DEBUG if (pax_aslr_flags & 
PAX_ASLR_MMAP) return; #endif uint32_t len = (ep->ep_flags & EXEC_32) ? PAX_ASLR_DELTA_MMAP_LEN32 : PAX_ASLR_DELTA_MMAP_LEN; uint32_t rand = cprng_fast32(); #ifdef PAX_ASLR_DEBUG if (pax_aslr_flags & PAX_ASLR_FIXED) rand = pax_aslr_rand; #endif vm->vm_aslr_delta_mmap = PAX_ASLR_DELTA(rand, PAX_ASLR_DELTA_MMAP_LSB, len); PAX_DPRINTF("delta_mmap=%#jx/%u", (uintmax_t)vm->vm_aslr_delta_mmap, len); } void pax_aslr_mmap(struct lwp *l, vaddr_t *addr, vaddr_t orig_addr, int f) { if (!pax_aslr_active(l)) return; #ifdef PAX_ASLR_DEBUG char buf[256]; if (pax_aslr_flags & PAX_ASLR_MMAP) return; if (pax_aslr_debug) snprintb(buf, sizeof(buf), MAP_FMT, f); else buf[0] = '\0'; #endif if (!(f & MAP_FIXED) && ((orig_addr == 0) || !(f & MAP_ANON))) { PAX_DPRINTF("applying to %#jx orig_addr=%#jx f=%s", (uintmax_t)*addr, (uintmax_t)orig_addr, buf); if (!(l->l_proc->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN)) *addr += l->l_proc->p_vmspace->vm_aslr_delta_mmap; else *addr -= l->l_proc->p_vmspace->vm_aslr_delta_mmap; PAX_DPRINTF("result %#jx", (uintmax_t)*addr); } else { PAX_DPRINTF("not applying to %#jx orig_addr=%#jx f=%s", (uintmax_t)*addr, (uintmax_t)orig_addr, buf); } } static vaddr_t pax_aslr_offset(vaddr_t align) { size_t pax_align, l2, delta; uint32_t rand; vaddr_t offset; pax_align = align == 0 ? PAGE_SIZE : align; l2 = ilog2(pax_align); rand = cprng_fast32(); #ifdef PAX_ASLR_DEBUG if (pax_aslr_flags & PAX_ASLR_FIXED) rand = pax_aslr_rand; #endif #define PAX_TRUNC(a, b) ((a) & ~((b) - 1)) delta = PAX_ASLR_DELTA(rand, l2, PAX_ASLR_DELTA_EXEC_LEN); offset = PAX_TRUNC(delta, pax_align); offset = MAX(offset, pax_align); PAX_DPRINTF("rand=%#x l2=%#zx pax_align=%#zx delta=%#zx offset=%#jx", rand, l2, pax_align, delta, (uintmax_t)offset); return offset; } vaddr_t pax_aslr_exec_offset(struct exec_package *epp, vaddr_t align) { if (!pax_aslr_epp_active(epp)) goto out; #ifdef PAX_ASLR_DEBUG if (pax_aslr_flags & PAX_ASLR_EXEC_OFFSET) goto out; #endif return pax_aslr_offset(align); out: return MAX(align, PAGE_SIZE); } voff_t pax_aslr_rtld_offset(struct exec_package *epp, vaddr_t align, int use_topdown) { voff_t offset; if (!pax_aslr_epp_active(epp)) return 0; #ifdef PAX_ASLR_DEBUG if (pax_aslr_flags & PAX_ASLR_RTLD_OFFSET) return 0; #endif offset = pax_aslr_offset(align); if (use_topdown) offset = -offset; return offset; } void pax_aslr_stack(struct exec_package *epp, vsize_t *max_stack_size) { if (!pax_aslr_epp_active(epp)) return; #ifdef PAX_ASLR_DEBUG if (pax_aslr_flags & PAX_ASLR_STACK) return; #endif uint32_t len = (epp->ep_flags & EXEC_32) ? 
PAX_ASLR_DELTA_STACK_LEN32 : PAX_ASLR_DELTA_STACK_LEN; uint32_t rand = cprng_fast32(); #ifdef PAX_ASLR_DEBUG if (pax_aslr_flags & PAX_ASLR_FIXED) rand = pax_aslr_rand; #endif u_long d = PAX_ASLR_DELTA(rand, PAX_ASLR_DELTA_STACK_LSB, len); d &= (*max_stack_size / PAX_ASLR_MAX_STACK_WASTE) - 1; u_long newminsaddr = (u_long)STACK_GROW(epp->ep_minsaddr, d); PAX_DPRINTF("old minsaddr=%#jx delta=%#lx new minsaddr=%#lx", (uintmax_t)epp->ep_minsaddr, d, newminsaddr); epp->ep_minsaddr = (vaddr_t)newminsaddr; *max_stack_size -= d; } uint32_t pax_aslr_stack_gap(struct exec_package *epp) { if (!pax_aslr_epp_active(epp)) return 0; #ifdef PAX_ASLR_DEBUG if (pax_aslr_flags & PAX_ASLR_STACK_GAP) return 0; #endif uint32_t rand = cprng_fast32(); #ifdef PAX_ASLR_DEBUG if (pax_aslr_flags & PAX_ASLR_FIXED) rand = pax_aslr_rand; #endif rand %= PAGE_SIZE; PAX_DPRINTF("stack gap=%#x\n", rand); return rand; } #endif /* PAX_ASLR */ #ifdef PAX_SEGVGUARD static bool pax_segvguard_elf_flags_active(uint32_t flags) { if (!pax_segvguard_enabled) return false; if (pax_segvguard_global && (flags & ELF_NOTE_PAX_NOGUARD) != 0) { /* Segvguard explicitly disabled */ return false; } if (!pax_segvguard_global && (flags & ELF_NOTE_PAX_GUARD) == 0) { /* Segvguard not requested */ return false; } return true; } void pax_segvguard_cleanup(struct vnode *vp) { struct pax_segvguard_entry *p = vp->v_segvguard; struct pax_segvguard_uid_entry *up; if (__predict_true(p == NULL)) { return; } while ((up = LIST_FIRST(&p->segv_uids)) != NULL) { LIST_REMOVE(up, sue_list); kmem_free(up, sizeof(*up)); } kmem_free(p, sizeof(*p)); vp->v_segvguard = NULL; } /* * Called when a process of image vp generated a segfault. * * => exec_lock must be held by the caller * => if "crashed" is true, exec_lock must be held for write */ int pax_segvguard(struct lwp *l, struct vnode *vp, const char *name, bool crashed) { struct pax_segvguard_entry *p; struct pax_segvguard_uid_entry *up; struct timeval tv; uid_t uid; uint32_t flags; bool have_uid; KASSERT(rw_lock_held(&exec_lock)); KASSERT(!crashed || rw_write_held(&exec_lock)); flags = l->l_proc->p_pax; if (!pax_flags_active(flags, P_PAX_GUARD)) return 0; if (vp == NULL) return EFAULT; /* Fast-path if starting a program we don't know. */ if ((p = vp->v_segvguard) == NULL && !crashed) return 0; microtime(&tv); /* * If a program we don't know crashed, we need to create a new entry * for it. */ if (p == NULL) { p = kmem_alloc(sizeof(*p), KM_SLEEP); vp->v_segvguard = p; LIST_INIT(&p->segv_uids); /* * Initialize a new entry with "crashes so far" of 1. * The expiry time is when we purge the entry if it didn't * reach the limit. */ up = kmem_alloc(sizeof(*up), KM_SLEEP); up->sue_uid = kauth_cred_getuid(l->l_cred); up->sue_ncrashes = 1; up->sue_expiry = tv.tv_sec + pax_segvguard_expiry; up->sue_suspended = 0; LIST_INSERT_HEAD(&p->segv_uids, up, sue_list); return 0; } /* * A program we "know" either executed or crashed again. * See if it's a culprit we're familiar with. */ uid = kauth_cred_getuid(l->l_cred); have_uid = false; LIST_FOREACH(up, &p->segv_uids, sue_list) { if (up->sue_uid == uid) { have_uid = true; break; } } /* * It's someone else. Add an entry for him if we crashed. */ if (!have_uid) { if (crashed) { up = kmem_alloc(sizeof(*up), KM_SLEEP); up->sue_uid = uid; up->sue_ncrashes = 1; up->sue_expiry = tv.tv_sec + pax_segvguard_expiry; up->sue_suspended = 0; LIST_INSERT_HEAD(&p->segv_uids, up, sue_list); } return 0; } if (crashed) { /* Check if timer on previous crashes expired first. 
*/ if (up->sue_expiry < tv.tv_sec) { log(LOG_INFO, "PaX Segvguard: [%s] Suspension" " expired.\n", name ? name : "unknown"); up->sue_ncrashes = 1; up->sue_expiry = tv.tv_sec + pax_segvguard_expiry; up->sue_suspended = 0; return 0; } up->sue_ncrashes++; if (up->sue_ncrashes >= pax_segvguard_maxcrashes) { log(LOG_ALERT, "PaX Segvguard: [%s] Suspending " "execution for %d seconds after %zu crashes.\n", name ? name : "unknown", pax_segvguard_suspension, up->sue_ncrashes); /* Suspend this program for a while. */ up->sue_suspended = tv.tv_sec + pax_segvguard_suspension; up->sue_ncrashes = 0; up->sue_expiry = 0; } } else { /* Are we supposed to be suspended? */ if (up->sue_suspended > tv.tv_sec) { log(LOG_ALERT, "PaX Segvguard: [%s] Preventing " "execution due to repeated segfaults.\n", name ? name : "unknown"); return EPERM; } } return 0; } #endif /* PAX_SEGVGUARD */
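/*
 * Example (added for illustration; not part of kern_pax.c): the mmap ASLR
 * delta computation used by pax_aslr_init_vm() above, as a standalone sketch.
 * PAX_ASLR_DELTA() is defined in <sys/pax.h>, not in this file; the
 * mask-and-shift form below is an assumption made for the example, as is the
 * fixed "random" value standing in for cprng_fast32().
 */
#include <stdint.h>
#include <stdio.h>

#define EX_PGSHIFT              12      /* assume 4 KiB pages */
#define EX_DELTA_MMAP_LSB       EX_PGSHIFT
#define EX_DELTA_MMAP_LEN       ((sizeof(void *) * 8) / 2)      /* 32 on LP64 */

/* Keep the low `len' bits of the random value, then shift them up by `lsb'. */
#define EX_ASLR_DELTA(rand, lsb, len) \
        ((((uint64_t)(rand)) & ((1ULL << (len)) - 1)) << (lsb))

int
main(void)
{
        uint32_t rand = 0x12345678;     /* stand-in for cprng_fast32() */
        uint64_t delta = EX_ASLR_DELTA(rand, EX_DELTA_MMAP_LSB,
            EX_DELTA_MMAP_LEN);

        /* The delta is page-aligned and strictly less than 2^(lsb+len). */
        printf("delta_mmap = %#llx\n", (unsigned long long)delta);
        return 0;
}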
/* $NetBSD: subr_log.c,v 1.63 2022/10/26 23:28:30 riastradh Exp $ */ /*- * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_log.c 8.3 (Berkeley) 2/14/95 */ /* * Error log buffer for kernel printf's. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_log.c,v 1.63 2022/10/26 23:28:30 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/ioctl.h> #include <sys/msgbuf.h> #include <sys/file.h> #include <sys/syslog.h> #include <sys/conf.h> #include <sys/select.h> #include <sys/poll.h> #include <sys/intr.h> #include <sys/sysctl.h> #include <sys/ktrace.h> static int sysctl_msgbuf(SYSCTLFN_PROTO); static void logsoftintr(void *); static bool log_async; static struct selinfo log_selp; /* process waiting on select call */ static pid_t log_pgid; /* process/group for async I/O */ static kcondvar_t log_cv; static void *log_sih; static kmutex_t log_lock; int log_open; /* also used in log() */ int msgbufmapped; /* is the message buffer mapped */ int msgbufenabled; /* is logging to the buffer enabled */ struct kern_msgbuf *msgbufp; /* the mapped buffer, itself. */ void initmsgbuf(void *bf, size_t bufsize) { struct kern_msgbuf *mbp; long new_bufs; /* Sanity-check the given size. 
*/ if (bufsize < sizeof(struct kern_msgbuf)) return; mbp = msgbufp = (struct kern_msgbuf *)bf; new_bufs = bufsize - offsetof(struct kern_msgbuf, msg_bufc); if ((mbp->msg_magic != MSG_MAGIC) || (mbp->msg_bufs != new_bufs) || (mbp->msg_bufr < 0) || (mbp->msg_bufr >= mbp->msg_bufs) || (mbp->msg_bufx < 0) || (mbp->msg_bufx >= mbp->msg_bufs)) { /* * If the buffer magic number is wrong, has changed * size (which shouldn't happen often), or is * internally inconsistent, initialize it. */ memset(bf, 0, bufsize); mbp->msg_magic = MSG_MAGIC; mbp->msg_bufs = new_bufs; } /* mark it as ready for use. */ msgbufmapped = msgbufenabled = 1; } void loginit(void) { mutex_init(&log_lock, MUTEX_DEFAULT, IPL_VM); selinit(&log_selp); cv_init(&log_cv, "klog"); log_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE, logsoftintr, NULL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "msgbufsize", SYSCTL_DESCR("Size of the kernel message buffer"), sysctl_msgbuf, 0, NULL, 0, CTL_KERN, KERN_MSGBUFSIZE, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "msgbuf", SYSCTL_DESCR("Kernel message buffer"), sysctl_msgbuf, 0, NULL, 0, CTL_KERN, KERN_MSGBUF, CTL_EOL); } /*ARGSUSED*/ static int logopen(dev_t dev, int flags, int mode, struct lwp *l) { struct kern_msgbuf *mbp = msgbufp; int error = 0; mutex_spin_enter(&log_lock); if (log_open) { error = EBUSY; } else { log_open = 1; log_pgid = l->l_proc->p_pid; /* signal process only */ /* * The message buffer is initialized during system * configuration. If it's been clobbered, note that * and return an error. (This allows a user to read * the buffer via /dev/kmem, and try to figure out * what clobbered it. */ if (mbp->msg_magic != MSG_MAGIC) { msgbufenabled = 0; error = ENXIO; } } mutex_spin_exit(&log_lock); return error; } /*ARGSUSED*/ static int logclose(dev_t dev, int flag, int mode, struct lwp *l) { mutex_spin_enter(&log_lock); log_pgid = 0; log_open = 0; log_async = 0; mutex_spin_exit(&log_lock); return 0; } /*ARGSUSED*/ static int logread(dev_t dev, struct uio *uio, int flag) { struct kern_msgbuf *mbp = msgbufp; long l; int error = 0; mutex_spin_enter(&log_lock); while (mbp->msg_bufr == mbp->msg_bufx) { if (flag & IO_NDELAY) { mutex_spin_exit(&log_lock); return EWOULDBLOCK; } error = cv_wait_sig(&log_cv, &log_lock); if (error) { mutex_spin_exit(&log_lock); return error; } } while (uio->uio_resid > 0) { l = mbp->msg_bufx - mbp->msg_bufr; if (l < 0) l = mbp->msg_bufs - mbp->msg_bufr; l = uimin(l, uio->uio_resid); if (l == 0) break; mutex_spin_exit(&log_lock); error = uiomove(&mbp->msg_bufc[mbp->msg_bufr], (int)l, uio); mutex_spin_enter(&log_lock); if (error) break; mbp->msg_bufr += l; if (mbp->msg_bufr < 0 || mbp->msg_bufr >= mbp->msg_bufs) mbp->msg_bufr = 0; } mutex_spin_exit(&log_lock); return error; } /*ARGSUSED*/ static int logpoll(dev_t dev, int events, struct lwp *l) { int revents = 0; if (events & (POLLIN | POLLRDNORM)) { mutex_spin_enter(&log_lock); if (msgbufp->msg_bufr != msgbufp->msg_bufx) revents |= events & (POLLIN | POLLRDNORM); else selrecord(l, &log_selp); mutex_spin_exit(&log_lock); } return revents; } static void filt_logrdetach(struct knote *kn) { mutex_spin_enter(&log_lock); selremove_knote(&log_selp, kn); mutex_spin_exit(&log_lock); } static int filt_logread(struct knote *kn, long hint) { int rv; if ((hint & NOTE_SUBMIT) == 0) mutex_spin_enter(&log_lock); if (msgbufp->msg_bufr == msgbufp->msg_bufx) { rv = 0; } else if (msgbufp->msg_bufr < msgbufp->msg_bufx) { kn->kn_data = msgbufp->msg_bufx - 
msgbufp->msg_bufr; rv = 1; } else { kn->kn_data = (msgbufp->msg_bufs - msgbufp->msg_bufr) + msgbufp->msg_bufx; rv = 1; } if ((hint & NOTE_SUBMIT) == 0) mutex_spin_exit(&log_lock); return rv; } static const struct filterops logread_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_logrdetach, .f_event = filt_logread, }; static int logkqfilter(dev_t dev, struct knote *kn) { switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &logread_filtops; mutex_spin_enter(&log_lock); selrecord_knote(&log_selp, kn); mutex_spin_exit(&log_lock); break; default: return (EINVAL); } return (0); } void logwakeup(void) { if (!cold && log_open) { mutex_spin_enter(&log_lock); selnotify(&log_selp, 0, NOTE_SUBMIT); if (log_async) softint_schedule(log_sih); cv_broadcast(&log_cv); mutex_spin_exit(&log_lock); } } static void logsoftintr(void *cookie) { pid_t pid; if ((pid = log_pgid) != 0) fownsignal(pid, SIGIO, 0, 0, NULL); } /*ARGSUSED*/ static int logioctl(dev_t dev, u_long com, void *data, int flag, struct lwp *lwp) { long l; switch (com) { /* return number of characters immediately available */ case FIONREAD: mutex_spin_enter(&log_lock); l = msgbufp->msg_bufx - msgbufp->msg_bufr; if (l < 0) l += msgbufp->msg_bufs; mutex_spin_exit(&log_lock); *(int *)data = l; break; case FIONBIO: break; case FIOASYNC: /* No locking needed, 'thread private'. */ log_async = (*((int *)data) != 0); break; case TIOCSPGRP: case FIOSETOWN: return fsetown(&log_pgid, com, data); case TIOCGPGRP: case FIOGETOWN: return fgetown(log_pgid, com, data); default: return (EPASSTHROUGH); } return (0); } static void logskip(struct kern_msgbuf *mbp) { /* * Move forward read pointer to the next line * in the buffer. Note that the buffer is * a ring buffer so we should reset msg_bufr * to 0 when msg_bufr exceeds msg_bufs. * * To prevent to loop forever, give up if we * cannot find a newline in mbp->msg_bufs * characters (the max size of the buffer). */ for (int i = 0; i < mbp->msg_bufs; i++) { char c0 = mbp->msg_bufc[mbp->msg_bufr]; if (++mbp->msg_bufr >= mbp->msg_bufs) mbp->msg_bufr = 0; if (c0 == '\n') break; } } static void logaddchar(struct kern_msgbuf *mbp, int c) { mbp->msg_bufc[mbp->msg_bufx++] = c; if (mbp->msg_bufx < 0 || mbp->msg_bufx >= mbp->msg_bufs) mbp->msg_bufx = 0; /* If the buffer is full, keep the most recent data. */ if (mbp->msg_bufr == mbp->msg_bufx) logskip(mbp); } void logputchar(int c) { struct kern_msgbuf *mbp; if (!cold) mutex_spin_enter(&log_lock); if (!msgbufenabled) goto out; mbp = msgbufp; if (mbp->msg_magic != MSG_MAGIC) { /* * Arguably should panic or somehow notify the * user... but how? Panic may be too drastic, * and would obliterate the message being kicked * out (maybe a panic itself), and printf * would invoke us recursively. Silently punt * for now. If syslog is running, it should * notice. */ msgbufenabled = 0; goto out; } logaddchar(mbp, c); out: if (!cold) mutex_spin_exit(&log_lock); } /* * sysctl helper routine for kern.msgbufsize and kern.msgbuf. For the * former it merely checks the message buffer is set up. For the latter, * it also copies out the data if necessary. 
*/ static int sysctl_msgbuf(SYSCTLFN_ARGS) { char *where = oldp; size_t len, maxlen; long beg, end; int error; if (!logenabled(msgbufp)) { msgbufenabled = 0; return (ENXIO); } switch (rnode->sysctl_num) { case KERN_MSGBUFSIZE: { struct sysctlnode node = *rnode; int msg_bufs = (int)msgbufp->msg_bufs; node.sysctl_data = &msg_bufs; return (sysctl_lookup(SYSCTLFN_CALL(&node))); } case KERN_MSGBUF: break; default: return (EOPNOTSUPP); } if (newp != NULL) return (EPERM); if (oldp == NULL) { /* always return full buffer size */ *oldlenp = msgbufp->msg_bufs; return (0); } sysctl_unlock(); /* * First, copy from the write pointer to the end of * message buffer. */ error = 0; mutex_spin_enter(&log_lock); maxlen = MIN(msgbufp->msg_bufs, *oldlenp); beg = msgbufp->msg_bufx; end = msgbufp->msg_bufs; mutex_spin_exit(&log_lock); while (maxlen > 0) { len = MIN(end - beg, maxlen); if (len == 0) break; /* XXX unlocked, but hardly matters. */ error = copyout(&msgbufp->msg_bufc[beg], where, len); ktrmibio(-1, UIO_READ, where, len, error); if (error) break; where += len; maxlen -= len; /* * ... then, copy from the beginning of message buffer to * the write pointer. */ beg = 0; end = msgbufp->msg_bufx; } sysctl_relock(); return (error); } const struct cdevsw log_cdevsw = { .d_open = logopen, .d_close = logclose, .d_read = logread, .d_write = nowrite, .d_ioctl = logioctl, .d_stop = nostop, .d_tty = notty, .d_poll = logpoll, .d_mmap = nommap, .d_kqfilter = logkqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE };
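/*
 * Example (added for illustration; not part of subr_log.c): reading the
 * kernel message buffer through the kern.msgbuf sysctl handled by
 * sysctl_msgbuf() above, from userland, much like dmesg(8).  As the handler
 * documents, a query with a NULL oldp returns the full buffer size first;
 * error handling is kept minimal here.
 */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        int mib[2] = { CTL_KERN, KERN_MSGBUF };
        size_t len = 0;
        char *buf;

        /* First call: oldp == NULL, just fetch the buffer size. */
        if (sysctl(mib, 2, NULL, &len, NULL, 0) == -1)
                return 1;

        buf = malloc(len);
        if (buf == NULL)
                return 1;

        /* Second call: copy out the message buffer contents. */
        if (sysctl(mib, 2, buf, &len, NULL, 0) == -1)
                return 1;

        fwrite(buf, 1, len, stdout);
        free(buf);
        return 0;
}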
/* $NetBSD: spectre.c,v 1.36 2021/10/07 12:52:27 msaitoh Exp $ */ /* * Copyright (c) 2018-2019 NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Maxime Villard. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Mitigations for the SpectreV2, SpectreV4, MDS and TAA CPU flaws.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: spectre.c,v 1.36 2021/10/07 12:52:27 msaitoh Exp $"); #include "opt_spectre.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/cpu.h> #include <sys/sysctl.h> #include <sys/xcall.h> #include <machine/cpufunc.h> #include <machine/cpuvar.h> #include <machine/specialreg.h> #include <machine/frameasm.h> #include <x86/cputypes.h> enum v2_mitigation { V2_MITIGATION_NONE, V2_MITIGATION_AMD_DIS_IND, V2_MITIGATION_INTEL_IBRS, V2_MITIGATION_INTEL_ENHANCED_IBRS }; enum v4_mitigation { V4_MITIGATION_NONE, V4_MITIGATION_INTEL_SSBD, V4_MITIGATION_INTEL_SSB_NO, V4_MITIGATION_AMD_SSB_NO, V4_MITIGATION_AMD_NONARCH_F15H, V4_MITIGATION_AMD_NONARCH_F16H, V4_MITIGATION_AMD_NONARCH_F17H }; static enum v2_mitigation v2_mitigation_method = V2_MITIGATION_NONE; static enum v4_mitigation v4_mitigation_method = V4_MITIGATION_NONE; static bool v2_mitigation_enabled __read_mostly = false; static bool v4_mitigation_enabled __read_mostly = false; static char v2_mitigation_name[64] = "(none)"; static char v4_mitigation_name[64] = "(none)"; /* --------------------------------------------------------------------- */ static void v2_set_name(void) { char name[64] = ""; size_t nmitig = 0; #if defined(SPECTRE_V2_GCC_MITIGATION) strlcat(name, "[GCC retpoline]", sizeof(name)); nmitig++; #endif if (!v2_mitigation_enabled) { if (nmitig == 0) strlcat(name, "(none)", sizeof(name)); } else { if (nmitig) strlcat(name, " + ", sizeof(name)); switch (v2_mitigation_method) { case V2_MITIGATION_AMD_DIS_IND: strlcat(name, "[AMD DIS_IND]", sizeof(name)); break; case V2_MITIGATION_INTEL_IBRS: strlcat(name, "[Intel IBRS]", sizeof(name)); break; case V2_MITIGATION_INTEL_ENHANCED_IBRS: strlcat(name, "[Intel Enhanced IBRS]", sizeof(name)); break; default: panic("%s: impossible", __func__); } } strlcpy(v2_mitigation_name, name, sizeof(v2_mitigation_name)); } static void v2_detect_method(void) { struct cpu_info *ci = curcpu(); u_int descs[4]; uint64_t msr; if (cpu_vendor == CPUVENDOR_INTEL) { if (cpuid_level >= 7) { x86_cpuid(7, descs); if (descs[3] & CPUID_SEF_IBRS) { if (descs[3] & CPUID_SEF_ARCH_CAP) { msr = rdmsr(MSR_IA32_ARCH_CAPABILITIES); if (msr & IA32_ARCH_IBRS_ALL) { v2_mitigation_method = V2_MITIGATION_INTEL_ENHANCED_IBRS; return; } } #ifdef __x86_64__ v2_mitigation_method = V2_MITIGATION_INTEL_IBRS; return; #endif } } v2_mitigation_method = V2_MITIGATION_NONE; } else if (cpu_vendor == CPUVENDOR_AMD) { /* * The AMD Family 10h manual documents the IC_CFG.DIS_IND bit. * This bit disables the Indirect Branch Predictor. * * Families 12h and 16h are believed to have this bit too, but * their manuals don't document it. */ switch (CPUID_TO_FAMILY(ci->ci_signature)) { case 0x10: v2_mitigation_method = V2_MITIGATION_AMD_DIS_IND; break; default: v2_mitigation_method = V2_MITIGATION_NONE; break; } } else { v2_mitigation_method = V2_MITIGATION_NONE; } } /* -------------------------------------------------------------------------- */ static volatile unsigned long ibrs_cpu_barrier1 __cacheline_aligned; static volatile unsigned long ibrs_cpu_barrier2 __cacheline_aligned; #ifdef __x86_64__ /* IBRS_ENTER. 
*/ extern uint8_t noibrs_enter, noibrs_enter_end; extern uint8_t ibrs_enter, ibrs_enter_end; static const struct x86_hotpatch_source hp_noibrs_enter_source = { .saddr = &noibrs_enter, .eaddr = &noibrs_enter_end }; static const struct x86_hotpatch_source hp_ibrs_enter_source = { .saddr = &ibrs_enter, .eaddr = &ibrs_enter_end }; static const struct x86_hotpatch_descriptor hp_ibrs_enter_desc = { .name = HP_NAME_IBRS_ENTER, .nsrc = 2, .srcs = { &hp_noibrs_enter_source, &hp_ibrs_enter_source } }; __link_set_add_rodata(x86_hotpatch_descriptors, hp_ibrs_enter_desc); /* IBRS_LEAVE. */ extern uint8_t noibrs_leave, noibrs_leave_end; extern uint8_t ibrs_leave, ibrs_leave_end; static const struct x86_hotpatch_source hp_noibrs_leave_source = { .saddr = &noibrs_leave, .eaddr = &noibrs_leave_end }; static const struct x86_hotpatch_source hp_ibrs_leave_source = { .saddr = &ibrs_leave, .eaddr = &ibrs_leave_end }; static const struct x86_hotpatch_descriptor hp_ibrs_leave_desc = { .name = HP_NAME_IBRS_LEAVE, .nsrc = 2, .srcs = { &hp_noibrs_leave_source, &hp_ibrs_leave_source } }; __link_set_add_rodata(x86_hotpatch_descriptors, hp_ibrs_leave_desc); static void ibrs_disable_hotpatch(void) { x86_hotpatch(HP_NAME_IBRS_ENTER, /* noibrs */ 0); x86_hotpatch(HP_NAME_IBRS_LEAVE, /* noibrs */ 0); } static void ibrs_enable_hotpatch(void) { x86_hotpatch(HP_NAME_IBRS_ENTER, /* ibrs */ 1); x86_hotpatch(HP_NAME_IBRS_LEAVE, /* ibrs */ 1); } #else /* IBRS not supported on i386 */ static void ibrs_disable_hotpatch(void) { panic("%s: impossible", __func__); } static void ibrs_enable_hotpatch(void) { panic("%s: impossible", __func__); } #endif /* -------------------------------------------------------------------------- */ static void mitigation_v2_apply_cpu(struct cpu_info *ci, bool enabled) { uint64_t msr; switch (v2_mitigation_method) { case V2_MITIGATION_NONE: panic("impossible"); case V2_MITIGATION_INTEL_IBRS: /* cpu0 is the one that does the hotpatch job */ if (ci == &cpu_info_primary) { if (enabled) { ibrs_enable_hotpatch(); } else { ibrs_disable_hotpatch(); } } if (!enabled) { wrmsr(MSR_IA32_SPEC_CTRL, 0); } break; case V2_MITIGATION_INTEL_ENHANCED_IBRS: msr = rdmsr(MSR_IA32_SPEC_CTRL); if (enabled) { msr |= IA32_SPEC_CTRL_IBRS; } else { msr &= ~IA32_SPEC_CTRL_IBRS; } wrmsr(MSR_IA32_SPEC_CTRL, msr); break; case V2_MITIGATION_AMD_DIS_IND: msr = rdmsr(MSR_IC_CFG); if (enabled) { msr |= IC_CFG_DIS_IND; } else { msr &= ~IC_CFG_DIS_IND; } wrmsr(MSR_IC_CFG, msr); break; } } /* * Note: IBRS requires hotpatching, so we need barriers. */ static void mitigation_v2_change_cpu(void *arg1, void *arg2) { struct cpu_info *ci = curcpu(); bool enabled = arg1 != NULL; u_long psl = 0; /* Rendez-vous 1 (IBRS only). */ if (v2_mitigation_method == V2_MITIGATION_INTEL_IBRS) { psl = x86_read_psl(); x86_disable_intr(); atomic_dec_ulong(&ibrs_cpu_barrier1); while (atomic_cas_ulong(&ibrs_cpu_barrier1, 0, 0) != 0) { x86_pause(); } } mitigation_v2_apply_cpu(ci, enabled); /* Rendez-vous 2 (IBRS only). */ if (v2_mitigation_method == V2_MITIGATION_INTEL_IBRS) { atomic_dec_ulong(&ibrs_cpu_barrier2); while (atomic_cas_ulong(&ibrs_cpu_barrier2, 0, 0) != 0) { x86_pause(); } /* Write back and invalidate cache, flush pipelines. */ wbinvd(); x86_flush(); x86_write_psl(psl); } } static int mitigation_v2_change(bool enabled) { uint64_t xc; v2_detect_method(); switch (v2_mitigation_method) { case V2_MITIGATION_NONE: printf("[!] 
No mitigation available\n"); return EOPNOTSUPP; case V2_MITIGATION_AMD_DIS_IND: case V2_MITIGATION_INTEL_IBRS: case V2_MITIGATION_INTEL_ENHANCED_IBRS: /* Initialize the barriers */ ibrs_cpu_barrier1 = ncpu; ibrs_cpu_barrier2 = ncpu; printf("[+] %s SpectreV2 Mitigation...", enabled ? "Enabling" : "Disabling"); xc = xc_broadcast(XC_HIGHPRI, mitigation_v2_change_cpu, (void *)enabled, NULL); xc_wait(xc); printf(" done!\n"); v2_mitigation_enabled = enabled; v2_set_name(); return 0; default: panic("impossible"); } } static int sysctl_machdep_spectreV2_mitigated(SYSCTLFN_ARGS) { struct sysctlnode node; int error; bool val; val = *(bool *)rnode->sysctl_data; node = *rnode; node.sysctl_data = &val; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error != 0 || newp == NULL) return error; if (val == v2_mitigation_enabled) return 0; return mitigation_v2_change(val); } /* -------------------------------------------------------------------------- */ static void v4_set_name(void) { char name[64] = ""; if (!v4_mitigation_enabled) { strlcat(name, "(none)", sizeof(name)); } else { switch (v4_mitigation_method) { case V4_MITIGATION_NONE: panic("%s: impossible", __func__); case V4_MITIGATION_INTEL_SSBD: strlcat(name, "[Intel SSBD]", sizeof(name)); break; case V4_MITIGATION_INTEL_SSB_NO: strlcat(name, "[Intel SSB_NO]", sizeof(name)); break; case V4_MITIGATION_AMD_SSB_NO: strlcat(name, "[AMD SSB_NO]", sizeof(name)); break; case V4_MITIGATION_AMD_NONARCH_F15H: case V4_MITIGATION_AMD_NONARCH_F16H: case V4_MITIGATION_AMD_NONARCH_F17H: strlcat(name, "[AMD NONARCH]", sizeof(name)); break; } } strlcpy(v4_mitigation_name, name, sizeof(v4_mitigation_name)); } static void v4_detect_method(void) { struct cpu_info *ci = curcpu(); u_int descs[4]; uint64_t msr; if (cpu_vendor == CPUVENDOR_INTEL) { if (cpu_info_primary.ci_feat_val[7] & CPUID_SEF_ARCH_CAP) { msr = rdmsr(MSR_IA32_ARCH_CAPABILITIES); if (msr & IA32_ARCH_SSB_NO) { /* Not vulnerable to SpectreV4. */ v4_mitigation_method = V4_MITIGATION_INTEL_SSB_NO; return; } } if (cpuid_level >= 7) { x86_cpuid(7, descs); if (descs[3] & CPUID_SEF_SSBD) { /* descs[3] = %edx */ v4_mitigation_method = V4_MITIGATION_INTEL_SSBD; return; } } } else if (cpu_vendor == CPUVENDOR_AMD) { switch (CPUID_TO_FAMILY(ci->ci_signature)) { case 0x15: v4_mitigation_method = V4_MITIGATION_AMD_NONARCH_F15H; return; case 0x16: v4_mitigation_method = V4_MITIGATION_AMD_NONARCH_F16H; return; case 0x17: v4_mitigation_method = V4_MITIGATION_AMD_NONARCH_F17H; return; default: if (cpu_info_primary.ci_max_ext_cpuid < 0x80000008) { break; } x86_cpuid(0x80000008, descs); if (descs[1] & CPUID_CAPEX_SSB_NO) { /* Not vulnerable to SpectreV4. 
*/ v4_mitigation_method = V4_MITIGATION_AMD_SSB_NO; return; } break; } } v4_mitigation_method = V4_MITIGATION_NONE; } static void mitigation_v4_apply_cpu(bool enabled) { uint64_t msr, msrval = 0, msrbit = 0; switch (v4_mitigation_method) { case V4_MITIGATION_NONE: case V4_MITIGATION_INTEL_SSB_NO: case V4_MITIGATION_AMD_SSB_NO: panic("impossible"); case V4_MITIGATION_INTEL_SSBD: msrval = MSR_IA32_SPEC_CTRL; msrbit = IA32_SPEC_CTRL_SSBD; break; case V4_MITIGATION_AMD_NONARCH_F15H: msrval = MSR_LS_CFG; msrbit = LS_CFG_DIS_SSB_F15H; break; case V4_MITIGATION_AMD_NONARCH_F16H: msrval = MSR_LS_CFG; msrbit = LS_CFG_DIS_SSB_F16H; break; case V4_MITIGATION_AMD_NONARCH_F17H: msrval = MSR_LS_CFG; msrbit = LS_CFG_DIS_SSB_F17H; break; } msr = rdmsr(msrval); if (enabled) { msr |= msrbit; } else { msr &= ~msrbit; } wrmsr(msrval, msr); } static void mitigation_v4_change_cpu(void *arg1, void *arg2) { bool enabled = arg1 != NULL; mitigation_v4_apply_cpu(enabled); } static int mitigation_v4_change(bool enabled) { uint64_t xc; v4_detect_method(); switch (v4_mitigation_method) { case V4_MITIGATION_NONE: printf("[!] No mitigation available\n"); return EOPNOTSUPP; case V4_MITIGATION_INTEL_SSBD: case V4_MITIGATION_AMD_NONARCH_F15H: case V4_MITIGATION_AMD_NONARCH_F16H: case V4_MITIGATION_AMD_NONARCH_F17H: printf("[+] %s SpectreV4 Mitigation...", enabled ? "Enabling" : "Disabling"); xc = xc_broadcast(0, mitigation_v4_change_cpu, (void *)enabled, NULL); xc_wait(xc); printf(" done!\n"); v4_mitigation_enabled = enabled; v4_set_name(); return 0; case V4_MITIGATION_INTEL_SSB_NO: case V4_MITIGATION_AMD_SSB_NO: printf("[+] The CPU is not affected by SpectreV4\n"); return 0; default: panic("impossible"); } } static int sysctl_machdep_spectreV4_mitigated(SYSCTLFN_ARGS) { struct sysctlnode node; int error; bool val; val = *(bool *)rnode->sysctl_data; node = *rnode; node.sysctl_data = &val; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error != 0 || newp == NULL) return error; if (val == v4_mitigation_enabled) return 0; return mitigation_v4_change(val); } /* -------------------------------------------------------------------------- */ enum mds_mitigation { MDS_MITIGATION_NONE, MDS_MITIGATION_VERW, MDS_MITIGATION_MDS_NO }; static char mds_mitigation_name[64] = "(none)"; static enum mds_mitigation mds_mitigation_method = MDS_MITIGATION_NONE; static bool mds_mitigation_enabled __read_mostly = false; static volatile unsigned long mds_cpu_barrier1 __cacheline_aligned; static volatile unsigned long mds_cpu_barrier2 __cacheline_aligned; #ifdef __x86_64__ /* MDS_LEAVE. 
*/ extern uint8_t nomds_leave, nomds_leave_end; extern uint8_t mds_leave, mds_leave_end; static const struct x86_hotpatch_source hp_nomds_leave_source = { .saddr = &nomds_leave, .eaddr = &nomds_leave_end }; static const struct x86_hotpatch_source hp_mds_leave_source = { .saddr = &mds_leave, .eaddr = &mds_leave_end }; static const struct x86_hotpatch_descriptor hp_mds_leave_desc = { .name = HP_NAME_MDS_LEAVE, .nsrc = 2, .srcs = { &hp_nomds_leave_source, &hp_mds_leave_source } }; __link_set_add_rodata(x86_hotpatch_descriptors, hp_mds_leave_desc); static void mds_disable_hotpatch(void) { x86_hotpatch(HP_NAME_MDS_LEAVE, /* nomds */ 0); } static void mds_enable_hotpatch(void) { x86_hotpatch(HP_NAME_MDS_LEAVE, /* mds */ 1); } #else /* MDS not supported on i386 */ static void mds_disable_hotpatch(void) { panic("%s: impossible", __func__); } static void mds_enable_hotpatch(void) { panic("%s: impossible", __func__); } #endif static void mitigation_mds_apply_cpu(struct cpu_info *ci, bool enabled) { switch (mds_mitigation_method) { case MDS_MITIGATION_NONE: case MDS_MITIGATION_MDS_NO: panic("impossible"); case MDS_MITIGATION_VERW: /* cpu0 is the one that does the hotpatch job */ if (ci == &cpu_info_primary) { if (enabled) { mds_enable_hotpatch(); } else { mds_disable_hotpatch(); } } break; } } static void mitigation_mds_change_cpu(void *arg1, void *arg2) { struct cpu_info *ci = curcpu(); bool enabled = arg1 != NULL; u_long psl = 0; /* Rendez-vous 1. */ psl = x86_read_psl(); x86_disable_intr(); atomic_dec_ulong(&mds_cpu_barrier1); while (atomic_cas_ulong(&mds_cpu_barrier1, 0, 0) != 0) { x86_pause(); } mitigation_mds_apply_cpu(ci, enabled); /* Rendez-vous 2. */ atomic_dec_ulong(&mds_cpu_barrier2); while (atomic_cas_ulong(&mds_cpu_barrier2, 0, 0) != 0) { x86_pause(); } /* Write back and invalidate cache, flush pipelines. */ wbinvd(); x86_flush(); x86_write_psl(psl); } static void mds_detect_method(void) { u_int descs[4]; uint64_t msr; if (cpu_vendor != CPUVENDOR_INTEL) { mds_mitigation_method = MDS_MITIGATION_MDS_NO; return; } if (cpuid_level < 7) { return; } x86_cpuid(0x7, descs); if (descs[3] & CPUID_SEF_ARCH_CAP) { msr = rdmsr(MSR_IA32_ARCH_CAPABILITIES); if (msr & IA32_ARCH_MDS_NO) { mds_mitigation_method = MDS_MITIGATION_MDS_NO; return; } } #ifdef __x86_64__ if (descs[3] & CPUID_SEF_MD_CLEAR) { mds_mitigation_method = MDS_MITIGATION_VERW; } #endif } static void mds_set_name(void) { char name[64] = ""; if (!mds_mitigation_enabled) { strlcat(name, "(none)", sizeof(name)); } else { switch (mds_mitigation_method) { case MDS_MITIGATION_NONE: panic("%s: impossible", __func__); case MDS_MITIGATION_MDS_NO: strlcat(name, "[MDS_NO]", sizeof(name)); break; case MDS_MITIGATION_VERW: strlcat(name, "[VERW]", sizeof(name)); break; } } strlcpy(mds_mitigation_name, name, sizeof(mds_mitigation_name)); } static int mitigation_mds_change(bool enabled) { uint64_t xc; mds_detect_method(); switch (mds_mitigation_method) { case MDS_MITIGATION_NONE: printf("[!] No mitigation available\n"); return EOPNOTSUPP; case MDS_MITIGATION_VERW: /* Initialize the barriers */ mds_cpu_barrier1 = ncpu; mds_cpu_barrier2 = ncpu; printf("[+] %s MDS Mitigation...", enabled ? 
"Enabling" : "Disabling"); xc = xc_broadcast(XC_HIGHPRI, mitigation_mds_change_cpu, (void *)enabled, NULL); xc_wait(xc); printf(" done!\n"); mds_mitigation_enabled = enabled; mds_set_name(); return 0; case MDS_MITIGATION_MDS_NO: printf("[+] The CPU is not affected by MDS\n"); return 0; default: panic("impossible"); } } static int sysctl_machdep_mds_mitigated(SYSCTLFN_ARGS) { struct sysctlnode node; int error; bool val; val = *(bool *)rnode->sysctl_data; node = *rnode; node.sysctl_data = &val; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error != 0 || newp == NULL) return error; if (val == mds_mitigation_enabled) return 0; return mitigation_mds_change(val); } /* -------------------------------------------------------------------------- */ enum taa_mitigation { TAA_MITIGATION_NONE, TAA_MITIGATION_TAA_NO, TAA_MITIGATION_MDS, TAA_MITIGATION_RTM_DISABLE }; static char taa_mitigation_name[64] = "(none)"; static enum taa_mitigation taa_mitigation_method = TAA_MITIGATION_NONE; static bool taa_mitigation_enabled __read_mostly = false; static bool *taa_mitigation_enabled_ptr = &taa_mitigation_enabled; static void mitigation_taa_apply_cpu(struct cpu_info *ci, bool enabled) { uint64_t msr; switch (taa_mitigation_method) { case TAA_MITIGATION_NONE: case TAA_MITIGATION_TAA_NO: case TAA_MITIGATION_MDS: panic("impossible"); case TAA_MITIGATION_RTM_DISABLE: msr = rdmsr(MSR_IA32_TSX_CTRL); if (enabled) { msr |= IA32_TSX_CTRL_RTM_DISABLE; } else { msr &= ~IA32_TSX_CTRL_RTM_DISABLE; } wrmsr(MSR_IA32_TSX_CTRL, msr); break; } } static void mitigation_taa_change_cpu(void *arg1, void *arg2) { struct cpu_info *ci = curcpu(); bool enabled = arg1 != NULL; mitigation_taa_apply_cpu(ci, enabled); } static void taa_detect_method(void) { u_int descs[4]; uint64_t msr; taa_mitigation_enabled_ptr = &taa_mitigation_enabled; if (cpu_vendor != CPUVENDOR_INTEL) { taa_mitigation_method = TAA_MITIGATION_TAA_NO; return; } if (!(cpu_feature[5] & CPUID_SEF_RTM)) { taa_mitigation_method = TAA_MITIGATION_TAA_NO; return; } /* * If the CPU doesn't have MDS_NO set, then the TAA mitigation is based * on the MDS mitigation. */ if (cpuid_level < 7) { taa_mitigation_method = TAA_MITIGATION_MDS; taa_mitigation_enabled_ptr = &mds_mitigation_enabled; return; } x86_cpuid(0x7, descs); if (!(descs[3] & CPUID_SEF_ARCH_CAP)) { taa_mitigation_method = TAA_MITIGATION_MDS; taa_mitigation_enabled_ptr = &mds_mitigation_enabled; return; } msr = rdmsr(MSR_IA32_ARCH_CAPABILITIES); if (!(msr & IA32_ARCH_MDS_NO)) { taa_mitigation_method = TAA_MITIGATION_MDS; taa_mitigation_enabled_ptr = &mds_mitigation_enabled; return; } /* * Otherwise, we need the TAA-specific mitigation. */ if (msr & IA32_ARCH_TAA_NO) { taa_mitigation_method = TAA_MITIGATION_TAA_NO; return; } if (msr & IA32_ARCH_TSX_CTRL) { taa_mitigation_method = TAA_MITIGATION_RTM_DISABLE; return; } } static void taa_set_name(void) { char name[64] = ""; switch (taa_mitigation_method) { case TAA_MITIGATION_NONE: strlcpy(name, "(none)", sizeof(name)); break; case TAA_MITIGATION_TAA_NO: strlcpy(name, "[TAA_NO]", sizeof(name)); break; case TAA_MITIGATION_MDS: strlcpy(name, "[MDS]", sizeof(name)); break; case TAA_MITIGATION_RTM_DISABLE: if (!taa_mitigation_enabled) { strlcpy(name, "(none)", sizeof(name)); } else { strlcpy(name, "[RTM_DISABLE]", sizeof(name)); } break; } strlcpy(taa_mitigation_name, name, sizeof(taa_mitigation_name)); } static int mitigation_taa_change(bool enabled) { uint64_t xc; taa_detect_method(); switch (taa_mitigation_method) { case TAA_MITIGATION_NONE: printf("[!] 
No mitigation available\n"); return EOPNOTSUPP; case TAA_MITIGATION_TAA_NO: printf("[+] The CPU is not affected by TAA\n"); return 0; case TAA_MITIGATION_MDS: printf("[!] Mitigation based on MDS, use machdep.mds\n"); taa_set_name(); return EINVAL; case TAA_MITIGATION_RTM_DISABLE: printf("[+] %s TAA Mitigation...", enabled ? "Enabling" : "Disabling"); xc = xc_broadcast(XC_HIGHPRI, mitigation_taa_change_cpu, (void *)enabled, NULL); xc_wait(xc); printf(" done!\n"); taa_mitigation_enabled = enabled; taa_set_name(); return 0; default: panic("impossible"); } } static int sysctl_machdep_taa_mitigated(SYSCTLFN_ARGS) { struct sysctlnode node; int error; bool val; val = *(bool *)rnode->sysctl_data; node = *rnode; node.sysctl_data = &val; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error != 0 || newp == NULL) return error; if (val == *taa_mitigation_enabled_ptr) return 0; return mitigation_taa_change(val); } /* -------------------------------------------------------------------------- */ void speculation_barrier(struct lwp *, struct lwp *); void speculation_barrier(struct lwp *oldlwp, struct lwp *newlwp) { /* * Speculation barriers are applicable only to Spectre V2. */ if (!v2_mitigation_enabled) return; /* * From kernel thread to kernel thread, no need for a barrier. */ if ((oldlwp->l_flag & LW_SYSTEM) && (newlwp->l_flag & LW_SYSTEM)) return; switch (v2_mitigation_method) { case V2_MITIGATION_INTEL_IBRS: wrmsr(MSR_IA32_PRED_CMD, IA32_PRED_CMD_IBPB); break; default: /* nothing */ break; } } /* * cpu0 is the one that detects the method and sets the global 'enabled' * variable for each mitigation. */ void cpu_speculation_init(struct cpu_info *ci) { /* * Spectre V2. */ if (ci == &cpu_info_primary) { v2_detect_method(); v2_mitigation_enabled = (v2_mitigation_method != V2_MITIGATION_NONE); v2_set_name(); } if (v2_mitigation_method != V2_MITIGATION_NONE) { mitigation_v2_apply_cpu(ci, true); } /* * Spectre V4. * * Disabled by default, as recommended by AMD, but can be enabled * dynamically. We only detect if the CPU is not vulnerable, to * mark it as 'mitigated' in the sysctl. */ #if 0 if (ci == &cpu_info_primary) { v4_detect_method(); v4_mitigation_enabled = (v4_mitigation_method != V4_MITIGATION_NONE); v4_set_name(); } if (v4_mitigation_method != V4_MITIGATION_NONE && v4_mitigation_method != V4_MITIGATION_INTEL_SSB_NO && v4_mitigation_method != V4_MITIGATION_AMD_SSB_NO) { mitigation_v4_apply_cpu(ci, true); } #else if (ci == &cpu_info_primary) { v4_detect_method(); if (v4_mitigation_method == V4_MITIGATION_INTEL_SSB_NO || v4_mitigation_method == V4_MITIGATION_AMD_SSB_NO) { v4_mitigation_enabled = true; v4_set_name(); } } #endif /* * Microarchitectural Data Sampling. */ if (ci == &cpu_info_primary) { mds_detect_method(); mds_mitigation_enabled = (mds_mitigation_method != MDS_MITIGATION_NONE); mds_set_name(); } if (mds_mitigation_method != MDS_MITIGATION_NONE && mds_mitigation_method != MDS_MITIGATION_MDS_NO) { mitigation_mds_apply_cpu(ci, true); } /* * TSX Asynchronous Abort. 
*/ if (ci == &cpu_info_primary) { taa_detect_method(); taa_mitigation_enabled = (taa_mitigation_method == TAA_MITIGATION_RTM_DISABLE) || (taa_mitigation_method == TAA_MITIGATION_TAA_NO); taa_set_name(); } if (taa_mitigation_method == TAA_MITIGATION_RTM_DISABLE) { mitigation_taa_apply_cpu(ci, true); } } void sysctl_speculation_init(struct sysctllog **); void sysctl_speculation_init(struct sysctllog **clog) { const struct sysctlnode *spec_rnode; /* SpectreV1 */ spec_rnode = NULL; sysctl_createv(clog, 0, NULL, &spec_rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "spectre_v1", NULL, NULL, 0, NULL, 0, CTL_MACHDEP, CTL_CREATE); sysctl_createv(clog, 0, &spec_rnode, &spec_rnode, CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE, CTLTYPE_BOOL, "mitigated", SYSCTL_DESCR("Whether Spectre Variant 1 is mitigated"), NULL, 0 /* mitigated=0 */, NULL, 0, CTL_CREATE, CTL_EOL); /* SpectreV2 */ spec_rnode = NULL; sysctl_createv(clog, 0, NULL, &spec_rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "spectre_v2", NULL, NULL, 0, NULL, 0, CTL_MACHDEP, CTL_CREATE); sysctl_createv(clog, 0, &spec_rnode, NULL, CTLFLAG_READWRITE, CTLTYPE_BOOL, "hwmitigated", SYSCTL_DESCR("Whether Spectre Variant 2 is HW-mitigated"), sysctl_machdep_spectreV2_mitigated, 0, &v2_mitigation_enabled, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &spec_rnode, NULL, CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE, CTLTYPE_BOOL, "swmitigated", SYSCTL_DESCR("Whether Spectre Variant 2 is SW-mitigated"), #if defined(SPECTRE_V2_GCC_MITIGATION) NULL, 1, #else NULL, 0, #endif NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &spec_rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "method", SYSCTL_DESCR("Mitigation method in use"), NULL, 0, v2_mitigation_name, 0, CTL_CREATE, CTL_EOL); /* SpectreV4 */ spec_rnode = NULL; sysctl_createv(clog, 0, NULL, &spec_rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "spectre_v4", NULL, NULL, 0, NULL, 0, CTL_MACHDEP, CTL_CREATE); sysctl_createv(clog, 0, &spec_rnode, NULL, CTLFLAG_READWRITE, CTLTYPE_BOOL, "mitigated", SYSCTL_DESCR("Whether Spectre Variant 4 is mitigated"), sysctl_machdep_spectreV4_mitigated, 0, &v4_mitigation_enabled, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &spec_rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "method", SYSCTL_DESCR("Mitigation method in use"), NULL, 0, v4_mitigation_name, 0, CTL_CREATE, CTL_EOL); /* Microarchitectural Data Sampling */ spec_rnode = NULL; sysctl_createv(clog, 0, NULL, &spec_rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "mds", NULL, NULL, 0, NULL, 0, CTL_MACHDEP, CTL_CREATE); sysctl_createv(clog, 0, &spec_rnode, NULL, CTLFLAG_READWRITE, CTLTYPE_BOOL, "mitigated", SYSCTL_DESCR("Whether MDS is mitigated"), sysctl_machdep_mds_mitigated, 0, &mds_mitigation_enabled, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &spec_rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "method", SYSCTL_DESCR("Mitigation method in use"), NULL, 0, mds_mitigation_name, 0, CTL_CREATE, CTL_EOL); /* TSX Asynchronous Abort */ spec_rnode = NULL; sysctl_createv(clog, 0, NULL, &spec_rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "taa", NULL, NULL, 0, NULL, 0, CTL_MACHDEP, CTL_CREATE); sysctl_createv(clog, 0, &spec_rnode, NULL, CTLFLAG_READWRITE, CTLTYPE_BOOL, "mitigated", SYSCTL_DESCR("Whether TAA is mitigated"), sysctl_machdep_taa_mitigated, 0, taa_mitigation_enabled_ptr, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &spec_rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "method", SYSCTL_DESCR("Mitigation method in use"), NULL, 0, taa_mitigation_name, 0, CTL_CREATE, CTL_EOL); }
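/*
 * Illustrative sketch (not part of spectre.c): the sysctl handlers above are
 * reached through sysctl(3) from userland. A minimal program that reads the
 * current SpectreV2 hardware-mitigation state and then requests enabling it
 * might look like the following; the node names match the sysctl_createv()
 * calls above, everything else in this fragment is assumed for the example.
 */
#if 0	/* example only, never compiled into the kernel */
#include <sys/sysctl.h>
#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	bool cur, want = true;
	size_t len = sizeof(cur);

	/* Read machdep.spectre_v2.hwmitigated (bool node created above). */
	if (sysctlbyname("machdep.spectre_v2.hwmitigated", &cur, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return 1;
	}
	printf("hwmitigated: %d\n", cur);

	/* A write lands in sysctl_machdep_spectreV2_mitigated() above. */
	if (sysctlbyname("machdep.spectre_v2.hwmitigated", NULL, NULL,
	    &want, sizeof(want)) == -1)
		perror("sysctlbyname(write)");
	return 0;
}
#endif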
/* $NetBSD: subr_disk_open.c,v 1.15 2020/02/29 14:44:44 mlelstv Exp $ */ /*- * Copyright (c) 2006 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_disk_open.c,v 1.15 2020/02/29 14:44:44 mlelstv Exp $"); #include <sys/param.h> #include <sys/conf.h> #include <sys/device.h> #include <sys/disk.h> #include <sys/disklabel.h> #include <sys/fcntl.h> #include <sys/kauth.h> #include <sys/vnode.h> #include <miscfs/specfs/specdev.h> struct vnode * opendisk(device_t dv) { devmajor_t bmajor; int unit; struct vnode *tmpvn; int error; dev_t dev; /* * Lookup major number for disk block device. */ bmajor = devsw_name2blk(device_xname(dv), NULL, 0); if (bmajor == -1) return NULL; unit = device_unit(dv); /* * Fake a temporary vnode for the disk, open it, and read * and hash the sectors. */ dev = device_is_a(dv, "dk") ?
makedev(bmajor, unit) : MAKEDISKDEV(bmajor, unit, RAW_PART); if (bdevvp(dev, &tmpvn)) panic("%s: can't alloc vnode for %s", __func__, device_xname(dv)); vn_lock(tmpvn, LK_EXCLUSIVE | LK_RETRY); error = VOP_OPEN(tmpvn, FREAD | FSILENT, NOCRED); if (error) { /* * Ignore errors caused by missing device, partition, * medium, or busy [presumably because of a wedge covering it] */ switch (error) { case ENXIO: case ENODEV: case EBUSY: break; default: printf("%s: can't open dev %s (%d)\n", __func__, device_xname(dv), error); break; } vput(tmpvn); return NULL; } return tmpvn; } int getdisksize(struct vnode *vp, uint64_t *numsecp, unsigned int *secsizep) { struct partinfo pi; struct dkwedge_info dkw; struct disk *pdk; unsigned int secsize; uint64_t numsec; int error; /* * We attempt to get the wedge information first if it exists, * because the label does not support larger size disks. */ error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, NOCRED); if (error == 0) { pdk = disk_find(dkw.dkw_parent); if (pdk != NULL) { secsize = DEV_BSIZE << pdk->dk_blkshift; numsec = dkw.dkw_size; } else error = ENODEV; } if (error) { error = VOP_IOCTL(vp, DIOCGPARTINFO, &pi, FREAD, NOCRED); if (error == 0) { secsize = pi.pi_secsize; numsec = pi.pi_size; } } if (error == 0 && (secsize == 0 || secsize > MAXBSIZE || !powerof2(secsize) || numsec == 0)) { #ifdef DIAGNOSTIC printf("%s: %s returns invalid disksize values" " (secsize = %u, numsec = %" PRIu64 ")\n", __func__, devsw_blk2name(major(vp->v_specnode->sn_rdev)), secsize, numsec); #endif error = EINVAL; } if (error == 0) { *secsizep = secsize; *numsecp = numsec; } return error; } int getdiskinfo(struct vnode *vp, struct dkwedge_info *dkw) { struct partinfo pi; int error; dev_t dev = vp->v_specnode->sn_rdev; if (VOP_IOCTL(vp, DIOCGWEDGEINFO, dkw, FREAD, NOCRED) == 0) return 0; if ((error = VOP_IOCTL(vp, DIOCGPARTINFO, &pi, FREAD, NOCRED)) != 0) return error; snprintf(dkw->dkw_devname, sizeof(dkw->dkw_devname), "%s%" PRId32 "%c", devsw_blk2name(major(dev)), DISKUNIT(dev), (char)DISKPART(dev) + 'a'); dkw->dkw_wname[0] = '\0'; snprintf(dkw->dkw_parent, sizeof(dkw->dkw_parent), "%s%" PRId32, devsw_blk2name(major(dev)), DISKUNIT(dev)); dkw->dkw_size = pi.pi_size; dkw->dkw_offset = pi.pi_offset; strlcpy(dkw->dkw_ptype, getfstypename(pi.pi_fstype), sizeof(dkw->dkw_ptype)); return 0; }
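/*
 * Illustrative sketch (not part of subr_disk_open.c): a typical in-kernel
 * consumer pairs opendisk() with getdisksize() and then releases the
 * temporary vnode again. The helper below is hypothetical and only shows
 * the expected call sequence under that assumption; error handling is
 * reduced to the minimum.
 */
#if 0	/* example only */
static int
example_disk_bytes(device_t dv, uint64_t *bytesp)
{
	struct vnode *vp;
	uint64_t numsec;
	unsigned int secsize;
	int error;

	if ((vp = opendisk(dv)) == NULL)
		return ENXIO;

	error = getdisksize(vp, &numsec, &secsize);
	if (error == 0)
		*bytesp = numsec * secsize;

	/* opendisk() returned the vnode opened and locked; undo both. */
	VOP_CLOSE(vp, FREAD, NOCRED);
	vput(vp);
	return error;
}
#endif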
/* $NetBSD: subr_vmem.c,v 1.116 2024/04/24 02:08:03 thorpej Exp $ */ /*- * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ /* * reference: * - Magazines and Vmem: Extending the Slab Allocator * to Many CPUs and Arbitrary Resources * http://www.usenix.org/event/usenix01/bonwick.html * * locking & the boundary tag pool: * - A pool(9) is used for vmem boundary tags * - During a pool get call the global vmem_btag_refill_lock is taken, * to serialize access to the allocation reserve, but no other * vmem arena locks. * - During pool_put calls no vmem mutexes are locked. * - pool_drain doesn't hold the pool's mutex while releasing memory to * its backing therefore no interference with any vmem mutexes. * - The boundary tag pool is forced to put page headers into pool pages * (PR_PHINPAGE) and not off page to avoid pool recursion. * (due to sizeof(bt_t) it should be the case anyway) */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_vmem.c,v 1.116 2024/04/24 02:08:03 thorpej Exp $"); #if defined(_KERNEL) && defined(_KERNEL_OPT) #include "opt_ddb.h" #endif /* defined(_KERNEL) && defined(_KERNEL_OPT) */ #include <sys/param.h> #include <sys/hash.h> #include <sys/queue.h> #include <sys/bitops.h> #if defined(_KERNEL) #include <sys/systm.h> #include <sys/kernel.h> /* hz */ #include <sys/callout.h> #include <sys/kmem.h> #include <sys/pool.h> #include <sys/vmem.h> #include <sys/vmem_impl.h> #include <sys/workqueue.h> #include <sys/atomic.h> #include <uvm/uvm.h> #include <uvm/uvm_extern.h> #include <uvm/uvm_km.h> #include <uvm/uvm_page.h> #include <uvm/uvm_pdaemon.h> #else /* defined(_KERNEL) */ #include <stdio.h> #include <errno.h> #include <assert.h> #include <stdlib.h> #include <string.h> #include "../sys/vmem.h" #include "../sys/vmem_impl.h" #endif /* defined(_KERNEL) */ #if defined(_KERNEL) #include <sys/evcnt.h> #define VMEM_EVCNT_DEFINE(name) \ struct evcnt vmem_evcnt_##name = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, \ "vmem", #name); \ EVCNT_ATTACH_STATIC(vmem_evcnt_##name); #define VMEM_EVCNT_INCR(ev) vmem_evcnt_##ev.ev_count++ #define VMEM_EVCNT_DECR(ev) vmem_evcnt_##ev.ev_count-- VMEM_EVCNT_DEFINE(static_bt_count) VMEM_EVCNT_DEFINE(static_bt_inuse) #define VMEM_CONDVAR_INIT(vm, wchan) cv_init(&vm->vm_cv, wchan) #define VMEM_CONDVAR_DESTROY(vm) cv_destroy(&vm->vm_cv) #define VMEM_CONDVAR_WAIT(vm) cv_wait(&vm->vm_cv, &vm->vm_lock) #define VMEM_CONDVAR_BROADCAST(vm) cv_broadcast(&vm->vm_cv) #else /* defined(_KERNEL) */ #define VMEM_EVCNT_INCR(ev) /* nothing */ #define VMEM_EVCNT_DECR(ev) /* nothing */ #define VMEM_CONDVAR_INIT(vm, wchan) /* nothing */ #define VMEM_CONDVAR_DESTROY(vm) /* nothing */ #define VMEM_CONDVAR_WAIT(vm) /* nothing */ #define VMEM_CONDVAR_BROADCAST(vm) /* nothing */ #define UNITTEST #define KASSERT(a) assert(a) #define KASSERTMSG(a, m, ...) assert(a) #define mutex_init(a, b, c) /* nothing */ #define mutex_destroy(a) /* nothing */ #define mutex_enter(a) /* nothing */ #define mutex_tryenter(a) true #define mutex_exit(a) /* nothing */ #define mutex_owned(a) true #define ASSERT_SLEEPABLE() /* nothing */ #define panic(...) 
printf(__VA_ARGS__); abort() #endif /* defined(_KERNEL) */ #if defined(VMEM_SANITY) static void vmem_check(vmem_t *); #else /* defined(VMEM_SANITY) */ #define vmem_check(vm) /* nothing */ #endif /* defined(VMEM_SANITY) */ #define VMEM_HASHSIZE_MIN 1 /* XXX */ #define VMEM_HASHSIZE_MAX 65536 /* XXX */ #define VMEM_HASHSIZE_INIT 1 #define VM_FITMASK (VM_BESTFIT | VM_INSTANTFIT) #if defined(_KERNEL) static bool vmem_bootstrapped = false; static kmutex_t vmem_list_lock; static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list); #endif /* defined(_KERNEL) */ /* ---- misc */ #define VMEM_LOCK(vm) mutex_enter(&(vm)->vm_lock) #define VMEM_TRYLOCK(vm) mutex_tryenter(&(vm)->vm_lock) #define VMEM_UNLOCK(vm) mutex_exit(&(vm)->vm_lock) #define VMEM_LOCK_INIT(vm, ipl) mutex_init(&(vm)->vm_lock, MUTEX_DEFAULT, (ipl)) #define VMEM_LOCK_DESTROY(vm) mutex_destroy(&(vm)->vm_lock) #define VMEM_ASSERT_LOCKED(vm) KASSERT(mutex_owned(&(vm)->vm_lock)) #define VMEM_ALIGNUP(addr, align) \ (-(-(addr) & -(align))) #define VMEM_CROSS_P(addr1, addr2, boundary) \ ((((addr1) ^ (addr2)) & -(boundary)) != 0) #define ORDER2SIZE(order) ((vmem_size_t)1 << (order)) #define SIZE2ORDER(size) ((int)ilog2(size)) static void vmem_kick_pdaemon(void) { #if defined(_KERNEL) uvm_kick_pdaemon(); #endif } static void vmem_xfree_bt(vmem_t *, bt_t *); #if !defined(_KERNEL) #define xmalloc(sz, flags) malloc(sz) #define xfree(p, sz) free(p) #define bt_alloc(vm, flags) malloc(sizeof(bt_t)) #define bt_free(vm, bt) free(bt) #define bt_freetrim(vm, l) /* nothing */ #else /* defined(_KERNEL) */ #define xmalloc(sz, flags) \ kmem_alloc(sz, ((flags) & VM_SLEEP) ? KM_SLEEP : KM_NOSLEEP); #define xfree(p, sz) kmem_free(p, sz); /* * BT_RESERVE calculation: * we allocate memory for boundary tags with vmem; therefore we have * to keep a reserve of bts used to allocated memory for bts. * This reserve is 4 for each arena involved in allocating vmems memory. * BT_MAXFREE: don't cache excessive counts of bts in arenas */ #define STATIC_BT_COUNT 200 #define BT_MINRESERVE 4 #define BT_MAXFREE 64 static struct vmem_btag static_bts[STATIC_BT_COUNT]; static int static_bt_count = STATIC_BT_COUNT; static struct vmem kmem_va_meta_arena_store; vmem_t *kmem_va_meta_arena; static struct vmem kmem_meta_arena_store; vmem_t *kmem_meta_arena = NULL; static kmutex_t vmem_btag_refill_lock; static kmutex_t vmem_btag_lock; static LIST_HEAD(, vmem_btag) vmem_btag_freelist; static size_t vmem_btag_freelist_count = 0; static struct pool vmem_btag_pool; static bool vmem_btag_pool_initialized __read_mostly; /* ---- boundary tag */ static int bt_refill(vmem_t *vm); static int bt_refill_locked(vmem_t *vm); static void * pool_page_alloc_vmem_meta(struct pool *pp, int flags) { const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP; vmem_addr_t va; int ret; ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz, (vflags & ~VM_FITMASK) | VM_INSTANTFIT | VM_POPULATING, &va); return ret ? 
NULL : (void *)va; } static void pool_page_free_vmem_meta(struct pool *pp, void *v) { vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz); } /* allocator for vmem-pool metadata */ struct pool_allocator pool_allocator_vmem_meta = { .pa_alloc = pool_page_alloc_vmem_meta, .pa_free = pool_page_free_vmem_meta, .pa_pagesz = 0 }; static int bt_refill_locked(vmem_t *vm) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); if (vm->vm_nfreetags > BT_MINRESERVE) { return 0; } mutex_enter(&vmem_btag_lock); while (!LIST_EMPTY(&vmem_btag_freelist) && vm->vm_nfreetags <= BT_MINRESERVE && (vm->vm_flags & VM_PRIVTAGS) == 0) { bt = LIST_FIRST(&vmem_btag_freelist); LIST_REMOVE(bt, bt_freelist); bt->bt_flags = 0; LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; vmem_btag_freelist_count--; VMEM_EVCNT_INCR(static_bt_inuse); } mutex_exit(&vmem_btag_lock); while (vm->vm_nfreetags <= BT_MINRESERVE) { VMEM_UNLOCK(vm); KASSERT(vmem_btag_pool_initialized); mutex_enter(&vmem_btag_refill_lock); bt = pool_get(&vmem_btag_pool, PR_NOWAIT); mutex_exit(&vmem_btag_refill_lock); VMEM_LOCK(vm); if (bt == NULL) break; bt->bt_flags = 0; LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } if (vm->vm_nfreetags <= BT_MINRESERVE) { return ENOMEM; } if (kmem_meta_arena != NULL) { VMEM_UNLOCK(vm); (void)bt_refill(kmem_arena); (void)bt_refill(kmem_va_meta_arena); (void)bt_refill(kmem_meta_arena); VMEM_LOCK(vm); } return 0; } static int bt_refill(vmem_t *vm) { int rv; VMEM_LOCK(vm); rv = bt_refill_locked(vm); VMEM_UNLOCK(vm); return rv; } static bt_t * bt_alloc(vmem_t *vm, vm_flag_t flags) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); while (vm->vm_nfreetags <= BT_MINRESERVE && (flags & VM_POPULATING) == 0) { if (bt_refill_locked(vm)) { if ((flags & VM_NOSLEEP) != 0) { return NULL; } /* * It would be nice to wait for something specific here * but there are multiple ways that a retry could * succeed and we can't wait for multiple things * simultaneously. So we'll just sleep for an arbitrary * short period of time and retry regardless. * This should be a very rare case. */ vmem_kick_pdaemon(); kpause("btalloc", false, 1, &vm->vm_lock); } } bt = LIST_FIRST(&vm->vm_freetags); LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; return bt; } static void bt_free(vmem_t *vm, bt_t *bt) { VMEM_ASSERT_LOCKED(vm); LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } static void bt_freetrim(vmem_t *vm, int freelimit) { bt_t *bt, *next_bt; LIST_HEAD(, vmem_btag) tofree; VMEM_ASSERT_LOCKED(vm); LIST_INIT(&tofree); LIST_FOREACH_SAFE(bt, &vm->vm_freetags, bt_freelist, next_bt) { if (vm->vm_nfreetags <= freelimit) { break; } if (bt->bt_flags & BT_F_PRIVATE) { continue; } LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; if (bt >= static_bts && bt < &static_bts[STATIC_BT_COUNT]) { mutex_enter(&vmem_btag_lock); LIST_INSERT_HEAD(&vmem_btag_freelist, bt, bt_freelist); vmem_btag_freelist_count++; mutex_exit(&vmem_btag_lock); VMEM_EVCNT_DECR(static_bt_inuse); } else { LIST_INSERT_HEAD(&tofree, bt, bt_freelist); } } VMEM_UNLOCK(vm); while (!LIST_EMPTY(&tofree)) { bt = LIST_FIRST(&tofree); LIST_REMOVE(bt, bt_freelist); pool_put(&vmem_btag_pool, bt); } } /* * Add private boundary tags (statically-allocated by the caller) * to a vmem arena's free tag list. 
*/ void vmem_add_bts(vmem_t *vm, struct vmem_btag *bts, unsigned int nbts) { VMEM_LOCK(vm); while (nbts != 0) { bts->bt_flags = BT_F_PRIVATE; LIST_INSERT_HEAD(&vm->vm_freetags, bts, bt_freelist); vm->vm_nfreetags++; bts++; nbts--; } VMEM_UNLOCK(vm); } #endif /* defined(_KERNEL) */ /* * freelist[0] ... [1, 1] * freelist[1] ... [2, 3] * freelist[2] ... [4, 7] * freelist[3] ... [8, 15] * : * freelist[n] ... [(1 << n), (1 << (n + 1)) - 1] * : */ static struct vmem_freelist * bt_freehead_tofree(vmem_t *vm, vmem_size_t size) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; const int idx = SIZE2ORDER(qsize); KASSERT(size != 0); KASSERT(qsize != 0); KASSERT((size & vm->vm_quantum_mask) == 0); KASSERT(idx >= 0); KASSERT(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* * bt_freehead_toalloc: return the freelist for the given size and allocation * strategy. * * for VM_INSTANTFIT, return the list in which any blocks are large enough * for the requested size. otherwise, return the list which can have blocks * large enough for the requested size. */ static struct vmem_freelist * bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, vm_flag_t strat) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; int idx = SIZE2ORDER(qsize); KASSERT(size != 0); KASSERT(qsize != 0); KASSERT((size & vm->vm_quantum_mask) == 0); if (strat == VM_INSTANTFIT && ORDER2SIZE(idx) != qsize) { idx++; /* check too large request? */ } KASSERT(idx >= 0); KASSERT(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* ---- boundary tag hash */ static struct vmem_hashlist * bt_hashhead(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; unsigned int hash; hash = hash32_buf(&addr, sizeof(addr), HASH32_BUF_INIT); list = &vm->vm_hashlist[hash & vm->vm_hashmask]; return list; } static bt_t * bt_lookupbusy(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; bt_t *bt; list = bt_hashhead(vm, addr); LIST_FOREACH(bt, list, bt_hashlist) { if (bt->bt_start == addr) { break; } } return bt; } static void bt_rembusy(vmem_t *vm, bt_t *bt) { KASSERT(vm->vm_nbusytag > 0); vm->vm_inuse -= bt->bt_size; vm->vm_nbusytag--; LIST_REMOVE(bt, bt_hashlist); } static void bt_insbusy(vmem_t *vm, bt_t *bt) { struct vmem_hashlist *list; KASSERT(bt->bt_type == BT_TYPE_BUSY); list = bt_hashhead(vm, bt->bt_start); LIST_INSERT_HEAD(list, bt, bt_hashlist); if (++vm->vm_nbusytag > vm->vm_maxbusytag) { vm->vm_maxbusytag = vm->vm_nbusytag; } vm->vm_inuse += bt->bt_size; } /* ---- boundary tag list */ static void bt_remseg(vmem_t *vm, bt_t *bt) { TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist); } static void bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev) { TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist); } static void bt_insseg_tail(vmem_t *vm, bt_t *bt) { TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist); } static void bt_remfree(vmem_t *vm, bt_t *bt) { KASSERT(bt->bt_type == BT_TYPE_FREE); LIST_REMOVE(bt, bt_freelist); } static void bt_insfree(vmem_t *vm, bt_t *bt) { struct vmem_freelist *list; list = bt_freehead_tofree(vm, bt->bt_size); LIST_INSERT_HEAD(list, bt, bt_freelist); } /* ---- vmem internal functions */ #if defined(QCACHE) static inline vm_flag_t prf_to_vmf(int prflags) { vm_flag_t vmflags; KASSERT((prflags & ~(PR_LIMITFAIL | PR_WAITOK | PR_NOWAIT)) == 0); if ((prflags & PR_WAITOK) != 0) { vmflags = VM_SLEEP; } else { vmflags = VM_NOSLEEP; } return vmflags; } static inline int vmf_to_prf(vm_flag_t vmflags) { int prflags; if ((vmflags & VM_SLEEP) != 0) { prflags = PR_WAITOK; } else { prflags = PR_NOWAIT; } return 
prflags; } static size_t qc_poolpage_size(size_t qcache_max) { int i; for (i = 0; ORDER2SIZE(i) <= qcache_max * 3; i++) { /* nothing */ } return ORDER2SIZE(i); } static void * qc_poolpage_alloc(struct pool *pool, int prflags) { qcache_t *qc = QC_POOL_TO_QCACHE(pool); vmem_t *vm = qc->qc_vmem; vmem_addr_t addr; if (vmem_alloc(vm, pool->pr_alloc->pa_pagesz, prf_to_vmf(prflags) | VM_INSTANTFIT, &addr) != 0) return NULL; return (void *)addr; } static void qc_poolpage_free(struct pool *pool, void *addr) { qcache_t *qc = QC_POOL_TO_QCACHE(pool); vmem_t *vm = qc->qc_vmem; vmem_free(vm, (vmem_addr_t)addr, pool->pr_alloc->pa_pagesz); } static void qc_init(vmem_t *vm, size_t qcache_max, int ipl) { qcache_t *prevqc; struct pool_allocator *pa; int qcache_idx_max; int i; KASSERT((qcache_max & vm->vm_quantum_mask) == 0); if (qcache_max > (VMEM_QCACHE_IDX_MAX << vm->vm_quantum_shift)) { qcache_max = VMEM_QCACHE_IDX_MAX << vm->vm_quantum_shift; } vm->vm_qcache_max = qcache_max; pa = &vm->vm_qcache_allocator; memset(pa, 0, sizeof(*pa)); pa->pa_alloc = qc_poolpage_alloc; pa->pa_free = qc_poolpage_free; pa->pa_pagesz = qc_poolpage_size(qcache_max); qcache_idx_max = qcache_max >> vm->vm_quantum_shift; prevqc = NULL; for (i = qcache_idx_max; i > 0; i--) { qcache_t *qc = &vm->vm_qcache_store[i - 1]; size_t size = i << vm->vm_quantum_shift; pool_cache_t pc; qc->qc_vmem = vm; snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu", vm->vm_name, size); pc = pool_cache_init(size, ORDER2SIZE(vm->vm_quantum_shift), 0, PR_NOALIGN | PR_NOTOUCH | PR_RECURSIVE /* XXX */, qc->qc_name, pa, ipl, NULL, NULL, NULL); KASSERT(pc); qc->qc_cache = pc; KASSERT(qc->qc_cache != NULL); /* XXX */ if (prevqc != NULL && qc->qc_cache->pc_pool.pr_itemsperpage == prevqc->qc_cache->pc_pool.pr_itemsperpage) { pool_cache_destroy(qc->qc_cache); vm->vm_qcache[i - 1] = prevqc; continue; } qc->qc_cache->pc_pool.pr_qcache = qc; vm->vm_qcache[i - 1] = qc; prevqc = qc; } } static void qc_destroy(vmem_t *vm) { const qcache_t *prevqc; int i; int qcache_idx_max; qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift; prevqc = NULL; for (i = 0; i < qcache_idx_max; i++) { qcache_t *qc = vm->vm_qcache[i]; if (prevqc == qc) { continue; } pool_cache_destroy(qc->qc_cache); prevqc = qc; } } #endif #if defined(_KERNEL) static void vmem_bootstrap(void) { mutex_init(&vmem_list_lock, MUTEX_DEFAULT, IPL_NONE); mutex_init(&vmem_btag_lock, MUTEX_DEFAULT, IPL_VM); mutex_init(&vmem_btag_refill_lock, MUTEX_DEFAULT, IPL_VM); while (static_bt_count-- > 0) { bt_t *bt = &static_bts[static_bt_count]; LIST_INSERT_HEAD(&vmem_btag_freelist, bt, bt_freelist); VMEM_EVCNT_INCR(static_bt_count); vmem_btag_freelist_count++; } vmem_bootstrapped = TRUE; } void vmem_subsystem_init(vmem_t *vm) { kmem_va_meta_arena = vmem_init(&kmem_va_meta_arena_store, "vmem-va", 0, 0, PAGE_SIZE, vmem_alloc, vmem_free, vm, 0, VM_NOSLEEP | VM_BOOTSTRAP | VM_LARGEIMPORT, IPL_VM); kmem_meta_arena = vmem_init(&kmem_meta_arena_store, "vmem-meta", 0, 0, PAGE_SIZE, uvm_km_kmem_alloc, uvm_km_kmem_free, kmem_va_meta_arena, 0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM); pool_init(&vmem_btag_pool, sizeof(bt_t), coherency_unit, 0, PR_PHINPAGE, "vmembt", &pool_allocator_vmem_meta, IPL_VM); vmem_btag_pool_initialized = true; } #endif /* defined(_KERNEL) */ static int vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, vm_flag_t flags, int spanbttype) { bt_t *btspan; bt_t *btfree; VMEM_ASSERT_LOCKED(vm); KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0); KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0); KASSERT(spanbttype == 
BT_TYPE_SPAN || spanbttype == BT_TYPE_SPAN_STATIC); btspan = bt_alloc(vm, flags); if (btspan == NULL) { return ENOMEM; } btfree = bt_alloc(vm, flags); if (btfree == NULL) { bt_free(vm, btspan); return ENOMEM; } btspan->bt_type = spanbttype; btspan->bt_start = addr; btspan->bt_size = size; btfree->bt_type = BT_TYPE_FREE; btfree->bt_start = addr; btfree->bt_size = size; bt_insseg_tail(vm, btspan); bt_insseg(vm, btfree, btspan); bt_insfree(vm, btfree); vm->vm_size += size; return 0; } static void vmem_destroy1(vmem_t *vm) { #if defined(QCACHE) qc_destroy(vm); #endif /* defined(QCACHE) */ VMEM_LOCK(vm); for (int i = 0; i < vm->vm_hashsize; i++) { bt_t *bt; while ((bt = LIST_FIRST(&vm->vm_hashlist[i])) != NULL) { KASSERT(bt->bt_type == BT_TYPE_SPAN_STATIC); LIST_REMOVE(bt, bt_hashlist); bt_free(vm, bt); } } /* bt_freetrim() drops the lock. */ bt_freetrim(vm, 0); if (vm->vm_hashlist != &vm->vm_hash0) { xfree(vm->vm_hashlist, sizeof(struct vmem_hashlist) * vm->vm_hashsize); } VMEM_CONDVAR_DESTROY(vm); VMEM_LOCK_DESTROY(vm); xfree(vm, sizeof(*vm)); } static int vmem_import(vmem_t *vm, vmem_size_t size, vm_flag_t flags) { vmem_addr_t addr; int rc; VMEM_ASSERT_LOCKED(vm); if (vm->vm_importfn == NULL) { return EINVAL; } if (vm->vm_flags & VM_LARGEIMPORT) { size *= 16; } VMEM_UNLOCK(vm); if (vm->vm_flags & VM_XIMPORT) { rc = __FPTRCAST(vmem_ximport_t *, vm->vm_importfn)(vm->vm_arg, size, &size, flags, &addr); } else { rc = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr); } VMEM_LOCK(vm); if (rc) { return ENOMEM; } if (vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN) != 0) { VMEM_UNLOCK(vm); (*vm->vm_releasefn)(vm->vm_arg, addr, size); VMEM_LOCK(vm); return ENOMEM; } return 0; } #if defined(_KERNEL) static int vmem_rehash(vmem_t *vm, size_t newhashsize, vm_flag_t flags) { bt_t *bt; int i; struct vmem_hashlist *newhashlist; struct vmem_hashlist *oldhashlist; size_t oldhashsize; KASSERT(newhashsize > 0); /* Round hash size up to a power of 2. */ newhashsize = 1 << (ilog2(newhashsize) + 1); newhashlist = xmalloc(sizeof(struct vmem_hashlist) * newhashsize, flags); if (newhashlist == NULL) { return ENOMEM; } for (i = 0; i < newhashsize; i++) { LIST_INIT(&newhashlist[i]); } VMEM_LOCK(vm); /* Decay back to a small hash slowly. */ if (vm->vm_maxbusytag >= 2) { vm->vm_maxbusytag = vm->vm_maxbusytag / 2 - 1; if (vm->vm_nbusytag > vm->vm_maxbusytag) { vm->vm_maxbusytag = vm->vm_nbusytag; } } else { vm->vm_maxbusytag = vm->vm_nbusytag; } oldhashlist = vm->vm_hashlist; oldhashsize = vm->vm_hashsize; vm->vm_hashlist = newhashlist; vm->vm_hashsize = newhashsize; vm->vm_hashmask = newhashsize - 1; if (oldhashlist == NULL) { VMEM_UNLOCK(vm); return 0; } for (i = 0; i < oldhashsize; i++) { while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) { bt_rembusy(vm, bt); /* XXX */ bt_insbusy(vm, bt); } } VMEM_UNLOCK(vm); if (oldhashlist != &vm->vm_hash0) { xfree(oldhashlist, sizeof(struct vmem_hashlist) * oldhashsize); } return 0; } #endif /* _KERNEL */ /* * vmem_fit: check if a bt can satisfy the given restrictions. * * it's a caller's responsibility to ensure the region is big enough * before calling us. */ static int vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align, vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr, vmem_addr_t maxaddr, vmem_addr_t *addrp) { vmem_addr_t start; vmem_addr_t end; KASSERT(size > 0); KASSERT(bt->bt_size >= size); /* caller's responsibility */ /* * XXX assumption: vmem_addr_t and vmem_size_t are * unsigned integer of the same size. 
*/ start = bt->bt_start; if (start < minaddr) { start = minaddr; } end = BT_END(bt); if (end > maxaddr) { end = maxaddr; } if (start > end) { return ENOMEM; } start = VMEM_ALIGNUP(start - phase, align) + phase; if (start < bt->bt_start) { start += align; } if (VMEM_CROSS_P(start, start + size - 1, nocross)) { KASSERT(align < nocross); start = VMEM_ALIGNUP(start - phase, nocross) + phase; } if (start <= end && end - start >= size - 1) { KASSERT((start & (align - 1)) == phase); KASSERT(!VMEM_CROSS_P(start, start + size - 1, nocross)); KASSERT(minaddr <= start); KASSERT(maxaddr == 0 || start + size - 1 <= maxaddr); KASSERT(bt->bt_start <= start); KASSERT(BT_END(bt) - start >= size - 1); *addrp = start; return 0; } return ENOMEM; } /* ---- vmem API */ /* * vmem_init: creates a vmem arena. */ vmem_t * vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_import_t *importfn, vmem_release_t *releasefn, vmem_t *arg, vmem_size_t qcache_max, vm_flag_t flags, int ipl) { int i; KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0); KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0); KASSERT(quantum > 0); KASSERT(powerof2(quantum)); /* * If private tags are going to be used, they must * be added to the arena before the first span is * added. */ KASSERT((flags & VM_PRIVTAGS) == 0 || size == 0); #if defined(_KERNEL) /* XXX: SMP, we get called early... */ if (!vmem_bootstrapped) { vmem_bootstrap(); } #endif /* defined(_KERNEL) */ if (vm == NULL) { vm = xmalloc(sizeof(*vm), flags); } if (vm == NULL) { return NULL; } VMEM_CONDVAR_INIT(vm, "vmem"); VMEM_LOCK_INIT(vm, ipl); vm->vm_flags = flags; vm->vm_nfreetags = 0; LIST_INIT(&vm->vm_freetags); strlcpy(vm->vm_name, name, sizeof(vm->vm_name)); vm->vm_quantum_mask = quantum - 1; vm->vm_quantum_shift = SIZE2ORDER(quantum); KASSERT(ORDER2SIZE(vm->vm_quantum_shift) == quantum); vm->vm_importfn = importfn; vm->vm_releasefn = releasefn; vm->vm_arg = arg; vm->vm_nbusytag = 0; vm->vm_maxbusytag = 0; vm->vm_size = 0; vm->vm_inuse = 0; #if defined(QCACHE) qc_init(vm, qcache_max, ipl); #endif /* defined(QCACHE) */ TAILQ_INIT(&vm->vm_seglist); for (i = 0; i < VMEM_MAXORDER; i++) { LIST_INIT(&vm->vm_freelist[i]); } memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0)); vm->vm_hashsize = 1; vm->vm_hashmask = vm->vm_hashsize - 1; vm->vm_hashlist = &vm->vm_hash0; if (size != 0) { if (vmem_add(vm, base, size, flags) != 0) { vmem_destroy1(vm); return NULL; } } #if defined(_KERNEL) if (flags & VM_BOOTSTRAP) { bt_refill(vm); } mutex_enter(&vmem_list_lock); LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist); mutex_exit(&vmem_list_lock); #endif /* defined(_KERNEL) */ return vm; } /* * vmem_create: create an arena. * * => must not be called from interrupt context. */ vmem_t * vmem_create(const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_import_t *importfn, vmem_release_t *releasefn, vmem_t *source, vmem_size_t qcache_max, vm_flag_t flags, int ipl) { KASSERT((flags & (VM_XIMPORT)) == 0); return vmem_init(NULL, name, base, size, quantum, importfn, releasefn, source, qcache_max, flags, ipl); } /* * vmem_xcreate: create an arena takes alternative import func. * * => must not be called from interrupt context. 
*/ vmem_t * vmem_xcreate(const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_ximport_t *importfn, vmem_release_t *releasefn, vmem_t *source, vmem_size_t qcache_max, vm_flag_t flags, int ipl) { KASSERT((flags & (VM_XIMPORT)) == 0); return vmem_init(NULL, name, base, size, quantum, __FPTRCAST(vmem_import_t *, importfn), releasefn, source, qcache_max, flags | VM_XIMPORT, ipl); } void vmem_destroy(vmem_t *vm) { #if defined(_KERNEL) mutex_enter(&vmem_list_lock); LIST_REMOVE(vm, vm_alllist); mutex_exit(&vmem_list_lock); #endif /* defined(_KERNEL) */ vmem_destroy1(vm); } vmem_size_t vmem_roundup_size(vmem_t *vm, vmem_size_t size) { return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask; } /* * vmem_alloc: allocate resource from the arena. */ int vmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags, vmem_addr_t *addrp) { const vm_flag_t strat __diagused = flags & VM_FITMASK; int error; KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0); KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0); KASSERT(size > 0); KASSERT(strat == VM_BESTFIT || strat == VM_INSTANTFIT); if ((flags & VM_SLEEP) != 0) { ASSERT_SLEEPABLE(); } #if defined(QCACHE) if (size <= vm->vm_qcache_max) { void *p; int qidx = (size + vm->vm_quantum_mask) >> vm->vm_quantum_shift; qcache_t *qc = vm->vm_qcache[qidx - 1]; p = pool_cache_get(qc->qc_cache, vmf_to_prf(flags)); if (addrp != NULL) *addrp = (vmem_addr_t)p; error = (p == NULL) ? ENOMEM : 0; goto out; } #endif /* defined(QCACHE) */ error = vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, addrp); #if defined(QCACHE) out: #endif /* defined(QCACHE) */ KASSERTMSG(error || addrp == NULL || (*addrp & vm->vm_quantum_mask) == 0, "vmem %s mask=0x%jx addr=0x%jx", vm->vm_name, (uintmax_t)vm->vm_quantum_mask, (uintmax_t)*addrp); KASSERT(error == 0 || (flags & VM_SLEEP) == 0); return error; } int vmem_xalloc_addr(vmem_t *vm, const vmem_addr_t addr, const vmem_size_t size, vm_flag_t flags) { vmem_addr_t result; int error; KASSERT((addr & vm->vm_quantum_mask) == 0); KASSERT(size != 0); flags = (flags & ~VM_INSTANTFIT) | VM_BESTFIT; error = vmem_xalloc(vm, size, 0, 0, 0, addr, addr + size - 1, flags, &result); KASSERT(error || result == addr); KASSERT(error == 0 || (flags & VM_SLEEP) == 0); return error; } int vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align, const vmem_size_t phase, const vmem_size_t nocross, const vmem_addr_t minaddr, const vmem_addr_t maxaddr, const vm_flag_t flags, vmem_addr_t *addrp) { struct vmem_freelist *list; struct vmem_freelist *first; struct vmem_freelist *end; bt_t *bt; bt_t *btnew; bt_t *btnew2; const vmem_size_t size = vmem_roundup_size(vm, size0); vm_flag_t strat = flags & VM_FITMASK; vmem_addr_t start; int rc; KASSERT(size0 > 0); KASSERT(size > 0); KASSERT(strat == VM_BESTFIT || strat == VM_INSTANTFIT); if ((flags & VM_SLEEP) != 0) { ASSERT_SLEEPABLE(); } KASSERT((align & vm->vm_quantum_mask) == 0); KASSERT((align & (align - 1)) == 0); KASSERT((phase & vm->vm_quantum_mask) == 0); KASSERT((nocross & vm->vm_quantum_mask) == 0); KASSERT((nocross & (nocross - 1)) == 0); KASSERT(align == 0 || phase < align); KASSERT(phase == 0 || phase < align); KASSERT(nocross == 0 || nocross >= size); KASSERT(minaddr <= maxaddr); KASSERT(!VMEM_CROSS_P(phase, phase + size - 1, nocross)); if (align == 0) { align = vm->vm_quantum_mask + 1; } /* * allocate boundary tags before acquiring the vmem lock. 
*/ VMEM_LOCK(vm); btnew = bt_alloc(vm, flags); if (btnew == NULL) { VMEM_UNLOCK(vm); return ENOMEM; } btnew2 = bt_alloc(vm, flags); /* XXX not necessary if no restrictions */ if (btnew2 == NULL) { bt_free(vm, btnew); VMEM_UNLOCK(vm); return ENOMEM; } /* * choose a free block from which we allocate. */ retry_strat: first = bt_freehead_toalloc(vm, size, strat); end = &vm->vm_freelist[VMEM_MAXORDER]; retry: bt = NULL; vmem_check(vm); if (strat == VM_INSTANTFIT) { /* * just choose the first block which satisfies our restrictions. * * note that we don't need to check the size of the blocks * because any blocks found on these list should be larger than * the given size. */ for (list = first; list < end; list++) { bt = LIST_FIRST(list); if (bt != NULL) { rc = vmem_fit(bt, size, align, phase, nocross, minaddr, maxaddr, &start); if (rc == 0) { goto gotit; } /* * don't bother to follow the bt_freelist link * here. the list can be very long and we are * told to run fast. blocks from the later free * lists are larger and have better chances to * satisfy our restrictions. */ } } } else { /* VM_BESTFIT */ /* * we assume that, for space efficiency, it's better to * allocate from a smaller block. thus we will start searching * from the lower-order list than VM_INSTANTFIT. * however, don't bother to find the smallest block in a free * list because the list can be very long. we can revisit it * if/when it turns out to be a problem. * * note that the 'first' list can contain blocks smaller than * the requested size. thus we need to check bt_size. */ for (list = first; list < end; list++) { LIST_FOREACH(bt, list, bt_freelist) { if (bt->bt_size >= size) { rc = vmem_fit(bt, size, align, phase, nocross, minaddr, maxaddr, &start); if (rc == 0) { goto gotit; } } } } } #if 1 if (strat == VM_INSTANTFIT) { strat = VM_BESTFIT; goto retry_strat; } #endif if (align != vm->vm_quantum_mask + 1 || phase != 0 || nocross != 0) { /* * XXX should try to import a region large enough to * satisfy restrictions? 
*/ goto fail; } /* XXX eeek, minaddr & maxaddr not respected */ if (vmem_import(vm, size, flags) == 0) { goto retry; } /* XXX */ if ((flags & VM_SLEEP) != 0) { vmem_kick_pdaemon(); VMEM_CONDVAR_WAIT(vm); goto retry; } fail: bt_free(vm, btnew); bt_free(vm, btnew2); VMEM_UNLOCK(vm); return ENOMEM; gotit: KASSERT(bt->bt_type == BT_TYPE_FREE); KASSERT(bt->bt_size >= size); bt_remfree(vm, bt); vmem_check(vm); if (bt->bt_start != start) { btnew2->bt_type = BT_TYPE_FREE; btnew2->bt_start = bt->bt_start; btnew2->bt_size = start - bt->bt_start; bt->bt_start = start; bt->bt_size -= btnew2->bt_size; bt_insfree(vm, btnew2); bt_insseg(vm, btnew2, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); btnew2 = NULL; vmem_check(vm); } KASSERT(bt->bt_start == start); if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) { /* split */ btnew->bt_type = BT_TYPE_BUSY; btnew->bt_start = bt->bt_start; btnew->bt_size = size; bt->bt_start = bt->bt_start + size; bt->bt_size -= size; bt_insfree(vm, bt); bt_insseg(vm, btnew, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); bt_insbusy(vm, btnew); vmem_check(vm); } else { bt->bt_type = BT_TYPE_BUSY; bt_insbusy(vm, bt); vmem_check(vm); bt_free(vm, btnew); btnew = bt; } if (btnew2 != NULL) { bt_free(vm, btnew2); } KASSERT(btnew->bt_size >= size); btnew->bt_type = BT_TYPE_BUSY; if (addrp != NULL) *addrp = btnew->bt_start; VMEM_UNLOCK(vm); KASSERTMSG(addrp == NULL || (*addrp & vm->vm_quantum_mask) == 0, "vmem %s mask=0x%jx addr=0x%jx", vm->vm_name, (uintmax_t)vm->vm_quantum_mask, (uintmax_t)*addrp); return 0; } /* * vmem_free: free the resource to the arena. */ void vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { KASSERT(size > 0); KASSERTMSG((addr & vm->vm_quantum_mask) == 0, "vmem %s mask=0x%jx addr=0x%jx", vm->vm_name, (uintmax_t)vm->vm_quantum_mask, (uintmax_t)addr); #if defined(QCACHE) if (size <= vm->vm_qcache_max) { int qidx = (size + vm->vm_quantum_mask) >> vm->vm_quantum_shift; qcache_t *qc = vm->vm_qcache[qidx - 1]; pool_cache_put(qc->qc_cache, (void *)addr); return; } #endif /* defined(QCACHE) */ vmem_xfree(vm, addr, size); } void vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { bt_t *bt; KASSERT(size > 0); KASSERTMSG((addr & vm->vm_quantum_mask) == 0, "vmem %s mask=0x%jx addr=0x%jx", vm->vm_name, (uintmax_t)vm->vm_quantum_mask, (uintmax_t)addr); VMEM_LOCK(vm); bt = bt_lookupbusy(vm, addr); KASSERTMSG(bt != NULL, "vmem %s addr 0x%jx size 0x%jx", vm->vm_name, (uintmax_t)addr, (uintmax_t)size); KASSERT(bt->bt_start == addr); KASSERT(bt->bt_size == vmem_roundup_size(vm, size) || bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask); /* vmem_xfree_bt() drops the lock. */ vmem_xfree_bt(vm, bt); } void vmem_xfreeall(vmem_t *vm) { bt_t *bt; #if defined(QCACHE) /* This can't be used if the arena has a quantum cache. */ KASSERT(vm->vm_qcache_max == 0); #endif /* defined(QCACHE) */ for (;;) { VMEM_LOCK(vm); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (bt->bt_type == BT_TYPE_BUSY) break; } if (bt != NULL) { /* vmem_xfree_bt() drops the lock. 
*/ vmem_xfree_bt(vm, bt); } else { VMEM_UNLOCK(vm); return; } } } static void vmem_xfree_bt(vmem_t *vm, bt_t *bt) { bt_t *t; VMEM_ASSERT_LOCKED(vm); KASSERT(bt->bt_type == BT_TYPE_BUSY); bt_rembusy(vm, bt); bt->bt_type = BT_TYPE_FREE; /* coalesce */ t = TAILQ_NEXT(bt, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { KASSERT(BT_END(bt) < t->bt_start); /* YYY */ bt_remfree(vm, t); bt_remseg(vm, t); bt->bt_size += t->bt_size; bt_free(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { KASSERT(BT_END(t) < bt->bt_start); /* YYY */ bt_remfree(vm, t); bt_remseg(vm, t); bt->bt_size += t->bt_size; bt->bt_start = t->bt_start; bt_free(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); KASSERT(t != NULL); KASSERT(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY); if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN && t->bt_size == bt->bt_size) { vmem_addr_t spanaddr; vmem_size_t spansize; KASSERT(t->bt_start == bt->bt_start); spanaddr = bt->bt_start; spansize = bt->bt_size; bt_remseg(vm, bt); bt_free(vm, bt); bt_remseg(vm, t); bt_free(vm, t); vm->vm_size -= spansize; VMEM_CONDVAR_BROADCAST(vm); /* bt_freetrim() drops the lock. */ bt_freetrim(vm, BT_MAXFREE); (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize); } else { bt_insfree(vm, bt); VMEM_CONDVAR_BROADCAST(vm); /* bt_freetrim() drops the lock. */ bt_freetrim(vm, BT_MAXFREE); } } /* * vmem_add: * * => caller must ensure appropriate spl, * if the arena can be accessed from interrupt context. */ int vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, vm_flag_t flags) { int rv; VMEM_LOCK(vm); rv = vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN_STATIC); VMEM_UNLOCK(vm); return rv; } /* * vmem_size: information about arenas size * * => return free/allocated size in arena */ vmem_size_t vmem_size(vmem_t *vm, int typemask) { switch (typemask) { case VMEM_ALLOC: return vm->vm_inuse; case VMEM_FREE: return vm->vm_size - vm->vm_inuse; case VMEM_FREE|VMEM_ALLOC: return vm->vm_size; default: panic("vmem_size"); } } /* ---- rehash */ #if defined(_KERNEL) static struct callout vmem_rehash_ch; static int vmem_rehash_interval; static struct workqueue *vmem_rehash_wq; static struct work vmem_rehash_wk; static void vmem_rehash_all(struct work *wk, void *dummy) { vmem_t *vm; KASSERT(wk == &vmem_rehash_wk); mutex_enter(&vmem_list_lock); LIST_FOREACH(vm, &vmem_list, vm_alllist) { size_t desired; size_t current; desired = atomic_load_relaxed(&vm->vm_maxbusytag); current = atomic_load_relaxed(&vm->vm_hashsize); if (desired > VMEM_HASHSIZE_MAX) { desired = VMEM_HASHSIZE_MAX; } else if (desired < VMEM_HASHSIZE_MIN) { desired = VMEM_HASHSIZE_MIN; } if (desired > current * 2 || desired * 2 < current) { vmem_rehash(vm, desired, VM_NOSLEEP); } } mutex_exit(&vmem_list_lock); callout_schedule(&vmem_rehash_ch, vmem_rehash_interval); } static void vmem_rehash_all_kick(void *dummy) { workqueue_enqueue(vmem_rehash_wq, &vmem_rehash_wk, NULL); } void vmem_rehash_start(void) { int error; error = workqueue_create(&vmem_rehash_wq, "vmem_rehash", vmem_rehash_all, NULL, PRI_VM, IPL_SOFTCLOCK, WQ_MPSAFE); if (error) { panic("%s: workqueue_create %d\n", __func__, error); } callout_init(&vmem_rehash_ch, CALLOUT_MPSAFE); callout_setfunc(&vmem_rehash_ch, vmem_rehash_all_kick, NULL); vmem_rehash_interval = hz * 10; callout_schedule(&vmem_rehash_ch, vmem_rehash_interval); } #endif /* defined(_KERNEL) */ /* ---- debug */ #if defined(DDB) || defined(UNITTEST) || defined(VMEM_SANITY) static void bt_dump(const bt_t *, void 
(*)(const char *, ...) __printflike(1, 2)); static const char * bt_type_string(int type) { static const char * const table[] = { [BT_TYPE_BUSY] = "busy", [BT_TYPE_FREE] = "free", [BT_TYPE_SPAN] = "span", [BT_TYPE_SPAN_STATIC] = "static span", }; if (type >= __arraycount(table)) { return "BOGUS"; } return table[type]; } static void bt_dump(const bt_t *bt, void (*pr)(const char *, ...)) { (*pr)("\t%p: %" PRIu64 ", %" PRIu64 ", %d(%s)\n", bt, (uint64_t)bt->bt_start, (uint64_t)bt->bt_size, bt->bt_type, bt_type_string(bt->bt_type)); } static void vmem_dump(const vmem_t *vm , void (*pr)(const char *, ...) __printflike(1, 2)) { const bt_t *bt; int i; (*pr)("vmem %p '%s'\n", vm, vm->vm_name); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { bt_dump(bt, pr); } for (i = 0; i < VMEM_MAXORDER; i++) { const struct vmem_freelist *fl = &vm->vm_freelist[i]; if (LIST_EMPTY(fl)) { continue; } (*pr)("freelist[%d]\n", i); LIST_FOREACH(bt, fl, bt_freelist) { bt_dump(bt, pr); } } } #endif /* defined(DDB) || defined(UNITTEST) || defined(VMEM_SANITY) */ #if defined(DDB) static bt_t * vmem_whatis_lookup(vmem_t *vm, uintptr_t addr) { bt_t *bt; TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (BT_ISSPAN_P(bt)) { continue; } if (bt->bt_start <= addr && addr <= BT_END(bt)) { return bt; } } return NULL; } void vmem_whatis(uintptr_t addr, void (*pr)(const char *, ...)) { vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { bt_t *bt; bt = vmem_whatis_lookup(vm, addr); if (bt == NULL) { continue; } (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n", (void *)addr, (void *)bt->bt_start, (size_t)(addr - bt->bt_start), vm->vm_name, (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free"); } } void vmem_printall(const char *modif, void (*pr)(const char *, ...)) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { vmem_dump(vm, pr); } } void vmem_print(uintptr_t addr, const char *modif, void (*pr)(const char *, ...)) { const vmem_t *vm = (const void *)addr; vmem_dump(vm, pr); } #endif /* defined(DDB) */ #if defined(_KERNEL) #define vmem_printf printf #else #include <stdio.h> #include <stdarg.h> static void vmem_printf(const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); } #endif #if defined(VMEM_SANITY) static bool vmem_check_sanity(vmem_t *vm) { const bt_t *bt, *bt2; KASSERT(vm != NULL); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (bt->bt_start > BT_END(bt)) { printf("corrupted tag\n"); bt_dump(bt, vmem_printf); return false; } } TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) { if (bt == bt2) { continue; } if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) { continue; } if (bt->bt_start <= BT_END(bt2) && bt2->bt_start <= BT_END(bt)) { printf("overwrapped tags\n"); bt_dump(bt, vmem_printf); bt_dump(bt2, vmem_printf); return false; } } } return true; } static void vmem_check(vmem_t *vm) { if (!vmem_check_sanity(vm)) { panic("insanity vmem %p", vm); } } #endif /* defined(VMEM_SANITY) */ #if defined(UNITTEST) int main(void) { int rc; vmem_t *vm; vmem_addr_t p; struct reg { vmem_addr_t p; vmem_size_t sz; bool x; } *reg = NULL; int nreg = 0; int nalloc = 0; int nfree = 0; vmem_size_t total = 0; #if 1 vm_flag_t strat = VM_INSTANTFIT; #else vm_flag_t strat = VM_BESTFIT; #endif vm = vmem_create("test", 0, 0, 1, NULL, NULL, NULL, 0, VM_SLEEP, #ifdef _KERNEL IPL_NONE #else 0 #endif ); if (vm == NULL) { printf("vmem_create\n"); exit(EXIT_FAILURE); } vmem_dump(vm, vmem_printf); rc = vmem_add(vm, 0, 50, VM_SLEEP); assert(rc == 0); rc = vmem_add(vm, 100, 200, VM_SLEEP); assert(rc == 0); rc = vmem_add(vm, 2000, 1, VM_SLEEP); assert(rc == 0); rc = vmem_add(vm, 40000, 65536, VM_SLEEP); assert(rc == 0); rc = vmem_add(vm, 10000, 10000, VM_SLEEP); assert(rc == 0); rc = vmem_add(vm, 500, 1000, VM_SLEEP); assert(rc == 0); rc = vmem_add(vm, 0xffffff00, 0x100, VM_SLEEP); assert(rc == 0); rc = vmem_xalloc(vm, 0x101, 0, 0, 0, 0xffffff00, 0xffffffff, strat|VM_SLEEP, &p); assert(rc != 0); rc = vmem_xalloc(vm, 50, 0, 0, 0, 0, 49, strat|VM_SLEEP, &p); assert(rc == 0 && p == 0); vmem_xfree(vm, p, 50); rc = vmem_xalloc(vm, 25, 0, 0, 0, 0, 24, strat|VM_SLEEP, &p); assert(rc == 0 && p == 0); rc = vmem_xalloc(vm, 0x100, 0, 0, 0, 0xffffff01, 0xffffffff, strat|VM_SLEEP, &p); assert(rc != 0); rc = vmem_xalloc(vm, 0x100, 0, 0, 0, 0xffffff00, 0xfffffffe, strat|VM_SLEEP, &p); assert(rc != 0); rc = vmem_xalloc(vm, 0x100, 0, 0, 0, 0xffffff00, 0xffffffff, strat|VM_SLEEP, &p); assert(rc == 0); vmem_dump(vm, vmem_printf); for (;;) { struct reg *r; int t = rand() % 100; if (t > 45) { /* alloc */ vmem_size_t sz = rand() % 500 + 1; bool x; vmem_size_t align, phase, nocross; vmem_addr_t minaddr, maxaddr; if (t > 70) { x = true; /* XXX */ align = 1 << (rand() % 15); phase = rand() % 65536; nocross = 1 << (rand() % 15); if (align <= phase) { phase = 0; } if (VMEM_CROSS_P(phase, phase + sz - 1, nocross)) { nocross = 0; } do { minaddr = rand() % 50000; maxaddr = rand() % 70000; } while (minaddr > maxaddr); printf("=== xalloc %" PRIu64 " align=%" PRIu64 ", phase=%" PRIu64 ", nocross=%" PRIu64 ", min=%" PRIu64 ", max=%" PRIu64 "\n", (uint64_t)sz, (uint64_t)align, (uint64_t)phase, (uint64_t)nocross, (uint64_t)minaddr, (uint64_t)maxaddr); rc = vmem_xalloc(vm, sz, align, phase, nocross, minaddr, maxaddr, strat|VM_SLEEP, &p); } else { x = false; printf("=== alloc %" PRIu64 "\n", (uint64_t)sz); rc = vmem_alloc(vm, sz, strat|VM_SLEEP, &p); } printf("-> %" PRIu64 "\n", (uint64_t)p); vmem_dump(vm, vmem_printf); if (rc != 0) { if (x) { continue; } break; } nreg++; reg = realloc(reg, sizeof(*reg) * nreg); r = &reg[nreg - 1]; r->p = p; r->sz = sz; r->x = x; total += sz; nalloc++; } else if (nreg != 0) { /* free 
*/ r = &reg[rand() % nreg]; printf("=== free %" PRIu64 ", %" PRIu64 "\n", (uint64_t)r->p, (uint64_t)r->sz); if (r->x) { vmem_xfree(vm, r->p, r->sz); } else { vmem_free(vm, r->p, r->sz); } total -= r->sz; vmem_dump(vm, vmem_printf); *r = reg[nreg - 1]; nreg--; nfree++; } printf("total=%" PRIu64 "\n", (uint64_t)total); } fprintf(stderr, "total=%" PRIu64 ", nalloc=%d, nfree=%d\n", (uint64_t)total, nalloc, nfree); exit(EXIT_SUCCESS); } #endif /* defined(UNITTEST) */
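/*
 * Illustrative sketch, not part of the original file: the placement
 * arithmetic used by vmem_fit() above, reduced to a standalone
 * user-space program.  The helpers alignup() and crosses() are
 * invented names assumed to behave like the kernel's VMEM_ALIGNUP and
 * VMEM_CROSS_P macros (round up to a power-of-two "align"; test
 * whether two addresses fall in different "nocross"-sized blocks).
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t addr_t;

/* Round addr up to the next multiple of the power-of-two align. */
static addr_t
alignup(addr_t addr, addr_t align)
{

	return (addr + align - 1) & ~(align - 1);
}

/* Does the range [a1, a2] span a boundary of the given power-of-two size? */
static int
crosses(addr_t a1, addr_t a2, addr_t boundary)
{

	return boundary != 0 && ((a1 ^ a2) & ~(boundary - 1)) != 0;
}

int
main(void)
{
	addr_t start = 0x1234;	/* first usable address in a free segment */
	addr_t size = 0x30, align = 0x100, phase = 0x10, nocross = 0x1000;
	addr_t a;

	/* Same sequence as vmem_fit(): align with a phase offset... */
	a = alignup(start - phase, align) + phase;
	if (a < start)
		a += align;
	/* ...then skip past a nocross boundary if the range would span one. */
	if (crosses(a, a + size - 1, nocross))
		a = alignup(a - phase, nocross) + phase;

	assert((a & (align - 1)) == phase);
	assert(!crosses(a, a + size - 1, nocross));
	printf("placed 0x%llx..0x%llx\n",
	    (unsigned long long)a, (unsigned long long)(a + size - 1));
	return 0;
}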
2 34 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 /* $NetBSD: signalvar.h,v 1.104 2021/11/01 05:07:17 thorpej Exp $ */ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)signalvar.h 8.6 (Berkeley) 2/19/95 */ #ifndef _SYS_SIGNALVAR_H_ #define _SYS_SIGNALVAR_H_ #include <sys/siginfo.h> #include <sys/queue.h> #include <sys/mutex.h> #include <sys/stdbool.h> #ifndef _KERNEL #include <string.h> /* Required for memset(3) and memcpy(3) prototypes */ #endif /* _KERNEL */ /* * Kernel signal definitions and data structures, * not exported to user programs. */ /* * Queue of signals. */ typedef TAILQ_HEAD(ksiginfoq, ksiginfo) ksiginfoq_t; /* * Process signal actions, possibly shared between processes. 
*/ struct sigacts { struct sigact_sigdesc { struct sigaction sd_sigact; const void *sd_tramp; int sd_vers; } sa_sigdesc[NSIG]; /* disposition of signals */ int sa_refcnt; /* reference count */ kmutex_t sa_mutex; /* lock on sa_refcnt */ }; /* * Pending signals, per LWP and per process. */ typedef struct sigpend { ksiginfoq_t sp_info; sigset_t sp_set; } sigpend_t; /* * Process signal state. */ struct sigctx { struct _ksiginfo ps_info; /* for core dump/debugger XXX */ int ps_lwp; /* for core dump/debugger XXX */ bool ps_faked; /* for core dump/debugger XXX */ void *ps_sigcode; /* address of signal trampoline */ sigset_t ps_sigignore; /* Signals being ignored. */ sigset_t ps_sigcatch; /* Signals being caught by user. */ sigset_t ps_sigpass; /* Signals evading the debugger. */ }; /* additional signal action values, used only temporarily/internally */ #define SIG_CATCH (void (*)(int))2 /* * get signal action for process and signal; currently only for current process */ #define SIGACTION(p, sig) (p->p_sigacts->sa_sigdesc[(sig)].sd_sigact) #define SIGACTION_PS(ps, sig) (ps->sa_sigdesc[(sig)].sd_sigact) /* * Copy a sigaction structure without padding. */ static __inline void sigaction_copy(struct sigaction *dst, const struct sigaction *src) { memset(dst, 0, sizeof(*dst)); dst->_sa_u._sa_handler = src->_sa_u._sa_handler; memcpy(&dst->sa_mask, &src->sa_mask, sizeof(dst->sa_mask)); dst->sa_flags = src->sa_flags; } /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SA_KILL 0x0001 /* terminates process by default */ #define SA_CORE 0x0002 /* ditto and coredumps */ #define SA_STOP 0x0004 /* suspend process */ #define SA_TTYSTOP 0x0008 /* ditto, from tty */ #define SA_IGNORE 0x0010 /* ignore by default */ #define SA_CONT 0x0020 /* continue if suspended */ #define SA_CANTMASK 0x0040 /* non-maskable, catchable */ #define SA_NORESET 0x0080 /* not reset when caught */ #define SA_TOLWP 0x0100 /* to LWP that generated, if local */ #define SA_TOALL 0x0200 /* always to all LWPs */ #ifdef _KERNEL #include <sys/systm.h> /* for copyin_t/copyout_t */ extern sigset_t contsigmask, stopsigmask, sigcantmask; struct vnode; struct coredump_iostate; /* * Machine-independent functions: */ int coredump_netbsd(struct lwp *, struct coredump_iostate *); int coredump_netbsd32(struct lwp *, struct coredump_iostate *); int real_coredump_netbsd(struct lwp *, struct coredump_iostate *); void execsigs(struct proc *); int issignal(struct lwp *); void pgsignal(struct pgrp *, int, int); void kpgsignal(struct pgrp *, struct ksiginfo *, void *, int); void postsig(int); void psignal(struct proc *, int); void kpsignal(struct proc *, struct ksiginfo *, void *); void child_psignal(struct proc *, int); void siginit(struct proc *); void trapsignal(struct lwp *, struct ksiginfo *); void sigexit(struct lwp *, int) __dead; void killproc(struct proc *, const char *); void setsigvec(struct proc *, int, struct sigaction *); int killpg1(struct lwp *, struct ksiginfo *, int, int); void proc_unstop(struct proc *p); void eventswitch(int, int, int); void eventswitchchild(struct proc *, int, int); int sigaction1(struct lwp *, int, const struct sigaction *, struct sigaction *, const void *, int); int sigprocmask1(struct lwp *, int, const sigset_t *, sigset_t *); void sigpending1(struct lwp *, sigset_t *); void sigsuspendsetup(struct lwp *, const sigset_t *); void sigsuspendteardown(struct lwp *); int sigsuspend1(struct lwp *, const sigset_t *); int 
sigaltstack1(struct lwp *, const stack_t *, stack_t *); int sigismasked(struct lwp *, int); int sigget(sigpend_t *, ksiginfo_t *, int, const sigset_t *); void sigclear(sigpend_t *, const sigset_t *, ksiginfoq_t *); void sigclearall(struct proc *, const sigset_t *, ksiginfoq_t *); int kpsignal2(struct proc *, ksiginfo_t *); void signal_init(void); struct sigacts *sigactsinit(struct proc *, int); void sigactsunshare(struct proc *); void sigactsfree(struct sigacts *); void kpsendsig(struct lwp *, const struct ksiginfo *, const sigset_t *); void sendsig_reset(struct lwp *, int); void sendsig(const struct ksiginfo *, const sigset_t *); ksiginfo_t *ksiginfo_alloc(struct proc *, ksiginfo_t *, int); void ksiginfo_free(ksiginfo_t *); void ksiginfo_queue_drain0(ksiginfoq_t *); struct sys_____sigtimedwait50_args; int sigtimedwait1(struct lwp *, const struct sys_____sigtimedwait50_args *, register_t *, copyin_t, copyout_t, copyin_t, copyout_t); void signotify(struct lwp *); int sigispending(struct lwp *, int); /* * Machine-dependent functions: */ void sendsig_sigcontext(const struct ksiginfo *, const sigset_t *); void sendsig_siginfo(const struct ksiginfo *, const sigset_t *); extern struct pool ksiginfo_pool; /* * firstsig: * * Return the first signal in a signal set. */ static __inline int firstsig(const sigset_t *ss) { int sig; sig = ffs(ss->__bits[0]); if (sig != 0) return (sig); #if NSIG > 33 sig = ffs(ss->__bits[1]); if (sig != 0) return (sig + 32); #endif #if NSIG > 65 sig = ffs(ss->__bits[2]); if (sig != 0) return (sig + 64); #endif #if NSIG > 97 sig = ffs(ss->__bits[3]); if (sig != 0) return (sig + 96); #endif return (0); } static __inline void ksiginfo_queue_init(ksiginfoq_t *kq) { TAILQ_INIT(kq); } static __inline void ksiginfo_queue_drain(ksiginfoq_t *kq) { if (!TAILQ_EMPTY(kq)) ksiginfo_queue_drain0(kq); } #endif /* _KERNEL */ #ifdef _KERNEL #ifdef SIGPROP const int sigprop[NSIG] = { 0, /* 0 unused */ SA_KILL, /* 1 SIGHUP */ SA_KILL, /* 2 SIGINT */ SA_KILL|SA_CORE, /* 3 SIGQUIT */ SA_KILL|SA_CORE|SA_NORESET|SA_TOLWP, /* 4 SIGILL */ SA_KILL|SA_CORE|SA_NORESET|SA_TOLWP, /* 5 SIGTRAP */ SA_KILL|SA_CORE, /* 6 SIGABRT */ SA_KILL|SA_CORE|SA_TOLWP, /* 7 SIGEMT */ SA_KILL|SA_CORE|SA_TOLWP, /* 8 SIGFPE */ SA_KILL|SA_CANTMASK|SA_TOALL, /* 9 SIGKILL */ SA_KILL|SA_CORE|SA_TOLWP, /* 10 SIGBUS */ SA_KILL|SA_CORE|SA_TOLWP, /* 11 SIGSEGV */ SA_KILL|SA_CORE|SA_TOLWP, /* 12 SIGSYS */ SA_KILL, /* 13 SIGPIPE */ SA_KILL, /* 14 SIGALRM */ SA_KILL, /* 15 SIGTERM */ SA_IGNORE, /* 16 SIGURG */ SA_STOP|SA_CANTMASK|SA_TOALL, /* 17 SIGSTOP */ SA_STOP|SA_TTYSTOP|SA_TOALL, /* 18 SIGTSTP */ SA_IGNORE|SA_CONT|SA_TOALL, /* 19 SIGCONT */ SA_IGNORE, /* 20 SIGCHLD */ SA_STOP|SA_TTYSTOP|SA_TOALL, /* 21 SIGTTIN */ SA_STOP|SA_TTYSTOP|SA_TOALL, /* 22 SIGTTOU */ SA_IGNORE, /* 23 SIGIO */ SA_KILL, /* 24 SIGXCPU */ SA_KILL, /* 25 SIGXFSZ */ SA_KILL, /* 26 SIGVTALRM */ SA_KILL, /* 27 SIGPROF */ SA_IGNORE, /* 28 SIGWINCH */ SA_IGNORE, /* 29 SIGINFO */ SA_KILL, /* 30 SIGUSR1 */ SA_KILL, /* 31 SIGUSR2 */ SA_IGNORE|SA_NORESET, /* 32 SIGPWR */ SA_KILL, /* 33 SIGRTMIN + 0 */ SA_KILL, /* 34 SIGRTMIN + 1 */ SA_KILL, /* 35 SIGRTMIN + 2 */ SA_KILL, /* 36 SIGRTMIN + 3 */ SA_KILL, /* 37 SIGRTMIN + 4 */ SA_KILL, /* 38 SIGRTMIN + 5 */ SA_KILL, /* 39 SIGRTMIN + 6 */ SA_KILL, /* 40 SIGRTMIN + 7 */ SA_KILL, /* 41 SIGRTMIN + 8 */ SA_KILL, /* 42 SIGRTMIN + 9 */ SA_KILL, /* 43 SIGRTMIN + 10 */ SA_KILL, /* 44 SIGRTMIN + 11 */ SA_KILL, /* 45 SIGRTMIN + 12 */ SA_KILL, /* 46 SIGRTMIN + 13 */ SA_KILL, /* 47 SIGRTMIN + 14 */ SA_KILL, /* 48 SIGRTMIN + 15 */ 
SA_KILL, /* 49 SIGRTMIN + 16 */ SA_KILL, /* 50 SIGRTMIN + 17 */ SA_KILL, /* 51 SIGRTMIN + 18 */ SA_KILL, /* 52 SIGRTMIN + 19 */ SA_KILL, /* 53 SIGRTMIN + 20 */ SA_KILL, /* 54 SIGRTMIN + 21 */ SA_KILL, /* 55 SIGRTMIN + 22 */ SA_KILL, /* 56 SIGRTMIN + 23 */ SA_KILL, /* 57 SIGRTMIN + 24 */ SA_KILL, /* 58 SIGRTMIN + 25 */ SA_KILL, /* 59 SIGRTMIN + 26 */ SA_KILL, /* 60 SIGRTMIN + 27 */ SA_KILL, /* 61 SIGRTMIN + 28 */ SA_KILL, /* 62 SIGRTMIN + 29 */ SA_KILL, /* 63 SIGRTMIN + 30 */ }; #undef SIGPROP #else extern const int sigprop[NSIG]; #endif /* SIGPROP */ #endif /* _KERNEL */ #endif /* !_SYS_SIGNALVAR_H_ */
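/*
 * Illustrative sketch, not part of the header: one way a consumer of
 * the sigprop[] table above could turn the property bits into a
 * human-readable default action.  The decode() helper, its strings and
 * the sample entries are invented for the example; in the kernel the
 * equivalent decisions are made in the signal delivery path.  The SA_*
 * values mirror the definitions earlier in this header.
 */
#include <stdio.h>

#define SA_KILL		0x0001	/* terminates process by default */
#define SA_CORE		0x0002	/* ditto and coredumps */
#define SA_STOP		0x0004	/* suspend process */
#define SA_IGNORE	0x0010	/* ignore by default */
#define SA_CONT		0x0020	/* continue if suspended */

static const char *
decode(int prop)
{

	if (prop & SA_CORE)
		return "terminate with a core dump";
	if (prop & SA_KILL)
		return "terminate";
	if (prop & SA_STOP)
		return "stop";
	if (prop & SA_CONT)
		return "continue if stopped";
	if (prop & SA_IGNORE)
		return "ignore";
	return "no default action";
}

int
main(void)
{
	/* A few entries mirroring the table: SIGQUIT, SIGSTOP, SIGCHLD. */
	static const struct { const char *name; int prop; } sig[] = {
		{ "SIGQUIT", SA_KILL|SA_CORE },
		{ "SIGSTOP", SA_STOP },
		{ "SIGCHLD", SA_IGNORE },
	};

	for (size_t i = 0; i < sizeof(sig) / sizeof(sig[0]); i++)
		printf("%s: %s\n", sig[i].name, decode(sig[i].prop));
	return 0;
}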
165 35 21 166 191 191 80 53 80 37 123 160 162 1341 1338 1336 78 162 162 162 162 161 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 /* $NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $ */ /*- * Copyright (c) 2008-2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran and Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * x86 pmap(9) module: TLB shootdowns. * * TLB shootdowns are hard interrupts that operate outside the SPL framework. * They do not need to be blocked, provided that the pmap module gets the * order of events correct. The calls are made by poking the LAPIC directly. * The interrupt handler is short and does one of the following: invalidate * a set of pages, all user TLB entries or the entire TLB. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/intr.h> #include <uvm/uvm.h> #include <machine/cpuvar.h> #include <machine/pmap_private.h> #ifdef XENPV #include <xen/xenpmap.h> #endif /* XENPV */ #include <x86/i82489reg.h> #include <x86/i82489var.h> /* * TLB shootdown packet. Each CPU has a copy of this packet, where we build * sets of TLB shootdowns. If shootdowns need to occur on remote CPUs, the * packet is copied into a shared mailbox kept on the initiator's kernel * stack. Once the copy is made, no further updates to the mailbox are made * until the request is completed. This keeps the cache line in the shared * state, and bus traffic to a minimum. * * In order to make maximal use of the available space, control fields are * overlaid into the lower 12 bits of the first 4 virtual addresses. This * is very ugly, but it counts. * * On i386 the packet is 64 bytes in size. On amd64 it's 128 bytes. This * is sized in concert with UBC_WINSIZE, otherwise excessive shootdown * interrupts could be issued. */ #define TP_MAXVA 16 /* for individual mappings */ #define TP_ALLVA PAGE_MASK /* special: shoot all mappings */ typedef struct { uintptr_t tp_store[TP_MAXVA]; } pmap_tlb_packet_t; #define TP_COUNT 0 #define TP_USERPMAP 1 #define TP_GLOBAL 2 #define TP_DONE 3 #define TP_GET_COUNT(tp) ((tp)->tp_store[TP_COUNT] & PAGE_MASK) #define TP_GET_USERPMAP(tp) ((tp)->tp_store[TP_USERPMAP] & 1) #define TP_GET_GLOBAL(tp) ((tp)->tp_store[TP_GLOBAL] & 1) #define TP_GET_DONE(tp) (atomic_load_relaxed(&(tp)->tp_store[TP_DONE]) & 1) #define TP_GET_VA(tp, i) ((tp)->tp_store[(i)] & ~PAGE_MASK) #define TP_INC_COUNT(tp) ((tp)->tp_store[TP_COUNT]++) #define TP_SET_ALLVA(tp) ((tp)->tp_store[TP_COUNT] |= TP_ALLVA) #define TP_SET_VA(tp, c, va) ((tp)->tp_store[(c)] |= ((va) & ~PAGE_MASK)) #define TP_SET_USERPMAP(tp) ((tp)->tp_store[TP_USERPMAP] |= 1) #define TP_SET_GLOBAL(tp) ((tp)->tp_store[TP_GLOBAL] |= 1) #define TP_SET_DONE(tp) \ do { \ uintptr_t v = atomic_load_relaxed(&(tp)->tp_store[TP_DONE]); \ atomic_store_relaxed(&(tp)->tp_store[TP_DONE], v | 1); \ } while (/* CONSTCOND */ 0); #define TP_CLEAR(tp) memset(__UNVOLATILE(tp), 0, sizeof(*(tp))); /* * TLB shootdown state. */ static volatile pmap_tlb_packet_t *volatile pmap_tlb_packet __cacheline_aligned; static volatile u_int pmap_tlb_pendcount __cacheline_aligned; static struct evcnt pmap_tlb_evcnt __cacheline_aligned; /* * TLB shootdown statistics. 
*/ #ifdef TLBSTATS static struct evcnt tlbstat_local[TLBSHOOT__MAX]; static struct evcnt tlbstat_remote[TLBSHOOT__MAX]; static struct evcnt tlbstat_kernel[TLBSHOOT__MAX]; static struct evcnt tlbstat_single_req; static struct evcnt tlbstat_single_issue; static const char * tlbstat_name[ ] = { "REMOVE_ALL", "KENTER", "KREMOVE", "FREE_PTP", "REMOVE_PTE", "SYNC_PV", "WRITE_PROTECT", "ENTER", "NVMM", "BUS_DMA", "BUS_SPACE", }; #endif void pmap_tlb_init(void) { evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR, NULL, "TLB", "shootdown"); #ifdef TLBSTATS int i; for (i = 0; i < TLBSHOOT__MAX; i++) { evcnt_attach_dynamic(&tlbstat_local[i], EVCNT_TYPE_MISC, NULL, "tlbshoot local", tlbstat_name[i]); } for (i = 0; i < TLBSHOOT__MAX; i++) { evcnt_attach_dynamic(&tlbstat_remote[i], EVCNT_TYPE_MISC, NULL, "tlbshoot remote", tlbstat_name[i]); } for (i = 0; i < TLBSHOOT__MAX; i++) { evcnt_attach_dynamic(&tlbstat_kernel[i], EVCNT_TYPE_MISC, NULL, "tlbshoot kernel", tlbstat_name[i]); } evcnt_attach_dynamic(&tlbstat_single_req, EVCNT_TYPE_MISC, NULL, "tlbshoot single page", "requests"); evcnt_attach_dynamic(&tlbstat_single_issue, EVCNT_TYPE_MISC, NULL, "tlbshoot single page", "issues"); #endif } void pmap_tlb_cpu_init(struct cpu_info *ci) { pmap_tlb_packet_t *tp = (pmap_tlb_packet_t *)ci->ci_pmap_data; memset(tp, 0, sizeof(pmap_tlb_packet_t)); kcpuset_create(&ci->ci_tlb_cpuset, true); } static inline void pmap_tlbstat_count(struct pmap *pm, vaddr_t va, tlbwhy_t why) { #ifdef TLBSTATS const cpuid_t cid = cpu_index(curcpu()); bool local = false, remote = false; if (va != (vaddr_t)-1LL) { atomic_inc_64(&tlbstat_single_req.ev_count); } if (pm == pmap_kernel()) { atomic_inc_64(&tlbstat_kernel[why].ev_count); return; } if (va >= VM_MAXUSER_ADDRESS) { remote = kcpuset_isotherset(pm->pm_kernel_cpus, cid); local = kcpuset_isset(pm->pm_kernel_cpus, cid); } remote |= kcpuset_isotherset(pm->pm_cpus, cid); local |= kcpuset_isset(pm->pm_cpus, cid); if (local) { atomic_inc_64(&tlbstat_local[why].ev_count); } if (remote) { atomic_inc_64(&tlbstat_remote[why].ev_count); } #endif } static inline void pmap_tlb_invalidate(volatile pmap_tlb_packet_t *tp) { int i = TP_GET_COUNT(tp); /* Find out what we need to invalidate. */ if (i == TP_ALLVA) { if (TP_GET_GLOBAL(tp) != 0) { /* Invalidating all TLB entries. */ tlbflushg(); } else { /* Invalidating non-global TLB entries only. */ tlbflush(); } } else { /* Invalidating a single page or a range of pages. */ KASSERT(i != 0); do { --i; pmap_update_pg(TP_GET_VA(tp, i)); } while (i > 0); } } /* * pmap_tlb_shootdown: invalidate a page on all CPUs using pmap 'pm'. */ void pmap_tlb_shootdown(struct pmap *pm, vaddr_t va, pt_entry_t pte, tlbwhy_t why) { pmap_tlb_packet_t *tp; struct cpu_info *ci; uint8_t count; int s; #ifndef XENPV KASSERT((pte & PTE_G) == 0 || pm == pmap_kernel()); #endif if (__predict_false(pm->pm_tlb_flush != NULL)) { (*pm->pm_tlb_flush)(pm); return; } if ((pte & PTE_PS) != 0) { va &= PTE_LGFRAME; } /* * Add the shootdown operation to our pending set. */ s = splvm(); ci = curcpu(); tp = (pmap_tlb_packet_t *)ci->ci_pmap_data; /* Whole address flush will be needed if PTE_G is set. */ if ((pte & PTE_G) != 0) { TP_SET_GLOBAL(tp); } count = TP_GET_COUNT(tp); if (count < TP_MAXVA && va != (vaddr_t)-1LL) { /* Flush a single page. */ TP_SET_VA(tp, count, va); TP_INC_COUNT(tp); } else { /* Flush everything - may already be set. 
*/ TP_SET_ALLVA(tp); } if (pm != pmap_kernel()) { kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_cpus); if (va >= VM_MAXUSER_ADDRESS) { kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_kernel_cpus); } TP_SET_USERPMAP(tp); } else { kcpuset_copy(ci->ci_tlb_cpuset, kcpuset_running); } pmap_tlbstat_count(pm, va, why); splx(s); } #ifdef XENPV static inline void pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target) { #ifdef MULTIPROCESSOR int i = TP_GET_COUNT(tp); if (i != TP_ALLVA) { /* Invalidating a single page or a range of pages. */ KASSERT(i != 0); do { --i; xen_mcast_invlpg(TP_GET_VA(tp, i), target); } while (i > 0); } else { xen_mcast_tlbflush(target); } /* Remote CPUs have been synchronously flushed. */ pmap_tlb_pendcount = 0; pmap_tlb_packet = NULL; TP_SET_DONE(tp); #endif /* MULTIPROCESSOR */ } #else static inline void pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target) { #ifdef MULTIPROCESSOR int err = 0; if (!kcpuset_match(target, kcpuset_attached)) { const struct cpu_info * const self = curcpu(); CPU_INFO_ITERATOR cii; struct cpu_info *lci; for (CPU_INFO_FOREACH(cii, lci)) { const cpuid_t lcid = cpu_index(lci); if (__predict_false(lci == self) || !kcpuset_isset(target, lcid)) { continue; } err |= x86_ipi(LAPIC_TLB_VECTOR, lci->ci_cpuid, LAPIC_DLMODE_FIXED); } } else { err = x86_ipi(LAPIC_TLB_VECTOR, LAPIC_DEST_ALLEXCL, LAPIC_DLMODE_FIXED); } KASSERT(err == 0); #endif /* MULTIPROCESSOR */ } #endif /* XENPV */ /* * pmap_tlb_shootnow: process pending TLB shootdowns queued on current CPU. * * => Must be called with preemption disabled. */ void pmap_tlb_shootnow(void) { volatile pmap_tlb_packet_t *tp, *ts; volatile uint8_t stackbuf[sizeof(*tp) + COHERENCY_UNIT]; struct cpu_info *ci; kcpuset_t *target; u_int local, rcpucount; cpuid_t cid; int s; KASSERT(kpreempt_disabled()); /* Pre-check first. */ ci = curcpu(); tp = (pmap_tlb_packet_t *)ci->ci_pmap_data; if (TP_GET_COUNT(tp) == 0) { return; } /* An interrupt may have flushed our updates, so check again. */ s = splvm(); if (TP_GET_COUNT(tp) == 0) { splx(s); return; } cid = cpu_index(ci); target = ci->ci_tlb_cpuset; local = kcpuset_isset(target, cid) ? 1 : 0; rcpucount = kcpuset_countset(target) - local; /* * Fast path for local shootdowns only. Do the shootdowns, and * clear out the buffer for the next user. */ if (rcpucount == 0) { pmap_tlb_invalidate(tp); kcpuset_zero(ci->ci_tlb_cpuset); TP_CLEAR(tp); splx(s); return; } /* * Copy the packet into the stack buffer, and gain ownership of the * global pointer. We must keep interrupts blocked once we own the * pointer and until the IPIs are triggered, or we could deadlock * against an interrupt on the current CPU trying the same. */ KASSERT(rcpucount < ncpu); ts = (void *)roundup2((uintptr_t)stackbuf, COHERENCY_UNIT); *ts = *tp; KASSERT(TP_GET_DONE(ts) == 0); while (atomic_cas_ptr(&pmap_tlb_packet, NULL, __UNVOLATILE(ts)) != NULL) { KASSERT(atomic_load_relaxed(&pmap_tlb_packet) != ts); /* * Don't bother with exponentional backoff, as the pointer * is in a dedicated cache line and only updated twice per * IPI (in contrast to the pending counter). The cache * line will spend most of its time in the SHARED state. */ splx(s); do { x86_pause(); } while (atomic_load_relaxed(&pmap_tlb_packet) != NULL); s = splvm(); /* * An interrupt might have done the shootdowns for * us while we spun. */ if (TP_GET_COUNT(tp) == 0) { splx(s); return; } } /* * Ownership of the global pointer provides serialization of the * update to the count and the event counter. 
With those values * updated, start shootdowns on remote CPUs. */ pmap_tlb_pendcount = rcpucount; pmap_tlb_evcnt.ev_count++; pmap_tlb_processpacket(ts, target); /* * Clear out the local CPU's buffer for the next user. Once done, * we can drop the IPL. */ #ifdef TLBSTATS if (TP_GET_COUNT(tp) != TP_ALLVA) { atomic_add_64(&tlbstat_single_issue.ev_count, TP_GET_COUNT(tp)); } #endif kcpuset_zero(ci->ci_tlb_cpuset); TP_CLEAR(tp); splx(s); /* * Shootdowns on remote CPUs are now in flight. In the meantime, * perform local shootdown if needed, using our copy of the packet. */ if (local) { pmap_tlb_invalidate(ts); } /* * Wait for the updates to be processed by remote CPUs. Poll the * flag in the packet in order to limit bus traffic (only the last * CPU out will update it and only we are reading it). No memory * barrier required due to prior stores - yay x86. */ while (TP_GET_DONE(ts) == 0) { x86_pause(); } } /* * pmap_tlb_intr: pmap shootdown interrupt handler to invalidate TLB entries. * * Called from IPI only. We are outside the SPL framework, with interrupts * disabled on the CPU: be careful. * * TLB flush and the interrupt that brought us here are serializing * operations (they defeat speculative execution). Any speculative load * producing a TLB fill between receipt of the interrupt and the TLB flush * will load "current" PTEs. None of the mappings relied on by this ISR for * its execution will be changing. So it's safe to acknowledge the request * and allow the initiator to proceed before performing the flush. */ void pmap_tlb_intr(void) { pmap_tlb_packet_t copy; volatile pmap_tlb_packet_t *source; struct cpu_info *ci; /* Make a private copy of the packet. */ source = pmap_tlb_packet; copy = *source; /* * If we are the last CPU out, clear the active pointer and mark the * packet as done. Both can be done without using an atomic, and * the one atomic we do use serves as our memory barrier. * * It's important to clear the active pointer before setting * TP_DONE, to ensure a remote CPU does not exit & re-enter * pmap_tlb_shootnow() only to find its current pointer still * seemingly active. */ if (atomic_dec_uint_nv(&pmap_tlb_pendcount) == 0) { atomic_store_relaxed(&pmap_tlb_packet, NULL); __insn_barrier(); TP_SET_DONE(source); } pmap_tlb_invalidate(&copy); /* * Check the current TLB state. If we don't want further flushes * for this pmap, then take the CPU out of the pmap's set. The * order of updates to the set and TLB state must closely align with * the pmap code, as we can interrupt code running in the pmap * module. */ ci = curcpu(); if (ci->ci_tlbstate == TLBSTATE_LAZY && TP_GET_USERPMAP(&copy) != 0) { kcpuset_atomic_clear(ci->ci_pmap->pm_cpus, cpu_index(ci)); ci->ci_tlbstate = TLBSTATE_STALE; } }
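/*
 * Illustrative sketch, not part of the original file: the packing
 * trick described in the pmap_tlb_packet_t comment above, shown as a
 * standalone user-space program.  Page-aligned virtual addresses leave
 * their low 12 bits free, so a small counter can share a word with the
 * first address.  The pack_t/PK_* names and the example addresses are
 * invented and do not match the kernel structure exactly.
 */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_MASK	((uintptr_t)0xfff)
#define EX_MAXVA	4

typedef struct { uintptr_t pk_store[EX_MAXVA]; } pack_t;

#define PK_GET_COUNT(p)		((p)->pk_store[0] & EX_PAGE_MASK)
#define PK_INC_COUNT(p)		((p)->pk_store[0]++)
#define PK_GET_VA(p, i)		((p)->pk_store[(i)] & ~EX_PAGE_MASK)
#define PK_SET_VA(p, i, va)	((p)->pk_store[(i)] |= ((va) & ~EX_PAGE_MASK))

int
main(void)
{
	pack_t p = { { 0 } };
	const uintptr_t va[] = { 0x10000, 0x42000, 0x7f000 };	/* page aligned */

	for (unsigned i = 0; i < 3; i++) {
		unsigned c = (unsigned)PK_GET_COUNT(&p);

		PK_SET_VA(&p, c, va[i]);
		/*
		 * The increment touches the same word as the first VA,
		 * but never carries into the address bits while the
		 * count stays below the page size.
		 */
		PK_INC_COUNT(&p);
	}
	assert(PK_GET_COUNT(&p) == 3);
	for (unsigned i = 0; i < (unsigned)PK_GET_COUNT(&p); i++)
		printf("entry %u: 0x%" PRIxPTR "\n", i, PK_GET_VA(&p, i));
	return 0;
}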
241 163 102 155 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 /* $NetBSD: kern_malloc.c,v 1.158 2019/11/14 16:23:52 maxv Exp $ */ /* * Copyright (c) 1987, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_malloc.c 8.4 (Berkeley) 5/20/95 */ /* * Copyright (c) 1996 Christopher G. Demetriou. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_malloc.c 8.4 (Berkeley) 5/20/95 */ /* * Wrapper interface for obsolete malloc(9). */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_malloc.c,v 1.158 2019/11/14 16:23:52 maxv Exp $"); #include <sys/param.h> #include <sys/malloc.h> #include <sys/kmem.h> #include <sys/asan.h> #include <sys/msan.h> /* * Built-in malloc types. Note: ought to be removed. */ MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory"); MALLOC_DEFINE(M_DMAMAP, "DMA map", "bus_dma(9) structures"); MALLOC_DEFINE(M_FREE, "free", "should be on free list"); MALLOC_DEFINE(M_TEMP, "temp", "misc. temporary data buffers"); MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); MALLOC_DEFINE(M_FTABLE, "fragtbl", "fragment reassembly header"); MALLOC_DEFINE(M_UFSMNT, "UFS mount", "UFS mount structure"); MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); MALLOC_DEFINE(M_MRTABLE, "mrt", "multicast routing tables"); /* * Header contains total size, including the header itself. */ struct malloc_header { size_t mh_size; #ifdef KASAN size_t mh_rqsz; #endif } __aligned(ALIGNBYTES + 1); void * kern_malloc(unsigned long reqsize, int flags) { const int kmflags = (flags & M_NOWAIT) ? 
KM_NOSLEEP : KM_SLEEP; #ifdef KASAN const size_t origsize = reqsize; #endif size_t size = reqsize; size_t allocsize, hdroffset; struct malloc_header *mh; void *p; kasan_add_redzone(&size); if (size >= PAGE_SIZE) { if (size > (ULONG_MAX-PAGE_SIZE)) allocsize = ULONG_MAX; /* this will fail later */ else allocsize = PAGE_SIZE + size; /* for page alignment */ hdroffset = PAGE_SIZE - sizeof(struct malloc_header); } else { allocsize = sizeof(struct malloc_header) + size; hdroffset = 0; } p = kmem_intr_alloc(allocsize, kmflags); if (p == NULL) return NULL; kmsan_mark(p, allocsize, KMSAN_STATE_UNINIT); kmsan_orig(p, allocsize, KMSAN_TYPE_MALLOC, __RET_ADDR); if ((flags & M_ZERO) != 0) { memset(p, 0, allocsize); } mh = (void *)((char *)p + hdroffset); mh->mh_size = allocsize - hdroffset; #ifdef KASAN mh->mh_rqsz = origsize; #endif mh++; kasan_mark(mh, origsize, size, KASAN_MALLOC_REDZONE); return mh; } void kern_free(void *addr) { struct malloc_header *mh; mh = addr; mh--; kasan_mark(addr, mh->mh_size - sizeof(struct malloc_header), mh->mh_size - sizeof(struct malloc_header), KASAN_MALLOC_REDZONE); if (mh->mh_size >= PAGE_SIZE + sizeof(struct malloc_header)) { kmsan_mark((char *)addr - PAGE_SIZE, mh->mh_size + PAGE_SIZE - sizeof(struct malloc_header), KMSAN_STATE_INITED); kmem_intr_free((char *)addr - PAGE_SIZE, mh->mh_size + PAGE_SIZE - sizeof(struct malloc_header)); } else { kmsan_mark(mh, mh->mh_size, KMSAN_STATE_INITED); kmem_intr_free(mh, mh->mh_size); } } void * kern_realloc(void *curaddr, unsigned long newsize, int flags) { struct malloc_header *mh; unsigned long cursize; void *newaddr; /* * realloc() with a NULL pointer is the same as malloc(). */ if (curaddr == NULL) return malloc(newsize, ksp, flags); /* * realloc() with zero size is the same as free(). */ if (newsize == 0) { free(curaddr, ksp); return NULL; } if ((flags & M_NOWAIT) == 0) { ASSERT_SLEEPABLE(); } mh = curaddr; mh--; #ifdef KASAN cursize = mh->mh_rqsz; #else cursize = mh->mh_size - sizeof(struct malloc_header); #endif /* * If we already actually have as much as they want, we're done. */ if (newsize <= cursize) return curaddr; /* * Can't satisfy the allocation with the existing block. * Allocate a new one and copy the data. */ newaddr = malloc(newsize, ksp, flags); if (__predict_false(newaddr == NULL)) { /* * malloc() failed, because flags included M_NOWAIT. * Return NULL to indicate that failure. The old * pointer is still valid. */ return NULL; } memcpy(newaddr, curaddr, cursize); /* * We were successful: free the old allocation and return * the new one. */ free(curaddr, ksp); return newaddr; }
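/*
 * Illustrative sketch, not part of the original file: the size-header
 * technique used by kern_malloc()/kern_free() above, reduced to user
 * space.  A header recording the total allocation size is placed just
 * before the pointer handed back to the caller, so the matching free
 * routine can recover the size on its own.  The xmalloc/xfree names
 * are invented for the example; the kernel version additionally
 * page-aligns large requests and carries KASAN/KMSAN bookkeeping.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct hdr {
	size_t h_size;		/* total size, header included */
};

static void *
xmalloc(size_t reqsize)
{
	struct hdr *h = malloc(sizeof(*h) + reqsize);

	if (h == NULL)
		return NULL;
	h->h_size = sizeof(*h) + reqsize;
	return h + 1;		/* caller sees only the bytes after the header */
}

static void
xfree(void *p)
{
	struct hdr *h = (struct hdr *)p - 1;

	/* h->h_size is known here, e.g. for poisoning or statistics. */
	memset(p, 0, h->h_size - sizeof(*h));
	free(h);
}

int
main(void)
{
	char *p = xmalloc(32);

	assert(p != NULL);
	snprintf(p, 32, "hello");
	printf("%s\n", p);
	xfree(p);
	return 0;
}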
/* $NetBSD: sysctl.h,v 1.239 2024/01/20 13:15:46 christos Exp $ */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sysctl.h 8.1 (Berkeley) 6/2/93 */ #ifndef _SYS_SYSCTL_H_ #define _SYS_SYSCTL_H_ #include <sys/param.h> /* precautionary upon removal from ucred.h */ #include <sys/proc.h> /* Needed for things like P_ZOMBIE() and LW_SINTR */ #include <uvm/uvm_param.h> #if defined(_KERNEL) || defined(_KMEMUSER) /* * These are for the eproc structure defined below. */ #include <sys/time.h> #include <sys/ucred.h> #include <sys/ucontext.h> #include <sys/mallocvar.h> #include <uvm/uvm_extern.h> #endif /* For offsetof() */ #if defined(_KERNEL) || defined(_STANDALONE) #include <sys/systm.h> #else #include <stddef.h> #include <stdbool.h> #endif /* * Definitions for sysctl call. The sysctl call uses a hierarchical name * for objects that can be examined or modified. The name is expressed as * a sequence of integers. Like a file path name, the meaning of each * component depends on its place in the hierarchy. The top-level and kern * identifiers are defined here, and other identifiers are defined in the * respective subsystem header files. */ struct sysctlnode; #define CTL_MAXNAME 12 /* largest number of components supported */ #define SYSCTL_NAMELEN 32 /* longest name allowed for a node */ #define CREATE_BASE (1024) /* start of dynamic mib allocation */ #define SYSCTL_DEFSIZE 8 /* initial size of a child set */ /* * Each subsystem defined by sysctl defines a list of variables * for that subsystem. Each name is either a node with further * levels defined below it, or it is a leaf of some particular * type given below. Each sysctl level defines a set of name/type * pairs to be used by sysctl(1) in manipulating the subsystem. 
*/ struct ctlname { const char *ctl_name; /* subsystem name */ int ctl_type; /* type of name */ }; #define CTLTYPE_NODE 1 /* name is a node */ #define CTLTYPE_INT 2 /* name describes an integer */ #define CTLTYPE_STRING 3 /* name describes a string */ #define CTLTYPE_QUAD 4 /* name describes a 64-bit number */ #define CTLTYPE_STRUCT 5 /* name describes a structure */ #define CTLTYPE_BOOL 6 /* name describes a bool */ #ifdef _LP64 #define CTLTYPE_LONG CTLTYPE_QUAD #else #define CTLTYPE_LONG CTLTYPE_INT #endif /* * Flags that apply to each node, governing access and other features */ #define CTLFLAG_READONLY 0x00000000 /* #define CTLFLAG_UNUSED1 0x00000010 */ /* #define CTLFLAG_UNUSED2 0x00000020 */ /* #define CTLFLAG_READ* 0x00000040 */ #define CTLFLAG_READWRITE 0x00000070 #define CTLFLAG_ANYWRITE 0x00000080 #define CTLFLAG_PRIVATE 0x00000100 #define CTLFLAG_PERMANENT 0x00000200 #define CTLFLAG_OWNDATA 0x00000400 #define CTLFLAG_IMMEDIATE 0x00000800 #define CTLFLAG_HEX 0x00001000 #define CTLFLAG_ROOT 0x00002000 #define CTLFLAG_ANYNUMBER 0x00004000 #define CTLFLAG_HIDDEN 0x00008000 #define CTLFLAG_ALIAS 0x00010000 #define CTLFLAG_MMAP 0x00020000 #define CTLFLAG_OWNDESC 0x00040000 #define CTLFLAG_UNSIGNED 0x00080000 /* * sysctl API version */ #define SYSCTL_VERS_MASK 0xff000000 #define SYSCTL_VERS_0 0x00000000 #define SYSCTL_VERS_1 0x01000000 #define SYSCTL_VERSION SYSCTL_VERS_1 #define SYSCTL_VERS(f) ((f) & SYSCTL_VERS_MASK) /* * Flags that can be set by a create request from user-space */ #define SYSCTL_USERFLAGS (CTLFLAG_READWRITE|\ CTLFLAG_ANYWRITE|\ CTLFLAG_PRIVATE|\ CTLFLAG_OWNDATA|\ CTLFLAG_IMMEDIATE|\ CTLFLAG_HEX|\ CTLFLAG_HIDDEN) /* * Accessor macros */ #define SYSCTL_TYPEMASK 0x0000000f #define SYSCTL_TYPE(x) ((x) & SYSCTL_TYPEMASK) #define SYSCTL_FLAGMASK 0x00fffff0 #define SYSCTL_FLAGS(x) ((x) & SYSCTL_FLAGMASK) /* * Meta-identifiers */ #define CTL_EOL (-1) /* end of createv/destroyv list */ #define CTL_QUERY (-2) /* enumerates children of a node */ #define CTL_CREATE (-3) /* node create request */ #define CTL_CREATESYM (-4) /* node create request with symbol */ #define CTL_DESTROY (-5) /* node destroy request */ #define CTL_MMAP (-6) /* mmap request */ #define CTL_DESCRIBE (-7) /* get node descriptions */ /* * Top-level identifiers */ #define CTL_UNSPEC 0 /* unused */ #define CTL_KERN 1 /* "high kernel": proc, limits */ #define CTL_VM 2 /* virtual memory */ #define CTL_VFS 3 /* file system, mount type is next */ #define CTL_NET 4 /* network, see socket.h */ #define CTL_DEBUG 5 /* debugging parameters */ #define CTL_HW 6 /* generic CPU/io */ #define CTL_MACHDEP 7 /* machine dependent */ #define CTL_USER 8 /* user-level */ #define CTL_DDB 9 /* in-kernel debugger */ #define CTL_PROC 10 /* per-proc attr */ #define CTL_VENDOR 11 /* vendor-specific data */ #define CTL_EMUL 12 /* emulation-specific data */ #define CTL_SECURITY 13 /* security */ /* * The "vendor" toplevel name is to be used by vendors who wish to * have their own private MIB tree. 
If you do that, please use * vendor.<yourname>.* */ /* * CTL_KERN identifiers */ #define KERN_OSTYPE 1 /* string: system version */ #define KERN_OSRELEASE 2 /* string: system release */ #define KERN_OSREV 3 /* int: system revision */ #define KERN_VERSION 4 /* string: compile time info */ #define KERN_MAXVNODES 5 /* int: max vnodes */ #define KERN_MAXPROC 6 /* int: max processes */ #define KERN_MAXFILES 7 /* int: max open files */ #define KERN_ARGMAX 8 /* int: max arguments to exec */ #define KERN_SECURELVL 9 /* int: system security level */ #define KERN_HOSTNAME 10 /* string: hostname */ #define KERN_HOSTID 11 /* int: host identifier */ #define KERN_CLOCKRATE 12 /* struct: struct clockinfo */ #define KERN_VNODE 13 /* struct: vnode structures */ #define KERN_PROC 14 /* struct: process entries */ #define KERN_FILE 15 /* struct: file entries */ #define KERN_PROF 16 /* node: kernel profiling info */ #define KERN_POSIX1 17 /* int: POSIX.1 version */ #define KERN_NGROUPS 18 /* int: # of supplemental group ids */ #define KERN_JOB_CONTROL 19 /* int: is job control available */ #define KERN_SAVED_IDS 20 /* int: saved set-user/group-ID */ #define KERN_OBOOTTIME 21 /* struct: time kernel was booted */ #define KERN_DOMAINNAME 22 /* string: (YP) domainname */ #define KERN_MAXPARTITIONS 23 /* int: number of partitions/disk */ #define KERN_RAWPARTITION 24 /* int: raw partition number */ #define KERN_NTPTIME 25 /* struct: extended-precision time */ #define KERN_TIMEX 26 /* struct: ntp timekeeping state */ #define KERN_AUTONICETIME 27 /* int: proc time before autonice */ #define KERN_AUTONICEVAL 28 /* int: auto nice value */ #define KERN_RTC_OFFSET 29 /* int: offset of rtc from gmt */ #define KERN_ROOT_DEVICE 30 /* string: root device */ #define KERN_MSGBUFSIZE 31 /* int: max # of chars in msg buffer */ #define KERN_FSYNC 32 /* int: file synchronization support */ #define KERN_OLDSYSVMSG 33 /* old: SysV message queue support */ #define KERN_OLDSYSVSEM 34 /* old: SysV semaphore support */ #define KERN_OLDSYSVSHM 35 /* old: SysV shared memory support */ #define KERN_OLDSHORTCORENAME 36 /* old, unimplemented */ #define KERN_SYNCHRONIZED_IO 37 /* int: POSIX synchronized I/O */ #define KERN_IOV_MAX 38 /* int: max iovec's for readv(2) etc. 
*/ #define KERN_MBUF 39 /* node: mbuf parameters */ #define KERN_MAPPED_FILES 40 /* int: POSIX memory mapped files */ #define KERN_MEMLOCK 41 /* int: POSIX memory locking */ #define KERN_MEMLOCK_RANGE 42 /* int: POSIX memory range locking */ #define KERN_MEMORY_PROTECTION 43 /* int: POSIX memory protections */ #define KERN_LOGIN_NAME_MAX 44 /* int: max length login name + NUL */ #define KERN_DEFCORENAME 45 /* old: sort core name format */ #define KERN_LOGSIGEXIT 46 /* int: log signaled processes */ #define KERN_PROC2 47 /* struct: process entries */ #define KERN_PROC_ARGS 48 /* struct: process argv/env */ #define KERN_FSCALE 49 /* int: fixpt FSCALE */ #define KERN_CCPU 50 /* old: fixpt ccpu */ #define KERN_CP_TIME 51 /* struct: CPU time counters */ #define KERN_OLDSYSVIPC_INFO 52 /* old: number of valid kern ids */ #define KERN_MSGBUF 53 /* kernel message buffer */ #define KERN_CONSDEV 54 /* dev_t: console terminal device */ #define KERN_MAXPTYS 55 /* int: maximum number of ptys */ #define KERN_PIPE 56 /* node: pipe limits */ #define KERN_MAXPHYS 57 /* int: kernel value of MAXPHYS */ #define KERN_SBMAX 58 /* int: max socket buffer size */ #define KERN_TKSTAT 59 /* tty in/out counters */ #define KERN_MONOTONIC_CLOCK 60 /* int: POSIX monotonic clock */ #define KERN_URND 61 /* int: random integer from urandom */ #define KERN_LABELSECTOR 62 /* int: disklabel sector */ #define KERN_LABELOFFSET 63 /* int: offset of label within sector */ #define KERN_LWP 64 /* struct: lwp entries */ #define KERN_FORKFSLEEP 65 /* int: sleep length on failed fork */ #define KERN_POSIX_THREADS 66 /* int: POSIX Threads option */ #define KERN_POSIX_SEMAPHORES 67 /* int: POSIX Semaphores option */ #define KERN_POSIX_BARRIERS 68 /* int: POSIX Barriers option */ #define KERN_POSIX_TIMERS 69 /* int: POSIX Timers option */ #define KERN_POSIX_SPIN_LOCKS 70 /* int: POSIX Spin Locks option */ #define KERN_POSIX_READER_WRITER_LOCKS 71 /* int: POSIX R/W Locks option */ #define KERN_DUMP_ON_PANIC 72 /* int: dump on panic */ #define KERN_SOMAXKVA 73 /* int: max socket kernel virtual mem */ #define KERN_ROOT_PARTITION 74 /* int: root partition */ #define KERN_DRIVERS 75 /* struct: driver names and majors #s */ #define KERN_BUF 76 /* struct: buffers */ #define KERN_FILE2 77 /* struct: file entries */ #define KERN_VERIEXEC 78 /* node: verified exec */ #define KERN_CP_ID 79 /* struct: cpu id numbers */ #define KERN_HARDCLOCK_TICKS 80 /* int: number of hardclock ticks */ #define KERN_ARND 81 /* void *buf, size_t siz random */ #define KERN_SYSVIPC 82 /* node: SysV IPC parameters */ #define KERN_BOOTTIME 83 /* struct: time kernel was booted */ #define KERN_EVCNT 84 /* struct: evcnts */ #define KERN_SOFIXEDBUF 85 /* bool: fixed socket buffer sizes */ /* * KERN_CLOCKRATE structure */ struct clockinfo { int hz; /* clock frequency */ int tick; /* micro-seconds per hz tick */ int tickadj; /* clock skew rate for adjtime() */ int stathz; /* statistics clock frequency */ int profhz; /* profiling clock frequency */ }; /* * KERN_PROC subtypes */ #define KERN_PROC_ALL 0 /* everything */ #define KERN_PROC_PID 1 /* by process id */ #define KERN_PROC_PGRP 2 /* by process group id */ #define KERN_PROC_SESSION 3 /* by session of pid */ #define KERN_PROC_TTY 4 /* by controlling tty */ #define KERN_PROC_UID 5 /* by effective uid */ #define KERN_PROC_RUID 6 /* by real uid */ #define KERN_PROC_GID 7 /* by effective gid */ #define KERN_PROC_RGID 8 /* by real gid */ /* * KERN_PROC_TTY sub-subtypes */ #define KERN_PROC_TTY_NODEV NODEV /* no controlling tty 
*/ #define KERN_PROC_TTY_REVOKE ((dev_t)-2) /* revoked tty */ struct ki_pcred { void *p_pad; uid_t p_ruid; /* Real user id */ uid_t p_svuid; /* Saved effective user id */ gid_t p_rgid; /* Real group id */ gid_t p_svgid; /* Saved effective group id */ int p_refcnt; /* Number of references */ }; struct ki_ucred { uint32_t cr_ref; /* reference count */ uid_t cr_uid; /* effective user id */ gid_t cr_gid; /* effective group id */ uint32_t cr_ngroups; /* number of groups */ gid_t cr_groups[NGROUPS]; /* groups */ }; #if defined(_KERNEL) || defined(_KMEMUSER) struct eproc { struct proc *e_paddr; /* address of proc */ struct session *e_sess; /* session pointer */ struct ki_pcred e_pcred; /* process credentials */ struct ki_ucred e_ucred; /* current credentials */ struct vmspace e_vm; /* address space */ pid_t e_ppid; /* parent process id */ pid_t e_pgid; /* process group id */ short e_jobc; /* job control counter */ uint32_t e_tdev; /* XXX: controlling tty dev */ pid_t e_tpgid; /* tty process group id */ struct session *e_tsess; /* tty session pointer */ #define WMESGLEN 8 char e_wmesg[WMESGLEN]; /* wchan message */ segsz_t e_xsize; /* text size */ short e_xrssize; /* text rss */ short e_xccount; /* text references */ short e_xswrss; long e_flag; /* see p_eflag below */ char e_login[MAXLOGNAME]; /* setlogin() name */ pid_t e_sid; /* session id */ long e_spare[3]; }; /* * KERN_PROC subtype ops return arrays of augmented proc structures: */ struct kinfo_proc { struct proc kp_proc; /* proc structure */ struct eproc kp_eproc; /* eproc structure */ }; #endif /* defined(_KERNEL) || defined(_KMEMUSER) */ /* * Convert pointer to 64 bit unsigned integer for struct * kinfo_proc2, etc. */ #define PTRTOUINT64(p) ((uint64_t)(uintptr_t)(p)) #define UINT64TOPTR(u) ((void *)(uintptr_t)(u)) /* * KERN_PROC2 subtype ops return arrays of relatively fixed size * structures of process info. Use 8 byte alignment, and new * elements should only be added to the end of this structure so * binary compatibility can be preserved. */ #define KI_NGROUPS 16 #define KI_MAXCOMLEN 24 /* extra for 8 byte alignment */ #define KI_WMESGLEN 8 #define KI_MAXLOGNAME 24 /* extra for 8 byte alignment */ #define KI_MAXEMULLEN 16 #define KI_LNAMELEN 20 /* extra 4 for alignment */ #define KI_NOCPU (~(uint64_t)0) typedef struct { uint32_t __bits[4]; } ki_sigset_t; struct kinfo_proc2 { uint64_t p_forw; /* PTR: linked run/sleep queue. */ uint64_t p_back; uint64_t p_paddr; /* PTR: address of proc */ uint64_t p_addr; /* PTR: Kernel virtual addr of u-area */ uint64_t p_fd; /* PTR: Ptr to open files structure. */ uint64_t p_cwdi; /* PTR: cdir/rdir/cmask info */ uint64_t p_stats; /* PTR: Accounting/statistics */ uint64_t p_limit; /* PTR: Process limits. */ uint64_t p_vmspace; /* PTR: Address space. */ uint64_t p_sigacts; /* PTR: Signal actions, state */ uint64_t p_sess; /* PTR: session pointer */ uint64_t p_tsess; /* PTR: tty session pointer */ uint64_t p_ru; /* PTR: Exit information. XXX */ int32_t p_eflag; /* LONG: extra kinfo_proc2 flags */ #define EPROC_CTTY 0x01 /* controlling tty vnode active */ #define EPROC_SLEADER 0x02 /* session leader */ int32_t p_exitsig; /* INT: signal to sent to parent on exit */ int32_t p_flag; /* INT: P_* flags. */ int32_t p_pid; /* PID_T: Process identifier. 
*/ int32_t p_ppid; /* PID_T: Parent process id */ int32_t p_sid; /* PID_T: session id */ int32_t p__pgid; /* PID_T: process group id */ /* XXX: <sys/proc.h> hijacks p_pgid */ int32_t p_tpgid; /* PID_T: tty process group id */ uint32_t p_uid; /* UID_T: effective user id */ uint32_t p_ruid; /* UID_T: real user id */ uint32_t p_gid; /* GID_T: effective group id */ uint32_t p_rgid; /* GID_T: real group id */ uint32_t p_groups[KI_NGROUPS]; /* GID_T: groups */ int16_t p_ngroups; /* SHORT: number of groups */ int16_t p_jobc; /* SHORT: job control counter */ uint32_t p_tdev; /* XXX: DEV_T: controlling tty dev */ uint32_t p_estcpu; /* U_INT: Time averaged value of p_cpticks. */ uint32_t p_rtime_sec; /* STRUCT TIMEVAL: Real time. */ uint32_t p_rtime_usec; /* STRUCT TIMEVAL: Real time. */ int32_t p_cpticks; /* INT: Ticks of CPU time. */ uint32_t p_pctcpu; /* FIXPT_T: %cpu for this process during p_swtime */ uint32_t p_swtime; /* U_INT: Time swapped in or out. */ uint32_t p_slptime; /* U_INT: Time since last blocked. */ int32_t p_schedflags; /* INT: PSCHED_* flags */ uint64_t p_uticks; /* U_QUAD_T: Statclock hits in user mode. */ uint64_t p_sticks; /* U_QUAD_T: Statclock hits in system mode. */ uint64_t p_iticks; /* U_QUAD_T: Statclock hits processing intr. */ uint64_t p_tracep; /* PTR: Trace to vnode or file */ int32_t p_traceflag; /* INT: Kernel trace points. */ int32_t p_holdcnt; /* INT: If non-zero, don't swap. */ ki_sigset_t p_siglist; /* SIGSET_T: Signals arrived but not delivered. */ ki_sigset_t p_sigmask; /* SIGSET_T: Current signal mask. */ ki_sigset_t p_sigignore; /* SIGSET_T: Signals being ignored. */ ki_sigset_t p_sigcatch; /* SIGSET_T: Signals being caught by user. */ int8_t p_stat; /* CHAR: S* process status (from LWP). */ uint8_t p_priority; /* U_CHAR: Process priority. */ uint8_t p_usrpri; /* U_CHAR: User-priority based on p_cpu and p_nice. */ uint8_t p_nice; /* U_CHAR: Process "nice" value. */ uint16_t p_xstat; /* U_SHORT: Exit status for wait; also stop signal. */ uint16_t p_acflag; /* U_SHORT: Accounting flags. */ char p_comm[KI_MAXCOMLEN]; char p_wmesg[KI_WMESGLEN]; /* wchan message */ uint64_t p_wchan; /* PTR: sleep address. */ char p_login[KI_MAXLOGNAME]; /* setlogin() name */ int32_t p_vm_rssize; /* SEGSZ_T: current resident set size in pages */ int32_t p_vm_tsize; /* SEGSZ_T: text size (pages) */ int32_t p_vm_dsize; /* SEGSZ_T: data size (pages) */ int32_t p_vm_ssize; /* SEGSZ_T: stack size (pages) */ int64_t p_uvalid; /* CHAR: following p_u* parameters are valid */ /* XXX 64 bits for alignment */ uint32_t p_ustart_sec; /* STRUCT TIMEVAL: starting time. */ uint32_t p_ustart_usec; /* STRUCT TIMEVAL: starting time. */ uint32_t p_uutime_sec; /* STRUCT TIMEVAL: user time. */ uint32_t p_uutime_usec; /* STRUCT TIMEVAL: user time. */ uint32_t p_ustime_sec; /* STRUCT TIMEVAL: system time. */ uint32_t p_ustime_usec; /* STRUCT TIMEVAL: system time. */ uint64_t p_uru_maxrss; /* LONG: max resident set size. */ uint64_t p_uru_ixrss; /* LONG: integral shared memory size. */ uint64_t p_uru_idrss; /* LONG: integral unshared data ". */ uint64_t p_uru_isrss; /* LONG: integral unshared stack ". */ uint64_t p_uru_minflt; /* LONG: page reclaims. */ uint64_t p_uru_majflt; /* LONG: page faults. */ uint64_t p_uru_nswap; /* LONG: swaps. */ uint64_t p_uru_inblock; /* LONG: block input operations. */ uint64_t p_uru_oublock; /* LONG: block output operations. */ uint64_t p_uru_msgsnd; /* LONG: messages sent. */ uint64_t p_uru_msgrcv; /* LONG: messages received. 
*/ uint64_t p_uru_nsignals; /* LONG: signals received. */ uint64_t p_uru_nvcsw; /* LONG: voluntary context switches. */ uint64_t p_uru_nivcsw; /* LONG: involuntary ". */ uint32_t p_uctime_sec; /* STRUCT TIMEVAL: child u+s time. */ uint32_t p_uctime_usec; /* STRUCT TIMEVAL: child u+s time. */ uint64_t p_cpuid; /* LONG: CPU id */ uint64_t p_realflag; /* INT: P_* flags (not including LWPs). */ uint64_t p_nlwps; /* LONG: Number of LWPs */ uint64_t p_nrlwps; /* LONG: Number of running LWPs */ uint64_t p_realstat; /* LONG: non-LWP process status */ uint32_t p_svuid; /* UID_T: saved user id */ uint32_t p_svgid; /* GID_T: saved group id */ char p_ename[KI_MAXEMULLEN]; /* emulation name */ int64_t p_vm_vsize; /* SEGSZ_T: total map size (pages) */ int64_t p_vm_msize; /* SEGSZ_T: stack-adjusted map size (pages) */ }; /* * Compat flags for kinfo_proc, kinfo_proc2. Not guaranteed to be stable. * Some of them used to be shared with LWP flags. * XXXAD Trim to the minimum necessary... */ #define P_ADVLOCK 0x00000001 #define P_CONTROLT 0x00000002 #define L_INMEM 0x00000004 #define P_INMEM /* 0x00000004 */ L_INMEM #define P_NOCLDSTOP 0x00000008 #define P_PPWAIT 0x00000010 #define P_PROFIL 0x00000020 #define L_SELECT 0x00000040 #define P_SELECT /* 0x00000040 */ L_SELECT #define L_SINTR 0x00000080 #define P_SINTR /* 0x00000080 */ L_SINTR #define P_SUGID 0x00000100 #define L_SYSTEM 0x00000200 #define P_SYSTEM /* 0x00000200 */ L_SYSTEM #define L_SA 0x00000400 #define P_SA /* 0x00000400 */ L_SA #define P_TRACED 0x00000800 #define P_WAITED 0x00001000 #define P_WEXIT 0x00002000 #define P_EXEC 0x00004000 #define P_OWEUPC 0x00008000 #define P_NOCLDWAIT 0x00020000 #define P_32 0x00040000 #define P_CLDSIGIGN 0x00080000 #define P_SYSTRACE 0x00200000 #define P_CHTRACED 0x00400000 #define P_STOPFORK 0x00800000 #define P_STOPEXEC 0x01000000 #define P_STOPEXIT 0x02000000 #define P_SYSCALL 0x04000000 /* * LWP compat flags. */ #define L_DETACHED 0x00800000 #define __SYSCTL_PROC_FLAG_BITS \ "\20" \ "\1ADVLOCK" \ "\2CONTROLT" \ "\3INMEM" \ "\4NOCLDSTOP" \ "\5PPWAIT" \ "\6PROFIL" \ "\7SELECT" \ "\10SINTR" \ "\11SUGID" \ "\12SYSTEM" \ "\13SA" \ "\14TRACED" \ "\15WAITED" \ "\16WEXIT" \ "\17EXEC" \ "\20OWEUPC" \ "\22NOCLDWAIT" \ "\23P32" \ "\24CLDSIGIGN" \ "\26SYSTRACE" \ "\27CHTRACED" \ "\30STOPFORK" \ "\31STOPEXEC" \ "\32STOPEXIT" \ "\33SYSCALL" /* * KERN_LWP structure. See notes on KERN_PROC2 about adding elements. */ struct kinfo_lwp { uint64_t l_forw; /* PTR: linked run/sleep queue. */ uint64_t l_back; uint64_t l_laddr; /* PTR: Address of LWP */ uint64_t l_addr; /* PTR: Kernel virtual addr of u-area */ int32_t l_lid; /* LWPID_T: LWP identifier */ int32_t l_flag; /* INT: L_* flags. */ uint32_t l_swtime; /* U_INT: Time swapped in or out. */ uint32_t l_slptime; /* U_INT: Time since last blocked. */ int32_t l_schedflags; /* INT: PSCHED_* flags */ int32_t l_holdcnt; /* INT: If non-zero, don't swap. */ uint8_t l_priority; /* U_CHAR: Process priority. */ uint8_t l_usrpri; /* U_CHAR: User-priority based on l_cpu and p_nice. */ int8_t l_stat; /* CHAR: S* process status. */ int8_t l_pad1; /* fill out to 4-byte boundary */ int32_t l_pad2; /* .. and then to an 8-byte boundary */ char l_wmesg[KI_WMESGLEN]; /* wchan message */ uint64_t l_wchan; /* PTR: sleep address. */ uint64_t l_cpuid; /* LONG: CPU id */ uint32_t l_rtime_sec; /* STRUCT TIMEVAL: Real time. */ uint32_t l_rtime_usec; /* STRUCT TIMEVAL: Real time. 
*/ uint32_t l_cpticks; /* INT: ticks during l_swtime */ uint32_t l_pctcpu; /* FIXPT_T: cpu usage for ps */ uint32_t l_pid; /* PID_T: process identifier */ char l_name[KI_LNAMELEN]; /* CHAR[]: name, may be empty */ }; /* * KERN_PROC_ARGS subtypes */ #define KERN_PROC_ARGV 1 /* argv */ #define KERN_PROC_NARGV 2 /* number of strings in above */ #define KERN_PROC_ENV 3 /* environ */ #define KERN_PROC_NENV 4 /* number of strings in above */ #define KERN_PROC_PATHNAME 5 /* path to executable */ #define KERN_PROC_CWD 6 /* current working dir */ /* * KERN_SYSVIPC subtypes */ #define KERN_SYSVIPC_INFO 1 /* struct: number of valid kern ids */ #define KERN_SYSVIPC_MSG 2 /* int: SysV message queue support */ #define KERN_SYSVIPC_SEM 3 /* int: SysV semaphore support */ #define KERN_SYSVIPC_SHM 4 /* int: SysV shared memory support */ #define KERN_SYSVIPC_SHMMAX 5 /* int: max shared memory segment size (bytes) */ #define KERN_SYSVIPC_SHMMNI 6 /* int: max number of shared memory identifiers */ #define KERN_SYSVIPC_SHMSEG 7 /* int: max shared memory segments per process */ #define KERN_SYSVIPC_SHMMAXPGS 8 /* int: max amount of shared memory (pages) */ #define KERN_SYSVIPC_SHMUSEPHYS 9 /* int: physical memory usage */ /* * KERN_SYSVIPC_INFO subtypes */ /* KERN_SYSVIPC_OMSG_INFO 1 */ /* KERN_SYSVIPC_OSEM_INFO 2 */ /* KERN_SYSVIPC_OSHM_INFO 3 */ #define KERN_SYSVIPC_MSG_INFO 4 /* msginfo and msgid_ds */ #define KERN_SYSVIPC_SEM_INFO 5 /* seminfo and semid_ds */ #define KERN_SYSVIPC_SHM_INFO 6 /* shminfo and shmid_ds */ /* * tty counter sysctl variables */ #define KERN_TKSTAT_NIN 1 /* total input character */ #define KERN_TKSTAT_NOUT 2 /* total output character */ #define KERN_TKSTAT_CANCC 3 /* canonical input character */ #define KERN_TKSTAT_RAWCC 4 /* raw input character */ /* * kern.drivers returns an array of these. */ struct kinfo_drivers { devmajor_t d_cmajor; devmajor_t d_bmajor; char d_name[24]; }; /* * KERN_BUF subtypes, like KERN_PROC2, where the four following mib * entries specify "which type of buf", "which particular buf", * "sizeof buf", and "how many". Currently, only "all buf" is * defined. */ #define KERN_BUF_ALL 0 /* all buffers */ /* * kern.buf returns an array of these structures, which are designed * both to be immune to 32/64 bit emulation issues and to provide * backwards compatibility. Note that the order here differs slightly * from the real struct buf in order to achieve proper 64 bit * alignment. */ struct buf_sysctl { uint32_t b_flags; /* LONG: B_* flags */ int32_t b_error; /* INT: Errno value */ int32_t b_prio; /* INT: Hint for buffer queue discipline */ uint32_t b_dev; /* DEV_T: Device associated with buffer */ uint64_t b_bufsize; /* LONG: Allocated buffer size */ uint64_t b_bcount; /* LONG: Valid bytes in buffer */ uint64_t b_resid; /* LONG: Remaining I/O */ uint64_t b_addr; /* CADDR_T: Memory, superblocks, indirect... */ uint64_t b_blkno; /* DADDR_T: Underlying physical block number */ uint64_t b_rawblkno; /* DADDR_T: Raw underlying physical block */ uint64_t b_iodone; /* PTR: Function called upon completion */ uint64_t b_proc; /* PTR: Associated proc if B_PHYS set */ uint64_t b_vp; /* PTR: File vnode */ uint64_t b_saveaddr; /* PTR: Original b_addr for physio */ uint64_t b_lblkno; /* DADDR_T: Logical block number */ }; #define KERN_BUFSLOP 20 /* * kern.file2 returns an array of these structures, which are designed * both to be immune to 32/64 bit emulation issues and to * provide backwards compatibility. 
The order differs slightly from * that of the real struct file, and some fields are taken from other * structures (struct vnode, struct proc) in order to make the file * information more useful. */ struct kinfo_file { uint64_t ki_fileaddr; /* PTR: address of struct file */ uint32_t ki_flag; /* INT: flags (see fcntl.h) */ uint32_t ki_iflags; /* INT: internal flags */ uint32_t ki_ftype; /* INT: descriptor type */ uint32_t ki_count; /* UINT: reference count */ uint32_t ki_msgcount; /* UINT: references from msg queue */ uint32_t ki_usecount; /* INT: number active users */ uint64_t ki_fucred; /* PTR: creds for descriptor */ uint32_t ki_fuid; /* UID_T: descriptor credentials */ uint32_t ki_fgid; /* GID_T: descriptor credentials */ uint64_t ki_fops; /* PTR: address of fileops */ uint64_t ki_foffset; /* OFF_T: offset */ uint64_t ki_fdata; /* PTR: descriptor data */ /* vnode information to glue this file to something */ uint64_t ki_vun; /* PTR: socket, specinfo, etc */ uint64_t ki_vsize; /* OFF_T: size of file */ uint32_t ki_vtype; /* ENUM: vnode type */ uint32_t ki_vtag; /* ENUM: type of underlying data */ uint64_t ki_vdata; /* PTR: private data for fs */ /* process information when retrieved via KERN_FILE_BYPID */ uint32_t ki_pid; /* PID_T: process id */ int32_t ki_fd; /* INT: descriptor number */ uint32_t ki_ofileflags; /* CHAR: open file flags */ uint32_t _ki_padto64bits; }; #define KERN_FILE_BYFILE 1 #define KERN_FILE_BYPID 2 #define KERN_FILESLOP 10 /* * kern.evcnt returns an array of these structures, which are designed both to * be immune to 32/64 bit emulation issues. Note that the struct here differs * from the real struct evcnt but contains the same information in order to * accommodate sysctl. */ struct evcnt_sysctl { uint64_t ev_count; /* current count */ uint64_t ev_addr; /* kernel address of evcnt */ uint64_t ev_parent; /* kernel address of parent */ uint8_t ev_type; /* EVCNT_TRAP_* */ uint8_t ev_grouplen; /* length of group with NUL */ uint8_t ev_namelen; /* length of name with NUL */ uint8_t ev_len; /* multiply by 8 */ /* * Now the group and name strings follow (both include the trailing * NUL). ev_name start at &ev_strings[ev_grouplen+1] */ char ev_strings[]; }; #define KERN_EVCNT_COUNT_ANY 0 #define KERN_EVCNT_COUNT_NONZERO 1 /* * kern.hashstat returns an array of these structures, which are designed * to be immune to 32/64 bit emulation issues. * * Hash users can register a filler function to fill the hashstat_sysctl * which can then be exposed via vmstat(1). * * See comments for hashstat_sysctl() in kern/subr_hash.c for details * on sysctl(3) usage. */ struct hashstat_sysctl { char hash_name[SYSCTL_NAMELEN]; char hash_desc[SYSCTL_NAMELEN]; uint64_t hash_size; uint64_t hash_used; uint64_t hash_items; uint64_t hash_maxchain; }; typedef int (*hashstat_func_t)(struct hashstat_sysctl *, bool); void hashstat_register(const char *, hashstat_func_t); /* * CTL_VM identifiers in <uvm/uvm_param.h> */ /* * The vm.proc.map sysctl allows a process to dump the VM layout of * another process as a series of entries. 
*/ #define KVME_TYPE_NONE 0 #define KVME_TYPE_OBJECT 1 #define KVME_TYPE_VNODE 2 #define KVME_TYPE_KERN 3 #define KVME_TYPE_DEVICE 4 #define KVME_TYPE_ANON 5 #define KVME_TYPE_SUBMAP 6 #define KVME_TYPE_UNKNOWN 255 #define KVME_PROT_READ 0x00000001 #define KVME_PROT_WRITE 0x00000002 #define KVME_PROT_EXEC 0x00000004 #define KVME_FLAG_COW 0x00000001 #define KVME_FLAG_NEEDS_COPY 0x00000002 #define KVME_FLAG_NOCOREDUMP 0x00000004 #define KVME_FLAG_PAGEABLE 0x00000008 #define KVME_FLAG_GROWS_UP 0x00000010 #define KVME_FLAG_GROWS_DOWN 0x00000020 struct kinfo_vmentry { uint64_t kve_start; /* Starting address. */ uint64_t kve_end; /* Finishing address. */ uint64_t kve_offset; /* Mapping offset in object */ uint32_t kve_type; /* Type of map entry. */ uint32_t kve_flags; /* Flags on map entry. */ uint32_t kve_count; /* Number of pages/entries */ uint32_t kve_wired_count; /* Number of wired pages */ uint32_t kve_advice; /* Advice */ uint32_t kve_attributes; /* Map attribute */ uint32_t kve_protection; /* Protection bitmask. */ uint32_t kve_max_protection; /* Max protection bitmask */ uint32_t kve_ref_count; /* VM obj ref count. */ uint32_t kve_inheritance; /* Inheritance */ uint64_t kve_vn_fileid; /* inode number if vnode */ uint64_t kve_vn_size; /* File size. */ uint64_t kve_vn_fsid; /* dev_t of vnode location */ uint64_t kve_vn_rdev; /* Device id if device. */ uint32_t kve_vn_type; /* Vnode type. */ uint32_t kve_vn_mode; /* File mode. */ char kve_path[PATH_MAX]; /* Path to VM obj, if any. */ }; /* * CTL_HW identifiers */ #define HW_MACHINE 1 /* string: machine class */ #define HW_MODEL 2 /* string: specific machine model */ #define HW_NCPU 3 /* int: number of cpus */ #define HW_BYTEORDER 4 /* int: machine byte order */ #define HW_PHYSMEM 5 /* int: total memory (bytes) */ #define HW_USERMEM 6 /* int: non-kernel memory (bytes) */ #define HW_PAGESIZE 7 /* int: software page size */ #define HW_DISKNAMES 8 /* string: disk drive names */ #define HW_IOSTATS 9 /* struct: iostats[] */ #define HW_MACHINE_ARCH 10 /* string: machine architecture */ #define HW_ALIGNBYTES 11 /* int: ALIGNBYTES for the kernel */ #define HW_CNMAGIC 12 /* string: console magic sequence(s) */ #define HW_PHYSMEM64 13 /* quad: total memory (bytes) */ #define HW_USERMEM64 14 /* quad: non-kernel memory (bytes) */ #define HW_IOSTATNAMES 15 /* string: iostat names */ #define HW_NCPUONLINE 16 /* number CPUs online */ /* * CTL_USER definitions */ #define USER_CS_PATH 1 /* string: _CS_PATH */ #define USER_BC_BASE_MAX 2 /* int: BC_BASE_MAX */ #define USER_BC_DIM_MAX 3 /* int: BC_DIM_MAX */ #define USER_BC_SCALE_MAX 4 /* int: BC_SCALE_MAX */ #define USER_BC_STRING_MAX 5 /* int: BC_STRING_MAX */ #define USER_COLL_WEIGHTS_MAX 6 /* int: COLL_WEIGHTS_MAX */ #define USER_EXPR_NEST_MAX 7 /* int: EXPR_NEST_MAX */ #define USER_LINE_MAX 8 /* int: LINE_MAX */ #define USER_RE_DUP_MAX 9 /* int: RE_DUP_MAX */ #define USER_POSIX2_VERSION 10 /* int: POSIX2_VERSION */ #define USER_POSIX2_C_BIND 11 /* int: POSIX2_C_BIND */ #define USER_POSIX2_C_DEV 12 /* int: POSIX2_C_DEV */ #define USER_POSIX2_CHAR_TERM 13 /* int: POSIX2_CHAR_TERM */ #define USER_POSIX2_FORT_DEV 14 /* int: POSIX2_FORT_DEV */ #define USER_POSIX2_FORT_RUN 15 /* int: POSIX2_FORT_RUN */ #define USER_POSIX2_LOCALEDEF 16 /* int: POSIX2_LOCALEDEF */ #define USER_POSIX2_SW_DEV 17 /* int: POSIX2_SW_DEV */ #define USER_POSIX2_UPE 18 /* int: POSIX2_UPE */ #define USER_STREAM_MAX 19 /* int: POSIX2_STREAM_MAX */ #define USER_TZNAME_MAX 20 /* int: _POSIX_TZNAME_MAX */ #define USER_ATEXIT_MAX 21 /* int: 
{ATEXIT_MAX} */ /* * CTL_DDB definitions */ #define DDBCTL_RADIX 1 /* int: Input and output radix */ #define DDBCTL_MAXOFF 2 /* int: max symbol offset */ #define DDBCTL_MAXWIDTH 3 /* int: width of the display line */ #define DDBCTL_LINES 4 /* int: number of display lines */ #define DDBCTL_TABSTOPS 5 /* int: tab width */ #define DDBCTL_ONPANIC 6 /* int: DDB on panic if non-zero */ #define DDBCTL_FROMCONSOLE 7 /* int: DDB via console if non-zero */ /* * CTL_DEBUG definitions * * Second level identifier specifies which debug variable. * Third level identifier specifies which structure component. */ #define CTL_DEBUG_NAME 0 /* string: variable name */ #define CTL_DEBUG_VALUE 1 /* int: variable value */ /* * CTL_PROC subtype. Either a PID, or a magic value for the current proc. */ #define PROC_CURPROC (~((u_int)1 << 31)) /* * CTL_PROC tree: either corename (string), a limit * (rlimit.<type>.{hard,soft}, int), a process stop * condition, or paxflags. */ #define PROC_PID_CORENAME 1 #define PROC_PID_LIMIT 2 #define PROC_PID_STOPFORK 3 #define PROC_PID_STOPEXEC 4 #define PROC_PID_STOPEXIT 5 #define PROC_PID_PAXFLAGS 6 /* Limit types from <sys/resources.h> */ #define PROC_PID_LIMIT_CPU (RLIMIT_CPU+1) #define PROC_PID_LIMIT_FSIZE (RLIMIT_FSIZE+1) #define PROC_PID_LIMIT_DATA (RLIMIT_DATA+1) #define PROC_PID_LIMIT_STACK (RLIMIT_STACK+1) #define PROC_PID_LIMIT_CORE (RLIMIT_CORE+1) #define PROC_PID_LIMIT_RSS (RLIMIT_RSS+1) #define PROC_PID_LIMIT_MEMLOCK (RLIMIT_MEMLOCK+1) #define PROC_PID_LIMIT_NPROC (RLIMIT_NPROC+1) #define PROC_PID_LIMIT_NOFILE (RLIMIT_NOFILE+1) #define PROC_PID_LIMIT_SBSIZE (RLIMIT_SBSIZE+1) #define PROC_PID_LIMIT_AS (RLIMIT_AS+1) #define PROC_PID_LIMIT_NTHR (RLIMIT_NTHR+1) /* for each type, either hard or soft value */ #define PROC_PID_LIMIT_TYPE_SOFT 1 #define PROC_PID_LIMIT_TYPE_HARD 2 /* * Export PAX flag definitions to userland. * * XXX These are duplicated from sys/pax.h but that header is not * XXX installed. */ #define CTL_PROC_PAXFLAGS_ASLR 0x01 #define CTL_PROC_PAXFLAGS_MPROTECT 0x02 #define CTL_PROC_PAXFLAGS_GUARD 0x04 /* * CTL_EMUL definitions * * Second level identifier specifies which emulation variable. * Subsequent levels are specified in the emulations themselves. */ #define EMUL_LINUX 1 #define EMUL_LINUX32 5 #ifdef _KERNEL #if defined(_KERNEL_OPT) #include "opt_sysctl.h" #endif /* Root node of the kernel sysctl tree */ extern struct sysctlnode sysctl_root; /* * A log of nodes created by a setup function or set of setup * functions so that they can be torn down in one "transaction" * when no longer needed. * * Users of the log merely pass a pointer to a pointer, and the sysctl * infrastructure takes care of the rest. */ struct sysctllog; /* * CTL_DEBUG variables. * * These are declared as separate variables so that they can be * individually initialized at the location of their associated * variable. The loader prevents multiple use by issuing errors * if a variable is initialized in more than one place. They are * aggregated into an array in debug_sysctl(), so that it can * conveniently locate them when queried. If more debugging * variables are added, they must also be declared here and also * entered into the array. * * Note that the debug subtree is largely obsolescent in terms of * functionality now that we have dynamic sysctl, but the * infrastructure is retained for backwards compatibility. 
*/ struct ctldebug { const char *debugname; /* name of debugging variable */ int *debugvar; /* pointer to debugging variable */ }; #ifdef DEBUG extern struct ctldebug debug0, debug1, debug2, debug3, debug4; extern struct ctldebug debug5, debug6, debug7, debug8, debug9; extern struct ctldebug debug10, debug11, debug12, debug13, debug14; extern struct ctldebug debug15, debug16, debug17, debug18, debug19; #endif /* DEBUG */ #define SYSCTLFN_PROTO const int *, u_int, void *, \ size_t *, const void *, size_t, \ const int *, struct lwp *, const struct sysctlnode * #define SYSCTLFN_ARGS const int *name, u_int namelen, \ void *oldp, size_t *oldlenp, \ const void *newp, size_t newlen, \ const int *oname, struct lwp *l, \ const struct sysctlnode *rnode #define SYSCTLFN_CALL(node) name, namelen, oldp, \ oldlenp, newp, newlen, \ oname, l, node #ifdef RUMP_USE_CTOR #include <sys/kernel.h> struct sysctl_setup_chain { void (*ssc_func)(struct sysctllog **); LIST_ENTRY(sysctl_setup_chain) ssc_entries; }; LIST_HEAD(sysctl_boot_chain, sysctl_setup_chain); #define _SYSCTL_REGISTER(name) \ static struct sysctl_setup_chain __CONCAT(ssc,name) = { \ .ssc_func = name, \ }; \ static void sysctlctor_##name(void) __attribute__((constructor)); \ static void sysctlctor_##name(void) \ { \ struct sysctl_setup_chain *ssc = &__CONCAT(ssc,name); \ extern struct sysctl_boot_chain sysctl_boot_chain; \ if (cold) { \ LIST_INSERT_HEAD(&sysctl_boot_chain, ssc, ssc_entries); \ } \ } \ static void sysctldtor_##name(void) __attribute__((destructor)); \ static void sysctldtor_##name(void) \ { \ struct sysctl_setup_chain *ssc = &__CONCAT(ssc,name); \ if (cold) { \ LIST_REMOVE(ssc, ssc_entries); \ } \ } #else /* RUMP_USE_CTOR */ #define _SYSCTL_REGISTER(name) __link_set_add_text(sysctl_funcs, name); #endif /* RUMP_USE_CTOR */ #ifdef _MODULE #define SYSCTL_SETUP_PROTO(name) \ void name(struct sysctllog **) #ifdef SYSCTL_DEBUG_SETUP #define SYSCTL_SETUP(name, desc) \ SYSCTL_SETUP_PROTO(name); \ static void __CONCAT(___,name)(struct sysctllog **); \ void name(struct sysctllog **clog) { \ printf("%s\n", desc); \ __CONCAT(___,name)(clog); } \ _SYSCTL_REGISTER(name); \ static void __CONCAT(___,name)(struct sysctllog **clog) #else /* !SYSCTL_DEBUG_SETUP */ #define SYSCTL_SETUP(name, desc) \ SYSCTL_SETUP_PROTO(name); \ _SYSCTL_REGISTER(name); \ void name(struct sysctllog **clog) #endif /* !SYSCTL_DEBUG_SETUP */ #else /* !_MODULE */ #define SYSCTL_SETUP_PROTO(name) #ifdef SYSCTL_DEBUG_SETUP #define SYSCTL_SETUP(name, desc) \ static void __CONCAT(___,name)(struct sysctllog **); \ static void name(struct sysctllog **clog) { \ printf("%s\n", desc); \ __CONCAT(___,name)(clog); } \ _SYSCTL_REGISTER(name); \ static void __CONCAT(___,name)(struct sysctllog **clog) #else /* !SYSCTL_DEBUG_SETUP */ #define SYSCTL_SETUP(name, desc) \ static void name(struct sysctllog **); \ _SYSCTL_REGISTER(name); \ static void name(struct sysctllog **clog) #endif /* !SYSCTL_DEBUG_SETUP */ #endif /* !_MODULE */ /* * Internal sysctl function calling convention: * * (*sysctlfn)(name, namelen, oldval, oldlenp, newval, newlen, * origname, lwp, node); * * The name parameter points at the next component of the name to be * interpreted. The namelen parameter is the number of integers in * the name. The origname parameter points to the start of the name * being parsed. The node parameter points to the node on which the * current operation is to be performed. 
*/ typedef int (*sysctlfn)(SYSCTLFN_PROTO); /* * used in more than just sysctl */ void fill_eproc(struct proc *, struct eproc *, bool, bool); void fill_kproc2(struct proc *, struct kinfo_proc2 *, bool, bool); /* * subsystem setup */ void sysctl_init(void); void sysctl_basenode_init(void); void sysctl_finalize(void); /* * typical syscall call order */ void sysctl_lock(bool); int sysctl_dispatch(SYSCTLFN_PROTO); void sysctl_unlock(void); void sysctl_relock(void); /* * tree navigation primitives (must obtain lock before using these) */ int sysctl_locate(struct lwp *, const int *, u_int, const struct sysctlnode **, int *); int sysctl_query(SYSCTLFN_PROTO); int sysctl_create(SYSCTLFN_PROTO); int sysctl_destroy(SYSCTLFN_PROTO); int sysctl_lookup(SYSCTLFN_PROTO); int sysctl_describe(SYSCTLFN_PROTO); /* * simple variadic interface for adding/removing nodes */ int sysctl_createv(struct sysctllog **, int, const struct sysctlnode **, const struct sysctlnode **, int, int, const char *, const char *, sysctlfn, u_quad_t, void *, size_t, ...); int sysctl_destroyv(struct sysctlnode *, ...); #define VERIFY_FN(ctl_type, c_type) \ __always_inline static __inline void * \ __sysctl_verify_##ctl_type##_arg(c_type *arg) \ { \ return arg; \ } VERIFY_FN(CTLTYPE_NODE, struct sysctlnode); VERIFY_FN(CTLTYPE_INT, int); VERIFY_FN(CTLTYPE_STRING, char); VERIFY_FN(CTLTYPE_QUAD, int64_t); VERIFY_FN(CTLTYPE_STRUCT, void); VERIFY_FN(CTLTYPE_BOOL, bool); VERIFY_FN(CTLTYPE_LONG, long); #undef VERIFY_FN #define sysctl_createv(lg, cfl, rn, cn, fl, type, nm, desc, fn, qv, newp, ...) \ sysctl_createv(lg, cfl, rn, cn, fl, type, nm, desc, fn, qv, \ __sysctl_verify_##type##_arg(newp), __VA_ARGS__) /* * miscellany */ void sysctl_dump(const struct sysctlnode *); void sysctl_free(struct sysctlnode *); void sysctl_teardown(struct sysctllog **); void sysctl_log_print(const struct sysctllog *); #ifdef SYSCTL_INCLUDE_DESCR #define SYSCTL_DESCR(s) s #else /* SYSCTL_INCLUDE_DESCR */ #define SYSCTL_DESCR(s) NULL #endif /* SYSCTL_INCLUDE_DESCR */ /* * simple interface similar to old interface for in-kernel consumption */ int old_sysctl(int *, u_int, void *, size_t *, void *, size_t, struct lwp *); /* * these helpers are in other files (XXX so should the nodes be) or * are used by more than one node */ int sysctl_hw_tapenames(SYSCTLFN_PROTO); int sysctl_hw_tapestats(SYSCTLFN_PROTO); int sysctl_kern_vnode(SYSCTLFN_PROTO); int sysctl_net_inet_ip_ports(SYSCTLFN_PROTO); int sysctl_consdev(SYSCTLFN_PROTO); int sysctl_root_device(SYSCTLFN_PROTO); int sysctl_vfs_generic_fstypes(SYSCTLFN_PROTO); /* * primitive helper stubs */ int sysctl_needfunc(SYSCTLFN_PROTO); int sysctl_notavail(SYSCTLFN_PROTO); int sysctl_null(SYSCTLFN_PROTO); int sysctl_copyin(struct lwp *, const void *, void *, size_t); int sysctl_copyout(struct lwp *, const void *, void *, size_t); int sysctl_copyinstr(struct lwp *, const void *, void *, size_t, size_t *); u_int sysctl_map_flags(const u_int *, u_int); MALLOC_DECLARE(M_SYSCTLNODE); MALLOC_DECLARE(M_SYSCTLDATA); extern const u_int sysctl_lwpflagmap[]; #else /* !_KERNEL */ #include <sys/cdefs.h> typedef void *sysctlfn; __BEGIN_DECLS int sysctl(const int *, u_int, void *, size_t *, const void *, size_t); int sysctlbyname(const char *, void *, size_t *, const void *, size_t); int sysctlgetmibinfo(const char *, int *, u_int *, char *, size_t *, struct sysctlnode **, int); int sysctlnametomib(const char *, int *, size_t *); int proc_compare(const struct kinfo_proc2 *, const struct kinfo_lwp *, const struct kinfo_proc2 *, const struct 
kinfo_lwp *); void *asysctl(const int *, size_t, size_t *); void *asysctlbyname(const char *, size_t *); int __learn_tree(int *, u_int, struct sysctlnode *); __END_DECLS #endif /* !_KERNEL */ #ifdef __COMPAT_SYSCTL /* * old node definitions go here */ #endif /* __COMPAT_SYSCTL */ /* * padding makes alignment magically "work" for 32/64 compatibility at * the expense of making things bigger on 32 bit platforms. */ #if defined(_LP64) || (BYTE_ORDER == LITTLE_ENDIAN) #define __sysc_pad(type) union { uint64_t __sysc_upad; \ struct { type __sysc_sdatum; } __sysc_ustr; } #else #define __sysc_pad(type) union { uint64_t __sysc_upad; \ struct { uint32_t __sysc_spad; type __sysc_sdatum; } __sysc_ustr; } #endif #define __sysc_unpad(x) x.__sysc_ustr.__sysc_sdatum /* * The following is for gcc2, which doesn't handle __sysc_unpad(). * The code gets a little less ugly this way. */ #define sysc_init_field(field, value) \ .field = { .__sysc_ustr = { .__sysc_sdatum = (value), }, } struct sysctlnode { uint32_t sysctl_flags; /* flags and type */ int32_t sysctl_num; /* mib number */ char sysctl_name[SYSCTL_NAMELEN]; /* node name */ uint32_t sysctl_ver; /* node's version vs. rest of tree */ uint32_t __rsvd; union { struct { uint32_t suc_csize; /* size of child node array */ uint32_t suc_clen; /* number of valid children */ __sysc_pad(struct sysctlnode*) _suc_child; /* array of child nodes */ } scu_child; struct { __sysc_pad(void*) _sud_data; /* pointer to external data */ __sysc_pad(size_t) _sud_offset; /* offset to data */ } scu_data; int32_t scu_alias; /* node this node refers to */ int32_t scu_idata; /* immediate "int" data */ u_quad_t scu_qdata; /* immediate "u_quad_t" data */ bool scu_bdata; /* immediate bool data */ } sysctl_un; __sysc_pad(size_t) _sysctl_size; /* size of instrumented data */ __sysc_pad(sysctlfn) _sysctl_func; /* access helper function */ __sysc_pad(struct sysctlnode*) _sysctl_parent; /* parent of this node */ __sysc_pad(const char *) _sysctl_desc; /* description of node */ }; /* * padded data */ #define suc_child __sysc_unpad(_suc_child) #define sud_data __sysc_unpad(_sud_data) #define sud_offset __sysc_unpad(_sud_offset) #define sysctl_size __sysc_unpad(_sysctl_size) #define sysctl_func __sysc_unpad(_sysctl_func) #define sysctl_parent __sysc_unpad(_sysctl_parent) #define sysctl_desc __sysc_unpad(_sysctl_desc) /* * nested data (may also be padded) */ #define sysctl_csize sysctl_un.scu_child.suc_csize #define sysctl_clen sysctl_un.scu_child.suc_clen #define sysctl_child sysctl_un.scu_child.suc_child #define sysctl_data sysctl_un.scu_data.sud_data #define sysctl_offset sysctl_un.scu_data.sud_offset #define sysctl_alias sysctl_un.scu_alias #define sysctl_idata sysctl_un.scu_idata #define sysctl_qdata sysctl_un.scu_qdata #define sysctl_bdata sysctl_un.scu_bdata /* * when requesting a description of a node (a set of nodes, actually), * you get back an "array" of these, where the actual length of the * descr_str is noted in descr_len (which includes the trailing nul * byte), rounded up to the nearest four (sizeof(int32_t) actually). * * NEXT_DESCR() will take a pointer to a description and advance it to * the next description. 
*/ struct sysctldesc { int32_t descr_num; /* mib number of node */ uint32_t descr_ver; /* version of node */ uint32_t descr_len; /* length of description string */ char descr_str[1]; /* not really 1...see above */ }; #define __sysc_desc_roundup(x) ((((x) - 1) | (sizeof(int32_t) - 1)) + 1) #define __sysc_desc_len(l) (offsetof(struct sysctldesc, descr_str) +\ __sysc_desc_roundup(l)) #define __sysc_desc_adv(d, l) \ (/*XXXUNCONST ptr cast*/(struct sysctldesc *) \ __UNCONST(((const char*)(d)) + __sysc_desc_len(l))) #define NEXT_DESCR(d) __sysc_desc_adv((d), (d)->descr_len) static __inline const struct sysctlnode * sysctl_rootof(const struct sysctlnode *n) { while (n->sysctl_parent != NULL) n = n->sysctl_parent; return (n); } #endif /* !_SYS_SYSCTL_H_ */
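/*
 * Illustrative sketch only, not part of sys/sysctl.h: a typical in-kernel
 * consumer of the createv interface declared above.  The node names
 * ("example", "verbose") and the variable example_verbose are hypothetical;
 * real nodes attach under an existing top-level identifier such as CTL_KERN.
 */
#include <sys/sysctl.h>

static int example_verbose = 0;

SYSCTL_SETUP(sysctl_example_setup, "example sysctl subtree setup")
{
	const struct sysctlnode *node = NULL;

	/* kern.example: a permanent container node, returned in 'node'. */
	sysctl_createv(clog, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "example",
	    SYSCTL_DESCR("Example subtree"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);
	if (node == NULL)
		return;

	/* kern.example.verbose: read/write int backed by example_verbose. */
	sysctl_createv(clog, 0, &node, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "verbose",
	    SYSCTL_DESCR("Verbosity of the example subsystem"),
	    NULL, 0, &example_verbose, 0,
	    CTL_CREATE, CTL_EOL);
}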
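/*
 * Illustrative sketch only: reading a couple of the CTL_KERN values declared
 * above from userland via the sysctl(3) and sysctlbyname(3) prototypes in the
 * non-kernel section of the header.  Error handling is minimal on purpose.
 */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int mib[2] = { CTL_KERN, KERN_OSTYPE };
	char ostype[64];
	int maxproc;
	size_t len;

	/* Numeric MIB path: kern.ostype is a string. */
	len = sizeof(ostype);
	if (sysctl(mib, 2, ostype, &len, NULL, 0) == -1)
		return 1;

	/* Name-based lookup: kern.maxproc is an int. */
	len = sizeof(maxproc);
	if (sysctlbyname("kern.maxproc", &maxproc, &len, NULL, 0) == -1)
		return 1;

	printf("%s: maxproc=%d\n", ostype, maxproc);
	return 0;
}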
/* $NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $ */ /*- * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation * Facility, NASA Ames Research Center. * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * This code is derived from software contributed to The NetBSD Foundation * by Rui Paulo. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_tcp_debug.h" #include "opt_tcp_congctl.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/errno.h> #include <sys/syslog.h> #include <sys/pool.h> #include <sys/domain.h> #include <sys/kernel.h> #include <sys/mutex.h> #include <net/if.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/in_var.h> #include <netinet/ip_var.h> #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_pcb.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_var.h> #include <netinet/icmp6.h> #endif #include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_congctl.h> #ifdef TCP_DEBUG #include <netinet/tcp_debug.h> #endif /* * TODO: * consider separating the actual implementations in another file. */ static void tcp_common_congestion_exp(struct tcpcb *, int, int); static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *); static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *); static void tcp_reno_slow_retransmit(struct tcpcb *); static void tcp_reno_fast_retransmit_newack(struct tcpcb *, const struct tcphdr *); static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *); static void tcp_reno_congestion_exp(struct tcpcb *tp); static int tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *); static void tcp_newreno_fast_retransmit_newack(struct tcpcb *, const struct tcphdr *); static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *); static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *); static void tcp_cubic_slow_retransmit(struct tcpcb *tp); static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *); static void tcp_cubic_congestion_exp(struct tcpcb *); static void tcp_congctl_fillnames(void); extern int tcprexmtthresh; MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures"); /* currently selected global congestion control */ char tcp_congctl_global_name[TCPCC_MAXLEN]; /* available global congestion control algorithms */ char tcp_congctl_avail[10 * TCPCC_MAXLEN]; /* * Used to list the available congestion control algorithms. 
*/ TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd = TAILQ_HEAD_INITIALIZER(tcp_congctlhd); static struct tcp_congctlent * tcp_congctl_global; static kmutex_t tcp_congctl_mtx; void tcp_congctl_init(void) { int r __diagused; mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE); /* Base algorithms. */ r = tcp_congctl_register("reno", &tcp_reno_ctl); KASSERT(r == 0); r = tcp_congctl_register("newreno", &tcp_newreno_ctl); KASSERT(r == 0); r = tcp_congctl_register("cubic", &tcp_cubic_ctl); KASSERT(r == 0); /* NewReno is the default. */ #ifndef TCP_CONGCTL_DEFAULT #define TCP_CONGCTL_DEFAULT "newreno" #endif r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT); KASSERT(r == 0); } /* * Register a congestion algorithm and select it if we have none. */ int tcp_congctl_register(const char *name, const struct tcp_congctl *tcc) { struct tcp_congctlent *ntcc, *tccp; TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) if (!strcmp(name, tccp->congctl_name)) { /* name already registered */ return EEXIST; } ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO); strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1); ntcc->congctl_ctl = tcc; TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent); tcp_congctl_fillnames(); if (TAILQ_FIRST(&tcp_congctlhd) == ntcc) tcp_congctl_select(NULL, name); return 0; } int tcp_congctl_unregister(const char *name) { struct tcp_congctlent *tccp, *rtccp; unsigned int size; rtccp = NULL; size = 0; TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { if (!strcmp(name, tccp->congctl_name)) rtccp = tccp; size++; } if (!rtccp) return ENOENT; if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt) return EBUSY; TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent); free(rtccp, M_TCPCONGCTL); tcp_congctl_fillnames(); return 0; } /* * Select a congestion algorithm by name. */ int tcp_congctl_select(struct tcpcb *tp, const char *name) { struct tcp_congctlent *tccp, *old_tccp, *new_tccp; bool old_found, new_found; KASSERT(name); old_found = (tp == NULL || tp->t_congctl == NULL); old_tccp = NULL; new_found = false; new_tccp = NULL; TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { if (!old_found && tccp->congctl_ctl == tp->t_congctl) { old_tccp = tccp; old_found = true; } if (!new_found && !strcmp(name, tccp->congctl_name)) { new_tccp = tccp; new_found = true; } if (new_found && old_found) { if (tp) { mutex_enter(&tcp_congctl_mtx); if (old_tccp) old_tccp->congctl_refcnt--; tp->t_congctl = new_tccp->congctl_ctl; new_tccp->congctl_refcnt++; mutex_exit(&tcp_congctl_mtx); } else { tcp_congctl_global = new_tccp; strlcpy(tcp_congctl_global_name, new_tccp->congctl_name, sizeof(tcp_congctl_global_name) - 1); } return 0; } } return EINVAL; } void tcp_congctl_release(struct tcpcb *tp) { struct tcp_congctlent *tccp; KASSERT(tp->t_congctl); TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { if (tccp->congctl_ctl == tp->t_congctl) { tccp->congctl_refcnt--; return; } } } /* * Returns the name of a congestion algorithm. 
*/ const char * tcp_congctl_bystruct(const struct tcp_congctl *tcc) { struct tcp_congctlent *tccp; KASSERT(tcc); TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) if (tccp->congctl_ctl == tcc) return tccp->congctl_name; return NULL; } static void tcp_congctl_fillnames(void) { struct tcp_congctlent *tccp; const char *delim = " "; tcp_congctl_avail[0] = '\0'; TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { strlcat(tcp_congctl_avail, tccp->congctl_name, sizeof(tcp_congctl_avail) - 1); if (TAILQ_NEXT(tccp, congctl_ent)) strlcat(tcp_congctl_avail, delim, sizeof(tcp_congctl_avail) - 1); } } /* ------------------------------------------------------------------------ */ /* * Common stuff */ /* Window reduction (1-beta) for [New]Reno: 0.5 */ #define RENO_BETAA 1 #define RENO_BETAB 2 /* Window reduction (1-beta) for Cubic: 0.8 */ #define CUBIC_BETAA 4 #define CUBIC_BETAB 5 /* Draft Rhee Section 4.1 */ #define CUBIC_CA 4 #define CUBIC_CB 10 static void tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab) { u_long win; /* * Reduce the congestion window and the slow start threshold. */ win = ulmin(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_segsz; tp->snd_recover = tp->snd_max; tp->snd_cwnd = tp->snd_ssthresh; /* * When using TCP ECN, notify the peer that * we reduced the cwnd. */ if (TCP_ECN_ALLOWED(tp)) tp->t_flags |= TF_ECN_SND_CWR; } /* ------------------------------------------------------------------------ */ /* * TCP/Reno congestion control. */ static void tcp_reno_congestion_exp(struct tcpcb *tp) { tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB); } static int tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) { /* * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * If we are using TCP/SACK, then enter * Fast Recovery if the receiver SACKs * data that is tcprexmtthresh * MSS * bytes past the last ACKed segment, * irrespective of the number of DupAcks. */ tcp_seq onxt = tp->snd_nxt; tp->t_partialacks = 0; TCP_TIMER_DISARM(tp, TCPT_REXMT); tp->t_rtttime = 0; if (TCP_SACK_ENABLED(tp)) { tp->t_dupacks = tcprexmtthresh; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_segsz; (void) tcp_output(tp); return 0; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_segsz; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; return 0; } static int tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) { /* * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). */ tcp_reno_congestion_exp(tp); return tcp_reno_do_fast_retransmit(tp, th); } static void tcp_reno_slow_retransmit(struct tcpcb *tp) { u_long win; /* * Close the congestion window down to one segment * (we'll open it by one segment for each ack we get). * Since we probably have a window's worth of unacked * data accumulated, this "slow start" keeps us from * dumping all that data as back-to-back packets (which * might overwhelm an intermediate gateway). * * There are two phases to the opening: Initially we * open by one mss on each ack. This makes the window * size increase exponentially with time. 
If the * window is larger than the path can handle, this * exponential growth results in dropped packet(s) * almost immediately. To get more time between * drops but still "push" the network to take advantage * of improving conditions, we switch from exponential * to linear window opening at some threshold size. * For a threshold, we use half the current window * size, truncated to a multiple of the mss. * * (the minimum cwnd that will give us exponential * growth is 2 mss. We don't allow the threshold * to go below this.) */ win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz; if (win < 2) win = 2; /* Loss Window MUST be one segment. */ tp->snd_cwnd = tp->t_segsz; tp->snd_ssthresh = win * tp->t_segsz; tp->t_partialacks = -1; tp->t_dupacks = 0; tp->t_bytes_acked = 0; if (TCP_ECN_ALLOWED(tp)) tp->t_flags |= TF_ECN_SND_CWR; } static void tcp_reno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th) { if (tp->t_partialacks < 0) { /* * We were not in fast recovery. Reset the duplicate ack * counter. */ tp->t_dupacks = 0; } else { /* * Clamp the congestion window to the crossover point and * exit fast recovery. */ if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_partialacks = -1; tp->t_dupacks = 0; tp->t_bytes_acked = 0; if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack)) tp->snd_fack = th->th_ack; } } static void tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th) { /* * When new data is acked, open the congestion window. */ u_int cw = tp->snd_cwnd; u_int incr = tp->t_segsz; if (tcp_do_abc) { /* * RFC 3465 Appropriate Byte Counting (ABC) */ int acked = th->th_ack - tp->snd_una; if (cw >= tp->snd_ssthresh) { tp->t_bytes_acked += acked; if (tp->t_bytes_acked >= cw) { /* Time to increase the window. */ tp->t_bytes_acked -= cw; } else { /* No need to increase yet. */ incr = 0; } } else { /* * use 2*SMSS or 1*SMSS for the "L" param, * depending on sysctl setting. * * (See RFC 3465 2.3 Choosing the Limit) */ u_int abc_lim; abc_lim = (tcp_abc_aggressive == 0 || tp->snd_nxt != tp->snd_max) ? incr : incr * 2; incr = uimin(acked, abc_lim); } } else { /* * If the window gives us less than ssthresh packets * in flight, open exponentially (segsz per packet). * Otherwise open linearly: segsz per window * (segsz^2 / cwnd per packet). */ if (cw >= tp->snd_ssthresh) { incr = incr * incr / cw; } } tp->snd_cwnd = uimin(cw + incr, TCP_MAXWIN << tp->snd_scale); } const struct tcp_congctl tcp_reno_ctl = { .fast_retransmit = tcp_reno_fast_retransmit, .slow_retransmit = tcp_reno_slow_retransmit, .fast_retransmit_newack = tcp_reno_fast_retransmit_newack, .newack = tcp_reno_newack, .cong_exp = tcp_reno_congestion_exp, }; /* * TCP/NewReno Congestion control. */ static int tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) { if (SEQ_LT(th->th_ack, tp->snd_high)) { /* * False fast retransmit after timeout. * Do not enter fast recovery */ tp->t_dupacks = 0; return 1; } /* * Fast retransmit is same as reno. */ return tcp_reno_fast_retransmit(tp, th); } /* * Implement the NewReno response to a new ack, checking for partial acks in * fast recovery. */ static void tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th) { if (tp->t_partialacks < 0) { /* * We were not in fast recovery. Reset the duplicate ack * counter. */ tp->t_dupacks = 0; } else if (SEQ_LT(th->th_ack, tp->snd_recover)) { /* * This is a partial ack. 
Retransmit the first unacknowledged * segment and deflate the congestion window by the amount of * acknowledged data. Do not exit fast recovery. */ tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; int sack_num_segs = 1, sack_bytes_rxmt = 0; /* * snd_una has not yet been updated and the socket's send * buffer has not yet drained off the ACK'd data, so we * have to leave snd_una as it was to get the correct data * offset in tcp_output(). */ tp->t_partialacks++; TCP_TIMER_DISARM(tp, TCPT_REXMT); tp->t_rtttime = 0; if (TCP_SACK_ENABLED(tp)) { /* * Partial ack handling within a sack recovery episode. * Keeping this very simple for now. When a partial ack * is received, force snd_cwnd to a value that will * allow the sender to transmit no more than 2 segments. * If necessary, a fancier scheme can be adopted at a * later point, but for now, the goal is to prevent the * sender from bursting a large amount of data in the * midst of sack recovery. */ /* * send one or 2 segments based on how much * new data was acked */ if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2) sack_num_segs = 2; (void)tcp_sack_output(tp, &sack_bytes_rxmt); tp->snd_cwnd = sack_bytes_rxmt + (tp->snd_nxt - tp->sack_newdata) + sack_num_segs * tp->t_segsz; tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } else { tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond ACK'd offset * snd_una is not yet updated when we're called */ tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una); (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that * tp->snd_una not updated yet. */ tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz); } } else { /* * Complete ack. Inflate the congestion window to ssthresh * and exit fast recovery. * * Window inflation should have left us with approx. * snd_ssthresh outstanding data. But in case we * would be inclined to send a burst, better to do * it via the slow start mechanism. */ if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) + tp->t_segsz; else tp->snd_cwnd = tp->snd_ssthresh; tp->t_partialacks = -1; tp->t_dupacks = 0; tp->t_bytes_acked = 0; if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack)) tp->snd_fack = th->th_ack; } } static void tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th) { /* * If we are still in fast recovery (meaning we are using * NewReno and we have only received partial acks), do not * inflate the window yet. 
*/ if (tp->t_partialacks < 0) tcp_reno_newack(tp, th); } const struct tcp_congctl tcp_newreno_ctl = { .fast_retransmit = tcp_newreno_fast_retransmit, .slow_retransmit = tcp_reno_slow_retransmit, .fast_retransmit_newack = tcp_newreno_fast_retransmit_newack, .newack = tcp_newreno_newack, .cong_exp = tcp_reno_congestion_exp, }; /* * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02 */ /* Cubic prototypes */ static void tcp_cubic_update_ctime(struct tcpcb *tp); static uint32_t tcp_cubic_diff_ctime(struct tcpcb *); static uint32_t tcp_cubic_cbrt(uint32_t); static ulong tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t); /* Cubic TIME functions - XXX I don't like using timevals and microuptime */ /* * Set congestion timer to now */ static void tcp_cubic_update_ctime(struct tcpcb *tp) { struct timeval now_timeval; getmicrouptime(&now_timeval); tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000; } /* * miliseconds from last congestion */ static uint32_t tcp_cubic_diff_ctime(struct tcpcb *tp) { struct timeval now_timeval; getmicrouptime(&now_timeval); return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 - tp->snd_cubic_ctime; } /* * Approximate cubic root */ #define CBRT_ROUNDS 30 static uint32_t tcp_cubic_cbrt(uint32_t v) { int i, rounds = CBRT_ROUNDS; uint64_t x = v / 3; /* We fail to calculate correct for small numbers */ if (v == 0) return 0; else if (v < 4) return 1; /* * largest x that 2*x^3+3*x fits 64bit * Avoid overflow for a time cost */ if (x > 2097151) rounds += 10; for (i = 0; i < rounds; i++) if (rounds == CBRT_ROUNDS) x = (v + 2 * x * x * x) / (3 * x * x); else /* Avoid overflow */ x = v / (3 * x * x) + 2 * x / 3; return (uint32_t)x; } /* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */ static ulong tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt) { uint32_t K; long tK3; /* Section 3.1 Eq. 
2 */ K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB * CUBIC_CB / CUBIC_CA); /* (t-K)^3 - not clear why is the measure unit mattering */ tK3 = (long)(ms_elapsed + rtt) - (long)K; tK3 = tK3 * tK3 * tK3; return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax; } static void tcp_cubic_congestion_exp(struct tcpcb *tp) { /* * Congestion - Set WMax and shrink cwnd */ tcp_cubic_update_ctime(tp); /* Section 3.6 - Fast Convergence */ if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) { tp->snd_cubic_wmax_last = tp->snd_cubic_wmax; tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 + tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2; } else { tp->snd_cubic_wmax_last = tp->snd_cubic_wmax; tp->snd_cubic_wmax = tp->snd_cwnd; } tp->snd_cubic_wmax = uimax(tp->t_segsz, tp->snd_cubic_wmax); /* Shrink CWND */ tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB); } static int tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) { if (SEQ_LT(th->th_ack, tp->snd_high)) { /* See newreno */ tp->t_dupacks = 0; return 1; } /* * mark WMax */ tcp_cubic_congestion_exp(tp); /* Do fast retransmit */ return tcp_reno_do_fast_retransmit(tp, th); } static void tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th) { uint32_t ms_elapsed, rtt; u_long w_tcp; /* Congestion avoidance and not in fast recovery and usable rtt */ if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 && /* * t_srtt is 1/32 units of slow ticks * converting it in ms would be equal to * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ */ (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) { ms_elapsed = tcp_cubic_diff_ctime(tp); /* Compute W_tcp(t) */ w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB + ms_elapsed / rtt / 3; if (tp->snd_cwnd > w_tcp) { /* Not in TCP friendly mode */ tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) - tp->snd_cwnd) / tp->snd_cwnd; } else { /* friendly TCP mode */ tp->snd_cwnd = w_tcp; } /* Make sure we are within limits */ tp->snd_cwnd = uimax(tp->snd_cwnd, tp->t_segsz); tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale); } else { /* Use New Reno */ tcp_newreno_newack(tp, th); } } static void tcp_cubic_slow_retransmit(struct tcpcb *tp) { /* Timeout - Mark new congestion */ tcp_cubic_congestion_exp(tp); /* Loss Window MUST be one segment. */ tp->snd_cwnd = tp->t_segsz; tp->t_partialacks = -1; tp->t_dupacks = 0; tp->t_bytes_acked = 0; if (TCP_ECN_ALLOWED(tp)) tp->t_flags |= TF_ECN_SND_CWR; } const struct tcp_congctl tcp_cubic_ctl = { .fast_retransmit = tcp_cubic_fast_retransmit, .slow_retransmit = tcp_cubic_slow_retransmit, .fast_retransmit_newack = tcp_newreno_fast_retransmit_newack, .newack = tcp_cubic_newack, .cong_exp = tcp_cubic_congestion_exp, };
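A brief illustration may help at this point: the CUBIC helpers above (tcp_cubic_cbrt(), tcp_cubic_getW(), tcp_cubic_newack()) evaluate, in fixed-point integer arithmetic, the window curve W(t) = C * (t - K)^3 + Wmax from draft-rhee-tcpm-cubic-02, with C = CUBIC_CA/CUBIC_CB = 0.4 and a multiplicative decrease of 1/CUBIC_BETAB = 0.2 (so cwnd shrinks to 0.8 * Wmax on congestion). The following stand-alone user-space sketch is not part of the NetBSD sources; it recomputes the same curve in floating point, with simplified units (segments and seconds), purely so the shape of the window growth is easy to see.

#include <math.h>
#include <stdio.h>

/* Constants mirroring CUBIC_CA/CUBIC_CB and CUBIC_BETAA/CUBIC_BETAB above. */
#define CUBIC_C		0.4	/* CUBIC_CA / CUBIC_CB */
#define CUBIC_BETA	0.2	/* 1 - CUBIC_BETAA / CUBIC_BETAB */

/*
 * W(t): congestion window (in segments) t seconds after the last
 * congestion event; draft-rhee-tcpm-cubic-02, Section 3.1, Eq. 1 and 2.
 */
static double
cubic_w(double wmax, double t)
{
	double K = cbrt(wmax * CUBIC_BETA / CUBIC_C);

	return CUBIC_C * (t - K) * (t - K) * (t - K) + wmax;
}

int
main(void)
{
	const double wmax = 100.0;	/* window before the loss, in segments */

	/*
	 * At t = 0 the window starts at (1 - beta) * wmax = 80 segments,
	 * flattens out as it approaches wmax, then probes beyond it.
	 */
	for (double t = 0.0; t <= 10.0; t += 1.0)
		printf("t = %4.1f s  W(t) = %7.2f segments\n",
		    t, cubic_w(wmax, t));
	return 0;
}

The kernel cannot rely on libm or floating point, which is presumably why tcp_cubic_cbrt() approximates the cube root with an integer Newton-style iteration and tcp_cubic_getW() keeps everything in milliseconds and byte counts.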
/* $NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $ */ /*- * Copyright (c) 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95 */ /* * Copyright (c) 1993 Jan-Simon Pendry * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95 */ /* * procfs vnode interface */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/dirent.h> #include <sys/resourcevar.h> #include <sys/stat.h> #include <sys/ptrace.h> #include <sys/kauth.h> #include <sys/exec.h> #include <uvm/uvm_extern.h> /* for PAGE_SIZE */ #include <machine/reg.h> #include <miscfs/genfs/genfs.h> #include <miscfs/procfs/procfs.h> /* * Vnode Operations. * */ static int procfs_validfile_linux(struct lwp *, struct mount *); static int procfs_root_readdir_callback(struct proc *, void *); static void procfs_dir(pfstype, struct lwp *, struct proc *, char **, char *, size_t); /* * This is a list of the valid names in the * process-specific sub-directories. 
It is * used in procfs_lookup and procfs_readdir */ static const struct proc_target { u_char pt_type; u_char pt_namlen; const char *pt_name; pfstype pt_pfstype; int (*pt_valid)(struct lwp *, struct mount *); } proc_targets[] = { #define N(s) sizeof(s)-1, s /* name type validp */ { DT_DIR, N("."), PFSproc, NULL }, { DT_DIR, N(".."), PFSroot, NULL }, { DT_DIR, N("fd"), PFSfd, NULL }, { DT_DIR, N("task"), PFStask, procfs_validfile_linux }, { DT_LNK, N("cwd"), PFScwd, NULL }, { DT_REG, N("emul"), PFSemul, NULL }, { DT_LNK, N("root"), PFSchroot, NULL }, { DT_REG, N("auxv"), PFSauxv, procfs_validauxv }, { DT_REG, N("cmdline"), PFScmdline, NULL }, { DT_REG, N("environ"), PFSenviron, NULL }, { DT_LNK, N("exe"), PFSexe, procfs_validfile }, { DT_REG, N("file"), PFSfile, procfs_validfile }, { DT_REG, N("fpregs"), PFSfpregs, procfs_validfpregs }, { DT_REG, N("limit"), PFSlimit, NULL }, { DT_REG, N("map"), PFSmap, procfs_validmap }, { DT_REG, N("maps"), PFSmaps, procfs_validmap }, { DT_REG, N("mem"), PFSmem, NULL }, { DT_REG, N("note"), PFSnote, NULL }, { DT_REG, N("notepg"), PFSnotepg, NULL }, { DT_REG, N("regs"), PFSregs, procfs_validregs }, { DT_REG, N("stat"), PFSstat, procfs_validfile_linux }, { DT_REG, N("statm"), PFSstatm, procfs_validfile_linux }, { DT_REG, N("status"), PFSstatus, NULL }, #ifdef __HAVE_PROCFS_MACHDEP PROCFS_MACHDEP_NODETYPE_DEFNS #endif #undef N }; static const int nproc_targets = sizeof(proc_targets) / sizeof(proc_targets[0]); /* * List of files in the root directory. Note: the validate function will * be called with p == NULL for these ones. */ static const struct proc_target proc_root_targets[] = { #define N(s) sizeof(s)-1, s /* name type validp */ { DT_REG, N("meminfo"), PFSmeminfo, procfs_validfile_linux }, { DT_REG, N("cpuinfo"), PFScpuinfo, procfs_validfile_linux }, { DT_REG, N("uptime"), PFSuptime, procfs_validfile_linux }, { DT_REG, N("mounts"), PFSmounts, procfs_validfile_linux }, { DT_REG, N("devices"), PFSdevices, procfs_validfile_linux }, { DT_REG, N("stat"), PFScpustat, procfs_validfile_linux }, { DT_REG, N("loadavg"), PFSloadavg, procfs_validfile_linux }, { DT_REG, N("version"), PFSversion, procfs_validfile_linux }, #undef N }; static const int nproc_root_targets = sizeof(proc_root_targets) / sizeof(proc_root_targets[0]); int procfs_lookup(void *); int procfs_open(void *); int procfs_close(void *); int procfs_access(void *); int procfs_getattr(void *); int procfs_setattr(void *); int procfs_readdir(void *); int procfs_readlink(void *); int procfs_inactive(void *); int procfs_reclaim(void *); int procfs_print(void *); int procfs_pathconf(void *); int procfs_getpages(void *); static uint8_t fttodt(file_t *); static int atoi(const char *, size_t); /* * procfs vnode operations. 
*/ int (**procfs_vnodeop_p)(void *); const struct vnodeopv_entry_desc procfs_vnodeop_entries[] = { { &vop_default_desc, vn_default_error }, { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ { &vop_lookup_desc, procfs_lookup }, /* lookup */ { &vop_create_desc, genfs_eopnotsupp }, /* create */ { &vop_mknod_desc, genfs_eopnotsupp }, /* mknod */ { &vop_open_desc, procfs_open }, /* open */ { &vop_close_desc, procfs_close }, /* close */ { &vop_access_desc, procfs_access }, /* access */ { &vop_accessx_desc, genfs_accessx }, /* accessx */ { &vop_getattr_desc, procfs_getattr }, /* getattr */ { &vop_setattr_desc, procfs_setattr }, /* setattr */ { &vop_read_desc, procfs_rw }, /* read */ { &vop_write_desc, procfs_rw }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_ioctl_desc, genfs_enoioctl }, /* ioctl */ { &vop_poll_desc, genfs_poll }, /* poll */ { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_fsync_desc, genfs_nullop }, /* fsync */ { &vop_seek_desc, genfs_nullop }, /* seek */ { &vop_remove_desc, genfs_eopnotsupp }, /* remove */ { &vop_link_desc, genfs_erofs_link }, /* link */ { &vop_rename_desc, genfs_eopnotsupp }, /* rename */ { &vop_mkdir_desc, genfs_eopnotsupp }, /* mkdir */ { &vop_rmdir_desc, genfs_eopnotsupp }, /* rmdir */ { &vop_symlink_desc, genfs_erofs_symlink }, /* symlink */ { &vop_readdir_desc, procfs_readdir }, /* readdir */ { &vop_readlink_desc, procfs_readlink }, /* readlink */ { &vop_abortop_desc, genfs_abortop }, /* abortop */ { &vop_inactive_desc, procfs_inactive }, /* inactive */ { &vop_reclaim_desc, procfs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, genfs_eopnotsupp }, /* bmap */ { &vop_strategy_desc, genfs_badop }, /* strategy */ { &vop_print_desc, procfs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, procfs_pathconf }, /* pathconf */ { &vop_advlock_desc, genfs_einval }, /* advlock */ { &vop_getpages_desc, procfs_getpages }, /* getpages */ { &vop_putpages_desc, genfs_null_putpages }, /* putpages */ { NULL, NULL } }; const struct vnodeopv_desc procfs_vnodeop_opv_desc = { &procfs_vnodeop_p, procfs_vnodeop_entries }; /* * set things up for doing i/o on * the pfsnode (vp). (vp) is locked * on entry, and should be left locked * on exit. * * for procfs we don't need to do anything * in particular for i/o. all that is done * is to support exclusive open on process * memory images. */ int procfs_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct pfsnode *pfs = VTOPFS(vp); struct lwp *l1; struct proc *p2; int error; if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p2, ENOENT)) != 0) return error; l1 = curlwp; /* tracer */ #define M2K(m) (((m) & FREAD) && ((m) & FWRITE) ? \ KAUTH_REQ_PROCESS_PROCFS_RW : \ (m) & FWRITE ? 
KAUTH_REQ_PROCESS_PROCFS_WRITE : \ KAUTH_REQ_PROCESS_PROCFS_READ) mutex_enter(p2->p_lock); error = kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_PROCFS, p2, pfs, KAUTH_ARG(M2K(ap->a_mode)), NULL); mutex_exit(p2->p_lock); if (error) { procfs_proc_unlock(p2); return (error); } #undef M2K switch (pfs->pfs_type) { case PFSmem: if (((pfs->pfs_flags & FWRITE) && (ap->a_mode & O_EXCL)) || ((pfs->pfs_flags & O_EXCL) && (ap->a_mode & FWRITE))) { error = EBUSY; break; } if (!proc_isunder(p2, l1)) { error = EPERM; break; } if (ap->a_mode & FWRITE) pfs->pfs_flags = ap->a_mode & (FWRITE|O_EXCL); break; case PFSregs: case PFSfpregs: if (!proc_isunder(p2, l1)) { error = EPERM; break; } break; default: break; } procfs_proc_unlock(p2); return (error); } /* * close the pfsnode (vp) after doing i/o. * (vp) is not locked on entry or exit. * * nothing to do for procfs other than undo * any exclusive open flag (see _open above). */ int procfs_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct pfsnode *pfs = VTOPFS(ap->a_vp); switch (pfs->pfs_type) { case PFSmem: if ((ap->a_fflag & FWRITE) && (pfs->pfs_flags & O_EXCL)) pfs->pfs_flags &= ~(FWRITE|O_EXCL); break; default: break; } return (0); } /* * _inactive is called when the pfsnode * is vrele'd and the reference count goes * to zero. (vp) will be on the vnode free * list, so to get it back vget() must be * used. * * (vp) is locked on entry, but must be unlocked on exit. */ int procfs_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; bool *a_recycle; } */ *ap = v; struct vnode *vp = ap->a_vp; struct pfsnode *pfs = VTOPFS(vp); mutex_enter(&proc_lock); *ap->a_recycle = (procfs_proc_find(vp->v_mount, pfs->pfs_pid) == NULL); mutex_exit(&proc_lock); return (0); } /* * _reclaim is called when getnewvnode() * wants to make use of an entry on the vnode * free list. at this time the filesystem needs * to free any private data and remove the node * from any private lists. */ int procfs_reclaim(void *v) { struct vop_reclaim_v2_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; struct pfsnode *pfs = VTOPFS(vp); VOP_UNLOCK(vp); /* * To interlock with procfs_revoke_vnodes(). */ mutex_enter(vp->v_interlock); vp->v_data = NULL; mutex_exit(vp->v_interlock); procfs_hashrem(pfs); kmem_free(pfs, sizeof(*pfs)); return 0; } /* * Return POSIX pathconf information applicable to special devices. */ int procfs_pathconf(void *v) { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_MAX_CANON: *ap->a_retval = MAX_CANON; return (0); case _PC_MAX_INPUT: *ap->a_retval = MAX_INPUT; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_VDISABLE: *ap->a_retval = _POSIX_VDISABLE; return (0); case _PC_SYNC_IO: *ap->a_retval = 1; return (0); default: return genfs_pathconf(ap); } /* NOTREACHED */ } /* * _print is used for debugging. * just print a readable description * of (vp). */ int procfs_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; struct pfsnode *pfs = VTOPFS(ap->a_vp); printf("tag VT_PROCFS, type %d, pid %d, mode %x, flags %lx\n", pfs->pfs_type, pfs->pfs_pid, pfs->pfs_mode, pfs->pfs_flags); return 0; } /* * Works out the path to the target process's current * working directory or chroot. 
If the caller is in a chroot and * can't "reach" the target's cwd or root (or some other error * occurs), a "/" is returned for the path. */ static void procfs_dir(pfstype t, struct lwp *caller, struct proc *target, char **bpp, char *path, size_t len) { struct cwdinfo *cwdi; struct vnode *vp, *rvp; char *bp; /* * Lock target cwdi and take a reference to the vnode * we are interested in to prevent it from disappearing * before getcwd_common() below. */ rw_enter(&target->p_cwdi->cwdi_lock, RW_READER); switch (t) { case PFScwd: vp = target->p_cwdi->cwdi_cdir; break; case PFSchroot: vp = target->p_cwdi->cwdi_rdir; break; default: rw_exit(&target->p_cwdi->cwdi_lock); return; } if (vp != NULL) vref(vp); rw_exit(&target->p_cwdi->cwdi_lock); cwdi = caller->l_proc->p_cwdi; rw_enter(&cwdi->cwdi_lock, RW_READER); rvp = cwdi->cwdi_rdir; bp = bpp ? *bpp : NULL; /* * XXX: this horrible kludge avoids locking panics when * attempting to lookup links that point to within procfs */ if (vp != NULL && vp->v_tag == VT_PROCFS) { if (bpp) { *--bp = '/'; *bpp = bp; } vrele(vp); rw_exit(&cwdi->cwdi_lock); return; } if (rvp == NULL) rvp = rootvnode; if (vp == NULL || getcwd_common(vp, rvp, bp ? &bp : NULL, path, len / 2, 0, caller) != 0) { if (bpp) { bp = *bpp; *--bp = '/'; } } if (bpp) *bpp = bp; if (vp != NULL) vrele(vp); rw_exit(&cwdi->cwdi_lock); } /* * Invent attributes for pfsnode (vp) and store * them in (vap). * Directories lengths are returned as zero since * any real length would require the genuine size * to be computed, and nothing cares anyway. * * this is relatively minimal for procfs. */ int procfs_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct pfsnode *pfs = VTOPFS(vp); struct vattr *vap = ap->a_vap; struct proc *procp; char *path, *bp, bf[16]; int error; /* first check the process still exists */ switch (pfs->pfs_type) { case PFSroot: case PFScurproc: case PFSself: procp = NULL; break; default: error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &procp, ENOENT); if (error != 0) return (error); break; } switch (pfs->pfs_type) { case PFStask: if (pfs->pfs_fd == -1) { path = NULL; break; } /*FALLTHROUGH*/ case PFScwd: case PFSchroot: path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK); if (path == NULL && procp != NULL) { procfs_proc_unlock(procp); return (ENOMEM); } break; default: path = NULL; break; } if (procp != NULL) { mutex_enter(procp->p_lock); error = kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE, procp, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL); mutex_exit(procp->p_lock); if (error != 0) { procfs_proc_unlock(procp); if (path != NULL) free(path, M_TEMP); return (ENOENT); } } error = 0; /* start by zeroing out the attributes */ vattr_null(vap); /* next do all the common fields */ vap->va_type = ap->a_vp->v_type; vap->va_mode = pfs->pfs_mode; vap->va_fileid = pfs->pfs_fileno; vap->va_flags = 0; vap->va_blocksize = PAGE_SIZE; /* * Make all times be current TOD. * * It would be possible to get the process start * time from the p_stats structure, but there's * no "file creation" time stamp anyway, and the * p_stats structure is not addressable if u. gets * swapped out for that process. 
*/ getnanotime(&vap->va_ctime); vap->va_atime = vap->va_mtime = vap->va_ctime; if (procp) TIMEVAL_TO_TIMESPEC(&procp->p_stats->p_start, &vap->va_birthtime); else getnanotime(&vap->va_birthtime); switch (pfs->pfs_type) { case PFSmem: case PFSregs: case PFSfpregs: #if defined(__HAVE_PROCFS_MACHDEP) && defined(PROCFS_MACHDEP_PROTECT_CASES) PROCFS_MACHDEP_PROTECT_CASES #endif /* * If the process has exercised some setuid or setgid * privilege, then rip away read/write permission so * that only root can gain access. */ if (procp->p_flag & PK_SUGID) vap->va_mode &= ~(S_IRUSR|S_IWUSR); /* FALLTHROUGH */ case PFSstatus: case PFSstat: case PFSnote: case PFSnotepg: case PFScmdline: case PFSenviron: case PFSemul: case PFSstatm: case PFSmap: case PFSmaps: case PFSlimit: case PFSauxv: vap->va_nlink = 1; vap->va_uid = kauth_cred_geteuid(procp->p_cred); vap->va_gid = kauth_cred_getegid(procp->p_cred); break; case PFScwd: case PFSchroot: case PFSmeminfo: case PFSdevices: case PFScpuinfo: case PFSuptime: case PFSmounts: case PFScpustat: case PFSloadavg: case PFSversion: case PFSexe: case PFSself: case PFScurproc: case PFSroot: vap->va_nlink = 1; vap->va_uid = vap->va_gid = 0; break; case PFSproc: case PFStask: case PFSfile: case PFSfd: break; default: panic("%s: %d/1", __func__, pfs->pfs_type); } /* * now do the object specific fields * * The size could be set from struct reg, but it's hardly * worth the trouble, and it puts some (potentially) machine * dependent data into this machine-independent code. If it * becomes important then this function should break out into * a per-file stat function in the corresponding .c file. */ switch (pfs->pfs_type) { case PFSroot: vap->va_bytes = vap->va_size = DEV_BSIZE; break; case PFSself: case PFScurproc: vap->va_bytes = vap->va_size = snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid); break; case PFStask: if (pfs->pfs_fd != -1) { vap->va_nlink = 1; vap->va_uid = 0; vap->va_gid = 0; vap->va_bytes = vap->va_size = snprintf(bf, sizeof(bf), ".."); break; } /*FALLTHROUGH*/ case PFSfd: if (pfs->pfs_fd != -1) { file_t *fp; fp = fd_getfile2(procp, pfs->pfs_fd); if (fp == NULL) { error = EBADF; break; } vap->va_nlink = 1; vap->va_uid = kauth_cred_geteuid(fp->f_cred); vap->va_gid = kauth_cred_getegid(fp->f_cred); switch (fp->f_type) { case DTYPE_VNODE: vap->va_bytes = vap->va_size = fp->f_vnode->v_size; break; default: vap->va_bytes = vap->va_size = 0; break; } closef(fp); break; } /*FALLTHROUGH*/ case PFSproc: vap->va_nlink = 2; vap->va_uid = kauth_cred_geteuid(procp->p_cred); vap->va_gid = kauth_cred_getegid(procp->p_cred); vap->va_bytes = vap->va_size = DEV_BSIZE; break; case PFSfile: error = EOPNOTSUPP; break; case PFSmem: vap->va_bytes = vap->va_size = ctob(procp->p_vmspace->vm_tsize + procp->p_vmspace->vm_dsize + procp->p_vmspace->vm_ssize); break; case PFSauxv: vap->va_bytes = vap->va_size = procp->p_execsw->es_arglen; break; #if defined(PT_GETREGS) || defined(PT_SETREGS) case PFSregs: vap->va_bytes = vap->va_size = sizeof(struct reg); break; #endif #if defined(PT_GETFPREGS) || defined(PT_SETFPREGS) case PFSfpregs: vap->va_bytes = vap->va_size = sizeof(struct fpreg); break; #endif case PFSstatus: case PFSstat: case PFSnote: case PFSnotepg: case PFScmdline: case PFSenviron: case PFSmeminfo: case PFSdevices: case PFScpuinfo: case PFSuptime: case PFSmounts: case PFScpustat: case PFSloadavg: case PFSstatm: case PFSversion: vap->va_bytes = vap->va_size = 0; break; case PFSlimit: case PFSmap: case PFSmaps: /* * Advise a larger blocksize for the map files, so that * they 
may be read in one pass. */ vap->va_blocksize = 4 * PAGE_SIZE; vap->va_bytes = vap->va_size = 0; break; case PFScwd: case PFSchroot: bp = path + MAXPATHLEN; *--bp = '\0'; procfs_dir(pfs->pfs_type, curlwp, procp, &bp, path, MAXPATHLEN); vap->va_bytes = vap->va_size = strlen(bp); break; case PFSexe: vap->va_bytes = vap->va_size = strlen(procp->p_path); break; case PFSemul: vap->va_bytes = vap->va_size = strlen(procp->p_emul->e_name); break; #ifdef __HAVE_PROCFS_MACHDEP PROCFS_MACHDEP_NODETYPE_CASES error = procfs_machdep_getattr(ap->a_vp, vap, procp); break; #endif default: panic("%s: %d/2", __func__, pfs->pfs_type); } if (procp != NULL) procfs_proc_unlock(procp); if (path != NULL) free(path, M_TEMP); return (error); } /*ARGSUSED*/ int procfs_setattr(void *v) { /* * just fake out attribute setting * it's not good to generate an error * return, otherwise things like creat() * will fail when they try to set the * file length to 0. worse, this means * that echo $note > /proc/$pid/note will fail. */ return (0); } /* * implement access checking. * * actually, the check for super-user is slightly * broken since it will allow read access to write-only * objects. this doesn't cause any particular trouble * but does mean that the i/o entry points need to check * that the operation really does make sense. */ int procfs_access(void *v) { struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; kauth_cred_t a_cred; } */ *ap = v; struct vattr va; int error; if ((error = VOP_GETATTR(ap->a_vp, &va, ap->a_cred)) != 0) return (error); return kauth_authorize_vnode(ap->a_cred, KAUTH_ACCESS_ACTION(ap->a_accmode, ap->a_vp->v_type, va.va_mode), ap->a_vp, NULL, genfs_can_access(ap->a_vp, ap->a_cred, va.va_uid, va.va_gid, va.va_mode, NULL, ap->a_accmode)); } /* * lookup. this is incredibly complicated in the * general case, however for most pseudo-filesystems * very little needs to be done. * * Locking isn't hard here, just poorly documented. * * If we're looking up ".", just vref the parent & return it. * * If we're looking up "..", unlock the parent, and lock "..". If everything * went ok, and we're on the last component and the caller requested the * parent locked, try to re-lock the parent. We do this to prevent lock * races. * * For anything else, get the needed node. Then unlock the parent if not * the last component or not LOCKPARENT (i.e. if we wouldn't re-lock the * parent in the .. case). * * We try to exit with the parent locked in error cases. */ int procfs_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap = v; struct componentname *cnp = ap->a_cnp; struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; const char *pname = cnp->cn_nameptr; const struct proc_target *pt = NULL; struct vnode *fvp; pid_t pid, vnpid; struct pfsnode *pfs; struct proc *p = NULL; struct lwp *plwp; int i, error; pfstype type; *vpp = NULL; if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0) return (error); if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) return (EROFS); if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; vref(dvp); return (0); } pfs = VTOPFS(dvp); switch (pfs->pfs_type) { case PFSroot: /* * Shouldn't get here with .. in the root node. */ if (cnp->cn_flags & ISDOTDOT) return (EIO); for (i = 0; i < nproc_root_targets; i++) { pt = &proc_root_targets[i]; /* * check for node match. proc is always NULL here, * so call pt_valid with constant NULL lwp. 
*/ if (cnp->cn_namelen == pt->pt_namlen && memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 && (pt->pt_valid == NULL || (*pt->pt_valid)(NULL, dvp->v_mount))) break; } if (i != nproc_root_targets) { error = procfs_allocvp(dvp->v_mount, vpp, 0, pt->pt_pfstype, -1); return (error); } if (CNEQ(cnp, "curproc", 7)) { pid = curproc->p_pid; vnpid = 0; type = PFScurproc; } else if (CNEQ(cnp, "self", 4)) { pid = curproc->p_pid; vnpid = 0; type = PFSself; } else { pid = (pid_t)atoi(pname, cnp->cn_namelen); vnpid = pid; type = PFSproc; } if (procfs_proc_lock(dvp->v_mount, pid, &p, ESRCH) != 0) break; error = procfs_allocvp(dvp->v_mount, vpp, vnpid, type, -1); procfs_proc_unlock(p); return (error); case PFSproc: if (cnp->cn_flags & ISDOTDOT) { error = procfs_allocvp(dvp->v_mount, vpp, 0, PFSroot, -1); return (error); } if (procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p, ESRCH) != 0) break; mutex_enter(p->p_lock); LIST_FOREACH(plwp, &p->p_lwps, l_sibling) { if (plwp->l_stat != LSZOMB) break; } /* Process is exiting if no-LWPS or all LWPs are LSZOMB */ if (plwp == NULL) { mutex_exit(p->p_lock); procfs_proc_unlock(p); return ESRCH; } lwp_addref(plwp); mutex_exit(p->p_lock); for (pt = proc_targets, i = 0; i < nproc_targets; pt++, i++) { int found; found = cnp->cn_namelen == pt->pt_namlen && memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 && (pt->pt_valid == NULL || (*pt->pt_valid)(plwp, dvp->v_mount)); if (found) break; } lwp_delref(plwp); if (i == nproc_targets) { procfs_proc_unlock(p); break; } if (pt->pt_pfstype == PFSfile) { fvp = p->p_textvp; /* We already checked that it exists. */ vref(fvp); procfs_proc_unlock(p); *vpp = fvp; return (0); } error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid, pt->pt_pfstype, -1); procfs_proc_unlock(p); return (error); case PFSfd: { int fd; file_t *fp; if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p, ENOENT)) != 0) return error; if (cnp->cn_flags & ISDOTDOT) { error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid, PFSproc, -1); procfs_proc_unlock(p); return (error); } fd = atoi(pname, cnp->cn_namelen); fp = fd_getfile2(p, fd); if (fp == NULL) { procfs_proc_unlock(p); return ENOENT; } fvp = fp->f_vnode; /* Don't show directories */ if (fp->f_type == DTYPE_VNODE && fvp->v_type != VDIR && !procfs_proc_is_linux_compat()) { vref(fvp); closef(fp); procfs_proc_unlock(p); *vpp = fvp; return 0; } closef(fp); error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid, PFSfd, fd); procfs_proc_unlock(p); return error; } case PFStask: { int xpid; if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p, ENOENT)) != 0) return error; if (cnp->cn_flags & ISDOTDOT) { error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid, PFSproc, -1); procfs_proc_unlock(p); return (error); } xpid = atoi(pname, cnp->cn_namelen); if (xpid != pfs->pfs_pid) { procfs_proc_unlock(p); return ENOENT; } error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid, PFStask, 0); procfs_proc_unlock(p); return error; } default: return (ENOTDIR); } return (cnp->cn_nameiop == LOOKUP ? 
ENOENT : EROFS); } int procfs_validfile(struct lwp *l, struct mount *mp) { return l != NULL && l->l_proc != NULL && l->l_proc->p_textvp != NULL; } static int procfs_validfile_linux(struct lwp *l, struct mount *mp) { return procfs_use_linux_compat(mp) && (l == NULL || l->l_proc == NULL || procfs_validfile(l, mp)); } struct procfs_root_readdir_ctx { struct uio *uiop; off_t *cookies; int ncookies; off_t off; off_t startoff; int error; }; static int procfs_root_readdir_callback(struct proc *p, void *arg) { struct procfs_root_readdir_ctx *ctxp = arg; struct dirent d; struct uio *uiop; int error; uiop = ctxp->uiop; if (uiop->uio_resid < UIO_MX) return -1; /* no space */ if (kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL) != 0) return 0; if (ctxp->off < ctxp->startoff) { ctxp->off++; return 0; } memset(&d, 0, UIO_MX); d.d_reclen = UIO_MX; d.d_fileno = PROCFS_FILENO(p->p_pid, PFSproc, -1); d.d_namlen = snprintf(d.d_name, UIO_MX - offsetof(struct dirent, d_name), "%ld", (long)p->p_pid); d.d_type = DT_DIR; mutex_exit(&proc_lock); error = uiomove(&d, UIO_MX, uiop); mutex_enter(&proc_lock); if (error) { ctxp->error = error; return -1; } ctxp->ncookies++; if (ctxp->cookies) *(ctxp->cookies)++ = ctxp->off + 1; ctxp->off++; return 0; } /* * readdir returns directory entries from pfsnode (vp). * * the strategy here with procfs is to generate a single * directory entry at a time (struct dirent) and then * copy that out to userland using uiomove. a more efficient * though more complex implementation, would try to minimize * the number of calls to uiomove(). for procfs, this is * hardly worth the added code complexity. * * this should just be done through read() */ int procfs_readdir(void *v) { struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; int *a_eofflag; off_t **a_cookies; int *a_ncookies; } */ *ap = v; struct uio *uio = ap->a_uio; struct dirent d; struct pfsnode *pfs; off_t i; int error; off_t *cookies = NULL; int ncookies; struct vnode *vp; const struct proc_target *pt; struct procfs_root_readdir_ctx ctx; struct proc *p = NULL; struct lwp *l; int nfd; int nc = 0; vp = ap->a_vp; pfs = VTOPFS(vp); if (uio->uio_resid < UIO_MX) return (EINVAL); if (uio->uio_offset < 0) return (EINVAL); error = 0; i = uio->uio_offset; memset(&d, 0, UIO_MX); d.d_reclen = UIO_MX; ncookies = uio->uio_resid / UIO_MX; switch (pfs->pfs_type) { /* * this is for the process-specific sub-directories. * all that is needed to is copy out all the entries * from the procent[] table (top of this file). 
*/ case PFSproc: { if (i >= nproc_targets) return 0; if (procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH) != 0) break; if (ap->a_ncookies) { ncookies = uimin(ncookies, (nproc_targets - i)); cookies = malloc(ncookies * sizeof (off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } for (pt = &proc_targets[i]; uio->uio_resid >= UIO_MX && i < nproc_targets; pt++, i++) { if (pt->pt_valid) { /* XXXSMP LWP can disappear */ mutex_enter(p->p_lock); l = LIST_FIRST(&p->p_lwps); KASSERT(l != NULL); mutex_exit(p->p_lock); if ((*pt->pt_valid)(l, vp->v_mount) == 0) continue; } d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, pt->pt_pfstype, -1); d.d_namlen = pt->pt_namlen; memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1); d.d_type = pt->pt_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; } procfs_proc_unlock(p); break; } case PFSfd: { file_t *fp; int lim; if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH)) != 0) return error; /* XXX Should this be by file as well? */ if (kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES), NULL, NULL) != 0) { procfs_proc_unlock(p); return ESRCH; } nfd = atomic_load_consume(&p->p_fd->fd_dt)->dt_nfiles; lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); if (i >= lim) { procfs_proc_unlock(p); return 0; } if (ap->a_ncookies) { ncookies = uimin(ncookies, (nfd + 2 - i)); cookies = malloc(ncookies * sizeof (off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } for (; i < 2 && uio->uio_resid >= UIO_MX; i++) { pt = &proc_targets[i]; d.d_namlen = pt->pt_namlen; d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, pt->pt_pfstype, -1); (void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1); d.d_type = pt->pt_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; nc++; } if (error) goto out; for (; uio->uio_resid >= UIO_MX && i < nfd; i++) { /* check the descriptor exists */ if ((fp = fd_getfile2(p, i - 2)) == NULL) continue; closef(fp); d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFSfd, i - 2); d.d_namlen = snprintf(d.d_name, sizeof(d.d_name), "%lld", (long long)(i - 2)); d.d_type = fttodt(fp); if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; nc++; } goto out; } case PFStask: { if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH)) != 0) return error; nfd = 3; /* ., .., pid */ if (ap->a_ncookies) { ncookies = uimin(ncookies, (nfd + 2 - i)); cookies = malloc(ncookies * sizeof (off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } for (; i < 2 && uio->uio_resid >= UIO_MX; i++) { pt = &proc_targets[i]; d.d_namlen = pt->pt_namlen; d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, pt->pt_pfstype, -1); (void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1); d.d_type = pt->pt_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; nc++; } if (error) goto out; for (; uio->uio_resid >= UIO_MX && i < nfd; i++) { /* check the descriptor exists */ d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFStask, i - 2); d.d_namlen = snprintf(d.d_name, sizeof(d.d_name), "%ld", (long)pfs->pfs_pid); d.d_type = DT_LNK; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; nc++; } goto out; } /* * this is for the root of the procfs filesystem * what is needed are special entries for "curproc" * and "self" followed by an entry for each process * on allproc. 
*/ case PFSroot: { if (ap->a_ncookies) { /* * XXX Potentially allocating too much space here, * but I'm lazy. This loop needs some work. */ cookies = malloc(ncookies * sizeof (off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } /* 0 ... 3 are static entries. */ for (; i <= 3 && uio->uio_resid >= UIO_MX; i++) { switch (i) { case 0: /* `.' */ case 1: /* `..' */ d.d_fileno = PROCFS_FILENO(0, PFSroot, -1); d.d_namlen = i + 1; memcpy(d.d_name, "..", d.d_namlen); d.d_name[i + 1] = '\0'; d.d_type = DT_DIR; break; case 2: d.d_fileno = PROCFS_FILENO(0, PFScurproc, -1); d.d_namlen = sizeof("curproc") - 1; memcpy(d.d_name, "curproc", sizeof("curproc")); d.d_type = DT_LNK; break; case 3: d.d_fileno = PROCFS_FILENO(0, PFSself, -1); d.d_namlen = sizeof("self") - 1; memcpy(d.d_name, "self", sizeof("self")); d.d_type = DT_LNK; break; } if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; nc++; if (cookies) *cookies++ = i + 1; } if (error) break; /* 4 ... are process entries. */ ctx.uiop = uio; ctx.error = 0; ctx.off = 4; ctx.startoff = i; ctx.cookies = cookies; ctx.ncookies = nc; proclist_foreach_call(&allproc, procfs_root_readdir_callback, &ctx); cookies = ctx.cookies; nc = ctx.ncookies; error = ctx.error; if (error) break; /* misc entries. */ if (i < ctx.off) i = ctx.off; if (i >= ctx.off + nproc_root_targets) break; error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH); if (error) break; for (pt = &proc_root_targets[i - ctx.off]; uio->uio_resid >= UIO_MX && pt < &proc_root_targets[nproc_root_targets]; pt++, i++) { if (pt->pt_valid && (*pt->pt_valid)(NULL, vp->v_mount) == 0) continue; if (kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL) != 0) continue; d.d_fileno = PROCFS_FILENO(0, pt->pt_pfstype, -1); d.d_namlen = pt->pt_namlen; memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1); d.d_type = pt->pt_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; nc++; if (cookies) *cookies++ = i + 1; } out: KASSERT(p != NULL); ncookies = nc; procfs_proc_unlock(p); break; } default: error = ENOTDIR; break; } if (ap->a_ncookies) { if (error) { if (cookies) free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } else *ap->a_ncookies = ncookies; } uio->uio_offset = i; return (error); } /* * readlink reads the link of `curproc' and others */ int procfs_readlink(void *v) { struct vop_readlink_args *ap = v; char bf[16]; /* should be enough */ char *bp = bf; char *path = NULL; int len = 0; int error = 0; struct vnode *vp = ap->a_vp; struct pfsnode *pfs = VTOPFS(vp); struct proc *pown = NULL; if (pfs->pfs_fileno == PROCFS_FILENO(0, PFScurproc, -1)) len = snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid); else if (pfs->pfs_fileno == PROCFS_FILENO(0, PFSself, -1)) len = snprintf(bf, sizeof(bf), "%s", "curproc"); else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFStask, 0)) len = snprintf(bf, sizeof(bf), ".."); else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSexe, -1)) { if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown, ESRCH)) != 0) return error; bp = pown->p_path; len = strlen(bp); } else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFScwd, -1) || pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSchroot, -1)) { if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown, ESRCH)) != 0) return error; path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK); if (path == NULL) { procfs_proc_unlock(pown); return (ENOMEM); } bp = path + MAXPATHLEN; *--bp = '\0'; 
procfs_dir(PROCFS_TYPE(pfs->pfs_fileno), curlwp, pown, &bp, path, MAXPATHLEN); len = strlen(bp); } else { file_t *fp; struct vnode *vxp; if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown, ESRCH)) != 0) return error; fp = fd_getfile2(pown, pfs->pfs_fd); if (fp == NULL) { procfs_proc_unlock(pown); return EBADF; } switch (fp->f_type) { case DTYPE_VNODE: vxp = fp->f_vnode; if (vxp->v_type != VDIR && !procfs_proc_is_linux_compat()) { error = EINVAL; break; } if ((path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK)) == NULL) { error = ENOMEM; break; } bp = path + MAXPATHLEN; *--bp = '\0'; /* * XXX: kludge to avoid locking against ourselves * in getcwd() */ if (vxp->v_tag == VT_PROCFS) { *--bp = '/'; } else { rw_enter(&curproc->p_cwdi->cwdi_lock, RW_READER); vp = curproc->p_cwdi->cwdi_rdir; if (vp == NULL) vp = rootvnode; error = getcwd_common(vxp, vp, &bp, path, MAXPATHLEN / 2, 0, curlwp); rw_exit(&curproc->p_cwdi->cwdi_lock); } if (error) break; len = strlen(bp); break; case DTYPE_MISC: len = snprintf(bf, sizeof(bf), "%s", "[misc]"); break; case DTYPE_KQUEUE: len = snprintf(bf, sizeof(bf), "%s", "[kqueue]"); break; case DTYPE_SEM: len = snprintf(bf, sizeof(bf), "%s", "[ksem]"); break; default: error = EINVAL; break; } closef(fp); } if (error == 0) error = uiomove(bp, len, ap->a_uio); if (pown) procfs_proc_unlock(pown); if (path) free(path, M_TEMP); return error; } int procfs_getpages(void *v) { struct vop_getpages_args /* { struct vnode *a_vp; voff_t a_offset; struct vm_page **a_m; int *a_count; int a_centeridx; vm_prot_t a_access_type; int a_advice; int a_flags; } */ *ap = v; if ((ap->a_flags & PGO_LOCKED) == 0) rw_exit(ap->a_vp->v_uobj.vmobjlock); return (EFAULT); } /* * convert decimal ascii to int */ static int atoi(const char *b, size_t len) { int p = 0; while (len--) { char c = *b++; if (c < '0' || c > '9') return -1; p = 10 * p + (c - '0'); } return p; } /** * convert DTYPE_XXX to corresponding DT_XXX * matching what procfs_loadvnode() does. */ static uint8_t fttodt(file_t *fp) { switch (fp->f_type) { case DTYPE_VNODE: switch (fp->f_vnode->v_type) { case VREG: return DT_REG; case VDIR: return DT_LNK; /* symlink */ case VBLK: return DT_BLK; case VCHR: return DT_CHR; case VLNK: return DT_LNK; case VSOCK: return DT_SOCK; case VFIFO: return DT_FIFO; default: return DT_UNKNOWN; } case DTYPE_PIPE: return DT_FIFO; case DTYPE_SOCKET: return DT_SOCK; case DTYPE_KQUEUE: /*FALLTHROUGH*/ case DTYPE_MISC: /*FALLTHROUGH*/ case DTYPE_SEM: return DT_LNK; /* symlinks */ default: return DT_UNKNOWN; } }
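/*
 * Illustrative sketch (not part of the kernel sources above): procfs_lookup()
 * maps a directory-entry name such as "123" to a pid with the local atoi()
 * helper shown earlier, and any non-digit character makes the name invalid.
 * A minimal userland re-implementation of that parsing rule, for reference
 * only; the function name pfs_name_to_pid() is hypothetical.
 */
#if 0	/* example only; build separately, e.g. cc -o pfsname pfsname.c */
#include <assert.h>
#include <stddef.h>

/* Same rule as the procfs atoi() helper: decimal digits only, else -1. */
static int
pfs_name_to_pid(const char *b, size_t len)
{
	int p = 0;

	while (len--) {
		char c = *b++;

		if (c < '0' || c > '9')
			return -1;
		p = 10 * p + (c - '0');
	}
	return p;
}

int
main(void)
{
	assert(pfs_name_to_pid("123", 3) == 123);	/* /proc/123 */
	assert(pfs_name_to_pid("12x", 3) == -1);	/* not a pid */
	assert(pfs_name_to_pid("0", 1) == 0);
	return 0;
}
#endif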
/* $NetBSD: layer_vfsops.c,v 1.56 2022/12/09 10:33:18 hannken Exp $ */ /* * Copyright (c) 1999 National Aeronautics & Space Administration * All rights reserved. * * This software was written by William Studenmund of the * Numerical Aerospace Simulation Facility, NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the National Aeronautics & Space Administration * nor the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB- * UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp * from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92 * @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95 */ /* * Generic layer VFS operations. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: layer_vfsops.c,v 1.56 2022/12/09 10:33:18 hannken Exp $"); #include <sys/param.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/kauth.h> #include <sys/module.h> #include <miscfs/specfs/specdev.h> #include <miscfs/genfs/genfs.h> #include <miscfs/genfs/layer.h> #include <miscfs/genfs/layer_extern.h> SYSCTL_SETUP_PROTO(sysctl_vfs_layerfs_setup); MODULE(MODULE_CLASS_MISC, layerfs, NULL); static int layerfs_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return 0; case MODULE_CMD_FINI: return 0; default: return ENOTTY; } return 0; } /* * VFS start. Nothing needed here - the start routine on the underlying * filesystem will have been called when that filesystem was mounted. */ int layerfs_start(struct mount *mp, int flags) { #ifdef notyet return VFS_START(mp->mnt_lower, flags); #else return 0; #endif } int layerfs_root(struct mount *mp, int lktype, struct vnode **vpp) { struct vnode *vp; vp = MOUNTTOLAYERMOUNT(mp)->layerm_rootvp; if (vp == NULL) { *vpp = NULL; return EINVAL; } /* * Return root vnode with locked and with a reference held. */ vref(vp); vn_lock(vp, lktype | LK_RETRY); *vpp = vp; return 0; } int layerfs_quotactl(struct mount *mp, struct quotactl_args *args) { int error; error = vfs_busy(mp); if (error == 0) { error = VFS_QUOTACTL(mp->mnt_lower, args); vfs_unbusy(mp); } return error; } int layerfs_statvfs(struct mount *mp, struct statvfs *sbp) { struct statvfs *sbuf; int error; sbuf = kmem_zalloc(sizeof(*sbuf), KM_SLEEP); error = vfs_busy(mp); if (error == 0) { error = VFS_STATVFS(mp->mnt_lower, sbuf); vfs_unbusy(mp); } if (error) { goto done; } /* Copy across the relevant data and fake the rest. 
*/ sbp->f_flag = sbuf->f_flag; sbp->f_bsize = sbuf->f_bsize; sbp->f_frsize = sbuf->f_frsize; sbp->f_iosize = sbuf->f_iosize; sbp->f_blocks = sbuf->f_blocks; sbp->f_bfree = sbuf->f_bfree; sbp->f_bavail = sbuf->f_bavail; sbp->f_bresvd = sbuf->f_bresvd; sbp->f_files = sbuf->f_files; sbp->f_ffree = sbuf->f_ffree; sbp->f_favail = sbuf->f_favail; sbp->f_fresvd = sbuf->f_fresvd; sbp->f_namemax = sbuf->f_namemax; copy_statvfs_info(sbp, mp); done: kmem_free(sbuf, sizeof(*sbuf)); return error; } int layerfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) { /* * XXX - Assumes no data cached at layer. */ return 0; } int layerfs_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { struct layer_mount *lmp = MOUNTTOLAYERMOUNT(mp); struct vnode *lowervp; struct layer_node *xp; KASSERT(key_len == sizeof(struct vnode *)); memcpy(&lowervp, key, key_len); xp = kmem_alloc(lmp->layerm_size, KM_SLEEP); /* Share the interlock, vmobjlock, and klist with the lower node. */ vshareilock(vp, lowervp); rw_obj_hold(lowervp->v_uobj.vmobjlock); uvm_obj_setlock(&vp->v_uobj, lowervp->v_uobj.vmobjlock); vshareklist(vp, lowervp); vp->v_tag = lmp->layerm_tag; vp->v_type = lowervp->v_type; vp->v_op = lmp->layerm_vnodeop_p; if (vp->v_type == VBLK || vp->v_type == VCHR) spec_node_init(vp, lowervp->v_rdev); vp->v_data = xp; xp->layer_vnode = vp; xp->layer_lowervp = lowervp; xp->layer_flags = 0; uvm_vnp_setsize(vp, 0); /* Add a reference to the lower node. */ vref(lowervp); *new_key = &xp->layer_lowervp; return 0; } int layerfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { struct vnode *vp; int error; error = vfs_busy(mp); if (error == 0) { error = VFS_VGET(mp->mnt_lower, ino, lktype, &vp); vfs_unbusy(mp); } if (error) { *vpp = NULL; return error; } VOP_UNLOCK(vp); error = layer_node_create(mp, vp, vpp); if (error) { vrele(vp); *vpp = NULL; return error; } error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULL; return error; } return 0; } int layerfs_fhtovp(struct mount *mp, struct fid *fidp, int lktype, struct vnode **vpp) { struct vnode *vp; int error; error = vfs_busy(mp); if (error == 0) { error = VFS_FHTOVP(mp->mnt_lower, fidp, lktype, &vp); vfs_unbusy(mp); } if (error) { *vpp = NULL; return error; } VOP_UNLOCK(vp); error = layer_node_create(mp, vp, vpp); if (error) { vput(vp); *vpp = NULL; return (error); } error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULL; return error; } return 0; } int layerfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) { return VFS_VPTOFH(LAYERVPTOLOWERVP(vp), fhp, fh_size); } /* * layerfs_snapshot - handle a snapshot through a layered file system * * At present, we do NOT support snapshotting through a layered file * system as the ffs implementation changes v_vnlock of the snapshot * vnodes to point to one common lock. As there is no way for us to * absolutely pass this change up the stack, a layered file system * would end up referencing the wrong lock. * * This routine serves as a central resource for this behavior; all * layered file systems don't need to worry about the above. Also, if * things get fixed, all layers get the benefit. */ int layerfs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ts) { return EOPNOTSUPP; } /* * layerfs_suspendctl - suspend a layered file system * * Here we should suspend the lower file system(s) too. At present * this will deadlock as we don't know which to suspend first. 
* * This routine serves as a central resource for this behavior; all * layered file systems don't need to worry about the above. Also, if * things get fixed, all layers get the benefit. */ int layerfs_suspendctl(struct mount *mp, int cmd) { return genfs_suspendctl(mp, cmd); } SYSCTL_SETUP(sysctl_vfs_layerfs_setup, "sysctl vfs.layerfs subtree setup") { const struct sysctlnode *layerfs_node = NULL; sysctl_createv(clog, 0, NULL, &layerfs_node, #ifdef _MODULE 0, #else CTLFLAG_PERMANENT, #endif CTLTYPE_NODE, "layerfs", SYSCTL_DESCR("Generic layered file system"), NULL, 0, NULL, 0, CTL_VFS, CTL_CREATE, CTL_EOL); #ifdef LAYERFS_DIAGNOSTIC sysctl_createv(clog, 0, &layerfs_node, NULL, #ifndef _MODULE CTLFLAG_PERMANENT | #endif CTLFLAG_READWRITE, CTLTYPE_INT, "debug", SYSCTL_DESCR("Verbose debugging messages"), NULL, 0, &layerfs_debug, 0, CTL_CREATE, CTL_EOL); #endif /* * other subtrees should really be aliases to this, but since * they can't tell if layerfs has been instantiated yet, they * can't do that...not easily. not yet. :-) */ } int layerfs_renamelock_enter(struct mount *mp) { return VFS_RENAMELOCK_ENTER(mp->mnt_lower); } void layerfs_renamelock_exit(struct mount *mp) { VFS_RENAMELOCK_EXIT(mp->mnt_lower); }
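/*
 * Illustrative sketch (assumption: a simplified userland model, not kernel
 * code): most layerfs VFS operations above share the same shape - busy the
 * layer mount, forward the request to the lower mount, then unbusy.  The
 * stub struct mount, vfs_busy()/vfs_unbusy(), and lower_op() below are
 * hypothetical stand-ins for the real machinery, kept only to show the
 * pass-through pattern used by layerfs_quotactl(), layerfs_statvfs(), etc.
 */
#if 0	/* example only; build separately */
#include <errno.h>
#include <stdio.h>

struct mount {
	struct mount *mnt_lower;	/* underlying file system, or NULL */
	int busy;
};

static int vfs_busy(struct mount *mp) { mp->busy++; return 0; }
static void vfs_unbusy(struct mount *mp) { mp->busy--; }

/* Stand-in for VFS_QUOTACTL()/VFS_STATVFS()/...: act on the lower mount. */
static int
lower_op(struct mount *lower)
{
	return lower != NULL ? 0 : EINVAL;
}

/* The busy/forward/unbusy pass-through pattern. */
static int
layer_passthrough(struct mount *mp)
{
	int error;

	error = vfs_busy(mp);
	if (error == 0) {
		error = lower_op(mp->mnt_lower);
		vfs_unbusy(mp);
	}
	return error;
}

int
main(void)
{
	struct mount lower = { NULL, 0 }, upper = { &lower, 0 };

	printf("passthrough: %d\n", layer_passthrough(&upper));	/* 0 */
	return 0;
}
#endif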
/* $NetBSD: dksubr.c,v 1.114 2023/07/11 23:26:41 christos Exp $ */ /*- * Copyright (c) 1996, 1997, 1998, 1999, 2002, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe and Roland C. Dowdeswell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: dksubr.c,v 1.114 2023/07/11 23:26:41 christos Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/stat.h> #include <sys/proc.h> #include <sys/ioctl.h> #include <sys/device.h> #include <sys/disk.h> #include <sys/disklabel.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/vnode.h> #include <sys/fcntl.h> #include <sys/namei.h> #include <sys/module.h> #include <sys/syslog.h> #include <dev/dkvar.h> #include <miscfs/specfs/specdev.h> /* for v_rdev */ int dkdebug = 0; #ifdef DEBUG #define DKDB_FOLLOW 0x1 #define DKDB_INIT 0x2 #define DKDB_VNODE 0x4 #define DKDB_DUMP 0x8 #define IFDEBUG(x,y) if (dkdebug & (x)) y #define DPRINTF(x,y) IFDEBUG(x, printf y) #define DPRINTF_FOLLOW(y) DPRINTF(DKDB_FOLLOW, y) #else #define IFDEBUG(x,y) #define DPRINTF(x,y) #define DPRINTF_FOLLOW(y) #endif #define DKF_READYFORDUMP (DKF_INITED|DKF_TAKEDUMP) static int dk_subr_modcmd(modcmd_t, void *); #define DKLABELDEV(dev) \ (MAKEDISKDEV(major((dev)), DISKUNIT((dev)), RAW_PART)) static void dk_makedisklabel(struct dk_softc *); static int dk_translate(struct dk_softc *, struct buf *); void dk_init(struct dk_softc *dksc, device_t dev, int dtype) { memset(dksc, 0x0, sizeof(*dksc)); dksc->sc_dtype = dtype; dksc->sc_dev = dev; strlcpy(dksc->sc_xname, device_xname(dev), DK_XNAME_SIZE); dksc->sc_dkdev.dk_name = dksc->sc_xname; } void dk_attach(struct dk_softc *dksc) { KASSERT(dksc->sc_dev != NULL); mutex_init(&dksc->sc_iolock, MUTEX_DEFAULT, IPL_VM); dksc->sc_flags |= DKF_READYFORDUMP; #ifdef DIAGNOSTIC dksc->sc_flags |= DKF_WARNLABEL | DKF_LABELSANITY; #endif if ((dksc->sc_flags & DKF_NO_RND) == 0) { /* Attach the device into the rnd source list. */ rnd_attach_source(&dksc->sc_rnd_source, dksc->sc_xname, RND_TYPE_DISK, RND_FLAG_DEFAULT); } } void dk_detach(struct dk_softc *dksc) { if ((dksc->sc_flags & DKF_NO_RND) == 0) { /* Unhook the entropy source. */ rnd_detach_source(&dksc->sc_rnd_source); } dksc->sc_flags &= ~DKF_READYFORDUMP; mutex_destroy(&dksc->sc_iolock); } /* ARGSUSED */ int dk_open(struct dk_softc *dksc, dev_t dev, int flags, int fmt, struct lwp *l) { const struct dkdriver *dkd = dksc->sc_dkdev.dk_driver; struct disklabel *lp = dksc->sc_dkdev.dk_label; int part = DISKPART(dev); int pmask = 1 << part; int ret = 0; struct disk *dk = &dksc->sc_dkdev; DPRINTF_FOLLOW(("%s(%s, %p, 0x%"PRIx64", 0x%x)\n", __func__, dksc->sc_xname, dksc, dev, flags)); mutex_enter(&dk->dk_openlock); /* * If there are wedges, and this is not RAW_PART, then we * need to fail. */ if (dk->dk_nwedges != 0 && part != RAW_PART) { ret = EBUSY; goto done; } /* If no dkdriver attached, bail */ if (dkd == NULL) { ret = ENXIO; goto done; } /* * initialize driver for the first opener */ if (dk->dk_openmask == 0 && dkd->d_firstopen != NULL) { ret = (*dkd->d_firstopen)(dksc->sc_dev, dev, flags, fmt); if (ret) goto done; } /* * If we're init'ed and there are no other open partitions then * update the in-core disklabel. */ if ((dksc->sc_flags & DKF_INITED)) { if ((dksc->sc_flags & DKF_VLABEL) == 0) { dksc->sc_flags |= DKF_VLABEL; dk_getdisklabel(dksc, dev); } } /* Fail if we can't find the partition. */ if (part != RAW_PART && ((dksc->sc_flags & DKF_VLABEL) == 0 || part >= lp->d_npartitions || lp->d_partitions[part].p_fstype == FS_UNUSED)) { ret = ENXIO; goto done; } /* Mark our unit as open. 
*/ switch (fmt) { case S_IFCHR: dk->dk_copenmask |= pmask; break; case S_IFBLK: dk->dk_bopenmask |= pmask; break; } dk->dk_openmask = dk->dk_copenmask | dk->dk_bopenmask; done: mutex_exit(&dk->dk_openlock); return ret; } /* ARGSUSED */ int dk_close(struct dk_softc *dksc, dev_t dev, int flags, int fmt, struct lwp *l) { const struct dkdriver *dkd = dksc->sc_dkdev.dk_driver; int part = DISKPART(dev); int pmask = 1 << part; struct disk *dk = &dksc->sc_dkdev; DPRINTF_FOLLOW(("%s(%s, %p, 0x%"PRIx64", 0x%x)\n", __func__, dksc->sc_xname, dksc, dev, flags)); mutex_enter(&dk->dk_openlock); switch (fmt) { case S_IFCHR: dk->dk_copenmask &= ~pmask; break; case S_IFBLK: dk->dk_bopenmask &= ~pmask; break; } dk->dk_openmask = dk->dk_copenmask | dk->dk_bopenmask; if (dk->dk_openmask == 0) { if (dkd->d_lastclose != NULL) (*dkd->d_lastclose)(dksc->sc_dev); if ((dksc->sc_flags & DKF_KLABEL) == 0) dksc->sc_flags &= ~DKF_VLABEL; } mutex_exit(&dk->dk_openlock); return 0; } static int dk_translate(struct dk_softc *dksc, struct buf *bp) { int part; int wlabel; daddr_t blkno; struct disklabel *lp; struct disk *dk; uint64_t numsecs; unsigned secsize; lp = dksc->sc_dkdev.dk_label; dk = &dksc->sc_dkdev; part = DISKPART(bp->b_dev); numsecs = dk->dk_geom.dg_secperunit; secsize = dk->dk_geom.dg_secsize; /* * The transfer must be a whole number of blocks and the offset must * not be negative. */ if ((bp->b_bcount % secsize) != 0 || bp->b_blkno < 0) { bp->b_error = EINVAL; goto done; } /* If there is nothing to do, then we are done */ if (bp->b_bcount == 0) goto done; wlabel = dksc->sc_flags & (DKF_WLABEL|DKF_LABELLING); if (part == RAW_PART) { uint64_t numblocks = btodb(numsecs * secsize); if (bounds_check_with_mediasize(bp, DEV_BSIZE, numblocks) <= 0) goto done; } else { if (bounds_check_with_label(&dksc->sc_dkdev, bp, wlabel) <= 0) goto done; } /* * Convert the block number to absolute and put it in terms * of the device's logical block size. 
*/ if (secsize >= DEV_BSIZE) blkno = bp->b_blkno / (secsize / DEV_BSIZE); else blkno = bp->b_blkno * (DEV_BSIZE / secsize); if (part != RAW_PART) blkno += lp->d_partitions[DISKPART(bp->b_dev)].p_offset; bp->b_rawblkno = blkno; return -1; done: bp->b_resid = bp->b_bcount; return bp->b_error; } static int dk_strategy1(struct dk_softc *dksc, struct buf *bp) { int error; DPRINTF_FOLLOW(("%s(%s, %p, %p)\n", __func__, dksc->sc_xname, dksc, bp)); if (!(dksc->sc_flags & DKF_INITED)) { DPRINTF_FOLLOW(("%s: not inited\n", __func__)); bp->b_error = ENXIO; bp->b_resid = bp->b_bcount; biodone(bp); return 1; } error = dk_translate(dksc, bp); if (error >= 0) { biodone(bp); return 1; } return 0; } void dk_strategy(struct dk_softc *dksc, struct buf *bp) { int error; error = dk_strategy1(dksc, bp); if (error) return; /* * Queue buffer and start unit */ dk_start(dksc, bp); } int dk_strategy_defer(struct dk_softc *dksc, struct buf *bp) { int error; error = dk_strategy1(dksc, bp); if (error) return error; /* * Queue buffer only */ mutex_enter(&dksc->sc_iolock); disk_wait(&dksc->sc_dkdev); bufq_put(dksc->sc_bufq, bp); mutex_exit(&dksc->sc_iolock); return 0; } int dk_strategy_pending(struct dk_softc *dksc) { struct buf *bp; if (!(dksc->sc_flags & DKF_INITED)) { DPRINTF_FOLLOW(("%s: not inited\n", __func__)); return 0; } mutex_enter(&dksc->sc_iolock); bp = bufq_peek(dksc->sc_bufq); mutex_exit(&dksc->sc_iolock); return bp != NULL; } void dk_start(struct dk_softc *dksc, struct buf *bp) { const struct dkdriver *dkd = dksc->sc_dkdev.dk_driver; int error; if (!(dksc->sc_flags & DKF_INITED)) { DPRINTF_FOLLOW(("%s: not inited\n", __func__)); return; } mutex_enter(&dksc->sc_iolock); if (bp != NULL) { bp->b_ci = curcpu(); disk_wait(&dksc->sc_dkdev); bufq_put(dksc->sc_bufq, bp); } /* * If another thread is running the queue, increment * busy counter to 2 so that the queue is retried, * because the driver may now accept additional * requests. */ if (dksc->sc_busy < 2) dksc->sc_busy++; if (dksc->sc_busy > 1) goto done; /* * Peeking at the buffer queue and committing the operation * only after success isn't atomic. * * So when a diskstart fails, the buffer is saved * and tried again before the next buffer is fetched. * dk_drain() handles flushing of a saved buffer. * * This keeps order of I/O operations, unlike bufq_put. */ while (dksc->sc_busy > 0) { bp = dksc->sc_deferred; dksc->sc_deferred = NULL; if (bp == NULL) bp = bufq_get(dksc->sc_bufq); while (bp != NULL) { disk_busy(&dksc->sc_dkdev); mutex_exit(&dksc->sc_iolock); error = dkd->d_diskstart(dksc->sc_dev, bp); mutex_enter(&dksc->sc_iolock); if (error == EAGAIN || error == ENOMEM) { /* * Not a disk error. Retry later. 
*/ KASSERT(dksc->sc_deferred == NULL); dksc->sc_deferred = bp; disk_unbusy(&dksc->sc_dkdev, 0, (bp->b_flags & B_READ)); disk_wait(&dksc->sc_dkdev); break; } if (error != 0) { bp->b_error = error; bp->b_resid = bp->b_bcount; mutex_exit(&dksc->sc_iolock); dk_done(dksc, bp); mutex_enter(&dksc->sc_iolock); } bp = bufq_get(dksc->sc_bufq); } dksc->sc_busy--; } done: mutex_exit(&dksc->sc_iolock); } void dk_done(struct dk_softc *dksc, struct buf *bp) { struct disk *dk = &dksc->sc_dkdev; if (bp->b_error != 0) { struct cfdriver *cd = device_cfdriver(dksc->sc_dev); diskerr(bp, cd->cd_name, "error", LOG_PRINTF, 0, dk->dk_label); printf("\n"); } mutex_enter(&dksc->sc_iolock); disk_unbusy(dk, bp->b_bcount - bp->b_resid, (bp->b_flags & B_READ)); mutex_exit(&dksc->sc_iolock); if ((dksc->sc_flags & DKF_NO_RND) == 0) rnd_add_uint32(&dksc->sc_rnd_source, bp->b_rawblkno); biodone(bp); } void dk_drain(struct dk_softc *dksc) { struct buf *bp; mutex_enter(&dksc->sc_iolock); bp = dksc->sc_deferred; dksc->sc_deferred = NULL; if (bp != NULL) { bp->b_error = EIO; bp->b_resid = bp->b_bcount; biodone(bp); } bufq_drain(dksc->sc_bufq); mutex_exit(&dksc->sc_iolock); } int dk_discard(struct dk_softc *dksc, dev_t dev, off_t pos, off_t len) { const struct dkdriver *dkd = dksc->sc_dkdev.dk_driver; unsigned secsize = dksc->sc_dkdev.dk_geom.dg_secsize; struct buf tmp, *bp = &tmp; int maxsz; int error = 0; KASSERT(len >= 0); DPRINTF_FOLLOW(("%s(%s, %p, 0x"PRIx64", %jd, %jd)\n", __func__, dksc->sc_xname, dksc, (intmax_t)pos, (intmax_t)len)); if (!(dksc->sc_flags & DKF_INITED)) { DPRINTF_FOLLOW(("%s: not inited\n", __func__)); return ENXIO; } if (secsize == 0 || (pos % secsize) != 0 || (len % secsize) != 0) return EINVAL; /* largest value that b_bcount can store */ maxsz = rounddown(INT_MAX, secsize); while (len > 0) { /* enough data to please the bounds checking code */ bp->b_dev = dev; bp->b_blkno = (daddr_t)(pos / secsize); bp->b_bcount = uimin(len, maxsz); bp->b_flags = B_WRITE; error = dk_translate(dksc, bp); if (error >= 0) break; error = dkd->d_discard(dksc->sc_dev, (off_t)bp->b_rawblkno * secsize, (off_t)bp->b_bcount); if (error) break; pos += bp->b_bcount; len -= bp->b_bcount; } return error; } int dk_size(struct dk_softc *dksc, dev_t dev) { const struct dkdriver *dkd = dksc->sc_dkdev.dk_driver; struct disklabel *lp; int is_open; int part; int size; if ((dksc->sc_flags & DKF_INITED) == 0) return -1; part = DISKPART(dev); is_open = dksc->sc_dkdev.dk_openmask & (1 << part); if (!is_open && dkd->d_open(dev, 0, S_IFBLK, curlwp)) return -1; lp = dksc->sc_dkdev.dk_label; if (lp->d_partitions[part].p_fstype != FS_SWAP) size = -1; else size = lp->d_partitions[part].p_size * (lp->d_secsize / DEV_BSIZE); if (!is_open && dkd->d_close(dev, 0, S_IFBLK, curlwp)) return -1; return size; } int dk_ioctl(struct dk_softc *dksc, dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { const struct dkdriver *dkd = dksc->sc_dkdev.dk_driver; struct disklabel *lp; struct disk *dk = &dksc->sc_dkdev; #ifdef __HAVE_OLD_DISKLABEL struct disklabel newlabel; #endif int error; DPRINTF_FOLLOW(("%s(%s, %p, 0x%"PRIx64", 0x%lx)\n", __func__, dksc->sc_xname, dksc, dev, cmd)); /* ensure that the pseudo disk is open for writes for these commands */ switch (cmd) { case DIOCSDINFO: case DIOCWDINFO: #ifdef __HAVE_OLD_DISKLABEL case ODIOCSDINFO: case ODIOCWDINFO: #endif case DIOCKLABEL: case DIOCWLABEL: case DIOCAWEDGE: case DIOCDWEDGE: case DIOCSSTRATEGY: if ((flag & FWRITE) == 0) return EBADF; } /* ensure that the pseudo-disk is initialized for these */ 
switch (cmd) { case DIOCGDINFO: case DIOCSDINFO: case DIOCWDINFO: case DIOCGPARTINFO: case DIOCKLABEL: case DIOCWLABEL: case DIOCGDEFLABEL: case DIOCAWEDGE: case DIOCDWEDGE: case DIOCLWEDGES: case DIOCMWEDGES: case DIOCRMWEDGES: case DIOCCACHESYNC: #ifdef __HAVE_OLD_DISKLABEL case ODIOCGDINFO: case ODIOCSDINFO: case ODIOCWDINFO: case ODIOCGDEFLABEL: #endif if ((dksc->sc_flags & DKF_INITED) == 0) return ENXIO; } error = disk_ioctl(dk, dev, cmd, data, flag, l); if (error != EPASSTHROUGH) return error; else error = 0; switch (cmd) { case DIOCWDINFO: case DIOCSDINFO: #ifdef __HAVE_OLD_DISKLABEL case ODIOCWDINFO: case ODIOCSDINFO: #endif #ifdef __HAVE_OLD_DISKLABEL if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) { memset(&newlabel, 0, sizeof newlabel); memcpy(&newlabel, data, sizeof (struct olddisklabel)); lp = &newlabel; } else #endif lp = (struct disklabel *)data; mutex_enter(&dk->dk_openlock); dksc->sc_flags |= DKF_LABELLING; error = setdisklabel(dksc->sc_dkdev.dk_label, lp, 0, dksc->sc_dkdev.dk_cpulabel); if (error == 0) { if (cmd == DIOCWDINFO #ifdef __HAVE_OLD_DISKLABEL || cmd == ODIOCWDINFO #endif ) error = writedisklabel(DKLABELDEV(dev), dkd->d_strategy, dksc->sc_dkdev.dk_label, dksc->sc_dkdev.dk_cpulabel); } dksc->sc_flags &= ~DKF_LABELLING; mutex_exit(&dk->dk_openlock); break; case DIOCKLABEL: if (*(int *)data != 0) dksc->sc_flags |= DKF_KLABEL; else dksc->sc_flags &= ~DKF_KLABEL; break; case DIOCWLABEL: if (*(int *)data != 0) dksc->sc_flags |= DKF_WLABEL; else dksc->sc_flags &= ~DKF_WLABEL; break; case DIOCGDEFLABEL: dk_getdefaultlabel(dksc, (struct disklabel *)data); break; #ifdef __HAVE_OLD_DISKLABEL case ODIOCGDEFLABEL: dk_getdefaultlabel(dksc, &newlabel); if (newlabel.d_npartitions > OLDMAXPARTITIONS) return ENOTTY; memcpy(data, &newlabel, sizeof (struct olddisklabel)); break; #endif case DIOCGSTRATEGY: { struct disk_strategy *dks = (void *)data; mutex_enter(&dksc->sc_iolock); if (dksc->sc_bufq != NULL) strlcpy(dks->dks_name, bufq_getstrategyname(dksc->sc_bufq), sizeof(dks->dks_name)); else error = EINVAL; mutex_exit(&dksc->sc_iolock); dks->dks_paramlen = 0; break; } case DIOCSSTRATEGY: { struct disk_strategy *dks = (void *)data; struct bufq_state *new; struct bufq_state *old; if (dks->dks_param != NULL) { return EINVAL; } dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */ error = bufq_alloc(&new, dks->dks_name, BUFQ_EXACT|BUFQ_SORT_RAWBLOCK); if (error) { return error; } mutex_enter(&dksc->sc_iolock); old = dksc->sc_bufq; if (old) bufq_move(new, old); dksc->sc_bufq = new; mutex_exit(&dksc->sc_iolock); if (old) bufq_free(old); break; } default: error = ENOTTY; } return error; } /* * dk_dump dumps all of physical memory into the partition specified. * This requires substantially more framework than {s,w}ddump, and hence * is probably much more fragile. * */ #define DKFF_READYFORDUMP(x) (((x) & DKF_READYFORDUMP) == DKF_READYFORDUMP) static volatile int dk_dumping = 0; /* ARGSUSED */ int dk_dump(struct dk_softc *dksc, dev_t dev, daddr_t blkno, void *vav, size_t size, int flags) { const struct dkdriver *dkd = dksc->sc_dkdev.dk_driver; struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; char *va = vav; struct disklabel *lp; struct partition *p; int part, towrt, maxblkcnt, nblk; int maxxfer, rv = 0; /* * ensure that we consider this device to be safe for dumping, * and that the device is configured. 
*/ if (!DKFF_READYFORDUMP(dksc->sc_flags)) { DPRINTF(DKDB_DUMP, ("%s: bad dump flags 0x%x\n", __func__, dksc->sc_flags)); return ENXIO; } /* ensure that we are not already dumping */ if (dk_dumping) return EFAULT; if ((flags & DK_DUMP_RECURSIVE) == 0) dk_dumping = 1; if (dkd->d_dumpblocks == NULL) { DPRINTF(DKDB_DUMP, ("%s: no dumpblocks\n", __func__)); return ENXIO; } /* device specific max transfer size */ maxxfer = MAXPHYS; if (dkd->d_iosize != NULL) (*dkd->d_iosize)(dksc->sc_dev, &maxxfer); /* Convert to disk sectors. Request must be a multiple of size. */ part = DISKPART(dev); lp = dksc->sc_dkdev.dk_label; if ((size % lp->d_secsize) != 0) { DPRINTF(DKDB_DUMP, ("%s: odd size %zu\n", __func__, size)); return EFAULT; } towrt = size / lp->d_secsize; blkno = dbtob(blkno) / lp->d_secsize; /* blkno in secsize units */ p = &lp->d_partitions[part]; if (part == RAW_PART) { if (p->p_fstype != FS_UNUSED) { DPRINTF(DKDB_DUMP, ("%s: bad fstype %d\n", __func__, p->p_fstype)); return ENXIO; } /* Check whether dump goes to a wedge */ if (dksc->sc_dkdev.dk_nwedges == 0) { DPRINTF(DKDB_DUMP, ("%s: dump to raw\n", __func__)); return ENXIO; } /* Check transfer bounds against media size */ if (blkno < 0 || (blkno + towrt) > dg->dg_secperunit) { DPRINTF(DKDB_DUMP, ("%s: out of bounds blkno=%jd, towrt=%d, " "nsects=%jd\n", __func__, (intmax_t)blkno, towrt, dg->dg_secperunit)); return EINVAL; } } else { int nsects, sectoff; if (p->p_fstype != FS_SWAP) { DPRINTF(DKDB_DUMP, ("%s: bad fstype %d\n", __func__, p->p_fstype)); return ENXIO; } nsects = p->p_size; sectoff = p->p_offset; /* Check transfer bounds against partition size. */ if ((blkno < 0) || ((blkno + towrt) > nsects)) { DPRINTF(DKDB_DUMP, ("%s: out of bounds blkno=%jd, towrt=%d, " "nsects=%d\n", __func__, (intmax_t)blkno, towrt, nsects)); return EINVAL; } /* Offset block number to start of partition. */ blkno += sectoff; } /* Start dumping and return when done. 
*/ maxblkcnt = howmany(maxxfer, lp->d_secsize); while (towrt > 0) { nblk = uimin(maxblkcnt, towrt); if ((rv = (*dkd->d_dumpblocks)(dksc->sc_dev, va, blkno, nblk)) != 0) { DPRINTF(DKDB_DUMP, ("%s: dumpblocks %d\n", __func__, rv)); return rv; } towrt -= nblk; blkno += nblk; va += nblk * lp->d_secsize; } if ((flags & DK_DUMP_RECURSIVE) == 0) dk_dumping = 0; return 0; } /* ARGSUSED */ void dk_getdefaultlabel(struct dk_softc *dksc, struct disklabel *lp) { const struct dkdriver *dkd = dksc->sc_dkdev.dk_driver; struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; memset(lp, 0, sizeof(*lp)); if (dg->dg_secperunit > UINT32_MAX) lp->d_secperunit = UINT32_MAX; else lp->d_secperunit = dg->dg_secperunit; lp->d_secsize = dg->dg_secsize; lp->d_nsectors = dg->dg_nsectors; lp->d_ntracks = dg->dg_ntracks; lp->d_ncylinders = dg->dg_ncylinders; lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors; strlcpy(lp->d_typename, dksc->sc_xname, sizeof(lp->d_typename)); lp->d_type = dksc->sc_dtype; strlcpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); lp->d_rpm = 3600; lp->d_interleave = 1; lp->d_flags = 0; lp->d_partitions[RAW_PART].p_offset = 0; lp->d_partitions[RAW_PART].p_size = lp->d_secperunit; lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED; lp->d_npartitions = RAW_PART + 1; lp->d_magic = DISKMAGIC; lp->d_magic2 = DISKMAGIC; if (dkd->d_label) dkd->d_label(dksc->sc_dev, lp); lp->d_checksum = dkcksum(lp); } /* ARGSUSED */ void dk_getdisklabel(struct dk_softc *dksc, dev_t dev) { const struct dkdriver *dkd = dksc->sc_dkdev.dk_driver; struct disklabel *lp = dksc->sc_dkdev.dk_label; struct cpu_disklabel *clp = dksc->sc_dkdev.dk_cpulabel; struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; struct partition *pp; int i, lpratio, dgratio; const char *errstring; memset(clp, 0x0, sizeof(*clp)); dk_getdefaultlabel(dksc, lp); errstring = readdisklabel(DKLABELDEV(dev), dkd->d_strategy, dksc->sc_dkdev.dk_label, dksc->sc_dkdev.dk_cpulabel); if (errstring) { dk_makedisklabel(dksc); if (dksc->sc_flags & DKF_WARNLABEL) printf("%s: %s\n", dksc->sc_xname, errstring); return; } if ((dksc->sc_flags & DKF_LABELSANITY) == 0) return; /* Convert sector counts to multiple of DEV_BSIZE for comparison */ lpratio = dgratio = 1; if (lp->d_secsize > DEV_BSIZE) lpratio = lp->d_secsize / DEV_BSIZE; if (dg->dg_secsize > DEV_BSIZE) dgratio = dg->dg_secsize / DEV_BSIZE; /* Sanity check */ if ((uint64_t)lp->d_secperunit * lpratio > dg->dg_secperunit * dgratio) printf("WARNING: %s: " "total unit size in disklabel (%" PRIu64 ") " "!= the size of %s (%" PRIu64 ")\n", dksc->sc_xname, (uint64_t)lp->d_secperunit * lpratio, dksc->sc_xname, dg->dg_secperunit * dgratio); else if (lp->d_secperunit < UINT32_MAX && (uint64_t)lp->d_secperunit * lpratio < dg->dg_secperunit * dgratio) printf("%s: %" PRIu64 " trailing sectors not covered" " by disklabel\n", dksc->sc_xname, (dg->dg_secperunit * dgratio) - (lp->d_secperunit * lpratio)); for (i=0; i < lp->d_npartitions; i++) { uint64_t pend; pp = &lp->d_partitions[i]; pend = pp->p_offset + pp->p_size; if (pend * lpratio > dg->dg_secperunit * dgratio) printf("WARNING: %s: end of partition `%c' exceeds " "the size of %s (%" PRIu64 ")\n", dksc->sc_xname, 'a' + i, dksc->sc_xname, dg->dg_secperunit * dgratio); } } /* * Heuristic to conjure a disklabel if reading a disklabel failed. * * This is to allow the raw partition to be used for a filesystem * without caring about the write protected label sector. * * If the driver provides it's own callback, use that instead. 
*/ /* ARGSUSED */ static void dk_makedisklabel(struct dk_softc *dksc) { const struct dkdriver *dkd = dksc->sc_dkdev.dk_driver; struct disklabel *lp = dksc->sc_dkdev.dk_label; strlcpy(lp->d_packname, "default label", sizeof(lp->d_packname)); if (dkd->d_label) dkd->d_label(dksc->sc_dev, lp); else lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS; lp->d_checksum = dkcksum(lp); } MODULE(MODULE_CLASS_MISC, dk_subr, NULL); static int dk_subr_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: case MODULE_CMD_FINI: return 0; case MODULE_CMD_STAT: case MODULE_CMD_AUTOUNLOAD: default: return ENOTTY; } }
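/*
 * Illustrative sketch (assumption: userland arithmetic only, not kernel
 * code): dk_translate() converts bp->b_blkno, which is expressed in
 * DEV_BSIZE (512-byte) units, into the device's logical sector size and
 * then adds the partition offset for non-raw partitions.  The hypothetical
 * helper translate_blkno() below reproduces just that arithmetic.
 */
#if 0	/* example only; build separately */
#include <assert.h>
#include <stdint.h>

#define DEV_BSIZE	512

/* b_blkno in DEV_BSIZE units -> absolute block in 'secsize' units. */
static int64_t
translate_blkno(int64_t blkno, unsigned secsize, int64_t part_offset)
{
	int64_t raw;

	if (secsize >= DEV_BSIZE)
		raw = blkno / (secsize / DEV_BSIZE);
	else
		raw = blkno * (DEV_BSIZE / secsize);
	return raw + part_offset;
}

int
main(void)
{
	/* 4096-byte sectors: 8 DEV_BSIZE blocks per sector. */
	assert(translate_blkno(16, 4096, 0) == 2);
	/* 512-byte sectors with a partition starting at sector 1000. */
	assert(translate_blkno(16, 512, 1000) == 1016);
	return 0;
}
#endif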
/* $NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $ */ /*- * Copyright (c) 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * uvm_pgflcache.c: page freelist cache. * * This implements a tiny per-CPU cache of pages that sits between the main * page allocator and the freelists.
By allocating and freeing pages in * batch, it reduces freelist contention by an order of magnitude. * * The cache can be paused & resumed at runtime so that UVM_HOTPLUG, * uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the * world. On system with one CPU per physical package (e.g. a uniprocessor) * the cache is not enabled. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $"); #include "opt_uvm.h" #include "opt_multiprocessor.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/sched.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/proc.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/xcall.h> #include <uvm/uvm.h> #include <uvm/uvm_pglist.h> #include <uvm/uvm_pgflcache.h> /* There is no point doing any of this on a uniprocessor. */ #ifdef MULTIPROCESSOR /* * MAXPGS - maximum pages per color, per bucket. * FILLPGS - number of pages to allocate at once, per color, per bucket. * * Why the chosen values: * * (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache * colors. We make the assumption that most of the time allocation activity * will be centered around one UVM freelist, so most of the time there will * be no more than 224kB worth of cached pages per-CPU. That's tiny, but * enough to hugely reduce contention on the freelist locks, and give us a * small pool of pages which if we're very lucky may have some L1/L2 cache * locality, and do so without subtracting too much from the L2/L3 cache * benefits of having per-package free lists in the page allocator. * * (2) With the chosen values on _LP64, the data structure for each color * takes up a single cache line (64 bytes) giving this very low overhead * even in the "miss" case. * * (3) We don't want to cause too much pressure by hiding away memory that * could otherwise be put to good use. */ #define MAXPGS 7 #define FILLPGS 6 /* Variable size, according to # colors. */ struct pgflcache { struct pccolor { intptr_t count; struct vm_page *pages[MAXPGS]; } color[1]; }; static kmutex_t uvm_pgflcache_lock; static int uvm_pgflcache_sem; /* * uvm_pgflcache_fill: fill specified freelist/color from global list * * => must be called at IPL_VM * => must be called with given bucket lock held * => must only fill from the correct bucket for this CPU */ void uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c) { struct pgflbucket *pgb; struct pgflcache *pc; struct pccolor *pcc; struct pgflist *head; struct vm_page *pg; int count; KASSERT(mutex_owned(&uvm_freelist_locks[b].lock)); KASSERT(ucpu->pgflbucket == b); /* If caching is off, then bail out. */ if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) { return; } /* Fill only to the limit. */ pcc = &pc->color[c]; pgb = uvm.page_free[fl].pgfl_buckets[b]; head = &pgb->pgb_colors[c]; if (pcc->count >= FILLPGS) { return; } /* Pull pages from the bucket until it's empty, or we are full. */ count = pcc->count; pg = LIST_FIRST(head); while (__predict_true(pg != NULL && count < FILLPGS)) { KASSERT(pg->flags & PG_FREE); KASSERT(uvm_page_get_bucket(pg) == b); pcc->pages[count++] = pg; pg = LIST_NEXT(pg, pageq.list); } /* Violate LIST abstraction to remove all pages at once. 
*/ head->lh_first = pg; if (__predict_true(pg != NULL)) { pg->pageq.list.le_prev = &head->lh_first; } pgb->pgb_nfree -= (count - pcc->count); CPU_COUNT(CPU_COUNT_FREEPAGES, -(count - pcc->count)); pcc->count = count; } /* * uvm_pgflcache_spill: spill specified freelist/color to global list * * => must be called at IPL_VM * => mark __noinline so we don't pull it into uvm_pgflcache_free() */ static void __noinline uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c) { struct pgflbucket *pgb; struct pgfreelist *pgfl; struct pgflcache *pc; struct pccolor *pcc; struct pgflist *head; kmutex_t *lock; int b, adj; pc = ucpu->pgflcache[fl]; pcc = &pc->color[c]; pgfl = &uvm.page_free[fl]; b = ucpu->pgflbucket; pgb = pgfl->pgfl_buckets[b]; head = &pgb->pgb_colors[c]; lock = &uvm_freelist_locks[b].lock; mutex_spin_enter(lock); for (adj = pcc->count; pcc->count != 0;) { pcc->count--; KASSERT(pcc->pages[pcc->count] != NULL); KASSERT(pcc->pages[pcc->count]->flags & PG_FREE); LIST_INSERT_HEAD(head, pcc->pages[pcc->count], pageq.list); } pgb->pgb_nfree += adj; CPU_COUNT(CPU_COUNT_FREEPAGES, adj); mutex_spin_exit(lock); } /* * uvm_pgflcache_alloc: try to allocate a cached page. * * => must be called at IPL_VM * => allocate only from the given freelist and given page color */ struct vm_page * uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c) { struct pgflcache *pc; struct pccolor *pcc; struct vm_page *pg; /* If caching is off, then bail out. */ if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) { return NULL; } /* Very simple: if we have a page then return it. */ pcc = &pc->color[c]; if (__predict_false(pcc->count == 0)) { return NULL; } pg = pcc->pages[--(pcc->count)]; KASSERT(pg != NULL); KASSERT(pg->flags == PG_FREE); KASSERT(uvm_page_get_freelist(pg) == fl); KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket); pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE; return pg; } /* * uvm_pgflcache_free: cache a page, if possible. * * => must be called at IPL_VM * => must only send pages for the correct bucket for this CPU */ bool uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg) { struct pgflcache *pc; struct pccolor *pcc; int fl, c; KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket); /* If caching is off, then bail out. */ fl = uvm_page_get_freelist(pg); if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) { return false; } /* If the array is full spill it first, then add page to array. */ c = VM_PGCOLOR(pg); pcc = &pc->color[c]; KASSERT((pg->flags & PG_FREE) == 0); if (__predict_false(pcc->count == MAXPGS)) { uvm_pgflcache_spill(ucpu, fl, c); } pg->flags = PG_FREE; pcc->pages[pcc->count] = pg; pcc->count++; return true; } /* * uvm_pgflcache_init: allocate and initialize per-CPU data structures for * the free page cache. Don't set anything in motion - that's taken care * of by uvm_pgflcache_resume(). */ static void uvm_pgflcache_init_cpu(struct cpu_info *ci) { struct uvm_cpu *ucpu; size_t sz; ucpu = ci->ci_data.cpu_uvm; KASSERT(ucpu->pgflcachemem == NULL); KASSERT(ucpu->pgflcache[0] == NULL); sz = offsetof(struct pgflcache, color[uvmexp.ncolors]); ucpu->pgflcachememsz = (roundup2(sz * VM_NFREELIST, coherency_unit) + coherency_unit - 1); ucpu->pgflcachemem = kmem_zalloc(ucpu->pgflcachememsz, KM_SLEEP); } /* * uvm_pgflcache_fini_cpu: dump all cached pages back to global free list * and shut down caching on the CPU. Called on each CPU in the system via * xcall. 
*/ static void uvm_pgflcache_fini_cpu(void *arg1 __unused, void *arg2 __unused) { struct uvm_cpu *ucpu; int fl, color, s; ucpu = curcpu()->ci_data.cpu_uvm; for (fl = 0; fl < VM_NFREELIST; fl++) { s = splvm(); for (color = 0; color < uvmexp.ncolors; color++) { uvm_pgflcache_spill(ucpu, fl, color); } ucpu->pgflcache[fl] = NULL; splx(s); } } /* * uvm_pgflcache_pause: pause operation of the caches */ void uvm_pgflcache_pause(void) { uint64_t where; /* First one in starts draining. Everyone else waits. */ mutex_enter(&uvm_pgflcache_lock); if (uvm_pgflcache_sem++ == 0) { where = xc_broadcast(XC_HIGHPRI, uvm_pgflcache_fini_cpu, (void *)1, NULL); xc_wait(where); } mutex_exit(&uvm_pgflcache_lock); } /* * uvm_pgflcache_resume: resume operation of the caches */ void uvm_pgflcache_resume(void) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; struct uvm_cpu *ucpu; uintptr_t addr; size_t sz; int fl; /* Last guy out takes care of business. */ mutex_enter(&uvm_pgflcache_lock); KASSERT(uvm_pgflcache_sem > 0); if (uvm_pgflcache_sem-- > 1) { mutex_exit(&uvm_pgflcache_lock); return; } /* * Make sure dependant data structure updates are remotely visible. * Essentially this functions as a global memory barrier. */ xc_barrier(XC_HIGHPRI); /* * Then set all of the pointers in place on each CPU. As soon as * each pointer is set, caching is operational in that dimension. */ sz = offsetof(struct pgflcache, color[uvmexp.ncolors]); for (CPU_INFO_FOREACH(cii, ci)) { ucpu = ci->ci_data.cpu_uvm; addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit); for (fl = 0; fl < VM_NFREELIST; fl++) { ucpu->pgflcache[fl] = (struct pgflcache *)addr; addr += sz; } } mutex_exit(&uvm_pgflcache_lock); } /* * uvm_pgflcache_start: start operation of the cache. * * => called once only, when init(8) is about to be started */ void uvm_pgflcache_start(void) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; KASSERT(uvm_pgflcache_sem > 0); /* * There's not much point doing this if every CPU has its own * bucket (and that includes the uniprocessor case). */ if (ncpu == uvm.bucketcount) { return; } /* Create data structures for each CPU. */ for (CPU_INFO_FOREACH(cii, ci)) { uvm_pgflcache_init_cpu(ci); } /* Kick it into action. */ uvm_pgflcache_resume(); } /* * uvm_pgflcache_init: set up data structures for the free page cache. */ void uvm_pgflcache_init(void) { uvm_pgflcache_sem = 1; mutex_init(&uvm_pgflcache_lock, MUTEX_DEFAULT, IPL_NONE); } #else /* MULTIPROCESSOR */ struct vm_page * uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c) { return NULL; } bool uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg) { return false; } void uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c) { } void uvm_pgflcache_pause(void) { } void uvm_pgflcache_resume(void) { } void uvm_pgflcache_start(void) { } void uvm_pgflcache_init(void) { } #endif /* MULTIPROCESSOR */
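/*
 * Illustrative sketch (not part of uvm_pgflcache.c): the intended fast path
 * for a caller such as the page allocator, per the interface above -- try the
 * per-CPU cache first and only take the freelist bucket lock to refill it
 * when the cache comes up empty.  The helper name and the simplified locking
 * are hypothetical; the block is not compiled (#if 0).
 */
#if 0
static struct vm_page *
example_alloc_colored_page(struct uvm_cpu *ucpu, int fl, int b, int c)
{
	struct vm_page *pg;
	int s;

	s = splvm();				/* interface requires IPL_VM */
	pg = uvm_pgflcache_alloc(ucpu, fl, c);
	if (pg == NULL) {
		/* Cache miss: refill from the global bucket, then retry. */
		mutex_spin_enter(&uvm_freelist_locks[b].lock);
		uvm_pgflcache_fill(ucpu, fl, b, c);
		mutex_spin_exit(&uvm_freelist_locks[b].lock);
		pg = uvm_pgflcache_alloc(ucpu, fl, c);
	}
	splx(s);
	return pg;	/* may still be NULL if the bucket itself was empty */
}
#endif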
/* $NetBSD: route.c,v 1.237 2023/06/05 03:51:45 ozaki-r Exp $ */ /*- * Copyright (c) 1998, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Kevin M. Lahey of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)route.c 8.3 (Berkeley) 1/9/95 */ #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_route.h" #include "opt_net_mpsafe.h" #endif #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: route.c,v 1.237 2023/06/05 03:51:45 ozaki-r Exp $"); #include <sys/param.h> #ifdef RTFLUSH_DEBUG #include <sys/sysctl.h> #endif #include <sys/systm.h> #include <sys/callout.h> #include <sys/proc.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/domain.h> #include <sys/kernel.h> #include <sys/ioctl.h> #include <sys/pool.h> #include <sys/kauth.h> #include <sys/workqueue.h> #include <sys/syslog.h> #include <sys/rwlock.h> #include <sys/mutex.h> #include <sys/cpu.h> #include <sys/kmem.h> #include <net/if.h> #include <net/if_dl.h> #include <net/route.h> #if defined(INET) || defined(INET6) #include <net/if_llatbl.h> #endif #include <netinet/in.h> #include <netinet/in_var.h> #define PRESERVED_RTF (RTF_UP | RTF_GATEWAY | RTF_HOST | RTF_DONE | RTF_MASK) #ifdef RTFLUSH_DEBUG #define rtcache_debug() __predict_false(_rtcache_debug) #else /* RTFLUSH_DEBUG */ #define rtcache_debug() 0 #endif /* RTFLUSH_DEBUG */ #ifdef RT_DEBUG #define RT_REFCNT_TRACE(rt) printf("%s:%d: rt=%p refcnt=%d\n", \ __func__, __LINE__, (rt), (rt)->rt_refcnt) #else #define RT_REFCNT_TRACE(rt) do {} while (0) #endif #ifdef RT_DEBUG #define dlog(level, fmt, args...) log(level, fmt, ##args) #else #define dlog(level, fmt, args...) do {} while (0) #endif struct rtstat rtstat; static int rttrash; /* routes not in table but not freed */ static struct pool rtentry_pool; static struct pool rttimer_pool; static struct callout rt_timer_ch; /* callout for rt_timer_timer() */ static struct workqueue *rt_timer_wq; static struct work rt_timer_wk; static void rt_timer_init(void); static void rt_timer_queue_remove_all(struct rttimer_queue *); static void rt_timer_remove_all(struct rtentry *); static void rt_timer_timer(void *); /* * Locking notes: * - The routing table is protected by a global rwlock * - API: RT_RLOCK and friends * - rtcaches are NOT protected by the framework * - Callers must guarantee a rtcache isn't accessed simultaneously * - How the constraint is guaranteed in the wild * - Protect a rtcache by a mutex (e.g., inp_route) * - Make rtcache per-CPU and allow only accesses from softint * (e.g., ipforward_rt_percpu) * - References to a rtentry is managed by reference counting and psref * - Reference counting is used for temporal reference when a rtentry * is fetched from the routing table * - psref is used for temporal reference when a rtentry is fetched * from a rtcache * - struct route (rtcache) has struct psref, so we cannot obtain * a reference twice on the same struct route * - Before destroying or updating a rtentry, we have to wait for * all references left (see below for details) * - APIs * - An obtained rtentry via rtalloc1 or rtrequest* must be * unreferenced by rt_unref * - An obtained rtentry via rtcache_* must be unreferenced by * rtcache_unref * - TODO: once we get a lockless routing table, we should use only * psref for rtentries * - rtentry destruction * - A rtentry is destroyed (freed) only when we call rtrequest(RTM_DELETE) * - If a caller of rtrequest grabs a reference of a rtentry, the caller * has a responsibility to destroy the rtentry by itself by calling * rt_free * - If not, rtrequest itself does that * - If rt_free is called in softint, the actual destruction routine is * deferred to a workqueue * - rtentry update * - When updating a rtentry, RTF_UPDATING flag is set * - If a 
rtentry is set RTF_UPDATING, fetching the rtentry from * the routing table or a rtcache results in either of the following * cases: * - if the caller runs in softint, the caller fails to fetch * - otherwise, the caller waits for the update completed and retries * to fetch (probably succeed to fetch for the second time) * - rtcache invalidation * - There is a global generation counter that is incremented when * any routes have been added or deleted * - When a rtcache caches a rtentry into itself, it also stores * a snapshot of the generation counter * - If the snapshot equals to the global counter, the cache is valid, * otherwise the cache is invalidated */ /* * Global lock for the routing table. */ static krwlock_t rt_lock __cacheline_aligned; #ifdef NET_MPSAFE #define RT_RLOCK() rw_enter(&rt_lock, RW_READER) #define RT_WLOCK() rw_enter(&rt_lock, RW_WRITER) #define RT_UNLOCK() rw_exit(&rt_lock) #define RT_WLOCKED() rw_write_held(&rt_lock) #define RT_ASSERT_WLOCK() KASSERT(rw_write_held(&rt_lock)) #define RT_WQ_FLAGS WQ_MPSAFE #else #define RT_RLOCK() do {} while (0) #define RT_WLOCK() do {} while (0) #define RT_UNLOCK() do {} while (0) #define RT_WLOCKED() true #define RT_ASSERT_WLOCK() do {} while (0) #define RT_WQ_FLAGS 0 #endif static uint64_t rtcache_generation; /* * mutex and cv that are used to wait for references to a rtentry left * before updating the rtentry. */ static struct { kmutex_t lock; kcondvar_t cv; bool ongoing; const struct lwp *lwp; } rt_update_global __cacheline_aligned; /* * A workqueue and stuff that are used to defer the destruction routine * of rtentries. */ static struct { struct workqueue *wq; struct work wk; kmutex_t lock; SLIST_HEAD(, rtentry) queue; bool enqueued; } rt_free_global __cacheline_aligned; /* psref for rtentry */ static struct psref_class *rt_psref_class __read_mostly; #ifdef RTFLUSH_DEBUG static int _rtcache_debug = 0; #endif /* RTFLUSH_DEBUG */ static kauth_listener_t route_listener; static int rtdeletemsg(struct rtentry *); static void rt_maskedcopy(const struct sockaddr *, struct sockaddr *, const struct sockaddr *); static void rtcache_invalidate(void); static void rt_ref(struct rtentry *); static struct rtentry * rtalloc1_locked(const struct sockaddr *, int, bool, bool); static struct ifaddr *rt_getifa(struct rt_addrinfo *, struct psref *); static struct ifnet *rt_getifp(struct rt_addrinfo *, struct psref *); static struct ifaddr *ifa_ifwithroute_psref(int, const struct sockaddr *, const struct sockaddr *, struct psref *); static void rtcache_ref(struct rtentry *, struct route *); #ifdef NET_MPSAFE static void rt_update_wait(void); #endif static bool rt_wait_ok(void); static void rt_wait_refcnt(const char *, struct rtentry *, int); static void rt_wait_psref(struct rtentry *); #ifdef DDB static void db_print_sa(const struct sockaddr *); static void db_print_ifa(struct ifaddr *); static int db_show_rtentry(struct rtentry *, void *); #endif #ifdef RTFLUSH_DEBUG static void sysctl_net_rtcache_setup(struct sysctllog **); static void sysctl_net_rtcache_setup(struct sysctllog **clog) { const struct sysctlnode *rnode; if (sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "rtcache", SYSCTL_DESCR("Route cache related settings"), NULL, 0, NULL, 0, CTL_NET, CTL_CREATE, CTL_EOL) != 0) return; if (sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "debug", SYSCTL_DESCR("Debug route caches"), NULL, 0, &_rtcache_debug, 0, CTL_CREATE, CTL_EOL) != 0) return; } #endif /* RTFLUSH_DEBUG */ static inline 
void rt_destroy(struct rtentry *rt) { if (rt->_rt_key != NULL) sockaddr_free(rt->_rt_key); if (rt->rt_gateway != NULL) sockaddr_free(rt->rt_gateway); if (rt_gettag(rt) != NULL) sockaddr_free(rt_gettag(rt)); rt->_rt_key = rt->rt_gateway = rt->rt_tag = NULL; } static inline const struct sockaddr * rt_setkey(struct rtentry *rt, const struct sockaddr *key, int flags) { if (rt->_rt_key == key) goto out; if (rt->_rt_key != NULL) sockaddr_free(rt->_rt_key); rt->_rt_key = sockaddr_dup(key, flags); out: rt->rt_nodes->rn_key = (const char *)rt->_rt_key; return rt->_rt_key; } struct ifaddr * rt_get_ifa(struct rtentry *rt) { struct ifaddr *ifa; ifa = rt->rt_ifa; if (ifa->ifa_getifa == NULL) return ifa; #if 0 else if (ifa->ifa_seqno != NULL && *ifa->ifa_seqno == rt->rt_ifa_seqno) return ifa; #endif else { ifa = (*ifa->ifa_getifa)(ifa, rt_getkey(rt)); if (ifa == NULL) return NULL; rt_replace_ifa(rt, ifa); return ifa; } } static void rt_set_ifa1(struct rtentry *rt, struct ifaddr *ifa) { rt->rt_ifa = ifa; if (ifa->ifa_seqno != NULL) rt->rt_ifa_seqno = *ifa->ifa_seqno; } /* * Is this route the connected route for the ifa? */ static int rt_ifa_connected(const struct rtentry *rt, const struct ifaddr *ifa) { const struct sockaddr *key, *dst, *odst; struct sockaddr_storage maskeddst; key = rt_getkey(rt); dst = rt->rt_flags & RTF_HOST ? ifa->ifa_dstaddr : ifa->ifa_addr; if (dst == NULL || dst->sa_family != key->sa_family || dst->sa_len != key->sa_len) return 0; if ((rt->rt_flags & RTF_HOST) == 0 && ifa->ifa_netmask) { odst = dst; dst = (struct sockaddr *)&maskeddst; rt_maskedcopy(odst, (struct sockaddr *)&maskeddst, ifa->ifa_netmask); } return (memcmp(dst, key, dst->sa_len) == 0); } void rt_replace_ifa(struct rtentry *rt, struct ifaddr *ifa) { struct ifaddr *old; if (rt->rt_ifa == ifa) return; if (rt->rt_ifa != ifa && rt->rt_ifa->ifa_flags & IFA_ROUTE && rt_ifa_connected(rt, rt->rt_ifa)) { RT_DPRINTF("rt->_rt_key = %p, ifa = %p, " "replace deleted IFA_ROUTE\n", (void *)rt->_rt_key, (void *)rt->rt_ifa); rt->rt_ifa->ifa_flags &= ~IFA_ROUTE; if (rt_ifa_connected(rt, ifa)) { RT_DPRINTF("rt->_rt_key = %p, ifa = %p, " "replace added IFA_ROUTE\n", (void *)rt->_rt_key, (void *)ifa); ifa->ifa_flags |= IFA_ROUTE; } } ifaref(ifa); old = rt->rt_ifa; rt_set_ifa1(rt, ifa); ifafree(old); } static void rt_set_ifa(struct rtentry *rt, struct ifaddr *ifa) { ifaref(ifa); rt_set_ifa1(rt, ifa); } static int route_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { struct rt_msghdr *rtm; int result; result = KAUTH_RESULT_DEFER; rtm = arg1; if (action != KAUTH_NETWORK_ROUTE) return result; if (rtm->rtm_type == RTM_GET) result = KAUTH_RESULT_ALLOW; return result; } static void rt_free_work(struct work *, void *); void rt_init(void) { int error; #ifdef RTFLUSH_DEBUG sysctl_net_rtcache_setup(NULL); #endif mutex_init(&rt_free_global.lock, MUTEX_DEFAULT, IPL_SOFTNET); SLIST_INIT(&rt_free_global.queue); rt_free_global.enqueued = false; rt_psref_class = psref_class_create("rtentry", IPL_SOFTNET); error = workqueue_create(&rt_free_global.wq, "rt_free", rt_free_work, NULL, PRI_SOFTNET, IPL_SOFTNET, RT_WQ_FLAGS); if (error) panic("%s: workqueue_create failed (%d)\n", __func__, error); mutex_init(&rt_update_global.lock, MUTEX_DEFAULT, IPL_SOFTNET); cv_init(&rt_update_global.cv, "rt_update"); pool_init(&rtentry_pool, sizeof(struct rtentry), 0, 0, 0, "rtentpl", NULL, IPL_SOFTNET); pool_init(&rttimer_pool, sizeof(struct rttimer), 0, 0, 0, "rttmrpl", NULL, IPL_SOFTNET); rn_init(); /* 
initialize all zeroes, all ones, mask table */ rtbl_init(); route_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK, route_listener_cb, NULL); } static void rtcache_invalidate(void) { RT_ASSERT_WLOCK(); if (rtcache_debug()) printf("%s: enter\n", __func__); rtcache_generation++; } #ifdef RT_DEBUG static void dump_rt(const struct rtentry *rt) { char buf[512]; log(LOG_DEBUG, "rt: "); log(LOG_DEBUG, "p=%p ", rt); if (rt->_rt_key == NULL) { log(LOG_DEBUG, "dst=(NULL) "); } else { sockaddr_format(rt->_rt_key, buf, sizeof(buf)); log(LOG_DEBUG, "dst=%s ", buf); } if (rt->rt_gateway == NULL) { log(LOG_DEBUG, "gw=(NULL) "); } else { sockaddr_format(rt->_rt_key, buf, sizeof(buf)); log(LOG_DEBUG, "gw=%s ", buf); } log(LOG_DEBUG, "flags=%x ", rt->rt_flags); if (rt->rt_ifp == NULL) { log(LOG_DEBUG, "if=(NULL) "); } else { log(LOG_DEBUG, "if=%s ", rt->rt_ifp->if_xname); } log(LOG_DEBUG, "\n"); } #endif /* RT_DEBUG */ /* * Packet routing routines. If success, refcnt of a returned rtentry * will be incremented. The caller has to rtfree it by itself. */ struct rtentry * rtalloc1_locked(const struct sockaddr *dst, int report, bool wait_ok, bool wlock) { rtbl_t *rtbl; struct rtentry *rt; int s; #ifdef NET_MPSAFE retry: #endif s = splsoftnet(); rtbl = rt_gettable(dst->sa_family); if (rtbl == NULL) goto miss; rt = rt_matchaddr(rtbl, dst); if (rt == NULL) goto miss; if (!ISSET(rt->rt_flags, RTF_UP)) goto miss; #ifdef NET_MPSAFE if (ISSET(rt->rt_flags, RTF_UPDATING) && /* XXX updater should be always able to acquire */ curlwp != rt_update_global.lwp) { if (!wait_ok || !rt_wait_ok()) goto miss; RT_UNLOCK(); splx(s); /* We can wait until the update is complete */ rt_update_wait(); if (wlock) RT_WLOCK(); else RT_RLOCK(); goto retry; } #endif /* NET_MPSAFE */ rt_ref(rt); RT_REFCNT_TRACE(rt); splx(s); return rt; miss: rtstat.rts_unreach++; if (report) { struct rt_addrinfo info; memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg(RTM_MISS, &info, 0, 0); } splx(s); return NULL; } struct rtentry * rtalloc1(const struct sockaddr *dst, int report) { struct rtentry *rt; RT_RLOCK(); rt = rtalloc1_locked(dst, report, true, false); RT_UNLOCK(); return rt; } static void rt_ref(struct rtentry *rt) { KASSERTMSG(rt->rt_refcnt >= 0, "rt_refcnt=%d", rt->rt_refcnt); atomic_inc_uint(&rt->rt_refcnt); } void rt_unref(struct rtentry *rt) { KASSERT(rt != NULL); KASSERTMSG(rt->rt_refcnt > 0, "refcnt=%d", rt->rt_refcnt); atomic_dec_uint(&rt->rt_refcnt); if (!ISSET(rt->rt_flags, RTF_UP) || ISSET(rt->rt_flags, RTF_UPDATING)) { mutex_enter(&rt_free_global.lock); cv_broadcast(&rt->rt_cv); mutex_exit(&rt_free_global.lock); } } static bool rt_wait_ok(void) { /* * This originally returned !cpu_softintr_p(), but that doesn't * work: the caller may hold a lock (probably softnet lock) * that a softint is waiting for, in which case waiting here * would cause a deadlock. See https://gnats.netbsd.org/56844 * for details. For now, until the locking paths are sorted * out, we just disable the waiting option altogether and * always defer to workqueue. 
*/ KASSERT(!cpu_intr_p()); return false; } void rt_wait_refcnt(const char *title, struct rtentry *rt, int cnt) { mutex_enter(&rt_free_global.lock); while (rt->rt_refcnt > cnt) { dlog(LOG_DEBUG, "%s: %s waiting (refcnt=%d)\n", __func__, title, rt->rt_refcnt); cv_wait(&rt->rt_cv, &rt_free_global.lock); dlog(LOG_DEBUG, "%s: %s waited (refcnt=%d)\n", __func__, title, rt->rt_refcnt); } mutex_exit(&rt_free_global.lock); } void rt_wait_psref(struct rtentry *rt) { psref_target_destroy(&rt->rt_psref, rt_psref_class); psref_target_init(&rt->rt_psref, rt_psref_class); } static void _rt_free(struct rtentry *rt) { struct ifaddr *ifa; /* * Need to avoid a deadlock on rt_wait_refcnt of update * and a conflict on psref_target_destroy of update. */ #ifdef NET_MPSAFE rt_update_wait(); #endif RT_REFCNT_TRACE(rt); KASSERTMSG(rt->rt_refcnt >= 0, "refcnt=%d", rt->rt_refcnt); rt_wait_refcnt("free", rt, 0); #ifdef NET_MPSAFE psref_target_destroy(&rt->rt_psref, rt_psref_class); #endif rt_assert_inactive(rt); rttrash--; ifa = rt->rt_ifa; rt->rt_ifa = NULL; ifafree(ifa); rt->rt_ifp = NULL; cv_destroy(&rt->rt_cv); rt_destroy(rt); pool_put(&rtentry_pool, rt); } static void rt_free_work(struct work *wk, void *arg) { for (;;) { struct rtentry *rt; mutex_enter(&rt_free_global.lock); if ((rt = SLIST_FIRST(&rt_free_global.queue)) == NULL) { rt_free_global.enqueued = false; mutex_exit(&rt_free_global.lock); return; } SLIST_REMOVE_HEAD(&rt_free_global.queue, rt_free); mutex_exit(&rt_free_global.lock); atomic_dec_uint(&rt->rt_refcnt); _rt_free(rt); } } void rt_free(struct rtentry *rt) { KASSERTMSG(rt->rt_refcnt > 0, "rt_refcnt=%d", rt->rt_refcnt); if (rt_wait_ok()) { atomic_dec_uint(&rt->rt_refcnt); _rt_free(rt); return; } mutex_enter(&rt_free_global.lock); /* No need to add a reference here. */ SLIST_INSERT_HEAD(&rt_free_global.queue, rt, rt_free); if (!rt_free_global.enqueued) { workqueue_enqueue(rt_free_global.wq, &rt_free_global.wk, NULL); rt_free_global.enqueued = true; } mutex_exit(&rt_free_global.lock); } #ifdef NET_MPSAFE static void rt_update_wait(void) { mutex_enter(&rt_update_global.lock); while (rt_update_global.ongoing) { dlog(LOG_DEBUG, "%s: waiting lwp=%p\n", __func__, curlwp); cv_wait(&rt_update_global.cv, &rt_update_global.lock); dlog(LOG_DEBUG, "%s: waited lwp=%p\n", __func__, curlwp); } mutex_exit(&rt_update_global.lock); } #endif int rt_update_prepare(struct rtentry *rt) { dlog(LOG_DEBUG, "%s: updating rt=%p lwp=%p\n", __func__, rt, curlwp); RT_WLOCK(); /* If the entry is being destroyed, don't proceed the update. */ if (!ISSET(rt->rt_flags, RTF_UP)) { RT_UNLOCK(); return ESRCH; } rt->rt_flags |= RTF_UPDATING; RT_UNLOCK(); mutex_enter(&rt_update_global.lock); while (rt_update_global.ongoing) { dlog(LOG_DEBUG, "%s: waiting ongoing updating rt=%p lwp=%p\n", __func__, rt, curlwp); cv_wait(&rt_update_global.cv, &rt_update_global.lock); dlog(LOG_DEBUG, "%s: waited ongoing updating rt=%p lwp=%p\n", __func__, rt, curlwp); } rt_update_global.ongoing = true; /* XXX need it to avoid rt_update_wait by updater itself. 
*/ rt_update_global.lwp = curlwp; mutex_exit(&rt_update_global.lock); rt_wait_refcnt("update", rt, 1); rt_wait_psref(rt); return 0; } void rt_update_finish(struct rtentry *rt) { RT_WLOCK(); rt->rt_flags &= ~RTF_UPDATING; RT_UNLOCK(); mutex_enter(&rt_update_global.lock); rt_update_global.ongoing = false; rt_update_global.lwp = NULL; cv_broadcast(&rt_update_global.cv); mutex_exit(&rt_update_global.lock); dlog(LOG_DEBUG, "%s: updated rt=%p lwp=%p\n", __func__, rt, curlwp); } /* * Force a routing table entry to the specified * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. * * N.B.: must be called at splsoftnet */ void rtredirect(const struct sockaddr *dst, const struct sockaddr *gateway, const struct sockaddr *netmask, int flags, const struct sockaddr *src, struct rtentry **rtp) { struct rtentry *rt; int error = 0; uint64_t *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; struct psref psref; /* verify the gateway is directly reachable */ if ((ifa = ifa_ifwithnet_psref(gateway, &psref)) == NULL) { error = ENETUNREACH; goto out; } rt = rtalloc1(dst, 0); /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!(flags & RTF_DONE) && rt && (sockaddr_cmp(src, rt->rt_gateway) != 0 || rt->rt_ifa != ifa)) error = EINVAL; else { int s = pserialize_read_enter(); struct ifaddr *_ifa; _ifa = ifa_ifwithaddr(gateway); if (_ifa != NULL) error = EHOSTUNREACH; pserialize_read_exit(s); } if (error) goto done; /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables. */ if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) goto create; /* * Don't listen to the redirect if it's * for a route to an interface. */ if (rt->rt_flags & RTF_GATEWAY) { if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. * Create new route, rather than smashing route to net. */ create: if (rt != NULL) rt_unref(rt); flags |= RTF_GATEWAY | RTF_DYNAMIC; memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_ifa = ifa; info.rti_flags = flags; rt = NULL; error = rtrequest1(RTM_ADD, &info, &rt); if (rt != NULL) flags = rt->rt_flags; if (error == 0) rt_newmsg_dynamic(RTM_ADD, rt); stat = &rtstat.rts_dynamic; } else { /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ #ifdef NET_MPSAFE KASSERT(!cpu_softintr_p()); error = rt_update_prepare(rt); if (error == 0) { #endif RT_WLOCK(); error = rt_setgate(rt, gateway); if (error == 0) { rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; } RT_UNLOCK(); #ifdef NET_MPSAFE rt_update_finish(rt); } else { /* * If error != 0, the rtentry is being * destroyed, so doing nothing doesn't * matter. 
*/ } #endif stat = &rtstat.rts_newgateway; } } else error = EHOSTUNREACH; done: if (rt) { if (rtp != NULL && !error) *rtp = rt; else rt_unref(rt); } out: if (error) rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg(RTM_REDIRECT, &info, flags, error); ifa_release(ifa, &psref); } /* * Delete a route and generate a message. * It doesn't free a passed rt. */ static int rtdeletemsg(struct rtentry *rt) { int error; struct rt_addrinfo info; struct rtentry *retrt; /* * Request the new route so that the entry is not actually * deleted. That will allow the information being reported to * be accurate (and consistent with route_output()). */ memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = rt_getkey(rt); info.rti_info[RTAX_NETMASK] = rt_mask(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_flags = rt->rt_flags; error = rtrequest1(RTM_DELETE, &info, &retrt); rt_missmsg(RTM_DELETE, &info, info.rti_flags, error); return error; } static struct ifaddr * ifa_ifwithroute_psref(int flags, const struct sockaddr *dst, const struct sockaddr *gateway, struct psref *psref) { struct ifaddr *ifa = NULL; if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ if ((flags & RTF_HOST) && gateway->sa_family != AF_LINK) ifa = ifa_ifwithdstaddr_psref(dst, psref); if (ifa == NULL) ifa = ifa_ifwithaddr_psref(gateway, psref); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ ifa = ifa_ifwithdstaddr_psref(gateway, psref); } if (ifa == NULL) ifa = ifa_ifwithnet_psref(gateway, psref); if (ifa == NULL) { int s; struct rtentry *rt; rt = rtalloc1_locked(gateway, 0, true, true); if (rt == NULL) return NULL; if (rt->rt_flags & RTF_GATEWAY) { rt_unref(rt); return NULL; } /* * Just in case. May not need to do this workaround. * Revisit when working on rtentry MP-ification. */ s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, rt->rt_ifp) { if (ifa == rt->rt_ifa) break; } if (ifa != NULL) ifa_acquire(ifa, psref); pserialize_read_exit(s); rt_unref(rt); if (ifa == NULL) return NULL; } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *nifa; int s; s = pserialize_read_enter(); nifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (nifa != NULL) { ifa_release(ifa, psref); ifa_acquire(nifa, psref); ifa = nifa; } pserialize_read_exit(s); } return ifa; } /* * If it suceeds and ret_nrt isn't NULL, refcnt of ret_nrt is incremented. * The caller has to rtfree it by itself. 
*/ int rtrequest(int req, const struct sockaddr *dst, const struct sockaddr *gateway, const struct sockaddr *netmask, int flags, struct rtentry **ret_nrt) { struct rt_addrinfo info; memset(&info, 0, sizeof(info)); info.rti_flags = flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1(req, &info, ret_nrt); } static struct ifnet * rt_getifp(struct rt_addrinfo *info, struct psref *psref) { const struct sockaddr *ifpaddr = info->rti_info[RTAX_IFP]; if (info->rti_ifp != NULL) return NULL; /* * ifp may be specified by sockaddr_dl when protocol address * is ambiguous */ if (ifpaddr != NULL && ifpaddr->sa_family == AF_LINK) { struct ifaddr *ifa; int s = pserialize_read_enter(); ifa = ifa_ifwithnet(ifpaddr); if (ifa != NULL) info->rti_ifp = if_get_byindex(ifa->ifa_ifp->if_index, psref); pserialize_read_exit(s); } return info->rti_ifp; } static struct ifaddr * rt_getifa(struct rt_addrinfo *info, struct psref *psref) { struct ifaddr *ifa = NULL; const struct sockaddr *dst = info->rti_info[RTAX_DST]; const struct sockaddr *gateway = info->rti_info[RTAX_GATEWAY]; const struct sockaddr *ifaaddr = info->rti_info[RTAX_IFA]; int flags = info->rti_flags; const struct sockaddr *sa; if (info->rti_ifa == NULL && ifaaddr != NULL) { ifa = ifa_ifwithaddr_psref(ifaaddr, psref); if (ifa != NULL) goto got; } sa = ifaaddr != NULL ? ifaaddr : (gateway != NULL ? gateway : dst); if (sa != NULL && info->rti_ifp != NULL) ifa = ifaof_ifpforaddr_psref(sa, info->rti_ifp, psref); else if (dst != NULL && gateway != NULL) ifa = ifa_ifwithroute_psref(flags, dst, gateway, psref); else if (sa != NULL) ifa = ifa_ifwithroute_psref(flags, sa, sa, psref); if (ifa == NULL) return NULL; got: if (ifa->ifa_getifa != NULL) { /* FIXME ifa_getifa is NOMPSAFE */ ifa = (*ifa->ifa_getifa)(ifa, dst); if (ifa == NULL) return NULL; ifa_acquire(ifa, psref); } info->rti_ifa = ifa; if (info->rti_ifp == NULL) info->rti_ifp = ifa->ifa_ifp; return ifa; } /* * If it suceeds and ret_nrt isn't NULL, refcnt of ret_nrt is incremented. * The caller has to rtfree it by itself. 
*/ int rtrequest1(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt) { int s = splsoftnet(), ss; int error = 0, rc; struct rtentry *rt; rtbl_t *rtbl; struct ifaddr *ifa = NULL; struct sockaddr_storage maskeddst; const struct sockaddr *dst = info->rti_info[RTAX_DST]; const struct sockaddr *gateway = info->rti_info[RTAX_GATEWAY]; const struct sockaddr *netmask = info->rti_info[RTAX_NETMASK]; int flags = info->rti_flags; struct psref psref_ifp, psref_ifa; int bound = 0; struct ifnet *ifp = NULL; bool need_to_release_ifa = true; bool need_unlock = true; #define senderr(x) { error = x ; goto bad; } RT_WLOCK(); bound = curlwp_bind(); if ((rtbl = rt_gettable(dst->sa_family)) == NULL) senderr(ESRCH); if (flags & RTF_HOST) netmask = NULL; switch (req) { case RTM_DELETE: if (netmask) { rt_maskedcopy(dst, (struct sockaddr *)&maskeddst, netmask); dst = (struct sockaddr *)&maskeddst; } if ((rt = rt_lookup(rtbl, dst, netmask)) == NULL) senderr(ESRCH); if ((rt = rt_deladdr(rtbl, dst, netmask)) == NULL) senderr(ESRCH); rt->rt_flags &= ~RTF_UP; ifa = rt->rt_ifa; if (ifa->ifa_flags & IFA_ROUTE && rt_ifa_connected(rt, ifa)) { RT_DPRINTF("rt->_rt_key = %p, ifa = %p, " "deleted IFA_ROUTE\n", (void *)rt->_rt_key, (void *)ifa); ifa->ifa_flags &= ~IFA_ROUTE; } if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(RTM_DELETE, rt, info); ifa = NULL; rttrash++; if (ret_nrt) { *ret_nrt = rt; rt_ref(rt); RT_REFCNT_TRACE(rt); } rtcache_invalidate(); RT_UNLOCK(); need_unlock = false; rt_timer_remove_all(rt); #if defined(INET) || defined(INET6) if (netmask != NULL) lltable_prefix_free(dst->sa_family, dst, netmask, 0); #endif if (ret_nrt == NULL) { /* Adjust the refcount */ rt_ref(rt); RT_REFCNT_TRACE(rt); rt_free(rt); } break; case RTM_ADD: if (info->rti_ifa == NULL) { ifp = rt_getifp(info, &psref_ifp); ifa = rt_getifa(info, &psref_ifa); if (ifa == NULL) senderr(ENETUNREACH); } else { /* Caller should have a reference of ifa */ ifa = info->rti_ifa; need_to_release_ifa = false; } rt = pool_get(&rtentry_pool, PR_NOWAIT); if (rt == NULL) senderr(ENOBUFS); memset(rt, 0, sizeof(*rt)); rt->rt_flags = RTF_UP | (flags & ~RTF_DONTCHANGEIFA); LIST_INIT(&rt->rt_timer); RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key); if (netmask) { rt_maskedcopy(dst, (struct sockaddr *)&maskeddst, netmask); rt_setkey(rt, (struct sockaddr *)&maskeddst, M_NOWAIT); } else { rt_setkey(rt, dst, M_NOWAIT); } RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key); if (rt_getkey(rt) == NULL || rt_setgate(rt, gateway) != 0) { pool_put(&rtentry_pool, rt); senderr(ENOBUFS); } rt_set_ifa(rt, ifa); if (info->rti_info[RTAX_TAG] != NULL) { const struct sockaddr *tag; tag = rt_settag(rt, info->rti_info[RTAX_TAG]); if (tag == NULL) senderr(ENOBUFS); } RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key); ss = pserialize_read_enter(); if (info->rti_info[RTAX_IFP] != NULL) { struct ifaddr *ifa2; ifa2 = ifa_ifwithnet(info->rti_info[RTAX_IFP]); if (ifa2 != NULL) rt->rt_ifp = ifa2->ifa_ifp; else rt->rt_ifp = ifa->ifa_ifp; } else rt->rt_ifp = ifa->ifa_ifp; pserialize_read_exit(ss); cv_init(&rt->rt_cv, "rtentry"); psref_target_init(&rt->rt_psref, rt_psref_class); RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key); rc = rt_addaddr(rtbl, rt, netmask); RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key); if (rc != 0) { ifafree(ifa); /* for rt_set_ifa above */ cv_destroy(&rt->rt_cv); rt_destroy(rt); pool_put(&rtentry_pool, rt); senderr(rc); } RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key); if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, info); if 
(need_to_release_ifa) ifa_release(ifa, &psref_ifa); ifa = NULL; if_put(ifp, &psref_ifp); ifp = NULL; RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key); if (ret_nrt) { *ret_nrt = rt; rt_ref(rt); RT_REFCNT_TRACE(rt); } rtcache_invalidate(); RT_UNLOCK(); need_unlock = false; break; case RTM_GET: if (netmask != NULL) { rt_maskedcopy(dst, (struct sockaddr *)&maskeddst, netmask); dst = (struct sockaddr *)&maskeddst; } if ((rt = rt_lookup(rtbl, dst, netmask)) == NULL) senderr(ESRCH); if (ret_nrt != NULL) { *ret_nrt = rt; rt_ref(rt); RT_REFCNT_TRACE(rt); } break; } bad: if (need_to_release_ifa) ifa_release(ifa, &psref_ifa); if_put(ifp, &psref_ifp); curlwp_bindx(bound); if (need_unlock) RT_UNLOCK(); splx(s); return error; } int rt_setgate(struct rtentry *rt, const struct sockaddr *gate) { struct sockaddr *new, *old; KASSERT(RT_WLOCKED()); KASSERT(rt->_rt_key != NULL); RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key); new = sockaddr_dup(gate, M_ZERO | M_NOWAIT); if (new == NULL) return ENOMEM; old = rt->rt_gateway; rt->rt_gateway = new; if (old != NULL) sockaddr_free(old); KASSERT(rt->_rt_key != NULL); RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key); if (rt->rt_flags & RTF_GATEWAY) { struct rtentry *gwrt; gwrt = rtalloc1_locked(gate, 1, false, true); /* * If we switched gateways, grab the MTU from the new * gateway route if the current MTU, if the current MTU is * greater than the MTU of gateway. * Note that, if the MTU of gateway is 0, we will reset the * MTU of the route to run PMTUD again from scratch. XXX */ if (gwrt != NULL) { KASSERT(gwrt->_rt_key != NULL); RT_DPRINTF("gwrt->_rt_key = %p\n", gwrt->_rt_key); if ((rt->rt_rmx.rmx_locks & RTV_MTU) == 0 && rt->rt_rmx.rmx_mtu && rt->rt_rmx.rmx_mtu > gwrt->rt_rmx.rmx_mtu) { rt->rt_rmx.rmx_mtu = gwrt->rt_rmx.rmx_mtu; } rt_unref(gwrt); } } KASSERT(rt->_rt_key != NULL); RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key); return 0; } static struct ifaddr * rt_update_get_ifa(const struct rt_addrinfo *info, const struct rtentry *rt, struct ifnet **ifp, struct psref *psref_ifp, struct psref *psref) { struct ifaddr *ifa = NULL; *ifp = NULL; if (info->rti_info[RTAX_IFP] != NULL) { ifa = ifa_ifwithnet_psref(info->rti_info[RTAX_IFP], psref); if (ifa == NULL) goto next; if (ifa->ifa_ifp->if_flags & IFF_UNNUMBERED) { ifa_release(ifa, psref); ifa = NULL; goto next; } *ifp = ifa->ifa_ifp; if_acquire(*ifp, psref_ifp); if (info->rti_info[RTAX_IFA] == NULL && info->rti_info[RTAX_GATEWAY] == NULL) goto out; ifa_release(ifa, psref); if (info->rti_info[RTAX_IFA] == NULL) { /* route change <dst> <gw> -ifp <if> */ ifa = ifaof_ifpforaddr_psref( info->rti_info[RTAX_GATEWAY], *ifp, psref); } else { /* route change <dst> -ifp <if> -ifa <addr> */ ifa = ifa_ifwithaddr_psref(info->rti_info[RTAX_IFA], psref); if (ifa != NULL) goto out; ifa = ifaof_ifpforaddr_psref(info->rti_info[RTAX_IFA], *ifp, psref); } goto out; } next: if (info->rti_info[RTAX_IFA] != NULL) { /* route change <dst> <gw> -ifa <addr> */ ifa = ifa_ifwithaddr_psref(info->rti_info[RTAX_IFA], psref); if (ifa != NULL) goto out; } if (info->rti_info[RTAX_GATEWAY] != NULL) { /* route change <dst> <gw> */ ifa = ifa_ifwithroute_psref(rt->rt_flags, rt_getkey(rt), info->rti_info[RTAX_GATEWAY], psref); } out: if (ifa != NULL && *ifp == NULL) { *ifp = ifa->ifa_ifp; if_acquire(*ifp, psref_ifp); } if (ifa == NULL && *ifp != NULL) { if_put(*ifp, psref_ifp); *ifp = NULL; } return ifa; } int rt_update(struct rtentry *rt, struct rt_addrinfo *info, void *rtm) { int error = 0; struct ifnet *ifp = NULL, *new_ifp = 
NULL; struct ifaddr *ifa = NULL, *new_ifa; struct psref psref_ifa, psref_new_ifa, psref_ifp, psref_new_ifp; bool newgw, ifp_changed = false; RT_WLOCK(); /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ newgw = info->rti_info[RTAX_GATEWAY] != NULL && sockaddr_cmp(info->rti_info[RTAX_GATEWAY], rt->rt_gateway) != 0; if (newgw || info->rti_info[RTAX_IFP] != NULL || info->rti_info[RTAX_IFA] != NULL) { ifp = rt_getifp(info, &psref_ifp); /* info refers ifp so we need to keep a reference */ ifa = rt_getifa(info, &psref_ifa); if (ifa == NULL) { error = ENETUNREACH; goto out; } } if (newgw) { error = rt_setgate(rt, info->rti_info[RTAX_GATEWAY]); if (error != 0) goto out; } if (info->rti_info[RTAX_TAG]) { const struct sockaddr *tag; tag = rt_settag(rt, info->rti_info[RTAX_TAG]); if (tag == NULL) { error = ENOBUFS; goto out; } } /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ new_ifa = rt_update_get_ifa(info, rt, &new_ifp, &psref_new_ifp, &psref_new_ifa); if (new_ifa != NULL) { ifa_release(ifa, &psref_ifa); ifa = new_ifa; } if (ifa) { struct ifaddr *oifa = rt->rt_ifa; if (oifa != ifa && !ifa_is_destroying(ifa) && new_ifp != NULL && !if_is_deactivated(new_ifp)) { if (oifa && oifa->ifa_rtrequest) oifa->ifa_rtrequest(RTM_DELETE, rt, info); rt_replace_ifa(rt, ifa); rt->rt_ifp = new_ifp; ifp_changed = true; } if (new_ifa == NULL) ifa_release(ifa, &psref_ifa); /* To avoid ifa_release below */ ifa = NULL; } ifa_release(new_ifa, &psref_new_ifa); if (new_ifp && rt->rt_ifp != new_ifp && !if_is_deactivated(new_ifp)) { rt->rt_ifp = new_ifp; ifp_changed = true; } rt_setmetrics(rtm, rt); if (rt->rt_flags != info->rti_flags) { rt->rt_flags = (info->rti_flags & ~PRESERVED_RTF) | (rt->rt_flags & PRESERVED_RTF); } if (rt->rt_ifa->ifa_rtrequest) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); #if defined(INET) || defined(INET6) if (ifp_changed && rt_mask(rt) != NULL) lltable_prefix_free(rt_getkey(rt)->sa_family, rt_getkey(rt), rt_mask(rt), 0); #else (void)ifp_changed; /* XXX gcc */ #endif out: ifa_release(ifa, &psref_ifa); if_put(new_ifp, &psref_new_ifp); if_put(ifp, &psref_ifp); RT_UNLOCK(); return error; } static void rt_maskedcopy(const struct sockaddr *src, struct sockaddr *dst, const struct sockaddr *netmask) { const char *netmaskp = &netmask->sa_data[0], *srcp = &src->sa_data[0]; char *dstp = &dst->sa_data[0]; const char *maskend = (char *)dst + MIN(netmask->sa_len, src->sa_len); const char *srcend = (char *)dst + src->sa_len; dst->sa_len = src->sa_len; dst->sa_family = src->sa_family; while (dstp < maskend) *dstp++ = *srcp++ & *netmaskp++; if (dstp < srcend) memset(dstp, 0, (size_t)(srcend - dstp)); } /* * Inform the routing socket of a route change. */ void rt_newmsg(const int cmd, const struct rtentry *rt) { struct rt_addrinfo info; memset((void *)&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = rt_getkey(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); if (rt->rt_ifp) { info.rti_info[RTAX_IFP] = rt->rt_ifp->if_dl->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; } rt_missmsg(cmd, &info, rt->rt_flags, 0); } /* * Inform the routing socket of a route change for RTF_DYNAMIC. 
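 * Such a message is generated only when the matching flag checked below
 * (icmp_dynamic_rt_msg for IPv4, icmp6_dynamic_rt_msg for IPv6) is enabled;
 * otherwise the function returns without reporting anything.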
*/ void rt_newmsg_dynamic(const int cmd, const struct rtentry *rt) { struct rt_addrinfo info; struct sockaddr *gateway = rt->rt_gateway; if (gateway == NULL) return; switch(gateway->sa_family) { #ifdef INET case AF_INET: { extern bool icmp_dynamic_rt_msg; if (!icmp_dynamic_rt_msg) return; break; } #endif #ifdef INET6 case AF_INET6: { extern bool icmp6_dynamic_rt_msg; if (!icmp6_dynamic_rt_msg) return; break; } #endif default: return; } memset((void *)&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = rt_getkey(rt); info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); if (rt->rt_ifp) { info.rti_info[RTAX_IFP] = rt->rt_ifp->if_dl->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; } rt_missmsg(cmd, &info, rt->rt_flags, 0); } /* * Set up or tear down a routing table entry, normally * for an interface. */ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct rtentry *rt; struct sockaddr *dst, *odst; struct sockaddr_storage maskeddst; struct rtentry *nrt = NULL; int error; struct rt_addrinfo info; dst = flags & RTF_HOST ? ifa->ifa_dstaddr : ifa->ifa_addr; if (cmd == RTM_DELETE) { if ((flags & RTF_HOST) == 0 && ifa->ifa_netmask) { /* Delete subnet route for this interface */ odst = dst; dst = (struct sockaddr *)&maskeddst; rt_maskedcopy(odst, dst, ifa->ifa_netmask); } if ((rt = rtalloc1(dst, 0)) != NULL) { if (rt->rt_ifa != ifa) { rt_unref(rt); return (flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH; } rt_unref(rt); } } memset(&info, 0, sizeof(info)); info.rti_ifa = ifa; info.rti_flags = flags | ifa->ifa_flags | RTF_DONTCHANGEIFA; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; /* * XXX here, it seems that we are assuming that ifa_netmask is NULL * for RTF_HOST. bsdi4 passes NULL explicitly (via intermediate * variable) when RTF_HOST is 1. still not sure if i can safely * change it to meet bsdi4 behavior. */ if (cmd != RTM_LLINFO_UPD) info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; error = rtrequest1((cmd == RTM_LLINFO_UPD) ? RTM_GET : cmd, &info, &nrt); if (error != 0) return error; rt = nrt; RT_REFCNT_TRACE(rt); switch (cmd) { case RTM_DELETE: rt_newmsg(cmd, rt); rt_free(rt); break; case RTM_LLINFO_UPD: if (cmd == RTM_LLINFO_UPD && ifa->ifa_rtrequest != NULL) ifa->ifa_rtrequest(RTM_LLINFO_UPD, rt, &info); rt_newmsg(RTM_CHANGE, rt); rt_unref(rt); break; case RTM_ADD: KASSERT(rt->rt_ifa == ifa); rt_newmsg(cmd, rt); rt_unref(rt); RT_REFCNT_TRACE(rt); break; } return error; } /* * Create a local route entry for the address. * Announce the addition of the address and the route to the routing socket. */ int rt_ifa_addlocal(struct ifaddr *ifa) { struct rtentry *rt; int e; /* If there is no loopback entry, allocate one. */ rt = rtalloc1(ifa->ifa_addr, 0); #ifdef RT_DEBUG if (rt != NULL) dump_rt(rt); #endif if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0 || (rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { struct rt_addrinfo info; struct rtentry *nrt; memset(&info, 0, sizeof(info)); info.rti_flags = RTF_HOST | RTF_LOCAL | RTF_DONTCHANGEIFA; info.rti_info[RTAX_DST] = ifa->ifa_addr; info.rti_info[RTAX_GATEWAY] = (const struct sockaddr *)ifa->ifa_ifp->if_sadl; info.rti_ifa = ifa; nrt = NULL; e = rtrequest1(RTM_ADD, &info, &nrt); rt_addrmsg_rt(RTM_ADD, ifa, e, nrt); if (nrt != NULL) { KASSERT(nrt->rt_ifa == ifa); #ifdef RT_DEBUG dump_rt(nrt); #endif rt_unref(nrt); RT_REFCNT_TRACE(nrt); } } else { e = 0; rt_addrmsg(RTM_NEWADDR, ifa); } if (rt != NULL) rt_unref(rt); return e; } /* * Remove the local route entry for the address. 
* Announce the removal of the address and the route to the routing socket. */ int rt_ifa_remlocal(struct ifaddr *ifa, struct ifaddr *alt_ifa) { struct rtentry *rt; int e = 0; rt = rtalloc1(ifa->ifa_addr, 0); /* * Before deleting, check if a corresponding loopbacked * host route surely exists. With this check, we can avoid * deleting an interface direct route whose destination is * the same as the address being removed. This can happen * when removing a subnet-router anycast address on an * interface attached to a shared medium. */ if (rt != NULL && (rt->rt_flags & RTF_HOST) && (rt->rt_ifp->if_flags & IFF_LOOPBACK)) { /* If we cannot replace the route's ifaddr with the equivalent * ifaddr of another interface, I believe it is safest to * delete the route. */ if (alt_ifa == NULL) { e = rtdeletemsg(rt); if (e == 0) { rt_unref(rt); rt_free(rt); rt = NULL; } rt_addrmsg(RTM_DELADDR, ifa); } else { #ifdef NET_MPSAFE int error = rt_update_prepare(rt); if (error == 0) { rt_replace_ifa(rt, alt_ifa); rt_update_finish(rt); } else { /* * If error != 0, the rtentry is being * destroyed, so doing nothing doesn't * matter. */ } #else rt_replace_ifa(rt, alt_ifa); #endif rt_newmsg(RTM_CHANGE, rt); } } else rt_addrmsg(RTM_DELADDR, ifa); if (rt != NULL) rt_unref(rt); return e; } /* * Route timer routines. These routes allow functions to be called * for various routes at any time. This is useful in supporting * path MTU discovery and redirect route deletion. * * This is similar to some BSDI internal functions, but it provides * for multiple queues for efficiency's sake... */ LIST_HEAD(, rttimer_queue) rttimer_queue_head; static int rt_init_done = 0; /* * Some subtle order problems with domain initialization mean that * we cannot count on this being run from rt_init before various * protocol initializations are done. Therefore, we make sure * that this is run when the first queue is added... 
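 * (rt_timer_queue_create() below performs that lazy setup: it calls
 * rt_timer_init() on first use, keyed off the rt_init_done flag.)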
*/ static void rt_timer_work(struct work *, void *); static void rt_timer_init(void) { int error; assert(rt_init_done == 0); /* XXX should be in rt_init */ rw_init(&rt_lock); LIST_INIT(&rttimer_queue_head); callout_init(&rt_timer_ch, CALLOUT_MPSAFE); error = workqueue_create(&rt_timer_wq, "rt_timer", rt_timer_work, NULL, PRI_SOFTNET, IPL_SOFTNET, RT_WQ_FLAGS); if (error) panic("%s: workqueue_create failed (%d)\n", __func__, error); callout_reset(&rt_timer_ch, hz, rt_timer_timer, NULL); rt_init_done = 1; } struct rttimer_queue * rt_timer_queue_create(u_int timeout) { struct rttimer_queue *rtq; if (rt_init_done == 0) rt_timer_init(); R_Malloc(rtq, struct rttimer_queue *, sizeof *rtq); if (rtq == NULL) return NULL; memset(rtq, 0, sizeof(*rtq)); rtq->rtq_timeout = timeout; TAILQ_INIT(&rtq->rtq_head); RT_WLOCK(); LIST_INSERT_HEAD(&rttimer_queue_head, rtq, rtq_link); RT_UNLOCK(); return rtq; } void rt_timer_queue_change(struct rttimer_queue *rtq, long timeout) { rtq->rtq_timeout = timeout; } static void rt_timer_queue_remove_all(struct rttimer_queue *rtq) { struct rttimer *r; RT_ASSERT_WLOCK(); while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL) { LIST_REMOVE(r, rtt_link); TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next); rt_ref(r->rtt_rt); /* XXX */ RT_REFCNT_TRACE(r->rtt_rt); RT_UNLOCK(); (*r->rtt_func)(r->rtt_rt, r); pool_put(&rttimer_pool, r); RT_WLOCK(); if (rtq->rtq_count > 0) rtq->rtq_count--; else printf("rt_timer_queue_remove_all: " "rtq_count reached 0\n"); } } void rt_timer_queue_destroy(struct rttimer_queue *rtq) { RT_WLOCK(); rt_timer_queue_remove_all(rtq); LIST_REMOVE(rtq, rtq_link); RT_UNLOCK(); /* * Caller is responsible for freeing the rttimer_queue structure. */ } unsigned long rt_timer_count(struct rttimer_queue *rtq) { return rtq->rtq_count; } static void rt_timer_remove_all(struct rtentry *rt) { struct rttimer *r; RT_WLOCK(); while ((r = LIST_FIRST(&rt->rt_timer)) != NULL) { LIST_REMOVE(r, rtt_link); TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next); if (r->rtt_queue->rtq_count > 0) r->rtt_queue->rtq_count--; else printf("rt_timer_remove_all: rtq_count reached 0\n"); pool_put(&rttimer_pool, r); } RT_UNLOCK(); } int rt_timer_add(struct rtentry *rt, void (*func)(struct rtentry *, struct rttimer *), struct rttimer_queue *queue) { struct rttimer *r; KASSERT(func != NULL); RT_WLOCK(); /* * If there's already a timer with this action, destroy it before * we add a new one. */ LIST_FOREACH(r, &rt->rt_timer, rtt_link) { if (r->rtt_func == func) break; } if (r != NULL) { LIST_REMOVE(r, rtt_link); TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next); if (r->rtt_queue->rtq_count > 0) r->rtt_queue->rtq_count--; else printf("rt_timer_add: rtq_count reached 0\n"); } else { r = pool_get(&rttimer_pool, PR_NOWAIT); if (r == NULL) { RT_UNLOCK(); return ENOBUFS; } } memset(r, 0, sizeof(*r)); r->rtt_rt = rt; r->rtt_time = time_uptime; r->rtt_func = func; r->rtt_queue = queue; LIST_INSERT_HEAD(&rt->rt_timer, r, rtt_link); TAILQ_INSERT_TAIL(&queue->rtq_head, r, rtt_next); r->rtt_queue->rtq_count++; RT_UNLOCK(); return 0; } static void rt_timer_work(struct work *wk, void *arg) { struct rttimer_queue *rtq; struct rttimer *r; RT_WLOCK(); LIST_FOREACH(rtq, &rttimer_queue_head, rtq_link) { while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL && (r->rtt_time + rtq->rtq_timeout) < time_uptime) { LIST_REMOVE(r, rtt_link); TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next); /* * Take a reference to avoid the rtentry is freed * accidentally after RT_UNLOCK. The callback * (rtt_func) must rt_unref it by itself. 
*/ rt_ref(r->rtt_rt); RT_REFCNT_TRACE(r->rtt_rt); RT_UNLOCK(); (*r->rtt_func)(r->rtt_rt, r); pool_put(&rttimer_pool, r); RT_WLOCK(); if (rtq->rtq_count > 0) rtq->rtq_count--; else printf("rt_timer_timer: rtq_count reached 0\n"); } } RT_UNLOCK(); callout_reset(&rt_timer_ch, hz, rt_timer_timer, NULL); } static void rt_timer_timer(void *arg) { workqueue_enqueue(rt_timer_wq, &rt_timer_wk, NULL); } static struct rtentry * _rtcache_init(struct route *ro, int flag) { struct rtentry *rt; rtcache_invariants(ro); KASSERT(ro->_ro_rt == NULL); if (rtcache_getdst(ro) == NULL) return NULL; rt = rtalloc1(rtcache_getdst(ro), flag); if (rt != NULL) { RT_RLOCK(); if (ISSET(rt->rt_flags, RTF_UP)) { ro->_ro_rt = rt; ro->ro_rtcache_generation = rtcache_generation; rtcache_ref(rt, ro); } RT_UNLOCK(); rt_unref(rt); } rtcache_invariants(ro); return ro->_ro_rt; } struct rtentry * rtcache_init(struct route *ro) { return _rtcache_init(ro, 1); } struct rtentry * rtcache_init_noclone(struct route *ro) { return _rtcache_init(ro, 0); } struct rtentry * rtcache_update(struct route *ro, int clone) { ro->_ro_rt = NULL; return _rtcache_init(ro, clone); } void rtcache_copy(struct route *new_ro, struct route *old_ro) { struct rtentry *rt; int ret; KASSERT(new_ro != old_ro); rtcache_invariants(new_ro); rtcache_invariants(old_ro); rt = rtcache_validate(old_ro); if (rtcache_getdst(old_ro) == NULL) goto out; ret = rtcache_setdst(new_ro, rtcache_getdst(old_ro)); if (ret != 0) goto out; RT_RLOCK(); new_ro->_ro_rt = rt; new_ro->ro_rtcache_generation = rtcache_generation; RT_UNLOCK(); rtcache_invariants(new_ro); out: rtcache_unref(rt, old_ro); return; } #if defined(RT_DEBUG) && defined(NET_MPSAFE) static void rtcache_trace(const char *func, struct rtentry *rt, struct route *ro) { char dst[64]; sockaddr_format(ro->ro_sa, dst, 64); printf("trace: %s:\tdst=%s cpu=%d lwp=%p psref=%p target=%p\n", func, dst, cpu_index(curcpu()), curlwp, &ro->ro_psref, &rt->rt_psref); } #define RTCACHE_PSREF_TRACE(rt, ro) rtcache_trace(__func__, (rt), (ro)) #else #define RTCACHE_PSREF_TRACE(rt, ro) do {} while (0) #endif static void rtcache_ref(struct rtentry *rt, struct route *ro) { KASSERT(rt != NULL); #ifdef NET_MPSAFE RTCACHE_PSREF_TRACE(rt, ro); ro->ro_bound = curlwp_bind(); /* XXX Use a real caller's address */ PSREF_DEBUG_FILL_RETURN_ADDRESS(&ro->ro_psref); psref_acquire(&ro->ro_psref, &rt->rt_psref, rt_psref_class); #endif } void rtcache_unref(struct rtentry *rt, struct route *ro) { if (rt == NULL) return; #ifdef NET_MPSAFE psref_release(&ro->ro_psref, &rt->rt_psref, rt_psref_class); curlwp_bindx(ro->ro_bound); RTCACHE_PSREF_TRACE(rt, ro); #endif } struct rtentry * rtcache_validate(struct route *ro) { struct rtentry *rt = NULL; #ifdef NET_MPSAFE retry: #endif rtcache_invariants(ro); RT_RLOCK(); if (ro->ro_rtcache_generation != rtcache_generation) { /* The cache is invalidated */ rt = NULL; goto out; } rt = ro->_ro_rt; if (rt == NULL) goto out; if ((rt->rt_flags & RTF_UP) == 0) { rt = NULL; goto out; } #ifdef NET_MPSAFE if (ISSET(rt->rt_flags, RTF_UPDATING)) { if (rt_wait_ok()) { RT_UNLOCK(); /* We can wait until the update is complete */ rt_update_wait(); goto retry; } else { rt = NULL; } } else #endif rtcache_ref(rt, ro); out: RT_UNLOCK(); return rt; } struct rtentry * rtcache_lookup2(struct route *ro, const struct sockaddr *dst, int clone, int *hitp) { const struct sockaddr *odst; struct rtentry *rt = NULL; odst = rtcache_getdst(ro); if (odst == NULL) goto miss; if (sockaddr_cmp(odst, dst) != 0) { rtcache_free(ro); goto miss; } rt = 
rtcache_validate(ro); if (rt == NULL) { ro->_ro_rt = NULL; goto miss; } rtcache_invariants(ro); if (hitp != NULL) *hitp = 1; return rt; miss: if (hitp != NULL) *hitp = 0; if (rtcache_setdst(ro, dst) == 0) rt = _rtcache_init(ro, clone); rtcache_invariants(ro); return rt; } void rtcache_free(struct route *ro) { ro->_ro_rt = NULL; if (ro->ro_sa != NULL) { sockaddr_free(ro->ro_sa); ro->ro_sa = NULL; } rtcache_invariants(ro); } int rtcache_setdst(struct route *ro, const struct sockaddr *sa) { KASSERT(sa != NULL); rtcache_invariants(ro); if (ro->ro_sa != NULL) { if (ro->ro_sa->sa_family == sa->sa_family) { ro->_ro_rt = NULL; sockaddr_copy(ro->ro_sa, ro->ro_sa->sa_len, sa); rtcache_invariants(ro); return 0; } /* free ro_sa, wrong family */ rtcache_free(ro); } KASSERT(ro->_ro_rt == NULL); if ((ro->ro_sa = sockaddr_dup(sa, M_ZERO | M_NOWAIT)) == NULL) { rtcache_invariants(ro); return ENOMEM; } rtcache_invariants(ro); return 0; } static void rtcache_percpu_init_cpu(void *p, void *arg __unused, struct cpu_info *ci __unused) { struct route **rop = p; /* * We can't have struct route as percpu data because it can be destroyed * over a memory enlargement processing of percpu. */ *rop = kmem_zalloc(sizeof(**rop), KM_SLEEP); } percpu_t * rtcache_percpu_alloc(void) { return percpu_create(sizeof(struct route *), rtcache_percpu_init_cpu, NULL, NULL); } const struct sockaddr * rt_settag(struct rtentry *rt, const struct sockaddr *tag) { if (rt->rt_tag != tag) { if (rt->rt_tag != NULL) sockaddr_free(rt->rt_tag); rt->rt_tag = sockaddr_dup(tag, M_ZERO | M_NOWAIT); } return rt->rt_tag; } struct sockaddr * rt_gettag(const struct rtentry *rt) { return rt->rt_tag; } int rt_check_reject_route(const struct rtentry *rt, const struct ifnet *ifp) { if ((rt->rt_flags & RTF_REJECT) != 0) { /* Mimic looutput */ if (ifp->if_flags & IFF_LOOPBACK) return (rt->rt_flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH; else if (rt->rt_rmx.rmx_expire == 0 || time_uptime < rt->rt_rmx.rmx_expire) return (rt->rt_flags & RTF_GATEWAY) ? EHOSTUNREACH : EHOSTDOWN; } return 0; } void rt_delete_matched_entries(sa_family_t family, int (*f)(struct rtentry *, void *), void *v, bool notify) { for (;;) { int s; int error; struct rtentry *rt, *retrt = NULL; RT_RLOCK(); s = splsoftnet(); rt = rtbl_search_matched_entry(family, f, v); if (rt == NULL) { splx(s); RT_UNLOCK(); return; } rt_ref(rt); RT_REFCNT_TRACE(rt); splx(s); RT_UNLOCK(); error = rtrequest(RTM_DELETE, rt_getkey(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, &retrt); if (error == 0) { KASSERT(retrt == rt); KASSERT((retrt->rt_flags & RTF_UP) == 0); if (notify) rt_newmsg(RTM_DELETE, retrt); retrt->rt_ifp = NULL; rt_unref(rt); RT_REFCNT_TRACE(rt); rt_free(retrt); } else if (error == ESRCH) { /* Someone deleted the entry already. */ rt_unref(rt); RT_REFCNT_TRACE(rt); } else { log(LOG_ERR, "%s: unable to delete rtentry @ %p, " "error = %d\n", rt->rt_ifp->if_xname, rt, error); /* XXX how to treat this case? 
*/ } } } static int rt_walktree_locked(sa_family_t family, int (*f)(struct rtentry *, void *), void *v) { return rtbl_walktree(family, f, v); } void rt_replace_ifa_matched_entries(sa_family_t family, int (*f)(struct rtentry *, void *), void *v, struct ifaddr *ifa) { for (;;) { int s; #ifdef NET_MPSAFE int error; #endif struct rtentry *rt; RT_RLOCK(); s = splsoftnet(); rt = rtbl_search_matched_entry(family, f, v); if (rt == NULL) { splx(s); RT_UNLOCK(); return; } rt_ref(rt); RT_REFCNT_TRACE(rt); splx(s); RT_UNLOCK(); #ifdef NET_MPSAFE error = rt_update_prepare(rt); if (error == 0) { rt_replace_ifa(rt, ifa); rt_update_finish(rt); rt_newmsg(RTM_CHANGE, rt); } else { /* * If error != 0, the rtentry is being * destroyed, so doing nothing doesn't * matter. */ } #else rt_replace_ifa(rt, ifa); rt_newmsg(RTM_CHANGE, rt); #endif rt_unref(rt); RT_REFCNT_TRACE(rt); } } int rt_walktree(sa_family_t family, int (*f)(struct rtentry *, void *), void *v) { int error; RT_RLOCK(); error = rt_walktree_locked(family, f, v); RT_UNLOCK(); return error; } #ifdef DDB #include <machine/db_machdep.h> #include <ddb/db_interface.h> #include <ddb/db_output.h> #define rt_expire rt_rmx.rmx_expire static void db_print_sa(const struct sockaddr *sa) { int len; const u_char *p; if (sa == NULL) { db_printf("[NULL]"); return; } p = (const u_char *)sa; len = sa->sa_len; db_printf("["); while (len > 0) { db_printf("%d", *p); p++; len--; if (len) db_printf(","); } db_printf("]\n"); } static void db_print_ifa(struct ifaddr *ifa) { if (ifa == NULL) return; db_printf(" ifa_addr="); db_print_sa(ifa->ifa_addr); db_printf(" ifa_dsta="); db_print_sa(ifa->ifa_dstaddr); db_printf(" ifa_mask="); db_print_sa(ifa->ifa_netmask); db_printf(" flags=0x%x,refcnt=%d,metric=%d\n", ifa->ifa_flags, ifa->ifa_refcnt, ifa->ifa_metric); } /* * Function to pass to rt_walktree(). * Return non-zero error to abort walk. */ static int db_show_rtentry(struct rtentry *rt, void *w) { db_printf("rtentry=%p", rt); db_printf(" flags=0x%x refcnt=%d use=%"PRId64" expire=%"PRId64"\n", rt->rt_flags, rt->rt_refcnt, rt->rt_use, (uint64_t)rt->rt_expire); db_printf(" key="); db_print_sa(rt_getkey(rt)); db_printf(" mask="); db_print_sa(rt_mask(rt)); db_printf(" gw="); db_print_sa(rt->rt_gateway); db_printf(" ifp=%p ", rt->rt_ifp); if (rt->rt_ifp) db_printf("(%s)", rt->rt_ifp->if_xname); else db_printf("(NULL)"); db_printf(" ifa=%p\n", rt->rt_ifa); db_print_ifa(rt->rt_ifa); db_printf(" gwroute=%p llinfo=%p\n", rt->rt_gwroute, rt->rt_llinfo); return 0; } /* * Function to print all the route trees. * Use this from ddb: "show routes" */ void db_show_routes(db_expr_t addr, bool have_addr, db_expr_t count, const char *modif) { /* Taking RT_LOCK will fail if LOCKDEBUG is enabled. */ rt_walktree_locked(AF_INET, db_show_rtentry, NULL); } #endif
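/*
 * Illustrative sketch (not part of route.c, kept under #if 0): how a caller
 * drives rtrequest1() through a struct rt_addrinfo, mirroring what rtinit()
 * above does.  The function name and the sockaddr arguments are placeholders;
 * on success rtrequest1() hands back a referenced rtentry in *ret_nrt, which
 * the caller must release with rt_unref().
 */
#if 0
static int
example_add_gateway_route(const struct sockaddr *dst,
    const struct sockaddr *gateway, const struct sockaddr *netmask)
{
	struct rt_addrinfo info;
	struct rtentry *rt = NULL;
	int error;

	memset(&info, 0, sizeof(info));
	info.rti_info[RTAX_DST] = dst;
	info.rti_info[RTAX_GATEWAY] = gateway;
	info.rti_info[RTAX_NETMASK] = netmask;	/* NULL when RTF_HOST is set */
	info.rti_flags = RTF_GATEWAY;		/* RTF_UP is added internally */

	error = rtrequest1(RTM_ADD, &info, &rt);
	if (error == 0)
		rt_unref(rt);		/* drop the reference from *ret_nrt */
	return error;
}
#endif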
/* $NetBSD: in4_cksum.c,v 1.20 2014/11/30 18:15:41 christos Exp $ */ /*- * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in4_cksum.c,v 1.20 2014/11/30 18:15:41 christos Exp $"); #include <sys/param.h> #include <sys/mbuf.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> /* * Checksum of the IPv4 pseudo header. * * off is supposed to be the skipped IPv4 header, len is the payload size. */ #ifdef DIAGNOSTIC #define PANIC(a,...) panic(a, __VA_ARGS__) #else #define PANIC(a,...)
do { \ printf(a, __VA_ARGS__); \ return -1; \ } while (/*CONSTCOND*/0) #endif int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len) { uint32_t sum; uint16_t *w; if (__predict_false(m->m_len < sizeof(struct ip))) PANIC("%s: mbuf %d too short for IP header %zu", __func__, m->m_len, sizeof(struct ip)); if (nxt == 0) return cpu_in_cksum(m, len, off, 0); if (__predict_false(off < sizeof(struct ip))) PANIC("%s: offset %d too short for IP header %zu", __func__, off, sizeof(struct ip)); /* * Compute the equivalent of: * struct ipovly ip; * * bzero(sizeof(*ip)); * ip.ih_pr = nxt; * ip.ip_len = htons(len); * ip.ih_src = mtod(m, struct ip *)->ip_src; * ip.ih_dst = mtod(m, struct ip *)->ip_dst; * sum = one_add(&ip); */ #if BYTE_ORDER == LITTLE_ENDIAN sum = ((len & 0xffff) + nxt) << 8; #else sum = (len & 0xffff) + nxt; #endif w = (uint16_t *)(mtod(m, char *) + offsetof(struct ip, ip_src)); if (__predict_true((uintptr_t)w % 2 == 0)) { sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; } else { uint32_t partial; w = (void *)((uintptr_t)w - 1); #if BYTE_ORDER == LITTLE_ENDIAN partial = w[0] & 0xff00; #else partial = w[0] & 0x00ff; #endif partial += w[1]; partial += w[2]; partial += w[3]; #if BYTE_ORDER == LITTLE_ENDIAN partial += w[4] & 0x00ff; #else partial += w[4] & 0xff00; #endif sum += partial << 8; } return cpu_in_cksum(m, len, off, sum); }
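/*
 * Reference sketch (illustrative only, kept under #if 0): the Internet
 * checksum computed above is a 16-bit one's complement sum.  This naive
 * version over an even-length buffer shows the carry folding that the
 * optimized code seeds with the pseudo header before cpu_in_cksum() walks
 * the mbuf chain; the helper name is hypothetical.
 */
#if 0
static uint16_t
example_ones_complement_sum(const uint16_t *words, size_t nwords,
    uint32_t seed)
{
	uint32_t sum = seed;

	while (nwords-- > 0)
		sum += *words++;			/* may overflow 16 bits */
	while (sum > 0xffff)
		sum = (sum & 0xffff) + (sum >> 16);	/* fold carries back in */
	return (uint16_t)~sum;				/* final one's complement */
}
#endif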
/* $NetBSD: if_ether.h,v 1.91 2024/02/05 21:46:06 andvar Exp $ */ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ether.h 8.1 (Berkeley) 6/10/93 */ #ifndef _NET_IF_ETHER_H_ #define _NET_IF_ETHER_H_ #ifdef _KERNEL #ifdef _KERNEL_OPT #include "opt_mbuftrace.h" #endif #include <sys/mbuf.h> #endif #ifndef _STANDALONE #include <net/if.h> #endif /* * Some basic Ethernet constants.
*/ #define ETHER_ADDR_LEN 6 /* length of an Ethernet address */ #define ETHER_TYPE_LEN 2 /* length of the Ethernet type field */ #define ETHER_CRC_LEN 4 /* length of the Ethernet CRC */ #define ETHER_HDR_LEN ((ETHER_ADDR_LEN * 2) + ETHER_TYPE_LEN) #define ETHER_MIN_LEN 64 /* minimum frame length, including CRC */ #define ETHER_MAX_LEN 1518 /* maximum frame length, including CRC */ #define ETHER_MAX_LEN_JUMBO 9018 /* maximum jumbo frame len, including CRC */ /* * Some Ethernet extensions. */ #define ETHER_VLAN_ENCAP_LEN 4 /* length of 802.1Q VLAN encapsulation */ #define EVL_VLANOFTAG(tag) ((tag) & 4095) /* VLAN ID */ #define EVL_PRIOFTAG(tag) (((tag) >> 13) & 7) /* Priority */ #define EVL_CFIOFTAG(tag) (((tag) >> 12) & 1) /* CFI */ #define ETHER_PPPOE_ENCAP_LEN 8 /* length of PPPoE encapsulation */ /* * Mbuf adjust factor to force 32-bit alignment of IP header. * Drivers should do m_adj(m, ETHER_ALIGN) when setting up a * receive so the upper layers get the IP header properly aligned * past the 14-byte Ethernet header. */ #define ETHER_ALIGN 2 /* driver adjust for IP hdr alignment */ /* * Ethernet address - 6 octets * this is only used by the ethers(3) functions. */ struct ether_addr { uint8_t ether_addr_octet[ETHER_ADDR_LEN]; }; /* * Structure of a 10Mb/s Ethernet header. */ struct ether_header { uint8_t ether_dhost[ETHER_ADDR_LEN]; uint8_t ether_shost[ETHER_ADDR_LEN]; uint16_t ether_type; }; #include <net/ethertypes.h> #define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */ #define ETHER_IS_LOCAL(addr) (*(addr) & 0x02) /* is address local? */ #define ETHERMTU_JUMBO (ETHER_MAX_LEN_JUMBO - ETHER_HDR_LEN - ETHER_CRC_LEN) #define ETHERMTU (ETHER_MAX_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN) #define ETHERMIN (ETHER_MIN_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN) /* * Compute the maximum frame size based on ethertype (i.e. possible * encapsulation) and whether or not an FCS is present. */ #define ETHER_MAX_FRAME(ifp, etype, hasfcs) \ ((ifp)->if_mtu + ETHER_HDR_LEN + \ ((hasfcs) ? ETHER_CRC_LEN : 0) + \ (((etype) == ETHERTYPE_VLAN) ? ETHER_VLAN_ENCAP_LEN : 0) + \ (((etype) == ETHERTYPE_PPPOE) ? ETHER_PPPOE_ENCAP_LEN : 0)) /* * Ethernet CRC32 polynomials (big- and little-endian versions). */ #define ETHER_CRC_POLY_LE 0xedb88320 #define ETHER_CRC_POLY_BE 0x04c11db6 #ifndef _STANDALONE /* * Ethernet-specific mbuf flags. */ #define M_HASFCS M_LINK0 /* FCS included at end of frame */ #define M_PROMISC M_LINK1 /* this packet is not for us */ #ifdef _KERNEL /* * Macro to map an IP multicast address to an Ethernet multicast address. * The high-order 25 bits of the Ethernet address are statically assigned, * and the low-order 23 bits are taken from the low end of the IP address. */ #define ETHER_MAP_IP_MULTICAST(ipaddr, enaddr) \ /* const struct in_addr *ipaddr; */ \ /* uint8_t enaddr[ETHER_ADDR_LEN]; */ \ do { \ (enaddr)[0] = 0x01; \ (enaddr)[1] = 0x00; \ (enaddr)[2] = 0x5e; \ (enaddr)[3] = ((const uint8_t *)ipaddr)[1] & 0x7f; \ (enaddr)[4] = ((const uint8_t *)ipaddr)[2]; \ (enaddr)[5] = ((const uint8_t *)ipaddr)[3]; \ } while (/*CONSTCOND*/0) /* * Macro to map an IP6 multicast address to an Ethernet multicast address. * The high-order 16 bits of the Ethernet address are statically assigned, * and the low-order 32 bits are taken from the low end of the IP6 address. 
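 * For example, the all-nodes address ff02::1 maps to 33:33:00:00:00:01.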
*/ #define ETHER_MAP_IPV6_MULTICAST(ip6addr, enaddr) \ /* struct in6_addr *ip6addr; */ \ /* uint8_t enaddr[ETHER_ADDR_LEN]; */ \ { \ (enaddr)[0] = 0x33; \ (enaddr)[1] = 0x33; \ (enaddr)[2] = ((const uint8_t *)ip6addr)[12]; \ (enaddr)[3] = ((const uint8_t *)ip6addr)[13]; \ (enaddr)[4] = ((const uint8_t *)ip6addr)[14]; \ (enaddr)[5] = ((const uint8_t *)ip6addr)[15]; \ } #endif struct mii_data; struct ethercom; typedef int (*ether_cb_t)(struct ethercom *); typedef int (*ether_vlancb_t)(struct ethercom *, uint16_t, bool); /* * Structure shared between the ethernet driver modules and * the multicast list code. For example, each ec_softc or il_softc * begins with this structure. */ struct ethercom { struct ifnet ec_if; /* network-visible interface */ LIST_HEAD(, ether_multi) ec_multiaddrs; /* list of ether multicast addrs */ int ec_multicnt; /* length of ec_multiaddrs list */ int ec_capabilities; /* capabilities, provided by driver */ int ec_capenable; /* tells hardware which capabilities to enable */ int ec_nvlans; /* # VLANs on this interface */ SIMPLEQ_HEAD(, vlanid_list) ec_vids; /* list of VLAN IDs */ /* The device handle for the MII bus child device. */ struct mii_data *ec_mii; struct ifmedia *ec_ifmedia; /* * Called after a change to ec_if.if_flags. Returns * ENETRESET if the device should be reinitialized with * ec_if.if_init, 0 on success, not 0 on failure. */ ether_cb_t ec_ifflags_cb; /* * Called whenever a vlan interface is configured or unconfigured. * Args include the vlan tag and a flag indicating whether the tag is * being added or removed. */ ether_vlancb_t ec_vlan_cb; /* Hooks called at the beginning of detach of this interface */ khook_list_t *ec_ifdetach_hooks; kmutex_t *ec_lock; /* Flags used only by the kernel */ int ec_flags; #ifdef MBUFTRACE struct mowner ec_rx_mowner; /* mbufs received */ struct mowner ec_tx_mowner; /* mbufs transmitted */ #endif }; #define ETHERCAP_VLAN_MTU 0x00000001 /* VLAN-compatible MTU */ #define ETHERCAP_VLAN_HWTAGGING 0x00000002 /* hardware VLAN tag support */ #define ETHERCAP_JUMBO_MTU 0x00000004 /* 9000 byte MTU supported */ #define ETHERCAP_VLAN_HWFILTER 0x00000008 /* iface hw can filter vlan tag */ #define ETHERCAP_EEE 0x00000010 /* Energy Efficiency Ethernet */ #define ETHERCAP_MASK 0x0000001f #define ECCAPBITS \ "\020" \ "\1VLAN_MTU" \ "\2VLAN_HWTAGGING" \ "\3JUMBO_MTU" \ "\4VLAN_HWFILTER" \ "\5EEE" /* ioctl() for Ethernet capabilities */ struct eccapreq { char eccr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ int eccr_capabilities; /* supported capabiliites */ int eccr_capenable; /* capabilities enabled */ }; /* sysctl for Ethernet multicast addresses */ struct ether_multi_sysctl { u_int enm_refcount; uint8_t enm_addrlo[ETHER_ADDR_LEN]; uint8_t enm_addrhi[ETHER_ADDR_LEN]; }; #ifdef _KERNEL /* * Flags for ec_flags */ /* Store IFF_ALLMULTI in ec_flags instead of if_flags to avoid data races. 
*/ #define ETHER_F_ALLMULTI __BIT(0) extern const uint8_t etherbroadcastaddr[ETHER_ADDR_LEN]; extern const uint8_t ethermulticastaddr_slowprotocols[ETHER_ADDR_LEN]; extern const uint8_t ether_ipmulticast_min[ETHER_ADDR_LEN]; extern const uint8_t ether_ipmulticast_max[ETHER_ADDR_LEN]; void ether_set_ifflags_cb(struct ethercom *, ether_cb_t); void ether_set_vlan_cb(struct ethercom *, ether_vlancb_t); int ether_ioctl(struct ifnet *, u_long, void *); int ether_addmulti(const struct sockaddr *, struct ethercom *); int ether_delmulti(const struct sockaddr *, struct ethercom *); int ether_multiaddr(const struct sockaddr *, uint8_t[ETHER_ADDR_LEN], uint8_t[ETHER_ADDR_LEN]); void ether_input(struct ifnet *, struct mbuf *); /* * Ethernet multicast address structure. There is one of these for each * multicast address or range of multicast addresses that we are supposed * to listen to on a particular interface. They are kept in a linked list, * rooted in the interface's ethercom structure. */ struct ether_multi { uint8_t enm_addrlo[ETHER_ADDR_LEN]; /* low or only address of range */ uint8_t enm_addrhi[ETHER_ADDR_LEN]; /* high or only address of range */ u_int enm_refcount; /* no. claims to this addr/range */ LIST_ENTRY(ether_multi) enm_list; }; /* * Structure used by macros below to remember position when stepping through * all of the ether_multi records. */ struct ether_multistep { struct ether_multi *e_enm; }; /* * lookup the ether_multi record for a given range of Ethernet * multicast addresses connected to a given ethercom structure. * If no matching record is found, NULL is returned. */ static __inline struct ether_multi * ether_lookup_multi(const uint8_t *addrlo, const uint8_t *addrhi, const struct ethercom *ec) { struct ether_multi *enm; LIST_FOREACH(enm, &ec->ec_multiaddrs, enm_list) { if (memcmp(enm->enm_addrlo, addrlo, ETHER_ADDR_LEN) != 0) continue; if (memcmp(enm->enm_addrhi, addrhi, ETHER_ADDR_LEN) != 0) continue; break; } return enm; } /* * step through all of the ether_multi records, one at a time. * The current position is remembered in "step", which the caller must * provide. ether_first_multi(), below, must be called to initialize "step" * and get the first record. Both functions return a NULL when there * are no remaining records. */ static __inline struct ether_multi * ether_next_multi(struct ether_multistep *step) { struct ether_multi *enm; enm = step->e_enm; if (enm != NULL) step->e_enm = LIST_NEXT(enm, enm_list); return enm; } #define ETHER_NEXT_MULTI(step, enm) \ /* struct ether_multistep step; */ \ /* struct ether_multi *enm; */ \ (enm) = ether_next_multi(&(step)) static __inline struct ether_multi * ether_first_multi(struct ether_multistep *step, const struct ethercom *ec) { step->e_enm = LIST_FIRST(&ec->ec_multiaddrs); return ether_next_multi(step); } #define ETHER_FIRST_MULTI(step, ec, enm) \ /* struct ether_multistep step; */ \ /* struct ethercom *ec; */ \ /* struct ether_multi *enm; */ \ (enm) = ether_first_multi(&(step), (ec)) #define ETHER_LOCK(ec) mutex_enter((ec)->ec_lock) #define ETHER_UNLOCK(ec) mutex_exit((ec)->ec_lock) /* * Ethernet 802.1Q VLAN structures. 
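 * The vlan_set_tag()/vlan_get_tag() helpers below carry the 802.1Q tag
 * between drivers and the stack via m_pkthdr.ether_vtag together with the
 * M_VLANTAG mbuf flag.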
*/ /* for ethercom */ struct vlanid_list { uint16_t vid; SIMPLEQ_ENTRY(vlanid_list) vid_list; }; /* add VLAN tag to input/received packet */ static __inline void vlan_set_tag(struct mbuf *m, uint16_t vlantag) { /* VLAN tag contains priority, CFI and VLAN ID */ KASSERT((m->m_flags & M_PKTHDR) != 0); m->m_pkthdr.ether_vtag = vlantag; m->m_flags |= M_VLANTAG; return; } /* extract VLAN ID value from a VLAN tag */ static __inline uint16_t vlan_get_tag(struct mbuf *m) { KASSERT((m->m_flags & M_PKTHDR) != 0); KASSERT(m->m_flags & M_VLANTAG); return m->m_pkthdr.ether_vtag; } static __inline bool vlan_has_tag(struct mbuf *m) { return (m->m_flags & M_VLANTAG) != 0; } static __inline bool vlan_is_hwtag_enabled(struct ifnet *_ifp) { struct ethercom *ec = (void *)_ifp; if (ec->ec_capenable & ETHERCAP_VLAN_HWTAGGING) return true; return false; } /* test if any VLAN is configured for this interface */ #define VLAN_ATTACHED(ec) ((ec)->ec_nvlans > 0) void etherinit(void); void ether_ifattach(struct ifnet *, const uint8_t *); void ether_ifdetach(struct ifnet *); int ether_mediachange(struct ifnet *); void ether_mediastatus(struct ifnet *, struct ifmediareq *); void * ether_ifdetachhook_establish(struct ifnet *, void (*)(void *), void *arg); void ether_ifdetachhook_disestablish(struct ifnet *, void *, kmutex_t *); char *ether_sprintf(const uint8_t *); char *ether_snprintf(char *, size_t, const uint8_t *); uint32_t ether_crc32_le(const uint8_t *, size_t); uint32_t ether_crc32_be(const uint8_t *, size_t); int ether_aton_r(u_char *, size_t, const char *); int ether_enable_vlan_mtu(struct ifnet *); int ether_disable_vlan_mtu(struct ifnet *); int ether_add_vlantag(struct ifnet *, uint16_t, bool *); int ether_del_vlantag(struct ifnet *, uint16_t); int ether_inject_vlantag(struct mbuf **, uint16_t, uint16_t); struct mbuf * ether_strip_vlantag(struct mbuf *); #else /* * Prototype ethers(3) functions. */ #include <sys/cdefs.h> __BEGIN_DECLS char * ether_ntoa(const struct ether_addr *); struct ether_addr * ether_aton(const char *); int ether_ntohost(char *, const struct ether_addr *); int ether_hostton(const char *, struct ether_addr *); int ether_line(const char *, struct ether_addr *, char *); __END_DECLS #endif #endif /* _STANDALONE */ #endif /* !_NET_IF_ETHER_H_ */
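/*
 * Illustrative sketch (not part of this header, kept under #if 0): a typical
 * driver walks the multicast list with ETHER_FIRST_MULTI()/ETHER_NEXT_MULTI()
 * under ETHER_LOCK() to program a hardware hash filter.  The 64-bit filter
 * width and the function name are hypothetical; real drivers differ in how
 * they reduce the CRC and in their fallback to all-multicast mode.
 */
#if 0
static void
example_set_multicast_filter(struct ethercom *ec, uint32_t hash[2])
{
	struct ether_multistep step;
	struct ether_multi *enm;
	uint32_t crc;

	hash[0] = hash[1] = 0;
	ETHER_LOCK(ec);
	ETHER_FIRST_MULTI(step, ec, enm);
	while (enm != NULL) {
		if (memcmp(enm->enm_addrlo, enm->enm_addrhi,
		    ETHER_ADDR_LEN) != 0) {
			/* An address range cannot be hashed; accept everything. */
			hash[0] = hash[1] = 0xffffffff;
			break;
		}
		crc = ether_crc32_le(enm->enm_addrlo, ETHER_ADDR_LEN);
		hash[(crc >> 5) & 1] |= 1U << (crc & 0x1f);
		ETHER_NEXT_MULTI(step, enm);
	}
	ETHER_UNLOCK(ec);
}
#endif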
/* $NetBSD: tcp_subr.c,v 1.296 2022/11/04 09:01:53 ozaki-r Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1997, 1998, 2000, 2001, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation * Facility, NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.296 2022/11/04 09:01:53 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_inet_csum.h" #include "opt_mbuftrace.h" #endif #include <sys/param.h> #include <sys/atomic.h> #include <sys/proc.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/once.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/protosw.h> #include <sys/errno.h> #include <sys/kernel.h> #include <sys/pool.h> #include <sys/md5.h> #include <sys/cprng.h> #include <net/route.h> #include <net/if.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/ip_var.h> #include <netinet/ip_icmp.h> #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/in6_pcb.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_var.h> #include <netinet6/ip6protosw.h> #include <netinet/icmp6.h> #include <netinet6/nd6.h> #endif #include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_vtw.h> #include <netinet/tcp_private.h> #include <netinet/tcp_congctl.h> #include <netinet/tcp_syncache.h> #ifdef IPSEC #include <netipsec/ipsec.h> #ifdef INET6 #include <netipsec/ipsec6.h> #endif #include <netipsec/key.h> #endif struct inpcbtable tcbtable; /* head of queue of active tcpcb's */ u_int32_t tcp_now; /* slow ticks, for RFC 1323 timestamps */ percpu_t *tcpstat_percpu; /* patchable/settable parameters for tcp */ int tcp_mssdflt = TCP_MSS; int tcp_minmss = TCP_MINMSS; int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; int tcp_do_rfc1323 = 1; /* window scaling / timestamps (obsolete) */ int tcp_do_rfc1948 = 0; /* ISS by cryptographic hash */ int tcp_do_sack = 1; /* selective acknowledgement */ int tcp_do_win_scale = 1; /* RFC1323 window scaling */ int tcp_do_timestamps = 1; /* RFC1323 timestamps */ int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */ int tcp_do_ecn = 0; /* Explicit Congestion Notification */ #ifndef TCP_INIT_WIN #define TCP_INIT_WIN 4 /* initial slow start window */ #endif #ifndef TCP_INIT_WIN_LOCAL #define TCP_INIT_WIN_LOCAL 4 /* initial slow start window for local nets */ #endif /* * Up to 5 we scale linearly, to reach 3 * 1460; then (iw) * 1460. 
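 * (e.g. tcp_init_win_max[4] is 3 * 1460 while tcp_init_win_max[10] is
 * 10 * 1460.)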
* This is to simulate current behavior for iw == 4 */ int tcp_init_win_max[] = { 1 * 1460, 1 * 1460, 2 * 1460, 2 * 1460, 3 * 1460, 5 * 1460, 6 * 1460, 7 * 1460, 8 * 1460, 9 * 1460, 10 * 1460 }; int tcp_init_win = TCP_INIT_WIN; int tcp_init_win_local = TCP_INIT_WIN_LOCAL; int tcp_mss_ifmtu = 0; int tcp_rst_ppslim = 100; /* 100pps */ int tcp_ackdrop_ppslim = 100; /* 100pps */ int tcp_do_loopback_cksum = 0; int tcp_do_abc = 1; /* RFC3465 Appropriate byte counting. */ int tcp_abc_aggressive = 1; /* 1: L=2*SMSS 0: L=1*SMSS */ int tcp_sack_tp_maxholes = 32; int tcp_sack_globalmaxholes = 1024; int tcp_sack_globalholes = 0; int tcp_ecn_maxretries = 1; int tcp_msl_enable = 1; /* enable TIME_WAIT truncation */ int tcp_msl_loop = PR_SLOWHZ; /* MSL for loopback */ int tcp_msl_local = 5 * PR_SLOWHZ; /* MSL for 'local' */ int tcp_msl_remote = TCPTV_MSL; /* MSL otherwise */ int tcp_msl_remote_threshold = TCPTV_SRTTDFLT; /* RTT threshold */ int tcp_rttlocal = 0; /* Use RTT to decide who's 'local' */ int tcp4_vtw_enable = 0; /* 1 to enable */ int tcp6_vtw_enable = 0; /* 1 to enable */ int tcp_vtw_was_enabled = 0; int tcp_vtw_entries = 1 << 4; /* 16 vestigial TIME_WAIT entries */ /* tcb hash */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 128 #endif int tcbhashsize = TCBHASHSIZE; int tcp_freeq(struct tcpcb *); static int tcp_iss_secret_init(void); static void tcp_mtudisc_callback(struct in_addr); #ifdef INET6 static void tcp6_mtudisc(struct inpcb *, int); #endif static struct pool tcpcb_pool; static int tcp_drainwanted; #ifdef TCP_CSUM_COUNTERS #include <sys/device.h> struct evcnt tcp_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", "hwcsum bad"); struct evcnt tcp_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", "hwcsum ok"); struct evcnt tcp_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", "hwcsum data"); struct evcnt tcp_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", "swcsum"); EVCNT_ATTACH_STATIC(tcp_hwcsum_bad); EVCNT_ATTACH_STATIC(tcp_hwcsum_ok); EVCNT_ATTACH_STATIC(tcp_hwcsum_data); EVCNT_ATTACH_STATIC(tcp_swcsum); #if defined(INET6) struct evcnt tcp6_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp6", "hwcsum bad"); struct evcnt tcp6_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp6", "hwcsum ok"); struct evcnt tcp6_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp6", "hwcsum data"); struct evcnt tcp6_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp6", "swcsum"); EVCNT_ATTACH_STATIC(tcp6_hwcsum_bad); EVCNT_ATTACH_STATIC(tcp6_hwcsum_ok); EVCNT_ATTACH_STATIC(tcp6_hwcsum_data); EVCNT_ATTACH_STATIC(tcp6_swcsum); #endif /* defined(INET6) */ #endif /* TCP_CSUM_COUNTERS */ #ifdef TCP_OUTPUT_COUNTERS #include <sys/device.h> struct evcnt tcp_output_bigheader = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", "output big header"); struct evcnt tcp_output_predict_hit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", "output predict hit"); struct evcnt tcp_output_predict_miss = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", "output predict miss"); struct evcnt tcp_output_copysmall = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", "output copy small"); struct evcnt tcp_output_copybig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", "output copy big"); struct evcnt tcp_output_refbig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", "output reference big"); EVCNT_ATTACH_STATIC(tcp_output_bigheader); EVCNT_ATTACH_STATIC(tcp_output_predict_hit); EVCNT_ATTACH_STATIC(tcp_output_predict_miss); EVCNT_ATTACH_STATIC(tcp_output_copysmall); 
EVCNT_ATTACH_STATIC(tcp_output_copybig); EVCNT_ATTACH_STATIC(tcp_output_refbig); #endif /* TCP_OUTPUT_COUNTERS */ #ifdef TCP_REASS_COUNTERS #include <sys/device.h> struct evcnt tcp_reass_ = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp_reass", "calls"); struct evcnt tcp_reass_empty = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "insert into empty queue"); struct evcnt tcp_reass_iteration[8] = { EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", ">7 iterations"), EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "1 iteration"), EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "2 iterations"), EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "3 iterations"), EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "4 iterations"), EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "5 iterations"), EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "6 iterations"), EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "7 iterations"), }; struct evcnt tcp_reass_prependfirst = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "prepend to first"); struct evcnt tcp_reass_prepend = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "prepend"); struct evcnt tcp_reass_insert = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "insert"); struct evcnt tcp_reass_inserttail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "insert at tail"); struct evcnt tcp_reass_append = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "append"); struct evcnt tcp_reass_appendtail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "append to tail fragment"); struct evcnt tcp_reass_overlaptail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "overlap at end"); struct evcnt tcp_reass_overlapfront = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "overlap at start"); struct evcnt tcp_reass_segdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "duplicate segment"); struct evcnt tcp_reass_fragdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "duplicate fragment"); EVCNT_ATTACH_STATIC(tcp_reass_); EVCNT_ATTACH_STATIC(tcp_reass_empty); EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 0); EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 1); EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 2); EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 3); EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 4); EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 5); EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 6); EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 7); EVCNT_ATTACH_STATIC(tcp_reass_prependfirst); EVCNT_ATTACH_STATIC(tcp_reass_prepend); EVCNT_ATTACH_STATIC(tcp_reass_insert); EVCNT_ATTACH_STATIC(tcp_reass_inserttail); EVCNT_ATTACH_STATIC(tcp_reass_append); EVCNT_ATTACH_STATIC(tcp_reass_appendtail); EVCNT_ATTACH_STATIC(tcp_reass_overlaptail); EVCNT_ATTACH_STATIC(tcp_reass_overlapfront); EVCNT_ATTACH_STATIC(tcp_reass_segdup); EVCNT_ATTACH_STATIC(tcp_reass_fragdup); #endif /* TCP_REASS_COUNTERS */ #ifdef MBUFTRACE struct mowner tcp_mowner = MOWNER_INIT("tcp", ""); struct mowner tcp_rx_mowner = MOWNER_INIT("tcp", "rx"); struct mowner tcp_tx_mowner = MOWNER_INIT("tcp", "tx"); struct mowner tcp_sock_mowner = MOWNER_INIT("tcp", "sock"); struct mowner tcp_sock_rx_mowner = MOWNER_INIT("tcp", "sock rx"); struct mowner tcp_sock_tx_mowner = MOWNER_INIT("tcp", "sock tx"); #endif static int do_tcpinit(void) { inpcb_init(&tcbtable, tcbhashsize, tcbhashsize); pool_init(&tcpcb_pool, 
sizeof(struct tcpcb), 0, 0, 0, "tcpcbpl", NULL, IPL_SOFTNET); tcp_usrreq_init(); /* Initialize timer state. */ tcp_timer_init(); /* Initialize the compressed state engine. */ syn_cache_init(); /* Initialize the congestion control algorithms. */ tcp_congctl_init(); /* Initialize the TCPCB template. */ tcp_tcpcb_template(); /* Initialize reassembly queue */ tcpipqent_init(); /* SACK */ tcp_sack_init(); MOWNER_ATTACH(&tcp_tx_mowner); MOWNER_ATTACH(&tcp_rx_mowner); MOWNER_ATTACH(&tcp_reass_mowner); MOWNER_ATTACH(&tcp_sock_mowner); MOWNER_ATTACH(&tcp_sock_tx_mowner); MOWNER_ATTACH(&tcp_sock_rx_mowner); MOWNER_ATTACH(&tcp_mowner); tcpstat_percpu = percpu_alloc(sizeof(uint64_t) * TCP_NSTATS); vtw_earlyinit(); tcp_slowtimo_init(); return 0; } void tcp_init_common(unsigned basehlen) { static ONCE_DECL(dotcpinit); unsigned hlen = basehlen + sizeof(struct tcphdr); unsigned oldhlen; if (max_linkhdr + hlen > MHLEN) panic("tcp_init"); while ((oldhlen = max_protohdr) < hlen) atomic_cas_uint(&max_protohdr, oldhlen, hlen); RUN_ONCE(&dotcpinit, do_tcpinit); } /* * Tcp initialization */ void tcp_init(void) { icmp_mtudisc_callback_register(tcp_mtudisc_callback); tcp_init_common(sizeof(struct ip)); } /* * Create template to be used to send tcp packets on a connection. * Call after host entry created, allocates an mbuf and fills * in a skeletal tcp/ip header, minimizing the amount of work * necessary when the connection is used. */ struct mbuf * tcp_template(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct tcphdr *n; struct mbuf *m; int hlen; switch (tp->t_family) { case AF_INET: hlen = sizeof(struct ip); if (inp->inp_af == AF_INET) break; #ifdef INET6 if (inp->inp_af == AF_INET6) { /* mapped addr case */ if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) && IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) break; } #endif return NULL; /*EINVAL*/ #ifdef INET6 case AF_INET6: hlen = sizeof(struct ip6_hdr); if (inp != NULL) { /* more sainty check? */ break; } return NULL; /*EINVAL*/ #endif default: return NULL; /*EAFNOSUPPORT*/ } KASSERT(hlen + sizeof(struct tcphdr) <= MCLBYTES); m = tp->t_template; if (m && m->m_len == hlen + sizeof(struct tcphdr)) { ; } else { if (m) m_freem(m); m = tp->t_template = NULL; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m && hlen + sizeof(struct tcphdr) > MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return NULL; MCLAIM(m, &tcp_mowner); m->m_pkthdr.len = m->m_len = hlen + sizeof(struct tcphdr); } memset(mtod(m, void *), 0, m->m_len); n = (struct tcphdr *)(mtod(m, char *) + hlen); switch (tp->t_family) { case AF_INET: { struct ipovly *ipov; mtod(m, struct ip *)->ip_v = 4; mtod(m, struct ip *)->ip_hl = hlen >> 2; ipov = mtod(m, struct ipovly *); ipov->ih_pr = IPPROTO_TCP; ipov->ih_len = htons(sizeof(struct tcphdr)); if (inp->inp_af == AF_INET) { ipov->ih_src = in4p_laddr(inp); ipov->ih_dst = in4p_faddr(inp); } #ifdef INET6 else if (inp->inp_af == AF_INET6) { /* mapped addr case */ bcopy(&in6p_laddr(inp).s6_addr32[3], &ipov->ih_src, sizeof(ipov->ih_src)); bcopy(&in6p_faddr(inp).s6_addr32[3], &ipov->ih_dst, sizeof(ipov->ih_dst)); } #endif /* * Compute the pseudo-header portion of the checksum * now. We incrementally add in the TCP option and * payload lengths later, and then compute the TCP * checksum right before the packet is sent off onto * the wire. 
*/ n->th_sum = in_cksum_phdr(ipov->ih_src.s_addr, ipov->ih_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP)); break; } #ifdef INET6 case AF_INET6: { struct ip6_hdr *ip6; mtod(m, struct ip *)->ip_v = 6; ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = in6p_laddr(inp); ip6->ip6_dst = in6p_faddr(inp); ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK; if (ip6_auto_flowlabel) { ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; ip6->ip6_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); } ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* * Compute the pseudo-header portion of the checksum * now. We incrementally add in the TCP option and * payload lengths later, and then compute the TCP * checksum right before the packet is sent off onto * the wire. */ n->th_sum = in6_cksum_phdr(&in6p_laddr(inp), &in6p_faddr(inp), htonl(sizeof(struct tcphdr)), htonl(IPPROTO_TCP)); break; } #endif } n->th_sport = inp->inp_lport; n->th_dport = inp->inp_fport; n->th_seq = 0; n->th_ack = 0; n->th_x2 = 0; n->th_off = 5; n->th_flags = 0; n->th_win = 0; n->th_urp = 0; return m; } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == 0, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection tp->t_template. If flags are given * then we send a message back to the TCP which originated the * segment ti, and discard the mbuf containing it and any other * attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. */ int tcp_respond(struct tcpcb *tp, struct mbuf *mtemplate, struct mbuf *m, struct tcphdr *th0, tcp_seq ack, tcp_seq seq, int flags) { struct route *ro; int error, tlen, win = 0; int hlen; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif int family; /* family on packet, not inpcb! 
*/ struct tcphdr *th; if (tp != NULL && (flags & TH_RST) == 0) { KASSERT(tp->t_inpcb != NULL); win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); } th = NULL; /* Quell uninitialized warning */ ip = NULL; #ifdef INET6 ip6 = NULL; #endif if (m == NULL) { if (!mtemplate) return EINVAL; /* get family information from template */ switch (mtod(mtemplate, struct ip *)->ip_v) { case 4: family = AF_INET; hlen = sizeof(struct ip); break; #ifdef INET6 case 6: family = AF_INET6; hlen = sizeof(struct ip6_hdr); break; #endif default: return EAFNOSUPPORT; } MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m) { MCLAIM(m, &tcp_tx_mowner); MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return ENOBUFS; tlen = 0; m->m_data += max_linkhdr; bcopy(mtod(mtemplate, void *), mtod(m, void *), mtemplate->m_len); switch (family) { case AF_INET: ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); break; #ifdef INET6 case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); break; #endif } flags = TH_ACK; } else { if ((m->m_flags & M_PKTHDR) == 0) { m_freem(m); return EINVAL; } KASSERT(th0 != NULL); /* get family information from m */ switch (mtod(m, struct ip *)->ip_v) { case 4: family = AF_INET; hlen = sizeof(struct ip); ip = mtod(m, struct ip *); break; #ifdef INET6 case 6: family = AF_INET6; hlen = sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); break; #endif default: m_freem(m); return EAFNOSUPPORT; } /* clear h/w csum flags inherited from rx packet */ m->m_pkthdr.csum_flags = 0; if ((flags & TH_SYN) == 0 || sizeof(*th0) > (th0->th_off << 2)) tlen = sizeof(*th0); else tlen = th0->th_off << 2; if (m->m_len > hlen + tlen && (m->m_flags & M_EXT) == 0 && mtod(m, char *) + hlen == (char *)th0) { m->m_len = hlen + tlen; m_freem(m->m_next); m->m_next = NULL; } else { struct mbuf *n; KASSERT(max_linkhdr + hlen + tlen <= MCLBYTES); MGETHDR(n, M_DONTWAIT, MT_HEADER); if (n && max_linkhdr + hlen + tlen > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) { m_freem(m); return ENOBUFS; } MCLAIM(n, &tcp_tx_mowner); n->m_data += max_linkhdr; n->m_len = hlen + tlen; m_copyback(n, 0, hlen, mtod(m, void *)); m_copyback(n, hlen, tlen, (void *)th0); m_freem(m); m = n; n = NULL; } #define xchg(a,b,type) { type t; t=a; a=b; b=t; } switch (family) { case AF_INET: ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); ip->ip_p = IPPROTO_TCP; xchg(ip->ip_dst, ip->ip_src, struct in_addr); ip->ip_p = IPPROTO_TCP; break; #ifdef INET6 case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); ip6->ip6_nxt = IPPROTO_TCP; xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); ip6->ip6_nxt = IPPROTO_TCP; break; #endif } xchg(th->th_dport, th->th_sport, u_int16_t); #undef xchg tlen = 0; /*be friendly with the following code*/ } th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_x2 = 0; if ((flags & TH_SYN) == 0) { if (tp) win >>= tp->rcv_scale; if (win > TCP_MAXWIN) win = TCP_MAXWIN; th->th_win = htons((u_int16_t)win); th->th_off = sizeof (struct tcphdr) >> 2; tlen += sizeof(*th); } else { tlen += th->th_off << 2; } m->m_len = hlen + tlen; m->m_pkthdr.len = hlen + tlen; m_reset_rcvif(m); th->th_flags = flags; th->th_urp = 0; switch (family) { case AF_INET: { struct ipovly *ipov = (struct ipovly *)ip; memset(ipov->ih_x1, 0, sizeof ipov->ih_x1); ipov->ih_len = htons((u_int16_t)tlen); th->th_sum = 0; th->th_sum = in_cksum(m, hlen + tlen); ip->ip_len = htons(hlen + tlen); ip->ip_ttl = ip_defttl; break; 
} #ifdef INET6 case AF_INET6: { th->th_sum = 0; th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen); ip6->ip6_plen = htons(tlen); if (tp && tp->t_inpcb->inp_af == AF_INET6) ip6->ip6_hlim = in6pcb_selecthlim_rt(tp->t_inpcb); else ip6->ip6_hlim = ip6_defhlim; ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK; if (ip6_auto_flowlabel) { ip6->ip6_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); } break; } #endif } if (tp != NULL && tp->t_inpcb->inp_af == AF_INET) { ro = &tp->t_inpcb->inp_route; KASSERT(family == AF_INET); KASSERT(in_hosteq(ip->ip_dst, in4p_faddr(tp->t_inpcb))); } #ifdef INET6 else if (tp != NULL && tp->t_inpcb->inp_af == AF_INET6) { ro = (struct route *)&tp->t_inpcb->inp_route; #ifdef DIAGNOSTIC if (family == AF_INET) { if (!IN6_IS_ADDR_V4MAPPED(&in6p_faddr(tp->t_inpcb))) panic("tcp_respond: not mapped addr"); if (memcmp(&ip->ip_dst, &in6p_faddr(tp->t_inpcb).s6_addr32[3], sizeof(ip->ip_dst)) != 0) { panic("tcp_respond: ip_dst != in6p_faddr"); } } else if (family == AF_INET6) { if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6p_faddr(tp->t_inpcb))) panic("tcp_respond: ip6_dst != in6p_faddr"); } else panic("tcp_respond: address family mismatch"); #endif } #endif else ro = NULL; switch (family) { case AF_INET: error = ip_output(m, NULL, ro, (tp && tp->t_mtudisc ? IP_MTUDISC : 0), NULL, tp ? tp->t_inpcb : NULL); break; #ifdef INET6 case AF_INET6: error = ip6_output(m, NULL, ro, 0, NULL, tp ? tp->t_inpcb : NULL, NULL); break; #endif default: error = EAFNOSUPPORT; break; } return error; } /* * Template TCPCB. Rather than zeroing a new TCPCB and initializing * a bunch of members individually, we maintain this template for the * static and mostly-static components of the TCPCB, and copy it into * the new TCPCB instead. */ static struct tcpcb tcpcb_template = { .t_srtt = TCPTV_SRTTBASE, .t_rttmin = TCPTV_MIN, .snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT, .snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT, .snd_numholes = 0, .snd_cubic_wmax = 0, .snd_cubic_wmax_last = 0, .snd_cubic_ctime = 0, .t_partialacks = -1, .t_bytes_acked = 0, .t_sndrexmitpack = 0, .t_rcvoopack = 0, .t_sndzerowin = 0, }; /* * Updates the TCPCB template whenever a parameter that would affect * the template is changed. */ void tcp_tcpcb_template(void) { struct tcpcb *tp = &tcpcb_template; int flags; tp->t_peermss = tcp_mssdflt; tp->t_ourmss = tcp_mssdflt; tp->t_segsz = tcp_mssdflt; flags = 0; if (tcp_do_rfc1323 && tcp_do_win_scale) flags |= TF_REQ_SCALE; if (tcp_do_rfc1323 && tcp_do_timestamps) flags |= TF_REQ_TSTMP; tp->t_flags = flags; /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives * reasonable initial retransmit time. */ tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1); TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), TCPTV_MIN, TCPTV_REXMTMAX); /* Keep Alive */ tp->t_keepinit = MIN(tcp_keepinit, TCP_TIMER_MAXTICKS); tp->t_keepidle = MIN(tcp_keepidle, TCP_TIMER_MAXTICKS); tp->t_keepintvl = MIN(tcp_keepintvl, TCP_TIMER_MAXTICKS); tp->t_keepcnt = MAX(1, MIN(tcp_keepcnt, TCP_TIMER_MAXTICKS)); tp->t_maxidle = tp->t_keepcnt * MIN(tp->t_keepintvl, TCP_TIMER_MAXTICKS/tp->t_keepcnt); /* MSL */ tp->t_msl = TCPTV_MSL; } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. */ struct tcpcb * tcp_newtcpcb(int family, struct inpcb *inp) { struct tcpcb *tp; int i; /* XXX Consider using a pool_cache for speed. 
*/ tp = pool_get(&tcpcb_pool, PR_NOWAIT); /* splsoftnet via tcp_usrreq */ if (tp == NULL) return NULL; memcpy(tp, &tcpcb_template, sizeof(*tp)); TAILQ_INIT(&tp->segq); TAILQ_INIT(&tp->timeq); tp->t_family = family; /* may be overridden later on */ TAILQ_INIT(&tp->snd_holes); LIST_INIT(&tp->t_sc); /* XXX can template this */ /* Don't sweat this loop; hopefully the compiler will unroll it. */ for (i = 0; i < TCPT_NTIMERS; i++) { callout_init(&tp->t_timer[i], CALLOUT_MPSAFE); TCP_TIMER_INIT(tp, i); } callout_init(&tp->t_delack_ch, CALLOUT_MPSAFE); switch (family) { case AF_INET: in4p_ip(inp).ip_ttl = ip_defttl; inp->inp_ppcb = (void *)tp; tp->t_inpcb = inp; tp->t_mtudisc = ip_mtudisc; break; #ifdef INET6 case AF_INET6: in6p_ip6(inp).ip6_hlim = in6pcb_selecthlim_rt(inp); inp->inp_ppcb = (void *)tp; tp->t_inpcb = inp; /* for IPv6, always try to run path MTU discovery */ tp->t_mtudisc = 1; break; #endif /* INET6 */ default: for (i = 0; i < TCPT_NTIMERS; i++) callout_destroy(&tp->t_timer[i]); callout_destroy(&tp->t_delack_ch); pool_put(&tcpcb_pool, tp); /* splsoftnet via tcp_usrreq */ return NULL; } /* * Initialize our timebase. When we send timestamps, we take * the delta from tcp_now -- this means each connection always * gets a timebase of 1, which makes it, among other things, * more difficult to determine how long a system has been up, * and thus how many TCP sequence increments have occurred. * * We start with 1, because 0 doesn't work with linux, which * considers timestamp 0 in a SYN packet as a bug and disables * timestamps. */ tp->ts_timebase = tcp_now - 1; tcp_congctl_select(tp, tcp_congctl_global_name); return tp; } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so; KASSERT(tp->t_inpcb != NULL); so = tp->t_inpcb->inp_socket; if (so == NULL) return NULL; if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output(tp); TCP_STATINC(TCP_STAT_DROPS); } else TCP_STATINC(TCP_STAT_CONNDROPS); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } /* * Close a TCP control block: * discard all space held by the tcp * discard internet protocol block * wake up any sleepers */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp; struct socket *so; #ifdef RTV_RTT struct rtentry *rt = NULL; #endif struct route *ro; int j; inp = tp->t_inpcb; so = inp->inp_socket; ro = &inp->inp_route; #ifdef RTV_RTT /* * If we sent enough data to get some meaningful characteristics, * save them in the routing entry. 'Enough' is arbitrarily * defined as the sendpipesize (default 4K) * 16. This would * give us 16 rtt samples assuming we only get one sample per * window (the usual case on a long haul net). 16 samples is * enough for the srtt filter to converge to within 5% of the correct * value; fewer samples and we could save a very bogus rtt. * * Don't update the default route's characteristics and don't * update anything that the user "locked". */ if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) && ro && (rt = rtcache_validate(ro)) != NULL && !in_nullhost(satocsin(rt_getkey(rt))->sin_addr)) { u_long i = 0; if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { i = tp->t_srtt * ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2)); if (rt->rt_rmx.rmx_rtt && i) /* * filter this update to half the old & half * the new values, converting scale. 
* See route.h and tcp_var.h for a * description of the scaling constants. */ rt->rt_rmx.rmx_rtt = (rt->rt_rmx.rmx_rtt + i) / 2; else rt->rt_rmx.rmx_rtt = i; } if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { i = tp->t_rttvar * ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2)); if (rt->rt_rmx.rmx_rttvar && i) rt->rt_rmx.rmx_rttvar = (rt->rt_rmx.rmx_rttvar + i) / 2; else rt->rt_rmx.rmx_rttvar = i; } /* * update the pipelimit (ssthresh) if it has been updated * already or if a pipesize was specified & the threshold * got below half the pipesize. I.e., wait for bad news * before we start updating, then update on both good * and bad news. */ if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) || i < (rt->rt_rmx.rmx_sendpipe / 2)) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ i = (i + tp->t_segsz / 2) / tp->t_segsz; if (i < 2) i = 2; i *= (u_long)(tp->t_segsz + sizeof (struct tcpiphdr)); if (rt->rt_rmx.rmx_ssthresh) rt->rt_rmx.rmx_ssthresh = (rt->rt_rmx.rmx_ssthresh + i) / 2; else rt->rt_rmx.rmx_ssthresh = i; } } rtcache_unref(rt, ro); #endif /* RTV_RTT */ /* free the reassembly queue, if any */ TCP_REASS_LOCK(tp); (void) tcp_freeq(tp); TCP_REASS_UNLOCK(tp); /* free the SACK holes list. */ tcp_free_sackholes(tp); tcp_congctl_release(tp); syn_cache_cleanup(tp); if (tp->t_template) { m_free(tp->t_template); tp->t_template = NULL; } /* * Detaching the pcb will unlock the socket/tcpcb, and stopping * the timers can also drop the lock. We need to prevent access * to the tcpcb as it's half torn down. Flag the pcb as dead * (prevents access by timers) and only then detach it. */ tp->t_flags |= TF_DEAD; inp->inp_ppcb = NULL; soisdisconnected(so); inpcb_destroy(inp); /* * pcb is no longer visble elsewhere, so we can safely release * the lock in callout_halt() if needed. */ TCP_STATINC(TCP_STAT_CLOSED); for (j = 0; j < TCPT_NTIMERS; j++) { callout_halt(&tp->t_timer[j], softnet_lock); callout_destroy(&tp->t_timer[j]); } callout_halt(&tp->t_delack_ch, softnet_lock); callout_destroy(&tp->t_delack_ch); pool_put(&tcpcb_pool, tp); return NULL; } int tcp_freeq(struct tcpcb *tp) { struct ipqent *qe; int rv = 0; TCP_REASS_LOCK_CHECK(tp); while ((qe = TAILQ_FIRST(&tp->segq)) != NULL) { TAILQ_REMOVE(&tp->segq, qe, ipqe_q); TAILQ_REMOVE(&tp->timeq, qe, ipqe_timeq); m_freem(qe->ipqe_m); tcpipqent_free(qe); rv = 1; } tp->t_segqlen = 0; KASSERT(TAILQ_EMPTY(&tp->timeq)); return (rv); } void tcp_fasttimo(void) { if (tcp_drainwanted) { tcp_drain(); tcp_drainwanted = 0; } } void tcp_drainstub(void) { tcp_drainwanted = 1; } /* * Protocol drain routine. Called when memory is in short supply. * Called from pr_fasttimo thus a callout context. */ void tcp_drain(void) { struct inpcb *inp; struct tcpcb *tp; mutex_enter(softnet_lock); KERNEL_LOCK(1, NULL); /* * Free the sequence queue of all TCP connections. */ TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue) { tp = intotcpcb(inp); if (tp != NULL) { /* * If the tcpcb is already busy, * just bail out now. */ if (tcp_reass_lock_try(tp) == 0) continue; if (tcp_freeq(tp)) TCP_STATINC(TCP_STAT_CONNSDRAINED); TCP_REASS_UNLOCK(tp); } } KERNEL_UNLOCK_ONE(NULL); mutex_exit(softnet_lock); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). 
*/ void tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; struct socket *so = inp->inp_socket; /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return; } else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 && tp->t_rxtshift > 3 && tp->t_softerror) so->so_error = error; else tp->t_softerror = error; cv_broadcast(&so->so_cv); sorwakeup(so); sowwakeup(so); } #ifdef INET6 void * tcp6_ctlinput(int cmd, const struct sockaddr *sa, void *d) { struct tcphdr th; void (*notify)(struct inpcb *, int) = tcp_notify; int nmatch; struct ip6_hdr *ip6; const struct sockaddr_in6 *sa6_src = NULL; const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa; struct mbuf *m; int off; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return NULL; if ((unsigned)cmd >= PRC_NCMDS) return NULL; else if (cmd == PRC_QUENCH) { /* * Don't honor ICMP Source Quench messages meant for * TCP connections. */ return NULL; } else if (PRC_IS_REDIRECT(cmd)) notify = in6pcb_rtchange, d = NULL; else if (cmd == PRC_MSGSIZE) ; /* special code is present, see below */ else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return NULL; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; sa6_src = &sa6_any; off = 0; } if (ip6) { /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(th)) { if (cmd == PRC_MSGSIZE) icmp6_mtudisc_update((struct ip6ctlparam *)d, 0); return NULL; } memset(&th, 0, sizeof(th)); m_copydata(m, off, sizeof(th), (void *)&th); if (cmd == PRC_MSGSIZE) { int valid = 0; /* * Check to see if we have a valid TCP connection * corresponding to the address in the ICMPv6 message * payload. */ if (in6pcb_lookup(&tcbtable, &sa6->sin6_addr, th.th_dport, (const struct in6_addr *)&sa6_src->sin6_addr, th.th_sport, 0, 0)) valid++; /* * Depending on the value of "valid" and routing table * size (mtudisc_{hi,lo}wat), we will: * - recalcurate the new MTU and create the * corresponding routing entry, or * - ignore the MTU change notification. 
*/ icmp6_mtudisc_update((struct ip6ctlparam *)d, valid); /* * no need to call in6pcb_notify, it should have been * called via callback if necessary */ return NULL; } nmatch = in6pcb_notify(&tcbtable, sa, th.th_dport, (const struct sockaddr *)sa6_src, th.th_sport, cmd, NULL, notify); if (nmatch == 0 && syn_cache_count && (inet6ctlerrmap[cmd] == EHOSTUNREACH || inet6ctlerrmap[cmd] == ENETUNREACH || inet6ctlerrmap[cmd] == EHOSTDOWN)) syn_cache_unreach((const struct sockaddr *)sa6_src, sa, &th); } else { (void) in6pcb_notify(&tcbtable, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } return NULL; } #endif /* assumes that ip header and tcp header are contiguous on mbuf */ void * tcp_ctlinput(int cmd, const struct sockaddr *sa, void *v) { struct ip *ip = v; struct tcphdr *th; struct icmp *icp; extern const int inetctlerrmap[]; void (*notify)(struct inpcb *, int) = tcp_notify; int errno; int nmatch; struct tcpcb *tp; u_int mtu; tcp_seq seq; struct inpcb *inp; #ifdef INET6 struct in6_addr src6, dst6; #endif if (sa->sa_family != AF_INET || sa->sa_len != sizeof(struct sockaddr_in)) return NULL; if ((unsigned)cmd >= PRC_NCMDS) return NULL; errno = inetctlerrmap[cmd]; if (cmd == PRC_QUENCH) /* * Don't honor ICMP Source Quench messages meant for * TCP connections. */ return NULL; else if (PRC_IS_REDIRECT(cmd)) notify = inpcb_rtchange, ip = 0; else if (cmd == PRC_MSGSIZE && ip && ip->ip_v == 4) { /* * Check to see if we have a valid TCP connection * corresponding to the address in the ICMP message * payload. * * Boundary check is made in icmp_input(), with ICMP_ADVLENMIN. */ th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2)); #ifdef INET6 in6_in_2_v4mapin6(&ip->ip_src, &src6); in6_in_2_v4mapin6(&ip->ip_dst, &dst6); #endif if ((inp = inpcb_lookup(&tcbtable, ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport, 0)) != NULL) ; #ifdef INET6 else if ((inp = in6pcb_lookup(&tcbtable, &dst6, th->th_dport, &src6, th->th_sport, 0, 0)) != NULL) ; #endif else return NULL; /* * Now that we've validated that we are actually communicating * with the host indicated in the ICMP message, locate the * ICMP header, recalculate the new MTU, and create the * corresponding routing entry. */ icp = (struct icmp *)((char *)ip - offsetof(struct icmp, icmp_ip)); tp = intotcpcb(inp); if (tp == NULL) return NULL; seq = ntohl(th->th_seq); if (SEQ_LT(seq, tp->snd_una) || SEQ_GT(seq, tp->snd_max)) return NULL; /* * If the ICMP message advertises a Next-Hop MTU * equal or larger than the maximum packet size we have * ever sent, drop the message. */ mtu = (u_int)ntohs(icp->icmp_nextmtu); if (mtu >= tp->t_pmtud_mtu_sent) return NULL; if (mtu >= tcp_hdrsz(tp) + tp->t_pmtud_mss_acked) { /* * Calculate new MTU, and create corresponding * route (traditional PMTUD). */ tp->t_flags &= ~TF_PMTUD_PEND; icmp_mtudisc(icp, ip->ip_dst); } else { /* * Record the information got in the ICMP * message; act on it later. 
* If we had already recorded an ICMP message, * replace the old one only if the new message * refers to an older TCP segment */ if (tp->t_flags & TF_PMTUD_PEND) { if (SEQ_LT(tp->t_pmtud_th_seq, seq)) return NULL; } else tp->t_flags |= TF_PMTUD_PEND; tp->t_pmtud_th_seq = seq; tp->t_pmtud_nextmtu = icp->icmp_nextmtu; tp->t_pmtud_ip_len = icp->icmp_ip.ip_len; tp->t_pmtud_ip_hl = icp->icmp_ip.ip_hl; } return NULL; } else if (cmd == PRC_HOSTDEAD) ip = 0; else if (errno == 0) return NULL; if (ip && ip->ip_v == 4 && sa->sa_family == AF_INET) { th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2)); nmatch = inpcb_notify(&tcbtable, satocsin(sa)->sin_addr, th->th_dport, ip->ip_src, th->th_sport, errno, notify); if (nmatch == 0 && syn_cache_count && (inetctlerrmap[cmd] == EHOSTUNREACH || inetctlerrmap[cmd] == ENETUNREACH || inetctlerrmap[cmd] == EHOSTDOWN)) { struct sockaddr_in sin; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_port = th->th_sport; sin.sin_addr = ip->ip_src; syn_cache_unreach((struct sockaddr *)&sin, sa, th); } /* XXX mapped address case */ } else inpcb_notifyall(&tcbtable, satocsin(sa)->sin_addr, errno, notify); return NULL; } /* * When a source quench is received, we are being notified of congestion. * Close the congestion window down to the Loss Window (one segment). * We will gradually open it again as we proceed. */ void tcp_quench(struct inpcb *inp) { struct tcpcb *tp = intotcpcb(inp); if (tp) { tp->snd_cwnd = tp->t_segsz; tp->t_bytes_acked = 0; } } /* * Path MTU Discovery handlers. */ void tcp_mtudisc_callback(struct in_addr faddr) { #ifdef INET6 struct in6_addr in6; #endif inpcb_notifyall(&tcbtable, faddr, EMSGSIZE, tcp_mtudisc); #ifdef INET6 in6_in_2_v4mapin6(&faddr, &in6); tcp6_mtudisc_callback(&in6); #endif } /* * On receipt of path MTU corrections, flush old route and replace it * with the new one. Retransmit all unacknowledged packets, to ensure * that all packets will be received. */ void tcp_mtudisc(struct inpcb *inp, int errno) { struct tcpcb *tp = intotcpcb(inp); struct rtentry *rt; if (tp == NULL) return; rt = inpcb_rtentry(inp); if (rt != NULL) { /* * If this was not a host route, remove and realloc. */ if ((rt->rt_flags & RTF_HOST) == 0) { inpcb_rtentry_unref(rt, inp); inpcb_rtchange(inp, errno); if ((rt = inpcb_rtentry(inp)) == NULL) return; } /* * Slow start out of the error condition. We * use the MTU because we know it's smaller * than the previously transmitted segment. * * Note: This is more conservative than the * suggestion in draft-floyd-incr-init-win-03. */ if (rt->rt_rmx.rmx_mtu != 0) tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, rt->rt_rmx.rmx_mtu); inpcb_rtentry_unref(rt, inp); } /* * Resend unacknowledged packets. */ tp->snd_nxt = tp->sack_newdata = tp->snd_una; tcp_output(tp); } #ifdef INET6 /* * Path MTU Discovery handlers. */ void tcp6_mtudisc_callback(struct in6_addr *faddr) { struct sockaddr_in6 sin6; memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = *faddr; (void) in6pcb_notify(&tcbtable, (struct sockaddr *)&sin6, 0, (const struct sockaddr *)&sa6_any, 0, PRC_MSGSIZE, NULL, tcp6_mtudisc); } void tcp6_mtudisc(struct inpcb *inp, int errno) { struct tcpcb *tp = intotcpcb(inp); struct rtentry *rt; if (tp == NULL) return; rt = in6pcb_rtentry(inp); if (rt != NULL) { /* * If this was not a host route, remove and realloc. 
*/ if ((rt->rt_flags & RTF_HOST) == 0) { in6pcb_rtentry_unref(rt, inp); in6pcb_rtchange(inp, errno); rt = in6pcb_rtentry(inp); if (rt == NULL) return; } /* * Slow start out of the error condition. We * use the MTU because we know it's smaller * than the previously transmitted segment. * * Note: This is more conservative than the * suggestion in draft-floyd-incr-init-win-03. */ if (rt->rt_rmx.rmx_mtu != 0) { tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, rt->rt_rmx.rmx_mtu); } in6pcb_rtentry_unref(rt, inp); } /* * Resend unacknowledged packets. */ tp->snd_nxt = tp->sack_newdata = tp->snd_una; tcp_output(tp); } #endif /* INET6 */ /* * Compute the MSS to advertise to the peer. Called only during * the 3-way handshake. If we are the server (peer initiated * connection), we are called with a pointer to the interface * on which the SYN packet arrived. If we are the client (we * initiated connection), we are called with a pointer to the * interface out which this connection should go. * * NOTE: Do not subtract IP option/extension header size nor IPsec * header size from MSS advertisement. MSS option must hold the maximum * segment size we can accept, so it must always be: * max(if mtu) - ip header - tcp header */ u_long tcp_mss_to_advertise(const struct ifnet *ifp, int af) { extern u_long in_maxmtu; u_long mss = 0; u_long hdrsiz; /* * In order to avoid defeating path MTU discovery on the peer, * we advertise the max MTU of all attached networks as our MSS, * per RFC 1191, section 3.1. * * We provide the option to advertise just the MTU of * the interface on which we hope this connection will * be receiving. If we are responding to a SYN, we * will have a pretty good idea about this, but when * initiating a connection there is a bit more doubt. * * We also need to ensure that loopback has a large enough * MSS, as the loopback MTU is never included in in_maxmtu. */ if (ifp != NULL) switch (af) { #ifdef INET6 case AF_INET6: /* FALLTHROUGH */ #endif case AF_INET: mss = ifp->if_mtu; break; } if (tcp_mss_ifmtu == 0) switch (af) { #ifdef INET6 case AF_INET6: /* FALLTHROUGH */ #endif case AF_INET: mss = uimax(in_maxmtu, mss); break; } switch (af) { case AF_INET: hdrsiz = sizeof(struct ip); break; #ifdef INET6 case AF_INET6: hdrsiz = sizeof(struct ip6_hdr); break; #endif default: hdrsiz = 0; break; } hdrsiz += sizeof(struct tcphdr); if (mss > hdrsiz) mss -= hdrsiz; mss = uimax(tcp_mssdflt, mss); return (mss); } /* * Set connection variables based on the peer's advertised MSS. * We are passed the TCPCB for the actual connection. If we * are the server, we are called by the compressed state engine * when the 3-way handshake is complete. If we are the client, * we are called when we receive the SYN,ACK from the server. * * NOTE: Our advertised MSS value must be initialized in the TCPCB * before this routine is called! */ void tcp_mss_from_peer(struct tcpcb *tp, int offer) { struct socket *so; #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH) struct rtentry *rt; #endif u_long bufsize; int mss; KASSERT(tp->t_inpcb != NULL); so = NULL; rt = NULL; so = tp->t_inpcb->inp_socket; #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH) rt = inpcb_rtentry(tp->t_inpcb); #endif /* * As per RFC1122, use the default MSS value, unless they * sent us an offer. Do not accept offers less than 256 bytes. 
*/ mss = tcp_mssdflt; if (offer) mss = offer; mss = uimax(mss, 256); /* sanity */ tp->t_peermss = mss; mss -= tcp_optlen(tp); if (tp->t_inpcb->inp_af == AF_INET) mss -= ip_optlen(tp->t_inpcb); #ifdef INET6 if (tp->t_inpcb->inp_af == AF_INET6) mss -= ip6_optlen(tp->t_inpcb); #endif /* * XXX XXX What if mss goes negative or zero? This can happen if a * socket has large IPv6 options. We crash below. */ /* * If there's a pipesize, change the socket buffer to that size. * Make the socket buffer an integral number of MSS units. If * the MSS is larger than the socket buffer, artificially decrease * the MSS. */ #ifdef RTV_SPIPE if (rt != NULL && rt->rt_rmx.rmx_sendpipe != 0) bufsize = rt->rt_rmx.rmx_sendpipe; else #endif { KASSERT(so != NULL); bufsize = so->so_snd.sb_hiwat; } if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; (void) sbreserve(&so->so_snd, bufsize, so); } tp->t_segsz = mss; #ifdef RTV_SSTHRESH if (rt != NULL && rt->rt_rmx.rmx_ssthresh) { /* * There's some sort of gateway or interface buffer * limit on the path. Use this to set the slow * start threshold, but set the threshold to no less * than 2 * MSS. */ tp->snd_ssthresh = uimax(2 * mss, rt->rt_rmx.rmx_ssthresh); } #endif #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH) inpcb_rtentry_unref(rt, tp->t_inpcb); #endif } /* * Processing necessary when a TCP connection is established. */ void tcp_established(struct tcpcb *tp) { struct socket *so; #ifdef RTV_RPIPE struct rtentry *rt; #endif u_long bufsize; KASSERT(tp->t_inpcb != NULL); so = NULL; rt = NULL; /* This is a while() to reduce the dreadful stairstepping below */ while (tp->t_inpcb->inp_af == AF_INET) { so = tp->t_inpcb->inp_socket; #if defined(RTV_RPIPE) rt = inpcb_rtentry(tp->t_inpcb); #endif if (__predict_true(tcp_msl_enable)) { if (in4p_laddr(tp->t_inpcb).s_addr == INADDR_LOOPBACK) { tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2); break; } if (__predict_false(tcp_rttlocal)) { /* This may be adjusted by tcp_input */ tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1); break; } if (in_localaddr(in4p_faddr(tp->t_inpcb))) { tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1); break; } } tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL; break; } /* Clamp to a reasonable range. */ tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL); #ifdef INET6 while (tp->t_inpcb->inp_af == AF_INET6) { so = tp->t_inpcb->inp_socket; #if defined(RTV_RPIPE) rt = in6pcb_rtentry(tp->t_inpcb); #endif if (__predict_true(tcp_msl_enable)) { extern const struct in6_addr in6addr_loopback; if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(tp->t_inpcb), &in6addr_loopback)) { tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2); break; } if (__predict_false(tcp_rttlocal)) { /* This may be adjusted by tcp_input */ tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1); break; } if (in6_localaddr(&in6p_faddr(tp->t_inpcb))) { tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1); break; } } tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL; break; } /* Clamp to a reasonable range. 
*/ tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL); #endif tp->t_state = TCPS_ESTABLISHED; TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle); #ifdef RTV_RPIPE if (rt != NULL && rt->rt_rmx.rmx_recvpipe != 0) bufsize = rt->rt_rmx.rmx_recvpipe; else #endif { KASSERT(so != NULL); bufsize = so->so_rcv.sb_hiwat; } if (bufsize > tp->t_ourmss) { bufsize = roundup(bufsize, tp->t_ourmss); if (bufsize > sb_max) bufsize = sb_max; (void) sbreserve(&so->so_rcv, bufsize, so); } #ifdef RTV_RPIPE inpcb_rtentry_unref(rt, tp->t_inpcb); #endif } /* * Check if there's an initial rtt or rttvar. Convert from the * route-table units to scaled multiples of the slow timeout timer. * Called only during the 3-way handshake. */ void tcp_rmx_rtt(struct tcpcb *tp) { #ifdef RTV_RTT struct rtentry *rt = NULL; int rtt; KASSERT(tp->t_inpcb != NULL); rt = inpcb_rtentry(tp->t_inpcb); if (rt == NULL) return; if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { /* * XXX The lock bit for MTU indicates that the value * is also a minimum value; this is subject to time. */ if (rt->rt_rmx.rmx_locks & RTV_RTT) TCPT_RANGESET(tp->t_rttmin, rtt / (RTM_RTTUNIT / PR_SLOWHZ), TCPTV_MIN, TCPTV_REXMTMAX); tp->t_srtt = rtt / ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2)); if (rt->rt_rmx.rmx_rttvar) { tp->t_rttvar = rt->rt_rmx.rmx_rttvar / ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2)); } else { /* Default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt >> (TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT); } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2), tp->t_rttmin, TCPTV_REXMTMAX); } inpcb_rtentry_unref(rt, tp->t_inpcb); #endif } tcp_seq tcp_iss_seq = 0; /* tcp initial seq # */ /* * Get a new sequence value given a tcp control block */ tcp_seq tcp_new_iss(struct tcpcb *tp) { if (tp->t_inpcb->inp_af == AF_INET) { return tcp_new_iss1(&in4p_laddr(tp->t_inpcb), &in4p_faddr(tp->t_inpcb), tp->t_inpcb->inp_lport, tp->t_inpcb->inp_fport, sizeof(in4p_laddr(tp->t_inpcb))); } #ifdef INET6 if (tp->t_inpcb->inp_af == AF_INET6) { return tcp_new_iss1(&in6p_laddr(tp->t_inpcb), &in6p_faddr(tp->t_inpcb), tp->t_inpcb->inp_lport, tp->t_inpcb->inp_fport, sizeof(in6p_laddr(tp->t_inpcb))); } #endif panic("tcp_new_iss: unreachable"); } static u_int8_t tcp_iss_secret[16]; /* 128 bits; should be plenty */ /* * Initialize RFC 1948 ISS Secret */ static int tcp_iss_secret_init(void) { cprng_strong(kern_cprng, tcp_iss_secret, sizeof(tcp_iss_secret), 0); return 0; } /* * This routine actually generates a new TCP initial sequence number. */ tcp_seq tcp_new_iss1(void *laddr, void *faddr, u_int16_t lport, u_int16_t fport, size_t addrsz) { tcp_seq tcp_iss; if (tcp_do_rfc1948) { MD5_CTX ctx; u_int8_t hash[16]; /* XXX MD5 knowledge */ static ONCE_DECL(tcp_iss_secret_control); /* * If we haven't been here before, initialize our cryptographic * hash secret. */ RUN_ONCE(&tcp_iss_secret_control, tcp_iss_secret_init); /* * Compute the base value of the ISS. It is a hash * of (saddr, sport, daddr, dport, secret). */ MD5Init(&ctx); MD5Update(&ctx, (u_char *) laddr, addrsz); MD5Update(&ctx, (u_char *) &lport, sizeof(lport)); MD5Update(&ctx, (u_char *) faddr, addrsz); MD5Update(&ctx, (u_char *) &fport, sizeof(fport)); MD5Update(&ctx, tcp_iss_secret, sizeof(tcp_iss_secret)); MD5Final(hash, &ctx); memcpy(&tcp_iss, hash, sizeof(tcp_iss)); #ifdef TCPISS_DEBUG printf("ISS hash 0x%08x, ", tcp_iss); #endif } else { /* * Randomize. 
*/ tcp_iss = cprng_fast32() & TCP_ISS_RANDOM_MASK; #ifdef TCPISS_DEBUG printf("ISS random 0x%08x, ", tcp_iss); #endif } /* * Add the offset in to the computed value. */ tcp_iss += tcp_iss_seq; #ifdef TCPISS_DEBUG printf("ISS %08x\n", tcp_iss); #endif return tcp_iss; } #if defined(IPSEC) /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec4_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; size_t hdrsiz; /* XXX mapped addr case (tp->t_inpcb) */ if (!tp || !tp->t_template || !(inp = tp->t_inpcb)) return 0; switch (tp->t_family) { case AF_INET: /* XXX: should use correct direction. */ hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp); break; default: hdrsiz = 0; break; } return hdrsiz; } #ifdef INET6 size_t ipsec6_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; size_t hdrsiz; if (!tp || !tp->t_template || !(inp = tp->t_inpcb)) return 0; switch (tp->t_family) { case AF_INET6: /* XXX: should use correct direction. */ hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp); break; case AF_INET: /* mapped address case - tricky */ default: hdrsiz = 0; break; } return hdrsiz; } #endif #endif /*IPSEC*/ /* * Determine the length of the TCP options for this connection. * * XXX: What do we do for SACK, when we add that? Just reserve * all of the space? Otherwise we can't exactly be incrementing * cwnd by an amount that varies depending on the amount we last * had to SACK! */ u_int tcp_optlen(struct tcpcb *tp) { u_int optlen; optlen = 0; if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == (TF_REQ_TSTMP | TF_RCVD_TSTMP)) optlen += TCPOLEN_TSTAMP_APPA; #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) optlen += TCPOLEN_SIGLEN; #endif return optlen; } u_int tcp_hdrsz(struct tcpcb *tp) { u_int hlen; switch (tp->t_family) { #ifdef INET6 case AF_INET6: hlen = sizeof(struct ip6_hdr); break; #endif case AF_INET: hlen = sizeof(struct ip); break; default: hlen = 0; break; } hlen += sizeof(struct tcphdr); if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) hlen += TCPOLEN_TSTAMP_APPA; #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) hlen += TCPOLEN_SIGLEN; #endif return hlen; } void tcp_statinc(u_int stat) { KASSERT(stat < TCP_NSTATS); TCP_STATINC(stat); } void tcp_statadd(u_int stat, uint64_t val) { KASSERT(stat < TCP_NSTATS); TCP_STATADD(stat, val); }
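The advertisement rule in tcp_mss_to_advertise() above reduces to "largest attached MTU minus the fixed IP and TCP header sizes", clamped to at least tcp_mssdflt. A minimal standalone sketch of that arithmetic, assuming option-less 20-byte IPv4, 40-byte IPv6, and 20-byte TCP headers; the function name is illustrative only and not part of the kernel API:

/* Illustrative only: mirrors the "max(if mtu) - ip header - tcp header"
 * rule described in tcp_mss_to_advertise(); the kernel uses
 * sizeof(struct ip), sizeof(struct ip6_hdr) and sizeof(struct tcphdr),
 * and additionally clamps the result to at least tcp_mssdflt. */
static unsigned long
example_mss_to_advertise(unsigned long mtu, int is_ipv6)
{
	unsigned long hdrsiz = (is_ipv6 ? 40 : 20) + 20;	/* IP + TCP */

	return (mtu > hdrsiz) ? mtu - hdrsiz : mtu;
}

For an Ethernet-sized MTU of 1500 this gives 1460 for IPv4 and 1440 for IPv6, which is why tcp_init_win_max[] above is expressed in multiples of 1460.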
/* $NetBSD: subr_ipi.c,v 1.11 2023/02/24 11:02:27 riastradh Exp $ */ /*- * Copyright (c) 2014 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Inter-processor interrupt (IPI) interface: asynchronous IPIs to * invoke functions with a constant argument and synchronous IPIs * with the cross-call support.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_ipi.c,v 1.11 2023/02/24 11:02:27 riastradh Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/atomic.h> #include <sys/evcnt.h> #include <sys/cpu.h> #include <sys/ipi.h> #include <sys/intr.h> #include <sys/kcpuset.h> #include <sys/kmem.h> #include <sys/lock.h> #include <sys/mutex.h> /* * An array of the IPI handlers used for asynchronous invocation. * The lock protects the slot allocation. */ typedef struct { ipi_func_t func; void * arg; } ipi_intr_t; static kmutex_t ipi_mngmt_lock; static ipi_intr_t ipi_intrs[IPI_MAXREG] __cacheline_aligned; /* * Per-CPU mailbox for IPI messages: it is a single cache line storing * up to IPI_MSG_MAX messages. This interface is built on top of the * synchronous IPIs. */ #define IPI_MSG_SLOTS (CACHE_LINE_SIZE / sizeof(ipi_msg_t *)) #define IPI_MSG_MAX IPI_MSG_SLOTS typedef struct { ipi_msg_t * msg[IPI_MSG_SLOTS]; } ipi_mbox_t; /* Mailboxes for the synchronous IPIs. */ static ipi_mbox_t * ipi_mboxes __read_mostly; static struct evcnt ipi_mboxfull_ev __cacheline_aligned; static void ipi_msg_cpu_handler(void *); /* Handler for the synchronous IPIs - it must be zero. */ #define IPI_SYNCH_ID 0 #ifndef MULTIPROCESSOR #define cpu_ipi(ci) KASSERT(ci == NULL) #endif void ipi_sysinit(void) { mutex_init(&ipi_mngmt_lock, MUTEX_DEFAULT, IPL_NONE); memset(ipi_intrs, 0, sizeof(ipi_intrs)); /* * Register the handler for synchronous IPIs. This mechanism * is built on top of the asynchronous interface. Slot zero is * reserved permanently; it is also handy to use zero as a failure * for other registers (as it is potentially less error-prone). */ ipi_intrs[IPI_SYNCH_ID].func = ipi_msg_cpu_handler; evcnt_attach_dynamic(&ipi_mboxfull_ev, EVCNT_TYPE_MISC, NULL, "ipi", "full"); } void ipi_percpu_init(void) { const size_t len = ncpu * sizeof(ipi_mbox_t); /* Initialise the per-CPU bit fields. */ for (u_int i = 0; i < ncpu; i++) { struct cpu_info *ci = cpu_lookup(i); memset(&ci->ci_ipipend, 0, sizeof(ci->ci_ipipend)); } /* Allocate per-CPU IPI mailboxes. */ ipi_mboxes = kmem_zalloc(len, KM_SLEEP); KASSERT(ipi_mboxes != NULL); } /* * ipi_register: register an asynchronous IPI handler. * * => Returns IPI ID which is greater than zero; on failure - zero. */ u_int ipi_register(ipi_func_t func, void *arg) { mutex_enter(&ipi_mngmt_lock); for (u_int i = 0; i < IPI_MAXREG; i++) { if (ipi_intrs[i].func == NULL) { /* Register the function. */ ipi_intrs[i].func = func; ipi_intrs[i].arg = arg; mutex_exit(&ipi_mngmt_lock); KASSERT(i != IPI_SYNCH_ID); return i; } } mutex_exit(&ipi_mngmt_lock); printf("WARNING: ipi_register: table full, increase IPI_MAXREG\n"); return 0; } /* * ipi_unregister: release the IPI handler given the ID. */ void ipi_unregister(u_int ipi_id) { ipi_msg_t ipimsg = { .func = __FPTRCAST(ipi_func_t, nullop) }; KASSERT(ipi_id != IPI_SYNCH_ID); KASSERT(ipi_id < IPI_MAXREG); /* Release the slot. */ mutex_enter(&ipi_mngmt_lock); KASSERT(ipi_intrs[ipi_id].func != NULL); ipi_intrs[ipi_id].func = NULL; /* Ensure that there are no IPIs in flight. */ kpreempt_disable(); ipi_broadcast(&ipimsg, false); ipi_wait(&ipimsg); kpreempt_enable(); mutex_exit(&ipi_mngmt_lock); } /* * ipi_mark_pending: internal routine to mark an IPI pending on the * specified CPU (which might be curcpu()). 
*/ static bool ipi_mark_pending(u_int ipi_id, struct cpu_info *ci) { const u_int i = ipi_id >> IPI_BITW_SHIFT; const uint32_t bitm = 1U << (ipi_id & IPI_BITW_MASK); KASSERT(ipi_id < IPI_MAXREG); KASSERT(kpreempt_disabled()); /* Mark as pending and return true if not previously marked. */ if ((atomic_load_acquire(&ci->ci_ipipend[i]) & bitm) == 0) { membar_release(); atomic_or_32(&ci->ci_ipipend[i], bitm); return true; } return false; } /* * ipi_trigger: asynchronously send an IPI to the specified CPU. */ void ipi_trigger(u_int ipi_id, struct cpu_info *ci) { KASSERT(curcpu() != ci); if (ipi_mark_pending(ipi_id, ci)) { cpu_ipi(ci); } } /* * ipi_trigger_multi_internal: the guts of ipi_trigger_multi() and * ipi_trigger_broadcast(). */ static void ipi_trigger_multi_internal(u_int ipi_id, const kcpuset_t *target, bool skip_self) { const cpuid_t selfid = cpu_index(curcpu()); CPU_INFO_ITERATOR cii; struct cpu_info *ci; KASSERT(kpreempt_disabled()); KASSERT(target != NULL); for (CPU_INFO_FOREACH(cii, ci)) { const cpuid_t cpuid = cpu_index(ci); if (!kcpuset_isset(target, cpuid) || cpuid == selfid) { continue; } ipi_trigger(ipi_id, ci); } if (!skip_self && kcpuset_isset(target, selfid)) { ipi_mark_pending(ipi_id, curcpu()); int s = splhigh(); ipi_cpu_handler(); splx(s); } } /* * ipi_trigger_multi: same as ipi_trigger() but sends to the multiple * CPUs given the target CPU set. */ void ipi_trigger_multi(u_int ipi_id, const kcpuset_t *target) { ipi_trigger_multi_internal(ipi_id, target, false); } /* * ipi_trigger_broadcast: same as ipi_trigger_multi() to kcpuset_attached, * optionally skipping the sending CPU. */ void ipi_trigger_broadcast(u_int ipi_id, bool skip_self) { ipi_trigger_multi_internal(ipi_id, kcpuset_attached, skip_self); } /* * put_msg: insert message into the mailbox. * * Caller is responsible for issuing membar_release first. */ static inline void put_msg(ipi_mbox_t *mbox, ipi_msg_t *msg) { int count = SPINLOCK_BACKOFF_MIN; again: for (u_int i = 0; i < IPI_MSG_MAX; i++) { if (atomic_cas_ptr(&mbox->msg[i], NULL, msg) == NULL) { return; } } /* All slots are full: we have to spin-wait. */ ipi_mboxfull_ev.ev_count++; SPINLOCK_BACKOFF(count); goto again; } /* * ipi_cpu_handler: the IPI handler. */ void ipi_cpu_handler(void) { struct cpu_info * const ci = curcpu(); /* * Handle asynchronous IPIs: inspect per-CPU bit field, extract * IPI ID numbers and execute functions in those slots. */ for (u_int i = 0; i < IPI_BITWORDS; i++) { uint32_t pending, bit; if (atomic_load_relaxed(&ci->ci_ipipend[i]) == 0) { continue; } pending = atomic_swap_32(&ci->ci_ipipend[i], 0); membar_acquire(); while ((bit = ffs(pending)) != 0) { const u_int ipi_id = (i << IPI_BITW_SHIFT) | --bit; ipi_intr_t *ipi_hdl = &ipi_intrs[ipi_id]; pending &= ~(1U << bit); KASSERT(ipi_hdl->func != NULL); ipi_hdl->func(ipi_hdl->arg); } } } /* * ipi_msg_cpu_handler: handle synchronous IPIs - iterate mailbox, * execute the passed functions and acknowledge the messages. */ static void ipi_msg_cpu_handler(void *arg __unused) { const struct cpu_info * const ci = curcpu(); ipi_mbox_t *mbox = &ipi_mboxes[cpu_index(ci)]; for (u_int i = 0; i < IPI_MSG_MAX; i++) { ipi_msg_t *msg; /* Get the message. */ if ((msg = atomic_load_acquire(&mbox->msg[i])) == NULL) { continue; } atomic_store_relaxed(&mbox->msg[i], NULL); /* Execute the handler. */ KASSERT(msg->func); msg->func(msg->arg); /* Ack the request. */ membar_release(); atomic_dec_uint(&msg->_pending); } } /* * ipi_unicast: send an IPI to a single CPU. 
* * => The CPU must be remote; must not be local. * => The caller must ipi_wait() on the message for completion. */ void ipi_unicast(ipi_msg_t *msg, struct cpu_info *ci) { const cpuid_t id = cpu_index(ci); KASSERT(msg->func != NULL); KASSERT(kpreempt_disabled()); KASSERT(curcpu() != ci); msg->_pending = 1; membar_release(); put_msg(&ipi_mboxes[id], msg); ipi_trigger(IPI_SYNCH_ID, ci); } /* * ipi_multicast: send an IPI to each CPU in the specified set. * * => The caller must ipi_wait() on the message for completion. */ void ipi_multicast(ipi_msg_t *msg, const kcpuset_t *target) { const struct cpu_info * const self = curcpu(); CPU_INFO_ITERATOR cii; struct cpu_info *ci; u_int local; KASSERT(msg->func != NULL); KASSERT(kpreempt_disabled()); local = !!kcpuset_isset(target, cpu_index(self)); msg->_pending = kcpuset_countset(target) - local; membar_release(); for (CPU_INFO_FOREACH(cii, ci)) { cpuid_t id; if (__predict_false(ci == self)) { continue; } id = cpu_index(ci); if (!kcpuset_isset(target, id)) { continue; } put_msg(&ipi_mboxes[id], msg); ipi_trigger(IPI_SYNCH_ID, ci); } if (local) { msg->func(msg->arg); } } /* * ipi_broadcast: send an IPI to all CPUs. * * => The caller must ipi_wait() on the message for completion. */ void ipi_broadcast(ipi_msg_t *msg, bool skip_self) { const struct cpu_info * const self = curcpu(); CPU_INFO_ITERATOR cii; struct cpu_info *ci; KASSERT(msg->func != NULL); KASSERT(kpreempt_disabled()); msg->_pending = ncpu - 1; membar_release(); /* Broadcast IPIs for remote CPUs. */ for (CPU_INFO_FOREACH(cii, ci)) { cpuid_t id; if (__predict_false(ci == self)) { continue; } id = cpu_index(ci); put_msg(&ipi_mboxes[id], msg); ipi_trigger(IPI_SYNCH_ID, ci); } if (!skip_self) { /* Finally, execute locally. */ msg->func(msg->arg); } } /* * ipi_wait: spin-wait until the message is processed. */ void ipi_wait(ipi_msg_t *msg) { int count = SPINLOCK_BACKOFF_MIN; while (atomic_load_acquire(&msg->_pending)) { KASSERT(atomic_load_relaxed(&msg->_pending) < ncpu); SPINLOCK_BACKOFF(count); } }
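/*
 * A standalone userland sketch (not kernel code) of the pending-IPI
 * bitmap used by ipi_mark_pending() and ipi_cpu_handler() above: an IPI
 * ID is split into a word index (high bits) and a bit position (low
 * bits); the drain side atomically swaps each word to zero and walks the
 * set bits with ffs().  The word count and shift/mask values below are
 * illustrative assumptions, not the kernel's definitions.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <strings.h>			/* ffs() */

#define BITW_SHIFT	5		/* 32 IDs per 32-bit word */
#define BITW_MASK	31
#define BITWORDS	4		/* supports IDs 0..127 */

static _Atomic uint32_t pend[BITWORDS];

static void
mark_pending(unsigned id)
{
	atomic_fetch_or(&pend[id >> BITW_SHIFT], 1U << (id & BITW_MASK));
}

static void
drain(void)
{
	for (unsigned w = 0; w < BITWORDS; w++) {
		uint32_t bits = atomic_exchange(&pend[w], 0);
		int bit;

		while ((bit = ffs((int)bits)) != 0) {
			unsigned id = (w << BITW_SHIFT) | (unsigned)(bit - 1);

			bits &= ~(1U << (bit - 1));
			printf("dispatch IPI id %u\n", id);
		}
	}
}

int
main(void)
{
	mark_pending(3);
	mark_pending(40);		/* lands in the second word */
	drain();
	return 0;
}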
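/*
 * A standalone userland sketch (not kernel code) of the synchronous-IPI
 * completion protocol above: the sender primes msg->_pending with the
 * number of target CPUs, each handler decrements it after running the
 * function, and ipi_wait() spins until it drops to zero.  Threads stand
 * in for remote CPUs and sched_yield() for SPINLOCK_BACKOFF.
 */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

#define NTARGETS 4

static _Atomic unsigned pending;

static void *
remote_cpu(void *arg)
{
	printf("cpu %ld: running IPI function\n", (long)arg);
	atomic_fetch_sub_explicit(&pending, 1, memory_order_release);
	return NULL;
}

int
main(void)
{
	pthread_t t[NTARGETS];

	atomic_store(&pending, NTARGETS);	/* like msg->_pending = ncpu - 1 */
	for (long i = 0; i < NTARGETS; i++)
		pthread_create(&t[i], NULL, remote_cpu, (void *)i);

	/* The ipi_wait() part: spin until every target has acknowledged. */
	while (atomic_load_explicit(&pending, memory_order_acquire) != 0)
		sched_yield();
	printf("all targets done\n");

	for (int i = 0; i < NTARGETS; i++)
		pthread_join(t[i], NULL);
	return 0;
}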
/* $NetBSD: vfs_syscalls_50.c,v 1.26 2021/08/15 07:57:46 christos Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christos Zoulas. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_50.c,v 1.26 2021/08/15 07:57:46 christos Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #include "opt_quota.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/socketvar.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/uio.h> #include <sys/dirent.h> #include <sys/kauth.h> #include <sys/time.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <sys/vfs_syscalls.h> #ifndef LFS #define LFS #endif #include <sys/syscallargs.h> #include <ufs/lfs/lfs_extern.h> #include <compat/common/compat_util.h> #include <compat/common/compat_mod.h> #include <compat/sys/time.h> #include <compat/sys/stat.h> #include <compat/sys/dirent.h> #include <compat/sys/mount.h> static const struct syscall_package vfs_syscalls_50_syscalls[] = { { SYS_compat_50___stat30, 0, (sy_call_t *)compat_50_sys___stat30 }, { SYS_compat_50___fstat30, 0, (sy_call_t *)compat_50_sys___fstat30 }, { SYS_compat_50___lstat30, 0, (sy_call_t *)compat_50_sys___lstat30 }, { SYS_compat_50___fhstat40, 0, (sy_call_t *)compat_50_sys___fhstat40 }, { SYS_compat_50_utimes, 0, (sy_call_t *)compat_50_sys_utimes }, { SYS_compat_50_lfs_segwait, 0, (sy_call_t *)compat_50_sys_lfs_segwait } , { SYS_compat_50_futimes, 0, (sy_call_t *)compat_50_sys_futimes }, { SYS_compat_50_lutimes, 0, (sy_call_t *)compat_50_sys_lutimes }, { SYS_compat_50_mknod, 0, (sy_call_t *)compat_50_sys_mknod }, { 0, 0, NULL } }; /* * Convert from a new to an old stat structure. */ static void cvtstat(struct stat30 *ost, const struct stat *st) { /* Handle any padding. */ memset(ost, 0, sizeof(*ost)); ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; timespec_to_timespec50(&st->st_atimespec, &ost->st_atimespec); timespec_to_timespec50(&st->st_mtimespec, &ost->st_mtimespec); timespec_to_timespec50(&st->st_ctimespec, &ost->st_ctimespec); timespec_to_timespec50(&st->st_birthtimespec, &ost->st_birthtimespec); ost->st_size = st->st_size; ost->st_blocks = st->st_blocks; ost->st_blksize = st->st_blksize; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; memset(ost->st_spare, 0, sizeof(ost->st_spare)); } /* * Get file status; this version follows links. */ /* ARGSUSED */ int compat_50_sys___stat30(struct lwp *l, const struct compat_50_sys___stat30_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(struct stat30 *) ub; } */ struct stat sb; struct stat30 osb; int error; error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb); if (error) return error; cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap, ub), sizeof(osb)); } /* * Get file status; this version does not follow links. */ /* ARGSUSED */ int compat_50_sys___lstat30(struct lwp *l, const struct compat_50_sys___lstat30_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(struct stat30 *) ub; } */ struct stat sb; struct stat30 osb; int error; error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb); if (error) return error; cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap, ub), sizeof(osb)); } /* * Return status information about a file descriptor. 
*/ /* ARGSUSED */ int compat_50_sys___fstat30(struct lwp *l, const struct compat_50_sys___fstat30_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(struct stat30 *) sb; } */ struct stat sb; struct stat30 osb; int error; error = do_sys_fstat(SCARG(uap, fd), &sb); if (error) return error; cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap, sb), sizeof(osb)); } /* ARGSUSED */ int compat_50_sys___fhstat40(struct lwp *l, const struct compat_50_sys___fhstat40_args *uap, register_t *retval) { /* { syscallarg(const void *) fhp; syscallarg(size_t) fh_size; syscallarg(struct stat30 *) sb; } */ struct stat sb; struct stat30 osb; int error; error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb); if (error) return error; cvtstat(&osb, &sb); return copyout(&osb, SCARG(uap, sb), sizeof(osb)); } static int compat_50_do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag, const struct timeval50 *tptr) { struct timeval tv[2], *tvp; struct timeval50 tv50[2]; if (tptr) { int error = copyin(tptr, tv50, sizeof(tv50)); if (error) return error; timeval50_to_timeval(&tv50[0], &tv[0]); timeval50_to_timeval(&tv50[1], &tv[1]); tvp = tv; } else tvp = NULL; return do_sys_utimes(l, vp, path, flag, tvp, UIO_SYSSPACE); } /* * Set the access and modification times given a path name; this * version follows links. */ /* ARGSUSED */ int compat_50_sys_utimes(struct lwp *l, const struct compat_50_sys_utimes_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const struct timeval50 *) tptr; } */ return compat_50_do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW, SCARG(uap, tptr)); } /* * Set the access and modification times given a file descriptor. */ /* ARGSUSED */ int compat_50_sys_futimes(struct lwp *l, const struct compat_50_sys_futimes_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const struct timeval50 *) tptr; } */ int error; struct file *fp; /* fd_getvnode() will use the descriptor for us */ if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0) return error; error = compat_50_do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr)); fd_putfile(SCARG(uap, fd)); return error; } /* * Set the access and modification times given a path name; this * version does not follow links. */ int compat_50_sys_lutimes(struct lwp *l, const struct compat_50_sys_lutimes_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(const struct timeval50 *) tptr; } */ return compat_50_do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW, SCARG(uap, tptr)); } int compat_50_sys_lfs_segwait(struct lwp *l, const struct compat_50_sys_lfs_segwait_args *uap, register_t *retval) { /* { syscallarg(fsid_t *) fsidp; syscallarg(struct timeval50 *) tv; } */ #ifdef notyet /* XXX need to check presence of LFS at run-time XXX */ struct timeval atv; struct timeval50 atv50; fsid_t fsid; int error; /* XXX need we be su to segwait? 
*/ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS, KAUTH_REQ_SYSTEM_LFS_SEGWAIT, NULL, NULL, NULL); if (error) return (error); if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) return (error); if (SCARG(uap, tv)) { error = copyin(SCARG(uap, tv), &atv50, sizeof(atv50)); if (error) return (error); timeval50_to_timeval(&atv50, &atv); if (itimerfix(&atv)) return (EINVAL); } else /* NULL or invalid */ atv.tv_sec = atv.tv_usec = 0; return lfs_segwait(&fsid, &atv); #else return ENOSYS; #endif } int compat_50_sys_mknod(struct lwp *l, const struct compat_50_sys_mknod_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(mode_t) mode; syscallarg(uint32_t) dev; } */ return do_sys_mknod(l, SCARG(uap, path), SCARG(uap, mode), SCARG(uap, dev), UIO_USERSPACE); } int vfs_syscalls_50_init(void) { return syscall_establish(NULL, vfs_syscalls_50_syscalls); } int vfs_syscalls_50_fini(void) { return syscall_disestablish(NULL, vfs_syscalls_50_syscalls); }
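/*
 * A standalone userland sketch (not kernel code) of the cvtstat()
 * pattern earlier in this file: convert a current structure to the
 * layout an old binary expects, zeroing the destination first so padding
 * and spare fields never leak kernel memory.  The two structs are
 * illustrative stand-ins, not the real stat/stat30 layouts.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct new_stat {			/* "current" layout */
	uint64_t ino;
	int64_t  atime_sec;		/* 64-bit time_t */
	long     atime_nsec;
	uint64_t size;
};

struct old_stat {			/* what the old ABI expects */
	uint32_t ino;
	int32_t  atime_sec;		/* 32-bit seconds, as in timespec50 */
	long     atime_nsec;
	uint32_t size;
	uint32_t spare[2];
};

static void
cvt_stat(struct old_stat *ost, const struct new_stat *st)
{
	memset(ost, 0, sizeof(*ost));	/* handle padding and spare fields */
	ost->ino = (uint32_t)st->ino;
	ost->atime_sec = (int32_t)st->atime_sec;	/* truncates after 2038 */
	ost->atime_nsec = st->atime_nsec;
	ost->size = (uint32_t)st->size;
}

int
main(void)
{
	struct new_stat st = { .ino = 7, .atime_sec = 1700000000,
	    .atime_nsec = 42, .size = 4096 };
	struct old_stat ost;

	cvt_stat(&ost, &st);
	printf("ino %u size %u atime %d\n", ost.ino, ost.size, ost.atime_sec);
	return 0;
}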
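/*
 * A standalone userland sketch (not kernel code) of the widening step in
 * compat_50_do_sys_utimes() earlier in this file: the old userland passes
 * two struct timeval50 values (32-bit seconds); the kernel copies them in
 * and converts each to the native struct timeval before calling the real
 * implementation.  The timeval50 layout below is an assumption for the
 * demo.
 */
#include <stdint.h>
#include <stdio.h>
#include <sys/time.h>

struct timeval50 {			/* illustrative old layout */
	int32_t tv_sec;
	int32_t tv_usec;
};

static void
timeval50_to_timeval_demo(const struct timeval50 *ot, struct timeval *t)
{
	t->tv_sec = ot->tv_sec;		/* widen 32-bit seconds to time_t */
	t->tv_usec = ot->tv_usec;
}

int
main(void)
{
	struct timeval50 old[2] = { { 1000, 1 }, { 2000, 2 } };	/* atime, mtime */
	struct timeval tv[2];

	for (int i = 0; i < 2; i++)
		timeval50_to_timeval_demo(&old[i], &tv[i]);
	printf("atime=%lld mtime=%lld\n",
	    (long long)tv[0].tv_sec, (long long)tv[1].tv_sec);
	return 0;
}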
/* $NetBSD: secmodel_securelevel.c,v 1.37 2020/12/05 17:33:53 thorpej Exp $ */ /*- * Copyright (c) 2006 Elad Efrat <elad@NetBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file contains kauth(9) listeners needed to implement the traditional * NetBSD securelevel. * * The securelevel is a system-global indication on what operations are * allowed or not. It affects all users, including root. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: secmodel_securelevel.c,v 1.37 2020/12/05 17:33:53 thorpej Exp $"); #ifdef _KERNEL_OPT #include "opt_insecure.h" #endif /* _KERNEL_OPT */ #include <sys/types.h> #include <sys/param.h> #include <sys/kauth.h> #include <sys/conf.h> #include <sys/mount.h> #include <sys/sysctl.h> #include <sys/vnode.h> #include <sys/module.h> #include <miscfs/specfs/specdev.h> #include <secmodel/secmodel.h> #include <secmodel/securelevel/securelevel.h> MODULE(MODULE_CLASS_SECMODEL, securelevel, NULL); static int securelevel; static kauth_listener_t l_system, l_process, l_network, l_machdep, l_device, l_vnode; static secmodel_t securelevel_sm; /* * Sysctl helper routine for securelevel. Ensures that the value only rises * unless the caller is init. */ int secmodel_securelevel_sysctl(SYSCTLFN_ARGS) { int newsecurelevel, error; struct sysctlnode node; newsecurelevel = securelevel; node = *rnode; node.sysctl_data = &newsecurelevel; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); if ((newsecurelevel < securelevel) && (l->l_proc != initproc)) return (EPERM); securelevel = newsecurelevel; return (error); } SYSCTL_SETUP(sysctl_security_securelevel_setup, "securelevel sysctl") { const struct sysctlnode *rnode, *rnode2; sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "models", NULL, NULL, 0, NULL, 0, CTL_SECURITY, CTL_CREATE, CTL_EOL); /* Compatibility: security.models.bsd44 */ rnode2 = rnode; sysctl_createv(clog, 0, &rnode2, &rnode2, CTLFLAG_PERMANENT, CTLTYPE_NODE, "bsd44", NULL, NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); /* Compatibility: security.models.bsd44.securelevel */ sysctl_createv(clog, 0, &rnode2, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "securelevel", SYSCTL_DESCR("System security level"), secmodel_securelevel_sysctl, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "securelevel", NULL, NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "name", NULL, NULL, 0, __UNCONST(SECMODEL_SECURELEVEL_NAME), 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "securelevel", SYSCTL_DESCR("System security level"), secmodel_securelevel_sysctl, 0, NULL, 0, CTL_CREATE, CTL_EOL); /* Compatibility: kern.securelevel */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "securelevel", SYSCTL_DESCR("System security level"), secmodel_securelevel_sysctl, 0, NULL, 0, CTL_KERN, KERN_SECURELVL, CTL_EOL); } void secmodel_securelevel_init(void) { #ifdef INSECURE securelevel = -1; #else securelevel = 0; #endif /* INSECURE */ } void secmodel_securelevel_start(void) { l_system = 
kauth_listen_scope(KAUTH_SCOPE_SYSTEM, secmodel_securelevel_system_cb, NULL); l_process = kauth_listen_scope(KAUTH_SCOPE_PROCESS, secmodel_securelevel_process_cb, NULL); l_network = kauth_listen_scope(KAUTH_SCOPE_NETWORK, secmodel_securelevel_network_cb, NULL); l_machdep = kauth_listen_scope(KAUTH_SCOPE_MACHDEP, secmodel_securelevel_machdep_cb, NULL); l_device = kauth_listen_scope(KAUTH_SCOPE_DEVICE, secmodel_securelevel_device_cb, NULL); l_vnode = kauth_listen_scope(KAUTH_SCOPE_VNODE, secmodel_securelevel_vnode_cb, NULL); } void secmodel_securelevel_stop(void) { kauth_unlisten_scope(l_system); kauth_unlisten_scope(l_process); kauth_unlisten_scope(l_network); kauth_unlisten_scope(l_machdep); kauth_unlisten_scope(l_device); kauth_unlisten_scope(l_vnode); } static int securelevel_eval(const char *what, void *arg, void *ret) { int error = 0; if (strcasecmp(what, "is-securelevel-above") == 0) { int level = (int)(uintptr_t)arg; bool *bp = ret; *bp = (securelevel > level); } else { error = ENOENT; } return error; } static int securelevel_modcmd(modcmd_t cmd, void *arg) { int error = 0; switch (cmd) { case MODULE_CMD_INIT: secmodel_securelevel_init(); error = secmodel_register(&securelevel_sm, SECMODEL_SECURELEVEL_ID, SECMODEL_SECURELEVEL_NAME, NULL, securelevel_eval, NULL); if (error != 0) printf("securelevel_modcmd::init: secmodel_register " "returned %d\n", error); secmodel_securelevel_start(); break; case MODULE_CMD_FINI: secmodel_securelevel_stop(); error = secmodel_deregister(securelevel_sm); if (error != 0) printf("securelevel_modcmd::fini: secmodel_deregister " "returned %d\n", error); break; case MODULE_CMD_AUTOUNLOAD: error = EPERM; break; default: error = ENOTTY; break; } return (error); } /* * kauth(9) listener * * Security model: Traditional NetBSD * Scope: System * Responsibility: Securelevel */ int secmodel_securelevel_system_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; enum kauth_system_req req; result = KAUTH_RESULT_DEFER; req = (enum kauth_system_req)(uintptr_t)arg0; switch (action) { case KAUTH_SYSTEM_CHSYSFLAGS: /* Deprecated. */ if (securelevel > 0) result = KAUTH_RESULT_DENY; break; case KAUTH_SYSTEM_TIME: switch (req) { case KAUTH_REQ_SYSTEM_TIME_RTCOFFSET: if (securelevel > 0) result = KAUTH_RESULT_DENY; break; case KAUTH_REQ_SYSTEM_TIME_SYSTEM: { struct timespec *ts = arg1; struct timespec *delta = arg2; if (securelevel > 1 && time_wraps(ts, delta)) result = KAUTH_RESULT_DENY; break; } default: break; } break; case KAUTH_SYSTEM_MAP_VA_ZERO: if (securelevel > 0) result = KAUTH_RESULT_DENY; break; case KAUTH_SYSTEM_MODULE: if (securelevel > 0) result = KAUTH_RESULT_DENY; break; case KAUTH_SYSTEM_MOUNT: switch (req) { case KAUTH_REQ_SYSTEM_MOUNT_NEW: if (securelevel > 1) result = KAUTH_RESULT_DENY; break; case KAUTH_REQ_SYSTEM_MOUNT_UPDATE: if (securelevel > 1) { struct mount *mp = arg1; u_long flags = (u_long)arg2; /* Can only degrade from read/write to read-only. 
*/ if (flags != (mp->mnt_flag | MNT_RDONLY | MNT_RELOAD | MNT_FORCE | MNT_UPDATE)) result = KAUTH_RESULT_DENY; } break; default: break; } break; case KAUTH_SYSTEM_SYSCTL: switch (req) { case KAUTH_REQ_SYSTEM_SYSCTL_ADD: case KAUTH_REQ_SYSTEM_SYSCTL_DELETE: case KAUTH_REQ_SYSTEM_SYSCTL_DESC: if (securelevel > 0) result = KAUTH_RESULT_DENY; break; default: break; } break; case KAUTH_SYSTEM_SETIDCORE: if (securelevel > 0) result = KAUTH_RESULT_DENY; break; case KAUTH_SYSTEM_DEBUG: default: break; } return (result); } /* * kauth(9) listener * * Security model: Traditional NetBSD * Scope: Process * Responsibility: Securelevel */ int secmodel_securelevel_process_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { struct proc *p; int result; result = KAUTH_RESULT_DEFER; p = arg0; switch (action) { case KAUTH_PROCESS_PROCFS: { enum kauth_process_req req; req = (enum kauth_process_req)(uintptr_t)arg2; switch (req) { case KAUTH_REQ_PROCESS_PROCFS_READ: break; case KAUTH_REQ_PROCESS_PROCFS_RW: case KAUTH_REQ_PROCESS_PROCFS_WRITE: if ((p == initproc) && (securelevel > -1)) result = KAUTH_RESULT_DENY; break; default: break; } break; } case KAUTH_PROCESS_PTRACE: if ((p == initproc) && (securelevel > -1)) result = KAUTH_RESULT_DENY; break; case KAUTH_PROCESS_CORENAME: if (securelevel > 1) result = KAUTH_RESULT_DENY; break; default: break; } return (result); } /* * kauth(9) listener * * Security model: Traditional NetBSD * Scope: Network * Responsibility: Securelevel */ int secmodel_securelevel_network_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; enum kauth_network_req req; result = KAUTH_RESULT_DEFER; req = (enum kauth_network_req)(uintptr_t)arg0; switch (action) { case KAUTH_NETWORK_FIREWALL: switch (req) { case KAUTH_REQ_NETWORK_FIREWALL_FW: case KAUTH_REQ_NETWORK_FIREWALL_NAT: if (securelevel > 1) result = KAUTH_RESULT_DENY; break; default: break; } break; case KAUTH_NETWORK_FORWSRCRT: if (securelevel > 0) result = KAUTH_RESULT_DENY; break; default: break; } return (result); } /* * kauth(9) listener * * Security model: Traditional NetBSD * Scope: Machdep * Responsibility: Securelevel */ int secmodel_securelevel_machdep_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; result = KAUTH_RESULT_DEFER; switch (action) { case KAUTH_MACHDEP_IOPERM_SET: case KAUTH_MACHDEP_IOPL: if (securelevel > 0) result = KAUTH_RESULT_DENY; break; case KAUTH_MACHDEP_UNMANAGEDMEM: if (securelevel > 0) result = KAUTH_RESULT_DENY; break; case KAUTH_MACHDEP_SVS_DISABLE: /* Deprecated. */ if (securelevel > 0) result = KAUTH_RESULT_DENY; break; case KAUTH_MACHDEP_CPU_UCODE_APPLY: if (securelevel > 1) result = KAUTH_RESULT_DENY; break; default: break; } return (result); } /* * kauth(9) listener * * Security model: Traditional NetBSD * Scope: Device * Responsibility: Securelevel */ int secmodel_securelevel_device_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; result = KAUTH_RESULT_DEFER; switch (action) { case KAUTH_DEVICE_RAWIO_SPEC: { struct vnode *vp; enum kauth_device_req req; req = (enum kauth_device_req)(uintptr_t)arg0; vp = arg1; KASSERT(vp != NULL); /* Handle /dev/mem and /dev/kmem. 
*/ if (iskmemvp(vp)) { switch (req) { case KAUTH_REQ_DEVICE_RAWIO_SPEC_READ: break; case KAUTH_REQ_DEVICE_RAWIO_SPEC_WRITE: case KAUTH_REQ_DEVICE_RAWIO_SPEC_RW: if (securelevel > 0) result = KAUTH_RESULT_DENY; break; default: break; } break; } switch (req) { case KAUTH_REQ_DEVICE_RAWIO_SPEC_READ: break; case KAUTH_REQ_DEVICE_RAWIO_SPEC_WRITE: case KAUTH_REQ_DEVICE_RAWIO_SPEC_RW: { int error; error = rawdev_mounted(vp, NULL); /* Not a disk. */ if (error == EINVAL) break; if (error && securelevel > 0) result = KAUTH_RESULT_DENY; if (securelevel > 1) result = KAUTH_RESULT_DENY; break; } default: break; } break; } case KAUTH_DEVICE_RAWIO_PASSTHRU: if (securelevel > 0) { u_long bits; bits = (u_long)arg0; KASSERT(bits != 0); KASSERT((bits & ~KAUTH_REQ_DEVICE_RAWIO_PASSTHRU_ALL) == 0); if (bits & ~KAUTH_REQ_DEVICE_RAWIO_PASSTHRU_READCONF) result = KAUTH_RESULT_DENY; } break; case KAUTH_DEVICE_GPIO_PINSET: if (securelevel > 0) result = KAUTH_RESULT_DENY; break; case KAUTH_DEVICE_RND_ADDDATA_ESTIMATE: if (securelevel > 1) result = KAUTH_RESULT_DENY; break; default: break; } return (result); } int secmodel_securelevel_vnode_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; result = KAUTH_RESULT_DEFER; if ((action & KAUTH_VNODE_WRITE_SYSFLAGS) && (action & KAUTH_VNODE_HAS_SYSFLAGS)) { if (securelevel > 0) result = KAUTH_RESULT_DENY; } return (result); }
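/*
 * A standalone userland sketch (not kernel code) of the policy enforced
 * by secmodel_securelevel_sysctl() earlier in this file: the level may be
 * raised by any writer, but lowered only by init.  The is_init flag
 * stands in for the kernel's l->l_proc == initproc test.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int securelevel = 0;

static int
set_securelevel(int newlevel, bool is_init)
{
	if (newlevel < securelevel && !is_init)
		return EPERM;		/* lowering is reserved for init */
	securelevel = newlevel;
	return 0;
}

int
main(void)
{
	printf("raise to 1:  %d\n", set_securelevel(1, false));	/* 0 (ok) */
	printf("drop to 0:   %d\n", set_securelevel(0, false));	/* EPERM */
	printf("init drops:  %d\n", set_securelevel(0, true));	/* 0 (ok) */
	return 0;
}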
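/*
 * A standalone userland sketch (not kernel code) of the bit test behind
 * "can only degrade from read/write to read-only" in the system listener
 * above: at securelevel > 1 a mount update is allowed only when the
 * requested flag word is exactly the mount's current flags plus RDONLY,
 * RELOAD, FORCE and UPDATE.  The flag values are invented for the demo;
 * only the comparison matters.
 */
#include <stdbool.h>
#include <stdio.h>

#define F_RDONLY 0x01
#define F_RELOAD 0x02
#define F_FORCE  0x04
#define F_UPDATE 0x08
#define F_NOSUID 0x10

static bool
update_allowed(unsigned cur_flags, unsigned req_flags)
{
	return req_flags ==
	    (cur_flags | F_RDONLY | F_RELOAD | F_FORCE | F_UPDATE);
}

int
main(void)
{
	unsigned cur = F_NOSUID;	/* currently read/write, nosuid */

	/* Forced downgrade to read-only: allowed. */
	printf("%d\n", update_allowed(cur,
	    cur | F_RDONLY | F_RELOAD | F_FORCE | F_UPDATE));	/* 1 */

	/* Clearing nosuid at the same time: denied. */
	printf("%d\n", update_allowed(cur,
	    F_RDONLY | F_RELOAD | F_FORCE | F_UPDATE));		/* 0 */

	/* Plain read/write remount: denied. */
	printf("%d\n", update_allowed(cur, cur | F_UPDATE));	/* 0 */
	return 0;
}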
/* $NetBSD: efs_vfsops.c,v 1.30 2022/03/19 13:53:32 hannken Exp $ */ /* * Copyright (c) 2006 Stephen M. Rumble <rumble@ephemeral.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: efs_vfsops.c,v 1.30 2022/03/19 13:53:32 hannken Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/fstypes.h> #include <sys/vnode.h> #include <sys/buf.h> #include <sys/namei.h> #include <sys/fcntl.h> #include <sys/stat.h> #include <sys/kauth.h> #include <sys/proc.h> #include <sys/module.h> #include <miscfs/genfs/genfs_node.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <fs/efs/efs.h> #include <fs/efs/efs_sb.h> #include <fs/efs/efs_dir.h> #include <fs/efs/efs_genfs.h> #include <fs/efs/efs_mount.h> #include <fs/efs/efs_extent.h> #include <fs/efs/efs_dinode.h> #include <fs/efs/efs_inode.h> #include <fs/efs/efs_subr.h> MODULE(MODULE_CLASS_VFS, efs, NULL); MALLOC_JUSTDEFINE(M_EFSMNT, "efsmnt", "efs mount structure"); MALLOC_JUSTDEFINE(M_EFSINO, "efsino", "efs in-core inode structure"); MALLOC_JUSTDEFINE(M_EFSTMP, "efstmp", "efs temporary allocations"); extern int (**efs_vnodeop_p)(void *); /* for getnewvnode() */ extern int (**efs_specop_p)(void *); /* for getnewvnode() */ extern int (**efs_fifoop_p)(void *); /* for getnewvnode() */ static int efs_statvfs(struct mount *, struct statvfs *); /* * efs_mount and efs_mountroot common functions. */ static int efs_mount_common(struct mount *mp, const char *path, struct vnode *devvp, struct efs_args *args) { int err; struct buf *bp; const char *why; struct efs_mount *emp; struct lwp *l = curlwp; emp = malloc(sizeof(*emp), M_EFSMNT, M_WAITOK); emp->em_dev = devvp->v_rdev; emp->em_devvp = devvp; emp->em_mnt = mp; /* read in the superblock */ err = efs_bread(emp, EFS_BB_SB, l, &bp); if (err) { EFS_DPRINTF(("superblock read failed\n")); free(emp, M_EFSMNT); return (err); } memcpy(&emp->em_sb, bp->b_data, sizeof(emp->em_sb)); brelse(bp, 0); /* validate the superblock */ if (efs_sb_validate(&emp->em_sb, &why)) { printf("efs: invalid superblock: %s\n", why); if (!(mp->mnt_flag & MNT_FORCE)) { free(emp, M_EFSMNT); return (EIO); } } /* check that it's clean */ if (be16toh(emp->em_sb.sb_dirty) != EFS_SB_CLEAN) { printf("efs: filesystem is dirty (sb_dirty = 0x%x); please " "run fsck_efs(8)\n", be16toh(emp->em_sb.sb_dirty)); /* XXX - default to readonly unless forced?? 
*/ } /* if the superblock was replicated, verify that it is the same */ if (be32toh(emp->em_sb.sb_replsb) != 0) { struct buf *rbp; bool skip = false; err = efs_bread(emp, be32toh(emp->em_sb.sb_replsb), l, &rbp); if (err) { printf("efs: read of superblock replicant failed; " "please run fsck_efs(8)\n"); if (mp->mnt_flag & MNT_FORCE) { skip = true; } else { free(emp, M_EFSMNT); return (err); } } if (!skip) { if (memcmp(rbp->b_data, &emp->em_sb, sizeof(emp->em_sb))) { printf("efs: superblock differs from " "replicant; please run fsck_efs(8)\n"); if (!(mp->mnt_flag & MNT_FORCE)) { brelse(rbp, 0); free(emp, M_EFSMNT); return (EIO); } } brelse(rbp, 0); } } /* ensure we can read last block */ err = efs_bread(emp, be32toh(emp->em_sb.sb_size) - 1, l, &bp); if (err) { printf("efs: cannot access all filesystem blocks; please run " "fsck_efs(8)\n"); if (!(mp->mnt_flag & MNT_FORCE)) { free(emp, M_EFSMNT); return (err); } } else { brelse(bp, 0); } mp->mnt_data = emp; mp->mnt_flag |= MNT_LOCAL; mp->mnt_fs_bshift = EFS_BB_SHFT; mp->mnt_dev_bshift = DEV_BSHIFT; vfs_getnewfsid(mp); efs_statvfs(mp, &mp->mnt_stat); err = set_statvfs_info(path, UIO_USERSPACE, args->fspec, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); if (err) free(emp, M_EFSMNT); return (err); } /* * mount syscall vfsop. * * Returns 0 on success. */ static int efs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; struct efs_args *args = data; struct pathbuf *pb; struct nameidata devnd; struct efs_mount *emp; struct vnode *devvp; int err, mode; if (args == NULL) return EINVAL; if (*data_len < sizeof *args) return EINVAL; if (mp->mnt_flag & MNT_GETARGS) { if ((emp = VFSTOEFS(mp)) == NULL) return (EIO); args->fspec = NULL; args->version = EFS_MNT_VERSION; *data_len = sizeof *args; return 0; } if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); /* XXX read-only */ /* look up our device's vnode. it is returned locked */ err = pathbuf_copyin(args->fspec, &pb); if (err) { return err; } NDINIT(&devnd, LOOKUP, FOLLOW | LOCKLEAF, pb); if ((err = namei(&devnd))) { pathbuf_destroy(pb); return (err); } devvp = devnd.ni_vp; pathbuf_destroy(pb); if (devvp->v_type != VBLK) { vput(devvp); return (ENOTBLK); } /* XXX - rdonly */ mode = FREAD; /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ err = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_DEVICE, mp, devvp, KAUTH_ARG(VREAD)); if (err) { vput(devvp); return (err); } if ((err = VOP_OPEN(devvp, mode, l->l_cred))) { vput(devvp); return (err); } err = efs_mount_common(mp, path, devvp, args); if (err) { VOP_CLOSE(devvp, mode, l->l_cred); vput(devvp); return (err); } VOP_UNLOCK(devvp); return (0); } /* * Initialisation routine. * * Returns 0 on success. */ static int efs_start(struct mount *mp, int flags) { return (0); } /* * unmount syscall vfsop. * * Returns 0 on success. */ static int efs_unmount(struct mount *mp, int mntflags) { struct efs_mount *emp; struct lwp *l = curlwp; int err; emp = VFSTOEFS(mp); err = vflush(mp, NULL, (mntflags & MNT_FORCE) ? FORCECLOSE : 0); if (err) return (err); cache_purgevfs(mp); vn_lock(emp->em_devvp, LK_EXCLUSIVE | LK_RETRY); err = VOP_CLOSE(emp->em_devvp, FREAD, l->l_cred); vput(emp->em_devvp); free(mp->mnt_data, M_EFSMNT); mp->mnt_data = NULL; mp->mnt_flag &= ~MNT_LOCAL; return (err); } /* * Return the root vnode. * * Returns 0 on success. 
*/ static int efs_root(struct mount *mp, int lktype, struct vnode **vpp) { int err; struct vnode *vp; if ((err = VFS_VGET(mp, EFS_ROOTINO, lktype, &vp))) return (err); *vpp = vp; return (0); } /* * statvfs syscall vfsop. * * Returns 0 on success. */ static int efs_statvfs(struct mount *mp, struct statvfs *sbp) { struct efs_mount *emp; emp = VFSTOEFS(mp); sbp->f_bsize = EFS_BB_SIZE; sbp->f_frsize = EFS_BB_SIZE; sbp->f_iosize = EFS_BB_SIZE; sbp->f_blocks = be32toh(emp->em_sb.sb_size); sbp->f_bfree = be32toh(emp->em_sb.sb_tfree); sbp->f_bavail = sbp->f_bfree; // XXX same?? sbp->f_bresvd = 0; sbp->f_files = be32toh(emp->em_sb.sb_tinode); sbp->f_ffree = be16toh(emp->em_sb.sb_cgisize) * be16toh(emp->em_sb.sb_ncg) * EFS_DINODES_PER_BB; sbp->f_favail = sbp->f_ffree; // XXX same?? sbp->f_fresvd = 0; sbp->f_namemax = EFS_DIRENT_NAMELEN_MAX; copy_statvfs_info(sbp, mp); return (0); } /* * Obtain a locked vnode for the given on-disk inode number. * * Returns 0 on success. */ static int efs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { int error; error = vcache_get(mp, &ino, sizeof(ino), vpp); if (error) return error; error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULL; return error; } return 0; } /* * Initialize this vnode / inode pair. * Caller assures no other thread will try to load this inode. */ static int efs_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { int error; ino_t ino; struct efs_inode *eip; struct efs_mount *emp; KASSERT(key_len == sizeof(ino)); memcpy(&ino, key, key_len); emp = VFSTOEFS(mp); eip = pool_get(&efs_inode_pool, PR_WAITOK); eip->ei_mode = 0; eip->ei_lockf = NULL; eip->ei_number = ino; eip->ei_dev = emp->em_dev; eip->ei_vp = vp; error = efs_read_inode(emp, ino, NULL, &eip->ei_di); if (error) { pool_put(&efs_inode_pool, eip); return error; } efs_sync_dinode_to_inode(eip); if (ino == EFS_ROOTINO && !S_ISDIR(eip->ei_mode)) { printf("efs: root inode (%lu) is not a directory!\n", (ulong)EFS_ROOTINO); pool_put(&efs_inode_pool, eip); return EIO; } switch (eip->ei_mode & S_IFMT) { case S_IFIFO: vp->v_type = VFIFO; vp->v_op = efs_fifoop_p; break; case S_IFCHR: vp->v_type = VCHR; vp->v_op = efs_specop_p; spec_node_init(vp, eip->ei_dev); break; case S_IFDIR: vp->v_type = VDIR; vp->v_op = efs_vnodeop_p; if (ino == EFS_ROOTINO) vp->v_vflag |= VV_ROOT; break; case S_IFBLK: vp->v_type = VBLK; vp->v_op = efs_specop_p; spec_node_init(vp, eip->ei_dev); break; case S_IFREG: vp->v_type = VREG; vp->v_op = efs_vnodeop_p; break; case S_IFLNK: vp->v_type = VLNK; vp->v_op = efs_vnodeop_p; break; case S_IFSOCK: vp->v_type = VSOCK; vp->v_op = efs_vnodeop_p; break; default: printf("efs: invalid mode 0x%x in inode %lu on mount %s\n", eip->ei_mode, (ulong)ino, mp->mnt_stat.f_mntonname); pool_put(&efs_inode_pool, eip); return EIO; } vp->v_tag = VT_EFS; vp->v_data = eip; genfs_node_init(vp, &efs_genfsops); uvm_vnp_setsize(vp, eip->ei_size); *new_key = &eip->ei_number; return 0; } /* * Convert the provided opaque, unique file handle into a vnode. * * Returns 0 on success. 
*/ static int efs_fhtovp(struct mount *mp, struct fid *fhp, int lktype, struct vnode **vpp) { int err; struct vnode *vp; struct efs_fid *efp; struct efs_inode *eip; if (fhp->fid_len != sizeof(struct efs_fid)) return (EINVAL); efp = (struct efs_fid *)fhp; if ((err = VFS_VGET(mp, efp->ef_ino, lktype, &vp))) { *vpp = NULL; return (err); } eip = EFS_VTOI(vp); if (eip->ei_mode == 0 || eip->ei_gen != efp->ef_gen) { vput(vp); *vpp = NULL; return (ESTALE); } *vpp = vp; return (0); } /* * Convert the provided vnode into an opaque, unique file handle. * * Returns 0 on success. */ static int efs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) { struct efs_fid *efp; struct efs_inode *eip; if (*fh_size < sizeof(struct efs_fid)) { *fh_size = sizeof(struct efs_fid); return (E2BIG); } *fh_size = sizeof(struct efs_fid); eip = EFS_VTOI(vp); efp = (struct efs_fid *)fhp; fhp->fid_len = sizeof(struct efs_fid); efp->ef_ino = eip->ei_number; efp->ef_gen = eip->ei_gen; return (0); } /* * Globally initialise the filesystem. */ static void efs_init(void) { malloc_type_attach(M_EFSMNT); malloc_type_attach(M_EFSINO); malloc_type_attach(M_EFSTMP); pool_init(&efs_inode_pool, sizeof(struct efs_inode), 0, 0, 0, "efsinopl", &pool_allocator_nointr, IPL_NONE); } /* * Globally reinitialise the filesystem. */ static void efs_reinit(void) { } /* * Globally clean up the filesystem. */ static void efs_done(void) { pool_destroy(&efs_inode_pool); malloc_type_detach(M_EFSMNT); malloc_type_detach(M_EFSINO); malloc_type_detach(M_EFSTMP); } extern const struct vnodeopv_desc efs_vnodeop_opv_desc; extern const struct vnodeopv_desc efs_specop_opv_desc; extern const struct vnodeopv_desc efs_fifoop_opv_desc; const struct vnodeopv_desc * const efs_vnodeopv_descs[] = { &efs_vnodeop_opv_desc, &efs_specop_opv_desc, &efs_fifoop_opv_desc, NULL }; struct vfsops efs_vfsops = { .vfs_name = MOUNT_EFS, .vfs_min_mount_data = sizeof (struct efs_args), .vfs_mount = efs_mount, .vfs_start = efs_start, .vfs_unmount = efs_unmount, .vfs_root = efs_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = efs_statvfs, .vfs_sync = (void *)nullop, .vfs_vget = efs_vget, .vfs_loadvnode = efs_loadvnode, .vfs_fhtovp = efs_fhtovp, .vfs_vptofh = efs_vptofh, .vfs_init = efs_init, .vfs_reinit = efs_reinit, .vfs_done = efs_done, .vfs_mountroot = (void *)eopnotsupp, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_opv_descs = efs_vnodeopv_descs /* .vfs_refcount */ /* .vfs_list */ }; static int efs_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return vfs_attach(&efs_vfsops); case MODULE_CMD_FINI: return vfs_detach(&efs_vfsops); default: return ENOTTY; } }
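/*
 * A standalone userland sketch (not kernel code) of the mount-time
 * pattern in efs_mount_common() earlier in this file: read the superblock
 * from a fixed block, sanity-check it, and refuse to mount on failure
 * unless the caller forces it.  The block size, offset, magic number and
 * structure below are invented for the demo, not the real EFS on-disk
 * format.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BB_SIZE		512u		/* basic block size */
#define SB_BLOCK	1u		/* block number holding the superblock */
#define SB_MAGIC	0x2468ACE0u

struct demo_sb {
	uint32_t magic;
	uint16_t dirty;			/* nonzero: filesystem needs fsck */
};

static int
read_superblock(FILE *dev, struct demo_sb *sb, bool force)
{
	unsigned char block[BB_SIZE];

	if (fseek(dev, (long)(SB_BLOCK * BB_SIZE), SEEK_SET) != 0 ||
	    fread(block, sizeof(block), 1, dev) != 1)
		return -1;			/* I/O error: cannot mount */
	memcpy(sb, block, sizeof(*sb));

	if (sb->magic != SB_MAGIC) {
		fprintf(stderr, "invalid superblock\n");
		if (!force)
			return -1;		/* the !MNT_FORCE case */
	}
	if (sb->dirty != 0)
		fprintf(stderr, "filesystem is dirty; please run fsck\n");
	return 0;
}

int
main(int argc, char **argv)
{
	struct demo_sb sb;
	FILE *dev;
	int rv;

	if (argc < 2 || (dev = fopen(argv[1], "rb")) == NULL)
		return 1;
	rv = read_superblock(dev, &sb, false);
	fclose(dev);
	return rv == 0 ? 0 : 1;
}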
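/*
 * A standalone userland sketch (not kernel code) of the file-type
 * dispatch in efs_loadvnode() earlier in this file: the inode's S_IFMT
 * bits select the vnode type and operations vector, and any unknown mode
 * is treated as a corrupt inode (EIO).  Here the "ops vector" is just a
 * descriptive string.
 */
#include <stdio.h>
#include <sys/stat.h>

static const char *
vtype_from_mode(mode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFIFO:	return "VFIFO (fifo ops)";
	case S_IFCHR:	return "VCHR (spec ops)";
	case S_IFDIR:	return "VDIR (vnode ops)";
	case S_IFBLK:	return "VBLK (spec ops)";
	case S_IFREG:	return "VREG (vnode ops)";
	case S_IFLNK:	return "VLNK (vnode ops)";
	case S_IFSOCK:	return "VSOCK (vnode ops)";
	default:	return NULL;	/* invalid mode: loadvnode returns EIO */
	}
}

int
main(void)
{
	const char *t;

	printf("%s\n", vtype_from_mode(S_IFDIR | 0755));
	printf("%s\n", vtype_from_mode(S_IFREG | 0644));
	t = vtype_from_mode(0);
	printf("%s\n", t != NULL ? t : "invalid inode mode");
	return 0;
}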
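/*
 * A standalone userland sketch (not kernel code) of the file-handle round
 * trip in efs_vptofh()/efs_fhtovp() above: the encoder reports the
 * required buffer size (E2BIG) so callers can retry with a large enough
 * handle, and the decoder compares the generation number so a handle that
 * outlived its inode comes back as ESTALE.  Types and fields here are
 * illustrative stand-ins.
 */
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_fid   { uint32_t ino; uint32_t gen; };
struct demo_inode { uint32_t ino; uint32_t gen; };

static int
vptofh(const struct demo_inode *ip, struct demo_fid *fid, size_t *fh_size)
{
	if (*fh_size < sizeof(*fid)) {
		*fh_size = sizeof(*fid);	/* tell the caller how much it needs */
		return E2BIG;
	}
	*fh_size = sizeof(*fid);
	fid->ino = ip->ino;
	fid->gen = ip->gen;
	return 0;
}

static int
fhtovp(const struct demo_fid *fid, const struct demo_inode *ip)
{
	if (fid->ino != ip->ino || fid->gen != ip->gen)
		return ESTALE;		/* inode was freed or reused since encoding */
	return 0;
}

int
main(void)
{
	struct demo_inode ino = { .ino = 42, .gen = 7 };
	struct demo_fid fid;
	size_t len = 0;

	if (vptofh(&ino, &fid, &len) == E2BIG)	/* first call learns the size */
		printf("handle needs %zu bytes\n", len);
	vptofh(&ino, &fid, &len);
	ino.gen = 8;				/* simulate inode reuse */
	printf("lookup: %s\n", fhtovp(&fid, &ino) == ESTALE ? "ESTALE" : "ok");
	return 0;
}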
/* $NetBSD: if_media_80.c,v 1.5 2022/08/03 01:38:51 riastradh Exp $ */ /*- * Copyright (c) 1998 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1997 * Jonathan Stone and Jason R. Thorpe. All rights reserved. * * This software is derived from information provided by Matt Thomas. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Jonathan Stone * and Jason R. Thorpe for the NetBSD Project. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: if_media_80.c,v 1.5 2022/08/03 01:38:51 riastradh Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/syscallargs.h> #include <sys/errno.h> #include <sys/malloc.h> #include <sys/proc.h> #include <sys/compat_stub.h> #include <net/if.h> #include <net/if_media.h> #include <compat/sys/sockio.h> #include <compat/common/compat_mod.h> static void ifmword_n2o(int *oldwd, int *newwd) { if (IFM_SUBTYPE(*newwd) > IFM_OTHER) *oldwd = (*newwd & ~(_IFM_ETH_XTMASK | IFM_TMASK)) | IFM_OTHER; else *oldwd = *newwd; } /*ARGSUSED*/ static int compat_ifmediareq_pre(struct ifreq *ifr, u_long *cmd, bool *do_post) { struct ifmediareq *ifmr = (struct ifmediareq *)ifr; switch (*cmd) { case SIOCSIFMEDIA_80: *cmd = SIOCSIFMEDIA; /* Convert to new one */ if ((IFM_TYPE(ifr->ifr_media) == IFM_ETHER) && IFM_SUBTYPE(ifr->ifr_media) > IFM_OTHER) { /* Clear unused bits to not to change to wrong media */ ifr->ifr_media &= ~_IFM_ETH_XTMASK; } return 0; case SIOCGIFMEDIA_80: *cmd = SIOCGIFMEDIA; /* Convert to new one */ if (ifmr->ifm_count != 0) { /* * Tell the upper layer to try to convert each ifmedia * entry in the post process. */ *do_post = true; } return 0; default: return 0; } } /*ARGSUSED*/ static int compat_ifmediareq_post(struct ifreq *ifr, u_long cmd) { struct ifmediareq *ifmr = (struct ifmediareq *)ifr; size_t minwords; size_t count; int error, *kptr; switch (cmd) { case SIOCSIFMEDIA: return 0; case SIOCGIFMEDIA: if (ifmr->ifm_count < 0) return EINVAL; /* * ifmr->ifm_count was already ajusted in ifmedia_ioctl(), so * there is no problem to trust ifm_count. */ minwords = ifmr->ifm_count; kptr = malloc(minwords * sizeof(*kptr), M_TEMP, M_WAITOK|M_ZERO); if (kptr == NULL) return ENOMEM; /* * Convert ifm_current and ifm_active. * It's not required to convert ifm_mask. */ ifmword_n2o(&ifmr->ifm_current, &ifmr->ifm_current); ifmword_n2o(&ifmr->ifm_active, &ifmr->ifm_active); /* Convert ifm_ulist array */ for (count = 0; count < minwords; count++) { int oldmwd; error = ufetch_int(&ifmr->ifm_ulist[count], &oldmwd); if (error != 0) goto out; ifmword_n2o(&kptr[count], &oldmwd); } /* Copy to userland in old format */ error = copyout(kptr, ifmr->ifm_ulist, minwords * sizeof(*kptr)); out: free(kptr, M_TEMP); return error; default: return 0; } } void ifmedia_80_init(void) { MODULE_HOOK_SET(ifmedia_80_pre_hook, compat_ifmediareq_pre); MODULE_HOOK_SET(ifmedia_80_post_hook, compat_ifmediareq_post); } void ifmedia_80_fini(void) { MODULE_HOOK_UNSET(ifmedia_80_post_hook); MODULE_HOOK_UNSET(ifmedia_80_pre_hook); }
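/*
 * A standalone userland sketch (not kernel code) of the translation done
 * by ifmword_n2o() above: if a current ifmedia word carries a subtype
 * that needs the extended-subtype bits old binaries never knew about,
 * those bits are cleared and the subtype is replaced with the catch-all
 * "other" value, so old ioctl users still see a word they can parse.
 * The bit layout below is invented for the demo, not the real IFM_*
 * encoding.
 */
#include <stdint.h>
#include <stdio.h>

#define TMASK	0x000000ffu	/* original subtype bits (old and new ABI) */
#define XTMASK	0x0000ff00u	/* extended subtype bits (new ABI only) */
#define OTHER	0x000000ffu	/* highest subtype the old ABI understands */

static uint32_t
subtype(uint32_t w)
{
	return w & (TMASK | XTMASK);
}

static uint32_t
word_new_to_old(uint32_t neww)
{
	if (subtype(neww) > OTHER)	/* needs extension bits: not expressible */
		return (neww & ~(XTMASK | TMASK)) | OTHER;
	return neww;
}

int
main(void)
{
	uint32_t plain = 0x00100005;	/* old-style subtype 5: unchanged */
	uint32_t extended = 0x00100300;	/* extended subtype: becomes "other" */

	printf("%#x -> %#x\n", plain, word_new_to_old(plain));
	printf("%#x -> %#x\n", extended, word_new_to_old(extended));
	return 0;
}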
/* $NetBSD: explicit_memset.c,v 1.4 2014/06/24 16:39:39 drochner Exp $ */ /* * Written by Matthias Drochner <drochner@NetBSD.org>. * Public domain. */ #if !defined(_KERNEL) && !defined(_STANDALONE) #include "namespace.h" #include <string.h> #ifdef __weak_alias __weak_alias(explicit_memset,_explicit_memset) #endif #define explicit_memset_impl __explicit_memset_impl #else #include <lib/libkern/libkern.h> #endif /* * The use of a volatile pointer guarantees that the compiler * will not optimise the call away. */ void *(* volatile explicit_memset_impl)(void *, int, size_t) = memset; void * explicit_memset(void *b, int c, size_t len) { return (*explicit_memset_impl)(b, c, len); }
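/*
 * A standalone userland sketch (not library code) of the volatile
 * function-pointer trick above: because the compiler cannot prove what
 * the pointer holds at the call site, it cannot treat the final wipe of a
 * dying buffer as a dead store and delete it.
 */
#include <stdio.h>
#include <string.h>

static void *(* volatile wipe)(void *, int, size_t) = memset;

static void
handle_secret(void)
{
	char key[32];

	snprintf(key, sizeof(key), "hunter2");
	/* ... use key ... */
	wipe(key, 0, sizeof(key));	/* survives dead-store elimination */
}

int
main(void)
{
	handle_secret();
	return 0;
}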
/* $NetBSD: raw_usrreq.c,v 1.65 2022/09/02 23:48:11 thorpej Exp $ */ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_usrreq.c 8.1 (Berkeley) 6/10/93 */ /* * Raw protocol interface. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: raw_usrreq.c,v 1.65 2022/09/02 23:48:11 thorpej Exp $"); #include <sys/param.h> #include <sys/mbuf.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/errno.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/kauth.h> #include <net/if.h> #include <net/route.h> #include <net/raw_cb.h> static inline int equal(const struct sockaddr *a1, const struct sockaddr *a2) { return memcmp(a1, a2, a1->sa_len) == 0; } /* * raw_input: find the socket associated with the packet and move it over. * If nothing exists for this packet, drop it.
*/ void raw_input(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src, struct sockaddr *dst, struct rawcbhead *rawcbhead) { struct rawcb *rp; struct mbuf *m = m0; struct socket *last; last = NULL; LIST_FOREACH(rp, rawcbhead, rcb_list) { if (rp->rcb_proto.sp_family != proto->sp_family) continue; if (rp->rcb_proto.sp_protocol && rp->rcb_proto.sp_protocol != proto->sp_protocol) continue; /* * We assume the lower level routines have * placed the address in a canonical format * suitable for a structure comparison. * * Note that if the lengths are not the same * the comparison will fail at the first byte. */ if (rp->rcb_laddr && !equal(rp->rcb_laddr, dst)) continue; if (rp->rcb_faddr && !equal(rp->rcb_faddr, src)) continue; /* Run any filtering that may have been installed. */ if (rp->rcb_filter != NULL && rp->rcb_filter(m, proto, rp) != 0) continue; if (last != NULL) { struct mbuf *n; if ((n = m_copypacket(m, M_DONTWAIT)) == NULL || sbappendaddr(&last->so_rcv, src, n, NULL) == 0) { if (n != NULL) m_freem(n); soroverflow(last); } else sorwakeup(last); } last = rp->rcb_socket; } if (last != NULL) { if (sbappendaddr(&last->so_rcv, src, m, NULL) == 0) { m_freem(m); soroverflow(last); } else sorwakeup(last); } else { m_freem(m); } } void * raw_ctlinput(int cmd, const struct sockaddr *arg, void *d) { if ((unsigned)cmd >= PRC_NCMDS) return NULL; return NULL; /* INCOMPLETE */ } void raw_setsockaddr(struct rawcb *rp, struct sockaddr *nam) { memcpy(nam, rp->rcb_laddr, rp->rcb_laddr->sa_len); } void raw_setpeeraddr(struct rawcb *rp, struct sockaddr *nam) { memcpy(nam, rp->rcb_faddr, rp->rcb_faddr->sa_len); } int raw_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l, int (*output)(struct mbuf *, struct socket *)) { struct rawcb *rp = sotorawcb(so); int error = 0; KASSERT(rp != NULL); /* * Ship a packet out. The appropriate raw output * routine handles any massaging necessary. */ if (control && control->m_len) { m_freem(control); m_freem(m); return EINVAL; } if (nam) { if ((so->so_state & SS_ISCONNECTED) != 0) { error = EISCONN; goto die; } error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l); if (error) { die: m_freem(m); return error; } } else { if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto die; } } error = (*output)(m, so); if (nam) raw_disconnect(rp); return error; } int raw_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, struct mbuf *control, struct lwp *l) { KASSERT(req != PRU_ATTACH); KASSERT(req != PRU_DETACH); KASSERT(req != PRU_ACCEPT); KASSERT(req != PRU_BIND); KASSERT(req != PRU_LISTEN); KASSERT(req != PRU_CONNECT); KASSERT(req != PRU_CONNECT2); KASSERT(req != PRU_DISCONNECT); KASSERT(req != PRU_SHUTDOWN); KASSERT(req != PRU_ABORT); KASSERT(req != PRU_CONTROL); KASSERT(req != PRU_SENSE); KASSERT(req != PRU_PEERADDR); KASSERT(req != PRU_SOCKADDR); KASSERT(req != PRU_RCVD); KASSERT(req != PRU_RCVOOB); KASSERT(req != PRU_SEND); KASSERT(req != PRU_SENDOOB); KASSERT(req != PRU_PURGEIF); if (sotorawcb(so) == NULL) return EINVAL; panic("raw_usrreq"); return 0; }
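/*
 * A standalone userland sketch (not kernel code) of the delivery loop in
 * raw_input() above: every matching listener except the last receives a
 * copy of the packet (m_copypacket in the kernel), and the last matching
 * listener gets the original buffer, so the common single-listener case
 * never copies at all.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct listener { const char *name; int proto; };

static void
deliver(struct listener *ls, size_t n, int proto, const char *pkt)
{
	struct listener *last = NULL;

	for (size_t i = 0; i < n; i++) {
		if (ls[i].proto != proto)
			continue;		/* not a match */
		if (last != NULL) {
			char *copy = strdup(pkt);	/* m_copypacket analogue */

			printf("%s gets copy: %s\n", last->name, copy);
			free(copy);
		}
		last = &ls[i];
	}
	if (last != NULL)
		printf("%s gets original: %s\n", last->name, pkt);
	else
		printf("no listener, packet dropped\n");
}

int
main(void)
{
	struct listener ls[] = { { "a", 1 }, { "b", 2 }, { "c", 1 } };

	deliver(ls, 3, 1, "hello");	/* "a" gets a copy, "c" the original */
	return 0;
}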
/* $NetBSD: fd.c,v 1.117 2022/09/25 17:11:48 thorpej Exp $ */ /*- * Copyright (c) 1998, 2003, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * Don Ahn. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)fd.c 7.4 (Berkeley) 5/25/91 */ /* * Floppy formatting facilities merged from FreeBSD fd.c driver: * Id: fd.c,v 1.53 1995/03/12 22:40:56 joerg Exp * which carries the same copyright/redistribution notice as shown above with * the addition of the following statement before the "Redistribution and * use ..." clause: * * Copyright (c) 1993, 1994 by * jc@irbs.UUCP (John Capo) * vak@zebub.msk.su (Serge Vakulenko) * ache@astral.msk.su (Andrew A. Chernov) * * Copyright (c) 1993, 1994, 1995 by * joerg_wunsch@uriah.sax.de (Joerg Wunsch) * dufault@hda.com (Peter Dufault) */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: fd.c,v 1.117 2022/09/25 17:11:48 thorpej Exp $"); #include "opt_ddb.h" /* * XXX This driver should be properly MI'd some day, but this allows us * XXX to eliminate a lot of code duplication for now. */ #if !defined(alpha) && !defined(algor) && !defined(atari) && \ !defined(bebox) && !defined(evbmips) && !defined(i386) && \ !defined(prep) && !defined(sandpoint) && !defined(x86_64) && \ !defined(mvmeppc) && !defined(ofppc) #error platform not supported by this driver, yet #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/ioctl.h> #include <sys/device.h> #include <sys/disklabel.h> #include <sys/disk.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/kmem.h> #include <sys/uio.h> #include <sys/syslog.h> #include <sys/queue.h> #include <sys/proc.h> #include <sys/fdio.h> #include <sys/conf.h> #include <sys/vnode.h> #include <sys/rndsource.h> #include <prop/proplib.h> #include <dev/cons.h> #include <sys/cpu.h> #include <sys/bus.h> #include "locators.h" #if defined(atari) /* * On the atari, it is configured as fdcisa */ #define FDCCF_DRIVE FDCISACF_DRIVE #define FDCCF_DRIVE_DEFAULT FDCISACF_DRIVE_DEFAULT #define fd_cd fdisa_cd #endif /* atari */ #include <sys/intr.h> #include <dev/isa/isavar.h> #include <dev/isa/isadmavar.h> #include <dev/isa/fdreg.h> #include <dev/isa/fdcvar.h> #if defined(i386) || defined(x86_64) #include <dev/ic/mc146818reg.h> /* for NVRAM access */ #include <i386/isa/nvram.h> #if defined(i386) #include "mca.h" #if NMCA > 0 #include <machine/mca_machdep.h> /* for MCA_system */ #endif #endif #endif /* i386 || x86_64 */ #include <dev/isa/fdvar.h> #define FDUNIT(dev) (minor(dev) / 8) #define FDTYPE(dev) (minor(dev) % 8) /* (mis)use device use flag to identify format operation */ #define B_FORMAT B_DEVPRIVATE /* controller driver configuration */ int fdprint(void *, const char *); #if NMCA > 0 /* MCA - specific entries */ const struct fd_type mca_fd_types[] = { { 18,2,36,2,0xff,0x0f,0x1b,0x6c,80,2880,1,FDC_500KBPS,0xf6,1, "1.44MB" }, /* 1.44MB diskette - XXX try 16ms step rate */ { 9,2,18,2,0xff,0x4f,0x2a,0x50,80,1440,1,FDC_250KBPS,0xf6,1, "720KB" }, /* 3.5 inch 720kB diskette - XXX try 24ms step rate */ }; #endif /* NMCA > 0 */ /* The order of entries in the following table is important -- BEWARE! 
*/ #if defined(atari) const struct fd_type fd_types[] = { { 9,2,18,2,0xff,0xdf,0x2a,0x50,40, 720,1,FDC_250KBPS,0xf6,1, "360KB/PC" }, /* 360kB PC diskettes */ { 9,2,18,2,0xff,0xdf,0x2a,0x50,80,1440,1,FDC_250KBPS,0xf6,1, "720KB" }, /* 3.5 inch 720kB diskette */ { 18,2,36,2,0xff,0xcf,0x1b,0x6c,80,2880,1,FDC_500KBPS,0xf6,1, "1.44MB" }, /* 1.44MB diskette */ }; #else const struct fd_type fd_types[] = { { 18,2,36,2,0xff,0xcf,0x1b,0x6c,80,2880,1,FDC_500KBPS,0xf6,1, "1.44MB" }, /* 1.44MB diskette */ { 15,2,30,2,0xff,0xdf,0x1b,0x54,80,2400,1,FDC_500KBPS,0xf6,1, "1.2MB" }, /* 1.2 MB AT-diskettes */ { 9,2,18,2,0xff,0xdf,0x23,0x50,40, 720,2,FDC_300KBPS,0xf6,1, "360KB/AT" }, /* 360kB in 1.2MB drive */ { 9,2,18,2,0xff,0xdf,0x2a,0x50,40, 720,1,FDC_250KBPS,0xf6,1, "360KB/PC" }, /* 360kB PC diskettes */ { 9,2,18,2,0xff,0xdf,0x2a,0x50,80,1440,1,FDC_250KBPS,0xf6,1, "720KB" }, /* 3.5 inch 720kB diskette */ { 9,2,18,2,0xff,0xdf,0x23,0x50,80,1440,1,FDC_300KBPS,0xf6,1, "720KB/x" }, /* 720kB in 1.2MB drive */ { 9,2,18,2,0xff,0xdf,0x2a,0x50,40, 720,2,FDC_250KBPS,0xf6,1, "360KB/x" }, /* 360kB in 720kB drive */ }; #endif /* defined(atari) */ void fdcfinishattach(device_t); int fdprobe(device_t, cfdata_t, void *); void fdattach(device_t, device_t, void *); static int fddetach(device_t, int); static int fdcintr1(struct fdc_softc *); static void fdcintrcb(void *); static bool fdcsuspend(device_t, const pmf_qual_t *); static bool fdcresume(device_t, const pmf_qual_t *); extern struct cfdriver fd_cd; #ifdef atari CFATTACH_DECL_NEW(fdisa, sizeof(struct fd_softc), fdprobe, fdattach, fddetach, NULL); #else CFATTACH_DECL_NEW(fd, sizeof(struct fd_softc), fdprobe, fdattach, fddetach, NULL); #endif dev_type_open(fdopen); dev_type_close(fdclose); dev_type_read(fdread); dev_type_write(fdwrite); dev_type_ioctl(fdioctl); dev_type_strategy(fdstrategy); const struct bdevsw fd_bdevsw = { .d_open = fdopen, .d_close = fdclose, .d_strategy = fdstrategy, .d_ioctl = fdioctl, .d_dump = nodump, .d_psize = nosize, .d_discard = nodiscard, .d_flag = D_DISK }; const struct cdevsw fd_cdevsw = { .d_open = fdopen, .d_close = fdclose, .d_read = fdread, .d_write = fdwrite, .d_ioctl = fdioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_DISK }; void fdgetdisklabel(struct fd_softc *); int fd_get_parms(struct fd_softc *); void fdstart(struct fd_softc *); struct dkdriver fddkdriver = { .d_strategy = fdstrategy, .d_minphys = minphys }; #if defined(i386) || defined(x86_64) const struct fd_type *fd_nvtotype(const char *, int, int); #endif /* i386 || x86_64 */ void fd_set_motor(struct fdc_softc *fdc, int reset); void fd_motor_off(void *arg); void fd_motor_on(void *arg); int fdcresult(struct fdc_softc *fdc); void fdcstart(struct fdc_softc *fdc); void fdcstatus(device_t, int, const char *); void fdctimeout(void *arg); void fdcretry(struct fdc_softc *fdc); void fdfinish(struct fd_softc *fd, struct buf *bp); static const struct fd_type *fd_dev_to_type(struct fd_softc *, dev_t); int fdformat(dev_t, struct ne7_fd_formb *, struct lwp *); static void fd_set_geometry(struct fd_softc *fd); void fd_mountroot_hook(device_t); /* * Arguments passed between fdcattach and fdprobe. */ struct fdc_attach_args { int fa_drive; const struct fd_type *fa_deftype; }; /* * Print the location of a disk drive (called just before attaching the * the drive). If `fdc' is not NULL, the drive was found but was not * in the system config file; print the drive name as well. 
* Return QUIET (config_find ignores this if the device was configured) to * avoid printing `fdN not configured' messages. */ int fdprint(void *aux, const char *fdc) { struct fdc_attach_args *fa = aux; if (!fdc) aprint_normal(" drive %d", fa->fa_drive); return QUIET; } static bool fdcresume(device_t self, const pmf_qual_t *qual) { struct fdc_softc *fdc = device_private(self); mutex_enter(&fdc->sc_mtx); (void)fdcintr1(fdc); mutex_exit(&fdc->sc_mtx); return true; } static bool fdcsuspend(device_t self, const pmf_qual_t *qual) { struct fdc_softc *fdc = device_private(self); int drive; struct fd_softc *fd; mutex_enter(&fdc->sc_mtx); while (fdc->sc_state != DEVIDLE) cv_wait(&fdc->sc_cv, &fdc->sc_mtx); for (drive = 0; drive < 4; drive++) { if ((fd = fdc->sc_fd[drive]) == NULL) continue; fd->sc_flags &= ~(FD_MOTOR|FD_MOTOR_WAIT); } fd_set_motor(fdc, 0); mutex_exit(&fdc->sc_mtx); return true; } void fdc_childdet(device_t self, device_t child) { struct fdc_softc *fdc = device_private(self); struct fd_softc *fd = device_private(child); int drive = fd->sc_drive; KASSERT(fdc->sc_fd[drive] == fd); /* but the kid is not my son */ fdc->sc_fd[drive] = NULL; } int fdcdetach(device_t self, int flags) { int rc; struct fdc_softc *fdc = device_private(self); if ((rc = config_detach_children(self, flags)) != 0) return rc; pmf_device_deregister(self); isa_dmamap_destroy(fdc->sc_ic, fdc->sc_drq); isa_drq_free(fdc->sc_ic, fdc->sc_drq); callout_destroy(&fdc->sc_intr_ch); callout_destroy(&fdc->sc_timo_ch); cv_destroy(&fdc->sc_cv); mutex_destroy(&fdc->sc_mtx); return 0; } void fdcattach(struct fdc_softc *fdc) { mutex_init(&fdc->sc_mtx, MUTEX_DEFAULT, IPL_BIO); cv_init(&fdc->sc_cv, "fdcwake"); callout_init(&fdc->sc_timo_ch, 0); callout_init(&fdc->sc_intr_ch, 0); fdc->sc_state = DEVIDLE; TAILQ_INIT(&fdc->sc_drives); fdc->sc_maxiosize = isa_dmamaxsize(fdc->sc_ic, fdc->sc_drq); if (isa_drq_alloc(fdc->sc_ic, fdc->sc_drq) != 0) { aprint_normal_dev(fdc->sc_dev, "can't reserve drq %d\n", fdc->sc_drq); return; } if (isa_dmamap_create(fdc->sc_ic, fdc->sc_drq, fdc->sc_maxiosize, BUS_DMA_NOWAIT|BUS_DMA_ALLOCNOW)) { aprint_normal_dev(fdc->sc_dev, "can't set up ISA DMA map\n"); return; } config_interrupts(fdc->sc_dev, fdcfinishattach); if (!pmf_device_register(fdc->sc_dev, fdcsuspend, fdcresume)) { aprint_error_dev(fdc->sc_dev, "cannot set power mgmt handler\n"); } } void fdcfinishattach(device_t self) { struct fdc_softc *fdc = device_private(self); bus_space_tag_t iot = fdc->sc_iot; bus_space_handle_t ioh = fdc->sc_ioh; struct fdc_attach_args fa; /* * Reset the controller to get it into a known state. Not all * probes necessarily need do this to discover the controller up * front, so don't assume anything. */ bus_space_write_1(iot, ioh, fdout, 0); delay(100); bus_space_write_1(iot, ioh, fdout, FDO_FRST); /* see if it can handle a command */ if (out_fdc(iot, ioh, NE7CMD_SPECIFY) < 0) { aprint_normal_dev(fdc->sc_dev, "can't reset controller\n"); return; } out_fdc(iot, ioh, 0xdf); out_fdc(iot, ioh, 2); #if defined(i386) || defined(x86_64) /* * The NVRAM info only tells us about the first two disks on the * `primary' floppy controller. 
*/ /* XXX device_unit() abuse */ if (device_unit(fdc->sc_dev) == 0) { int type = mc146818_read(NULL, NVRAM_DISKETTE); /* XXX softc */ fdc->sc_known = 1; fdc->sc_knownfds[0] = fd_nvtotype(device_xname(fdc->sc_dev), type, 0); if (fdc->sc_knownfds[0] != NULL) fdc->sc_present |= 1; fdc->sc_knownfds[1] = fd_nvtotype(device_xname(fdc->sc_dev), type, 1); if (fdc->sc_knownfds[1] != NULL) fdc->sc_present |= 2; } #endif /* i386 || x86_64 */ /* physical limit: four drives per controller. */ fdc->sc_state = PROBING; for (fa.fa_drive = 0; fa.fa_drive < 4; fa.fa_drive++) { if (fdc->sc_known) { if (fdc->sc_present & (1 << fa.fa_drive)) { fa.fa_deftype = fdc->sc_knownfds[fa.fa_drive]; config_found(fdc->sc_dev, (void *)&fa, fdprint, CFARGS_NONE); } } else { #if defined(atari) /* * Atari has a different ordening, defaults to 1.44 */ fa.fa_deftype = &fd_types[2]; /* Atari also configures ISA fdc(4) as "fdcisa" */ config_found(fdc->sc_dev, &fa, fdprint, CFARGS(.iattr = "fdcisa")); #else /* * Default to 1.44MB on Alpha and BeBox. How do we tell * on these platforms? */ fa.fa_deftype = &fd_types[0]; config_found(fdc->sc_dev, &fa, fdprint, CFARGS(.iattr = "fdc")); #endif } } fdc->sc_state = DEVIDLE; } int fdprobe(device_t parent, cfdata_t match, void *aux) { struct fdc_softc *fdc = device_private(parent); cfdata_t cf = match; struct fdc_attach_args *fa = aux; int drive = fa->fa_drive; bus_space_tag_t iot = fdc->sc_iot; bus_space_handle_t ioh = fdc->sc_ioh; int n; if (cf->cf_loc[FDCCF_DRIVE] != FDCCF_DRIVE_DEFAULT && cf->cf_loc[FDCCF_DRIVE] != drive) return 0; /* * XXX * This is to work around some odd interactions between this driver * and SMC Ethernet cards. */ if (cf->cf_loc[FDCCF_DRIVE] == FDCCF_DRIVE_DEFAULT && drive >= 2) return 0; /* Use PNP information if available */ if (fdc->sc_known) return 1; mutex_enter(&fdc->sc_mtx); /* toss any interrupt status */ for (n = 0; n < 4; n++) { out_fdc(iot, ioh, NE7CMD_SENSEI); (void) fdcresult(fdc); } /* select drive and turn on motor */ bus_space_write_1(iot, ioh, fdout, drive | FDO_FRST | FDO_MOEN(drive)); /* wait for motor to spin up */ /* XXX check sc_probe */ (void) cv_timedwait(&fdc->sc_cv, &fdc->sc_mtx, hz / 4); out_fdc(iot, ioh, NE7CMD_RECAL); out_fdc(iot, ioh, drive); /* wait for recalibrate, up to 2s */ /* XXX check sc_probe */ if (cv_timedwait(&fdc->sc_cv, &fdc->sc_mtx, 2 * hz) != EWOULDBLOCK){ #ifdef FD_DEBUG /* XXX */ printf("fdprobe: got intr\n"); #endif } out_fdc(iot, ioh, NE7CMD_SENSEI); n = fdcresult(fdc); #ifdef FD_DEBUG { int i; printf("fdprobe: status"); for (i = 0; i < n; i++) printf(" %x", fdc->sc_status[i]); printf("\n"); } #endif /* turn off motor */ bus_space_write_1(iot, ioh, fdout, FDO_FRST); mutex_exit(&fdc->sc_mtx); #if defined(bebox) /* XXX What is this about? --thorpej@NetBSD.org */ if (n != 2 || (fdc->sc_status[1] != 0)) return 0; #else if (n != 2 || (fdc->sc_status[0] & 0xf8) != 0x20) return 0; #endif /* bebox */ return 1; } /* * Controller is working, and drive responded. Attach it. */ void fdattach(device_t parent, device_t self, void *aux) { struct fdc_softc *fdc = device_private(parent); struct fd_softc *fd = device_private(self); struct fdc_attach_args *fa = aux; const struct fd_type *type = fa->fa_deftype; int drive = fa->fa_drive; fd->sc_dev = self; callout_init(&fd->sc_motoron_ch, 0); callout_init(&fd->sc_motoroff_ch, 0); /* XXX Allow `flags' to override device type? 
*/ if (type) aprint_normal(": %s, %d cyl, %d head, %d sec\n", type->name, type->cyls, type->heads, type->sectrac); else aprint_normal(": density unknown\n"); bufq_alloc(&fd->sc_q, "disksort", BUFQ_SORT_CYLINDER); fd->sc_cylin = -1; fd->sc_drive = drive; fd->sc_deftype = type; fdc->sc_fd[drive] = fd; /* * Initialize and attach the disk structure. */ disk_init(&fd->sc_dk, device_xname(fd->sc_dev), &fddkdriver); disk_attach(&fd->sc_dk); /* * Establish a mountroot hook. */ fd->sc_roothook = mountroothook_establish(fd_mountroot_hook, fd->sc_dev); rnd_attach_source(&fd->rnd_source, device_xname(fd->sc_dev), RND_TYPE_DISK, RND_FLAG_DEFAULT); fd_set_geometry(fd); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "cannot set power mgmt handler\n"); } static int fddetach(device_t self, int flags) { struct fd_softc *fd = device_private(self); int bmaj, cmaj, i, mn; fd_motor_off(fd); /* locate the major number */ bmaj = bdevsw_lookup_major(&fd_bdevsw); cmaj = cdevsw_lookup_major(&fd_cdevsw); /* Nuke the vnodes for any open instances. */ for (i = 0; i < MAXPARTITIONS; i++) { mn = DISKMINOR(device_unit(self), i); vdevgone(bmaj, mn, mn, VBLK); vdevgone(cmaj, mn, mn, VCHR); } pmf_device_deregister(self); #if 0 /* XXX need to undo at detach? */ fd_set_geometry(fd); #endif rnd_detach_source(&fd->rnd_source); disk_detach(&fd->sc_dk); disk_destroy(&fd->sc_dk); /* Kill off any queued buffers. */ bufq_drain(fd->sc_q); bufq_free(fd->sc_q); callout_destroy(&fd->sc_motoroff_ch); callout_destroy(&fd->sc_motoron_ch); return 0; } #if defined(i386) || defined(x86_64) /* * Translate nvram type into internal data structure. Return NULL for * none/unknown/unusable. */ const struct fd_type * fd_nvtotype(const char *fdc, int nvraminfo, int drive) { int type; type = (drive == 0 ? nvraminfo : nvraminfo << 4) & 0xf0; switch (type) { case NVRAM_DISKETTE_NONE: return NULL; case NVRAM_DISKETTE_12M: return &fd_types[1]; case NVRAM_DISKETTE_TYPE5: case NVRAM_DISKETTE_TYPE6: /* XXX We really ought to handle 2.88MB format. */ case NVRAM_DISKETTE_144M: #if NMCA > 0 if (MCA_system) return &mca_fd_types[0]; else #endif /* NMCA > 0 */ return &fd_types[0]; case NVRAM_DISKETTE_360K: return &fd_types[3]; case NVRAM_DISKETTE_720K: #if NMCA > 0 if (MCA_system) return &mca_fd_types[1]; else #endif /* NMCA > 0 */ return &fd_types[4]; default: printf("%s: drive %d: unknown device type 0x%x\n", fdc, drive, type); return NULL; } } #endif /* i386 || x86_64 */ static const struct fd_type * fd_dev_to_type(struct fd_softc *fd, dev_t dev) { u_int type = FDTYPE(dev); if (type > __arraycount(fd_types)) return NULL; return type ? &fd_types[type - 1] : fd->sc_deftype; } void fdstrategy(struct buf *bp) { struct fd_softc *fd = device_lookup_private(&fd_cd, FDUNIT(bp->b_dev)); struct fdc_softc *fdc = device_private(device_parent(fd->sc_dev)); int sz; /* Valid unit, controller, and request? */ if (bp->b_blkno < 0 || ((bp->b_bcount % FDC_BSIZE) != 0 && (bp->b_flags & B_FORMAT) == 0)) { bp->b_error = EINVAL; goto done; } /* If it's a null transfer, return immediately. */ if (bp->b_bcount == 0) goto done; sz = howmany(bp->b_bcount, FDC_BSIZE); if (bp->b_blkno + sz > fd->sc_type->size) { sz = fd->sc_type->size - bp->b_blkno; if (sz == 0) { /* If exactly at end of disk, return EOF. */ goto done; } if (sz < 0) { /* If past end of disk, return EINVAL. */ bp->b_error = EINVAL; goto done; } /* Otherwise, truncate request. 
*/ bp->b_bcount = sz << DEV_BSHIFT; } bp->b_rawblkno = bp->b_blkno; bp->b_cylinder = bp->b_blkno / (FDC_BSIZE / DEV_BSIZE) / fd->sc_type->seccyl; #ifdef FD_DEBUG printf("fdstrategy: b_blkno %llu b_bcount %d blkno %llu cylin %d " "sz %d\n", (unsigned long long)bp->b_blkno, bp->b_bcount, (unsigned long long)fd->sc_blkno, bp->b_cylinder, sz); #endif /* Queue transfer on drive, activate drive and controller if idle. */ mutex_enter(&fdc->sc_mtx); bufq_put(fd->sc_q, bp); callout_stop(&fd->sc_motoroff_ch); /* a good idea */ if (fd->sc_active == 0) fdstart(fd); #ifdef DIAGNOSTIC else { if (fdc->sc_state == DEVIDLE) { printf("fdstrategy: controller inactive\n"); fdcstart(fdc); } } #endif mutex_exit(&fdc->sc_mtx); return; done: /* Toss transfer; we're done early. */ bp->b_resid = bp->b_bcount; biodone(bp); } void fdstart(struct fd_softc *fd) { struct fdc_softc *fdc = device_private(device_parent(fd->sc_dev)); int active = !TAILQ_EMPTY(&fdc->sc_drives); KASSERT(mutex_owned(&fdc->sc_mtx)); /* Link into controller queue. */ fd->sc_active = 1; TAILQ_INSERT_TAIL(&fdc->sc_drives, fd, sc_drivechain); /* If controller not already active, start it. */ if (!active) fdcstart(fdc); } void fdfinish(struct fd_softc *fd, struct buf *bp) { struct fdc_softc *fdc = device_private(device_parent(fd->sc_dev)); /* * Move this drive to the end of the queue to give others a `fair' * chance. We only force a switch if N operations are completed while * another drive is waiting to be serviced, since there is a long motor * startup delay whenever we switch. */ (void)bufq_get(fd->sc_q); if (TAILQ_NEXT(fd, sc_drivechain) && ++fd->sc_ops >= 8) { fd->sc_ops = 0; TAILQ_REMOVE(&fdc->sc_drives, fd, sc_drivechain); if (bufq_peek(fd->sc_q) != NULL) TAILQ_INSERT_TAIL(&fdc->sc_drives, fd, sc_drivechain); else fd->sc_active = 0; } bp->b_resid = fd->sc_bcount; fd->sc_skip = 0; rnd_add_uint32(&fd->rnd_source, bp->b_blkno); biodone(bp); /* turn off motor 5s from now */ callout_reset(&fd->sc_motoroff_ch, 5 * hz, fd_motor_off, fd); fdc->sc_state = DEVIDLE; } int fdread(dev_t dev, struct uio *uio, int flags) { return (physio(fdstrategy, NULL, dev, B_READ, minphys, uio)); } int fdwrite(dev_t dev, struct uio *uio, int flags) { return (physio(fdstrategy, NULL, dev, B_WRITE, minphys, uio)); } void fd_set_motor(struct fdc_softc *fdc, int reset) { struct fd_softc *fd; u_char status; int n; if ((fd = TAILQ_FIRST(&fdc->sc_drives)) != NULL) status = fd->sc_drive; else status = 0; if (!reset) status |= FDO_FRST | FDO_FDMAEN; for (n = 0; n < 4; n++) if ((fd = fdc->sc_fd[n]) && (fd->sc_flags & FD_MOTOR)) status |= FDO_MOEN(n); bus_space_write_1(fdc->sc_iot, fdc->sc_ioh, fdout, status); } void fd_motor_off(void *arg) { struct fd_softc *fd = arg; struct fdc_softc *fdc; fdc = device_private(device_parent(fd->sc_dev)); mutex_enter(&fdc->sc_mtx); fd->sc_flags &= ~(FD_MOTOR | FD_MOTOR_WAIT); fd_set_motor(fdc, 0); mutex_exit(&fdc->sc_mtx); } void fd_motor_on(void *arg) { struct fd_softc *fd = arg; struct fdc_softc *fdc = device_private(device_parent(fd->sc_dev)); mutex_enter(&fdc->sc_mtx); fd->sc_flags &= ~FD_MOTOR_WAIT; if (TAILQ_FIRST(&fdc->sc_drives) == fd && fdc->sc_state == MOTORWAIT) (void)fdcintr1(fdc); mutex_exit(&fdc->sc_mtx); } int fdcresult(struct fdc_softc *fdc) { bus_space_tag_t iot = fdc->sc_iot; bus_space_handle_t ioh = fdc->sc_ioh; u_char i; u_int j = 100000, n = 0; for (; j; j--) { i = bus_space_read_1(iot, ioh, fdsts) & (NE7_DIO | NE7_RQM | NE7_CB); if (i == NE7_RQM) return n; if (i == (NE7_DIO | NE7_RQM | NE7_CB)) { if (n >= 
sizeof(fdc->sc_status)) { log(LOG_ERR, "fdcresult: overrun\n"); return -1; } fdc->sc_status[n++] = bus_space_read_1(iot, ioh, fddata); } delay(10); } log(LOG_ERR, "fdcresult: timeout\n"); return -1; } int out_fdc(bus_space_tag_t iot, bus_space_handle_t ioh, u_char x) { u_char i; u_int j = 100000; for (; j; j--) { i = bus_space_read_1(iot, ioh, fdsts) & (NE7_DIO | NE7_RQM); if (i == NE7_RQM) { bus_space_write_1(iot, ioh, fddata, x); return 0; } delay(10); } return -1; } int fdopen(dev_t dev, int flags, int mode, struct lwp *l) { struct fd_softc *fd; const struct fd_type *type; fd = device_lookup_private(&fd_cd, FDUNIT(dev)); if (fd == NULL) return (ENXIO); type = fd_dev_to_type(fd, dev); if (type == NULL) return ENXIO; if ((fd->sc_flags & FD_OPEN) != 0 && memcmp(fd->sc_type, type, sizeof(*type))) return EBUSY; fd->sc_type_copy = *type; fd->sc_type = &fd->sc_type_copy; fd->sc_cylin = -1; fd->sc_flags |= FD_OPEN; fd_set_geometry(fd); return 0; } int fdclose(dev_t dev, int flags, int mode, struct lwp *l) { struct fd_softc *fd = device_lookup_private(&fd_cd, FDUNIT(dev)); fd->sc_flags &= ~FD_OPEN; fd->sc_opts &= ~(FDOPT_NORETRY|FDOPT_SILENT); return 0; } void fdcstart(struct fdc_softc *fdc) { KASSERT(mutex_owned(&fdc->sc_mtx)); if (!device_is_active(fdc->sc_dev)) return; #ifdef DIAGNOSTIC /* only got here if controller's drive queue was inactive; should be in idle state */ if (fdc->sc_state != DEVIDLE) { printf("fdcstart: not idle\n"); return; } #endif (void)fdcintr1(fdc); } static void fdcpstatus(int n, struct fdc_softc *fdc) { char bits[64]; switch (n) { case 0: printf("\n"); break; case 2: snprintb(bits, sizeof(bits), NE7_ST0BITS, fdc->sc_status[0]); printf(" (st0 %s cyl %d)\n", bits, fdc->sc_status[1]); break; case 7: snprintb(bits, sizeof(bits), NE7_ST0BITS, fdc->sc_status[0]); printf(" (st0 %s", bits); snprintb(bits, sizeof(bits), NE7_ST1BITS, fdc->sc_status[1]); printf(" st1 %s", bits); snprintb(bits, sizeof(bits), NE7_ST2BITS, fdc->sc_status[2]); printf(" st2 %s", bits); printf(" cyl %d head %d sec %d)\n", fdc->sc_status[3], fdc->sc_status[4], fdc->sc_status[5]); break; #ifdef DIAGNOSTIC default: printf("\nfdcstatus: weird size"); break; #endif } } void fdcstatus(device_t dv, int n, const char *s) { struct fdc_softc *fdc = device_private(device_parent(dv)); if (n == 0) { out_fdc(fdc->sc_iot, fdc->sc_ioh, NE7CMD_SENSEI); (void) fdcresult(fdc); n = 2; } fdcpstatus(n, fdc); aprint_normal_dev(dv, "%s", s); } void fdctimeout(void *arg) { struct fdc_softc *fdc = arg; struct fd_softc *fd = TAILQ_FIRST(&fdc->sc_drives); mutex_enter(&fdc->sc_mtx); #ifdef DEBUG log(LOG_ERR, "fdctimeout: state %d\n", fdc->sc_state); #endif fdcstatus(fd->sc_dev, 0, "timeout"); if (bufq_peek(fd->sc_q) != NULL) fdc->sc_state++; else fdc->sc_state = DEVIDLE; (void)fdcintr1(fdc); mutex_exit(&fdc->sc_mtx); } static int fdcintr1(struct fdc_softc *fdc) { #define st0 fdc->sc_status[0] #define cyl fdc->sc_status[1] struct fd_softc *fd; struct buf *bp; bus_space_tag_t iot = fdc->sc_iot; bus_space_handle_t ioh = fdc->sc_ioh; int read, head, sec, i, nblks; struct fd_type *type; struct ne7_fd_formb *finfo = NULL; KASSERT(mutex_owned(&fdc->sc_mtx)); if (fdc->sc_state == PROBING) { #ifdef DEBUG printf("fdcintr: got probe interrupt\n"); #endif fdc->sc_probe++; goto out; } loop: /* Is there a drive for the controller to do a transfer with? */ fd = TAILQ_FIRST(&fdc->sc_drives); if (fd == NULL) { fdc->sc_state = DEVIDLE; goto out; } /* Is there a transfer to this drive? If not, deactivate drive. 
*/ bp = bufq_peek(fd->sc_q); if (bp == NULL) { fd->sc_ops = 0; TAILQ_REMOVE(&fdc->sc_drives, fd, sc_drivechain); fd->sc_active = 0; goto loop; } if (bp->b_flags & B_FORMAT) finfo = (struct ne7_fd_formb *)bp->b_data; switch (fdc->sc_state) { case DEVIDLE: fdc->sc_errors = 0; fd->sc_skip = 0; fd->sc_bcount = bp->b_bcount; fd->sc_blkno = bp->b_blkno / (FDC_BSIZE / DEV_BSIZE); callout_stop(&fd->sc_motoroff_ch); if ((fd->sc_flags & FD_MOTOR_WAIT) != 0) { fdc->sc_state = MOTORWAIT; return 1; } if ((fd->sc_flags & FD_MOTOR) == 0) { /* Turn on the motor, being careful about pairing. */ struct fd_softc *ofd = fdc->sc_fd[fd->sc_drive ^ 1]; if (ofd && ofd->sc_flags & FD_MOTOR) { callout_stop(&ofd->sc_motoroff_ch); ofd->sc_flags &= ~(FD_MOTOR | FD_MOTOR_WAIT); } fd->sc_flags |= FD_MOTOR | FD_MOTOR_WAIT; fd_set_motor(fdc, 0); fdc->sc_state = MOTORWAIT; /* Allow .25s for motor to stabilize. */ callout_reset(&fd->sc_motoron_ch, hz / 4, fd_motor_on, fd); return 1; } /* Make sure the right drive is selected. */ fd_set_motor(fdc, 0); /* fall through */ case DOSEEK: doseek: if (fd->sc_cylin == bp->b_cylinder) goto doio; out_fdc(iot, ioh, NE7CMD_SPECIFY);/* specify command */ out_fdc(iot, ioh, fd->sc_type->steprate); out_fdc(iot, ioh, 6); /* XXX head load time == 6ms */ out_fdc(iot, ioh, NE7CMD_SEEK); /* seek function */ out_fdc(iot, ioh, fd->sc_drive); /* drive number */ out_fdc(iot, ioh, bp->b_cylinder * fd->sc_type->step); fd->sc_cylin = -1; fdc->sc_state = SEEKWAIT; iostat_seek(fd->sc_dk.dk_stats); disk_busy(&fd->sc_dk); callout_reset(&fdc->sc_timo_ch, 4 * hz, fdctimeout, fdc); return 1; case DOIO: doio: type = fd->sc_type; if (finfo) fd->sc_skip = (char *)&(finfo->fd_formb_cylno(0)) - (char *)finfo; sec = fd->sc_blkno % type->seccyl; nblks = type->seccyl - sec; nblks = uimin(nblks, fd->sc_bcount / FDC_BSIZE); nblks = uimin(nblks, fdc->sc_maxiosize / FDC_BSIZE); fd->sc_nblks = nblks; fd->sc_nbytes = finfo ? bp->b_bcount : nblks * FDC_BSIZE; head = sec / type->sectrac; sec -= head * type->sectrac; #ifdef DIAGNOSTIC { int block; block = (fd->sc_cylin * type->heads + head) * type->sectrac + sec; if (block != fd->sc_blkno) { printf("fdcintr: block %d != blkno " "%" PRId64 "\n", block, fd->sc_blkno); #ifdef DDB Debugger(); #endif } } #endif read = bp->b_flags & B_READ ? DMAMODE_READ : DMAMODE_WRITE; isa_dmastart(fdc->sc_ic, fdc->sc_drq, (char *)bp->b_data + fd->sc_skip, fd->sc_nbytes, NULL, read | DMAMODE_DEMAND, BUS_DMA_NOWAIT); bus_space_write_1(iot, fdc->sc_fdctlioh, 0, type->rate); #ifdef FD_DEBUG printf("fdcintr: %s drive %d track %d head %d sec %d nblks %d\n", read ? 
"read" : "write", fd->sc_drive, fd->sc_cylin, head, sec, nblks); #endif if (finfo) { /* formatting */ if (out_fdc(iot, ioh, NE7CMD_FORMAT) < 0) { fdc->sc_errors = 4; fdcretry(fdc); goto loop; } out_fdc(iot, ioh, (head << 2) | fd->sc_drive); out_fdc(iot, ioh, finfo->fd_formb_secshift); out_fdc(iot, ioh, finfo->fd_formb_nsecs); out_fdc(iot, ioh, finfo->fd_formb_gaplen); out_fdc(iot, ioh, finfo->fd_formb_fillbyte); } else { if (read) out_fdc(iot, ioh, NE7CMD_READ); /* READ */ else out_fdc(iot, ioh, NE7CMD_WRITE); /* WRITE */ out_fdc(iot, ioh, (head << 2) | fd->sc_drive); out_fdc(iot, ioh, fd->sc_cylin); /* track */ out_fdc(iot, ioh, head); out_fdc(iot, ioh, sec + 1); /* sector +1 */ out_fdc(iot, ioh, type->secsize);/* sector size */ out_fdc(iot, ioh, type->sectrac);/* sectors/track */ out_fdc(iot, ioh, type->gap1); /* gap1 size */ out_fdc(iot, ioh, type->datalen);/* data length */ } fdc->sc_state = IOCOMPLETE; disk_busy(&fd->sc_dk); /* allow 2 seconds for operation */ callout_reset(&fdc->sc_timo_ch, 2 * hz, fdctimeout, fdc); return 1; /* will return later */ case SEEKWAIT: callout_stop(&fdc->sc_timo_ch); fdc->sc_state = SEEKCOMPLETE; /* allow 1/50 second for heads to settle */ callout_reset(&fdc->sc_intr_ch, hz / 50, fdcintrcb, fdc); return 1; case SEEKCOMPLETE: /* no data on seek */ disk_unbusy(&fd->sc_dk, 0, 0); /* Make sure seek really happened. */ out_fdc(iot, ioh, NE7CMD_SENSEI); if (fdcresult(fdc) != 2 || (st0 & 0xf8) != 0x20 || cyl != bp->b_cylinder * fd->sc_type->step) { #ifdef FD_DEBUG fdcstatus(fd->sc_dev, 2, "seek failed"); #endif fdcretry(fdc); goto loop; } fd->sc_cylin = bp->b_cylinder; goto doio; case IOTIMEDOUT: isa_dmaabort(fdc->sc_ic, fdc->sc_drq); /* FALLTHROUGH */ case SEEKTIMEDOUT: case RECALTIMEDOUT: case RESETTIMEDOUT: fdcretry(fdc); goto loop; case IOCOMPLETE: /* IO DONE, post-analyze */ callout_stop(&fdc->sc_timo_ch); disk_unbusy(&fd->sc_dk, (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ)); if (fdcresult(fdc) != 7 || (st0 & 0xf8) != 0) { isa_dmaabort(fdc->sc_ic, fdc->sc_drq); #ifdef FD_DEBUG fdcstatus(fd->sc_dev, 7, bp->b_flags & B_READ ? 
"read failed" : "write failed"); printf("blkno %llu nblks %d\n", (unsigned long long)fd->sc_blkno, fd->sc_nblks); #endif fdcretry(fdc); goto loop; } isa_dmadone(fdc->sc_ic, fdc->sc_drq); if (fdc->sc_errors) { diskerr(bp, "fd", "soft error (corrected)", LOG_PRINTF, fd->sc_skip / FDC_BSIZE, NULL); printf("\n"); fdc->sc_errors = 0; } fd->sc_blkno += fd->sc_nblks; fd->sc_skip += fd->sc_nbytes; fd->sc_bcount -= fd->sc_nbytes; if (!finfo && fd->sc_bcount > 0) { bp->b_cylinder = fd->sc_blkno / fd->sc_type->seccyl; goto doseek; } fdfinish(fd, bp); goto loop; case DORESET: /* try a reset, keep motor on */ fd_set_motor(fdc, 1); delay(100); fd_set_motor(fdc, 0); fdc->sc_state = RESETCOMPLETE; callout_reset(&fdc->sc_timo_ch, hz / 2, fdctimeout, fdc); return 1; /* will return later */ case RESETCOMPLETE: callout_stop(&fdc->sc_timo_ch); /* clear the controller output buffer */ for (i = 0; i < 4; i++) { out_fdc(iot, ioh, NE7CMD_SENSEI); (void) fdcresult(fdc); } /* fall through */ case DORECAL: out_fdc(iot, ioh, NE7CMD_RECAL); /* recalibrate function */ out_fdc(iot, ioh, fd->sc_drive); fdc->sc_state = RECALWAIT; callout_reset(&fdc->sc_timo_ch, 5 * hz, fdctimeout, fdc); return 1; /* will return later */ case RECALWAIT: callout_stop(&fdc->sc_timo_ch); fdc->sc_state = RECALCOMPLETE; /* allow 1/30 second for heads to settle */ callout_reset(&fdc->sc_intr_ch, hz / 30, fdcintrcb, fdc); return 1; /* will return later */ case RECALCOMPLETE: out_fdc(iot, ioh, NE7CMD_SENSEI); if (fdcresult(fdc) != 2 || (st0 & 0xf8) != 0x20 || cyl != 0) { #ifdef FD_DEBUG fdcstatus(fd->sc_dev, 2, "recalibrate failed"); #endif fdcretry(fdc); goto loop; } fd->sc_cylin = 0; goto doseek; case MOTORWAIT: if (fd->sc_flags & FD_MOTOR_WAIT) return 1; /* time's not up yet */ goto doseek; default: fdcstatus(fd->sc_dev, 0, "stray interrupt"); return 1; } #undef st0 #undef cyl out: cv_signal(&fdc->sc_cv); return 1; } static void fdcintrcb(void *arg) { (void)fdcintr(arg); } int fdcintr(void *arg) { int rc; struct fdc_softc *fdc = arg; mutex_enter(&fdc->sc_mtx); rc = fdcintr1(fdc); mutex_exit(&fdc->sc_mtx); return rc; } void fdcretry(struct fdc_softc *fdc) { struct fd_softc *fd; struct buf *bp; fd = TAILQ_FIRST(&fdc->sc_drives); bp = bufq_peek(fd->sc_q); if (fd->sc_opts & FDOPT_NORETRY) goto fail; switch (fdc->sc_errors) { case 0: /* try again */ fdc->sc_state = DOSEEK; break; case 1: case 2: case 3: /* didn't work; try recalibrating */ fdc->sc_state = DORECAL; break; case 4: /* still no go; reset the bastard */ fdc->sc_state = DORESET; break; default: fail: if ((fd->sc_opts & FDOPT_SILENT) == 0) { diskerr(bp, "fd", "hard error", LOG_PRINTF, fd->sc_skip / FDC_BSIZE, NULL); fdcpstatus(7, fdc); } bp->b_error = EIO; fdfinish(fd, bp); } fdc->sc_errors++; } int fdioctl(dev_t dev, u_long cmd, void *addr, int flag, struct lwp *l) { struct fd_softc *fd = device_lookup_private(&fd_cd, FDUNIT(dev)); struct fdformat_parms *form_parms; struct fdformat_cmd *form_cmd; struct ne7_fd_formb *fd_formb; struct disklabel *lp = fd->sc_dk.dk_label; int error; unsigned int scratch; int il[FD_MAX_NSEC + 1]; int i, j; #ifdef __HAVE_OLD_DISKLABEL struct disklabel newlabel; #endif switch (cmd) { case DIOCGPARTINFO: case DIOCGDINFO: #ifdef __HAVE_OLD_DISKLABEL case ODIOCGDINFO: #endif memset(lp, 0, sizeof(*lp)); lp->d_type = DKTYPE_FLOPPY; lp->d_secsize = FDC_BSIZE; lp->d_nsectors = fd->sc_type->sectrac; lp->d_ntracks = fd->sc_type->heads; lp->d_ncylinders = fd->sc_type->cyls; lp->d_secpercyl = fd->sc_type->seccyl; lp->d_secperunit = fd->sc_type->size; if (readdisklabel(dev, 
fdstrategy, lp, NULL) != NULL) return EINVAL; break; } error = disk_ioctl(&fd->sc_dk, dev, cmd, addr, flag, l); if (error != EPASSTHROUGH) return error; switch (cmd) { case DIOCWLABEL: if ((flag & FWRITE) == 0) return EBADF; /* XXX do something */ return 0; case DIOCWDINFO: #ifdef __HAVE_OLD_DISKLABEL case ODIOCWDINFO: #endif { if ((flag & FWRITE) == 0) return EBADF; #ifdef __HAVE_OLD_DISKLABEL if (cmd == ODIOCWDINFO) { memset(&newlabel, 0, sizeof newlabel); memcpy(&newlabel, addr, sizeof (struct olddisklabel)); addr = &newlabel; } #endif error = setdisklabel(lp, addr, 0, NULL); if (error) return error; error = writedisklabel(dev, fdstrategy, lp, NULL); return error; } case FDIOCGETFORMAT: form_parms = (struct fdformat_parms *)addr; form_parms->fdformat_version = FDFORMAT_VERSION; form_parms->nbps = 128 * (1 << fd->sc_type->secsize); form_parms->ncyl = fd->sc_type->cyls; form_parms->nspt = fd->sc_type->sectrac; form_parms->ntrk = fd->sc_type->heads; form_parms->stepspercyl = fd->sc_type->step; form_parms->gaplen = fd->sc_type->gap2; form_parms->fillbyte = fd->sc_type->fillbyte; form_parms->interleave = fd->sc_type->interleave; switch (fd->sc_type->rate) { case FDC_500KBPS: form_parms->xfer_rate = 500 * 1024; break; case FDC_300KBPS: form_parms->xfer_rate = 300 * 1024; break; case FDC_250KBPS: form_parms->xfer_rate = 250 * 1024; break; default: return EINVAL; } return 0; case FDIOCSETFORMAT: if((flag & FWRITE) == 0) return EBADF; /* must be opened for writing */ form_parms = (struct fdformat_parms *)addr; if (form_parms->fdformat_version != FDFORMAT_VERSION) return EINVAL; /* wrong version of formatting prog */ scratch = form_parms->nbps >> 7; if ((form_parms->nbps & 0x7f) || ffs(scratch) == 0 || scratch & ~(1 << (ffs(scratch)-1))) /* not a power-of-two multiple of 128 */ return EINVAL; switch (form_parms->xfer_rate) { case 500 * 1024: fd->sc_type->rate = FDC_500KBPS; break; case 300 * 1024: fd->sc_type->rate = FDC_300KBPS; break; case 250 * 1024: fd->sc_type->rate = FDC_250KBPS; break; default: return EINVAL; } if (form_parms->nspt > FD_MAX_NSEC || form_parms->fillbyte > 0xff || form_parms->interleave > 0xff) return EINVAL; fd->sc_type->sectrac = form_parms->nspt; if (form_parms->ntrk != 2 && form_parms->ntrk != 1) return EINVAL; fd->sc_type->heads = form_parms->ntrk; fd->sc_type->seccyl = form_parms->nspt * form_parms->ntrk; fd->sc_type->secsize = ffs(scratch)-1; fd->sc_type->gap2 = form_parms->gaplen; fd->sc_type->cyls = form_parms->ncyl; fd->sc_type->size = fd->sc_type->seccyl * form_parms->ncyl * form_parms->nbps / DEV_BSIZE; fd->sc_type->step = form_parms->stepspercyl; fd->sc_type->fillbyte = form_parms->fillbyte; fd->sc_type->interleave = form_parms->interleave; return 0; case FDIOCFORMAT_TRACK: if((flag & FWRITE) == 0) return EBADF; /* must be opened for writing */ form_cmd = (struct fdformat_cmd *)addr; if (form_cmd->formatcmd_version != FDFORMAT_VERSION) return EINVAL; /* wrong version of formatting prog */ if (form_cmd->head >= fd->sc_type->heads || form_cmd->cylinder >= fd->sc_type->cyls) { return EINVAL; } fd_formb = kmem_alloc(sizeof(*fd_formb), KM_SLEEP); fd_formb->head = form_cmd->head; fd_formb->cyl = form_cmd->cylinder; fd_formb->transfer_rate = fd->sc_type->rate; fd_formb->fd_formb_secshift = fd->sc_type->secsize; fd_formb->fd_formb_nsecs = fd->sc_type->sectrac; fd_formb->fd_formb_gaplen = fd->sc_type->gap2; fd_formb->fd_formb_fillbyte = fd->sc_type->fillbyte; memset(il, 0, sizeof il); for (j = 0, i = 1; i <= fd_formb->fd_formb_nsecs; i++) { while 
(il[(j%fd_formb->fd_formb_nsecs)+1]) j++; il[(j%fd_formb->fd_formb_nsecs)+1] = i; j += fd->sc_type->interleave; } for (i = 0; i < fd_formb->fd_formb_nsecs; i++) { fd_formb->fd_formb_cylno(i) = form_cmd->cylinder; fd_formb->fd_formb_headno(i) = form_cmd->head; fd_formb->fd_formb_secno(i) = il[i+1]; fd_formb->fd_formb_secsize(i) = fd->sc_type->secsize; } error = fdformat(dev, fd_formb, l); kmem_free(fd_formb, sizeof(*fd_formb)); return error; case FDIOCGETOPTS: /* get drive options */ *(int *)addr = fd->sc_opts; return 0; case FDIOCSETOPTS: /* set drive options */ fd->sc_opts = *(int *)addr; return 0; default: return ENOTTY; } #ifdef DIAGNOSTIC panic("fdioctl: impossible"); #endif } int fdformat(dev_t dev, struct ne7_fd_formb *finfo, struct lwp *l) { int rv = 0; struct fd_softc *fd = device_lookup_private(&fd_cd, FDUNIT(dev)); struct fd_type *type = fd->sc_type; struct buf *bp; /* set up a buffer header for fdstrategy() */ bp = getiobuf(NULL, false); if (bp == NULL) return ENOBUFS; bp->b_cflags = BC_BUSY; bp->b_flags = B_PHYS | B_FORMAT; bp->b_proc = l->l_proc; bp->b_dev = dev; /* * calculate a fake blkno, so fdstrategy() would initiate a * seek to the requested cylinder */ bp->b_blkno = (finfo->cyl * (type->sectrac * type->heads) + finfo->head * type->sectrac) * FDC_BSIZE / DEV_BSIZE; bp->b_bcount = sizeof(struct fd_idfield_data) * finfo->fd_formb_nsecs; bp->b_data = (void *)finfo; #ifdef FD_DEBUG printf("fdformat: blkno %" PRIx64 " count %x\n", bp->b_blkno, bp->b_bcount); #endif /* now do the format */ fdstrategy(bp); /* ...and wait for it to complete */ rv = biowait(bp); putiobuf(bp); return rv; } /* * Mountroot hook: prompt the user to enter the root file system * floppy. */ void fd_mountroot_hook(device_t dev) { int c; printf("Insert filesystem floppy and press return."); cnpollc(1); for (;;) { c = cngetc(); if ((c == '\r') || (c == '\n')) { printf("\n"); break; } } cnpollc(0); } static void fd_set_geometry(struct fd_softc *fd) { const struct fd_type *fdt; fdt = fd->sc_type; if (fdt == NULL) { fdt = fd->sc_deftype; if (fdt == NULL) return; } struct disk_geom *dg = &fd->sc_dk.dk_geom; memset(dg, 0, sizeof(*dg)); dg->dg_secperunit = fdt->size; dg->dg_nsectors = fdt->sectrac; switch (fdt->secsize) { case 2: dg->dg_secsize = 512; break; case 3: dg->dg_secsize = 1024; break; default: break; } dg->dg_ntracks = fdt->heads; dg->dg_ncylinders = fdt->cyls; disk_set_info(fd->sc_dev, &fd->sc_dk, NULL); }
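/*
 * Editorial sketch, not part of the driver above: the sector-interleave
 * table built by the FDIOCFORMAT_TRACK case in fdioctl(), lifted into a
 * small user-space program so the resulting track layout can be printed.
 * The loop is the same skip-occupied-slot walk as in the driver; NSECS
 * and INTERLEAVE are example values only (9 sectors per track as in the
 * 720KB entries of fd_types, with a hypothetical 2:1 interleave).
 */
#include <stdio.h>

#define NSECS		9	/* sectors per track for this example */
#define INTERLEAVE	2	/* gap between consecutive logical sectors */

int
main(void)
{
	int il[NSECS + 1] = { 0 };	/* il[1..NSECS]: logical sector per physical slot */
	int i, j;

	for (j = 0, i = 1; i <= NSECS; i++) {
		while (il[(j % NSECS) + 1])	/* slot already taken: advance */
			j++;
		il[(j % NSECS) + 1] = i;
		j += INTERLEAVE;
	}

	/* Prints: 1 6 2 7 3 8 4 9 5 for the values above. */
	printf("physical slot -> logical sector:");
	for (i = 1; i <= NSECS; i++)
		printf(" %d", il[i]);
	printf("\n");
	return 0;
}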
/* $NetBSD: uvm_page.c,v 1.256 2024/03/05 14:33:50 thorpej Exp $ */ /*- * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. * * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_page.c 8.3 (Berkeley) 3/21/94 * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved.
* * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * uvm_page.c: page ops. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.256 2024/03/05 14:33:50 thorpej Exp $"); #include "opt_ddb.h" #include "opt_uvm.h" #include "opt_uvmhist.h" #include "opt_readahead.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/sched.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/proc.h> #include <sys/radixtree.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <ddb/db_active.h> #include <uvm/uvm.h> #include <uvm/uvm_ddb.h> #include <uvm/uvm_pdpolicy.h> #include <uvm/uvm_pgflcache.h> /* * number of pages per-CPU to reserve for the kernel. */ #ifndef UVM_RESERVED_PAGES_PER_CPU #define UVM_RESERVED_PAGES_PER_CPU 5 #endif int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU; /* * physical memory size; */ psize_t physmem; /* * local variables */ /* * these variables record the values returned by vm_page_bootstrap, * for debugging purposes. The implementation of uvm_pageboot_alloc * and pmap_startup here also uses them internally. */ static vaddr_t virtual_space_start; static vaddr_t virtual_space_end; /* * we allocate an initial number of page colors in uvm_page_init(), * and remember them. We may re-color pages as cache sizes are * discovered during the autoconfiguration phase. But we can never * free the initial set of buckets, since they are allocated using * uvm_pageboot_alloc(). */ static size_t recolored_pages_memsize /* = 0 */; static char *recolored_pages_mem; /* * freelist locks - one per bucket. */ union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS] __cacheline_aligned; /* * basic NUMA information. */ static struct uvm_page_numa_region { struct uvm_page_numa_region *next; paddr_t start; paddr_t size; u_int numa_id; } *uvm_page_numa_region; #ifdef DEBUG kmutex_t uvm_zerochecklock __cacheline_aligned; vaddr_t uvm_zerocheckkva; #endif /* DEBUG */ /* * These functions are reserved for uvm(9) internal use and are not * exported in the header file uvm_physseg.h * * Thus they are redefined here. */ void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *); void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t); /* returns a pgs array */ struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t); /* * inline functions */ /* * uvm_pageinsert: insert a page in the object. 
* * => caller must lock object * => call should have already set pg's object and offset pointers * and bumped the version counter */ static inline void uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg) { KASSERT(uobj == pg->uobject); KASSERT(rw_write_held(uobj->vmobjlock)); KASSERT((pg->flags & PG_TABLED) == 0); if ((pg->flags & PG_STAT) != 0) { /* Cannot use uvm_pagegetdirty(): not yet in radix tree. */ const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY); if ((pg->flags & PG_FILE) != 0) { if (uobj->uo_npages == 0) { struct vnode *vp = (struct vnode *)uobj; mutex_enter(vp->v_interlock); KASSERT((vp->v_iflag & VI_PAGES) == 0); vp->v_iflag |= VI_PAGES; vholdl(vp); mutex_exit(vp->v_interlock); } if (UVM_OBJ_IS_VTEXT(uobj)) { cpu_count(CPU_COUNT_EXECPAGES, 1); } cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1); } else { cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1); } } pg->flags |= PG_TABLED; uobj->uo_npages++; } static inline int uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg) { const uint64_t idx = pg->offset >> PAGE_SHIFT; int error; KASSERT(rw_write_held(uobj->vmobjlock)); error = radix_tree_insert_node(&uobj->uo_pages, idx, pg); if (error != 0) { return error; } if ((pg->flags & PG_CLEAN) == 0) { uvm_obj_page_set_dirty(pg); } KASSERT(((pg->flags & PG_CLEAN) == 0) == uvm_obj_page_dirty_p(pg)); return 0; } /* * uvm_page_remove: remove page from object. * * => caller must lock object */ static inline void uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg) { KASSERT(uobj == pg->uobject); KASSERT(rw_write_held(uobj->vmobjlock)); KASSERT(pg->flags & PG_TABLED); if ((pg->flags & PG_STAT) != 0) { /* Cannot use uvm_pagegetdirty(): no longer in radix tree. */ const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY); if ((pg->flags & PG_FILE) != 0) { if (uobj->uo_npages == 1) { struct vnode *vp = (struct vnode *)uobj; mutex_enter(vp->v_interlock); KASSERT((vp->v_iflag & VI_PAGES) != 0); vp->v_iflag &= ~VI_PAGES; holdrelel(vp); mutex_exit(vp->v_interlock); } if (UVM_OBJ_IS_VTEXT(uobj)) { cpu_count(CPU_COUNT_EXECPAGES, -1); } cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1); } else { cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1); } } uobj->uo_npages--; pg->flags &= ~PG_TABLED; pg->uobject = NULL; } static inline void uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg) { struct vm_page *opg __unused; KASSERT(rw_write_held(uobj->vmobjlock)); opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT); KASSERT(pg == opg); } static void uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num) { int i; pgb->pgb_nfree = 0; for (i = 0; i < uvmexp.ncolors; i++) { LIST_INIT(&pgb->pgb_colors[i]); } pgfl->pgfl_buckets[num] = pgb; } /* * uvm_page_init: init the page system. called from uvm_init(). * * => we return the range of kernel virtual memory in kvm_startp/kvm_endp */ void uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp) { static struct uvm_cpu uvm_boot_cpu __cacheline_aligned; psize_t freepages, pagecount, bucketsize, n; struct pgflbucket *pgb; struct vm_page *pagearray; char *bucketarray; uvm_physseg_t bank; int fl, b; KASSERT(ncpu <= 1); /* * init the page queues and free page queue locks, except the * free list; we allocate that later (with the initial vm_page * structures). */ curcpu()->ci_data.cpu_uvm = &uvm_boot_cpu; uvmpdpol_init(); for (b = 0; b < __arraycount(uvm_freelist_locks); b++) { mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM); } /* * allocate vm_page structures. 
*/ /* * sanity check: * before calling this function the MD code is expected to register * some free RAM with the uvm_page_physload() function. our job * now is to allocate vm_page structures for this memory. */ if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID) panic("uvm_page_bootstrap: no memory pre-allocated"); /* * first calculate the number of free pages... * * note that we use start/end rather than avail_start/avail_end. * this allows us to allocate extra vm_page structures in case we * want to return some memory to the pool after booting. */ freepages = 0; for (bank = uvm_physseg_get_first(); uvm_physseg_valid_p(bank) ; bank = uvm_physseg_get_next(bank)) { freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank)); } /* * Let MD code initialize the number of colors, or default * to 1 color if MD code doesn't care. */ if (uvmexp.ncolors == 0) uvmexp.ncolors = 1; uvmexp.colormask = uvmexp.ncolors - 1; KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0); /* We always start with only 1 bucket. */ uvm.bucketcount = 1; /* * we now know we have (PAGE_SIZE * freepages) bytes of memory we can * use. for each page of memory we use we need a vm_page structure. * thus, the total number of pages we can use is the total size of * the memory divided by the PAGE_SIZE plus the size of the vm_page * structure. we add one to freepages as a fudge factor to avoid * truncation errors (since we can only allocate in terms of whole * pages). */ pagecount = ((freepages + 1) << PAGE_SHIFT) / (PAGE_SIZE + sizeof(struct vm_page)); bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]); bucketsize = roundup2(bucketsize, coherency_unit); bucketarray = (void *)uvm_pageboot_alloc( bucketsize * VM_NFREELIST + pagecount * sizeof(struct vm_page)); pagearray = (struct vm_page *) (bucketarray + bucketsize * VM_NFREELIST); for (fl = 0; fl < VM_NFREELIST; fl++) { pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl); uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0); } memset(pagearray, 0, pagecount * sizeof(struct vm_page)); /* * init the freelist cache in the disabled state. */ uvm_pgflcache_init(); /* * init the vm_page structures and put them in the correct place. */ /* First init the extent */ for (bank = uvm_physseg_get_first(), uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount); uvm_physseg_valid_p(bank); bank = uvm_physseg_get_next(bank)) { n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank); uvm_physseg_seg_alloc_from_slab(bank, n); uvm_physseg_init_seg(bank, pagearray); /* set up page array pointers */ pagearray += n; pagecount -= n; } /* * pass up the values of virtual_space_start and * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper * layers of the VM. */ *kvm_startp = round_page(virtual_space_start); *kvm_endp = trunc_page(virtual_space_end); /* * init various thresholds. */ uvmexp.reserve_pagedaemon = 1; uvmexp.reserve_kernel = vm_page_reserve_kernel; /* * done! */ uvm.page_init_done = true; } /* * uvm_pgfl_lock: lock all freelist buckets */ void uvm_pgfl_lock(void) { int i; for (i = 0; i < __arraycount(uvm_freelist_locks); i++) { mutex_spin_enter(&uvm_freelist_locks[i].lock); } } /* * uvm_pgfl_unlock: unlock all freelist buckets */ void uvm_pgfl_unlock(void) { int i; for (i = 0; i < __arraycount(uvm_freelist_locks); i++) { mutex_spin_exit(&uvm_freelist_locks[i].lock); } } /* * uvm_setpagesize: set the page size * * => sets page_shift and page_mask from uvmexp.pagesize. 
*/ void uvm_setpagesize(void) { /* * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE * to be a constant (indicated by being a non-zero value). */ if (uvmexp.pagesize == 0) { if (PAGE_SIZE == 0) panic("uvm_setpagesize: uvmexp.pagesize not set"); uvmexp.pagesize = PAGE_SIZE; } uvmexp.pagemask = uvmexp.pagesize - 1; if ((uvmexp.pagemask & uvmexp.pagesize) != 0) panic("uvm_setpagesize: page size %u (%#x) not a power of two", uvmexp.pagesize, uvmexp.pagesize); for (uvmexp.pageshift = 0; ; uvmexp.pageshift++) if ((1 << uvmexp.pageshift) == uvmexp.pagesize) break; } /* * uvm_pageboot_alloc: steal memory from physmem for bootstrapping */ vaddr_t uvm_pageboot_alloc(vsize_t size) { static bool initialized = false; vaddr_t addr; #if !defined(PMAP_STEAL_MEMORY) vaddr_t vaddr; paddr_t paddr; #endif /* * on first call to this function, initialize ourselves. */ if (initialized == false) { pmap_virtual_space(&virtual_space_start, &virtual_space_end); /* round it the way we like it */ virtual_space_start = round_page(virtual_space_start); virtual_space_end = trunc_page(virtual_space_end); initialized = true; } /* round to page size */ size = round_page(size); uvmexp.bootpages += atop(size); #if defined(PMAP_STEAL_MEMORY) /* * defer bootstrap allocation to MD code (it may want to allocate * from a direct-mapped segment). pmap_steal_memory should adjust * virtual_space_start/virtual_space_end if necessary. */ addr = pmap_steal_memory(size, &virtual_space_start, &virtual_space_end); return addr; #else /* !PMAP_STEAL_MEMORY */ /* * allocate virtual memory for this request */ if (virtual_space_start == virtual_space_end || (virtual_space_end - virtual_space_start) < size) panic("uvm_pageboot_alloc: out of virtual space"); addr = virtual_space_start; #ifdef PMAP_GROWKERNEL /* * If the kernel pmap can't map the requested space, * then allocate more resources for it. */ if (uvm_maxkaddr < (addr + size)) { uvm_maxkaddr = pmap_growkernel(addr + size); if (uvm_maxkaddr < (addr + size)) panic("uvm_pageboot_alloc: pmap_growkernel() failed"); } #endif virtual_space_start += size; /* * allocate and mapin physical pages to back new virtual pages */ for (vaddr = round_page(addr) ; vaddr < addr + size ; vaddr += PAGE_SIZE) { if (!uvm_page_physget(&paddr)) panic("uvm_pageboot_alloc: out of memory"); /* * Note this memory is no longer managed, so using * pmap_kenter is safe. */ pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0); } pmap_update(pmap_kernel()); return addr; #endif /* PMAP_STEAL_MEMORY */ } #if !defined(PMAP_STEAL_MEMORY) /* * uvm_page_physget: "steal" one page from the vm_physmem structure. * * => attempt to allocate it off the end of a segment in which the "avail" * values match the start/end values. if we can't do that, then we * will advance both values (making them equal, and removing some * vm_page structures from the non-avail area). * => return false if out of memory. 
*/ /* subroutine: try to allocate from memory chunks on the specified freelist */ static bool uvm_page_physget_freelist(paddr_t *, int); static bool uvm_page_physget_freelist(paddr_t *paddrp, int freelist) { uvm_physseg_t lcv; /* pass 1: try allocating from a matching end */ #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv)) #else for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv)) #endif { if (uvm.page_init_done == true) panic("uvm_page_physget: called _after_ bootstrap"); /* Try to match at front or back on unused segment */ if (uvm_page_physunload(lcv, freelist, paddrp)) return true; } /* pass2: forget about matching ends, just allocate something */ #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv)) #else for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv)) #endif { /* Try the front regardless. */ if (uvm_page_physunload_force(lcv, freelist, paddrp)) return true; } return false; } bool uvm_page_physget(paddr_t *paddrp) { int i; /* try in the order of freelist preference */ for (i = 0; i < VM_NFREELIST; i++) if (uvm_page_physget_freelist(paddrp, i) == true) return (true); return (false); } #endif /* PMAP_STEAL_MEMORY */ paddr_t uvm_vm_page_to_phys(const struct vm_page *pg) { return pg->phys_addr & ~(PAGE_SIZE - 1); } /* * uvm_page_numa_load: load NUMA range description. */ void uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id) { struct uvm_page_numa_region *d; KASSERT(numa_id < PGFL_MAX_BUCKETS); d = kmem_alloc(sizeof(*d), KM_SLEEP); d->start = start; d->size = size; d->numa_id = numa_id; d->next = uvm_page_numa_region; uvm_page_numa_region = d; } /* * uvm_page_numa_lookup: lookup NUMA node for the given page. */ static u_int uvm_page_numa_lookup(struct vm_page *pg) { struct uvm_page_numa_region *d; static bool warned; paddr_t pa; KASSERT(uvm_page_numa_region != NULL); pa = VM_PAGE_TO_PHYS(pg); for (d = uvm_page_numa_region; d != NULL; d = d->next) { if (pa >= d->start && pa < d->start + d->size) { return d->numa_id; } } if (!warned) { printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#" PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg)); warned = true; } return 0; } /* * uvm_page_redim: adjust freelist dimensions if they have changed. */ static void uvm_page_redim(int newncolors, int newnbuckets) { struct pgfreelist npgfl; struct pgflbucket *opgb, *npgb; struct pgflist *ohead, *nhead; struct vm_page *pg; size_t bucketsize, bucketmemsize, oldbucketmemsize; int fl, ob, oc, nb, nc, obuckets, ocolors; char *bucketarray, *oldbucketmem, *bucketmem; KASSERT(((newncolors - 1) & newncolors) == 0); /* Anything to do? */ if (newncolors <= uvmexp.ncolors && newnbuckets == uvm.bucketcount) { return; } if (uvm.page_init_done == false) { uvmexp.ncolors = newncolors; return; } bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]); bucketsize = roundup2(bucketsize, coherency_unit); bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST + coherency_unit - 1; bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP); bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit); ocolors = uvmexp.ncolors; obuckets = uvm.bucketcount; /* Freelist cache mustn't be enabled. */ uvm_pgflcache_pause(); /* Make sure we should still do this. 
*/ uvm_pgfl_lock(); if (newncolors <= uvmexp.ncolors && newnbuckets == uvm.bucketcount) { uvm_pgfl_unlock(); uvm_pgflcache_resume(); kmem_free(bucketmem, bucketmemsize); return; } uvmexp.ncolors = newncolors; uvmexp.colormask = uvmexp.ncolors - 1; uvm.bucketcount = newnbuckets; for (fl = 0; fl < VM_NFREELIST; fl++) { /* Init new buckets in new freelist. */ memset(&npgfl, 0, sizeof(npgfl)); for (nb = 0; nb < newnbuckets; nb++) { npgb = (struct pgflbucket *)bucketarray; uvm_page_init_bucket(&npgfl, npgb, nb); bucketarray += bucketsize; } /* Now transfer pages from the old freelist. */ for (nb = ob = 0; ob < obuckets; ob++) { opgb = uvm.page_free[fl].pgfl_buckets[ob]; for (oc = 0; oc < ocolors; oc++) { ohead = &opgb->pgb_colors[oc]; while ((pg = LIST_FIRST(ohead)) != NULL) { LIST_REMOVE(pg, pageq.list); /* * Here we decide on the NEW color & * bucket for the page. For NUMA * we'll use the info that the * hardware gave us. For non-NUMA * assign take physical page frame * number and cache color into * account. We do this to try and * avoid defeating any memory * interleaving in the hardware. */ KASSERT( uvm_page_get_bucket(pg) == ob); KASSERT(fl == uvm_page_get_freelist(pg)); if (uvm_page_numa_region != NULL) { nb = uvm_page_numa_lookup(pg); } else { nb = atop(VM_PAGE_TO_PHYS(pg)) / uvmexp.ncolors / 8 % newnbuckets; } uvm_page_set_bucket(pg, nb); npgb = npgfl.pgfl_buckets[nb]; npgb->pgb_nfree++; nc = VM_PGCOLOR(pg); nhead = &npgb->pgb_colors[nc]; LIST_INSERT_HEAD(nhead, pg, pageq.list); } } } /* Install the new freelist. */ memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl)); } /* Unlock and free the old memory. */ oldbucketmemsize = recolored_pages_memsize; oldbucketmem = recolored_pages_mem; recolored_pages_memsize = bucketmemsize; recolored_pages_mem = bucketmem; uvm_pgfl_unlock(); uvm_pgflcache_resume(); if (oldbucketmemsize) { kmem_free(oldbucketmem, oldbucketmemsize); } /* * this calls uvm_km_alloc() which may want to hold * uvm_freelist_lock. */ uvm_pager_realloc_emerg(); } /* * uvm_page_recolor: Recolor the pages if the new color count is * larger than the old one. */ void uvm_page_recolor(int newncolors) { uvm_page_redim(newncolors, uvm.bucketcount); } /* * uvm_page_rebucket: Determine a bucket structure and redim the free * lists to match. */ void uvm_page_rebucket(void) { u_int min_numa, max_numa, npackage, shift; struct cpu_info *ci, *ci2, *ci3; CPU_INFO_ITERATOR cii; /* * If we have more than one NUMA node, and the maximum NUMA node ID * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution * for free pages. */ min_numa = (u_int)-1; max_numa = 0; for (CPU_INFO_FOREACH(cii, ci)) { if (ci->ci_numa_id < min_numa) { min_numa = ci->ci_numa_id; } if (ci->ci_numa_id > max_numa) { max_numa = ci->ci_numa_id; } } if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) { aprint_debug("UVM: using NUMA allocation scheme\n"); for (CPU_INFO_FOREACH(cii, ci)) { ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id; } uvm_page_redim(uvmexp.ncolors, max_numa + 1); return; } /* * Otherwise we'll go with a scheme to maximise L2/L3 cache locality * and minimise lock contention. Count the total number of CPU * packages, and then try to distribute the buckets among CPU * packages evenly. */ npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST]; /* * Figure out how to arrange the packages & buckets, and the total * number of buckets we need. XXX 2 may not be the best factor. 
*/ for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) { npackage >>= 1; } uvm_page_redim(uvmexp.ncolors, npackage); /* * Now tell each CPU which bucket to use. In the outer loop, scroll * through all CPU packages. */ npackage = 0; ci = curcpu(); ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST]; do { /* * In the inner loop, scroll through all CPUs in the package * and assign the same bucket ID. */ ci3 = ci2; do { ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift; ci3 = ci3->ci_sibling[CPUREL_PACKAGE]; } while (ci3 != ci2); npackage++; ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST]; } while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]); aprint_debug("UVM: using package allocation scheme, " "%d package(s) per bucket\n", 1 << shift); } /* * uvm_cpu_attach: initialize per-CPU data structures. */ void uvm_cpu_attach(struct cpu_info *ci) { struct uvm_cpu *ucpu; /* Already done in uvm_page_init(). */ if (!CPU_IS_PRIMARY(ci)) { /* Add more reserve pages for this CPU. */ uvmexp.reserve_kernel += vm_page_reserve_kernel; /* Allocate per-CPU data structures. */ ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1, KM_SLEEP); ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu, coherency_unit); ci->ci_data.cpu_uvm = ucpu; } else { ucpu = ci->ci_data.cpu_uvm; } uvmpdpol_init_cpu(ucpu); } /* * uvm_availmem: fetch the total amount of free memory in pages. this can * have a detrimental effect on performance due to false sharing; don't call * unless needed. * * some users can request the amount of free memory so often that it begins * to impact upon performance. if calling frequently and an inexact value * is okay, call with cached = true. */ int uvm_availmem(bool cached) { int64_t fp; cpu_count_sync(cached); if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) { /* * XXXAD could briefly go negative because it's impossible * to get a clean snapshot. address this for other counters * used as running totals before NetBSD 10 although less * important for those. */ fp = 0; } return (int)fp; } /* * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a * specific freelist and specific bucket only. * * => must be at IPL_VM or higher to protect per-CPU data structures. */ static struct vm_page * uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags) { int c, trycolor, colormask; struct pgflbucket *pgb; struct vm_page *pg; kmutex_t *lock; bool fill; /* * Skip the bucket if empty, no lock needed. There could be many * empty freelists/buckets. */ pgb = uvm.page_free[f].pgfl_buckets[b]; if (pgb->pgb_nfree == 0) { return NULL; } /* Skip bucket if low on memory. */ lock = &uvm_freelist_locks[b].lock; mutex_spin_enter(lock); if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) { if ((flags & UVM_PGA_USERESERVE) == 0 || (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon && curlwp != uvm.pagedaemon_lwp)) { mutex_spin_exit(lock); return NULL; } fill = false; } else { fill = true; } /* Try all page colors as needed. */ c = trycolor = *trycolorp; colormask = uvmexp.colormask; do { pg = LIST_FIRST(&pgb->pgb_colors[c]); if (__predict_true(pg != NULL)) { /* * Got a free page! PG_FREE must be cleared under * lock because of uvm_pglistalloc(). */ LIST_REMOVE(pg, pageq.list); KASSERT(pg->flags == PG_FREE); pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE; pgb->pgb_nfree--; CPU_COUNT(CPU_COUNT_FREEPAGES, -1); /* * While we have the bucket locked and our data * structures fresh in L1 cache, we have an ideal * opportunity to grab some pages for the freelist * cache without causing extra contention. 
Only do * so if we found pages in this CPU's preferred * bucket. */ if (__predict_true(b == ucpu->pgflbucket && fill)) { uvm_pgflcache_fill(ucpu, f, b, c); } mutex_spin_exit(lock); KASSERT(uvm_page_get_bucket(pg) == b); CPU_COUNT(c == trycolor ? CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1); CPU_COUNT(CPU_COUNT_CPUMISS, 1); *trycolorp = c; return pg; } c = (c + 1) & colormask; } while (c != trycolor); mutex_spin_exit(lock); return NULL; } /* * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates * any color from any bucket, in a specific freelist. * * => must be at IPL_VM or higher to protect per-CPU data structures. */ static struct vm_page * uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags) { int b, trybucket, bucketcount; struct vm_page *pg; /* Try for the exact thing in the per-CPU cache. */ if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) { CPU_COUNT(CPU_COUNT_CPUHIT, 1); CPU_COUNT(CPU_COUNT_COLORHIT, 1); return pg; } /* Walk through all buckets, trying our preferred bucket first. */ trybucket = ucpu->pgflbucket; b = trybucket; bucketcount = uvm.bucketcount; do { pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags); if (pg != NULL) { return pg; } b = (b + 1 == bucketcount ? 0 : b + 1); } while (b != trybucket); return NULL; } /* * uvm_pagealloc_strat: allocate vm_page from a particular free list. * * => return null if no pages free * => wake up pagedaemon if number of free pages drops below low water mark * => if obj != NULL, obj must be locked (to put in obj's tree) * => if anon != NULL, anon must be locked (to put in anon) * => only one of obj or anon can be non-null * => caller must activate/deactivate page if it is not wired. * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL. * => policy decision: it is more important to pull a page off of the * appropriate priority free list than it is to get a page from the * correct bucket or color bin. This is because we live with the * consequences of a bad free list decision for the entire * lifetime of the page, e.g. if the page comes from memory that * is slower to access. */ struct vm_page * uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon, int flags, int strat, int free_list) { int color, lcv, error, s; struct uvm_cpu *ucpu; struct vm_page *pg; lwp_t *l; KASSERT(obj == NULL || anon == NULL); KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0); KASSERT(off == trunc_page(off)); KASSERT(obj == NULL || rw_write_held(obj->vmobjlock)); KASSERT(anon == NULL || anon->an_lock == NULL || rw_write_held(anon->an_lock)); /* * This implements a global round-robin page coloring * algorithm. */ s = splvm(); ucpu = curcpu()->ci_data.cpu_uvm; if (flags & UVM_FLAG_COLORMATCH) { color = atop(off) & uvmexp.colormask; } else { color = ucpu->pgflcolor; } /* * fail if any of these conditions is true: * [1] there really are no free pages, or * [2] only kernel "reserved" pages remain and * reserved pages have not been requested. * [3] only pagedaemon "reserved" pages remain and * the requestor isn't the pagedaemon. * we make kernel reserve pages available if called by a * kernel thread. */ l = curlwp; if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) { flags |= UVM_PGA_USERESERVE; } again: switch (strat) { case UVM_PGA_STRAT_NORMAL: /* Check freelists: descending priority (ascending id) order. */ for (lcv = 0; lcv < VM_NFREELIST; lcv++) { pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags); if (pg != NULL) { goto gotit; } } /* No pages free! 
Have pagedaemon free some memory. */ splx(s); uvm_kick_pdaemon(); return NULL; case UVM_PGA_STRAT_ONLY: case UVM_PGA_STRAT_FALLBACK: /* Attempt to allocate from the specified free list. */ KASSERT(free_list >= 0); KASSERT(free_list < VM_NFREELIST); pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags); if (pg != NULL) { goto gotit; } /* Fall back, if possible. */ if (strat == UVM_PGA_STRAT_FALLBACK) { strat = UVM_PGA_STRAT_NORMAL; goto again; } /* No pages free! Have pagedaemon free some memory. */ splx(s); uvm_kick_pdaemon(); return NULL; case UVM_PGA_STRAT_NUMA: /* * NUMA strategy (experimental): allocating from the correct * bucket is more important than observing freelist * priority. Look only to the current NUMA node; if that * fails, we need to look to other NUMA nodes, so retry with * the normal strategy. */ for (lcv = 0; lcv < VM_NFREELIST; lcv++) { pg = uvm_pgflcache_alloc(ucpu, lcv, color); if (pg != NULL) { CPU_COUNT(CPU_COUNT_CPUHIT, 1); CPU_COUNT(CPU_COUNT_COLORHIT, 1); goto gotit; } pg = uvm_pagealloc_pgb(ucpu, lcv, ucpu->pgflbucket, &color, flags); if (pg != NULL) { goto gotit; } } strat = UVM_PGA_STRAT_NORMAL; goto again; default: panic("uvm_pagealloc_strat: bad strat %d", strat); /* NOTREACHED */ } gotit: /* * We now know which color we actually allocated from; set * the next color accordingly. */ ucpu->pgflcolor = (color + 1) & uvmexp.colormask; /* * while still at IPL_VM, update allocation statistics. */ if (anon) { CPU_COUNT(CPU_COUNT_ANONCLEAN, 1); } splx(s); KASSERT(pg->flags == (PG_BUSY|PG_CLEAN|PG_FAKE)); /* * assign the page to the object. as the page was free, we know * that pg->uobject and pg->uanon are NULL. we only need to take * the page's interlock if we are changing the values. */ if (anon != NULL || obj != NULL) { mutex_enter(&pg->interlock); } pg->offset = off; pg->uobject = obj; pg->uanon = anon; KASSERT(uvm_page_owner_locked_p(pg, true)); if (anon) { anon->an_page = pg; pg->flags |= PG_ANON; mutex_exit(&pg->interlock); } else if (obj) { /* * set PG_FILE|PG_AOBJ before the first uvm_pageinsert. */ if (UVM_OBJ_IS_VNODE(obj)) { pg->flags |= PG_FILE; } else if (UVM_OBJ_IS_AOBJ(obj)) { pg->flags |= PG_AOBJ; } uvm_pageinsert_object(obj, pg); mutex_exit(&pg->interlock); error = uvm_pageinsert_tree(obj, pg); if (error != 0) { mutex_enter(&pg->interlock); uvm_pageremove_object(obj, pg); mutex_exit(&pg->interlock); uvm_pagefree(pg); return NULL; } } #if defined(UVM_PAGE_TRKOWN) pg->owner_tag = NULL; #endif UVM_PAGE_OWN(pg, "new alloc"); if (flags & UVM_PGA_ZERO) { /* A zero'd page is not clean. 
*/ if (obj != NULL || anon != NULL) { uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); } pmap_zero_page(VM_PAGE_TO_PHYS(pg)); } return(pg); } /* * uvm_pagereplace: replace a page with another * * => object must be locked * => page interlocks must be held */ void uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg) { struct uvm_object *uobj = oldpg->uobject; struct vm_page *pg __diagused; uint64_t idx; KASSERT((oldpg->flags & PG_TABLED) != 0); KASSERT(uobj != NULL); KASSERT((newpg->flags & PG_TABLED) == 0); KASSERT(newpg->uobject == NULL); KASSERT(rw_write_held(uobj->vmobjlock)); KASSERT(mutex_owned(&oldpg->interlock)); KASSERT(mutex_owned(&newpg->interlock)); newpg->uobject = uobj; newpg->offset = oldpg->offset; idx = newpg->offset >> PAGE_SHIFT; pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg); KASSERT(pg == oldpg); if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) { if ((newpg->flags & PG_CLEAN) != 0) { uvm_obj_page_clear_dirty(newpg); } else { uvm_obj_page_set_dirty(newpg); } } /* * oldpg's PG_STAT is stable. newpg is not reachable by others yet. */ newpg->flags |= (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT); uvm_pageinsert_object(uobj, newpg); uvm_pageremove_object(uobj, oldpg); } /* * uvm_pagerealloc: reallocate a page from one object to another * * => both objects must be locked */ int uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff) { int error = 0; /* * remove it from the old object */ if (pg->uobject) { uvm_pageremove_tree(pg->uobject, pg); uvm_pageremove_object(pg->uobject, pg); } /* * put it in the new object */ if (newobj) { mutex_enter(&pg->interlock); pg->uobject = newobj; pg->offset = newoff; if (UVM_OBJ_IS_VNODE(newobj)) { pg->flags |= PG_FILE; } else if (UVM_OBJ_IS_AOBJ(newobj)) { pg->flags |= PG_AOBJ; } uvm_pageinsert_object(newobj, pg); mutex_exit(&pg->interlock); error = uvm_pageinsert_tree(newobj, pg); if (error != 0) { mutex_enter(&pg->interlock); uvm_pageremove_object(newobj, pg); mutex_exit(&pg->interlock); } } return error; } /* * uvm_pagefree: free page * * => erase page's identity (i.e. remove from object) * => put page on free list * => caller must lock owning object (either anon or uvm_object) * => assumes all valid mappings of pg are gone */ void uvm_pagefree(struct vm_page *pg) { struct pgfreelist *pgfl; struct pgflbucket *pgb; struct uvm_cpu *ucpu; kmutex_t *lock; int bucket, s; bool locked; #ifdef DEBUG if (pg->uobject == (void *)0xdeadbeef && pg->uanon == (void *)0xdeadbeef) { panic("uvm_pagefree: freeing free page %p", pg); } #endif /* DEBUG */ KASSERT((pg->flags & PG_PAGEOUT) == 0); KASSERT(!(pg->flags & PG_FREE)); KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock)); KASSERT(pg->uobject != NULL || pg->uanon == NULL || rw_write_held(pg->uanon->an_lock)); /* * remove the page from the object's tree before acquiring any page * interlocks: this can acquire locks to free radixtree nodes. */ if (pg->uobject != NULL) { uvm_pageremove_tree(pg->uobject, pg); } /* * if the page is loaned, resolve the loan instead of freeing. */ if (pg->loan_count) { KASSERT(pg->wire_count == 0); /* * if the page is owned by an anon then we just want to * drop anon ownership. the kernel will free the page when * it is done with it. if the page is owned by an object, * remove it from the object and mark it dirty for the benefit * of possible anon owners. * * regardless of previous ownership, wakeup any waiters, * unbusy the page, and we're done. 
*/ uvm_pagelock(pg); locked = true; if (pg->uobject != NULL) { uvm_pageremove_object(pg->uobject, pg); pg->flags &= ~(PG_FILE|PG_AOBJ); } else if (pg->uanon != NULL) { if ((pg->flags & PG_ANON) == 0) { pg->loan_count--; } else { const unsigned status = uvm_pagegetdirty(pg); pg->flags &= ~PG_ANON; cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1); } pg->uanon->an_page = NULL; pg->uanon = NULL; } if (pg->pqflags & PQ_WANTED) { wakeup(pg); } pg->pqflags &= ~PQ_WANTED; pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1); #ifdef UVM_PAGE_TRKOWN pg->owner_tag = NULL; #endif KASSERT((pg->flags & PG_STAT) == 0); if (pg->loan_count) { KASSERT(pg->uobject == NULL); if (pg->uanon == NULL) { uvm_pagedequeue(pg); } uvm_pageunlock(pg); return; } } else if (pg->uobject != NULL || pg->uanon != NULL || pg->wire_count != 0) { uvm_pagelock(pg); locked = true; } else { locked = false; } /* * remove page from its object or anon. */ if (pg->uobject != NULL) { uvm_pageremove_object(pg->uobject, pg); } else if (pg->uanon != NULL) { const unsigned int status = uvm_pagegetdirty(pg); pg->uanon->an_page = NULL; pg->uanon = NULL; cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1); } /* * if the page was wired, unwire it now. */ if (pg->wire_count) { pg->wire_count = 0; atomic_dec_uint(&uvmexp.wired); } if (locked) { /* * wake anyone waiting on the page. */ if ((pg->pqflags & PQ_WANTED) != 0) { pg->pqflags &= ~PQ_WANTED; wakeup(pg); } /* * now remove the page from the queues. */ uvm_pagedequeue(pg); uvm_pageunlock(pg); } else { KASSERT(!uvmpdpol_pageisqueued_p(pg)); } /* * and put on free queue */ #ifdef DEBUG pg->uobject = (void *)0xdeadbeef; pg->uanon = (void *)0xdeadbeef; #endif /* DEBUG */ /* Try to send the page to the per-CPU cache. */ s = splvm(); ucpu = curcpu()->ci_data.cpu_uvm; bucket = uvm_page_get_bucket(pg); if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) { splx(s); return; } /* Didn't work. Never mind, send it to a global bucket. */ pgfl = &uvm.page_free[uvm_page_get_freelist(pg)]; pgb = pgfl->pgfl_buckets[bucket]; lock = &uvm_freelist_locks[bucket].lock; mutex_spin_enter(lock); /* PG_FREE must be set under lock because of uvm_pglistalloc(). */ pg->flags = PG_FREE; LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list); pgb->pgb_nfree++; CPU_COUNT(CPU_COUNT_FREEPAGES, 1); mutex_spin_exit(lock); splx(s); } /* * uvm_page_unbusy: unbusy an array of pages. * * => pages must either all belong to the same object, or all belong to anons. * => if pages are object-owned, object must be locked. * => if pages are anon-owned, anons must be locked. * => caller must make sure that anon-owned pages are not PG_RELEASED. 
*/ void uvm_page_unbusy(struct vm_page **pgs, int npgs) { struct vm_page *pg; int i, pageout_done; UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); pageout_done = 0; for (i = 0; i < npgs; i++) { pg = pgs[i]; if (pg == NULL || pg == PGO_DONTCARE) { continue; } KASSERT(uvm_page_owner_locked_p(pg, true)); KASSERT(pg->flags & PG_BUSY); if (pg->flags & PG_PAGEOUT) { pg->flags &= ~PG_PAGEOUT; pg->flags |= PG_RELEASED; pageout_done++; atomic_inc_uint(&uvmexp.pdfreed); } if (pg->flags & PG_RELEASED) { UVMHIST_LOG(ubchist, "releasing pg %#jx", (uintptr_t)pg, 0, 0, 0); KASSERT(pg->uobject != NULL || (pg->uanon != NULL && pg->uanon->an_ref > 0)); pg->flags &= ~PG_RELEASED; uvm_pagefree(pg); } else { UVMHIST_LOG(ubchist, "unbusying pg %#jx", (uintptr_t)pg, 0, 0, 0); KASSERT((pg->flags & PG_FAKE) == 0); pg->flags &= ~PG_BUSY; uvm_pagelock(pg); uvm_pagewakeup(pg); uvm_pageunlock(pg); UVM_PAGE_OWN(pg, NULL); } } if (pageout_done != 0) { uvm_pageout_done(pageout_done); } } /* * uvm_pagewait: wait for a busy page * * => page must be known PG_BUSY * => object must be read or write locked * => object will be unlocked on return */ void uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg) { KASSERT(rw_lock_held(lock)); KASSERT((pg->flags & PG_BUSY) != 0); KASSERT(uvm_page_owner_locked_p(pg, false)); mutex_enter(&pg->interlock); pg->pqflags |= PQ_WANTED; rw_exit(lock); UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0); } /* * uvm_pagewakeup: wake anyone waiting on a page * * => page interlock must be held */ void uvm_pagewakeup(struct vm_page *pg) { UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); KASSERT(mutex_owned(&pg->interlock)); UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0); if ((pg->pqflags & PQ_WANTED) != 0) { wakeup(pg); pg->pqflags &= ~PQ_WANTED; } } /* * uvm_pagewanted_p: return true if someone is waiting on the page * * => object must be write locked (lock out all concurrent access) */ bool uvm_pagewanted_p(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, true)); return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0; } #if defined(UVM_PAGE_TRKOWN) /* * uvm_page_own: set or release page ownership * * => this is a debugging function that keeps track of who sets PG_BUSY * and where they do it. it can be used to track down problems * such a process setting "PG_BUSY" and never releasing it. * => page's object [if any] must be locked * => if "tag" is NULL then we are releasing page ownership */ void uvm_page_own(struct vm_page *pg, const char *tag) { KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0); KASSERT(uvm_page_owner_locked_p(pg, true)); /* gain ownership? 
*/ if (tag) { KASSERT((pg->flags & PG_BUSY) != 0); if (pg->owner_tag) { printf("uvm_page_own: page %p already owned " "by proc %d.%d [%s]\n", pg, pg->owner, pg->lowner, pg->owner_tag); panic("uvm_page_own"); } pg->owner = curproc->p_pid; pg->lowner = curlwp->l_lid; pg->owner_tag = tag; return; } /* drop ownership */ KASSERT((pg->flags & PG_BUSY) == 0); if (pg->owner_tag == NULL) { printf("uvm_page_own: dropping ownership of an non-owned " "page (%p)\n", pg); panic("uvm_page_own"); } pg->owner_tag = NULL; } #endif /* * uvm_pagelookup: look up a page * * => caller should lock object to keep someone from pulling the page * out from under it */ struct vm_page * uvm_pagelookup(struct uvm_object *obj, voff_t off) { struct vm_page *pg; KASSERT(db_active || rw_lock_held(obj->vmobjlock)); pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT); KASSERT(pg == NULL || obj->uo_npages != 0); KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 || (pg->flags & PG_BUSY) != 0); return pg; } /* * uvm_pagewire: wire the page, thus removing it from the daemon's grasp * * => caller must lock objects * => caller must hold pg->interlock */ void uvm_pagewire(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, true)); KASSERT(mutex_owned(&pg->interlock)); #if defined(READAHEAD_STATS) if ((pg->flags & PG_READAHEAD) != 0) { uvm_ra_hit.ev_count++; pg->flags &= ~PG_READAHEAD; } #endif /* defined(READAHEAD_STATS) */ if (pg->wire_count == 0) { uvm_pagedequeue(pg); atomic_inc_uint(&uvmexp.wired); } pg->wire_count++; KASSERT(pg->wire_count > 0); /* detect wraparound */ } /* * uvm_pageunwire: unwire the page. * * => activate if wire count goes to zero. * => caller must lock objects * => caller must hold pg->interlock */ void uvm_pageunwire(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, true)); KASSERT(pg->wire_count != 0); KASSERT(!uvmpdpol_pageisqueued_p(pg)); KASSERT(mutex_owned(&pg->interlock)); pg->wire_count--; if (pg->wire_count == 0) { uvm_pageactivate(pg); KASSERT(uvmexp.wired != 0); atomic_dec_uint(&uvmexp.wired); } } /* * uvm_pagedeactivate: deactivate page * * => caller must lock objects * => caller must check to make sure page is not wired * => object that page belongs to must be locked (so we can adjust pg->flags) * => caller must clear the reference on the page before calling * => caller must hold pg->interlock */ void uvm_pagedeactivate(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, false)); KASSERT(mutex_owned(&pg->interlock)); if (pg->wire_count == 0) { KASSERT(uvmpdpol_pageisqueued_p(pg)); uvmpdpol_pagedeactivate(pg); } } /* * uvm_pageactivate: activate page * * => caller must lock objects * => caller must hold pg->interlock */ void uvm_pageactivate(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, false)); KASSERT(mutex_owned(&pg->interlock)); #if defined(READAHEAD_STATS) if ((pg->flags & PG_READAHEAD) != 0) { uvm_ra_hit.ev_count++; pg->flags &= ~PG_READAHEAD; } #endif /* defined(READAHEAD_STATS) */ if (pg->wire_count == 0) { uvmpdpol_pageactivate(pg); } } /* * uvm_pagedequeue: remove a page from any paging queue * * => caller must lock objects * => caller must hold pg->interlock */ void uvm_pagedequeue(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, true)); KASSERT(mutex_owned(&pg->interlock)); if (uvmpdpol_pageisqueued_p(pg)) { uvmpdpol_pagedequeue(pg); } } /* * uvm_pageenqueue: add a page to a paging queue without activating. * used where a page is not really demanded (yet). eg. 
read-ahead * * => caller must lock objects * => caller must hold pg->interlock */ void uvm_pageenqueue(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, false)); KASSERT(mutex_owned(&pg->interlock)); if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) { uvmpdpol_pageenqueue(pg); } } /* * uvm_pagelock: acquire page interlock */ void uvm_pagelock(struct vm_page *pg) { mutex_enter(&pg->interlock); } /* * uvm_pagelock2: acquire two page interlocks */ void uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2) { if (pg1 < pg2) { mutex_enter(&pg1->interlock); mutex_enter(&pg2->interlock); } else { mutex_enter(&pg2->interlock); mutex_enter(&pg1->interlock); } } /* * uvm_pageunlock: release page interlock, and if a page replacement intent * is set on the page, pass it to uvmpdpol to make real. * * => caller must hold pg->interlock */ void uvm_pageunlock(struct vm_page *pg) { if ((pg->pqflags & PQ_INTENT_SET) == 0 || (pg->pqflags & PQ_INTENT_QUEUED) != 0) { mutex_exit(&pg->interlock); return; } pg->pqflags |= PQ_INTENT_QUEUED; mutex_exit(&pg->interlock); uvmpdpol_pagerealize(pg); } /* * uvm_pageunlock2: release two page interlocks, and for both pages if a * page replacement intent is set on the page, pass it to uvmpdpol to make * real. * * => caller must hold pg->interlock */ void uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2) { if ((pg1->pqflags & PQ_INTENT_SET) == 0 || (pg1->pqflags & PQ_INTENT_QUEUED) != 0) { mutex_exit(&pg1->interlock); pg1 = NULL; } else { pg1->pqflags |= PQ_INTENT_QUEUED; mutex_exit(&pg1->interlock); } if ((pg2->pqflags & PQ_INTENT_SET) == 0 || (pg2->pqflags & PQ_INTENT_QUEUED) != 0) { mutex_exit(&pg2->interlock); pg2 = NULL; } else { pg2->pqflags |= PQ_INTENT_QUEUED; mutex_exit(&pg2->interlock); } if (pg1 != NULL) { uvmpdpol_pagerealize(pg1); } if (pg2 != NULL) { uvmpdpol_pagerealize(pg2); } } /* * uvm_pagezero: zero fill a page * * => if page is part of an object then the object should be locked * to protect pg->flags. */ void uvm_pagezero(struct vm_page *pg) { uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); pmap_zero_page(VM_PAGE_TO_PHYS(pg)); } /* * uvm_pagecopy: copy a page * * => if page is part of an object then the object should be locked * to protect pg->flags. */ void uvm_pagecopy(struct vm_page *src, struct vm_page *dst) { uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY); pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst)); } /* * uvm_pageismanaged: test it see that a page (specified by PA) is managed. */ bool uvm_pageismanaged(paddr_t pa) { return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID); } /* * uvm_page_lookup_freelist: look up the free list for the specified page */ int uvm_page_lookup_freelist(struct vm_page *pg) { uvm_physseg_t upm; upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL); KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID); return uvm_physseg_get_free_list(upm); } /* * uvm_page_owner_locked_p: return true if object associated with page is * locked. this is a weak check for runtime assertions only. */ bool uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive) { if (pg->uobject != NULL) { return exclusive ? rw_write_held(pg->uobject->vmobjlock) : rw_lock_held(pg->uobject->vmobjlock); } if (pg->uanon != NULL) { return exclusive ? 
rw_write_held(pg->uanon->an_lock) : rw_lock_held(pg->uanon->an_lock); } return true; } /* * uvm_pagereadonly_p: return if the page should be mapped read-only */ bool uvm_pagereadonly_p(struct vm_page *pg) { struct uvm_object * const uobj = pg->uobject; KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock)); KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock)); if ((pg->flags & PG_RDONLY) != 0) { return true; } if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) { return true; } if (uobj == NULL) { return false; } return UVM_OBJ_NEEDS_WRITEFAULT(uobj); } #ifdef PMAP_DIRECT /* * Call pmap to translate physical address into a virtual and to run a callback * for it. Used to avoid actually mapping the pages, pmap most likely uses direct map * or equivalent. */ int uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len, int (*process)(void *, size_t, void *), void *arg) { int error = 0; paddr_t pa; size_t todo; voff_t pgoff = (off & PAGE_MASK); struct vm_page *pg; KASSERT(npages > 0); KASSERT(len > 0); for (int i = 0; i < npages; i++) { pg = pgs[i]; KASSERT(len > 0); /* * Caller is responsible for ensuring all the pages are * available. */ KASSERT(pg != NULL); KASSERT(pg != PGO_DONTCARE); pa = VM_PAGE_TO_PHYS(pg); todo = MIN(len, PAGE_SIZE - pgoff); error = pmap_direct_process(pa, pgoff, todo, process, arg); if (error) break; pgoff = 0; len -= todo; } KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len); return error; } #endif /* PMAP_DIRECT */ #if defined(DDB) || defined(DEBUGPRINT) /* * uvm_page_printit: actually print the page */ static const char page_flagbits[] = UVM_PGFLAGBITS; static const char page_pqflagbits[] = UVM_PQFLAGBITS; void uvm_page_printit(struct vm_page *pg, bool full, void (*pr)(const char *, ...)) { struct vm_page *tpg; struct uvm_object *uobj; struct pgflbucket *pgb; struct pgflist *pgl; char pgbuf[128]; (*pr)("PAGE %p:\n", pg); snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags); (*pr)(" flags=%s\n", pgbuf); snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags); (*pr)(" pqflags=%s\n", pgbuf); (*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n", pg->uobject, pg->uanon, (long long)pg->offset); (*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n", pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg), uvm_page_get_freelist(pg)); (*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg)); #if defined(UVM_PAGE_TRKOWN) if (pg->flags & PG_BUSY) (*pr)(" owning process = %d.%d, tag=%s\n", pg->owner, pg->lowner, pg->owner_tag); else (*pr)(" page not busy, no owner\n"); #else (*pr)(" [page ownership tracking disabled]\n"); #endif if (!full) return; /* cross-verify object/anon */ if ((pg->flags & PG_FREE) == 0) { if (pg->flags & PG_ANON) { if (pg->uanon == NULL || pg->uanon->an_page != pg) (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n", (pg->uanon) ? pg->uanon->an_page : NULL); else (*pr)(" anon backpointer is OK\n"); } else { uobj = pg->uobject; if (uobj) { (*pr)(" checking object list\n"); tpg = uvm_pagelookup(uobj, pg->offset); if (tpg) (*pr)(" page found on object list\n"); else (*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n"); } } } /* cross-verify page queue */ if (pg->flags & PG_FREE) { int fl = uvm_page_get_freelist(pg); int b = uvm_page_get_bucket(pg); pgb = uvm.page_free[fl].pgfl_buckets[b]; pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)]; (*pr)(" checking pageq list\n"); LIST_FOREACH(tpg, pgl, pageq.list) { if (tpg == pg) { break; } } if (tpg) (*pr)(" page found on pageq list\n"); else (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! 
<<<\n"); } } /* * uvm_page_printall - print a summary of all managed pages */ void uvm_page_printall(void (*pr)(const char *, ...)) { uvm_physseg_t i; paddr_t pfn; struct vm_page *pg; (*pr)("%18s %4s %4s %18s %18s" #ifdef UVM_PAGE_TRKOWN " OWNER" #endif "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON"); for (i = uvm_physseg_get_first(); uvm_physseg_valid_p(i); i = uvm_physseg_get_next(i)) { for (pfn = uvm_physseg_get_start(i); pfn < uvm_physseg_get_end(i); pfn++) { pg = PHYS_TO_VM_PAGE(ptoa(pfn)); (*pr)("%18p %04x %08x %18p %18p", pg, pg->flags, pg->pqflags, pg->uobject, pg->uanon); #ifdef UVM_PAGE_TRKOWN if (pg->flags & PG_BUSY) (*pr)(" %d [%s]", pg->owner, pg->owner_tag); #endif (*pr)("\n"); } } } /* * uvm_page_print_freelists - print a summary freelists */ void uvm_page_print_freelists(void (*pr)(const char *, ...)) { struct pgfreelist *pgfl; struct pgflbucket *pgb; int fl, b, c; (*pr)("There are %d freelists with %d buckets of %d colors.\n\n", VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors); for (fl = 0; fl < VM_NFREELIST; fl++) { pgfl = &uvm.page_free[fl]; (*pr)("freelist(%d) @ %p\n", fl, pgfl); for (b = 0; b < uvm.bucketcount; b++) { pgb = uvm.page_free[fl].pgfl_buckets[b]; (*pr)(" bucket(%d) @ %p, nfree = %d, lock @ %p:\n", b, pgb, pgb->pgb_nfree, &uvm_freelist_locks[b].lock); for (c = 0; c < uvmexp.ncolors; c++) { (*pr)(" color(%d) @ %p, ", c, &pgb->pgb_colors[c]); (*pr)("first page = %p\n", LIST_FIRST(&pgb->pgb_colors[c])); } } } } #endif /* DDB || DEBUGPRINT */
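/*
 * Editor's sketch (not part of the NetBSD sources above): a minimal,
 * self-contained userland program illustrating the free-page placement
 * arithmetic described in uvm_page.c.  The color computation assumes the
 * common definition of a page's color as the low bits of its page frame
 * number, and the bucket computation mirrors the non-NUMA expression used
 * in uvm_page_redim().  The page size (4 KiB), color count (8) and bucket
 * count (4) chosen below are illustrative assumptions only.
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_PAGE_SHIFT	12	/* assumed 4 KiB pages */

static unsigned
sketch_color(uint64_t pa, unsigned ncolors)
{
	/* page color: low bits of the page frame number (ncolors is a power of 2) */
	return (unsigned)((pa >> SKETCH_PAGE_SHIFT) & (ncolors - 1));
}

static unsigned
sketch_bucket(uint64_t pa, unsigned ncolors, unsigned nbuckets)
{
	/*
	 * Non-NUMA bucket choice, mirroring uvm_page_redim(): scale the
	 * frame number down by the color count and a factor of 8 so that
	 * runs of consecutive pages land in the same bucket, then wrap
	 * around the available buckets.
	 */
	return (unsigned)((pa >> SKETCH_PAGE_SHIFT) / ncolors / 8 % nbuckets);
}

int
main(void)
{
	const unsigned ncolors = 8, nbuckets = 4;
	uint64_t pa;

	/* walk a small range of physical addresses, one page at a time */
	for (pa = 0; pa < (uint64_t)64 * 4096; pa += 4096) {
		printf("pa=%#10llx color=%u bucket=%u\n",
		    (unsigned long long)pa,
		    sketch_color(pa, ncolors),
		    sketch_bucket(pa, ncolors, nbuckets));
	}
	return 0;
}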
/*	$NetBSD: uipc_syscalls.c,v 1.211 2024/02/03 19:05:14 jdolecek Exp $	*/

/*-
 * Copyright (c) 2008, 2009, 2023 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.6 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uipc_syscalls.c,v 1.211 2024/02/03 19:05:14 jdolecek Exp $"); #ifdef _KERNEL_OPT #include "opt_pipe.h" #include "opt_sctp.h" #endif #define MBUFTYPES #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/buf.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/signalvar.h> #include <sys/un.h> #include <sys/ktrace.h> #include <sys/event.h> #include <sys/atomic.h> #include <sys/kauth.h> #ifdef SCTP #include <netinet/sctp_uio.h> #include <netinet/sctp_peeloff.h> #endif #include <sys/mount.h> #include <sys/syscallargs.h> /* * System call interface to the socket abstraction. */ extern const struct fileops socketops; static int sockargs_sb(struct sockaddr_big *, const void *, socklen_t); int sys___socket30(struct lwp *l, const struct sys___socket30_args *uap, register_t *retval) { /* { syscallarg(int) domain; syscallarg(int) type; syscallarg(int) protocol; } */ int fd, error; file_t *fp; error = fsocreate(SCARG(uap, domain), NULL, SCARG(uap, type), SCARG(uap, protocol), &fd, &fp, NULL); if (error == 0) { fd_affix(l->l_proc, fp, fd); *retval = fd; } return error; } int sys_bind(struct lwp *l, const struct sys_bind_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(const struct sockaddr *) name; syscallarg(unsigned int) namelen; } */ int error; struct sockaddr_big sb; error = sockargs_sb(&sb, SCARG(uap, name), SCARG(uap, namelen)); if (error) return error; return do_sys_bind(l, SCARG(uap, s), (struct sockaddr *)&sb); } int do_sys_bind(struct lwp *l, int fd, struct sockaddr *nam) { struct socket *so; int error; if ((error = fd_getsock(fd, &so)) != 0) return error; error = sobind(so, nam, l); fd_putfile(fd); return error; } int sys_listen(struct lwp *l, const struct sys_listen_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(int) backlog; } */ struct socket *so; int error; if ((error = fd_getsock(SCARG(uap, s), &so)) != 0) return (error); error = solisten(so, SCARG(uap, backlog), l); fd_putfile(SCARG(uap, s)); return error; } int do_sys_accept(struct lwp *l, int sock, struct sockaddr *name, register_t *new_sock, const sigset_t *mask, int flags, int clrflags) { file_t *fp, *fp2; int error, fd; struct socket *so, *so2; short wakeup_state = 0; if ((fp = fd_getfile(sock)) == NULL) return EBADF; if (fp->f_type != DTYPE_SOCKET) { fd_putfile(sock); return ENOTSOCK; } if ((error = fd_allocfile(&fp2, &fd)) != 0) { fd_putfile(sock); return error; } *new_sock = fd; so = fp->f_socket; solock(so); if (__predict_false(mask)) sigsuspendsetup(l, mask); if (!(so->so_proto->pr_flags & PR_LISTEN)) { error = EOPNOTSUPP; goto bad; } if ((so->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto bad; } if ((so->so_state & SS_NBIO) && so->so_qlen == 0) { error = EWOULDBLOCK; goto bad; } while (so->so_qlen == 0 && so->so_error == 0) { if 
(so->so_state & SS_CANTRCVMORE) { so->so_error = ECONNABORTED; break; } if (wakeup_state & SS_RESTARTSYS) { error = ERESTART; goto bad; } error = sowait(so, true, 0); if (error) { goto bad; } wakeup_state = so->so_state; } if (so->so_error) { error = so->so_error; so->so_error = 0; goto bad; } /* connection has been removed from the listen queue */ KNOTE(&so->so_rcv.sb_sel.sel_klist, NOTE_SUBMIT); so2 = TAILQ_FIRST(&so->so_q); if (soqremque(so2, 1) == 0) panic("accept"); fp2->f_type = DTYPE_SOCKET; fp2->f_flag = (fp->f_flag & ~clrflags) | ((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0)| ((flags & SOCK_NOSIGPIPE) ? FNOSIGPIPE : 0); fp2->f_ops = &socketops; fp2->f_socket = so2; if (fp2->f_flag & FNONBLOCK) so2->so_state |= SS_NBIO; else so2->so_state &= ~SS_NBIO; error = soaccept(so2, name); so2->so_cred = kauth_cred_hold(so->so_cred); sounlock(so); if (error) { /* an error occurred, free the file descriptor and mbuf */ mutex_enter(&fp2->f_lock); fp2->f_count++; mutex_exit(&fp2->f_lock); closef(fp2); fd_abort(curproc, NULL, fd); } else { fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0); fd_affix(curproc, fp2, fd); } fd_putfile(sock); if (__predict_false(mask)) sigsuspendteardown(l); return error; bad: sounlock(so); fd_putfile(sock); fd_abort(curproc, fp2, fd); if (__predict_false(mask)) sigsuspendteardown(l); return error; } int sys_accept(struct lwp *l, const struct sys_accept_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(struct sockaddr *) name; syscallarg(unsigned int *) anamelen; } */ int error, fd; struct sockaddr_big name; name.sb_len = UCHAR_MAX; error = do_sys_accept(l, SCARG(uap, s), (struct sockaddr *)&name, retval, NULL, 0, 0); if (error != 0) return error; error = copyout_sockname_sb(SCARG(uap, name), SCARG(uap, anamelen), MSG_LENUSRSPACE, &name); if (error != 0) { fd = (int)*retval; if (fd_getfile(fd) != NULL) (void)fd_close(fd); } return error; } int sys_paccept(struct lwp *l, const struct sys_paccept_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(struct sockaddr *) name; syscallarg(unsigned int *) anamelen; syscallarg(const sigset_t *) mask; syscallarg(int) flags; } */ int error, fd; struct sockaddr_big name; sigset_t *mask, amask; if (SCARG(uap, mask) != NULL) { error = copyin(SCARG(uap, mask), &amask, sizeof(amask)); if (error) return error; mask = &amask; } else mask = NULL; name.sb_len = UCHAR_MAX; error = do_sys_accept(l, SCARG(uap, s), (struct sockaddr *)&name, retval, mask, SCARG(uap, flags), FNONBLOCK); if (error != 0) return error; error = copyout_sockname_sb(SCARG(uap, name), SCARG(uap, anamelen), MSG_LENUSRSPACE, &name); if (error != 0) { fd = (int)*retval; if (fd_getfile(fd) != NULL) (void)fd_close(fd); } return error; } int sys_connect(struct lwp *l, const struct sys_connect_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(const struct sockaddr *) name; syscallarg(unsigned int) namelen; } */ int error; struct sockaddr_big sbig; error = sockargs_sb(&sbig, SCARG(uap, name), SCARG(uap, namelen)); if (error) return error; return do_sys_connect(l, SCARG(uap, s), (struct sockaddr *)&sbig); } int do_sys_connect(struct lwp *l, int fd, struct sockaddr *nam) { struct socket *so; int error; int interrupted = 0; if ((error = fd_getsock(fd, &so)) != 0) { return (error); } solock(so); if ((so->so_state & SS_ISCONNECTING) != 0) { error = EALREADY; goto out; } error = soconnect(so, nam, l); if (error) goto bad; if ((so->so_state & (SS_NBIO|SS_ISCONNECTING)) == (SS_NBIO|SS_ISCONNECTING)) { error = EINPROGRESS; goto out; } 
while ((so->so_state & SS_ISCONNECTING) != 0 && so->so_error == 0) { error = sowait(so, true, 0); if (__predict_false((so->so_state & SS_ISABORTING) != 0)) { error = EPIPE; interrupted = 1; break; } if (error) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; out: sounlock(so); fd_putfile(fd); return error; } int sys_socketpair(struct lwp *l, const struct sys_socketpair_args *uap, register_t *retval) { /* { syscallarg(int) domain; syscallarg(int) type; syscallarg(int) protocol; syscallarg(int *) rsv; } */ file_t *fp1, *fp2; struct socket *so1, *so2; int fd, error, sv[2]; proc_t *p = curproc; int flags = SCARG(uap, type) & SOCK_FLAGS_MASK; int type = SCARG(uap, type) & ~SOCK_FLAGS_MASK; int domain = SCARG(uap, domain); int proto = SCARG(uap, protocol); error = fsocreate(domain, &so1, type|flags, proto, &fd, &fp1, NULL); if (error) return error; sv[0] = fd; error = fsocreate(domain, &so2, type|flags, proto, &fd, &fp2, so1); if (error) goto out; sv[1] = fd; solock(so1); error = soconnect2(so1, so2); if (error == 0 && type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); } sounlock(so1); if (error == 0) error = copyout(sv, SCARG(uap, rsv), sizeof(sv)); if (error == 0) { fd_affix(p, fp2, sv[1]); fd_affix(p, fp1, sv[0]); return 0; } fd_abort(p, fp2, sv[1]); (void)soclose(so2); out: fd_abort(p, fp1, sv[0]); (void)soclose(so1); return error; } int sys_sendto(struct lwp *l, const struct sys_sendto_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(const void *) buf; syscallarg(size_t) len; syscallarg(int) flags; syscallarg(const struct sockaddr *) to; syscallarg(unsigned int) tolen; } */ struct msghdr msg = {0}; struct iovec aiov; msg.msg_name = __UNCONST(SCARG(uap, to)); /* XXXUNCONST kills const */ msg.msg_namelen = SCARG(uap, tolen); msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_flags = 0; aiov.iov_base = __UNCONST(SCARG(uap, buf)); /* XXXUNCONST kills const */ aiov.iov_len = SCARG(uap, len); return do_sys_sendmsg(l, SCARG(uap, s), &msg, SCARG(uap, flags), retval); } int sys_sendmsg(struct lwp *l, const struct sys_sendmsg_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(const struct msghdr *) msg; syscallarg(int) flags; } */ struct msghdr msg; int error; error = copyin(SCARG(uap, msg), &msg, sizeof(msg)); if (error) return (error); msg.msg_flags = MSG_IOVUSRSPACE; return do_sys_sendmsg(l, SCARG(uap, s), &msg, SCARG(uap, flags), retval); } int do_sys_sendmsg_so(struct lwp *l, int s, struct socket *so, file_t *fp, struct msghdr *mp, int flags, register_t *retsize) { struct iovec aiov[UIO_SMALLIOV], *iov = aiov, *tiov, *ktriov = NULL; struct sockaddr *sa = NULL; struct mbuf *to, *control; struct uio auio; size_t len, iovsz; int i, error; ktrkuser("msghdr", mp, sizeof(*mp)); /* If the caller passed us stuff in mbufs, we must free them. */ to = (mp->msg_flags & MSG_NAMEMBUF) ? mp->msg_name : NULL; control = (mp->msg_flags & MSG_CONTROLMBUF) ? 
mp->msg_control : NULL; iovsz = mp->msg_iovlen * sizeof(struct iovec); if (mp->msg_flags & MSG_IOVUSRSPACE) { if ((unsigned int)mp->msg_iovlen > UIO_SMALLIOV) { if ((unsigned int)mp->msg_iovlen > IOV_MAX) { error = EMSGSIZE; goto bad; } iov = kmem_alloc(iovsz, KM_SLEEP); } if (mp->msg_iovlen != 0) { error = copyin(mp->msg_iov, iov, iovsz); if (error) goto bad; } auio.uio_iov = iov; } else auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_rw = UIO_WRITE; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; KASSERT(l == curlwp); auio.uio_vmspace = l->l_proc->p_vmspace; tiov = auio.uio_iov; for (i = 0; i < auio.uio_iovcnt; i++, tiov++) { /* * Writes return ssize_t because -1 is returned on error. * Therefore, we must restrict the length to SSIZE_MAX to * avoid garbage return values. */ auio.uio_resid += tiov->iov_len; if (tiov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) { error = EINVAL; goto bad; } } if (mp->msg_name && to == NULL) { error = sockargs(&to, mp->msg_name, mp->msg_namelen, UIO_USERSPACE, MT_SONAME); if (error) goto bad; } if (mp->msg_control) { if (mp->msg_controllen < CMSG_ALIGN(sizeof(struct cmsghdr))) { error = EINVAL; goto bad; } if (control == NULL) { error = sockargs(&control, mp->msg_control, mp->msg_controllen, UIO_USERSPACE, MT_CONTROL); if (error) goto bad; } } if (ktrpoint(KTR_GENIO) && iovsz > 0) { ktriov = kmem_alloc(iovsz, KM_SLEEP); memcpy(ktriov, auio.uio_iov, iovsz); } if (mp->msg_name) MCLAIM(to, so->so_mowner); if (mp->msg_control) MCLAIM(control, so->so_mowner); if (to) { sa = mtod(to, struct sockaddr *); } len = auio.uio_resid; error = (*so->so_send)(so, sa, &auio, NULL, control, flags, l); /* Protocol is responsible for freeing 'control' */ control = NULL; if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE && (fp->f_flag & FNOSIGPIPE) == 0 && (flags & MSG_NOSIGNAL) == 0) { mutex_enter(&proc_lock); psignal(l->l_proc, SIGPIPE); mutex_exit(&proc_lock); } } if (error == 0) *retsize = len - auio.uio_resid; bad: if (ktriov != NULL) { ktrgeniov(s, UIO_WRITE, ktriov, *retsize, error); kmem_free(ktriov, iovsz); } if (iov != aiov) kmem_free(iov, iovsz); if (to) m_freem(to); if (control) m_freem(control); return error; } int do_sys_sendmsg(struct lwp *l, int s, struct msghdr *mp, int flags, register_t *retsize) { int error; struct socket *so; file_t *fp; if ((error = fd_getsock1(s, &so, &fp)) != 0) { /* We have to free msg_name and msg_control ourselves */ if (mp->msg_flags & MSG_NAMEMBUF) m_freem(mp->msg_name); if (mp->msg_flags & MSG_CONTROLMBUF) m_freem(mp->msg_control); return error; } error = do_sys_sendmsg_so(l, s, so, fp, mp, flags, retsize); /* msg_name and msg_control freed */ fd_putfile(s); return error; } int sys_recvfrom(struct lwp *l, const struct sys_recvfrom_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(void *) buf; syscallarg(size_t) len; syscallarg(int) flags; syscallarg(struct sockaddr *) from; syscallarg(unsigned int *) fromlenaddr; } */ struct msghdr msg = {0}; struct iovec aiov; int error; struct mbuf *from; msg.msg_name = NULL; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, len); msg.msg_control = NULL; msg.msg_flags = SCARG(uap, flags) & MSG_USERFLAGS; error = do_sys_recvmsg(l, SCARG(uap, s), &msg, &from, NULL, retval); if (error != 0) return error; error = copyout_sockname(SCARG(uap, from), SCARG(uap, fromlenaddr), MSG_LENUSRSPACE, from); if (from != NULL) 
m_free(from); return error; } int sys_recvmsg(struct lwp *l, const struct sys_recvmsg_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(struct msghdr *) msg; syscallarg(int) flags; } */ struct msghdr msg; int error; struct mbuf *from, *control; error = copyin(SCARG(uap, msg), &msg, sizeof(msg)); if (error) return error; msg.msg_flags = (SCARG(uap, flags) & MSG_USERFLAGS) | MSG_IOVUSRSPACE; error = do_sys_recvmsg(l, SCARG(uap, s), &msg, &from, msg.msg_control != NULL ? &control : NULL, retval); if (error != 0) return error; if (msg.msg_control != NULL) error = copyout_msg_control(l, &msg, control); if (error == 0) error = copyout_sockname(msg.msg_name, &msg.msg_namelen, 0, from); if (from != NULL) m_free(from); if (error == 0) { ktrkuser("msghdr", &msg, sizeof(msg)); error = copyout(&msg, SCARG(uap, msg), sizeof(msg)); } return error; } int sys_sendmmsg(struct lwp *l, const struct sys_sendmmsg_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(struct mmsghdr *) mmsg; syscallarg(unsigned int) vlen; syscallarg(unsigned int) flags; } */ struct mmsghdr mmsg; struct socket *so; file_t *fp; struct msghdr *msg = &mmsg.msg_hdr; int error, s; unsigned int vlen, flags, dg; s = SCARG(uap, s); if ((error = fd_getsock1(s, &so, &fp)) != 0) return error; vlen = SCARG(uap, vlen); if (vlen > 1024) vlen = 1024; flags = (SCARG(uap, flags) & MSG_USERFLAGS) | MSG_IOVUSRSPACE; for (dg = 0; dg < vlen;) { error = copyin(SCARG(uap, mmsg) + dg, &mmsg, sizeof(mmsg)); if (error) break; msg->msg_flags = flags; error = do_sys_sendmsg_so(l, s, so, fp, msg, flags, retval); if (error) break; ktrkuser("msghdr", msg, sizeof(*msg)); mmsg.msg_len = *retval; error = copyout(&mmsg, SCARG(uap, mmsg) + dg, sizeof(mmsg)); if (error) break; dg++; } *retval = dg; fd_putfile(s); /* * If we succeeded at least once, return 0. */ if (dg) return 0; return error; } /* * Adjust for a truncated SCM_RIGHTS control message. * This means closing any file descriptors that aren't present * in the returned buffer. * m is the mbuf holding the (already externalized) SCM_RIGHTS message. 
*/ static void free_rights(struct mbuf *m) { struct cmsghdr *cm; int *fdv; unsigned int nfds, i; KASSERT(sizeof(*cm) <= m->m_len); cm = mtod(m, struct cmsghdr *); KASSERT(CMSG_ALIGN(sizeof(*cm)) <= cm->cmsg_len); KASSERT(cm->cmsg_len <= m->m_len); nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int); fdv = (int *)CMSG_DATA(cm); for (i = 0; i < nfds; i++) if (fd_getfile(fdv[i]) != NULL) (void)fd_close(fdv[i]); } void free_control_mbuf(struct lwp *l, struct mbuf *control, struct mbuf *uncopied) { struct mbuf *next; struct cmsghdr *cmsg; bool do_free_rights = false; while (control != NULL) { cmsg = mtod(control, struct cmsghdr *); if (control == uncopied) do_free_rights = true; if (do_free_rights && cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) free_rights(control); next = control->m_next; m_free(control); control = next; } } /* Copy socket control/CMSG data to user buffer, frees the mbuf */ int copyout_msg_control(struct lwp *l, struct msghdr *mp, struct mbuf *control) { int i, len, error = 0; struct cmsghdr *cmsg; struct mbuf *m; char *q; len = mp->msg_controllen; if (len <= 0 || control == 0) { mp->msg_controllen = 0; free_control_mbuf(l, control, control); return 0; } q = (char *)mp->msg_control; for (m = control; m != NULL; ) { cmsg = mtod(m, struct cmsghdr *); i = m->m_len; if (len < i) { mp->msg_flags |= MSG_CTRUNC; if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) /* Do not truncate me ... */ break; i = len; } error = copyout(mtod(m, void *), q, i); ktrkuser(mbuftypes[MT_CONTROL], cmsg, cmsg->cmsg_len); if (error != 0) { /* We must free all the SCM_RIGHTS */ m = control; break; } m = m->m_next; if (m) i = ALIGN(i); q += i; len -= i; if (len <= 0) break; } free_control_mbuf(l, control, m); mp->msg_controllen = q - (char *)mp->msg_control; return error; } int do_sys_recvmsg_so(struct lwp *l, int s, struct socket *so, struct msghdr *mp, struct mbuf **from, struct mbuf **control, register_t *retsize) { struct iovec aiov[UIO_SMALLIOV], *iov = aiov, *tiov, *ktriov = NULL; struct uio auio; size_t len, iovsz; int i, error; ktrkuser("msghdr", mp, sizeof(*mp)); *from = NULL; if (control != NULL) *control = NULL; iovsz = mp->msg_iovlen * sizeof(struct iovec); if (mp->msg_flags & MSG_IOVUSRSPACE) { if ((unsigned int)mp->msg_iovlen > UIO_SMALLIOV) { if ((unsigned int)mp->msg_iovlen > IOV_MAX) { error = EMSGSIZE; goto out; } iov = kmem_alloc(iovsz, KM_SLEEP); } if (mp->msg_iovlen != 0) { error = copyin(mp->msg_iov, iov, iovsz); if (error) goto out; } auio.uio_iov = iov; } else auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_rw = UIO_READ; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; KASSERT(l == curlwp); auio.uio_vmspace = l->l_proc->p_vmspace; tiov = auio.uio_iov; for (i = 0; i < auio.uio_iovcnt; i++, tiov++) { /* * Reads return ssize_t because -1 is returned on error. * Therefore we must restrict the length to SSIZE_MAX to * avoid garbage return values. 
*/ auio.uio_resid += tiov->iov_len; if (tiov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) { error = EINVAL; goto out; } } if (ktrpoint(KTR_GENIO) && iovsz > 0) { ktriov = kmem_alloc(iovsz, KM_SLEEP); memcpy(ktriov, auio.uio_iov, iovsz); } len = auio.uio_resid; mp->msg_flags &= MSG_USERFLAGS; error = (*so->so_receive)(so, from, &auio, NULL, control, &mp->msg_flags); KASSERT(*from == NULL || (*from)->m_next == NULL); len -= auio.uio_resid; *retsize = len; if (error != 0 && len != 0 && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) /* Some data transferred */ error = 0; if (ktriov != NULL) { ktrgeniov(s, UIO_READ, ktriov, len, error); kmem_free(ktriov, iovsz); } if (error != 0) { m_freem(*from); *from = NULL; if (control != NULL) { free_control_mbuf(l, *control, *control); *control = NULL; } } out: if (iov != aiov) kmem_free(iov, iovsz); return error; } int do_sys_recvmsg(struct lwp *l, int s, struct msghdr *mp, struct mbuf **from, struct mbuf **control, register_t *retsize) { int error; struct socket *so; if ((error = fd_getsock(s, &so)) != 0) return error; error = do_sys_recvmsg_so(l, s, so, mp, from, control, retsize); fd_putfile(s); return error; } int sys_recvmmsg(struct lwp *l, const struct sys_recvmmsg_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(struct mmsghdr *) mmsg; syscallarg(unsigned int) vlen; syscallarg(unsigned int) flags; syscallarg(struct timespec *) timeout; } */ struct mmsghdr mmsg; struct socket *so; struct msghdr *msg = &mmsg.msg_hdr; int error, s; struct mbuf *from, *control; struct timespec ts, now; unsigned int vlen, flags, dg; if (SCARG(uap, timeout)) { if ((error = copyin(SCARG(uap, timeout), &ts, sizeof(ts))) != 0) return error; if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000L) return EINVAL; getnanotime(&now); if (timespecaddok(&now, &ts)) { timespecadd(&now, &ts, &ts); } else { ts.tv_sec = __type_max(time_t); ts.tv_nsec = 999999999L; } } s = SCARG(uap, s); if ((error = fd_getsock(s, &so)) != 0) return error; /* * If so->so_rerror holds a deferred error return it now. */ if (so->so_rerror) { error = so->so_rerror; so->so_rerror = 0; fd_putfile(s); return error; } vlen = SCARG(uap, vlen); if (vlen > 1024) vlen = 1024; from = NULL; flags = (SCARG(uap, flags) & MSG_USERFLAGS) | MSG_IOVUSRSPACE; for (dg = 0; dg < vlen;) { error = copyin(SCARG(uap, mmsg) + dg, &mmsg, sizeof(mmsg)); if (error) break; msg->msg_flags = flags & ~MSG_WAITFORONE; if (from != NULL) { m_free(from); from = NULL; } error = do_sys_recvmsg_so(l, s, so, msg, &from, msg->msg_control != NULL ? &control : NULL, retval); if (error) { if (error == EAGAIN && dg > 0) error = 0; break; } if (msg->msg_control != NULL) error = copyout_msg_control(l, msg, control); if (error) break; error = copyout_sockname(msg->msg_name, &msg->msg_namelen, 0, from); if (error) break; ktrkuser("msghdr", msg, sizeof *msg); mmsg.msg_len = *retval; error = copyout(&mmsg, SCARG(uap, mmsg) + dg, sizeof(mmsg)); if (error) break; dg++; if (msg->msg_flags & MSG_OOB) break; if (SCARG(uap, timeout)) { getnanotime(&now); if (timespeccmp(&ts, &now, <)) break; } if (flags & MSG_WAITFORONE) flags |= MSG_DONTWAIT; } if (from != NULL) m_free(from); *retval = dg; /* * If we succeeded at least once, return 0, hopefully so->so_rerror * will catch it next time. 
*/ if (error && dg > 0) { so->so_rerror = error; error = 0; } fd_putfile(s); return error; } int sys_shutdown(struct lwp *l, const struct sys_shutdown_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(int) how; } */ struct socket *so; int error; if ((error = fd_getsock(SCARG(uap, s), &so)) != 0) return error; solock(so); error = soshutdown(so, SCARG(uap, how)); sounlock(so); fd_putfile(SCARG(uap, s)); return error; } int sys_setsockopt(struct lwp *l, const struct sys_setsockopt_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(int) level; syscallarg(int) name; syscallarg(const void *) val; syscallarg(unsigned int) valsize; } */ struct sockopt sopt; struct socket *so; file_t *fp; int error; unsigned int len; len = SCARG(uap, valsize); if (len > 0 && SCARG(uap, val) == NULL) return EINVAL; if (len > MCLBYTES) return EINVAL; if ((error = fd_getsock1(SCARG(uap, s), &so, &fp)) != 0) return (error); sockopt_init(&sopt, SCARG(uap, level), SCARG(uap, name), len); if (len > 0) { error = copyin(SCARG(uap, val), sopt.sopt_data, len); if (error) goto out; } error = sosetopt(so, &sopt); if (so->so_options & SO_NOSIGPIPE) atomic_or_uint(&fp->f_flag, FNOSIGPIPE); else atomic_and_uint(&fp->f_flag, ~FNOSIGPIPE); out: sockopt_destroy(&sopt); fd_putfile(SCARG(uap, s)); return error; } static int getsockopt(struct lwp *l, const struct sys_getsockopt_args *uap, register_t *retval, bool copyarg) { struct sockopt sopt; struct socket *so; file_t *fp; unsigned int valsize, len; int error; if (SCARG(uap, val) != NULL) { error = copyin(SCARG(uap, avalsize), &valsize, sizeof(valsize)); if (error) return error; } else valsize = 0; if (valsize > MCLBYTES) return EINVAL; if ((error = fd_getsock1(SCARG(uap, s), &so, &fp)) != 0) return error; sockopt_init(&sopt, SCARG(uap, level), SCARG(uap, name), valsize); if (copyarg && valsize > 0) { error = copyin(SCARG(uap, val), sopt.sopt_data, valsize); if (error) goto out; } if (fp->f_flag & FNOSIGPIPE) so->so_options |= SO_NOSIGPIPE; else so->so_options &= ~SO_NOSIGPIPE; error = sogetopt(so, &sopt); if (error || valsize == 0) goto out; len = uimin(valsize, sopt.sopt_retsize); error = copyout(sopt.sopt_data, SCARG(uap, val), len); if (error) goto out; error = copyout(&len, SCARG(uap, avalsize), sizeof(len)); out: sockopt_destroy(&sopt); fd_putfile(SCARG(uap, s)); return error; } int sys_getsockopt(struct lwp *l, const struct sys_getsockopt_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(int) level; syscallarg(int) name; syscallarg(void *) val; syscallarg(unsigned int *) avalsize; } */ return getsockopt(l, uap, retval, false); } int sys_getsockopt2(struct lwp *l, const struct sys_getsockopt2_args *uap, register_t *retval) { /* { syscallarg(int) s; syscallarg(int) level; syscallarg(int) name; syscallarg(void *) val; syscallarg(unsigned int *) avalsize; } */ return getsockopt(l, (const struct sys_getsockopt_args *) uap, retval, true); } #ifdef PIPE_SOCKETPAIR int pipe1(struct lwp *l, int *fildes, int flags) { file_t *rf, *wf; struct socket *rso, *wso; int error, soflags = 0; unsigned rfd, wfd; proc_t *p = l->l_proc; if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE)) return EINVAL; if (flags & O_CLOEXEC) soflags |= SOCK_CLOEXEC; if (flags & O_NONBLOCK) soflags |= SOCK_NONBLOCK; if (flags & O_NOSIGPIPE) soflags |= SOCK_NOSIGPIPE; error = fsocreate(AF_LOCAL, &rso, SOCK_STREAM|soflags, 0, &rfd, &rf, NULL); if (error) goto free1; error = fsocreate(AF_LOCAL, &wso, SOCK_STREAM|soflags, 0, &wfd, &wf, rso); if (error) goto free2; /* make 
sure the descriptors are uni-directional */ rf->f_type = rf->f_type & ~(FWRITE); wf->f_type = wf->f_type & ~(FREAD); /* remember this socket pair implements a pipe */ rso->so_state |= SS_ISAPIPE; wso->so_state |= SS_ISAPIPE; solock(wso); /* * Pipes must be readable when there is at least 1 * byte of data available in the receive buffer. * * Pipes must be writable when there is space for * at least PIPE_BUF bytes in the send buffer. * If we're increasing the low water mark for the * send buffer, then mimic how soreserve() would * have set the high water mark. */ rso->so_rcv.sb_lowat = 1; if (wso->so_snd.sb_lowat < PIPE_BUF) { wso->so_snd.sb_hiwat = PIPE_BUF * 2; } wso->so_snd.sb_lowat = PIPE_BUF; error = unp_connect2(wso, rso); sounlock(wso); if (error != 0) goto free3; fd_affix(p, wf, wfd); fd_affix(p, rf, rfd); fildes[0] = rfd; fildes[1] = wfd; return (0); free3: (void)soclose(wso); fd_abort(p, wf, wfd); free2: (void)soclose(rso); fd_abort(p, rf, rfd); free1: return error; } #endif /* PIPE_SOCKETPAIR */ /* * Get peer socket name. */ int do_sys_getpeername(int fd, struct sockaddr *nam) { struct socket *so; int error; if ((error = fd_getsock(fd, &so)) != 0) return error; solock(so); if ((so->so_state & SS_ISCONNECTED) == 0) error = ENOTCONN; else { error = (*so->so_proto->pr_usrreqs->pr_peeraddr)(so, nam); } sounlock(so); fd_putfile(fd); return error; } /* * Get local socket name. */ int do_sys_getsockname(int fd, struct sockaddr *nam) { struct socket *so; int error; if ((error = fd_getsock(fd, &so)) != 0) return error; solock(so); error = (*so->so_proto->pr_usrreqs->pr_sockaddr)(so, nam); sounlock(so); fd_putfile(fd); return error; } int copyout_sockname_sb(struct sockaddr *asa, unsigned int *alen, int flags, struct sockaddr_big *addr) { unsigned int len; int error; if (asa == NULL) /* Assume application not interested */ return 0; if (flags & MSG_LENUSRSPACE) { error = copyin(alen, &len, sizeof(len)); if (error) return error; } else len = *alen; if (addr == NULL) { len = 0; error = 0; } else { if (len > addr->sb_len) len = addr->sb_len; /* XXX addr isn't an mbuf... */ ktrkuser(mbuftypes[MT_SONAME], addr, len); error = copyout(addr, asa, len); } if (error == 0) { if (flags & MSG_LENUSRSPACE) error = copyout(&len, alen, sizeof(len)); else *alen = len; } return error; } int copyout_sockname(struct sockaddr *asa, unsigned int *alen, int flags, struct mbuf *addr) { int len; int error; if (asa == NULL) /* Assume application not interested */ return 0; if (flags & MSG_LENUSRSPACE) { error = copyin(alen, &len, sizeof(len)); if (error) return error; } else len = *alen; if (len < 0) return EINVAL; if (addr == NULL) { len = 0; error = 0; } else { if (len > addr->m_len) len = addr->m_len; /* Maybe this ought to copy a chain ? */ ktrkuser(mbuftypes[MT_SONAME], mtod(addr, void *), len); error = copyout(mtod(addr, void *), asa, len); } if (error == 0) { if (flags & MSG_LENUSRSPACE) error = copyout(&len, alen, sizeof(len)); else *alen = len; } return error; } /* * Get socket name. */ int sys_getsockname(struct lwp *l, const struct sys_getsockname_args *uap, register_t *retval) { /* { syscallarg(int) fdes; syscallarg(struct sockaddr *) asa; syscallarg(unsigned int *) alen; } */ struct sockaddr_big sbig; int error; sbig.sb_len = UCHAR_MAX; error = do_sys_getsockname(SCARG(uap, fdes), (struct sockaddr *)&sbig); if (error != 0) return error; error = copyout_sockname_sb(SCARG(uap, asa), SCARG(uap, alen), MSG_LENUSRSPACE, &sbig); return error; } /* * Get name of peer for connected socket. 
*/ int sys_getpeername(struct lwp *l, const struct sys_getpeername_args *uap, register_t *retval) { /* { syscallarg(int) fdes; syscallarg(struct sockaddr *) asa; syscallarg(unsigned int *) alen; } */ struct sockaddr_big sbig; int error; sbig.sb_len = UCHAR_MAX; error = do_sys_getpeername(SCARG(uap, fdes), (struct sockaddr *)&sbig); if (error != 0) return error; error = copyout_sockname_sb(SCARG(uap, asa), SCARG(uap, alen), MSG_LENUSRSPACE, &sbig); return error; } static int sockargs_sb(struct sockaddr_big *sb, const void *name, socklen_t buflen) { int error; /* * We can't allow socket names > UCHAR_MAX in length, since that * will overflow sb_len. Further no reasonable buflen is <= * offsetof(sockaddr_big, sb_data) since it shall be at least * the size of the preamble sb_len and sb_family members. */ if (buflen > UCHAR_MAX || buflen <= offsetof(struct sockaddr_big, sb_data)) return EINVAL; error = copyin(name, (void *)sb, buflen); if (error) return error; ktrkuser(mbuftypes[MT_SONAME], sb, buflen); #if BYTE_ORDER != BIG_ENDIAN /* * 4.3BSD compat thing - need to stay, since bind(2), * connect(2), sendto(2) were not versioned for COMPAT_43. */ if (sb->sb_family == 0 && sb->sb_len < AF_MAX) sb->sb_family = sb->sb_len; #endif sb->sb_len = buflen; return 0; } /* * XXX In a perfect world, we wouldn't pass around socket control * XXX arguments in mbufs, and this could go away. */ int sockargs(struct mbuf **mp, const void *bf, size_t buflen, enum uio_seg seg, int type) { struct mbuf *m; int error; /* * We can't allow socket names > UCHAR_MAX in length, since that * will overflow sa_len. Control data more than a page size in * length is just too much. */ if (buflen > (type == MT_SONAME ? UCHAR_MAX : PAGE_SIZE)) return EINVAL; /* * length must greater than sizeof(sa_family) + sizeof(sa_len) */ if (type == MT_SONAME && buflen <= 2) return EINVAL; /* Allocate an mbuf to hold the arguments. */ m = m_get(M_WAIT, type); /* can't claim. don't who to assign it to. */ if (buflen > MLEN) { /* * Won't fit into a regular mbuf, so we allocate just * enough external storage to hold the argument. */ MEXTMALLOC(m, buflen, M_WAITOK); } m->m_len = buflen; if (seg == UIO_USERSPACE) { error = copyin(bf, mtod(m, void *), buflen); if (error) { (void)m_free(m); return error; } } else { memcpy(mtod(m, void *), bf, buflen); } *mp = m; switch (type) { case MT_SONAME: ktrkuser(mbuftypes[type], mtod(m, void *), buflen); struct sockaddr *sa = mtod(m, struct sockaddr *); #if BYTE_ORDER != BIG_ENDIAN /* * 4.3BSD compat thing - need to stay, since bind(2), * connect(2), sendto(2) were not versioned for COMPAT_43. 
	 */
	if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
		sa->sa_family = sa->sa_len;
#endif
	sa->sa_len = buflen;
	return 0;

    case MT_CONTROL:
	if (!KTRPOINT(curproc, KTR_USER))
		return 0;

	struct msghdr mhdr;
	mhdr.msg_control = mtod(m, void *);
	mhdr.msg_controllen = buflen;
	for (struct cmsghdr *cmsg = CMSG_FIRSTHDR(&mhdr); cmsg;
	    cmsg = CMSG_NXTHDR(&mhdr, cmsg)) {
		KASSERT(((char *)cmsg - mtod(m, char *)) <= buflen);
		if (cmsg->cmsg_len >
		    buflen - ((char *)cmsg - mtod(m, char *)))
			break;
		ktrkuser(mbuftypes[type], cmsg, cmsg->cmsg_len);
	}
	return 0;

    default:
	return EINVAL;
    }
}

int
do_sys_peeloff(struct socket *head, void *data)
{
#ifdef SCTP
	/*file_t *lfp = NULL;*/
	file_t *nfp = NULL;
	int error;
	struct socket *so;
	int fd;
	uint32_t name;
	/*short fflag;*/		/* type must match fp->f_flag */

	name = *(uint32_t *)data;
	error = sctp_can_peel_off(head, name);
	if (error) {
		printf("peeloff failed\n");
		return error;
	}

	/*
	 * At this point we know we do have an assoc to pull, so we
	 * proceed to get the fd set up.  This may block, but that is ok.
	 */
	error = fd_allocfile(&nfp, &fd);
	if (error) {
		/*
		 * Probably ran out of file descriptors.  Put the
		 * unaccepted connection back onto the queue and
		 * do another wakeup so some other process might
		 * have a chance at it.
		 */
		return error;
	}
	*(int *)data = fd;

	so = sctp_get_peeloff(head, name, &error);
	if (so == NULL) {
		/*
		 * Either someone else peeled it off, or we can't get a
		 * socket.  Close the new descriptor, assuming someone
		 * hasn't ripped it out from under us.
		 */
		mutex_enter(&nfp->f_lock);
		nfp->f_count++;
		mutex_exit(&nfp->f_lock);
		fd_abort(curlwp->l_proc, nfp, fd);
		return error;
	}

	so->so_state &= ~SS_NOFDREF;
	so->so_state &= ~SS_ISCONNECTING;
	so->so_head = NULL;
	so->so_cred = kauth_cred_hold(head->so_cred);
	nfp->f_socket = so;
	nfp->f_flag = FREAD|FWRITE;
	nfp->f_ops = &socketops;
	nfp->f_type = DTYPE_SOCKET;
	fd_affix(curlwp->l_proc, nfp, fd);

	return error;
#else
	return EOPNOTSUPP;
#endif
}
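/*
 * Illustrative userland sketch (not part of this file): how the
 * sys_paccept()/do_sys_accept() path above is typically driven from user
 * space.  It assumes the paccept(2) prototype declared in <sys/socket.h>
 * (s, name, anamelen, sigmask, flags), matching the syscall arguments
 * handled above; the helper name accept_quietly() is hypothetical.  The
 * SOCK_CLOEXEC/SOCK_NONBLOCK bits in the flags argument are applied to the
 * new descriptor by do_sys_accept(), while EWOULDBLOCK is only returned
 * when the *listening* socket itself is non-blocking and the queue is
 * empty.  Error handling is abbreviated.
 */
#include <sys/socket.h>
#include <signal.h>
#include <errno.h>

static int
accept_quietly(int lsock, struct sockaddr_storage *ss, socklen_t *sslen)
{
	sigset_t mask;
	int fd;

	/* Block SIGTERM for the duration of the wait, as sigsuspend would. */
	sigemptyset(&mask);
	sigaddset(&mask, SIGTERM);

	*sslen = sizeof(*ss);
	fd = paccept(lsock, (struct sockaddr *)ss, sslen, &mask,
	    SOCK_CLOEXEC | SOCK_NONBLOCK);
	if (fd == -1 && errno == EWOULDBLOCK) {
		/* Listen queue empty and the listening socket is non-blocking. */
		return -1;
	}
	return fd;
}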
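/*
 * Illustrative userland sketch (not part of this file): a minimal
 * SCM_RIGHTS round trip over a socketpair(2), i.e. the user-space
 * counterpart of the control-message path above -- sockargs() with
 * MT_CONTROL on the send side, copyout_msg_control()/free_rights() on the
 * receive side.  Only the standard CMSG_* macros are used; the helper
 * names send_fd()/recv_fd() are hypothetical and error handling is
 * abbreviated.
 */
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
send_fd(int sock, int fd_to_pass)
{
	struct msghdr msg;
	struct iovec iov;
	char dummy = 'x';			/* must send at least one byte */
	union {
		struct cmsghdr hdr;
		char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	iov.iov_base = &dummy;
	iov.iov_len = 1;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}

static int
recv_fd(int sock)
{
	struct msghdr msg;
	struct iovec iov;
	char dummy;
	union {
		struct cmsghdr hdr;
		char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;
	struct cmsghdr *cmsg;
	int fd = -1;

	memset(&msg, 0, sizeof(msg));
	iov.iov_base = &dummy;
	iov.iov_len = 1;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);

	if (recvmsg(sock, &msg, 0) != 1)
		return -1;
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
			break;
		}
	}
	return fd;	/* -1 if no descriptor arrived (e.g. MSG_CTRUNC) */
}
/*
 * Typical use: create the pair with socketpair(AF_LOCAL, SOCK_STREAM, 0, sv),
 * fork, then send_fd(sv[0], fd) in one process and recv_fd(sv[1]) in the
 * other.  If the receiver's control buffer is too small, the kernel sets
 * MSG_CTRUNC and free_rights() above closes the descriptors that did not fit.
 */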
4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 
5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 
/* $NetBSD: pmap.c,v 1.426 2023/10/04 20:28:06 ad Exp $ */ /* * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran, and by Maxime Villard. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2007 Manuel Bouyer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * Copyright 2001 (c) Wasabi Systems, Inc. * All rights reserved. * * Written by Frank van der Linden for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.426 2023/10/04 20:28:06 ad Exp $"); #include "opt_user_ldt.h" #include "opt_lockdebug.h" #include "opt_multiprocessor.h" #include "opt_xen.h" #include "opt_svs.h" #include "opt_kaslr.h" #include "opt_efi.h" #define __MUTEX_PRIVATE /* for assertions */ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/pool.h> #include <sys/kernel.h> #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/intr.h> #include <sys/xcall.h> #include <sys/kcore.h> #include <sys/kmem.h> #include <sys/asan.h> #include <sys/msan.h> #include <sys/entropy.h> #include <uvm/uvm.h> #include <uvm/pmap/pmap_pvt.h> #include <dev/isa/isareg.h> #include <machine/specialreg.h> #include <machine/gdt.h> #include <machine/isa_machdep.h> #include <machine/cpuvar.h> #include <machine/cputypes.h> #include <machine/pmap_private.h> #include <x86/bootspace.h> #include <x86/pat.h> #include <x86/pmap_pv.h> #include <x86/i82489reg.h> #include <x86/i82489var.h> #ifdef XEN #include <xen/include/public/xen.h> #include <xen/hypervisor.h> #include <xen/xenpmap.h> #endif #ifdef __HAVE_DIRECT_MAP #include <crypto/nist_hash_drbg/nist_hash_drbg.h> #endif /* * general info: * * - for an explanation of how the x86 MMU hardware works see * the comments in <machine/pte.h>. * * - for an explanation of the general memory structure used by * this pmap (including the recursive mapping), see the comments * in <machine/pmap.h>. * * this file contains the code for the "pmap module." the module's * job is to manage the hardware's virtual to physical address mappings. * note that there are two levels of mapping in the VM system: * * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's * to map ranges of virtual address space to objects/files. for * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only * to the file /bin/ls starting at offset zero." note that * the upper layer mapping is not concerned with how individual * vm_pages are mapped. * * [2] the lower layer of the VM system (the pmap) maintains the mappings * from virtual addresses. it is concerned with which vm_page is * mapped where. for example, when you run /bin/ls and start * at page 0x1000 the fault routine may lookup the correct page * of the /bin/ls file and then ask the pmap layer to establish * a mapping for it. * * note that information in the lower layer of the VM system can be * thrown away since it can easily be reconstructed from the info * in the upper layer. * * data structures we use include: * * - struct pmap: describes the address space of one thread * - struct pmap_page: describes one pv-tracked page, without * necessarily a corresponding vm_page * - struct pv_entry: describes one <PMAP,VA> mapping of a PA * - pmap_page::pp_pvlist: there is one list per pv-tracked page of * physical memory. the pp_pvlist points to a list of pv_entry * structures which describe all the <PMAP,VA> pairs that this * page is mapped in. this is critical for page based operations * such as pmap_page_protect() [change protection on _all_ mappings * of a page] */ /* * Locking * * We have the following locks that we must deal with, listed in the order * that they are acquired: * * pg->uobject->vmobjlock, pg->uanon->an_lock * * For managed pages, these per-object locks are taken by the VM system * before calling into the pmap module - either a read or write hold. * The lock hold prevent pages from changing identity while the pmap is * operating on them. 
For example, the same lock is held across a call * to pmap_remove() and the following call to pmap_update(), so that a * page does not gain a new identity while its TLB visibility is stale. * * pmap->pm_lock * * This lock protects the fields in the pmap structure including the * non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data * structures. For modifying unmanaged kernel PTEs it is not needed as * kernel PDEs are never freed, and the kernel is expected to be self * consistent (and the lock can't be taken for unmanaged kernel PTEs, * because they can be modified from interrupt context). * * pmaps_lock * * This lock protects the list of active pmaps (headed by "pmaps"). * It's acquired when adding or removing pmaps or adjusting kernel PDEs. * * pp_lock * * This per-page lock protects PV entry lists and the embedded PV entry * in each vm_page, allowing for concurrent operation on pages by * different pmaps. This is a spin mutex at IPL_VM, because at the * points it is taken context switching is usually not tolerable, and * spin mutexes must block out interrupts that could take kernel_lock. */ /* uvm_object is abused here to index pmap_pages; make assertions happy. */ #ifdef DIAGNOSTIC #define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER) #define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock) #else #define PMAP_DUMMY_LOCK(pm) #define PMAP_DUMMY_UNLOCK(pm) #endif static const struct uvm_pagerops pmap_pager = { /* nothing */ }; /* * pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X) */ #define pl_i(va, lvl) \ (((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1]) #define pl_i_roundup(va, lvl) pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl)) /* * PTP macros: * a PTP's index is the PD index of the PDE that points to it * a PTP's offset is the byte-offset in the PTE space that this PTP is at * a PTP's VA is the first VA mapped by that PTP */ #define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE) const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER; const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; const long nkptpmax[] = NKPTPMAX_INITIALIZER; const long nbpd[] = NBPD_INITIALIZER; #ifdef i386 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; #else pd_entry_t *normal_pdes[3]; #endif long nkptp[] = NKPTP_INITIALIZER; struct pmap_head pmaps; kmutex_t pmaps_lock __cacheline_aligned; struct pcpu_area *pcpuarea __read_mostly; static vaddr_t pmap_maxkvaddr; /* * Misc. event counters. */ struct evcnt pmap_iobmp_evcnt; struct evcnt pmap_ldt_evcnt; /* * PAT */ static bool cpu_pat_enabled __read_mostly = false; /* * Global data structures */ static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */ struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; static rb_tree_t pmap_kernel_rb __cacheline_aligned; struct bootspace bootspace __read_mostly; struct slotspace slotspace __read_mostly; /* Set to PTE_NX if supported. */ pd_entry_t pmap_pg_nx __read_mostly = 0; /* Set to PTE_G if supported. */ pd_entry_t pmap_pg_g __read_mostly = 0; /* Set to true if large pages are supported. 
*/ int pmap_largepages __read_mostly = 0; paddr_t lowmem_rsvd __read_mostly; paddr_t avail_start __read_mostly; /* PA of first available physical page */ paddr_t avail_end __read_mostly; /* PA of last available physical page */ #ifdef XENPV paddr_t pmap_pa_start; /* PA of first physical page for this domain */ paddr_t pmap_pa_end; /* PA of last physical page for this domain */ #endif #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) #define PMAP_CHECK_PP(pp) \ KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp) #define PAGE_ALIGNED(pp) \ __builtin_assume_aligned((void *)(pp), PAGE_SIZE) /* * Other data structures */ static pt_entry_t protection_codes[8] __read_mostly; static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ /* * The following two vaddr_t's are used during system startup to keep track of * how much of the kernel's VM space we have used. Once the system is started, * the management of the remaining kernel VM space is turned over to the * kernel_map vm_map. */ static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ #ifndef XENPV /* * LAPIC virtual address, and fake physical address. */ volatile vaddr_t local_apic_va __read_mostly; paddr_t local_apic_pa __read_mostly; #endif /* * pool that pmap structures are allocated from */ struct pool_cache pmap_cache; static int pmap_ctor(void *, void *, int); static void pmap_dtor(void *, void *); /* * pv_page cache */ static struct pool_cache pmap_pvp_cache; #ifdef __HAVE_DIRECT_MAP vaddr_t pmap_direct_base __read_mostly; vaddr_t pmap_direct_end __read_mostly; #endif #ifndef __HAVE_DIRECT_MAP /* * Special VAs and the PTEs that map them */ static pt_entry_t *early_zero_pte; static void pmap_vpage_cpualloc(struct cpu_info *); #ifdef XENPV char *early_zerop; /* also referenced from xen_locore() */ #else static char *early_zerop; #endif #endif int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); /* PDP pool and its callbacks */ static struct pool pmap_pdp_pool; static void pmap_pdp_init(pd_entry_t *); static void pmap_pdp_fini(pd_entry_t *); #ifdef PAE /* need to allocate items of 4 pages */ static void *pmap_pdp_alloc(struct pool *, int); static void pmap_pdp_free(struct pool *, void *); static struct pool_allocator pmap_pdp_allocator = { .pa_alloc = pmap_pdp_alloc, .pa_free = pmap_pdp_free, .pa_pagesz = PAGE_SIZE * PDP_SIZE, }; #endif extern vaddr_t idt_vaddr; extern paddr_t idt_paddr; extern vaddr_t gdt_vaddr; extern paddr_t gdt_paddr; extern vaddr_t ldt_vaddr; extern paddr_t ldt_paddr; #ifdef i386 /* stuff to fix the pentium f00f bug */ extern vaddr_t pentium_idt_vaddr; #endif /* Array of freshly allocated PTPs, for pmap_get_ptp(). */ struct pmap_ptparray { struct vm_page *pg[PTP_LEVELS + 1]; bool alloced[PTP_LEVELS + 1]; }; /* * PV entries are allocated in page-sized chunks and cached per-pmap to * avoid intense pressure on memory allocators. 
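 *
 * Illustrative sketch only (the real work is done by pmap_pvp_ctor() and
 * pmap_alloc_pv() below): each page-sized chunk begins with a struct
 * pv_page header, and the rest of the page is carved into PVE_PER_PVP
 * free pv_entry slots threaded onto pvp_pves, roughly:
 *
 *	struct pv_page *pvp = (struct pv_page *)chunk;
 *	struct pv_entry *pve = (struct pv_entry *)chunk + 1;
 *
 *	LIST_INIT(&pvp->pvp_pves);
 *	pvp->pvp_nfree = PVE_PER_PVP;
 *	for (i = 0; i < PVE_PER_PVP; i++)
 *		LIST_INSERT_HEAD(&pvp->pvp_pves, &pve[i], pve_list);
 *
 * (`chunk' above stands for the page-aligned allocation; one pv_entry-sized
 * slot is sacrificed to hold the header, which is why PVE_PER_PVP subtracts
 * one.)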
*/ struct pv_page { LIST_HEAD(, pv_entry) pvp_pves; LIST_ENTRY(pv_page) pvp_list; long pvp_nfree; struct pmap *pvp_pmap; }; #define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1) /* * PV tree prototypes */ static int pmap_compare_key(void *, const void *, const void *); static int pmap_compare_nodes(void *, const void *, const void *); /* Red-black tree */ static const rb_tree_ops_t pmap_rbtree_ops = { .rbto_compare_nodes = pmap_compare_nodes, .rbto_compare_key = pmap_compare_key, .rbto_node_offset = offsetof(struct pv_entry, pve_rb), .rbto_context = NULL }; /* * Local prototypes */ #ifdef __HAVE_PCPU_AREA static void pmap_init_pcpu(void); #endif #ifdef __HAVE_DIRECT_MAP static void pmap_init_directmap(struct pmap *); #endif #if !defined(XENPV) static void pmap_remap_global(void); #endif #ifndef XENPV static void pmap_init_lapic(void); static void pmap_remap_largepages(void); #endif static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int, struct vm_page **); static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *); static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, pd_entry_t * const *); static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int); static void pmap_freepage(struct pmap *, struct vm_page *, int); static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, pt_entry_t *, pd_entry_t * const *); static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, vaddr_t); static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, vaddr_t); static int pmap_pvp_ctor(void *, void *, int); static void pmap_pvp_dtor(void *, void *); static struct pv_entry *pmap_alloc_pv(struct pmap *); static void pmap_free_pv(struct pmap *, struct pv_entry *); static void pmap_drain_pv(struct pmap *); static void pmap_alloc_level(struct pmap *, vaddr_t, long *); static void pmap_load1(struct lwp *, struct pmap *, struct pmap *); static void pmap_reactivate(struct pmap *); long pmap_resident_count(struct pmap *pmap) { return pmap->pm_stats.resident_count; } long pmap_wired_count(struct pmap *pmap) { return pmap->pm_stats.wired_count; } /* * p m a p h e l p e r f u n c t i o n s */ static inline void pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) { KASSERT(cold || mutex_owned(&pmap->pm_lock)); pmap->pm_stats.resident_count += resid_diff; pmap->pm_stats.wired_count += wired_diff; } static inline void pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) { int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0); int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0); KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED); KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED); pmap_stats_update(pmap, resid_diff, wired_diff); } /* * ptp_to_pmap: lookup pmap by ptp */ static inline struct pmap * ptp_to_pmap(struct vm_page *ptp) { struct pmap *pmap; if (ptp == NULL) { return pmap_kernel(); } pmap = (struct pmap *)ptp->uobject; KASSERT(pmap != NULL); KASSERT(&pmap->pm_obj[0] == ptp->uobject); return pmap; } static inline struct pv_pte * pve_to_pvpte(struct pv_entry *pve) { if (pve == NULL) return NULL; KASSERT((void *)&pve->pve_pte == (void *)pve); return &pve->pve_pte; } static inline struct pv_entry * pvpte_to_pve(struct pv_pte *pvpte) { struct pv_entry *pve = (void *)pvpte; KASSERT(pve_to_pvpte(pve) == pvpte); return pve; } /* * Return true if the pmap page has an embedded PV entry. 
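 * One mapping of a pv-tracked page can be recorded directly in the
 * pmap_page itself (the "embedded" pp_pte), so the common case of a page
 * with a single mapping needs no pv_entry allocation; any further mappings
 * are kept as pv_entry structures on pp_pvlist. The check below simply
 * tests whether the embedded <ptp,va> pair is non-zero.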
*/ static inline bool pv_pte_embedded(struct pmap_page *pp) { KASSERT(mutex_owned(&pp->pp_lock)); return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va); } /* * pv_pte_first, pv_pte_next: PV list iterator. */ static inline struct pv_pte * pv_pte_first(struct pmap_page *pp) { KASSERT(mutex_owned(&pp->pp_lock)); if (pv_pte_embedded(pp)) { return &pp->pp_pte; } return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); } static inline struct pv_pte * pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) { KASSERT(mutex_owned(&pp->pp_lock)); KASSERT(pvpte != NULL); if (pvpte == &pp->pp_pte) { return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); } return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); } static inline uint8_t pmap_pte_to_pp_attrs(pt_entry_t pte) { uint8_t ret = 0; if (pte & PTE_D) ret |= PP_ATTRS_D; if (pte & PTE_A) ret |= PP_ATTRS_A; if (pte & PTE_W) ret |= PP_ATTRS_W; return ret; } static inline pt_entry_t pmap_pp_attrs_to_pte(uint8_t attrs) { pt_entry_t pte = 0; if (attrs & PP_ATTRS_D) pte |= PTE_D; if (attrs & PP_ATTRS_A) pte |= PTE_A; if (attrs & PP_ATTRS_W) pte |= PTE_W; return pte; } /* * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? * of course the kernel is always loaded */ bool pmap_is_curpmap(struct pmap *pmap) { return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap)); } inline void pmap_reference(struct pmap *pmap) { atomic_inc_uint(&pmap->pm_obj[0].uo_refs); } /* * rbtree: compare two nodes. */ static int pmap_compare_nodes(void *context, const void *n1, const void *n2) { const struct pv_entry *pve1 = n1; const struct pv_entry *pve2 = n2; KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp); if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) { return -1; } if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) { return 1; } return 0; } /* * rbtree: compare a node and a key. */ static int pmap_compare_key(void *context, const void *n, const void *k) { const struct pv_entry *pve = n; const vaddr_t key = (vaddr_t)k; if (pve->pve_pte.pte_va < key) { return -1; } if (pve->pve_pte.pte_va > key) { return 1; } return 0; } /* * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE */ static inline void pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va) { vaddr_t *min = (vaddr_t *)&ptp->uanon; if (va < *min) { *min = va; } } /* * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove */ static inline void pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte) { vaddr_t sclip; if (ptp == NULL) { return; } sclip = (vaddr_t)ptp->uanon; sclip = (*startva < sclip ? sclip : *startva); *pte += (sclip - *startva) / PAGE_SIZE; *startva = sclip; } /* * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in * * there are several pmaps involved. some or all of them might be same. * * - the pmap given by the first argument * our caller wants to access this pmap's PTEs. * * - pmap_kernel() * the kernel pmap. note that it only contains the kernel part * of the address space which is shared by any pmap. ie. any * pmap can be used instead of pmap_kernel() for our purpose. * * - ci->ci_pmap * pmap currently loaded on the cpu. * * - vm_map_pmap(&curproc->p_vmspace->vm_map) * current process' pmap. 
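 *
 * A typical calling sequence for a user pmap `pm' looks roughly like the
 * sketch below (illustrative only; the exact contract is spelled out in
 * the `=>' items that follow):
 *
 *	mutex_enter(&pm->pm_lock);
 *	pmap_map_ptes(pm, &pmap2, &ptes, &pdes);
 *	... inspect or modify PTEs through ptes/pdes ...
 *	pmap_unmap_ptes(pm, pmap2);
 *	mutex_exit(&pm->pm_lock);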
* * => caller must lock pmap first (if not the kernel pmap) * => must be undone with pmap_unmap_ptes before returning * => disables kernel preemption */ void pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp, pd_entry_t * const **pdeppp) { struct pmap *curpmap; struct cpu_info *ci; lwp_t *l; kpreempt_disable(); /* The kernel's pmap is always accessible. */ if (pmap == pmap_kernel()) { *pmap2 = NULL; *ptepp = PTE_BASE; *pdeppp = normal_pdes; return; } KASSERT(mutex_owned(&pmap->pm_lock)); l = curlwp; ci = l->l_cpu; curpmap = ci->ci_pmap; if (pmap == curpmap) { /* * Already on the CPU: make it valid. This is very * often the case during exit(), when we have switched * to the kernel pmap in order to destroy a user pmap. */ if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) { pmap_reactivate(pmap); } *pmap2 = NULL; } else { /* * Toss current pmap from CPU and install new pmap, but keep * a reference to the old one. Dropping the reference can * can block as it needs to take locks, so defer that to * pmap_unmap_ptes(). */ pmap_reference(pmap); pmap_load1(l, pmap, curpmap); *pmap2 = curpmap; } KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); #ifdef DIAGNOSTIC pmap->pm_pctr = lwp_pctr(); #endif *ptepp = PTE_BASE; #if defined(XENPV) && defined(__x86_64__) KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; *pdeppp = ci->ci_normal_pdes; #else *pdeppp = normal_pdes; #endif } /* * pmap_unmap_ptes: unlock the PTE mapping of "pmap" * * => we cannot tolerate context switches while mapped in: assert this. * => reenables kernel preemption. * => does not unlock pmap. */ void pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2) { struct cpu_info *ci; struct pmap *mypmap; struct lwp *l; KASSERT(kpreempt_disabled()); /* The kernel's pmap is always accessible. */ if (pmap == pmap_kernel()) { kpreempt_enable(); return; } l = curlwp; ci = l->l_cpu; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(pmap->pm_pctr == lwp_pctr()); #if defined(XENPV) && defined(__x86_64__) KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; #endif /* If not our own pmap, mark whatever's on the CPU now as lazy. */ KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) { ci->ci_want_pmapload = 0; } else { ci->ci_want_pmapload = (mypmap != pmap_kernel()); ci->ci_tlbstate = TLBSTATE_LAZY; } /* Now safe to re-enable preemption. */ kpreempt_enable(); /* Toss reference to other pmap taken earlier. */ if (pmap2 != NULL) { pmap_destroy(pmap2); } } inline static void pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) { #if !defined(__x86_64__) if (curproc == NULL || curproc->p_vmspace == NULL || pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) return; if ((opte ^ npte) & PTE_X) pmap_update_pg(va); /* * Executability was removed on the last executable change. * Reset the code segment to something conservative and * let the trap handler deal with setting the right limit. * We can't do that because of locking constraints on the vm map. */ if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) { struct trapframe *tf = curlwp->l_md.md_regs; tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); pm->pm_hiexec = I386_MAX_EXE_ADDR; } #endif /* !defined(__x86_64__) */ } #if !defined(__x86_64__) /* * Fixup the code segment to cover all potential executable mappings. 
* returns 0 if no changes to the code segment were made. */ int pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) { struct vm_map_entry *ent; struct pmap *pm = vm_map_pmap(map); vaddr_t va = 0; vm_map_lock_read(map); for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { /* * This entry has greater va than the entries before. * We need to make it point to the last page, not past it. */ if (ent->protection & VM_PROT_EXECUTE) va = trunc_page(ent->end) - PAGE_SIZE; } vm_map_unlock_read(map); if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) return 0; pm->pm_hiexec = va; if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); } else { tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); return 0; } return 1; } #endif /* !defined(__x86_64__) */ void pat_init(struct cpu_info *ci) { #ifndef XENPV uint64_t pat; if (!(ci->ci_feat_val[0] & CPUID_PAT)) return; /* We change WT to WC. Leave all other entries the default values. */ pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); wrmsr(MSR_CR_PAT, pat); cpu_pat_enabled = true; #endif } static pt_entry_t pmap_pat_flags(u_int flags) { u_int cacheflags = (flags & PMAP_CACHE_MASK); if (!cpu_pat_enabled) { switch (cacheflags) { case PMAP_NOCACHE: case PMAP_NOCACHE_OVR: /* results in PGC_UCMINUS on cpus which have * the cpuid PAT but PAT "disabled" */ return PTE_PCD; default: return 0; } } switch (cacheflags) { case PMAP_NOCACHE: return PGC_UC; case PMAP_WRITE_COMBINE: return PGC_WC; case PMAP_WRITE_BACK: return PGC_WB; case PMAP_NOCACHE_OVR: return PGC_UCMINUS; } return 0; } /* * p m a p k e n t e r f u n c t i o n s * * functions to quickly enter/remove pages from the kernel address * space. pmap_kremove is exported to MI kernel. we make use of * the recursive PTE mappings. */ /* * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking * * => no need to lock anything, assume va is already allocated * => should be faster than normal pmap enter function */ void pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) { pt_entry_t *pte, opte, npte; KASSERT(!(prot & ~VM_PROT_ALL)); if (va < VM_MIN_KERNEL_ADDRESS) pte = vtopte(va); else pte = kvtopte(va); #if defined(XENPV) && defined(DOM0OPS) if (pa < pmap_pa_start || pa >= pmap_pa_end) { #ifdef DEBUG printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR " outside range\n", __func__, pa, va); #endif /* DEBUG */ npte = pa; } else #endif /* XENPV && DOM0OPS */ npte = pmap_pa2pte(pa); npte |= protection_codes[prot] | PTE_P | pmap_pg_g; npte |= pmap_pat_flags(flags); opte = pmap_pte_testset(pte, npte); /* zap! */ /* * XXX: make sure we are not dealing with a large page, since the only * large pages created are for the kernel image, and they should never * be kentered. */ KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va); if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) { /* This should not happen. */ printf_nolog("%s: mapping already present\n", __func__); kpreempt_disable(); pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); kpreempt_enable(); } } __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); #if defined(__x86_64__) /* * Change protection for a virtual address. Local for a CPU only, don't * care about TLB shootdowns. 
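 * For example (illustrative sketch), a caller that needs to patch an
 * otherwise read-only page on the local CPU could grant and then revoke
 * write access, which sets and later clears PTE_W (and PTE_D) in the PTE
 * and issues a local invlpg for just that VA:
 *
 *	kpreempt_disable();
 *	pmap_changeprot_local(va, VM_PROT_READ | VM_PROT_WRITE);
 *	... patch the page ...
 *	pmap_changeprot_local(va, VM_PROT_READ);
 *	kpreempt_enable();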
* * => must be called with preemption disabled */ void pmap_changeprot_local(vaddr_t va, vm_prot_t prot) { pt_entry_t *pte, opte, npte; KASSERT(kpreempt_disabled()); if (va < VM_MIN_KERNEL_ADDRESS) pte = vtopte(va); else pte = kvtopte(va); npte = opte = *pte; if ((prot & VM_PROT_WRITE) != 0) npte |= PTE_W; else npte &= ~(PTE_W|PTE_D); if (opte != npte) { pmap_pte_set(pte, npte); pmap_pte_flush(); invlpg(va); } } #endif /* defined(__x86_64__) */ /* * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking * * => no need to lock anything * => caller must dispose of any vm_page mapped in the va range * => note: not an inline function * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE * => we assume kernel only unmaps valid addresses and thus don't bother * checking the valid bit before doing TLB flushing * => must be followed by call to pmap_update() before reuse of page */ static void pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) { pt_entry_t *pte, opte; vaddr_t va, eva; eva = sva + len; kpreempt_disable(); for (va = sva; va < eva; va += PAGE_SIZE) { pte = kvtopte(va); opte = pmap_pte_testset(pte, 0); /* zap! */ if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) { pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KREMOVE); } KASSERTMSG((opte & PTE_PS) == 0, "va %#" PRIxVADDR " is a large page", va); KASSERTMSG((opte & PTE_PVLIST) == 0, "va %#" PRIxVADDR " is a pv tracked page", va); } if (localonly) { tlbflushg(); } kpreempt_enable(); } void pmap_kremove(vaddr_t sva, vsize_t len) { pmap_kremove1(sva, len, false); } /* * pmap_kremove_local: like pmap_kremove(), but only worry about * TLB invalidations on the current CPU. this is only intended * for use while writing kernel crash dumps, either after panic * or via reboot -d. */ void pmap_kremove_local(vaddr_t sva, vsize_t len) { pmap_kremove1(sva, len, true); } /* * p m a p i n i t f u n c t i o n s * * pmap_bootstrap and pmap_init are called during system startup * to init the pmap module. pmap_bootstrap() does a low level * init just to get things rolling. pmap_init() finishes the job. */ /* * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. * This function is to be used before any VM system has been set up. * * The va is taken from virtual_avail. */ static vaddr_t pmap_bootstrap_valloc(size_t npages) { vaddr_t va = virtual_avail; virtual_avail += npages * PAGE_SIZE; return va; } /* * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. * This function is to be used before any VM system has been set up. * * The pa is taken from avail_start. */ static paddr_t pmap_bootstrap_palloc(size_t npages) { paddr_t pa = avail_start; avail_start += npages * PAGE_SIZE; return pa; } /* * pmap_bootstrap: get the system in a state where it can run with VM properly * enabled (called before main()). The VM system is fully init'd later. * * => on i386, locore.S has already enabled the MMU by allocating a PDP for the * kernel, and nkpde PTP's for the kernel. * => kva_start is the first free virtual address in kernel space. */ void pmap_bootstrap(vaddr_t kva_start) { struct pmap *kpm; int i; vaddr_t kva; pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0); /* * Set up our local static global vars that keep track of the usage of * KVM before kernel_map is set up. 
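 * Until the VM system is up, pmap_bootstrap_valloc() and
 * pmap_bootstrap_palloc() (defined above) hand out space simply by bumping
 * these counters; e.g. pmap_bootstrap_valloc(1) returns the current
 * virtual_avail and advances it by PAGE_SIZE, and avail_start is consumed
 * the same way for physical pages.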
*/ virtual_avail = kva_start; /* first free KVA */ virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ /* * Set up protection_codes: we need to be able to convert from a MI * protection code (some combo of VM_PROT...) to something we can jam * into a x86 PTE. */ protection_codes[VM_PROT_NONE] = pmap_pg_nx; protection_codes[VM_PROT_EXECUTE] = PTE_X; protection_codes[VM_PROT_READ] = pmap_pg_nx; protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X; protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx; protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X; protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx; protection_codes[VM_PROT_ALL] = PTE_W | PTE_X; /* * Now we init the kernel's pmap. * * The kernel pmap's pm_obj is not used for much. However, in user pmaps * the pm_obj contains the list of active PTPs. */ kpm = pmap_kernel(); mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE); rw_init(&kpm->pm_dummy_lock); for (i = 0; i < PTP_LEVELS - 1; i++) { uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1); uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock); kpm->pm_ptphint[i] = NULL; } memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; for (i = 0; i < PDP_SIZE; i++) kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); kcpuset_create(&kpm->pm_cpus, true); kcpuset_create(&kpm->pm_kernel_cpus, true); kpm->pm_ldt = NULL; kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); /* * the above is just a rough estimate and not critical to the proper * operation of the system. */ #if !defined(XENPV) /* * Begin to enable global TLB entries if they are supported: add PTE_G * attribute to already mapped kernel pages. Do that only if SVS is * disabled. * * The G bit has no effect until the CR4_PGE bit is set in CR4, which * happens later in cpu_init(). */ #ifdef SVS if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) { #else if (cpu_feature[0] & CPUID_PGE) { #endif pmap_pg_g = PTE_G; pmap_remap_global(); } #endif #ifndef XENPV /* * Enable large pages if they are supported. */ if (cpu_feature[0] & CPUID_PSE) { lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ pmap_largepages = 1; /* enable software */ /* * The TLB must be flushed after enabling large pages on Pentium * CPUs, according to section 3.6.2.2 of "Intel Architecture * Software Developer's Manual, Volume 3: System Programming". */ tlbflushg(); /* Remap the kernel. */ pmap_remap_largepages(); } pmap_init_lapic(); #endif /* !XENPV */ #ifdef __HAVE_PCPU_AREA pmap_init_pcpu(); #endif #ifdef __HAVE_DIRECT_MAP pmap_init_directmap(kpm); #else pmap_vpage_cpualloc(&cpu_info_primary); if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; } else { /* amd64 */ /* * zero_pte is stuck at the end of mapped space for the kernel * image (disjunct from kva space). This is done so that it * can safely be used in pmap_growkernel (pmap_get_physpage), * when it's called for the first time. * XXXfvdl fix this for MULTIPROCESSOR later. 
*/ #ifdef XENPV /* early_zerop initialized in xen_locore() */ #else early_zerop = (void *)bootspace.spareva; #endif early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); } #endif #if defined(XENPV) && defined(__x86_64__) extern vaddr_t xen_dummy_page; paddr_t xen_dummy_user_pgd; /* * We want a dummy page directory for Xen: when deactivating a pmap, * Xen will still consider it active. So we set user PGD to this one * to lift all protection on the now inactive page tables set. */ xen_dummy_user_pgd = xen_dummy_page - KERNBASE; /* Zero fill it, the less checks in Xen it requires the better */ memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); /* Mark read-only */ HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx, UVMF_INVLPG); /* Pin as L4 */ xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); #endif /* * Allocate space for the Interrupt Descriptor Table (IDT), * Global Descriptor Table (GDT), and Local Descriptor Table * (LDT). * * Currently there is an initial temporary GDT allocated on the * stack by the caller of init386/init_x86_64, which is (among * other things) needed on i386 for %fs-relative addressing for * CPU-local data (CPUVAR(...), curcpu(), curlwp). This * initial temporary GDT will be popped off the stack before we * can enter main, so we need to make sure there is space for a * second temporary GDT to continue existing when we enter main * before we allocate space for the permanent GDT with * uvm_km(9) in gdt_init via cpu_startup and switch to that. */ idt_vaddr = pmap_bootstrap_valloc(1); idt_paddr = pmap_bootstrap_palloc(1); gdt_vaddr = pmap_bootstrap_valloc(1); gdt_paddr = pmap_bootstrap_palloc(1); #ifdef __HAVE_PCPU_AREA ldt_vaddr = (vaddr_t)&pcpuarea->ldt; #else ldt_vaddr = pmap_bootstrap_valloc(1); #endif ldt_paddr = pmap_bootstrap_palloc(1); #if !defined(__x86_64__) /* pentium f00f bug stuff */ pentium_idt_vaddr = pmap_bootstrap_valloc(1); #endif #if defined(XENPVHVM) /* XXX: move to hypervisor.c with appropriate API adjustments */ extern paddr_t HYPERVISOR_shared_info_pa; extern volatile struct xencons_interface *xencons_interface; /* XXX */ extern struct xenstore_domain_interface *xenstore_interface; /* XXX */ if (vm_guest != VM_GUEST_XENPVH) { HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1); HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1); } xencons_interface = (void *) pmap_bootstrap_valloc(1); xenstore_interface = (void *) pmap_bootstrap_valloc(1); #endif /* * Now we reserve some VM for mapping pages when doing a crash dump. */ virtual_avail = reserve_dumppages(virtual_avail); /* * Init the global lock and global list. */ mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&pmaps); /* * Ensure the TLB is sync'd with reality by flushing it... */ tlbflushg(); /* * Calculate pmap_maxkvaddr from nkptp[]. */ kva = VM_MIN_KERNEL_ADDRESS; for (i = PTP_LEVELS - 1; i >= 1; i--) { kva += nkptp[i] * nbpd[i]; } pmap_maxkvaddr = kva; } #ifndef XENPV static void pmap_init_lapic(void) { /* * On CPUs that have no LAPIC, local_apic_va is never kentered. But our * x86 implementation relies a lot on this address to be valid; so just * allocate a fake physical page that will be kentered into * local_apic_va by machdep. * * If the LAPIC is present, the va will be remapped somewhere else * later in lapic_map. 
*/ local_apic_va = pmap_bootstrap_valloc(1); local_apic_pa = pmap_bootstrap_palloc(1); } #endif #ifdef __x86_64__ static size_t pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) { size_t npages; npages = (roundup(endva, pgsz) / pgsz) - (rounddown(startva, pgsz) / pgsz); return npages; } #endif #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN) static inline void slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src) { size_t sslot = slotspace.area[type].sslot; size_t nslot = slotspace.area[type].nslot; memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t)); } #endif #ifdef __x86_64__ /* * Randomize the location of an area. We count the holes in the VM space. We * randomly select one hole, and then randomly select an area within that hole. * Finally we update the associated entry in the slotspace structure. */ vaddr_t slotspace_rand(int type, size_t sz, size_t align, size_t randhole, vaddr_t randva) { struct { int start; int end; } holes[SLSPACE_NAREAS+1]; size_t i, nholes, hole; size_t startsl, endsl, nslots, winsize; vaddr_t startva, va; sz = roundup(sz, align); /* * Take one more slot with +NBPD_L4, because we may end up choosing * an area that crosses slots: * +------+------+------+ * | Slot | Slot | Slot | * +------+------+------+ * [Chosen Area] * And in that case we must take into account the additional slot * consumed. */ nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4; /* Get the holes. */ nholes = 0; size_t curslot = 0 + 256; /* end of SLAREA_USER */ while (1) { /* * Find the first occupied slot after the current one. * The area between the two is a hole. */ size_t minsslot = 512; size_t minnslot = 0; for (i = 0; i < SLSPACE_NAREAS; i++) { if (!slotspace.area[i].active) continue; if (slotspace.area[i].sslot >= curslot && slotspace.area[i].sslot < minsslot) { minsslot = slotspace.area[i].sslot; minnslot = slotspace.area[i].nslot; } } /* No hole anymore, stop here. */ if (minsslot == 512) { break; } /* Register the hole. */ if (minsslot - curslot >= nslots) { holes[nholes].start = curslot; holes[nholes].end = minsslot; nholes++; } /* Skip that hole, and iterate again. */ curslot = minsslot + minnslot; } if (nholes == 0) { panic("%s: impossible", __func__); } /* Select a hole. */ hole = randhole; #ifdef NO_X86_ASLR hole = 0; #endif hole %= nholes; startsl = holes[hole].start; endsl = holes[hole].end; startva = VA_SIGN_NEG(startsl * NBPD_L4); /* Select an area within the hole. */ va = randva; #ifdef NO_X86_ASLR va = 0; #endif winsize = ((endsl - startsl) * NBPD_L4) - sz; va %= winsize; va = rounddown(va, align); va += startva; /* Update the entry. */ slotspace.area[type].sslot = pl4_i(va); slotspace.area[type].nslot = pmap_pagetree_nentries_range(va, va+sz, NBPD_L4); slotspace.area[type].active = true; return va; } #endif #ifdef __HAVE_PCPU_AREA static void pmap_init_pcpu(void) { const vaddr_t startva = PMAP_PCPU_BASE; size_t nL4e, nL3e, nL2e, nL1e; size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; paddr_t pa; vaddr_t endva; vaddr_t tmpva; pt_entry_t *pte; size_t size; int i; const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; size = sizeof(struct pcpu_area); endva = startva + size; /* We will use this temporary va. 
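 * The spare VA acts as a scratch mapping window: in each loop below a
 * freshly allocated physical page is temporarily mapped at tmpva through
 * the recursive PTE slot, zeroed via that mapping, and then linked into
 * the page tree; the scratch PTE is cleared again once the tree has been
 * built.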
*/ tmpva = bootspace.spareva; pte = PTE_BASE + pl1_i(tmpva); /* Build L4 */ L4e_idx = pl4_i(startva); nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); KASSERT(nL4e == 1); for (i = 0; i < nL4e; i++) { KASSERT(L4_BASE[L4e_idx+i] == 0); pa = pmap_bootstrap_palloc(1); *pte = (pa & PTE_FRAME) | pteflags; pmap_update_pg(tmpva); memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; } /* Build L3 */ L3e_idx = pl3_i(startva); nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); for (i = 0; i < nL3e; i++) { KASSERT(L3_BASE[L3e_idx+i] == 0); pa = pmap_bootstrap_palloc(1); *pte = (pa & PTE_FRAME) | pteflags; pmap_update_pg(tmpva); memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; } /* Build L2 */ L2e_idx = pl2_i(startva); nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); for (i = 0; i < nL2e; i++) { KASSERT(L2_BASE[L2e_idx+i] == 0); pa = pmap_bootstrap_palloc(1); *pte = (pa & PTE_FRAME) | pteflags; pmap_update_pg(tmpva); memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A; } /* Build L1 */ L1e_idx = pl1_i(startva); nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); for (i = 0; i < nL1e; i++) { /* * Nothing to do, the PTEs will be entered via * pmap_kenter_pa. */ KASSERT(L1_BASE[L1e_idx+i] == 0); } *pte = 0; pmap_update_pg(tmpva); pcpuarea = (struct pcpu_area *)startva; tlbflush(); } #endif #ifdef __HAVE_DIRECT_MAP static void randomize_hole(size_t *randholep, vaddr_t *randvap) { struct nist_hash_drbg drbg; uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES]; const char p[] = "x86/directmap"; int error; entropy_extract(seed, sizeof(seed), 0); error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed), /*nonce*/NULL, 0, /*personalization*/p, strlen(p)); KASSERTMSG(error == 0, "error=%d", error); error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep), /*additional*/NULL, 0); KASSERTMSG(error == 0, "error=%d", error); error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap), /*additional*/NULL, 0); KASSERTMSG(error == 0, "error=%d", error); explicit_memset(seed, 0, sizeof(seed)); explicit_memset(&drbg, 0, sizeof(drbg)); } /* * Create the amd64 direct map. Called only once at boot time. We map all of * the physical memory contiguously using 2MB large pages, with RW permissions. * However there is a hole: the kernel is mapped with RO permissions. */ static void pmap_init_directmap(struct pmap *kpm) { extern phys_ram_seg_t mem_clusters[]; extern int mem_cluster_cnt; vaddr_t startva; size_t nL4e, nL3e, nL2e; size_t L4e_idx, L3e_idx, L2e_idx; size_t spahole, epahole; paddr_t lastpa, pa; vaddr_t endva; vaddr_t tmpva; pt_entry_t *pte; phys_ram_seg_t *mc; int i; size_t randhole; vaddr_t randva; const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; const pd_entry_t holepteflags = PTE_P | pmap_pg_nx; CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); spahole = roundup(bootspace.head.pa, NBPD_L2); epahole = rounddown(bootspace.boot.pa, NBPD_L2); /* Get the last physical address available */ lastpa = 0; for (i = 0; i < mem_cluster_cnt; i++) { mc = &mem_clusters[i]; lastpa = MAX(lastpa, mc->start + mc->size); } /* * x86_add_cluster should have truncated the memory to MAXPHYSMEM. */ if (lastpa > MAXPHYSMEM) { panic("pmap_init_directmap: lastpa incorrect"); } randomize_hole(&randhole, &randva); startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2, randhole, randva); endva = startva + lastpa; /* We will use this temporary va. 
*/ tmpva = bootspace.spareva; pte = PTE_BASE + pl1_i(tmpva); /* Build L4 */ L4e_idx = pl4_i(startva); nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); KASSERT(nL4e <= NL4_SLOT_DIRECT); for (i = 0; i < nL4e; i++) { KASSERT(L4_BASE[L4e_idx+i] == 0); pa = pmap_bootstrap_palloc(1); *pte = (pa & PTE_FRAME) | pteflags; pmap_update_pg(tmpva); memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; } /* Build L3 */ L3e_idx = pl3_i(startva); nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); for (i = 0; i < nL3e; i++) { KASSERT(L3_BASE[L3e_idx+i] == 0); pa = pmap_bootstrap_palloc(1); *pte = (pa & PTE_FRAME) | pteflags; pmap_update_pg(tmpva); memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; } /* Build L2 */ L2e_idx = pl2_i(startva); nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); for (i = 0; i < nL2e; i++) { KASSERT(L2_BASE[L2e_idx+i] == 0); pa = (paddr_t)(i * NBPD_L2); if (spahole <= pa && pa < epahole) { L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A | PTE_PS | pmap_pg_g; } else { L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A | PTE_PS | pmap_pg_g; } } *pte = 0; pmap_update_pg(tmpva); pmap_direct_base = startva; pmap_direct_end = endva; tlbflush(); } #endif /* __HAVE_DIRECT_MAP */ #if !defined(XENPV) /* * Remap all of the virtual pages created so far with the PTE_G bit. */ static void pmap_remap_global(void) { vaddr_t kva, kva_end; unsigned long p1i; size_t i; /* head */ kva = bootspace.head.va; kva_end = kva + bootspace.head.sz; for ( ; kva < kva_end; kva += PAGE_SIZE) { p1i = pl1_i(kva); if (pmap_valid_entry(PTE_BASE[p1i])) PTE_BASE[p1i] |= pmap_pg_g; } /* kernel segments */ for (i = 0; i < BTSPACE_NSEGS; i++) { if (bootspace.segs[i].type == BTSEG_NONE) { continue; } kva = bootspace.segs[i].va; kva_end = kva + bootspace.segs[i].sz; for ( ; kva < kva_end; kva += PAGE_SIZE) { p1i = pl1_i(kva); if (pmap_valid_entry(PTE_BASE[p1i])) PTE_BASE[p1i] |= pmap_pg_g; } } /* boot space */ kva = bootspace.boot.va; kva_end = kva + bootspace.boot.sz; for ( ; kva < kva_end; kva += PAGE_SIZE) { p1i = pl1_i(kva); if (pmap_valid_entry(PTE_BASE[p1i])) PTE_BASE[p1i] |= pmap_pg_g; } } #endif #ifndef XENPV /* * Remap several kernel segments with large pages. We cover as many pages as we * can. Called only once at boot time, if the CPU supports large pages. */ static void pmap_remap_largepages(void) { pd_entry_t *pde; vaddr_t kva, kva_end; paddr_t pa; size_t i; /* Remap the kernel text using large pages. */ for (i = 0; i < BTSPACE_NSEGS; i++) { if (bootspace.segs[i].type != BTSEG_TEXT) { continue; } kva = roundup(bootspace.segs[i].va, NBPD_L2); if (kva < bootspace.segs[i].va) { continue; } kva_end = rounddown(bootspace.segs[i].va + bootspace.segs[i].sz, NBPD_L2); pa = roundup(bootspace.segs[i].pa, NBPD_L2); for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { pde = &L2_BASE[pl2_i(kva)]; *pde = pa | pmap_pg_g | PTE_PS | PTE_P; tlbflushg(); } } /* Remap the kernel rodata using large pages. */ for (i = 0; i < BTSPACE_NSEGS; i++) { if (bootspace.segs[i].type != BTSEG_RODATA) { continue; } kva = roundup(bootspace.segs[i].va, NBPD_L2); if (kva < bootspace.segs[i].va) { continue; } kva_end = rounddown(bootspace.segs[i].va + bootspace.segs[i].sz, NBPD_L2); pa = roundup(bootspace.segs[i].pa, NBPD_L2); for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { pde = &L2_BASE[pl2_i(kva)]; *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P; tlbflushg(); } } /* Remap the kernel data+bss using large pages. 
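 * Unlike the text (executable, read-only) and rodata (no-execute, read-only)
 * segments handled above, data+bss is mapped writable and no-execute.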
*/ for (i = 0; i < BTSPACE_NSEGS; i++) { if (bootspace.segs[i].type != BTSEG_DATA) { continue; } kva = roundup(bootspace.segs[i].va, NBPD_L2); if (kva < bootspace.segs[i].va) { continue; } kva_end = rounddown(bootspace.segs[i].va + bootspace.segs[i].sz, NBPD_L2); pa = roundup(bootspace.segs[i].pa, NBPD_L2); for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { pde = &L2_BASE[pl2_i(kva)]; *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P; tlbflushg(); } } } #endif /* !XENPV */ /* * pmap_init: called from uvm_init, our job is to get the pmap system ready * to manage mappings. */ void pmap_init(void) { int flags; /* * initialize caches. */ pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT, 0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL); #ifdef XENPV /* * pool_cache(9) should not touch cached objects, since they * are pinned on xen and R/O for the domU */ flags = PR_NOTOUCH; #else flags = 0; #endif #ifdef PAE pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags, "pdppl", &pmap_pdp_allocator, IPL_NONE); #else pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags, "pdppl", NULL, IPL_NONE); #endif pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE, 0, 0, "pvpage", &pool_allocator_kmem, IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL); pmap_tlb_init(); /* XXX: Since cpu_hatch() is only for secondary CPUs. */ pmap_tlb_cpu_init(curcpu()); evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, NULL, "x86", "io bitmap copy"); evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, NULL, "x86", "ldt sync"); /* * The kernel doesn't keep track of PTPs, so there's nowhere handy * to hang a tree of pv_entry records. Dynamically allocated * pv_entry lists are not heavily used in the kernel's pmap (the * usual case is embedded), so cop out and use a single RB tree * to cover them. */ rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops); /* * done: pmap module is up (and ready for business) */ pmap_initialized = true; } #ifndef XENPV /* * pmap_cpu_init_late: perform late per-CPU initialization. */ void pmap_cpu_init_late(struct cpu_info *ci) { /* * The BP has already its own PD page allocated during early * MD startup. */ if (ci == &cpu_info_primary) return; #ifdef PAE cpu_alloc_l3_page(ci); #endif } #endif #ifndef __HAVE_DIRECT_MAP CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t)); CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0); static void pmap_vpage_cpualloc(struct cpu_info *ci) { bool primary = (ci == &cpu_info_primary); size_t i, npages; vaddr_t vabase; vsize_t vrange; npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t)); KASSERT(npages >= VPAGE_MAX); vrange = npages * PAGE_SIZE; if (primary) { while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) { /* Waste some pages to align properly */ } /* The base is aligned, allocate the rest (contiguous) */ pmap_bootstrap_valloc(npages - 1); } else { vabase = uvm_km_alloc(kernel_map, vrange, vrange, UVM_KMF_VAONLY); if (vabase == 0) { panic("%s: failed to allocate tmp VA for CPU %d\n", __func__, cpu_index(ci)); } } KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0); for (i = 0; i < VPAGE_MAX; i++) { ci->vpage[i] = vabase + i * PAGE_SIZE; ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]); } } void pmap_vpage_cpu_init(struct cpu_info *ci) { if (ci == &cpu_info_primary) { /* cpu0 already taken care of in pmap_bootstrap */ return; } pmap_vpage_cpualloc(ci); } #endif /* * p v _ e n t r y f u n c t i o n s */ /* * pmap_pvp_dtor: pool_cache constructor for PV pages. 
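 * The page is laid out as a struct pv_page header occupying the first
 * pv_entry-sized slot, followed by PVE_PER_PVP free pv_entry structures
 * threaded onto pvp_pves.  (The matching destructor, pmap_pvp_dtor, follows
 * below.)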
*/ static int pmap_pvp_ctor(void *arg, void *obj, int flags) { struct pv_page *pvp = (struct pv_page *)obj; struct pv_entry *pve = (struct pv_entry *)obj + 1; struct pv_entry *maxpve = pve + PVE_PER_PVP; KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry)); KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj); LIST_INIT(&pvp->pvp_pves); pvp->pvp_nfree = PVE_PER_PVP; pvp->pvp_pmap = NULL; for (; pve < maxpve; pve++) { LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); } return 0; } /* * pmap_pvp_dtor: pool_cache destructor for PV pages. */ static void pmap_pvp_dtor(void *arg, void *obj) { struct pv_page *pvp __diagused = obj; KASSERT(pvp->pvp_pmap == NULL); KASSERT(pvp->pvp_nfree == PVE_PER_PVP); } /* * pmap_alloc_pv: allocate a PV entry (likely cached with pmap). */ static struct pv_entry * pmap_alloc_pv(struct pmap *pmap) { struct pv_entry *pve; struct pv_page *pvp; KASSERT(mutex_owned(&pmap->pm_lock)); if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) { if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { LIST_REMOVE(pvp, pvp_list); } else { pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT); } if (__predict_false(pvp == NULL)) { return NULL; } /* full -> part */ LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); pvp->pvp_pmap = pmap; } KASSERT(pvp->pvp_pmap == pmap); KASSERT(pvp->pvp_nfree > 0); pve = LIST_FIRST(&pvp->pvp_pves); LIST_REMOVE(pve, pve_list); pvp->pvp_nfree--; if (__predict_false(pvp->pvp_nfree == 0)) { /* part -> empty */ KASSERT(LIST_EMPTY(&pvp->pvp_pves)); LIST_REMOVE(pvp, pvp_list); LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list); } else { KASSERT(!LIST_EMPTY(&pvp->pvp_pves)); } return pve; } /* * pmap_free_pv: delayed free of a PV entry. */ static void pmap_free_pv(struct pmap *pmap, struct pv_entry *pve) { struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(pvp->pvp_pmap == pmap); KASSERT(pvp->pvp_nfree >= 0); LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); pvp->pvp_nfree++; if (__predict_false(pvp->pvp_nfree == 1)) { /* empty -> part */ LIST_REMOVE(pvp, pvp_list); LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) { /* part -> full */ LIST_REMOVE(pvp, pvp_list); LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list); } } /* * pmap_drain_pv: free full PV pages. 
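 * A PV page ends up on pm_pvp_full once every pv_entry carved from it has
 * been freed again, so the whole page can be handed back to pmap_pvp_cache.
 * Called with the pmap locked, typically after a batch of removals.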
*/ static void pmap_drain_pv(struct pmap *pmap) { struct pv_page *pvp; KASSERT(mutex_owned(&pmap->pm_lock)); while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { LIST_REMOVE(pvp, pvp_list); KASSERT(pvp->pvp_pmap == pmap); KASSERT(pvp->pvp_nfree == PVE_PER_PVP); pvp->pvp_pmap = NULL; pool_cache_put(&pmap_pvp_cache, pvp); } } /* * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page */ static void pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp, vaddr_t va, bool tracked) { #ifdef DEBUG struct pv_pte *pvpte; PMAP_CHECK_PP(pp); mutex_spin_enter(&pp->pp_lock); for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) { break; } } mutex_spin_exit(&pp->pp_lock); if (pvpte && !tracked) { panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp); } else if (!pvpte && tracked) { panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp); } #endif } /* * pmap_treelookup_pv: search the PV tree for a dynamic entry * * => pmap must be locked */ static struct pv_entry * pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp, const rb_tree_t *tree, const vaddr_t va) { struct pv_entry *pve; rb_node_t *node; /* * Inlined lookup tailored for exactly what's needed here that is * quite a bit faster than using rb_tree_find_node(). */ for (node = tree->rbt_root;;) { if (__predict_false(RB_SENTINEL_P(node))) { return NULL; } pve = (struct pv_entry *) ((uintptr_t)node - offsetof(struct pv_entry, pve_rb)); if (pve->pve_pte.pte_va == va) { KASSERT(pve->pve_pte.pte_ptp == ptp); return pve; } node = node->rb_nodes[pve->pve_pte.pte_va < va]; } } /* * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap * * => a PV entry must be known present (doesn't check for existence) * => pmap must be locked */ static struct pv_entry * pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp, const struct pmap_page * const old_pp, const vaddr_t va) { struct pv_entry *pve; const rb_tree_t *tree; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(ptp != NULL || pmap == pmap_kernel()); /* * [This mostly deals with the case of process-private pages, i.e. * anonymous memory allocations or COW.] * * If the page is tracked with an embedded entry then the tree * lookup can be avoided. It's safe to check for this specific * set of values without pp_lock because both will only ever be * set together for this pmap. * */ if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp && atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) { return NULL; } /* * [This mostly deals with shared mappings, for example shared libs * and executables.] * * Optimise for pmap_remove_ptes() which works by ascending scan: * look at the lowest numbered node in the tree first. The tree is * known non-empty because of the check above. For short lived * processes where pmap_remove() isn't used much this gets close to * a 100% hit rate. */ tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); KASSERT(!RB_SENTINEL_P(tree->rbt_root)); pve = (struct pv_entry *) ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] - offsetof(struct pv_entry, pve_rb)); if (__predict_true(pve->pve_pte.pte_va == va)) { KASSERT(pve->pve_pte.pte_ptp == ptp); return pve; } /* Search the RB tree for the key (uncommon). 
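 * (The mapping is neither the page's embedded one nor the lowest entry in
 * the tree, so do the full descent.)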
*/ return pmap_treelookup_pv(pmap, ptp, tree, va); } /* * pmap_enter_pv: enter a mapping onto a pmap_page lst * * => pmap must be locked * => does NOT insert dynamic entries to tree (pmap_enter() does later) */ static int pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve, bool *samepage, bool *new_embedded, rb_tree_t *tree) { struct pv_entry *pve; int error; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(ptp_to_pmap(ptp) == pmap); KASSERT(ptp == NULL || ptp->uobject != NULL); KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); PMAP_CHECK_PP(pp); /* * If entering the same page and it's already tracked with an * embedded entry, we can avoid the expense below. It's safe * to check for this very specific set of values without a lock * because both will only ever be set together for this pmap. */ if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp && atomic_load_relaxed(&pp->pp_pte.pte_va) == va) { *samepage = true; pmap_check_pv(pmap, ptp, pp, va, true); return 0; } /* * Check for an existing dynamic mapping at this address. If it's * for the same page, then it will be reused and nothing needs to be * changed. */ *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); if (*old_pve != NULL && (*old_pve)->pve_pp == pp) { *samepage = true; pmap_check_pv(pmap, ptp, pp, va, true); return 0; } /* * Need to put a new mapping in place. Grab a spare pv_entry in * case it's needed; won't know for sure until the lock is taken. */ if (pmap->pm_pve == NULL) { pmap->pm_pve = pmap_alloc_pv(pmap); } error = 0; pmap_check_pv(pmap, ptp, pp, va, false); mutex_spin_enter(&pp->pp_lock); if (!pv_pte_embedded(pp)) { /* * Embedded PV tracking available - easy. */ pp->pp_pte.pte_ptp = ptp; pp->pp_pte.pte_va = va; *new_embedded = true; } else if (__predict_false(pmap->pm_pve == NULL)) { /* * No memory. */ error = ENOMEM; } else { /* * Install new pv_entry on the page. */ pve = pmap->pm_pve; pmap->pm_pve = NULL; *new_pve = pve; pve->pve_pte.pte_ptp = ptp; pve->pve_pte.pte_va = va; pve->pve_pp = pp; LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list); } mutex_spin_exit(&pp->pp_lock); if (error == 0) { pmap_check_pv(pmap, ptp, pp, va, true); } return error; } /* * pmap_remove_pv: try to remove a mapping from a pv_list * * => pmap must be locked * => removes dynamic entries from tree and frees them * => caller should adjust ptp's wire_count and free PTP if needed */ static void pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, vaddr_t va, struct pv_entry *pve, uint8_t oattrs) { rb_tree_t *tree = (ptp != NULL ? 
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(ptp_to_pmap(ptp) == pmap); KASSERT(ptp == NULL || ptp->uobject != NULL); KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); KASSERT(ptp != NULL || pmap == pmap_kernel()); pmap_check_pv(pmap, ptp, pp, va, true); if (pve == NULL) { mutex_spin_enter(&pp->pp_lock); KASSERT(pp->pp_pte.pte_ptp == ptp); KASSERT(pp->pp_pte.pte_va == va); pp->pp_attrs |= oattrs; pp->pp_pte.pte_ptp = NULL; pp->pp_pte.pte_va = 0; mutex_spin_exit(&pp->pp_lock); } else { mutex_spin_enter(&pp->pp_lock); KASSERT(pp->pp_pte.pte_ptp != ptp || pp->pp_pte.pte_va != va); KASSERT(pve->pve_pte.pte_ptp == ptp); KASSERT(pve->pve_pte.pte_va == va); KASSERT(pve->pve_pp == pp); pp->pp_attrs |= oattrs; LIST_REMOVE(pve, pve_list); mutex_spin_exit(&pp->pp_lock); KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve); rb_tree_remove_node(tree, pve); #ifdef DIAGNOSTIC memset(pve, 0, sizeof(*pve)); #endif pmap_free_pv(pmap, pve); } KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); pmap_check_pv(pmap, ptp, pp, va, false); } /* * p t p f u n c t i o n s */ static struct vm_page * pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level) { int lidx = level - 1; off_t off = ptp_va2o(va, level); struct vm_page *pg; KASSERT(mutex_owned(&pmap->pm_lock)); if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) { KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0); pg = pmap->pm_ptphint[lidx]; PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); return pg; } PMAP_DUMMY_LOCK(pmap); pg = uvm_pagelookup(&pmap->pm_obj[lidx], off); PMAP_DUMMY_UNLOCK(pmap); if (pg != NULL && __predict_false(pg->wire_count == 0)) { /* This page is queued to be freed - ignore. */ pg = NULL; } if (pg != NULL) { PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); } pmap->pm_ptphint[lidx] = pg; return pg; } static inline void pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) { int lidx; KASSERT(ptp->wire_count <= 1); PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); lidx = level - 1; pmap_stats_update(pmap, -ptp->wire_count, 0); if (pmap->pm_ptphint[lidx] == ptp) pmap->pm_ptphint[lidx] = NULL; ptp->wire_count = 0; ptp->uanon = NULL; KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); /* * Enqueue the PTP to be freed by pmap_update(). We can't remove * the page from the uvm_object, as that can take further locks * (intolerable right now because the PTEs are likely mapped in). * Instead mark the PTP as free and if we bump into it again, we'll * either ignore or reuse (depending on what's useful at the time). */ LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link); } static void pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, pt_entry_t *ptes, pd_entry_t * const *pdes) { unsigned long index; int level; vaddr_t invaladdr; pd_entry_t opde; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); level = 1; do { index = pl_i(va, level + 1); opde = pmap_pte_testset(&pdes[level - 1][index], 0); /* * On Xen-amd64 or SVS, we need to sync the top level page * directory on each CPU. */ #if defined(XENPV) && defined(__x86_64__) if (level == PTP_LEVELS - 1) { xen_kpm_sync(pmap, index); } #elif defined(SVS) if (svs_enabled && level == PTP_LEVELS - 1 && pmap_is_user(pmap)) { svs_pmap_sync(pmap, index); } #endif invaladdr = level == 1 ? 
(vaddr_t)ptes : (vaddr_t)pdes[level - 2]; pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, opde, TLBSHOOT_FREE_PTP); #if defined(XENPV) pmap_tlb_shootnow(); #endif pmap_freepage(pmap, ptp, level); if (level < PTP_LEVELS - 1) { ptp = pmap_find_ptp(pmap, va, level + 1); ptp->wire_count--; if (ptp->wire_count > 1) break; } } while (++level < PTP_LEVELS); pmap_pte_flush(); } /* * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) * * => pmap should NOT be pmap_kernel() * => pmap should be locked * => we are not touching any PTEs yet, so they need not be mapped in */ static int pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, int flags, struct vm_page **resultp) { struct vm_page *ptp; int i, aflags; struct uvm_object *obj; voff_t off; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); /* * Loop through all page table levels allocating a page * for any level where we don't already have one. */ memset(pt, 0, sizeof(*pt)); aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) | UVM_PGA_ZERO; for (i = PTP_LEVELS; i > 1; i--) { obj = &pmap->pm_obj[i - 2]; off = ptp_va2o(va, i - 1); PMAP_DUMMY_LOCK(pmap); pt->pg[i] = uvm_pagelookup(obj, off); if (pt->pg[i] == NULL) { pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags); pt->alloced[i] = (pt->pg[i] != NULL); } else if (pt->pg[i]->wire_count == 0) { /* This page was queued to be freed; dequeue it. */ LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link); pt->alloced[i] = true; } PMAP_DUMMY_UNLOCK(pmap); if (pt->pg[i] == NULL) { pmap_unget_ptp(pmap, pt); return ENOMEM; } else if (pt->alloced[i]) { pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L; rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, &pmap_rbtree_ops); PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); } } ptp = pt->pg[2]; KASSERT(ptp != NULL); *resultp = ptp; pmap->pm_ptphint[0] = ptp; return 0; } /* * pmap_install_ptp: install any freshly allocated PTPs * * => pmap should NOT be pmap_kernel() * => pmap should be locked * => PTEs must be mapped * => preemption must be disabled */ static void pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, pd_entry_t * const *pdes) { struct vm_page *ptp; unsigned long index; pd_entry_t *pva; paddr_t pa; int i; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); /* * Now that we have all the pages looked up or allocated, * loop through again installing any new ones into the tree. */ for (i = PTP_LEVELS; i > 1; i--) { index = pl_i(va, i); pva = pdes[i - 2]; if (pmap_valid_entry(pva[index])) { KASSERT(!pt->alloced[i]); continue; } ptp = pt->pg[i]; ptp->flags &= ~PG_BUSY; /* never busy */ ptp->wire_count = 1; pmap->pm_ptphint[i - 2] = ptp; pa = VM_PAGE_TO_PHYS(ptp); pmap_pte_set(&pva[index], (pd_entry_t) (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P)); /* * On Xen-amd64 or SVS, we need to sync the top level page * directory on each CPU. */ #if defined(XENPV) && defined(__x86_64__) if (i == PTP_LEVELS) { xen_kpm_sync(pmap, index); } #elif defined(SVS) if (svs_enabled && i == PTP_LEVELS && pmap_is_user(pmap)) { svs_pmap_sync(pmap, index); } #endif pmap_pte_flush(); pmap_stats_update(pmap, 1, 0); /* * If we're not in the top level, increase the * wire count of the parent page. 
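 * A PTP's wire_count is 1 (for the page itself) plus the number of valid
 * entries it currently holds, so installing this level adds one to its
 * parent's count.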
*/ if (i < PTP_LEVELS) { pt->pg[i + 1]->wire_count++; } } } /* * pmap_unget_ptp: free unusued PTPs * * => pmap should NOT be pmap_kernel() * => pmap should be locked */ static void pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt) { int i; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); for (i = PTP_LEVELS; i > 1; i--) { if (!pt->alloced[i]) { continue; } KASSERT(pt->pg[i]->wire_count == 0); PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); pmap_freepage(pmap, pt->pg[i], i - 1); } } /* * p m a p l i f e c y c l e f u n c t i o n s */ /* * pmap_pdp_init: constructor a new PDP. */ static void pmap_pdp_init(pd_entry_t *pdir) { paddr_t pdirpa = 0; vaddr_t object; int i; #if !defined(XENPV) || !defined(__x86_64__) int npde; #endif #ifdef XENPV int s; #endif memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE); /* * NOTE: This is all done unlocked, but we will check afterwards * if we have raced with pmap_growkernel(). */ #if defined(XENPV) && defined(__x86_64__) /* Fetch the physical address of the page directory */ (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); /* * This pdir will NEVER be active in kernel mode, so mark * recursive entry invalid. */ pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); /* * PDP constructed this way won't be for the kernel, hence we * don't put kernel mappings on Xen. * * But we need to make pmap_create() happy, so put a dummy * (without PTE_P) value at the right place. */ pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = (pd_entry_t)-1 & PTE_FRAME; #else /* XENPV && __x86_64__*/ object = (vaddr_t)pdir; for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { /* Fetch the physical address of the page directory */ (void)pmap_extract(pmap_kernel(), object, &pdirpa); /* Put in recursive PDE to map the PTEs */ pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P | pmap_pg_nx; #ifndef XENPV pdir[PDIR_SLOT_PTE + i] |= PTE_W; #endif } /* Copy the kernel's top level PDE */ npde = nkptp[PTP_LEVELS - 1]; memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], npde * sizeof(pd_entry_t)); if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { int idx = pl_i(KERNBASE, PTP_LEVELS); pdir[idx] = PDP_BASE[idx]; } #ifdef __HAVE_PCPU_AREA pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; #endif #ifdef __HAVE_DIRECT_MAP slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE); #endif #ifdef KASAN slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE); #endif #ifdef KMSAN slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE); #endif #endif /* XENPV && __x86_64__*/ #ifdef XENPV s = splvm(); object = (vaddr_t)pdir; pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), VM_PROT_READ); pmap_update(pmap_kernel()); for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { /* * pin as L2/L4 page, we have to do the page with the * PDIR_SLOT_PTE entries last */ #ifdef PAE if (i == l2tol3(PDIR_SLOT_PTE)) continue; #endif (void) pmap_extract(pmap_kernel(), object, &pdirpa); #ifdef __x86_64__ xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); #else xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); #endif } #ifdef PAE object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); (void)pmap_extract(pmap_kernel(), object, &pdirpa); xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); #endif splx(s); #endif /* XENPV */ } /* * pmap_pdp_fini: destructor for the PDPs. 
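 * Only Xen PV requires real work here: pmap_pdp_init() pinned the directory
 * pages and made them read-only, so unpin them and restore write access.
 * On native hardware this is a no-op.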
*/ static void pmap_pdp_fini(pd_entry_t *pdir) { #ifdef XENPV paddr_t pdirpa = 0; /* XXX: GCC */ vaddr_t object = (vaddr_t)pdir; int i; int s = splvm(); pt_entry_t *pte; for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { /* fetch the physical address of the page directory. */ (void) pmap_extract(pmap_kernel(), object, &pdirpa); /* unpin page table */ xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); } object = (vaddr_t)pdir; for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { /* Set page RW again */ pte = kvtopte(object); pmap_pte_set(pte, *pte | PTE_W); xen_bcast_invlpg((vaddr_t)object); } splx(s); #endif /* XENPV */ } #ifdef PAE static void * pmap_pdp_alloc(struct pool *pp, int flags) { return (void *)uvm_km_alloc(kernel_map, PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) | UVM_KMF_WIRED); } static void pmap_pdp_free(struct pool *pp, void *v) { uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, UVM_KMF_WIRED); } #endif /* PAE */ /* * pmap_ctor: constructor for the pmap cache. */ static int pmap_ctor(void *arg, void *obj, int flags) { struct pmap *pmap = obj; pt_entry_t p; int i; KASSERT((flags & PR_WAITOK) != 0); mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE); rw_init(&pmap->pm_dummy_lock); kcpuset_create(&pmap->pm_cpus, true); kcpuset_create(&pmap->pm_kernel_cpus, true); #ifdef XENPV kcpuset_create(&pmap->pm_xen_ptp_cpus, true); #endif LIST_INIT(&pmap->pm_gc_ptp); pmap->pm_pve = NULL; LIST_INIT(&pmap->pm_pvp_full); LIST_INIT(&pmap->pm_pvp_part); LIST_INIT(&pmap->pm_pvp_empty); /* allocate and init PDP */ pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK); for (;;) { pmap_pdp_init(pmap->pm_pdir); mutex_enter(&pmaps_lock); p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1]; if (__predict_true(p != 0)) { break; } mutex_exit(&pmaps_lock); } for (i = 0; i < PDP_SIZE; i++) pmap->pm_pdirpa[i] = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); LIST_INSERT_HEAD(&pmaps, pmap, pm_list); mutex_exit(&pmaps_lock); return 0; } /* * pmap_ctor: destructor for the pmap cache. */ static void pmap_dtor(void *arg, void *obj) { struct pmap *pmap = obj; mutex_enter(&pmaps_lock); LIST_REMOVE(pmap, pm_list); mutex_exit(&pmaps_lock); pmap_pdp_fini(pmap->pm_pdir); pool_put(&pmap_pdp_pool, pmap->pm_pdir); mutex_destroy(&pmap->pm_lock); rw_destroy(&pmap->pm_dummy_lock); kcpuset_destroy(pmap->pm_cpus); kcpuset_destroy(pmap->pm_kernel_cpus); #ifdef XENPV kcpuset_destroy(pmap->pm_xen_ptp_cpus); #endif } /* * pmap_create: create a pmap object. */ struct pmap * pmap_create(void) { struct pmap *pmap; int i; pmap = pool_cache_get(&pmap_cache, PR_WAITOK); /* init uvm_object */ for (i = 0; i < PTP_LEVELS - 1; i++) { uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1); uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock); pmap->pm_ptphint[i] = NULL; } pmap->pm_stats.wired_count = 0; /* count the PDP allocd below */ pmap->pm_stats.resident_count = PDP_SIZE; #if !defined(__x86_64__) pmap->pm_hiexec = 0; #endif /* Used by NVMM and Xen */ pmap->pm_enter = NULL; pmap->pm_extract = NULL; pmap->pm_remove = NULL; pmap->pm_sync_pv = NULL; pmap->pm_pp_remove_ent = NULL; pmap->pm_write_protect = NULL; pmap->pm_unwire = NULL; pmap->pm_tlb_flush = NULL; pmap->pm_data = NULL; /* init the LDT */ pmap->pm_ldt = NULL; pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); return pmap; } /* * pmap_check_ptps: verify that none of the pmap's page table objects * have any pages allocated to them. 
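 * Called from pmap_destroy() and pmap_remove_all() to catch leaked PTPs;
 * the assertions are only armed in DIAGNOSTIC kernels.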
*/ static void pmap_check_ptps(struct pmap *pmap) { int i; for (i = 0; i < PTP_LEVELS - 1; i++) { KASSERTMSG(pmap->pm_obj[i].uo_npages == 0, "pmap %p level %d still has %d pages", pmap, i, (int)pmap->pm_obj[i].uo_npages); } } static void pmap_check_inuse(struct pmap *pmap) { #ifdef DEBUG CPU_INFO_ITERATOR cii; struct cpu_info *ci; for (CPU_INFO_FOREACH(cii, ci)) { if (ci->ci_pmap == pmap) panic("destroying pmap being used"); #if defined(XENPV) && defined(__x86_64__) for (int i = 0; i < PDIR_SLOT_USERLIM; i++) { if (pmap->pm_pdir[i] != 0 && ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { printf("pmap_destroy(%p) pmap_kernel %p " "curcpu %d cpu %d ci_pmap %p " "ci->ci_kpm_pdir[%d]=%" PRIx64 " pmap->pm_pdir[%d]=%" PRIx64 "\n", pmap, pmap_kernel(), curcpu()->ci_index, ci->ci_index, ci->ci_pmap, i, ci->ci_kpm_pdir[i], i, pmap->pm_pdir[i]); panic("%s: used pmap", __func__); } } #endif } #endif /* DEBUG */ } /* * pmap_destroy: drop reference count on pmap. free pmap if reference * count goes to zero. * * => we can be called from pmap_unmap_ptes() with a different, unrelated * pmap's lock held. be careful! */ void pmap_destroy(struct pmap *pmap) { int i; /* * drop reference count and verify not in use. */ if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { return; } pmap_check_inuse(pmap); /* * handle any deferred frees. */ mutex_enter(&pmap->pm_lock); if (pmap->pm_pve != NULL) { pmap_free_pv(pmap, pmap->pm_pve); pmap->pm_pve = NULL; } pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); pmap_update(pmap); /* * Reference count is zero, free pmap resources and then free pmap. */ pmap_check_ptps(pmap); KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp)); #ifdef USER_LDT if (pmap->pm_ldt != NULL) { /* * No need to switch the LDT; this address space is gone, * nothing is using it. * * No need to lock the pmap for ldt_free (or anything else), * we're the last one to use it. */ /* XXXAD can't take cpu_lock here - fix soon. */ mutex_enter(&cpu_lock); ldt_free(pmap->pm_ldt_sel); mutex_exit(&cpu_lock); uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, MAX_USERLDT_SIZE, UVM_KMF_WIRED); } #endif for (i = 0; i < PTP_LEVELS - 1; i++) { uvm_obj_destroy(&pmap->pm_obj[i], false); } kcpuset_zero(pmap->pm_cpus); kcpuset_zero(pmap->pm_kernel_cpus); #ifdef XENPV kcpuset_zero(pmap->pm_xen_ptp_cpus); #endif KASSERT(LIST_EMPTY(&pmap->pm_pvp_full)); KASSERT(LIST_EMPTY(&pmap->pm_pvp_part)); KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty)); pmap_check_ptps(pmap); if (__predict_false(pmap->pm_enter != NULL)) { /* XXX make this a different cache */ pool_cache_destruct_object(&pmap_cache, pmap); } else { pool_cache_put(&pmap_cache, pmap); } } /* * pmap_zap_ptp: clear out an entire PTP without modifying PTEs * * => caller must hold pmap's lock * => PTP must be mapped into KVA * => must be called with kernel preemption disabled * => does as little work as possible */ static void pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, vaddr_t startva, vaddr_t blkendva) { #ifndef XENPV struct pv_entry *pve; struct vm_page *pg; struct pmap_page *pp; pt_entry_t opte; rb_tree_t *tree; vaddr_t va; int wired; uint8_t oattrs; u_int cnt; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); KASSERT(pmap != pmap_kernel()); KASSERT(ptp->wire_count > 1); KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t)); /* * Start at the lowest entered VA, and scan until there are no more * PTEs in the PTPs. 
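 * The lowest VA entered through this PTP is remembered in ptp->uanon (a
 * field not otherwise used for PTPs, initialised to ~0 when the PTP is
 * allocated), which is where the scan below starts.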
*/ tree = &VM_PAGE_TO_PP(ptp)->pp_rb; pve = RB_TREE_MIN(tree); wired = 0; va = (vaddr_t)ptp->uanon; pte += ((va - startva) >> PAGE_SHIFT); for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) { /* * No need for an atomic to clear the PTE. Nothing else can * see the address space any more and speculative access (if * possible) won't modify. Therefore there's no need to * track the accessed/dirty bits. */ opte = *pte; if (!pmap_valid_entry(opte)) { continue; } /* * Count the PTE. If it's not for a managed mapping * there's noting more to do. */ cnt--; wired -= (opte & PTE_WIRED); if ((opte & PTE_PVLIST) == 0) { #ifndef DOM0OPS KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), "managed page without PTE_PVLIST for %#" PRIxVADDR, va); KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), "pv-tracked page without PTE_PVLIST for %#" PRIxVADDR, va); #endif KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); continue; } /* * "pve" now points to the lowest (by VA) dynamic PV entry * in the PTP. If it's for this VA, take advantage of it to * avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB * tree by skipping to the next VA in the tree whenever * there is a match here. The tree will be cleared out in * one pass before return to pmap_remove_all(). */ oattrs = pmap_pte_to_pp_attrs(opte); if (pve != NULL && pve->pve_pte.pte_va == va) { pp = pve->pve_pp; KASSERT(pve->pve_pte.pte_ptp == ptp); KASSERT(pp->pp_pte.pte_ptp != ptp || pp->pp_pte.pte_va != va); mutex_spin_enter(&pp->pp_lock); pp->pp_attrs |= oattrs; LIST_REMOVE(pve, pve_list); mutex_spin_exit(&pp->pp_lock); /* * pve won't be touched again until pmap_drain_pv(), * so it's still safe to traverse the tree. */ pmap_free_pv(pmap, pve); pve = RB_TREE_NEXT(tree, pve); continue; } /* * No entry in the tree so it must be embedded. Look up the * page and cancel the embedded entry. */ if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { pp = VM_PAGE_TO_PP(pg); } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { paddr_t pa = pmap_pte2pa(opte); panic("%s: PTE_PVLIST with pv-untracked page" " va = %#"PRIxVADDR"pa = %#"PRIxPADDR "(%#"PRIxPADDR")", __func__, va, pa, atop(pa)); } mutex_spin_enter(&pp->pp_lock); KASSERT(pp->pp_pte.pte_ptp == ptp); KASSERT(pp->pp_pte.pte_va == va); pp->pp_attrs |= oattrs; pp->pp_pte.pte_ptp = NULL; pp->pp_pte.pte_va = 0; mutex_spin_exit(&pp->pp_lock); } /* PTP now empty - adjust the tree & stats to match. */ pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED); ptp->wire_count = 1; #ifdef DIAGNOSTIC rb_tree_init(tree, &pmap_rbtree_ops); #endif #else /* !XENPV */ /* * XXXAD For XEN, it's not clear to me that we can do this, because * I guess the hypervisor keeps track of PTEs too. */ pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva); #endif /* !XENPV */ } /* * pmap_remove_all: remove all mappings from pmap in bulk. * * Ordinarily when removing mappings it's important to hold the UVM object's * lock, so that pages do not gain a new identity while retaining stale TLB * entries (the same lock hold covers both pmap_remove() and pmap_update()). * Here it's known that the address space is no longer visible to any user * process, so we don't need to worry about that. */ bool pmap_remove_all(struct pmap *pmap) { struct vm_page *ptps[32]; vaddr_t va, blkendva; struct pmap *pmap2; pt_entry_t *ptes; pd_entry_t pde __diagused; pd_entry_t * const *pdes; int lvl __diagused, i, n; /* XXX Can't handle EPT just yet. 
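 * pm_remove is only overridden for guest pmaps (e.g. NVMM's EPT); returning
 * false here makes the caller tear the mappings down one at a time with
 * pmap_remove() instead.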
*/ if (pmap->pm_remove != NULL) { return false; } for (;;) { /* Fetch a block of PTPs from tree. */ mutex_enter(&pmap->pm_lock); n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0, (void **)ptps, __arraycount(ptps), false); if (n == 0) { mutex_exit(&pmap->pm_lock); break; } /* Remove all mappings in the set of PTPs. */ pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); for (i = 0; i < n; i++) { if (ptps[i]->wire_count == 0) { /* It's dead: pmap_update() will expunge. */ continue; } /* Determine range of block. */ va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t); blkendva = x86_round_pdr(va + 1); /* Make sure everything squares up... */ KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl)); KASSERT(lvl == 1); KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]); /* Zap! */ pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va, blkendva); /* PTP should now be unused - free it. */ KASSERT(ptps[i]->wire_count == 1); pmap_free_ptp(pmap, ptps[i], va, ptes, pdes); } pmap_unmap_ptes(pmap, pmap2); pmap_drain_pv(pmap); pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL); mutex_exit(&pmap->pm_lock); /* Process deferred frees. */ pmap_update(pmap); /* A breathing point. */ preempt_point(); } /* Verify that the pmap is now completely empty. */ pmap_check_ptps(pmap); KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE, "pmap %p not empty", pmap); return true; } #if defined(PMAP_FORK) /* * pmap_fork: perform any necessary data structure manipulation when * a VM space is forked. */ void pmap_fork(struct pmap *pmap1, struct pmap *pmap2) { #ifdef USER_LDT union descriptor *new_ldt; int sel; if (__predict_true(pmap1->pm_ldt == NULL)) { return; } /* * Copy the LDT into the new process. * * Read pmap1's ldt pointer unlocked; if it changes behind our back * we'll retry. This will starve if there's a stream of LDT changes * in another thread but that should not happen. */ retry: if (pmap1->pm_ldt != NULL) { /* Allocate space for the new process's LDT */ new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED); if (new_ldt == NULL) { printf("WARNING: %s: unable to allocate LDT space\n", __func__); return; } mutex_enter(&cpu_lock); /* Get a GDT slot for it */ sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE); if (sel == -1) { mutex_exit(&cpu_lock); uvm_km_free(kernel_map, (vaddr_t)new_ldt, MAX_USERLDT_SIZE, UVM_KMF_WIRED); printf("WARNING: %s: unable to allocate LDT selector\n", __func__); return; } } else { /* Wasn't anything there after all. */ new_ldt = NULL; sel = -1; mutex_enter(&cpu_lock); } /* * Now that we have cpu_lock, ensure the LDT status is the same. */ if (pmap1->pm_ldt != NULL) { if (new_ldt == NULL) { /* A wild LDT just appeared. */ mutex_exit(&cpu_lock); goto retry; } /* Copy the LDT data and install it in pmap2 */ memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE); pmap2->pm_ldt = new_ldt; pmap2->pm_ldt_sel = sel; mutex_exit(&cpu_lock); } else { if (new_ldt != NULL) { /* The LDT disappeared, drop what we did. */ ldt_free(sel); mutex_exit(&cpu_lock); uvm_km_free(kernel_map, (vaddr_t)new_ldt, MAX_USERLDT_SIZE, UVM_KMF_WIRED); return; } /* We're good, just leave. */ mutex_exit(&cpu_lock); } #endif /* USER_LDT */ } #endif /* PMAP_FORK */ #ifdef USER_LDT /* * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap * is active, reload LDTR. 
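 * arg1 is the pmap whose LDT selector changed, arg2 is unused; broadcast to
 * every CPU via xc_broadcast() from pmap_ldt_sync() below.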
*/ static void pmap_ldt_xcall(void *arg1, void *arg2) { struct pmap *pm; kpreempt_disable(); pm = arg1; if (curcpu()->ci_pmap == pm) { #if defined(SVS) if (svs_enabled) { svs_ldt_sync(pm); } else #endif lldt(pm->pm_ldt_sel); } kpreempt_enable(); } /* * pmap_ldt_sync: LDT selector for the named pmap is changing. swap * in the new selector on all CPUs. */ void pmap_ldt_sync(struct pmap *pm) { uint64_t where; KASSERT(mutex_owned(&cpu_lock)); pmap_ldt_evcnt.ev_count++; where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); xc_wait(where); } /* * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and * restore the default. */ void pmap_ldt_cleanup(struct lwp *l) { pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; union descriptor *ldt; int sel; if (__predict_true(pmap->pm_ldt == NULL)) { return; } mutex_enter(&cpu_lock); if (pmap->pm_ldt != NULL) { sel = pmap->pm_ldt_sel; ldt = pmap->pm_ldt; pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); pmap->pm_ldt = NULL; pmap_ldt_sync(pmap); ldt_free(sel); uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE, UVM_KMF_WIRED); } mutex_exit(&cpu_lock); } #endif /* USER_LDT */ /* * pmap_activate: activate a process' pmap * * => must be called with kernel preemption disabled * => if lwp is the curlwp, then set ci_want_pmapload so that * actual MMU context switch will be done by pmap_load() later */ void pmap_activate(struct lwp *l) { struct cpu_info *ci; struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); KASSERT(kpreempt_disabled()); ci = curcpu(); if (l != ci->ci_curlwp) return; KASSERT(ci->ci_want_pmapload == 0); KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); /* * no need to switch to kernel vmspace because * it's a subset of any vmspace. */ if (pmap == pmap_kernel()) { ci->ci_want_pmapload = 0; return; } ci->ci_want_pmapload = 1; } #if defined(XENPV) && defined(__x86_64__) #define KASSERT_PDIRPA(pmap) \ KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ pmap == pmap_kernel()) #elif defined(PAE) #define KASSERT_PDIRPA(pmap) \ KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) #elif !defined(XENPV) #define KASSERT_PDIRPA(pmap) \ KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) #else #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ #endif /* * pmap_reactivate: try to regain reference to the pmap. * * => Must be called with kernel preemption disabled. */ static void pmap_reactivate(struct pmap *pmap) { struct cpu_info * const ci = curcpu(); const cpuid_t cid = cpu_index(ci); KASSERT(kpreempt_disabled()); KASSERT_PDIRPA(pmap); /* * If we still have a lazy reference to this pmap, we can assume * that there was no TLB shootdown for this pmap in the meantime. * * The order of events here is important as we must synchronize * with TLB shootdown interrupts. Declare interest in invalidations * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can * change only when the state is TLBSTATE_LAZY. */ ci->ci_tlbstate = TLBSTATE_VALID; KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) { /* We have the reference, state is valid. */ } else { /* * Must reload the TLB, pmap has been changed during * deactivated. */ kcpuset_atomic_set(pmap->pm_cpus, cid); tlbflush(); } } /* * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register * and relevant LDT info. * * Ensures that the current process' pmap is loaded on the current CPU's * MMU and that there are no stale TLB entries. 
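 * The switch is done lazily: pmap_activate() merely sets ci_want_pmapload,
 * and the actual %cr3/LDT load happens here, typically on the way back to
 * user mode.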
* * => The caller should disable kernel preemption or do check-and-retry * to prevent a preemption from undoing our efforts. * => This function may block. */ void pmap_load(void) { struct cpu_info *ci; struct pmap *pmap, *oldpmap; struct lwp *l; uint64_t pctr; int ilevel __diagused; u_long psl __diagused; kpreempt_disable(); retry: ci = curcpu(); if (!ci->ci_want_pmapload) { kpreempt_enable(); return; } l = ci->ci_curlwp; pctr = lwp_pctr(); __insn_barrier(); /* should be able to take ipis. */ KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel); #ifdef XENPV /* Check to see if interrupts are enabled (ie; no events are masked) */ KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl); #else KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl); #endif KASSERT(l != NULL); pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); KASSERT(pmap != pmap_kernel()); oldpmap = ci->ci_pmap; if (pmap == oldpmap) { pmap_reactivate(pmap); ci->ci_want_pmapload = 0; kpreempt_enable(); return; } /* * Acquire a reference to the new pmap and perform the switch. */ pmap_reference(pmap); pmap_load1(l, pmap, oldpmap); ci->ci_want_pmapload = 0; /* * we're now running with the new pmap. drop the reference * to the old pmap. if we block, we need to go around again. */ pmap_destroy(oldpmap); __insn_barrier(); if (lwp_pctr() != pctr) { goto retry; } kpreempt_enable(); } /* * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and * pmap_load(). It's critically important that this function does not * block. */ static void pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap) { struct cpu_info *ci; struct pcb *pcb; cpuid_t cid; KASSERT(kpreempt_disabled()); pcb = lwp_getpcb(l); ci = l->l_cpu; cid = cpu_index(ci); kcpuset_atomic_clear(oldpmap->pm_cpus, cid); kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); KASSERT_PDIRPA(oldpmap); KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); /* * Mark the pmap in use by this CPU. Again, we must synchronize * with TLB shootdown interrupts, so set the state VALID first, * then register us for shootdown events on this pmap. */ ci->ci_tlbstate = TLBSTATE_VALID; kcpuset_atomic_set(pmap->pm_cpus, cid); kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); ci->ci_pmap = pmap; /* * update tss. now that we have registered for invalidations * from other CPUs, we're good to load the page tables. */ #ifdef PAE pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; #else pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); #endif #ifdef i386 #ifndef XENPV ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; #endif #endif #if defined(SVS) && defined(USER_LDT) if (svs_enabled) { svs_ldt_sync(pmap); } else #endif lldt(pmap->pm_ldt_sel); cpu_load_pmap(pmap, oldpmap); } /* * pmap_deactivate: deactivate a process' pmap. * * => Must be called with kernel preemption disabled (high IPL is enough). */ void pmap_deactivate(struct lwp *l) { struct pmap *pmap; struct cpu_info *ci; KASSERT(kpreempt_disabled()); if (l != curlwp) { return; } /* * Wait for pending TLB shootdowns to complete. Necessary because * TLB shootdown state is per-CPU, and the LWP may be coming off * the CPU before it has a chance to call pmap_update(), e.g. due * to kernel preemption or blocking routine in between. */ pmap_tlb_shootnow(); ci = curcpu(); if (ci->ci_want_pmapload) { /* * ci_want_pmapload means that our pmap is not loaded on * the CPU or TLB might be stale. note that pmap_kernel() * is always considered loaded. 
*/ KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) != pmap_kernel()); KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); /* * userspace has not been touched. * nothing to do here. */ ci->ci_want_pmapload = 0; return; } pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); if (pmap == pmap_kernel()) { return; } KASSERT_PDIRPA(pmap); KASSERT(ci->ci_pmap == pmap); /* * we aren't interested in TLB invalidations for this pmap, * at least for the time being. */ KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); ci->ci_tlbstate = TLBSTATE_LAZY; } #ifdef EFI_RUNTIME extern struct pmap *efi_runtime_pmap; /* * pmap_is_user: true if pmap, which must not be the kernel pmap, is * for an unprivileged user process */ bool pmap_is_user(struct pmap *pmap) { KASSERT(pmap != pmap_kernel()); return (pmap != efi_runtime_pmap); } /* * pmap_activate_sync: synchronously activate specified pmap. * * => Must be called with kernel preemption disabled (high IPL is enough). * => Must not sleep before pmap_deactivate_sync. */ void * pmap_activate_sync(struct pmap *pmap) { struct cpu_info *ci = curcpu(); struct pmap *oldpmap = ci->ci_pmap; unsigned cid = cpu_index(ci); KASSERT(kpreempt_disabled()); KASSERT(pmap != pmap_kernel()); KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); if (oldpmap) { KASSERT_PDIRPA(oldpmap); kcpuset_atomic_clear(oldpmap->pm_cpus, cid); kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); } ci->ci_tlbstate = TLBSTATE_VALID; kcpuset_atomic_set(pmap->pm_cpus, cid); kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); ci->ci_pmap = pmap; #if defined(SVS) && defined(USER_LDT) if (svs_enabled) { svs_ldt_sync(pmap); } else #endif lldt(pmap->pm_ldt_sel); cpu_load_pmap(pmap, oldpmap); return oldpmap; } /* * pmap_deactivate_sync: synchronously deactivate specified pmap and * restore whatever was active before pmap_activate_sync. * * => Must be called with kernel preemption disabled (high IPL is enough). * => Must not have slept since pmap_activate_sync. */ void pmap_deactivate_sync(struct pmap *pmap, void *cookie) { struct cpu_info *ci = curcpu(); struct pmap *oldpmap = cookie; unsigned cid = cpu_index(ci); KASSERT(kpreempt_disabled()); KASSERT(pmap != pmap_kernel()); KASSERT(ci->ci_pmap == pmap); KASSERT_PDIRPA(pmap); KASSERT(kcpuset_isset(pmap->pm_cpus, cid)); KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); pmap_tlb_shootnow(); kcpuset_atomic_clear(pmap->pm_cpus, cid); kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid); ci->ci_tlbstate = TLBSTATE_VALID; ci->ci_pmap = oldpmap; if (oldpmap) { kcpuset_atomic_set(oldpmap->pm_cpus, cid); kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid); #if defined(SVS) && defined(USER_LDT) if (svs_enabled) { svs_ldt_sync(oldpmap); } else #endif lldt(oldpmap->pm_ldt_sel); cpu_load_pmap(oldpmap, pmap); } else { lcr3(pmap_pdirpa(pmap_kernel(), 0)); } } #endif /* EFI_RUNTIME */ /* * some misc. 
functions */ bool pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde, int *lastlvl) { unsigned long index; pd_entry_t pde; int i; for (i = PTP_LEVELS; i > 1; i--) { index = pl_i(va, i); pde = pdes[i - 2][index]; if ((pde & PTE_P) == 0) { *lastlvl = i; return false; } if (pde & PTE_PS) break; } if (lastpde != NULL) *lastpde = pde; *lastlvl = i; return true; } /* * pmap_extract: extract a PA for the given VA */ bool pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) { pt_entry_t *ptes, pte; pd_entry_t pde; pd_entry_t * const *pdes; struct pmap *pmap2; paddr_t pa; bool rv; int lvl; if (__predict_false(pmap->pm_extract != NULL)) { return (*pmap->pm_extract)(pmap, va, pap); } #ifdef __HAVE_DIRECT_MAP if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { if (pap != NULL) { *pap = PMAP_DIRECT_UNMAP(va); } return true; } #endif rv = false; pa = 0; if (pmap != pmap_kernel()) { mutex_enter(&pmap->pm_lock); } pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { if (lvl == 2) { pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1)); rv = true; } else { KASSERT(lvl == 1); pte = ptes[pl1_i(va)]; if (__predict_true((pte & PTE_P) != 0)) { pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); rv = true; } } } pmap_unmap_ptes(pmap, pmap2); if (pmap != pmap_kernel()) { mutex_exit(&pmap->pm_lock); } if (pap != NULL) { *pap = pa; } return rv; } /* * vtophys: virtual address to physical address. For use by * machine-dependent code only. */ paddr_t vtophys(vaddr_t va) { paddr_t pa; if (pmap_extract(pmap_kernel(), va, &pa) == true) return pa; return 0; } __strict_weak_alias(pmap_extract_ma, pmap_extract); #ifdef XENPV /* * vtomach: virtual address to machine address. For use by * machine-dependent code only. */ paddr_t vtomach(vaddr_t va) { paddr_t pa; if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) return pa; return 0; } #endif /* * pmap_virtual_space: used during bootup [pmap_steal_memory] to * determine the bounds of the kernel virtual address space. */ void pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) { *startp = virtual_avail; *endp = virtual_end; } void pmap_zero_page(paddr_t pa) { #if defined(__HAVE_DIRECT_MAP) memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); #else #if defined(XENPV) if (XEN_VERSION_SUPPORTED(3, 4)) { xen_pagezero(pa); return; } #endif struct cpu_info *ci; pt_entry_t *zpte; vaddr_t zerova; const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A; kpreempt_disable(); ci = curcpu(); zerova = ci->vpage[VPAGE_ZER]; zpte = ci->vpage_pte[VPAGE_ZER]; KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); pmap_pte_flush(); pmap_update_pg(zerova); /* flush TLB */ memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE); #if defined(DIAGNOSTIC) || defined(XENPV) pmap_pte_set(zpte, 0); /* zap ! 
*/ pmap_pte_flush(); #endif kpreempt_enable(); #endif /* defined(__HAVE_DIRECT_MAP) */ } void pmap_copy_page(paddr_t srcpa, paddr_t dstpa) { #if defined(__HAVE_DIRECT_MAP) vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); #else #if defined(XENPV) if (XEN_VERSION_SUPPORTED(3, 4)) { xen_copy_page(srcpa, dstpa); return; } #endif struct cpu_info *ci; pt_entry_t *srcpte, *dstpte; vaddr_t srcva, dstva; const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A; kpreempt_disable(); ci = curcpu(); srcva = ci->vpage[VPAGE_SRC]; dstva = ci->vpage[VPAGE_DST]; srcpte = ci->vpage_pte[VPAGE_SRC]; dstpte = ci->vpage_pte[VPAGE_DST]; KASSERT(*srcpte == 0 && *dstpte == 0); pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D); pmap_pte_flush(); pmap_update_pg(srcva); pmap_update_pg(dstva); memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); #if defined(DIAGNOSTIC) || defined(XENPV) pmap_pte_set(srcpte, 0); pmap_pte_set(dstpte, 0); pmap_pte_flush(); #endif kpreempt_enable(); #endif /* defined(__HAVE_DIRECT_MAP) */ } static pt_entry_t * pmap_map_ptp(struct vm_page *ptp) { #ifdef __HAVE_DIRECT_MAP return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); #else struct cpu_info *ci; pt_entry_t *ptppte; vaddr_t ptpva; KASSERT(kpreempt_disabled()); #ifndef XENPV const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D; #else const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D; #endif ci = curcpu(); ptpva = ci->vpage[VPAGE_PTP]; ptppte = ci->vpage_pte[VPAGE_PTP]; pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); pmap_pte_flush(); pmap_update_pg(ptpva); return (pt_entry_t *)ptpva; #endif } static void pmap_unmap_ptp(void) { #ifndef __HAVE_DIRECT_MAP #if defined(DIAGNOSTIC) || defined(XENPV) struct cpu_info *ci; pt_entry_t *pte; KASSERT(kpreempt_disabled()); ci = curcpu(); pte = ci->vpage_pte[VPAGE_PTP]; if (*pte != 0) { pmap_pte_set(pte, 0); pmap_pte_flush(); } #endif #endif } static pt_entry_t * pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) { KASSERT(kpreempt_disabled()); if (pmap_is_curpmap(pmap)) { return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ } KASSERT(ptp != NULL); return pmap_map_ptp(ptp) + pl1_pi(va); } static void pmap_unmap_pte(void) { KASSERT(kpreempt_disabled()); pmap_unmap_ptp(); } /* * p m a p r e m o v e f u n c t i o n s * * functions that remove mappings */ /* * pmap_remove_ptes: remove PTEs from a PTP * * => caller must hold pmap's lock * => PTP must be mapped into KVA * => PTP should be null if pmap == pmap_kernel() * => must be called with kernel preemption disabled * => returns composite pte if at least one page should be shot down */ static void pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, vaddr_t startva, vaddr_t endva) { pt_entry_t *pte = (pt_entry_t *)ptpva; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); /* * mappings are very often sparse, so clip the given range to the * range of PTEs that are known present in the PTP. */ pmap_ptp_range_clip(ptp, &startva, &pte); /* * note that ptpva points to the PTE that maps startva. this may * or may not be the first PTE in the PTP. * * we loop through the PTP while there are still PTEs to look at * and the wire_count is greater than 1 (because we use the wire_count * to keep track of the number of real PTEs in the PTP). 
*/ while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { (void)pmap_remove_pte(pmap, ptp, pte, startva); startva += PAGE_SIZE; pte++; } } /* * pmap_remove_pte: remove a single PTE from a PTP. * * => caller must hold pmap's lock * => PTP must be mapped into KVA * => PTP should be null if pmap == pmap_kernel() * => returns true if we removed a mapping * => must be called with kernel preemption disabled */ static bool pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, vaddr_t va) { struct pv_entry *pve; struct vm_page *pg; struct pmap_page *pp; pt_entry_t opte; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); if (!pmap_valid_entry(*pte)) { /* VA not mapped. */ return false; } /* Atomically save the old PTE and zap it. */ opte = pmap_pte_testset(pte, 0); if (!pmap_valid_entry(opte)) { return false; } pmap_exec_account(pmap, va, opte, 0); pmap_stats_update_bypte(pmap, 0, opte); if (ptp) { /* * Dropping a PTE. Make sure that the PDE is flushed. */ ptp->wire_count--; if (ptp->wire_count <= 1) { opte |= PTE_A; } } if ((opte & PTE_A) != 0) { pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); } /* * If we are not on a pv list - we are done. */ if ((opte & PTE_PVLIST) == 0) { #ifndef DOM0OPS KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), "managed page without PTE_PVLIST for %#"PRIxVADDR, va); KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va); #endif KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); return true; } if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { pp = VM_PAGE_TO_PP(pg); } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { paddr_t pa = pmap_pte2pa(opte); panic("%s: PTE_PVLIST with pv-untracked page" " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", __func__, va, pa, atop(pa)); } /* Sync R/M bits. */ pve = pmap_lookup_pv(pmap, ptp, pp, va); pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte)); return true; } static void pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { pt_entry_t *ptes; pd_entry_t pde; pd_entry_t * const *pdes; bool result; vaddr_t blkendva, va = sva; struct vm_page *ptp; struct pmap *pmap2; int lvl; KASSERT(mutex_owned(&pmap->pm_lock)); pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* * removing one page? take shortcut function. */ if (va + PAGE_SIZE == eva) { if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { KASSERT(lvl == 1); /* Get PTP if non-kernel mapping. */ if (pmap != pmap_kernel()) { ptp = pmap_find_ptp(pmap, va, 1); KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", __func__); } else { /* Never free kernel PTPs. */ ptp = NULL; } result = pmap_remove_pte(pmap, ptp, &ptes[pl1_i(va)], va); /* * if mapping removed and the PTP is no longer * being used, free it! */ if (result && ptp && ptp->wire_count <= 1) pmap_free_ptp(pmap, ptp, va, ptes, pdes); } } else for (/* null */ ; va < eva ; va = blkendva) { /* determine range of block */ blkendva = x86_round_pdr(va+1); if (blkendva > eva) blkendva = eva; if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { /* Skip a range corresponding to an invalid pde. */ blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; continue; } KASSERT(lvl == 1); /* Get PTP if non-kernel mapping. */ if (pmap != pmap_kernel()) { ptp = pmap_find_ptp(pmap, va, 1); KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", __func__); } else { /* Never free kernel PTPs. 
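 * (the kernel's page tables are permanent and shared by every pmap, so
 * there is no PTP object to manage)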
*/ ptp = NULL; } pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, blkendva); /* If PTP is no longer being used, free it. */ if (ptp && ptp->wire_count <= 1) { pmap_free_ptp(pmap, ptp, va, ptes, pdes); } } pmap_unmap_ptes(pmap, pmap2); pmap_drain_pv(pmap); } /* * pmap_remove: mapping removal function. * * => caller should not be holding any pmap locks */ void pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { if (__predict_false(pmap->pm_remove != NULL)) { (*pmap->pm_remove)(pmap, sva, eva); return; } mutex_enter(&pmap->pm_lock); pmap_remove_locked(pmap, sva, eva); mutex_exit(&pmap->pm_lock); } /* * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs. * * => The 'clearbits' parameter is either ~0 or PP_ATTRS_... * => Caller should disable kernel preemption. * => issues tlb shootdowns if necessary. */ static int pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs, pt_entry_t *optep) { struct pmap *pmap; struct vm_page *ptp; vaddr_t va; pt_entry_t *ptep; pt_entry_t opte; pt_entry_t npte; pt_entry_t expect; bool need_shootdown; ptp = pvpte->pte_ptp; va = pvpte->pte_va; KASSERT(ptp == NULL || ptp->uobject != NULL); KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); pmap = ptp_to_pmap(ptp); KASSERT(kpreempt_disabled()); if (__predict_false(pmap->pm_sync_pv != NULL)) { return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs, optep); } expect = pmap_pa2pte(pa) | PTE_P; if (clearbits != ~0) { KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); clearbits = pmap_pp_attrs_to_pte(clearbits); } ptep = pmap_map_pte(pmap, ptp, va); do { opte = *ptep; KASSERT((opte & (PTE_D | PTE_A)) != PTE_D); KASSERT((opte & (PTE_A | PTE_P)) != PTE_A); KASSERT(opte == 0 || (opte & PTE_P) != 0); if ((opte & (PTE_FRAME | PTE_P)) != expect) { /* * We lost a race with a V->P operation like * pmap_remove(). Wait for the competitor * reflecting pte bits into mp_attrs. */ pmap_unmap_pte(); return EAGAIN; } /* * Check if there's anything to do on this PTE. */ if ((opte & clearbits) == 0) { need_shootdown = false; break; } /* * We need a shootdown if the PTE is cached (PTE_A) ... * ... Unless we are clearing only the PTE_W bit and * it isn't cached as RW (PTE_D). */ need_shootdown = (opte & PTE_A) != 0 && !(clearbits == PTE_W && (opte & PTE_D) == 0); npte = opte & ~clearbits; /* * If we need a shootdown anyway, clear PTE_A and PTE_D. */ if (need_shootdown) { npte &= ~(PTE_A | PTE_D); } KASSERT((npte & (PTE_D | PTE_A)) != PTE_D); KASSERT((npte & (PTE_A | PTE_P)) != PTE_A); KASSERT(npte == 0 || (opte & PTE_P) != 0); } while (pmap_pte_cas(ptep, opte, npte) != opte); if (need_shootdown) { pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV); } pmap_unmap_pte(); *oattrs = pmap_pte_to_pp_attrs(opte); if (optep != NULL) *optep = opte; return 0; } static void pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, vaddr_t va) { struct pmap *pmap2; pt_entry_t *ptes; pd_entry_t * const *pdes; KASSERT(mutex_owned(&pmap->pm_lock)); pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); pmap_stats_update_bypte(pmap, 0, opte); ptp->wire_count--; if (ptp->wire_count <= 1) { pmap_free_ptp(pmap, ptp, va, ptes, pdes); } pmap_unmap_ptes(pmap, pmap2); } static void pmap_pp_remove(struct pmap_page *pp, paddr_t pa) { struct pv_pte *pvpte; struct vm_page *ptp; uintptr_t sum; uint8_t oattrs; bool locked; /* * Do an unlocked check to see if the page has no mappings, eg when * pmap_remove_all() was called before amap_wipeout() for a process * private amap - common. 
The page being removed must be on the way * out, so we don't have to worry about concurrent attempts to enter * it (otherwise the caller either doesn't care or has screwed up). */ sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va); sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp); sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first); if (sum == 0) { return; } kpreempt_disable(); for (;;) { struct pmap *pmap; struct pv_entry *pve; pt_entry_t opte; vaddr_t va; mutex_spin_enter(&pp->pp_lock); if ((pvpte = pv_pte_first(pp)) == NULL) { mutex_spin_exit(&pp->pp_lock); break; } /* * Add a reference to the pmap before clearing the pte. * Otherwise the pmap can disappear behind us. */ ptp = pvpte->pte_ptp; pmap = ptp_to_pmap(ptp); KASSERT(pmap->pm_obj[0].uo_refs > 0); if (ptp != NULL) { pmap_reference(pmap); } /* * Now try to lock it. We need a direct handoff between * pp_lock and pm_lock to know the pv_entry is kept intact * and kept associated with this pmap. If that can't be * had, wait for the pmap's lock to become free and then * retry. */ locked = mutex_tryenter(&pmap->pm_lock); mutex_spin_exit(&pp->pp_lock); if (!locked) { mutex_enter(&pmap->pm_lock); /* nothing, just wait for it */ mutex_exit(&pmap->pm_lock); if (ptp != NULL) { pmap_destroy(pmap); } continue; } va = pvpte->pte_va; KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE, "va %lx pmap %p ptp %p is empty", va, pmap, ptp); KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0, "va %lx pmap %p ptp %p is free", va, pmap, ptp); KASSERTMSG(ptp == NULL || ptp->wire_count > 1, "va %lx pmap %p ptp %p is empty", va, pmap, ptp); #ifdef DEBUG pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true); rb_tree_t *tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); pve = pmap_treelookup_pv(pmap, ptp, tree, va); if (pve == NULL) { KASSERTMSG(&pp->pp_pte == pvpte, "va %lx pmap %p ptp %p pvpte %p pve %p oops 1", va, pmap, ptp, pvpte, pve); } else { KASSERTMSG(&pve->pve_pte == pvpte, "va %lx pmap %p ptp %p pvpte %p pve %p oops 2", va, pmap, ptp, pvpte, pve); } #endif if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) { panic("pmap_pp_remove: mapping not present"); } pve = pmap_lookup_pv(pmap, ptp, pp, va); pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs); /* Update the PTP reference count. Free if last reference. */ if (ptp != NULL) { KASSERT(pmap != pmap_kernel()); pmap_tlb_shootnow(); if (__predict_false(pmap->pm_pp_remove_ent != NULL)) { (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va); } else { pmap_pp_remove_ent(pmap, ptp, opte, va); } } else { KASSERT(pmap == pmap_kernel()); pmap_stats_update_bypte(pmap, 0, opte); } pmap_tlb_shootnow(); pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); if (ptp != NULL) { pmap_destroy(pmap); } } kpreempt_enable(); } /* * pmap_page_remove: remove a managed vm_page from all pmaps that map it * * => R/M bits are sync'd back to attrs */ void pmap_page_remove(struct vm_page *pg) { struct pmap_page *pp; paddr_t pa; pp = VM_PAGE_TO_PP(pg); pa = VM_PAGE_TO_PHYS(pg); pmap_pp_remove(pp, pa); } /* * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps * that map it */ void pmap_pv_remove(paddr_t pa) { struct pmap_page *pp; pp = pmap_pv_tracked(pa); if (pp == NULL) panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); pmap_pp_remove(pp, pa); } /* * p m a p a t t r i b u t e f u n c t i o n s * functions that test/change managed page's attributes * since a page can be mapped multiple times we must check each PTE that * maps it by going down the pv lists. 
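 * (added note: the attributes are the PP_ATTRS_A/PP_ATTRS_D/PP_ATTRS_W
 * bits, i.e. the referenced/modified/writable state that pmap_sync_pv()
 * accumulates from the PTE_A/PTE_D/PTE_W bits of each mapping).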
*/ /* * pmap_test_attrs: test a page's attributes */ bool pmap_test_attrs(struct vm_page *pg, unsigned testbits) { struct pmap_page *pp; struct pv_pte *pvpte; struct pmap *pmap; uint8_t oattrs; u_int result; paddr_t pa; pp = VM_PAGE_TO_PP(pg); if ((pp->pp_attrs & testbits) != 0) { return true; } pa = VM_PAGE_TO_PHYS(pg); startover: mutex_spin_enter(&pp->pp_lock); for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { if ((pp->pp_attrs & testbits) != 0) { break; } if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) { /* * raced with a V->P operation. wait for the other * side to finish by acquiring pmap's lock. if no * wait, updates to pp_attrs by the other side may * go unseen. */ pmap = ptp_to_pmap(pvpte->pte_ptp); pmap_reference(pmap); mutex_spin_exit(&pp->pp_lock); mutex_enter(&pmap->pm_lock); /* nothing. */ mutex_exit(&pmap->pm_lock); pmap_destroy(pmap); goto startover; } pp->pp_attrs |= oattrs; } result = pp->pp_attrs & testbits; mutex_spin_exit(&pp->pp_lock); /* * note that we will exit the for loop with a non-null pve if * we have found the bits we are testing for. */ return result != 0; } static bool pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) { struct pv_pte *pvpte; struct pmap *pmap; uint8_t oattrs; u_int result; startover: mutex_spin_enter(&pp->pp_lock); for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) { /* * raced with a V->P operation. wait for the other * side to finish by acquiring pmap's lock. it is * probably unmapping the page, and it will be gone * when the loop is restarted. */ pmap = ptp_to_pmap(pvpte->pte_ptp); pmap_reference(pmap); mutex_spin_exit(&pp->pp_lock); mutex_enter(&pmap->pm_lock); /* nothing. */ mutex_exit(&pmap->pm_lock); pmap_destroy(pmap); goto startover; } pp->pp_attrs |= oattrs; } result = pp->pp_attrs & clearbits; pp->pp_attrs &= ~clearbits; pmap_tlb_shootnow(); mutex_spin_exit(&pp->pp_lock); return result != 0; } /* * pmap_clear_attrs: clear the specified attribute for a page. * * => we return true if we cleared one of the bits we were asked to */ bool pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) { struct pmap_page *pp; paddr_t pa; pp = VM_PAGE_TO_PP(pg); pa = VM_PAGE_TO_PHYS(pg); /* * If this is a new page, assert it has no mappings and simply zap * the stored attributes without taking any locks. */ if ((pg->flags & PG_FAKE) != 0) { KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0); KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL); KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL); atomic_store_relaxed(&pp->pp_attrs, 0); return false; } else { return pmap_pp_clear_attrs(pp, pa, clearbits); } } /* * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged * pv-tracked page. 
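 * (added note: such pages have no struct vm_page but were registered for
 * P->V tracking; they are looked up with pmap_pv_tracked(), and the
 * function panics if the page was never registered).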
*/ bool pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) { struct pmap_page *pp; pp = pmap_pv_tracked(pa); if (pp == NULL) panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); return pmap_pp_clear_attrs(pp, pa, clearbits); } /* * p m a p p r o t e c t i o n f u n c t i o n s */ /* * pmap_page_protect: change the protection of all recorded mappings * of a managed page * * => NOTE: this is an inline function in pmap.h */ /* see pmap.h */ /* * pmap_pv_protect: change the protection of all recorded mappings * of an unmanaged pv-tracked page * * => NOTE: this is an inline function in pmap.h */ /* see pmap.h */ /* * pmap_protect: set the protection in of the pages in a pmap * * => NOTE: this is an inline function in pmap.h */ /* see pmap.h */ /* * pmap_write_protect: write-protect pages in a pmap. * * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we * don't need to remove this bit when re-entering the PTEs here: Xen tracks the * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is * present the page will still be considered as a kernel page, and the privilege * separation will be enforced correctly. */ void pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) { pt_entry_t bit_rem, bit_put; pt_entry_t *ptes; pt_entry_t * const *pdes; struct pmap *pmap2; vaddr_t blockend, va; int lvl, i; if (__predict_false(pmap->pm_write_protect != NULL)) { (*pmap->pm_write_protect)(pmap, sva, eva, prot); return; } bit_rem = 0; if (!(prot & VM_PROT_WRITE)) bit_rem = PTE_W; bit_put = 0; if (!(prot & VM_PROT_EXECUTE)) bit_put = pmap_pg_nx; sva &= ~PAGE_MASK; eva &= ~PAGE_MASK; /* * Acquire pmap. No need to lock the kernel pmap as we won't * be touching PV entries nor stats and kernel PDEs aren't * freed. */ if (pmap != pmap_kernel()) { mutex_enter(&pmap->pm_lock); } pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); for (va = sva ; va < eva; va = blockend) { pt_entry_t *spte, *epte; blockend = x86_round_pdr(va + 1); if (blockend > eva) blockend = eva; /* Is it a valid block? */ if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { continue; } KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); KASSERT(lvl == 1); spte = &ptes[pl1_i(va)]; epte = &ptes[pl1_i(blockend)]; for (i = 0; spte < epte; spte++, i++) { pt_entry_t opte, npte; do { opte = *spte; if (!pmap_valid_entry(opte)) { goto next; } npte = (opte & ~bit_rem) | bit_put; } while (pmap_pte_cas(spte, opte, npte) != opte); if ((opte & PTE_D) != 0) { vaddr_t tva = va + x86_ptob(i); pmap_tlb_shootdown(pmap, tva, opte, TLBSHOOT_WRITE_PROTECT); } next:; } } /* Release pmap. */ pmap_unmap_ptes(pmap, pmap2); if (pmap != pmap_kernel()) { mutex_exit(&pmap->pm_lock); } } /* * pmap_unwire: clear the wired bit in the PTE. * * => Mapping should already be present. */ void pmap_unwire(struct pmap *pmap, vaddr_t va) { pt_entry_t *ptes, *ptep, opte; pd_entry_t * const *pdes; struct pmap *pmap2; int lvl; if (__predict_false(pmap->pm_unwire != NULL)) { (*pmap->pm_unwire)(pmap, va); return; } /* * Acquire pmap. Need to lock the kernel pmap only to protect the * statistics. 
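 * (i.e. the wired/resident counts adjusted by pmap_stats_update_bypte()
 * below).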
*/ mutex_enter(&pmap->pm_lock); pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); } KASSERT(lvl == 1); ptep = &ptes[pl1_i(va)]; opte = *ptep; KASSERT(pmap_valid_entry(opte)); if (opte & PTE_WIRED) { pt_entry_t npte = opte & ~PTE_WIRED; opte = pmap_pte_testset(ptep, npte); pmap_stats_update_bypte(pmap, npte, opte); } else { printf("%s: wiring for pmap %p va %#" PRIxVADDR " did not change!\n", __func__, pmap, va); } /* Release pmap. */ pmap_unmap_ptes(pmap, pmap2); mutex_exit(&pmap->pm_lock); } /* * pmap_copy: copy mappings from one pmap to another * * => optional function * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) */ /* * defined as macro in pmap.h */ __strict_weak_alias(pmap_enter, pmap_enter_default); int pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) { if (__predict_false(pmap->pm_enter != NULL)) { return (*pmap->pm_enter)(pmap, va, pa, prot, flags); } return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); } /* * pmap_enter: enter a mapping into a pmap * * => must be done "now" ... no lazy-evaluation */ int pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, vm_prot_t prot, u_int flags, int domid) { pt_entry_t *ptes, opte, npte; pt_entry_t *ptep; pd_entry_t * const *pdes; struct vm_page *ptp; struct vm_page *new_pg, *old_pg; struct pmap_page *new_pp, *old_pp; struct pv_entry *old_pve, *new_pve; bool wired = (flags & PMAP_WIRED) != 0; struct pmap *pmap2; struct pmap_ptparray pt; int error; bool getptp, samepage, new_embedded; rb_tree_t *tree; KASSERT(pmap_initialized); KASSERT(va < VM_MAX_KERNEL_ADDRESS); KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" PRIxVADDR " over PDP!", __func__, va); KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); #ifdef XENPV KASSERT(domid == DOMID_SELF || pa == 0); #endif npte = ma | protection_codes[prot] | PTE_P; npte |= pmap_pat_flags(flags); if (wired) npte |= PTE_WIRED; if (va < VM_MAXUSER_ADDRESS) { KASSERTMSG(pmap != pmap_kernel(), "entering user va %#"PRIxVADDR" into kernel pmap", va); if (pmap_is_user(pmap)) npte |= PTE_U; } if (pmap == pmap_kernel()) npte |= pmap_pg_g; if (flags & VM_PROT_ALL) { npte |= PTE_A; if (flags & VM_PROT_WRITE) { KASSERT((npte & PTE_W) != 0); npte |= PTE_D; } } #ifdef XENPV if (domid != DOMID_SELF) new_pg = NULL; else #endif new_pg = PHYS_TO_VM_PAGE(pa); if (new_pg != NULL) { /* This is a managed page */ npte |= PTE_PVLIST; new_pp = VM_PAGE_TO_PP(new_pg); PMAP_CHECK_PP(new_pp); } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { /* This is an unmanaged pv-tracked page */ npte |= PTE_PVLIST; PMAP_CHECK_PP(new_pp); } else { new_pp = NULL; } /* Begin by locking the pmap. */ mutex_enter(&pmap->pm_lock); /* Look up the PTP. Allocate if none present. */ ptp = NULL; getptp = false; if (pmap != pmap_kernel()) { ptp = pmap_find_ptp(pmap, va, 1); if (ptp == NULL) { getptp = true; error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); if (error != 0) { if (flags & PMAP_CANFAIL) { mutex_exit(&pmap->pm_lock); return error; } panic("%s: get ptp failed, error=%d", __func__, error); } } tree = &VM_PAGE_TO_PP(ptp)->pp_rb; } else { /* Embedded PV entries rely on this. */ KASSERT(va != 0); tree = &pmap_kernel_rb; } /* * Look up the old PV entry at this VA (if any), and insert a new PV * entry if required for the new mapping. 
Temporarily track the old * and new mappings concurrently. Only after the old mapping is * evicted from the pmap will we remove its PV entry. Otherwise, * our picture of modified/accessed state for either page could get * out of sync (we need any P->V operation for either page to stall * on pmap->pm_lock until done here). */ new_pve = NULL; old_pve = NULL; samepage = false; new_embedded = false; if (new_pp != NULL) { error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, &old_pve, &samepage, &new_embedded, tree); /* * If a new pv_entry was needed and none was available, we * can go no further. */ if (error != 0) { if (flags & PMAP_CANFAIL) { if (getptp) { pmap_unget_ptp(pmap, &pt); } mutex_exit(&pmap->pm_lock); return error; } panic("%s: alloc pve failed", __func__); } } else { old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); } /* Map PTEs into address space. */ pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* Install any newly allocated PTPs. */ if (getptp) { pmap_install_ptp(pmap, &pt, va, pdes); } /* Check if there is an existing mapping. */ ptep = &ptes[pl1_i(va)]; opte = *ptep; bool have_oldpa = pmap_valid_entry(opte); paddr_t oldpa = pmap_pte2pa(opte); /* * Update the pte. */ do { opte = *ptep; /* * if the same page, inherit PTE_A and PTE_D. */ if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { npte |= opte & (PTE_A | PTE_D); } #if defined(XENPV) if (domid != DOMID_SELF) { /* pmap_pte_cas with error handling */ int s = splvm(); if (opte != *ptep) { splx(s); continue; } error = xpq_update_foreign( vtomach((vaddr_t)ptep), npte, domid, flags); splx(s); if (error) { /* Undo pv_entry tracking - oof. */ if (new_pp != NULL) { mutex_spin_enter(&new_pp->pp_lock); if (new_pve != NULL) { LIST_REMOVE(new_pve, pve_list); KASSERT(pmap->pm_pve == NULL); pmap->pm_pve = new_pve; } else if (new_embedded) { new_pp->pp_pte.pte_ptp = NULL; new_pp->pp_pte.pte_va = 0; } mutex_spin_exit(&new_pp->pp_lock); } pmap_unmap_ptes(pmap, pmap2); /* Free new PTP. */ if (ptp != NULL && ptp->wire_count <= 1) { pmap_free_ptp(pmap, ptp, va, ptes, pdes); } mutex_exit(&pmap->pm_lock); return error; } break; } #endif /* defined(XENPV) */ } while (pmap_pte_cas(ptep, opte, npte) != opte); /* * Done with the PTEs: they can now be unmapped. */ pmap_unmap_ptes(pmap, pmap2); /* * Update statistics and PTP's reference count. */ pmap_stats_update_bypte(pmap, npte, opte); if (ptp != NULL) { if (!have_oldpa) { ptp->wire_count++; } /* Remember minimum VA in PTP. */ pmap_ptp_range_set(ptp, va); } KASSERT(ptp == NULL || ptp->wire_count > 1); /* * If the same page, we can skip pv_entry handling. */ if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { KASSERT(((opte ^ npte) & PTE_PVLIST) == 0); if ((npte & PTE_PVLIST) != 0) { KASSERT(samepage); pmap_check_pv(pmap, ptp, new_pp, va, true); } goto same_pa; } else if ((npte & PTE_PVLIST) != 0) { KASSERT(!samepage); } /* * If old page is pv-tracked, remove pv_entry from its list. */ if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { old_pp = VM_PAGE_TO_PP(old_pg); } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { panic("%s: PTE_PVLIST with pv-untracked page" " va = %#"PRIxVADDR " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", __func__, va, oldpa, atop(pa)); } pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, pmap_pte_to_pp_attrs(opte)); } else { KASSERT(old_pve == NULL); KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); } /* * If new page is dynamically PV tracked, insert to tree. 
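 * ("dynamically" meaning pmap_enter_pv() allocated a separate pv_entry,
 * so new_pve != NULL, instead of using the pv entry embedded in the
 * struct pmap_page; only those entries need to be linked into the
 * per-PTP rb tree here).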
*/ if (new_pve != NULL) { KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); old_pve = rb_tree_insert_node(tree, new_pve); KASSERT(old_pve == new_pve); pmap_check_pv(pmap, ptp, new_pp, va, true); } same_pa: /* * shootdown tlb if necessary. */ if ((~opte & (PTE_P | PTE_A)) == 0 && ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) { pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); } pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); return 0; } #if defined(XEN) && defined(DOM0OPS) struct pmap_data_gnt { SLIST_ENTRY(pmap_data_gnt) pd_gnt_list; vaddr_t pd_gnt_sva; vaddr_t pd_gnt_eva; /* range covered by this gnt */ int pd_gnt_refs; /* ref counter */ struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */ }; SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt); static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t); static struct pmap_data_gnt * pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { struct pmap_data_gnt_head *headp; struct pmap_data_gnt *pgnt; KASSERT(mutex_owned(&pmap->pm_lock)); headp = pmap->pm_data; KASSERT(headp != NULL); SLIST_FOREACH(pgnt, headp, pd_gnt_list) { if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva) return pgnt; /* check that we're not overlapping part of a region */ KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva); } return NULL; } static void pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries, const struct gnttab_map_grant_ref *ops) { struct pmap_data_gnt_head *headp; struct pmap_data_gnt *pgnt; vaddr_t eva = sva + nentries * PAGE_SIZE; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(nentries >= 1); if (pmap->pm_remove == NULL) { pmap->pm_remove = pmap_remove_gnt; KASSERT(pmap->pm_data == NULL); headp = kmem_alloc(sizeof(*headp), KM_SLEEP); SLIST_INIT(headp); pmap->pm_data = headp; } else { KASSERT(pmap->pm_remove == pmap_remove_gnt); KASSERT(pmap->pm_data != NULL); headp = pmap->pm_data; } pgnt = pmap_find_gnt(pmap, sva, eva); if (pgnt != NULL) { KASSERT(pgnt->pd_gnt_sva == sva); KASSERT(pgnt->pd_gnt_eva == eva); return; } /* new entry */ pgnt = kmem_alloc(sizeof(*pgnt) + (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP); pgnt->pd_gnt_sva = sva; pgnt->pd_gnt_eva = eva; pgnt->pd_gnt_refs = 0; memcpy(pgnt->pd_gnt_ops, ops, sizeof(struct gnttab_map_grant_ref) * nentries); SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list); } static void pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt) { struct pmap_data_gnt_head *headp = pmap->pm_data; int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE; KASSERT(nentries >= 1); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(pgnt->pd_gnt_refs == 0); SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list); kmem_free(pgnt, sizeof(*pgnt) + (nentries - 1) * sizeof(struct gnttab_map_grant_ref)); if (SLIST_EMPTY(headp)) { kmem_free(headp, sizeof(*headp)); pmap->pm_data = NULL; pmap->pm_remove = NULL; } } /* * pmap_enter_gnt: enter a grant entry into a pmap * * => must be done "now" ... 
no lazy-evaluation */ int pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries, const struct gnttab_map_grant_ref *oops) { struct pmap_data_gnt *pgnt; pt_entry_t *ptes, opte; #ifndef XENPV pt_entry_t npte; #endif pt_entry_t *ptep; pd_entry_t * const *pdes; struct vm_page *ptp; struct vm_page *old_pg; struct pmap_page *old_pp; struct pv_entry *old_pve; struct pmap *pmap2; struct pmap_ptparray pt; int error; bool getptp; rb_tree_t *tree; struct gnttab_map_grant_ref *op; int ret; int idx; KASSERT(pmap_initialized); KASSERT(va < VM_MAX_KERNEL_ADDRESS); KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" PRIxVADDR " over PDP!", __func__, va); KASSERT(pmap != pmap_kernel()); /* Begin by locking the pmap. */ mutex_enter(&pmap->pm_lock); pmap_alloc_gnt(pmap, sva, nentries, oops); pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); KASSERT(pgnt != NULL); /* Look up the PTP. Allocate if none present. */ ptp = NULL; getptp = false; ptp = pmap_find_ptp(pmap, va, 1); if (ptp == NULL) { getptp = true; error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp); if (error != 0) { mutex_exit(&pmap->pm_lock); return error; } } tree = &VM_PAGE_TO_PP(ptp)->pp_rb; /* * Look up the old PV entry at this VA (if any), and insert a new PV * entry if required for the new mapping. Temporarily track the old * and new mappings concurrently. Only after the old mapping is * evicted from the pmap will we remove its PV entry. Otherwise, * our picture of modified/accessed state for either page could get * out of sync (we need any P->V operation for either page to stall * on pmap->pm_lock until done here). */ old_pve = NULL; old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); /* Map PTEs into address space. */ pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* Install any newly allocated PTPs. */ if (getptp) { pmap_install_ptp(pmap, &pt, va, pdes); } /* Check if there is an existing mapping. */ ptep = &ptes[pl1_i(va)]; opte = *ptep; bool have_oldpa = pmap_valid_entry(opte); paddr_t oldpa = pmap_pte2pa(opte); /* * Update the pte. */ idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; op = &pgnt->pd_gnt_ops[idx]; #ifdef XENPV KASSERT(op->flags & GNTMAP_contains_pte); op->host_addr = xpmap_ptetomach(ptep); #else KASSERT((op->flags & GNTMAP_contains_pte) == 0); KASSERT(op->flags != 0); KASSERT(op->host_addr != 0); #endif op->dev_bus_addr = 0; op->status = GNTST_general_error; ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); if (__predict_false(ret)) { printf("%s: GNTTABOP_map_grant_ref failed: %d\n", __func__, ret); op->status = GNTST_general_error; } for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) { kpause("gntmap", false, mstohz(1), NULL); ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); if (__predict_false(ret)) { printf("%s: GNTTABOP_map_grant_ref failed: %d\n", __func__, ret); op->status = GNTST_general_error; } } if (__predict_false(op->status != GNTST_okay)) { printf("%s: GNTTABOP_map_grant_ref status: %d\n", __func__, op->status); if (have_oldpa) { /* XXX did the pte really change if XENPV ?*/ ptp->wire_count--; } } else { #ifndef XENPV npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P; if ((op->flags & GNTMAP_readonly) == 0) npte |= PTE_W; do { opte = *ptep; } while (pmap_pte_cas(ptep, opte, npte) != opte); #endif pgnt->pd_gnt_refs++; if (!have_oldpa) { ptp->wire_count++; } KASSERT(ptp->wire_count > 1); /* Remember minimum VA in PTP. 
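 * (pmap_ptp_range_set() records the lowest VA entered so that
 * pmap_remove_ptes()/pmap_ept_remove_ptes() can later clip their scans
 * with pmap_ptp_range_clip()).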
*/ pmap_ptp_range_set(ptp, va); } if (ptp->wire_count <= 1) pmap_free_ptp(pmap, ptp, va, ptes, pdes); /* * Done with the PTEs: they can now be unmapped. */ pmap_unmap_ptes(pmap, pmap2); /* * Update statistics and PTP's reference count. */ pmap_stats_update_bypte(pmap, 0, opte); /* * If old page is pv-tracked, remove pv_entry from its list. */ if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { old_pp = VM_PAGE_TO_PP(old_pg); } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { panic("%s: PTE_PVLIST with pv-untracked page" " va = %#"PRIxVADDR " pa = %#" PRIxPADDR, __func__, va, oldpa); } pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, pmap_pte_to_pp_attrs(opte)); } else { KASSERT(old_pve == NULL); KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); } pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); return op->status; } /* * pmap_remove_gnt: grant mapping removal function. * * => caller should not be holding any pmap locks */ static void pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { struct pmap_data_gnt *pgnt; pt_entry_t *ptes; pd_entry_t pde; pd_entry_t * const *pdes; struct vm_page *ptp; struct pmap *pmap2; vaddr_t va; int lvl; int idx; struct gnttab_map_grant_ref *op; struct gnttab_unmap_grant_ref unmap_op; int ret; KASSERT(pmap != pmap_kernel()); KASSERT(pmap->pm_remove == pmap_remove_gnt); mutex_enter(&pmap->pm_lock); for (va = sva; va < eva; va += PAGE_SIZE) { pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); if (pgnt == NULL) { pmap_remove_locked(pmap, sva, eva); continue; } pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { panic("pmap_remove_gnt pdes not valid"); } idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; op = &pgnt->pd_gnt_ops[idx]; KASSERT(lvl == 1); /* Get PTP if non-kernel mapping. */ ptp = pmap_find_ptp(pmap, va, 1); KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", __func__); if (op->status == GNTST_okay) { KASSERT(pmap_valid_entry(ptes[pl1_i(va)])); #ifdef XENPV unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]); #else unmap_op.host_addr = op->host_addr; pmap_pte_testset(&ptes[pl1_i(va)], 0); #endif unmap_op.handle = op->handle; unmap_op.dev_bus_addr = 0; ret = HYPERVISOR_grant_table_op( GNTTABOP_unmap_grant_ref, &unmap_op, 1); if (ret) { printf("%s: GNTTABOP_unmap_grant_ref " "failed: %d\n", __func__, ret); } ptp->wire_count--; pgnt->pd_gnt_refs--; } if (pgnt->pd_gnt_refs == 0) { pmap_free_gnt(pmap, pgnt); } /* * if mapping removed and the PTP is no longer * being used, free it! */ if (ptp->wire_count <= 1) pmap_free_ptp(pmap, ptp, va, ptes, pdes); pmap_unmap_ptes(pmap, pmap2); } mutex_exit(&pmap->pm_lock); } #endif /* XEN && DOM0OPS */ paddr_t pmap_get_physpage(void) { struct vm_page *ptp; struct pmap *kpm = pmap_kernel(); paddr_t pa; if (!uvm.page_init_done) { /* * We're growing the kernel pmap early (from * uvm_pageboot_alloc()). This case must be * handled a little differently. 
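 * (added note: uvm(9) page allocation is not available yet, so we take a
 * page straight off the physical memory list with uvm_page_physget() and
 * zero it ourselves, through the direct map, a Xen hypercall, or the
 * early_zerop window).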
*/ if (!uvm_page_physget(&pa)) panic("%s: out of memory", __func__); #if defined(__HAVE_DIRECT_MAP) memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); #else #if defined(XENPV) if (XEN_VERSION_SUPPORTED(3, 4)) { xen_pagezero(pa); return pa; } #endif kpreempt_disable(); pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P | PTE_W | pmap_pg_nx); pmap_pte_flush(); pmap_update_pg((vaddr_t)early_zerop); memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE); #if defined(DIAGNOSTIC) || defined(XENPV) pmap_pte_set(early_zero_pte, 0); pmap_pte_flush(); #endif /* defined(DIAGNOSTIC) */ kpreempt_enable(); #endif /* defined(__HAVE_DIRECT_MAP) */ } else { /* XXX */ ptp = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_USERESERVE|UVM_PGA_ZERO); if (ptp == NULL) panic("%s: out of memory", __func__); ptp->flags &= ~PG_BUSY; ptp->wire_count = 1; pa = VM_PAGE_TO_PHYS(ptp); } pmap_stats_update(kpm, 1, 0); return pa; } /* * Expand the page tree with the specified amount of PTPs, mapping virtual * addresses starting at kva. We populate all the levels but the last one * (L1). The nodes of the tree are created as RW, but the pages covered * will be kentered in L1, with proper permissions. * * Used only by pmap_growkernel. */ static void pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) { unsigned long i; paddr_t pa; unsigned long index, endindex; int level; pd_entry_t *pdep; #ifdef XENPV int s = splvm(); /* protect xpq_* */ #endif for (level = PTP_LEVELS; level > 1; level--) { if (level == PTP_LEVELS) pdep = cpm->pm_pdir; else pdep = normal_pdes[level - 2]; index = pl_i_roundup(kva, level); endindex = index + needed_ptps[level - 1] - 1; for (i = index; i <= endindex; i++) { pt_entry_t pte; KASSERT(!pmap_valid_entry(pdep[i])); pa = pmap_get_physpage(); pte = pmap_pa2pte(pa) | PTE_P | PTE_W; #ifdef __x86_64__ pte |= pmap_pg_nx; #endif pmap_pte_set(&pdep[i], pte); #ifdef XENPV if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { if (__predict_true( cpu_info_primary.ci_flags & CPUF_PRESENT)) { /* update per-cpu PMDs on all cpus */ xen_kpm_sync(pmap_kernel(), i); } else { /* * too early; update primary CPU * PMD only (without locks) */ #ifdef __x86_64__ pd_entry_t *cpu_pdep = &cpu_info_primary.ci_kpm_pdir[i]; #else pd_entry_t *cpu_pdep = &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; #endif pmap_pte_set(cpu_pdep, pte); } } #endif KASSERT(level != PTP_LEVELS || nkptp[level - 1] + pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); nkptp[level - 1]++; } pmap_pte_flush(); } #ifdef XENPV splx(s); #endif } /* * pmap_growkernel: increase usage of KVM space. * * => we allocate new PTPs for the kernel and install them in all * the pmaps on the system. */ vaddr_t pmap_growkernel(vaddr_t maxkvaddr) { struct pmap *kpm = pmap_kernel(); struct pmap *cpm; #if !defined(XENPV) || !defined(__x86_64__) struct pmap *pm; long old; #endif int s, i; long needed_kptp[PTP_LEVELS], target_nptp; bool invalidate = false; s = splvm(); /* to be safe */ mutex_enter(&kpm->pm_lock); if (maxkvaddr <= pmap_maxkvaddr) { mutex_exit(&kpm->pm_lock); splx(s); return pmap_maxkvaddr; } maxkvaddr = x86_round_pdr(maxkvaddr); #if !defined(XENPV) || !defined(__x86_64__) old = nkptp[PTP_LEVELS - 1]; #endif /* Initialize needed_kptp. 
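 * (needed_kptp[] holds, for each level, how many more PTPs are required
 * to cover maxkvaddr beyond the nkptp[] already allocated).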
*/ for (i = PTP_LEVELS - 1; i >= 1; i--) { target_nptp = pl_i_roundup(maxkvaddr, i + 1) - pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); if (target_nptp > nkptpmax[i]) panic("out of KVA space"); KASSERT(target_nptp >= nkptp[i]); needed_kptp[i] = target_nptp - nkptp[i]; } #ifdef XENPV /* only pmap_kernel(), or the per-cpu map, has kernel entries */ cpm = kpm; #else /* Get the current pmap */ if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { cpm = curcpu()->ci_pmap; } else { cpm = kpm; } #endif kasan_shadow_map((void *)pmap_maxkvaddr, (size_t)(maxkvaddr - pmap_maxkvaddr)); kmsan_shadow_map((void *)pmap_maxkvaddr, (size_t)(maxkvaddr - pmap_maxkvaddr)); pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); /* * If the number of top level entries changed, update all pmaps. */ if (needed_kptp[PTP_LEVELS - 1] != 0) { #ifdef XENPV #ifdef __x86_64__ /* nothing, kernel entries are never entered in user pmap */ #else int pdkidx; mutex_enter(&pmaps_lock); LIST_FOREACH(pm, &pmaps, pm_list) { for (pdkidx = PDIR_SLOT_KERN + old; pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; pdkidx++) { pmap_pte_set(&pm->pm_pdir[pdkidx], kpm->pm_pdir[pdkidx]); } pmap_pte_flush(); } mutex_exit(&pmaps_lock); #endif /* __x86_64__ */ #else /* XENPV */ size_t newpdes; newpdes = nkptp[PTP_LEVELS - 1] - old; if (cpm != kpm) { memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], &cpm->pm_pdir[PDIR_SLOT_KERN + old], newpdes * sizeof(pd_entry_t)); } mutex_enter(&pmaps_lock); LIST_FOREACH(pm, &pmaps, pm_list) { if (__predict_false(pm->pm_enter != NULL)) { /* * Not a native pmap, the kernel is not mapped, * so nothing to synchronize. */ continue; } memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], &kpm->pm_pdir[PDIR_SLOT_KERN + old], newpdes * sizeof(pd_entry_t)); } mutex_exit(&pmaps_lock); #endif invalidate = true; } pmap_maxkvaddr = maxkvaddr; mutex_exit(&kpm->pm_lock); splx(s); if (invalidate && pmap_initialized) { /* Invalidate the pmap cache. */ pool_cache_invalidate(&pmap_cache); } return maxkvaddr; } #ifdef DEBUG void pmap_dump(struct pmap *, vaddr_t, vaddr_t); /* * pmap_dump: dump all the mappings from a pmap * * => caller should not be holding any pmap locks */ void pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { pt_entry_t *ptes, *pte; pd_entry_t * const *pdes; struct pmap *pmap2; vaddr_t blkendva; int lvl; /* * if end is out of range truncate. * if (end == start) update to max. */ if (eva > VM_MAXUSER_ADDRESS || eva <= sva) eva = VM_MAXUSER_ADDRESS; mutex_enter(&pmap->pm_lock); pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* * dumping a range of pages: we dump in PTP sized blocks (4MB) */ for (/* null */ ; sva < eva ; sva = blkendva) { /* determine range of block */ blkendva = x86_round_pdr(sva+1); if (blkendva > eva) blkendva = eva; /* valid block? */ if (!pmap_pdes_valid(sva, pdes, NULL, &lvl)) continue; KASSERT(lvl == 1); pte = &ptes[pl1_i(sva)]; for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { if (!pmap_valid_entry(*pte)) continue; printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR " (pte=%#" PRIxPADDR ")\n", sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); } } pmap_unmap_ptes(pmap, pmap2); mutex_exit(&pmap->pm_lock); } #endif /* * pmap_update: process deferred invalidations and frees. */ void pmap_update(struct pmap *pmap) { struct pmap_page *pp; struct vm_page *ptp; /* * Initiate any pending TLB shootdowns. Wait for them to * complete before returning control to the caller. */ kpreempt_disable(); pmap_tlb_shootnow(); kpreempt_enable(); /* * Now that shootdowns are complete, process deferred frees. 
This * is an unlocked check, but is safe as we're only interested in * work done in this LWP - we won't get a false negative. */ if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) { return; } mutex_enter(&pmap->pm_lock); while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) { KASSERT(ptp->wire_count == 0); KASSERT(ptp->uanon == NULL); LIST_REMOVE(ptp, mdpage.mp_pp.pp_link); pp = VM_PAGE_TO_PP(ptp); LIST_INIT(&pp->pp_pvlist); pp->pp_attrs = 0; pp->pp_pte.pte_ptp = NULL; pp->pp_pte.pte_va = 0; PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); /* * XXX Hack to avoid extra locking, and lock * assertions in uvm_pagefree(). Despite uobject * being set, this isn't a managed page. */ PMAP_DUMMY_LOCK(pmap); uvm_pagerealloc(ptp, NULL, 0); PMAP_DUMMY_UNLOCK(pmap); uvm_pagefree(ptp); } mutex_exit(&pmap->pm_lock); } #if PTP_LEVELS > 4 #error "Unsupported number of page table mappings" #endif paddr_t pmap_init_tmp_pgtbl(paddr_t pg) { static bool maps_loaded; static const paddr_t x86_tmp_pml_paddr[] = { 4 * PAGE_SIZE, /* L1 */ 5 * PAGE_SIZE, /* L2 */ 6 * PAGE_SIZE, /* L3 */ 7 * PAGE_SIZE /* L4 */ }; static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; pd_entry_t *tmp_pml, *kernel_pml; int level; if (!maps_loaded) { for (level = 0; level < PTP_LEVELS; ++level) { x86_tmp_pml_vaddr[level] = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY); if (x86_tmp_pml_vaddr[level] == 0) panic("mapping of real mode PML failed\n"); pmap_kenter_pa(x86_tmp_pml_vaddr[level], x86_tmp_pml_paddr[level], VM_PROT_READ | VM_PROT_WRITE, 0); } pmap_update(pmap_kernel()); maps_loaded = true; } /* Zero levels 1-3 */ for (level = 0; level < PTP_LEVELS - 1; ++level) { tmp_pml = (void *)x86_tmp_pml_vaddr[level]; memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE); } /* Copy PML4 */ kernel_pml = pmap_kernel()->pm_pdir; tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE); #ifdef PAE /* * Use the last 4 entries of the L2 page as L3 PD entries. These * last entries are unlikely to be used for temporary mappings. 
* 508: maps 0->1GB (userland) * 509: unused * 510: unused * 511: maps 3->4GB (kernel) */ tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P; tmp_pml[509] = 0; tmp_pml[510] = 0; tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P; #endif for (level = PTP_LEVELS - 1; level > 0; --level) { tmp_pml = (void *)x86_tmp_pml_vaddr[level]; tmp_pml[pl_i(pg, level + 1)] = (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P; } tmp_pml = (void *)x86_tmp_pml_vaddr[0]; tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P; #ifdef PAE /* Return the PA of the L3 page (entry 508 of the L2 page) */ return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); #endif return x86_tmp_pml_paddr[PTP_LEVELS - 1]; } u_int x86_mmap_flags(paddr_t mdpgno) { u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; u_int pflag = 0; if (nflag & X86_MMAP_FLAG_PREFETCH) pflag |= PMAP_WRITE_COMBINE; return pflag; } #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV) /* * ----------------------------------------------------------------------------- * ***************************************************************************** * ***************************************************************************** * ***************************************************************************** * ***************************************************************************** * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX **************** * ***************************************************************************** * ***************************************************************************** * ***************************************************************************** * ***************************************************************************** * ----------------------------------------------------------------------------- * * These functions are invoked as callbacks from the code above. Contrary to * native, EPT does not have a recursive slot; therefore, it is not possible * to call pmap_map_ptes(). Instead, we use the direct map and walk down the * tree manually. * * Apart from that, the logic is mostly the same as native. Once a pmap has * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap. * After that we're good, and the callbacks will handle the translations * for us. * * ----------------------------------------------------------------------------- */ /* Hardware bits. */ #define EPT_R __BIT(0) /* read */ #define EPT_W __BIT(1) /* write */ #define EPT_X __BIT(2) /* execute */ #define EPT_T __BITS(5,3) /* type */ #define TYPE_UC 0 #define TYPE_WC 1 #define TYPE_WT 4 #define TYPE_WP 5 #define TYPE_WB 6 #define EPT_NOPAT __BIT(6) #define EPT_L __BIT(7) /* large */ #define EPT_A __BIT(8) /* accessed */ #define EPT_D __BIT(9) /* dirty */ /* Software bits. */ #define EPT_PVLIST __BIT(60) #define EPT_WIRED __BIT(61) #define pmap_ept_valid_entry(pte) (pte & EPT_R) bool pmap_ept_has_ad __read_mostly; static inline void pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) { int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0); int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 
1 : 0); KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED); KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED); pmap_stats_update(pmap, resid_diff, wired_diff); } static pt_entry_t pmap_ept_type(u_int flags) { u_int cacheflags = (flags & PMAP_CACHE_MASK); pt_entry_t ret; switch (cacheflags) { case PMAP_NOCACHE: case PMAP_NOCACHE_OVR: ret = __SHIFTIN(TYPE_UC, EPT_T); break; case PMAP_WRITE_COMBINE: ret = __SHIFTIN(TYPE_WC, EPT_T); break; case PMAP_WRITE_BACK: default: ret = __SHIFTIN(TYPE_WB, EPT_T); break; } ret |= EPT_NOPAT; return ret; } static inline pt_entry_t pmap_ept_prot(vm_prot_t prot) { pt_entry_t res = 0; if (prot & VM_PROT_READ) res |= EPT_R; if (prot & VM_PROT_WRITE) res |= EPT_W; if (prot & VM_PROT_EXECUTE) res |= EPT_X; return res; } static inline uint8_t pmap_ept_to_pp_attrs(pt_entry_t ept) { uint8_t ret = 0; if (pmap_ept_has_ad) { if (ept & EPT_D) ret |= PP_ATTRS_D; if (ept & EPT_A) ret |= PP_ATTRS_A; } else { ret |= (PP_ATTRS_D|PP_ATTRS_A); } if (ept & EPT_W) ret |= PP_ATTRS_W; return ret; } static inline pt_entry_t pmap_pp_attrs_to_ept(uint8_t attrs) { pt_entry_t ept = 0; if (attrs & PP_ATTRS_D) ept |= EPT_D; if (attrs & PP_ATTRS_A) ept |= EPT_A; if (attrs & PP_ATTRS_W) ept |= EPT_W; return ept; } /* * Helper for pmap_ept_free_ptp. * tree[0] = &L2[L2idx] * tree[1] = &L3[L3idx] * tree[2] = &L4[L4idx] */ static void pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree) { pt_entry_t *pteva; paddr_t ptepa; int i, index; ptepa = pmap->pm_pdirpa[0]; for (i = PTP_LEVELS; i > 1; i--) { index = pl_pi(va, i); pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); KASSERT(pmap_ept_valid_entry(pteva[index])); tree[i - 2] = &pteva[index]; ptepa = pmap_pte2pa(pteva[index]); } } static void pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) { pd_entry_t *tree[3]; int level; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); pmap_ept_get_tree(pmap, va, tree); level = 1; do { (void)pmap_pte_testset(tree[level - 1], 0); pmap_freepage(pmap, ptp, level); if (level < PTP_LEVELS - 1) { ptp = pmap_find_ptp(pmap, va, level + 1); ptp->wire_count--; if (ptp->wire_count > 1) break; } } while (++level < PTP_LEVELS); pmap_pte_flush(); } /* Allocate L4->L3->L2. Return L2. */ static void pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va) { struct vm_page *ptp; unsigned long index; pd_entry_t *pteva; paddr_t ptepa; int i; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); /* * Now that we have all the pages looked up or allocated, * loop through again installing any new ones into the tree. */ ptepa = pmap->pm_pdirpa[0]; for (i = PTP_LEVELS; i > 1; i--) { index = pl_pi(va, i); pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); if (pmap_ept_valid_entry(pteva[index])) { KASSERT(!pt->alloced[i]); ptepa = pmap_pte2pa(pteva[index]); continue; } ptp = pt->pg[i]; ptp->flags &= ~PG_BUSY; /* never busy */ ptp->wire_count = 1; pmap->pm_ptphint[i - 2] = ptp; ptepa = VM_PAGE_TO_PHYS(ptp); pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X); pmap_pte_flush(); pmap_stats_update(pmap, 1, 0); /* * If we're not in the top level, increase the * wire count of the parent page. 
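 * (each child PTP we install counts as one live entry in its parent,
 * just as L1 PTEs are counted in a leaf PTP's wire_count).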
*/ if (i < PTP_LEVELS) { pt->pg[i + 1]->wire_count++; } } } static int pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) { pt_entry_t *ptes, opte, npte; pt_entry_t *ptep; struct vm_page *ptp; struct vm_page *new_pg, *old_pg; struct pmap_page *new_pp, *old_pp; struct pv_entry *old_pve, *new_pve; bool wired = (flags & PMAP_WIRED) != 0; bool accessed; struct pmap_ptparray pt; int error; bool getptp, samepage, new_embedded; rb_tree_t *tree; KASSERT(pmap_initialized); KASSERT(va < VM_MAXUSER_ADDRESS); npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags); if (wired) npte |= EPT_WIRED; if (flags & VM_PROT_ALL) { npte |= EPT_A; if (flags & VM_PROT_WRITE) { KASSERT((npte & EPT_W) != 0); npte |= EPT_D; } } new_pg = PHYS_TO_VM_PAGE(pa); if (new_pg != NULL) { /* This is a managed page */ npte |= EPT_PVLIST; new_pp = VM_PAGE_TO_PP(new_pg); } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { /* This is an unmanaged pv-tracked page */ npte |= EPT_PVLIST; } else { new_pp = NULL; } /* Begin by locking the pmap. */ mutex_enter(&pmap->pm_lock); /* Look up the PTP. Allocate if none present. */ ptp = NULL; getptp = false; if (pmap != pmap_kernel()) { ptp = pmap_find_ptp(pmap, va, 1); if (ptp == NULL) { getptp = true; error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); if (error != 0) { if (flags & PMAP_CANFAIL) { mutex_exit(&pmap->pm_lock); return error; } panic("%s: get ptp failed, error=%d", __func__, error); } } tree = &VM_PAGE_TO_PP(ptp)->pp_rb; } else { /* Embedded PV entries rely on this. */ KASSERT(va != 0); tree = &pmap_kernel_rb; } /* * Look up the old PV entry at this VA (if any), and insert a new PV * entry if required for the new mapping. Temporarily track the old * and new mappings concurrently. Only after the old mapping is * evicted from the pmap will we remove its PV entry. Otherwise, * our picture of modified/accessed state for either page could get * out of sync (we need any P->V operation for either page to stall * on pmap->pm_lock until done here). */ new_pve = NULL; old_pve = NULL; samepage = false; new_embedded = false; if (new_pp != NULL) { error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, &old_pve, &samepage, &new_embedded, tree); /* * If a new pv_entry was needed and none was available, we * can go no further. */ if (error != 0) { if (flags & PMAP_CANFAIL) { if (getptp) { pmap_unget_ptp(pmap, &pt); } mutex_exit(&pmap->pm_lock); return error; } panic("%s: alloc pve failed", __func__); } } else { old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); } /* Map PTEs into address space. */ kpreempt_disable(); /* Install any newly allocated PTPs. */ if (getptp) { pmap_ept_install_ptp(pmap, &pt, va); } /* Check if there is an existing mapping. */ ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); ptep = &ptes[pl1_pi(va)]; opte = *ptep; bool have_oldpa = pmap_ept_valid_entry(opte); paddr_t oldpa = pmap_pte2pa(opte); /* * Update the pte. */ do { opte = *ptep; /* * if the same page, inherit PTE_A and PTE_D. */ if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { npte |= opte & (EPT_A | EPT_D); } } while (pmap_pte_cas(ptep, opte, npte) != opte); /* * Done with the PTEs: they can now be unmapped. */ kpreempt_enable(); /* * Update statistics and PTP's reference count. */ pmap_ept_stats_update_bypte(pmap, npte, opte); if (ptp != NULL) { if (!have_oldpa) { ptp->wire_count++; } /* Remember minimum VA in PTP. */ pmap_ptp_range_set(ptp, va); } KASSERT(ptp == NULL || ptp->wire_count > 1); /* * If the same page, we can skip pv_entry handling. 
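 * (the physical frame is unchanged and only protection/attribute bits
 * differ, so the page's membership on its pv list is already correct).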
*/ if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { KASSERT(((opte ^ npte) & EPT_PVLIST) == 0); if ((npte & EPT_PVLIST) != 0) { KASSERT(samepage); pmap_check_pv(pmap, ptp, new_pp, va, true); } goto same_pa; } else if ((npte & EPT_PVLIST) != 0) { KASSERT(!samepage); } /* * If old page is pv-tracked, remove pv_entry from its list. */ if ((~opte & (EPT_R | EPT_PVLIST)) == 0) { if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { old_pp = VM_PAGE_TO_PP(old_pg); } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { panic("%s: EPT_PVLIST with pv-untracked page" " va = %#"PRIxVADDR " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", __func__, va, oldpa, atop(pa)); } pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, pmap_ept_to_pp_attrs(opte)); } else { KASSERT(old_pve == NULL); KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); } /* * If new page is dynamically PV tracked, insert to tree. */ if (new_pve != NULL) { KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); old_pve = rb_tree_insert_node(tree, new_pve); KASSERT(old_pve == new_pve); pmap_check_pv(pmap, ptp, new_pp, va, true); } same_pa: /* * shootdown tlb if necessary. */ if (pmap_ept_has_ad) { accessed = (~opte & (EPT_R | EPT_A)) == 0; } else { accessed = (opte & EPT_R) != 0; } if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) { pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER); } pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); return 0; } /* Pay close attention, this returns L2. */ static int pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde) { pt_entry_t *pteva; paddr_t ptepa; int i, index; KASSERT(mutex_owned(&pmap->pm_lock)); ptepa = pmap->pm_pdirpa[0]; for (i = PTP_LEVELS; i > 1; i--) { pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); index = pl_pi(va, i); if (!pmap_ept_valid_entry(pteva[index])) return i; ptepa = pmap_pte2pa(pteva[index]); } if (lastpde != NULL) { *lastpde = pteva[index]; } return 0; } static bool pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) { pt_entry_t *ptes, pte; pd_entry_t pde; paddr_t ptppa, pa; bool rv; #ifdef __HAVE_DIRECT_MAP if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { if (pap != NULL) { *pap = PMAP_DIRECT_UNMAP(va); } return true; } #endif rv = false; pa = 0; mutex_enter(&pmap->pm_lock); kpreempt_disable(); if (!pmap_ept_pdes_invalid(pmap, va, &pde)) { ptppa = pmap_pte2pa(pde); ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); pte = ptes[pl1_pi(va)]; if (__predict_true((pte & EPT_R) != 0)) { pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); rv = true; } } kpreempt_enable(); mutex_exit(&pmap->pm_lock); if (pap != NULL) { *pap = pa; } return rv; } static bool pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, vaddr_t va) { struct pv_entry *pve; struct vm_page *pg; struct pmap_page *pp; pt_entry_t opte; bool accessed; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); if (!pmap_ept_valid_entry(*pte)) { /* VA not mapped. */ return false; } /* Atomically save the old PTE and zap it. */ opte = pmap_pte_testset(pte, 0); if (!pmap_ept_valid_entry(opte)) { return false; } pmap_ept_stats_update_bypte(pmap, 0, opte); if (ptp) { /* * Dropping a PTE. Make sure that the PDE is flushed. */ ptp->wire_count--; if (ptp->wire_count <= 1) { opte |= EPT_A; } } if (pmap_ept_has_ad) { accessed = (opte & EPT_A) != 0; } else { accessed = true; } if (accessed) { pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE); } /* * If we are not on a pv list - we are done. 
*/ if ((opte & EPT_PVLIST) == 0) { KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), "managed page without EPT_PVLIST for %#"PRIxVADDR, va); KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va); KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); return true; } if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { pp = VM_PAGE_TO_PP(pg); } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { paddr_t pa = pmap_pte2pa(opte); panic("%s: EPT_PVLIST with pv-untracked page" " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", __func__, va, pa, atop(pa)); } /* Sync R/M bits. */ pve = pmap_lookup_pv(pmap, ptp, pp, va); pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte)); return true; } static void pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, vaddr_t startva, vaddr_t endva) { pt_entry_t *pte = (pt_entry_t *)ptpva; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); /* * mappings are very often sparse, so clip the given range to the * range of PTEs that are known present in the PTP. */ pmap_ptp_range_clip(ptp, &startva, &pte); /* * note that ptpva points to the PTE that maps startva. this may * or may not be the first PTE in the PTP. * * we loop through the PTP while there are still PTEs to look at * and the wire_count is greater than 1 (because we use the wire_count * to keep track of the number of real PTEs in the PTP). */ while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { (void)pmap_ept_remove_pte(pmap, ptp, pte, startva); startva += PAGE_SIZE; pte++; } } static void pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { pt_entry_t *ptes; pd_entry_t pde; paddr_t ptppa; vaddr_t blkendva, va = sva; struct vm_page *ptp; mutex_enter(&pmap->pm_lock); kpreempt_disable(); for (/* null */ ; va < eva ; va = blkendva) { int lvl; /* determine range of block */ blkendva = x86_round_pdr(va+1); if (blkendva > eva) blkendva = eva; lvl = pmap_ept_pdes_invalid(pmap, va, &pde); if (lvl != 0) { /* Skip a range corresponding to an invalid pde. */ blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; continue; } /* PA of the PTP */ ptppa = pmap_pte2pa(pde); ptp = pmap_find_ptp(pmap, va, 1); KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", __func__); ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va, blkendva); /* If PTP is no longer being used, free it. */ if (ptp && ptp->wire_count <= 1) { pmap_ept_free_ptp(pmap, ptp, va); } } kpreempt_enable(); pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); } static int pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits, uint8_t *oattrs, pt_entry_t *optep) { struct pmap *pmap; pt_entry_t *ptep; pt_entry_t opte; pt_entry_t npte; pt_entry_t expect; bool need_shootdown; expect = pmap_pa2pte(pa) | EPT_R; pmap = ptp_to_pmap(ptp); if (clearbits != ~0) { KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); clearbits = pmap_pp_attrs_to_ept(clearbits); } ptep = pmap_map_pte(pmap, ptp, va); do { opte = *ptep; KASSERT((opte & (EPT_D | EPT_A)) != EPT_D); KASSERT((opte & (EPT_A | EPT_R)) != EPT_A); KASSERT(opte == 0 || (opte & EPT_R) != 0); if ((opte & (PTE_FRAME | EPT_R)) != expect) { /* * We lost a race with a V->P operation like * pmap_remove(). Wait for the competitor * reflecting pte bits into mp_attrs. 
*/ pmap_unmap_pte(); return EAGAIN; } /* * Check if there's anything to do on this PTE. */ if ((opte & clearbits) == 0) { need_shootdown = false; break; } /* * We need a shootdown if the PTE is cached (EPT_A) ... * ... Unless we are clearing only the EPT_W bit and * it isn't cached as RW (EPT_D). */ if (pmap_ept_has_ad) { need_shootdown = (opte & EPT_A) != 0 && !(clearbits == EPT_W && (opte & EPT_D) == 0); } else { need_shootdown = true; } npte = opte & ~clearbits; /* * If we need a shootdown anyway, clear EPT_A and EPT_D. */ if (need_shootdown) { npte &= ~(EPT_A | EPT_D); } KASSERT((npte & (EPT_D | EPT_A)) != EPT_D); KASSERT((npte & (EPT_A | EPT_R)) != EPT_A); KASSERT(npte == 0 || (opte & EPT_R) != 0); } while (pmap_pte_cas(ptep, opte, npte) != opte); if (need_shootdown) { pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV); } pmap_unmap_pte(); *oattrs = pmap_ept_to_pp_attrs(opte); if (optep != NULL) *optep = opte; return 0; } static void pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, vaddr_t va) { KASSERT(mutex_owned(&pmap->pm_lock)); pmap_ept_stats_update_bypte(pmap, 0, opte); ptp->wire_count--; if (ptp->wire_count <= 1) { pmap_ept_free_ptp(pmap, ptp, va); } } static void pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) { pt_entry_t bit_rem; pt_entry_t *ptes, *spte; pt_entry_t opte, npte; pd_entry_t pde; paddr_t ptppa; vaddr_t va; bool modified; bit_rem = 0; if (!(prot & VM_PROT_WRITE)) bit_rem = EPT_W; sva &= PTE_FRAME; eva &= PTE_FRAME; /* Acquire pmap. */ mutex_enter(&pmap->pm_lock); kpreempt_disable(); for (va = sva; va < eva; va += PAGE_SIZE) { if (pmap_ept_pdes_invalid(pmap, va, &pde)) { continue; } ptppa = pmap_pte2pa(pde); ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); spte = &ptes[pl1_pi(va)]; do { opte = *spte; if (!pmap_ept_valid_entry(opte)) { goto next; } npte = (opte & ~bit_rem); } while (pmap_pte_cas(spte, opte, npte) != opte); if (pmap_ept_has_ad) { modified = (opte & EPT_D) != 0; } else { modified = true; } if (modified) { vaddr_t tva = x86_ptob(spte - ptes); pmap_tlb_shootdown(pmap, tva, 0, TLBSHOOT_WRITE_PROTECT); } next:; } kpreempt_enable(); mutex_exit(&pmap->pm_lock); } static void pmap_ept_unwire(struct pmap *pmap, vaddr_t va) { pt_entry_t *ptes, *ptep, opte; pd_entry_t pde; paddr_t ptppa; /* Acquire pmap. */ mutex_enter(&pmap->pm_lock); kpreempt_disable(); if (pmap_ept_pdes_invalid(pmap, va, &pde)) { panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); } ptppa = pmap_pte2pa(pde); ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); ptep = &ptes[pl1_pi(va)]; opte = *ptep; KASSERT(pmap_ept_valid_entry(opte)); if (opte & EPT_WIRED) { pt_entry_t npte = opte & ~EPT_WIRED; opte = pmap_pte_testset(ptep, npte); pmap_ept_stats_update_bypte(pmap, npte, opte); } else { printf("%s: wiring for pmap %p va %#" PRIxVADDR "did not change!\n", __func__, pmap, va); } /* Release pmap. */ kpreempt_enable(); mutex_exit(&pmap->pm_lock); } /* -------------------------------------------------------------------------- */ void pmap_ept_transform(struct pmap *pmap) { pmap->pm_enter = pmap_ept_enter; pmap->pm_extract = pmap_ept_extract; pmap->pm_remove = pmap_ept_remove; pmap->pm_sync_pv = pmap_ept_sync_pv; pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent; pmap->pm_write_protect = pmap_ept_write_protect; pmap->pm_unwire = pmap_ept_unwire; memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE); } #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
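/*
 * Illustrative sketch, not part of pmap.c: roughly how a VMX host such as
 * NVMM adopts the EPT callbacks above, per the comment at the start of the
 * EPT section.  The function name example_ept_pmap_create() is made up for
 * illustration; pmap_create() is assumed to be the standard pmap(9)
 * constructor defined elsewhere in this file.  Kept under #if 0 so it is
 * never compiled.
 */
#if 0
static struct pmap *
example_ept_pmap_create(void)
{
	struct pmap *gpmap;

	/* Create an ordinary native pmap first. */
	gpmap = pmap_create();

	/*
	 * Switch it over to EPT: this installs the pmap_ept_* callbacks
	 * and zeroes the top-level page directory, after which the usual
	 * pmap_enter()/pmap_remove()/pmap_extract() calls on gpmap are
	 * routed through the EPT code above.
	 */
	pmap_ept_transform(gpmap);

	return gpmap;
}
#endif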
881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 /* $NetBSD: uvm_km.c,v 1.165 2023/04/09 09:00:56 riastradh Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. * * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 * from: Id: uvm_km.c,v 1.1.2.14 1998/02/06 05:19:27 chs Exp * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * uvm_km.c: handle kernel memory allocation and management */ /* * overview of kernel memory management: * * the kernel virtual address space is mapped by "kernel_map." kernel_map * starts at VM_MIN_KERNEL_ADDRESS and goes to VM_MAX_KERNEL_ADDRESS. * note that VM_MIN_KERNEL_ADDRESS is equal to vm_map_min(kernel_map). * * the kernel_map has several "submaps." submaps can only appear in * the kernel_map (user processes can't use them). submaps "take over" * the management of a sub-range of the kernel's address space. 
submaps * are typically allocated at boot time and are never released. kernel * virtual address space that is mapped by a submap is locked by the * submap's lock -- not the kernel_map's lock. * * thus, the useful feature of submaps is that they allow us to break * up the locking and protection of the kernel address space into smaller * chunks. * * the vm system has several standard kernel submaps/arenas, including: * kmem_arena => used for kmem/pool (memoryallocators(9)) * pager_map => used to map "buf" structures into kernel space * exec_map => used during exec to handle exec args * etc... * * The kmem_arena is a "special submap", as it lives in a fixed map entry * within the kernel_map and is controlled by vmem(9). * * the kernel allocates its private memory out of special uvm_objects whose * reference count is set to UVM_OBJ_KERN (thus indicating that the objects * are "special" and never die). all kernel objects should be thought of * as large, fixed-sized, sparsely populated uvm_objects. each kernel * object is equal to the size of kernel virtual address space (i.e. the * value "VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS"). * * note that just because a kernel object spans the entire kernel virtual * address space doesn't mean that it has to be mapped into the entire space. * large chunks of a kernel object's space go unused either because * that area of kernel VM is unmapped, or there is some other type of * object mapped into that range (e.g. a vnode). for submap's kernel * objects, the only part of the object that can ever be populated is the * offsets that are managed by the submap. * * note that the "offset" in a kernel object is always the kernel virtual * address minus the VM_MIN_KERNEL_ADDRESS (aka vm_map_min(kernel_map)). * example: * suppose VM_MIN_KERNEL_ADDRESS is 0xf8000000 and the kernel does a * uvm_km_alloc(kernel_map, PAGE_SIZE) [allocate 1 wired down page in the * kernel map]. if uvm_km_alloc returns virtual address 0xf8235000, * then that means that the page at offset 0x235000 in kernel_object is * mapped at 0xf8235000. * * kernel object have one other special property: when the kernel virtual * memory mapping them is unmapped, the backing memory in the object is * freed right away. this is done with the uvm_km_pgremove() function. * this has to be done because there is no backing store for kernel pages * and no need to save them after they are no longer referenced. * * Generic arenas: * * kmem_arena: * Main arena controlling the kernel KVA used by other arenas. * * kmem_va_arena: * Implements quantum caching in order to speedup allocations and * reduce fragmentation. The pool(9), unless created with a custom * meta-data allocator, and kmem(9) subsystems use this arena. * * Arenas for meta-data allocations are used by vmem(9) and pool(9). * These arenas cannot use quantum cache. However, kmem_va_meta_arena * compensates this by importing larger chunks from kmem_arena. * * kmem_va_meta_arena: * Space for meta-data. * * kmem_meta_arena: * Imports from kmem_va_meta_arena. Allocations from this arena are * backed with the pages. * * Arena stacking: * * kmem_arena * kmem_va_arena * kmem_va_meta_arena * kmem_meta_arena */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_km.c,v 1.165 2023/04/09 09:00:56 riastradh Exp $"); #include "opt_uvmhist.h" #include "opt_kmempages.h" #ifndef NKMEMPAGES #define NKMEMPAGES 0 #endif /* * Defaults for lower and upper-bounds for the kmem_arena page count. * Can be overridden by kernel config options. 
*/ #ifndef NKMEMPAGES_MIN #define NKMEMPAGES_MIN NKMEMPAGES_MIN_DEFAULT #endif #ifndef NKMEMPAGES_MAX #define NKMEMPAGES_MAX NKMEMPAGES_MAX_DEFAULT #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/atomic.h> #include <sys/proc.h> #include <sys/pool.h> #include <sys/vmem.h> #include <sys/vmem_impl.h> #include <sys/kmem.h> #include <sys/msan.h> #include <uvm/uvm.h> /* * global data structures */ struct vm_map *kernel_map = NULL; /* * local data structures */ static struct vm_map kernel_map_store; static struct vm_map_entry kernel_image_mapent_store; static struct vm_map_entry kernel_kmem_mapent_store; size_t nkmempages = 0; vaddr_t kmembase; vsize_t kmemsize; static struct vmem kmem_arena_store; vmem_t *kmem_arena = NULL; static struct vmem kmem_va_arena_store; vmem_t *kmem_va_arena; /* * kmeminit_nkmempages: calculate the size of kmem_arena. */ void kmeminit_nkmempages(void) { size_t npages; if (nkmempages != 0) { /* * It's already been set (by us being here before) * bail out now; */ return; } #if defined(NKMEMPAGES_MAX_UNLIMITED) && !defined(KMSAN) npages = physmem; #else #if defined(KMSAN) npages = (physmem / 4); #elif defined(PMAP_MAP_POOLPAGE) npages = (physmem / 4); #else npages = (physmem / 3) * 2; #endif /* defined(PMAP_MAP_POOLPAGE) */ #if !defined(NKMEMPAGES_MAX_UNLIMITED) if (npages > NKMEMPAGES_MAX) npages = NKMEMPAGES_MAX; #endif #endif if (npages < NKMEMPAGES_MIN) npages = NKMEMPAGES_MIN; nkmempages = npages; } /* * uvm_km_bootstrap: init kernel maps and objects to reflect reality (i.e. * KVM already allocated for text, data, bss, and static data structures). * * => KVM is defined by VM_MIN_KERNEL_ADDRESS/VM_MAX_KERNEL_ADDRESS. * we assume that [vmin -> start] has already been allocated and that * "end" is the end. */ void uvm_km_bootstrap(vaddr_t start, vaddr_t end) { bool kmem_arena_small; vaddr_t base = VM_MIN_KERNEL_ADDRESS; struct uvm_map_args args; int error; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, "start=%#jx end=%#jx", start, end, 0,0); kmeminit_nkmempages(); kmemsize = (vsize_t)nkmempages * PAGE_SIZE; kmem_arena_small = kmemsize < 64 * 1024 * 1024; UVMHIST_LOG(maphist, "kmemsize=%#jx", kmemsize, 0,0,0); /* * next, init kernel memory objects. */ /* kernel_object: for pageable anonymous kernel memory */ uvm_kernel_object = uao_create(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS, UAO_FLAG_KERNOBJ); /* * init the map and reserve any space that might already * have been allocated kernel space before installing. 
*/ uvm_map_setup(&kernel_map_store, base, end, VM_MAP_PAGEABLE); kernel_map_store.pmap = pmap_kernel(); if (start != base) { error = uvm_map_prepare(&kernel_map_store, base, start - base, NULL, UVM_UNKNOWN_OFFSET, 0, UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE, UVM_ADV_RANDOM, UVM_FLAG_FIXED), &args); if (!error) { kernel_image_mapent_store.flags = UVM_MAP_KERNEL | UVM_MAP_STATIC | UVM_MAP_NOMERGE; error = uvm_map_enter(&kernel_map_store, &args, &kernel_image_mapent_store); } if (error) panic( "uvm_km_bootstrap: could not reserve space for kernel"); kmembase = args.uma_start + args.uma_size; } else { kmembase = base; } error = uvm_map_prepare(&kernel_map_store, kmembase, kmemsize, NULL, UVM_UNKNOWN_OFFSET, 0, UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE, UVM_ADV_RANDOM, UVM_FLAG_FIXED), &args); if (!error) { kernel_kmem_mapent_store.flags = UVM_MAP_KERNEL | UVM_MAP_STATIC | UVM_MAP_NOMERGE; error = uvm_map_enter(&kernel_map_store, &args, &kernel_kmem_mapent_store); } if (error) panic("uvm_km_bootstrap: could not reserve kernel kmem"); /* * install! */ kernel_map = &kernel_map_store; pool_subsystem_init(); kmem_arena = vmem_init(&kmem_arena_store, "kmem", kmembase, kmemsize, PAGE_SIZE, NULL, NULL, NULL, 0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM); #ifdef PMAP_GROWKERNEL /* * kmem_arena VA allocations happen independently of uvm_map. * grow kernel to accommodate the kmem_arena. */ if (uvm_maxkaddr < kmembase + kmemsize) { uvm_maxkaddr = pmap_growkernel(kmembase + kmemsize); KASSERTMSG(uvm_maxkaddr >= kmembase + kmemsize, "%#"PRIxVADDR" %#"PRIxVADDR" %#"PRIxVSIZE, uvm_maxkaddr, kmembase, kmemsize); } #endif vmem_subsystem_init(kmem_arena); UVMHIST_LOG(maphist, "kmem vmem created (base=%#jx, size=%#jx", kmembase, kmemsize, 0,0); kmem_va_arena = vmem_init(&kmem_va_arena_store, "kva", 0, 0, PAGE_SIZE, vmem_alloc, vmem_free, kmem_arena, (kmem_arena_small ? 4 : VMEM_QCACHE_IDX_MAX) * PAGE_SIZE, VM_NOSLEEP, IPL_VM); UVMHIST_LOG(maphist, "<- done", 0,0,0,0); } /* * uvm_km_init: init the kernel maps virtual memory caches * and start the pool/kmem allocator. */ void uvm_km_init(void) { kmem_init(); } /* * uvm_km_suballoc: allocate a submap in the kernel map. once a submap * is allocated all references to that area of VM must go through it. this * allows the locking of VAs in kernel_map to be broken up into regions. * * => if `fixed' is true, *vmin specifies where the region described * pager_map => used to map "buf" structures into kernel space * by the submap must start * => if submap is non NULL we use that as the submap, otherwise we * alloc a new map */ struct vm_map * uvm_km_suballoc(struct vm_map *map, vaddr_t *vmin /* IN/OUT */, vaddr_t *vmax /* OUT */, vsize_t size, int flags, bool fixed, struct vm_map *submap) { int mapflags = UVM_FLAG_NOMERGE | (fixed ? 
UVM_FLAG_FIXED : 0); UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(vm_map_pmap(map) == pmap_kernel()); size = round_page(size); /* round up to pagesize */ /* * first allocate a blank spot in the parent map */ if (uvm_map(map, vmin, size, NULL, UVM_UNKNOWN_OFFSET, 0, UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE, UVM_ADV_RANDOM, mapflags)) != 0) { panic("%s: unable to allocate space in parent map", __func__); } /* * set VM bounds (vmin is filled in by uvm_map) */ *vmax = *vmin + size; /* * add references to pmap and create or init the submap */ pmap_reference(vm_map_pmap(map)); if (submap == NULL) { submap = kmem_alloc(sizeof(*submap), KM_SLEEP); } uvm_map_setup(submap, *vmin, *vmax, flags); submap->pmap = vm_map_pmap(map); /* * now let uvm_map_submap plug in it... */ if (uvm_map_submap(map, *vmin, *vmax, submap) != 0) panic("uvm_km_suballoc: submap allocation failed"); return(submap); } /* * uvm_km_pgremove: remove pages from a kernel uvm_object and KVA. */ void uvm_km_pgremove(vaddr_t startva, vaddr_t endva) { struct uvm_object * const uobj = uvm_kernel_object; const voff_t start = startva - vm_map_min(kernel_map); const voff_t end = endva - vm_map_min(kernel_map); struct vm_page *pg; voff_t curoff, nextoff; int swpgonlydelta = 0; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(VM_MIN_KERNEL_ADDRESS <= startva); KASSERT(startva < endva); KASSERT(endva <= VM_MAX_KERNEL_ADDRESS); rw_enter(uobj->vmobjlock, RW_WRITER); pmap_remove(pmap_kernel(), startva, endva); for (curoff = start; curoff < end; curoff = nextoff) { nextoff = curoff + PAGE_SIZE; pg = uvm_pagelookup(uobj, curoff); if (pg != NULL && pg->flags & PG_BUSY) { uvm_pagewait(pg, uobj->vmobjlock, "km_pgrm"); rw_enter(uobj->vmobjlock, RW_WRITER); nextoff = curoff; continue; } /* * free the swap slot, then the page. */ if (pg == NULL && uao_find_swslot(uobj, curoff >> PAGE_SHIFT) > 0) { swpgonlydelta++; } uao_dropswap(uobj, curoff >> PAGE_SHIFT); if (pg != NULL) { uvm_pagefree(pg); } } rw_exit(uobj->vmobjlock); if (swpgonlydelta > 0) { KASSERT(uvmexp.swpgonly >= swpgonlydelta); atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta); } } /* * uvm_km_pgremove_intrsafe: like uvm_km_pgremove(), but for non object backed * regions. * * => when you unmap a part of anonymous kernel memory you want to toss * the pages right away. (this is called from uvm_unmap_...). * => none of the pages will ever be busy, and none of them will ever * be on the active or inactive queues (because they have no object). 
*/ void uvm_km_pgremove_intrsafe(struct vm_map *map, vaddr_t start, vaddr_t end) { #define __PGRM_BATCH 16 struct vm_page *pg; paddr_t pa[__PGRM_BATCH]; int npgrm, i; vaddr_t va, batch_vastart; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(VM_MAP_IS_KERNEL(map)); KASSERTMSG(vm_map_min(map) <= start, "vm_map_min(map) [%#"PRIxVADDR"] <= start [%#"PRIxVADDR"]" " (size=%#"PRIxVSIZE")", vm_map_min(map), start, end - start); KASSERT(start < end); KASSERT(end <= vm_map_max(map)); for (va = start; va < end;) { batch_vastart = va; /* create a batch of at most __PGRM_BATCH pages to free */ for (i = 0; i < __PGRM_BATCH && va < end; va += PAGE_SIZE) { if (!pmap_extract(pmap_kernel(), va, &pa[i])) { continue; } i++; } npgrm = i; /* now remove the mappings */ pmap_kremove(batch_vastart, va - batch_vastart); /* and free the pages */ for (i = 0; i < npgrm; i++) { pg = PHYS_TO_VM_PAGE(pa[i]); KASSERT(pg); KASSERT(pg->uobject == NULL); KASSERT(pg->uanon == NULL); KASSERT((pg->flags & PG_BUSY) == 0); uvm_pagefree(pg); } } #undef __PGRM_BATCH } #if defined(DEBUG) void uvm_km_check_empty(struct vm_map *map, vaddr_t start, vaddr_t end) { vaddr_t va; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KDASSERT(VM_MAP_IS_KERNEL(map)); KDASSERT(vm_map_min(map) <= start); KDASSERT(start < end); KDASSERT(end <= vm_map_max(map)); for (va = start; va < end; va += PAGE_SIZE) { paddr_t pa; if (pmap_extract(pmap_kernel(), va, &pa)) { panic("uvm_km_check_empty: va %p has pa %#llx", (void *)va, (long long)pa); } /* * kernel_object should not have pages for the corresponding * region. check it. * * why trylock? because: * - caller might not want to block. * - we can recurse when allocating radix_node for * kernel_object. */ if (rw_tryenter(uvm_kernel_object->vmobjlock, RW_READER)) { struct vm_page *pg; pg = uvm_pagelookup(uvm_kernel_object, va - vm_map_min(kernel_map)); rw_exit(uvm_kernel_object->vmobjlock); if (pg) { panic("uvm_km_check_empty: " "has page hashed at %p", (const void *)va); } } } } #endif /* defined(DEBUG) */ /* * uvm_km_alloc: allocate an area of kernel memory. * * => NOTE: we can return 0 even if we can wait if there is not enough * free VM space in the map... caller should be prepared to handle * this case. * => we return KVA of memory allocated */ vaddr_t uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags) { vaddr_t kva, loopva; vaddr_t offset; vsize_t loopsize; struct vm_page *pg; struct uvm_object *obj; int pgaflags; vm_prot_t prot, vaprot; UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT(vm_map_pmap(map) == pmap_kernel()); KASSERT((flags & UVM_KMF_TYPEMASK) == UVM_KMF_WIRED || (flags & UVM_KMF_TYPEMASK) == UVM_KMF_PAGEABLE || (flags & UVM_KMF_TYPEMASK) == UVM_KMF_VAONLY); KASSERT((flags & UVM_KMF_VAONLY) != 0 || (flags & UVM_KMF_COLORMATCH) == 0); KASSERT((flags & UVM_KMF_COLORMATCH) == 0 || (flags & UVM_KMF_VAONLY) != 0); /* * setup for call */ kva = vm_map_min(map); /* hint */ size = round_page(size); obj = (flags & UVM_KMF_PAGEABLE) ? uvm_kernel_object : NULL; UVMHIST_LOG(maphist," (map=%#jx, obj=%#jx, size=%#jx, flags=%#jx)", (uintptr_t)map, (uintptr_t)obj, size, flags); /* * allocate some virtual space */ vaprot = (flags & UVM_KMF_EXEC) ? 
UVM_PROT_ALL : UVM_PROT_RW; if (__predict_false(uvm_map(map, &kva, size, obj, UVM_UNKNOWN_OFFSET, align, UVM_MAPFLAG(vaprot, UVM_PROT_ALL, UVM_INH_NONE, UVM_ADV_RANDOM, (flags & (UVM_KMF_TRYLOCK | UVM_KMF_NOWAIT | UVM_KMF_WAITVA | UVM_KMF_COLORMATCH)))) != 0)) { UVMHIST_LOG(maphist, "<- done (no VM)",0,0,0,0); return(0); } /* * if all we wanted was VA, return now */ if (flags & (UVM_KMF_VAONLY | UVM_KMF_PAGEABLE)) { UVMHIST_LOG(maphist,"<- done valloc (kva=%#jx)", kva,0,0,0); return(kva); } /* * recover object offset from virtual address */ offset = kva - vm_map_min(kernel_map); UVMHIST_LOG(maphist, " kva=%#jx, offset=%#jx", kva, offset,0,0); /* * now allocate and map in the memory... note that we are the only ones * whom should ever get a handle on this area of VM. */ loopva = kva; loopsize = size; pgaflags = UVM_FLAG_COLORMATCH; if (flags & UVM_KMF_NOWAIT) pgaflags |= UVM_PGA_USERESERVE; if (flags & UVM_KMF_ZERO) pgaflags |= UVM_PGA_ZERO; prot = VM_PROT_READ | VM_PROT_WRITE; if (flags & UVM_KMF_EXEC) prot |= VM_PROT_EXECUTE; while (loopsize) { KASSERTMSG(!pmap_extract(pmap_kernel(), loopva, NULL), "loopva=%#"PRIxVADDR, loopva); pg = uvm_pagealloc_strat(NULL, offset, NULL, pgaflags, #ifdef UVM_KM_VMFREELIST UVM_PGA_STRAT_ONLY, UVM_KM_VMFREELIST #else UVM_PGA_STRAT_NORMAL, 0 #endif ); /* * out of memory? */ if (__predict_false(pg == NULL)) { if ((flags & UVM_KMF_NOWAIT) || ((flags & UVM_KMF_CANFAIL) && !uvm_reclaimable())) { /* free everything! */ uvm_km_free(map, kva, size, flags & UVM_KMF_TYPEMASK); return (0); } else { uvm_wait("km_getwait2"); /* sleep here */ continue; } } pg->flags &= ~PG_BUSY; /* new page */ UVM_PAGE_OWN(pg, NULL); /* * map it in */ pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg), prot, PMAP_KMPAGE); loopva += PAGE_SIZE; offset += PAGE_SIZE; loopsize -= PAGE_SIZE; } pmap_update(pmap_kernel()); if ((flags & UVM_KMF_ZERO) == 0) { kmsan_orig((void *)kva, size, KMSAN_TYPE_UVM, __RET_ADDR); kmsan_mark((void *)kva, size, KMSAN_STATE_UNINIT); } UVMHIST_LOG(maphist,"<- done (kva=%#jx)", kva,0,0,0); return(kva); } /* * uvm_km_protect: change the protection of an allocated area */ int uvm_km_protect(struct vm_map *map, vaddr_t addr, vsize_t size, vm_prot_t prot) { return uvm_map_protect(map, addr, addr + round_page(size), prot, false); } /* * uvm_km_free: free an area of kernel memory */ void uvm_km_free(struct vm_map *map, vaddr_t addr, vsize_t size, uvm_flag_t flags) { UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist); KASSERT((flags & UVM_KMF_TYPEMASK) == UVM_KMF_WIRED || (flags & UVM_KMF_TYPEMASK) == UVM_KMF_PAGEABLE || (flags & UVM_KMF_TYPEMASK) == UVM_KMF_VAONLY); KASSERT((addr & PAGE_MASK) == 0); KASSERT(vm_map_pmap(map) == pmap_kernel()); size = round_page(size); if (flags & UVM_KMF_PAGEABLE) { uvm_km_pgremove(addr, addr + size); } else if (flags & UVM_KMF_WIRED) { /* * Note: uvm_km_pgremove_intrsafe() extracts mapping, thus * remove it after. See comment below about KVA visibility. */ uvm_km_pgremove_intrsafe(map, addr, addr + size); } /* * Note: uvm_unmap_remove() calls pmap_update() for us, before * KVA becomes globally available. */ uvm_unmap1(map, addr, addr + size, UVM_FLAG_VAONLY); } /* Sanity; must specify both or none. */ #if (defined(PMAP_MAP_POOLPAGE) || defined(PMAP_UNMAP_POOLPAGE)) && \ (!defined(PMAP_MAP_POOLPAGE) || !defined(PMAP_UNMAP_POOLPAGE)) #error Must specify MAP and UNMAP together. 
#endif #if defined(PMAP_ALLOC_POOLPAGE) && \ !defined(PMAP_MAP_POOLPAGE) && !defined(PMAP_UNMAP_POOLPAGE) #error Must specify ALLOC with MAP and UNMAP #endif int uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags, vmem_addr_t *addr) { struct vm_page *pg; vmem_addr_t va; int rc; vaddr_t loopva; vsize_t loopsize; size = round_page(size); #if defined(PMAP_MAP_POOLPAGE) if (size == PAGE_SIZE) { again: #ifdef PMAP_ALLOC_POOLPAGE pg = PMAP_ALLOC_POOLPAGE((flags & VM_SLEEP) ? 0 : UVM_PGA_USERESERVE); #else pg = uvm_pagealloc(NULL, 0, NULL, (flags & VM_SLEEP) ? 0 : UVM_PGA_USERESERVE); #endif /* PMAP_ALLOC_POOLPAGE */ if (__predict_false(pg == NULL)) { if (flags & VM_SLEEP) { uvm_wait("plpg"); goto again; } return ENOMEM; } va = PMAP_MAP_POOLPAGE(VM_PAGE_TO_PHYS(pg)); KASSERT(va != 0); *addr = va; return 0; } #endif /* PMAP_MAP_POOLPAGE */ rc = vmem_alloc(vm, size, flags, &va); if (rc != 0) return rc; #ifdef PMAP_GROWKERNEL /* * These VA allocations happen independently of uvm_map * so this allocation must not extend beyond the current limit. */ KASSERTMSG(uvm_maxkaddr >= va + size, "%#"PRIxVADDR" %#"PRIxPTR" %#zx", uvm_maxkaddr, va, size); #endif loopva = va; loopsize = size; while (loopsize) { paddr_t pa __diagused; KASSERTMSG(!pmap_extract(pmap_kernel(), loopva, &pa), "loopva=%#"PRIxVADDR" loopsize=%#"PRIxVSIZE " pa=%#"PRIxPADDR" vmem=%p", loopva, loopsize, pa, vm); pg = uvm_pagealloc(NULL, loopva, NULL, UVM_FLAG_COLORMATCH | ((flags & VM_SLEEP) ? 0 : UVM_PGA_USERESERVE)); if (__predict_false(pg == NULL)) { if (flags & VM_SLEEP) { uvm_wait("plpg"); continue; } else { uvm_km_pgremove_intrsafe(kernel_map, va, va + size); vmem_free(vm, va, size); return ENOMEM; } } pg->flags &= ~PG_BUSY; /* new page */ UVM_PAGE_OWN(pg, NULL); pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ|VM_PROT_WRITE, PMAP_KMPAGE); loopva += PAGE_SIZE; loopsize -= PAGE_SIZE; } pmap_update(pmap_kernel()); *addr = va; return 0; } void uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, size_t size) { size = round_page(size); #if defined(PMAP_UNMAP_POOLPAGE) if (size == PAGE_SIZE) { paddr_t pa; pa = PMAP_UNMAP_POOLPAGE(addr); uvm_pagefree(PHYS_TO_VM_PAGE(pa)); return; } #endif /* PMAP_UNMAP_POOLPAGE */ uvm_km_pgremove_intrsafe(kernel_map, addr, addr + size); pmap_update(pmap_kernel()); vmem_free(vm, addr, size); } bool uvm_km_va_starved_p(void) { vmem_size_t total; vmem_size_t free; if (kmem_arena == NULL) return false; total = vmem_size(kmem_arena, VMEM_ALLOC|VMEM_FREE); free = vmem_size(kmem_arena, VMEM_FREE); return (free < (total / 10)); }
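/*
 * Hedged usage sketch (not part of uvm_km.c): a typical wired kernel
 * allocation and release through the interfaces above.  As the comment
 * on uvm_km_alloc() notes, a return value of 0 means no KVA or memory
 * was available and the caller must cope with it.  The example_* names
 * are illustrative.
 */
#include <sys/param.h>
#include <uvm/uvm_extern.h>

static void *
example_wired_alloc(vsize_t len)
{
	vaddr_t va;

	len = round_page(len);
	va = uvm_km_alloc(kernel_map, len, 0,
	    UVM_KMF_WIRED | UVM_KMF_ZERO | UVM_KMF_CANFAIL);
	if (va == 0)
		return NULL;			/* out of KVA or pages */
	return (void *)va;
}

static void
example_wired_free(void *p, vsize_t len)
{
	/* The size and the type flag must match the allocation. */
	uvm_km_free(kernel_map, (vaddr_t)p, round_page(len), UVM_KMF_WIRED);
}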
/* $NetBSD: subr_copy.c,v 1.19 2023/05/22 14:07:24 riastradh Exp $ */ /*- * Copyright (c) 1997, 1998, 1999, 2002, 2007, 2008, 2019 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_subr.c 8.4 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_copy.c,v 1.19 2023/05/22 14:07:24 riastradh Exp $"); #define __UFETCHSTORE_PRIVATE #define __UCAS_PRIVATE #include <sys/param.h> #include <sys/fcntl.h> #include <sys/proc.h> #include <sys/systm.h> #include <uvm/uvm_extern.h> void uio_setup_sysspace(struct uio *uio) { uio->uio_vmspace = vmspace_kernel(); } int uiomove(void *buf, size_t n, struct uio *uio) { struct vmspace *vm = uio->uio_vmspace; struct iovec *iov; size_t cnt; int error = 0; char *cp = buf; ASSERT_SLEEPABLE(); KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE); while (n > 0 && uio->uio_resid) { KASSERT(uio->uio_iovcnt > 0); iov = uio->uio_iov; cnt = iov->iov_len; if (cnt == 0) { KASSERT(uio->uio_iovcnt > 1); uio->uio_iov++; uio->uio_iovcnt--; continue; } if (cnt > n) cnt = n; if (!VMSPACE_IS_KERNEL_P(vm)) { preempt_point(); } if (uio->uio_rw == UIO_READ) { error = copyout_vmspace(vm, cp, iov->iov_base, cnt); } else { error = copyin_vmspace(vm, iov->iov_base, cp, cnt); } if (error) { break; } iov->iov_base = (char *)iov->iov_base + cnt; iov->iov_len -= cnt; uio->uio_resid -= cnt; uio->uio_offset += cnt; cp += cnt; KDASSERT(cnt <= n); n -= cnt; } return (error); } /* * Wrapper for uiomove() that validates the arguments against a known-good * kernel buffer. 
*/ int uiomove_frombuf(void *buf, size_t buflen, struct uio *uio) { size_t offset; if (uio->uio_offset < 0 || /* uio->uio_resid < 0 || */ (offset = uio->uio_offset) != uio->uio_offset) return (EINVAL); if (offset >= buflen) return (0); return (uiomove((char *)buf + offset, buflen - offset, uio)); } int uiopeek(void *buf, size_t n, struct uio *uio) { struct vmspace *vm = uio->uio_vmspace; struct iovec *iov; size_t cnt; int error = 0; char *cp = buf; size_t resid = uio->uio_resid; int iovcnt = uio->uio_iovcnt; char *base; size_t len; KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE); if (n == 0 || resid == 0) return 0; iov = uio->uio_iov; base = iov->iov_base; len = iov->iov_len; while (n > 0 && resid > 0) { KASSERT(iovcnt > 0); cnt = len; if (cnt == 0) { KASSERT(iovcnt > 1); iov++; iovcnt--; base = iov->iov_base; len = iov->iov_len; continue; } if (cnt > n) cnt = n; if (!VMSPACE_IS_KERNEL_P(vm)) { preempt_point(); } if (uio->uio_rw == UIO_READ) { error = copyout_vmspace(vm, cp, base, cnt); } else { error = copyin_vmspace(vm, base, cp, cnt); } if (error) { break; } base += cnt; len -= cnt; resid -= cnt; cp += cnt; KDASSERT(cnt <= n); n -= cnt; } return error; } void uioskip(size_t n, struct uio *uio) { struct iovec *iov; size_t cnt; KASSERTMSG(n <= uio->uio_resid, "n=%zu resid=%zu", n, uio->uio_resid); KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE); while (n > 0 && uio->uio_resid) { KASSERT(uio->uio_iovcnt > 0); iov = uio->uio_iov; cnt = iov->iov_len; if (cnt == 0) { KASSERT(uio->uio_iovcnt > 1); uio->uio_iov++; uio->uio_iovcnt--; continue; } if (cnt > n) cnt = n; iov->iov_base = (char *)iov->iov_base + cnt; iov->iov_len -= cnt; uio->uio_resid -= cnt; uio->uio_offset += cnt; KDASSERT(cnt <= n); n -= cnt; } } /* * Give next character to user as result of read. */ int ureadc(int c, struct uio *uio) { struct iovec *iov; if (uio->uio_resid <= 0) panic("ureadc: non-positive resid"); again: if (uio->uio_iovcnt <= 0) panic("ureadc: non-positive iovcnt"); iov = uio->uio_iov; if (iov->iov_len <= 0) { uio->uio_iovcnt--; uio->uio_iov++; goto again; } if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) { int error; if ((error = ustore_char(iov->iov_base, c)) != 0) return (error); } else { *(char *)iov->iov_base = c; } iov->iov_base = (char *)iov->iov_base + 1; iov->iov_len--; uio->uio_resid--; uio->uio_offset++; return (0); } /* * Like copyin(), but operates on an arbitrary vmspace. */ int copyin_vmspace(struct vmspace *vm, const void *uaddr, void *kaddr, size_t len) { struct iovec iov; struct uio uio; int error; if (len == 0) return (0); if (VMSPACE_IS_KERNEL_P(vm)) { return kcopy(uaddr, kaddr, len); } if (__predict_true(vm == curproc->p_vmspace)) { return copyin(uaddr, kaddr, len); } iov.iov_base = kaddr; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = (off_t)(uintptr_t)uaddr; uio.uio_resid = len; uio.uio_rw = UIO_READ; UIO_SETUP_SYSSPACE(&uio); error = uvm_io(&vm->vm_map, &uio, 0); return (error); } /* * Like copyout(), but operates on an arbitrary vmspace. 
*/ int copyout_vmspace(struct vmspace *vm, const void *kaddr, void *uaddr, size_t len) { struct iovec iov; struct uio uio; int error; if (len == 0) return (0); if (VMSPACE_IS_KERNEL_P(vm)) { return kcopy(kaddr, uaddr, len); } if (__predict_true(vm == curproc->p_vmspace)) { return copyout(kaddr, uaddr, len); } iov.iov_base = __UNCONST(kaddr); /* XXXUNCONST cast away const */ iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = (off_t)(uintptr_t)uaddr; uio.uio_resid = len; uio.uio_rw = UIO_WRITE; UIO_SETUP_SYSSPACE(&uio); error = uvm_io(&vm->vm_map, &uio, 0); return (error); } /* * Like copyin(), but operates on an arbitrary process. */ int copyin_proc(struct proc *p, const void *uaddr, void *kaddr, size_t len) { struct vmspace *vm; int error; error = proc_vmspace_getref(p, &vm); if (error) { return error; } error = copyin_vmspace(vm, uaddr, kaddr, len); uvmspace_free(vm); return error; } /* * Like copyout(), but operates on an arbitrary process. */ int copyout_proc(struct proc *p, const void *kaddr, void *uaddr, size_t len) { struct vmspace *vm; int error; error = proc_vmspace_getref(p, &vm); if (error) { return error; } error = copyout_vmspace(vm, kaddr, uaddr, len); uvmspace_free(vm); return error; } /* * Like copyin(), but operates on an arbitrary pid. */ int copyin_pid(pid_t pid, const void *uaddr, void *kaddr, size_t len) { struct proc *p; struct vmspace *vm; int error; mutex_enter(&proc_lock); p = proc_find(pid); if (p == NULL) { mutex_exit(&proc_lock); return ESRCH; } mutex_enter(p->p_lock); error = proc_vmspace_getref(p, &vm); mutex_exit(p->p_lock); mutex_exit(&proc_lock); if (error == 0) { error = copyin_vmspace(vm, uaddr, kaddr, len); uvmspace_free(vm); } return error; } /* * Like copyin(), except it operates on kernel addresses when the FKIOCTL * flag is passed in `ioctlflags' from the ioctl call. */ int ioctl_copyin(int ioctlflags, const void *src, void *dst, size_t len) { if (ioctlflags & FKIOCTL) return kcopy(src, dst, len); return copyin(src, dst, len); } /* * Like copyout(), except it operates on kernel addresses when the FKIOCTL * flag is passed in `ioctlflags' from the ioctl call. */ int ioctl_copyout(int ioctlflags, const void *src, void *dst, size_t len) { if (ioctlflags & FKIOCTL) return kcopy(src, dst, len); return copyout(src, dst, len); } /* * User-space CAS / fetch / store */ #ifdef __NO_STRICT_ALIGNMENT #define CHECK_ALIGNMENT(x) __nothing #else /* ! __NO_STRICT_ALIGNMENT */ static bool ufetchstore_aligned(uintptr_t uaddr, size_t size) { return (uaddr & (size - 1)) == 0; } #define CHECK_ALIGNMENT() \ do { \ if (!ufetchstore_aligned((uintptr_t)uaddr, sizeof(*uaddr))) \ return EFAULT; \ } while (/*CONSTCOND*/0) #endif /* __NO_STRICT_ALIGNMENT */ /* * __HAVE_UCAS_FULL platforms provide _ucas_32() and _ucas_64() themselves. * _RUMPKERNEL also provides it's own _ucas_32() and _ucas_64(). * * In all other cases, we provide generic implementations that work on * all platforms. 
*/ #if !defined(__HAVE_UCAS_FULL) && !defined(_RUMPKERNEL) #if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR) #include <sys/atomic.h> #include <sys/cpu.h> #include <sys/once.h> #include <sys/mutex.h> #include <sys/ipi.h> static int ucas_critical_splcookie; static volatile u_int ucas_critical_pausing_cpus; static u_int ucas_critical_ipi; static ONCE_DECL(ucas_critical_init_once) static void ucas_critical_cpu_gate(void *arg __unused) { int count = SPINLOCK_BACKOFF_MIN; KASSERT(atomic_load_relaxed(&ucas_critical_pausing_cpus) > 0); /* * Notify ucas_critical_wait that we have stopped. Using * store-release ensures all our memory operations up to the * IPI happen before the ucas -- no buffered stores on our end * can clobber it later on, for instance. * * Matches atomic_load_acquire in ucas_critical_wait -- turns * the following atomic_dec_uint into a store-release. */ membar_release(); atomic_dec_uint(&ucas_critical_pausing_cpus); /* * Wait for ucas_critical_exit to reopen the gate and let us * proceed. Using a load-acquire ensures the ucas happens * before any of our memory operations when we return from the * IPI and proceed -- we won't observe any stale cached value * that the ucas overwrote, for instance. * * Matches atomic_store_release in ucas_critical_exit. */ while (atomic_load_acquire(&ucas_critical_pausing_cpus) != (u_int)-1) { SPINLOCK_BACKOFF(count); } } static int ucas_critical_init(void) { ucas_critical_ipi = ipi_register(ucas_critical_cpu_gate, NULL); return 0; } static void ucas_critical_wait(void) { int count = SPINLOCK_BACKOFF_MIN; /* * Wait for all CPUs to stop at the gate. Using a load-acquire * ensures all memory operations before they stop at the gate * happen before the ucas -- no buffered stores in other CPUs * can clobber it later on, for instance. * * Matches membar_release/atomic_dec_uint (store-release) in * ucas_critical_cpu_gate. */ while (atomic_load_acquire(&ucas_critical_pausing_cpus) > 0) { SPINLOCK_BACKOFF(count); } } #endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */ static inline void ucas_critical_enter(lwp_t * const l) { #if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR) if (ncpu > 1) { RUN_ONCE(&ucas_critical_init_once, ucas_critical_init); /* * Acquire the mutex first, then go to splhigh() and * broadcast the IPI to lock all of the other CPUs * behind the gate. * * N.B. Going to splhigh() implicitly disables preemption, * so there's no need to do it explicitly. */ mutex_enter(&cpu_lock); ucas_critical_splcookie = splhigh(); ucas_critical_pausing_cpus = ncpu - 1; ipi_trigger_broadcast(ucas_critical_ipi, true); ucas_critical_wait(); return; } #endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */ KPREEMPT_DISABLE(l); } static inline void ucas_critical_exit(lwp_t * const l) { #if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR) if (ncpu > 1) { /* * Open the gate and notify all CPUs in * ucas_critical_cpu_gate that they can now proceed. * Using a store-release ensures the ucas happens * before any memory operations they issue after the * IPI -- they won't observe any stale cache of the * target word, for instance. * * Matches atomic_load_acquire in ucas_critical_cpu_gate. */ atomic_store_release(&ucas_critical_pausing_cpus, (u_int)-1); splx(ucas_critical_splcookie); mutex_exit(&cpu_lock); return; } #endif /* ! 
__HAVE_UCAS_MP && MULTIPROCESSOR */ KPREEMPT_ENABLE(l); } int _ucas_32(volatile uint32_t *uaddr, uint32_t old, uint32_t new, uint32_t *ret) { lwp_t * const l = curlwp; uint32_t *uva = ((void *)(uintptr_t)uaddr); int error; /* * Wire the user address down to avoid taking a page fault during * the critical section. */ error = uvm_vslock(l->l_proc->p_vmspace, uva, sizeof(*uaddr), VM_PROT_READ | VM_PROT_WRITE); if (error) return error; ucas_critical_enter(l); error = _ufetch_32(uva, ret); if (error == 0 && *ret == old) { error = _ustore_32(uva, new); } ucas_critical_exit(l); uvm_vsunlock(l->l_proc->p_vmspace, uva, sizeof(*uaddr)); return error; } #ifdef _LP64 int _ucas_64(volatile uint64_t *uaddr, uint64_t old, uint64_t new, uint64_t *ret) { lwp_t * const l = curlwp; uint64_t *uva = ((void *)(uintptr_t)uaddr); int error; /* * Wire the user address down to avoid taking a page fault during * the critical section. */ error = uvm_vslock(l->l_proc->p_vmspace, uva, sizeof(*uaddr), VM_PROT_READ | VM_PROT_WRITE); if (error) return error; ucas_critical_enter(l); error = _ufetch_64(uva, ret); if (error == 0 && *ret == old) { error = _ustore_64(uva, new); } ucas_critical_exit(l); uvm_vsunlock(l->l_proc->p_vmspace, uva, sizeof(*uaddr)); return error; } #endif /* _LP64 */ #endif /* ! __HAVE_UCAS_FULL && ! _RUMPKERNEL */ int ucas_32(volatile uint32_t *uaddr, uint32_t old, uint32_t new, uint32_t *ret) { ASSERT_SLEEPABLE(); CHECK_ALIGNMENT(); #if (defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)) && \ !defined(_RUMPKERNEL) if (ncpu > 1) { return _ucas_32_mp(uaddr, old, new, ret); } #endif /* __HAVE_UCAS_MP && MULTIPROCESSOR */ return _ucas_32(uaddr, old, new, ret); } #ifdef _LP64 int ucas_64(volatile uint64_t *uaddr, uint64_t old, uint64_t new, uint64_t *ret) { ASSERT_SLEEPABLE(); CHECK_ALIGNMENT(); #if (defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)) && \ !defined(_RUMPKERNEL) if (ncpu > 1) { return _ucas_64_mp(uaddr, old, new, ret); } #endif /* __HAVE_UCAS_MP && MULTIPROCESSOR */ return _ucas_64(uaddr, old, new, ret); } #endif /* _LP64 */ __strong_alias(ucas_int,ucas_32); #ifdef _LP64 __strong_alias(ucas_ptr,ucas_64); #else __strong_alias(ucas_ptr,ucas_32); #endif /* _LP64 */ int ufetch_8(const uint8_t *uaddr, uint8_t *valp) { ASSERT_SLEEPABLE(); CHECK_ALIGNMENT(); return _ufetch_8(uaddr, valp); } int ufetch_16(const uint16_t *uaddr, uint16_t *valp) { ASSERT_SLEEPABLE(); CHECK_ALIGNMENT(); return _ufetch_16(uaddr, valp); } int ufetch_32(const uint32_t *uaddr, uint32_t *valp) { ASSERT_SLEEPABLE(); CHECK_ALIGNMENT(); return _ufetch_32(uaddr, valp); } #ifdef _LP64 int ufetch_64(const uint64_t *uaddr, uint64_t *valp) { ASSERT_SLEEPABLE(); CHECK_ALIGNMENT(); return _ufetch_64(uaddr, valp); } #endif /* _LP64 */ __strong_alias(ufetch_char,ufetch_8); __strong_alias(ufetch_short,ufetch_16); __strong_alias(ufetch_int,ufetch_32); #ifdef _LP64 __strong_alias(ufetch_long,ufetch_64); __strong_alias(ufetch_ptr,ufetch_64); #else __strong_alias(ufetch_long,ufetch_32); __strong_alias(ufetch_ptr,ufetch_32); #endif /* _LP64 */ int ustore_8(uint8_t *uaddr, uint8_t val) { ASSERT_SLEEPABLE(); CHECK_ALIGNMENT(); return _ustore_8(uaddr, val); } int ustore_16(uint16_t *uaddr, uint16_t val) { ASSERT_SLEEPABLE(); CHECK_ALIGNMENT(); return _ustore_16(uaddr, val); } int ustore_32(uint32_t *uaddr, uint32_t val) { ASSERT_SLEEPABLE(); CHECK_ALIGNMENT(); return _ustore_32(uaddr, val); } #ifdef _LP64 int ustore_64(uint64_t *uaddr, uint64_t val) { ASSERT_SLEEPABLE(); CHECK_ALIGNMENT(); return _ustore_64(uaddr, val); } #endif /* _LP64 */ 
__strong_alias(ustore_char,ustore_8); __strong_alias(ustore_short,ustore_16); __strong_alias(ustore_int,ustore_32); #ifdef _LP64 __strong_alias(ustore_long,ustore_64); __strong_alias(ustore_ptr,ustore_64); #else __strong_alias(ustore_long,ustore_32); __strong_alias(ustore_ptr,ustore_32); #endif /* _LP64 */
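/*
 * Hedged usage sketch (not part of subr_copy.c): a compare-and-swap
 * retry loop over a word in user memory, built from ufetch_32() and
 * ucas_32() above.  Both functions may sleep, so the caller must be in
 * a sleepable context.  example_user_increment() is an illustrative
 * name, not an existing kernel function.
 */
#include <sys/types.h>
#include <sys/systm.h>

static int
example_user_increment(uint32_t *uaddr)
{
	uint32_t old, new, actual;
	int error;

	do {
		error = ufetch_32(uaddr, &old);	/* read current value */
		if (error)
			return error;
		new = old + 1;
		error = ucas_32(uaddr, old, new, &actual);
		if (error)
			return error;
	} while (actual != old);	/* raced with another update; retry */

	return 0;
}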
40 40 5 688 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 /* $NetBSD: subr_lwp_specificdata.c,v 1.4 2019/05/17 03:34:26 ozaki-r Exp $ */ /*- * Copyright (c) 2006 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #define _LWP_API_PRIVATE #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_lwp_specificdata.c,v 1.4 2019/05/17 03:34:26 ozaki-r Exp $"); #include <sys/param.h> #include <sys/lwp.h> #include <sys/specificdata.h> static specificdata_domain_t lwp_specificdata_domain; void lwpinit_specificdata(void) { lwp_specificdata_domain = specificdata_domain_create(); KASSERT(lwp_specificdata_domain != NULL); } /* * lwp_specific_key_create -- * Create a key for subsystem lwp-specific data. */ int lwp_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor) { return (specificdata_key_create(lwp_specificdata_domain, keyp, dtor)); } /* * lwp_specific_key_delete -- * Delete a key for subsystem lwp-specific data. */ void lwp_specific_key_delete(specificdata_key_t key) { specificdata_key_delete(lwp_specificdata_domain, key); } /* * lwp_initspecific -- * Initialize an LWP's specificdata container. */ void lwp_initspecific(struct lwp *l) { int error __diagused; error = specificdata_init(lwp_specificdata_domain, &l->l_specdataref); KASSERT(error == 0); } /* * lwp_finispecific -- * Finalize an LWP's specificdata container. */ void lwp_finispecific(struct lwp *l) { specificdata_fini(lwp_specificdata_domain, &l->l_specdataref); } /* * lwp_getspecific -- * Return lwp-specific data corresponding to the specified key. * * Note: LWP specific data is NOT INTERLOCKED. An LWP should access * only its OWN SPECIFIC DATA. If it is necessary to access another * LWP's specifc data, care must be taken to ensure that doing so * would not cause internal data structure inconsistency (i.e. 
caller * can guarantee that the target LWP is not inside an lwp_getspecific() * or lwp_setspecific() call). */ void * lwp_getspecific(specificdata_key_t key) { return (specificdata_getspecific_unlocked(lwp_specificdata_domain, &curlwp->l_specdataref, key)); } void * _lwp_getspecific_by_lwp(struct lwp *l, specificdata_key_t key) { return (specificdata_getspecific_unlocked(lwp_specificdata_domain, &l->l_specdataref, key)); } /* * lwp_setspecific -- * Set lwp-specific data corresponding to the specified key. */ void lwp_setspecific(specificdata_key_t key, void *data) { specificdata_setspecific(lwp_specificdata_domain, &curlwp->l_specdataref, key, data); } void lwp_setspecific_by_lwp(struct lwp *l, specificdata_key_t key, void *data) { specificdata_setspecific(lwp_specificdata_domain, &l->l_specdataref, key, data); }
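/*
 * Hedged usage sketch (not part of subr_lwp_specificdata.c): a subsystem
 * creates one key at initialization time and then hangs its own per-LWP
 * state off that key.  The destructor runs when the LWP is finalized.
 * The example_* names are illustrative.
 */
#include <sys/param.h>
#include <sys/lwp.h>
#include <sys/specificdata.h>
#include <sys/kmem.h>

static specificdata_key_t example_lwp_key;

static void
example_lwp_dtor(void *data)
{
	kmem_free(data, sizeof(int));
}

static void
example_subsystem_init(void)
{
	int error __diagused;

	error = lwp_specific_key_create(&example_lwp_key, example_lwp_dtor);
	KASSERT(error == 0);
}

static void
example_set_value(int value)
{
	int *p;

	/* An LWP may only touch its own specific data; see above. */
	p = lwp_getspecific(example_lwp_key);
	if (p == NULL) {
		p = kmem_alloc(sizeof(*p), KM_SLEEP);
		lwp_setspecific(example_lwp_key, p);
	}
	*p = value;
}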
/* $NetBSD: vm_machdep.c,v 1.46 2023/10/06 11:53:27 skrll Exp $ */ /*- * Copyright (c) 1982, 1986 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 */ /*- * Copyright (c) 1995 Charles M. Hannum. All rights reserved. * Copyright (c) 1989, 1990 William Jolitz * All rights reserved.
* * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 */ /* * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.46 2023/10/06 11:53:27 skrll Exp $"); #include "opt_mtrr.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/buf.h> #include <sys/core.h> #include <sys/exec.h> #include <sys/ptrace.h> #include <uvm/uvm.h> #include <machine/cpu.h> #include <machine/gdt.h> #include <machine/reg.h> #include <machine/specialreg.h> #ifdef MTRR #include <machine/mtrr.h> #endif #include <x86/fpu.h> #include <x86/dbregs.h> extern struct pool x86_dbregspl; void cpu_proc_fork(struct proc *p1, struct proc *p2) { p2->p_md.md_flags = p1->p_md.md_flags; } /* * cpu_lwp_fork: finish a new LWP (l2) operation. * * First LWP (l1) is the process being forked. If it is &lwp0, then we * are creating a kthread, where return path and argument are specified * with `func' and `arg'. * * If an alternate user-level stack is requested (with non-zero values * in both the stack and stacksize arguments), then set up the user stack * pointer accordingly. */ void cpu_lwp_fork(struct lwp *l1, struct lwp *l2, void *stack, size_t stacksize, void (*func)(void *), void *arg) { struct pcb *pcb1, *pcb2; struct trapframe *tf; struct switchframe *sf; vaddr_t uv; KASSERT(l1 == curlwp || l1 == &lwp0); pcb1 = lwp_getpcb(l1); pcb2 = lwp_getpcb(l2); /* Copy the PCB from parent, except the FPU state. */ memcpy(pcb2, pcb1, offsetof(struct pcb, pcb_savefpu)); /* Fork the FPU state. 
*/ fpu_lwp_fork(l1, l2); /* Never inherit CPU Debug Registers */ pcb2->pcb_dbregs = NULL; pcb2->pcb_flags &= ~PCB_DBREGS; #if defined(XENPV) pcb2->pcb_iopl = IOPL_KPL; #endif /* * Set the kernel stack address (from the address to uarea) and * trapframe address for child. * * Rig kernel stack so that it would start out in lwp_trampoline() * and call child_return() with l2 as an argument. This causes the * newly-created child process to go directly to user level with a * parent return value of 0 from fork(), while the parent process * returns normally. */ uv = uvm_lwp_getuarea(l2); KASSERT(uv % PAGE_SIZE == 0); #ifdef __x86_64__ #ifdef SVS pcb2->pcb_rsp0 = (uv + USPACE - PAGE_SIZE + sizeof(struct trapframe)); KASSERT((pcb2->pcb_rsp0 & 0xF) == 0); #else pcb2->pcb_rsp0 = (uv + USPACE - 16); #endif tf = (struct trapframe *)pcb2->pcb_rsp0 - 1; #else pcb2->pcb_esp0 = (uv + USPACE - 16); tf = (struct trapframe *)pcb2->pcb_esp0 - 1; pcb2->pcb_iomap = NULL; #endif l2->l_md.md_regs = tf; /* * Copy the trapframe from parent, so that return to userspace * will be to right address, with correct registers. */ memcpy(tf, l1->l_md.md_regs, sizeof(struct trapframe)); /* Child LWP might get aston() before returning to userspace. */ tf->tf_trapno = T_ASTFLT; /* If specified, set a different user stack for a child. */ if (stack != NULL) { #ifdef __x86_64__ tf->tf_rsp = (uint64_t)stack + stacksize; #else tf->tf_esp = (uint32_t)stack + stacksize; #endif } l2->l_md.md_flags = l1->l_md.md_flags; KASSERT(l2->l_md.md_astpending == 0); sf = (struct switchframe *)tf - 1; #ifdef __x86_64__ sf->sf_r12 = (uint64_t)func; sf->sf_r13 = (uint64_t)arg; sf->sf_rip = (uint64_t)lwp_trampoline; pcb2->pcb_rsp = (uint64_t)sf; pcb2->pcb_rbp = (uint64_t)l2; #else /* * XXX Is there a reason sf->sf_edi isn't initialized here? * Could this leak potentially sensitive information to new * userspace processes? */ sf->sf_esi = (int)func; sf->sf_ebx = (int)arg; sf->sf_eip = (int)lwp_trampoline; pcb2->pcb_esp = (int)sf; pcb2->pcb_ebp = (int)l2; #endif } /* * cpu_lwp_free is called from exit() to let machine-dependent * code free machine-dependent resources. Note that this routine * must not block. NB: this may be called with l != curlwp in * error paths. */ void cpu_lwp_free(struct lwp *l, int proc) { if (l != curlwp) return; /* Abandon the FPU state. */ fpu_lwp_abandon(l); /* Abandon the dbregs state. */ x86_dbregs_abandon(l); #ifdef MTRR if (proc && l->l_proc->p_md.md_flags & MDP_USEDMTRR) mtrr_clean(l->l_proc); #endif } /* * cpu_lwp_free2 is called when an LWP is being reaped. * This routine may block. */ void cpu_lwp_free2(struct lwp *l) { struct pcb *pcb; pcb = lwp_getpcb(l); KASSERT((pcb->pcb_flags & PCB_DBREGS) == 0); if (pcb->pcb_dbregs) { pool_put(&x86_dbregspl, pcb->pcb_dbregs); pcb->pcb_dbregs = NULL; } } /* * Convert kernel VA to physical address */ paddr_t kvtop(void *addr) { paddr_t pa; bool ret __diagused; ret = pmap_extract(pmap_kernel(), (vaddr_t)addr, &pa); KASSERT(ret == true); return pa; } /* * Map a user I/O request into kernel virtual address space. * Note: the pages are already locked by uvm_vslock(), so we * do not need to pass an access_type to pmap_enter(). 
*/ int vmapbuf(struct buf *bp, vsize_t len) { vaddr_t faddr, taddr, off; paddr_t fpa; KASSERT((bp->b_flags & B_PHYS) != 0); bp->b_saveaddr = bp->b_data; faddr = trunc_page((vaddr_t)bp->b_data); off = (vaddr_t)bp->b_data - faddr; len = round_page(off + len); taddr = uvm_km_alloc(phys_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA); bp->b_data = (void *)(taddr + off); /* * The region is locked, so we expect that pmap_extract() will return * true. * XXX: unwise to expect this in a multithreaded environment. * anything can happen to a pmap between the time we lock a * region, release the pmap lock, and then relock it for * the pmap_extract(). * * no need to flush TLB since we expect nothing to be mapped * where we just allocated (TLB will be flushed when our * mapping is removed). */ while (len) { (void) pmap_extract(vm_map_pmap(&bp->b_proc->p_vmspace->vm_map), faddr, &fpa); pmap_kenter_pa(taddr, fpa, VM_PROT_READ|VM_PROT_WRITE, 0); faddr += PAGE_SIZE; taddr += PAGE_SIZE; len -= PAGE_SIZE; } pmap_update(pmap_kernel()); return 0; } /* * Unmap a previously-mapped user I/O request. */ void vunmapbuf(struct buf *bp, vsize_t len) { vaddr_t addr, off; KASSERT((bp->b_flags & B_PHYS) != 0); addr = trunc_page((vaddr_t)bp->b_data); off = (vaddr_t)bp->b_data - addr; len = round_page(off + len); pmap_kremove(addr, len); pmap_update(pmap_kernel()); uvm_km_free(phys_map, addr, len, UVM_KMF_VAONLY); bp->b_data = bp->b_saveaddr; bp->b_saveaddr = 0; } #ifdef __HAVE_CPU_UAREA_ROUTINES /* * Layout of the uarea: * Page[0] = PCB * Page[1] = RedZone * Page[2] = Stack * Page[...] = Stack * Page[UPAGES-1] = Stack * Page[UPAGES] = RedZone * There is a redzone at the beginning of the stack, and another one at the * end. The former is to protect against deep recursions that could corrupt * the PCB, the latter to protect against severe stack overflows. */ void * cpu_uarea_alloc(bool system) { vaddr_t base, va; paddr_t pa; base = uvm_km_alloc(kernel_map, USPACE + PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_WAITVA); /* Page[1] = RedZone */ va = base + PAGE_SIZE; if (!pmap_extract(pmap_kernel(), va, &pa)) { panic("%s: impossible, Page[1] unmapped", __func__); } pmap_kremove(va, PAGE_SIZE); uvm_pagefree(PHYS_TO_VM_PAGE(pa)); /* Page[UPAGES] = RedZone */ va = base + USPACE; if (!pmap_extract(pmap_kernel(), va, &pa)) { panic("%s: impossible, Page[UPAGES] unmapped", __func__); } pmap_kremove(va, PAGE_SIZE); uvm_pagefree(PHYS_TO_VM_PAGE(pa)); pmap_update(pmap_kernel()); return (void *)base; } bool cpu_uarea_free(void *addr) { vaddr_t base = (vaddr_t)addr; KASSERT(!pmap_extract(pmap_kernel(), base + PAGE_SIZE, NULL)); KASSERT(!pmap_extract(pmap_kernel(), base + USPACE, NULL)); uvm_km_free(kernel_map, base, USPACE + PAGE_SIZE, UVM_KMF_WIRED); return true; } #endif /* __HAVE_CPU_UAREA_ROUTINES */
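/*
 * Illustrative sketch, not part of the original source: the calling
 * pattern that vmapbuf()/vunmapbuf() above are meant for.  A driver
 * doing raw I/O on a user buffer is expected to wire the user pages
 * first (the vmapbuf() comment notes they are already locked by
 * uvm_vslock()), then double-map them into kernel virtual address
 * space, perform the transfer, and undo both steps.  The
 * uvm_vslock()/uvm_vsunlock() prototypes are assumed from
 * <uvm/uvm_extern.h>, and the caller is assumed to have set B_PHYS
 * and bp->b_proc as physio-style code does; "example_user_io" is a
 * hypothetical name.
 */
static int
example_user_io(struct buf *bp, vsize_t len)
{
	struct vmspace *vm = bp->b_proc->p_vmspace;
	void *uaddr = bp->b_data;	/* original user address */
	int error;

	/* Wire the user pages; a device read must write user memory. */
	error = uvm_vslock(vm, uaddr, len,
	    (bp->b_flags & B_READ) != 0 ? VM_PROT_WRITE : VM_PROT_READ);
	if (error != 0)
		return error;

	/* Map the user pages into kernel VA (see vmapbuf() above). */
	error = vmapbuf(bp, len);
	if (error == 0) {
		/* ... the actual device transfer on bp->b_data goes here ... */

		/* Tear down the kernel mapping; restores bp->b_data. */
		vunmapbuf(bp, len);
	}

	/* Unwire the user pages again. */
	uvm_vsunlock(vm, uaddr, len);
	return error;
}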
/* $NetBSD: mfs_vnops.c,v 1.64 2022/03/19 13:48:42 hannken Exp $ */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: mfs_vnops.c,v 1.64 2022/03/19 13:48:42 hannken Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/vnode.h> #include <sys/kmem.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <machine/vmparam.h> #include <ufs/mfs/mfsnode.h> #include <ufs/mfs/mfs_extern.h> /* * mfs vnode operations.
*/ int (**mfs_vnodeop_p)(void *); const struct vnodeopv_entry_desc mfs_vnodeop_entries[] = { { &vop_default_desc, vn_default_error }, { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ { &vop_lookup_desc, genfs_badop }, /* lookup */ { &vop_create_desc, genfs_badop }, /* create */ { &vop_mknod_desc, genfs_badop }, /* mknod */ { &vop_open_desc, mfs_open }, /* open */ { &vop_close_desc, mfs_close }, /* close */ { &vop_access_desc, genfs_badop }, /* access */ { &vop_accessx_desc, genfs_badop }, /* accessx */ { &vop_getattr_desc, genfs_badop }, /* getattr */ { &vop_setattr_desc, genfs_badop }, /* setattr */ { &vop_read_desc, genfs_badop }, /* read */ { &vop_write_desc, genfs_badop }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */ { &vop_ioctl_desc, genfs_enoioctl }, /* ioctl */ { &vop_poll_desc, genfs_badop }, /* poll */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_mmap_desc, genfs_badop }, /* mmap */ { &vop_fsync_desc, spec_fsync }, /* fsync */ { &vop_seek_desc, genfs_badop }, /* seek */ { &vop_remove_desc, genfs_badop }, /* remove */ { &vop_link_desc, genfs_badop }, /* link */ { &vop_rename_desc, genfs_badop }, /* rename */ { &vop_mkdir_desc, genfs_badop }, /* mkdir */ { &vop_rmdir_desc, genfs_badop }, /* rmdir */ { &vop_symlink_desc, genfs_badop }, /* symlink */ { &vop_readdir_desc, genfs_badop }, /* readdir */ { &vop_readlink_desc, genfs_badop }, /* readlink */ { &vop_abortop_desc, genfs_badop }, /* abortop */ { &vop_inactive_desc, mfs_inactive }, /* inactive */ { &vop_reclaim_desc, mfs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, mfs_bmap }, /* bmap */ { &vop_strategy_desc, mfs_strategy }, /* strategy */ { &vop_print_desc, mfs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, genfs_badop }, /* pathconf */ { &vop_advlock_desc, genfs_badop }, /* advlock */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ { &vop_putpages_desc, genfs_null_putpages }, /* putpages */ { NULL, NULL } }; const struct vnodeopv_desc mfs_vnodeop_opv_desc = { &mfs_vnodeop_p, mfs_vnodeop_entries }; /* * Vnode Operations. * * Open called to allow memory filesystem to initialize and * validate before actual IO. Record our process identifier * so we can tell when we are doing I/O to ourself. */ /* ARGSUSED */ int mfs_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; if (ap->a_vp->v_type != VBLK) { panic("mfs_open not VBLK"); /* NOTREACHED */ } return (0); } /* * Pass I/O requests to the memory filesystem process. */ int mfs_strategy(void *v) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap = v; struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; struct mfsnode *mfsp; if (vp->v_type != VBLK || vrefcnt(vp) == 0) panic("mfs_strategy: bad dev"); mfsp = VTOMFS(vp); /* check for mini-root access */ if (mfsp->mfs_proc == NULL) { void *base; base = (char *)mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); if (bp->b_flags & B_READ) memcpy(bp->b_data, base, bp->b_bcount); else memcpy(base, bp->b_data, bp->b_bcount); bp->b_resid = 0; biodone(bp); } else if (mfsp->mfs_proc == curproc) { mfs_doio(bp, mfsp->mfs_baseoff); } else if (doing_shutdown) { /* * bitbucket I/O during shutdown. * Note that reads should *not* happen here, but.. 
*/ if (bp->b_flags & B_READ) printf("warning: mfs read during shutdown\n"); bp->b_resid = 0; biodone(bp); } else { mutex_enter(&mfs_lock); bufq_put(mfsp->mfs_buflist, bp); cv_broadcast(&mfsp->mfs_cv); mutex_exit(&mfs_lock); } return (0); } /* * Memory file system I/O. */ void mfs_doio(struct buf *bp, void *base) { base = (char *)base + (bp->b_blkno << DEV_BSHIFT); if (bp->b_flags & B_READ) bp->b_error = copyin(base, bp->b_data, bp->b_bcount); else bp->b_error = copyout(bp->b_data, base, bp->b_bcount); if (bp->b_error == 0) bp->b_resid = 0; biodone(bp); } /* * This is a noop, simply returning what one has been given. */ int mfs_bmap(void *v) { struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; } */ *ap = v; if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; return (0); } /* * Memory filesystem close routine */ /* ARGSUSED */ int mfs_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct mfsnode *mfsp = VTOMFS(vp); struct buf *bp; int error; /* * Finish any pending I/O requests. */ mutex_enter(&mfs_lock); while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) { mutex_exit(&mfs_lock); mfs_doio(bp, mfsp->mfs_baseoff); mutex_enter(&mfs_lock); } mutex_exit(&mfs_lock); /* * On last close of a memory filesystem * we must invalidate any in core blocks, so that * we can, free up its vnode. */ if ((error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0)) != 0) return (error); /* * There should be no way to have any more uses of this * vnode, so if we find any other uses, it is a panic. */ if (bufq_peek(mfsp->mfs_buflist) != NULL) panic("mfs_close"); /* * Send a request to the filesystem server to exit. */ mutex_enter(&mfs_lock); mfsp->mfs_shutdown = 1; cv_broadcast(&mfsp->mfs_cv); mutex_exit(&mfs_lock); return (0); } /* * Memory filesystem inactive routine */ /* ARGSUSED */ int mfs_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; struct mfsnode *mfsp = VTOMFS(vp); if (bufq_peek(mfsp->mfs_buflist) != NULL) panic("mfs_inactive: not inactive (mfs_buflist %p)", bufq_peek(mfsp->mfs_buflist)); return VOCALL(spec_vnodeop_p, VOFFSET(vop_inactive), ap); } /* * Reclaim a memory filesystem devvp so that it can be reused. */ int mfs_reclaim(void *v) { struct vop_reclaim_v2_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; struct mfsnode *mfsp = VTOMFS(vp); int refcnt; mutex_enter(&mfs_lock); vp->v_data = NULL; refcnt = --mfsp->mfs_refcnt; mutex_exit(&mfs_lock); if (refcnt == 0) { bufq_free(mfsp->mfs_buflist); cv_destroy(&mfsp->mfs_cv); kmem_free(mfsp, sizeof(*mfsp)); } return VOCALL(spec_vnodeop_p, VOFFSET(vop_reclaim), ap); } /* * Print out the contents of an mfsnode. */ int mfs_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; struct mfsnode *mfsp = VTOMFS(ap->a_vp); printf("tag VT_MFS, pid %d, base %p, size %ld\n", (mfsp->mfs_proc != NULL) ? mfsp->mfs_proc->p_pid : 0, mfsp->mfs_baseoff, mfsp->mfs_size); return (0); }
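/*
 * Illustrative sketch, not part of the original source: the consumer
 * side of the hand-off done by mfs_strategy() above.  The real loop
 * lives in mfs_start() in mfs_vfsops.c; this is only an approximation
 * written against the same primitives used in this file.  Buffers
 * queued on mfs_buflist under mfs_lock are picked up and serviced
 * with mfs_doio(), and mfs_cv carries both "new work" wakeups from
 * mfs_strategy() and the shutdown request set by mfs_close().
 * "example_mfs_server_loop" is a hypothetical name.
 */
static void
example_mfs_server_loop(struct mfsnode *mfsp)
{
	struct buf *bp;

	mutex_enter(&mfs_lock);
	while (mfsp->mfs_shutdown == 0) {
		/* Drain the queue, doing the copies without mfs_lock held. */
		while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) {
			mutex_exit(&mfs_lock);
			mfs_doio(bp, mfsp->mfs_baseoff);
			mutex_enter(&mfs_lock);
		}
		if (mfsp->mfs_shutdown != 0)
			break;
		/* Sleep until mfs_strategy() or mfs_close() signals us. */
		cv_wait(&mfsp->mfs_cv, &mfs_lock);
	}
	mutex_exit(&mfs_lock);
}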
/* $NetBSD: vfs_cache.c,v 1.156 2023/10/02 21:50:18 ad Exp $ */ /*- * Copyright (c) 2008, 2019, 2020, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94 */ /* * Name caching: * * Names found by directory scans are retained in a cache for future * reference. It is managed LRU, so frequently used names will hang * around. The cache is indexed by hash value obtained from the name. * * The name cache is the brainchild of Robert Elz and was introduced in * 4.3BSD.
See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk * McKusick, May 21 1984. * * Data structures: * * Most Unix namecaches very sensibly use a global hash table to index * names. The global hash table works well, but can cause concurrency * headaches for the kernel hacker. In the NetBSD 10.0 implementation * we are not sensible, and use a per-directory data structure to index * names, but the cache otherwise functions the same. * * The index is a red-black tree. It should not be difficult to * experiment with other types of index, however note that a tree * can trivially be made to support lockless lookup. * * Each cached name is stored in a struct namecache, along with a * pointer to the associated vnode (nc_vp). Names longer than a * maximum length of NCHNAMLEN are allocated with kmem_alloc(); they * occur infrequently, and names shorter than this are stored directly * in struct namecache. If it is a "negative" entry, (i.e. for a name * that is known NOT to exist) the vnode pointer will be NULL. * * In practice this implementation is not any slower than the hash * table that preceeded it and in some cases it significantly * outperforms the hash table. Some reasons why this might be: * * - natural partitioning provided by the file system structure, which * the prior implementation discarded (global hash table). * - worst case tree traversal of O(log n), the hash table could have * many collisions. * - minimized cache misses & total L2/L3 CPU cache footprint; struct * namecache and vnode_impl_t are laid out to keep cache footprint * minimal in the lookup path; no hash table buckets to cache. * - minimized number of conditionals & string comparisons. * * For a directory with 3 cached names for 3 distinct vnodes, the * various vnodes and namecache structs would be connected like this * (the root is at the bottom of the diagram): * * ... * ^ * |- vi_nc_tree * | * +----o----+ +---------+ +---------+ * | VDIR | | VCHR | | VREG | * | vnode o-----+ | vnode o-----+ | vnode o------+ * +---------+ | +---------+ | +---------+ | * ^ | ^ | ^ | * |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp | * | | | | | | * +----o----+ | +----o----+ | +----o----+ | * +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+ * | +---------+ | +---------+ | +---------+ * | ^ | ^ | ^ * | | | | | | * | | +----------------------+ | | * |-nc_dvp | +-------------------------------------------------+ * | |/- vi_nc_tree | | * | | |- nc_dvp |- nc_dvp * | +----o----+ | | * +-->| VDIR |<----------+ | * | vnode |<------------------------------------+ * +---------+ * * START HERE * * Replacement: * * As the cache becomes full, old and unused entries are purged as new * entries are added. The synchronization overhead in maintaining a * strict ordering would be prohibitive, so the VM system's "clock" or * "second chance" page replacement algorithm is aped here. New * entries go to the tail of the active list. After they age out and * reach the head of the list, they are moved to the tail of the * inactive list. Any use of the deactivated cache entry reactivates * it, saving it from impending doom; if not reactivated, the entry * eventually reaches the head of the inactive list and is purged. * * Concurrency: * * From a performance perspective, cache_lookup(nameiop == LOOKUP) is * what really matters; insertion of new entries with cache_enter() is * comparatively infrequent, and overshadowed by the cost of expensive * file system metadata operations (which may involve disk I/O). 
We * therefore want to make everything simplest in the lookup path. * * struct namecache is mostly stable except for list and tree related * entries, changes to which don't affect the cached name or vnode. * For changes to name+vnode, entries are purged in preference to * modifying them. * * Read access to namecache entries is made via tree, list, or LRU * list. A lock corresponding to the direction of access should be * held. See definition of "struct namecache" in src/sys/namei.src, * and the definition of "struct vnode" for the particulars. * * Per-CPU statistics, and LRU list totals are read unlocked, since an * approximate value is OK. We maintain 32-bit sized per-CPU counters * and 64-bit global counters since 32-bit sized counters can be * observed locklessly while the global counters are protected by a * mutex. * * The lock order is: * * 1) vi->vi_nc_lock (tree or parent -> child direction, * used during forward lookup) * * 2) vi->vi_nc_listlock (list or child -> parent direction, * used during reverse lookup) * * 3) cache_lru_lock (LRU list direction, used during reclaim) */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.156 2023/10/02 21:50:18 ad Exp $"); #define __NAMECACHE_PRIVATE #ifdef _KERNEL_OPT #include "opt_ddb.h" #include "opt_dtrace.h" #endif #include <sys/param.h> #include <sys/types.h> #include <sys/atomic.h> #include <sys/callout.h> #include <sys/cpu.h> #include <sys/errno.h> #include <sys/evcnt.h> #include <sys/hash.h> #include <sys/kernel.h> #include <sys/mount.h> #include <sys/mutex.h> #include <sys/namei.h> #include <sys/param.h> #include <sys/pool.h> #include <sys/sdt.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/vnode_impl.h> #include <miscfs/genfs/genfs.h> /* * Assert that data structure layout hasn't changed unintentionally. */ #ifdef _LP64 CTASSERT(sizeof(struct namecache) == 128); #else CTASSERT(sizeof(struct namecache) == 64); #endif CTASSERT(NC_NLEN_MASK >= MAXPATHLEN); static void cache_activate(struct namecache *); static void cache_update_stats(void *); static int cache_compare_nodes(void *, const void *, const void *); static void cache_deactivate(void); static void cache_reclaim(void); static int cache_stat_sysctl(SYSCTLFN_ARGS); /* * Global pool cache. */ static pool_cache_t cache_pool __read_mostly; /* * LRU replacement. */ enum cache_lru_id { LRU_ACTIVE, LRU_INACTIVE, LRU_COUNT }; static struct { TAILQ_HEAD(, namecache) list[LRU_COUNT]; u_int count[LRU_COUNT]; } cache_lru __cacheline_aligned; static kmutex_t cache_lru_lock __cacheline_aligned; /* * Cache effectiveness statistics. nchstats holds system-wide total. */ struct nchstats nchstats; struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t); struct nchcpu { struct nchstats_percpu cur; struct nchstats_percpu last; }; static callout_t cache_stat_callout; static kmutex_t cache_stat_lock __cacheline_aligned; #define COUNT(f) do { \ lwp_t *l = curlwp; \ KPREEMPT_DISABLE(l); \ struct nchcpu *nchcpu = curcpu()->ci_data.cpu_nch; \ nchcpu->cur.f++; \ KPREEMPT_ENABLE(l); \ } while (/* CONSTCOND */ 0); #define UPDATE(nchcpu, f) do { \ uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \ nchstats.f += (uint32_t)(cur - nchcpu->last.f); \ nchcpu->last.f = cur; \ } while (/* CONSTCOND */ 0) /* * Tunables. cache_maxlen replaces the historical doingcache: * set it zero to disable caching for debugging purposes. 
*/ int cache_lru_maxdeact __read_mostly = 2; /* max # to deactivate */ int cache_lru_maxscan __read_mostly = 64; /* max # to scan/reclaim */ int cache_maxlen __read_mostly = NC_NLEN_MASK; /* max name length to cache */ int cache_stat_interval __read_mostly = 300; /* in seconds */ /* * sysctl stuff. */ static struct sysctllog *cache_sysctllog; /* * This is a dummy name that cannot usually occur anywhere in the cache nor * file system. It's used when caching the root vnode of mounted file * systems. The name is attached to the directory that the file system is * mounted on. */ static const char cache_mp_name[] = ""; static const int cache_mp_nlen = sizeof(cache_mp_name) - 1; /* * Red-black tree stuff. */ static const rb_tree_ops_t cache_rbtree_ops = { .rbto_compare_nodes = cache_compare_nodes, .rbto_compare_key = cache_compare_nodes, .rbto_node_offset = offsetof(struct namecache, nc_tree), .rbto_context = NULL }; /* * dtrace probes. */ SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t"); SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", "size_t"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *", "char *", "size_t"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *", "char *", "size_t"); SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *", "int"); SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int"); SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *", "char *", "size_t"); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", "size_t"); /* * rbtree: compare two nodes. */ static int cache_compare_nodes(void *context, const void *n1, const void *n2) { const struct namecache *nc1 = n1; const struct namecache *nc2 = n2; if (nc1->nc_key < nc2->nc_key) { return -1; } if (nc1->nc_key > nc2->nc_key) { return 1; } KASSERT(NC_NLEN(nc1) == NC_NLEN(nc2)); return memcmp(nc1->nc_name, nc2->nc_name, NC_NLEN(nc1)); } /* * Compute a key value for the given name. The name length is encoded in * the key value to try and improve uniqueness, and so that length doesn't * need to be compared separately for string comparisons. */ static uintptr_t cache_key(const char *name, size_t nlen) { uintptr_t key; KASSERT((nlen & ~NC_NLEN_MASK) == 0); key = hash32_buf(name, nlen, HASH32_STR_INIT); return (key << NC_NLEN_BITS) | (uintptr_t)nlen; } /* * Remove an entry from the cache. vi_nc_lock must be held, and if dir2node * is true, then we're locking in the conventional direction and the list * lock will be acquired when removing the entry from the vnode list. */ static void cache_remove(struct namecache *ncp, const bool dir2node) { struct vnode *vp, *dvp = ncp->nc_dvp; vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); size_t namelen = NC_NLEN(ncp); KASSERT(rw_write_held(&dvi->vi_nc_lock)); KASSERT(cache_key(ncp->nc_name, namelen) == ncp->nc_key); KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp); SDT_PROBE(vfs, namecache, invalidate, done, ncp, 0, 0, 0, 0); /* * Remove from the vnode's list. This excludes cache_revlookup(), * and then it's safe to remove from the LRU lists. 
*/ if ((vp = ncp->nc_vp) != NULL) { vnode_impl_t *vi = VNODE_TO_VIMPL(vp); if (__predict_true(dir2node)) { rw_enter(&vi->vi_nc_listlock, RW_WRITER); TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); rw_exit(&vi->vi_nc_listlock); } else { TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); } } /* Remove from the directory's rbtree. */ rb_tree_remove_node(&dvi->vi_nc_tree, ncp); /* Remove from the LRU lists. */ mutex_enter(&cache_lru_lock); TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); cache_lru.count[ncp->nc_lrulist]--; mutex_exit(&cache_lru_lock); /* Finally, free it. */ if (namelen > NCHNAMLEN) { size_t sz = offsetof(struct namecache, nc_name[namelen]); kmem_free(ncp, sz); } else { pool_cache_put(cache_pool, ncp); } } /* * Find a single cache entry and return it. vi_nc_lock must be held. */ static struct namecache * __noinline cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen, uintptr_t key) { vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); struct rb_node *node = dvi->vi_nc_tree.rbt_root; struct namecache *ncp; enum cache_lru_id lrulist; int diff; KASSERT(namelen <= MAXPATHLEN); KASSERT(rw_lock_held(&dvi->vi_nc_lock)); /* * Search the RB tree for the key. This is an inlined lookup * tailored for exactly what's needed here that turns out to be * quite a bit faster than using rb_tree_find_node(). * * For a matching key memcmp() needs to be called once to confirm * that the correct name has been found. Very rarely there will be * a key value collision and the search will continue. */ for (;;) { if (__predict_false(RB_SENTINEL_P(node))) { return NULL; } ncp = (struct namecache *)node; KASSERT((void *)&ncp->nc_tree == (void *)ncp); KASSERT(ncp->nc_dvp == dvp); if (ncp->nc_key == key) { KASSERT(NC_NLEN(ncp) == namelen); diff = memcmp(ncp->nc_name, name, namelen); if (__predict_true(diff == 0)) { break; } node = node->rb_nodes[diff < 0]; } else { node = node->rb_nodes[ncp->nc_key < key]; } } /* * If the entry is on the wrong LRU list, requeue it. This is an * unlocked check, but it will rarely be wrong and even then there * will be no harm caused. */ lrulist = atomic_load_relaxed(&ncp->nc_lrulist); if (__predict_false(lrulist != LRU_ACTIVE)) { cache_activate(ncp); } return ncp; } /* * Look for a the name in the cache. We don't do this * if the segment name is long, simply so the cache can avoid * holding long names (which would either waste space, or * add greatly to the complexity). * * Lookup is called with DVP pointing to the directory to search, * and CNP providing the name of the entry being sought: cn_nameptr * is the name, cn_namelen is its length, and cn_flags is the flags * word from the namei operation. * * DVP must be locked. * * There are three possible non-error return states: * 1. Nothing was found in the cache. Nothing is known about * the requested name. * 2. A negative entry was found in the cache, meaning that the * requested name definitely does not exist. * 3. A positive entry was found in the cache, meaning that the * requested name does exist and that we are providing the * vnode. * In these cases the results are: * 1. 0 returned; VN is set to NULL. * 2. 1 returned; VN is set to NULL. * 3. 1 returned; VN is set to the vnode found. * * The additional result argument ISWHT is set to zero, unless a * negative entry is found that was entered as a whiteout, in which * case ISWHT is set to one. * * The ISWHT_RET argument pointer may be null. In this case an * assertion is made that the whiteout flag is not set. 
File systems * that do not support whiteouts can/should do this. * * Filesystems that do support whiteouts should add ISWHITEOUT to * cnp->cn_flags if ISWHT comes back nonzero. * * When a vnode is returned, it is locked, as per the vnode lookup * locking protocol. * * There is no way for this function to fail, in the sense of * generating an error that requires aborting the namei operation. * * (Prior to October 2012, this function returned an integer status, * and a vnode, and mucked with the flags word in CNP for whiteouts. * The integer status was -1 for "nothing found", ENOENT for "a * negative entry found", 0 for "a positive entry found", and possibly * other errors, and the value of VN might or might not have been set * depending on what error occurred.) */ bool cache_lookup(struct vnode *dvp, const char *name, size_t namelen, uint32_t nameiop, uint32_t cnflags, int *iswht_ret, struct vnode **vn_ret) { vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); struct namecache *ncp; struct vnode *vp; uintptr_t key; int error; bool hit; krw_t op; KASSERT(namelen != cache_mp_nlen || name == cache_mp_name); /* Establish default result values */ if (iswht_ret != NULL) { *iswht_ret = 0; } *vn_ret = NULL; if (__predict_false(namelen > cache_maxlen)) { SDT_PROBE(vfs, namecache, lookup, toolong, dvp, name, namelen, 0, 0); COUNT(ncs_long); return false; } /* Compute the key up front - don't need the lock. */ key = cache_key(name, namelen); /* Could the entry be purged below? */ if ((cnflags & ISLASTCN) != 0 && ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) { op = RW_WRITER; } else { op = RW_READER; } /* Now look for the name. */ rw_enter(&dvi->vi_nc_lock, op); ncp = cache_lookup_entry(dvp, name, namelen, key); if (__predict_false(ncp == NULL)) { rw_exit(&dvi->vi_nc_lock); COUNT(ncs_miss); SDT_PROBE(vfs, namecache, lookup, miss, dvp, name, namelen, 0, 0); return false; } if (__predict_false((cnflags & MAKEENTRY) == 0)) { /* * Last component and we are renaming or deleting, * the cache entry is invalid, or otherwise don't * want cache entry to exist. */ KASSERT((cnflags & ISLASTCN) != 0); cache_remove(ncp, true); rw_exit(&dvi->vi_nc_lock); COUNT(ncs_badhits); return false; } if ((vp = ncp->nc_vp) == NULL) { if (iswht_ret != NULL) { /* * Restore the ISWHITEOUT flag saved earlier. */ *iswht_ret = ncp->nc_whiteout; } else { KASSERT(!ncp->nc_whiteout); } if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) { /* * Last component and we are preparing to create * the named object, so flush the negative cache * entry. */ COUNT(ncs_badhits); cache_remove(ncp, true); hit = false; } else { COUNT(ncs_neghits); SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); /* found neg entry; vn is already null from above */ hit = true; } rw_exit(&dvi->vi_nc_lock); return hit; } error = vcache_tryvget(vp); rw_exit(&dvi->vi_nc_lock); if (error) { KASSERT(error == EBUSY); /* * This vnode is being cleaned out. * XXX badhits? */ COUNT(ncs_falsehits); return false; } COUNT(ncs_goodhits); SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); /* found it */ *vn_ret = vp; return true; } /* * Version of the above without the nameiop argument, for NFS. */ bool cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen, uint32_t cnflags, int *iswht_ret, struct vnode **vn_ret) { return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY, iswht_ret, vn_ret); } /* * Used by namei() to walk down a path, component by component by looking up * names in the cache. 
The node locks are chained along the way: a parent's * lock is not dropped until the child's is acquired. */ bool cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen, struct vnode **vn_ret, krwlock_t **plock, kauth_cred_t cred) { vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); struct namecache *ncp; krwlock_t *oldlock, *newlock; struct vnode *vp; uintptr_t key; int error; KASSERT(namelen != cache_mp_nlen || name == cache_mp_name); /* If disabled, or file system doesn't support this, bail out. */ if (__predict_false((dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) { return false; } if (__predict_false(namelen > cache_maxlen)) { COUNT(ncs_long); return false; } /* Compute the key up front - don't need the lock. */ key = cache_key(name, namelen); /* * Acquire the directory lock. Once we have that, we can drop the * previous one (if any). * * The two lock holds mean that the directory can't go away while * here: the directory must be purged with cache_purge() before * being freed, and both parent & child's vi_nc_lock must be taken * before that point is passed. * * However if there's no previous lock, like at the root of the * chain, then "dvp" must be referenced to prevent dvp going away * before we get its lock. * * Note that the two locks can be the same if looking up a dot, for * example: /usr/bin/. If looking up the parent (..) we can't wait * on the lock as child -> parent is the wrong direction. */ if (*plock != &dvi->vi_nc_lock) { oldlock = *plock; newlock = &dvi->vi_nc_lock; if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) { return false; } } else { oldlock = NULL; newlock = NULL; if (*plock == NULL) { KASSERT(vrefcnt(dvp) > 0); } } /* * First up check if the user is allowed to look up files in this * directory. */ if (cred != FSCRED) { if (dvi->vi_nc_mode == VNOVAL) { if (newlock != NULL) { rw_exit(newlock); } return false; } KASSERT(dvi->vi_nc_uid != VNOVAL); KASSERT(dvi->vi_nc_gid != VNOVAL); error = kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(VEXEC, dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL, genfs_can_access(dvp, cred, dvi->vi_nc_uid, dvi->vi_nc_gid, dvi->vi_nc_mode & ALLPERMS, NULL, VEXEC)); if (error != 0) { if (newlock != NULL) { rw_exit(newlock); } COUNT(ncs_denied); return false; } } /* * Now look for a matching cache entry. */ ncp = cache_lookup_entry(dvp, name, namelen, key); if (__predict_false(ncp == NULL)) { if (newlock != NULL) { rw_exit(newlock); } COUNT(ncs_miss); SDT_PROBE(vfs, namecache, lookup, miss, dvp, name, namelen, 0, 0); return false; } if ((vp = ncp->nc_vp) == NULL) { /* found negative entry; vn is already null from above */ KASSERT(namelen != cache_mp_nlen); KASSERT(name != cache_mp_name); COUNT(ncs_neghits); } else { COUNT(ncs_goodhits); /* XXX can be "badhits" */ } SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); /* * Return with the directory lock still held. It will either be * returned to us with another call to cache_lookup_linked() when * looking up the next component, or the caller will release it * manually when finished. */ if (oldlock) { rw_exit(oldlock); } if (newlock) { *plock = newlock; } *vn_ret = vp; return true; } /* * Scan cache looking for name of directory entry pointing at vp. * Will not search for "." or "..". * * If the lookup succeeds the vnode is referenced and stored in dvpp. * * If bufp is non-NULL, also place the name in the buffer which starts * at bufp, immediately before *bpp, and move bpp backwards to point * at the start of it. 
(Yes, this is a little baroque, but it's done * this way to cater to the whims of getcwd). * * Returns 0 on success, -1 on cache miss, positive errno on failure. */ int cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp, bool checkaccess, accmode_t accmode) { vnode_impl_t *vi = VNODE_TO_VIMPL(vp); struct namecache *ncp; enum cache_lru_id lrulist; struct vnode *dvp; int error, nlen; char *bp; KASSERT(vp != NULL); if (cache_maxlen == 0) goto out; rw_enter(&vi->vi_nc_listlock, RW_READER); if (checkaccess) { /* * Check if the user is allowed to see. NOTE: this is * checking for access on the "wrong" directory. getcwd() * wants to see that there is access on every component * along the way, not that there is access to any individual * component. Don't use this to check you can look in vp. * * I don't like it, I didn't come up with it, don't blame me! */ if (vi->vi_nc_mode == VNOVAL) { rw_exit(&vi->vi_nc_listlock); return -1; } KASSERT(vi->vi_nc_uid != VNOVAL); KASSERT(vi->vi_nc_gid != VNOVAL); error = kauth_authorize_vnode(kauth_cred_get(), KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode & ALLPERMS), vp, NULL, genfs_can_access(vp, curlwp->l_cred, vi->vi_nc_uid, vi->vi_nc_gid, vi->vi_nc_mode & ALLPERMS, NULL, accmode)); if (error != 0) { rw_exit(&vi->vi_nc_listlock); COUNT(ncs_denied); return EACCES; } } TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) { KASSERT(ncp->nc_vp == vp); KASSERT(ncp->nc_dvp != NULL); nlen = NC_NLEN(ncp); /* * Ignore mountpoint entries. */ if (nlen == cache_mp_nlen) { continue; } /* * The queue is partially sorted. Once we hit dots, nothing * else remains but dots and dotdots, so bail out. */ if (ncp->nc_name[0] == '.') { if (nlen == 1 || (nlen == 2 && ncp->nc_name[1] == '.')) { break; } } /* * Record a hit on the entry. This is an unlocked read but * even if wrong it doesn't matter too much. */ lrulist = atomic_load_relaxed(&ncp->nc_lrulist); if (lrulist != LRU_ACTIVE) { cache_activate(ncp); } if (bufp) { bp = *bpp; bp -= nlen; if (bp <= bufp) { *dvpp = NULL; rw_exit(&vi->vi_nc_listlock); SDT_PROBE(vfs, namecache, revlookup, fail, vp, ERANGE, 0, 0, 0); return (ERANGE); } memcpy(bp, ncp->nc_name, nlen); *bpp = bp; } dvp = ncp->nc_dvp; error = vcache_tryvget(dvp); rw_exit(&vi->vi_nc_listlock); if (error) { KASSERT(error == EBUSY); if (bufp) (*bpp) += nlen; *dvpp = NULL; SDT_PROBE(vfs, namecache, revlookup, fail, vp, error, 0, 0, 0); return -1; } *dvpp = dvp; SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp, 0, 0, 0); COUNT(ncs_revhits); return (0); } rw_exit(&vi->vi_nc_listlock); COUNT(ncs_revmiss); out: *dvpp = NULL; return (-1); } /* * Add an entry to the cache. */ void cache_enter(struct vnode *dvp, struct vnode *vp, const char *name, size_t namelen, uint32_t cnflags) { vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); struct namecache *ncp, *oncp; int total; KASSERT(namelen != cache_mp_nlen || name == cache_mp_name); /* First, check whether we can/should add a cache entry. */ if ((cnflags & MAKEENTRY) == 0 || __predict_false(namelen > cache_maxlen)) { SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen, 0, 0); return; } SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0); /* * Reclaim some entries if over budget. This is an unlocked check, * but it doesn't matter. Just need to catch up with things * eventually: it doesn't matter if we go over temporarily. 
*/ total = atomic_load_relaxed(&cache_lru.count[LRU_ACTIVE]); total += atomic_load_relaxed(&cache_lru.count[LRU_INACTIVE]); if (__predict_false(total > desiredvnodes)) { cache_reclaim(); } /* Now allocate a fresh entry. */ if (__predict_true(namelen <= NCHNAMLEN)) { ncp = pool_cache_get(cache_pool, PR_WAITOK); } else { size_t sz = offsetof(struct namecache, nc_name[namelen]); ncp = kmem_alloc(sz, KM_SLEEP); } /* * Fill in cache info. For negative hits, save the ISWHITEOUT flag * so we can restore it later when the cache entry is used again. */ ncp->nc_vp = vp; ncp->nc_dvp = dvp; ncp->nc_key = cache_key(name, namelen); ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0); memcpy(ncp->nc_name, name, namelen); /* * Insert to the directory. Concurrent lookups may race for a cache * entry. If there's a entry there already, purge it. */ rw_enter(&dvi->vi_nc_lock, RW_WRITER); oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp); if (oncp != ncp) { KASSERT(oncp->nc_key == ncp->nc_key); KASSERT(NC_NLEN(oncp) == NC_NLEN(ncp)); KASSERT(memcmp(oncp->nc_name, name, namelen) == 0); cache_remove(oncp, true); oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp); KASSERT(oncp == ncp); } /* * With the directory lock still held, insert to the tail of the * ACTIVE LRU list (new) and take the opportunity to incrementally * balance the lists. */ mutex_enter(&cache_lru_lock); ncp->nc_lrulist = LRU_ACTIVE; cache_lru.count[LRU_ACTIVE]++; TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); cache_deactivate(); mutex_exit(&cache_lru_lock); /* * Finally, insert to the vnode and unlock. With everything set up * it's safe to let cache_revlookup() see the entry. Partially sort * the per-vnode list: dots go to back so cache_revlookup() doesn't * have to consider them. */ if (vp != NULL) { vnode_impl_t *vi = VNODE_TO_VIMPL(vp); rw_enter(&vi->vi_nc_listlock, RW_WRITER); if ((namelen == 1 && name[0] == '.') || (namelen == 2 && name[0] == '.' && name[1] == '.')) { TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list); } else { TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list); } rw_exit(&vi->vi_nc_listlock); } rw_exit(&dvi->vi_nc_lock); } /* * Set identity info in cache for a vnode. We only care about directories * so ignore other updates. The cached info may be marked invalid if the * inode has an ACL. */ void cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid, bool valid) { vnode_impl_t *vi = VNODE_TO_VIMPL(vp); if (vp->v_type == VDIR) { /* Grab both locks, for forward & reverse lookup. */ rw_enter(&vi->vi_nc_lock, RW_WRITER); rw_enter(&vi->vi_nc_listlock, RW_WRITER); if (valid) { vi->vi_nc_mode = mode; vi->vi_nc_uid = uid; vi->vi_nc_gid = gid; } else { vi->vi_nc_mode = VNOVAL; vi->vi_nc_uid = VNOVAL; vi->vi_nc_gid = VNOVAL; } rw_exit(&vi->vi_nc_listlock); rw_exit(&vi->vi_nc_lock); } } /* * Return true if we have identity for the given vnode, and use as an * opportunity to confirm that everything squares up. * * Because of shared code, some file systems could provide partial * information, missing some updates, so check the mount flag too. */ bool cache_have_id(struct vnode *vp) { if (vp->v_type == VDIR && (vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0 && atomic_load_relaxed(&VNODE_TO_VIMPL(vp)->vi_nc_mode) != VNOVAL) { return true; } else { return false; } } /* * Enter a mount point. cvp is the covered vnode, and rvp is the root of * the mounted file system. 
*/ void cache_enter_mount(struct vnode *cvp, struct vnode *rvp) { KASSERT(vrefcnt(cvp) > 0); KASSERT(vrefcnt(rvp) > 0); KASSERT(cvp->v_type == VDIR); KASSERT((rvp->v_vflag & VV_ROOT) != 0); if (rvp->v_type == VDIR) { cache_enter(cvp, rvp, cache_mp_name, cache_mp_nlen, MAKEENTRY); } } /* * Look up a cached mount point. Used in the strongly locked path. */ bool cache_lookup_mount(struct vnode *dvp, struct vnode **vn_ret) { bool ret; ret = cache_lookup(dvp, cache_mp_name, cache_mp_nlen, LOOKUP, MAKEENTRY, NULL, vn_ret); KASSERT((*vn_ret != NULL) == ret); return ret; } /* * Try to cross a mount point. For use with cache_lookup_linked(). */ bool cache_cross_mount(struct vnode **dvp, krwlock_t **plock) { return cache_lookup_linked(*dvp, cache_mp_name, cache_mp_nlen, dvp, plock, FSCRED); } /* * Name cache initialization, from vfs_init() when the system is booting. */ void nchinit(void) { cache_pool = pool_cache_init(sizeof(struct namecache), coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL, NULL, NULL); KASSERT(cache_pool != NULL); mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE); TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]); TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]); mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE); callout_init(&cache_stat_callout, CALLOUT_MPSAFE); callout_setfunc(&cache_stat_callout, cache_update_stats, NULL); callout_schedule(&cache_stat_callout, cache_stat_interval * hz); KASSERT(cache_sysctllog == NULL); sysctl_createv(&cache_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "namecache_stats", SYSCTL_DESCR("namecache statistics"), cache_stat_sysctl, 0, NULL, 0, CTL_VFS, CTL_CREATE, CTL_EOL); } /* * Called once for each CPU in the system as attached. */ void cache_cpu_init(struct cpu_info *ci) { size_t sz; sz = roundup2(sizeof(struct nchcpu), coherency_unit); ci->ci_data.cpu_nch = kmem_zalloc(sz, KM_SLEEP); KASSERT(((uintptr_t)ci->ci_data.cpu_nch & (coherency_unit - 1)) == 0); } /* * A vnode is being allocated: set up cache structures. */ void cache_vnode_init(struct vnode *vp) { vnode_impl_t *vi = VNODE_TO_VIMPL(vp); rw_init(&vi->vi_nc_lock); rw_init(&vi->vi_nc_listlock); rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops); TAILQ_INIT(&vi->vi_nc_list); vi->vi_nc_mode = VNOVAL; vi->vi_nc_uid = VNOVAL; vi->vi_nc_gid = VNOVAL; } /* * A vnode is being freed: finish cache structures. */ void cache_vnode_fini(struct vnode *vp) { vnode_impl_t *vi = VNODE_TO_VIMPL(vp); KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL); KASSERT(TAILQ_EMPTY(&vi->vi_nc_list)); rw_destroy(&vi->vi_nc_lock); rw_destroy(&vi->vi_nc_listlock); } /* * Helper for cache_purge1(): purge cache entries for the given vnode from * all directories that the vnode is cached in. */ static void cache_purge_parents(struct vnode *vp) { vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp); struct vnode *dvp, *blocked; struct namecache *ncp; SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0); blocked = NULL; rw_enter(&vi->vi_nc_listlock, RW_WRITER); while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) { /* * Locking in the wrong direction. Try for a hold on the * directory node's lock, and if we get it then all good, * nuke the entry and move on to the next. */ dvp = ncp->nc_dvp; dvi = VNODE_TO_VIMPL(dvp); if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { cache_remove(ncp, false); rw_exit(&dvi->vi_nc_lock); blocked = NULL; continue; } /* * We can't wait on the directory node's lock with our list * lock held or the system could deadlock. 
* * Take a hold on the directory vnode to prevent it from * being freed (taking the vnode & lock with it). Then * wait for the lock to become available with no other locks * held, and retry. * * If this happens twice in a row, give the other side a * breather; we can do nothing until it lets go. */ vhold(dvp); rw_exit(&vi->vi_nc_listlock); rw_enter(&dvi->vi_nc_lock, RW_WRITER); /* Do nothing. */ rw_exit(&dvi->vi_nc_lock); holdrele(dvp); if (blocked == dvp) { kpause("ncpurge", false, 1, NULL); } rw_enter(&vi->vi_nc_listlock, RW_WRITER); blocked = dvp; } rw_exit(&vi->vi_nc_listlock); } /* * Helper for cache_purge1(): purge all cache entries hanging off the given * directory vnode. */ static void cache_purge_children(struct vnode *dvp) { vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); struct namecache *ncp; SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0); rw_enter(&dvi->vi_nc_lock, RW_WRITER); while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) { cache_remove(ncp, true); } rw_exit(&dvi->vi_nc_lock); } /* * Helper for cache_purge1(): purge cache entry from the given vnode, * finding it by name. */ static void cache_purge_name(struct vnode *dvp, const char *name, size_t namelen) { vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); struct namecache *ncp; uintptr_t key; SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0); key = cache_key(name, namelen); rw_enter(&dvi->vi_nc_lock, RW_WRITER); ncp = cache_lookup_entry(dvp, name, namelen, key); if (ncp) { cache_remove(ncp, true); } rw_exit(&dvi->vi_nc_lock); } /* * Cache flush, a particular vnode; called when a vnode is renamed to * hide entries that would now be invalid. */ void cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags) { if (flags & PURGE_PARENTS) { cache_purge_parents(vp); } if (flags & PURGE_CHILDREN) { cache_purge_children(vp); } if (name != NULL) { cache_purge_name(vp, name, namelen); } } /* * vnode filter for cache_purgevfs(). */ static bool cache_vdir_filter(void *cookie, vnode_t *vp) { return vp->v_type == VDIR; } /* * Cache flush, a whole filesystem; called when filesys is umounted to * remove entries that would now be invalid. */ void cache_purgevfs(struct mount *mp) { struct vnode_iterator *iter; vnode_t *dvp; vfs_vnode_iterator_init(mp, &iter); for (;;) { dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL); if (dvp == NULL) { break; } cache_purge_children(dvp); vrele(dvp); } vfs_vnode_iterator_destroy(iter); } /* * Re-queue an entry onto the tail of the active LRU list, after it has * scored a hit. */ static void cache_activate(struct namecache *ncp) { mutex_enter(&cache_lru_lock); TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); cache_lru.count[ncp->nc_lrulist]--; cache_lru.count[LRU_ACTIVE]++; ncp->nc_lrulist = LRU_ACTIVE; mutex_exit(&cache_lru_lock); } /* * Try to balance the LRU lists. Pick some victim entries, and re-queue * them from the head of the active list to the tail of the inactive list. */ static void cache_deactivate(void) { struct namecache *ncp; int total, i; KASSERT(mutex_owned(&cache_lru_lock)); /* If we're nowhere near budget yet, don't bother. */ total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE]; if (total < (desiredvnodes >> 1)) { return; } /* * Aim for a 1:1 ratio of active to inactive. 
This is to allow each * potential victim a reasonable amount of time to cycle through the * inactive list in order to score a hit and be reactivated, while * trying not to cause reactivations too frequently. */ if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) { return; } /* Move only a few at a time; will catch up eventually. */ for (i = 0; i < cache_lru_maxdeact; i++) { ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]); if (ncp == NULL) { break; } KASSERT(ncp->nc_lrulist == LRU_ACTIVE); ncp->nc_lrulist = LRU_INACTIVE; TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru); cache_lru.count[LRU_ACTIVE]--; cache_lru.count[LRU_INACTIVE]++; } } /* * Free some entries from the cache, when we have gone over budget. * * We don't want to cause too much work for any individual caller, and it * doesn't matter if we temporarily go over budget. This is also "just a * cache" so it's not a big deal if we screw up and throw out something we * shouldn't. So we take a relaxed attitude to this process to reduce its * impact. */ static void cache_reclaim(void) { struct namecache *ncp; vnode_impl_t *dvi; int toscan; /* * Scan up to a preset maximum number of entries, but no more than * 0.8% of the total at once (to allow for very small systems). * * On bigger systems, do a larger chunk of work to reduce the number * of times that cache_lru_lock is held for any length of time. */ mutex_enter(&cache_lru_lock); toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7); toscan = MAX(toscan, 1); SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0); while (toscan-- != 0) { /* First try to balance the lists. */ cache_deactivate(); /* Now look for a victim on head of inactive list (old). */ ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]); if (ncp == NULL) { break; } dvi = VNODE_TO_VIMPL(ncp->nc_dvp); KASSERT(ncp->nc_lrulist == LRU_INACTIVE); KASSERT(dvi != NULL); /* * Locking in the wrong direction. If we can't get the * lock, the directory is actively busy, and it could also * cause problems for the next guy in here, so send the * entry to the back of the list. */ if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru); TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru); continue; } /* * Now have the victim entry locked. Drop the LRU list * lock, purge the entry, and start over. The hold on * vi_nc_lock will prevent the vnode from vanishing until * finished (cache_purge() will be called on dvp before it * disappears, and that will wait on vi_nc_lock). */ mutex_exit(&cache_lru_lock); cache_remove(ncp, true); rw_exit(&dvi->vi_nc_lock); mutex_enter(&cache_lru_lock); } mutex_exit(&cache_lru_lock); } /* * For file system code: count a lookup that required a full re-scan of * directory metadata. */ void namecache_count_pass2(void) { COUNT(ncs_pass2); } /* * For file system code: count a lookup that scored a hit in the directory * metadata near the location of the last lookup. */ void namecache_count_2passes(void) { COUNT(ncs_2passes); } /* * Sum the stats from all CPUs into nchstats. This needs to run at least * once within every window where a 32-bit counter could roll over. It's * called regularly by timer to ensure this. 
*/ static void cache_update_stats(void *cookie) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; mutex_enter(&cache_stat_lock); for (CPU_INFO_FOREACH(cii, ci)) { struct nchcpu *nchcpu = ci->ci_data.cpu_nch; UPDATE(nchcpu, ncs_goodhits); UPDATE(nchcpu, ncs_neghits); UPDATE(nchcpu, ncs_badhits); UPDATE(nchcpu, ncs_falsehits); UPDATE(nchcpu, ncs_miss); UPDATE(nchcpu, ncs_long); UPDATE(nchcpu, ncs_pass2); UPDATE(nchcpu, ncs_2passes); UPDATE(nchcpu, ncs_revhits); UPDATE(nchcpu, ncs_revmiss); UPDATE(nchcpu, ncs_denied); } if (cookie != NULL) { memcpy(cookie, &nchstats, sizeof(nchstats)); } /* Reset the timer; arrive back here in N minutes at latest. */ callout_schedule(&cache_stat_callout, cache_stat_interval * hz); mutex_exit(&cache_stat_lock); } /* * Fetch the current values of the stats for sysctl. */ static int cache_stat_sysctl(SYSCTLFN_ARGS) { struct nchstats stats; if (oldp == NULL) { *oldlenp = sizeof(nchstats); return 0; } if (*oldlenp <= 0) { *oldlenp = 0; return 0; } /* Refresh the global stats. */ sysctl_unlock(); cache_update_stats(&stats); sysctl_relock(); *oldlenp = MIN(sizeof(stats), *oldlenp); return sysctl_copyout(l, &stats, oldp, *oldlenp); } /* * For the debugger, given the address of a vnode, print all associated * names in the cache. */ #ifdef DDB void namecache_print(struct vnode *vp, void (*pr)(const char *, ...)) { struct vnode *dvp = NULL; struct namecache *ncp; enum cache_lru_id id; for (id = 0; id < LRU_COUNT; id++) { TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { if (ncp->nc_vp == vp) { (*pr)("name %.*s\n", NC_NLEN(ncp), ncp->nc_name); dvp = ncp->nc_dvp; } } } if (dvp == NULL) { (*pr)("name not found\n"); return; } for (id = 0; id < LRU_COUNT; id++) { TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { if (ncp->nc_vp == dvp) { (*pr)("parent %.*s\n", NC_NLEN(ncp), ncp->nc_name); } } } } #endif
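/*
 * Illustrative sketch, not part of the kernel sources above: a minimal
 * userland model of the two-list (active/inactive) LRU scheme used by the
 * name cache, assuming only <sys/queue.h>.  All identifiers here (entry,
 * activate, deactivate, reclaim) are hypothetical stand-ins; the real code
 * operates on struct namecache under cache_lru_lock and demotes only a few
 * entries per call, whereas this sketch rebalances fully each time.
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

enum lru_id { LRU_ACTIVE, LRU_INACTIVE, LRU_COUNT };

struct entry {
	TAILQ_ENTRY(entry) lru;
	enum lru_id lrulist;
	int key;
};

static TAILQ_HEAD(, entry) lru_list[LRU_COUNT];
static int lru_count[LRU_COUNT];

/* Re-queue an entry to the tail of the active list after it scores a hit. */
static void
activate(struct entry *e)
{
	TAILQ_REMOVE(&lru_list[e->lrulist], e, lru);
	lru_count[e->lrulist]--;
	e->lrulist = LRU_ACTIVE;
	TAILQ_INSERT_TAIL(&lru_list[LRU_ACTIVE], e, lru);
	lru_count[LRU_ACTIVE]++;
}

/* Aim for a 1:1 active:inactive ratio by demoting from the head (oldest). */
static void
deactivate(void)
{
	struct entry *e;

	while (lru_count[LRU_ACTIVE] > lru_count[LRU_INACTIVE] &&
	    (e = TAILQ_FIRST(&lru_list[LRU_ACTIVE])) != NULL) {
		TAILQ_REMOVE(&lru_list[LRU_ACTIVE], e, lru);
		lru_count[LRU_ACTIVE]--;
		e->lrulist = LRU_INACTIVE;
		TAILQ_INSERT_TAIL(&lru_list[LRU_INACTIVE], e, lru);
		lru_count[LRU_INACTIVE]++;
	}
}

/* Victims always come from the head of the inactive list. */
static struct entry *
reclaim(void)
{
	struct entry *e = TAILQ_FIRST(&lru_list[LRU_INACTIVE]);

	if (e != NULL) {
		TAILQ_REMOVE(&lru_list[LRU_INACTIVE], e, lru);
		lru_count[LRU_INACTIVE]--;
	}
	return e;
}

int
main(void)
{
	TAILQ_INIT(&lru_list[LRU_ACTIVE]);
	TAILQ_INIT(&lru_list[LRU_INACTIVE]);

	for (int i = 0; i < 8; i++) {
		struct entry *e = calloc(1, sizeof(*e));
		e->key = i;
		e->lrulist = LRU_ACTIVE;
		TAILQ_INSERT_TAIL(&lru_list[LRU_ACTIVE], e, lru);
		lru_count[LRU_ACTIVE]++;
		deactivate();
	}
	(void)activate;		/* activate() would be called on a cache hit */
	struct entry *victim = reclaim();
	printf("victim key %d, active %d, inactive %d\n",
	    victim != NULL ? victim->key : -1,
	    lru_count[LRU_ACTIVE], lru_count[LRU_INACTIVE]);
	return 0;
}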
2762 10 2788 1650 1406 6 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 /* $NetBSD: syscall.c,v 1.22 2023/10/05 19:41:06 ad Exp $ */ /*- * Copyright (c) 1998, 2000, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: syscall.c,v 1.22 2023/10/05 19:41:06 ad Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/signal.h> #include <sys/ktrace.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscall_stats.h> #include <uvm/uvm_extern.h> #include <machine/cpu.h> #include <machine/psl.h> #include <machine/userret.h> #include "opt_dtrace.h" #ifndef __x86_64__ int x86_copyargs(void *, void *, size_t); #endif void syscall_intern(struct proc *); static void syscall(struct trapframe *); void md_child_return(struct lwp *l) { struct trapframe *tf = l->l_md.md_regs; X86_TF_RAX(tf) = 0; X86_TF_RFLAGS(tf) &= ~PSL_C; userret(l); } /* * Process the tail end of a posix_spawn() for the child. */ void cpu_spawn_return(struct lwp *l) { userret(l); } /* * syscall(frame): * System call request from POSIX system call gate interface to kernel. * Like trap(), argument is call by reference. 
*/ #ifdef KDTRACE_HOOKS void syscall(struct trapframe *); #else static #endif void syscall(struct trapframe *frame) { const struct sysent *callp; struct proc *p; struct lwp *l; int error; register_t code, rval[2]; #ifdef __x86_64__ /* Verify that the syscall args will fit in the trapframe space */ CTASSERT(offsetof(struct trapframe, tf_arg9) >= sizeof(register_t) * (2 + SYS_MAXSYSARGS - 1)); #define args (&frame->tf_rdi) #else register_t args[2 + SYS_MAXSYSARGS]; #endif l = curlwp; p = l->l_proc; code = X86_TF_RAX(frame) & (SYS_NSYSENT - 1); callp = p->p_emul->e_sysent + code; SYSCALL_COUNT(syscall_counts, code); SYSCALL_TIME_SYS_ENTRY(l, syscall_times, code); #ifdef __x86_64__ /* * The first 6 syscall args are passed in rdi, rsi, rdx, r10, r8 and r9 * (rcx gets copied to r10 in the libc stub because the syscall * instruction overwrites %cx) and are together in the trap frame * with space following for 4 more entries. */ if (__predict_false(callp->sy_argsize > 6 * 8)) { error = copyin((register_t *)frame->tf_rsp + 1, &frame->tf_arg6, callp->sy_argsize - 6 * 8); if (error != 0) goto bad; } #else if (callp->sy_argsize) { error = x86_copyargs((char *)frame->tf_esp + sizeof(int), args, callp->sy_argsize); if (__predict_false(error != 0)) goto bad; } #endif error = sy_invoke(callp, l, args, rval, code); if (__predict_true(error == 0)) { X86_TF_RAX(frame) = rval[0]; X86_TF_RDX(frame) = rval[1]; X86_TF_RFLAGS(frame) &= ~PSL_C; /* carry bit */ } else { switch (error) { case ERESTART: /* * The offset to adjust the PC by depends on whether we * entered the kernel through the trap or call gate. * We saved the instruction size in tf_err on entry. */ X86_TF_RIP(frame) -= frame->tf_err; break; case EJUSTRETURN: /* nothing to do */ break; default: bad: X86_TF_RAX(frame) = error; X86_TF_RFLAGS(frame) |= PSL_C; /* carry bit */ break; } } SYSCALL_TIME_SYS_EXIT(l); userret(l); } void syscall_intern(struct proc *p) { p->p_md.md_syscall = syscall; }
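/*
 * Illustrative sketch, not taken from the file above: a userland model of
 * the return convention handled at the end of syscall().  struct fake_frame
 * and its fields are hypothetical stand-ins for the trapframe and the
 * X86_TF_* accessors; the ERESTART/EJUSTRETURN values below are only for
 * illustration.  On success the carry flag is cleared and rax/rdx carry the
 * result; on error rax holds the errno and carry is set; ERESTART rewinds
 * the PC by the saved instruction length so the syscall is re-executed.
 */
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define PSL_CARRY	0x1	/* carry flag: "an error was returned" */

#ifndef ERESTART
#define ERESTART	(-3)	/* illustrative pseudo-errno: restart */
#endif
#ifndef EJUSTRETURN
#define EJUSTRETURN	(-2)	/* illustrative pseudo-errno: no change */
#endif

struct fake_frame {
	uint64_t rax;		/* return value or errno */
	uint64_t rdx;		/* second return value */
	uint64_t rip;		/* user program counter */
	uint64_t rflags;	/* flags word */
	uint64_t err;		/* saved size of the syscall instruction */
};

static void
syscall_return(struct fake_frame *f, int error, const uint64_t rval[2])
{
	if (error == 0) {
		f->rax = rval[0];
		f->rdx = rval[1];
		f->rflags &= ~PSL_CARRY;	/* success: clear carry */
	} else if (error == ERESTART) {
		f->rip -= f->err;		/* re-execute the instruction */
	} else if (error != EJUSTRETURN) {
		f->rax = (uint64_t)error;	/* errno goes in rax */
		f->rflags |= PSL_CARRY;		/* libc stub sets errno */
	}
}

int
main(void)
{
	struct fake_frame f = { .rip = 0x1000, .rflags = 0, .err = 2 };
	uint64_t rval[2] = { 42, 0 };

	syscall_return(&f, 0, rval);
	printf("ok:      rax=%llu carry=%llu\n",
	    (unsigned long long)f.rax,
	    (unsigned long long)(f.rflags & PSL_CARRY));

	syscall_return(&f, EFAULT, rval);
	printf("error:   rax=%llu carry=%llu\n",
	    (unsigned long long)f.rax,
	    (unsigned long long)(f.rflags & PSL_CARRY));

	syscall_return(&f, ERESTART, rval);
	printf("restart: rip=0x%llx\n", (unsigned long long)f.rip);
	return 0;
}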
5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 /* $NetBSD: in6_var.h,v 1.104 2020/06/16 17:12:18 maxv Exp $ */ /* $KAME: in6_var.h,v 1.81 2002/06/08 11:16:51 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.1 (Berkeley) 6/10/93 */ #ifndef _NETINET6_IN6_VAR_H_ #define _NETINET6_IN6_VAR_H_ #include <sys/callout.h> #include <sys/ioccom.h> /* * Interface address, Internet version. One of these structures * is allocated for each interface with an Internet address. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ /* * pltime/vltime are just for future reference (required to implements 2 * hour rule for hosts). they should never be modified by nd6_timeout or * anywhere else. 
* userland -> kernel: accept pltime/vltime * kernel -> userland: throw up everything * in kernel: modify preferred/expire only */ struct in6_addrlifetime { time_t ia6t_expire; /* valid lifetime expiration time */ time_t ia6t_preferred; /* preferred lifetime expiration time */ u_int32_t ia6t_vltime; /* valid lifetime */ u_int32_t ia6t_pltime; /* prefix lifetime */ }; struct lltable; struct nd_kifinfo; struct in6_ifextra { struct in6_ifstat *in6_ifstat; struct icmp6_ifstat *icmp6_ifstat; struct nd_kifinfo *nd_ifinfo; struct scope6_id *scope6_id; struct lltable *lltable; }; LIST_HEAD(in6_multihead, in6_multi); struct in6_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags struct sockaddr_in6 ia_addr; /* interface address */ struct sockaddr_in6 ia_net; /* network number of interface */ struct sockaddr_in6 ia_dstaddr; /* space for destination addr */ struct sockaddr_in6 ia_prefixmask; /* prefix mask */ u_int32_t ia_plen; /* prefix length */ /* DEPRECATED. Keep it to avoid breaking kvm(3) users */ struct in6_ifaddr *ia_next; /* next in6 list of IP6 addresses */ /* DEPRECATED. Keep it to avoid breaking kvm(3) users */ struct in6_multihead _ia6_multiaddrs; /* list of multicast addresses */ int ia6_flags; struct in6_addrlifetime ia6_lifetime; time_t ia6_createtime; /* the creation time of this address, which is * currently used for temporary addresses only. */ time_t ia6_updatetime; /* multicast addresses joined from the kernel */ LIST_HEAD(, in6_multi_mship) ia6_memberships; #ifdef _KERNEL struct pslist_entry ia6_pslist_entry; #endif }; #ifdef _KERNEL static __inline void ia6_acquire(struct in6_ifaddr *ia, struct psref *psref) { KASSERT(ia != NULL); ifa_acquire(&ia->ia_ifa, psref); } static __inline void ia6_release(struct in6_ifaddr *ia, struct psref *psref) { if (ia == NULL) return; ifa_release(&ia->ia_ifa, psref); } #endif /* control structure to manage address selection policy */ struct in6_addrpolicy { struct sockaddr_in6 addr; /* prefix address */ struct sockaddr_in6 addrmask; /* prefix mask */ int preced; /* precedence */ int label; /* matching label */ u_quad_t use; /* statistics */ }; /* * IPv6 interface statistics, as defined in RFC2465 Ipv6IfStatsEntry (p12). 
*/ struct in6_ifstat { u_quad_t ifs6_in_receive; /* # of total input datagram */ u_quad_t ifs6_in_hdrerr; /* # of datagrams with invalid hdr */ u_quad_t ifs6_in_toobig; /* # of datagrams exceeded MTU */ u_quad_t ifs6_in_noroute; /* # of datagrams with no route */ u_quad_t ifs6_in_addrerr; /* # of datagrams with invalid dst */ u_quad_t ifs6_in_protounknown; /* # of datagrams with unknown proto */ /* NOTE: increment on final dst if */ u_quad_t ifs6_in_truncated; /* # of truncated datagrams */ u_quad_t ifs6_in_discard; /* # of discarded datagrams */ /* NOTE: fragment timeout is not here */ u_quad_t ifs6_in_deliver; /* # of datagrams delivered to ULP */ /* NOTE: increment on final dst if */ u_quad_t ifs6_out_forward; /* # of datagrams forwarded */ /* NOTE: increment on outgoing if */ u_quad_t ifs6_out_request; /* # of outgoing datagrams from ULP */ /* NOTE: does not include forwrads */ u_quad_t ifs6_out_discard; /* # of discarded datagrams */ u_quad_t ifs6_out_fragok; /* # of datagrams fragmented */ u_quad_t ifs6_out_fragfail; /* # of datagrams failed on fragment */ u_quad_t ifs6_out_fragcreat; /* # of fragment datagrams */ /* NOTE: this is # after fragment */ u_quad_t ifs6_reass_reqd; /* # of incoming fragmented packets */ /* NOTE: increment on final dst if */ u_quad_t ifs6_reass_ok; /* # of reassembled packets */ /* NOTE: this is # after reass */ /* NOTE: increment on final dst if */ u_quad_t ifs6_reass_fail; /* # of reass failures */ /* NOTE: may not be packet count */ /* NOTE: increment on final dst if */ u_quad_t ifs6_in_mcast; /* # of inbound multicast datagrams */ u_quad_t ifs6_out_mcast; /* # of outbound multicast datagrams */ }; /* * ICMPv6 interface statistics, as defined in RFC2466 Ipv6IfIcmpEntry. * XXX: I'm not sure if this file is the right place for this structure... */ struct icmp6_ifstat { /* * Input statistics */ /* ipv6IfIcmpInMsgs, total # of input messages */ u_quad_t ifs6_in_msg; /* ipv6IfIcmpInErrors, # of input error messages */ u_quad_t ifs6_in_error; /* ipv6IfIcmpInDestUnreachs, # of input dest unreach errors */ u_quad_t ifs6_in_dstunreach; /* ipv6IfIcmpInAdminProhibs, # of input administratively prohibited errs */ u_quad_t ifs6_in_adminprohib; /* ipv6IfIcmpInTimeExcds, # of input time exceeded errors */ u_quad_t ifs6_in_timeexceed; /* ipv6IfIcmpInParmProblems, # of input parameter problem errors */ u_quad_t ifs6_in_paramprob; /* ipv6IfIcmpInPktTooBigs, # of input packet too big errors */ u_quad_t ifs6_in_pkttoobig; /* ipv6IfIcmpInEchos, # of input echo requests */ u_quad_t ifs6_in_echo; /* ipv6IfIcmpInEchoReplies, # of input echo replies */ u_quad_t ifs6_in_echoreply; /* ipv6IfIcmpInRouterSolicits, # of input router solicitations */ u_quad_t ifs6_in_routersolicit; /* ipv6IfIcmpInRouterAdvertisements, # of input router advertisements */ u_quad_t ifs6_in_routeradvert; /* ipv6IfIcmpInNeighborSolicits, # of input neighbor solicitations */ u_quad_t ifs6_in_neighborsolicit; /* ipv6IfIcmpInNeighborAdvertisements, # of input neighbor advertisements */ u_quad_t ifs6_in_neighboradvert; /* ipv6IfIcmpInRedirects, # of input redirects */ u_quad_t ifs6_in_redirect; /* ipv6IfIcmpInGroupMembQueries, # of input MLD queries */ u_quad_t ifs6_in_mldquery; /* ipv6IfIcmpInGroupMembResponses, # of input MLD reports */ u_quad_t ifs6_in_mldreport; /* ipv6IfIcmpInGroupMembReductions, # of input MLD done */ u_quad_t ifs6_in_mlddone; /* * Output statistics. We should solve unresolved routing problem... 
*/ /* ipv6IfIcmpOutMsgs, total # of output messages */ u_quad_t ifs6_out_msg; /* ipv6IfIcmpOutErrors, # of output error messages */ u_quad_t ifs6_out_error; /* ipv6IfIcmpOutDestUnreachs, # of output dest unreach errors */ u_quad_t ifs6_out_dstunreach; /* ipv6IfIcmpOutAdminProhibs, # of output administratively prohibited errs */ u_quad_t ifs6_out_adminprohib; /* ipv6IfIcmpOutTimeExcds, # of output time exceeded errors */ u_quad_t ifs6_out_timeexceed; /* ipv6IfIcmpOutParmProblems, # of output parameter problem errors */ u_quad_t ifs6_out_paramprob; /* ipv6IfIcmpOutPktTooBigs, # of output packet too big errors */ u_quad_t ifs6_out_pkttoobig; /* ipv6IfIcmpOutEchos, # of output echo requests */ u_quad_t ifs6_out_echo; /* ipv6IfIcmpOutEchoReplies, # of output echo replies */ u_quad_t ifs6_out_echoreply; /* ipv6IfIcmpOutRouterSolicits, # of output router solicitations */ u_quad_t ifs6_out_routersolicit; /* ipv6IfIcmpOutRouterAdvertisements, # of output router advertisements */ u_quad_t ifs6_out_routeradvert; /* ipv6IfIcmpOutNeighborSolicits, # of output neighbor solicitations */ u_quad_t ifs6_out_neighborsolicit; /* ipv6IfIcmpOutNeighborAdvertisements, # of output neighbor advertisements */ u_quad_t ifs6_out_neighboradvert; /* ipv6IfIcmpOutRedirects, # of output redirects */ u_quad_t ifs6_out_redirect; /* ipv6IfIcmpOutGroupMembQueries, # of output MLD queries */ u_quad_t ifs6_out_mldquery; /* ipv6IfIcmpOutGroupMembResponses, # of output MLD reports */ u_quad_t ifs6_out_mldreport; /* ipv6IfIcmpOutGroupMembReductions, # of output MLD done */ u_quad_t ifs6_out_mlddone; }; /* * If you make changes that change the size of in6_ifreq, * make sure you fix compat/netinet6/in6_var.h */ struct in6_ifreq { char ifr_name[IFNAMSIZ]; union { struct sockaddr_in6 ifru_addr; struct sockaddr_in6 ifru_dstaddr; short ifru_flags; int ifru_flags6; int ifru_metric; void * ifru_data; struct in6_addrlifetime ifru_lifetime; struct in6_ifstat ifru_stat; struct icmp6_ifstat ifru_icmp6stat; } ifr_ifru; }; struct in6_aliasreq { char ifra_name[IFNAMSIZ]; struct sockaddr_in6 ifra_addr; struct sockaddr_in6 ifra_dstaddr; struct sockaddr_in6 ifra_prefixmask; int ifra_flags; struct in6_addrlifetime ifra_lifetime; }; /* * Given a pointer to an in6_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in6 */ #define IA6_IN6(ia) (&((ia)->ia_addr.sin6_addr)) #define IA6_DSTIN6(ia) (&((ia)->ia_dstaddr.sin6_addr)) #define IA6_MASKIN6(ia) (&((ia)->ia_prefixmask.sin6_addr)) #define IA6_SIN6(ia) (&((ia)->ia_addr)) #define IA6_DSTSIN6(ia) (&((ia)->ia_dstaddr)) #define IFA_IN6(x) (&((struct sockaddr_in6 *)((x)->ifa_addr))->sin6_addr) #define IFA_DSTIN6(x) (&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr) #ifdef _KERNEL #define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \ (((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \ (((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \ (((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 ) #endif #define SIOCSIFADDR_IN6 _IOW('i', 12, struct in6_ifreq) #define SIOCGIFADDR_IN6 _IOWR('i', 33, struct in6_ifreq) #ifdef _KERNEL /* * SIOCSxxx ioctls should be unused (see comments in in6.c), but * we do not shift numbers for binary compatibility. 
*/ #define SIOCSIFDSTADDR_IN6 _IOW('i', 14, struct in6_ifreq) #define SIOCSIFNETMASK_IN6 _IOW('i', 22, struct in6_ifreq) #endif #define SIOCGIFDSTADDR_IN6 _IOWR('i', 34, struct in6_ifreq) #define SIOCGIFNETMASK_IN6 _IOWR('i', 37, struct in6_ifreq) #define SIOCDIFADDR_IN6 _IOW('i', 25, struct in6_ifreq) /* 26 was OSIOCAIFADDR_IN6 */ /* 70 was OSIOCSIFPHYADDR_IN6 */ #define SIOCGIFPSRCADDR_IN6 _IOWR('i', 71, struct in6_ifreq) #define SIOCGIFPDSTADDR_IN6 _IOWR('i', 72, struct in6_ifreq) #define SIOCGIFAFLAG_IN6 _IOWR('i', 73, struct in6_ifreq) /* * 74 was SIOCGDRLST_IN6 * 75 was SIOCGPRLST_IN6 * 76 was OSIOCGIFINFO_IN6 * 77 was SIOCSNDFLUSH_IN6 */ #define SIOCGNBRINFO_IN6 _IOWR('i', 78, struct in6_nbrinfo) /* * 79 was SIOCSPFXFLUSH_IN6 * 80 was SIOCSRTRFLUSH_IN6 * 81 was SIOCGIFALIFETIME_IN6 */ #if 0 /* withdrawn - do not reuse number 82 */ #define SIOCSIFALIFETIME_IN6 _IOWR('i', 82, struct in6_ifreq) #endif #define SIOCGIFSTAT_IN6 _IOWR('i', 83, struct in6_ifreq) #define SIOCGIFSTAT_ICMP6 _IOWR('i', 84, struct in6_ifreq) /* * 85 was SIOCSDEFIFACE_IN6 * 86 was SIOCGDEFIFACE_IN6 * 87 was OSIOCSIFINFO_FLAGS * 100 was SIOCSIFPREFIX_IN6 * 101 was SIOCGIFPREFIX_IN6 * 102 was SIOCDIFPREFIX_IN6 * 103 was SIOCAIFPREFIX_IN6 * 104 was SIOCCIFPREFIX_IN6 * 105 was SIOCSGIFPREFIX_IN6 */ #define SIOCGIFALIFETIME_IN6 _IOWR('i', 106, struct in6_ifreq) #define SIOCAIFADDR_IN6 _IOW('i', 107, struct in6_aliasreq) /* 108 was OSIOCGIFINFO_IN6_90 * 109 was OSIOCSIFINFO_IN6_90 */ #define SIOCSIFPHYADDR_IN6 _IOW('i', 110, struct in6_aliasreq) /* 110 - 112 are defined in net/if_pppoe.h */ #define SIOCGIFINFO_IN6 _IOWR('i', 113, struct in6_ndireq) #define SIOCSIFINFO_IN6 _IOWR('i', 114, struct in6_ndireq) #define SIOCSIFINFO_FLAGS _IOWR('i', 115, struct in6_ndireq) /* XXX: Someone decided to switch to 'u' here for unknown reasons! */ #define SIOCGETSGCNT_IN6 _IOWR('u', 106, \ struct sioc_sg_req6) /* get s,g pkt cnt */ #define SIOCGETMIFCNT_IN6 _IOWR('u', 107, \ struct sioc_mif_req6) /* get pkt cnt per if */ #define SIOCAADDRCTL_POLICY _IOW('u', 108, struct in6_addrpolicy) #define SIOCDADDRCTL_POLICY _IOW('u', 109, struct in6_addrpolicy) #define IN6_IFF_ANYCAST 0x01 /* anycast address */ #define IN6_IFF_TENTATIVE 0x02 /* tentative address */ #define IN6_IFF_DUPLICATED 0x04 /* DAD detected duplicate */ #define IN6_IFF_DETACHED 0x08 /* may be detached from the link */ #define IN6_IFF_DEPRECATED 0x10 /* deprecated address */ #define IN6_IFF_NODAD 0x20 /* don't perform DAD on this address * (used only at first SIOC* call) */ #define IN6_IFF_AUTOCONF 0x40 /* autoconfigurable address. */ #define IN6_IFF_TEMPORARY 0x80 /* temporary (anonymous) address. 
*/ #define IN6_IFFBITS \ "\020\1ANYCAST\2TENTATIVE\3DUPLICATED\4DETACHED\5DEPRECATED\6NODAD" \ "\7AUTOCONF\10TEMPORARY" /* do not input/output */ #define IN6_IFF_NOTREADY (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED) #ifdef _KERNEL #define IN6_ARE_SCOPE_CMP(a,b) ((a)-(b)) #define IN6_ARE_SCOPE_EQUAL(a,b) ((a)==(b)) #endif #ifdef _KERNEL #include <sys/mutex.h> #include <sys/pserialize.h> #include <net/pktqueue.h> extern pktqueue_t *ip6_pktq; MALLOC_DECLARE(M_IP6OPT); extern struct pslist_head in6_ifaddr_list; extern kmutex_t in6_ifaddr_lock; #define IN6_ADDRLIST_ENTRY_INIT(__ia) \ PSLIST_ENTRY_INIT((__ia), ia6_pslist_entry) #define IN6_ADDRLIST_ENTRY_DESTROY(__ia) \ PSLIST_ENTRY_DESTROY((__ia), ia6_pslist_entry) #define IN6_ADDRLIST_READER_EMPTY() \ (PSLIST_READER_FIRST(&in6_ifaddr_list, struct in6_ifaddr, \ ia6_pslist_entry) == NULL) #define IN6_ADDRLIST_READER_FIRST() \ PSLIST_READER_FIRST(&in6_ifaddr_list, struct in6_ifaddr, \ ia6_pslist_entry) #define IN6_ADDRLIST_READER_NEXT(__ia) \ PSLIST_READER_NEXT((__ia), struct in6_ifaddr, ia6_pslist_entry) #define IN6_ADDRLIST_READER_FOREACH(__ia) \ PSLIST_READER_FOREACH((__ia), &in6_ifaddr_list, \ struct in6_ifaddr, ia6_pslist_entry) #define IN6_ADDRLIST_WRITER_INSERT_HEAD(__ia) \ PSLIST_WRITER_INSERT_HEAD(&in6_ifaddr_list, (__ia), ia6_pslist_entry) #define IN6_ADDRLIST_WRITER_REMOVE(__ia) \ PSLIST_WRITER_REMOVE((__ia), ia6_pslist_entry) #define IN6_ADDRLIST_WRITER_FOREACH(__ia) \ PSLIST_WRITER_FOREACH((__ia), &in6_ifaddr_list, struct in6_ifaddr, \ ia6_pslist_entry) #define IN6_ADDRLIST_WRITER_FIRST() \ PSLIST_WRITER_FIRST(&in6_ifaddr_list, struct in6_ifaddr, \ ia6_pslist_entry) #define IN6_ADDRLIST_WRITER_NEXT(__ia) \ PSLIST_WRITER_NEXT((__ia), struct in6_ifaddr, ia6_pslist_entry) #define IN6_ADDRLIST_WRITER_INSERT_AFTER(__ia, __new) \ PSLIST_WRITER_INSERT_AFTER((__ia), (__new), ia6_pslist_entry) #define IN6_ADDRLIST_WRITER_EMPTY() \ (PSLIST_WRITER_FIRST(&in6_ifaddr_list, struct in6_ifaddr, \ ia6_pslist_entry) == NULL) #define IN6_ADDRLIST_WRITER_INSERT_TAIL(__new) \ do { \ if (IN6_ADDRLIST_WRITER_EMPTY()) { \ IN6_ADDRLIST_WRITER_INSERT_HEAD((__new)); \ } else { \ struct in6_ifaddr *__ia; \ IN6_ADDRLIST_WRITER_FOREACH(__ia) { \ if (IN6_ADDRLIST_WRITER_NEXT(__ia) == NULL) { \ IN6_ADDRLIST_WRITER_INSERT_AFTER(__ia,\ (__new)); \ break; \ } \ } \ } \ } while (0) #define in6_ifstat_inc(ifp, tag) \ do { \ if (ifp) \ ((struct in6_ifextra *)((ifp)->if_afdata[AF_INET6]))->in6_ifstat->tag++; \ } while (/*CONSTCOND*/ 0) extern const struct in6_addr zeroin6_addr; extern const u_char inet6ctlerrmap[]; extern bool in6_present; /* * Macro for finding the internet address structure (in6_ifaddr) corresponding * to a given interface (ifnet structure). */ static __inline struct in6_ifaddr * in6_get_ia_from_ifp(struct ifnet *ifp) { struct ifaddr *ifa; IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family == AF_INET6) break; } return (struct in6_ifaddr *)ifa; } static __inline struct in6_ifaddr * in6_get_ia_from_ifp_psref(struct ifnet *ifp, struct psref *psref) { struct in6_ifaddr *ia; int s; s = pserialize_read_enter(); ia = in6_get_ia_from_ifp(ifp); if (ia != NULL) ia6_acquire(ia, psref); pserialize_read_exit(s); return ia; } #endif /* _KERNEL */ /* * Multi-cast membership entry. One for each group/ifp that a PCB * belongs to. 
*/ struct in6_multi_mship { struct in6_multi *i6mm_maddr; /* Multicast address pointer */ LIST_ENTRY(in6_multi_mship) i6mm_chain; /* multicast options chain */ }; struct in6_multi { LIST_ENTRY(in6_multi) in6m_entry; /* list glue */ struct in6_addr in6m_addr; /* IP6 multicast address */ struct ifnet *in6m_ifp; /* back pointer to ifnet */ /* DEPRECATED. Keep it to avoid breaking kvm(3) users */ struct in6_ifaddr *_in6m_ia; /* back pointer to in6_ifaddr */ u_int in6m_refcount; /* # membership claims by sockets */ u_int in6m_state; /* state of the membership */ int in6m_timer; /* delay to send the 1st report */ struct timeval in6m_timer_expire; /* when the timer expires */ callout_t in6m_timer_ch; }; #define IN6M_TIMER_UNDEF -1 #ifdef _KERNEL /* flags to in6_update_ifa */ #define IN6_IFAUPDATE_DADDELAY 0x1 /* first time to configure an address */ #if 0 /* * Macros for looking up the in6_multi_mship record for a given IP6 multicast * address on a given interface. If no matching record is found, "imm" * returns NULL. */ static __inline struct in6_multi_mship * in6_lookup_mship(struct in6_addr *addr, struct ifnet *ifp, struct ip6_moptions *imop) { struct in6_multi_mship *imm; LIST_FOREACH(imm, &imop->im6o_memberships, i6mm_chain) { if (imm->i6mm_maddr->in6m_ifp != ifp) continue; if (IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, addr)) break; } return imm; } #define IN6_LOOKUP_MSHIP(__addr, __ifp, __imop, __imm) \ /* struct in6_addr __addr; */ \ /* struct ifnet *__ifp; */ \ /* struct ip6_moptions *__imop */ \ /* struct in6_multi_mship *__imm; */ \ do { \ (__imm) = in6_lookup_mship(&(__addr), (__ifp), (__imop)); \ } while (/*CONSTCOND*/ 0) #endif void in6_init(void); void in6_multi_lock(int); void in6_multi_unlock(void); bool in6_multi_locked(int); struct in6_multi * in6_lookup_multi(const struct in6_addr *, const struct ifnet *); bool in6_multi_group(const struct in6_addr *, const struct ifnet *); void in6_purge_multi(struct ifnet *); struct in6_multi *in6_addmulti(struct in6_addr *, struct ifnet *, int *, int); void in6_delmulti(struct in6_multi *); void in6_delmulti_locked(struct in6_multi *); void in6_lookup_and_delete_multi(const struct in6_addr *, const struct ifnet *); struct in6_multi_mship *in6_joingroup(struct ifnet *, struct in6_addr *, int *, int); int in6_leavegroup(struct in6_multi_mship *); int in6_mask2len(struct in6_addr *, u_char *); int in6_control(struct socket *, u_long, void *, struct ifnet *); int in6_update_ifa(struct ifnet *, struct in6_aliasreq *, int); void in6_purgeaddr(struct ifaddr *); void in6_purgeif(struct ifnet *); void *in6_domifattach(struct ifnet *); void in6_domifdetach(struct ifnet *, void *); void in6_ifremlocal(struct ifaddr *); void in6_ifaddlocal(struct ifaddr *); struct in6_ifaddr * in6ifa_ifpforlinklocal(const struct ifnet *, int); struct in6_ifaddr * in6ifa_ifpforlinklocal_psref(const struct ifnet *, int, struct psref *); struct in6_ifaddr * in6ifa_ifpwithaddr(const struct ifnet *, const struct in6_addr *); struct in6_ifaddr * in6ifa_ifpwithaddr_psref(const struct ifnet *, const struct in6_addr *, struct psref *); struct in6_ifaddr *in6ifa_ifwithaddr(const struct in6_addr *, uint32_t); int in6_matchlen(struct in6_addr *, struct in6_addr *); void in6_prefixlen2mask(struct in6_addr *, int); void in6_purge_mcast_references(struct in6_multi *); int ip6flow_fastforward(struct mbuf **); /* IPv6 fast forward routine */ int in6_src_ioctl(u_long, void *); int in6_is_addr_deprecated(struct sockaddr_in6 *); struct in6pcb; #define LLTABLE6(ifp) (((struct 
in6_ifextra *)(ifp)->if_afdata[AF_INET6])->lltable) void in6_sysctl_multicast_setup(struct sysctllog **); #endif /* _KERNEL */ #endif /* !_NETINET6_IN6_VAR_H_ */
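/*
 * Illustrative sketch, not part of the header above: a userland check of
 * the masked-compare idea behind IN6_ARE_MASKED_ADDR_EQUAL, together with a
 * stand-in for building a mask from a prefix length (the kernel helper for
 * that is in6_prefixlen2mask(), declared above).  prefixlen_to_mask() and
 * masked_equal() are hypothetical names used only for this sketch.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Build a netmask from a prefix length (0..128). */
static void
prefixlen_to_mask(struct in6_addr *mask, int plen)
{
	memset(mask, 0, sizeof(*mask));
	for (int i = 0; i < plen / 8; i++)
		mask->s6_addr[i] = 0xff;
	if (plen % 8)
		mask->s6_addr[plen / 8] = (uint8_t)(0xff00 >> (plen % 8));
}

/* Same test as IN6_ARE_MASKED_ADDR_EQUAL, done bytewise for clarity. */
static int
masked_equal(const struct in6_addr *d, const struct in6_addr *a,
    const struct in6_addr *m)
{
	for (int i = 0; i < 16; i++)
		if ((d->s6_addr[i] ^ a->s6_addr[i]) & m->s6_addr[i])
			return 0;
	return 1;
}

int
main(void)
{
	struct in6_addr dst, prefix, mask;

	inet_pton(AF_INET6, "2001:db8:1234::42", &dst);
	inet_pton(AF_INET6, "2001:db8:1234::", &prefix);
	prefixlen_to_mask(&mask, 48);
	printf("/48 match: %s\n",
	    masked_equal(&dst, &prefix, &mask) ? "yes" : "no");

	inet_pton(AF_INET6, "2001:db8:ffff::", &prefix);
	prefixlen_to_mask(&mask, 64);
	printf("/64 match: %s\n",
	    masked_equal(&dst, &prefix, &mask) ? "yes" : "no");
	return 0;
}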
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 /* $NetBSD: joy.c,v 1.21 2017/10/28 04:53:55 riastradh Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software developed for The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1995 Jean-Marc Zucconi * All rights reserved. * * Ported to NetBSD by Matthieu Herrb <matthieu@laas.fr> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: joy.c,v 1.21 2017/10/28 04:53:55 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/device.h> #include <sys/errno.h> #include <sys/conf.h> #include <sys/event.h> #include <sys/vnode.h> #include <sys/bus.h> #include <sys/joystick.h> #include <dev/ic/joyvar.h> #include "ioconf.h" /* * The game port can manage 4 buttons and 4 variable resistors (usually 2 * joysticks, each with 2 buttons and 2 pots.) via the port at address 0x201. * Getting the state of the buttons is done by reading the game port; * buttons 1-4 correspond to bits 4-7 and resistors 1-4 (X1, Y1, X2, Y2) * to bits 0-3. If button 1 (resp 2, 3, 4) is pressed, the bit 4 (resp 5, * 6, 7) is set to 0 to get the value of a resistor, write the value 0xff * at port and wait until the corresponding bit returns to 0. */ #define JOYPART(d) (minor(d) & 1) #define JOYUNIT(d) (minor(d) >> 1) #ifndef JOY_TIMEOUT #define JOY_TIMEOUT 2000 /* 2 milliseconds */ #endif static dev_type_open(joyopen); static dev_type_close(joyclose); static dev_type_read(joyread); static dev_type_ioctl(joyioctl); const struct cdevsw joy_cdevsw = { .d_open = joyopen, .d_close = joyclose, .d_read = joyread, .d_write = nowrite, .d_ioctl = joyioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE }; void joyattach(struct joy_softc *sc) { if (sc->sc_lock == NULL) { panic("joyattach: no lock"); } sc->timeout[0] = 0; sc->timeout[1] = 0; mutex_enter(sc->sc_lock); bus_space_write_1(sc->sc_iot, sc->sc_ioh, 0, 0xff); DELAY(10000); /* 10 ms delay */ aprint_normal_dev(sc->sc_dev, "joystick %sconnected\n", (bus_space_read_1(sc->sc_iot, sc->sc_ioh, 0) & 0x0f) == 0x0f ? 
"not " : ""); mutex_exit(sc->sc_lock); } int joydetach(struct joy_softc *sc, int flags) { int maj, mn; maj = cdevsw_lookup_major(&joy_cdevsw); mn = device_unit(sc->sc_dev) << 1; vdevgone(maj, mn, mn, VCHR); vdevgone(maj, mn + 1, mn + 1, VCHR); return 0; } static int joyopen(dev_t dev, int flag, int mode, struct lwp *l) { int unit = JOYUNIT(dev); int i = JOYPART(dev); struct joy_softc *sc; sc = device_lookup_private(&joy_cd, unit); if (sc == NULL) return ENXIO; mutex_enter(sc->sc_lock); if (sc->timeout[i]) { mutex_exit(sc->sc_lock); return EBUSY; } sc->x_off[i] = sc->y_off[i] = 0; sc->timeout[i] = JOY_TIMEOUT; mutex_exit(sc->sc_lock); return 0; } static int joyclose(dev_t dev, int flag, int mode, struct lwp *l) { int unit = JOYUNIT(dev); int i = JOYPART(dev); struct joy_softc *sc = device_lookup_private(&joy_cd, unit); mutex_enter(sc->sc_lock); sc->timeout[i] = 0; mutex_exit(sc->sc_lock); return 0; } static int joyread(dev_t dev, struct uio *uio, int flag) { int unit = JOYUNIT(dev); struct joy_softc *sc = device_lookup_private(&joy_cd, unit); bus_space_tag_t iot = sc->sc_iot; bus_space_handle_t ioh = sc->sc_ioh; struct joystick c; struct timeval start, now, diff; int state = 0, x = 0, y = 0, i; mutex_enter(sc->sc_lock); bus_space_write_1(iot, ioh, 0, 0xff); microtime(&start); now = start; /* structure assignment */ i = sc->timeout[JOYPART(dev)]; for (;;) { timersub(&now, &start, &diff); if (diff.tv_sec > 0 || diff.tv_usec > i) break; state = bus_space_read_1(iot, ioh, 0); if (JOYPART(dev) == 1) state >>= 2; if (!x && !(state & 0x01)) x = diff.tv_usec; if (!y && !(state & 0x02)) y = diff.tv_usec; if (x && y) break; microtime(&now); } mutex_exit(sc->sc_lock); c.x = x ? sc->x_off[JOYPART(dev)] + x : 0x80000000; c.y = y ? sc->y_off[JOYPART(dev)] + y : 0x80000000; state >>= 4; c.b1 = ~state & 1; c.b2 = ~(state >> 1) & 1; return uiomove(&c, sizeof(struct joystick), uio); } static int joyioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { int unit = JOYUNIT(dev); struct joy_softc *sc = device_lookup_private(&joy_cd, unit); int i = JOYPART(dev), x, error; mutex_enter(sc->sc_lock); error = 0; switch (cmd) { case JOY_SETTIMEOUT: x = *(int *)data; if (x < 1 || x > 10000) { /* 10ms maximum! */ error = EINVAL; break; } sc->timeout[i] = x; break; case JOY_GETTIMEOUT: *(int *)data = sc->timeout[i]; break; case JOY_SET_X_OFFSET: sc->x_off[i] = *(int *)data; break; case JOY_SET_Y_OFFSET: sc->y_off[i] = *(int *)data; break; case JOY_GET_X_OFFSET: *(int *)data = sc->x_off[i]; break; case JOY_GET_Y_OFFSET: *(int *)data = sc->y_off[i]; break; default: error = ENXIO; break; } mutex_exit(sc->sc_lock); return error; }
9 2 3 7 2 5 2 2 3 2 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 /* $NetBSD: exec_script.c,v 1.83 2021/05/03 10:25:14 fcambus Exp $ */ /* * Copyright (c) 1993, 1994, 1996 Christopher G. Demetriou * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christopher G. Demetriou. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: exec_script.c,v 1.83 2021/05/03 10:25:14 fcambus Exp $"); #ifdef _KERNEL_OPT #include "opt_script.h" #endif #if defined(SETUIDSCRIPTS) && !defined(FDSCRIPTS) #define FDSCRIPTS /* Need this for safe set-id scripts. 
*/ #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/kmem.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/file.h> #ifdef SETUIDSCRIPTS #include <sys/stat.h> #endif #include <sys/filedesc.h> #include <sys/exec.h> #include <sys/resourcevar.h> #include <sys/module.h> #include <sys/exec_script.h> #include <sys/exec_elf.h> MODULE(MODULE_CLASS_EXEC, exec_script, NULL); static struct execsw exec_script_execsw = { .es_hdrsz = SCRIPT_HDR_SIZE, .es_makecmds = exec_script_makecmds, .u = { .elf_probe_func = NULL, }, .es_emul = NULL, .es_prio = EXECSW_PRIO_ANY, .es_arglen = 0, .es_copyargs = NULL, .es_setregs = NULL, .es_coredump = NULL, .es_setup_stack = exec_setup_stack, }; static int exec_script_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return exec_add(&exec_script_execsw, 1); case MODULE_CMD_FINI: return exec_remove(&exec_script_execsw, 1); case MODULE_CMD_AUTOUNLOAD: /* * We don't want to be autounloaded because our use is * transient: no executables with p_execsw equal to * exec_script_execsw will exist, so FINI will never * return EBUSY. However, the system will run scripts * often. Return EBUSY here to prevent this module from * ping-ponging in and out of the kernel. */ return EBUSY; default: return ENOTTY; } } /* * exec_script_makecmds(): Check if it's an executable shell script. * * Given a proc pointer and an exec package pointer, see if the referent * of the epp is in shell script. If it is, then set thing up so that * the script can be run. This involves preparing the address space * and arguments for the shell which will run the script. * * This function is ultimately responsible for creating a set of vmcmds * which can be used to build the process's vm space and inserting them * into the exec package. */ int exec_script_makecmds(struct lwp *l, struct exec_package *epp) { int error, hdrlinelen, shellnamelen, shellarglen; char *hdrstr = epp->ep_hdr; char *cp, *shellname, *shellarg; size_t shellargp_len; struct exec_fakearg *shellargp; struct exec_fakearg *tmpsap; struct pathbuf *shell_pathbuf; struct vnode *scriptvp; #ifdef SETUIDSCRIPTS /* Gcc needs those initialized for spurious uninitialized warning */ uid_t script_uid = (uid_t) -1; gid_t script_gid = NOGROUP; u_short script_sbits; #endif /* * if the magic isn't that of a shell script, or we've already * done shell script processing for this exec, punt on it. */ if ((epp->ep_flags & EXEC_INDIR) != 0 || epp->ep_hdrvalid < EXEC_SCRIPT_MAGICLEN || strncmp(hdrstr, EXEC_SCRIPT_MAGIC, EXEC_SCRIPT_MAGICLEN)) return ENOEXEC; /* * Check that the shell spec is terminated by a newline, and that * it isn't too large. */ hdrlinelen = uimin(epp->ep_hdrvalid, SCRIPT_HDR_SIZE); for (cp = hdrstr + EXEC_SCRIPT_MAGICLEN; cp < hdrstr + hdrlinelen; cp++) { if (*cp == '\n') { *cp = '\0'; break; } } if (cp >= hdrstr + hdrlinelen) return ENOEXEC; /* strip spaces before the shell name */ for (cp = hdrstr + EXEC_SCRIPT_MAGICLEN; *cp == ' ' || *cp == '\t'; cp++) ; if (*cp == '\0') return ENOEXEC; shellarg = NULL; shellarglen = 0; /* collect the shell name; remember its length for later */ shellname = cp; shellnamelen = 0; for ( /* cp = cp */ ; *cp != '\0' && *cp != ' ' && *cp != '\t'; cp++) shellnamelen++; if (*cp == '\0') goto check_shell; *cp++ = '\0'; /* skip spaces before any argument */ for ( /* cp = cp */ ; *cp == ' ' || *cp == '\t'; cp++) ; if (*cp == '\0') goto check_shell; /* * collect the shell argument. 
everything after the shell name * is passed as ONE argument; that's the correct (historical) * behaviour. */ shellarg = cp; for ( /* cp = cp */ ; *cp != '\0'; cp++) shellarglen++; *cp++ = '\0'; check_shell: #ifdef SETUIDSCRIPTS /* * MNT_NOSUID has already taken care of by check_exec, * so we don't need to worry about it now or later. We * will need to check PSL_TRACED later, however. */ script_sbits = epp->ep_vap->va_mode & (S_ISUID | S_ISGID); if (script_sbits != 0) { script_uid = epp->ep_vap->va_uid; script_gid = epp->ep_vap->va_gid; } #endif #ifdef FDSCRIPTS /* * if the script isn't readable, or it's set-id, then we've * gotta supply a "/dev/fd/..." for the shell to read. * Note that stupid shells (csh) do the wrong thing, and * close all open fd's when they start. That kills this * method of implementing "safe" set-id and x-only scripts. */ vn_lock(epp->ep_vp, LK_SHARED | LK_RETRY); error = VOP_ACCESS(epp->ep_vp, VREAD, l->l_cred); VOP_UNLOCK(epp->ep_vp); if (error == EACCES #ifdef SETUIDSCRIPTS || script_sbits #endif ) { struct file *fp; KASSERT(!(epp->ep_flags & EXEC_HASFD)); if ((error = fd_allocfile(&fp, &epp->ep_fd)) != 0) { scriptvp = NULL; shellargp = NULL; goto fail; } epp->ep_flags |= EXEC_HASFD; fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_vnode = epp->ep_vp; fp->f_flag = FREAD; fd_affix(curproc, fp, epp->ep_fd); } #endif /* set up the fake args list */ shellargp_len = 4 * sizeof(*shellargp); shellargp = kmem_alloc(shellargp_len, KM_SLEEP); tmpsap = shellargp; tmpsap->fa_len = shellnamelen + 1; tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP); strlcpy(tmpsap->fa_arg, shellname, tmpsap->fa_len); tmpsap++; if (shellarg != NULL) { tmpsap->fa_len = shellarglen + 1; tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP); strlcpy(tmpsap->fa_arg, shellarg, tmpsap->fa_len); tmpsap++; } tmpsap->fa_len = MAXPATHLEN; tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP); #ifdef FDSCRIPTS if ((epp->ep_flags & EXEC_HASFD) == 0) { #endif /* normally can't fail, but check for it if diagnostic */ error = copystr(epp->ep_kname, tmpsap->fa_arg, MAXPATHLEN, NULL); KASSERT(error == 0); tmpsap++; #ifdef FDSCRIPTS } else { snprintf(tmpsap->fa_arg, MAXPATHLEN, "/dev/fd/%d", epp->ep_fd); tmpsap++; } #endif tmpsap->fa_arg = NULL; /* Save the old vnode so we can clean it up later. */ scriptvp = epp->ep_vp; epp->ep_vp = NULL; /* Note that we're trying recursively. */ epp->ep_flags |= EXEC_INDIR; /* * mark the header we have as invalid; check_exec will read * the header from the new executable */ epp->ep_hdrvalid = 0; /* try loading the interpreter */ if ((error = exec_makepathbuf(l, shellname, UIO_SYSSPACE, &shell_pathbuf, NULL)) == 0) { error = check_exec(l, epp, shell_pathbuf, NULL); pathbuf_destroy(shell_pathbuf); } /* note that we've clobbered the header */ epp->ep_flags |= EXEC_DESTR; if (error == 0) { /* * It succeeded. Unlock the script and * close it if we aren't using it any more. * Also, set things up so that the fake args * list will be used. */ if ((epp->ep_flags & EXEC_HASFD) == 0) { vn_lock(scriptvp, LK_EXCLUSIVE | LK_RETRY); VOP_CLOSE(scriptvp, FREAD, l->l_cred); vput(scriptvp); } epp->ep_flags |= (EXEC_HASARGL | EXEC_SKIPARG); epp->ep_fa = shellargp; epp->ep_fa_len = shellargp_len; #ifdef SETUIDSCRIPTS /* * set thing up so that set-id scripts will be * handled appropriately. PSL_TRACED will be * checked later when the shell is actually * exec'd. 
*/ epp->ep_vap->va_mode |= script_sbits; if (script_sbits & S_ISUID) epp->ep_vap->va_uid = script_uid; if (script_sbits & S_ISGID) epp->ep_vap->va_gid = script_gid; #endif return (0); } #ifdef FDSCRIPTS fail: #endif /* kill the opened file descriptor, else close the file */ if (epp->ep_flags & EXEC_HASFD) { epp->ep_flags &= ~EXEC_HASFD; fd_close(epp->ep_fd); } else if (scriptvp) { vn_lock(scriptvp, LK_EXCLUSIVE | LK_RETRY); VOP_CLOSE(scriptvp, FREAD, l->l_cred); vput(scriptvp); } /* free the fake arg list, because we're not returning it */ if ((tmpsap = shellargp) != NULL) { while (tmpsap->fa_arg != NULL) { kmem_free(tmpsap->fa_arg, tmpsap->fa_len); tmpsap++; } kmem_free(shellargp, shellargp_len); } /* * free any vmspace-creation commands, * and release their references */ kill_vmcmds(&epp->ep_vmcmds); return error; }
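The parsing done by exec_script_makecmds() reduces to a simple rule: the interpreter is the first blank-delimited word after "#!", everything that follows it on that header line is passed as ONE argument (the historical behaviour noted above), and the script path (or "/dev/fd/N" when FDSCRIPTS kicks in) is appended as the last fake argument. The user-space sketch below only illustrates that argument-vector construction; it is not part of exec_script.c, and the function name and buffer size are assumptions.

/*
 * Illustrative sketch (not from exec_script.c): print the argument vector
 * that a "#!interp arg..." header produces under the one-argument rule
 * used by exec_script_makecmds().
 */
#include <stdio.h>
#include <string.h>

static void
show_script_argv(const char *hdr, const char *scriptpath)
{
	char line[128];
	char *cp, *shellname, *shellarg;

	if (strncmp(hdr, "#!", 2) != 0)
		return;
	snprintf(line, sizeof(line), "%s", hdr + 2);	/* private copy */
	if ((cp = strchr(line, '\n')) != NULL)
		*cp = '\0';

	/* strip blanks before the interpreter name */
	for (cp = line; *cp == ' ' || *cp == '\t'; cp++)
		;
	if (*cp == '\0')
		return;		/* kernel would return ENOEXEC here */
	shellname = cp;

	/* the interpreter name ends at the first blank */
	for (; *cp != '\0' && *cp != ' ' && *cp != '\t'; cp++)
		;
	if (*cp != '\0')
		*cp++ = '\0';

	/* skip blanks; the remainder of the line is ONE argument */
	for (; *cp == ' ' || *cp == '\t'; cp++)
		;
	shellarg = (*cp != '\0') ? cp : NULL;

	if (shellarg != NULL)
		printf("argv: \"%s\" \"%s\" \"%s\"\n",
		    shellname, shellarg, scriptpath);
	else
		printf("argv: \"%s\" \"%s\"\n", shellname, scriptpath);
}

For example, a script /tmp/foo.sh beginning with "#!/bin/sh -eu" ends up being executed as /bin/sh with the fake argument list { "/bin/sh", "-eu", "/tmp/foo.sh" }, followed by the script's remaining user-supplied arguments (EXEC_SKIPARG drops the original argv[0]).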
/* $NetBSD: if_vioif.c,v 1.111 2024/03/21 12:33:21 isaki Exp $ */ /* * Copyright (c) 2020 The NetBSD Foundation, Inc. * Copyright (c) 2010 Minoura Makoto. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: if_vioif.c,v 1.111 2024/03/21 12:33:21 isaki Exp $"); #ifdef _KERNEL_OPT #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/atomic.h> #include <sys/bus.h> #include <sys/condvar.h> #include <sys/device.h> #include <sys/evcnt.h> #include <sys/intr.h> #include <sys/kmem.h> #include <sys/mbuf.h> #include <sys/mutex.h> #include <sys/sockio.h> #include <sys/syslog.h> #include <sys/cpu.h> #include <sys/module.h> #include <sys/pcq.h> #include <sys/workqueue.h> #include <sys/xcall.h> #include <dev/pci/virtioreg.h> #include <dev/pci/virtiovar.h> #include <net/if.h> #include <net/if_dl.h> #include <net/if_media.h> #include <net/if_ether.h> #include <net/bpf.h> #include "ioconf.h" #ifdef NET_MPSAFE #define VIOIF_MPSAFE 1 #define VIOIF_MULTIQ 1 #endif /* * if_vioifreg.h: */ /* Configuration registers */ #define VIRTIO_NET_CONFIG_MAC 0 /* 8bit x 6byte */ #define VIRTIO_NET_CONFIG_STATUS 6 /* 16bit */ #define VIRTIO_NET_CONFIG_MAX_VQ_PAIRS 8 /* 16bit */ #define VIRTIO_NET_CONFIG_MTU 10 /* 16bit */ /* Feature bits */ #define VIRTIO_NET_F_CSUM __BIT(0) #define VIRTIO_NET_F_GUEST_CSUM __BIT(1) #define VIRTIO_NET_F_MAC __BIT(5) #define VIRTIO_NET_F_GSO __BIT(6) #define VIRTIO_NET_F_GUEST_TSO4 __BIT(7) #define VIRTIO_NET_F_GUEST_TSO6 __BIT(8) #define VIRTIO_NET_F_GUEST_ECN __BIT(9) #define VIRTIO_NET_F_GUEST_UFO __BIT(10) #define VIRTIO_NET_F_HOST_TSO4 __BIT(11) #define VIRTIO_NET_F_HOST_TSO6 __BIT(12) #define VIRTIO_NET_F_HOST_ECN __BIT(13) #define VIRTIO_NET_F_HOST_UFO __BIT(14) #define VIRTIO_NET_F_MRG_RXBUF __BIT(15) #define VIRTIO_NET_F_STATUS __BIT(16) #define VIRTIO_NET_F_CTRL_VQ __BIT(17) #define VIRTIO_NET_F_CTRL_RX __BIT(18) #define VIRTIO_NET_F_CTRL_VLAN __BIT(19) #define VIRTIO_NET_F_CTRL_RX_EXTRA __BIT(20) #define VIRTIO_NET_F_GUEST_ANNOUNCE __BIT(21) #define VIRTIO_NET_F_MQ __BIT(22) #define VIRTIO_NET_F_CTRL_MAC_ADDR __BIT(23) #define VIRTIO_NET_FLAG_BITS \ VIRTIO_COMMON_FLAG_BITS \ "b\x17" "CTRL_MAC\0" \ "b\x16" "MQ\0" \ "b\x15" "GUEST_ANNOUNCE\0" \ "b\x14" "CTRL_RX_EXTRA\0" \ "b\x13" "CTRL_VLAN\0" \ "b\x12" "CTRL_RX\0" \ "b\x11" "CTRL_VQ\0" \ "b\x10" "STATUS\0" \ "b\x0f" "MRG_RXBUF\0" \ "b\x0e" "HOST_UFO\0" \ "b\x0d" "HOST_ECN\0" \ "b\x0c" "HOST_TSO6\0" \ "b\x0b" "HOST_TSO4\0" \ "b\x0a" "GUEST_UFO\0" \ "b\x09" "GUEST_ECN\0" \ "b\x08" "GUEST_TSO6\0" \ "b\x07" "GUEST_TSO4\0" \ "b\x06" "GSO\0" \ "b\x05" "MAC\0" \ "b\x01" "GUEST_CSUM\0" \ "b\x00" "CSUM\0" /* Status */ #define VIRTIO_NET_S_LINK_UP 1 /* Packet header structure */ struct virtio_net_hdr { uint8_t flags; uint8_t gso_type; uint16_t hdr_len; uint16_t gso_size; uint16_t csum_start; uint16_t csum_offset; uint16_t num_buffers; /* VIRTIO_NET_F_MRG_RXBUF enabled or v1 */ } __packed; #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* flags */ #define VIRTIO_NET_HDR_GSO_NONE 0 /* gso_type */ #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* gso_type */ #define VIRTIO_NET_HDR_GSO_UDP 3 /* gso_type */ #define VIRTIO_NET_HDR_GSO_TCPV6 4 /* gso_type */ 
#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* gso_type, |'ed */ #define VIRTIO_NET_MAX_GSO_LEN (65536+ETHER_HDR_LEN) /* Control virtqueue */ struct virtio_net_ctrl_cmd { uint8_t class; uint8_t command; } __packed; #define VIRTIO_NET_CTRL_RX 0 # define VIRTIO_NET_CTRL_RX_PROMISC 0 # define VIRTIO_NET_CTRL_RX_ALLMULTI 1 #define VIRTIO_NET_CTRL_MAC 1 # define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 # define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 #define VIRTIO_NET_CTRL_VLAN 2 # define VIRTIO_NET_CTRL_VLAN_ADD 0 # define VIRTIO_NET_CTRL_VLAN_DEL 1 #define VIRTIO_NET_CTRL_MQ 4 # define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET 0 # define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN 1 # define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX 0x8000 struct virtio_net_ctrl_status { uint8_t ack; } __packed; #define VIRTIO_NET_OK 0 #define VIRTIO_NET_ERR 1 struct virtio_net_ctrl_rx { uint8_t onoff; } __packed; struct virtio_net_ctrl_mac_tbl { uint32_t nentries; uint8_t macs[][ETHER_ADDR_LEN]; } __packed; struct virtio_net_ctrl_mac_addr { uint8_t mac[ETHER_ADDR_LEN]; } __packed; struct virtio_net_ctrl_vlan { uint16_t id; } __packed; struct virtio_net_ctrl_mq { uint16_t virtqueue_pairs; } __packed; /* * if_vioifvar.h: */ /* * Locking notes: * + a field in vioif_netqueue is protected by netq_lock (a spin mutex) * - more than one lock cannot be held at onece * + a field in vioif_tx_context and vioif_rx_context is also protected * by netq_lock. * + ctrlq_inuse is protected by ctrlq_wait_lock. * - other fields in vioif_ctrlqueue are protected by ctrlq_inuse * - netq_lock cannot be held along with ctrlq_wait_lock * + fields in vioif_softc except queues are protected by * sc->sc_lock(an adaptive mutex) * - the lock is held before acquisition of other locks */ struct vioif_ctrl_cmdspec { bus_dmamap_t dmamap; void *buf; bus_size_t bufsize; }; struct vioif_work { struct work cookie; void (*func)(void *); void *arg; unsigned int added; }; struct vioif_net_map { struct virtio_net_hdr *vnm_hdr; bus_dmamap_t vnm_hdr_map; struct mbuf *vnm_mbuf; bus_dmamap_t vnm_mbuf_map; }; #define VIOIF_NETQ_RX 0 #define VIOIF_NETQ_TX 1 #define VIOIF_NETQ_IDX 2 #define VIOIF_NETQ_DIR(n) ((n) % VIOIF_NETQ_IDX) #define VIOIF_NETQ_PAIRIDX(n) ((n) / VIOIF_NETQ_IDX) #define VIOIF_NETQ_RXQID(n) ((n) * VIOIF_NETQ_IDX + VIOIF_NETQ_RX) #define VIOIF_NETQ_TXQID(n) ((n) * VIOIF_NETQ_IDX + VIOIF_NETQ_TX) struct vioif_netqueue { kmutex_t netq_lock; struct virtqueue *netq_vq; bool netq_stopping; bool netq_running_handle; void *netq_maps_kva; struct vioif_net_map *netq_maps; void *netq_softint; struct vioif_work netq_work; bool netq_workqueue; char netq_evgroup[32]; struct evcnt netq_mbuf_load_failed; struct evcnt netq_enqueue_failed; void *netq_ctx; }; struct vioif_tx_context { bool txc_link_active; bool txc_no_free_slots; pcq_t *txc_intrq; void *txc_deferred_transmit; struct evcnt txc_defrag_failed; }; struct vioif_rx_context { struct evcnt rxc_mbuf_enobufs; }; struct vioif_ctrlqueue { struct virtqueue *ctrlq_vq; enum { FREE, INUSE, DONE } ctrlq_inuse; kcondvar_t ctrlq_wait; kmutex_t ctrlq_wait_lock; struct lwp *ctrlq_owner; struct virtio_net_ctrl_cmd *ctrlq_cmd; struct virtio_net_ctrl_status *ctrlq_status; struct virtio_net_ctrl_rx *ctrlq_rx; struct virtio_net_ctrl_mac_tbl *ctrlq_mac_tbl_uc; struct virtio_net_ctrl_mac_tbl *ctrlq_mac_tbl_mc; struct virtio_net_ctrl_mac_addr *ctrlq_mac_addr; struct virtio_net_ctrl_mq *ctrlq_mq; bus_dmamap_t ctrlq_cmd_dmamap; bus_dmamap_t ctrlq_status_dmamap; bus_dmamap_t ctrlq_rx_dmamap; bus_dmamap_t ctrlq_tbl_uc_dmamap; bus_dmamap_t ctrlq_tbl_mc_dmamap; bus_dmamap_t 
ctrlq_mac_addr_dmamap; bus_dmamap_t ctrlq_mq_dmamap; struct evcnt ctrlq_cmd_load_failed; struct evcnt ctrlq_cmd_failed; }; struct vioif_softc { device_t sc_dev; kmutex_t sc_lock; struct sysctllog *sc_sysctllog; struct virtio_softc *sc_virtio; struct virtqueue *sc_vqs; u_int sc_hdr_size; int sc_max_nvq_pairs; int sc_req_nvq_pairs; int sc_act_nvq_pairs; uint8_t sc_mac[ETHER_ADDR_LEN]; struct ethercom sc_ethercom; int sc_link_state; struct vioif_netqueue *sc_netqs; bool sc_has_ctrl; struct vioif_ctrlqueue sc_ctrlq; bus_dma_segment_t sc_segs[1]; void *sc_dmamem; void *sc_kmem; void *sc_cfg_softint; struct workqueue *sc_txrx_workqueue; bool sc_txrx_workqueue_sysctl; u_int sc_tx_intr_process_limit; u_int sc_tx_process_limit; u_int sc_rx_intr_process_limit; u_int sc_rx_process_limit; }; #define VIRTIO_NET_TX_MAXNSEGS (16) /* XXX */ #define VIRTIO_NET_CTRL_MAC_MAXENTRIES (64) /* XXX */ #define VIOIF_TX_INTR_PROCESS_LIMIT 256 #define VIOIF_TX_PROCESS_LIMIT 256 #define VIOIF_RX_INTR_PROCESS_LIMIT 0U #define VIOIF_RX_PROCESS_LIMIT 256 #define VIOIF_WORKQUEUE_PRI PRI_SOFTNET #define VIOIF_IS_LINK_ACTIVE(_sc) ((_sc)->sc_link_state == LINK_STATE_UP ? \ true : false) /* cfattach interface functions */ static int vioif_match(device_t, cfdata_t, void *); static void vioif_attach(device_t, device_t, void *); static int vioif_finalize_teardown(device_t); /* ifnet interface functions */ static int vioif_init(struct ifnet *); static void vioif_stop(struct ifnet *, int); static void vioif_start(struct ifnet *); static int vioif_transmit(struct ifnet *, struct mbuf *); static int vioif_ioctl(struct ifnet *, u_long, void *); static void vioif_watchdog(struct ifnet *); static int vioif_ifflags(struct vioif_softc *); static int vioif_ifflags_cb(struct ethercom *); /* tx & rx */ static int vioif_netqueue_init(struct vioif_softc *, struct virtio_softc *, size_t, u_int); static void vioif_netqueue_teardown(struct vioif_softc *, struct virtio_softc *, size_t); static void vioif_net_intr_enable(struct vioif_softc *, struct virtio_softc *); static void vioif_net_intr_disable(struct vioif_softc *, struct virtio_softc *); static void vioif_net_sched_handle(struct vioif_softc *, struct vioif_netqueue *); /* rx */ static void vioif_populate_rx_mbufs_locked(struct vioif_softc *, struct vioif_netqueue *); static int vioif_rx_intr(void *); static void vioif_rx_handle(void *); static void vioif_rx_queue_clear(struct vioif_softc *, struct virtio_softc *, struct vioif_netqueue *); /* tx */ static void vioif_start_locked(struct ifnet *, struct vioif_netqueue *); static void vioif_transmit_locked(struct ifnet *, struct vioif_netqueue *); static void vioif_deferred_transmit(void *); static int vioif_tx_intr(void *); static void vioif_tx_handle(void *); static void vioif_tx_queue_clear(struct vioif_softc *, struct virtio_softc *, struct vioif_netqueue *); /* controls */ static int vioif_ctrl_intr(void *); static int vioif_ctrl_rx(struct vioif_softc *, int, bool); static int vioif_set_promisc(struct vioif_softc *, bool); static int vioif_set_allmulti(struct vioif_softc *, bool); static int vioif_set_rx_filter(struct vioif_softc *); static int vioif_rx_filter(struct vioif_softc *); static int vioif_set_mac_addr(struct vioif_softc *); static int vioif_ctrl_mq_vq_pairs_set(struct vioif_softc *, int); /* config interrupt */ static int vioif_config_change(struct virtio_softc *); static void vioif_cfg_softint(void *); static void vioif_update_link_status(struct vioif_softc *); /* others */ static void vioif_alloc_queues(struct vioif_softc 
*); static void vioif_free_queues(struct vioif_softc *); static int vioif_alloc_mems(struct vioif_softc *); static struct workqueue* vioif_workq_create(const char *, pri_t, int, int); static void vioif_workq_destroy(struct workqueue *); static void vioif_work_set(struct vioif_work *, void(*)(void *), void *); static void vioif_work_add(struct workqueue *, struct vioif_work *); static void vioif_work_wait(struct workqueue *, struct vioif_work *); static int vioif_setup_sysctl(struct vioif_softc *); static void vioif_setup_stats(struct vioif_softc *); CFATTACH_DECL_NEW(vioif, sizeof(struct vioif_softc), vioif_match, vioif_attach, NULL, NULL); static void vioif_intr_barrier(void) { /* wait for finish all interrupt handler */ xc_barrier(0); } static void vioif_notify(struct virtio_softc *vsc, struct virtqueue *vq) { virtio_enqueue_commit(vsc, vq, -1, true); } static int vioif_match(device_t parent, cfdata_t match, void *aux) { struct virtio_attach_args *va = aux; if (va->sc_childdevid == VIRTIO_DEVICE_ID_NETWORK) return 1; return 0; } static void vioif_attach(device_t parent, device_t self, void *aux) { struct vioif_softc *sc = device_private(self); struct virtio_softc *vsc = device_private(parent); struct vioif_netqueue *txq0; struct vioif_ctrlqueue *ctrlq = &sc->sc_ctrlq; uint64_t features, req_features; struct ifnet *ifp = &sc->sc_ethercom.ec_if; u_int softint_flags; int r, i, req_flags; char xnamebuf[MAXCOMLEN]; size_t nvqs; if (virtio_child(vsc) != NULL) { aprint_normal(": child already attached for %s; " "something wrong...\n", device_xname(parent)); return; } sc->sc_dev = self; sc->sc_virtio = vsc; sc->sc_link_state = LINK_STATE_UNKNOWN; sc->sc_max_nvq_pairs = 1; sc->sc_req_nvq_pairs = 1; sc->sc_act_nvq_pairs = 1; sc->sc_txrx_workqueue_sysctl = true; sc->sc_tx_intr_process_limit = VIOIF_TX_INTR_PROCESS_LIMIT; sc->sc_tx_process_limit = VIOIF_TX_PROCESS_LIMIT; sc->sc_rx_intr_process_limit = VIOIF_RX_INTR_PROCESS_LIMIT; sc->sc_rx_process_limit = VIOIF_RX_PROCESS_LIMIT; mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NONE); snprintf(xnamebuf, sizeof(xnamebuf), "%s_txrx", device_xname(self)); sc->sc_txrx_workqueue = vioif_workq_create(xnamebuf, VIOIF_WORKQUEUE_PRI, IPL_NET, WQ_PERCPU | WQ_MPSAFE); if (sc->sc_txrx_workqueue == NULL) goto err; req_flags = 0; #ifdef VIOIF_MPSAFE req_flags |= VIRTIO_F_INTR_MPSAFE; #endif req_flags |= VIRTIO_F_INTR_MSIX; req_features = VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | VIRTIO_NET_F_CTRL_VQ | VIRTIO_NET_F_CTRL_RX | VIRTIO_F_NOTIFY_ON_EMPTY; req_features |= VIRTIO_F_RING_EVENT_IDX; req_features |= VIRTIO_NET_F_CTRL_MAC_ADDR; #ifdef VIOIF_MULTIQ req_features |= VIRTIO_NET_F_MQ; #endif virtio_child_attach_start(vsc, self, IPL_NET, req_features, VIRTIO_NET_FLAG_BITS); features = virtio_features(vsc); if (features == 0) goto err; if (features & VIRTIO_NET_F_MAC) { for (i = 0; i < __arraycount(sc->sc_mac); i++) { sc->sc_mac[i] = virtio_read_device_config_1(vsc, VIRTIO_NET_CONFIG_MAC + i); } } else { /* code stolen from sys/net/if_tap.c */ struct timeval tv; uint32_t ui; getmicrouptime(&tv); ui = (tv.tv_sec ^ tv.tv_usec) & 0xffffff; memcpy(sc->sc_mac+3, (uint8_t *)&ui, 3); for (i = 0; i < __arraycount(sc->sc_mac); i++) { virtio_write_device_config_1(vsc, VIRTIO_NET_CONFIG_MAC + i, sc->sc_mac[i]); } } /* 'Ethernet' with capital follows other ethernet driver attachment */ aprint_normal_dev(self, "Ethernet address %s\n", ether_sprintf(sc->sc_mac)); if (features & (VIRTIO_NET_F_MRG_RXBUF | VIRTIO_F_VERSION_1)) { sc->sc_hdr_size = sizeof(struct virtio_net_hdr); } else { 
sc->sc_hdr_size = offsetof(struct virtio_net_hdr, num_buffers); } if ((features & VIRTIO_NET_F_CTRL_VQ) && (features & VIRTIO_NET_F_CTRL_RX)) { sc->sc_has_ctrl = true; cv_init(&ctrlq->ctrlq_wait, "ctrl_vq"); mutex_init(&ctrlq->ctrlq_wait_lock, MUTEX_DEFAULT, IPL_NET); ctrlq->ctrlq_inuse = FREE; } else { sc->sc_has_ctrl = false; } if (sc->sc_has_ctrl && (features & VIRTIO_NET_F_MQ)) { sc->sc_max_nvq_pairs = virtio_read_device_config_2(vsc, VIRTIO_NET_CONFIG_MAX_VQ_PAIRS); if (sc->sc_max_nvq_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) goto err; /* Limit the number of queue pairs to use */ sc->sc_req_nvq_pairs = MIN(sc->sc_max_nvq_pairs, ncpu); if (sc->sc_max_nvq_pairs > 1) req_flags |= VIRTIO_F_INTR_PERVQ; } vioif_alloc_queues(sc); #ifdef VIOIF_MPSAFE softint_flags = SOFTINT_NET | SOFTINT_MPSAFE; #else softint_flags = SOFTINT_NET; #endif /* * Initialize network queues */ nvqs = sc->sc_max_nvq_pairs * 2; for (i = 0; i < nvqs; i++) { r = vioif_netqueue_init(sc, vsc, i, softint_flags); if (r != 0) goto err; } if (sc->sc_has_ctrl) { int ctrlq_idx = nvqs; nvqs++; /* * Allocating a virtqueue for control channel */ sc->sc_ctrlq.ctrlq_vq = &sc->sc_vqs[ctrlq_idx]; virtio_init_vq(vsc, ctrlq->ctrlq_vq, ctrlq_idx, vioif_ctrl_intr, ctrlq); r = virtio_alloc_vq(vsc, ctrlq->ctrlq_vq, NBPG, 1, "control"); if (r != 0) { aprint_error_dev(self, "failed to allocate " "a virtqueue for control channel, error code %d\n", r); sc->sc_has_ctrl = false; cv_destroy(&ctrlq->ctrlq_wait); mutex_destroy(&ctrlq->ctrlq_wait_lock); } } sc->sc_cfg_softint = softint_establish(softint_flags, vioif_cfg_softint, sc); if (sc->sc_cfg_softint == NULL) { aprint_error_dev(self, "cannot establish ctl softint\n"); goto err; } if (vioif_alloc_mems(sc) < 0) goto err; r = virtio_child_attach_finish(vsc, sc->sc_vqs, nvqs, vioif_config_change, req_flags); if (r != 0) goto err; if (vioif_setup_sysctl(sc) != 0) { aprint_error_dev(self, "unable to create sysctl node\n"); /* continue */ } vioif_setup_stats(sc); strlcpy(ifp->if_xname, device_xname(self), IFNAMSIZ); ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; #ifdef VIOIF_MPSAFE ifp->if_extflags = IFEF_MPSAFE; #endif ifp->if_start = vioif_start; if (sc->sc_req_nvq_pairs > 1) ifp->if_transmit = vioif_transmit; ifp->if_ioctl = vioif_ioctl; ifp->if_init = vioif_init; ifp->if_stop = vioif_stop; ifp->if_capabilities = 0; ifp->if_watchdog = vioif_watchdog; txq0 = &sc->sc_netqs[VIOIF_NETQ_TXQID(0)]; IFQ_SET_MAXLEN(&ifp->if_snd, MAX(txq0->netq_vq->vq_num, IFQ_MAXLEN)); IFQ_SET_READY(&ifp->if_snd); sc->sc_ethercom.ec_capabilities |= ETHERCAP_VLAN_MTU; if_attach(ifp); if_deferred_start_init(ifp, NULL); ether_ifattach(ifp, sc->sc_mac); ether_set_ifflags_cb(&sc->sc_ethercom, vioif_ifflags_cb); return; err: nvqs = sc->sc_max_nvq_pairs * 2; for (i = 0; i < nvqs; i++) { vioif_netqueue_teardown(sc, vsc, i); } if (sc->sc_has_ctrl) { cv_destroy(&ctrlq->ctrlq_wait); mutex_destroy(&ctrlq->ctrlq_wait_lock); virtio_free_vq(vsc, ctrlq->ctrlq_vq); ctrlq->ctrlq_vq = NULL; } vioif_free_queues(sc); mutex_destroy(&sc->sc_lock); virtio_child_attach_failed(vsc); config_finalize_register(self, vioif_finalize_teardown); return; } static int vioif_finalize_teardown(device_t self) { struct vioif_softc *sc = device_private(self); if (sc->sc_txrx_workqueue != NULL) { vioif_workq_destroy(sc->sc_txrx_workqueue); sc->sc_txrx_workqueue = NULL; } return 0; } /* * Interface functions for ifnet */ static int vioif_init(struct ifnet *ifp) { struct vioif_softc *sc = ifp->if_softc; struct virtio_softc *vsc = 
sc->sc_virtio; struct vioif_netqueue *netq; struct vioif_ctrlqueue *ctrlq = &sc->sc_ctrlq; int r, i; vioif_stop(ifp, 0); r = virtio_reinit_start(vsc); if (r != 0) { log(LOG_ERR, "%s: reset failed\n", ifp->if_xname); return EIO; } virtio_negotiate_features(vsc, virtio_features(vsc)); for (i = 0; i < sc->sc_req_nvq_pairs; i++) { netq = &sc->sc_netqs[VIOIF_NETQ_RXQID(i)]; mutex_enter(&netq->netq_lock); vioif_populate_rx_mbufs_locked(sc, netq); mutex_exit(&netq->netq_lock); } virtio_reinit_end(vsc); if (sc->sc_has_ctrl) virtio_start_vq_intr(vsc, ctrlq->ctrlq_vq); r = vioif_ctrl_mq_vq_pairs_set(sc, sc->sc_req_nvq_pairs); if (r == 0) sc->sc_act_nvq_pairs = sc->sc_req_nvq_pairs; else sc->sc_act_nvq_pairs = 1; SET(ifp->if_flags, IFF_RUNNING); vioif_net_intr_enable(sc, vsc); vioif_update_link_status(sc); r = vioif_rx_filter(sc); return r; } static void vioif_stop(struct ifnet *ifp, int disable) { struct vioif_softc *sc = ifp->if_softc; struct virtio_softc *vsc = sc->sc_virtio; struct vioif_netqueue *netq; struct vioif_ctrlqueue *ctrlq = &sc->sc_ctrlq; size_t i, act_qnum; act_qnum = sc->sc_act_nvq_pairs * 2; CLR(ifp->if_flags, IFF_RUNNING); for (i = 0; i < act_qnum; i++) { netq = &sc->sc_netqs[i]; mutex_enter(&netq->netq_lock); netq->netq_stopping = true; mutex_exit(&netq->netq_lock); } /* disable interrupts */ vioif_net_intr_disable(sc, vsc); if (sc->sc_has_ctrl) virtio_stop_vq_intr(vsc, ctrlq->ctrlq_vq); /* * only way to stop interrupt, I/O and DMA is resetting... * * NOTE: Devices based on VirtIO draft specification can not * stop interrupt completely even if virtio_stop_vq_intr() is called. */ virtio_reset(vsc); vioif_intr_barrier(); for (i = 0; i < act_qnum; i++) { netq = &sc->sc_netqs[i]; vioif_work_wait(sc->sc_txrx_workqueue, &netq->netq_work); } for (i = 0; i < sc->sc_act_nvq_pairs; i++) { netq = &sc->sc_netqs[VIOIF_NETQ_RXQID(i)]; vioif_rx_queue_clear(sc, vsc, netq); netq = &sc->sc_netqs[VIOIF_NETQ_TXQID(i)]; vioif_tx_queue_clear(sc, vsc, netq); } /* all packet processing is stopped */ for (i = 0; i < act_qnum; i++) { netq = &sc->sc_netqs[i]; mutex_enter(&netq->netq_lock); netq->netq_stopping = false; mutex_exit(&netq->netq_lock); } } static void vioif_start(struct ifnet *ifp) { struct vioif_softc *sc = ifp->if_softc; struct vioif_netqueue *txq0 = &sc->sc_netqs[VIOIF_NETQ_TXQID(0)]; #ifdef VIOIF_MPSAFE KASSERT(if_is_mpsafe(ifp)); #endif mutex_enter(&txq0->netq_lock); vioif_start_locked(ifp, txq0); mutex_exit(&txq0->netq_lock); } static inline int vioif_select_txqueue(struct ifnet *ifp, struct mbuf *m) { struct vioif_softc *sc = ifp->if_softc; u_int cpuid = cpu_index(curcpu()); return VIOIF_NETQ_TXQID(cpuid % sc->sc_act_nvq_pairs); } static int vioif_transmit(struct ifnet *ifp, struct mbuf *m) { struct vioif_softc *sc = ifp->if_softc; struct vioif_netqueue *netq; struct vioif_tx_context *txc; int qid; qid = vioif_select_txqueue(ifp, m); netq = &sc->sc_netqs[qid]; txc = netq->netq_ctx; if (__predict_false(!pcq_put(txc->txc_intrq, m))) { m_freem(m); return ENOBUFS; } net_stat_ref_t nsr = IF_STAT_GETREF(ifp); if_statadd_ref(nsr, if_obytes, m->m_pkthdr.len); if (m->m_flags & M_MCAST) if_statinc_ref(nsr, if_omcasts); IF_STAT_PUTREF(ifp); if (mutex_tryenter(&netq->netq_lock)) { vioif_transmit_locked(ifp, netq); mutex_exit(&netq->netq_lock); } return 0; } void vioif_watchdog(struct ifnet *ifp) { struct vioif_softc *sc = ifp->if_softc; struct vioif_netqueue *netq; int i; if (ISSET(ifp->if_flags, IFF_RUNNING)) { if (ISSET(ifp->if_flags, IFF_DEBUG)) { log(LOG_DEBUG, "%s: watchdog timed out\n", 
ifp->if_xname); } for (i = 0; i < sc->sc_act_nvq_pairs; i++) { netq = &sc->sc_netqs[VIOIF_NETQ_TXQID(i)]; mutex_enter(&netq->netq_lock); if (!netq->netq_running_handle) { netq->netq_running_handle = true; vioif_net_sched_handle(sc, netq); } mutex_exit(&netq->netq_lock); } } } static int vioif_ioctl(struct ifnet *ifp, u_long cmd, void *data) { int s, r; s = splnet(); r = ether_ioctl(ifp, cmd, data); if (r == ENETRESET && (cmd == SIOCADDMULTI || cmd == SIOCDELMULTI)) { if (ifp->if_flags & IFF_RUNNING) { r = vioif_rx_filter(ifp->if_softc); } else { r = 0; } } splx(s); return r; } static int vioif_ifflags(struct vioif_softc *sc) { struct ifnet *ifp = &sc->sc_ethercom.ec_if; bool onoff; int r; if (!sc->sc_has_ctrl) { /* no ctrl vq; always promisc and allmulti */ ifp->if_flags |= (IFF_PROMISC | IFF_ALLMULTI); return 0; } onoff = ifp->if_flags & IFF_ALLMULTI ? true : false; r = vioif_set_allmulti(sc, onoff); if (r != 0) { log(LOG_WARNING, "%s: couldn't %sable ALLMULTI\n", ifp->if_xname, onoff ? "en" : "dis"); if (onoff) { CLR(ifp->if_flags, IFF_ALLMULTI); } else { SET(ifp->if_flags, IFF_ALLMULTI); } } onoff = ifp->if_flags & IFF_PROMISC ? true : false; r = vioif_set_promisc(sc, onoff); if (r != 0) { log(LOG_WARNING, "%s: couldn't %sable PROMISC\n", ifp->if_xname, onoff ? "en" : "dis"); if (onoff) { CLR(ifp->if_flags, IFF_PROMISC); } else { SET(ifp->if_flags, IFF_PROMISC); } } return 0; } static int vioif_ifflags_cb(struct ethercom *ec) { struct ifnet *ifp = &ec->ec_if; struct vioif_softc *sc = ifp->if_softc; return vioif_ifflags(sc); } static int vioif_setup_sysctl(struct vioif_softc *sc) { const char *devname; struct sysctllog **log; const struct sysctlnode *rnode, *rxnode, *txnode; int error; log = &sc->sc_sysctllog; devname = device_xname(sc->sc_dev); error = sysctl_createv(log, 0, NULL, &rnode, 0, CTLTYPE_NODE, devname, SYSCTL_DESCR("virtio-net information and settings"), NULL, 0, NULL, 0, CTL_HW, CTL_CREATE, CTL_EOL); if (error) goto out; error = sysctl_createv(log, 0, &rnode, NULL, CTLFLAG_READWRITE, CTLTYPE_BOOL, "txrx_workqueue", SYSCTL_DESCR("Use workqueue for packet processing"), NULL, 0, &sc->sc_txrx_workqueue_sysctl, 0, CTL_CREATE, CTL_EOL); if (error) goto out; error = sysctl_createv(log, 0, &rnode, &rxnode, 0, CTLTYPE_NODE, "rx", SYSCTL_DESCR("virtio-net information and settings for Rx"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); if (error) goto out; error = sysctl_createv(log, 0, &rxnode, NULL, CTLFLAG_READWRITE, CTLTYPE_INT, "intr_process_limit", SYSCTL_DESCR("max number of Rx packets to process for interrupt processing"), NULL, 0, &sc->sc_rx_intr_process_limit, 0, CTL_CREATE, CTL_EOL); if (error) goto out; error = sysctl_createv(log, 0, &rxnode, NULL, CTLFLAG_READWRITE, CTLTYPE_INT, "process_limit", SYSCTL_DESCR("max number of Rx packets to process for deferred processing"), NULL, 0, &sc->sc_rx_process_limit, 0, CTL_CREATE, CTL_EOL); if (error) goto out; error = sysctl_createv(log, 0, &rnode, &txnode, 0, CTLTYPE_NODE, "tx", SYSCTL_DESCR("virtio-net information and settings for Tx"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); if (error) goto out; error = sysctl_createv(log, 0, &txnode, NULL, CTLFLAG_READWRITE, CTLTYPE_INT, "intr_process_limit", SYSCTL_DESCR("max number of Tx packets to process for interrupt processing"), NULL, 0, &sc->sc_tx_intr_process_limit, 0, CTL_CREATE, CTL_EOL); if (error) goto out; error = sysctl_createv(log, 0, &txnode, NULL, CTLFLAG_READWRITE, CTLTYPE_INT, "process_limit", SYSCTL_DESCR("max number of Tx packets to process for deferred processing"), NULL, 0, 
&sc->sc_tx_process_limit, 0, CTL_CREATE, CTL_EOL); out: if (error) sysctl_teardown(log); return error; } static void vioif_setup_stats(struct vioif_softc *sc) { struct vioif_netqueue *netq; struct vioif_tx_context *txc; struct vioif_rx_context *rxc; size_t i, netq_num; netq_num = sc->sc_max_nvq_pairs * 2; for (i = 0; i < netq_num; i++) { netq = &sc->sc_netqs[i]; evcnt_attach_dynamic(&netq->netq_mbuf_load_failed, EVCNT_TYPE_MISC, NULL, netq->netq_evgroup, "failed to load mbuf to DMA"); evcnt_attach_dynamic(&netq->netq_enqueue_failed, EVCNT_TYPE_MISC, NULL, netq->netq_evgroup, "virtqueue enqueue failed failed"); switch (VIOIF_NETQ_DIR(i)) { case VIOIF_NETQ_RX: rxc = netq->netq_ctx; evcnt_attach_dynamic(&rxc->rxc_mbuf_enobufs, EVCNT_TYPE_MISC, NULL, netq->netq_evgroup, "no receive buffer"); break; case VIOIF_NETQ_TX: txc = netq->netq_ctx; evcnt_attach_dynamic(&txc->txc_defrag_failed, EVCNT_TYPE_MISC, NULL, netq->netq_evgroup, "m_defrag() failed"); break; } } evcnt_attach_dynamic(&sc->sc_ctrlq.ctrlq_cmd_load_failed, EVCNT_TYPE_MISC, NULL, device_xname(sc->sc_dev), "control command dmamap load failed"); evcnt_attach_dynamic(&sc->sc_ctrlq.ctrlq_cmd_failed, EVCNT_TYPE_MISC, NULL, device_xname(sc->sc_dev), "control command failed"); } /* * allocate memory */ static int vioif_dmamap_create(struct vioif_softc *sc, bus_dmamap_t *map, bus_size_t size, int nsegs, const char *usage) { int r; r = bus_dmamap_create(virtio_dmat(sc->sc_virtio), size, nsegs, size, 0, BUS_DMA_NOWAIT | BUS_DMA_ALLOCNOW, map); if (r != 0) { aprint_error_dev(sc->sc_dev, "%s dmamap creation failed, " "error code %d\n", usage, r); } return r; } static void vioif_dmamap_destroy(struct vioif_softc *sc, bus_dmamap_t *map) { if (*map) { bus_dmamap_destroy(virtio_dmat(sc->sc_virtio), *map); *map = NULL; } } static int vioif_dmamap_create_load(struct vioif_softc *sc, bus_dmamap_t *map, void *buf, bus_size_t size, int nsegs, int rw, const char *usage) { int r; r = vioif_dmamap_create(sc, map, size, nsegs, usage); if (r != 0) return 1; r = bus_dmamap_load(virtio_dmat(sc->sc_virtio), *map, buf, size, NULL, rw | BUS_DMA_NOWAIT); if (r != 0) { vioif_dmamap_destroy(sc, map); aprint_error_dev(sc->sc_dev, "%s dmamap load failed. " "error code %d\n", usage, r); } return r; } static void * vioif_assign_mem(intptr_t *p, size_t size) { intptr_t rv; rv = *p; *p += size; return (void *)rv; } /* * dma memory is used for: * netq_maps_kva: metadata array for received frames (READ) and * sent frames (WRITE) * ctrlq_cmd: command to be sent via ctrl vq (WRITE) * ctrlq_status: return value for a command via ctrl vq (READ) * ctrlq_rx: parameter for a VIRTIO_NET_CTRL_RX class command * (WRITE) * ctrlq_mac_tbl_uc: unicast MAC address filter for a VIRTIO_NET_CTRL_MAC * class command (WRITE) * ctrlq_mac_tbl_mc: multicast MAC address filter for a VIRTIO_NET_CTRL_MAC * class command (WRITE) * ctrlq_* structures are allocated only one each; they are protected by * ctrlq_inuse variable and ctrlq_wait condvar. 
*/ static int vioif_alloc_mems(struct vioif_softc *sc) { struct virtio_softc *vsc = sc->sc_virtio; struct vioif_netqueue *netq; struct vioif_ctrlqueue *ctrlq = &sc->sc_ctrlq; struct vioif_net_map *maps; unsigned int vq_num; int r, rsegs; bus_size_t dmamemsize; size_t qid, i, netq_num, kmemsize; void *vaddr; intptr_t p; netq_num = sc->sc_max_nvq_pairs * 2; /* allocate DMA memory */ dmamemsize = 0; for (qid = 0; qid < netq_num; qid++) { maps = sc->sc_netqs[qid].netq_maps; vq_num = sc->sc_netqs[qid].netq_vq->vq_num; dmamemsize += sizeof(*maps[0].vnm_hdr) * vq_num; } if (sc->sc_has_ctrl) { dmamemsize += sizeof(struct virtio_net_ctrl_cmd); dmamemsize += sizeof(struct virtio_net_ctrl_status); dmamemsize += sizeof(struct virtio_net_ctrl_rx); dmamemsize += sizeof(struct virtio_net_ctrl_mac_tbl) + ETHER_ADDR_LEN; dmamemsize += sizeof(struct virtio_net_ctrl_mac_tbl) + ETHER_ADDR_LEN * VIRTIO_NET_CTRL_MAC_MAXENTRIES; dmamemsize += sizeof(struct virtio_net_ctrl_mac_addr); dmamemsize += sizeof(struct virtio_net_ctrl_mq); } r = bus_dmamem_alloc(virtio_dmat(vsc), dmamemsize, 0, 0, &sc->sc_segs[0], 1, &rsegs, BUS_DMA_NOWAIT); if (r != 0) { aprint_error_dev(sc->sc_dev, "DMA memory allocation failed, size %" PRIuBUSSIZE ", " "error code %d\n", dmamemsize, r); goto err_none; } r = bus_dmamem_map(virtio_dmat(vsc), &sc->sc_segs[0], 1, dmamemsize, &vaddr, BUS_DMA_NOWAIT); if (r != 0) { aprint_error_dev(sc->sc_dev, "DMA memory map failed, error code %d\n", r); goto err_dmamem_alloc; } /* assign DMA memory */ memset(vaddr, 0, dmamemsize); sc->sc_dmamem = vaddr; p = (intptr_t) vaddr; for (qid = 0; qid < netq_num; qid++) { netq = &sc->sc_netqs[qid]; maps = netq->netq_maps; vq_num = netq->netq_vq->vq_num; netq->netq_maps_kva = vioif_assign_mem(&p, sizeof(*maps[0].vnm_hdr) * vq_num); } if (sc->sc_has_ctrl) { ctrlq->ctrlq_cmd = vioif_assign_mem(&p, sizeof(*ctrlq->ctrlq_cmd)); ctrlq->ctrlq_status = vioif_assign_mem(&p, sizeof(*ctrlq->ctrlq_status)); ctrlq->ctrlq_rx = vioif_assign_mem(&p, sizeof(*ctrlq->ctrlq_rx)); ctrlq->ctrlq_mac_tbl_uc = vioif_assign_mem(&p, sizeof(*ctrlq->ctrlq_mac_tbl_uc) + ETHER_ADDR_LEN); ctrlq->ctrlq_mac_tbl_mc = vioif_assign_mem(&p, sizeof(*ctrlq->ctrlq_mac_tbl_mc) + ETHER_ADDR_LEN * VIRTIO_NET_CTRL_MAC_MAXENTRIES); ctrlq->ctrlq_mac_addr = vioif_assign_mem(&p, sizeof(*ctrlq->ctrlq_mac_addr)); ctrlq->ctrlq_mq = vioif_assign_mem(&p, sizeof(*ctrlq->ctrlq_mq)); } /* allocate kmem */ kmemsize = 0; for (qid = 0; qid < netq_num; qid++) { netq = &sc->sc_netqs[qid]; vq_num = netq->netq_vq->vq_num; kmemsize += sizeof(netq->netq_maps[0]) * vq_num; } vaddr = kmem_zalloc(kmemsize, KM_SLEEP); sc->sc_kmem = vaddr; /* assign allocated kmem */ p = (intptr_t) vaddr; for (qid = 0; qid < netq_num; qid++) { netq = &sc->sc_netqs[qid]; vq_num = netq->netq_vq->vq_num; netq->netq_maps = vioif_assign_mem(&p, sizeof(netq->netq_maps[0]) * vq_num); } /* prepare dmamaps */ for (qid = 0; qid < netq_num; qid++) { static const struct { const char *msg_hdr; const char *msg_payload; int dma_flag; bus_size_t dma_size; int dma_nsegs; } dmaparams[VIOIF_NETQ_IDX] = { [VIOIF_NETQ_RX] = { .msg_hdr = "rx header", .msg_payload = "rx payload", .dma_flag = BUS_DMA_READ, .dma_size = MCLBYTES - ETHER_ALIGN, .dma_nsegs = 1, }, [VIOIF_NETQ_TX] = { .msg_hdr = "tx header", .msg_payload = "tx payload", .dma_flag = BUS_DMA_WRITE, .dma_size = ETHER_MAX_LEN, .dma_nsegs = VIRTIO_NET_TX_MAXNSEGS, } }; struct virtio_net_hdr *hdrs; int dir; int nsegs; dir = VIOIF_NETQ_DIR(qid); netq = &sc->sc_netqs[qid]; vq_num = netq->netq_vq->vq_num; maps = 
netq->netq_maps; hdrs = netq->netq_maps_kva; nsegs = uimin(dmaparams[dir].dma_nsegs, vq_num - 1/*hdr*/); for (i = 0; i < vq_num; i++) { maps[i].vnm_hdr = &hdrs[i]; r = vioif_dmamap_create_load(sc, &maps[i].vnm_hdr_map, maps[i].vnm_hdr, sc->sc_hdr_size, 1, dmaparams[dir].dma_flag, dmaparams[dir].msg_hdr); if (r != 0) goto err_reqs; r = vioif_dmamap_create(sc, &maps[i].vnm_mbuf_map, dmaparams[dir].dma_size, nsegs, dmaparams[dir].msg_payload); if (r != 0) goto err_reqs; } } if (sc->sc_has_ctrl) { /* control vq class & command */ r = vioif_dmamap_create_load(sc, &ctrlq->ctrlq_cmd_dmamap, ctrlq->ctrlq_cmd, sizeof(*ctrlq->ctrlq_cmd), 1, BUS_DMA_WRITE, "control command"); if (r != 0) goto err_reqs; r = vioif_dmamap_create_load(sc, &ctrlq->ctrlq_status_dmamap, ctrlq->ctrlq_status, sizeof(*ctrlq->ctrlq_status), 1, BUS_DMA_READ, "control status"); if (r != 0) goto err_reqs; /* control vq rx mode command parameter */ r = vioif_dmamap_create_load(sc, &ctrlq->ctrlq_rx_dmamap, ctrlq->ctrlq_rx, sizeof(*ctrlq->ctrlq_rx), 1, BUS_DMA_WRITE, "rx mode control command"); if (r != 0) goto err_reqs; /* multiqueue set command */ r = vioif_dmamap_create_load(sc, &ctrlq->ctrlq_mq_dmamap, ctrlq->ctrlq_mq, sizeof(*ctrlq->ctrlq_mq), 1, BUS_DMA_WRITE, "multiqueue set command"); if (r != 0) goto err_reqs; /* control vq MAC filter table for unicast */ /* do not load now since its length is variable */ r = vioif_dmamap_create(sc, &ctrlq->ctrlq_tbl_uc_dmamap, sizeof(*ctrlq->ctrlq_mac_tbl_uc) + ETHER_ADDR_LEN, 1, "unicast MAC address filter command"); if (r != 0) goto err_reqs; /* control vq MAC filter table for multicast */ r = vioif_dmamap_create(sc, &ctrlq->ctrlq_tbl_mc_dmamap, sizeof(*ctrlq->ctrlq_mac_tbl_mc) + ETHER_ADDR_LEN * VIRTIO_NET_CTRL_MAC_MAXENTRIES, 1, "multicast MAC address filter command"); if (r != 0) goto err_reqs; /* control vq MAC address set command */ r = vioif_dmamap_create_load(sc, &ctrlq->ctrlq_mac_addr_dmamap, ctrlq->ctrlq_mac_addr, sizeof(*ctrlq->ctrlq_mac_addr), 1, BUS_DMA_WRITE, "mac addr set command"); if (r != 0) goto err_reqs; } return 0; err_reqs: vioif_dmamap_destroy(sc, &ctrlq->ctrlq_tbl_mc_dmamap); vioif_dmamap_destroy(sc, &ctrlq->ctrlq_tbl_uc_dmamap); vioif_dmamap_destroy(sc, &ctrlq->ctrlq_rx_dmamap); vioif_dmamap_destroy(sc, &ctrlq->ctrlq_status_dmamap); vioif_dmamap_destroy(sc, &ctrlq->ctrlq_cmd_dmamap); vioif_dmamap_destroy(sc, &ctrlq->ctrlq_mac_addr_dmamap); for (qid = 0; qid < netq_num; qid++) { vq_num = sc->sc_netqs[qid].netq_vq->vq_num; maps = sc->sc_netqs[qid].netq_maps; for (i = 0; i < vq_num; i++) { vioif_dmamap_destroy(sc, &maps[i].vnm_mbuf_map); vioif_dmamap_destroy(sc, &maps[i].vnm_hdr_map); } } if (sc->sc_kmem) { kmem_free(sc->sc_kmem, kmemsize); sc->sc_kmem = NULL; } bus_dmamem_unmap(virtio_dmat(vsc), sc->sc_dmamem, dmamemsize); err_dmamem_alloc: bus_dmamem_free(virtio_dmat(vsc), &sc->sc_segs[0], 1); err_none: return -1; } static void vioif_alloc_queues(struct vioif_softc *sc) { int nvq_pairs = sc->sc_max_nvq_pairs; size_t nvqs, netq_num; KASSERT(nvq_pairs <= VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX); nvqs = netq_num = sc->sc_max_nvq_pairs * 2; if (sc->sc_has_ctrl) nvqs++; sc->sc_vqs = kmem_zalloc(sizeof(sc->sc_vqs[0]) * nvqs, KM_SLEEP); sc->sc_netqs = kmem_zalloc(sizeof(sc->sc_netqs[0]) * netq_num, KM_SLEEP); } static void vioif_free_queues(struct vioif_softc *sc) { size_t nvqs, netq_num; nvqs = netq_num = sc->sc_max_nvq_pairs * 2; if (sc->sc_ctrlq.ctrlq_vq) nvqs++; kmem_free(sc->sc_netqs, sizeof(sc->sc_netqs[0]) * netq_num); kmem_free(sc->sc_vqs, sizeof(sc->sc_vqs[0]) * nvqs); 
sc->sc_netqs = NULL; sc->sc_vqs = NULL; } /* * Network queues */ static int vioif_netqueue_init(struct vioif_softc *sc, struct virtio_softc *vsc, size_t qid, u_int softint_flags) { static const struct { const char *dirname; int segsize; int nsegs; int (*intrhand)(void *); void (*sihand)(void *); } params[VIOIF_NETQ_IDX] = { [VIOIF_NETQ_RX] = { .dirname = "rx", .segsize = MCLBYTES, .nsegs = 2, .intrhand = vioif_rx_intr, .sihand = vioif_rx_handle, }, [VIOIF_NETQ_TX] = { .dirname = "tx", .segsize = ETHER_MAX_LEN - ETHER_HDR_LEN, .nsegs = 2, .intrhand = vioif_tx_intr, .sihand = vioif_tx_handle, } }; struct virtqueue *vq; struct vioif_netqueue *netq; struct vioif_tx_context *txc; struct vioif_rx_context *rxc; char qname[32]; int r, dir; txc = NULL; rxc = NULL; netq = &sc->sc_netqs[qid]; vq = &sc->sc_vqs[qid]; dir = VIOIF_NETQ_DIR(qid); netq->netq_vq = &sc->sc_vqs[qid]; netq->netq_stopping = false; netq->netq_running_handle = false; snprintf(qname, sizeof(qname), "%s%zu", params[dir].dirname, VIOIF_NETQ_PAIRIDX(qid)); snprintf(netq->netq_evgroup, sizeof(netq->netq_evgroup), "%s-%s", device_xname(sc->sc_dev), qname); mutex_init(&netq->netq_lock, MUTEX_DEFAULT, IPL_NET); virtio_init_vq(vsc, vq, qid, params[dir].intrhand, netq); r = virtio_alloc_vq(vsc, vq, params[dir].segsize + sc->sc_hdr_size, params[dir].nsegs, qname); if (r != 0) goto err; netq->netq_vq = vq; netq->netq_softint = softint_establish(softint_flags, params[dir].sihand, netq); if (netq->netq_softint == NULL) { aprint_error_dev(sc->sc_dev, "couldn't establish %s softint\n", params[dir].dirname); goto err; } vioif_work_set(&netq->netq_work, params[dir].sihand, netq); switch (dir) { case VIOIF_NETQ_RX: rxc = kmem_zalloc(sizeof(*rxc), KM_SLEEP); netq->netq_ctx = rxc; /* nothing to do */ break; case VIOIF_NETQ_TX: txc = kmem_zalloc(sizeof(*txc), KM_SLEEP); netq->netq_ctx = (void *)txc; txc->txc_deferred_transmit = softint_establish(softint_flags, vioif_deferred_transmit, netq); if (txc->txc_deferred_transmit == NULL) { aprint_error_dev(sc->sc_dev, "couldn't establish softint for " "tx deferred transmit\n"); goto err; } txc->txc_link_active = VIOIF_IS_LINK_ACTIVE(sc); txc->txc_no_free_slots = false; txc->txc_intrq = pcq_create(vq->vq_num, KM_SLEEP); break; } return 0; err: netq->netq_ctx = NULL; if (rxc != NULL) { kmem_free(rxc, sizeof(*rxc)); } if (txc != NULL) { if (txc->txc_deferred_transmit != NULL) softint_disestablish(txc->txc_deferred_transmit); if (txc->txc_intrq != NULL) pcq_destroy(txc->txc_intrq); kmem_free(txc, sizeof(txc)); } vioif_work_set(&netq->netq_work, NULL, NULL); if (netq->netq_softint != NULL) { softint_disestablish(netq->netq_softint); netq->netq_softint = NULL; } virtio_free_vq(vsc, vq); mutex_destroy(&netq->netq_lock); netq->netq_vq = NULL; return -1; } static void vioif_netqueue_teardown(struct vioif_softc *sc, struct virtio_softc *vsc, size_t qid) { struct vioif_netqueue *netq; struct vioif_rx_context *rxc; struct vioif_tx_context *txc; int dir; netq = &sc->sc_netqs[qid]; if (netq->netq_vq == NULL) return; netq = &sc->sc_netqs[qid]; dir = VIOIF_NETQ_DIR(qid); switch (dir) { case VIOIF_NETQ_RX: rxc = netq->netq_ctx; netq->netq_ctx = NULL; kmem_free(rxc, sizeof(*rxc)); break; case VIOIF_NETQ_TX: txc = netq->netq_ctx; netq->netq_ctx = NULL; softint_disestablish(txc->txc_deferred_transmit); pcq_destroy(txc->txc_intrq); kmem_free(txc, sizeof(*txc)); break; } softint_disestablish(netq->netq_softint); virtio_free_vq(vsc, netq->netq_vq); mutex_destroy(&netq->netq_lock); netq->netq_vq = NULL; } static void 
vioif_net_sched_handle(struct vioif_softc *sc, struct vioif_netqueue *netq) { KASSERT(mutex_owned(&netq->netq_lock)); KASSERT(!netq->netq_stopping); if (netq->netq_workqueue) { vioif_work_add(sc->sc_txrx_workqueue, &netq->netq_work); } else { softint_schedule(netq->netq_softint); } } static int vioif_net_load_mbuf(struct virtio_softc *vsc, struct vioif_net_map *map, struct mbuf *m, int dma_flags) { int r; KASSERT(map->vnm_mbuf == NULL); r = bus_dmamap_load_mbuf(virtio_dmat(vsc), map->vnm_mbuf_map, m, dma_flags | BUS_DMA_NOWAIT); if (r == 0) { map->vnm_mbuf = m; } return r; } static void vioif_net_unload_mbuf(struct virtio_softc *vsc, struct vioif_net_map *map) { KASSERT(map->vnm_mbuf != NULL); bus_dmamap_unload(virtio_dmat(vsc), map->vnm_mbuf_map); map->vnm_mbuf = NULL; } static int vioif_net_enqueue(struct virtio_softc *vsc, struct virtqueue *vq, int slot, struct vioif_net_map *map, int dma_ops, bool is_write) { int r; KASSERT(map->vnm_mbuf != NULL); /* This should actually never fail */ r = virtio_enqueue_reserve(vsc, vq, slot, map->vnm_mbuf_map->dm_nsegs + 1); if (r != 0) { /* slot already freed by virtio_enqueue_reserve */ return r; } bus_dmamap_sync(virtio_dmat(vsc), map->vnm_mbuf_map, 0, map->vnm_mbuf_map->dm_mapsize, dma_ops); bus_dmamap_sync(virtio_dmat(vsc), map->vnm_hdr_map, 0, map->vnm_hdr_map->dm_mapsize, dma_ops); virtio_enqueue(vsc, vq, slot, map->vnm_hdr_map, is_write); virtio_enqueue(vsc, vq, slot, map->vnm_mbuf_map, is_write); virtio_enqueue_commit(vsc, vq, slot, false); return 0; } static int vioif_net_enqueue_tx(struct virtio_softc *vsc, struct virtqueue *vq, int slot, struct vioif_net_map *map) { return vioif_net_enqueue(vsc, vq, slot, map, BUS_DMASYNC_PREWRITE, true); } static int vioif_net_enqueue_rx(struct virtio_softc *vsc, struct virtqueue *vq, int slot, struct vioif_net_map *map) { return vioif_net_enqueue(vsc, vq, slot, map, BUS_DMASYNC_PREREAD, false); } static struct mbuf * vioif_net_dequeue_commit(struct virtio_softc *vsc, struct virtqueue *vq, int slot, struct vioif_net_map *map, int dma_flags) { struct mbuf *m; m = map->vnm_mbuf; KASSERT(m != NULL); map->vnm_mbuf = NULL; bus_dmamap_sync(virtio_dmat(vsc), map->vnm_hdr_map, 0, map->vnm_hdr_map->dm_mapsize, dma_flags); bus_dmamap_sync(virtio_dmat(vsc), map->vnm_mbuf_map, 0, map->vnm_mbuf_map->dm_mapsize, dma_flags); bus_dmamap_unload(virtio_dmat(vsc), map->vnm_mbuf_map); virtio_dequeue_commit(vsc, vq, slot); return m; } static void vioif_net_intr_enable(struct vioif_softc *sc, struct virtio_softc *vsc) { struct vioif_netqueue *netq; size_t i, act_qnum; int enqueued; act_qnum = sc->sc_act_nvq_pairs * 2; for (i = 0; i < act_qnum; i++) { netq = &sc->sc_netqs[i]; KASSERT(!netq->netq_stopping); KASSERT(!netq->netq_running_handle); enqueued = virtio_start_vq_intr(vsc, netq->netq_vq); if (enqueued != 0) { virtio_stop_vq_intr(vsc, netq->netq_vq); mutex_enter(&netq->netq_lock); netq->netq_running_handle = true; vioif_net_sched_handle(sc, netq); mutex_exit(&netq->netq_lock); } } } static void vioif_net_intr_disable(struct vioif_softc *sc, struct virtio_softc *vsc) { struct vioif_netqueue *netq; size_t i, act_qnum; act_qnum = sc->sc_act_nvq_pairs * 2; for (i = 0; i < act_qnum; i++) { netq = &sc->sc_netqs[i]; virtio_stop_vq_intr(vsc, netq->netq_vq); } } /* * Receive implementation */ /* enqueue mbufs to receive slots */ static void vioif_populate_rx_mbufs_locked(struct vioif_softc *sc, struct vioif_netqueue *netq) { struct virtqueue *vq = netq->netq_vq; struct virtio_softc *vsc = vq->vq_owner; struct vioif_rx_context *rxc; 
struct vioif_net_map *map; struct mbuf *m; int i, r, ndone = 0; KASSERT(mutex_owned(&netq->netq_lock)); rxc = netq->netq_ctx; for (i = 0; i < vq->vq_num; i++) { int slot; r = virtio_enqueue_prep(vsc, vq, &slot); if (r == EAGAIN) break; if (__predict_false(r != 0)) panic("enqueue_prep for rx buffers"); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { virtio_enqueue_abort(vsc, vq, slot); rxc->rxc_mbuf_enobufs.ev_count++; break; } MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { virtio_enqueue_abort(vsc, vq, slot); m_freem(m); rxc->rxc_mbuf_enobufs.ev_count++; break; } m->m_len = m->m_pkthdr.len = MCLBYTES; m_adj(m, ETHER_ALIGN); map = &netq->netq_maps[slot]; r = vioif_net_load_mbuf(vsc, map, m, BUS_DMA_READ); if (r != 0) { virtio_enqueue_abort(vsc, vq, slot); m_freem(m); netq->netq_mbuf_load_failed.ev_count++; break; } r = vioif_net_enqueue_rx(vsc, vq, slot, map); if (r != 0) { vioif_net_unload_mbuf(vsc, map); netq->netq_enqueue_failed.ev_count++; m_freem(m); /* slot already freed by vioif_net_enqueue_rx */ break; } ndone++; } if (ndone > 0) vioif_notify(vsc, vq); } /* dequeue received packets */ static bool vioif_rx_deq_locked(struct vioif_softc *sc, struct virtio_softc *vsc, struct vioif_netqueue *netq, u_int limit, size_t *ndeqp) { struct virtqueue *vq = netq->netq_vq; struct ifnet *ifp = &sc->sc_ethercom.ec_if; struct vioif_net_map *map; struct mbuf *m; int slot, len; bool more; size_t ndeq; KASSERT(mutex_owned(&netq->netq_lock)); more = false; ndeq = 0; if (virtio_vq_is_enqueued(vsc, vq) == false) goto done; for (;;ndeq++) { if (ndeq >= limit) { more = true; break; } if (virtio_dequeue(vsc, vq, &slot, &len) != 0) break; map = &netq->netq_maps[slot]; KASSERT(map->vnm_mbuf != NULL); m = vioif_net_dequeue_commit(vsc, vq, slot, map, BUS_DMASYNC_POSTREAD); KASSERT(m != NULL); m->m_len = m->m_pkthdr.len = len - sc->sc_hdr_size; m_set_rcvif(m, ifp); if_percpuq_enqueue(ifp->if_percpuq, m); } done: if (ndeqp != NULL) *ndeqp = ndeq; return more; } static void vioif_rx_queue_clear(struct vioif_softc *sc, struct virtio_softc *vsc, struct vioif_netqueue *netq) { struct vioif_net_map *map; struct mbuf *m; unsigned int i, vq_num; bool more; mutex_enter(&netq->netq_lock); vq_num = netq->netq_vq->vq_num; for (;;) { more = vioif_rx_deq_locked(sc, vsc, netq, vq_num, NULL); if (more == false) break; } for (i = 0; i < vq_num; i++) { map = &netq->netq_maps[i]; m = map->vnm_mbuf; if (m == NULL) continue; vioif_net_unload_mbuf(vsc, map); m_freem(m); } mutex_exit(&netq->netq_lock); } static void vioif_rx_handle_locked(void *xnetq, u_int limit) { struct vioif_netqueue *netq = xnetq; struct virtqueue *vq = netq->netq_vq; struct virtio_softc *vsc = vq->vq_owner; struct vioif_softc *sc = device_private(virtio_child(vsc)); bool more; int enqueued; size_t ndeq; KASSERT(mutex_owned(&netq->netq_lock)); KASSERT(!netq->netq_stopping); more = vioif_rx_deq_locked(sc, vsc, netq, limit, &ndeq); if (ndeq > 0) vioif_populate_rx_mbufs_locked(sc, netq); if (more) { vioif_net_sched_handle(sc, netq); return; } enqueued = virtio_start_vq_intr(vsc, netq->netq_vq); if (enqueued != 0) { virtio_stop_vq_intr(vsc, netq->netq_vq); vioif_net_sched_handle(sc, netq); return; } netq->netq_running_handle = false; } static int vioif_rx_intr(void *arg) { struct vioif_netqueue *netq = arg; struct virtqueue *vq = netq->netq_vq; struct virtio_softc *vsc = vq->vq_owner; struct vioif_softc *sc = device_private(virtio_child(vsc)); u_int limit; mutex_enter(&netq->netq_lock); /* handler is already running in softint/workqueue */ if 
(netq->netq_running_handle) goto done; if (netq->netq_stopping) goto done; netq->netq_running_handle = true; limit = sc->sc_rx_intr_process_limit; virtio_stop_vq_intr(vsc, vq); vioif_rx_handle_locked(netq, limit); done: mutex_exit(&netq->netq_lock); return 1; } static void vioif_rx_handle(void *xnetq) { struct vioif_netqueue *netq = xnetq; struct virtqueue *vq = netq->netq_vq; struct virtio_softc *vsc = vq->vq_owner; struct vioif_softc *sc = device_private(virtio_child(vsc)); u_int limit; mutex_enter(&netq->netq_lock); KASSERT(netq->netq_running_handle); if (netq->netq_stopping) { netq->netq_running_handle = false; goto done; } limit = sc->sc_rx_process_limit; vioif_rx_handle_locked(netq, limit); done: mutex_exit(&netq->netq_lock); } /* * Transmission implementation */ /* enqueue mbufs to send */ static void vioif_send_common_locked(struct ifnet *ifp, struct vioif_netqueue *netq, bool is_transmit) { struct vioif_softc *sc = ifp->if_softc; struct virtio_softc *vsc = sc->sc_virtio; struct virtqueue *vq = netq->netq_vq; struct vioif_tx_context *txc; struct vioif_net_map *map; struct mbuf *m; int queued = 0; KASSERT(mutex_owned(&netq->netq_lock)); if (netq->netq_stopping || !ISSET(ifp->if_flags, IFF_RUNNING)) return; txc = netq->netq_ctx; if (!txc->txc_link_active || txc->txc_no_free_slots) return; for (;;) { int slot, r; r = virtio_enqueue_prep(vsc, vq, &slot); if (r == EAGAIN) { txc->txc_no_free_slots = true; break; } if (__predict_false(r != 0)) panic("enqueue_prep for tx buffers"); if (is_transmit) m = pcq_get(txc->txc_intrq); else IFQ_DEQUEUE(&ifp->if_snd, m); if (m == NULL) { virtio_enqueue_abort(vsc, vq, slot); break; } map = &netq->netq_maps[slot]; KASSERT(map->vnm_mbuf == NULL); r = vioif_net_load_mbuf(vsc, map, m, BUS_DMA_WRITE); if (r != 0) { /* maybe just too fragmented */ struct mbuf *newm; newm = m_defrag(m, M_NOWAIT); if (newm != NULL) { m = newm; r = vioif_net_load_mbuf(vsc, map, m, BUS_DMA_WRITE); } else { txc->txc_defrag_failed.ev_count++; r = -1; } if (r != 0) { netq->netq_mbuf_load_failed.ev_count++; m_freem(m); if_statinc(ifp, if_oerrors); virtio_enqueue_abort(vsc, vq, slot); continue; } } memset(map->vnm_hdr, 0, sc->sc_hdr_size); r = vioif_net_enqueue_tx(vsc, vq, slot, map); if (r != 0) { netq->netq_enqueue_failed.ev_count++; vioif_net_unload_mbuf(vsc, map); m_freem(m); /* slot already freed by vioif_net_enqueue_tx */ if_statinc(ifp, if_oerrors); continue; } queued++; bpf_mtap(ifp, m, BPF_D_OUT); } if (queued > 0) { vioif_notify(vsc, vq); ifp->if_timer = 5; } } /* dequeue sent mbufs */ static bool vioif_tx_deq_locked(struct vioif_softc *sc, struct virtio_softc *vsc, struct vioif_netqueue *netq, u_int limit, size_t *ndeqp) { struct virtqueue *vq = netq->netq_vq; struct ifnet *ifp = &sc->sc_ethercom.ec_if; struct vioif_net_map *map; struct mbuf *m; int slot, len; bool more; size_t ndeq; KASSERT(mutex_owned(&netq->netq_lock)); more = false; ndeq = 0; if (virtio_vq_is_enqueued(vsc, vq) == false) goto done; for (;;ndeq++) { if (limit-- == 0) { more = true; break; } if (virtio_dequeue(vsc, vq, &slot, &len) != 0) break; map = &netq->netq_maps[slot]; KASSERT(map->vnm_mbuf != NULL); m = vioif_net_dequeue_commit(vsc, vq, slot, map, BUS_DMASYNC_POSTWRITE); KASSERT(m != NULL); if_statinc(ifp, if_opackets); m_freem(m); } done: if (ndeqp != NULL) *ndeqp = ndeq; return more; } static void vioif_tx_queue_clear(struct vioif_softc *sc, struct virtio_softc *vsc, struct vioif_netqueue *netq) { struct vioif_tx_context *txc; struct vioif_net_map *map; struct mbuf *m; unsigned int i, vq_num; 
bool more; mutex_enter(&netq->netq_lock); txc = netq->netq_ctx; vq_num = netq->netq_vq->vq_num; for (;;) { more = vioif_tx_deq_locked(sc, vsc, netq, vq_num, NULL); if (more == false) break; } for (i = 0; i < vq_num; i++) { map = &netq->netq_maps[i]; m = map->vnm_mbuf; if (m == NULL) continue; vioif_net_unload_mbuf(vsc, map); m_freem(m); } txc->txc_no_free_slots = false; mutex_exit(&netq->netq_lock); } static void vioif_start_locked(struct ifnet *ifp, struct vioif_netqueue *netq) { /* * ifp->if_obytes and ifp->if_omcasts are added in if_transmit()@if.c. */ vioif_send_common_locked(ifp, netq, false); } static void vioif_transmit_locked(struct ifnet *ifp, struct vioif_netqueue *netq) { vioif_send_common_locked(ifp, netq, true); } static void vioif_deferred_transmit(void *arg) { struct vioif_netqueue *netq = arg; struct virtio_softc *vsc = netq->netq_vq->vq_owner; struct vioif_softc *sc = device_private(virtio_child(vsc)); struct ifnet *ifp = &sc->sc_ethercom.ec_if; mutex_enter(&netq->netq_lock); vioif_send_common_locked(ifp, netq, true); mutex_exit(&netq->netq_lock); } static void vioif_tx_handle_locked(struct vioif_netqueue *netq, u_int limit) { struct virtqueue *vq = netq->netq_vq; struct vioif_tx_context *txc = netq->netq_ctx; struct virtio_softc *vsc = vq->vq_owner; struct vioif_softc *sc = device_private(virtio_child(vsc)); struct ifnet *ifp = &sc->sc_ethercom.ec_if; bool more; int enqueued; size_t ndeq; KASSERT(mutex_owned(&netq->netq_lock)); KASSERT(!netq->netq_stopping); more = vioif_tx_deq_locked(sc, vsc, netq, limit, &ndeq); if (txc->txc_no_free_slots && ndeq > 0) { txc->txc_no_free_slots = false; softint_schedule(txc->txc_deferred_transmit); } if (more) { vioif_net_sched_handle(sc, netq); return; } enqueued = (virtio_features(vsc) & VIRTIO_F_RING_EVENT_IDX) ? 
virtio_postpone_intr_smart(vsc, vq): virtio_start_vq_intr(vsc, vq); if (enqueued != 0) { virtio_stop_vq_intr(vsc, vq); vioif_net_sched_handle(sc, netq); return; } netq->netq_running_handle = false; /* for ALTQ */ if (netq == &sc->sc_netqs[VIOIF_NETQ_TXQID(0)]) if_schedule_deferred_start(ifp); softint_schedule(txc->txc_deferred_transmit); } static int vioif_tx_intr(void *arg) { struct vioif_netqueue *netq = arg; struct virtqueue *vq = netq->netq_vq; struct virtio_softc *vsc = vq->vq_owner; struct vioif_softc *sc = device_private(virtio_child(vsc)); u_int limit; mutex_enter(&netq->netq_lock); /* tx handler is already running in softint/workqueue */ if (netq->netq_running_handle) goto done; if (netq->netq_stopping) goto done; netq->netq_running_handle = true; virtio_stop_vq_intr(vsc, vq); netq->netq_workqueue = sc->sc_txrx_workqueue_sysctl; limit = sc->sc_tx_intr_process_limit; vioif_tx_handle_locked(netq, limit); done: mutex_exit(&netq->netq_lock); return 1; } static void vioif_tx_handle(void *xnetq) { struct vioif_netqueue *netq = xnetq; struct virtqueue *vq = netq->netq_vq; struct virtio_softc *vsc = vq->vq_owner; struct vioif_softc *sc = device_private(virtio_child(vsc)); u_int limit; mutex_enter(&netq->netq_lock); KASSERT(netq->netq_running_handle); if (netq->netq_stopping) { netq->netq_running_handle = false; goto done; } limit = sc->sc_tx_process_limit; vioif_tx_handle_locked(netq, limit); done: mutex_exit(&netq->netq_lock); } /* * Control vq */ /* issue a VIRTIO_NET_CTRL_RX class command and wait for completion */ static void vioif_ctrl_acquire(struct vioif_softc *sc) { struct vioif_ctrlqueue *ctrlq = &sc->sc_ctrlq; mutex_enter(&ctrlq->ctrlq_wait_lock); while (ctrlq->ctrlq_inuse != FREE) cv_wait(&ctrlq->ctrlq_wait, &ctrlq->ctrlq_wait_lock); ctrlq->ctrlq_inuse = INUSE; ctrlq->ctrlq_owner = curlwp; mutex_exit(&ctrlq->ctrlq_wait_lock); } static void vioif_ctrl_release(struct vioif_softc *sc) { struct vioif_ctrlqueue *ctrlq = &sc->sc_ctrlq; KASSERT(ctrlq->ctrlq_inuse != FREE); KASSERT(ctrlq->ctrlq_owner == curlwp); mutex_enter(&ctrlq->ctrlq_wait_lock); ctrlq->ctrlq_inuse = FREE; ctrlq->ctrlq_owner = NULL; cv_signal(&ctrlq->ctrlq_wait); mutex_exit(&ctrlq->ctrlq_wait_lock); } static int vioif_ctrl_load_cmdspec(struct vioif_softc *sc, struct vioif_ctrl_cmdspec *specs, int nspecs) { struct virtio_softc *vsc = sc->sc_virtio; int i, r, loaded; loaded = 0; for (i = 0; i < nspecs; i++) { r = bus_dmamap_load(virtio_dmat(vsc), specs[i].dmamap, specs[i].buf, specs[i].bufsize, NULL, BUS_DMA_WRITE | BUS_DMA_NOWAIT); if (r) { sc->sc_ctrlq.ctrlq_cmd_load_failed.ev_count++; goto err; } loaded++; } return r; err: for (i = 0; i < loaded; i++) { bus_dmamap_unload(virtio_dmat(vsc), specs[i].dmamap); } return r; } static void vioif_ctrl_unload_cmdspec(struct vioif_softc *sc, struct vioif_ctrl_cmdspec *specs, int nspecs) { struct virtio_softc *vsc = sc->sc_virtio; int i; for (i = 0; i < nspecs; i++) { bus_dmamap_unload(virtio_dmat(vsc), specs[i].dmamap); } } static int vioif_ctrl_send_command(struct vioif_softc *sc, uint8_t class, uint8_t cmd, struct vioif_ctrl_cmdspec *specs, int nspecs) { struct vioif_ctrlqueue *ctrlq = &sc->sc_ctrlq; struct virtqueue *vq = ctrlq->ctrlq_vq; struct virtio_softc *vsc = sc->sc_virtio; int i, r, slot; ctrlq->ctrlq_cmd->class = class; ctrlq->ctrlq_cmd->command = cmd; bus_dmamap_sync(virtio_dmat(vsc), ctrlq->ctrlq_cmd_dmamap, 0, sizeof(struct virtio_net_ctrl_cmd), BUS_DMASYNC_PREWRITE); for (i = 0; i < nspecs; i++) { bus_dmamap_sync(virtio_dmat(vsc), specs[i].dmamap, 0, 
specs[i].bufsize, BUS_DMASYNC_PREWRITE); } bus_dmamap_sync(virtio_dmat(vsc), ctrlq->ctrlq_status_dmamap, 0, sizeof(struct virtio_net_ctrl_status), BUS_DMASYNC_PREREAD); /* we need to explicitly (re)start vq intr when using RING EVENT IDX */ if (virtio_features(vsc) & VIRTIO_F_RING_EVENT_IDX) virtio_start_vq_intr(vsc, ctrlq->ctrlq_vq); r = virtio_enqueue_prep(vsc, vq, &slot); if (r != 0) panic("%s: control vq busy!?", device_xname(sc->sc_dev)); r = virtio_enqueue_reserve(vsc, vq, slot, nspecs + 2); if (r != 0) panic("%s: control vq busy!?", device_xname(sc->sc_dev)); virtio_enqueue(vsc, vq, slot, ctrlq->ctrlq_cmd_dmamap, true); for (i = 0; i < nspecs; i++) { virtio_enqueue(vsc, vq, slot, specs[i].dmamap, true); } virtio_enqueue(vsc, vq, slot, ctrlq->ctrlq_status_dmamap, false); virtio_enqueue_commit(vsc, vq, slot, true); /* wait for done */ mutex_enter(&ctrlq->ctrlq_wait_lock); while (ctrlq->ctrlq_inuse != DONE) cv_wait(&ctrlq->ctrlq_wait, &ctrlq->ctrlq_wait_lock); mutex_exit(&ctrlq->ctrlq_wait_lock); /* already dequeued */ bus_dmamap_sync(virtio_dmat(vsc), ctrlq->ctrlq_cmd_dmamap, 0, sizeof(struct virtio_net_ctrl_cmd), BUS_DMASYNC_POSTWRITE); for (i = 0; i < nspecs; i++) { bus_dmamap_sync(virtio_dmat(vsc), specs[i].dmamap, 0, specs[i].bufsize, BUS_DMASYNC_POSTWRITE); } bus_dmamap_sync(virtio_dmat(vsc), ctrlq->ctrlq_status_dmamap, 0, sizeof(struct virtio_net_ctrl_status), BUS_DMASYNC_POSTREAD); if (ctrlq->ctrlq_status->ack == VIRTIO_NET_OK) r = 0; else { device_printf(sc->sc_dev, "failed setting rx mode\n"); sc->sc_ctrlq.ctrlq_cmd_failed.ev_count++; r = EIO; } return r; } /* ctrl vq interrupt; wake up the command issuer */ static int vioif_ctrl_intr(void *arg) { struct vioif_ctrlqueue *ctrlq = arg; struct virtqueue *vq = ctrlq->ctrlq_vq; struct virtio_softc *vsc = vq->vq_owner; int r, slot; if (virtio_vq_is_enqueued(vsc, vq) == false) return 0; r = virtio_dequeue(vsc, vq, &slot, NULL); if (r == ENOENT) return 0; virtio_dequeue_commit(vsc, vq, slot); mutex_enter(&ctrlq->ctrlq_wait_lock); ctrlq->ctrlq_inuse = DONE; cv_signal(&ctrlq->ctrlq_wait); mutex_exit(&ctrlq->ctrlq_wait_lock); return 1; } static int vioif_ctrl_rx(struct vioif_softc *sc, int cmd, bool onoff) { struct virtio_net_ctrl_rx *rx = sc->sc_ctrlq.ctrlq_rx; struct vioif_ctrl_cmdspec specs[1]; int r; if (!sc->sc_has_ctrl) return ENOTSUP; vioif_ctrl_acquire(sc); rx->onoff = onoff; specs[0].dmamap = sc->sc_ctrlq.ctrlq_rx_dmamap; specs[0].buf = rx; specs[0].bufsize = sizeof(*rx); r = vioif_ctrl_send_command(sc, VIRTIO_NET_CTRL_RX, cmd, specs, __arraycount(specs)); vioif_ctrl_release(sc); return r; } static int vioif_set_promisc(struct vioif_softc *sc, bool onoff) { return vioif_ctrl_rx(sc, VIRTIO_NET_CTRL_RX_PROMISC, onoff); } static int vioif_set_allmulti(struct vioif_softc *sc, bool onoff) { return vioif_ctrl_rx(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, onoff); } static int vioif_ctrl_mq_vq_pairs_set(struct vioif_softc *sc, int nvq_pairs) { struct virtio_net_ctrl_mq *mq = sc->sc_ctrlq.ctrlq_mq; struct vioif_ctrl_cmdspec specs[1]; int r; if (!sc->sc_has_ctrl) return ENOTSUP; if (nvq_pairs <= 1) return EINVAL; vioif_ctrl_acquire(sc); mq->virtqueue_pairs = virtio_rw16(sc->sc_virtio, nvq_pairs); specs[0].dmamap = sc->sc_ctrlq.ctrlq_mq_dmamap; specs[0].buf = mq; specs[0].bufsize = sizeof(*mq); r = vioif_ctrl_send_command(sc, VIRTIO_NET_CTRL_MQ, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, specs, __arraycount(specs)); vioif_ctrl_release(sc); return r; } static int vioif_set_mac_addr(struct vioif_softc *sc) { struct virtio_net_ctrl_mac_addr *ma = 
sc->sc_ctrlq.ctrlq_mac_addr; struct vioif_ctrl_cmdspec specs[1]; struct ifnet *ifp = &sc->sc_ethercom.ec_if; int nspecs = __arraycount(specs); uint64_t features; int r; size_t i; if (!sc->sc_has_ctrl) return ENOTSUP; if (memcmp(CLLADDR(ifp->if_sadl), sc->sc_mac, ETHER_ADDR_LEN) == 0) { return 0; } memcpy(sc->sc_mac, CLLADDR(ifp->if_sadl), ETHER_ADDR_LEN); features = virtio_features(sc->sc_virtio); if (features & VIRTIO_NET_F_CTRL_MAC_ADDR) { vioif_ctrl_acquire(sc); memcpy(ma->mac, sc->sc_mac, ETHER_ADDR_LEN); specs[0].dmamap = sc->sc_ctrlq.ctrlq_mac_addr_dmamap; specs[0].buf = ma; specs[0].bufsize = sizeof(*ma); r = vioif_ctrl_send_command(sc, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_ADDR_SET, specs, nspecs); vioif_ctrl_release(sc); } else { for (i = 0; i < __arraycount(sc->sc_mac); i++) { virtio_write_device_config_1(sc->sc_virtio, VIRTIO_NET_CONFIG_MAC + i, sc->sc_mac[i]); } r = 0; } return r; } static int vioif_set_rx_filter(struct vioif_softc *sc) { /* filter already set in ctrlq->ctrlq_mac_tbl */ struct virtio_softc *vsc = sc->sc_virtio; struct virtio_net_ctrl_mac_tbl *mac_tbl_uc, *mac_tbl_mc; struct vioif_ctrl_cmdspec specs[2]; int nspecs = __arraycount(specs); int r; mac_tbl_uc = sc->sc_ctrlq.ctrlq_mac_tbl_uc; mac_tbl_mc = sc->sc_ctrlq.ctrlq_mac_tbl_mc; if (!sc->sc_has_ctrl) return ENOTSUP; vioif_ctrl_acquire(sc); specs[0].dmamap = sc->sc_ctrlq.ctrlq_tbl_uc_dmamap; specs[0].buf = mac_tbl_uc; specs[0].bufsize = sizeof(*mac_tbl_uc) + (ETHER_ADDR_LEN * virtio_rw32(vsc, mac_tbl_uc->nentries)); specs[1].dmamap = sc->sc_ctrlq.ctrlq_tbl_mc_dmamap; specs[1].buf = mac_tbl_mc; specs[1].bufsize = sizeof(*mac_tbl_mc) + (ETHER_ADDR_LEN * virtio_rw32(vsc, mac_tbl_mc->nentries)); r = vioif_ctrl_load_cmdspec(sc, specs, nspecs); if (r != 0) goto out; r = vioif_ctrl_send_command(sc, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_TABLE_SET, specs, nspecs); vioif_ctrl_unload_cmdspec(sc, specs, nspecs); out: vioif_ctrl_release(sc); return r; } /* * If multicast filter small enough (<=MAXENTRIES) set rx filter * If large multicast filter exist use ALLMULTI * If setting rx filter fails fall back to ALLMULTI */ static int vioif_rx_filter(struct vioif_softc *sc) { struct virtio_softc *vsc = sc->sc_virtio; struct ethercom *ec = &sc->sc_ethercom; struct ifnet *ifp = &ec->ec_if; struct ether_multi *enm; struct ether_multistep step; struct vioif_ctrlqueue *ctrlq = &sc->sc_ctrlq; int nentries; bool allmulti = 0; int r; if (!sc->sc_has_ctrl) { goto set_ifflags; } memcpy(ctrlq->ctrlq_mac_tbl_uc->macs[0], CLLADDR(ifp->if_sadl), ETHER_ADDR_LEN); nentries = 0; allmulti = false; ETHER_LOCK(ec); for (ETHER_FIRST_MULTI(step, ec, enm); enm != NULL; ETHER_NEXT_MULTI(step, enm)) { if (nentries >= VIRTIO_NET_CTRL_MAC_MAXENTRIES) { allmulti = true; break; } if (memcmp(enm->enm_addrlo, enm->enm_addrhi, ETHER_ADDR_LEN)) { allmulti = true; break; } memcpy(ctrlq->ctrlq_mac_tbl_mc->macs[nentries], enm->enm_addrlo, ETHER_ADDR_LEN); nentries++; } ETHER_UNLOCK(ec); r = vioif_set_mac_addr(sc); if (r != 0) { log(LOG_WARNING, "%s: couldn't set MAC address\n", ifp->if_xname); } if (!allmulti) { ctrlq->ctrlq_mac_tbl_uc->nentries = virtio_rw32(vsc, 1); ctrlq->ctrlq_mac_tbl_mc->nentries = virtio_rw32(vsc, nentries); r = vioif_set_rx_filter(sc); if (r != 0) { allmulti = true; /* fallback */ } } if (allmulti) { ctrlq->ctrlq_mac_tbl_uc->nentries = virtio_rw32(vsc, 0); ctrlq->ctrlq_mac_tbl_mc->nentries = virtio_rw32(vsc, 0); r = vioif_set_rx_filter(sc); if (r != 0) { log(LOG_DEBUG, "%s: couldn't clear RX filter\n", ifp->if_xname); /* what to do on 
failure? */ } ifp->if_flags |= IFF_ALLMULTI; } set_ifflags: r = vioif_ifflags(sc); return r; } /* * VM configuration changes */ static int vioif_config_change(struct virtio_softc *vsc) { struct vioif_softc *sc = device_private(virtio_child(vsc)); softint_schedule(sc->sc_cfg_softint); return 0; } static void vioif_cfg_softint(void *arg) { struct vioif_softc *sc = arg; struct ifnet *ifp = &sc->sc_ethercom.ec_if; vioif_update_link_status(sc); vioif_start(ifp); } static int vioif_get_link_status(struct vioif_softc *sc) { struct virtio_softc *vsc = sc->sc_virtio; uint16_t status; if (virtio_features(vsc) & VIRTIO_NET_F_STATUS) status = virtio_read_device_config_2(vsc, VIRTIO_NET_CONFIG_STATUS); else status = VIRTIO_NET_S_LINK_UP; if ((status & VIRTIO_NET_S_LINK_UP) != 0) return LINK_STATE_UP; return LINK_STATE_DOWN; } static void vioif_update_link_status(struct vioif_softc *sc) { struct ifnet *ifp = &sc->sc_ethercom.ec_if; struct vioif_netqueue *netq; struct vioif_tx_context *txc; bool active; int link, i; mutex_enter(&sc->sc_lock); link = vioif_get_link_status(sc); if (link == sc->sc_link_state) goto done; sc->sc_link_state = link; active = VIOIF_IS_LINK_ACTIVE(sc); for (i = 0; i < sc->sc_act_nvq_pairs; i++) { netq = &sc->sc_netqs[VIOIF_NETQ_TXQID(i)]; mutex_enter(&netq->netq_lock); txc = netq->netq_ctx; txc->txc_link_active = active; mutex_exit(&netq->netq_lock); } if_link_state_change(ifp, sc->sc_link_state); done: mutex_exit(&sc->sc_lock); } static void vioif_workq_work(struct work *wk, void *context) { struct vioif_work *work; work = container_of(wk, struct vioif_work, cookie); atomic_store_relaxed(&work->added, 0); work->func(work->arg); } static struct workqueue * vioif_workq_create(const char *name, pri_t prio, int ipl, int flags) { struct workqueue *wq; int error; error = workqueue_create(&wq, name, vioif_workq_work, NULL, prio, ipl, flags); if (error) return NULL; return wq; } static void vioif_workq_destroy(struct workqueue *wq) { workqueue_destroy(wq); } static void vioif_work_set(struct vioif_work *work, void (*func)(void *), void *arg) { memset(work, 0, sizeof(*work)); work->func = func; work->arg = arg; } static void vioif_work_add(struct workqueue *wq, struct vioif_work *work) { if (atomic_load_relaxed(&work->added) != 0) return; atomic_store_relaxed(&work->added, 1); kpreempt_disable(); workqueue_enqueue(wq, &work->cookie, NULL); kpreempt_enable(); } static void vioif_work_wait(struct workqueue *wq, struct vioif_work *work) { workqueue_wait(wq, &work->cookie); } MODULE(MODULE_CLASS_DRIVER, if_vioif, "virtio"); #ifdef _MODULE #include "ioconf.c" #endif static int if_vioif_modcmd(modcmd_t cmd, void *opaque) { int error = 0; #ifdef _MODULE switch (cmd) { case MODULE_CMD_INIT: error = config_init_component(cfdriver_ioconf_if_vioif, cfattach_ioconf_if_vioif, cfdata_ioconf_if_vioif); break; case MODULE_CMD_FINI: error = config_fini_component(cfdriver_ioconf_if_vioif, cfattach_ioconf_if_vioif, cfdata_ioconf_if_vioif); break; default: error = ENOTTY; break; } #endif return error; }
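/*
 * Editorial note, not part of the NetBSD sources: the vioif rx/tx paths
 * above share one deferred-processing pattern.  The hardware interrupt
 * handler masks further virtqueue interrupts, dequeues at most a per-call
 * limit of buffers, and then either reschedules itself (softint or
 * workqueue, see vioif_net_sched_handle()) or re-arms the interrupt while
 * double-checking for work that raced in.  A minimal sketch of that
 * pattern follows; every example_* name is hypothetical and does not
 * exist in the driver.
 *
 *	static void
 *	example_handle(struct example_queue *q, unsigned int limit)
 *	{
 *		bool more = example_dequeue_upto(q, limit); // bounded work
 *
 *		if (more) {
 *			example_schedule(q);	// interrupts stay masked
 *			return;
 *		}
 *		if (example_unmask_intr(q) != 0) {	// raced with new work
 *			example_mask_intr(q);
 *			example_schedule(q);
 *			return;
 *		}
 *		q->running = false;	// idle until the next interrupt
 *	}
 */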
/* $NetBSD: tty_ptm.c,v 1.46 2023/04/09 09:18:09 riastradh Exp $ */ /*- * Copyright (c) 2004, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tty_ptm.c,v 1.46 2023/04/09 09:18:09 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_ptm.h" #endif /* pty multiplexor driver /dev/ptm{,x} */ #include <sys/param.h> #include <sys/systm.h> #include <sys/ioctl.h> #include <sys/proc.h> #include <sys/tty.h> #include <sys/stat.h> #include <sys/file.h> #include <sys/uio.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/signalvar.h> #include <sys/filedesc.h> #include <sys/conf.h> #include <sys/poll.h> #include <sys/pty.h> #include <sys/kauth.h> #include <sys/compat_stub.h> #include <miscfs/specfs/specdev.h> #include <compat/sys/ttycom.h> #include "ioconf.h" #ifdef DEBUG_PTM #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif #ifdef NO_DEV_PTM const struct cdevsw ptm_cdevsw = { .d_open = noopen, .d_close = noclose, .d_read = noread, .d_write = nowrite, .d_ioctl = noioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_TTY }; #else int pts_major, ptc_major; static dev_t pty_getfree(void); static int pty_alloc_master(struct lwp *, int *, dev_t *, struct mount *, int); static int pty_alloc_slave(struct lwp *, int *, dev_t, struct mount *); static int pty_vn_open(struct vnode *, struct lwp *); int pty_getmp(struct lwp *l, struct mount **mpp) { if (ptm == NULL) return EOPNOTSUPP; return (*ptm->getmp)(l, mpp); } dev_t pty_makedev(char ms, int minor) { return makedev(ms == 't' ? pts_major : ptc_major, minor); } static dev_t pty_getfree(void) { extern kmutex_t pt_softc_mutex; int i; mutex_enter(&pt_softc_mutex); for (i = 0; i < npty; i++) { if (pty_isfree(i, 0)) break; } mutex_exit(&pt_softc_mutex); return pty_makedev('t', i); } /* * Hacked up version of vn_open. We _only_ handle ptys and only open * them with FREAD|FWRITE and never deal with creat or stuff like that. * * We need it because we have to fake up root credentials to open the pty. */ int pty_vn_open(struct vnode *vp, struct lwp *l) { int error; if (vp->v_type != VCHR) { vput(vp); return EINVAL; } error = VOP_OPEN(vp, FREAD|FWRITE, lwp0.l_cred); if (error) { /* only ptys mean we can't get these */ KASSERT(error != EDUPFD); KASSERT(error != EMOVEFD); vput(vp); return error; } mutex_enter(vp->v_interlock); vp->v_writecount++; mutex_exit(vp->v_interlock); return 0; } static int pty_alloc_master(struct lwp *l, int *fd, dev_t *dev, struct mount *mp, int flags) { int error; struct file *fp; struct vnode *vp; int md; if ((error = fd_allocfile(&fp, fd)) != 0) { DPRINTF(("fd_allocfile %d\n", error)); return error; } retry: /* Find and open a free master pty. */ *dev = pty_getfree(); md = minor(*dev); if ((error = pty_check(md)) != 0) { DPRINTF(("pty_check %d\n", error)); goto bad; } if (ptm == NULL) { DPRINTF(("no ptm\n")); error = EOPNOTSUPP; goto bad; } if ((error = (*ptm->allocvp)(mp, l, &vp, *dev, 'p')) != 0) { DPRINTF(("pty_allocvp %d\n", error)); goto bad; } if ((error = pty_vn_open(vp, l)) != 0) { DPRINTF(("pty_vn_open %d\n", error)); /* * Check if the master open failed because we lost * the race to grab it. 
*/ if (error != EIO) goto bad; error = !pty_isfree(md, 1); DPRINTF(("pty_isfree %d\n", error)); if (error) goto retry; else goto bad; } fp->f_flag = FREAD|FWRITE|(flags&FMASK); fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_vnode = vp; VOP_UNLOCK(vp); fd_set_exclose(l, *fd, (flags & O_CLOEXEC) != 0); fd_affix(curproc, fp, *fd); return 0; bad: fd_abort(curproc, fp, *fd); return error; } int pty_grant_slave(struct lwp *l, dev_t dev, struct mount *mp) { int error; struct vnode *vp; /* * Open the slave. * namei -> setattr -> unlock -> revoke -> vrele -> * namei -> open -> unlock * Three stage rocket: * 1. Change the owner and permissions on the slave. * 2. Revoke all the users of the slave. * 3. open the slave. */ if (ptm == NULL) return EOPNOTSUPP; if ((error = (*ptm->allocvp)(mp, l, &vp, dev, 't')) != 0) return error; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { struct vattr vattr; (*ptm->getvattr)(mp, l, &vattr); /* Do the VOP_SETATTR() as root. */ error = VOP_SETATTR(vp, &vattr, lwp0.l_cred); if (error) { DPRINTF(("setattr %d\n", error)); vput(vp); return error; } } VOP_UNLOCK(vp); VOP_REVOKE(vp, REVOKEALL); /* * The vnode is useless after the revoke, we need to get it again. */ vrele(vp); return 0; } static int pty_alloc_slave(struct lwp *l, int *fd, dev_t dev, struct mount *mp) { int error; struct file *fp; struct vnode *vp; /* Grab a filedescriptor for the slave */ if ((error = fd_allocfile(&fp, fd)) != 0) { DPRINTF(("fd_allocfile %d\n", error)); return error; } if (ptm == NULL) { error = EOPNOTSUPP; goto bad; } if ((error = (*ptm->allocvp)(mp, l, &vp, dev, 't')) != 0) goto bad; if ((error = pty_vn_open(vp, l)) != 0) goto bad; fp->f_flag = FREAD|FWRITE; fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_vnode = vp; VOP_UNLOCK(vp); fd_affix(curproc, fp, *fd); return 0; bad: fd_abort(curproc, fp, *fd); return error; } struct ptm_pty * pty_sethandler(struct ptm_pty *nptm) { struct ptm_pty *optm = ptm; ptm = nptm; return optm; } int pty_fill_ptmget(struct lwp *l, dev_t dev, int cfd, int sfd, void *data, struct mount *mp) { struct ptmget *ptmg = data; int error; if (ptm == NULL) return EOPNOTSUPP; ptmg->cfd = cfd == -1 ? minor(dev) : cfd; ptmg->sfd = sfd == -1 ? minor(dev) : sfd; error = (*ptm->makename)(mp, l, ptmg->cn, sizeof(ptmg->cn), dev, 'p'); if (error) return error; return (*ptm->makename)(mp, l, ptmg->sn, sizeof(ptmg->sn), dev, 't'); } void /*ARGSUSED*/ ptmattach(int n) { extern const struct cdevsw pts_cdevsw, ptc_cdevsw; /* find the major and minor of the pty devices */ if ((pts_major = cdevsw_lookup_major(&pts_cdevsw)) == -1) panic("%s: Can't find pty slave in cdevsw", __func__); if ((ptc_major = cdevsw_lookup_major(&ptc_cdevsw)) == -1) panic("%s: Can't find pty master in cdevsw", __func__); #ifdef COMPAT_BSDPTY ptm = &ptm_bsdpty; #endif } static int /*ARGSUSED*/ ptmopen(dev_t dev, int flag, int mode, struct lwp *l) { int error; int fd; dev_t ttydev; struct mount *mp; switch(minor(dev)) { case 0: /* /dev/ptmx */ case 2: /* /emul/linux/dev/ptmx */ if ((error = pty_getmp(l, &mp)) != 0) return error; if ((error = pty_alloc_master(l, &fd, &ttydev, mp, flag)) != 0) return error; if (minor(dev) == 2) { /* * Linux ptyfs grants the pty right here. * Handle this case here, instead of writing * a new linux module. 
*/ if ((error = pty_grant_slave(l, ttydev, mp)) != 0) { file_t *fp = fd_getfile(fd); if (fp != NULL) { fd_close(fd); } return error; } } curlwp->l_dupfd = fd; return EMOVEFD; case 1: /* /dev/ptm */ return 0; default: return ENODEV; } } static int /*ARGSUSED*/ ptmclose(dev_t dev, int flag, int mode, struct lwp *l) { return (0); } static int /*ARGSUSED*/ ptmioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { int error; dev_t newdev; int cfd, sfd; file_t *fp; struct mount *mp; error = 0; switch (cmd) { case TIOCPTMGET: if ((error = pty_getmp(l, &mp)) != 0) return error; if ((error = pty_alloc_master(l, &cfd, &newdev, mp, 0)) != 0) return error; if ((error = pty_grant_slave(l, newdev, mp)) != 0) goto bad; if ((error = pty_alloc_slave(l, &sfd, newdev, mp)) != 0) goto bad; /* now, put the indices and names into struct ptmget */ if ((error = pty_fill_ptmget(l, newdev, cfd, sfd, data, mp)) != 0) goto bad2; return 0; default: MODULE_HOOK_CALL(tty_ptmioctl_60_hook, (dev, cmd, data, flag, l), EPASSTHROUGH, error); if (error != EPASSTHROUGH) return error; DPRINTF(("ptmioctl EINVAL\n")); return EINVAL; } bad2: fp = fd_getfile(sfd); if (fp != NULL) { fd_close(sfd); } bad: fp = fd_getfile(cfd); if (fp != NULL) { fd_close(cfd); } return error; } const struct cdevsw ptm_cdevsw = { .d_open = ptmopen, .d_close = ptmclose, .d_read = noread, .d_write = nowrite, .d_ioctl = ptmioctl, .d_stop = nullstop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_TTY }; #endif
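/*
 * Editorial note, not part of the NetBSD sources: a userland consumer of
 * the /dev/ptm multiplexor above typically opens the device and issues
 * TIOCPTMGET, which returns master/slave descriptors and their path names
 * in struct ptmget (see ptmioctl() and pty_fill_ptmget()).  The sketch
 * below is illustrative only; error handling is abbreviated, and the
 * header list assumes TIOCPTMGET and struct ptmget are visible via
 * <sys/ioctl.h> (they live in the tty ioctl headers).
 *
 *	#include <sys/ioctl.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int
 *	example_openpty(struct ptmget *pm)
 *	{
 *		int fd = open("/dev/ptm", O_RDWR);
 *
 *		if (fd == -1)
 *			return -1;
 *		if (ioctl(fd, TIOCPTMGET, pm) == -1) {
 *			close(fd);
 *			return -1;
 *		}
 *		close(fd);	// pm->cfd and pm->sfd remain open
 *		return 0;	// pm->cn/pm->sn hold the device names
 *	}
 */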
/* $NetBSD: kern_pmf.c,v 1.51 2022/08/24 11:41:39 riastradh Exp $ */ /*- * Copyright (c) 2007 Jared D. McNeill <jmcneill@invisible.ca> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_pmf.c,v 1.51 2022/08/24 11:41:39 riastradh Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/kmem.h> #include <sys/buf.h> #include <sys/callout.h> #include <sys/kernel.h> #include <sys/device.h> #include <sys/device_impl.h> #include <sys/pmf.h> #include <sys/queue.h> #include <sys/sched.h> #include <sys/workqueue.h> #include <prop/proplib.h> #include <sys/condvar.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/reboot.h> /* for RB_NOSYNC */ #include <sys/sched.h> #include <sys/sysctl.h> #include <sys/vfs_syscalls.h> /* XXX ugly special case, but for now the only client */ #include "wsdisplay.h" #if NWSDISPLAY > 0 #include <dev/wscons/wsdisplayvar.h> #endif #define PMF_DEBUG #ifdef PMF_DEBUG int pmf_debug_event; int pmf_debug_suspend; int pmf_debug_suspensor; int pmf_debug_idle; int pmf_debug_transition; #define PMF_SUSPENSOR_PRINTF(x) if (pmf_debug_suspensor) printf x #define PMF_SUSPEND_PRINTF(x) if (pmf_debug_suspend) printf x #define PMF_EVENT_PRINTF(x) if (pmf_debug_event) printf x #define PMF_IDLE_PRINTF(x) if (pmf_debug_idle) printf x #define PMF_TRANSITION_PRINTF(x) if (pmf_debug_transition) printf x #define PMF_TRANSITION_PRINTF2(y,x) if (pmf_debug_transition>y) printf x #else #define PMF_SUSPENSOR_PRINTF(x) do { } while (0) #define PMF_SUSPEND_PRINTF(x) do { } while (0) #define PMF_EVENT_PRINTF(x) do { } while (0) #define PMF_IDLE_PRINTF(x) do { } while (0) #define PMF_TRANSITION_PRINTF(x) do { } while (0) #define PMF_TRANSITION_PRINTF2(y,x) do { } while (0) #endif static prop_dictionary_t pmf_platform = NULL; static struct workqueue *pmf_event_workqueue; static struct workqueue *pmf_suspend_workqueue; typedef struct pmf_event_handler { TAILQ_ENTRY(pmf_event_handler) pmf_link; pmf_generic_event_t pmf_event; void (*pmf_handler)(device_t); device_t pmf_device; bool pmf_global; } pmf_event_handler_t; static TAILQ_HEAD(, pmf_event_handler) pmf_all_events = TAILQ_HEAD_INITIALIZER(pmf_all_events); typedef struct pmf_event_workitem { struct work pew_work; pmf_generic_event_t pew_event; device_t pew_device; } pmf_event_workitem_t; typedef struct pmf_suspend_workitem { struct work psw_work; device_t psw_dev; pmf_qual_t psw_qual; } pmf_suspend_workitem_t; static struct pool pew_pl; static pmf_event_workitem_t *pmf_event_workitem_get(void); static void pmf_event_workitem_put(pmf_event_workitem_t *); static bool pmf_device_resume_locked(device_t, const pmf_qual_t *); static bool pmf_device_suspend_locked(device_t, const pmf_qual_t *); static bool device_pmf_any_suspensor(device_t, devact_level_t); static bool complete_suspension(device_t dev, const device_suspensor_t **susp, const pmf_qual_t *pqp) { int i; pmf_qual_t pq; const device_suspensor_t *ds; ds = pmf_qual_suspension(pqp); KASSERT(ds->ds_delegator != NULL); pq = *pqp; pq.pq_suspensor = ds->ds_delegator; for (i = 0; i < DEVICE_SUSPENSORS_MAX; i++) { if (susp[i] != ds) continue; if (!pmf_device_suspend(dev, &pq)) return false; } return true; } static void pmf_suspend_worker(struct work *wk, void *dummy) { pmf_suspend_workitem_t *psw; deviter_t di; device_t dev; psw = (void *)wk; KASSERT(wk == &psw->psw_work); KASSERT(psw != NULL); for (dev = deviter_first(&di, 0); dev != NULL; dev = deviter_next(&di)) { if (dev == psw->psw_dev && device_pmf_lock(dev)) break; } deviter_release(&di); if (dev == NULL) return; switch (pmf_qual_depth(&psw->psw_qual)) { case DEVACT_LEVEL_FULL: if (!complete_suspension(dev, dev->dv_class_suspensors, &psw->psw_qual)) 
break; /*FALLTHROUGH*/ case DEVACT_LEVEL_DRIVER: if (!complete_suspension(dev, dev->dv_driver_suspensors, &psw->psw_qual)) break; /*FALLTHROUGH*/ case DEVACT_LEVEL_BUS: if (!complete_suspension(dev, dev->dv_bus_suspensors, &psw->psw_qual)) break; } device_pmf_unlock(dev); kmem_free(psw, sizeof(*psw)); } static void pmf_event_worker(struct work *wk, void *dummy) { pmf_event_workitem_t *pew; pmf_event_handler_t *event; pew = (void *)wk; KASSERT(wk == &pew->pew_work); KASSERT(pew != NULL); TAILQ_FOREACH(event, &pmf_all_events, pmf_link) { if (event->pmf_event != pew->pew_event) continue; if (event->pmf_device == pew->pew_device || event->pmf_global) (*event->pmf_handler)(event->pmf_device); } pmf_event_workitem_put(pew); } static bool pmf_check_system_drivers(void) { device_t curdev; bool unsupported_devs; deviter_t di; unsupported_devs = false; for (curdev = deviter_first(&di, 0); curdev != NULL; curdev = deviter_next(&di)) { if (device_pmf_is_registered(curdev)) continue; if (!unsupported_devs) printf("Devices without power management support:"); printf(" %s", device_xname(curdev)); unsupported_devs = true; } deviter_release(&di); if (unsupported_devs) { printf("\n"); return false; } return true; } bool pmf_system_bus_resume(const pmf_qual_t *qual) { bool rv; device_t curdev; deviter_t di; aprint_debug("Powering devices:"); /* D0 handlers are run in order */ rv = true; for (curdev = deviter_first(&di, DEVITER_F_ROOT_FIRST); curdev != NULL; curdev = deviter_next(&di)) { if (!device_pmf_is_registered(curdev)) continue; if (device_is_active(curdev) || !device_is_enabled(curdev)) continue; aprint_debug(" %s", device_xname(curdev)); if (!device_pmf_bus_resume(curdev, qual)) { rv = false; aprint_debug("(failed)"); } } deviter_release(&di); aprint_debug("\n"); return rv; } bool pmf_system_resume(const pmf_qual_t *qual) { bool rv; device_t curdev, parent; deviter_t di; if (!pmf_check_system_drivers()) return false; aprint_debug("Resuming devices:"); /* D0 handlers are run in order */ rv = true; for (curdev = deviter_first(&di, DEVITER_F_ROOT_FIRST); curdev != NULL; curdev = deviter_next(&di)) { if (device_is_active(curdev) || !device_is_enabled(curdev)) continue; parent = device_parent(curdev); if (parent != NULL && !device_is_active(parent)) continue; aprint_debug(" %s", device_xname(curdev)); if (!pmf_device_resume(curdev, qual)) { rv = false; aprint_debug("(failed)"); } } deviter_release(&di); aprint_debug(".\n"); KERNEL_UNLOCK_ONE(0); #if NWSDISPLAY > 0 if (rv) wsdisplay_handlex(1); #endif return rv; } bool pmf_system_suspend(const pmf_qual_t *qual) { device_t curdev; deviter_t di; if (!pmf_check_system_drivers()) return false; #if NWSDISPLAY > 0 if (wsdisplay_handlex(0)) return false; #endif KERNEL_LOCK(1, NULL); /* * Flush buffers only if the shutdown didn't do so * already and if there was no panic. 
*/ if (doing_shutdown == 0 && panicstr == NULL) { printf("Flushing disk caches: "); do_sys_sync(&lwp0); if (vfs_syncwait() != 0) printf("giving up\n"); else printf("done\n"); } aprint_debug("Suspending devices:"); for (curdev = deviter_first(&di, DEVITER_F_LEAVES_FIRST); curdev != NULL; curdev = deviter_next(&di)) { if (!device_is_active(curdev)) continue; aprint_debug(" %s", device_xname(curdev)); /* XXX joerg check return value and abort suspend */ if (!pmf_device_suspend(curdev, qual)) aprint_debug("(failed)"); } deviter_release(&di); aprint_debug(".\n"); return true; } static bool shutdown_all(int how) { static struct shutdown_state s; device_t curdev; bool progress = false; KERNEL_LOCK(1, NULL); for (curdev = shutdown_first(&s); curdev != NULL; curdev = shutdown_next(&s)) { aprint_debug(" shutting down %s, ", device_xname(curdev)); if (!device_pmf_is_registered(curdev)) aprint_debug("skipped."); #if 0 /* needed? */ else if (!device_pmf_class_shutdown(curdev, how)) aprint_debug("failed."); #endif else if (!device_pmf_driver_shutdown(curdev, how)) aprint_debug("failed."); else if (!device_pmf_bus_shutdown(curdev, how)) aprint_debug("failed."); else { progress = true; aprint_debug("success."); } } KERNEL_UNLOCK_ONE(NULL); return progress; } void pmf_system_shutdown(int how) { if (panicstr != NULL) return; aprint_debug("Shutting down devices:"); shutdown_all(how); } bool pmf_set_platform(const char *key, const char *value) { if (pmf_platform == NULL) pmf_platform = prop_dictionary_create(); if (pmf_platform == NULL) return false; return prop_dictionary_set_string(pmf_platform, key, value); } const char * pmf_get_platform(const char *key) { const char *value; if (pmf_platform == NULL) return NULL; if (!prop_dictionary_get_string(pmf_platform, key, &value)) return NULL; return value; } bool pmf_device_register1(device_t dev, bool (*suspend)(device_t, const pmf_qual_t *), bool (*resume)(device_t, const pmf_qual_t *), bool (*shutdown)(device_t, int)) { device_pmf_driver_register(dev, suspend, resume, shutdown); device_pmf_driver_child_register(dev); return true; } void pmf_device_deregister(device_t dev) { device_pmf_class_deregister(dev); device_pmf_bus_deregister(dev); device_pmf_driver_deregister(dev); } static const device_suspensor_t _device_suspensor_drvctl = { .ds_delegator = NULL, .ds_name = "drvctl", }; static const device_suspensor_t _device_suspensor_self = { .ds_delegator = NULL, .ds_name = "self", }; #if 0 static const device_suspensor_t _device_suspensor_self_delegate = { .ds_delegator = &_device_suspensor_self, .ds_name = "self delegate", }; #endif static const device_suspensor_t _device_suspensor_system = { .ds_delegator = NULL, .ds_name = "system", }; const device_suspensor_t * const device_suspensor_self = &_device_suspensor_self, #if 0 * const device_suspensor_self_delegate = &_device_suspensor_self_delegate, #endif * const device_suspensor_system = &_device_suspensor_system, * const device_suspensor_drvctl = &_device_suspensor_drvctl; static const pmf_qual_t _pmf_qual_system = { .pq_actlvl = DEVACT_LEVEL_FULL, .pq_suspensor = &_device_suspensor_system, }; static const pmf_qual_t _pmf_qual_drvctl = { .pq_actlvl = DEVACT_LEVEL_FULL, .pq_suspensor = &_device_suspensor_drvctl, }; static const pmf_qual_t _pmf_qual_self = { .pq_actlvl = DEVACT_LEVEL_DRIVER, .pq_suspensor = &_device_suspensor_self, }; const pmf_qual_t * const PMF_Q_DRVCTL = &_pmf_qual_drvctl, * const PMF_Q_NONE = &_pmf_qual_system, * const PMF_Q_SELF = &_pmf_qual_self; static bool 
device_suspensor_delegates_to(const device_suspensor_t *ds, const device_suspensor_t *delegate) { const device_suspensor_t *iter; for (iter = delegate->ds_delegator; iter != NULL; iter = iter->ds_delegator) { if (ds == iter) return true; } return false; } static bool add_suspensor(device_t dev, const char *kind, const device_suspensor_t **susp, const device_suspensor_t *ds) { int i; for (i = 0; i < DEVICE_SUSPENSORS_MAX; i++) { if (susp[i] == NULL) continue; if (ds == susp[i]) { PMF_SUSPENSOR_PRINTF(( "%s: %s-suspended by %s (delegator %s) already\n", device_xname(dev), kind, susp[i]->ds_name, (susp[i]->ds_delegator != NULL) ? susp[i]->ds_delegator->ds_name : "<none>")); return true; } if (device_suspensor_delegates_to(ds, susp[i])) { PMF_SUSPENSOR_PRINTF(( "%s: %s assumes %s-suspension by %s " "(delegator %s)\n", device_xname(dev), ds->ds_name, kind, susp[i]->ds_name, (susp[i]->ds_delegator != NULL) ? susp[i]->ds_delegator->ds_name : "<none>")); susp[i] = ds; return true; } } for (i = 0; i < DEVICE_SUSPENSORS_MAX; i++) { if (susp[i] == NULL) { susp[i] = ds; PMF_SUSPENSOR_PRINTF(( "%s: newly %s-suspended by %s (delegator %s)\n", device_xname(dev), kind, susp[i]->ds_name, (susp[i]->ds_delegator != NULL) ? susp[i]->ds_delegator->ds_name : "<none>")); return true; } } return false; } static bool device_pmf_add_suspensor(device_t dev, const pmf_qual_t *pq) { const device_suspensor_t *ds; KASSERT(pq != NULL); ds = pmf_qual_suspension(pq); KASSERT(ds != NULL); if (!add_suspensor(dev, "class", dev->dv_class_suspensors, ds)) return false; if (!add_suspensor(dev, "driver", dev->dv_driver_suspensors, ds)) return false; if (!add_suspensor(dev, "bus", dev->dv_bus_suspensors, ds)) return false; return true; } #if 0 static bool device_pmf_has_suspension(device_t dev, const device_suspensor_t *ds) { int i; for (i = 0; i < DEVICE_SUSPENSORS_MAX; i++) { if (dev->dv_suspensions[i] == ds) return true; if (device_suspensor_delegates_to(dev->dv_suspensions[i], ds)) return true; } return false; } #endif static bool any_suspensor(device_t dev, const char *kind, const device_suspensor_t **susp) { int i; bool suspended = false; for (i = 0; i < DEVICE_SUSPENSORS_MAX; i++) { if (susp[i] != NULL) { PMF_SUSPENSOR_PRINTF(("%s: %s is suspended by %s " "(delegator %s)\n", device_xname(dev), kind, susp[i]->ds_name, (susp[i]->ds_delegator != NULL) ? susp[i]->ds_delegator->ds_name : "<none>")); suspended = true; } } return suspended; } static bool device_pmf_any_suspensor(device_t dev, devact_level_t depth) { switch (depth) { case DEVACT_LEVEL_FULL: if (any_suspensor(dev, "class", dev->dv_class_suspensors)) return true; /*FALLTHROUGH*/ case DEVACT_LEVEL_DRIVER: if (any_suspensor(dev, "driver", dev->dv_driver_suspensors)) return true; /*FALLTHROUGH*/ case DEVACT_LEVEL_BUS: if (any_suspensor(dev, "bus", dev->dv_bus_suspensors)) return true; } return false; } static bool remove_suspensor(device_t dev, const char *kind, const device_suspensor_t **susp, const device_suspensor_t *ds) { int i; for (i = 0; i < DEVICE_SUSPENSORS_MAX; i++) { if (susp[i] == NULL) continue; if (ds == susp[i] || device_suspensor_delegates_to(ds, susp[i])) { PMF_SUSPENSOR_PRINTF(("%s: %s suspension %s " "(delegator %s) removed by %s\n", device_xname(dev), kind, susp[i]->ds_name, (susp[i]->ds_delegator != NULL) ? 
susp[i]->ds_delegator->ds_name : "<none>", ds->ds_name)); susp[i] = NULL; return true; } } return false; } static bool device_pmf_remove_suspensor(device_t dev, const pmf_qual_t *pq) { const device_suspensor_t *ds; KASSERT(pq != NULL); ds = pmf_qual_suspension(pq); KASSERT(ds != NULL); if (!remove_suspensor(dev, "class", dev->dv_class_suspensors, ds)) return false; if (!remove_suspensor(dev, "driver", dev->dv_driver_suspensors, ds)) return false; if (!remove_suspensor(dev, "bus", dev->dv_bus_suspensors, ds)) return false; return true; } void pmf_self_suspensor_init(device_t dev, device_suspensor_t *ds, pmf_qual_t *pq) { ds->ds_delegator = device_suspensor_self; snprintf(ds->ds_name, sizeof(ds->ds_name), "%s-self", device_xname(dev)); pq->pq_actlvl = DEVACT_LEVEL_DRIVER; pq->pq_suspensor = ds; } bool pmf_device_suspend(device_t dev, const pmf_qual_t *qual) { bool rc; PMF_TRANSITION_PRINTF(("%s: suspend enter\n", device_xname(dev))); if (!device_pmf_is_registered(dev)) return false; if (!device_pmf_lock(dev)) return false; rc = pmf_device_suspend_locked(dev, qual); device_pmf_unlock(dev); PMF_TRANSITION_PRINTF(("%s: suspend exit\n", device_xname(dev))); return rc; } bool pmf_device_suspend_locked(device_t dev, const pmf_qual_t *qual) { if (!device_pmf_add_suspensor(dev, qual)) return false; PMF_TRANSITION_PRINTF2(1, ("%s: class suspend\n", device_xname(dev))); if (!device_pmf_class_suspend(dev, qual)) return false; PMF_TRANSITION_PRINTF2(1, ("%s: driver suspend\n", device_xname(dev))); if (!device_pmf_driver_suspend(dev, qual)) return false; PMF_TRANSITION_PRINTF2(1, ("%s: bus suspend\n", device_xname(dev))); if (!device_pmf_bus_suspend(dev, qual)) return false; return true; } bool pmf_device_resume(device_t dev, const pmf_qual_t *qual) { bool rc; PMF_TRANSITION_PRINTF(("%s: resume enter\n", device_xname(dev))); if (!device_pmf_is_registered(dev)) return false; if (!device_pmf_lock(dev)) return false; rc = pmf_device_resume_locked(dev, qual); device_pmf_unlock(dev); PMF_TRANSITION_PRINTF(("%s: resume exit\n", device_xname(dev))); return rc; } bool pmf_device_resume_locked(device_t dev, const pmf_qual_t *qual) { device_pmf_remove_suspensor(dev, qual); if (device_pmf_any_suspensor(dev, DEVACT_LEVEL_FULL)) return true; PMF_TRANSITION_PRINTF2(1, ("%s: bus resume\n", device_xname(dev))); if (!device_pmf_bus_resume(dev, qual)) return false; PMF_TRANSITION_PRINTF2(1, ("%s: driver resume\n", device_xname(dev))); if (!device_pmf_driver_resume(dev, qual)) return false; PMF_TRANSITION_PRINTF2(1, ("%s: class resume\n", device_xname(dev))); if (!device_pmf_class_resume(dev, qual)) return false; return true; } bool pmf_device_recursive_suspend(device_t dv, const pmf_qual_t *qual) { bool rv = true; device_t curdev; deviter_t di; pmf_qual_t pq; pmf_qual_recursive_copy(&pq, qual); for (curdev = deviter_first(&di, 0); curdev != NULL; curdev = deviter_next(&di)) { if (device_parent(curdev) != dv) continue; if (!pmf_device_recursive_suspend(curdev, &pq)) { rv = false; break; } } deviter_release(&di); return rv && pmf_device_suspend(dv, qual); } void pmf_qual_recursive_copy(pmf_qual_t *dst, const pmf_qual_t *src) { *dst = *src; dst->pq_actlvl = DEVACT_LEVEL_FULL; } bool pmf_device_recursive_resume(device_t dv, const pmf_qual_t *qual) { device_t parent; pmf_qual_t pq; if (device_is_active(dv)) return true; pmf_qual_recursive_copy(&pq, qual); parent = device_parent(dv); if (parent != NULL) { if (!pmf_device_recursive_resume(parent, &pq)) return false; } return pmf_device_resume(dv, qual); } bool 
pmf_device_descendants_release(device_t dv, const pmf_qual_t *qual) { bool rv = true; device_t curdev; deviter_t di; for (curdev = deviter_first(&di, 0); curdev != NULL; curdev = deviter_next(&di)) { if (device_parent(curdev) != dv) continue; device_pmf_remove_suspensor(curdev, qual); if (!pmf_device_descendants_release(curdev, qual)) { rv = false; break; } } deviter_release(&di); return rv; } bool pmf_device_descendants_resume(device_t dv, const pmf_qual_t *qual) { bool rv = true; device_t curdev; deviter_t di; KASSERT(pmf_qual_descend_ok(qual)); for (curdev = deviter_first(&di, 0); curdev != NULL; curdev = deviter_next(&di)) { if (device_parent(curdev) != dv) continue; if (!pmf_device_resume(curdev, qual) || !pmf_device_descendants_resume(curdev, qual)) { rv = false; break; } } deviter_release(&di); return rv; } bool pmf_device_subtree_release(device_t dv, const pmf_qual_t *qual) { pmf_qual_t pq; device_pmf_remove_suspensor(dv, qual); pmf_qual_recursive_copy(&pq, qual); return pmf_device_descendants_release(dv, &pq); } bool pmf_device_subtree_resume(device_t dv, const pmf_qual_t *qual) { pmf_qual_t pq; if (!pmf_device_subtree_release(dv, qual)) return false; if (!pmf_device_recursive_resume(dv, qual)) return false; pmf_qual_recursive_copy(&pq, qual); return pmf_device_descendants_resume(dv, &pq); } #include <net/if.h> static bool pmf_class_network_suspend(device_t dev, const pmf_qual_t *qual) { struct ifnet *ifp = device_pmf_class_private(dev); int s; s = splnet(); IFNET_LOCK(ifp); (*ifp->if_stop)(ifp, 0); IFNET_UNLOCK(ifp); splx(s); return true; } static bool pmf_class_network_resume(device_t dev, const pmf_qual_t *qual) { struct ifnet *ifp = device_pmf_class_private(dev); int s; bool restart = false; s = splnet(); IFNET_LOCK(ifp); if (ifp->if_flags & IFF_UP) { ifp->if_flags &= ~IFF_RUNNING; if ((*ifp->if_init)(ifp) != 0) aprint_normal_ifnet(ifp, "resume failed\n"); restart = true; } IFNET_UNLOCK(ifp); if (restart) if_start_lock(ifp); splx(s); return true; } void pmf_class_network_register(device_t dev, struct ifnet *ifp) { device_pmf_class_register(dev, ifp, pmf_class_network_suspend, pmf_class_network_resume, NULL); } bool pmf_event_inject(device_t dv, pmf_generic_event_t ev) { pmf_event_workitem_t *pew; pew = pmf_event_workitem_get(); if (pew == NULL) { PMF_EVENT_PRINTF(("%s: PMF event %d dropped (no memory)\n", dv ? device_xname(dv) : "<anonymous>", ev)); return false; } pew->pew_event = ev; pew->pew_device = dv; workqueue_enqueue(pmf_event_workqueue, &pew->pew_work, NULL); PMF_EVENT_PRINTF(("%s: PMF event %d injected\n", dv ? 
device_xname(dv) : "<anonymous>", ev)); return true; } bool pmf_event_register(device_t dv, pmf_generic_event_t ev, void (*handler)(device_t), bool global) { pmf_event_handler_t *event; event = kmem_alloc(sizeof(*event), KM_SLEEP); event->pmf_event = ev; event->pmf_handler = handler; event->pmf_device = dv; event->pmf_global = global; TAILQ_INSERT_TAIL(&pmf_all_events, event, pmf_link); return true; } void pmf_event_deregister(device_t dv, pmf_generic_event_t ev, void (*handler)(device_t), bool global) { pmf_event_handler_t *event; TAILQ_FOREACH(event, &pmf_all_events, pmf_link) { if (event->pmf_event != ev) continue; if (event->pmf_device != dv) continue; if (event->pmf_global != global) continue; if (event->pmf_handler != handler) continue; TAILQ_REMOVE(&pmf_all_events, event, pmf_link); kmem_free(event, sizeof(*event)); return; } } struct display_class_softc { TAILQ_ENTRY(display_class_softc) dc_link; device_t dc_dev; }; static TAILQ_HEAD(, display_class_softc) all_displays; static callout_t global_idle_counter; static int idle_timeout = 30; static void input_idle(void *dummy) { PMF_IDLE_PRINTF(("Input idle handler called\n")); pmf_event_inject(NULL, PMFE_DISPLAY_OFF); } static void input_activity_handler(device_t dv, devactive_t type) { if (!TAILQ_EMPTY(&all_displays)) callout_schedule(&global_idle_counter, idle_timeout * hz); } static void pmf_class_input_deregister(device_t dv) { device_active_deregister(dv, input_activity_handler); } bool pmf_class_input_register(device_t dv) { if (!device_active_register(dv, input_activity_handler)) return false; device_pmf_class_register(dv, NULL, NULL, NULL, pmf_class_input_deregister); return true; } static void pmf_class_display_deregister(device_t dv) { struct display_class_softc *sc = device_pmf_class_private(dv); int s; s = splsoftclock(); TAILQ_REMOVE(&all_displays, sc, dc_link); if (TAILQ_EMPTY(&all_displays)) callout_stop(&global_idle_counter); splx(s); kmem_free(sc, sizeof(*sc)); } bool pmf_class_display_register(device_t dv) { struct display_class_softc *sc; int s; sc = kmem_alloc(sizeof(*sc), KM_SLEEP); s = splsoftclock(); if (TAILQ_EMPTY(&all_displays)) callout_schedule(&global_idle_counter, idle_timeout * hz); TAILQ_INSERT_HEAD(&all_displays, sc, dc_link); splx(s); device_pmf_class_register(dv, sc, NULL, NULL, pmf_class_display_deregister); return true; } static void pmf_event_workitem_put(pmf_event_workitem_t *pew) { KASSERT(pew != NULL); pool_put(&pew_pl, pew); } static pmf_event_workitem_t * pmf_event_workitem_get(void) { return pool_get(&pew_pl, PR_NOWAIT); } SYSCTL_SETUP(sysctl_pmf_setup, "PMF subtree setup") { const struct sysctlnode *node = NULL; sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "pmf", SYSCTL_DESCR("pmf controls"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); #ifdef PMF_DEBUG sysctl_createv(clog, 0, &node, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "debug", SYSCTL_DESCR("debug levels"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "event", SYSCTL_DESCR("event"), NULL, 0, &pmf_debug_event, sizeof(pmf_debug_event), CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "suspend", SYSCTL_DESCR("suspend"), NULL, 0, &pmf_debug_suspend, sizeof(pmf_debug_suspend), CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "suspensor", SYSCTL_DESCR("suspensor"), NULL, 0, &pmf_debug_suspensor, 
sizeof(pmf_debug_suspensor), CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "idle", SYSCTL_DESCR("idle"), NULL, 0, &pmf_debug_idle, sizeof(pmf_debug_idle), CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "transition", SYSCTL_DESCR("transition"), NULL, 0, &pmf_debug_transition, sizeof(pmf_debug_transition), CTL_CREATE, CTL_EOL); #endif } void pmf_init(void) { int err; pool_init(&pew_pl, sizeof(pmf_event_workitem_t), 0, 0, 0, "pewpl", NULL, IPL_HIGH); pool_setlowat(&pew_pl, 1); pool_sethiwat(&pew_pl, 8); KASSERT(pmf_event_workqueue == NULL); err = workqueue_create(&pmf_event_workqueue, "pmfevent", pmf_event_worker, NULL, PRI_NONE, IPL_VM, 0); if (err) panic("couldn't create pmfevent workqueue"); KASSERT(pmf_suspend_workqueue == NULL); err = workqueue_create(&pmf_suspend_workqueue, "pmfsuspend", pmf_suspend_worker, NULL, PRI_NONE, IPL_VM, 0); if (err) panic("couldn't create pmfsuspend workqueue"); callout_init(&global_idle_counter, 0); callout_setfunc(&global_idle_counter, input_idle, NULL); }
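/*
 * Illustrative sketch (not part of kern_pmf.c): one way a driver might
 * combine pmf_self_suspensor_init() with pmf_device_suspend() and
 * pmf_device_resume(), and hook the PMFE_DISPLAY_OFF event injected by
 * input_idle() above.  The example_* names are hypothetical.
 */
static void
example_display_off_handler(device_t dv)
{
	/* Called from the PMF event worker when PMFE_DISPLAY_OFF fires. */
	aprint_debug_dev(dv, "display idle\n");
}

static void
example_pmf_self_suspend(device_t dv)
{
	static device_suspensor_t example_suspensor;
	static pmf_qual_t example_qual;

	/* Build a driver-level suspensor/qualifier named "<dev>-self". */
	pmf_self_suspensor_init(dv, &example_suspensor, &example_qual);

	/* Register a global handler for display-off events. */
	pmf_event_register(dv, PMFE_DISPLAY_OFF, example_display_off_handler, true);

	/* Suspend on our own behalf; resume later with the same qualifier. */
	if (pmf_device_suspend(dv, &example_qual))
		(void)pmf_device_resume(dv, &example_qual);
}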
/* $NetBSD: subr_xcall.c,v 1.38 2024/03/01 04:32:38 mrg Exp $ */ /*- * Copyright (c) 2007-2010, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran and Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Cross call support * * Background * * Sometimes it is necessary to modify hardware state that is tied * directly to individual CPUs (such as a CPU's local timer), and * these updates can not be done remotely by another CPU. The LWP * requesting the update may be unable to guarantee that it will be * running on the CPU where the update must occur, when the update * occurs. * * Additionally, it's sometimes necessary to modify per-CPU software * state from a remote CPU. Where these update operations are so * rare or the access to the per-CPU data so frequent that the cost * of using locking or atomic operations to provide coherency is * prohibitive, another way must be found. * * Cross calls help to solve these types of problem by allowing * any LWP in the system to request that an arbitrary function be * executed on a specific CPU. * * Implementation * * A slow mechanism for making low priority cross calls is * provided. The function to be executed runs on the remote CPU * within a bound kthread. No queueing is provided, and the * implementation uses global state. The function being called may * block briefly on locks, but in doing so must be careful to not * interfere with other cross calls in the system. The function is * called with thread context and not from a soft interrupt, so it * can ensure that it is not interrupting other code running on the * CPU, and so has exclusive access to the CPU. Since this facility * is heavyweight, it's expected that it will not be used often. * * Cross calls must not allocate memory, as the pagedaemon uses cross * calls (and memory allocation may need to wait on the pagedaemon). * * A low-overhead mechanism for high priority calls (XC_HIGHPRI) is * also provided. The function to be executed runs in software * interrupt context at IPL_SOFTSERIAL level, and is expected to * be very lightweight, e.g. avoid blocking. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_xcall.c,v 1.38 2024/03/01 04:32:38 mrg Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/xcall.h> #include <sys/mutex.h> #include <sys/condvar.h> #include <sys/evcnt.h> #include <sys/kthread.h> #include <sys/cpu.h> #include <sys/atomic.h> #ifdef _RUMPKERNEL #include "rump_private.h" #endif /* Cross-call state box. */ typedef struct { kmutex_t xc_lock; kcondvar_t xc_busy; xcfunc_t xc_func; void * xc_arg1; void * xc_arg2; uint64_t xc_headp; uint64_t xc_donep; unsigned int xc_ipl; } xc_state_t; /* Bit indicating high (1) or low (0) priority. */ #define XC_PRI_BIT (1ULL << 63) /* Low priority xcall structures. */ static xc_state_t xc_low_pri __cacheline_aligned; /* High priority xcall structures. */ static xc_state_t xc_high_pri __cacheline_aligned; static void * xc_sihs[4] __cacheline_aligned; /* Event counters. 
*/ static struct evcnt xc_unicast_ev __cacheline_aligned; static struct evcnt xc_broadcast_ev __cacheline_aligned; static void xc_init(void); static void xc_thread(void *); static inline uint64_t xc_highpri(xcfunc_t, void *, void *, struct cpu_info *, unsigned int); static inline uint64_t xc_lowpri(xcfunc_t, void *, void *, struct cpu_info *); /* The internal form of IPL */ #define XC_IPL_MASK 0xff00 /* * Assign 0 to XC_IPL_SOFTSERIAL to treat IPL_SOFTSERIAL as the default value * (just XC_HIGHPRI). */ #define XC_IPL_SOFTSERIAL 0 #define XC_IPL_SOFTNET 1 #define XC_IPL_SOFTBIO 2 #define XC_IPL_SOFTCLOCK 3 #define XC_IPL_MAX XC_IPL_SOFTCLOCK CTASSERT(XC_IPL_MAX <= __arraycount(xc_sihs)); /* * xc_init: * * Initialize low and high priority cross-call structures. */ static void xc_init(void) { xc_state_t *xclo = &xc_low_pri, *xchi = &xc_high_pri; memset(xclo, 0, sizeof(xc_state_t)); mutex_init(&xclo->xc_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&xclo->xc_busy, "xclow"); memset(xchi, 0, sizeof(xc_state_t)); mutex_init(&xchi->xc_lock, MUTEX_DEFAULT, IPL_SOFTSERIAL); cv_init(&xchi->xc_busy, "xchigh"); /* Set up a softint for each IPL_SOFT*. */ #define SETUP_SOFTINT(xipl, sipl) do { \ xc_sihs[(xipl)] = softint_establish((sipl) | SOFTINT_MPSAFE,\ xc__highpri_intr, NULL); \ KASSERT(xc_sihs[(xipl)] != NULL); \ } while (0) SETUP_SOFTINT(XC_IPL_SOFTSERIAL, SOFTINT_SERIAL); /* * If an IPL_SOFTXXX has the same value as the previous one, we don't use * the IPL (see xc_encode_ipl), so we don't need to allocate a softint * for it. */ #if IPL_SOFTNET != IPL_SOFTSERIAL SETUP_SOFTINT(XC_IPL_SOFTNET, SOFTINT_NET); #endif #if IPL_SOFTBIO != IPL_SOFTNET SETUP_SOFTINT(XC_IPL_SOFTBIO, SOFTINT_BIO); #endif #if IPL_SOFTCLOCK != IPL_SOFTBIO SETUP_SOFTINT(XC_IPL_SOFTCLOCK, SOFTINT_CLOCK); #endif #undef SETUP_SOFTINT evcnt_attach_dynamic(&xc_unicast_ev, EVCNT_TYPE_MISC, NULL, "crosscall", "unicast"); evcnt_attach_dynamic(&xc_broadcast_ev, EVCNT_TYPE_MISC, NULL, "crosscall", "broadcast"); } /* * Encode an IPL to a form that can be embedded into flags of xc_broadcast * or xc_unicast. */ unsigned int xc_encode_ipl(int ipl) { switch (ipl) { case IPL_SOFTSERIAL: return __SHIFTIN(XC_IPL_SOFTSERIAL, XC_IPL_MASK); /* IPL_SOFT* can be the same value (e.g., on sparc or mips). */ #if IPL_SOFTNET != IPL_SOFTSERIAL case IPL_SOFTNET: return __SHIFTIN(XC_IPL_SOFTNET, XC_IPL_MASK); #endif #if IPL_SOFTBIO != IPL_SOFTNET case IPL_SOFTBIO: return __SHIFTIN(XC_IPL_SOFTBIO, XC_IPL_MASK); #endif #if IPL_SOFTCLOCK != IPL_SOFTBIO case IPL_SOFTCLOCK: return __SHIFTIN(XC_IPL_SOFTCLOCK, XC_IPL_MASK); #endif } panic("Invalid IPL: %d", ipl); } /* * Extract an XC_IPL from flags of xc_broadcast or xc_unicast. */ static inline unsigned int xc_extract_ipl(unsigned int flags) { return __SHIFTOUT(flags, XC_IPL_MASK); } /* * xc_init_cpu: * * Initialize the cross-call subsystem. Called once for each CPU * in the system as they are attached. */ void xc_init_cpu(struct cpu_info *ci) { static bool again = false; int error __diagused; if (!again) { /* Autoconfiguration will prevent re-entry. */ xc_init(); again = true; } cv_init(&ci->ci_data.cpu_xcall, "xcall"); error = kthread_create(PRI_XCALL, KTHREAD_MPSAFE, ci, xc_thread, NULL, NULL, "xcall/%u", ci->ci_index); KASSERT(error == 0); } /* * xc_broadcast: * * Trigger a call on all CPUs in the system.
*/ uint64_t xc_broadcast(unsigned int flags, xcfunc_t func, void *arg1, void *arg2) { KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); ASSERT_SLEEPABLE(); if (__predict_false(!mp_online)) { int s, bound; if (flags & XC_HIGHPRI) s = splsoftserial(); else bound = curlwp_bind(); (*func)(arg1, arg2); if (flags & XC_HIGHPRI) splx(s); else curlwp_bindx(bound); return 0; } if ((flags & XC_HIGHPRI) != 0) { int ipl = xc_extract_ipl(flags); return xc_highpri(func, arg1, arg2, NULL, ipl); } else { return xc_lowpri(func, arg1, arg2, NULL); } } static void xc_nop(void *arg1, void *arg2) { return; } /* * xc_barrier: * * Broadcast a nop to all CPUs in the system. */ void xc_barrier(unsigned int flags) { uint64_t where; where = xc_broadcast(flags, xc_nop, NULL, NULL); xc_wait(where); } /* * xc_unicast: * * Trigger a call on one CPU. */ uint64_t xc_unicast(unsigned int flags, xcfunc_t func, void *arg1, void *arg2, struct cpu_info *ci) { KASSERT(ci != NULL); KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); ASSERT_SLEEPABLE(); if (__predict_false(!mp_online)) { int s, bound; KASSERT(ci == curcpu()); if (flags & XC_HIGHPRI) s = splsoftserial(); else bound = curlwp_bind(); (*func)(arg1, arg2); if (flags & XC_HIGHPRI) splx(s); else curlwp_bindx(bound); return 0; } if ((flags & XC_HIGHPRI) != 0) { int ipl = xc_extract_ipl(flags); return xc_highpri(func, arg1, arg2, ci, ipl); } else { return xc_lowpri(func, arg1, arg2, ci); } } /* * xc_wait: * * Wait for a cross call to complete. */ void xc_wait(uint64_t where) { xc_state_t *xc; KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); ASSERT_SLEEPABLE(); if (__predict_false(!mp_online)) { return; } /* Determine whether it is high or low priority cross-call. */ if ((where & XC_PRI_BIT) != 0) { xc = &xc_high_pri; where &= ~XC_PRI_BIT; } else { xc = &xc_low_pri; } #ifdef __HAVE_ATOMIC64_LOADSTORE /* Fast path, if already done. */ if (atomic_load_acquire(&xc->xc_donep) >= where) { return; } #endif /* Slow path: block until awoken. */ mutex_enter(&xc->xc_lock); while (xc->xc_donep < where) { cv_wait(&xc->xc_busy, &xc->xc_lock); } mutex_exit(&xc->xc_lock); } /* * xc_lowpri: * * Trigger a low priority call on one or more CPUs. */ static inline uint64_t xc_lowpri(xcfunc_t func, void *arg1, void *arg2, struct cpu_info *ci) { xc_state_t *xc = &xc_low_pri; CPU_INFO_ITERATOR cii; uint64_t where; mutex_enter(&xc->xc_lock); while (xc->xc_headp != xc->xc_donep) { cv_wait(&xc->xc_busy, &xc->xc_lock); } xc->xc_arg1 = arg1; xc->xc_arg2 = arg2; xc->xc_func = func; if (ci == NULL) { xc_broadcast_ev.ev_count++; for (CPU_INFO_FOREACH(cii, ci)) { if ((ci->ci_schedstate.spc_flags & SPCF_RUNNING) == 0) continue; xc->xc_headp += 1; ci->ci_data.cpu_xcall_pending = true; cv_signal(&ci->ci_data.cpu_xcall); } } else { xc_unicast_ev.ev_count++; xc->xc_headp += 1; ci->ci_data.cpu_xcall_pending = true; cv_signal(&ci->ci_data.cpu_xcall); } KASSERT(xc->xc_donep < xc->xc_headp); where = xc->xc_headp; mutex_exit(&xc->xc_lock); /* Return a low priority ticket. */ KASSERT((where & XC_PRI_BIT) == 0); return where; } /* * xc_thread: * * One thread per-CPU to dispatch low priority calls. 
*/ static void xc_thread(void *cookie) { struct cpu_info *ci = curcpu(); xc_state_t *xc = &xc_low_pri; void *arg1, *arg2; xcfunc_t func; struct lwp *l = curlwp; KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d", l, l->l_nopreempt); mutex_enter(&xc->xc_lock); for (;;) { while (!ci->ci_data.cpu_xcall_pending) { if (xc->xc_headp == xc->xc_donep) { cv_broadcast(&xc->xc_busy); } cv_wait(&ci->ci_data.cpu_xcall, &xc->xc_lock); KASSERT(ci == curcpu()); } ci->ci_data.cpu_xcall_pending = false; func = xc->xc_func; arg1 = xc->xc_arg1; arg2 = xc->xc_arg2; mutex_exit(&xc->xc_lock); KASSERT(func != NULL); (*func)(arg1, arg2); KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d func %p", l, l->l_nopreempt, func); mutex_enter(&xc->xc_lock); #ifdef __HAVE_ATOMIC64_LOADSTORE atomic_store_release(&xc->xc_donep, xc->xc_donep + 1); #else xc->xc_donep++; #endif } /* NOTREACHED */ } /* * xc_ipi_handler: * * Handler of cross-call IPI. */ void xc_ipi_handler(void) { xc_state_t *xc = & xc_high_pri; KASSERT(xc->xc_ipl < __arraycount(xc_sihs)); KASSERT(xc_sihs[xc->xc_ipl] != NULL); /* Executes xc__highpri_intr() via software interrupt. */ softint_schedule(xc_sihs[xc->xc_ipl]); } /* * xc__highpri_intr: * * A software interrupt handler for high priority calls. */ void xc__highpri_intr(void *dummy) { xc_state_t *xc = &xc_high_pri; void *arg1, *arg2; xcfunc_t func; KASSERTMSG(!cpu_intr_p(), "high priority xcall for function %p", xc->xc_func); /* * Lock-less fetch of function and its arguments. * Safe since it cannot change at this point. */ func = xc->xc_func; arg1 = xc->xc_arg1; arg2 = xc->xc_arg2; KASSERT(func != NULL); (*func)(arg1, arg2); /* * Note the request as done, and if we have reached the head, * cross-call has been processed - notify waiters, if any. */ mutex_enter(&xc->xc_lock); KASSERT(xc->xc_donep < xc->xc_headp); #ifdef __HAVE_ATOMIC64_LOADSTORE atomic_store_release(&xc->xc_donep, xc->xc_donep + 1); #else xc->xc_donep++; #endif if (xc->xc_donep == xc->xc_headp) { cv_broadcast(&xc->xc_busy); } mutex_exit(&xc->xc_lock); } /* * xc_highpri: * * Trigger a high priority call on one or more CPUs. */ static inline uint64_t xc_highpri(xcfunc_t func, void *arg1, void *arg2, struct cpu_info *ci, unsigned int ipl) { xc_state_t *xc = &xc_high_pri; uint64_t where; mutex_enter(&xc->xc_lock); while (xc->xc_headp != xc->xc_donep) { cv_wait(&xc->xc_busy, &xc->xc_lock); } xc->xc_func = func; xc->xc_arg1 = arg1; xc->xc_arg2 = arg2; xc->xc_headp += (ci ? 1 : ncpu); xc->xc_ipl = ipl; where = xc->xc_headp; mutex_exit(&xc->xc_lock); /* * Send the IPI once lock is released. * Note: it will handle the local CPU case. */ #ifdef _RUMPKERNEL rump_xc_highpri(ci); #else #ifdef MULTIPROCESSOR kpreempt_disable(); if (curcpu() == ci) { /* Unicast: local CPU. */ xc_ipi_handler(); } else if (ci) { /* Unicast: remote CPU. */ xc_send_ipi(ci); } else { /* Broadcast: all, including local. */ xc_send_ipi(NULL); xc_ipi_handler(); } kpreempt_enable(); #else KASSERT(ci == NULL || curcpu() == ci); xc_ipi_handler(); #endif #endif /* Indicate a high priority ticket. */ return (where | XC_PRI_BIT); }
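/*
 * Illustrative sketch (not part of subr_xcall.c): the usual pattern for
 * the low priority interface above is to broadcast a function and then
 * wait on the returned ticket.  The example_* names are hypothetical.
 */
static void
example_xc_func(void *arg1, void *arg2)
{
	/* Runs once on every online CPU, in the bound xcall kthread. */
	(void)arg1;
	(void)arg2;
}

static void
example_xc_sync(void)
{
	uint64_t where;

	/* Low priority broadcast (flags 0), serviced by xc_thread(). */
	where = xc_broadcast(0, example_xc_func, NULL, NULL);
	xc_wait(where);

	/* Equivalent "sync only" form provided by xc_barrier() above. */
	xc_barrier(0);
}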
/* $NetBSD: lwp.h,v 1.231 2023/11/02 10:31:55 martin Exp $ */ /* * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010, 2019, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Nathan J. Williams and Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _SYS_LWP_H_ #define _SYS_LWP_H_ #if defined(_KERNEL) || defined(_KMEMUSER) #include <sys/param.h> #include <sys/callout.h> #include <sys/condvar.h> #include <sys/kcpuset.h> #include <sys/mutex.h> #include <sys/queue.h> #include <sys/resource.h> #include <sys/sched.h> #include <sys/signalvar.h> #include <sys/specificdata.h> #include <sys/time.h> #include <sys/wchan.h> #if defined(_KERNEL) struct lwp; /* forward declare this for <machine/cpu.h> so it can get l_cpu. */ static __inline struct cpu_info *lwp_getcpu(struct lwp *); #include <machine/cpu.h> /* curcpu() and cpu_info */ #include <sys/atomic.h> #ifdef _KERNEL_OPT #include "opt_kcov.h" #include "opt_kmsan.h" #include "opt_maxlwp.h" #endif #endif #include <machine/proc.h> /* Machine-dependent proc substruct. */ /* * Lightweight process. Field markings and the corresponding locks: * * a: proc_lock * c: condition variable interlock, passed to cv_wait() * l: *l_mutex * p: l_proc->p_lock * s: spc_mutex, which may or may not be referenced by l_mutex * S: l_selcluster->sc_lock * (: unlocked, stable * !: unlocked, may only be reliably accessed by the LWP itself * * Fields are clustered together by usage (to increase the likelihood * of cache hits) and by size (to reduce dead space in the structure). */ #include <sys/pcu.h> struct lockdebug; struct sysent; struct lwp { /* Must not be zeroed on free. */ struct cpu_info *volatile l_cpu;/* s: CPU we're on if LSONPROC */ kmutex_t * volatile l_mutex; /* l: ptr to mutex on sched state */ struct turnstile *l_ts; /* l: current turnstile */ int l_stat; /* l: overall LWP status */ int l__reserved; /* : padding - reuse as needed */ /* Scheduling and overall state. */ #define l_startzero l_runq TAILQ_ENTRY(lwp) l_runq; /* s: run queue */ union { void * info; /* s: scheduler-specific structure */ u_int timeslice; /* l: time-quantum for SCHED_M2 */ } l_sched; void *l_addr; /* l: PCB address; use lwp_getpcb() */ struct mdlwp l_md; /* l: machine-dependent fields. 
*/ struct bintime l_rtime; /* l: real time */ struct bintime l_stime; /* l: start time (while ONPROC) */ int l_flag; /* l: misc flag values */ u_int l_swtime; /* l: time swapped in or out */ u_int l_rticks; /* l: Saved start time of run */ u_int l_rticksum; /* l: Sum of ticks spent running */ u_int l_slpticks; /* l: Saved start time of sleep */ u_int l_slpticksum; /* l: Sum of ticks spent sleeping */ int l_class; /* l: scheduling class */ pri_t l_boostpri; /* l: boosted priority after blocking */ pri_t l_priority; /* l: scheduler priority */ pri_t l_inheritedprio;/* l: inherited priority */ pri_t l_protectprio; /* l: for PTHREAD_PRIO_PROTECT */ pri_t l_auxprio; /* l: max(inherit,protect) priority */ int l_protectdepth; /* l: for PTHREAD_PRIO_PROTECT */ u_int l_cpticks; /* (: Ticks of CPU time */ psetid_t l_psid; /* l: assigned processor-set ID */ fixpt_t l_pctcpu; /* p: %cpu during l_swtime */ fixpt_t l_estcpu; /* l: cpu time for SCHED_4BSD */ SLIST_HEAD(, turnstile) l_pi_lenders; /* l: ts lending us priority */ struct cpu_info *l_target_cpu; /* l: target CPU to migrate */ struct lwpctl *l_lwpctl; /* p: lwpctl block kernel address */ struct lcpage *l_lcpage; /* p: lwpctl containing page */ kcpuset_t *l_affinity; /* l: CPU set for affinity */ /* Synchronisation. */ const struct syncobj *l_syncobj;/* l: sync object operations set */ LIST_ENTRY(lwp) l_sleepchain; /* l: sleep queue */ wchan_t l_wchan; /* l: sleep address */ const char *l_wmesg; /* l: reason for sleep */ struct sleepq *l_sleepq; /* l: current sleep queue */ callout_t l_timeout_ch; /* !: callout for tsleep */ kcondvar_t l_waitcv; /* a: vfork() wait */ u_int l_slptime; /* l: time since last blocked */ bool l_vforkwaiting; /* a: vfork() waiting */ /* User-space synchronization. */ uintptr_t l_robust_head; /* !: list of robust futexes */ uint32_t l___rsvd1; /* reserved for future use */ #if PCU_UNIT_COUNT > 0 struct cpu_info * volatile l_pcu_cpu[PCU_UNIT_COUNT]; uint32_t l_pcu_valid; #endif /* Process level and global state, misc. */ lwpid_t l_lid; /* (: LWP identifier; local to proc */ LIST_ENTRY(lwp) l_list; /* a: entry on list of all LWPs */ void *l_ctxlink; /* p: uc_link {get,set}context */ struct proc *l_proc; /* p: parent process */ LIST_ENTRY(lwp) l_sibling; /* p: entry on proc's list of LWPs */ char *l_name; /* (: name, optional */ lwpid_t l_waiter; /* p: first LWP waiting on us */ lwpid_t l_waitingfor; /* p: specific LWP we are waiting on */ int l_prflag; /* p: process level flags */ u_int l_refcnt; /* p: reference count on this LWP */ /* State of select() or poll(). */ int l_selflag; /* S: polling state flags */ int l_selret; /* S: return value of select/poll */ SLIST_HEAD(,selinfo) l_selwait; /* S: descriptors waited on */ uintptr_t l_selrec; /* !: argument for selrecord() */ struct selcluster *l_selcluster;/* !: associated cluster data */ void * l_selbits; /* (: select() bit-field */ size_t l_selni; /* (: size of a single bit-field */ /* Signals. */ int l_sigrestore; /* p: need to restore old sig mask */ sigset_t l_sigwaitset; /* p: signals being waited for */ kcondvar_t l_sigcv; /* p: for sigsuspend() */ struct ksiginfo *l_sigwaited; /* p: delivered signals from set */ sigpend_t *l_sigpendset; /* p: XXX issignal()/postsig() baton */ LIST_ENTRY(lwp) l_sigwaiter; /* p: chain on list of waiting LWPs */ stack_t l_sigstk; /* p: sp & on stack state variable */ sigset_t l_sigmask; /* p: signal mask */ sigpend_t l_sigpend; /* p: signals to this LWP */ sigset_t l_sigoldmask; /* p: mask for sigpause */ /* Private data. 
*/ specificdata_reference l_specdataref; /* !: subsystem lwp-specific data */ struct timespec l_ktrcsw; /* !: for ktrace CSW trace XXX */ void *l_private; /* !: svr4-style lwp-private data */ struct lwp *l_switchto; /* !: mi_switch: switch to this LWP */ struct kauth_cred *l_cred; /* !: cached credentials */ struct filedesc *l_fd; /* !: cached copy of proc::p_fd */ void *l_emuldata; /* !: kernel lwp-private data */ struct fstrans_lwp_info *l_fstrans; /* (: fstrans private data */ u_short l_shlocks; /* !: lockdebug: shared locks held */ u_short l_exlocks; /* !: lockdebug: excl. locks held */ u_short l_psrefs; /* !: count of psref held */ u_short l_blcnt; /* !: count of kernel_lock held */ volatile int l_nopreempt; /* !: don't preempt me! */ volatile u_int l_dopreempt; /* s: kernel preemption pending */ int l_pflag; /* !: LWP private flags */ int l_dupfd; /* !: side return from cloning devs XXX */ const struct sysent * volatile l_sysent;/* !: currently active syscall */ struct rusage l_ru; /* !: accounting information */ uint64_t l_pfailtime; /* !: for kernel preemption */ uintptr_t l_pfailaddr; /* !: for kernel preemption */ uintptr_t l_pfaillock; /* !: for kernel preemption */ _TAILQ_HEAD(,struct lockdebug,volatile) l_ld_locks;/* !: locks held by LWP */ volatile void *l_ld_wanted; /* !: lock currently wanted by LWP */ uintptr_t l_rwcallsite; /* !: rwlock actual callsite */ int l_tcgen; /* !: for timecounter removal */ /* These are only used by 'options SYSCALL_TIMES'. */ uint32_t l_syscall_time; /* !: time epoch for current syscall */ uint64_t *l_syscall_counter; /* !: counter for current process */ struct kdtrace_thread *l_dtrace; /* (: DTrace-specific data. */ #ifdef KMSAN void *l_kmsan; /* !: KMSAN private data. */ #endif #ifdef KCOV void *l_kcov; /* !: KCOV private data. */ #endif }; /* * UAREA_PCB_OFFSET: an offset of PCB structure in the uarea. MD code may * define it in <machine/proc.h>, to indicate a different uarea layout. */ #ifndef UAREA_PCB_OFFSET #define UAREA_PCB_OFFSET 0 #endif LIST_HEAD(lwplist, lwp); /* A list of LWPs. */ #ifdef _KERNEL extern struct lwplist alllwp; /* List of all LWPs. */ extern lwp_t lwp0; /* LWP for proc0. */ extern int maxlwp __read_mostly; /* max number of lwps */ #ifndef MAXLWP #define MAXLWP 4096 /* default max */ #endif #ifndef MAXMAXLWP #define MAXMAXLWP 65535 /* absolute max */ #endif #endif #endif /* _KERNEL || _KMEMUSER */ /* * These flags are kept in l_flag, and they are modified only with the LWP * locked. */ #define LW_IDLE 0x00000001 /* Idle lwp. */ #define LW_LWPCTL 0x00000002 /* Adjust lwpctl in userret */ #define LW_STIMO 0x00000040 /* Sleep timed out */ #define LW_SINTR 0x00000080 /* Sleep is interruptible. */ #define LW_CATCHINTR 0x00000100 /* LW_SINTR intent; see sleepq_block(). 
*/ #define LW_SYSTEM 0x00000200 /* Kernel thread */ #define LW_SYSTEM_FPU 0x00000400 /* Kernel thread with vector/FP enabled */ #define LW_DBGSUSPEND 0x00010000 /* Suspend by debugger */ #define LW_WSUSPEND 0x00020000 /* Suspend before return to user */ #define LW_BATCH 0x00040000 /* LWP tends to hog CPU */ #define LW_WCORE 0x00080000 /* Stop for core dump on return to user */ #define LW_WEXIT 0x00100000 /* Exit before return to user */ #define LW_PENDSIG 0x01000000 /* Pending signal for us */ #define LW_CANCELLED 0x02000000 /* tsleep should not sleep */ #define LW_CACHECRED 0x04000000 /* Cache new process credential */ #define LW_WREBOOT 0x08000000 /* System is rebooting, please suspend */ #define LW_UNPARKED 0x10000000 /* Unpark op pending */ #define LW_RUMP_CLEAR 0x40000000 /* Clear curlwp in RUMP scheduler */ #define LW_RUMP_QEXIT 0x80000000 /* LWP should exit ASAP */ /* * The second set of flags is kept in l_pflag, and they are modified only by * the LWP itself, or modified when it's known the LWP cannot be running. * LP_RUNNING is typically updated with the LWP locked, but not always in * the case of soft interrupt handlers. */ #define LP_KTRACTIVE 0x00000001 /* Executing ktrace operation */ #define LP_KTRCSW 0x00000002 /* ktrace context switch marker */ #define LP_KTRCSWUSER 0x00000004 /* ktrace context switch marker */ /* 0x00000008 was LP_PIDLID */ #define LP_OWEUPC 0x00000010 /* Owe user profiling tick */ #define LP_MPSAFE 0x00000020 /* Starts life without kernel_lock */ #define LP_INTR 0x00000040 /* Soft interrupt handler */ #define LP_SYSCTLWRITE 0x00000080 /* sysctl write lock held */ #define LP_MUSTJOIN 0x00000100 /* Must join kthread on exit */ #define LP_SINGLESTEP 0x00000400 /* Single step thread in ptrace(2) */ #define LP_TIMEINTR 0x00010000 /* Time this soft interrupt */ #define LP_PREEMPTING 0x00020000 /* mi_switch called involuntarily */ #define LP_RUNNING 0x20000000 /* Active on a CPU */ #define LP_TELEPORT 0x40000000 /* Teleport to new CPU on preempt() */ #define LP_BOUND 0x80000000 /* Bound to a CPU */ /* * The third set of flags is kept in l_prflag and they are modified only * with p_lock held. */ #define LPR_DETACHED 0x00800000 /* Won't be waited for. */ #define LPR_DRAINING 0x80000000 /* Draining references before exiting */ /* * Mask indicating that there is "exceptional" work to be done on return to * user. */ #define LW_USERRET (LW_WEXIT | LW_PENDSIG | LW_WREBOOT | LW_WSUSPEND \ | LW_WCORE | LW_LWPCTL | LW_CACHECRED) /* * Status values. * * A note about LSRUN and LSONPROC: LSRUN indicates that a process is * runnable but *not* yet running, i.e. is on a run queue. LSONPROC * indicates that the process is actually executing on a CPU, i.e. * it is no longer on a run queue. * * These values are set in stone and must not be reused with future changes. */ #define LSIDL 1 /* Process being created by fork. */ #define LSRUN 2 /* Currently runnable. */ #define LSSLEEP 3 /* Sleeping on an address. */ #define LSSTOP 4 /* Process debugging or suspension. */ #define LSZOMB 5 /* Awaiting collection by parent. */ /* define LSDEAD 6 Process is almost a zombie. (removed in 5.0) */ #define LSONPROC 7 /* Process is currently on a CPU. */ #define LSSUSPENDED 8 /* Not running, not signalable. 
*/ #if defined(_KERNEL) || defined(_KMEMUSER) static __inline void * lwp_getpcb(struct lwp *l) { return l->l_addr; } #endif /* _KERNEL || _KMEMUSER */ #ifdef _KERNEL void lwpinit(void); void lwp0_init(void); void lwp_startup(lwp_t *, lwp_t *); void startlwp(void *); void lwp_lock(lwp_t *); void lwp_unlock(lwp_t *); pri_t lwp_eprio(lwp_t *); int lwp_locked(lwp_t *, kmutex_t *); kmutex_t *lwp_setlock(lwp_t *, kmutex_t *); void lwp_unlock_to(lwp_t *, kmutex_t *); int lwp_trylock(lwp_t *); void lwp_changepri(lwp_t *, pri_t); void lwp_lendpri(lwp_t *, pri_t); void lwp_addref(lwp_t *); void lwp_delref(lwp_t *); void lwp_delref2(lwp_t *); bool lwp_drainrefs(lwp_t *); bool lwp_alive(lwp_t *); lwp_t *lwp_find_first(proc_t *); int lwp_wait(lwp_t *, lwpid_t, lwpid_t *, bool); void lwp_continue(lwp_t *); void lwp_unsleep(lwp_t *, bool); void lwp_unstop(lwp_t *); void lwp_exit(lwp_t *); int lwp_suspend(lwp_t *, lwp_t *); int lwp_create1(lwp_t *, const void *, size_t, u_long, lwpid_t *); void lwp_start(lwp_t *, int); void lwp_migrate(lwp_t *, struct cpu_info *); lwp_t * lwp_find2(pid_t, lwpid_t); lwp_t * lwp_find(proc_t *, int); void lwp_userret(lwp_t *); void lwp_need_userret(lwp_t *); void lwp_free(lwp_t *, bool, bool); long lwp_pctr(void); int lwp_setprivate(lwp_t *, void *); int do_lwp_create(lwp_t *, void *, u_long, lwp_t **, const sigset_t *, const stack_t *); void lwp_thread_cleanup(lwp_t *); void lwpinit_specificdata(void); int lwp_specific_key_create(specificdata_key_t *, specificdata_dtor_t); void lwp_specific_key_delete(specificdata_key_t); void lwp_initspecific(lwp_t *); void lwp_finispecific(lwp_t *); void *lwp_getspecific(specificdata_key_t); #if defined(_LWP_API_PRIVATE) void *_lwp_getspecific_by_lwp(lwp_t *, specificdata_key_t); #endif void lwp_setspecific(specificdata_key_t, void *); void lwp_setspecific_by_lwp(lwp_t *, specificdata_key_t, void *); /* Syscalls. */ int lwp_park(clockid_t, int, struct timespec *); int lwp_unpark(const lwpid_t *, const u_int); /* DDB. */ void lwp_whatis(uintptr_t, void (*)(const char *, ...) __printflike(1, 2)); int lwp_create(lwp_t *, struct proc *, vaddr_t, int, void *, size_t, void (*)(void *), void *, lwp_t **, int, const sigset_t *, const stack_t *); /* * XXX _MODULE * We should provide real stubs for the below that modules can use. */ static __inline void spc_lock(struct cpu_info *ci) { mutex_spin_enter(ci->ci_schedstate.spc_mutex); } static __inline void spc_unlock(struct cpu_info *ci) { mutex_spin_exit(ci->ci_schedstate.spc_mutex); } static __inline void spc_dlock(struct cpu_info *ci1, struct cpu_info *ci2) { struct schedstate_percpu *spc1 = &ci1->ci_schedstate; struct schedstate_percpu *spc2 = &ci2->ci_schedstate; KASSERT(ci1 != ci2); if (ci1 < ci2) { mutex_spin_enter(spc1->spc_mutex); mutex_spin_enter(spc2->spc_mutex); } else { mutex_spin_enter(spc2->spc_mutex); mutex_spin_enter(spc1->spc_mutex); } } /* * Allow machine-dependent code to override curlwp in <machine/cpu.h> for * its own convenience. Otherwise, we declare it as appropriate. */ #if !defined(curlwp) #if defined(MULTIPROCESSOR) #define curlwp curcpu()->ci_curlwp /* Current running LWP */ #else extern struct lwp *curlwp; /* Current running LWP */ #endif /* MULTIPROCESSOR */ #endif /* ! curlwp */ #define curproc (curlwp->l_proc) /* * This provides a way for <machine/cpu.h> to get l_cpu for curlwp before * struct lwp is defined. 
*/ static __inline struct cpu_info * lwp_getcpu(struct lwp *l) { return l->l_cpu; } static __inline bool CURCPU_IDLE_P(void) { struct cpu_info *ci = curcpu(); return ci->ci_onproc == ci->ci_data.cpu_idlelwp; } /* * Disable and re-enable preemption. Only for low-level kernel * use. Device drivers and anything that could potentially be * compiled as a module should use kpreempt_disable() and * kpreempt_enable(). */ static __inline void KPREEMPT_DISABLE(lwp_t *l) { struct lwp *l1 __diagused; KASSERTMSG(l == (l1 = curlwp), "l=%p curlwp=%p", l, l1); l->l_nopreempt++; __insn_barrier(); } static __inline void KPREEMPT_ENABLE(lwp_t *l) { struct lwp *l1 __diagused; KASSERTMSG(l == (l1 = curlwp), "l=%p curlwp=%p", l, l1); KASSERT(l->l_nopreempt > 0); __insn_barrier(); l->l_nopreempt--; __insn_barrier(); if (__predict_false(l->l_dopreempt)) kpreempt(0); } /* For lwp::l_dopreempt */ #define DOPREEMPT_ACTIVE 0x01 #define DOPREEMPT_COUNTED 0x02 /* * Prevent curlwp from migrating between CPUs between curlwp_bind and * curlwp_bindx. One use case is psref(9) that has a contract that * forbids migrations. */ static __inline int curlwp_bind(void) { int bound; bound = curlwp->l_pflag & LP_BOUND; curlwp->l_pflag |= LP_BOUND; __insn_barrier(); return bound; } static __inline void curlwp_bindx(int bound) { KASSERT(curlwp->l_pflag & LP_BOUND); __insn_barrier(); curlwp->l_pflag ^= bound ^ LP_BOUND; } #endif /* _KERNEL */ /* Flags for _lwp_create(), as per Solaris. */ #define LWP_DETACHED 0x00000040 #define LWP_SUSPENDED 0x00000080 /* Kernel-internal flags for LWP creation. */ /* 0x40000000 was LWP_PIDLID */ #define LWP_VFORK 0x80000000 #endif /* !_SYS_LWP_H_ */
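/*
 * Illustrative sketch (not part of lwp.h): curlwp_bind() and
 * curlwp_bindx() above are meant to bracket a region during which the
 * calling LWP must not migrate between CPUs (the contract psref(9)
 * relies on).  The example_* name is hypothetical.
 */
#ifdef _KERNEL
static __inline void
example_curlwp_bound_region(void)
{
	int bound;

	bound = curlwp_bind();		/* set LP_BOUND, remember prior state */
	/* ... touch CPU-local state that must not migrate ... */
	curlwp_bindx(bound);		/* restore the previous LP_BOUND */
}
#endif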
/* $NetBSD: kern_physio.c,v 1.102 2022/07/10 23:11:55 riastradh Exp $ */ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 */ /*- * Copyright (c) 1994 Christopher G. Demetriou * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.102 2022/07/10 23:11:55 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/conf.h> #include <sys/buf.h> #include <sys/proc.h> #include <sys/once.h> #include <sys/workqueue.h> #include <sys/kmem.h> #include <uvm/uvm_extern.h> ONCE_DECL(physio_initialized); struct workqueue *physio_workqueue; int physio_concurrency = 16; /* #define PHYSIO_DEBUG */ #if defined(PHYSIO_DEBUG) #define DPRINTF(a) printf a #else /* defined(PHYSIO_DEBUG) */ #define DPRINTF(a) /* nothing */ #endif /* defined(PHYSIO_DEBUG) */ struct physio_stat { int ps_running; int ps_error; int ps_failed; off_t ps_endoffset; size_t ps_resid; buf_t *ps_orig_bp; kmutex_t ps_lock; kcondvar_t ps_cv; }; static void physio_done(struct work *wk, void *dummy) { struct buf *bp = (void *)wk; size_t todo = bp->b_bufsize; size_t done = bp->b_bcount - bp->b_resid; struct physio_stat *ps = bp->b_private; bool is_iobuf; KASSERT(&bp->b_work == wk); KASSERT(bp->b_bcount <= todo); KASSERT(bp->b_resid <= bp->b_bcount); KASSERT((bp->b_flags & B_PHYS) != 0); KASSERT(dummy == NULL); vunmapbuf(bp, todo); uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo); mutex_enter(&ps->ps_lock); is_iobuf = (bp != ps->ps_orig_bp); if (__predict_false(done != todo)) { off_t endoffset = dbtob(bp->b_blkno) + done; /* * we got an error or hit EOM. * * we only care about the first one. * ie. the one at the lowest offset. */ KASSERT(ps->ps_endoffset != endoffset); DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64 ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n", __func__, bp->b_error, dbtob(bp->b_blkno), endoffset, bp->b_blkno, bp->b_bcount, bp->b_flags)); if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) { DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64 " -> %" PRIu64 "\n", __func__, ps, ps->ps_error, bp->b_error, ps->ps_endoffset, endoffset)); ps->ps_endoffset = endoffset; ps->ps_error = bp->b_error; } ps->ps_failed++; ps->ps_resid += todo - done; } else { KASSERT(bp->b_error == 0); } ps->ps_running--; cv_signal(&ps->ps_cv); mutex_exit(&ps->ps_lock); if (is_iobuf) putiobuf(bp); } static void physio_biodone(struct buf *bp) { #if defined(DIAGNOSTIC) struct physio_stat *ps = bp->b_private; size_t todo = bp->b_bufsize; size_t done = bp->b_bcount - bp->b_resid; KASSERT(ps->ps_running > 0); KASSERT(bp->b_bcount <= todo); KASSERT(bp->b_resid <= bp->b_bcount); if (done == todo) KASSERTMSG(bp->b_error == 0, "error=%d", bp->b_error); #endif /* defined(DIAGNOSTIC) */ workqueue_enqueue(physio_workqueue, &bp->b_work, NULL); } static void physio_wait(struct physio_stat *ps, int n) { KASSERT(mutex_owned(&ps->ps_lock)); while (ps->ps_running > n) cv_wait(&ps->ps_cv, &ps->ps_lock); } static int physio_init(void) { int error; KASSERT(physio_workqueue == NULL); error = workqueue_create(&physio_workqueue, "physiod", physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE); return error; } /* * Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly * from the raw device to user buffers, and bypasses the buffer cache. 
*/ int physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags, void (*min_phys)(struct buf *), struct uio *uio) { struct iovec *iovp; struct lwp *l = curlwp; struct proc *p = l->l_proc; int i, error; struct buf *bp = NULL; struct physio_stat *ps; int concurrency = physio_concurrency - 1; int isdisk; error = RUN_ONCE(&physio_initialized, physio_init); if (__predict_false(error != 0)) { return error; } DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n", __func__, uio->uio_offset, uio->uio_resid)); flags &= B_READ | B_WRITE; ps = kmem_zalloc(sizeof(*ps), KM_SLEEP); /* ps->ps_running = 0; */ /* ps->ps_error = 0; */ /* ps->ps_failed = 0; */ ps->ps_orig_bp = obp; ps->ps_endoffset = -1; ps->ps_resid = 0; mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&ps->ps_cv, "physio"); /* Allow concurrent I/O only for disks */ isdisk = cdev_type(dev) == D_DISK; if (!isdisk) concurrency = 0; /* Make sure we have a buffer, creating one if necessary. */ if (obp != NULL) { mutex_enter(&bufcache_lock); /* Mark it busy, so nobody else will use it. */ while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH) ; mutex_exit(&bufcache_lock); concurrency = 0; /* see "XXXkludge" comment below */ } for (i = 0; i < uio->uio_iovcnt; i++) { bool sync = true; iovp = &uio->uio_iov[i]; while (iovp->iov_len > 0) { size_t todo; vaddr_t endp; mutex_enter(&ps->ps_lock); if (ps->ps_failed != 0) { goto done_locked; } physio_wait(ps, sync ? 0 : concurrency); mutex_exit(&ps->ps_lock); if (obp != NULL) { /* * XXXkludge * some drivers use "obp" as an identifier. */ bp = obp; } else { bp = getiobuf(NULL, true); bp->b_cflags |= BC_BUSY; } bp->b_dev = dev; bp->b_proc = p; bp->b_private = ps; /* * Mark the buffer busy for physical I/O. Also set * B_PHYS because it's an I/O to user memory, and * B_RAW because B_RAW is to be "set by physio for * raw transfers". */ bp->b_oflags = 0; bp->b_cflags |= BC_BUSY; bp->b_flags = flags | B_PHYS | B_RAW; bp->b_iodone = physio_biodone; /* Set up the buffer for a maximum-sized transfer. */ bp->b_blkno = btodb(uio->uio_offset); if (isdisk) { /* * For disks, check that offsets are at least block * aligned, the block addresses are used to track * errors of finished requests. */ if (uio->uio_offset & (DEV_BSIZE - 1)) { error = EINVAL; goto done; } /* * Split request into MAXPHYS chunks */ bp->b_bcount = MIN(MAXPHYS, iovp->iov_len); } else { bp->b_bcount = MIN(INT_MAX, iovp->iov_len); } bp->b_data = iovp->iov_base; /* * Call minphys to bound the transfer size, * and remember the amount of data to transfer, * for later comparison. */ (*min_phys)(bp); todo = bp->b_bufsize = bp->b_bcount; #if defined(DIAGNOSTIC) if (todo > MAXPHYS) panic("todo(%zu) > MAXPHYS; minphys broken", todo); #endif /* defined(DIAGNOSTIC) */ sync = false; endp = (vaddr_t)bp->b_data + todo; if (trunc_page(endp) != endp) { /* * Following requests can overlap. * note that uvm_vslock does round_page. */ sync = true; } /* * Lock the part of the user address space involved * in the transfer. */ error = uvm_vslock(p->p_vmspace, bp->b_data, todo, (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ); if (error) { goto done; } /* * Beware vmapbuf(); if successful it clobbers * b_data and saves it in b_saveaddr. * However, vunmapbuf() restores b_data. */ if ((error = vmapbuf(bp, todo)) != 0) { uvm_vsunlock(p->p_vmspace, bp->b_data, todo); goto done; } BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); mutex_enter(&ps->ps_lock); ps->ps_running++; mutex_exit(&ps->ps_lock); /* Call strategy to start the transfer.
*/ (*strategy)(bp); bp = NULL; iovp->iov_len -= todo; iovp->iov_base = (char *)iovp->iov_base + todo; uio->uio_offset += todo; uio->uio_resid -= todo; } } done: mutex_enter(&ps->ps_lock); done_locked: physio_wait(ps, 0); mutex_exit(&ps->ps_lock); KASSERT(ps->ps_failed || ps->ps_endoffset == -1); /* * Compute residual, for disks adjust for the * lowest numbered block that returned an error. */ if (isdisk) { if (ps->ps_failed != 0) { off_t delta; delta = uio->uio_offset - ps->ps_endoffset; KASSERT(delta > 0); uio->uio_resid += delta; /* uio->uio_offset = ps->ps_endoffset; */ } } else { uio->uio_resid += ps->ps_resid; } if (bp != NULL && bp != obp) { putiobuf(bp); } if (error == 0) { error = ps->ps_error; } mutex_destroy(&ps->ps_lock); cv_destroy(&ps->ps_cv); kmem_free(ps, sizeof(*ps)); /* * Clean up the state of the buffer. Remember if somebody wants * it, so we can wake them up below. Also, if we had to steal it, * give it back. */ if (obp != NULL) { KASSERT((obp->b_cflags & BC_BUSY) != 0); /* * If another process is waiting for the raw I/O buffer, * wake up processes waiting to do physical I/O; */ mutex_enter(&bufcache_lock); obp->b_cflags &= ~(BC_BUSY | BC_WANTED); obp->b_flags &= ~(B_PHYS | B_RAW); obp->b_iodone = NULL; cv_broadcast(&obp->b_busy); mutex_exit(&bufcache_lock); } DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n", __func__, uio->uio_offset, uio->uio_resid)); return error; } /* * A minphys() routine is called by physio() to adjust the size of each * I/O transfer before the latter is passed to the strategy routine. * * This minphys() is a default that must be called to enforce limits * that are applicable to all devices, because of limitations in the * kernel or the hardware platform. */ void minphys(struct buf *bp) { if (bp->b_bcount > MAXPHYS) bp->b_bcount = MAXPHYS; }
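/*
 * Illustrative sketch (not part of kern_physio.c): a raw character
 * device read routine typically just hands its strategy routine to
 * physio(), passing NULL for the buffer so physio() allocates iobufs
 * and, for disks, issues transfers concurrently.  The example_* names
 * are hypothetical.
 */
static void example_strategy(struct buf *);	/* hypothetical driver strategy */

static int
example_dev_read(dev_t dev, struct uio *uio, int ioflag)
{
	(void)ioflag;
	/* B_READ: data moves from the device into the user's buffers. */
	return physio(example_strategy, NULL, dev, B_READ, minphys, uio);
}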
/* $NetBSD: keccak.c,v 1.1 2017/11/30 05:47:24 riastradh Exp $ */ /*- * Copyright (c) 2015 Taylor R. Campbell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> #if defined(_KERNEL) || defined(_STANDALONE) __KERNEL_RCSID(0, "$NetBSD: keccak.c,v 1.1 2017/11/30 05:47:24 riastradh Exp $"); #include <sys/types.h> #else __RCSID("$NetBSD: keccak.c,v 1.1 2017/11/30 05:47:24 riastradh Exp $"); #include <stdint.h> #endif #include "keccak.h" #define secret /* can't use in variable-time operations, should zero */ #define FOR5(X, STMT) do \ { \ (X) = 0; STMT; \ (X) = 1; STMT; \ (X) = 2; STMT; \ (X) = 3; STMT; \ (X) = 4; STMT; \ } while (0) static inline secret uint64_t rol64(secret uint64_t v, unsigned c) { return ((v << c) | (v >> (64 - c))); } static inline void keccakf1600_theta(secret uint64_t A[25]) { secret uint64_t C0, C1, C2, C3, C4; unsigned y; C0 = C1 = C2 = C3 = C4 = 0; FOR5(y, { C0 ^= A[0 + 5*y]; C1 ^= A[1 + 5*y]; C2 ^= A[2 + 5*y]; C3 ^= A[3 + 5*y]; C4 ^= A[4 + 5*y]; }); FOR5(y, { A[0 + 5*y] ^= C4 ^ rol64(C1, 1); A[1 + 5*y] ^= C0 ^ rol64(C2, 1); A[2 + 5*y] ^= C1 ^ rol64(C3, 1); A[3 + 5*y] ^= C2 ^ rol64(C4, 1); A[4 + 5*y] ^= C3 ^ rol64(C0, 1); }); } static inline void keccakf1600_rho_pi(secret uint64_t A[25]) { secret uint64_t T, U; /* * Permute by (x,y) |---> (y, 2x + 3y mod 5) starting at (1,0), * rotate the ith element by (i + 1)(i + 2)/2 mod 64.
*/ U = A[ 1]; T = U; U = A[10]; A[10] = rol64(T, 1); T = U; U = A[ 7]; A[ 7] = rol64(T, 3); T = U; U = A[11]; A[11] = rol64(T, 6); T = U; U = A[17]; A[17] = rol64(T, 10); T = U; U = A[18]; A[18] = rol64(T, 15); T = U; U = A[ 3]; A[ 3] = rol64(T, 21); T = U; U = A[ 5]; A[ 5] = rol64(T, 28); T = U; U = A[16]; A[16] = rol64(T, 36); T = U; U = A[ 8]; A[ 8] = rol64(T, 45); T = U; U = A[21]; A[21] = rol64(T, 55); T = U; U = A[24]; A[24] = rol64(T, 2); T = U; U = A[ 4]; A[ 4] = rol64(T, 14); T = U; U = A[15]; A[15] = rol64(T, 27); T = U; U = A[23]; A[23] = rol64(T, 41); T = U; U = A[19]; A[19] = rol64(T, 56); T = U; U = A[13]; A[13] = rol64(T, 8); T = U; U = A[12]; A[12] = rol64(T, 25); T = U; U = A[ 2]; A[ 2] = rol64(T, 43); T = U; U = A[20]; A[20] = rol64(T, 62); T = U; U = A[14]; A[14] = rol64(T, 18); T = U; U = A[22]; A[22] = rol64(T, 39); T = U; U = A[ 9]; A[ 9] = rol64(T, 61); T = U; U = A[ 6]; A[ 6] = rol64(T, 20); T = U; A[ 1] = rol64(T, 44); } static inline void keccakf1600_chi(secret uint64_t A[25]) { secret uint64_t B0, B1, B2, B3, B4; unsigned y; FOR5(y, { B0 = A[0 + 5*y]; B1 = A[1 + 5*y]; B2 = A[2 + 5*y]; B3 = A[3 + 5*y]; B4 = A[4 + 5*y]; A[0 + 5*y] ^= ~B1 & B2; A[1 + 5*y] ^= ~B2 & B3; A[2 + 5*y] ^= ~B3 & B4; A[3 + 5*y] ^= ~B4 & B0; A[4 + 5*y] ^= ~B0 & B1; }); } static void keccakf1600_round(secret uint64_t A[25]) { keccakf1600_theta(A); keccakf1600_rho_pi(A); keccakf1600_chi(A); } void keccakf1600(secret uint64_t A[25]) { /* * RC[i] = \sum_{j = 0,...,6} rc(j + 7i) 2^(2^j - 1), * rc(t) = (x^t mod x^8 + x^6 + x^5 + x^4 + 1) mod x in GF(2)[x] */ static const uint64_t RC[24] = { 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL, 0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, 0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL, 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL, }; unsigned i; for (i = 0; i < 24; i++) { keccakf1600_round(A); A[0] ^= RC[i]; } }
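/*
 * Editor's note -- standalone verification sketch, not part of keccak.c:
 * it regenerates the rho/pi schedule and the RC[] round constants from
 * the two formulas quoted in the comments above, so the hard-coded
 * rotation counts and the RC table can be cross-checked.  It assumes the
 * FIPS 202 definition of rc(t) as an 8-bit LFSR; build it as an ordinary
 * user-space program.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* rc(t) = coefficient of x^0 in x^t mod (x^8 + x^6 + x^5 + x^4 + 1) over GF(2) */
static unsigned
rc(unsigned t)
{
	uint16_t R = 1;
	unsigned i;

	for (i = 1; i <= t % 255; i++) {
		R <<= 1;
		if (R & 0x100)
			R ^= 0x171;	/* reduce: x^8 = x^6 + x^5 + x^4 + 1 */
	}
	return R & 1;
}

int
main(void)
{
	unsigned x = 1, y = 0, i, j;

	/* pi: (x,y) -> (y, 2x + 3y mod 5) from (1,0); rho: (i+1)(i+2)/2 mod 64 */
	for (i = 0; i < 24; i++) {
		unsigned nx = y, ny = (2*x + 3*y) % 5;

		printf("step %2u: A[%2u] <- rol64(A[%2u], %2u)\n",
		    i, nx + 5*ny, x + 5*y, ((i + 1)*(i + 2)/2) % 64);
		x = nx;
		y = ny;
	}

	/* RC[i] = sum_{j=0..6} rc(j + 7i) * 2^(2^j - 1) */
	for (i = 0; i < 24; i++) {
		uint64_t RC = 0;

		for (j = 0; j <= 6; j++)
			if (rc(7*i + j))
				RC |= (uint64_t)1 << ((1u << j) - 1);
		printf("RC[%2u] = 0x%016" PRIx64 "\n", i, RC);
	}
	return 0;
}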
/* $NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $ */ /* * Copyright 2000 Marshall Kirk
McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 * * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" #include "opt_quota.h" #endif #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/conf.h> #include <sys/buf.h> #include <sys/proc.h> #include <sys/namei.h> #include <sys/sched.h> #include <sys/stat.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/resource.h> #include <sys/resourcevar.h> #include <sys/vnode.h> #include <sys/kauth.h> #include <sys/fstrans.h> #include <sys/wapbl.h> #include <miscfs/specfs/specdev.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_bswap.h> #include <ufs/ufs/ufs_wapbl.h> #include <ufs/ffs/fs.h> #include <ufs/ffs/ffs_extern.h> #include <uvm/uvm.h> TAILQ_HEAD(inodelst, inode); /* List of active snapshots */ struct snap_info { kmutex_t si_lock; /* Lock this snapinfo */ kmutex_t si_snaplock; /* Snapshot vnode common lock */ lwp_t *si_owner; /* Snaplock owner */ struct inodelst si_snapshots; /* List of active snapshots */ daddr_t *si_snapblklist; /* Snapshot block hints list */ uint32_t si_gen; /* Incremented on change */ }; #if !defined(FFS_NO_SNAPSHOT) typedef int (*acctfunc_t) (struct vnode *, void *, int, int, struct fs *, daddr_t, int); static int snapshot_setup(struct mount *, struct vnode *); static int snapshot_copyfs(struct mount *, struct vnode *, void **); static int snapshot_expunge(struct mount *, struct vnode *, struct fs *, daddr_t *, daddr_t **); static int snapshot_expunge_snap(struct mount *, struct vnode *, struct fs *, daddr_t); static int snapshot_writefs(struct mount *, struct vnode *, void *); static int cgaccount(struct vnode *, int, int *); static int cgaccount1(int, struct vnode *, void *, int); static int expunge(struct vnode *, struct inode *, struct fs *, acctfunc_t, 
int); static int indiracct(struct vnode *, struct vnode *, int, daddr_t, daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int); static int fullacct(struct vnode *, void *, int, int, struct fs *, daddr_t, int); static int snapacct(struct vnode *, void *, int, int, struct fs *, daddr_t, int); static int mapacct(struct vnode *, void *, int, int, struct fs *, daddr_t, int); #endif /* !defined(FFS_NO_SNAPSHOT) */ static int ffs_copyonwrite(void *, struct buf *, bool); static int snapblkaddr(struct vnode *, daddr_t, daddr_t *); static int rwfsblk(struct vnode *, int, void *, daddr_t); static int syncsnap(struct vnode *); static int wrsnapblk(struct vnode *, void *, daddr_t); #if !defined(FFS_NO_SNAPSHOT) static int blocks_in_journal(struct fs *); #endif static inline bool is_active_snapshot(struct snap_info *, struct inode *); static inline daddr_t db_get(struct inode *, int); static inline void db_assign(struct inode *, int, daddr_t); static inline daddr_t ib_get(struct inode *, int); static inline daddr_t idb_get(struct inode *, void *, int); static inline void idb_assign(struct inode *, void *, int, daddr_t); #ifdef DEBUG static int snapdebug = 0; #endif int ffs_snapshot_init(struct ufsmount *ump) { struct snap_info *si; si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP); TAILQ_INIT(&si->si_snapshots); mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE); mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE); si->si_owner = NULL; si->si_gen = 0; si->si_snapblklist = NULL; return 0; } void ffs_snapshot_fini(struct ufsmount *ump) { struct snap_info *si; si = ump->um_snapinfo; ump->um_snapinfo = NULL; KASSERT(TAILQ_EMPTY(&si->si_snapshots)); mutex_destroy(&si->si_lock); mutex_destroy(&si->si_snaplock); KASSERT(si->si_snapblklist == NULL); kmem_free(si, sizeof(*si)); } /* * Create a snapshot file and initialize it for the filesystem. * Vnode is locked on entry and return. */ int ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime) { #if defined(FFS_NO_SNAPSHOT) return EOPNOTSUPP; } #else /* defined(FFS_NO_SNAPSHOT) */ bool suspended = false; int error, redo = 0, snaploc; void *sbbuf = NULL; daddr_t *snaplist = NULL, snaplistsize = 0; struct buf *bp, *nbp; struct fs *copy_fs = NULL; struct fs *fs = VFSTOUFS(mp)->um_fs; struct inode *ip = VTOI(vp); struct lwp *l = curlwp; struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; struct timespec ts; struct timeval starttime; #ifdef DEBUG struct timeval endtime; #endif struct vnode *devvp = ip->i_devvp; /* * If the vnode already is a snapshot, return. */ if ((ip->i_flags & SF_SNAPSHOT)) { if ((ip->i_flags & SF_SNAPINVAL)) return EINVAL; if (ctime) { ctime->tv_sec = DIP(ip, mtime); ctime->tv_nsec = DIP(ip, mtimensec); } return 0; } /* * Check for free snapshot slot in the superblock. */ for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Prepare the vnode to become a snapshot. */ error = snapshot_setup(mp, vp); if (error) goto out; /* * Copy all the cylinder group maps. Although the * filesystem is still active, we hope that only a few * cylinder groups will change between now and when we * suspend operations. Thus, we will be able to quickly * touch up the few cylinder groups that changed during * the suspension period. 
*/ error = cgaccount(vp, 1, NULL); if (error) goto out; /* * snapshot is now valid */ ip->i_flags &= ~SF_SNAPINVAL; DIP_ASSIGN(ip, flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * Ensure that the snapshot is completely on disk. * Since we have marked it as a snapshot it is safe to * unlock it as no process will be allowed to write to it. */ error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); if (error) goto out; VOP_UNLOCK(vp); /* * All allocations are done, so we can now suspend the filesystem. */ error = vfs_suspend(vp->v_mount, 0); if (error == 0) { suspended = true; vrele_flush(vp->v_mount); error = VFS_SYNC(vp->v_mount, MNT_WAIT, curlwp->l_cred); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error) goto out; getmicrotime(&starttime); /* * First, copy all the cylinder group maps that have changed. */ error = cgaccount(vp, 2, &redo); if (error) goto out; /* * Create a copy of the superblock and its summary information. */ error = snapshot_copyfs(mp, vp, &sbbuf); if (error) goto out; copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc)); /* * Expunge unlinked files from our view. */ error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist); if (error) goto out; /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list. */ if (ip->i_nlink > 0) fs->fs_snapinum[snaploc] = ip->i_number; mutex_enter(&si->si_lock); if (is_active_snapshot(si, ip)) panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number); TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); if (TAILQ_FIRST(&si->si_snapshots) == ip) { /* * If this is the first snapshot on this filesystem, put the * preliminary list in place and establish the cow handler. */ si->si_snapblklist = snaplist; fscow_establish(mp, ffs_copyonwrite, devvp); } si->si_gen++; mutex_exit(&si->si_lock); vp->v_vflag |= VV_SYSTEM; /* * Set the mtime to the time the snapshot has been taken. */ TIMEVAL_TO_TIMESPEC(&starttime, &ts); if (ctime) *ctime = ts; DIP_ASSIGN(ip, mtime, ts.tv_sec); DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * Copy allocation information from all snapshots and then * expunge them from our view. */ error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize); if (error) goto out; /* * Write the superblock and its summary information to the snapshot. */ error = snapshot_writefs(mp, vp, sbbuf); if (error) goto out; /* * We're nearly done, ensure that the snapshot is completely on disk. */ error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); if (error) goto out; /* * Invalidate and free all pages on the snapshot vnode. * We will read and write through the buffercache. */ rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); error = VOP_PUTPAGES(vp, 0, 0, PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE); if (error) goto out; /* * Invalidate short ( < fs_bsize ) buffers. We will always read * full size buffers later. 
*/ mutex_enter(&bufcache_lock); KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { nbp = LIST_NEXT(bp, b_vnbufs); if (bp->b_bcount == fs->fs_bsize) continue; error = bbusy(bp, false, 0, NULL); if (error != 0) { if (error == EPASSTHROUGH) { nbp = LIST_FIRST(&vp->v_cleanblkhd); continue; } break; } brelsel(bp, BC_INVAL | BC_VFLUSH); } mutex_exit(&bufcache_lock); out: if (sbbuf != NULL) { free(copy_fs->fs_csp, M_UFSMNT); free(sbbuf, M_UFSMNT); } if (fs->fs_active != NULL) { free(fs->fs_active, M_DEVBUF); fs->fs_active = NULL; } mutex_enter(&si->si_lock); if (snaplist != NULL) { if (si->si_snapblklist == snaplist) si->si_snapblklist = NULL; free(snaplist, M_UFSMNT); } if (error) { fs->fs_snapinum[snaploc] = 0; } else { /* * As this is the newest list, it is the most inclusive, so * should replace the previous list. */ si->si_snapblklist = ip->i_snapblklist; } si->si_gen++; mutex_exit(&si->si_lock); if (suspended) { VOP_UNLOCK(vp); vfs_resume(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef DEBUG getmicrotime(&endtime); timersub(&endtime, &starttime, &endtime); printf("%s: suspended %lld.%03d sec, redo %d of %d\n", mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec, endtime.tv_usec / 1000, redo, fs->fs_ncg); #endif } if (error) { if (UFS_WAPBL_BEGIN(mp) == 0) { /* * We depend on ffs_truncate() to call ffs_snapremove() * before it may return an error. On failed * ffs_truncate() we have normal file with leaked * (meta-) data, but no snapshot to use. */ (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); UFS_WAPBL_END(mp); } } else if (ip->i_nlink > 0) vref(vp); return (error); } /* * Prepare vnode to become a snapshot. */ static int snapshot_setup(struct mount *mp, struct vnode *vp) { int error, n, len, loc, cg; daddr_t blkno, numblks; struct buf *ibp, *nbp; struct fs *fs = VFSTOUFS(mp)->um_fs; struct lwp *l = curlwp; const int wbreak = blocks_in_journal(fs)/8; struct inode *ip = VTOI(vp); /* * Check mount, readonly reference and owner. */ if (vp->v_mount != mp) return EXDEV; if (vp->v_writecount != 0) return EBUSY; error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT, 0, mp, vp, NULL); if (error) return EACCES; /* * Must completely truncate the file here. Allocated * blocks on a snapshot mean that block has been copied * on write, see ffs_copyonwrite() testing "blkno != 0" */ error = ufs_truncate_all(vp); if (error) return error; /* Change inode to snapshot type file. */ error = UFS_WAPBL_BEGIN(mp); if (error) return error; #if defined(QUOTA) || defined(QUOTA2) /* snapshot inodes are not accounted in quotas */ chkiq(ip, -1, l->l_cred, 0); #endif ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL); DIP_ASSIGN(ip, flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; ffs_update(vp, NULL, NULL, UPDATE_WAIT); UFS_WAPBL_END(mp); KASSERT(ip->i_flags & SF_SNAPSHOT); /* * Write an empty list of preallocated blocks to the end of * the snapshot to set size to at least that of the filesystem. */ numblks = howmany(fs->fs_size, fs->fs_frag); blkno = 1; blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs)); error = vn_rdwr(UIO_WRITE, vp, (void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks), UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); if (error) return error; /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. 
We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks and mark all of them as not * needing to be copied. */ error = UFS_WAPBL_BEGIN(mp); if (error) return error; for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) { error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno), fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); if (error) goto out; brelse(ibp, 0); if (wbreak > 0 && (++n % wbreak) == 0) { UFS_WAPBL_END(mp); error = UFS_WAPBL_BEGIN(mp); if (error) return error; } } /* * Allocate copies for the superblock and its summary information. */ error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred, 0, &nbp); if (error) goto out; bawrite(nbp); blkno = ffs_fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, l->l_cred, 0, &nbp); if (error) goto out; bawrite(nbp); if (wbreak > 0 && (++n % wbreak) == 0) { UFS_WAPBL_END(mp); error = UFS_WAPBL_BEGIN(mp); if (error) return error; } } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, l->l_cred, 0, &nbp); if (error) goto out; bawrite(nbp); if (wbreak > 0 && (++n % wbreak) == 0) { UFS_WAPBL_END(mp); error = UFS_WAPBL_BEGIN(mp); if (error) return error; } } out: UFS_WAPBL_END(mp); return error; } /* * Create a copy of the superblock and its summary information. * It is up to the caller to free copyfs and copy_fs->fs_csp. */ static int snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf) { int error, i, len, loc, size; void *space; int32_t *lp; struct buf *bp; struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; struct vnode *devvp = VTOI(vp)->i_devvp; /* * Grab a copy of the superblock and its summary information. * We delay writing it until the suspension is released below. */ *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); loc = ffs_blkoff(fs, fs->fs_sblockloc); if (loc > 0) memset(*sbbuf, 0, loc); copyfs = (struct fs *)((char *)(*sbbuf) + loc); memcpy(copyfs, fs, fs->fs_sbsize); size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; if (fs->fs_sbsize < size) memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0, size - fs->fs_sbsize); size = ffs_blkroundup(fs, fs->fs_cssize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); space = malloc(size, M_UFSMNT, M_WAITOK); copyfs->fs_csp = space; memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize); space = (char *)space + fs->fs_cssize; loc = howmany(fs->fs_cssize, fs->fs_fsize); i = fs->fs_frag - loc % fs->fs_frag; len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; if (len > 0) { if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc), len, 0, &bp)) != 0) { free(copyfs->fs_csp, M_UFSMNT); free(*sbbuf, M_UFSMNT); *sbbuf = NULL; return error; } memcpy(space, bp->b_data, (u_int)len); space = (char *)space + len; brelse(bp, BC_INVAL | BC_NOCACHE); } if (fs->fs_contigsumsize > 0) { copyfs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } if (mp->mnt_wapbl) copyfs->fs_flags &= ~FS_DOWAPBL; return 0; } struct snapshot_expunge_ctx { struct vnode *logvp; struct vnode *vp; struct fs *copy_fs; }; static bool snapshot_expunge_selector(void *cl, struct vnode *xvp) { struct snapshot_expunge_ctx *c = cl; struct inode *xp; KASSERT(mutex_owned(xvp->v_interlock)); xp = VTOI(xvp); if (xvp->v_type == VNON || VTOI(xvp) == NULL || (xp->i_flags & SF_SNAPSHOT)) return false; #ifdef DEBUG if (snapdebug) vprint("ffs_snapshot: busy vnode", xvp); #endif if (xvp == c->logvp) return true; if (xp->i_nlink > 0) return false; if (ffs_checkfreefile(c->copy_fs, c->vp, xp->i_number)) return false; return true; } /* * We must check for active files that have been unlinked (e.g., with a zero * link count). We have to expunge all trace of these files from the snapshot * so that they are not reclaimed prematurely by fsck or unnecessarily dumped. * Note that we skip unlinked snapshot files as they will be handled separately. * Calculate the snapshot list size and create a preliminary list. */ static int snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs, daddr_t *snaplistsize, daddr_t **snaplist) { int cg, error = 0, len, loc; daddr_t blkno, *blkp; struct fs *fs = VFSTOUFS(mp)->um_fs; struct inode *xp; struct vnode *logvp = NULL, *xvp; struct vnode_iterator *marker; struct snapshot_expunge_ctx ctx; *snaplist = NULL; /* * Get the log inode if any. */ if ((fs->fs_flags & FS_DOWAPBL) && fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { error = VFS_VGET(mp, fs->fs_journallocs[UFS_WAPBL_INFS_INO], LK_EXCLUSIVE, &logvp); if (error) goto out; } /* * We also calculate the needed size for the snapshot list. */ *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; vfs_vnode_iterator_init(mp, &marker); ctx.logvp = logvp; ctx.vp = vp; ctx.copy_fs = copy_fs; while ((xvp = vfs_vnode_iterator_next(marker, snapshot_expunge_selector, &ctx))) { /* * If there is a fragment, clear it here. */ xp = VTOI(xvp); blkno = 0; loc = howmany(xp->i_size, fs->fs_bsize) - 1; if (loc < UFS_NDADDR) { len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size)); if (len > 0 && len < fs->fs_bsize) { error = UFS_WAPBL_BEGIN(mp); if (error) { vrele(xvp); vfs_vnode_iterator_destroy(marker); goto out; } ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc), len, xp->i_number); blkno = db_get(xp, loc); db_assign(xp, loc, 0); UFS_WAPBL_END(mp); } } *snaplistsize += 1; error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY); if (blkno) db_assign(xp, loc, blkno); if (!error) { error = UFS_WAPBL_BEGIN(mp); if (!error) { error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode); UFS_WAPBL_END(mp); } } vrele(xvp); if (error) { vfs_vnode_iterator_destroy(marker); goto out; } } vfs_vnode_iterator_destroy(marker); /* * Create a preliminary list of preallocated snapshot blocks. 
*/ *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); blkp = &(*snaplist)[1]; *blkp++ = ffs_lblkno(fs, fs->fs_sblockloc); blkno = ffs_fragstoblks(fs, fs->fs_csaddr); for (cg = 0; cg < fs->fs_ncg; cg++) { if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno) break; *blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg)); } len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) *blkp++ = blkno + loc; for (; cg < fs->fs_ncg; cg++) *blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg)); (*snaplist)[0] = blkp - &(*snaplist)[0]; out: if (logvp != NULL) vput(logvp); if (error && *snaplist != NULL) { free(*snaplist, M_UFSMNT); *snaplist = NULL; } return error; } /* * Copy allocation information from all the snapshots in this snapshot and * then expunge them from its view. Also, collect the list of allocated * blocks in i_snapblklist. */ static int snapshot_expunge_snap(struct mount *mp, struct vnode *vp, struct fs *copy_fs, daddr_t snaplistsize) { int error = 0, i; daddr_t numblks, *snaplist = NULL; struct fs *fs = VFSTOUFS(mp)->um_fs; struct inode *ip = VTOI(vp), *xp; struct lwp *l = curlwp; struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) { if (xp != ip) { error = expunge(vp, xp, fs, snapacct, BLK_SNAP); if (error) break; } if (xp->i_nlink != 0) continue; error = UFS_WAPBL_BEGIN(mp); if (error) break; error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode); UFS_WAPBL_END(mp); if (error) break; } if (error) goto out; /* * Allocate space for the full list of preallocated snapshot blocks. */ snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); ip->i_snapblklist = &snaplist[1]; /* * Expunge the blocks used by the snapshots from the set of * blocks marked as used in the snapshot bitmaps. Also, collect * the list of allocated blocks in i_snapblklist. */ error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP); if (error) goto out; if (snaplistsize < ip->i_snapblklist - snaplist) panic("ffs_snapshot: list too small"); snaplistsize = ip->i_snapblklist - snaplist; snaplist[0] = snaplistsize; ip->i_snapblklist = &snaplist[0]; /* * Write out the list of allocated blocks to the end of the snapshot. */ numblks = howmany(fs->fs_size, fs->fs_frag); for (i = 0; i < snaplistsize; i++) snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist, snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks), UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL); for (i = 0; i < snaplistsize; i++) snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); out: if (error && snaplist != NULL) { free(snaplist, M_UFSMNT); ip->i_snapblklist = NULL; } return error; } /* * Write the superblock and its summary information to the snapshot. * Make sure, the first UFS_NDADDR blocks get copied to the snapshot. */ static int snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf) { int error, len, loc; void *space; daddr_t blkno; struct buf *bp; struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; struct inode *ip = VTOI(vp); struct lwp *l = curlwp; copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc)); /* * Write the superblock and its summary information * to the snapshot. 
*/ blkno = ffs_fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); space = copyfs->fs_csp; #ifdef FFS_EI if (UFS_FSNEEDSWAP(fs)) { ffs_sb_swap(copyfs, copyfs); ffs_csum_swap(space, space, fs->fs_cssize); } #endif error = UFS_WAPBL_BEGIN(mp); if (error) return error; for (loc = 0; loc < len; loc++) { error = bread(vp, blkno + loc, fs->fs_bsize, B_MODIFY, &bp); if (error) { break; } memcpy(bp->b_data, space, fs->fs_bsize); space = (char *)space + fs->fs_bsize; bawrite(bp); } if (error) goto out; error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, B_MODIFY, &bp); if (error) { goto out; } else { memcpy(bp->b_data, sbbuf, fs->fs_bsize); bawrite(bp); } /* * Copy the first UFS_NDADDR blocks to the snapshot so * ffs_copyonwrite() and ffs_snapblkfree() will always work on * indirect blocks. */ for (loc = 0; loc < UFS_NDADDR; loc++) { if (db_get(ip, loc) != 0) continue; error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc), fs->fs_bsize, l->l_cred, 0, &bp); if (error) break; error = rwfsblk(vp, B_READ, bp->b_data, loc); if (error) { brelse(bp, 0); break; } bawrite(bp); } out: UFS_WAPBL_END(mp); return error; } /* * Copy all cylinder group maps. */ static int cgaccount(struct vnode *vp, int passno, int *redo) { int cg, error = 0; struct buf *nbp; struct fs *fs = VTOI(vp)->i_fs; if (redo != NULL) *redo = 0; if (passno == 1) fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY), M_DEVBUF, M_WAITOK | M_ZERO); for (cg = 0; cg < fs->fs_ncg; cg++) { if (passno == 2 && ACTIVECG_ISSET(fs, cg)) continue; if (redo != NULL) *redo += 1; error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) return error; error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, curlwp->l_cred, 0, &nbp); if (error) { UFS_WAPBL_END(vp->v_mount); break; } error = cgaccount1(cg, vp, nbp->b_data, passno); bawrite(nbp); UFS_WAPBL_END(vp->v_mount); if (error) break; } return error; } /* * Copy a cylinder group map. All the unallocated blocks are marked * BLK_NOCOPY so that the snapshot knows that it need not copy them * if they are later written. If passno is one, then this is a first * pass, so only setting needs to be done. If passno is 2, then this * is a revision to a previous pass which must be undone as the * replacement pass is done. 
*/ static int cgaccount1(int cg, struct vnode *vp, void *data, int passno) { struct buf *bp, *ibp; struct inode *ip; struct cg *cgp; struct fs *fs; struct lwp *l = curlwp; daddr_t base, numblks; int error, len, loc, ns __unused, indiroff; ip = VTOI(vp); fs = ip->i_fs; ns = UFS_FSNEEDSWAP(fs); error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, 0, &bp); if (error) { return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp, ns)) { brelse(bp, 0); return (EIO); } ACTIVECG_SET(fs, cg); memcpy(data, bp->b_data, fs->fs_cgsize); brelse(bp, 0); if (fs->fs_cgsize < fs->fs_bsize) memset((char *)data + fs->fs_cgsize, 0, fs->fs_bsize - fs->fs_cgsize); numblks = howmany(fs->fs_size, fs->fs_frag); len = howmany(fs->fs_fpg, fs->fs_frag); base = cgbase(fs, cg) / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < UFS_NDADDR) { for ( ; loc < UFS_NDADDR; loc++) { if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) db_assign(ip, loc, BLK_NOCOPY); else if (db_get(ip, loc) == BLK_NOCOPY) { if (passno == 2) db_assign(ip, loc, 0); else if (passno == 1) panic("ffs_snapshot: lost direct block"); } } } if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) return (error); indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= FFS_NINDIR(fs)) { bawrite(ibp); if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) return (error); indiroff = 0; } if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { if (passno == 2) idb_assign(ip, ibp->b_data, indiroff, 0); else if (passno == 1) panic("ffs_snapshot: lost indirect block"); } } bdwrite(ibp); return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. */ static int expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, acctfunc_t acctfunc, int expungetype) { int i, error, ns __unused; daddr_t lbn, rlbn; daddr_t len, blkno, numblks, blksperindir; struct ufs1_dinode *dip1; struct ufs2_dinode *dip2; struct lwp *l = curlwp; void *bap; struct buf *bp; struct mount *mp; ns = UFS_FSNEEDSWAP(fs); mp = snapvp->v_mount; error = UFS_WAPBL_BEGIN(mp); if (error) return error; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); error = snapblkaddr(snapvp, lbn, &blkno); if (error) return error; if (blkno != 0) { error = bread(snapvp, lbn, fs->fs_bsize, B_MODIFY, &bp); } else { error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize, l->l_cred, 0, &bp); if (! error) error = rwfsblk(snapvp, B_READ, bp->b_data, lbn); } if (error) { UFS_WAPBL_END(mp); return error; } /* * Set a snapshot inode to be a zero length file, regular files * or unlinked snapshots to be completely unallocated. 
*/ if (fs->fs_magic == FS_UFS1_MAGIC) { dip1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (cancelip->i_flags & SF_SNAPSHOT) { dip1->di_flags = ufs_rw32(ufs_rw32(dip1->di_flags, ns) | SF_SNAPINVAL, ns); } if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) dip1->di_mode = 0; dip1->di_size = 0; dip1->di_blocks = 0; memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t)); } else { dip2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (cancelip->i_flags & SF_SNAPSHOT) { dip2->di_flags = ufs_rw32(ufs_rw32(dip2->di_flags, ns) | SF_SNAPINVAL, ns); } if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) dip2->di_mode = 0; dip2->di_size = 0; dip2->di_blocks = 0; memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t)); } bdwrite(bp); UFS_WAPBL_END(mp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if (fs->fs_magic == FS_UFS1_MAGIC) bap = &cancelip->i_ffs1_db[0]; else bap = &cancelip->i_ffs2_db[0]; error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype); if (error) return (error); if (fs->fs_magic == FS_UFS1_MAGIC) bap = &cancelip->i_ffs1_ib[0]; else bap = &cancelip->i_ffs2_ib[0]; error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype); if (error) return (error); blksperindir = 1; lbn = -UFS_NDADDR; len = numblks - UFS_NDADDR; rlbn = UFS_NDADDR; for (i = 0; len > 0 && i < UFS_NIADDR; i++) { error = indiracct(snapvp, ITOV(cancelip), i, ib_get(cancelip, i), lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= FFS_NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level, daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks, daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype) { int error, num, i; daddr_t subblksperindir; struct indir indirs[UFS_NIADDR + 2]; daddr_t last; void *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize, false, &bp); if (error) return error; if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) { brelse(bp, 0); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > FFS_NINDIR(fs)) last = FFS_NINDIR(fs); bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO); memcpy((void *)bap, bp->b_data, fs->fs_bsize); brelse(bp, 0); error = (*acctfunc)(snapvp, bap, 0, last, fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. 
*/ subblksperindir = blksperindir / FFS_NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct(snapvp, cancelvp, level, idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: free(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, struct fs *fs, daddr_t lblkno, int exptype /* BLK_SNAP or BLK_NOCOPY */) { int error; if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, struct fs *fs, daddr_t lblkno, int expungetype /* BLK_SNAP or BLK_NOCOPY */) { struct inode *ip = VTOI(vp); struct lwp *l = curlwp; struct mount *mp = vp->v_mount; daddr_t blkno; daddr_t lbn; struct buf *ibp; int error, n; const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; error = UFS_WAPBL_BEGIN(mp); if (error) return error; for ( n = 0; oldblkp < lastblkp; oldblkp++) { blkno = idb_get(ip, bap, oldblkp); if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = ffs_fragstoblks(fs, blkno); if (lbn < UFS_NDADDR) { blkno = db_get(ip, lbn); ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); if (error) break; blkno = idb_get(ip, ibp->b_data, (lbn - UFS_NDADDR) % FFS_NINDIR(fs)); } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { if (lbn >= UFS_NDADDR) brelse(ibp, 0); } else { if (blkno != 0) panic("snapacct: bad block"); if (lbn < UFS_NDADDR) db_assign(ip, lbn, expungetype); else { idb_assign(ip, ibp->b_data, (lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype); bdwrite(ibp); } } if (wbreak > 0 && (++n % wbreak) == 0) { UFS_WAPBL_END(mp); error = UFS_WAPBL_BEGIN(mp); if (error) return error; } } UFS_WAPBL_END(mp); return error; } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, struct fs *fs, daddr_t lblkno, int expungetype) { daddr_t blkno; struct inode *ip; struct mount *mp = vp->v_mount; ino_t inum; int acctit, error, n; const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; error = UFS_WAPBL_BEGIN(mp); if (error) return error; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = idb_get(ip, bap, oldblkp); if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = ffs_blkstofrags(fs, lblkno); ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum); if (wbreak > 0 && (++n % wbreak) == 0) { UFS_WAPBL_END(mp); error = UFS_WAPBL_BEGIN(mp); if (error) return error; } } UFS_WAPBL_END(mp); return (0); } /* * Number of blocks that fit into the journal or zero if not logging. 
*/ static int blocks_in_journal(struct fs *fs) { off_t bpj; if ((fs->fs_flags & FS_DOWAPBL) == 0) return 0; bpj = 1; if (fs->fs_journal_version == UFS_WAPBL_VERSION) { switch (fs->fs_journal_location) { case UFS_WAPBL_JOURNALLOC_END_PARTITION: bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]* fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; break; case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]* fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; break; } } bpj /= fs->fs_bsize; return (bpj > 0 ? bpj : 1); } #endif /* defined(FFS_NO_SNAPSHOT) */ /* * Decrement extra reference on snapshot when last name is removed. * It will not be freed until the last open reference goes away. */ void ffs_snapgone(struct vnode *vp) { struct inode *xp, *ip = VTOI(vp); struct mount *mp = spec_node_getmountedfs(ip->i_devvp); struct fs *fs; struct snap_info *si; int snaploc; si = VFSTOUFS(mp)->um_snapinfo; /* * Find snapshot in incore list. */ mutex_enter(&si->si_lock); TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) if (xp == ip) break; mutex_exit(&si->si_lock); if (xp != NULL) vrele(ITOV(ip)); #ifdef DEBUG else if (snapdebug) printf("ffs_snapgone: lost snapshot vnode %llu\n", (unsigned long long)ip->i_number); #endif /* * Delete snapshot inode from superblock. Keep list dense. */ mutex_enter(&si->si_lock); fs = ip->i_fs; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } si->si_gen++; mutex_exit(&si->si_lock); } /* * Prepare a snapshot file for being removed. */ void ffs_snapremove(struct vnode *vp) { struct inode *ip = VTOI(vp), *xp; struct vnode *devvp = ip->i_devvp; struct fs *fs = ip->i_fs; struct mount *mp = spec_node_getmountedfs(devvp); struct buf *ibp; struct snap_info *si; struct lwp *l = curlwp; daddr_t numblks, blkno, dblk; int error, loc, last; si = VFSTOUFS(mp)->um_snapinfo; /* * If active, delete from incore list (this snapshot may * already have been in the process of being deleted, so * would not have been active). * * Clear copy-on-write flag if last snapshot. */ mutex_enter(&si->si_snaplock); mutex_enter(&si->si_lock); if (is_active_snapshot(si, ip)) { TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap); if (TAILQ_FIRST(&si->si_snapshots) != 0) { /* Roll back the list of preallocated blocks. */ xp = TAILQ_LAST(&si->si_snapshots, inodelst); si->si_snapblklist = xp->i_snapblklist; si->si_gen++; mutex_exit(&si->si_lock); mutex_exit(&si->si_snaplock); } else { si->si_snapblklist = 0; si->si_gen++; mutex_exit(&si->si_lock); mutex_exit(&si->si_snaplock); fscow_disestablish(mp, ffs_copyonwrite, devvp); } if (ip->i_snapblklist != NULL) { free(ip->i_snapblklist, M_UFSMNT); ip->i_snapblklist = NULL; } } else { mutex_exit(&si->si_lock); mutex_exit(&si->si_snaplock); } /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). 
*/ for (blkno = 1; blkno < UFS_NDADDR; blkno++) { dblk = db_get(ip, blkno); if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) db_assign(ip, blkno, 0); else if ((dblk == ffs_blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, ip->i_number))) { DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); db_assign(ip, blkno, 0); } } numblks = howmany(ip->i_size, fs->fs_bsize); for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) { error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno), fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); if (error) continue; if (fs->fs_size - blkno > FFS_NINDIR(fs)) last = FFS_NINDIR(fs); else last = fs->fs_size - blkno; for (loc = 0; loc < last; loc++) { dblk = idb_get(ip, ibp->b_data, loc); if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) idb_assign(ip, ibp->b_data, loc, 0); else if (dblk == ffs_blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, ip->i_number)) { DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); idb_assign(ip, ibp->b_data, loc, 0); } } bawrite(ibp); UFS_WAPBL_END(mp); error = UFS_WAPBL_BEGIN(mp); KASSERT(error == 0); } /* * Clear snapshot flag and drop reference. */ ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL); DIP_ASSIGN(ip, flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; #if defined(QUOTA) || defined(QUOTA2) chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE); chkiq(ip, 1, l->l_cred, FORCE); #endif } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assurred that they will all see the same unmodified * image. When deleting a snapshot file (see ffs_snapremove above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because they * must always have been allocated from a BLK_NOCOPY location. */ int ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size, ino_t inum) { struct mount *mp = spec_node_getmountedfs(devvp); struct buf *ibp; struct inode *ip; struct vnode *vp = NULL; struct snap_info *si; void *saved_data = NULL; daddr_t lbn; daddr_t blkno; uint32_t gen; int indiroff = 0, error = 0, claimedblk = 0; si = VFSTOUFS(mp)->um_snapinfo; lbn = ffs_fragstoblks(fs, bno); mutex_enter(&si->si_snaplock); mutex_enter(&si->si_lock); si->si_owner = curlwp; retry: gen = si->si_gen; TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { vp = ITOV(ip); /* * Lookup block being written. 
*/ if (lbn < UFS_NDADDR) { blkno = db_get(ip, lbn); } else { mutex_exit(&si->si_lock); error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize, FSCRED, B_METAONLY, &ibp); if (error) { mutex_enter(&si->si_lock); break; } indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs); blkno = idb_get(ip, ibp->b_data, indiroff); mutex_enter(&si->si_lock); if (gen != si->si_gen) { brelse(ibp, 0); goto retry; } } /* * Check to see if block needs to be copied. */ if (blkno == 0) { /* * A block that we map is being freed. If it has not * been claimed yet, we will claim or copy it (below). */ claimedblk = 1; } else if (blkno == BLK_SNAP) { /* * No previous snapshot claimed the block, * so it will be freed and become a BLK_NOCOPY * (don't care) for us. */ if (claimedblk) panic("snapblkfree: inconsistent block type"); if (lbn < UFS_NDADDR) { db_assign(ip, lbn, BLK_NOCOPY); ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); mutex_exit(&si->si_lock); if (ip->i_nlink > 0) bwrite(ibp); else bdwrite(ibp); mutex_enter(&si->si_lock); if (gen != si->si_gen) goto retry; } continue; } else /* BLK_NOCOPY or default */ { /* * If the snapshot has already copied the block * (default), or does not care about the block, * it is not needed. */ if (lbn >= UFS_NDADDR) brelse(ibp, 0); continue; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. */ if (size == fs->fs_bsize) { #ifdef DEBUG if (snapdebug) printf("%s %llu lbn %" PRId64 "from inum %llu\n", "Grabonremove: snapino", (unsigned long long)ip->i_number, lbn, (unsigned long long)inum); #endif mutex_exit(&si->si_lock); if (lbn < UFS_NDADDR) { db_assign(ip, lbn, bno); } else { idb_assign(ip, ibp->b_data, indiroff, bno); if (ip->i_nlink > 0) bwrite(ibp); else bdwrite(ibp); } DIP_ADD(ip, blocks, btodb(size)); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (ip->i_nlink > 0 && mp->mnt_wapbl) error = syncsnap(vp); else error = 0; mutex_enter(&si->si_lock); si->si_owner = NULL; mutex_exit(&si->si_lock); mutex_exit(&si->si_snaplock); return (error == 0); } if (lbn >= UFS_NDADDR) brelse(ibp, 0); #ifdef DEBUG if (snapdebug) printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n", "Copyonremove: snapino ", (unsigned long long)ip->i_number, lbn, "for inum", (unsigned long long)inum, size); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. */ mutex_exit(&si->si_lock); if (saved_data == NULL) { saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); error = rwfsblk(vp, B_READ, saved_data, lbn); if (error) { free(saved_data, M_UFSMNT); saved_data = NULL; mutex_enter(&si->si_lock); break; } } error = wrsnapblk(vp, saved_data, lbn); if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl) error = syncsnap(vp); mutex_enter(&si->si_lock); if (error) break; if (gen != si->si_gen) goto retry; } si->si_owner = NULL; mutex_exit(&si->si_lock); mutex_exit(&si->si_snaplock); if (saved_data) free(saved_data, M_UFSMNT); /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ return (error); } /* * Associate snapshot files when mounting. 
*/ void ffs_snapshot_mount(struct mount *mp) { struct vnode *devvp = VFSTOUFS(mp)->um_devvp; struct fs *fs = VFSTOUFS(mp)->um_fs; struct lwp *l = curlwp; struct vnode *vp; struct inode *ip, *xp; struct snap_info *si; daddr_t snaplistsize, *snapblklist; int i, error, ns __unused, snaploc, loc; /* * No persistent snapshots on apple ufs file systems. */ if (UFS_MPISAPPLEUFS(VFSTOUFS(mp))) return; si = VFSTOUFS(mp)->um_snapinfo; ns = UFS_FSNEEDSWAP(fs); /* * XXX The following needs to be set before ffs_truncate or * VOP_READ can be called. */ mp->mnt_stat.f_iosize = fs->fs_bsize; /* * Process each snapshot listed in the superblock. */ vp = NULL; mutex_enter(&si->si_lock); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], LK_EXCLUSIVE, &vp)) != 0) { printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) != SF_SNAPSHOT) { printf("ffs_snapshot_mount: non-snapshot inode %d\n", fs->fs_snapinum[snaploc]); vput(vp); vp = NULL; for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } /* * Read the block hints list. Use an empty list on * read errors. */ error = vn_rdwr(UIO_READ, vp, (void *)&snaplistsize, sizeof(snaplistsize), ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, l->l_cred, NULL, NULL); if (error) { printf("ffs_snapshot_mount: read_1 failed %d\n", error); snaplistsize = 1; } else snaplistsize = ufs_rw64(snaplistsize, ns); snapblklist = malloc( snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); if (error) snapblklist[0] = 1; else { error = vn_rdwr(UIO_READ, vp, (void *)snapblklist, snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, l->l_cred, NULL, NULL); for (i = 0; i < snaplistsize; i++) snapblklist[i] = ufs_rw64(snapblklist[i], ns); if (error) { printf("ffs_snapshot_mount: read_2 failed %d\n", error); snapblklist[0] = 1; } } ip->i_snapblklist = &snapblklist[0]; /* * Link it onto the active snapshot list. */ if (is_active_snapshot(si, ip)) panic("ffs_snapshot_mount: %"PRIu64" already on list", ip->i_number); else TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); vp->v_vflag |= VV_SYSTEM; VOP_UNLOCK(vp); } /* * No usable snapshots found. */ if (vp == NULL) { mutex_exit(&si->si_lock); return; } /* * Attach the block hints list. We always want to * use the list from the newest snapshot. */ xp = TAILQ_LAST(&si->si_snapshots, inodelst); si->si_snapblklist = xp->i_snapblklist; fscow_establish(mp, ffs_copyonwrite, devvp); si->si_gen++; mutex_exit(&si->si_lock); } /* * Disassociate snapshot files when unmounting. 
*/ void ffs_snapshot_unmount(struct mount *mp) { struct vnode *devvp = VFSTOUFS(mp)->um_devvp; struct inode *xp; struct vnode *vp = NULL; struct snap_info *si; si = VFSTOUFS(mp)->um_snapinfo; mutex_enter(&si->si_lock); while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) { vp = ITOV(xp); TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap); if (xp->i_snapblklist == si->si_snapblklist) si->si_snapblklist = NULL; free(xp->i_snapblklist, M_UFSMNT); if (xp->i_nlink > 0) { si->si_gen++; mutex_exit(&si->si_lock); vrele(vp); mutex_enter(&si->si_lock); } } si->si_gen++; mutex_exit(&si->si_lock); if (vp) fscow_disestablish(mp, ffs_copyonwrite, devvp); } /* * Check for need to copy block that is about to be written, * copying the block if necessary. */ static int ffs_copyonwrite(void *v, struct buf *bp, bool data_valid) { struct fs *fs; struct inode *ip; struct vnode *devvp = v, *vp = NULL; struct mount *mp = spec_node_getmountedfs(devvp); struct snap_info *si; void *saved_data = NULL; daddr_t lbn, blkno, *snapblklist; uint32_t gen; int lower, upper, mid, snapshot_locked = 0, error = 0; /* * Check for valid snapshots. */ si = VFSTOUFS(mp)->um_snapinfo; mutex_enter(&si->si_lock); ip = TAILQ_FIRST(&si->si_snapshots); if (ip == NULL) { mutex_exit(&si->si_lock); return 0; } /* * First check to see if it is after the file system, * in the journal or in the preallocated list. * By doing these checks we avoid several potential deadlocks. */ fs = ip->i_fs; lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)); if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) { mutex_exit(&si->si_lock); return 0; } if ((fs->fs_flags & FS_DOWAPBL) && fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { off_t blk_off, log_start, log_end; log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] * fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] * fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; blk_off = dbtob(bp->b_blkno); if (blk_off >= log_start && blk_off < log_end) { mutex_exit(&si->si_lock); return 0; } } snapblklist = si->si_snapblklist; upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0); lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) { mutex_exit(&si->si_lock); return 0; } /* * Not in the precomputed list, so check the snapshots. */ if (si->si_owner != curlwp) { if (!mutex_tryenter(&si->si_snaplock)) { mutex_exit(&si->si_lock); mutex_enter(&si->si_snaplock); mutex_enter(&si->si_lock); } si->si_owner = curlwp; snapshot_locked = 1; } if (data_valid && bp->b_bcount == fs->fs_bsize) saved_data = bp->b_data; retry: gen = si->si_gen; TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { vp = ITOV(ip); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in ffs_balloc. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. 
*/ if (lbn < UFS_NDADDR) { blkno = db_get(ip, lbn); } else { mutex_exit(&si->si_lock); blkno = 0; /* XXX: GCC */ if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) { mutex_enter(&si->si_lock); break; } mutex_enter(&si->si_lock); if (gen != si->si_gen) goto retry; } KASSERTMSG((blkno != BLK_SNAP || bp->b_lblkno < 0), "ffs_copyonwrite: bad copy block: blkno %jd, lblkno %jd", (intmax_t)blkno, (intmax_t)bp->b_lblkno); if (blkno != 0) continue; if (curlwp == uvm.pagedaemon_lwp) { error = ENOMEM; break; } /* Only one level of recursion allowed. */ KASSERT(snapshot_locked); /* * Allocate the block into which to do the copy. Since * multiple processes may all try to copy the same block, * we have to recheck our need to do a copy if we sleep * waiting for the lock. * * Because all snapshots on a filesystem share a single * lock, we ensure that we will never be in competition * with another process to allocate a block. */ #ifdef DEBUG if (snapdebug) { printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ", (unsigned long long)ip->i_number, lbn); if (bp->b_vp == devvp) printf("fs metadata"); else printf("inum %llu", (unsigned long long) VTOI(bp->b_vp)->i_number); printf(" lblkno %" PRId64 "\n", bp->b_lblkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. */ mutex_exit(&si->si_lock); if (saved_data == NULL) { saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); error = rwfsblk(vp, B_READ, saved_data, lbn); if (error) { free(saved_data, M_UFSMNT); saved_data = NULL; mutex_enter(&si->si_lock); break; } } error = wrsnapblk(vp, saved_data, lbn); if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl) error = syncsnap(vp); mutex_enter(&si->si_lock); if (error) break; if (gen != si->si_gen) goto retry; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. */ if (snapshot_locked) { si->si_owner = NULL; mutex_exit(&si->si_lock); mutex_exit(&si->si_snaplock); } else mutex_exit(&si->si_lock); if (saved_data && saved_data != bp->b_data) free(saved_data, M_UFSMNT); return error; } /* * Read from a snapshot. */ int ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag) { struct inode *ip = VTOI(vp); struct fs *fs = ip->i_fs; struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo; struct buf *bp; daddr_t lbn, nextlbn; off_t fsbytes, bytesinfile; long size, xfersize, blkoffset; int error; mutex_enter(&si->si_snaplock); if (ioflag & IO_ALTSEMANTICS) fsbytes = ip->i_size; else fsbytes = ffs_lfragtosize(fs, fs->fs_size); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { bytesinfile = fsbytes - uio->uio_offset; if (bytesinfile <= 0) break; lbn = ffs_lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = fs->fs_bsize; blkoffset = ffs_blkoff(fs, uio->uio_offset); xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), bytesinfile); if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) { if (ffs_lblktosize(fs, lbn) + size > fsbytes) size = ffs_fragroundup(fs, fsbytes - ffs_lblktosize(fs, lbn)); error = bread(vp, lbn, size, 0, &bp); } else { int nextsize = fs->fs_bsize; error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, 0, &bp); } if (error) break; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. 
* However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < blkoffset + xfersize) { xfersize = size - blkoffset; if (xfersize <= 0) break; } error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); if (error) break; brelse(bp, BC_AGE); } if (bp != NULL) brelse(bp, BC_AGE); mutex_exit(&si->si_snaplock); return error; } /* * Lookup a snapshots data block address. * Simpler than UFS_BALLOC() as we know all metadata is already allocated * and safe even for the pagedaemon where we cannot bread(). */ static int snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res) { struct indir indirs[UFS_NIADDR + 2]; struct inode *ip = VTOI(vp); struct fs *fs = ip->i_fs; struct buf *bp; int error, num; KASSERT(lbn >= 0); if (lbn < UFS_NDADDR) { *res = db_get(ip, lbn); return 0; } if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) return error; if (curlwp == uvm.pagedaemon_lwp) { mutex_enter(&bufcache_lock); bp = incore(vp, indirs[num-1].in_lbn); if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) { *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); error = 0; } else error = ENOMEM; mutex_exit(&bufcache_lock); return error; } error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, 0, &bp); if (error == 0) { *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); brelse(bp, 0); } return error; } /* * Read or write the specified block of the filesystem vp resides on * from or to the disk bypassing the buffer cache. */ static int rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn) { int error; struct inode *ip = VTOI(vp); struct fs *fs = ip->i_fs; struct buf *nbp; nbp = getiobuf(NULL, true); nbp->b_flags = flags; nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; nbp->b_error = 0; nbp->b_data = data; nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn)); nbp->b_proc = NULL; nbp->b_dev = ip->i_devvp->v_rdev; SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */ bdev_strategy(nbp); error = biowait(nbp); putiobuf(nbp); return error; } /* * Write all dirty buffers to disk and invalidate them. */ static int syncsnap(struct vnode *vp) { int error; buf_t *bp; struct fs *fs = VTOI(vp)->i_fs; mutex_enter(&bufcache_lock); while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) { error = bbusy(bp, false, 0, NULL); if (error == EPASSTHROUGH) continue; else if (error != 0) { mutex_exit(&bufcache_lock); return error; } KASSERT(bp->b_bcount == fs->fs_bsize); mutex_exit(&bufcache_lock); error = rwfsblk(vp, B_WRITE, bp->b_data, ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno))); brelse(bp, BC_INVAL | BC_VFLUSH); if (error) return error; mutex_enter(&bufcache_lock); } mutex_exit(&bufcache_lock); return 0; } /* * Write the specified block to a snapshot. */ static int wrsnapblk(struct vnode *vp, void *data, daddr_t lbn) { struct inode *ip = VTOI(vp); struct fs *fs = ip->i_fs; struct buf *bp; int error; error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize, FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp); if (error) return error; memcpy(bp->b_data, data, fs->fs_bsize); if (ip->i_nlink > 0) error = bwrite(bp); else bawrite(bp); return error; } /* * Check if this inode is present on the active snapshot list. * Must be called with snapinfo locked. 
*/ static inline bool is_active_snapshot(struct snap_info *si, struct inode *ip) { struct inode *xp; KASSERT(mutex_owned(&si->si_lock)); TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) if (xp == ip) return true; return false; } /* * Get/Put direct block from inode or buffer containing disk addresses. Take * care for fs type (UFS1/UFS2) and byte swapping. These functions should go * into a global include. */ static inline daddr_t db_get(struct inode *ip, int loc) { if (ip->i_ump->um_fstype == UFS1) return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); else return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); } static inline void db_assign(struct inode *ip, int loc, daddr_t val) { if (ip->i_ump->um_fstype == UFS1) ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); else ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); } __unused static inline daddr_t ib_get(struct inode *ip, int loc) { if (ip->i_ump->um_fstype == UFS1) return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip)); else return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip)); } static inline daddr_t idb_get(struct inode *ip, void *bf, int loc) { if (ip->i_ump->um_fstype == UFS1) return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); else return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); } static inline void idb_assign(struct inode *ip, void *bf, int loc, daddr_t val) { if (ip->i_ump->um_fstype == UFS1) ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); else ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); }
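/*
 * Illustrative sketch (not part of the original NetBSD source): a
 * standalone, user-space analogue of the db_get()/idb_get() helpers
 * above.  It shows the same idea -- one accessor hides both the
 * on-disk width (UFS1 keeps 32-bit block addresses, UFS2 keeps 64-bit
 * ones) and optional byte swapping -- using hypothetical demo_* names
 * and open-coded swaps in place of ufs_rw32()/ufs_rw64() and
 * UFS_IPNEEDSWAP().
 */
#include <stdint.h>
#include <stdbool.h>

enum demo_fstype { DEMO_UFS1, DEMO_UFS2 };

static inline uint32_t
demo_bswap32(uint32_t v)
{
	return ((v & 0xff000000u) >> 24) | ((v & 0x00ff0000u) >> 8) |
	       ((v & 0x0000ff00u) << 8)  | ((v & 0x000000ffu) << 24);
}

static inline uint64_t
demo_bswap64(uint64_t v)
{
	return ((uint64_t)demo_bswap32((uint32_t)v) << 32) |
	    demo_bswap32((uint32_t)(v >> 32));
}

/*
 * Fetch entry 'loc' from a buffer of on-disk block addresses, as
 * idb_get() does for an indirect block: the caller always sees a
 * host-order 64-bit value regardless of the on-disk format.
 */
static int64_t
demo_idb_get(enum demo_fstype type, bool needswap, const void *bf, int loc)
{
	if (type == DEMO_UFS1) {
		uint32_t v = ((const uint32_t *)bf)[loc];
		return (int64_t)(needswap ? demo_bswap32(v) : v);
	} else {
		uint64_t v = ((const uint64_t *)bf)[loc];
		return (int64_t)(needswap ? demo_bswap64(v) : v);
	}
}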
/* $NetBSD: sys_mqueue.c,v 1.48 2020/05/23 23:42:43 ad Exp $ */ /* * Copyright (c) 2007-2011 Mindaugas Rasiukevicius <rmind at NetBSD org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Implementation of POSIX message queues. * Defined in the Base Definitions volume of IEEE Std 1003.1-2001. * * Locking * * Global list of message queues (mqueue_head) is protected by mqlist_lock. * Each message queue and its members are protected by mqueue::mq_mtx. * Note that proc_t::p_mqueue_cnt is updated atomically.
* * Lock order: * * mqlist_lock -> * mqueue::mq_mtx */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_mqueue.c,v 1.48 2020/05/23 23:42:43 ad Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/atomic.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/kauth.h> #include <sys/lwp.h> #include <sys/mqueue.h> #include <sys/module.h> #include <sys/poll.h> #include <sys/select.h> #include <sys/signal.h> #include <sys/signalvar.h> #include <sys/stat.h> #include <sys/sysctl.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <miscfs/genfs/genfs.h> MODULE(MODULE_CLASS_MISC, mqueue, NULL); /* System-wide limits. */ static u_int mq_open_max = MQ_OPEN_MAX; static u_int mq_prio_max = MQ_PRIO_MAX; static u_int mq_max_msgsize = 16 * MQ_DEF_MSGSIZE; static u_int mq_def_maxmsg = 32; static u_int mq_max_maxmsg = 16 * 32; static pool_cache_t mqmsg_cache __read_mostly; static kmutex_t mqlist_lock __cacheline_aligned; static LIST_HEAD(, mqueue) mqueue_head __cacheline_aligned; static kauth_listener_t mq_listener; static int mqueue_sysinit(void); static int mqueue_sysfini(bool); static int mq_poll_fop(file_t *, int); static int mq_stat_fop(file_t *, struct stat *); static int mq_close_fop(file_t *); static const struct fileops mqops = { .fo_name = "mq", .fo_read = fbadop_read, .fo_write = fbadop_write, .fo_ioctl = fbadop_ioctl, .fo_fcntl = fnullop_fcntl, .fo_poll = mq_poll_fop, .fo_stat = mq_stat_fop, .fo_close = mq_close_fop, .fo_kqfilter = fnullop_kqfilter, .fo_restart = fnullop_restart, }; static const struct syscall_package mqueue_syscalls[] = { { SYS_mq_open, 0, (sy_call_t *)sys_mq_open }, { SYS_mq_close, 0, (sy_call_t *)sys_mq_close }, { SYS_mq_unlink, 0, (sy_call_t *)sys_mq_unlink }, { SYS_mq_getattr, 0, (sy_call_t *)sys_mq_getattr }, { SYS_mq_setattr, 0, (sy_call_t *)sys_mq_setattr }, { SYS_mq_notify, 0, (sy_call_t *)sys_mq_notify }, { SYS_mq_send, 0, (sy_call_t *)sys_mq_send }, { SYS_mq_receive, 0, (sy_call_t *)sys_mq_receive }, { SYS___mq_timedsend50, 0, (sy_call_t *)sys___mq_timedsend50 }, { SYS___mq_timedreceive50, 0, (sy_call_t *)sys___mq_timedreceive50 }, { 0, 0, NULL } }; static int mq_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { mqueue_t *mq; int result; if (action != KAUTH_SYSTEM_MQUEUE) return KAUTH_RESULT_DEFER; result = KAUTH_RESULT_DEFER; mq = arg1; if (kauth_cred_geteuid(cred) == mq->mq_euid) result = KAUTH_RESULT_ALLOW; return result; } /* * Initialisation and unloading of POSIX message queue subsystem. */ static int mqueue_sysinit(void) { int error; mqmsg_cache = pool_cache_init(MQ_DEF_MSGSIZE, coherency_unit, 0, 0, "mqmsgpl", NULL, IPL_NONE, NULL, NULL, NULL); mutex_init(&mqlist_lock, MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&mqueue_head); error = syscall_establish(NULL, mqueue_syscalls); mq_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, mq_listener_cb, NULL); return error; } static int mqueue_sysfini(bool interface) { if (interface) { int error; bool inuse; /* Stop syscall activity. */ error = syscall_disestablish(NULL, mqueue_syscalls); if (error) return error; /* Check if there are any message queues in use. */ mutex_enter(&mqlist_lock); inuse = !LIST_EMPTY(&mqueue_head); mutex_exit(&mqlist_lock); if (inuse) { error = syscall_establish(NULL, mqueue_syscalls); KASSERT(error == 0); return EBUSY; } } kauth_unlisten_scope(mq_listener); mutex_destroy(&mqlist_lock); pool_cache_destroy(mqmsg_cache); return 0; } /* * Module interface. 
*/ static int mqueue_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return mqueue_sysinit(); case MODULE_CMD_FINI: return mqueue_sysfini(true); default: return ENOTTY; } } /* * Free the message. */ static void mqueue_freemsg(struct mq_msg *msg, const size_t size) { if (size > MQ_DEF_MSGSIZE) { kmem_free(msg, size); } else { pool_cache_put(mqmsg_cache, msg); } } /* * Destroy the message queue. */ static void mqueue_destroy(struct mqueue *mq) { struct mq_msg *msg; size_t msz; u_int i; /* Note MQ_PQSIZE + 1. */ for (i = 0; i <= MQ_PQSIZE; i++) { while ((msg = TAILQ_FIRST(&mq->mq_head[i])) != NULL) { TAILQ_REMOVE(&mq->mq_head[i], msg, msg_queue); msz = sizeof(struct mq_msg) + msg->msg_len; mqueue_freemsg(msg, msz); } } if (mq->mq_name) { kmem_free(mq->mq_name, MQ_NAMELEN); } seldestroy(&mq->mq_rsel); seldestroy(&mq->mq_wsel); cv_destroy(&mq->mq_send_cv); cv_destroy(&mq->mq_recv_cv); mutex_destroy(&mq->mq_mtx); kmem_free(mq, sizeof(struct mqueue)); } /* * mqueue_lookup: lookup for file name in general list of message queues. * * => locks the message queue on success */ static mqueue_t * mqueue_lookup(const char *name) { mqueue_t *mq; KASSERT(mutex_owned(&mqlist_lock)); LIST_FOREACH(mq, &mqueue_head, mq_list) { if (strncmp(mq->mq_name, name, MQ_NAMELEN) == 0) { mutex_enter(&mq->mq_mtx); return mq; } } return NULL; } /* * mqueue_get: get the mqueue from the descriptor. * * => locks the message queue, if found. * => holds a reference on the file descriptor. */ int mqueue_get(mqd_t mqd, int fflag, mqueue_t **mqret) { const int fd = (int)mqd; mqueue_t *mq; file_t *fp; fp = fd_getfile(fd); if (__predict_false(fp == NULL)) { return EBADF; } if (__predict_false(fp->f_type != DTYPE_MQUEUE)) { fd_putfile(fd); return EBADF; } if (fflag && (fp->f_flag & fflag) == 0) { fd_putfile(fd); return EBADF; } mq = fp->f_mqueue; mutex_enter(&mq->mq_mtx); *mqret = mq; return 0; } /* * mqueue_linear_insert: perform linear insert according to the message * priority into the reserved queue (MQ_PQRESQ). Reserved queue is a * sorted list used only when mq_prio_max is increased via sysctl. */ static inline void mqueue_linear_insert(struct mqueue *mq, struct mq_msg *msg) { struct mq_msg *mit; TAILQ_FOREACH(mit, &mq->mq_head[MQ_PQRESQ], msg_queue) { if (msg->msg_prio > mit->msg_prio) break; } if (mit == NULL) { TAILQ_INSERT_TAIL(&mq->mq_head[MQ_PQRESQ], msg, msg_queue); } else { TAILQ_INSERT_BEFORE(mit, msg, msg_queue); } } static int mq_stat_fop(file_t *fp, struct stat *st) { struct mqueue *mq = fp->f_mqueue; memset(st, 0, sizeof(*st)); mutex_enter(&mq->mq_mtx); st->st_mode = mq->mq_mode; st->st_uid = mq->mq_euid; st->st_gid = mq->mq_egid; st->st_atimespec = mq->mq_atime; st->st_mtimespec = mq->mq_mtime; st->st_ctimespec = st->st_birthtimespec = mq->mq_btime; st->st_uid = kauth_cred_geteuid(fp->f_cred); st->st_gid = kauth_cred_getegid(fp->f_cred); mutex_exit(&mq->mq_mtx); return 0; } static int mq_poll_fop(file_t *fp, int events) { struct mqueue *mq = fp->f_mqueue; struct mq_attr *mqattr; int revents = 0; mutex_enter(&mq->mq_mtx); mqattr = &mq->mq_attrib; if (events & (POLLIN | POLLRDNORM)) { /* Ready for receiving, if there are messages in the queue. */ if (mqattr->mq_curmsgs) revents |= events & (POLLIN | POLLRDNORM); else selrecord(curlwp, &mq->mq_rsel); } if (events & (POLLOUT | POLLWRNORM)) { /* Ready for sending, if the message queue is not full. 
*/ if (mqattr->mq_curmsgs < mqattr->mq_maxmsg) revents |= events & (POLLOUT | POLLWRNORM); else selrecord(curlwp, &mq->mq_wsel); } mutex_exit(&mq->mq_mtx); return revents; } static int mq_close_fop(file_t *fp) { proc_t *p = curproc; mqueue_t *mq = fp->f_mqueue; bool destroy = false; mutex_enter(&mq->mq_mtx); KASSERT(mq->mq_refcnt > 0); if (--mq->mq_refcnt == 0) { /* Destroy if the last reference and unlinked. */ destroy = (mq->mq_attrib.mq_flags & MQ_UNLINKED) != 0; } mutex_exit(&mq->mq_mtx); if (destroy) { mqueue_destroy(mq); } atomic_dec_uint(&p->p_mqueue_cnt); return 0; } static int mqueue_access(mqueue_t *mq, int access, kauth_cred_t cred) { accmode_t accmode = 0; /* Note the difference between VREAD/VWRITE and FREAD/FWRITE. */ if (access & FREAD) { accmode |= VREAD; } if (access & FWRITE) { accmode |= VWRITE; } if (genfs_can_access(NULL, cred, mq->mq_euid, mq->mq_egid, mq->mq_mode, NULL, accmode)) { return EACCES; } return 0; } static int mqueue_create(lwp_t *l, char *name, struct mq_attr *attr, mode_t mode, int oflag, mqueue_t **mqret) { proc_t *p = l->l_proc; struct cwdinfo *cwdi = p->p_cwdi; mqueue_t *mq; u_int i; /* Empty name is invalid. */ if (name[0] == '\0') { return EINVAL; } /* Check for mqueue attributes. */ if (attr) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > mq_max_maxmsg || attr->mq_msgsize <= 0 || attr->mq_msgsize > mq_max_msgsize) { return EINVAL; } attr->mq_curmsgs = 0; } /* * Allocate new message queue, initialize data structures, copy the * name attributes. Note that the initial reference is set here. */ mq = kmem_zalloc(sizeof(mqueue_t), KM_SLEEP); mutex_init(&mq->mq_mtx, MUTEX_DEFAULT, IPL_NONE); cv_init(&mq->mq_send_cv, "mqsendcv"); cv_init(&mq->mq_recv_cv, "mqrecvcv"); for (i = 0; i < (MQ_PQSIZE + 1); i++) { TAILQ_INIT(&mq->mq_head[i]); } selinit(&mq->mq_rsel); selinit(&mq->mq_wsel); mq->mq_name = name; mq->mq_refcnt = 1; if (attr != NULL) { memcpy(&mq->mq_attrib, attr, sizeof(struct mq_attr)); } else { memset(&mq->mq_attrib, 0, sizeof(struct mq_attr)); mq->mq_attrib.mq_maxmsg = mq_def_maxmsg; mq->mq_attrib.mq_msgsize = MQ_DEF_MSGSIZE - sizeof(struct mq_msg); } CTASSERT((O_MASK & (MQ_UNLINKED | MQ_RECEIVE)) == 0); mq->mq_attrib.mq_flags = (O_MASK & oflag); /* Store mode and effective UID with GID. */ mq->mq_mode = ((mode & ~cwdi->cwdi_cmask) & ALLPERMS) & ~S_ISTXT; mq->mq_euid = kauth_cred_geteuid(l->l_cred); mq->mq_egid = kauth_cred_getegid(l->l_cred); *mqret = mq; return 0; } /* * Helper function for mq_open() - note that "u_name" is a userland pointer, * while "attr" is a kernel pointer! */ int mq_handle_open(struct lwp *l, const char *u_name, int oflag, mode_t mode, struct mq_attr *attr, register_t *retval) { struct proc *p = l->l_proc; struct mqueue *mq, *mq_new = NULL; int mqd, error; file_t *fp; char *name; /* Get the name from the user-space. */ name = kmem_alloc(MQ_NAMELEN, KM_SLEEP); error = copyinstr(u_name, name, MQ_NAMELEN - 1, NULL); if (error) { kmem_free(name, MQ_NAMELEN); return error; } /* Allocate file structure and descriptor. */ error = fd_allocfile(&fp, &mqd); if (error) { kmem_free(name, MQ_NAMELEN); return error; } /* Account and check for the limit. */ if (atomic_inc_uint_nv(&p->p_mqueue_cnt) > mq_open_max) { atomic_dec_uint(&p->p_mqueue_cnt); error = EMFILE; goto err; } fp->f_type = DTYPE_MQUEUE; fp->f_flag = FFLAGS(oflag) & (FREAD | FWRITE); fp->f_ops = &mqops; if (oflag & O_CREAT) { /* Create a new message queue. 
*/ error = mqueue_create(l, name, attr, mode, oflag, &mq_new); if (error) { goto err; } KASSERT(mq_new != NULL); } /* Lookup for a message queue with such name. */ mutex_enter(&mqlist_lock); mq = mqueue_lookup(name); if (mq) { KASSERT(mutex_owned(&mq->mq_mtx)); mutex_exit(&mqlist_lock); /* Check for exclusive create. */ if (oflag & O_EXCL) { mutex_exit(&mq->mq_mtx); error = EEXIST; goto err; } /* Verify permissions. */ if (mqueue_access(mq, fp->f_flag, l->l_cred) != 0) { mutex_exit(&mq->mq_mtx); error = EACCES; goto err; } /* If we have the access, add a new reference. */ mq->mq_refcnt++; mutex_exit(&mq->mq_mtx); } else { /* Fail if not found and not creating. */ if ((oflag & O_CREAT) == 0) { mutex_exit(&mqlist_lock); KASSERT(mq_new == NULL); error = ENOENT; goto err; } /* Initial timestamps. */ mq = mq_new; getnanotime(&mq->mq_btime); mq->mq_atime = mq->mq_mtime = mq->mq_btime; /* * Finally, insert message queue into the list. * Note: it already has the initial reference. */ LIST_INSERT_HEAD(&mqueue_head, mq, mq_list); mutex_exit(&mqlist_lock); mq_new = NULL; name = NULL; } KASSERT(mq != NULL); fp->f_mqueue = mq; fd_affix(p, fp, mqd); *retval = mqd; err: if (error) { fd_abort(p, fp, mqd); } if (mq_new) { /* Note: will free the 'name'. */ mqueue_destroy(mq_new); } else if (name) { kmem_free(name, MQ_NAMELEN); } return error; } /* * General mqueue system calls. */ int sys_mq_open(struct lwp *l, const struct sys_mq_open_args *uap, register_t *retval) { /* { syscallarg(const char *) name; syscallarg(int) oflag; syscallarg(mode_t) mode; syscallarg(struct mq_attr) attr; } */ struct mq_attr *attr = NULL, a; int error; if ((SCARG(uap, oflag) & O_EXEC) != 0) return EINVAL; if ((SCARG(uap, oflag) & O_CREAT) != 0 && SCARG(uap, attr) != NULL) { error = copyin(SCARG(uap, attr), &a, sizeof(a)); if (error) return error; attr = &a; } return mq_handle_open(l, SCARG(uap, name), SCARG(uap, oflag), SCARG(uap, mode), attr, retval); } int sys_mq_close(struct lwp *l, const struct sys_mq_close_args *uap, register_t *retval) { return sys_close(l, (const void *)uap, retval); } /* * Primary mq_recv1() function. */ int mq_recv1(mqd_t mqdes, void *msg_ptr, size_t msg_len, u_int *msg_prio, struct timespec *ts, ssize_t *mlen) { struct mqueue *mq; struct mq_msg *msg = NULL; struct mq_attr *mqattr; u_int idx; int error; error = mqueue_get(mqdes, FREAD, &mq); if (error) { return error; } getnanotime(&mq->mq_atime); mqattr = &mq->mq_attrib; /* Check the message size limits */ if (msg_len < mqattr->mq_msgsize) { error = EMSGSIZE; goto error; } /* Check if queue is empty */ while (mqattr->mq_curmsgs == 0) { int t; if (mqattr->mq_flags & O_NONBLOCK) { error = EAGAIN; goto error; } if (ts) { error = ts2timo(CLOCK_REALTIME, TIMER_ABSTIME, ts, &t, NULL); if (error) goto error; } else t = 0; /* * Block until someone sends the message. * While doing this, notification should not be sent. */ mqattr->mq_flags |= MQ_RECEIVE; error = cv_timedwait_sig(&mq->mq_send_cv, &mq->mq_mtx, t); mqattr->mq_flags &= ~MQ_RECEIVE; if (error || (mqattr->mq_flags & MQ_UNLINKED)) { error = (error == EWOULDBLOCK) ? ETIMEDOUT : EINTR; goto error; } } /* * Find the highest priority message, and remove it from the queue. * At first, reserved queue is checked, bitmap is next. */ msg = TAILQ_FIRST(&mq->mq_head[MQ_PQRESQ]); if (__predict_true(msg == NULL)) { idx = ffs(mq->mq_bitmap); msg = TAILQ_FIRST(&mq->mq_head[idx]); KASSERT(msg != NULL); } else { idx = MQ_PQRESQ; } TAILQ_REMOVE(&mq->mq_head[idx], msg, msg_queue); /* Unmark the bit, if last message. 
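(The bit for queue index idx is bit idx - 1: mq_send1() below sets it when inserting a message into mq_head[idx], so the ffs(mq_bitmap) lookup above finds the index of the highest-priority non-empty queue; clearing the bit here once that queue drains keeps the lookup accurate.)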
*/ if (__predict_true(idx) && TAILQ_EMPTY(&mq->mq_head[idx])) { KASSERT((MQ_PQSIZE - idx) == msg->msg_prio); mq->mq_bitmap &= ~(1U << --idx); } /* Decrement the counter and signal waiter, if any */ mqattr->mq_curmsgs--; cv_signal(&mq->mq_recv_cv); /* Ready for sending now */ selnotify(&mq->mq_wsel, POLLOUT | POLLWRNORM, 0); error: mutex_exit(&mq->mq_mtx); fd_putfile((int)mqdes); if (error) return error; /* * Copy the data to the user-space. * Note: According to POSIX, no message should be removed from the * queue in case of fail - this would be violated. */ *mlen = msg->msg_len; error = copyout(msg->msg_ptr, msg_ptr, msg->msg_len); if (error == 0 && msg_prio) error = copyout(&msg->msg_prio, msg_prio, sizeof(unsigned)); mqueue_freemsg(msg, sizeof(struct mq_msg) + msg->msg_len); return error; } int sys_mq_receive(struct lwp *l, const struct sys_mq_receive_args *uap, register_t *retval) { /* { syscallarg(mqd_t) mqdes; syscallarg(char *) msg_ptr; syscallarg(size_t) msg_len; syscallarg(unsigned *) msg_prio; } */ ssize_t mlen; int error; error = mq_recv1(SCARG(uap, mqdes), SCARG(uap, msg_ptr), SCARG(uap, msg_len), SCARG(uap, msg_prio), NULL, &mlen); if (error == 0) *retval = mlen; return error; } int sys___mq_timedreceive50(struct lwp *l, const struct sys___mq_timedreceive50_args *uap, register_t *retval) { /* { syscallarg(mqd_t) mqdes; syscallarg(char *) msg_ptr; syscallarg(size_t) msg_len; syscallarg(unsigned *) msg_prio; syscallarg(const struct timespec *) abs_timeout; } */ struct timespec ts, *tsp; ssize_t mlen; int error; /* Get and convert time value */ if (SCARG(uap, abs_timeout)) { error = copyin(SCARG(uap, abs_timeout), &ts, sizeof(ts)); if (error) return error; tsp = &ts; } else { tsp = NULL; } error = mq_recv1(SCARG(uap, mqdes), SCARG(uap, msg_ptr), SCARG(uap, msg_len), SCARG(uap, msg_prio), tsp, &mlen); if (error == 0) *retval = mlen; return error; } /* * Primary mq_send1() function. */ int mq_send1(mqd_t mqdes, const char *msg_ptr, size_t msg_len, u_int msg_prio, struct timespec *ts) { struct mqueue *mq; struct mq_msg *msg; struct mq_attr *mqattr; struct proc *notify = NULL; ksiginfo_t ksi; size_t size; int error; /* Check the priority range */ if (msg_prio >= mq_prio_max) return EINVAL; /* Allocate a new message */ if (msg_len > mq_max_msgsize) return EMSGSIZE; size = sizeof(struct mq_msg) + msg_len; if (size > mq_max_msgsize) return EMSGSIZE; if (size > MQ_DEF_MSGSIZE) { msg = kmem_alloc(size, KM_SLEEP); } else { msg = pool_cache_get(mqmsg_cache, PR_WAITOK); } /* Get the data from user-space */ error = copyin(msg_ptr, msg->msg_ptr, msg_len); if (error) { mqueue_freemsg(msg, size); return error; } msg->msg_len = msg_len; msg->msg_prio = msg_prio; error = mqueue_get(mqdes, FWRITE, &mq); if (error) { mqueue_freemsg(msg, size); return error; } getnanotime(&mq->mq_mtime); mqattr = &mq->mq_attrib; /* Check the message size limit */ if (msg_len <= 0 || msg_len > mqattr->mq_msgsize) { error = EMSGSIZE; goto error; } /* Check if queue is full */ while (mqattr->mq_curmsgs >= mqattr->mq_maxmsg) { int t; if (mqattr->mq_flags & O_NONBLOCK) { error = EAGAIN; goto error; } if (ts) { error = ts2timo(CLOCK_REALTIME, TIMER_ABSTIME, ts, &t, NULL); if (error) goto error; } else t = 0; /* Block until queue becomes available */ error = cv_timedwait_sig(&mq->mq_recv_cv, &mq->mq_mtx, t); if (error || (mqattr->mq_flags & MQ_UNLINKED)) { error = (error == EWOULDBLOCK) ? 
ETIMEDOUT : error; goto error; } } KASSERT(mqattr->mq_curmsgs < mqattr->mq_maxmsg); /* * Insert message into the queue, according to the priority. * Note the difference between index and priority. */ if (__predict_true(msg_prio < MQ_PQSIZE)) { u_int idx = MQ_PQSIZE - msg_prio; KASSERT(idx != MQ_PQRESQ); TAILQ_INSERT_TAIL(&mq->mq_head[idx], msg, msg_queue); mq->mq_bitmap |= (1U << --idx); } else { mqueue_linear_insert(mq, msg); } /* Check for the notify */ if (mqattr->mq_curmsgs == 0 && mq->mq_notify_proc && (mqattr->mq_flags & MQ_RECEIVE) == 0 && mq->mq_sig_notify.sigev_notify == SIGEV_SIGNAL) { /* Initialize the signal */ KSI_INIT(&ksi); ksi.ksi_signo = mq->mq_sig_notify.sigev_signo; ksi.ksi_code = SI_MESGQ; ksi.ksi_value = mq->mq_sig_notify.sigev_value; /* Unregister the process */ notify = mq->mq_notify_proc; mq->mq_notify_proc = NULL; } /* Increment the counter and signal waiter, if any */ mqattr->mq_curmsgs++; cv_signal(&mq->mq_send_cv); /* Ready for receiving now */ selnotify(&mq->mq_rsel, POLLIN | POLLRDNORM, 0); error: mutex_exit(&mq->mq_mtx); fd_putfile((int)mqdes); if (error) { mqueue_freemsg(msg, size); } else if (notify) { /* Send the notify, if needed */ mutex_enter(&proc_lock); kpsignal(notify, &ksi, NULL); mutex_exit(&proc_lock); } return error; } int sys_mq_send(struct lwp *l, const struct sys_mq_send_args *uap, register_t *retval) { /* { syscallarg(mqd_t) mqdes; syscallarg(const char *) msg_ptr; syscallarg(size_t) msg_len; syscallarg(unsigned) msg_prio; } */ return mq_send1(SCARG(uap, mqdes), SCARG(uap, msg_ptr), SCARG(uap, msg_len), SCARG(uap, msg_prio), NULL); } int sys___mq_timedsend50(struct lwp *l, const struct sys___mq_timedsend50_args *uap, register_t *retval) { /* { syscallarg(mqd_t) mqdes; syscallarg(const char *) msg_ptr; syscallarg(size_t) msg_len; syscallarg(unsigned) msg_prio; syscallarg(const struct timespec *) abs_timeout; } */ struct timespec ts, *tsp; int error; /* Get and convert time value */ if (SCARG(uap, abs_timeout)) { error = copyin(SCARG(uap, abs_timeout), &ts, sizeof(ts)); if (error) return error; tsp = &ts; } else { tsp = NULL; } return mq_send1(SCARG(uap, mqdes), SCARG(uap, msg_ptr), SCARG(uap, msg_len), SCARG(uap, msg_prio), tsp); } int sys_mq_notify(struct lwp *l, const struct sys_mq_notify_args *uap, register_t *retval) { /* { syscallarg(mqd_t) mqdes; syscallarg(const struct sigevent *) notification; } */ struct mqueue *mq; struct sigevent sig; int error; if (SCARG(uap, notification)) { /* Get the signal from user-space */ error = copyin(SCARG(uap, notification), &sig, sizeof(struct sigevent)); if (error) return error; if (sig.sigev_notify == SIGEV_SIGNAL && (sig.sigev_signo <=0 || sig.sigev_signo >= NSIG)) return EINVAL; } error = mqueue_get(SCARG(uap, mqdes), 0, &mq); if (error) { return error; } if (SCARG(uap, notification)) { /* Register notification: set the signal and target process */ if (mq->mq_notify_proc == NULL) { memcpy(&mq->mq_sig_notify, &sig, sizeof(struct sigevent)); mq->mq_notify_proc = l->l_proc; } else { /* Fail if someone else already registered */ error = EBUSY; } } else { /* Unregister the notification */ mq->mq_notify_proc = NULL; } mutex_exit(&mq->mq_mtx); fd_putfile((int)SCARG(uap, mqdes)); return error; } int sys_mq_getattr(struct lwp *l, const struct sys_mq_getattr_args *uap, register_t *retval) { /* { syscallarg(mqd_t) mqdes; syscallarg(struct mq_attr *) mqstat; } */ struct mqueue *mq; struct mq_attr attr; int error; error = mqueue_get(SCARG(uap, mqdes), 0, &mq); if (error) { return error; } memcpy(&attr, 
&mq->mq_attrib, sizeof(struct mq_attr)); mutex_exit(&mq->mq_mtx); fd_putfile((int)SCARG(uap, mqdes)); return copyout(&attr, SCARG(uap, mqstat), sizeof(struct mq_attr)); } int sys_mq_setattr(struct lwp *l, const struct sys_mq_setattr_args *uap, register_t *retval) { /* { syscallarg(mqd_t) mqdes; syscallarg(const struct mq_attr *) mqstat; syscallarg(struct mq_attr *) omqstat; } */ struct mqueue *mq; struct mq_attr attr; int error, nonblock; error = copyin(SCARG(uap, mqstat), &attr, sizeof(struct mq_attr)); if (error) return error; nonblock = (attr.mq_flags & O_NONBLOCK); error = mqueue_get(SCARG(uap, mqdes), 0, &mq); if (error) { return error; } /* Copy the old attributes, if needed */ if (SCARG(uap, omqstat)) { memcpy(&attr, &mq->mq_attrib, sizeof(struct mq_attr)); } /* Ignore everything, except O_NONBLOCK */ if (nonblock) mq->mq_attrib.mq_flags |= O_NONBLOCK; else mq->mq_attrib.mq_flags &= ~O_NONBLOCK; mutex_exit(&mq->mq_mtx); fd_putfile((int)SCARG(uap, mqdes)); /* * Copy the data to the user-space. * Note: According to POSIX, the new attributes should not be set in * case of fail - this would be violated. */ if (SCARG(uap, omqstat)) error = copyout(&attr, SCARG(uap, omqstat), sizeof(struct mq_attr)); return error; } int sys_mq_unlink(struct lwp *l, const struct sys_mq_unlink_args *uap, register_t *retval) { /* { syscallarg(const char *) name; } */ mqueue_t *mq; char *name; int error, refcnt = 0; /* Get the name from the user-space */ name = kmem_alloc(MQ_NAMELEN, KM_SLEEP); error = copyinstr(SCARG(uap, name), name, MQ_NAMELEN - 1, NULL); if (error) { kmem_free(name, MQ_NAMELEN); return error; } mutex_enter(&mqlist_lock); mq = mqueue_lookup(name); if (mq == NULL) { error = ENOENT; goto err; } KASSERT(mutex_owned(&mq->mq_mtx)); /* Verify permissions. */ if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MQUEUE, 0, mq, NULL, NULL)) { mutex_exit(&mq->mq_mtx); error = EACCES; goto err; } /* Remove and destroy if no references. */ LIST_REMOVE(mq, mq_list); refcnt = mq->mq_refcnt; if (refcnt) { /* Mark as unlinked, if there are references. */ mq->mq_attrib.mq_flags |= MQ_UNLINKED; } /* Wake up waiters, if there are any. */ cv_broadcast(&mq->mq_send_cv); cv_broadcast(&mq->mq_recv_cv); selnotify(&mq->mq_rsel, POLLHUP, 0); selnotify(&mq->mq_wsel, POLLHUP, 0); mutex_exit(&mq->mq_mtx); err: mutex_exit(&mqlist_lock); /* * If last reference - destroy the message queue. Otherwise, * the last mq_close() call will do that. */ if (!error && refcnt == 0) { mqueue_destroy(mq); } kmem_free(name, MQ_NAMELEN); return error; } /* * System control nodes. 
*/ SYSCTL_SETUP(mqueue_sysctl_init, "mqueue systl") { const struct sysctlnode *node = NULL; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "posix_msg", SYSCTL_DESCR("Version of IEEE Std 1003.1 and its " "Message Passing option to which the " "system attempts to conform"), NULL, _POSIX_MESSAGE_PASSING, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "mqueue", SYSCTL_DESCR("Message queue options"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); if (node == NULL) return; sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "mq_open_max", SYSCTL_DESCR("Maximal number of message queue descriptors " "that process could open"), NULL, 0, &mq_open_max, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "mq_prio_max", SYSCTL_DESCR("Maximal priority of the message"), NULL, 0, &mq_prio_max, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "mq_max_msgsize", SYSCTL_DESCR("Maximal allowed size of the message"), NULL, 0, &mq_max_msgsize, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "mq_def_maxmsg", SYSCTL_DESCR("Default maximal message count"), NULL, 0, &mq_def_maxmsg, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "mq_max_maxmsg", SYSCTL_DESCR("Maximal allowed message count"), NULL, 0, &mq_max_maxmsg, 0, CTL_CREATE, CTL_EOL); return; } /* * Debugging. */ #if defined(DDB) void mqueue_print_list(void (*pr)(const char *, ...)) { struct mqueue *mq; (*pr)("Global list of the message queues:\n"); (*pr)("%20s %10s %8s %8s %3s %4s %4s %4s\n", "Name", "Ptr", "Mode", "Flags", "Ref", "MaxMsg", "MsgSze", "CurMsg"); LIST_FOREACH(mq, &mqueue_head, mq_list) { (*pr)("%20s %10p %8x %8x %3u %6lu %6lu %6lu\n", mq->mq_name, mq, mq->mq_mode, mq->mq_attrib.mq_flags, mq->mq_refcnt, mq->mq_attrib.mq_maxmsg, mq->mq_attrib.mq_msgsize, mq->mq_attrib.mq_curmsgs); } } #endif /* defined(DDB) */
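/*
 * Illustrative sketch (not part of the original NetBSD source): a
 * minimal user-space client of the syscalls implemented above, using
 * the standard <mqueue.h> wrappers.  The queue name "/demo" and the
 * attribute values are arbitrary examples; the receive buffer is sized
 * to mq_msgsize, mirroring the EMSGSIZE check in mq_recv1() above, and
 * the chosen limits must stay within the mq_max_* bounds exposed by
 * the sysctl nodes created in this file.
 */
#include <sys/types.h>
#include <mqueue.h>
#include <fcntl.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	struct mq_attr attr;
	char buf[128];
	unsigned prio;
	ssize_t n;
	mqd_t mqd;

	memset(&attr, 0, sizeof(attr));
	attr.mq_maxmsg = 8;			/* within mq_max_maxmsg */
	attr.mq_msgsize = sizeof(buf);		/* within mq_max_msgsize */

	mqd = mq_open("/demo", O_CREAT | O_RDWR, 0600, &attr);
	if (mqd == (mqd_t)-1)
		err(EXIT_FAILURE, "mq_open");

	if (mq_send(mqd, "hello", 5, 1) == -1)
		err(EXIT_FAILURE, "mq_send");

	/* The buffer handed to mq_receive() must hold mq_msgsize bytes. */
	n = mq_receive(mqd, buf, sizeof(buf), &prio);
	if (n == -1)
		err(EXIT_FAILURE, "mq_receive");
	printf("got %zd bytes at priority %u\n", n, prio);

	mq_close(mqd);
	mq_unlink("/demo");
	return 0;
}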
/* $NetBSD: subr_kobj_vfs.c,v 1.12 2021/06/29 22:40:53 dholland Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software developed for The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1998-2000 Doug Rabson * Copyright (c) 2004 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Kernel loader vfs routines. */ #include <sys/kobj_impl.h> #ifdef _KERNEL_OPT #include "opt_modular.h" #endif #ifdef MODULAR #include <sys/param.h> #include <sys/fcntl.h> #include <sys/module.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_kobj_vfs.c,v 1.12 2021/06/29 22:40:53 dholland Exp $"); static void kobj_close_vfs(kobj_t ko) { VOP_UNLOCK(ko->ko_source); vn_close(ko->ko_source, FREAD, kauth_cred_get()); } /* * kobj_read: * * Utility function: read from the object. */ static int kobj_read_vfs(kobj_t ko, void **basep, size_t size, off_t off, bool allocate) { size_t resid; void *base; int error; KASSERT(ko->ko_source != NULL); if (allocate) { base = kmem_alloc(size, KM_SLEEP); } else { base = *basep; #ifdef DIAGNOSTIC bool ok = false; if ((uintptr_t)base >= (uintptr_t)ko->ko_text_address && (uintptr_t)base + size <= (uintptr_t)ko->ko_text_address + ko->ko_text_size) ok = true; if ((uintptr_t)base >= (uintptr_t)ko->ko_data_address && (uintptr_t)base + size <= (uintptr_t)ko->ko_data_address + ko->ko_data_size) ok = true; if ((uintptr_t)base >= (uintptr_t)ko->ko_rodata_address && (uintptr_t)base + size <= (uintptr_t)ko->ko_rodata_address + ko->ko_rodata_size) ok = true; if (!ok) panic("kobj_read_vfs: not in a dedicated segment"); #endif } error = vn_rdwr(UIO_READ, ko->ko_source, base, size, off, UIO_SYSSPACE, IO_NODELOCKED, curlwp->l_cred, &resid, curlwp); if (error == 0 && resid != 0) { error = EINVAL; } if (allocate && error != 0) { kmem_free(base, size); base = NULL; } if (allocate) *basep = base; return error; } /* * kobj_load_vfs: * * Load an object located in the file system. */ int kobj_load_vfs(kobj_t *kop, const char *path, const bool nochroot) { struct pathbuf *pb; struct vnode *vp; int error; kobj_t ko; KASSERT(path != NULL); if (strchr(path, '/') == NULL) return ENOENT; ko = kmem_zalloc(sizeof(*ko), KM_SLEEP); pb = pathbuf_create(path); if (pb == NULL) { kmem_free(ko, sizeof(*ko)); return ENOMEM; } error = vn_open(NULL, pb, (nochroot ? NOCHROOT : 0), FREAD, 0, &vp, NULL, NULL); if (error != 0) { pathbuf_destroy(pb); kmem_free(ko, sizeof(*ko)); return error; } ko->ko_type = KT_VNODE; kobj_setname(ko, path); ko->ko_source = vp; ko->ko_read = kobj_read_vfs; ko->ko_close = kobj_close_vfs; pathbuf_destroy(pb); *kop = ko; return kobj_load(ko); } #else /* MODULAR */ int kobj_load_vfs(kobj_t *kop, const char *path, const bool nochroot) { return ENOSYS; } #endif
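/*
 * Illustrative sketch (not part of the original NetBSD source): a
 * standalone restatement of the DIAGNOSTIC check in kobj_read_vfs()
 * above, which only accepts a caller-supplied destination when the
 * whole range [base, base + size) lies inside one of the module's
 * text, data or rodata segments.  struct demo_segment and the demo_*
 * helpers are hypothetical stand-ins for the corresponding kobj
 * fields.
 */
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

struct demo_segment {
	uintptr_t addr;		/* start of the loaded segment */
	size_t len;		/* length of the loaded segment */
};

static bool
demo_range_in_segment(const struct demo_segment *seg, const void *base,
    size_t size)
{
	uintptr_t b = (uintptr_t)base;

	return b >= seg->addr && b + size <= seg->addr + seg->len;
}

/* Accept the destination only if it is fully inside one segment. */
static bool
demo_dest_ok(const struct demo_segment *segs, size_t nsegs,
    const void *base, size_t size)
{
	for (size_t i = 0; i < nsegs; i++) {
		if (demo_range_in_segment(&segs[i], base, size))
			return true;
	}
	return false;
}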
/* $NetBSD: prop_object.c,v 1.35 2022/08/07 23:49:46 riastradh Exp $ */ /*- * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include "prop_object_impl.h" #include <prop/prop_object.h> #ifdef _PROP_NEED_REFCNT_MTX static pthread_mutex_t _prop_refcnt_mtx = PTHREAD_MUTEX_INITIALIZER; #endif /* _PROP_NEED_REFCNT_MTX */ #if !defined(_KERNEL) && !defined(_STANDALONE) #include <sys/mman.h> #include <sys/stat.h> #include <errno.h> #include <fcntl.h> #include <limits.h> #include <unistd.h> #endif #ifdef _STANDALONE void * _prop_standalone_calloc(size_t size) { void *rv; rv = alloc(size); if (rv != NULL) memset(rv, 0, size); return (rv); } void * _prop_standalone_realloc(void *v, size_t size) { void *rv; rv = alloc(size); if (rv != NULL) { memcpy(rv, v, size); /* XXX */ dealloc(v, 0); /* XXX */ } return (rv); } #endif /* _STANDALONE */ /* * _prop_object_init -- * Initialize an object. Called when sub-classes create * an instance. */ void _prop_object_init(struct _prop_object *po, const struct _prop_object_type *pot) { po->po_type = pot; po->po_refcnt = 1; } /* * _prop_object_fini -- * Finalize an object. Called when sub-classes destroy * an instance. */ /*ARGSUSED*/ void _prop_object_fini(struct _prop_object *po _PROP_ARG_UNUSED) { /* Nothing to do, currently. */ } /* * _prop_object_externalize_start_tag -- * Append an XML-style start tag to the externalize buffer. */ bool _prop_object_externalize_start_tag( struct _prop_object_externalize_context *ctx, const char *tag) { unsigned int i; for (i = 0; i < ctx->poec_depth; i++) { if (_prop_object_externalize_append_char(ctx, '\t') == false) return (false); } if (_prop_object_externalize_append_char(ctx, '<') == false || _prop_object_externalize_append_cstring(ctx, tag) == false || _prop_object_externalize_append_char(ctx, '>') == false) return (false); return (true); } /* * _prop_object_externalize_end_tag -- * Append an XML-style end tag to the externalize buffer. */ bool _prop_object_externalize_end_tag( struct _prop_object_externalize_context *ctx, const char *tag) { if (_prop_object_externalize_append_char(ctx, '<') == false || _prop_object_externalize_append_char(ctx, '/') == false || _prop_object_externalize_append_cstring(ctx, tag) == false || _prop_object_externalize_append_char(ctx, '>') == false || _prop_object_externalize_append_char(ctx, '\n') == false) return (false); return (true); } /* * _prop_object_externalize_empty_tag -- * Append an XML-style empty tag to the externalize buffer. */ bool _prop_object_externalize_empty_tag( struct _prop_object_externalize_context *ctx, const char *tag) { unsigned int i; for (i = 0; i < ctx->poec_depth; i++) { if (_prop_object_externalize_append_char(ctx, '\t') == false) return (false); } if (_prop_object_externalize_append_char(ctx, '<') == false || _prop_object_externalize_append_cstring(ctx, tag) == false || _prop_object_externalize_append_char(ctx, '/') == false || _prop_object_externalize_append_char(ctx, '>') == false || _prop_object_externalize_append_char(ctx, '\n') == false) return (false); return (true); } /* * _prop_object_externalize_append_cstring -- * Append a C string to the externalize buffer. */ bool _prop_object_externalize_append_cstring( struct _prop_object_externalize_context *ctx, const char *cp) { while (*cp != '\0') { if (_prop_object_externalize_append_char(ctx, (unsigned char) *cp) == false) return (false); cp++; } return (true); } /* * _prop_object_externalize_append_encoded_cstring -- * Append an encoded C string to the externalize buffer. 
*/ bool _prop_object_externalize_append_encoded_cstring( struct _prop_object_externalize_context *ctx, const char *cp) { while (*cp != '\0') { switch (*cp) { case '<': if (_prop_object_externalize_append_cstring(ctx, "&lt;") == false) return (false); break; case '>': if (_prop_object_externalize_append_cstring(ctx, "&gt;") == false) return (false); break; case '&': if (_prop_object_externalize_append_cstring(ctx, "&amp;") == false) return (false); break; default: if (_prop_object_externalize_append_char(ctx, (unsigned char) *cp) == false) return (false); break; } cp++; } return (true); } #define BUF_EXPAND 256 /* * _prop_object_externalize_append_char -- * Append a single character to the externalize buffer. */ bool _prop_object_externalize_append_char( struct _prop_object_externalize_context *ctx, unsigned char c) { _PROP_ASSERT(ctx->poec_capacity != 0); _PROP_ASSERT(ctx->poec_buf != NULL); _PROP_ASSERT(ctx->poec_len <= ctx->poec_capacity); if (ctx->poec_len == ctx->poec_capacity) { char *cp = _PROP_REALLOC(ctx->poec_buf, ctx->poec_capacity + BUF_EXPAND, M_TEMP); if (cp == NULL) return (false); ctx->poec_capacity = ctx->poec_capacity + BUF_EXPAND; ctx->poec_buf = cp; } ctx->poec_buf[ctx->poec_len++] = c; return (true); } /* * _prop_object_externalize_header -- * Append the standard XML header to the externalize buffer. */ bool _prop_object_externalize_header(struct _prop_object_externalize_context *ctx) { static const char _plist_xml_header[] = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" "<!DOCTYPE plist PUBLIC \"-//Apple Computer//DTD PLIST 1.0//EN\" \"http://www.apple.com/DTDs/PropertyList-1.0.dtd\">\n"; if (_prop_object_externalize_append_cstring(ctx, _plist_xml_header) == false || _prop_object_externalize_start_tag(ctx, "plist version=\"1.0\"") == false || _prop_object_externalize_append_char(ctx, '\n') == false) return (false); return (true); } /* * _prop_object_externalize_footer -- * Append the standard XML footer to the externalize buffer. This * also NUL-terminates the buffer. */ bool _prop_object_externalize_footer(struct _prop_object_externalize_context *ctx) { if (_prop_object_externalize_end_tag(ctx, "plist") == false || _prop_object_externalize_append_char(ctx, '\0') == false) return (false); return (true); } /* * _prop_object_externalize_context_alloc -- * Allocate an externalize context. */ struct _prop_object_externalize_context * _prop_object_externalize_context_alloc(void) { struct _prop_object_externalize_context *ctx; ctx = _PROP_MALLOC(sizeof(*ctx), M_TEMP); if (ctx != NULL) { ctx->poec_buf = _PROP_MALLOC(BUF_EXPAND, M_TEMP); if (ctx->poec_buf == NULL) { _PROP_FREE(ctx, M_TEMP); return (NULL); } ctx->poec_len = 0; ctx->poec_capacity = BUF_EXPAND; ctx->poec_depth = 0; } return (ctx); } /* * _prop_object_externalize_context_free -- * Free an externalize context. */ void _prop_object_externalize_context_free( struct _prop_object_externalize_context *ctx) { /* Buffer is always freed by the caller. */ _PROP_FREE(ctx, M_TEMP); } /* * _prop_object_internalize_skip_comment -- * Skip the body and end tag of a comment. */ static bool _prop_object_internalize_skip_comment( struct _prop_object_internalize_context *ctx) { const char *cp = ctx->poic_cp; while (!_PROP_EOF(*cp)) { if (cp[0] == '-' && cp[1] == '-' && cp[2] == '>') { ctx->poic_cp = cp + 3; return (true); } cp++; } return (false); /* ran out of buffer */ } /* * _prop_object_internalize_find_tag -- * Find the next tag in an XML stream. Optionally compare the found * tag to an expected tag name. 
State of the context is undefined * if this routine returns false. Upon success, the context points * to the first octet after the tag. */ bool _prop_object_internalize_find_tag(struct _prop_object_internalize_context *ctx, const char *tag, _prop_tag_type_t type) { const char *cp; size_t taglen; if (tag != NULL) taglen = strlen(tag); else taglen = 0; start_over: cp = ctx->poic_cp; /* * Find the start of the tag. */ while (_PROP_ISSPACE(*cp)) cp++; if (_PROP_EOF(*cp)) return (false); if (*cp != '<') return (false); ctx->poic_tag_start = cp++; if (_PROP_EOF(*cp)) return (false); if (*cp == '!') { if (cp[1] != '-' || cp[2] != '-') return (false); /* * Comment block -- only allowed if we are allowed to * return a start tag. */ if (type == _PROP_TAG_TYPE_END) return (false); ctx->poic_cp = cp + 3; if (_prop_object_internalize_skip_comment(ctx) == false) return (false); goto start_over; } if (*cp == '/') { if (type != _PROP_TAG_TYPE_END && type != _PROP_TAG_TYPE_EITHER) return (false); cp++; if (_PROP_EOF(*cp)) return (false); ctx->poic_tag_type = _PROP_TAG_TYPE_END; } else { if (type != _PROP_TAG_TYPE_START && type != _PROP_TAG_TYPE_EITHER) return (false); ctx->poic_tag_type = _PROP_TAG_TYPE_START; } ctx->poic_tagname = cp; while (!_PROP_ISSPACE(*cp) && *cp != '/' && *cp != '>') { if (_PROP_EOF(*cp)) return (false); cp++; } ctx->poic_tagname_len = cp - ctx->poic_tagname; /* Make sure this is the tag we're looking for. */ if (tag != NULL && (taglen != ctx->poic_tagname_len || memcmp(tag, ctx->poic_tagname, taglen) != 0)) return (false); /* Check for empty tag. */ if (*cp == '/') { if (ctx->poic_tag_type != _PROP_TAG_TYPE_START) return(false); /* only valid on start tags */ ctx->poic_is_empty_element = true; cp++; if (_PROP_EOF(*cp) || *cp != '>') return (false); } else ctx->poic_is_empty_element = false; /* Easy case of no arguments. */ if (*cp == '>') { ctx->poic_tagattr = NULL; ctx->poic_tagattr_len = 0; ctx->poic_tagattrval = NULL; ctx->poic_tagattrval_len = 0; ctx->poic_cp = cp + 1; return (true); } _PROP_ASSERT(!_PROP_EOF(*cp)); cp++; if (_PROP_EOF(*cp)) return (false); while (_PROP_ISSPACE(*cp)) cp++; if (_PROP_EOF(*cp)) return (false); ctx->poic_tagattr = cp; while (!_PROP_ISSPACE(*cp) && *cp != '=') { if (_PROP_EOF(*cp)) return (false); cp++; } ctx->poic_tagattr_len = cp - ctx->poic_tagattr; cp++; if (*cp != '\"') return (false); cp++; if (_PROP_EOF(*cp)) return (false); ctx->poic_tagattrval = cp; while (*cp != '\"') { if (_PROP_EOF(*cp)) return (false); cp++; } ctx->poic_tagattrval_len = cp - ctx->poic_tagattrval; cp++; if (*cp != '>') return (false); ctx->poic_cp = cp + 1; return (true); } /* * _prop_object_internalize_decode_string -- * Decode an encoded string. 
*/ bool _prop_object_internalize_decode_string( struct _prop_object_internalize_context *ctx, char *target, size_t targsize, size_t *sizep, const char **cpp) { const char *src; size_t tarindex; char c; tarindex = 0; src = ctx->poic_cp; for (;;) { if (_PROP_EOF(*src)) return (false); if (*src == '<') { break; } if ((c = *src) == '&') { if (src[1] == 'a' && src[2] == 'm' && src[3] == 'p' && src[4] == ';') { c = '&'; src += 5; } else if (src[1] == 'l' && src[2] == 't' && src[3] == ';') { c = '<'; src += 4; } else if (src[1] == 'g' && src[2] == 't' && src[3] == ';') { c = '>'; src += 4; } else if (src[1] == 'a' && src[2] == 'p' && src[3] == 'o' && src[4] == 's' && src[5] == ';') { c = '\''; src += 6; } else if (src[1] == 'q' && src[2] == 'u' && src[3] == 'o' && src[4] == 't' && src[5] == ';') { c = '\"'; src += 6; } else return (false); } else src++; if (target) { if (tarindex >= targsize) return (false); target[tarindex] = c; } tarindex++; } _PROP_ASSERT(*src == '<'); if (sizep != NULL) *sizep = tarindex; if (cpp != NULL) *cpp = src; return (true); } /* * _prop_object_internalize_match -- * Returns true if the two character streams match. */ bool _prop_object_internalize_match(const char *str1, size_t len1, const char *str2, size_t len2) { return (len1 == len2 && memcmp(str1, str2, len1) == 0); } #define INTERNALIZER(t, f) \ { t, sizeof(t) - 1, f } static const struct _prop_object_internalizer { const char *poi_tag; size_t poi_taglen; prop_object_internalizer_t poi_intern; } _prop_object_internalizer_table[] = { INTERNALIZER("array", _prop_array_internalize), INTERNALIZER("true", _prop_bool_internalize), INTERNALIZER("false", _prop_bool_internalize), INTERNALIZER("data", _prop_data_internalize), INTERNALIZER("dict", _prop_dictionary_internalize), INTERNALIZER("integer", _prop_number_internalize), INTERNALIZER("string", _prop_string_internalize), { 0, 0, NULL } }; #undef INTERNALIZER /* * _prop_object_internalize_by_tag -- * Determine the object type from the tag in the context and * internalize it. */ prop_object_t _prop_object_internalize_by_tag(struct _prop_object_internalize_context *ctx) { const struct _prop_object_internalizer *poi; prop_object_t obj, parent_obj; void *data, *iter; prop_object_internalizer_continue_t iter_func; struct _prop_stack stack; _prop_stack_init(&stack); match_start: for (poi = _prop_object_internalizer_table; poi->poi_tag != NULL; poi++) { if (_prop_object_internalize_match(ctx->poic_tagname, ctx->poic_tagname_len, poi->poi_tag, poi->poi_taglen)) break; } if ((poi == NULL) || (poi->poi_tag == NULL)) { while (_prop_stack_pop(&stack, &obj, &iter, &data, NULL)) { iter_func = (prop_object_internalizer_continue_t)iter; (*iter_func)(&stack, &obj, ctx, data, NULL); } return (NULL); } obj = NULL; if (!(*poi->poi_intern)(&stack, &obj, ctx)) goto match_start; parent_obj = obj; while (_prop_stack_pop(&stack, &parent_obj, &iter, &data, NULL)) { iter_func = (prop_object_internalizer_continue_t)iter; if (!(*iter_func)(&stack, &parent_obj, ctx, data, obj)) goto match_start; obj = parent_obj; } return (parent_obj); } prop_object_t _prop_generic_internalize(const char *xml, const char *master_tag) { prop_object_t obj = NULL; struct _prop_object_internalize_context *ctx; ctx = _prop_object_internalize_context_alloc(xml); if (ctx == NULL) return (NULL); /* We start with a <plist> tag. */ if (_prop_object_internalize_find_tag(ctx, "plist", _PROP_TAG_TYPE_START) == false) goto out; /* Plist elements cannot be empty. 
*/ if (ctx->poic_is_empty_element) goto out; /* * We don't understand any plist attributes, but Apple XML * property lists often have a "version" attribute. If we * see that one, we simply ignore it. */ if (ctx->poic_tagattr != NULL && !_PROP_TAGATTR_MATCH(ctx, "version")) goto out; /* Next we expect to see opening master_tag. */ if (_prop_object_internalize_find_tag(ctx, master_tag, _PROP_TAG_TYPE_START) == false) goto out; obj = _prop_object_internalize_by_tag(ctx); if (obj == NULL) goto out; /* * We've advanced past the closing master_tag. * Now we want </plist>. */ if (_prop_object_internalize_find_tag(ctx, "plist", _PROP_TAG_TYPE_END) == false) { prop_object_release(obj); obj = NULL; } out: _prop_object_internalize_context_free(ctx); return (obj); } /* * _prop_object_internalize_context_alloc -- * Allocate an internalize context. */ struct _prop_object_internalize_context * _prop_object_internalize_context_alloc(const char *xml) { struct _prop_object_internalize_context *ctx; ctx = _PROP_MALLOC(sizeof(*ctx), M_TEMP); if (ctx == NULL) return (NULL); ctx->poic_xml = ctx->poic_cp = xml; /* * Skip any whitespace and XML preamble stuff that we don't * know about / care about. */ for (;;) { while (_PROP_ISSPACE(*xml)) xml++; if (_PROP_EOF(*xml) || *xml != '<') goto bad; #define MATCH(str) (strncmp(&xml[1], str, strlen(str)) == 0) /* * Skip over the XML preamble that Apple XML property * lists usually include at the top of the file. */ if (MATCH("?xml ") || MATCH("!DOCTYPE plist")) { while (*xml != '>' && !_PROP_EOF(*xml)) xml++; if (_PROP_EOF(*xml)) goto bad; xml++; /* advance past the '>' */ continue; } if (MATCH("<!--")) { ctx->poic_cp = xml + 4; if (_prop_object_internalize_skip_comment(ctx) == false) goto bad; xml = ctx->poic_cp; continue; } #undef MATCH /* * We don't think we should skip it, so let's hope we can * parse it. */ break; } ctx->poic_cp = xml; return (ctx); bad: _PROP_FREE(ctx, M_TEMP); return (NULL); } /* * _prop_object_internalize_context_free -- * Free an internalize context. */ void _prop_object_internalize_context_free( struct _prop_object_internalize_context *ctx) { _PROP_FREE(ctx, M_TEMP); } #if !defined(_KERNEL) && !defined(_STANDALONE) /* * _prop_object_externalize_file_dirname -- * dirname(3), basically. We have to roll our own because the * system dirname(3) isn't reentrant. */ static void _prop_object_externalize_file_dirname(const char *path, char *result) { const char *lastp; size_t len; /* * If `path' is a NULL pointer or points to an empty string, * return ".". */ if (path == NULL || *path == '\0') goto singledot; /* String trailing slashes, if any. */ lastp = path + strlen(path) - 1; while (lastp != path && *lastp == '/') lastp--; /* Terminate path at the last occurrence of '/'. */ do { if (*lastp == '/') { /* Strip trailing slashes, if any. */ while (lastp != path && *lastp == '/') lastp--; /* ...and copy the result into the result buffer. */ len = (lastp - path) + 1 /* last char */; if (len > (PATH_MAX - 1)) len = PATH_MAX - 1; memcpy(result, path, len); result[len] = '\0'; return; } } while (--lastp >= path); /* No /'s found, return ".". */ singledot: strcpy(result, "."); } /* * _prop_object_externalize_write_file -- * Write an externalized dictionary to the specified file. * The file is written atomically from the caller's perspective, * and the mode set to 0666 modified by the caller's umask. 
*/ bool _prop_object_externalize_write_file(const char *fname, const char *xml, size_t len) { char tname[PATH_MAX]; int fd; int save_errno; mode_t myumask; if (len > SSIZE_MAX) { errno = EFBIG; return (false); } /* * Get the directory name where the file is to be written * and create the temporary file. */ _prop_object_externalize_file_dirname(fname, tname); #define PLISTTMP "/.plistXXXXXX" if (strlen(tname) + strlen(PLISTTMP) >= sizeof(tname)) { errno = ENAMETOOLONG; return (false); } strcat(tname, PLISTTMP); #undef PLISTTMP if ((fd = mkstemp(tname)) == -1) return (false); if (write(fd, xml, len) != (ssize_t)len) goto bad; if (fsync(fd) == -1) goto bad; myumask = umask(0); (void)umask(myumask); if (fchmod(fd, 0666 & ~myumask) == -1) goto bad; (void) close(fd); fd = -1; if (rename(tname, fname) == -1) goto bad; return (true); bad: save_errno = errno; if (fd != -1) (void) close(fd); (void) unlink(tname); errno = save_errno; return (false); } /* * _prop_object_internalize_map_file -- * Map a file for the purpose of internalizing it. */ struct _prop_object_internalize_mapped_file * _prop_object_internalize_map_file(const char *fname) { struct stat sb; struct _prop_object_internalize_mapped_file *mf; size_t pgsize = (size_t)sysconf(_SC_PAGESIZE); size_t pgmask = pgsize - 1; bool need_guard = false; int fd; mf = _PROP_MALLOC(sizeof(*mf), M_TEMP); if (mf == NULL) return (NULL); fd = open(fname, O_RDONLY, 0400); if (fd == -1) { _PROP_FREE(mf, M_TEMP); return (NULL); } if (fstat(fd, &sb) == -1) { (void) close(fd); _PROP_FREE(mf, M_TEMP); return (NULL); } mf->poimf_mapsize = ((size_t)sb.st_size + pgmask) & ~pgmask; if (mf->poimf_mapsize < (size_t)sb.st_size) { (void) close(fd); _PROP_FREE(mf, M_TEMP); return (NULL); } /* * If the file length is an integral number of pages, then we * need to map a guard page at the end in order to provide the * necessary NUL-termination of the buffer. */ if ((sb.st_size & pgmask) == 0) need_guard = true; mf->poimf_xml = mmap(NULL, need_guard ? mf->poimf_mapsize + pgsize : mf->poimf_mapsize, PROT_READ, MAP_FILE|MAP_SHARED, fd, (off_t)0); (void) close(fd); if (mf->poimf_xml == MAP_FAILED) { _PROP_FREE(mf, M_TEMP); return (NULL); } #ifdef POSIX_MADV_SEQUENTIAL (void) posix_madvise(mf->poimf_xml, mf->poimf_mapsize, POSIX_MADV_SEQUENTIAL); #endif if (need_guard) { if (mmap(mf->poimf_xml + mf->poimf_mapsize, pgsize, PROT_READ, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, (off_t)0) == MAP_FAILED) { (void) munmap(mf->poimf_xml, mf->poimf_mapsize); _PROP_FREE(mf, M_TEMP); return (NULL); } mf->poimf_mapsize += pgsize; } return (mf); } /* * _prop_object_internalize_unmap_file -- * Unmap a file previously mapped for internalizing. */ void _prop_object_internalize_unmap_file( struct _prop_object_internalize_mapped_file *mf) { #ifdef POSIX_MADV_DONTNEED (void) posix_madvise(mf->poimf_xml, mf->poimf_mapsize, POSIX_MADV_DONTNEED); #endif (void) munmap(mf->poimf_xml, mf->poimf_mapsize); _PROP_FREE(mf, M_TEMP); } #endif /* !_KERNEL && !_STANDALONE */ /* * prop_object_retain -- * Increment the reference count on an object. */ void prop_object_retain(prop_object_t obj) { struct _prop_object *po = obj; uint32_t ncnt __unused; _PROP_ATOMIC_INC32_NV(&po->po_refcnt, ncnt); _PROP_ASSERT(ncnt != 0); } /* * prop_object_release_emergency * A direct free with prop_object_release failed. * Walk down the tree until a leaf is found and * free that. Do not recurse to avoid stack overflows. * * This is a slow edge condition, but necessary to * guarantee that an object can always be freed. 
*/ static void prop_object_release_emergency(prop_object_t obj) { struct _prop_object *po; void (*unlock)(void); prop_object_t parent = NULL; uint32_t ocnt; for (;;) { po = obj; _PROP_ASSERT(obj); if (po->po_type->pot_lock != NULL) po->po_type->pot_lock(); /* Save pointerto unlock function */ unlock = po->po_type->pot_unlock; /* Dance a bit to make sure we always get the non-racy ocnt */ _PROP_ATOMIC_DEC32_NV(&po->po_refcnt, ocnt); ocnt++; _PROP_ASSERT(ocnt != 0); if (ocnt != 1) { if (unlock != NULL) unlock(); break; } _PROP_ASSERT(po->po_type); if ((po->po_type->pot_free)(NULL, &obj) == _PROP_OBJECT_FREE_DONE) { if (unlock != NULL) unlock(); break; } if (unlock != NULL) unlock(); parent = po; _PROP_ATOMIC_INC32(&po->po_refcnt); } _PROP_ASSERT(parent); /* One object was just freed. */ po = parent; (*po->po_type->pot_emergency_free)(parent); } /* * prop_object_release -- * Decrement the reference count on an object. * * Free the object if we are releasing the final * reference. */ void prop_object_release(prop_object_t obj) { struct _prop_object *po; struct _prop_stack stack; void (*unlock)(void); int ret; uint32_t ocnt; _prop_stack_init(&stack); do { do { po = obj; _PROP_ASSERT(obj); if (po->po_type->pot_lock != NULL) po->po_type->pot_lock(); /* Save pointer to object unlock function */ unlock = po->po_type->pot_unlock; _PROP_ATOMIC_DEC32_NV(&po->po_refcnt, ocnt); ocnt++; _PROP_ASSERT(ocnt != 0); if (ocnt != 1) { ret = 0; if (unlock != NULL) unlock(); break; } ret = (po->po_type->pot_free)(&stack, &obj); if (unlock != NULL) unlock(); if (ret == _PROP_OBJECT_FREE_DONE) break; _PROP_ATOMIC_INC32(&po->po_refcnt); } while (ret == _PROP_OBJECT_FREE_RECURSE); if (ret == _PROP_OBJECT_FREE_FAILED) prop_object_release_emergency(obj); } while (_prop_stack_pop(&stack, &obj, NULL, NULL, NULL)); } /* * prop_object_type -- * Return the type of an object. */ prop_type_t prop_object_type(prop_object_t obj) { struct _prop_object *po = obj; if (obj == NULL) return (PROP_TYPE_UNKNOWN); return (po->po_type->pot_type); } /* * prop_object_equals -- * Returns true if thw two objects are equivalent. */ bool prop_object_equals(prop_object_t obj1, prop_object_t obj2) { return (prop_object_equals_with_error(obj1, obj2, NULL)); } bool prop_object_equals_with_error(prop_object_t obj1, prop_object_t obj2, bool *error_flag) { struct _prop_object *po1; struct _prop_object *po2; void *stored_pointer1, *stored_pointer2; prop_object_t next_obj1, next_obj2; struct _prop_stack stack; _prop_object_equals_rv_t ret; _prop_stack_init(&stack); if (error_flag) *error_flag = false; start_subtree: stored_pointer1 = NULL; stored_pointer2 = NULL; po1 = obj1; po2 = obj2; if (po1->po_type != po2->po_type) return (false); continue_subtree: ret = (*po1->po_type->pot_equals)(obj1, obj2, &stored_pointer1, &stored_pointer2, &next_obj1, &next_obj2); if (ret == _PROP_OBJECT_EQUALS_FALSE) goto finish; if (ret == _PROP_OBJECT_EQUALS_TRUE) { if (!_prop_stack_pop(&stack, &obj1, &obj2, &stored_pointer1, &stored_pointer2)) return true; po1 = obj1; po2 = obj2; goto continue_subtree; } _PROP_ASSERT(ret == _PROP_OBJECT_EQUALS_RECURSE); if (!_prop_stack_push(&stack, obj1, obj2, stored_pointer1, stored_pointer2)) { if (error_flag) *error_flag = true; goto finish; } obj1 = next_obj1; obj2 = next_obj2; goto start_subtree; finish: while (_prop_stack_pop(&stack, &obj1, &obj2, NULL, NULL)) { po1 = obj1; (*po1->po_type->pot_equals_finish)(obj1, obj2); } return (false); } /* * prop_object_iterator_next -- * Return the next item during an iteration. 
*/ prop_object_t prop_object_iterator_next(prop_object_iterator_t pi) { return ((*pi->pi_next_object)(pi)); } /* * prop_object_iterator_reset -- * Reset the iterator to the first object so as to restart * iteration. */ void prop_object_iterator_reset(prop_object_iterator_t pi) { (*pi->pi_reset)(pi); } /* * prop_object_iterator_release -- * Release the object iterator. */ void prop_object_iterator_release(prop_object_iterator_t pi) { prop_object_release(pi->pi_obj); _PROP_FREE(pi, M_TEMP); }
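The routines above make up proplib's generic object layer: atomic reference counting (prop_object_retain/prop_object_release), XML plist externalization, the matching internalization parser, and the iterator helpers. As an illustration only -- not part of prop_object.c -- the following minimal userland sketch exercises those entry points through the public <prop/proplib.h> interface; the prop_dictionary_* and prop_string_* constructors used here are assumed to behave as in NetBSD's libprop.

/*
 * Hypothetical usage sketch (not part of libprop).  Builds a dictionary,
 * externalizes it to the XML plist form produced by the code above, then
 * round-trips it through the internalizer and compares the results.
 */
#include <prop/proplib.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	prop_dictionary_t dict, copy;
	prop_string_t str;
	char *xml;

	/* Each newly created object starts with a single reference. */
	dict = prop_dictionary_create();
	str = prop_string_create_cstring("value with <markup> & entities");

	/* The dictionary retains the string itself, so drop our reference. */
	prop_dictionary_set(dict, "key", str);
	prop_object_release(str);

	/* Externalize; the '<', '>' and '&' above are entity-encoded. */
	xml = prop_dictionary_externalize(dict);
	if (xml == NULL) {
		prop_object_release(dict);
		return 1;
	}
	printf("%s", xml);

	/* Round-trip through the internalizer and compare structurally. */
	copy = prop_dictionary_internalize(xml);
	free(xml);
	if (copy != NULL) {
		printf("equal: %d\n", prop_object_equals(dict, copy));
		prop_object_release(copy);
	}

	prop_object_release(dict);
	return 0;
}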
/* $NetBSD: mfs_vfsops.c,v 1.116 2022/03/19 13:53:33 hannken Exp $ */ /* * Copyright (c) 1989, 1990, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)mfs_vfsops.c 8.11 (Berkeley) 6/19/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: mfs_vfsops.c,v 1.116 2022/03/19 13:53:33 hannken Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/mount.h> #include <sys/signalvar.h> #include <sys/vnode.h> #include <sys/kmem.h> #include <sys/module.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ffs/fs.h> #include <ufs/ffs/ffs_extern.h> #include <ufs/mfs/mfsnode.h> #include <ufs/mfs/mfs_extern.h> MODULE(MODULE_CLASS_VFS, mfs, "ffs"); kmutex_t mfs_lock; /* global lock */ /* used for building internal dev_t, minor == 0 reserved for miniroot */ static devminor_t mfs_minor = 1; static int mfs_initcnt; extern int (**mfs_vnodeop_p)(void *); /* * mfs vfs operations. */ extern const struct vnodeopv_desc mfs_vnodeop_opv_desc; const struct vnodeopv_desc * const mfs_vnodeopv_descs[] = { &mfs_vnodeop_opv_desc, NULL, }; struct vfsops mfs_vfsops = { .vfs_name = MOUNT_MFS, .vfs_min_mount_data = sizeof (struct mfs_args), .vfs_mount = mfs_mount, .vfs_start = mfs_start, .vfs_unmount = ffs_unmount, .vfs_root = ufs_root, .vfs_quotactl = ufs_quotactl, .vfs_statvfs = mfs_statvfs, .vfs_sync = ffs_sync, .vfs_vget = ufs_vget, .vfs_loadvnode = ffs_loadvnode, .vfs_newvnode = ffs_newvnode, .vfs_fhtovp = ffs_fhtovp, .vfs_vptofh = ffs_vptofh, .vfs_init = mfs_init, .vfs_reinit = mfs_reinit, .vfs_done = mfs_done, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = mfs_vnodeopv_descs }; SYSCTL_SETUP(mfs_sysctl_setup, "mfs sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_ALIAS, CTLTYPE_NODE, "mfs", SYSCTL_DESCR("Memory based file system"), NULL, 1, NULL, 0, CTL_VFS, 3, CTL_EOL); /* * XXX the "1" and the "3" above could be dynamic, thereby * eliminating one more instance of the "number to vfs" * mapping problem, but they are in order as taken from * sys/mount.h */ } static int mfs_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&mfs_vfsops); if (error != 0) break; break; case MODULE_CMD_FINI: error = vfs_detach(&mfs_vfsops); if (error != 0) break; break; default: error = ENOTTY; break; } return (error); } /* * Memory based filesystem initialization. */ void mfs_init(void) { if (mfs_initcnt++ == 0) { mutex_init(&mfs_lock, MUTEX_DEFAULT, IPL_NONE); ffs_init(); } } void mfs_reinit(void) { ffs_reinit(); } void mfs_done(void) { if (--mfs_initcnt == 0) { ffs_done(); mutex_destroy(&mfs_lock); } } /* * Called by main() when mfs is going to be mounted as root. 
*/ int mfs_mountroot(void) { struct fs *fs; struct mount *mp; struct lwp *l = curlwp; /* XXX */ struct ufsmount *ump; struct mfsnode *mfsp; int error = 0; if ((error = vfs_rootmountalloc(MOUNT_MFS, "mfs_root", &mp))) { vrele(rootvp); return (error); } mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP); rootvp->v_data = mfsp; rootvp->v_op = mfs_vnodeop_p; rootvp->v_tag = VT_MFS; mfsp->mfs_baseoff = mfs_rootbase; mfsp->mfs_size = mfs_rootsize; mfsp->mfs_vnode = rootvp; mfsp->mfs_proc = NULL; /* indicate kernel space */ mfsp->mfs_shutdown = 0; cv_init(&mfsp->mfs_cv, "mfs"); mfsp->mfs_refcnt = 1; bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0); if ((error = ffs_mountfs(rootvp, mp, l)) != 0) { vfs_unbusy(mp); bufq_free(mfsp->mfs_buflist); vfs_rele(mp); kmem_free(mfsp, sizeof(*mfsp)); return (error); } mountlist_append(mp); mp->mnt_vnodecovered = NULLVP; ump = VFSTOUFS(mp); fs = ump->um_fs; (void) copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0); (void)ffs_statvfs(mp, &mp->mnt_stat); vfs_unbusy(mp); return (0); } /* * VFS Operations. * * mount system call */ /* ARGSUSED */ int mfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; struct vnode *devvp; struct mfs_args *args = data; struct ufsmount *ump; struct fs *fs; struct mfsnode *mfsp; struct proc *p; devminor_t minor; int flags, error = 0; if (args == NULL) return EINVAL; if (*data_len < sizeof *args) return EINVAL; p = l->l_proc; if (mp->mnt_flag & MNT_GETARGS) { struct vnode *vp; ump = VFSTOUFS(mp); if (ump == NULL) return EIO; vp = ump->um_devvp; if (vp == NULL) return EIO; mfsp = VTOMFS(vp); if (mfsp == NULL) return EIO; args->fspec = NULL; args->base = mfsp->mfs_baseoff; args->size = mfsp->mfs_size; *data_len = sizeof *args; return 0; } /* * XXX turn off async to avoid hangs when writing lots of data. * the problem is that MFS needs to allocate pages to clean pages, * so if we wait until the last minute to clean pages then there * may not be any pages available to do the cleaning. * ... and since the default partially-synchronous mode turns out * to not be sufficient under heavy load, make it full synchronous. */ mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_flag |= MNT_SYNCHRONOUS; /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = ffs_flushfiles(mp, flags, l); if (error) return (error); } if (fs->fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) fs->fs_ronly = 0; if (args->fspec == NULL) return EINVAL; return (0); } mutex_enter(&mfs_lock); minor = mfs_minor++; mutex_exit(&mfs_lock); error = bdevvp(makedev(255, minor), &devvp); if (error) return (error); mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP); /* * Changing v_op and v_data here is safe as we are * the exclusive owner of this device node. 
*/ KASSERT(devvp->v_op == spec_vnodeop_p); KASSERT(devvp->v_data == NULL); devvp->v_op = mfs_vnodeop_p; devvp->v_data = mfsp; mfsp->mfs_baseoff = args->base; mfsp->mfs_size = args->size; mfsp->mfs_vnode = devvp; mfsp->mfs_proc = p; mfsp->mfs_shutdown = 0; cv_init(&mfsp->mfs_cv, "mfsidl"); mfsp->mfs_refcnt = 1; bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0); if ((error = ffs_mountfs(devvp, mp, l)) != 0) { mfsp->mfs_shutdown = 1; vrele(devvp); return (error); } ump = VFSTOUFS(mp); fs = ump->um_fs; error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); if (error) return error; (void)strncpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, sizeof(fs->fs_fsmnt)); fs->fs_fsmnt[sizeof(fs->fs_fsmnt) - 1] = '\0'; /* XXX: cleanup on error */ return 0; } /* * Used to grab the process and keep it in the kernel to service * memory filesystem I/O requests. * * Loop servicing I/O requests. * Copy the requested data into or out of the memory filesystem * address space. */ /* ARGSUSED */ int mfs_start(struct mount *mp, int flags) { struct vnode *vp; struct mfsnode *mfsp; struct proc *p; struct buf *bp; void *base; int sleepreturn = 0, refcnt, error; ksiginfoq_t kq; /* * Ensure that file system is still mounted when getting mfsnode. * Add a reference to the mfsnode to prevent it disappearing in * this routine. */ if ((error = vfs_busy(mp)) != 0) return error; vp = VFSTOUFS(mp)->um_devvp; mfsp = VTOMFS(vp); mutex_enter(&mfs_lock); mfsp->mfs_refcnt++; mutex_exit(&mfs_lock); vfs_unbusy(mp); base = mfsp->mfs_baseoff; mutex_enter(&mfs_lock); while (mfsp->mfs_shutdown != 1) { while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) { mutex_exit(&mfs_lock); mfs_doio(bp, base); mutex_enter(&mfs_lock); } /* * If a non-ignored signal is received, try to unmount. * If that fails, or the filesystem is already in the * process of being unmounted, clear the signal (it has been * "processed"), otherwise we will loop here, as tsleep * will always return EINTR/ERESTART. */ if (sleepreturn != 0) { mutex_exit(&mfs_lock); if (dounmount(mp, 0, curlwp) != 0) { p = curproc; ksiginfo_queue_init(&kq); mutex_enter(p->p_lock); sigclearall(p, NULL, &kq); mutex_exit(p->p_lock); ksiginfo_queue_drain(&kq); } sleepreturn = 0; mutex_enter(&mfs_lock); continue; } sleepreturn = cv_wait_sig(&mfsp->mfs_cv, &mfs_lock); } KASSERT(bufq_peek(mfsp->mfs_buflist) == NULL); refcnt = --mfsp->mfs_refcnt; mutex_exit(&mfs_lock); if (refcnt == 0) { bufq_free(mfsp->mfs_buflist); cv_destroy(&mfsp->mfs_cv); kmem_free(mfsp, sizeof(*mfsp)); } return (sleepreturn); } /* * Get file system statistics. */ int mfs_statvfs(struct mount *mp, struct statvfs *sbp) { int error; error = ffs_statvfs(mp, sbp); if (error) return error; (void)strncpy(sbp->f_fstypename, mp->mnt_op->vfs_name, sizeof(sbp->f_fstypename)); sbp->f_fstypename[sizeof(sbp->f_fstypename) - 1] = '\0'; return 0; }
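mfs_start() above is the heart of MFS: the mounting process never returns to user space, but loops in the kernel draining I/O requests from the mfsnode's buffer queue under mfs_lock and sleeping on mfs_cv until a request, a signal, or shutdown arrives. The following self-contained pthreads sketch is an analogue for illustration only (not kernel code, and all names such as svcq and service are hypothetical); it shows the same drain-then-sleep condition-variable pattern in userland terms.

/*
 * Illustrative analogue of the mfs_start() service loop.  A mutex and
 * condition variable stand in for mfs_lock and mfsp->mfs_cv, an integer
 * counter stands in for the buffer queue, and a flag for mfs_shutdown.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct svcq {
	pthread_mutex_t lock;	/* plays the role of mfs_lock */
	pthread_cond_t cv;	/* plays the role of mfsp->mfs_cv */
	int pending;		/* queued requests (bufq analogue) */
	bool shutdown;		/* mfs_shutdown analogue */
};

static void *
service(void *arg)
{
	struct svcq *q = arg;

	pthread_mutex_lock(&q->lock);
	for (;;) {
		/* Drain all queued requests, dropping the lock per request. */
		while (q->pending > 0) {
			q->pending--;
			pthread_mutex_unlock(&q->lock);
			printf("servicing one request\n");	/* mfs_doio() analogue */
			pthread_mutex_lock(&q->lock);
		}
		if (q->shutdown)
			break;
		/* Sleep until more work or shutdown arrives (cv_wait_sig analogue). */
		pthread_cond_wait(&q->cv, &q->lock);
	}
	pthread_mutex_unlock(&q->lock);
	return NULL;
}

int
main(void)
{
	struct svcq q = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, false
	};
	pthread_t t;

	pthread_create(&t, NULL, service, &q);

	/* Enqueue some work and wake the service loop. */
	pthread_mutex_lock(&q.lock);
	q.pending = 3;
	pthread_cond_signal(&q.cv);
	pthread_mutex_unlock(&q.lock);

	/* Request shutdown, as the unmount path does via mfs_shutdown. */
	pthread_mutex_lock(&q.lock);
	q.shutdown = true;
	pthread_cond_signal(&q.cv);
	pthread_mutex_unlock(&q.lock);

	pthread_join(t, NULL);
	return 0;
}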
/* $NetBSD: ffs_vfsops.c,v 1.382 2023/09/08 23:21:55 riastradh Exp $ */ /*- * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Wasabi Systems, Inc, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.382 2023/09/08 23:21:55 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" #include "opt_quota.h" #include "opt_wapbl.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/proc.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/fstrans.h> #include <sys/socket.h> #include <sys/mount.h> #include <sys/buf.h> #include <sys/device.h> #include <sys/disk.h> #include <sys/file.h> #include <sys/disklabel.h> #include <sys/ioctl.h> #include <sys/errno.h> #include <sys/kmem.h> #include <sys/pool.h> #include <sys/lock.h> #include <sys/sysctl.h> #include <sys/conf.h> #include <sys/kauth.h> #include <sys/wapbl.h> #include <sys/module.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_bswap.h> #include <ufs/ufs/ufs_wapbl.h> #include <ufs/ffs/fs.h> #include <ufs/ffs/ffs_extern.h> #ifdef WAPBL MODULE(MODULE_CLASS_VFS, ffs, "ufs,wapbl"); #else MODULE(MODULE_CLASS_VFS, ffs, "ufs"); #endif static int ffs_vfs_fsync(vnode_t *, int); static int ffs_superblock_validate(struct fs *); static int ffs_is_appleufs(struct vnode *, struct fs *); static int ffs_init_vnode(struct ufsmount *, struct vnode *, ino_t); static void ffs_deinit_vnode(struct ufsmount *, struct vnode *); static kauth_listener_t ffs_snapshot_listener; /* how many times ffs_init() was called */ int ffs_initcount = 0; #ifdef DEBUG_FFS_MOUNT #define DPRINTF(_fmt, args...) printf("%s: " _fmt "\n", __func__, ##args) #else #define DPRINTF(_fmt, args...) 
do {} while (/*CONSTCOND*/0) #endif extern const struct vnodeopv_desc ffs_vnodeop_opv_desc; extern const struct vnodeopv_desc ffs_specop_opv_desc; extern const struct vnodeopv_desc ffs_fifoop_opv_desc; const struct vnodeopv_desc * const ffs_vnodeopv_descs[] = { &ffs_vnodeop_opv_desc, &ffs_specop_opv_desc, &ffs_fifoop_opv_desc, NULL, }; struct vfsops ffs_vfsops = { .vfs_name = MOUNT_FFS, .vfs_min_mount_data = sizeof (struct ufs_args), .vfs_mount = ffs_mount, .vfs_start = ufs_start, .vfs_unmount = ffs_unmount, .vfs_root = ufs_root, .vfs_quotactl = ufs_quotactl, .vfs_statvfs = ffs_statvfs, .vfs_sync = ffs_sync, .vfs_vget = ufs_vget, .vfs_loadvnode = ffs_loadvnode, .vfs_newvnode = ffs_newvnode, .vfs_fhtovp = ffs_fhtovp, .vfs_vptofh = ffs_vptofh, .vfs_init = ffs_init, .vfs_reinit = ffs_reinit, .vfs_done = ffs_done, .vfs_mountroot = ffs_mountroot, .vfs_snapshot = ffs_snapshot, .vfs_extattrctl = ffs_extattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = ffs_vfs_fsync, .vfs_opv_descs = ffs_vnodeopv_descs }; static const struct genfs_ops ffs_genfsops = { .gop_size = ffs_gop_size, .gop_alloc = ufs_gop_alloc, .gop_write = genfs_gop_write, .gop_markupdate = ufs_gop_markupdate, .gop_putrange = genfs_gop_putrange, }; static const struct ufs_ops ffs_ufsops = { .uo_itimes = ffs_itimes, .uo_update = ffs_update, .uo_truncate = ffs_truncate, .uo_balloc = ffs_balloc, .uo_snapgone = ffs_snapgone, .uo_bufrd = ffs_bufrd, .uo_bufwr = ffs_bufwr, }; static int ffs_checkrange(struct mount *mp, ino_t ino) { struct fs *fs = VFSTOUFS(mp)->um_fs; if (ino < UFS_ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) { DPRINTF("out of range %" PRIu64 "\n", ino); return ESTALE; } /* * Need to check if inode is initialized because ffsv2 does * lazy initialization and we can get here from nfs_fhtovp */ if (fs->fs_magic != FS_UFS2_MAGIC) return 0; struct buf *bp; int cg = ino_to_cg(fs, ino); struct ufsmount *ump = VFSTOUFS(mp); int error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, B_MODIFY, &bp); if (error) { DPRINTF("error %d reading cg %d ino %" PRIu64 "\n", error, cg, ino); return error; } const int needswap = UFS_FSNEEDSWAP(fs); struct cg *cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp, needswap)) { brelse(bp, 0); DPRINTF("bad cylinder group magic cg %d ino %" PRIu64 "\n", cg, ino); return ESTALE; } int32_t initediblk = ufs_rw32(cgp->cg_initediblk, needswap); brelse(bp, 0); if (cg * fs->fs_ipg + initediblk < ino) { DPRINTF("cg=%d fs->fs_ipg=%d initediblk=%d ino=%" PRIu64 "\n", cg, fs->fs_ipg, initediblk, ino); return ESTALE; } return 0; } static int ffs_snapshot_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { vnode_t *vp = arg2; int result = KAUTH_RESULT_DEFER; if (action != KAUTH_SYSTEM_FS_SNAPSHOT) return result; if (VTOI(vp)->i_uid == kauth_cred_geteuid(cred)) result = KAUTH_RESULT_ALLOW; return result; } SYSCTL_SETUP(ffs_sysctl_setup, "ffs sysctls") { #ifdef UFS_EXTATTR extern int ufs_extattr_autocreate; #endif extern int ffs_log_changeopt; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ffs", SYSCTL_DESCR("Berkeley Fast File System"), NULL, 0, NULL, 0, CTL_VFS, 1, CTL_EOL); /* * @@@ should we even bother with these first three? 
*/ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "doclusterread", NULL, sysctl_notavail, 0, NULL, 0, CTL_VFS, 1, FFS_CLUSTERREAD, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "doclusterwrite", NULL, sysctl_notavail, 0, NULL, 0, CTL_VFS, 1, FFS_CLUSTERWRITE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "doreallocblks", NULL, sysctl_notavail, 0, NULL, 0, CTL_VFS, 1, FFS_REALLOCBLKS, CTL_EOL); #if 0 sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "doasyncfree", SYSCTL_DESCR("Release dirty blocks asynchronously"), NULL, 0, &doasyncfree, 0, CTL_VFS, 1, FFS_ASYNCFREE, CTL_EOL); #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "log_changeopt", SYSCTL_DESCR("Log changes in optimization strategy"), NULL, 0, &ffs_log_changeopt, 0, CTL_VFS, 1, FFS_LOG_CHANGEOPT, CTL_EOL); #ifdef UFS_EXTATTR sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "extattr_autocreate", SYSCTL_DESCR("Size of attribute for " "backing file autocreation"), NULL, 0, &ufs_extattr_autocreate, 0, CTL_VFS, 1, FFS_EXTATTR_AUTOCREATE, CTL_EOL); #endif /* UFS_EXTATTR */ } static int ffs_modcmd(modcmd_t cmd, void *arg) { int error; #if 0 extern int doasyncfree; #endif switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&ffs_vfsops); if (error != 0) break; ffs_snapshot_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, ffs_snapshot_cb, NULL); if (ffs_snapshot_listener == NULL) printf("ffs_modcmd: can't listen on system scope.\n"); break; case MODULE_CMD_FINI: error = vfs_detach(&ffs_vfsops); if (error != 0) break; if (ffs_snapshot_listener != NULL) kauth_unlisten_scope(ffs_snapshot_listener); break; default: error = ENOTTY; break; } return (error); } pool_cache_t ffs_inode_cache; pool_cache_t ffs_dinode1_cache; pool_cache_t ffs_dinode2_cache; static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, daddr_t); static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); /* * Called by main() when ffs is going to be mounted as root. */ int ffs_mountroot(void) { struct fs *fs; struct mount *mp; struct lwp *l = curlwp; /* XXX */ struct ufsmount *ump; int error; if (device_class(root_device) != DV_DISK) return (ENODEV); if ((error = vfs_rootmountalloc(MOUNT_FFS, "root_device", &mp))) { vrele(rootvp); return (error); } /* * We always need to be able to mount the root file system. 
*/ mp->mnt_flag |= MNT_FORCE; if ((error = ffs_mountfs(rootvp, mp, l)) != 0) { vfs_unbusy(mp); vfs_rele(mp); return (error); } mp->mnt_flag &= ~MNT_FORCE; mountlist_append(mp); ump = VFSTOUFS(mp); fs = ump->um_fs; memset(fs->fs_fsmnt, 0, sizeof(fs->fs_fsmnt)); (void)copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0); (void)ffs_statvfs(mp, &mp->mnt_stat); vfs_unbusy(mp); setrootfstime((time_t)fs->fs_time); return (0); } static int ffs_acls(struct mount *mp, int fs_flags) { struct ufsmount *ump; ump = VFSTOUFS(mp); if (ump->um_fstype == UFS2 && (ump->um_flags & UFS_EA) == 0 && ((mp->mnt_flag & (MNT_POSIX1EACLS | MNT_NFS4ACLS)) != 0 || (fs_flags & (FS_POSIX1EACLS | FS_NFS4ACLS)) != 0)) { printf("%s: ACLs requested but not supported by this fs\n", mp->mnt_stat.f_mntonname); return EINVAL; } if ((fs_flags & FS_POSIX1EACLS) != 0) { #ifdef UFS_ACL if (mp->mnt_flag & MNT_NFS4ACLS) printf("WARNING: %s: POSIX.1e ACLs flag on fs conflicts " "with \"nfsv4acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_NFS4ACLS; mp->mnt_flag |= MNT_POSIX1EACLS; #else printf("WARNING: %s: POSIX.1e ACLs flag on fs but no " "ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs_flags & FS_NFS4ACLS) != 0) { #ifdef UFS_ACL if (mp->mnt_flag & MNT_POSIX1EACLS) printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts " "with \"posix1eacls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_POSIX1EACLS; mp->mnt_flag |= MNT_NFS4ACLS; #else printf("WARNING: %s: NFSv4 ACLs flag on fs but no " "ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((mp->mnt_flag & (MNT_NFS4ACLS | MNT_POSIX1EACLS)) == (MNT_NFS4ACLS | MNT_POSIX1EACLS)) { printf("%s: \"posix1eacls\" and \"nfsv4acls\" options " "are mutually exclusive\n", mp->mnt_stat.f_mntonname); return EINVAL; } if (mp->mnt_flag & (MNT_NFS4ACLS | MNT_POSIX1EACLS)) mp->mnt_iflag &= ~(IMNT_SHRLOOKUP|IMNT_NCLOOKUP); else mp->mnt_iflag |= IMNT_SHRLOOKUP|IMNT_NCLOOKUP; return 0; } /* * VFS Operations. * * mount system call */ int ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; struct vnode *devvp = NULL; struct ufs_args *args = data; struct ufsmount *ump = NULL; struct fs *fs; int error = 0, flags, update; mode_t accessmode; if (args == NULL) { DPRINTF("NULL args"); return EINVAL; } if (*data_len < sizeof(*args)) { DPRINTF("bad size args %zu != %zu", *data_len, sizeof(*args)); return EINVAL; } ump = VFSTOUFS(mp); if ((mp->mnt_flag & (MNT_GETARGS|MNT_UPDATE)) && ump == NULL) { DPRINTF("no ump"); return EIO; } if (mp->mnt_flag & MNT_GETARGS) { args->fspec = NULL; *data_len = sizeof *args; return 0; } update = mp->mnt_flag & MNT_UPDATE; /* Check arguments */ if (args->fspec == NULL) { if (!update) { /* New mounts must have a filename for the device */ DPRINTF("no filename for mount"); return EINVAL; } } else { /* * Look up the name and verify that it's sane. 
*/ error = namei_simple_user(args->fspec, NSM_FOLLOW_NOEMULROOT, &devvp); if (error != 0) { DPRINTF("namei_simple_user returned %d", error); return error; } /* * Be sure this is a valid block device */ if (devvp->v_type != VBLK) { DPRINTF("non block device %d", devvp->v_type); error = ENOTBLK; goto fail; } if (bdevsw_lookup(devvp->v_rdev) == NULL) { DPRINTF("can't find block device 0x%jx", devvp->v_rdev); error = ENXIO; goto fail; } if (update) { /* * Be sure we're still naming the same device * used for our initial mount */ if (devvp != ump->um_devvp && devvp->v_rdev != ump->um_devvp->v_rdev) { DPRINTF("wrong device 0x%jx != 0x%jx", (uintmax_t)devvp->v_rdev, (uintmax_t)ump->um_devvp->v_rdev); error = EINVAL; goto fail; } vrele(devvp); devvp = NULL; } } if (devvp == NULL) { devvp = ump->um_devvp; vref(devvp); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. * * Permission to update a mount is checked higher, so here we presume * updating the mount is okay (for example, as far as securelevel goes) * which leaves us with the normal check. */ accessmode = VREAD; if (update ? (mp->mnt_iflag & IMNT_WANTRDWR) != 0 : (mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT, KAUTH_REQ_SYSTEM_MOUNT_DEVICE, mp, devvp, KAUTH_ARG(accessmode)); VOP_UNLOCK(devvp); if (error) { DPRINTF("kauth returned %d", error); goto fail; } #ifdef WAPBL /* WAPBL can only be enabled on a r/w mount. */ if (((mp->mnt_flag & MNT_RDONLY) && !(mp->mnt_iflag & IMNT_WANTRDWR)) || (mp->mnt_iflag & IMNT_WANTRDONLY)) { mp->mnt_flag &= ~MNT_LOG; } #else /* !WAPBL */ mp->mnt_flag &= ~MNT_LOG; #endif /* !WAPBL */ error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); if (error) goto fail; if (!update) { int xflags; if (mp->mnt_flag & MNT_RDONLY) xflags = FREAD; else xflags = FREAD | FWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_OPEN(devvp, xflags, FSCRED); VOP_UNLOCK(devvp); if (error) { DPRINTF("VOP_OPEN returned %d", error); goto fail; } /* Need fstrans_start() for assertion in ufs_strategy(). */ if ((mp->mnt_flag & MNT_RDONLY) == 0) fstrans_start(mp); error = ffs_mountfs(devvp, mp, l); if ((mp->mnt_flag & MNT_RDONLY) == 0) fstrans_done(mp); if (error) { DPRINTF("ffs_mountfs returned %d", error); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); (void)VOP_CLOSE(devvp, xflags, NOCRED); VOP_UNLOCK(devvp); goto fail; } ump = VFSTOUFS(mp); fs = ump->um_fs; } else { /* * Update the mount. */ /* * The initial mount got a reference on this * device, so drop the one obtained via * namei(), above. 
*/ vrele(devvp); ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_ronly == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY)) { /* * Changing from r/w to r/o */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = ffs_flushfiles(mp, flags, l); if (error) return error; error = UFS_WAPBL_BEGIN(mp); if (error) { DPRINTF("wapbl %d", error); return error; } if (ffs_cgupdate(ump, MNT_WAIT) == 0 && fs->fs_clean & FS_WASCLEAN) { if (mp->mnt_flag & MNT_SOFTDEP) fs->fs_flags &= ~FS_DOSOFTDEP; fs->fs_clean = FS_ISCLEAN; (void) ffs_sbupdate(ump, MNT_WAIT); } UFS_WAPBL_END(mp); } #ifdef WAPBL if ((mp->mnt_flag & MNT_LOG) == 0) { error = ffs_wapbl_stop(mp, mp->mnt_flag & MNT_FORCE); if (error) { DPRINTF("ffs_wapbl_stop returned %d", error); return error; } } #endif /* WAPBL */ if (fs->fs_ronly == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY)) { /* * Finish change from r/w to r/o */ fs->fs_ronly = 1; fs->fs_fmod = 0; } error = ffs_acls(mp, fs->fs_flags); if (error) return error; if (mp->mnt_flag & MNT_RELOAD) { error = ffs_reload(mp, l->l_cred, l); if (error) { DPRINTF("ffs_reload returned %d", error); return error; } } if (fs->fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { /* * Changing from read-only to read/write */ #ifndef QUOTA2 if (fs->fs_flags & FS_DOQUOTA2) { ump->um_flags |= UFS_QUOTA2; uprintf("%s: options QUOTA2 not enabled%s\n", mp->mnt_stat.f_mntonname, (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting"); DPRINTF("ffs_quota2 %d", EINVAL); return EINVAL; } #endif fs->fs_ronly = 0; fs->fs_clean = fs->fs_clean == FS_ISCLEAN ? FS_WASCLEAN : 0; fs->fs_fmod = 1; #ifdef WAPBL if (fs->fs_flags & FS_DOWAPBL) { const char *nm = mp->mnt_stat.f_mntonname; if (!mp->mnt_wapbl_replay) { printf("%s: log corrupted;" " replay cancelled\n", nm); return EFTYPE; } printf("%s: replaying log to disk\n", nm); error = wapbl_replay_write(mp->mnt_wapbl_replay, devvp); if (error) { DPRINTF("%s: wapbl_replay_write %d", nm, error); return error; } wapbl_replay_stop(mp->mnt_wapbl_replay); fs->fs_clean = FS_WASCLEAN; } #endif /* WAPBL */ if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); } #ifdef WAPBL error = ffs_wapbl_start(mp); if (error) { DPRINTF("ffs_wapbl_start returned %d", error); return error; } #endif /* WAPBL */ #ifdef QUOTA2 if (!fs->fs_ronly) { error = ffs_quota2_mount(mp); if (error) { DPRINTF("ffs_quota2_mount returned %d", error); return error; } } #endif if ((mp->mnt_flag & MNT_DISCARD) && !(ump->um_discarddata)) ump->um_discarddata = ffs_discard_init(devvp, fs); if (args->fspec == NULL) return 0; } (void)strncpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, sizeof(fs->fs_fsmnt)); fs->fs_flags &= ~FS_DOSOFTDEP; if ((fs->fs_ronly && (fs->fs_clean & FS_ISCLEAN) == 0) || (!fs->fs_ronly && (fs->fs_clean & FS_WASCLEAN) == 0)) { printf("%s: file system not clean (fs_clean=%#x); " "please fsck(8)\n", mp->mnt_stat.f_mntfromname, fs->fs_clean); } if (fs->fs_fmod != 0) { int err; KASSERT(!fs->fs_ronly); if (fs->fs_clean & FS_WASCLEAN) fs->fs_time = time_second; fs->fs_fmod = 0; err = UFS_WAPBL_BEGIN(mp); if (err == 0) { (void) ffs_cgupdate(ump, MNT_WAIT); UFS_WAPBL_END(mp); } } if ((mp->mnt_flag & MNT_SOFTDEP) != 0) { printf("%s: `-o softdep' is no longer supported, " "consider `-o log'\n", mp->mnt_stat.f_mntfromname); mp->mnt_flag &= ~MNT_SOFTDEP; } return (error); fail: vrele(devvp); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. 
* * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ int ffs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l) { struct vnode *vp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; int i, bsize, blks, error; int32_t *lp, fs_sbsize; struct ufsmount *ump; daddr_t sblockloc; struct vnode_iterator *marker; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); ump = VFSTOUFS(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = ump->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = vinvalbuf(devvp, 0, cred, l, 0, 0); VOP_UNLOCK(devvp); if (error) panic("%s: dirty1", __func__); /* * Step 2: re-read superblock from disk. XXX: We don't handle * possibility that superblock moved. Which implies that we don't * want its size to change either. */ fs = ump->um_fs; fs_sbsize = fs->fs_sbsize; error = bread(devvp, fs->fs_sblockloc / DEV_BSIZE, fs_sbsize, 0, &bp); if (error) return (error); newfs = kmem_alloc(fs_sbsize, KM_SLEEP); memcpy(newfs, bp->b_data, fs_sbsize); #ifdef FFS_EI if (ump->um_flags & UFS_NEEDSWAP) { ffs_sb_swap((struct fs *)bp->b_data, newfs); newfs->fs_flags |= FS_SWAPPED; } else #endif newfs->fs_flags &= ~FS_SWAPPED; brelse(bp, 0); /* Allow converting from UFS2 to UFS2EA but not vice versa. */ if (newfs->fs_magic == FS_UFS2EA_MAGIC) { ump->um_flags |= UFS_EA; newfs->fs_magic = FS_UFS2_MAGIC; } else { if ((ump->um_flags & UFS_EA) != 0) return EINVAL; } if ((newfs->fs_magic != FS_UFS1_MAGIC) && (newfs->fs_magic != FS_UFS2_MAGIC)) { kmem_free(newfs, fs_sbsize); return (EIO); /* XXX needs translation */ } if (!ffs_superblock_validate(newfs)) { kmem_free(newfs, fs_sbsize); return (EINVAL); } /* * The current implementation doesn't handle the possibility that * these values may have changed. */ if ((newfs->fs_sbsize != fs_sbsize) || (newfs->fs_cssize != fs->fs_cssize) || (newfs->fs_contigsumsize != fs->fs_contigsumsize) || (newfs->fs_ncg != fs->fs_ncg)) { kmem_free(newfs, fs_sbsize); return (EINVAL); } /* Store off old fs_sblockloc for fs_oldfscompat_read. */ sblockloc = fs->fs_sblockloc; /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ newfs->fs_csp = fs->fs_csp; newfs->fs_maxcluster = fs->fs_maxcluster; newfs->fs_contigdirs = fs->fs_contigdirs; newfs->fs_ronly = fs->fs_ronly; newfs->fs_active = fs->fs_active; memcpy(fs, newfs, (u_int)fs_sbsize); kmem_free(newfs, fs_sbsize); /* * Recheck for Apple UFS filesystem. */ ump->um_flags &= ~UFS_ISAPPLEUFS; if (ffs_is_appleufs(devvp, fs)) { #ifdef APPLE_UFS ump->um_flags |= UFS_ISAPPLEUFS; #else DPRINTF("AppleUFS not supported"); return (EIO); /* XXX: really? 
*/ #endif } if (UFS_MPISAPPLEUFS(ump)) { /* see comment about NeXT below */ ump->um_maxsymlinklen = APPLEUFS_MAXSYMLINKLEN; ump->um_dirblksiz = APPLEUFS_DIRBLKSIZ; mp->mnt_iflag |= IMNT_DTYPE; } else { ump->um_maxsymlinklen = fs->fs_maxsymlinklen; ump->um_dirblksiz = UFS_DIRBLKSIZ; if (ump->um_maxsymlinklen > 0) mp->mnt_iflag |= IMNT_DTYPE; else mp->mnt_iflag &= ~IMNT_DTYPE; } ffs_oldfscompat_read(fs, ump, sblockloc); mutex_enter(&ump->um_lock); ump->um_maxfilesize = fs->fs_maxfilesize; if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) { uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n", mp->mnt_stat.f_mntonname, fs->fs_flags, (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting"); if ((mp->mnt_flag & MNT_FORCE) == 0) { mutex_exit(&ump->um_lock); return (EINVAL); } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } mutex_exit(&ump->um_lock); ffs_statvfs(mp, &mp->mnt_stat); /* * Step 3: re-read summary information from disk. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { bsize = fs->fs_bsize; if (i + fs->fs_frag > blks) bsize = (blks - i) * fs->fs_fsize; error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + i), bsize, 0, &bp); if (error) { return (error); } #ifdef FFS_EI if (UFS_FSNEEDSWAP(fs)) ffs_csum_swap((struct csum *)bp->b_data, (struct csum *)space, bsize); else #endif memcpy(space, bp->b_data, (size_t)bsize); space = (char *)space + bsize; brelse(bp, 0); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { lp = fs->fs_maxcluster; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } vfs_vnode_iterator_init(mp, &marker); while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) { /* * Step 4: invalidate all inactive vnodes. */ if (vrecycle(vp)) continue; /* * Step 5: invalidate all cached file data. */ if (vn_lock(vp, LK_EXCLUSIVE)) { vrele(vp); continue; } if (vinvalbuf(vp, 0, cred, l, 0, 0)) panic("%s: dirty2", __func__); /* * Step 6: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, FFS_FSBTODB(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, 0, &bp); if (error) { vput(vp); break; } ffs_load_inode(bp, ip, fs, ip->i_number); brelse(bp, 0); vput(vp); } vfs_vnode_iterator_destroy(marker); return (error); } /* * Possible superblock locations ordered from most to least likely. */ static const int sblock_try[] = SBLOCKSEARCH; static int ffs_superblock_validate(struct fs *fs) { int32_t i, fs_bshift = 0, fs_fshift = 0, fs_fragshift = 0, fs_frag; int32_t fs_inopb; /* Check the superblock size */ if (fs->fs_sbsize > SBLOCKSIZE || fs->fs_sbsize < sizeof(struct fs)) return 0; /* Check the file system blocksize */ if (fs->fs_bsize > MAXBSIZE || fs->fs_bsize < MINBSIZE) return 0; if (!powerof2(fs->fs_bsize)) return 0; /* Check the size of frag blocks */ if (!powerof2(fs->fs_fsize)) return 0; if (fs->fs_fsize == 0) return 0; /* * XXX: these values are just zero-checked to prevent obvious * bugs. We need more strict checks. 
*/ if (fs->fs_size == 0 && fs->fs_old_size == 0) return 0; if (fs->fs_cssize == 0) return 0; if (fs->fs_ipg == 0) return 0; if (fs->fs_fpg == 0) return 0; if (fs->fs_ncg == 0) return 0; if (fs->fs_maxbpg == 0) return 0; /* Check the number of inodes per block */ if (fs->fs_magic == FS_UFS1_MAGIC) fs_inopb = fs->fs_bsize / sizeof(struct ufs1_dinode); else /* fs->fs_magic == FS_UFS2_MAGIC */ fs_inopb = fs->fs_bsize / sizeof(struct ufs2_dinode); if (fs->fs_inopb != fs_inopb) return 0; /* Block size cannot be smaller than fragment size */ if (fs->fs_bsize < fs->fs_fsize) return 0; /* Compute fs_bshift and ensure it is consistent */ for (i = fs->fs_bsize; i > 1; i >>= 1) fs_bshift++; if (fs->fs_bshift != fs_bshift) return 0; /* Compute fs_fshift and ensure it is consistent */ for (i = fs->fs_fsize; i > 1; i >>= 1) fs_fshift++; if (fs->fs_fshift != fs_fshift) return 0; /* Compute fs_fragshift and ensure it is consistent */ for (i = fs->fs_frag; i > 1; i >>= 1) fs_fragshift++; if (fs->fs_fragshift != fs_fragshift) return 0; /* Check the masks */ if (fs->fs_bmask != ~(fs->fs_bsize - 1)) return 0; if (fs->fs_fmask != ~(fs->fs_fsize - 1)) return 0; /* * Now that the shifts and masks are sanitized, we can use the ffs_ API. */ /* Check the number of frag blocks */ if ((fs_frag = ffs_numfrags(fs, fs->fs_bsize)) > MAXFRAG) return 0; if (fs->fs_frag != fs_frag) return 0; /* Check the size of cylinder groups */ if ((fs->fs_cgsize < sizeof(struct cg)) || (fs->fs_cgsize > fs->fs_bsize)) return 0; return 1; } static int ffs_is_appleufs(struct vnode *devvp, struct fs *fs) { struct dkwedge_info dkw; int ret = 0; /* * First check to see if this is tagged as an Apple UFS filesystem * in the disklabel. */ if (getdiskinfo(devvp, &dkw) == 0 && strcmp(dkw.dkw_ptype, DKW_PTYPE_APPLEUFS) == 0) ret = 1; #ifdef APPLE_UFS else { struct appleufslabel *applefs; struct buf *bp; daddr_t blkno = APPLEUFS_LABEL_OFFSET / DEV_BSIZE; int error; /* * Manually look for an Apple UFS label, and if a valid one * is found, then treat it like an Apple UFS filesystem anyway. */ error = bread(devvp, blkno, APPLEUFS_LABEL_SIZE, 0, &bp); if (error) { DPRINTF("bread@0x%jx returned %d", (intmax_t)blkno, error); return 0; } applefs = (struct appleufslabel *)bp->b_data; error = ffs_appleufs_validate(fs->fs_fsmnt, applefs, NULL); if (error == 0) ret = 1; brelse(bp, 0); } #endif return ret; } /* * Common code for mount and mountroot */ int ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) { struct ufsmount *ump = NULL; struct buf *bp = NULL; struct fs *fs = NULL; dev_t dev; void *space; daddr_t sblockloc = 0; int blks, fstype = 0; int error, i, bsize, ronly, bset = 0; #ifdef FFS_EI int needswap = 0; /* keep gcc happy */ #endif int32_t *lp; kauth_cred_t cred; u_int32_t allocsbsize, fs_sbsize = 0; dev = devvp->v_rdev; cred = l ? l->l_cred : NOCRED; /* Flush out any old buffers remaining from a previous use. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0); VOP_UNLOCK(devvp); if (error) { DPRINTF("vinvalbuf returned %d", error); return error; } ronly = (mp->mnt_flag & MNT_RDONLY) != 0; ump = kmem_zalloc(sizeof(*ump), KM_SLEEP); mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE); error = ffs_snapshot_init(ump); if (error) { DPRINTF("ffs_snapshot_init returned %d", error); goto out; } ump->um_ops = &ffs_ufsops; #ifdef WAPBL sbagain: #endif /* * Try reading the superblock in each of its possible locations. 
*/ for (i = 0; ; i++) { daddr_t fs_sblockloc; if (bp != NULL) { brelse(bp, BC_NOCACHE); bp = NULL; } if (sblock_try[i] == -1) { DPRINTF("no superblock found"); error = EINVAL; fs = NULL; goto out; } error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE, 0, &bp); if (error) { DPRINTF("bread@0x%x returned %d", sblock_try[i] / DEV_BSIZE, error); fs = NULL; goto out; } fs = (struct fs *)bp->b_data; sblockloc = sblock_try[i]; DPRINTF("fs_magic 0x%x", fs->fs_magic); /* * Swap: here, we swap fs->fs_sbsize in order to get the correct * size to read the superblock. Once read, we swap the whole * superblock structure. */ if (fs->fs_magic == FS_UFS2EA_MAGIC) { ump->um_flags |= UFS_EA; fs->fs_magic = FS_UFS2_MAGIC; } else if (fs->fs_magic == FS_UFS2EA_MAGIC_SWAPPED) { ump->um_flags |= UFS_EA; fs->fs_magic = FS_UFS2_MAGIC_SWAPPED; } if (fs->fs_magic == FS_UFS1_MAGIC) { fs_sbsize = fs->fs_sbsize; fstype = UFS1; #ifdef FFS_EI needswap = 0; } else if (fs->fs_magic == FS_UFS1_MAGIC_SWAPPED) { fs_sbsize = bswap32(fs->fs_sbsize); fstype = UFS1; needswap = 1; #endif } else if (fs->fs_magic == FS_UFS2_MAGIC) { fs_sbsize = fs->fs_sbsize; fstype = UFS2; #ifdef FFS_EI needswap = 0; } else if (fs->fs_magic == FS_UFS2_MAGIC_SWAPPED) { fs_sbsize = bswap32(fs->fs_sbsize); fstype = UFS2; needswap = 1; #endif } else continue; /* fs->fs_sblockloc isn't defined for old filesystems */ if (fstype == UFS1 && !(fs->fs_old_flags & FS_FLAGS_UPDATED)) { if (sblockloc == SBLOCK_UFS2) /* * This is likely to be the first alternate * in a filesystem with 64k blocks. * Don't use it. */ continue; fs_sblockloc = sblockloc; } else { fs_sblockloc = fs->fs_sblockloc; #ifdef FFS_EI if (needswap) fs_sblockloc = bswap64(fs_sblockloc); #endif } /* Check we haven't found an alternate superblock */ if (fs_sblockloc != sblockloc) continue; /* Check the superblock size */ if (fs_sbsize > SBLOCKSIZE || fs_sbsize < sizeof(struct fs)) continue; fs = kmem_alloc((u_long)fs_sbsize, KM_SLEEP); memcpy(fs, bp->b_data, fs_sbsize); /* Swap the whole superblock structure, if necessary. */ #ifdef FFS_EI if (needswap) { ffs_sb_swap((struct fs*)bp->b_data, fs); fs->fs_flags |= FS_SWAPPED; } else #endif fs->fs_flags &= ~FS_SWAPPED; /* * Now that everything is swapped, the superblock is ready to * be sanitized. */ if (!ffs_superblock_validate(fs)) { kmem_free(fs, fs_sbsize); continue; } /* Ok seems to be a good superblock */ break; } ump->um_fs = fs; #ifdef WAPBL if ((mp->mnt_wapbl_replay == 0) && (fs->fs_flags & FS_DOWAPBL)) { error = ffs_wapbl_replay_start(mp, fs, devvp); if (error && (mp->mnt_flag & MNT_FORCE) == 0) { DPRINTF("ffs_wapbl_replay_start returned %d", error); goto out; } if (!error) { if (!ronly) { /* XXX fsmnt may be stale. 
*/ printf("%s: replaying log to disk\n", fs->fs_fsmnt); error = wapbl_replay_write(mp->mnt_wapbl_replay, devvp); if (error) { DPRINTF("wapbl_replay_write returned %d", error); goto out; } wapbl_replay_stop(mp->mnt_wapbl_replay); fs->fs_clean = FS_WASCLEAN; } else { /* XXX fsmnt may be stale */ printf("%s: replaying log to memory\n", fs->fs_fsmnt); } /* Force a re-read of the superblock */ brelse(bp, BC_INVAL); bp = NULL; kmem_free(fs, fs_sbsize); fs = NULL; goto sbagain; } } #else /* !WAPBL */ if ((fs->fs_flags & FS_DOWAPBL) && (mp->mnt_flag & MNT_FORCE) == 0) { error = EPERM; DPRINTF("no force %d", error); goto out; } #endif /* !WAPBL */ ffs_oldfscompat_read(fs, ump, sblockloc); ump->um_maxfilesize = fs->fs_maxfilesize; if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) { uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n", mp->mnt_stat.f_mntonname, fs->fs_flags, (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting"); if ((mp->mnt_flag & MNT_FORCE) == 0) { error = EINVAL; DPRINTF("no force %d", error); goto out; } } fs->fs_fmod = 0; if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } ump->um_fstype = fstype; if (fs->fs_sbsize < SBLOCKSIZE) brelse(bp, BC_INVAL); else brelse(bp, 0); bp = NULL; if (ffs_is_appleufs(devvp, fs)) { #ifdef APPLE_UFS ump->um_flags |= UFS_ISAPPLEUFS; #else DPRINTF("AppleUFS not supported"); error = EINVAL; goto out; #endif } #if 0 /* * XXX This code changes the behaviour of mounting dirty filesystems, to * XXX require "mount -f ..." to mount them. This doesn't match what * XXX mount(8) describes and is disabled for now. */ /* * If the file system is not clean, don't allow it to be mounted * unless MNT_FORCE is specified. (Note: MNT_FORCE is always set * for the root file system.) */ if (fs->fs_flags & FS_DOWAPBL) { /* * wapbl normally expects to be FS_WASCLEAN when the FS_DOWAPBL * bit is set, although there's a window in unmount where it * could be FS_ISCLEAN */ if ((mp->mnt_flag & MNT_FORCE) == 0 && (fs->fs_clean & (FS_WASCLEAN | FS_ISCLEAN)) == 0) { error = EPERM; goto out; } } else if ((fs->fs_clean & FS_ISCLEAN) == 0 && (mp->mnt_flag & MNT_FORCE) == 0) { error = EPERM; goto out; } #endif /* * Verify that we can access the last block in the fs * if we're mounting read/write. */ if (!ronly) { error = bread(devvp, FFS_FSBTODB(fs, fs->fs_size - 1), fs->fs_fsize, 0, &bp); if (error) { DPRINTF("bread@0x%jx returned %d", (intmax_t)FFS_FSBTODB(fs, fs->fs_size - 1), error); bset = BC_INVAL; goto out; } if (bp->b_bcount != fs->fs_fsize) { DPRINTF("bcount %x != fsize %x", bp->b_bcount, fs->fs_fsize); error = EINVAL; bset = BC_INVAL; goto out; } brelse(bp, BC_INVAL); bp = NULL; } fs->fs_ronly = ronly; /* Don't bump fs_clean if we're replaying journal */ if (!((fs->fs_flags & FS_DOWAPBL) && (fs->fs_clean & FS_WASCLEAN))) { if (ronly == 0) { fs->fs_clean = fs->fs_clean == FS_ISCLEAN ? 
FS_WASCLEAN : 0; fs->fs_fmod = 1; } } bsize = fs->fs_cssize; blks = howmany(bsize, fs->fs_fsize); if (fs->fs_contigsumsize > 0) bsize += fs->fs_ncg * sizeof(int32_t); bsize += fs->fs_ncg * sizeof(*fs->fs_contigdirs); allocsbsize = bsize; space = kmem_alloc((u_long)allocsbsize, KM_SLEEP); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { bsize = fs->fs_bsize; if (i + fs->fs_frag > blks) bsize = (blks - i) * fs->fs_fsize; error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + i), bsize, 0, &bp); if (error) { DPRINTF("bread@0x%jx %d", (intmax_t)FFS_FSBTODB(fs, fs->fs_csaddr + i), error); goto out1; } #ifdef FFS_EI if (needswap) ffs_csum_swap((struct csum *)bp->b_data, (struct csum *)space, bsize); else #endif memcpy(space, bp->b_data, (u_int)bsize); space = (char *)space + bsize; brelse(bp, 0); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } bsize = fs->fs_ncg * sizeof(*fs->fs_contigdirs); fs->fs_contigdirs = space; space = (char *)space + bsize; memset(fs->fs_contigdirs, 0, bsize); /* Compatibility for old filesystems - XXX */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; fs->fs_active = NULL; mp->mnt_data = ump; mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_FFS); mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; mp->mnt_stat.f_namemax = FFS_MAXNAMLEN; if (UFS_MPISAPPLEUFS(ump)) { /* NeXT used to keep short symlinks in the inode even * when using FS_42INODEFMT. In that case fs->fs_maxsymlinklen * is probably -1, but we still need to be able to identify * short symlinks. */ ump->um_maxsymlinklen = APPLEUFS_MAXSYMLINKLEN; ump->um_dirblksiz = APPLEUFS_DIRBLKSIZ; mp->mnt_iflag |= IMNT_DTYPE; } else { ump->um_maxsymlinklen = fs->fs_maxsymlinklen; ump->um_dirblksiz = UFS_DIRBLKSIZ; if (ump->um_maxsymlinklen > 0) mp->mnt_iflag |= IMNT_DTYPE; else mp->mnt_iflag &= ~IMNT_DTYPE; } mp->mnt_fs_bshift = fs->fs_bshift; mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ mp->mnt_flag |= MNT_LOCAL; mp->mnt_iflag |= IMNT_MPSAFE | IMNT_CAN_RWTORO | IMNT_SHRLOOKUP | IMNT_NCLOOKUP; #ifdef FFS_EI if (needswap) ump->um_flags |= UFS_NEEDSWAP; #endif error = ffs_acls(mp, fs->fs_flags); if (error) goto out1; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_lognindir = ffs(fs->fs_nindir) - 1; ump->um_bptrtodb = fs->fs_fshift - DEV_BSHIFT; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; spec_node_setmountedfs(devvp, mp); if (ronly == 0 && fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); #ifdef WAPBL if (!ronly) { KDASSERT(fs->fs_ronly == 0); /* * ffs_wapbl_start() needs mp->mnt_stat initialised if it * needs to create a new log file in-filesystem. */ error = ffs_statvfs(mp, &mp->mnt_stat); if (error) { DPRINTF("ffs_statvfs returned %d", error); goto out1; } error = ffs_wapbl_start(mp); if (error) { DPRINTF("ffs_wapbl_start returned %d", error); goto out1; } } #endif /* WAPBL */ if (ronly == 0) { #ifdef QUOTA2 error = ffs_quota2_mount(mp); if (error) { DPRINTF("ffs_quota2_mount returned %d", error); goto out1; } #else if (fs->fs_flags & FS_DOQUOTA2) { ump->um_flags |= UFS_QUOTA2; uprintf("%s: options QUOTA2 not enabled%s\n", mp->mnt_stat.f_mntonname, (mp->mnt_flag & MNT_FORCE) ? 
"" : ", not mounting"); if ((mp->mnt_flag & MNT_FORCE) == 0) { error = EINVAL; DPRINTF("quota disabled %d", error); goto out1; } } #endif } if (mp->mnt_flag & MNT_DISCARD) ump->um_discarddata = ffs_discard_init(devvp, fs); return (0); out1: kmem_free(fs->fs_csp, allocsbsize); out: #ifdef WAPBL if (mp->mnt_wapbl_replay) { wapbl_replay_stop(mp->mnt_wapbl_replay); wapbl_replay_free(mp->mnt_wapbl_replay); mp->mnt_wapbl_replay = 0; } #endif if (fs) kmem_free(fs, fs->fs_sbsize); spec_node_setmountedfs(devvp, NULL); if (bp) brelse(bp, bset); if (ump) { if (ump->um_oldfscompat) kmem_free(ump->um_oldfscompat, 512 + 3*sizeof(int32_t)); mutex_destroy(&ump->um_lock); kmem_free(ump, sizeof(*ump)); mp->mnt_data = NULL; } return (error); } /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(struct fs *fs, struct ufsmount *ump, daddr_t sblockloc) { off_t maxfilesize; int32_t *extrasave; if ((fs->fs_magic != FS_UFS1_MAGIC) || (fs->fs_old_flags & FS_FLAGS_UPDATED)) return; if (!ump->um_oldfscompat) ump->um_oldfscompat = kmem_alloc(512 + 3*sizeof(int32_t), KM_SLEEP); memcpy(ump->um_oldfscompat, &fs->fs_old_postbl_start, 512); extrasave = ump->um_oldfscompat; extrasave += 512/sizeof(int32_t); extrasave[0] = fs->fs_old_npsect; extrasave[1] = fs->fs_old_interleave; extrasave[2] = fs->fs_old_trackskew; /* These fields will be overwritten by their * original values in fs_oldfscompat_write, so it is harmless * to modify them here. */ fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_sblockloc = sblockloc; fs->fs_flags = fs->fs_old_flags | (fs->fs_flags & FS_INTERNAL); if (fs->fs_old_postblformat == FS_42POSTBLFMT) { fs->fs_old_nrpos = 8; fs->fs_old_npsect = fs->fs_old_nsect; fs->fs_old_interleave = 1; fs->fs_old_trackskew = 0; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = (u_quad_t) 1LL << 39; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } maxfilesize = (u_int64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; #if 0 if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } #endif } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. 
*/ static void ffs_oldfscompat_write(struct fs *fs, struct ufsmount *ump) { int32_t *extrasave; if ((fs->fs_magic != FS_UFS1_MAGIC) || (fs->fs_old_flags & FS_FLAGS_UPDATED)) return; fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_old_flags = fs->fs_flags; #if 0 if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; } #endif memcpy(&fs->fs_old_postbl_start, ump->um_oldfscompat, 512); extrasave = ump->um_oldfscompat; extrasave += 512/sizeof(int32_t); fs->fs_old_npsect = extrasave[0]; fs->fs_old_interleave = extrasave[1]; fs->fs_old_trackskew = extrasave[2]; } /* * unmount vfs operation */ int ffs_unmount(struct mount *mp, int mntflags) { struct lwp *l = curlwp; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs = ump->um_fs; int error, flags; u_int32_t bsize; #ifdef WAPBL extern int doforce; #endif if (ump->um_discarddata) { ffs_discard_finish(ump->um_discarddata, mntflags); ump->um_discarddata = NULL; } flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if ((error = ffs_flushfiles(mp, flags, l)) != 0) return (error); error = UFS_WAPBL_BEGIN(mp); if (error == 0) if (fs->fs_ronly == 0 && ffs_cgupdate(ump, MNT_WAIT) == 0 && fs->fs_clean & FS_WASCLEAN) { fs->fs_clean = FS_ISCLEAN; fs->fs_fmod = 0; (void) ffs_sbupdate(ump, MNT_WAIT); } if (error == 0) UFS_WAPBL_END(mp); #ifdef WAPBL KASSERT(!(mp->mnt_wapbl_replay && mp->mnt_wapbl)); if (mp->mnt_wapbl_replay) { KDASSERT(fs->fs_ronly); wapbl_replay_stop(mp->mnt_wapbl_replay); wapbl_replay_free(mp->mnt_wapbl_replay); mp->mnt_wapbl_replay = 0; } error = ffs_wapbl_stop(mp, doforce && (mntflags & MNT_FORCE)); if (error) { return error; } #endif /* WAPBL */ if (ump->um_devvp->v_type != VBAD) spec_node_setmountedfs(ump->um_devvp, NULL); vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); (void)VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD | FWRITE, NOCRED); vput(ump->um_devvp); bsize = fs->fs_cssize; if (fs->fs_contigsumsize > 0) bsize += fs->fs_ncg * sizeof(int32_t); bsize += fs->fs_ncg * sizeof(*fs->fs_contigdirs); kmem_free(fs->fs_csp, bsize); kmem_free(fs, fs->fs_sbsize); if (ump->um_oldfscompat != NULL) kmem_free(ump->um_oldfscompat, 512 + 3*sizeof(int32_t)); mutex_destroy(&ump->um_lock); ffs_snapshot_fini(ump); kmem_free(ump, sizeof(*ump)); mp->mnt_data = NULL; mp->mnt_flag &= ~MNT_LOCAL; return (0); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(struct mount *mp, int flags, struct lwp *l) { extern int doforce; struct ufsmount *ump; int error; if (!doforce) flags &= ~FORCECLOSE; ump = VFSTOUFS(mp); #ifdef QUOTA if ((error = quota1_umount(mp, flags)) != 0) return (error); #endif #ifdef QUOTA2 if ((error = quota2_umount(mp, flags)) != 0) return (error); #endif #ifdef UFS_EXTATTR if (ump->um_fstype == UFS1) { if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) ufs_extattr_stop(mp, l); if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED) ufs_extattr_uepm_destroy(&ump->um_extattr); mp->mnt_flag &= ~MNT_EXTATTR; } #endif if ((error = vflush(mp, 0, SKIPSYSTEM | flags)) != 0) return (error); ffs_snapshot_unmount(mp); /* * Flush all the files. */ error = vflush(mp, NULLVP, flags); if (error) return (error); /* * Flush filesystem metadata. 
*/ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, l->l_cred, FSYNC_WAIT, 0, 0); VOP_UNLOCK(ump->um_devvp); if (flags & FORCECLOSE) /* XXXDBJ */ error = 0; #ifdef WAPBL if (error) return error; if (mp->mnt_wapbl) { error = wapbl_flush(mp->mnt_wapbl, 1); if (flags & FORCECLOSE) error = 0; } #endif return (error); } /* * Get file system statistics. */ int ffs_statvfs(struct mount *mp, struct statvfs *sbp) { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; mutex_enter(&ump->um_lock); sbp->f_bsize = fs->fs_bsize; sbp->f_frsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; sbp->f_bfree = ffs_blkstofrags(fs, fs->fs_cstotal.cs_nbfree) + fs->fs_cstotal.cs_nffree + FFS_DBTOFSB(fs, fs->fs_pendingblocks); sbp->f_bresvd = ((u_int64_t) fs->fs_dsize * (u_int64_t) fs->fs_minfree) / (u_int64_t) 100; if (sbp->f_bfree > sbp->f_bresvd) sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd; else sbp->f_bavail = 0; sbp->f_files = fs->fs_ncg * fs->fs_ipg - UFS_ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; sbp->f_favail = sbp->f_ffree; sbp->f_fresvd = 0; mutex_exit(&ump->um_lock); copy_statvfs_info(sbp, mp); return (0); } struct ffs_sync_ctx { int waitfor; }; static bool ffs_sync_selector(void *cl, struct vnode *vp) { struct ffs_sync_ctx *c = cl; struct inode *ip; KASSERT(mutex_owned(vp->v_interlock)); ip = VTOI(vp); /* * Skip the vnode/inode if inaccessible. */ if (ip == NULL || vp->v_type == VNON) return false; /* * We deliberately update inode times here. This will * prevent a massive queue of updates accumulating, only * to be handled by a call to unmount. * * XXX It would be better to have the syncer trickle these * out. Adjustment needed to allow registering vnodes for * sync when the vnode is clean, but the inode dirty. Or * have ufs itself trickle out inode updates. * * If doing a lazy sync, we don't care about metadata or * data updates, because they are handled by each vnode's * synclist entry. In this case we are only interested in * writing back modified inodes. */ if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) == 0 && (c->waitfor == MNT_LAZY || (LIST_EMPTY(&vp->v_dirtyblkhd) && (vp->v_iflag & VI_ONWORKLST) == 0))) return false; return true; } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ int ffs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) { struct vnode *vp; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; struct vnode_iterator *marker; int error, allerror = 0; struct ffs_sync_ctx ctx; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ panic("%s: rofs mod, fs=%s", __func__, fs->fs_fsmnt); } /* * Write back each (modified) inode. */ vfs_vnode_iterator_init(mp, &marker); ctx.waitfor = waitfor; while ((vp = vfs_vnode_iterator_next(marker, ffs_sync_selector, &ctx))) { error = vn_lock(vp, LK_EXCLUSIVE | (waitfor == MNT_LAZY ? LK_NOWAIT : 0)); if (error) { vrele(vp); continue; } if (waitfor == MNT_LAZY) { error = UFS_WAPBL_BEGIN(vp->v_mount); if (!error) { error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE); UFS_WAPBL_END(vp->v_mount); } } else { error = VOP_FSYNC(vp, cred, FSYNC_NOLOG | (waitfor == MNT_WAIT ? 
FSYNC_WAIT : 0), 0, 0); } if (error) allerror = error; vput(vp); } vfs_vnode_iterator_destroy(marker); /* * Force stale file system control information to be flushed. */ if (waitfor != MNT_LAZY) { bool need_devvp_fsync; mutex_enter(ump->um_devvp->v_interlock); need_devvp_fsync = (ump->um_devvp->v_numoutput > 0 || !LIST_EMPTY(&ump->um_devvp->v_dirtyblkhd)); mutex_exit(ump->um_devvp->v_interlock); if (need_devvp_fsync) { int flags = FSYNC_NOLOG; if (waitfor == MNT_WAIT) flags |= FSYNC_WAIT; vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_FSYNC(ump->um_devvp, cred, flags, 0, 0)) != 0) allerror = error; VOP_UNLOCK(ump->um_devvp); } } #if defined(QUOTA) || defined(QUOTA2) qsync(mp); #endif /* * Write back modified superblock. */ if (fs->fs_fmod != 0) { fs->fs_fmod = 0; fs->fs_time = time_second; error = UFS_WAPBL_BEGIN(mp); if (error) allerror = error; else { if ((error = ffs_cgupdate(ump, waitfor))) allerror = error; UFS_WAPBL_END(mp); } } #ifdef WAPBL if (mp->mnt_wapbl) { error = wapbl_flush(mp->mnt_wapbl, (waitfor == MNT_WAIT)); if (error) allerror = error; } #endif return (allerror); } /* * Load inode from disk and initialize vnode. */ static int ffs_init_vnode(struct ufsmount *ump, struct vnode *vp, ino_t ino) { struct fs *fs; struct inode *ip; struct buf *bp; int error; fs = ump->um_fs; /* Read in the disk contents for the inode. */ error = bread(ump->um_devvp, FFS_FSBTODB(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, 0, &bp); if (error) return error; /* Allocate and initialize inode. */ ip = pool_cache_get(ffs_inode_cache, PR_WAITOK); memset(ip, 0, sizeof(struct inode)); ip->i_ump = ump; ip->i_fs = fs; ip->i_dev = ump->um_dev; ip->i_number = ino; if (ump->um_fstype == UFS1) ip->i_din.ffs1_din = pool_cache_get(ffs_dinode1_cache, PR_WAITOK); else ip->i_din.ffs2_din = pool_cache_get(ffs_dinode2_cache, PR_WAITOK); ffs_load_inode(bp, ip, fs, ino); brelse(bp, 0); ip->i_vnode = vp; #if defined(QUOTA) || defined(QUOTA2) ufsquota_init(ip); #endif /* Initialise vnode with this inode. */ vp->v_tag = VT_UFS; vp->v_op = ffs_vnodeop_p; vp->v_data = ip; /* Initialize genfs node. */ genfs_node_init(vp, &ffs_genfsops); return 0; } /* * Undo ffs_init_vnode(). */ static void ffs_deinit_vnode(struct ufsmount *ump, struct vnode *vp) { struct inode *ip = VTOI(vp); genfs_node_destroy(vp); vp->v_data = NULL; if (ump->um_fstype == UFS1) pool_cache_put(ffs_dinode1_cache, ip->i_din.ffs1_din); else pool_cache_put(ffs_dinode2_cache, ip->i_din.ffs2_din); pool_cache_put(ffs_inode_cache, ip); } /* * Read an inode from disk and initialize this vnode / inode pair. * Caller assures no other thread will try to load this inode. */ int ffs_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { ino_t ino; struct fs *fs; struct inode *ip; struct ufsmount *ump; int error; KASSERT(key_len == sizeof(ino)); memcpy(&ino, key, key_len); ump = VFSTOUFS(mp); fs = ump->um_fs; error = ffs_init_vnode(ump, vp, ino); if (error) return error; ip = VTOI(vp); if (ip->i_mode == 0) { ffs_deinit_vnode(ump, vp); return ENOENT; } /* Initialize the vnode from the inode. */ ufs_vinit(mp, ffs_specop_p, ffs_fifoop_p, &vp); /* Finish inode initialization. */ ip->i_devvp = ump->um_devvp; vref(ip->i_devvp); /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. 
*/ if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_uid = ip->i_ffs1_ouid; /* XXX */ ip->i_gid = ip->i_ffs1_ogid; /* XXX */ } /* XXX */ uvm_vnp_setsize(vp, ip->i_size); cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid, !HAS_ACLS(ip)); *new_key = &ip->i_number; return 0; } /* * Create a new inode on disk and initialize this vnode / inode pair. */ int ffs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp, struct vattr *vap, kauth_cred_t cred, void *extra, size_t *key_len, const void **new_key) { ino_t ino; struct fs *fs; struct inode *ip; struct timespec ts; struct ufsmount *ump; int error, mode; KASSERT(dvp->v_mount == mp); KASSERT(vap->va_type != VNON); *key_len = sizeof(ino); ump = VFSTOUFS(mp); fs = ump->um_fs; mode = MAKEIMODE(vap->va_type, vap->va_mode); /* Allocate fresh inode. */ error = ffs_valloc(dvp, mode, cred, &ino); if (error) return error; /* Attach inode to vnode. */ error = ffs_init_vnode(ump, vp, ino); if (error) { if (UFS_WAPBL_BEGIN(mp) == 0) { ffs_vfree(dvp, ino, mode); UFS_WAPBL_END(mp); } return error; } ip = VTOI(vp); if (ip->i_mode) { panic("%s: dup alloc ino=%" PRId64 " on %s: mode %o/%o " "gen %x/%x size %" PRIx64 " blocks %" PRIx64, __func__, ino, fs->fs_fsmnt, DIP(ip, mode), ip->i_mode, DIP(ip, gen), ip->i_gen, DIP(ip, size), DIP(ip, blocks)); } if (DIP(ip, size) || DIP(ip, blocks)) { printf("%s: ino=%" PRId64 " on %s: " "gen %x/%x has non zero blocks %" PRIx64 " or size %" PRIx64 "\n", __func__, ino, fs->fs_fsmnt, DIP(ip, gen), ip->i_gen, DIP(ip, blocks), DIP(ip, size)); if ((ip)->i_ump->um_fstype == UFS1) panic("%s: dirty filesystem?", __func__); DIP_ASSIGN(ip, blocks, 0); DIP_ASSIGN(ip, size, 0); } /* Set uid / gid. */ if (cred == NOCRED || cred == FSCRED) { ip->i_gid = 0; ip->i_uid = 0; } else { ip->i_gid = VTOI(dvp)->i_gid; ip->i_uid = kauth_cred_geteuid(cred); } DIP_ASSIGN(ip, gid, ip->i_gid); DIP_ASSIGN(ip, uid, ip->i_uid); #if defined(QUOTA) || defined(QUOTA2) error = UFS_WAPBL_BEGIN(mp); if (error) { ffs_deinit_vnode(ump, vp); return error; } error = chkiq(ip, 1, cred, 0); if (error) { ffs_vfree(dvp, ino, mode); UFS_WAPBL_END(mp); ffs_deinit_vnode(ump, vp); return error; } UFS_WAPBL_END(mp); #endif /* Set type and finalize. */ ip->i_flags = 0; DIP_ASSIGN(ip, flags, 0); ip->i_mode = mode; DIP_ASSIGN(ip, mode, mode); if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ if (ump->um_fstype == UFS1) ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev, UFS_MPNEEDSWAP(ump)); else ip->i_ffs2_rdev = ufs_rw64(vap->va_rdev, UFS_MPNEEDSWAP(ump)); } ufs_vinit(mp, ffs_specop_p, ffs_fifoop_p, &vp); ip->i_devvp = ump->um_devvp; vref(ip->i_devvp); /* Set up a new generation number for this inode. */ ip->i_gen++; DIP_ASSIGN(ip, gen, ip->i_gen); if (fs->fs_magic == FS_UFS2_MAGIC) { vfs_timestamp(&ts); ip->i_ffs2_birthtime = ts.tv_sec; ip->i_ffs2_birthnsec = ts.tv_nsec; } uvm_vnp_setsize(vp, ip->i_size); cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid, !HAS_ACLS(ip)); *new_key = &ip->i_number; return 0; } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. 
exflagsp and credanonp */ int ffs_fhtovp(struct mount *mp, struct fid *fhp, int lktype, struct vnode **vpp) { struct ufid ufh; int error; if (fhp->fid_len != sizeof(struct ufid)) return EINVAL; memcpy(&ufh, fhp, sizeof(ufh)); if ((error = ffs_checkrange(mp, ufh.ufid_ino)) != 0) return error; return (ufs_fhtovp(mp, &ufh, lktype, vpp)); } /* * Vnode pointer to File handle */ /* ARGSUSED */ int ffs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) { struct inode *ip; struct ufid ufh; if (*fh_size < sizeof(struct ufid)) { *fh_size = sizeof(struct ufid); return E2BIG; } ip = VTOI(vp); *fh_size = sizeof(struct ufid); memset(&ufh, 0, sizeof(ufh)); ufh.ufid_len = sizeof(struct ufid); ufh.ufid_ino = ip->i_number; ufh.ufid_gen = ip->i_gen; memcpy(fhp, &ufh, sizeof(ufh)); return (0); } void ffs_init(void) { if (ffs_initcount++ > 0) return; ffs_inode_cache = pool_cache_init(sizeof(struct inode), 0, 0, 0, "ffsino", NULL, IPL_NONE, NULL, NULL, NULL); ffs_dinode1_cache = pool_cache_init(sizeof(struct ufs1_dinode), 0, 0, 0, "ffsdino1", NULL, IPL_NONE, NULL, NULL, NULL); ffs_dinode2_cache = pool_cache_init(sizeof(struct ufs2_dinode), 0, 0, 0, "ffsdino2", NULL, IPL_NONE, NULL, NULL, NULL); ufs_init(); } void ffs_reinit(void) { ufs_reinit(); } void ffs_done(void) { if (--ffs_initcount > 0) return; ufs_done(); pool_cache_destroy(ffs_dinode2_cache); pool_cache_destroy(ffs_dinode1_cache); pool_cache_destroy(ffs_inode_cache); } /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(struct ufsmount *mp, int waitfor) { struct fs *fs = mp->um_fs; struct buf *bp; int error; u_int32_t saveflag; error = ffs_getblk(mp->um_devvp, fs->fs_sblockloc / DEV_BSIZE, FFS_NOBLK, fs->fs_sbsize, false, &bp); if (error) return error; saveflag = fs->fs_flags & FS_INTERNAL; fs->fs_flags &= ~FS_INTERNAL; memcpy(bp->b_data, fs, fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, mp); if (mp->um_flags & UFS_EA) { struct fs *bfs = (struct fs *)bp->b_data; KASSERT(bfs->fs_magic == FS_UFS2_MAGIC); bfs->fs_magic = FS_UFS2EA_MAGIC; } #ifdef FFS_EI if (mp->um_flags & UFS_NEEDSWAP) ffs_sb_swap((struct fs *)bp->b_data, (struct fs *)bp->b_data); #endif fs->fs_flags |= saveflag; if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); return (error); } int ffs_cgupdate(struct ufsmount *mp, int waitfor) { struct fs *fs = mp->um_fs; struct buf *bp; int blks; void *space; int i, size, error = 0, allerror = 0; UFS_WAPBL_JLOCK_ASSERT(mp->um_mountp); allerror = ffs_sbupdate(mp, waitfor); blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = ffs_getblk(mp->um_devvp, FFS_FSBTODB(fs, fs->fs_csaddr + i), FFS_NOBLK, size, false, &bp); if (error) break; #ifdef FFS_EI if (mp->um_flags & UFS_NEEDSWAP) ffs_csum_swap((struct csum*)space, (struct csum*)bp->b_data, size); else #endif memcpy(bp->b_data, space, (u_int)size); space = (char *)space + size; if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); } if (!allerror && error) allerror = error; return (allerror); } int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *vp, int attrnamespace, const char *attrname) { #ifdef UFS_EXTATTR /* * File-backed extended attributes are only supported on UFS1. * UFS2 has native extended attributes. 
*/ if (VFSTOUFS(mp)->um_fstype == UFS1) return (ufs_extattrctl(mp, cmd, vp, attrnamespace, attrname)); #endif return (vfs_stdextattrctl(mp, cmd, vp, attrnamespace, attrname)); } /* * Synch vnode for a mounted file system. */ static int ffs_vfs_fsync(vnode_t *vp, int flags) { int error, i, pflags; #ifdef WAPBL struct mount *mp; #endif KASSERT(vp->v_type == VBLK); KASSERT(spec_node_getmountedfs(vp) != NULL); /* * Flush all dirty data associated with the vnode. */ pflags = PGO_ALLPAGES | PGO_CLEANIT; if ((flags & FSYNC_WAIT) != 0) pflags |= PGO_SYNCIO; rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); error = VOP_PUTPAGES(vp, 0, 0, pflags); if (error) return error; #ifdef WAPBL mp = spec_node_getmountedfs(vp); if (mp && mp->mnt_wapbl) { /* * Don't bother writing out metadata if the syncer is * making the request. We will let the sync vnode * write it out in a single burst through a call to * VFS_SYNC(). */ if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY | FSYNC_NOLOG)) != 0) return 0; /* * Don't flush the log if the vnode being flushed * contains no dirty buffers that could be in the log. */ if (!LIST_EMPTY(&vp->v_dirtyblkhd)) { error = wapbl_flush(mp->mnt_wapbl, 0); if (error) return error; } if ((flags & FSYNC_WAIT) != 0) { mutex_enter(vp->v_interlock); while (vp->v_numoutput) cv_wait(&vp->v_cv, vp->v_interlock); mutex_exit(vp->v_interlock); } return 0; } #endif /* WAPBL */ error = vflushbuf(vp, flags); if (error == 0 && (flags & FSYNC_CACHE) != 0) { i = 1; (void)VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE, kauth_cred_get()); } return error; }
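/*
 * [Editor's sketch, not part of the NetBSD sources above or below.]
 * The shift/mask checks in ffs_superblock_validate() and the reserve
 * computation in ffs_statvfs() boil down to the small pieces of
 * arithmetic restated below in standalone form.  The helper names
 * (ffs_check_shift_mask, ffs_available_frags) and the <stdint.h> /
 * <stdbool.h> types are assumptions made only so the example is
 * self-contained; the kernel code operates directly on struct fs
 * fields (fs_bsize, fs_bshift, fs_bmask, fs_dsize, fs_minfree).
 */
#include <stdbool.h>
#include <stdint.h>

/*
 * A block or fragment size must be a power of two; its shift is log2 of
 * the size and its mask is ~(size - 1), which is what
 * ffs_superblock_validate() recomputes and compares against the on-disk
 * fs_bshift/fs_fshift and fs_bmask/fs_fmask fields.
 */
static bool
ffs_check_shift_mask(uint32_t size, int32_t shift, int32_t mask)
{
	int32_t computed = 0;
	uint32_t i;

	if (size == 0 || (size & (size - 1)) != 0)	/* powerof2(size)? */
		return false;
	for (i = size; i > 1; i >>= 1)			/* log2(size) */
		computed++;
	return shift == computed && mask == (int32_t)~(size - 1);
}

/*
 * ffs_statvfs() holds back fs_minfree percent of the data area:
 * f_bresvd = dsize * minfree / 100, and f_bavail is whatever of
 * f_bfree remains above that reserve (clamped at zero).
 */
static uint64_t
ffs_available_frags(uint64_t bfree, uint64_t dsize, uint64_t minfree)
{
	uint64_t resvd = dsize * minfree / 100;

	return bfree > resvd ? bfree - resvd : 0;
}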
/* $NetBSD: kern_fork.c,v 1.230 2023/02/25 08:22:00 skrll Exp $ */

/*-
 * Copyright (c) 1999, 2001, 2004, 2006, 2007, 2008, 2019
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_fork.c 8.8 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_fork.c,v 1.230 2023/02/25 08:22:00 skrll Exp $"); #include "opt_ktrace.h" #include "opt_dtrace.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/pool.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/ras.h> #include <sys/resourcevar.h> #include <sys/vnode.h> #include <sys/file.h> #include <sys/acct.h> #include <sys/ktrace.h> #include <sys/sched.h> #include <sys/signalvar.h> #include <sys/syscall.h> #include <sys/kauth.h> #include <sys/atomic.h> #include <sys/syscallargs.h> #include <sys/uidinfo.h> #include <sys/sdt.h> #include <sys/ptrace.h> /* * DTrace SDT provider definitions */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, kernel, , create, "struct proc *", /* new process */ "struct proc *", /* parent process */ "int" /* flags */); u_int nprocs __cacheline_aligned = 1; /* process 0 */ /* * Number of ticks to sleep if fork() would fail due to process hitting * limits. Exported in miliseconds to userland via sysctl. */ int forkfsleep = 0; int sys_fork(struct lwp *l, const void *v, register_t *retval) { return fork1(l, 0, SIGCHLD, NULL, 0, NULL, NULL, retval); } /* * vfork(2) system call compatible with 4.4BSD (i.e. BSD with Mach VM). * Address space is not shared, but parent is blocked until child exit. */ int sys_vfork(struct lwp *l, const void *v, register_t *retval) { return fork1(l, FORK_PPWAIT, SIGCHLD, NULL, 0, NULL, NULL, retval); } /* * New vfork(2) system call for NetBSD, which implements original 3BSD vfork(2) * semantics. Address space is shared, and parent is blocked until child exit. */ int sys___vfork14(struct lwp *l, const void *v, register_t *retval) { return fork1(l, FORK_PPWAIT|FORK_SHAREVM, SIGCHLD, NULL, 0, NULL, NULL, retval); } /* * Linux-compatible __clone(2) system call. */ int sys___clone(struct lwp *l, const struct sys___clone_args *uap, register_t *retval) { /* { syscallarg(int) flags; syscallarg(void *) stack; } */ int flags, sig; /* * We don't support the CLONE_PTRACE flag. */ if (SCARG(uap, flags) & (CLONE_PTRACE)) return EINVAL; /* * Linux enforces CLONE_VM with CLONE_SIGHAND, do same. */ if (SCARG(uap, flags) & CLONE_SIGHAND && (SCARG(uap, flags) & CLONE_VM) == 0) return EINVAL; flags = 0; if (SCARG(uap, flags) & CLONE_VM) flags |= FORK_SHAREVM; if (SCARG(uap, flags) & CLONE_FS) flags |= FORK_SHARECWD; if (SCARG(uap, flags) & CLONE_FILES) flags |= FORK_SHAREFILES; if (SCARG(uap, flags) & CLONE_SIGHAND) flags |= FORK_SHARESIGS; if (SCARG(uap, flags) & CLONE_VFORK) flags |= FORK_PPWAIT; sig = SCARG(uap, flags) & CLONE_CSIGNAL; if (sig < 0 || sig >= _NSIG) return EINVAL; /* * Note that the Linux API does not provide a portable way of * specifying the stack area; the caller must know if the stack * grows up or down. So, we pass a stack size of 0, so that the * code that makes this adjustment is a noop. */ return fork1(l, flags, sig, SCARG(uap, stack), 0, NULL, NULL, retval); } /* * Print the 'table full' message once per 10 seconds. */ static struct timeval fork_tfmrate = { 10, 0 }; /* * Check if a process is traced and shall inform about FORK events. */ static inline bool tracefork(struct proc *p, int flags) { return (p->p_slflag & (PSL_TRACEFORK|PSL_TRACED)) == (PSL_TRACEFORK|PSL_TRACED) && (flags & FORK_PPWAIT) == 0; } /* * Check if a process is traced and shall inform about VFORK events. 
*/ static inline bool tracevfork(struct proc *p, int flags) { return (p->p_slflag & (PSL_TRACEVFORK|PSL_TRACED)) == (PSL_TRACEVFORK|PSL_TRACED) && (flags & FORK_PPWAIT) != 0; } /* * Check if a process is traced and shall inform about VFORK_DONE events. */ static inline bool tracevforkdone(struct proc *p, int flags) { return (p->p_slflag & (PSL_TRACEVFORK_DONE|PSL_TRACED)) == (PSL_TRACEVFORK_DONE|PSL_TRACED) && (flags & FORK_PPWAIT); } /* * General fork call. Note that another LWP in the process may call exec() * or exit() while we are forking. It's safe to continue here, because * neither operation will complete until all LWPs have exited the process. */ int fork1(struct lwp *l1, int flags, int exitsig, void *stack, size_t stacksize, void (*func)(void *), void *arg, register_t *retval) { struct proc *p1, *p2, *parent; struct plimit *p1_lim; uid_t uid; struct lwp *l2; int count; vaddr_t uaddr; int tnprocs; int error = 0; p1 = l1->l_proc; uid = kauth_cred_getuid(l1->l_cred); tnprocs = atomic_inc_uint_nv(&nprocs); /* * Although process entries are dynamically created, we still keep * a global limit on the maximum number we will create. */ if (__predict_false(tnprocs >= maxproc)) error = -1; else error = kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL); if (error) { static struct timeval lasttfm; atomic_dec_uint(&nprocs); if (ratecheck(&lasttfm, &fork_tfmrate)) tablefull("proc", "increase kern.maxproc or NPROC"); if (forkfsleep) kpause("forkmx", false, forkfsleep, NULL); return EAGAIN; } /* * Enforce limits. */ count = chgproccnt(uid, 1); if (__predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) { if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT, p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS), &p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != 0) { (void)chgproccnt(uid, -1); atomic_dec_uint(&nprocs); if (forkfsleep) kpause("forkulim", false, forkfsleep, NULL); return EAGAIN; } } /* * Allocate virtual address space for the U-area now, while it * is still easy to abort the fork operation if we're out of * kernel virtual address space. */ uaddr = uvm_uarea_alloc(); if (__predict_false(uaddr == 0)) { (void)chgproccnt(uid, -1); atomic_dec_uint(&nprocs); return ENOMEM; } /* Allocate new proc. */ p2 = proc_alloc(); if (p2 == NULL) { /* We were unable to allocate a process ID. */ uvm_uarea_free(uaddr); mutex_enter(p1->p_lock); uid = kauth_cred_getuid(p1->p_cred); (void)chgproccnt(uid, -1); mutex_exit(p1->p_lock); atomic_dec_uint(&nprocs); return EAGAIN; } /* * We are now committed to the fork. From here on, we may * block on resources, but resource allocation may NOT fail. */ /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ memset(&p2->p_startzero, 0, (unsigned) ((char *)&p2->p_endzero - (char *)&p2->p_startzero)); memcpy(&p2->p_startcopy, &p1->p_startcopy, (unsigned) ((char *)&p2->p_endcopy - (char *)&p2->p_startcopy)); TAILQ_INIT(&p2->p_sigpend.sp_info); LIST_INIT(&p2->p_lwps); LIST_INIT(&p2->p_sigwaiters); /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. * Inherit flags we want to keep. The flags related to SIGCHLD * handling are important in order to keep a consistent behaviour * for the child after the fork. If we are a 32-bit process, the * child will be too. 
*/ p2->p_flag = p1->p_flag & (PK_SUGID | PK_NOCLDWAIT | PK_CLDSIGIGN | PK_32); p2->p_emul = p1->p_emul; p2->p_execsw = p1->p_execsw; if (flags & FORK_SYSTEM) { /* * Mark it as a system process. Set P_NOCLDWAIT so that * children are reparented to init(8) when they exit. * init(8) can easily wait them out for us. */ p2->p_flag |= (PK_SYSTEM | PK_NOCLDWAIT); } mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH); mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE); rw_init(&p2->p_reflock); cv_init(&p2->p_waitcv, "wait"); cv_init(&p2->p_lwpcv, "lwpwait"); /* * Share a lock between the processes if they are to share signal * state: we must synchronize access to it. */ if (flags & FORK_SHARESIGS) { p2->p_lock = p1->p_lock; mutex_obj_hold(p1->p_lock); } else p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); kauth_proc_fork(p1, p2); p2->p_raslist = NULL; #if defined(__HAVE_RAS) ras_fork(p1, p2); #endif /* bump references to the text vnode (for procfs) */ p2->p_textvp = p1->p_textvp; if (p2->p_textvp) vref(p2->p_textvp); if (p1->p_path) p2->p_path = kmem_strdupsize(p1->p_path, NULL, KM_SLEEP); else p2->p_path = NULL; if (flags & FORK_SHAREFILES) fd_share(p2); else if (flags & FORK_CLEANFILES) p2->p_fd = fd_init(NULL); else p2->p_fd = fd_copy(); /* XXX racy */ p2->p_mqueue_cnt = p1->p_mqueue_cnt; if (flags & FORK_SHARECWD) cwdshare(p2); else p2->p_cwdi = cwdinit(); /* * Note: p_limit (rlimit stuff) is copy-on-write, so normally * we just need increase pl_refcnt. */ p1_lim = p1->p_limit; if (!p1_lim->pl_writeable) { lim_addref(p1_lim); p2->p_limit = p1_lim; } else { p2->p_limit = lim_copy(p1_lim); } if (flags & FORK_PPWAIT) { /* Mark ourselves as waiting for a child. */ p2->p_lflag = PL_PPWAIT; l1->l_vforkwaiting = true; p2->p_vforklwp = l1; } else { p2->p_lflag = 0; l1->l_vforkwaiting = false; } p2->p_sflag = 0; p2->p_slflag = 0; parent = (flags & FORK_NOWAIT) ? initproc : p1; p2->p_pptr = parent; p2->p_ppid = parent->p_pid; LIST_INIT(&p2->p_children); p2->p_aio = NULL; #ifdef KTRACE /* * Copy traceflag and tracefile if enabled. * If not inherited, these were zeroed above. */ if (p1->p_traceflag & KTRFAC_INHERIT) { mutex_enter(&ktrace_lock); p2->p_traceflag = p1->p_traceflag; if ((p2->p_tracep = p1->p_tracep) != NULL) ktradref(p2); mutex_exit(&ktrace_lock); } #endif /* * Create signal actions for the child process. */ p2->p_sigacts = sigactsinit(p1, flags & FORK_SHARESIGS); mutex_enter(p1->p_lock); p2->p_sflag |= (p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP)); sched_proc_fork(p1, p2); mutex_exit(p1->p_lock); p2->p_stflag = p1->p_stflag; /* * p_stats. * Copy parts of p_stats, and zero out the rest. */ p2->p_stats = pstatscopy(p1->p_stats); /* * Set up the new process address space. */ uvm_proc_fork(p1, p2, (flags & FORK_SHAREVM) ? true : false); /* * Finish creating the child process. * It will return through a different path later. */ lwp_create(l1, p2, uaddr, (flags & FORK_PPWAIT) ? LWP_VFORK : 0, stack, stacksize, (func != NULL) ? func : child_return, arg, &l2, l1->l_class, &l1->l_sigmask, &l1->l_sigstk); /* * Inherit l_private from the parent. * Note that we cannot use lwp_setprivate() here since that * also sets the CPU TLS register, which is incorrect if the * process has changed that without letting the kernel know. */ l2->l_private = l1->l_private; /* * If emulation has a process fork hook, call it now. */ if (p2->p_emul->e_proc_fork) (*p2->p_emul->e_proc_fork)(p2, l1, flags); /* * ...and finally, any other random fork hooks that subsystems * might have registered. 
*/ doforkhooks(p2, p1); SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0); /* * It's now safe for the scheduler and other processes to see the * child process. */ mutex_enter(&proc_lock); if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT) p2->p_lflag |= PL_CONTROLT; LIST_INSERT_HEAD(&parent->p_children, p2, p_sibling); p2->p_exitsig = exitsig; /* signal for parent on exit */ /* * Trace fork(2) and vfork(2)-like events on demand in a debugger. */ if (tracefork(p1, flags) || tracevfork(p1, flags)) { proc_changeparent(p2, p1->p_pptr); SET(p2->p_slflag, PSL_TRACEDCHILD); } p2->p_oppid = p1->p_pid; /* Remember the original parent id. */ LIST_INSERT_AFTER(p1, p2, p_pglist); LIST_INSERT_HEAD(&allproc, p2, p_list); p2->p_trace_enabled = trace_is_enabled(p2); #ifdef __HAVE_SYSCALL_INTERN (*p2->p_emul->e_syscall_intern)(p2); #endif /* * Update stats now that we know the fork was successful. */ KPREEMPT_DISABLE(l1); CPU_COUNT(CPU_COUNT_FORKS, 1); if (flags & FORK_PPWAIT) CPU_COUNT(CPU_COUNT_FORKS_PPWAIT, 1); if (flags & FORK_SHAREVM) CPU_COUNT(CPU_COUNT_FORKS_SHAREVM, 1); KPREEMPT_ENABLE(l1); if (ktrpoint(KTR_EMUL)) p2->p_traceflag |= KTRFAC_TRC_EMUL; /* * Notify any interested parties about the new process. */ if (!SLIST_EMPTY(&p1->p_klist)) { mutex_exit(&proc_lock); knote_proc_fork(p1, p2); mutex_enter(&proc_lock); } /* * Make child runnable, set start time, and add to run queue except * if the parent requested the child to start in SSTOP state. */ mutex_enter(p2->p_lock); /* * Start profiling. */ if ((p2->p_stflag & PST_PROFIL) != 0) { mutex_spin_enter(&p2->p_stmutex); startprofclock(p2); mutex_spin_exit(&p2->p_stmutex); } getmicrotime(&p2->p_stats->p_start); p2->p_acflag = AFORK; lwp_lock(l2); KASSERT(p2->p_nrlwps == 1); KASSERT(l2->l_stat == LSIDL); if (p2->p_sflag & PS_STOPFORK) { p2->p_nrlwps = 0; p2->p_stat = SSTOP; p2->p_waited = 0; p1->p_nstopchild++; l2->l_stat = LSSTOP; KASSERT(l2->l_wchan == NULL); lwp_unlock(l2); } else { p2->p_nrlwps = 1; p2->p_stat = SACTIVE; setrunnable(l2); /* LWP now unlocked */ } /* * Return child pid to parent process, * marking us as parent via retval[1]. */ if (retval != NULL) { retval[0] = p2->p_pid; retval[1] = 0; } mutex_exit(p2->p_lock); /* * Let the parent know that we are tracing its child. */ if (tracefork(p1, flags) || tracevfork(p1, flags)) { mutex_enter(p1->p_lock); eventswitch(TRAP_CHLD, tracefork(p1, flags) ? PTRACE_FORK : PTRACE_VFORK, retval[0]); mutex_enter(&proc_lock); } /* * Preserve synchronization semantics of vfork. If waiting for * child to exec or exit, sleep until it clears p_vforkwaiting. */ while (l1->l_vforkwaiting) cv_wait(&l1->l_waitcv, &proc_lock); /* * Let the parent know that we are tracing its child. */ if (tracevforkdone(p1, flags)) { mutex_enter(p1->p_lock); eventswitch(TRAP_CHLD, PTRACE_VFORK_DONE, retval[0]); } else mutex_exit(&proc_lock); return 0; } /* * MI code executed in each newly spawned process before returning to userland. */ void child_return(void *arg) { struct lwp *l = curlwp; struct proc *p = l->l_proc; if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) == (PSL_TRACED|PSL_TRACEDCHILD)) { eventswitchchild(p, TRAP_CHLD, ISSET(p->p_lflag, PL_PPWAIT) ? PTRACE_VFORK : PTRACE_FORK); } md_child_return(l); /* * Return SYS_fork for all fork types, including vfork(2) and clone(2). * * This approach simplifies the code and avoids extra locking. */ ktrsysret(SYS_fork, 0, 0); }
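The vfork(2) variants above suspend the parent until the child exits or execs, and in the 3BSD-style __vfork14 case the child also borrows the parent's address space, so the child may only call _exit(2) or execve(2). A minimal userland sketch of that contract (an editorial illustration, not part of kern_fork.c):

#include <sys/wait.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid = vfork();

	if (pid == -1) {
		perror("vfork");
		return EXIT_FAILURE;
	}
	if (pid == 0) {
		/* Child: may only execve or _exit while borrowing the VM. */
		execlp("echo", "echo", "hello from the vforked child",
		    (char *)NULL);
		_exit(127);		/* reached only if execlp failed */
	}
	/* Parent resumes here once the child has exec'd or exited. */
	if (waitpid(pid, NULL, 0) == -1)
		perror("waitpid");
	return EXIT_SUCCESS;
}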
/* $NetBSD: tsc.c,v 1.60 2024/02/19 20:10:09 mrg Exp $ */ /*- * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.60 2024/02/19 20:10:09 mrg Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/timetc.h> #include <sys/lwp.h> #include <sys/atomic.h> #include <sys/kernel.h> #include <sys/cpu.h> #include <sys/xcall.h> #include <sys/lock.h> #include <machine/cpu_counter.h> #include <machine/cpuvar.h> #include <machine/cpufunc.h> #include <machine/specialreg.h> #include <machine/cputypes.h> #include "tsc.h" #define TSC_SYNC_ROUNDS 1000 #define ABS(a) ((a) >= 0 ? (a) : -(a)) static u_int tsc_get_timecount(struct timecounter *); static void tsc_delay(unsigned int); static uint64_t tsc_dummy_cacheline __cacheline_aligned; uint64_t tsc_freq __read_mostly; /* exported for sysctl */ static int64_t tsc_drift_max = 1000; /* max cycles */ static int64_t tsc_drift_observed; uint64_t (*rdtsc)(void) = rdtsc_cpuid; uint64_t (*cpu_counter)(void) = cpu_counter_cpuid; uint32_t (*cpu_counter32)(void) = cpu_counter32_cpuid; int tsc_user_enabled = 1; static volatile int64_t tsc_sync_val; static volatile struct cpu_info *tsc_sync_cpu; static struct timecounter tsc_timecounter = { .tc_get_timecount = tsc_get_timecount, .tc_counter_mask = ~0U, .tc_name = "TSC", .tc_quality = 3000, }; bool tsc_is_invariant(void) { struct cpu_info *ci; uint32_t descs[4]; uint32_t family; bool invariant; if (!cpu_hascounter()) return false; ci = curcpu(); invariant = false; if (cpu_vendor == CPUVENDOR_INTEL) { /* * From Intel(tm) 64 and IA-32 Architectures Software * Developer's Manual Volume 3A: System Programming Guide, * Part 1, 17.13 TIME_STAMP COUNTER, these are the processors * where the TSC is known invariant: * * Pentium 4, Intel Xeon (family 0f, models 03 and higher) * Core Solo and Core Duo processors (family 06, model 0e) * Xeon 5100 series and Core 2 Duo (family 06, model 0f) * Core 2 and Xeon (family 06, model 17) * Atom (family 06, model 1c) * * We'll also assume that it's safe on the Pentium, and * that it's safe on P-II and P-III Xeons due to the * typical configuration of those systems. * */ switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) { case 0x05: invariant = true; break; case 0x06: invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e || CPUID_TO_MODEL(ci->ci_signature) == 0x0f || CPUID_TO_MODEL(ci->ci_signature) == 0x17 || CPUID_TO_MODEL(ci->ci_signature) == 0x1c; break; case 0x0f: invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03; break; } } else if (cpu_vendor == CPUVENDOR_AMD) { /* * TSC and Power Management Events on AMD Processors * Nov 2, 2005 Rich Brunner, AMD Fellow * http://lkml.org/lkml/2005/11/4/173 * * See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power * Management Features, AMD64 Architecture Programmer's * Manual Volume 3: General-Purpose and System Instructions. * The check is done below. */ /* * AMD Errata 778: Processor Core Time Stamp Counters May * Experience Drift * * This affects all family 15h and family 16h processors. */ switch (CPUID_TO_FAMILY(ci->ci_signature)) { case 0x15: case 0x16: return false; } } /* * The best way to check whether the TSC counter is invariant or not * is to check CPUID 80000007. */ family = CPUID_TO_BASEFAMILY(ci->ci_signature); if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD)) && ((family == 0x06) || (family == 0x0f))) { x86_cpuid(0x80000000, descs); if (descs[0] >= 0x80000007) { x86_cpuid(0x80000007, descs); invariant = (descs[3] & CPUID_APM_ITSC) != 0; } } return invariant; } /* Setup function pointers for rdtsc() and timecounter(9). 
*/ void tsc_setfunc(struct cpu_info *ci) { bool use_lfence, use_mfence; use_lfence = use_mfence = false; /* * XXX On AMD, we might be able to use lfence for some cases: * a) if MSR_DE_CFG exist and the bit 1 is set. * b) family == 0x0f or 0x11. Those have no MSR_DE_CFG and * lfence is always serializing. * * We don't use it because the test result showed mfence was better * than lfence with MSR_DE_CFG. */ if (cpu_vendor == CPUVENDOR_AMD) use_mfence = true; else if (cpu_vendor == CPUVENDOR_INTEL) use_lfence = true; /* LFENCE and MFENCE are applicable if SSE2 is set. */ if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0) use_lfence = use_mfence = false; #define TSC_SETFUNC(fence) \ do { \ rdtsc = rdtsc_##fence; \ cpu_counter = cpu_counter_##fence; \ cpu_counter32 = cpu_counter32_##fence; \ } while (/* CONSTCOND */ 0) if (use_lfence) TSC_SETFUNC(lfence); else if (use_mfence) TSC_SETFUNC(mfence); else TSC_SETFUNC(cpuid); aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n", use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid")); } /* * Initialize timecounter(9) and DELAY() function of TSC. * * This function is called after all secondary processors were brought up * and drift has been measured, and after any other potential delay funcs * have been installed (e.g. lapic_delay()). */ void tsc_tc_init(void) { struct cpu_info *ci; bool invariant; if (!cpu_hascounter()) return; ci = curcpu(); tsc_freq = ci->ci_data.cpu_cc_freq; invariant = tsc_is_invariant(); if (!invariant) { aprint_debug("TSC not known invariant on this CPU\n"); tsc_timecounter.tc_quality = -100; } else if (tsc_drift_observed > tsc_drift_max) { aprint_error("ERROR: %lld cycle TSC drift observed\n", (long long)tsc_drift_observed); tsc_timecounter.tc_quality = -100; invariant = false; } else if (vm_guest == VM_GUEST_NO) { delay_func = tsc_delay; } else if (vm_guest == VM_GUEST_VIRTUALBOX) { tsc_timecounter.tc_quality = -100; } if (tsc_freq != 0) { tsc_timecounter.tc_frequency = tsc_freq; tc_init(&tsc_timecounter); } } /* * Record drift (in clock cycles). Called during AP startup. */ void tsc_sync_drift(int64_t drift) { if (drift < 0) drift = -drift; if (drift > tsc_drift_observed) tsc_drift_observed = drift; } /* * Called during startup of APs, by the boot processor. Interrupts * are disabled on entry. */ static void __noinline tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp) { uint64_t bptsc; if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) { panic("tsc_sync_bp: 1"); } /* Prepare a cache miss for the other side. */ (void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0); /* Flag our readiness. */ atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC); /* Wait for other side then read our TSC. */ while ((ci->ci_flags & CPUF_SYNCTSC) != 0) { __insn_barrier(); } bptsc = rdtsc(); /* Wait for the results to come in. */ while (tsc_sync_cpu == ci) { x86_pause(); } if (tsc_sync_cpu != NULL) { panic("tsc_sync_bp: 2"); } *bptscp = bptsc; *aptscp = tsc_sync_val; } void tsc_sync_bp(struct cpu_info *ci) { int64_t bptsc, aptsc, val, diff; if (!cpu_hascounter()) return; val = INT64_MAX; for (int i = 0; i < TSC_SYNC_ROUNDS; i++) { tsc_read_bp(ci, &bptsc, &aptsc); diff = bptsc - aptsc; if (ABS(diff) < ABS(val)) { val = diff; } } ci->ci_data.cpu_cc_skew = val; } /* * Called during startup of AP, by the AP itself. Interrupts are * disabled on entry. */ static void __noinline tsc_post_ap(struct cpu_info *ci) { uint64_t tsc; /* Wait for go-ahead from primary. 
*/ while ((ci->ci_flags & CPUF_SYNCTSC) == 0) { __insn_barrier(); } /* Instruct primary to read its counter. */ atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC); /* Suffer a cache miss, then read TSC. */ __insn_barrier(); tsc = tsc_dummy_cacheline; __insn_barrier(); tsc += rdtsc(); /* Post result. Ensure the whole value goes out atomically. */ (void)atomic_swap_64(&tsc_sync_val, tsc); if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) { panic("tsc_sync_ap"); } } void tsc_sync_ap(struct cpu_info *ci) { if (!cpu_hascounter()) return; for (int i = 0; i < TSC_SYNC_ROUNDS; i++) { tsc_post_ap(ci); } } static void tsc_apply_cpu(void *arg1, void *arg2) { bool enable = arg1 != NULL; if (enable) { lcr4(rcr4() & ~CR4_TSD); } else { lcr4(rcr4() | CR4_TSD); } } void tsc_user_enable(void) { uint64_t xc; xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL); xc_wait(xc); } void tsc_user_disable(void) { uint64_t xc; xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL); xc_wait(xc); } uint64_t cpu_frequency(struct cpu_info *ci) { return ci->ci_data.cpu_cc_freq; } int cpu_hascounter(void) { return cpu_feature[0] & CPUID_TSC; } static void tsc_delay(unsigned int us) { uint64_t start, delta; start = cpu_counter(); delta = (uint64_t)us * tsc_freq / 1000000; while ((cpu_counter() - start) < delta) { x86_pause(); } } static u_int tsc_get_timecount(struct timecounter *tc) { #if defined(_LP64) && defined(DIAGNOSTIC) /* requires atomic 64-bit store */ static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED; static int lastwarn; uint64_t cur, prev; lwp_t *l = curlwp; int ticks; /* * Previous value must be read before the counter and stored to * after, because this routine can be called from interrupt context * and may run over the top of an existing invocation. Ordering is * guaranteed by "volatile" on md_tsc. */ prev = l->l_md.md_tsc; cur = cpu_counter(); if (__predict_false(cur < prev) && (cur >> 63) == (prev >> 63) && __cpu_simple_lock_try(&lock)) { ticks = getticks(); if (ticks - lastwarn >= hz) { printf( "WARNING: %s TSC went backwards by %u - " "change sysctl(7) kern.timecounter?\n", cpu_name(curcpu()), (unsigned)(prev - cur)); lastwarn = ticks; } __cpu_simple_unlock(&lock); } l->l_md.md_tsc = cur; return (uint32_t)cur; #else return cpu_counter32(); #endif } /* * tsc has been reset; zero the cached tsc of every lwp in the system * so we don't spuriously report that the tsc has gone backward. * Caller must ensure all LWPs are quiescent (except the current one, * obviously) and interrupts are blocked while we update this. */ void tsc_tc_reset(void) { struct lwp *l; LIST_FOREACH(l, &alllwp, l_list) l->l_md.md_tsc = 0; }
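tsc_delay() above converts a microsecond delay into a cycle budget with delta = us * tsc_freq / 1000000 and spins on the counter; for example, at tsc_freq = 2 GHz a 50 us delay becomes a budget of 100,000 cycles. A standalone sketch of the same arithmetic, with read_cycle_counter() as a hypothetical stand-in for cpu_counter():

#include <stdint.h>

extern uint64_t read_cycle_counter(void);	/* hypothetical counter read */

static void
busy_delay_us(unsigned int us, uint64_t counter_freq_hz)
{
	uint64_t start = read_cycle_counter();
	uint64_t budget = (uint64_t)us * counter_freq_hz / 1000000;

	/* Unsigned subtraction handles counter wrap-around naturally. */
	while (read_cycle_counter() - start < budget)
		continue;
}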
/* $NetBSD: subr_psref.c,v 1.18 2022/02/12 16:31:06 macallan Exp $ */ /*- * Copyright (c) 2016 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Passive references * * Passive references are references to objects that guarantee the * object will not be destroyed until the reference is released. * * Passive references require no interprocessor synchronization to * acquire or release. However, destroying the target of passive * references requires expensive interprocessor synchronization -- * xcalls to determine on which CPUs the object is still in use. * * Passive references may be held only on a single CPU and by a * single LWP. They require the caller to allocate a little stack * space, a struct psref object. Sleeping while a passive * reference is held is allowed, provided that the owner's LWP is * bound to a CPU -- e.g., the owner is a softint or a bound * kthread. However, sleeping should be kept to a short duration, * e.g. sleeping on an adaptive lock. * * Passive references serve as an intermediate stage between * reference counting and passive serialization (pserialize(9)): * * - If you need references to transfer from CPU to CPU or LWP to * LWP, or if you need long-term references, you must use * reference counting, e.g. with atomic operations or locks, * which incurs interprocessor synchronization for every use -- * cheaper than an xcall, but not scalable. * * - If all users *guarantee* that they will not sleep, then it is * not necessary to use passive references: you may as well just * use the even cheaper pserialize(9), because you have * satisfied the requirements of a pserialize read section. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_psref.c,v 1.18 2022/02/12 16:31:06 macallan Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/condvar.h> #include <sys/cpu.h> #include <sys/intr.h> #include <sys/kmem.h> #include <sys/lwp.h> #include <sys/mutex.h> #include <sys/percpu.h> #include <sys/psref.h> #include <sys/queue.h> #include <sys/xcall.h> #include <sys/lwp.h> SLIST_HEAD(psref_head, psref); static bool _psref_held(const struct psref_target *, struct psref_class *, bool); /* * struct psref_class * * Private global state for a class of passive reference targets. * Opaque to callers. */ struct psref_class { kmutex_t prc_lock; kcondvar_t prc_cv; struct percpu *prc_percpu; /* struct psref_cpu */ ipl_cookie_t prc_iplcookie; unsigned int prc_xc_flags; }; /* * struct psref_cpu * * Private per-CPU state for a class of passive reference targets. * Not exposed by the API. */ struct psref_cpu { struct psref_head pcpu_head; }; /* * Data structures and functions for debugging. 
*/ #ifndef PSREF_DEBUG_NITEMS #define PSREF_DEBUG_NITEMS 16 #endif struct psref_debug_item { void *prdi_caller; struct psref *prdi_psref; }; struct psref_debug { int prd_refs_peek; struct psref_debug_item prd_items[PSREF_DEBUG_NITEMS]; }; #ifdef PSREF_DEBUG static void psref_debug_acquire(struct psref *); static void psref_debug_release(struct psref *); static void psref_debug_lwp_free(void *); static specificdata_key_t psref_debug_lwp_key; #endif /* * psref_init() */ void psref_init(void) { #ifdef PSREF_DEBUG lwp_specific_key_create(&psref_debug_lwp_key, psref_debug_lwp_free); #endif } /* * psref_class_create(name, ipl) * * Create a new passive reference class, with the given wchan name * and ipl. */ struct psref_class * psref_class_create(const char *name, int ipl) { struct psref_class *class; ASSERT_SLEEPABLE(); class = kmem_alloc(sizeof(*class), KM_SLEEP); class->prc_percpu = percpu_alloc(sizeof(struct psref_cpu)); mutex_init(&class->prc_lock, MUTEX_DEFAULT, ipl); cv_init(&class->prc_cv, name); class->prc_iplcookie = makeiplcookie(ipl); class->prc_xc_flags = XC_HIGHPRI_IPL(ipl); return class; } static void __diagused psref_cpu_drained_p(void *p, void *cookie, struct cpu_info *ci __unused) { const struct psref_cpu *pcpu = p; bool *retp = cookie; if (!SLIST_EMPTY(&pcpu->pcpu_head)) *retp = false; } static bool __diagused psref_class_drained_p(const struct psref_class *prc) { bool ret = true; percpu_foreach(prc->prc_percpu, &psref_cpu_drained_p, &ret); return ret; } /* * psref_class_destroy(class) * * Destroy a passive reference class and free memory associated * with it. All targets in this class must have been drained and * destroyed already. */ void psref_class_destroy(struct psref_class *class) { KASSERT(psref_class_drained_p(class)); cv_destroy(&class->prc_cv); mutex_destroy(&class->prc_lock); percpu_free(class->prc_percpu, sizeof(struct psref_cpu)); kmem_free(class, sizeof(*class)); } /* * psref_target_init(target, class) * * Initialize a passive reference target in the specified class. * The caller is responsible for issuing a membar_producer after * psref_target_init and before exposing a pointer to the target * to other CPUs. */ void psref_target_init(struct psref_target *target, struct psref_class *class) { target->prt_class = class; target->prt_draining = false; } #ifdef DEBUG static bool psref_exist(struct psref_cpu *pcpu, struct psref *psref) { struct psref *_psref; SLIST_FOREACH(_psref, &pcpu->pcpu_head, psref_entry) { if (_psref == psref) return true; } return false; } static void psref_check_duplication(struct psref_cpu *pcpu, struct psref *psref, const struct psref_target *target) { bool found = false; found = psref_exist(pcpu, psref); if (found) { panic("The psref is already in the list (acquiring twice?): " "psref=%p target=%p", psref, target); } } static void psref_check_existence(struct psref_cpu *pcpu, struct psref *psref, const struct psref_target *target) { bool found = false; found = psref_exist(pcpu, psref); if (!found) { panic("The psref isn't in the list (releasing unused psref?): " "psref=%p target=%p", psref, target); } } #endif /* DEBUG */ /* * psref_acquire(psref, target, class) * * Acquire a passive reference to the specified target, which must * be in the specified class. * * The caller must guarantee that the target will not be destroyed * before psref_acquire returns. 
* * The caller must additionally guarantee that it will not switch * CPUs before releasing the passive reference, either by * disabling kpreemption and avoiding sleeps, or by being in a * softint or in an LWP bound to a CPU. */ void psref_acquire(struct psref *psref, const struct psref_target *target, struct psref_class *class) { struct psref_cpu *pcpu; int s; KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() || ISSET(curlwp->l_pflag, LP_BOUND)), "passive references are CPU-local," " but preemption is enabled and the caller is not" " in a softint or CPU-bound LWP"); KASSERTMSG(!target->prt_draining, "psref target already destroyed: %p", target); KASSERTMSG((target->prt_class == class), "mismatched psref target class: %p (ref) != %p (expected)", target->prt_class, class); /* Block interrupts and acquire the current CPU's reference list. */ s = splraiseipl(class->prc_iplcookie); pcpu = percpu_getref(class->prc_percpu); #ifdef DEBUG /* Sanity-check if the target is already acquired with the same psref. */ psref_check_duplication(pcpu, psref, target); #endif /* Record our reference. */ SLIST_INSERT_HEAD(&pcpu->pcpu_head, psref, psref_entry); psref->psref_target = target; psref->psref_lwp = curlwp; psref->psref_cpu = curcpu(); /* Release the CPU list and restore interrupts. */ percpu_putref(class->prc_percpu); splx(s); #if defined(DIAGNOSTIC) || defined(PSREF_DEBUG) curlwp->l_psrefs++; #endif #ifdef PSREF_DEBUG psref_debug_acquire(psref); #endif } /* * psref_release(psref, target, class) * * Release a passive reference to the specified target, which must * be in the specified class. * * The caller must not have switched CPUs or LWPs since acquiring * the passive reference. */ void psref_release(struct psref *psref, const struct psref_target *target, struct psref_class *class) { struct psref_cpu *pcpu; int s; KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() || ISSET(curlwp->l_pflag, LP_BOUND)), "passive references are CPU-local," " but preemption is enabled and the caller is not" " in a softint or CPU-bound LWP"); KASSERTMSG((target->prt_class == class), "mismatched psref target class: %p (ref) != %p (expected)", target->prt_class, class); /* Make sure the psref looks sensible. */ KASSERTMSG((psref->psref_target == target), "passive reference target mismatch: %p (ref) != %p (expected)", psref->psref_target, target); KASSERTMSG((psref->psref_lwp == curlwp), "passive reference transferred from lwp %p to lwp %p", psref->psref_lwp, curlwp); KASSERTMSG((psref->psref_cpu == curcpu()), "passive reference transferred from CPU %u to CPU %u", cpu_index(psref->psref_cpu), cpu_index(curcpu())); /* * Block interrupts and remove the psref from the current CPU's * list. No need to percpu_getref or get the head of the list, * and the caller guarantees that we are bound to a CPU anyway * (as does blocking interrupts). */ s = splraiseipl(class->prc_iplcookie); pcpu = percpu_getref(class->prc_percpu); #ifdef DEBUG /* Sanity-check if the target is surely acquired before. */ psref_check_existence(pcpu, psref, target); #endif SLIST_REMOVE(&pcpu->pcpu_head, psref, psref, psref_entry); percpu_putref(class->prc_percpu); splx(s); #if defined(DIAGNOSTIC) || defined(PSREF_DEBUG) KASSERT(curlwp->l_psrefs > 0); curlwp->l_psrefs--; #endif #ifdef PSREF_DEBUG psref_debug_release(psref); #endif /* If someone is waiting for users to drain, notify 'em. 
*/ if (__predict_false(target->prt_draining)) cv_broadcast(&class->prc_cv); } /* * psref_copy(pto, pfrom, class) * * Copy a passive reference from pfrom, which must be in the * specified class, to pto. Both pfrom and pto must later be * released with psref_release. * * The caller must not have switched CPUs or LWPs since acquiring * pfrom, and must not switch CPUs or LWPs before releasing both * pfrom and pto. */ void psref_copy(struct psref *pto, const struct psref *pfrom, struct psref_class *class) { struct psref_cpu *pcpu; int s; KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() || ISSET(curlwp->l_pflag, LP_BOUND)), "passive references are CPU-local," " but preemption is enabled and the caller is not" " in a softint or CPU-bound LWP"); KASSERTMSG((pto != pfrom), "can't copy passive reference to itself: %p", pto); /* Make sure the pfrom reference looks sensible. */ KASSERTMSG((pfrom->psref_lwp == curlwp), "passive reference transferred from lwp %p to lwp %p", pfrom->psref_lwp, curlwp); KASSERTMSG((pfrom->psref_cpu == curcpu()), "passive reference transferred from CPU %u to CPU %u", cpu_index(pfrom->psref_cpu), cpu_index(curcpu())); KASSERTMSG((pfrom->psref_target->prt_class == class), "mismatched psref target class: %p (ref) != %p (expected)", pfrom->psref_target->prt_class, class); /* Block interrupts and acquire the current CPU's reference list. */ s = splraiseipl(class->prc_iplcookie); pcpu = percpu_getref(class->prc_percpu); /* Record the new reference. */ SLIST_INSERT_HEAD(&pcpu->pcpu_head, pto, psref_entry); pto->psref_target = pfrom->psref_target; pto->psref_lwp = curlwp; pto->psref_cpu = curcpu(); /* Release the CPU list and restore interrupts. */ percpu_putref(class->prc_percpu); splx(s); #if defined(DIAGNOSTIC) || defined(PSREF_DEBUG) curlwp->l_psrefs++; #endif } /* * struct psreffed * * Global state for draining a psref target. */ struct psreffed { struct psref_class *class; struct psref_target *target; bool ret; }; static void psreffed_p_xc(void *cookie0, void *cookie1 __unused) { struct psreffed *P = cookie0; /* * If we hold a psref to the target, then answer true. * * This is the only dynamic decision that may be made with * psref_held. * * No need to lock anything here: every write transitions from * false to true, so there can be no conflicting writes. No * need for a memory barrier here because P->ret is read only * after xc_wait, which has already issued any necessary memory * barriers. */ if (_psref_held(P->target, P->class, true)) P->ret = true; } static bool psreffed_p(struct psref_target *target, struct psref_class *class) { struct psreffed P = { .class = class, .target = target, .ret = false, }; if (__predict_true(mp_online)) { /* * Ask all CPUs to say whether they hold a psref to the * target. */ xc_wait(xc_broadcast(class->prc_xc_flags, &psreffed_p_xc, &P, NULL)); } else psreffed_p_xc(&P, NULL); return P.ret; } /* * psref_target_destroy(target, class) * * Destroy a passive reference target. Waits for all existing * references to drain. Caller must guarantee no new references * will be acquired once it calls psref_target_destroy, e.g. by * removing the target from a global list first. May sleep. */ void psref_target_destroy(struct psref_target *target, struct psref_class *class) { ASSERT_SLEEPABLE(); KASSERTMSG(!target->prt_draining, "psref target already destroyed: %p", target); KASSERTMSG((target->prt_class == class), "mismatched psref target class: %p (ref) != %p (expected)", target->prt_class, class); /* Request psref_release to notify us when done. 
*/ target->prt_draining = true; /* Wait until there are no more references on any CPU. */ while (psreffed_p(target, class)) { /* * This enter/wait/exit business looks wrong, but it is * both necessary, because psreffed_p performs a * low-priority xcall and hence cannot run while a * mutex is locked, and OK, because the wait is timed * -- explicit wakeups are only an optimization. */ mutex_enter(&class->prc_lock); (void)cv_timedwait(&class->prc_cv, &class->prc_lock, 1); mutex_exit(&class->prc_lock); } /* No more references. Cause subsequent psref_acquire to kassert. */ target->prt_class = NULL; } static bool _psref_held(const struct psref_target *target, struct psref_class *class, bool lwp_mismatch_ok) { const struct psref_cpu *pcpu; const struct psref *psref; int s; bool held = false; KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() || ISSET(curlwp->l_pflag, LP_BOUND)), "passive references are CPU-local," " but preemption is enabled and the caller is not" " in a softint or CPU-bound LWP"); KASSERTMSG((target->prt_class == class), "mismatched psref target class: %p (ref) != %p (expected)", target->prt_class, class); /* Block interrupts and acquire the current CPU's reference list. */ s = splraiseipl(class->prc_iplcookie); pcpu = percpu_getref(class->prc_percpu); /* Search through all the references on this CPU. */ SLIST_FOREACH(psref, &pcpu->pcpu_head, psref_entry) { /* Sanity-check the reference's CPU. */ KASSERTMSG((psref->psref_cpu == curcpu()), "passive reference transferred from CPU %u to CPU %u", cpu_index(psref->psref_cpu), cpu_index(curcpu())); /* If it doesn't match, skip it and move on. */ if (psref->psref_target != target) continue; /* * Sanity-check the reference's LWP if we are asserting * via psref_held that this LWP holds it, but not if we * are testing in psref_target_destroy whether any LWP * still holds it. */ KASSERTMSG((lwp_mismatch_ok || psref->psref_lwp == curlwp), "passive reference transferred from lwp %p to lwp %p", psref->psref_lwp, curlwp); /* Stop here and report that we found it. */ held = true; break; } /* Release the CPU list and restore interrupts. */ percpu_putref(class->prc_percpu); splx(s); return held; } /* * psref_held(target, class) * * True if the current CPU holds a passive reference to target, * false otherwise. May be used only inside assertions. 
*/ bool psref_held(const struct psref_target *target, struct psref_class *class) { return _psref_held(target, class, false); } #ifdef PSREF_DEBUG void psref_debug_init_lwp(struct lwp *l) { struct psref_debug *prd; prd = kmem_zalloc(sizeof(*prd), KM_SLEEP); lwp_setspecific_by_lwp(l, psref_debug_lwp_key, prd); } static void psref_debug_lwp_free(void *arg) { struct psref_debug *prd = arg; kmem_free(prd, sizeof(*prd)); } static void psref_debug_acquire(struct psref *psref) { struct psref_debug *prd; struct lwp *l = curlwp; int s, i; prd = lwp_getspecific(psref_debug_lwp_key); if (__predict_false(prd == NULL)) { psref->psref_debug = NULL; return; } s = splserial(); if (l->l_psrefs > prd->prd_refs_peek) { prd->prd_refs_peek = l->l_psrefs; if (__predict_false(prd->prd_refs_peek > PSREF_DEBUG_NITEMS)) panic("exceeded PSREF_DEBUG_NITEMS"); } for (i = 0; i < prd->prd_refs_peek; i++) { struct psref_debug_item *prdi = &prd->prd_items[i]; if (prdi->prdi_psref != NULL) continue; prdi->prdi_caller = psref->psref_debug; prdi->prdi_psref = psref; psref->psref_debug = prdi; break; } if (__predict_false(i == prd->prd_refs_peek)) panic("out of range: %d", i); splx(s); } static void psref_debug_release(struct psref *psref) { int s; s = splserial(); if (__predict_true(psref->psref_debug != NULL)) { struct psref_debug_item *prdi = psref->psref_debug; prdi->prdi_psref = NULL; } splx(s); } void psref_debug_barrier(void) { struct psref_debug *prd; struct lwp *l = curlwp; int s, i; prd = lwp_getspecific(psref_debug_lwp_key); if (__predict_false(prd == NULL)) return; s = splserial(); for (i = 0; i < prd->prd_refs_peek; i++) { struct psref_debug_item *prdi = &prd->prd_items[i]; if (__predict_true(prdi->prdi_psref == NULL)) continue; panic("psref leaked: lwp(%p) acquired at %p", l, prdi->prdi_caller); } prd->prd_refs_peek = 0; /* Reset the counter */ splx(s); } #endif /* PSREF_DEBUG */
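The psref(9) routines above are easiest to see from a caller's side. A hedged usage sketch follows; the frobber structure, its lookup path, and frobber_psref_class are hypothetical, while the psref calls and the stay-on-one-CPU rule come from the comments in this file. A real object would first be set up with psref_target_init(&f->f_psref, frobber_psref_class) before being published.

#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/psref.h>

struct frobber {
	struct psref_target	f_psref;	/* embedded passive-ref target */
	int			f_value;
};

/* Hypothetical; created once with psref_class_create("frobber", IPL_SOFTNET). */
static struct psref_class *frobber_psref_class;

/* Reader: take a passive reference without leaving this CPU. */
static int
frobber_read_value(struct frobber *f)
{
	struct psref psref;
	int value;

	kpreempt_disable();	/* stay bound to this CPU; don't sleep here */
	psref_acquire(&psref, &f->f_psref, frobber_psref_class);
	value = f->f_value;	/* target cannot be destroyed while held */
	psref_release(&psref, &f->f_psref, frobber_psref_class);
	kpreempt_enable();

	return value;
}

/* Writer: after unlinking f from every lookup structure, drain and free. */
static void
frobber_destroy(struct frobber *f)
{
	psref_target_destroy(&f->f_psref, frobber_psref_class);   /* may sleep */
	kmem_free(f, sizeof(*f));
}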
/* $NetBSD: pslist.h,v 1.7 2019/12/01 15:28:19 riastradh Exp $ */ /*- * Copyright (c) 2016 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _SYS_PSLIST_H #define _SYS_PSLIST_H #include <sys/param.h> #include <sys/atomic.h> struct pslist_head; struct pslist_entry; struct pslist_head { struct pslist_entry *plh_first; }; struct pslist_entry { struct pslist_entry **ple_prevp; struct pslist_entry *ple_next; }; #ifdef _KERNEL #define _PSLIST_ASSERT KASSERT #else #include <assert.h> #define _PSLIST_ASSERT assert #endif #define _PSLIST_POISON ((void *)1ul) /* * Initialization. Allowed only when the caller has exclusive access, * excluding writers and readers.
*/ static __inline void pslist_init(struct pslist_head *head) { head->plh_first = NULL; /* not yet published, so no atomic */ } static __inline void pslist_destroy(struct pslist_head *head __diagused) { _PSLIST_ASSERT(head->plh_first == NULL); } static __inline void pslist_entry_init(struct pslist_entry *entry) { entry->ple_next = NULL; entry->ple_prevp = NULL; } static __inline void pslist_entry_destroy(struct pslist_entry *entry) { _PSLIST_ASSERT(entry->ple_prevp == NULL); /* * Poison the next entry. If we used NULL here, then readers * would think they were simply at the end of the list. * Instead, cause readers to crash. */ atomic_store_relaxed(&entry->ple_next, _PSLIST_POISON); } /* * Writer operations. Caller must exclude other writers, but not * necessarily readers. * * Writes to initialize a new entry must precede its publication by * writing to plh_first / ple_next / *ple_prevp. * * The ple_prevp field is serialized by the caller's exclusive lock and * not read by readers, and hence its ordering relative to the internal * memory barriers is inconsequential. */ static __inline void pslist_writer_insert_head(struct pslist_head *head, struct pslist_entry *new) { _PSLIST_ASSERT(head->plh_first == NULL || head->plh_first->ple_prevp == &head->plh_first); _PSLIST_ASSERT(new->ple_next == NULL); _PSLIST_ASSERT(new->ple_prevp == NULL); new->ple_prevp = &head->plh_first; new->ple_next = head->plh_first; /* not yet published, so no atomic */ if (head->plh_first != NULL) head->plh_first->ple_prevp = &new->ple_next; atomic_store_release(&head->plh_first, new); } static __inline void pslist_writer_insert_before(struct pslist_entry *entry, struct pslist_entry *new) { _PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON); _PSLIST_ASSERT(entry->ple_prevp != NULL); _PSLIST_ASSERT(*entry->ple_prevp == entry); _PSLIST_ASSERT(new->ple_next == NULL); _PSLIST_ASSERT(new->ple_prevp == NULL); new->ple_prevp = entry->ple_prevp; new->ple_next = entry; /* not yet published, so no atomic */ /* * Pairs with atomic_load_consume in pslist_reader_first or * pslist_reader_next. */ atomic_store_release(entry->ple_prevp, new); entry->ple_prevp = &new->ple_next; } static __inline void pslist_writer_insert_after(struct pslist_entry *entry, struct pslist_entry *new) { _PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON); _PSLIST_ASSERT(entry->ple_prevp != NULL); _PSLIST_ASSERT(*entry->ple_prevp == entry); _PSLIST_ASSERT(new->ple_next == NULL); _PSLIST_ASSERT(new->ple_prevp == NULL); new->ple_prevp = &entry->ple_next; new->ple_next = entry->ple_next; /* not yet published, so no atomic */ if (new->ple_next != NULL) new->ple_next->ple_prevp = &new->ple_next; /* Pairs with atomic_load_consume in pslist_reader_next. */ atomic_store_release(&entry->ple_next, new); } static __inline void pslist_writer_remove(struct pslist_entry *entry) { _PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON); _PSLIST_ASSERT(entry->ple_prevp != NULL); _PSLIST_ASSERT(*entry->ple_prevp == entry); if (entry->ple_next != NULL) entry->ple_next->ple_prevp = entry->ple_prevp; /* * No need for atomic_store_release because there's no * initialization that this must happen after -- the store * transitions from a good state with the entry to a good state * without the entry, both of which are valid for readers to * witness. */ atomic_store_relaxed(entry->ple_prevp, entry->ple_next); entry->ple_prevp = NULL; /* * Leave entry->ple_next intact so that any extant readers can * continue iterating through the list. The caller must then * wait for readers to drain, e.g. 
with pserialize_perform, * before destroying and reusing the entry. */ } static __inline struct pslist_entry * pslist_writer_first(const struct pslist_head *head) { return head->plh_first; } static __inline struct pslist_entry * pslist_writer_next(const struct pslist_entry *entry) { _PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON); return entry->ple_next; } static __inline void * _pslist_writer_first_container(const struct pslist_head *head, const ptrdiff_t offset) { struct pslist_entry *first = head->plh_first; return (first == NULL ? NULL : (char *)first - offset); } static __inline void * _pslist_writer_next_container(const struct pslist_entry *entry, const ptrdiff_t offset) { struct pslist_entry *next = entry->ple_next; _PSLIST_ASSERT(next != _PSLIST_POISON); return (next == NULL ? NULL : (char *)next - offset); } /* * Reader operations. Caller must block pserialize_perform or * equivalent and be bound to a CPU. Only plh_first/ple_next may be * read, and only with consuming memory order so that data-dependent * loads happen afterward. */ static __inline struct pslist_entry * pslist_reader_first(const struct pslist_head *head) { /* * Pairs with atomic_store_release in pslist_writer_insert_head * or pslist_writer_insert_before. */ return atomic_load_consume(&head->plh_first); } static __inline struct pslist_entry * pslist_reader_next(const struct pslist_entry *entry) { /* * Pairs with atomic_store_release in * pslist_writer_insert_before or pslist_writer_insert_after. */ struct pslist_entry *next = atomic_load_consume(&entry->ple_next); _PSLIST_ASSERT(next != _PSLIST_POISON); return next; } static __inline void * _pslist_reader_first_container(const struct pslist_head *head, const ptrdiff_t offset) { struct pslist_entry *first = pslist_reader_first(head); if (first == NULL) return NULL; return (char *)first - offset; } static __inline void * _pslist_reader_next_container(const struct pslist_entry *entry, const ptrdiff_t offset) { struct pslist_entry *next = pslist_reader_next(entry); if (next == NULL) return NULL; return (char *)next - offset; } /* * Type-safe macros for convenience. 
*/ #if defined(__COVERITY__) || defined(__LGTM_BOT__) #define _PSLIST_VALIDATE_PTRS(P, Q) 0 #define _PSLIST_VALIDATE_CONTAINER(P, T, F) 0 #else #define _PSLIST_VALIDATE_PTRS(P, Q) \ (0 * sizeof((P) - (Q)) * sizeof(*(P)) * sizeof(*(Q))) #define _PSLIST_VALIDATE_CONTAINER(P, T, F) \ (0 * sizeof((P) - &((T *)(((char *)(P)) - offsetof(T, F)))->F)) #endif #define PSLIST_INITIALIZER { .plh_first = NULL } #define PSLIST_ENTRY_INITIALIZER { .ple_next = NULL, .ple_prevp = NULL } #define PSLIST_INIT(H) pslist_init((H)) #define PSLIST_DESTROY(H) pslist_destroy((H)) #define PSLIST_ENTRY_INIT(E, F) pslist_entry_init(&(E)->F) #define PSLIST_ENTRY_DESTROY(E, F) pslist_entry_destroy(&(E)->F) #define PSLIST_WRITER_INSERT_HEAD(H, V, F) \ pslist_writer_insert_head((H), &(V)->F) #define PSLIST_WRITER_INSERT_BEFORE(E, N, F) \ pslist_writer_insert_before(&(E)->F + _PSLIST_VALIDATE_PTRS(E, N), \ &(N)->F) #define PSLIST_WRITER_INSERT_AFTER(E, N, F) \ pslist_writer_insert_after(&(E)->F + _PSLIST_VALIDATE_PTRS(E, N), \ &(N)->F) #define PSLIST_WRITER_REMOVE(E, F) \ pslist_writer_remove(&(E)->F) #define PSLIST_WRITER_FIRST(H, T, F) \ ((T *)(_pslist_writer_first_container((H), offsetof(T, F))) + \ _PSLIST_VALIDATE_CONTAINER(pslist_writer_first(H), T, F)) #define PSLIST_WRITER_NEXT(V, T, F) \ ((T *)(_pslist_writer_next_container(&(V)->F, offsetof(T, F))) + \ _PSLIST_VALIDATE_CONTAINER(pslist_writer_next(&(V)->F), T, F)) #define PSLIST_WRITER_FOREACH(V, H, T, F) \ for ((V) = PSLIST_WRITER_FIRST((H), T, F); \ (V) != NULL; \ (V) = PSLIST_WRITER_NEXT((V), T, F)) #define PSLIST_READER_FIRST(H, T, F) \ ((T *)(_pslist_reader_first_container((H), offsetof(T, F))) + \ _PSLIST_VALIDATE_CONTAINER(pslist_reader_first(H), T, F)) #define PSLIST_READER_NEXT(V, T, F) \ ((T *)(_pslist_reader_next_container(&(V)->F, offsetof(T, F))) + \ _PSLIST_VALIDATE_CONTAINER(pslist_reader_next(&(V)->F), T, F)) #define PSLIST_READER_FOREACH(V, H, T, F) \ for ((V) = PSLIST_READER_FIRST((H), T, F); \ (V) != NULL; \ (V) = PSLIST_READER_NEXT((V), T, F)) #endif /* _SYS_PSLIST_H */
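The macros above are meant to be paired with pserialize(9) on the read side and an ordinary lock on the write side. A hedged sketch of that pattern follows; the widget structure, widget_lock, and widget_psz are hypothetical (the lock initialized with mutex_init() and the handle with pserialize_create()), while the PSLIST_* and pserialize_* calls are the ones defined here and in pserialize(9).

#include <sys/mutex.h>
#include <sys/pserialize.h>
#include <sys/pslist.h>

struct widget {
	struct pslist_entry	w_entry;
	int			w_key;
};

static struct pslist_head	widget_list = PSLIST_INITIALIZER;
static kmutex_t			widget_lock;	/* serializes writers */
static pserialize_t		widget_psz;	/* hypothetical handle */

/* Writer: publish a new widget while readers may already be iterating. */
static void
widget_insert(struct widget *w)
{
	PSLIST_ENTRY_INIT(w, w_entry);
	mutex_enter(&widget_lock);
	PSLIST_WRITER_INSERT_HEAD(&widget_list, w, w_entry);
	mutex_exit(&widget_lock);
}

/* Reader: look up a widget inside a pserialize read section. */
static struct widget *
widget_lookup(int key)
{
	struct widget *w;
	int s;

	s = pserialize_read_enter();
	PSLIST_READER_FOREACH(w, &widget_list, struct widget, w_entry) {
		if (w->w_key == key)
			break;
	}
	/* A real caller would take a reference (e.g. psref) before exiting. */
	pserialize_read_exit(s);
	return w;
}

/* Writer: unpublish, wait for readers to drain, then allow reuse/free. */
static void
widget_remove(struct widget *w)
{
	mutex_enter(&widget_lock);
	PSLIST_WRITER_REMOVE(w, w_entry);
	pserialize_perform(widget_psz);		/* wait out extant readers */
	mutex_exit(&widget_lock);
	PSLIST_ENTRY_DESTROY(w, w_entry);
}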
/* $NetBSD: init_sysctl.c,v 1.228 2023/09/09 16:01:09 christos Exp $ */ /*- * Copyright (c) 2003, 2007, 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved.
* * This code is derived from software contributed to The NetBSD Foundation * by Andrew Brown, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: init_sysctl.c,v 1.228 2023/09/09 16:01:09 christos Exp $"); #include "opt_sysv.h" #include "opt_compat_netbsd.h" #include "opt_modular.h" #include "opt_gprof.h" #include "pty.h" #include <sys/param.h> #include <sys/types.h> #include <dev/cons.h> #include <sys/conf.h> #include <sys/cprng.h> #include <sys/cpu.h> #include <sys/device.h> #include <sys/disklabel.h> #include <sys/errno.h> #include <sys/exec.h> #include <sys/filedesc.h> #include <sys/file.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/ktrace.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/reboot.h> #include <sys/resource.h> #include <sys/resourcevar.h> #include <sys/socketvar.h> #include <sys/stat.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/tty.h> #include <sys/unistd.h> #include <sys/vnode_impl.h> /* For vfs_drainvnodes(). */ int security_setidcore_dump; char security_setidcore_path[MAXPATHLEN] = "/var/crash/%n.core"; uid_t security_setidcore_owner = 0; gid_t security_setidcore_group = 0; mode_t security_setidcore_mode = (S_IRUSR|S_IWUSR); /* * Current status of SysV IPC capability. Initially, these are * 0 if the capability is not built-in to the kernel, but can * be updated if the appropriate kernel module is (auto)loaded. 
*/ int kern_has_sysvmsg = 0; int kern_has_sysvshm = 0; int kern_has_sysvsem = 0; static const u_int sysctl_lwpprflagmap[] = { LPR_DETACHED, L_DETACHED, 0 }; /* * try over estimating by 5 procs/lwps */ #define KERN_LWPSLOP (5 * sizeof(struct kinfo_lwp)) static int dcopyout(struct lwp *, const void *, void *, size_t); static int dcopyout(struct lwp *l, const void *kaddr, void *uaddr, size_t len) { int error; error = copyout(kaddr, uaddr, len); ktrmibio(-1, UIO_READ, uaddr, len, error); return error; } static int sysctl_kern_maxvnodes(SYSCTLFN_PROTO); static int sysctl_kern_messages(SYSCTLFN_PROTO); static int sysctl_kern_boottime(SYSCTLFN_PROTO); static int sysctl_kern_rtc_offset(SYSCTLFN_PROTO); static int sysctl_kern_maxproc(SYSCTLFN_PROTO); static int sysctl_kern_hostid(SYSCTLFN_PROTO); static int sysctl_kern_defcorename(SYSCTLFN_PROTO); static int sysctl_kern_cptime(SYSCTLFN_PROTO); #if NPTY > 0 static int sysctl_kern_maxptys(SYSCTLFN_PROTO); #endif /* NPTY > 0 */ static int sysctl_kern_lwp(SYSCTLFN_PROTO); static int sysctl_kern_forkfsleep(SYSCTLFN_PROTO); static int sysctl_kern_root_partition(SYSCTLFN_PROTO); static int sysctl_kern_drivers(SYSCTLFN_PROTO); static int sysctl_security_setidcore(SYSCTLFN_PROTO); static int sysctl_security_setidcorename(SYSCTLFN_PROTO); static int sysctl_kern_cpid(SYSCTLFN_PROTO); static int sysctl_hw_usermem(SYSCTLFN_PROTO); static int sysctl_hw_cnmagic(SYSCTLFN_PROTO); static void fill_lwp(struct lwp *l, struct kinfo_lwp *kl); /* * ******************************************************************** * section 1: setup routines * ******************************************************************** * These functions are stuffed into a link set for sysctl setup * functions. They're never called or referenced from anywhere else. 
* ******************************************************************** */ /* * this setup routine is a replacement for kern_sysctl() */ SYSCTL_SETUP(sysctl_kern_setup, "sysctl kern subtree setup") { extern int kern_logsigexit; /* defined in kern/kern_sig.c */ extern fixpt_t ccpu; /* defined in kern/kern_synch.c */ extern int dumponpanic; /* defined in kern/subr_prf.c */ const struct sysctlnode *rnode; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxvnodes", SYSCTL_DESCR("Maximum number of vnodes"), sysctl_kern_maxvnodes, 0, NULL, 0, CTL_KERN, KERN_MAXVNODES, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxproc", SYSCTL_DESCR("Maximum number of simultaneous processes"), sysctl_kern_maxproc, 0, NULL, 0, CTL_KERN, KERN_MAXPROC, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxfiles", SYSCTL_DESCR("Maximum number of open files"), NULL, 0, &maxfiles, 0, CTL_KERN, KERN_MAXFILES, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "argmax", SYSCTL_DESCR("Maximum number of bytes of arguments to " "execve(2)"), NULL, ARG_MAX, NULL, 0, CTL_KERN, KERN_ARGMAX, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_HEX, CTLTYPE_INT, "hostid", SYSCTL_DESCR("System host ID number"), sysctl_kern_hostid, 0, NULL, 0, CTL_KERN, KERN_HOSTID, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "vnode", SYSCTL_DESCR("System vnode table"), sysctl_kern_vnode, 0, NULL, 0, CTL_KERN, KERN_VNODE, CTL_EOL); #ifndef GPROF sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "profiling", SYSCTL_DESCR("Profiling information (not available)"), sysctl_notavail, 0, NULL, 0, CTL_KERN, KERN_PROF, CTL_EOL); #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "posix1version", SYSCTL_DESCR("Version of ISO/IEC 9945 (POSIX 1003.1) " "with which the operating system attempts " "to comply"), NULL, _POSIX_VERSION, NULL, 0, CTL_KERN, KERN_POSIX1, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "ngroups", SYSCTL_DESCR("Maximum number of supplemental groups"), NULL, NGROUPS_MAX, NULL, 0, CTL_KERN, KERN_NGROUPS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "job_control", SYSCTL_DESCR("Whether job control is available"), NULL, 1, NULL, 0, CTL_KERN, KERN_JOB_CONTROL, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "saved_ids", SYSCTL_DESCR("Whether POSIX saved set-group/user ID is " "available"), NULL, #ifdef _POSIX_SAVED_IDS 1, #else /* _POSIX_SAVED_IDS */ 0, #endif /* _POSIX_SAVED_IDS */ NULL, 0, CTL_KERN, KERN_SAVED_IDS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_HEX, CTLTYPE_INT, "boothowto", SYSCTL_DESCR("Flags from boot loader"), NULL, 0, &boothowto, sizeof(boothowto), CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "boottime", SYSCTL_DESCR("System boot time"), sysctl_kern_boottime, 0, NULL, sizeof(struct timespec), CTL_KERN, KERN_BOOTTIME, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "maxpartitions", SYSCTL_DESCR("Maximum number of partitions allowed per " "disk"), NULL, MAXPARTITIONS, NULL, 0, CTL_KERN, KERN_MAXPARTITIONS, CTL_EOL); 
sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "timex", NULL, sysctl_notavail, 0, NULL, 0, CTL_KERN, KERN_TIMEX, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "rtc_offset", SYSCTL_DESCR("Offset of real time clock from UTC in " "minutes"), sysctl_kern_rtc_offset, 0, &rtc_offset, 0, CTL_KERN, KERN_RTC_OFFSET, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "root_device", SYSCTL_DESCR("Name of the root device"), sysctl_root_device, 0, NULL, 0, CTL_KERN, KERN_ROOT_DEVICE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "fsync", SYSCTL_DESCR("Whether the POSIX 1003.1b File " "Synchronization Option is available on " "this system"), NULL, 1, NULL, 0, CTL_KERN, KERN_FSYNC, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ipc", SYSCTL_DESCR("SysV IPC options"), NULL, 0, NULL, 0, CTL_KERN, KERN_SYSVIPC, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, "sysvmsg", SYSCTL_DESCR("System V style message support available"), NULL, 0, &kern_has_sysvmsg, sizeof(int), CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_MSG, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, "sysvsem", SYSCTL_DESCR("System V style semaphore support " "available"), NULL, 0, &kern_has_sysvsem, sizeof(int), CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SEM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, "sysvshm", SYSCTL_DESCR("System V style shared memory support " "available"), NULL, 0, &kern_has_sysvshm, sizeof(int), CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "synchronized_io", SYSCTL_DESCR("Whether the POSIX 1003.1b Synchronized " "I/O Option is available on this system"), NULL, 1, NULL, 0, CTL_KERN, KERN_SYNCHRONIZED_IO, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "iov_max", SYSCTL_DESCR("Maximum number of iovec structures per " "process"), NULL, IOV_MAX, NULL, 0, CTL_KERN, KERN_IOV_MAX, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "mapped_files", SYSCTL_DESCR("Whether the POSIX 1003.1b Memory Mapped " "Files Option is available on this system"), NULL, 1, NULL, 0, CTL_KERN, KERN_MAPPED_FILES, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "memlock", SYSCTL_DESCR("Whether the POSIX 1003.1b Process Memory " "Locking Option is available on this " "system"), NULL, 1, NULL, 0, CTL_KERN, KERN_MEMLOCK, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "memlock_range", SYSCTL_DESCR("Whether the POSIX 1003.1b Range Memory " "Locking Option is available on this " "system"), NULL, 1, NULL, 0, CTL_KERN, KERN_MEMLOCK_RANGE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "memory_protection", SYSCTL_DESCR("Whether the POSIX 1003.1b Memory " "Protection Option is available on this " "system"), NULL, 1, NULL, 0, CTL_KERN, KERN_MEMORY_PROTECTION, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "login_name_max", SYSCTL_DESCR("Maximum login name length"), NULL, LOGIN_NAME_MAX, NULL, 0, CTL_KERN, KERN_LOGIN_NAME_MAX, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, 
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRING, "defcorename", SYSCTL_DESCR("Default core file name"), sysctl_kern_defcorename, 0, defcorename, MAXPATHLEN, CTL_KERN, KERN_DEFCORENAME, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "logsigexit", SYSCTL_DESCR("Log process exit when caused by signals"), NULL, 0, &kern_logsigexit, 0, CTL_KERN, KERN_LOGSIGEXIT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "fscale", SYSCTL_DESCR("Kernel fixed-point scale factor"), NULL, FSCALE, NULL, 0, CTL_KERN, KERN_FSCALE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "ccpu", SYSCTL_DESCR("Scheduler exponential decay value"), NULL, 0, &ccpu, 0, CTL_KERN, KERN_CCPU, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "cp_time", SYSCTL_DESCR("Clock ticks spent in different CPU states"), sysctl_kern_cptime, 0, NULL, 0, CTL_KERN, KERN_CP_TIME, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "consdev", SYSCTL_DESCR("Console device"), sysctl_consdev, 0, NULL, sizeof(dev_t), CTL_KERN, KERN_CONSDEV, CTL_EOL); #if NPTY > 0 sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "maxptys", SYSCTL_DESCR("Maximum number of pseudo-ttys"), sysctl_kern_maxptys, 0, NULL, 0, CTL_KERN, KERN_MAXPTYS, CTL_EOL); #endif /* NPTY > 0 */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "maxphys", SYSCTL_DESCR("Maximum raw I/O transfer size"), NULL, MAXPHYS, NULL, 0, CTL_KERN, KERN_MAXPHYS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "monotonic_clock", SYSCTL_DESCR("Implementation version of the POSIX " "1003.1b Monotonic Clock Option"), /* XXX _POSIX_VERSION */ NULL, _POSIX_MONOTONIC_CLOCK, NULL, 0, CTL_KERN, KERN_MONOTONIC_CLOCK, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "labelsector", SYSCTL_DESCR("Sector number containing the disklabel"), NULL, LABELSECTOR, NULL, 0, CTL_KERN, KERN_LABELSECTOR, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "labeloffset", SYSCTL_DESCR("Offset of the disklabel within the " "sector"), NULL, LABELOFFSET, NULL, 0, CTL_KERN, KERN_LABELOFFSET, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "labelusesmbr", SYSCTL_DESCR("disklabel is inside MBR partition"), NULL, LABELUSESMBR, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "lwp", SYSCTL_DESCR("System-wide LWP information"), sysctl_kern_lwp, 0, NULL, 0, CTL_KERN, KERN_LWP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "forkfsleep", SYSCTL_DESCR("Milliseconds to sleep on fork failure due " "to process limits"), sysctl_kern_forkfsleep, 0, NULL, 0, CTL_KERN, KERN_FORKFSLEEP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "posix_threads", SYSCTL_DESCR("Version of IEEE Std 1003.1 and its " "Threads option to which the system " "attempts to conform"), /* XXX _POSIX_VERSION */ NULL, _POSIX_THREADS, NULL, 0, CTL_KERN, KERN_POSIX_THREADS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "posix_semaphores", SYSCTL_DESCR("Version of IEEE Std 1003.1 and its " "Semaphores option to which the system " "attempts to 
conform"), NULL, 200112, NULL, 0, CTL_KERN, KERN_POSIX_SEMAPHORES, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "posix_barriers", SYSCTL_DESCR("Version of IEEE Std 1003.1 and its " "Barriers option to which the system " "attempts to conform"), /* XXX _POSIX_VERSION */ NULL, _POSIX_BARRIERS, NULL, 0, CTL_KERN, KERN_POSIX_BARRIERS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "posix_timers", SYSCTL_DESCR("Version of IEEE Std 1003.1 and its " "Timers option to which the system " "attempts to conform"), /* XXX _POSIX_VERSION */ NULL, _POSIX_TIMERS, NULL, 0, CTL_KERN, KERN_POSIX_TIMERS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "posix_spin_locks", SYSCTL_DESCR("Version of IEEE Std 1003.1 and its Spin " "Locks option to which the system attempts " "to conform"), /* XXX _POSIX_VERSION */ NULL, _POSIX_SPIN_LOCKS, NULL, 0, CTL_KERN, KERN_POSIX_SPIN_LOCKS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, CTLTYPE_INT, "posix_reader_writer_locks", SYSCTL_DESCR("Version of IEEE Std 1003.1 and its " "Read-Write Locks option to which the " "system attempts to conform"), /* XXX _POSIX_VERSION */ NULL, _POSIX_READER_WRITER_LOCKS, NULL, 0, CTL_KERN, KERN_POSIX_READER_WRITER_LOCKS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "dump_on_panic", SYSCTL_DESCR("Perform a crash dump on system panic"), NULL, 0, &dumponpanic, 0, CTL_KERN, KERN_DUMP_ON_PANIC, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "root_partition", SYSCTL_DESCR("Root partition on the root device"), sysctl_kern_root_partition, 0, NULL, 0, CTL_KERN, KERN_ROOT_PARTITION, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "drivers", SYSCTL_DESCR("List of all drivers with block and " "character device numbers"), sysctl_kern_drivers, 0, NULL, 0, CTL_KERN, KERN_DRIVERS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "cp_id", SYSCTL_DESCR("Mapping of CPU number to CPU id"), sysctl_kern_cpid, 0, NULL, 0, CTL_KERN, KERN_CP_ID, CTL_EOL); sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "coredump", SYSCTL_DESCR("Coredump settings."), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "setid", SYSCTL_DESCR("Set-id processes' coredump settings."), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "dump", SYSCTL_DESCR("Allow set-id processes to dump core."), sysctl_security_setidcore, 0, &security_setidcore_dump, sizeof(security_setidcore_dump), CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRING, "path", SYSCTL_DESCR("Path pattern for set-id coredumps."), sysctl_security_setidcorename, 0, security_setidcore_path, sizeof(security_setidcore_path), CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "owner", SYSCTL_DESCR("Owner id for set-id processes' cores."), sysctl_security_setidcore, 0, &security_setidcore_owner, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "group", SYSCTL_DESCR("Group id for set-id processes' cores."), sysctl_security_setidcore, 0, 
&security_setidcore_group, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "mode", SYSCTL_DESCR("Mode for set-id processes' cores."), sysctl_security_setidcore, 0, &security_setidcore_mode, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_IMMEDIATE|CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "no_sa_support", SYSCTL_DESCR("0 if the kernel supports SA, otherwise " "it doesn't"), NULL, 1, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "configname", SYSCTL_DESCR("Name of config file"), NULL, 0, __UNCONST(kernel_ident), 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "buildinfo", SYSCTL_DESCR("Information from build environment"), NULL, 0, __UNCONST(buildinfo), 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "messages", SYSCTL_DESCR("Kernel message verbosity"), sysctl_kern_messages, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); } SYSCTL_SETUP(sysctl_hw_misc_setup, "sysctl hw subtree misc setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "usermem", SYSCTL_DESCR("Bytes of non-kernel memory"), sysctl_hw_usermem, 0, NULL, 0, CTL_HW, HW_USERMEM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_HEX, CTLTYPE_STRING, "cnmagic", SYSCTL_DESCR("Console magic key sequence"), sysctl_hw_cnmagic, 0, NULL, CNS_LEN, CTL_HW, HW_CNMAGIC, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "usermem64", SYSCTL_DESCR("Bytes of non-kernel memory"), sysctl_hw_usermem, 0, NULL, 0, CTL_HW, HW_USERMEM64, CTL_EOL); } #ifdef DEBUG /* * Debugging related system variables. 
*/ struct ctldebug /* debug0, */ /* debug1, */ debug2, debug3, debug4; struct ctldebug debug5, debug6, debug7, debug8, debug9; struct ctldebug debug10, debug11, debug12, debug13, debug14; struct ctldebug debug15, debug16, debug17, debug18, debug19; static struct ctldebug *debugvars[] = { &debug0, &debug1, &debug2, &debug3, &debug4, &debug5, &debug6, &debug7, &debug8, &debug9, &debug10, &debug11, &debug12, &debug13, &debug14, &debug15, &debug16, &debug17, &debug18, &debug19, }; /* * this setup routine is a replacement for debug_sysctl() * * note that it creates several nodes per defined debug variable */ SYSCTL_SETUP(sysctl_debug_setup, "sysctl debug subtree setup") { struct ctldebug *cdp; char nodename[20]; int i; /* * two ways here: * * the "old" way (debug.name -> value) which was emulated by * the sysctl(8) binary * * the new way, which the sysctl(8) binary was actually using node debug node debug.0 string debug.0.name int debug.0.value int debug.name */ for (i = 0; i < __arraycount(debugvars); i++) { cdp = debugvars[i]; if (cdp->debugname == NULL || cdp->debugvar == NULL) continue; snprintf(nodename, sizeof(nodename), "debug%d", i); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_HIDDEN, CTLTYPE_NODE, nodename, NULL, NULL, 0, NULL, 0, CTL_DEBUG, i, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_HIDDEN, CTLTYPE_STRING, "name", NULL, /*XXXUNCONST*/ NULL, 0, __UNCONST(cdp->debugname), 0, CTL_DEBUG, i, CTL_DEBUG_NAME, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_HIDDEN, CTLTYPE_INT, "value", NULL, NULL, 0, cdp->debugvar, 0, CTL_DEBUG, i, CTL_DEBUG_VALUE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, cdp->debugname, NULL, NULL, 0, cdp->debugvar, 0, CTL_DEBUG, CTL_CREATE, CTL_EOL); } } #endif /* DEBUG */ /* * ******************************************************************** * section 2: private node-specific helper routines. * ******************************************************************** */ /* * sysctl helper routine for kern.maxvnodes. Drain vnodes if * new value is lower than desiredvnodes and then calls reinit * routines that needs to adjust to the new value. */ static int sysctl_kern_maxvnodes(SYSCTLFN_ARGS) { int error, new_vnodes, old_vnodes, new_max; struct sysctlnode node; new_vnodes = desiredvnodes; node = *rnode; node.sysctl_data = &new_vnodes; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); /* * sysctl passes down unsigned values, require them * to be positive */ if (new_vnodes <= 0) return (EINVAL); /* Limits: 75% of kmem and physical memory. */ new_max = calc_cache_size(vmem_size(kmem_arena, VMEM_FREE|VMEM_ALLOC), 75, 75) / VNODE_COST; if (new_vnodes > new_max) new_vnodes = new_max; old_vnodes = desiredvnodes; desiredvnodes = new_vnodes; error = vfs_drainvnodes(); if (error) { desiredvnodes = old_vnodes; return (error); } vfs_reinit(); return (0); } /* * sysctl helper routine for kern.messages. * Alters boothowto to display kernel messages in increasing verbosity * from 0 to 4. 
*/ #define MAXMESSAGES 4 static int sysctl_kern_messages(SYSCTLFN_ARGS) { int error, messageverbose, messagemask, newboothowto; struct sysctlnode node; messagemask = (AB_NORMAL|AB_QUIET|AB_SILENT|AB_VERBOSE|AB_DEBUG); switch (boothowto & messagemask) { case AB_SILENT: messageverbose = 0; break; case AB_QUIET: messageverbose = 1; break; case AB_VERBOSE: messageverbose = 3; break; case AB_DEBUG: messageverbose = 4; break; case AB_NORMAL: default: messageverbose = 2; } node = *rnode; node.sysctl_data = &messageverbose; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); if (messageverbose < 0 || messageverbose > MAXMESSAGES) return EINVAL; /* Set boothowto */ newboothowto = boothowto & ~messagemask; switch (messageverbose) { case 0: newboothowto |= AB_SILENT; break; case 1: newboothowto |= AB_QUIET; break; case 3: newboothowto |= AB_VERBOSE; break; case 4: newboothowto |= AB_DEBUG; break; case 2: default: /* Messages default to normal. */ break; } boothowto = newboothowto; return (0); } /* * sysctl helper routine for the kern.boottime node */ static int sysctl_kern_boottime(SYSCTLFN_ARGS) { struct sysctlnode node; struct timespec ts; getnanoboottime(&ts); node = *rnode; node.sysctl_data = &ts; return (sysctl_lookup(SYSCTLFN_CALL(&node))); } /* * sysctl helper routine for rtc_offset - set time after changes */ static int sysctl_kern_rtc_offset(SYSCTLFN_ARGS) { struct timespec ts, delta; int error, new_rtc_offset; struct sysctlnode node; new_rtc_offset = rtc_offset; node = *rnode; node.sysctl_data = &new_rtc_offset; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_RTCOFFSET, KAUTH_ARG(new_rtc_offset), NULL, NULL)) return (EPERM); if (rtc_offset == new_rtc_offset) return (0); /* if we change the offset, adjust the time */ nanotime(&ts); delta.tv_sec = 60 * (new_rtc_offset - rtc_offset); delta.tv_nsec = 0; timespecadd(&ts, &delta, &ts); rtc_offset = new_rtc_offset; return (settime(l->l_proc, &ts)); } /* * sysctl helper routine for kern.maxproc. Ensures that the new * values are not too low or too high. */ static int sysctl_kern_maxproc(SYSCTLFN_ARGS) { int error, nmaxproc; struct sysctlnode node; nmaxproc = maxproc; node = *rnode; node.sysctl_data = &nmaxproc; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); if (nmaxproc < 0 || nmaxproc >= PID_MAX) return (EINVAL); #ifdef __HAVE_CPU_MAXPROC if (nmaxproc > cpu_maxproc()) return (EINVAL); #endif error = 0; #ifdef __HAVE_MAXPROC_HOOK error = cpu_maxproc_hook(nmaxproc); #endif if (error) return error; maxproc = nmaxproc; return (0); } /* * sysctl helper function for kern.hostid. The hostid is a long, but * we export it as an int, so we need to give it a little help. */ static int sysctl_kern_hostid(SYSCTLFN_ARGS) { int error, inthostid; struct sysctlnode node; inthostid = hostid; /* XXX assumes sizeof int <= sizeof long */ node = *rnode; node.sysctl_data = &inthostid; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); hostid = (unsigned)inthostid; return (0); } /* * sysctl helper routine for kern.defcorename. In the case of a new * string being assigned, check that it's not a zero-length string. * (XXX the check in -current doesn't work, but do we really care?) 
*/ static int sysctl_kern_defcorename(SYSCTLFN_ARGS) { int error; char *newcorename; struct sysctlnode node; newcorename = PNBUF_GET(); node = *rnode; node.sysctl_data = &newcorename[0]; memcpy(node.sysctl_data, rnode->sysctl_data, MAXPATHLEN); error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) { goto done; } /* * when sysctl_lookup() deals with a string, it's guaranteed * to come back nul terminated. So there. :) */ if (strlen(newcorename) == 0) { error = EINVAL; } else { memcpy(rnode->sysctl_data, node.sysctl_data, MAXPATHLEN); error = 0; } done: PNBUF_PUT(newcorename); return error; } /* * sysctl helper routine for kern.cp_time node. Adds up cpu time * across all cpus. */ static int sysctl_kern_cptime(SYSCTLFN_ARGS) { struct sysctlnode node = *rnode; uint64_t *cp_time = NULL; int error, n = ncpu, i; struct cpu_info *ci; CPU_INFO_ITERATOR cii; /* * if you specifically pass a buffer that is the size of the * sum, or if you are probing for the size, you get the "sum" * of cp_time (and the size thereof) across all processors. * * alternately, you can pass an additional mib number and get * cp_time for that particular processor. */ switch (namelen) { case 0: if (*oldlenp == sizeof(uint64_t) * CPUSTATES || oldp == NULL) { node.sysctl_size = sizeof(uint64_t) * CPUSTATES; n = -1; /* SUM */ } else { node.sysctl_size = n * sizeof(uint64_t) * CPUSTATES; n = -2; /* ALL */ } break; case 1: if (name[0] < 0 || name[0] >= n) return (ENOENT); /* ENOSUCHPROCESSOR */ node.sysctl_size = sizeof(uint64_t) * CPUSTATES; n = name[0]; /* * adjust these so that sysctl_lookup() will be happy */ name++; namelen--; break; default: return (EINVAL); } cp_time = kmem_alloc(node.sysctl_size, KM_SLEEP); node.sysctl_data = cp_time; memset(cp_time, 0, node.sysctl_size); for (CPU_INFO_FOREACH(cii, ci)) { if (n <= 0) { for (i = 0; i < CPUSTATES; i++) { cp_time[i] += ci->ci_schedstate.spc_cp_time[i]; } } /* * if a specific processor was requested and we just * did it, we're done here */ if (n == 0) break; /* * if doing "all", skip to next cp_time set for next processor */ if (n == -2) cp_time += CPUSTATES; /* * if we're doing a specific processor, we're one * processor closer */ if (n > 0) n--; } error = sysctl_lookup(SYSCTLFN_CALL(&node)); kmem_free(node.sysctl_data, node.sysctl_size); return (error); } #if NPTY > 0 /* * sysctl helper routine for kern.maxptys. Ensures that any new value * is acceptable to the pty subsystem. */ static int sysctl_kern_maxptys(SYSCTLFN_ARGS) { int pty_maxptys(int, int); /* defined in kern/tty_pty.c */ int error, xmax; struct sysctlnode node; /* get current value of maxptys */ xmax = pty_maxptys(0, 0); node = *rnode; node.sysctl_data = &xmax; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); if (xmax != pty_maxptys(xmax, 1)) return (EINVAL); return (0); } #endif /* NPTY > 0 */ /* * sysctl helper routine to do kern.lwp.* work. */ static int sysctl_kern_lwp(SYSCTLFN_ARGS) { struct kinfo_lwp klwp; struct proc *p; struct lwp *l2, *l3; char *where, *dp; int pid, elem_size, elem_count; int buflen, needed, error; bool gotit; hash_value_ensure_initialized(); if (namelen == 1 && name[0] == CTL_QUERY) return (sysctl_query(SYSCTLFN_CALL(rnode))); dp = where = oldp; buflen = where != NULL ? 
*oldlenp : 0; error = needed = 0; if (newp != NULL || namelen != 3) return (EINVAL); pid = name[0]; elem_size = name[1]; elem_count = name[2]; sysctl_unlock(); if (pid == -1) { mutex_enter(&proc_lock); PROCLIST_FOREACH(p, &allproc) { /* Grab a hold on the process. */ if (!rw_tryenter(&p->p_reflock, RW_READER)) { continue; } mutex_exit(&proc_lock); mutex_enter(p->p_lock); LIST_FOREACH(l2, &p->p_lwps, l_sibling) { if (buflen >= elem_size && elem_count > 0) { lwp_lock(l2); fill_lwp(l2, &klwp); lwp_unlock(l2); mutex_exit(p->p_lock); /* * Copy out elem_size, but not * larger than the size of a * struct kinfo_proc2. */ error = dcopyout(l, &klwp, dp, uimin(sizeof(klwp), elem_size)); if (error) { rw_exit(&p->p_reflock); goto cleanup; } mutex_enter(p->p_lock); LIST_FOREACH(l3, &p->p_lwps, l_sibling) { if (l2 == l3) break; } if (l3 == NULL) { mutex_exit(p->p_lock); rw_exit(&p->p_reflock); error = EAGAIN; goto cleanup; } dp += elem_size; buflen -= elem_size; elem_count--; } needed += elem_size; } mutex_exit(p->p_lock); /* Drop reference to process. */ mutex_enter(&proc_lock); rw_exit(&p->p_reflock); } mutex_exit(&proc_lock); } else { mutex_enter(&proc_lock); p = proc_find(pid); if (p == NULL) { error = ESRCH; mutex_exit(&proc_lock); goto cleanup; } /* Grab a hold on the process. */ gotit = rw_tryenter(&p->p_reflock, RW_READER); mutex_exit(&proc_lock); if (!gotit) { error = ESRCH; goto cleanup; } mutex_enter(p->p_lock); LIST_FOREACH(l2, &p->p_lwps, l_sibling) { if (buflen >= elem_size && elem_count > 0) { lwp_lock(l2); fill_lwp(l2, &klwp); lwp_unlock(l2); mutex_exit(p->p_lock); /* * Copy out elem_size, but not larger than * the size of a struct kinfo_proc2. */ error = dcopyout(l, &klwp, dp, uimin(sizeof(klwp), elem_size)); if (error) { rw_exit(&p->p_reflock); goto cleanup; } mutex_enter(p->p_lock); LIST_FOREACH(l3, &p->p_lwps, l_sibling) { if (l2 == l3) break; } if (l3 == NULL) { mutex_exit(p->p_lock); rw_exit(&p->p_reflock); error = EAGAIN; goto cleanup; } dp += elem_size; buflen -= elem_size; elem_count--; } needed += elem_size; } mutex_exit(p->p_lock); /* Drop reference to process. */ rw_exit(&p->p_reflock); } if (where != NULL) { *oldlenp = dp - where; if (needed > *oldlenp) { sysctl_relock(); return (ENOMEM); } } else { needed += KERN_LWPSLOP; *oldlenp = needed; } error = 0; cleanup: sysctl_relock(); return (error); } /* * sysctl helper routine for kern.forkfsleep node. Ensures that the * given value is not too large or two small, and is at least one * timer tick if not zero. 
*/ static int sysctl_kern_forkfsleep(SYSCTLFN_ARGS) { /* userland sees value in ms, internally is in ticks */ extern int forkfsleep; /* defined in kern/kern_fork.c */ int error, timo, lsleep; struct sysctlnode node; lsleep = forkfsleep * 1000 / hz; node = *rnode; node.sysctl_data = &lsleep; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); /* refuse negative values, and overly 'long time' */ if (lsleep < 0 || lsleep > MAXSLP * 1000) return (EINVAL); timo = mstohz(lsleep); /* if the interval is >0 ms && <1 tick, use 1 tick */ if (lsleep != 0 && timo == 0) forkfsleep = 1; else forkfsleep = timo; return (0); } /* * sysctl helper routine for kern.root_partition */ static int sysctl_kern_root_partition(SYSCTLFN_ARGS) { int rootpart = DISKPART(rootdev); struct sysctlnode node = *rnode; node.sysctl_data = &rootpart; return (sysctl_lookup(SYSCTLFN_CALL(&node))); } /* * sysctl helper function for kern.drivers */ static int sysctl_kern_drivers(SYSCTLFN_ARGS) { int error; size_t buflen; struct kinfo_drivers kd; char *start, *where; const char *dname; int i; extern struct devsw_conv *devsw_conv; extern int max_devsw_convs; start = where = oldp; buflen = *oldlenp; if (where == NULL) { *oldlenp = max_devsw_convs * sizeof kd; return 0; } /* * An array of kinfo_drivers structures */ error = 0; sysctl_unlock(); mutex_enter(&device_lock); for (i = 0; i < max_devsw_convs; i++) { dname = devsw_conv[i].d_name; if (dname == NULL) continue; if (buflen < sizeof kd) { error = ENOMEM; break; } memset(&kd, 0, sizeof(kd)); kd.d_bmajor = devsw_conv[i].d_bmajor; kd.d_cmajor = devsw_conv[i].d_cmajor; strlcpy(kd.d_name, dname, sizeof kd.d_name); mutex_exit(&device_lock); error = dcopyout(l, &kd, where, sizeof kd); mutex_enter(&device_lock); if (error != 0) break; buflen -= sizeof kd; where += sizeof kd; } mutex_exit(&device_lock); sysctl_relock(); *oldlenp = where - start; return error; } static int sysctl_security_setidcore(SYSCTLFN_ARGS) { int newsize, error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = *(int *)rnode->sysctl_data; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SETIDCORE, 0, NULL, NULL, NULL)) return (EPERM); *(int *)rnode->sysctl_data = newsize; return 0; } static int sysctl_security_setidcorename(SYSCTLFN_ARGS) { int error; char *newsetidcorename; struct sysctlnode node; newsetidcorename = PNBUF_GET(); node = *rnode; node.sysctl_data = newsetidcorename; memcpy(node.sysctl_data, rnode->sysctl_data, MAXPATHLEN); error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) { goto out; } if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SETIDCORE, 0, NULL, NULL, NULL)) { error = EPERM; goto out; } if (strlen(newsetidcorename) == 0) { error = EINVAL; goto out; } memcpy(rnode->sysctl_data, node.sysctl_data, MAXPATHLEN); out: PNBUF_PUT(newsetidcorename); return error; } /* * sysctl helper routine for kern.cp_id node. Maps cpus to their * cpuids. */ static int sysctl_kern_cpid(SYSCTLFN_ARGS) { struct sysctlnode node = *rnode; uint64_t *cp_id = NULL; int error, n = ncpu; struct cpu_info *ci; CPU_INFO_ITERATOR cii; /* * Here you may either retrieve a single cpu id or the whole * set. The size you get back when probing depends on what * you ask for. 
*/ switch (namelen) { case 0: node.sysctl_size = n * sizeof(uint64_t); n = -2; /* ALL */ break; case 1: if (name[0] < 0 || name[0] >= n) return (ENOENT); /* ENOSUCHPROCESSOR */ node.sysctl_size = sizeof(uint64_t); n = name[0]; /* * adjust these so that sysctl_lookup() will be happy */ name++; namelen--; break; default: return (EINVAL); } cp_id = kmem_alloc(node.sysctl_size, KM_SLEEP); node.sysctl_data = cp_id; memset(cp_id, 0, node.sysctl_size); for (CPU_INFO_FOREACH(cii, ci)) { if (n <= 0) cp_id[0] = cpu_index(ci); /* * if a specific processor was requested and we just * did it, we're done here */ if (n == 0) break; /* * if doing "all", skip to next cp_id slot for next processor */ if (n == -2) cp_id++; /* * if we're doing a specific processor, we're one * processor closer */ if (n > 0) n--; } error = sysctl_lookup(SYSCTLFN_CALL(&node)); kmem_free(node.sysctl_data, node.sysctl_size); return (error); } /* * sysctl helper routine for hw.usermem and hw.usermem64. Values are * calculate on the fly taking into account integer overflow and the * current wired count. */ static int sysctl_hw_usermem(SYSCTLFN_ARGS) { u_int ui; u_quad_t uq; struct sysctlnode node; node = *rnode; switch (rnode->sysctl_num) { case HW_USERMEM: if ((ui = physmem - uvmexp.wired) > (UINT_MAX / PAGE_SIZE)) ui = UINT_MAX; else ui *= PAGE_SIZE; node.sysctl_data = &ui; break; case HW_USERMEM64: uq = (u_quad_t)(physmem - uvmexp.wired) * PAGE_SIZE; node.sysctl_data = &uq; break; default: return (EINVAL); } return (sysctl_lookup(SYSCTLFN_CALL(&node))); } /* * sysctl helper routine for kern.cnmagic node. Pulls the old value * out, encoded, and stuffs the new value in for decoding. */ static int sysctl_hw_cnmagic(SYSCTLFN_ARGS) { char magic[CNS_LEN]; int error; struct sysctlnode node; if (oldp) cn_get_magic(magic, CNS_LEN); node = *rnode; node.sysctl_data = &magic[0]; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); return (cn_set_magic(magic)); } /* * ******************************************************************** * section 3: public helper routines that are used for more than one * node * ******************************************************************** */ /* * sysctl helper routine for the kern.root_device node and some ports' * machdep.root_device nodes. */ int sysctl_root_device(SYSCTLFN_ARGS) { struct sysctlnode node; node = *rnode; node.sysctl_data = __UNCONST(device_xname(root_device)); node.sysctl_size = strlen(device_xname(root_device)) + 1; return (sysctl_lookup(SYSCTLFN_CALL(&node))); } /* * sysctl helper routine for kern.consdev, dependent on the current * state of the console. Also used for machdep.console_device on some * ports. */ int sysctl_consdev(SYSCTLFN_ARGS) { dev_t consdev; uint32_t oconsdev; struct sysctlnode node; if (cn_tab != NULL) consdev = cn_tab->cn_dev; else consdev = NODEV; node = *rnode; switch (*oldlenp) { case sizeof(consdev): node.sysctl_data = &consdev; node.sysctl_size = sizeof(consdev); break; case sizeof(oconsdev): oconsdev = (uint32_t)consdev; node.sysctl_data = &oconsdev; node.sysctl_size = sizeof(oconsdev); break; default: return EINVAL; } return (sysctl_lookup(SYSCTLFN_CALL(&node))); } /* * ******************************************************************** * section 4: support for some helpers * ******************************************************************** */ /* * Fill in a kinfo_lwp structure for the specified lwp. 
*/ static void fill_lwp(struct lwp *l, struct kinfo_lwp *kl) { const bool allowaddr = get_expose_address(curproc); struct proc *p = l->l_proc; struct timeval tv; KASSERT(lwp_locked(l, NULL)); memset(kl, 0, sizeof(*kl)); kl->l_forw = 0; kl->l_back = 0; COND_SET_VALUE(kl->l_laddr, PTRTOUINT64(l), allowaddr); COND_SET_VALUE(kl->l_addr, PTRTOUINT64(l->l_addr), allowaddr); kl->l_stat = l->l_stat; kl->l_lid = l->l_lid; kl->l_flag = L_INMEM; kl->l_flag |= sysctl_map_flags(sysctl_lwpprflagmap, l->l_prflag); kl->l_flag |= sysctl_map_flags(sysctl_lwpflagmap, l->l_flag); kl->l_swtime = l->l_swtime; kl->l_slptime = l->l_slptime; if (l->l_stat == LSONPROC) kl->l_schedflags = l->l_cpu->ci_schedstate.spc_flags; else kl->l_schedflags = 0; kl->l_priority = lwp_eprio(l); kl->l_usrpri = l->l_priority; if (l->l_wchan) strncpy(kl->l_wmesg, l->l_wmesg, sizeof(kl->l_wmesg)); COND_SET_VALUE(kl->l_wchan, PTRTOUINT64(l->l_wchan), allowaddr); kl->l_cpuid = cpu_index(l->l_cpu); bintime2timeval(&l->l_rtime, &tv); kl->l_rtime_sec = tv.tv_sec; kl->l_rtime_usec = tv.tv_usec; kl->l_cpticks = l->l_cpticks; kl->l_pctcpu = l->l_pctcpu; kl->l_pid = p->p_pid; if (l->l_name == NULL) kl->l_name[0] = '\0'; else strlcpy(kl->l_name, l->l_name, sizeof(kl->l_name)); }
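/*
 * Hypothetical userland sketch (not part of init_sysctl.c): reading the
 * per-CPU variant of kern.cp_time handled by sysctl_kern_cptime() above,
 * by appending one extra MIB component that selects the CPU index.
 * Assumes the standard NetBSD <sys/sysctl.h> and <sys/sched.h> names.
 */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/sched.h>

#include <err.h>
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t cp_time[CPUSTATES];
	size_t len = sizeof(cp_time);
	int mib[3] = { CTL_KERN, KERN_CP_TIME, 0 };	/* CPU index 0 */

	if (sysctl(mib, 3, cp_time, &len, NULL, 0) == -1)
		err(1, "sysctl kern.cp_time");
	printf("cpu0: user %" PRIu64 " nice %" PRIu64 " sys %" PRIu64
	    " intr %" PRIu64 " idle %" PRIu64 "\n",
	    cp_time[CP_USER], cp_time[CP_NICE], cp_time[CP_SYS],
	    cp_time[CP_INTR], cp_time[CP_IDLE]);
	return 0;
}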
/* $NetBSD: sys_pipe.c,v 1.167 2024/02/10 09:21:54 andvar Exp $ */ /*- * Copyright (c) 2003, 2007, 2008, 2009, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Paul Kranenburg, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1996 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used. It does not support all features of * sockets, but does do everything that pipes normally do. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.167 2024/02/10 09:21:54 andvar Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/fcntl.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/filio.h> #include <sys/kernel.h> #include <sys/ttycom.h> #include <sys/stat.h> #include <sys/poll.h> #include <sys/signalvar.h> #include <sys/vnode.h> #include <sys/uio.h> #include <sys/select.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/kauth.h> #include <sys/atomic.h> #include <sys/pipe.h> static int pipe_read(file_t *, off_t *, struct uio *, kauth_cred_t, int); static int pipe_write(file_t *, off_t *, struct uio *, kauth_cred_t, int); static int pipe_close(file_t *); static int pipe_poll(file_t *, int); static int pipe_kqfilter(file_t *, struct knote *); static int pipe_stat(file_t *, struct stat *); static int pipe_ioctl(file_t *, u_long, void *); static void pipe_restart(file_t *); static int pipe_fpathconf(file_t *, int, register_t *); static int pipe_posix_fadvise(file_t *, off_t, off_t, int); static const struct fileops pipeops = { .fo_name = "pipe", .fo_read = pipe_read, .fo_write = pipe_write, .fo_ioctl = pipe_ioctl, .fo_fcntl = fnullop_fcntl, .fo_poll = pipe_poll, .fo_stat = pipe_stat, .fo_close = pipe_close, .fo_kqfilter = pipe_kqfilter, .fo_restart = pipe_restart, .fo_fpathconf = pipe_fpathconf, .fo_posix_fadvise = pipe_posix_fadvise, }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE / 3) #define MAXPIPESIZE (2 * PIPE_SIZE / 3) /* * Limit the number of "big" pipes */ #define LIMITBIGPIPES 32 static u_int maxbigpipes __read_mostly = LIMITBIGPIPES; static u_int nbigpipe = 0; /* * Amount of KVA consumed by pipe buffers. */ static u_int amountpipekva = 0; static void pipeclose(struct pipe *); static void pipe_free_kmem(struct pipe *); static int pipe_create(struct pipe **, pool_cache_t, struct timespec *); static int pipelock(struct pipe *, bool); static inline void pipeunlock(struct pipe *); static void pipeselwakeup(struct pipe *, struct pipe *, int); static int pipespace(struct pipe *, int); static int pipe_ctor(void *, void *, int); static void pipe_dtor(void *, void *); static pool_cache_t pipe_wr_cache; static pool_cache_t pipe_rd_cache; void pipe_init(void) { /* Writer side is not automatically allocated KVA. */ pipe_wr_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "pipewr", NULL, IPL_NONE, pipe_ctor, pipe_dtor, NULL); KASSERT(pipe_wr_cache != NULL); /* Reader side gets preallocated KVA. 
*/ pipe_rd_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "piperd", NULL, IPL_NONE, pipe_ctor, pipe_dtor, (void *)1); KASSERT(pipe_rd_cache != NULL); } static int pipe_ctor(void *arg, void *obj, int flags) { struct pipe *pipe; vaddr_t va; pipe = obj; memset(pipe, 0, sizeof(struct pipe)); if (arg != NULL) { /* Preallocate space. */ va = uvm_km_alloc(kernel_map, PIPE_SIZE, 0, UVM_KMF_PAGEABLE | UVM_KMF_WAITVA); KASSERT(va != 0); pipe->pipe_kmem = va; atomic_add_int(&amountpipekva, PIPE_SIZE); } cv_init(&pipe->pipe_rcv, "pipe_rd"); cv_init(&pipe->pipe_wcv, "pipe_wr"); cv_init(&pipe->pipe_draincv, "pipe_drn"); cv_init(&pipe->pipe_lkcv, "pipe_lk"); selinit(&pipe->pipe_sel); pipe->pipe_state = PIPE_SIGNALR; return 0; } static void pipe_dtor(void *arg, void *obj) { struct pipe *pipe; pipe = obj; cv_destroy(&pipe->pipe_rcv); cv_destroy(&pipe->pipe_wcv); cv_destroy(&pipe->pipe_draincv); cv_destroy(&pipe->pipe_lkcv); seldestroy(&pipe->pipe_sel); if (pipe->pipe_kmem != 0) { uvm_km_free(kernel_map, pipe->pipe_kmem, PIPE_SIZE, UVM_KMF_PAGEABLE); atomic_add_int(&amountpipekva, -PIPE_SIZE); } } /* * The pipe system call for the DTYPE_PIPE type of pipes */ int pipe1(struct lwp *l, int *fildes, int flags) { struct pipe *rpipe, *wpipe; struct timespec nt; file_t *rf, *wf; int fd, error; proc_t *p; if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE)) return EINVAL; p = curproc; rpipe = wpipe = NULL; getnanotime(&nt); if ((error = pipe_create(&rpipe, pipe_rd_cache, &nt)) || (error = pipe_create(&wpipe, pipe_wr_cache, &nt))) { goto free2; } rpipe->pipe_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); wpipe->pipe_lock = rpipe->pipe_lock; mutex_obj_hold(wpipe->pipe_lock); error = fd_allocfile(&rf, &fd); if (error) goto free2; fildes[0] = fd; error = fd_allocfile(&wf, &fd); if (error) goto free3; fildes[1] = fd; rf->f_flag = FREAD | flags; rf->f_type = DTYPE_PIPE; rf->f_pipe = rpipe; rf->f_ops = &pipeops; fd_set_exclose(l, fildes[0], (flags & O_CLOEXEC) != 0); wf->f_flag = FWRITE | flags; wf->f_type = DTYPE_PIPE; wf->f_pipe = wpipe; wf->f_ops = &pipeops; fd_set_exclose(l, fildes[1], (flags & O_CLOEXEC) != 0); rpipe->pipe_peer = wpipe; wpipe->pipe_peer = rpipe; fd_affix(p, rf, fildes[0]); fd_affix(p, wf, fildes[1]); return (0); free3: fd_abort(p, rf, fildes[0]); free2: pipeclose(wpipe); pipeclose(rpipe); return (error); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. */ static int pipespace(struct pipe *pipe, int size) { void *buffer; /* * Allocate pageable virtual address space. Physical memory is * allocated on demand. */ if (size == PIPE_SIZE && pipe->pipe_kmem != 0) { buffer = (void *)pipe->pipe_kmem; } else { buffer = (void *)uvm_km_alloc(kernel_map, round_page(size), 0, UVM_KMF_PAGEABLE); if (buffer == NULL) return (ENOMEM); atomic_add_int(&amountpipekva, size); } /* free old resources if we're resizing */ pipe_free_kmem(pipe); pipe->pipe_buffer.buffer = buffer; pipe->pipe_buffer.size = size; pipe->pipe_buffer.in = 0; pipe->pipe_buffer.out = 0; pipe->pipe_buffer.cnt = 0; return (0); } /* * Initialize and allocate VM and memory for pipe. 
*/ static int pipe_create(struct pipe **pipep, pool_cache_t cache, struct timespec *nt) { struct pipe *pipe; int error; pipe = pool_cache_get(cache, PR_WAITOK); KASSERT(pipe != NULL); *pipep = pipe; error = 0; pipe->pipe_atime = pipe->pipe_mtime = pipe->pipe_btime = *nt; pipe->pipe_lock = NULL; if (cache == pipe_rd_cache) { error = pipespace(pipe, PIPE_SIZE); } else { pipe->pipe_buffer.buffer = NULL; pipe->pipe_buffer.size = 0; pipe->pipe_buffer.in = 0; pipe->pipe_buffer.out = 0; pipe->pipe_buffer.cnt = 0; } return error; } /* * Lock a pipe for I/O, blocking other access * Called with pipe spin lock held. */ static int pipelock(struct pipe *pipe, bool catch_p) { int error; KASSERT(mutex_owned(pipe->pipe_lock)); while (pipe->pipe_state & PIPE_LOCKFL) { if (catch_p) { error = cv_wait_sig(&pipe->pipe_lkcv, pipe->pipe_lock); if (error != 0) { return error; } } else cv_wait(&pipe->pipe_lkcv, pipe->pipe_lock); } pipe->pipe_state |= PIPE_LOCKFL; return 0; } /* * unlock a pipe I/O lock */ static inline void pipeunlock(struct pipe *pipe) { KASSERT(pipe->pipe_state & PIPE_LOCKFL); pipe->pipe_state &= ~PIPE_LOCKFL; cv_signal(&pipe->pipe_lkcv); } /* * Select/poll wakeup. This also sends SIGIO to peer connected to * 'sigpipe' side of pipe. */ static void pipeselwakeup(struct pipe *selp, struct pipe *sigp, int code) { int band; switch (code) { case POLL_IN: band = POLLIN|POLLRDNORM; break; case POLL_OUT: band = POLLOUT|POLLWRNORM; break; case POLL_HUP: band = POLLHUP; break; case POLL_ERR: band = POLLERR; break; default: band = 0; #ifdef DIAGNOSTIC printf("bad siginfo code %d in pipe notification.\n", code); #endif break; } selnotify(&selp->pipe_sel, band, NOTE_SUBMIT); if (sigp == NULL || (sigp->pipe_state & PIPE_ASYNC) == 0) return; fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp); } static int pipe_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, int flags) { struct pipe *rpipe = fp->f_pipe; struct pipebuf *bp = &rpipe->pipe_buffer; kmutex_t *lock = rpipe->pipe_lock; int error; size_t nread = 0; size_t size; size_t ocnt; unsigned int wakeup_state = 0; /* * Try to avoid locking the pipe if we have nothing to do. * * There are programs which share one pipe amongst multiple processes * and perform non-blocking reads in parallel, even if the pipe is * empty. This in particular is the case with BSD make, which when * spawned with a high -j number can find itself with over half of the * calls failing to find anything. */ if ((fp->f_flag & FNONBLOCK) != 0) { if (__predict_false(uio->uio_resid == 0)) return (0); if (atomic_load_relaxed(&bp->cnt) == 0 && (atomic_load_relaxed(&rpipe->pipe_state) & PIPE_EOF) == 0) return (EAGAIN); } mutex_enter(lock); ++rpipe->pipe_busy; ocnt = bp->cnt; again: error = pipelock(rpipe, true); if (error) goto unlocked_error; while (uio->uio_resid) { /* * Normal pipe buffer receive. */ if (bp->cnt > 0) { size = bp->size - bp->out; if (size > bp->cnt) size = bp->cnt; if (size > uio->uio_resid) size = uio->uio_resid; mutex_exit(lock); error = uiomove((char *)bp->buffer + bp->out, size, uio); mutex_enter(lock); if (error) break; bp->out += size; if (bp->out >= bp->size) bp->out = 0; bp->cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (bp->cnt == 0) { bp->in = 0; bp->out = 0; } nread += size; continue; } /* * Break if some data was read. */ if (nread > 0) break; /* * Detect EOF condition. * Read returns 0 on EOF, no need to set error. 
*/ if (rpipe->pipe_state & PIPE_EOF) break; /* * Don't block on non-blocking I/O. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; break; } /* * Unlock the pipe buffer for our remaining processing. * We will either break out with an error or we will * sleep and relock to loop. */ pipeunlock(rpipe); #if 1 /* XXX (dsl) I'm sure these aren't needed here ... */ /* * We want to read more, wake up select/poll. */ pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT); /* * If the "write-side" is blocked, wake it up now. */ cv_broadcast(&rpipe->pipe_wcv); #endif if (wakeup_state & PIPE_RESTART) { error = ERESTART; goto unlocked_error; } /* Now wait until the pipe is filled */ error = cv_wait_sig(&rpipe->pipe_rcv, lock); if (error != 0) goto unlocked_error; wakeup_state = rpipe->pipe_state; goto again; } if (error == 0) getnanotime(&rpipe->pipe_atime); pipeunlock(rpipe); unlocked_error: --rpipe->pipe_busy; if (rpipe->pipe_busy == 0) { rpipe->pipe_state &= ~PIPE_RESTART; cv_broadcast(&rpipe->pipe_draincv); } if (bp->cnt < MINPIPESIZE) { cv_broadcast(&rpipe->pipe_wcv); } /* * If anything was read off the buffer, signal to the writer it's * possible to write more data. Also send signal if we are here for the * first time after last write. */ if ((bp->size - bp->cnt) >= PIPE_BUF && (ocnt != bp->cnt || (rpipe->pipe_state & PIPE_SIGNALR))) { pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT); rpipe->pipe_state &= ~PIPE_SIGNALR; } mutex_exit(lock); return (error); } static int pipe_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, int flags) { struct pipe *wpipe, *rpipe; struct pipebuf *bp; kmutex_t *lock; int error; unsigned int wakeup_state = 0; /* We want to write to our peer */ rpipe = fp->f_pipe; lock = rpipe->pipe_lock; error = 0; mutex_enter(lock); wpipe = rpipe->pipe_peer; /* * Detect loss of pipe read side, issue SIGPIPE if lost. */ if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) != 0) { mutex_exit(lock); return EPIPE; } ++wpipe->pipe_busy; /* Acquire the long-term pipe lock */ if ((error = pipelock(wpipe, true)) != 0) { --wpipe->pipe_busy; if (wpipe->pipe_busy == 0) { wpipe->pipe_state &= ~PIPE_RESTART; cv_broadcast(&wpipe->pipe_draincv); } mutex_exit(lock); return (error); } bp = &wpipe->pipe_buffer; /* * If it is advantageous to resize the pipe buffer, do so. */ if ((uio->uio_resid > PIPE_SIZE) && (nbigpipe < maxbigpipes) && (bp->size <= PIPE_SIZE) && (bp->cnt == 0)) { if (pipespace(wpipe, BIG_PIPE_SIZE) == 0) atomic_inc_uint(&nbigpipe); } while (uio->uio_resid) { size_t space; space = bp->size - bp->cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF)) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. */ segsize = bp->size - bp->in; if (segsize > size) segsize = size; /* Transfer first segment */ mutex_exit(lock); error = uiomove((char *)bp->buffer + bp->in, segsize, uio); if (error == 0 && segsize < size) { /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. 
*/ KASSERT(bp->in + segsize == bp->size); error = uiomove(bp->buffer, size - segsize, uio); } mutex_enter(lock); if (error) break; bp->in += size; if (bp->in >= bp->size) { KASSERT(bp->in == size - segsize + bp->size); bp->in = size - segsize; } bp->cnt += size; KASSERT(bp->cnt <= bp->size); wakeup_state = 0; } else { /* * If the "read-side" has been blocked, wake it up now. */ cv_broadcast(&wpipe->pipe_rcv); /* * Don't block on non-blocking I/O. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; break; } /* * We have no more space and have something to offer, * wake up select/poll. */ if (bp->cnt) pipeselwakeup(wpipe, wpipe, POLL_IN); if (wakeup_state & PIPE_RESTART) { error = ERESTART; break; } /* * If read side wants to go away, we just issue a signal * to ourselves. */ if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } pipeunlock(wpipe); error = cv_wait_sig(&wpipe->pipe_wcv, lock); (void)pipelock(wpipe, false); if (error != 0) break; wakeup_state = wpipe->pipe_state; } } --wpipe->pipe_busy; if (wpipe->pipe_busy == 0) { wpipe->pipe_state &= ~PIPE_RESTART; cv_broadcast(&wpipe->pipe_draincv); } if (bp->cnt > 0) { cv_broadcast(&wpipe->pipe_rcv); } /* * Don't return EPIPE if I/O was successful */ if (error == EPIPE && bp->cnt == 0 && uio->uio_resid == 0) error = 0; if (error == 0) getnanotime(&wpipe->pipe_mtime); /* * We have something to offer, wake up select/poll. */ if (bp->cnt) pipeselwakeup(wpipe, wpipe, POLL_IN); /* * Arrange for next read(2) to do a signal. */ wpipe->pipe_state |= PIPE_SIGNALR; pipeunlock(wpipe); mutex_exit(lock); return (error); } /* * We implement a very minimal set of ioctls for compatibility with sockets. */ int pipe_ioctl(file_t *fp, u_long cmd, void *data) { struct pipe *pipe = fp->f_pipe; kmutex_t *lock = pipe->pipe_lock; switch (cmd) { case FIONBIO: return (0); case FIOASYNC: mutex_enter(lock); if (*(int *)data) { pipe->pipe_state |= PIPE_ASYNC; } else { pipe->pipe_state &= ~PIPE_ASYNC; } mutex_exit(lock); return (0); case FIONREAD: mutex_enter(lock); *(int *)data = pipe->pipe_buffer.cnt; mutex_exit(lock); return (0); case FIONWRITE: /* Look at other side */ mutex_enter(lock); pipe = pipe->pipe_peer; if (pipe == NULL) *(int *)data = 0; else *(int *)data = pipe->pipe_buffer.cnt; mutex_exit(lock); return (0); case FIONSPACE: /* Look at other side */ mutex_enter(lock); pipe = pipe->pipe_peer; if (pipe == NULL) *(int *)data = 0; else *(int *)data = pipe->pipe_buffer.size - pipe->pipe_buffer.cnt; mutex_exit(lock); return (0); case TIOCSPGRP: case FIOSETOWN: return fsetown(&pipe->pipe_pgid, cmd, data); case TIOCGPGRP: case FIOGETOWN: return fgetown(pipe->pipe_pgid, cmd, data); } return (EPASSTHROUGH); } int pipe_poll(file_t *fp, int events) { struct pipe *rpipe = fp->f_pipe; struct pipe *wpipe; int eof = 0; int revents = 0; mutex_enter(rpipe->pipe_lock); wpipe = rpipe->pipe_peer; if (events & (POLLIN | POLLRDNORM)) if ((rpipe->pipe_buffer.cnt > 0) || (rpipe->pipe_state & PIPE_EOF)) revents |= events & (POLLIN | POLLRDNORM); eof |= (rpipe->pipe_state & PIPE_EOF); if (wpipe == NULL) revents |= events & (POLLOUT | POLLWRNORM); else { if (events & (POLLOUT | POLLWRNORM)) if ((wpipe->pipe_state & PIPE_EOF) || ( (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) revents |= events & (POLLOUT | POLLWRNORM); eof |= (wpipe->pipe_state & PIPE_EOF); } if (wpipe == NULL || eof) revents |= POLLHUP; if (revents == 0) { if (events & (POLLIN | POLLRDNORM)) selrecord(curlwp, &rpipe->pipe_sel); if (events & (POLLOUT | POLLWRNORM)) selrecord(curlwp, 
&wpipe->pipe_sel); } mutex_exit(rpipe->pipe_lock); return (revents); } static int pipe_stat(file_t *fp, struct stat *ub) { struct pipe *pipe = fp->f_pipe; mutex_enter(pipe->pipe_lock); memset(ub, 0, sizeof(*ub)); ub->st_mode = S_IFIFO | S_IRUSR | S_IWUSR; ub->st_blksize = pipe->pipe_buffer.size; if (ub->st_blksize == 0 && pipe->pipe_peer) ub->st_blksize = pipe->pipe_peer->pipe_buffer.size; ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = (ub->st_size) ? 1 : 0; ub->st_atimespec = pipe->pipe_atime; ub->st_mtimespec = pipe->pipe_mtime; ub->st_ctimespec = ub->st_birthtimespec = pipe->pipe_btime; ub->st_uid = kauth_cred_geteuid(fp->f_cred); ub->st_gid = kauth_cred_getegid(fp->f_cred); /* * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen. * XXX (st_dev, st_ino) should be unique. */ mutex_exit(pipe->pipe_lock); return 0; } static int pipe_close(file_t *fp) { struct pipe *pipe = fp->f_pipe; fp->f_pipe = NULL; pipeclose(pipe); return (0); } static void pipe_restart(file_t *fp) { struct pipe *pipe = fp->f_pipe; /* * Unblock blocked reads/writes in order to allow close() to complete. * System calls return ERESTART so that the fd is revalidated. * (Partial writes return the transfer length.) */ mutex_enter(pipe->pipe_lock); pipe->pipe_state |= PIPE_RESTART; /* Wakeup both cvs, maybe we only need one, but maybe there are some * other paths where wakeup is needed, and it saves deciding which! */ cv_broadcast(&pipe->pipe_rcv); cv_broadcast(&pipe->pipe_wcv); mutex_exit(pipe->pipe_lock); } static int pipe_fpathconf(struct file *fp, int name, register_t *retval) { switch (name) { case _PC_PIPE_BUF: *retval = PIPE_BUF; return 0; default: return EINVAL; } } static int pipe_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice) { return ESPIPE; } static void pipe_free_kmem(struct pipe *pipe) { if (pipe->pipe_buffer.buffer != NULL) { if (pipe->pipe_buffer.size > PIPE_SIZE) { atomic_dec_uint(&nbigpipe); } if (pipe->pipe_buffer.buffer != (void *)pipe->pipe_kmem) { uvm_km_free(kernel_map, (vaddr_t)pipe->pipe_buffer.buffer, pipe->pipe_buffer.size, UVM_KMF_PAGEABLE); atomic_add_int(&amountpipekva, -pipe->pipe_buffer.size); } pipe->pipe_buffer.buffer = NULL; } } /* * Shutdown the pipe. */ static void pipeclose(struct pipe *pipe) { kmutex_t *lock; struct pipe *ppipe; if (pipe == NULL) return; KASSERT(cv_is_valid(&pipe->pipe_rcv)); KASSERT(cv_is_valid(&pipe->pipe_wcv)); KASSERT(cv_is_valid(&pipe->pipe_draincv)); KASSERT(cv_is_valid(&pipe->pipe_lkcv)); lock = pipe->pipe_lock; if (lock == NULL) /* Must have failed during create */ goto free_resources; mutex_enter(lock); pipeselwakeup(pipe, pipe, POLL_HUP); /* * If the other side is blocked, wake it up saying that * we want to close it down. */ pipe->pipe_state |= PIPE_EOF; if (pipe->pipe_busy) { while (pipe->pipe_busy) { cv_broadcast(&pipe->pipe_wcv); cv_wait_sig(&pipe->pipe_draincv, lock); } } /* * Disconnect from peer. */ if ((ppipe = pipe->pipe_peer) != NULL) { pipeselwakeup(ppipe, ppipe, POLL_HUP); ppipe->pipe_state |= PIPE_EOF; cv_broadcast(&ppipe->pipe_rcv); ppipe->pipe_peer = NULL; } /* * Any knote objects still left in the list are * the one attached by peer. Since no one will * traverse this list, we just clear it. * * XXX Exposes select/kqueue internals. */ SLIST_INIT(&pipe->pipe_sel.sel_klist); KASSERT((pipe->pipe_state & PIPE_LOCKFL) == 0); mutex_exit(lock); mutex_obj_free(lock); /* * Free resources. 
*/ free_resources: pipe->pipe_pgid = 0; pipe->pipe_state = PIPE_SIGNALR; pipe->pipe_peer = NULL; pipe->pipe_lock = NULL; pipe_free_kmem(pipe); if (pipe->pipe_kmem != 0) { pool_cache_put(pipe_rd_cache, pipe); } else { pool_cache_put(pipe_wr_cache, pipe); } } static void filt_pipedetach(struct knote *kn) { struct pipe *pipe; kmutex_t *lock; pipe = ((file_t *)kn->kn_obj)->f_pipe; lock = pipe->pipe_lock; mutex_enter(lock); switch(kn->kn_filter) { case EVFILT_WRITE: /* Need the peer structure, not our own. */ pipe = pipe->pipe_peer; /* If reader end already closed, just return. */ if (pipe == NULL) { mutex_exit(lock); return; } break; default: /* Nothing to do. */ break; } KASSERT(kn->kn_hook == pipe); selremove_knote(&pipe->pipe_sel, kn); mutex_exit(lock); } static int filt_piperead(struct knote *kn, long hint) { struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe; struct pipe *wpipe; int rv; if ((hint & NOTE_SUBMIT) == 0) { mutex_enter(rpipe->pipe_lock); } wpipe = rpipe->pipe_peer; kn->kn_data = rpipe->pipe_buffer.cnt; if ((rpipe->pipe_state & PIPE_EOF) || (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { knote_set_eof(kn, 0); rv = 1; } else { rv = kn->kn_data > 0; } if ((hint & NOTE_SUBMIT) == 0) { mutex_exit(rpipe->pipe_lock); } return rv; } static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe; struct pipe *wpipe; int rv; if ((hint & NOTE_SUBMIT) == 0) { mutex_enter(rpipe->pipe_lock); } wpipe = rpipe->pipe_peer; if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_data = 0; knote_set_eof(kn, 0); rv = 1; } else { kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; rv = kn->kn_data >= PIPE_BUF; } if ((hint & NOTE_SUBMIT) == 0) { mutex_exit(rpipe->pipe_lock); } return rv; } static const struct filterops pipe_rfiltops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_pipedetach, .f_event = filt_piperead, }; static const struct filterops pipe_wfiltops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_pipedetach, .f_event = filt_pipewrite, }; static int pipe_kqfilter(file_t *fp, struct knote *kn) { struct pipe *pipe; kmutex_t *lock; pipe = ((file_t *)kn->kn_obj)->f_pipe; lock = pipe->pipe_lock; mutex_enter(lock); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; pipe = pipe->pipe_peer; if (pipe == NULL) { /* Other end of pipe has been closed. */ mutex_exit(lock); return (EBADF); } break; default: mutex_exit(lock); return (EINVAL); } kn->kn_hook = pipe; selrecord_knote(&pipe->pipe_sel, kn); mutex_exit(lock); return (0); } /* * Handle pipe sysctls. 
*/ SYSCTL_SETUP(sysctl_kern_pipe_setup, "sysctl kern.pipe subtree setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "pipe", SYSCTL_DESCR("Pipe settings"), NULL, 0, NULL, 0, CTL_KERN, KERN_PIPE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxbigpipes", SYSCTL_DESCR("Maximum number of \"big\" pipes"), NULL, 0, &maxbigpipes, 0, CTL_KERN, KERN_PIPE, KERN_PIPE_MAXBIGPIPES, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "nbigpipes", SYSCTL_DESCR("Number of \"big\" pipes"), NULL, 0, &nbigpipe, 0, CTL_KERN, KERN_PIPE, KERN_PIPE_NBIGPIPES, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "kvasize", SYSCTL_DESCR("Amount of kernel memory consumed by pipe " "buffers"), NULL, 0, &amountpipekva, 0, CTL_KERN, KERN_PIPE, KERN_PIPE_KVASIZE, CTL_EOL); }
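/*
 * Illustrative sketch, not part of the original file: the nodes registered
 * above appear to userland as kern.pipe.maxbigpipes, kern.pipe.nbigpipes and
 * kern.pipe.kvasize.  A minimal reader using the standard sysctlbyname(3)
 * interface might look like this:
 *
 *	#include <sys/param.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int maxbig;
 *		size_t len = sizeof(maxbig);
 *
 *		if (sysctlbyname("kern.pipe.maxbigpipes", &maxbig, &len,
 *		    NULL, 0) == -1)
 *			return 1;
 *		printf("kern.pipe.maxbigpipes = %d\n", maxbig);
 *		return 0;
 *	}
 *
 * Writing maxbigpipes works the same way, with the new value supplied through
 * the last two arguments (the node is created CTLFLAG_READWRITE above).
 */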
/* $NetBSD: virtio.c,v 1.81 2024/02/10 02:25:15 isaki Exp $ */ /* * Copyright (c) 2020 The NetBSD Foundation, Inc.
* Copyright (c) 2012 Stefan Fritsch, Alexander Fiveg. * Copyright (c) 2010 Minoura Makoto. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: virtio.c,v 1.81 2024/02/10 02:25:15 isaki Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/atomic.h> #include <sys/bus.h> #include <sys/device.h> #include <sys/kmem.h> #include <sys/module.h> #define VIRTIO_PRIVATE #include <dev/pci/virtioreg.h> /* XXX: move to non-pci */ #include <dev/pci/virtiovar.h> /* XXX: move to non-pci */ #define MINSEG_INDIRECT 2 /* use indirect if nsegs >= this value */ /* * The maximum descriptor size is 2^15. Use that value as the end of * descriptor chain terminator since it will never be a valid index * in the descriptor table. */ #define VRING_DESC_CHAIN_END 32768 /* incomplete list */ static const char *virtio_device_name[] = { "unknown (0)", /* 0 */ "network", /* 1 */ "block", /* 2 */ "console", /* 3 */ "entropy", /* 4 */ "memory balloon", /* 5 */ "I/O memory", /* 6 */ "remote processor messaging", /* 7 */ "SCSI", /* 8 */ "9P transport", /* 9 */ }; #define NDEVNAMES __arraycount(virtio_device_name) static void virtio_reset_vq(struct virtio_softc *, struct virtqueue *); void virtio_set_status(struct virtio_softc *sc, int status) { sc->sc_ops->set_status(sc, status); } /* * Reset the device. */ /* * To reset the device to a known state, do following: * virtio_reset(sc); // this will stop the device activity * <dequeue finished requests>; // virtio_dequeue() still can be called * <revoke pending requests in the vqs if any>; * virtio_reinit_start(sc); // dequeue prohibited * newfeatures = virtio_negotiate_features(sc, requestedfeatures); * <some other initialization>; * virtio_reinit_end(sc); // device activated; enqueue allowed * Once attached, feature negotiation can only be allowed after virtio_reset. 
*/ void virtio_reset(struct virtio_softc *sc) { virtio_device_reset(sc); } int virtio_reinit_start(struct virtio_softc *sc) { int i, r; virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_ACK); virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER); for (i = 0; i < sc->sc_nvqs; i++) { int n; struct virtqueue *vq = &sc->sc_vqs[i]; n = sc->sc_ops->read_queue_size(sc, vq->vq_index); if (n == 0) /* vq disappeared */ continue; if (n != vq->vq_num) { panic("%s: virtqueue size changed, vq index %d\n", device_xname(sc->sc_dev), vq->vq_index); } virtio_reset_vq(sc, vq); sc->sc_ops->setup_queue(sc, vq->vq_index, vq->vq_dmamap->dm_segs[0].ds_addr); } r = sc->sc_ops->setup_interrupts(sc, 1); if (r != 0) goto fail; return 0; fail: virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_FAILED); return 1; } void virtio_reinit_end(struct virtio_softc *sc) { virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK); } /* * Feature negotiation. */ void virtio_negotiate_features(struct virtio_softc *sc, uint64_t guest_features) { if (!(device_cfdata(sc->sc_dev)->cf_flags & 1) && !(device_cfdata(sc->sc_child)->cf_flags & 1)) /* XXX */ guest_features |= VIRTIO_F_RING_INDIRECT_DESC; sc->sc_ops->neg_features(sc, guest_features); if (sc->sc_active_features & VIRTIO_F_RING_INDIRECT_DESC) sc->sc_indirect = true; else sc->sc_indirect = false; } /* * Device configuration registers readers/writers */ #if 0 #define DPRINTFR(n, fmt, val, index, num) \ printf("\n%s (", n); \ for (int i = 0; i < num; i++) \ printf("%02x ", bus_space_read_1(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, index+i)); \ printf(") -> "); printf(fmt, val); printf("\n"); #define DPRINTFR2(n, fmt, val_s, val_n) \ printf("%s ", n); \ printf("\n stream "); printf(fmt, val_s); printf(" norm "); printf(fmt, val_n); printf("\n"); #else #define DPRINTFR(n, fmt, val, index, num) #define DPRINTFR2(n, fmt, val_s, val_n) #endif uint8_t virtio_read_device_config_1(struct virtio_softc *sc, int index) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; uint8_t val; val = bus_space_read_1(iot, ioh, index); DPRINTFR("read_1", "%02x", val, index, 1); return val; } uint16_t virtio_read_device_config_2(struct virtio_softc *sc, int index) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; uint16_t val; val = bus_space_read_2(iot, ioh, index); if (BYTE_ORDER != sc->sc_bus_endian) val = bswap16(val); DPRINTFR("read_2", "%04x", val, index, 2); DPRINTFR2("read_2", "%04x", bus_space_read_stream_2(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, index), bus_space_read_2(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, index)); return val; } uint32_t virtio_read_device_config_4(struct virtio_softc *sc, int index) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; uint32_t val; val = bus_space_read_4(iot, ioh, index); if (BYTE_ORDER != sc->sc_bus_endian) val = bswap32(val); DPRINTFR("read_4", "%08x", val, index, 4); DPRINTFR2("read_4", "%08x", bus_space_read_stream_4(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, index), bus_space_read_4(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, index)); return val; } /* * The Virtio spec explicitly tells that reading and writing 8 bytes are not * considered atomic and no triggers may be connected to reading or writing * it. We access it using two 32 reads. See virtio spec 4.1.3.1. 
*/ uint64_t virtio_read_device_config_8(struct virtio_softc *sc, int index) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; union { uint64_t u64; uint32_t l[2]; } v; uint64_t val; v.l[0] = bus_space_read_4(iot, ioh, index); v.l[1] = bus_space_read_4(iot, ioh, index + 4); if (sc->sc_bus_endian != sc->sc_struct_endian) { v.l[0] = bswap32(v.l[0]); v.l[1] = bswap32(v.l[1]); } val = v.u64; if (BYTE_ORDER != sc->sc_struct_endian) val = bswap64(val); DPRINTFR("read_8", "%08"PRIx64, val, index, 8); DPRINTFR2("read_8 low ", "%08x", bus_space_read_stream_4(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, index), bus_space_read_4(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, index)); DPRINTFR2("read_8 high ", "%08x", bus_space_read_stream_4(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, index + 4), bus_space_read_4(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, index + 4)); return val; } /* * In the older virtio spec, device config registers are host endian. On newer * they are little endian. Some newer devices however explicitly specify their * register to always be little endian. These functions cater for these. */ uint16_t virtio_read_device_config_le_2(struct virtio_softc *sc, int index) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; uint16_t val; val = bus_space_read_2(iot, ioh, index); #if !defined(__aarch64__) && !defined(__arm__) /* * For big-endian aarch64/armv7, bus endian is always LSB, but * byte-order is automatically swapped by bus_space(9) (see also * comments in virtio_pci.c). Therefore, no need to swap here. */ if (sc->sc_bus_endian != LITTLE_ENDIAN) val = bswap16(val); #endif DPRINTFR("read_le_2", "%04x", val, index, 2); DPRINTFR2("read_le_2", "%04x", bus_space_read_stream_2(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, 0), bus_space_read_2(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, 0)); return val; } uint32_t virtio_read_device_config_le_4(struct virtio_softc *sc, int index) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; uint32_t val; val = bus_space_read_4(iot, ioh, index); #if !defined(__aarch64__) && !defined(__arm__) /* See virtio_read_device_config_le_2() above. */ if (sc->sc_bus_endian != LITTLE_ENDIAN) val = bswap32(val); #endif DPRINTFR("read_le_4", "%08x", val, index, 4); DPRINTFR2("read_le_4", "%08x", bus_space_read_stream_4(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, 0), bus_space_read_4(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, 0)); return val; } void virtio_write_device_config_1(struct virtio_softc *sc, int index, uint8_t value) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; bus_space_write_1(iot, ioh, index, value); } void virtio_write_device_config_2(struct virtio_softc *sc, int index, uint16_t value) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; if (BYTE_ORDER != sc->sc_bus_endian) value = bswap16(value); bus_space_write_2(iot, ioh, index, value); } void virtio_write_device_config_4(struct virtio_softc *sc, int index, uint32_t value) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; if (BYTE_ORDER != sc->sc_bus_endian) value = bswap32(value); bus_space_write_4(iot, ioh, index, value); } /* * The Virtio spec explicitly tells that reading and writing 8 bytes are not * considered atomic and no triggers may be connected to reading or writing * it. We access it using two 32 bit writes. For good measure it is stated to * always write lsb first just in case of a hypervisor bug. 
See virtio * spec 4.1.3.1. */ void virtio_write_device_config_8(struct virtio_softc *sc, int index, uint64_t value) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; union { uint64_t u64; uint32_t l[2]; } v; if (BYTE_ORDER != sc->sc_struct_endian) value = bswap64(value); v.u64 = value; if (sc->sc_bus_endian != sc->sc_struct_endian) { v.l[0] = bswap32(v.l[0]); v.l[1] = bswap32(v.l[1]); } if (sc->sc_struct_endian == LITTLE_ENDIAN) { bus_space_write_4(iot, ioh, index, v.l[0]); bus_space_write_4(iot, ioh, index + 4, v.l[1]); } else { bus_space_write_4(iot, ioh, index + 4, v.l[1]); bus_space_write_4(iot, ioh, index, v.l[0]); } } /* * In the older virtio spec, device config registers are host endian. On newer * they are little endian. Some newer devices however explicitly specify their * register to always be little endian. These functions cater for these. */ void virtio_write_device_config_le_2(struct virtio_softc *sc, int index, uint16_t value) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; if (sc->sc_bus_endian != LITTLE_ENDIAN) value = bswap16(value); bus_space_write_2(iot, ioh, index, value); } void virtio_write_device_config_le_4(struct virtio_softc *sc, int index, uint32_t value) { bus_space_tag_t iot = sc->sc_devcfg_iot; bus_space_handle_t ioh = sc->sc_devcfg_ioh; if (sc->sc_bus_endian != LITTLE_ENDIAN) value = bswap32(value); bus_space_write_4(iot, ioh, index, value); } /* * data structures endian helpers */ uint16_t virtio_rw16(struct virtio_softc *sc, uint16_t val) { KASSERT(sc); return BYTE_ORDER != sc->sc_struct_endian ? bswap16(val) : val; } uint32_t virtio_rw32(struct virtio_softc *sc, uint32_t val) { KASSERT(sc); return BYTE_ORDER != sc->sc_struct_endian ? bswap32(val) : val; } uint64_t virtio_rw64(struct virtio_softc *sc, uint64_t val) { KASSERT(sc); return BYTE_ORDER != sc->sc_struct_endian ? bswap64(val) : val; } /* * Interrupt handler. */ static void virtio_soft_intr(void *arg) { struct virtio_softc *sc = arg; KASSERT(sc->sc_intrhand != NULL); (*sc->sc_intrhand)(sc); } /* set to vq->vq_intrhand in virtio_init_vq_vqdone() */ static int virtio_vq_done(void *xvq) { struct virtqueue *vq = xvq; return vq->vq_done(vq); } static int virtio_vq_intr(struct virtio_softc *sc) { struct virtqueue *vq; int i, r = 0; for (i = 0; i < sc->sc_nvqs; i++) { vq = &sc->sc_vqs[i]; if (virtio_vq_is_enqueued(sc, vq) == 1) { r |= (*vq->vq_intrhand)(vq->vq_intrhand_arg); } } return r; } /* * dmamap sync operations for a virtqueue.
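 * These helpers express the individual ring regions as bus_dmamap_sync(9)
 * ranges within the single vq_dmamap that backs the whole virtqueue: the
 * descriptor table, the avail ring (header, payload and, with RingEventIdx,
 * the trailing used_event word), the used ring (header, payload and trailing
 * avail_event word) and the per-slot indirect descriptor tables.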
*/ static inline void vq_sync_descs(struct virtio_softc *sc, struct virtqueue *vq, int ops) { /* availoffset == sizeof(vring_desc) * vq_num */ bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, 0, vq->vq_availoffset, ops); } static inline void vq_sync_aring_all(struct virtio_softc *sc, struct virtqueue *vq, int ops) { uint16_t hdrlen = offsetof(struct vring_avail, ring); size_t payloadlen = vq->vq_num * sizeof(uint16_t); size_t usedlen = 0; if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) usedlen = sizeof(uint16_t); bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, vq->vq_availoffset, hdrlen + payloadlen + usedlen, ops); } static inline void vq_sync_aring_header(struct virtio_softc *sc, struct virtqueue *vq, int ops) { uint16_t hdrlen = offsetof(struct vring_avail, ring); bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, vq->vq_availoffset, hdrlen, ops); } static inline void vq_sync_aring_payload(struct virtio_softc *sc, struct virtqueue *vq, int ops) { uint16_t hdrlen = offsetof(struct vring_avail, ring); size_t payloadlen = vq->vq_num * sizeof(uint16_t); bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, vq->vq_availoffset + hdrlen, payloadlen, ops); } static inline void vq_sync_aring_used(struct virtio_softc *sc, struct virtqueue *vq, int ops) { uint16_t hdrlen = offsetof(struct vring_avail, ring); size_t payloadlen = vq->vq_num * sizeof(uint16_t); size_t usedlen = sizeof(uint16_t); if ((sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) == 0) return; bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, vq->vq_availoffset + hdrlen + payloadlen, usedlen, ops); } static inline void vq_sync_uring_all(struct virtio_softc *sc, struct virtqueue *vq, int ops) { uint16_t hdrlen = offsetof(struct vring_used, ring); size_t payloadlen = vq->vq_num * sizeof(struct vring_used_elem); size_t availlen = 0; if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) availlen = sizeof(uint16_t); bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, vq->vq_usedoffset, hdrlen + payloadlen + availlen, ops); } static inline void vq_sync_uring_header(struct virtio_softc *sc, struct virtqueue *vq, int ops) { uint16_t hdrlen = offsetof(struct vring_used, ring); bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, vq->vq_usedoffset, hdrlen, ops); } static inline void vq_sync_uring_payload(struct virtio_softc *sc, struct virtqueue *vq, int ops) { uint16_t hdrlen = offsetof(struct vring_used, ring); size_t payloadlen = vq->vq_num * sizeof(struct vring_used_elem); bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, vq->vq_usedoffset + hdrlen, payloadlen, ops); } static inline void vq_sync_uring_avail(struct virtio_softc *sc, struct virtqueue *vq, int ops) { uint16_t hdrlen = offsetof(struct vring_used, ring); size_t payloadlen = vq->vq_num * sizeof(struct vring_used_elem); size_t availlen = sizeof(uint16_t); if ((sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) == 0) return; bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, vq->vq_usedoffset + hdrlen + payloadlen, availlen, ops); } static inline void vq_sync_indirect(struct virtio_softc *sc, struct virtqueue *vq, int slot, int ops) { int offset = vq->vq_indirectoffset + sizeof(struct vring_desc) * vq->vq_maxnsegs * slot; bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, offset, sizeof(struct vring_desc) * vq->vq_maxnsegs, ops); } bool virtio_vq_is_enqueued(struct virtio_softc *sc, struct virtqueue *vq) { if (vq->vq_queued) { vq->vq_queued = 0; vq_sync_aring_all(sc, vq, BUS_DMASYNC_POSTWRITE); } vq_sync_uring_header(sc, vq, BUS_DMASYNC_POSTREAD); if (vq->vq_used_idx == virtio_rw16(sc, vq->vq_used->idx)) return 0; vq_sync_uring_payload(sc, 
vq, BUS_DMASYNC_POSTREAD); return 1; } /* * Increase the event index in order to delay interrupts. */ int virtio_postpone_intr(struct virtio_softc *sc, struct virtqueue *vq, uint16_t nslots) { uint16_t idx, nused; idx = vq->vq_used_idx + nslots; /* set the new event index: avail_ring->used_event = idx */ *vq->vq_used_event = virtio_rw16(sc, idx); vq_sync_aring_used(vq->vq_owner, vq, BUS_DMASYNC_PREWRITE); vq->vq_queued++; nused = (uint16_t) (virtio_rw16(sc, vq->vq_used->idx) - vq->vq_used_idx); KASSERT(nused <= vq->vq_num); return nslots < nused; } /* * Postpone interrupt until 3/4 of the available descriptors have been * consumed. */ int virtio_postpone_intr_smart(struct virtio_softc *sc, struct virtqueue *vq) { uint16_t nslots; nslots = (uint16_t) (virtio_rw16(sc, vq->vq_avail->idx) - vq->vq_used_idx) * 3 / 4; return virtio_postpone_intr(sc, vq, nslots); } /* * Postpone interrupt until all of the available descriptors have been * consumed. */ int virtio_postpone_intr_far(struct virtio_softc *sc, struct virtqueue *vq) { uint16_t nslots; nslots = (uint16_t) (virtio_rw16(sc, vq->vq_avail->idx) - vq->vq_used_idx); return virtio_postpone_intr(sc, vq, nslots); } /* * Start/stop vq interrupt. No guarantee. */ void virtio_stop_vq_intr(struct virtio_softc *sc, struct virtqueue *vq) { if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) { /* * No way to disable the interrupt completely with * RingEventIdx. Instead advance used_event by half the * possible value. This won't happen soon and is far enough in * the past to not trigger a spurious interrupt. */ *vq->vq_used_event = virtio_rw16(sc, vq->vq_used_idx + 0x8000); vq_sync_aring_used(sc, vq, BUS_DMASYNC_PREWRITE); } else { vq->vq_avail->flags |= virtio_rw16(sc, VRING_AVAIL_F_NO_INTERRUPT); vq_sync_aring_header(sc, vq, BUS_DMASYNC_PREWRITE); } vq->vq_queued++; } int virtio_start_vq_intr(struct virtio_softc *sc, struct virtqueue *vq) { if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) { /* * If event index feature is negotiated, enabling interrupts * is done through setting the latest consumed index in the * used_event field */ *vq->vq_used_event = virtio_rw16(sc, vq->vq_used_idx); vq_sync_aring_used(sc, vq, BUS_DMASYNC_PREWRITE); } else { vq->vq_avail->flags &= ~virtio_rw16(sc, VRING_AVAIL_F_NO_INTERRUPT); vq_sync_aring_header(sc, vq, BUS_DMASYNC_PREWRITE); } vq->vq_queued++; vq_sync_uring_header(sc, vq, BUS_DMASYNC_POSTREAD); if (vq->vq_used_idx == virtio_rw16(sc, vq->vq_used->idx)) return 0; vq_sync_uring_payload(sc, vq, BUS_DMASYNC_POSTREAD); return 1; } /* * Initialize vq structure. 
*/ /* * Reset virtqueue parameters */ static void virtio_reset_vq(struct virtio_softc *sc, struct virtqueue *vq) { struct vring_desc *vds; int i, j; int vq_size = vq->vq_num; memset(vq->vq_vaddr, 0, vq->vq_bytesize); /* build the descriptor chain for free slot management */ vds = vq->vq_desc; for (i = 0; i < vq_size - 1; i++) { vds[i].next = virtio_rw16(sc, i + 1); } vds[i].next = virtio_rw16(sc, VRING_DESC_CHAIN_END); vq->vq_free_idx = 0; /* build the indirect descriptor chain */ if (vq->vq_indirect != NULL) { struct vring_desc *vd; for (i = 0; i < vq_size; i++) { vd = vq->vq_indirect; vd += vq->vq_maxnsegs * i; for (j = 0; j < vq->vq_maxnsegs - 1; j++) { vd[j].next = virtio_rw16(sc, j + 1); } } } /* enqueue/dequeue status */ vq->vq_avail_idx = 0; vq->vq_used_idx = 0; vq->vq_queued = 0; vq_sync_uring_all(sc, vq, BUS_DMASYNC_PREREAD); vq->vq_queued++; } /* Initialize vq */ void virtio_init_vq_vqdone(struct virtio_softc *sc, struct virtqueue *vq, int index, int (*vq_done)(struct virtqueue *)) { virtio_init_vq(sc, vq, index, virtio_vq_done, vq); vq->vq_done = vq_done; } void virtio_init_vq(struct virtio_softc *sc, struct virtqueue *vq, int index, int (*func)(void *), void *arg) { memset(vq, 0, sizeof(*vq)); vq->vq_owner = sc; vq->vq_num = sc->sc_ops->read_queue_size(sc, index); vq->vq_index = index; vq->vq_intrhand = func; vq->vq_intrhand_arg = arg; } /* * Allocate/free a vq. */ int virtio_alloc_vq(struct virtio_softc *sc, struct virtqueue *vq, int maxsegsize, int maxnsegs, const char *name) { bus_size_t size_desc, size_avail, size_used, size_indirect; bus_size_t allocsize = 0, size_desc_avail; int rsegs, r, hdrlen; unsigned int vq_num; #define VIRTQUEUE_ALIGN(n) roundup(n, VIRTIO_PAGE_SIZE) vq_num = vq->vq_num; if (vq_num == 0) { aprint_error_dev(sc->sc_dev, "virtqueue not exist, index %d for %s\n", vq->vq_index, name); goto err; } hdrlen = sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX ? 3 : 2; size_desc = sizeof(vq->vq_desc[0]) * vq_num; size_avail = sizeof(uint16_t) * hdrlen + sizeof(vq->vq_avail[0].ring[0]) * vq_num; size_used = sizeof(uint16_t) *hdrlen + sizeof(vq->vq_used[0].ring[0]) * vq_num; size_indirect = (sc->sc_indirect && maxnsegs >= MINSEG_INDIRECT) ? 
sizeof(struct vring_desc) * maxnsegs * vq_num : 0; size_desc_avail = VIRTQUEUE_ALIGN(size_desc + size_avail); size_used = VIRTQUEUE_ALIGN(size_used); allocsize = size_desc_avail + size_used + size_indirect; /* alloc and map the memory */ r = bus_dmamem_alloc(sc->sc_dmat, allocsize, VIRTIO_PAGE_SIZE, 0, &vq->vq_segs[0], 1, &rsegs, BUS_DMA_WAITOK); if (r != 0) { aprint_error_dev(sc->sc_dev, "virtqueue %d for %s allocation failed, " "error code %d\n", vq->vq_index, name, r); goto err; } r = bus_dmamem_map(sc->sc_dmat, &vq->vq_segs[0], rsegs, allocsize, &vq->vq_vaddr, BUS_DMA_WAITOK); if (r != 0) { aprint_error_dev(sc->sc_dev, "virtqueue %d for %s map failed, " "error code %d\n", vq->vq_index, name, r); goto err; } r = bus_dmamap_create(sc->sc_dmat, allocsize, 1, allocsize, 0, BUS_DMA_WAITOK, &vq->vq_dmamap); if (r != 0) { aprint_error_dev(sc->sc_dev, "virtqueue %d for %s dmamap creation failed, " "error code %d\n", vq->vq_index, name, r); goto err; } r = bus_dmamap_load(sc->sc_dmat, vq->vq_dmamap, vq->vq_vaddr, allocsize, NULL, BUS_DMA_WAITOK); if (r != 0) { aprint_error_dev(sc->sc_dev, "virtqueue %d for %s dmamap load failed, " "error code %d\n", vq->vq_index, name, r); goto err; } vq->vq_bytesize = allocsize; vq->vq_maxsegsize = maxsegsize; vq->vq_maxnsegs = maxnsegs; #define VIRTIO_PTR(base, offset) (void *)((intptr_t)(base) + (offset)) /* initialize vring pointers */ vq->vq_desc = VIRTIO_PTR(vq->vq_vaddr, 0); vq->vq_availoffset = size_desc; vq->vq_avail = VIRTIO_PTR(vq->vq_vaddr, vq->vq_availoffset); vq->vq_used_event = VIRTIO_PTR(vq->vq_avail, offsetof(struct vring_avail, ring[vq_num])); vq->vq_usedoffset = size_desc_avail; vq->vq_used = VIRTIO_PTR(vq->vq_vaddr, vq->vq_usedoffset); vq->vq_avail_event = VIRTIO_PTR(vq->vq_used, offsetof(struct vring_used, ring[vq_num])); if (size_indirect > 0) { vq->vq_indirectoffset = size_desc_avail + size_used; vq->vq_indirect = VIRTIO_PTR(vq->vq_vaddr, vq->vq_indirectoffset); } #undef VIRTIO_PTR vq->vq_descx = kmem_zalloc(sizeof(vq->vq_descx[0]) * vq_num, KM_SLEEP); mutex_init(&vq->vq_freedesc_lock, MUTEX_SPIN, sc->sc_ipl); mutex_init(&vq->vq_aring_lock, MUTEX_SPIN, sc->sc_ipl); mutex_init(&vq->vq_uring_lock, MUTEX_SPIN, sc->sc_ipl); virtio_reset_vq(sc, vq); aprint_verbose_dev(sc->sc_dev, "allocated %" PRIuBUSSIZE " byte for virtqueue %d for %s, " "size %d\n", allocsize, vq->vq_index, name, vq_num); if (size_indirect > 0) aprint_verbose_dev(sc->sc_dev, "using %" PRIuBUSSIZE " byte (%d entries) indirect " "descriptors\n", size_indirect, maxnsegs * vq_num); return 0; err: sc->sc_ops->setup_queue(sc, vq->vq_index, 0); if (vq->vq_dmamap) bus_dmamap_destroy(sc->sc_dmat, vq->vq_dmamap); if (vq->vq_vaddr) bus_dmamem_unmap(sc->sc_dmat, vq->vq_vaddr, allocsize); if (vq->vq_segs[0].ds_addr) bus_dmamem_free(sc->sc_dmat, &vq->vq_segs[0], 1); memset(vq, 0, sizeof(*vq)); return -1; } int virtio_free_vq(struct virtio_softc *sc, struct virtqueue *vq) { uint16_t s; size_t i; if (vq->vq_vaddr == NULL) return 0; /* device must be already deactivated */ /* confirm the vq is empty */ s = vq->vq_free_idx; i = 0; while (s != virtio_rw16(sc, VRING_DESC_CHAIN_END)) { s = vq->vq_desc[s].next; i++; } if (i != vq->vq_num) { printf("%s: freeing non-empty vq, index %d\n", device_xname(sc->sc_dev), vq->vq_index); return EBUSY; } /* tell device that there's no virtqueue any longer */ sc->sc_ops->setup_queue(sc, vq->vq_index, 0); vq_sync_aring_all(sc, vq, BUS_DMASYNC_POSTWRITE); kmem_free(vq->vq_descx, sizeof(vq->vq_descx[0]) * vq->vq_num); bus_dmamap_unload(sc->sc_dmat, 
vq->vq_dmamap); bus_dmamap_destroy(sc->sc_dmat, vq->vq_dmamap); bus_dmamem_unmap(sc->sc_dmat, vq->vq_vaddr, vq->vq_bytesize); bus_dmamem_free(sc->sc_dmat, &vq->vq_segs[0], 1); mutex_destroy(&vq->vq_freedesc_lock); mutex_destroy(&vq->vq_uring_lock); mutex_destroy(&vq->vq_aring_lock); memset(vq, 0, sizeof(*vq)); return 0; } /* * Free descriptor management. */ static int vq_alloc_slot_locked(struct virtio_softc *sc, struct virtqueue *vq, size_t nslots) { struct vring_desc *vd; uint16_t head, tail; size_t i; KASSERT(mutex_owned(&vq->vq_freedesc_lock)); head = tail = virtio_rw16(sc, vq->vq_free_idx); for (i = 0; i < nslots - 1; i++) { if (tail == VRING_DESC_CHAIN_END) return VRING_DESC_CHAIN_END; vd = &vq->vq_desc[tail]; vd->flags = virtio_rw16(sc, VRING_DESC_F_NEXT); tail = virtio_rw16(sc, vd->next); } if (tail == VRING_DESC_CHAIN_END) return VRING_DESC_CHAIN_END; vd = &vq->vq_desc[tail]; vd->flags = virtio_rw16(sc, 0); vq->vq_free_idx = vd->next; return head; } static uint16_t vq_alloc_slot(struct virtio_softc *sc, struct virtqueue *vq, size_t nslots) { uint16_t rv; mutex_enter(&vq->vq_freedesc_lock); rv = vq_alloc_slot_locked(sc, vq, nslots); mutex_exit(&vq->vq_freedesc_lock); return rv; } static void vq_free_slot(struct virtio_softc *sc, struct virtqueue *vq, uint16_t slot) { struct vring_desc *vd; uint16_t s; mutex_enter(&vq->vq_freedesc_lock); vd = &vq->vq_desc[slot]; while ((vd->flags & virtio_rw16(sc, VRING_DESC_F_NEXT)) != 0) { s = virtio_rw16(sc, vd->next); vd = &vq->vq_desc[s]; } vd->next = vq->vq_free_idx; vq->vq_free_idx = virtio_rw16(sc, slot); mutex_exit(&vq->vq_freedesc_lock); } /* * Enqueue several dmamaps as a single request. */ /* * Typical usage: * <queue size> number of followings are stored in arrays * - command blocks (in dmamem) should be pre-allocated and mapped * - dmamaps for command blocks should be pre-allocated and loaded * - dmamaps for payload should be pre-allocated * r = virtio_enqueue_prep(sc, vq, &slot); // allocate a slot * if (r) // currently 0 or EAGAIN * return r; * r = bus_dmamap_load(dmat, dmamap_payload[slot], data, count, ..); * if (r) { * virtio_enqueue_abort(sc, vq, slot); * return r; * } * r = virtio_enqueue_reserve(sc, vq, slot, * dmamap_payload[slot]->dm_nsegs + 1); * // ^ +1 for command * if (r) { // currently 0 or EAGAIN * bus_dmamap_unload(dmat, dmamap_payload[slot]); * return r; // do not call abort() * } * <setup and prepare commands> * bus_dmamap_sync(dmat, dmamap_cmd[slot],... BUS_DMASYNC_PREWRITE); * bus_dmamap_sync(dmat, dmamap_payload[slot],...); * virtio_enqueue(sc, vq, slot, dmamap_cmd[slot], false); * virtio_enqueue(sc, vq, slot, dmamap_payload[slot], iswrite); * virtio_enqueue_commit(sc, vq, slot, true); */ /* * enqueue_prep: allocate a slot number */ int virtio_enqueue_prep(struct virtio_softc *sc, struct virtqueue *vq, int *slotp) { uint16_t slot; KASSERT(slotp != NULL); slot = vq_alloc_slot(sc, vq, 1); if (slot == VRING_DESC_CHAIN_END) return EAGAIN; *slotp = slot; return 0; } /* * enqueue_reserve: allocate remaining slots and build the descriptor chain. 
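 * If indirect descriptors were negotiated and the request is large enough
 * (nsegs >= MINSEG_INDIRECT), the chain is built inside the slot's private
 * indirect table; otherwise the remaining nsegs - 1 descriptors are taken
 * from the free list of the main descriptor ring and linked to the head slot.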
*/ int virtio_enqueue_reserve(struct virtio_softc *sc, struct virtqueue *vq, int slot, int nsegs) { struct vring_desc *vd; struct vring_desc_extra *vdx; int i; KASSERT(1 <= nsegs); KASSERT(nsegs <= vq->vq_num); vdx = &vq->vq_descx[slot]; vd = &vq->vq_desc[slot]; KASSERT((vd->flags & virtio_rw16(sc, VRING_DESC_F_NEXT)) == 0); if ((vq->vq_indirect != NULL) && (nsegs >= MINSEG_INDIRECT) && (nsegs <= vq->vq_maxnsegs)) vdx->use_indirect = true; else vdx->use_indirect = false; if (vdx->use_indirect) { uint64_t addr; addr = vq->vq_dmamap->dm_segs[0].ds_addr + vq->vq_indirectoffset; addr += sizeof(struct vring_desc) * vq->vq_maxnsegs * slot; vd->addr = virtio_rw64(sc, addr); vd->len = virtio_rw32(sc, sizeof(struct vring_desc) * nsegs); vd->flags = virtio_rw16(sc, VRING_DESC_F_INDIRECT); vd = &vq->vq_indirect[vq->vq_maxnsegs * slot]; vdx->desc_base = vd; vdx->desc_free_idx = 0; for (i = 0; i < nsegs - 1; i++) { vd[i].flags = virtio_rw16(sc, VRING_DESC_F_NEXT); } vd[i].flags = virtio_rw16(sc, 0); } else { if (nsegs > 1) { uint16_t s; s = vq_alloc_slot(sc, vq, nsegs - 1); if (s == VRING_DESC_CHAIN_END) { vq_free_slot(sc, vq, slot); return EAGAIN; } vd->next = virtio_rw16(sc, s); vd->flags = virtio_rw16(sc, VRING_DESC_F_NEXT); } vdx->desc_base = &vq->vq_desc[0]; vdx->desc_free_idx = slot; } return 0; } /* * enqueue: enqueue a single dmamap. */ int virtio_enqueue(struct virtio_softc *sc, struct virtqueue *vq, int slot, bus_dmamap_t dmamap, bool write) { struct vring_desc *vds; struct vring_desc_extra *vdx; uint16_t s; int i; KASSERT(dmamap->dm_nsegs > 0); vdx = &vq->vq_descx[slot]; vds = vdx->desc_base; s = vdx->desc_free_idx; KASSERT(vds != NULL); for (i = 0; i < dmamap->dm_nsegs; i++) { KASSERT(s != VRING_DESC_CHAIN_END); vds[s].addr = virtio_rw64(sc, dmamap->dm_segs[i].ds_addr); vds[s].len = virtio_rw32(sc, dmamap->dm_segs[i].ds_len); if (!write) vds[s].flags |= virtio_rw16(sc, VRING_DESC_F_WRITE); if ((vds[s].flags & virtio_rw16(sc, VRING_DESC_F_NEXT)) == 0) { s = VRING_DESC_CHAIN_END; } else { s = virtio_rw16(sc, vds[s].next); } } vdx->desc_free_idx = s; return 0; } int virtio_enqueue_p(struct virtio_softc *sc, struct virtqueue *vq, int slot, bus_dmamap_t dmamap, bus_addr_t start, bus_size_t len, bool write) { struct vring_desc_extra *vdx; struct vring_desc *vds; uint16_t s; vdx = &vq->vq_descx[slot]; vds = vdx->desc_base; s = vdx->desc_free_idx; KASSERT(s != VRING_DESC_CHAIN_END); KASSERT(vds != NULL); KASSERT(dmamap->dm_nsegs == 1); /* XXX */ KASSERT(dmamap->dm_segs[0].ds_len > start); KASSERT(dmamap->dm_segs[0].ds_len >= start + len); vds[s].addr = virtio_rw64(sc, dmamap->dm_segs[0].ds_addr + start); vds[s].len = virtio_rw32(sc, len); if (!write) vds[s].flags |= virtio_rw16(sc, VRING_DESC_F_WRITE); if ((vds[s].flags & virtio_rw16(sc, VRING_DESC_F_NEXT)) == 0) { s = VRING_DESC_CHAIN_END; } else { s = virtio_rw16(sc, vds[s].next); } vdx->desc_free_idx = s; return 0; } /* * enqueue_commit: add it to the aring. 
*/ int virtio_enqueue_commit(struct virtio_softc *sc, struct virtqueue *vq, int slot, bool notifynow) { if (slot < 0) { mutex_enter(&vq->vq_aring_lock); goto notify; } vq_sync_descs(sc, vq, BUS_DMASYNC_PREWRITE); if (vq->vq_descx[slot].use_indirect) vq_sync_indirect(sc, vq, slot, BUS_DMASYNC_PREWRITE); mutex_enter(&vq->vq_aring_lock); vq->vq_avail->ring[(vq->vq_avail_idx++) % vq->vq_num] = virtio_rw16(sc, slot); notify: if (notifynow) { uint16_t o, n, t; uint16_t flags; o = virtio_rw16(sc, vq->vq_avail->idx) - 1; n = vq->vq_avail_idx; /* * Prepare for `device->CPU' (host->guest) transfer * into the buffer. This must happen before we commit * the vq->vq_avail->idx update to ensure we're not * still using the buffer in case program-prior loads * or stores in it get delayed past the store to * vq->vq_avail->idx. */ vq_sync_uring_all(sc, vq, BUS_DMASYNC_PREREAD); /* ensure payload is published, then avail idx */ vq_sync_aring_payload(sc, vq, BUS_DMASYNC_PREWRITE); vq->vq_avail->idx = virtio_rw16(sc, vq->vq_avail_idx); vq_sync_aring_header(sc, vq, BUS_DMASYNC_PREWRITE); vq->vq_queued++; if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) { vq_sync_uring_avail(sc, vq, BUS_DMASYNC_POSTREAD); t = virtio_rw16(sc, *vq->vq_avail_event) + 1; if ((uint16_t) (n - t) < (uint16_t) (n - o)) sc->sc_ops->kick(sc, vq->vq_index); } else { vq_sync_uring_header(sc, vq, BUS_DMASYNC_POSTREAD); flags = virtio_rw16(sc, vq->vq_used->flags); if (!(flags & VRING_USED_F_NO_NOTIFY)) sc->sc_ops->kick(sc, vq->vq_index); } } mutex_exit(&vq->vq_aring_lock); return 0; } /* * enqueue_abort: rollback. */ int virtio_enqueue_abort(struct virtio_softc *sc, struct virtqueue *vq, int slot) { struct vring_desc_extra *vdx; vdx = &vq->vq_descx[slot]; vdx->desc_free_idx = VRING_DESC_CHAIN_END; vdx->desc_base = NULL; vq_free_slot(sc, vq, slot); return 0; } /* * Dequeue a request. */ /* * dequeue: dequeue a request from uring; dmamap_sync for uring is * already done in the interrupt handler. */ int virtio_dequeue(struct virtio_softc *sc, struct virtqueue *vq, int *slotp, int *lenp) { uint16_t slot, usedidx; if (vq->vq_used_idx == virtio_rw16(sc, vq->vq_used->idx)) return ENOENT; mutex_enter(&vq->vq_uring_lock); usedidx = vq->vq_used_idx++; mutex_exit(&vq->vq_uring_lock); usedidx %= vq->vq_num; slot = virtio_rw32(sc, vq->vq_used->ring[usedidx].id); if (vq->vq_descx[slot].use_indirect) vq_sync_indirect(sc, vq, slot, BUS_DMASYNC_POSTWRITE); if (slotp) *slotp = slot; if (lenp) *lenp = virtio_rw32(sc, vq->vq_used->ring[usedidx].len); return 0; } /* * dequeue_commit: complete dequeue; the slot is recycled for future use. * if you forget to call this the slot will be leaked. */ int virtio_dequeue_commit(struct virtio_softc *sc, struct virtqueue *vq, int slot) { struct vring_desc_extra *vdx; vdx = &vq->vq_descx[slot]; vdx->desc_base = NULL; vdx->desc_free_idx = VRING_DESC_CHAIN_END; vq_free_slot(sc, vq, slot); return 0; } /* * Attach a child, fill all the members. 
*/ void virtio_child_attach_start(struct virtio_softc *sc, device_t child, int ipl, uint64_t req_features, const char *feat_bits) { char buf[1024]; KASSERT(sc->sc_child == NULL); KASSERT(sc->sc_child_state == VIRTIO_NO_CHILD); sc->sc_child = child; sc->sc_ipl = ipl; virtio_negotiate_features(sc, req_features); snprintb(buf, sizeof(buf), feat_bits, sc->sc_active_features); aprint_normal(": features: %s\n", buf); aprint_naive("\n"); } int virtio_child_attach_finish(struct virtio_softc *sc, struct virtqueue *vqs, size_t nvqs, virtio_callback config_change, int req_flags) { size_t i; int r; #ifdef DIAGNOSTIC KASSERT(nvqs > 0); #define VIRTIO_ASSERT_FLAGS (VIRTIO_F_INTR_SOFTINT | VIRTIO_F_INTR_PERVQ) KASSERT((req_flags & VIRTIO_ASSERT_FLAGS) != VIRTIO_ASSERT_FLAGS); #undef VIRTIO_ASSERT_FLAGS for (i = 0; i < nvqs; i++){ KASSERT(vqs[i].vq_index == i); KASSERT(vqs[i].vq_intrhand != NULL); KASSERT(vqs[i].vq_done == NULL || vqs[i].vq_intrhand == virtio_vq_done); } #endif sc->sc_vqs = vqs; sc->sc_nvqs = nvqs; sc->sc_config_change = config_change; sc->sc_intrhand = virtio_vq_intr; sc->sc_flags = req_flags; /* set the vq address */ for (i = 0; i < nvqs; i++) { sc->sc_ops->setup_queue(sc, vqs[i].vq_index, vqs[i].vq_dmamap->dm_segs[0].ds_addr); } r = sc->sc_ops->alloc_interrupts(sc); if (r != 0) { aprint_error_dev(sc->sc_dev, "failed to allocate interrupts\n"); goto fail; } r = sc->sc_ops->setup_interrupts(sc, 0); if (r != 0) { aprint_error_dev(sc->sc_dev, "failed to setup interrupts\n"); goto fail; } KASSERT(sc->sc_soft_ih == NULL); if (sc->sc_flags & VIRTIO_F_INTR_SOFTINT) { u_int flags = SOFTINT_NET; if (sc->sc_flags & VIRTIO_F_INTR_MPSAFE) flags |= SOFTINT_MPSAFE; sc->sc_soft_ih = softint_establish(flags, virtio_soft_intr, sc); if (sc->sc_soft_ih == NULL) { sc->sc_ops->free_interrupts(sc); aprint_error_dev(sc->sc_dev, "failed to establish soft interrupt\n"); goto fail; } } sc->sc_child_state = VIRTIO_CHILD_ATTACH_FINISHED; virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK); return 0; fail: if (sc->sc_soft_ih) { softint_disestablish(sc->sc_soft_ih); sc->sc_soft_ih = NULL; } sc->sc_ops->free_interrupts(sc); virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_FAILED); return 1; } void virtio_child_detach(struct virtio_softc *sc) { /* already detached */ if (sc->sc_child == NULL) return; virtio_device_reset(sc); sc->sc_ops->free_interrupts(sc); if (sc->sc_soft_ih) { softint_disestablish(sc->sc_soft_ih); sc->sc_soft_ih = NULL; } sc->sc_vqs = NULL; sc->sc_child = NULL; } void virtio_child_attach_failed(struct virtio_softc *sc) { virtio_child_detach(sc); virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_FAILED); sc->sc_child_state = VIRTIO_CHILD_ATTACH_FAILED; } bus_dma_tag_t virtio_dmat(struct virtio_softc *sc) { return sc->sc_dmat; } device_t virtio_child(struct virtio_softc *sc) { return sc->sc_child; } int virtio_intrhand(struct virtio_softc *sc) { return (*sc->sc_intrhand)(sc); } uint64_t virtio_features(struct virtio_softc *sc) { return sc->sc_active_features; } int virtio_attach_failed(struct virtio_softc *sc) { device_t self = sc->sc_dev; /* no error if its not connected, but its failed */ if (sc->sc_childdevid == 0) return 1; if (sc->sc_child == NULL) { switch (sc->sc_child_state) { case VIRTIO_CHILD_ATTACH_FAILED: aprint_error_dev(self, "virtio configuration failed\n"); break; case VIRTIO_NO_CHILD: aprint_error_dev(self, "no matching child driver; not configured\n"); break; default: /* sanity check */ aprint_error_dev(self, "virtio internal error, " "child driver is not configured\n"); 
break; } return 1; } /* sanity check */ if (sc->sc_child_state != VIRTIO_CHILD_ATTACH_FINISHED) { aprint_error_dev(self, "virtio internal error, child driver " "signaled OK but didn't initialize interrupts\n"); return 1; } return 0; } void virtio_print_device_type(device_t self, int id, int revision) { aprint_normal_dev(self, "%s device (id %d, rev. 0x%02x)\n", (id < NDEVNAMES ? virtio_device_name[id] : "Unknown"), id, revision); } MODULE(MODULE_CLASS_DRIVER, virtio, NULL); #ifdef _MODULE #include "ioconf.c" #endif static int virtio_modcmd(modcmd_t cmd, void *opaque) { int error = 0; #ifdef _MODULE switch (cmd) { case MODULE_CMD_INIT: error = config_init_component(cfdriver_ioconf_virtio, cfattach_ioconf_virtio, cfdata_ioconf_virtio); break; case MODULE_CMD_FINI: error = config_fini_component(cfdriver_ioconf_virtio, cfattach_ioconf_virtio, cfdata_ioconf_virtio); break; default: error = ENOTTY; break; } #endif return error; }
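/*
 * Illustrative sketch, not part of the original source: the typical
 * driver-side sequence for the virtqueue API implemented above, mirroring
 * what vioscsi(4) below does.  The names "example_submit" and
 * "example_drain" are hypothetical and error handling is abbreviated.
 */
#if 0	/* example only */
static int
example_submit(struct virtio_softc *vsc, struct virtqueue *vq,
    bus_dmamap_t map, bool write)
{
	int slot, error;

	/* 1. Grab a free descriptor slot. */
	error = virtio_enqueue_prep(vsc, vq, &slot);
	if (error)
		return error;		/* no free slot */

	/* 2. Reserve a descriptor for every DMA segment. */
	error = virtio_enqueue_reserve(vsc, vq, slot, map->dm_nsegs);
	if (error)
		return error;		/* slot was already released */

	/* 3. Fill the descriptors and publish them on the avail ring. */
	virtio_enqueue(vsc, vq, slot, map, write);
	virtio_enqueue_commit(vsc, vq, slot, true);	/* kick if needed */
	return 0;
}

static void
example_drain(struct virtio_softc *vsc, struct virtqueue *vq)
{
	int slot, len;

	/* 4. Pull completions off the used ring and recycle each slot. */
	while (virtio_dequeue(vsc, vq, &slot, &len) == 0)
		virtio_dequeue_commit(vsc, vq, slot);
}
#endif	/* example only */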
/* $NetBSD: vioscsi.c,v 1.36 2023/03/25 11:04:34 mlelstv Exp $ */ /* $OpenBSD: vioscsi.c,v 1.3 2015/03/14 03:38:49 jsg Exp $ */ /* * Copyright (c) 2013 Google Inc. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vioscsi.c,v 1.36 2023/03/25 11:04:34 mlelstv Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/device.h> #include <sys/bus.h> #include <sys/buf.h> #include <sys/module.h> #include <dev/pci/vioscsireg.h> #include <dev/pci/virtiovar.h> #include <dev/scsipi/scsi_all.h> #include <dev/scsipi/scsiconf.h> #ifdef VIOSCSI_DEBUG static int vioscsi_debug = 1; #define DPRINTF(f) do { if (vioscsi_debug) printf f; } while (/*CONSTCOND*/0) #else #define DPRINTF(f) ((void)0) #endif struct vioscsi_req { struct virtio_scsi_req_hdr vr_req; struct virtio_scsi_res_hdr vr_res; struct scsipi_xfer *vr_xs; bus_dmamap_t vr_control; bus_dmamap_t vr_data; }; struct vioscsi_softc { device_t sc_dev; struct scsipi_adapter sc_adapter; struct scsipi_channel sc_channel; struct virtqueue sc_vqs[3]; #define VIOSCSI_VQ_CONTROL 0 #define VIOSCSI_VQ_EVENT 1 #define VIOSCSI_VQ_REQUEST 2 struct vioscsi_req *sc_reqs; int sc_nreqs; bus_dma_segment_t sc_reqs_segs[1]; u_int32_t sc_seg_max; kmutex_t sc_mutex; }; /* * Each block request uses at least two segments - one for the header * and one for the status. */ #define VIRTIO_SCSI_MIN_SEGMENTS 2 static int vioscsi_match(device_t, cfdata_t, void *); static void vioscsi_attach(device_t, device_t, void *); static int vioscsi_detach(device_t, int); static int vioscsi_alloc_reqs(struct vioscsi_softc *, struct virtio_softc *, int); static void vioscsi_free_reqs(struct vioscsi_softc *, struct virtio_softc *); static void vioscsi_scsipi_request(struct scsipi_channel *, scsipi_adapter_req_t, void *); static int vioscsi_vq_done(struct virtqueue *); static void vioscsi_req_done(struct vioscsi_softc *, struct virtio_softc *, struct vioscsi_req *, struct virtqueue *, int); static struct vioscsi_req *vioscsi_req_get(struct vioscsi_softc *); static void vioscsi_bad_target(struct scsipi_xfer *); static const char *const vioscsi_vq_names[] = { "control", "event", "request", }; CFATTACH_DECL3_NEW(vioscsi, sizeof(struct vioscsi_softc), vioscsi_match, vioscsi_attach, vioscsi_detach, NULL, NULL, NULL, DVF_DETACH_SHUTDOWN); static int vioscsi_match(device_t parent, cfdata_t match, void *aux) { struct virtio_attach_args *va = aux; if (va->sc_childdevid == VIRTIO_DEVICE_ID_SCSI) return 1; return 0; } static void vioscsi_attach(device_t parent, device_t self, void *aux) { struct vioscsi_softc *sc = device_private(self); struct virtio_softc *vsc = device_private(parent); struct scsipi_adapter *adapt = &sc->sc_adapter; struct scsipi_channel *chan = &sc->sc_channel; int rv, qsize = 0, i = 0; int ipl = IPL_BIO; if (virtio_child(vsc) != NULL) { aprint_error(": parent %s already has a child\n", device_xname(parent)); return; } sc->sc_dev = self; virtio_child_attach_start(vsc, self, ipl, 0, VIRTIO_COMMON_FLAG_BITS); mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, ipl); uint32_t cmd_per_lun = virtio_read_device_config_4(vsc, VIRTIO_SCSI_CONFIG_CMD_PER_LUN); uint32_t seg_max = virtio_read_device_config_4(vsc, VIRTIO_SCSI_CONFIG_SEG_MAX); uint16_t max_target = virtio_read_device_config_2(vsc, VIRTIO_SCSI_CONFIG_MAX_TARGET); uint32_t max_lun = virtio_read_device_config_4(vsc, VIRTIO_SCSI_CONFIG_MAX_LUN); sc->sc_seg_max = seg_max; 
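	/* Allocate the control, event and request virtqueues (see vioscsi_vq_names[]); each request may need VIRTIO_SCSI_MIN_SEGMENTS descriptors for its header and status plus howmany(MAXPHYS, NBPG) for data. */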
for(i=0; i < __arraycount(sc->sc_vqs); i++) { virtio_init_vq_vqdone(vsc, &sc->sc_vqs[i], i, vioscsi_vq_done); rv = virtio_alloc_vq(vsc, &sc->sc_vqs[i], MAXPHYS, VIRTIO_SCSI_MIN_SEGMENTS + howmany(MAXPHYS, NBPG), vioscsi_vq_names[i]); if (rv) { aprint_error_dev(sc->sc_dev, "failed to allocate virtqueue %d\n", i); goto err; } if (i == VIOSCSI_VQ_REQUEST) sc->sc_vqs[i].vq_done = vioscsi_vq_done; } qsize = sc->sc_vqs[VIOSCSI_VQ_REQUEST].vq_num; if (vioscsi_alloc_reqs(sc, vsc, qsize)) goto err; aprint_normal_dev(sc->sc_dev, "cmd_per_lun %u qsize %d seg_max %u max_target %hu" " max_lun %u\n", cmd_per_lun, qsize, seg_max, max_target, max_lun); if (virtio_child_attach_finish(vsc, sc->sc_vqs, __arraycount(sc->sc_vqs), NULL, VIRTIO_F_INTR_MSIX | VIRTIO_F_INTR_MPSAFE) != 0) goto err; /* * Fill in the scsipi_adapter. */ memset(adapt, 0, sizeof(*adapt)); adapt->adapt_dev = sc->sc_dev; adapt->adapt_nchannels = 1; adapt->adapt_openings = MIN(qsize, cmd_per_lun); adapt->adapt_max_periph = adapt->adapt_openings; adapt->adapt_request = vioscsi_scsipi_request; adapt->adapt_minphys = minphys; adapt->adapt_flags = SCSIPI_ADAPT_MPSAFE; /* * Fill in the scsipi_channel. */ memset(chan, 0, sizeof(*chan)); chan->chan_adapter = adapt; chan->chan_bustype = &scsi_bustype; chan->chan_channel = 0; chan->chan_ntargets = MIN(1 + max_target, 256); /* cap reasonably */ chan->chan_nluns = MIN(1 + max_lun, 16384); /* cap reasonably */ chan->chan_id = max_target + 1; chan->chan_flags = SCSIPI_CHAN_NOSETTLE; config_found(self, &sc->sc_channel, scsiprint, CFARGS_NONE); return; err: if (qsize > 0) vioscsi_free_reqs(sc, vsc); for (i=0; i < __arraycount(sc->sc_vqs); i++) { virtio_free_vq(vsc, &sc->sc_vqs[i]); } virtio_child_attach_failed(vsc); } static int vioscsi_detach(device_t self, int flags) { struct vioscsi_softc *sc = device_private(self); struct virtio_softc *vsc = device_private(device_parent(sc->sc_dev)); int rc, i; /* * Dequeue all pending finished requests. Must be done * before we try to detach children so that we process * their pending requests while they still exist. */ if (sc->sc_vqs[VIOSCSI_VQ_REQUEST].vq_num > 0) vioscsi_vq_done(&sc->sc_vqs[VIOSCSI_VQ_REQUEST]); if ((rc = config_detach_children(self, flags)) != 0) return rc; virtio_reset(vsc); for (i = 0; i < __arraycount(sc->sc_vqs); i++) { if (sc->sc_vqs[i].vq_num > 0) virtio_free_vq(vsc, &sc->sc_vqs[i]); } vioscsi_free_reqs(sc, vsc); virtio_child_detach(vsc); mutex_destroy(&sc->sc_mutex); return 0; } #define XS2DMA(xs) \ ((((xs)->xs_control & XS_CTL_DATA_IN) ? BUS_DMA_READ : BUS_DMA_WRITE) | \ (((xs)->xs_control & XS_CTL_NOSLEEP) ? BUS_DMA_NOWAIT : BUS_DMA_WAITOK) | \ BUS_DMA_STREAMING) #define XS2DMAPRE(xs) (((xs)->xs_control & XS_CTL_DATA_IN) ? \ BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE) #define XS2DMAPOST(xs) (((xs)->xs_control & XS_CTL_DATA_IN) ? 
\ BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE) static void vioscsi_scsipi_request(struct scsipi_channel *chan, scsipi_adapter_req_t request, void *arg) { struct vioscsi_softc *sc = device_private(chan->chan_adapter->adapt_dev); struct virtio_softc *vsc = device_private(device_parent(sc->sc_dev)); struct scsipi_xfer *xs; struct scsipi_periph *periph; struct vioscsi_req *vr; struct virtio_scsi_req_hdr *req; struct virtqueue *vq = &sc->sc_vqs[VIOSCSI_VQ_REQUEST]; int slot, error; bool dopoll; DPRINTF(("%s: enter\n", __func__)); switch (request) { case ADAPTER_REQ_RUN_XFER: break; case ADAPTER_REQ_SET_XFER_MODE: { struct scsipi_xfer_mode *xm = arg; xm->xm_mode = PERIPH_CAP_TQING; xm->xm_period = 0; xm->xm_offset = 0; scsipi_async_event(chan, ASYNC_EVENT_XFER_MODE, xm); return; } default: DPRINTF(("%s: unhandled %d\n", __func__, request)); return; } xs = arg; periph = xs->xs_periph; /* * This can happen when we run out of queue slots. */ vr = vioscsi_req_get(sc); if (vr == NULL) { xs->error = XS_BUSY; scsipi_done(xs); return; } req = &vr->vr_req; slot = vr - sc->sc_reqs; /* * "The only supported format for the LUN field is: first byte set to * 1, second byte set to target, third and fourth byte representing a * single level LUN structure, followed by four zero bytes." */ if (periph->periph_target >= 256 || periph->periph_lun >= 16384 || periph->periph_target < 0 || periph->periph_lun < 0) { goto stuffup; } req->lun[0] = 1; req->lun[1] = periph->periph_target; req->lun[2] = 0x40 | ((periph->periph_lun >> 8) & 0x3F); req->lun[3] = periph->periph_lun & 0xFF; memset(req->lun + 4, 0, 4); DPRINTF(("%s: command %p for %d:%d at slot %d\n", __func__, xs, periph->periph_target, periph->periph_lun, slot)); /* tag */ switch (XS_CTL_TAGTYPE(xs)) { case XS_CTL_HEAD_TAG: req->task_attr = VIRTIO_SCSI_S_HEAD; break; #if 0 /* XXX */ case XS_CTL_ACA_TAG: req->task_attr = VIRTIO_SCSI_S_ACA; break; #endif case XS_CTL_ORDERED_TAG: req->task_attr = VIRTIO_SCSI_S_ORDERED; break; case XS_CTL_SIMPLE_TAG: default: req->task_attr = VIRTIO_SCSI_S_SIMPLE; break; } req->id = virtio_rw64(vsc, slot); if ((size_t)xs->cmdlen > sizeof(req->cdb)) { DPRINTF(("%s: bad cmdlen %zu > %zu\n", __func__, (size_t)xs->cmdlen, sizeof(req->cdb))); goto stuffup; } memset(req->cdb, 0, sizeof(req->cdb)); memcpy(req->cdb, xs->cmd, xs->cmdlen); error = bus_dmamap_load(virtio_dmat(vsc), vr->vr_data, xs->data, xs->datalen, NULL, XS2DMA(xs)); if (error) { aprint_error_dev(sc->sc_dev, "%s: error %d loading DMA map\n", __func__, error); if (error == ENOMEM || error == EAGAIN) { /* * Map is allocated with ALLOCNOW, so this should * actually never ever happen. 
*/ xs->error = XS_RESOURCE_SHORTAGE; } else { stuffup: /* not a temporary condition */ xs->error = XS_DRIVER_STUFFUP; } virtio_enqueue_abort(vsc, vq, slot); scsipi_done(xs); return; } int nsegs = VIRTIO_SCSI_MIN_SEGMENTS; if ((xs->xs_control & (XS_CTL_DATA_IN|XS_CTL_DATA_OUT)) != 0) nsegs += vr->vr_data->dm_nsegs; error = virtio_enqueue_reserve(vsc, vq, slot, nsegs); if (error) { bus_dmamap_unload(virtio_dmat(vsc), vr->vr_data); /* slot already freed by virtio_enqueue_reserve() */ xs->error = XS_BUSY; scsipi_done(xs); return; } vr->vr_xs = xs; bus_dmamap_sync(virtio_dmat(vsc), vr->vr_control, offsetof(struct vioscsi_req, vr_req), sizeof(struct virtio_scsi_req_hdr), BUS_DMASYNC_PREWRITE); bus_dmamap_sync(virtio_dmat(vsc), vr->vr_control, offsetof(struct vioscsi_req, vr_res), sizeof(struct virtio_scsi_res_hdr), BUS_DMASYNC_PREREAD); if ((xs->xs_control & (XS_CTL_DATA_IN|XS_CTL_DATA_OUT)) != 0) bus_dmamap_sync(virtio_dmat(vsc), vr->vr_data, 0, xs->datalen, XS2DMAPRE(xs)); virtio_enqueue_p(vsc, vq, slot, vr->vr_control, offsetof(struct vioscsi_req, vr_req), sizeof(struct virtio_scsi_req_hdr), 1); if (xs->xs_control & XS_CTL_DATA_OUT) virtio_enqueue(vsc, vq, slot, vr->vr_data, 1); virtio_enqueue_p(vsc, vq, slot, vr->vr_control, offsetof(struct vioscsi_req, vr_res), sizeof(struct virtio_scsi_res_hdr), 0); if (xs->xs_control & XS_CTL_DATA_IN) virtio_enqueue(vsc, vq, slot, vr->vr_data, 0); dopoll = (xs->xs_control & XS_CTL_POLL) != 0; virtio_enqueue_commit(vsc, vq, slot, 1); if (!dopoll) return; DPRINTF(("%s: polling...\n", __func__)); // XXX: do this better. int timeout = 1000; do { virtio_intrhand(vsc); if (vr->vr_xs != xs) break; delay(1000); } while (--timeout > 0); if (vr->vr_xs == xs) { // XXX: Abort! xs->error = XS_TIMEOUT; xs->resid = xs->datalen; DPRINTF(("%s: polling timeout\n", __func__)); scsipi_done(xs); } DPRINTF(("%s: command %p done (timeout=%d)\n", __func__, xs, timeout)); } static void vioscsi_req_done(struct vioscsi_softc *sc, struct virtio_softc *vsc, struct vioscsi_req *vr, struct virtqueue *vq, int slot) { struct scsipi_xfer *xs = vr->vr_xs; size_t sense_len; DPRINTF(("%s: enter\n", __func__)); bus_dmamap_sync(virtio_dmat(vsc), vr->vr_control, offsetof(struct vioscsi_req, vr_req), sizeof(struct virtio_scsi_req_hdr), BUS_DMASYNC_POSTWRITE); bus_dmamap_sync(virtio_dmat(vsc), vr->vr_control, offsetof(struct vioscsi_req, vr_res), sizeof(struct virtio_scsi_res_hdr), BUS_DMASYNC_POSTREAD); if (xs->datalen) bus_dmamap_sync(virtio_dmat(vsc), vr->vr_data, 0, xs->datalen, XS2DMAPOST(xs)); xs->status = vr->vr_res.status; xs->resid = virtio_rw32(vsc, vr->vr_res.residual); switch (vr->vr_res.response) { case VIRTIO_SCSI_S_OK: sense_len = MIN(sizeof(xs->sense), virtio_rw32(vsc, vr->vr_res.sense_len)); memcpy(&xs->sense, vr->vr_res.sense, sense_len); xs->error = (sense_len == 0) ? 
XS_NOERROR : XS_SENSE; break; case VIRTIO_SCSI_S_BAD_TARGET: vioscsi_bad_target(xs); break; default: DPRINTF(("%s: stuffup: %d\n", __func__, vr->vr_res.response)); xs->error = XS_DRIVER_STUFFUP; xs->resid = xs->datalen; break; } DPRINTF(("%s: command %p done %d, %d, %d\n", __func__, xs, xs->error, xs->status, xs->resid)); bus_dmamap_unload(virtio_dmat(vsc), vr->vr_data); vr->vr_xs = NULL; virtio_dequeue_commit(vsc, vq, slot); mutex_exit(&sc->sc_mutex); scsipi_done(xs); mutex_enter(&sc->sc_mutex); } static void vioscsi_bad_target(struct scsipi_xfer *xs) { struct scsi_sense_data *sense = &xs->sense.scsi_sense; DPRINTF(("%s: bad target %d:%d\n", __func__, xs->xs_periph->periph_target, xs->xs_periph->periph_lun)); memset(sense, 0, sizeof(*sense)); sense->response_code = 0x70; sense->flags = SKEY_ILLEGAL_REQUEST; xs->error = XS_SENSE; xs->status = 0; xs->resid = 0; } static int vioscsi_vq_done(struct virtqueue *vq) { struct virtio_softc *vsc = vq->vq_owner; struct vioscsi_softc *sc = device_private(virtio_child(vsc)); int ret = 0; DPRINTF(("%s: enter %d\n", __func__, vq->vq_index)); mutex_enter(&sc->sc_mutex); for (;;) { int r, slot; r = virtio_dequeue(vsc, vq, &slot, NULL); if (r != 0) break; DPRINTF(("%s: slot=%d\n", __func__, slot)); vioscsi_req_done(sc, vsc, &sc->sc_reqs[slot], vq, slot); ret = 1; } mutex_exit(&sc->sc_mutex); DPRINTF(("%s: exit %d: %d\n", __func__, vq->vq_index, ret)); return ret; } static struct vioscsi_req * vioscsi_req_get(struct vioscsi_softc *sc) { struct virtio_softc *vsc = device_private(device_parent(sc->sc_dev)); struct virtqueue *vq = &sc->sc_vqs[VIOSCSI_VQ_REQUEST]; struct vioscsi_req *vr = NULL; int r, slot; mutex_enter(&sc->sc_mutex); if ((r = virtio_enqueue_prep(vsc, vq, &slot)) != 0) { DPRINTF(("%s: virtio_enqueue_get error %d\n", __func__, r)); goto out; } KASSERT(slot < sc->sc_nreqs); vr = &sc->sc_reqs[slot]; DPRINTF(("%s: %p, %d\n", __func__, vr, slot)); out: mutex_exit(&sc->sc_mutex); return vr; } static int vioscsi_alloc_reqs(struct vioscsi_softc *sc, struct virtio_softc *vsc, int qsize) { size_t allocsize; int r, rsegs, slot; void *vaddr; struct vioscsi_req *vr; allocsize = qsize * sizeof(struct vioscsi_req); r = bus_dmamem_alloc(virtio_dmat(vsc), allocsize, 0, 0, &sc->sc_reqs_segs[0], 1, &rsegs, BUS_DMA_NOWAIT); if (r != 0) { aprint_error_dev(sc->sc_dev, "%s: bus_dmamem_alloc, size %zu, error %d\n", __func__, allocsize, r); return r; } r = bus_dmamem_map(virtio_dmat(vsc), &sc->sc_reqs_segs[0], 1, allocsize, &vaddr, BUS_DMA_NOWAIT); if (r != 0) { aprint_error_dev(sc->sc_dev, "%s: bus_dmamem_map failed, error %d\n", __func__, r); bus_dmamem_free(virtio_dmat(vsc), &sc->sc_reqs_segs[0], 1); return r; } memset(vaddr, 0, allocsize); sc->sc_reqs = vaddr; sc->sc_nreqs = qsize; /* Prepare maps for the requests */ for (slot=0; slot < qsize; slot++) { vr = &sc->sc_reqs[slot]; r = bus_dmamap_create(virtio_dmat(vsc), offsetof(struct vioscsi_req, vr_xs), 1, offsetof(struct vioscsi_req, vr_xs), 0, BUS_DMA_NOWAIT|BUS_DMA_ALLOCNOW, &vr->vr_control); if (r != 0) { aprint_error_dev(sc->sc_dev, "%s: bus_dmamap_create ctrl failed, error %d\n", __func__, r); goto cleanup; } r = bus_dmamap_create(virtio_dmat(vsc), MAXPHYS, sc->sc_seg_max, MAXPHYS, 0, BUS_DMA_NOWAIT|BUS_DMA_ALLOCNOW, &vr->vr_data); if (r != 0) { aprint_error_dev(sc->sc_dev, "%s: bus_dmamap_create data failed, error %d\n", __func__, r); goto cleanup; } r = bus_dmamap_load(virtio_dmat(vsc), vr->vr_control, vr, offsetof(struct vioscsi_req, vr_xs), NULL, BUS_DMA_NOWAIT); if (r != 0) { aprint_error_dev(sc->sc_dev, 
"%s: bus_dmamap_load ctrl error %d\n", __func__, r); goto cleanup; } } return 0; cleanup: for (; slot > 0; slot--) { vr = &sc->sc_reqs[slot]; if (vr->vr_control) { /* this will also unload the mapping if loaded */ bus_dmamap_destroy(virtio_dmat(vsc), vr->vr_control); vr->vr_control = NULL; } if (vr->vr_data) { bus_dmamap_destroy(virtio_dmat(vsc), vr->vr_data); vr->vr_data = NULL; } } bus_dmamem_unmap(virtio_dmat(vsc), vaddr, allocsize); bus_dmamem_free(virtio_dmat(vsc), &sc->sc_reqs_segs[0], 1); return r; } static void vioscsi_free_reqs(struct vioscsi_softc *sc, struct virtio_softc *vsc) { int slot; struct vioscsi_req *vr; if (sc->sc_nreqs == 0) { /* Not allocated */ return; } /* Free request maps */ for (slot=0; slot < sc->sc_nreqs; slot++) { vr = &sc->sc_reqs[slot]; bus_dmamap_destroy(virtio_dmat(vsc), vr->vr_control); bus_dmamap_destroy(virtio_dmat(vsc), vr->vr_data); } bus_dmamem_unmap(virtio_dmat(vsc), sc->sc_reqs, sc->sc_nreqs * sizeof(struct vioscsi_req)); bus_dmamem_free(virtio_dmat(vsc), &sc->sc_reqs_segs[0], 1); } MODULE(MODULE_CLASS_DRIVER, vioscsi, "virtio"); #ifdef _MODULE #include "ioconf.c" #endif static int vioscsi_modcmd(modcmd_t cmd, void *opaque) { int error = 0; #ifdef _MODULE switch (cmd) { case MODULE_CMD_INIT: error = config_init_component(cfdriver_ioconf_vioscsi, cfattach_ioconf_vioscsi, cfdata_ioconf_vioscsi); break; case MODULE_CMD_FINI: error = config_fini_component(cfdriver_ioconf_vioscsi, cfattach_ioconf_vioscsi, cfdata_ioconf_vioscsi); break; default: error = ENOTTY; break; } #endif return error; }
/* $NetBSD: sys_socket.c,v 1.81 2023/04/22 13:53:02 riastradh Exp $ */ /*- * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_socket.c 8.3 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sys_socket.c,v 1.81 2023/04/22 13:53:02 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/systm.h> #include <sys/file.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/ioctl.h> #include <sys/stat.h> #include <sys/poll.h> #include <sys/proc.h> #include <sys/kauth.h> #include <net/if.h> #include <net/route.h> static int soo_fpathconf(struct file *, int, register_t *); static int soo_posix_fadvise(struct file *, off_t, off_t, int); const struct fileops socketops = { .fo_name = "socket", .fo_read = soo_read, .fo_write = soo_write, .fo_ioctl = soo_ioctl, .fo_fcntl = fnullop_fcntl, .fo_poll = soo_poll, .fo_stat = soo_stat, .fo_close = soo_close, .fo_kqfilter = soo_kqfilter, .fo_restart = soo_restart, .fo_fpathconf = soo_fpathconf, .fo_posix_fadvise = soo_posix_fadvise, }; int (*ifioctl)(struct socket *, u_long, void *, struct lwp *) = (void *)eopnotsupp; /* ARGSUSED */ int soo_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, int flags) { struct socket *so = fp->f_socket; int error; error = (*so->so_receive)(so, NULL, uio, NULL, NULL, NULL); return error; } /* ARGSUSED */ int soo_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, int flags) { struct socket *so = fp->f_socket; int error; error = (*so->so_send)(so, NULL, uio, NULL, NULL, 0, curlwp); return error; } int soo_ioctl(file_t *fp, u_long cmd, void *data) { struct socket *so = fp->f_socket; int error = 0; switch (cmd) { case FIONBIO: solock(so); if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; sounlock(so); break; case FIOASYNC: solock(so); if (*(int *)data) { so->so_rcv.sb_flags |= SB_ASYNC; so->so_snd.sb_flags |= SB_ASYNC; } else { so->so_rcv.sb_flags &= ~SB_ASYNC; so->so_snd.sb_flags &= ~SB_ASYNC; } sounlock(so); break; case FIONREAD: *(int *)data = so->so_rcv.sb_cc; break; case FIONWRITE: *(int *)data = so->so_snd.sb_cc; break; case FIONSPACE: /* * See the comment around sbspace()'s definition * in sys/socketvar.h in face of counts about maximum * to understand the following test. We detect overflow * and return zero. 
*/ solock(so); if ((so->so_snd.sb_hiwat < so->so_snd.sb_cc) || (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) *(int *)data = 0; else *(int *)data = sbspace(&so->so_snd); sounlock(so); break; case SIOCSPGRP: case FIOSETOWN: case TIOCSPGRP: error = fsetown(&so->so_pgid, cmd, data); break; case SIOCGPGRP: case FIOGETOWN: case TIOCGPGRP: error = fgetown(so->so_pgid, cmd, data); break; case SIOCATMARK: *(int *)data = (so->so_state&SS_RCVATMARK) != 0; break; case SIOCPEELOFF: solock(so); error = do_sys_peeloff(so, data); sounlock(so); break; default: /* * Interface/routing/protocol specific ioctls: * interface and routing ioctls should have a * different entry since a socket's unnecessary */ if (IOCGROUP(cmd) == 'i') /* * KERNEL_LOCK will be held later if if_ioctl() of the * interface isn't MP-safe. */ error = ifioctl(so, cmd, data, curlwp); else { KERNEL_LOCK(1, NULL); error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so, cmd, data, NULL); KERNEL_UNLOCK_ONE(NULL); } break; } return error; } int soo_poll(file_t *fp, int events) { return sopoll(fp->f_socket, events); } int soo_stat(file_t *fp, struct stat *ub) { struct socket *so = fp->f_socket; int error; memset(ub, 0, sizeof(*ub)); ub->st_mode = S_IFSOCK; solock(so); error = (*so->so_proto->pr_usrreqs->pr_stat)(so, ub); sounlock(so); return error; } /* ARGSUSED */ int soo_close(file_t *fp) { int error = 0; if (fp->f_socket) error = soclose(fp->f_socket); fp->f_socket = NULL; return error; } void soo_restart(file_t *fp) { sorestart(fp->f_socket); } static int soo_fpathconf(struct file *fp, int name, register_t *retval) { switch (name) { case _PC_PIPE_BUF: *retval = PIPE_BUF; return 0; default: return EINVAL; } } static int soo_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice) { return ESPIPE; }
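/*
 * Illustrative sketch, not part of the original source: how the soo_ioctl()
 * cases above are reached from userland.  "s" is assumed to be an open
 * socket descriptor; error checking is omitted.
 */
#if 0	/* example only */
	int on = 1, nread;

	ioctl(s, FIONBIO, &on);		/* toggles SS_NBIO */
	ioctl(s, FIOASYNC, &on);	/* sets SB_ASYNC on both buffers */
	ioctl(s, FIONREAD, &nread);	/* bytes pending in so_rcv */
#endif	/* example only */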
/* $NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $ */ /*- * Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_dtrace.h" #include "opt_gprof.h" #include "opt_multiprocessor.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/signalvar.h> #include <sys/sysctl.h> #include <sys/timex.h> #include <sys/sched.h> #include <sys/time.h> #include <sys/timetc.h> #include <sys/cpu.h> #include <sys/atomic.h> #include <sys/rndsource.h> #include <sys/heartbeat.h> #ifdef GPROF #include <sys/gmon.h> #endif #ifdef KDTRACE_HOOKS #include <sys/dtrace_bsd.h> #include <sys/cpu.h> cyclic_clock_func_t cyclic_clock_func[MAXCPUS]; #endif static int sysctl_kern_clockrate(SYSCTLFN_PROTO); /* * Clock handling routines. * * This code is written to operate with two timers that run independently of * each other. The main clock, running hz times per second, is used to keep * track of real time. The second timer handles kernel and user profiling, * and does resource use estimation. 
If the second timer is programmable, * it is randomized to avoid aliasing between the two clocks. For example, * the randomization prevents an adversary from always giving up the CPU * just before its quantum expires. Otherwise, it would never accumulate * CPU ticks. The mean frequency of the second timer is stathz. * * If no second timer exists, stathz will be zero; in this case we drive * profiling and statistics off the main clock. This WILL NOT be accurate; * do not do it unless absolutely necessary. * * The statistics clock may (or may not) be run at a higher rate while * profiling. This profile clock runs at profhz. We require that profhz * be an integral multiple of stathz. * * If the statistics clock is running fast, it must be divided by the ratio * profhz/stathz for statistics. (For profiling, every tick counts.) */ int stathz; int profhz; int profsrc; int schedhz; int profprocs; static int hardclock_ticks; static int hardscheddiv; /* hard => sched divider (used if schedhz == 0) */ static int psdiv; /* prof => stat divider */ int psratio; /* ratio: prof / stat */ struct clockrnd { struct krndsource source; unsigned needed; }; static struct clockrnd hardclockrnd __aligned(COHERENCY_UNIT); static struct clockrnd statclockrnd __aligned(COHERENCY_UNIT); static void clockrnd_get(size_t needed, void *cookie) { struct clockrnd *C = cookie; /* Start sampling. */ atomic_store_relaxed(&C->needed, 2*NBBY*needed); } static void clockrnd_sample(struct clockrnd *C) { struct cpu_info *ci = curcpu(); /* If there's nothing needed right now, stop here. */ if (__predict_true(atomic_load_relaxed(&C->needed) == 0)) return; /* * If we're not the primary core of a package, we're probably * driven by the same clock as the primary core, so don't * bother. */ if (ci != ci->ci_package1st) return; /* Take a sample and enter it into the pool. */ rnd_add_uint32(&C->source, 0); /* * On the primary CPU, count down. Using an atomic decrement * here isn't really necessary -- on every platform we care * about, stores to unsigned int are atomic, and the only other * memory operation that could happen here is for another CPU * to store a higher value for needed. But using an atomic * decrement avoids giving the impression of data races, and is * unlikely to hurt because only one CPU will ever be writing * to the location. */ if (CPU_IS_PRIMARY(curcpu())) { unsigned needed __diagused; needed = atomic_dec_uint_nv(&C->needed); KASSERT(needed != UINT_MAX); } } static u_int get_intr_timecount(struct timecounter *); static struct timecounter intr_timecounter = { .tc_get_timecount = get_intr_timecount, .tc_poll_pps = NULL, .tc_counter_mask = ~0u, .tc_frequency = 0, .tc_name = "clockinterrupt", /* quality - minimum implementation level for a clock */ .tc_quality = 0, .tc_priv = NULL, }; static u_int get_intr_timecount(struct timecounter *tc) { return (u_int)getticks(); } int getticks(void) { return atomic_load_relaxed(&hardclock_ticks); } /* * Initialize clock frequencies and start both clocks running. */ void initclocks(void) { static struct sysctllog *clog; int i; /* * Set divisors to 1 (normal case) and let the machine-specific * code do its bit. */ psdiv = 1; /* * Call cpu_initclocks() before registering the default * timecounter, in case it needs to adjust hz. */ const int old_hz = hz; cpu_initclocks(); if (old_hz != hz) { tick = 1000000 / hz; tickadj = (240000 / (60 * hz)) ? 
(240000 / (60 * hz)) : 1; } /* * provide minimum default time counter * will only run at interrupt resolution */ intr_timecounter.tc_frequency = hz; tc_init(&intr_timecounter); /* * Compute profhz and stathz, fix profhz if needed. */ i = stathz ? stathz : hz; if (profhz == 0) profhz = i; psratio = profhz / i; if (schedhz == 0) { /* 16Hz is best */ hardscheddiv = hz / 16; if (hardscheddiv <= 0) panic("hardscheddiv"); } sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "clockrate", SYSCTL_DESCR("Kernel clock rates"), sysctl_kern_clockrate, 0, NULL, sizeof(struct clockinfo), CTL_KERN, KERN_CLOCKRATE, CTL_EOL); sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "hardclock_ticks", SYSCTL_DESCR("Number of hardclock ticks"), NULL, 0, &hardclock_ticks, sizeof(hardclock_ticks), CTL_KERN, KERN_HARDCLOCK_TICKS, CTL_EOL); rndsource_setcb(&hardclockrnd.source, clockrnd_get, &hardclockrnd); rnd_attach_source(&hardclockrnd.source, "hardclock", RND_TYPE_SKEW, RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME|RND_FLAG_HASCB); if (stathz) { rndsource_setcb(&statclockrnd.source, clockrnd_get, &statclockrnd); rnd_attach_source(&statclockrnd.source, "statclock", RND_TYPE_SKEW, (RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME| RND_FLAG_HASCB)); } } /* * The real-time timer, interrupting hz times per second. */ void hardclock(struct clockframe *frame) { struct lwp *l; struct cpu_info *ci; clockrnd_sample(&hardclockrnd); ci = curcpu(); l = ci->ci_onproc; ptimer_tick(l, CLKF_USERMODE(frame)); /* * If no separate statistics clock is available, run it from here. */ if (stathz == 0) statclock(frame); /* * If no separate schedclock is provided, call it here * at about 16 Hz. */ if (schedhz == 0) { if ((int)(--ci->ci_schedstate.spc_schedticks) <= 0) { schedclock(l); ci->ci_schedstate.spc_schedticks = hardscheddiv; } } if ((--ci->ci_schedstate.spc_ticks) <= 0) sched_tick(ci); if (CPU_IS_PRIMARY(ci)) { atomic_store_relaxed(&hardclock_ticks, atomic_load_relaxed(&hardclock_ticks) + 1); tc_ticktock(); } /* * Make sure the CPUs and timecounter are making progress. */ heartbeat(); /* * Update real-time timeout queue. */ callout_hardclock(); } /* * Start profiling on a process. * * Kernel profiling passes proc0 which never exits and hence * keeps the profile clock running constantly. */ void startprofclock(struct proc *p) { KASSERT(mutex_owned(&p->p_stmutex)); if ((p->p_stflag & PST_PROFIL) == 0) { p->p_stflag |= PST_PROFIL; /* * This is only necessary if using the clock as the * profiling source. */ if (++profprocs == 1 && stathz != 0) psdiv = psratio; } } /* * Stop profiling on a process. */ void stopprofclock(struct proc *p) { KASSERT(mutex_owned(&p->p_stmutex)); if (p->p_stflag & PST_PROFIL) { p->p_stflag &= ~PST_PROFIL; /* * This is only necessary if using the clock as the * profiling source. */ if (--profprocs == 0 && stathz != 0) psdiv = 1; } } void schedclock(struct lwp *l) { if ((l->l_flag & LW_IDLE) != 0) return; sched_schedclock(l); } /* * Statistics clock. Grab profile sample, and if divider reaches 0, * do process and kernel statistics. */ void statclock(struct clockframe *frame) { #ifdef GPROF struct gmonparam *g; intptr_t i; #endif struct cpu_info *ci = curcpu(); struct schedstate_percpu *spc = &ci->ci_schedstate; struct proc *p; struct lwp *l; if (stathz) clockrnd_sample(&statclockrnd); /* * Notice changes in divisor frequency, and adjust clock * frequency accordingly. 
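 * (psdiv flips between 1 and psratio as profiling is started and stopped by startprofclock()/stopprofclock() above.)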
*/ if (spc->spc_psdiv != psdiv) { spc->spc_psdiv = psdiv; spc->spc_pscnt = psdiv; if (psdiv == 1) { setstatclockrate(stathz); } else { setstatclockrate(profhz); } } l = ci->ci_onproc; if ((l->l_flag & LW_IDLE) != 0) { /* * don't account idle lwps as swapper. */ p = NULL; } else { p = l->l_proc; mutex_spin_enter(&p->p_stmutex); } if (CLKF_USERMODE(frame)) { KASSERT(p != NULL); if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK) addupc_intr(l, CLKF_PC(frame)); if (--spc->spc_pscnt > 0) { mutex_spin_exit(&p->p_stmutex); return; } /* * Came from user mode; CPU was in user state. * If this process is being profiled record the tick. */ p->p_uticks++; if (p->p_nice > NZERO) spc->spc_cp_time[CP_NICE]++; else spc->spc_cp_time[CP_USER]++; } else { #ifdef GPROF /* * Kernel statistics are just like addupc_intr, only easier. */ #if defined(MULTIPROCESSOR) && !defined(_RUMPKERNEL) g = curcpu()->ci_gmon; if (g != NULL && profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) { #else g = &_gmonparam; if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) { #endif i = CLKF_PC(frame) - g->lowpc; if (i < g->textsize) { i /= HISTFRACTION * sizeof(*g->kcount); g->kcount[i]++; } } #endif #ifdef LWP_PC if (p != NULL && profsrc == PROFSRC_CLOCK && (p->p_stflag & PST_PROFIL)) { addupc_intr(l, LWP_PC(l)); } #endif if (--spc->spc_pscnt > 0) { if (p != NULL) mutex_spin_exit(&p->p_stmutex); return; } /* * Came from kernel mode, so we were: * - handling an interrupt, * - doing syscall or trap work on behalf of the current * user process, or * - spinning in the idle loop. * Whichever it is, charge the time as appropriate. * Note that we charge interrupts to the current process, * regardless of whether they are ``for'' that process, * so that we know how much of its real time was spent * in ``non-process'' (i.e., interrupt) work. */ if (CLKF_INTR(frame) || (curlwp->l_pflag & LP_INTR) != 0) { if (p != NULL) { p->p_iticks++; } spc->spc_cp_time[CP_INTR]++; } else if (p != NULL) { p->p_sticks++; spc->spc_cp_time[CP_SYS]++; } else { spc->spc_cp_time[CP_IDLE]++; } } spc->spc_pscnt = psdiv; if (p != NULL) { atomic_inc_uint(&l->l_cpticks); mutex_spin_exit(&p->p_stmutex); } #ifdef KDTRACE_HOOKS cyclic_clock_func_t func = cyclic_clock_func[cpu_index(ci)]; if (func) { (*func)((struct clockframe *)frame); } #endif } /* * sysctl helper routine for kern.clockrate. Assembles a struct on * the fly to be returned to the caller. */ static int sysctl_kern_clockrate(SYSCTLFN_ARGS) { struct clockinfo clkinfo; struct sysctlnode node; clkinfo.tick = tick; clkinfo.tickadj = tickadj; clkinfo.hz = hz; clkinfo.profhz = profhz; clkinfo.stathz = stathz ? stathz : hz; node = *rnode; node.sysctl_data = &clkinfo; return (sysctl_lookup(SYSCTLFN_CALL(&node))); }
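/*
 * Illustrative sketch, not part of the original source: the divider
 * arithmetic used by initclocks() and statclock() above.  The rates are
 * assumed example values; the real ones are machine-dependent.
 */
#if 0	/* example only */
static int
example_stat_divider(bool profiling)
{
	int stathz_ex = 128, profhz_ex = 1024;

	/* profhz is required to be an integral multiple of stathz. */
	int psratio_ex = profhz_ex / stathz_ex;		/* 8 */

	/*
	 * While any process is profiled the clock runs at profhz and only
	 * every psratio-th tick is charged to the statistics counters;
	 * otherwise the divider is 1 and every tick counts.
	 */
	return profiling ? psratio_ex : 1;
}
#endif	/* example only */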
/* $NetBSD: scope6.c,v 1.23 2020/06/16 17:12:18 maxv Exp $ */ /* $KAME$ */ /* * Copyright (C) 2000 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: scope6.c,v 1.23 2020/06/16 17:12:18 maxv Exp $"); #include <sys/param.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/systm.h> #include <sys/queue.h> #include <sys/syslog.h> #include <net/if.h> #include <netinet/in.h> #include <netinet6/in6_var.h> #include <netinet6/scope6_var.h> #ifdef ENABLE_DEFAULT_SCOPE int ip6_use_defzone = 1; #else int ip6_use_defzone = 0; #endif static struct scope6_id sid_default; #define SID(ifp) \ ((ifp)->if_afdata[AF_INET6] == NULL ? NULL : \ ((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->scope6_id) void scope6_init(void) { memset(&sid_default, 0, sizeof(sid_default)); } struct scope6_id * scope6_ifattach(struct ifnet *ifp) { struct scope6_id *sid; sid = malloc(sizeof(*sid), M_IFADDR, M_WAITOK | M_ZERO); /* * XXX: IPV6_ADDR_SCOPE_xxx macros are not standard. * Should we rather hardcode here? */ sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index; sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index; #ifdef MULTI_SCOPE /* by default, we don't care about scope boundary for these scopes. */ sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL] = 1; sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL] = 1; #endif return sid; } void scope6_ifdetach(struct scope6_id *sid) { free(sid, M_IFADDR); } /* * Get a scope of the address. Interface-local, link-local, site-local * or global. */ int in6_addrscope(const struct in6_addr *addr) { int scope; if (addr->s6_addr[0] == 0xfe) { scope = addr->s6_addr[1] & 0xc0; switch (scope) { case 0x80: return IPV6_ADDR_SCOPE_LINKLOCAL; case 0xc0: return IPV6_ADDR_SCOPE_SITELOCAL; default: return IPV6_ADDR_SCOPE_GLOBAL; /* just in case */ } } if (addr->s6_addr[0] == 0xff) { scope = addr->s6_addr[1] & 0x0f; /* * due to other scope such as reserved, * return scope doesn't work. */ switch (scope) { case IPV6_ADDR_SCOPE_INTFACELOCAL: return IPV6_ADDR_SCOPE_INTFACELOCAL; case IPV6_ADDR_SCOPE_LINKLOCAL: return IPV6_ADDR_SCOPE_LINKLOCAL; case IPV6_ADDR_SCOPE_SITELOCAL: return IPV6_ADDR_SCOPE_SITELOCAL; default: return IPV6_ADDR_SCOPE_GLOBAL; } } if (memcmp(&in6addr_loopback, addr, sizeof(*addr) - 1) == 0) { if (addr->s6_addr[15] == 1) /* loopback */ return IPV6_ADDR_SCOPE_LINKLOCAL; if (addr->s6_addr[15] == 0) { /* * Regard the unspecified addresses as global, * since it has no ambiguity. * XXX: not sure if it's correct... */ return IPV6_ADDR_SCOPE_GLOBAL; } } return IPV6_ADDR_SCOPE_GLOBAL; } uint32_t scope6_addr2default(const struct in6_addr *addr) { uint32_t id; /* * special case: The loopback address should be considered as * link-local, but there's no ambiguity in the syntax. */ if (IN6_IS_ADDR_LOOPBACK(addr)) return 0; /* * XXX: 32-bit read is atomic on all our platforms, is it OK * not to lock here? */ id = sid_default.s6id_list[in6_addrscope(addr)]; return id; } /* * Validate the specified scope zone ID in the sin6_scope_id field. If the ID * is unspecified (=0), needs to be specified, and the default zone ID can be * used, the default value will be used. * This routine then generates the kernel-internal form: if the address scope * of is interface-local or link-local, embed the interface index in the * address. 
*/ int sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok) { struct ifnet *ifp; uint32_t zoneid; if ((zoneid = sin6->sin6_scope_id) == 0 && defaultok) zoneid = scope6_addr2default(&sin6->sin6_addr); if (zoneid != 0 && (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))) { int s; /* * At this moment, we only check interface-local and * link-local scope IDs, and use interface indices as the * zone IDs assuming a one-to-one mapping between interfaces * and links. */ s = pserialize_read_enter(); ifp = if_byindex(zoneid); if (ifp == NULL) { pserialize_read_exit(s); return ENXIO; } pserialize_read_exit(s); /* XXX assignment to 16bit from 32bit variable */ sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff); sin6->sin6_scope_id = 0; } return 0; } struct sockaddr * sockaddr_in6_externalize(struct sockaddr *dst, socklen_t socklen, const struct sockaddr *src) { struct sockaddr_in6 *sin6; sin6 = satosin6(sockaddr_copy(dst, socklen, src)); if (sin6 == NULL || sa6_recoverscope(sin6) != 0) return NULL; return dst; } /* * generate standard sockaddr_in6 from embedded form. */ int sa6_recoverscope(struct sockaddr_in6 *sin6) { uint32_t zoneid; char ip6buf[INET6_ADDRSTRLEN]; if (sin6->sin6_scope_id != 0) { log(LOG_NOTICE, "%s: assumption failure (non 0 ID): %s%%%d\n", __func__, IN6_PRINT(ip6buf, &sin6->sin6_addr), sin6->sin6_scope_id); /* XXX: proceed anyway... */ } if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr)) { /* * KAME assumption: link id == interface id */ zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]); if (zoneid) { int s = pserialize_read_enter(); if (!if_byindex(zoneid)) { pserialize_read_exit(s); return ENXIO; } pserialize_read_exit(s); sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = zoneid; } } return 0; } int in6_setzoneid(struct in6_addr *in6, uint32_t zoneid) { if (IN6_IS_SCOPE_EMBEDDABLE(in6)) in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */ return 0; } /* * Determine the appropriate scope zone ID for in6 and ifp. If ret_id is * non NULL, it is set to the zone ID. If the zone ID needs to be embedded * in the in6_addr structure, in6 will be modified. */ int in6_setscope(struct in6_addr *in6, const struct ifnet *ifp, uint32_t *ret_id) { int scope; uint32_t zoneid = 0; const struct scope6_id *sid = SID(ifp); if (sid == NULL) { log(LOG_NOTICE, "%s: no scope id for %s\n", __func__, if_name(ifp)); return EINVAL; } /* * special case: the loopback address can only belong to a loopback * interface. */ if (IN6_IS_ADDR_LOOPBACK(in6)) { if (!(ifp->if_flags & IFF_LOOPBACK)) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_NOTICE, "%s: can't set scope for not loopback " "interface %s and loopback address %s\n", __func__, if_name(ifp), IN6_PRINT(ip6buf, in6)); return EINVAL; } else { if (ret_id != NULL) *ret_id = 0; /* there's no ambiguity */ return 0; } } scope = in6_addrscope(in6); switch (scope) { case IPV6_ADDR_SCOPE_INTFACELOCAL: /* should be interface index */ zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL]; break; case IPV6_ADDR_SCOPE_LINKLOCAL: zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL]; break; case IPV6_ADDR_SCOPE_SITELOCAL: zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL]; break; case IPV6_ADDR_SCOPE_ORGLOCAL: zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL]; break; default: zoneid = 0; /* XXX: treat as global. 
*/ break; } if (ret_id != NULL) *ret_id = zoneid; return in6_setzoneid(in6, zoneid); } const char * in6_getscopename(const struct in6_addr *addr) { switch (in6_addrscope(addr)) { case IPV6_ADDR_SCOPE_INTFACELOCAL: return "interface"; #if IPV6_ADDR_SCOPE_INTFACELOCAL != IPV6_ADDR_SCOPE_NODELOCAL case IPV6_ADDR_SCOPE_NODELOCAL: return "node"; #endif case IPV6_ADDR_SCOPE_LINKLOCAL: return "link"; case IPV6_ADDR_SCOPE_SITELOCAL: return "site"; case IPV6_ADDR_SCOPE_ORGLOCAL: return "organization"; case IPV6_ADDR_SCOPE_GLOBAL: return "global"; default: return "unknown"; } } /* * Just clear the embedded scope identifier. Return 0 if the original address * is intact; return non 0 if the address is modified. */ int in6_clearscope(struct in6_addr *in6) { int modified = 0; if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) { if (in6->s6_addr16[1] != 0) modified = 1; in6->s6_addr16[1] = 0; } return modified; }
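The scope-embedding convention used throughout scope6.c (sa6_embedscope()/sa6_recoverscope(), in6_setzoneid(), in6_clearscope()) stores the zone identifier of a link-local or interface-local address in bytes 2-3 of the address itself while it travels inside the kernel, and moves it back into sin6_scope_id before the address is handed to userland. A minimal userland sketch of that byte manipulation follows, assuming a hypothetical interface index of 2; it is illustrative only and does not call any of the kernel routines above.

/*
 * Illustrative userland sketch (not part of scope6.c): the KAME-style
 * embedded form produced by sa6_embedscope() and undone by
 * sa6_recoverscope().  The interface index 2 is hypothetical.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
	struct in6_addr a;
	char buf[INET6_ADDRSTRLEN];
	uint16_t zone = htons(2);	/* hypothetical if_index, network order */

	inet_pton(AF_INET6, "fe80::1", &a);

	/* Kernel-internal form: zone id embedded in bytes 2-3 of the address. */
	memcpy(&a.s6_addr[2], &zone, sizeof(zone));
	printf("internal: %s\n", inet_ntop(AF_INET6, &a, buf, sizeof(buf)));
	/* prints "internal: fe80:2::1" */

	/* External form: clear the embedded id and carry it in sin6_scope_id. */
	memset(&a.s6_addr[2], 0, sizeof(zone));
	printf("external: %s%%%u\n",
	    inet_ntop(AF_INET6, &a, buf, sizeof(buf)), ntohs(zone));
	/* prints "external: fe80::1%2" */
	return 0;
}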
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 /* $NetBSD: usbdivar.h,v 1.138 2024/02/04 05:43:06 mrg Exp $ */ /* * Copyright (c) 1998, 2012 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology and Matthew R. Green (mrg@eterna23.net). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _DEV_USB_USBDIVAR_H_ #define _DEV_USB_USBDIVAR_H_ /* * Discussion about locking in the USB code: * * The host controller presents one lock at IPL_SOFTUSB (aka IPL_SOFTNET). 
* * List of hardware interface methods, and whether the lock is held * when each is called by this module: * * BUS METHOD LOCK NOTES * ----------------------- ------- ------------------------- * ubm_open - might want to take lock? * ubm_softint x may release/reacquire lock * ubm_dopoll - might want to take lock? * ubm_allocx - * ubm_freex - * ubm_abortx x must not release/reacquire lock * ubm_getlock - Called at attach time * ubm_newdev - Will take lock * ubm_rhctrl - * * PIPE METHOD LOCK NOTES * ----------------------- ------- ------------------------- * upm_init - * upm_fini - * upm_transfer x * upm_start x * upm_abort x * upm_close x * upm_cleartoggle - * upm_done x * * The above semantics are likely to change. Little performance * evaluation has been done on this code and the locking strategy. * * USB functions known to expect the lock taken include (this list is * probably not exhaustive): * usb_transfer_complete() * usb_start_next() * */ #include <sys/callout.h> #include <sys/mutex.h> #include <sys/bus.h> /* From usb_mem.h */ struct usb_dma_block; typedef struct { struct usb_dma_block *udma_block; u_int udma_offs; } usb_dma_t; struct usbd_xfer; struct usbd_pipe; struct usbd_port; struct usbd_endpoint { usb_endpoint_descriptor_t *ue_edesc; int ue_refcnt; int ue_toggle; }; struct usbd_bus_methods { usbd_status (*ubm_open)(struct usbd_pipe *); void (*ubm_softint)(void *); void (*ubm_dopoll)(struct usbd_bus *); struct usbd_xfer *(*ubm_allocx)(struct usbd_bus *, unsigned int); void (*ubm_freex)(struct usbd_bus *, struct usbd_xfer *); void (*ubm_abortx)(struct usbd_xfer *); bool (*ubm_dying)(struct usbd_bus *); void (*ubm_getlock)(struct usbd_bus *, kmutex_t **); usbd_status (*ubm_newdev)(device_t, struct usbd_bus *, int, int, int, struct usbd_port *); int (*ubm_rhctrl)(struct usbd_bus *, usb_device_request_t *, void *, int); }; struct usbd_pipe_methods { int (*upm_init)(struct usbd_xfer *); void (*upm_fini)(struct usbd_xfer *); usbd_status (*upm_transfer)(struct usbd_xfer *); usbd_status (*upm_start)(struct usbd_xfer *); void (*upm_abort)(struct usbd_xfer *); void (*upm_close)(struct usbd_pipe *); void (*upm_cleartoggle)(struct usbd_pipe *); void (*upm_done)(struct usbd_xfer *); }; struct usbd_tt { struct usbd_hub *utt_hub; }; struct usbd_port { usb_port_status_t up_status; uint16_t up_power; /* mA of current on port */ uint8_t up_portno; uint8_t up_restartcnt; #define USBD_RESTART_MAX 5 uint8_t up_reattach; struct usbd_device *up_dev; /* Connected device */ struct usbd_device *up_parent; /* The ports hub */ struct usbd_tt *up_tt; /* Transaction translator (if any) */ }; struct usbd_hub { usbd_status (*uh_explore)(struct usbd_device *hub); void *uh_hubsoftc; usb_hub_descriptor_t uh_hubdesc; struct usbd_port uh_ports[1]; }; /*****/ /* 0, root, and 1->127 */ #define USB_ROOTHUB_INDEX 1 #define USB_TOTAL_DEVICES (USB_MAX_DEVICES + 1) struct usbd_bus { /* Filled by HC driver */ void *ub_hcpriv; int ub_revision; /* USB revision */ #define USBREV_UNKNOWN 0 #define USBREV_PRE_1_0 1 #define USBREV_1_0 2 #define USBREV_1_1 3 #define USBREV_2_0 4 #define USBREV_3_0 5 #define USBREV_3_1 6 #define USBREV_STR { "unknown", "pre 1.0", "1.0", "1.1", "2.0", "3.0", "3.1" } int ub_hctype; #define USBHCTYPE_UNKNOWN 0 #define USBHCTYPE_MOTG 1 #define USBHCTYPE_OHCI 2 #define USBHCTYPE_UHCI 3 #define USBHCTYPE_EHCI 4 #define USBHCTYPE_XHCI 5 #define USBHCTYPE_VHCI 6 int ub_busnum; const struct usbd_bus_methods *ub_methods; uint32_t ub_pipesize; /* size of a pipe struct */ bool ub_usedma; /* Does this HC 
support DMA */ int ub_dmaflags; bus_dma_tag_t ub_dmatag; /* DMA tag */ /* Filled by usb driver */ kmutex_t *ub_lock; struct usbd_device *ub_roothub; struct usbd_xfer *ub_rhxfer; /* roothub xfer in progress */ kcondvar_t ub_rhxfercv; uint8_t ub_rhaddr; /* roothub address */ uint8_t ub_rhconf; /* roothub configuration */ struct usbd_device *ub_devices[USB_TOTAL_DEVICES]; kcondvar_t ub_needsexplore_cv; char ub_needsexplore;/* a hub a signalled a change */ char ub_usepolling; device_t ub_usbctl; struct usb_device_stats ub_stats; void *ub_soft; /* soft interrupt cookie */ }; struct usbd_device { struct usbd_bus *ud_bus; /* our controller */ struct usbd_pipe *ud_pipe0; /* pipe 0 */ uint8_t ud_addr; /* device address */ uint8_t ud_config; /* current configuration # */ uint8_t ud_depth; /* distance from root hub */ uint8_t ud_speed; /* low/full/high speed */ uint8_t ud_selfpowered; /* flag for self powered */ uint16_t ud_power; /* mA the device uses */ int16_t ud_langid; /* language for strings */ #define USBD_NOLANG (-1) usb_event_cookie_t ud_cookie; /* unique connection id */ struct usbd_port *ud_powersrc; /* upstream hub port, or 0 */ struct usbd_device *ud_myhub; /* upstream hub */ struct usbd_port *ud_myhsport; /* closest high speed port */ struct usbd_endpoint ud_ep0; /* for pipe 0 */ usb_endpoint_descriptor_t ud_ep0desc; /* for pipe 0 */ struct usbd_interface *ud_ifaces; /* array of all interfaces */ usb_device_descriptor_t ud_ddesc; /* device descriptor */ usb_config_descriptor_t *ud_cdesc; /* full config descr */ usb_bos_descriptor_t *ud_bdesc; /* full BOS descr */ const struct usbd_quirks *ud_quirks; /* device quirks, always set */ struct usbd_hub *ud_hub; /* only if this is a hub */ u_int ud_subdevlen; /* array length of following */ device_t *ud_subdevs; /* sub-devices */ int ud_nifaces_claimed; /* number of ifaces in use */ void *ud_hcpriv; char *ud_serial; /* serial number, can be NULL */ char *ud_vendor; /* vendor string, can be NULL */ char *ud_product; /* product string can be NULL */ }; struct usbd_interface { struct usbd_device *ui_dev; usb_interface_descriptor_t *ui_idesc; int ui_index; int ui_altindex; struct usbd_endpoint *ui_endpoints; int64_t ui_busy; /* #pipes, or -1 if setting */ }; struct usbd_pipe { struct usbd_interface *up_iface; struct usbd_device *up_dev; struct usbd_endpoint *up_endpoint; char up_running; char up_aborting; bool up_serialise; SIMPLEQ_HEAD(, usbd_xfer) up_queue; struct usb_task up_async_task; struct usbd_xfer *up_intrxfer; /* used for repeating requests */ char up_repeat; int up_interval; uint8_t up_flags; struct usbd_xfer *up_callingxfer; /* currently in callback */ kcondvar_t up_callingcv; struct lwp *up_abortlwp; /* lwp currently aborting */ /* Filled by HC driver. 
*/ const struct usbd_pipe_methods *up_methods; }; struct usbd_xfer { struct usbd_pipe *ux_pipe; void *ux_priv; void *ux_buffer; kcondvar_t ux_cv; uint32_t ux_length; uint32_t ux_actlen; uint16_t ux_flags; uint32_t ux_timeout; usbd_status ux_status; usbd_callback ux_callback; volatile uint8_t ux_done; uint8_t ux_state; /* used for DIAGNOSTIC */ #define XFER_FREE 0x46 #define XFER_BUSY 0x55 #define XFER_ONQU 0x9e /* For control pipe */ usb_device_request_t ux_request; /* For isoc */ uint16_t *ux_frlengths; int ux_nframes; const struct usbd_pipe_methods *ux_methods; /* For memory allocation and softc */ struct usbd_bus *ux_bus; usb_dma_t ux_dmabuf; void *ux_buf; uint32_t ux_bufsize; uint8_t ux_rqflags; #define URQ_REQUEST 0x01 SIMPLEQ_ENTRY(usbd_xfer) ux_next; void *ux_hcpriv; /* private use by the HC driver */ struct usb_task ux_aborttask; struct callout ux_callout; /* * Protected by bus lock. * * - ux_timeout_set: The timeout is scheduled as a callout or * usb task, and has not yet acquired the bus lock. * * - ux_timeout_reset: The xfer completed, and was resubmitted * before the callout or task was able to acquire the bus * lock, so one or the other needs to schedule a new callout. */ bool ux_timeout_set; bool ux_timeout_reset; }; void usbd_init(void); void usbd_finish(void); #if defined(USB_DEBUG) void usbd_dump_iface(struct usbd_interface *); void usbd_dump_device(struct usbd_device *); void usbd_dump_endpoint(struct usbd_endpoint *); void usbd_dump_queue(struct usbd_pipe *); void usbd_dump_pipe(struct usbd_pipe *); #endif /* Routines from usb_subr.c */ int usbctlprint(void *, const char *); void usbd_get_device_strings(struct usbd_device *); void usb_delay_ms_locked(struct usbd_bus *, u_int, kmutex_t *); void usb_delay_ms(struct usbd_bus *, u_int); void usbd_delay_ms_locked(struct usbd_device *, u_int, kmutex_t *); void usbd_delay_ms(struct usbd_device *, u_int); usbd_status usbd_reset_port(struct usbd_device *, int, usb_port_status_t *); usbd_status usbd_setup_pipe(struct usbd_device *, struct usbd_interface *, struct usbd_endpoint *, int, struct usbd_pipe **); usbd_status usbd_setup_pipe_flags(struct usbd_device *, struct usbd_interface *, struct usbd_endpoint *, int, struct usbd_pipe **, uint8_t); usbd_status usbd_new_device(device_t, struct usbd_bus *, int, int, int, struct usbd_port *); usbd_status usbd_reattach_device(device_t, struct usbd_device *, int, const int *); void usbd_remove_device(struct usbd_device *, struct usbd_port *); bool usbd_iface_locked(struct usbd_interface *); usbd_status usbd_iface_lock(struct usbd_interface *); void usbd_iface_unlock(struct usbd_interface *); usbd_status usbd_iface_piperef(struct usbd_interface *); void usbd_iface_pipeunref(struct usbd_interface *); usbd_status usbd_fill_iface_data(struct usbd_device *, int, int); void usb_free_device(struct usbd_device *); void usb_transfer_complete(struct usbd_xfer *); int usb_disconnect_port(struct usbd_port *, device_t, int); usbd_status usbd_endpoint_acquire(struct usbd_device *, struct usbd_endpoint *, int); void usbd_endpoint_release(struct usbd_device *, struct usbd_endpoint *); void usbd_kill_pipe(struct usbd_pipe *); usbd_status usbd_attach_roothub(device_t, struct usbd_device *); usbd_status usbd_probe_and_attach(device_t, struct usbd_device *, int, int); /* Routines from usb.c */ void usb_needs_explore(struct usbd_device *); void usb_needs_reattach(struct usbd_device *); void usb_schedsoftintr(struct usbd_bus *); static __inline int usbd_xfer_isread(struct usbd_xfer *xfer) { if (xfer->ux_rqflags 
& URQ_REQUEST) return xfer->ux_request.bmRequestType & UT_READ; return xfer->ux_pipe->up_endpoint->ue_edesc->bEndpointAddress & UE_DIR_IN; } static __inline size_t usb_addr2dindex(int addr) { return USB_ROOTHUB_INDEX + addr; } /* * These macros reflect the current locking scheme. They might change. */ #define usbd_lock_pipe(p) mutex_enter((p)->up_dev->ud_bus->ub_lock) #define usbd_unlock_pipe(p) mutex_exit((p)->up_dev->ud_bus->ub_lock) #endif /* _DEV_USB_USBDIVAR_H_ */
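The usbd_bus_methods table defined in this header is how each host controller driver plugs into the usb layer: ubm_getlock hands the bus lock (the IPL_SOFTUSB lock from the locking discussion) to the usb code, and the other entries follow the lock-held/lock-free rules listed in the bus-method table. A hedged sketch of such a table follows, using hypothetical foohci_* functions that are not part of this header or of any driver in the tree.

/*
 * Illustrative sketch only (the foohci_* names are hypothetical): how a
 * host controller driver might wire up struct usbd_bus_methods, per the
 * locking rules tabulated above.  Omitted members default to NULL.
 */
static usbd_status	foohci_open(struct usbd_pipe *);
static void		foohci_softintr(void *);
static void		foohci_poll(struct usbd_bus *);
static struct usbd_xfer *foohci_allocx(struct usbd_bus *, unsigned int);
static void		foohci_freex(struct usbd_bus *, struct usbd_xfer *);
static void		foohci_abortx(struct usbd_xfer *);
static bool		foohci_dying(struct usbd_bus *);
static void		foohci_get_lock(struct usbd_bus *, kmutex_t **);
static int		foohci_roothub_ctrl(struct usbd_bus *,
			    usb_device_request_t *, void *, int);

static const struct usbd_bus_methods foohci_bus_methods = {
	.ubm_open	= foohci_open,		/* lock not held by caller */
	.ubm_softint	= foohci_softintr,	/* lock held, may drop/retake */
	.ubm_dopoll	= foohci_poll,
	.ubm_allocx	= foohci_allocx,
	.ubm_freex	= foohci_freex,
	.ubm_abortx	= foohci_abortx,	/* lock held, must not drop */
	.ubm_dying	= foohci_dying,
	.ubm_getlock	= foohci_get_lock,	/* exports the IPL_SOFTUSB lock */
	.ubm_rhctrl	= foohci_roothub_ctrl,
};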
1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 /* $NetBSD: link_proto.c,v 1.40 2021/12/31 14:25:24 riastradh Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_proto.c 8.2 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: link_proto.c,v 1.40 2021/12/31 14:25:24 riastradh Exp $"); #include <sys/param.h> #include <sys/socket.h> #include <sys/protosw.h> #include <sys/domain.h> #include <sys/mbuf.h> #include <sys/un.h> #include <sys/socketvar.h> #include <net/if.h> #include <net/if_dl.h> #include <net/raw_cb.h> #include <net/route.h> static int sockaddr_dl_cmp(const struct sockaddr *, const struct sockaddr *); static int link_attach(struct socket *, int); static void link_detach(struct socket *); static int link_accept(struct socket *, struct sockaddr *); static int link_bind(struct socket *, struct sockaddr *, struct lwp *); static int link_listen(struct socket *, struct lwp *); static int link_connect(struct socket *, struct sockaddr *, struct lwp *); static int link_connect2(struct socket *, struct socket *); static int link_disconnect(struct socket *); static int link_shutdown(struct socket *); static int link_abort(struct socket *); static int link_ioctl(struct socket *, u_long, void *, struct ifnet *); static int link_stat(struct socket *, struct stat *); static int link_peeraddr(struct socket *, struct sockaddr *); static int link_sockaddr(struct socket *, struct sockaddr *); static int link_rcvd(struct socket *, int, struct lwp *); static int link_recvoob(struct socket *, struct mbuf *, int); static int link_send(struct socket *, struct mbuf *, struct sockaddr *, struct mbuf *, struct lwp *); static int link_sendoob(struct socket *, struct mbuf *, struct mbuf *); static int link_purgeif(struct socket *, struct ifnet *); static void link_init(void); /* * Definitions of protocols supported in the link-layer domain. 
*/ DOMAIN_DEFINE(linkdomain); /* forward define and add to link set */ static const struct pr_usrreqs link_usrreqs = { .pr_attach = link_attach, .pr_detach = link_detach, .pr_accept = link_accept, .pr_bind = link_bind, .pr_listen = link_listen, .pr_connect = link_connect, .pr_connect2 = link_connect2, .pr_disconnect = link_disconnect, .pr_shutdown = link_shutdown, .pr_abort = link_abort, .pr_ioctl = link_ioctl, .pr_stat = link_stat, .pr_peeraddr = link_peeraddr, .pr_sockaddr = link_sockaddr, .pr_rcvd = link_rcvd, .pr_recvoob = link_recvoob, .pr_send = link_send, .pr_sendoob = link_sendoob, .pr_purgeif = link_purgeif, }; const struct protosw linksw[] = { { .pr_type = SOCK_DGRAM, .pr_domain = &linkdomain, .pr_protocol = 0, /* XXX */ .pr_flags = PR_ATOMIC|PR_ADDR|PR_PURGEIF, .pr_input = NULL, .pr_ctlinput = NULL, .pr_ctloutput = NULL, .pr_usrreqs = &link_usrreqs, .pr_init = link_init, }, }; struct domain linkdomain = { .dom_family = AF_LINK, .dom_name = "link", .dom_externalize = NULL, .dom_dispose = NULL, .dom_protosw = linksw, .dom_protoswNPROTOSW = &linksw[__arraycount(linksw)], .dom_sockaddr_cmp = sockaddr_dl_cmp }; static void link_init(void) { return; } static int link_control(struct socket *so, unsigned long cmd, void *data, struct ifnet *ifp) { int error, s; bool isactive, mkactive; struct if_laddrreq *iflr; union { struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_storage ss; } u; struct ifaddr *ifa; const struct sockaddr_dl *asdl, *nsdl; struct psref psref; switch (cmd) { case SIOCALIFADDR: case SIOCDLIFADDR: case SIOCGLIFADDR: iflr = data; if (iflr->addr.ss_family != AF_LINK) return EINVAL; asdl = satocsdl(sstocsa(&iflr->addr)); if (asdl->sdl_alen != ifp->if_addrlen) return EINVAL; if (sockaddr_dl_init(&u.sdl, sizeof(u.ss), ifp->if_index, ifp->if_type, ifp->if_xname, strlen(ifp->if_xname), CLLADDR(asdl), asdl->sdl_alen) == NULL) return EINVAL; if ((iflr->flags & IFLR_PREFIX) == 0) ; else if (iflr->prefixlen != NBBY * ifp->if_addrlen) return EINVAL; /* XXX match with prefix */ error = 0; s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (sockaddr_cmp(&u.sa, ifa->ifa_addr) == 0) { ifa_acquire(ifa, &psref); break; } } pserialize_read_exit(s); switch (cmd) { case SIOCGLIFADDR: ifa_release(ifa, &psref); s = pserialize_read_enter(); if ((iflr->flags & IFLR_PREFIX) == 0) { IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family == AF_LINK) break; } } if (ifa == NULL) { pserialize_read_exit(s); error = EADDRNOTAVAIL; break; } if (ifa == ifp->if_dl) iflr->flags = IFLR_ACTIVE; else iflr->flags = 0; if (ifa == ifp->if_hwdl) iflr->flags |= IFLR_FACTORY; sockaddr_copy(sstosa(&iflr->addr), sizeof(iflr->addr), ifa->ifa_addr); pserialize_read_exit(s); ifa = NULL; break; case SIOCDLIFADDR: if (ifa == NULL) error = EADDRNOTAVAIL; else if (ifa == ifp->if_dl || ifa == ifp->if_hwdl) error = EBUSY; else { /* TBD routing socket */ rt_addrmsg(RTM_DELETE, ifa); /* We need to release psref for ifa_remove */ ifaref(ifa); ifa_release(ifa, &psref); ifa_remove(ifp, ifa); KASSERTMSG(ifa->ifa_refcnt == 1, "ifa_refcnt=%d", ifa->ifa_refcnt); ifafree(ifa); ifa = NULL; } break; case SIOCALIFADDR: if (ifa == NULL) { ifa = if_dl_create(ifp, &nsdl); if (ifa == NULL) { error = ENOMEM; break; } ifa_acquire(ifa, &psref); sockaddr_copy(ifa->ifa_addr, ifa->ifa_addr->sa_len, &u.sa); ifa_insert(ifp, ifa); rt_addrmsg(RTM_ADD, ifa); } mkactive = (iflr->flags & IFLR_ACTIVE) != 0; isactive = (ifa == ifp->if_dl); if (!isactive && mkactive) { if_activate_sadl(ifp, ifa, nsdl); rt_addrmsg(RTM_CHANGE, 
ifa); error = ENETRESET; } break; } ifa_release(ifa, &psref); if (error != ENETRESET) return error; else if ((ifp->if_flags & IFF_RUNNING) != 0 && ifp->if_init != NULL) return if_init(ifp); else return 0; default: return ENOTTY; } } static int link_attach(struct socket *so, int proto) { sosetlock(so); KASSERT(solocked(so)); return 0; } static void link_detach(struct socket *so) { KASSERT(solocked(so)); sofree(so); } static int link_accept(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_listen(struct socket *so, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_connect2(struct socket *so, struct socket *so2) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_disconnect(struct socket *so) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_shutdown(struct socket *so) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_abort(struct socket *so) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp) { return link_control(so, cmd, nam, ifp); } static int link_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); return 0; } static int link_peeraddr(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_sockaddr(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_rcvd(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int link_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int link_purgeif(struct socket *so, struct ifnet *ifp) { return EOPNOTSUPP; } /* Compare the field at byte offsets [fieldstart, fieldend) in * two memory regions, [l, l + llen) and [r, r + llen). */ static inline int submemcmp(const void *l, const void *r, const uint_fast8_t llen, const uint_fast8_t rlen, const uint_fast8_t fieldstart, const uint_fast8_t fieldend) { uint_fast8_t cmpend, minlen; const uint8_t *lb = l, *rb = r; int rc; minlen = MIN(llen, rlen); /* The field is missing from one region. The shorter region is the * lesser region. */ if (fieldstart >= minlen) return llen - rlen; /* Two empty, present fields are always equal. */ if (fieldstart > fieldend) return 0; cmpend = MIN(fieldend, minlen); rc = memcmp(&lb[fieldstart], &rb[fieldstart], cmpend - fieldstart); if (rc != 0) return rc; /* If one or both fields are truncated, then the shorter is the lesser * field. */ if (minlen < fieldend) return llen - rlen; /* Fields are full-length and equal. The fields are equal. 
*/ return 0; } uint8_t sockaddr_dl_measure(uint8_t namelen, uint8_t addrlen) { return offsetof(struct sockaddr_dl, sdl_data[namelen + addrlen]); } struct sockaddr * sockaddr_dl_alloc(uint16_t ifindex, uint8_t type, const void *name, uint8_t namelen, const void *addr, uint8_t addrlen, int flags) { struct sockaddr *sa; socklen_t len; len = sockaddr_dl_measure(namelen, addrlen); sa = sockaddr_alloc(AF_LINK, len, flags); if (sa == NULL) return NULL; if (sockaddr_dl_init(satosdl(sa), len, ifindex, type, name, namelen, addr, addrlen) == NULL) { sockaddr_free(sa); return NULL; } return sa; } struct sockaddr_dl * sockaddr_dl_init(struct sockaddr_dl *sdl, socklen_t socklen, uint16_t ifindex, uint8_t type, const void *name, uint8_t namelen, const void *addr, uint8_t addrlen) { socklen_t len; sdl->sdl_family = AF_LINK; sdl->sdl_slen = 0; len = sockaddr_dl_measure(namelen, addrlen); if (len > socklen) { sdl->sdl_len = socklen; #ifdef DIAGNOSTIC printf("%s: too long: %u > %u\n", __func__, (u_int)len, (u_int)socklen); #endif return NULL; } sdl->sdl_len = len; sdl->sdl_index = ifindex; sdl->sdl_type = type; memset(&sdl->sdl_data[0], 0, namelen + addrlen); if (name != NULL) { memcpy(&sdl->sdl_data[0], name, namelen); sdl->sdl_nlen = namelen; } else sdl->sdl_nlen = 0; if (addr != NULL) { memcpy(&sdl->sdl_data[sdl->sdl_nlen], addr, addrlen); sdl->sdl_alen = addrlen; } else sdl->sdl_alen = 0; return sdl; } static int sockaddr_dl_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2) { int rc; const uint_fast8_t indexofs = offsetof(struct sockaddr_dl, sdl_index); const uint_fast8_t nlenofs = offsetof(struct sockaddr_dl, sdl_nlen); uint_fast8_t dataofs = offsetof(struct sockaddr_dl, sdl_data[0]); const struct sockaddr_dl *sdl1, *sdl2; sdl1 = satocsdl(sa1); sdl2 = satocsdl(sa2); rc = submemcmp(sdl1, sdl2, sdl1->sdl_len, sdl2->sdl_len, indexofs, nlenofs); if (rc != 0) return rc; rc = submemcmp(sdl1, sdl2, sdl1->sdl_len, sdl2->sdl_len, dataofs, dataofs + MIN(sdl1->sdl_nlen, sdl2->sdl_nlen)); if (rc != 0) return rc; if (sdl1->sdl_nlen != sdl2->sdl_nlen) return sdl1->sdl_nlen - sdl2->sdl_nlen; dataofs += sdl1->sdl_nlen; rc = submemcmp(sdl1, sdl2, sdl1->sdl_len, sdl2->sdl_len, dataofs, dataofs + MIN(sdl1->sdl_alen, sdl2->sdl_alen)); if (rc != 0) return rc; if (sdl1->sdl_alen != sdl2->sdl_alen) return sdl1->sdl_alen - sdl2->sdl_alen; dataofs += sdl1->sdl_alen; rc = submemcmp(sdl1, sdl2, sdl1->sdl_len, sdl2->sdl_len, dataofs, dataofs + MIN(sdl1->sdl_slen, sdl2->sdl_slen)); if (sdl1->sdl_slen != sdl2->sdl_slen) return sdl1->sdl_slen - sdl2->sdl_slen; return sdl1->sdl_len - sdl2->sdl_len; } struct sockaddr_dl * sockaddr_dl_setaddr(struct sockaddr_dl *sdl, socklen_t socklen, const void *addr, uint8_t addrlen) { socklen_t len; len = sockaddr_dl_measure(sdl->sdl_nlen, addrlen); if (len > socklen) { #ifdef DIAGNOSTIC printf("%s: too long: %u > %u\n", __func__, (u_int)len, (u_int)socklen); #endif return NULL; } memcpy(&sdl->sdl_data[sdl->sdl_nlen], addr, addrlen); sdl->sdl_alen = addrlen; sdl->sdl_len = len; return sdl; }
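sockaddr_dl_init() above lays out an AF_LINK address as the interface name followed by the link-level address inside sdl_data[], and returns NULL (after clamping sdl_len) when the measured length does not fit the caller's buffer. A hedged sketch of typical use follows, with a hypothetical interface "foo0" (index 3) and a made-up 6-byte MAC; IFT_ETHER comes from <net/if_types.h>.

/*
 * Illustrative sketch (not part of link_proto.c): building an AF_LINK
 * sockaddr for a hypothetical interface "foo0".  The index 3 and the
 * MAC are made up; sockaddr_dl_cmp() orders such addresses by index,
 * then name, then link-level address.
 */
#include <sys/socket.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <errno.h>

static int
example_make_sdl(struct sockaddr_dl *sdl, socklen_t socklen)
{
	static const uint8_t mac[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };

	if (sockaddr_dl_init(sdl, socklen, 3 /* if_index */, IFT_ETHER,
	    "foo0", 4, mac, sizeof(mac)) == NULL)
		return EINVAL;	/* name + address exceed socklen */
	return 0;
}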
16 6 77 38 502 25 39 4 331 23 132 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 /* $NetBSD: ktrace.h,v 1.69 2024/05/10 09:30:47 rillig Exp $ */ /* * Copyright (c) 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ktrace.h 8.2 (Berkeley) 2/19/95 */ #ifndef _SYS_KTRACE_H_ #define _SYS_KTRACE_H_ #include <sys/mutex.h> #include <sys/lwp.h> #include <sys/signal.h> #include <sys/time.h> #include <sys/uio.h> /* * operations to ktrace system call (KTROP(op)) */ #define KTROP_SET 0 /* set trace points */ #define KTROP_CLEAR 1 /* clear trace points */ #define KTROP_CLEARFILE 2 /* stop all tracing to file */ #define KTROP_MASK 0x3 #define KTROP(o) ((o)&KTROP_MASK) /* macro to extract operation */ /* * flags (ORed in with operation) */ #define KTRFLAG_DESCEND 4 /* perform op on all children too */ /* * ktrace record header */ struct ktr_header { int ktr_len; /* length of record minus length of old header */ #if BYTE_ORDER == LITTLE_ENDIAN short ktr_type; /* trace record type */ short ktr_version; /* trace record version */ #else short ktr_version; /* trace record version */ short ktr_type; /* trace record type */ #endif pid_t ktr_pid; /* process id */ char ktr_comm[MAXCOMLEN+1]; /* command name */ union { struct { /* v0 */ struct { int32_t tv_sec; long tv_usec; } _tv; const void *_buf; } _v0; struct { /* v1 */ struct { int32_t tv_sec; long tv_nsec; } _ts; lwpid_t _lid; } _v1; struct { /* v2 */ struct timespec _ts; lwpid_t _lid; } _v2; } _v; }; #define ktr_lid _v._v2._lid #define ktr_olid _v._v1._lid #define ktr_time _v._v2._ts #define ktr_otv _v._v0._tv #define ktr_ots _v._v1._ts #define ktr_ts _v._v2._ts #define ktr_unused _v._v0._buf #define KTR_SHIMLEN offsetof(struct ktr_header, ktr_pid) /* * Test for kernel trace point */ #define KTRPOINT(p, type) \ (((p)->p_traceflag & (1<<(type))) != 0) /* * ktrace record types */ /* * KTR_SYSCALL - system call record */ #define KTR_SYSCALL 1 struct ktr_syscall { int ktr_code; /* syscall number */ int ktr_argsize; /* size of arguments */ /* * followed by ktr_argsize/sizeof(register_t) "register_t"s */ }; /* * KTR_SYSRET - return from system call record */ #define KTR_SYSRET 2 struct ktr_sysret { short ktr_code; short ktr_eosys; /* XXX unused */ int ktr_error; __register_t ktr_retval; __register_t ktr_retval_1; }; /* * KTR_NAMEI - namei record */ #define KTR_NAMEI 3 /* record contains pathname */ /* * KTR_GENIO - trace generic process i/o */ #define KTR_GENIO 4 struct ktr_genio { int ktr_fd; enum uio_rw ktr_rw; /* * followed by data successfully read/written */ }; /* * KTR_PSIG - trace processed signal */ #define KTR_PSIG 5 struct ktr_psig { int signo; sig_t action; sigset_t mask; int code; /* * followed by optional siginfo_t */ }; /* * KTR_CSW - trace context switches */ #define KTR_CSW 6 struct ktr_csw { int out; /* 1 if switch out, 0 if switch in */ int user; /* 1 if usermode (ivcsw), 0 if kernel (vcsw) */ }; /* * KTR_EMUL - emulation change */ #define KTR_EMUL 7 /* record contains emulation name */ /* * KTR_USER - user record */ #define KTR_USER 8 #define KTR_USER_MAXIDLEN 20 #define KTR_USER_MAXLEN 2048 /* maximum length of passed data */ struct ktr_user { char ktr_id[KTR_USER_MAXIDLEN]; /* string id of caller */ /* * Followed by ktr_len - sizeof(struct ktr_user) of user data. */ }; /* * KTR_EXEC_ARG, KTR_EXEC_ENV - Arguments and environment from exec */ #define KTR_EXEC_ARG 10 #define KTR_EXEC_ENV 11 /* record contains arg/env string */ /* * KTR_SAUPCALL - scheduler activated upcall. * * The structure is no longer used, but retained for compatibility. 
*/ #define KTR_SAUPCALL 13 struct ktr_saupcall { int ktr_type; int ktr_nevent; int ktr_nint; void *ktr_sas; void *ktr_ap; /* * followed by nevent sa_t's from sas[] */ }; /* * KTR_MIB - MIB name and data */ #define KTR_MIB 14 /* Record contains MIB name */ /* * KTR_EXEC_FD - Opened file descriptor from exec */ #define KTR_EXEC_FD 15 struct ktr_execfd { int ktr_fd; u_int ktr_dtype; /* one of DTYPE_* constants */ }; /* * kernel trace points (in p_traceflag) */ #define KTRFAC_MASK 0x00ffffff #define KTRFAC_SYSCALL (1<<KTR_SYSCALL) #define KTRFAC_SYSRET (1<<KTR_SYSRET) #define KTRFAC_NAMEI (1<<KTR_NAMEI) #define KTRFAC_GENIO (1<<KTR_GENIO) #define KTRFAC_PSIG (1<<KTR_PSIG) #define KTRFAC_CSW (1<<KTR_CSW) #define KTRFAC_EMUL (1<<KTR_EMUL) #define KTRFAC_USER (1<<KTR_USER) #define KTRFAC_EXEC_ARG (1<<KTR_EXEC_ARG) #define KTRFAC_EXEC_ENV (1<<KTR_EXEC_ENV) #define KTRFAC_MIB (1<<KTR_MIB) #define KTRFAC_EXEC_FD (1<<KTR_EXEC_FD) #define __KTRACE_FLAG_BITS \ "\177\020" \ "b\1SYSCALL\0" \ "b\2SYSRET\0" \ "b\3NAMEI\0" \ "b\4GENIO\0" \ "b\5PSIG\0" \ "b\6CSW\0" \ "b\7EMUL\0" \ "b\10USER\0" \ "b\12EXEC_ARG\0" \ "b\13EXEC_ENV\0" \ "b\15SAUPCALL\0" \ "b\16MIB\0" \ "b\17EXEC_FD\0" \ "f\30\4VERSION\0" \ "b\35TRC_EMUL\0" \ "b\36INHERIT\0" \ "b\37PERSISTENT\0" /* * trace flags (also in p_traceflags) */ #define KTRFAC_PERSISTENT 0x80000000 /* persistent trace across sugid exec (exclusive) */ #define KTRFAC_INHERIT 0x40000000 /* pass trace flags to children */ #define KTRFAC_TRC_EMUL 0x10000000 /* ktrace KTR_EMUL before next trace */ #define KTRFAC_VER_MASK 0x0f000000 /* record version mask */ #define KTRFAC_VER_SHIFT 24 /* record version shift */ #define KTRFAC_VERSION(tf) (((tf) & KTRFAC_VER_MASK) >> KTRFAC_VER_SHIFT) #define KTRFACv0 (0 << KTRFAC_VER_SHIFT) #define KTRFACv1 (1 << KTRFAC_VER_SHIFT) #define KTRFACv2 (2 << KTRFAC_VER_SHIFT) #ifndef _KERNEL #include <sys/cdefs.h> __BEGIN_DECLS int ktrace(const char *, int, int, pid_t); int fktrace(int, int, int, pid_t); int utrace(const char *, void *, size_t); __END_DECLS #else struct syncobj; void ktrinit(void); void ktrderef(struct proc *); void ktradref(struct proc *); extern kmutex_t ktrace_lock; extern int ktrace_on; int ktruser(const char *, void *, size_t, int); bool ktr_point(int); void ktr_csw(int, int, const struct syncobj *); void ktr_emul(void); void ktr_geniov(int, enum uio_rw, struct iovec *, size_t, int); void ktr_genio(int, enum uio_rw, const void *, size_t, int); void ktr_mibio(int, enum uio_rw, const void *, size_t, int); void ktr_namei(const char *, size_t); void ktr_namei2(const char *, size_t, const char *, size_t); void ktr_psig(int, sig_t, const sigset_t *, const ksiginfo_t *); void ktr_syscall(register_t, const register_t [], int); void ktr_sysret(register_t, int, register_t *); void ktr_kuser(const char *, const void *, size_t); void ktr_mib(const int *a , u_int b); void ktr_execarg(const void *, size_t); void ktr_execenv(const void *, size_t); void ktr_execfd(int, u_int); int ktrace_common(lwp_t *, int, int, int, file_t **); static __inline int ktrenter(lwp_t *l) { if ((l->l_pflag & LP_KTRACTIVE) != 0) return 1; l->l_pflag |= LP_KTRACTIVE; return 0; } static __inline void ktrexit(lwp_t *l) { l->l_pflag &= ~LP_KTRACTIVE; } static __inline bool ktrpoint(int fac) { return __predict_false(ktrace_on) && __predict_false(ktr_point(1 << fac)); } static __inline void ktrcsw(int a, int b, const struct syncobj *c) { if (__predict_false(ktrace_on)) ktr_csw(a, b, c); } static __inline void ktremul(void) { if (__predict_false(ktrace_on)) ktr_emul(); } 
static __inline void ktrgenio(int a, enum uio_rw b, const void *c, size_t d, int e) { if (__predict_false(ktrace_on)) ktr_genio(a, b, c, d, e); } static __inline void ktrgeniov(int a, enum uio_rw b, struct iovec *c, int d, int e) { if (__predict_false(ktrace_on)) ktr_geniov(a, b, c, d, e); } static __inline void ktrmibio(int a, enum uio_rw b, const void *c, size_t d, int e) { if (__predict_false(ktrace_on)) ktr_mibio(a, b, c, d, e); } static __inline void ktrnamei(const char *a, size_t b) { if (__predict_false(ktrace_on)) ktr_namei(a, b); } static __inline void ktrnamei2(const char *a, size_t b, const char *c, size_t d) { if (__predict_false(ktrace_on)) ktr_namei2(a, b, c, d); } static __inline void ktrpsig(int a, sig_t b, const sigset_t *c, const ksiginfo_t * d) { if (__predict_false(ktrace_on)) ktr_psig(a, b, c, d); } static __inline void ktrsyscall(register_t code, const register_t args[], int narg) { if (__predict_false(ktrace_on)) ktr_syscall(code, args, narg); } static __inline void ktrsysret(register_t a, int b, register_t *c) { if (__predict_false(ktrace_on)) ktr_sysret(a, b, c); } static __inline void ktrkuser(const char *a, const void *b, size_t c) { if (__predict_false(ktrace_on)) ktr_kuser(a, b, c); } static __inline void ktrmib(const int *a , u_int b) { if (__predict_false(ktrace_on)) ktr_mib(a, b); } static __inline void ktrexecarg(const void *a, size_t b) { if (__predict_false(ktrace_on)) ktr_execarg(a, b); } static __inline void ktrexecenv(const void *a, size_t b) { if (__predict_false(ktrace_on)) ktr_execenv(a, b); } static __inline void ktrexecfd(int fd, u_int dtype) { if (__predict_false(ktrace_on)) ktr_execfd(fd, dtype); } struct ktrace_entry; int ktealloc(struct ktrace_entry **, void **, lwp_t *, int, size_t); void ktesethdrlen(struct ktrace_entry *, size_t); void ktraddentry(lwp_t *, struct ktrace_entry *, int); /* Flags for ktraddentry (3rd arg) */ #define KTA_NOWAIT 0x0000 #define KTA_WAITOK 0x0001 #define KTA_LARGE 0x0002 #endif /* !_KERNEL */ #endif /* _SYS_KTRACE_H_ */
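For the userland side, the declarations above (ktrace(2), fktrace(2)) combine a KTROP_* operation, the optional KTRFLAG_DESCEND flag, and a mask of KTRFAC_* trace points. A minimal sketch of a process tracing its own system calls follows, assuming an arbitrary output file name "ktrace.out"; this is roughly what the ktrace(1) utility arranges.

/*
 * Illustrative userland sketch: trace this process's system calls,
 * returns and name lookups into "ktrace.out" (an arbitrary file name),
 * then switch the trace points off again.
 */
#include <sys/types.h>
#include <sys/ktrace.h>
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int
main(void)
{
	int fd, points;

	fd = open("ktrace.out", O_WRONLY | O_CREAT | O_TRUNC, 0600);
	if (fd == -1)
		err(1, "open");

	points = KTRFAC_SYSCALL | KTRFAC_SYSRET | KTRFAC_NAMEI;

	/* Start tracing this process and any children it forks. */
	if (fktrace(fd, KTROP_SET | KTRFLAG_DESCEND, points, getpid()) == -1)
		err(1, "fktrace set");

	/* ... traced work runs here ... */

	/* Clear the trace points for this process again. */
	if (fktrace(fd, KTROP_CLEAR | KTRFLAG_DESCEND, points, getpid()) == -1)
		err(1, "fktrace clear");

	close(fd);
	return 0;
}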
1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 /* $NetBSD: uipc_usrreq.c,v 1.203 2022/05/28 22:08:46 andvar Exp $ */ /*- * Copyright (c) 1998, 2000, 2004, 2008, 2009, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95 */ /* * Copyright (c) 1997 Christopher G. Demetriou. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.203 2022/05/28 22:08:46 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/filedesc.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/unpcb.h> #include <sys/un.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/mbuf.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/atomic.h> #include <sys/uidinfo.h> #include <sys/kernel.h> #include <sys/kthread.h> #include <sys/compat_stub.h> #include <compat/sys/socket.h> #include <compat/net/route_70.h> /* * Unix communications domain. * * TODO: * RDM * rethink name space problems * need a proper out-of-band * * Notes on locking: * * The generic rules noted in uipc_socket2.c apply. In addition: * * o We have a global lock, uipc_lock. * * o All datagram sockets are locked by uipc_lock. * * o For stream socketpairs, the two endpoints are created sharing the same * independent lock. Sockets presented to PRU_CONNECT2 must already have * matching locks. * * o Stream sockets created via socket() start life with their own * independent lock. * * o Stream connections to a named endpoint are slightly more complicated. * Sockets that have called listen() have their lock pointer mutated to * the global uipc_lock. When establishing a connection, the connecting * socket also has its lock mutated to uipc_lock, which matches the head * (listening socket). We create a new socket for accept() to return, and * that also shares the head's lock. Until the connection is completely * done on both ends, all three sockets are locked by uipc_lock. Once the * connection is complete, the association with the head's lock is broken. * The connecting socket and the socket returned from accept() have their * lock pointers mutated away from uipc_lock, and back to the connecting * socket's original, independent lock. The head continues to be locked * by uipc_lock. * * o If uipc_lock is determined to be a significant source of contention, * it could easily be hashed out. It is difficult to simply make it an * independent lock because of visibility / garbage collection issues: * if a socket has been associated with a lock at any point, that lock * must remain valid until the socket is no longer visible in the system. * The lock must not be freed or otherwise destroyed until any sockets * that had referenced it have also been destroyed. 
*/ const struct sockaddr_un sun_noname = { .sun_len = offsetof(struct sockaddr_un, sun_path), .sun_family = AF_LOCAL, }; ino_t unp_ino; /* prototype for fake inode numbers */ static struct mbuf * unp_addsockcred(struct lwp *, struct mbuf *); static void unp_discard_later(file_t *); static void unp_discard_now(file_t *); static void unp_disconnect1(struct unpcb *); static bool unp_drop(struct unpcb *, int); static int unp_internalize(struct mbuf **); static void unp_mark(file_t *); static void unp_scan(struct mbuf *, void (*)(file_t *), int); static void unp_shutdown1(struct unpcb *); static void unp_thread(void *); static void unp_thread_kick(void); static kmutex_t *uipc_lock; static kcondvar_t unp_thread_cv; static lwp_t *unp_thread_lwp; static SLIST_HEAD(,file) unp_thread_discard; static int unp_defer; static struct sysctllog *usrreq_sysctllog; static void unp_sysctl_create(void); /* Compat interface */ struct mbuf * stub_compat_70_unp_addsockcred(lwp_t *, struct mbuf *); struct mbuf * stub_compat_70_unp_addsockcred(struct lwp *lwp, struct mbuf *control) { /* just copy our initial argument */ return control; } bool compat70_ocreds_valid = false; /* * Initialize Unix protocols. */ void uipc_init(void) { int error; unp_sysctl_create(); uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); cv_init(&unp_thread_cv, "unpgc"); error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, unp_thread, NULL, &unp_thread_lwp, "unpgc"); if (error != 0) panic("uipc_init %d", error); } static void unp_connid(struct lwp *l, struct unpcb *unp, int flags) { unp->unp_connid.unp_pid = l->l_proc->p_pid; unp->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred); unp->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred); unp->unp_flags |= flags; } /* * A connection succeeded: disassociate both endpoints from the head's * lock, and make them share their own lock. There is a race here: for * a very brief time one endpoint will be locked by a different lock * than the other end. However, since the current thread holds the old * lock (the listening socket's lock, the head) access can still only be * made to one side of the connection. */ static void unp_setpeerlocks(struct socket *so, struct socket *so2) { struct unpcb *unp; kmutex_t *lock; KASSERT(solocked2(so, so2)); /* * Bail out if either end of the socket is not yet fully * connected or accepted. We only break the lock association * with the head when the pair of sockets stand completely * on their own. */ KASSERT(so->so_head == NULL); if (so2->so_head != NULL) return; /* * Drop references to old lock. A third reference (from the * queue head) must be held as we still hold its lock. Bonus: * we don't need to worry about garbage collecting the lock. */ lock = so->so_lock; KASSERT(lock == uipc_lock); mutex_obj_free(lock); mutex_obj_free(lock); /* * Grab stream lock from the initiator and share between the two * endpoints. Issue memory barrier to ensure all modifications * become globally visible before the lock change. so2 is * assumed not to have a stream lock, because it was created * purely for the server side to accept this connection and * started out life using the domain-wide lock. */ unp = sotounpcb(so); KASSERT(unp->unp_streamlock != NULL); KASSERT(sotounpcb(so2)->unp_streamlock == NULL); lock = unp->unp_streamlock; unp->unp_streamlock = NULL; mutex_obj_hold(lock); /* * Ensure lock is initialized before publishing it with * solockreset. Pairs with atomic_load_consume in solock and * various loops to reacquire lock after wakeup. 
*/ membar_release(); /* * possible race if lock is not held - see comment in * uipc_usrreq(PRU_ACCEPT). */ KASSERT(mutex_owned(lock)); solockreset(so, lock); solockreset(so2, lock); } /* * Reset a socket's lock back to the domain-wide lock. */ static void unp_resetlock(struct socket *so) { kmutex_t *olock, *nlock; struct unpcb *unp; KASSERT(solocked(so)); olock = so->so_lock; nlock = uipc_lock; if (olock == nlock) return; unp = sotounpcb(so); KASSERT(unp->unp_streamlock == NULL); unp->unp_streamlock = olock; mutex_obj_hold(nlock); mutex_enter(nlock); solockreset(so, nlock); mutex_exit(olock); } static void unp_free(struct unpcb *unp) { if (unp->unp_addr) free(unp->unp_addr, M_SONAME); if (unp->unp_streamlock != NULL) mutex_obj_free(unp->unp_streamlock); kmem_free(unp, sizeof(*unp)); } static int unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp) { struct socket *so2; const struct sockaddr_un *sun; /* XXX: server side closed the socket */ if (unp->unp_conn == NULL) return ECONNREFUSED; so2 = unp->unp_conn->unp_socket; KASSERT(solocked(so2)); if (unp->unp_addr) sun = unp->unp_addr; else sun = &sun_noname; if (unp->unp_conn->unp_flags & UNP_WANTCRED) control = unp_addsockcred(curlwp, control); if (unp->unp_conn->unp_flags & UNP_OWANTCRED) MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control), stub_compat_70_unp_addsockcred(curlwp, control), control); if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m, control) == 0) { unp_dispose(control); m_freem(control); m_freem(m); /* Don't call soroverflow because we're returning this * error directly to the sender. */ so2->so_rcv.sb_overflowed++; return ENOBUFS; } else { sorwakeup(so2); return 0; } } static void unp_setaddr(struct socket *so, struct sockaddr *nam, bool peeraddr) { const struct sockaddr_un *sun = NULL; struct unpcb *unp; KASSERT(solocked(so)); unp = sotounpcb(so); if (peeraddr) { if (unp->unp_conn && unp->unp_conn->unp_addr) sun = unp->unp_conn->unp_addr; } else { if (unp->unp_addr) sun = unp->unp_addr; } if (sun == NULL) sun = &sun_noname; memcpy(nam, sun, sun->sun_len); } static int unp_rcvd(struct socket *so, int flags, struct lwp *l) { struct unpcb *unp = sotounpcb(so); struct socket *so2; u_int newhiwat; KASSERT(solocked(so)); KASSERT(unp != NULL); switch (so->so_type) { case SOCK_DGRAM: panic("uipc 1"); /*NOTREACHED*/ case SOCK_SEQPACKET: /* FALLTHROUGH */ case SOCK_STREAM: #define rcv (&so->so_rcv) #define snd (&so2->so_snd) if (unp->unp_conn == 0) break; so2 = unp->unp_conn->unp_socket; KASSERT(solocked2(so, so2)); /* * Adjust backpressure on sender * and wakeup any waiting to write. */ snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt; unp->unp_mbcnt = rcv->sb_mbcnt; newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc; (void)chgsbsize(so2->so_uidinfo, &snd->sb_hiwat, newhiwat, RLIM_INFINITY); unp->unp_cc = rcv->sb_cc; sowwakeup(so2); #undef snd #undef rcv break; default: panic("uipc 2"); } return 0; } static int unp_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int unp_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct unpcb *unp = sotounpcb(so); int error = 0; u_int newhiwat; struct socket *so2; KASSERT(solocked(so)); KASSERT(unp != NULL); KASSERT(m != NULL); /* * Note: unp_internalize() rejects any control message * other than SCM_RIGHTS, and only allows one. This * has the side-effect of preventing a caller from * forging SCM_CREDS. 
*/ if (control) { sounlock(so); error = unp_internalize(&control); solock(so); if (error != 0) { m_freem(control); m_freem(m); return error; } } switch (so->so_type) { case SOCK_DGRAM: { KASSERT(so->so_lock == uipc_lock); if (nam) { if ((so->so_state & SS_ISCONNECTED) != 0) error = EISCONN; else { /* * Note: once connected, the * socket's lock must not be * dropped until we have sent * the message and disconnected. * This is necessary to prevent * intervening control ops, like * another connection. */ error = unp_connect(so, nam, l); } } else { if ((so->so_state & SS_ISCONNECTED) == 0) error = ENOTCONN; } if (error) { unp_dispose(control); m_freem(control); m_freem(m); return error; } error = unp_output(m, control, unp); if (nam) unp_disconnect1(unp); break; } case SOCK_SEQPACKET: /* FALLTHROUGH */ case SOCK_STREAM: #define rcv (&so2->so_rcv) #define snd (&so->so_snd) if (unp->unp_conn == NULL) { error = ENOTCONN; break; } so2 = unp->unp_conn->unp_socket; KASSERT(solocked2(so, so2)); if (unp->unp_conn->unp_flags & UNP_WANTCRED) { /* * Credentials are passed only once on * SOCK_STREAM and SOCK_SEQPACKET. */ unp->unp_conn->unp_flags &= ~UNP_WANTCRED; control = unp_addsockcred(l, control); } if (unp->unp_conn->unp_flags & UNP_OWANTCRED) { /* * Credentials are passed only once on * SOCK_STREAM and SOCK_SEQPACKET. */ unp->unp_conn->unp_flags &= ~UNP_OWANTCRED; MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control), stub_compat_70_unp_addsockcred(curlwp, control), control); } /* * Send to paired receive port, and then reduce * send buffer hiwater marks to maintain backpressure. * Wake up readers. */ if (control) { if (sbappendcontrol(rcv, m, control) != 0) control = NULL; } else { switch(so->so_type) { case SOCK_SEQPACKET: sbappendrecord(rcv, m); break; case SOCK_STREAM: sbappend(rcv, m); break; default: panic("uipc_usrreq"); break; } } snd->sb_mbmax -= rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt; unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt; newhiwat = snd->sb_hiwat - (rcv->sb_cc - unp->unp_conn->unp_cc); (void)chgsbsize(so->so_uidinfo, &snd->sb_hiwat, newhiwat, RLIM_INFINITY); unp->unp_conn->unp_cc = rcv->sb_cc; sorwakeup(so2); #undef snd #undef rcv if (control != NULL) { unp_dispose(control); m_freem(control); } break; default: panic("uipc 4"); } return error; } static int unp_sendoob(struct socket *so, struct mbuf *m, struct mbuf * control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } /* * Unix domain socket option processing. 
*/ int uipc_ctloutput(int op, struct socket *so, struct sockopt *sopt) { struct unpcb *unp = sotounpcb(so); int optval = 0, error = 0; KASSERT(solocked(so)); if (sopt->sopt_level != SOL_LOCAL) { error = ENOPROTOOPT; } else switch (op) { case PRCO_SETOPT: switch (sopt->sopt_name) { case LOCAL_OCREDS: if (!compat70_ocreds_valid) { error = ENOPROTOOPT; break; } /* FALLTHROUGH */ case LOCAL_CREDS: case LOCAL_CONNWAIT: error = sockopt_getint(sopt, &optval); if (error) break; switch (sopt->sopt_name) { #define OPTSET(bit) \ if (optval) \ unp->unp_flags |= (bit); \ else \ unp->unp_flags &= ~(bit); case LOCAL_CREDS: OPTSET(UNP_WANTCRED); break; case LOCAL_CONNWAIT: OPTSET(UNP_CONNWAIT); break; case LOCAL_OCREDS: OPTSET(UNP_OWANTCRED); break; } break; #undef OPTSET default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: sounlock(so); switch (sopt->sopt_name) { case LOCAL_PEEREID: if (unp->unp_flags & UNP_EIDSVALID) { error = sockopt_set(sopt, &unp->unp_connid, sizeof(unp->unp_connid)); } else { error = EINVAL; } break; case LOCAL_CREDS: #define OPTBIT(bit) (unp->unp_flags & (bit) ? 1 : 0) optval = OPTBIT(UNP_WANTCRED); error = sockopt_setint(sopt, optval); break; case LOCAL_OCREDS: if (compat70_ocreds_valid) { optval = OPTBIT(UNP_OWANTCRED); error = sockopt_setint(sopt, optval); break; } #undef OPTBIT /* FALLTHROUGH */ default: error = ENOPROTOOPT; break; } solock(so); break; } return (error); } /* * Both send and receive buffers are allocated PIPSIZ bytes of buffering * for stream sockets, although the total for sender and receiver is * actually only PIPSIZ. * Datagram sockets really use the sendspace as the maximum datagram size, * and don't really want to reserve the sendspace. Their recvspace should * be large enough for at least one max-size datagram plus address. */ #ifndef PIPSIZ #define PIPSIZ 8192 #endif u_long unpst_sendspace = PIPSIZ; u_long unpst_recvspace = PIPSIZ; u_long unpdg_sendspace = 2*1024; /* really max datagram size */ u_long unpdg_recvspace = 16*1024; u_int unp_rights; /* files in flight */ u_int unp_rights_ratio = 2; /* limit, fraction of maxfiles */ static int unp_attach(struct socket *so, int proto) { struct unpcb *unp = sotounpcb(so); u_long sndspc, rcvspc; int error; KASSERT(unp == NULL); switch (so->so_type) { case SOCK_SEQPACKET: /* FALLTHROUGH */ case SOCK_STREAM: if (so->so_lock == NULL) { so->so_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); solock(so); } sndspc = unpst_sendspace; rcvspc = unpst_recvspace; break; case SOCK_DGRAM: if (so->so_lock == NULL) { mutex_obj_hold(uipc_lock); so->so_lock = uipc_lock; solock(so); } sndspc = unpdg_sendspace; rcvspc = unpdg_recvspace; break; default: panic("unp_attach"); } if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, sndspc, rcvspc); if (error) { return error; } } unp = kmem_zalloc(sizeof(*unp), KM_SLEEP); nanotime(&unp->unp_ctime); unp->unp_socket = so; so->so_pcb = unp; KASSERT(solocked(so)); return 0; } static void unp_detach(struct socket *so) { struct unpcb *unp; vnode_t *vp; unp = sotounpcb(so); KASSERT(unp != NULL); KASSERT(solocked(so)); retry: if ((vp = unp->unp_vnode) != NULL) { sounlock(so); /* Acquire v_interlock to protect against unp_connect(). 
*/ /* XXXAD racy */ mutex_enter(vp->v_interlock); vp->v_socket = NULL; mutex_exit(vp->v_interlock); vrele(vp); solock(so); unp->unp_vnode = NULL; } if (unp->unp_conn) unp_disconnect1(unp); while (unp->unp_refs) { KASSERT(solocked2(so, unp->unp_refs->unp_socket)); if (unp_drop(unp->unp_refs, ECONNRESET)) { solock(so); goto retry; } } soisdisconnected(so); so->so_pcb = NULL; if (unp_rights) { /* * Normally the receive buffer is flushed later, in sofree, * but if our receive buffer holds references to files that * are now garbage, we will enqueue those file references to * the garbage collector and kick it into action. */ sorflush(so); unp_free(unp); unp_thread_kick(); } else unp_free(unp); } static int unp_accept(struct socket *so, struct sockaddr *nam) { struct unpcb *unp = sotounpcb(so); struct socket *so2; KASSERT(solocked(so)); KASSERT(nam != NULL); /* XXX code review required to determine if unp can ever be NULL */ if (unp == NULL) return EINVAL; KASSERT(so->so_lock == uipc_lock); /* * Mark the initiating STREAM socket as connected *ONLY* * after it's been accepted. This prevents a client from * overrunning a server and receiving ECONNREFUSED. */ if (unp->unp_conn == NULL) { /* * This will use the empty socket and will not * allocate. */ unp_setaddr(so, nam, true); return 0; } so2 = unp->unp_conn->unp_socket; if (so2->so_state & SS_ISCONNECTING) { KASSERT(solocked2(so, so->so_head)); KASSERT(solocked2(so2, so->so_head)); soisconnected(so2); } /* * If the connection is fully established, break the * association with uipc_lock and give the connected * pair a separate lock to share. * There is a race here: sotounpcb(so2)->unp_streamlock * is not locked, so when changing so2->so_lock * another thread can grab it while so->so_lock is still * pointing to the (locked) uipc_lock. * this should be harmless, except that this makes * solocked2() and solocked() unreliable. * Another problem is that unp_setaddr() expects the * the socket locked. Grabbing sotounpcb(so2)->unp_streamlock * fixes both issues. */ mutex_enter(sotounpcb(so2)->unp_streamlock); unp_setpeerlocks(so2, so); /* * Only now return peer's address, as we may need to * block in order to allocate memory. * * XXX Minor race: connection can be broken while * lock is dropped in unp_setaddr(). We will return * error == 0 and sun_noname as the peer address. 
*/ unp_setaddr(so, nam, true); /* so_lock now points to unp_streamlock */ mutex_exit(so2->so_lock); return 0; } static int unp_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp) { return EOPNOTSUPP; } static int unp_stat(struct socket *so, struct stat *ub) { struct unpcb *unp; struct socket *so2; KASSERT(solocked(so)); unp = sotounpcb(so); if (unp == NULL) return EINVAL; ub->st_blksize = so->so_snd.sb_hiwat; switch (so->so_type) { case SOCK_SEQPACKET: /* FALLTHROUGH */ case SOCK_STREAM: if (unp->unp_conn == 0) break; so2 = unp->unp_conn->unp_socket; KASSERT(solocked2(so, so2)); ub->st_blksize += so2->so_rcv.sb_cc; break; default: break; } ub->st_dev = NODEV; if (unp->unp_ino == 0) unp->unp_ino = unp_ino++; ub->st_atimespec = ub->st_mtimespec = ub->st_ctimespec = unp->unp_ctime; ub->st_ino = unp->unp_ino; ub->st_uid = so->so_uidinfo->ui_uid; ub->st_gid = so->so_egid; return (0); } static int unp_peeraddr(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); KASSERT(sotounpcb(so) != NULL); KASSERT(nam != NULL); unp_setaddr(so, nam, true); return 0; } static int unp_sockaddr(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); KASSERT(sotounpcb(so) != NULL); KASSERT(nam != NULL); unp_setaddr(so, nam, false); return 0; } /* * we only need to perform this allocation until syscalls other than * bind are adjusted to use sockaddr_big. */ static struct sockaddr_un * makeun_sb(struct sockaddr *nam, size_t *addrlen) { struct sockaddr_un *sun; *addrlen = nam->sa_len + 1; sun = malloc(*addrlen, M_SONAME, M_WAITOK); memcpy(sun, nam, nam->sa_len); *(((char *)sun) + nam->sa_len) = '\0'; return sun; } static int unp_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct sockaddr_un *sun; struct unpcb *unp; vnode_t *vp; struct vattr vattr; size_t addrlen; int error; struct pathbuf *pb; struct nameidata nd; proc_t *p; unp = sotounpcb(so); KASSERT(solocked(so)); KASSERT(unp != NULL); KASSERT(nam != NULL); if (unp->unp_vnode != NULL) return (EINVAL); if ((unp->unp_flags & UNP_BUSY) != 0) { /* * EALREADY may not be strictly accurate, but since this * is a major application error it's hardly a big deal. */ return (EALREADY); } unp->unp_flags |= UNP_BUSY; sounlock(so); p = l->l_proc; sun = makeun_sb(nam, &addrlen); pb = pathbuf_create(sun->sun_path); if (pb == NULL) { error = ENOMEM; goto bad; } NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT | TRYEMULROOT, pb); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ if ((error = namei(&nd)) != 0) { pathbuf_destroy(pb); goto bad; } vp = nd.ni_vp; if (vp != NULL) { VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); pathbuf_destroy(pb); error = EADDRINUSE; goto bad; } vattr_null(&vattr); vattr.va_type = VSOCK; vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask); error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error) { vput(nd.ni_dvp); pathbuf_destroy(pb); goto bad; } vp = nd.ni_vp; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); solock(so); vp->v_socket = unp->unp_socket; unp->unp_vnode = vp; unp->unp_addrlen = addrlen; unp->unp_addr = sun; VOP_UNLOCK(vp); vput(nd.ni_dvp); unp->unp_flags &= ~UNP_BUSY; pathbuf_destroy(pb); return (0); bad: free(sun, M_SONAME); solock(so); unp->unp_flags &= ~UNP_BUSY; return (error); } static int unp_listen(struct socket *so, struct lwp *l) { struct unpcb *unp = sotounpcb(so); KASSERT(solocked(so)); KASSERT(unp != NULL); /* * If the socket can accept a connection, it must be * locked by uipc_lock. 
*/ unp_resetlock(so); if (unp->unp_vnode == NULL) return EINVAL; unp_connid(l, unp, UNP_EIDSBIND); return 0; } static int unp_disconnect(struct socket *so) { KASSERT(solocked(so)); KASSERT(sotounpcb(so) != NULL); unp_disconnect1(sotounpcb(so)); return 0; } static int unp_shutdown(struct socket *so) { KASSERT(solocked(so)); KASSERT(sotounpcb(so) != NULL); socantsendmore(so); unp_shutdown1(sotounpcb(so)); return 0; } static int unp_abort(struct socket *so) { KASSERT(solocked(so)); KASSERT(sotounpcb(so) != NULL); (void)unp_drop(sotounpcb(so), ECONNABORTED); KASSERT(so->so_head == NULL); KASSERT(so->so_pcb != NULL); unp_detach(so); return 0; } static int unp_connect1(struct socket *so, struct socket *so2, struct lwp *l) { struct unpcb *unp = sotounpcb(so); struct unpcb *unp2; if (so2->so_type != so->so_type) return EPROTOTYPE; /* * All three sockets involved must be locked by same lock: * * local endpoint (so) * remote endpoint (so2) * queue head (so2->so_head, only if PR_CONNREQUIRED) */ KASSERT(solocked2(so, so2)); KASSERT(so->so_head == NULL); if (so2->so_head != NULL) { KASSERT(so2->so_lock == uipc_lock); KASSERT(solocked2(so2, so2->so_head)); } unp2 = sotounpcb(so2); unp->unp_conn = unp2; switch (so->so_type) { case SOCK_DGRAM: unp->unp_nextref = unp2->unp_refs; unp2->unp_refs = unp; soisconnected(so); break; case SOCK_SEQPACKET: /* FALLTHROUGH */ case SOCK_STREAM: /* * SOCK_SEQPACKET and SOCK_STREAM cases are handled by callers * which are unp_connect() or unp_connect2(). */ break; default: panic("unp_connect1"); } return 0; } int unp_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct sockaddr_un *sun; vnode_t *vp; struct socket *so2, *so3; struct unpcb *unp, *unp2, *unp3; size_t addrlen; int error; struct pathbuf *pb; struct nameidata nd; unp = sotounpcb(so); if ((unp->unp_flags & UNP_BUSY) != 0) { /* * EALREADY may not be strictly accurate, but since this * is a major application error it's hardly a big deal. */ return (EALREADY); } unp->unp_flags |= UNP_BUSY; sounlock(so); sun = makeun_sb(nam, &addrlen); pb = pathbuf_create(sun->sun_path); if (pb == NULL) { error = ENOMEM; goto bad2; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); if ((error = namei(&nd)) != 0) { pathbuf_destroy(pb); goto bad2; } vp = nd.ni_vp; pathbuf_destroy(pb); if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } if ((error = VOP_ACCESS(vp, VWRITE, l->l_cred)) != 0) goto bad; /* Acquire v_interlock to protect against unp_detach(). */ mutex_enter(vp->v_interlock); so2 = vp->v_socket; if (so2 == NULL) { mutex_exit(vp->v_interlock); error = ECONNREFUSED; goto bad; } if (so->so_type != so2->so_type) { mutex_exit(vp->v_interlock); error = EPROTOTYPE; goto bad; } solock(so); unp_resetlock(so); mutex_exit(vp->v_interlock); if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { /* * This may seem somewhat fragile but is OK: if we can * see SO_ACCEPTCONN set on the endpoint, then it must * be locked by the domain-wide uipc_lock. 
*/ KASSERT((so2->so_options & SO_ACCEPTCONN) == 0 || so2->so_lock == uipc_lock); if ((so2->so_options & SO_ACCEPTCONN) == 0 || (so3 = sonewconn(so2, false)) == NULL) { error = ECONNREFUSED; sounlock(so); goto bad; } unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); if (unp2->unp_addr) { unp3->unp_addr = malloc(unp2->unp_addrlen, M_SONAME, M_WAITOK); memcpy(unp3->unp_addr, unp2->unp_addr, unp2->unp_addrlen); unp3->unp_addrlen = unp2->unp_addrlen; } unp3->unp_flags = unp2->unp_flags; so2 = so3; /* * The connector's (client's) credentials are copied from its * process structure at the time of connect() (which is now). */ unp_connid(l, unp3, UNP_EIDSVALID); /* * The receiver's (server's) credentials are copied from the * unp_peercred member of socket on which the former called * listen(); unp_listen() cached that process's credentials * at that time so we can use them now. */ if (unp2->unp_flags & UNP_EIDSBIND) { memcpy(&unp->unp_connid, &unp2->unp_connid, sizeof(unp->unp_connid)); unp->unp_flags |= UNP_EIDSVALID; } } error = unp_connect1(so, so2, l); if (error) { sounlock(so); goto bad; } unp2 = sotounpcb(so2); switch (so->so_type) { /* * SOCK_DGRAM and default cases are handled in prior call to * unp_connect1(), do not add a default case without fixing * unp_connect1(). */ case SOCK_SEQPACKET: /* FALLTHROUGH */ case SOCK_STREAM: unp2->unp_conn = unp; if ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT) soisconnecting(so); else soisconnected(so); soisconnected(so2); /* * If the connection is fully established, break the * association with uipc_lock and give the connected * pair a separate lock to share. */ KASSERT(so2->so_head != NULL); unp_setpeerlocks(so, so2); break; } sounlock(so); bad: vput(vp); bad2: free(sun, M_SONAME); solock(so); unp->unp_flags &= ~UNP_BUSY; return (error); } int unp_connect2(struct socket *so, struct socket *so2) { struct unpcb *unp = sotounpcb(so); struct unpcb *unp2; int error = 0; KASSERT(solocked2(so, so2)); error = unp_connect1(so, so2, curlwp); if (error) return error; unp2 = sotounpcb(so2); switch (so->so_type) { /* * SOCK_DGRAM and default cases are handled in prior call to * unp_connect1(), do not add a default case without fixing * unp_connect1(). 
*/ case SOCK_SEQPACKET: /* FALLTHROUGH */ case SOCK_STREAM: unp2->unp_conn = unp; soisconnected(so); soisconnected(so2); break; } return error; } static void unp_disconnect1(struct unpcb *unp) { struct unpcb *unp2 = unp->unp_conn; struct socket *so; if (unp2 == 0) return; unp->unp_conn = 0; so = unp->unp_socket; switch (so->so_type) { case SOCK_DGRAM: if (unp2->unp_refs == unp) unp2->unp_refs = unp->unp_nextref; else { unp2 = unp2->unp_refs; for (;;) { KASSERT(solocked2(so, unp2->unp_socket)); if (unp2 == 0) panic("unp_disconnect1"); if (unp2->unp_nextref == unp) break; unp2 = unp2->unp_nextref; } unp2->unp_nextref = unp->unp_nextref; } unp->unp_nextref = 0; so->so_state &= ~SS_ISCONNECTED; break; case SOCK_SEQPACKET: /* FALLTHROUGH */ case SOCK_STREAM: KASSERT(solocked2(so, unp2->unp_socket)); soisdisconnected(so); unp2->unp_conn = 0; soisdisconnected(unp2->unp_socket); break; } } static void unp_shutdown1(struct unpcb *unp) { struct socket *so; switch(unp->unp_socket->so_type) { case SOCK_SEQPACKET: /* FALLTHROUGH */ case SOCK_STREAM: if (unp->unp_conn && (so = unp->unp_conn->unp_socket)) socantrcvmore(so); break; default: break; } } static bool unp_drop(struct unpcb *unp, int errno) { struct socket *so = unp->unp_socket; KASSERT(solocked(so)); so->so_error = errno; unp_disconnect1(unp); if (so->so_head) { so->so_pcb = NULL; /* sofree() drops the socket lock */ sofree(so); unp_free(unp); return true; } return false; } #ifdef notdef unp_drain(void) { } #endif int unp_externalize(struct mbuf *rights, struct lwp *l, int flags) { struct cmsghdr * const cm = mtod(rights, struct cmsghdr *); struct proc * const p = l->l_proc; file_t **rp; int error = 0; const size_t nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(file_t *); if (nfds == 0) goto noop; int * const fdp = kmem_alloc(nfds * sizeof(int), KM_SLEEP); rw_enter(&p->p_cwdi->cwdi_lock, RW_READER); /* Make sure the recipient should be able to see the files.. */ rp = (file_t **)CMSG_DATA(cm); for (size_t i = 0; i < nfds; i++) { file_t * const fp = *rp++; if (fp == NULL) { error = EINVAL; goto out; } /* * If we are in a chroot'ed directory, and * someone wants to pass us a directory, make * sure it's inside the subtree we're allowed * to access. */ if (p->p_cwdi->cwdi_rdir != NULL && fp->f_type == DTYPE_VNODE) { vnode_t *vp = fp->f_vnode; if ((vp->v_type == VDIR) && !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) { error = EPERM; goto out; } } } restart: /* * First loop -- allocate file descriptor table slots for the * new files. */ for (size_t i = 0; i < nfds; i++) { if ((error = fd_alloc(p, 0, &fdp[i])) != 0) { /* * Back out what we've done so far. */ while (i-- > 0) { fd_abort(p, NULL, fdp[i]); } if (error == ENOSPC) { fd_tryexpand(p); error = 0; goto restart; } /* * This is the error that has historically * been returned, and some callers may * expect it. */ error = EMSGSIZE; goto out; } } /* * Now that adding them has succeeded, update all of the * file passing state and affix the descriptors. */ rp = (file_t **)CMSG_DATA(cm); int *ofdp = (int *)CMSG_DATA(cm); for (size_t i = 0; i < nfds; i++) { file_t * const fp = *rp++; const int fd = fdp[i]; atomic_dec_uint(&unp_rights); fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0); fd_affix(p, fp, fd); /* * Done with this file pointer, replace it with a fd; */ *ofdp++ = fd; mutex_enter(&fp->f_lock); fp->f_msgcount--; mutex_exit(&fp->f_lock); /* * Note that fd_affix() adds a reference to the file. 
* The file may already have been closed by another * LWP in the process, so we must drop the reference * added by unp_internalize() with closef(). */ closef(fp); } /* * Adjust length, in case of transition from large file_t * pointers to ints. */ if (sizeof(file_t *) != sizeof(int)) { cm->cmsg_len = CMSG_LEN(nfds * sizeof(int)); rights->m_len = CMSG_SPACE(nfds * sizeof(int)); } out: if (__predict_false(error != 0)) { file_t **const fpp = (file_t **)CMSG_DATA(cm); for (size_t i = 0; i < nfds; i++) unp_discard_now(fpp[i]); /* * Truncate the array so that nobody will try to interpret * what is now garbage in it. */ cm->cmsg_len = CMSG_LEN(0); rights->m_len = CMSG_SPACE(0); } rw_exit(&p->p_cwdi->cwdi_lock); kmem_free(fdp, nfds * sizeof(int)); noop: /* * Don't disclose kernel memory in the alignment space. */ KASSERT(cm->cmsg_len <= rights->m_len); memset(&mtod(rights, char *)[cm->cmsg_len], 0, rights->m_len - cm->cmsg_len); return error; } static int unp_internalize(struct mbuf **controlp) { filedesc_t *fdescp = curlwp->l_fd; fdtab_t *dt; struct mbuf *control = *controlp; struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *); file_t **rp, **files; file_t *fp; int i, fd, *fdp; int nfds, error; u_int maxmsg; error = 0; newcm = NULL; /* Sanity check the control message header. */ if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > control->m_len || cm->cmsg_len < CMSG_ALIGN(sizeof(*cm))) return (EINVAL); /* * Verify that the file descriptors are valid, and acquire * a reference to each. */ nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int); fdp = (int *)CMSG_DATA(cm); maxmsg = maxfiles / unp_rights_ratio; for (i = 0; i < nfds; i++) { fd = *fdp++; if (atomic_inc_uint_nv(&unp_rights) > maxmsg) { atomic_dec_uint(&unp_rights); nfds = i; error = EAGAIN; goto out; } if ((fp = fd_getfile(fd)) == NULL || fp->f_type == DTYPE_KQUEUE) { if (fp) fd_putfile(fd); atomic_dec_uint(&unp_rights); nfds = i; error = EBADF; goto out; } } /* Allocate new space and copy header into it. */ newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK); if (newcm == NULL) { error = E2BIG; goto out; } memcpy(newcm, cm, sizeof(struct cmsghdr)); memset(newcm + 1, 0, CMSG_LEN(0) - sizeof(struct cmsghdr)); files = (file_t **)CMSG_DATA(newcm); /* * Transform the file descriptors into file_t pointers, in * reverse order so that if pointers are bigger than ints, the * int won't get until we're done. No need to lock, as we have * already validated the descriptors with fd_getfile(). */ fdp = (int *)CMSG_DATA(cm) + nfds; rp = files + nfds; for (i = 0; i < nfds; i++) { dt = atomic_load_consume(&fdescp->fd_dt); fp = atomic_load_consume(&dt->dt_ff[*--fdp]->ff_file); KASSERT(fp != NULL); mutex_enter(&fp->f_lock); *--rp = fp; fp->f_count++; fp->f_msgcount++; mutex_exit(&fp->f_lock); } out: /* Release descriptor references. */ fdp = (int *)CMSG_DATA(cm); for (i = 0; i < nfds; i++) { fd_putfile(*fdp++); if (error != 0) { atomic_dec_uint(&unp_rights); } } if (error == 0) { if (control->m_flags & M_EXT) { m_freem(control); *controlp = control = m_get(M_WAIT, MT_CONTROL); } MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, NULL, NULL); cm = newcm; /* * Adjust message & mbuf to note amount of space * actually used. 
*/ cm->cmsg_len = CMSG_LEN(nfds * sizeof(file_t *)); control->m_len = CMSG_SPACE(nfds * sizeof(file_t *)); } return error; } struct mbuf * unp_addsockcred(struct lwp *l, struct mbuf *control) { struct sockcred *sc; struct mbuf *m; void *p; m = sbcreatecontrol1(&p, SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)), SCM_CREDS, SOL_SOCKET, M_WAITOK); if (m == NULL) return control; sc = p; sc->sc_pid = l->l_proc->p_pid; sc->sc_uid = kauth_cred_getuid(l->l_cred); sc->sc_euid = kauth_cred_geteuid(l->l_cred); sc->sc_gid = kauth_cred_getgid(l->l_cred); sc->sc_egid = kauth_cred_getegid(l->l_cred); sc->sc_ngroups = kauth_cred_ngroups(l->l_cred); for (int i = 0; i < sc->sc_ngroups; i++) sc->sc_groups[i] = kauth_cred_group(l->l_cred, i); return m_add(control, m); } /* * Do a mark-sweep GC of files in the system, to free up any which are * caught in flight to an about-to-be-closed socket. Additionally, * process deferred file closures. */ static void unp_gc(file_t *dp) { extern struct domain unixdomain; file_t *fp, *np; struct socket *so, *so1; u_int i, oflags, rflags; bool didwork; KASSERT(curlwp == unp_thread_lwp); KASSERT(mutex_owned(&filelist_lock)); /* * First, process deferred file closures. */ while (!SLIST_EMPTY(&unp_thread_discard)) { fp = SLIST_FIRST(&unp_thread_discard); KASSERT(fp->f_unpcount > 0); KASSERT(fp->f_count > 0); KASSERT(fp->f_msgcount > 0); KASSERT(fp->f_count >= fp->f_unpcount); KASSERT(fp->f_count >= fp->f_msgcount); KASSERT(fp->f_msgcount >= fp->f_unpcount); SLIST_REMOVE_HEAD(&unp_thread_discard, f_unplist); i = fp->f_unpcount; fp->f_unpcount = 0; mutex_exit(&filelist_lock); for (; i != 0; i--) { unp_discard_now(fp); } mutex_enter(&filelist_lock); } /* * Clear mark bits. Ensure that we don't consider new files * entering the file table during this loop (they will not have * FSCAN set). */ unp_defer = 0; LIST_FOREACH(fp, &filehead, f_list) { for (oflags = fp->f_flag;; oflags = rflags) { rflags = atomic_cas_uint(&fp->f_flag, oflags, (oflags | FSCAN) & ~(FMARK|FDEFER)); if (__predict_true(oflags == rflags)) { break; } } } /* * Iterate over the set of sockets, marking ones believed (based on * refcount) to be referenced from a process, and marking for rescan * sockets which are queued on a socket. Recan continues descending * and searching for sockets referenced by sockets (FDEFER), until * there are no more socket->socket references to be discovered. */ do { didwork = false; for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) { KASSERT(mutex_owned(&filelist_lock)); np = LIST_NEXT(fp, f_list); mutex_enter(&fp->f_lock); if ((fp->f_flag & FDEFER) != 0) { atomic_and_uint(&fp->f_flag, ~FDEFER); unp_defer--; if (fp->f_count == 0) { /* * XXX: closef() doesn't pay attention * to FDEFER */ mutex_exit(&fp->f_lock); continue; } } else { if (fp->f_count == 0 || (fp->f_flag & FMARK) != 0 || fp->f_count == fp->f_msgcount || fp->f_unpcount != 0) { mutex_exit(&fp->f_lock); continue; } } atomic_or_uint(&fp->f_flag, FMARK); if (fp->f_type != DTYPE_SOCKET || (so = fp->f_socket) == NULL || so->so_proto->pr_domain != &unixdomain || (so->so_proto->pr_flags & PR_RIGHTS) == 0) { mutex_exit(&fp->f_lock); continue; } /* Gain file ref, mark our position, and unlock. */ didwork = true; LIST_INSERT_AFTER(fp, dp, f_list); fp->f_count++; mutex_exit(&fp->f_lock); mutex_exit(&filelist_lock); /* * Mark files referenced from sockets queued on the * accept queue as well. 
*/ solock(so); unp_scan(so->so_rcv.sb_mb, unp_mark, 0); if ((so->so_options & SO_ACCEPTCONN) != 0) { TAILQ_FOREACH(so1, &so->so_q0, so_qe) { unp_scan(so1->so_rcv.sb_mb, unp_mark, 0); } TAILQ_FOREACH(so1, &so->so_q, so_qe) { unp_scan(so1->so_rcv.sb_mb, unp_mark, 0); } } sounlock(so); /* Re-lock and restart from where we left off. */ closef(fp); mutex_enter(&filelist_lock); np = LIST_NEXT(dp, f_list); LIST_REMOVE(dp, f_list); } /* * Bail early if we did nothing in the loop above. Could * happen because of concurrent activity causing unp_defer * to get out of sync. */ } while (unp_defer != 0 && didwork); /* * Sweep pass. * * We grab an extra reference to each of the files that are * not otherwise accessible and then free the rights that are * stored in messages on them. */ for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) { KASSERT(mutex_owned(&filelist_lock)); np = LIST_NEXT(fp, f_list); mutex_enter(&fp->f_lock); /* * Ignore non-sockets. * Ignore dead sockets, or sockets with pending close. * Ignore sockets obviously referenced elsewhere. * Ignore sockets marked as referenced by our scan. * Ignore new sockets that did not exist during the scan. */ if (fp->f_type != DTYPE_SOCKET || fp->f_count == 0 || fp->f_unpcount != 0 || fp->f_count != fp->f_msgcount || (fp->f_flag & (FMARK | FSCAN)) != FSCAN) { mutex_exit(&fp->f_lock); continue; } /* Gain file ref, mark our position, and unlock. */ LIST_INSERT_AFTER(fp, dp, f_list); fp->f_count++; mutex_exit(&fp->f_lock); mutex_exit(&filelist_lock); /* * Flush all data from the socket's receive buffer. * This will cause files referenced only by the * socket to be queued for close. */ so = fp->f_socket; solock(so); sorflush(so); sounlock(so); /* Re-lock and restart from where we left off. */ closef(fp); mutex_enter(&filelist_lock); np = LIST_NEXT(dp, f_list); LIST_REMOVE(dp, f_list); } } /* * Garbage collector thread. While SCM_RIGHTS messages are in transit, * wake once per second to garbage collect. Run continually while we * have deferred closes to process. */ static void unp_thread(void *cookie) { file_t *dp; /* Allocate a dummy file for our scans. */ if ((dp = fgetdummy()) == NULL) { panic("unp_thread"); } mutex_enter(&filelist_lock); for (;;) { KASSERT(mutex_owned(&filelist_lock)); if (SLIST_EMPTY(&unp_thread_discard)) { if (unp_rights != 0) { (void)cv_timedwait(&unp_thread_cv, &filelist_lock, hz); } else { cv_wait(&unp_thread_cv, &filelist_lock); } } unp_gc(dp); } /* NOTREACHED */ } /* * Kick the garbage collector into action if there is something for * it to process. 
*/ static void unp_thread_kick(void) { if (!SLIST_EMPTY(&unp_thread_discard) || unp_rights != 0) { mutex_enter(&filelist_lock); cv_signal(&unp_thread_cv); mutex_exit(&filelist_lock); } } void unp_dispose(struct mbuf *m) { if (m) unp_scan(m, unp_discard_later, 1); } void unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard) { struct mbuf *m; file_t **rp, *fp; struct cmsghdr *cm; int i, qfds; while (m0) { for (m = m0; m; m = m->m_next) { if (m->m_type != MT_CONTROL || m->m_len < sizeof(*cm)) { continue; } cm = mtod(m, struct cmsghdr *); if (cm->cmsg_level != SOL_SOCKET || cm->cmsg_type != SCM_RIGHTS) continue; qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(file_t *); rp = (file_t **)CMSG_DATA(cm); for (i = 0; i < qfds; i++) { fp = *rp; if (discard) { *rp = 0; } (*op)(fp); rp++; } } m0 = m0->m_nextpkt; } } void unp_mark(file_t *fp) { if (fp == NULL) return; /* If we're already deferred, don't screw up the defer count */ mutex_enter(&fp->f_lock); if (fp->f_flag & (FMARK | FDEFER)) { mutex_exit(&fp->f_lock); return; } /* * Minimize the number of deferrals... Sockets are the only type of * file which can hold references to another file, so just mark * other files, and defer unmarked sockets for the next pass. */ if (fp->f_type == DTYPE_SOCKET) { unp_defer++; KASSERT(fp->f_count != 0); atomic_or_uint(&fp->f_flag, FDEFER); } else { atomic_or_uint(&fp->f_flag, FMARK); } mutex_exit(&fp->f_lock); } static void unp_discard_now(file_t *fp) { if (fp == NULL) return; KASSERT(fp->f_count > 0); KASSERT(fp->f_msgcount > 0); mutex_enter(&fp->f_lock); fp->f_msgcount--; mutex_exit(&fp->f_lock); atomic_dec_uint(&unp_rights); (void)closef(fp); } static void unp_discard_later(file_t *fp) { if (fp == NULL) return; KASSERT(fp->f_count > 0); KASSERT(fp->f_msgcount > 0); mutex_enter(&filelist_lock); if (fp->f_unpcount++ == 0) { SLIST_INSERT_HEAD(&unp_thread_discard, fp, f_unplist); } mutex_exit(&filelist_lock); } static void unp_sysctl_create(void) { KASSERT(usrreq_sysctllog == NULL); sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_LONG, "sendspace", SYSCTL_DESCR("Default stream send space"), NULL, 0, &unpst_sendspace, 0, CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL); sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_LONG, "recvspace", SYSCTL_DESCR("Default stream recv space"), NULL, 0, &unpst_recvspace, 0, CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL); sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_LONG, "sendspace", SYSCTL_DESCR("Default datagram send space"), NULL, 0, &unpdg_sendspace, 0, CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL); sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_LONG, "recvspace", SYSCTL_DESCR("Default datagram recv space"), NULL, 0, &unpdg_recvspace, 0, CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL); sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, "inflight", SYSCTL_DESCR("File descriptors in flight"), NULL, 0, &unp_rights, 0, CTL_NET, PF_LOCAL, CTL_CREATE, CTL_EOL); sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, "deferred", SYSCTL_DESCR("File descriptors deferred for close"), NULL, 0, &unp_defer, 0, CTL_NET, PF_LOCAL, CTL_CREATE, CTL_EOL); } const struct pr_usrreqs unp_usrreqs = { .pr_attach = unp_attach, .pr_detach = unp_detach, .pr_accept = unp_accept, .pr_bind = 
unp_bind, .pr_listen = unp_listen, .pr_connect = unp_connect, .pr_connect2 = unp_connect2, .pr_disconnect = unp_disconnect, .pr_shutdown = unp_shutdown, .pr_abort = unp_abort, .pr_ioctl = unp_ioctl, .pr_stat = unp_stat, .pr_peeraddr = unp_peeraddr, .pr_sockaddr = unp_sockaddr, .pr_rcvd = unp_rcvd, .pr_recvoob = unp_recvoob, .pr_send = unp_send, .pr_sendoob = unp_sendoob, };
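/*
 * Illustrative userland sketch, not part of uipc_usrreq.c: sending an open
 * descriptor across an AF_LOCAL socket with SCM_RIGHTS.  A control message
 * shaped like this is what unp_internalize() turns into file_t pointers on
 * the send side and unp_externalize() turns back into descriptors on the
 * receive side.  The helper name and the minimal error handling are
 * assumptions made for the example.
 */
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
example_send_fd(int sock, int fd)
{
	union {
		struct cmsghdr hdr;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg;
	struct iovec iov;
	struct cmsghdr *cmsg;
	char c = 0;

	memset(&msg, 0, sizeof(msg));
	memset(&u, 0, sizeof(u));
	iov.iov_base = &c;		/* carry at least one byte of data */
	iov.iov_len = 1;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = u.buf;
	msg.msg_controllen = sizeof(u.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;	/* required by unp_internalize() */
	cmsg->cmsg_type = SCM_RIGHTS;	/* the only type it accepts */
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}
/*
 * A matching recvmsg() with a CMSG_SPACE(sizeof(int)) control buffer then
 * finds the new descriptor that unp_externalize() installed for the
 * receiving process.
 */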
/* $NetBSD: layer_subr.c,v 1.39 2022/04/10 09:50:46 andvar Exp $ */ /* * Copyright (c) 1999 National Aeronautics & Space Administration * All rights reserved. * * This software was written by William Studenmund of the * Numerical Aerospace Simulation Facility, NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the National Aeronautics & Space Administration * nor the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB- * UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Id: lofs_subr.c,v 1.11 1992/05/30 10:05:43 jsp Exp * @(#)null_subr.c 8.7 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: layer_subr.c,v 1.39 2022/04/10 09:50:46 andvar Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/time.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/kmem.h> #include <miscfs/genfs/layer.h> #include <miscfs/genfs/layer_extern.h> #ifdef LAYERFS_DIAGNOSTIC int layerfs_debug = 1; #endif /* * layer cache: * Each cache entry holds a reference to the lower vnode * along with a pointer to the alias vnode. When an * entry is added the lower vnode is VREF'd. When the * alias is removed the lower vnode is vrele'd. */ void layerfs_init(void) { /* Nothing. */ } void layerfs_done(void) { /* Nothing. */ } /* * layer_node_create: try to find an existing layerfs vnode referring to it, * otherwise make a new vnode which contains a reference to the lower vnode. */ int layer_node_create(struct mount *mp, struct vnode *lowervp, struct vnode **nvpp) { int error; struct vnode *aliasvp; error = vcache_get(mp, &lowervp, sizeof(lowervp), &aliasvp); if (error) return error; /* * Now that we acquired a reference on the upper vnode, release one * on the lower node. The existence of the layer_node retains one * reference to the lower node. */ vrele(lowervp); KASSERT(vrefcnt(lowervp) > 0); #ifdef LAYERFS_DIAGNOSTIC if (layerfs_debug) vprint("layer_node_create: alias", aliasvp); #endif *nvpp = aliasvp; return 0; } #ifdef LAYERFS_DIAGNOSTIC struct vnode * layer_checkvp(struct vnode *vp, const char *fil, int lno) { struct layer_node *a = VTOLAYER(vp); #ifdef notyet /* * Can't do this check because vop_reclaim runs * with a funny vop vector. * * WRS - no it doesnt... */ if (vp->v_op != layer_vnodeop_p) { printf ("layer_checkvp: on non-layer-node\n"); #ifdef notyet while (layer_checkvp_barrier) /*WAIT*/ ; #endif panic("layer_checkvp"); }; #endif if (a->layer_lowervp == NULL) { /* Should never happen */ int i; u_long *p; printf("vp = %p, ZERO ptr\n", vp); for (p = (u_long *) a, i = 0; i < 8; i++) printf(" %lx", p[i]); printf("\n"); /* wait for debugger */ panic("layer_checkvp"); } if (vrefcnt(a->layer_lowervp) < 1) { int i; u_long *p; printf("vp = %p, unref'ed lowervp\n", vp); for (p = (u_long *) a, i = 0; i < 8; i++) printf(" %lx", p[i]); printf("\n"); /* wait for debugger */ panic ("layer with unref'ed lowervp"); }; #ifdef notnow printf("layer %p/%d -> %p/%d [%s, %d]\n", LAYERTOV(a), vrefcnt(LAYERTOV(a)), a->layer_lowervp, vrefcnt(a->layer_lowervp), fil, lno); #endif return a->layer_lowervp; } #endif
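/*
 * Illustration only, not part of layer_subr.c: the rough shape of how a
 * layered file system's lookup path maps a referenced lower vnode to its
 * alias.  The real caller lives in layer_vnops.c; the function name here is
 * a placeholder for the example.
 */
#include <sys/param.h>
#include <sys/vnode.h>
#include <miscfs/genfs/layer.h>
#include <miscfs/genfs/layer_extern.h>

static int
example_layer_map(struct mount *mp, struct vnode *lowervp, struct vnode **vpp)
{
	int error;

	/*
	 * layer_node_create() either finds the existing alias in the vnode
	 * cache or creates one.  On success it has released the reference we
	 * held on lowervp and *vpp carries its own reference; on failure the
	 * lower reference is still ours to drop.
	 */
	error = layer_node_create(mp, lowervp, vpp);
	if (error)
		vrele(lowervp);
	return error;
}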
/* $NetBSD: copystr.c,v 1.1 2020/06/30 16:20:02 maxv Exp $ */

/*
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/systm.h>
#include <sys/errno.h>

int
copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done)
{
	const char *src = kfaddr;
	char *dst = kdaddr;
	size_t i;

	for (i = 0; i < len; i++) {
		if ((*dst++ = *src++) == '\0') {
			if (done)
				*done = i + 1;
			return 0;
		}
	}

	if (done)
		*done = i;

	return ENAMETOOLONG;
}
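/*
 * Usage sketch, not part of copystr.c: copying a NUL-terminated kernel
 * string into a caller-supplied buffer and treating truncation as an error.
 * The helper name is a placeholder for the example.
 */
#include <sys/param.h>
#include <sys/systm.h>

static int
example_copy_name(const char *name, char *buf, size_t buflen)
{
	size_t done;
	int error;

	error = copystr(name, buf, buflen, &done);
	if (error)		/* ENAMETOOLONG: no NUL within buflen bytes */
		return error;
	/* On success, "done" counts the bytes copied, including the NUL. */
	return 0;
}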
/* $NetBSD: sleepq.h,v 1.42 2023/10/15 10:30:00 riastradh Exp $ */

/*-
 * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _SYS_SLEEPQ_H_
#define	_SYS_SLEEPQ_H_

#include <sys/param.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/wchan.h>

struct syncobj;

/*
 * Generic sleep queues.
 */

typedef struct sleepq sleepq_t;

void	sleepq_init(sleepq_t *);
void	sleepq_remove(sleepq_t *, lwp_t *, bool);
int	sleepq_enter(sleepq_t *, lwp_t *, kmutex_t *);
void	sleepq_enqueue(sleepq_t *, wchan_t, const char *, const struct syncobj *, bool);
void	sleepq_transfer(lwp_t *, sleepq_t *, sleepq_t *, wchan_t, const char *, const struct syncobj *, kmutex_t *, bool);
void	sleepq_uncatch(lwp_t *);
void	sleepq_unsleep(lwp_t *, bool);
void	sleepq_timeout(void *);
void	sleepq_wake(sleepq_t *, wchan_t, u_int, kmutex_t *);
int	sleepq_abort(kmutex_t *, int);
void	sleepq_changepri(lwp_t *, pri_t);
void	sleepq_lendpri(lwp_t *, pri_t);
int	sleepq_block(int, bool, const struct syncobj *, int);

#ifdef _KERNEL

#include <sys/kernel.h>

typedef union {
	kmutex_t	lock;
	uint8_t		pad[COHERENCY_UNIT];
} sleepqlock_t;

/*
 * Return non-zero if it is unsafe to sleep.
 *
 * XXX This only exists because panic() is broken.
 */
static __inline bool
sleepq_dontsleep(lwp_t *l)
{

	return cold || (doing_shutdown && (panicstr || CURCPU_IDLE_P()));
}

#endif	/* _KERNEL */

#include <sys/sleeptab.h>

#endif	/* _SYS_SLEEPQ_H_ */
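/*
 * Illustration only, not part of sleepq.h: primitives built on sleep queues
 * (cv_wait() and friends) typically check sleepq_dontsleep() and refuse to
 * block during early boot or while the system is panicking.  The helper
 * name is a placeholder for the example.
 */
#include <sys/sleepq.h>

static bool
example_can_block(void)
{

	/* cold, or shutting down with a panic in progress: do not sleep. */
	if (sleepq_dontsleep(curlwp))
		return false;
	return true;
}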
/* $NetBSD: compat_50_quota.c,v 1.4 2022/09/21 07:15:24 dholland Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christos Zoulas. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: compat_50_quota.c,v 1.4 2022/09/21 07:15:24 dholland Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/module.h> #include <sys/namei.h> #include <sys/param.h> #include <sys/quota.h> #include <sys/quotactl.h> #include <sys/systm.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <sys/vfs_syscalls.h> #include <sys/vnode.h> #include <ufs/ufs/quota1.h> static const struct syscall_package vfs_syscalls_50_quota_syscalls[] = { { SYS_compat_50_quotactl, 0, (sy_call_t *)compat_50_sys_quotactl }, { 0, 0, NULL } }; /* ARGSUSED */ int compat_50_sys_quotactl(struct lwp *l, const struct compat_50_sys_quotactl_args *uap, register_t *retval) { /* { syscallarg(const char *) path; syscallarg(int) cmd; syscallarg(int) uid; syscallarg(void *) arg; } */ struct vnode *vp; struct mount *mp; int q1cmd; int idtype; char *qfile; struct dqblk dqblk; struct quotakey key; struct quotaval blocks, files; struct quotastat qstat; int error; error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_TRYEMULROOT, &vp); if (error != 0) return (error); mp = vp->v_mount; q1cmd = SCARG(uap, cmd); idtype = quota_idtype_from_ufs(q1cmd & SUBCMDMASK); if (idtype == -1) { return EINVAL; } switch ((q1cmd & ~SUBCMDMASK) >> SUBCMDSHIFT) { case Q_QUOTAON: qfile = PNBUF_GET(); error = copyinstr(SCARG(uap, arg), qfile, PATH_MAX, NULL); if (error != 0) { PNBUF_PUT(qfile); break; } error = vfs_quotactl_quotaon(mp, idtype, qfile); PNBUF_PUT(qfile); break; case Q_QUOTAOFF: error = vfs_quotactl_quotaoff(mp, idtype); break; case Q_GETQUOTA: key.qk_idtype = idtype; key.qk_id = SCARG(uap, uid); key.qk_objtype = QUOTA_OBJTYPE_BLOCKS; error = vfs_quotactl_get(mp, &key, &blocks); if (error) { break; } key.qk_objtype = QUOTA_OBJTYPE_FILES; error = vfs_quotactl_get(mp, &key, &files); if (error) { break; } quotavals_to_dqblk(&blocks, &files, &dqblk); error = copyout(&dqblk, SCARG(uap, arg), sizeof(dqblk)); break; case Q_SETQUOTA: error = copyin(SCARG(uap, arg), &dqblk, sizeof(dqblk)); if (error) { break; } dqblk_to_quotavals(&dqblk, &blocks, &files); key.qk_idtype = idtype; key.qk_id = SCARG(uap, uid); key.qk_objtype = QUOTA_OBJTYPE_BLOCKS; error = vfs_quotactl_put(mp, &key, &blocks); if (error) { break; } key.qk_objtype = QUOTA_OBJTYPE_FILES; error = vfs_quotactl_put(mp, &key, &files); break; case Q_SYNC: /* * not supported but used only to see if quota is supported, * emulate with stat * * XXX should probably be supported */ (void)idtype; /* not used */ error = vfs_quotactl_stat(mp, &qstat); break; case Q_SETUSE: default: error = EOPNOTSUPP; break; } vrele(vp); return error; } MODULE(MODULE_CLASS_EXEC, compat_50_quota, "compat_50,ufs"); static int compat_50_quota_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return syscall_establish(NULL, vfs_syscalls_50_quota_syscalls); case MODULE_CMD_FINI: return syscall_disestablish(NULL, vfs_syscalls_50_quota_syscalls); default: return ENOTTY; } }
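/*
 * Illustration only, not part of compat_50_quota.c: roughly the call a
 * pre-NetBSD-6 binary issues to land in compat_50_sys_quotactl() above.
 * QCMD() packs the command and id type the same way the handler unpacks
 * them with SUBCMDSHIFT/SUBCMDMASK.  The explicit quotactl() prototype
 * stands in for the old system call declaration and, like the helper name,
 * is an assumption made for this sketch.
 */
#include <sys/types.h>
#include <ufs/ufs/quota1.h>
#include <stdio.h>
#include <unistd.h>

int quotactl(const char *, int, int, void *);	/* legacy interface */

static void
example_show_usage(const char *fs)
{
	struct dqblk dq;

	if (quotactl(fs, QCMD(Q_GETQUOTA, USRQUOTA), (int)getuid(), &dq) == 0)
		printf("%s: %u blocks, %u inodes in use\n",
		    fs, dq.dqb_curblocks, dq.dqb_curinodes);
}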
/*	$NetBSD: kern_auth.c,v 1.84 2023/10/04 22:17:09 ad Exp $	*/

/*-
 * Copyright (c) 2005, 2006 Elad Efrat <elad@NetBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_auth.c,v 1.84 2023/10/04 22:17:09 ad Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/queue.h> #include <sys/proc.h> #include <sys/ucred.h> #include <sys/pool.h> #define __KAUTH_PRIVATE #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/rwlock.h> #include <sys/sysctl.h> #include <sys/atomic.h> #include <sys/specificdata.h> #include <sys/vnode.h> #include <secmodel/secmodel.h> /* * Secmodel-specific credentials. */ struct kauth_key { secmodel_t ks_secmodel; /* secmodel */ specificdata_key_t ks_key; /* key */ }; /* * Listener. */ struct kauth_listener { kauth_scope_callback_t func; /* callback */ kauth_scope_t scope; /* scope backpointer */ u_int refcnt; /* reference count */ SIMPLEQ_ENTRY(kauth_listener) listener_next; /* listener list */ }; /* * Scope. */ struct kauth_scope { const char *id; /* scope name */ void *cookie; /* user cookie */ u_int nlisteners; /* # of listeners */ SIMPLEQ_HEAD(, kauth_listener) listenq; /* listener list */ SIMPLEQ_ENTRY(kauth_scope) next_scope; /* scope list */ }; static int kauth_cred_hook(kauth_cred_t, kauth_action_t, void *, void *); /* List of scopes and its lock. */ static SIMPLEQ_HEAD(, kauth_scope) scope_list = SIMPLEQ_HEAD_INITIALIZER(scope_list); /* Built-in scopes: generic, process. */ static kauth_scope_t kauth_builtin_scope_generic; static kauth_scope_t kauth_builtin_scope_system; static kauth_scope_t kauth_builtin_scope_process; static kauth_scope_t kauth_builtin_scope_network; static kauth_scope_t kauth_builtin_scope_machdep; static kauth_scope_t kauth_builtin_scope_device; static kauth_scope_t kauth_builtin_scope_cred; static kauth_scope_t kauth_builtin_scope_vnode; static specificdata_domain_t kauth_domain; static pool_cache_t kauth_cred_cache; krwlock_t kauth_lock; /* Allocate new, empty kauth credentials. */ kauth_cred_t kauth_cred_alloc(void) { kauth_cred_t cred; cred = pool_cache_get(kauth_cred_cache, PR_WAITOK); cred->cr_refcnt = 1; cred->cr_uid = 0; cred->cr_euid = 0; cred->cr_svuid = 0; cred->cr_gid = 0; cred->cr_egid = 0; cred->cr_svgid = 0; cred->cr_ngroups = 0; specificdata_init(kauth_domain, &cred->cr_sd); kauth_cred_hook(cred, KAUTH_CRED_INIT, NULL, NULL); return (cred); } /* Increment reference count to cred. */ kauth_cred_t kauth_cred_hold(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt > 0); atomic_inc_uint(&cred->cr_refcnt); return cred; } /* Decrease reference count to cred. If reached zero, free it. 
*/ void kauth_cred_free(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt > 0); ASSERT_SLEEPABLE(); membar_release(); if (atomic_dec_uint_nv(&cred->cr_refcnt) > 0) return; membar_acquire(); kauth_cred_hook(cred, KAUTH_CRED_FREE, NULL, NULL); specificdata_fini(kauth_domain, &cred->cr_sd); pool_cache_put(kauth_cred_cache, cred); } static void kauth_cred_clone1(kauth_cred_t from, kauth_cred_t to, bool copy_groups) { KASSERT(from != NULL); KASSERT(from != NOCRED); KASSERT(from != FSCRED); KASSERT(to != NULL); KASSERT(to != NOCRED); KASSERT(to != FSCRED); KASSERT(from->cr_refcnt > 0); to->cr_uid = from->cr_uid; to->cr_euid = from->cr_euid; to->cr_svuid = from->cr_svuid; to->cr_gid = from->cr_gid; to->cr_egid = from->cr_egid; to->cr_svgid = from->cr_svgid; if (copy_groups) { to->cr_ngroups = from->cr_ngroups; memcpy(to->cr_groups, from->cr_groups, sizeof(to->cr_groups)); } kauth_cred_hook(from, KAUTH_CRED_COPY, to, NULL); } void kauth_cred_clone(kauth_cred_t from, kauth_cred_t to) { kauth_cred_clone1(from, to, true); } /* * Duplicate cred and return a new kauth_cred_t. */ kauth_cred_t kauth_cred_dup(kauth_cred_t cred) { kauth_cred_t new_cred; KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt > 0); new_cred = kauth_cred_alloc(); kauth_cred_clone(cred, new_cred); return (new_cred); } /* * Similar to crcopy(), only on a kauth_cred_t. * XXX: Is this even needed? [kauth_cred_copy] */ kauth_cred_t kauth_cred_copy(kauth_cred_t cred) { kauth_cred_t new_cred; KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt > 0); /* If the provided credentials already have one reference, use them. */ if (cred->cr_refcnt == 1) return (cred); new_cred = kauth_cred_alloc(); kauth_cred_clone(cred, new_cred); kauth_cred_free(cred); return (new_cred); } void kauth_proc_fork(struct proc *parent, struct proc *child) { mutex_enter(parent->p_lock); child->p_cred = kauth_cred_hold(parent->p_cred); mutex_exit(parent->p_lock); /* XXX: relies on parent process stalling during fork() */ kauth_cred_hook(parent->p_cred, KAUTH_CRED_FORK, parent, child); } void kauth_proc_chroot(kauth_cred_t cred, struct cwdinfo *cwdi) { kauth_cred_hook(cred, KAUTH_CRED_CHROOT, cwdi, NULL); } uid_t kauth_cred_getuid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_uid); } uid_t kauth_cred_geteuid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_euid); } uid_t kauth_cred_getsvuid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_svuid); } gid_t kauth_cred_getgid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_gid); } gid_t kauth_cred_getegid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_egid); } gid_t kauth_cred_getsvgid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_svgid); } void kauth_cred_setuid(kauth_cred_t cred, uid_t uid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_uid = uid; } void kauth_cred_seteuid(kauth_cred_t cred, uid_t uid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_euid = 
uid; } void kauth_cred_setsvuid(kauth_cred_t cred, uid_t uid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_svuid = uid; } void kauth_cred_setgid(kauth_cred_t cred, gid_t gid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_gid = gid; } void kauth_cred_setegid(kauth_cred_t cred, gid_t gid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_egid = gid; } void kauth_cred_setsvgid(kauth_cred_t cred, gid_t gid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_svgid = gid; } /* Checks if gid is a member of the groups in cred. */ int kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) { uint32_t i; KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(resultp != NULL); *resultp = 0; for (i = 0; i < cred->cr_ngroups; i++) if (cred->cr_groups[i] == gid) { *resultp = 1; break; } return (0); } int kauth_cred_groupmember(kauth_cred_t cred, gid_t gid) { int ismember, error; KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); if (kauth_cred_getegid(cred) == gid) return 0; error = kauth_cred_ismember_gid(cred, gid, &ismember); if (error) return error; return ismember ? 0 : -1; } u_int kauth_cred_ngroups(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_ngroups); } /* * Return the group at index idx from the groups in cred. */ gid_t kauth_cred_group(kauth_cred_t cred, u_int idx) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(idx < cred->cr_ngroups); return (cred->cr_groups[idx]); } /* XXX elad: gmuid is unused for now. */ int kauth_cred_setgroups(kauth_cred_t cred, const gid_t *grbuf, size_t len, uid_t gmuid, enum uio_seg seg) { int error = 0; KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); if (len > __arraycount(cred->cr_groups)) return EINVAL; if (len) { if (seg == UIO_SYSSPACE) { memcpy(cred->cr_groups, grbuf, len * sizeof(cred->cr_groups[0])); } else { error = copyin(grbuf, cred->cr_groups, len * sizeof(cred->cr_groups[0])); if (error != 0) len = 0; } } memset(cred->cr_groups + len, 0xff, sizeof(cred->cr_groups) - (len * sizeof(cred->cr_groups[0]))); cred->cr_ngroups = len; return error; } /* This supports sys_setgroups() */ int kauth_proc_setgroups(struct lwp *l, kauth_cred_t ncred) { kauth_cred_t cred; int error; /* * At this point we could delete duplicate groups from ncred, * and plausibly sort the list - but in general the later is * a bad idea. */ proc_crmod_enter(); /* Maybe we should use curproc here ? */ cred = l->l_proc->p_cred; kauth_cred_clone1(cred, ncred, false); error = kauth_authorize_process(cred, KAUTH_PROCESS_SETID, l->l_proc, NULL, NULL, NULL); if (error != 0) { proc_crmod_leave(cred, ncred, false); return error; } /* Broadcast our credentials to the process and other LWPs. 
*/ proc_crmod_leave(ncred, cred, true); return 0; } int kauth_cred_getgroups(kauth_cred_t cred, gid_t *grbuf, size_t len, enum uio_seg seg) { KASSERT(cred != NULL); if (len > cred->cr_ngroups) return EINVAL; if (seg == UIO_USERSPACE) return copyout(cred->cr_groups, grbuf, sizeof(*grbuf) * len); memcpy(grbuf, cred->cr_groups, sizeof(*grbuf) * len); return 0; } int kauth_register_key(secmodel_t secmodel, kauth_key_t *result) { kauth_key_t k; specificdata_key_t key; int error; KASSERT(result != NULL); error = specificdata_key_create(kauth_domain, &key, NULL); if (error) return (error); k = kmem_alloc(sizeof(*k), KM_SLEEP); k->ks_secmodel = secmodel; k->ks_key = key; *result = k; return (0); } int kauth_deregister_key(kauth_key_t key) { KASSERT(key != NULL); specificdata_key_delete(kauth_domain, key->ks_key); kmem_free(key, sizeof(*key)); return (0); } void * kauth_cred_getdata(kauth_cred_t cred, kauth_key_t key) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(key != NULL); return (specificdata_getspecific(kauth_domain, &cred->cr_sd, key->ks_key)); } void kauth_cred_setdata(kauth_cred_t cred, kauth_key_t key, void *data) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(key != NULL); specificdata_setspecific(kauth_domain, &cred->cr_sd, key->ks_key, data); } /* * Match uids in two credentials. */ int kauth_cred_uidmatch(kauth_cred_t cred1, kauth_cred_t cred2) { KASSERT(cred1 != NULL); KASSERT(cred1 != NOCRED); KASSERT(cred1 != FSCRED); KASSERT(cred2 != NULL); KASSERT(cred2 != NOCRED); KASSERT(cred2 != FSCRED); if (cred1->cr_uid == cred2->cr_uid || cred1->cr_euid == cred2->cr_uid || cred1->cr_uid == cred2->cr_euid || cred1->cr_euid == cred2->cr_euid) return (1); return (0); } u_int kauth_cred_getrefcnt(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_refcnt); } /* * Convert userland credentials (struct uucred) to kauth_cred_t. * XXX: For NFS & puffs */ void kauth_uucred_to_cred(kauth_cred_t cred, const struct uucred *uuc) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(uuc != NULL); cred->cr_refcnt = 1; cred->cr_uid = uuc->cr_uid; cred->cr_euid = uuc->cr_uid; cred->cr_svuid = uuc->cr_uid; cred->cr_gid = uuc->cr_gid; cred->cr_egid = uuc->cr_gid; cred->cr_svgid = uuc->cr_gid; cred->cr_ngroups = uimin(uuc->cr_ngroups, NGROUPS); kauth_cred_setgroups(cred, __UNCONST(uuc->cr_groups), cred->cr_ngroups, -1, UIO_SYSSPACE); } /* * Convert kauth_cred_t to userland credentials (struct uucred). * XXX: For NFS & puffs */ void kauth_cred_to_uucred(struct uucred *uuc, const kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(uuc != NULL); int ng; ng = uimin(cred->cr_ngroups, NGROUPS); uuc->cr_uid = cred->cr_euid; uuc->cr_gid = cred->cr_egid; uuc->cr_ngroups = ng; kauth_cred_getgroups(cred, uuc->cr_groups, ng, UIO_SYSSPACE); } /* * Compare kauth_cred_t and uucred credentials. * XXX: Modelled after crcmp() for NFS. */ int kauth_cred_uucmp(kauth_cred_t cred, const struct uucred *uuc) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(uuc != NULL); if (cred->cr_euid == uuc->cr_uid && cred->cr_egid == uuc->cr_gid && cred->cr_ngroups == (uint32_t)uuc->cr_ngroups) { int i; /* Check if all groups from uuc appear in cred. 
*/ for (i = 0; i < uuc->cr_ngroups; i++) { int ismember; ismember = 0; if (kauth_cred_ismember_gid(cred, uuc->cr_groups[i], &ismember) != 0 || !ismember) return (1); } return (0); } return (1); } /* * Make a struct ucred out of a kauth_cred_t. For compatibility. */ void kauth_cred_toucred(kauth_cred_t cred, struct ki_ucred *uc) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(uc != NULL); uc->cr_ref = cred->cr_refcnt; uc->cr_uid = cred->cr_euid; uc->cr_gid = cred->cr_egid; uc->cr_ngroups = uimin(cred->cr_ngroups, __arraycount(uc->cr_groups)); memcpy(uc->cr_groups, cred->cr_groups, uc->cr_ngroups * sizeof(uc->cr_groups[0])); } /* * Make a struct pcred out of a kauth_cred_t. For compatibility. */ void kauth_cred_topcred(kauth_cred_t cred, struct ki_pcred *pc) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(pc != NULL); pc->p_pad = NULL; pc->p_ruid = cred->cr_uid; pc->p_svuid = cred->cr_svuid; pc->p_rgid = cred->cr_gid; pc->p_svgid = cred->cr_svgid; pc->p_refcnt = cred->cr_refcnt; } /* * Return kauth_cred_t for the current LWP. */ kauth_cred_t kauth_cred_get(void) { return (curlwp->l_cred); } /* * Returns a scope matching the provided id. * Requires the scope list lock to be held by the caller. */ static kauth_scope_t kauth_ifindscope(const char *id) { kauth_scope_t scope; KASSERT(rw_lock_held(&kauth_lock)); scope = NULL; SIMPLEQ_FOREACH(scope, &scope_list, next_scope) { if (strcmp(scope->id, id) == 0) break; } return (scope); } /* * Register a new scope. * * id - identifier for the scope * callback - the scope's default listener * cookie - cookie to be passed to the listener(s) */ kauth_scope_t kauth_register_scope(const char *id, kauth_scope_callback_t callback, void *cookie) { kauth_scope_t scope; kauth_listener_t listener = NULL; /* XXX gcc */ /* Sanitize input */ if (id == NULL) return (NULL); /* Allocate space for a new scope and listener. */ scope = kmem_alloc(sizeof(*scope), KM_SLEEP); if (callback != NULL) listener = kmem_alloc(sizeof(*listener), KM_SLEEP); /* * Acquire scope list lock. */ rw_enter(&kauth_lock, RW_WRITER); /* Check we don't already have a scope with the same id */ if (kauth_ifindscope(id) != NULL) { rw_exit(&kauth_lock); kmem_free(scope, sizeof(*scope)); if (callback != NULL) kmem_free(listener, sizeof(*listener)); return (NULL); } /* Initialize new scope with parameters */ scope->id = id; scope->cookie = cookie; scope->nlisteners = 1; SIMPLEQ_INIT(&scope->listenq); /* Add default listener */ if (callback != NULL) { listener->func = callback; listener->scope = scope; listener->refcnt = 0; SIMPLEQ_INSERT_HEAD(&scope->listenq, listener, listener_next); } /* Insert scope to scopes list */ SIMPLEQ_INSERT_TAIL(&scope_list, scope, next_scope); rw_exit(&kauth_lock); return (scope); } /* * Initialize the kernel authorization subsystem. * * Initialize the scopes list lock. * Create specificdata domain. * Register the credentials scope, used in kauth(9) internally. * Register built-in scopes: generic, system, process, network, machdep, device. */ void kauth_init(void) { rw_init(&kauth_lock); kauth_cred_cache = pool_cache_init(sizeof(struct kauth_cred), coherency_unit, 0, 0, "kcredpl", NULL, IPL_NONE, NULL, NULL, NULL); /* Create specificdata domain. */ kauth_domain = specificdata_domain_create(); /* Register credentials scope. */ kauth_builtin_scope_cred = kauth_register_scope(KAUTH_SCOPE_CRED, NULL, NULL); /* Register generic scope. 
*/ kauth_builtin_scope_generic = kauth_register_scope(KAUTH_SCOPE_GENERIC, NULL, NULL); /* Register system scope. */ kauth_builtin_scope_system = kauth_register_scope(KAUTH_SCOPE_SYSTEM, NULL, NULL); /* Register process scope. */ kauth_builtin_scope_process = kauth_register_scope(KAUTH_SCOPE_PROCESS, NULL, NULL); /* Register network scope. */ kauth_builtin_scope_network = kauth_register_scope(KAUTH_SCOPE_NETWORK, NULL, NULL); /* Register machdep scope. */ kauth_builtin_scope_machdep = kauth_register_scope(KAUTH_SCOPE_MACHDEP, NULL, NULL); /* Register device scope. */ kauth_builtin_scope_device = kauth_register_scope(KAUTH_SCOPE_DEVICE, NULL, NULL); /* Register vnode scope. */ kauth_builtin_scope_vnode = kauth_register_scope(KAUTH_SCOPE_VNODE, NULL, NULL); } /* * Deregister a scope. * Requires scope list lock to be held by the caller. * * scope - the scope to deregister */ void kauth_deregister_scope(kauth_scope_t scope) { if (scope != NULL) { /* Remove scope from list */ SIMPLEQ_REMOVE(&scope_list, scope, kauth_scope, next_scope); kmem_free(scope, sizeof(*scope)); } } /* * Register a listener. * * id - scope identifier. * callback - the callback routine for the listener. * cookie - cookie to pass unmoidfied to the callback. */ kauth_listener_t kauth_listen_scope(const char *id, kauth_scope_callback_t callback, void *cookie) { kauth_scope_t scope; kauth_listener_t listener; listener = kmem_alloc(sizeof(*listener), KM_SLEEP); rw_enter(&kauth_lock, RW_WRITER); /* * Find scope struct. */ scope = kauth_ifindscope(id); if (scope == NULL) { rw_exit(&kauth_lock); kmem_free(listener, sizeof(*listener)); return (NULL); } /* Allocate listener */ /* Initialize listener with parameters */ listener->func = callback; listener->refcnt = 0; /* Add listener to scope */ SIMPLEQ_INSERT_TAIL(&scope->listenq, listener, listener_next); /* Raise number of listeners on scope. */ scope->nlisteners++; listener->scope = scope; rw_exit(&kauth_lock); return (listener); } /* * Deregister a listener. * * listener - listener reference as returned from kauth_listen_scope(). */ void kauth_unlisten_scope(kauth_listener_t listener) { if (listener != NULL) { rw_enter(&kauth_lock, RW_WRITER); SIMPLEQ_REMOVE(&listener->scope->listenq, listener, kauth_listener, listener_next); listener->scope->nlisteners--; rw_exit(&kauth_lock); kmem_free(listener, sizeof(*listener)); } } /* * Authorize a request. * * scope - the scope of the request as defined by KAUTH_SCOPE_* or as * returned from kauth_register_scope(). * credential - credentials of the user ("actor") making the request. * action - request identifier. * arg[0-3] - passed unmodified to listener(s). * * Returns the aggregated result: * - KAUTH_RESULT_ALLOW if there is at least one KAUTH_RESULT_ALLOW and * zero KAUTH_DESULT_DENY * - KAUTH_RESULT_DENY if there is at least one KAUTH_RESULT_DENY * - KAUTH_RESULT_DEFER if there is nothing but KAUTH_RESULT_DEFER */ static int kauth_authorize_action_internal(kauth_scope_t scope, kauth_cred_t cred, kauth_action_t action, void *arg0, void *arg1, void *arg2, void *arg3) { kauth_listener_t listener; int error, allow, fail; KASSERT(cred != NULL); KASSERT(action != 0); /* Short-circuit requests coming from the kernel. 
*/ if (cred == NOCRED || cred == FSCRED) return KAUTH_RESULT_ALLOW; KASSERT(scope != NULL); fail = 0; allow = 0; /* rw_enter(&kauth_lock, RW_READER); XXX not yet */ SIMPLEQ_FOREACH(listener, &scope->listenq, listener_next) { error = listener->func(cred, action, scope->cookie, arg0, arg1, arg2, arg3); if (error == KAUTH_RESULT_ALLOW) allow = 1; else if (error == KAUTH_RESULT_DENY) fail = 1; } /* rw_exit(&kauth_lock); */ if (fail) return (KAUTH_RESULT_DENY); if (allow) return (KAUTH_RESULT_ALLOW); return (KAUTH_RESULT_DEFER); }; int kauth_authorize_action(kauth_scope_t scope, kauth_cred_t cred, kauth_action_t action, void *arg0, void *arg1, void *arg2, void *arg3) { int r; r = kauth_authorize_action_internal(scope, cred, action, arg0, arg1, arg2, arg3); if (r == KAUTH_RESULT_DENY) return (EPERM); if (r == KAUTH_RESULT_ALLOW) return (0); if (secmodel_nsecmodels() == 0) return (0); return (EPERM); } /* * Generic scope authorization wrapper. */ int kauth_authorize_generic(kauth_cred_t cred, kauth_action_t action, void *arg0) { return (kauth_authorize_action(kauth_builtin_scope_generic, cred, action, arg0, NULL, NULL, NULL)); } /* * System scope authorization wrapper. */ int kauth_authorize_system(kauth_cred_t cred, kauth_action_t action, enum kauth_system_req req, void *arg1, void *arg2, void *arg3) { return (kauth_authorize_action(kauth_builtin_scope_system, cred, action, (void *)req, arg1, arg2, arg3)); } /* * Process scope authorization wrapper. */ int kauth_authorize_process(kauth_cred_t cred, kauth_action_t action, struct proc *p, void *arg1, void *arg2, void *arg3) { return (kauth_authorize_action(kauth_builtin_scope_process, cred, action, p, arg1, arg2, arg3)); } /* * Network scope authorization wrapper. */ int kauth_authorize_network(kauth_cred_t cred, kauth_action_t action, enum kauth_network_req req, void *arg1, void *arg2, void *arg3) { return (kauth_authorize_action(kauth_builtin_scope_network, cred, action, (void *)req, arg1, arg2, arg3)); } int kauth_authorize_machdep(kauth_cred_t cred, kauth_action_t action, void *arg0, void *arg1, void *arg2, void *arg3) { return (kauth_authorize_action(kauth_builtin_scope_machdep, cred, action, arg0, arg1, arg2, arg3)); } int kauth_authorize_device(kauth_cred_t cred, kauth_action_t action, void *arg0, void *arg1, void *arg2, void *arg3) { return (kauth_authorize_action(kauth_builtin_scope_device, cred, action, arg0, arg1, arg2, arg3)); } int kauth_authorize_device_tty(kauth_cred_t cred, kauth_action_t action, struct tty *tty) { return (kauth_authorize_action(kauth_builtin_scope_device, cred, action, tty, NULL, NULL, NULL)); } int kauth_authorize_device_spec(kauth_cred_t cred, enum kauth_device_req req, struct vnode *vp) { return (kauth_authorize_action(kauth_builtin_scope_device, cred, KAUTH_DEVICE_RAWIO_SPEC, (void *)req, vp, NULL, NULL)); } int kauth_authorize_device_passthru(kauth_cred_t cred, dev_t dev, u_long bits, void *data) { return (kauth_authorize_action(kauth_builtin_scope_device, cred, KAUTH_DEVICE_RAWIO_PASSTHRU, (void *)bits, (void *)(u_long)dev, data, NULL)); } kauth_action_t kauth_accmode_to_action(accmode_t accmode) { kauth_action_t action = 0; // XXX: Revisit we need to have a richer set of kauth primitives // We also get only the Unix perms here sometimes if (accmode & (VSTAT_PERMS|VREAD)) action |= KAUTH_VNODE_READ_DATA; if (accmode & (VMODIFY_PERMS|VADMIN_PERMS)) action |= KAUTH_VNODE_WRITE_DATA; if (accmode & VEXEC) action |= KAUTH_VNODE_EXECUTE; return action == 0 ? 
KAUTH_VNODE_ACCESS : action; } kauth_action_t kauth_extattr_action(mode_t access_mode) { kauth_action_t action = 0; if (access_mode & VREAD) action |= KAUTH_VNODE_READ_EXTATTRIBUTES; if (access_mode & VWRITE) action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; return action; } int kauth_authorize_vnode(kauth_cred_t cred, kauth_action_t action, struct vnode *vp, struct vnode *dvp, int fs_decision) { int error; error = kauth_authorize_action_internal(kauth_builtin_scope_vnode, cred, action, vp, dvp, NULL, NULL); if (error == KAUTH_RESULT_DENY) return (EACCES); if (error == KAUTH_RESULT_ALLOW) return (0); /* * If the file-system does not support decision-before-action, we can * only short-circuit the operation (deny). If we're here, it means no * listener denied it, so our only alternative is to supposedly-allow * it and let the file-system have the last word. */ if (fs_decision == KAUTH_VNODE_REMOTEFS) return (0); return (fs_decision); } static int kauth_cred_hook(kauth_cred_t cred, kauth_action_t action, void *arg0, void *arg1) { int r; r = kauth_authorize_action(kauth_builtin_scope_cred, cred, action, arg0, arg1, NULL, NULL); #ifdef DIAGNOSTIC if (!SIMPLEQ_EMPTY(&kauth_builtin_scope_cred->listenq)) KASSERT(r == 0); #endif /* DIAGNOSTIC */ return (r); }
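/*
 * Editor's illustrative sketch, not part of kern_auth.c: how a secmodel or
 * driver hooks into the scopes registered above.  A listener attaches to an
 * existing scope with kauth_listen_scope() and answers each request with
 * KAUTH_RESULT_ALLOW, KAUTH_RESULT_DENY, or KAUTH_RESULT_DEFER;
 * kauth_authorize_action() above aggregates the answers (any DENY wins,
 * otherwise a single ALLOW suffices, otherwise the result is DEFER).  The
 * registration and callback signatures are the ones defined above;
 * KAUTH_SYSTEM_REBOOT is used as an assumed example action.
 */
#include <sys/param.h>
#include <sys/kauth.h>

static kauth_listener_t example_listener;

/* Deny reboot requests from non-root credentials; defer everything else. */
static int
example_system_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{

	if (action == KAUTH_SYSTEM_REBOOT && kauth_cred_geteuid(cred) != 0)
		return KAUTH_RESULT_DENY;
	return KAUTH_RESULT_DEFER;	/* let other listeners decide */
}

static void
example_listen_init(void)
{

	/* A NULL return means the scope id was not registered. */
	example_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
	    example_system_cb, NULL);
}

static void
example_listen_fini(void)
{

	/* kauth_unlisten_scope() tolerates a NULL listener. */
	kauth_unlisten_scope(example_listener);
}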
/*	$NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $	*/

/*
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 */

/*
 * Copyright (c) 1994 Jan-Simon Pendry
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_subr.c 8.20 (Berkeley) 5/20/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/malloc.h> #include <sys/dirent.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/queue.h> #include <sys/mount.h> #include <sys/stat.h> #include <sys/kauth.h> #include <uvm/uvm_extern.h> #include <fs/union/union.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> static LIST_HEAD(uhashhead, union_node) *uhashtbl; static u_long uhash_mask; /* size of hash table - 1 */ #define UNION_HASH(u, l) \ ((((u_long) (u) + (u_long) (l)) >> 8) & uhash_mask) #define NOHASH ((u_long)-1) static kmutex_t uhash_lock; static void union_newupper(struct union_node *, struct vnode *); static void union_newlower(struct union_node *, struct vnode *); static void union_ref(struct union_node *); static void union_rele(struct union_node *); static int union_do_lookup(struct vnode *, struct componentname *, kauth_cred_t, const char *); int union_vn_close(struct vnode *, int, kauth_cred_t, struct lwp *); static void union_dircache_r(struct vnode *, struct vnode ***, int *); struct vnode *union_dircache(struct vnode *, struct lwp *); void union_init(void) { mutex_init(&uhash_lock, MUTEX_DEFAULT, IPL_NONE); uhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &uhash_mask); } void union_reinit(void) { struct union_node *un; struct uhashhead *oldhash, *hash; u_long oldmask, mask, val; int i; hash = hashinit(desiredvnodes, HASH_LIST, true, &mask); mutex_enter(&uhash_lock); oldhash = uhashtbl; oldmask = uhash_mask; uhashtbl = hash; uhash_mask = mask; for (i = 0; i <= oldmask; i++) { while ((un = LIST_FIRST(&oldhash[i])) != 
NULL) { LIST_REMOVE(un, un_cache); val = UNION_HASH(un->un_uppervp, un->un_lowervp); LIST_INSERT_HEAD(&hash[val], un, un_cache); } } mutex_exit(&uhash_lock); hashdone(oldhash, HASH_LIST, oldmask); } /* * Free global unionfs resources. */ void union_done(void) { hashdone(uhashtbl, HASH_LIST, uhash_mask); mutex_destroy(&uhash_lock); /* Make sure to unset the readdir hook. */ vn_union_readdir_hook = NULL; } void union_newlower(struct union_node *un, struct vnode *lowervp) { int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp); int nhash = UNION_HASH(un->un_uppervp, lowervp); if (un->un_lowervp == lowervp) return; KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE); KASSERT(un->un_lowervp == NULL); mutex_enter(&uhash_lock); if (ohash != nhash && (un->un_cflags & UN_CACHED)) { un->un_cflags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } mutex_enter(&un->un_lock); un->un_lowervp = lowervp; un->un_lowersz = VNOVAL; mutex_exit(&un->un_lock); if (ohash != nhash) { LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache); un->un_cflags |= UN_CACHED; } mutex_exit(&uhash_lock); } void union_newupper(struct union_node *un, struct vnode *uppervp) { int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp); int nhash = UNION_HASH(uppervp, un->un_lowervp); struct vop_lock_args lock_ap; struct vop_unlock_args unlock_ap; int error __diagused; if (un->un_uppervp == uppervp) return; KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE); KASSERT(un->un_uppervp == NULL); /* * We have to transfer the vnode lock from the union vnode to * the upper vnode. Lock the upper vnode first. We cannot use * VOP_LOCK() here as it would break the fstrans state. */ lock_ap.a_desc = VDESC(vop_lock); lock_ap.a_vp = uppervp; lock_ap.a_flags = LK_EXCLUSIVE; error = VCALL(lock_ap.a_vp, VOFFSET(vop_lock), &lock_ap); KASSERT(error == 0); mutex_enter(&uhash_lock); if (ohash != nhash && (un->un_cflags & UN_CACHED)) { un->un_cflags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } mutex_enter(&un->un_lock); un->un_uppervp = uppervp; un->un_uppersz = VNOVAL; /* * With the upper vnode in place unlock the union vnode to * finalize the lock transfer. */ unlock_ap.a_desc = VDESC(vop_unlock); unlock_ap.a_vp = UNIONTOV(un); genfs_unlock(&unlock_ap); /* Update union vnode interlock, vmobjlock, & klist. */ vshareilock(UNIONTOV(un), uppervp); rw_obj_hold(uppervp->v_uobj.vmobjlock); uvm_obj_setlock(&UNIONTOV(un)->v_uobj, uppervp->v_uobj.vmobjlock); vshareklist(UNIONTOV(un), uppervp); mutex_exit(&un->un_lock); if (ohash != nhash) { LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache); un->un_cflags |= UN_CACHED; } mutex_exit(&uhash_lock); } /* * Keep track of size changes in the underlying vnodes. * If the size changes, then callback to the vm layer * giving priority to the upper layer size. * * Mutex un_lock hold on entry and released on return. */ void union_newsize(struct vnode *vp, off_t uppersz, off_t lowersz) { struct union_node *un = VTOUNION(vp); off_t sz; KASSERT(mutex_owned(&un->un_lock)); /* only interested in regular files */ if (vp->v_type != VREG) { mutex_exit(&un->un_lock); uvm_vnp_setsize(vp, 0); return; } sz = VNOVAL; if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) { un->un_uppersz = uppersz; if (sz == VNOVAL) sz = un->un_uppersz; } if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) { un->un_lowersz = lowersz; if (sz == VNOVAL) sz = un->un_lowersz; } mutex_exit(&un->un_lock); if (sz != VNOVAL) { #ifdef UNION_DIAGNOSTIC printf("union: %s size now %qd\n", uppersz != VNOVAL ? 
"upper" : "lower", sz); #endif uvm_vnp_setsize(vp, sz); } } static void union_ref(struct union_node *un) { KASSERT(mutex_owned(&uhash_lock)); un->un_refs++; } static void union_rele(struct union_node *un) { mutex_enter(&uhash_lock); un->un_refs--; if (un->un_refs > 0) { mutex_exit(&uhash_lock); return; } if (un->un_cflags & UN_CACHED) { un->un_cflags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } mutex_exit(&uhash_lock); if (un->un_pvp != NULLVP) vrele(un->un_pvp); if (un->un_uppervp != NULLVP) vrele(un->un_uppervp); if (un->un_lowervp != NULLVP) vrele(un->un_lowervp); if (un->un_dirvp != NULLVP) vrele(un->un_dirvp); if (un->un_path) free(un->un_path, M_TEMP); mutex_destroy(&un->un_lock); free(un, M_TEMP); } /* * allocate a union_node/vnode pair. the vnode is * referenced and unlocked. the new vnode is returned * via (vpp). (mp) is the mountpoint of the union filesystem, * (dvp) is the parent directory where the upper layer object * should exist (but doesn't) and (cnp) is the componentname * information which is partially copied to allow the upper * layer object to be created at a later time. (uppervp) * and (lowervp) reference the upper and lower layer objects * being mapped. either, but not both, can be nil. * both, if supplied, are unlocked. * the reference is either maintained in the new union_node * object which is allocated, or they are vrele'd. * * all union_nodes are maintained on a hash * list. new nodes are only allocated when they cannot * be found on this list. entries on the list are * removed when the vfs reclaim entry is called. * * the vnode gets attached or referenced with vcache_get(). */ int union_allocvp( struct vnode **vpp, struct mount *mp, struct vnode *undvp, /* parent union vnode */ struct vnode *dvp, /* may be null */ struct componentname *cnp, /* may be null */ struct vnode *uppervp, /* may be null */ struct vnode *lowervp, /* may be null */ int docache) { int error; struct union_node *un = NULL, *un1; struct vnode *vp, *xlowervp = NULLVP; u_long hash[3]; int try; bool is_dotdot; is_dotdot = (dvp != NULL && cnp != NULL && (cnp->cn_flags & ISDOTDOT)); if (uppervp == NULLVP && lowervp == NULLVP) panic("union: unidentifiable allocation"); if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) { xlowervp = lowervp; lowervp = NULLVP; } /* * If both uppervp and lowervp are not NULL we have to * search union nodes with one vnode as NULL too. */ hash[0] = UNION_HASH(uppervp, lowervp); if (uppervp == NULL || lowervp == NULL) { hash[1] = hash[2] = NOHASH; } else { hash[1] = UNION_HASH(uppervp, NULLVP); hash[2] = UNION_HASH(NULLVP, lowervp); } if (!docache) { un = NULL; goto found; } loop: mutex_enter(&uhash_lock); for (try = 0; try < 3; try++) { if (hash[try] == NOHASH) continue; LIST_FOREACH(un, &uhashtbl[hash[try]], un_cache) { if ((un->un_lowervp && un->un_lowervp != lowervp) || (un->un_uppervp && un->un_uppervp != uppervp) || un->un_mount != mp) continue; union_ref(un); mutex_exit(&uhash_lock); error = vcache_get(mp, &un, sizeof(un), &vp); KASSERT(error != 0 || UNIONTOV(un) == vp); union_rele(un); if (error == ENOENT) goto loop; else if (error) goto out; goto found; } } mutex_exit(&uhash_lock); found: if (un) { if (uppervp != dvp) { if (is_dotdot) VOP_UNLOCK(dvp); vn_lock(UNIONTOV(un), LK_EXCLUSIVE | LK_RETRY); if (is_dotdot) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } /* * Save information about the upper layer. */ if (uppervp != un->un_uppervp) { union_newupper(un, uppervp); } else if (uppervp) { vrele(uppervp); } /* * Save information about the lower layer. 
* This needs to keep track of pathname * and directory information which union_vn_create * might need. */ if (lowervp != un->un_lowervp) { union_newlower(un, lowervp); if (cnp && (lowervp != NULLVP)) { un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK); memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen); un->un_path[cnp->cn_namelen] = '\0'; vref(dvp); un->un_dirvp = dvp; } } else if (lowervp) { vrele(lowervp); } *vpp = UNIONTOV(un); if (uppervp != dvp) VOP_UNLOCK(*vpp); error = 0; goto out; } un = malloc(sizeof(struct union_node), M_TEMP, M_WAITOK); mutex_init(&un->un_lock, MUTEX_DEFAULT, IPL_NONE); un->un_refs = 1; un->un_mount = mp; un->un_vnode = NULL; un->un_uppervp = uppervp; un->un_lowervp = lowervp; un->un_pvp = undvp; if (undvp != NULLVP) vref(undvp); un->un_dircache = 0; un->un_openl = 0; un->un_cflags = 0; un->un_hooknode = false; un->un_uppersz = VNOVAL; un->un_lowersz = VNOVAL; if (dvp && cnp && (lowervp != NULLVP)) { un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK); memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen); un->un_path[cnp->cn_namelen] = '\0'; vref(dvp); un->un_dirvp = dvp; } else { un->un_path = 0; un->un_dirvp = 0; } if (docache) { mutex_enter(&uhash_lock); LIST_FOREACH(un1, &uhashtbl[hash[0]], un_cache) { if (un1->un_lowervp == lowervp && un1->un_uppervp == uppervp && un1->un_mount == mp) { /* * Another thread beat us, push back freshly * allocated node and retry. */ mutex_exit(&uhash_lock); union_rele(un); goto loop; } } LIST_INSERT_HEAD(&uhashtbl[hash[0]], un, un_cache); un->un_cflags |= UN_CACHED; mutex_exit(&uhash_lock); } error = vcache_get(mp, &un, sizeof(un), vpp); KASSERT(error != 0 || UNIONTOV(un) == *vpp); union_rele(un); if (error == ENOENT) goto loop; out: if (xlowervp) vrele(xlowervp); return error; } int union_freevp(struct vnode *vp) { struct union_node *un = VTOUNION(vp); /* Detach vnode from union node. */ un->un_vnode = NULL; un->un_uppersz = VNOVAL; un->un_lowersz = VNOVAL; /* Detach union node from vnode. */ mutex_enter(vp->v_interlock); vp->v_data = NULL; mutex_exit(vp->v_interlock); union_rele(un); return 0; } int union_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { struct vattr va; struct vnode *svp; struct union_node *un; struct union_mount *um; voff_t uppersz, lowersz; KASSERT(key_len == sizeof(un)); memcpy(&un, key, key_len); um = MOUNTTOUNIONMOUNT(mp); svp = (un->un_uppervp != NULLVP) ? 
un->un_uppervp : un->un_lowervp; vp->v_tag = VT_UNION; vp->v_op = union_vnodeop_p; vp->v_data = un; un->un_vnode = vp; vp->v_type = svp->v_type; if (svp->v_type == VCHR || svp->v_type == VBLK) spec_node_init(vp, svp->v_rdev); vshareilock(vp, svp); rw_obj_hold(svp->v_uobj.vmobjlock); uvm_obj_setlock(&vp->v_uobj, svp->v_uobj.vmobjlock); vshareklist(vp, svp); /* detect the root vnode (and aliases) */ if ((un->un_uppervp == um->um_uppervp) && ((un->un_lowervp == NULLVP) || un->un_lowervp == um->um_lowervp)) { if (un->un_lowervp == NULLVP) { un->un_lowervp = um->um_lowervp; if (un->un_lowervp != NULLVP) vref(un->un_lowervp); } vp->v_vflag |= VV_ROOT; } uppersz = lowersz = VNOVAL; if (un->un_uppervp != NULLVP) { if (vn_lock(un->un_uppervp, LK_SHARED) == 0) { if (VOP_GETATTR(un->un_uppervp, &va, FSCRED) == 0) uppersz = va.va_size; VOP_UNLOCK(un->un_uppervp); } } if (un->un_lowervp != NULLVP) { if (vn_lock(un->un_lowervp, LK_SHARED) == 0) { if (VOP_GETATTR(un->un_lowervp, &va, FSCRED) == 0) lowersz = va.va_size; VOP_UNLOCK(un->un_lowervp); } } mutex_enter(&un->un_lock); union_newsize(vp, uppersz, lowersz); mutex_enter(&uhash_lock); union_ref(un); mutex_exit(&uhash_lock); *new_key = &vp->v_data; return 0; } /* * copyfile. copy the vnode (fvp) to the vnode (tvp) * using a sequence of reads and writes. both (fvp) * and (tvp) are locked on entry and exit. */ int union_copyfile(struct vnode *fvp, struct vnode *tvp, kauth_cred_t cred, struct lwp *l) { char *tbuf; struct uio uio; struct iovec iov; int error = 0; /* * strategy: * allocate a buffer of size MAXBSIZE. * loop doing reads and writes, keeping track * of the current uio offset. * give up at the first sign of trouble. */ uio.uio_offset = 0; UIO_SETUP_SYSSPACE(&uio); tbuf = malloc(MAXBSIZE, M_TEMP, M_WAITOK); /* ugly loop follows... */ do { off_t offset = uio.uio_offset; uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = tbuf; iov.iov_len = MAXBSIZE; uio.uio_resid = iov.iov_len; uio.uio_rw = UIO_READ; error = VOP_READ(fvp, &uio, 0, cred); if (error == 0) { uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = tbuf; iov.iov_len = MAXBSIZE - uio.uio_resid; uio.uio_offset = offset; uio.uio_rw = UIO_WRITE; uio.uio_resid = iov.iov_len; if (uio.uio_resid == 0) break; do { error = VOP_WRITE(tvp, &uio, 0, cred); } while ((uio.uio_resid > 0) && (error == 0)); } } while (error == 0); free(tbuf, M_TEMP); return (error); } /* * (un) is assumed to be locked on entry and remains * locked on exit. */ int union_copyup(struct union_node *un, int docopy, kauth_cred_t cred, struct lwp *l) { int error; struct vnode *lvp, *uvp; struct vattr lvattr, uvattr; error = union_vn_create(&uvp, un, l); if (error) return (error); union_newupper(un, uvp); lvp = un->un_lowervp; if (docopy) { /* * XX - should not ignore errors * from VOP_CLOSE */ vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_GETATTR(lvp, &lvattr, cred); if (error == 0) error = VOP_OPEN(lvp, FREAD, cred); if (error == 0) { error = union_copyfile(lvp, uvp, cred, l); (void) VOP_CLOSE(lvp, FREAD, cred); } if (error == 0) { /* Copy permissions up too */ vattr_null(&uvattr); uvattr.va_mode = lvattr.va_mode; uvattr.va_flags = lvattr.va_flags; error = VOP_SETATTR(uvp, &uvattr, cred); } VOP_UNLOCK(lvp); #ifdef UNION_DIAGNOSTIC if (error == 0) uprintf("union: copied up %s\n", un->un_path); #endif } union_vn_close(uvp, FWRITE, cred, l); /* * Subsequent IOs will go to the top layer, so * call close on the lower vnode and open on the * upper vnode to ensure that the filesystem keeps * its references counts right. 
This doesn't do * the right thing with (cred) and (FREAD) though. * Ignoring error returns is not right, either. */ if (error == 0) { int i; vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); for (i = 0; i < un->un_openl; i++) { (void) VOP_CLOSE(lvp, FREAD, cred); (void) VOP_OPEN(uvp, FREAD, cred); } un->un_openl = 0; VOP_UNLOCK(lvp); } return (error); } /* * Prepare the creation of a new node in the upper layer. * * (dvp) is the directory in which to create the new node. * it is locked on entry and exit. * (cnp) is the componentname to be created. * (cred, path, hash) are credentials, path and its hash to fill (cnp). */ static int union_do_lookup(struct vnode *dvp, struct componentname *cnp, kauth_cred_t cred, const char *path) { int error; struct vnode *vp; cnp->cn_nameiop = CREATE; cnp->cn_flags = LOCKPARENT | ISLASTCN; cnp->cn_cred = cred; cnp->cn_nameptr = path; cnp->cn_namelen = strlen(path); error = VOP_LOOKUP(dvp, &vp, cnp); if (error == 0) { KASSERT(vp != NULL); VOP_ABORTOP(dvp, cnp); vrele(vp); error = EEXIST; } else if (error == EJUSTRETURN) { error = 0; } return error; } /* * Create a shadow directory in the upper layer. * The new vnode is returned locked. * * (um) points to the union mount structure for access to the * the mounting process's credentials. * (dvp) is the directory in which to create the shadow directory. * it is unlocked on entry and exit. * (cnp) is the componentname to be created. * (vpp) is the returned newly created shadow directory, which * is returned locked. * * N.B. We still attempt to create shadow directories even if the union * is mounted read-only, which is a little nonintuitive. */ int union_mkshadow(struct union_mount *um, struct vnode *dvp, struct componentname *cnp, struct vnode **vpp) { int error; struct vattr va; struct componentname cn; char *pnbuf; if (cnp->cn_namelen + 1 > MAXPATHLEN) return ENAMETOOLONG; pnbuf = PNBUF_GET(); memcpy(pnbuf, cnp->cn_nameptr, cnp->cn_namelen); pnbuf[cnp->cn_namelen] = '\0'; vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); error = union_do_lookup(dvp, &cn, (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred), pnbuf); if (error) { VOP_UNLOCK(dvp); PNBUF_PUT(pnbuf); return error; } /* * policy: when creating the shadow directory in the * upper layer, create it owned by the user who did * the mount, group from parent directory, and mode * 777 modified by umask (ie mostly identical to the * mkdir syscall). (jsp, kb) */ vattr_null(&va); va.va_type = VDIR; va.va_mode = um->um_cmode; KASSERT(*vpp == NULL); error = VOP_MKDIR(dvp, vpp, &cn, &va); VOP_UNLOCK(dvp); PNBUF_PUT(pnbuf); return error; } /* * Create a whiteout entry in the upper layer. * * (um) points to the union mount structure for access to the * the mounting process's credentials. * (dvp) is the directory in which to create the whiteout. * it is locked on entry and exit. * (cnp) is the componentname to be created. * (un) holds the path and its hash to be created. */ int union_mkwhiteout(struct union_mount *um, struct vnode *dvp, struct componentname *cnp, struct union_node *un) { int error; struct componentname cn; error = union_do_lookup(dvp, &cn, (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred), un->un_path); if (error) return error; error = VOP_WHITEOUT(dvp, &cn, CREATE); return error; } /* * union_vn_create: creates and opens a new shadow file * on the upper union layer. this function is similar * in spirit to calling vn_open but it avoids calling namei(). 
* the problem with calling namei is that a) it locks too many * things, and b) it doesn't start at the "right" directory, * whereas union_do_lookup is told where to start. */ int union_vn_create(struct vnode **vpp, struct union_node *un, struct lwp *l) { struct vnode *vp; kauth_cred_t cred = l->l_cred; struct vattr vat; struct vattr *vap = &vat; int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL); int error; int cmode = UN_FILEMODE & ~l->l_proc->p_cwdi->cwdi_cmask; struct componentname cn; *vpp = NULLVP; vn_lock(un->un_dirvp, LK_EXCLUSIVE | LK_RETRY); error = union_do_lookup(un->un_dirvp, &cn, l->l_cred, un->un_path); if (error) { VOP_UNLOCK(un->un_dirvp); return error; } /* * Good - there was no race to create the file * so go ahead and create it. The permissions * on the file will be 0666 modified by the * current user's umask. Access to the file, while * it is unioned, will require access to the top *and* * bottom files. Access when not unioned will simply * require access to the top-level file. * TODO: confirm choice of access permissions. */ vattr_null(vap); vap->va_type = VREG; vap->va_mode = cmode; vp = NULL; error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap); if (error) { VOP_UNLOCK(un->un_dirvp); return error; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_UNLOCK(un->un_dirvp); error = VOP_OPEN(vp, fmode, cred); if (error) { vput(vp); return error; } vp->v_writecount++; VOP_UNLOCK(vp); *vpp = vp; return 0; } int union_vn_close(struct vnode *vp, int fmode, kauth_cred_t cred, struct lwp *l) { if (fmode & FWRITE) --vp->v_writecount; return (VOP_CLOSE(vp, fmode, cred)); } void union_removed_upper(struct union_node *un) { struct vnode *vp = UNIONTOV(un); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #if 1 /* * We do not set the uppervp to NULLVP here, because lowervp * may also be NULLVP, so this routine would end up creating * a bogus union node with no upper or lower VP (that causes * pain in many places that assume at least one VP exists). * Since we've removed this node from the cache hash chains, * it won't be found again. When all current holders * release it, union_inactive() will vgone() it. */ union_diruncache(un); #else union_newupper(un, NULLVP); #endif VOP_UNLOCK(vp); mutex_enter(&uhash_lock); if (un->un_cflags & UN_CACHED) { un->un_cflags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } mutex_exit(&uhash_lock); } #if 0 struct vnode * union_lowervp(struct vnode *vp) { struct union_node *un = VTOUNION(vp); if ((un->un_lowervp != NULLVP) && (vp->v_type == un->un_lowervp->v_type)) { if (vget(un->un_lowervp, 0, true /* wait */) == 0) return (un->un_lowervp); } return (NULLVP); } #endif /* * determine whether a whiteout is needed * during a remove/rmdir operation. 
*/ int union_dowhiteout(struct union_node *un, kauth_cred_t cred) { struct vattr va; if (un->un_lowervp != NULLVP) return (1); if (VOP_GETATTR(un->un_uppervp, &va, cred) == 0 && (va.va_flags & OPAQUE)) return (1); return (0); } static void union_dircache_r(struct vnode *vp, struct vnode ***vppp, int *cntp) { struct union_node *un; if (vp->v_op != union_vnodeop_p) { if (vppp) { vref(vp); *(*vppp)++ = vp; if (--(*cntp) == 0) panic("union: dircache table too small"); } else { (*cntp)++; } return; } un = VTOUNION(vp); if (un->un_uppervp != NULLVP) union_dircache_r(un->un_uppervp, vppp, cntp); if (un->un_lowervp != NULLVP) union_dircache_r(un->un_lowervp, vppp, cntp); } struct vnode * union_dircache(struct vnode *vp, struct lwp *l) { int cnt; struct vnode *nvp = NULLVP; struct vnode **vpp; struct vnode **dircache; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); dircache = VTOUNION(vp)->un_dircache; nvp = NULLVP; if (dircache == 0) { cnt = 0; union_dircache_r(vp, 0, &cnt); cnt++; dircache = (struct vnode **) malloc(cnt * sizeof(struct vnode *), M_TEMP, M_WAITOK); vpp = dircache; union_dircache_r(vp, &vpp, &cnt); VTOUNION(vp)->un_dircache = dircache; *vpp = NULLVP; vpp = dircache + 1; } else { vpp = dircache; do { if (*vpp++ == VTOUNION(vp)->un_lowervp) break; } while (*vpp != NULLVP); } if (*vpp == NULLVP) goto out; vref(*vpp); error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0, NULLVP, *vpp, 0); if (!error) { vn_lock(nvp, LK_EXCLUSIVE | LK_RETRY); VTOUNION(vp)->un_dircache = 0; VTOUNION(nvp)->un_hooknode = true; VTOUNION(nvp)->un_dircache = dircache; } out: VOP_UNLOCK(vp); return (nvp); } void union_diruncache(struct union_node *un) { struct vnode **vpp; KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE); if (un->un_dircache != 0) { for (vpp = un->un_dircache; *vpp != NULLVP; vpp++) vrele(*vpp); free(un->un_dircache, M_TEMP); un->un_dircache = 0; } } /* * Check whether node can rmdir (check empty). */ int union_check_rmdir(struct union_node *un, kauth_cred_t cred) { int dirlen, eofflag, error; char *dirbuf; struct vattr va; struct vnode *tvp; struct dirent *dp, *edp; struct componentname cn; struct iovec aiov; struct uio auio; KASSERT(un->un_uppervp != NULL); /* Check upper for being opaque. */ KASSERT(VOP_ISLOCKED(un->un_uppervp)); error = VOP_GETATTR(un->un_uppervp, &va, cred); if (error || (va.va_flags & OPAQUE)) return error; if (un->un_lowervp == NULL) return 0; /* Check lower for being empty. */ vn_lock(un->un_lowervp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(un->un_lowervp, &va, cred); if (error) { VOP_UNLOCK(un->un_lowervp); return error; } dirlen = va.va_blocksize; dirbuf = kmem_alloc(dirlen, KM_SLEEP); /* error = 0; */ eofflag = 0; auio.uio_offset = 0; do { aiov.iov_len = dirlen; aiov.iov_base = dirbuf; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = aiov.iov_len; auio.uio_rw = UIO_READ; UIO_SETUP_SYSSPACE(&auio); error = VOP_READDIR(un->un_lowervp, &auio, cred, &eofflag, NULL, NULL); if (error) break; edp = (struct dirent *)&dirbuf[dirlen - auio.uio_resid]; for (dp = (struct dirent *)dirbuf; error == 0 && dp < edp; dp = (struct dirent *)((char *)dp + dp->d_reclen)) { if (dp->d_reclen == 0) { error = ENOTEMPTY; break; } if (dp->d_type == DT_WHT || (dp->d_namlen == 1 && dp->d_name[0] == '.') || (dp->d_namlen == 2 && !memcmp(dp->d_name, "..", 2))) continue; /* Check for presence in the upper layer. 
*/ cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN | RDONLY; cn.cn_cred = cred; cn.cn_nameptr = dp->d_name; cn.cn_namelen = dp->d_namlen; error = VOP_LOOKUP(un->un_uppervp, &tvp, &cn); if (error == ENOENT && (cn.cn_flags & ISWHITEOUT)) { error = 0; continue; } if (error == 0) vrele(tvp); error = ENOTEMPTY; } } while (error == 0 && !eofflag); kmem_free(dirbuf, dirlen); VOP_UNLOCK(un->un_lowervp); return error; } /* * This hook is called from vn_readdir() to switch to lower directory * entry after the upper directory is read. */ int union_readdirhook(struct vnode **vpp, struct file *fp, struct lwp *l) { struct vnode *vp = *vpp, *lvp; struct vattr va; int error; if (vp->v_op != union_vnodeop_p) return (0); /* * If the directory is opaque, * then don't show lower entries */ vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, fp->f_cred); VOP_UNLOCK(vp); if (error || (va.va_flags & OPAQUE)) return error; if ((lvp = union_dircache(vp, l)) == NULLVP) return (0); error = VOP_OPEN(lvp, FREAD, fp->f_cred); if (error) { vput(lvp); return (error); } VOP_UNLOCK(lvp); fp->f_vnode = lvp; fp->f_offset = 0; error = vn_close(vp, FREAD, fp->f_cred); if (error) return (error); *vpp = lvp; return (0); }
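/*
 * A minimal user-space sketch of the copy-up strategy used by
 * union_copyfile() above: allocate one fixed-size buffer, loop reading
 * from the source and writing back exactly the bytes that were read,
 * and give up at the first sign of trouble.  The function name, the
 * 64 KB buffer and the use of plain read(2)/write(2) are illustrative
 * assumptions only; the kernel code runs the same loop with
 * VOP_READ/VOP_WRITE on locked vnodes and a MAXBSIZE buffer.
 */
#include <stdlib.h>
#include <unistd.h>

static int
copy_loop_sketch(int fromfd, int tofd)
{
	char *buf;
	ssize_t nread, nwritten, off;
	int error = 0;

	buf = malloc(65536);
	if (buf == NULL)
		return -1;
	for (;;) {
		nread = read(fromfd, buf, 65536);
		if (nread < 0) {		/* first sign of trouble: stop */
			error = -1;
			break;
		}
		if (nread == 0)			/* EOF: the copy is complete */
			break;
		/* retry short writes until the whole chunk is written */
		for (off = 0; off < nread; off += nwritten) {
			nwritten = write(tofd, buf + off, nread - off);
			if (nwritten < 0) {
				error = -1;
				goto out;
			}
		}
	}
out:
	free(buf);
	return error;
}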
/* $NetBSD: puffs_vfsops.c,v 1.126
2021/04/01 19:00:33 christos Exp $ */ /* * Copyright (c) 2005, 2006 Antti Kantee. All Rights Reserved. * * Development of this software was supported by the * Google Summer of Code program and the Ulla Tuominen Foundation. * The Google SoC project was mentored by Bill Studenmund. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: puffs_vfsops.c,v 1.126 2021/04/01 19:00:33 christos Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/mount.h> #include <sys/extattr.h> #include <sys/queue.h> #include <sys/vnode.h> #include <sys/dirent.h> #include <sys/kauth.h> #include <sys/proc.h> #include <sys/module.h> #include <sys/kthread.h> #include <uvm/uvm.h> #include <dev/putter/putter_sys.h> #include <miscfs/genfs/genfs.h> #include <fs/puffs/puffs_msgif.h> #include <fs/puffs/puffs_sys.h> #include <lib/libkern/libkern.h> #include <nfs/nfsproto.h> /* for fh sizes */ MODULE(MODULE_CLASS_VFS, puffs, "putter"); VFS_PROTOS(puffs_vfsop); static struct putter_ops puffs_putter = { .pop_getout = puffs_msgif_getout, .pop_releaseout = puffs_msgif_releaseout, .pop_waitcount = puffs_msgif_waitcount, .pop_dispatch = puffs_msgif_dispatch, .pop_close = puffs_msgif_close, }; static const struct genfs_ops puffs_genfsops = { .gop_size = puffs_gop_size, .gop_write = genfs_gop_write, .gop_markupdate = puffs_gop_markupdate, #if 0 .gop_alloc, should ask userspace #endif .gop_putrange = genfs_gop_putrange, }; /* * Try to ensure data structures used by the puffs protocol * do not unexpectedly change. 
*/ #if defined(__i386__) && defined(__ELF__) CTASSERT(sizeof(struct puffs_kargs) == 3928); CTASSERT(sizeof(struct vattr) == 136); CTASSERT(sizeof(struct puffs_req) == 44); #endif int puffs_vfsop_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct puffs_mount *pmp = NULL; struct puffs_kargs *args; char fstype[_VFS_NAMELEN]; char *p; int error = 0, i; pid_t mntpid = curlwp->l_proc->p_pid; if (data == NULL) return EINVAL; if (*data_len < sizeof *args) return EINVAL; if (mp->mnt_flag & MNT_GETARGS) { pmp = MPTOPUFFSMP(mp); *(struct puffs_kargs *)data = pmp->pmp_args; *data_len = sizeof *args; return 0; } /* update is not supported currently */ if (mp->mnt_flag & MNT_UPDATE) return EOPNOTSUPP; args = (struct puffs_kargs *)data; if (args->pa_vers != PUFFSVERSION) { printf("puffs_mount: development version mismatch: " "kernel %d, lib %d\n", PUFFSVERSION, args->pa_vers); error = EINVAL; goto out; } if ((args->pa_flags & ~PUFFS_KFLAG_MASK) != 0) { printf("puffs_mount: invalid KFLAGs 0x%x\n", args->pa_flags); error = EINVAL; goto out; } if ((args->pa_fhflags & ~PUFFS_FHFLAG_MASK) != 0) { printf("puffs_mount: invalid FHFLAGs 0x%x\n", args->pa_fhflags); error = EINVAL; goto out; } for (i = 0; i < __arraycount(args->pa_spare); i++) { if (args->pa_spare[i] != 0) { printf("puffs_mount: pa_spare[%d] = 0x%x\n", i, args->pa_spare[i]); error = EINVAL; goto out; } } /* use dummy value for passthrough */ if (args->pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH) args->pa_fhsize = sizeof(struct fid); /* sanitize file handle length */ if (PUFFS_TOFHSIZE(args->pa_fhsize) > FHANDLE_SIZE_MAX) { printf("puffs_mount: handle size %zu too large\n", args->pa_fhsize); error = EINVAL; goto out; } /* sanity check file handle max sizes */ if (args->pa_fhsize && args->pa_fhflags & PUFFS_FHFLAG_PROTOMASK) { size_t kfhsize = PUFFS_TOFHSIZE(args->pa_fhsize); if (args->pa_fhflags & PUFFS_FHFLAG_NFSV2) { if (NFSX_FHTOOBIG_P(kfhsize, 0)) { printf("puffs_mount: fhsize larger than " "NFSv2 max %d\n", PUFFS_FROMFHSIZE(NFSX_V2FH)); error = EINVAL; goto out; } } if (args->pa_fhflags & PUFFS_FHFLAG_NFSV3) { if (NFSX_FHTOOBIG_P(kfhsize, 1)) { printf("puffs_mount: fhsize larger than " "NFSv3 max %d\n", PUFFS_FROMFHSIZE(NFSX_V3FHMAX)); error = EINVAL; goto out; } } } /* don't allow non-printing characters (like my sweet umlauts.. snif) */ args->pa_typename[sizeof(args->pa_typename)-1] = '\0'; for (p = args->pa_typename; *p; p++) if (*p < ' ' || *p > '~') *p = '.'; args->pa_mntfromname[sizeof(args->pa_mntfromname)-1] = '\0'; for (p = args->pa_mntfromname; *p; p++) if (*p < ' ' || *p > '~') *p = '.'; /* build real name */ (void)strlcpy(fstype, PUFFS_TYPEPREFIX, sizeof(fstype)); (void)strlcat(fstype, args->pa_typename, sizeof(fstype)); /* inform user server if it got the max request size it wanted */ if (args->pa_maxmsglen == 0 || args->pa_maxmsglen > PUFFS_MSG_MAXSIZE) args->pa_maxmsglen = PUFFS_MSG_MAXSIZE; else if (args->pa_maxmsglen < 2*PUFFS_MSGSTRUCT_MAX) args->pa_maxmsglen = 2*PUFFS_MSGSTRUCT_MAX; (void)strlcpy(args->pa_typename, fstype, sizeof(args->pa_typename)); error = set_statvfs_info(path, UIO_USERSPACE, args->pa_mntfromname, UIO_SYSSPACE, fstype, mp, curlwp); if (error) goto out; mp->mnt_stat.f_iosize = DEV_BSIZE; mp->mnt_stat.f_namemax = args->pa_svfsb.f_namemax; /* * We can't handle the VFS_STATVFS() mount_domount() does * after VFS_MOUNT() because we'd deadlock, so handle it * here already. 
*/ struct statvfs *sb = STATVFSBUF_GET(); puffs_statvfs_to_statvfs(&args->pa_svfsb, sb); copy_statvfs_info(sb, mp); STATVFSBUF_PUT(sb); statvfs_to_puffs_statvfs(&mp->mnt_stat, &args->pa_svfsb); KASSERT(curlwp != uvm.pagedaemon_lwp); pmp = kmem_zalloc(sizeof(struct puffs_mount), KM_SLEEP); mp->mnt_fs_bshift = DEV_BSHIFT; mp->mnt_dev_bshift = DEV_BSHIFT; mp->mnt_flag &= ~MNT_LOCAL; /* we don't really know, so ... */ mp->mnt_data = pmp; #if 0 /* * XXX: puffs code is MPSAFE. However, VFS really isn't. * Currently, there is nothing which protects an inode from * reclaim while there are threads inside the file system. * This means that in the event of a server crash, an MPSAFE * mount is likely to end up accessing invalid memory. For the * non-mpsafe case, the kernel lock, general structure of * puffs and pmp_refcount protect the threads during escape. * * Fixing this will require: * a) fixing vfs * OR * b) adding a small sleep to puffs_msgif_close() between * userdead() and dounmount(). * (well, this isn't really a fix, but would solve * 99.999% of the race conditions). * * Also, in the event of "b", unmount -f should be used, * like with any other file system, sparingly and only when * it is "known" to be safe. */ mp->mnt_iflags |= IMNT_MPSAFE; #endif pmp->pmp_status = PUFFSTAT_MOUNTING; pmp->pmp_mp = mp; pmp->pmp_msg_maxsize = args->pa_maxmsglen; pmp->pmp_args = *args; /* * Inform the fileops processing code that we have a mountpoint. * If it doesn't know about anyone with our pid/fd having the * device open, punt */ if ((pmp->pmp_pi = putter_attach(mntpid, args->pa_fd, pmp, &puffs_putter)) == NULL) { error = ENOENT; goto out; } /* XXX: check parameters */ pmp->pmp_root_cookie = args->pa_root_cookie; switch (args->pa_root_vtype) { case VNON: case VREG: case VDIR: case VBLK: case VCHR: case VLNK: case VSOCK: case VFIFO: break; default: error = EINVAL; goto out; } pmp->pmp_root_vtype = args->pa_root_vtype; if (args->pa_root_vsize < 0) { error = EINVAL; goto out; } pmp->pmp_root_vsize = args->pa_root_vsize; pmp->pmp_root_rdev = args->pa_root_rdev; pmp->pmp_docompat = args->pa_time32; mutex_init(&pmp->pmp_lock, MUTEX_DEFAULT, IPL_NONE); mutex_init(&pmp->pmp_sopmtx, MUTEX_DEFAULT, IPL_NONE); cv_init(&pmp->pmp_msg_waiter_cv, "puffsget"); cv_init(&pmp->pmp_refcount_cv, "puffsref"); cv_init(&pmp->pmp_unmounting_cv, "puffsum"); cv_init(&pmp->pmp_sopcv, "puffsop"); TAILQ_INIT(&pmp->pmp_msg_touser); TAILQ_INIT(&pmp->pmp_msg_replywait); TAILQ_INIT(&pmp->pmp_sopfastreqs); TAILQ_INIT(&pmp->pmp_sopnodereqs); if ((error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, puffs_sop_thread, pmp, NULL, "puffsop")) != 0) goto out; pmp->pmp_sopthrcount = 1; DPRINTF(("puffs_mount: mount point at %p, puffs specific at %p\n", mp, MPTOPUFFSMP(mp))); vfs_getnewfsid(mp); out: if (error && pmp && pmp->pmp_pi) putter_detach(pmp->pmp_pi); if (error && pmp) kmem_free(pmp, sizeof(struct puffs_mount)); return error; } int puffs_vfsop_start(struct mount *mp, int flags) { struct puffs_mount *pmp = MPTOPUFFSMP(mp); KASSERT(pmp->pmp_status == PUFFSTAT_MOUNTING); pmp->pmp_status = PUFFSTAT_RUNNING; return 0; } int puffs_vfsop_unmount(struct mount *mp, int mntflags) { PUFFS_MSG_VARS(vfs, unmount); struct puffs_mount *pmp; int error, force; error = 0; force = mntflags & MNT_FORCE; pmp = MPTOPUFFSMP(mp); DPRINTF(("puffs_unmount: detach filesystem from vfs, current " "status 0x%x\n", pmp->pmp_status)); /* * flush all the vnodes. VOP_RECLAIM() takes care that the * root vnode does not get flushed until unmount. 
The * userspace root node cookie is stored in the mount * structure, so we can always re-instantiate a root vnode, * should userspace unmount decide it doesn't want to * cooperate. */ error = vflush(mp, NULLVP, force ? FORCECLOSE : 0); if (error) goto out; /* * If we are not DYING, we should ask userspace's opinion * about the situation */ mutex_enter(&pmp->pmp_lock); if (pmp->pmp_status != PUFFSTAT_DYING) { pmp->pmp_unmounting = 1; mutex_exit(&pmp->pmp_lock); PUFFS_MSG_ALLOC(vfs, unmount); puffs_msg_setinfo(park_unmount, PUFFSOP_VFS, PUFFS_VFS_UNMOUNT, NULL); unmount_msg->pvfsr_flags = mntflags; PUFFS_MSG_ENQUEUEWAIT(pmp, park_unmount, error); PUFFS_MSG_RELEASE(unmount); error = checkerr(pmp, error, __func__); DPRINTF(("puffs_unmount: error %d force %d\n", error, force)); mutex_enter(&pmp->pmp_lock); pmp->pmp_unmounting = 0; cv_broadcast(&pmp->pmp_unmounting_cv); } /* * if userspace cooperated or we really need to die, * screw what userland thinks and just die. */ if (error == 0 || force) { struct puffs_sopreq *psopr; /* tell waiters & other resources to go unwait themselves */ puffs_userdead(pmp); putter_detach(pmp->pmp_pi); /* * Wait until there are no more users for the mount resource. * Notice that this is hooked against transport_close * and return from touser. In an ideal world, it would * be hooked against final return from all operations. * But currently it works well enough, since nobody * does weird blocking voodoo after return from touser(). */ while (pmp->pmp_refcount != 0) cv_wait(&pmp->pmp_refcount_cv, &pmp->pmp_lock); mutex_exit(&pmp->pmp_lock); /* * Release kernel thread now that there is nothing * it would be wanting to lock. */ KASSERT(curlwp != uvm.pagedaemon_lwp); psopr = kmem_alloc(sizeof(*psopr), KM_SLEEP); psopr->psopr_sopreq = PUFFS_SOPREQSYS_EXIT; mutex_enter(&pmp->pmp_sopmtx); if (pmp->pmp_sopthrcount == 0) { mutex_exit(&pmp->pmp_sopmtx); kmem_free(psopr, sizeof(*psopr)); mutex_enter(&pmp->pmp_sopmtx); KASSERT(pmp->pmp_sopthrcount == 0); } else { TAILQ_INSERT_TAIL(&pmp->pmp_sopfastreqs, psopr, psopr_entries); cv_signal(&pmp->pmp_sopcv); } while (pmp->pmp_sopthrcount > 0) cv_wait(&pmp->pmp_sopcv, &pmp->pmp_sopmtx); mutex_exit(&pmp->pmp_sopmtx); /* free resources now that we hopefully have no waiters left */ cv_destroy(&pmp->pmp_unmounting_cv); cv_destroy(&pmp->pmp_refcount_cv); cv_destroy(&pmp->pmp_msg_waiter_cv); cv_destroy(&pmp->pmp_sopcv); mutex_destroy(&pmp->pmp_lock); mutex_destroy(&pmp->pmp_sopmtx); kmem_free(pmp, sizeof(struct puffs_mount)); error = 0; } else { mutex_exit(&pmp->pmp_lock); } out: DPRINTF(("puffs_unmount: return %d\n", error)); return error; } /* * This doesn't need to travel to userspace */ int puffs_vfsop_root(struct mount *mp, int lktype, struct vnode **vpp) { struct puffs_mount *pmp = MPTOPUFFSMP(mp); int rv; rv = puffs_cookie2vnode(pmp, pmp->pmp_root_cookie, vpp); KASSERT(rv != PUFFS_NOSUCHCOOKIE); if (rv != 0) return rv; rv = vn_lock(*vpp, lktype); if (rv != 0) { vrele(*vpp); *vpp = NULL; return rv; } return 0; } int puffs_vfsop_statvfs(struct mount *mp, struct statvfs *sbp) { PUFFS_MSG_VARS(vfs, statvfs); struct puffs_mount *pmp; int error = 0; pmp = MPTOPUFFSMP(mp); /* * If we are mounting, it means that the userspace counterpart * is calling mount(2), but mount(2) also calls statvfs. So * requesting statvfs from userspace would mean a deadlock. * Compensate. 
*/ if (__predict_false(pmp->pmp_status == PUFFSTAT_MOUNTING)) return EINPROGRESS; PUFFS_MSG_ALLOC(vfs, statvfs); puffs_msg_setinfo(park_statvfs, PUFFSOP_VFS, PUFFS_VFS_STATVFS, NULL); PUFFS_MSG_ENQUEUEWAIT(pmp, park_statvfs, error); error = checkerr(pmp, error, __func__); statvfs_msg->pvfsr_sb.f_iosize = DEV_BSIZE; /* * Try to produce a sensible result even in the event * of userspace error. * * XXX: cache the copy in non-error case */ if (!error) { puffs_statvfs_to_statvfs(&statvfs_msg->pvfsr_sb, sbp); } copy_statvfs_info(sbp, mp); if (!error) { statvfs_to_puffs_statvfs(sbp, &statvfs_msg->pvfsr_sb); } PUFFS_MSG_RELEASE(statvfs); return error; } static bool pageflush_selector(void *cl, struct vnode *vp) { KASSERT(mutex_owned(vp->v_interlock)); return vp->v_type == VREG && !(LIST_EMPTY(&vp->v_dirtyblkhd) && (vp->v_iflag & VI_ONWORKLST) == 0); } static int pageflush(struct mount *mp, kauth_cred_t cred, int waitfor) { struct puffs_node *pn; struct vnode *vp; struct vnode_iterator *marker; int error, rv, fsyncwait; error = 0; fsyncwait = (waitfor == MNT_WAIT) ? FSYNC_WAIT : 0; /* * Sync all cached data from regular vnodes (which are not * currently locked, see below). After this we call VFS_SYNC * for the fs server, which should handle data and metadata for * all the nodes it knows to exist. */ vfs_vnode_iterator_init(mp, &marker); while ((vp = vfs_vnode_iterator_next(marker, pageflush_selector, NULL))) { /* * Here we try to get a reference to the vnode and to * lock it. This is mostly cargo-culted, but I will * offer an explanation to why I believe this might * actually do the right thing. * * If the vnode is a goner, we quite obviously don't need * to sync it. * * If the vnode was busy, we don't need to sync it because * this is never called with MNT_WAIT except from * dounmount(), when we are wait-flushing all the dirty * vnodes through other routes in any case. So there, * sync() doesn't actually sync. Happy now? */ error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); if (error) { vrele(vp); continue; } pn = VPTOPP(vp); /* hmm.. is the FAF thing entirely sensible? 
*/ if (waitfor == MNT_LAZY) { mutex_enter(vp->v_interlock); pn->pn_stat |= PNODE_FAF; mutex_exit(vp->v_interlock); } rv = VOP_FSYNC(vp, cred, fsyncwait, 0, 0); if (waitfor == MNT_LAZY) { mutex_enter(vp->v_interlock); pn->pn_stat &= ~PNODE_FAF; mutex_exit(vp->v_interlock); } if (rv) error = rv; vput(vp); } vfs_vnode_iterator_destroy(marker); return error; } int puffs_vfsop_sync(struct mount *mp, int waitfor, struct kauth_cred *cred) { PUFFS_MSG_VARS(vfs, sync); struct puffs_mount *pmp = MPTOPUFFSMP(mp); int error, rv; error = pageflush(mp, cred, waitfor); /* sync fs */ PUFFS_MSG_ALLOC(vfs, sync); sync_msg->pvfsr_waitfor = waitfor; puffs_credcvt(&sync_msg->pvfsr_cred, cred); puffs_msg_setinfo(park_sync, PUFFSOP_VFS, PUFFS_VFS_SYNC, NULL); PUFFS_MSG_ENQUEUEWAIT(pmp, park_sync, rv); rv = checkerr(pmp, rv, __func__); if (rv) error = rv; PUFFS_MSG_RELEASE(sync); return error; } int puffs_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int lktype, struct vnode **vpp) { PUFFS_MSG_VARS(vfs, fhtonode); struct puffs_mount *pmp = MPTOPUFFSMP(mp); struct vnode *vp; void *fhdata; size_t argsize, fhlen; int error; if (pmp->pmp_args.pa_fhsize == 0) return EOPNOTSUPP; if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH) { fhlen = fhp->fid_len; fhdata = fhp; } else { fhlen = PUFFS_FROMFHSIZE(fhp->fid_len); fhdata = fhp->fid_data; if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_DYNAMIC) { if (pmp->pmp_args.pa_fhsize < fhlen) return EINVAL; } else { if (pmp->pmp_args.pa_fhsize != fhlen) return EINVAL; } } argsize = sizeof(struct puffs_vfsmsg_fhtonode) + fhlen; puffs_msgmem_alloc(argsize, &park_fhtonode, (void *)&fhtonode_msg, 1); fhtonode_msg->pvfsr_dsize = fhlen; memcpy(fhtonode_msg->pvfsr_data, fhdata, fhlen); puffs_msg_setinfo(park_fhtonode, PUFFSOP_VFS, PUFFS_VFS_FHTOVP, NULL); PUFFS_MSG_ENQUEUEWAIT(pmp, park_fhtonode, error); error = checkerr(pmp, error, __func__); if (error) goto out; error = puffs_getvnode(mp, fhtonode_msg->pvfsr_fhcookie, fhtonode_msg->pvfsr_vtype, fhtonode_msg->pvfsr_size, fhtonode_msg->pvfsr_rdev, &vp); if (error) goto out; vn_lock(vp, lktype | LK_RETRY); *vpp = vp; out: puffs_msgmem_release(park_fhtonode); return error; } int puffs_vfsop_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) { PUFFS_MSG_VARS(vfs, nodetofh); struct puffs_mount *pmp = MPTOPUFFSMP(vp->v_mount); size_t argsize, fhlen; int error; if (pmp->pmp_args.pa_fhsize == 0) return EOPNOTSUPP; /* if file handles are static len, we can test len immediately */ if (((pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_DYNAMIC) == 0) && ((pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH) == 0) && (PUFFS_FROMFHSIZE(*fh_size) < pmp->pmp_args.pa_fhsize)) { *fh_size = PUFFS_TOFHSIZE(pmp->pmp_args.pa_fhsize); return E2BIG; } if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH) fhlen = *fh_size; else fhlen = PUFFS_FROMFHSIZE(*fh_size); argsize = sizeof(struct puffs_vfsmsg_nodetofh) + fhlen; puffs_msgmem_alloc(argsize, &park_nodetofh, (void *)&nodetofh_msg, 1); nodetofh_msg->pvfsr_fhcookie = VPTOPNC(vp); nodetofh_msg->pvfsr_dsize = fhlen; puffs_msg_setinfo(park_nodetofh, PUFFSOP_VFS, PUFFS_VFS_VPTOFH, NULL); PUFFS_MSG_ENQUEUEWAIT(pmp, park_nodetofh, error); error = checkerr(pmp, error, __func__); if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH) fhlen = nodetofh_msg->pvfsr_dsize; else if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_DYNAMIC) fhlen = PUFFS_TOFHSIZE(nodetofh_msg->pvfsr_dsize); else fhlen = PUFFS_TOFHSIZE(pmp->pmp_args.pa_fhsize); if (error) { if (error == E2BIG) *fh_size = fhlen; goto out; } if (fhlen > 
FHANDLE_SIZE_MAX) { puffs_senderr(pmp, PUFFS_ERR_VPTOFH, E2BIG, "file handle too big", VPTOPNC(vp)); error = EPROTO; goto out; } if (*fh_size < fhlen) { *fh_size = fhlen; error = E2BIG; goto out; } *fh_size = fhlen; if (fhp) { if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH) { memcpy(fhp, nodetofh_msg->pvfsr_data, fhlen); } else { fhp->fid_len = *fh_size; memcpy(fhp->fid_data, nodetofh_msg->pvfsr_data, nodetofh_msg->pvfsr_dsize); } } out: puffs_msgmem_release(park_nodetofh); return error; } int puffs_vfsop_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { struct puffs_mount *pmp; struct puffs_node *pnode; KASSERT(key_len == sizeof(puffs_cookie_t)); pmp = MPTOPUFFSMP(mp); /* Allocate and initialize the pnode. */ pnode = pool_get(&puffs_pnpool, PR_WAITOK); memset(pnode, 0, sizeof(struct puffs_node)); pnode->pn_vp = vp; memcpy(&pnode->pn_cookie, key, key_len); pnode->pn_refcount = 1; mutex_init(&pnode->pn_mtx, MUTEX_DEFAULT, IPL_NONE); mutex_init(&pnode->pn_sizemtx, MUTEX_DEFAULT, IPL_NONE); selinit(&pnode->pn_sel); vp->v_tag = VT_PUFFS; vp->v_type = VNON; vp->v_op = puffs_vnodeop_p; if (pnode->pn_cookie == pmp->pmp_root_cookie) vp->v_vflag |= VV_ROOT; vp->v_data = pnode; genfs_node_init(vp, &puffs_genfsops); uvm_vnp_setsize(vp, 0); *new_key = &pnode->pn_cookie; return 0; } void puffs_vfsop_init(void) { /* some checks depend on this */ KASSERT(VNOVAL == VSIZENOTSET); pool_init(&puffs_pnpool, sizeof(struct puffs_node), 0, 0, 0, "puffpnpl", &pool_allocator_nointr, IPL_NONE); pool_init(&puffs_vapool, sizeof(struct vattr), 0, 0, 0, "puffvapl", &pool_allocator_nointr, IPL_NONE); puffs_msgif_init(); } void puffs_vfsop_done(void) { puffs_msgif_destroy(); pool_destroy(&puffs_pnpool); pool_destroy(&puffs_vapool); } int puffs_vfsop_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ts) { return EOPNOTSUPP; } int puffs_vfsop_extattrctl(struct mount *mp, int cmd, struct vnode *vp, int attrnamespace, const char *attrname) { PUFFS_MSG_VARS(vfs, extattrctl); struct puffs_mount *pmp = MPTOPUFFSMP(mp); struct puffs_node *pnp; puffs_cookie_t pnc; int error, flags; if (vp) { /* doesn't make sense for puffs servers */ if (vp->v_mount != mp) return EXDEV; pnp = vp->v_data; pnc = pnp->pn_cookie; flags = PUFFS_EXTATTRCTL_HASNODE; } else { pnp = pnc = NULL; flags = 0; } PUFFS_MSG_ALLOC(vfs, extattrctl); extattrctl_msg->pvfsr_cmd = cmd; extattrctl_msg->pvfsr_attrnamespace = attrnamespace; extattrctl_msg->pvfsr_flags = flags; if (attrname) { strlcpy(extattrctl_msg->pvfsr_attrname, attrname, sizeof(extattrctl_msg->pvfsr_attrname)); extattrctl_msg->pvfsr_flags |= PUFFS_EXTATTRCTL_HASATTRNAME; } puffs_msg_setinfo(park_extattrctl, PUFFSOP_VFS, PUFFS_VFS_EXTATTRCTL, pnc); puffs_msg_enqueue(pmp, park_extattrctl); if (vp) { mutex_enter(&pnp->pn_mtx); puffs_referencenode(pnp); mutex_exit(&pnp->pn_mtx); VOP_UNLOCK(vp); } error = puffs_msg_wait2(pmp, park_extattrctl, pnp, NULL); PUFFS_MSG_RELEASE(extattrctl); if (vp) { puffs_releasenode(pnp); } return checkerr(pmp, error, __func__); } const struct vnodeopv_desc * const puffs_vnodeopv_descs[] = { &puffs_vnodeop_opv_desc, &puffs_specop_opv_desc, &puffs_fifoop_opv_desc, &puffs_msgop_opv_desc, NULL, }; struct vfsops puffs_vfsops = { .vfs_name = MOUNT_PUFFS, .vfs_min_mount_data = sizeof (struct puffs_kargs), .vfs_mount = puffs_vfsop_mount, .vfs_start = puffs_vfsop_start, .vfs_unmount = puffs_vfsop_unmount, .vfs_root = puffs_vfsop_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = puffs_vfsop_statvfs, 
.vfs_sync = puffs_vfsop_sync, .vfs_vget = (void *)eopnotsupp, .vfs_loadvnode = puffs_vfsop_loadvnode, .vfs_fhtovp = puffs_vfsop_fhtovp, .vfs_vptofh = puffs_vfsop_vptofh, .vfs_init = puffs_vfsop_init, .vfs_done = puffs_vfsop_done, .vfs_snapshot = puffs_vfsop_snapshot, .vfs_extattrctl = puffs_vfsop_extattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = puffs_vnodeopv_descs }; static int puffs_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return vfs_attach(&puffs_vfsops); case MODULE_CMD_FINI: return vfs_detach(&puffs_vfsops); default: return ENOTTY; } }
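/*
 * A small sketch of the argument scrubbing done in puffs_vfsop_mount()
 * above: NUL-terminate a name supplied by the user server, replace any
 * non-printable bytes with '.', and clamp the requested maximum message
 * size into a range the kernel accepts.  The function name and the two
 * bounds are illustrative stand-ins for PUFFS_MSG_MAXSIZE and
 * 2*PUFFS_MSGSTRUCT_MAX.
 */
#include <stddef.h>

#define SKETCH_MSG_MIN	8192		/* assumed lower bound */
#define SKETCH_MSG_MAX	1048576		/* assumed upper bound */

static size_t
scrub_mount_args(char *name, size_t namelen, size_t maxmsglen)
{
	size_t i;

	if (namelen > 0) {
		name[namelen - 1] = '\0';
		for (i = 0; name[i] != '\0'; i++)
			if (name[i] < ' ' || name[i] > '~')
				name[i] = '.';
	}

	if (maxmsglen == 0 || maxmsglen > SKETCH_MSG_MAX)
		maxmsglen = SKETCH_MSG_MAX;
	else if (maxmsglen < SKETCH_MSG_MIN)
		maxmsglen = SKETCH_MSG_MIN;
	return maxmsglen;
}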
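/*
 * Sketch of the buffer-size handshake that puffs_vfsop_vptofh() above
 * relies on: when the caller's buffer is too small, store the length
 * actually required and fail with E2BIG so the caller can retry with a
 * buffer of that size.  The function name is an assumption; the real
 * routine additionally distinguishes static, dynamic and passthrough
 * file handle layouts.
 */
#include <errno.h>
#include <string.h>

static int
export_handle_sketch(const unsigned char *handle, size_t handlelen,
    void *buf, size_t *buflenp)
{
	if (*buflenp < handlelen) {
		*buflenp = handlelen;	/* tell the caller how much is needed */
		return E2BIG;
	}
	*buflenp = handlelen;
	if (buf != NULL)
		memcpy(buf, handle, handlelen);
	return 0;
}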
/* $NetBSD: tmpfs_rename.c,v 1.12 2021/10/20 14:28:21 thorpej Exp $ */ /*- * Copyright (c) 2012 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * tmpfs rename */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tmpfs_rename.c,v 1.12 2021/10/20 14:28:21 thorpej Exp $"); #include <sys/param.h> #include <sys/errno.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/vnode_if.h> #include <miscfs/genfs/genfs.h> #include <fs/tmpfs/tmpfs_vnops.h> #include <fs/tmpfs/tmpfs.h> /* * Forward declarations */ static int tmpfs_sane_rename(struct vnode *, struct componentname *, struct vnode *, struct componentname *, kauth_cred_t, bool); static bool tmpfs_rmdired_p(struct vnode *); static int tmpfs_gro_lock_directory(struct mount *, struct vnode *); static const struct genfs_rename_ops tmpfs_genfs_rename_ops; /* * tmpfs_sane_rename: The hairiest vop, with the saner API. * * Arguments: * * . fdvp (from directory vnode), * . fcnp (from component name), * . tdvp (to directory vnode), * . tcnp (to component name), * . cred (credentials structure), and * . posixly_correct (flag for behaviour if target & source link same file). * * fdvp and tdvp may be the same, and must be referenced and unlocked. */ static int tmpfs_sane_rename( struct vnode *fdvp, struct componentname *fcnp, struct vnode *tdvp, struct componentname *tcnp, kauth_cred_t cred, bool posixly_correct) { struct tmpfs_dirent *fdirent, *tdirent; return genfs_sane_rename(&tmpfs_genfs_rename_ops, fdvp, fcnp, &fdirent, tdvp, tcnp, &tdirent, cred, posixly_correct); } /* * tmpfs_rename: The hairiest vop, with the insanest API. Defer to * genfs_insane_rename immediately. */ int tmpfs_rename(void *v) { return genfs_insane_rename(v, &tmpfs_sane_rename); } /* * tmpfs_gro_directory_empty_p: Return true if the directory vp is * empty. dvp is its parent. * * vp and dvp must be locked and referenced. */ static bool tmpfs_gro_directory_empty_p(struct mount *mp, kauth_cred_t cred, struct vnode *vp, struct vnode *dvp) { (void)mp; (void)cred; (void)dvp; KASSERT(mp != NULL); KASSERT(vp != NULL); KASSERT(dvp != NULL); KASSERT(vp != dvp); KASSERT(vp->v_mount == mp); KASSERT(dvp->v_mount == mp); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); return (VP_TO_TMPFS_NODE(vp)->tn_size == 0); } /* * tmpfs_gro_rename_check_possible: Check whether a rename is possible * independent of credentials. 
*/ static int tmpfs_gro_rename_check_possible(struct mount *mp, struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp) { (void)mp; KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == mp); KASSERT(fvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); KASSERT((tvp == NULL) || (tvp->v_mount == mp)); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); return genfs_ufslike_rename_check_possible( VP_TO_TMPFS_NODE(fdvp)->tn_flags, VP_TO_TMPFS_NODE(fvp)->tn_flags, VP_TO_TMPFS_NODE(tdvp)->tn_flags, (tvp? VP_TO_TMPFS_NODE(tvp)->tn_flags : 0), (tvp != NULL), IMMUTABLE, APPEND); } /* * tmpfs_gro_rename_check_permitted: Check whether a rename is * permitted given our credentials. */ static int tmpfs_gro_rename_check_permitted(struct mount *mp, kauth_cred_t cred, struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp) { (void)mp; KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == mp); KASSERT(fvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); KASSERT((tvp == NULL) || (tvp->v_mount == mp)); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); return genfs_ufslike_rename_check_permitted(cred, fdvp, VP_TO_TMPFS_NODE(fdvp)->tn_mode, VP_TO_TMPFS_NODE(fdvp)->tn_uid, fvp, VP_TO_TMPFS_NODE(fvp)->tn_uid, tdvp, VP_TO_TMPFS_NODE(tdvp)->tn_mode, VP_TO_TMPFS_NODE(tdvp)->tn_uid, tvp, (tvp? VP_TO_TMPFS_NODE(tvp)->tn_uid : 0)); } /* * tmpfs_gro_remove_check_possible: Check whether a remove is possible * independent of credentials. */ static int tmpfs_gro_remove_check_possible(struct mount *mp, struct vnode *dvp, struct vnode *vp) { (void)mp; KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(vp != NULL); KASSERT(dvp != vp); KASSERT(dvp->v_type == VDIR); KASSERT(vp->v_type != VDIR); KASSERT(dvp->v_mount == mp); KASSERT(vp->v_mount == mp); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); return genfs_ufslike_remove_check_possible( VP_TO_TMPFS_NODE(dvp)->tn_flags, VP_TO_TMPFS_NODE(vp)->tn_flags, IMMUTABLE, APPEND); } /* * tmpfs_gro_remove_check_permitted: Check whether a remove is * permitted given our credentials. */ static int tmpfs_gro_remove_check_permitted(struct mount *mp, kauth_cred_t cred, struct vnode *dvp, struct vnode *vp) { (void)mp; KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(vp != NULL); KASSERT(dvp != vp); KASSERT(dvp->v_type == VDIR); KASSERT(vp->v_type != VDIR); KASSERT(dvp->v_mount == mp); KASSERT(vp->v_mount == mp); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); return genfs_ufslike_remove_check_permitted(cred, dvp, VP_TO_TMPFS_NODE(dvp)->tn_mode, VP_TO_TMPFS_NODE(dvp)->tn_uid, vp, VP_TO_TMPFS_NODE(vp)->tn_uid); } /* * tmpfs_gro_rename: Actually perform the rename operation. 
*/ static int tmpfs_gro_rename(struct mount *mp, kauth_cred_t cred, struct vnode *fdvp, struct componentname *fcnp, void *fde, struct vnode *fvp, struct vnode *tdvp, struct componentname *tcnp, void *tde, struct vnode *tvp, nlink_t *tvp_nlinkp) { tmpfs_node_t *fdnode = VP_TO_TMPFS_DIR(fdvp); tmpfs_node_t *tdnode = VP_TO_TMPFS_DIR(tdvp); struct tmpfs_dirent **fdep = fde; struct tmpfs_dirent **tdep = tde; char *newname; (void)cred; KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fcnp != NULL); KASSERT(fdep != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(tcnp != NULL); KASSERT(tdep != NULL); KASSERT(fdep != tdep); KASSERT((tvp == NULL) || (*fdep) != (*tdep)); KASSERT((*fdep) != NULL); KASSERT((*fdep)->td_node == VP_TO_TMPFS_NODE(fvp)); KASSERT((tvp == NULL) || ((*tdep) != NULL)); KASSERT((tvp == NULL) || ((*tdep)->td_node == VP_TO_TMPFS_NODE(tvp))); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(fdvp->v_mount == mp); KASSERT(fvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); KASSERT((tvp == NULL) || (tvp->v_mount == mp)); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); if (tmpfs_strname_neqlen(fcnp, tcnp)) { newname = tmpfs_strname_alloc(VFS_TO_TMPFS(mp), tcnp->cn_namelen); if (newname == NULL) return ENOSPC; } else { newname = NULL; } /* * If we are moving from one directory to another, detach the * source entry and reattach it to the target directory. */ if (fdvp != tdvp) { tmpfs_dir_detach(fdnode, *fdep); tmpfs_dir_attach(tdnode, *fdep, VP_TO_TMPFS_NODE(fvp)); } /* * If we are replacing an existing target entry, delete it. * * XXX What if the target is a directory with whiteout entries? */ if (tvp != NULL) { tdnode = VP_TO_TMPFS_DIR(tdvp); KASSERT((*tdep) != NULL); KASSERT((*tdep)->td_node == VP_TO_TMPFS_NODE(tvp)); KASSERT((fvp->v_type == VDIR) == (tvp->v_type == VDIR)); if (tvp->v_type == VDIR) { KASSERT(VP_TO_TMPFS_NODE(tvp)->tn_size == 0); KASSERT(VP_TO_TMPFS_NODE(tvp)->tn_links == 2); /* * Decrement the extra link count for `.' so * the vnode will be recycled when released. */ VP_TO_TMPFS_NODE(tvp)->tn_links--; } tmpfs_dir_detach(tdnode, *tdep); tmpfs_free_dirent(VFS_TO_TMPFS(mp), *tdep); *tvp_nlinkp = VP_TO_TMPFS_NODE(tvp)->tn_links; } /* * Update the directory entry's name if necessary, and flag * metadata updates. A memory allocation failure here is not * OK because we've already committed some changes that we * can't back out at this point, hence the early allocation * above. */ if (newname != NULL) { KASSERT(tcnp->cn_namelen <= TMPFS_MAXNAMLEN); tmpfs_strname_free(VFS_TO_TMPFS(mp), (*fdep)->td_name, (*fdep)->td_namelen); (*fdep)->td_namelen = (uint16_t)tcnp->cn_namelen; (void)memcpy(newname, tcnp->cn_nameptr, tcnp->cn_namelen); (*fdep)->td_name = newname; } /* * Update the timestamps of both parent directories and * the renamed file itself. */ tmpfs_update(fdvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME); tmpfs_update(tdvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME); tmpfs_update(fvp, TMPFS_UPDATE_CTIME); genfs_rename_cache_purge(fdvp, fvp, tdvp, tvp); return 0; } /* * tmpfs_gro_remove: Rename an object over another link to itself, * effectively removing just the original link. 
*/ static int tmpfs_gro_remove(struct mount *mp, kauth_cred_t cred, struct vnode *dvp, struct componentname *cnp, void *de, struct vnode *vp, nlink_t *tvp_nlinkp) { tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp); struct tmpfs_dirent **dep = de; (void)vp; KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(cnp != NULL); KASSERT(dep != NULL); KASSERT(vp != NULL); KASSERT(dvp != vp); KASSERT(dvp->v_mount == mp); KASSERT(vp->v_mount == mp); KASSERT(dvp->v_type == VDIR); KASSERT(vp->v_type != VDIR); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT((*dep)->td_node == VP_TO_TMPFS_NODE(vp)); tmpfs_dir_detach(dnode, *dep); tmpfs_free_dirent(VFS_TO_TMPFS(mp), *dep); tmpfs_update(dvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME); *tvp_nlinkp = VP_TO_TMPFS_NODE(vp)->tn_links; return 0; } /* * tmpfs_gro_lookup: Look up and save the lookup results. */ static int tmpfs_gro_lookup(struct mount *mp, struct vnode *dvp, struct componentname *cnp, void *de_ret, struct vnode **vp_ret) { struct tmpfs_dirent *dirent, **dep_ret = de_ret; struct vnode *vp; int error; (void)mp; KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(cnp != NULL); KASSERT(dep_ret != NULL); KASSERT(vp_ret != NULL); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); dirent = tmpfs_dir_lookup(VP_TO_TMPFS_NODE(dvp), cnp); if (dirent == NULL) return ENOENT; error = vcache_get(mp, &dirent->td_node, sizeof(dirent->td_node), &vp); if (error) return error; KASSERT(vp != NULL); *dep_ret = dirent; *vp_ret = vp; return 0; } /* * tmpfs_rmdired_p: Check whether the directory vp has been rmdired. * * vp must be locked and referenced. */ static bool tmpfs_rmdired_p(struct vnode *vp) { KASSERT(vp != NULL); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(vp->v_type == VDIR); return (VP_TO_TMPFS_NODE(vp)->tn_spec.tn_dir.tn_parent == NULL); } /* * tmpfs_gro_genealogy: Analyze the genealogy of the source and target * directories. */ static int tmpfs_gro_genealogy(struct mount *mp, kauth_cred_t cred, struct vnode *fdvp, struct vnode *tdvp, struct vnode **intermediate_node_ret) { struct vnode *vp, *ovp; struct tmpfs_node *dnode; int error; (void)cred; KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != tdvp); KASSERT(intermediate_node_ret != NULL); KASSERT(fdvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); /* * We need to provisionally lock tdvp to keep rmdir from * deleting it -- or any ancestor -- at an inopportune moment. */ error = tmpfs_gro_lock_directory(mp, tdvp); if (error) return error; vp = tdvp; vref(vp); for (;;) { KASSERT(vp != NULL); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(vp->v_mount == mp); KASSERT(vp->v_type == VDIR); KASSERT(!tmpfs_rmdired_p(vp)); dnode = VP_TO_TMPFS_NODE(vp)->tn_spec.tn_dir.tn_parent; /* * If dnode is null then vp has been rmdir'd, which is * not supposed to happen because we have it locked. */ KASSERT(dnode != NULL); /* Did we hit the root without finding fdvp? */ if (dnode == VP_TO_TMPFS_NODE(vp)) { vput(vp); *intermediate_node_ret = NULL; return 0; } /* Did we find that fdvp is an ancestor of tdvp? */ if (dnode == VP_TO_TMPFS_NODE(fdvp)) { KASSERT(dnode->tn_vnode == fdvp); /* Unlock vp, but keep it referenced. */ VOP_UNLOCK(vp); *intermediate_node_ret = vp; return 0; } /* Neither -- keep ascending the family tree. 
*/ ovp = vp; vp = NULL; error = vcache_get(mp, &dnode, sizeof(dnode), &vp); vput(ovp); if (error) return error; error = vn_lock(vp, LK_EXCLUSIVE); if (error) { vrele(vp); return error; } /* * vcache_get only guarantees that dnode will not * be freed while we get a vnode for it. It does not * preserve any other invariants, so we must check * whether the parent has been removed in the meantime. */ if (tmpfs_rmdired_p(vp)) { vput(vp); return ENOENT; } } } /* * tmpfs_gro_lock_directory: Lock the directory vp, but fail if it has * been rmdir'd. */ static int tmpfs_gro_lock_directory(struct mount *mp, struct vnode *vp) { (void)mp; KASSERT(mp != NULL); KASSERT(vp != NULL); KASSERT(vp->v_mount == mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (tmpfs_rmdired_p(vp)) { VOP_UNLOCK(vp); return ENOENT; } return 0; } static const struct genfs_rename_ops tmpfs_genfs_rename_ops = { .gro_directory_empty_p = tmpfs_gro_directory_empty_p, .gro_rename_check_possible = tmpfs_gro_rename_check_possible, .gro_rename_check_permitted = tmpfs_gro_rename_check_permitted, .gro_remove_check_possible = tmpfs_gro_remove_check_possible, .gro_remove_check_permitted = tmpfs_gro_remove_check_permitted, .gro_rename = tmpfs_gro_rename, .gro_remove = tmpfs_gro_remove, .gro_lookup = tmpfs_gro_lookup, .gro_genealogy = tmpfs_gro_genealogy, .gro_lock_directory = tmpfs_gro_lock_directory, };
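/*
 * A toy version of the ancestry walk in tmpfs_gro_genealogy() above:
 * starting from the target directory, follow parent pointers toward the
 * root and report whether the source directory is ever encountered (in
 * which case the rename would move a directory underneath itself).  The
 * node structure and function name are invented for illustration; the
 * kernel version must also lock each vnode, re-check for a concurrent
 * rmdir, and return the intermediate node it found.
 */
struct toy_dir {
	struct toy_dir *parent;		/* the root points to itself */
};

static int
is_ancestor_sketch(const struct toy_dir *fdir, const struct toy_dir *tdir)
{
	const struct toy_dir *d;

	for (d = tdir; d->parent != d; d = d->parent)
		if (d->parent == fdir)
			return 1;	/* fdir lies on tdir's path to the root */
	return 0;
}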
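/*
 * Sketch of the "allocate before you commit" rule that tmpfs_gro_rename()
 * above follows for the new directory entry name: anything that can fail
 * is done before the first irreversible change, so an allocation failure
 * leaves the directory untouched.  The structure and names are invented
 * for illustration.
 */
#include <stdlib.h>
#include <string.h>

struct entry_sketch {
	char *name;
};

static int
rename_entry_sketch(struct entry_sketch *e, const char *newname)
{
	char *copy;

	copy = strdup(newname);		/* fallible step first */
	if (copy == NULL)
		return -1;		/* nothing has been changed yet */

	/* commit: from here on no step can fail */
	free(e->name);
	e->name = copy;
	return 0;
}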
/* $NetBSD: vnd_30.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $ */ /*- * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vn.c 1.13 94/04/02$ * * @(#)vn.c 8.9 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vnd_30.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/errno.h> #include <sys/malloc.h> #include <sys/ioctl.h> #include <sys/device.h> #include <sys/disk.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/file.h> #include <sys/uio.h> #include <sys/conf.h> #include <sys/vnode.h> #include <sys/compat_stub.h> #include <net/zlib.h> #include <dev/vndvar.h> #include <compat/common/compat_mod.h> static int compat_30_vndioctl(u_long, struct lwp *, void *, int, struct vattr *, int (*)(struct lwp *, void *, int, struct vattr *)); static int compat_30_vndioctl(u_long cmd, struct lwp *l, void *data, int unit, struct vattr *vattr_p, int (*get)(struct lwp *, void *, int, struct vattr *)) { struct vnd_user30 *vnu = data; int error; if (cmd != VNDIOCGET30) return EPASSTHROUGH; error = (*get)(l, data, unit, vattr_p); if (error != 0) return error; vnu->vnu_dev = vattr_p->va_fsid; vnu->vnu_ino = vattr_p->va_fileid; return 0; } void vnd_30_init(void) { MODULE_HOOK_SET(compat_vndioctl_30_hook, compat_30_vndioctl); } void vnd_30_fini(void) { MODULE_HOOK_UNSET(compat_vndioctl_30_hook); }
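/*
 * A minimal sketch of the compat-hook dispatch used by
 * compat_30_vndioctl() above: the hook recognizes only the old ioctl
 * command and otherwise returns a "not mine" sentinel so the caller
 * falls through to the native handler; on a match it fills in only the
 * fields the old binary layout knows about.  OLD_CMD, NOT_MINE and the
 * structures are illustrative assumptions standing in for VNDIOCGET30,
 * EPASSTHROUGH and struct vnd_user30.
 */
#define OLD_CMD		30
#define NOT_MINE	(-2)

struct old_args_sketch { unsigned long dev, ino; };
struct cur_attr_sketch { unsigned long fsid, fileid; };

static int
compat_hook_sketch(unsigned long cmd, void *data,
    const struct cur_attr_sketch *va)
{
	struct old_args_sketch *oa = data;

	if (cmd != OLD_CMD)
		return NOT_MINE;	/* let the native ioctl path handle it */
	oa->dev = va->fsid;		/* copy only what the old ABI exposes */
	oa->ino = va->fileid;
	return 0;
}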
/* $NetBSD: chacha_impl.c,v 1.4 2022/11/05 17:36:33 jmcneill Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/types.h> #include <sys/cdefs.h> #include <sys/errno.h> #include <sys/module.h> #include <sys/once.h> #include <sys/sysctl.h> #include <lib/libkern/libkern.h> #include "chacha.h" #include "chacha_ref.h" static const struct chacha_impl *chacha_md_impl __read_mostly; static const struct chacha_impl *chacha_impl __read_mostly = &chacha_ref_impl; static int sysctl_kern_crypto_chacha_selected(SYSCTLFN_ARGS) { struct sysctlnode node; node = *rnode; node.sysctl_data = __UNCONST(chacha_impl->ci_name); node.sysctl_size = strlen(chacha_impl->ci_name) + 1; return sysctl_lookup(SYSCTLFN_CALL(&node)); } SYSCTL_SETUP(sysctl_kern_crypto_chacha_setup, "sysctl kern.crypto.chacha setup") { const struct sysctlnode *cnode; const struct sysctlnode *chacha_node; sysctl_createv(clog, 0, NULL, &cnode, 0, CTLTYPE_NODE, "crypto", SYSCTL_DESCR("Kernel cryptography"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &cnode, &chacha_node, 0, CTLTYPE_NODE, "chacha", SYSCTL_DESCR("ChaCha"), NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &chacha_node, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_STRING, "selected", SYSCTL_DESCR("Selected ChaCha implementation"), sysctl_kern_crypto_chacha_selected, 0, NULL, 0, CTL_CREATE, CTL_EOL); } static int chacha_select(void) { if (chacha_md_impl) { if (chacha_selftest(chacha_md_impl)) aprint_error("chacha: self-test failed: %s\n", chacha_md_impl->ci_name); else chacha_impl = chacha_md_impl; } aprint_debug("chacha: %s\n", chacha_impl->ci_name); return 0; } MODULE(MODULE_CLASS_MISC, chacha, NULL); static int chacha_modcmd(modcmd_t cmd, void *opaque) { switch (cmd) { case MODULE_CMD_INIT: return chacha_select(); case MODULE_CMD_FINI: return 0; default: return ENOTTY; } } void chacha_md_init(const struct chacha_impl *impl) { KASSERT(cold); KASSERTMSG(chacha_md_impl == NULL, "ChaCha implementation `%s' already offered, can't offer `%s'", chacha_md_impl->ci_name, impl->ci_name); chacha_md_impl = impl; } void chacha_core(uint8_t out[restrict static CHACHA_CORE_OUTBYTES], const uint8_t in[static CHACHA_CORE_INBYTES], const uint8_t k[static CHACHA_CORE_KEYBYTES], const uint8_t c[static CHACHA_CORE_CONSTBYTES], unsigned nr) { (*chacha_impl->ci_chacha_core)(out, in, k, c, nr); } void hchacha(uint8_t out[restrict static HCHACHA_OUTBYTES], const uint8_t in[static HCHACHA_INBYTES], const uint8_t k[static HCHACHA_KEYBYTES], const uint8_t c[static HCHACHA_CONSTBYTES], unsigned nr) { (*chacha_impl->ci_hchacha)(out, in, k, c, nr); } void chacha_stream(uint8_t *restrict s, size_t nbytes, uint32_t blkno, const uint8_t nonce[static CHACHA_STREAM_NONCEBYTES], const uint8_t key[static CHACHA_STREAM_KEYBYTES], unsigned nr) { (*chacha_impl->ci_chacha_stream)(s, nbytes, blkno, nonce, key, nr); } void chacha_stream_xor(uint8_t *c, const uint8_t *p, size_t nbytes, uint32_t blkno, const uint8_t nonce[static CHACHA_STREAM_NONCEBYTES], const uint8_t key[static CHACHA_STREAM_KEYBYTES], unsigned nr) { (*chacha_impl->ci_chacha_stream_xor)(c, p, nbytes, blkno, nonce, key, nr); } void xchacha_stream(uint8_t *restrict s, size_t nbytes, uint32_t blkno, const uint8_t nonce[static XCHACHA_STREAM_NONCEBYTES], const uint8_t key[static XCHACHA_STREAM_KEYBYTES], unsigned nr) { (*chacha_impl->ci_xchacha_stream)(s, nbytes, blkno, nonce, key, nr); } void xchacha_stream_xor(uint8_t *c, const uint8_t *p, size_t nbytes, uint32_t blkno, const uint8_t nonce[static XCHACHA_STREAM_NONCEBYTES], const uint8_t key[static XCHACHA_STREAM_KEYBYTES], unsigned nr) { 
(*chacha_impl->ci_xchacha_stream_xor)(c, p, nbytes, blkno, nonce, key, nr); }
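chacha_md_init() is the only way a machine-dependent backend gets considered by chacha_select() above, and it must run during early boot while the kernel is still cold. The fragment below is an illustrative sketch of such a registration, assuming the struct chacha_impl definition and the CHACHA_CORE_* constants are visible through the subsystem's headers as they are in this file; the "myarch" names and the empty function body are placeholders, not an actual NetBSD implementation.

/* Hypothetical vectorized core; a real backend fills in every ci_* hook. */
static void
chacha_core_myarch(uint8_t out[restrict static CHACHA_CORE_OUTBYTES],
    const uint8_t in[static CHACHA_CORE_INBYTES],
    const uint8_t k[static CHACHA_CORE_KEYBYTES],
    const uint8_t c[static CHACHA_CORE_CONSTBYTES],
    unsigned nr)
{

	/* architecture-specific ChaCha permutation would go here */
}

static const struct chacha_impl chacha_myarch_impl = {
	.ci_name = "myarch",
	.ci_chacha_core = chacha_core_myarch,
	/*
	 * .ci_hchacha, .ci_chacha_stream, .ci_chacha_stream_xor,
	 * .ci_xchacha_stream and .ci_xchacha_stream_xor set likewise
	 */
};

void
chacha_md_init_myarch(void)
{

	/* Early boot only: chacha_md_init() asserts that `cold' is true. */
	chacha_md_init(&chacha_myarch_impl);
}

chacha_select() then runs chacha_selftest() against the offered backend at module init time and falls back to chacha_ref_impl if the self-test fails.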
/* $NetBSD: ip6_mroute.c,v 1.132 2020/06/12 11:04:45 roy Exp $ */ /* $KAME: ip6_mroute.c,v 1.49 2001/07/25 09:21:18 jinmei Exp $ */ /* * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved.
* * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 */ /* * Copyright (c) 1989 Stephen Deering * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1994 * * MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ip6_mroute.c,v 1.132 2020/06/12 11:04:45 roy Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_mrouting.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sockio.h> #include <sys/errno.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/ioctl.h> #include <sys/sysctl.h> #include <sys/syslog.h> #include <net/if.h> #include <net/route.h> #include <net/raw_cb.h> #include <net/net_stats.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/icmp6.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/ip6_private.h> #include <netinet6/ip6_mroute.h> #include <netinet6/scope6_var.h> #include <netinet6/pim6.h> #include <netinet6/pim6_var.h> #include <netinet6/nd6.h> static int ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *); static void phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); static int set_pim6(int *); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in6 *); static int register_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); /* * Globals. All but ip6_mrouter, ip6_mrtproto and mrt6stat could be static, * except for netstat or debugging purposes. */ struct socket *ip6_mrouter = NULL; int ip6_mrouter_ver = 0; int ip6_mrtproto = IPPROTO_PIM; /* for netstat only */ struct mrt6stat mrt6stat; #define NO_RTE_FOUND 0x1 #define RTE_FOUND 0x2 struct mf6c *mf6ctable[MF6CTBLSIZ]; u_char n6expire[MF6CTBLSIZ]; struct mif6 mif6table[MAXMIFS]; #ifdef MRT6DEBUG u_int mrt6debug = 0; /* debug level */ #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 #define DEBUG_REG 0x20 #define DEBUG_PIM 0x40 #define __mrt6debugused /* empty */ #else #define __mrt6debugused __unused #endif static void expire_upcalls(void *); #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ #ifdef INET #ifdef MROUTING extern struct socket *ip_mrouter; #endif #endif /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected * by a broken gateway). These interfaces are never linked into * the system ifnet list & no routes point to them. I.e., packets * can't be sent this way. They only exist as a placeholder for * multicast source verification. */ struct ifnet multicast_register_if6; #define ENCAP_HOPS 64 /* * Private variables. */ static mifi_t nummifs = 0; static mifi_t reg_mif_num = (mifi_t)-1; static percpu_t *pim6stat_percpu; #define PIM6_STATINC(x) _NET_STATINC(pim6stat_percpu, x) static int pim6; /* * Hash function for a source, group entry */ #define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \ (a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \ (g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \ (g).s6_addr32[2] ^ (g).s6_addr32[3]) /* * Find a route for a given origin IPv6 address and Multicast group address. 
* Quality of service parameter to be added in the future!!! */ #define MF6CFIND(o, g, rt) do { \ struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \ rt = NULL; \ mrt6stat.mrt6s_mfc_lookups++; \ while (_rt) { \ if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \ IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \ (_rt->mf6c_stall == NULL)) { \ rt = _rt; \ break; \ } \ _rt = _rt->mf6c_next; \ } \ if (rt == NULL) { \ mrt6stat.mrt6s_mfc_misses++; \ } \ } while (/*CONSTCOND*/ 0) /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code */ #define TV_DELTA(a, b, delta) do { \ int xxs; \ \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* FALLTHROUGH */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } while (/*CONSTCOND*/ 0) #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) #ifdef UPCALL_TIMING #define UPCALL_MAX 50 u_long upcall_data[UPCALL_MAX + 1]; static void collate(); #endif /* UPCALL_TIMING */ static int get_sg_cnt(struct sioc_sg_req6 *); static int get_mif6_cnt(struct sioc_mif_req6 *); static int ip6_mrouter_init(struct socket *, int, int); static int add_m6if(struct mif6ctl *); static int del_m6if(mifi_t *); static int add_m6fc(struct mf6cctl *); static int del_m6fc(struct mf6cctl *); static void sysctl_net_inet6_pim6_setup(struct sysctllog **); static callout_t expire_upcalls_ch; void pim6_init(void) { sysctl_net_inet6_pim6_setup(NULL); pim6stat_percpu = percpu_alloc(sizeof(uint64_t) * PIM6_NSTATS); } /* * Handle MRT setsockopt commands to modify the multicast routing tables. */ int ip6_mrouter_set(struct socket *so, struct sockopt *sopt) { int error, optval; struct mif6ctl mifc; struct mf6cctl mfcc; mifi_t mifi; if (sopt->sopt_name != MRT6_INIT && so != ip6_mrouter) return (EACCES); error = 0; switch (sopt->sopt_name) { #ifdef MRT6_OINIT case MRT6_OINIT: #endif case MRT6_INIT: error = sockopt_getint(sopt, &optval); if (error) break; return (ip6_mrouter_init(so, optval, sopt->sopt_name)); case MRT6_DONE: return (ip6_mrouter_done()); case MRT6_ADD_MIF: error = sockopt_get(sopt, &mifc, sizeof(mifc)); if (error) break; return (add_m6if(&mifc)); case MRT6_DEL_MIF: error = sockopt_get(sopt, &mifi, sizeof(mifi)); if (error) break; return (del_m6if(&mifi)); case MRT6_ADD_MFC: error = sockopt_get(sopt, &mfcc, sizeof(mfcc)); if (error) break; return (add_m6fc(&mfcc)); case MRT6_DEL_MFC: error = sockopt_get(sopt, &mfcc, sizeof(mfcc)); if (error) break; return (del_m6fc(&mfcc)); case MRT6_PIM: error = sockopt_getint(sopt, &optval); if (error) break; return (set_pim6(&optval)); default: error = EOPNOTSUPP; } return (error); } /* * Handle MRT getsockopt commands */ int ip6_mrouter_get(struct socket *so, struct sockopt *sopt) { int error; if (so != ip6_mrouter) return EACCES; error = 0; switch (sopt->sopt_name) { case MRT6_PIM: error = sockopt_set(sopt, &pim6, sizeof(pim6)); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Handle ioctl commands to obtain information from the cache */ int mrt6_ioctl(u_long cmd, void *data) { switch (cmd) { case SIOCGETSGCNT_IN6: return (get_sg_cnt((struct sioc_sg_req6 *)data)); case SIOCGETMIFCNT_IN6: return (get_mif6_cnt((struct sioc_mif_req6 *)data)); default: return (EINVAL); } } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req6 *req) { struct mf6c 
*rt; int s; s = splsoftnet(); MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt); splx(s); if (rt != NULL) { req->pktcnt = rt->mf6c_pkt_cnt; req->bytecnt = rt->mf6c_byte_cnt; req->wrong_if = rt->mf6c_wrong_if; } else return (ESRCH); #if 0 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; #endif return 0; } /* * returns the input and output packet and byte counts on the mif provided */ static int get_mif6_cnt(struct sioc_mif_req6 *req) { mifi_t mifi = req->mifi; if (mifi >= nummifs) return EINVAL; req->icount = mif6table[mifi].m6_pkt_in; req->ocount = mif6table[mifi].m6_pkt_out; req->ibytes = mif6table[mifi].m6_bytes_in; req->obytes = mif6table[mifi].m6_bytes_out; return 0; } static int set_pim6(int *i) { if ((*i != 1) && (*i != 0)) return EINVAL; pim6 = *i; return 0; } /* * Enable multicast routing */ static int ip6_mrouter_init(struct socket *so, int v, int cmd) { #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "ip6_mrouter_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); #endif if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_ICMPV6) return EOPNOTSUPP; if (v != 1) return ENOPROTOOPT; if (ip6_mrouter != NULL) return EADDRINUSE; ip6_mrouter = so; ip6_mrouter_ver = cmd; memset((void *)mf6ctable, 0, sizeof(mf6ctable)); memset((void *)n6expire, 0, sizeof(n6expire)); pim6 = 0;/* used for stubbing out/in pim stuff */ callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "ip6_mrouter_init\n"); #endif return 0; } /* * Disable multicast routing */ int ip6_mrouter_done(void) { mifi_t mifi; int i; struct ifnet *ifp; struct sockaddr_in6 sin6; struct mf6c *rt; struct rtdetq *rte; int s; s = splsoftnet(); /* * For each phyint in use, disable promiscuous reception of all IPv6 * multicasts. */ #ifdef INET #ifdef MROUTING /* * If there is still IPv4 multicast routing daemon, * we remain interfaces to receive all muliticasted packets. * XXX: there may be an interface in which the IPv4 multicast * daemon is not interested... */ if (!ip_mrouter) #endif #endif { for (mifi = 0; mifi < nummifs; mifi++) { if (mif6table[mifi].m6_ifp && !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { ifp = mif6table[mifi].m6_ifp; sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0); if_mcast_op(ifp, SIOCDELMULTI, sin6tocsa(&sin6)); } } } memset((void *)mif6table, 0, sizeof(mif6table)); nummifs = 0; pim6 = 0; /* used to stub out/in pim specific code */ callout_stop(&expire_upcalls_ch); /* * Free all multicast forwarding cache entries. */ for (i = 0; i < MF6CTBLSIZ; i++) { rt = mf6ctable[i]; while (rt) { struct mf6c *frt; for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } frt = rt; rt = rt->mf6c_next; free(frt, M_MRTABLE); } } memset((void *)mf6ctable, 0, sizeof(mf6ctable)); /* * Reset register interface */ if (reg_mif_num != (mifi_t)-1) { if_detach(&multicast_register_if6); reg_mif_num = (mifi_t)-1; } ip6_mrouter = NULL; ip6_mrouter_ver = 0; splx(s); #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "ip6_mrouter_done\n"); #endif return 0; } void ip6_mrouter_detach(struct ifnet *ifp) { struct rtdetq *rte; struct mf6c *mfc; mifi_t mifi; int i; if (ip6_mrouter == NULL) return; /* * Delete a mif which points to ifp. */ for (mifi = 0; mifi < nummifs; mifi++) if (mif6table[mifi].m6_ifp == ifp) del_m6if(&mifi); /* * Clear rte->ifp of cache entries received on ifp. 
*/ for (i = 0; i < MF6CTBLSIZ; i++) { if (n6expire[i] == 0) continue; for (mfc = mf6ctable[i]; mfc != NULL; mfc = mfc->mf6c_next) { for (rte = mfc->mf6c_stall; rte != NULL; rte = rte->next) { if (rte->ifp == ifp) rte->ifp = NULL; } } } } /* * Add a mif to the mif table */ static int add_m6if(struct mif6ctl *mifcp) { struct mif6 *mifp; struct ifnet *ifp; struct sockaddr_in6 sin6; int error, s; if (mifcp->mif6c_mifi >= MAXMIFS) return EINVAL; mifp = mif6table + mifcp->mif6c_mifi; if (mifp->m6_ifp) return EADDRINUSE; /* XXX: is it appropriate? */ if (!mifcp->mif6c_pifi || (ifp = if_byindex(mifcp->mif6c_pifi)) == NULL) return ENXIO; if (mifcp->mif6c_flags & MIFF_REGISTER) { ifp = &multicast_register_if6; if (reg_mif_num == (mifi_t)-1) { strlcpy(ifp->if_xname, "register_mif", sizeof(ifp->if_xname)); ifp->if_flags |= IFF_LOOPBACK; ifp->if_index = mifcp->mif6c_mifi; reg_mif_num = mifcp->mif6c_mifi; if_attach(ifp); } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return EOPNOTSUPP; s = splsoftnet(); /* * Enable promiscuous reception of all IPv6 multicasts * from the interface. */ sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0); error = if_mcast_op(ifp, SIOCADDMULTI, sin6tosa(&sin6)); splx(s); if (error) return error; } s = splsoftnet(); mifp->m6_flags = mifcp->mif6c_flags; mifp->m6_ifp = ifp; /* initialize per mif pkt counters */ mifp->m6_pkt_in = 0; mifp->m6_pkt_out = 0; mifp->m6_bytes_in = 0; mifp->m6_bytes_out = 0; splx(s); /* Adjust nummifs up if the mifi is higher than nummifs */ if (nummifs <= mifcp->mif6c_mifi) nummifs = mifcp->mif6c_mifi + 1; #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "add_mif #%d, phyint %s\n", mifcp->mif6c_mifi, ifp->if_xname); #endif return 0; } /* * Delete a mif from the mif table */ static int del_m6if(mifi_t *mifip) { struct mif6 *mifp = mif6table + *mifip; mifi_t mifi; struct ifnet *ifp; struct sockaddr_in6 sin6; int s; if (*mifip >= nummifs) return EINVAL; if (mifp->m6_ifp == NULL) return EINVAL; s = splsoftnet(); if (!(mifp->m6_flags & MIFF_REGISTER)) { /* * XXX: what if there is yet IPv4 multicast daemon * using the interface? 
*/ ifp = mifp->m6_ifp; sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0); if_mcast_op(ifp, SIOCDELMULTI, sin6tosa(&sin6)); } else { if (reg_mif_num != (mifi_t)-1) { if_detach(&multicast_register_if6); reg_mif_num = (mifi_t)-1; } } memset((void *)mifp, 0, sizeof (*mifp)); /* Adjust nummifs down */ for (mifi = nummifs; mifi > 0; mifi--) if (mif6table[mifi - 1].m6_ifp) break; nummifs = mifi; splx(s); #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "del_m6if %d, nummifs %d\n", *mifip, nummifs); #endif return 0; } /* * Add an mfc entry */ static int add_m6fc(struct mf6cctl *mfccp) { struct mf6c *rt; u_long hash; struct rtdetq *rte; u_short nstl; int s; char ip6bufo[INET6_ADDRSTRLEN], ip6bufm[INET6_ADDRSTRLEN]; MF6CFIND(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr, rt); /* If an entry already exists, just update the fields */ if (rt) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) log(LOG_DEBUG,"add_m6fc update o %s g %s p %x\n", IN6_PRINT(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), IN6_PRINT(ip6bufm, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); #endif s = splsoftnet(); rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; splx(s); return 0; } /* * Find the entry for which the upcall was made and update */ s = splsoftnet(); hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr); for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) { if (nstl++) log(LOG_ERR, "add_m6fc: %s o %s g %s p %x dbx %p\n", "multiple kernel entries", IN6_PRINT(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), IN6_PRINT(ip6bufm, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) log(LOG_DEBUG, "add_m6fc o %s g %s p %x dbg %p\n", IN6_PRINT(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), IN6_PRINT(ip6bufm, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); #endif rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; /* Don't clean this guy up */ n6expire[hash]--; /* free packets Qed at the end of this entry */ for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; if (rte->ifp) { ip6_mdq(rte->m, rte->ifp, rt); } m_freem(rte->m); #ifdef UPCALL_TIMING collate(&(rte->t)); #endif free(rte, M_MRTABLE); rte = n; } rt->mf6c_stall = NULL; } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) log(LOG_DEBUG, "add_mfc no upcall h %ld o %s g %s p %x\n", hash, IN6_PRINT(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), IN6_PRINT(ip6bufm, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); #endif for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr)&& IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr)) { rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; 
rt->mf6c_wrong_if = 0; if (rt->mf6c_expire) n6expire[hash]--; rt->mf6c_expire = 0; } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { splx(s); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; rt->mf6c_stall = NULL; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; } } splx(s); return 0; } #ifdef UPCALL_TIMING /* * collect delay statistics on the upcalls */ static void collate(struct timeval *t) { u_long d; struct timeval tp; u_long delta; GET_TIME(tp); if (TV_LT(*t, tp)) { TV_DELTA(tp, *t, delta); d = delta >> 10; if (d > UPCALL_MAX) d = UPCALL_MAX; ++upcall_data[d]; } } #endif /* UPCALL_TIMING */ /* * Delete an mfc entry */ static int del_m6fc(struct mf6cctl *mfccp) { struct sockaddr_in6 origin; struct sockaddr_in6 mcastgrp; struct mf6c *rt; struct mf6c **nptr; u_long hash; int s; origin = mfccp->mf6cc_origin; mcastgrp = mfccp->mf6cc_mcastgrp; hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) { char ip6bufo[INET6_ADDRSTRLEN], ip6bufm[INET6_ADDRSTRLEN]; log(LOG_DEBUG,"del_m6fc orig %s mcastgrp %s\n", IN6_PRINT(ip6bufo, &origin.sin6_addr), IN6_PRINT(ip6bufm, &mcastgrp.sin6_addr)); } #endif s = splsoftnet(); nptr = &mf6ctable[hash]; while ((rt = *nptr) != NULL) { if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr, &rt->mf6c_mcastgrp.sin6_addr) && rt->mf6c_stall == NULL) break; nptr = &rt->mf6c_next; } if (rt == NULL) { splx(s); return EADDRNOTAVAIL; } *nptr = rt->mf6c_next; free(rt, M_MRTABLE); splx(s); return 0; } static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in6 *src) { if (s) { if (sbappendaddr(&s->so_rcv, sin6tosa(src), mm, NULL) != 0) { sorwakeup(s); return 0; } soroverflow(s); } m_freem(mm); return -1; } /* * IPv6 multicast forwarding function. This function assumes that the packet * pointed to by "ip6" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IPv6 multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ int ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m) { struct mf6c *rt; struct mif6 *mifp; struct mbuf *mm; int s; mifi_t mifi; struct sockaddr_in6 sin6; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #ifdef MRT6DEBUG if (mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "ip6_mforward: src %s, dst %s, ifindex %d\n", IN6_PRINT(ip6bufs, &ip6->ip6_src), IN6_PRINT(ip6bufd, &ip6->ip6_dst), ifp->if_index); #endif /* * Don't forward a packet with Hop limit of zero or one, * or a packet destined to a local-only group. */ if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_NODELOCAL(&ip6->ip6_dst) || IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) return 0; ip6->ip6_hlim--; /* * Source address check: do not forward packets with unspecified * source. It was discussed in July 2000, on ipngwg mailing list. 
* This is rather more serious than unicast cases, because some * MLD packets can be sent with the unspecified source address * (although such packets must normally set the hop limit field to 1). */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { IP6_STATINC(IP6_STAT_CANTFORWARD); if (ip6_log_time + ip6_log_interval < time_uptime) { ip6_log_time = time_uptime; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", IN6_PRINT(ip6bufs, &ip6->ip6_src), IN6_PRINT(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, m->m_pkthdr.rcvif_index ? if_name(m_get_rcvif_NOMPSAFE(m)) : "?"); } return 0; } /* * Determine forwarding mifs from the forwarding cache table */ s = splsoftnet(); MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt); /* Entry exists, so forward if necessary */ if (rt) { splx(s); return ip6_mdq(m, ifp, rt); } else { /* * If we don't have a route for packet's origin, make a copy * of the packet and send message to routing daemon. */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; #ifdef UPCALL_TIMING struct timeval tp; GET_TIME(tp); #endif mrt6stat.mrt6s_no_route++; #ifdef MRT6DEBUG if (mrt6debug & (DEBUG_FORWARD | DEBUG_MFC)) log(LOG_DEBUG, "ip6_mforward: no rte s %s g %s\n", IN6_PRINT(ip6bufs, &ip6->ip6_src), IN6_PRINT(ip6bufd, &ip6->ip6_dst)); #endif /* * Allocate mbufs early so that we don't do extra work if we * are just going to fail anyway. */ rte = malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT); if (rte == NULL) { splx(s); return ENOBUFS; } mb0 = m_copypacket(m, M_DONTWAIT); /* * Pullup packet header if needed before storing it, * as other references may modify it in the meantime. */ if (mb0 && M_UNWRITABLE(mb0, sizeof(struct ip6_hdr))) mb0 = m_pullup(mb0, sizeof(struct ip6_hdr)); if (mb0 == NULL) { free(rte, M_MRTABLE); splx(s); return ENOBUFS; } /* is there an upcall waiting for this packet? 
*/ hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst); for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &rt->mf6c_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) break; } if (rt == NULL) { struct mrt6msg *im; struct omrt6msg *oim; /* no upcall, so make a new entry */ rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { free(rte, M_MRTABLE); m_freem(mb0); splx(s); return ENOBUFS; } /* * Make a copy of the header to send to the user * level process */ mm = m_copym(mb0, 0, sizeof(struct ip6_hdr), M_DONTWAIT); if (mm == NULL) { free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return ENOBUFS; } /* * Send message to routing daemon */ sockaddr_in6_init(&sin6, &ip6->ip6_src, 0, 0, 0); im = NULL; oim = NULL; switch (ip6_mrouter_ver) { case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_NOCACHE; oim->im6_mbz = 0; break; case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_NOCACHE; im->im6_mbz = 0; break; default: free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return EINVAL; } #ifdef MRT6DEBUG if (mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "getting the iif info in the kernel\n"); #endif for (mifp = mif6table, mifi = 0; mifi < nummifs && mifp->m6_ifp != ifp; mifp++, mifi++) ; switch (ip6_mrouter_ver) { case MRT6_OINIT: oim->im6_mif = mifi; break; case MRT6_INIT: im->im6_mif = mifi; break; } if (socket_send(ip6_mrouter, mm, &sin6) < 0) { log(LOG_WARNING, "ip6_mforward: ip6_mrouter " "socket queue full\n"); mrt6stat.mrt6s_upq_sockfull++; free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return ENOBUFS; } mrt6stat.mrt6s_upcalls++; /* insert new entry at head of hash chain */ memset(rt, 0, sizeof(*rt)); sockaddr_in6_init(&rt->mf6c_origin, &ip6->ip6_src, 0, 0, 0); sockaddr_in6_init(&rt->mf6c_mcastgrp, &ip6->ip6_dst, 0, 0, 0); rt->mf6c_expire = UPCALL_EXPIRE; n6expire[hash]++; rt->mf6c_parent = MF6C_INCOMPLETE_PARENT; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; /* Add this entry to the end of the queue */ rt->mf6c_stall = rte; } else { /* determine if q has overflowed */ struct rtdetq **p; int npkts = 0; for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next) { if (++npkts > MAX_UPQ6) { mrt6stat.mrt6s_upq_ovflw++; free(rte, M_MRTABLE); m_freem(mb0); splx(s); return 0; } } /* Add this entry to the end of the queue */ *p = rte; } rte->next = NULL; rte->m = mb0; rte->ifp = ifp; #ifdef UPCALL_TIMING rte->t = tp; #endif splx(s); return 0; } } /* * Clean up cache entries if upcalls are not serviced * Call from the Slow Timeout mechanism, every 0.25 seconds. 
*/ static void expire_upcalls(void *unused) { struct rtdetq *rte; struct mf6c *mfc, **nptr; int i; /* XXX NOMPSAFE still need softnet_lock */ mutex_enter(softnet_lock); KERNEL_LOCK(1, NULL); for (i = 0; i < MF6CTBLSIZ; i++) { if (n6expire[i] == 0) continue; nptr = &mf6ctable[i]; while ((mfc = *nptr) != NULL) { rte = mfc->mf6c_stall; /* * Skip real cache entries * Make sure it wasn't marked to not expire (shouldn't happen) * If it expires now */ if (rte != NULL && mfc->mf6c_expire != 0 && --mfc->mf6c_expire == 0) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_EXPIRE) { char ip6bufo[INET6_ADDRSTRLEN]; char ip6bufm[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "expire_upcalls: expiring (%s %s)\n", IN6_PRINT(ip6bufo, &mfc->mf6c_origin.sin6_addr), IN6_PRINT(ip6bufm, &mfc->mf6c_mcastgrp.sin6_addr)); } #endif /* * drop all the packets * free the mbuf with the pkt, if, timing info */ do { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } while (rte != NULL); mrt6stat.mrt6s_cache_cleanups++; n6expire[i]--; *nptr = mfc->mf6c_next; free(mfc, M_MRTABLE); } else { nptr = &mfc->mf6c_next; } } } callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); KERNEL_UNLOCK_ONE(NULL); mutex_exit(softnet_lock); } /* * Macro to send packet on mif. Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is * separate. */ #define MC6_SEND(ip6, mifp, m) do { \ if ((mifp)->m6_flags & MIFF_REGISTER) \ register_send((ip6), (mifp), (m)); \ else \ phyint_send((ip6), (mifp), (m)); \ } while (/*CONSTCOND*/ 0) /* * Packet forwarding routine once entry in the cache is made */ static int ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); mifi_t mifi, iif; struct mif6 *mifp; int plen = m->m_pkthdr.len; struct in6_addr src0, dst0; /* copies for local work */ u_int32_t iszone, idzone, oszone, odzone; int error = 0; /* * Don't forward if it didn't arrive from the parent mif * for its origin. */ mifi = rt->mf6c_parent; if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) { /* came in the wrong interface */ #ifdef MRT6DEBUG if (mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "wrong if: ifid %d mifi %d mififid %x\n", ifp->if_index, mifi, mif6table[mifi].m6_ifp ? mif6table[mifi].m6_ifp->if_index : -1); #endif mrt6stat.mrt6s_wrong_if++; rt->mf6c_wrong_if++; /* * If we are doing PIM processing, and we are forwarding * packets on this interface, send a message to the * routing daemon. */ /* have to make sure this is a valid mif */ if (mifi < nummifs && mif6table[mifi].m6_ifp) { if (pim6 && (m->m_flags & M_LOOP) == 0) { /* * Check the M_LOOP flag to avoid an * unnecessary PIM assert. * XXX: M_LOOP is an ad-hoc hack... 
*/ struct sockaddr_in6 sin6; struct mbuf *mm; struct mrt6msg *im; struct omrt6msg *oim; mm = m_copym(m, 0, sizeof(struct ip6_hdr), M_DONTWAIT); if (mm && M_UNWRITABLE(mm, sizeof(struct ip6_hdr))) mm = m_pullup(mm, sizeof(struct ip6_hdr)); if (mm == NULL) return ENOBUFS; oim = NULL; im = NULL; switch (ip6_mrouter_ver) { case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_WRONGMIF; oim->im6_mbz = 0; break; case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_WRONGMIF; im->im6_mbz = 0; break; default: m_freem(mm); return EINVAL; } for (mifp = mif6table, iif = 0; iif < nummifs && mifp && mifp->m6_ifp != ifp; mifp++, iif++) ; memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(sin6); sin6.sin6_family = AF_INET6; switch (ip6_mrouter_ver) { case MRT6_OINIT: oim->im6_mif = iif; sin6.sin6_addr = oim->im6_src; break; case MRT6_INIT: im->im6_mif = iif; sin6.sin6_addr = im->im6_src; break; } mrt6stat.mrt6s_upcalls++; if (socket_send(ip6_mrouter, mm, &sin6) < 0) { #ifdef MRT6DEBUG if (mrt6debug) log(LOG_WARNING, "mdq, ip6_mrouter socket queue full\n"); #endif ++mrt6stat.mrt6s_upq_sockfull; return ENOBUFS; } } } return 0; } /* If I sourced this packet, it counts as output, else it was input. */ if (m->m_pkthdr.rcvif_index == 0) { /* XXX: is rcvif really NULL when output?? */ mif6table[mifi].m6_pkt_out++; mif6table[mifi].m6_bytes_out += plen; } else { mif6table[mifi].m6_pkt_in++; mif6table[mifi].m6_bytes_in += plen; } rt->mf6c_pkt_cnt++; rt->mf6c_byte_cnt += plen; /* * For each mif, forward a copy of the packet if there are group * members downstream on the interface. */ src0 = ip6->ip6_src; dst0 = ip6->ip6_dst; if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 || (error = in6_setscope(&dst0, ifp, &idzone)) != 0) { IP6_STATINC(IP6_STAT_BADSCOPE); return error; } for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) { if (IF_ISSET(mifi, &rt->mf6c_ifset)) { if (mif6table[mifi].m6_ifp == NULL) continue; /* * check if the outgoing packet is going to break * a scope boundary. * XXX: For packets through PIM register tunnel * interface, we believe the routing daemon. */ if ((mif6table[rt->mf6c_parent].m6_flags & MIFF_REGISTER) == 0 && (mif6table[mifi].m6_flags & MIFF_REGISTER) == 0) { if (in6_setscope(&src0, mif6table[mifi].m6_ifp, &oszone) || in6_setscope(&dst0, mif6table[mifi].m6_ifp, &odzone) || iszone != oszone || idzone != odzone) { IP6_STATINC(IP6_STAT_BADSCOPE); continue; } } mifp->m6_pkt_out++; mifp->m6_bytes_out += plen; MC6_SEND(ip6, mifp, m); } } return 0; } static void phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m) { struct mbuf *mb_copy; struct ifnet *ifp = mifp->m6_ifp; int error __mrt6debugused = 0; int s; static struct route ro; bool ingroup; struct sockaddr_in6 dst6; s = splsoftnet(); /* * Make a new reference to the packet; make sure that * the IPv6 header is actually copied, not just referenced, * so that ip6_output() only scribbles on the copy. */ mb_copy = m_copypacket(m, M_DONTWAIT); if (mb_copy && M_UNWRITABLE(mb_copy, sizeof(struct ip6_hdr))) mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr)); if (mb_copy == NULL) { splx(s); return; } /* set MCAST flag to the outgoing packet */ mb_copy->m_flags |= M_MCAST; /* * If we sourced the packet, call ip6_output since we may divide * the packet into fragments when the packet is too big for the * outgoing interface. * Otherwise, we can simply send the packet to the interface * sending queue. 
*/ if (m->m_pkthdr.rcvif_index == 0) { struct ip6_moptions im6o; im6o.im6o_multicast_if_index = if_get_index(ifp); /* XXX: ip6_output will override ip6->ip6_hlim */ im6o.im6o_multicast_hlim = ip6->ip6_hlim; im6o.im6o_multicast_loop = 1; error = ip6_output(mb_copy, NULL, &ro, IPV6_FORWARDING, &im6o, NULL, NULL); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on mif %td err %d\n", mifp - mif6table, error); #endif splx(s); return; } /* * If we belong to the destination multicast group * on the outgoing interface, loop back a copy. */ /* * Does not have to check source info, as it's already covered by * ip6_input */ sockaddr_in6_init(&dst6, &ip6->ip6_dst, 0, 0, 0); ingroup = in6_multi_group(&ip6->ip6_dst, ifp); if (ingroup) { ip6_mloopback(ifp, m, satocsin6(rtcache_getdst(&ro))); } /* * Put the packet into the sending queue of the outgoing interface * if it would fit in the MTU of the interface. */ if (mb_copy->m_pkthdr.len <= ifp->if_mtu || ifp->if_mtu < IPV6_MMTU) { error = ip6_if_output(ifp, ifp, mb_copy, &dst6, NULL); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on mif %td err %d\n", mifp - mif6table, error); #endif } else { /* * pMTU discovery is intentionally disabled by default, since * various routers may notify pMTU in multicast, which can be * a DDoS to a router. */ if (ip6_mcast_pmtu) { icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); } else { /* simply discard the packet */ #ifdef MRT6DEBUG if (mrt6debug & DEBUG_XMIT) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "phyint_send: packet too big on %s o %s g %s" " size %d(discarded)\n", if_name(ifp), IN6_PRINT(ip6bufs, &ip6->ip6_src), IN6_PRINT(ip6bufd, &ip6->ip6_dst), mb_copy->m_pkthdr.len); } #endif m_freem(mb_copy); } } splx(s); } static int register_send(struct ip6_hdr *ip6, struct mif6 *mif, struct mbuf *m) { struct mbuf *mm; int i, len = m->m_pkthdr.len; struct sockaddr_in6 sin6; struct mrt6msg *im6; #ifdef MRT6DEBUG if (mrt6debug) { char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "** IPv6 register_send **\n src %s dst %s\n", IN6_PRINT(ip6bufs, &ip6->ip6_src), IN6_PRINT(ip6bufd, &ip6->ip6_dst)); } #endif PIM6_STATINC(PIM6_STAT_SND_REGISTERS); /* Make a copy of the packet to send to the user level process */ MGETHDR(mm, M_DONTWAIT, MT_HEADER); if (mm == NULL) return ENOBUFS; mm->m_data += max_linkhdr; mm->m_len = sizeof(struct ip6_hdr); if ((mm->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) { m_freem(mm); return ENOBUFS; } i = MHLEN - M_LEADINGSPACE(mm); if (i > len) i = len; mm = m_pullup(mm, i); if (mm == NULL) return ENOBUFS; mm->m_pkthdr.len = len + sizeof(struct ip6_hdr); /* * Send message to routing daemon */ sockaddr_in6_init(&sin6, &ip6->ip6_src, 0, 0, 0); im6 = mtod(mm, struct mrt6msg *); im6->im6_msgtype = MRT6MSG_WHOLEPKT; im6->im6_mbz = 0; im6->im6_mif = mif - mif6table; /* iif info is not given for reg. encap.n */ mrt6stat.mrt6s_upcalls++; if (socket_send(ip6_mrouter, mm, &sin6) < 0) { #ifdef MRT6DEBUG if (mrt6debug) log(LOG_WARNING, "register_send: ip6_mrouter socket queue full\n"); #endif ++mrt6stat.mrt6s_upq_sockfull; return ENOBUFS; } return 0; } /* * PIM sparse mode hook. Receives the pim control messages, and passes them up * to the listening socket, using rip6_input. * * The only message processed is the REGISTER pim message; the pim header * is stripped off, and the inner packet is passed to register_mforward. 
*/ int pim6_input(struct mbuf **mp, int *offp, int proto) { struct pim *pim; struct ip6_hdr *ip6 __mrt6debugused; int pimlen; struct mbuf *m = *mp; int minlen; int off = *offp; PIM6_STATINC(PIM6_STAT_RCV_TOTAL); ip6 = mtod(m, struct ip6_hdr *); pimlen = m->m_pkthdr.len - off; /* * Validate lengths */ if (pimlen < PIM_MINLEN) { PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG,"pim6_input: PIM packet too short\n"); #endif m_freem(m); return IPPROTO_DONE; } /* * If the packet is at least as big as a REGISTER, go ahead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32 == 8 * PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40 */ minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN; /* * Make sure that the IP6 and PIM headers in contiguous memory, and * possibly the PIM REGISTER header */ IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen); if (pim == NULL) { PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT); return IPPROTO_DONE; } /* PIM version check */ if (pim->pim_ver != PIM_VERSION) { PIM6_STATINC(PIM6_STAT_RCV_BADVERSION); #ifdef MRT6DEBUG log(LOG_ERR, "pim6_input: incorrect version %d, expecting %d\n", pim->pim_ver, PIM_VERSION); #endif m_freem(m); return IPPROTO_DONE; } #define PIM6_CHECKSUM #ifdef PIM6_CHECKSUM { int cksumlen; /* * Validate checksum. * If PIM REGISTER, exclude the data packet */ if (pim->pim_type == PIM_REGISTER) cksumlen = PIM_MINLEN; else cksumlen = pimlen; if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) { PIM6_STATINC(PIM6_STAT_RCV_BADSUM); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: invalid checksum\n"); #endif m_freem(m); return IPPROTO_DONE; } } #endif /* PIM_CHECKSUM */ if (pim->pim_type == PIM_REGISTER) { /* * since this is a REGISTER, we'll make a copy of the register * headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the * routing daemon. 
*/ static const struct sockaddr_in6 dst = { .sin6_len = sizeof(dst), .sin6_family = AF_INET6, }; struct mbuf *mcp; struct ip6_hdr *eip6; u_int32_t *reghdr; PIM6_STATINC(PIM6_STAT_RCV_REGISTERS); if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: register mif not set: %d\n", reg_mif_num); #endif m_freem(m); return IPPROTO_DONE; } reghdr = (u_int32_t *)(pim + 1); if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim6_input_to_daemon; /* * Validate length */ if (pimlen < PIM6_REG_MINLEN) { #ifdef MRT6DEBUG char ip6buf[INET6_ADDRSTRLEN]; log(LOG_ERR, "pim6_input: register packet size too " "small %d from %s\n", pimlen, IN6_PRINT(ip6buf, &ip6->ip6_src)); #endif PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT); PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS); m_freem(m); return IPPROTO_DONE; } eip6 = (struct ip6_hdr *)(reghdr + 1); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "pim6_input[register], eip6: %s -> %s, " "eip6 plen %d\n", IN6_PRINT(ip6bufs, &eip6->ip6_src), IN6_PRINT(ip6bufd, &eip6->ip6_dst), ntohs(eip6->ip6_plen)); } #endif /* verify the version number of the inner packet */ if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS); #ifdef MRT6DEBUG log(LOG_DEBUG, "pim6_input: invalid IP version (%d) " "of the inner packet\n", (eip6->ip6_vfc & IPV6_VERSION)); #endif m_freem(m); return IPPROTO_DONE; } /* verify the inner packet is destined to a mcast group */ if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) { PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "pim6_input: inner packet of register " "is not multicast %s\n", IN6_PRINT(ip6buf, &eip6->ip6_dst)); } #endif m_freem(m); return IPPROTO_DONE; } /* * make a copy of the whole header to pass to the daemon later. */ mcp = m_copym(m, 0, off + PIM6_REG_MINLEN, M_DONTWAIT); if (mcp == NULL) { #ifdef MRT6DEBUG log(LOG_ERR, "pim6_input: pim register: " "could not copy register head\n"); #endif m_freem(m); return IPPROTO_DONE; } /* * forward the inner ip6 packet; point m_data at the inner ip6. */ m_adj(m, off + PIM_MINLEN); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "pim6_input: forwarding decapsulated register: " "src %s, dst %s, mif %d\n", IN6_PRINT(ip6bufs, &eip6->ip6_src), IN6_PRINT(ip6bufd, &eip6->ip6_dst), reg_mif_num); } #endif looutput(mif6table[reg_mif_num].m6_ifp, m, sin6tocsa(&dst), NULL); /* prepare the register head to send to the mrouting daemon */ m = mcp; } /* * Pass the PIM message up to the daemon; if it is a register message * pass the 'head' only up to the daemon. This includes the * encapsulator ip6 header, pim header, register header and the * encapsulated ip6 header. */ pim6_input_to_daemon: /* * Currently, rip6_input() is always called holding softnet_lock * by ipintr()(!NET_MPSAFE) or PR_INPUT_WRAP()(NET_MPSAFE). 
*/ KASSERT(mutex_owned(softnet_lock)); rip6_input(&m, offp, proto); return IPPROTO_DONE; } static int sysctl_net_inet6_pim6_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(pim6stat_percpu, PIM6_NSTATS)); } static void sysctl_net_inet6_pim6_setup(struct sysctllog **clog) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet6", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET6, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "pim6", SYSCTL_DESCR("PIMv6 settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_PIM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("PIMv6 statistics"), sysctl_net_inet6_pim6_stats, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_PIM, PIM6CTL_STATS, CTL_EOL); }
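The "stats" node registered above hands back the PIM6 counters as a flat array of uint64_t, one slot per PIM6_STAT_* index and PIM6_NSTATS slots in total (NETSTAT_SYSCTL sums the per-CPU counters on the way out, so the reader needs no locking). As a hedged illustration only, not part of the kernel source, a minimal userland reader might look like the sketch below; it assumes the PIM6CTL_STATS and PIM6_STAT_* definitions in <netinet6/pim6_var.h> are visible to userland, which is how tools such as netstat consume these counters.

#include <sys/param.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet6/pim6_var.h>
#include <inttypes.h>
#include <stdio.h>

/*
 * Illustrative sketch: read net.inet6.pim6.stats via the MIB path
 * { CTL_NET, PF_INET6, IPPROTO_PIM, PIM6CTL_STATS } set up above.
 */
int
main(void)
{
	uint64_t stats[PIM6_NSTATS];
	size_t len = sizeof(stats);
	int mib[4] = { CTL_NET, PF_INET6, IPPROTO_PIM, PIM6CTL_STATS };

	if (sysctl(mib, 4, stats, &len, NULL, 0) == -1) {
		perror("sysctl net.inet6.pim6.stats");
		return 1;
	}
	printf("total PIM6 messages received: %" PRIu64 "\n",
	    stats[PIM6_STAT_RCV_TOTAL]);
	printf("bad checksums:                %" PRIu64 "\n",
	    stats[PIM6_STAT_RCV_BADSUM]);
	return 0;
}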
/* $NetBSD: kern_descrip.c,v 1.262 2023/10/04 22:17:09 ad Exp $ */ /*- * Copyright (c) 2008, 2009, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95 */ /* * File descriptor management. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.262 2023/10/04 22:17:09 ad Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/pool.h> #include <sys/unistd.h> #include <sys/resourcevar.h> #include <sys/conf.h> #include <sys/event.h> #include <sys/kauth.h> #include <sys/atomic.h> #include <sys/syscallargs.h> #include <sys/cpu.h> #include <sys/kmem.h> #include <sys/vnode.h> #include <sys/sysctl.h> #include <sys/ktrace.h> /* * A list (head) of open files, counter, and lock protecting them. */ struct filelist filehead __cacheline_aligned; static u_int nfiles __cacheline_aligned; kmutex_t filelist_lock __cacheline_aligned; static pool_cache_t filedesc_cache __read_mostly; static pool_cache_t file_cache __read_mostly; static int file_ctor(void *, void *, int); static void file_dtor(void *, void *); static void fdfile_ctor(fdfile_t *); static void fdfile_dtor(fdfile_t *); static int filedesc_ctor(void *, void *, int); static void filedesc_dtor(void *, void *); static int filedescopen(dev_t, int, int, lwp_t *); static int sysctl_kern_file(SYSCTLFN_PROTO); static int sysctl_kern_file2(SYSCTLFN_PROTO); static void fill_file(struct file *, const struct file *); static void fill_file2(struct kinfo_file *, const file_t *, const fdfile_t *, int, pid_t); const struct cdevsw filedesc_cdevsw = { .d_open = filedescopen, .d_close = noclose, .d_read = noread, .d_write = nowrite, .d_ioctl = noioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE }; /* For ease of reading. */ __strong_alias(fd_putvnode,fd_putfile) __strong_alias(fd_putsock,fd_putfile) /* * Initialize the descriptor system. 
*/ void fd_sys_init(void) { static struct sysctllog *clog; mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&filehead); file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0, 0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL); KASSERT(file_cache != NULL); filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit, 0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor, NULL); KASSERT(filedesc_cache != NULL); sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "file", SYSCTL_DESCR("System open file table"), sysctl_kern_file, 0, NULL, 0, CTL_KERN, KERN_FILE, CTL_EOL); sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "file2", SYSCTL_DESCR("System open file table"), sysctl_kern_file2, 0, NULL, 0, CTL_KERN, KERN_FILE2, CTL_EOL); } static bool fd_isused(filedesc_t *fdp, unsigned fd) { u_int off = fd >> NDENTRYSHIFT; KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles); return (fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0; } /* * Verify that the bitmaps match the descriptor table. */ static inline void fd_checkmaps(filedesc_t *fdp) { #ifdef DEBUG fdtab_t *dt; u_int fd; KASSERT(fdp->fd_refcnt <= 1 || mutex_owned(&fdp->fd_lock)); dt = fdp->fd_dt; if (fdp->fd_refcnt == -1) { /* * fd_free tears down the table without maintaining its bitmap. */ return; } for (fd = 0; fd < dt->dt_nfiles; fd++) { if (fd < NDFDFILE) { KASSERT(dt->dt_ff[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]); } if (dt->dt_ff[fd] == NULL) { KASSERT(!fd_isused(fdp, fd)); } else if (dt->dt_ff[fd]->ff_file != NULL) { KASSERT(fd_isused(fdp, fd)); } } #endif } static int fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits) { int i, off, maxoff; uint32_t sub; KASSERT(mutex_owned(&fdp->fd_lock)); fd_checkmaps(fdp); if (want > bits) return -1; off = want >> NDENTRYSHIFT; i = want & NDENTRYMASK; if (i) { sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i)); if (sub != ~0) goto found; off++; } maxoff = NDLOSLOTS(bits); while (off < maxoff) { if ((sub = bitmap[off]) != ~0) goto found; off++; } return -1; found: return (off << NDENTRYSHIFT) + ffs(~sub) - 1; } static int fd_last_set(filedesc_t *fd, int last) { int off, i; fdfile_t **ff = fd->fd_dt->dt_ff; uint32_t *bitmap = fd->fd_lomap; KASSERT(mutex_owned(&fd->fd_lock)); fd_checkmaps(fd); off = (last - 1) >> NDENTRYSHIFT; while (off >= 0 && !bitmap[off]) off--; if (off < 0) return -1; i = ((off + 1) << NDENTRYSHIFT) - 1; if (i >= last) i = last - 1; /* XXX should use bitmap */ while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated)) i--; return i; } static inline void fd_used(filedesc_t *fdp, unsigned fd) { u_int off = fd >> NDENTRYSHIFT; fdfile_t *ff; ff = fdp->fd_dt->dt_ff[fd]; KASSERT(mutex_owned(&fdp->fd_lock)); KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) == 0); KASSERT(ff != NULL); KASSERT(ff->ff_file == NULL); KASSERT(!ff->ff_allocated); ff->ff_allocated = true; fdp->fd_lomap[off] |= 1U << (fd & NDENTRYMASK); if (__predict_false(fdp->fd_lomap[off] == ~0)) { KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] & (1U << (off & NDENTRYMASK))) == 0); fdp->fd_himap[off >> NDENTRYSHIFT] |= 1U << (off & NDENTRYMASK); } if ((int)fd > fdp->fd_lastfile) { fdp->fd_lastfile = fd; } fd_checkmaps(fdp); } static inline void fd_unused(filedesc_t *fdp, unsigned fd) { u_int off = fd >> NDENTRYSHIFT; fdfile_t *ff; ff = fdp->fd_dt->dt_ff[fd]; KASSERT(mutex_owned(&fdp->fd_lock)); KASSERT(ff != NULL); KASSERT(ff->ff_file == NULL); KASSERT(ff->ff_allocated); if (fd < fdp->fd_freefile) { 
fdp->fd_freefile = fd; } if (fdp->fd_lomap[off] == ~0) { KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] & (1U << (off & NDENTRYMASK))) != 0); fdp->fd_himap[off >> NDENTRYSHIFT] &= ~(1U << (off & NDENTRYMASK)); } KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0); fdp->fd_lomap[off] &= ~(1U << (fd & NDENTRYMASK)); ff->ff_allocated = false; KASSERT(fd <= fdp->fd_lastfile); if (fd == fdp->fd_lastfile) { fdp->fd_lastfile = fd_last_set(fdp, fd); } fd_checkmaps(fdp); } /* * Look up the file structure corresponding to a file descriptor * and return the file, holding a reference on the descriptor. */ file_t * fd_getfile(unsigned fd) { filedesc_t *fdp; fdfile_t *ff; file_t *fp; fdtab_t *dt; /* * Look up the fdfile structure representing this descriptor. * We are doing this unlocked. See fd_tryexpand(). */ fdp = curlwp->l_fd; dt = atomic_load_consume(&fdp->fd_dt); if (__predict_false(fd >= dt->dt_nfiles)) { return NULL; } ff = dt->dt_ff[fd]; KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); if (__predict_false(ff == NULL)) { return NULL; } /* Now get a reference to the descriptor. */ if (fdp->fd_refcnt == 1) { /* * Single threaded: don't need to worry about concurrent * access (other than earlier calls to kqueue, which may * hold a reference to the descriptor). */ ff->ff_refcnt++; } else { /* * Multi threaded: issue a memory barrier to ensure that we * acquire the file pointer _after_ adding a reference. If * no memory barrier, we could fetch a stale pointer. * * In particular, we must coordinate the following four * memory operations: * * A. fd_close store ff->ff_file = NULL * B. fd_close refcnt = atomic_dec_uint_nv(&ff->ff_refcnt) * C. fd_getfile atomic_inc_uint(&ff->ff_refcnt) * D. fd_getfile load fp = ff->ff_file * * If the order is D;A;B;C: * * 1. D: fp = ff->ff_file * 2. A: ff->ff_file = NULL * 3. B: refcnt = atomic_dec_uint_nv(&ff->ff_refcnt) * 4. C: atomic_inc_uint(&ff->ff_refcnt) * * then fd_close determines that there are no more * references and decides to free fp immediately, at * the same that fd_getfile ends up with an fp that's * about to be freed. *boom* * * By making B a release operation in fd_close, and by * making C an acquire operation in fd_getfile, since * they are atomic operations on the same object, which * has a total modification order, we guarantee either: * * - B happens before C. Then since A is * sequenced before B in fd_close, and C is * sequenced before D in fd_getfile, we * guarantee A happens before D, so fd_getfile * reads a null fp and safely fails. * * - C happens before B. Then fd_getfile may read * null or nonnull, but either way, fd_close * will safely wait for references to drain. */ atomic_inc_uint(&ff->ff_refcnt); membar_acquire(); } /* * If the file is not open or is being closed then put the * reference back. */ fp = atomic_load_consume(&ff->ff_file); if (__predict_true(fp != NULL)) { return fp; } fd_putfile(fd); return NULL; } /* * Release a reference to a file descriptor acquired with fd_getfile(). */ void fd_putfile(unsigned fd) { filedesc_t *fdp; fdfile_t *ff; u_int u, v; fdp = curlwp->l_fd; KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles); ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd]; KASSERT(ff != NULL); KASSERT((ff->ff_refcnt & FR_MASK) > 0); KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); if (fdp->fd_refcnt == 1) { /* * Single threaded: don't need to worry about concurrent * access (other than earlier calls to kqueue, which may * hold a reference to the descriptor). 
*/ if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) { fd_close(fd); return; } ff->ff_refcnt--; return; } /* * Ensure that any use of the file is complete and globally * visible before dropping the final reference. If no membar, * the current CPU could still access memory associated with * the file after it has been freed or recycled by another * CPU. */ membar_release(); /* * Be optimistic and start out with the assumption that no other * threads are trying to close the descriptor. If the CAS fails, * we lost a race and/or it's being closed. */ for (u = ff->ff_refcnt & FR_MASK;; u = v) { v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1); if (__predict_true(u == v)) { return; } if (__predict_false((v & FR_CLOSING) != 0)) { break; } } /* Another thread is waiting to close the file: join it. */ (void)fd_close(fd); } /* * Convenience wrapper around fd_getfile() that returns reference * to a vnode. */ int fd_getvnode(unsigned fd, file_t **fpp) { vnode_t *vp; file_t *fp; fp = fd_getfile(fd); if (__predict_false(fp == NULL)) { return EBADF; } if (__predict_false(fp->f_type != DTYPE_VNODE)) { fd_putfile(fd); return EINVAL; } vp = fp->f_vnode; if (__predict_false(vp->v_type == VBAD)) { /* XXX Is this case really necessary? */ fd_putfile(fd); return EBADF; } *fpp = fp; return 0; } /* * Convenience wrapper around fd_getfile() that returns reference * to a socket. */ int fd_getsock1(unsigned fd, struct socket **sop, file_t **fp) { *fp = fd_getfile(fd); if (__predict_false(*fp == NULL)) { return EBADF; } if (__predict_false((*fp)->f_type != DTYPE_SOCKET)) { fd_putfile(fd); return ENOTSOCK; } *sop = (*fp)->f_socket; return 0; } int fd_getsock(unsigned fd, struct socket **sop) { file_t *fp; return fd_getsock1(fd, sop, &fp); } /* * Look up the file structure corresponding to a file descriptor * and return it with a reference held on the file, not the * descriptor. * * This is heavyweight and only used when accessing descriptors * from a foreign process. The caller must ensure that `p' does * not exit or fork across this call. * * To release the file (not descriptor) reference, use closef(). */ file_t * fd_getfile2(proc_t *p, unsigned fd) { filedesc_t *fdp; fdfile_t *ff; file_t *fp; fdtab_t *dt; fdp = p->p_fd; mutex_enter(&fdp->fd_lock); dt = fdp->fd_dt; if (fd >= dt->dt_nfiles) { mutex_exit(&fdp->fd_lock); return NULL; } if ((ff = dt->dt_ff[fd]) == NULL) { mutex_exit(&fdp->fd_lock); return NULL; } if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) { mutex_exit(&fdp->fd_lock); return NULL; } mutex_enter(&fp->f_lock); fp->f_count++; mutex_exit(&fp->f_lock); mutex_exit(&fdp->fd_lock); return fp; } /* * Internal form of close. Must be called with a reference to the * descriptor, and will drop the reference. When all descriptor * references are dropped, releases the descriptor slot and a single * reference to the file structure. */ int fd_close(unsigned fd) { struct flock lf; filedesc_t *fdp; fdfile_t *ff; file_t *fp; proc_t *p; lwp_t *l; u_int refcnt; l = curlwp; p = l->l_proc; fdp = l->l_fd; ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd]; KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); mutex_enter(&fdp->fd_lock); KASSERT((ff->ff_refcnt & FR_MASK) > 0); fp = atomic_load_consume(&ff->ff_file); if (__predict_false(fp == NULL)) { /* * Another user of the file is already closing, and is * waiting for other users of the file to drain. Release * our reference, and wake up the closer. 
*/ membar_release(); atomic_dec_uint(&ff->ff_refcnt); cv_broadcast(&ff->ff_closing); mutex_exit(&fdp->fd_lock); /* * An application error, so pretend that the descriptor * was already closed. We can't safely wait for it to * be closed without potentially deadlocking. */ return (EBADF); } KASSERT((ff->ff_refcnt & FR_CLOSING) == 0); /* * There may be multiple users of this file within the process. * Notify existing and new users that the file is closing. This * will prevent them from adding additional uses to this file * while we are closing it. */ atomic_store_relaxed(&ff->ff_file, NULL); ff->ff_exclose = false; /* * We expect the caller to hold a descriptor reference - drop it. * The reference count may increase beyond zero at this point due * to an erroneous descriptor reference by an application, but * fd_getfile() will notice that the file is being closed and drop * the reference again. */ if (fdp->fd_refcnt == 1) { /* Single threaded. */ refcnt = --(ff->ff_refcnt); } else { /* Multi threaded. */ membar_release(); refcnt = atomic_dec_uint_nv(&ff->ff_refcnt); membar_acquire(); } if (__predict_false(refcnt != 0)) { /* * Wait for other references to drain. This is typically * an application error - the descriptor is being closed * while still in use. * (Or just a threaded application trying to unblock its * thread that sleeps in (say) accept()). */ atomic_or_uint(&ff->ff_refcnt, FR_CLOSING); /* * Remove any knotes attached to the file. A knote * attached to the descriptor can hold references on it. */ mutex_exit(&fdp->fd_lock); if (!SLIST_EMPTY(&ff->ff_knlist)) { knote_fdclose(fd); } /* * Since the file system code doesn't know which fd * each request came from (think dup()), we have to * ask it to return ERESTART for any long-term blocks. * The re-entry through read/write/etc will detect the * closed fd and return EBAFD. * Blocked partial writes may return a short length. */ (*fp->f_ops->fo_restart)(fp); mutex_enter(&fdp->fd_lock); /* * We need to see the count drop to zero at least once, * in order to ensure that all pre-existing references * have been drained. New references past this point are * of no interest. * XXX (dsl) this may need to call fo_restart() after a * timeout to guarantee that all the system calls exit. */ while ((ff->ff_refcnt & FR_MASK) != 0) { cv_wait(&ff->ff_closing, &fdp->fd_lock); } atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING); } else { /* If no references, there must be no knotes. */ KASSERT(SLIST_EMPTY(&ff->ff_knlist)); } /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor. */ if (__predict_false((p->p_flag & PK_ADVLOCK) != 0) && fp->f_ops->fo_advlock != NULL) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; mutex_exit(&fdp->fd_lock); (void)(*fp->f_ops->fo_advlock)(fp, p, F_UNLCK, &lf, F_POSIX); mutex_enter(&fdp->fd_lock); } /* Free descriptor slot. */ fd_unused(fdp, fd); mutex_exit(&fdp->fd_lock); /* Now drop reference to the file itself. */ return closef(fp); } /* * Duplicate a file descriptor. 
*/ int fd_dup(file_t *fp, int minfd, int *newp, bool exclose) { proc_t *p = curproc; fdtab_t *dt; int error; while ((error = fd_alloc(p, minfd, newp)) != 0) { if (error != ENOSPC) { return error; } fd_tryexpand(p); } dt = atomic_load_consume(&curlwp->l_fd->fd_dt); dt->dt_ff[*newp]->ff_exclose = exclose; fd_affix(p, fp, *newp); return 0; } /* * dup2 operation. */ int fd_dup2(file_t *fp, unsigned newfd, int flags) { filedesc_t *fdp = curlwp->l_fd; fdfile_t *ff; fdtab_t *dt; if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE)) return EINVAL; /* * Ensure there are enough slots in the descriptor table, * and allocate an fdfile_t up front in case we need it. */ while (newfd >= atomic_load_consume(&fdp->fd_dt)->dt_nfiles) { fd_tryexpand(curproc); } ff = kmem_alloc(sizeof(*ff), KM_SLEEP); fdfile_ctor(ff); /* * If there is already a file open, close it. If the file is * half open, wait for it to be constructed before closing it. * XXX Potential for deadlock here? */ mutex_enter(&fdp->fd_lock); while (fd_isused(fdp, newfd)) { mutex_exit(&fdp->fd_lock); if (fd_getfile(newfd) != NULL) { (void)fd_close(newfd); } else { /* * Crummy, but unlikely to happen. * Can occur if we interrupt another * thread while it is opening a file. */ kpause("dup2", false, 1, NULL); } mutex_enter(&fdp->fd_lock); } dt = fdp->fd_dt; if (dt->dt_ff[newfd] == NULL) { KASSERT(newfd >= NDFDFILE); dt->dt_ff[newfd] = ff; ff = NULL; } fd_used(fdp, newfd); mutex_exit(&fdp->fd_lock); dt->dt_ff[newfd]->ff_exclose = (flags & O_CLOEXEC) != 0; fp->f_flag |= flags & (FNONBLOCK|FNOSIGPIPE); /* Slot is now allocated. Insert copy of the file. */ fd_affix(curproc, fp, newfd); if (ff != NULL) { cv_destroy(&ff->ff_closing); kmem_free(ff, sizeof(*ff)); } return 0; } /* * Drop reference to a file structure. */ int closef(file_t *fp) { struct flock lf; int error; /* * Drop reference. If referenced elsewhere it's still open * and we have nothing more to do. */ mutex_enter(&fp->f_lock); KASSERT(fp->f_count > 0); if (--fp->f_count > 0) { mutex_exit(&fp->f_lock); return 0; } KASSERT(fp->f_count == 0); mutex_exit(&fp->f_lock); /* We held the last reference - release locks, close and free. */ if (fp->f_ops->fo_advlock == NULL) { KASSERT((fp->f_flag & FHASLOCK) == 0); } else if (fp->f_flag & FHASLOCK) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void)(*fp->f_ops->fo_advlock)(fp, fp, F_UNLCK, &lf, F_FLOCK); } if (fp->f_ops != NULL) { error = (*fp->f_ops->fo_close)(fp); } else { error = 0; } KASSERT(fp->f_count == 0); KASSERT(fp->f_cred != NULL); pool_cache_put(file_cache, fp); return error; } /* * Allocate a file descriptor for the process. * * Future idea for experimentation: replace all of this with radixtree. */ int fd_alloc(proc_t *p, int want, int *result) { filedesc_t *fdp = p->p_fd; int i, lim, last, error, hi; u_int off; fdtab_t *dt; KASSERT(p == curproc || p == &proc0); /* * Search for a free descriptor starting at the higher * of want or fd_freefile. */ mutex_enter(&fdp->fd_lock); fd_checkmaps(fdp); dt = fdp->fd_dt; KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]); lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); last = uimin(dt->dt_nfiles, lim); for (;;) { if ((i = want) < fdp->fd_freefile) i = fdp->fd_freefile; off = i >> NDENTRYSHIFT; hi = fd_next_zero(fdp, fdp->fd_himap, off, (last + NDENTRIES - 1) >> NDENTRYSHIFT); if (hi == -1) break; i = fd_next_zero(fdp, &fdp->fd_lomap[hi], hi > off ? 
0 : i & NDENTRYMASK, NDENTRIES); if (i == -1) { /* * Free file descriptor in this block was * below want, try again with higher want. */ want = (hi + 1) << NDENTRYSHIFT; continue; } i += (hi << NDENTRYSHIFT); if (i >= last) { break; } if (dt->dt_ff[i] == NULL) { KASSERT(i >= NDFDFILE); dt->dt_ff[i] = kmem_alloc(sizeof(fdfile_t), KM_SLEEP); fdfile_ctor(dt->dt_ff[i]); } KASSERT(dt->dt_ff[i]->ff_file == NULL); fd_used(fdp, i); if (want <= fdp->fd_freefile) { fdp->fd_freefile = i; } *result = i; KASSERT(i >= NDFDFILE || dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]); fd_checkmaps(fdp); mutex_exit(&fdp->fd_lock); return 0; } /* No space in current array. Let the caller expand and retry. */ error = (dt->dt_nfiles >= lim) ? EMFILE : ENOSPC; mutex_exit(&fdp->fd_lock); return error; } /* * Allocate memory for a descriptor table. */ static fdtab_t * fd_dtab_alloc(int n) { fdtab_t *dt; size_t sz; KASSERT(n > NDFILE); sz = sizeof(*dt) + (n - NDFILE) * sizeof(dt->dt_ff[0]); dt = kmem_alloc(sz, KM_SLEEP); #ifdef DIAGNOSTIC memset(dt, 0xff, sz); #endif dt->dt_nfiles = n; dt->dt_link = NULL; return dt; } /* * Free a descriptor table, and all tables linked for deferred free. */ static void fd_dtab_free(fdtab_t *dt) { fdtab_t *next; size_t sz; do { next = dt->dt_link; KASSERT(dt->dt_nfiles > NDFILE); sz = sizeof(*dt) + (dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]); #ifdef DIAGNOSTIC memset(dt, 0xff, sz); #endif kmem_free(dt, sz); dt = next; } while (dt != NULL); } /* * Allocate descriptor bitmap. */ static void fd_map_alloc(int n, uint32_t **lo, uint32_t **hi) { uint8_t *ptr; size_t szlo, szhi; KASSERT(n > NDENTRIES); szlo = NDLOSLOTS(n) * sizeof(uint32_t); szhi = NDHISLOTS(n) * sizeof(uint32_t); ptr = kmem_alloc(szlo + szhi, KM_SLEEP); *lo = (uint32_t *)ptr; *hi = (uint32_t *)(ptr + szlo); } /* * Free descriptor bitmap. */ static void fd_map_free(int n, uint32_t *lo, uint32_t *hi) { size_t szlo, szhi; KASSERT(n > NDENTRIES); szlo = NDLOSLOTS(n) * sizeof(uint32_t); szhi = NDHISLOTS(n) * sizeof(uint32_t); KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo)); kmem_free(lo, szlo + szhi); } /* * Expand a process' descriptor table. */ void fd_tryexpand(proc_t *p) { filedesc_t *fdp; int i, numfiles, oldnfiles; fdtab_t *newdt, *dt; uint32_t *newhimap, *newlomap; KASSERT(p == curproc || p == &proc0); fdp = p->p_fd; newhimap = NULL; newlomap = NULL; oldnfiles = atomic_load_consume(&fdp->fd_dt)->dt_nfiles; if (oldnfiles < NDEXTENT) numfiles = NDEXTENT; else numfiles = 2 * oldnfiles; newdt = fd_dtab_alloc(numfiles); if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) { fd_map_alloc(numfiles, &newlomap, &newhimap); } mutex_enter(&fdp->fd_lock); dt = fdp->fd_dt; KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]); if (dt->dt_nfiles != oldnfiles) { /* fdp changed; caller must retry */ mutex_exit(&fdp->fd_lock); fd_dtab_free(newdt); if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) { fd_map_free(numfiles, newlomap, newhimap); } return; } /* Copy the existing descriptor table and zero the new portion. */ i = sizeof(fdfile_t *) * oldnfiles; memcpy(newdt->dt_ff, dt->dt_ff, i); memset((uint8_t *)newdt->dt_ff + i, 0, numfiles * sizeof(fdfile_t *) - i); /* * Link old descriptor array into list to be discarded. We defer * freeing until the last reference to the descriptor table goes * away (usually process exit). This allows us to do lockless * lookups in fd_getfile(). 
*/ if (oldnfiles > NDFILE) { if (fdp->fd_refcnt > 1) { newdt->dt_link = dt; } else { fd_dtab_free(dt); } } if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) { i = NDHISLOTS(oldnfiles) * sizeof(uint32_t); memcpy(newhimap, fdp->fd_himap, i); memset((uint8_t *)newhimap + i, 0, NDHISLOTS(numfiles) * sizeof(uint32_t) - i); i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t); memcpy(newlomap, fdp->fd_lomap, i); memset((uint8_t *)newlomap + i, 0, NDLOSLOTS(numfiles) * sizeof(uint32_t) - i); if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) { fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap); } fdp->fd_himap = newhimap; fdp->fd_lomap = newlomap; } /* * All other modifications must become globally visible before * the change to fd_dt. See fd_getfile(). */ atomic_store_release(&fdp->fd_dt, newdt); KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]); fd_checkmaps(fdp); mutex_exit(&fdp->fd_lock); } /* * Create a new open file structure and allocate a file descriptor * for the current process. */ int fd_allocfile(file_t **resultfp, int *resultfd) { proc_t *p = curproc; kauth_cred_t cred; file_t *fp; int error; while ((error = fd_alloc(p, 0, resultfd)) != 0) { if (error != ENOSPC) { return error; } fd_tryexpand(p); } fp = pool_cache_get(file_cache, PR_WAITOK); if (fp == NULL) { fd_abort(p, NULL, *resultfd); return ENFILE; } KASSERT(fp->f_count == 0); KASSERT(fp->f_msgcount == 0); KASSERT(fp->f_unpcount == 0); /* Replace cached credentials if not what we need. */ cred = curlwp->l_cred; if (__predict_false(cred != fp->f_cred)) { kauth_cred_free(fp->f_cred); fp->f_cred = kauth_cred_hold(cred); } /* * Don't allow recycled files to be scanned. * See uipc_usrreq.c. */ if (__predict_false((fp->f_flag & FSCAN) != 0)) { mutex_enter(&fp->f_lock); atomic_and_uint(&fp->f_flag, ~FSCAN); mutex_exit(&fp->f_lock); } fp->f_advice = 0; fp->f_offset = 0; *resultfp = fp; return 0; } /* * Successful creation of a new descriptor: make visible to the process. */ void fd_affix(proc_t *p, file_t *fp, unsigned fd) { fdfile_t *ff; filedesc_t *fdp; fdtab_t *dt; KASSERT(p == curproc || p == &proc0); /* Add a reference to the file structure. */ mutex_enter(&fp->f_lock); fp->f_count++; mutex_exit(&fp->f_lock); /* * Insert the new file into the descriptor slot. */ fdp = p->p_fd; dt = atomic_load_consume(&fdp->fd_dt); ff = dt->dt_ff[fd]; KASSERT(ff != NULL); KASSERT(ff->ff_file == NULL); KASSERT(ff->ff_allocated); KASSERT(fd_isused(fdp, fd)); KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); /* No need to lock in order to make file initially visible. */ atomic_store_release(&ff->ff_file, fp); } /* * Abort creation of a new descriptor: free descriptor slot and file. */ void fd_abort(proc_t *p, file_t *fp, unsigned fd) { filedesc_t *fdp; fdfile_t *ff; KASSERT(p == curproc || p == &proc0); fdp = p->p_fd; ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd]; ff->ff_exclose = false; KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); mutex_enter(&fdp->fd_lock); KASSERT(fd_isused(fdp, fd)); fd_unused(fdp, fd); mutex_exit(&fdp->fd_lock); if (fp != NULL) { KASSERT(fp->f_count == 0); KASSERT(fp->f_cred != NULL); pool_cache_put(file_cache, fp); } } static int file_ctor(void *arg, void *obj, int flags) { /* * It's easy to exhaust the open file limit on a system with many * CPUs due to caching. Allow a bit of leeway to reduce the element * of surprise. 
*/ u_int slop = PCG_NOBJECTS_NORMAL * (ncpu - 1); file_t *fp = obj; memset(fp, 0, sizeof(*fp)); mutex_enter(&filelist_lock); if (__predict_false(nfiles >= slop + maxfiles)) { mutex_exit(&filelist_lock); tablefull("file", "increase kern.maxfiles or MAXFILES"); return ENFILE; } nfiles++; LIST_INSERT_HEAD(&filehead, fp, f_list); mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE); fp->f_cred = kauth_cred_hold(curlwp->l_cred); mutex_exit(&filelist_lock); return 0; } static void file_dtor(void *arg, void *obj) { file_t *fp = obj; mutex_enter(&filelist_lock); nfiles--; LIST_REMOVE(fp, f_list); mutex_exit(&filelist_lock); KASSERT(fp->f_count == 0); kauth_cred_free(fp->f_cred); mutex_destroy(&fp->f_lock); } static void fdfile_ctor(fdfile_t *ff) { memset(ff, 0, sizeof(*ff)); cv_init(&ff->ff_closing, "fdclose"); } static void fdfile_dtor(fdfile_t *ff) { cv_destroy(&ff->ff_closing); } file_t * fgetdummy(void) { file_t *fp; fp = kmem_zalloc(sizeof(*fp), KM_SLEEP); mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE); return fp; } void fputdummy(file_t *fp) { mutex_destroy(&fp->f_lock); kmem_free(fp, sizeof(*fp)); } /* * Create an initial filedesc structure. */ filedesc_t * fd_init(filedesc_t *fdp) { #ifdef DIAGNOSTIC unsigned fd; #endif if (__predict_true(fdp == NULL)) { fdp = pool_cache_get(filedesc_cache, PR_WAITOK); } else { KASSERT(fdp == &filedesc0); filedesc_ctor(NULL, fdp, PR_WAITOK); } #ifdef DIAGNOSTIC KASSERT(fdp->fd_lastfile == -1); KASSERT(fdp->fd_lastkqfile == -1); KASSERT(fdp->fd_knhash == NULL); KASSERT(fdp->fd_freefile == 0); KASSERT(fdp->fd_exclose == false); KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin); KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE); for (fd = 0; fd < NDFDFILE; fd++) { KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]); } for (fd = NDFDFILE; fd < NDFILE; fd++) { KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == NULL); } KASSERT(fdp->fd_himap == fdp->fd_dhimap); KASSERT(fdp->fd_lomap == fdp->fd_dlomap); #endif /* DIAGNOSTIC */ fdp->fd_refcnt = 1; fd_checkmaps(fdp); return fdp; } /* * Initialize a file descriptor table. */ static int filedesc_ctor(void *arg, void *obj, int flag) { filedesc_t *fdp = obj; fdfile_t **ffp; int i; memset(fdp, 0, sizeof(*fdp)); mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE); fdp->fd_lastfile = -1; fdp->fd_lastkqfile = -1; fdp->fd_dt = &fdp->fd_dtbuiltin; fdp->fd_dtbuiltin.dt_nfiles = NDFILE; fdp->fd_himap = fdp->fd_dhimap; fdp->fd_lomap = fdp->fd_dlomap; CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t)); for (i = 0, ffp = fdp->fd_dt->dt_ff; i < NDFDFILE; i++, ffp++) { fdfile_ctor(*ffp = (fdfile_t *)fdp->fd_dfdfile[i]); } return 0; } static void filedesc_dtor(void *arg, void *obj) { filedesc_t *fdp = obj; int i; for (i = 0; i < NDFDFILE; i++) { fdfile_dtor((fdfile_t *)fdp->fd_dfdfile[i]); } mutex_destroy(&fdp->fd_lock); } /* * Make p share curproc's filedesc structure. */ void fd_share(struct proc *p) { filedesc_t *fdp; fdp = curlwp->l_fd; p->p_fd = fdp; atomic_inc_uint(&fdp->fd_refcnt); } /* * Acquire a hold on a filedesc structure. */ void fd_hold(lwp_t *l) { filedesc_t *fdp = l->l_fd; atomic_inc_uint(&fdp->fd_refcnt); } /* * Copy a filedesc structure. 
*/ filedesc_t * fd_copy(void) { filedesc_t *newfdp, *fdp; fdfile_t *ff, **ffp, **nffp, *ff2; int i, j, numfiles, lastfile, newlast; file_t *fp; fdtab_t *newdt; fdp = curproc->p_fd; newfdp = pool_cache_get(filedesc_cache, PR_WAITOK); newfdp->fd_refcnt = 1; #ifdef DIAGNOSTIC KASSERT(newfdp->fd_lastfile == -1); KASSERT(newfdp->fd_lastkqfile == -1); KASSERT(newfdp->fd_knhash == NULL); KASSERT(newfdp->fd_freefile == 0); KASSERT(newfdp->fd_exclose == false); KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin); KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE); for (i = 0; i < NDFDFILE; i++) { KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == (fdfile_t *)&newfdp->fd_dfdfile[i]); } for (i = NDFDFILE; i < NDFILE; i++) { KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL); } #endif /* DIAGNOSTIC */ mutex_enter(&fdp->fd_lock); fd_checkmaps(fdp); numfiles = fdp->fd_dt->dt_nfiles; lastfile = fdp->fd_lastfile; /* * If the number of open files fits in the internal arrays * of the open file structure, use them, otherwise allocate * additional memory for the number of descriptors currently * in use. */ if (lastfile < NDFILE) { i = NDFILE; newdt = newfdp->fd_dt; KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin); } else { /* * Compute the smallest multiple of NDEXTENT needed * for the file descriptors currently in use, * allowing the table to shrink. */ i = numfiles; while (i >= 2 * NDEXTENT && i > lastfile * 2) { i /= 2; } KASSERT(i > NDFILE); newdt = fd_dtab_alloc(i); newfdp->fd_dt = newdt; memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff, NDFDFILE * sizeof(fdfile_t **)); memset(newdt->dt_ff + NDFDFILE, 0, (i - NDFDFILE) * sizeof(fdfile_t **)); } if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) { newfdp->fd_himap = newfdp->fd_dhimap; newfdp->fd_lomap = newfdp->fd_dlomap; } else { fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap); KASSERT(i >= NDENTRIES * NDENTRIES); memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t)); memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t)); } newfdp->fd_freefile = fdp->fd_freefile; newfdp->fd_exclose = fdp->fd_exclose; ffp = fdp->fd_dt->dt_ff; nffp = newdt->dt_ff; newlast = -1; for (i = 0; i <= lastfile; i++, ffp++, nffp++) { KASSERT(i >= NDFDFILE || *nffp == (fdfile_t *)newfdp->fd_dfdfile[i]); ff = *ffp; if (ff == NULL || (fp = atomic_load_consume(&ff->ff_file)) == NULL) { /* Descriptor unused, or descriptor half open. */ KASSERT(!fd_isused(newfdp, i)); continue; } if (__predict_false(fp->f_type == DTYPE_KQUEUE)) { /* kqueue descriptors cannot be copied. */ if (i < newfdp->fd_freefile) { newfdp->fd_freefile = i; } continue; } /* It's active: add a reference to the file. */ mutex_enter(&fp->f_lock); fp->f_count++; mutex_exit(&fp->f_lock); /* Allocate an fdfile_t to represent it. */ if (i >= NDFDFILE) { ff2 = kmem_alloc(sizeof(*ff2), KM_SLEEP); fdfile_ctor(ff2); *nffp = ff2; } else { ff2 = newdt->dt_ff[i]; } ff2->ff_file = fp; ff2->ff_exclose = ff->ff_exclose; ff2->ff_allocated = true; /* Fix up bitmaps. */ j = i >> NDENTRYSHIFT; KASSERT((newfdp->fd_lomap[j] & (1U << (i & NDENTRYMASK))) == 0); newfdp->fd_lomap[j] |= 1U << (i & NDENTRYMASK); if (__predict_false(newfdp->fd_lomap[j] == ~0)) { KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] & (1U << (j & NDENTRYMASK))) == 0); newfdp->fd_himap[j >> NDENTRYSHIFT] |= 1U << (j & NDENTRYMASK); } newlast = i; } KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]); newfdp->fd_lastfile = newlast; fd_checkmaps(newfdp); mutex_exit(&fdp->fd_lock); return newfdp; } /* * Release a filedesc structure. 
*/ void fd_free(void) { fdfile_t *ff; file_t *fp; int fd, nf; fdtab_t *dt; lwp_t * const l = curlwp; filedesc_t * const fdp = l->l_fd; const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0; KASSERT(atomic_load_consume(&fdp->fd_dt)->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]); KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE); KASSERT(fdp->fd_dtbuiltin.dt_link == NULL); membar_release(); if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0) return; membar_acquire(); /* * Close any files that the process holds open. */ dt = fdp->fd_dt; fd_checkmaps(fdp); #ifdef DEBUG fdp->fd_refcnt = -1; /* see fd_checkmaps */ #endif for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) { ff = dt->dt_ff[fd]; KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); if (ff == NULL) continue; if ((fp = atomic_load_consume(&ff->ff_file)) != NULL) { /* * Must use fd_close() here if there is * a reference from kqueue or we might have posix * advisory locks. */ if (__predict_true(ff->ff_refcnt == 0) && (noadvlock || fp->f_type != DTYPE_VNODE)) { ff->ff_file = NULL; ff->ff_exclose = false; ff->ff_allocated = false; closef(fp); } else { ff->ff_refcnt++; fd_close(fd); } } KASSERT(ff->ff_refcnt == 0); KASSERT(ff->ff_file == NULL); KASSERT(!ff->ff_exclose); KASSERT(!ff->ff_allocated); if (fd >= NDFDFILE) { cv_destroy(&ff->ff_closing); kmem_free(ff, sizeof(*ff)); dt->dt_ff[fd] = NULL; } } /* * Clean out the descriptor table for the next user and return * to the cache. */ if (__predict_false(dt != &fdp->fd_dtbuiltin)) { fd_dtab_free(fdp->fd_dt); /* Otherwise, done above. */ memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0, (NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0])); fdp->fd_dt = &fdp->fd_dtbuiltin; } if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) { KASSERT(fdp->fd_himap != fdp->fd_dhimap); KASSERT(fdp->fd_lomap != fdp->fd_dlomap); fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap); } if (__predict_false(fdp->fd_knhash != NULL)) { hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask); fdp->fd_knhash = NULL; fdp->fd_knhashmask = 0; } else { KASSERT(fdp->fd_knhashmask == 0); } fdp->fd_dt = &fdp->fd_dtbuiltin; fdp->fd_lastkqfile = -1; fdp->fd_lastfile = -1; fdp->fd_freefile = 0; fdp->fd_exclose = false; memset(&fdp->fd_startzero, 0, sizeof(*fdp) - offsetof(filedesc_t, fd_startzero)); fdp->fd_himap = fdp->fd_dhimap; fdp->fd_lomap = fdp->fd_dlomap; KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE); KASSERT(fdp->fd_dtbuiltin.dt_link == NULL); KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin); #ifdef DEBUG fdp->fd_refcnt = 0; /* see fd_checkmaps */ #endif fd_checkmaps(fdp); pool_cache_put(filedesc_cache, fdp); } /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. */ static int filedescopen(dev_t dev, int mode, int type, lwp_t *l) { /* * XXX Kludge: set dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in fd_dupopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ l->l_dupfd = minor(dev); /* XXX */ return EDUPFD; } /* * Duplicate the specified descriptor to a free descriptor. * * old is the original fd. 
* moveit is true if we should move rather than duplicate. * flags are the open flags (converted from O_* to F*). * newp returns the new fd on success. * * These two cases are produced by the EDUPFD and EMOVEFD magic * errnos, but in the interest of removing that regrettable interface, * vn_open has been changed to intercept them. Now vn_open returns * either a vnode or a filehandle, and the filehandle is accompanied * by a boolean that says whether we should dup (moveit == false) or * move (moveit == true) the fd. * * The dup case is used by /dev/stderr, /proc/self/fd, and such. The * move case is used by cloner devices that allocate a fd of their * own (a layering violation that should go away eventually) that * then needs to be put in the place open() expects it. */ int fd_dupopen(int old, bool moveit, int flags, int *newp) { filedesc_t *fdp; fdfile_t *ff; file_t *fp; fdtab_t *dt; int error; if ((fp = fd_getfile(old)) == NULL) { return EBADF; } fdp = curlwp->l_fd; dt = atomic_load_consume(&fdp->fd_dt); ff = dt->dt_ff[old]; /* * There are two cases of interest here. * * 1. moveit == false (used to be the EDUPFD magic errno): * simply dup (old) to file descriptor (new) and return. * * 2. moveit == true (used to be the EMOVEFD magic errno): * steal away the file structure from (old) and store it in * (new). (old) is effectively closed by this operation. */ if (moveit == false) { /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ if (((flags & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { error = EACCES; goto out; } /* Copy it. */ error = fd_dup(fp, 0, newp, ff->ff_exclose); } else { /* Copy it. */ error = fd_dup(fp, 0, newp, ff->ff_exclose); if (error != 0) { goto out; } /* Steal away the file pointer from 'old'. */ (void)fd_close(old); return 0; } out: fd_putfile(old); return error; } /* * Close open files on exec. */ void fd_closeexec(void) { proc_t *p; filedesc_t *fdp; fdfile_t *ff; lwp_t *l; fdtab_t *dt; int fd; l = curlwp; p = l->l_proc; fdp = p->p_fd; if (fdp->fd_refcnt > 1) { fdp = fd_copy(); fd_free(); p->p_fd = fdp; l->l_fd = fdp; } if (!fdp->fd_exclose) { return; } fdp->fd_exclose = false; dt = atomic_load_consume(&fdp->fd_dt); for (fd = 0; fd <= fdp->fd_lastfile; fd++) { if ((ff = dt->dt_ff[fd]) == NULL) { KASSERT(fd >= NDFDFILE); continue; } KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); if (ff->ff_file == NULL) continue; if (ff->ff_exclose) { /* * We need a reference to close the file. * No other threads can see the fdfile_t at * this point, so don't bother locking. */ KASSERT((ff->ff_refcnt & FR_CLOSING) == 0); ff->ff_refcnt++; fd_close(fd); } } } /* * Sets descriptor owner. If the owner is a process, 'pgid' * is set to positive value, process ID. If the owner is process group, * 'pgid' is set to -pg_id. */ int fsetown(pid_t *pgid, u_long cmd, const void *data) { pid_t id = *(const pid_t *)data; int error; if (id == INT_MIN) return EINVAL; switch (cmd) { case TIOCSPGRP: if (id < 0) return EINVAL; id = -id; break; default: break; } if (id > 0) { mutex_enter(&proc_lock); error = proc_find(id) ? 0 : ESRCH; mutex_exit(&proc_lock); } else if (id < 0) { error = pgid_in_session(curproc, -id); } else { error = 0; } if (!error) { *pgid = id; } return error; } void fd_set_exclose(struct lwp *l, int fd, bool exclose) { filedesc_t *fdp = l->l_fd; fdfile_t *ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd]; ff->ff_exclose = exclose; if (exclose) fdp->fd_exclose = true; } /* * Return descriptor owner information. 
If the value is positive, * it's process ID. If it's negative, it's process group ID and * needs the sign removed before use. */ int fgetown(pid_t pgid, u_long cmd, void *data) { switch (cmd) { case TIOCGPGRP: *(int *)data = -pgid; break; default: *(int *)data = pgid; break; } return 0; } /* * Send signal to descriptor owner, either process or process group. */ void fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata) { ksiginfo_t ksi; KASSERT(!cpu_intr_p()); if (pgid == 0) { return; } KSI_INIT(&ksi); ksi.ksi_signo = signo; ksi.ksi_code = code; ksi.ksi_band = band; mutex_enter(&proc_lock); if (pgid > 0) { struct proc *p1; p1 = proc_find(pgid); if (p1 != NULL) { kpsignal(p1, &ksi, fdescdata); } } else { struct pgrp *pgrp; KASSERT(pgid < 0); pgrp = pgrp_find(-pgid); if (pgrp != NULL) { kpgsignal(pgrp, &ksi, fdescdata, 0); } } mutex_exit(&proc_lock); } int fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops, void *data) { fdfile_t *ff; filedesc_t *fdp; fp->f_flag = flag & FMASK; fdp = curproc->p_fd; ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd]; KASSERT(ff != NULL); ff->ff_exclose = (flag & O_CLOEXEC) != 0; fp->f_type = DTYPE_MISC; fp->f_ops = fops; fp->f_data = data; curlwp->l_dupfd = fd; fd_affix(curproc, fp, fd); return EMOVEFD; } int fnullop_fcntl(file_t *fp, u_int cmd, void *data) { if (cmd == F_SETFL) return 0; return EOPNOTSUPP; } int fnullop_poll(file_t *fp, int which) { return 0; } int fnullop_kqfilter(file_t *fp, struct knote *kn) { return EOPNOTSUPP; } void fnullop_restart(file_t *fp) { } int fbadop_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, int flags) { return EOPNOTSUPP; } int fbadop_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, int flags) { return EOPNOTSUPP; } int fbadop_ioctl(file_t *fp, u_long com, void *data) { return EOPNOTSUPP; } int fbadop_stat(file_t *fp, struct stat *sb) { return EOPNOTSUPP; } int fbadop_close(file_t *fp) { return EOPNOTSUPP; } /* * sysctl routines pertaining to file descriptors */ /* Initialized in sysctl_init() for now... */ extern kmutex_t sysctl_file_marker_lock; static u_int sysctl_file_marker = 1; /* * Expects to be called with proc_lock and sysctl_file_marker_lock locked. */ static void sysctl_file_marker_reset(void) { struct proc *p; PROCLIST_FOREACH(p, &allproc) { struct filedesc *fd = p->p_fd; fdtab_t *dt; u_int i; mutex_enter(&fd->fd_lock); dt = fd->fd_dt; for (i = 0; i < dt->dt_nfiles; i++) { struct file *fp; fdfile_t *ff; if ((ff = dt->dt_ff[i]) == NULL) { continue; } if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) { continue; } fp->f_marker = 0; } mutex_exit(&fd->fd_lock); } } /* * sysctl helper routine for kern.file pseudo-subtree. 
*/ static int sysctl_kern_file(SYSCTLFN_ARGS) { const bool allowaddr = get_expose_address(curproc); struct filelist flist; int error; size_t buflen; struct file *fp, fbuf; char *start, *where; struct proc *p; start = where = oldp; buflen = *oldlenp; if (where == NULL) { /* * overestimate by 10 files */ *oldlenp = sizeof(filehead) + (nfiles + 10) * sizeof(struct file); return 0; } /* * first sysctl_copyout filehead */ if (buflen < sizeof(filehead)) { *oldlenp = 0; return 0; } sysctl_unlock(); if (allowaddr) { memcpy(&flist, &filehead, sizeof(flist)); } else { memset(&flist, 0, sizeof(flist)); } error = sysctl_copyout(l, &flist, where, sizeof(flist)); if (error) { sysctl_relock(); return error; } buflen -= sizeof(flist); where += sizeof(flist); /* * followed by an array of file structures */ mutex_enter(&sysctl_file_marker_lock); mutex_enter(&proc_lock); PROCLIST_FOREACH(p, &allproc) { struct filedesc *fd; fdtab_t *dt; u_int i; if (p->p_stat == SIDL) { /* skip embryonic processes */ continue; } mutex_enter(p->p_lock); error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES), NULL, NULL); mutex_exit(p->p_lock); if (error != 0) { /* * Don't leak kauth retval if we're silently * skipping this entry. */ error = 0; continue; } /* * Grab a hold on the process. */ if (!rw_tryenter(&p->p_reflock, RW_READER)) { continue; } mutex_exit(&proc_lock); fd = p->p_fd; mutex_enter(&fd->fd_lock); dt = fd->fd_dt; for (i = 0; i < dt->dt_nfiles; i++) { fdfile_t *ff; if ((ff = dt->dt_ff[i]) == NULL) { continue; } if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) { continue; } mutex_enter(&fp->f_lock); if ((fp->f_count == 0) || (fp->f_marker == sysctl_file_marker)) { mutex_exit(&fp->f_lock); continue; } /* Check that we have enough space. */ if (buflen < sizeof(struct file)) { *oldlenp = where - start; mutex_exit(&fp->f_lock); error = ENOMEM; break; } fill_file(&fbuf, fp); mutex_exit(&fp->f_lock); error = sysctl_copyout(l, &fbuf, where, sizeof(fbuf)); if (error) { break; } buflen -= sizeof(struct file); where += sizeof(struct file); fp->f_marker = sysctl_file_marker; } mutex_exit(&fd->fd_lock); /* * Release reference to process. */ mutex_enter(&proc_lock); rw_exit(&p->p_reflock); if (error) break; } sysctl_file_marker++; /* Reset all markers if wrapped. */ if (sysctl_file_marker == 0) { sysctl_file_marker_reset(); sysctl_file_marker++; } mutex_exit(&proc_lock); mutex_exit(&sysctl_file_marker_lock); *oldlenp = where - start; sysctl_relock(); return error; } /* * sysctl helper function for kern.file2 */ static int sysctl_kern_file2(SYSCTLFN_ARGS) { struct proc *p; struct file *fp; struct filedesc *fd; struct kinfo_file kf; char *dp; u_int i, op; size_t len, needed, elem_size, out_size; int error, arg, elem_count; fdfile_t *ff; fdtab_t *dt; if (namelen == 1 && name[0] == CTL_QUERY) return sysctl_query(SYSCTLFN_CALL(rnode)); if (namelen != 4) return EINVAL; error = 0; dp = oldp; len = (oldp != NULL) ? *oldlenp : 0; op = name[0]; arg = name[1]; elem_size = name[2]; elem_count = name[3]; out_size = MIN(sizeof(kf), elem_size); needed = 0; if (elem_size < 1 || elem_count < 0) return EINVAL; switch (op) { case KERN_FILE_BYFILE: case KERN_FILE_BYPID: /* * We're traversing the process list in both cases; the BYFILE * case does additional work of keeping track of files already * looked at. 
*/ /* doesn't use arg so it must be zero */ if ((op == KERN_FILE_BYFILE) && (arg != 0)) return EINVAL; if ((op == KERN_FILE_BYPID) && (arg < -1)) /* -1 means all processes */ return EINVAL; sysctl_unlock(); if (op == KERN_FILE_BYFILE) mutex_enter(&sysctl_file_marker_lock); mutex_enter(&proc_lock); PROCLIST_FOREACH(p, &allproc) { if (p->p_stat == SIDL) { /* skip embryonic processes */ continue; } if (arg > 0 && p->p_pid != arg) { /* pick only the one we want */ /* XXX want 0 to mean "kernel files" */ continue; } mutex_enter(p->p_lock); error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES), NULL, NULL); mutex_exit(p->p_lock); if (error != 0) { /* * Don't leak kauth retval if we're silently * skipping this entry. */ error = 0; continue; } /* * Grab a hold on the process. */ if (!rw_tryenter(&p->p_reflock, RW_READER)) { continue; } mutex_exit(&proc_lock); fd = p->p_fd; mutex_enter(&fd->fd_lock); dt = fd->fd_dt; for (i = 0; i < dt->dt_nfiles; i++) { if ((ff = dt->dt_ff[i]) == NULL) { continue; } if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) { continue; } if ((op == KERN_FILE_BYFILE) && (fp->f_marker == sysctl_file_marker)) { continue; } if (len >= elem_size && elem_count > 0) { mutex_enter(&fp->f_lock); fill_file2(&kf, fp, ff, i, p->p_pid); mutex_exit(&fp->f_lock); mutex_exit(&fd->fd_lock); error = sysctl_copyout(l, &kf, dp, out_size); mutex_enter(&fd->fd_lock); if (error) break; dp += elem_size; len -= elem_size; } if (op == KERN_FILE_BYFILE) fp->f_marker = sysctl_file_marker; needed += elem_size; if (elem_count > 0 && elem_count != INT_MAX) elem_count--; } mutex_exit(&fd->fd_lock); /* * Release reference to process. */ mutex_enter(&proc_lock); rw_exit(&p->p_reflock); } if (op == KERN_FILE_BYFILE) { sysctl_file_marker++; /* Reset all markers if wrapped. 
*/ if (sysctl_file_marker == 0) { sysctl_file_marker_reset(); sysctl_file_marker++; } } mutex_exit(&proc_lock); if (op == KERN_FILE_BYFILE) mutex_exit(&sysctl_file_marker_lock); sysctl_relock(); break; default: return EINVAL; } if (oldp == NULL) needed += KERN_FILESLOP * elem_size; *oldlenp = needed; return error; } static void fill_file(struct file *fp, const struct file *fpsrc) { const bool allowaddr = get_expose_address(curproc); memset(fp, 0, sizeof(*fp)); fp->f_offset = fpsrc->f_offset; COND_SET_PTR(fp->f_cred, fpsrc->f_cred, allowaddr); COND_SET_CPTR(fp->f_ops, fpsrc->f_ops, allowaddr); COND_SET_STRUCT(fp->f_undata, fpsrc->f_undata, allowaddr); COND_SET_STRUCT(fp->f_list, fpsrc->f_list, allowaddr); fp->f_flag = fpsrc->f_flag; fp->f_marker = fpsrc->f_marker; fp->f_type = fpsrc->f_type; fp->f_advice = fpsrc->f_advice; fp->f_count = fpsrc->f_count; fp->f_msgcount = fpsrc->f_msgcount; fp->f_unpcount = fpsrc->f_unpcount; COND_SET_STRUCT(fp->f_unplist, fpsrc->f_unplist, allowaddr); } static void fill_file2(struct kinfo_file *kp, const file_t *fp, const fdfile_t *ff, int i, pid_t pid) { const bool allowaddr = get_expose_address(curproc); memset(kp, 0, sizeof(*kp)); COND_SET_VALUE(kp->ki_fileaddr, PTRTOUINT64(fp), allowaddr); kp->ki_flag = fp->f_flag; kp->ki_iflags = 0; kp->ki_ftype = fp->f_type; kp->ki_count = fp->f_count; kp->ki_msgcount = fp->f_msgcount; COND_SET_VALUE(kp->ki_fucred, PTRTOUINT64(fp->f_cred), allowaddr); kp->ki_fuid = kauth_cred_geteuid(fp->f_cred); kp->ki_fgid = kauth_cred_getegid(fp->f_cred); COND_SET_VALUE(kp->ki_fops, PTRTOUINT64(fp->f_ops), allowaddr); kp->ki_foffset = fp->f_offset; COND_SET_VALUE(kp->ki_fdata, PTRTOUINT64(fp->f_data), allowaddr); /* vnode information to glue this file to something */ if (fp->f_type == DTYPE_VNODE) { struct vnode *vp = fp->f_vnode; COND_SET_VALUE(kp->ki_vun, PTRTOUINT64(vp->v_un.vu_socket), allowaddr); kp->ki_vsize = vp->v_size; kp->ki_vtype = vp->v_type; kp->ki_vtag = vp->v_tag; COND_SET_VALUE(kp->ki_vdata, PTRTOUINT64(vp->v_data), allowaddr); } /* process information when retrieved via KERN_FILE_BYPID */ if (ff != NULL) { kp->ki_pid = pid; kp->ki_fd = i; kp->ki_ofileflags = ff->ff_exclose; kp->ki_usecount = ff->ff_refcnt; } }
/* $NetBSD: tty_pty.c,v 1.149 2021/10/11 01:07:36 thorpej Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)tty_pty.c 8.4 (Berkeley) 2/20/95 */ /* * Pseudo-teletype Driver * (Actually two drivers, requiring two entries in 'cdevsw') */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tty_pty.c,v 1.149 2021/10/11 01:07:36 thorpej Exp $"); #include "opt_ptm.h" #define TTY_ALLOW_PRIVATE #include <sys/param.h> #include <sys/systm.h> #include <sys/ioctl.h> #include <sys/ioctl_compat.h> #include <sys/proc.h> #include <sys/tty.h> #include <sys/stat.h> #include <sys/file.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/signalvar.h> #include <sys/uio.h> #include <sys/filedesc.h> #include <sys/conf.h> #include <sys/poll.h> #include <sys/pty.h> #include <sys/kauth.h> #include "ioconf.h" #define DEFAULT_NPTYS 16 /* default number of initial ptys */ #define DEFAULT_MAXPTYS 992 /* default maximum number of ptys */ #define BUFSIZ 100 /* Chunk size iomoved to/from user */ struct pt_softc { struct tty *pt_tty; int pt_flags; struct selinfo pt_selr, pt_selw; u_char pt_send; u_char pt_ucntl; }; static struct pt_softc **pt_softc = NULL; /* pty array */ static int maxptys = DEFAULT_MAXPTYS; /* maximum number of ptys (sysctable) */ kmutex_t pt_softc_mutex; int npty = 0; /* for pstat -t */ #define PF_PKT 0x08 /* packet mode */ #define PF_STOPPED 0x10 /* user told stopped */ #define PF_REMOTE 0x20 /* remote and flow controlled input */ #define PF_NOSTOP 0x40 #define PF_UCNTL 0x80 /* user control mode */ void ptcwakeup(struct tty *, int); void ptsstart(struct tty *); int pty_maxptys(int, int); static struct pt_softc **ptyarralloc(int); dev_type_open(ptcopen); dev_type_close(ptcclose); dev_type_read(ptcread); dev_type_write(ptcwrite); dev_type_poll(ptcpoll); dev_type_kqfilter(ptckqfilter); dev_type_open(ptsopen); dev_type_close(ptsclose); dev_type_read(ptsread); dev_type_write(ptswrite); dev_type_stop(ptsstop); dev_type_poll(ptspoll); dev_type_ioctl(ptyioctl); dev_type_tty(ptytty); const struct cdevsw ptc_cdevsw = { .d_open = ptcopen, .d_close = ptcclose, .d_read = ptcread, .d_write = ptcwrite, .d_ioctl = ptyioctl, .d_stop = nullstop, .d_tty = ptytty, .d_poll = ptcpoll, .d_mmap = nommap, .d_kqfilter = ptckqfilter, .d_discard = nodiscard, .d_flag = D_TTY }; const struct cdevsw pts_cdevsw = { .d_open = ptsopen, .d_close = ptsclose, .d_read = ptsread, .d_write = ptswrite, .d_ioctl = ptyioctl, .d_stop = ptsstop, .d_tty = ptytty, .d_poll = ptspoll, .d_mmap = nommap, .d_kqfilter = ttykqfilter, .d_discard = nodiscard, .d_flag = D_TTY }; #if defined(pmax) /* * Used by arch/pmax/conf/majors.pmax, which needs a second copy as it * needs to map this stuff to two pairs of majors. */ const struct cdevsw ptc_ultrix_cdevsw = { .d_open = ptcopen, .d_close = ptcclose, .d_read = ptcread, .d_write = ptcwrite, .d_ioctl = ptyioctl, .d_stop = nullstop, .d_tty = ptytty, .d_poll = ptcpoll, .d_mmap = nommap, .d_kqfilter = ptckqfilter, .d_discard = nodiscard, .d_flag = D_TTY }; const struct cdevsw pts_ultrix_cdevsw = { .d_open = ptsopen, .d_close = ptsclose, .d_read = ptsread, .d_write = ptswrite, .d_ioctl = ptyioctl, .d_stop = ptsstop, .d_tty = ptytty, .d_poll = ptspoll, .d_mmap = nommap, .d_kqfilter = ttykqfilter, .d_discard = nodiscard, .d_flag = D_TTY }; #endif /* defined(pmax) */ /* * Check if a pty is free to use. 
*/ int pty_isfree(int minor, int lock) { struct pt_softc *pt = pt_softc[minor]; if (lock) mutex_enter(&pt_softc_mutex); minor = pt == NULL || pt->pt_tty == NULL || pt->pt_tty->t_oproc == NULL; if (lock) mutex_exit(&pt_softc_mutex); return minor; } /* * Allocate and zero array of nelem elements. */ static struct pt_softc ** ptyarralloc(int nelem) { struct pt_softc **pt; nelem += 10; pt = kmem_zalloc(nelem * sizeof(*pt), KM_SLEEP); return pt; } static void ptyarrfree(struct pt_softc **pt, int nelem) { nelem += 10; kmem_free(pt, nelem * sizeof(*pt)); } /* * Check if the minor is correct and ensure necessary structures * are properly allocated. */ int pty_check(int ptn) { struct pt_softc *pti; if (ptn >= npty) { struct pt_softc **newpt, **oldpt; int newnpty; int oldnpty; /* check if the requested pty can be granted */ if (ptn >= maxptys) { limit_reached: tablefull("pty", "increase kern.maxptys"); return ENXIO; } /* Allocate a larger pty array */ for (newnpty = npty; newnpty <= ptn;) newnpty *= 2; if (newnpty > maxptys) newnpty = maxptys; newpt = ptyarralloc(newnpty); /* * Now grab the pty array mutex - we need to ensure * that the pty array is consistent while copying its * content to newly allocated, larger space; we also * need to be safe against pty_maxptys(). */ mutex_enter(&pt_softc_mutex); if (newnpty >= maxptys) { /* limit cut away beneath us... */ if (ptn >= maxptys) { mutex_exit(&pt_softc_mutex); ptyarrfree(newpt, newnpty); goto limit_reached; } newnpty = maxptys; } /* * If the pty array was not enlarged while we were waiting * for mutex, copy current contents of pt_softc[] to newly * allocated array and start using the new bigger array. */ if (newnpty > npty) { memcpy(newpt, pt_softc, npty*sizeof(struct pt_softc *)); oldpt = pt_softc; oldnpty = npty; pt_softc = newpt; npty = newnpty; } else { /* was enlarged when waited for lock, free new space */ oldpt = newpt; oldnpty = newnpty; } mutex_exit(&pt_softc_mutex); ptyarrfree(oldpt, oldnpty); } /* * If the entry is not yet allocated, allocate one. The mutex is * needed so that the state of pt_softc[] array is consistant * in case it has been lengthened above. */ if (!pt_softc[ptn]) { pti = kmem_zalloc(sizeof(*pti), KM_SLEEP); selinit(&pti->pt_selr); selinit(&pti->pt_selw); pti->pt_tty = tty_alloc(); mutex_enter(&pt_softc_mutex); /* * Check the entry again - it might have been * added while we were waiting for mutex. */ if (pt_softc[ptn]) { mutex_exit(&pt_softc_mutex); tty_free(pti->pt_tty); seldestroy(&pti->pt_selr); seldestroy(&pti->pt_selw); kmem_free(pti, sizeof(*pti)); return 0; } tty_attach(pti->pt_tty); pt_softc[ptn] = pti; mutex_exit(&pt_softc_mutex); } return 0; } /* * Set maxpty in thread-safe way. Returns 0 in case of error, otherwise * new value of maxptys. */ int pty_maxptys(int newmax, int set) { if (!set) return maxptys; /* * We have to grab the pt_softc lock, so that we would pick correct * value of npty (might be modified in pty_check()). */ mutex_enter(&pt_softc_mutex); /* * The value cannot be set to value lower than the highest pty * number ever allocated. */ if (newmax >= npty) maxptys = newmax; else newmax = 0; mutex_exit(&pt_softc_mutex); return newmax; } /* * Establish n (or default if n is 1) ptys in the system. */ void ptyattach(int n) { mutex_init(&pt_softc_mutex, MUTEX_DEFAULT, IPL_NONE); /* maybe should allow 0 => none? 
*/ if (n <= 1) n = DEFAULT_NPTYS; pt_softc = ptyarralloc(n); npty = n; #ifndef NO_DEV_PTM ptmattach(1); #endif } /*ARGSUSED*/ int ptsopen(dev_t dev, int flag, int devtype, struct lwp *l) { struct pt_softc *pti; struct tty *tp; int error; int ptn = minor(dev); if ((error = pty_check(ptn)) != 0) return error; mutex_spin_enter(&tty_lock); pti = pt_softc[ptn]; tp = pti->pt_tty; if (!ISSET(tp->t_state, TS_ISOPEN)) { tp->t_dev = dev; ttychars(tp); /* Set up default chars */ tp->t_iflag = TTYDEF_IFLAG; tp->t_oflag = TTYDEF_OFLAG; tp->t_lflag = TTYDEF_LFLAG; tp->t_cflag = TTYDEF_CFLAG; tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; ttsetwater(tp); /* would be done in xxparam() */ } else if (kauth_authorize_device_tty(l->l_cred, KAUTH_DEVICE_TTY_OPEN, tp) != 0) { mutex_spin_exit(&tty_lock); return EBUSY; } if (tp->t_oproc) /* Ctrlr still around. */ SET(tp->t_state, TS_CARR_ON); if (!ISSET(flag, O_NONBLOCK)) { while (!ISSET(tp->t_state, TS_CARR_ON)) { tp->t_wopen++; error = ttysleep(tp, &tp->t_rawcv, true, 0); tp->t_wopen--; if (error != 0) { mutex_spin_exit(&tty_lock); return error; } } } mutex_spin_exit(&tty_lock); error = (*tp->t_linesw->l_open)(dev, tp); ptcwakeup(tp, FREAD|FWRITE); return error; } int ptsclose(dev_t dev, int flag, int mode, struct lwp *l) { struct pt_softc *pti = pt_softc[minor(dev)]; struct tty *tp = pti->pt_tty; int error; error = (*tp->t_linesw->l_close)(tp, flag); error |= ttyclose(tp); ptcwakeup(tp, FREAD|FWRITE); return error; } int ptsread(dev_t dev, struct uio *uio, int flag) { struct proc *p = curproc; struct pt_softc *pti = pt_softc[minor(dev)]; struct tty *tp = pti->pt_tty; int error = 0; int cc, c; again: if (pti->pt_flags & PF_REMOTE) { mutex_spin_enter(&tty_lock); while (isbackground(p, tp)) { /* XXXSMP */ if (sigismasked(curlwp, SIGTTIN) || p->p_pgrp->pg_jobc == 0 || p->p_lflag & PL_PPWAIT) { mutex_spin_exit(&tty_lock); return EIO; } ttysig(tp, TTYSIG_PG1, SIGTTIN); error = ttypause(tp, hz); if (error != 0) { mutex_spin_exit(&tty_lock); return error; } } if (tp->t_canq.c_cc == 0) { if (flag & IO_NDELAY) { mutex_spin_exit(&tty_lock); return EWOULDBLOCK; } error = ttysleep(tp, &tp->t_cancv, true, 0); mutex_spin_exit(&tty_lock); if (error != 0) return error; goto again; } while(error == 0 && tp->t_canq.c_cc > 1 && uio->uio_resid > 0) { c = getc(&tp->t_canq); mutex_spin_exit(&tty_lock); error = ureadc(c, uio); mutex_spin_enter(&tty_lock); /* Re-check terminal state here? */ } if (tp->t_canq.c_cc == 1) (void) getc(&tp->t_canq); cc = tp->t_canq.c_cc; mutex_spin_exit(&tty_lock); if (cc) return error; } else if (tp->t_oproc) error = (*tp->t_linesw->l_read)(tp, uio, flag); ptcwakeup(tp, FWRITE); return error; } /* * Write to pseudo-tty. * Wakeups of controlling tty will happen * indirectly, when tty driver calls ptsstart. */ int ptswrite(dev_t dev, struct uio *uio, int flag) { struct pt_softc *pti = pt_softc[minor(dev)]; struct tty *tp = pti->pt_tty; if (tp->t_oproc == NULL) return EIO; return (*tp->t_linesw->l_write)(tp, uio, flag); } /* * Poll pseudo-tty. */ int ptspoll(dev_t dev, int events, struct lwp *l) { struct pt_softc *pti = pt_softc[minor(dev)]; struct tty *tp = pti->pt_tty; if (tp->t_oproc == NULL) return POLLHUP; return (*tp->t_linesw->l_poll)(tp, events, l); } /* * Start output on pseudo-tty. * Wake up process polling or sleeping for input from controlling tty. 
*/ void ptsstart(struct tty *tp) { struct pt_softc *pti; KASSERT(tp->t_dev != NODEV); pti = pt_softc[minor(tp->t_dev)]; KASSERT(mutex_owned(&tty_lock)); if (ISSET(tp->t_state, TS_TTSTOP)) return; if (pti->pt_flags & PF_STOPPED) { pti->pt_flags &= ~PF_STOPPED; pti->pt_send = TIOCPKT_START; } selnotify(&pti->pt_selr, 0, NOTE_SUBMIT); cv_broadcast(&tp->t_outcvf); } /* * Stop output. */ void ptsstop(struct tty *tp, int flush) { struct pt_softc *pti; KASSERT(tp->t_dev != NODEV); pti = pt_softc[minor(tp->t_dev)]; KASSERT(mutex_owned(&tty_lock)); /* note: FLUSHREAD and FLUSHWRITE already ok */ CTASSERT(TIOCPKT_FLUSHREAD == FREAD); CTASSERT(TIOCPKT_FLUSHWRITE == FWRITE); if (flush == 0) { flush = TIOCPKT_STOP; pti->pt_flags |= PF_STOPPED; } else pti->pt_flags &= ~PF_STOPPED; pti->pt_send |= flush; /* change of perspective */ if (flush & FREAD) { selnotify(&pti->pt_selw, 0, NOTE_SUBMIT); cv_broadcast(&tp->t_rawcvf); } if (flush & FWRITE) { selnotify(&pti->pt_selr, 0, NOTE_SUBMIT); cv_broadcast(&tp->t_outcvf); } } void ptcwakeup(struct tty *tp, int flag) { struct pt_softc *pti; if (tp->t_dev == NODEV) return; /* client side not open yet */ pti = pt_softc[minor(tp->t_dev)]; KASSERT(pti != NULL); mutex_spin_enter(&tty_lock); if (flag & FREAD) { selnotify(&pti->pt_selr, 0, NOTE_SUBMIT); cv_broadcast(&tp->t_outcvf); } if (flag & FWRITE) { selnotify(&pti->pt_selw, 0, NOTE_SUBMIT); cv_broadcast(&tp->t_rawcvf); } mutex_spin_exit(&tty_lock); } /*ARGSUSED*/ int ptcopen(dev_t dev, int flag, int devtype, struct lwp *l) { struct pt_softc *pti; struct tty *tp; int error; int ptn = minor(dev); if ((error = pty_check(ptn)) != 0) return error; pti = pt_softc[ptn]; tp = pti->pt_tty; mutex_spin_enter(&tty_lock); if (tp->t_oproc) { mutex_spin_exit(&tty_lock); return EIO; } tp->t_dev = dev; tp->t_oproc = ptsstart; mutex_spin_exit(&tty_lock); (void)(*tp->t_linesw->l_modem)(tp, 1); CLR(tp->t_lflag, EXTPROC); pti->pt_flags = 0; pti->pt_send = 0; pti->pt_ucntl = 0; return 0; } /*ARGSUSED*/ int ptcclose(dev_t dev, int flag, int devtype, struct lwp *l) { struct pt_softc *pti = pt_softc[minor(dev)]; struct tty *tp = pti->pt_tty; (void)(*tp->t_linesw->l_modem)(tp, 0); mutex_spin_enter(&tty_lock); CLR(tp->t_state, TS_CARR_ON); tp->t_oproc = NULL; /* mark closed */ mutex_spin_exit(&tty_lock); return 0; } int ptcread(dev_t dev, struct uio *uio, int flag) { struct pt_softc *pti = pt_softc[minor(dev)]; struct tty *tp = pti->pt_tty; u_char bf[BUFSIZ]; int error = 0, cc; int c; if (uio->uio_resid <= 0) return EINVAL; /* * We want to block until the slave * is open, and there's something to read; * but if we lost the slave or we're NBIO, * then return the appropriate error instead. */ mutex_spin_enter(&tty_lock); for (;;) { if (ISSET(tp->t_state, TS_ISOPEN)) { if (pti->pt_flags & PF_PKT && (c = pti->pt_send)) { pti->pt_send = 0; mutex_spin_exit(&tty_lock); error = ureadc(c, uio); if (error != 0) return error; /* * Since we don't have the tty locked, there's * a risk of messing up `t_termios'. This is * relevant only if the tty got closed and then * opened again while we were out uiomoving. 
*/ if (c & TIOCPKT_IOCTL) { cc = uimin(uio->uio_resid, sizeof(tp->t_termios)); uiomove((void *) &tp->t_termios, cc, uio); } return 0; } if (pti->pt_flags & PF_UCNTL && (c = pti->pt_ucntl)) { pti->pt_ucntl = 0; mutex_spin_exit(&tty_lock); error = ureadc(c, uio); if (error != 0) return error; return 0; } if (tp->t_outq.c_cc && !ISSET(tp->t_state, TS_TTSTOP)) break; } if (!ISSET(tp->t_state, TS_CARR_ON)) { error = 0; /* EOF */ goto out; } if (flag & IO_NDELAY) { error = EWOULDBLOCK; goto out; } error = cv_wait_sig(&tp->t_outcvf, &tty_lock); if (error != 0) goto out; } if (pti->pt_flags & (PF_PKT|PF_UCNTL)) { mutex_spin_exit(&tty_lock); error = ureadc(0, uio); mutex_spin_enter(&tty_lock); if (error == 0 && !ISSET(tp->t_state, TS_ISOPEN)) error = EIO; } while (uio->uio_resid > 0 && error == 0) { cc = q_to_b(&tp->t_outq, bf, uimin(uio->uio_resid, BUFSIZ)); if (cc <= 0) break; mutex_spin_exit(&tty_lock); error = uiomove(bf, cc, uio); mutex_spin_enter(&tty_lock); if (error == 0 && !ISSET(tp->t_state, TS_ISOPEN)) error = EIO; } ttypull(tp); out: mutex_spin_exit(&tty_lock); return error; } int ptcwrite(dev_t dev, struct uio *uio, int flag) { struct pt_softc *pti = pt_softc[minor(dev)]; struct tty *tp = pti->pt_tty; u_char *cp = NULL; int cc = 0; u_char locbuf[BUFSIZ]; int cnt = 0; int error = 0; again: mutex_spin_enter(&tty_lock); if (!ISSET(tp->t_state, TS_ISOPEN)) goto block; if (pti->pt_flags & PF_REMOTE) { if (tp->t_canq.c_cc) goto block; while (uio->uio_resid > 0 && tp->t_canq.c_cc < TTYHOG) { if (cc == 0) { cc = uimin(uio->uio_resid, BUFSIZ); cc = uimin(cc, TTYHOG - tp->t_canq.c_cc); cp = locbuf; mutex_spin_exit(&tty_lock); error = uiomove(cp, cc, uio); if (error != 0) return error; mutex_spin_enter(&tty_lock); /* check again for safety */ if (!ISSET(tp->t_state, TS_ISOPEN)) { /* * adjust for data copied in but not * written */ uio->uio_resid += cc; error = EIO; goto out; } } if (cc) { cc = b_to_q(cp, cc, &tp->t_outq); if (cc > 0) goto block; } } (void) putc(0, &tp->t_canq); ttwakeup(tp); cv_broadcast(&tp->t_cancv); error = 0; goto out; } while (uio->uio_resid > 0) { if (cc == 0) { cc = uimin(uio->uio_resid, BUFSIZ); cp = locbuf; mutex_spin_exit(&tty_lock); error = uiomove(cp, cc, uio); if (error != 0) return error; mutex_spin_enter(&tty_lock); /* check again for safety */ if (!ISSET(tp->t_state, TS_ISOPEN)) { /* adjust for data copied in but not written */ uio->uio_resid += cc; error = EIO; goto out; } } while (cc > 0) { int used = tp->t_rawq.c_cc + tp->t_canq.c_cc; int canon = ISSET(tp->t_lflag, ICANON) ? 1 : 0; /* * We need space for 2 characters if canonical * because we might need to print ^C */ if (used >= (TTYHOG - canon) && (tp->t_canq.c_cc > 0 || !canon)) { cv_broadcast(&tp->t_rawcv); goto block; } /* * XXX - should change l_rint to be called with lock * see also tty.c:ttyinput_wlock() */ mutex_spin_exit(&tty_lock); (*tp->t_linesw->l_rint)(*cp++, tp); mutex_spin_enter(&tty_lock); cnt++; cc--; } } error = 0; goto out; block: /* * Come here to wait for slave to open, for space * in outq, or space in rawq. */ if (!ISSET(tp->t_state, TS_CARR_ON)) { /* adjust for data copied in but not written */ uio->uio_resid += cc; error = EIO; goto out; } if (flag & IO_NDELAY) { /* adjust for data copied in but not written */ uio->uio_resid += cc; error = cnt == 0 ? 
EWOULDBLOCK : 0; goto out; } error = cv_wait_sig(&tp->t_rawcvf, &tty_lock); mutex_spin_exit(&tty_lock); if (error != 0) { /* adjust for data copied in but not written */ uio->uio_resid += cc; return error; } goto again; out: mutex_spin_exit(&tty_lock); return error; } int ptcpoll(dev_t dev, int events, struct lwp *l) { struct pt_softc *pti = pt_softc[minor(dev)]; struct tty *tp = pti->pt_tty; int revents = 0; mutex_spin_enter(&tty_lock); if (events & (POLLIN | POLLRDNORM)) if (ISSET(tp->t_state, TS_ISOPEN) && ((tp->t_outq.c_cc > 0 && !ISSET(tp->t_state, TS_TTSTOP)) || ((pti->pt_flags & PF_PKT) && pti->pt_send) || ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl))) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (ISSET(tp->t_state, TS_ISOPEN) && ((pti->pt_flags & PF_REMOTE) ? (tp->t_canq.c_cc == 0) : ((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG-2) || (tp->t_canq.c_cc == 0 && ISSET(tp->t_lflag, ICANON))))) revents |= events & (POLLOUT | POLLWRNORM); if (events & POLLHUP) if (!ISSET(tp->t_state, TS_CARR_ON)) revents |= POLLHUP; if (revents == 0) { if (events & (POLLIN | POLLHUP | POLLRDNORM)) selrecord(l, &pti->pt_selr); if (events & (POLLOUT | POLLWRNORM)) selrecord(l, &pti->pt_selw); } mutex_spin_exit(&tty_lock); return revents; } static void filt_ptcrdetach(struct knote *kn) { struct pt_softc *pti; pti = kn->kn_hook; mutex_spin_enter(&tty_lock); selremove_knote(&pti->pt_selr, kn); mutex_spin_exit(&tty_lock); } static int filt_ptcread(struct knote *kn, long hint) { struct pt_softc *pti; struct tty *tp; int canread; pti = kn->kn_hook; tp = pti->pt_tty; if ((hint & NOTE_SUBMIT) == 0) { mutex_spin_enter(&tty_lock); } canread = (ISSET(tp->t_state, TS_ISOPEN) && ((tp->t_outq.c_cc > 0 && !ISSET(tp->t_state, TS_TTSTOP)) || ((pti->pt_flags & PF_PKT) && pti->pt_send) || ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl))); if (canread) { /* * c_cc is number of characters after output post-processing; * the amount of data actually read(2) depends on * setting of input flags for the terminal. */ kn->kn_data = tp->t_outq.c_cc; if (((pti->pt_flags & PF_PKT) && pti->pt_send) || ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl)) kn->kn_data++; } if (!ISSET(tp->t_state, TS_CARR_ON)) { knote_set_eof(kn, 0); canread = 1; } if ((hint & NOTE_SUBMIT) == 0) { mutex_spin_exit(&tty_lock); } return canread; } static void filt_ptcwdetach(struct knote *kn) { struct pt_softc *pti; pti = kn->kn_hook; mutex_spin_enter(&tty_lock); selremove_knote(&pti->pt_selw, kn); mutex_spin_exit(&tty_lock); } static int filt_ptcwrite(struct knote *kn, long hint) { struct pt_softc *pti; struct tty *tp; int canwrite; int nwrite; pti = kn->kn_hook; tp = pti->pt_tty; if ((hint & NOTE_SUBMIT) == 0) { mutex_spin_enter(&tty_lock); } canwrite = (ISSET(tp->t_state, TS_ISOPEN) && ((pti->pt_flags & PF_REMOTE) ? 
(tp->t_canq.c_cc == 0) : ((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG-2) || (tp->t_canq.c_cc == 0 && ISSET(tp->t_lflag, ICANON))))); if (canwrite) { if (pti->pt_flags & PF_REMOTE) nwrite = tp->t_canq.c_cn; else { /* this is guaranteed to be > 0 due to above check */ nwrite = tp->t_canq.c_cn - (tp->t_rawq.c_cc + tp->t_canq.c_cc); } kn->kn_data = nwrite; } if ((hint & NOTE_SUBMIT) == 0) { mutex_spin_exit(&tty_lock); } return canwrite; } static const struct filterops ptcread_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_ptcrdetach, .f_event = filt_ptcread, }; static const struct filterops ptcwrite_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_ptcwdetach, .f_event = filt_ptcwrite, }; int ptckqfilter(dev_t dev, struct knote *kn) { struct pt_softc *pti = pt_softc[minor(dev)]; struct selinfo *sip; switch (kn->kn_filter) { case EVFILT_READ: sip = &pti->pt_selr; kn->kn_fop = &ptcread_filtops; break; case EVFILT_WRITE: sip = &pti->pt_selw; kn->kn_fop = &ptcwrite_filtops; break; default: return EINVAL; } kn->kn_hook = pti; mutex_spin_enter(&tty_lock); selrecord_knote(sip, kn); mutex_spin_exit(&tty_lock); return 0; } struct tty * ptytty(dev_t dev) { struct pt_softc *pti = pt_softc[minor(dev)]; struct tty *tp = pti->pt_tty; return tp; } /*ARGSUSED*/ int ptyioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) { struct pt_softc *pti = pt_softc[minor(dev)]; struct tty *tp = pti->pt_tty; const struct cdevsw *cdev; u_char *cc = tp->t_cc; int stop, error, sig; #ifndef NO_DEV_PTM struct mount *mp; #endif /* * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG. * ttywflush(tp) will hang if there are characters in the outq. */ if (cmd == TIOCEXT) { /* * When the EXTPROC bit is being toggled, we need * to send an TIOCPKT_IOCTL if the packet driver * is turned on. */ if (*(int *)data) { if (pti->pt_flags & PF_PKT) { pti->pt_send |= TIOCPKT_IOCTL; ptcwakeup(tp, FREAD); } SET(tp->t_lflag, EXTPROC); } else { if (ISSET(tp->t_lflag, EXTPROC) && (pti->pt_flags & PF_PKT)) { pti->pt_send |= TIOCPKT_IOCTL; ptcwakeup(tp, FREAD); } CLR(tp->t_lflag, EXTPROC); } return(0); } #ifndef NO_DEV_PTM /* Allow getting the name from either the master or the slave */ if (cmd == TIOCPTSNAME) { if ((error = pty_getmp(l, &mp)) != 0) return error; return pty_fill_ptmget(l, dev, -1, -1, data, mp); } #endif cdev = cdevsw_lookup(dev); if (cdev != NULL && cdev->d_open == ptcopen) switch (cmd) { #ifndef NO_DEV_PTM case TIOCGRANTPT: if ((error = pty_getmp(l, &mp)) != 0) return error; return pty_grant_slave(l, dev, mp); #endif case TIOCGPGRP: /* * We avoid calling ttioctl on the controller since, * in that case, tp must be the controlling terminal. */ *(int *)data = tp->t_pgrp ? 
tp->t_pgrp->pg_id : 0; return 0; case TIOCPKT: if (*(int *)data) { if (pti->pt_flags & PF_UCNTL) return EINVAL; pti->pt_flags |= PF_PKT; } else pti->pt_flags &= ~PF_PKT; return 0; case TIOCUCNTL: if (*(int *)data) { if (pti->pt_flags & PF_PKT) return EINVAL; pti->pt_flags |= PF_UCNTL; } else pti->pt_flags &= ~PF_UCNTL; return 0; case TIOCREMOTE: if (*(int *)data) pti->pt_flags |= PF_REMOTE; else pti->pt_flags &= ~PF_REMOTE; mutex_spin_enter(&tty_lock); ttyflush(tp, FREAD|FWRITE); mutex_spin_exit(&tty_lock); return 0; case TIOCSETP: case TIOCSETN: case TIOCSETD: case TIOCSETA: case TIOCSETAW: case TIOCSETAF: mutex_spin_enter(&tty_lock); ndflush(&tp->t_outq, tp->t_outq.c_cc); mutex_spin_exit(&tty_lock); break; case TIOCSIG: sig = (int)(long)*(void **)data; if (sig <= 0 || sig >= NSIG) return EINVAL; mutex_spin_enter(&tty_lock); if (!ISSET(tp->t_lflag, NOFLSH)) ttyflush(tp, FREAD|FWRITE); tp->t_state |= TS_SIGINFO; ttysig(tp, TTYSIG_PG1, sig); mutex_spin_exit(&tty_lock); return 0; case FIONREAD: mutex_spin_enter(&tty_lock); *(int *)data = tp->t_outq.c_cc; mutex_spin_exit(&tty_lock); return 0; } error = (*tp->t_linesw->l_ioctl)(tp, cmd, data, flag, l); if (error == EPASSTHROUGH) error = ttioctl(tp, cmd, data, flag, l); if (error == EPASSTHROUGH) { if (pti->pt_flags & PF_UCNTL && (cmd & ~0xff) == UIOCCMD(0)) { if (cmd & 0xff) { pti->pt_ucntl = (u_char)cmd; ptcwakeup(tp, FREAD); } return 0; } } /* * If external processing and packet mode send ioctl packet. */ if (ISSET(tp->t_lflag, EXTPROC) && (pti->pt_flags & PF_PKT)) { switch(cmd) { case TIOCSETA: case TIOCSETAW: case TIOCSETAF: case TIOCSETP: case TIOCSETN: case TIOCSETC: case TIOCSLTC: case TIOCLBIS: case TIOCLBIC: case TIOCLSET: pti->pt_send |= TIOCPKT_IOCTL; ptcwakeup(tp, FREAD); default: break; } } stop = ISSET(tp->t_iflag, IXON) && CCEQ(cc[VSTOP], CTRL('s')) && CCEQ(cc[VSTART], CTRL('q')); if (pti->pt_flags & PF_NOSTOP) { if (stop) { pti->pt_send &= ~TIOCPKT_NOSTOP; pti->pt_send |= TIOCPKT_DOSTOP; pti->pt_flags &= ~PF_NOSTOP; ptcwakeup(tp, FREAD); } } else { if (!stop) { pti->pt_send &= ~TIOCPKT_DOSTOP; pti->pt_send |= TIOCPKT_NOSTOP; pti->pt_flags |= PF_NOSTOP; ptcwakeup(tp, FREAD); } } return error; }
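/*
 * Illustrative userland sketch (not part of the driver): packet mode as
 * implemented by ptcread() and ptyioctl() above.  Once TIOCPKT is enabled
 * on the master side, every read is prefixed with one status byte: 0
 * (TIOCPKT_DATA) for ordinary slave output, or a mask of TIOCPKT_* bits
 * queued via pti->pt_send.  The device path is only an example; openpty(3)
 * is the usual way to obtain a master/slave pair.
 */
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void
drain_packet_mode(const char *master_path)
{
	char buf[128];
	ssize_t n;
	int on = 1;
	int fd = open(master_path, O_RDWR);

	if (fd == -1)
		return;
	if (ioctl(fd, TIOCPKT, &on) == -1) {	/* sets PF_PKT in pt_flags */
		close(fd);
		return;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0) {
		if (buf[0] == TIOCPKT_DATA)	/* status byte, then data */
			fwrite(buf + 1, 1, (size_t)n - 1, stdout);
		else if (buf[0] & TIOCPKT_FLUSHWRITE)
			fprintf(stderr, "slave output was flushed\n");
	}
	close(fd);
}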
/* $NetBSD: x86_machdep.c,v 1.154 2023/10/04 20:28:06 ad Exp $ */ /*- * Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi, * Copyright (c) 2005, 2008, 2009, 2019, 2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, and Andrew Doran.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.154 2023/10/04 20:28:06 ad Exp $"); #include "opt_modular.h" #include "opt_physmem.h" #include "opt_splash.h" #include "opt_kaslr.h" #include "opt_svs.h" #include "opt_xen.h" #include <sys/types.h> #include <sys/param.h> #include <sys/systm.h> #include <sys/kcore.h> #include <sys/errno.h> #include <sys/kauth.h> #include <sys/mutex.h> #include <sys/cpu.h> #include <sys/intr.h> #include <sys/atomic.h> #include <sys/module.h> #include <sys/sysctl.h> #include <sys/extent.h> #include <sys/rnd.h> #include <x86/bootspace.h> #include <x86/cpuvar.h> #include <x86/cputypes.h> #include <x86/efi.h> #include <x86/machdep.h> #include <x86/nmi.h> #include <x86/pio.h> #include <dev/splash/splash.h> #include <dev/isa/isareg.h> #include <dev/ic/i8042reg.h> #include <dev/mm.h> #include <machine/bootinfo.h> #include <machine/pmap_private.h> #include <machine/vmparam.h> #include <uvm/uvm_extern.h> #include "tsc.h" #include "acpica.h" #include "ioapic.h" #include "lapic.h" #if NACPICA > 0 #include <dev/acpi/acpivar.h> #endif #if NIOAPIC > 0 || NACPICA > 0 #include <machine/i82093var.h> #endif #include "opt_md.h" #if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC) #include <dev/md.h> #endif void (*x86_cpu_idle)(void); static bool x86_cpu_idle_ipi; static char x86_cpu_idle_text[16]; static bool x86_user_ldt_enabled __read_mostly = false; #ifdef XEN #include <xen/xen.h> #include <xen/hypervisor.h> #endif #ifndef XENPV void (*delay_func)(unsigned int) = i8254_delay; void (*x86_initclock_func)(void) = i8254_initclocks; #else /* XENPV */ void (*delay_func)(unsigned int) = xen_delay; void (*x86_initclock_func)(void) = xen_initclocks; #endif /* --------------------------------------------------------------------- */ /* * Main bootinfo structure. This is filled in by the bootstrap process * done in locore.S based on the information passed by the boot loader. 
*/ struct bootinfo bootinfo; /* --------------------------------------------------------------------- */ bool bootmethod_efi; static kauth_listener_t x86_listener; extern paddr_t lowmem_rsvd, avail_start, avail_end; vaddr_t msgbuf_vaddr; struct msgbuf_p_seg msgbuf_p_seg[VM_PHYSSEG_MAX]; unsigned int msgbuf_p_cnt = 0; void init_x86_msgbuf(void); /* * Given the type of a bootinfo entry, looks for a matching item inside * the bootinfo structure. If found, returns a pointer to it (which must * then be casted to the appropriate bootinfo_* type); otherwise, returns * NULL. */ void * lookup_bootinfo(int type) { bool found; int i; struct btinfo_common *bic; bic = (struct btinfo_common *)(bootinfo.bi_data); found = FALSE; for (i = 0; i < bootinfo.bi_nentries && !found; i++) { if (bic->type == type) found = TRUE; else bic = (struct btinfo_common *) ((uint8_t *)bic + bic->len); } return found ? bic : NULL; } #ifdef notyet /* * List the available bootinfo entries. */ static const char *btinfo_str[] = { BTINFO_STR }; void aprint_bootinfo(void) { int i; struct btinfo_common *bic; aprint_normal("bootinfo:"); bic = (struct btinfo_common *)(bootinfo.bi_data); for (i = 0; i < bootinfo.bi_nentries; i++) { if (bic->type >= 0 && bic->type < __arraycount(btinfo_str)) aprint_normal(" %s", btinfo_str[bic->type]); else aprint_normal(" %d", bic->type); bic = (struct btinfo_common *) ((uint8_t *)bic + bic->len); } aprint_normal("\n"); } #endif /* * mm_md_physacc: check if given pa is accessible. */ int mm_md_physacc(paddr_t pa, vm_prot_t prot) { extern phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX]; extern int mem_cluster_cnt; int i; for (i = 0; i < mem_cluster_cnt; i++) { const phys_ram_seg_t *seg = &mem_clusters[i]; paddr_t lstart = seg->start; if (lstart <= pa && pa - lstart <= seg->size) { return 0; } } return kauth_authorize_machdep(kauth_cred_get(), KAUTH_MACHDEP_UNMANAGEDMEM, NULL, NULL, NULL, NULL); } #ifdef MODULAR /* * Push any modules loaded by the boot loader. 
*/ void module_init_md(void) { struct btinfo_modulelist *biml; struct bi_modulelist_entry *bi, *bimax; biml = lookup_bootinfo(BTINFO_MODULELIST); if (biml == NULL) { aprint_debug("No module info at boot\n"); return; } bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml)); bimax = bi + biml->num; for (; bi < bimax; bi++) { switch (bi->type) { case BI_MODULE_ELF: aprint_debug("Prep module path=%s len=%d pa=%x\n", bi->path, bi->len, bi->base); KASSERT(trunc_page(bi->base) == bi->base); module_prime(bi->path, #ifdef KASLR (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base), #else (void *)((uintptr_t)bi->base + KERNBASE), #endif bi->len); break; case BI_MODULE_IMAGE: #ifdef SPLASHSCREEN aprint_debug("Splash image path=%s len=%d pa=%x\n", bi->path, bi->len, bi->base); KASSERT(trunc_page(bi->base) == bi->base); splash_setimage( #ifdef KASLR (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base), #else (void *)((uintptr_t)bi->base + KERNBASE), #endif bi->len); #endif break; case BI_MODULE_RND: /* handled in x86_rndseed */ break; case BI_MODULE_FS: aprint_debug("File-system image path=%s len=%d pa=%x\n", bi->path, bi->len, bi->base); KASSERT(trunc_page(bi->base) == bi->base); #if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC) md_root_setconf( #ifdef KASLR (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base), #else (void *)((uintptr_t)bi->base + KERNBASE), #endif bi->len); #endif break; default: aprint_debug("Skipping non-ELF module\n"); break; } } } #endif /* MODULAR */ void x86_rndseed(void) { struct btinfo_modulelist *biml; struct bi_modulelist_entry *bi, *bimax; biml = lookup_bootinfo(BTINFO_MODULELIST); if (biml == NULL) { aprint_debug("No module info at boot\n"); return; } bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml)); bimax = bi + biml->num; for (; bi < bimax; bi++) { switch (bi->type) { case BI_MODULE_RND: aprint_debug("Random seed data path=%s len=%d pa=%x\n", bi->path, bi->len, bi->base); KASSERT(trunc_page(bi->base) == bi->base); rnd_seed( #ifdef KASLR (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base), #else (void *)((uintptr_t)bi->base + KERNBASE), #endif bi->len); } } } void cpu_need_resched(struct cpu_info *ci, struct lwp *l, int flags) { KASSERT(kpreempt_disabled()); if ((flags & RESCHED_IDLE) != 0) { if ((flags & RESCHED_REMOTE) != 0 && x86_cpu_idle_ipi != false) { cpu_kick(ci); } return; } #ifdef __HAVE_PREEMPTION if ((flags & RESCHED_KPREEMPT) != 0) { if ((flags & RESCHED_REMOTE) != 0) { #ifdef XENPV xen_send_ipi(ci, XEN_IPI_KPREEMPT); #else x86_send_ipi(ci, X86_IPI_KPREEMPT); #endif } else { softint_trigger(1 << SIR_PREEMPT); } return; } #endif KASSERT((flags & RESCHED_UPREEMPT) != 0); if ((flags & RESCHED_REMOTE) != 0) { cpu_kick(ci); } else { aston(l); } } void cpu_signotify(struct lwp *l) { KASSERT(kpreempt_disabled()); if (l->l_cpu != curcpu()) { cpu_kick(l->l_cpu); } else { aston(l); } } void cpu_need_proftick(struct lwp *l) { KASSERT(kpreempt_disabled()); KASSERT(l->l_cpu == curcpu()); l->l_pflag |= LP_OWEUPC; aston(l); } bool cpu_intr_p(void) { int idepth; long pctr; lwp_t *l; l = curlwp; if (__predict_false(l->l_cpu == NULL)) { KASSERT(l == &lwp0); return false; } do { pctr = lwp_pctr(); idepth = l->l_cpu->ci_idepth; } while (__predict_false(pctr != lwp_pctr())); return idepth >= 0; } #ifdef __HAVE_PREEMPTION /* * Called to check MD conditions that would prevent preemption, and to * arrange for those conditions to be rechecked later. 
*/ bool cpu_kpreempt_enter(uintptr_t where, int s) { struct pcb *pcb; lwp_t *l; KASSERT(kpreempt_disabled()); l = curlwp; /* * If SPL raised, can't go. Note this implies that spin * mutexes at IPL_NONE are _not_ valid to use. */ if (s > IPL_PREEMPT) { softint_trigger(1 << SIR_PREEMPT); return false; } /* Must save cr2 or it could be clobbered. */ pcb = lwp_getpcb(l); pcb->pcb_cr2 = rcr2(); return true; } /* * Called after returning from a kernel preemption, and called with * preemption disabled. */ void cpu_kpreempt_exit(uintptr_t where) { extern char x86_copyfunc_start, x86_copyfunc_end; struct pcb *pcb; KASSERT(kpreempt_disabled()); /* * If we interrupted any of the copy functions we must reload * the pmap when resuming, as they cannot tolerate it being * swapped out. */ if (where >= (uintptr_t)&x86_copyfunc_start && where < (uintptr_t)&x86_copyfunc_end) { pmap_load(); } /* Restore cr2 only after the pmap, as pmap_load can block. */ pcb = lwp_getpcb(curlwp); lcr2(pcb->pcb_cr2); } /* * Return true if preemption is disabled for MD reasons. Must be called * with preemption disabled, and thus is only for diagnostic checks. */ bool cpu_kpreempt_disabled(void) { return curcpu()->ci_ilevel > IPL_NONE; } #endif /* __HAVE_PREEMPTION */ SYSCTL_SETUP(sysctl_machdep_cpu_idle, "sysctl machdep cpu_idle") { const struct sysctlnode *mnode, *node; sysctl_createv(NULL, 0, NULL, &mnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "machdep", NULL, NULL, 0, NULL, 0, CTL_MACHDEP, CTL_EOL); sysctl_createv(NULL, 0, &mnode, &node, CTLFLAG_PERMANENT, CTLTYPE_STRING, "idle-mechanism", SYSCTL_DESCR("Mechanism used for the idle loop."), NULL, 0, x86_cpu_idle_text, 0, CTL_CREATE, CTL_EOL); } void x86_cpu_idle_init(void) { #ifndef XENPV if ((cpu_feature[1] & CPUID2_MONITOR) == 0) x86_cpu_idle_set(x86_cpu_idle_halt, "halt", true); else x86_cpu_idle_set(x86_cpu_idle_mwait, "mwait", false); #else x86_cpu_idle_set(x86_cpu_idle_xen, "xen", true); #endif } void x86_cpu_idle_get(void (**func)(void), char *text, size_t len) { *func = x86_cpu_idle; (void)strlcpy(text, x86_cpu_idle_text, len); } void x86_cpu_idle_set(void (*func)(void), const char *text, bool ipi) { x86_cpu_idle = func; x86_cpu_idle_ipi = ipi; (void)strlcpy(x86_cpu_idle_text, text, sizeof(x86_cpu_idle_text)); } #ifndef XENPV #define KBTOB(x) ((size_t)(x) * 1024UL) #define MBTOB(x) ((size_t)(x) * 1024UL * 1024UL) static struct { int freelist; uint64_t limit; } x86_freelists[VM_NFREELIST] = { { VM_FREELIST_DEFAULT, 0 }, #ifdef VM_FREELIST_FIRST1T /* 40-bit addresses needed for modern graphics. */ { VM_FREELIST_FIRST1T, 1ULL * 1024 * 1024 * 1024 * 1024 }, #endif #ifdef VM_FREELIST_FIRST64G /* 36-bit addresses needed for oldish graphics. */ { VM_FREELIST_FIRST64G, 64ULL * 1024 * 1024 * 1024 }, #endif #ifdef VM_FREELIST_FIRST4G /* 32-bit addresses needed for PCI 32-bit DMA and old graphics. */ { VM_FREELIST_FIRST4G, 4ULL * 1024 * 1024 * 1024 }, #endif /* 30-bit addresses needed for ancient graphics. */ { VM_FREELIST_FIRST1G, 1ULL * 1024 * 1024 * 1024 }, /* 24-bit addresses needed for ISA DMA. 
*/ { VM_FREELIST_FIRST16, 16 * 1024 * 1024 }, }; int x86_select_freelist(uint64_t maxaddr) { unsigned int i; if (avail_end <= maxaddr) return VM_NFREELIST; for (i = 0; i < __arraycount(x86_freelists); i++) { if ((x86_freelists[i].limit - 1) <= maxaddr) return x86_freelists[i].freelist; } panic("no freelist for maximum address %"PRIx64, maxaddr); } static int x86_add_cluster(uint64_t seg_start, uint64_t seg_end, uint32_t type) { extern struct extent *iomem_ex; const uint64_t endext = MAXIOMEM + 1; uint64_t new_physmem = 0; phys_ram_seg_t *cluster; int i; if (seg_end > MAXPHYSMEM) { aprint_verbose("WARNING: skipping large memory map entry: " "0x%"PRIx64"/0x%"PRIx64"/0x%x\n", seg_start, (seg_end - seg_start), type); return 0; } /* * XXX: Chop the last page off the size so that it can fit in avail_end. */ if (seg_end == MAXPHYSMEM) seg_end -= PAGE_SIZE; if (seg_end <= seg_start) return 0; for (i = 0; i < mem_cluster_cnt; i++) { cluster = &mem_clusters[i]; if ((cluster->start == round_page(seg_start)) && (cluster->size == trunc_page(seg_end) - cluster->start)) { #ifdef DEBUG_MEMLOAD printf("WARNING: skipping duplicate segment entry\n"); #endif return 0; } } /* * This cluster is used by RAM. If it is included in the iomem extent, * allocate it from there, so that we won't unintentionally reuse it * later with extent_alloc_region. A way to avoid collision (with UVM * for example). * * This is done before the addresses are page rounded just to make * sure we get them all. */ if (seg_start < endext) { uint64_t io_end; if (seg_end > endext) io_end = endext; else io_end = seg_end; if (iomem_ex != NULL && extent_alloc_region(iomem_ex, seg_start, io_end - seg_start, EX_NOWAIT)) { /* XXX What should we do? */ printf("WARNING: CAN't ALLOCATE MEMORY SEGMENT " "(0x%"PRIx64"/0x%"PRIx64"/0x%x) FROM " "IOMEM EXTENT MAP!\n", seg_start, seg_end - seg_start, type); return 0; } } /* If it's not free memory, skip it. */ if (type != BIM_Memory) return 0; if (mem_cluster_cnt >= VM_PHYSSEG_MAX) { printf("WARNING: too many memory segments" "(increase VM_PHYSSEG_MAX)"); return -1; } #ifdef PHYSMEM_MAX_ADDR if (seg_start >= MBTOB(PHYSMEM_MAX_ADDR)) return 0; if (seg_end > MBTOB(PHYSMEM_MAX_ADDR)) seg_end = MBTOB(PHYSMEM_MAX_ADDR); #endif seg_start = round_page(seg_start); seg_end = trunc_page(seg_end); if (seg_start == seg_end) return 0; cluster = &mem_clusters[mem_cluster_cnt]; cluster->start = seg_start; if (iomem_ex != NULL) new_physmem = physmem + atop(seg_end - seg_start); #ifdef PHYSMEM_MAX_SIZE if (iomem_ex != NULL) { if (physmem >= atop(MBTOB(PHYSMEM_MAX_SIZE))) return 0; if (new_physmem > atop(MBTOB(PHYSMEM_MAX_SIZE))) { seg_end = seg_start + MBTOB(PHYSMEM_MAX_SIZE) - ptoa(physmem); new_physmem = atop(MBTOB(PHYSMEM_MAX_SIZE)); } } #endif cluster->size = seg_end - seg_start; if (iomem_ex != NULL) { if (avail_end < seg_end) avail_end = seg_end; physmem = new_physmem; } mem_cluster_cnt++; return 0; } static int x86_parse_clusters(struct btinfo_memmap *bim) { uint64_t seg_start, seg_end; uint64_t addr, size; uint32_t type; int x; KASSERT(bim != NULL); KASSERT(bim->num > 0); #ifdef DEBUG_MEMLOAD printf("MEMMAP: %s MEMORY MAP (%d ENTRIES):\n", lookup_bootinfo(BTINFO_EFIMEMMAP) != NULL ? "UEFI" : "BIOS", bim->num); #endif for (x = 0; x < bim->num; x++) { addr = bim->entry[x].addr; size = bim->entry[x].size; type = bim->entry[x].type; #ifdef DEBUG_MEMLOAD printf("MEMMAP: 0x%016" PRIx64 "-0x%016" PRIx64 "\n\tsize=0x%016" PRIx64 ", type=%d(%s)\n", addr, addr + size - 1, size, type, (type == BIM_Memory) ? 
"Memory" : (type == BIM_Reserved) ? "Reserved" : (type == BIM_ACPI) ? "ACPI" : (type == BIM_NVS) ? "NVS" : (type == BIM_PMEM) ? "Persistent" : (type == BIM_PRAM) ? "Persistent (Legacy)" : "unknown"); #endif /* If the segment is not memory, skip it. */ switch (type) { case BIM_Memory: case BIM_ACPI: case BIM_NVS: break; default: continue; } /* If the segment is smaller than a page, skip it. */ if (size < PAGE_SIZE) continue; seg_start = addr; seg_end = addr + size; /* * XXX XXX: Avoid the ISA I/O MEM. * * Some laptops (for example, Toshiba Satellite2550X) report * this area as valid. */ if (seg_start < IOM_END && seg_end > IOM_BEGIN) { printf("WARNING: memory map entry overlaps " "with ``Compatibility Holes'': " "0x%"PRIx64"/0x%"PRIx64"/0x%x\n", seg_start, seg_end - seg_start, type); if (x86_add_cluster(seg_start, IOM_BEGIN, type) == -1) break; if (x86_add_cluster(IOM_END, seg_end, type) == -1) break; } else { if (x86_add_cluster(seg_start, seg_end, type) == -1) break; } } return 0; } static int x86_fake_clusters(void) { extern struct extent *iomem_ex; phys_ram_seg_t *cluster; KASSERT(mem_cluster_cnt == 0); /* * Allocate the physical addresses used by RAM from the iomem extent * map. This is done before the addresses are page rounded just to make * sure we get them all. */ if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem), EX_NOWAIT)) { /* XXX What should we do? */ printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM " "IOMEM EXTENT MAP!\n"); } cluster = &mem_clusters[0]; cluster->start = 0; cluster->size = trunc_page(KBTOB(biosbasemem)); physmem += atop(cluster->size); if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem), EX_NOWAIT)) { /* XXX What should we do? */ printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM " "IOMEM EXTENT MAP!\n"); } #if NISADMA > 0 /* * Some motherboards/BIOSes remap the 384K of RAM that would * normally be covered by the ISA hole to the end of memory * so that it can be used. However, on a 16M system, this * would cause bounce buffers to be allocated and used. * This is not desirable behaviour, as more than 384K of * bounce buffers might be allocated. As a work-around, * we round memory down to the nearest 1M boundary if * we're using any isadma devices and the remapped memory * is what puts us over 16M. */ if (biosextmem > (15*1024) && biosextmem < (16*1024)) { char pbuf[9]; format_bytes(pbuf, sizeof(pbuf), biosextmem - (15*1024)); printf("Warning: ignoring %s of remapped memory\n", pbuf); biosextmem = (15*1024); } #endif cluster = &mem_clusters[1]; cluster->start = IOM_END; cluster->size = trunc_page(KBTOB(biosextmem)); physmem += atop(cluster->size); mem_cluster_cnt = 2; avail_end = IOM_END + trunc_page(KBTOB(biosextmem)); return 0; } /* * x86_load_region: load the physical memory region from seg_start to seg_end * into the VM system. 
*/ static void x86_load_region(uint64_t seg_start, uint64_t seg_end) { unsigned int i; uint64_t tmp; i = __arraycount(x86_freelists); while (i--) { if (x86_freelists[i].limit <= seg_start) continue; if (x86_freelists[i].freelist == VM_FREELIST_DEFAULT) continue; tmp = MIN(x86_freelists[i].limit, seg_end); if (tmp == seg_start) continue; #ifdef DEBUG_MEMLOAD printf("loading freelist %d 0x%"PRIx64"-0x%"PRIx64 " (0x%"PRIx64"-0x%"PRIx64")\n", x86_freelists[i].freelist, seg_start, tmp, (uint64_t)atop(seg_start), (uint64_t)atop(tmp)); #endif uvm_page_physload(atop(seg_start), atop(tmp), atop(seg_start), atop(tmp), x86_freelists[i].freelist); seg_start = tmp; } if (seg_start != seg_end) { #ifdef DEBUG_MEMLOAD printf("loading default 0x%"PRIx64"-0x%"PRIx64 " (0x%"PRIx64"-0x%"PRIx64")\n", seg_start, seg_end, (uint64_t)atop(seg_start), (uint64_t)atop(seg_end)); #endif uvm_page_physload(atop(seg_start), atop(seg_end), atop(seg_start), atop(seg_end), VM_FREELIST_DEFAULT); } } #ifdef XEN static void x86_add_xen_clusters(void) { if (hvm_start_info->memmap_entries > 0) { struct hvm_memmap_table_entry *map_entry; map_entry = (void *)((uintptr_t)hvm_start_info->memmap_paddr + KERNBASE); for (int i = 0; i < hvm_start_info->memmap_entries; i++) { if (map_entry[i].size < PAGE_SIZE) continue; switch (map_entry[i].type) { case XEN_HVM_MEMMAP_TYPE_RAM: x86_add_cluster(map_entry[i].addr, map_entry[i].addr + map_entry[i].size, BIM_Memory); break; case XEN_HVM_MEMMAP_TYPE_ACPI: x86_add_cluster(map_entry[i].addr, map_entry[i].addr + map_entry[i].size, BIM_ACPI); break; } } } else { struct xen_memory_map memmap; static struct _xen_mmap { struct btinfo_memmap bim; struct bi_memmap_entry map[128]; /* same as FreeBSD */ } __packed xen_mmap; int err; memmap.nr_entries = 128; set_xen_guest_handle(memmap.buffer, &xen_mmap.bim.entry[0]); if ((err = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap)) < 0) panic("XENMEM_memory_map %d", err); xen_mmap.bim.num = memmap.nr_entries; x86_parse_clusters(&xen_mmap.bim); } } #endif /* XEN */ /* * init_x86_clusters: retrieve the memory clusters provided by the BIOS, and * initialize mem_clusters. */ void init_x86_clusters(void) { struct btinfo_memmap *bim; struct btinfo_efimemmap *biem; /* * Check to see if we have a memory map from the BIOS (passed to us by * the boot program). */ #ifdef XEN if (vm_guest == VM_GUEST_XENPVH) { x86_add_xen_clusters(); } #endif /* XEN */ #ifdef i386 extern int biosmem_implicit; biem = lookup_bootinfo(BTINFO_EFIMEMMAP); if (biem != NULL) bim = efi_get_e820memmap(); else bim = lookup_bootinfo(BTINFO_MEMMAP); if ((biosmem_implicit || (biosbasemem == 0 && biosextmem == 0)) && bim != NULL && bim->num > 0) x86_parse_clusters(bim); #else #if !defined(REALBASEMEM) && !defined(REALEXTMEM) biem = lookup_bootinfo(BTINFO_EFIMEMMAP); if (biem != NULL) bim = efi_get_e820memmap(); else bim = lookup_bootinfo(BTINFO_MEMMAP); if (bim != NULL && bim->num > 0) x86_parse_clusters(bim); #else (void)bim, (void)biem; #endif #endif if (mem_cluster_cnt == 0) { /* * If x86_parse_clusters didn't find any valid segment, create * fake clusters. */ x86_fake_clusters(); } } /* * init_x86_vm: initialize the VM system on x86. We basically internalize as * many physical pages as we can, starting at lowmem_rsvd, but we don't * internalize the kernel physical pages (from pa_kstart to pa_kend). 
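 * Segments that contain the kernel image are split around it or trimmed
 * at its edges; segments entirely below lowmem_rsvd or entirely inside
 * the kernel are discarded.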
*/ int init_x86_vm(paddr_t pa_kend) { extern struct bootspace bootspace; paddr_t pa_kstart = bootspace.head.pa; uint64_t seg_start, seg_end; uint64_t seg_start1, seg_end1; int x; unsigned i; for (i = 0; i < __arraycount(x86_freelists); i++) { if (avail_end < x86_freelists[i].limit) x86_freelists[i].freelist = VM_FREELIST_DEFAULT; } /* * Now, load the memory clusters (which have already been rounded and * truncated) into the VM system. * * NOTE: we assume that memory starts at 0. */ for (x = 0; x < mem_cluster_cnt; x++) { const phys_ram_seg_t *cluster = &mem_clusters[x]; seg_start = cluster->start; seg_end = cluster->start + cluster->size; seg_start1 = 0; seg_end1 = 0; #ifdef DEBUG_MEMLOAD printf("segment %" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end); #endif /* Skip memory before our available starting point. */ if (seg_end <= lowmem_rsvd) { #ifdef DEBUG_MEMLOAD printf("discard segment below starting point " "%" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end); #endif continue; } if (seg_start <= lowmem_rsvd && lowmem_rsvd < seg_end) { seg_start = lowmem_rsvd; if (seg_start == seg_end) { #ifdef DEBUG_MEMLOAD printf("discard segment below starting point " "%" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end); #endif continue; } } /* * If this segment contains the kernel, split it in two, around * the kernel. * [seg_start seg_end] * [pa_kstart pa_kend] */ if (seg_start <= pa_kstart && pa_kend <= seg_end) { #ifdef DEBUG_MEMLOAD printf("split kernel overlapping to " "%" PRIx64 " - %" PRIxPADDR " and " "%" PRIxPADDR " - %" PRIx64 "\n", seg_start, pa_kstart, pa_kend, seg_end); #endif seg_start1 = pa_kend; seg_end1 = seg_end; seg_end = pa_kstart; KASSERT(seg_end < seg_end1); } /* * Discard a segment inside the kernel * [pa_kstart pa_kend] * [seg_start seg_end] */ if (pa_kstart < seg_start && seg_end < pa_kend) { #ifdef DEBUG_MEMLOAD printf("discard complete kernel overlap " "%" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end); #endif continue; } /* * Discard leading hunk that overlaps the kernel * [pa_kstart pa_kend] * [seg_start seg_end] */ if (pa_kstart < seg_start && seg_start < pa_kend && pa_kend < seg_end) { #ifdef DEBUG_MEMLOAD printf("discard leading kernel overlap " "%" PRIx64 " - %" PRIxPADDR "\n", seg_start, pa_kend); #endif seg_start = pa_kend; } /* * Discard trailing hunk that overlaps the kernel * [pa_kstart pa_kend] * [seg_start seg_end] */ if (seg_start < pa_kstart && pa_kstart < seg_end && seg_end < pa_kend) { #ifdef DEBUG_MEMLOAD printf("discard trailing kernel overlap " "%" PRIxPADDR " - %" PRIx64 "\n", pa_kstart, seg_end); #endif seg_end = pa_kstart; } /* First hunk */ if (seg_start != seg_end) { x86_load_region(seg_start, seg_end); } /* Second hunk */ if (seg_start1 != seg_end1) { x86_load_region(seg_start1, seg_end1); } } return 0; } #endif /* !XENPV */ void init_x86_msgbuf(void) { /* Message buffer is located at end of core. */ psize_t sz = round_page(MSGBUFSIZE); psize_t reqsz = sz; uvm_physseg_t x; search_again: for (x = uvm_physseg_get_first(); uvm_physseg_valid_p(x); x = uvm_physseg_get_next(x)) { if (ctob(uvm_physseg_get_avail_end(x)) == avail_end) break; } if (uvm_physseg_valid_p(x) == false) panic("init_x86_msgbuf: can't find end of memory"); /* Shrink so it'll fit in the last segment. 
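 * If the tail segment cannot hold the whole buffer, take what fits and
 * retry the search with the remaining size, up to VM_PHYSSEG_MAX pieces.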
*/ if (uvm_physseg_get_avail_end(x) - uvm_physseg_get_avail_start(x) < atop(sz)) sz = ctob(uvm_physseg_get_avail_end(x) - uvm_physseg_get_avail_start(x)); msgbuf_p_seg[msgbuf_p_cnt].sz = sz; msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(uvm_physseg_get_avail_end(x)) - sz; uvm_physseg_unplug(uvm_physseg_get_end(x) - atop(sz), atop(sz)); /* Now find where the new avail_end is. */ avail_end = ctob(uvm_physseg_get_highest_frame()); if (sz == reqsz) return; reqsz -= sz; if (msgbuf_p_cnt == VM_PHYSSEG_MAX) { /* No more segments available, bail out. */ printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n", (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz)); return; } sz = reqsz; goto search_again; } void x86_reset(void) { uint8_t b; #if NACPICA > 0 /* * If ACPI is active, try to reset using the reset register * defined in the FADT. */ if (acpi_active) { if (acpi_reset() == 0) { delay(500000); /* wait 0.5 sec to see if that did it */ } } #endif /* * The keyboard controller has 4 random output pins, one of which is * connected to the RESET pin on the CPU in many PCs. We tell the * keyboard controller to pulse this line a couple of times. */ outb(IO_KBD + KBCMDP, KBC_PULSE0); delay(100000); outb(IO_KBD + KBCMDP, KBC_PULSE0); delay(100000); /* * Attempt to force a reset via the Reset Control register at * I/O port 0xcf9. Bit 2 forces a system reset when it * transitions from 0 to 1. Bit 1 selects the type of reset * to attempt: 0 selects a "soft" reset, and 1 selects a * "hard" reset. We try a "hard" reset. The first write sets * bit 1 to select a "hard" reset and clears bit 2. The * second write forces a 0 -> 1 transition in bit 2 to trigger * a reset. */ outb(0xcf9, 0x2); outb(0xcf9, 0x6); DELAY(500000); /* wait 0.5 sec to see if that did it */ /* * Attempt to force a reset via the Fast A20 and Init register * at I/O port 0x92. Bit 1 serves as an alternate A20 gate. * Bit 0 asserts INIT# when set to 1. We are careful to only * preserve bit 1 while setting bit 0. We also must clear bit * 0 before setting it if it isn't already clear. */ b = inb(0x92); if (b != 0xff) { if ((b & 0x1) != 0) outb(0x92, b & 0xfe); outb(0x92, b | 0x1); DELAY(500000); /* wait 0.5 sec to see if that did it */ } } static int x86_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; result = KAUTH_RESULT_DEFER; switch (action) { case KAUTH_MACHDEP_IOPERM_GET: result = KAUTH_RESULT_ALLOW; break; case KAUTH_MACHDEP_LDT_GET: case KAUTH_MACHDEP_LDT_SET: if (x86_user_ldt_enabled) { result = KAUTH_RESULT_ALLOW; } break; default: break; } return result; } void machdep_init(void) { x86_listener = kauth_listen_scope(KAUTH_SCOPE_MACHDEP, x86_listener_cb, NULL); } /* * x86_startup: x86 common startup routine * * called by cpu_startup. */ void x86_startup(void) { #if !defined(XENPV) nmi_init(); #endif } const char * get_booted_kernel(void) { const struct btinfo_bootpath *bibp = lookup_bootinfo(BTINFO_BOOTPATH); return bibp ? bibp->bootpath : NULL; } /* * machine dependent system variables. */ static int sysctl_machdep_booted_kernel(SYSCTLFN_ARGS) { struct btinfo_bootpath *bibp; struct sysctlnode node; bibp = lookup_bootinfo(BTINFO_BOOTPATH); if (!bibp) return ENOENT; /* ??? 
*/ node = *rnode; node.sysctl_data = bibp->bootpath; node.sysctl_size = sizeof(bibp->bootpath); return sysctl_lookup(SYSCTLFN_CALL(&node)); } static int sysctl_machdep_bootmethod(SYSCTLFN_ARGS) { struct sysctlnode node; char buf[5]; node = *rnode; node.sysctl_data = buf; if (bootmethod_efi) memcpy(node.sysctl_data, "UEFI", 5); else memcpy(node.sysctl_data, "BIOS", 5); return sysctl_lookup(SYSCTLFN_CALL(&node)); } static int sysctl_machdep_diskinfo(SYSCTLFN_ARGS) { struct sysctlnode node; extern struct bi_devmatch *x86_alldisks; extern int x86_ndisks; if (x86_alldisks == NULL) return EOPNOTSUPP; node = *rnode; node.sysctl_data = x86_alldisks; node.sysctl_size = sizeof(struct disklist) + (x86_ndisks - 1) * sizeof(struct nativedisk_info); return sysctl_lookup(SYSCTLFN_CALL(&node)); } #ifndef XENPV static int sysctl_machdep_tsc_enable(SYSCTLFN_ARGS) { struct sysctlnode node; int error, val; val = *(int *)rnode->sysctl_data; node = *rnode; node.sysctl_data = &val; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error != 0 || newp == NULL) return error; if (val == 1) { tsc_user_enable(); } else if (val == 0) { tsc_user_disable(); } else { error = EINVAL; } if (error) return error; *(int *)rnode->sysctl_data = val; return 0; } #endif static const char * const vm_guest_name[VM_LAST] = { [VM_GUEST_NO] = "none", [VM_GUEST_VM] = "generic", [VM_GUEST_XENPV] = "XenPV", [VM_GUEST_XENPVH] = "XenPVH", [VM_GUEST_XENHVM] = "XenHVM", [VM_GUEST_XENPVHVM] = "XenPVHVM", [VM_GUEST_HV] = "Hyper-V", [VM_GUEST_VMWARE] = "VMware", [VM_GUEST_KVM] = "KVM", [VM_GUEST_VIRTUALBOX] = "VirtualBox", }; static int sysctl_machdep_hypervisor(SYSCTLFN_ARGS) { struct sysctlnode node; const char *t = NULL; char buf[64]; node = *rnode; node.sysctl_data = buf; if (vm_guest >= VM_GUEST_NO && vm_guest < VM_LAST) t = vm_guest_name[vm_guest]; if (t == NULL) t = "unknown"; strlcpy(buf, t, sizeof(buf)); return sysctl_lookup(SYSCTLFN_CALL(&node)); } static void const_sysctl(struct sysctllog **clog, const char *name, int type, u_quad_t value, int tag) { (sysctl_createv)(clog, 0, NULL, NULL, CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE, type, name, NULL, NULL, value, NULL, 0, CTL_MACHDEP, tag, CTL_EOL); } SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup") { extern uint64_t tsc_freq; #ifndef XENPV extern int tsc_user_enabled; #endif extern int sparse_dump; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "machdep", NULL, NULL, 0, NULL, 0, CTL_MACHDEP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "console_device", NULL, sysctl_consdev, 0, NULL, sizeof(dev_t), CTL_MACHDEP, CPU_CONSDEV, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "booted_kernel", NULL, sysctl_machdep_booted_kernel, 0, NULL, 0, CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "bootmethod", NULL, sysctl_machdep_bootmethod, 0, NULL, 0, CTL_MACHDEP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "diskinfo", NULL, sysctl_machdep_diskinfo, 0, NULL, 0, CTL_MACHDEP, CPU_DISKINFO, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "cpu_brand", NULL, NULL, 0, cpu_brand_string, 0, CTL_MACHDEP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "sparse_dump", NULL, NULL, 0, &sparse_dump, 0, CTL_MACHDEP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_QUAD, 
"tsc_freq", NULL, NULL, 0, &tsc_freq, 0, CTL_MACHDEP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "pae", SYSCTL_DESCR("Whether the kernel uses PAE"), NULL, 0, &use_pae, 0, CTL_MACHDEP, CTL_CREATE, CTL_EOL); #ifndef XENPV sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_READWRITE, CTLTYPE_INT, "tsc_user_enable", SYSCTL_DESCR("RDTSC instruction enabled in usermode"), sysctl_machdep_tsc_enable, 0, &tsc_user_enabled, 0, CTL_MACHDEP, CTL_CREATE, CTL_EOL); #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "hypervisor", NULL, sysctl_machdep_hypervisor, 0, NULL, 0, CTL_MACHDEP, CTL_CREATE, CTL_EOL); #ifdef SVS const struct sysctlnode *svs_rnode = NULL; sysctl_createv(clog, 0, NULL, &svs_rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "svs", NULL, NULL, 0, NULL, 0, CTL_MACHDEP, CTL_CREATE); sysctl_createv(clog, 0, &svs_rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_BOOL, "enabled", SYSCTL_DESCR("Whether the kernel uses SVS"), NULL, 0, &svs_enabled, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &svs_rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_BOOL, "pcid", SYSCTL_DESCR("Whether SVS uses PCID"), NULL, 0, &svs_pcid, 0, CTL_CREATE, CTL_EOL); #endif sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_READWRITE, CTLTYPE_BOOL, "user_ldt", SYSCTL_DESCR("Whether USER_LDT is enabled"), NULL, 0, &x86_user_ldt_enabled, 0, CTL_MACHDEP, CTL_CREATE, CTL_EOL); #ifndef XENPV void sysctl_speculation_init(struct sysctllog **); sysctl_speculation_init(clog); #endif /* None of these can ever change once the system has booted */ const_sysctl(clog, "fpu_present", CTLTYPE_INT, i386_fpu_present, CPU_FPU_PRESENT); const_sysctl(clog, "osfxsr", CTLTYPE_INT, i386_use_fxsave, CPU_OSFXSR); const_sysctl(clog, "sse", CTLTYPE_INT, i386_has_sse, CPU_SSE); const_sysctl(clog, "sse2", CTLTYPE_INT, i386_has_sse2, CPU_SSE2); const_sysctl(clog, "fpu_save", CTLTYPE_INT, x86_fpu_save, CPU_FPU_SAVE); const_sysctl(clog, "fpu_save_size", CTLTYPE_INT, x86_fpu_save_size, CPU_FPU_SAVE_SIZE); const_sysctl(clog, "xsave_features", CTLTYPE_QUAD, x86_xsave_features, CPU_XSAVE_FEATURES); #ifndef XENPV const_sysctl(clog, "biosbasemem", CTLTYPE_INT, biosbasemem, CPU_BIOSBASEMEM); const_sysctl(clog, "biosextmem", CTLTYPE_INT, biosextmem, CPU_BIOSEXTMEM); #endif } /* Here for want of a better place */ #if defined(DOM0OPS) || !defined(XENPV) struct pic * intr_findpic(int num) { #if NIOAPIC > 0 struct ioapic_softc *pic; pic = ioapic_find_bybase(num); if (pic != NULL) return &pic->sc_pic; #endif if (num < NUM_LEGACY_IRQS) return &i8259_pic; return NULL; } #endif void cpu_initclocks(void) { /* * Re-calibrate TSC on boot CPU using most accurate time source, * thus making accurate TSC available for x86_initclock_func(). */ cpu_get_tsc_freq(curcpu()); /* Now start the clocks on this CPU (the boot CPU). */ (*x86_initclock_func)(); } int x86_cpu_is_lcall(const void *ip) { static const uint8_t lcall[] = { 0x9a, 0, 0, 0, 0 }; int error; const size_t sz = sizeof(lcall) + 2; uint8_t tmp[sizeof(lcall) + 2]; if ((error = copyin(ip, tmp, sz)) != 0) return error; if (memcmp(tmp, lcall, sizeof(lcall)) != 0 || tmp[sz - 1] != 0) return EINVAL; switch (tmp[sz - 2]) { case (uint8_t)0x07: /* NetBSD */ case (uint8_t)0x87: /* BSD/OS */ return 0; default: return EINVAL; } }
/* $NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $ */ /*- * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc. * All rights reserved. 
* * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 */ /* * The vnode cache subsystem. * * Life-cycle * * Normally, there are two points where new vnodes are created: * VOP_CREATE(9) and VOP_LOOKUP(9). 
The life-cycle of a vnode * starts in one of the following ways: * * - Allocation, via vcache_get(9) or vcache_new(9). * - Reclamation of inactive vnode, via vcache_vget(9). * * Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9) * was another, traditional way. Currently, only the draining thread * recycles the vnodes. This behaviour might be revisited. * * The life-cycle ends when the last reference is dropped, usually * in VOP_REMOVE(9). In such case, VOP_INACTIVE(9) is called to inform * the file system that vnode is inactive. Via this call, file system * indicates whether vnode can be recycled (usually, it checks its own * references, e.g. count of links, whether the file was removed). * * Depending on indication, vnode can be put into a free list (cache), * or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to * disassociate underlying file system from the vnode, and finally * destroyed. * * Vnode state * * Vnode is always in one of six states: * - MARKER This is a marker vnode to help list traversal. It * will never change its state. * - LOADING Vnode is associating underlying file system and not * yet ready to use. * - LOADED Vnode has associated underlying file system and is * ready to use. * - BLOCKED Vnode is active but cannot get new references. * - RECLAIMING Vnode is disassociating from the underlying file * system. * - RECLAIMED Vnode has disassociated from underlying file system * and is dead. * * Valid state changes are: * LOADING -> LOADED * Vnode has been initialised in vcache_get() or * vcache_new() and is ready to use. * BLOCKED -> RECLAIMING * Vnode starts disassociation from underlying file * system in vcache_reclaim(). * RECLAIMING -> RECLAIMED * Vnode finished disassociation from underlying file * system in vcache_reclaim(). * LOADED -> BLOCKED * Either vcache_rekey*() is changing the vnode key or * vrelel() is about to call VOP_INACTIVE(). * BLOCKED -> LOADED * The block condition is over. * LOADING -> RECLAIMED * Either vcache_get() or vcache_new() failed to * associate the underlying file system or vcache_rekey*() * drops a vnode used as placeholder. * * Of these states LOADING, BLOCKED and RECLAIMING are intermediate * and it is possible to wait for state change. * * State is protected with v_interlock with one exception: * to change from LOADING both v_interlock and vcache_lock must be held * so it is possible to check "state == LOADING" without holding * v_interlock. See vcache_get() for details. * * Reference counting * * Vnode is considered active, if reference count (vnode_t::v_usecount) * is non-zero. It is maintained using: vref(9) and vrele(9), as well * as vput(9), routines. Common points holding references are e.g. * file openings, current working directory, mount points, etc. * * v_usecount is adjusted with atomic operations, however to change * from a non-zero value to zero the interlock must also be held. 
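 * A typical pattern is thus vref(vp) to take an extra reference while
 * the vnode is in use, and vrele(vp) (or vput(vp) if the vnode lock is
 * also held) to drop it again when done.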
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $"); #ifdef _KERNEL_OPT #include "opt_pax.h" #endif #include <sys/param.h> #include <sys/kernel.h> #include <sys/atomic.h> #include <sys/buf.h> #include <sys/conf.h> #include <sys/device.h> #include <sys/hash.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/module.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/pax.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/threadpool.h> #include <sys/vnode_impl.h> #include <sys/wapbl.h> #include <sys/fstrans.h> #include <miscfs/deadfs/deadfs.h> #include <miscfs/specfs/specdev.h> #include <uvm/uvm.h> #include <uvm/uvm_readahead.h> #include <uvm/uvm_stat.h> /* Flags to vrelel. */ #define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */ #define LRU_VRELE 0 #define LRU_FREE 1 #define LRU_HOLD 2 #define LRU_COUNT 3 /* * There are three lru lists: one holds vnodes waiting for async release, * one is for vnodes which have no buffer/page references and one for those * which do (i.e. v_holdcnt is non-zero). We put the lists into a single, * private cache line as vnodes migrate between them while under the same * lock (vdrain_lock). */ typedef struct { vnode_impl_t *li_marker; } lru_iter_t; u_int numvnodes __cacheline_aligned; static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned; static struct threadpool *threadpool; static struct threadpool_job vdrain_job; static struct threadpool_job vrele_job; static kmutex_t vdrain_lock __cacheline_aligned; SLIST_HEAD(hashhead, vnode_impl); static kmutex_t vcache_lock __cacheline_aligned; static kcondvar_t vcache_cv; static u_int vcache_hashsize; static u_long vcache_hashmask; static struct hashhead *vcache_hashtab; static pool_cache_t vcache_pool; static void lru_requeue(vnode_t *, vnodelst_t *); static vnodelst_t * lru_which(vnode_t *); static vnode_impl_t * lru_iter_first(int, lru_iter_t *); static vnode_impl_t * lru_iter_next(lru_iter_t *); static void lru_iter_release(lru_iter_t *); static vnode_impl_t * vcache_alloc(void); static void vcache_dealloc(vnode_impl_t *); static void vcache_free(vnode_impl_t *); static void vcache_init(void); static void vcache_reinit(void); static void vcache_reclaim(vnode_t *); static void vrele_deferred(vnode_impl_t *); static void vrelel(vnode_t *, int, int); static void vnpanic(vnode_t *, const char *, ...) __printflike(2, 3); static bool vdrain_one(u_int); static void vdrain_task(struct threadpool_job *); static void vrele_task(struct threadpool_job *); /* Routines having to do with the management of the vnode table. */ /* * The high bit of v_usecount is a gate for vcache_tryvget(). It's set * only when the vnode state is LOADED. * The next bit of v_usecount is a flag for vrelel(). It's set * from vcache_vget() and vcache_tryvget() whenever the operation succeeds. */ #define VUSECOUNT_MASK 0x3fffffff #define VUSECOUNT_GATE 0x80000000 #define VUSECOUNT_VGET 0x40000000 /* * Return the current usecount of a vnode. */ inline int vrefcnt(struct vnode *vp) { return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK; } /* Vnode state operations and diagnostics. 
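 * With DIAGNOSTIC the VSTATE_* macros below expand to asserting variants
 * that validate every transition; without it they are plain accessors.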
*/ #if defined(DIAGNOSTIC) #define VSTATE_VALID(state) \ ((state) != VS_ACTIVE && (state) != VS_MARKER) #define VSTATE_GET(vp) \ vstate_assert_get((vp), __func__, __LINE__) #define VSTATE_CHANGE(vp, from, to) \ vstate_assert_change((vp), (from), (to), __func__, __LINE__) #define VSTATE_WAIT_STABLE(vp) \ vstate_assert_wait_stable((vp), __func__, __LINE__) void _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line, bool has_lock) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); int refcnt = vrefcnt(vp); if (!has_lock) { enum vnode_state vstate = atomic_load_relaxed(&vip->vi_state); if (state == VS_ACTIVE && refcnt > 0 && (vstate == VS_LOADED || vstate == VS_BLOCKED)) return; if (vstate == state) return; mutex_enter((vp)->v_interlock); } KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); if ((state == VS_ACTIVE && refcnt > 0 && (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) || vip->vi_state == state) { if (!has_lock) mutex_exit((vp)->v_interlock); return; } vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d", vstate_name(vip->vi_state), refcnt, vstate_name(state), func, line); } static enum vnode_state vstate_assert_get(vnode_t *vp, const char *func, int line) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); if (! VSTATE_VALID(vip->vi_state)) vnpanic(vp, "state is %s at %s:%d", vstate_name(vip->vi_state), func, line); return vip->vi_state; } static void vstate_assert_wait_stable(vnode_t *vp, const char *func, int line) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); if (! VSTATE_VALID(vip->vi_state)) vnpanic(vp, "state is %s at %s:%d", vstate_name(vip->vi_state), func, line); while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED) cv_wait(&vp->v_cv, vp->v_interlock); if (! VSTATE_VALID(vip->vi_state)) vnpanic(vp, "state is %s at %s:%d", vstate_name(vip->vi_state), func, line); } static void vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to, const char *func, int line) { bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE); vnode_impl_t *vip = VNODE_TO_VIMPL(vp); KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); if (from == VS_LOADING) KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line); if (! VSTATE_VALID(from)) vnpanic(vp, "from is %s at %s:%d", vstate_name(from), func, line); if (! VSTATE_VALID(to)) vnpanic(vp, "to is %s at %s:%d", vstate_name(to), func, line); if (vip->vi_state != from) vnpanic(vp, "from is %s, expected %s at %s:%d\n", vstate_name(vip->vi_state), vstate_name(from), func, line); if ((from == VS_LOADED) != gated) vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n", vstate_name(vip->vi_state), gated, func, line); /* Open/close the gate for vcache_tryvget(). 
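 * The gate is the VUSECOUNT_GATE bit of v_usecount; it is set only while
 * the vnode is in the LOADED state.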
*/ if (to == VS_LOADED) { membar_release(); atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE); } else { atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE); } atomic_store_relaxed(&vip->vi_state, to); if (from == VS_LOADING) cv_broadcast(&vcache_cv); if (to == VS_LOADED || to == VS_RECLAIMED) cv_broadcast(&vp->v_cv); } #else /* defined(DIAGNOSTIC) */ #define VSTATE_GET(vp) \ (VNODE_TO_VIMPL((vp))->vi_state) #define VSTATE_CHANGE(vp, from, to) \ vstate_change((vp), (from), (to)) #define VSTATE_WAIT_STABLE(vp) \ vstate_wait_stable((vp)) void _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line, bool has_lock) { } static void vstate_wait_stable(vnode_t *vp) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED) cv_wait(&vp->v_cv, vp->v_interlock); } static void vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); /* Open/close the gate for vcache_tryvget(). */ if (to == VS_LOADED) { membar_release(); atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE); } else { atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE); } atomic_store_relaxed(&vip->vi_state, to); if (from == VS_LOADING) cv_broadcast(&vcache_cv); if (to == VS_LOADED || to == VS_RECLAIMED) cv_broadcast(&vp->v_cv); } #endif /* defined(DIAGNOSTIC) */ void vfs_vnode_sysinit(void) { int error __diagused, i; dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL); KASSERT(dead_rootmount != NULL); dead_rootmount->mnt_iflag |= IMNT_MPSAFE; mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE); for (i = 0; i < LRU_COUNT; i++) { TAILQ_INIT(&lru_list[i]); } vcache_init(); error = threadpool_get(&threadpool, PRI_NONE); KASSERTMSG((error == 0), "threadpool_get failed: %d", error); threadpool_job_init(&vdrain_job, vdrain_task, &vdrain_lock, "vdrain"); threadpool_job_init(&vrele_job, vrele_task, &vdrain_lock, "vrele"); } /* * Allocate a new marker vnode. */ vnode_t * vnalloc_marker(struct mount *mp) { vnode_impl_t *vip; vnode_t *vp; vip = pool_cache_get(vcache_pool, PR_WAITOK); memset(vip, 0, sizeof(*vip)); vp = VIMPL_TO_VNODE(vip); uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1); vp->v_mount = mp; vp->v_type = VBAD; vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); klist_init(&vip->vi_klist.vk_klist); vp->v_klist = &vip->vi_klist; vip->vi_state = VS_MARKER; return vp; } /* * Free a marker vnode. */ void vnfree_marker(vnode_t *vp) { vnode_impl_t *vip; vip = VNODE_TO_VIMPL(vp); KASSERT(vip->vi_state == VS_MARKER); mutex_obj_free(vp->v_interlock); uvm_obj_destroy(&vp->v_uobj, true); klist_fini(&vip->vi_klist.vk_klist); pool_cache_put(vcache_pool, vip); } /* * Test a vnode for being a marker vnode. */ bool vnis_marker(vnode_t *vp) { return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER); } /* * Return the lru list this node should be on. */ static vnodelst_t * lru_which(vnode_t *vp) { KASSERT(mutex_owned(vp->v_interlock)); if (vp->v_holdcnt > 0) return &lru_list[LRU_HOLD]; else return &lru_list[LRU_FREE]; } /* * Put vnode to end of given list. * Both the current and the new list may be NULL, used on vnode alloc/free. * Adjust numvnodes and signal vdrain thread if there is work. */ static void lru_requeue(vnode_t *vp, vnodelst_t *listhd) { vnode_impl_t *vip; int d; /* * If the vnode is on the correct list, and was put there recently, * then leave it be, thus avoiding huge cache and lock contention. 
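 * Here "recently" means within the last hz ticks, as measured with
 * getticks() below.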
*/ vip = VNODE_TO_VIMPL(vp); if (listhd == vip->vi_lrulisthd && (getticks() - vip->vi_lrulisttm) < hz) { return; } mutex_enter(&vdrain_lock); d = 0; if (vip->vi_lrulisthd != NULL) TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); else d++; vip->vi_lrulisthd = listhd; vip->vi_lrulisttm = getticks(); if (vip->vi_lrulisthd != NULL) TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); else d--; if (d != 0) { /* * Looks strange? This is not a bug. Don't store * numvnodes unless there is a change - avoid false * sharing on MP. */ numvnodes += d; } if (listhd == &lru_list[LRU_VRELE]) threadpool_schedule_job(threadpool, &vrele_job); if (d > 0 && numvnodes > desiredvnodes) threadpool_schedule_job(threadpool, &vdrain_job); if (d > 0 && numvnodes > desiredvnodes + desiredvnodes / 16) kpause("vnfull", false, MAX(1, mstohz(10)), &vdrain_lock); mutex_exit(&vdrain_lock); } /* * LRU list iterator. * Caller holds vdrain_lock. */ static vnode_impl_t * lru_iter_first(int idx, lru_iter_t *iterp) { vnode_impl_t *marker; KASSERT(mutex_owned(&vdrain_lock)); mutex_exit(&vdrain_lock); marker = VNODE_TO_VIMPL(vnalloc_marker(NULL)); mutex_enter(&vdrain_lock); marker->vi_lrulisthd = &lru_list[idx]; iterp->li_marker = marker; TAILQ_INSERT_HEAD(marker->vi_lrulisthd, marker, vi_lrulist); return lru_iter_next(iterp); } static vnode_impl_t * lru_iter_next(lru_iter_t *iter) { vnode_impl_t *vip, *marker; vnodelst_t *listhd; KASSERT(mutex_owned(&vdrain_lock)); marker = iter->li_marker; listhd = marker->vi_lrulisthd; while ((vip = TAILQ_NEXT(marker, vi_lrulist))) { TAILQ_REMOVE(listhd, marker, vi_lrulist); TAILQ_INSERT_AFTER(listhd, vip, marker, vi_lrulist); if (!vnis_marker(VIMPL_TO_VNODE(vip))) break; } return vip; } static void lru_iter_release(lru_iter_t *iter) { vnode_impl_t *marker; KASSERT(mutex_owned(&vdrain_lock)); marker = iter->li_marker; TAILQ_REMOVE(marker->vi_lrulisthd, marker, vi_lrulist); mutex_exit(&vdrain_lock); vnfree_marker(VIMPL_TO_VNODE(marker)); mutex_enter(&vdrain_lock); } /* * Release deferred vrele vnodes for this mount. * Called with file system suspended. */ void vrele_flush(struct mount *mp) { lru_iter_t iter; vnode_impl_t *vip; KASSERT(fstrans_is_owner(mp)); mutex_enter(&vdrain_lock); for (vip = lru_iter_first(LRU_VRELE, &iter); vip != NULL; vip = lru_iter_next(&iter)) { if (VIMPL_TO_VNODE(vip)->v_mount != mp) continue; vrele_deferred(vip); } lru_iter_release(&iter); mutex_exit(&vdrain_lock); } /* * One pass through the LRU lists to keep the number of allocated * vnodes below target. Returns true if target met. */ static bool vdrain_one(u_int target) { int ix, lists[] = { LRU_FREE, LRU_HOLD }; lru_iter_t iter; vnode_impl_t *vip; vnode_t *vp; struct mount *mp; KASSERT(mutex_owned(&vdrain_lock)); for (ix = 0; ix < __arraycount(lists); ix++) { for (vip = lru_iter_first(lists[ix], &iter); vip != NULL; vip = lru_iter_next(&iter)) { if (numvnodes < target) { lru_iter_release(&iter); return true; } vp = VIMPL_TO_VNODE(vip); /* Probe usecount (unlocked). */ if (vrefcnt(vp) > 0) continue; /* Try v_interlock -- we lock the wrong direction! */ if (!mutex_tryenter(vp->v_interlock)) continue; /* Probe usecount and state. 
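 * again, this time with v_interlock held, before committing to recycle
 * the vnode.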
*/ if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) { mutex_exit(vp->v_interlock); continue; } mutex_exit(&vdrain_lock); mp = vp->v_mount; if (fstrans_start_nowait(mp) != 0) { mutex_exit(vp->v_interlock); mutex_enter(&vdrain_lock); continue; } if (vcache_vget(vp) == 0) { if (!vrecycle(vp)) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); mutex_enter(vp->v_interlock); vrelel(vp, 0, LK_EXCLUSIVE); } } fstrans_done(mp); mutex_enter(&vdrain_lock); } lru_iter_release(&iter); } return false; } /* * threadpool task to keep the number of vnodes below desiredvnodes. */ static void vdrain_task(struct threadpool_job *job) { u_int target; target = desiredvnodes - desiredvnodes / 16; mutex_enter(&vdrain_lock); while (!vdrain_one(target)) kpause("vdrain", false, 1, &vdrain_lock); threadpool_job_done(job); mutex_exit(&vdrain_lock); } /* * threadpool task to process asynchronous vrele. */ static void vrele_task(struct threadpool_job *job) { int skipped; lru_iter_t iter; vnode_impl_t *vip; struct mount *mp; mutex_enter(&vdrain_lock); while ((vip = lru_iter_first(LRU_VRELE, &iter)) != NULL) { for (skipped = 0; vip != NULL; vip = lru_iter_next(&iter)) { mp = VIMPL_TO_VNODE(vip)->v_mount; if (fstrans_start_nowait(mp) == 0) { vrele_deferred(vip); fstrans_done(mp); } else { skipped++; } } lru_iter_release(&iter); if (skipped) kpause("vrele", false, MAX(1, mstohz(10)), &vdrain_lock); } threadpool_job_done(job); lru_iter_release(&iter); mutex_exit(&vdrain_lock); } /* * Try to drop reference on a vnode. Abort if we are releasing the * last reference. Note: this _must_ succeed if not the last reference. */ static bool vtryrele(vnode_t *vp) { u_int use, next; membar_release(); for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { if (__predict_false((use & VUSECOUNT_MASK) == 1)) { return false; } KASSERT((use & VUSECOUNT_MASK) > 1); next = atomic_cas_uint(&vp->v_usecount, use, use - 1); if (__predict_true(next == use)) { return true; } } } /* * vput: unlock and release the reference. */ void vput(vnode_t *vp) { int lktype; /* * Do an unlocked check of the usecount. If it looks like we're not * about to drop the last reference, then unlock the vnode and try * to drop the reference. If it ends up being the last reference * after all, vrelel() can fix it all up. Most of the time this * will all go to plan. */ if (vrefcnt(vp) > 1) { VOP_UNLOCK(vp); if (vtryrele(vp)) { return; } lktype = LK_NONE; } else { lktype = VOP_ISLOCKED(vp); KASSERT(lktype != LK_NONE); } mutex_enter(vp->v_interlock); vrelel(vp, 0, lktype); } /* * Release a vnode from the deferred list. */ static void vrele_deferred(vnode_impl_t *vip) { vnode_t *vp; KASSERT(mutex_owned(&vdrain_lock)); KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]); vp = VIMPL_TO_VNODE(vip); /* * First remove the vnode from the vrele list. * Put it on the last lru list, the last vrele() * will put it back onto the right list before * its usecount reaches zero. */ TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); vip->vi_lrulisthd = &lru_list[LRU_HOLD]; vip->vi_lrulisttm = getticks(); TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); mutex_exit(&vdrain_lock); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); mutex_enter(vp->v_interlock); vrelel(vp, 0, LK_EXCLUSIVE); mutex_enter(&vdrain_lock); } /* * Vnode release. If reference count drops to zero, call inactive * routine and either return to freelist or free to the pool. 
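 * The lktype argument tells how the caller holds the vnode lock
 * (LK_NONE, LK_SHARED or LK_EXCLUSIVE); passing VRELEL_ASYNC in flags
 * forces the release to be deferred to the vrele task.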
*/ static void vrelel(vnode_t *vp, int flags, int lktype) { const bool async = ((flags & VRELEL_ASYNC) != 0); bool recycle, defer, objlock_held; u_int use, next; int error; objlock_held = false; retry: KASSERT(mutex_owned(vp->v_interlock)); if (__predict_false(vp->v_op == dead_vnodeop_p && VSTATE_GET(vp) != VS_RECLAIMED)) { vnpanic(vp, "dead but not clean"); } /* * If not the last reference, just unlock and drop the reference count. * * Otherwise make sure we pass a point in time where we hold the * last reference with VGET flag unset. */ for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { if (__predict_false((use & VUSECOUNT_MASK) > 1)) { if (objlock_held) { objlock_held = false; rw_exit(vp->v_uobj.vmobjlock); } if (lktype != LK_NONE) { mutex_exit(vp->v_interlock); lktype = LK_NONE; VOP_UNLOCK(vp); mutex_enter(vp->v_interlock); } if (vtryrele(vp)) { mutex_exit(vp->v_interlock); return; } next = atomic_load_relaxed(&vp->v_usecount); continue; } KASSERT((use & VUSECOUNT_MASK) == 1); next = use & ~VUSECOUNT_VGET; if (next != use) { next = atomic_cas_uint(&vp->v_usecount, use, next); } if (__predict_true(next == use)) { break; } } membar_acquire(); if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) { vnpanic(vp, "%s: bad ref count", __func__); } #ifdef DIAGNOSTIC if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { vprint("vrelel: missing VOP_CLOSE()", vp); } #endif /* * If already clean there is no need to lock, defer or * deactivate this node. */ if (VSTATE_GET(vp) == VS_RECLAIMED) { if (objlock_held) { objlock_held = false; rw_exit(vp->v_uobj.vmobjlock); } if (lktype != LK_NONE) { mutex_exit(vp->v_interlock); lktype = LK_NONE; VOP_UNLOCK(vp); mutex_enter(vp->v_interlock); } goto out; } /* * First try to get the vnode locked for VOP_INACTIVE(). * Defer vnode release to vrele task if caller requests * it explicitly, is the pagedaemon or the lock failed. */ defer = false; if ((curlwp == uvm.pagedaemon_lwp) || async) { defer = true; } else if (lktype == LK_SHARED) { /* Excellent chance of getting, if the last ref. */ error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT); if (error != 0) { defer = true; } else { lktype = LK_EXCLUSIVE; } } else if (lktype == LK_NONE) { /* Excellent chance of getting, if the last ref. */ error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); if (error != 0) { defer = true; } else { lktype = LK_EXCLUSIVE; } } KASSERT(mutex_owned(vp->v_interlock)); if (defer) { /* * Defer reclaim to the vrele task; it's not safe to * clean it here. We donate it our last reference. */ if (lktype != LK_NONE) { mutex_exit(vp->v_interlock); VOP_UNLOCK(vp); mutex_enter(vp->v_interlock); } lru_requeue(vp, &lru_list[LRU_VRELE]); mutex_exit(vp->v_interlock); return; } KASSERT(lktype == LK_EXCLUSIVE); /* If the node gained another reference, retry. */ use = atomic_load_relaxed(&vp->v_usecount); if ((use & VUSECOUNT_VGET) != 0) { goto retry; } KASSERT((use & VUSECOUNT_MASK) == 1); if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 || (vp->v_vflag & VV_MAPPED) != 0) { /* Take care of space accounting. 
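 * This requires the uvm object lock; if it cannot be taken without
 * sleeping, drop v_interlock, take it, and retry from the top.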
*/ if (!objlock_held) { objlock_held = true; if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) { mutex_exit(vp->v_interlock); rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); mutex_enter(vp->v_interlock); goto retry; } } if ((vp->v_iflag & VI_EXECMAP) != 0) { cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages); } vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); vp->v_vflag &= ~VV_MAPPED; } if (objlock_held) { objlock_held = false; rw_exit(vp->v_uobj.vmobjlock); } /* * Deactivate the vnode, but preserve our reference across * the call to VOP_INACTIVE(). * * If VOP_INACTIVE() indicates that the file has been * deleted, then recycle the vnode. * * Note that VOP_INACTIVE() will not drop the vnode lock. */ mutex_exit(vp->v_interlock); recycle = false; VOP_INACTIVE(vp, &recycle); if (!recycle) { lktype = LK_NONE; VOP_UNLOCK(vp); } mutex_enter(vp->v_interlock); /* * Block new references then check again to see if a * new reference was acquired in the meantime. If * it was, restore the vnode state and try again. */ if (recycle) { VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); use = atomic_load_relaxed(&vp->v_usecount); if ((use & VUSECOUNT_VGET) != 0) { VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); goto retry; } KASSERT((use & VUSECOUNT_MASK) == 1); } /* * Recycle the vnode if the file is now unused (unlinked). */ if (recycle) { VSTATE_ASSERT(vp, VS_BLOCKED); KASSERT(lktype == LK_EXCLUSIVE); /* vcache_reclaim drops the lock. */ lktype = LK_NONE; vcache_reclaim(vp); } KASSERT(vrefcnt(vp) > 0); KASSERT(lktype == LK_NONE); out: for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { if (__predict_false((use & VUSECOUNT_VGET) != 0 && (use & VUSECOUNT_MASK) == 1)) { /* Gained and released another reference, retry. */ goto retry; } next = atomic_cas_uint(&vp->v_usecount, use, use - 1); if (__predict_true(next == use)) { if (__predict_false((use & VUSECOUNT_MASK) != 1)) { /* Gained another reference. */ mutex_exit(vp->v_interlock); return; } break; } } membar_acquire(); if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) { /* * It's clean so destroy it. It isn't referenced * anywhere since it has been reclaimed. */ vcache_free(VNODE_TO_VIMPL(vp)); } else { /* * Otherwise, put it back onto the freelist. It * can't be destroyed while still associated with * a file system. */ lru_requeue(vp, lru_which(vp)); mutex_exit(vp->v_interlock); } } void vrele(vnode_t *vp) { if (vtryrele(vp)) { return; } mutex_enter(vp->v_interlock); vrelel(vp, 0, LK_NONE); } /* * Asynchronous vnode release, vnode is released in different context. */ void vrele_async(vnode_t *vp) { if (vtryrele(vp)) { return; } mutex_enter(vp->v_interlock); vrelel(vp, VRELEL_ASYNC, LK_NONE); } /* * Vnode reference, where a reference is already held by some other * object (for example, a file structure). * * NB: lockless code sequences may rely on this not blocking. */ void vref(vnode_t *vp) { KASSERT(vrefcnt(vp) > 0); atomic_inc_uint(&vp->v_usecount); } /* * Page or buffer structure gets a reference. * Called with v_interlock held. */ void vholdl(vnode_t *vp) { KASSERT(mutex_owned(vp->v_interlock)); if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0) lru_requeue(vp, lru_which(vp)); } /* * Page or buffer structure gets a reference. */ void vhold(vnode_t *vp) { mutex_enter(vp->v_interlock); vholdl(vp); mutex_exit(vp->v_interlock); } /* * Page or buffer structure frees a reference. * Called with v_interlock held. 
*/ void holdrelel(vnode_t *vp) { KASSERT(mutex_owned(vp->v_interlock)); if (vp->v_holdcnt <= 0) { vnpanic(vp, "%s: holdcnt vp %p", __func__, vp); } vp->v_holdcnt--; if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0) lru_requeue(vp, lru_which(vp)); } /* * Page or buffer structure frees a reference. */ void holdrele(vnode_t *vp) { mutex_enter(vp->v_interlock); holdrelel(vp); mutex_exit(vp->v_interlock); } /* * Recycle an unused vnode if caller holds the last reference. */ bool vrecycle(vnode_t *vp) { int error __diagused; mutex_enter(vp->v_interlock); /* If the vnode is already clean we're done. */ VSTATE_WAIT_STABLE(vp); if (VSTATE_GET(vp) != VS_LOADED) { VSTATE_ASSERT(vp, VS_RECLAIMED); vrelel(vp, 0, LK_NONE); return true; } /* Prevent further references until the vnode is locked. */ VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); /* Make sure we hold the last reference. */ if (vrefcnt(vp) != 1) { VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); mutex_exit(vp->v_interlock); return false; } mutex_exit(vp->v_interlock); /* * On a leaf file system this lock will always succeed as we hold * the last reference and prevent further references. * On layered file systems waiting for the lock would open a can of * deadlocks as the lower vnodes may have other active references. */ error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); mutex_enter(vp->v_interlock); if (error) { VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); mutex_exit(vp->v_interlock); return false; } KASSERT(vrefcnt(vp) == 1); vcache_reclaim(vp); vrelel(vp, 0, LK_NONE); return true; } /* * Helper for vrevoke() to propagate suspension from lastmp * to thismp. Both args may be NULL. * Returns the currently suspended file system or NULL. */ static struct mount * vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp) { int error; if (lastmp == thismp) return thismp; if (lastmp != NULL) vfs_resume(lastmp); if (thismp == NULL) return NULL; do { error = vfs_suspend(thismp, 0); } while (error == EINTR || error == ERESTART); if (error == 0) return thismp; KASSERT(error == EOPNOTSUPP || error == ENOENT); return NULL; } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. */ void vrevoke(vnode_t *vp) { struct mount *mp; vnode_t *vq; enum vtype type; dev_t dev; KASSERT(vrefcnt(vp) > 0); mp = vrevoke_suspend_next(NULL, vp->v_mount); mutex_enter(vp->v_interlock); VSTATE_WAIT_STABLE(vp); if (VSTATE_GET(vp) == VS_RECLAIMED) { mutex_exit(vp->v_interlock); } else if (vp->v_type != VBLK && vp->v_type != VCHR) { atomic_inc_uint(&vp->v_usecount); mutex_exit(vp->v_interlock); vgone(vp); } else { dev = vp->v_rdev; type = vp->v_type; mutex_exit(vp->v_interlock); while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq) == 0) { mp = vrevoke_suspend_next(mp, vq->v_mount); vgone(vq); } } vrevoke_suspend_next(mp, NULL); } /* * Eliminate all activity associated with a vnode in preparation for * reuse. Drops a reference from the vnode. 
*/ void vgone(vnode_t *vp) { int lktype; KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount)); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); lktype = LK_EXCLUSIVE; mutex_enter(vp->v_interlock); VSTATE_WAIT_STABLE(vp); if (VSTATE_GET(vp) == VS_LOADED) { VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); vcache_reclaim(vp); lktype = LK_NONE; } VSTATE_ASSERT(vp, VS_RECLAIMED); vrelel(vp, 0, lktype); } static inline uint32_t vcache_hash(const struct vcache_key *key) { uint32_t hash = HASH32_BUF_INIT; KASSERT(key->vk_key_len > 0); hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash); hash = hash32_buf(key->vk_key, key->vk_key_len, hash); return hash; } static int vcache_stats(struct hashstat_sysctl *hs, bool fill) { vnode_impl_t *vip; uint64_t chain; strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name)); strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc)); if (!fill) return 0; hs->hash_size = vcache_hashmask + 1; for (size_t i = 0; i < hs->hash_size; i++) { chain = 0; mutex_enter(&vcache_lock); SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) { chain++; } mutex_exit(&vcache_lock); if (chain > 0) { hs->hash_used++; hs->hash_items += chain; if (chain > hs->hash_maxchain) hs->hash_maxchain = chain; } preempt_point(); } return 0; } static void vcache_init(void) { vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit, 0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL); KASSERT(vcache_pool != NULL); mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&vcache_cv, "vcache"); vcache_hashsize = desiredvnodes; vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true, &vcache_hashmask); hashstat_register("vcache", vcache_stats); } static void vcache_reinit(void) { int i; uint32_t hash; u_long oldmask, newmask; struct hashhead *oldtab, *newtab; vnode_impl_t *vip; newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask); mutex_enter(&vcache_lock); oldtab = vcache_hashtab; oldmask = vcache_hashmask; vcache_hashsize = desiredvnodes; vcache_hashtab = newtab; vcache_hashmask = newmask; for (i = 0; i <= oldmask; i++) { while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) { SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash); hash = vcache_hash(&vip->vi_key); SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask], vip, vi_hash); } } mutex_exit(&vcache_lock); hashdone(oldtab, HASH_SLIST, oldmask); } static inline vnode_impl_t * vcache_hash_lookup(const struct vcache_key *key, uint32_t hash) { struct hashhead *hashp; vnode_impl_t *vip; KASSERT(mutex_owned(&vcache_lock)); hashp = &vcache_hashtab[hash & vcache_hashmask]; SLIST_FOREACH(vip, hashp, vi_hash) { if (key->vk_mount != vip->vi_key.vk_mount) continue; if (key->vk_key_len != vip->vi_key.vk_key_len) continue; if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len)) continue; return vip; } return NULL; } /* * Allocate a new, uninitialized vcache node. */ static vnode_impl_t * vcache_alloc(void) { vnode_impl_t *vip; vnode_t *vp; vip = pool_cache_get(vcache_pool, PR_WAITOK); vp = VIMPL_TO_VNODE(vip); memset(vip, 0, sizeof(*vip)); rw_init(&vip->vi_lock); vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1); klist_init(&vip->vi_klist.vk_klist); vp->v_klist = &vip->vi_klist; cv_init(&vp->v_cv, "vnode"); cache_vnode_init(vp); vp->v_usecount = 1; vp->v_type = VNON; vp->v_size = vp->v_writesize = VSIZENOTSET; vip->vi_state = VS_LOADING; lru_requeue(vp, &lru_list[LRU_FREE]); return vip; } /* * Deallocate a vcache node in state VS_LOADING. 
* * vcache_lock held on entry and released on return. */ static void vcache_dealloc(vnode_impl_t *vip) { vnode_t *vp; KASSERT(mutex_owned(&vcache_lock)); vp = VIMPL_TO_VNODE(vip); vfs_ref(dead_rootmount); vfs_insmntque(vp, dead_rootmount); mutex_enter(vp->v_interlock); vp->v_op = dead_vnodeop_p; VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED); mutex_exit(&vcache_lock); vrelel(vp, 0, LK_NONE); } /* * Free an unused, unreferenced vcache node. * v_interlock locked on entry. */ static void vcache_free(vnode_impl_t *vip) { vnode_t *vp; vp = VIMPL_TO_VNODE(vip); KASSERT(mutex_owned(vp->v_interlock)); KASSERT(vrefcnt(vp) == 0); KASSERT(vp->v_holdcnt == 0); KASSERT(vp->v_writecount == 0); lru_requeue(vp, NULL); mutex_exit(vp->v_interlock); vfs_insmntque(vp, NULL); if (vp->v_type == VBLK || vp->v_type == VCHR) spec_node_destroy(vp); mutex_obj_free(vp->v_interlock); rw_destroy(&vip->vi_lock); uvm_obj_destroy(&vp->v_uobj, true); KASSERT(vp->v_klist == &vip->vi_klist); klist_fini(&vip->vi_klist.vk_klist); cv_destroy(&vp->v_cv); cache_vnode_fini(vp); pool_cache_put(vcache_pool, vip); } /* * Try to get an initial reference on this cached vnode. * Returns zero on success or EBUSY if the vnode state is not LOADED. * * NB: lockless code sequences may rely on this not blocking. */ int vcache_tryvget(vnode_t *vp) { u_int use, next; for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { if (__predict_false((use & VUSECOUNT_GATE) == 0)) { return EBUSY; } next = atomic_cas_uint(&vp->v_usecount, use, (use + 1) | VUSECOUNT_VGET); if (__predict_true(next == use)) { membar_acquire(); return 0; } } } /* * Try to get an initial reference on this cached vnode. * Returns zero on success and ENOENT if the vnode has been reclaimed. * Will wait for the vnode state to be stable. * * v_interlock locked on entry and unlocked on exit. */ int vcache_vget(vnode_t *vp) { int error; KASSERT(mutex_owned(vp->v_interlock)); /* Increment hold count to prevent vnode from disappearing. */ vp->v_holdcnt++; VSTATE_WAIT_STABLE(vp); vp->v_holdcnt--; /* If this was the last reference to a reclaimed vnode free it now. */ if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) { if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0) vcache_free(VNODE_TO_VIMPL(vp)); else mutex_exit(vp->v_interlock); return ENOENT; } VSTATE_ASSERT(vp, VS_LOADED); error = vcache_tryvget(vp); KASSERT(error == 0); mutex_exit(vp->v_interlock); return 0; } /* * Get a vnode / fs node pair by key and return it referenced through vpp. */ int vcache_get(struct mount *mp, const void *key, size_t key_len, struct vnode **vpp) { int error; uint32_t hash; const void *new_key; struct vnode *vp; struct vcache_key vcache_key; vnode_impl_t *vip, *new_vip; new_key = NULL; *vpp = NULL; vcache_key.vk_mount = mp; vcache_key.vk_key = key; vcache_key.vk_key_len = key_len; hash = vcache_hash(&vcache_key); again: mutex_enter(&vcache_lock); vip = vcache_hash_lookup(&vcache_key, hash); /* If found, take a reference or retry. */ if (__predict_true(vip != NULL)) { /* * If the vnode is loading we cannot take the v_interlock * here as it might change during load (see uvm_obj_setlock()). * As changing state from VS_LOADING requires both vcache_lock * and v_interlock it is safe to test with vcache_lock held. * * Wait for vnodes changing state from VS_LOADING and retry. 
*/ if (__predict_false(vip->vi_state == VS_LOADING)) { cv_wait(&vcache_cv, &vcache_lock); mutex_exit(&vcache_lock); goto again; } vp = VIMPL_TO_VNODE(vip); mutex_enter(vp->v_interlock); mutex_exit(&vcache_lock); error = vcache_vget(vp); if (error == ENOENT) goto again; if (error == 0) *vpp = vp; KASSERT((error != 0) == (*vpp == NULL)); return error; } mutex_exit(&vcache_lock); /* Allocate and initialize a new vcache / vnode pair. */ error = vfs_busy(mp); if (error) return error; new_vip = vcache_alloc(); new_vip->vi_key = vcache_key; vp = VIMPL_TO_VNODE(new_vip); mutex_enter(&vcache_lock); vip = vcache_hash_lookup(&vcache_key, hash); if (vip == NULL) { SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], new_vip, vi_hash); vip = new_vip; } /* If another thread beat us inserting this node, retry. */ if (vip != new_vip) { vcache_dealloc(new_vip); vfs_unbusy(mp); goto again; } mutex_exit(&vcache_lock); /* Load the fs node. Exclusive as new_node is VS_LOADING. */ error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key); if (error) { mutex_enter(&vcache_lock); SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], new_vip, vnode_impl, vi_hash); vcache_dealloc(new_vip); vfs_unbusy(mp); KASSERT(*vpp == NULL); return error; } KASSERT(new_key != NULL); KASSERT(memcmp(key, new_key, key_len) == 0); KASSERT(vp->v_op != NULL); vfs_insmntque(vp, mp); if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) vp->v_vflag |= VV_MPSAFE; vfs_ref(mp); vfs_unbusy(mp); /* Finished loading, finalize node. */ mutex_enter(&vcache_lock); new_vip->vi_key.vk_key = new_key; mutex_enter(vp->v_interlock); VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED); mutex_exit(vp->v_interlock); mutex_exit(&vcache_lock); *vpp = vp; return 0; } /* * Create a new vnode / fs node pair and return it referenced through vpp. */ int vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap, kauth_cred_t cred, void *extra, struct vnode **vpp) { int error; uint32_t hash; struct vnode *vp, *ovp; vnode_impl_t *vip, *ovip; *vpp = NULL; /* Allocate and initialize a new vcache / vnode pair. */ error = vfs_busy(mp); if (error) return error; vip = vcache_alloc(); vip->vi_key.vk_mount = mp; vp = VIMPL_TO_VNODE(vip); /* Create and load the fs node. */ error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra, &vip->vi_key.vk_key_len, &vip->vi_key.vk_key); if (error) { mutex_enter(&vcache_lock); vcache_dealloc(vip); vfs_unbusy(mp); KASSERT(*vpp == NULL); return error; } KASSERT(vp->v_op != NULL); KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount)); if (vip->vi_key.vk_key_len > 0) { KASSERT(vip->vi_key.vk_key != NULL); hash = vcache_hash(&vip->vi_key); /* * Wait for previous instance to be reclaimed, * then insert new node. */ mutex_enter(&vcache_lock); while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) { ovp = VIMPL_TO_VNODE(ovip); mutex_enter(ovp->v_interlock); mutex_exit(&vcache_lock); error = vcache_vget(ovp); KASSERT(error == ENOENT); mutex_enter(&vcache_lock); } SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], vip, vi_hash); mutex_exit(&vcache_lock); } vfs_insmntque(vp, mp); if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) vp->v_vflag |= VV_MPSAFE; vfs_ref(mp); vfs_unbusy(mp); /* Finished loading, finalize node. */ mutex_enter(&vcache_lock); mutex_enter(vp->v_interlock); VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED); mutex_exit(&vcache_lock); mutex_exit(vp->v_interlock); *vpp = vp; return 0; } /* * Prepare key change: update old cache nodes key and lock new cache node. * Return an error if the new node already exists. 
*/ int vcache_rekey_enter(struct mount *mp, struct vnode *vp, const void *old_key, size_t old_key_len, const void *new_key, size_t new_key_len) { uint32_t old_hash, new_hash; struct vcache_key old_vcache_key, new_vcache_key; vnode_impl_t *vip, *new_vip; old_vcache_key.vk_mount = mp; old_vcache_key.vk_key = old_key; old_vcache_key.vk_key_len = old_key_len; old_hash = vcache_hash(&old_vcache_key); new_vcache_key.vk_mount = mp; new_vcache_key.vk_key = new_key; new_vcache_key.vk_key_len = new_key_len; new_hash = vcache_hash(&new_vcache_key); new_vip = vcache_alloc(); new_vip->vi_key = new_vcache_key; /* Insert locked new node used as placeholder. */ mutex_enter(&vcache_lock); vip = vcache_hash_lookup(&new_vcache_key, new_hash); if (vip != NULL) { vcache_dealloc(new_vip); return EEXIST; } SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], new_vip, vi_hash); /* Replace old nodes key with the temporary copy. */ vip = vcache_hash_lookup(&old_vcache_key, old_hash); KASSERT(vip != NULL); KASSERT(VIMPL_TO_VNODE(vip) == vp); KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key); vip->vi_key = old_vcache_key; mutex_exit(&vcache_lock); return 0; } /* * Key change complete: update old node and remove placeholder. */ void vcache_rekey_exit(struct mount *mp, struct vnode *vp, const void *old_key, size_t old_key_len, const void *new_key, size_t new_key_len) { uint32_t old_hash, new_hash; struct vcache_key old_vcache_key, new_vcache_key; vnode_impl_t *vip, *new_vip; struct vnode *new_vp; old_vcache_key.vk_mount = mp; old_vcache_key.vk_key = old_key; old_vcache_key.vk_key_len = old_key_len; old_hash = vcache_hash(&old_vcache_key); new_vcache_key.vk_mount = mp; new_vcache_key.vk_key = new_key; new_vcache_key.vk_key_len = new_key_len; new_hash = vcache_hash(&new_vcache_key); mutex_enter(&vcache_lock); /* Lookup old and new node. */ vip = vcache_hash_lookup(&old_vcache_key, old_hash); KASSERT(vip != NULL); KASSERT(VIMPL_TO_VNODE(vip) == vp); new_vip = vcache_hash_lookup(&new_vcache_key, new_hash); KASSERT(new_vip != NULL); KASSERT(new_vip->vi_key.vk_key_len == new_key_len); new_vp = VIMPL_TO_VNODE(new_vip); mutex_enter(new_vp->v_interlock); VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING); mutex_exit(new_vp->v_interlock); /* Rekey old node and put it onto its new hashlist. */ vip->vi_key = new_vcache_key; if (old_hash != new_hash) { SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask], vip, vnode_impl, vi_hash); SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], vip, vi_hash); } /* Remove new node used as placeholder. */ SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask], new_vip, vnode_impl, vi_hash); vcache_dealloc(new_vip); } /* * Disassociate the underlying file system from a vnode. * * Must be called with vnode locked and will return unlocked. * Must be called with the interlock held, and will return with it held. */ static void vcache_reclaim(vnode_t *vp) { lwp_t *l = curlwp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); struct mount *mp = vp->v_mount; uint32_t hash; uint8_t temp_buf[64], *temp_key; size_t temp_key_len; bool recycle; int error; KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(mutex_owned(vp->v_interlock)); KASSERT(vrefcnt(vp) != 0); temp_key_len = vip->vi_key.vk_key_len; /* * Prevent the vnode from being recycled or brought into use * while we clean it out. */ VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING); /* * Send NOTE_REVOKE now, before we call VOP_RECLAIM(), * because VOP_RECLAIM() could cause vp->v_klist to * become invalid. 
Don't check for interest in NOTE_REVOKE * here; it's always posted because it sets EV_EOF. * * Once it's been posted, reset vp->v_klist to point to * our own local storage, in case we were sharing with * someone else. */ KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE); vp->v_klist = &vip->vi_klist; mutex_exit(vp->v_interlock); rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); mutex_enter(vp->v_interlock); if ((vp->v_iflag & VI_EXECMAP) != 0) { cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages); } vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */ mutex_exit(vp->v_interlock); rw_exit(vp->v_uobj.vmobjlock); /* * With vnode state set to reclaiming, purge name cache immediately * to prevent new handles on vnode, and wait for existing threads * trying to get a handle to notice VS_RECLAIMED status and abort. */ cache_purge(vp); /* Replace the vnode key with a temporary copy. */ if (vip->vi_key.vk_key_len > sizeof(temp_buf)) { temp_key = kmem_alloc(temp_key_len, KM_SLEEP); } else { temp_key = temp_buf; } if (vip->vi_key.vk_key_len > 0) { mutex_enter(&vcache_lock); memcpy(temp_key, vip->vi_key.vk_key, temp_key_len); vip->vi_key.vk_key = temp_key; mutex_exit(&vcache_lock); } fstrans_start(mp); /* * Clean out any cached data associated with the vnode. */ error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); if (error != 0) { if (wapbl_vphaswapbl(vp)) WAPBL_DISCARD(wapbl_vptomp(vp)); error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); } KASSERTMSG((error == 0), "vinvalbuf failed: %d", error); KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); if (vp->v_type == VBLK || vp->v_type == VCHR) { spec_node_revoke(vp); } /* * Disassociate the underlying file system from the vnode. * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks * the vnode, and may destroy the vnode so that VOP_UNLOCK * would no longer function. */ VOP_INACTIVE(vp, &recycle); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); if (VOP_RECLAIM(vp)) { vnpanic(vp, "%s: cannot reclaim", __func__); } KASSERT(vp->v_data == NULL); KASSERT((vp->v_iflag & VI_PAGES) == 0); if (vp->v_type == VREG && vp->v_ractx != NULL) { uvm_ra_freectx(vp->v_ractx); vp->v_ractx = NULL; } if (vip->vi_key.vk_key_len > 0) { /* Remove from vnode cache. */ hash = vcache_hash(&vip->vi_key); mutex_enter(&vcache_lock); KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash)); SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], vip, vnode_impl, vi_hash); mutex_exit(&vcache_lock); } if (temp_key != temp_buf) kmem_free(temp_key, temp_key_len); /* Done with purge, notify sleepers of the grim news. */ mutex_enter(vp->v_interlock); vp->v_op = dead_vnodeop_p; VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED); vp->v_tag = VT_NON; mutex_exit(vp->v_interlock); /* * Move to dead mount. Must be after changing the operations * vector as vnode operations enter the mount before using the * operations vector. See sys/kern/vnode_if.c. */ vp->v_vflag &= ~VV_ROOT; vfs_ref(dead_rootmount); vfs_insmntque(vp, dead_rootmount); #ifdef PAX_SEGVGUARD pax_segvguard_cleanup(vp); #endif /* PAX_SEGVGUARD */ mutex_enter(vp->v_interlock); fstrans_done(mp); KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); } /* * Disassociate the underlying file system from an open device vnode * and make it anonymous. * * Vnode unlocked on entry, drops a reference to the vnode. 
*/ void vcache_make_anon(vnode_t *vp) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); uint32_t hash; bool recycle; KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount)); VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE); /* Remove from vnode cache. */ hash = vcache_hash(&vip->vi_key); mutex_enter(&vcache_lock); KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash)); SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], vip, vnode_impl, vi_hash); vip->vi_key.vk_mount = dead_rootmount; vip->vi_key.vk_key_len = 0; vip->vi_key.vk_key = NULL; mutex_exit(&vcache_lock); /* * Disassociate the underlying file system from the vnode. * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks * the vnode, and may destroy the vnode so that VOP_UNLOCK * would no longer function. */ if (vn_lock(vp, LK_EXCLUSIVE)) { vnpanic(vp, "%s: cannot lock", __func__); } VOP_INACTIVE(vp, &recycle); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); if (VOP_RECLAIM(vp)) { vnpanic(vp, "%s: cannot reclaim", __func__); } /* Purge name cache. */ cache_purge(vp); /* Done with purge, change operations vector. */ mutex_enter(vp->v_interlock); vp->v_op = spec_vnodeop_p; vp->v_vflag |= VV_MPSAFE; mutex_exit(vp->v_interlock); /* * Move to dead mount. Must be after changing the operations * vector as vnode operations enter the mount before using the * operations vector. See sys/kern/vnode_if.c. */ vfs_ref(dead_rootmount); vfs_insmntque(vp, dead_rootmount); vrele(vp); } /* * Update outstanding I/O count and do wakeup if requested. */ void vwakeup(struct buf *bp) { vnode_t *vp; if ((vp = bp->b_vp) == NULL) return; KASSERT(bp->b_objlock == vp->v_interlock); KASSERT(mutex_owned(bp->b_objlock)); if (--vp->v_numoutput < 0) vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp); if (vp->v_numoutput == 0) cv_broadcast(&vp->v_cv); } /* * Test a vnode for being or becoming dead. Returns one of: * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only. * ENOENT: vnode is dead. * 0: otherwise. * * Whenever this function returns a non-zero value all future * calls will also return a non-zero value. */ int vdead_check(struct vnode *vp, int flags) { KASSERT(mutex_owned(vp->v_interlock)); if (! ISSET(flags, VDEAD_NOWAIT)) VSTATE_WAIT_STABLE(vp); if (VSTATE_GET(vp) == VS_RECLAIMING) { KASSERT(ISSET(flags, VDEAD_NOWAIT)); return EBUSY; } else if (VSTATE_GET(vp) == VS_RECLAIMED) { return ENOENT; } return 0; } int vfs_drainvnodes(void) { mutex_enter(&vdrain_lock); if (!vdrain_one(desiredvnodes)) { mutex_exit(&vdrain_lock); return EBUSY; } mutex_exit(&vdrain_lock); if (vcache_hashsize != desiredvnodes) vcache_reinit(); return 0; } void vnpanic(vnode_t *vp, const char *fmt, ...) { va_list ap; #ifdef DIAGNOSTIC vprint(NULL, vp); #endif va_start(ap, fmt); vpanic(fmt, ap); va_end(ap); } void vshareilock(vnode_t *tvp, vnode_t *fvp) { kmutex_t *oldlock; oldlock = tvp->v_interlock; mutex_obj_hold(fvp->v_interlock); tvp->v_interlock = fvp->v_interlock; mutex_obj_free(oldlock); } void vshareklist(vnode_t *tvp, vnode_t *fvp) { /* * If two vnodes share klist state, they must also share * an interlock. */ KASSERT(tvp->v_interlock == fvp->v_interlock); /* * We make the following assumptions: * * ==> Some other synchronization is happening outside of * our view to make this safe. * * ==> That the "to" vnode will have the necessary references * on the "from" vnode so that the storage for the klist * won't be yanked out from beneath us (the vnode_impl). 
	 *
	 * ==> If "from" is also sharing, we then assume that "from"
	 *     has the necessary references, and so on.
	 */

	tvp->v_klist = fvp->v_klist;
}
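/*
 * Illustrative sketch (not part of the kernel sources above): a minimal,
 * user-space analogue of the lock-free reference counting fast path used
 * by vtryrele() and vcache_tryvget() earlier in this file.  It is a sketch
 * under assumptions: the refcnt_* / ref_* names, the GATE bit layout and
 * the use of C11 <stdatomic.h> are invented for illustration and are not
 * the kernel's interfaces.  The idea it demonstrates is the same, though:
 * the fast paths use compare-and-swap and refuse to act when the object is
 * gated (being reclaimed) or when the last reference would be dropped, so
 * those cases fall back to a slow path taken under a lock.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define REF_GATE  0x80000000u	/* set while new references are allowed */
#define REF_MASK  0x7fffffffu	/* the reference count proper */

struct refobj {
	_Atomic unsigned refs;
};

/* Try to take a reference; fails once the gate has been closed. */
static bool
ref_tryget(struct refobj *o)
{
	unsigned use = atomic_load_explicit(&o->refs, memory_order_relaxed);

	for (;;) {
		if ((use & REF_GATE) == 0)
			return false;	/* object being reclaimed */
		if (atomic_compare_exchange_weak_explicit(&o->refs, &use,
		    use + 1, memory_order_acquire, memory_order_relaxed))
			return true;
		/* "use" was reloaded by the failed CAS; retry. */
	}
}

/*
 * Try to drop a reference; fails if this would drop the last one, in
 * which case the caller must take the slow path (lock, deactivate, free).
 */
static bool
ref_tryrele(struct refobj *o)
{
	unsigned use = atomic_load_explicit(&o->refs, memory_order_relaxed);

	for (;;) {
		if ((use & REF_MASK) == 1)
			return false;	/* last reference: slow path */
		if (atomic_compare_exchange_weak_explicit(&o->refs, &use,
		    use - 1, memory_order_release, memory_order_relaxed))
			return true;
	}
}

/* Usage example: one held reference, gate open. */
int
main(void)
{
	struct refobj o = { .refs = REF_GATE | 1 };

	printf("tryget:  %d\n", ref_tryget(&o));	/* 1: now two refs */
	printf("tryrele: %d\n", ref_tryrele(&o));	/* 1: back to one ref */
	printf("tryrele: %d\n", ref_tryrele(&o));	/* 0: last ref, slow path */
	return 0;
}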
/* $NetBSD: vhci.c,v 1.27 2022/03/12 15:30:51 riastradh Exp $ */ /* * Copyright (c) 2019-2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Maxime Villard. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vhci.c,v 1.27 2022/03/12 15:30:51 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_usb.h" #endif #include <sys/param.h> #include <sys/bus.h> #include <sys/cpu.h> #include <sys/conf.h> #include <sys/device.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/queue.h> #include <sys/systm.h> #include <sys/mman.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/kcov.h> #include <machine/endian.h> #include "ioconf.h" #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> #include <dev/usb/usbdivar.h> #include <dev/usb/usbroothub.h> #include <dev/usb/vhci.h> #ifdef VHCI_DEBUG #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__) #else #define DPRINTF(fmt, ...) __nothing #endif static usbd_status vhci_open(struct usbd_pipe *); static void vhci_softintr(void *); static struct usbd_xfer *vhci_allocx(struct usbd_bus *, unsigned int); static void vhci_freex(struct usbd_bus *, struct usbd_xfer *); static void vhci_get_lock(struct usbd_bus *, kmutex_t **); static int vhci_roothub_ctrl(struct usbd_bus *, usb_device_request_t *, void *, int); static const struct usbd_bus_methods vhci_bus_methods = { .ubm_open = vhci_open, .ubm_softint = vhci_softintr, .ubm_dopoll = NULL, .ubm_allocx = vhci_allocx, .ubm_freex = vhci_freex, .ubm_getlock = vhci_get_lock, .ubm_rhctrl = vhci_roothub_ctrl, }; static usbd_status vhci_device_ctrl_transfer(struct usbd_xfer *); static usbd_status vhci_device_ctrl_start(struct usbd_xfer *); static void vhci_device_ctrl_abort(struct usbd_xfer *); static void vhci_device_ctrl_close(struct usbd_pipe *); static void vhci_device_ctrl_cleartoggle(struct usbd_pipe *); static void vhci_device_ctrl_done(struct usbd_xfer *); static const struct usbd_pipe_methods vhci_device_ctrl_methods = { .upm_init = NULL, .upm_fini = NULL, .upm_transfer = vhci_device_ctrl_transfer, .upm_start = vhci_device_ctrl_start, .upm_abort = vhci_device_ctrl_abort, .upm_close = vhci_device_ctrl_close, .upm_cleartoggle = vhci_device_ctrl_cleartoggle, .upm_done = vhci_device_ctrl_done, }; static usbd_status vhci_root_intr_transfer(struct usbd_xfer *); static usbd_status vhci_root_intr_start(struct usbd_xfer *); static void vhci_root_intr_abort(struct usbd_xfer *); static void vhci_root_intr_close(struct usbd_pipe *); static void vhci_root_intr_cleartoggle(struct usbd_pipe *); static void vhci_root_intr_done(struct usbd_xfer *); static const struct usbd_pipe_methods vhci_root_intr_methods = { .upm_init = NULL, .upm_fini = NULL, .upm_transfer = vhci_root_intr_transfer, .upm_start = vhci_root_intr_start, .upm_abort = vhci_root_intr_abort, .upm_close = vhci_root_intr_close, .upm_cleartoggle = vhci_root_intr_cleartoggle, .upm_done = vhci_root_intr_done, }; /* * There are three structures to understand: vxfers, packets, and ports. * * Each xfer from the point of view of the USB stack is a vxfer from the point * of view of vHCI. * * A vxfer has a linked list containing a maximum of two packets: a request * packet and possibly a data packet. Packets basically contain data exchanged * between the Host and the virtual USB device. A packet is linked to both a * vxfer and a port. * * A port is an abstraction of an actual USB port. Each virtual USB device gets * connected to a port. A port has two lists: * - The Usb-To-Host list, containing packets to be fetched from the USB * device and provided to the host. * - The Host-To-Usb list, containing packets to be sent from the Host to the * USB device. 
* Request packets are always in the H->U direction. Data packets however can * be in both the H->U and U->H directions. * * With read() and write() operations on /dev/vhci, userland respectively * "fetches" and "sends" packets from or to the virtual USB device, which * respectively means reading/inserting packets in the H->U and U->H lists on * the port where the virtual USB device is connected. * * +------------------------------------------------+ * | USB Stack | * +---------------------^--------------------------+ * | * +---------------------V--------------------------+ * | +----------------+ +-------------+ | * | | Request Packet | | Data Packet | Xfer | * | +-------|--------+ +----|---^----+ | * +---------|------------------|---|---------------+ * | | | * | +--------------+ | * | | | * +---------|---|------------------|---------------+ * | +---V---V---+ +---------|-+ | * | | H->U List | | U->H List | vHCI Port | * | +-----|-----+ +-----^-----+ | * +-----------|----------------|-------------------+ * | | * +-----------|----------------|-------------------+ * | +-----V-----+ +-----|-----+ | * | | read() | | write() | vHCI FD | * | +-----------+ +-----------+ | * +------------------------------------------------+ */ struct vhci_xfer; typedef struct vhci_packet { /* General. */ TAILQ_ENTRY(vhci_packet) portlist; TAILQ_ENTRY(vhci_packet) xferlist; struct vhci_xfer *vxfer; bool utoh; uint8_t addr; /* Type. */ struct { bool req:1; bool res:1; bool dat:1; } type; /* Exposed for FD operations. */ uint8_t *buf; size_t size; size_t cursor; } vhci_packet_t; typedef TAILQ_HEAD(, vhci_packet) vhci_packet_list_t; #define VHCI_NADDRS 16 /* maximum supported by USB */ typedef struct { kmutex_t lock; int status; int change; struct { vhci_packet_list_t usb_to_host; vhci_packet_list_t host_to_usb; } endpoints[VHCI_NADDRS]; } vhci_port_t; typedef struct { struct usbd_pipe pipe; } vhci_pipe_t; typedef struct vhci_xfer { /* General. */ struct usbd_xfer xfer; /* Port where the xfer occurs. */ vhci_port_t *port; /* Packets in the xfer. */ size_t npkts; vhci_packet_list_t pkts; /* Header storage. */ vhci_request_t reqbuf; vhci_response_t resbuf; /* Used for G/C. */ TAILQ_ENTRY(vhci_xfer) freelist; } vhci_xfer_t; typedef TAILQ_HEAD(, vhci_xfer) vhci_xfer_list_t; #define VHCI_INDEX2PORT(idx) (idx) #define VHCI_NPORTS 8 /* above 8, update TODO-bitmap */ #define VHCI_NBUSES 8 typedef struct { device_t sc_dev; struct usbd_bus sc_bus; bool sc_dying; kmutex_t sc_lock; /* * Intr Root. Used to attach the devices. */ struct usbd_xfer *sc_intrxfer; /* * The ports. Zero is for the roothub, one and beyond for the USB * devices. */ size_t sc_nports; vhci_port_t sc_port[VHCI_NPORTS]; device_t sc_child; /* /dev/usb# device */ } vhci_softc_t; typedef struct { u_int port; uint8_t addr; vhci_softc_t *softc; } vhci_fd_t; extern struct cfdriver vhci_cd; /* -------------------------------------------------------------------------- */ static void vhci_pkt_ctrl_create(vhci_port_t *port, struct usbd_xfer *xfer, bool utoh, uint8_t addr) { vhci_xfer_t *vxfer = (vhci_xfer_t *)xfer; vhci_packet_list_t *reqlist, *reslist, *datlist = NULL; vhci_packet_t *req, *res = NULL, *dat = NULL; size_t npkts = 0; /* Request packet. */ reqlist = &port->endpoints[addr].host_to_usb; req = kmem_zalloc(sizeof(*req), KM_SLEEP); req->vxfer = vxfer; req->utoh = false; req->addr = addr; req->type.req = true; req->buf = (uint8_t *)&vxfer->reqbuf; req->size = sizeof(vxfer->reqbuf); req->cursor = 0; npkts++; /* Init the request buffer. 
*/ memset(&vxfer->reqbuf, 0, sizeof(vxfer->reqbuf)); vxfer->reqbuf.type = VHCI_REQ_CTRL; memcpy(&vxfer->reqbuf.u.ctrl, &xfer->ux_request, sizeof(xfer->ux_request)); /* Response packet. */ if (utoh && (xfer->ux_length > 0)) { reslist = &port->endpoints[addr].usb_to_host; res = kmem_zalloc(sizeof(*res), KM_SLEEP); res->vxfer = vxfer; res->utoh = true; res->addr = addr; res->type.res = true; res->buf = (uint8_t *)&vxfer->resbuf; res->size = sizeof(vxfer->resbuf); res->cursor = 0; npkts++; } /* Data packet. */ if (xfer->ux_length > 0) { if (utoh) { datlist = &port->endpoints[addr].usb_to_host; } else { datlist = &port->endpoints[addr].host_to_usb; } dat = kmem_zalloc(sizeof(*dat), KM_SLEEP); dat->vxfer = vxfer; dat->utoh = utoh; dat->addr = addr; dat->type.dat = true; dat->buf = xfer->ux_buf; dat->size = xfer->ux_length; dat->cursor = 0; npkts++; } /* Insert in the xfer. */ vxfer->port = port; vxfer->npkts = npkts; TAILQ_INIT(&vxfer->pkts); TAILQ_INSERT_TAIL(&vxfer->pkts, req, xferlist); if (res != NULL) TAILQ_INSERT_TAIL(&vxfer->pkts, res, xferlist); if (dat != NULL) TAILQ_INSERT_TAIL(&vxfer->pkts, dat, xferlist); /* Insert in the port. */ KASSERT(mutex_owned(&port->lock)); TAILQ_INSERT_TAIL(reqlist, req, portlist); if (res != NULL) TAILQ_INSERT_TAIL(reslist, res, portlist); if (dat != NULL) TAILQ_INSERT_TAIL(datlist, dat, portlist); } static void vhci_pkt_destroy(vhci_softc_t *sc, vhci_packet_t *pkt) { vhci_xfer_t *vxfer = pkt->vxfer; vhci_port_t *port = vxfer->port; vhci_packet_list_t *pktlist; KASSERT(mutex_owned(&port->lock)); /* Remove from the port. */ if (pkt->utoh) { pktlist = &port->endpoints[pkt->addr].usb_to_host; } else { pktlist = &port->endpoints[pkt->addr].host_to_usb; } TAILQ_REMOVE(pktlist, pkt, portlist); /* Remove from the xfer. */ TAILQ_REMOVE(&vxfer->pkts, pkt, xferlist); kmem_free(pkt, sizeof(*pkt)); /* Unref. 
*/ KASSERT(vxfer->npkts > 0); vxfer->npkts--; if (vxfer->npkts > 0) return; KASSERT(TAILQ_FIRST(&vxfer->pkts) == NULL); } /* -------------------------------------------------------------------------- */ static usbd_status vhci_open(struct usbd_pipe *pipe) { struct usbd_device *dev = pipe->up_dev; struct usbd_bus *bus = dev->ud_bus; usb_endpoint_descriptor_t *ed = pipe->up_endpoint->ue_edesc; vhci_softc_t *sc = bus->ub_hcpriv; uint8_t addr = dev->ud_addr; if (sc->sc_dying) return USBD_IOERROR; DPRINTF("%s: called, type=%d\n", __func__, UE_GET_XFERTYPE(ed->bmAttributes)); if (addr == bus->ub_rhaddr) { switch (ed->bEndpointAddress) { case USB_CONTROL_ENDPOINT: DPRINTF("%s: roothub_ctrl\n", __func__); pipe->up_methods = &roothub_ctrl_methods; break; case UE_DIR_IN | USBROOTHUB_INTR_ENDPT: DPRINTF("%s: root_intr\n", __func__); pipe->up_methods = &vhci_root_intr_methods; break; default: DPRINTF("%s: inval\n", __func__); return USBD_INVAL; } } else { switch (UE_GET_XFERTYPE(ed->bmAttributes)) { case UE_CONTROL: pipe->up_methods = &vhci_device_ctrl_methods; break; case UE_INTERRUPT: case UE_BULK: default: goto bad; } } return USBD_NORMAL_COMPLETION; bad: return USBD_NOMEM; } static void vhci_softintr(void *v) { DPRINTF("%s: called\n", __func__); } static struct usbd_xfer * vhci_allocx(struct usbd_bus *bus, unsigned int nframes) { vhci_xfer_t *vxfer; vxfer = kmem_zalloc(sizeof(*vxfer), KM_SLEEP); #ifdef DIAGNOSTIC vxfer->xfer.ux_state = XFER_BUSY; #endif return (struct usbd_xfer *)vxfer; } static void vhci_freex(struct usbd_bus *bus, struct usbd_xfer *xfer) { vhci_xfer_t *vxfer = (vhci_xfer_t *)xfer; KASSERT(vxfer->npkts == 0); KASSERT(TAILQ_FIRST(&vxfer->pkts) == NULL); #ifdef DIAGNOSTIC vxfer->xfer.ux_state = XFER_FREE; #endif kmem_free(vxfer, sizeof(*vxfer)); } static void vhci_get_lock(struct usbd_bus *bus, kmutex_t **lock) { vhci_softc_t *sc = bus->ub_hcpriv; *lock = &sc->sc_lock; } static int vhci_roothub_ctrl(struct usbd_bus *bus, usb_device_request_t *req, void *buf, int buflen) { vhci_softc_t *sc = bus->ub_hcpriv; vhci_port_t *port; usb_hub_descriptor_t hubd; uint16_t len, value, index; int totlen = 0; len = UGETW(req->wLength); value = UGETW(req->wValue); index = UGETW(req->wIndex); #define C(x,y) ((x) | ((y) << 8)) switch (C(req->bRequest, req->bmRequestType)) { case C(UR_GET_DESCRIPTOR, UT_READ_DEVICE): switch (value) { case C(0, UDESC_DEVICE): { usb_device_descriptor_t devd; totlen = uimin(buflen, sizeof(devd)); memcpy(&devd, buf, totlen); USETW(devd.idVendor, 0); USETW(devd.idProduct, 0); memcpy(buf, &devd, totlen); break; } #define sd ((usb_string_descriptor_t *)buf) case C(1, UDESC_STRING): /* Vendor */ totlen = usb_makestrdesc(sd, len, "NetBSD"); break; case C(2, UDESC_STRING): /* Product */ totlen = usb_makestrdesc(sd, len, "VHCI root hub"); break; #undef sd default: /* default from usbroothub */ return buflen; } break; case C(UR_SET_FEATURE, UT_WRITE_CLASS_OTHER): switch (value) { case UHF_PORT_RESET: if (index < 1 || index >= sc->sc_nports) { return -1; } port = &sc->sc_port[VHCI_INDEX2PORT(index)]; port->status |= UPS_C_PORT_RESET; break; case UHF_PORT_POWER: break; default: return -1; } break; /* Hub requests. 
*/ case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_DEVICE): break; case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_OTHER): if (index < 1 || index >= sc->sc_nports) { return -1; } port = &sc->sc_port[VHCI_INDEX2PORT(index)]; switch (value) { case UHF_PORT_ENABLE: port->status &= ~UPS_PORT_ENABLED; break; case UHF_C_PORT_ENABLE: port->change |= UPS_C_PORT_ENABLED; break; default: return -1; } break; case C(UR_GET_DESCRIPTOR, UT_READ_CLASS_DEVICE): totlen = uimin(buflen, sizeof(hubd)); memcpy(&hubd, buf, totlen); hubd.bNbrPorts = sc->sc_nports - 1; hubd.bDescLength = USB_HUB_DESCRIPTOR_SIZE; totlen = uimin(totlen, hubd.bDescLength); memcpy(buf, &hubd, totlen); break; case C(UR_GET_STATUS, UT_READ_CLASS_DEVICE): /* XXX The other HCs do this */ memset(buf, 0, len); totlen = len; break; case C(UR_GET_STATUS, UT_READ_CLASS_OTHER): { usb_port_status_t ps; if (index < 1 || index >= sc->sc_nports) { return -1; } port = &sc->sc_port[VHCI_INDEX2PORT(index)]; USETW(ps.wPortStatus, port->status); USETW(ps.wPortChange, port->change); totlen = uimin(len, sizeof(ps)); memcpy(buf, &ps, totlen); break; } default: /* default from usbroothub */ return buflen; } return totlen; } /* -------------------------------------------------------------------------- */ static usbd_status vhci_device_ctrl_transfer(struct usbd_xfer *xfer) { DPRINTF("%s: called\n", __func__); /* Pipe isn't running, start first */ return vhci_device_ctrl_start(SIMPLEQ_FIRST(&xfer->ux_pipe->up_queue)); } static usbd_status vhci_device_ctrl_start(struct usbd_xfer *xfer) { usb_endpoint_descriptor_t *ed = xfer->ux_pipe->up_endpoint->ue_edesc; usb_device_request_t *req = &xfer->ux_request; struct usbd_device *dev = xfer->ux_pipe->up_dev; vhci_softc_t *sc = xfer->ux_bus->ub_hcpriv; vhci_port_t *port; bool isread = (req->bmRequestType & UT_READ) != 0; uint8_t addr = UE_GET_ADDR(ed->bEndpointAddress); int portno, ret; KASSERT(addr == 0); KASSERT(xfer->ux_rqflags & URQ_REQUEST); KASSERT(dev->ud_myhsport != NULL); portno = dev->ud_myhsport->up_portno; DPRINTF("%s: type=0x%02x, len=%d, isread=%d, portno=%d\n", __func__, req->bmRequestType, UGETW(req->wLength), isread, portno); KASSERT(sc->sc_bus.ub_usepolling || mutex_owned(&sc->sc_lock)); if (sc->sc_dying) return USBD_IOERROR; port = &sc->sc_port[portno]; mutex_enter(&port->lock); if (port->status & UPS_PORT_ENABLED) { xfer->ux_status = USBD_IN_PROGRESS; vhci_pkt_ctrl_create(port, xfer, isread, addr); ret = USBD_IN_PROGRESS; } else { ret = USBD_IOERROR; } mutex_exit(&port->lock); return ret; } static void vhci_device_ctrl_abort(struct usbd_xfer *xfer) { vhci_xfer_t *vxfer = (vhci_xfer_t *)xfer; vhci_softc_t *sc = xfer->ux_bus->ub_hcpriv; vhci_port_t *port = vxfer->port; vhci_packet_t *pkt; DPRINTF("%s: called\n", __func__); KASSERT(mutex_owned(&sc->sc_lock)); callout_halt(&xfer->ux_callout, &sc->sc_lock); /* If anyone else beat us, we're done. 
*/ KASSERT(xfer->ux_status != USBD_CANCELLED); if (xfer->ux_status != USBD_IN_PROGRESS) return; mutex_enter(&port->lock); while (vxfer->npkts > 0) { pkt = TAILQ_FIRST(&vxfer->pkts); KASSERT(pkt != NULL); vhci_pkt_destroy(sc, pkt); } KASSERT(TAILQ_FIRST(&vxfer->pkts) == NULL); mutex_exit(&port->lock); xfer->ux_status = USBD_CANCELLED; usb_transfer_complete(xfer); KASSERT(mutex_owned(&sc->sc_lock)); } static void vhci_device_ctrl_close(struct usbd_pipe *pipe) { DPRINTF("%s: called\n", __func__); } static void vhci_device_ctrl_cleartoggle(struct usbd_pipe *pipe) { DPRINTF("%s: called\n", __func__); } static void vhci_device_ctrl_done(struct usbd_xfer *xfer) { DPRINTF("%s: called\n", __func__); } /* -------------------------------------------------------------------------- */ static usbd_status vhci_root_intr_transfer(struct usbd_xfer *xfer) { DPRINTF("%s: called\n", __func__); /* Pipe isn't running, start first */ return vhci_root_intr_start(SIMPLEQ_FIRST(&xfer->ux_pipe->up_queue)); } static usbd_status vhci_root_intr_start(struct usbd_xfer *xfer) { vhci_softc_t *sc = xfer->ux_bus->ub_hcpriv; DPRINTF("%s: called, len=%zu\n", __func__, (size_t)xfer->ux_length); KASSERT(sc->sc_bus.ub_usepolling || mutex_owned(&sc->sc_lock)); if (sc->sc_dying) return USBD_IOERROR; KASSERT(sc->sc_intrxfer == NULL); sc->sc_intrxfer = xfer; xfer->ux_status = USBD_IN_PROGRESS; return USBD_IN_PROGRESS; } static void vhci_root_intr_abort(struct usbd_xfer *xfer) { vhci_softc_t *sc = xfer->ux_bus->ub_hcpriv; DPRINTF("%s: called\n", __func__); KASSERT(mutex_owned(&sc->sc_lock)); KASSERT(xfer->ux_pipe->up_intrxfer == xfer); /* If xfer has already completed, nothing to do here. */ if (sc->sc_intrxfer == NULL) return; /* * Otherwise, sc->sc_intrxfer had better be this transfer. * Cancel it. */ KASSERT(sc->sc_intrxfer == xfer); KASSERT(xfer->ux_status == USBD_IN_PROGRESS); xfer->ux_status = USBD_CANCELLED; usb_transfer_complete(xfer); } static void vhci_root_intr_close(struct usbd_pipe *pipe) { vhci_softc_t *sc __diagused = pipe->up_dev->ud_bus->ub_hcpriv; DPRINTF("%s: called\n", __func__); KASSERT(mutex_owned(&sc->sc_lock)); /* * Caller must guarantee the xfer has completed first, by * closing the pipe only after normal completion or an abort. */ KASSERT(sc->sc_intrxfer == NULL); } static void vhci_root_intr_cleartoggle(struct usbd_pipe *pipe) { DPRINTF("%s: called\n", __func__); } static void vhci_root_intr_done(struct usbd_xfer *xfer) { vhci_softc_t *sc = xfer->ux_bus->ub_hcpriv; KASSERT(mutex_owned(&sc->sc_lock)); /* Claim the xfer so it doesn't get completed again. */ KASSERT(sc->sc_intrxfer == xfer); KASSERT(xfer->ux_status != USBD_IN_PROGRESS); sc->sc_intrxfer = NULL; } /* -------------------------------------------------------------------------- */ static void vhci_usb_attach(vhci_fd_t *vfd) { vhci_softc_t *sc = vfd->softc; vhci_port_t *port; struct usbd_xfer *xfer; u_char *p; port = &sc->sc_port[vfd->port]; mutex_enter(&sc->sc_lock); mutex_enter(&port->lock); port->status = UPS_CURRENT_CONNECT_STATUS | UPS_PORT_ENABLED | UPS_PORT_POWER; port->change = UPS_C_CONNECT_STATUS | UPS_C_PORT_RESET; mutex_exit(&port->lock); xfer = sc->sc_intrxfer; if (xfer == NULL) { goto done; } KASSERT(xfer->ux_status == USBD_IN_PROGRESS); /* * Mark our port has having changed state. Uhub will then fetch * status/change and see it needs to perform an attach. 
*/ p = xfer->ux_buf; memset(p, 0, xfer->ux_length); p[0] = __BIT(vfd->port); /* TODO-bitmap */ xfer->ux_actlen = xfer->ux_length; xfer->ux_status = USBD_NORMAL_COMPLETION; usb_transfer_complete(xfer); done: mutex_exit(&sc->sc_lock); } static void vhci_port_flush(vhci_softc_t *sc, vhci_port_t *port) { vhci_packet_list_t *pktlist; vhci_packet_t *pkt, *nxt; vhci_xfer_list_t vxferlist; vhci_xfer_t *vxfer; uint8_t addr; KASSERT(mutex_owned(&sc->sc_lock)); KASSERT(mutex_owned(&port->lock)); TAILQ_INIT(&vxferlist); for (addr = 0; addr < VHCI_NADDRS; addr++) { /* Drop all the packets in the H->U direction. */ pktlist = &port->endpoints[addr].host_to_usb; TAILQ_FOREACH_SAFE(pkt, pktlist, portlist, nxt) { vxfer = pkt->vxfer; KASSERT(vxfer->xfer.ux_status == USBD_IN_PROGRESS); vhci_pkt_destroy(sc, pkt); if (vxfer->npkts == 0) TAILQ_INSERT_TAIL(&vxferlist, vxfer, freelist); } KASSERT(TAILQ_FIRST(pktlist) == NULL); /* Drop all the packets in the U->H direction. */ pktlist = &port->endpoints[addr].usb_to_host; TAILQ_FOREACH_SAFE(pkt, pktlist, portlist, nxt) { vxfer = pkt->vxfer; KASSERT(vxfer->xfer.ux_status == USBD_IN_PROGRESS); vhci_pkt_destroy(sc, pkt); if (vxfer->npkts == 0) TAILQ_INSERT_TAIL(&vxferlist, vxfer, freelist); } KASSERT(TAILQ_FIRST(pktlist) == NULL); /* Terminate all the xfers collected. */ while ((vxfer = TAILQ_FIRST(&vxferlist)) != NULL) { struct usbd_xfer *xfer = &vxfer->xfer; TAILQ_REMOVE(&vxferlist, vxfer, freelist); xfer->ux_status = USBD_TIMEOUT; usb_transfer_complete(xfer); } } } static void vhci_usb_detach(vhci_fd_t *vfd) { vhci_softc_t *sc = vfd->softc; vhci_port_t *port; struct usbd_xfer *xfer; u_char *p; port = &sc->sc_port[vfd->port]; mutex_enter(&sc->sc_lock); xfer = sc->sc_intrxfer; if (xfer == NULL) { goto done; } KASSERT(xfer->ux_status == USBD_IN_PROGRESS); mutex_enter(&port->lock); port->status = 0; port->change = UPS_C_CONNECT_STATUS | UPS_C_PORT_RESET; /* * Mark our port has having changed state. Uhub will then fetch * status/change and see it needs to perform a detach. 
*/ p = xfer->ux_buf; memset(p, 0, xfer->ux_length); p[0] = __BIT(vfd->port); /* TODO-bitmap */ xfer->ux_actlen = xfer->ux_length; xfer->ux_status = USBD_NORMAL_COMPLETION; usb_transfer_complete(xfer); vhci_port_flush(sc, port); mutex_exit(&port->lock); done: mutex_exit(&sc->sc_lock); } static int vhci_get_info(vhci_fd_t *vfd, struct vhci_ioc_get_info *args) { vhci_softc_t *sc = vfd->softc; vhci_port_t *port; port = &sc->sc_port[vfd->port]; args->nports = VHCI_NPORTS; args->port = vfd->port; mutex_enter(&port->lock); args->status = port->status; mutex_exit(&port->lock); args->addr = vfd->addr; return 0; } static int vhci_set_port(vhci_fd_t *vfd, struct vhci_ioc_set_port *args) { vhci_softc_t *sc = vfd->softc; if (args->port == 0 || args->port >= sc->sc_nports) return EINVAL; vfd->port = args->port; return 0; } static int vhci_set_addr(vhci_fd_t *vfd, struct vhci_ioc_set_addr *args) { if (args->addr >= VHCI_NADDRS) return EINVAL; vfd->addr = args->addr; return 0; } /* -------------------------------------------------------------------------- */ static dev_type_open(vhci_fd_open); const struct cdevsw vhci_cdevsw = { .d_open = vhci_fd_open, .d_close = noclose, .d_read = noread, .d_write = nowrite, .d_ioctl = noioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE }; static int vhci_fd_ioctl(file_t *, u_long, void *); static int vhci_fd_close(file_t *); static int vhci_fd_read(struct file *, off_t *, struct uio *, kauth_cred_t, int); static int vhci_fd_write(struct file *, off_t *, struct uio *, kauth_cred_t, int); const struct fileops vhci_fileops = { .fo_read = vhci_fd_read, .fo_write = vhci_fd_write, .fo_ioctl = vhci_fd_ioctl, .fo_fcntl = fnullop_fcntl, .fo_poll = fnullop_poll, .fo_stat = fbadop_stat, .fo_close = vhci_fd_close, .fo_kqfilter = fnullop_kqfilter, .fo_restart = fnullop_restart, .fo_mmap = NULL, }; static int vhci_fd_open(dev_t dev, int flags, int type, struct lwp *l) { vhci_softc_t *sc; vhci_fd_t *vfd; struct file *fp; int error, fd; sc = device_lookup_private(&vhci_cd, minor(dev)); if (sc == NULL) return EXDEV; error = fd_allocfile(&fp, &fd); if (error) return error; vfd = kmem_alloc(sizeof(*vfd), KM_SLEEP); vfd->port = 1; vfd->addr = 0; vfd->softc = sc; return fd_clone(fp, fd, flags, &vhci_fileops, vfd); } static int vhci_fd_close(file_t *fp) { vhci_fd_t *vfd = fp->f_data; KASSERT(vfd != NULL); vhci_usb_detach(vfd); kmem_free(vfd, sizeof(*vfd)); fp->f_data = NULL; return 0; } static int vhci_fd_read(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, int flags) { vhci_fd_t *vfd = fp->f_data; vhci_softc_t *sc = vfd->softc; vhci_packet_list_t *pktlist; vhci_packet_t *pkt, *nxt; vhci_xfer_list_t vxferlist; vhci_xfer_t *vxfer; vhci_port_t *port; int error = 0; uint8_t *buf; size_t size; if (uio->uio_resid == 0) return 0; port = &sc->sc_port[vfd->port]; pktlist = &port->endpoints[vfd->addr].host_to_usb; TAILQ_INIT(&vxferlist); mutex_enter(&port->lock); if (!(port->status & UPS_PORT_ENABLED)) { error = ENOBUFS; goto out; } TAILQ_FOREACH_SAFE(pkt, pktlist, portlist, nxt) { vxfer = pkt->vxfer; buf = pkt->buf + pkt->cursor; KASSERT(pkt->size >= pkt->cursor); size = uimin(uio->uio_resid, pkt->size - pkt->cursor); KASSERT(vxfer->xfer.ux_status == USBD_IN_PROGRESS); error = uiomove(buf, size, uio); if (error) { DPRINTF("%s: error = %d\n", __func__, error); goto out; } pkt->cursor += size; if (pkt->cursor == pkt->size) { vhci_pkt_destroy(sc, pkt); if (vxfer->npkts == 0) { 
TAILQ_INSERT_TAIL(&vxferlist, vxfer, freelist); } } if (uio->uio_resid == 0) { break; } } out: mutex_exit(&port->lock); while ((vxfer = TAILQ_FIRST(&vxferlist)) != NULL) { struct usbd_xfer *xfer = &vxfer->xfer; TAILQ_REMOVE(&vxferlist, vxfer, freelist); mutex_enter(&sc->sc_lock); xfer->ux_actlen = xfer->ux_length; xfer->ux_status = USBD_NORMAL_COMPLETION; usb_transfer_complete(xfer); mutex_exit(&sc->sc_lock); } return error; } static int vhci_fd_write(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, int flags) { vhci_fd_t *vfd = fp->f_data; vhci_softc_t *sc = vfd->softc; vhci_packet_list_t *pktlist; vhci_packet_t *pkt, *nxt; vhci_xfer_list_t vxferlist; vhci_xfer_t *vxfer; vhci_port_t *port; int error = 0; uint8_t *buf; size_t pktsize, size; if (uio->uio_resid == 0) return 0; port = &sc->sc_port[vfd->port]; pktlist = &port->endpoints[vfd->addr].usb_to_host; TAILQ_INIT(&vxferlist); mutex_enter(&port->lock); if (!(port->status & UPS_PORT_ENABLED)) { error = ENOBUFS; goto out; } TAILQ_FOREACH_SAFE(pkt, pktlist, portlist, nxt) { vxfer = pkt->vxfer; buf = pkt->buf + pkt->cursor; pktsize = pkt->size; if (pkt->type.dat) pktsize = ulmin(vxfer->resbuf.size, pktsize); KASSERT(pktsize >= pkt->cursor); size = uimin(uio->uio_resid, pktsize - pkt->cursor); KASSERT(vxfer->xfer.ux_status == USBD_IN_PROGRESS); error = uiomove(buf, size, uio); if (error) { DPRINTF("%s: error = %d\n", __func__, error); goto out; } pkt->cursor += size; if (pkt->cursor == pktsize) { vhci_pkt_destroy(sc, pkt); if (vxfer->npkts == 0) { TAILQ_INSERT_TAIL(&vxferlist, vxfer, freelist); } } if (uio->uio_resid == 0) { break; } } out: mutex_exit(&port->lock); while ((vxfer = TAILQ_FIRST(&vxferlist)) != NULL) { struct usbd_xfer *xfer = &vxfer->xfer; TAILQ_REMOVE(&vxferlist, vxfer, freelist); mutex_enter(&sc->sc_lock); xfer->ux_actlen = ulmin(vxfer->resbuf.size, xfer->ux_length); xfer->ux_status = USBD_NORMAL_COMPLETION; usb_transfer_complete(xfer); mutex_exit(&sc->sc_lock); } return error; } static int vhci_fd_ioctl(file_t *fp, u_long cmd, void *data) { vhci_fd_t *vfd = fp->f_data; KASSERT(vfd != NULL); switch (cmd) { case VHCI_IOC_GET_INFO: return vhci_get_info(vfd, data); case VHCI_IOC_SET_PORT: return vhci_set_port(vfd, data); case VHCI_IOC_SET_ADDR: return vhci_set_addr(vfd, data); case VHCI_IOC_USB_ATTACH: vhci_usb_attach(vfd); return 0; case VHCI_IOC_USB_DETACH: vhci_usb_detach(vfd); return 0; default: return EINVAL; } } /* -------------------------------------------------------------------------- */ static int vhci_match(device_t, cfdata_t, void *); static void vhci_attach(device_t, device_t, void *); static int vhci_activate(device_t, enum devact); CFATTACH_DECL_NEW(vhci, sizeof(vhci_softc_t), vhci_match, vhci_attach, NULL, vhci_activate); void vhciattach(int nunits) { struct cfdata *cf; int error; size_t i; error = config_cfattach_attach(vhci_cd.cd_name, &vhci_ca); if (error) { aprint_error("%s: unable to register cfattach\n", vhci_cd.cd_name); (void)config_cfdriver_detach(&vhci_cd); return; } for (i = 0; i < VHCI_NBUSES; i++) { cf = kmem_alloc(sizeof(*cf), KM_SLEEP); cf->cf_name = vhci_cd.cd_name; cf->cf_atname = vhci_cd.cd_name; cf->cf_unit = i; cf->cf_fstate = FSTATE_STAR; config_attach_pseudo(cf); } } static int vhci_activate(device_t self, enum devact act) { vhci_softc_t *sc = device_private(self); switch (act) { case DVACT_DEACTIVATE: sc->sc_dying = 1; return 0; default: return EOPNOTSUPP; } } static int vhci_match(device_t parent, cfdata_t match, void *aux) { return 1; } static void vhci_attach(device_t 
parent, device_t self, void *aux) { vhci_softc_t *sc = device_private(self); vhci_port_t *port; uint8_t addr; size_t i; sc->sc_dev = self; sc->sc_bus.ub_revision = USBREV_2_0; sc->sc_bus.ub_hctype = USBHCTYPE_VHCI; sc->sc_bus.ub_busnum = device_unit(self); sc->sc_bus.ub_usedma = false; sc->sc_bus.ub_methods = &vhci_bus_methods; sc->sc_bus.ub_pipesize = sizeof(vhci_pipe_t); sc->sc_bus.ub_hcpriv = sc; sc->sc_dying = false; mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_SOFTUSB); sc->sc_nports = VHCI_NPORTS; for (i = 0; i < sc->sc_nports; i++) { port = &sc->sc_port[i]; mutex_init(&port->lock, MUTEX_DEFAULT, IPL_SOFTUSB); for (addr = 0; addr < VHCI_NADDRS; addr++) { TAILQ_INIT(&port->endpoints[addr].usb_to_host); TAILQ_INIT(&port->endpoints[addr].host_to_usb); } kcov_remote_register(KCOV_REMOTE_VHCI, KCOV_REMOTE_VHCI_ID(sc->sc_bus.ub_busnum, i)); } sc->sc_child = config_found(self, &sc->sc_bus, usbctlprint, CFARGS_NONE); }
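/*
 * Illustrative sketch, not part of vhci.c: how a userland USB-device
 * emulator might drive the character-device interface implemented above.
 * The device path "/dev/vhci" and the header location are assumptions;
 * the ioctl commands, the vhci_ioc_* structures and the read/write packet
 * flow are the ones handled by vhci_fd_ioctl(), vhci_fd_read() and
 * vhci_fd_write().  Error handling is reduced to the minimum.
 */
#include <sys/ioctl.h>
#include <dev/usb/vhci.h>	/* assumed location of the ioctl definitions */
#include <fcntl.h>
#include <unistd.h>

static int
vhci_example(void)
{
	struct vhci_ioc_set_port sp = { .port = 1 };
	struct vhci_ioc_set_addr sa = { .addr = 0 };
	char buf[512];
	ssize_t n;
	int fd;

	fd = open("/dev/vhci", O_RDWR);		/* assumed device node */
	if (fd == -1)
		return -1;

	/* Select which virtual port and device address this fd talks to. */
	if (ioctl(fd, VHCI_IOC_SET_PORT, &sp) == -1 ||
	    ioctl(fd, VHCI_IOC_SET_ADDR, &sa) == -1)
		goto fail;

	/* Announce a device on the port; uhub will then enumerate it. */
	if (ioctl(fd, VHCI_IOC_USB_ATTACH) == -1)
		goto fail;

	/* Host-to-device packets are consumed with read(2) ... */
	n = read(fd, buf, sizeof(buf));
	if (n > 0) {
		/* ... and device-to-host replies are injected with write(2). */
		(void)write(fd, buf, (size_t)n);
	}

	(void)ioctl(fd, VHCI_IOC_USB_DETACH);
	close(fd);
	return 0;
fail:
	close(fd);
	return -1;
}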
/* $NetBSD: uipc_syscalls_40.c,v 1.24 2022/07/07 18:17:33 riastradh Exp $ */ /* written by Pavel Cahyna, 2006. Public domain. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uipc_syscalls_40.c,v 1.24 2022/07/07 18:17:33 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif /* * System call interface to the socket abstraction. */ #include <sys/param.h> #include <sys/kernel.h> #include <sys/msg.h> #include <sys/sysctl.h> #include <sys/syscallargs.h> #include <sys/errno.h> #include <sys/compat_stub.h> #include <net/if.h> #include <compat/sys/socket.h> #include <compat/sys/sockio.h> #include <compat/common/compat_mod.h> /* * Return interface configuration of system. List may be used in later * ioctl's (above) to get other information. */ /*ARGSUSED*/ static int compat_ifconf(u_long cmd, void *data) { struct oifconf *ifc = data; struct ifnet *ifp; struct oifreq ifr, *ifrp = NULL; int space = 0, error = 0; const int sz = (int)sizeof(ifr); int s; int bound; struct psref psref; switch (cmd) { case OSIOCGIFCONF: case OOSIOCGIFCONF: break; default: return ENOSYS; } const bool docopy = ifc->ifc_req != NULL; if (docopy) { if (ifc->ifc_len < 0) return EINVAL; space = ifc->ifc_len; ifrp = ifc->ifc_req; } memset(&ifr, 0, sizeof(ifr)); bound = curlwp_bind(); s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { struct ifaddr *ifa; if_acquire(ifp, &psref); pserialize_read_exit(s); (void)strncpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)); if (ifr.ifr_name[sizeof(ifr.ifr_name) - 1] != '\0') { error = ENAMETOOLONG; goto release_exit; } if (IFADDR_READER_EMPTY(ifp)) { memset(&ifr.ifr_addr, 0, sizeof(ifr.ifr_addr)); if (space >= sz) { error = copyout(&ifr, ifrp, sz); if (error != 0) goto release_exit; ifrp++; } space -= sizeof(ifr); goto next; } s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { struct sockaddr *sa = ifa->ifa_addr; struct psref psref_ifa; ifa_acquire(ifa, &psref_ifa); pserialize_read_exit(s); #ifdef COMPAT_OSOCK if (cmd == OOSIOCGIFCONF) { struct osockaddr *osa = (struct osockaddr *)&ifr.ifr_addr; /* * If it does not fit, we don't bother with it */ if (sa->sa_len > sizeof(*osa)) goto next_ifa; memcpy(&ifr.ifr_addr, sa, sa->sa_len); osa->sa_family = sa->sa_family; if (space >= sz) { error = copyout(&ifr, ifrp, sz); ifrp++; } } else #endif if (sa->sa_len <= sizeof(*sa)) { memcpy(&ifr.ifr_addr, sa, sa->sa_len); if (space >= sz) { error = copyout(&ifr, ifrp, sz); ifrp++; } } else { space -= sa->sa_len - sizeof(*sa); if (space >= sz) { error = copyout(&ifr.ifr_name, ifrp, sizeof(ifr.ifr_name)); if (error == 0) { error = copyout(sa, &ifrp->ifr_addr, sa->sa_len); } ifrp = (struct oifreq *) (sa->sa_len + (char *)&ifrp->ifr_addr); } } if (error != 0) { ifa_release(ifa, &psref_ifa); goto release_exit; } space -= sz; #ifdef COMPAT_OSOCK next_ifa: #endif s = pserialize_read_enter(); ifa_release(ifa,
&psref_ifa); } pserialize_read_exit(s); next: s = pserialize_read_enter(); if_release(ifp, &psref); } pserialize_read_exit(s); curlwp_bindx(bound); if (docopy) ifc->ifc_len -= space; else ifc->ifc_len = -space; return 0; release_exit: if_release(ifp, &psref); curlwp_bindx(bound); return error; } void uipc_syscalls_40_init(void) { MODULE_HOOK_SET(uipc_syscalls_40_hook, compat_ifconf); } void uipc_syscalls_40_fini(void) { MODULE_HOOK_UNSET(uipc_syscalls_40_hook); }
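/*
 * Illustrative sketch, not part of uipc_syscalls_40.c: the two-pass
 * pattern the compat_ifconf() handler above supports.  A first call with
 * ifc_req == NULL only computes the required length (returned positive in
 * ifc_len); a second call with a buffer copies out one struct oifreq per
 * address.  OSIOCGIFCONF and struct oifconf come from the compat headers
 * included above and matter only to old binaries; the example exists
 * purely to document the kernel-side behaviour, and its availability to
 * userland is an assumption.  The caller supplies any datagram socket s.
 */
#include <sys/ioctl.h>
#include <compat/sys/socket.h>
#include <compat/sys/sockio.h>
#include <stdlib.h>

static int
old_ifconf_example(int s)
{
	struct oifconf ifc;
	void *buf;

	/* Pass 1: no buffer, the kernel reports the space needed. */
	ifc.ifc_len = 0;
	ifc.ifc_req = NULL;
	if (ioctl(s, OSIOCGIFCONF, &ifc) == -1)
		return -1;

	buf = malloc((size_t)ifc.ifc_len);
	if (buf == NULL)
		return -1;

	/* Pass 2: copy out the interface/address records. */
	ifc.ifc_req = buf;
	if (ioctl(s, OSIOCGIFCONF, &ifc) == -1) {
		free(buf);
		return -1;
	}
	/* ... walk the records, minding variable-length sockaddrs ... */
	free(buf);
	return 0;
}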
/* $NetBSD: wapbl.h,v 1.21 2018/12/10 21:19:33 jdolecek Exp $ */ /*- * Copyright (c) 2003,2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _SYS_WAPBL_H #define _SYS_WAPBL_H #include <sys/mutex.h> #if defined(_KERNEL) || defined(_KMEMUSER) #include <miscfs/specfs/specdev.h> #endif /* This header file describes the api and data structures for * write ahead physical block logging (WAPBL) support.
*/ #if defined(_KERNEL_OPT) #include "opt_wapbl.h" #endif #ifdef WAPBL_DEBUG #ifndef WAPBL_DEBUG_PRINT #define WAPBL_DEBUG_PRINT (WAPBL_PRINT_REPLAY | WAPBL_PRINT_OPEN) #endif #if 0 #define WAPBL_DEBUG_BUFBYTES #endif #endif #ifdef WAPBL_DEBUG_PRINT enum { WAPBL_PRINT_OPEN = 0x1, WAPBL_PRINT_FLUSH = 0x2, WAPBL_PRINT_TRUNCATE = 0x4, WAPBL_PRINT_TRANSACTION = 0x8, WAPBL_PRINT_BUFFER = 0x10, WAPBL_PRINT_BUFFER2 = 0x20, WAPBL_PRINT_ALLOC = 0x40, WAPBL_PRINT_INODE = 0x80, WAPBL_PRINT_WRITE = 0x100, WAPBL_PRINT_IO = 0x200, WAPBL_PRINT_REPLAY = 0x400, WAPBL_PRINT_ERROR = 0x800, WAPBL_PRINT_DISCARD = 0x1000, WAPBL_PRINT_BIODONE = 0x2000, }; #define WAPBL_PRINTF(mask, a) if (wapbl_debug_print & (mask)) printf a extern int wapbl_debug_print; #else #define WAPBL_PRINTF(mask, a) #endif /****************************************************************/ #include <sys/queue.h> #include <sys/vnode.h> #include <sys/buf.h> #ifdef _KERNEL struct wapbl_entry; struct wapbl_replay; struct wapbl; struct wapbl_dealloc { TAILQ_ENTRY(wapbl_dealloc) wd_entries; daddr_t wd_blkno; /* address of block */ int wd_len; /* size of block */ }; typedef void (*wapbl_flush_fn_t)(struct mount *, struct wapbl_dealloc *); /* * This structure holds per transaction log information */ struct wapbl_entry { struct wapbl *we_wapbl; SIMPLEQ_ENTRY(wapbl_entry) we_entries; size_t we_bufcount; /* Count of unsynced buffers */ size_t we_reclaimable_bytes; /* Number on disk bytes for this transaction */ int we_error; #ifdef WAPBL_DEBUG_BUFBYTES size_t we_unsynced_bufbytes; /* Byte count of unsynced buffers */ #endif }; /* Start using a log */ int wapbl_start(struct wapbl **, struct mount *, struct vnode *, daddr_t, size_t, size_t, struct wapbl_replay *, wapbl_flush_fn_t, wapbl_flush_fn_t); /* Discard the current transaction, potentially dangerous */ void wapbl_discard(struct wapbl *); /* stop using a log */ int wapbl_stop(struct wapbl *, int); /* * Begin a new transaction or increment transaction recursion * level if called while a transaction is already in progress * by the current process. */ int wapbl_begin(struct wapbl *, const char *, int); /* End a transaction or decrement the transaction recursion level */ void wapbl_end(struct wapbl *); /* * Add a new buffer to the current transaction. The buffers * data will be copied to the current transaction log and the * buffer will be marked B_LOCKED so that it will not be * flushed to disk by the syncer or reallocated. */ void wapbl_add_buf(struct wapbl *, struct buf *); /* Remove a buffer from the current transaction. */ void wapbl_remove_buf(struct wapbl *, struct buf *); void wapbl_resize_buf(struct wapbl *, struct buf *, long, long); /* * This will flush all completed transactions to disk and * start asynchronous writes on the associated buffers */ int wapbl_flush(struct wapbl *, int); /* * Inodes that are allocated but have zero link count * must be registered with the current transaction * so they may be recorded in the log and cleaned up later. * registration/unregistration of ino numbers already registered is ok. */ void wapbl_register_inode(struct wapbl *, ino_t, mode_t); void wapbl_unregister_inode(struct wapbl *, ino_t, mode_t); /* * Metadata block deallocations must be registered so * that revocations records can be written and to prevent * the corresponding blocks from being reused as data * blocks until the log is on disk. 
*/ int wapbl_register_deallocation(struct wapbl *, daddr_t, int, bool, void **); void wapbl_unregister_deallocation(struct wapbl *, void *); void wapbl_jlock_assert(struct wapbl *wl); void wapbl_junlock_assert(struct wapbl *wl); void wapbl_print(struct wapbl *wl, int full, void (*pr)(const char *, ...) __printflike(1, 2)); #if defined(WAPBL_DEBUG) || defined(DDB) void wapbl_dump(struct wapbl *); #endif void wapbl_biodone(struct buf *); extern const struct wapbl_ops wapbl_ops; static __inline struct mount * wapbl_vptomp(struct vnode *vp) { struct mount *mp; mp = NULL; if (vp != NULL) { if (vp->v_type == VBLK) mp = spec_node_getmountedfs(vp); else mp = vp->v_mount; } return mp; } static __inline bool wapbl_vphaswapbl(struct vnode *vp) { struct mount *mp; if (vp == NULL) return false; mp = wapbl_vptomp(vp); return mp && mp->mnt_wapbl; } #endif /* _KERNEL */ /****************************************************************/ /* Replay support */ #ifdef WAPBL_INTERNAL LIST_HEAD(wapbl_blk_head, wapbl_blk); struct wapbl_replay { struct vnode *wr_logvp; struct vnode *wr_devvp; daddr_t wr_logpbn; int wr_log_dev_bshift; int wr_fs_dev_bshift; int64_t wr_circ_off; int64_t wr_circ_size; uint32_t wr_generation; void *wr_scratch; struct wapbl_blk_head *wr_blkhash; u_long wr_blkhashmask; int wr_blkhashcnt; off_t wr_inodeshead; off_t wr_inodestail; int wr_inodescnt; struct { uint32_t wr_inumber; uint32_t wr_imode; } *wr_inodes; }; #define wapbl_replay_isopen(wr) ((wr)->wr_scratch != 0) /* Supply this to provide i/o support */ int wapbl_write(void *, size_t, struct vnode *, daddr_t); int wapbl_read(void *, size_t, struct vnode *, daddr_t); /****************************************************************/ #else struct wapbl_replay; #endif /* WAPBL_INTERNAL */ /****************************************************************/ int wapbl_replay_start(struct wapbl_replay **, struct vnode *, daddr_t, size_t, size_t); void wapbl_replay_stop(struct wapbl_replay *); void wapbl_replay_free(struct wapbl_replay *); int wapbl_replay_write(struct wapbl_replay *, struct vnode *); int wapbl_replay_can_read(struct wapbl_replay *, daddr_t, long); int wapbl_replay_read(struct wapbl_replay *, void *, daddr_t, long); /****************************************************************/ #endif /* !_SYS_WAPBL_H */
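/*
 * Illustrative sketch, not part of wapbl.h: the basic transaction pattern
 * the API above is meant for.  "wl" is the log obtained from wapbl_start()
 * at mount time and "bp" a locked metadata buffer; the function name and
 * the elided metadata update are hypothetical, the calls and their
 * signatures are the ones declared in this header.
 */
#ifdef _KERNEL
static int
example_wapbl_metadata_update(struct wapbl *wl, struct buf *bp)
{
	int error;

	/* Open (or nest into) a transaction for this thread. */
	error = wapbl_begin(wl, __FILE__, __LINE__);
	if (error)
		return error;

	/* ... modify the metadata held in bp ... */

	/*
	 * Log the buffer: its contents are copied into the current
	 * transaction and it stays B_LOCKED until the log reaches disk.
	 */
	wapbl_add_buf(wl, bp);

	/* Drop this nesting level of the transaction. */
	wapbl_end(wl);

	/* Optionally push completed transactions to the log now. */
	return wapbl_flush(wl, 0);
}
#endif /* _KERNEL */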
/* $NetBSD: kern_timeout.c,v 1.79 2023/10/08 13:23:05 ad Exp $ */ /*- * Copyright (c) 2003, 2006, 2007, 2008, 2009, 2019, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2001 Thomas Nordin <nordin@openbsd.org> * Copyright (c) 2000-2001 Artur Grabowski <art@openbsd.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_timeout.c,v 1.79 2023/10/08 13:23:05 ad Exp $"); /* * Timeouts are kept in a hierarchical timing wheel. The c_time is the * value of c_cpu->cc_ticks when the timeout should be called. There are * four levels with 256 buckets each. See 'Scheme 7' in "Hashed and * Hierarchical Timing Wheels: Efficient Data Structures for Implementing * a Timer Facility" by George Varghese and Tony Lauck. * * Some of the "math" in here is a bit tricky. We have to beware of * wrapping ints. * * We use the fact that any element added to the queue must be added with * a positive time. That means that any element `to' on the queue cannot * be scheduled to timeout further in time than INT_MAX, but c->c_time can * be positive or negative so comparing it with anything is dangerous. * The only way we can use the c->c_time value in any predictable way is * when we calculate how far in the future `to' will timeout - "c->c_time * - c->c_cpu->cc_ticks". The result will always be positive for future * timeouts and 0 or negative for due timeouts. */ #define _CALLOUT_PRIVATE #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/callout.h> #include <sys/lwp.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/sleepq.h> #include <sys/syncobj.h> #include <sys/evcnt.h> #include <sys/intr.h> #include <sys/cpu.h> #include <sys/kmem.h> #include <sys/sdt.h> #ifdef DDB #include <machine/db_machdep.h> #include <ddb/db_interface.h> #include <ddb/db_access.h> #include <ddb/db_cpu.h> #include <ddb/db_sym.h> #include <ddb/db_output.h> #endif #define BUCKETS 1024 #define WHEELSIZE 256 #define WHEELMASK 255 #define WHEELBITS 8 #define MASKWHEEL(wheel, time) (((time) >> ((wheel)*WHEELBITS)) & WHEELMASK) #define BUCKET(cc, rel, abs) \ (((rel) <= (1 << (2*WHEELBITS))) \ ? ((rel) <= (1 << WHEELBITS)) \ ? &(cc)->cc_wheel[MASKWHEEL(0, (abs))] \ : &(cc)->cc_wheel[MASKWHEEL(1, (abs)) + WHEELSIZE] \ : ((rel) <= (1 << (3*WHEELBITS))) \ ? &(cc)->cc_wheel[MASKWHEEL(2, (abs)) + 2*WHEELSIZE] \ : &(cc)->cc_wheel[MASKWHEEL(3, (abs)) + 3*WHEELSIZE]) #define MOVEBUCKET(cc, wheel, time) \ CIRCQ_APPEND(&(cc)->cc_todo, \ &(cc)->cc_wheel[MASKWHEEL((wheel), (time)) + (wheel)*WHEELSIZE]) /* * Circular queue definitions. 
*/ #define CIRCQ_INIT(list) \ do { \ (list)->cq_next_l = (list); \ (list)->cq_prev_l = (list); \ } while (/*CONSTCOND*/0) #define CIRCQ_INSERT(elem, list) \ do { \ (elem)->cq_prev_e = (list)->cq_prev_e; \ (elem)->cq_next_l = (list); \ (list)->cq_prev_l->cq_next_l = (elem); \ (list)->cq_prev_l = (elem); \ } while (/*CONSTCOND*/0) #define CIRCQ_APPEND(fst, snd) \ do { \ if (!CIRCQ_EMPTY(snd)) { \ (fst)->cq_prev_l->cq_next_l = (snd)->cq_next_l; \ (snd)->cq_next_l->cq_prev_l = (fst)->cq_prev_l; \ (snd)->cq_prev_l->cq_next_l = (fst); \ (fst)->cq_prev_l = (snd)->cq_prev_l; \ CIRCQ_INIT(snd); \ } \ } while (/*CONSTCOND*/0) #define CIRCQ_REMOVE(elem) \ do { \ (elem)->cq_next_l->cq_prev_e = (elem)->cq_prev_e; \ (elem)->cq_prev_l->cq_next_e = (elem)->cq_next_e; \ } while (/*CONSTCOND*/0) #define CIRCQ_FIRST(list) ((list)->cq_next_e) #define CIRCQ_NEXT(elem) ((elem)->cq_next_e) #define CIRCQ_LAST(elem,list) ((elem)->cq_next_l == (list)) #define CIRCQ_EMPTY(list) ((list)->cq_next_l == (list)) struct callout_cpu { kmutex_t *cc_lock; sleepq_t cc_sleepq; u_int cc_nwait; u_int cc_ticks; lwp_t *cc_lwp; callout_impl_t *cc_active; struct evcnt cc_ev_late; struct evcnt cc_ev_block; struct callout_circq cc_todo; /* Worklist */ struct callout_circq cc_wheel[BUCKETS]; /* Queues of timeouts */ char cc_name1[12]; char cc_name2[12]; struct cpu_info *cc_cpu; }; #ifdef DDB static struct callout_cpu ccb; #endif #ifndef CRASH /* _KERNEL */ static void callout_softclock(void *); static void callout_wait(callout_impl_t *, void *, kmutex_t *); static struct callout_cpu callout_cpu0 __cacheline_aligned; static void *callout_sih __read_mostly; SDT_PROBE_DEFINE2(sdt, kernel, callout, init, "struct callout *"/*ch*/, "unsigned"/*flags*/); SDT_PROBE_DEFINE1(sdt, kernel, callout, destroy, "struct callout *"/*ch*/); SDT_PROBE_DEFINE4(sdt, kernel, callout, setfunc, "struct callout *"/*ch*/, "void (*)(void *)"/*func*/, "void *"/*arg*/, "unsigned"/*flags*/); SDT_PROBE_DEFINE5(sdt, kernel, callout, schedule, "struct callout *"/*ch*/, "void (*)(void *)"/*func*/, "void *"/*arg*/, "unsigned"/*flags*/, "int"/*ticks*/); SDT_PROBE_DEFINE6(sdt, kernel, callout, migrate, "struct callout *"/*ch*/, "void (*)(void *)"/*func*/, "void *"/*arg*/, "unsigned"/*flags*/, "struct cpu_info *"/*ocpu*/, "struct cpu_info *"/*ncpu*/); SDT_PROBE_DEFINE4(sdt, kernel, callout, entry, "struct callout *"/*ch*/, "void (*)(void *)"/*func*/, "void *"/*arg*/, "unsigned"/*flags*/); SDT_PROBE_DEFINE4(sdt, kernel, callout, return, "struct callout *"/*ch*/, "void (*)(void *)"/*func*/, "void *"/*arg*/, "unsigned"/*flags*/); SDT_PROBE_DEFINE5(sdt, kernel, callout, stop, "struct callout *"/*ch*/, "void (*)(void *)"/*func*/, "void *"/*arg*/, "unsigned"/*flags*/, "bool"/*expired*/); SDT_PROBE_DEFINE4(sdt, kernel, callout, halt, "struct callout *"/*ch*/, "void (*)(void *)"/*func*/, "void *"/*arg*/, "unsigned"/*flags*/); SDT_PROBE_DEFINE5(sdt, kernel, callout, halt__done, "struct callout *"/*ch*/, "void (*)(void *)"/*func*/, "void *"/*arg*/, "unsigned"/*flags*/, "bool"/*expired*/); syncobj_t callout_syncobj = { .sobj_name = "callout", .sobj_flag = SOBJ_SLEEPQ_SORTED, .sobj_boostpri = PRI_KERNEL, .sobj_unsleep = sleepq_unsleep, .sobj_changepri = sleepq_changepri, .sobj_lendpri = sleepq_lendpri, .sobj_owner = syncobj_noowner, }; static inline kmutex_t * callout_lock(callout_impl_t *c) { struct callout_cpu *cc; kmutex_t *lock; for (;;) { cc = c->c_cpu; lock = cc->cc_lock; mutex_spin_enter(lock); if (__predict_true(cc == c->c_cpu)) return lock; mutex_spin_exit(lock); } } /* * 
Check if the callout is currently running on an LWP that isn't curlwp. */ static inline bool callout_running_somewhere_else(callout_impl_t *c, struct callout_cpu *cc) { KASSERT(c->c_cpu == cc); return cc->cc_active == c && cc->cc_lwp != curlwp; } /* * callout_startup: * * Initialize the callout facility, called at system startup time. * Do just enough to allow callouts to be safely registered. */ void callout_startup(void) { struct callout_cpu *cc; int b; KASSERT(curcpu()->ci_data.cpu_callout == NULL); cc = &callout_cpu0; cc->cc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); CIRCQ_INIT(&cc->cc_todo); for (b = 0; b < BUCKETS; b++) CIRCQ_INIT(&cc->cc_wheel[b]); curcpu()->ci_data.cpu_callout = cc; } /* * callout_init_cpu: * * Per-CPU initialization. */ CTASSERT(sizeof(callout_impl_t) <= sizeof(callout_t)); void callout_init_cpu(struct cpu_info *ci) { struct callout_cpu *cc; int b; if ((cc = ci->ci_data.cpu_callout) == NULL) { cc = kmem_zalloc(sizeof(*cc), KM_SLEEP); cc->cc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); CIRCQ_INIT(&cc->cc_todo); for (b = 0; b < BUCKETS; b++) CIRCQ_INIT(&cc->cc_wheel[b]); } else { /* Boot CPU, one time only. */ callout_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE, callout_softclock, NULL); if (callout_sih == NULL) panic("callout_init_cpu (2)"); } sleepq_init(&cc->cc_sleepq); snprintf(cc->cc_name1, sizeof(cc->cc_name1), "late/%u", cpu_index(ci)); evcnt_attach_dynamic(&cc->cc_ev_late, EVCNT_TYPE_MISC, NULL, "callout", cc->cc_name1); snprintf(cc->cc_name2, sizeof(cc->cc_name2), "wait/%u", cpu_index(ci)); evcnt_attach_dynamic(&cc->cc_ev_block, EVCNT_TYPE_MISC, NULL, "callout", cc->cc_name2); cc->cc_cpu = ci; ci->ci_data.cpu_callout = cc; } /* * callout_init: * * Initialize a callout structure. This must be quick, so we fill * only the minimum number of fields. */ void callout_init(callout_t *cs, u_int flags) { callout_impl_t *c = (callout_impl_t *)cs; struct callout_cpu *cc; KASSERT((flags & ~CALLOUT_FLAGMASK) == 0); SDT_PROBE2(sdt, kernel, callout, init, cs, flags); cc = curcpu()->ci_data.cpu_callout; c->c_func = NULL; c->c_magic = CALLOUT_MAGIC; if (__predict_true((flags & CALLOUT_MPSAFE) != 0 && cc != NULL)) { c->c_flags = flags; c->c_cpu = cc; return; } c->c_flags = flags | CALLOUT_BOUND; c->c_cpu = &callout_cpu0; } /* * callout_destroy: * * Destroy a callout structure. The callout must be stopped. */ void callout_destroy(callout_t *cs) { callout_impl_t *c = (callout_impl_t *)cs; SDT_PROBE1(sdt, kernel, callout, destroy, cs); KASSERTMSG(c->c_magic == CALLOUT_MAGIC, "callout %p: c_magic (%#x) != CALLOUT_MAGIC (%#x)", c, c->c_magic, CALLOUT_MAGIC); /* * It's not necessary to lock in order to see the correct value * of c->c_flags. If the callout could potentially have been * running, the current thread should have stopped it. */ KASSERTMSG((c->c_flags & CALLOUT_PENDING) == 0, "pending callout %p: c_func (%p) c_flags (%#x) destroyed from %p", c, c->c_func, c->c_flags, __builtin_return_address(0)); KASSERTMSG(!callout_running_somewhere_else(c, c->c_cpu), "running callout %p: c_func (%p) c_flags (%#x) destroyed from %p", c, c->c_func, c->c_flags, __builtin_return_address(0)); c->c_magic = 0; } /* * callout_schedule_locked: * * Schedule a callout to run. The function and argument must * already be set in the callout structure. Must be called with * callout_lock. 
*/ static void callout_schedule_locked(callout_impl_t *c, kmutex_t *lock, int to_ticks) { struct callout_cpu *cc, *occ; int old_time; SDT_PROBE5(sdt, kernel, callout, schedule, c, c->c_func, c->c_arg, c->c_flags, to_ticks); KASSERT(to_ticks >= 0); KASSERT(c->c_func != NULL); /* Initialize the time here, it won't change. */ occ = c->c_cpu; c->c_flags &= ~(CALLOUT_FIRED | CALLOUT_INVOKING); /* * If this timeout is already scheduled and now is moved * earlier, reschedule it now. Otherwise leave it in place * and let it be rescheduled later. */ if ((c->c_flags & CALLOUT_PENDING) != 0) { /* Leave on existing CPU. */ old_time = c->c_time; c->c_time = to_ticks + occ->cc_ticks; if (c->c_time - old_time < 0) { CIRCQ_REMOVE(&c->c_list); CIRCQ_INSERT(&c->c_list, &occ->cc_todo); } mutex_spin_exit(lock); return; } cc = curcpu()->ci_data.cpu_callout; if ((c->c_flags & CALLOUT_BOUND) != 0 || cc == occ || !mutex_tryenter(cc->cc_lock)) { /* Leave on existing CPU. */ c->c_time = to_ticks + occ->cc_ticks; c->c_flags |= CALLOUT_PENDING; CIRCQ_INSERT(&c->c_list, &occ->cc_todo); } else { /* Move to this CPU. */ c->c_cpu = cc; c->c_time = to_ticks + cc->cc_ticks; c->c_flags |= CALLOUT_PENDING; CIRCQ_INSERT(&c->c_list, &cc->cc_todo); mutex_spin_exit(cc->cc_lock); SDT_PROBE6(sdt, kernel, callout, migrate, c, c->c_func, c->c_arg, c->c_flags, occ->cc_cpu, cc->cc_cpu); } mutex_spin_exit(lock); } /* * callout_reset: * * Reset a callout structure with a new function and argument, and * schedule it to run. */ void callout_reset(callout_t *cs, int to_ticks, void (*func)(void *), void *arg) { callout_impl_t *c = (callout_impl_t *)cs; kmutex_t *lock; KASSERT(c->c_magic == CALLOUT_MAGIC); KASSERT(func != NULL); lock = callout_lock(c); SDT_PROBE4(sdt, kernel, callout, setfunc, cs, func, arg, c->c_flags); c->c_func = func; c->c_arg = arg; callout_schedule_locked(c, lock, to_ticks); } /* * callout_schedule: * * Schedule a callout to run. The function and argument must * already be set in the callout structure. */ void callout_schedule(callout_t *cs, int to_ticks) { callout_impl_t *c = (callout_impl_t *)cs; kmutex_t *lock; KASSERT(c->c_magic == CALLOUT_MAGIC); lock = callout_lock(c); callout_schedule_locked(c, lock, to_ticks); } /* * callout_stop: * * Try to cancel a pending callout. It may be too late: the callout * could be running on another CPU. If called from interrupt context, * the callout could already be in progress at a lower priority. */ bool callout_stop(callout_t *cs) { callout_impl_t *c = (callout_impl_t *)cs; kmutex_t *lock; bool expired; KASSERT(c->c_magic == CALLOUT_MAGIC); lock = callout_lock(c); if ((c->c_flags & CALLOUT_PENDING) != 0) CIRCQ_REMOVE(&c->c_list); expired = ((c->c_flags & CALLOUT_FIRED) != 0); c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED); SDT_PROBE5(sdt, kernel, callout, stop, c, c->c_func, c->c_arg, c->c_flags, expired); mutex_spin_exit(lock); return expired; } /* * callout_halt: * * Cancel a pending callout. If in-flight, block until it completes. * May not be called from a hard interrupt handler. If the callout * can take locks, the caller of callout_halt() must not hold any of * those locks, otherwise the two could deadlock. If 'interlock' is * non-NULL and we must wait for the callout to complete, it will be * released and re-acquired before returning. 
*/ bool callout_halt(callout_t *cs, void *interlock) { callout_impl_t *c = (callout_impl_t *)cs; kmutex_t *lock; KASSERT(c->c_magic == CALLOUT_MAGIC); KASSERT(!cpu_intr_p()); KASSERT(interlock == NULL || mutex_owned(interlock)); /* Fast path. */ lock = callout_lock(c); SDT_PROBE4(sdt, kernel, callout, halt, c, c->c_func, c->c_arg, c->c_flags); if ((c->c_flags & CALLOUT_PENDING) != 0) CIRCQ_REMOVE(&c->c_list); c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED); if (__predict_false(callout_running_somewhere_else(c, c->c_cpu))) { callout_wait(c, interlock, lock); return true; } SDT_PROBE5(sdt, kernel, callout, halt__done, c, c->c_func, c->c_arg, c->c_flags, /*expired*/false); mutex_spin_exit(lock); return false; } /* * callout_wait: * * Slow path for callout_halt(). Deliberately marked __noinline to * prevent unneeded overhead in the caller. */ static void __noinline callout_wait(callout_impl_t *c, void *interlock, kmutex_t *lock) { struct callout_cpu *cc; struct lwp *l; kmutex_t *relock; int nlocks; l = curlwp; relock = NULL; for (;;) { /* * At this point we know the callout is not pending, but it * could be running on a CPU somewhere. That can be curcpu * in a few cases: * * - curlwp is a higher priority soft interrupt * - the callout blocked on a lock and is currently asleep * - the callout itself has called callout_halt() (nice!) */ cc = c->c_cpu; if (__predict_true(!callout_running_somewhere_else(c, cc))) break; /* It's running - need to wait for it to complete. */ if (interlock != NULL) { /* * Avoid potential scheduler lock order problems by * dropping the interlock without the callout lock * held; then retry. */ mutex_spin_exit(lock); mutex_exit(interlock); relock = interlock; interlock = NULL; } else { /* XXX Better to do priority inheritance. */ KASSERT(l->l_wchan == NULL); cc->cc_nwait++; cc->cc_ev_block.ev_count++; nlocks = sleepq_enter(&cc->cc_sleepq, l, cc->cc_lock); sleepq_enqueue(&cc->cc_sleepq, cc, "callout", &callout_syncobj, false); sleepq_block(0, false, &callout_syncobj, nlocks); } /* * Re-lock the callout and check the state of play again. * It's a common design pattern for callouts to re-schedule * themselves so put a stop to it again if needed. */ lock = callout_lock(c); if ((c->c_flags & CALLOUT_PENDING) != 0) CIRCQ_REMOVE(&c->c_list); c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED); } SDT_PROBE5(sdt, kernel, callout, halt__done, c, c->c_func, c->c_arg, c->c_flags, /*expired*/true); mutex_spin_exit(lock); if (__predict_false(relock != NULL)) mutex_enter(relock); } #ifdef notyet /* * callout_bind: * * Bind a callout so that it will only execute on one CPU. * The callout must be stopped, and must be MPSAFE. * * XXX Disabled for now until it is decided how to handle * offlined CPUs. We may want weak+strong binding. */ void callout_bind(callout_t *cs, struct cpu_info *ci) { callout_impl_t *c = (callout_impl_t *)cs; struct callout_cpu *cc; kmutex_t *lock; KASSERT((c->c_flags & CALLOUT_PENDING) == 0); KASSERT(c->c_cpu->cc_active != c); KASSERT(c->c_magic == CALLOUT_MAGIC); KASSERT((c->c_flags & CALLOUT_MPSAFE) != 0); lock = callout_lock(c); cc = ci->ci_data.cpu_callout; c->c_flags |= CALLOUT_BOUND; if (c->c_cpu != cc) { /* * Assigning c_cpu effectively unlocks the callout * structure, as we don't hold the new CPU's lock. * Issue memory barrier to prevent accesses being * reordered. 
*/ membar_exit(); c->c_cpu = cc; } mutex_spin_exit(lock); } #endif void callout_setfunc(callout_t *cs, void (*func)(void *), void *arg) { callout_impl_t *c = (callout_impl_t *)cs; kmutex_t *lock; KASSERT(c->c_magic == CALLOUT_MAGIC); KASSERT(func != NULL); lock = callout_lock(c); SDT_PROBE4(sdt, kernel, callout, setfunc, cs, func, arg, c->c_flags); c->c_func = func; c->c_arg = arg; mutex_spin_exit(lock); } bool callout_expired(callout_t *cs) { callout_impl_t *c = (callout_impl_t *)cs; kmutex_t *lock; bool rv; KASSERT(c->c_magic == CALLOUT_MAGIC); lock = callout_lock(c); rv = ((c->c_flags & CALLOUT_FIRED) != 0); mutex_spin_exit(lock); return rv; } bool callout_active(callout_t *cs) { callout_impl_t *c = (callout_impl_t *)cs; kmutex_t *lock; bool rv; KASSERT(c->c_magic == CALLOUT_MAGIC); lock = callout_lock(c); rv = ((c->c_flags & (CALLOUT_PENDING|CALLOUT_FIRED)) != 0); mutex_spin_exit(lock); return rv; } bool callout_pending(callout_t *cs) { callout_impl_t *c = (callout_impl_t *)cs; kmutex_t *lock; bool rv; KASSERT(c->c_magic == CALLOUT_MAGIC); lock = callout_lock(c); rv = ((c->c_flags & CALLOUT_PENDING) != 0); mutex_spin_exit(lock); return rv; } bool callout_invoking(callout_t *cs) { callout_impl_t *c = (callout_impl_t *)cs; kmutex_t *lock; bool rv; KASSERT(c->c_magic == CALLOUT_MAGIC); lock = callout_lock(c); rv = ((c->c_flags & CALLOUT_INVOKING) != 0); mutex_spin_exit(lock); return rv; } void callout_ack(callout_t *cs) { callout_impl_t *c = (callout_impl_t *)cs; kmutex_t *lock; KASSERT(c->c_magic == CALLOUT_MAGIC); lock = callout_lock(c); c->c_flags &= ~CALLOUT_INVOKING; mutex_spin_exit(lock); } /* * callout_hardclock: * * Called from hardclock() once every tick. We schedule a soft * interrupt if there is work to be done. */ void callout_hardclock(void) { struct callout_cpu *cc; int needsoftclock, ticks; cc = curcpu()->ci_data.cpu_callout; mutex_spin_enter(cc->cc_lock); ticks = ++cc->cc_ticks; MOVEBUCKET(cc, 0, ticks); if (MASKWHEEL(0, ticks) == 0) { MOVEBUCKET(cc, 1, ticks); if (MASKWHEEL(1, ticks) == 0) { MOVEBUCKET(cc, 2, ticks); if (MASKWHEEL(2, ticks) == 0) MOVEBUCKET(cc, 3, ticks); } } needsoftclock = !CIRCQ_EMPTY(&cc->cc_todo); mutex_spin_exit(cc->cc_lock); if (needsoftclock) softint_schedule(callout_sih); } /* * callout_softclock: * * Soft interrupt handler, scheduled above if there is work to * be done. Callouts are made in soft interrupt context. */ static void callout_softclock(void *v) { callout_impl_t *c; struct callout_cpu *cc; void (*func)(void *); void *arg; int mpsafe, count, ticks, delta; u_int flags __unused; lwp_t *l; l = curlwp; KASSERT(l->l_cpu == curcpu()); cc = l->l_cpu->ci_data.cpu_callout; mutex_spin_enter(cc->cc_lock); cc->cc_lwp = l; while (!CIRCQ_EMPTY(&cc->cc_todo)) { c = CIRCQ_FIRST(&cc->cc_todo); KASSERT(c->c_magic == CALLOUT_MAGIC); KASSERT(c->c_func != NULL); KASSERT(c->c_cpu == cc); KASSERT((c->c_flags & CALLOUT_PENDING) != 0); KASSERT((c->c_flags & CALLOUT_FIRED) == 0); CIRCQ_REMOVE(&c->c_list); /* If due run it, otherwise insert it into the right bucket. 
*/ ticks = cc->cc_ticks; delta = (int)((unsigned)c->c_time - (unsigned)ticks); if (delta > 0) { CIRCQ_INSERT(&c->c_list, BUCKET(cc, delta, c->c_time)); continue; } if (delta < 0) cc->cc_ev_late.ev_count++; c->c_flags = (c->c_flags & ~CALLOUT_PENDING) | (CALLOUT_FIRED | CALLOUT_INVOKING); mpsafe = (c->c_flags & CALLOUT_MPSAFE); func = c->c_func; arg = c->c_arg; cc->cc_active = c; flags = c->c_flags; mutex_spin_exit(cc->cc_lock); KASSERT(func != NULL); SDT_PROBE4(sdt, kernel, callout, entry, c, func, arg, flags); if (__predict_false(!mpsafe)) { KERNEL_LOCK(1, NULL); (*func)(arg); KERNEL_UNLOCK_ONE(NULL); } else (*func)(arg); SDT_PROBE4(sdt, kernel, callout, return, c, func, arg, flags); KASSERTMSG(l->l_blcnt == 0, "callout %p func %p leaked %d biglocks", c, func, l->l_blcnt); mutex_spin_enter(cc->cc_lock); /* * We can't touch 'c' here because it might be * freed already. If LWPs waiting for callout * to complete, awaken them. */ cc->cc_active = NULL; if ((count = cc->cc_nwait) != 0) { cc->cc_nwait = 0; /* sleepq_wake() drops the lock. */ sleepq_wake(&cc->cc_sleepq, cc, count, cc->cc_lock); mutex_spin_enter(cc->cc_lock); } } cc->cc_lwp = NULL; mutex_spin_exit(cc->cc_lock); } #endif /* !CRASH */ #ifdef DDB static void db_show_callout_bucket(struct callout_cpu *cc, struct callout_circq *kbucket, struct callout_circq *bucket) { callout_impl_t *c, ci; db_expr_t offset; const char *name; static char question[] = "?"; int b; if (CIRCQ_LAST(bucket, kbucket)) return; for (c = CIRCQ_FIRST(bucket); /*nothing*/; c = CIRCQ_NEXT(&c->c_list)) { db_read_bytes((db_addr_t)c, sizeof(ci), (char *)&ci); c = &ci; db_find_sym_and_offset((db_addr_t)(intptr_t)c->c_func, &name, &offset); name = name ? name : question; b = (bucket - cc->cc_wheel); if (b < 0) b = -WHEELSIZE; db_printf("%9d %2d/%-4d %16lx %s\n", c->c_time - cc->cc_ticks, b / WHEELSIZE, b, (u_long)c->c_arg, name); if (CIRCQ_LAST(&c->c_list, kbucket)) break; } } void db_show_callout(db_expr_t addr, bool haddr, db_expr_t count, const char *modif) { struct callout_cpu *cc; struct cpu_info *ci; int b; #ifndef CRASH db_printf("hardclock_ticks now: %d\n", getticks()); #endif db_printf(" ticks wheel arg func\n"); /* * Don't lock the callwheel; all the other CPUs are paused * anyhow, and we might be called in a circumstance where * some other CPU was paused while holding the lock. */ for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) { db_read_bytes((db_addr_t)ci + offsetof(struct cpu_info, ci_data.cpu_callout), sizeof(cc), (char *)&cc); db_read_bytes((db_addr_t)cc, sizeof(ccb), (char *)&ccb); db_show_callout_bucket(&ccb, &cc->cc_todo, &ccb.cc_todo); } for (b = 0; b < BUCKETS; b++) { for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) { db_read_bytes((db_addr_t)ci + offsetof(struct cpu_info, ci_data.cpu_callout), sizeof(cc), (char *)&cc); db_read_bytes((db_addr_t)cc, sizeof(ccb), (char *)&ccb); db_show_callout_bucket(&ccb, &cc->cc_wheel[b], &ccb.cc_wheel[b]); } } } #endif /* DDB */
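/*
 * Illustrative sketch, not part of kern_timeout.c: the usual driver life
 * cycle for the callout API implemented above.  The softc, the one-second
 * period (hz) and the function names are hypothetical; the
 * init/setfunc/schedule/halt/destroy sequence and the need to re-arm
 * explicitly are the real contract.
 */
struct example_softc {
	kmutex_t	sc_lock;
	callout_t	sc_tick;
};

static void
example_tick(void *arg)
{
	struct example_softc *sc = arg;

	mutex_enter(&sc->sc_lock);
	/* ... periodic work ... */
	mutex_exit(&sc->sc_lock);

	/* Callouts fire once; re-arm for the next period. */
	callout_schedule(&sc->sc_tick, hz);
}

static void
example_attach(struct example_softc *sc)
{
	mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_SOFTCLOCK);
	callout_init(&sc->sc_tick, CALLOUT_MPSAFE);
	callout_setfunc(&sc->sc_tick, example_tick, sc);
	callout_schedule(&sc->sc_tick, hz);
}

static void
example_detach(struct example_softc *sc)
{
	/*
	 * callout_halt() cancels a pending invocation and, with no
	 * interlock passed, sleeps until any in-flight call has returned;
	 * only then is it safe to destroy the callout.
	 */
	callout_halt(&sc->sc_tick, NULL);
	callout_destroy(&sc->sc_tick);
	mutex_destroy(&sc->sc_lock);
}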
/* $NetBSD: usb.c,v 1.203 2024/02/04 05:43:06 mrg Exp $ */ /* * Copyright (c) 1998, 2002, 2008, 2012 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology and Matthew R. Green (mrg@eterna23.net). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * USB specifications and other documentation can be found at * http://www.usb.org/developers/docs/ and * http://www.usb.org/developers/devclass_docs/ */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: usb.c,v 1.203 2024/02/04 05:43:06 mrg Exp $"); #ifdef _KERNEL_OPT #include "opt_usb.h" #include "opt_ddb.h" #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/device.h> #include <sys/kthread.h> #include <sys/proc.h> #include <sys/conf.h> #include <sys/fcntl.h> #include <sys/poll.h> #include <sys/select.h> #include <sys/vnode.h> #include <sys/signalvar.h> #include <sys/intr.h> #include <sys/module.h> #include <sys/mutex.h> #include <sys/bus.h> #include <sys/once.h> #include <sys/atomic.h> #include <sys/sysctl.h> #include <sys/compat_stub.h> #include <sys/sdt.h> #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> #include <dev/usb/usbdi_util.h> #include <dev/usb/usbdivar.h> #include <dev/usb/usb_verbose.h> #include <dev/usb/usb_quirks.h> #include <dev/usb/usbhist.h> #include <dev/usb/usb_sdt.h> #include "ioconf.h" #if defined(USB_DEBUG) #ifndef USBHIST_SIZE #define USBHIST_SIZE 50000 #endif static struct kern_history_ent usbhistbuf[USBHIST_SIZE]; USBHIST_DEFINE(usbhist) = KERNHIST_INITIALIZER(usbhist, usbhistbuf); #endif #define USB_DEV_MINOR 255 #ifdef USB_DEBUG /* * 0 - do usual exploration * 1 - do not use timeout exploration * >1 - do no exploration */ int usb_noexplore = 0; #ifndef USB_DEBUG_DEFAULT #define USB_DEBUG_DEFAULT 0 #endif int usbdebug = USB_DEBUG_DEFAULT; SYSCTL_SETUP(sysctl_hw_usb_setup, "sysctl hw.usb setup") { int err; const struct sysctlnode *rnode; const struct sysctlnode *cnode; err = sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "usb", SYSCTL_DESCR("usb global controls"), NULL, 0, NULL, 0, CTL_HW, CTL_CREATE, CTL_EOL); if (err) goto fail; /* control debugging printfs */ err = sysctl_createv(clog, 0, &rnode, &cnode, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "debug", SYSCTL_DESCR("Enable debugging output"), NULL, 0, &usbdebug, sizeof(usbdebug), CTL_CREATE, CTL_EOL); if (err) goto fail; return; fail: aprint_error("%s: sysctl_createv failed (err = %d)\n", __func__, err); } #else #define usb_noexplore 0 #endif #define DPRINTF(FMT,A,B,C,D) USBHIST_LOG(usbdebug,FMT,A,B,C,D) #define DPRINTFN(N,FMT,A,B,C,D) USBHIST_LOGN(usbdebug,N,FMT,A,B,C,D) struct usb_softc { #if 0 device_t sc_dev; /* base device */ #endif struct usbd_bus *sc_bus; /* USB controller */ struct usbd_port sc_port; /* dummy port for root hub */ struct lwp *sc_event_thread; struct lwp *sc_attach_thread; char sc_dying; bool sc_pmf_registered; }; struct usb_taskq { TAILQ_HEAD(, usb_task) tasks; kmutex_t lock; kcondvar_t cv; struct lwp *task_thread_lwp; const char *name; struct 
usb_task *current_task; }; static struct usb_taskq usb_taskq[USB_NUM_TASKQS]; /* XXX wrong place */ #ifdef KDTRACE_HOOKS #define __dtrace_used #else #define __dtrace_used __unused #endif SDT_PROVIDER_DEFINE(usb); SDT_PROBE_DEFINE3(usb, kernel, task, add, "struct usbd_device *"/*dev*/, "struct usb_task *"/*task*/, "int"/*q*/); SDT_PROBE_DEFINE2(usb, kernel, task, rem__start, "struct usbd_device *"/*dev*/, "struct usb_task *"/*task*/); SDT_PROBE_DEFINE3(usb, kernel, task, rem__done, "struct usbd_device *"/*dev*/, "struct usb_task *"/*task*/, "bool"/*removed*/); SDT_PROBE_DEFINE4(usb, kernel, task, rem__wait__start, "struct usbd_device *"/*dev*/, "struct usb_task *"/*task*/, "int"/*queue*/, "kmutex_t *"/*interlock*/); SDT_PROBE_DEFINE5(usb, kernel, task, rem__wait__done, "struct usbd_device *"/*dev*/, "struct usb_task *"/*task*/, "int"/*queue*/, "kmutex_t *"/*interlock*/, "bool"/*done*/); SDT_PROBE_DEFINE1(usb, kernel, task, start, "struct usb_task *"/*task*/); SDT_PROBE_DEFINE1(usb, kernel, task, done, "struct usb_task *"/*task*/); SDT_PROBE_DEFINE1(usb, kernel, bus, needs__explore, "struct usbd_bus *"/*bus*/); SDT_PROBE_DEFINE1(usb, kernel, bus, needs__reattach, "struct usbd_bus *"/*bus*/); SDT_PROBE_DEFINE1(usb, kernel, bus, discover__start, "struct usbd_bus *"/*bus*/); SDT_PROBE_DEFINE1(usb, kernel, bus, discover__done, "struct usbd_bus *"/*bus*/); SDT_PROBE_DEFINE1(usb, kernel, bus, explore__start, "struct usbd_bus *"/*bus*/); SDT_PROBE_DEFINE1(usb, kernel, bus, explore__done, "struct usbd_bus *"/*bus*/); SDT_PROBE_DEFINE1(usb, kernel, event, add, "struct usb_event *"/*uep*/); SDT_PROBE_DEFINE1(usb, kernel, event, drop, "struct usb_event *"/*uep*/); dev_type_open(usbopen); dev_type_close(usbclose); dev_type_read(usbread); dev_type_ioctl(usbioctl); dev_type_poll(usbpoll); dev_type_kqfilter(usbkqfilter); const struct cdevsw usb_cdevsw = { .d_open = usbopen, .d_close = usbclose, .d_read = usbread, .d_write = nowrite, .d_ioctl = usbioctl, .d_stop = nostop, .d_tty = notty, .d_poll = usbpoll, .d_mmap = nommap, .d_kqfilter = usbkqfilter, .d_discard = nodiscard, .d_flag = D_OTHER }; Static void usb_discover(struct usb_softc *); Static void usb_create_event_thread(device_t); Static void usb_event_thread(void *); Static void usb_task_thread(void *); /* * Count of USB busses */ int nusbbusses = 0; #define USB_MAX_EVENTS 100 struct usb_event_q { struct usb_event ue; SIMPLEQ_ENTRY(usb_event_q) next; }; Static SIMPLEQ_HEAD(, usb_event_q) usb_events = SIMPLEQ_HEAD_INITIALIZER(usb_events); Static int usb_nevents = 0; Static struct selinfo usb_selevent; Static kmutex_t usb_event_lock; Static kcondvar_t usb_event_cv; /* XXX this is gross and broken */ Static proc_t *usb_async_proc; /* process that wants USB SIGIO */ Static void *usb_async_sih; Static int usb_dev_open = 0; Static struct usb_event *usb_alloc_event(void); Static void usb_free_event(struct usb_event *); Static void usb_add_event(int, struct usb_event *); Static int usb_get_next_event(struct usb_event *); Static void usb_async_intr(void *); Static void usb_soft_intr(void *); Static const char *usbrev_str[] = USBREV_STR; static int usb_match(device_t, cfdata_t, void *); static void usb_attach(device_t, device_t, void *); static int usb_detach(device_t, int); static int usb_activate(device_t, enum devact); static void usb_childdet(device_t, device_t); static int usb_once_init(void); static void usb_doattach(device_t); CFATTACH_DECL3_NEW(usb, sizeof(struct usb_softc), usb_match, usb_attach, usb_detach, usb_activate, NULL, usb_childdet, 
DVF_DETACH_SHUTDOWN); static const char *taskq_names[] = USB_TASKQ_NAMES; int usb_match(device_t parent, cfdata_t match, void *aux) { USBHIST_FUNC(); USBHIST_CALLED(usbdebug); return UMATCH_GENERIC; } void usb_attach(device_t parent, device_t self, void *aux) { static ONCE_DECL(init_control); struct usb_softc *sc = device_private(self); int usbrev; sc->sc_bus = aux; usbrev = sc->sc_bus->ub_revision; cv_init(&sc->sc_bus->ub_needsexplore_cv, "usbevt"); cv_init(&sc->sc_bus->ub_rhxfercv, "usbrhxfer"); sc->sc_pmf_registered = false; aprint_naive("\n"); aprint_normal(": USB revision %s", usbrev_str[usbrev]); switch (usbrev) { case USBREV_1_0: case USBREV_1_1: case USBREV_2_0: case USBREV_3_0: case USBREV_3_1: break; default: aprint_error(", not supported\n"); sc->sc_dying = 1; return; } aprint_normal("\n"); /* XXX we should have our own level */ sc->sc_bus->ub_soft = softint_establish(SOFTINT_USB | SOFTINT_MPSAFE, usb_soft_intr, sc->sc_bus); if (sc->sc_bus->ub_soft == NULL) { aprint_error("%s: can't register softintr\n", device_xname(self)); sc->sc_dying = 1; return; } sc->sc_bus->ub_methods->ubm_getlock(sc->sc_bus, &sc->sc_bus->ub_lock); KASSERT(sc->sc_bus->ub_lock != NULL); RUN_ONCE(&init_control, usb_once_init); config_interrupts(self, usb_doattach); } #ifdef DDB #include <machine/db_machdep.h> #include <ddb/db_output.h> #include <ddb/db_command.h> static void db_usb_xfer(db_expr_t addr, bool have_addr, db_expr_t count, const char *modif) { struct usbd_xfer *xfer = (struct usbd_xfer *)(uintptr_t)addr; if (!have_addr) { db_printf("%s: need usbd_xfer address\n", __func__); return; } db_printf("usb xfer: %p pipe %p priv %p buffer %p\n", xfer, xfer->ux_pipe, xfer->ux_priv, xfer->ux_buffer); db_printf(" len %x actlen %x flags %x timeout %x status %x\n", xfer->ux_length, xfer->ux_actlen, xfer->ux_flags, xfer->ux_timeout, xfer->ux_status); db_printf(" callback %p done %x state %x tm_set %x tm_reset %x\n", xfer->ux_callback, xfer->ux_done, xfer->ux_state, xfer->ux_timeout_set, xfer->ux_timeout_reset); } static void db_usb_xferlist(db_expr_t addr, bool have_addr, db_expr_t count, const char *modif) { struct usbd_pipe *pipe = (struct usbd_pipe *)(uintptr_t)addr; struct usbd_xfer *xfer; if (!have_addr) { db_printf("%s: need usbd_pipe address\n", __func__); return; } db_printf("usb pipe: %p\n", pipe); unsigned xfercount = 0; SIMPLEQ_FOREACH(xfer, &pipe->up_queue, ux_next) { db_printf(" xfer = %p%s", xfer, xfercount == 0 || xfercount % 2 == 0 ? "" : "\n"); xfercount++; } } static const struct db_command db_usb_command_table[] = { { DDB_ADD_CMD("usbxfer", db_usb_xfer, 0, "display a USB xfer structure", NULL, NULL) }, { DDB_ADD_CMD("usbxferlist", db_usb_xferlist, 0, "display a USB xfer structure given pipe", NULL, NULL) }, { DDB_END_CMD }, }; static void usb_init_ddb(void) { (void)db_register_tbl(DDB_SHOW_CMD, db_usb_command_table); } #else #define usb_init_ddb() /* nothing */ #endif static int usb_once_init(void) { struct usb_taskq *taskq; int i; USBHIST_LINK_STATIC(usbhist); selinit(&usb_selevent); mutex_init(&usb_event_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&usb_event_cv, "usbrea"); for (i = 0; i < USB_NUM_TASKQS; i++) { taskq = &usb_taskq[i]; TAILQ_INIT(&taskq->tasks); /* * Since USB task methods usb_{add,rem}_task are callable * from any context, we have to make this lock a spinlock. 
*/ mutex_init(&taskq->lock, MUTEX_DEFAULT, IPL_USB); cv_init(&taskq->cv, "usbtsk"); taskq->name = taskq_names[i]; taskq->current_task = NULL; if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, usb_task_thread, taskq, &taskq->task_thread_lwp, "%s", taskq->name)) { printf("unable to create task thread: %s\n", taskq->name); panic("usb_create_event_thread task"); } /* * XXX we should make sure these threads are alive before * end up using them in usb_doattach(). */ } KASSERT(usb_async_sih == NULL); usb_async_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE, usb_async_intr, NULL); usb_init_ddb(); return 0; } static void usb_doattach(device_t self) { struct usb_softc *sc = device_private(self); struct usbd_device *dev; usbd_status err; int speed; struct usb_event *ue; USBHIST_FUNC(); USBHIST_CALLED(usbdebug); KASSERT(KERNEL_LOCKED_P()); /* Protected by KERNEL_LOCK */ nusbbusses++; sc->sc_bus->ub_usbctl = self; sc->sc_port.up_power = USB_MAX_POWER; switch (sc->sc_bus->ub_revision) { case USBREV_1_0: case USBREV_1_1: speed = USB_SPEED_FULL; break; case USBREV_2_0: speed = USB_SPEED_HIGH; break; case USBREV_3_0: speed = USB_SPEED_SUPER; break; case USBREV_3_1: speed = USB_SPEED_SUPER_PLUS; break; default: panic("usb_doattach"); } ue = usb_alloc_event(); ue->u.ue_ctrlr.ue_bus = device_unit(self); usb_add_event(USB_EVENT_CTRLR_ATTACH, ue); sc->sc_attach_thread = curlwp; err = usbd_new_device(self, sc->sc_bus, 0, speed, 0, &sc->sc_port); sc->sc_attach_thread = NULL; if (!err) { dev = sc->sc_port.up_dev; if (dev->ud_hub == NULL) { sc->sc_dying = 1; aprint_error("%s: root device is not a hub\n", device_xname(self)); return; } sc->sc_bus->ub_roothub = dev; usb_create_event_thread(self); } else { aprint_error("%s: root hub problem, error=%s\n", device_xname(self), usbd_errstr(err)); sc->sc_dying = 1; } /* * Drop this reference after the first set of attachments in the * event thread. */ config_pending_incr(self); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); else sc->sc_pmf_registered = true; return; } void usb_create_event_thread(device_t self) { struct usb_softc *sc = device_private(self); if (kthread_create(PRI_NONE, 0, NULL, usb_event_thread, sc, &sc->sc_event_thread, "%s", device_xname(self))) { printf("%s: unable to create event thread for\n", device_xname(self)); panic("usb_create_event_thread"); } } bool usb_in_event_thread(device_t dev) { struct usb_softc *sc; if (cold) return true; for (; dev; dev = device_parent(dev)) { if (device_is_a(dev, "usb")) break; } if (dev == NULL) return false; sc = device_private(dev); return curlwp == sc->sc_event_thread || curlwp == sc->sc_attach_thread; } /* * Add a task to be performed by the task thread. This function can be * called from any context and the task will be executed in a process * context ASAP. */ void usb_add_task(struct usbd_device *dev, struct usb_task *task, int queue) { struct usb_taskq *taskq; USBHIST_FUNC(); USBHIST_CALLED(usbdebug); SDT_PROBE3(usb, kernel, task, add, dev, task, queue); KASSERT(0 <= queue); KASSERT(queue < USB_NUM_TASKQS); taskq = &usb_taskq[queue]; mutex_enter(&taskq->lock); if (atomic_cas_uint(&task->queue, USB_NUM_TASKQS, queue) == USB_NUM_TASKQS) { DPRINTFN(2, "task=%#jx", (uintptr_t)task, 0, 0, 0); TAILQ_INSERT_TAIL(&taskq->tasks, task, next); cv_signal(&taskq->cv); } else { DPRINTFN(2, "task=%#jx on q", (uintptr_t)task, 0, 0, 0); } mutex_exit(&taskq->lock); } /* * usb_rem_task(dev, task) * * If task is queued to run, remove it from the queue. 
Return * true if it successfully removed the task from the queue, false * if not. * * Caller is _not_ guaranteed that the task is not running when * this is done. * * Never sleeps. */ bool usb_rem_task(struct usbd_device *dev, struct usb_task *task) { unsigned queue; USBHIST_FUNC(); USBHIST_CALLED(usbdebug); SDT_PROBE2(usb, kernel, task, rem__start, dev, task); while ((queue = task->queue) != USB_NUM_TASKQS) { struct usb_taskq *taskq = &usb_taskq[queue]; mutex_enter(&taskq->lock); if (__predict_true(task->queue == queue)) { TAILQ_REMOVE(&taskq->tasks, task, next); task->queue = USB_NUM_TASKQS; mutex_exit(&taskq->lock); SDT_PROBE3(usb, kernel, task, rem__done, dev, task, true); return true; /* removed from the queue */ } mutex_exit(&taskq->lock); } SDT_PROBE3(usb, kernel, task, rem__done, dev, task, false); return false; /* was not removed from the queue */ } /* * usb_rem_task_wait(dev, task, queue, interlock) * * If task is scheduled to run, remove it from the queue. If it * may have already begun to run, drop interlock if not null, wait * for it to complete, and reacquire interlock if not null. * Return true if it successfully removed the task from the queue, * false if not. * * Caller MUST guarantee that task will not be scheduled on a * _different_ queue, at least until after this returns. * * If caller guarantees that task will not be scheduled on the * same queue before this returns, then caller is guaranteed that * the task is not running at all when this returns. * * May sleep. */ bool usb_rem_task_wait(struct usbd_device *dev, struct usb_task *task, int queue, kmutex_t *interlock) { struct usb_taskq *taskq; int queue1; bool removed; USBHIST_FUNC(); USBHIST_CALLED(usbdebug); SDT_PROBE4(usb, kernel, task, rem__wait__start, dev, task, queue, interlock); ASSERT_SLEEPABLE(); KASSERT(0 <= queue); KASSERT(queue < USB_NUM_TASKQS); taskq = &usb_taskq[queue]; mutex_enter(&taskq->lock); queue1 = task->queue; if (queue1 == USB_NUM_TASKQS) { /* * It is not on the queue. It may be about to run, or * it may have already finished running -- there is no * stopping it now. Wait for it if it is running. */ if (interlock) mutex_exit(interlock); while (taskq->current_task == task) cv_wait(&taskq->cv, &taskq->lock); removed = false; } else { /* * It is still on the queue. We can stop it before the * task thread will run it. */ KASSERTMSG(queue1 == queue, "task %p on q%d expected on q%d", task, queue1, queue); TAILQ_REMOVE(&taskq->tasks, task, next); task->queue = USB_NUM_TASKQS; removed = true; } mutex_exit(&taskq->lock); /* * If there's an interlock, and we dropped it to wait, * reacquire it. */ if (interlock && !removed) mutex_enter(interlock); SDT_PROBE5(usb, kernel, task, rem__wait__done, dev, task, queue, interlock, removed); return removed; } /* * usb_task_pending(dev, task) * * True if task is queued, false if not. Note that if task is * already running, it is not considered queued. * * For _negative_ diagnostic assertions only: * * KASSERT(!usb_task_pending(dev, task)); */ bool usb_task_pending(struct usbd_device *dev, struct usb_task *task) { return task->queue != USB_NUM_TASKQS; } void usb_event_thread(void *arg) { struct usb_softc *sc = arg; struct usbd_bus *bus = sc->sc_bus; USBHIST_FUNC(); USBHIST_CALLED(usbdebug); KASSERT(KERNEL_LOCKED_P()); /* * In case this controller is a companion controller to an * EHCI controller we need to wait until the EHCI controller * has grabbed the port. 
* XXX It would be nicer to do this with a tsleep(), but I don't * know how to synchronize the creation of the threads so it * will work. */ if (bus->ub_revision < USBREV_2_0) { usb_delay_ms(bus, 500); } /* Make sure first discover does something. */ mutex_enter(bus->ub_lock); sc->sc_bus->ub_needsexplore = 1; usb_discover(sc); mutex_exit(bus->ub_lock); /* Drop the config_pending reference from attach. */ config_pending_decr(bus->ub_usbctl); mutex_enter(bus->ub_lock); while (!sc->sc_dying) { #if 0 /* not yet */ while (sc->sc_bus->ub_usepolling) kpause("usbpoll", true, hz, bus->ub_lock); #endif if (usb_noexplore < 2) usb_discover(sc); cv_timedwait(&bus->ub_needsexplore_cv, bus->ub_lock, usb_noexplore ? 0 : hz * 60); DPRINTFN(2, "sc %#jx woke up", (uintptr_t)sc, 0, 0, 0); } sc->sc_event_thread = NULL; /* In case parent is waiting for us to exit. */ cv_signal(&bus->ub_needsexplore_cv); mutex_exit(bus->ub_lock); DPRINTF("sc %#jx exit", (uintptr_t)sc, 0, 0, 0); kthread_exit(0); } void usb_task_thread(void *arg) { struct usb_task *task; struct usb_taskq *taskq; bool mpsafe; taskq = arg; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "start taskq %#jx", (uintptr_t)taskq, 0, 0, 0); mutex_enter(&taskq->lock); for (;;) { task = TAILQ_FIRST(&taskq->tasks); if (task == NULL) { cv_wait(&taskq->cv, &taskq->lock); task = TAILQ_FIRST(&taskq->tasks); } DPRINTFN(2, "woke up task=%#jx", (uintptr_t)task, 0, 0, 0); if (task != NULL) { mpsafe = ISSET(task->flags, USB_TASKQ_MPSAFE); TAILQ_REMOVE(&taskq->tasks, task, next); task->queue = USB_NUM_TASKQS; taskq->current_task = task; mutex_exit(&taskq->lock); if (!mpsafe) KERNEL_LOCK(1, curlwp); SDT_PROBE1(usb, kernel, task, start, task); task->fun(task->arg); /* Can't dereference task after this point. */ SDT_PROBE1(usb, kernel, task, done, task); if (!mpsafe) KERNEL_UNLOCK_ONE(curlwp); mutex_enter(&taskq->lock); KASSERTMSG(taskq->current_task == task, "somebody scribbled on usb taskq %p", taskq); taskq->current_task = NULL; cv_broadcast(&taskq->cv); } } mutex_exit(&taskq->lock); } int usbctlprint(void *aux, const char *pnp) { /* only "usb"es can attach to host controllers */ if (pnp) aprint_normal("usb at %s", pnp); return UNCONF; } int usbopen(dev_t dev, int flag, int mode, struct lwp *l) { int unit = minor(dev); struct usb_softc *sc; if (nusbbusses == 0) return ENXIO; if (unit == USB_DEV_MINOR) { if (usb_dev_open) return EBUSY; usb_dev_open = 1; mutex_enter(&proc_lock); atomic_store_relaxed(&usb_async_proc, NULL); mutex_exit(&proc_lock); return 0; } sc = device_lookup_private(&usb_cd, unit); if (!sc) return ENXIO; if (sc->sc_dying) return EIO; return 0; } int usbread(dev_t dev, struct uio *uio, int flag) { struct usb_event *ue; struct usb_event30 *ueo = NULL; /* XXXGCC */ int useold = 0; int error, n; if (minor(dev) != USB_DEV_MINOR) return ENXIO; switch (uio->uio_resid) { case sizeof(struct usb_event30): ueo = kmem_zalloc(sizeof(struct usb_event30), KM_SLEEP); useold = 1; /* FALLTHROUGH */ case sizeof(struct usb_event): ue = usb_alloc_event(); break; default: return EINVAL; } error = 0; mutex_enter(&usb_event_lock); for (;;) { n = usb_get_next_event(ue); if (n != 0) break; if (flag & IO_NDELAY) { error = EWOULDBLOCK; break; } error = cv_wait_sig(&usb_event_cv, &usb_event_lock); if (error) break; } mutex_exit(&usb_event_lock); if (!error) { if (useold) { /* copy fields to old struct */ MODULE_HOOK_CALL(usb_subr_copy_30_hook, (ue, ueo, uio), enosys(), error); if (error == ENOSYS) error = EINVAL; if (!error) error = uiomove((void *)ueo, sizeof(*ueo), uio); } else error 
= uiomove((void *)ue, sizeof(*ue), uio); } usb_free_event(ue); if (ueo) kmem_free(ueo, sizeof(struct usb_event30)); return error; } int usbclose(dev_t dev, int flag, int mode, struct lwp *l) { int unit = minor(dev); if (unit == USB_DEV_MINOR) { mutex_enter(&proc_lock); atomic_store_relaxed(&usb_async_proc, NULL); mutex_exit(&proc_lock); usb_dev_open = 0; } return 0; } int usbioctl(dev_t devt, u_long cmd, void *data, int flag, struct lwp *l) { struct usb_softc *sc; int unit = minor(devt); USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "cmd %#jx", cmd, 0, 0, 0); if (unit == USB_DEV_MINOR) { switch (cmd) { case FIONBIO: /* All handled in the upper FS layer. */ return 0; case FIOASYNC: mutex_enter(&proc_lock); atomic_store_relaxed(&usb_async_proc, *(int *)data ? l->l_proc : NULL); mutex_exit(&proc_lock); return 0; default: return EINVAL; } } sc = device_lookup_private(&usb_cd, unit); if (sc->sc_dying) return EIO; int error = 0; switch (cmd) { #ifdef USB_DEBUG case USB_SETDEBUG: if (!(flag & FWRITE)) return EBADF; usbdebug = ((*(int *)data) & 0x000000ff); break; #endif /* USB_DEBUG */ case USB_REQUEST: { struct usb_ctl_request *ur = (void *)data; int len = UGETW(ur->ucr_request.wLength); struct iovec iov; struct uio uio; void *ptr = 0; int addr = ur->ucr_addr; usbd_status err; if (!(flag & FWRITE)) { error = EBADF; goto fail; } DPRINTF("USB_REQUEST addr=%jd len=%jd", addr, len, 0, 0); if (len < 0 || len > 32768) { error = EINVAL; goto fail; } if (addr < 0 || addr >= USB_MAX_DEVICES) { error = EINVAL; goto fail; } size_t dindex = usb_addr2dindex(addr); if (sc->sc_bus->ub_devices[dindex] == NULL) { error = EINVAL; goto fail; } if (len != 0) { iov.iov_base = (void *)ur->ucr_data; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_resid = len; uio.uio_offset = 0; uio.uio_rw = ur->ucr_request.bmRequestType & UT_READ ? UIO_READ : UIO_WRITE; uio.uio_vmspace = l->l_proc->p_vmspace; ptr = kmem_alloc(len, KM_SLEEP); if (uio.uio_rw == UIO_WRITE) { error = uiomove(ptr, len, &uio); if (error) goto ret; } } err = usbd_do_request_flags(sc->sc_bus->ub_devices[dindex], &ur->ucr_request, ptr, ur->ucr_flags, &ur->ucr_actlen, USBD_DEFAULT_TIMEOUT); if (err) { error = EIO; goto ret; } if (len > ur->ucr_actlen) len = ur->ucr_actlen; if (len != 0) { if (uio.uio_rw == UIO_READ) { error = uiomove(ptr, len, &uio); if (error) goto ret; } } ret: if (ptr) { len = UGETW(ur->ucr_request.wLength); kmem_free(ptr, len); } break; } case USB_DEVICEINFO: { struct usbd_device *dev; struct usb_device_info *di = (void *)data; int addr = di->udi_addr; if (addr < 0 || addr >= USB_MAX_DEVICES) { error = EINVAL; goto fail; } size_t dindex = usb_addr2dindex(addr); if ((dev = sc->sc_bus->ub_devices[dindex]) == NULL) { error = ENXIO; goto fail; } usbd_fill_deviceinfo(dev, di, 1); break; } case USB_DEVICEINFO_30: { struct usbd_device *dev; struct usb_device_info30 *di = (void *)data; int addr = di->udi_addr; if (addr < 1 || addr >= USB_MAX_DEVICES) { error = EINVAL; goto fail; } size_t dindex = usb_addr2dindex(addr); if ((dev = sc->sc_bus->ub_devices[dindex]) == NULL) { error = ENXIO; goto fail; } MODULE_HOOK_CALL(usb_subr_fill_30_hook, (dev, di, 1, usbd_devinfo_vp, usbd_printBCD), enosys(), error); if (error == ENOSYS) error = EINVAL; if (error) goto fail; break; } case USB_DEVICESTATS: *(struct usb_device_stats *)data = sc->sc_bus->ub_stats; break; default: error = EINVAL; } fail: DPRINTF("... 
done (error = %jd)", error, 0, 0, 0); return error; } int usbpoll(dev_t dev, int events, struct lwp *l) { int revents, mask; if (minor(dev) == USB_DEV_MINOR) { revents = 0; mask = POLLIN | POLLRDNORM; mutex_enter(&usb_event_lock); if (events & mask && usb_nevents > 0) revents |= events & mask; if (revents == 0 && events & mask) selrecord(l, &usb_selevent); mutex_exit(&usb_event_lock); return revents; } else { return 0; } } static void filt_usbrdetach(struct knote *kn) { mutex_enter(&usb_event_lock); selremove_knote(&usb_selevent, kn); mutex_exit(&usb_event_lock); } static int filt_usbread(struct knote *kn, long hint) { if (usb_nevents == 0) return 0; kn->kn_data = sizeof(struct usb_event); return 1; } static const struct filterops usbread_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_usbrdetach, .f_event = filt_usbread, }; int usbkqfilter(dev_t dev, struct knote *kn) { switch (kn->kn_filter) { case EVFILT_READ: if (minor(dev) != USB_DEV_MINOR) return 1; kn->kn_fop = &usbread_filtops; break; default: return EINVAL; } kn->kn_hook = NULL; mutex_enter(&usb_event_lock); selrecord_knote(&usb_selevent, kn); mutex_exit(&usb_event_lock); return 0; } /* Explore device tree from the root. */ Static void usb_discover(struct usb_softc *sc) { struct usbd_bus *bus = sc->sc_bus; USBHIST_FUNC(); USBHIST_CALLED(usbdebug); KASSERT(KERNEL_LOCKED_P()); KASSERT(mutex_owned(bus->ub_lock)); if (usb_noexplore > 1) return; /* * We need mutual exclusion while traversing the device tree, * but this is guaranteed since this function is only called * from the event thread for the controller. * * Also, we now have bus->ub_lock held, and in combination * with ub_exploring, avoids interferring with polling. */ SDT_PROBE1(usb, kernel, bus, discover__start, bus); while (bus->ub_needsexplore && !sc->sc_dying) { bus->ub_needsexplore = 0; mutex_exit(sc->sc_bus->ub_lock); SDT_PROBE1(usb, kernel, bus, explore__start, bus); bus->ub_roothub->ud_hub->uh_explore(bus->ub_roothub); SDT_PROBE1(usb, kernel, bus, explore__done, bus); mutex_enter(bus->ub_lock); } SDT_PROBE1(usb, kernel, bus, discover__done, bus); } void usb_needs_explore(struct usbd_device *dev) { USBHIST_FUNC(); USBHIST_CALLED(usbdebug); SDT_PROBE1(usb, kernel, bus, needs__explore, dev->ud_bus); mutex_enter(dev->ud_bus->ub_lock); dev->ud_bus->ub_needsexplore = 1; cv_signal(&dev->ud_bus->ub_needsexplore_cv); mutex_exit(dev->ud_bus->ub_lock); } void usb_needs_reattach(struct usbd_device *dev) { USBHIST_FUNC(); USBHIST_CALLED(usbdebug); SDT_PROBE1(usb, kernel, bus, needs__reattach, dev->ud_bus); mutex_enter(dev->ud_bus->ub_lock); dev->ud_powersrc->up_reattach = 1; dev->ud_bus->ub_needsexplore = 1; cv_signal(&dev->ud_bus->ub_needsexplore_cv); mutex_exit(dev->ud_bus->ub_lock); } /* Called at with usb_event_lock held. */ int usb_get_next_event(struct usb_event *ue) { struct usb_event_q *ueq; KASSERT(mutex_owned(&usb_event_lock)); if (usb_nevents <= 0) return 0; ueq = SIMPLEQ_FIRST(&usb_events); #ifdef DIAGNOSTIC if (ueq == NULL) { printf("usb: usb_nevents got out of sync! 
%d\n", usb_nevents); usb_nevents = 0; return 0; } #endif if (ue) *ue = ueq->ue; SIMPLEQ_REMOVE_HEAD(&usb_events, next); usb_free_event((struct usb_event *)(void *)ueq); usb_nevents--; return 1; } void usbd_add_dev_event(int type, struct usbd_device *udev) { struct usb_event *ue = usb_alloc_event(); usbd_fill_deviceinfo(udev, &ue->u.ue_device, false); usb_add_event(type, ue); } void usbd_add_drv_event(int type, struct usbd_device *udev, device_t dev) { struct usb_event *ue = usb_alloc_event(); ue->u.ue_driver.ue_cookie = udev->ud_cookie; strncpy(ue->u.ue_driver.ue_devname, device_xname(dev), sizeof(ue->u.ue_driver.ue_devname)); usb_add_event(type, ue); } Static struct usb_event * usb_alloc_event(void) { /* Yes, this is right; we allocate enough so that we can use it later */ return kmem_zalloc(sizeof(struct usb_event_q), KM_SLEEP); } Static void usb_free_event(struct usb_event *uep) { kmem_free(uep, sizeof(struct usb_event_q)); } Static void usb_add_event(int type, struct usb_event *uep) { struct usb_event_q *ueq; struct timeval thetime; USBHIST_FUNC(); USBHIST_CALLED(usbdebug); microtime(&thetime); /* Don't want to wait here with usb_event_lock held */ ueq = (struct usb_event_q *)(void *)uep; ueq->ue = *uep; ueq->ue.ue_type = type; TIMEVAL_TO_TIMESPEC(&thetime, &ueq->ue.ue_time); SDT_PROBE1(usb, kernel, event, add, uep); mutex_enter(&usb_event_lock); if (++usb_nevents >= USB_MAX_EVENTS) { /* Too many queued events, drop an old one. */ DPRINTF("event dropped", 0, 0, 0, 0); #ifdef KDTRACE_HOOKS struct usb_event oue; if (usb_get_next_event(&oue)) SDT_PROBE1(usb, kernel, event, drop, &oue); #else usb_get_next_event(NULL); #endif } SIMPLEQ_INSERT_TAIL(&usb_events, ueq, next); cv_signal(&usb_event_cv); selnotify(&usb_selevent, 0, 0); if (atomic_load_relaxed(&usb_async_proc) != NULL) { kpreempt_disable(); softint_schedule(usb_async_sih); kpreempt_enable(); } mutex_exit(&usb_event_lock); } Static void usb_async_intr(void *cookie) { proc_t *proc; mutex_enter(&proc_lock); if ((proc = atomic_load_relaxed(&usb_async_proc)) != NULL) psignal(proc, SIGIO); mutex_exit(&proc_lock); } Static void usb_soft_intr(void *arg) { struct usbd_bus *bus = arg; mutex_enter(bus->ub_lock); bus->ub_methods->ubm_softint(bus); mutex_exit(bus->ub_lock); } void usb_schedsoftintr(struct usbd_bus *bus) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "polling=%jd", bus->ub_usepolling, 0, 0, 0); /* In case the bus never finished setting up. */ if (__predict_false(bus->ub_soft == NULL)) return; if (bus->ub_usepolling) { bus->ub_methods->ubm_softint(bus); } else { kpreempt_disable(); softint_schedule(bus->ub_soft); kpreempt_enable(); } } int usb_activate(device_t self, enum devact act) { struct usb_softc *sc = device_private(self); switch (act) { case DVACT_DEACTIVATE: sc->sc_dying = 1; return 0; default: return EOPNOTSUPP; } } void usb_childdet(device_t self, device_t child) { int i; struct usb_softc *sc = device_private(self); struct usbd_device *dev; if ((dev = sc->sc_port.up_dev) == NULL || dev->ud_subdevlen == 0) return; for (i = 0; i < dev->ud_subdevlen; i++) if (dev->ud_subdevs[i] == child) dev->ud_subdevs[i] = NULL; } int usb_detach(device_t self, int flags) { struct usb_softc *sc = device_private(self); struct usb_event *ue; int rc; USBHIST_FUNC(); USBHIST_CALLED(usbdebug); /* Make all devices disconnect. */ if (sc->sc_port.up_dev != NULL && (rc = usb_disconnect_port(&sc->sc_port, self, flags)) != 0) return rc; if (sc->sc_pmf_registered) pmf_device_deregister(self); /* Kill off event thread. 
*/ sc->sc_dying = 1; while (sc->sc_event_thread != NULL) { mutex_enter(sc->sc_bus->ub_lock); cv_signal(&sc->sc_bus->ub_needsexplore_cv); cv_timedwait(&sc->sc_bus->ub_needsexplore_cv, sc->sc_bus->ub_lock, hz * 60); mutex_exit(sc->sc_bus->ub_lock); } DPRINTF("event thread dead", 0, 0, 0, 0); if (sc->sc_bus->ub_soft != NULL) { softint_disestablish(sc->sc_bus->ub_soft); sc->sc_bus->ub_soft = NULL; } ue = usb_alloc_event(); ue->u.ue_ctrlr.ue_bus = device_unit(self); usb_add_event(USB_EVENT_CTRLR_DETACH, ue); cv_destroy(&sc->sc_bus->ub_needsexplore_cv); cv_destroy(&sc->sc_bus->ub_rhxfercv); return 0; }
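/*
 * The block comments above usb_add_task(), usb_rem_task() and
 * usb_rem_task_wait() describe the calling contract, but usb.c itself
 * contains no example caller.  What follows is a minimal sketch of the
 * intended pattern from a hypothetical driver; usb_init_task() and
 * USB_TASKQ_DRIVER are assumed from the usbdi(9) interface and are not
 * defined in this file, and xxx_softc/xxx_task_work are illustrative
 * names only.
 */
struct xxx_softc {
	struct usbd_device	*sc_udev;	/* device handed to us at attach */
	struct usb_task		 sc_task;	/* embedded task record */
};

static void
xxx_task_work(void *arg)
{
	struct xxx_softc *sc = arg;

	/* Runs later in thread context on the driver task queue. */
	(void)sc;
}

static void
xxx_init(struct xxx_softc *sc)
{
	/* Assumed usbdi(9) initializer: function, argument, flags. */
	usb_init_task(&sc->sc_task, xxx_task_work, sc, 0);
}

static void
xxx_kick(struct xxx_softc *sc)
{
	/* Callable from any context; the task is queued at most once. */
	usb_add_task(sc->sc_udev, &sc->sc_task, USB_TASKQ_DRIVER);
}

static void
xxx_teardown(struct xxx_softc *sc)
{
	/*
	 * Dequeue the task if it is still pending; if it may already be
	 * running, sleep until it has completed.  No interlock is held
	 * here, so NULL is passed.
	 */
	(void)usb_rem_task_wait(sc->sc_udev, &sc->sc_task,
	    USB_TASKQ_DRIVER, NULL);
}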
/* $NetBSD: sysv_shm_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $ */ /*- * Copyright (c) 1999 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sysv_shm_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/signal.h> #include <sys/proc.h> #include <sys/shm.h> #ifndef SYSVSHM #define SYSVSHM #endif #include <sys/syscallargs.h> #include <compat/sys/shm.h> int compat_50_sys___shmctl13(struct lwp *l, const struct compat_50_sys___shmctl13_args *uap, register_t *retval) { /* { syscallarg(int) shmid; syscallarg(int) cmd; syscallarg(struct shmid_ds13 *) buf; } */ struct shmid_ds shmbuf; struct shmid_ds13 oshmbuf; int cmd, error; cmd = SCARG(uap, cmd); if (cmd == IPC_SET) { error = copyin(SCARG(uap, buf), &oshmbuf, sizeof(oshmbuf)); if (error) return (error); __shmid_ds13_to_native(&oshmbuf, &shmbuf); } error = shmctl1(l, SCARG(uap, shmid), cmd, (cmd == IPC_SET || cmd == IPC_STAT) ? &shmbuf : NULL); if (error == 0 && cmd == IPC_STAT) { __native_to_shmid_ds13(&shmbuf, &oshmbuf); error = copyout(&oshmbuf, SCARG(uap, buf), sizeof(oshmbuf)); } return (error); }
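/*
 * compat_50_sys___shmctl13() above leans on __shmid_ds13_to_native()
 * and __native_to_shmid_ds13() from <compat/sys/shm.h> to translate
 * between the pre-NetBSD-5.0 and current shmid_ds layouts.  The sketch
 * below only illustrates the general shape of such a converter: the
 * stand-in structure, its member list, and the assumption that the
 * layouts differ solely in the width of the time fields are
 * illustrative, not the real definitions.
 */
struct example_oshmid_ds {		/* illustrative stand-in for shmid_ds13 */
	struct ipc_perm	shm_perm;
	size_t		shm_segsz;
	pid_t		shm_lpid;
	pid_t		shm_cpid;
	shmatt_t	shm_nattch;
	int32_t		shm_atime;	/* old ABI carried 32-bit timestamps */
	int32_t		shm_dtime;
	int32_t		shm_ctime;
};

static void
example_native_to_oshmid_ds(const struct shmid_ds *src,
    struct example_oshmid_ds *dst)
{
	memset(dst, 0, sizeof(*dst));
	dst->shm_perm = src->shm_perm;	/* permissions assumed unchanged */
	dst->shm_segsz = src->shm_segsz;
	dst->shm_lpid = src->shm_lpid;
	dst->shm_cpid = src->shm_cpid;
	dst->shm_nattch = src->shm_nattch;
	/* Truncate the 64-bit time_t values for the old ABI. */
	dst->shm_atime = (int32_t)src->shm_atime;
	dst->shm_dtime = (int32_t)src->shm_dtime;
	dst->shm_ctime = (int32_t)src->shm_ctime;
}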
/* $NetBSD: coda_vfsops.c,v 1.90 2022/03/28 12:37:46 riastradh Exp $ */ /* * * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. * * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) cfs/coda_vfsops.c,v 1.1.1.1 1998/08/29 21:26:45 rvb Exp $ */ /* * Mach Operating System * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda file system at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: coda_vfsops.c,v 1.90 2022/03/28 12:37:46 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/malloc.h> #include <sys/conf.h> #include <sys/namei.h> #include <sys/dirent.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/select.h> #include <sys/kauth.h> #include <sys/module.h> #include <coda/coda.h> #include <coda/cnode.h> #include <coda/coda_vfsops.h> #include <coda/coda_venus.h> #include <coda/coda_subr.h> #include <coda/coda_opstats.h> /* for VN_RDEV */ #include <miscfs/specfs/specdev.h> #include <miscfs/genfs/genfs.h> MODULE(MODULE_CLASS_VFS, coda, "vcoda"); #define ENTRY if(coda_vfsop_print_entry) myprintf(("Entered %s\n",__func__)) extern struct vnode *coda_ctlvp; extern struct coda_mntinfo coda_mnttbl[NVCODA]; /* indexed by minor device number */ /* structure to keep statistics of internally generated/satisfied calls */ struct coda_op_stats coda_vfsopstats[CODA_VFSOPS_SIZE]; #define MARK_ENTRY(op) (coda_vfsopstats[op].entries++) #define MARK_INT_SAT(op) (coda_vfsopstats[op].sat_intrn++) #define MARK_INT_FAIL(op) (coda_vfsopstats[op].unsat_intrn++) #define MRAK_INT_GEN(op) (coda_vfsopstats[op].gen_intrn++) extern const struct cdevsw vcoda_cdevsw; extern const struct vnodeopv_desc coda_vnodeop_opv_desc; const struct vnodeopv_desc * const coda_vnodeopv_descs[] = { &coda_vnodeop_opv_desc, NULL, }; struct vfsops coda_vfsops = { .vfs_name = MOUNT_CODA, .vfs_min_mount_data = 256, /* This is the pathname, unlike every other fs */ .vfs_mount = coda_mount, .vfs_start = coda_start, .vfs_unmount = coda_unmount, .vfs_root = coda_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = coda_nb_statvfs, .vfs_sync = coda_sync, .vfs_vget = coda_vget, .vfs_loadvnode = coda_loadvnode, .vfs_fhtovp = (void *)eopnotsupp, .vfs_vptofh = (void *)eopnotsupp, .vfs_init = coda_init, .vfs_done = coda_done, .vfs_mountroot = (void *)eopnotsupp, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = coda_vnodeopv_descs }; static int coda_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return vfs_attach(&coda_vfsops); case MODULE_CMD_FINI: return vfs_detach(&coda_vfsops); default: return ENOTTY; } } int coda_vfsopstats_init(void) { int i; for (i=0;i<CODA_VFSOPS_SIZE;i++) { coda_vfsopstats[i].opcode = i; coda_vfsopstats[i].entries = 0; coda_vfsopstats[i].sat_intrn = 0; coda_vfsopstats[i].unsat_intrn = 0; coda_vfsopstats[i].gen_intrn = 0; } return 0; } /* * cfs mount vfsop * Set up mount 
info record and attach it to vfs struct. */ /*ARGSUSED*/ int coda_mount(struct mount *vfsp, /* Allocated and initialized by mount(2) */ const char *path, /* path covered: ignored by the fs-layer */ void *data, /* Need to define a data type for this in netbsd? */ size_t *data_len) { struct lwp *l = curlwp; struct vnode *dvp; struct cnode *cp; dev_t dev; struct coda_mntinfo *mi; struct vnode *rtvp; const struct cdevsw *cdev; CodaFid rootfid = INVAL_FID; CodaFid ctlfid = CTL_FID; int error; if (data == NULL) return EINVAL; if (vfsp->mnt_flag & MNT_GETARGS) return EINVAL; ENTRY; coda_vfsopstats_init(); coda_vnodeopstats_init(); MARK_ENTRY(CODA_MOUNT_STATS); if (CODA_MOUNTED(vfsp)) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(EBUSY); } /* Validate mount device. Similar to getmdev(). */ /* * XXX: coda passes the mount device as the entire mount args, * All other fs pass a structure contining a pointer. * In order to get sys_mount() to do the copyin() we've set a * fixed default size for the filename buffer. */ /* Ensure that namei() doesn't run off the filename buffer */ if (*data_len < 1 || *data_len > PATH_MAX || strnlen(data, *data_len) >= *data_len) { MARK_INT_FAIL(CODA_MOUNT_STATS); return EINVAL; } error = namei_simple_kernel((char *)data, NSM_FOLLOW_NOEMULROOT, &dvp); if (error) { MARK_INT_FAIL(CODA_MOUNT_STATS); return (error); } if (dvp->v_type != VCHR) { MARK_INT_FAIL(CODA_MOUNT_STATS); vrele(dvp); return(ENXIO); } dev = dvp->v_rdev; vrele(dvp); cdev = cdevsw_lookup(dev); if (cdev == NULL) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(ENXIO); } /* * See if the device table matches our expectations. */ if (cdev != &vcoda_cdevsw) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(ENXIO); } if (minor(dev) >= NVCODA) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(ENXIO); } /* * Initialize the mount record and link it to the vfs struct */ mi = &coda_mnttbl[minor(dev)]; if (!VC_OPEN(&mi->mi_vcomm)) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(ENODEV); } /* No initialization (here) of mi_vcomm! */ vfsp->mnt_data = mi; vfsp->mnt_stat.f_fsidx.__fsid_val[0] = 0; vfsp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_CODA); vfsp->mnt_stat.f_fsid = vfsp->mnt_stat.f_fsidx.__fsid_val[0]; vfsp->mnt_stat.f_namemax = CODA_MAXNAMLEN; mi->mi_vfsp = vfsp; /* * Make a root vnode to placate the Vnode interface, but don't * actually make the CODA_ROOT call to venus until the first call * to coda_root in case a server is down while venus is starting. */ cp = make_coda_node(&rootfid, vfsp, VDIR); rtvp = CTOV(cp); rtvp->v_vflag |= VV_ROOT; cp = make_coda_node(&ctlfid, vfsp, VCHR); coda_ctlvp = CTOV(cp); /* Add vfs and rootvp to chain of vfs hanging off mntinfo */ mi->mi_vfsp = vfsp; mi->mi_rootvp = rtvp; /* set filesystem block size */ vfsp->mnt_stat.f_bsize = 8192; /* XXX -JJK */ vfsp->mnt_stat.f_frsize = 8192; /* XXX -JJK */ /* error is currently guaranteed to be zero, but in case some code changes... 
*/ CODADEBUG(1, myprintf(("coda_mount returned %d\n",error));); if (error) MARK_INT_FAIL(CODA_MOUNT_STATS); else MARK_INT_SAT(CODA_MOUNT_STATS); return set_statvfs_info("/coda", UIO_SYSSPACE, "CODA", UIO_SYSSPACE, vfsp->mnt_op->vfs_name, vfsp, l); } int coda_start(struct mount *vfsp, int flags) { ENTRY; vftomi(vfsp)->mi_started = 1; return (0); } int coda_unmount(struct mount *vfsp, int mntflags) { struct coda_mntinfo *mi = vftomi(vfsp); int active, error = 0; ENTRY; MARK_ENTRY(CODA_UMOUNT_STATS); if (!CODA_MOUNTED(vfsp)) { MARK_INT_FAIL(CODA_UMOUNT_STATS); return(EINVAL); } if (mi->mi_vfsp == vfsp) { /* We found the victim */ if (!IS_UNMOUNTING(VTOC(mi->mi_rootvp))) return (EBUSY); /* Venus is still running */ #ifdef DEBUG printf("coda_unmount: ROOT: vp %p, cp %p\n", mi->mi_rootvp, VTOC(mi->mi_rootvp)); #endif mi->mi_started = 0; vrele(mi->mi_rootvp); vrele(coda_ctlvp); active = coda_kill(vfsp, NOT_DOWNCALL); mi->mi_rootvp->v_vflag &= ~VV_ROOT; error = vflush(mi->mi_vfsp, NULLVP, FORCECLOSE); printf("coda_unmount: active = %d, vflush active %d\n", active, error); error = 0; /* I'm going to take this out to allow lookups to go through. I'm * not sure it's important anyway. -- DCS 2/2/94 */ /* vfsp->VFS_DATA = NULL; */ /* No more vfsp's to hold onto */ mi->mi_vfsp = NULL; mi->mi_rootvp = NULL; if (error) MARK_INT_FAIL(CODA_UMOUNT_STATS); else MARK_INT_SAT(CODA_UMOUNT_STATS); return(error); } return (EINVAL); } /* * find root of cfs */ int coda_root(struct mount *vfsp, int lktype, struct vnode **vpp) { struct coda_mntinfo *mi = vftomi(vfsp); int error; struct lwp *l = curlwp; /* XXX - bnoble */ CodaFid VFid; static const CodaFid invalfid = INVAL_FID; ENTRY; MARK_ENTRY(CODA_ROOT_STATS); if (vfsp == mi->mi_vfsp) { if (memcmp(&VTOC(mi->mi_rootvp)->c_fid, &invalfid, sizeof(CodaFid))) { /* Found valid root. */ *vpp = mi->mi_rootvp; /* On Mach, this is vref. On NetBSD, VOP_LOCK */ vref(*vpp); vn_lock(*vpp, lktype); MARK_INT_SAT(CODA_ROOT_STATS); return(0); } } error = venus_root(vftomi(vfsp), l->l_cred, l->l_proc, &VFid); if (!error) { struct cnode *cp = VTOC(mi->mi_rootvp); /* * Save the new rootfid in the cnode, and rekey the cnode * with the new fid key. */ error = vcache_rekey_enter(vfsp, mi->mi_rootvp, &invalfid, sizeof(CodaFid), &VFid, sizeof(CodaFid)); if (error) goto exit; cp->c_fid = VFid; vcache_rekey_exit(vfsp, mi->mi_rootvp, &invalfid, sizeof(CodaFid), &cp->c_fid, sizeof(CodaFid)); *vpp = mi->mi_rootvp; vref(*vpp); vn_lock(*vpp, lktype); MARK_INT_SAT(CODA_ROOT_STATS); goto exit; } else if (error == ENODEV || error == EINTR) { /* Gross hack here! */ /* * If Venus fails to respond to the CODA_ROOT call, coda_call returns * ENODEV. Return the uninitialized root vnode to allow vfs * operations such as unmount to continue. Without this hack, * there is no way to do an unmount if Venus dies before a * successful CODA_ROOT call is done. All vnode operations * will fail. */ *vpp = mi->mi_rootvp; vref(*vpp); vn_lock(*vpp, lktype); MARK_INT_FAIL(CODA_ROOT_STATS); error = 0; goto exit; } else { CODADEBUG( CODA_ROOT, myprintf(("error %d in CODA_ROOT\n", error)); ); MARK_INT_FAIL(CODA_ROOT_STATS); goto exit; } exit: return(error); } /* * Get file system statistics. */ int coda_nb_statvfs(struct mount *vfsp, struct statvfs *sbp) { struct lwp *l = curlwp; struct coda_statfs fsstat; int error; ENTRY; MARK_ENTRY(CODA_STATFS_STATS); if (!CODA_MOUNTED(vfsp)) { /* MARK_INT_FAIL(CODA_STATFS_STATS); */ return(EINVAL); } /* XXX - what to do about f_flags, others? 
--bnoble */ /* Below This is what AFS does #define NB_SFS_SIZ 0x895440 */ /* Note: Normal fs's have a bsize of 0x400 == 1024 */ error = venus_statfs(vftomi(vfsp), l->l_cred, l, &fsstat); if (!error) { sbp->f_bsize = 8192; /* XXX */ sbp->f_frsize = 8192; /* XXX */ sbp->f_iosize = 8192; /* XXX */ sbp->f_blocks = fsstat.f_blocks; sbp->f_bfree = fsstat.f_bfree; sbp->f_bavail = fsstat.f_bavail; sbp->f_bresvd = 0; sbp->f_files = fsstat.f_files; sbp->f_ffree = fsstat.f_ffree; sbp->f_favail = fsstat.f_ffree; sbp->f_fresvd = 0; copy_statvfs_info(sbp, vfsp); } MARK_INT_SAT(CODA_STATFS_STATS); return(error); } /* * Flush any pending I/O. */ int coda_sync(struct mount *vfsp, int waitfor, kauth_cred_t cred) { ENTRY; MARK_ENTRY(CODA_SYNC_STATS); MARK_INT_SAT(CODA_SYNC_STATS); return(0); } int coda_vget(struct mount *vfsp, ino_t ino, int lktype, struct vnode **vpp) { ENTRY; return (EOPNOTSUPP); } int coda_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { CodaFid fid; struct cnode *cp; extern int (**coda_vnodeop_p)(void *); KASSERT(key_len == sizeof(CodaFid)); memcpy(&fid, key, key_len); cp = kmem_zalloc(sizeof(*cp), KM_SLEEP); mutex_init(&cp->c_lock, MUTEX_DEFAULT, IPL_NONE); cp->c_fid = fid; cp->c_vnode = vp; vp->v_op = coda_vnodeop_p; vp->v_tag = VT_CODA; vp->v_type = VNON; vp->v_data = cp; *new_key = &cp->c_fid; return 0; } /* * fhtovp is now what vget used to be in 4.3-derived systems. For * some silly reason, vget is now keyed by a 32 bit ino_t, rather than * a type-specific fid. */ int coda_fhtovp(struct mount *vfsp, struct fid *fhp, struct mbuf *nam, struct vnode **vpp, int *exflagsp, kauth_cred_t *creadanonp, int lktype) { struct cfid *cfid = (struct cfid *)fhp; struct cnode *cp = 0; int error; struct lwp *l = curlwp; /* XXX -mach */ CodaFid VFid; int vtype; ENTRY; MARK_ENTRY(CODA_VGET_STATS); /* Check for vget of control object. */ if (IS_CTL_FID(&cfid->cfid_fid)) { *vpp = coda_ctlvp; vref(coda_ctlvp); MARK_INT_SAT(CODA_VGET_STATS); return(0); } error = venus_fhtovp(vftomi(vfsp), &cfid->cfid_fid, l->l_cred, l->l_proc, &VFid, &vtype); if (error) { CODADEBUG(CODA_VGET, myprintf(("vget error %d\n",error));) *vpp = (struct vnode *)0; } else { CODADEBUG(CODA_VGET, myprintf(("vget: %s type %d result %d\n", coda_f2s(&VFid), vtype, error)); ) cp = make_coda_node(&VFid, vfsp, vtype); *vpp = CTOV(cp); } return(error); } int coda_vptofh(struct vnode *vnp, struct fid *fidp) { ENTRY; return (EOPNOTSUPP); } void coda_init(void) { ENTRY; } void coda_done(void) { ENTRY; } SYSCTL_SETUP(sysctl_vfs_coda_setup, "sysctl vfs.coda subtree setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "coda", SYSCTL_DESCR("code vfs options"), NULL, 0, NULL, 0, CTL_VFS, 18, CTL_EOL); /* * XXX the "18" above could be dynamic, thereby eliminating * one more instance of the "number to vfs" mapping problem, * but "18" is the order as taken from sys/mount.h */ /* sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "clusterread", SYSCTL_DESCR( anyone? ), NULL, 0, &doclusterread, 0, CTL_VFS, 18, FFS_CLUSTERREAD, CTL_EOL); */ } /* * To allow for greater ease of use, some vnodes may be orphaned when * Venus dies. Certain operations should still be allowed to go * through, but without propagating orphan-ness. So this function will * get a new vnode for the file from the current run of Venus. 
*/ int getNewVnode(struct vnode **vpp) { struct cfid cfid; struct coda_mntinfo *mi = vftomi((*vpp)->v_mount); ENTRY; cfid.cfid_len = (short)sizeof(CodaFid); cfid.cfid_fid = VTOC(*vpp)->c_fid; /* Structure assignment. */ /* XXX ? */ /* We're guessing that if set, the 1st element on the list is a * valid vnode to use. If not, return ENODEV as venus is dead. */ if (mi->mi_vfsp == NULL) return ENODEV; return coda_fhtovp(mi->mi_vfsp, (struct fid*)&cfid, NULL, vpp, NULL, NULL, LK_EXCLUSIVE); } /* Get the mount structure corresponding to a given device. * Return NULL if no device is found or the device is not mounted. */ struct mount *devtomp(dev_t dev) { struct mount *mp; struct vnode *vp; if (spec_node_lookup_by_dev(VBLK, dev, VDEAD_NOWAIT, &vp) == 0) { mp = spec_node_getmountedfs(vp); vrele(vp); } else { mp = NULL; } return mp; }
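/*
 * coda_mount() above notes that, unlike every other file system, coda
 * passes the pathname of its character device as the entire mount(2)
 * data argument.  A minimal userland sketch of that convention follows;
 * the device node /dev/cfs0 and the /coda mount point are assumptions
 * for illustration, not taken from this file.
 */
#include <sys/param.h>
#include <sys/mount.h>
#include <string.h>

int
example_mount_coda(void)
{
	const char *dev = "/dev/cfs0";	/* assumed vcoda device node */

	/*
	 * The data argument is just the NUL-terminated device path;
	 * coda_mount() runs namei on it and checks that it names the
	 * vcoda character device before wiring up coda_mnttbl[minor].
	 */
	return mount(MOUNT_CODA, "/coda", 0, __UNCONST(dev),
	    strlen(dev) + 1);
}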
/* $NetBSD: entpool.c,v 1.1 2020/04/30 03:28:19 riastradh Exp $ */ /*- * Copyright (c) 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Entropy pool (`reseedable pseudorandom number generator') based on a * sponge duplex, following the design described and analyzed in * * Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van * Assche, `Sponge-Based Pseudo-Random Number Generators', in * Stefan Mangard and François-Xavier Standaert, eds., * Cryptographic Hardware and Embedded Systems—CHES 2010, Springer * LNCS 6225, pp. 33–47. * https://link.springer.com/chapter/10.1007/978-3-642-15031-9_3 * https://keccak.team/files/SpongePRNG.pdf * * Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van * Assche, `Duplexing the Sponge: Single-Pass Authenticated * Encryption and Other Applications', in Ali Miri and Serge * Vaudenay, eds., Selected Areas in Cryptography—SAC 2011, * Springer LNCS 7118, pp. 320–337. * https://link.springer.com/chapter/10.1007/978-3-642-28496-0_19 * https://keccak.team/files/SpongeDuplex.pdf * * We make the following tweaks that don't affect security: * * - Samples are length-delimited 7-bit variable-length encoding. * The encoding is still injective, so the security theorems * continue to apply. * * - Output is not buffered -- callers should draw 32 bytes and * expand with a stream cipher. In effect, every output draws * the full rate, and we just discard whatever the caller didn't * ask for; the impact is only on performance, not security. * * On top of the underlying sponge state, an entropy pool maintains an * integer i in [0, RATE-1] indicating where to write the next byte in * the input buffer. Zeroing an entropy pool initializes it. */ #if defined(_KERNEL) || defined(_STANDALONE) #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: entpool.c,v 1.1 2020/04/30 03:28:19 riastradh Exp $"); #endif #include "entpool.h" #include ENTPOOL_HEADER #if defined(_KERNEL) || defined(_STANDALONE) #include <sys/types.h> #include <lib/libkern/libkern.h> #define ASSERT KASSERT #else #include <sys/cdefs.h> #include <assert.h> #include <stdbool.h> #include <stdint.h> #include <string.h> #define ASSERT assert #define CTASSERT __CTASSERT #endif #define secret /* must not use in variable-time operations; should zero */ #define arraycount(A) (sizeof(A)/sizeof((A)[0])) #define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) #define RATE ENTPOOL_RATE /* * stir(P) * * Internal subroutine to apply the sponge permutation to the * state in P. Resets P->i to 0 to indicate that the input buffer * is empty. */ static void stir(struct entpool *P) { size_t i; /* * Switch to the permutation's byte order, if necessary, apply * permutation, and then switch back. 
This way we can data in * and out byte by byte, but get the same answers out of test * vectors. */ for (i = 0; i < arraycount(P->s.w); i++) P->s.w[i] = ENTPOOL_WTOH(P->s.w[i]); ENTPOOL_PERMUTE(P->s.w); for (i = 0; i < arraycount(P->s.w); i++) P->s.w[i] = ENTPOOL_HTOW(P->s.w[i]); /* Reset the input buffer. */ P->i = 0; } /* * entpool_enter(P, buf, len) * * Enter len bytes from buf into the entropy pool P, stirring as * needed. Corresponds to P.feed in the paper. */ void entpool_enter(struct entpool *P, const void *buf, size_t len) { const uint8_t *p = buf; size_t n = len, n1 = n; /* Sanity-check P->i. */ ASSERT(P->i <= RATE-1); /* Encode the length, stirring as needed. */ while (n1) { if (P->i == RATE-1) stir(P); ASSERT(P->i < RATE-1); P->s.u8[P->i++] ^= (n1 >= 0x80 ? 0x80 : 0) | (n1 & 0x7f); n1 >>= 7; } /* Enter the sample, stirring as needed. */ while (n --> 0) { if (P->i == RATE-1) stir(P); ASSERT(P->i < RATE-1); P->s.u8[P->i++] ^= *p++; } /* If we filled the input buffer exactly, stir once more. */ if (P->i == RATE-1) stir(P); ASSERT(P->i < RATE-1); } /* * entpool_enter_nostir(P, buf, len) * * Enter as many bytes as possible, up to len, from buf into the * entropy pool P. Roughly corresponds to P.feed in the paper, * but we stop if we would have run the permutation. * * Return true if the sample was consumed in its entirety, or true * if the sample was truncated so the caller should arrange to * call entpool_stir when it is next convenient to do so. * * This function is cheap -- it only xors the input into the * state, and never calls the underlying permutation, but it may * truncate samples. */ bool entpool_enter_nostir(struct entpool *P, const void *buf, size_t len) { const uint8_t *p = buf; size_t n0, n; /* Sanity-check P->i. */ ASSERT(P->i <= RATE-1); /* If the input buffer is full, fail. */ if (P->i == RATE-1) return false; ASSERT(P->i < RATE-1); /* * Truncate the sample and enter it with 1-byte length encoding * -- don't bother with variable-length encoding, not worth the * trouble. */ n = n0 = MIN(127, MIN(len, RATE-1 - P->i - 1)); P->s.u8[P->i++] ^= n; while (n --> 0) P->s.u8[P->i++] ^= *p++; /* Can't guarantee anything better than 0 <= i <= RATE-1. */ ASSERT(P->i <= RATE-1); /* Return true if all done, false if truncated and in need of stir. */ return (n0 == len); } /* * entpool_stir(P) * * Stir the entropy pool after entpool_enter_nostir fails. If it * has already been stirred already, this has no effect. */ void entpool_stir(struct entpool *P) { /* Sanity-check P->i. */ ASSERT(P->i <= RATE-1); /* If the input buffer is full, stir. */ if (P->i == RATE-1) stir(P); ASSERT(P->i < RATE-1); } /* * entpool_extract(P, buf, len) * * Extract len bytes from the entropy pool P into buf. * Corresponds to iterating P.fetch/P.forget in the paper. * (Feeding the output back in -- as P.forget does -- is the same * as zeroing what we just read out.) */ void entpool_extract(struct entpool *P, secret void *buf, size_t len) { uint8_t *p = buf; size_t n = len; /* Sanity-check P->i. */ ASSERT(P->i <= RATE-1); /* If input buffer is not empty, stir. */ if (P->i != 0) stir(P); ASSERT(P->i == 0); /* * Copy out and zero (RATE-1)-sized chunks at a time, stirring * with a bit set to distinguish this from inputs. 
*/ while (n >= RATE-1) { memcpy(p, P->s.u8, RATE-1); memset(P->s.u8, 0, RATE-1); P->s.u8[RATE-1] ^= 0x80; stir(P); p += RATE-1; n -= RATE-1; } /* * If there's anything left, copy out a partial rate's worth * and zero the entire rate's worth, stirring with a bit set to * distinguish this from inputs. */ if (n) { ASSERT(n < RATE-1); memcpy(p, P->s.u8, n); /* Copy part of it. */ memset(P->s.u8, 0, RATE-1); /* Zero all of it. */ P->s.u8[RATE-1] ^= 0x80; stir(P); } } /* * Known-answer tests */ #if ENTPOOL_SMALL #define KATLEN 15 /* Gimli */ static const uint8_t known_answers[][KATLEN] = { [0] = { 0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61, 0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11, }, [1] = { 0x74,0x15,0x16,0x49,0x31,0x07,0x77,0xa1, 0x3b,0x4d,0x78,0xc6,0x5d,0xef,0x87, }, [2] = { 0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25, 0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84, }, [3] = { 0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25, 0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84, }, [4] = { 0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61, 0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11, }, [5] = { 0xa9,0x3c,0x3c,0xac,0x5f,0x6d,0x80,0xdc, 0x33,0x0c,0xb2,0xe3,0xdd,0x55,0x31, }, [6] = { 0x2e,0x69,0x1a,0x2a,0x2d,0x09,0xd4,0x5e, 0x49,0xcc,0x8c,0xb2,0x0b,0xcc,0x42, }, [7] = { 0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25, 0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84, }, [8] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00, }, [9] = { 0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61, 0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11, }, [10] = { 0x2e,0x69,0x1a,0x2a,0x2d,0x09,0xd4,0x5e, 0x49,0xcc,0x8c,0xb2,0x0b,0xcc,0x42, }, [11] = { 0x6f,0xfd,0xd2,0x29,0x78,0x46,0xc0,0x7d, 0xc7,0xf2,0x0a,0x2b,0x72,0xd6,0xc6, }, [12] = { 0x86,0xf0,0xc1,0xf9,0x95,0x0f,0xc9,0x12, 0xde,0x38,0x39,0x10,0x1f,0x8c,0xc4, }, }; #else /* !ENTPOOL_SMALL */ #define KATLEN 16 /* Keccak-p[1600, 24] */ static const uint8_t known_answers[][KATLEN] = { [0] = { 0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07, 0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce, }, [1] = { 0x57,0x49,0x6e,0x28,0x7f,0xaa,0xee,0x6c, 0xa8,0xb0,0xf5,0x0b,0x87,0xae,0xd6,0xd6, }, [2] = { 0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8, 0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50, }, [3] = { 0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8, 0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50, }, [4] = { 0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07, 0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce, }, [5] = { 0x95,0x23,0x77,0xe4,0x84,0xeb,0xaa,0x2e, 0x6a,0x99,0xc2,0x52,0x06,0x6d,0xdf,0xea, }, [6] = { 0x8c,0xdd,0x1b,0xaf,0x0e,0xf6,0xe9,0x1d, 0x51,0x33,0x68,0x38,0x8d,0xad,0x55,0x84, }, [7] = { 0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8, 0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50, }, [8] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, }, [9] = { 0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07, 0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce, }, [10] = { 0x8c,0xdd,0x1b,0xaf,0x0e,0xf6,0xe9,0x1d, 0x51,0x33,0x68,0x38,0x8d,0xad,0x55,0x84, }, [11] = { 0xf6,0xc1,0x14,0xbb,0x13,0x0a,0xaf,0xed, 0xca,0x0b,0x35,0x2c,0xf1,0x2b,0x1a,0x85, }, [12] = { 0xf9,0x4b,0x05,0xd1,0x8b,0xcd,0xb3,0xd0, 0x77,0x27,0xfe,0x46,0xf9,0x33,0xb2,0xa2, }, }; #endif #define KAT_BEGIN(P, n) memset(P, 0, sizeof(*(P))) #define KAT_ERROR() return -1 #define KAT_END(P, n) do \ { \ uint8_t KAT_ACTUAL[KATLEN]; \ entpool_extract(P, KAT_ACTUAL, KATLEN); \ if (memcmp(KAT_ACTUAL, known_answers[n], KATLEN)) \ return -1; \ } while (0) int entpool_selftest(void) { struct entpool pool, *P = &pool; uint8_t sample[1] = {0xff}; uint8_t scratch[RATE]; const uint8_t zero[RATE] = {0}; /* Test entpool_enter with empty buffer. 
*/ KAT_BEGIN(P, 0); entpool_stir(P); /* noop */ entpool_enter(P, sample, 1); entpool_stir(P); /* noop */ KAT_END(P, 0); /* Test entpool_enter with partial buffer. */ KAT_BEGIN(P, 1); entpool_stir(P); /* noop */ #if ENTPOOL_SMALL entpool_enter(P, zero, RATE-3); #else entpool_enter(P, zero, RATE-4); #endif entpool_stir(P); /* noop */ entpool_enter(P, sample, 1); entpool_stir(P); /* noop */ KAT_END(P, 1); /* Test entpool_enter with full buffer. */ KAT_BEGIN(P, 2); entpool_stir(P); /* noop */ #if ENTPOOL_SMALL if (!entpool_enter_nostir(P, zero, RATE-2)) KAT_ERROR(); #else if (!entpool_enter_nostir(P, zero, 127)) KAT_ERROR(); if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1)) KAT_ERROR(); #endif entpool_enter(P, sample, 1); entpool_stir(P); /* noop */ KAT_END(P, 2); /* Test entpool_enter with full buffer after stir. */ KAT_BEGIN(P, 3); entpool_stir(P); /* noop */ #if ENTPOOL_SMALL if (!entpool_enter_nostir(P, zero, RATE-2)) KAT_ERROR(); #else CTASSERT(127 <= RATE-2); if (!entpool_enter_nostir(P, zero, 127)) KAT_ERROR(); if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1)) KAT_ERROR(); #endif entpool_stir(P); entpool_enter(P, sample, 1); entpool_stir(P); /* noop */ KAT_END(P, 3); /* Test entpool_enter_nostir with empty buffer. */ KAT_BEGIN(P, 4); entpool_stir(P); /* noop */ if (!entpool_enter_nostir(P, sample, 1)) KAT_ERROR(); entpool_stir(P); /* noop */ KAT_END(P, 4); /* Test entpool_enter_nostir with partial buffer. */ KAT_BEGIN(P, 5); entpool_stir(P); /* noop */ #if ENTPOOL_SMALL entpool_enter(P, zero, RATE-3); #else entpool_enter(P, zero, RATE-4); #endif entpool_stir(P); /* noop */ if (entpool_enter_nostir(P, sample, 1)) KAT_ERROR(); entpool_stir(P); KAT_END(P, 5); /* Test entpool_enter_nostir with full buffer. */ KAT_BEGIN(P, 6); entpool_stir(P); /* noop */ #if ENTPOOL_SMALL if (!entpool_enter_nostir(P, zero, RATE-2)) KAT_ERROR(); #else CTASSERT(127 <= RATE-2); if (!entpool_enter_nostir(P, zero, 127)) KAT_ERROR(); if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1)) KAT_ERROR(); #endif if (entpool_enter_nostir(P, sample, 1)) KAT_ERROR(); entpool_stir(P); KAT_END(P, 6); /* Test entpool_enter_nostir with full buffer after stir. */ KAT_BEGIN(P, 7); entpool_stir(P); /* noop */ #if ENTPOOL_SMALL if (!entpool_enter_nostir(P, zero, RATE-2)) KAT_ERROR(); #else CTASSERT(127 <= RATE-2); if (!entpool_enter_nostir(P, zero, 127)) KAT_ERROR(); if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1)) KAT_ERROR(); #endif entpool_stir(P); if (!entpool_enter_nostir(P, sample, 1)) KAT_ERROR(); entpool_stir(P); /* noop */ KAT_END(P, 7); /* Test entpool_extract with empty input buffer. */ KAT_BEGIN(P, 8); entpool_stir(P); /* noop */ KAT_END(P, 8); /* Test entpool_extract with nonempty input buffer. */ KAT_BEGIN(P, 9); entpool_stir(P); /* noop */ entpool_enter(P, sample, 1); entpool_stir(P); /* noop */ KAT_END(P, 9); /* Test entpool_extract with full input buffer. */ KAT_BEGIN(P, 10); entpool_stir(P); /* noop */ #if ENTPOOL_SMALL if (!entpool_enter_nostir(P, zero, RATE-2)) KAT_ERROR(); #else CTASSERT(127 <= RATE-2); if (!entpool_enter_nostir(P, zero, 127)) KAT_ERROR(); if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1)) KAT_ERROR(); #endif KAT_END(P, 10); /* Test entpool_extract with iterated output. */ KAT_BEGIN(P, 11); entpool_stir(P); /* noop */ entpool_extract(P, scratch, RATE-1 + 1); entpool_stir(P); /* noop */ KAT_END(P, 11); /* Test extract, enter, extract. 
*/ KAT_BEGIN(P, 12); entpool_stir(P); /* noop */ entpool_extract(P, scratch, 1); entpool_stir(P); /* noop */ entpool_enter(P, sample, 1); entpool_stir(P); /* noop */ KAT_END(P, 12); return 0; } #if ENTPOOL_TEST int main(void) { return entpool_selftest(); } #endif /* * Known-answer test generation * * This generates the known-answer test vectors from explicitly * specified duplex inputs that correspond to what entpool_enter * &c. induce, to confirm the encoding of inputs works as * intended. */ #if ENTPOOL_GENKAT #include <stdio.h> struct event { enum { IN, OUT, STOP } t; uint8_t b[RATE-1]; }; /* Cases correspond to entpool_selftest above. */ static const struct event *const cases[] = { [0] = (const struct event[]) { {IN, {1, 0xff}}, {STOP, {0}}, }, [1] = (const struct event[]) { #if ENTPOOL_SMALL {IN, {RATE-3, [RATE-2] = 1}}, #else {IN, {0x80|((RATE-4)&0x7f), (RATE-4)>>7, [RATE-2] = 1}}, #endif {IN, {0xff}}, {STOP, {0}}, }, [2] = (const struct event[]) { #if ENTPOOL_SMALL {IN, {RATE-2}}, #else {IN, {127, [128] = RATE-2 - 127 - 1}}, #endif {IN, {1, 0xff}}, {STOP, {0}}, }, [3] = (const struct event[]) { #if ENTPOOL_SMALL {IN, {RATE-2}}, #else {IN, {127, [128] = RATE-2 - 127 - 1}}, #endif {IN, {1, 0xff}}, {STOP, {0}}, }, [4] = (const struct event[]) { {IN, {1, 0xff}}, {STOP, {0}}, }, [5] = (const struct event[]) { #if ENTPOOL_SMALL {IN, {RATE-3, [RATE-2] = 0 /* truncated length */}}, #else {IN, {0x80|((RATE-4)&0x7f), (RATE-4)>>7, [RATE-2] = 0 /* truncated length */}}, #endif {STOP, {0}}, }, [6] = (const struct event[]) { #if ENTPOOL_SMALL {IN, {RATE-2}}, #else {IN, {127, [128] = RATE-2 - 127 - 1}}, #endif {STOP, {0}}, }, [7] = (const struct event[]) { #if ENTPOOL_SMALL {IN, {RATE-2}}, #else {IN, {127, [128] = RATE-2 - 127 - 1}}, #endif {IN, {1, 0xff}}, {STOP, {0}}, }, [8] = (const struct event[]) { {STOP, {0}}, }, [9] = (const struct event[]) { {IN, {1, 0xff}}, {STOP, {0}}, }, [10] = (const struct event[]) { #if ENTPOOL_SMALL {IN, {RATE-2}}, #else {IN, {127, [128] = RATE-2 - 127 - 1}}, #endif {STOP, {0}}, }, [11] = (const struct event[]) { {OUT, {0}}, {OUT, {0}}, {STOP, {0}}, }, [12] = (const struct event[]) { {OUT, {0}}, {IN, {1, 0xff}}, {STOP, {0}}, }, }; static void compute(uint8_t output[KATLEN], const struct event *events) { union { uint8_t b[ENTPOOL_SIZE]; ENTPOOL_WORD w[ENTPOOL_SIZE/sizeof(ENTPOOL_WORD)]; } u; unsigned i, j, k; memset(&u.b, 0, sizeof u.b); for (i = 0;; i++) { if (events[i].t == STOP) break; for (j = 0; j < sizeof(events[i].b); j++) u.b[j] ^= events[i].b[j]; if (events[i].t == OUT) { memset(u.b, 0, RATE-1); u.b[RATE-1] ^= 0x80; } for (k = 0; k < arraycount(u.w); k++) u.w[k] = ENTPOOL_WTOH(u.w[k]); ENTPOOL_PERMUTE(u.w); for (k = 0; k < arraycount(u.w); k++) u.w[k] = ENTPOOL_HTOW(u.w[k]); } for (j = 0; j < KATLEN; j++) output[j] = u.b[j]; } int main(void) { uint8_t output[KATLEN]; unsigned i, j; printf("static const uint8_t known_answers[][KATLEN] = {\n"); for (i = 0; i < arraycount(cases); i++) { printf("\t[%u] = {\n", i); compute(output, cases[i]); for (j = 0; j < KATLEN; j++) { if (j % 8 == 0) printf("\t\t"); printf("0x%02hhx,", output[j]); if (j % 8 == 7) printf("\n"); } if ((KATLEN % 8) != 0) printf("\n"); printf("\t},\n"); } printf("};\n"); fflush(stdout); return ferror(stdout); } #endif
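The length prefix that entpool_enter() xors into the pool is a little-endian base-128 code: seven payload bits per byte, with the 0x80 continuation bit set on every byte except the last, and no bytes at all for a zero-length sample. The standalone sketch below is an editorial illustration, not part of the kernel source; the helper name and buffer size are ours.

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative mirror of the length-encoding loop in entpool_enter().
 * Writes the encoding of n into buf and returns the number of bytes
 * produced; a 64-bit size_t needs at most 10 bytes.
 */
static size_t
example_encode_len(size_t n, uint8_t buf[10])
{
	size_t k = 0;

	while (n) {
		buf[k++] = (n >= 0x80 ? 0x80 : 0) | (n & 0x7f);
		n >>= 7;
	}
	return k;
}

/* Example: n = 300 encodes as { 0xac, 0x02 } -- (300 & 0x7f) | 0x80, then 300 >> 7 = 2. */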
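As the comment on entpool_enter_nostir() says, it never runs the permutation: it returns false when the sample had to be truncated, and the caller should then call entpool_stir() when next convenient. A hedged sketch of that caller pattern follows; the example_* names and the pending flag are hypothetical, not part of the kernel source.

/* Hypothetical caller-side bookkeeping for a deferred stir. */
static bool example_stir_pending;

static void
example_enter_cheap(struct entpool *P, const void *buf, size_t len)
{

	/* Cheap path: xor in what fits, never permute here. */
	if (!entpool_enter_nostir(P, buf, len))
		example_stir_pending = true;	/* truncated */
}

static void
example_when_convenient(struct entpool *P)
{

	/* Pay for the deferred permutation at a convenient time. */
	if (example_stir_pending) {
		entpool_stir(P);
		example_stir_pending = false;
	}
}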
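Putting it together: a consumer zeroes the state (as KAT_BEGIN() does), feeds samples with entpool_enter(), and draws output with entpool_extract(); extraction zeroes what it copies out and stirs with the 0x80 marker, so the returned bytes cannot be recomputed from the remaining state. A minimal usage sketch with a hypothetical sample buffer:

static void
example_pool_usage(void)
{
	struct entpool P;
	uint8_t sample[32] = {0};	/* stand-in entropy sample */
	uint8_t key[32];

	memset(&P, 0, sizeof(P));		/* as KAT_BEGIN() does */
	entpool_enter(&P, sample, sizeof(sample));
	entpool_extract(&P, key, sizeof(key));	/* fetch and forget */
}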
/* $NetBSD: rtsock_shared.c,v 1.23 2022/10/04 07:06:31 msaitoh Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: rtsock_shared.c,v 1.23 2022/10/04 07:06:31 msaitoh Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/sysctl.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/intr.h> #include <sys/condvar.h> #include <sys/compat_stub.h> #include <net/if.h> #include <net/if_llatbl.h> #include <net/if_types.h> #include <net/route.h> #include <net/raw_cb.h> #include <netinet/in_var.h> #include <netinet/if_inarp.h> #include <netmpls/mpls.h> #include <compat/net/if.h> #include <compat/net/route.h> /* sa_family is after sa_len, rest is data */ #define _SA_MINSIZE (offsetof(struct sockaddr, sa_family) + \ sizeof(((struct sockaddr *)0)->sa_family)) #ifdef COMPAT_RTSOCK /* * These are used when #include-d from compat/common/rtsock_50.c */ #define RTM_XVERSION RTM_OVERSION #define RTM_XNEWADDR RTM_ONEWADDR #define RTM_XDELADDR RTM_ODELADDR #define RTM_XCHGADDR RTM_OCHGADDR #define RT_XADVANCE(a,b) RT_OADVANCE(a,b) #define RT_XROUNDUP(n) RT_OROUNDUP(n) #define PF_XROUTE PF_OROUTE #define rt_xmsghdr rt_msghdr50 #define if_xmsghdr if_msghdr /* if_msghdr50 is for RTM_OIFINFO */ #define ifa_xmsghdr ifa_msghdr50 #define if_xannouncemsghdr if_announcemsghdr50 #define COMPATNAME(x) compat_50_ ## x #define DOMAINNAME "oroute" #define COMPATCALL(name, args) \ MODULE_HOOK_CALL_VOID(rtsock_ ## name ## _50_hook, args, __nothing); #define RTS_CTASSERT(x) __nothing CTASSERT(sizeof(struct ifa_xmsghdr) == 20); DOMAIN_DEFINE(compat_50_routedomain); /* forward declare and add to link set */ #else /* COMPAT_RTSOCK */ /* * These are used normally, when not #include-d from compat/common/rtsock_50.c */ #define RTM_XVERSION RTM_VERSION #define RTM_XNEWADDR RTM_NEWADDR #define RTM_XDELADDR RTM_DELADDR #define RTM_XCHGADDR RTM_CHGADDR #define RT_XADVANCE(a,b) RT_ADVANCE(a,b) #define RT_XROUNDUP(n) RT_ROUNDUP(n) #define PF_XROUTE PF_ROUTE #define rt_xmsghdr rt_msghdr #define if_xmsghdr if_msghdr #define ifa_xmsghdr ifa_msghdr #define if_xannouncemsghdr if_announcemsghdr #define COMPATNAME(x) x #define DOMAINNAME "route" #define COMPATCALL(name, args) __nothing; #define RTS_CTASSERT(x) CTASSERT(x) CTASSERT(sizeof(struct ifa_xmsghdr) == 32); DOMAIN_DEFINE(routedomain); /* forward declare and add to link set */ #endif /* COMPAT_RTSOCK */ #ifdef RTSOCK_DEBUG #define RT_IN_PRINT(info, b, a) (in_print((b), sizeof(b), \ &((const struct sockaddr_in *)(info)->rti_info[(a)])->sin_addr), (b)) #endif /* RTSOCK_DEBUG */ struct route_info COMPATNAME(route_info) = { .ri_dst = { .sa_len = 2, 
.sa_family = PF_XROUTE, }, .ri_src = { .sa_len = 2, .sa_family = PF_XROUTE, }, .ri_maxqlen = IFQ_MAXLEN, }; static void COMPATNAME(route_init)(void); static int COMPATNAME(route_output)(struct mbuf *, struct socket *); static int rt_xaddrs(u_char, const char *, const char *, struct rt_addrinfo *); static struct mbuf *rt_makeifannouncemsg(struct ifnet *, int, int, struct rt_addrinfo *); static int rt_msg2(int, struct rt_addrinfo *, void *, struct rt_walkarg *, int *); static void _rt_setmetrics(int, const struct rt_xmsghdr *, struct rtentry *); static void rtm_setmetrics(const struct rtentry *, struct rt_xmsghdr *); static void rt_adjustcount(int, int); static const struct protosw COMPATNAME(route_protosw)[]; struct routecb { struct rawcb rocb_rcb; unsigned int rocb_msgfilter; #define RTMSGFILTER(m) (1U << (m)) char *rocb_missfilter; size_t rocb_missfilterlen; }; #define sotoroutecb(so) ((struct routecb *)(so)->so_pcb) static struct rawcbhead rt_rawcb; #ifdef NET_MPSAFE static kmutex_t *rt_so_mtx; static bool rt_updating = false; static kcondvar_t rt_update_cv; #endif static void rt_adjustcount(int af, int cnt) { struct route_cb * const cb = &COMPATNAME(route_info).ri_cb; cb->any_count += cnt; switch (af) { case AF_INET: cb->ip_count += cnt; return; #ifdef INET6 case AF_INET6: cb->ip6_count += cnt; return; #endif case AF_MPLS: cb->mpls_count += cnt; return; } } static int COMPATNAME(route_filter)(struct mbuf *m, struct sockproto *proto, struct rawcb *rp) { struct routecb *rop = (struct routecb *)rp; struct rt_xmsghdr rtm; KASSERT(m != NULL); KASSERT(proto != NULL); KASSERT(rp != NULL); /* Wrong family for this socket. */ if (proto->sp_family != PF_ROUTE) return ENOPROTOOPT; /* If no filter set, just return. */ if (rop->rocb_msgfilter == 0 && rop->rocb_missfilterlen == 0) return 0; /* Ensure we can access rtm_type */ if (m->m_len < offsetof(struct rt_xmsghdr, rtm_type) + sizeof(rtm.rtm_type)) return EINVAL; m_copydata(m, offsetof(struct rt_xmsghdr, rtm_type), sizeof(rtm.rtm_type), &rtm.rtm_type); if (rtm.rtm_type >= sizeof(rop->rocb_msgfilter) * CHAR_BIT) return EINVAL; /* If the rtm type is filtered out, return a positive. */ if (rop->rocb_msgfilter != 0 && !(rop->rocb_msgfilter & RTMSGFILTER(rtm.rtm_type))) return EEXIST; if (rop->rocb_missfilterlen != 0 && rtm.rtm_type == RTM_MISS) { __CTASSERT(RTAX_DST == 0); struct sockaddr_storage ss; struct sockaddr *dst = (struct sockaddr *)&ss, *sa; char *cp = rop->rocb_missfilter; char *ep = cp + rop->rocb_missfilterlen; /* Ensure we can access sa_len */ if (m->m_pkthdr.len < sizeof(rtm) + _SA_MINSIZE) return EINVAL; m_copydata(m, sizeof(rtm) + offsetof(struct sockaddr, sa_len), sizeof(ss.ss_len), &ss.ss_len); if (ss.ss_len < _SA_MINSIZE || ss.ss_len > sizeof(ss) || m->m_pkthdr.len < sizeof(rtm) + ss.ss_len) return EINVAL; /* Copy out the destination sockaddr */ m_copydata(m, sizeof(rtm), ss.ss_len, &ss); /* Find a matching sockaddr in the filter */ while (cp < ep) { sa = (struct sockaddr *)cp; if (sa->sa_len == dst->sa_len && memcmp(sa, dst, sa->sa_len) == 0) break; cp += RT_XROUNDUP(sa->sa_len); } if (cp == ep) return EEXIST; } /* Passed the filter. 
*/ return 0; } static void rt_pr_init(void) { LIST_INIT(&rt_rawcb); } static int COMPATNAME(route_attach)(struct socket *so, int proto) { struct rawcb *rp; struct routecb *rop; int s, error; KASSERT(sotorawcb(so) == NULL); rop = kmem_zalloc(sizeof(*rop), KM_SLEEP); rp = &rop->rocb_rcb; rp->rcb_len = sizeof(*rop); so->so_pcb = rp; s = splsoftnet(); #ifdef NET_MPSAFE KASSERT(so->so_lock == NULL); mutex_obj_hold(rt_so_mtx); so->so_lock = rt_so_mtx; solock(so); #endif if ((error = raw_attach(so, proto, &rt_rawcb)) == 0) { rt_adjustcount(rp->rcb_proto.sp_protocol, 1); rp->rcb_laddr = &COMPATNAME(route_info).ri_src; rp->rcb_faddr = &COMPATNAME(route_info).ri_dst; rp->rcb_filter = COMPATNAME(route_filter); } splx(s); if (error) { kmem_free(rop, sizeof(*rop)); so->so_pcb = NULL; return error; } soisconnected(so); so->so_options |= SO_USELOOPBACK; KASSERT(solocked(so)); return error; } static void COMPATNAME(route_detach)(struct socket *so) { struct rawcb *rp = sotorawcb(so); struct routecb *rop = (struct routecb *)rp; int s; KASSERT(rp != NULL); KASSERT(solocked(so)); s = splsoftnet(); if (rop->rocb_missfilterlen != 0) kmem_free(rop->rocb_missfilter, rop->rocb_missfilterlen); rt_adjustcount(rp->rcb_proto.sp_protocol, -1); raw_detach(so); splx(s); } static int COMPATNAME(route_accept)(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); panic("route_accept"); return EOPNOTSUPP; } static int COMPATNAME(route_bind)(struct socket *so, struct sockaddr *nam, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_listen)(struct socket *so, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_connect)(struct socket *so, struct sockaddr *nam, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_connect2)(struct socket *so, struct socket *so2) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_disconnect)(struct socket *so) { struct rawcb *rp = sotorawcb(so); int s; KASSERT(solocked(so)); KASSERT(rp != NULL); s = splsoftnet(); soisdisconnected(so); raw_disconnect(rp); splx(s); return 0; } static int COMPATNAME(route_shutdown)(struct socket *so) { int s; KASSERT(solocked(so)); /* * Mark the connection as being incapable of further input. 
*/ s = splsoftnet(); socantsendmore(so); splx(s); return 0; } static int COMPATNAME(route_abort)(struct socket *so) { KASSERT(solocked(so)); panic("route_abort"); return EOPNOTSUPP; } static int COMPATNAME(route_ioctl)(struct socket *so, u_long cmd, void *nam, struct ifnet * ifp) { return EOPNOTSUPP; } static int COMPATNAME(route_stat)(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); return 0; } static int COMPATNAME(route_peeraddr)(struct socket *so, struct sockaddr *nam) { struct rawcb *rp = sotorawcb(so); KASSERT(solocked(so)); KASSERT(rp != NULL); KASSERT(nam != NULL); if (rp->rcb_faddr == NULL) return ENOTCONN; raw_setpeeraddr(rp, nam); return 0; } static int COMPATNAME(route_sockaddr)(struct socket *so, struct sockaddr *nam) { struct rawcb *rp = sotorawcb(so); KASSERT(solocked(so)); KASSERT(rp != NULL); KASSERT(nam != NULL); if (rp->rcb_faddr == NULL) return ENOTCONN; raw_setsockaddr(rp, nam); return 0; } static int COMPATNAME(route_rcvd)(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_recvoob)(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_send)(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { int error = 0; int s; KASSERT(solocked(so)); KASSERT(so->so_proto == &COMPATNAME(route_protosw)[0]); s = splsoftnet(); error = raw_send(so, m, nam, control, l, &COMPATNAME(route_output)); splx(s); return error; } static int COMPATNAME(route_sendoob)(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int COMPATNAME(route_purgeif)(struct socket *so, struct ifnet *ifp) { panic("route_purgeif"); return EOPNOTSUPP; } #if defined(INET) || defined(INET6) static int route_get_sdl_index(struct rt_addrinfo *info, int *sdl_index) { struct rtentry *nrt; int error; error = rtrequest1(RTM_GET, info, &nrt); if (error != 0) return error; /* * nrt->rt_ifp->if_index may not be correct * due to changing to ifplo0. */ *sdl_index = satosdl(nrt->rt_gateway)->sdl_index; rt_unref(nrt); return 0; } #endif static void route_get_sdl(const struct ifnet *ifp, const struct sockaddr *dst, struct sockaddr_dl *sdl, int *flags) { struct llentry *la; KASSERT(ifp != NULL); IF_AFDATA_RLOCK(ifp); switch (dst->sa_family) { case AF_INET: la = lla_lookup(LLTABLE(ifp), 0, dst); break; case AF_INET6: la = lla_lookup(LLTABLE6(ifp), 0, dst); break; default: la = NULL; KASSERTMSG(0, "Invalid AF=%d\n", dst->sa_family); break; } IF_AFDATA_RUNLOCK(ifp); void *a = (LLE_IS_VALID(la) && (la->la_flags & LLE_VALID) == LLE_VALID) ? &la->ll_addr : NULL; a = sockaddr_dl_init(sdl, sizeof(*sdl), ifp->if_index, ifp->if_type, NULL, 0, a, ifp->if_addrlen); KASSERT(a != NULL); if (la != NULL) { *flags = la->la_flags; LLE_RUNLOCK(la); } } static int route_output_report(struct rtentry *rt, struct rt_addrinfo *info, struct rt_xmsghdr *rtm, struct rt_xmsghdr **new_rtm) { int len, error; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { const struct ifaddr *rtifa; const struct ifnet *ifp = rt->rt_ifp; info->rti_info[RTAX_IFP] = ifp->if_dl->ifa_addr; /* rtifa used to be simply rt->rt_ifa. * If rt->rt_ifa != NULL, then * rt_get_ifa() != NULL. So this * ought to still be safe. 
--dyoung */ rtifa = rt_get_ifa(rt); info->rti_info[RTAX_IFA] = rtifa->ifa_addr; #ifdef RTSOCK_DEBUG if (info->rti_info[RTAX_IFA]->sa_family == AF_INET) { char ibuf[INET_ADDRSTRLEN]; char abuf[INET_ADDRSTRLEN]; printf("%s: copying out RTAX_IFA %s " "for info->rti_info[RTAX_DST] %s " "ifa_getifa %p ifa_seqno %p\n", __func__, RT_IN_PRINT(info, ibuf, RTAX_IFA), RT_IN_PRINT(info, abuf, RTAX_DST), (void *)rtifa->ifa_getifa, rtifa->ifa_seqno); } #endif /* RTSOCK_DEBUG */ if (ifp->if_flags & IFF_POINTOPOINT) info->rti_info[RTAX_BRD] = rtifa->ifa_dstaddr; else info->rti_info[RTAX_BRD] = NULL; rtm->rtm_index = ifp->if_index; } error = rt_msg2(rtm->rtm_type, info, NULL, NULL, &len); if (error) return error; if (len > rtm->rtm_msglen) { struct rt_xmsghdr *old_rtm = rtm; R_Malloc(*new_rtm, struct rt_xmsghdr *, len); if (*new_rtm == NULL) return ENOBUFS; (void)memcpy(*new_rtm, old_rtm, old_rtm->rtm_msglen); rtm = *new_rtm; } (void)rt_msg2(rtm->rtm_type, info, rtm, NULL, 0); rtm->rtm_flags = rt->rt_flags; rtm_setmetrics(rt, rtm); rtm->rtm_addrs = info->rti_addrs; return 0; } /*ARGSUSED*/ int COMPATNAME(route_output)(struct mbuf *m, struct socket *so) { struct sockproto proto = { .sp_family = PF_XROUTE, }; struct rt_xmsghdr hdr; struct rt_xmsghdr *rtm = NULL; struct rt_xmsghdr *old_rtm = NULL, *new_rtm = NULL; struct rtentry *rt = NULL; struct rtentry *saved_nrt = NULL; struct rt_addrinfo info; int len, error = 0; sa_family_t family; struct sockaddr_dl sdl; int bound = curlwp_bind(); bool do_rt_free = false; struct sockaddr_storage netmask; #define senderr(e) do { error = e; goto flush;} while (/*CONSTCOND*/ 0) if (m == NULL || ((m->m_len < sizeof(int32_t)) && (m = m_pullup(m, sizeof(int32_t))) == NULL)) { error = ENOBUFS; goto out; } if ((m->m_flags & M_PKTHDR) == 0) panic("%s", __func__); len = m->m_pkthdr.len; if (len < sizeof(*rtm)) { info.rti_info[RTAX_DST] = NULL; senderr(EINVAL); } m_copydata(m, 0, sizeof(hdr), &hdr); if (len != hdr.rtm_msglen) { info.rti_info[RTAX_DST] = NULL; senderr(EINVAL); } R_Malloc(rtm, struct rt_xmsghdr *, len); if (rtm == NULL) { info.rti_info[RTAX_DST] = NULL; senderr(ENOBUFS); } m_copydata(m, 0, len, rtm); if (rtm->rtm_version != RTM_XVERSION) { info.rti_info[RTAX_DST] = NULL; senderr(EPROTONOSUPPORT); } rtm->rtm_pid = curproc->p_pid; memset(&info, 0, sizeof(info)); info.rti_addrs = rtm->rtm_addrs; if (rt_xaddrs(rtm->rtm_type, (const char *)(rtm + 1), len + (char *)rtm, &info)) { senderr(EINVAL); } info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || (info.rti_info[RTAX_DST]->sa_family >= AF_MAX)) { senderr(EINVAL); } #ifdef RTSOCK_DEBUG if (info.rti_info[RTAX_DST]->sa_family == AF_INET) { char abuf[INET_ADDRSTRLEN]; printf("%s: extracted info.rti_info[RTAX_DST] %s\n", __func__, RT_IN_PRINT(&info, abuf, RTAX_DST)); } #endif /* RTSOCK_DEBUG */ if (info.rti_info[RTAX_GATEWAY] != NULL && (info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) { senderr(EINVAL); } /* * Verify that the socket has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. */ if (kauth_authorize_network(so->so_cred, KAUTH_NETWORK_ROUTE, 0, rtm, NULL, NULL) != 0) senderr(EACCES); /* * route(8) passes a sockaddr truncated with prefixlen. * The kernel doesn't expect such sockaddr and need to * use a buffer that is big enough for the sockaddr expected * (padded with 0's). We keep the original length of the sockaddr. 
*/ if (info.rti_info[RTAX_NETMASK]) { /* * Use the family of RTAX_DST, because RTAX_NETMASK * can have a zero family if it comes from the radix * tree via rt_mask(). */ socklen_t sa_len = sockaddr_getsize_by_family( info.rti_info[RTAX_DST]->sa_family); socklen_t masklen = sockaddr_getlen( info.rti_info[RTAX_NETMASK]); if (sa_len != 0 && sa_len > masklen) { KASSERT(sa_len <= sizeof(netmask)); memcpy(&netmask, info.rti_info[RTAX_NETMASK], masklen); memset((char *)&netmask + masklen, 0, sa_len - masklen); info.rti_info[RTAX_NETMASK] = sstocsa(&netmask); } } switch (rtm->rtm_type) { case RTM_ADD: if (info.rti_info[RTAX_GATEWAY] == NULL) { senderr(EINVAL); } #if defined(INET) || defined(INET6) /* support for new ARP/NDP code with keeping backcompat */ if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) { const struct sockaddr_dl *sdlp = satocsdl(info.rti_info[RTAX_GATEWAY]); /* Allow routing requests by interface index */ if (sdlp->sdl_nlen == 0 && sdlp->sdl_alen == 0 && sdlp->sdl_slen == 0) goto fallback; /* * Old arp binaries don't set the sdl_index * so we have to complement it. */ int sdl_index = sdlp->sdl_index; if (sdl_index == 0) { error = route_get_sdl_index(&info, &sdl_index); if (error != 0) goto fallback; } else if ( info.rti_info[RTAX_DST]->sa_family == AF_INET) { /* * XXX workaround for SIN_PROXY case; proxy arp * entry should be in an interface that has * a network route including the destination, * not a local (link) route that may not be a * desired place, for example a tap. */ const struct sockaddr_inarp *sina = (const struct sockaddr_inarp *) info.rti_info[RTAX_DST]; if (sina->sin_other & SIN_PROXY) { error = route_get_sdl_index(&info, &sdl_index); if (error != 0) goto fallback; } } error = lla_rt_output(rtm->rtm_type, rtm->rtm_flags, rtm->rtm_rmx.rmx_expire, &info, sdl_index); break; } fallback: #endif /* defined(INET) || defined(INET6) */ error = rtrequest1(rtm->rtm_type, &info, &saved_nrt); if (error == 0) { _rt_setmetrics(rtm->rtm_inits, rtm, saved_nrt); rt_unref(saved_nrt); } break; case RTM_DELETE: #if defined(INET) || defined(INET6) /* support for new ARP/NDP code */ if (info.rti_info[RTAX_GATEWAY] && (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) && (rtm->rtm_flags & RTF_LLDATA) != 0) { const struct sockaddr_dl *sdlp = satocsdl(info.rti_info[RTAX_GATEWAY]); error = lla_rt_output(rtm->rtm_type, rtm->rtm_flags, rtm->rtm_rmx.rmx_expire, &info, sdlp->sdl_index); rtm->rtm_flags &= ~RTF_UP; break; } #endif error = rtrequest1(rtm->rtm_type, &info, &saved_nrt); if (error != 0) break; rt = saved_nrt; do_rt_free = true; info.rti_info[RTAX_DST] = rt_getkey(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); info.rti_info[RTAX_TAG] = rt_gettag(rt); error = route_output_report(rt, &info, rtm, &new_rtm); if (error) senderr(error); if (new_rtm != NULL) { old_rtm = rtm; rtm = new_rtm; } break; case RTM_GET: case RTM_CHANGE: case RTM_LOCK: /* XXX This will mask info.rti_info[RTAX_DST] with * info.rti_info[RTAX_NETMASK] before * searching. It did not used to do that. 
--dyoung */ rt = NULL; error = rtrequest1(RTM_GET, &info, &rt); if (error != 0) senderr(error); if (rtm->rtm_type != RTM_GET) {/* XXX: too grotty */ if (memcmp(info.rti_info[RTAX_DST], rt_getkey(rt), info.rti_info[RTAX_DST]->sa_len) != 0) senderr(ESRCH); if (info.rti_info[RTAX_NETMASK] == NULL && rt_mask(rt) != NULL) senderr(ETOOMANYREFS); } /* * XXX if arp/ndp requests an L2 entry, we have to obtain * it from lltable while for the route command we have to * return a route as it is. How to distinguish them? * For newer arp/ndp, RTF_LLDATA flag set by arp/ndp * indicates an L2 entry is requested. For old arp/ndp * binaries, we check RTF_UP flag is NOT set; it works * by the fact that arp/ndp don't set it while the route * command sets it. */ if (((rtm->rtm_flags & RTF_LLDATA) != 0 || (rtm->rtm_flags & RTF_UP) == 0) && rtm->rtm_type == RTM_GET && sockaddr_cmp(rt_getkey(rt), info.rti_info[RTAX_DST]) != 0) { int ll_flags = 0; route_get_sdl(rt->rt_ifp, info.rti_info[RTAX_DST], &sdl, &ll_flags); info.rti_info[RTAX_GATEWAY] = sstocsa(&sdl); error = route_output_report(rt, &info, rtm, &new_rtm); if (error) senderr(error); if (new_rtm != NULL) { old_rtm = rtm; rtm = new_rtm; } rtm->rtm_flags |= RTF_LLDATA; rtm->rtm_flags &= ~RTF_CONNECTED; rtm->rtm_flags |= (ll_flags & LLE_STATIC) ? RTF_STATIC : 0; break; } switch (rtm->rtm_type) { case RTM_GET: info.rti_info[RTAX_DST] = rt_getkey(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); info.rti_info[RTAX_TAG] = rt_gettag(rt); error = route_output_report(rt, &info, rtm, &new_rtm); if (error) senderr(error); if (new_rtm != NULL) { old_rtm = rtm; rtm = new_rtm; } break; case RTM_CHANGE: #ifdef NET_MPSAFE /* * Release rt_so_mtx to avoid a deadlock with route_intr * and also serialize updating routes to avoid another. */ if (rt_updating) { /* Release to allow the updater to proceed */ rt_unref(rt); rt = NULL; } while (rt_updating) { error = cv_wait_sig(&rt_update_cv, rt_so_mtx); if (error != 0) goto flush; } if (rt == NULL) { error = rtrequest1(RTM_GET, &info, &rt); if (error != 0) goto flush; } rt_updating = true; mutex_exit(rt_so_mtx); error = rt_update_prepare(rt); if (error == 0) { error = rt_update(rt, &info, rtm); rt_update_finish(rt); } mutex_enter(rt_so_mtx); rt_updating = false; cv_broadcast(&rt_update_cv); #else error = rt_update(rt, &info, rtm); #endif if (error != 0) goto flush; /*FALLTHROUGH*/ case RTM_LOCK: rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); rt->rt_rmx.rmx_locks |= (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); break; } break; default: senderr(EOPNOTSUPP); } flush: if (rtm) { if (error) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; } family = info.rti_info[RTAX_DST] ? info.rti_info[RTAX_DST]->sa_family : 0; /* We cannot free old_rtm until we have stopped using the * pointers in info, some of which may point to sockaddrs * in old_rtm. */ if (old_rtm != NULL) Free(old_rtm); if (rt) { if (do_rt_free) { #ifdef NET_MPSAFE /* * Release rt_so_mtx to avoid a deadlock with * route_intr. */ mutex_exit(rt_so_mtx); rt_free(rt); mutex_enter(rt_so_mtx); #else rt_free(rt); #endif } else rt_unref(rt); } { struct rawcb *rp = NULL; /* * Check to see if we don't want our own messages. 
*/ if ((so->so_options & SO_USELOOPBACK) == 0) { if (COMPATNAME(route_info).ri_cb.any_count <= 1) { if (rtm) Free(rtm); m_freem(m); goto out; } /* There is another listener, so construct message */ rp = sotorawcb(so); } if (rtm) { m_copyback(m, 0, rtm->rtm_msglen, rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); Free(rtm); } if (rp) rp->rcb_proto.sp_family = 0; /* Avoid us */ if (family) proto.sp_protocol = family; if (m) raw_input(m, &proto, &COMPATNAME(route_info).ri_src, &COMPATNAME(route_info).ri_dst, &rt_rawcb); if (rp) rp->rcb_proto.sp_family = PF_XROUTE; } out: curlwp_bindx(bound); return error; } static int route_ctloutput(int op, struct socket *so, struct sockopt *sopt) { struct routecb *rop = sotoroutecb(so); int error = 0; unsigned char *rtm_type, *cp, *ep; size_t len; unsigned int msgfilter; struct sockaddr *sa; KASSERT(solocked(so)); if (sopt->sopt_level != AF_ROUTE) { error = ENOPROTOOPT; } else switch (op) { case PRCO_SETOPT: switch (sopt->sopt_name) { case RO_MSGFILTER: msgfilter = 0; for (rtm_type = sopt->sopt_data, len = sopt->sopt_size; len != 0; rtm_type++, len -= sizeof(*rtm_type)) { /* Guard against overflowing our storage. */ if (*rtm_type >= sizeof(msgfilter) * CHAR_BIT) { error = EOVERFLOW; break; } msgfilter |= RTMSGFILTER(*rtm_type); } if (error == 0) rop->rocb_msgfilter = msgfilter; break; case RO_MISSFILTER: /* Validate the data */ len = 0; cp = sopt->sopt_data; ep = cp + sopt->sopt_size; while (cp < ep) { if (ep - cp < offsetof(struct sockaddr, sa_len) + sizeof(sa->sa_len)) break; if (++len > RO_FILTSA_MAX) { error = ENOBUFS; break; } sa = (struct sockaddr *)cp; if (sa->sa_len < _SA_MINSIZE || sa->sa_len >sizeof(struct sockaddr_storage)) return EINVAL; cp += RT_XROUNDUP(sa->sa_len); } if (cp != ep) { if (error == 0) error = EINVAL; break; } if (rop->rocb_missfilterlen != 0) kmem_free(rop->rocb_missfilter, rop->rocb_missfilterlen); if (sopt->sopt_size != 0) { rop->rocb_missfilter = kmem_alloc(sopt->sopt_size, KM_SLEEP); if (rop->rocb_missfilter == NULL) { rop->rocb_missfilterlen = 0; error = ENOBUFS; break; } } else rop->rocb_missfilter = NULL; rop->rocb_missfilterlen = sopt->sopt_size; if (rop->rocb_missfilterlen != 0) memcpy(rop->rocb_missfilter, sopt->sopt_data, rop->rocb_missfilterlen); break; default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (sopt->sopt_name) { case RO_MSGFILTER: error = ENOTSUP; break; default: error = ENOPROTOOPT; break; } } return error; } static void _rt_setmetrics(int which, const struct rt_xmsghdr *in, struct rtentry *out) { #define metric(f, e) if (which & (f)) out->rt_rmx.e = in->rtm_rmx.e; metric(RTV_RPIPE, rmx_recvpipe); metric(RTV_SPIPE, rmx_sendpipe); metric(RTV_SSTHRESH, rmx_ssthresh); metric(RTV_RTT, rmx_rtt); metric(RTV_RTTVAR, rmx_rttvar); metric(RTV_HOPCOUNT, rmx_hopcount); metric(RTV_MTU, rmx_mtu); #undef metric if (which & RTV_EXPIRE) { out->rt_rmx.rmx_expire = in->rtm_rmx.rmx_expire ? time_wall_to_mono(in->rtm_rmx.rmx_expire) : 0; } } static void rtm_setmetrics(const struct rtentry *in, struct rt_xmsghdr *out) { #define metric(e) out->rtm_rmx.e = in->rt_rmx.e; metric(rmx_recvpipe); metric(rmx_sendpipe); metric(rmx_ssthresh); metric(rmx_rtt); metric(rmx_rttvar); metric(rmx_hopcount); metric(rmx_mtu); metric(rmx_locks); #undef metric out->rtm_rmx.rmx_expire = in->rt_rmx.rmx_expire ? 
time_mono_to_wall(in->rt_rmx.rmx_expire) : 0; } static int rt_xaddrs(u_char rtmtype, const char *cp, const char *cplim, struct rt_addrinfo *rtinfo) { const struct sockaddr *sa = NULL; /* Quell compiler warning */ int i; for (i = 0; i < RTAX_MAX && cp < cplim; i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; rtinfo->rti_info[i] = sa = (const struct sockaddr *)cp; RT_XADVANCE(cp, sa); } /* * Check for extra addresses specified, except RTM_GET asking * for interface info. */ if (rtmtype == RTM_GET) { if (((rtinfo->rti_addrs & (~((1 << RTAX_IFP) | (1 << RTAX_IFA)))) & (~0U << i)) != 0) return 1; } else if ((rtinfo->rti_addrs & (~0U << i)) != 0) return 1; /* Check for bad data length. */ if (cp != cplim) { if (i == RTAX_NETMASK + 1 && sa != NULL && cp - RT_XROUNDUP(sa->sa_len) + sa->sa_len == cplim) /* * The last sockaddr was info.rti_info[RTAX_NETMASK]. * We accept this for now for the sake of old * binaries or third party softwares. */ ; else return 1; } return 0; } static int rt_getlen(int type) { RTS_CTASSERT(__alignof(struct ifa_msghdr) >= sizeof(uint64_t)); RTS_CTASSERT(__alignof(struct if_msghdr) >= sizeof(uint64_t)); RTS_CTASSERT(__alignof(struct if_announcemsghdr) >= sizeof(uint64_t)); RTS_CTASSERT(__alignof(struct rt_msghdr) >= sizeof(uint64_t)); switch (type) { case RTM_ODELADDR: case RTM_ONEWADDR: case RTM_OCHGADDR: if (rtsock_iflist_70_hook.hooked) return sizeof(struct ifa_msghdr70); else { #ifdef RTSOCK_DEBUG printf("%s: unsupported RTM type %d\n", __func__, type); #endif return -1; } case RTM_DELADDR: case RTM_NEWADDR: case RTM_CHGADDR: return sizeof(struct ifa_xmsghdr); case RTM_OOIFINFO: if (rtsock_iflist_14_hook.hooked) return sizeof(struct if_msghdr14); else { #ifdef RTSOCK_DEBUG printf("%s: unsupported RTM type RTM_OOIFINFO\n", __func__); #endif return -1; } case RTM_OIFINFO: if (rtsock_iflist_50_hook.hooked) return sizeof(struct if_msghdr50); else { #ifdef RTSOCK_DEBUG printf("%s: unsupported RTM type RTM_OIFINFO\n", __func__); #endif return -1; } case RTM_IFINFO: return sizeof(struct if_xmsghdr); case RTM_IFANNOUNCE: case RTM_IEEE80211: return sizeof(struct if_xannouncemsghdr); default: return sizeof(struct rt_xmsghdr); } } struct mbuf * COMPATNAME(rt_msg1)(int type, struct rt_addrinfo *rtinfo, void *data, int datalen) { struct rt_xmsghdr *rtm; struct mbuf *m; int i; const struct sockaddr *sa; int len, dlen; m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return m; MCLAIM(m, &COMPATNAME(routedomain).dom_mowner); if ((len = rt_getlen(type)) == -1) goto out; if (len > MHLEN + MLEN) panic("%s: message too long", __func__); else if (len > MHLEN) { m->m_next = m_get(M_DONTWAIT, MT_DATA); if (m->m_next == NULL) goto out; MCLAIM(m->m_next, m->m_owner); m->m_pkthdr.len = len; m->m_len = MHLEN; m->m_next->m_len = len - MHLEN; } else { m->m_pkthdr.len = m->m_len = len; } m_reset_rcvif(m); m_copyback(m, 0, datalen, data); if (len > datalen) (void)memset(mtod(m, char *) + datalen, 0, len - datalen); rtm = mtod(m, struct rt_xmsghdr *); for (i = 0; i < RTAX_MAX; i++) { if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = RT_XROUNDUP(sa->sa_len); m_copyback(m, len, sa->sa_len, sa); if (dlen != sa->sa_len) { /* * Up to 7 + 1 nul's since roundup is to * sizeof(uint64_t) (8 bytes) */ m_copyback(m, len + sa->sa_len, dlen - sa->sa_len, "\0\0\0\0\0\0\0"); } len += dlen; } if (m->m_pkthdr.len != len) goto out; rtm->rtm_msglen = len; rtm->rtm_version = RTM_XVERSION; rtm->rtm_type = type; return m; out: m_freem(m); return NULL; } /* * rt_msg2 * * 
fills 'cp' or 'w'.w_tmem with the routing socket message and * returns the length of the message in 'lenp'. * * if walkarg is 0, cp is expected to be 0 or a buffer large enough to hold * the message * otherwise walkarg's w_needed is updated and if the user buffer is * specified and w_needed indicates space exists the information is copied * into the temp space (w_tmem). w_tmem is [re]allocated if necessary, * if the allocation fails ENOBUFS is returned. */ static int rt_msg2(int type, struct rt_addrinfo *rtinfo, void *cpv, struct rt_walkarg *w, int *lenp) { int i; int len, dlen, second_time = 0; char *cp0, *cp = cpv; rtinfo->rti_addrs = 0; again: if ((len = rt_getlen(type)) == -1) return EINVAL; if ((cp0 = cp) != NULL) cp += len; for (i = 0; i < RTAX_MAX; i++) { const struct sockaddr *sa; if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = RT_XROUNDUP(sa->sa_len); if (cp) { int diff = dlen - sa->sa_len; (void)memcpy(cp, sa, (size_t)sa->sa_len); cp += sa->sa_len; if (diff > 0) { (void)memset(cp, 0, (size_t)diff); cp += diff; } } len += dlen; } if (cp == NULL && w != NULL && !second_time) { struct rt_walkarg *rw = w; rw->w_needed += len; if (rw->w_needed <= 0 && rw->w_where) { if (rw->w_tmemsize < len) { if (rw->w_tmem) kmem_free(rw->w_tmem, rw->w_tmemsize); rw->w_tmem = kmem_zalloc(len, KM_SLEEP); rw->w_tmemsize = len; } if (rw->w_tmem) { cp = rw->w_tmem; second_time = 1; goto again; } else { rw->w_tmemneeded = len; return ENOBUFS; } } } if (cp) { struct rt_xmsghdr *rtm = (struct rt_xmsghdr *)cp0; rtm->rtm_version = RTM_XVERSION; rtm->rtm_type = type; rtm->rtm_msglen = len; } if (lenp) *lenp = len; return 0; } /* * This routine is called to generate a message from the routing * socket indicating that a redirect has occurred, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. */ void COMPATNAME(rt_missmsg)(int type, const struct rt_addrinfo *rtinfo, int flags, int error) { struct rt_xmsghdr rtm; struct mbuf *m; const struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; struct rt_addrinfo info = *rtinfo; COMPATCALL(rt_missmsg, (type, rtinfo, flags, error)); if (COMPATNAME(route_info).ri_cb.any_count == 0) return; memset(&rtm, 0, sizeof(rtm)); rtm.rtm_pid = curproc->p_pid; rtm.rtm_flags = RTF_DONE | flags; rtm.rtm_errno = error; m = COMPATNAME(rt_msg1)(type, &info, &rtm, sizeof(rtm)); if (m == NULL) return; mtod(m, struct rt_xmsghdr *)->rtm_addrs = info.rti_addrs; COMPATNAME(route_enqueue)(m, sa ? sa->sa_family : 0); } /* * This routine is called to generate a message from the routing * socket indicating that the status of a network interface has changed. */ void COMPATNAME(rt_ifmsg)(struct ifnet *ifp) { struct if_xmsghdr ifm; struct mbuf *m; struct rt_addrinfo info; COMPATCALL(rt_ifmsg, (ifp)); if (COMPATNAME(route_info).ri_cb.any_count == 0) return; (void)memset(&info, 0, sizeof(info)); (void)memset(&ifm, 0, sizeof(ifm)); ifm.ifm_index = ifp->if_index; ifm.ifm_flags = ifp->if_flags; if_export_if_data(ifp, &ifm.ifm_data, false); ifm.ifm_addrs = 0; m = COMPATNAME(rt_msg1)(RTM_IFINFO, &info, &ifm, sizeof(ifm)); if (m == NULL) return; COMPATNAME(route_enqueue)(m, 0); MODULE_HOOK_CALL_VOID(rtsock_oifmsg_14_hook, (ifp), __nothing); MODULE_HOOK_CALL_VOID(rtsock_oifmsg_50_hook, (ifp), __nothing); } /* * This is called to generate messages from the routing socket * indicating a network interface has had addresses associated with it. 
* if we ever reverse the logic and replace messages TO the routing * socket indicate a request to configure interfaces, then it will * be unnecessary as the routing socket will automatically generate * copies of it. */ static void COMPATNAME(rt_addrmsg0)(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt, const struct sockaddr *src) { #define cmdpass(__cmd, __pass) (((__cmd) << 2) | (__pass)) struct rt_addrinfo info; const struct sockaddr *sa; int pass; struct mbuf *m; struct ifnet *ifp; struct rt_xmsghdr rtm; struct ifa_xmsghdr ifam; int ncmd; KASSERT(ifa != NULL); KASSERT(ifa->ifa_addr != NULL); ifp = ifa->ifa_ifp; if (cmd == RTM_ADD && vec_sctp_add_ip_address != NULL) { (*vec_sctp_add_ip_address)(ifa); } else if (cmd == RTM_DELETE && vec_sctp_delete_ip_address != NULL) { (*vec_sctp_delete_ip_address)(ifa); } COMPATCALL(rt_addrmsg_rt, (cmd, ifa, error, rt)); if (COMPATNAME(route_info).ri_cb.any_count == 0) return; for (pass = 1; pass < 3; pass++) { memset(&info, 0, sizeof(info)); switch (cmdpass(cmd, pass)) { case cmdpass(RTM_ADD, 1): case cmdpass(RTM_CHANGE, 1): case cmdpass(RTM_DELETE, 2): case cmdpass(RTM_NEWADDR, 1): case cmdpass(RTM_DELADDR, 1): case cmdpass(RTM_CHGADDR, 1): switch (cmd) { case RTM_ADD: ncmd = RTM_XNEWADDR; break; case RTM_DELETE: ncmd = RTM_XDELADDR; break; case RTM_CHANGE: ncmd = RTM_XCHGADDR; break; case RTM_NEWADDR: ncmd = RTM_XNEWADDR; break; case RTM_DELADDR: ncmd = RTM_XDELADDR; break; case RTM_CHGADDR: ncmd = RTM_XCHGADDR; break; default: panic("%s: unknown command %d", __func__, cmd); } MODULE_HOOK_CALL_VOID(rtsock_newaddr_70_hook, (ncmd, ifa), __nothing); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; KASSERT(ifp->if_dl != NULL); info.rti_info[RTAX_IFP] = ifp->if_dl->ifa_addr; info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; info.rti_info[RTAX_AUTHOR] = src; memset(&ifam, 0, sizeof(ifam)); ifam.ifam_index = ifp->if_index; ifam.ifam_metric = ifa->ifa_metric; ifam.ifam_flags = ifa->ifa_flags; #ifndef COMPAT_RTSOCK ifam.ifam_pid = curproc->p_pid; ifam.ifam_addrflags = if_addrflags(ifa); #endif m = COMPATNAME(rt_msg1)(ncmd, &info, &ifam, sizeof(ifam)); if (m == NULL) continue; mtod(m, struct ifa_xmsghdr *)->ifam_addrs = info.rti_addrs; break; case cmdpass(RTM_ADD, 2): case cmdpass(RTM_CHANGE, 2): case cmdpass(RTM_DELETE, 1): if (rt == NULL) continue; info.rti_info[RTAX_NETMASK] = rt_mask(rt); info.rti_info[RTAX_DST] = sa = rt_getkey(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; memset(&rtm, 0, sizeof(rtm)); rtm.rtm_pid = curproc->p_pid; rtm.rtm_index = ifp->if_index; rtm.rtm_flags |= rt->rt_flags; rtm.rtm_errno = error; m = COMPATNAME(rt_msg1)(cmd, &info, &rtm, sizeof(rtm)); if (m == NULL) continue; mtod(m, struct rt_xmsghdr *)->rtm_addrs = info.rti_addrs; break; default: continue; } KASSERTMSG(m != NULL, "called with wrong command"); COMPATNAME(route_enqueue)(m, sa ? 
sa->sa_family : 0); } #undef cmdpass } void COMPATNAME(rt_addrmsg)(int cmd, struct ifaddr *ifa) { COMPATNAME(rt_addrmsg0)(cmd, ifa, 0, NULL, NULL); } void COMPATNAME(rt_addrmsg_rt)(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) { COMPATNAME(rt_addrmsg0)(cmd, ifa, error, rt, NULL); } void COMPATNAME(rt_addrmsg_src)(int cmd, struct ifaddr *ifa, const struct sockaddr *src) { COMPATNAME(rt_addrmsg0)(cmd, ifa, 0, NULL, src); } static struct mbuf * rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, struct rt_addrinfo *info) { struct if_xannouncemsghdr ifan; memset(info, 0, sizeof(*info)); memset(&ifan, 0, sizeof(ifan)); ifan.ifan_index = ifp->if_index; strlcpy(ifan.ifan_name, ifp->if_xname, sizeof(ifan.ifan_name)); ifan.ifan_what = what; return COMPATNAME(rt_msg1)(type, info, &ifan, sizeof(ifan)); } /* * This is called to generate routing socket messages indicating * network interface arrival and departure. */ void COMPATNAME(rt_ifannouncemsg)(struct ifnet *ifp, int what) { struct mbuf *m; struct rt_addrinfo info; COMPATCALL(rt_ifannouncemsg, (ifp, what)); if (COMPATNAME(route_info).ri_cb.any_count == 0) return; m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); if (m == NULL) return; COMPATNAME(route_enqueue)(m, 0); } /* * This is called to generate routing socket messages indicating * IEEE80211 wireless events. * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way. */ void COMPATNAME(rt_ieee80211msg)(struct ifnet *ifp, int what, void *data, size_t data_len) { struct mbuf *m; struct rt_addrinfo info; COMPATCALL(rt_ieee80211msg, (ifp, what, data, data_len)); if (COMPATNAME(route_info).ri_cb.any_count == 0) return; m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info); if (m == NULL) return; /* * Append the ieee80211 data. Try to stick it in the * mbuf containing the ifannounce msg; otherwise allocate * a new mbuf and append. * * NB: we assume m is a single mbuf. */ if (data_len > M_TRAILINGSPACE(m)) { struct mbuf *n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } (void)memcpy(mtod(n, void *), data, data_len); n->m_len = data_len; m->m_next = n; } else if (data_len > 0) { (void)memcpy(mtod(m, uint8_t *) + m->m_len, data, data_len); m->m_len += data_len; } if (m->m_flags & M_PKTHDR) m->m_pkthdr.len += data_len; mtod(m, struct if_xannouncemsghdr *)->ifan_msglen += data_len; COMPATNAME(route_enqueue)(m, 0); } /* * Routing message software interrupt routine */ static void COMPATNAME(route_intr)(void *cookie) { struct sockproto proto = { .sp_family = PF_XROUTE, }; struct route_info * const ri = &COMPATNAME(route_info); struct mbuf *m; SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE(); for (;;) { IFQ_LOCK(&ri->ri_intrq); IF_DEQUEUE(&ri->ri_intrq, m); IFQ_UNLOCK(&ri->ri_intrq); if (m == NULL) break; proto.sp_protocol = M_GETCTX(m, uintptr_t); #ifdef NET_MPSAFE mutex_enter(rt_so_mtx); #endif raw_input(m, &proto, &ri->ri_src, &ri->ri_dst, &rt_rawcb); #ifdef NET_MPSAFE mutex_exit(rt_so_mtx); #endif } SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } /* * Enqueue a message to the software interrupt routine. 
*/ void COMPATNAME(route_enqueue)(struct mbuf *m, int family) { struct route_info * const ri = &COMPATNAME(route_info); int wasempty; IFQ_LOCK(&ri->ri_intrq); if (IF_QFULL(&ri->ri_intrq)) { printf("%s: queue full, dropped message\n", __func__); IF_DROP(&ri->ri_intrq); IFQ_UNLOCK(&ri->ri_intrq); m_freem(m); } else { wasempty = IF_IS_EMPTY(&ri->ri_intrq); M_SETCTX(m, (uintptr_t)family); IF_ENQUEUE(&ri->ri_intrq, m); IFQ_UNLOCK(&ri->ri_intrq); if (wasempty) { kpreempt_disable(); softint_schedule(ri->ri_sih); kpreempt_enable(); } } } static void COMPATNAME(route_init)(void) { struct route_info * const ri = &COMPATNAME(route_info); #ifndef COMPAT_RTSOCK rt_init(); #ifdef NET_MPSAFE rt_so_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); cv_init(&rt_update_cv, "rtsock_cv"); #endif sysctl_net_route_setup(NULL, PF_ROUTE, "rtable"); #endif ri->ri_intrq.ifq_maxlen = ri->ri_maxqlen; ri->ri_sih = softint_establish(SOFTINT_NET | SOFTINT_MPSAFE, COMPATNAME(route_intr), NULL); IFQ_LOCK_INIT(&ri->ri_intrq); #ifdef MBUFTRACE MOWNER_ATTACH(&COMPATNAME(routedomain).dom_mowner); #endif } /* * Definitions of protocols supported in the ROUTE domain. */ #ifndef COMPAT_RTSOCK PR_WRAP_USRREQS(route); #else PR_WRAP_USRREQS(compat_50_route); #endif static const struct pr_usrreqs route_usrreqs = { .pr_attach = COMPATNAME(route_attach_wrapper), .pr_detach = COMPATNAME(route_detach_wrapper), .pr_accept = COMPATNAME(route_accept_wrapper), .pr_bind = COMPATNAME(route_bind_wrapper), .pr_listen = COMPATNAME(route_listen_wrapper), .pr_connect = COMPATNAME(route_connect_wrapper), .pr_connect2 = COMPATNAME(route_connect2_wrapper), .pr_disconnect = COMPATNAME(route_disconnect_wrapper), .pr_shutdown = COMPATNAME(route_shutdown_wrapper), .pr_abort = COMPATNAME(route_abort_wrapper), .pr_ioctl = COMPATNAME(route_ioctl_wrapper), .pr_stat = COMPATNAME(route_stat_wrapper), .pr_peeraddr = COMPATNAME(route_peeraddr_wrapper), .pr_sockaddr = COMPATNAME(route_sockaddr_wrapper), .pr_rcvd = COMPATNAME(route_rcvd_wrapper), .pr_recvoob = COMPATNAME(route_recvoob_wrapper), .pr_send = COMPATNAME(route_send_wrapper), .pr_sendoob = COMPATNAME(route_sendoob_wrapper), .pr_purgeif = COMPATNAME(route_purgeif_wrapper), }; static const struct protosw COMPATNAME(route_protosw)[] = { { .pr_type = SOCK_RAW, .pr_domain = &COMPATNAME(routedomain), .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctlinput = raw_ctlinput, .pr_ctloutput = route_ctloutput, .pr_usrreqs = &route_usrreqs, .pr_init = rt_pr_init, }, }; struct domain COMPATNAME(routedomain) = { .dom_family = PF_XROUTE, .dom_name = DOMAINNAME, .dom_init = COMPATNAME(route_init), .dom_protosw = COMPATNAME(route_protosw), .dom_protoswNPROTOSW = &COMPATNAME(route_protosw)[__arraycount(COMPATNAME(route_protosw))], #ifdef MBUFTRACE .dom_mowner = MOWNER_INIT("route", "rtm"), #endif };
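On the consuming side, a route(4) listener undoes what rt_msg1()/rt_msg2() did: each address whose bit is set in rtm_addrs follows the header in RTAX_* order, padded to RT_ROUNDUP() alignment, which is exactly the layout rt_xaddrs() parses above. A hedged userland sketch of that walk (the function name is ours):

#include <sys/socket.h>
#include <net/route.h>
#include <string.h>

static void
example_walk_addrs(const struct rt_msghdr *rtm,
    const struct sockaddr *info[RTAX_MAX])
{
	const char *cp = (const char *)(rtm + 1);
	const char *ep = (const char *)rtm + rtm->rtm_msglen;
	int i;

	memset(info, 0, RTAX_MAX * sizeof(info[0]));
	for (i = 0; i < RTAX_MAX && cp < ep; i++) {
		if ((rtm->rtm_addrs & (1 << i)) == 0)
			continue;
		info[i] = (const struct sockaddr *)cp;
		cp += RT_ROUNDUP(info[i]->sa_len);
	}
}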
/* $NetBSD: ufs_rename.c,v 1.14 2021/10/20 03:08:19 thorpej Exp $ */ /*- * Copyright (c) 2012 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ /* * UFS Rename */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ufs_rename.c,v 1.14 2021/10/20 03:08:19 thorpej Exp $"); #include <sys/param.h> #include <sys/buf.h> #include <sys/errno.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/pool.h> #include <sys/vnode.h> #include <sys/vnode_if.h> #include <sys/wapbl.h> #include <miscfs/genfs/genfs.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufs_bswap.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_wapbl.h> #include <ufs/ufs/ufsmount.h> /* * Forward declarations */ static int ufs_sane_rename(struct vnode *, struct componentname *, struct vnode *, struct componentname *, kauth_cred_t, bool); static bool ufs_rename_ulr_overlap_p(const struct ufs_lookup_results *, const struct ufs_lookup_results *); static int ufs_rename_recalculate_fulr(struct vnode *, struct ufs_lookup_results *, const struct ufs_lookup_results *, const struct componentname *); static int ufs_direct_namlen(const struct direct *, const struct vnode *); static int ufs_read_dotdot(struct vnode *, kauth_cred_t, ino_t *); static int ufs_dirbuf_dotdot_namlen(const struct dirtemplate *, const struct vnode *); static const struct genfs_rename_ops ufs_genfs_rename_ops; /* * ufs_sane_rename: The hairiest vop, with the saner API. * * Arguments: * * . fdvp (from directory vnode), * . fcnp (from component name), * . tdvp (to directory vnode), * . tcnp (to component name), * . cred (credentials structure), and * . posixly_correct (flag for behaviour if target & source link same file). * * fdvp and tdvp may be the same, and must be referenced and unlocked. */ static int ufs_sane_rename( struct vnode *fdvp, struct componentname *fcnp, struct vnode *tdvp, struct componentname *tcnp, kauth_cred_t cred, bool posixly_correct) { struct ufs_lookup_results fulr, tulr; return genfs_sane_rename(&ufs_genfs_rename_ops, fdvp, fcnp, &fulr, tdvp, tcnp, &tulr, cred, posixly_correct); } /* * ufs_rename: The hairiest vop, with the insanest API. Defer to * genfs_insane_rename immediately. */ int ufs_rename(void *v) { return genfs_insane_rename(v, &ufs_sane_rename); } /* * ufs_gro_directory_empty_p: Return true if the directory vp is * empty. dvp is its parent. * * vp and dvp must be locked and referenced. */ bool ufs_gro_directory_empty_p(struct mount *mp, kauth_cred_t cred, struct vnode *vp, struct vnode *dvp) { (void)mp; KASSERT(mp != NULL); KASSERT(vp != NULL); KASSERT(dvp != NULL); KASSERT(vp != dvp); KASSERT(vp->v_mount == mp); KASSERT(dvp->v_mount == mp); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); return ufs_dirempty(VTOI(vp), VTOI(dvp)->i_number, cred); } /* * ufs_gro_rename_check_possible: Check whether a rename is possible * independent of credentials. 
*/ int ufs_gro_rename_check_possible(struct mount *mp, struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp) { (void)mp; KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == mp); KASSERT(fvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); KASSERT((tvp == NULL) || (tvp->v_mount == mp)); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); return genfs_ufslike_rename_check_possible( VTOI(fdvp)->i_flags, VTOI(fvp)->i_flags, VTOI(tdvp)->i_flags, (tvp? VTOI(tvp)->i_flags : 0), (tvp != NULL), IMMUTABLE, APPEND); } /* * ufs_gro_rename_check_permitted: Check whether a rename is permitted * given our credentials. */ int ufs_gro_rename_check_permitted(struct mount *mp, kauth_cred_t cred, struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp) { (void)mp; KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == mp); KASSERT(fvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); KASSERT((tvp == NULL) || (tvp->v_mount == mp)); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); return genfs_ufslike_rename_check_permitted(cred, fdvp, VTOI(fdvp)->i_mode, VTOI(fdvp)->i_uid, fvp, VTOI(fvp)->i_uid, tdvp, VTOI(tdvp)->i_mode, VTOI(tdvp)->i_uid, tvp, (tvp? VTOI(tvp)->i_uid : 0)); } /* * ufs_gro_remove_check_possible: Check whether a remove is possible * independent of credentials. */ int ufs_gro_remove_check_possible(struct mount *mp, struct vnode *dvp, struct vnode *vp) { (void)mp; KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(vp != NULL); KASSERT(dvp != vp); KASSERT(dvp->v_type == VDIR); KASSERT(vp->v_type != VDIR); KASSERT(dvp->v_mount == mp); KASSERT(vp->v_mount == mp); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); return genfs_ufslike_remove_check_possible( VTOI(dvp)->i_flags, VTOI(vp)->i_flags, IMMUTABLE, APPEND); } /* * ufs_gro_remove_check_permitted: Check whether a remove is permitted * given our credentials. */ int ufs_gro_remove_check_permitted(struct mount *mp, kauth_cred_t cred, struct vnode *dvp, struct vnode *vp) { (void)mp; KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(vp != NULL); KASSERT(dvp != vp); KASSERT(dvp->v_type == VDIR); KASSERT(vp->v_type != VDIR); KASSERT(dvp->v_mount == mp); KASSERT(vp->v_mount == mp); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); return genfs_ufslike_remove_check_permitted(cred, dvp, VTOI(dvp)->i_mode, VTOI(dvp)->i_uid, vp, VTOI(vp)->i_uid); } /* * A virgin directory (no blushing please). * * XXX Copypasta from ufs_vnops.c. Kill! */ static const struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, ".", 0, UFS_DIRBLKSIZ - 12, DT_DIR, 2, ".." }; /* * ufs_gro_rename: Actually perform the rename operation. 
*/ int ufs_gro_rename(struct mount *mp, kauth_cred_t cred, struct vnode *fdvp, struct componentname *fcnp, void *fde, struct vnode *fvp, struct vnode *tdvp, struct componentname *tcnp, void *tde, struct vnode *tvp, nlink_t *tvp_nlinkp) { struct ufs_lookup_results *fulr = fde; struct ufs_lookup_results *tulr = tde; bool directory_p, reparent_p; struct direct *newdir; int error; KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fcnp != NULL); KASSERT(fulr != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(tcnp != NULL); KASSERT(tulr != NULL); KASSERT(fulr != tulr); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(fdvp->v_mount == mp); KASSERT(fvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); KASSERT((tvp == NULL) || (tvp->v_mount == mp)); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); /* * We shall need to temporarily bump the link count, so make * sure there is room to do so. */ if ((nlink_t)VTOI(fvp)->i_nlink >= LINK_MAX) return EMLINK; directory_p = (fvp->v_type == VDIR); KASSERT(directory_p == ((VTOI(fvp)->i_mode & IFMT) == IFDIR)); KASSERT((tvp == NULL) || (directory_p == (tvp->v_type == VDIR))); KASSERT((tvp == NULL) || (directory_p == ((VTOI(tvp)->i_mode & IFMT) == IFDIR))); reparent_p = (fdvp != tdvp); KASSERT(reparent_p == (VTOI(fdvp)->i_number != VTOI(tdvp)->i_number)); /* * Commence hacking of the data on disk. */ error = UFS_WAPBL_BEGIN(mp); if (error) goto ihateyou; /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ KASSERT((nlink_t)VTOI(fvp)->i_nlink < LINK_MAX); VTOI(fvp)->i_nlink++; DIP_ASSIGN(VTOI(fvp), nlink, VTOI(fvp)->i_nlink); VTOI(fvp)->i_flag |= IN_CHANGE; error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP); if (error) goto whymustithurtsomuch; /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (tvp == NULL) { /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. */ if (directory_p && reparent_p) { if ((nlink_t)VTOI(tdvp)->i_nlink >= LINK_MAX) { error = EMLINK; goto whymustithurtsomuch; } KASSERT((nlink_t)VTOI(tdvp)->i_nlink < LINK_MAX); VTOI(tdvp)->i_nlink++; DIP_ASSIGN(VTOI(tdvp), nlink, VTOI(tdvp)->i_nlink); VTOI(tdvp)->i_flag |= IN_CHANGE; error = UFS_UPDATE(tdvp, NULL, NULL, UPDATE_DIROP); if (error) { /* * Link count update didn't take -- * back out the in-memory link count. */ KASSERT(0 < VTOI(tdvp)->i_nlink); VTOI(tdvp)->i_nlink--; DIP_ASSIGN(VTOI(tdvp), nlink, VTOI(tdvp)->i_nlink); VTOI(tdvp)->i_flag |= IN_CHANGE; goto whymustithurtsomuch; } } newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); ufs_makedirentry(VTOI(fvp), tcnp, newdir); error = ufs_direnter(tdvp, tulr, NULL, newdir, tcnp, NULL); pool_cache_put(ufs_direct_cache, newdir); if (error) { if (directory_p && reparent_p) { /* * Directory update didn't take, but * the link count update did -- back * out the in-memory link count and the * on-disk link count. 
*/ KASSERT(0 < VTOI(tdvp)->i_nlink); VTOI(tdvp)->i_nlink--; DIP_ASSIGN(VTOI(tdvp), nlink, VTOI(tdvp)->i_nlink); VTOI(tdvp)->i_flag |= IN_CHANGE; (void)UFS_UPDATE(tdvp, NULL, NULL, UPDATE_WAIT | UPDATE_DIROP); } goto whymustithurtsomuch; } } else { if (directory_p) /* XXX WTF? Why purge here? Why not purge others? */ cache_purge(tdvp); /* * Make the target directory's entry for tcnp point at * the source node. * * XXX ufs_dirrewrite decrements tvp's link count, but * doesn't touch the link count of the new inode. Go * figure. */ error = ufs_dirrewrite(VTOI(tdvp), tulr->ulr_offset, VTOI(tvp), VTOI(fvp)->i_number, IFTODT(VTOI(fvp)->i_mode), ((directory_p && reparent_p) ? reparent_p : directory_p), IN_CHANGE | IN_UPDATE); if (error) goto whymustithurtsomuch; /* * If the source and target are directories, and the * target is in the same directory as the source, * decrement the link count of the common parent * directory, since we are removing the target from * that directory. */ if (directory_p && !reparent_p) { KASSERT(fdvp == tdvp); /* XXX check, don't kassert */ KASSERT(0 < VTOI(tdvp)->i_nlink); VTOI(tdvp)->i_nlink--; DIP_ASSIGN(VTOI(tdvp), nlink, VTOI(tdvp)->i_nlink); VTOI(tdvp)->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0); } if (directory_p) { /* * XXX I don't understand the following comment * from ufs_rename -- in particular, the part * about `there may be other hard links'. * * Truncate inode. The only stuff left in the directory * is "." and "..". The "." reference is inconsequential * since we are quashing it. We have removed the "." * reference and the reference in the parent directory, * but there may be other hard links. * * XXX The ufs_dirempty call earlier does * not guarantee anything about nlink. */ if (VTOI(tvp)->i_nlink != 1) ufs_dirbad(VTOI(tvp), (doff_t)0, "hard-linked directory"); VTOI(tvp)->i_nlink = 0; DIP_ASSIGN(VTOI(tvp), nlink, 0); (void) UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC, cred); } } /* * If the source is a directory with a new parent, the link * count of the old parent directory must be decremented and * ".." set to point to the new parent. * * XXX ufs_dirrewrite updates the link count of fdvp, but not * the link count of fvp or the link count of tdvp. Go figure. */ if (directory_p && reparent_p) { error = ufs_dirrewrite(VTOI(fvp), mastertemplate.dot_reclen, VTOI(fdvp), VTOI(tdvp)->i_number, DT_DIR, 0, IN_CHANGE); #if 0 /* XXX This branch was not in ufs_rename! */ if (error) goto whymustithurtsomuch; #endif /* XXX WTF? Why purge here? Why not purge others? */ cache_purge(fdvp); } /* * 3) Unlink the source. */ /* * ufs_direnter may compact the directory in the process of * inserting a new entry. That may invalidate fulr, which we * need in order to remove the old entry. In that case, we * need to recalculate what fulr should be. */ if (!reparent_p && (tvp == NULL) && ufs_rename_ulr_overlap_p(fulr, tulr)) { error = ufs_rename_recalculate_fulr(fdvp, fulr, tulr, fcnp); #if 0 /* XXX */ if (error) /* XXX Try to back out changes? */ goto whymustithurtsomuch; #endif } /* * XXX 0 means !isrmdir. But can't this be an rmdir? * XXX Well, turns out that argument to ufs_dirremove is ignored... * XXX And it turns out ufs_dirremove updates the link count of fvp. * XXX But it doesn't update the link count of fdvp. Go figure. * XXX fdvp's link count is updated in ufs_dirrewrite instead. * XXX Actually, sometimes it doesn't update fvp's link count. * XXX I hate the world. 
*/ error = ufs_dirremove(fdvp, fulr, VTOI(fvp), fcnp->cn_flags, 0); if (error) #if 0 /* XXX */ goto whymustithurtsomuch; #endif goto arghmybrainhurts; if (tvp != NULL) { *tvp_nlinkp = VTOI(tvp)->i_nlink; } #if 0 /* XXX */ genfs_rename_cache_purge(fdvp, fvp, tdvp, tvp); #endif goto arghmybrainhurts; whymustithurtsomuch: KASSERT(0 < VTOI(fvp)->i_nlink); VTOI(fvp)->i_nlink--; DIP_ASSIGN(VTOI(fvp), nlink, VTOI(fvp)->i_nlink); VTOI(fvp)->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0); arghmybrainhurts: UFS_WAPBL_END(mp); ihateyou: return error; } /* * ufs_rename_ulr_overlap_p: True iff tulr overlaps with fulr so that * entering a directory entry at tulr may move fulr. */ static bool ufs_rename_ulr_overlap_p(const struct ufs_lookup_results *fulr, const struct ufs_lookup_results *tulr) { doff_t from_prev_start, from_prev_end, to_start, to_end; KASSERT(fulr != NULL); KASSERT(tulr != NULL); KASSERT(fulr != tulr); /* * fulr is from a DELETE lookup, so fulr->ulr_count is the size * of the preceding entry (d_reclen). */ from_prev_end = fulr->ulr_offset; KASSERT(fulr->ulr_count <= from_prev_end); from_prev_start = (from_prev_end - fulr->ulr_count); /* * tulr is from a RENAME lookup, so tulr->ulr_count is the size * of the free space for an entry that we are about to fill. */ to_start = tulr->ulr_offset; KASSERT(tulr->ulr_count < (UFS_MAXDIRSIZE - to_start)); to_end = (to_start + tulr->ulr_count); return (((to_start <= from_prev_start) && (from_prev_start < to_end)) || ((to_start <= from_prev_end) && (from_prev_end < to_end))); } /* * ufs_rename_recalculate_fulr: If we have just entered a directory into * dvp at tulr, and we were about to remove one at fulr for an entry * named fcnp, fulr may be invalid. So, if necessary, recalculate it. */ static int ufs_rename_recalculate_fulr(struct vnode *dvp, struct ufs_lookup_results *fulr, const struct ufs_lookup_results *tulr, const struct componentname *fcnp) { struct mount *mp; struct ufsmount *ump; int needswap; /* XXX int is a silly type for this; blame ufsmount::um_dirblksiz. */ int dirblksiz; doff_t search_start, search_end; doff_t offset; /* Offset of entry we're examining. */ struct buf *bp; /* I/O block we're examining. */ char *dirbuf; /* Pointer into directory at search_start. */ struct direct *ep; /* Pointer to the entry we're examining. */ /* XXX direct::d_reclen is 16-bit; * ufs_lookup_results::ulr_reclen is 32-bit. Blah. */ uint32_t reclen; /* Length of the entry we're examining. */ uint32_t prev_reclen; /* Length of the preceding entry. */ int error; KASSERT(dvp != NULL); KASSERT(dvp->v_mount != NULL); KASSERT(VTOI(dvp) != NULL); KASSERT(fulr != NULL); KASSERT(tulr != NULL); KASSERT(fulr != tulr); KASSERT(ufs_rename_ulr_overlap_p(fulr, tulr)); mp = dvp->v_mount; ump = VFSTOUFS(mp); KASSERT(ump != NULL); KASSERT(ump == VTOI(dvp)->i_ump); needswap = UFS_MPNEEDSWAP(ump); dirblksiz = ump->um_dirblksiz; KASSERT(0 < dirblksiz); KASSERT((dirblksiz & (dirblksiz - 1)) == 0); /* A directory block may not span across multiple I/O blocks. */ KASSERT(dirblksiz <= mp->mnt_stat.f_iosize); /* Find the bounds of the search. */ search_start = tulr->ulr_offset; KASSERT(fulr->ulr_reclen < (UFS_MAXDIRSIZE - fulr->ulr_offset)); search_end = (fulr->ulr_offset + fulr->ulr_reclen); /* Compaction must happen only within a directory block. 
(*) */ KASSERT(search_start <= search_end); KASSERT((search_end - (search_start &~ (dirblksiz - 1))) <= dirblksiz); dirbuf = NULL; bp = NULL; error = ufs_blkatoff(dvp, (off_t)search_start, &dirbuf, &bp, false); if (error) return error; KASSERT(dirbuf != NULL); KASSERT(bp != NULL); /* * Guarantee we sha'n't go past the end of the buffer we got. * dirbuf is bp->b_data + (search_start & (iosize - 1)), and * the valid range is [bp->b_data, bp->b_data + bp->b_bcount). */ KASSERT((search_end - search_start) <= (bp->b_bcount - (search_start & (mp->mnt_stat.f_iosize - 1)))); prev_reclen = fulr->ulr_count; offset = search_start; /* * Search from search_start to search_end for the entry matching * fcnp, which must be there because we found it before and it * should only at most have moved earlier. */ for (;;) { KASSERT(search_start <= offset); KASSERT(offset < search_end); /* * Examine the directory entry at offset. */ ep = (struct direct *)(dirbuf + (offset - search_start)); reclen = ufs_rw16(ep->d_reclen, needswap); if (ep->d_ino == 0) goto next; /* Entry is unused. */ if (ufs_rw32(ep->d_ino, needswap) == UFS_WINO) goto next; /* Entry is whiteout. */ if (fcnp->cn_namelen != ufs_direct_namlen(ep, dvp)) goto next; /* Wrong name length. */ if (memcmp(ep->d_name, fcnp->cn_nameptr, fcnp->cn_namelen)) goto next; /* Wrong name. */ /* Got it! */ break; next: if (! ((reclen < search_end) && (offset < (search_end - reclen)))) { brelse(bp, 0); return EIO; /* XXX Panic? What? */ } /* We may not move past the search end. */ KASSERT(reclen < search_end); KASSERT(offset < (search_end - reclen)); /* * We may not move across a directory block boundary; * see (*) above. */ KASSERT((offset &~ (dirblksiz - 1)) == ((offset + reclen) &~ (dirblksiz - 1))); prev_reclen = reclen; offset += reclen; } /* * Found the entry. Record where. */ fulr->ulr_offset = offset; fulr->ulr_reclen = reclen; /* * Record the preceding record length, but not if we're at the * start of a directory block. */ fulr->ulr_count = ((offset & (dirblksiz - 1))? prev_reclen : 0); brelse(bp, 0); return 0; } /* * ufs_direct_namlen: Return the namlen of the directory entry ep from * the directory vp. */ static int /* XXX int? uint8_t? */ ufs_direct_namlen(const struct direct *ep, const struct vnode *vp) { bool swap; KASSERT(ep != NULL); KASSERT(vp != NULL); KASSERT(VTOI(vp) != NULL); KASSERT(VTOI(vp)->i_ump != NULL); #if (BYTE_ORDER == LITTLE_ENDIAN) swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) == 0); #else swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) != 0); #endif return ((FSFMT(vp) && swap)? ep->d_type : ep->d_namlen); } /* * ufs_gro_remove: Rename an object over another link to itself, * effectively removing just the original link. */ int ufs_gro_remove(struct mount *mp, kauth_cred_t cred, struct vnode *dvp, struct componentname *cnp, void *de, struct vnode *vp, nlink_t *tvp_nlinkp) { struct ufs_lookup_results *ulr = de; int error; KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(cnp != NULL); KASSERT(ulr != NULL); KASSERT(vp != NULL); KASSERT(dvp != vp); KASSERT(dvp->v_mount == mp); KASSERT(vp->v_mount == mp); KASSERT(dvp->v_type == VDIR); KASSERT(vp->v_type != VDIR); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(cnp->cn_nameiop == DELETE); error = UFS_WAPBL_BEGIN(mp); if (error) goto out; /* XXX ufs_dirremove decrements vp's link count for us. 
*/ error = ufs_dirremove(dvp, ulr, VTOI(vp), cnp->cn_flags, 0); UFS_WAPBL_END(mp); *tvp_nlinkp = VTOI(vp)->i_nlink; out: return error; } /* * ufs_gro_lookup: Look up and save the lookup results. */ int ufs_gro_lookup(struct mount *mp, struct vnode *dvp, struct componentname *cnp, void *de_ret, struct vnode **vp_ret) { struct ufs_lookup_results *ulr_ret = de_ret; struct vnode *vp = NULL; int error; (void)mp; KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(cnp != NULL); KASSERT(ulr_ret != NULL); KASSERT(vp_ret != NULL); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); /* Kludge cargo-culted from dholland's ufs_rename. */ cnp->cn_flags &=~ MODMASK; cnp->cn_flags |= (LOCKPARENT | LOCKLEAF); error = relookup(dvp, &vp, cnp, 0 /* dummy */); if ((error == 0) && (vp == NULL)) { error = ENOENT; goto out; } else if (error) { return error; } /* * Thanks to VFS insanity, relookup locks vp, which screws us * in various ways. */ KASSERT(vp != NULL); VOP_UNLOCK(vp); out: *ulr_ret = VTOI(dvp)->i_crap; *vp_ret = vp; return error; } /* * ufs_rmdired_p: Check whether the directory vp has been rmdired. * * vp must be locked and referenced. */ static bool ufs_rmdired_p(struct vnode *vp) { KASSERT(vp != NULL); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(vp->v_type == VDIR); /* XXX Is this correct? */ return (VTOI(vp)->i_size == 0); } /* * ufs_read_dotdot: Store in *ino_ret the inode number of the parent * of the directory vp. */ static int ufs_read_dotdot(struct vnode *vp, kauth_cred_t cred, ino_t *ino_ret) { struct dirtemplate dirbuf; int error; KASSERT(vp != NULL); KASSERT(ino_ret != NULL); KASSERT(vp->v_type == VDIR); error = ufs_bufio(UIO_READ, vp, &dirbuf, sizeof dirbuf, (off_t)0, IO_NODELOCKED, cred, NULL, NULL); if (error) return error; if (ufs_dirbuf_dotdot_namlen(&dirbuf, vp) != 2 || dirbuf.dotdot_name[0] != '.' || dirbuf.dotdot_name[1] != '.') /* XXX Panic? Print warning? */ return ENOTDIR; *ino_ret = ufs_rw32(dirbuf.dotdot_ino, UFS_MPNEEDSWAP(VTOI(vp)->i_ump)); return 0; } /* * ufs_dirbuf_dotdot_namlen: Return the namlen of the directory buffer * dirbuf that came from the directory vp. Swap byte order if * necessary. */ static int /* XXX int? uint8_t? */ ufs_dirbuf_dotdot_namlen(const struct dirtemplate *dirbuf, const struct vnode *vp) { bool swap; KASSERT(dirbuf != NULL); KASSERT(vp != NULL); KASSERT(VTOI(vp) != NULL); KASSERT(VTOI(vp)->i_ump != NULL); #if (BYTE_ORDER == LITTLE_ENDIAN) swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) == 0); #else swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) != 0); #endif return ((FSFMT(vp) && swap)? dirbuf->dotdot_type : dirbuf->dotdot_namlen); } /* * ufs_gro_genealogy: Analyze the genealogy of the source and target * directories. */ int ufs_gro_genealogy(struct mount *mp, kauth_cred_t cred, struct vnode *fdvp, struct vnode *tdvp, struct vnode **intermediate_node_ret) { struct vnode *vp, *dvp; ino_t dotdot_ino = 0; /* XXX: gcc */ int error; KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != tdvp); KASSERT(intermediate_node_ret != NULL); KASSERT(fdvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); /* * We need to provisionally lock tdvp to keep rmdir from * deleting it -- or any ancestor -- at an inopportune moment. 
*/ error = ufs_gro_lock_directory(mp, tdvp); if (error) return error; vp = tdvp; vref(vp); for (;;) { KASSERT(vp != NULL); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(vp->v_mount == mp); KASSERT(vp->v_type == VDIR); KASSERT(!ufs_rmdired_p(vp)); /* Did we hit the root without finding fdvp? */ if (VTOI(vp)->i_number == UFS_ROOTINO) { vput(vp); *intermediate_node_ret = NULL; return 0; } error = ufs_read_dotdot(vp, cred, &dotdot_ino); if (error) { vput(vp); return error; } /* Did we find that fdvp is an ancestor of tdvp? */ if (VTOI(fdvp)->i_number == dotdot_ino) { /* Unlock vp, but keep it referenced. */ VOP_UNLOCK(vp); *intermediate_node_ret = vp; return 0; } /* Neither -- keep ascending the family tree. */ error = vcache_get(mp, &dotdot_ino, sizeof(dotdot_ino), &dvp); vput(vp); if (error) return error; error = vn_lock(dvp, LK_EXCLUSIVE); if (error) { vrele(dvp); return error; } KASSERT(dvp != NULL); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); vp = dvp; if (vp->v_type != VDIR) { /* * XXX Panic? Print a warning? Can this * happen if we lose the race I suspect to * exist above, and the `..' inode number has * been recycled? */ vput(vp); return ENOTDIR; } if (ufs_rmdired_p(vp)) { vput(vp); return ENOENT; } } } /* * ufs_gro_lock_directory: Lock the directory vp, but fail if it has * been rmdir'd. */ int ufs_gro_lock_directory(struct mount *mp, struct vnode *vp) { (void)mp; KASSERT(mp != NULL); KASSERT(vp != NULL); KASSERT(vp->v_mount == mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (ufs_rmdired_p(vp)) { VOP_UNLOCK(vp); return ENOENT; } return 0; } static const struct genfs_rename_ops ufs_genfs_rename_ops = { .gro_directory_empty_p = ufs_gro_directory_empty_p, .gro_rename_check_possible = ufs_gro_rename_check_possible, .gro_rename_check_permitted = ufs_gro_rename_check_permitted, .gro_remove_check_possible = ufs_gro_remove_check_possible, .gro_remove_check_permitted = ufs_gro_remove_check_permitted, .gro_rename = ufs_gro_rename, .gro_remove = ufs_gro_remove, .gro_lookup = ufs_gro_lookup, .gro_genealogy = ufs_gro_genealogy, .gro_lock_directory = ufs_gro_lock_directory, };
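As an illustration, not taken from this file: the test in ufs_rename_ulr_overlap_p above reduces to asking whether either endpoint of the bytes holding the entry that precedes the source entry (from_prev_start through from_prev_end) lands inside the half-open range [to_start, to_end) that the new target entry may claim during compaction. A standalone sketch of that arithmetic follows, with made-up offsets.

/*
 * Standalone sketch of the interval test used by ufs_rename_ulr_overlap_p.
 * The sample offsets in main() are invented for illustration only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t doff_ex_t;	/* stand-in for doff_t in this sketch */

static bool
ulr_overlap(doff_ex_t from_prev_start, doff_ex_t from_prev_end,
    doff_ex_t to_start, doff_ex_t to_end)
{
	/* Does [to_start, to_end) cover either endpoint of the
	   preceding-entry bytes of the source? */
	return ((to_start <= from_prev_start && from_prev_start < to_end) ||
	    (to_start <= from_prev_end && from_prev_end < to_end));
}

int
main(void)
{
	/* Target slot covers bytes [0, 512); previous entry ends at 24. */
	printf("%d\n", ulr_overlap(12, 24, 0, 512));	/* prints 1: overlaps */
	/* Target slot lies in a later directory block; no overlap. */
	printf("%d\n", ulr_overlap(12, 24, 512, 1024));	/* prints 0 */
	return 0;
}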
/* $NetBSD: uvm_pager.c,v 1.131 2024/03/15 07:09:37 andvar Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* * from: Id: uvm_pager.c,v 1.1.2.23 1998/02/02 20:38:06 chuck Exp */ /* * uvm_pager.c: generic functions used to assist the pagers. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_pager.c,v 1.131 2024/03/15 07:09:37 andvar Exp $"); #include "opt_uvmhist.h" #include "opt_readahead.h" #include "opt_pagermap.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/atomic.h> #include <sys/vnode.h> #include <sys/buf.h> #include <uvm/uvm.h> /* * XXX * this is needed until the device strategy interface * is changed to do physically-addressed i/o. */ #ifndef PAGER_MAP_DEFAULT_SIZE #define PAGER_MAP_DEFAULT_SIZE (16 * 1024 * 1024) #endif #ifndef PAGER_MAP_SIZE #define PAGER_MAP_SIZE PAGER_MAP_DEFAULT_SIZE #endif size_t pager_map_size = PAGER_MAP_SIZE; /* * list of uvm pagers in the system */ const struct uvm_pagerops * const uvmpagerops[] = { &aobj_pager, &uvm_deviceops, &uvm_vnodeops, &ubc_pager, }; /* * the pager map: provides KVA for I/O */ struct vm_map *pager_map; /* XXX */ kmutex_t pager_map_wanted_lock __cacheline_aligned; bool pager_map_wanted; /* locked by pager map */ static vaddr_t emergva; static int emerg_ncolors; static bool emerginuse; void uvm_pager_realloc_emerg(void) { vaddr_t new_emergva, old_emergva; int old_emerg_ncolors; if (__predict_true(emergva != 0 && emerg_ncolors >= uvmexp.ncolors)) return; KASSERT(!emerginuse); new_emergva = uvm_km_alloc(kernel_map, round_page(MAXPHYS) + ptoa(uvmexp.ncolors), ptoa(uvmexp.ncolors), UVM_KMF_VAONLY); KASSERT(new_emergva != 0); old_emergva = emergva; old_emerg_ncolors = emerg_ncolors; /* * don't support re-color in late boot anyway. */ if (0) /* XXX */ mutex_enter(&pager_map_wanted_lock); emergva = new_emergva; emerg_ncolors = uvmexp.ncolors; wakeup(&old_emergva); if (0) /* XXX */ mutex_exit(&pager_map_wanted_lock); if (old_emergva) uvm_km_free(kernel_map, old_emergva, round_page(MAXPHYS) + ptoa(old_emerg_ncolors), UVM_KMF_VAONLY); } /* * uvm_pager_init: init pagers (at boot time) */ void uvm_pager_init(void) { u_int lcv; vaddr_t sva, eva; /* * init pager map */ sva = 0; pager_map = uvm_km_suballoc(kernel_map, &sva, &eva, pager_map_size, 0, false, NULL); mutex_init(&pager_map_wanted_lock, MUTEX_DEFAULT, IPL_NONE); pager_map_wanted = false; uvm_pager_realloc_emerg(); /* * call pager init functions */ for (lcv = 0 ; lcv < __arraycount(uvmpagerops); lcv++) { if (uvmpagerops[lcv]->pgo_init) uvmpagerops[lcv]->pgo_init(); } } #ifdef PMAP_DIRECT /* * uvm_pagermapdirect: map a single page via the pmap's direct segment * * this is an abuse of pmap_direct_process(), since the kva is being grabbed * and no processing is taking place, but for now.. */ static int uvm_pagermapdirect(void *kva, size_t sz, void *cookie) { KASSERT(sz == PAGE_SIZE); *(vaddr_t *)cookie = (vaddr_t)kva; return 0; } #endif /* * uvm_pagermapin: map pages into KVA (pager_map) for I/O that needs mappings * * we basically just map in a blank map entry to reserve the space in the * map and then use pmap_enter() to put the mappings in by hand. */ vaddr_t uvm_pagermapin(struct vm_page **pps, int npages, int flags) { vsize_t size; vaddr_t kva; vaddr_t cva; struct vm_page *pp; vm_prot_t prot; const bool pdaemon = (curlwp == uvm.pagedaemon_lwp); const u_int first_color = VM_PGCOLOR(*pps); UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(pps=%#jx, npages=%jd, first_color=%ju)", (uintptr_t)pps, npages, first_color, 0); #ifdef PMAP_DIRECT /* * for a single page the direct mapped segment can be used. 
*/ if (npages == 1) { int error __diagused; KASSERT((pps[0]->flags & PG_BUSY) != 0); error = pmap_direct_process(VM_PAGE_TO_PHYS(pps[0]), 0, PAGE_SIZE, uvm_pagermapdirect, &kva); KASSERT(error == 0); UVMHIST_LOG(maphist, "<- done, direct (KVA=%#jx)", kva,0,0,0); return kva; } #endif /* * compute protection. outgoing I/O only needs read * access to the page, whereas incoming needs read/write. */ prot = VM_PROT_READ; if (flags & UVMPAGER_MAPIN_READ) prot |= VM_PROT_WRITE; ReStart: size = ptoa(npages); kva = 0; /* let system choose VA */ if (uvm_map(pager_map, &kva, size, NULL, UVM_UNKNOWN_OFFSET, first_color, UVM_FLAG_COLORMATCH | UVM_FLAG_NOMERGE | (pdaemon ? UVM_FLAG_NOWAIT : 0)) != 0) { if (pdaemon) { mutex_enter(&pager_map_wanted_lock); if (emerginuse) { UVM_UNLOCK_AND_WAIT(&emergva, &pager_map_wanted_lock, false, "emergva", 0); goto ReStart; } emerginuse = true; mutex_exit(&pager_map_wanted_lock); kva = emergva + ptoa(first_color); /* The shift implicitly truncates to PAGE_SIZE */ KASSERT(npages <= (MAXPHYS >> PAGE_SHIFT)); goto enter; } if ((flags & UVMPAGER_MAPIN_WAITOK) == 0) { UVMHIST_LOG(maphist,"<- NOWAIT failed", 0,0,0,0); return(0); } mutex_enter(&pager_map_wanted_lock); pager_map_wanted = true; UVMHIST_LOG(maphist, " SLEEPING on pager_map",0,0,0,0); UVM_UNLOCK_AND_WAIT(pager_map, &pager_map_wanted_lock, false, "pager_map", 0); goto ReStart; } enter: /* got it */ for (cva = kva; npages != 0; npages--, cva += PAGE_SIZE) { pp = *pps++; KASSERT(pp); // KASSERT(!((VM_PAGE_TO_PHYS(pp) ^ cva) & uvmexp.colormask)); KASSERT(pp->flags & PG_BUSY); pmap_kenter_pa(cva, VM_PAGE_TO_PHYS(pp), prot, 0); } pmap_update(vm_map_pmap(pager_map)); UVMHIST_LOG(maphist, "<- done (KVA=%#jx)", kva,0,0,0); return(kva); } /* * uvm_pagermapout: remove pager_map mapping * * we remove our mappings by hand and then remove the mapping (waking * up anyone wanting space). */ void uvm_pagermapout(vaddr_t kva, int npages) { vsize_t size = ptoa(npages); struct vm_map_entry *entries; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, " (kva=%#jx, npages=%jd)", kva, npages,0,0); #ifdef PMAP_DIRECT /* * solitary pages are mapped directly. */ if (npages == 1) { UVMHIST_LOG(maphist,"<- done, direct", 0,0,0,0); return; } #endif /* * duplicate uvm_unmap, but add in pager_map_wanted handling. 
*/ pmap_kremove(kva, size); pmap_update(pmap_kernel()); if ((kva & ~ptoa(uvmexp.colormask)) == emergva) { mutex_enter(&pager_map_wanted_lock); KASSERT(emerginuse); emerginuse = false; wakeup(&emergva); mutex_exit(&pager_map_wanted_lock); return; } vm_map_lock(pager_map); uvm_unmap_remove(pager_map, kva, kva + size, &entries, 0); mutex_enter(&pager_map_wanted_lock); if (pager_map_wanted) { pager_map_wanted = false; wakeup(pager_map); } mutex_exit(&pager_map_wanted_lock); vm_map_unlock(pager_map); if (entries) uvm_unmap_detach(entries, 0); UVMHIST_LOG(maphist,"<- done",0,0,0,0); } void uvm_aio_aiodone_pages(struct vm_page **pgs, int npages, bool write, int error) { struct uvm_object *uobj; struct vm_page *pg; krwlock_t *slock; int pageout_done; /* number of PG_PAGEOUT pages processed */ int swslot __unused; /* used for VMSWAP */ int i; bool swap; UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); swslot = 0; pageout_done = 0; slock = NULL; uobj = NULL; pg = pgs[0]; swap = (pg->uanon != NULL && pg->uobject == NULL) || (pg->flags & PG_AOBJ) != 0; if (!swap) { uobj = pg->uobject; slock = uobj->vmobjlock; rw_enter(slock, RW_WRITER); } else { #if defined(VMSWAP) if (error) { if (pg->uobject != NULL) { swslot = uao_find_swslot(pg->uobject, pg->offset >> PAGE_SHIFT); } else { KASSERT(pg->uanon != NULL); swslot = pg->uanon->an_swslot; } KASSERT(swslot); } #else /* defined(VMSWAP) */ panic("%s: swap", __func__); #endif /* defined(VMSWAP) */ } for (i = 0; i < npages; i++) { #if defined(VMSWAP) bool anon_disposed = false; /* XXX gcc */ #endif /* defined(VMSWAP) */ pg = pgs[i]; KASSERT(swap || pg->uobject == uobj); UVMHIST_LOG(ubchist, "pg %#jx", (uintptr_t)pg, 0,0,0); #if defined(VMSWAP) /* * for swap i/os, lock each page's object (or anon) * individually since each page may need a different lock. */ if (swap) { if (pg->uobject != NULL) { slock = pg->uobject->vmobjlock; } else { slock = pg->uanon->an_lock; } rw_enter(slock, RW_WRITER); anon_disposed = (pg->flags & PG_RELEASED) != 0; KASSERT(!anon_disposed || pg->uobject != NULL || pg->uanon->an_ref == 0); } #endif /* defined(VMSWAP) */ if (write && uobj != NULL) { KASSERT(uvm_obj_page_writeback_p(pg)); uvm_obj_page_clear_writeback(pg); } /* * process errors. for reads, just mark the page to be freed. * for writes, if the error was ENOMEM, we assume this was * a transient failure so we mark the page dirty so that * we'll try to write it again later. for all other write * errors, we assume the error is permanent, thus the data * in the page is lost. bummer. */ if (error) { int slot __unused; /* used for VMSWAP */ if (!write) { pg->flags |= PG_RELEASED; continue; } else if (error == ENOMEM) { if (pg->flags & PG_PAGEOUT) { pg->flags &= ~PG_PAGEOUT; pageout_done++; } uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); uvm_pagelock(pg); uvm_pageactivate(pg); uvm_pageunlock(pg); slot = 0; } else slot = SWSLOT_BAD; #if defined(VMSWAP) if (swap) { if (pg->uobject != NULL) { int oldslot __diagused; oldslot = uao_set_swslot(pg->uobject, pg->offset >> PAGE_SHIFT, slot); KASSERT(oldslot == swslot + i); } else { KASSERT(pg->uanon->an_swslot == swslot + i); pg->uanon->an_swslot = slot; } } #endif /* defined(VMSWAP) */ } /* * if the page is PG_FAKE, this must have been a read to * initialize the page. clear PG_FAKE and activate the page. 
*/ if (pg->flags & PG_FAKE) { KASSERT(!write); pg->flags &= ~PG_FAKE; #if defined(READAHEAD_STATS) pg->flags |= PG_READAHEAD; uvm_ra_total.ev_count++; #endif /* defined(READAHEAD_STATS) */ KASSERT(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN); uvm_pagelock(pg); uvm_pageenqueue(pg); uvm_pageunlock(pg); } #if defined(VMSWAP) /* * for swap pages, unlock everything for this page now. */ if (swap) { if (pg->uobject == NULL && anon_disposed) { uvm_anon_release(pg->uanon); } else { uvm_page_unbusy(&pg, 1); rw_exit(slock); } } #endif /* defined(VMSWAP) */ } if (pageout_done != 0) { uvm_pageout_done(pageout_done); } if (!swap) { uvm_page_unbusy(pgs, npages); rw_exit(slock); } else { #if defined(VMSWAP) KASSERT(write); /* these pages are now only in swap. */ if (error != ENOMEM) { atomic_add_int(&uvmexp.swpgonly, npages); } if (error) { if (error != ENOMEM) uvm_swap_markbad(swslot, npages); else uvm_swap_free(swslot, npages); } atomic_dec_uint(&uvmexp.pdpending); #endif /* defined(VMSWAP) */ } } /* * uvm_aio_aiodone: do iodone processing for async i/os. * this should be called in thread context, not interrupt context. */ void uvm_aio_aiodone(struct buf *bp) { const int npages = bp->b_bufsize >> PAGE_SHIFT; struct vm_page *pgs[howmany(MAXPHYS, MIN_PAGE_SIZE)]; int i, error; bool write; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(ubchist, "bp %#jx", (uintptr_t)bp, 0,0,0); KASSERT(bp->b_bufsize <= MAXPHYS); KASSERT(npages <= __arraycount(pgs)); error = bp->b_error; write = (bp->b_flags & B_READ) == 0; for (i = 0; i < npages; i++) { pgs[i] = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT)); UVMHIST_LOG(ubchist, "pgs[%jd] = %#jx", i, (uintptr_t)pgs[i], 0, 0); } uvm_pagermapout((vaddr_t)bp->b_data, npages); uvm_aio_aiodone_pages(pgs, npages, write, error); if (write && (bp->b_cflags & BC_AGE) != 0) { mutex_enter(bp->b_objlock); vwakeup(bp); mutex_exit(bp->b_objlock); } putiobuf(bp); } /* * uvm_pageratop: convert KVAs in the pager map back to their page * structures. */ struct vm_page * uvm_pageratop(vaddr_t kva) { struct vm_page *pg; paddr_t pa; bool rv __diagused; rv = pmap_extract(pmap_kernel(), kva, &pa); KASSERT(rv); pg = PHYS_TO_VM_PAGE(pa); KASSERT(pg != NULL); return (pg); }
/* $NetBSD: vnode_if.c,v 1.119 2023/06/15 09:15:13 hannken Exp $ */ /* * Warning: DO NOT EDIT! This file is automatically generated! * (Modifications made here may easily be lost!) * * Created from the file: * NetBSD: vnode_if.src,v 1.85 2023/06/15 09:13:36 hannken Exp * by the script: * NetBSD: vnode_if.sh,v 1.77 2022/10/26 23:39:43 riastradh Exp */ /* * Copyright (c) 1992, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vnode_if.c,v 1.119 2023/06/15 09:15:13 hannken Exp $"); #ifdef _KERNEL_OPT #include "opt_vnode_lockdebug.h" #endif /* _KERNEL_OPT */ #include <sys/param.h> #include <sys/mount.h> #include <sys/buf.h> #include <sys/fcntl.h> #include <sys/vnode.h> #include <sys/lock.h> #include <sys/fstrans.h> #include <miscfs/deadfs/deadfs.h> enum fst_op { FST_NO, FST_YES, FST_LAZY, FST_TRY }; static inline int vop_pre(vnode_t *vp, struct mount **mp, bool *mpsafe, enum fst_op op) { int error; *mpsafe = (vp->v_vflag & VV_MPSAFE); if (!*mpsafe) { KERNEL_LOCK(1, curlwp); } if (op == FST_YES || op == FST_LAZY || op == FST_TRY) { for (;;) { *mp = vp->v_mount; if (op == FST_TRY) { error = fstrans_start_nowait(*mp); if (error) { if (!*mpsafe) { KERNEL_UNLOCK_ONE(curlwp); } return error; } } else if (op == FST_LAZY) { fstrans_start_lazy(*mp); } else { fstrans_start(*mp); } if (__predict_true(*mp == vp->v_mount)) break; fstrans_done(*mp); } } else { *mp = vp->v_mount; } return 0; } static inline u_quad_t vop_pre_get_size(struct vnode *vp) { mutex_enter(vp->v_interlock); KASSERT(vp->v_size != VSIZENOTSET); u_quad_t rv = (u_quad_t)vp->v_size; mutex_exit(vp->v_interlock); return rv; } /* * VOP_RMDIR(), VOP_REMOVE(), and VOP_RENAME() need special handling * because they each drop the caller's references on one or more of * their arguments. While there must be an open file descriptor in * associated with a vnode in order for knotes to be attached to it, * that status could change during the course of the operation. So, * for the vnode arguments that are WILLRELE or WILLPUT, we check * pre-op if there are registered knotes, take a hold count if so, * and post-op release the hold after activating any knotes still * associated with the vnode. */ #define VOP_POST_KNOTE(thisvp, e, n) \ do { \ if (__predict_true((e) == 0)) { \ /* \ * VN_KNOTE() does the VN_KEVENT_INTEREST() \ * check for us. \ */ \ VN_KNOTE((thisvp), (n)); \ } \ } while (/*CONSTCOND*/0) #define VOP_POST_KNOTE_HELD(thisvp, e, n) \ do { \ /* \ * We don't perform a VN_KEVENT_INTEREST() check here; it \ * was already performed when we did the pre-op work that \ * caused the vnode to be held in the first place. \ */ \ mutex_enter((thisvp)->v_interlock); \ if (__predict_true((e) == 0)) { \ knote(&(thisvp)->v_klist->vk_klist, (n)); \ } \ holdrelel((thisvp)); \ mutex_exit((thisvp)->v_interlock); \ /* \ * thisvp might be gone now! Don't touch! \ */ \ } while (/*CONSTCOND*/0) #define vop_create_post(ap, e) \ VOP_POST_KNOTE((ap)->a_dvp, (e), NOTE_WRITE) #define vop_mknod_post(ap, e) \ VOP_POST_KNOTE((ap)->a_dvp, (e), NOTE_WRITE) #define vop_setattr_pre(ap) \ u_quad_t osize = 0; \ long vp_events = \ VN_KEVENT_INTEREST((ap)->a_vp, NOTE_ATTRIB | NOTE_EXTEND) \ ? 
NOTE_ATTRIB : 0; \ bool check_extend = false; \ if (__predict_false(vp_events != 0 && \ (ap)->a_vap->va_size != VNOVALSIZE)) { \ check_extend = true; \ osize = vop_pre_get_size((ap)->a_vp); \ } #define vop_setattr_post(ap, e) \ do { \ if (__predict_false(vp_events != 0)) { \ if (__predict_false(check_extend && \ (ap)->a_vap->va_size > osize)) { \ vp_events |= NOTE_EXTEND; \ } \ VOP_POST_KNOTE((ap)->a_vp, (e), vp_events); \ } \ } while (/*CONSTCOND*/0) #define vop_setacl_post(ap, e) \ VOP_POST_KNOTE((ap)->a_vp, (e), NOTE_ATTRIB) #define vop_link_post(ap, e) \ do { \ VOP_POST_KNOTE((ap)->a_dvp, (e), NOTE_WRITE); \ VOP_POST_KNOTE((ap)->a_vp, (e), NOTE_LINK); \ } while (/*CONSTCOND*/0) #define vop_mkdir_post(ap, e) \ VOP_POST_KNOTE((ap)->a_dvp, (e), NOTE_WRITE | NOTE_LINK) #define vop_remove_pre_common(ap) \ bool post_event_vp = \ VN_KEVENT_INTEREST((ap)->a_vp, NOTE_DELETE | NOTE_LINK); \ if (__predict_false(post_event_vp)) { \ vhold((ap)->a_vp); \ } #define vop_remove_post_common(ap, e, dn, lc) \ do { \ VOP_POST_KNOTE((ap)->a_dvp, (e), (dn)); \ if (__predict_false(post_event_vp)) { \ VOP_POST_KNOTE_HELD((ap)->a_vp, (e), \ (lc) ? NOTE_LINK : NOTE_DELETE); \ } \ } while (/*CONSTCOND*/0) /* * One could make the argument that VOP_REMOVE() should send NOTE_LINK * on vp if the resulting link count is not zero, but that's not what * the documentation says. * * We could change this easily by passing ap->ctx_vp_new_nlink to * vop_remove_post_common(). */ #define vop_remove_pre(ap) \ vop_remove_pre_common((ap)); \ /* \ * We will assume that the file being removed is deleted unless \ * the file system tells us otherwise by updating vp_new_nlink. \ */ \ (ap)->ctx_vp_new_nlink = 0; #define vop_remove_post(ap, e) \ vop_remove_post_common((ap), (e), NOTE_WRITE, 0) #define vop_rmdir_pre(ap) \ vop_remove_pre_common(ap) #define vop_rmdir_post(ap, e) \ vop_remove_post_common((ap), (e), NOTE_WRITE | NOTE_LINK, 0) #define vop_symlink_post(ap, e) \ VOP_POST_KNOTE((ap)->a_dvp, (e), NOTE_WRITE) #define vop_open_post(ap, e) \ VOP_POST_KNOTE((ap)->a_vp, (e), NOTE_OPEN) #define vop_close_post(ap, e) \ do { \ /* See the definition of VN_KNOTE() in <sys/vnode.h>. */ \ if (__predict_false(VN_KEVENT_INTEREST((ap)->a_vp, \ NOTE_CLOSE_WRITE | NOTE_CLOSE) && (e) == 0)) { \ struct vnode *thisvp = (ap)->a_vp; \ mutex_enter(thisvp->v_interlock); \ /* \ * Don't send NOTE_CLOSE when closing a vnode that's \ * been reclaimed or otherwise revoked; a NOTE_REVOKE \ * has already been sent, and this close is effectively \ * meaningless from the watcher's perspective. \ */ \ if (__predict_true(thisvp->v_op != dead_vnodeop_p)) { \ knote(&thisvp->v_klist->vk_klist, \ ((ap)->a_fflag & FWRITE) \ ? NOTE_CLOSE_WRITE : NOTE_CLOSE); \ } \ mutex_exit(thisvp->v_interlock); \ } \ } while (/*CONSTCOND*/0) #define vop_read_post(ap, e) \ VOP_POST_KNOTE((ap)->a_vp, (e), NOTE_READ) #define vop_write_pre(ap) \ off_t ooffset = 0, noffset = 0; \ u_quad_t osize = 0; \ long vp_events = \ VN_KEVENT_INTEREST((ap)->a_vp, NOTE_WRITE | NOTE_EXTEND) \ ? NOTE_WRITE : 0; \ if (__predict_false(vp_events != 0)) { \ ooffset = (ap)->a_uio->uio_offset; \ osize = vop_pre_get_size((ap)->a_vp); \ } #define vop_write_post(ap, e) \ do { \ /* \ * If any data was written, we'll post an event, even if \ * there was an error. 
\ */ \ noffset = (ap)->a_uio->uio_offset; \ if (__predict_false(vp_events != 0 && noffset > ooffset)) { \ if (noffset > osize) { \ vp_events |= NOTE_EXTEND; \ } \ VN_KNOTE((ap)->a_vp, vp_events); \ } \ } while (/*CONSTCOND*/0) static inline void vop_post(vnode_t *vp, struct mount *mp, bool mpsafe, enum fst_op op) { if (op == FST_YES || op == FST_LAZY) { fstrans_done(mp); } if (!mpsafe) { KERNEL_UNLOCK_ONE(curlwp); } } static inline void assert_vop_unlocked(vnode_t *vp, const char *str) { #if defined(VNODE_LOCKDEBUG) if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) panic("%s: %p %d/%d is locked but should not be", str, vp, vp->v_tag, vp->v_type); #endif } static inline void assert_vop_locked(vnode_t *vp, const char *str) { #if defined(VNODE_LOCKDEBUG) if (VOP_ISLOCKED(vp) == LK_NONE) panic("%s: %p %d/%d is not locked but should be", str, vp, vp->v_tag, vp->v_type); #endif } static inline void assert_vop_elocked(vnode_t *vp, const char *str) { #if defined(VNODE_LOCKDEBUG) if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) panic("%s: %p %d/%d is not exclusive locked but should be", str, vp, vp->v_tag, vp->v_type); #endif } const struct vnodeop_desc vop_default_desc = { 0, "default", 0, NULL, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; const int vop_bwrite_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_bwrite_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_bwrite_desc = { VOP_BWRITE_DESCOFFSET, "vop_bwrite", 0, vop_bwrite_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_BWRITE(struct vnode *vp, struct buf *bp) { int error; bool mpsafe; struct vop_bwrite_args a; struct mount *mp; a.a_desc = VDESC(vop_bwrite); a.a_vp = vp; a.a_bp = bp; error = vop_pre(vp, &mp, &mpsafe, FST_YES); if (error) return error; error = (VCALL(vp, VOFFSET(vop_bwrite), &a)); vop_post(vp, mp, mpsafe, FST_YES); return error; } const int vop_parsepath_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_parsepath_args,a_dvp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_parsepath_desc = { VOP_PARSEPATH_DESCOFFSET, "vop_parsepath", 0, vop_parsepath_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_PARSEPATH(struct vnode *dvp, const char *name, size_t *retval) { int error; bool mpsafe; struct vop_parsepath_args a; struct mount *mp; a.a_desc = VDESC(vop_parsepath); a.a_dvp = dvp; a.a_name = name; a.a_retval = retval; error = vop_pre(dvp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(dvp, VOFFSET(vop_parsepath), &a)); vop_post(dvp, mp, mpsafe, FST_NO); return error; } const int vop_lookup_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_lookup_v2_args,a_dvp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_lookup_desc = { VOP_LOOKUP_DESCOFFSET, "vop_lookup", 0, vop_lookup_vp_offsets, VOPARG_OFFSETOF(struct vop_lookup_v2_args, a_vpp), VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_lookup_v2_args, a_cnp), }; int VOP_LOOKUP(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { int error; bool mpsafe; struct vop_lookup_v2_args a; struct mount *mp; a.a_desc = VDESC(vop_lookup); a.a_dvp = dvp; a.a_vpp = vpp; a.a_cnp = cnp; assert_vop_locked(dvp, "vop_lookup: dvp"); error = vop_pre(dvp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(dvp, VOFFSET(vop_lookup), &a)); vop_post(dvp, mp, mpsafe, FST_NO); #ifdef DIAGNOSTIC if (error == 0) KASSERT((*vpp)->v_size != VSIZENOTSET && (*vpp)->v_writesize != VSIZENOTSET); #endif /* DIAGNOSTIC */ return error; } const int vop_create_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_create_v3_args,a_dvp), VDESC_NO_OFFSET }; 
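/*
 * Illustrative sketch only -- not part of the generated file.  Every
 * VOP_*() wrapper in this file is expanded from the same template by
 * vnode_if.sh; the hypothetical names below (VOP_EXAMPLE, vop_example,
 * struct vop_example_args) assume a matching entry had been generated in
 * vnode_if.h.  Only the helpers defined above (vop_pre, vop_post, VCALL,
 * VOFFSET, VDESC, assert_vop_locked) are real.
 */
#if 0	/* template sketch, never compiled */
struct vop_example_args {
	const struct vnodeop_desc *a_desc;	/* always first */
	struct vnode *a_vp;
	int a_arg;
};

int
VOP_EXAMPLE(struct vnode *vp, int arg)
{
	int error;
	bool mpsafe;
	struct vop_example_args a;
	struct mount *mp;

	/* 1. Bundle the arguments for the indirect call. */
	a.a_desc = VDESC(vop_example);
	a.a_vp = vp;
	a.a_arg = arg;

	/* 2. Assert whatever lock state vnode_if.src documents. */
	assert_vop_locked(vp, "vop_example: vp");

	/*
	 * 3. vop_pre(): take KERNEL_LOCK for non-MPSAFE file systems and,
	 *    when the op asks for it, enter an fstrans transaction
	 *    (FST_YES/FST_LAZY/FST_TRY); FST_NO skips fstrans entirely.
	 */
	error = vop_pre(vp, &mp, &mpsafe, FST_NO);
	if (error)
		return error;

	/* 4. Dispatch through this vnode's operations vector. */
	error = VCALL(vp, VOFFSET(vop_example), &a);

	/* 5. vop_post(): leave fstrans and drop KERNEL_LOCK as taken. */
	vop_post(vp, mp, mpsafe, FST_NO);

	/* 6. Any vop_*_post() knote hook would run here. */
	return error;
}
#endif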
const struct vnodeop_desc vop_create_desc = { VOP_CREATE_DESCOFFSET, "vop_create", 0, vop_create_vp_offsets, VOPARG_OFFSETOF(struct vop_create_v3_args, a_vpp), VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_create_v3_args, a_cnp), }; int VOP_CREATE(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap) { int error; bool mpsafe; struct vop_create_v3_args a; struct mount *mp; a.a_desc = VDESC(vop_create); a.a_dvp = dvp; a.a_vpp = vpp; a.a_cnp = cnp; a.a_vap = vap; assert_vop_elocked(dvp, "vop_create: dvp"); error = vop_pre(dvp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(dvp, VOFFSET(vop_create), &a)); vop_post(dvp, mp, mpsafe, FST_NO); vop_create_post(&a, error); #ifdef DIAGNOSTIC if (error == 0) KASSERT((*vpp)->v_size != VSIZENOTSET && (*vpp)->v_writesize != VSIZENOTSET); #endif /* DIAGNOSTIC */ return error; } const int vop_mknod_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_mknod_v3_args,a_dvp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_mknod_desc = { VOP_MKNOD_DESCOFFSET, "vop_mknod", 0, vop_mknod_vp_offsets, VOPARG_OFFSETOF(struct vop_mknod_v3_args, a_vpp), VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_mknod_v3_args, a_cnp), }; int VOP_MKNOD(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap) { int error; bool mpsafe; struct vop_mknod_v3_args a; struct mount *mp; a.a_desc = VDESC(vop_mknod); a.a_dvp = dvp; a.a_vpp = vpp; a.a_cnp = cnp; a.a_vap = vap; assert_vop_elocked(dvp, "vop_mknod: dvp"); error = vop_pre(dvp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(dvp, VOFFSET(vop_mknod), &a)); vop_post(dvp, mp, mpsafe, FST_NO); vop_mknod_post(&a, error); #ifdef DIAGNOSTIC if (error == 0) KASSERT((*vpp)->v_size != VSIZENOTSET && (*vpp)->v_writesize != VSIZENOTSET); #endif /* DIAGNOSTIC */ return error; } const int vop_open_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_open_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_open_desc = { VOP_OPEN_DESCOFFSET, "vop_open", 0, vop_open_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_open_args, a_cred), VDESC_NO_OFFSET, }; int VOP_OPEN(struct vnode *vp, int mode, kauth_cred_t cred) { int error; bool mpsafe; struct vop_open_args a; struct mount *mp; a.a_desc = VDESC(vop_open); a.a_vp = vp; a.a_mode = mode; a.a_cred = cred; assert_vop_locked(vp, "vop_open: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_open), &a)); vop_post(vp, mp, mpsafe, FST_NO); vop_open_post(&a, error); return error; } const int vop_close_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_close_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_close_desc = { VOP_CLOSE_DESCOFFSET, "vop_close", 0, vop_close_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_close_args, a_cred), VDESC_NO_OFFSET, }; int VOP_CLOSE(struct vnode *vp, int fflag, kauth_cred_t cred) { int error; bool mpsafe; struct vop_close_args a; struct mount *mp; a.a_desc = VDESC(vop_close); a.a_vp = vp; a.a_fflag = fflag; a.a_cred = cred; assert_vop_locked(vp, "vop_close: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_close), &a)); vop_post(vp, mp, mpsafe, FST_NO); vop_close_post(&a, error); return error; } const int vop_access_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_access_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_access_desc = { VOP_ACCESS_DESCOFFSET, "vop_access", 0, vop_access_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_access_args, a_cred), 
VDESC_NO_OFFSET, }; int VOP_ACCESS(struct vnode *vp, accmode_t accmode, kauth_cred_t cred) { int error; bool mpsafe; struct vop_access_args a; struct mount *mp; a.a_desc = VDESC(vop_access); a.a_vp = vp; a.a_accmode = accmode; a.a_cred = cred; assert_vop_locked(vp, "vop_access: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_access), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_accessx_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_accessx_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_accessx_desc = { VOP_ACCESSX_DESCOFFSET, "vop_accessx", 0, vop_accessx_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_accessx_args, a_cred), VDESC_NO_OFFSET, }; int VOP_ACCESSX(struct vnode *vp, accmode_t accmode, kauth_cred_t cred) { int error; bool mpsafe; struct vop_accessx_args a; struct mount *mp; a.a_desc = VDESC(vop_accessx); a.a_vp = vp; a.a_accmode = accmode; a.a_cred = cred; assert_vop_locked(vp, "vop_accessx: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_accessx), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_getattr_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_getattr_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_getattr_desc = { VOP_GETATTR_DESCOFFSET, "vop_getattr", 0, vop_getattr_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_getattr_args, a_cred), VDESC_NO_OFFSET, }; int VOP_GETATTR(struct vnode *vp, struct vattr *vap, kauth_cred_t cred) { int error; bool mpsafe; struct vop_getattr_args a; struct mount *mp; a.a_desc = VDESC(vop_getattr); a.a_vp = vp; a.a_vap = vap; a.a_cred = cred; assert_vop_locked(vp, "vop_getattr: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_getattr), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_setattr_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_setattr_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_setattr_desc = { VOP_SETATTR_DESCOFFSET, "vop_setattr", 0, vop_setattr_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_setattr_args, a_cred), VDESC_NO_OFFSET, }; int VOP_SETATTR(struct vnode *vp, struct vattr *vap, kauth_cred_t cred) { int error; bool mpsafe; struct vop_setattr_args a; struct mount *mp; a.a_desc = VDESC(vop_setattr); a.a_vp = vp; a.a_vap = vap; a.a_cred = cred; assert_vop_elocked(vp, "vop_setattr: vp"); vop_setattr_pre(&a); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_setattr), &a)); vop_post(vp, mp, mpsafe, FST_NO); vop_setattr_post(&a, error); return error; } const int vop_read_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_read_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_read_desc = { VOP_READ_DESCOFFSET, "vop_read", 0, vop_read_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_read_args, a_cred), VDESC_NO_OFFSET, }; int VOP_READ(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred) { int error; bool mpsafe; struct vop_read_args a; struct mount *mp; a.a_desc = VDESC(vop_read); a.a_vp = vp; a.a_uio = uio; a.a_ioflag = ioflag; a.a_cred = cred; assert_vop_locked(vp, "vop_read: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_read), &a)); vop_post(vp, mp, mpsafe, FST_NO); vop_read_post(&a, error); return error; } const int vop_write_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_write_args,a_vp), VDESC_NO_OFFSET }; 
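/*
 * Userland view of the knote hooks above (vop_open_post, vop_close_post,
 * vop_read_post, vop_setattr_post, ...): a minimal EVFILT_VNODE watcher.
 * This is a hedged usage sketch, not part of vnode_if.c; the path
 * "/tmp/watched" is an arbitrary example and the program assumes a kernel
 * that delivers these NOTE_* flags.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	struct kevent kev, ev;
	int kq, fd;

	/* Holding an open descriptor is what lets knotes attach to the vnode. */
	fd = open("/tmp/watched", O_RDONLY);
	if (fd == -1)
		err(1, "open");
	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* Subscribe to the events the vop_*_post hooks generate. */
	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_ENABLE | EV_CLEAR,
	    NOTE_WRITE | NOTE_EXTEND | NOTE_ATTRIB | NOTE_LINK | NOTE_DELETE,
	    0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");

	for (;;) {
		if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
			err(1, "kevent wait");
		printf("fflags 0x%x%s%s%s\n", (unsigned)ev.fflags,
		    (ev.fflags & NOTE_WRITE) ? " write" : "",
		    (ev.fflags & NOTE_EXTEND) ? " extend" : "",
		    (ev.fflags & NOTE_DELETE) ? " delete" : "");
		if (ev.fflags & NOTE_DELETE)
			break;
	}
	return 0;
}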
const struct vnodeop_desc vop_write_desc = { VOP_WRITE_DESCOFFSET, "vop_write", 0, vop_write_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_write_args, a_cred), VDESC_NO_OFFSET, }; int VOP_WRITE(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred) { int error; bool mpsafe; struct vop_write_args a; struct mount *mp; a.a_desc = VDESC(vop_write); a.a_vp = vp; a.a_uio = uio; a.a_ioflag = ioflag; a.a_cred = cred; assert_vop_locked(vp, "vop_write: vp"); vop_write_pre(&a); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_write), &a)); vop_post(vp, mp, mpsafe, FST_NO); vop_write_post(&a, error); return error; } const int vop_fallocate_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_fallocate_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_fallocate_desc = { VOP_FALLOCATE_DESCOFFSET, "vop_fallocate", 0, vop_fallocate_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_FALLOCATE(struct vnode *vp, off_t pos, off_t len) { int error; bool mpsafe; struct vop_fallocate_args a; struct mount *mp; a.a_desc = VDESC(vop_fallocate); a.a_vp = vp; a.a_pos = pos; a.a_len = len; assert_vop_locked(vp, "vop_fallocate: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_fallocate), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_fdiscard_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_fdiscard_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_fdiscard_desc = { VOP_FDISCARD_DESCOFFSET, "vop_fdiscard", 0, vop_fdiscard_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_FDISCARD(struct vnode *vp, off_t pos, off_t len) { int error; bool mpsafe; struct vop_fdiscard_args a; struct mount *mp; a.a_desc = VDESC(vop_fdiscard); a.a_vp = vp; a.a_pos = pos; a.a_len = len; assert_vop_locked(vp, "vop_fdiscard: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_fdiscard), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_ioctl_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_ioctl_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_ioctl_desc = { VOP_IOCTL_DESCOFFSET, "vop_ioctl", 0, vop_ioctl_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_ioctl_args, a_cred), VDESC_NO_OFFSET, }; int VOP_IOCTL(struct vnode *vp, u_long command, void *data, int fflag, kauth_cred_t cred) { int error; bool mpsafe; struct vop_ioctl_args a; struct mount *mp; a.a_desc = VDESC(vop_ioctl); a.a_vp = vp; a.a_command = command; a.a_data = data; a.a_fflag = fflag; a.a_cred = cred; error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_ioctl), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_fcntl_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_fcntl_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_fcntl_desc = { VOP_FCNTL_DESCOFFSET, "vop_fcntl", 0, vop_fcntl_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_fcntl_args, a_cred), VDESC_NO_OFFSET, }; int VOP_FCNTL(struct vnode *vp, u_int command, void *data, int fflag, kauth_cred_t cred) { int error; bool mpsafe; struct vop_fcntl_args a; struct mount *mp; a.a_desc = VDESC(vop_fcntl); a.a_vp = vp; a.a_command = command; a.a_data = data; a.a_fflag = fflag; a.a_cred = cred; assert_vop_unlocked(vp, "vop_fcntl: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_fcntl), &a)); vop_post(vp, mp, mpsafe, 
FST_NO); return error; } const int vop_poll_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_poll_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_poll_desc = { VOP_POLL_DESCOFFSET, "vop_poll", 0, vop_poll_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_POLL(struct vnode *vp, int events) { int error; bool mpsafe; struct vop_poll_args a; struct mount *mp; a.a_desc = VDESC(vop_poll); a.a_vp = vp; a.a_events = events; assert_vop_unlocked(vp, "vop_poll: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_YES); if (error) return error; error = (VCALL(vp, VOFFSET(vop_poll), &a)); vop_post(vp, mp, mpsafe, FST_YES); return error; } const int vop_kqfilter_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_kqfilter_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_kqfilter_desc = { VOP_KQFILTER_DESCOFFSET, "vop_kqfilter", 0, vop_kqfilter_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_KQFILTER(struct vnode *vp, struct knote *kn) { int error; bool mpsafe; struct vop_kqfilter_args a; struct mount *mp; a.a_desc = VDESC(vop_kqfilter); a.a_vp = vp; a.a_kn = kn; assert_vop_unlocked(vp, "vop_kqfilter: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_YES); if (error) return error; error = (VCALL(vp, VOFFSET(vop_kqfilter), &a)); vop_post(vp, mp, mpsafe, FST_YES); return error; } const int vop_revoke_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_revoke_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_revoke_desc = { VOP_REVOKE_DESCOFFSET, "vop_revoke", 0, vop_revoke_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_REVOKE(struct vnode *vp, int flags) { int error; bool mpsafe; struct vop_revoke_args a; struct mount *mp; a.a_desc = VDESC(vop_revoke); a.a_vp = vp; a.a_flags = flags; assert_vop_unlocked(vp, "vop_revoke: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_revoke), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_mmap_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_mmap_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_mmap_desc = { VOP_MMAP_DESCOFFSET, "vop_mmap", 0, vop_mmap_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_mmap_args, a_cred), VDESC_NO_OFFSET, }; int VOP_MMAP(struct vnode *vp, vm_prot_t prot, kauth_cred_t cred) { int error; bool mpsafe; struct vop_mmap_args a; struct mount *mp; a.a_desc = VDESC(vop_mmap); a.a_vp = vp; a.a_prot = prot; a.a_cred = cred; error = vop_pre(vp, &mp, &mpsafe, FST_YES); if (error) return error; error = (VCALL(vp, VOFFSET(vop_mmap), &a)); vop_post(vp, mp, mpsafe, FST_YES); return error; } const int vop_fsync_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_fsync_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_fsync_desc = { VOP_FSYNC_DESCOFFSET, "vop_fsync", 0, vop_fsync_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_fsync_args, a_cred), VDESC_NO_OFFSET, }; int VOP_FSYNC(struct vnode *vp, kauth_cred_t cred, int flags, off_t offlo, off_t offhi) { int error; bool mpsafe; struct vop_fsync_args a; struct mount *mp; a.a_desc = VDESC(vop_fsync); a.a_vp = vp; a.a_cred = cred; a.a_flags = flags; a.a_offlo = offlo; a.a_offhi = offhi; assert_vop_locked(vp, "vop_fsync: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_fsync), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_seek_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_seek_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_seek_desc = { 
VOP_SEEK_DESCOFFSET, "vop_seek", 0, vop_seek_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_seek_args, a_cred), VDESC_NO_OFFSET, }; int VOP_SEEK(struct vnode *vp, off_t oldoff, off_t newoff, kauth_cred_t cred) { int error; bool mpsafe; struct vop_seek_args a; struct mount *mp; a.a_desc = VDESC(vop_seek); a.a_vp = vp; a.a_oldoff = oldoff; a.a_newoff = newoff; a.a_cred = cred; error = vop_pre(vp, &mp, &mpsafe, FST_YES); if (error) return error; error = (VCALL(vp, VOFFSET(vop_seek), &a)); vop_post(vp, mp, mpsafe, FST_YES); return error; } const int vop_remove_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_remove_v3_args,a_dvp), VOPARG_OFFSETOF(struct vop_remove_v3_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_remove_desc = { VOP_REMOVE_DESCOFFSET, "vop_remove", 0 | VDESC_VP1_WILLPUT, vop_remove_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_remove_v3_args, a_cnp), }; int VOP_REMOVE(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { int error; bool mpsafe; struct vop_remove_v3_args a; struct mount *mp; a.a_desc = VDESC(vop_remove); a.a_dvp = dvp; a.a_vp = vp; a.a_cnp = cnp; assert_vop_elocked(dvp, "vop_remove: dvp"); assert_vop_locked(vp, "vop_remove: vp"); vop_remove_pre(&a); error = vop_pre(dvp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(dvp, VOFFSET(vop_remove), &a)); vop_post(dvp, mp, mpsafe, FST_NO); vop_remove_post(&a, error); return error; } const int vop_link_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_link_v2_args,a_dvp), VOPARG_OFFSETOF(struct vop_link_v2_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_link_desc = { VOP_LINK_DESCOFFSET, "vop_link", 0, vop_link_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_link_v2_args, a_cnp), }; int VOP_LINK(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { int error; bool mpsafe; struct vop_link_v2_args a; struct mount *mp; a.a_desc = VDESC(vop_link); a.a_dvp = dvp; a.a_vp = vp; a.a_cnp = cnp; assert_vop_elocked(dvp, "vop_link: dvp"); assert_vop_unlocked(vp, "vop_link: vp"); error = vop_pre(dvp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(dvp, VOFFSET(vop_link), &a)); vop_post(dvp, mp, mpsafe, FST_NO); vop_link_post(&a, error); return error; } const int vop_rename_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_rename_args,a_fdvp), VOPARG_OFFSETOF(struct vop_rename_args,a_fvp), VOPARG_OFFSETOF(struct vop_rename_args,a_tdvp), VOPARG_OFFSETOF(struct vop_rename_args,a_tvp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_rename_desc = { VOP_RENAME_DESCOFFSET, "vop_rename", 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VP2_WILLPUT | VDESC_VP3_WILLPUT, vop_rename_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_rename_args, a_fcnp), }; int VOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp) { int error; bool mpsafe; struct vop_rename_args a; struct mount *mp; a.a_desc = VDESC(vop_rename); a.a_fdvp = fdvp; a.a_fvp = fvp; a.a_fcnp = fcnp; a.a_tdvp = tdvp; a.a_tvp = tvp; a.a_tcnp = tcnp; assert_vop_locked(tdvp, "vop_rename: tdvp"); error = vop_pre(fdvp, &mp, &mpsafe, FST_YES); if (error) return error; error = (VCALL(fdvp, VOFFSET(vop_rename), &a)); vop_post(fdvp, mp, mpsafe, FST_YES); return error; } const int vop_mkdir_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_mkdir_v3_args,a_dvp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_mkdir_desc = { VOP_MKDIR_DESCOFFSET, "vop_mkdir", 0, 
vop_mkdir_vp_offsets, VOPARG_OFFSETOF(struct vop_mkdir_v3_args, a_vpp), VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_mkdir_v3_args, a_cnp), }; int VOP_MKDIR(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap) { int error; bool mpsafe; struct vop_mkdir_v3_args a; struct mount *mp; a.a_desc = VDESC(vop_mkdir); a.a_dvp = dvp; a.a_vpp = vpp; a.a_cnp = cnp; a.a_vap = vap; assert_vop_elocked(dvp, "vop_mkdir: dvp"); error = vop_pre(dvp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(dvp, VOFFSET(vop_mkdir), &a)); vop_post(dvp, mp, mpsafe, FST_NO); vop_mkdir_post(&a, error); #ifdef DIAGNOSTIC if (error == 0) KASSERT((*vpp)->v_size != VSIZENOTSET && (*vpp)->v_writesize != VSIZENOTSET); #endif /* DIAGNOSTIC */ return error; } const int vop_rmdir_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_rmdir_v2_args,a_dvp), VOPARG_OFFSETOF(struct vop_rmdir_v2_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_rmdir_desc = { VOP_RMDIR_DESCOFFSET, "vop_rmdir", 0 | VDESC_VP1_WILLPUT, vop_rmdir_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_rmdir_v2_args, a_cnp), }; int VOP_RMDIR(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { int error; bool mpsafe; struct vop_rmdir_v2_args a; struct mount *mp; a.a_desc = VDESC(vop_rmdir); a.a_dvp = dvp; a.a_vp = vp; a.a_cnp = cnp; assert_vop_elocked(dvp, "vop_rmdir: dvp"); assert_vop_elocked(vp, "vop_rmdir: vp"); vop_rmdir_pre(&a); error = vop_pre(dvp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(dvp, VOFFSET(vop_rmdir), &a)); vop_post(dvp, mp, mpsafe, FST_NO); vop_rmdir_post(&a, error); return error; } const int vop_symlink_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_symlink_v3_args,a_dvp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_symlink_desc = { VOP_SYMLINK_DESCOFFSET, "vop_symlink", 0, vop_symlink_vp_offsets, VOPARG_OFFSETOF(struct vop_symlink_v3_args, a_vpp), VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_symlink_v3_args, a_cnp), }; int VOP_SYMLINK(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap, char *target) { int error; bool mpsafe; struct vop_symlink_v3_args a; struct mount *mp; a.a_desc = VDESC(vop_symlink); a.a_dvp = dvp; a.a_vpp = vpp; a.a_cnp = cnp; a.a_vap = vap; a.a_target = target; assert_vop_elocked(dvp, "vop_symlink: dvp"); error = vop_pre(dvp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(dvp, VOFFSET(vop_symlink), &a)); vop_post(dvp, mp, mpsafe, FST_NO); vop_symlink_post(&a, error); #ifdef DIAGNOSTIC if (error == 0) KASSERT((*vpp)->v_size != VSIZENOTSET && (*vpp)->v_writesize != VSIZENOTSET); #endif /* DIAGNOSTIC */ return error; } const int vop_readdir_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_readdir_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_readdir_desc = { VOP_READDIR_DESCOFFSET, "vop_readdir", 0, vop_readdir_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_readdir_args, a_cred), VDESC_NO_OFFSET, }; int VOP_READDIR(struct vnode *vp, struct uio *uio, kauth_cred_t cred, int *eofflag, off_t **cookies, int *ncookies) { int error; bool mpsafe; struct vop_readdir_args a; struct mount *mp; a.a_desc = VDESC(vop_readdir); a.a_vp = vp; a.a_uio = uio; a.a_cred = cred; a.a_eofflag = eofflag; a.a_cookies = cookies; a.a_ncookies = ncookies; assert_vop_locked(vp, "vop_readdir: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_readdir), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int 
vop_readlink_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_readlink_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_readlink_desc = { VOP_READLINK_DESCOFFSET, "vop_readlink", 0, vop_readlink_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_readlink_args, a_cred), VDESC_NO_OFFSET, }; int VOP_READLINK(struct vnode *vp, struct uio *uio, kauth_cred_t cred) { int error; bool mpsafe; struct vop_readlink_args a; struct mount *mp; a.a_desc = VDESC(vop_readlink); a.a_vp = vp; a.a_uio = uio; a.a_cred = cred; assert_vop_locked(vp, "vop_readlink: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_readlink), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_abortop_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_abortop_args,a_dvp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_abortop_desc = { VOP_ABORTOP_DESCOFFSET, "vop_abortop", 0, vop_abortop_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_abortop_args, a_cnp), }; int VOP_ABORTOP(struct vnode *dvp, struct componentname *cnp) { int error; bool mpsafe; struct vop_abortop_args a; struct mount *mp; a.a_desc = VDESC(vop_abortop); a.a_dvp = dvp; a.a_cnp = cnp; error = vop_pre(dvp, &mp, &mpsafe, FST_YES); if (error) return error; error = (VCALL(dvp, VOFFSET(vop_abortop), &a)); vop_post(dvp, mp, mpsafe, FST_YES); return error; } const int vop_inactive_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_inactive_v2_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_inactive_desc = { VOP_INACTIVE_DESCOFFSET, "vop_inactive", 0, vop_inactive_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_INACTIVE(struct vnode *vp, bool *recycle) { int error; bool mpsafe; struct vop_inactive_v2_args a; struct mount *mp; a.a_desc = VDESC(vop_inactive); a.a_vp = vp; a.a_recycle = recycle; assert_vop_elocked(vp, "vop_inactive: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_inactive), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_reclaim_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_reclaim_v2_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_reclaim_desc = { VOP_RECLAIM_DESCOFFSET, "vop_reclaim", 0, vop_reclaim_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_RECLAIM(struct vnode *vp) { int error; bool mpsafe; struct vop_reclaim_v2_args a; struct mount *mp; a.a_desc = VDESC(vop_reclaim); a.a_vp = vp; assert_vop_elocked(vp, "vop_reclaim: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_reclaim), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_lock_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_lock_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_lock_desc = { VOP_LOCK_DESCOFFSET, "vop_lock", 0, vop_lock_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_LOCK(struct vnode *vp, int flags) { int error; bool mpsafe; struct vop_lock_args a; struct mount *mp; a.a_desc = VDESC(vop_lock); a.a_vp = vp; a.a_flags = flags; error = vop_pre(vp, &mp, &mpsafe, (!(flags & (LK_SHARED|LK_EXCLUSIVE)) ? FST_NO : (flags & LK_NOWAIT ? FST_TRY : FST_YES))); if (error) return error; error = (VCALL(vp, VOFFSET(vop_lock), &a)); vop_post(vp, mp, mpsafe, (flags & (LK_UPGRADE|LK_DOWNGRADE) ? FST_NO : (error ? 
FST_YES : FST_NO))); return error; } const int vop_unlock_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_unlock_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_unlock_desc = { VOP_UNLOCK_DESCOFFSET, "vop_unlock", 0, vop_unlock_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_UNLOCK(struct vnode *vp) { int error; bool mpsafe; struct vop_unlock_args a; struct mount *mp; a.a_desc = VDESC(vop_unlock); a.a_vp = vp; assert_vop_locked(vp, "vop_unlock: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_unlock), &a)); vop_post(vp, mp, mpsafe, FST_YES); return error; } const int vop_bmap_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_bmap_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_bmap_desc = { VOP_BMAP_DESCOFFSET, "vop_bmap", 0, vop_bmap_vp_offsets, VOPARG_OFFSETOF(struct vop_bmap_args, a_vpp), VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_BMAP(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr_t *bnp, int *runp) { int error; bool mpsafe; struct vop_bmap_args a; struct mount *mp; a.a_desc = VDESC(vop_bmap); a.a_vp = vp; a.a_bn = bn; a.a_vpp = vpp; a.a_bnp = bnp; a.a_runp = runp; error = vop_pre(vp, &mp, &mpsafe, FST_YES); if (error) return error; error = (VCALL(vp, VOFFSET(vop_bmap), &a)); vop_post(vp, mp, mpsafe, FST_YES); return error; } const int vop_strategy_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_strategy_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_strategy_desc = { VOP_STRATEGY_DESCOFFSET, "vop_strategy", 0, vop_strategy_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_STRATEGY(struct vnode *vp, struct buf *bp) { int error; bool mpsafe; struct vop_strategy_args a; struct mount *mp; a.a_desc = VDESC(vop_strategy); a.a_vp = vp; a.a_bp = bp; error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_strategy), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_print_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_print_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_print_desc = { VOP_PRINT_DESCOFFSET, "vop_print", 0, vop_print_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_PRINT(struct vnode *vp) { int error; bool mpsafe; struct vop_print_args a; struct mount *mp; a.a_desc = VDESC(vop_print); a.a_vp = vp; error = vop_pre(vp, &mp, &mpsafe, FST_YES); if (error) return error; error = (VCALL(vp, VOFFSET(vop_print), &a)); vop_post(vp, mp, mpsafe, FST_YES); return error; } const int vop_islocked_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_islocked_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_islocked_desc = { VOP_ISLOCKED_DESCOFFSET, "vop_islocked", 0, vop_islocked_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_ISLOCKED(struct vnode *vp) { int error; bool mpsafe; struct vop_islocked_args a; struct mount *mp; a.a_desc = VDESC(vop_islocked); a.a_vp = vp; error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_islocked), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_pathconf_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_pathconf_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_pathconf_desc = { VOP_PATHCONF_DESCOFFSET, "vop_pathconf", 0, vop_pathconf_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_PATHCONF(struct vnode *vp, int name, register_t *retval) { int error; bool mpsafe; struct vop_pathconf_args a; struct 
mount *mp; a.a_desc = VDESC(vop_pathconf); a.a_vp = vp; a.a_name = name; a.a_retval = retval; assert_vop_locked(vp, "vop_pathconf: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_pathconf), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_advlock_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_advlock_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_advlock_desc = { VOP_ADVLOCK_DESCOFFSET, "vop_advlock", 0, vop_advlock_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_ADVLOCK(struct vnode *vp, void *id, int op, struct flock *fl, int flags) { int error; bool mpsafe; struct vop_advlock_args a; struct mount *mp; a.a_desc = VDESC(vop_advlock); a.a_vp = vp; a.a_id = id; a.a_op = op; a.a_fl = fl; a.a_flags = flags; assert_vop_unlocked(vp, "vop_advlock: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_advlock), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_whiteout_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_whiteout_args,a_dvp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_whiteout_desc = { VOP_WHITEOUT_DESCOFFSET, "vop_whiteout", 0, vop_whiteout_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_whiteout_args, a_cnp), }; int VOP_WHITEOUT(struct vnode *dvp, struct componentname *cnp, int flags) { int error; bool mpsafe; struct vop_whiteout_args a; struct mount *mp; a.a_desc = VDESC(vop_whiteout); a.a_dvp = dvp; a.a_cnp = cnp; a.a_flags = flags; assert_vop_elocked(dvp, "vop_whiteout: dvp"); error = vop_pre(dvp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(dvp, VOFFSET(vop_whiteout), &a)); vop_post(dvp, mp, mpsafe, FST_NO); return error; } const int vop_getpages_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_getpages_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_getpages_desc = { VOP_GETPAGES_DESCOFFSET, "vop_getpages", 0, vop_getpages_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_GETPAGES(struct vnode *vp, voff_t offset, struct vm_page **m, int *count, int centeridx, vm_prot_t access_type, int advice, int flags) { int error; bool mpsafe; struct vop_getpages_args a; struct mount *mp; a.a_desc = VDESC(vop_getpages); a.a_vp = vp; a.a_offset = offset; a.a_m = m; a.a_count = count; a.a_centeridx = centeridx; a.a_access_type = access_type; a.a_advice = advice; a.a_flags = flags; error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_getpages), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_putpages_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_putpages_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_putpages_desc = { VOP_PUTPAGES_DESCOFFSET, "vop_putpages", 0, vop_putpages_vp_offsets, VDESC_NO_OFFSET, VDESC_NO_OFFSET, VDESC_NO_OFFSET, }; int VOP_PUTPAGES(struct vnode *vp, voff_t offlo, voff_t offhi, int flags) { int error; bool mpsafe; struct vop_putpages_args a; struct mount *mp; a.a_desc = VDESC(vop_putpages); a.a_vp = vp; a.a_offlo = offlo; a.a_offhi = offhi; a.a_flags = flags; error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_putpages), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_getacl_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_getacl_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_getacl_desc = { VOP_GETACL_DESCOFFSET, "vop_getacl", 0, vop_getacl_vp_offsets, 
VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_getacl_args, a_cred), VDESC_NO_OFFSET, }; int VOP_GETACL(struct vnode *vp, acl_type_t type, struct acl *aclp, kauth_cred_t cred) { int error; bool mpsafe; struct vop_getacl_args a; struct mount *mp; a.a_desc = VDESC(vop_getacl); a.a_vp = vp; a.a_type = type; a.a_aclp = aclp; a.a_cred = cred; error = vop_pre(vp, &mp, &mpsafe, FST_YES); if (error) return error; error = (VCALL(vp, VOFFSET(vop_getacl), &a)); vop_post(vp, mp, mpsafe, FST_YES); return error; } const int vop_setacl_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_setacl_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_setacl_desc = { VOP_SETACL_DESCOFFSET, "vop_setacl", 0, vop_setacl_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_setacl_args, a_cred), VDESC_NO_OFFSET, }; int VOP_SETACL(struct vnode *vp, acl_type_t type, struct acl *aclp, kauth_cred_t cred) { int error; bool mpsafe; struct vop_setacl_args a; struct mount *mp; a.a_desc = VDESC(vop_setacl); a.a_vp = vp; a.a_type = type; a.a_aclp = aclp; a.a_cred = cred; assert_vop_elocked(vp, "vop_setacl: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_setacl), &a)); vop_post(vp, mp, mpsafe, FST_NO); vop_setacl_post(&a, error); return error; } const int vop_aclcheck_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_aclcheck_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_aclcheck_desc = { VOP_ACLCHECK_DESCOFFSET, "vop_aclcheck", 0, vop_aclcheck_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_aclcheck_args, a_cred), VDESC_NO_OFFSET, }; int VOP_ACLCHECK(struct vnode *vp, acl_type_t type, struct acl *aclp, kauth_cred_t cred) { int error; bool mpsafe; struct vop_aclcheck_args a; struct mount *mp; a.a_desc = VDESC(vop_aclcheck); a.a_vp = vp; a.a_type = type; a.a_aclp = aclp; a.a_cred = cred; error = vop_pre(vp, &mp, &mpsafe, FST_YES); if (error) return error; error = (VCALL(vp, VOFFSET(vop_aclcheck), &a)); vop_post(vp, mp, mpsafe, FST_YES); return error; } const int vop_closeextattr_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_closeextattr_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_closeextattr_desc = { VOP_CLOSEEXTATTR_DESCOFFSET, "vop_closeextattr", 0, vop_closeextattr_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_closeextattr_args, a_cred), VDESC_NO_OFFSET, }; int VOP_CLOSEEXTATTR(struct vnode *vp, int commit, kauth_cred_t cred) { int error; bool mpsafe; struct vop_closeextattr_args a; struct mount *mp; a.a_desc = VDESC(vop_closeextattr); a.a_vp = vp; a.a_commit = commit; a.a_cred = cred; assert_vop_locked(vp, "vop_closeextattr: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_closeextattr), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_getextattr_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_getextattr_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_getextattr_desc = { VOP_GETEXTATTR_DESCOFFSET, "vop_getextattr", 0, vop_getextattr_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_getextattr_args, a_cred), VDESC_NO_OFFSET, }; int VOP_GETEXTATTR(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, size_t *size, kauth_cred_t cred) { int error; bool mpsafe; struct vop_getextattr_args a; struct mount *mp; a.a_desc = VDESC(vop_getextattr); a.a_vp = vp; a.a_attrnamespace = attrnamespace; a.a_name = name; a.a_uio = uio; a.a_size = size; a.a_cred = cred; assert_vop_locked(vp, "vop_getextattr: vp"); error = vop_pre(vp, &mp, 
&mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_getextattr), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_listextattr_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_listextattr_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_listextattr_desc = { VOP_LISTEXTATTR_DESCOFFSET, "vop_listextattr", 0, vop_listextattr_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_listextattr_args, a_cred), VDESC_NO_OFFSET, }; int VOP_LISTEXTATTR(struct vnode *vp, int attrnamespace, struct uio *uio, size_t *size, int flag, kauth_cred_t cred) { int error; bool mpsafe; struct vop_listextattr_args a; struct mount *mp; a.a_desc = VDESC(vop_listextattr); a.a_vp = vp; a.a_attrnamespace = attrnamespace; a.a_uio = uio; a.a_size = size; a.a_flag = flag; a.a_cred = cred; assert_vop_locked(vp, "vop_listextattr: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_listextattr), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_openextattr_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_openextattr_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_openextattr_desc = { VOP_OPENEXTATTR_DESCOFFSET, "vop_openextattr", 0, vop_openextattr_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_openextattr_args, a_cred), VDESC_NO_OFFSET, }; int VOP_OPENEXTATTR(struct vnode *vp, kauth_cred_t cred) { int error; bool mpsafe; struct vop_openextattr_args a; struct mount *mp; a.a_desc = VDESC(vop_openextattr); a.a_vp = vp; a.a_cred = cred; assert_vop_locked(vp, "vop_openextattr: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_openextattr), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_deleteextattr_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_deleteextattr_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_deleteextattr_desc = { VOP_DELETEEXTATTR_DESCOFFSET, "vop_deleteextattr", 0, vop_deleteextattr_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_deleteextattr_args, a_cred), VDESC_NO_OFFSET, }; int VOP_DELETEEXTATTR(struct vnode *vp, int attrnamespace, const char *name, kauth_cred_t cred) { int error; bool mpsafe; struct vop_deleteextattr_args a; struct mount *mp; a.a_desc = VDESC(vop_deleteextattr); a.a_vp = vp; a.a_attrnamespace = attrnamespace; a.a_name = name; a.a_cred = cred; assert_vop_elocked(vp, "vop_deleteextattr: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_deleteextattr), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const int vop_setextattr_vp_offsets[] = { VOPARG_OFFSETOF(struct vop_setextattr_args,a_vp), VDESC_NO_OFFSET }; const struct vnodeop_desc vop_setextattr_desc = { VOP_SETEXTATTR_DESCOFFSET, "vop_setextattr", 0, vop_setextattr_vp_offsets, VDESC_NO_OFFSET, VOPARG_OFFSETOF(struct vop_setextattr_args, a_cred), VDESC_NO_OFFSET, }; int VOP_SETEXTATTR(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, kauth_cred_t cred) { int error; bool mpsafe; struct vop_setextattr_args a; struct mount *mp; a.a_desc = VDESC(vop_setextattr); a.a_vp = vp; a.a_attrnamespace = attrnamespace; a.a_name = name; a.a_uio = uio; a.a_cred = cred; assert_vop_elocked(vp, "vop_setextattr: vp"); error = vop_pre(vp, &mp, &mpsafe, FST_NO); if (error) return error; error = (VCALL(vp, VOFFSET(vop_setextattr), &a)); vop_post(vp, mp, mpsafe, FST_NO); return error; } const struct vnodeop_desc * const vfs_op_descs[] = { 
&vop_default_desc, /* MUST BE FIRST */ &vop_bwrite_desc, &vop_parsepath_desc, &vop_lookup_desc, &vop_create_desc, &vop_mknod_desc, &vop_open_desc, &vop_close_desc, &vop_access_desc, &vop_accessx_desc, &vop_getattr_desc, &vop_setattr_desc, &vop_read_desc, &vop_write_desc, &vop_fallocate_desc, &vop_fdiscard_desc, &vop_ioctl_desc, &vop_fcntl_desc, &vop_poll_desc, &vop_kqfilter_desc, &vop_revoke_desc, &vop_mmap_desc, &vop_fsync_desc, &vop_seek_desc, &vop_remove_desc, &vop_link_desc, &vop_rename_desc, &vop_mkdir_desc, &vop_rmdir_desc, &vop_symlink_desc, &vop_readdir_desc, &vop_readlink_desc, &vop_abortop_desc, &vop_inactive_desc, &vop_reclaim_desc, &vop_lock_desc, &vop_unlock_desc, &vop_bmap_desc, &vop_strategy_desc, &vop_print_desc, &vop_islocked_desc, &vop_pathconf_desc, &vop_advlock_desc, &vop_whiteout_desc, &vop_getpages_desc, &vop_putpages_desc, &vop_getacl_desc, &vop_setacl_desc, &vop_aclcheck_desc, &vop_closeextattr_desc, &vop_getextattr_desc, &vop_listextattr_desc, &vop_openextattr_desc, &vop_deleteextattr_desc, &vop_setextattr_desc, NULL };
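/*
 * How a file system consumes the descriptor table above -- a hedged
 * sketch, not part of the generated file.  "examplefs" and its handlers
 * are hypothetical; vnodeopv_entry_desc/vnodeopv_desc, vn_default_error
 * and the genfs_* helpers are the standard <sys/vnode.h>/genfs facilities
 * that real file systems pair with these descriptors, and the op vector
 * examplefs_vnodeop_p is assumed to be filled from this table at attach
 * time.
 */
#if 0	/* illustrative only */
int (**examplefs_vnodeop_p)(void *);

const struct vnodeopv_entry_desc examplefs_vnodeop_entries[] = {
	{ &vop_default_desc, vn_default_error },	/* default for unlisted ops */
	{ &vop_lookup_desc, examplefs_lookup },
	{ &vop_open_desc, examplefs_open },
	{ &vop_close_desc, examplefs_close },
	{ &vop_getattr_desc, examplefs_getattr },
	{ &vop_read_desc, examplefs_read },
	{ &vop_write_desc, examplefs_write },
	{ &vop_fsync_desc, examplefs_fsync },
	{ &vop_inactive_desc, examplefs_inactive },
	{ &vop_reclaim_desc, examplefs_reclaim },
	{ &vop_lock_desc, genfs_lock },		/* generic locking */
	{ &vop_unlock_desc, genfs_unlock },
	{ &vop_islocked_desc, genfs_islocked },
	{ NULL, NULL }
};

const struct vnodeopv_desc examplefs_vnodeop_opv_desc =
	{ &examplefs_vnodeop_p, examplefs_vnodeop_entries };
#endif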
/* $NetBSD: nd.c,v 1.5 2022/11/19 08:00:51 yamt Exp $ */ /* * Copyright (c) 2020 The NetBSD Foundation, Inc. * * This code is derived from software contributed to The NetBSD Foundation * by Roy Marples. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: nd.c,v 1.5 2022/11/19 08:00:51 yamt Exp $"); #include <sys/callout.h> #include <sys/mbuf.h> #include <sys/socketvar.h> /* for softnet_lock */ #include <net/if_llatbl.h> #include <net/nd.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/ip6.h> static struct nd_domain *nd_domains[AF_MAX]; static int nd_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */ static void nd_set_timertick(struct llentry *, time_t); static struct nd_domain *nd_find_domain(int); static void nd_timer(void *arg) { struct llentry *ln = arg; struct nd_domain *nd; struct ifnet *ifp = NULL; struct psref psref; struct mbuf *m = NULL; bool send_ns = false; int16_t missed = ND_LLINFO_NOSTATE; union l3addr taddr, *daddrp = NULL; SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE(); LLE_WLOCK(ln); if (!(ln->la_flags & LLE_LINKED)) goto out; if (ln->ln_ntick > 0) { nd_set_timer(ln, ND_TIMER_TICK); goto out; } nd = nd_find_domain(ln->lle_tbl->llt_af); ifp = ln->lle_tbl->llt_ifp; KASSERT(ifp != NULL); if_acquire(ifp, &psref); memcpy(&taddr, &ln->r_l3addr, sizeof(taddr)); switch (ln->ln_state) { case ND_LLINFO_WAITDELETE: LLE_REMREF(ln); nd->nd_free(ln, 0); ln = NULL; break; case ND_LLINFO_INCOMPLETE: send_ns = true; if (ln->ln_asked++ < nd->nd_mmaxtries) break; if (ln->ln_hold) { struct mbuf *m0, *mnxt; /* * Assuming every packet in ln_hold * has the same IP header. */ m = ln->ln_hold; for (m0 = m->m_nextpkt; m0 != NULL; m0 = mnxt) { mnxt = m0->m_nextpkt; m0->m_nextpkt = NULL; m_freem(m0); } m->m_nextpkt = NULL; ln->ln_hold = NULL; } missed = ND_LLINFO_INCOMPLETE; ln->ln_state = ND_LLINFO_WAITDELETE; break; case ND_LLINFO_REACHABLE: if (!ND_IS_LLINFO_PERMANENT(ln)) { ln->ln_state = ND_LLINFO_STALE; nd_set_timer(ln, ND_TIMER_GC); } break; case ND_LLINFO_PURGE: /* FALLTHROUGH */ case ND_LLINFO_STALE: if (!ND_IS_LLINFO_PERMANENT(ln)) { LLE_REMREF(ln); nd->nd_free(ln, 1); ln = NULL; } break; case ND_LLINFO_DELAY: if (nd->nd_nud_enabled(ifp)) { ln->ln_asked = 1; ln->ln_state = ND_LLINFO_PROBE; send_ns = true; daddrp = &taddr; } else { ln->ln_state = ND_LLINFO_STALE; nd_set_timer(ln, ND_TIMER_GC); } break; case ND_LLINFO_PROBE: send_ns = true; if (ln->ln_asked++ < nd->nd_umaxtries) { daddrp = &taddr; } else { ln->ln_state = ND_LLINFO_UNREACHABLE; ln->ln_asked = 1; missed = ND_LLINFO_PROBE; /* nd_missed() consumers can use missed to know if * they need to send ICMP UNREACHABLE or not. */ } break; case ND_LLINFO_UNREACHABLE: /* * RFC 7048 Section 3 says in the UNREACHABLE state * packets continue to be sent to the link-layer address and * then backoff exponentially. * We adjust this slightly and move to the INCOMPLETE state * after nd_mmaxtries probes and then start backing off. * * This results in simpler code whilst providing a more robust * model which doubles the time to failure over what we did * before. We don't want to be back to the old ARP model where * no unreachability errors are returned because very * few applications would look at unreachability hints provided * such as ND_LLINFO_UNREACHABLE or RTM_MISS. 
*/ send_ns = true; if (ln->ln_asked++ < nd->nd_mmaxtries) break; missed = ND_LLINFO_UNREACHABLE; ln->ln_state = ND_LLINFO_WAITDELETE; ln->la_flags &= ~LLE_VALID; break; } if (send_ns) { uint8_t lladdr[255], *lladdrp; union l3addr src, *psrc; if (ln->ln_state == ND_LLINFO_WAITDELETE) nd_set_timer(ln, ND_TIMER_RETRANS_BACKOFF); else nd_set_timer(ln, ND_TIMER_RETRANS); if (ln->ln_state > ND_LLINFO_INCOMPLETE && ln->la_flags & LLE_VALID) { KASSERT(sizeof(lladdr) >= ifp->if_addrlen); memcpy(lladdr, &ln->ll_addr, ifp->if_addrlen); lladdrp = lladdr; } else lladdrp = NULL; psrc = nd->nd_holdsrc(ln, &src); LLE_FREE_LOCKED(ln); ln = NULL; nd->nd_output(ifp, daddrp, &taddr, lladdrp, psrc); } out: if (ln != NULL) LLE_FREE_LOCKED(ln); SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); if (missed != ND_LLINFO_NOSTATE) nd->nd_missed(ifp, &taddr, missed, m); if (ifp != NULL) if_release(ifp, &psref); } static void nd_set_timertick(struct llentry *ln, time_t xtick) { CTASSERT(sizeof(time_t) > sizeof(int)); KASSERT(xtick >= 0); /* * We have to take care of a reference leak which occurs if * callout_reset overwrites a pending callout schedule. Unfortunately * we don't have a mean to know the overwrite, so we need to know it * using callout_stop. We need to call callout_pending first to exclude * the case that the callout has never been scheduled. */ if (callout_pending(&ln->la_timer)) { bool expired; expired = callout_stop(&ln->la_timer); if (!expired) LLE_REMREF(ln); } ln->ln_expire = time_uptime + xtick / hz; LLE_ADDREF(ln); if (xtick > INT_MAX) { ln->ln_ntick = xtick - INT_MAX; xtick = INT_MAX; } else { ln->ln_ntick = 0; } callout_reset(&ln->ln_timer_ch, xtick, nd_timer, ln); } void nd_set_timer(struct llentry *ln, int type) { time_t xtick; struct ifnet *ifp; struct nd_domain *nd; LLE_WLOCK_ASSERT(ln); ifp = ln->lle_tbl->llt_ifp; nd = nd_find_domain(ln->lle_tbl->llt_af); switch (type) { case ND_TIMER_IMMEDIATE: xtick = 0; break; case ND_TIMER_TICK: xtick = ln->ln_ntick; break; case ND_TIMER_RETRANS: xtick = nd->nd_retrans(ifp) * hz / 1000; break; case ND_TIMER_RETRANS_BACKOFF: { unsigned int retrans = nd->nd_retrans(ifp); unsigned int attempts = ln->ln_asked - nd->nd_mmaxtries; xtick = retrans; while (attempts-- != 0) { xtick *= nd->nd_retransmultiple; if (xtick > nd->nd_maxretrans || xtick < retrans) { xtick = nd->nd_maxretrans; break; } } xtick = xtick * hz / 1000; break; } case ND_TIMER_REACHABLE: xtick = nd->nd_reachable(ifp) * hz / 1000; break; case ND_TIMER_EXPIRE: if (ln->ln_expire > time_uptime) xtick = (ln->ln_expire - time_uptime) * hz; else xtick = nd_gctimer * hz; break; case ND_TIMER_DELAY: xtick = nd->nd_delay * hz; break; case ND_TIMER_GC: xtick = nd_gctimer * hz; break; default: panic("%s: invalid timer type\n", __func__); } nd_set_timertick(ln, xtick); } int nd_resolve(struct llentry *ln, const struct rtentry *rt, struct mbuf *m, uint8_t *lldst, size_t dstsize) { struct ifnet *ifp; struct nd_domain *nd; int error; LLE_WLOCK_ASSERT(ln); ifp = ln->lle_tbl->llt_ifp; nd = nd_find_domain(ln->lle_tbl->llt_af); /* We don't have to do link-layer address resolution on a p2p link. */ if (ifp->if_flags & IFF_POINTOPOINT && ln->ln_state < ND_LLINFO_REACHABLE) { ln->ln_state = ND_LLINFO_STALE; nd_set_timer(ln, ND_TIMER_GC); } /* * The first time we send a packet to a neighbor whose entry is * STALE, we have to change the state to DELAY and a sets a timer to * expire in DELAY_FIRST_PROBE_TIME seconds to ensure do * neighbor unreachability detection on expiration. 
* (RFC 2461 7.3.3) */ if (ln->ln_state == ND_LLINFO_STALE) { ln->ln_asked = 0; ln->ln_state = ND_LLINFO_DELAY; nd_set_timer(ln, ND_TIMER_DELAY); } /* * If the neighbor cache entry has a state other than INCOMPLETE * (i.e. its link-layer address is already resolved), just * send the packet. */ if (ln->ln_state > ND_LLINFO_INCOMPLETE) { KASSERT((ln->la_flags & LLE_VALID) != 0); memcpy(lldst, &ln->ll_addr, MIN(dstsize, ifp->if_addrlen)); LLE_WUNLOCK(ln); return 0; } /* * There is a neighbor cache entry, but no ethernet address * response yet. Append this latest packet to the end of the * packet queue in the mbuf, unless the number of the packet * does not exceed maxqueuelen. When it exceeds maxqueuelen, * the oldest packet in the queue will be removed. */ if (ln->ln_state == ND_LLINFO_NOSTATE || ln->ln_state == ND_LLINFO_WAITDELETE) ln->ln_state = ND_LLINFO_INCOMPLETE; #ifdef MBUFTRACE m_claimm(m, ln->lle_tbl->llt_mowner); #endif if (ln->ln_hold != NULL) { struct mbuf *m_hold; int i; i = 0; for (m_hold = ln->ln_hold; m_hold; m_hold = m_hold->m_nextpkt) { i++; if (m_hold->m_nextpkt == NULL) { m_hold->m_nextpkt = m; break; } } while (i >= nd->nd_maxqueuelen) { m_hold = ln->ln_hold; ln->ln_hold = ln->ln_hold->m_nextpkt; m_freem(m_hold); i--; } } else ln->ln_hold = m; if (ln->ln_asked >= nd->nd_mmaxtries) error = (rt != NULL && rt->rt_flags & RTF_GATEWAY) ? EHOSTUNREACH : EHOSTDOWN; else error = EWOULDBLOCK; /* * If there has been no NS for the neighbor after entering the * INCOMPLETE state, send the first solicitation. */ if (!ND_IS_LLINFO_PERMANENT(ln) && ln->ln_asked == 0) { struct psref psref; union l3addr dst, src, *psrc; ln->ln_asked++; nd_set_timer(ln, ND_TIMER_RETRANS); memcpy(&dst, &ln->r_l3addr, sizeof(dst)); psrc = nd->nd_holdsrc(ln, &src); if_acquire(ifp, &psref); LLE_WUNLOCK(ln); nd->nd_output(ifp, NULL, &dst, NULL, psrc); if_release(ifp, &psref); } else LLE_WUNLOCK(ln); return error; } void nd_nud_hint(struct llentry *ln) { struct nd_domain *nd; if (ln == NULL) return; LLE_WLOCK_ASSERT(ln); if (ln->ln_state < ND_LLINFO_REACHABLE) goto done; nd = nd_find_domain(ln->lle_tbl->llt_af); /* * if we get upper-layer reachability confirmation many times, * it is possible we have false information. */ ln->ln_byhint++; if (ln->ln_byhint > nd->nd_maxnudhint) goto done; ln->ln_state = ND_LLINFO_REACHABLE; if (!ND_IS_LLINFO_PERMANENT(ln)) nd_set_timer(ln, ND_TIMER_REACHABLE); done: LLE_WUNLOCK(ln); return; } static struct nd_domain * nd_find_domain(int af) { KASSERT(af < __arraycount(nd_domains) && nd_domains[af] != NULL); return nd_domains[af]; } void nd_attach_domain(struct nd_domain *nd) { KASSERT(nd->nd_family < __arraycount(nd_domains)); nd_domains[nd->nd_family] = nd; }
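/*
 * Illustrative sketch (not part of nd.c, guarded out like the sample
 * copypasta used elsewhere in this tree): the exponential backoff that
 * nd_set_timer() computes for ND_TIMER_RETRANS_BACKOFF, restated as a
 * standalone helper.  The name nd_backoff_ms_sketch and its parameters
 * are hypothetical; in the kernel the inputs come from nd_retrans(ifp),
 * nd->nd_retransmultiple, nd->nd_maxretrans and ln_asked - nd_mmaxtries,
 * and the result is then scaled by hz / 1000 into callout ticks.
 */
#if 0
static unsigned int
nd_backoff_ms_sketch(unsigned int retrans_ms, unsigned int multiple,
    unsigned int max_ms, unsigned int attempts)
{
	unsigned int ms = retrans_ms;

	/* Multiply once per probe beyond the initial nd_mmaxtries tries. */
	while (attempts-- != 0) {
		ms *= multiple;
		/* Clamp at the configured maximum, or on unsigned wrap. */
		if (ms > max_ms || ms < retrans_ms) {
			ms = max_ms;
			break;
		}
	}
	return ms;
}
#endif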
/* $NetBSD: ddp_usrreq.c,v 1.76 2022/09/03 01:48:22 thorpej Exp $ */ /* * Copyright (c) 1990,1991 Regents of The University of Michigan. * All Rights Reserved. * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby granted, * provided that the above copyright notice appears in all copies and * that both that copyright notice and this permission notice appear * in supporting documentation, and that the name of The University * of Michigan not be used in advertising or publicity pertaining to * distribution of the software without specific, written prior * permission. This software is supplied as is without expressed or * implied warranties of any kind. 
* * This product includes software developed by the University of * California, Berkeley and its contributors. * * Research Systems Unix Group * The University of Michigan * c/o Wesley Craig * 535 W. William Street * Ann Arbor, Michigan * +1-313-764-2278 * netatalk@umich.edu */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ddp_usrreq.c,v 1.76 2022/09/03 01:48:22 thorpej Exp $"); #include "opt_mbuftrace.h" #include "opt_atalk.h" #include <sys/param.h> #include <sys/errno.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/ioctl.h> #include <sys/queue.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/protosw.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/sysctl.h> #include <net/if.h> #include <net/route.h> #include <net/if_ether.h> #include <net/net_stats.h> #include <netinet/in.h> #include <netatalk/at.h> #include <netatalk/at_var.h> #include <netatalk/ddp_var.h> #include <netatalk/ddp_private.h> #include <netatalk/aarp.h> #include <netatalk/at_extern.h> static void at_pcbdisconnect(struct ddpcb *); static void at_sockaddr(struct ddpcb *, struct sockaddr_at *); static int at_pcbsetaddr(struct ddpcb *, struct sockaddr_at *); static int at_pcbconnect(struct ddpcb *, struct sockaddr_at *); static void ddp_detach(struct socket *); pktqueue_t * at_pktq1 __read_mostly; pktqueue_t * at_pktq2 __read_mostly; struct ddpcb *ddp_ports[ATPORT_LAST]; struct ddpcb *ddpcb = NULL; percpu_t *ddpstat_percpu; struct at_ifaddrhead at_ifaddr; /* Here as inited in this file */ u_long ddp_sendspace = DDP_MAXSZ; /* Max ddp size + 1 (ddp_type) */ u_long ddp_recvspace = 25 * (587 + sizeof(struct sockaddr_at)); #ifdef MBUFTRACE struct mowner atalk_rx_mowner = MOWNER_INIT("atalk", "rx"); struct mowner atalk_tx_mowner = MOWNER_INIT("atalk", "tx"); #endif static void at_sockaddr(struct ddpcb *ddp, struct sockaddr_at *addr) { *addr = ddp->ddp_lsat; } static int at_pcbsetaddr(struct ddpcb *ddp, struct sockaddr_at *sat) { struct sockaddr_at lsat; struct at_ifaddr *aa; struct ddpcb *ddpp; if (ddp->ddp_lsat.sat_port != ATADDR_ANYPORT) { /* shouldn't be bound */ return (EINVAL); } if (NULL != sat) { /* validate passed address */ if (sat->sat_family != AF_APPLETALK) return (EAFNOSUPPORT); if (sat->sat_len != sizeof(*sat)) return EINVAL; if (sat->sat_addr.s_node != ATADDR_ANYNODE || sat->sat_addr.s_net != ATADDR_ANYNET) { TAILQ_FOREACH(aa, &at_ifaddr, aa_list) { if ((sat->sat_addr.s_net == AA_SAT(aa)->sat_addr.s_net) && (sat->sat_addr.s_node == AA_SAT(aa)->sat_addr.s_node)) break; } if (!aa) return (EADDRNOTAVAIL); } if (sat->sat_port != ATADDR_ANYPORT) { int error; if (sat->sat_port < ATPORT_FIRST || sat->sat_port >= ATPORT_LAST) return (EINVAL); if (sat->sat_port < ATPORT_RESERVED && (error = kauth_authorize_network( kauth_cred_get(), KAUTH_NETWORK_BIND, KAUTH_REQ_NETWORK_BIND_PRIVPORT, ddpcb->ddp_socket, sat, NULL)) != 0) return (error); } } else { memset((void *) & lsat, 0, sizeof(struct sockaddr_at)); lsat.sat_len = sizeof(struct sockaddr_at); lsat.sat_addr.s_node = ATADDR_ANYNODE; lsat.sat_addr.s_net = ATADDR_ANYNET; lsat.sat_family = AF_APPLETALK; sat = &lsat; } if (sat->sat_addr.s_node == ATADDR_ANYNODE && sat->sat_addr.s_net == ATADDR_ANYNET) { if (TAILQ_EMPTY(&at_ifaddr)) return EADDRNOTAVAIL; sat->sat_addr = AA_SAT(TAILQ_FIRST(&at_ifaddr))->sat_addr; } ddp->ddp_lsat = *sat; /* * Choose port. 
*/ if (sat->sat_port == ATADDR_ANYPORT) { for (sat->sat_port = ATPORT_RESERVED; sat->sat_port < ATPORT_LAST; sat->sat_port++) { if (ddp_ports[sat->sat_port - 1] == 0) break; } if (sat->sat_port == ATPORT_LAST) { return (EADDRNOTAVAIL); } ddp->ddp_lsat.sat_port = sat->sat_port; ddp_ports[sat->sat_port - 1] = ddp; } else { for (ddpp = ddp_ports[sat->sat_port - 1]; ddpp; ddpp = ddpp->ddp_pnext) { if (ddpp->ddp_lsat.sat_addr.s_net == sat->sat_addr.s_net && ddpp->ddp_lsat.sat_addr.s_node == sat->sat_addr.s_node) break; } if (ddpp != NULL) return (EADDRINUSE); ddp->ddp_pnext = ddp_ports[sat->sat_port - 1]; ddp_ports[sat->sat_port - 1] = ddp; if (ddp->ddp_pnext) ddp->ddp_pnext->ddp_pprev = ddp; } return 0; } static int at_pcbconnect(struct ddpcb *ddp, struct sockaddr_at *sat) { struct rtentry *rt; const struct sockaddr_at *cdst; struct route *ro; struct at_ifaddr *aa; struct ifnet *ifp; u_short hintnet = 0, net; if (sat->sat_family != AF_APPLETALK) return EAFNOSUPPORT; if (sat->sat_len != sizeof(*sat)) return EINVAL; /* * Under phase 2, network 0 means "the network". We take "the * network" to mean the network the control block is bound to. * If the control block is not bound, there is an error. */ if (sat->sat_addr.s_net == ATADDR_ANYNET && sat->sat_addr.s_node != ATADDR_ANYNODE) { if (ddp->ddp_lsat.sat_port == ATADDR_ANYPORT) { return EADDRNOTAVAIL; } hintnet = ddp->ddp_lsat.sat_addr.s_net; } ro = &ddp->ddp_route; /* * If we've got an old route for this pcb, check that it is valid. * If we've changed our address, we may have an old "good looking" * route here. Attempt to detect it. */ if ((rt = rtcache_validate(ro)) != NULL || (rt = rtcache_update(ro, 1)) != NULL) { if (hintnet) { net = hintnet; } else { net = sat->sat_addr.s_net; } if ((ifp = rt->rt_ifp) != NULL) { TAILQ_FOREACH(aa, &at_ifaddr, aa_list) { if (aa->aa_ifp == ifp && ntohs(net) >= ntohs(aa->aa_firstnet) && ntohs(net) <= ntohs(aa->aa_lastnet)) { break; } } } else aa = NULL; cdst = satocsat(rtcache_getdst(ro)); if (aa == NULL || (cdst->sat_addr.s_net != (hintnet ? hintnet : sat->sat_addr.s_net) || cdst->sat_addr.s_node != sat->sat_addr.s_node)) { rtcache_unref(rt, ro); rtcache_free(ro); rt = NULL; } } /* * If we've got no route for this interface, try to find one. */ if (rt == NULL) { union { struct sockaddr dst; struct sockaddr_at dsta; } u; sockaddr_at_init(&u.dsta, &sat->sat_addr, 0); if (hintnet) u.dsta.sat_addr.s_net = hintnet; rt = rtcache_lookup(ro, &u.dst); } /* * Make sure any route that we have has a valid interface. 
*/ if (rt != NULL && (ifp = rt->rt_ifp) != NULL) { TAILQ_FOREACH(aa, &at_ifaddr, aa_list) { if (aa->aa_ifp == ifp) break; } } else aa = NULL; rtcache_unref(rt, ro); if (aa == NULL) return ENETUNREACH; ddp->ddp_fsat = *sat; if (ddp->ddp_lsat.sat_port == ATADDR_ANYPORT) return at_pcbsetaddr(ddp, NULL); return 0; } static void at_pcbdisconnect(struct ddpcb *ddp) { ddp->ddp_fsat.sat_addr.s_net = ATADDR_ANYNET; ddp->ddp_fsat.sat_addr.s_node = ATADDR_ANYNODE; ddp->ddp_fsat.sat_port = ATADDR_ANYPORT; } static int ddp_attach(struct socket *so, int proto) { struct ddpcb *ddp; int error; KASSERT(sotoddpcb(so) == NULL); sosetlock(so); #ifdef MBUFTRACE so->so_rcv.sb_mowner = &atalk_rx_mowner; so->so_snd.sb_mowner = &atalk_tx_mowner; #endif error = soreserve(so, ddp_sendspace, ddp_recvspace); if (error) { return error; } ddp = kmem_zalloc(sizeof(*ddp), KM_SLEEP); ddp->ddp_lsat.sat_port = ATADDR_ANYPORT; ddp->ddp_next = ddpcb; ddp->ddp_prev = NULL; ddp->ddp_pprev = NULL; ddp->ddp_pnext = NULL; if (ddpcb) { ddpcb->ddp_prev = ddp; } ddpcb = ddp; ddp->ddp_socket = so; so->so_pcb = ddp; return 0; } static void ddp_detach(struct socket *so) { struct ddpcb *ddp = sotoddpcb(so); soisdisconnected(so); so->so_pcb = NULL; /* sofree drops the lock */ sofree(so); mutex_enter(softnet_lock); /* remove ddp from ddp_ports list */ if (ddp->ddp_lsat.sat_port != ATADDR_ANYPORT && ddp_ports[ddp->ddp_lsat.sat_port - 1] != NULL) { if (ddp->ddp_pprev != NULL) { ddp->ddp_pprev->ddp_pnext = ddp->ddp_pnext; } else { ddp_ports[ddp->ddp_lsat.sat_port - 1] = ddp->ddp_pnext; } if (ddp->ddp_pnext != NULL) { ddp->ddp_pnext->ddp_pprev = ddp->ddp_pprev; } } rtcache_free(&ddp->ddp_route); if (ddp->ddp_prev) { ddp->ddp_prev->ddp_next = ddp->ddp_next; } else { ddpcb = ddp->ddp_next; } if (ddp->ddp_next) { ddp->ddp_next->ddp_prev = ddp->ddp_prev; } kmem_free(ddp, sizeof(*ddp)); } static int ddp_accept(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int ddp_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { KASSERT(solocked(so)); KASSERT(sotoddpcb(so) != NULL); return at_pcbsetaddr(sotoddpcb(so), (struct sockaddr_at *)nam); } static int ddp_listen(struct socket *so, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int ddp_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct ddpcb *ddp = sotoddpcb(so); int error = 0; KASSERT(solocked(so)); KASSERT(ddp != NULL); KASSERT(nam != NULL); if (ddp->ddp_fsat.sat_port != ATADDR_ANYPORT) return EISCONN; error = at_pcbconnect(ddp, (struct sockaddr_at *)nam); if (error == 0) soisconnected(so); return error; } static int ddp_connect2(struct socket *so, struct socket *so2) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int ddp_disconnect(struct socket *so) { struct ddpcb *ddp = sotoddpcb(so); KASSERT(solocked(so)); KASSERT(ddp != NULL); if (ddp->ddp_fsat.sat_addr.s_node == ATADDR_ANYNODE) return ENOTCONN; at_pcbdisconnect(ddp); soisdisconnected(so); return 0; } static int ddp_shutdown(struct socket *so) { KASSERT(solocked(so)); socantsendmore(so); return 0; } static int ddp_abort(struct socket *so) { KASSERT(solocked(so)); soisdisconnected(so); ddp_detach(so); return 0; } static int ddp_ioctl(struct socket *so, u_long cmd, void *addr, struct ifnet *ifp) { return at_control(cmd, addr, ifp); } static int ddp_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); /* stat: don't bother with a blocksize. 
*/ return 0; } static int ddp_peeraddr(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int ddp_sockaddr(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); KASSERT(sotoddpcb(so) != NULL); KASSERT(nam != NULL); at_sockaddr(sotoddpcb(so), (struct sockaddr_at *)nam); return 0; } static int ddp_rcvd(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int ddp_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int ddp_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct ddpcb *ddp = sotoddpcb(so); int error = 0; int s = 0; /* XXX gcc 4.8 warns on sgimips */ KASSERT(solocked(so)); KASSERT(ddp != NULL); if (nam) { if (ddp->ddp_fsat.sat_port != ATADDR_ANYPORT) return EISCONN; s = splnet(); error = at_pcbconnect(ddp, (struct sockaddr_at *)nam); if (error) { splx(s); return error; } } else { if (ddp->ddp_fsat.sat_port == ATADDR_ANYPORT) return ENOTCONN; } error = ddp_output(m, ddp); m = NULL; if (nam) { at_pcbdisconnect(ddp); splx(s); } return error; } static int ddp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int ddp_purgeif(struct socket *so, struct ifnet *ifp) { mutex_enter(softnet_lock); at_purgeif(ifp); mutex_exit(softnet_lock); return 0; } /* * For the moment, this just find the pcb with the correct local address. * In the future, this will actually do some real searching, so we can use * the sender's address to do de-multiplexing on a single port to many * sockets (pcbs). */ struct ddpcb * ddp_search( struct sockaddr_at *from, struct sockaddr_at *to, struct at_ifaddr *aa) { struct ddpcb *ddp; /* * Check for bad ports. */ if (to->sat_port < ATPORT_FIRST || to->sat_port >= ATPORT_LAST) return NULL; /* * Make sure the local address matches the sent address. What about * the interface? */ for (ddp = ddp_ports[to->sat_port - 1]; ddp; ddp = ddp->ddp_pnext) { /* XXX should we handle 0.YY? 
*/ /* XXXX.YY to socket on destination interface */ if (to->sat_addr.s_net == ddp->ddp_lsat.sat_addr.s_net && to->sat_addr.s_node == ddp->ddp_lsat.sat_addr.s_node) { break; } /* 0.255 to socket on receiving interface */ if (to->sat_addr.s_node == ATADDR_BCAST && (to->sat_addr.s_net == 0 || to->sat_addr.s_net == ddp->ddp_lsat.sat_addr.s_net) && ddp->ddp_lsat.sat_addr.s_net == AA_SAT(aa)->sat_addr.s_net) { break; } /* XXXX.0 to socket on destination interface */ if (to->sat_addr.s_net == aa->aa_firstnet && to->sat_addr.s_node == 0 && ntohs(ddp->ddp_lsat.sat_addr.s_net) >= ntohs(aa->aa_firstnet) && ntohs(ddp->ddp_lsat.sat_addr.s_net) <= ntohs(aa->aa_lastnet)) { break; } } return (ddp); } /* * Initialize all the ddp & appletalk stuff */ void ddp_init(void) { ddpstat_percpu = percpu_alloc(sizeof(uint64_t) * DDP_NSTATS); TAILQ_INIT(&at_ifaddr); at_pktq1 = pktq_create(IFQ_MAXLEN, atintr1, NULL); KASSERT(at_pktq1 != NULL); at_pktq2 = pktq_create(IFQ_MAXLEN, atintr2, NULL); KASSERT(at_pktq2 != NULL); MOWNER_ATTACH(&atalk_tx_mowner); MOWNER_ATTACH(&atalk_rx_mowner); MOWNER_ATTACH(&aarp_mowner); } PR_WRAP_USRREQS(ddp) #define ddp_attach ddp_attach_wrapper #define ddp_detach ddp_detach_wrapper #define ddp_accept ddp_accept_wrapper #define ddp_bind ddp_bind_wrapper #define ddp_listen ddp_listen_wrapper #define ddp_connect ddp_connect_wrapper #define ddp_connect2 ddp_connect2_wrapper #define ddp_disconnect ddp_disconnect_wrapper #define ddp_shutdown ddp_shutdown_wrapper #define ddp_abort ddp_abort_wrapper #define ddp_ioctl ddp_ioctl_wrapper #define ddp_stat ddp_stat_wrapper #define ddp_peeraddr ddp_peeraddr_wrapper #define ddp_sockaddr ddp_sockaddr_wrapper #define ddp_rcvd ddp_rcvd_wrapper #define ddp_recvoob ddp_recvoob_wrapper #define ddp_send ddp_send_wrapper #define ddp_sendoob ddp_sendoob_wrapper #define ddp_purgeif ddp_purgeif_wrapper const struct pr_usrreqs ddp_usrreqs = { .pr_attach = ddp_attach, .pr_detach = ddp_detach, .pr_accept = ddp_accept, .pr_bind = ddp_bind, .pr_listen = ddp_listen, .pr_connect = ddp_connect, .pr_connect2 = ddp_connect2, .pr_disconnect = ddp_disconnect, .pr_shutdown = ddp_shutdown, .pr_abort = ddp_abort, .pr_ioctl = ddp_ioctl, .pr_stat = ddp_stat, .pr_peeraddr = ddp_peeraddr, .pr_sockaddr = ddp_sockaddr, .pr_rcvd = ddp_rcvd, .pr_recvoob = ddp_recvoob, .pr_send = ddp_send, .pr_sendoob = ddp_sendoob, .pr_purgeif = ddp_purgeif, }; static int sysctl_net_atalk_ddp_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(ddpstat_percpu, DDP_NSTATS)); } /* * Sysctl for DDP variables. */ SYSCTL_SETUP(sysctl_net_atalk_ddp_setup, "sysctl net.atalk.ddp subtree setup") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "atalk", NULL, NULL, 0, NULL, 0, CTL_NET, PF_APPLETALK, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ddp", SYSCTL_DESCR("DDP related settings"), NULL, 0, NULL, 0, CTL_NET, PF_APPLETALK, ATPROTO_DDP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("DDP statistics"), sysctl_net_atalk_ddp_stats, 0, NULL, 0, CTL_NET, PF_APPLETALK, ATPROTO_DDP, CTL_CREATE, CTL_EOL); }
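/*
 * Illustrative sketch (not part of ddp_usrreq.c): the dynamic port
 * selection performed by at_pcbsetaddr() when a socket binds with
 * ATADDR_ANYPORT.  The ddp_ports[] table is scanned from ATPORT_RESERVED
 * up to (but not including) ATPORT_LAST and the first empty slot wins;
 * port N lives at index N - 1.  The helper name ddp_pick_port_sketch is
 * hypothetical.
 */
#if 0
static int
ddp_pick_port_sketch(struct ddpcb **table)
{
	int port;

	for (port = ATPORT_RESERVED; port < ATPORT_LAST; port++) {
		if (table[port - 1] == NULL)
			return port;	/* first free dynamic port */
	}
	return 0;	/* none free: the real code returns EADDRNOTAVAIL */
}
#endif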
/* $NetBSD: genfs_rename.c,v 1.7 2021/10/20 13:29:06 thorpej Exp $ */ /*- * Copyright (c) 2012 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Generic rename abstraction. * * Rename is unbelievably hairy. Try to use this if you can -- * otherwise you are practically guaranteed to get it wrong. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: genfs_rename.c,v 1.7 2021/10/20 13:29:06 thorpej Exp $"); #include <sys/param.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/types.h> #include <miscfs/genfs/genfs.h> /* * Sample copypasta for implementing VOP_RENAME via genfs_rename. * Don't change this template without carefully considering whether * every other file system that already uses it needs to change too. * That way, once we have changed all the file systems to use it, we * can easily replace mumblefs_rename by mumblefs_sane_rename and * eliminate the insane API altogether. */ /* begin sample copypasta */ #if 0 static const struct genfs_rename_ops mumblefs_genfs_rename_ops; /* * mumblefs_sane_rename: The hairiest vop, with the saner API. * * Arguments: * * . fdvp (from directory vnode), * . fcnp (from component name), * . tdvp (to directory vnode), * . tcnp (to component name), * . cred (credentials structure), and * . posixly_correct (flag for behaviour if target & source link same file). * * fdvp and tdvp may be the same, and must be referenced and unlocked. */ static int mumblefs_sane_rename( struct vnode *fdvp, struct componentname *fcnp, struct vnode *tdvp, struct componentname *tcnp, kauth_cred_t cred, bool posixly_correct) { struct mumblefs_lookup_results fulr, tulr; return genfs_sane_rename(&mumblefs_genfs_rename_ops, fdvp, fcnp, &fulr, tdvp, tcnp, &tulr, cred, posixly_correct); } /* * mumblefs_rename: The hairiest vop, with the insanest API. Defer to * genfs_insane_rename immediately. 
*/ int mumblefs_rename(void *v) { return genfs_insane_rename(v, &mumblefs_sane_rename); } #endif /* end sample copypasta */ /* * Forward declarations */ static int genfs_rename_enter(const struct genfs_rename_ops *, struct mount *, kauth_cred_t, struct vnode *, struct componentname *, void *, struct vnode **, struct vnode *, struct componentname *, void *, struct vnode **); static int genfs_rename_enter_common(const struct genfs_rename_ops *, struct mount *, kauth_cred_t, struct vnode *, struct componentname *, void *, struct vnode **, struct componentname *, void *, struct vnode **); static int genfs_rename_enter_separate(const struct genfs_rename_ops *, struct mount *, kauth_cred_t, struct vnode *, struct componentname *, void *, struct vnode **, struct vnode *, struct componentname *, void *, struct vnode **); static int genfs_rename_lock(const struct genfs_rename_ops *, struct mount *, kauth_cred_t, int, int, int, struct vnode *, struct componentname *, bool, void *, struct vnode **, struct vnode *, struct componentname *, bool, void *, struct vnode **); static void genfs_rename_exit(const struct genfs_rename_ops *, struct mount *, struct vnode *, struct vnode *, struct vnode *, struct vnode *); static int genfs_rename_remove(const struct genfs_rename_ops *, struct mount *, kauth_cred_t, struct vnode *, struct componentname *, void *, struct vnode *, nlink_t *); /* * genfs_insane_rename: Generic implementation of the insane API for * the rename vop. * * Arguments: * * . fdvp (from directory vnode), * . fvp (from vnode), * . fcnp (from component name), * . tdvp (to directory vnode), * . tvp (to vnode, or NULL), and * . tcnp (to component name). * * Any pair of vnode parameters may have the same vnode. * * On entry, * * . fdvp, fvp, tdvp, and tvp are referenced, * . fdvp and fvp are unlocked, and * . tdvp and tvp (if nonnull) are locked. * * On exit, * * . fdvp, fvp, tdvp, and tvp (if nonnull) are unreferenced, and * . tdvp and tvp (if nonnull) are unlocked. */ int genfs_insane_rename(void *v, int (*sane_rename)(struct vnode *fdvp, struct componentname *fcnp, struct vnode *tdvp, struct componentname *tcnp, kauth_cred_t cred, bool posixly_correct)) { struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap = v; struct vnode *fdvp = ap->a_fdvp; struct vnode *fvp = ap->a_fvp; struct componentname *fcnp = ap->a_fcnp; struct vnode *tdvp = ap->a_tdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; kauth_cred_t cred; int error; KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(fcnp != NULL); KASSERT(fcnp->cn_nameptr != NULL); KASSERT(tdvp != NULL); KASSERT(tcnp != NULL); KASSERT(fcnp->cn_nameptr != NULL); /* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */ /* KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */ KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); cred = fcnp->cn_cred; /* * XXX Want a better equality test. `tcnp->cn_cred == cred' * hoses p2k because puffs transmits the creds separately and * allocates distinct but equivalent structures for them. */ KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred)); /* * Sanitize our world from the VFS insanity. Unlock the target * directory and node, which are locked. Release the children, * which are referenced, since we'll be looking them up again * later. 
*/ VOP_UNLOCK(tdvp); if ((tvp != NULL) && (tvp != tdvp)) VOP_UNLOCK(tvp); vrele(fvp); if (tvp != NULL) vrele(tvp); error = (*sane_rename)(fdvp, fcnp, tdvp, tcnp, cred, false); /* * All done, whether with success or failure. Release the * directory nodes now, as the caller expects from the VFS * protocol. */ vrele(fdvp); vrele(tdvp); return error; } /* * genfs_sane_rename: Generic implementation of the saner API for the * rename vop. Handles ancestry checks, locking, and permissions * checks. Caller is responsible for implementing the genfs rename * operations. * * fdvp and tdvp must be referenced and unlocked. */ int genfs_sane_rename(const struct genfs_rename_ops *ops, struct vnode *fdvp, struct componentname *fcnp, void *fde, struct vnode *tdvp, struct componentname *tcnp, void *tde, kauth_cred_t cred, bool posixly_correct) { struct mount *mp; struct vnode *fvp = NULL, *tvp = NULL; nlink_t tvp_new_nlink = 0; int error; KASSERT(ops != NULL); KASSERT(fdvp != NULL); KASSERT(fcnp != NULL); KASSERT(tdvp != NULL); KASSERT(tcnp != NULL); /* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */ /* KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */ KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == tdvp->v_mount); KASSERT(fcnp != tcnp); KASSERT(fcnp->cn_nameiop == DELETE); KASSERT(tcnp->cn_nameiop == RENAME); /* XXX Want a better equality test. */ KASSERT(kauth_cred_uidmatch(cred, fcnp->cn_cred)); KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred)); mp = fdvp->v_mount; KASSERT(mp != NULL); KASSERT(mp == tdvp->v_mount); /* XXX How can we be sure this stays true? */ KASSERT((mp->mnt_flag & MNT_RDONLY) == 0); /* Reject rename("x/..", ...) and rename(..., "x/..") early. */ if ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) return EINVAL; /* XXX EISDIR? */ error = genfs_rename_enter(ops, mp, cred, fdvp, fcnp, fde, &fvp, tdvp, tcnp, tde, &tvp); if (error) return error; /* * Check that everything is locked and looks right. */ KASSERT(fvp != NULL); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); /* * If the source and destination are the same object, we need * only at most delete the source entry. We are guaranteed at * this point that the entries are distinct. */ if (fvp == tvp) { KASSERT(tvp != NULL); if (fvp->v_type == VDIR) /* XXX This shouldn't be possible. */ error = EINVAL; else if (posixly_correct) /* POSIX sez to leave them alone. */ error = 0; else if ((fdvp == tdvp) && (fcnp->cn_namelen == tcnp->cn_namelen) && (memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen) == 0)) /* Renaming an entry over itself does nothing. */ error = 0; else { /* XXX Can't use VOP_REMOVE because of locking. */ error = genfs_rename_remove(ops, mp, cred, fdvp, fcnp, fde, fvp, &tvp_new_nlink); VN_KNOTE(fdvp, NOTE_WRITE); VN_KNOTE(fvp, tvp_new_nlink == 0 ? NOTE_DELETE : NOTE_LINK); } goto out; } KASSERT(fvp != tvp); KASSERT((fdvp != tdvp) || (fcnp->cn_namelen != tcnp->cn_namelen) || (memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen) != 0)); /* * If the target exists, refuse to rename a directory over a * non-directory or vice versa, or to clobber a non-empty * directory. */ if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type == VDIR) error = (ops->gro_directory_empty_p(mp, cred, tvp, tdvp)? 
0 : ENOTEMPTY); else if (fvp->v_type == VDIR && tvp->v_type != VDIR) error = ENOTDIR; else if (fvp->v_type != VDIR && tvp->v_type == VDIR) error = EISDIR; else error = 0; if (error) goto out; KASSERT((fvp->v_type == VDIR) == (tvp->v_type == VDIR)); } /* * Authorize the rename. */ error = ops->gro_rename_check_possible(mp, fdvp, fvp, tdvp, tvp); if (error) goto out; error = ops->gro_rename_check_permitted(mp, cred, fdvp, fvp, tdvp, tvp); error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, fvp, fdvp, error); error = kauth_authorize_vnode(cred, KAUTH_VNODE_RENAME, tvp, tdvp, error); if (error) goto out; /* * Everything is hunky-dory. Shuffle the directory entries. */ error = ops->gro_rename(mp, cred, fdvp, fcnp, fde, fvp, tdvp, tcnp, tde, tvp, &tvp_new_nlink); if (error) goto out; /* Success! */ genfs_rename_knote(fdvp, fvp, tdvp, tvp, tvp_new_nlink); out: genfs_rename_exit(ops, mp, fdvp, fvp, tdvp, tvp); return error; } /* * genfs_rename_knote: Note events about the various vnodes in a * rename. To be called by gro_rename on success. The only pair of * vnodes that may be identical is {fdvp, tdvp}. tvp_new_nlink is * the resulting link count of tvp. */ void genfs_rename_knote(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, nlink_t tvp_new_nlink) { long fdvp_events, tdvp_events; bool directory_p, reparent_p, replaced_p; KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); directory_p = (fvp->v_type == VDIR); reparent_p = (fdvp != tdvp); replaced_p = (tvp != NULL); KASSERT((tvp == NULL) || (directory_p == (tvp->v_type == VDIR))); fdvp_events = NOTE_WRITE; if (directory_p && reparent_p) fdvp_events |= NOTE_LINK; VN_KNOTE(fdvp, fdvp_events); VN_KNOTE(fvp, NOTE_RENAME); if (reparent_p) { tdvp_events = NOTE_WRITE; if (!replaced_p) { tdvp_events |= NOTE_EXTEND; if (directory_p) tdvp_events |= NOTE_LINK; } VN_KNOTE(tdvp, tdvp_events); } if (replaced_p) VN_KNOTE(tvp, (tvp_new_nlink == 0 ? NOTE_DELETE : NOTE_LINK)); } /* * genfs_rename_cache_purge: Purge the name cache. To be called by * gro_rename on success. The only pair of vnodes that may be * identical is {fdvp, tdvp}. */ void genfs_rename_cache_purge(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp) { KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); /* * XXX What actually needs to be purged? */ cache_purge(fdvp); if (fvp->v_type == VDIR) cache_purge(fvp); if (tdvp != fdvp) cache_purge(tdvp); if ((tvp != NULL) && (tvp->v_type == VDIR)) cache_purge(tvp); } /* * genfs_rename_enter: Look up fcnp in fdvp, and store the lookup * results in *fde_ret and the associated vnode in *fvp_ret; fail if * not found. Look up tcnp in tdvp, and store the lookup results in * *tde_ret and the associated vnode in *tvp_ret; store null instead if * not found. Fail if anything has been mounted on any of the nodes * involved. * * fdvp and tdvp must be referenced. * * On entry, nothing is locked. * * On success, everything is locked, and *fvp_ret, and *tvp_ret if * nonnull, are referenced. 
The only pairs of vnodes that may be * identical are {fdvp, tdvp} and {fvp, tvp}. * * On failure, everything remains as was. * * Locking everything including the source and target nodes is * necessary to make sure that, e.g., link count updates are OK. The * locking order is, in general, ancestor-first, matching the order you * need to use to look up a descendant anyway. */ static int genfs_rename_enter(const struct genfs_rename_ops *ops, struct mount *mp, kauth_cred_t cred, struct vnode *fdvp, struct componentname *fcnp, void *fde_ret, struct vnode **fvp_ret, struct vnode *tdvp, struct componentname *tcnp, void *tde_ret, struct vnode **tvp_ret) { int error; KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fcnp != NULL); KASSERT(fvp_ret != NULL); KASSERT(tdvp != NULL); KASSERT(tcnp != NULL); KASSERT(tvp_ret != NULL); KASSERT(fvp_ret != tvp_ret); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); if (fdvp == tdvp) error = genfs_rename_enter_common(ops, mp, cred, fdvp, fcnp, fde_ret, fvp_ret, tcnp, tde_ret, tvp_ret); else error = genfs_rename_enter_separate(ops, mp, cred, fdvp, fcnp, fde_ret, fvp_ret, tdvp, tcnp, tde_ret, tvp_ret); if (error) return error; KASSERT(*fvp_ret != NULL); KASSERT(VOP_ISLOCKED(*fvp_ret) == LK_EXCLUSIVE); KASSERT((*tvp_ret == NULL) || (VOP_ISLOCKED(*tvp_ret) == LK_EXCLUSIVE)); KASSERT(*fvp_ret != fdvp); KASSERT(*fvp_ret != tdvp); KASSERT(*tvp_ret != fdvp); KASSERT(*tvp_ret != tdvp); return 0; } /* * genfs_rename_enter_common: Lock and look up with a common * source/target directory. */ static int genfs_rename_enter_common(const struct genfs_rename_ops *ops, struct mount *mp, kauth_cred_t cred, struct vnode *dvp, struct componentname *fcnp, void *fde_ret, struct vnode **fvp_ret, struct componentname *tcnp, void *tde_ret, struct vnode **tvp_ret) { struct vnode *fvp, *tvp; int error; KASSERT(ops != NULL); KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(fcnp != NULL); KASSERT(fvp_ret != NULL); KASSERT(tcnp != NULL); KASSERT(tvp_ret != NULL); KASSERT(dvp->v_type == VDIR); KASSERT(dvp->v_mount == mp); error = ops->gro_lock_directory(mp, dvp); if (error) goto fail0; /* Did we lose a race with mount? */ if (dvp->v_mountedhere != NULL) { error = EBUSY; goto fail1; } KASSERT(fcnp->cn_nameiop == DELETE); error = ops->gro_lookup(mp, dvp, fcnp, fde_ret, &fvp); if (error) goto fail1; KASSERT(fvp != NULL); /* Refuse to rename `.'. */ if (fvp == dvp) { error = EINVAL; goto fail2; } KASSERT(fvp != dvp); KASSERT(tcnp->cn_nameiop == RENAME); error = ops->gro_lookup(mp, dvp, tcnp, tde_ret, &tvp); if (error == ENOENT) { tvp = NULL; } else if (error) { goto fail2; } else { KASSERT(tvp != NULL); /* Refuse to rename over `.'. */ if (tvp == dvp) { error = EISDIR; /* XXX EINVAL? */ goto fail2; } } KASSERT(tvp != dvp); /* * We've looked up both nodes. Now lock them and check them. */ vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY); KASSERT(fvp->v_mount == mp); /* Refuse to rename a mount point. */ if ((fvp->v_type == VDIR) && (fvp->v_mountedhere != NULL)) { error = EBUSY; goto fail3; } if ((tvp != NULL) && (tvp != fvp)) { vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); KASSERT(tvp->v_mount == mp); /* Refuse to rename over a mount point. 
*/ if ((tvp->v_type == VDIR) && (tvp->v_mountedhere != NULL)) { error = EBUSY; goto fail4; } } KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); *fvp_ret = fvp; *tvp_ret = tvp; return 0; fail4: if ((tvp != NULL) && (tvp != fvp)) VOP_UNLOCK(tvp); fail3: VOP_UNLOCK(fvp); if (tvp != NULL) vrele(tvp); fail2: vrele(fvp); fail1: VOP_UNLOCK(dvp); fail0: return error; } /* * genfs_rename_enter_separate: Lock and look up with separate source * and target directories. */ static int genfs_rename_enter_separate(const struct genfs_rename_ops *ops, struct mount *mp, kauth_cred_t cred, struct vnode *fdvp, struct componentname *fcnp, void *fde_ret, struct vnode **fvp_ret, struct vnode *tdvp, struct componentname *tcnp, void *tde_ret, struct vnode **tvp_ret) { struct vnode *intermediate_node; struct vnode *fvp, *tvp; int error; KASSERT(ops != NULL); KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fcnp != NULL); KASSERT(fvp_ret != NULL); KASSERT(tdvp != NULL); KASSERT(tcnp != NULL); KASSERT(tvp_ret != NULL); KASSERT(fdvp != tdvp); KASSERT(fcnp != tcnp); KASSERT(fcnp->cn_nameiop == DELETE); KASSERT(tcnp->cn_nameiop == RENAME); KASSERT(fvp_ret != tvp_ret); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); error = ops->gro_genealogy(mp, cred, fdvp, tdvp, &intermediate_node); if (error) return error; /* * intermediate_node == NULL means fdvp is not an ancestor of tdvp. */ if (intermediate_node == NULL) error = genfs_rename_lock(ops, mp, cred, ENOTEMPTY, EISDIR, EINVAL, tdvp, tcnp, true, tde_ret, &tvp, fdvp, fcnp, false, fde_ret, &fvp); else error = genfs_rename_lock(ops, mp, cred, EINVAL, EISDIR, EINVAL, fdvp, fcnp, false, fde_ret, &fvp, tdvp, tcnp, true, tde_ret, &tvp); if (error) goto out; KASSERT(fvp != NULL); /* * Reject rename("foo/bar", "foo/bar/baz/quux/zot"). */ if (fvp == intermediate_node) { genfs_rename_exit(ops, mp, fdvp, fvp, tdvp, tvp); error = EINVAL; goto out; } *fvp_ret = fvp; *tvp_ret = tvp; error = 0; out: if (intermediate_node != NULL) vrele(intermediate_node); return error; } /* * genfs_rename_lock: Lookup and lock it all. The lock order is: * * a_dvp -> a_vp -> b_dvp -> b_vp, * * except if a_vp is a nondirectory in which case the lock order is: * * a_dvp -> b_dvp -> b_vp -> a_vp, * * which can't violate ancestor->descendant because a_vp has no * descendants in this case. This edge case is necessary because some * file systems can only lookup/lock/unlock, and we can't hold a_vp * locked when we lookup/lock/unlock b_vp if they turn out to be the * same, and we can't find out that they're the same until after the * lookup. * * b_dvp must not be an ancestor of a_dvp, although a_dvp may be an * ancestor of b_dvp. * * Fail with overlap_error if node a is directory b. Neither * componentname may be `.' or `..'. * * a_dvp and b_dvp must be referenced. * * On entry, a_dvp and b_dvp are unlocked. * * On success, * . a_dvp and b_dvp are locked, * . *a_dirent_ret is filled with a directory entry whose node is * locked and referenced, * . *b_vp_ret is filled with the corresponding vnode, * . *b_dirent_ret is filled either with null or with a directory entry * whose node is locked and referenced, * . *b_vp is filled either with null or with the corresponding vnode, * and * . the only pair of vnodes that may be identical is a_vp and b_vp. 
* * On failure, a_dvp and b_dvp are left unlocked, and *a_dirent_ret, * *a_vp, *b_dirent_ret, and *b_vp are left alone. */ static int genfs_rename_lock(const struct genfs_rename_ops *ops, struct mount *mp, kauth_cred_t cred, int overlap_error, int a_dot_error, int b_dot_error, struct vnode *a_dvp, struct componentname *a_cnp, bool a_missing_ok, void *a_de_ret, struct vnode **a_vp_ret, struct vnode *b_dvp, struct componentname *b_cnp, bool b_missing_ok, void *b_de_ret, struct vnode **b_vp_ret) { struct vnode *a_vp, *b_vp; int error; KASSERT(ops != NULL); KASSERT(mp != NULL); KASSERT(a_dvp != NULL); KASSERT(a_cnp != NULL); KASSERT(a_vp_ret != NULL); KASSERT(b_dvp != NULL); KASSERT(b_cnp != NULL); KASSERT(b_vp_ret != NULL); KASSERT(a_dvp != b_dvp); KASSERT(a_vp_ret != b_vp_ret); KASSERT(a_dvp->v_type == VDIR); KASSERT(b_dvp->v_type == VDIR); KASSERT(a_dvp->v_mount == mp); KASSERT(b_dvp->v_mount == mp); KASSERT(a_missing_ok != b_missing_ok); /* * 1. Lock a_dvp. */ error = ops->gro_lock_directory(mp, a_dvp); if (error) goto fail0; /* Did we lose a race with mount? */ if (a_dvp->v_mountedhere != NULL) { error = EBUSY; goto fail1; } /* * 2. Lookup a_vp. May lock/unlock a_vp. */ error = ops->gro_lookup(mp, a_dvp, a_cnp, a_de_ret, &a_vp); if (error) { if (a_missing_ok && (error == ENOENT)) a_vp = NULL; else goto fail1; } else { KASSERT(a_vp != NULL); /* Refuse to rename (over) `.'. */ if (a_vp == a_dvp) { error = a_dot_error; goto fail2; } /* Reject rename("x", "x/y") or rename("x/y", "x"). */ if (a_vp == b_dvp) { error = overlap_error; goto fail2; } } KASSERT(a_vp != a_dvp); KASSERT(a_vp != b_dvp); /* * 3. Lock a_vp, if it is a directory. * * We already ruled out a_vp == a_dvp (i.e., a_cnp is `.'), so * this is not locking against self, and we already ruled out * a_vp == b_dvp, so this won't cause subsequent locking of * b_dvp to lock against self. * * If a_vp is a nondirectory, we can't hold it when we lookup * b_vp in case (a) the file system can only lookup/lock/unlock * and (b) b_vp turns out to be the same file as a_vp due to * hard links -- and we can't even detect that case until after * we've looked up b_vp. Fortunately, if a_vp is a * nondirectory, then it is a leaf, so we can safely lock it * last. */ if (a_vp != NULL && a_vp->v_type == VDIR) { vn_lock(a_vp, LK_EXCLUSIVE | LK_RETRY); KASSERT(a_vp->v_mount == mp); /* Refuse to rename (over) a mount point. */ if (a_vp->v_mountedhere != NULL) { error = EBUSY; goto fail3; } } /* * 4. Lock b_dvp. */ error = ops->gro_lock_directory(mp, b_dvp); if (error) goto fail3; /* Did we lose a race with mount? */ if (b_dvp->v_mountedhere != NULL) { error = EBUSY; goto fail4; } /* * 5. Lookup b_vp. May lock/unlock b_vp. */ error = ops->gro_lookup(mp, b_dvp, b_cnp, b_de_ret, &b_vp); if (error) { if (b_missing_ok && (error == ENOENT)) b_vp = NULL; else goto fail4; } else { KASSERT(b_vp != NULL); /* Refuse to rename (over) `.'. */ if (b_vp == b_dvp) { error = b_dot_error; goto fail5; } /* * b_dvp must not be an ancestor of a_dvp, so if we * find b_dvp/b_vp=a_dvp/a_vp something is wrong. */ if (b_vp == a_dvp) { /* * We have a directory hard link before us. * XXX What error should this return? EDEADLK? * Panic? */ error = EIO; goto fail5; } } KASSERT(b_vp != b_dvp); KASSERT(b_vp != a_dvp); /* * 6. Lock a_vp, if it is a nondirectory. * * In this case a_vp is a leaf, so it is either equal to or * incommensurate with b_vp, and so we can safely lock it at * any point now. 
*/ if (a_vp != NULL && a_vp->v_type != VDIR) { vn_lock(a_vp, LK_EXCLUSIVE | LK_RETRY); KASSERT(a_vp->v_mount == mp); /* (not a directory so can't have anything mounted here) */ } /* * 7. Lock b_vp, if it is not a_vp. * * b_vp and a_vp may the same inode if they are hard links to * one another. */ if ((b_vp != NULL) && (b_vp != a_vp)) { vn_lock(b_vp, LK_EXCLUSIVE | LK_RETRY); KASSERT(b_vp->v_mount == mp); /* Refuse to rename (over) a mount point. */ if ((b_vp->v_type == VDIR) && (b_vp->v_mountedhere != NULL)) { error = EBUSY; goto fail6; } } KASSERT(VOP_ISLOCKED(a_dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(b_dvp) == LK_EXCLUSIVE); KASSERT(a_missing_ok || (a_vp != NULL)); KASSERT(b_missing_ok || (b_vp != NULL)); KASSERT((a_vp == NULL) || (VOP_ISLOCKED(a_vp) == LK_EXCLUSIVE)); KASSERT((b_vp == NULL) || (VOP_ISLOCKED(b_vp) == LK_EXCLUSIVE)); *a_vp_ret = a_vp; *b_vp_ret = b_vp; return 0; fail6: if ((b_vp != NULL) && (b_vp != a_vp)) VOP_UNLOCK(b_vp); if (a_vp != NULL && a_vp->v_type != VDIR) VOP_UNLOCK(a_vp); fail5: if (b_vp != NULL) vrele(b_vp); fail4: VOP_UNLOCK(b_dvp); fail3: if (a_vp != NULL && a_vp->v_type == VDIR) VOP_UNLOCK(a_vp); fail2: if (a_vp != NULL) vrele(a_vp); fail1: VOP_UNLOCK(a_dvp); fail0: return error; } /* * genfs_rename_exit: Unlock everything we locked for rename. * * fdvp and tdvp must be referenced. * * On entry, everything is locked, and fvp and tvp referenced. * * On exit, everything is unlocked, and fvp and tvp are released. */ static void genfs_rename_exit(const struct genfs_rename_ops *ops, struct mount *mp, struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp) { (void)ops; KASSERT(ops != NULL); KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != tvp); KASSERT(tdvp != fvp); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); if ((tvp != NULL) && (tvp != fvp)) VOP_UNLOCK(tvp); VOP_UNLOCK(fvp); if (tvp != NULL) vrele(tvp); if (tdvp != fdvp) VOP_UNLOCK(tdvp); vrele(fvp); VOP_UNLOCK(fdvp); } /* * genfs_rename_remove: Remove the entry for the non-directory vp with * componentname cnp from the directory dvp, using the lookup results * de. It is the responsibility of gro_remove to purge the name cache. * * Everything must be locked and referenced. 
*/ static int genfs_rename_remove(const struct genfs_rename_ops *ops, struct mount *mp, kauth_cred_t cred, struct vnode *dvp, struct componentname *cnp, void *de, struct vnode *vp, nlink_t *tvp_nlinkp) { int error; KASSERT(ops != NULL); KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(cnp != NULL); KASSERT(vp != NULL); KASSERT(dvp != vp); KASSERT(dvp->v_type == VDIR); KASSERT(vp->v_type != VDIR); KASSERT(dvp->v_mount == mp); KASSERT(vp->v_mount == mp); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); error = ops->gro_remove_check_possible(mp, dvp, vp); if (error) return error; error = ops->gro_remove_check_permitted(mp, cred, dvp, vp); error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, vp, dvp, error); if (error) return error; error = ops->gro_remove(mp, cred, dvp, cnp, de, vp, tvp_nlinkp); if (error) return error; return 0; } static int genfs_ufslike_check_sticky(kauth_cred_t, mode_t, uid_t, struct vnode *, uid_t); /* * genfs_ufslike_rename_check_possible: Check whether a rename is * possible independent of credentials, assuming UFS-like inode flag * semantics. clobber_p is true iff the target node already exists. */ int genfs_ufslike_rename_check_possible( unsigned long fdflags, unsigned long fflags, unsigned long tdflags, unsigned long tflags, bool clobber_p, unsigned long immutable, unsigned long append) { if ((fdflags | fflags) & (immutable | append)) return EPERM; if (tdflags & (immutable | (clobber_p? append : 0))) return EPERM; if (clobber_p && (tflags & (immutable | append))) return EPERM; return 0; } /* * genfs_ufslike_rename_check_permitted: Check whether a rename is * permitted given our credentials, assuming UFS-like permission and * ownership semantics. * * The only pair of vnodes that may be identical is {fdvp, tdvp}. * * Everything must be locked and referenced. */ int genfs_ufslike_rename_check_permitted(kauth_cred_t cred, struct vnode *fdvp, mode_t fdmode, uid_t fduid, struct vnode *fvp, uid_t fuid, struct vnode *tdvp, mode_t tdmode, uid_t tduid, struct vnode *tvp, uid_t tuid) { int error; KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == fvp->v_mount); KASSERT(fdvp->v_mount == tdvp->v_mount); KASSERT((tvp == NULL) || (fdvp->v_mount == tvp->v_mount)); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); /* * We need to remove or change an entry in the source directory. */ error = VOP_ACCESS(fdvp, VWRITE, cred); if (error) return error; /* * If we are changing directories, then we need to write to the * target directory to add or change an entry. Also, if fvp is * a directory, we need to write to it to change its `..' * entry. */ if (fdvp != tdvp) { error = VOP_ACCESS(tdvp, VWRITE, cred); if (error) return error; if (fvp->v_type == VDIR) { error = VOP_ACCESS(fvp, VWRITE, cred); if (error) return error; } } error = genfs_ufslike_check_sticky(cred, fdmode, fduid, fvp, fuid); if (error) return error; error = genfs_ufslike_check_sticky(cred, tdmode, tduid, tvp, tuid); if (error) return error; return 0; } /* * genfs_ufslike_remove_check_possible: Check whether a remove is * possible independent of credentials, assuming UFS-like inode flag * semantics. 
*/ int genfs_ufslike_remove_check_possible(unsigned long dflags, unsigned long flags, unsigned long immutable, unsigned long append) { /* * We want to delete the entry. If the directory is immutable, * we can't write to it to delete the entry. If the directory * is append-only, the only change we can make is to add * entries, so we can't delete entries. If the node is * immutable, we can't change the links to it, so we can't * delete the entry. If the node is append-only...well, this * is what UFS does. */ if ((dflags | flags) & (immutable | append)) return EPERM; return 0; } /* * genfs_ufslike_remove_check_permitted: Check whether a remove is * permitted given our credentials, assuming UFS-like permission and * ownership semantics. * * Everything must be locked and referenced. */ int genfs_ufslike_remove_check_permitted(kauth_cred_t cred, struct vnode *dvp, mode_t dmode, uid_t duid, struct vnode *vp, uid_t uid) { int error; KASSERT(dvp != NULL); KASSERT(vp != NULL); KASSERT(dvp != vp); KASSERT(dvp->v_type == VDIR); KASSERT(vp->v_type != VDIR); KASSERT(dvp->v_mount == vp->v_mount); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); /* * We need to write to the directory to remove from it. */ error = VOP_ACCESS(dvp, VWRITE, cred); if (error) return error; error = genfs_ufslike_check_sticky(cred, dmode, duid, vp, uid); if (error) return error; return 0; } /* * genfs_ufslike_check_sticky: Check whether a party with credentials * cred may change an entry in a sticky directory, assuming UFS-like * permission, ownership, and stickiness semantics: If the directory is * sticky and the entry exists, the user must own either the directory * or the entry's node in order to change the entry. * * Everything must be locked and referenced. */ int genfs_ufslike_check_sticky(kauth_cred_t cred, mode_t dmode, uid_t duid, struct vnode *vp, uid_t uid) { if ((dmode & S_ISTXT) && (vp != NULL)) return genfs_can_sticky(vp, cred, duid, uid); return 0; }
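/*
 * Illustrative sketch (not part of genfs_rename.c): the sticky-directory
 * rule that genfs_ufslike_check_sticky() implements.  An existing entry
 * in a sticky (S_ISTXT) directory may only be renamed or removed by a
 * caller owning either the directory or the entry's node.  The helper
 * name and the explicit cred_uid parameter are hypothetical; the real
 * code hands the vnode and credentials to genfs_can_sticky() rather than
 * comparing uids directly.
 */
#if 0
static int
sticky_check_sketch(uid_t cred_uid, mode_t dmode, uid_t duid,
    bool entry_exists, uid_t uid)
{
	/* No sticky bit, or no existing target entry: nothing to enforce. */
	if ((dmode & S_ISTXT) == 0 || !entry_exists)
		return 0;
	/* The owner of the directory or of the node may change the entry. */
	if (cred_uid == duid || cred_uid == uid)
		return 0;
	return EPERM;
}
#endif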
/* $NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $ */ /* * Copyright (c) 2006, 2010, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * uvm_object.c: operate with memory objects * * TODO: * 1. Support PG_RELEASED-using objects */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $"); #ifdef _KERNEL_OPT #include "opt_ddb.h" #endif #include <sys/param.h> #include <sys/rwlock.h> #include <sys/queue.h> #include <uvm/uvm.h> #include <uvm/uvm_ddb.h> #include <uvm/uvm_page_array.h> /* Page count to fetch per single step.
*/ #define FETCH_PAGECOUNT 16 /* * uvm_obj_init: initialize UVM memory object. */ void uvm_obj_init(struct uvm_object *uo, const struct uvm_pagerops *ops, bool alock, u_int refs) { #if 0 /* notyet */ KASSERT(ops); #endif if (alock) { /* Allocate and assign a lock. */ uo->vmobjlock = rw_obj_alloc(); } else { /* The lock will need to be set via uvm_obj_setlock(). */ uo->vmobjlock = NULL; } uo->pgops = ops; LIST_INIT(&uo->uo_ubc); uo->uo_npages = 0; uo->uo_refs = refs; radix_tree_init_tree(&uo->uo_pages); } /* * uvm_obj_destroy: destroy UVM memory object. */ void uvm_obj_destroy(struct uvm_object *uo, bool dlock) { KASSERT(radix_tree_empty_tree_p(&uo->uo_pages)); /* Purge any UBC entries associated with this object. */ ubc_purge(uo); /* Destroy the lock, if requested. */ if (dlock) { rw_obj_free(uo->vmobjlock); } radix_tree_fini_tree(&uo->uo_pages); } /* * uvm_obj_setlock: assign a vmobjlock to the UVM object. * * => Caller is responsible to ensure that UVM objects is not use. * => Only dynamic lock may be previously set. We drop the reference then. */ void uvm_obj_setlock(struct uvm_object *uo, krwlock_t *lockptr) { krwlock_t *olockptr = uo->vmobjlock; if (olockptr) { /* Drop the reference on the old lock. */ rw_obj_free(olockptr); } if (lockptr == NULL) { /* If new lock is not passed - allocate default one. */ lockptr = rw_obj_alloc(); } uo->vmobjlock = lockptr; } /* * uvm_obj_wirepages: wire the pages of entire UVM object. * * => NOTE: this function should only be used for types of objects * where PG_RELEASED flag is never set (aobj objects) * => caller must pass page-aligned start and end values */ int uvm_obj_wirepages(struct uvm_object *uobj, off_t start, off_t end, struct pglist *list) { int i, npages, error; struct vm_page *pgs[FETCH_PAGECOUNT], *pg = NULL; off_t offset = start, left; left = (end - start) >> PAGE_SHIFT; rw_enter(uobj->vmobjlock, RW_WRITER); while (left) { npages = MIN(FETCH_PAGECOUNT, left); /* Get the pages */ memset(pgs, 0, sizeof(pgs)); error = (*uobj->pgops->pgo_get)(uobj, offset, pgs, &npages, 0, VM_PROT_READ | VM_PROT_WRITE, UVM_ADV_SEQUENTIAL, PGO_SYNCIO); if (error) goto error; rw_enter(uobj->vmobjlock, RW_WRITER); for (i = 0; i < npages; i++) { KASSERT(pgs[i] != NULL); KASSERT(!(pgs[i]->flags & PG_RELEASED)); /* * Loan break */ if (pgs[i]->loan_count) { while (pgs[i]->loan_count) { pg = uvm_loanbreak(pgs[i]); if (!pg) { rw_exit(uobj->vmobjlock); uvm_wait("uobjwirepg"); rw_enter(uobj->vmobjlock, RW_WRITER); continue; } } pgs[i] = pg; } if (pgs[i]->flags & PG_AOBJ) { uvm_pagemarkdirty(pgs[i], UVM_PAGE_STATUS_DIRTY); uao_dropswap(uobj, i); } } /* Wire the pages */ for (i = 0; i < npages; i++) { uvm_pagelock(pgs[i]); uvm_pagewire(pgs[i]); uvm_pageunlock(pgs[i]); if (list != NULL) TAILQ_INSERT_TAIL(list, pgs[i], pageq.queue); } /* Unbusy the pages */ uvm_page_unbusy(pgs, npages); left -= npages; offset += npages << PAGE_SHIFT; } rw_exit(uobj->vmobjlock); return 0; error: /* Unwire the pages which has been wired */ uvm_obj_unwirepages(uobj, start, offset); return error; } /* * uvm_obj_unwirepages: unwire the pages of entire UVM object. 
* * => NOTE: this function should only be used for types of objects * where PG_RELEASED flag is never set * => caller must pass page-aligned start and end values */ void uvm_obj_unwirepages(struct uvm_object *uobj, off_t start, off_t end) { struct vm_page *pg; off_t offset; rw_enter(uobj->vmobjlock, RW_WRITER); for (offset = start; offset < end; offset += PAGE_SIZE) { pg = uvm_pagelookup(uobj, offset); KASSERT(pg != NULL); KASSERT(!(pg->flags & PG_RELEASED)); uvm_pagelock(pg); uvm_pageunwire(pg); uvm_pageunlock(pg); } rw_exit(uobj->vmobjlock); } static inline bool uvm_obj_notag_p(struct uvm_object *uobj, int tag) { KASSERT(rw_lock_held(uobj->vmobjlock)); return radix_tree_empty_tagged_tree_p(&uobj->uo_pages, tag); } bool uvm_obj_clean_p(struct uvm_object *uobj) { return uvm_obj_notag_p(uobj, UVM_PAGE_DIRTY_TAG); } bool uvm_obj_nowriteback_p(struct uvm_object *uobj) { return uvm_obj_notag_p(uobj, UVM_PAGE_WRITEBACK_TAG); } static inline bool uvm_obj_page_tag_p(struct vm_page *pg, int tag) { struct uvm_object *uobj = pg->uobject; uint64_t pgidx = pg->offset >> PAGE_SHIFT; KASSERT(uobj != NULL); KASSERT(rw_lock_held(uobj->vmobjlock)); return radix_tree_get_tag(&uobj->uo_pages, pgidx, tag) != 0; } static inline void uvm_obj_page_set_tag(struct vm_page *pg, int tag) { struct uvm_object *uobj = pg->uobject; uint64_t pgidx = pg->offset >> PAGE_SHIFT; KASSERT(uobj != NULL); KASSERT(rw_write_held(uobj->vmobjlock)); radix_tree_set_tag(&uobj->uo_pages, pgidx, tag); } static inline void uvm_obj_page_clear_tag(struct vm_page *pg, int tag) { struct uvm_object *uobj = pg->uobject; uint64_t pgidx = pg->offset >> PAGE_SHIFT; KASSERT(uobj != NULL); KASSERT(rw_write_held(uobj->vmobjlock)); radix_tree_clear_tag(&uobj->uo_pages, pgidx, tag); } bool uvm_obj_page_dirty_p(struct vm_page *pg) { return uvm_obj_page_tag_p(pg, UVM_PAGE_DIRTY_TAG); } void uvm_obj_page_set_dirty(struct vm_page *pg) { uvm_obj_page_set_tag(pg, UVM_PAGE_DIRTY_TAG); } void uvm_obj_page_clear_dirty(struct vm_page *pg) { uvm_obj_page_clear_tag(pg, UVM_PAGE_DIRTY_TAG); } bool uvm_obj_page_writeback_p(struct vm_page *pg) { return uvm_obj_page_tag_p(pg, UVM_PAGE_WRITEBACK_TAG); } void uvm_obj_page_set_writeback(struct vm_page *pg) { uvm_obj_page_set_tag(pg, UVM_PAGE_WRITEBACK_TAG); } void uvm_obj_page_clear_writeback(struct vm_page *pg) { uvm_obj_page_clear_tag(pg, UVM_PAGE_WRITEBACK_TAG); } #if defined(DDB) || defined(DEBUGPRINT) /* * uvm_object_printit: actually prints the object */ void uvm_object_printit(struct uvm_object *uobj, bool full, void (*pr)(const char *, ...)) { struct uvm_page_array a; struct vm_page *pg; int cnt = 0; voff_t off; (*pr)("OBJECT %p: locked=%d, pgops=%p, npages=%d, ", uobj, rw_write_held(uobj->vmobjlock), uobj->pgops, uobj->uo_npages); if (UVM_OBJ_IS_KERN_OBJECT(uobj)) (*pr)("refs=<SYSTEM>\n"); else (*pr)("refs=%d\n", uobj->uo_refs); if (!full) { return; } (*pr)(" PAGES <pg,offset>:\n "); uvm_page_array_init(&a, uobj, 0); off = 0; while ((pg = uvm_page_array_fill_and_peek(&a, off, 0)) != NULL) { cnt++; (*pr)("<%p,0x%llx> ", pg, (long long)pg->offset); if ((cnt % 3) == 0) { (*pr)("\n "); } off = pg->offset + PAGE_SIZE; uvm_page_array_advance(&a); } if ((cnt % 3) != 0) { (*pr)("\n"); } uvm_page_array_fini(&a); } #endif /* DDB || DEBUGPRINT */
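/*
 * Illustrative sketch (not part of uvm_object.c above): one way a caller
 * might let two UVM objects share a single vmobjlock, using the
 * uvm_obj_init()/uvm_obj_setlock() contract documented above.  The
 * example_share_lock() name and the "master"/"slave" pairing are
 * assumptions made for the example, and rw_obj_hold() is assumed to be
 * the usual way to take an extra reference on a lock object before
 * handing it to a second owner.
 */
static void
example_share_lock(struct uvm_object *master, struct uvm_object *slave,
    const struct uvm_pagerops *ops)
{

	/* Initialize the second object without a lock of its own. */
	uvm_obj_init(slave, ops, false, 1);

	/*
	 * Take an extra reference on the master's lock object and install
	 * it, so both objects are serialized by the same krwlock and each
	 * owner's eventual rw_obj_free() just drops one reference.
	 */
	rw_obj_hold(master->vmobjlock);
	uvm_obj_setlock(slave, master->vmobjlock);
	KASSERT(slave->vmobjlock == master->vmobjlock);
}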
/* $NetBSD: ufs_bswap.h,v 1.23 2018/04/19 21:50:10 christos Exp $ */ /* * Copyright (c) 1998 Manuel Bouyer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #ifndef _UFS_UFS_BSWAP_H_ #define _UFS_UFS_BSWAP_H_ #if defined(_KERNEL_OPT) #include "opt_ffs.h" #endif #include <sys/bswap.h> /* Macros to access UFS flags */ #ifdef FFS_EI #define UFS_MPNEEDSWAP(ump) ((ump)->um_flags & UFS_NEEDSWAP) #define UFS_FSNEEDSWAP(fs) ((fs)->fs_flags & FS_SWAPPED) #define UFS_IPNEEDSWAP(ip) UFS_MPNEEDSWAP((ip)->i_ump) #else #define UFS_MPNEEDSWAP(ump) ((void)(ump), 0) #define UFS_FSNEEDSWAP(fs) ((void)(fs), 0) #define UFS_IPNEEDSWAP(ip) ((void)(ip), 0) #endif #if (!defined(_KERNEL) && !defined(NO_FFS_EI)) || defined(FFS_EI) /* inlines for access to swapped data */ static __inline u_int16_t ufs_rw16(uint16_t a, int ns) { return ((ns) ? bswap16(a) : (a)); } static __inline u_int32_t ufs_rw32(uint32_t a, int ns) { return ((ns) ? bswap32(a) : (a)); } static __inline u_int64_t ufs_rw64(uint64_t a, int ns) { return ((ns) ? bswap64(a) : (a)); } #else static __inline u_int16_t ufs_rw16(uint16_t a, int ns) { return a; } static __inline u_int32_t ufs_rw32(uint32_t a, int ns) { return a; } static __inline u_int64_t ufs_rw64(uint64_t a, int ns) { return a; } #endif #define ufs_add16(a, b, ns) \ (a) = ufs_rw16(ufs_rw16((a), (ns)) + (b), (ns)) #define ufs_add32(a, b, ns) \ (a) = ufs_rw32(ufs_rw32((a), (ns)) + (b), (ns)) #define ufs_add64(a, b, ns) \ (a) = ufs_rw64(ufs_rw64((a), (ns)) + (b), (ns)) #endif /* !_UFS_UFS_BSWAP_H_ */
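/*
 * Illustrative sketch (not part of ufs_bswap.h above): reading and
 * updating a 32-bit on-disk field with the ufs_rw32()/ufs_add32()
 * helpers defined in this header.  The example_* function names are
 * invented for the example; in real code `needswap' would come from
 * UFS_MPNEEDSWAP(ump) or UFS_FSNEEDSWAP(fs).
 */
static __inline uint32_t
example_read32(const uint32_t *ondisk, int needswap)
{

	/* Convert one on-disk word to host byte order. */
	return ufs_rw32(*ondisk, needswap);
}

static __inline void
example_bump32(uint32_t *ondisk, int needswap)
{

	/*
	 * ufs_add32 reads the field, swaps it to host order, adds 1,
	 * swaps back, and stores it, so the field stays in on-disk
	 * byte order throughout.
	 */
	ufs_add32(*ondisk, 1, needswap);
}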
/* $NetBSD: kern_entropy.c,v 1.66 2023/10/04 20:28:06 ad Exp $ */ /*- * Copyright (c) 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Entropy subsystem * * * Each CPU maintains a per-CPU entropy pool so that gathering * entropy requires no interprocessor synchronization, except * early at boot when we may be scrambling to gather entropy as * soon as possible. * * - entropy_enter gathers entropy and never drops it on the * floor, at the cost of sometimes having to do cryptography. * * - entropy_enter_intr gathers entropy or drops it on the * floor, with low latency. Work to stir the pool or kick the * housekeeping thread is scheduled in soft interrupts. * * * entropy_enter immediately enters into the global pool if it * can transition to full entropy in one swell foop. Otherwise, * it defers to a housekeeping thread that consolidates entropy, * but only when the CPUs collectively have full entropy, in * order to mitigate iterative-guessing attacks. * * * The entropy housekeeping thread continues to consolidate * entropy even after we think we have full entropy, in case we * are wrong, but is limited to one discretionary consolidation * per minute, and only when new entropy is actually coming in, * to limit performance impact. * * * The entropy epoch is the number that changes when we * transition from partial entropy to full entropy, so that * users can easily determine when to reseed. This also * facilitates an operator explicitly causing everything to * reseed by sysctl -w kern.entropy.consolidate=1. * * * Entropy depletion is available for testing (or if you're into * that sort of thing), with sysctl -w kern.entropy.depletion=1; * the logic to support it is small, to minimize chance of bugs. * * * While cold, a single global entropy pool is available for * entering and extracting, serialized through splhigh/splx. * The per-CPU entropy pool data structures are initialized in * entropy_init and entropy_init_late (separated mainly for * hysterical raisins at this point), but are not used until the * system is warm, at which point access to the global entropy * pool is limited to thread and softint context and serialized * by E->lock. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_entropy.c,v 1.66 2023/10/04 20:28:06 ad Exp $"); #include <sys/param.h> #include <sys/types.h> #include <sys/atomic.h> #include <sys/compat_stub.h> #include <sys/condvar.h> #include <sys/cpu.h> #include <sys/entropy.h> #include <sys/errno.h> #include <sys/evcnt.h> #include <sys/event.h> #include <sys/file.h> #include <sys/intr.h> #include <sys/kauth.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/kthread.h> #include <sys/lwp.h> #include <sys/module_hook.h> #include <sys/mutex.h> #include <sys/percpu.h> #include <sys/poll.h> #include <sys/proc.h> #include <sys/queue.h> #include <sys/reboot.h> #include <sys/rnd.h> /* legacy kernel API */ #include <sys/rndio.h> /* userland ioctl interface */ #include <sys/rndsource.h> /* kernel rndsource driver API */ #include <sys/select.h> #include <sys/selinfo.h> #include <sys/sha1.h> /* for boot seed checksum */ #include <sys/stdint.h> #include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/xcall.h> #include <lib/libkern/entpool.h> #include <machine/limits.h> #ifdef __HAVE_CPU_COUNTER #include <machine/cpu_counter.h> #endif #define MINENTROPYBYTES ENTROPY_CAPACITY #define MINENTROPYBITS (MINENTROPYBYTES*NBBY) #define MINSAMPLES (2*MINENTROPYBITS) /* * struct entropy_cpu * * Per-CPU entropy state. The pool is allocated separately * because percpu(9) sometimes moves per-CPU objects around * without zeroing them, which would lead to unwanted copies of * sensitive secrets. The evcnt is allocated separately because * evcnt(9) assumes it stays put in memory. */ struct entropy_cpu { struct entropy_cpu_evcnt { struct evcnt softint; struct evcnt intrdrop; struct evcnt intrtrunc; } *ec_evcnt; struct entpool *ec_pool; unsigned ec_bitspending; unsigned ec_samplespending; bool ec_locked; }; /* * struct entropy_cpu_lock * * State for locking the per-CPU entropy state. */ struct entropy_cpu_lock { int ecl_s; long ecl_pctr; }; /* * struct rndsource_cpu * * Per-CPU rndsource state. */ struct rndsource_cpu { unsigned rc_entropybits; unsigned rc_timesamples; unsigned rc_datasamples; rnd_delta_t rc_timedelta; }; /* * entropy_global (a.k.a. E for short in this file) * * Global entropy state. Writes protected by the global lock. * Some fields, marked (A), can be read outside the lock, and are * maintained with atomic_load/store_relaxed. */ struct { kmutex_t lock; /* covers all global state */ struct entpool pool; /* global pool for extraction */ unsigned bitsneeded; /* (A) needed globally */ unsigned bitspending; /* pending in per-CPU pools */ unsigned samplesneeded; /* (A) needed globally */ unsigned samplespending; /* pending in per-CPU pools */ unsigned timestamp; /* (A) time of last consolidation */ unsigned epoch; /* (A) changes when needed -> 0 */ kcondvar_t cv; /* notifies state changes */ struct selinfo selq; /* notifies needed -> 0 */ struct lwp *sourcelock; /* lock on list of sources */ kcondvar_t sourcelock_cv; /* notifies sourcelock release */ LIST_HEAD(,krndsource) sources; /* list of entropy sources */ bool consolidate; /* kick thread to consolidate */ bool seed_rndsource; /* true if seed source is attached */ bool seeded; /* true if seed file already loaded */ } entropy_global __cacheline_aligned = { /* Fields that must be initialized when the kernel is loaded. 
*/ .bitsneeded = MINENTROPYBITS, .samplesneeded = MINSAMPLES, .epoch = (unsigned)-1, /* -1 means entropy never consolidated */ .sources = LIST_HEAD_INITIALIZER(entropy_global.sources), }; #define E (&entropy_global) /* declutter */ /* Read-mostly globals */ static struct percpu *entropy_percpu __read_mostly; /* struct entropy_cpu */ static void *entropy_sih __read_mostly; /* softint handler */ static struct lwp *entropy_lwp __read_mostly; /* housekeeping thread */ static struct krndsource seed_rndsource __read_mostly; /* * Event counters * * Must be careful with adding these because they can serve as * side channels. */ static struct evcnt entropy_discretionary_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "discretionary"); EVCNT_ATTACH_STATIC(entropy_discretionary_evcnt); static struct evcnt entropy_immediate_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "immediate"); EVCNT_ATTACH_STATIC(entropy_immediate_evcnt); static struct evcnt entropy_partial_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "partial"); EVCNT_ATTACH_STATIC(entropy_partial_evcnt); static struct evcnt entropy_consolidate_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "consolidate"); EVCNT_ATTACH_STATIC(entropy_consolidate_evcnt); static struct evcnt entropy_extract_fail_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "extract fail"); EVCNT_ATTACH_STATIC(entropy_extract_fail_evcnt); static struct evcnt entropy_request_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "request"); EVCNT_ATTACH_STATIC(entropy_request_evcnt); static struct evcnt entropy_deplete_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "deplete"); EVCNT_ATTACH_STATIC(entropy_deplete_evcnt); static struct evcnt entropy_notify_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "notify"); EVCNT_ATTACH_STATIC(entropy_notify_evcnt); /* Sysctl knobs */ static bool entropy_collection = 1; static bool entropy_depletion = 0; /* Silly! 
*/ static const struct sysctlnode *entropy_sysctlroot; static struct sysctllog *entropy_sysctllog; /* Forward declarations */ static void entropy_init_cpu(void *, void *, struct cpu_info *); static void entropy_fini_cpu(void *, void *, struct cpu_info *); static void entropy_account_cpu(struct entropy_cpu *); static void entropy_enter(const void *, size_t, unsigned, bool); static bool entropy_enter_intr(const void *, size_t, unsigned, bool); static void entropy_softintr(void *); static void entropy_thread(void *); static bool entropy_pending(void); static void entropy_pending_cpu(void *, void *, struct cpu_info *); static void entropy_do_consolidate(void); static void entropy_consolidate_xc(void *, void *); static void entropy_notify(void); static int sysctl_entropy_consolidate(SYSCTLFN_ARGS); static int sysctl_entropy_gather(SYSCTLFN_ARGS); static void filt_entropy_read_detach(struct knote *); static int filt_entropy_read_event(struct knote *, long); static int entropy_request(size_t, int); static void rnd_add_data_internal(struct krndsource *, const void *, uint32_t, uint32_t, bool); static void rnd_add_data_1(struct krndsource *, const void *, uint32_t, uint32_t, bool, uint32_t, bool); static unsigned rndsource_entropybits(struct krndsource *); static void rndsource_entropybits_cpu(void *, void *, struct cpu_info *); static void rndsource_to_user(struct krndsource *, rndsource_t *); static void rndsource_to_user_est(struct krndsource *, rndsource_est_t *); static void rndsource_to_user_est_cpu(void *, void *, struct cpu_info *); /* * entropy_timer() * * Cycle counter, time counter, or anything that changes a wee bit * unpredictably. */ static inline uint32_t entropy_timer(void) { struct bintime bt; uint32_t v; /* If we have a CPU cycle counter, use the low 32 bits. */ #ifdef __HAVE_CPU_COUNTER if (__predict_true(cpu_hascounter())) return cpu_counter32(); #endif /* __HAVE_CPU_COUNTER */ /* If we're cold, tough. Can't binuptime while cold. */ if (__predict_false(cold)) return 0; /* Fold the 128 bits of binuptime into 32 bits. */ binuptime(&bt); v = bt.frac; v ^= bt.frac >> 32; v ^= bt.sec; v ^= bt.sec >> 32; return v; } static void attach_seed_rndsource(void) { KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); KASSERT(cold); /* * First called no later than entropy_init, while we are still * single-threaded, so no need for RUN_ONCE. */ if (E->seed_rndsource) return; rnd_attach_source(&seed_rndsource, "seed", RND_TYPE_UNKNOWN, RND_FLAG_COLLECT_VALUE); E->seed_rndsource = true; } /* * entropy_init() * * Initialize the entropy subsystem. Panic on failure. * * Requires percpu(9) and sysctl(9) to be initialized. Must run * while cold. */ static void entropy_init(void) { uint32_t extra[2]; struct krndsource *rs; unsigned i = 0; KASSERT(cold); /* Grab some cycle counts early at boot. */ extra[i++] = entropy_timer(); /* Run the entropy pool cryptography self-test. */ if (entpool_selftest() == -1) panic("entropy pool crypto self-test failed"); /* Create the sysctl directory. */ sysctl_createv(&entropy_sysctllog, 0, NULL, &entropy_sysctlroot, CTLFLAG_PERMANENT, CTLTYPE_NODE, "entropy", SYSCTL_DESCR("Entropy (random number sources) options"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); /* Create the sysctl knobs. */ /* XXX These shouldn't be writable at securelevel>0. 
*/ sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "collection", SYSCTL_DESCR("Automatically collect entropy from hardware"), NULL, 0, &entropy_collection, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "depletion", SYSCTL_DESCR("`Deplete' entropy pool when observed"), NULL, 0, &entropy_depletion, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "consolidate", SYSCTL_DESCR("Trigger entropy consolidation now"), sysctl_entropy_consolidate, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "gather", SYSCTL_DESCR("Trigger entropy gathering from sources now"), sysctl_entropy_gather, 0, NULL, 0, CTL_CREATE, CTL_EOL); /* XXX These should maybe not be readable at securelevel>0. */ sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT, "needed", SYSCTL_DESCR("Systemwide entropy deficit (bits of entropy)"), NULL, 0, &E->bitsneeded, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT, "pending", SYSCTL_DESCR("Number of bits of entropy pending on CPUs"), NULL, 0, &E->bitspending, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT, "samplesneeded", SYSCTL_DESCR("Systemwide entropy deficit (samples)"), NULL, 0, &E->samplesneeded, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT, "samplespending", SYSCTL_DESCR("Number of samples pending on CPUs"), NULL, 0, &E->samplespending, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT, "epoch", SYSCTL_DESCR("Entropy epoch"), NULL, 0, &E->epoch, 0, CTL_CREATE, CTL_EOL); /* Initialize the global state for multithreaded operation. */ mutex_init(&E->lock, MUTEX_DEFAULT, IPL_SOFTSERIAL); cv_init(&E->cv, "entropy"); selinit(&E->selq); cv_init(&E->sourcelock_cv, "entsrclock"); /* Make sure the seed source is attached. */ attach_seed_rndsource(); /* Note if the bootloader didn't provide a seed. */ if (!E->seeded) aprint_debug("entropy: no seed from bootloader\n"); /* Allocate the per-CPU records for all early entropy sources. */ LIST_FOREACH(rs, &E->sources, list) rs->state = percpu_alloc(sizeof(struct rndsource_cpu)); /* Allocate and initialize the per-CPU state. */ entropy_percpu = percpu_create(sizeof(struct entropy_cpu), entropy_init_cpu, entropy_fini_cpu, NULL); /* Enter the boot cycle count to get started. */ extra[i++] = entropy_timer(); KASSERT(i == __arraycount(extra)); entropy_enter(extra, sizeof extra, /*nbits*/0, /*count*/false); explicit_memset(extra, 0, sizeof extra); } /* * entropy_init_late() * * Late initialization. Panic on failure. * * Requires CPUs to have been detected and LWPs to have started. * Must run while cold. */ static void entropy_init_late(void) { int error; KASSERT(cold); /* * Establish the softint at the highest softint priority level. * Must happen after CPU detection. 
*/ entropy_sih = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE, &entropy_softintr, NULL); if (entropy_sih == NULL) panic("unable to establish entropy softint"); /* * Create the entropy housekeeping thread. Must happen after * lwpinit. */ error = kthread_create(PRI_NONE, KTHREAD_MPSAFE|KTHREAD_TS, NULL, entropy_thread, NULL, &entropy_lwp, "entbutler"); if (error) panic("unable to create entropy housekeeping thread: %d", error); } /* * entropy_init_cpu(ptr, cookie, ci) * * percpu(9) constructor for per-CPU entropy pool. */ static void entropy_init_cpu(void *ptr, void *cookie, struct cpu_info *ci) { struct entropy_cpu *ec = ptr; const char *cpuname; ec->ec_evcnt = kmem_alloc(sizeof(*ec->ec_evcnt), KM_SLEEP); ec->ec_pool = kmem_zalloc(sizeof(*ec->ec_pool), KM_SLEEP); ec->ec_bitspending = 0; ec->ec_samplespending = 0; ec->ec_locked = false; /* XXX ci_cpuname may not be initialized early enough. */ cpuname = ci->ci_cpuname[0] == '\0' ? "cpu0" : ci->ci_cpuname; evcnt_attach_dynamic(&ec->ec_evcnt->softint, EVCNT_TYPE_MISC, NULL, cpuname, "entropy softint"); evcnt_attach_dynamic(&ec->ec_evcnt->intrdrop, EVCNT_TYPE_MISC, NULL, cpuname, "entropy intrdrop"); evcnt_attach_dynamic(&ec->ec_evcnt->intrtrunc, EVCNT_TYPE_MISC, NULL, cpuname, "entropy intrtrunc"); } /* * entropy_fini_cpu(ptr, cookie, ci) * * percpu(9) destructor for per-CPU entropy pool. */ static void entropy_fini_cpu(void *ptr, void *cookie, struct cpu_info *ci) { struct entropy_cpu *ec = ptr; /* * Zero any lingering data. Disclosure of the per-CPU pool * shouldn't retroactively affect the security of any keys * generated, because entpool(9) erases whatever we have just * drawn out of any pool, but better safe than sorry. */ explicit_memset(ec->ec_pool, 0, sizeof(*ec->ec_pool)); evcnt_detach(&ec->ec_evcnt->intrtrunc); evcnt_detach(&ec->ec_evcnt->intrdrop); evcnt_detach(&ec->ec_evcnt->softint); kmem_free(ec->ec_pool, sizeof(*ec->ec_pool)); kmem_free(ec->ec_evcnt, sizeof(*ec->ec_evcnt)); } /* * ec = entropy_cpu_get(&lock) * entropy_cpu_put(&lock, ec) * * Lock and unlock the per-CPU entropy state. This only prevents * access on the same CPU -- by hard interrupts, by soft * interrupts, or by other threads. * * Blocks soft interrupts and preemption altogether; doesn't block * hard interrupts, but causes samples in hard interrupts to be * dropped. */ static struct entropy_cpu * entropy_cpu_get(struct entropy_cpu_lock *lock) { struct entropy_cpu *ec; ec = percpu_getref(entropy_percpu); lock->ecl_s = splsoftserial(); KASSERT(!ec->ec_locked); ec->ec_locked = true; lock->ecl_pctr = lwp_pctr(); __insn_barrier(); return ec; } static void entropy_cpu_put(struct entropy_cpu_lock *lock, struct entropy_cpu *ec) { KASSERT(ec == percpu_getptr_remote(entropy_percpu, curcpu())); KASSERT(ec->ec_locked); __insn_barrier(); KASSERT(lock->ecl_pctr == lwp_pctr()); ec->ec_locked = false; splx(lock->ecl_s); percpu_putref(entropy_percpu); } /* * entropy_seed(seed) * * Seed the entropy pool with seed. Meant to be called as early * as possible by the bootloader; may be called before or after * entropy_init. Must be called before system reaches userland. * Must be called in thread or soft interrupt context, not in hard * interrupt context. Must be called at most once. * * Overwrites the seed in place. Caller may then free the memory. */ static void entropy_seed(rndsave_t *seed) { SHA1_CTX ctx; uint8_t digest[SHA1_DIGEST_LENGTH]; bool seeded; KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); KASSERT(cold); /* * Verify the checksum. 
If the checksum fails, take the data * but ignore the entropy estimate -- the file may have been * incompletely written with garbage, which is harmless to add * but may not be as unpredictable as alleged. */ SHA1Init(&ctx); SHA1Update(&ctx, (const void *)&seed->entropy, sizeof(seed->entropy)); SHA1Update(&ctx, seed->data, sizeof(seed->data)); SHA1Final(digest, &ctx); CTASSERT(sizeof(seed->digest) == sizeof(digest)); if (!consttime_memequal(digest, seed->digest, sizeof(digest))) { printf("entropy: invalid seed checksum\n"); seed->entropy = 0; } explicit_memset(&ctx, 0, sizeof ctx); explicit_memset(digest, 0, sizeof digest); /* * If the entropy is insensibly large, try byte-swapping. * Otherwise assume the file is corrupted and act as though it * has zero entropy. */ if (howmany(seed->entropy, NBBY) > sizeof(seed->data)) { seed->entropy = bswap32(seed->entropy); if (howmany(seed->entropy, NBBY) > sizeof(seed->data)) seed->entropy = 0; } /* Make sure the seed source is attached. */ attach_seed_rndsource(); /* Test and set E->seeded. */ seeded = E->seeded; E->seeded = (seed->entropy > 0); /* * If we've been seeded, may be re-entering the same seed * (e.g., bootloader vs module init, or something). No harm in * entering it twice, but it contributes no additional entropy. */ if (seeded) { printf("entropy: double-seeded by bootloader\n"); seed->entropy = 0; } else { printf("entropy: entering seed from bootloader" " with %u bits of entropy\n", (unsigned)seed->entropy); } /* Enter it into the pool and promptly zero it. */ rnd_add_data(&seed_rndsource, seed->data, sizeof(seed->data), seed->entropy); explicit_memset(seed, 0, sizeof(*seed)); } /* * entropy_bootrequest() * * Request entropy from all sources at boot, once config is * complete and interrupts are running but we are still cold. */ void entropy_bootrequest(void) { int error; KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); KASSERT(cold); /* * Request enough to satisfy the maximum entropy shortage. * This is harmless overkill if the bootloader provided a seed. */ error = entropy_request(MINENTROPYBYTES, ENTROPY_WAIT); KASSERTMSG(error == 0, "error=%d", error); } /* * entropy_epoch() * * Returns the current entropy epoch. If this changes, you should * reseed. If -1, means system entropy has not yet reached full * entropy or been explicitly consolidated; never reverts back to * -1. Never zero, so you can always use zero as an uninitialized * sentinel value meaning `reseed ASAP'. * * Usage model: * * struct foo { * struct crypto_prng prng; * unsigned epoch; * } *foo; * * unsigned epoch = entropy_epoch(); * if (__predict_false(epoch != foo->epoch)) { * uint8_t seed[32]; * if (entropy_extract(seed, sizeof seed, 0) != 0) * warn("no entropy"); * crypto_prng_reseed(&foo->prng, seed, sizeof seed); * foo->epoch = epoch; * } */ unsigned entropy_epoch(void) { /* * Unsigned int, so no need for seqlock for an atomic read, but * make sure we read it afresh each time. */ return atomic_load_relaxed(&E->epoch); } /* * entropy_ready() * * True if the entropy pool has full entropy. */ bool entropy_ready(void) { return atomic_load_relaxed(&E->bitsneeded) == 0; } /* * entropy_account_cpu(ec) * * Consider whether to consolidate entropy into the global pool * after we just added some into the current CPU's pending pool. * * - If this CPU can provide enough entropy now, do so. * * - If this and whatever else is available on other CPUs can * provide enough entropy, kick the consolidation thread. 
* * - Otherwise, do as little as possible, except maybe consolidate * entropy at most once a minute. * * Caller must be bound to a CPU and therefore have exclusive * access to ec. Will acquire and release the global lock. */ static void entropy_account_cpu(struct entropy_cpu *ec) { struct entropy_cpu_lock lock; struct entropy_cpu *ec0; unsigned bitsdiff, samplesdiff; KASSERT(!cpu_intr_p()); KASSERT(!cold); KASSERT(curlwp->l_pflag & LP_BOUND); /* * If there's no entropy needed, and entropy has been * consolidated in the last minute, do nothing. */ if (__predict_true(atomic_load_relaxed(&E->bitsneeded) == 0) && __predict_true(!atomic_load_relaxed(&entropy_depletion)) && __predict_true((time_uptime - E->timestamp) <= 60)) return; /* * Consider consolidation, under the global lock and with the * per-CPU state locked. */ mutex_enter(&E->lock); ec0 = entropy_cpu_get(&lock); KASSERT(ec0 == ec); if (ec->ec_bitspending == 0 && ec->ec_samplespending == 0) { /* Raced with consolidation xcall. Nothing to do. */ } else if (E->bitsneeded != 0 && E->bitsneeded <= ec->ec_bitspending) { /* * If we have not yet attained full entropy but we can * now, do so. This way we disseminate entropy * promptly when it becomes available early at boot; * otherwise we leave it to the entropy consolidation * thread, which is rate-limited to mitigate side * channels and abuse. */ uint8_t buf[ENTPOOL_CAPACITY]; /* Transfer from the local pool to the global pool. */ entpool_extract(ec->ec_pool, buf, sizeof buf); entpool_enter(&E->pool, buf, sizeof buf); atomic_store_relaxed(&ec->ec_bitspending, 0); atomic_store_relaxed(&ec->ec_samplespending, 0); atomic_store_relaxed(&E->bitsneeded, 0); atomic_store_relaxed(&E->samplesneeded, 0); /* Notify waiters that we now have full entropy. */ entropy_notify(); entropy_immediate_evcnt.ev_count++; } else { /* Determine how much we can add to the global pool. */ KASSERTMSG(E->bitspending <= MINENTROPYBITS, "E->bitspending=%u", E->bitspending); bitsdiff = MIN(ec->ec_bitspending, MINENTROPYBITS - E->bitspending); KASSERTMSG(E->samplespending <= MINSAMPLES, "E->samplespending=%u", E->samplespending); samplesdiff = MIN(ec->ec_samplespending, MINSAMPLES - E->samplespending); /* * This should make a difference unless we are already * saturated. */ KASSERTMSG((bitsdiff || samplesdiff || E->bitspending == MINENTROPYBITS || E->samplespending == MINSAMPLES), "bitsdiff=%u E->bitspending=%u ec->ec_bitspending=%u" "samplesdiff=%u E->samplespending=%u" " ec->ec_samplespending=%u" " minentropybits=%u minsamples=%u", bitsdiff, E->bitspending, ec->ec_bitspending, samplesdiff, E->samplespending, ec->ec_samplespending, (unsigned)MINENTROPYBITS, (unsigned)MINSAMPLES); /* Add to the global, subtract from the local. */ E->bitspending += bitsdiff; KASSERTMSG(E->bitspending <= MINENTROPYBITS, "E->bitspending=%u", E->bitspending); atomic_store_relaxed(&ec->ec_bitspending, ec->ec_bitspending - bitsdiff); E->samplespending += samplesdiff; KASSERTMSG(E->samplespending <= MINSAMPLES, "E->samplespending=%u", E->samplespending); atomic_store_relaxed(&ec->ec_samplespending, ec->ec_samplespending - samplesdiff); /* One or the other must have gone up from zero. */ KASSERT(E->bitspending || E->samplespending); if (E->bitsneeded <= E->bitspending || E->samplesneeded <= E->samplespending) { /* * Enough bits or at least samples between all * the per-CPU pools. Leave a note for the * housekeeping thread to consolidate entropy * next time it wakes up -- and wake it up if * this is the first time, to speed things up. 
* * If we don't need any entropy, this doesn't * mean much, but it is the only time we ever * gather additional entropy in case the * accounting has been overly optimistic. This * happens at most once a minute, so there's * negligible performance cost. */ E->consolidate = true; if (E->epoch == (unsigned)-1) cv_broadcast(&E->cv); if (E->bitsneeded == 0) entropy_discretionary_evcnt.ev_count++; } else { /* Can't get full entropy. Keep gathering. */ entropy_partial_evcnt.ev_count++; } } entropy_cpu_put(&lock, ec); mutex_exit(&E->lock); } /* * entropy_enter_early(buf, len, nbits) * * Do entropy bookkeeping globally, before we have established * per-CPU pools. Enter directly into the global pool in the hope * that we enter enough before the first entropy_extract to thwart * iterative-guessing attacks; entropy_extract will warn if not. */ static void entropy_enter_early(const void *buf, size_t len, unsigned nbits) { bool notify = false; int s; KASSERT(cold); /* * We're early at boot before multithreading and multi-CPU * operation, and we don't have softints yet to defer * processing from interrupt context, so we have to enter the * samples directly into the global pool. But interrupts may * be enabled, and we enter this path from interrupt context, * so block interrupts until we're done. */ s = splhigh(); /* Enter it into the pool. */ entpool_enter(&E->pool, buf, len); /* * Decide whether to notify reseed -- we will do so if either: * (a) we transition from partial entropy to full entropy, or * (b) we get a batch of full entropy all at once. * We don't count timing samples because we assume, while cold, * there's not likely to be much jitter yet. */ notify |= (E->bitsneeded && E->bitsneeded <= nbits); notify |= (nbits >= MINENTROPYBITS); /* * Subtract from the needed count and notify if appropriate. * We don't count samples here because entropy_timer might * still be returning zero at this point if there's no CPU * cycle counter. */ E->bitsneeded -= MIN(E->bitsneeded, nbits); if (notify) { entropy_notify(); entropy_immediate_evcnt.ev_count++; } splx(s); } /* * entropy_enter(buf, len, nbits, count) * * Enter len bytes of data from buf into the system's entropy * pool, stirring as necessary when the internal buffer fills up. * nbits is a lower bound on the number of bits of entropy in the * process that led to this sample. */ static void entropy_enter(const void *buf, size_t len, unsigned nbits, bool count) { struct entropy_cpu_lock lock; struct entropy_cpu *ec; unsigned bitspending, samplespending; int bound; KASSERTMSG(!cpu_intr_p(), "use entropy_enter_intr from interrupt context"); KASSERTMSG(howmany(nbits, NBBY) <= len, "impossible entropy rate: %u bits in %zu-byte string", nbits, len); /* * If we're still cold, just use entropy_enter_early to put * samples directly into the global pool. */ if (__predict_false(cold)) { entropy_enter_early(buf, len, nbits); return; } /* * Bind ourselves to the current CPU so we don't switch CPUs * between entering data into the current CPU's pool (and * updating the pending count) and transferring it to the * global pool in entropy_account_cpu. */ bound = curlwp_bind(); /* * With the per-CPU state locked, enter into the per-CPU pool * and count up what we can add. * * We don't count samples while cold because entropy_timer * might still be returning zero if there's no CPU cycle * counter. 
*/ ec = entropy_cpu_get(&lock); entpool_enter(ec->ec_pool, buf, len); bitspending = ec->ec_bitspending; bitspending += MIN(MINENTROPYBITS - bitspending, nbits); atomic_store_relaxed(&ec->ec_bitspending, bitspending); samplespending = ec->ec_samplespending; if (__predict_true(count)) { samplespending += MIN(MINSAMPLES - samplespending, 1); atomic_store_relaxed(&ec->ec_samplespending, samplespending); } entropy_cpu_put(&lock, ec); /* Consolidate globally if appropriate based on what we added. */ if (bitspending > 0 || samplespending >= MINSAMPLES) entropy_account_cpu(ec); curlwp_bindx(bound); } /* * entropy_enter_intr(buf, len, nbits, count) * * Enter up to len bytes of data from buf into the system's * entropy pool without stirring. nbits is a lower bound on the * number of bits of entropy in the process that led to this * sample. If the sample could be entered completely, assume * nbits of entropy pending; otherwise assume none, since we don't * know whether some parts of the sample are constant, for * instance. Schedule a softint to stir the entropy pool if * needed. Return true if used fully, false if truncated at all. * * Using this in thread or softint context with no spin locks held * will work, but you might as well use entropy_enter in that * case. */ static bool entropy_enter_intr(const void *buf, size_t len, unsigned nbits, bool count) { struct entropy_cpu *ec; bool fullyused = false; uint32_t bitspending, samplespending; int s; KASSERTMSG(howmany(nbits, NBBY) <= len, "impossible entropy rate: %u bits in %zu-byte string", nbits, len); /* * If we're still cold, just use entropy_enter_early to put * samples directly into the global pool. */ if (__predict_false(cold)) { entropy_enter_early(buf, len, nbits); return true; } /* * In case we were called in thread or interrupt context with * interrupts unblocked, block soft interrupts up to * IPL_SOFTSERIAL. This way logic that is safe in interrupt * context or under a spin lock is also safe in less * restrictive contexts. */ s = splsoftserial(); /* * Acquire the per-CPU state. If someone is in the middle of * using it, drop the sample. Otherwise, take the lock so that * higher-priority interrupts will drop their samples. */ ec = percpu_getref(entropy_percpu); if (ec->ec_locked) { ec->ec_evcnt->intrdrop.ev_count++; goto out0; } ec->ec_locked = true; __insn_barrier(); /* * Enter as much as we can into the per-CPU pool. If it was * truncated, schedule a softint to stir the pool and stop. */ if (!entpool_enter_nostir(ec->ec_pool, buf, len)) { if (__predict_true(!cold)) softint_schedule(entropy_sih); ec->ec_evcnt->intrtrunc.ev_count++; goto out1; } fullyused = true; /* * Count up what we can contribute. * * We don't count samples while cold because entropy_timer * might still be returning zero if there's no CPU cycle * counter. */ bitspending = ec->ec_bitspending; bitspending += MIN(MINENTROPYBITS - bitspending, nbits); atomic_store_relaxed(&ec->ec_bitspending, bitspending); if (__predict_true(count)) { samplespending = ec->ec_samplespending; samplespending += MIN(MINSAMPLES - samplespending, 1); atomic_store_relaxed(&ec->ec_samplespending, samplespending); } /* Schedule a softint if we added anything and it matters. */ if (__predict_false(atomic_load_relaxed(&E->bitsneeded) || atomic_load_relaxed(&entropy_depletion)) && (nbits != 0 || count) && __predict_true(!cold)) softint_schedule(entropy_sih); out1: /* Release the per-CPU state. 
*/ KASSERT(ec->ec_locked); __insn_barrier(); ec->ec_locked = false; out0: percpu_putref(entropy_percpu); splx(s); return fullyused; } /* * entropy_softintr(cookie) * * Soft interrupt handler for entering entropy. Takes care of * stirring the local CPU's entropy pool if it filled up during * hard interrupts, and promptly crediting entropy from the local * CPU's entropy pool to the global entropy pool if needed. */ static void entropy_softintr(void *cookie) { struct entropy_cpu_lock lock; struct entropy_cpu *ec; unsigned bitspending, samplespending; /* * With the per-CPU state locked, stir the pool if necessary * and determine if there's any pending entropy on this CPU to * account globally. */ ec = entropy_cpu_get(&lock); ec->ec_evcnt->softint.ev_count++; entpool_stir(ec->ec_pool); bitspending = ec->ec_bitspending; samplespending = ec->ec_samplespending; entropy_cpu_put(&lock, ec); /* Consolidate globally if appropriate based on what we added. */ if (bitspending > 0 || samplespending >= MINSAMPLES) entropy_account_cpu(ec); } /* * entropy_thread(cookie) * * Handle any asynchronous entropy housekeeping. */ static void entropy_thread(void *cookie) { bool consolidate; #ifndef _RUMPKERNEL /* XXX rump starts threads before cold */ KASSERT(!cold); #endif for (;;) { /* * Wait until there's full entropy somewhere among the * CPUs, as confirmed at most once per minute, or * someone wants to consolidate. */ if (entropy_pending()) { consolidate = true; } else { mutex_enter(&E->lock); if (!E->consolidate) cv_timedwait(&E->cv, &E->lock, 60*hz); consolidate = E->consolidate; E->consolidate = false; mutex_exit(&E->lock); } if (consolidate) { /* Do it. */ entropy_do_consolidate(); /* Mitigate abuse. */ kpause("entropy", false, hz, NULL); } } } struct entropy_pending_count { uint32_t bitspending; uint32_t samplespending; }; /* * entropy_pending() * * True if enough bits or samples are pending on other CPUs to * warrant consolidation. */ static bool entropy_pending(void) { struct entropy_pending_count count = { 0, 0 }, *C = &count; percpu_foreach(entropy_percpu, &entropy_pending_cpu, C); return C->bitspending >= MINENTROPYBITS || C->samplespending >= MINSAMPLES; } static void entropy_pending_cpu(void *ptr, void *cookie, struct cpu_info *ci) { struct entropy_cpu *ec = ptr; struct entropy_pending_count *C = cookie; uint32_t cpu_bitspending; uint32_t cpu_samplespending; cpu_bitspending = atomic_load_relaxed(&ec->ec_bitspending); cpu_samplespending = atomic_load_relaxed(&ec->ec_samplespending); C->bitspending += MIN(MINENTROPYBITS - C->bitspending, cpu_bitspending); C->samplespending += MIN(MINSAMPLES - C->samplespending, cpu_samplespending); } /* * entropy_do_consolidate() * * Issue a cross-call to gather entropy on all CPUs and advance * the entropy epoch. */ static void entropy_do_consolidate(void) { static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0}; static struct timeval lasttime; /* serialized by E->lock */ struct entpool pool; uint8_t buf[ENTPOOL_CAPACITY]; unsigned bitsdiff, samplesdiff; uint64_t ticket; KASSERT(!cold); ASSERT_SLEEPABLE(); /* Gather entropy on all CPUs into a temporary pool. */ memset(&pool, 0, sizeof pool); ticket = xc_broadcast(0, &entropy_consolidate_xc, &pool, NULL); xc_wait(ticket); /* Acquire the lock to notify waiters. */ mutex_enter(&E->lock); /* Count another consolidation. */ entropy_consolidate_evcnt.ev_count++; /* Note when we last consolidated, i.e. now. */ E->timestamp = time_uptime; /* Mix what we gathered into the global pool. 
*/ entpool_extract(&pool, buf, sizeof buf); entpool_enter(&E->pool, buf, sizeof buf); explicit_memset(&pool, 0, sizeof pool); /* Count the entropy that was gathered. */ bitsdiff = MIN(E->bitsneeded, E->bitspending); atomic_store_relaxed(&E->bitsneeded, E->bitsneeded - bitsdiff); E->bitspending -= bitsdiff; if (__predict_false(E->bitsneeded > 0) && bitsdiff != 0) { if ((boothowto & AB_DEBUG) != 0 && ratecheck(&lasttime, &interval)) { printf("WARNING:" " consolidating less than full entropy\n"); } } samplesdiff = MIN(E->samplesneeded, E->samplespending); atomic_store_relaxed(&E->samplesneeded, E->samplesneeded - samplesdiff); E->samplespending -= samplesdiff; /* Advance the epoch and notify waiters. */ entropy_notify(); /* Release the lock. */ mutex_exit(&E->lock); } /* * entropy_consolidate_xc(vpool, arg2) * * Extract output from the local CPU's input pool and enter it * into a temporary pool passed as vpool. */ static void entropy_consolidate_xc(void *vpool, void *arg2 __unused) { struct entpool *pool = vpool; struct entropy_cpu_lock lock; struct entropy_cpu *ec; uint8_t buf[ENTPOOL_CAPACITY]; uint32_t extra[7]; unsigned i = 0; /* Grab CPU number and cycle counter to mix extra into the pool. */ extra[i++] = cpu_number(); extra[i++] = entropy_timer(); /* * With the per-CPU state locked, extract from the per-CPU pool * and count it as no longer pending. */ ec = entropy_cpu_get(&lock); extra[i++] = entropy_timer(); entpool_extract(ec->ec_pool, buf, sizeof buf); atomic_store_relaxed(&ec->ec_bitspending, 0); atomic_store_relaxed(&ec->ec_samplespending, 0); extra[i++] = entropy_timer(); entropy_cpu_put(&lock, ec); extra[i++] = entropy_timer(); /* * Copy over statistics, and enter the per-CPU extract and the * extra timing into the temporary pool, under the global lock. */ mutex_enter(&E->lock); extra[i++] = entropy_timer(); entpool_enter(pool, buf, sizeof buf); explicit_memset(buf, 0, sizeof buf); extra[i++] = entropy_timer(); KASSERT(i == __arraycount(extra)); entpool_enter(pool, extra, sizeof extra); explicit_memset(extra, 0, sizeof extra); mutex_exit(&E->lock); } /* * entropy_notify() * * Caller just contributed entropy to the global pool. Advance * the entropy epoch and notify waiters. * * Caller must hold the global entropy lock. */ static void entropy_notify(void) { static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0}; static struct timeval lasttime; /* serialized by E->lock */ static bool ready = false, besteffort = false; unsigned epoch; KASSERT(__predict_false(cold) || mutex_owned(&E->lock)); /* * If this is the first time, print a message to the console * that we're ready so operators can compare it to the timing * of other events. * * If we didn't get full entropy from reliable sources, report * instead that we are running on fumes with best effort. (If * we ever do get full entropy after that, print the ready * message once.) */ if (__predict_false(!ready)) { if (E->bitsneeded == 0) { printf("entropy: ready\n"); ready = true; } else if (E->samplesneeded == 0 && !besteffort) { printf("entropy: best effort\n"); besteffort = true; } } /* Set the epoch; roll over from UINTMAX-1 to 1. */ if (__predict_true(!atomic_load_relaxed(&entropy_depletion)) || ratecheck(&lasttime, &interval)) { epoch = E->epoch + 1; if (epoch == 0 || epoch == (unsigned)-1) epoch = 1; atomic_store_relaxed(&E->epoch, epoch); } KASSERT(E->epoch != (unsigned)-1); /* Notify waiters. 
*/ if (__predict_true(!cold)) { cv_broadcast(&E->cv); selnotify(&E->selq, POLLIN|POLLRDNORM, NOTE_SUBMIT); } /* Count another notification. */ entropy_notify_evcnt.ev_count++; } /* * entropy_consolidate() * * Trigger entropy consolidation and wait for it to complete. * * This should be used sparingly, not periodically -- requiring * conscious intervention by the operator or a clear policy * decision. Otherwise, the kernel will automatically consolidate * when enough entropy has been gathered into per-CPU pools to * transition to full entropy. */ void entropy_consolidate(void) { uint64_t ticket; int error; KASSERT(!cold); ASSERT_SLEEPABLE(); mutex_enter(&E->lock); ticket = entropy_consolidate_evcnt.ev_count; E->consolidate = true; cv_broadcast(&E->cv); while (ticket == entropy_consolidate_evcnt.ev_count) { error = cv_wait_sig(&E->cv, &E->lock); if (error) break; } mutex_exit(&E->lock); } /* * sysctl -w kern.entropy.consolidate=1 * * Trigger entropy consolidation and wait for it to complete. * Writable only by superuser. This, writing to /dev/random, and * ioctl(RNDADDDATA) are the only ways for the system to * consolidate entropy if the operator knows something the kernel * doesn't about how unpredictable the pending entropy pools are. */ static int sysctl_entropy_consolidate(SYSCTLFN_ARGS) { struct sysctlnode node = *rnode; int arg = 0; int error; node.sysctl_data = &arg; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (arg) entropy_consolidate(); return error; } /* * sysctl -w kern.entropy.gather=1 * * Trigger gathering entropy from all on-demand sources, and wait * for synchronous sources (but not asynchronous sources) to * complete. Writable only by superuser. */ static int sysctl_entropy_gather(SYSCTLFN_ARGS) { struct sysctlnode node = *rnode; int arg = 0; int error; node.sysctl_data = &arg; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (arg) { mutex_enter(&E->lock); error = entropy_request(ENTROPY_CAPACITY, ENTROPY_WAIT|ENTROPY_SIG); mutex_exit(&E->lock); } return 0; } /* * entropy_extract(buf, len, flags) * * Extract len bytes from the global entropy pool into buf. * * Caller MUST NOT expose these bytes directly -- must use them * ONLY to seed a cryptographic pseudorandom number generator * (`CPRNG'), a.k.a. deterministic random bit generator (`DRBG'), * and then erase them. entropy_extract does not, on its own, * provide backtracking resistance -- it must be combined with a * PRNG/DRBG that does. * * This may be used very early at boot, before even entropy_init * has been called. * * You generally shouldn't use this directly -- use cprng(9) * instead. * * Flags may have: * * ENTROPY_WAIT Wait for entropy if not available yet. * ENTROPY_SIG Allow interruption by a signal during wait. * ENTROPY_HARDFAIL Either fill the buffer with full entropy, * or fail without filling it at all. * * Return zero on success, or error on failure: * * EWOULDBLOCK No entropy and ENTROPY_WAIT not set. * EINTR/ERESTART No entropy, ENTROPY_SIG set, and interrupted. * * If ENTROPY_WAIT is set, allowed only in thread context. If * ENTROPY_WAIT is not set, allowed also in softint context -- may * sleep on an adaptive lock up to IPL_SOFTSERIAL. Forbidden in * hard interrupt context. 
*/ int entropy_extract(void *buf, size_t len, int flags) { static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0}; static struct timeval lasttime; /* serialized by E->lock */ bool printed = false; int s = -1/*XXXGCC*/, error; if (ISSET(flags, ENTROPY_WAIT)) { ASSERT_SLEEPABLE(); KASSERT(!cold); } /* Refuse to operate in interrupt context. */ KASSERT(!cpu_intr_p()); /* * If we're cold, we are only contending with interrupts on the * current CPU, so block them. Otherwise, we are _not_ * contending with interrupts on the current CPU, but we are * contending with other threads, to exclude them with a mutex. */ if (__predict_false(cold)) s = splhigh(); else mutex_enter(&E->lock); /* Wait until there is enough entropy in the system. */ error = 0; if (E->bitsneeded > 0 && E->samplesneeded == 0) { /* * We don't have full entropy from reliable sources, * but we gathered a plausible number of samples from * other sources such as timers. Try asking for more * from any sources we can, but don't worry if it * fails -- best effort. */ (void)entropy_request(ENTROPY_CAPACITY, flags); } else while (E->bitsneeded > 0 && E->samplesneeded > 0) { /* Ask for more, synchronously if possible. */ error = entropy_request(len, flags); if (error) break; /* If we got enough, we're done. */ if (E->bitsneeded == 0 || E->samplesneeded == 0) { KASSERT(error == 0); break; } /* If not waiting, stop here. */ if (!ISSET(flags, ENTROPY_WAIT)) { error = EWOULDBLOCK; break; } /* Wait for some entropy to come in and try again. */ KASSERT(!cold); if (!printed) { printf("entropy: pid %d (%s) waiting for entropy(7)\n", curproc->p_pid, curproc->p_comm); printed = true; } if (ISSET(flags, ENTROPY_SIG)) { error = cv_timedwait_sig(&E->cv, &E->lock, hz); if (error && error != EWOULDBLOCK) break; } else { cv_timedwait(&E->cv, &E->lock, hz); } } /* * Count failure -- but fill the buffer nevertheless, unless * the caller specified ENTROPY_HARDFAIL. */ if (error) { if (ISSET(flags, ENTROPY_HARDFAIL)) goto out; entropy_extract_fail_evcnt.ev_count++; } /* * Report a warning if we haven't yet reached full entropy. * This is the only case where we consider entropy to be * `depleted' without kern.entropy.depletion enabled -- when we * only have partial entropy, an adversary may be able to * narrow the state of the pool down to a small number of * possibilities; the output then enables them to confirm a * guess, reducing its entropy from the adversary's perspective * to zero. * * This should only happen if the operator has chosen to * consolidate, either through sysctl kern.entropy.consolidate * or by writing less than full entropy to /dev/random as root * (which /dev/random promises will immediately affect * subsequent output, for better or worse). */ if (E->bitsneeded > 0 && E->samplesneeded > 0) { if (__predict_false(E->epoch == (unsigned)-1) && ratecheck(&lasttime, &interval)) { printf("WARNING:" " system needs entropy for security;" " see entropy(7)\n"); } atomic_store_relaxed(&E->bitsneeded, MINENTROPYBITS); atomic_store_relaxed(&E->samplesneeded, MINSAMPLES); } /* Extract data from the pool, and `deplete' if we're doing that. 
*/ entpool_extract(&E->pool, buf, len); if (__predict_false(atomic_load_relaxed(&entropy_depletion)) && error == 0) { unsigned cost = MIN(len, ENTROPY_CAPACITY)*NBBY; unsigned bitsneeded = E->bitsneeded; unsigned samplesneeded = E->samplesneeded; bitsneeded += MIN(MINENTROPYBITS - bitsneeded, cost); samplesneeded += MIN(MINSAMPLES - samplesneeded, cost); atomic_store_relaxed(&E->bitsneeded, bitsneeded); atomic_store_relaxed(&E->samplesneeded, samplesneeded); entropy_deplete_evcnt.ev_count++; } out: /* Release the global lock and return the error. */ if (__predict_false(cold)) splx(s); else mutex_exit(&E->lock); return error; } /* * entropy_poll(events) * * Return the subset of events ready, and if it is not all of * events, record curlwp as waiting for entropy. */ int entropy_poll(int events) { int revents = 0; KASSERT(!cold); /* Always ready for writing. */ revents |= events & (POLLOUT|POLLWRNORM); /* Narrow it down to reads. */ events &= POLLIN|POLLRDNORM; if (events == 0) return revents; /* * If we have reached full entropy and we're not depleting * entropy, we are forever ready. */ if (__predict_true(atomic_load_relaxed(&E->bitsneeded) == 0 || atomic_load_relaxed(&E->samplesneeded) == 0) && __predict_true(!atomic_load_relaxed(&entropy_depletion))) return revents | events; /* * Otherwise, check whether we need entropy under the lock. If * we don't, we're ready; if we do, add ourselves to the queue. */ mutex_enter(&E->lock); if (E->bitsneeded == 0 || E->samplesneeded == 0) revents |= events; else selrecord(curlwp, &E->selq); mutex_exit(&E->lock); return revents; } /* * filt_entropy_read_detach(kn) * * struct filterops::f_detach callback for entropy read events: * remove kn from the list of waiters. */ static void filt_entropy_read_detach(struct knote *kn) { KASSERT(!cold); mutex_enter(&E->lock); selremove_knote(&E->selq, kn); mutex_exit(&E->lock); } /* * filt_entropy_read_event(kn, hint) * * struct filterops::f_event callback for entropy read events: * poll for entropy. Caller must hold the global entropy lock if * hint is NOTE_SUBMIT, and must not if hint is not NOTE_SUBMIT. */ static int filt_entropy_read_event(struct knote *kn, long hint) { int ret; KASSERT(!cold); /* Acquire the lock, if caller is outside entropy subsystem. */ if (hint == NOTE_SUBMIT) KASSERT(mutex_owned(&E->lock)); else mutex_enter(&E->lock); /* * If we still need entropy, can't read anything; if not, can * read arbitrarily much. */ if (E->bitsneeded != 0 && E->samplesneeded != 0) { ret = 0; } else { if (atomic_load_relaxed(&entropy_depletion)) kn->kn_data = ENTROPY_CAPACITY; /* bytes */ else kn->kn_data = MIN(INT64_MAX, SSIZE_MAX); ret = 1; } /* Release the lock, if caller is outside entropy subsystem. */ if (hint == NOTE_SUBMIT) KASSERT(mutex_owned(&E->lock)); else mutex_exit(&E->lock); return ret; } /* XXX Makes sense only for /dev/u?random. */ static const struct filterops entropy_read_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_entropy_read_detach, .f_event = filt_entropy_read_event, }; /* * entropy_kqfilter(kn) * * Register kn to receive entropy event notifications. May be * EVFILT_READ or EVFILT_WRITE; anything else yields EINVAL. */ int entropy_kqfilter(struct knote *kn) { KASSERT(!cold); switch (kn->kn_filter) { case EVFILT_READ: /* Enter into the global select queue. */ mutex_enter(&E->lock); kn->kn_fop = &entropy_read_filtops; selrecord_knote(&E->selq, kn); mutex_exit(&E->lock); return 0; case EVFILT_WRITE: /* Can always dump entropy into the system. 
*/ kn->kn_fop = &seltrue_filtops; return 0; default: return EINVAL; } } /* * rndsource_setcb(rs, get, getarg) * * Set the request callback for the entropy source rs, if it can * provide entropy on demand. Must precede rnd_attach_source. */ void rndsource_setcb(struct krndsource *rs, void (*get)(size_t, void *), void *getarg) { rs->get = get; rs->getarg = getarg; } /* * rnd_attach_source(rs, name, type, flags) * * Attach the entropy source rs. Must be done after * rndsource_setcb, if any, and before any calls to rnd_add_data. */ void rnd_attach_source(struct krndsource *rs, const char *name, uint32_t type, uint32_t flags) { uint32_t extra[4]; unsigned i = 0; KASSERTMSG(name[0] != '\0', "rndsource must have nonempty name"); /* Grab cycle counter to mix extra into the pool. */ extra[i++] = entropy_timer(); /* * Apply some standard flags: * * - We do not bother with network devices by default, for * hysterical raisins (perhaps: because it is often the case * that an adversary can influence network packet timings). */ switch (type) { case RND_TYPE_NET: flags |= RND_FLAG_NO_COLLECT; break; } /* Sanity-check the callback if RND_FLAG_HASCB is set. */ KASSERT(!ISSET(flags, RND_FLAG_HASCB) || rs->get != NULL); /* Initialize the random source. */ memset(rs->name, 0, sizeof(rs->name)); /* paranoia */ strlcpy(rs->name, name, sizeof(rs->name)); memset(&rs->time_delta, 0, sizeof(rs->time_delta)); memset(&rs->value_delta, 0, sizeof(rs->value_delta)); rs->total = 0; rs->type = type; rs->flags = flags; if (entropy_percpu != NULL) rs->state = percpu_alloc(sizeof(struct rndsource_cpu)); extra[i++] = entropy_timer(); /* Wire it into the global list of random sources. */ if (__predict_true(!cold)) mutex_enter(&E->lock); LIST_INSERT_HEAD(&E->sources, rs, list); if (__predict_true(!cold)) mutex_exit(&E->lock); extra[i++] = entropy_timer(); /* Request that it provide entropy ASAP, if we can. */ if (ISSET(flags, RND_FLAG_HASCB)) (*rs->get)(ENTROPY_CAPACITY, rs->getarg); extra[i++] = entropy_timer(); /* Mix the extra into the pool. */ KASSERT(i == __arraycount(extra)); entropy_enter(extra, sizeof extra, 0, /*count*/__predict_true(!cold)); explicit_memset(extra, 0, sizeof extra); } /* * rnd_detach_source(rs) * * Detach the entropy source rs. May sleep waiting for users to * drain. Further use is not allowed. */ void rnd_detach_source(struct krndsource *rs) { /* * If we're cold (shouldn't happen, but hey), just remove it * from the list -- there's nothing allocated. */ if (__predict_false(cold) && entropy_percpu == NULL) { LIST_REMOVE(rs, list); return; } /* We may have to wait for entropy_request. */ ASSERT_SLEEPABLE(); /* Wait until the source list is not in use, and remove it. */ mutex_enter(&E->lock); while (E->sourcelock) cv_wait(&E->sourcelock_cv, &E->lock); LIST_REMOVE(rs, list); mutex_exit(&E->lock); /* Free the per-CPU data. */ percpu_free(rs->state, sizeof(struct rndsource_cpu)); } /* * rnd_lock_sources(flags) * * Lock the list of entropy sources. Caller must hold the global * entropy lock. If successful, no rndsource will go away until * rnd_unlock_sources even while the caller releases the global * entropy lock. * * May be called very early at boot, before entropy_init. * * If flags & ENTROPY_WAIT, wait for concurrent access to finish. * If flags & ENTROPY_SIG, allow interruption by signal. 
*/ static int __attribute__((warn_unused_result)) rnd_lock_sources(int flags) { int error; KASSERT(__predict_false(cold) || mutex_owned(&E->lock)); KASSERT(!cpu_intr_p()); while (E->sourcelock) { KASSERT(!cold); if (!ISSET(flags, ENTROPY_WAIT)) return EWOULDBLOCK; if (ISSET(flags, ENTROPY_SIG)) { error = cv_wait_sig(&E->sourcelock_cv, &E->lock); if (error) return error; } else { cv_wait(&E->sourcelock_cv, &E->lock); } } E->sourcelock = curlwp; return 0; } /* * rnd_unlock_sources() * * Unlock the list of sources after rnd_lock_sources. Caller must * hold the global entropy lock. * * May be called very early at boot, before entropy_init. */ static void rnd_unlock_sources(void) { KASSERT(__predict_false(cold) || mutex_owned(&E->lock)); KASSERT(!cpu_intr_p()); KASSERTMSG(E->sourcelock == curlwp, "lwp %p releasing lock held by %p", curlwp, E->sourcelock); E->sourcelock = NULL; if (__predict_true(!cold)) cv_signal(&E->sourcelock_cv); } /* * rnd_sources_locked() * * True if we hold the list of rndsources locked, for diagnostic * assertions. * * May be called very early at boot, before entropy_init. */ static bool __diagused rnd_sources_locked(void) { return E->sourcelock == curlwp; } /* * entropy_request(nbytes, flags) * * Request nbytes bytes of entropy from all sources in the system. * OK if we overdo it. Caller must hold the global entropy lock; * will release and re-acquire it. * * May be called very early at boot, before entropy_init. * * If flags & ENTROPY_WAIT, wait for concurrent access to finish. * If flags & ENTROPY_SIG, allow interruption by signal. */ static int entropy_request(size_t nbytes, int flags) { struct krndsource *rs; int error; KASSERT(__predict_false(cold) || mutex_owned(&E->lock)); KASSERT(!cpu_intr_p()); if ((flags & ENTROPY_WAIT) != 0 && __predict_false(!cold)) ASSERT_SLEEPABLE(); /* * Lock the list of entropy sources to block rnd_detach_source * until we're done, and to serialize calls to the entropy * callbacks as guaranteed to drivers. */ error = rnd_lock_sources(flags); if (error) return error; entropy_request_evcnt.ev_count++; /* Clamp to the maximum reasonable request. */ nbytes = MIN(nbytes, ENTROPY_CAPACITY); /* Walk the list of sources. */ LIST_FOREACH(rs, &E->sources, list) { /* Skip sources without callbacks. */ if (!ISSET(rs->flags, RND_FLAG_HASCB)) continue; /* * Skip sources that are disabled altogether -- we * would just ignore their samples anyway. */ if (ISSET(rs->flags, RND_FLAG_NO_COLLECT)) continue; /* Drop the lock while we call the callback. */ if (__predict_true(!cold)) mutex_exit(&E->lock); (*rs->get)(nbytes, rs->getarg); if (__predict_true(!cold)) mutex_enter(&E->lock); } /* Request done; unlock the list of entropy sources. */ rnd_unlock_sources(); return 0; } static inline uint32_t rnd_delta_estimate(rnd_delta_t *d, uint32_t v, int32_t delta) { int32_t delta2, delta3; /* * Calculate the second and third order differentials */ delta2 = d->dx - delta; if (delta2 < 0) delta2 = -delta2; /* XXX arithmetic overflow */ delta3 = d->d2x - delta2; if (delta3 < 0) delta3 = -delta3; /* XXX arithmetic overflow */ d->x = v; d->dx = delta; d->d2x = delta2; /* * If any delta is 0, we got no entropy. If all are non-zero, we * might have something. 
*/ if (delta == 0 || delta2 == 0 || delta3 == 0) return 0; return 1; } static inline uint32_t rnd_dt_estimate(struct krndsource *rs, uint32_t t) { int32_t delta; uint32_t ret; rnd_delta_t *d; struct rndsource_cpu *rc; rc = percpu_getref(rs->state); d = &rc->rc_timedelta; if (t < d->x) { delta = UINT32_MAX - d->x + t; } else { delta = d->x - t; } if (delta < 0) { delta = -delta; /* XXX arithmetic overflow */ } ret = rnd_delta_estimate(d, t, delta); KASSERT(d->x == t); KASSERT(d->dx == delta); percpu_putref(rs->state); return ret; } /* * rnd_add_uint32(rs, value) * * Enter 32 bits of data from an entropy source into the pool. * * May be called from any context or with spin locks held, but may * drop data. * * This is meant for cheaply taking samples from devices that * aren't designed to be hardware random number generators. */ void rnd_add_uint32(struct krndsource *rs, uint32_t value) { bool intr_p = true; rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p); } void _rnd_add_uint32(struct krndsource *rs, uint32_t value) { bool intr_p = true; rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p); } void _rnd_add_uint64(struct krndsource *rs, uint64_t value) { bool intr_p = true; rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p); } /* * rnd_add_data(rs, buf, len, entropybits) * * Enter data from an entropy source into the pool, with a * driver's estimate of how much entropy the physical source of * the data has. If RND_FLAG_NO_ESTIMATE, we ignore the driver's * estimate and treat it as zero. * * rs MAY but SHOULD NOT be NULL. If rs is NULL, MUST NOT be * called from interrupt context or with spin locks held. * * If rs is non-NULL, MAY but SHOULD NOT be called from interrupt * context, in which case act like rnd_add_data_intr -- if the * sample buffer is full, schedule a softint and drop any * additional data on the floor. (This may change later once we * fix drivers that still call this from interrupt context to use * rnd_add_data_intr instead.) MUST NOT be called with spin locks * held if not in hard interrupt context -- i.e., MUST NOT be * called in thread context or softint context with spin locks * held. */ void rnd_add_data(struct krndsource *rs, const void *buf, uint32_t len, uint32_t entropybits) { bool intr_p = cpu_intr_p(); /* XXX make this unconditionally false */ /* * Weird legacy exception that we should rip out and replace by * creating new rndsources to attribute entropy to the callers: * If there's no rndsource, just enter the data and time now. */ if (rs == NULL) { uint32_t extra; KASSERT(!intr_p); KASSERTMSG(howmany(entropybits, NBBY) <= len, "%s: impossible entropy rate:" " %"PRIu32" bits in %"PRIu32"-byte string", rs ? rs->name : "(anonymous)", entropybits, len); entropy_enter(buf, len, entropybits, /*count*/false); extra = entropy_timer(); entropy_enter(&extra, sizeof extra, 0, /*count*/false); explicit_memset(&extra, 0, sizeof extra); return; } rnd_add_data_internal(rs, buf, len, entropybits, intr_p); } /* * rnd_add_data_intr(rs, buf, len, entropybits) * * Try to enter data from an entropy source into the pool, with a * driver's estimate of how much entropy the physical source of * the data has. If RND_FLAG_NO_ESTIMATE, we ignore the driver's * estimate and treat it as zero. If the sample buffer is full, * schedule a softint and drop any additional data on the floor. 
*/ void rnd_add_data_intr(struct krndsource *rs, const void *buf, uint32_t len, uint32_t entropybits) { bool intr_p = true; rnd_add_data_internal(rs, buf, len, entropybits, intr_p); } /* * rnd_add_data_internal(rs, buf, len, entropybits, intr_p) * * Internal subroutine to decide whether or not to enter data or * timing for a particular rndsource, and if so, to enter it. * * intr_p is true for callers from interrupt context or spin locks * held, and false for callers from thread or soft interrupt * context and no spin locks held. */ static void rnd_add_data_internal(struct krndsource *rs, const void *buf, uint32_t len, uint32_t entropybits, bool intr_p) { uint32_t flags; KASSERTMSG(howmany(entropybits, NBBY) <= len, "%s: impossible entropy rate:" " %"PRIu32" bits in %"PRIu32"-byte string", rs ? rs->name : "(anonymous)", entropybits, len); /* * Hold up the reset xcall before it zeroes the entropy counts * on this CPU or globally. Otherwise, we might leave some * nonzero entropy attributed to an untrusted source in the * event of a race with a change to flags. */ kpreempt_disable(); /* Load a snapshot of the flags. Ioctl may change them under us. */ flags = atomic_load_relaxed(&rs->flags); /* * Skip if: * - we're not collecting entropy, or * - the operator doesn't want to collect entropy from this, or * - neither data nor timings are being collected from this. */ if (!atomic_load_relaxed(&entropy_collection) || ISSET(flags, RND_FLAG_NO_COLLECT) || !ISSET(flags, RND_FLAG_COLLECT_VALUE|RND_FLAG_COLLECT_TIME)) goto out; /* If asked, ignore the estimate. */ if (ISSET(flags, RND_FLAG_NO_ESTIMATE)) entropybits = 0; /* If we are collecting data, enter them. */ if (ISSET(flags, RND_FLAG_COLLECT_VALUE)) { rnd_add_data_1(rs, buf, len, entropybits, /*count*/false, RND_FLAG_COLLECT_VALUE, intr_p); } /* If we are collecting timings, enter one. */ if (ISSET(flags, RND_FLAG_COLLECT_TIME)) { uint32_t extra; bool count; /* Sample a timer. */ extra = entropy_timer(); /* If asked, do entropy estimation on the time. */ if ((flags & (RND_FLAG_ESTIMATE_TIME|RND_FLAG_NO_ESTIMATE)) == RND_FLAG_ESTIMATE_TIME && __predict_true(!cold)) count = rnd_dt_estimate(rs, extra); else count = false; rnd_add_data_1(rs, &extra, sizeof extra, 0, count, RND_FLAG_COLLECT_TIME, intr_p); } out: /* Allow concurrent changes to flags to finish. */ kpreempt_enable(); } static unsigned add_sat(unsigned a, unsigned b) { unsigned c = a + b; return (c < a ? UINT_MAX : c); } /* * rnd_add_data_1(rs, buf, len, entropybits, count, flag) * * Internal subroutine to call either entropy_enter_intr, if we're * in interrupt context, or entropy_enter if not, and to count the * entropy in an rndsource. */ static void rnd_add_data_1(struct krndsource *rs, const void *buf, uint32_t len, uint32_t entropybits, bool count, uint32_t flag, bool intr_p) { bool fullyused; /* * For the interrupt-like path, use entropy_enter_intr and take * note of whether it consumed the full sample; otherwise, use * entropy_enter, which always consumes the full sample. */ if (intr_p) { fullyused = entropy_enter_intr(buf, len, entropybits, count); } else { entropy_enter(buf, len, entropybits, count); fullyused = true; } /* * If we used the full sample, note how many bits were * contributed from this source. 
*/ if (fullyused) { if (__predict_false(cold)) { const int s = splhigh(); rs->total = add_sat(rs->total, entropybits); switch (flag) { case RND_FLAG_COLLECT_TIME: rs->time_delta.insamples = add_sat(rs->time_delta.insamples, 1); break; case RND_FLAG_COLLECT_VALUE: rs->value_delta.insamples = add_sat(rs->value_delta.insamples, 1); break; } splx(s); } else { struct rndsource_cpu *rc = percpu_getref(rs->state); atomic_store_relaxed(&rc->rc_entropybits, add_sat(rc->rc_entropybits, entropybits)); switch (flag) { case RND_FLAG_COLLECT_TIME: atomic_store_relaxed(&rc->rc_timesamples, add_sat(rc->rc_timesamples, 1)); break; case RND_FLAG_COLLECT_VALUE: atomic_store_relaxed(&rc->rc_datasamples, add_sat(rc->rc_datasamples, 1)); break; } percpu_putref(rs->state); } } } /* * rnd_add_data_sync(rs, buf, len, entropybits) * * Same as rnd_add_data. Originally used in rndsource callbacks, * to break an unnecessary cycle; no longer really needed. */ void rnd_add_data_sync(struct krndsource *rs, const void *buf, uint32_t len, uint32_t entropybits) { rnd_add_data(rs, buf, len, entropybits); } /* * rndsource_entropybits(rs) * * Return approximately the number of bits of entropy that have * been contributed via rs so far. Approximate if other CPUs may * be calling rnd_add_data concurrently. */ static unsigned rndsource_entropybits(struct krndsource *rs) { unsigned nbits = rs->total; KASSERT(!cold); KASSERT(rnd_sources_locked()); percpu_foreach(rs->state, rndsource_entropybits_cpu, &nbits); return nbits; } static void rndsource_entropybits_cpu(void *ptr, void *cookie, struct cpu_info *ci) { struct rndsource_cpu *rc = ptr; unsigned *nbitsp = cookie; unsigned cpu_nbits; cpu_nbits = atomic_load_relaxed(&rc->rc_entropybits); *nbitsp += MIN(UINT_MAX - *nbitsp, cpu_nbits); } /* * rndsource_to_user(rs, urs) * * Copy a description of rs out to urs for userland. */ static void rndsource_to_user(struct krndsource *rs, rndsource_t *urs) { KASSERT(!cold); KASSERT(rnd_sources_locked()); /* Avoid kernel memory disclosure. */ memset(urs, 0, sizeof(*urs)); CTASSERT(sizeof(urs->name) == sizeof(rs->name)); strlcpy(urs->name, rs->name, sizeof(urs->name)); urs->total = rndsource_entropybits(rs); urs->type = rs->type; urs->flags = atomic_load_relaxed(&rs->flags); } /* * rndsource_to_user_est(rs, urse) * * Copy a description of rs and estimation statistics out to urse * for userland. */ static void rndsource_to_user_est(struct krndsource *rs, rndsource_est_t *urse) { KASSERT(!cold); KASSERT(rnd_sources_locked()); /* Avoid kernel memory disclosure. */ memset(urse, 0, sizeof(*urse)); /* Copy out the rndsource description. */ rndsource_to_user(rs, &urse->rt); /* Gather the statistics. */ urse->dt_samples = rs->time_delta.insamples; urse->dt_total = 0; urse->dv_samples = rs->value_delta.insamples; urse->dv_total = urse->rt.total; percpu_foreach(rs->state, rndsource_to_user_est_cpu, urse); } static void rndsource_to_user_est_cpu(void *ptr, void *cookie, struct cpu_info *ci) { struct rndsource_cpu *rc = ptr; rndsource_est_t *urse = cookie; urse->dt_samples = add_sat(urse->dt_samples, atomic_load_relaxed(&rc->rc_timesamples)); urse->dv_samples = add_sat(urse->dv_samples, atomic_load_relaxed(&rc->rc_datasamples)); } /* * entropy_reset_xc(arg1, arg2) * * Reset the current CPU's pending entropy to zero. 
*/ static void entropy_reset_xc(void *arg1 __unused, void *arg2 __unused) { uint32_t extra = entropy_timer(); struct entropy_cpu_lock lock; struct entropy_cpu *ec; /* * With the per-CPU state locked, zero the pending count and * enter a cycle count for fun. */ ec = entropy_cpu_get(&lock); ec->ec_bitspending = 0; ec->ec_samplespending = 0; entpool_enter(ec->ec_pool, &extra, sizeof extra); entropy_cpu_put(&lock, ec); } /* * entropy_ioctl(cmd, data) * * Handle various /dev/random ioctl queries. */ int entropy_ioctl(unsigned long cmd, void *data) { struct krndsource *rs; bool privileged; int error; KASSERT(!cold); /* Verify user's authorization to perform the ioctl. */ switch (cmd) { case RNDGETENTCNT: case RNDGETPOOLSTAT: case RNDGETSRCNUM: case RNDGETSRCNAME: case RNDGETESTNUM: case RNDGETESTNAME: error = kauth_authorize_device(kauth_cred_get(), KAUTH_DEVICE_RND_GETPRIV, NULL, NULL, NULL, NULL); break; case RNDCTL: error = kauth_authorize_device(kauth_cred_get(), KAUTH_DEVICE_RND_SETPRIV, NULL, NULL, NULL, NULL); break; case RNDADDDATA: error = kauth_authorize_device(kauth_cred_get(), KAUTH_DEVICE_RND_ADDDATA, NULL, NULL, NULL, NULL); /* Ascertain whether the user's inputs should be counted. */ if (kauth_authorize_device(kauth_cred_get(), KAUTH_DEVICE_RND_ADDDATA_ESTIMATE, NULL, NULL, NULL, NULL) == 0) privileged = true; break; default: { /* * XXX Hack to avoid changing module ABI so this can be * pulled up. Later, we can just remove the argument. */ static const struct fileops fops = { .fo_ioctl = rnd_system_ioctl, }; struct file f = { .f_ops = &fops, }; MODULE_HOOK_CALL(rnd_ioctl_50_hook, (&f, cmd, data), enosys(), error); #if defined(_LP64) if (error == ENOSYS) MODULE_HOOK_CALL(rnd_ioctl32_50_hook, (&f, cmd, data), enosys(), error); #endif if (error == ENOSYS) error = ENOTTY; break; } } /* If anything went wrong with authorization, stop here. */ if (error) return error; /* Dispatch on the command. */ switch (cmd) { case RNDGETENTCNT: { /* Get current entropy count in bits. */ uint32_t *countp = data; mutex_enter(&E->lock); *countp = MINENTROPYBITS - E->bitsneeded; mutex_exit(&E->lock); break; } case RNDGETPOOLSTAT: { /* Get entropy pool statistics. */ rndpoolstat_t *pstat = data; mutex_enter(&E->lock); /* parameters */ pstat->poolsize = ENTPOOL_SIZE/sizeof(uint32_t); /* words */ pstat->threshold = MINENTROPYBITS/NBBY; /* bytes */ pstat->maxentropy = ENTROPY_CAPACITY*NBBY; /* bits */ /* state */ pstat->added = 0; /* XXX total entropy_enter count */ pstat->curentropy = MINENTROPYBITS - E->bitsneeded; /* bits */ pstat->removed = 0; /* XXX total entropy_extract count */ pstat->discarded = 0; /* XXX bits of entropy beyond capacity */ /* * This used to be bits of data fabricated in some * sense; we'll take it to mean number of samples, * excluding the bits of entropy from HWRNG or seed. */ pstat->generated = MINSAMPLES - E->samplesneeded; pstat->generated -= MIN(pstat->generated, pstat->curentropy); mutex_exit(&E->lock); break; } case RNDGETSRCNUM: { /* Get entropy sources by number. */ rndstat_t *stat = data; uint32_t start = 0, i = 0; /* Skip if none requested; fail if too many requested. */ if (stat->count == 0) break; if (stat->count > RND_MAXSTATCOUNT) return EINVAL; /* * Under the lock, find the first one, copy out as many * as requested, and report how many we copied out. 
*/ mutex_enter(&E->lock); error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG); if (error) { mutex_exit(&E->lock); return error; } LIST_FOREACH(rs, &E->sources, list) { if (start++ == stat->start) break; } while (i < stat->count && rs != NULL) { mutex_exit(&E->lock); rndsource_to_user(rs, &stat->source[i++]); mutex_enter(&E->lock); rs = LIST_NEXT(rs, list); } KASSERT(i <= stat->count); stat->count = i; rnd_unlock_sources(); mutex_exit(&E->lock); break; } case RNDGETESTNUM: { /* Get sources and estimates by number. */ rndstat_est_t *estat = data; uint32_t start = 0, i = 0; /* Skip if none requested; fail if too many requested. */ if (estat->count == 0) break; if (estat->count > RND_MAXSTATCOUNT) return EINVAL; /* * Under the lock, find the first one, copy out as many * as requested, and report how many we copied out. */ mutex_enter(&E->lock); error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG); if (error) { mutex_exit(&E->lock); return error; } LIST_FOREACH(rs, &E->sources, list) { if (start++ == estat->start) break; } while (i < estat->count && rs != NULL) { mutex_exit(&E->lock); rndsource_to_user_est(rs, &estat->source[i++]); mutex_enter(&E->lock); rs = LIST_NEXT(rs, list); } KASSERT(i <= estat->count); estat->count = i; rnd_unlock_sources(); mutex_exit(&E->lock); break; } case RNDGETSRCNAME: { /* Get entropy sources by name. */ rndstat_name_t *nstat = data; const size_t n = sizeof(rs->name); CTASSERT(sizeof(rs->name) == sizeof(nstat->name)); /* * Under the lock, search by name. If found, copy it * out; if not found, fail with ENOENT. */ mutex_enter(&E->lock); error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG); if (error) { mutex_exit(&E->lock); return error; } LIST_FOREACH(rs, &E->sources, list) { if (strncmp(rs->name, nstat->name, n) == 0) break; } if (rs != NULL) { mutex_exit(&E->lock); rndsource_to_user(rs, &nstat->source); mutex_enter(&E->lock); } else { error = ENOENT; } rnd_unlock_sources(); mutex_exit(&E->lock); break; } case RNDGETESTNAME: { /* Get sources and estimates by name. */ rndstat_est_name_t *enstat = data; const size_t n = sizeof(rs->name); CTASSERT(sizeof(rs->name) == sizeof(enstat->name)); /* * Under the lock, search by name. If found, copy it * out; if not found, fail with ENOENT. */ mutex_enter(&E->lock); error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG); if (error) { mutex_exit(&E->lock); return error; } LIST_FOREACH(rs, &E->sources, list) { if (strncmp(rs->name, enstat->name, n) == 0) break; } if (rs != NULL) { mutex_exit(&E->lock); rndsource_to_user_est(rs, &enstat->source); mutex_enter(&E->lock); } else { error = ENOENT; } rnd_unlock_sources(); mutex_exit(&E->lock); break; } case RNDCTL: { /* Modify entropy source flags. */ rndctl_t *rndctl = data; const size_t n = sizeof(rs->name); uint32_t resetflags = RND_FLAG_NO_ESTIMATE|RND_FLAG_NO_COLLECT; uint32_t flags; bool reset = false, request = false; CTASSERT(sizeof(rs->name) == sizeof(rndctl->name)); /* Whitelist the flags that user can change. */ rndctl->mask &= RND_FLAG_NO_ESTIMATE|RND_FLAG_NO_COLLECT; /* * For each matching rndsource, either by type if * specified or by name if not, set the masked flags. 
*/ mutex_enter(&E->lock); LIST_FOREACH(rs, &E->sources, list) { if (rndctl->type != 0xff) { if (rs->type != rndctl->type) continue; } else if (rndctl->name[0] != '\0') { if (strncmp(rs->name, rndctl->name, n) != 0) continue; } flags = rs->flags & ~rndctl->mask; flags |= rndctl->flags & rndctl->mask; if ((rs->flags & resetflags) == 0 && (flags & resetflags) != 0) reset = true; if ((rs->flags ^ flags) & resetflags) request = true; atomic_store_relaxed(&rs->flags, flags); } mutex_exit(&E->lock); /* * If we disabled estimation or collection, nix all the * pending entropy and set needed to the maximum. */ if (reset) { xc_broadcast(0, &entropy_reset_xc, NULL, NULL); mutex_enter(&E->lock); E->bitspending = 0; E->samplespending = 0; atomic_store_relaxed(&E->bitsneeded, MINENTROPYBITS); atomic_store_relaxed(&E->samplesneeded, MINSAMPLES); E->consolidate = false; mutex_exit(&E->lock); } /* * If we changed any of the estimation or collection * flags, request new samples from everyone -- either * to make up for what we just lost, or to get new * samples from what we just added. * * Failing on signal, while waiting for another process * to finish requesting entropy, is OK here even though * we have committed side effects, because this ioctl * command is idempotent, so repeating it is safe. */ if (request) { mutex_enter(&E->lock); error = entropy_request(ENTROPY_CAPACITY, ENTROPY_WAIT|ENTROPY_SIG); mutex_exit(&E->lock); } break; } case RNDADDDATA: { /* Enter seed into entropy pool. */ rnddata_t *rdata = data; unsigned entropybits = 0; if (!atomic_load_relaxed(&entropy_collection)) break; /* thanks but no thanks */ if (rdata->len > MIN(sizeof(rdata->data), UINT32_MAX/NBBY)) return EINVAL; /* * This ioctl serves as the userland alternative a * bootloader-provided seed -- typically furnished by * /etc/rc.d/random_seed. We accept the user's entropy * claim only if * * (a) the user is privileged, and * (b) we have not entered a bootloader seed. * * under the assumption that the user may use this to * load a seed from disk that we have already loaded * from the bootloader, so we don't double-count it. */ if (privileged && rdata->entropy && rdata->len) { mutex_enter(&E->lock); if (!E->seeded) { entropybits = MIN(rdata->entropy, MIN(rdata->len, ENTROPY_CAPACITY)*NBBY); E->seeded = true; } mutex_exit(&E->lock); } /* Enter the data and consolidate entropy. */ rnd_add_data(&seed_rndsource, rdata->data, rdata->len, entropybits); entropy_consolidate(); break; } default: error = ENOTTY; } /* Return any error that may have come up. */ return error; } /* Legacy entry points */ void rnd_seed(void *seed, size_t len) { if (len != sizeof(rndsave_t)) { printf("entropy: invalid seed length: %zu," " expected sizeof(rndsave_t) = %zu\n", len, sizeof(rndsave_t)); return; } entropy_seed(seed); } void rnd_init(void) { entropy_init(); } void rnd_init_softint(void) { entropy_init_late(); entropy_bootrequest(); } int rnd_system_ioctl(struct file *fp, unsigned long cmd, void *data) { return entropy_ioctl(cmd, data); }
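Illustrative usage sketch, not part of kern_entropy.c: a minimal example of how a hypothetical driver could use the rndsource API documented above (rndsource_setcb before rnd_attach_source, rnd_add_data_sync from the on-demand callback, rnd_add_uint32 from hard interrupt context). The names mydev_softc, mydev_attach, mydev_intr, mydev_rng_get and the register-read stubs are assumptions invented for the example; RND_TYPE_RNG and the <sys/rndsource.h>/<sys/param.h> headers are assumed from the standard rnd interface, while the RND_FLAG_* constants and all rnd_* calls are the ones appearing in the code above. Guarded by #if 0 to mark it as a sketch rather than buildable driver code.

#if 0
#include <sys/param.h>
#include <sys/rndsource.h>

struct mydev_softc {
	struct krndsource sc_rndsource;
	/* ... real device state would go here ... */
};

/* Placeholder stand-ins for real device register I/O (hypothetical). */
static uint32_t mydev_read_rng_register(struct mydev_softc *sc) { return 0; }
static uint32_t mydev_read_status(struct mydev_softc *sc) { return 0; }

/*
 * On-demand callback: the entropy subsystem asks for nbytes more bytes,
 * e.g. from entropy_request() above.  Enter them synchronously.
 */
static void
mydev_rng_get(size_t nbytes, void *cookie)
{
	struct mydev_softc *sc = cookie;
	uint32_t sample;

	while (nbytes) {
		sample = mydev_read_rng_register(sc);
		/* 4 bytes of data, claimed to carry 32 bits of entropy. */
		rnd_add_data_sync(&sc->sc_rndsource, &sample, sizeof sample,
		    32);
		nbytes -= MIN(nbytes, sizeof sample);
	}
}

static void
mydev_attach(struct mydev_softc *sc)
{
	/* Set the callback first, then attach, as required above. */
	rndsource_setcb(&sc->sc_rndsource, mydev_rng_get, sc);
	rnd_attach_source(&sc->sc_rndsource, "mydev", RND_TYPE_RNG,
	    RND_FLAG_COLLECT_VALUE|RND_FLAG_COLLECT_TIME|RND_FLAG_HASCB);
}

/* Hard interrupt path: cheap timing/value sample, no entropy claimed. */
static int
mydev_intr(void *cookie)
{
	struct mydev_softc *sc = cookie;

	rnd_add_uint32(&sc->sc_rndsource, mydev_read_status(sc));
	return 1;
}
#endif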
/* $NetBSD: clockctl.c,v 1.39 2022/03/28 12:33:20 riastradh Exp $ */ /*- * Copyright (c) 2001 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Emmanuel Dreyfus. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: clockctl.c,v 1.39 2022/03/28 12:33:20 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_ntp.h" #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/device.h> #include <sys/time.h> #include <sys/conf.h> #include <sys/timex.h> #include <sys/kauth.h> #include <sys/module.h> #include <sys/mutex.h> #include <sys/compat_stub.h> #include <sys/clockctl.h> #include <compat/sys/clockctl.h> #include <compat/sys/time_types.h> kmutex_t clockctl_mtx; int clockctl_refcnt; #include "ioconf.h" dev_type_ioctl(clockctlioctl); const struct cdevsw clockctl_cdevsw = { .d_open = clockctlopen, .d_close = clockctlclose, .d_read = noread, .d_write = nowrite, .d_ioctl = clockctlioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER, }; static kauth_listener_t clockctl_listener; static int clockctl_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; enum kauth_system_req req; bool device_context; result = KAUTH_RESULT_DEFER; req = (enum kauth_system_req)(uintptr_t)arg0; if ((action != KAUTH_SYSTEM_TIME) || (req != KAUTH_REQ_SYSTEM_TIME_SYSTEM)) return result; device_context = arg3 != NULL; /* Device is controlled by permissions, so allow. */ if (device_context) result = KAUTH_RESULT_ALLOW; return result; } /*ARGSUSED*/ void clockctlattach(int num) { /* * Don't initialize the listener here - it will get handled as part * of module initialization. */ #if 0 clockctl_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, clockctl_listener_cb, NULL); #endif } /* * Maintain a refcount for each open/close, so we know when it is * safe to call devsw_detach() */ int clockctlopen(dev_t dev, int flag, int mode, struct lwp *l) { mutex_enter(&clockctl_mtx); clockctl_refcnt++; mutex_exit(&clockctl_mtx); return 0; } int clockctlclose(dev_t dev, int flag, int mode, struct lwp *l) { mutex_enter(&clockctl_mtx); clockctl_refcnt--; mutex_exit(&clockctl_mtx); return 0; } MODULE(MODULE_CLASS_DRIVER, clockctl, NULL); int clockctl_modcmd(modcmd_t cmd, void *data) { int error; #ifdef _MODULE int bmajor, cmajor; #endif error = 0; switch (cmd) { case MODULE_CMD_INIT: mutex_init(&clockctl_mtx, MUTEX_DEFAULT, IPL_NONE); clockctl_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, clockctl_listener_cb, NULL); #ifdef _MODULE bmajor = cmajor = -1; error = devsw_attach("clockctl", NULL, &bmajor, &clockctl_cdevsw, &cmajor); if (error != 0) kauth_unlisten_scope(clockctl_listener); #endif break; case MODULE_CMD_FINI: mutex_enter(&clockctl_mtx); if (clockctl_refcnt != 0) { mutex_exit(&clockctl_mtx); return EBUSY; } #ifdef _MODULE devsw_detach(NULL, &clockctl_cdevsw); #endif mutex_exit(&clockctl_mtx); kauth_unlisten_scope(clockctl_listener); mutex_destroy(&clockctl_mtx); break; default: error = ENOTTY; break; } return error; } int clockctlioctl( dev_t dev, u_long cmd, void *data, int flags, struct lwp *l) { int error = 0; switch (cmd) { case CLOCKCTL_SETTIMEOFDAY: { struct clockctl_settimeofday *args = data; error = settimeofday1(args->tv, true, args->tzp, l, false); break; } case CLOCKCTL_ADJTIME: { struct timeval atv, oldatv; struct clockctl_adjtime *args = data; if (args->delta) { error = copyin(args->delta, &atv, sizeof(atv)); if (error) return (error); } adjtime1(args->delta ? &atv : NULL, args->olddelta ? 
&oldatv : NULL, l->l_proc); if (args->olddelta) error = copyout(&oldatv, args->olddelta, sizeof(oldatv)); break; } case CLOCKCTL_CLOCK_SETTIME: { struct clockctl_clock_settime *args = data; struct timespec ts; error = copyin(args->tp, &ts, sizeof ts); if (error) return (error); error = clock_settime1(l->l_proc, args->clock_id, &ts, false); break; } case CLOCKCTL_NTP_ADJTIME: { struct clockctl_ntp_adjtime *args = data; struct timex ntv; if (vec_ntp_timestatus == NULL) { error = ENOTTY; break; } error = copyin(args->tp, &ntv, sizeof(ntv)); if (error) return (error); (*vec_ntp_adjtime1)(&ntv); error = copyout(&ntv, args->tp, sizeof(ntv)); if (error == 0) args->retval = (*vec_ntp_timestatus)(); break; } default: MODULE_HOOK_CALL(clockctl_ioctl_50_hook, (dev, cmd, data, flags, l), enosys(), error); if (error == ENOSYS) error = ENOTTY; } return (error); }
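Illustrative usage sketch, not part of clockctl.c: how a userland program with write permission on the clockctl device might slew the clock through the CLOCKCTL_ADJTIME ioctl handled above. The device path /dev/clockctl, the open mode, and the error handling are assumptions for the example; the request structure fields (delta, olddelta as pointers to struct timeval) and the ioctl name follow the kernel handler above, with <sys/clockctl.h> providing the declarations as in the file itself. Guarded by #if 0 since it is a userland sketch embedded in kernel source.

#if 0
#include <sys/ioctl.h>
#include <sys/time.h>
#include <sys/clockctl.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct clockctl_adjtime args;
	struct timeval delta = { .tv_sec = 0, .tv_usec = 500000 };
	struct timeval olddelta;
	int fd;

	/* Assumed device node; its file permissions gate who may adjust time. */
	fd = open("/dev/clockctl", O_WRONLY);
	if (fd == -1)
		err(1, "open /dev/clockctl");

	/* Slew the clock by +0.5 s and fetch any adjustment still pending. */
	args.delta = &delta;
	args.olddelta = &olddelta;
	if (ioctl(fd, CLOCKCTL_ADJTIME, &args) == -1)
		err(1, "CLOCKCTL_ADJTIME");

	printf("previous pending adjustment: %lld.%06d s\n",
	    (long long)olddelta.tv_sec, (int)olddelta.tv_usec);
	close(fd);
	return 0;
}
#endif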
/* $NetBSD: pckbport.c,v 1.20 2021/08/07 16:19:15 thorpej Exp $ */ /* * Copyright (c) 2004 Ben Harris * Copyright (c) 1998 * Matthias Drochner. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: pckbport.c,v 1.20 2021/08/07 16:19:15 thorpej Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/device.h> #include <sys/malloc.h> #include <sys/errno.h> #include <sys/queue.h> #include <dev/pckbport/pckbdreg.h> #include <dev/pckbport/pckbportvar.h> #include "locators.h" #include "pckbd.h" #if (NPCKBD > 0) #include <dev/pckbport/pckbdvar.h> #endif /* descriptor for one device command */ struct pckbport_devcmd { TAILQ_ENTRY(pckbport_devcmd) next; int flags; #define KBC_CMDFLAG_SYNC 1 /* give descriptor back to caller */ #define KBC_CMDFLAG_SLOW 2 u_char cmd[4]; int cmdlen, cmdidx, retries; u_char response[4]; int status, responselen, responseidx; }; /* data per slave device */ struct pckbport_slotdata { int polling; /* don't process data in interrupt handler */ TAILQ_HEAD(, pckbport_devcmd) cmdqueue; /* active commands */ TAILQ_HEAD(, pckbport_devcmd) freequeue; /* free commands */ #define NCMD 5 struct pckbport_devcmd cmds[NCMD]; }; #define CMD_IN_QUEUE(q) (TAILQ_FIRST(&(q)->cmdqueue) != NULL) static void pckbport_init_slotdata(struct pckbport_slotdata *); static int pckbportprint(void *, const char *); static struct pckbport_slotdata pckbport_cons_slotdata; static int pckbport_poll_data1(pckbport_tag_t, pckbport_slot_t); static int pckbport_send_devcmd(struct pckbport_tag *, pckbport_slot_t, u_char); static void pckbport_poll_cmd1(struct pckbport_tag *, pckbport_slot_t, struct pckbport_devcmd *); static void pckbport_cleanqueue(struct pckbport_slotdata *); static void pckbport_cleanup(void *); static int pckbport_cmdresponse(struct pckbport_tag *, pckbport_slot_t, u_char); static void pckbport_start(struct pckbport_tag *, pckbport_slot_t); static const char * const pckbport_slot_names[] = { "kbd", "aux" }; static struct pckbport_tag pckbport_cntag; #define KBD_DELAY DELAY(8) #ifdef PCKBPORTDEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif static int pckbport_poll_data1(pckbport_tag_t t, pckbport_slot_t slot) { return t->t_ops->t_poll_data1(t->t_cookie, slot); } static int pckbport_send_devcmd(struct pckbport_tag *t, pckbport_slot_t slot, u_char val) { return t->t_ops->t_send_devcmd(t->t_cookie, slot, val); } pckbport_tag_t pckbport_attach(void *cookie, struct pckbport_accessops const *ops) { pckbport_tag_t t; if (cookie == pckbport_cntag.t_cookie && ops == pckbport_cntag.t_ops) return &pckbport_cntag; t = malloc(sizeof(struct pckbport_tag), M_DEVBUF, M_WAITOK | M_ZERO); callout_init(&t->t_cleanup, 0); t->t_cookie = cookie; t->t_ops = ops; return t; } device_t pckbport_attach_slot(device_t dev, pckbport_tag_t t, pckbport_slot_t slot) { struct pckbport_attach_args pa; void *sdata; device_t found; int alloced = 0; int locs[PCKBPORTCF_NLOCS]; pa.pa_tag = t; pa.pa_slot = slot; if (t->t_slotdata[slot] == NULL) { sdata = malloc(sizeof(struct pckbport_slotdata), M_DEVBUF, M_WAITOK); t->t_slotdata[slot] = sdata; 
pckbport_init_slotdata(t->t_slotdata[slot]); alloced++; } locs[PCKBPORTCF_SLOT] = slot; found = config_found(dev, &pa, pckbportprint, CFARGS(.submatch = config_stdsubmatch, .iattr = "pckbport", .locators = locs)); if (found == NULL && alloced) { free(t->t_slotdata[slot], M_DEVBUF); t->t_slotdata[slot] = NULL; } return found; } int pckbportprint(void *aux, const char *pnp) { struct pckbport_attach_args *pa = aux; if (!pnp) aprint_normal(" (%s slot)", pckbport_slot_names[pa->pa_slot]); return QUIET; } void pckbport_init_slotdata(struct pckbport_slotdata *q) { int i; TAILQ_INIT(&q->cmdqueue); TAILQ_INIT(&q->freequeue); for (i = 0; i < NCMD; i++) TAILQ_INSERT_TAIL(&q->freequeue, &(q->cmds[i]), next); q->polling = 0; } void pckbport_flush(pckbport_tag_t t, pckbport_slot_t slot) { (void)pckbport_poll_data1(t, slot); } int pckbport_poll_data(pckbport_tag_t t, pckbport_slot_t slot) { struct pckbport_slotdata *q = t->t_slotdata[slot]; int c; c = pckbport_poll_data1(t, slot); if (c != -1 && q && CMD_IN_QUEUE(q)) /* * we jumped into a running command - try to deliver * the response */ if (pckbport_cmdresponse(t, slot, c)) return -1; return c; } /* * switch scancode translation on / off * return nonzero on success */ int pckbport_xt_translation(pckbport_tag_t t, pckbport_slot_t slot, int on) { return t->t_ops->t_xt_translation(t->t_cookie, slot, on); } void pckbport_slot_enable(pckbport_tag_t t, pckbport_slot_t slot, int on) { t->t_ops->t_slot_enable(t->t_cookie, slot, on); } void pckbport_set_poll(pckbport_tag_t t, pckbport_slot_t slot, int on) { t->t_slotdata[slot]->polling = on; t->t_ops->t_set_poll(t->t_cookie, slot, on); } /* * Pass command to device, poll for ACK and data. * to be called at spltty() */ static void pckbport_poll_cmd1(struct pckbport_tag *t, pckbport_slot_t slot, struct pckbport_devcmd *cmd) { int i, c = 0; while (cmd->cmdidx < cmd->cmdlen) { if (!pckbport_send_devcmd(t, slot, cmd->cmd[cmd->cmdidx])) { printf("pckbport_cmd: send error\n"); cmd->status = EIO; return; } for (i = 10; i; i--) { /* 1s ??? */ c = pckbport_poll_data1(t, slot); if (c != -1) break; } switch (c) { case KBR_ACK: cmd->cmdidx++; continue; case KBR_BAT_DONE: case KBR_BAT_FAIL: case KBR_RESEND: DPRINTF(("%s: %s\n", __func__, c == KBR_RESEND ? "RESEND" : (c == KBR_BAT_DONE ? "BAT_DONE" : "BAT_FAIL"))); if (cmd->retries++ < 5) continue; else { DPRINTF(("%s: cmd failed\n", __func__)); cmd->status = EIO; return; } case -1: DPRINTF(("%s: timeout\n", __func__)); cmd->status = EIO; return; } DPRINTF(("%s: lost 0x%x\n", __func__, c)); } while (cmd->responseidx < cmd->responselen) { if (cmd->flags & KBC_CMDFLAG_SLOW) i = 100; /* 10s ??? */ else i = 10; /* 1s ??? */ while (i--) { c = pckbport_poll_data1(t, slot); if (c != -1) break; } if (c == -1) { DPRINTF(("%s: no data\n", __func__)); cmd->status = ETIMEDOUT; return; } else cmd->response[cmd->responseidx++] = c; } } /* for use in autoconfiguration */ int pckbport_poll_cmd(pckbport_tag_t t, pckbport_slot_t slot, const u_char *cmd, int len, int responselen, u_char *respbuf, int slow) { struct pckbport_devcmd nc; if ((len > 4) || (responselen > 4)) return (EINVAL); memset(&nc, 0, sizeof(nc)); memcpy(nc.cmd, cmd, len); nc.cmdlen = len; nc.responselen = responselen; nc.flags = (slow ? KBC_CMDFLAG_SLOW : 0); pckbport_poll_cmd1(t, slot, &nc); if (nc.status == 0 && respbuf) memcpy(respbuf, nc.response, responselen); return nc.status; } /* * Clean up a command queue, throw away everything. 
*/ void pckbport_cleanqueue(struct pckbport_slotdata *q) { struct pckbport_devcmd *cmd; while ((cmd = TAILQ_FIRST(&q->cmdqueue))) { TAILQ_REMOVE(&q->cmdqueue, cmd, next); #ifdef PCKBPORTDEBUG printf("%s: removing", __func__); for (int i = 0; i < cmd->cmdlen; i++) printf(" %02x", cmd->cmd[i]); printf("\n"); #endif TAILQ_INSERT_TAIL(&q->freequeue, cmd, next); } } /* * Timeout error handler: clean queues and data port. * XXX could be less invasive. */ void pckbport_cleanup(void *self) { struct pckbport_tag *t = self; int s; u_char cmd[1], resp[2]; printf("pckbport: command timeout\n"); s = spltty(); if (t->t_slotdata[PCKBPORT_KBD_SLOT]) pckbport_cleanqueue(t->t_slotdata[PCKBPORT_KBD_SLOT]); if (t->t_slotdata[PCKBPORT_AUX_SLOT]) pckbport_cleanqueue(t->t_slotdata[PCKBPORT_AUX_SLOT]); #if 0 /* XXXBJH Move to controller driver? */ while (bus_space_read_1(t->t_iot, t->t_ioh_c, 0) & KBS_DIB) { KBD_DELAY; (void) bus_space_read_1(t->t_iot, t->t_ioh_d, 0); } #endif cmd[0] = KBC_RESET; (void)pckbport_poll_cmd(t, PCKBPORT_KBD_SLOT, cmd, 1, 2, resp, 1); pckbport_flush(t, PCKBPORT_KBD_SLOT); splx(s); } /* * Pass command to device during normal operation. * to be called at spltty() */ void pckbport_start(struct pckbport_tag *t, pckbport_slot_t slot) { struct pckbport_slotdata *q = t->t_slotdata[slot]; struct pckbport_devcmd *cmd = TAILQ_FIRST(&q->cmdqueue); KASSERT(cmd != NULL); if (q->polling) { do { pckbport_poll_cmd1(t, slot, cmd); if (cmd->status) printf("pckbport_start: command error\n"); TAILQ_REMOVE(&q->cmdqueue, cmd, next); if (cmd->flags & KBC_CMDFLAG_SYNC) wakeup(cmd); else { callout_stop(&t->t_cleanup); TAILQ_INSERT_TAIL(&q->freequeue, cmd, next); } cmd = TAILQ_FIRST(&q->cmdqueue); } while (cmd); return; } if (!pckbport_send_devcmd(t, slot, cmd->cmd[cmd->cmdidx])) { printf("pckbport_start: send error\n"); /* XXX what now? */ return; } } /* * Handle command responses coming in asynchronously, * return nonzero if valid response. * to be called at spltty() */ int pckbport_cmdresponse(struct pckbport_tag *t, pckbport_slot_t slot, u_char data) { struct pckbport_slotdata *q = t->t_slotdata[slot]; struct pckbport_devcmd *cmd = TAILQ_FIRST(&q->cmdqueue); KASSERT(cmd != NULL); if (cmd->cmdidx < cmd->cmdlen) { if (data != KBR_ACK && data != KBR_RESEND) return 0; if (data == KBR_RESEND) { if (cmd->retries++ < 5) /* try again last command */ goto restart; else { DPRINTF(("%s: cmd failed\n", __func__)); cmd->status = EIO; /* dequeue */ } } else { if (++cmd->cmdidx < cmd->cmdlen) goto restart; if (cmd->responselen) return 1; /* else dequeue */ } } else if (cmd->responseidx < cmd->responselen) { cmd->response[cmd->responseidx++] = data; if (cmd->responseidx < cmd->responselen) return 1; /* else dequeue */ } else return 0; /* dequeue: */ TAILQ_REMOVE(&q->cmdqueue, cmd, next); if (cmd->flags & KBC_CMDFLAG_SYNC) wakeup(cmd); else { callout_stop(&t->t_cleanup); TAILQ_INSERT_TAIL(&q->freequeue, cmd, next); } if (!CMD_IN_QUEUE(q)) return 1; restart: pckbport_start(t, slot); return 1; } /* * Put command into the device's command queue, return zero or errno. 
*/ int pckbport_enqueue_cmd(pckbport_tag_t t, pckbport_slot_t slot, const u_char *cmd, int len, int responselen, int sync, u_char *respbuf) { struct pckbport_slotdata *q = t->t_slotdata[slot]; struct pckbport_devcmd *nc; int s, isactive, res = 0; if ((len > 4) || (responselen > 4)) return EINVAL; s = spltty(); nc = TAILQ_FIRST(&q->freequeue); if (nc) TAILQ_REMOVE(&q->freequeue, nc, next); splx(s); if (!nc) return ENOMEM; memset(nc, 0, sizeof(*nc)); memcpy(nc->cmd, cmd, len); nc->cmdlen = len; nc->responselen = responselen; nc->flags = (sync ? KBC_CMDFLAG_SYNC : 0); s = spltty(); if (q->polling && sync) /* * XXX We should poll until the queue is empty. * But we don't come here normally, so make * it simple and throw away everything. */ pckbport_cleanqueue(q); isactive = CMD_IN_QUEUE(q); TAILQ_INSERT_TAIL(&q->cmdqueue, nc, next); if (!isactive) pckbport_start(t, slot); if (q->polling) res = (sync ? nc->status : 0); else if (sync) { if ((res = tsleep(nc, 0, "kbccmd", 1*hz))) { TAILQ_REMOVE(&q->cmdqueue, nc, next); pckbport_cleanup(t); } else res = nc->status; } else callout_reset(&t->t_cleanup, hz, pckbport_cleanup, t); if (sync) { if (respbuf) memcpy(respbuf, nc->response, responselen); TAILQ_INSERT_TAIL(&q->freequeue, nc, next); } splx(s); return res; } void pckbport_set_inputhandler(pckbport_tag_t t, pckbport_slot_t slot, pckbport_inputfcn func, void *arg, const char *name) { if (slot >= PCKBPORT_NSLOTS) panic("pckbport_set_inputhandler: bad slot %d", slot); t->t_ops->t_intr_establish(t->t_cookie, slot); t->t_inputhandler[slot] = func; t->t_inputarg[slot] = arg; t->t_subname[slot] = name; } void pckbportintr(pckbport_tag_t t, pckbport_slot_t slot, int data) { struct pckbport_slotdata *q; q = t->t_slotdata[slot]; if (!q) { /* XXX do something for live insertion? */ printf("pckbportintr: no dev for slot %d\n", slot); return; } if (CMD_IN_QUEUE(q) && pckbport_cmdresponse(t, slot, data)) return; if (t->t_inputhandler[slot]) { (*t->t_inputhandler[slot])(t->t_inputarg[slot], data); return; } DPRINTF(("%s: slot %d lost %d\n", __func__, slot, data)); } int pckbport_cnattach(void *cookie, struct pckbport_accessops const *ops, pckbport_slot_t slot) { int res = 0; pckbport_tag_t t = &pckbport_cntag; callout_init(&t->t_cleanup, 0); t->t_cookie = cookie; t->t_ops = ops; /* flush */ pckbport_flush(t, slot); #if (NPCKBD > 0) res = pckbd_cnattach(t, slot); #elif (NPCKBPORT_MACHDEP_CNATTACH > 0) res = pckbport_machdep_cnattach(t, slot); #else res = ENXIO; #endif /* NPCKBPORT_MACHDEP_CNATTACH > 0 */ if (res == 0) { t->t_slotdata[slot] = &pckbport_cons_slotdata; pckbport_init_slotdata(&pckbport_cons_slotdata); } return res; }
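/*
 * Usage sketch (editor's illustration; not part of pckbport.c): how a
 * slot driver such as pckbd(4) is expected to use the command API
 * above.  While cold, commands go through the polled path
 * (pckbport_poll_cmd); once an input handler is registered, they are
 * queued with pckbport_enqueue_cmd and completed from pckbportintr().
 * The example_* names are hypothetical; KBC_RESET and the one-byte
 * command / two-byte response reset exchange mirror pckbport_cleanup()
 * above.
 */
static void
example_kbd_input(void *arg, int data)
{
	/* deliver the scancode to the upper layer (omitted) */
}

static int
example_kbd_attach(pckbport_tag_t tag, pckbport_slot_t slot, void *sc)
{
	u_char cmd[1], resp[2];
	int res;

	/* Polled reset during autoconfiguration: a "slow" command. */
	cmd[0] = KBC_RESET;
	res = pckbport_poll_cmd(tag, slot, cmd, 1, 2, resp, 1);
	if (res)
		return res;

	/* From now on, deliver interrupt data to our handler. */
	pckbport_set_inputhandler(tag, slot, example_kbd_input, sc,
	    "examplekbd");

	/*
	 * Later, in normal operation, the same command would be queued
	 * instead: synchronous, so the response ends up in resp.
	 */
	return pckbport_enqueue_cmd(tag, slot, cmd, 1, 2, 1, resp);
}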
/* $NetBSD: kern_hook.c,v 1.15 2024/01/17 10:18:41 hannken Exp $ */ /*- * Copyright (c) 1997, 1998, 1999, 2002, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, and by Luke Mewburn. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_hook.c,v 1.15 2024/01/17 10:18:41 hannken Exp $"); #include <sys/param.h> #include <sys/condvar.h> #include <sys/cpu.h> #include <sys/device.h> #include <sys/exec.h> #include <sys/hook.h> #include <sys/kmem.h> #include <sys/malloc.h> #include <sys/once.h> #include <sys/rwlock.h> #include <sys/systm.h> /* * A generic linear hook. */ struct hook_desc { LIST_ENTRY(hook_desc) hk_list; void (*hk_fn)(void *); void *hk_arg; }; typedef LIST_HEAD(, hook_desc) hook_list_t; enum hook_list_st { HKLIST_IDLE, HKLIST_INUSE, }; struct khook_list { hook_list_t hl_list; kmutex_t hl_lock; kmutex_t *hl_cvlock; struct lwp *hl_lwp; kcondvar_t hl_cv; enum hook_list_st hl_state; khook_t *hl_active_hk; char hl_namebuf[HOOKNAMSIZ]; }; int powerhook_debug = 0; static ONCE_DECL(hook_control); static krwlock_t exithook_lock; static krwlock_t forkhook_lock; static int hook_init(void) { rw_init(&exithook_lock); rw_init(&forkhook_lock); return 0; } static void * hook_establish(hook_list_t *list, krwlock_t *lock, void (*fn)(void *), void *arg) { struct hook_desc *hd; RUN_ONCE(&hook_control, hook_init); hd = malloc(sizeof(*hd), M_DEVBUF, M_NOWAIT); if (hd != NULL) { if (lock) rw_enter(lock, RW_WRITER); hd->hk_fn = fn; hd->hk_arg = arg; LIST_INSERT_HEAD(list, hd, hk_list); if (lock) rw_exit(lock); } return (hd); } static void hook_disestablish(hook_list_t *list, krwlock_t *lock, void *vhook) { if (lock) rw_enter(lock, RW_WRITER); #ifdef DIAGNOSTIC struct hook_desc *hd; LIST_FOREACH(hd, list, hk_list) { if (hd == vhook) break; } if (hd == NULL) panic("hook_disestablish: hook %p not established", vhook); #endif LIST_REMOVE((struct hook_desc *)vhook, hk_list); free(vhook, M_DEVBUF); if (lock) rw_exit(lock); } static void hook_destroy(hook_list_t *list) { struct hook_desc *hd; while ((hd = LIST_FIRST(list)) != NULL) { LIST_REMOVE(hd, hk_list); free(hd, M_DEVBUF); } } static void hook_proc_run(hook_list_t *list, krwlock_t *lock, struct proc *p) { struct hook_desc *hd; RUN_ONCE(&hook_control, hook_init); if (lock) rw_enter(lock, RW_READER); LIST_FOREACH(hd, list, hk_list) { __FPTRCAST(void (*)(struct proc *, void *), *hd->hk_fn)(p, hd->hk_arg); } if (lock) rw_exit(lock); } /* * "Shutdown hook" types, functions, and variables. * * Should be invoked immediately before the * system is halted or rebooted, i.e. after file systems unmounted, * after crash dump done, etc. * * Each shutdown hook is removed from the list before it's run, so that * it won't be run again. 
*/ static hook_list_t shutdownhook_list = LIST_HEAD_INITIALIZER(shutdownhook_list); void * shutdownhook_establish(void (*fn)(void *), void *arg) { return hook_establish(&shutdownhook_list, NULL, fn, arg); } void shutdownhook_disestablish(void *vhook) { hook_disestablish(&shutdownhook_list, NULL, vhook); } /* * Run shutdown hooks. Should be invoked immediately before the * system is halted or rebooted, i.e. after file systems unmounted, * after crash dump done, etc. * * Each shutdown hook is removed from the list before it's run, so that * it won't be run again. */ void doshutdownhooks(void) { struct hook_desc *dp; while ((dp = LIST_FIRST(&shutdownhook_list)) != NULL) { LIST_REMOVE(dp, hk_list); (*dp->hk_fn)(dp->hk_arg); #if 0 /* * Don't bother freeing the hook structure,, since we may * be rebooting because of a memory corruption problem, * and this might only make things worse. It doesn't * matter, anyway, since the system is just about to * reboot. */ free(dp, M_DEVBUF); #endif } } /* * "Mountroot hook" types, functions, and variables. */ static hook_list_t mountroothook_list=LIST_HEAD_INITIALIZER(mountroothook_list); void * mountroothook_establish(void (*fn)(device_t), device_t dev) { return hook_establish(&mountroothook_list, NULL, __FPTRCAST(void (*), fn), dev); } void mountroothook_disestablish(void *vhook) { hook_disestablish(&mountroothook_list, NULL, vhook); } void mountroothook_destroy(void) { hook_destroy(&mountroothook_list); } void domountroothook(device_t therootdev) { struct hook_desc *hd; LIST_FOREACH(hd, &mountroothook_list, hk_list) { if (hd->hk_arg == therootdev) { (*hd->hk_fn)(hd->hk_arg); return; } } } static hook_list_t exechook_list = LIST_HEAD_INITIALIZER(exechook_list); void * exechook_establish(void (*fn)(struct proc *, void *), void *arg) { return hook_establish(&exechook_list, &exec_lock, __FPTRCAST(void (*)(void *), fn), arg); } void exechook_disestablish(void *vhook) { hook_disestablish(&exechook_list, &exec_lock, vhook); } /* * Run exec hooks. */ void doexechooks(struct proc *p) { KASSERT(rw_lock_held(&exec_lock)); hook_proc_run(&exechook_list, NULL, p); } static hook_list_t exithook_list = LIST_HEAD_INITIALIZER(exithook_list); void * exithook_establish(void (*fn)(struct proc *, void *), void *arg) { return hook_establish(&exithook_list, &exithook_lock, __FPTRCAST(void (*)(void *), fn), arg); } void exithook_disestablish(void *vhook) { hook_disestablish(&exithook_list, &exithook_lock, vhook); } /* * Run exit hooks. */ void doexithooks(struct proc *p) { hook_proc_run(&exithook_list, &exithook_lock, p); } static hook_list_t forkhook_list = LIST_HEAD_INITIALIZER(forkhook_list); void * forkhook_establish(void (*fn)(struct proc *, struct proc *)) { return hook_establish(&forkhook_list, &forkhook_lock, __FPTRCAST(void (*)(void *), fn), NULL); } void forkhook_disestablish(void *vhook) { hook_disestablish(&forkhook_list, &forkhook_lock, vhook); } /* * Run fork hooks. 
*/ void doforkhooks(struct proc *p2, struct proc *p1) { struct hook_desc *hd; RUN_ONCE(&hook_control, hook_init); rw_enter(&forkhook_lock, RW_READER); LIST_FOREACH(hd, &forkhook_list, hk_list) { __FPTRCAST(void (*)(struct proc *, struct proc *), *hd->hk_fn) (p2, p1); } rw_exit(&forkhook_lock); } static hook_list_t critpollhook_list = LIST_HEAD_INITIALIZER(critpollhook_list); void * critpollhook_establish(void (*fn)(void *), void *arg) { return hook_establish(&critpollhook_list, NULL, fn, arg); } void critpollhook_disestablish(void *vhook) { hook_disestablish(&critpollhook_list, NULL, vhook); } /* * Run critical polling hooks. */ void docritpollhooks(void) { struct hook_desc *hd; LIST_FOREACH(hd, &critpollhook_list, hk_list) { (*hd->hk_fn)(hd->hk_arg); } } /* * "Power hook" types, functions, and variables. * The list of power hooks is kept ordered with the last registered hook * first. * When running the hooks on power down the hooks are called in reverse * registration order, when powering up in registration order. */ struct powerhook_desc { TAILQ_ENTRY(powerhook_desc) sfd_list; void (*sfd_fn)(int, void *); void *sfd_arg; char sfd_name[16]; }; static TAILQ_HEAD(powerhook_head, powerhook_desc) powerhook_list = TAILQ_HEAD_INITIALIZER(powerhook_list); void * powerhook_establish(const char *name, void (*fn)(int, void *), void *arg) { struct powerhook_desc *ndp; ndp = (struct powerhook_desc *) malloc(sizeof(*ndp), M_DEVBUF, M_NOWAIT); if (ndp == NULL) return (NULL); ndp->sfd_fn = fn; ndp->sfd_arg = arg; strlcpy(ndp->sfd_name, name, sizeof(ndp->sfd_name)); TAILQ_INSERT_HEAD(&powerhook_list, ndp, sfd_list); aprint_error("%s: WARNING: powerhook_establish is deprecated\n", name); return (ndp); } void powerhook_disestablish(void *vhook) { #ifdef DIAGNOSTIC struct powerhook_desc *dp; TAILQ_FOREACH(dp, &powerhook_list, sfd_list) if (dp == vhook) goto found; panic("powerhook_disestablish: hook %p not established", vhook); found: #endif TAILQ_REMOVE(&powerhook_list, (struct powerhook_desc *)vhook, sfd_list); free(vhook, M_DEVBUF); } /* * Run power hooks. */ void dopowerhooks(int why) { struct powerhook_desc *dp; const char *why_name; static const char * pwr_names[] = {PWR_NAMES}; why_name = why < __arraycount(pwr_names) ? pwr_names[why] : "???"; if (why == PWR_RESUME || why == PWR_SOFTRESUME) { TAILQ_FOREACH_REVERSE(dp, &powerhook_list, powerhook_head, sfd_list) { if (powerhook_debug) printf("dopowerhooks %s: %s (%p)\n", why_name, dp->sfd_name, dp); (*dp->sfd_fn)(why, dp->sfd_arg); } } else { TAILQ_FOREACH(dp, &powerhook_list, sfd_list) { if (powerhook_debug) printf("dopowerhooks %s: %s (%p)\n", why_name, dp->sfd_name, dp); (*dp->sfd_fn)(why, dp->sfd_arg); } } if (powerhook_debug) printf("dopowerhooks: %s done\n", why_name); } /* * A simple linear hook. 
*/ khook_list_t * simplehook_create(int ipl, const char *wmsg) { khook_list_t *l; l = kmem_zalloc(sizeof(*l), KM_SLEEP); mutex_init(&l->hl_lock, MUTEX_DEFAULT, ipl); strlcpy(l->hl_namebuf, wmsg, sizeof(l->hl_namebuf)); cv_init(&l->hl_cv, l->hl_namebuf); LIST_INIT(&l->hl_list); l->hl_state = HKLIST_IDLE; return l; } void simplehook_destroy(khook_list_t *l) { struct hook_desc *hd; KASSERT(l->hl_state == HKLIST_IDLE); while ((hd = LIST_FIRST(&l->hl_list)) != NULL) { LIST_REMOVE(hd, hk_list); kmem_free(hd, sizeof(*hd)); } cv_destroy(&l->hl_cv); mutex_destroy(&l->hl_lock); kmem_free(l, sizeof(*l)); } int simplehook_dohooks(khook_list_t *l) { struct hook_desc *hd, *nexthd; kmutex_t *cv_lock; void (*fn)(void *); void *arg; mutex_enter(&l->hl_lock); if (l->hl_state != HKLIST_IDLE) { mutex_exit(&l->hl_lock); return EBUSY; } /* stop removing hooks */ l->hl_state = HKLIST_INUSE; l->hl_lwp = curlwp; LIST_FOREACH(hd, &l->hl_list, hk_list) { if (hd->hk_fn == NULL) continue; fn = hd->hk_fn; arg = hd->hk_arg; l->hl_active_hk = hd; l->hl_cvlock = NULL; mutex_exit(&l->hl_lock); /* do callback without l->hl_lock */ (*fn)(arg); mutex_enter(&l->hl_lock); l->hl_active_hk = NULL; cv_lock = l->hl_cvlock; if (hd->hk_fn == NULL) { if (cv_lock != NULL) { mutex_exit(&l->hl_lock); mutex_enter(cv_lock); } cv_broadcast(&l->hl_cv); if (cv_lock != NULL) { mutex_exit(cv_lock); mutex_enter(&l->hl_lock); } } } /* remove marked node while running hooks */ LIST_FOREACH_SAFE(hd, &l->hl_list, hk_list, nexthd) { if (hd->hk_fn == NULL) { LIST_REMOVE(hd, hk_list); kmem_free(hd, sizeof(*hd)); } } l->hl_lwp = NULL; l->hl_state = HKLIST_IDLE; mutex_exit(&l->hl_lock); return 0; } khook_t * simplehook_establish(khook_list_t *l, void (*fn)(void *), void *arg) { struct hook_desc *hd; hd = kmem_zalloc(sizeof(*hd), KM_SLEEP); hd->hk_fn = fn; hd->hk_arg = arg; mutex_enter(&l->hl_lock); LIST_INSERT_HEAD(&l->hl_list, hd, hk_list); mutex_exit(&l->hl_lock); return hd; } void simplehook_disestablish(khook_list_t *l, khook_t *hd, kmutex_t *lock) { struct hook_desc *hd0 __diagused; kmutex_t *cv_lock; KASSERT(lock == NULL || mutex_owned(lock)); mutex_enter(&l->hl_lock); #ifdef DIAGNOSTIC LIST_FOREACH(hd0, &l->hl_list, hk_list) { if (hd == hd0) break; } if (hd0 == NULL) panic("hook_disestablish: hook %p not established", hd); #endif /* The hook is not referred, remove immediately */ if (l->hl_state == HKLIST_IDLE) { LIST_REMOVE(hd, hk_list); kmem_free(hd, sizeof(*hd)); mutex_exit(&l->hl_lock); return; } /* remove callback. hd will be removed in dohooks */ hd->hk_fn = NULL; hd->hk_arg = NULL; /* If the hook is running, wait for the completion */ if (l->hl_active_hk == hd && l->hl_lwp != curlwp) { if (lock != NULL) { cv_lock = lock; KASSERT(l->hl_cvlock == NULL); l->hl_cvlock = lock; mutex_exit(&l->hl_lock); } else { cv_lock = &l->hl_lock; } cv_wait(&l->hl_cv, cv_lock); if (lock == NULL) mutex_exit(&l->hl_lock); } else { mutex_exit(&l->hl_lock); } } bool simplehook_has_hooks(khook_list_t *l) { bool empty; mutex_enter(&l->hl_lock); empty = LIST_EMPTY(&l->hl_list); mutex_exit(&l->hl_lock); return !empty; }
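/*
 * Usage sketch (editor's illustration; not part of kern_hook.c): a
 * driver keeps a simplehook list, runs it from a reset/reconfiguration
 * path, and a consumer later removes its callback while holding its
 * own lock so simplehook_disestablish() can wait for a hook that is
 * currently executing.  All example_* names are made up; the calls
 * match the simplehook_* functions defined above.
 */
struct example_softc {
	kmutex_t	 sc_lock;
	khook_list_t	*sc_reset_hooks;
	khook_t		*sc_hook;
};

static void
example_reset_cb(void *arg)
{
	/* react to the device reset (omitted) */
}

static void
example_init(struct example_softc *sc)
{
	mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NONE);
	sc->sc_reset_hooks = simplehook_create(IPL_NONE, "exreset");
	sc->sc_hook = simplehook_establish(sc->sc_reset_hooks,
	    example_reset_cb, sc);
}

static void
example_reset(struct example_softc *sc)
{
	/* run every registered callback; hl_lock is dropped around each */
	(void)simplehook_dohooks(sc->sc_reset_hooks);
}

static void
example_detach(struct example_softc *sc)
{
	/* lock is passed so a running hook can be waited for safely */
	mutex_enter(&sc->sc_lock);
	simplehook_disestablish(sc->sc_reset_hooks, sc->sc_hook,
	    &sc->sc_lock);
	mutex_exit(&sc->sc_lock);
	simplehook_destroy(sc->sc_reset_hooks);
}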
/* $NetBSD: tmpfs_subr.c,v 1.117 2023/04/29 08:15:13 riastradh Exp $ */ /* * Copyright (c) 2005-2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program, and by Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Efficient memory file system: interfaces for inode and directory entry * construction, destruction and manipulation. * * Reference counting * * The link count of inode (tmpfs_node_t::tn_links) is used as a * reference counter. However, it has slightly different semantics. * * For directories - link count represents directory entries, which * refer to the directories. In other words, it represents the count * of sub-directories. It also takes into account the virtual '.' * entry (which has no real entry in the list). For files - link count * represents the hard links. Since only empty directories can be * removed - link count aligns the reference counting requirements * enough. Note: to check whether directory is not empty, the inode * size (tmpfs_node_t::tn_size) can be used. * * The inode itself, as an object, gathers its first reference when * directory entry is attached via tmpfs_dir_attach(9). For instance, * after regular tmpfs_create(), a file would have a link count of 1, * while directory after tmpfs_mkdir() would have 2 (due to '.'). * * Reclamation * * It should be noted that tmpfs inodes rely on a combination of vnode * reference counting and link counting. That is, an inode can only be * destroyed if its associated vnode is inactive. The destruction is * done on vnode reclamation i.e. tmpfs_reclaim(). It should be noted * that tmpfs_node_t::tn_links being 0 is a destruction criterion. * * If an inode has references within the file system (tn_links > 0) and * its inactive vnode gets reclaimed/recycled - then the association is * broken in tmpfs_reclaim(). In such case, an inode will always pass * tmpfs_lookup() and thus vcache_get() to associate a new vnode. * * Lock order * * vnode_t::v_vlock -> * vnode_t::v_interlock */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.117 2023/04/29 08:15:13 riastradh Exp $"); #include <sys/param.h> #include <sys/cprng.h> #include <sys/dirent.h> #include <sys/event.h> #include <sys/kmem.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/time.h> #include <sys/stat.h> #include <sys/systm.h> #include <sys/vnode.h> #include <sys/kauth.h> #include <sys/atomic.h> #include <uvm/uvm_aobj.h> #include <uvm/uvm_extern.h> #include <uvm/uvm_object.h> #include <miscfs/specfs/specdev.h> #include <miscfs/genfs/genfs.h> #include <fs/tmpfs/tmpfs.h> #include <fs/tmpfs/tmpfs_fifoops.h> #include <fs/tmpfs/tmpfs_specops.h> #include <fs/tmpfs/tmpfs_vnops.h> static void tmpfs_dir_putseq(tmpfs_node_t *, tmpfs_dirent_t *); /* * Initialize vnode with tmpfs node. */ static void tmpfs_init_vnode(struct vnode *vp, tmpfs_node_t *node) { krwlock_t *slock; KASSERT(node->tn_vnode == NULL); /* Share the interlock with the node. */ if (node->tn_type == VREG) { slock = node->tn_spec.tn_reg.tn_aobj->vmobjlock; rw_obj_hold(slock); uvm_obj_setlock(&vp->v_uobj, slock); } vp->v_tag = VT_TMPFS; vp->v_type = node->tn_type; /* Type-specific initialization. 
*/ switch (vp->v_type) { case VBLK: case VCHR: vp->v_op = tmpfs_specop_p; spec_node_init(vp, node->tn_spec.tn_dev.tn_rdev); break; case VFIFO: vp->v_op = tmpfs_fifoop_p; break; case VDIR: if (node->tn_spec.tn_dir.tn_parent == node) vp->v_vflag |= VV_ROOT; /* FALLTHROUGH */ case VLNK: case VREG: case VSOCK: vp->v_op = tmpfs_vnodeop_p; break; default: panic("bad node type %d", vp->v_type); break; } vp->v_data = node; node->tn_vnode = vp; uvm_vnp_setsize(vp, node->tn_size); KASSERT(node->tn_mode != VNOVAL); cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true); } /* * tmpfs_loadvnode: initialise a vnode for a specified inode. */ int tmpfs_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { tmpfs_node_t *node; KASSERT(key_len == sizeof(node)); memcpy(&node, key, key_len); if (node->tn_links == 0) return ENOENT; tmpfs_init_vnode(vp, node); *new_key = &vp->v_data; return 0; } /* * tmpfs_newvnode: allocate a new inode of a specified type and * attach the vonode. */ int tmpfs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp, struct vattr *vap, kauth_cred_t cred, void *extra, size_t *key_len, const void **new_key) { tmpfs_mount_t *tmp = VFS_TO_TMPFS(mp); tmpfs_node_t *node, *dnode; if (dvp != NULL) { KASSERT(VOP_ISLOCKED(dvp)); dnode = VP_TO_TMPFS_DIR(dvp); if (dnode->tn_links == 0) return ENOENT; if (vap->va_type == VDIR) { /* Check for maximum links limit. */ if (dnode->tn_links == LINK_MAX) return EMLINK; KASSERT(dnode->tn_links < LINK_MAX); } } else dnode = NULL; node = tmpfs_node_get(tmp); if (node == NULL) return ENOSPC; /* Initially, no references and no associations. */ node->tn_links = 0; node->tn_vnode = NULL; node->tn_holdcount = 0; node->tn_dirent_hint = NULL; /* * XXX Where the pool is backed by a map larger than (4GB * * sizeof(*node)), this may produce duplicate inode numbers * for applications that do not understand 64-bit ino_t. */ node->tn_id = (ino_t)((uintptr_t)node / sizeof(*node)); /* * Make sure the generation number is not zero. * tmpfs_inactive() uses generation zero to mark dead nodes. */ do { node->tn_gen = TMPFS_NODE_GEN_MASK & cprng_fast32(); } while (node->tn_gen == 0); /* Generic initialization. */ KASSERT((int)vap->va_type != VNOVAL); node->tn_type = vap->va_type; node->tn_size = 0; node->tn_flags = 0; node->tn_lockf = NULL; node->tn_tflags = 0; vfs_timestamp(&node->tn_atime); node->tn_birthtime = node->tn_atime; node->tn_ctime = node->tn_atime; node->tn_mtime = node->tn_atime; mutex_init(&node->tn_timelock, MUTEX_DEFAULT, IPL_NONE); if (dvp == NULL) { KASSERT(vap->va_uid != VNOVAL && vap->va_gid != VNOVAL); node->tn_uid = vap->va_uid; node->tn_gid = vap->va_gid; vp->v_vflag |= VV_ROOT; } else { KASSERT(dnode != NULL); node->tn_uid = kauth_cred_geteuid(cred); node->tn_gid = dnode->tn_gid; } KASSERT(vap->va_mode != VNOVAL); node->tn_mode = vap->va_mode; /* Type-specific initialization. */ switch (node->tn_type) { case VBLK: case VCHR: /* Character/block special device. */ KASSERT(vap->va_rdev != VNOVAL); node->tn_spec.tn_dev.tn_rdev = vap->va_rdev; break; case VDIR: /* Directory. */ TAILQ_INIT(&node->tn_spec.tn_dir.tn_dir); node->tn_spec.tn_dir.tn_parent = NULL; node->tn_spec.tn_dir.tn_seq_arena = NULL; node->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START; node->tn_spec.tn_dir.tn_readdir_lastp = NULL; /* Extra link count for the virtual '.' entry. 
*/ node->tn_links++; break; case VFIFO: case VSOCK: break; case VLNK: node->tn_size = 0; node->tn_spec.tn_lnk.tn_link = NULL; break; case VREG: /* Regular file. Create an underlying UVM object. */ node->tn_spec.tn_reg.tn_aobj = uao_create(INT64_MAX - PAGE_SIZE, 0); node->tn_spec.tn_reg.tn_aobj_pages = 0; break; default: panic("bad node type %d", vp->v_type); break; } tmpfs_init_vnode(vp, node); mutex_enter(&tmp->tm_lock); LIST_INSERT_HEAD(&tmp->tm_nodes, node, tn_entries); mutex_exit(&tmp->tm_lock); *key_len = sizeof(vp->v_data); *new_key = &vp->v_data; return 0; } /* * tmpfs_free_node: remove the inode from a list in the mount point and * destroy the inode structures. */ void tmpfs_free_node(tmpfs_mount_t *tmp, tmpfs_node_t *node) { size_t objsz; uint32_t hold; mutex_enter(&tmp->tm_lock); hold = atomic_or_32_nv(&node->tn_holdcount, TMPFS_NODE_RECLAIMED); /* Defer destruction to last thread holding this node. */ if (hold != TMPFS_NODE_RECLAIMED) { mutex_exit(&tmp->tm_lock); return; } LIST_REMOVE(node, tn_entries); mutex_exit(&tmp->tm_lock); switch (node->tn_type) { case VLNK: if (node->tn_size > 0) { tmpfs_strname_free(tmp, node->tn_spec.tn_lnk.tn_link, node->tn_size); } break; case VREG: /* * Calculate the size of inode data, decrease the used-memory * counter, and destroy the unerlying UVM object (if any). */ objsz = PAGE_SIZE * node->tn_spec.tn_reg.tn_aobj_pages; if (objsz != 0) { tmpfs_mem_decr(tmp, objsz); } if (node->tn_spec.tn_reg.tn_aobj != NULL) { uao_detach(node->tn_spec.tn_reg.tn_aobj); } break; case VDIR: KASSERT(node->tn_size == 0); KASSERT(node->tn_spec.tn_dir.tn_seq_arena == NULL); KASSERT(TAILQ_EMPTY(&node->tn_spec.tn_dir.tn_dir)); KASSERT(node->tn_spec.tn_dir.tn_parent == NULL || node == tmp->tm_root); break; default: break; } KASSERT(node->tn_vnode == NULL); KASSERT(node->tn_links == 0); mutex_destroy(&node->tn_timelock); tmpfs_node_put(tmp, node); } /* * tmpfs_construct_node: allocate a new file of specified type and adds it * into the parent directory. * * => Credentials of the caller are used. */ int tmpfs_construct_node(vnode_t *dvp, vnode_t **vpp, struct vattr *vap, struct componentname *cnp, char *target) { tmpfs_mount_t *tmp = VFS_TO_TMPFS(dvp->v_mount); tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp), *node; tmpfs_dirent_t *de, *wde; char *slink = NULL; int ssize = 0; int error; /* Allocate symlink target. */ if (target != NULL) { KASSERT(vap->va_type == VLNK); ssize = strlen(target); KASSERT(ssize < MAXPATHLEN); if (ssize > 0) { slink = tmpfs_strname_alloc(tmp, ssize); if (slink == NULL) return ENOSPC; memcpy(slink, target, ssize); } } /* Allocate a directory entry that points to the new file. */ error = tmpfs_alloc_dirent(tmp, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error) { if (slink != NULL) tmpfs_strname_free(tmp, slink, ssize); return error; } /* Allocate a vnode that represents the new file. */ error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, vpp); if (error) { if (slink != NULL) tmpfs_strname_free(tmp, slink, ssize); tmpfs_free_dirent(tmp, de); return error; } error = vn_lock(*vpp, LK_EXCLUSIVE); if (error) { vrele(*vpp); *vpp = NULL; if (slink != NULL) tmpfs_strname_free(tmp, slink, ssize); tmpfs_free_dirent(tmp, de); return error; } node = VP_TO_TMPFS_NODE(*vpp); if (slink != NULL) { node->tn_spec.tn_lnk.tn_link = slink; node->tn_size = ssize; } /* Remove whiteout before adding the new entry. 
*/ if (cnp->cn_flags & ISWHITEOUT) { wde = tmpfs_dir_lookup(dnode, cnp); KASSERT(wde != NULL && wde->td_node == TMPFS_NODE_WHITEOUT); tmpfs_dir_detach(dnode, wde); tmpfs_free_dirent(tmp, wde); } /* Associate inode and attach the entry into the directory. */ tmpfs_dir_attach(dnode, de, node); /* Make node opaque if requested. */ if (cnp->cn_flags & ISWHITEOUT) node->tn_flags |= UF_OPAQUE; /* Update the parent's timestamps. */ tmpfs_update(dvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME); VOP_UNLOCK(*vpp); cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags); return 0; } /* * tmpfs_alloc_dirent: allocates a new directory entry for the inode. * The directory entry contains a path name component. */ int tmpfs_alloc_dirent(tmpfs_mount_t *tmp, const char *name, uint16_t len, tmpfs_dirent_t **de) { tmpfs_dirent_t *nde; nde = tmpfs_dirent_get(tmp); if (nde == NULL) return ENOSPC; nde->td_name = tmpfs_strname_alloc(tmp, len); if (nde->td_name == NULL) { tmpfs_dirent_put(tmp, nde); return ENOSPC; } nde->td_namelen = len; memcpy(nde->td_name, name, len); nde->td_seq = TMPFS_DIRSEQ_NONE; nde->td_node = NULL; /* for asserts */ *de = nde; return 0; } /* * tmpfs_free_dirent: free a directory entry. */ void tmpfs_free_dirent(tmpfs_mount_t *tmp, tmpfs_dirent_t *de) { KASSERT(de->td_node == NULL); KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE); tmpfs_strname_free(tmp, de->td_name, de->td_namelen); tmpfs_dirent_put(tmp, de); } /* * tmpfs_dir_attach: associate directory entry with a specified inode, * and attach the entry into the directory, specified by vnode. * * => Increases link count on the associated node. * => Increases link count on directory node if our node is VDIR. * => It is caller's responsibility to check for the LINK_MAX limit. * => Triggers kqueue events here. */ void tmpfs_dir_attach(tmpfs_node_t *dnode, tmpfs_dirent_t *de, tmpfs_node_t *node) { vnode_t *dvp = dnode->tn_vnode; int events = NOTE_WRITE; KASSERT(dvp != NULL); KASSERT(VOP_ISLOCKED(dvp)); /* Get a new sequence number. */ KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE); de->td_seq = tmpfs_dir_getseq(dnode, de); /* Associate directory entry and the inode. */ de->td_node = node; if (node != TMPFS_NODE_WHITEOUT) { KASSERT(node->tn_links < LINK_MAX); node->tn_links++; /* Save the hint (might overwrite). */ node->tn_dirent_hint = de; } else if ((dnode->tn_gen & TMPFS_WHITEOUT_BIT) == 0) { /* Flag that there are whiteout entries. */ atomic_or_32(&dnode->tn_gen, TMPFS_WHITEOUT_BIT); } /* Insert the entry to the directory (parent of inode). */ TAILQ_INSERT_TAIL(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries); KASSERT(dnode->tn_size <= __type_max(off_t) - sizeof(tmpfs_dirent_t)); dnode->tn_size += sizeof(tmpfs_dirent_t); uvm_vnp_setsize(dvp, dnode->tn_size); if (node != TMPFS_NODE_WHITEOUT && node->tn_type == VDIR) { /* Set parent. */ KASSERT(node->tn_spec.tn_dir.tn_parent == NULL); node->tn_spec.tn_dir.tn_parent = dnode; /* Increase the link count of parent. */ KASSERT(dnode->tn_links < LINK_MAX); dnode->tn_links++; events |= NOTE_LINK; TMPFS_VALIDATE_DIR(node); } } /* * tmpfs_dir_detach: disassociate directory entry and its inode, * and detach the entry from the directory, specified by vnode. * * => Decreases link count on the associated node. * => Decreases the link count on directory node, if our node is VDIR. * => Triggers kqueue events here. * * => Note: dvp and vp may be NULL only if called by tmpfs_unmount(). 
*/ void tmpfs_dir_detach(tmpfs_node_t *dnode, tmpfs_dirent_t *de) { tmpfs_node_t *node = de->td_node; vnode_t *dvp = dnode->tn_vnode; KASSERT(dvp == NULL || VOP_ISLOCKED(dvp)); if (__predict_true(node != TMPFS_NODE_WHITEOUT)) { /* Deassociate the inode and entry. */ node->tn_dirent_hint = NULL; KASSERT(node->tn_links > 0); node->tn_links--; /* If directory - decrease the link count of parent. */ if (node->tn_type == VDIR) { KASSERT(node->tn_spec.tn_dir.tn_parent == dnode); node->tn_spec.tn_dir.tn_parent = NULL; KASSERT(dnode->tn_links > 0); dnode->tn_links--; } } de->td_node = NULL; /* Remove the entry from the directory. */ if (dnode->tn_spec.tn_dir.tn_readdir_lastp == de) { dnode->tn_spec.tn_dir.tn_readdir_lastp = NULL; } TAILQ_REMOVE(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries); KASSERT(dnode->tn_size >= sizeof(tmpfs_dirent_t)); dnode->tn_size -= sizeof(tmpfs_dirent_t); tmpfs_dir_putseq(dnode, de); if (dvp) { uvm_vnp_setsize(dvp, dnode->tn_size); } } /* * tmpfs_dir_lookup: find a directory entry in the specified inode. * * Note that the . and .. components are not allowed as they do not * physically exist within directories. */ tmpfs_dirent_t * tmpfs_dir_lookup(tmpfs_node_t *node, struct componentname *cnp) { const char *name = cnp->cn_nameptr; const uint16_t nlen = cnp->cn_namelen; tmpfs_dirent_t *de; KASSERT(VOP_ISLOCKED(node->tn_vnode)); KASSERT(nlen != 1 || !(name[0] == '.')); KASSERT(nlen != 2 || !(name[0] == '.' && name[1] == '.')); TMPFS_VALIDATE_DIR(node); TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) { if (de->td_namelen != nlen) continue; if (memcmp(de->td_name, name, nlen) != 0) continue; break; } return de; } /* * tmpfs_dir_cached: get a cached directory entry if it is valid. Used to * avoid unnecessary tmpfs_dir_lookup(). * * => The vnode must be locked. */ tmpfs_dirent_t * tmpfs_dir_cached(tmpfs_node_t *node) { tmpfs_dirent_t *de = node->tn_dirent_hint; KASSERT(VOP_ISLOCKED(node->tn_vnode)); if (de == NULL) { return NULL; } KASSERT(de->td_node == node); /* * Directories always have a valid hint. For files, check if there * are any hard links. If there are - hint might be invalid. */ return (node->tn_type != VDIR && node->tn_links > 1) ? NULL : de; } /* * tmpfs_dir_getseq: get a per-directory sequence number for the entry. * * => Shall not be larger than 2^31 for linux32 compatibility. */ uint32_t tmpfs_dir_getseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de) { uint32_t seq = de->td_seq; vmem_t *seq_arena; vmem_addr_t off; int error __diagused; TMPFS_VALIDATE_DIR(dnode); if (__predict_true(seq != TMPFS_DIRSEQ_NONE)) { /* Already set. */ KASSERT(seq >= TMPFS_DIRSEQ_START); return seq; } /* * The "." and ".." and the end-of-directory have reserved numbers. * The other sequence numbers are allocated as following: * * - The first half of the 2^31 is assigned incrementally. * * - If that range is exceeded, then the second half of 2^31 * is used, but managed by vmem(9). */ seq = dnode->tn_spec.tn_dir.tn_next_seq; KASSERT(seq >= TMPFS_DIRSEQ_START); if (__predict_true(seq < TMPFS_DIRSEQ_END)) { /* First half: just increment and return. */ dnode->tn_spec.tn_dir.tn_next_seq++; return seq; } /* * First half exceeded, use the second half. May need to create * vmem(9) arena for the directory first. 
*/ if ((seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena) == NULL) { seq_arena = vmem_create("tmpfscoo", 0, TMPFS_DIRSEQ_END - 1, 1, NULL, NULL, NULL, 0, VM_SLEEP, IPL_NONE); dnode->tn_spec.tn_dir.tn_seq_arena = seq_arena; KASSERT(seq_arena != NULL); } error = vmem_alloc(seq_arena, 1, VM_SLEEP | VM_BESTFIT, &off); KASSERT(error == 0); KASSERT(off < TMPFS_DIRSEQ_END); seq = off | TMPFS_DIRSEQ_END; return seq; } static void tmpfs_dir_putseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de) { vmem_t *seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena; uint32_t seq = de->td_seq; TMPFS_VALIDATE_DIR(dnode); if (seq == TMPFS_DIRSEQ_NONE || seq < TMPFS_DIRSEQ_END) { /* First half (or no sequence number set yet). */ KASSERT(de->td_seq >= TMPFS_DIRSEQ_START); } else { /* Second half. */ KASSERT(seq_arena != NULL); KASSERT(seq >= TMPFS_DIRSEQ_END); seq &= ~TMPFS_DIRSEQ_END; vmem_free(seq_arena, seq, 1); } de->td_seq = TMPFS_DIRSEQ_NONE; /* Empty? We can reset. */ if (seq_arena && dnode->tn_size == 0) { dnode->tn_spec.tn_dir.tn_seq_arena = NULL; dnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START; vmem_destroy(seq_arena); } } /* * tmpfs_dir_lookupbyseq: lookup a directory entry by the sequence number. */ tmpfs_dirent_t * tmpfs_dir_lookupbyseq(tmpfs_node_t *node, off_t seq) { tmpfs_dirent_t *de = node->tn_spec.tn_dir.tn_readdir_lastp; TMPFS_VALIDATE_DIR(node); /* * First, check the cache. If does not match - perform a lookup. */ if (de && de->td_seq == seq) { KASSERT(de->td_seq >= TMPFS_DIRSEQ_START); KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE); return de; } TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) { KASSERT(de->td_seq >= TMPFS_DIRSEQ_START); KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE); if (de->td_seq == seq) return de; } return NULL; } /* * tmpfs_dir_getdotents: helper function for tmpfs_readdir() to get the * dot meta entries, that is, "." or "..". Copy it to the UIO space. */ static int tmpfs_dir_getdotents(tmpfs_node_t *node, struct dirent *dp, struct uio *uio) { tmpfs_dirent_t *de; off_t next = 0; int error; switch (uio->uio_offset) { case TMPFS_DIRSEQ_DOT: dp->d_fileno = node->tn_id; strlcpy(dp->d_name, ".", sizeof(dp->d_name)); next = TMPFS_DIRSEQ_DOTDOT; break; case TMPFS_DIRSEQ_DOTDOT: dp->d_fileno = node->tn_spec.tn_dir.tn_parent->tn_id; strlcpy(dp->d_name, "..", sizeof(dp->d_name)); de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir); next = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF; break; default: KASSERT(false); } dp->d_type = DT_DIR; dp->d_namlen = strlen(dp->d_name); dp->d_reclen = _DIRENT_SIZE(dp); if (dp->d_reclen > uio->uio_resid) { return EJUSTRETURN; } if ((error = uiomove(dp, dp->d_reclen, uio)) != 0) { return error; } uio->uio_offset = next; return error; } /* * tmpfs_dir_getdents: helper function for tmpfs_readdir. * * => Returns as much directory entries as can fit in the uio space. * => The read starts at uio->uio_offset. */ int tmpfs_dir_getdents(tmpfs_node_t *node, struct uio *uio, off_t *cntp) { tmpfs_dirent_t *de; struct dirent dent; int error = 0; KASSERT(VOP_ISLOCKED(node->tn_vnode)); TMPFS_VALIDATE_DIR(node); /* * First check for the "." and ".." cases. * Note: tmpfs_dir_getdotents() will "seek" for us. */ memset(&dent, 0, sizeof(dent)); if (uio->uio_offset == TMPFS_DIRSEQ_DOT) { if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) { goto done; } (*cntp)++; } if (uio->uio_offset == TMPFS_DIRSEQ_DOTDOT) { if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) { goto done; } (*cntp)++; } /* Done if we reached the end. 
*/ if (uio->uio_offset == TMPFS_DIRSEQ_EOF) { goto done; } /* Locate the directory entry given by the given sequence number. */ de = tmpfs_dir_lookupbyseq(node, uio->uio_offset); if (de == NULL) { error = EINVAL; goto done; } /* * Read as many entries as possible; i.e., until we reach the end * of the directory or we exhaust UIO space. */ do { if (de->td_node == TMPFS_NODE_WHITEOUT) { dent.d_fileno = 1; dent.d_type = DT_WHT; } else { dent.d_fileno = de->td_node->tn_id; dent.d_type = vtype2dt(de->td_node->tn_type); } dent.d_namlen = de->td_namelen; KASSERT(de->td_namelen < sizeof(dent.d_name)); memcpy(dent.d_name, de->td_name, de->td_namelen); dent.d_name[de->td_namelen] = '\0'; dent.d_reclen = _DIRENT_SIZE(&dent); if (dent.d_reclen > uio->uio_resid) { /* Exhausted UIO space. */ error = EJUSTRETURN; break; } /* Copy out the directory entry and continue. */ error = uiomove(&dent, dent.d_reclen, uio); if (error) { break; } (*cntp)++; de = TAILQ_NEXT(de, td_entries); } while (uio->uio_resid > 0 && de); /* Cache the last entry or clear and mark EOF. */ uio->uio_offset = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF; node->tn_spec.tn_dir.tn_readdir_lastp = de; done: tmpfs_update(node->tn_vnode, TMPFS_UPDATE_ATIME); if (error == EJUSTRETURN) { /* Exhausted UIO space - just return. */ error = 0; } KASSERT(error >= 0); return error; } /* * tmpfs_reg_resize: resize the underlying UVM object associated with the * specified regular file. */ int tmpfs_reg_resize(struct vnode *vp, off_t newsize) { tmpfs_mount_t *tmp = VFS_TO_TMPFS(vp->v_mount); tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj; size_t newpages, oldpages; off_t oldsize; KASSERT(vp->v_type == VREG); KASSERT(newsize >= 0); if (newsize > __type_max(off_t) - PAGE_SIZE + 1) return EFBIG; oldsize = node->tn_size; oldpages = round_page(oldsize) >> PAGE_SHIFT; newpages = round_page(newsize) >> PAGE_SHIFT; KASSERT(oldpages == node->tn_spec.tn_reg.tn_aobj_pages); if (newsize == oldsize) { return 0; } if (newpages > oldpages) { /* Increase the used-memory counter if getting extra pages. */ if (!tmpfs_mem_incr(tmp, (newpages - oldpages) << PAGE_SHIFT)) { return ENOSPC; } } else if (newsize < oldsize) { size_t zerolen; zerolen = MIN(round_page(newsize), node->tn_size) - newsize; ubc_zerorange(uobj, newsize, zerolen, UBC_VNODE_FLAGS(vp)); } node->tn_spec.tn_reg.tn_aobj_pages = newpages; node->tn_size = newsize; uvm_vnp_setsize(vp, newsize); /* * Free "backing store". */ if (newpages < oldpages) { rw_enter(uobj->vmobjlock, RW_WRITER); uao_dropswap_range(uobj, newpages, oldpages); rw_exit(uobj->vmobjlock); /* Decrease the used-memory counter. */ tmpfs_mem_decr(tmp, (oldpages - newpages) << PAGE_SHIFT); } return 0; } /* * tmpfs_chflags: change flags of the given vnode. */ int tmpfs_chflags(vnode_t *vp, int flags, kauth_cred_t cred, lwp_t *l) { tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); kauth_action_t action = KAUTH_VNODE_WRITE_FLAGS; int error; bool changing_sysflags = false; KASSERT(VOP_ISLOCKED(vp)); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* * If the new flags have non-user flags that are different than * those on the node, we need special permission to change them. */ if ((flags & SF_SETTABLE) != (node->tn_flags & SF_SETTABLE)) { action |= KAUTH_VNODE_WRITE_SYSFLAGS; changing_sysflags = true; } /* * Indicate that this node's flags have system attributes in them if * that's the case. 
*/ if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND)) { action |= KAUTH_VNODE_HAS_SYSFLAGS; } error = kauth_authorize_vnode(cred, action, vp, NULL, genfs_can_chflags(vp, cred, node->tn_uid, changing_sysflags)); if (error) return error; /* * Set the flags. If we're not setting non-user flags, be careful not * to overwrite them. * * XXX: Can't we always assign here? if the system flags are different, * the code above should catch attempts to change them without * proper permissions, and if we're here it means it's okay to * change them... */ if (!changing_sysflags) { /* Clear all user-settable flags and re-set them. */ node->tn_flags &= SF_SETTABLE; node->tn_flags |= (flags & UF_SETTABLE); } else { node->tn_flags = flags; } tmpfs_update(vp, TMPFS_UPDATE_CTIME); return 0; } /* * tmpfs_chmod: change access mode on the given vnode. */ int tmpfs_chmod(vnode_t *vp, mode_t mode, kauth_cred_t cred, lwp_t *l) { tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); int error; KASSERT(VOP_ISLOCKED(vp)); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp, NULL, genfs_can_chmod(vp, cred, node->tn_uid, node->tn_gid, mode)); if (error) { return error; } node->tn_mode = (mode & ALLPERMS); tmpfs_update(vp, TMPFS_UPDATE_CTIME); cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true); return 0; } /* * tmpfs_chown: change ownership of the given vnode. * * => At least one of uid or gid must be different than VNOVAL. * => Attribute is unchanged for VNOVAL case. */ int tmpfs_chown(vnode_t *vp, uid_t uid, gid_t gid, kauth_cred_t cred, lwp_t *l) { tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); int error; KASSERT(VOP_ISLOCKED(vp)); /* Assign default values if they are unknown. */ KASSERT(uid != VNOVAL || gid != VNOVAL); if (uid == VNOVAL) { uid = node->tn_uid; } if (gid == VNOVAL) { gid = node->tn_gid; } /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; error = kauth_authorize_vnode(cred, KAUTH_VNODE_CHANGE_OWNERSHIP, vp, NULL, genfs_can_chown(vp, cred, node->tn_uid, node->tn_gid, uid, gid)); if (error) { return error; } node->tn_uid = uid; node->tn_gid = gid; tmpfs_update(vp, TMPFS_UPDATE_CTIME); cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true); return 0; } /* * tmpfs_chsize: change size of the given vnode. */ int tmpfs_chsize(vnode_t *vp, u_quad_t size, kauth_cred_t cred, lwp_t *l) { tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); const off_t length = size; int error; KASSERT(VOP_ISLOCKED(vp)); /* Decide whether this is a valid operation based on the file type. */ switch (vp->v_type) { case VDIR: return EISDIR; case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) { return EROFS; } break; case VBLK: case VCHR: case VFIFO: /* * Allow modifications of special files even if in the file * system is mounted read-only (we are not modifying the * files themselves, but the objects they represent). */ return 0; default: return EOPNOTSUPP; } /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) { return EPERM; } if (length < 0) { return EINVAL; } /* Note: tmpfs_reg_resize() will raise NOTE_EXTEND and NOTE_ATTRIB. 
*/ if (node->tn_size != length && (error = tmpfs_reg_resize(vp, length)) != 0) { return error; } tmpfs_update(vp, TMPFS_UPDATE_CTIME | TMPFS_UPDATE_MTIME); return 0; } /* * tmpfs_chtimes: change access and modification times for vnode. */ int tmpfs_chtimes(vnode_t *vp, const struct timespec *atime, const struct timespec *mtime, const struct timespec *btime, int vaflags, kauth_cred_t cred, lwp_t *l) { tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); int error; KASSERT(VOP_ISLOCKED(vp)); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, NULL, genfs_can_chtimes(vp, cred, node->tn_uid, vaflags)); if (error) return error; mutex_enter(&node->tn_timelock); if (atime->tv_sec != VNOVAL) { atomic_and_uint(&node->tn_tflags, ~TMPFS_UPDATE_ATIME); node->tn_atime = *atime; } if (mtime->tv_sec != VNOVAL) { atomic_and_uint(&node->tn_tflags, ~TMPFS_UPDATE_MTIME); node->tn_mtime = *mtime; } if (btime->tv_sec != VNOVAL) { node->tn_birthtime = *btime; } mutex_exit(&node->tn_timelock); return 0; } /* * tmpfs_update_locked: update the timestamps as indicated by the flags. */ void tmpfs_update_locked(vnode_t *vp, unsigned tflags) { tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); struct timespec nowtm; KASSERT(mutex_owned(&node->tn_timelock)); if ((tflags |= atomic_swap_uint(&node->tn_tflags, 0)) == 0) { return; } vfs_timestamp(&nowtm); if (tflags & TMPFS_UPDATE_ATIME) { node->tn_atime = nowtm; } if (tflags & TMPFS_UPDATE_MTIME) { node->tn_mtime = nowtm; } if (tflags & TMPFS_UPDATE_CTIME) { node->tn_ctime = nowtm; } } /* * tmpfs_update: update the timestamps as indicated by the flags. */ void tmpfs_update(vnode_t *vp, unsigned tflags) { tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); if ((tflags | atomic_load_relaxed(&node->tn_tflags)) == 0) { return; } mutex_enter(&node->tn_timelock); tmpfs_update_locked(vp, tflags); mutex_exit(&node->tn_timelock); } /* * tmpfs_update_lazily: schedule a deferred timestamp update. */ void tmpfs_update_lazily(vnode_t *vp, unsigned tflags) { tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp); unsigned cur; cur = atomic_load_relaxed(&node->tn_tflags); if ((cur & tflags) != tflags) { atomic_or_uint(&node->tn_tflags, tflags); return; } }
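/*
 * Usage sketch (editor's illustration; not part of tmpfs_subr.c): how
 * the construction helpers above combine in a create-style vnode
 * operation.  In the real code, tmpfs_create() in tmpfs_vnops.c is
 * essentially a wrapper of this shape; example_tmpfs_create is a
 * made-up name used only to show the contract: dvp is locked by the
 * caller, and on success *vpp comes back referenced and unlocked, with
 * the directory entry attached and link counts updated inside
 * tmpfs_construct_node().
 */
static int
example_tmpfs_create(vnode_t *dvp, vnode_t **vpp, struct vattr *vap,
    struct componentname *cnp)
{
	KASSERT(VOP_ISLOCKED(dvp));
	KASSERT(vap->va_type == VREG || vap->va_type == VSOCK);

	/* Allocates the inode and the dirent, attaches both to dvp. */
	return tmpfs_construct_node(dvp, vpp, vap, cnp, NULL);
}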
/* $NetBSD: if_arp.h,v 1.43 2021/02/19 14:51:59 christos Exp $ */ /* * Copyright (c) 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_arp.h 8.1 (Berkeley) 6/10/93 */ #ifndef _NET_IF_ARP_H_ #define _NET_IF_ARP_H_ /* * Address Resolution Protocol. * * See RFC 826 for protocol description. ARP packets are variable * in size; the arphdr structure defines the fixed-length portion. * Protocol type values are the same as those for 10 Mb/s Ethernet. * It is followed by the variable-sized fields ar_sha, ar_spa, * ar_tha and ar_tpa in that order, according to the lengths * specified. Field names used correspond to RFC 826.
*/ struct arphdr { uint16_t ar_hrd; /* format of hardware address */ #define ARPHRD_ETHER 1 /* ethernet hardware format */ #define ARPHRD_IEEE802 6 /* IEEE 802 hardware format */ #define ARPHRD_ARCNET 7 /* ethernet hardware format */ #define ARPHRD_FRELAY 15 /* frame relay hardware format */ #define ARPHRD_STRIP 23 /* Ricochet Starmode Radio hardware format */ #define ARPHRD_IEEE1394 24 /* IEEE 1394 (FireWire) hardware format */ uint16_t ar_pro; /* format of protocol address */ uint8_t ar_hln; /* length of hardware address */ uint8_t ar_pln; /* length of protocol address */ uint16_t ar_op; /* one of: */ #define ARPOP_REQUEST 1 /* request to resolve address */ #define ARPOP_REPLY 2 /* response to previous request */ #define ARPOP_REVREQUEST 3 /* request protocol address given hardware */ #define ARPOP_REVREPLY 4 /* response giving protocol address */ #define ARPOP_INVREQUEST 8 /* request to identify peer */ #define ARPOP_INVREPLY 9 /* response identifying peer */ /* * The remaining fields are variable in size, * according to the sizes above. */ #ifdef COMMENT_ONLY uint8_t ar_sha[]; /* sender hardware address */ uint8_t ar_spa[]; /* sender protocol address */ uint8_t ar_tha[]; /* target hardware address (!IEEE1394) */ uint8_t ar_tpa[]; /* target protocol address */ #endif }; static __inline uint8_t * ar_data(struct arphdr *ap) { return (uint8_t *)(void *)(ap + 1); } static __inline uint8_t * ar_sha(struct arphdr *ap) { return ar_data(ap) + 0; } static __inline uint8_t * ar_spa(struct arphdr *ap) { return ar_data(ap) + ap->ar_hln; } static __inline uint8_t * ar_tha(struct arphdr *ap) { if (ntohs(ap->ar_hrd) == ARPHRD_IEEE1394) { return NULL; } else { return ar_data(ap) + ap->ar_hln + ap->ar_pln; } } static __inline uint8_t * ar_tpa(struct arphdr *ap) { if (ntohs(ap->ar_hrd) == ARPHRD_IEEE1394) { return ar_data(ap) + ap->ar_hln + ap->ar_pln; } else { return ar_data(ap) + ap->ar_hln + ap->ar_pln + ap->ar_hln; } } /* * ARP ioctl request */ struct arpreq { struct sockaddr arp_pa; /* protocol address */ struct sockaddr arp_ha; /* hardware address */ int arp_flags; /* flags */ }; /* arp_flags and at_flags field values */ #define ATF_INUSE 0x01 /* entry in use */ #define ATF_COM 0x02 /* completed entry (enaddr valid) */ #define ATF_PERM 0x04 /* permanent entry */ #define ATF_PUBL 0x08 /* publish entry (respond for other host) */ #define ATF_USETRAILERS 0x10 /* has requested trailers */ /* * Kernel statistics about arp */ #define ARP_STAT_SNDTOTAL 0 /* total packets sent */ #define ARP_STAT_SNDREPLY 1 /* replies sent */ #define ARP_STAT_SENDREQUEST 2 /* requests sent */ #define ARP_STAT_RCVTOTAL 3 /* total packets received */ #define ARP_STAT_RCVREQUEST 4 /* valid requests received */ #define ARP_STAT_RCVREPLY 5 /* replies received */ #define ARP_STAT_RCVMCAST 6 /* multicast/broadcast received */ #define ARP_STAT_RCVBADPROTO 7 /* unknown protocol type received */ #define ARP_STAT_RCVBADLEN 8 /* bad (short) length received */ #define ARP_STAT_RCVZEROTPA 9 /* received w/ null target ip */ #define ARP_STAT_RCVZEROSPA 10 /* received w/ null source ip */ #define ARP_STAT_RCVNOINT 11 /* couldn't map to interface */ #define ARP_STAT_RCVLOCALSHA 12 /* received from local hw address */ #define ARP_STAT_RCVBCASTSHA 13 /* received w/ broadcast src */ #define ARP_STAT_RCVLOCALSPA 14 /* received for a local ip [dup!] */ #define ARP_STAT_RCVOVERPERM 15 /* attempts to overwrite static info */ #define ARP_STAT_RCVOVERINT 16 /* attempts to overwrite wrong if */ #define ARP_STAT_RCVOVER 17 /* entries overwritten! 
*/ #define ARP_STAT_RCVLENCHG 18 /* changes in hw address len */ #define ARP_STAT_DFRTOTAL 19 /* deferred pending ARP resolution */ #define ARP_STAT_DFRSENT 20 /* deferred, then sent */ #define ARP_STAT_DFRDROPPED 21 /* deferred, then dropped */ #define ARP_STAT_ALLOCFAIL 22 /* failures to allocate llinfo */ #define ARP_NSTATS 23 void arp_stat_add(int, uint64_t); #endif /* !_NET_IF_ARP_H_ */
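/*
 * Editor's illustrative sketch (not part of if_arp.h): decoding the
 * fixed-length arphdr and its trailing variable-length fields with the
 * ar_spa()/ar_tpa() accessors declared above.  It assumes a NetBSD-like
 * userland where <net/if_arp.h> and those inline accessors are visible
 * to user code; the hand-built request below is invented for the example.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <net/if_arp.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	union {
		struct arphdr hdr;
		unsigned char buf[sizeof(struct arphdr) + 2 * (6 + 4)];
	} pkt;
	struct arphdr *ah = &pkt.hdr;
	uint8_t *p;

	memset(&pkt, 0, sizeof(pkt));
	ah->ar_hrd = htons(ARPHRD_ETHER);
	ah->ar_pro = htons(0x0800);		/* IPv4 */
	ah->ar_hln = 6;
	ah->ar_pln = 4;
	ah->ar_op  = htons(ARPOP_REQUEST);

	/* The variable part follows the header: sha, spa, tha, tpa. */
	p = ar_spa(ah);
	p[0] = 192; p[1] = 0; p[2] = 2; p[3] = 1;
	p = ar_tpa(ah);
	p[0] = 192; p[1] = 0; p[2] = 2; p[3] = 2;

	if (ntohs(ah->ar_op) == ARPOP_REQUEST)
		printf("who-has %d.%d.%d.%d tell %d.%d.%d.%d\n",
		    ar_tpa(ah)[0], ar_tpa(ah)[1], ar_tpa(ah)[2], ar_tpa(ah)[3],
		    ar_spa(ah)[0], ar_spa(ah)[1], ar_spa(ah)[2], ar_spa(ah)[3]);
	return 0;
}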
/* $NetBSD: if.h,v 1.305 2023/10/09 11:55:34 riastradh Exp $ */ /*- * Copyright (c) 1999, 2000, 2001 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by William Studenmund and Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.h 8.3 (Berkeley) 2/9/95 */ #ifndef _NET_IF_H_ #define _NET_IF_H_ #if !defined(_KERNEL) && !defined(_STANDALONE) #include <stdbool.h> #endif #include <sys/featuretest.h> /* * Length of interface external name, including terminating '\0'. * Note: this is the same size as a generic device's external name. */ #define IF_NAMESIZE 16 /* * Length of interface description, including terminating '\0'. */ #define IFDESCRSIZE 64 #if defined(_NETBSD_SOURCE) #include <sys/socket.h> #include <sys/queue.h> #include <sys/mutex.h> #include <sys/hook.h> #include <net/dlt.h> #include <net/pfil.h> #ifdef _KERNEL #include <net/pktqueue.h> #include <sys/pslist.h> #include <sys/pserialize.h> #include <sys/psref.h> #include <sys/module_hook.h> #endif /* * Always include ALTQ glue here -- we use the ALTQ interface queue * structure even when ALTQ is not configured into the kernel so that * the size of struct ifnet does not changed based on the option. The * ALTQ queue structure is API-compatible with the legacy ifqueue. */ #include <altq/if_altq.h> /* * Structures defining a network interface, providing a packet * transport mechanism (ala level 0 of the PUP protocols). * * Each interface accepts output datagrams of a specified maximum * length, and provides higher level routines with input datagrams * received from its medium. 
* * Output occurs when the routine if_output is called, with four parameters: * (*ifp->if_output)(ifp, m, dst, rt) * Here m is the mbuf chain to be sent and dst is the destination address. * The output routine encapsulates the supplied datagram if necessary, * and then transmits it on its medium. * * On input, each interface unwraps the data received by it, and either * places it on the input queue of a internetwork datagram routine * and posts the associated software interrupt, or passes the datagram to a raw * packet input routine. * * Routines exist for locating interfaces by their addresses * or for locating a interface on a certain network, as well as more general * routing and gateway routines maintaining information used to locate * interfaces. These routines live in the files if.c and route.c */ #include <sys/time.h> #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #include "opt_gateway.h" #endif struct mbuf; struct proc; struct rtentry; struct socket; struct ether_header; struct ifaddr; struct ifnet; struct rt_addrinfo; #define IFNAMSIZ IF_NAMESIZE /* * Structure describing a `cloning' interface. */ struct if_clone { LIST_ENTRY(if_clone) ifc_list; /* on list of cloners */ const char *ifc_name; /* name of device, e.g. `gif' */ size_t ifc_namelen; /* length of name */ int (*ifc_create)(struct if_clone *, int); int (*ifc_destroy)(struct ifnet *); }; #define IF_CLONE_INITIALIZER(name, create, destroy) \ { { NULL, NULL }, name, sizeof(name) - 1, create, destroy } /* * Structure used to query names of interface cloners. */ struct if_clonereq { int ifcr_total; /* total cloners (out) */ int ifcr_count; /* room for this many in user buffer */ char *ifcr_buffer; /* buffer for cloner names */ }; /* * Structure defining statistics and other data kept regarding a network * interface. * * Only used for exporting data from the interface. */ struct if_data { /* generic interface information */ u_char ifi_type; /* ethernet, tokenring, etc. */ u_char ifi_addrlen; /* media address length */ u_char ifi_hdrlen; /* media header length */ int ifi_link_state; /* current link state */ uint64_t ifi_mtu; /* maximum transmission unit */ uint64_t ifi_metric; /* routing metric (external only) */ uint64_t ifi_baudrate; /* linespeed */ /* volatile statistics */ uint64_t ifi_ipackets; /* packets received on interface */ uint64_t ifi_ierrors; /* input errors on interface */ uint64_t ifi_opackets; /* packets sent on interface */ uint64_t ifi_oerrors; /* output errors on interface */ uint64_t ifi_collisions; /* collisions on csma interfaces */ uint64_t ifi_ibytes; /* total number of octets received */ uint64_t ifi_obytes; /* total number of octets sent */ uint64_t ifi_imcasts; /* packets received via multicast */ uint64_t ifi_omcasts; /* packets sent via multicast */ uint64_t ifi_iqdrops; /* dropped on input, this interface */ uint64_t ifi_noproto; /* destined for unsupported protocol */ struct timespec ifi_lastchange;/* last operational state change */ }; /* * Values for if_link_state. */ #define LINK_STATE_UNKNOWN 0 /* link invalid/unknown */ #define LINK_STATE_DOWN 1 /* link is down */ #define LINK_STATE_UP 2 /* link is up */ /* * Status bit descriptions for the various interface types. 
*/ struct if_status_description { unsigned char ifs_type; unsigned char ifs_state; const char *ifs_string; }; #define LINK_STATE_DESC_MATCH(_ifs, _t, _s) \ (((_ifs)->ifs_type == (_t) || (_ifs)->ifs_type == 0) && \ (_ifs)->ifs_state == (_s)) #define LINK_STATE_DESCRIPTIONS { \ { IFT_ETHER, LINK_STATE_DOWN, "no carrier" }, \ { IFT_IEEE80211, LINK_STATE_DOWN, "no network" }, \ { IFT_PPP, LINK_STATE_DOWN, "no carrier" }, \ { IFT_CARP, LINK_STATE_DOWN, "backup" }, \ { IFT_CARP, LINK_STATE_UP, "master" }, \ { 0, LINK_STATE_UP, "active" }, \ { 0, LINK_STATE_UNKNOWN, "unknown" }, \ { 0, LINK_STATE_DOWN, "down" }, \ { 0, 0, NULL } \ } /* * Structure defining a queue for a network interface. */ struct ifqueue { struct mbuf *ifq_head; struct mbuf *ifq_tail; int ifq_len; int ifq_maxlen; uint64_t ifq_drops; kmutex_t *ifq_lock; }; #ifdef _KERNEL #include <sys/percpu.h> #include <sys/callout.h> #include <sys/rwlock.h> #include <sys/workqueue.h> #endif /* _KERNEL */ /* * Structure defining a queue for a network interface. * * (Would like to call this struct ``if'', but C isn't PL/1.) */ TAILQ_HEAD(ifnet_head, ifnet); /* the actual queue head */ struct bridge_softc; struct bridge_iflist; struct callout; struct krwlock; struct if_percpuq; struct if_deferred_start; struct in6_multi; typedef unsigned short if_index_t; /* * Interface. Field markings and the corresponding locks: * * i: IFNET_LOCK (a.k.a., if_ioctl_lock) * q: ifq_lock (struct ifaltq) * a: if_afdata_lock * 6: in6_multilock (global lock) * :: unlocked, stable * ?: unknown, maybe unsafe * * Lock order: IFNET_LOCK => in6_multilock => if_afdata_lock => ifq_lock * Note that currently if_afdata_lock and ifq_lock aren't held * at the same time, but define the order anyway. * * Lock order of IFNET_LOCK with other locks: * softnet_lock => solock => IFNET_LOCK => ND6_LOCK, in_multilock */ typedef struct ifnet { void *if_softc; /* :: lower-level data for this if */ /* DEPRECATED. Keep it to avoid breaking kvm(3) users */ TAILQ_ENTRY(ifnet) if_list; /* i: all struct ifnets are chained */ TAILQ_HEAD(, ifaddr) if_addrlist; /* i: linked list of addresses per if */ char if_xname[IFNAMSIZ]; /* :: external name (name + unit) */ int if_pcount; /* i: number of promiscuous listeners */ struct bpf_if *if_bpf; /* :: packet filter structure */ if_index_t if_index; /* :: numeric abbreviation for this if */ short if_timer; /* ?: time 'til if_slowtimo called */ unsigned short if_flags; /* i: up/down, broadcast, etc. */ short if_extflags; /* :: if_output MP-safe, etc. */ u_char if_type; /* :: ethernet, tokenring, etc. */ u_char if_addrlen; /* :: media address length */ u_char if_hdrlen; /* :: media header length */ /* XXX audit :? fields here. */ int if_link_state; /* :? current link state */ uint64_t if_mtu; /* :? maximum transmission unit */ uint64_t if_metric; /* :? routing metric (external only) */ uint64_t if_baudrate; /* :? linespeed */ struct timespec if_lastchange; /* :? last operational state change */ #ifdef _KERNEL percpu_t *if_stats; /* :: statistics */ #else void *if_stats; /* opaque to user-space */ #endif /* _KERNEL */ /* * Procedure handles. If you add more of these, don't forget the * corresponding NULL stub in if.c. 
*/ int (*if_output) /* :: output routine (enqueue) */ (struct ifnet *, struct mbuf *, const struct sockaddr *, const struct rtentry *); void (*_if_input) /* :: input routine (from h/w driver) */ (struct ifnet *, struct mbuf *); void (*if_start) /* :: initiate output routine */ (struct ifnet *); int (*if_transmit) /* :: output routine, must be MP-safe */ (struct ifnet *, struct mbuf *); int (*if_ioctl) /* :: ioctl routine */ (struct ifnet *, u_long, void *); int (*if_init) /* :: init routine */ (struct ifnet *); void (*if_stop) /* :: stop routine */ (struct ifnet *, int); void (*if_slowtimo) /* :: timer routine */ (struct ifnet *); #define if_watchdog if_slowtimo void (*if_drain) /* :: routine to release resources */ (struct ifnet *); void (*if_bpf_mtap) /* :: bpf routine */ (struct bpf_if *, struct mbuf *, u_int); struct ifaltq if_snd; /* q: output queue (includes altq) */ struct ifaddr *if_dl; /* i: identity of this interface. */ const struct sockaddr_dl *if_sadl; /* i: pointer to sockaddr_dl of if_dl */ /* * May be NULL. If not NULL, it is the address assigned * to the interface by the manufacturer, so it very likely * to be unique. It MUST NOT be deleted. It is highly * suitable for deriving the EUI64 for the interface. */ struct ifaddr *if_hwdl; /* i: h/w identity */ const uint8_t *if_broadcastaddr; /* :: linklevel broadcast bytestring */ struct bridge_softc *if_bridge; /* i: bridge glue */ struct bridge_iflist *if_bridgeif; /* i: shortcut to interface list entry */ int if_dlt; /* :: data link type (<net/dlt.h>) */ pfil_head_t * if_pfil; /* :: filtering point */ uint64_t if_capabilities; /* i: interface capabilities */ uint64_t if_capenable; /* i: capabilities enabled */ union { void * carp_s; /* carp structure (used by !carp ifs) */ struct ifnet *carp_d;/* ptr to carpdev (used by carp ifs) */ } if_carp_ptr; /* ?: */ #define if_carp if_carp_ptr.carp_s #define if_carpdev if_carp_ptr.carp_d /* * These are pre-computed based on an interfaces enabled * capabilities, for speed elsewhere. */ int if_csum_flags_tx; /* i: M_CSUM_* flags for Tx */ int if_csum_flags_rx; /* i: M_CSUM_* flags for Rx */ void *if_afdata[AF_MAX]; /* a: */ struct mowner *if_mowner; /* ?: who owns mbufs for this interface */ void *if_lagg; /* :: lagg or agr structure */ void *if_npf_private;/* ?: associated NPF context */ /* * pf specific data, used only when #if NPF > 0. */ void *if_pf_kif; /* ?: pf interface abstraction */ void *if_pf_groups; /* ?: pf interface groups */ /* * During an ifnet's lifetime, it has only one if_index, but * an if_index is not sufficient to identify an ifnet * because during the lifetime of the system, many ifnets may occupy a * given if_index. Let us tell different ifnets at the same * if_index apart by their if_index_gen, a unique number that each ifnet * is assigned when it if_attach()s. Now, the kernel can use the * pair (if_index, if_index_gen) as a weak reference to an ifnet. */ uint64_t if_index_gen; /* :: generation number for the ifnet * at if_index: if two ifnets' index * and generation number are both the * same, they are the same ifnet. 
*/ struct sysctllog *if_sysctl_log; /* :: */ int (*if_initaddr) /* :: */ (struct ifnet *, struct ifaddr *, bool); int (*if_setflags) /* :: */ (struct ifnet *, const u_short); kmutex_t *if_ioctl_lock; /* :: */ char *if_description; /* i: interface description */ #ifdef _KERNEL /* XXX kvm(3) */ struct if_slowtimo_data *if_slowtimo_data; /* :: */ struct krwlock *if_afdata_lock;/* :: */ struct if_percpuq *if_percpuq; /* :: we should remove it in the future */ struct work if_link_work; /* q: linkage on link state work queue */ uint16_t if_link_queue; /* q: masked link state change queue */ /* q: is link state work scheduled? */ bool if_link_scheduled; struct pslist_entry if_pslist_entry;/* i: */ struct psref_target if_psref; /* :: */ struct pslist_head if_addr_pslist; /* i: */ struct if_deferred_start *if_deferred_start; /* :: */ /* XXX should be protocol independent */ LIST_HEAD(, in6_multi) if_multiaddrs; /* 6: */ khook_list_t *if_linkstate_hooks; /* :: */ #endif } ifnet_t; #include <net/if_stats.h> #define if_name(ifp) ((ifp)->if_xname) #define IFF_UP 0x0001 /* interface is up */ #define IFF_BROADCAST 0x0002 /* broadcast address valid */ #define IFF_DEBUG 0x0004 /* turn on debugging */ #define IFF_LOOPBACK 0x0008 /* is a loopback net */ #define IFF_POINTOPOINT 0x0010 /* interface is point-to-point link */ #if 0 /* 0x0020 was IFF_NOTRAILERS */ #else /* * sys/compat/svr4 is remvoed on 19 Dec 2018. * And then, IFF_NOTRAILERS itself is removed by if.h:r1.268 on 5 Feb 2019. */ #define IFF_UNNUMBERED 0x0020 /* explicit unnumbered */ #endif #define IFF_RUNNING 0x0040 /* resources allocated */ #define IFF_NOARP 0x0080 /* no address resolution protocol */ #define IFF_PROMISC 0x0100 /* receive all packets */ #define IFF_ALLMULTI 0x0200 /* OBSOLETE -- DO NOT USE */ /* * IFF_ALLMULTI obsoleted on 2019-05-15 -- existing non-MP-safe drivers * can use it for themselves under IFNET_LOCK, but they should be * converted to use ETHER_F_ALLMULTI under ETHER_LOCK instead. For * compatibility with existing drivers, if_ethersubr and if_arcsubr * will set IFF_ALLMULTI according to other flags, but you should not * rely on this. */ #define IFF_OACTIVE 0x0400 /* transmission in progress */ #define IFF_SIMPLEX 0x0800 /* can't hear own transmissions */ #define IFF_LINK0 0x1000 /* per link layer defined bit */ #define IFF_LINK1 0x2000 /* per link layer defined bit */ #define IFF_LINK2 0x4000 /* per link layer defined bit */ #define IFF_MULTICAST 0x8000 /* supports multicast */ #define IFEF_MPSAFE __BIT(0) /* handlers can run in parallel (see below) */ /* * The guidelines for converting an interface to IFEF_MPSAFE are as follows * * Enabling IFEF_MPSAFE on an interface suppresses taking KERNEL_LOCK when * calling the following handlers: * - if_start * - Note that if_transmit is always called without KERNEL_LOCK * - if_output * - if_ioctl * - if_init * - if_stop * * This means that an interface with IFEF_MPSAFE must make the above handlers * MP-safe or take KERNEL_LOCK by itself inside handlers that aren't MP-safe * yet. 
* * There are some additional restrictions to access member variables of struct * ifnet: * - if_flags * - Must be updated with holding IFNET_LOCK * - You cannot use the flag in Tx/Rx paths anymore because there is no * synchronization on the flag except for IFNET_LOCK * - Note that IFNET_LOCK can't be taken in softint because it's known * that it causes a deadlock * - Some synchronization mechanisms such as pserialize_perform are called * with IFNET_LOCK and also require context switches on every CPUs * that mean softints finish so trying to take IFNET_LOCK in softint * might block on IFNET_LOCK and prevent such synchronization mechanisms * from being completed * - Currently the deadlock occurs only if NET_MPSAFE is enabled, however, * we should deal with the restriction because NET_MPSAFE will be enabled * by default in the future * - if_watchdog and if_timer * - The watchdog framework works only for non-IFEF_MPSAFE interfaces * that rely on KERNEL_LOCK * - Interfaces with IFEF_MPSAFE have to provide its own watchdog mechanism * if needed * - Keep if_watchdog NULL when calling if_attach */ #ifdef _KERNEL static __inline bool if_is_mpsafe(struct ifnet *ifp) { return ((ifp->if_extflags & IFEF_MPSAFE) != 0); } static __inline int if_output_lock(struct ifnet *cifp, struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, const struct rtentry *rt) { if (if_is_mpsafe(cifp)) { return (*cifp->if_output)(ifp, m, dst, rt); } else { int ret; KERNEL_LOCK(1, NULL); ret = (*cifp->if_output)(ifp, m, dst, rt); KERNEL_UNLOCK_ONE(NULL); return ret; } } static __inline void if_start_lock(struct ifnet *ifp) { if (if_is_mpsafe(ifp)) { (*ifp->if_start)(ifp); } else { KERNEL_LOCK(1, NULL); (*ifp->if_start)(ifp); KERNEL_UNLOCK_ONE(NULL); } } #define KERNEL_LOCK_IF_IFP_MPSAFE(ifp) \ do { if (if_is_mpsafe(ifp)) { KERNEL_LOCK(1, NULL); } } while (0) #define KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp) \ do { if (if_is_mpsafe(ifp)) { KERNEL_UNLOCK_ONE(NULL); } } while (0) #define KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp) \ do { if (!if_is_mpsafe(ifp)) { KERNEL_LOCK(1, NULL); } } while (0) #define KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp) \ do { if (!if_is_mpsafe(ifp)) { KERNEL_UNLOCK_ONE(NULL); } } while (0) #ifdef _KERNEL_OPT #include "opt_net_mpsafe.h" #endif /* XXX explore a better place to define */ #ifdef NET_MPSAFE #define KERNEL_LOCK_UNLESS_NET_MPSAFE() do { } while (0) #define KERNEL_UNLOCK_UNLESS_NET_MPSAFE() do { } while (0) #define SOFTNET_LOCK_UNLESS_NET_MPSAFE() do { } while (0) #define SOFTNET_UNLOCK_UNLESS_NET_MPSAFE() do { } while (0) #define SOFTNET_LOCK_IF_NET_MPSAFE() \ do { mutex_enter(softnet_lock); } while (0) #define SOFTNET_UNLOCK_IF_NET_MPSAFE() \ do { mutex_exit(softnet_lock); } while (0) #else /* NET_MPSAFE */ #define KERNEL_LOCK_UNLESS_NET_MPSAFE() \ do { KERNEL_LOCK(1, NULL); } while (0) #define KERNEL_UNLOCK_UNLESS_NET_MPSAFE() \ do { KERNEL_UNLOCK_ONE(NULL); } while (0) #define SOFTNET_LOCK_UNLESS_NET_MPSAFE() \ do { mutex_enter(softnet_lock); } while (0) #define SOFTNET_UNLOCK_UNLESS_NET_MPSAFE() \ do { mutex_exit(softnet_lock); } while (0) #define SOFTNET_LOCK_IF_NET_MPSAFE() do { } while (0) #define SOFTNET_UNLOCK_IF_NET_MPSAFE() do { } while (0) #endif /* NET_MPSAFE */ #define SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE() \ do { \ SOFTNET_LOCK_UNLESS_NET_MPSAFE(); \ KERNEL_LOCK_UNLESS_NET_MPSAFE(); \ } while (0) #define SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE() \ do { \ KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); \ SOFTNET_UNLOCK_UNLESS_NET_MPSAFE(); \ } while (0) #endif /* _KERNEL */ #define IFFBITS \ 
"\020\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5POINTOPOINT\6UNNUMBERED" \ "\7RUNNING\10NOARP\11PROMISC\12ALLMULTI\13OACTIVE\14SIMPLEX" \ "\15LINK0\16LINK1\17LINK2\20MULTICAST" /* flags set internally only: */ #define IFF_CANTCHANGE \ (IFF_BROADCAST|IFF_POINTOPOINT|IFF_RUNNING|IFF_OACTIVE|\ IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_PROMISC) /* * Some convenience macros used for setting ifi_baudrate. */ #define IF_Kbps(x) ((x) * 1000ULL) /* kilobits/sec. */ #define IF_Mbps(x) (IF_Kbps((x) * 1000ULL)) /* megabits/sec. */ #define IF_Gbps(x) (IF_Mbps((x) * 1000ULL)) /* gigabits/sec. */ /* Capabilities that interfaces can advertise. */ /* 0x01 .. 0x40 were previously used */ #define IFCAP_TSOv4 0x00080 /* can do TCPv4 segmentation offload */ #define IFCAP_CSUM_IPv4_Rx 0x00100 /* can do IPv4 header checksums (Rx) */ #define IFCAP_CSUM_IPv4_Tx 0x00200 /* can do IPv4 header checksums (Tx) */ #define IFCAP_CSUM_TCPv4_Rx 0x00400 /* can do IPv4/TCP checksums (Rx) */ #define IFCAP_CSUM_TCPv4_Tx 0x00800 /* can do IPv4/TCP checksums (Tx) */ #define IFCAP_CSUM_UDPv4_Rx 0x01000 /* can do IPv4/UDP checksums (Rx) */ #define IFCAP_CSUM_UDPv4_Tx 0x02000 /* can do IPv4/UDP checksums (Tx) */ #define IFCAP_CSUM_TCPv6_Rx 0x04000 /* can do IPv6/TCP checksums (Rx) */ #define IFCAP_CSUM_TCPv6_Tx 0x08000 /* can do IPv6/TCP checksums (Tx) */ #define IFCAP_CSUM_UDPv6_Rx 0x10000 /* can do IPv6/UDP checksums (Rx) */ #define IFCAP_CSUM_UDPv6_Tx 0x20000 /* can do IPv6/UDP checksums (Tx) */ #define IFCAP_TSOv6 0x40000 /* can do TCPv6 segmentation offload */ #define IFCAP_LRO 0x80000 /* can do Large Receive Offload */ #define IFCAP_MASK 0xfff80 /* currently valid capabilities */ #define IFCAPBITS \ "\020" \ "\10TSO4" \ "\11IP4CSUM_Rx" \ "\12IP4CSUM_Tx" \ "\13TCP4CSUM_Rx" \ "\14TCP4CSUM_Tx" \ "\15UDP4CSUM_Rx" \ "\16UDP4CSUM_Tx" \ "\17TCP6CSUM_Rx" \ "\20TCP6CSUM_Tx" \ "\21UDP6CSUM_Rx" \ "\22UDP6CSUM_Tx" \ "\23TSO6" \ "\24LRO" \ #define IF_AFDATA_LOCK_INIT(ifp) \ do {(ifp)->if_afdata_lock = rw_obj_alloc();} while (0) #define IF_AFDATA_LOCK_DESTROY(ifp) rw_obj_free((ifp)->if_afdata_lock) #define IF_AFDATA_WLOCK(ifp) rw_enter((ifp)->if_afdata_lock, RW_WRITER) #define IF_AFDATA_RLOCK(ifp) rw_enter((ifp)->if_afdata_lock, RW_READER) #define IF_AFDATA_WUNLOCK(ifp) rw_exit((ifp)->if_afdata_lock) #define IF_AFDATA_RUNLOCK(ifp) rw_exit((ifp)->if_afdata_lock) #define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp) #define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp) #define IF_AFDATA_TRYLOCK(ifp) rw_tryenter((ifp)->if_afdata_lock, RW_WRITER) #define IF_AFDATA_LOCK_ASSERT(ifp) \ KASSERT(rw_lock_held((ifp)->if_afdata_lock)) #define IF_AFDATA_RLOCK_ASSERT(ifp) \ KASSERT(rw_read_held((ifp)->if_afdata_lock)) #define IF_AFDATA_WLOCK_ASSERT(ifp) \ KASSERT(rw_write_held((ifp)->if_afdata_lock)) /* * Output queues (ifp->if_snd) and internetwork datagram level (pup level 1) * input routines have queues of messages stored on ifqueue structures * (defined above). Entries are added to and deleted from these structures * by these macros, which should be called with ipl raised to splnet(). 
*/ #define IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen) #define IF_DROP(ifq) ((ifq)->ifq_drops++) #define IF_ENQUEUE(ifq, m) do { \ (m)->m_nextpkt = 0; \ if ((ifq)->ifq_tail == 0) \ (ifq)->ifq_head = m; \ else \ (ifq)->ifq_tail->m_nextpkt = m; \ (ifq)->ifq_tail = m; \ (ifq)->ifq_len++; \ } while (/*CONSTCOND*/0) #define IF_PREPEND(ifq, m) do { \ (m)->m_nextpkt = (ifq)->ifq_head; \ if ((ifq)->ifq_tail == 0) \ (ifq)->ifq_tail = (m); \ (ifq)->ifq_head = (m); \ (ifq)->ifq_len++; \ } while (/*CONSTCOND*/0) #define IF_DEQUEUE(ifq, m) do { \ (m) = (ifq)->ifq_head; \ if (m) { \ if (((ifq)->ifq_head = (m)->m_nextpkt) == 0) \ (ifq)->ifq_tail = 0; \ (m)->m_nextpkt = 0; \ (ifq)->ifq_len--; \ } \ } while (/*CONSTCOND*/0) #define IF_POLL(ifq, m) ((m) = (ifq)->ifq_head) #define IF_PURGE(ifq) \ do { \ struct mbuf *__m0; \ \ for (;;) { \ IF_DEQUEUE((ifq), __m0); \ if (__m0 == NULL) \ break; \ else \ m_freem(__m0); \ } \ } while (/*CONSTCOND*/ 0) #define IF_IS_EMPTY(ifq) ((ifq)->ifq_len == 0) #ifndef IFQ_MAXLEN #define IFQ_MAXLEN 256 #endif #define IFNET_SLOWHZ 1 /* granularity is 1 second */ /* * Structure defining statistics and other data kept regarding an address * on a network interface. */ struct ifaddr_data { int64_t ifad_inbytes; int64_t ifad_outbytes; }; /* * The ifaddr structure contains information about one address * of an interface. They are maintained by the different address families, * are allocated and attached when an address is set, and are linked * together so all addresses for an interface can be located. */ struct ifaddr { struct sockaddr *ifa_addr; /* address of interface */ struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ #define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ struct sockaddr *ifa_netmask; /* used to determine subnet */ struct ifnet *ifa_ifp; /* back-pointer to interface */ TAILQ_ENTRY(ifaddr) ifa_list; /* list of addresses for interface */ struct ifaddr_data ifa_data; /* statistics on the address */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ (int, struct rtentry *, const struct rt_addrinfo *); u_int ifa_flags; /* mostly rt_flags for cloning */ int ifa_refcnt; /* count of references */ int ifa_metric; /* cost of going out this interface */ struct ifaddr *(*ifa_getifa)(struct ifaddr *, const struct sockaddr *); uint32_t *ifa_seqno; int16_t ifa_preference; /* preference level for this address */ #ifdef _KERNEL struct pslist_entry ifa_pslist_entry; struct psref_target ifa_psref; #endif }; #define IFA_ROUTE RTF_UP /* (0x01) route installed */ #define IFA_DESTROYING 0x2 /* * Message format for use in obtaining information about interfaces from * sysctl and the routing socket. We need to force 64-bit alignment if we * aren't using compatibility definitions. */ #if !defined(_KERNEL) || !defined(COMPAT_RTSOCK) #define __align64 __aligned(sizeof(uint64_t)) #else #define __align64 #endif struct if_msghdr { u_short ifm_msglen __align64; /* to skip over non-understood messages */ u_char ifm_version; /* future binary compatibility */ u_char ifm_type; /* message type */ int ifm_addrs; /* like rtm_addrs */ int ifm_flags; /* value of if_flags */ u_short ifm_index; /* index for associated ifp */ struct if_data ifm_data __align64; /* statistics and other data about if */ }; /* * Message format for use in obtaining information about interface addresses * from sysctl and the routing socket. 
*/ struct ifa_msghdr { u_short ifam_msglen __align64; /* to skip over non-understood messages */ u_char ifam_version; /* future binary compatibility */ u_char ifam_type; /* message type */ u_short ifam_index; /* index for associated ifp */ int ifam_flags; /* value of ifa_flags */ int ifam_addrs; /* like rtm_addrs */ pid_t ifam_pid; /* identify sender */ int ifam_addrflags; /* family specific address flags */ int ifam_metric; /* value of ifa_metric */ }; /* * Message format announcing the arrival or departure of a network interface. */ struct if_announcemsghdr { u_short ifan_msglen __align64; /* to skip over non-understood messages */ u_char ifan_version; /* future binary compatibility */ u_char ifan_type; /* message type */ u_short ifan_index; /* index for associated ifp */ char ifan_name[IFNAMSIZ]; /* if name, e.g. "en0" */ u_short ifan_what; /* what type of announcement */ }; #define IFAN_ARRIVAL 0 /* interface arrival */ #define IFAN_DEPARTURE 1 /* interface departure */ #undef __align64 /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct sockaddr_storage ifru_space; short ifru_flags; int ifru_addrflags; int ifru_metric; int ifru_mtu; int ifru_dlt; u_int ifru_value; void * ifru_data; struct { uint32_t b_buflen; void *b_buf; } ifru_b; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ #define ifr_broadaddr ifr_ifru.ifru_broadaddr /* broadcast address */ #define ifr_space ifr_ifru.ifru_space /* sockaddr_storage */ #define ifr_flags ifr_ifru.ifru_flags /* flags */ #define ifr_addrflags ifr_ifru.ifru_addrflags /* addr flags */ #define ifr_metric ifr_ifru.ifru_metric /* metric */ #define ifr_mtu ifr_ifru.ifru_mtu /* mtu */ #define ifr_dlt ifr_ifru.ifru_dlt /* data link type (DLT_*) */ #define ifr_value ifr_ifru.ifru_value /* generic value */ #define ifr_media ifr_ifru.ifru_metric /* media options (overload) */ #define ifr_data ifr_ifru.ifru_data /* for use by interface * XXX deprecated */ #define ifr_buf ifr_ifru.ifru_b.b_buf /* new interface ioctls */ #define ifr_buflen ifr_ifru.ifru_b.b_buflen #define ifr_index ifr_ifru.ifru_value /* interface index, BSD */ #define ifr_ifindex ifr_index /* interface index, linux */ }; #ifdef _KERNEL #define ifreq_setdstaddr ifreq_setaddr #define ifreq_setbroadaddr ifreq_setaddr #define ifreq_getdstaddr ifreq_getaddr #define ifreq_getbroadaddr ifreq_getaddr static __inline const struct sockaddr * /*ARGSUSED*/ ifreq_getaddr(u_long cmd, const struct ifreq *ifr) { return &ifr->ifr_addr; } #endif /* _KERNEL */ struct ifcapreq { char ifcr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ uint64_t ifcr_capabilities; /* supported capabiliites */ uint64_t ifcr_capenable; /* capabilities enabled */ }; struct ifaliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr ifra_addr; struct sockaddr ifra_dstaddr; #define ifra_broadaddr ifra_dstaddr struct sockaddr ifra_mask; }; struct ifdatareq { char ifdr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct if_data ifdr_data; }; struct ifmediareq { char ifm_name[IFNAMSIZ]; /* if name, e.g. 
"en0" */ int ifm_current; /* IFMWD: current media options */ int ifm_mask; /* IFMWD: don't care mask */ int ifm_status; /* media status */ int ifm_active; /* IFMWD: active options */ int ifm_count; /* # entries in ifm_ulist array */ int *ifm_ulist; /* array of ifmedia word */ }; struct ifdrv { char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */ unsigned long ifd_cmd; size_t ifd_len; void *ifd_data; }; #define IFLINKSTR_QUERYLEN 0x01 #define IFLINKSTR_UNSET 0x02 /* * Structure used in SIOCGIFCONF request. * Used to retrieve interface configuration * for machine (useful for programs which * must know all networks accessible). */ struct ifconf { int ifc_len; /* size of associated buffer */ union { void * ifcu_buf; struct ifreq *ifcu_req; } ifc_ifcu; #define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */ #define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */ }; /* * Structure for SIOC[AGD]LIFADDR */ struct if_laddrreq { char iflr_name[IFNAMSIZ]; unsigned int flags; #define IFLR_PREFIX 0x8000 /* in: prefix given out: kernel fills id */ #define IFLR_ACTIVE 0x4000 /* in/out: link-layer address activation */ #define IFLR_FACTORY 0x2000 /* in/out: factory link-layer address */ unsigned int prefixlen; /* in/out */ struct sockaddr_storage addr; /* in/out */ struct sockaddr_storage dstaddr; /* out */ }; /* * Structure for SIOC[SG]IFADDRPREF */ struct if_addrprefreq { char ifap_name[IFNAMSIZ]; int16_t ifap_preference; /* in/out */ struct sockaddr_storage ifap_addr; /* in/out */ }; #include <net/if_arp.h> #endif /* _NETBSD_SOURCE */ #ifdef _KERNEL #ifdef ALTQ #define IFQ_ENQUEUE(ifq, m, err) \ do { \ mutex_enter((ifq)->ifq_lock); \ if (ALTQ_IS_ENABLED(ifq)) \ ALTQ_ENQUEUE((ifq), (m), (err)); \ else { \ if (IF_QFULL(ifq)) { \ m_freem(m); \ (err) = ENOBUFS; \ } else { \ IF_ENQUEUE((ifq), (m)); \ (err) = 0; \ } \ } \ if ((err)) \ (ifq)->ifq_drops++; \ mutex_exit((ifq)->ifq_lock); \ } while (/*CONSTCOND*/ 0) #define IFQ_DEQUEUE(ifq, m) \ do { \ mutex_enter((ifq)->ifq_lock); \ if (TBR_IS_ENABLED(ifq)) \ (m) = tbr_dequeue((ifq), ALTDQ_REMOVE); \ else if (ALTQ_IS_ENABLED(ifq)) \ ALTQ_DEQUEUE((ifq), (m)); \ else \ IF_DEQUEUE((ifq), (m)); \ mutex_exit((ifq)->ifq_lock); \ } while (/*CONSTCOND*/ 0) #define IFQ_POLL(ifq, m) \ do { \ mutex_enter((ifq)->ifq_lock); \ if (TBR_IS_ENABLED(ifq)) \ (m) = tbr_dequeue((ifq), ALTDQ_POLL); \ else if (ALTQ_IS_ENABLED(ifq)) \ ALTQ_POLL((ifq), (m)); \ else \ IF_POLL((ifq), (m)); \ mutex_exit((ifq)->ifq_lock); \ } while (/*CONSTCOND*/ 0) #define IFQ_PURGE(ifq) \ do { \ mutex_enter((ifq)->ifq_lock); \ if (ALTQ_IS_ENABLED(ifq)) \ ALTQ_PURGE(ifq); \ else \ IF_PURGE(ifq); \ mutex_exit((ifq)->ifq_lock); \ } while (/*CONSTCOND*/ 0) #define IFQ_SET_READY(ifq) \ do { \ (ifq)->altq_flags |= ALTQF_READY; \ } while (/*CONSTCOND*/ 0) #define IFQ_CLASSIFY(ifq, m, af) \ do { \ KASSERT(((m)->m_flags & M_PKTHDR) != 0); \ mutex_enter((ifq)->ifq_lock); \ if (ALTQ_IS_ENABLED(ifq)) { \ if (ALTQ_NEEDS_CLASSIFY(ifq)) \ (m)->m_pkthdr.pattr_class = (*(ifq)->altq_classify) \ ((ifq)->altq_clfier, (m), (af)); \ (m)->m_pkthdr.pattr_af = (af); \ (m)->m_pkthdr.pattr_hdr = mtod((m), void *); \ } \ mutex_exit((ifq)->ifq_lock); \ } while (/*CONSTCOND*/ 0) #else /* ! 
ALTQ */ #define IFQ_ENQUEUE(ifq, m, err) \ do { \ mutex_enter((ifq)->ifq_lock); \ if (IF_QFULL(ifq)) { \ m_freem(m); \ (err) = ENOBUFS; \ } else { \ IF_ENQUEUE((ifq), (m)); \ (err) = 0; \ } \ if (err) \ (ifq)->ifq_drops++; \ mutex_exit((ifq)->ifq_lock); \ } while (/*CONSTCOND*/ 0) #define IFQ_DEQUEUE(ifq, m) \ do { \ mutex_enter((ifq)->ifq_lock); \ IF_DEQUEUE((ifq), (m)); \ mutex_exit((ifq)->ifq_lock); \ } while (/*CONSTCOND*/ 0) #define IFQ_POLL(ifq, m) \ do { \ mutex_enter((ifq)->ifq_lock); \ IF_POLL((ifq), (m)); \ mutex_exit((ifq)->ifq_lock); \ } while (/*CONSTCOND*/ 0) #define IFQ_PURGE(ifq) \ do { \ mutex_enter((ifq)->ifq_lock); \ IF_PURGE(ifq); \ mutex_exit((ifq)->ifq_lock); \ } while (/*CONSTCOND*/ 0) #define IFQ_SET_READY(ifq) /* nothing */ #define IFQ_CLASSIFY(ifq, m, af) /* nothing */ #endif /* ALTQ */ #define IFQ_LOCK_INIT(ifq) (ifq)->ifq_lock = \ mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET) #define IFQ_LOCK_DESTROY(ifq) mutex_obj_free((ifq)->ifq_lock) #define IFQ_LOCK(ifq) mutex_enter((ifq)->ifq_lock) #define IFQ_UNLOCK(ifq) mutex_exit((ifq)->ifq_lock) #define IFQ_IS_EMPTY(ifq) IF_IS_EMPTY(ifq) #define IFQ_INC_LEN(ifq) ((ifq)->ifq_len++) #define IFQ_DEC_LEN(ifq) (--(ifq)->ifq_len) #define IFQ_INC_DROPS(ifq) ((ifq)->ifq_drops++) #define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len)) #include <sys/mallocvar.h> MALLOC_DECLARE(M_IFADDR); MALLOC_DECLARE(M_IFMADDR); int ifreq_setaddr(u_long, struct ifreq *, const struct sockaddr *); struct ifnet *if_alloc(u_char); void if_free(struct ifnet *); void if_initname(struct ifnet *, const char *, int); struct ifaddr *if_dl_create(const struct ifnet *, const struct sockaddr_dl **); void if_activate_sadl(struct ifnet *, struct ifaddr *, const struct sockaddr_dl *); void if_set_sadl(struct ifnet *, const void *, u_char, bool); void if_alloc_sadl(struct ifnet *); void if_free_sadl(struct ifnet *, int); void if_initialize(struct ifnet *); void if_register(struct ifnet *); void if_attach(struct ifnet *); /* Deprecated. 
Use if_initialize and if_register */ void if_attachdomain(void); void if_deactivate(struct ifnet *); bool if_is_deactivated(const struct ifnet *); void if_export_if_data(struct ifnet *, struct if_data *, bool); void if_purgeaddrs(struct ifnet *, int, void (*)(struct ifaddr *)); void if_detach(struct ifnet *); void if_down(struct ifnet *); void if_down_locked(struct ifnet *); void if_link_state_change(struct ifnet *, int); void if_domain_link_state_change(struct ifnet *, int); void if_up(struct ifnet *); void ifinit(void); void ifinit1(void); void ifinit_post(void); int ifaddrpref_ioctl(struct socket *, u_long, void *, struct ifnet *); extern int (*ifioctl)(struct socket *, u_long, void *, struct lwp *); int ifioctl_common(struct ifnet *, u_long, void *); int ifpromisc(struct ifnet *, int); int ifpromisc_locked(struct ifnet *, int); int if_addr_init(ifnet_t *, struct ifaddr *, bool); int if_do_dad(struct ifnet *); int if_mcast_op(ifnet_t *, const unsigned long, const struct sockaddr *); int if_flags_set(struct ifnet *, const u_short); int if_clone_list(int, char *, int *); int if_ioctl(struct ifnet *, u_long, void *); int if_init(struct ifnet *); void if_stop(struct ifnet *, int); struct ifnet *ifunit(const char *); struct ifnet *if_get(const char *, struct psref *); ifnet_t *if_byindex(u_int); ifnet_t *_if_byindex(u_int); ifnet_t *if_get_byindex(u_int, struct psref *); ifnet_t *if_get_bylla(const void *, unsigned char, struct psref *); void if_put(const struct ifnet *, struct psref *); void if_acquire(struct ifnet *, struct psref *); #define if_release if_put int if_tunnel_check_nesting(struct ifnet *, struct mbuf *, int); percpu_t *if_tunnel_alloc_ro_percpu(void); void if_tunnel_free_ro_percpu(percpu_t *); void if_tunnel_ro_percpu_rtcache_free(percpu_t *); struct tunnel_ro { struct route *tr_ro; kmutex_t *tr_lock; }; static inline void if_tunnel_get_ro(percpu_t *ro_percpu, struct route **ro, kmutex_t **lock) { struct tunnel_ro *tro; tro = percpu_getref(ro_percpu); *ro = tro->tr_ro; *lock = tro->tr_lock; mutex_enter(*lock); } static inline void if_tunnel_put_ro(percpu_t *ro_percpu, kmutex_t *lock) { mutex_exit(lock); percpu_putref(ro_percpu); } static __inline if_index_t if_get_index(const struct ifnet *ifp) { return ifp != NULL ? 
ifp->if_index : 0; } bool if_held(struct ifnet *); void if_input(struct ifnet *, struct mbuf *); struct if_percpuq * if_percpuq_create(struct ifnet *); void if_percpuq_destroy(struct if_percpuq *); void if_percpuq_enqueue(struct if_percpuq *, struct mbuf *); void if_deferred_start_init(struct ifnet *, void (*)(struct ifnet *)); void if_schedule_deferred_start(struct ifnet *); void ifa_insert(struct ifnet *, struct ifaddr *); void ifa_remove(struct ifnet *, struct ifaddr *); void ifa_psref_init(struct ifaddr *); void ifa_acquire(struct ifaddr *, struct psref *); void ifa_release(struct ifaddr *, struct psref *); bool ifa_held(struct ifaddr *); bool ifa_is_destroying(struct ifaddr *); void ifaref(struct ifaddr *); void ifafree(struct ifaddr *); struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); struct ifaddr *ifa_ifwithaddr_psref(const struct sockaddr *, struct psref *); struct ifaddr *ifa_ifwithaf(int); struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *); struct ifaddr *ifa_ifwithdstaddr_psref(const struct sockaddr *, struct psref *); struct ifaddr *ifa_ifwithnet(const struct sockaddr *); struct ifaddr *ifa_ifwithnet_psref(const struct sockaddr *, struct psref *); struct ifaddr *ifa_ifwithladdr(const struct sockaddr *); struct ifaddr *ifa_ifwithladdr_psref(const struct sockaddr *, struct psref *); struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); struct ifaddr *ifaof_ifpforaddr_psref(const struct sockaddr *, struct ifnet *, struct psref *); void link_rtrequest(int, struct rtentry *, const struct rt_addrinfo *); void p2p_rtrequest(int, struct rtentry *, const struct rt_addrinfo *); void if_clone_attach(struct if_clone *); void if_clone_detach(struct if_clone *); int if_transmit_lock(struct ifnet *, struct mbuf *); int ifq_enqueue(struct ifnet *, struct mbuf *); int ifq_enqueue2(struct ifnet *, struct ifqueue *, struct mbuf *); int loioctl(struct ifnet *, u_long, void *); void loopattach(int); void loopinit(void); int looutput(struct ifnet *, struct mbuf *, const struct sockaddr *, const struct rtentry *); void * if_linkstate_change_establish(struct ifnet *, void (*)(void *), void *); void if_linkstate_change_disestablish(struct ifnet *, void *, kmutex_t *); /* * These are exported because they're an easy way to tell if * an interface is going away without having to burn a flag. */ int if_nulloutput(struct ifnet *, struct mbuf *, const struct sockaddr *, const struct rtentry *); void if_nullinput(struct ifnet *, struct mbuf *); void if_nullstart(struct ifnet *); int if_nulltransmit(struct ifnet *, struct mbuf *); int if_nullioctl(struct ifnet *, u_long, void *); int if_nullinit(struct ifnet *); void if_nullstop(struct ifnet *, int); void if_nullslowtimo(struct ifnet *); #define if_nullwatchdog if_nullslowtimo void if_nulldrain(struct ifnet *); #else struct if_nameindex { unsigned int if_index; /* 1, 2, ... */ char *if_name; /* null terminated name: "le0", ... */ }; #include <sys/cdefs.h> __BEGIN_DECLS unsigned int if_nametoindex(const char *); char * if_indextoname(unsigned int, char *); struct if_nameindex * if_nameindex(void); void if_freenameindex(struct if_nameindex *); __END_DECLS #endif /* _KERNEL */ /* XXX really ALTQ? 
*/ #ifdef _KERNEL #define IFADDR_FIRST(__ifp) TAILQ_FIRST(&(__ifp)->if_addrlist) #define IFADDR_NEXT(__ifa) TAILQ_NEXT((__ifa), ifa_list) #define IFADDR_FOREACH(__ifa, __ifp) TAILQ_FOREACH(__ifa, \ &(__ifp)->if_addrlist, ifa_list) #define IFADDR_FOREACH_SAFE(__ifa, __ifp, __nifa) \ TAILQ_FOREACH_SAFE(__ifa, \ &(__ifp)->if_addrlist, ifa_list, __nifa) #define IFADDR_EMPTY(__ifp) TAILQ_EMPTY(&(__ifp)->if_addrlist) #define IFADDR_ENTRY_INIT(__ifa) \ PSLIST_ENTRY_INIT((__ifa), ifa_pslist_entry) #define IFADDR_ENTRY_DESTROY(__ifa) \ PSLIST_ENTRY_DESTROY((__ifa), ifa_pslist_entry) #define IFADDR_READER_EMPTY(__ifp) \ (PSLIST_READER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \ ifa_pslist_entry) == NULL) #define IFADDR_READER_FIRST(__ifp) \ PSLIST_READER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \ ifa_pslist_entry) #define IFADDR_READER_NEXT(__ifa) \ PSLIST_READER_NEXT((__ifa), struct ifaddr, ifa_pslist_entry) #define IFADDR_READER_FOREACH(__ifa, __ifp) \ PSLIST_READER_FOREACH((__ifa), &(__ifp)->if_addr_pslist, struct ifaddr,\ ifa_pslist_entry) #define IFADDR_WRITER_INSERT_HEAD(__ifp, __ifa) \ PSLIST_WRITER_INSERT_HEAD(&(__ifp)->if_addr_pslist, (__ifa), \ ifa_pslist_entry) #define IFADDR_WRITER_REMOVE(__ifa) \ PSLIST_WRITER_REMOVE((__ifa), ifa_pslist_entry) #define IFADDR_WRITER_FOREACH(__ifa, __ifp) \ PSLIST_WRITER_FOREACH((__ifa), &(__ifp)->if_addr_pslist, struct ifaddr,\ ifa_pslist_entry) #define IFADDR_WRITER_NEXT(__ifp) \ PSLIST_WRITER_NEXT((__ifp), struct ifaddr, ifa_pslist_entry) #define IFADDR_WRITER_INSERT_AFTER(__ifp, __new) \ PSLIST_WRITER_INSERT_AFTER((__ifp), (__new), ifa_pslist_entry) #define IFADDR_WRITER_EMPTY(__ifp) \ (PSLIST_WRITER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \ ifa_pslist_entry) == NULL) #define IFADDR_WRITER_INSERT_TAIL(__ifp, __new) \ do { \ if (IFADDR_WRITER_EMPTY(__ifp)) { \ IFADDR_WRITER_INSERT_HEAD((__ifp), (__new)); \ } else { \ struct ifaddr *__ifa; \ IFADDR_WRITER_FOREACH(__ifa, (__ifp)) { \ if (IFADDR_WRITER_NEXT(__ifa) == NULL) {\ IFADDR_WRITER_INSERT_AFTER(__ifa,\ (__new)); \ break; \ } \ } \ } \ } while (0) #define IFNET_GLOBAL_LOCK() mutex_enter(&ifnet_mtx) #define IFNET_GLOBAL_UNLOCK() mutex_exit(&ifnet_mtx) #define IFNET_GLOBAL_LOCKED() mutex_owned(&ifnet_mtx) #define IFNET_READER_EMPTY() \ (PSLIST_READER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry) == NULL) #define IFNET_READER_FIRST() \ PSLIST_READER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry) #define IFNET_READER_NEXT(__ifp) \ PSLIST_READER_NEXT((__ifp), struct ifnet, if_pslist_entry) #define IFNET_READER_FOREACH(__ifp) \ PSLIST_READER_FOREACH((__ifp), &ifnet_pslist, struct ifnet, \ if_pslist_entry) #define IFNET_WRITER_INSERT_HEAD(__ifp) \ PSLIST_WRITER_INSERT_HEAD(&ifnet_pslist, (__ifp), if_pslist_entry) #define IFNET_WRITER_REMOVE(__ifp) \ PSLIST_WRITER_REMOVE((__ifp), if_pslist_entry) #define IFNET_WRITER_FOREACH(__ifp) \ PSLIST_WRITER_FOREACH((__ifp), &ifnet_pslist, struct ifnet, \ if_pslist_entry) #define IFNET_WRITER_NEXT(__ifp) \ PSLIST_WRITER_NEXT((__ifp), struct ifnet, if_pslist_entry) #define IFNET_WRITER_INSERT_AFTER(__ifp, __new) \ PSLIST_WRITER_INSERT_AFTER((__ifp), (__new), if_pslist_entry) #define IFNET_WRITER_EMPTY() \ (PSLIST_WRITER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry) == NULL) #define IFNET_WRITER_INSERT_TAIL(__new) \ do { \ if (IFNET_WRITER_EMPTY()) { \ IFNET_WRITER_INSERT_HEAD(__new); \ } else { \ struct ifnet *__ifp; \ IFNET_WRITER_FOREACH(__ifp) { \ if (IFNET_WRITER_NEXT(__ifp) == NULL) { \ IFNET_WRITER_INSERT_AFTER(__ifp,\ 
(__new)); \ break; \ } \ } \ } \ } while (0) #define IFNET_LOCK(ifp) mutex_enter((ifp)->if_ioctl_lock) #define IFNET_UNLOCK(ifp) mutex_exit((ifp)->if_ioctl_lock) #define IFNET_LOCKED(ifp) mutex_owned((ifp)->if_ioctl_lock) #define IFNET_ASSERT_UNLOCKED(ifp) \ KDASSERT(mutex_ownable((ifp)->if_ioctl_lock)) extern struct pslist_head ifnet_pslist; extern kmutex_t ifnet_mtx; extern struct ifnet *lo0ifp; /* * ifq sysctl support */ int sysctl_ifq(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen, struct ifqueue *ifq); /* symbolic names for terminal (per-protocol) CTL_IFQ_ nodes */ #define IFQCTL_LEN 1 #define IFQCTL_MAXLEN 2 #define IFQCTL_PEAK 3 #define IFQCTL_DROPS 4 /* * Hook for if_vlan - needed by if_agr */ MODULE_HOOK(if_vlan_vlan_input_hook, struct mbuf *, (struct ifnet *, struct mbuf *)); #endif /* _KERNEL */ #endif /* !_NET_IF_H_ */
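/*
 * Illustrative userland sketch, not part of the original header: it shows how
 * the non-_KERNEL declarations above (if_nameindex(3), if_nametoindex(3),
 * if_freenameindex(3)) are typically used.  The interface name "lo0" is only
 * an example; any configured interface name works.
 */
#include <net/if.h>
#include <stdio.h>

int
main(void)
{
	struct if_nameindex *ifni, *p;
	unsigned int idx;

	/* Enumerate interfaces; the array ends with if_index == 0 / if_name == NULL. */
	if ((ifni = if_nameindex()) == NULL) {
		perror("if_nameindex");
		return 1;
	}
	for (p = ifni; p->if_index != 0 || p->if_name != NULL; p++)
		printf("%u\t%s\n", p->if_index, p->if_name);
	if_freenameindex(ifni);

	/* Map a name back to its index; 0 means "no such interface". */
	idx = if_nametoindex("lo0");
	printf("lo0 -> %u\n", idx);
	return 0;
}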
/* $NetBSD: kern_resource_43.c,v 1.23 2021/09/07 11:43:02 riastradh Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_resource_43.c,v 1.23 2021/09/07 11:43:02 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/resourcevar.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <compat/common/compat_mod.h> static struct syscall_package kern_resource_43_syscalls[] = { { SYS_compat_43_ogetrlimit, 0, (sy_call_t *)compat_43_sys_getrlimit }, { SYS_compat_43_osetrlimit, 0, (sy_call_t *)compat_43_sys_setrlimit }, { 0, 0, NULL } }; /* ARGSUSED */ int compat_43_sys_getrlimit(struct lwp *l, const struct compat_43_sys_getrlimit_args *uap, register_t *retval) { /* { syscallarg(int) which; syscallarg(struct orlimit *) rlp; } */ struct proc *p = l->l_proc; int which = SCARG(uap, which); struct orlimit olim; if ((u_int)which >= RLIM_NLIMITS) return (EINVAL); memset(&olim, 0, sizeof(olim)); olim.rlim_cur = p->p_rlimit[which].rlim_cur; if (olim.rlim_cur == -1) olim.rlim_cur = 0x7fffffff; olim.rlim_max = p->p_rlimit[which].rlim_max; if (olim.rlim_max == -1) olim.rlim_max = 0x7fffffff; return copyout(&olim, SCARG(uap, rlp), sizeof(olim)); } /* ARGSUSED */ int compat_43_sys_setrlimit(struct lwp *l, const struct compat_43_sys_setrlimit_args *uap, register_t *retval) { /* { syscallarg(int) which; syscallarg(const struct orlimit *) rlp; } */ int which = SCARG(uap, which); struct orlimit olim; struct rlimit lim; int error; error = copyin(SCARG(uap, rlp), &olim, sizeof(struct orlimit)); if (error) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; return (dosetrlimit(l, l->l_proc, which, &lim)); } int kern_resource_43_init(void) { return syscall_establish(NULL, kern_resource_43_syscalls); } int kern_resource_43_fini(void) { return syscall_disestablish(NULL, kern_resource_43_syscalls); }
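/*
 * Minimal standalone sketch, not taken from the original file: it restates
 * the value translation compat_43_sys_getrlimit performs above, where the
 * native rlim_t encodes RLIM_INFINITY as the all-ones value (-1), which is
 * reported to 4.3BSD binaries as 0x7fffffff because struct orlimit only has
 * 32-bit fields.  The type names new_rlim_t/old_rlim_t are hypothetical
 * stand-ins, not the kernel's types.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t new_rlim_t;	/* modern, 64-bit rlim_t */
typedef int32_t old_rlim_t;	/* field width in the old struct orlimit */

static old_rlim_t
orlimit_value(new_rlim_t v)
{
	/* RLIM_INFINITY (all ones) becomes the largest old value, 0x7fffffff. */
	if (v == (new_rlim_t)-1)
		return 0x7fffffff;
	/* Other values are copied; oversized ones simply truncate, as above. */
	return (old_rlim_t)v;
}

int
main(void)
{
	printf("%d\n", (int)orlimit_value((new_rlim_t)-1));	/* 2147483647 */
	printf("%d\n", (int)orlimit_value(8 * 1024 * 1024));	/* 8388608 */
	return 0;
}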
/* $NetBSD: spec_vnops.c,v 1.218 2023/04/22 15:32:49 riastradh Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)spec_vnops.c 8.15 (Berkeley) 7/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.218 2023/04/22 15:32:49 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_ddb.h" #endif #include <sys/param.h> #include <sys/proc.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/conf.h> #include <sys/buf.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/vnode_impl.h> #include <sys/stat.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/poll.h> #include <sys/file.h> #include <sys/disklabel.h> #include <sys/disk.h> #include <sys/lockf.h> #include <sys/tty.h> #include <sys/kauth.h> #include <sys/fstrans.h> #include <sys/module.h> #include <sys/atomic.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #ifdef DDB #include <ddb/ddb.h> #endif /* * Lock order: * * vnode lock * -> device_lock * -> struct vnode::v_interlock */ /* symbolic sleep message strings for devices */ const char devopn[] = "devopn"; const char devio[] = "devio"; const char devwait[] = "devwait"; const char devin[] = "devin"; const char devout[] = "devout"; const char devioc[] = "devioc"; const char devcls[] = "devcls"; #define SPECHSZ 64 #if ((SPECHSZ&(SPECHSZ-1)) == 0) #define SPECHASH(rdev) (((rdev>>5)+(rdev))&(SPECHSZ-1)) #else #define SPECHASH(rdev) (((unsigned)((rdev>>5)+(rdev)))%SPECHSZ) #endif static vnode_t *specfs_hash[SPECHSZ]; extern struct mount *dead_rootmount; /* * This vnode operations vector is used for special device nodes * created from whole cloth by the kernel. For the ops vector for * vnodes built from special devices found in a filesystem, see (e.g) * ffs_specop_entries[] in ffs_vnops.c or the equivalent for other * filesystems. 
*/ int (**spec_vnodeop_p)(void *); const struct vnodeopv_entry_desc spec_vnodeop_entries[] = { { &vop_default_desc, vn_default_error }, { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ { &vop_lookup_desc, spec_lookup }, /* lookup */ { &vop_create_desc, genfs_badop }, /* create */ { &vop_mknod_desc, genfs_badop }, /* mknod */ { &vop_open_desc, spec_open }, /* open */ { &vop_close_desc, spec_close }, /* close */ { &vop_access_desc, genfs_ebadf }, /* access */ { &vop_accessx_desc, genfs_ebadf }, /* accessx */ { &vop_getattr_desc, genfs_ebadf }, /* getattr */ { &vop_setattr_desc, genfs_ebadf }, /* setattr */ { &vop_read_desc, spec_read }, /* read */ { &vop_write_desc, spec_write }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, spec_fdiscard }, /* fdiscard */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ { &vop_poll_desc, spec_poll }, /* poll */ { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_mmap_desc, spec_mmap }, /* mmap */ { &vop_fsync_desc, spec_fsync }, /* fsync */ { &vop_seek_desc, spec_seek }, /* seek */ { &vop_remove_desc, genfs_badop }, /* remove */ { &vop_link_desc, genfs_badop }, /* link */ { &vop_rename_desc, genfs_badop }, /* rename */ { &vop_mkdir_desc, genfs_badop }, /* mkdir */ { &vop_rmdir_desc, genfs_badop }, /* rmdir */ { &vop_symlink_desc, genfs_badop }, /* symlink */ { &vop_readdir_desc, genfs_badop }, /* readdir */ { &vop_readlink_desc, genfs_badop }, /* readlink */ { &vop_abortop_desc, genfs_badop }, /* abortop */ { &vop_inactive_desc, spec_inactive }, /* inactive */ { &vop_reclaim_desc, spec_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, spec_bmap }, /* bmap */ { &vop_strategy_desc, spec_strategy }, /* strategy */ { &vop_print_desc, spec_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ { &vop_advlock_desc, spec_advlock }, /* advlock */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ { &vop_getpages_desc, genfs_getpages }, /* getpages */ { &vop_putpages_desc, genfs_putpages }, /* putpages */ { NULL, NULL } }; const struct vnodeopv_desc spec_vnodeop_opv_desc = { &spec_vnodeop_p, spec_vnodeop_entries }; static kauth_listener_t rawio_listener; static struct kcondvar specfs_iocv; /* * Returns true if vnode is /dev/mem or /dev/kmem. */ bool iskmemvp(struct vnode *vp) { return ((vp->v_type == VCHR) && iskmemdev(vp->v_rdev)); } /* * Returns true if dev is /dev/mem or /dev/kmem. */ int iskmemdev(dev_t dev) { /* mem_no is emitted by config(8) to generated devsw.c */ extern const int mem_no; /* minor 14 is /dev/io on i386 with COMPAT_10 */ return (major(dev) == mem_no && (minor(dev) < 2 || minor(dev) == 14)); } static int rawio_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { int result; result = KAUTH_RESULT_DEFER; if ((action != KAUTH_DEVICE_RAWIO_SPEC) && (action != KAUTH_DEVICE_RAWIO_PASSTHRU)) return result; /* Access is mandated by permissions. */ result = KAUTH_RESULT_ALLOW; return result; } void spec_init(void) { rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE, rawio_listener_cb, NULL); cv_init(&specfs_iocv, "specio"); } /* * spec_io_enter(vp, &sn, &dev) * * Enter an operation that may not hold vp's vnode lock or an * fstrans on vp's mount. 
Until spec_io_exit, the vnode will not * be revoked. * * On success, set sn to the specnode pointer and dev to the dev_t * number and return zero. Caller must later call spec_io_exit * when done. * * On failure, return ENXIO -- the device has been revoked and no * longer exists. */ static int spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp) { dev_t dev; struct specnode *sn; unsigned iocnt; int error = 0; mutex_enter(vp->v_interlock); /* * Extract all the info we need from the vnode, unless the * vnode has already been reclaimed. This can happen if the * underlying device has been removed and all the device nodes * for it have been revoked. The caller may not hold a vnode * lock or fstrans to prevent this from happening before it has * had an opportunity to notice the vnode is dead. */ if (vdead_check(vp, VDEAD_NOWAIT) != 0 || (sn = vp->v_specnode) == NULL || (dev = vp->v_rdev) == NODEV) { error = ENXIO; goto out; } /* * Notify spec_close that we are doing an I/O operation which * may not be not bracketed by fstrans(9) and thus is not * blocked by vfs suspension. * * We could hold this reference with psref(9) instead, but we * already have to take the interlock for vdead_check, so * there's not much more cost here to another atomic operation. */ do { iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt); if (__predict_false(iocnt == UINT_MAX)) { /* * The I/O count is limited by the number of * LWPs (which will never overflow this) -- * unless one driver uses another driver via * specfs, which is rather unusual, but which * could happen via pud(4) userspace drivers. * We could use a 64-bit count, but can't use * atomics for that on all platforms. * (Probably better to switch to psref or * localcount instead.) */ error = EBUSY; goto out; } } while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1) != iocnt); /* Success! */ *snp = sn; *devp = dev; error = 0; out: mutex_exit(vp->v_interlock); return error; } /* * spec_io_exit(vp, sn) * * Exit an operation entered with a successful spec_io_enter -- * allow concurrent spec_node_revoke to proceed. The argument sn * must match the struct specnode pointer returned by spec_io_exit * for vp. */ static void spec_io_exit(struct vnode *vp, struct specnode *sn) { struct specdev *sd = sn->sn_dev; unsigned iocnt; KASSERT(vp->v_specnode == sn); /* * We are done. Notify spec_close if appropriate. The * transition of 1 -> 0 must happen under device_lock so * spec_close doesn't miss a wakeup. */ do { iocnt = atomic_load_relaxed(&sd->sd_iocnt); KASSERT(iocnt > 0); if (iocnt == 1) { mutex_enter(&device_lock); if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0) cv_broadcast(&specfs_iocv); mutex_exit(&device_lock); break; } } while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt); } /* * spec_io_drain(sd) * * Wait for all existing spec_io_enter/exit sections to complete. * Caller must ensure spec_io_enter will fail at this point. */ static void spec_io_drain(struct specdev *sd) { /* * I/O at the same time as closing is unlikely -- it often * indicates an application bug. */ if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0)) return; mutex_enter(&device_lock); while (atomic_load_relaxed(&sd->sd_iocnt) > 0) cv_wait(&specfs_iocv, &device_lock); mutex_exit(&device_lock); } /* * Initialize a vnode that represents a device. 
*/ void spec_node_init(vnode_t *vp, dev_t rdev) { specnode_t *sn; specdev_t *sd; vnode_t *vp2; vnode_t **vpp; KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); KASSERT(vp->v_specnode == NULL); /* * Search the hash table for this device. If known, add a * reference to the device structure. If not known, create * a new entry to represent the device. In all cases add * the vnode to the hash table. */ sn = kmem_alloc(sizeof(*sn), KM_SLEEP); sd = kmem_alloc(sizeof(*sd), KM_SLEEP); mutex_enter(&device_lock); vpp = &specfs_hash[SPECHASH(rdev)]; for (vp2 = *vpp; vp2 != NULL; vp2 = vp2->v_specnext) { KASSERT(vp2->v_specnode != NULL); if (rdev == vp2->v_rdev && vp->v_type == vp2->v_type) { break; } } if (vp2 == NULL) { /* No existing record, create a new one. */ sd->sd_mountpoint = NULL; sd->sd_lockf = NULL; sd->sd_refcnt = 1; sd->sd_opencnt = 0; sd->sd_bdevvp = NULL; sd->sd_iocnt = 0; sd->sd_opened = false; sd->sd_closing = false; sn->sn_dev = sd; sd = NULL; } else { /* Use the existing record. */ sn->sn_dev = vp2->v_specnode->sn_dev; sn->sn_dev->sd_refcnt++; } /* Insert vnode into the hash chain. */ sn->sn_opencnt = 0; sn->sn_rdev = rdev; sn->sn_gone = false; vp->v_specnode = sn; vp->v_specnext = *vpp; *vpp = vp; mutex_exit(&device_lock); /* Free the record we allocated if unused. */ if (sd != NULL) { kmem_free(sd, sizeof(*sd)); } } /* * Lookup a vnode by device number and return it referenced. */ int spec_node_lookup_by_dev(enum vtype type, dev_t dev, int flags, vnode_t **vpp) { int error; vnode_t *vp; top: mutex_enter(&device_lock); for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) { if (type == vp->v_type && dev == vp->v_rdev) { mutex_enter(vp->v_interlock); /* If clean or being cleaned, then ignore it. */ if (vdead_check(vp, VDEAD_NOWAIT) == 0) break; if ((flags & VDEAD_NOWAIT) == 0) { mutex_exit(&device_lock); /* * It may be being revoked as we speak, * and the caller wants to wait until * all revocation has completed. Let * vcache_vget wait for it to finish * dying; as a side effect, vcache_vget * releases vp->v_interlock. Note that * vcache_vget cannot succeed at this * point because vdead_check already * failed. */ error = vcache_vget(vp); KASSERT(error); goto top; } mutex_exit(vp->v_interlock); } } KASSERT(vp == NULL || mutex_owned(vp->v_interlock)); if (vp == NULL) { mutex_exit(&device_lock); return ENOENT; } /* * If it is an opened block device return the opened vnode. */ if (type == VBLK && vp->v_specnode->sn_dev->sd_bdevvp != NULL) { mutex_exit(vp->v_interlock); vp = vp->v_specnode->sn_dev->sd_bdevvp; mutex_enter(vp->v_interlock); } mutex_exit(&device_lock); error = vcache_vget(vp); if (error) return error; *vpp = vp; return 0; } /* * Lookup a vnode by file system mounted on and return it referenced. */ int spec_node_lookup_by_mount(struct mount *mp, vnode_t **vpp) { int i, error; vnode_t *vp, *vq; mutex_enter(&device_lock); for (i = 0, vq = NULL; i < SPECHSZ && vq == NULL; i++) { for (vp = specfs_hash[i]; vp; vp = vp->v_specnext) { if (vp->v_type != VBLK) continue; vq = vp->v_specnode->sn_dev->sd_bdevvp; if (vq != NULL && vq->v_specnode->sn_dev->sd_mountpoint == mp) break; vq = NULL; } } if (vq == NULL) { mutex_exit(&device_lock); return ENOENT; } mutex_enter(vq->v_interlock); mutex_exit(&device_lock); error = vcache_vget(vq); if (error) return error; *vpp = vq; return 0; } /* * Get the file system mounted on this block device. 
* * XXX Caller should hold the vnode lock -- shared or exclusive -- so * that this can't changed, and the vnode can't be revoked while we * examine it. But not all callers do, and they're scattered through a * lot of file systems, so we can't assert this yet. */ struct mount * spec_node_getmountedfs(vnode_t *devvp) { struct mount *mp; KASSERT(devvp->v_type == VBLK); mp = devvp->v_specnode->sn_dev->sd_mountpoint; return mp; } /* * Set the file system mounted on this block device. * * XXX Caller should hold the vnode lock exclusively so this can't be * changed or assumed by spec_node_getmountedfs while we change it, and * the vnode can't be revoked while we handle it. But not all callers * do, and they're scattered through a lot of file systems, so we can't * assert this yet. Instead, for now, we'll take an I/O reference so * at least the ioctl doesn't race with revoke/detach. * * If you do change this to assert an exclusive vnode lock, you must * also do vdead_check before trying bdev_ioctl, because the vnode may * have been revoked by the time the caller locked it, and this is * _not_ a vop -- calls to spec_node_setmountedfs don't go through * v_op, so revoking the vnode doesn't prevent further calls. * * XXX Caller should additionally have the vnode open, at least if mp * is nonnull, but I'm not sure all callers do that -- need to audit. * Currently udf closes the vnode before clearing the mount. */ void spec_node_setmountedfs(vnode_t *devvp, struct mount *mp) { struct dkwedge_info dkw; struct specnode *sn; dev_t dev; int error; KASSERT(devvp->v_type == VBLK); error = spec_io_enter(devvp, &sn, &dev); if (error) return; KASSERT(sn->sn_dev->sd_mountpoint == NULL || mp == NULL); sn->sn_dev->sd_mountpoint = mp; if (mp == NULL) goto out; error = bdev_ioctl(dev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp); if (error) goto out; strlcpy(mp->mnt_stat.f_mntfromlabel, dkw.dkw_wname, sizeof(mp->mnt_stat.f_mntfromlabel)); out: spec_io_exit(devvp, sn); } /* * A vnode representing a special device is going away. Close * the device if the vnode holds it open. */ void spec_node_revoke(vnode_t *vp) { specnode_t *sn; specdev_t *sd; struct vnode **vpp; KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); sn = vp->v_specnode; sd = sn->sn_dev; KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); KASSERT(vp->v_specnode != NULL); KASSERT(sn->sn_gone == false); mutex_enter(&device_lock); KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, "sn_opencnt=%u > sd_opencnt=%u", sn->sn_opencnt, sd->sd_opencnt); sn->sn_gone = true; if (sn->sn_opencnt != 0) { sd->sd_opencnt -= (sn->sn_opencnt - 1); sn->sn_opencnt = 1; mutex_exit(&device_lock); VOP_CLOSE(vp, FNONBLOCK, NOCRED); mutex_enter(&device_lock); KASSERT(sn->sn_opencnt == 0); } /* * We may have revoked the vnode in this thread while another * thread was in the middle of spec_close, in the window when * spec_close releases the vnode lock to call .d_close for the * last close. In that case, wait for the concurrent * spec_close to complete. */ while (sd->sd_closing) cv_wait(&specfs_iocv, &device_lock); /* * Remove from the hash so lookups stop returning this * specnode. We will dissociate it from the specdev -- and * possibly free the specdev -- in spec_node_destroy. */ KASSERT(sn->sn_gone); KASSERT(sn->sn_opencnt == 0); for (vpp = &specfs_hash[SPECHASH(vp->v_rdev)];; vpp = &(*vpp)->v_specnext) { if (*vpp == vp) { *vpp = vp->v_specnext; vp->v_specnext = NULL; break; } } mutex_exit(&device_lock); } /* * A vnode representing a special device is being recycled. * Destroy the specfs component. 
*/ void spec_node_destroy(vnode_t *vp) { specnode_t *sn; specdev_t *sd; int refcnt; sn = vp->v_specnode; sd = sn->sn_dev; KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); KASSERT(vp->v_specnode != NULL); KASSERT(sn->sn_opencnt == 0); mutex_enter(&device_lock); sn = vp->v_specnode; vp->v_specnode = NULL; refcnt = sd->sd_refcnt--; KASSERT(refcnt > 0); mutex_exit(&device_lock); /* If the device is no longer in use, destroy our record. */ if (refcnt == 1) { KASSERT(sd->sd_iocnt == 0); KASSERT(sd->sd_opencnt == 0); KASSERT(sd->sd_bdevvp == NULL); kmem_free(sd, sizeof(*sd)); } kmem_free(sn, sizeof(*sn)); } /* * Trivial lookup routine that always fails. */ int spec_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap = v; *ap->a_vpp = NULL; return ENOTDIR; } typedef int (*spec_ioctl_t)(dev_t, u_long, void *, int, struct lwp *); /* * Open a special file. */ /* ARGSUSED */ int spec_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; struct lwp *l = curlwp; struct vnode *vp = ap->a_vp; dev_t dev, dev1; int error; enum kauth_device_req req; specnode_t *sn, *sn1; specdev_t *sd; spec_ioctl_t ioctl; u_int gen = 0; const char *name = NULL; bool needclose = false; struct partinfo pi; KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d", vp->v_type); dev = vp->v_rdev; sn = vp->v_specnode; sd = sn->sn_dev; /* * Don't allow open if fs is mounted -nodev. */ if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) return ENXIO; switch (ap->a_mode & (FREAD | FWRITE)) { case FREAD | FWRITE: req = KAUTH_REQ_DEVICE_RAWIO_SPEC_RW; break; case FWRITE: req = KAUTH_REQ_DEVICE_RAWIO_SPEC_WRITE; break; default: req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ; break; } error = kauth_authorize_device_spec(ap->a_cred, req, vp); if (error) return error; /* * Acquire an open reference -- as long as we hold onto it, and * the vnode isn't revoked, it can't be closed, and the vnode * can't be revoked until we release the vnode lock. */ mutex_enter(&device_lock); KASSERT(!sn->sn_gone); switch (vp->v_type) { case VCHR: /* * Character devices can accept opens from multiple * vnodes. But first, wait for any close to finish. * Wait under the vnode lock so we don't have to worry * about the vnode being revoked while we wait. */ while (sd->sd_closing) { error = cv_wait_sig(&specfs_iocv, &device_lock); if (error) break; } if (error) break; sd->sd_opencnt++; sn->sn_opencnt++; KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, "sn_opencnt=%u > sd_opencnt=%u", sn->sn_opencnt, sd->sd_opencnt); break; case VBLK: /* * For block devices, permit only one open. The buffer * cache cannot remain self-consistent with multiple * vnodes holding a block device open. * * Treat zero opencnt with non-NULL mountpoint as open. * This may happen after forced detach of a mounted device. * * Also treat sd_closing, meaning there is a concurrent * close in progress, as still open. */ if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL || sd->sd_closing) { error = EBUSY; break; } KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u", sn->sn_opencnt); sn->sn_opencnt = 1; sd->sd_opencnt = 1; sd->sd_bdevvp = vp; break; default: panic("invalid specfs vnode type: %d", vp->v_type); } mutex_exit(&device_lock); if (error) return error; /* * Set VV_ISTTY if this is a tty cdev. * * XXX This does the wrong thing if the module has to be * autoloaded. 
We should maybe set this after autoloading * modules and calling .d_open successfully, except (a) we need * the vnode lock to touch it, and (b) once we acquire the * vnode lock again, the vnode may have been revoked, and * deadfs's dead_read needs VV_ISTTY to be already set in order * to return the right answer. So this needs some additional * synchronization to be made to work correctly with tty driver * module autoload. For now, let's just hope it doesn't cause * too much trouble for a tty from an autoloaded driver module * to fail with EIO instead of returning EOF. */ if (vp->v_type == VCHR) { if (cdev_type(dev) == D_TTY) vp->v_vflag |= VV_ISTTY; } /* * Because opening the device may block indefinitely, e.g. when * opening a tty, and loading a module may cross into many * other subsystems, we must not hold the vnode lock while * calling .d_open, so release it now and reacquire it when * done. * * Take an I/O reference so that any concurrent spec_close via * spec_node_revoke will wait for us to finish calling .d_open. * The vnode can't be dead at this point because we have it * locked. Note that if revoked, the driver must interrupt * .d_open before spec_close starts waiting for I/O to drain so * this doesn't deadlock. */ VOP_UNLOCK(vp); error = spec_io_enter(vp, &sn1, &dev1); if (error) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return error; } KASSERT(sn1 == sn); KASSERT(dev1 == dev); /* * Open the device. If .d_open returns ENXIO (device not * configured), the driver may not be loaded, so try * autoloading a module and then try .d_open again if anything * got loaded. */ switch (vp->v_type) { case VCHR: do { const struct cdevsw *cdev; gen = module_gen; error = cdev_open(dev, ap->a_mode, S_IFCHR, l); if (error != ENXIO) break; /* Check if we already have a valid driver */ mutex_enter(&device_lock); cdev = cdevsw_lookup(dev); mutex_exit(&device_lock); if (cdev != NULL) break; /* Get device name from devsw_conv array */ if ((name = cdevsw_getname(major(dev))) == NULL) break; /* Try to autoload device module */ (void)module_autoload(name, MODULE_CLASS_DRIVER); } while (gen != module_gen); break; case VBLK: do { const struct bdevsw *bdev; gen = module_gen; error = bdev_open(dev, ap->a_mode, S_IFBLK, l); if (error != ENXIO) break; /* Check if we already have a valid driver */ mutex_enter(&device_lock); bdev = bdevsw_lookup(dev); mutex_exit(&device_lock); if (bdev != NULL) break; /* Get device name from devsw_conv array */ if ((name = bdevsw_getname(major(dev))) == NULL) break; /* Try to autoload device module */ (void)module_autoload(name, MODULE_CLASS_DRIVER); } while (gen != module_gen); break; default: __unreachable(); } /* * Release the I/O reference now that we have called .d_open, * and reacquire the vnode lock. At this point, the device may * have been revoked, so we must tread carefully. However, sn * and sd remain valid pointers until we drop our reference. */ spec_io_exit(vp, sn); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); KASSERT(vp->v_specnode == sn); /* * If it has been revoked since we released the vnode lock and * reacquired it, then spec_node_revoke has closed it, and we * must fail with EBADF. * * Otherwise, if opening it failed, back out and release the * open reference. If it was ever successfully opened and we * got the last reference this way, it's now our job to close * it. This might happen in the following scenario: * * Thread 1 Thread 2 * VOP_OPEN * ... * .d_open -> 0 (success) * acquire vnode lock * do stuff VOP_OPEN * release vnode lock ... 
* .d_open -> EBUSY * VOP_CLOSE * acquire vnode lock * --sd_opencnt != 0 * => no .d_close * release vnode lock * acquire vnode lock * --sd_opencnt == 0 * * We can't resolve this by making spec_close wait for .d_open * to complete before examining sd_opencnt, because .d_open can * hang indefinitely, e.g. for a tty. */ mutex_enter(&device_lock); if (sn->sn_gone) { if (error == 0) error = EBADF; } else if (error == 0) { /* * Device has not been revoked, so our opencnt can't * have gone away at this point -- transition to * sn_gone=true happens before transition to * sn_opencnt=0 in spec_node_revoke. */ KASSERT(sd->sd_opencnt); KASSERT(sn->sn_opencnt); KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, "sn_opencnt=%u > sd_opencnt=%u", sn->sn_opencnt, sd->sd_opencnt); KASSERT(!sd->sd_closing); sd->sd_opened = true; } else if (sd->sd_opencnt == 1 && sd->sd_opened) { /* * We're the last reference to a _previous_ open even * though this one failed, so we have to close it. * Don't decrement the reference count here -- * spec_close will do that. */ KASSERT(sn->sn_opencnt == 1); needclose = true; } else { KASSERT(sd->sd_opencnt); KASSERT(sn->sn_opencnt); KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, "sn_opencnt=%u > sd_opencnt=%u", sn->sn_opencnt, sd->sd_opencnt); sd->sd_opencnt--; sn->sn_opencnt--; if (vp->v_type == VBLK) sd->sd_bdevvp = NULL; } mutex_exit(&device_lock); /* * If this open failed, but the device was previously opened, * and another thread concurrently closed the vnode while we * were in the middle of reopening it, the other thread will * see sd_opencnt > 0 and thus decide not to call .d_close -- * it is now our responsibility to do so. * * XXX The flags passed to VOP_CLOSE here are wrong, but * drivers can't rely on FREAD|FWRITE anyway -- e.g., consider * a device opened by thread 0 with O_READ, then opened by * thread 1 with O_WRITE, then closed by thread 0, and finally * closed by thread 1; the last .d_close call will have FWRITE * but not FREAD. We should just eliminate the FREAD/FWRITE * parameter to .d_close altogether. */ if (needclose) { KASSERT(error); VOP_CLOSE(vp, FNONBLOCK, NOCRED); } /* If anything went wrong, we're done. */ if (error) return error; /* * For disk devices, automagically set the vnode size to the * partition size, if we can. This applies to block devices * and character devices alike -- every block device must have * a corresponding character device. And if the module is * loaded it will remain loaded until we're done here (it is * forbidden to devsw_detach until closed). So it is safe to * query cdev_type unconditionally here. */ if (cdev_type(dev) == D_DISK) { ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl; if ((*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp) == 0) uvm_vnp_setsize(vp, (voff_t)pi.pi_secsize * pi.pi_size); } /* Success! 
*/ return 0; } /* * Vnode op for read */ /* ARGSUSED */ int spec_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; struct specnode *sn; dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; struct partinfo pi; int n, on; int error = 0; int i, nra; daddr_t lastbn, *rablks; int *rasizes; int nrablks, ratogo; KASSERT(uio->uio_rw == UIO_READ); KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_vmspace == curproc->p_vmspace), "vmspace belongs to neither kernel nor curproc"); if (uio->uio_resid == 0) return 0; switch (vp->v_type) { case VCHR: /* * Release the lock while we sleep -- possibly * indefinitely, if this is, e.g., a tty -- in * cdev_read, so we don't hold up everything else that * might want access to the vnode. * * But before we issue the read, take an I/O reference * to the specnode so close will know when we're done * reading. Note that the moment we release the lock, * the vnode's identity may change; hence spec_io_enter * may fail, and the caller may have a dead vnode on * their hands, if the file system on which vp lived * has been unmounted. */ VOP_UNLOCK(vp); error = spec_io_enter(vp, &sn, &dev); if (error) goto out; error = cdev_read(dev, uio, ap->a_ioflag); spec_io_exit(vp, sn); out: /* XXX What if the caller held an exclusive lock? */ vn_lock(vp, LK_SHARED | LK_RETRY); return error; case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); if (uio->uio_offset < 0) return EINVAL; if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0) bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE); else bsize = BLKDEV_IOSIZE; bscale = bsize >> DEV_BSHIFT; nra = uimax(16 * MAXPHYS / bsize - 1, 511); rablks = kmem_alloc(nra * sizeof(*rablks), KM_SLEEP); rasizes = kmem_alloc(nra * sizeof(*rasizes), KM_SLEEP); lastbn = ((uio->uio_offset + uio->uio_resid - 1) >> DEV_BSHIFT) &~ (bscale - 1); nrablks = ratogo = 0; do { bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1); on = uio->uio_offset % bsize; n = uimin((unsigned)(bsize - on), uio->uio_resid); if (ratogo == 0) { nrablks = uimin((lastbn - bn) / bscale, nra); ratogo = nrablks; for (i = 0; i < nrablks; ++i) { rablks[i] = bn + (i+1) * bscale; rasizes[i] = bsize; } error = breadn(vp, bn, bsize, rablks, rasizes, nrablks, 0, &bp); } else { if (ratogo > 0) --ratogo; error = bread(vp, bn, bsize, 0, &bp); } if (error) break; n = uimin(n, bsize - bp->b_resid); error = uiomove((char *)bp->b_data + on, n, uio); brelse(bp, 0); } while (error == 0 && uio->uio_resid > 0 && n != 0); kmem_free(rablks, nra * sizeof(*rablks)); kmem_free(rasizes, nra * sizeof(*rasizes)); return error; default: panic("spec_read type"); } /* NOTREACHED */ } /* * Vnode op for write */ /* ARGSUSED */ int spec_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; struct specnode *sn; dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; struct partinfo pi; int n, on; int error = 0; KASSERT(uio->uio_rw == UIO_WRITE); KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_vmspace == curproc->p_vmspace), "vmspace belongs to neither kernel nor curproc"); switch (vp->v_type) { case VCHR: /* * Release the lock while we sleep -- possibly * indefinitely, if this is, e.g., a tty -- in * cdev_write, so we don't hold up everything else 
that * might want access to the vnode. * * But before we issue the write, take an I/O reference * to the specnode so close will know when we're done * writing. Note that the moment we release the lock, * the vnode's identity may change; hence spec_io_enter * may fail, and the caller may have a dead vnode on * their hands, if the file system on which vp lived * has been unmounted. */ VOP_UNLOCK(vp); error = spec_io_enter(vp, &sn, &dev); if (error) goto out; error = cdev_write(dev, uio, ap->a_ioflag); spec_io_exit(vp, sn); out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return error; case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); if (uio->uio_resid == 0) return 0; if (uio->uio_offset < 0) return EINVAL; if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0) bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE); else bsize = BLKDEV_IOSIZE; bscale = bsize >> DEV_BSHIFT; do { bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1); on = uio->uio_offset % bsize; n = uimin((unsigned)(bsize - on), uio->uio_resid); if (n == bsize) bp = getblk(vp, bn, bsize, 0, 0); else error = bread(vp, bn, bsize, B_MODIFY, &bp); if (error) { return error; } n = uimin(n, bsize - bp->b_resid); error = uiomove((char *)bp->b_data + on, n, uio); if (error) brelse(bp, 0); else { if (n + on == bsize) bawrite(bp); else bdwrite(bp); error = bp->b_error; } } while (error == 0 && uio->uio_resid > 0 && n != 0); return error; default: panic("spec_write type"); } /* NOTREACHED */ } /* * fdiscard, which on disk devices becomes TRIM. */ int spec_fdiscard(void *v) { struct vop_fdiscard_args /* { struct vnode *a_vp; off_t a_pos; off_t a_len; } */ *ap = v; struct vnode *vp = ap->a_vp; dev_t dev; KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); dev = vp->v_rdev; switch (vp->v_type) { case VCHR: #if 0 /* This is not stored for character devices. */ KASSERT(vp == vp->v_specnode->sn_dev->sd_cdevvp); #endif return cdev_discard(dev, ap->a_pos, ap->a_len); case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); return bdev_discard(dev, ap->a_pos, ap->a_len); default: panic("spec_fdiscard: not a device\n"); } } /* * Device ioctl operation. 
*/ /* ARGSUSED */ int spec_ioctl(void *v) { struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; void *a_data; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct specnode *sn; dev_t dev; int error; error = spec_io_enter(vp, &sn, &dev); if (error) return error; switch (vp->v_type) { case VCHR: error = cdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); break; case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); error = bdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); break; default: panic("spec_ioctl"); /* NOTREACHED */ } spec_io_exit(vp, sn); return error; } /* ARGSUSED */ int spec_poll(void *v) { struct vop_poll_args /* { struct vnode *a_vp; int a_events; } */ *ap = v; struct vnode *vp = ap->a_vp; struct specnode *sn; dev_t dev; int revents; if (spec_io_enter(vp, &sn, &dev) != 0) return POLLERR; switch (vp->v_type) { case VCHR: revents = cdev_poll(dev, ap->a_events, curlwp); break; default: revents = genfs_poll(v); break; } spec_io_exit(vp, sn); return revents; } /* ARGSUSED */ int spec_kqfilter(void *v) { struct vop_kqfilter_args /* { struct vnode *a_vp; struct proc *a_kn; } */ *ap = v; struct vnode *vp = ap->a_vp; struct specnode *sn; dev_t dev; int error; error = spec_io_enter(vp, &sn, &dev); if (error) return error; switch (vp->v_type) { case VCHR: error = cdev_kqfilter(dev, ap->a_kn); break; default: /* * Block devices don't support kqfilter, and refuse it * for any other files (like those vflush()ed) too. */ error = EOPNOTSUPP; break; } spec_io_exit(vp, sn); return error; } /* * Allow mapping of only D_DISK. This is called only for VBLK. */ int spec_mmap(void *v) { struct vop_mmap_args /* { struct vnode *a_vp; vm_prot_t a_prot; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct specnode *sn; dev_t dev; int error; KASSERT(vp->v_type == VBLK); error = spec_io_enter(vp, &sn, &dev); if (error) return error; error = bdev_type(dev) == D_DISK ? 
0 : EINVAL; spec_io_exit(vp, sn); return 0; } /* * Synch buffers associated with a block device */ /* ARGSUSED */ int spec_fsync(void *v) { struct vop_fsync_args /* { struct vnode *a_vp; kauth_cred_t a_cred; int a_flags; off_t offlo; off_t offhi; } */ *ap = v; struct vnode *vp = ap->a_vp; struct mount *mp; int error; if (vp->v_type == VBLK) { if ((mp = spec_node_getmountedfs(vp)) != NULL) { error = VFS_FSYNC(mp, vp, ap->a_flags); if (error != EOPNOTSUPP) return error; } return vflushbuf(vp, ap->a_flags); } return 0; } /* * Just call the device strategy routine */ int spec_strategy(void *v) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap = v; struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; struct specnode *sn = NULL; dev_t dev; int error; error = spec_io_enter(vp, &sn, &dev); if (error) goto out; bp->b_dev = dev; if (!(bp->b_flags & B_READ)) { #ifdef DIAGNOSTIC if (bp->b_vp && bp->b_vp->v_type == VBLK) { struct mount *mp = spec_node_getmountedfs(bp->b_vp); if (mp && (mp->mnt_flag & MNT_RDONLY)) { printf("%s blk %"PRId64" written while ro!\n", mp->mnt_stat.f_mntonname, bp->b_blkno); #ifdef DDB db_stacktrace(); #endif } } #endif /* DIAGNOSTIC */ error = fscow_run(bp, false); if (error) goto out; } bdev_strategy(bp); error = 0; out: if (sn) spec_io_exit(vp, sn); if (error) { bp->b_error = error; bp->b_resid = bp->b_bcount; biodone(bp); } return error; } int spec_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; struct bool *a_recycle; } */ *ap = v; KASSERT(ap->a_vp->v_mount == dead_rootmount); *ap->a_recycle = true; return 0; } int spec_reclaim(void *v) { struct vop_reclaim_v2_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; KASSERT(vp->v_specnode->sn_opencnt == 0); VOP_UNLOCK(vp); KASSERT(vp->v_mount == dead_rootmount); return 0; } /* * This is a noop, simply returning what one has been given. */ int spec_bmap(void *v) { struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; } */ *ap = v; if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = (MAXBSIZE >> DEV_BSHIFT) - 1; return 0; } /* * Device close routine */ /* ARGSUSED */ int spec_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct session *sess; dev_t dev; int flags = ap->a_fflag; int mode, error, count; specnode_t *sn; specdev_t *sd; KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); mutex_enter(vp->v_interlock); sn = vp->v_specnode; dev = vp->v_rdev; sd = sn->sn_dev; /* * If we're going away soon, make this non-blocking. * Also ensures that we won't wedge in vn_lock below. */ if (vdead_check(vp, VDEAD_NOWAIT) != 0) flags |= FNONBLOCK; mutex_exit(vp->v_interlock); switch (vp->v_type) { case VCHR: /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. We * cannot easily tell that a character device is a * controlling terminal, unless it is the closing * process' controlling terminal. In that case, if the * open count is 1 release the reference from the * session. Also, remove the link from the tty back to * the session and pgrp. * * XXX V. fishy. 
*/ mutex_enter(&proc_lock); sess = curlwp->l_proc->p_session; if (sn->sn_opencnt == 1 && vp == sess->s_ttyvp) { mutex_spin_enter(&tty_lock); sess->s_ttyvp = NULL; if (sess->s_ttyp->t_session != NULL) { sess->s_ttyp->t_pgrp = NULL; sess->s_ttyp->t_session = NULL; mutex_spin_exit(&tty_lock); /* Releases proc_lock. */ proc_sessrele(sess); } else { mutex_spin_exit(&tty_lock); if (sess->s_ttyp->t_pgrp != NULL) panic("spec_close: spurious pgrp ref"); mutex_exit(&proc_lock); } vrele(vp); } else mutex_exit(&proc_lock); /* * If the vnode is locked, then we are in the midst * of forcably closing the device, otherwise we only * close on last reference. */ mode = S_IFCHR; break; case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); /* * On last close of a block device (that isn't mounted) * we must invalidate any in core blocks, so that * we can, for instance, change floppy disks. */ error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0); if (error) return error; /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ mode = S_IFBLK; break; default: panic("spec_close: not special"); } /* * Decrement the open reference count of this node and the * device. For block devices, the open reference count must be * 1 at this point. If the device's open reference count goes * to zero, we're the last one out so get the lights. * * We may find --sd->sd_opencnt gives zero, and yet * sd->sd_opened is false. This happens if the vnode is * revoked at the same time as it is being opened, which can * happen when opening a tty blocks indefinitely. In that * case, we still must call close -- it is the job of close to * interrupt the open. Either way, the device will be no * longer opened, so we have to clear sd->sd_opened; subsequent * opens will have responsibility for issuing close. * * This has the side effect that the sequence of opens might * happen out of order -- we might end up doing open, open, * close, close, instead of open, close, open, close. This is * unavoidable with the current devsw API, where open is * allowed to block and close must be able to run concurrently * to interrupt it. It is the driver's responsibility to * ensure that close is idempotent so that this works. Drivers * requiring per-open state and exact 1:1 correspondence * between open and close can use fd_clone. */ mutex_enter(&device_lock); KASSERT(sn->sn_opencnt); KASSERT(sd->sd_opencnt); KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, "sn_opencnt=%u > sd_opencnt=%u", sn->sn_opencnt, sd->sd_opencnt); sn->sn_opencnt--; count = --sd->sd_opencnt; if (vp->v_type == VBLK) { KASSERTMSG(count == 0, "block device with %u opens", count + 1); sd->sd_bdevvp = NULL; } if (count == 0) { KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u", sn->sn_opencnt); KASSERT(!sd->sd_closing); sd->sd_opened = false; sd->sd_closing = true; } mutex_exit(&device_lock); if (count != 0) return 0; /* * If we're able to block, release the vnode lock & reacquire. We * might end up sleeping for someone else who wants our queues. They * won't get them if we hold the vnode locked. */ if (!(flags & FNONBLOCK)) VOP_UNLOCK(vp); /* * If we can cancel all outstanding I/O, then wait for it to * drain before we call .d_close. 
Drivers that split up * .d_cancel and .d_close this way need not have any internal * mechanism for waiting in .d_close for I/O to drain. */ if (vp->v_type == VBLK) error = bdev_cancel(dev, flags, mode, curlwp); else error = cdev_cancel(dev, flags, mode, curlwp); if (error == 0) spec_io_drain(sd); else KASSERTMSG(error == ENODEV, "cancel dev=0x%lx failed with %d", (unsigned long)dev, error); if (vp->v_type == VBLK) error = bdev_close(dev, flags, mode, curlwp); else error = cdev_close(dev, flags, mode, curlwp); /* * Wait for all other devsw operations to drain. After this * point, no bdev/cdev_* can be active for this specdev. */ spec_io_drain(sd); /* * Wake any spec_open calls waiting for close to finish -- do * this before reacquiring the vnode lock, because spec_open * holds the vnode lock while waiting, so doing this after * reacquiring the lock would deadlock. */ mutex_enter(&device_lock); KASSERT(!sd->sd_opened); KASSERT(sd->sd_closing); sd->sd_closing = false; cv_broadcast(&specfs_iocv); mutex_exit(&device_lock); if (!(flags & FNONBLOCK)) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return error; } /* * Print out the contents of a special device vnode. */ int spec_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; printf("dev %llu, %llu\n", (unsigned long long)major(ap->a_vp->v_rdev), (unsigned long long)minor(ap->a_vp->v_rdev)); return 0; } /* * Return POSIX pathconf information applicable to special devices. */ int spec_pathconf(void *v) { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return 0; case _PC_MAX_CANON: *ap->a_retval = MAX_CANON; return 0; case _PC_MAX_INPUT: *ap->a_retval = MAX_INPUT; return 0; case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return 0; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return 0; case _PC_VDISABLE: *ap->a_retval = _POSIX_VDISABLE; return 0; case _PC_SYNC_IO: *ap->a_retval = 1; return 0; default: return genfs_pathconf(ap); } /* NOTREACHED */ } /* * Advisory record locking support. */ int spec_advlock(void *v) { struct vop_advlock_args /* { struct vnode *a_vp; void *a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap = v; struct vnode *vp = ap->a_vp; return lf_advlock(ap, &vp->v_speclockf, (off_t)0); }
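/*
 * Self-contained userspace analogue, not kernel code: it sketches the
 * I/O-reference pattern used by spec_io_enter/spec_io_exit/spec_io_drain
 * above -- acquire increments a counter unless it is saturated, release
 * wakes a waiter when the count drops to zero, and drain blocks until the
 * count reaches zero.  The kernel version uses atomic_cas_uint with
 * device_lock and the specfs_iocv condvar and only takes the lock on the
 * final decrement; this sketch substitutes C11 atomics and a pthread
 * mutex/condvar and takes the lock on every release for simplicity.  A
 * caller brackets each operation with io_enter()/io_exit() and calls
 * io_drain() from the teardown path once new io_enter calls are guaranteed
 * to fail.
 */
#include <limits.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint io_count;
static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;

static bool
io_enter(void)
{
	unsigned cnt = atomic_load(&io_count);

	/* Fail instead of wrapping if the count is saturated. */
	do {
		if (cnt == UINT_MAX)
			return false;
	} while (!atomic_compare_exchange_weak(&io_count, &cnt, cnt + 1));
	return true;
}

static void
io_exit(void)
{
	/* Decrement under the lock so a drainer cannot miss the wakeup. */
	pthread_mutex_lock(&io_lock);
	if (atomic_fetch_sub(&io_count, 1) == 1)
		pthread_cond_broadcast(&io_cv);
	pthread_mutex_unlock(&io_lock);
}

static void
io_drain(void)
{
	/* Wait until every outstanding io_enter has been matched by io_exit. */
	pthread_mutex_lock(&io_lock);
	while (atomic_load(&io_count) > 0)
		pthread_cond_wait(&io_cv, &io_lock);
	pthread_mutex_unlock(&io_lock);
}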
/* $NetBSD: dbregs.c,v 1.15 2020/01/31 08:55:38 maxv Exp $ */

/*
 * Copyright (c) 2016 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/param.h> #include <sys/types.h> #include <sys/lwp.h> #include <sys/pool.h> #include <x86/cpufunc.h> #include <x86/dbregs.h> #include <uvm/uvm_prot.h> #include <uvm/uvm_pmap.h> #include <machine/pmap.h> struct pool x86_dbregspl; static struct dbreg initdbstate; #define X86_BREAKPOINT_CONDITION_DETECTED ( \ X86_DR6_DR0_BREAKPOINT_CONDITION_DETECTED | \ X86_DR6_DR1_BREAKPOINT_CONDITION_DETECTED | \ X86_DR6_DR2_BREAKPOINT_CONDITION_DETECTED | \ X86_DR6_DR3_BREAKPOINT_CONDITION_DETECTED ) #define X86_GLOBAL_BREAKPOINT ( \ X86_DR7_GLOBAL_DR0_BREAKPOINT | \ X86_DR7_GLOBAL_DR1_BREAKPOINT | \ X86_DR7_GLOBAL_DR2_BREAKPOINT | \ X86_DR7_GLOBAL_DR3_BREAKPOINT ) void x86_dbregs_init(void) { /* DR0-DR3 should always be 0 */ initdbstate.dr[0] = rdr0(); initdbstate.dr[1] = rdr1(); initdbstate.dr[2] = rdr2(); initdbstate.dr[3] = rdr3(); /* DR4-DR5 are reserved - skip */ /* DR6 and DR7 contain predefined nonzero bits */ initdbstate.dr[6] = rdr6(); initdbstate.dr[7] = rdr7(); /* DR8-DR15 are reserved - skip */ /* * Explicitly reset some bits just in case they could be * set by brave software/hardware before the kernel boot. */ initdbstate.dr[6] &= ~X86_BREAKPOINT_CONDITION_DETECTED; initdbstate.dr[7] &= ~X86_DR7_GENERAL_DETECT_ENABLE; pool_init(&x86_dbregspl, sizeof(struct dbreg), 16, 0, 0, "dbregs", NULL, IPL_NONE); } static void x86_dbregs_reset(void) { /* * It's sufficient to just disable Debug Control Register (DR7). * It will deactivate hardware watchpoints. */ ldr7(0); /* * However at some point we need to clear Debug Status Registers * (DR6). The CPU will never do it automatically. * * Clear BREAKPOINT_CONDITION_DETECTED bits and ignore the rest. */ ldr6(rdr6() & ~X86_BREAKPOINT_CONDITION_DETECTED); } void x86_dbregs_clear(struct lwp *l) { struct pcb *pcb = lwp_getpcb(l); struct dbreg *dbregs; KASSERT(l == curlwp); if (__predict_true(pcb->pcb_dbregs == NULL)) { KASSERT((pcb->pcb_flags & PCB_DBREGS) == 0); return; } dbregs = pcb->pcb_dbregs; kpreempt_disable(); pcb->pcb_dbregs = NULL; pcb->pcb_flags &= ~PCB_DBREGS; x86_dbregs_reset(); kpreempt_enable(); pool_put(&x86_dbregspl, dbregs); } void x86_dbregs_abandon(struct lwp *l) { struct pcb *pcb = lwp_getpcb(l); kpreempt_disable(); pcb->pcb_flags &= ~PCB_DBREGS; x86_dbregs_reset(); kpreempt_enable(); } void x86_dbregs_read(struct lwp *l, struct dbreg *regs) { struct pcb *pcb = lwp_getpcb(l); if (pcb->pcb_dbregs == NULL) { pcb->pcb_dbregs = pool_get(&x86_dbregspl, PR_WAITOK); memcpy(pcb->pcb_dbregs, &initdbstate, sizeof(initdbstate)); pcb->pcb_flags |= PCB_DBREGS; } memcpy(regs, pcb->pcb_dbregs, sizeof(*regs)); } void x86_dbregs_save(struct lwp *l) { struct pcb *pcb = lwp_getpcb(l); if (!(pcb->pcb_flags & PCB_DBREGS)) { return; } KASSERT(pcb->pcb_dbregs != NULL); pcb->pcb_dbregs->dr[0] = rdr0(); pcb->pcb_dbregs->dr[1] = rdr1(); pcb->pcb_dbregs->dr[2] = rdr2(); pcb->pcb_dbregs->dr[3] = rdr3(); pcb->pcb_dbregs->dr[6] = rdr6(); pcb->pcb_dbregs->dr[7] = rdr7(); } void x86_dbregs_restore(struct lwp *l) { struct pcb *pcb = lwp_getpcb(l); if (!(pcb->pcb_flags & PCB_DBREGS)) { return; } KASSERT(pcb->pcb_dbregs != NULL); ldr0(pcb->pcb_dbregs->dr[0]); ldr1(pcb->pcb_dbregs->dr[1]); ldr2(pcb->pcb_dbregs->dr[2]); ldr3(pcb->pcb_dbregs->dr[3]); ldr6(pcb->pcb_dbregs->dr[6]); ldr7(pcb->pcb_dbregs->dr[7]); } void x86_dbregs_store_dr6(struct lwp *l) { struct pcb *pcb = lwp_getpcb(l); KASSERT(l == curlwp); KASSERT(pcb->pcb_dbregs != NULL); pcb->pcb_dbregs->dr[6] = rdr6(); } int x86_dbregs_user_trap(void) { register_t dr7, dr6; register_t bp; dr7 = rdr7(); if 
((dr7 & X86_GLOBAL_BREAKPOINT) == 0) { /* * All Global Breakpoint bits are zero, thus the trap couldn't * have been caused by the hardware debug registers. */ return 0; } dr6 = rdr6(); bp = dr6 & X86_BREAKPOINT_CONDITION_DETECTED; if (!bp) { /* * None of the breakpoint bits are set, meaning this * trap was not caused by any of the debug registers. */ return 0; } /* * At least one of the breakpoints was hit, check to see * which ones and if any of them are user space addresses. */ if (bp & X86_DR6_DR0_BREAKPOINT_CONDITION_DETECTED) if (rdr0() < (vaddr_t)VM_MAXUSER_ADDRESS) return 1; if (bp & X86_DR6_DR1_BREAKPOINT_CONDITION_DETECTED) if (rdr1() < (vaddr_t)VM_MAXUSER_ADDRESS) return 1; if (bp & X86_DR6_DR2_BREAKPOINT_CONDITION_DETECTED) if (rdr2() < (vaddr_t)VM_MAXUSER_ADDRESS) return 1; if (bp & X86_DR6_DR3_BREAKPOINT_CONDITION_DETECTED) if (rdr3() < (vaddr_t)VM_MAXUSER_ADDRESS) return 1; return 0; } int x86_dbregs_validate(const struct dbreg *regs) { size_t i; /* Check that DR0-DR3 contain user-space address */ for (i = 0; i < X86_DBREGS; i++) { if (regs->dr[i] >= (vaddr_t)VM_MAXUSER_ADDRESS) return EINVAL; } #ifndef i386 if (regs->dr[6] & X86_DR6_MBZ) { return EINVAL; } if (regs->dr[7] & X86_DR7_MBZ) { return EINVAL; } #endif if (regs->dr[7] & X86_DR7_GENERAL_DETECT_ENABLE) { return EINVAL; } /* * Skip checks for reserved registers (DR4-DR5, DR8-DR15). */ return 0; } void x86_dbregs_write(struct lwp *l, const struct dbreg *regs) { struct pcb *pcb = lwp_getpcb(l); if (pcb->pcb_dbregs == NULL) { pcb->pcb_dbregs = pool_get(&x86_dbregspl, PR_WAITOK); } memcpy(pcb->pcb_dbregs, regs, sizeof(*regs)); pcb->pcb_flags |= PCB_DBREGS; } /* * Called with preemption disabled. */ void x86_dbregs_switch(struct lwp *oldlwp, struct lwp *newlwp) { struct pcb *oldpcb, *newpcb; bool olddb, newdb; oldpcb = lwp_getpcb(oldlwp); newpcb = lwp_getpcb(newlwp); olddb = (oldpcb->pcb_flags & PCB_DBREGS) != 0; newdb = (newpcb->pcb_flags & PCB_DBREGS) != 0; if (__predict_true(!olddb && !newdb)) { /* fast path */ return; } if (olddb) { x86_dbregs_save(oldlwp); } if (newdb) { x86_dbregs_restore(newlwp); } else if (olddb) { x86_dbregs_reset(); } }
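/*
 * Illustrative sketch, not part of the original source: the expected calling
 * pattern for installing user-supplied debug register state, e.g. from a
 * ptrace-style set-registers request.  example_set_dbregs() is a hypothetical
 * wrapper; x86_dbregs_validate() and x86_dbregs_write() are the functions
 * defined above.
 */
#if 0	/* example only */
static int
example_set_dbregs(struct lwp *l, const struct dbreg *regs)
{
	int error;

	/* Reject DR0-DR3 outside user space, MBZ bits, GD enable, etc. */
	error = x86_dbregs_validate(regs);
	if (error)
		return error;

	/* Attach a dbreg area to the PCB (if needed) and mark it active. */
	x86_dbregs_write(l, regs);

	return 0;
}
#endif	/* example only */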
/* $NetBSD: ptyfs_vnops.c,v 1.69 2022/08/05 10:36:02 riastradh Exp $ */

/*
 * Copyright (c) 1993, 1995
 * The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95
 */

/*
 * Copyright (c) 1993 Jan-Simon Pendry
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95 */ /* * ptyfs vnode interface */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ptyfs_vnops.c,v 1.69 2022/08/05 10:36:02 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/select.h> #include <sys/dirent.h> #include <sys/resourcevar.h> #include <sys/stat.h> #include <sys/conf.h> #include <sys/tty.h> #include <sys/pty.h> #include <sys/kauth.h> #include <uvm/uvm_extern.h> /* for PAGE_SIZE */ #include <machine/reg.h> #include <fs/ptyfs/ptyfs.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> MALLOC_DECLARE(M_PTYFSTMP); /* * Vnode Operations. * */ int ptyfs_lookup (void *); int ptyfs_open (void *); int ptyfs_close (void *); int ptyfs_access (void *); int ptyfs_getattr (void *); int ptyfs_setattr (void *); int ptyfs_read (void *); int ptyfs_write (void *); int ptyfs_ioctl (void *); int ptyfs_poll (void *); int ptyfs_kqfilter (void *); int ptyfs_readdir (void *); int ptyfs_reclaim (void *); int ptyfs_inactive (void *); int ptyfs_print (void *); int ptyfs_pathconf (void *); int ptyfs_advlock (void *); static int ptyfs_update(struct vnode *, const struct timespec *, const struct timespec *, int); static int ptyfs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t, struct lwp *); static int ptyfs_chmod(struct vnode *, mode_t, kauth_cred_t, struct lwp *); static int atoi(const char *, size_t); /* * ptyfs vnode operations. 
*/ int (**ptyfs_vnodeop_p)(void *); const struct vnodeopv_entry_desc ptyfs_vnodeop_entries[] = { { &vop_default_desc, vn_default_error }, { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ { &vop_lookup_desc, ptyfs_lookup }, /* lookup */ { &vop_create_desc, genfs_eopnotsupp }, /* create */ { &vop_mknod_desc, genfs_eopnotsupp }, /* mknod */ { &vop_open_desc, ptyfs_open }, /* open */ { &vop_close_desc, ptyfs_close }, /* close */ { &vop_access_desc, ptyfs_access }, /* access */ { &vop_accessx_desc, genfs_accessx }, /* accessx */ { &vop_getattr_desc, ptyfs_getattr }, /* getattr */ { &vop_setattr_desc, ptyfs_setattr }, /* setattr */ { &vop_read_desc, ptyfs_read }, /* read */ { &vop_write_desc, ptyfs_write }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */ { &vop_ioctl_desc, ptyfs_ioctl }, /* ioctl */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_poll_desc, ptyfs_poll }, /* poll */ { &vop_kqfilter_desc, ptyfs_kqfilter }, /* kqfilter */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_mmap_desc, genfs_eopnotsupp }, /* mmap */ { &vop_fsync_desc, genfs_nullop }, /* fsync */ { &vop_seek_desc, genfs_nullop }, /* seek */ { &vop_remove_desc, genfs_eopnotsupp }, /* remove */ { &vop_link_desc, genfs_eopnotsupp }, /* link */ { &vop_rename_desc, genfs_eopnotsupp }, /* rename */ { &vop_mkdir_desc, genfs_eopnotsupp }, /* mkdir */ { &vop_rmdir_desc, genfs_eopnotsupp }, /* rmdir */ { &vop_symlink_desc, genfs_eopnotsupp }, /* symlink */ { &vop_readdir_desc, ptyfs_readdir }, /* readdir */ { &vop_readlink_desc, genfs_eopnotsupp }, /* readlink */ { &vop_abortop_desc, genfs_abortop }, /* abortop */ { &vop_inactive_desc, ptyfs_inactive }, /* inactive */ { &vop_reclaim_desc, ptyfs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, genfs_eopnotsupp }, /* bmap */ { &vop_strategy_desc, genfs_badop }, /* strategy */ { &vop_print_desc, ptyfs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, ptyfs_pathconf }, /* pathconf */ { &vop_advlock_desc, ptyfs_advlock }, /* advlock */ { &vop_bwrite_desc, genfs_eopnotsupp }, /* bwrite */ { &vop_putpages_desc, genfs_null_putpages }, /* putpages */ { NULL, NULL } }; const struct vnodeopv_desc ptyfs_vnodeop_opv_desc = { &ptyfs_vnodeop_p, ptyfs_vnodeop_entries }; /* * free any private data and remove the node * from any private lists. */ int ptyfs_reclaim(void *v) { struct vop_reclaim_v2_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; VOP_UNLOCK(vp); vp->v_data = NULL; return 0; } int ptyfs_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; bool *a_recycle; } */ *ap = v; struct vnode *vp = ap->a_vp; struct ptyfsnode *ptyfs = VTOPTYFS(vp); if (ptyfs->ptyfs_type == PTYFSptc) ptyfs_clr_active(vp->v_mount, ptyfs->ptyfs_pty); return 0; } /* * Return POSIX pathconf information applicable to special devices. 
*/ int ptyfs_pathconf(void *v) { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return 0; case _PC_MAX_CANON: *ap->a_retval = MAX_CANON; return 0; case _PC_MAX_INPUT: *ap->a_retval = MAX_INPUT; return 0; case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return 0; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return 0; case _PC_VDISABLE: *ap->a_retval = _POSIX_VDISABLE; return 0; case _PC_SYNC_IO: *ap->a_retval = 1; return 0; default: return genfs_pathconf(ap); } } /* * _print is used for debugging. * just print a readable description * of (vp). */ int ptyfs_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; struct ptyfsnode *ptyfs = VTOPTYFS(ap->a_vp); printf("tag VT_PTYFS, type %d, pty %d\n", ptyfs->ptyfs_type, ptyfs->ptyfs_pty); return 0; } /* * support advisory locking on pty nodes */ int ptyfs_advlock(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; struct ptyfsnode *ptyfs = VTOPTYFS(ap->a_vp); switch (ptyfs->ptyfs_type) { case PTYFSpts: case PTYFSptc: return spec_advlock(v); default: return EOPNOTSUPP; } } /* * Invent attributes for ptyfsnode (vp) and store * them in (vap). * Directories lengths are returned as zero since * any real length would require the genuine size * to be computed, and nothing cares anyway. * * this is relatively minimal for ptyfs. */ int ptyfs_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; struct ptyfsnode *ptyfs = VTOPTYFS(ap->a_vp); struct vattr *vap = ap->a_vap; PTYFS_ITIMES(ptyfs, NULL, NULL, NULL); /* start by zeroing out the attributes */ vattr_null(vap); /* next do all the common fields */ vap->va_type = ap->a_vp->v_type; vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0]; vap->va_fileid = ptyfs->ptyfs_fileno; vap->va_gen = 0; vap->va_flags = 0; vap->va_blocksize = PAGE_SIZE; vap->va_atime = ptyfs->ptyfs_atime; vap->va_mtime = ptyfs->ptyfs_mtime; vap->va_ctime = ptyfs->ptyfs_ctime; vap->va_birthtime = ptyfs->ptyfs_birthtime; vap->va_mode = ptyfs->ptyfs_mode; vap->va_flags = ptyfs->ptyfs_flags; vap->va_uid = ptyfs->ptyfs_uid; vap->va_gid = ptyfs->ptyfs_gid; switch (ptyfs->ptyfs_type) { case PTYFSpts: case PTYFSptc: if (pty_isfree(ptyfs->ptyfs_pty, 1)) return ENOENT; vap->va_bytes = vap->va_size = 0; vap->va_rdev = ap->a_vp->v_rdev; vap->va_nlink = 1; break; case PTYFSroot: vap->va_rdev = 0; vap->va_bytes = vap->va_size = DEV_BSIZE; vap->va_nlink = 2; break; default: return EOPNOTSUPP; } return 0; } /*ARGSUSED*/ int ptyfs_setattr(void *v) { struct vop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct ptyfsnode *ptyfs = VTOPTYFS(vp); struct vattr *vap = ap->a_vap; kauth_cred_t cred = ap->a_cred; struct lwp *l = curlwp; int error; kauth_action_t action = KAUTH_VNODE_WRITE_FLAGS; bool changing_sysflags = false; if (vap->va_size != VNOVALSIZE) { switch (ptyfs->ptyfs_type) { case PTYFSroot: return EISDIR; case PTYFSpts: case PTYFSptc: break; default: return EINVAL; } } if (vap->va_flags != VNOVALFLAGS) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable and append-only flags are not supported on ptyfs. 
*/ if (vap->va_flags & (IMMUTABLE | APPEND)) return EINVAL; /* Snapshot flag cannot be set or cleared */ if ((vap->va_flags & SF_SNAPSHOT) != (ptyfs->ptyfs_flags & SF_SNAPSHOT)) return EPERM; if ((ptyfs->ptyfs_flags & SF_SETTABLE) != (vap->va_flags & SF_SETTABLE)) { changing_sysflags = true; action |= KAUTH_VNODE_WRITE_SYSFLAGS; } error = kauth_authorize_vnode(cred, action, vp, NULL, genfs_can_chflags(vp, cred, ptyfs->ptyfs_uid, changing_sysflags)); if (error) return error; if (changing_sysflags) { ptyfs->ptyfs_flags = vap->va_flags; } else { ptyfs->ptyfs_flags &= SF_SETTABLE; ptyfs->ptyfs_flags |= (vap->va_flags & UF_SETTABLE); } ptyfs->ptyfs_status |= PTYFS_CHANGE; } /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; error = ptyfs_chown(vp, vap->va_uid, vap->va_gid, cred, l); if (error) return error; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; if ((ptyfs->ptyfs_flags & SF_SNAPSHOT) != 0) return EPERM; error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, NULL, genfs_can_chtimes(vp, cred, ptyfs->ptyfs_uid, vap->va_vaflags)); if (error) return (error); if (vap->va_atime.tv_sec != VNOVAL) if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) ptyfs->ptyfs_status |= PTYFS_ACCESS; if (vap->va_mtime.tv_sec != VNOVAL) { ptyfs->ptyfs_status |= PTYFS_CHANGE | PTYFS_MODIFY; if (vp->v_mount->mnt_flag & MNT_RELATIME) ptyfs->ptyfs_status |= PTYFS_ACCESS; } if (vap->va_birthtime.tv_sec != VNOVAL) ptyfs->ptyfs_birthtime = vap->va_birthtime; ptyfs->ptyfs_status |= PTYFS_CHANGE; error = ptyfs_update(vp, &vap->va_atime, &vap->va_mtime, 0); if (error) return error; } if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; if ((ptyfs->ptyfs_flags & SF_SNAPSHOT) != 0 && (vap->va_mode & (S_IXUSR|S_IWUSR|S_IXGRP|S_IWGRP|S_IXOTH|S_IWOTH))) return EPERM; error = ptyfs_chmod(vp, vap->va_mode, cred, l); if (error) return error; } return 0; } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ptyfs_chmod(struct vnode *vp, mode_t mode, kauth_cred_t cred, struct lwp *l) { struct ptyfsnode *ptyfs = VTOPTYFS(vp); int error; error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp, NULL, genfs_can_chmod(vp, cred, ptyfs->ptyfs_uid, ptyfs->ptyfs_gid, mode)); if (error) return (error); ptyfs->ptyfs_mode &= ~ALLPERMS; ptyfs->ptyfs_mode |= (mode & ALLPERMS); return 0; } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ptyfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, struct lwp *l) { struct ptyfsnode *ptyfs = VTOPTYFS(vp); int error; if (uid == (uid_t)VNOVAL) uid = ptyfs->ptyfs_uid; if (gid == (gid_t)VNOVAL) gid = ptyfs->ptyfs_gid; error = kauth_authorize_vnode(cred, KAUTH_VNODE_CHANGE_OWNERSHIP, vp, NULL, genfs_can_chown(vp, cred, ptyfs->ptyfs_uid, ptyfs->ptyfs_gid, uid, gid)); if (error) return (error); ptyfs->ptyfs_gid = gid; ptyfs->ptyfs_uid = uid; return 0; } /* * implement access checking. * * actually, the check for super-user is slightly * broken since it will allow read access to write-only * objects. this doesn't cause any particular trouble * but does mean that the i/o entry points need to check * that the operation really does make sense. 
*/ int ptyfs_access(void *v) { struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; kauth_cred_t a_cred; } */ *ap = v; struct vattr va; int error; if ((error = VOP_GETATTR(ap->a_vp, &va, ap->a_cred)) != 0) return error; return kauth_authorize_vnode(ap->a_cred, KAUTH_ACCESS_ACTION(ap->a_accmode, ap->a_vp->v_type, va.va_mode), ap->a_vp, NULL, genfs_can_access(ap->a_vp, ap->a_cred, va.va_uid, va.va_gid, va.va_mode, NULL, ap->a_accmode)); } /* * lookup. this is incredibly complicated in the * general case, however for most pseudo-filesystems * very little needs to be done. * * Locking isn't hard here, just poorly documented. * * If we're looking up ".", just vref the parent & return it. * * If we're looking up "..", unlock the parent, and lock "..". If everything * went ok, try to re-lock the parent. We do this to prevent lock races. * * For anything else, get the needed node. * * We try to exit with the parent locked in error cases. */ int ptyfs_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap = v; struct componentname *cnp = ap->a_cnp; struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; const char *pname = cnp->cn_nameptr; struct ptyfsnode *ptyfs; int pty, error; *vpp = NULL; if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) return EROFS; if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; vref(dvp); return 0; } ptyfs = VTOPTYFS(dvp); switch (ptyfs->ptyfs_type) { case PTYFSroot: /* * Shouldn't get here with .. in the root node. */ if (cnp->cn_flags & ISDOTDOT) return EIO; pty = atoi(pname, cnp->cn_namelen); if (pty < 0 || ptyfs_next_active(dvp->v_mount, pty) != pty) break; error = ptyfs_allocvp(dvp->v_mount, vpp, PTYFSpts, pty); if (error) return error; if (ptyfs_next_active(dvp->v_mount, pty) != pty) { vrele(*vpp); *vpp = NULL; return ENOENT; } return 0; default: return ENOTDIR; } return cnp->cn_nameiop == LOOKUP ? ENOENT : EROFS; } /* * readdir returns directory entries from ptyfsnode (vp). * * the strategy here with ptyfs is to generate a single * directory entry at a time (struct dirent) and then * copy that out to userland using uiomove. a more efficient * though more complex implementation, would try to minimize * the number of calls to uiomove(). for ptyfs, this is * hardly worth the added code complexity. * * this should just be done through read() */ int ptyfs_readdir(void *v) { struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; int *a_eofflag; off_t **a_cookies; int *a_ncookies; } */ *ap = v; struct uio *uio = ap->a_uio; struct dirent *dp; struct ptyfsnode *ptyfs; off_t i; int error; off_t *cookies = NULL; int ncookies; struct vnode *vp; int n, nc = 0; vp = ap->a_vp; ptyfs = VTOPTYFS(vp); if (uio->uio_resid < UIO_MX) return EINVAL; if (uio->uio_offset < 0) return EINVAL; dp = malloc(sizeof(struct dirent), M_PTYFSTMP, M_WAITOK | M_ZERO); error = 0; i = uio->uio_offset; dp->d_reclen = UIO_MX; ncookies = uio->uio_resid / UIO_MX; if (ptyfs->ptyfs_type != PTYFSroot) { error = ENOTDIR; goto out; } if (i >= npty) goto out; if (ap->a_ncookies) { ncookies = uimin(ncookies, (npty + 2 - i)); cookies = malloc(ncookies * sizeof (off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } for (; i < 2 && uio->uio_resid >= UIO_MX; i++) { /* `.' and/or `..' 
*/ dp->d_fileno = PTYFS_FILENO(PTYFSroot, 0); dp->d_namlen = i + 1; (void)memcpy(dp->d_name, "..", dp->d_namlen); dp->d_name[i + 1] = '\0'; dp->d_type = DT_DIR; if ((error = uiomove(dp, UIO_MX, uio)) != 0) goto out; if (cookies) *cookies++ = i + 1; nc++; } while (uio->uio_resid >= UIO_MX) { /* check for used ptys */ n = ptyfs_next_active(vp->v_mount, i - 2); if (n < 0) break; dp->d_fileno = PTYFS_FILENO(PTYFSpts, n); dp->d_namlen = snprintf(dp->d_name, sizeof(dp->d_name), "%lld", (long long)(n)); dp->d_type = DT_CHR; if ((error = uiomove(dp, UIO_MX, uio)) != 0) goto out; i = n + 3; if (cookies) *cookies++ = i; nc++; } out: /* not pertinent in error cases */ ncookies = nc; if (ap->a_ncookies) { if (error) { if (cookies) free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } else *ap->a_ncookies = ncookies; } uio->uio_offset = i; free(dp, M_PTYFSTMP); return error; } int ptyfs_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct ptyfsnode *ptyfs = VTOPTYFS(vp); switch (ptyfs->ptyfs_type) { case PTYFSpts: case PTYFSptc: return spec_open(v); case PTYFSroot: return 0; default: return EINVAL; } } int ptyfs_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct ptyfsnode *ptyfs = VTOPTYFS(vp); mutex_enter(vp->v_interlock); if (vrefcnt(vp) > 1) PTYFS_ITIMES(ptyfs, NULL, NULL, NULL); mutex_exit(vp->v_interlock); switch (ptyfs->ptyfs_type) { case PTYFSpts: case PTYFSptc: return spec_close(v); case PTYFSroot: return 0; default: return EINVAL; } } int ptyfs_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; struct timespec ts; struct vnode *vp = ap->a_vp; struct ptyfsnode *ptyfs = VTOPTYFS(vp); int error; if (vp->v_type == VDIR) return EISDIR; ptyfs->ptyfs_status |= PTYFS_ACCESS; /* hardclock() resolution is good enough for ptyfs */ getnanotime(&ts); (void)ptyfs_update(vp, &ts, &ts, 0); switch (ptyfs->ptyfs_type) { case PTYFSpts: case PTYFSptc: VOP_UNLOCK(vp); error = cdev_read(vp->v_rdev, ap->a_uio, ap->a_ioflag); vn_lock(vp, LK_RETRY|LK_EXCLUSIVE); return error; default: return EOPNOTSUPP; } } int ptyfs_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; struct timespec ts; struct vnode *vp = ap->a_vp; struct ptyfsnode *ptyfs = VTOPTYFS(vp); int error; ptyfs->ptyfs_status |= PTYFS_MODIFY; getnanotime(&ts); (void)ptyfs_update(vp, &ts, &ts, 0); switch (ptyfs->ptyfs_type) { case PTYFSpts: case PTYFSptc: VOP_UNLOCK(vp); error = cdev_write(vp->v_rdev, ap->a_uio, ap->a_ioflag); vn_lock(vp, LK_RETRY|LK_EXCLUSIVE); return error; default: return EOPNOTSUPP; } } int ptyfs_ioctl(void *v) { struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; void *a_data; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct ptyfsnode *ptyfs = VTOPTYFS(vp); switch (ptyfs->ptyfs_type) { case PTYFSpts: case PTYFSptc: return cdev_ioctl(vp->v_rdev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); default: return EOPNOTSUPP; } } int ptyfs_poll(void *v) { struct vop_poll_args /* { struct vnode *a_vp; int a_events; } */ *ap = v; struct vnode *vp = ap->a_vp; struct ptyfsnode *ptyfs = VTOPTYFS(vp); switch (ptyfs->ptyfs_type) { case PTYFSpts: case PTYFSptc: return cdev_poll(vp->v_rdev, ap->a_events, curlwp); default: return 
genfs_poll(v); } } int ptyfs_kqfilter(void *v) { struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap = v; struct vnode *vp = ap->a_vp; struct ptyfsnode *ptyfs = VTOPTYFS(vp); switch (ptyfs->ptyfs_type) { case PTYFSpts: case PTYFSptc: return cdev_kqfilter(vp->v_rdev, ap->a_kn); default: return genfs_kqfilter(v); } } static int ptyfs_update(struct vnode *vp, const struct timespec *acc, const struct timespec *mod, int flags) { struct ptyfsnode *ptyfs = VTOPTYFS(vp); if (vp->v_mount->mnt_flag & MNT_RDONLY) return 0; PTYFS_ITIMES(ptyfs, acc, mod, NULL); return 0; } void ptyfs_itimes(struct ptyfsnode *ptyfs, const struct timespec *acc, const struct timespec *mod, const struct timespec *cre) { struct timespec now; KASSERT(ptyfs->ptyfs_status & (PTYFS_ACCESS|PTYFS_CHANGE|PTYFS_MODIFY)); getnanotime(&now); if (ptyfs->ptyfs_status & PTYFS_ACCESS) { if (acc == NULL) acc = &now; ptyfs->ptyfs_atime = *acc; } if (ptyfs->ptyfs_status & PTYFS_MODIFY) { if (mod == NULL) mod = &now; ptyfs->ptyfs_mtime = *mod; } if (ptyfs->ptyfs_status & PTYFS_CHANGE) { if (cre == NULL) cre = &now; ptyfs->ptyfs_ctime = *cre; } ptyfs->ptyfs_status &= ~(PTYFS_ACCESS|PTYFS_CHANGE|PTYFS_MODIFY); } /* * convert decimal ascii to int */ static int atoi(const char *b, size_t len) { int p = 0; while (len--) { char c = *b++; if (c < '0' || c > '9') return -1; p = 10 * p + (c - '0'); } return p; }
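/*
 * Illustrative notes, not part of the original source:
 *
 * - ptyfs_lookup() converts a root-directory name to a pty number with the
 *   local atoi() helper, e.g. atoi("42", 2) == 42 while atoi("4a", 2) == -1,
 *   so only plain decimal names can resolve to PTYFSpts nodes.
 *
 * - ptyfs_readdir() uses the convention that directory offsets 0 and 1 are
 *   "." and "..", and pty n is reported at offset n + 2, so resuming at
 *   offset i scans active ptys starting from i - 2.  With ptys 0, 3 and 7
 *   active, entries appear at offsets 2, 5 and 9, and the uio offset left
 *   behind after pty 7 is 10.
 */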
/* $NetBSD: prop_stack.c,v 1.3 2019/05/08 02:25:50 thorpej Exp $ */

/*-
 * Copyright (c) 2007 Joerg Sonnenberger <joerg@NetBSD.org>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
*/ #include "prop_object_impl.h" #include "prop_stack.h" void _prop_stack_init(prop_stack_t stack) { stack->used_intern_elems = 0; SLIST_INIT(&stack->extern_elems); } bool _prop_stack_push(prop_stack_t stack, prop_object_t obj, void *data1, void *data2, void *data3) { struct _prop_stack_extern_elem *eelem; struct _prop_stack_intern_elem *ielem; if (stack->used_intern_elems == PROP_STACK_INTERN_ELEMS) { eelem = _PROP_MALLOC(sizeof(*eelem), M_TEMP); if (eelem == NULL) return false; eelem->object = obj; eelem->object_data[0] = data1; eelem->object_data[1] = data2; eelem->object_data[2] = data3; SLIST_INSERT_HEAD(&stack->extern_elems, eelem, stack_link); return true; } _PROP_ASSERT(stack->used_intern_elems < PROP_STACK_INTERN_ELEMS); _PROP_ASSERT(SLIST_EMPTY(&stack->extern_elems)); ielem = &stack->intern_elems[stack->used_intern_elems]; ielem->object = obj; ielem->object_data[0] = data1; ielem->object_data[1] = data2; ielem->object_data[2] = data3; ++stack->used_intern_elems; return true; } bool _prop_stack_pop(prop_stack_t stack, prop_object_t *obj, void **data1, void **data2, void **data3) { struct _prop_stack_extern_elem *eelem; struct _prop_stack_intern_elem *ielem; if (stack->used_intern_elems == 0) return false; if ((eelem = SLIST_FIRST(&stack->extern_elems)) != NULL) { _PROP_ASSERT(stack->used_intern_elems == PROP_STACK_INTERN_ELEMS); SLIST_REMOVE_HEAD(&stack->extern_elems, stack_link); if (obj) *obj = eelem->object; if (data1) *data1 = eelem->object_data[0]; if (data2) *data2 = eelem->object_data[1]; if (data3) *data3 = eelem->object_data[2]; _PROP_FREE(eelem, M_TEMP); return true; } --stack->used_intern_elems; ielem = &stack->intern_elems[stack->used_intern_elems]; if (obj) *obj = ielem->object; if (data1) *data1 = ielem->object_data[0]; if (data2) *data2 = ielem->object_data[1]; if (data3) *data3 = ielem->object_data[2]; return true; }
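/*
 * Illustrative sketch, not part of the original source: typical use of the
 * stack above when iteratively walking a nested property-list object.
 * example_walk() and the meaning given to the three opaque data slots are
 * hypothetical; struct _prop_stack is assumed to be the type behind
 * prop_stack_t, as in this library's callers.
 */
#if 0	/* example only */
static void
example_walk(prop_object_t root)
{
	struct _prop_stack stack;
	prop_object_t obj;
	void *d1, *d2, *d3;

	_prop_stack_init(&stack);

	/* Save the current object before descending into a child... */
	if (!_prop_stack_push(&stack, root, NULL, NULL, NULL))
		return;		/* external element allocation failed */

	/* ...and pop saved frames until the stack is exhausted. */
	while (_prop_stack_pop(&stack, &obj, &d1, &d2, &d3)) {
		/* process obj, possibly pushing its children */
	}
}
#endif	/* example only */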
/* $NetBSD: route.h,v 1.134 2023/06/16 02:48:07 rin Exp $ */

/*
 * Copyright (c) 1980, 1986, 1993
 * The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.h 8.5 (Berkeley) 2/8/95 */ #ifndef _NET_ROUTE_H_ #define _NET_ROUTE_H_ #include <sys/queue.h> #include <sys/socket.h> #include <sys/types.h> #include <net/if.h> #ifdef _KERNEL #include <sys/rwlock.h> #include <sys/condvar.h> #include <sys/pserialize.h> #include <sys/percpu.h> #endif #include <sys/psref.h> #if !(defined(_KERNEL) || defined(_STANDALONE)) #include <stdbool.h> #endif /* * Kernel resident routing tables. * * The routing tables are initialized when interface addresses * are set by making entries for all directly connected interfaces. */ /* * A route consists of a destination address and a reference * to a routing entry. These are often held by protocols * in their control blocks, e.g. inpcb. */ struct route { struct rtentry *_ro_rt; struct sockaddr *ro_sa; uint64_t ro_rtcache_generation; struct psref ro_psref; int ro_bound; }; /* * These numbers are used by reliable protocols for determining * retransmission behavior and are included in the routing structure. */ struct rt_metrics { uint64_t rmx_locks; /* Kernel must leave these values alone */ uint64_t rmx_mtu; /* MTU for this path */ uint64_t rmx_hopcount; /* max hops expected */ uint64_t rmx_recvpipe; /* inbound delay-bandwidth product */ uint64_t rmx_sendpipe; /* outbound delay-bandwidth product */ uint64_t rmx_ssthresh; /* outbound gateway buffer limit */ uint64_t rmx_rtt; /* estimated round trip time */ uint64_t rmx_rttvar; /* estimated rtt variance */ time_t rmx_expire; /* lifetime for route, e.g. redirect */ time_t rmx_pksent; /* packets sent using this route */ }; /* * rmx_rtt and rmx_rttvar are stored as microseconds; * RTTTOPRHZ(rtt) converts to a value suitable for use * by a protocol slowtimo counter. */ #define RTM_RTTUNIT 1000000 /* units for rtt, rttvar, as units per sec */ #define RTTTOPRHZ(r) ((r) / (RTM_RTTUNIT / PR_SLOWHZ)) /* * We distinguish between routes to hosts and routes to networks, * preferring the former if available. For each route we infer * the interface to use from the gateway address supplied when * the route was entered. Routes that forward packets through * gateways are marked so that the output routines know to address the * gateway rather than the ultimate destination. 
*/ #ifndef RNF_NORMAL #include <net/radix.h> #endif struct rtentry { struct radix_node rt_nodes[2]; /* tree glue, and other values */ #define rt_mask(r) ((const struct sockaddr *)((r)->rt_nodes->rn_mask)) struct sockaddr *rt_gateway; /* value */ int rt_flags; /* up/down?, host/net */ int rt_refcnt; /* # held references */ uint64_t rt_use; /* raw # packets forwarded */ struct ifnet *rt_ifp; /* the answer: interface to use */ struct ifaddr *rt_ifa; /* the answer: interface to use */ uint32_t rt_ifa_seqno; void * rt_llinfo; /* pointer to link level info cache */ struct rt_metrics rt_rmx; /* metrics used by rx'ing protocols */ struct rtentry *rt_gwroute; /* implied entry for gatewayed routes */ LIST_HEAD(, rttimer) rt_timer; /* queue of timeouts for misc funcs */ struct rtentry *rt_parent; /* parent of cloned route */ struct sockaddr *_rt_key; struct sockaddr *rt_tag; /* route tagging info */ #ifdef _KERNEL kcondvar_t rt_cv; struct psref_target rt_psref; SLIST_ENTRY(rtentry) rt_free; /* queue of deferred frees */ #endif }; static __inline const struct sockaddr * rt_getkey(const struct rtentry *rt) { return rt->_rt_key; } /* * Following structure necessary for 4.3 compatibility; * We should eventually move it to a compat file. */ struct ortentry { uint32_t rt_hash; /* to speed lookups */ struct sockaddr rt_dst; /* key */ struct sockaddr rt_gateway; /* value */ int16_t rt_flags; /* up/down?, host/net */ int16_t rt_refcnt; /* # held references */ uint32_t rt_use; /* raw # packets forwarded */ struct ifnet *rt_ifp; /* the answer: interface to use */ }; #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ #define RTF_HOST 0x4 /* host entry (net otherwise) */ #define RTF_REJECT 0x8 /* host or net unreachable */ #define RTF_DYNAMIC 0x10 /* created dynamically (by redirect) */ #define RTF_MODIFIED 0x20 /* modified dynamically (by redirect) */ #define RTF_DONE 0x40 /* message confirmed */ #define RTF_MASK 0x80 /* subnet mask present */ // #define RTF_CLONING 0x100 /* generate new routes on use */ #define RTF_CONNECTED 0x100 /* hosts on this route are neighbours */ // #define RTF_XRESOLVE 0x200 /* external daemon resolves name */ // #define RTF_LLINFO 0x400 /* generated by ARP or NDP */ #define RTF_LLDATA 0x400 /* used by apps to add/del L2 entries */ #define RTF_STATIC 0x800 /* manually added */ #define RTF_BLACKHOLE 0x1000 /* just discard pkts (during updates) */ // #define RTF_CLONED 0x2000 /* this is a cloned route */ #define RTF_PROTO2 0x4000 /* protocol specific routing flag */ #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ #define RTF_SRC 0x10000 /* route has fixed source address */ #define RTF_ANNOUNCE 0x20000 /* announce new ARP or NDP entry */ #define RTF_LOCAL 0x40000 /* route represents a local address */ #define RTF_BROADCAST 0x80000 /* route represents a bcast address */ #define RTF_UPDATING 0x100000 /* route is updating */ /* * The flag is nevert set to rt_flags. It just tells rtrequest1 to set a passed * ifa to rt_ifa (via rti_ifa) and not replace rt_ifa in ifa_rtrequest. */ #define RTF_DONTCHANGEIFA 0x200000 /* suppress rt_ifa replacement */ /* * 0x400 is exposed to userland just for backward compatibility. For that * purpose, it should be shown as LLINFO. */ #define RTFBITS "\020\1UP\2GATEWAY\3HOST\4REJECT\5DYNAMIC\6MODIFIED\7DONE" \ "\010MASK_PRESENT\011CONNECTED\012XRESOLVE\013LLINFO\014STATIC" \ "\015BLACKHOLE\016CLONED\017PROTO2\020PROTO1\021SRC\022ANNOUNCE" \ "\023LOCAL\024BROADCAST\025UPDATING" /* * Routing statistics. 
*/ struct rtstat { uint64_t rts_badredirect; /* bogus redirect calls */ uint64_t rts_dynamic; /* routes created by redirects */ uint64_t rts_newgateway; /* routes modified by redirects */ uint64_t rts_unreach; /* lookups which failed */ uint64_t rts_wildcard; /* lookups satisfied by a wildcard */ }; /* * Structures for routing messages. By forcing the first member to be aligned * at a 64-bit boundary, we also force the size to be a multiple of 64-bits. */ #if !defined(_KERNEL) || !defined(COMPAT_RTSOCK) /* * If we aren't being compiled for backwards compatibility, enforce 64-bit * alignment so any routing message is the same regardless if the kernel * is an ILP32 or LP64 kernel. */ #define __align64 __aligned(sizeof(uint64_t)) #else #define __align64 #endif struct rt_msghdr { u_short rtm_msglen __align64; /* to skip over non-understood messages */ u_char rtm_version; /* future binary compatibility */ u_char rtm_type; /* message type */ u_short rtm_index; /* index for associated ifp */ int rtm_flags; /* flags, incl. kern & message, e.g. DONE */ int rtm_addrs; /* bitmask identifying sockaddrs in msg */ pid_t rtm_pid; /* identify sender */ int rtm_seq; /* for sender to identify action */ int rtm_errno; /* why failed */ int rtm_use; /* from rtentry */ int rtm_inits; /* which metrics we are initializing */ struct rt_metrics rtm_rmx __align64; /* metrics themselves */ }; #undef __align64 #define RTM_VERSION 4 /* Up the ante and ignore older versions */ #define RTM_ADD 0x1 /* Add Route */ #define RTM_DELETE 0x2 /* Delete Route */ #define RTM_CHANGE 0x3 /* Change Metrics or flags */ #define RTM_GET 0x4 /* Report Metrics */ #define RTM_LOSING 0x5 /* Kernel Suspects Partitioning */ #define RTM_REDIRECT 0x6 /* Told to use different route */ #define RTM_MISS 0x7 /* Lookup failed on this address */ #define RTM_LOCK 0x8 /* fix specified metrics */ #define RTM_OLDADD 0x9 /* caused by SIOCADDRT */ #define RTM_OLDDEL 0xa /* caused by SIOCDELRT */ // #define RTM_RESOLVE 0xb /* req to resolve dst to LL addr */ #define RTM_ONEWADDR 0xc /* Old (pre-8.0) RTM_NEWADDR message */ #define RTM_ODELADDR 0xd /* Old (pre-8.0) RTM_DELADDR message */ #define RTM_OOIFINFO 0xe /* Old (pre-1.5) RTM_IFINFO message */ #define RTM_OIFINFO 0xf /* Old (pre-64bit time) RTM_IFINFO message */ #define RTM_IFANNOUNCE 0x10 /* iface arrival/departure */ #define RTM_IEEE80211 0x11 /* IEEE80211 wireless event */ #define RTM_SETGATE 0x12 /* set prototype gateway for clones * (see example in arp_rtrequest). */ #define RTM_LLINFO_UPD 0x13 /* indication to ARP/NDP/etc. that link-layer * address has changed */ #define RTM_IFINFO 0x14 /* iface/link going up/down etc. */ #define RTM_OCHGADDR 0x15 /* Old (pre-8.0) RTM_CHGADDR message */ #define RTM_NEWADDR 0x16 /* address being added to iface */ #define RTM_DELADDR 0x17 /* address being removed from iface */ #define RTM_CHGADDR 0x18 /* address properties changed */ #ifdef RTM_NAMES static const char *rtm_names[] = { "*none*", "add", "delete", "change", "get", "losing", "redirect", "miss", "lock", "oldadd", "olddel", "*resolve*", "onewaddr", "odeladdr", "ooifinfo", "oifinfo", "ifannounce", "ieee80211", "setgate", "llinfo_upd", "ifinfo", "ochgaddr", "newaddr", "deladdr", "chgaddr", }; #endif /* * setsockopt defines used for the filtering. 
*/ #define RO_MSGFILTER 1 /* array of which rtm_type to send to client */ #define RO_MISSFILTER 2 /* array of sockaddrs to match miss dst */ #define RO_FILTSA_MAX 30 /* maximum number of sockaddrs per filter */ #define RTV_MTU 0x1 /* init or lock _mtu */ #define RTV_HOPCOUNT 0x2 /* init or lock _hopcount */ #define RTV_EXPIRE 0x4 /* init or lock _expire */ #define RTV_RPIPE 0x8 /* init or lock _recvpipe */ #define RTV_SPIPE 0x10 /* init or lock _sendpipe */ #define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */ #define RTV_RTT 0x40 /* init or lock _rtt */ #define RTV_RTTVAR 0x80 /* init or lock _rttvar */ #define RTVBITS "\020\1MTU\2HOPCOUNT\3EXPIRE\4RECVPIPE\5SENDPIPE" \ "\6SSTHRESH\7RTT\010RTTVAR" /* * Bitmask values for rtm_addr. */ #define RTA_DST 0x1 /* destination sockaddr present */ #define RTA_GATEWAY 0x2 /* gateway sockaddr present */ #define RTA_NETMASK 0x4 /* netmask sockaddr present */ #define RTA_GENMASK 0x8 /* cloning mask sockaddr present */ #define RTA_IFP 0x10 /* interface name sockaddr present */ #define RTA_IFA 0x20 /* interface addr sockaddr present */ #define RTA_AUTHOR 0x40 /* sockaddr for author of redirect */ #define RTA_BRD 0x80 /* for NEWADDR, broadcast or p-p dest addr */ #define RTA_TAG 0x100 /* route tag */ #define RTABITS "\020\1DST\2GATEWAY\3NETMASK\4GENMASK\5IFP\6IFA\7AUTHOR" \ "\010BRD\011TAG" /* * Index offsets for sockaddr array for alternate internal encoding. */ #define RTAX_DST 0 /* destination sockaddr present */ #define RTAX_GATEWAY 1 /* gateway sockaddr present */ #define RTAX_NETMASK 2 /* netmask sockaddr present */ #define RTAX_GENMASK 3 /* cloning mask sockaddr present */ #define RTAX_IFP 4 /* interface name sockaddr present */ #define RTAX_IFA 5 /* interface addr sockaddr present */ #define RTAX_AUTHOR 6 /* sockaddr for author of redirect */ #define RTAX_BRD 7 /* for NEWADDR, broadcast or p-p dest addr */ #define RTAX_TAG 8 /* route tag */ #define RTAX_MAX 9 /* size of array to allocate */ #define RT_ROUNDUP2(a, n) ((a) > 0 ? (1 + (((a) - 1U) | ((n) - 1))) : (n)) #define RT_ROUNDUP(a) RT_ROUNDUP2((a), sizeof(uint64_t)) #define RT_ADVANCE(x, n) (x += RT_ROUNDUP((n)->sa_len)) struct rt_addrinfo { int rti_addrs; const struct sockaddr *rti_info[RTAX_MAX]; int rti_flags; struct ifaddr *rti_ifa; struct ifnet *rti_ifp; }; struct route_cb { int ip_count; int ip6_count; int unused1; int mpls_count; int any_count; }; /* * This structure, and the prototypes for the rt_timer_{init,remove_all, * add,timer} functions all used with the kind permission of BSDI. * These allow functions to be called for routes at specific times. */ struct rttimer { TAILQ_ENTRY(rttimer) rtt_next; /* entry on timer queue */ LIST_ENTRY(rttimer) rtt_link; /* multiple timers per rtentry */ struct rttimer_queue *rtt_queue; /* back pointer to queue */ struct rtentry *rtt_rt; /* Back pointer to the route */ void (*rtt_func)(struct rtentry *, struct rttimer *); time_t rtt_time; /* When this timer was registered */ }; struct rttimer_queue { long rtq_timeout; unsigned long rtq_count; TAILQ_HEAD(, rttimer) rtq_head; LIST_ENTRY(rttimer_queue) rtq_link; }; struct rtbl; typedef struct rtbl rtbl_t; #ifdef _KERNEL struct rtbl { struct radix_node_head t_rnh; }; struct rt_walkarg { int w_op; int w_arg; int w_given; int w_needed; void * w_where; int w_tmemsize; int w_tmemneeded; void * w_tmem; }; #if 0 #define RT_DPRINTF(__fmt, ...) do { } while (/*CONSTCOND*/0) #else #define RT_DPRINTF(__fmt, ...) 
/* do nothing */ #endif struct rtwalk { int (*rw_f)(struct rtentry *, void *); void *rw_v; }; /* * Global data specific to the routing socket. */ struct route_info { struct sockaddr ri_dst; struct sockaddr ri_src; struct route_cb ri_cb; int ri_maxqlen; struct ifqueue ri_intrq; void *ri_sih; }; extern struct route_info route_info; extern struct rtstat rtstat; struct socket; void rt_init(void); int rt_timer_add(struct rtentry *, void(*)(struct rtentry *, struct rttimer *), struct rttimer_queue *); unsigned long rt_timer_count(struct rttimer_queue *); void rt_timer_queue_change(struct rttimer_queue *, long); struct rttimer_queue * rt_timer_queue_create(u_int); void rt_timer_queue_destroy(struct rttimer_queue *); void rt_free(struct rtentry *); void rt_unref(struct rtentry *); int rt_update(struct rtentry *, struct rt_addrinfo *, void *); int rt_update_prepare(struct rtentry *); void rt_update_finish(struct rtentry *); void rt_newmsg(const int, const struct rtentry *); void rt_newmsg_dynamic(const int, const struct rtentry *); struct rtentry * rtalloc1(const struct sockaddr *, int); int rtinit(struct ifaddr *, int, int); void rtredirect(const struct sockaddr *, const struct sockaddr *, const struct sockaddr *, int, const struct sockaddr *, struct rtentry **); int rtrequest(int, const struct sockaddr *, const struct sockaddr *, const struct sockaddr *, int, struct rtentry **); int rtrequest1(int, struct rt_addrinfo *, struct rtentry **); int rt_ifa_addlocal(struct ifaddr *); int rt_ifa_remlocal(struct ifaddr *, struct ifaddr *); struct ifaddr * rt_get_ifa(struct rtentry *); void rt_replace_ifa(struct rtentry *, struct ifaddr *); int rt_setgate(struct rtentry *, const struct sockaddr *); const struct sockaddr * rt_settag(struct rtentry *, const struct sockaddr *); struct sockaddr * rt_gettag(const struct rtentry *); int rt_check_reject_route(const struct rtentry *, const struct ifnet *); void rt_delete_matched_entries(sa_family_t, int (*)(struct rtentry *, void *), void *, bool); void rt_replace_ifa_matched_entries(sa_family_t, int (*)(struct rtentry *, void *), void *, struct ifaddr *); int rt_walktree(sa_family_t, int (*)(struct rtentry *, void *), void *); static __inline void rt_assert_referenced(const struct rtentry *rt) { KASSERT(rt->rt_refcnt > 0); } void rtcache_copy(struct route *, struct route *); void rtcache_free(struct route *); struct rtentry * rtcache_init(struct route *); struct rtentry * rtcache_init_noclone(struct route *); struct rtentry * rtcache_lookup2(struct route *, const struct sockaddr *, int, int *); int rtcache_setdst(struct route *, const struct sockaddr *); struct rtentry * rtcache_update(struct route *, int); static __inline void rtcache_invariants(const struct route *ro) { KASSERT(ro->ro_sa != NULL || ro->_ro_rt == NULL); } static __inline struct rtentry * rtcache_lookup1(struct route *ro, const struct sockaddr *dst, int clone) { int hit; return rtcache_lookup2(ro, dst, clone, &hit); } static __inline struct rtentry * rtcache_lookup(struct route *ro, const struct sockaddr *dst) { return rtcache_lookup1(ro, dst, 1); } static __inline const struct sockaddr * rtcache_getdst(const struct route *ro) { rtcache_invariants(ro); return ro->ro_sa; } struct rtentry * rtcache_validate(struct route *); void rtcache_unref(struct rtentry *, struct route *); percpu_t * rtcache_percpu_alloc(void); static __inline struct route * rtcache_percpu_getref(percpu_t *pc) { return *(struct route **)percpu_getref(pc); } static __inline void rtcache_percpu_putref(percpu_t *pc) { 
percpu_putref(pc); } /* rtsock */ void rt_ieee80211msg(struct ifnet *, int, void *, size_t); void rt_ifannouncemsg(struct ifnet *, int); void rt_ifmsg(struct ifnet *); void rt_missmsg(int, const struct rt_addrinfo *, int, int); struct mbuf * rt_msg1(int, struct rt_addrinfo *, void *, int); int rt_msg3(int, struct rt_addrinfo *, void *, struct rt_walkarg *, int *); void rt_addrmsg(int, struct ifaddr *); void rt_addrmsg_src(int, struct ifaddr *, const struct sockaddr *); void rt_addrmsg_rt(int, struct ifaddr *, int, struct rtentry *); void route_enqueue(struct mbuf *, int); struct llentry; void rt_clonedmsg(int, const struct sockaddr *, const struct sockaddr *, const uint8_t *, const struct ifnet *); void rt_setmetrics(void *, struct rtentry *); /* rtbl */ int rt_addaddr(rtbl_t *, struct rtentry *, const struct sockaddr *); void rt_assert_inactive(const struct rtentry *); struct rtentry * rt_deladdr(rtbl_t *, const struct sockaddr *, const struct sockaddr *); rtbl_t *rt_gettable(sa_family_t); int rt_inithead(rtbl_t **, int); struct rtentry * rt_lookup(rtbl_t *, const struct sockaddr *, const struct sockaddr *); struct rtentry * rt_matchaddr(rtbl_t *, const struct sockaddr *); int rt_refines(const struct sockaddr *, const struct sockaddr *); int rtbl_walktree(sa_family_t, int (*)(struct rtentry *, void *), void *); struct rtentry * rtbl_search_matched_entry(sa_family_t, int (*)(struct rtentry *, void *), void *); void rtbl_init(void); void sysctl_net_route_setup(struct sysctllog **, int, const char *); #endif /* _KERNEL */ #endif /* !_NET_ROUTE_H_ */
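/*
 * Illustrative sketch (not part of the header above): a minimal example of
 * how a consumer of route.h typically walks the sockaddrs appended to a
 * routing message, using the RTA_* bit mask, the RTAX_* indices and the
 * RT_ROUNDUP() padding rule defined above (RT_ADVANCE() wraps the same
 * computation).  The function name rt_xaddrs_sketch and its arguments are
 * assumptions made for this example; it relies only on the fact that
 * RTA_DST == (1 << RTAX_DST) and so on for the other entries.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <net/route.h>

static void
rt_xaddrs_sketch(int addrs, const struct sockaddr *sa,
    const struct sockaddr *end, const struct sockaddr *info[RTAX_MAX])
{
	int i;

	for (i = 0; i < RTAX_MAX; i++) {
		info[i] = NULL;
		if ((addrs & (1 << i)) == 0)
			continue;		/* this sockaddr not present */
		if (sa >= end)
			break;			/* ran off the message */
		info[i] = sa;
		/* Each sockaddr is padded out to an 8-byte boundary. */
		sa = (const struct sockaddr *)((const char *)sa +
		    RT_ROUNDUP(sa->sa_len));
	}
}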
/* $NetBSD: vfs_syscalls_40.c,v 1.5 2019/01/27 02:08:39 pgoyette Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_40.c,v 1.5 2019/01/27 02:08:39 pgoyette Exp $"); #if defined(_KERNEL_OPT) #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mount.h> #include <sys/syscall.h> #include <sys/syscallvar.h> #include <sys/syscallargs.h> #include <compat/common/compat_mod.h> static const struct syscall_package vfs_syscalls_40_syscalls[] = { { SYS_compat_40_mount, 0, (sy_call_t *)compat_40_sys_mount }, { 0, 0, NULL }, }; int compat_40_sys_mount(struct lwp *l, const struct compat_40_sys_mount_args *uap, register_t *retval) { /* { syscallarg(const char *) type; syscallarg(const char *) path; syscallarg(int) flags; syscallarg(void *) data; } */ register_t dummy; return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path), SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE, 0, &dummy); } int vfs_syscalls_40_init(void) { return syscall_establish(NULL, vfs_syscalls_40_syscalls); } int vfs_syscalls_40_fini(void) { return syscall_disestablish(NULL, vfs_syscalls_40_syscalls); }
/* $NetBSD: kern_uidinfo.c,v 1.13 2021/12/28 13:28:24 riastradh Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_uidinfo.c,v 1.13 2021/12/28 13:28:24 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kmem.h> #include <sys/proc.h> #include <sys/atomic.h> #include <sys/uidinfo.h> #include <sys/sysctl.h> #include <sys/kauth.h> #include <sys/cpu.h> static SLIST_HEAD(uihashhead, uidinfo) *uihashtbl; static u_long uihash; #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) static int sysctl_kern_uidinfo_cnt(SYSCTLFN_ARGS) { static const struct { const char *name; u_int value; } nv[] = { #define _MEM(n) { # n, offsetof(struct uidinfo, ui_ ## n) } _MEM(proccnt), _MEM(lwpcnt), _MEM(lockcnt), _MEM(semcnt), _MEM(sbsize), #undef _MEM }; for (size_t i = 0; i < __arraycount(nv); i++) if (strcmp(nv[i].name, rnode->sysctl_name) == 0) { uint64_t cnt; struct sysctlnode node = *rnode; struct uidinfo *uip; node.sysctl_data = &cnt; uip = uid_find(kauth_cred_geteuid(l->l_cred)); *(uint64_t *)node.sysctl_data = *(u_long *)((char *)uip + nv[i].value); return sysctl_lookup(SYSCTLFN_CALL(&node)); } return EINVAL; } static struct sysctllog *kern_uidinfo_sysctllog; static void sysctl_kern_uidinfo_setup(void) { const struct sysctlnode *rnode, *cnode; sysctl_createv(&kern_uidinfo_sysctllog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "uidinfo", SYSCTL_DESCR("Resource usage per uid"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "proccnt", SYSCTL_DESCR("Number of processes for the current user"), sysctl_kern_uidinfo_cnt, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "lwpcnt", SYSCTL_DESCR("Number of lwps for the current user"), sysctl_kern_uidinfo_cnt, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "lockcnt", SYSCTL_DESCR("Number of locks for the current user"), sysctl_kern_uidinfo_cnt, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "semcnt", SYSCTL_DESCR("Number of semaphores used for the current user"), sysctl_kern_uidinfo_cnt, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "sbsize", SYSCTL_DESCR("Socket buffers used for the current user"), sysctl_kern_uidinfo_cnt, 0, NULL, 0, CTL_CREATE, CTL_EOL); } static int uid_stats(struct hashstat_sysctl *hs, bool fill) { struct uidinfo *uip; uint64_t chain; strlcpy(hs->hash_name, "uihash", sizeof(hs->hash_name)); strlcpy(hs->hash_desc, "user info (uid->used proc) hash", sizeof(hs->hash_desc)); if (!fill) return 0; hs->hash_size = uihash + 1; for (size_t i = 0; i < hs->hash_size; i++) { chain = 0; SLIST_FOREACH(uip, &uihashtbl[i], ui_hash) { membar_datadep_consumer(); chain++; } if (chain > 0) { hs->hash_used++; hs->hash_items += chain; if (chain > hs->hash_maxchain) hs->hash_maxchain = chain; } } return 0; } void uid_init(void) { /* * In case of MP system, SLIST_FOREACH would force a cache line * write-back for every modified 'uidinfo', thus we try to keep the * lists short. */ const u_int uihash_sz = (maxcpus > 1 ? 1024 : 64); uihashtbl = hashinit(uihash_sz, HASH_SLIST, true, &uihash); /* * Ensure that uid 0 is always in the user hash table, as * sbreserve() expects it available from interrupt context. 
*/ (void)uid_find(0); sysctl_kern_uidinfo_setup(); hashstat_register("uihash", uid_stats); } struct uidinfo * uid_find(uid_t uid) { struct uidinfo *uip, *uip_first, *newuip; struct uihashhead *uipp; uipp = UIHASH(uid); newuip = NULL; /* * To make insertion atomic, abstraction of SLIST will be violated. */ uip_first = uipp->slh_first; again: SLIST_FOREACH(uip, uipp, ui_hash) { membar_datadep_consumer(); if (uip->ui_uid != uid) continue; if (newuip != NULL) kmem_free(newuip, sizeof(*newuip)); return uip; } if (newuip == NULL) newuip = kmem_zalloc(sizeof(*newuip), KM_SLEEP); newuip->ui_uid = uid; /* * If atomic insert is unsuccessful, another thread might be * allocated this 'uid', thus full re-check is needed. */ newuip->ui_hash.sle_next = uip_first; membar_producer(); uip = atomic_cas_ptr(&uipp->slh_first, uip_first, newuip); if (uip != uip_first) { uip_first = uip; goto again; } return newuip; } /* * Change the count associated with number of processes * a given user is using. */ int chgproccnt(uid_t uid, int diff) { struct uidinfo *uip; long proccnt; uip = uid_find(uid); proccnt = atomic_add_long_nv(&uip->ui_proccnt, diff); KASSERTMSG(proccnt >= 0, "uid=%d diff=%d proccnt=%ld", uid, diff, proccnt); return proccnt; } /* * Change the count associated with number of lwps * a given user is using. */ int chglwpcnt(uid_t uid, int diff) { struct uidinfo *uip; long lwpcnt; uip = uid_find(uid); lwpcnt = atomic_add_long_nv(&uip->ui_lwpcnt, diff); KASSERTMSG(lwpcnt >= 0, "uid=%d diff=%d lwpcnt=%ld", uid, diff, lwpcnt); return lwpcnt; } /* * Change the count associated with number of semaphores * a given user is using. */ int chgsemcnt(uid_t uid, int diff) { struct uidinfo *uip; long semcnt; uip = uid_find(uid); semcnt = atomic_add_long_nv(&uip->ui_semcnt, diff); KASSERTMSG(semcnt >= 0, "uid=%d diff=%d semcnt=%ld", uid, diff, semcnt); return semcnt; } int chgsbsize(struct uidinfo *uip, u_long *hiwat, u_long to, rlim_t xmax) { rlim_t nsb; const long diff = to - *hiwat; nsb = (rlim_t)atomic_add_long_nv((long *)&uip->ui_sbsize, diff); if (diff > 0 && nsb > xmax) { atomic_add_long((long *)&uip->ui_sbsize, -diff); return 0; } *hiwat = to; return 1; }
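/*
 * Illustrative sketch (not part of kern_uidinfo.c above): a stand-alone
 * userland analogue of the lock-free pattern used by uid_find() -- scan the
 * list, publish a fully initialized node, then swing the list head with a
 * compare-and-swap and rescan if another thread won the race.  The names
 * (struct node, find_or_insert) are assumptions made for this example; the
 * kernel code uses atomic_cas_ptr() and SLIST internals rather than C11
 * atomics, but the retry logic is the same.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct node {
	unsigned key;
	struct node *next;
};

static _Atomic(struct node *) list_head;

static struct node *
find_or_insert(unsigned key)
{
	struct node *n, *first, *newn = NULL;

	first = atomic_load(&list_head);
	for (;;) {
		/* Full re-check: another thread may have inserted 'key'. */
		for (n = first; n != NULL; n = n->next) {
			if (n->key == key) {
				free(newn);	/* lost the race, discard */
				return n;
			}
		}
		if (newn == NULL) {
			newn = calloc(1, sizeof(*newn));
			if (newn == NULL)
				return NULL;
			newn->key = key;
		}
		newn->next = first;
		/* On failure, 'first' is refreshed with the current head. */
		if (atomic_compare_exchange_strong(&list_head, &first, newn))
			return newn;
	}
}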
/* $NetBSD: kern_turnstile.c,v 1.55 2023/10/15 10:30:20 riastradh Exp $ */ /*- * Copyright (c) 2002, 2006, 2007, 2009, 2019, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe and Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Turnstiles are described in detail in: * * Solaris Internals: Core Kernel Architecture, Jim Mauro and * Richard McDougall. * * Turnstiles are kept in a hash table. There are likely to be many more * synchronisation objects than there are threads. Since a thread can block * on only one lock at a time, we only need one turnstile per thread, and * so they are allocated at thread creation time. * * When a thread decides it needs to block on a lock, it looks up the * active turnstile for that lock. If no active turnstile exists, then * the process lends its turnstile to the lock. If there is already an * active turnstile for the lock, the thread places its turnstile on a * list of free turnstiles, and references the active one instead. * * The act of looking up the turnstile acquires an interlock on the sleep * queue. If a thread decides it doesn't need to block after all, then this * interlock must be released by explicitly aborting the turnstile * operation. * * When a thread is awakened, it needs to get its turnstile back. If there * are still other threads waiting in the active turnstile, the thread * grabs a free turnstile off the free list. Otherwise, it can take back * the active turnstile from the lock (thus deactivating the turnstile). * * Turnstiles are where we do priority inheritance. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.55 2023/10/15 10:30:20 riastradh Exp $"); #include <sys/param.h> #include <sys/lockdebug.h> #include <sys/lwp.h> #include <sys/proc.h> #include <sys/sleepq.h> #include <sys/sleeptab.h> #include <sys/syncobj.h> #include <sys/systm.h> /* * Shift of 6 aligns to typical cache line size of 64 bytes; there's no * point having two turnstile locks to back two lock objects that share one * cache line. */ #define TS_HASH_SIZE 128 #define TS_HASH_MASK (TS_HASH_SIZE - 1) #define TS_HASH(obj) (((uintptr_t)(obj) >> 6) & TS_HASH_MASK) static tschain_t turnstile_chains[TS_HASH_SIZE] __cacheline_aligned; static union { kmutex_t lock; uint8_t pad[COHERENCY_UNIT]; } turnstile_locks[TS_HASH_SIZE] __cacheline_aligned; /* * turnstile_init: * * Initialize the turnstile mechanism. */ void turnstile_init(void) { int i; for (i = 0; i < TS_HASH_SIZE; i++) { LIST_INIT(&turnstile_chains[i]); mutex_init(&turnstile_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED); } turnstile_ctor(&turnstile0); } /* * turnstile_ctor: * * Constructor for turnstiles. */ void turnstile_ctor(turnstile_t *ts) { memset(ts, 0, sizeof(*ts)); sleepq_init(&ts->ts_sleepq[TS_READER_Q]); sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]); } /* * turnstile_remove: * * Remove an LWP from a turnstile sleep queue and wake it. */ static inline void turnstile_remove(turnstile_t *ts, lwp_t *l, int q) { turnstile_t *nts; KASSERT(l->l_ts == ts); /* * This process is no longer using the active turnstile. * Find an inactive one on the free list to give to it. 
*/ if ((nts = ts->ts_free) != NULL) { KASSERT(TS_ALL_WAITERS(ts) > 1); l->l_ts = nts; ts->ts_free = nts->ts_free; nts->ts_free = NULL; } else { /* * If the free list is empty, this is the last * waiter. */ KASSERT(TS_ALL_WAITERS(ts) == 1); LIST_REMOVE(ts, ts_chain); } ts->ts_waiters[q]--; sleepq_remove(&ts->ts_sleepq[q], l, true); } /* * turnstile_lookup: * * Look up the turnstile for the specified lock. This acquires and * holds the turnstile chain lock (sleep queue interlock). */ turnstile_t * turnstile_lookup(wchan_t obj) { turnstile_t *ts; tschain_t *tc; u_int hash; hash = TS_HASH(obj); tc = &turnstile_chains[hash]; mutex_spin_enter(&turnstile_locks[hash].lock); LIST_FOREACH(ts, tc, ts_chain) if (ts->ts_obj == obj) return (ts); /* * No turnstile yet for this lock. No problem, turnstile_block() * handles this by fetching the turnstile from the blocking thread. */ return (NULL); } /* * turnstile_exit: * * Abort a turnstile operation. */ void turnstile_exit(wchan_t obj) { mutex_spin_exit(&turnstile_locks[TS_HASH(obj)].lock); } /* * turnstile_lendpri: * * Lend our priority to lwps on the blocking chain. * * If the current owner of the lock (l->l_wchan, set by sleepq_enqueue) * has a priority lower than ours (lwp_eprio(l)), lend our priority to * him to avoid priority inversions. */ static void turnstile_lendpri(lwp_t *cur) { lwp_t * l = cur; pri_t prio; /* * NOTE: if you get a panic in this code block, it is likely that * a lock has been destroyed or corrupted while still in use. Try * compiling a kernel with LOCKDEBUG to pinpoint the problem. */ LOCKDEBUG_BARRIER(l->l_mutex, 1); KASSERT(l == curlwp); prio = lwp_eprio(l); for (;;) { lwp_t *owner; turnstile_t *ts; bool dolock; if (l->l_wchan == NULL) break; /* * Ask syncobj the owner of the lock. */ owner = (*l->l_syncobj->sobj_owner)(l->l_wchan); if (owner == NULL) break; /* * The owner may have changed as we have dropped the tc lock. */ if (cur == owner) { /* * We own the lock: stop here, sleepq_block() * should wake up immediately. */ break; } /* * Acquire owner->l_mutex if we don't have it yet. * Because we already have another LWP lock (l->l_mutex) held, * we need to play a try lock dance to avoid deadlock. */ dolock = l->l_mutex != atomic_load_relaxed(&owner->l_mutex); if (l == owner || (dolock && !lwp_trylock(owner))) { /* * The owner was changed behind us or trylock failed. * Restart from curlwp. * * Note that there may be a livelock here: * the owner may try grabbing cur's lock (which is the * tc lock) while we're trying to grab the owner's lock. */ lwp_unlock(l); l = cur; lwp_lock(l); prio = lwp_eprio(l); continue; } /* * If the owner's priority is already higher than ours, * there's nothing to do anymore. */ if (prio <= lwp_eprio(owner)) { if (dolock) lwp_unlock(owner); break; } /* * Lend our priority to the 'owner' LWP. * * Update lenders info for turnstile_unlendpri. 
*/ ts = l->l_ts; KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL); if (ts->ts_inheritor == NULL) { ts->ts_inheritor = owner; ts->ts_eprio = prio; SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain); lwp_lendpri(owner, prio); } else if (prio > ts->ts_eprio) { ts->ts_eprio = prio; lwp_lendpri(owner, prio); } if (dolock) lwp_unlock(l); LOCKDEBUG_BARRIER(owner->l_mutex, 1); l = owner; } LOCKDEBUG_BARRIER(l->l_mutex, 1); if (cur->l_mutex != atomic_load_relaxed(&l->l_mutex)) { lwp_unlock(l); lwp_lock(cur); } LOCKDEBUG_BARRIER(cur->l_mutex, 1); } /* * turnstile_unlendpri: undo turnstile_lendpri */ static void turnstile_unlendpri(turnstile_t *ts) { lwp_t * const l = curlwp; turnstile_t *iter; turnstile_t *next; turnstile_t *prev = NULL; pri_t prio; bool dolock; KASSERT(ts->ts_inheritor != NULL); ts->ts_inheritor = NULL; dolock = (atomic_load_relaxed(&l->l_mutex) == l->l_cpu->ci_schedstate.spc_lwplock); if (dolock) { lwp_lock(l); } /* * the following loop does two things. * * - remove ts from the list. * * - from the rest of the list, find the highest priority. */ prio = -1; KASSERT(!SLIST_EMPTY(&l->l_pi_lenders)); for (iter = SLIST_FIRST(&l->l_pi_lenders); iter != NULL; iter = next) { KASSERT(lwp_eprio(l) >= ts->ts_eprio); next = SLIST_NEXT(iter, ts_pichain); if (iter == ts) { if (prev == NULL) { SLIST_REMOVE_HEAD(&l->l_pi_lenders, ts_pichain); } else { SLIST_REMOVE_AFTER(prev, ts_pichain); } } else if (prio < iter->ts_eprio) { prio = iter->ts_eprio; } prev = iter; } lwp_lendpri(l, prio); if (dolock) { lwp_unlock(l); } } /* * turnstile_block: * * Enter an object into the turnstile chain and prepare the current * LWP for sleep. */ void turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj) { lwp_t * const l = curlwp; /* cached curlwp */ turnstile_t *ots; tschain_t *tc; kmutex_t *lock; sleepq_t *sq; u_int hash; int nlocks; hash = TS_HASH(obj); tc = &turnstile_chains[hash]; lock = &turnstile_locks[hash].lock; KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); KASSERT(mutex_owned(lock)); KASSERT(l != NULL); KASSERT(l->l_ts != NULL); if (ts == NULL) { /* * We are the first thread to wait for this object; * lend our turnstile to it. */ ts = l->l_ts; KASSERT(TS_ALL_WAITERS(ts) == 0); KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q])); KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); ts->ts_obj = obj; ts->ts_inheritor = NULL; LIST_INSERT_HEAD(tc, ts, ts_chain); } else { /* * Object already has a turnstile. Put our turnstile * onto the free list, and reference the existing * turnstile instead. */ ots = l->l_ts; KASSERT(ots->ts_free == NULL); ots->ts_free = ts->ts_free; ts->ts_free = ots; l->l_ts = ts; KASSERT(ts->ts_obj == obj); KASSERT(TS_ALL_WAITERS(ts) != 0); KASSERT(!LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) || !LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); } sq = &ts->ts_sleepq[q]; ts->ts_waiters[q]++; nlocks = sleepq_enter(sq, l, lock); LOCKDEBUG_BARRIER(lock, 1); sleepq_enqueue(sq, obj, sobj->sobj_name, sobj, false); /* * Disable preemption across this entire block, as we may drop * scheduler locks (allowing preemption), and would prefer not * to be interrupted while in a state of flux. */ KPREEMPT_DISABLE(l); KASSERT(lock == l->l_mutex); turnstile_lendpri(l); sleepq_block(0, false, sobj, nlocks); KPREEMPT_ENABLE(l); } /* * turnstile_wakeup: * * Wake up the specified number of threads that are blocked * in a turnstile. 
*/ void turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl) { sleepq_t *sq; kmutex_t *lock; u_int hash; lwp_t *l; hash = TS_HASH(ts->ts_obj); lock = &turnstile_locks[hash].lock; sq = &ts->ts_sleepq[q]; KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); KASSERT(count > 0); KASSERT(count <= TS_WAITERS(ts, q)); KASSERT(mutex_owned(lock)); KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL); /* * restore inherited priority if necessary. */ if (ts->ts_inheritor != NULL) { turnstile_unlendpri(ts); } if (nl != NULL) { #if defined(DEBUG) || defined(LOCKDEBUG) LIST_FOREACH(l, sq, l_sleepchain) { if (l == nl) break; } if (l == NULL) panic("turnstile_wakeup: nl not on sleepq"); #endif turnstile_remove(ts, nl, q); } else { while (count-- > 0) { l = LIST_FIRST(sq); KASSERT(l != NULL); turnstile_remove(ts, l, q); } } mutex_spin_exit(lock); } /* * turnstile_unsleep: * * Remove an LWP from the turnstile. This is called when the LWP has * not been awoken normally but instead interrupted: for example, if it * has received a signal. It's not a valid action for turnstiles, * since LWPs blocking on a turnstile are not interruptable. */ void turnstile_unsleep(lwp_t *l, bool cleanup) { lwp_unlock(l); panic("turnstile_unsleep"); } /* * turnstile_changepri: * * Adjust the priority of an LWP residing on a turnstile. */ void turnstile_changepri(lwp_t *l, pri_t pri) { /* XXX priority inheritance */ sleepq_changepri(l, pri); } #if defined(LOCKDEBUG) /* * turnstile_print: * * Given the address of a lock object, print the contents of a * turnstile. */ void turnstile_print(volatile void *obj, void (*pr)(const char *, ...)) { turnstile_t *ts; tschain_t *tc; sleepq_t *rsq, *wsq; u_int hash; lwp_t *l; hash = TS_HASH(obj); tc = &turnstile_chains[hash]; LIST_FOREACH(ts, tc, ts_chain) if (ts->ts_obj == obj) break; if (ts == NULL) { (*pr)("Turnstile: no active turnstile for this lock.\n"); return; } rsq = &ts->ts_sleepq[TS_READER_Q]; wsq = &ts->ts_sleepq[TS_WRITER_Q]; (*pr)("Turnstile:\n"); (*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q)); LIST_FOREACH(l, rsq, l_sleepchain) { (*pr)(" %p", l); } (*pr)("\n"); (*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q)); LIST_FOREACH(l, wsq, l_sleepchain) { (*pr)(" %p", l); } (*pr)("\n"); } #endif /* LOCKDEBUG */
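/*
 * Illustrative sketch (not part of kern_turnstile.c above): the TS_HASH()
 * shift of 6 means lock objects that share a 64-byte cache line also share
 * a turnstile chain and chain mutex.  The addresses below are made-up
 * values chosen only to demonstrate the arithmetic.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_TS_HASH_SIZE	128
#define EX_TS_HASH_MASK	(EX_TS_HASH_SIZE - 1)
#define EX_TS_HASH(obj)	(((uintptr_t)(obj) >> 6) & EX_TS_HASH_MASK)

int
main(void)
{
	uintptr_t lock_a = 0x1000;	/* first byte of a cache line */
	uintptr_t lock_b = 0x1020;	/* same 64-byte line as lock_a */
	uintptr_t lock_c = 0x1040;	/* start of the next line */

	/* Prints "64 64 65": a and b hash to the same chain, c does not. */
	printf("%lu %lu %lu\n",
	    (unsigned long)EX_TS_HASH(lock_a),
	    (unsigned long)EX_TS_HASH(lock_b),
	    (unsigned long)EX_TS_HASH(lock_c));
	return 0;
}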
/* $NetBSD: kern_time.c,v 1.221 2023/02/23 02:57:17 riastradh Exp $ */ /*- * Copyright (c) 2000, 2004, 2005, 2007, 2008, 2009, 2020 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christopher G. Demetriou, by Andrew Doran, and by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_time.c 8.4 (Berkeley) 5/26/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_time.c,v 1.221 2023/02/23 02:57:17 riastradh Exp $"); #include <sys/param.h> #include <sys/resourcevar.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/signalvar.h> #include <sys/syslog.h> #include <sys/timetc.h> #include <sys/timevar.h> #include <sys/timex.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <sys/cpu.h> kmutex_t itimer_mutex __cacheline_aligned; /* XXX static */ static struct itlist itimer_realtime_changed_notify; static void itimer_callout(void *); static void ptimer_intr(void *); static void *ptimer_sih __read_mostly; static TAILQ_HEAD(, ptimer) ptimer_queue; #define CLOCK_VIRTUAL_P(clockid) \ ((clockid) == CLOCK_VIRTUAL || (clockid) == CLOCK_PROF) CTASSERT(ITIMER_REAL == CLOCK_REALTIME); CTASSERT(ITIMER_VIRTUAL == CLOCK_VIRTUAL); CTASSERT(ITIMER_PROF == CLOCK_PROF); CTASSERT(ITIMER_MONOTONIC == CLOCK_MONOTONIC); #define DELAYTIMER_MAX 32 /* * Initialize timekeeping. */ void time_init(void) { mutex_init(&itimer_mutex, MUTEX_DEFAULT, IPL_SCHED); LIST_INIT(&itimer_realtime_changed_notify); TAILQ_INIT(&ptimer_queue); ptimer_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE, ptimer_intr, NULL); } /* * Check if the time will wrap if set to ts. * * ts - timespec describing the new time * delta - the delta between the current time and ts */ bool time_wraps(struct timespec *ts, struct timespec *delta) { /* * Don't allow the time to be set forward so far it * will wrap and become negative, thus allowing an * attacker to bypass the next check below. The * cutoff is 1 year before rollover occurs, so even * if the attacker uses adjtime(2) to move the time * past the cutoff, it will take a very long time * to get to the wrap point. */ if ((ts->tv_sec > LLONG_MAX - 365*24*60*60) || (delta->tv_sec < 0 || delta->tv_nsec < 0)) return true; return false; } /* * itimer_lock: * * Acquire the interval timer data lock. */ void itimer_lock(void) { mutex_spin_enter(&itimer_mutex); } /* * itimer_unlock: * * Release the interval timer data lock. */ void itimer_unlock(void) { mutex_spin_exit(&itimer_mutex); } /* * itimer_lock_held: * * Check that the interval timer lock is held for diagnostic * assertions. */ inline bool __diagused itimer_lock_held(void) { return mutex_owned(&itimer_mutex); } /* * Time of day and interval timer support. * * These routines provide the kernel entry points to get and set * the time-of-day and per-process interval timers. Subroutines * here provide support for adding and subtracting timeval structures * and decrementing interval timers, optionally reloading the interval * timers when they expire. 
*/ /* This function is used by clock_settime and settimeofday */ static int settime1(struct proc *p, const struct timespec *ts, bool check_kauth) { struct timespec delta, now; /* * The time being set to an unreasonable value will cause * unreasonable system behaviour. */ if (ts->tv_sec < 0 || ts->tv_sec > (1LL << 36)) return EINVAL; nanotime(&now); timespecsub(ts, &now, &delta); if (check_kauth && kauth_authorize_system(kauth_cred_get(), KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_SYSTEM, __UNCONST(ts), &delta, KAUTH_ARG(check_kauth ? false : true)) != 0) { return EPERM; } #ifdef notyet if ((delta.tv_sec < 86400) && securelevel > 0) { /* XXX elad - notyet */ return EPERM; } #endif tc_setclock(ts); resettodr(); /* * Notify pending CLOCK_REALTIME timers about the real time change. * There may be inactive timers on this list, but this happens * comparatively less often than timers firing, and so it's better * to put the extra checks here than to complicate the other code * path. */ struct itimer *it; itimer_lock(); LIST_FOREACH(it, &itimer_realtime_changed_notify, it_rtchgq) { KASSERT(it->it_ops->ito_realtime_changed != NULL); if (timespecisset(&it->it_time.it_value)) { (*it->it_ops->ito_realtime_changed)(it); } } itimer_unlock(); return 0; } int settime(struct proc *p, struct timespec *ts) { return settime1(p, ts, true); } /* ARGSUSED */ int sys___clock_gettime50(struct lwp *l, const struct sys___clock_gettime50_args *uap, register_t *retval) { /* { syscallarg(clockid_t) clock_id; syscallarg(struct timespec *) tp; } */ int error; struct timespec ats; error = clock_gettime1(SCARG(uap, clock_id), &ats); if (error != 0) return error; return copyout(&ats, SCARG(uap, tp), sizeof(ats)); } /* ARGSUSED */ int sys___clock_settime50(struct lwp *l, const struct sys___clock_settime50_args *uap, register_t *retval) { /* { syscallarg(clockid_t) clock_id; syscallarg(const struct timespec *) tp; } */ int error; struct timespec ats; if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0) return error; return clock_settime1(l->l_proc, SCARG(uap, clock_id), &ats, true); } int clock_settime1(struct proc *p, clockid_t clock_id, const struct timespec *tp, bool check_kauth) { int error; if (tp->tv_nsec < 0 || tp->tv_nsec >= 1000000000L) return EINVAL; switch (clock_id) { case CLOCK_REALTIME: if ((error = settime1(p, tp, check_kauth)) != 0) return error; break; case CLOCK_MONOTONIC: return EINVAL; /* read-only clock */ default: return EINVAL; } return 0; } int sys___clock_getres50(struct lwp *l, const struct sys___clock_getres50_args *uap, register_t *retval) { /* { syscallarg(clockid_t) clock_id; syscallarg(struct timespec *) tp; } */ struct timespec ts; int error; if ((error = clock_getres1(SCARG(uap, clock_id), &ts)) != 0) return error; if (SCARG(uap, tp)) error = copyout(&ts, SCARG(uap, tp), sizeof(ts)); return error; } int clock_getres1(clockid_t clock_id, struct timespec *ts) { switch (clock_id) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: ts->tv_sec = 0; if (tc_getfrequency() > 1000000000) ts->tv_nsec = 1; else ts->tv_nsec = 1000000000 / tc_getfrequency(); break; default: return EINVAL; } return 0; } /* ARGSUSED */ int sys___nanosleep50(struct lwp *l, const struct sys___nanosleep50_args *uap, register_t *retval) { /* { syscallarg(struct timespec *) rqtp; syscallarg(struct timespec *) rmtp; } */ struct timespec rmt, rqt; int error, error1; error = copyin(SCARG(uap, rqtp), &rqt, sizeof(struct timespec)); if (error) return error; error = nanosleep1(l, CLOCK_MONOTONIC, 0, &rqt, SCARG(uap, rmtp) ? 
&rmt : NULL); if (SCARG(uap, rmtp) == NULL || (error != 0 && error != EINTR)) return error; error1 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt)); return error1 ? error1 : error; } /* ARGSUSED */ int sys_clock_nanosleep(struct lwp *l, const struct sys_clock_nanosleep_args *uap, register_t *retval) { /* { syscallarg(clockid_t) clock_id; syscallarg(int) flags; syscallarg(struct timespec *) rqtp; syscallarg(struct timespec *) rmtp; } */ struct timespec rmt, rqt; int error, error1; error = copyin(SCARG(uap, rqtp), &rqt, sizeof(struct timespec)); if (error) goto out; error = nanosleep1(l, SCARG(uap, clock_id), SCARG(uap, flags), &rqt, SCARG(uap, rmtp) ? &rmt : NULL); if (SCARG(uap, rmtp) == NULL || (error != 0 && error != EINTR)) goto out; if ((SCARG(uap, flags) & TIMER_ABSTIME) == 0 && (error1 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt))) != 0) error = error1; out: *retval = error; return 0; } int nanosleep1(struct lwp *l, clockid_t clock_id, int flags, struct timespec *rqt, struct timespec *rmt) { struct timespec rmtstart; int error, timo; if ((error = ts2timo(clock_id, flags, rqt, &timo, &rmtstart)) != 0) { if (error == ETIMEDOUT) { error = 0; if (rmt != NULL) rmt->tv_sec = rmt->tv_nsec = 0; } return error; } /* * Avoid inadvertently sleeping forever */ if (timo == 0) timo = 1; again: error = kpause("nanoslp", true, timo, NULL); if (error == EWOULDBLOCK) error = 0; if (rmt != NULL || error == 0) { struct timespec rmtend; struct timespec t0; struct timespec *t; int err; err = clock_gettime1(clock_id, &rmtend); if (err != 0) return err; t = (rmt != NULL) ? rmt : &t0; if (flags & TIMER_ABSTIME) { timespecsub(rqt, &rmtend, t); } else { if (timespeccmp(&rmtend, &rmtstart, <)) timespecclear(t); /* clock wound back */ else timespecsub(&rmtend, &rmtstart, t); if (timespeccmp(rqt, t, <)) timespecclear(t); else timespecsub(rqt, t, t); } if (t->tv_sec < 0) timespecclear(t); if (error == 0) { timo = tstohz(t); if (timo > 0) goto again; } } if (error == ERESTART) error = EINTR; return error; } int sys_clock_getcpuclockid2(struct lwp *l, const struct sys_clock_getcpuclockid2_args *uap, register_t *retval) { /* { syscallarg(idtype_t idtype; syscallarg(id_t id); syscallarg(clockid_t *)clock_id; } */ pid_t pid; lwpid_t lid; clockid_t clock_id; id_t id = SCARG(uap, id); switch (SCARG(uap, idtype)) { case P_PID: pid = id == 0 ? l->l_proc->p_pid : id; clock_id = CLOCK_PROCESS_CPUTIME_ID | pid; break; case P_LWPID: lid = id == 0 ? l->l_lid : id; clock_id = CLOCK_THREAD_CPUTIME_ID | lid; break; default: return EINVAL; } return copyout(&clock_id, SCARG(uap, clock_id), sizeof(clock_id)); } /* ARGSUSED */ int sys___gettimeofday50(struct lwp *l, const struct sys___gettimeofday50_args *uap, register_t *retval) { /* { syscallarg(struct timeval *) tp; syscallarg(void *) tzp; really "struct timezone *"; } */ struct timeval atv; int error = 0; struct timezone tzfake; if (SCARG(uap, tp)) { memset(&atv, 0, sizeof(atv)); microtime(&atv); error = copyout(&atv, SCARG(uap, tp), sizeof(atv)); if (error) return error; } if (SCARG(uap, tzp)) { /* * NetBSD has no kernel notion of time zone, so we just * fake up a timezone struct and return it if demanded. 
*/ tzfake.tz_minuteswest = 0; tzfake.tz_dsttime = 0; error = copyout(&tzfake, SCARG(uap, tzp), sizeof(tzfake)); } return error; } /* ARGSUSED */ int sys___settimeofday50(struct lwp *l, const struct sys___settimeofday50_args *uap, register_t *retval) { /* { syscallarg(const struct timeval *) tv; syscallarg(const void *) tzp; really "const struct timezone *"; } */ return settimeofday1(SCARG(uap, tv), true, SCARG(uap, tzp), l, true); } int settimeofday1(const struct timeval *utv, bool userspace, const void *utzp, struct lwp *l, bool check_kauth) { struct timeval atv; struct timespec ts; int error; /* Verify all parameters before changing time. */ /* * NetBSD has no kernel notion of time zone, and only an * obsolete program would try to set it, so we log a warning. */ if (utzp) log(LOG_WARNING, "pid %d attempted to set the " "(obsolete) kernel time zone\n", l->l_proc->p_pid); if (utv == NULL) return 0; if (userspace) { if ((error = copyin(utv, &atv, sizeof(atv))) != 0) return error; utv = &atv; } if (utv->tv_usec < 0 || utv->tv_usec >= 1000000) return EINVAL; TIMEVAL_TO_TIMESPEC(utv, &ts); return settime1(l->l_proc, &ts, check_kauth); } int time_adjusted; /* set if an adjustment is made */ /* ARGSUSED */ int sys___adjtime50(struct lwp *l, const struct sys___adjtime50_args *uap, register_t *retval) { /* { syscallarg(const struct timeval *) delta; syscallarg(struct timeval *) olddelta; } */ int error; struct timeval atv, oldatv; if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_ADJTIME, NULL, NULL, NULL)) != 0) return error; if (SCARG(uap, delta)) { error = copyin(SCARG(uap, delta), &atv, sizeof(*SCARG(uap, delta))); if (error) return error; } adjtime1(SCARG(uap, delta) ? &atv : NULL, SCARG(uap, olddelta) ? &oldatv : NULL, l->l_proc); if (SCARG(uap, olddelta)) error = copyout(&oldatv, SCARG(uap, olddelta), sizeof(*SCARG(uap, olddelta))); return error; } void adjtime1(const struct timeval *delta, struct timeval *olddelta, struct proc *p) { if (olddelta) { memset(olddelta, 0, sizeof(*olddelta)); mutex_spin_enter(&timecounter_lock); olddelta->tv_sec = time_adjtime / 1000000; olddelta->tv_usec = time_adjtime % 1000000; if (olddelta->tv_usec < 0) { olddelta->tv_usec += 1000000; olddelta->tv_sec--; } mutex_spin_exit(&timecounter_lock); } if (delta) { mutex_spin_enter(&timecounter_lock); /* * XXX This should maybe just report failure to * userland for nonsense deltas. */ if (delta->tv_sec > INT64_MAX/1000000 - 1) { time_adjtime = INT64_MAX; } else if (delta->tv_sec < INT64_MIN/1000000 + 1) { time_adjtime = INT64_MIN; } else { time_adjtime = delta->tv_sec * 1000000 + MAX(-999999, MIN(999999, delta->tv_usec)); } if (time_adjtime) { /* We need to save the system time during shutdown */ time_adjusted |= 1; } mutex_spin_exit(&timecounter_lock); } } /* * Interval timer support. * * The itimer_*() routines provide generic support for interval timers, * both real (CLOCK_REALTIME, CLOCK_MONOTIME), and virtual (CLOCK_VIRTUAL, * CLOCK_PROF). * * Real timers keep their deadline as an absolute time, and are fired * by a callout. Virtual timers are kept as a linked-list of deltas, * and are processed by hardclock(). * * Because the real time timer callout may be delayed in real time due * to interrupt processing on the system, it is possible for the real * time timeout routine (itimer_callout()) run past after its deadline. * It does not suffice, therefore, to reload the real timer .it_value * from the timer's .it_interval. 
Rather, we compute the next deadline * in absolute time based on the current time and the .it_interval value, * and report any overruns. * * Note that while the virtual timers are supported in a generic fashion * here, they only (currently) make sense as per-process timers, and thus * only really work for that case. */ /* * itimer_init: * * Initialize the common data for an interval timer. */ void itimer_init(struct itimer * const it, const struct itimer_ops * const ops, clockid_t const id, struct itlist * const itl) { KASSERT(itimer_lock_held()); KASSERT(ops != NULL); timespecclear(&it->it_time.it_value); it->it_ops = ops; it->it_clockid = id; it->it_overruns = 0; it->it_dying = false; if (!CLOCK_VIRTUAL_P(id)) { KASSERT(itl == NULL); callout_init(&it->it_ch, CALLOUT_MPSAFE); callout_setfunc(&it->it_ch, itimer_callout, it); if (id == CLOCK_REALTIME && ops->ito_realtime_changed != NULL) { LIST_INSERT_HEAD(&itimer_realtime_changed_notify, it, it_rtchgq); } } else { KASSERT(itl != NULL); it->it_vlist = itl; it->it_active = false; } } /* * itimer_poison: * * Poison an interval timer, preventing it from being scheduled * or processed, in preparation for freeing the timer. */ void itimer_poison(struct itimer * const it) { KASSERT(itimer_lock_held()); it->it_dying = true; /* * For non-virtual timers, stop the callout, or wait for it to * run if it has already fired. It cannot restart again after * this point: the callout won't restart itself when dying, no * other users holding the lock can restart it, and any other * users waiting for callout_halt concurrently (itimer_settime) * will restart from the top. */ if (!CLOCK_VIRTUAL_P(it->it_clockid)) { callout_halt(&it->it_ch, &itimer_mutex); if (it->it_clockid == CLOCK_REALTIME && it->it_ops->ito_realtime_changed != NULL) { LIST_REMOVE(it, it_rtchgq); } } } /* * itimer_fini: * * Release resources used by an interval timer. * * N.B. itimer_lock must be held on entry, and is released on exit. */ void itimer_fini(struct itimer * const it) { KASSERT(itimer_lock_held()); /* All done with the global state. */ itimer_unlock(); /* Destroy the callout, if needed. */ if (!CLOCK_VIRTUAL_P(it->it_clockid)) callout_destroy(&it->it_ch); } /* * itimer_decr: * * Decrement an interval timer by a specified number of nanoseconds, * which must be less than a second, i.e. < 1000000000. If the timer * expires, then reload it. In this case, carry over (nsec - old value) * to reduce the value reloaded into the timer so that the timer does * not drift. This routine assumes that it is called in a context where * the timers on which it is operating cannot change in value. * * Returns true if the timer has expired. 
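 *
 * For example (illustrative numbers, assumed for this note): if
 * .it_value has 400 ns remaining and the routine is called with
 * nsec = 1000, the timer expires with a carry of 1000 - 400 = 600 ns
 * into the next interval.  With .it_interval = 1000 ns, the value is
 * reloaded to 1000 - 600 = 400 ns, so subsequent expirations stay on
 * the original 1000 ns grid instead of drifting by the overshoot.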
*/ static bool itimer_decr(struct itimer *it, int nsec) { struct itimerspec *itp; int error __diagused; KASSERT(itimer_lock_held()); KASSERT(CLOCK_VIRTUAL_P(it->it_clockid)); itp = &it->it_time; if (itp->it_value.tv_nsec < nsec) { if (itp->it_value.tv_sec == 0) { /* expired, and already in next interval */ nsec -= itp->it_value.tv_nsec; goto expire; } itp->it_value.tv_nsec += 1000000000; itp->it_value.tv_sec--; } itp->it_value.tv_nsec -= nsec; nsec = 0; if (timespecisset(&itp->it_value)) return false; /* expired, exactly at end of interval */ expire: if (timespecisset(&itp->it_interval)) { itp->it_value = itp->it_interval; itp->it_value.tv_nsec -= nsec; if (itp->it_value.tv_nsec < 0) { itp->it_value.tv_nsec += 1000000000; itp->it_value.tv_sec--; } error = itimer_settime(it); KASSERT(error == 0); /* virtual, never fails */ } else itp->it_value.tv_nsec = 0; /* sec is already 0 */ return true; } /* * itimer_arm_real: * * Arm a non-virtual timer. */ static void itimer_arm_real(struct itimer * const it) { KASSERT(!it->it_dying); KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid)); KASSERT(!callout_pending(&it->it_ch)); /* * Don't need to check tshzto() return value, here. * callout_schedule() does it for us. */ callout_schedule(&it->it_ch, (it->it_clockid == CLOCK_MONOTONIC ? tshztoup(&it->it_time.it_value) : tshzto(&it->it_time.it_value))); } /* * itimer_callout: * * Callout to expire a non-virtual timer. Queue it up for processing, * and then reload, if it is configured to do so. * * N.B. A delay in processing this callout causes multiple * SIGALRM calls to be compressed into one. */ static void itimer_callout(void *arg) { uint64_t last_val, next_val, interval, now_ns; struct timespec now, next; struct itimer * const it = arg; int backwards; itimer_lock(); (*it->it_ops->ito_fire)(it); if (!timespecisset(&it->it_time.it_interval)) { timespecclear(&it->it_time.it_value); itimer_unlock(); return; } if (it->it_clockid == CLOCK_MONOTONIC) { getnanouptime(&now); } else { getnanotime(&now); } backwards = (timespeccmp(&it->it_time.it_value, &now, >)); /* Nonnegative interval guaranteed by itimerfix. */ KASSERT(it->it_time.it_interval.tv_sec >= 0); KASSERT(it->it_time.it_interval.tv_nsec >= 0); /* Handle the easy case of non-overflown timers first. */ if (!backwards && timespecaddok(&it->it_time.it_value, &it->it_time.it_interval)) { timespecadd(&it->it_time.it_value, &it->it_time.it_interval, &next); it->it_time.it_value = next; } else { now_ns = timespec2ns(&now); last_val = timespec2ns(&it->it_time.it_value); interval = timespec2ns(&it->it_time.it_interval); next_val = now_ns + (now_ns - last_val + interval - 1) % interval; if (backwards) next_val += interval; else it->it_overruns += (now_ns - last_val) / interval; it->it_time.it_value.tv_sec = next_val / 1000000000; it->it_time.it_value.tv_nsec = next_val % 1000000000; } /* * Reset the callout, if it's not going away. */ if (!it->it_dying) itimer_arm_real(it); itimer_unlock(); } /* * itimer_settime: * * Set up the given interval timer. The value in it->it_time.it_value * is taken to be an absolute time for CLOCK_REALTIME/CLOCK_MONOTONIC * timers and a relative time for CLOCK_VIRTUAL/CLOCK_PROF timers. * * If the callout had already fired but not yet run, fails with * ERESTART -- caller must restart from the top to look up a timer. */ int itimer_settime(struct itimer *it) { struct itimer *itn, *pitn; struct itlist *itl; KASSERT(itimer_lock_held()); KASSERT(!it->it_dying); if (!CLOCK_VIRTUAL_P(it->it_clockid)) { /* * Try to stop the callout. 
However, if it had already * fired, we have to drop the lock to wait for it, so * the world may have changed and pt may not be there * any more. In that case, tell the caller to start * over from the top. */ if (callout_halt(&it->it_ch, &itimer_mutex)) return ERESTART; KASSERT(!it->it_dying); /* Now we can touch it and start it up again. */ if (timespecisset(&it->it_time.it_value)) itimer_arm_real(it); } else { if (it->it_active) { itn = LIST_NEXT(it, it_list); LIST_REMOVE(it, it_list); for ( ; itn; itn = LIST_NEXT(itn, it_list)) timespecadd(&it->it_time.it_value, &itn->it_time.it_value, &itn->it_time.it_value); } if (timespecisset(&it->it_time.it_value)) { itl = it->it_vlist; for (itn = LIST_FIRST(itl), pitn = NULL; itn && timespeccmp(&it->it_time.it_value, &itn->it_time.it_value, >); pitn = itn, itn = LIST_NEXT(itn, it_list)) timespecsub(&it->it_time.it_value, &itn->it_time.it_value, &it->it_time.it_value); if (pitn) LIST_INSERT_AFTER(pitn, it, it_list); else LIST_INSERT_HEAD(itl, it, it_list); for ( ; itn ; itn = LIST_NEXT(itn, it_list)) timespecsub(&itn->it_time.it_value, &it->it_time.it_value, &itn->it_time.it_value); it->it_active = true; } else { it->it_active = false; } } /* Success! */ return 0; } /* * itimer_gettime: * * Return the remaining time of an interval timer. */ void itimer_gettime(const struct itimer *it, struct itimerspec *aits) { struct timespec now; struct itimer *itn; KASSERT(itimer_lock_held()); KASSERT(!it->it_dying); *aits = it->it_time; if (!CLOCK_VIRTUAL_P(it->it_clockid)) { /* * Convert from absolute to relative time in .it_value * part of real time timer. If time for real time * timer has passed return 0, else return difference * between current time and time for the timer to go * off. */ if (timespecisset(&aits->it_value)) { if (it->it_clockid == CLOCK_REALTIME) { getnanotime(&now); } else { /* CLOCK_MONOTONIC */ getnanouptime(&now); } if (timespeccmp(&aits->it_value, &now, <)) timespecclear(&aits->it_value); else timespecsub(&aits->it_value, &now, &aits->it_value); } } else if (it->it_active) { for (itn = LIST_FIRST(it->it_vlist); itn && itn != it; itn = LIST_NEXT(itn, it_list)) timespecadd(&aits->it_value, &itn->it_time.it_value, &aits->it_value); KASSERT(itn != NULL); /* it should be findable on the list */ } else timespecclear(&aits->it_value); } /* * Per-process timer support. * * Both the BSD getitimer() family and the POSIX timer_*() family of * routines are supported. * * All timers are kept in an array pointed to by p_timers, which is * allocated on demand - many processes don't use timers at all. The * first four elements in this array are reserved for the BSD timers: * element 0 is ITIMER_REAL, element 1 is ITIMER_VIRTUAL, element * 2 is ITIMER_PROF, and element 3 is ITIMER_MONOTONIC. The rest may be * allocated by the timer_create() syscall. * * These timers are a "sub-class" of interval timer. */ /* * ptimer_free: * * Free the per-process timer at the specified index. */ static void ptimer_free(struct ptimers *pts, int index) { struct itimer *it; struct ptimer *pt; KASSERT(itimer_lock_held()); it = pts->pts_timers[index]; pt = container_of(it, struct ptimer, pt_itimer); pts->pts_timers[index] = NULL; itimer_poison(it); /* * Remove it from the queue to be signalled. Must be done * after itimer is poisoned, because we may have had to wait * for the callout to complete. 
*/ if (pt->pt_queued) { TAILQ_REMOVE(&ptimer_queue, pt, pt_chain); pt->pt_queued = false; } itimer_fini(it); /* releases itimer_lock */ kmem_free(pt, sizeof(*pt)); } /* * ptimers_alloc: * * Allocate a ptimers for the specified process. */ static struct ptimers * ptimers_alloc(struct proc *p) { struct ptimers *pts; int i; pts = kmem_alloc(sizeof(*pts), KM_SLEEP); LIST_INIT(&pts->pts_virtual); LIST_INIT(&pts->pts_prof); for (i = 0; i < TIMER_MAX; i++) pts->pts_timers[i] = NULL; itimer_lock(); if (p->p_timers == NULL) { p->p_timers = pts; itimer_unlock(); return pts; } itimer_unlock(); kmem_free(pts, sizeof(*pts)); return p->p_timers; } /* * ptimers_free: * * Clean up the per-process timers. If "which" is set to TIMERS_ALL, * then clean up all timers and free all the data structures. If * "which" is set to TIMERS_POSIX, only clean up the timers allocated * by timer_create(), not the BSD setitimer() timers, and only free the * structure if none of those remain. * * This function is exported because it is needed in the exec and * exit code paths. */ void ptimers_free(struct proc *p, int which) { struct ptimers *pts; struct itimer *itn; struct timespec ts; int i; if (p->p_timers == NULL) return; pts = p->p_timers; itimer_lock(); if (which == TIMERS_ALL) { p->p_timers = NULL; i = 0; } else { timespecclear(&ts); for (itn = LIST_FIRST(&pts->pts_virtual); itn && itn != pts->pts_timers[ITIMER_VIRTUAL]; itn = LIST_NEXT(itn, it_list)) { KASSERT(itn->it_clockid == CLOCK_VIRTUAL); timespecadd(&ts, &itn->it_time.it_value, &ts); } LIST_FIRST(&pts->pts_virtual) = NULL; if (itn) { KASSERT(itn->it_clockid == CLOCK_VIRTUAL); timespecadd(&ts, &itn->it_time.it_value, &itn->it_time.it_value); LIST_INSERT_HEAD(&pts->pts_virtual, itn, it_list); } timespecclear(&ts); for (itn = LIST_FIRST(&pts->pts_prof); itn && itn != pts->pts_timers[ITIMER_PROF]; itn = LIST_NEXT(itn, it_list)) { KASSERT(itn->it_clockid == CLOCK_PROF); timespecadd(&ts, &itn->it_time.it_value, &ts); } LIST_FIRST(&pts->pts_prof) = NULL; if (itn) { KASSERT(itn->it_clockid == CLOCK_PROF); timespecadd(&ts, &itn->it_time.it_value, &itn->it_time.it_value); LIST_INSERT_HEAD(&pts->pts_prof, itn, it_list); } i = TIMER_MIN; } for ( ; i < TIMER_MAX; i++) { if (pts->pts_timers[i] != NULL) { /* Free the timer and release the lock. */ ptimer_free(pts, i); /* Reacquire the lock for the next one. */ itimer_lock(); } } if (pts->pts_timers[0] == NULL && pts->pts_timers[1] == NULL && pts->pts_timers[2] == NULL && pts->pts_timers[3] == NULL) { p->p_timers = NULL; itimer_unlock(); kmem_free(pts, sizeof(*pts)); } else itimer_unlock(); } /* * ptimer_fire: * * Fire a per-process timer. */ static void ptimer_fire(struct itimer *it) { struct ptimer *pt = container_of(it, struct ptimer, pt_itimer); KASSERT(itimer_lock_held()); /* * XXX Can overrun, but we don't do signal queueing yet, anyway. * XXX Relying on the clock interrupt is stupid. */ if (pt->pt_ev.sigev_notify != SIGEV_SIGNAL) { return; } if (!pt->pt_queued) { TAILQ_INSERT_TAIL(&ptimer_queue, pt, pt_chain); pt->pt_queued = true; softint_schedule(ptimer_sih); } } /* * Operations vector for per-process timers (BSD and POSIX). */ static const struct itimer_ops ptimer_itimer_ops = { .ito_fire = ptimer_fire, }; /* * sys_timer_create: * * System call to create a POSIX timer. 
*/ int sys_timer_create(struct lwp *l, const struct sys_timer_create_args *uap, register_t *retval) { /* { syscallarg(clockid_t) clock_id; syscallarg(struct sigevent *) evp; syscallarg(timer_t *) timerid; } */ return timer_create1(SCARG(uap, timerid), SCARG(uap, clock_id), SCARG(uap, evp), copyin, l); } int timer_create1(timer_t *tid, clockid_t id, struct sigevent *evp, copyin_t fetch_event, struct lwp *l) { int error; timer_t timerid; struct itlist *itl; struct ptimers *pts; struct ptimer *pt; struct proc *p; p = l->l_proc; if ((u_int)id > CLOCK_MONOTONIC) return EINVAL; if ((pts = p->p_timers) == NULL) pts = ptimers_alloc(p); pt = kmem_zalloc(sizeof(*pt), KM_SLEEP); if (evp != NULL) { if (((error = (*fetch_event)(evp, &pt->pt_ev, sizeof(pt->pt_ev))) != 0) || ((pt->pt_ev.sigev_notify < SIGEV_NONE) || (pt->pt_ev.sigev_notify > SIGEV_SA)) || (pt->pt_ev.sigev_notify == SIGEV_SIGNAL && (pt->pt_ev.sigev_signo <= 0 || pt->pt_ev.sigev_signo >= NSIG))) { kmem_free(pt, sizeof(*pt)); return (error ? error : EINVAL); } } /* Find a free timer slot, skipping those reserved for setitimer(). */ itimer_lock(); for (timerid = TIMER_MIN; timerid < TIMER_MAX; timerid++) if (pts->pts_timers[timerid] == NULL) break; if (timerid == TIMER_MAX) { itimer_unlock(); kmem_free(pt, sizeof(*pt)); return EAGAIN; } if (evp == NULL) { pt->pt_ev.sigev_notify = SIGEV_SIGNAL; switch (id) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: pt->pt_ev.sigev_signo = SIGALRM; break; case CLOCK_VIRTUAL: pt->pt_ev.sigev_signo = SIGVTALRM; break; case CLOCK_PROF: pt->pt_ev.sigev_signo = SIGPROF; break; } pt->pt_ev.sigev_value.sival_int = timerid; } switch (id) { case CLOCK_VIRTUAL: itl = &pts->pts_virtual; break; case CLOCK_PROF: itl = &pts->pts_prof; break; default: itl = NULL; } itimer_init(&pt->pt_itimer, &ptimer_itimer_ops, id, itl); pt->pt_proc = p; pt->pt_poverruns = 0; pt->pt_entry = timerid; pt->pt_queued = false; pts->pts_timers[timerid] = &pt->pt_itimer; itimer_unlock(); return copyout(&timerid, tid, sizeof(timerid)); } /* * sys_timer_delete: * * System call to delete a POSIX timer. */ int sys_timer_delete(struct lwp *l, const struct sys_timer_delete_args *uap, register_t *retval) { /* { syscallarg(timer_t) timerid; } */ struct proc *p = l->l_proc; timer_t timerid; struct ptimers *pts; struct itimer *it, *itn; timerid = SCARG(uap, timerid); pts = p->p_timers; if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX) return EINVAL; itimer_lock(); if ((it = pts->pts_timers[timerid]) == NULL) { itimer_unlock(); return EINVAL; } if (CLOCK_VIRTUAL_P(it->it_clockid)) { if (it->it_active) { itn = LIST_NEXT(it, it_list); LIST_REMOVE(it, it_list); for ( ; itn; itn = LIST_NEXT(itn, it_list)) timespecadd(&it->it_time.it_value, &itn->it_time.it_value, &itn->it_time.it_value); it->it_active = false; } } /* Free the timer and release the lock. */ ptimer_free(pts, timerid); return 0; } /* * sys___timer_settime50: * * System call to set/arm a POSIX timer. 
*/ int sys___timer_settime50(struct lwp *l, const struct sys___timer_settime50_args *uap, register_t *retval) { /* { syscallarg(timer_t) timerid; syscallarg(int) flags; syscallarg(const struct itimerspec *) value; syscallarg(struct itimerspec *) ovalue; } */ int error; struct itimerspec value, ovalue, *ovp = NULL; if ((error = copyin(SCARG(uap, value), &value, sizeof(struct itimerspec))) != 0) return error; if (SCARG(uap, ovalue)) ovp = &ovalue; if ((error = dotimer_settime(SCARG(uap, timerid), &value, ovp, SCARG(uap, flags), l->l_proc)) != 0) return error; if (ovp) return copyout(&ovalue, SCARG(uap, ovalue), sizeof(struct itimerspec)); return 0; } int dotimer_settime(int timerid, struct itimerspec *value, struct itimerspec *ovalue, int flags, struct proc *p) { struct timespec now; struct itimerspec val, oval; struct ptimers *pts; struct itimer *it; int error; pts = p->p_timers; if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX) return EINVAL; val = *value; if ((error = itimespecfix(&val.it_value)) != 0 || (error = itimespecfix(&val.it_interval)) != 0) return error; itimer_lock(); restart: if ((it = pts->pts_timers[timerid]) == NULL) { itimer_unlock(); return EINVAL; } oval = it->it_time; it->it_time = val; /* * If we've been passed a relative time for a realtime timer, * convert it to absolute; if an absolute time for a virtual * timer, convert it to relative and make sure we don't set it * to zero, which would cancel the timer, or let it go * negative, which would confuse the comparison tests. */ if (timespecisset(&it->it_time.it_value)) { if (!CLOCK_VIRTUAL_P(it->it_clockid)) { if ((flags & TIMER_ABSTIME) == 0) { if (it->it_clockid == CLOCK_REALTIME) { getnanotime(&now); } else { /* CLOCK_MONOTONIC */ getnanouptime(&now); } timespecadd(&it->it_time.it_value, &now, &it->it_time.it_value); } } else { if ((flags & TIMER_ABSTIME) != 0) { getnanotime(&now); timespecsub(&it->it_time.it_value, &now, &it->it_time.it_value); if (!timespecisset(&it->it_time.it_value) || it->it_time.it_value.tv_sec < 0) { it->it_time.it_value.tv_sec = 0; it->it_time.it_value.tv_nsec = 1; } } } } error = itimer_settime(it); if (error == ERESTART) { KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid)); goto restart; } KASSERT(error == 0); itimer_unlock(); if (ovalue) *ovalue = oval; return 0; } /* * sys___timer_gettime50: * * System call to return the time remaining until a POSIX timer fires. */ int sys___timer_gettime50(struct lwp *l, const struct sys___timer_gettime50_args *uap, register_t *retval) { /* { syscallarg(timer_t) timerid; syscallarg(struct itimerspec *) value; } */ struct itimerspec its; int error; if ((error = dotimer_gettime(SCARG(uap, timerid), l->l_proc, &its)) != 0) return error; return copyout(&its, SCARG(uap, value), sizeof(its)); } int dotimer_gettime(int timerid, struct proc *p, struct itimerspec *its) { struct itimer *it; struct ptimers *pts; pts = p->p_timers; if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX) return EINVAL; itimer_lock(); if ((it = pts->pts_timers[timerid]) == NULL) { itimer_unlock(); return EINVAL; } itimer_gettime(it, its); itimer_unlock(); return 0; } /* * sys_timer_getoverrun: * * System call to return the number of times a POSIX timer has * expired while a notification was already pending. The counter * is reset when a timer expires and a notification can be posted. 
*/ int sys_timer_getoverrun(struct lwp *l, const struct sys_timer_getoverrun_args *uap, register_t *retval) { /* { syscallarg(timer_t) timerid; } */ struct proc *p = l->l_proc; struct ptimers *pts; int timerid; struct itimer *it; struct ptimer *pt; timerid = SCARG(uap, timerid); pts = p->p_timers; if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX) return EINVAL; itimer_lock(); if ((it = pts->pts_timers[timerid]) == NULL) { itimer_unlock(); return EINVAL; } pt = container_of(it, struct ptimer, pt_itimer); *retval = pt->pt_poverruns; if (*retval >= DELAYTIMER_MAX) *retval = DELAYTIMER_MAX; itimer_unlock(); return 0; } /* * sys___getitimer50: * * System call to get the time remaining before a BSD timer fires. */ int sys___getitimer50(struct lwp *l, const struct sys___getitimer50_args *uap, register_t *retval) { /* { syscallarg(int) which; syscallarg(struct itimerval *) itv; } */ struct proc *p = l->l_proc; struct itimerval aitv; int error; memset(&aitv, 0, sizeof(aitv)); error = dogetitimer(p, SCARG(uap, which), &aitv); if (error) return error; return copyout(&aitv, SCARG(uap, itv), sizeof(struct itimerval)); } int dogetitimer(struct proc *p, int which, struct itimerval *itvp) { struct ptimers *pts; struct itimer *it; struct itimerspec its; if ((u_int)which > ITIMER_MONOTONIC) return EINVAL; itimer_lock(); pts = p->p_timers; if (pts == NULL || (it = pts->pts_timers[which]) == NULL) { timerclear(&itvp->it_value); timerclear(&itvp->it_interval); } else { itimer_gettime(it, &its); TIMESPEC_TO_TIMEVAL(&itvp->it_value, &its.it_value); TIMESPEC_TO_TIMEVAL(&itvp->it_interval, &its.it_interval); } itimer_unlock(); return 0; } /* * sys___setitimer50: * * System call to set/arm a BSD timer. */ int sys___setitimer50(struct lwp *l, const struct sys___setitimer50_args *uap, register_t *retval) { /* { syscallarg(int) which; syscallarg(const struct itimerval *) itv; syscallarg(struct itimerval *) oitv; } */ struct proc *p = l->l_proc; int which = SCARG(uap, which); struct sys___getitimer50_args getargs; const struct itimerval *itvp; struct itimerval aitv; int error; itvp = SCARG(uap, itv); if (itvp && (error = copyin(itvp, &aitv, sizeof(struct itimerval))) != 0) return error; if (SCARG(uap, oitv) != NULL) { SCARG(&getargs, which) = which; SCARG(&getargs, itv) = SCARG(uap, oitv); if ((error = sys___getitimer50(l, &getargs, retval)) != 0) return error; } if (itvp == 0) return 0; return dosetitimer(p, which, &aitv); } int dosetitimer(struct proc *p, int which, struct itimerval *itvp) { struct timespec now; struct ptimers *pts; struct ptimer *spare; struct itimer *it; struct itlist *itl; int error; if ((u_int)which > ITIMER_MONOTONIC) return EINVAL; if (itimerfix(&itvp->it_value) || itimerfix(&itvp->it_interval)) return EINVAL; /* * Don't bother allocating data structures if the process just * wants to clear the timer. 
*/ spare = NULL; pts = p->p_timers; retry: if (!timerisset(&itvp->it_value) && (pts == NULL || pts->pts_timers[which] == NULL)) return 0; if (pts == NULL) pts = ptimers_alloc(p); itimer_lock(); restart: it = pts->pts_timers[which]; if (it == NULL) { struct ptimer *pt; if (spare == NULL) { itimer_unlock(); spare = kmem_zalloc(sizeof(*spare), KM_SLEEP); goto retry; } pt = spare; spare = NULL; it = &pt->pt_itimer; pt->pt_ev.sigev_notify = SIGEV_SIGNAL; pt->pt_ev.sigev_value.sival_int = which; switch (which) { case ITIMER_REAL: case ITIMER_MONOTONIC: itl = NULL; pt->pt_ev.sigev_signo = SIGALRM; break; case ITIMER_VIRTUAL: itl = &pts->pts_virtual; pt->pt_ev.sigev_signo = SIGVTALRM; break; case ITIMER_PROF: itl = &pts->pts_prof; pt->pt_ev.sigev_signo = SIGPROF; break; default: panic("%s: can't happen %d", __func__, which); } itimer_init(it, &ptimer_itimer_ops, which, itl); pt->pt_proc = p; pt->pt_entry = which; pts->pts_timers[which] = it; } TIMEVAL_TO_TIMESPEC(&itvp->it_value, &it->it_time.it_value); TIMEVAL_TO_TIMESPEC(&itvp->it_interval, &it->it_time.it_interval); error = 0; if (timespecisset(&it->it_time.it_value)) { /* Convert to absolute time */ /* XXX need to wrap in splclock for timecounters case? */ switch (which) { case ITIMER_REAL: getnanotime(&now); if (!timespecaddok(&it->it_time.it_value, &now)) { error = EINVAL; goto out; } timespecadd(&it->it_time.it_value, &now, &it->it_time.it_value); break; case ITIMER_MONOTONIC: getnanouptime(&now); if (!timespecaddok(&it->it_time.it_value, &now)) { error = EINVAL; goto out; } timespecadd(&it->it_time.it_value, &now, &it->it_time.it_value); break; default: break; } } error = itimer_settime(it); if (error == ERESTART) { KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid)); goto restart; } KASSERT(error == 0); out: itimer_unlock(); if (spare != NULL) kmem_free(spare, sizeof(*spare)); return error; } /* * ptimer_tick: * * Called from hardclock() to decrement per-process virtual timers. */ void ptimer_tick(lwp_t *l, bool user) { struct ptimers *pts; struct itimer *it; proc_t *p; p = l->l_proc; if (p->p_timers == NULL) return; itimer_lock(); if ((pts = l->l_proc->p_timers) != NULL) { /* * Run current process's virtual and profile time, as needed. */ if (user && (it = LIST_FIRST(&pts->pts_virtual)) != NULL) if (itimer_decr(it, tick * 1000)) (*it->it_ops->ito_fire)(it); if ((it = LIST_FIRST(&pts->pts_prof)) != NULL) if (itimer_decr(it, tick * 1000)) (*it->it_ops->ito_fire)(it); } itimer_unlock(); } /* * ptimer_intr: * * Software interrupt handler for processing per-process * timer expiration. */ static void ptimer_intr(void *cookie) { ksiginfo_t ksi; struct itimer *it; struct ptimer *pt; proc_t *p; mutex_enter(&proc_lock); itimer_lock(); while ((pt = TAILQ_FIRST(&ptimer_queue)) != NULL) { it = &pt->pt_itimer; TAILQ_REMOVE(&ptimer_queue, pt, pt_chain); KASSERT(pt->pt_queued); pt->pt_queued = false; p = pt->pt_proc; if (p->p_timers == NULL) { /* Process is dying. */ continue; } if (pt->pt_ev.sigev_notify != SIGEV_SIGNAL) { continue; } if (sigismember(&p->p_sigpend.sp_set, pt->pt_ev.sigev_signo)) { it->it_overruns++; continue; } KSI_INIT(&ksi); ksi.ksi_signo = pt->pt_ev.sigev_signo; ksi.ksi_code = SI_TIMER; ksi.ksi_value = pt->pt_ev.sigev_value; pt->pt_poverruns = it->it_overruns; it->it_overruns = 0; itimer_unlock(); kpsignal(p, &ksi, NULL); itimer_lock(); } itimer_unlock(); mutex_exit(&proc_lock); }
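/*
 * Illustrative userland sketch (not part of the kernel source): the
 * syscalls above -- sys_timer_create(), sys___timer_settime50(),
 * sys_timer_getoverrun() -- back the POSIX timer_create(2) family.
 * A minimal consumer looks like the program below: create a
 * CLOCK_MONOTONIC timer that notifies via SIGEV_SIGNAL/SIGALRM (the
 * same default timer_create1() picks when no sigevent is supplied),
 * then arm it with a relative it_value/it_interval, which
 * dotimer_settime() converts to an absolute deadline.  Build with
 * -lrt where required; this is a sketch, not NetBSD source.
 */
#include <err.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t fired;

static void
on_alarm(int signo)
{
	(void)signo;
	fired++;		/* async-signal-safe: just count expirations */
}

int
main(void)
{
	struct sigaction sa;
	struct sigevent sev;
	struct itimerspec its;
	timer_t tid;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_alarm;
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGALRM, &sa, NULL) == -1)
		err(1, "sigaction");

	/* SIGEV_SIGNAL + SIGALRM mirrors the kernel default for this clock. */
	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGALRM;
	if (timer_create(CLOCK_MONOTONIC, &sev, &tid) == -1)
		err(1, "timer_create");

	/* 200 ms initial expiry, then every 200 ms; relative (no TIMER_ABSTIME). */
	its.it_value.tv_sec = 0;
	its.it_value.tv_nsec = 200 * 1000 * 1000;
	its.it_interval = its.it_value;
	if (timer_settime(tid, 0, &its, NULL) == -1)
		err(1, "timer_settime");

	while (fired < 5)
		pause();

	printf("fired %d times, overruns on last expiry: %d\n",
	    (int)fired, timer_getoverrun(tid));
	timer_delete(tid);
	return 0;
}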
/* $NetBSD: in6_proto.c,v 1.131 2024/02/09 22:08:37 andvar Exp $ */ /* $KAME: in6_proto.c,v 1.66 2000/10/10 15:35:47 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_proto.c 8.1 (Berkeley) 6/10/93 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in6_proto.c,v 1.131 2024/02/09 22:08:37 andvar Exp $"); #ifdef _KERNEL_OPT #include "opt_gateway.h" #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_dccp.h" #include "opt_sctp.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/socket.h> #include <sys/protosw.h> #include <sys/kernel.h> #include <sys/domain.h> #include <sys/mbuf.h> #include <net/if.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip_encap.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/in_pcb.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet/icmp6.h> #include <netinet6/in6_pcb.h> #include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_debug.h> #include <netinet6/udp6.h> #include <netinet6/udp6_var.h> #ifdef DCCP #include <netinet/dccp.h> #include <netinet/dccp_var.h> #include <netinet6/dccp6_var.h> #endif #ifdef SCTP #include <netinet/sctp_pcb.h> #include <netinet/sctp.h> #include <netinet/sctp_var.h> #include <netinet6/sctp6_var.h> #endif #include <netinet6/pim6_var.h> #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/ipsec6.h> #include <netipsec/key.h> #endif #include "carp.h" #if NCARP > 0 #include <netinet/ip_carp.h> #endif #include <netinet6/ip6protosw.h> /* * TCP/IP protocol family: IP6, ICMP6, UDP, TCP. */ DOMAIN_DEFINE(inet6domain); /* forward declare and add to link set */ /* Wrappers to acquire kernel_lock. */ PR_WRAP_CTLINPUT(rip6_ctlinput) PR_WRAP_CTLINPUT(encap6_ctlinput) PR_WRAP_CTLINPUT(udp6_ctlinput) PR_WRAP_CTLINPUT(tcp6_ctlinput) #define rip6_ctlinput rip6_ctlinput_wrapper #define encap6_ctlinput encap6_ctlinput_wrapper #define udp6_ctlinput udp6_ctlinput_wrapper #define tcp6_ctlinput tcp6_ctlinput_wrapper PR_WRAP_CTLOUTPUT(rip6_ctloutput) PR_WRAP_CTLOUTPUT(tcp_ctloutput) PR_WRAP_CTLOUTPUT(udp6_ctloutput) PR_WRAP_CTLOUTPUT(icmp6_ctloutput) #define rip6_ctloutput rip6_ctloutput_wrapper #define tcp_ctloutput tcp_ctloutput_wrapper #define udp6_ctloutput udp6_ctloutput_wrapper #define icmp6_ctloutput icmp6_ctloutput_wrapper #if defined(DCCP) PR_WRAP_CTLINPUT(dccp6_ctlinput) PR_WRAP_CTLOUTPUT(dccp_ctloutput) #define dccp6_ctlinput dccp6_ctlinput_wrapper #define dccp_ctloutput dccp_ctloutput_wrapper #endif #if defined(SCTP) PR_WRAP_CTLINPUT(sctp6_ctlinput) PR_WRAP_CTLOUTPUT(sctp_ctloutput) #define sctp6_ctlinput sctp6_ctlinput_wrapper #define sctp_ctloutput sctp_ctloutput_wrapper #endif #ifdef NET_MPSAFE PR_WRAP_INPUT6(udp6_input) PR_WRAP_INPUT6(tcp6_input) #ifdef DCCP PR_WRAP_INPUT6(dccp6_input) #endif #ifdef SCTP PR_WRAP_INPUT6(sctp6_input) #endif PR_WRAP_INPUT6(rip6_input) PR_WRAP_INPUT6(dest6_input) PR_WRAP_INPUT6(route6_input) PR_WRAP_INPUT6(frag6_input) #if NPFSYNC > 0 PR_WRAP_INPUT6(pfsync_input) #endif PR_WRAP_INPUT6(pim6_input) #define udp6_input udp6_input_wrapper #define tcp6_input tcp6_input_wrapper #define dccp6_input dccp6_input_wrapper #define sctp6_input sctp6_input_wrapper #define rip6_input rip6_input_wrapper #define dest6_input dest6_input_wrapper #define route6_input route6_input_wrapper #define frag6_input frag6_input_wrapper #define pim6_input pim6_input_wrapper #endif #if defined(IPSEC) #ifdef IPSEC_RUMPKERNEL /* * .pr_input = ipsec6_common_input won't be resolved on loading * the ipsec shared library. We need a wrapper anyway. 
*/ static int ipsec6_common_input_wrapper(struct mbuf **mp, int *offp, int proto) { if (ipsec_enabled) { return ipsec6_common_input(mp, offp, proto); } else { m_freem(*mp); return IPPROTO_DONE; } } #define ipsec6_common_input ipsec6_common_input_wrapper /* The ctlinput functions may not be loaded */ #define IPSEC_WRAP_CTLINPUT(name) \ static void * \ name##_wrapper(int a, const struct sockaddr *b, void *c)\ { \ void *rv; \ KERNEL_LOCK(1, NULL); \ if (ipsec_enabled) \ rv = name(a, b, c); \ else \ rv = NULL; \ KERNEL_UNLOCK_ONE(NULL); \ return rv; \ } IPSEC_WRAP_CTLINPUT(ah6_ctlinput) IPSEC_WRAP_CTLINPUT(esp6_ctlinput) #else /* !IPSEC_RUMPKERNEL */ PR_WRAP_CTLINPUT(ah6_ctlinput) PR_WRAP_CTLINPUT(esp6_ctlinput) #endif /* !IPSEC_RUMPKERNEL */ #define ah6_ctlinput ah6_ctlinput_wrapper #define esp6_ctlinput esp6_ctlinput_wrapper #endif /* IPSEC */ static void tcp6_init(void) { icmp6_mtudisc_callback_register(tcp6_mtudisc_callback); tcp_init_common(sizeof(struct ip6_hdr)); } const struct ip6protosw inet6sw[] = { { .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_init = ip6_init, .pr_fasttimo = frag6_fasttimo, .pr_slowtimo = frag6_slowtimo, .pr_drain = frag6_drainstub, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ICMPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp6_input, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = icmp6_ctloutput, .pr_usrreqs = &rip6_usrreqs, .pr_init = icmp6_init, }, { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_PURGEIF, .pr_input = udp6_input, .pr_ctlinput = udp6_ctlinput, .pr_ctloutput = udp6_ctloutput, .pr_usrreqs = &udp6_usrreqs, .pr_init = udp6_init, }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LISTEN|PR_ABRTACPTDIS|PR_PURGEIF, .pr_input = tcp6_input, .pr_ctlinput = tcp6_ctlinput, .pr_ctloutput = tcp_ctloutput, .pr_usrreqs = &tcp_usrreqs, .pr_init = tcp6_init, .pr_fasttimo = tcp_fasttimo, .pr_drain = tcp_drainstub, }, #ifdef DCCP { .pr_type = SOCK_CONN_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_DCCP, .pr_flags = PR_CONNREQUIRED|PR_ATOMIC|PR_LISTEN, .pr_input = dccp6_input, .pr_ctlinput = dccp6_ctlinput, .pr_ctloutput = dccp_ctloutput, .pr_usrreqs = &dccp6_usrreqs, #ifndef INET .pr_init = dccp_init, #endif }, #endif /* DCCP */ #ifdef SCTP { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_ADDR_OPT|PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_usrreqs = &sctp6_usrreqs, .pr_drain = sctp_drain, }, { .pr_type = SOCK_SEQPACKET, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_ADDR_OPT|PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_usrreqs = &sctp6_usrreqs, .pr_drain = sctp_drain, }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_CONNREQUIRED|PR_ADDR_OPT|PR_WANTRCVD|PR_LISTEN, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_usrreqs = &sctp6_usrreqs, .pr_drain = sctp_drain, }, #endif /* SCTP */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR|PR_PURGEIF, .pr_input = rip6_input, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs, }, #ifdef GATEWAY { .pr_domain = &inet6domain, .pr_protocol = 
IPPROTO_IPV6, .pr_slowtimo = ip6flow_slowtimo, .pr_init = ip6flow_poolinit, }, #endif /* GATEWAY */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_DSTOPTS, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = dest6_input, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ROUTING, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = route6_input, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_FRAGMENT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = frag6_input, }, #ifdef IPSEC { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_AH, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_ctlinput = ah6_ctlinput, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ESP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_ctlinput = esp6_ctlinput, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPCOMP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, }, #endif /* IPSEC */ #ifdef INET { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_ctlinput = encap6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs, .pr_init = encap_init, }, #endif { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_ctlinput = encap6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs, .pr_init = encap_init, }, #if NCARP > 0 { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = carp6_proto_input, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs, }, #endif /* NCARP */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_L2TP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs, .pr_init = encap_init, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim6_input, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs, .pr_init = pim6_init, }, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = rip6_input, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs, .pr_init = rip6_init, }, }; static const struct sockaddr_in6 in6_any = { .sin6_len = sizeof(in6_any) , .sin6_family = AF_INET6 , .sin6_port = 0 , .sin6_flowinfo = 0 , .sin6_addr = IN6ADDR_ANY_INIT , .sin6_scope_id = 0 }; bool in6_present = false; static void in6_dom_init(void) { in6_present = true; } struct domain inet6domain = { .dom_family = AF_INET6, .dom_name = "internet6", .dom_init = in6_dom_init, .dom_externalize = NULL, .dom_dispose = NULL, .dom_protosw = (const struct protosw *)inet6sw, .dom_protoswNPROTOSW = (const struct protosw *)&inet6sw[sizeof(inet6sw)/sizeof(inet6sw[0])], .dom_rtattach = rt_inithead, .dom_rtoffset = offsetof(struct sockaddr_in6, sin6_addr) << 3, .dom_maxrtkey = sizeof(struct ip_pack6), .dom_if_up = in6_if_up, .dom_if_down = in6_if_down, .dom_ifattach = in6_domifattach, .dom_ifdetach = in6_domifdetach, .dom_if_link_state_change = in6_if_link_state_change, .dom_link = { NULL }, .dom_mowner = MOWNER_INIT("",""), .dom_sa_cmpofs = offsetof(struct sockaddr_in6, 
sin6_addr), .dom_sa_cmplen = sizeof(struct in6_addr), .dom_sa_any = (const struct sockaddr *)&in6_any, .dom_sockaddr_externalize = sockaddr_in6_externalize, }; #if 0 int sockaddr_in6_cmp(const struct sockaddr *lsa, const struct sockaddr *rsa) { uint_fast8_t len; const uint_fast8_t addrofs = offsetof(struct sockaddr_in6, sin6_addr), addrend = addrofs + sizeof(struct in6_addr); int rc; const struct sockaddr_in6 *lsin6, *rsin6; lsin6 = satocsin6(lsa); rsin6 = satocsin6(rsa); len = MIN(addrend, MIN(lsin6->sin6_len, rsin6->sin6_len)); if (len > addrofs && (rc = memcmp(&lsin6->sin6_addr, &rsin6->sin6_addr, len - addrofs)) != 0) return rc; return lsin6->sin6_len - rsin6->sin6_len; } #endif /* * Internet configuration info */ #ifdef GATEWAY6 #define IPV6FORWARDING 1 /* forward IP6 packets not for us */ #else #define IPV6FORWARDING 0 /* don't forward IP6 packets not for us */ #endif int ip6_forwarding = IPV6FORWARDING; /* act as router? */ int ip6_sendredirects = 1; int ip6_defhlim = IPV6_DEFHLIM; int ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS; int ip6_maxfragpackets = 200; int ip6_maxfrags = 200; int ip6_log_interval = 5; int ip6_hdrnestlimit = 15; int ip6_dad_count = 1; /* DupAddrDetectionTransmits */ int ip6_auto_flowlabel = 1; int ip6_use_deprecated = 1; /* allow deprecated addr (RFC2462 5.5.4) */ int ip6_mcast_pmtu = 0; /* enable pMTU discovery for multicast? */ int ip6_v6only = 1; int ip6_neighborgcthresh = 2048; /* Threshold # of NDP entries for GC */ int ip6_maxdynroutes = 4096; /* Max # of routes created via redirect */ int ip6_param_rt_msg = 1; /* How to send parameter changing rtm */ int ip6_keepfaith = 0; time_t ip6_log_time = 0; /* icmp6 */ int pmtu_expire = 60*10; /* raw IP6 parameters */ /* * Nominal space allocated to a raw ip socket. */ #define RIPV6SNDQ 8192 #define RIPV6RCVQ 16384 u_long rip6_sendspace = RIPV6SNDQ; u_long rip6_recvspace = RIPV6RCVQ; /* ICMPV6 parameters */ int icmp6_rediraccept = 1; /* accept and process redirects */ int icmp6_redirtimeout = 10 * 60; /* 10 minutes */ int icmp6errppslim = 100; /* 100pps */ int icmp6_nodeinfo = 1; /* enable/disable NI response */
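/*
 * Illustrative sketch (not part of the kernel source): inet6sw[] above is
 * consulted as a protocol dispatch table -- specific entries first, with
 * the final "raw wildcard" entry catching any IPv6 protocol that has no
 * dedicated handler.  The self-contained toy below shows that lookup idea
 * only; the struct and function names are invented for illustration, and
 * the kernel's real lookup lives in the generic protosw machinery
 * (e.g. pffindproto()).
 */
#include <stddef.h>
#include <stdio.h>

struct toy_protosw {
	int tp_type;		/* SOCK_*-style socket type */
	int tp_protocol;	/* IPPROTO_*-style number, 0 = wildcard */
	const char *tp_name;
};

/* Tiny stand-in for inet6sw[]: specific entries first, raw wildcard last. */
static const struct toy_protosw toysw[] = {
	{ 2 /* dgram */,  17 /* UDP */,      "udp6"  },
	{ 1 /* stream */,  6 /* TCP */,      "tcp6"  },
	{ 3 /* raw */,    58 /* ICMPV6 */,   "icmp6" },
	{ 3 /* raw */,     0 /* wildcard */, "rip6"  },
};

/* Return the first entry matching the protocol, else the raw wildcard. */
static const struct toy_protosw *
toy_findproto(int protocol)
{
	const struct toy_protosw *wild = NULL;
	size_t i;

	for (i = 0; i < sizeof(toysw) / sizeof(toysw[0]); i++) {
		if (toysw[i].tp_protocol == protocol)
			return &toysw[i];
		if (toysw[i].tp_protocol == 0)
			wild = &toysw[i];
	}
	return wild;
}

int
main(void)
{
	printf("proto 6  -> %s\n", toy_findproto(6)->tp_name);	/* tcp6 */
	printf("proto 89 -> %s\n", toy_findproto(89)->tp_name);	/* rip6 (wildcard) */
	return 0;
}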
/* $NetBSD: ipsec_netbsd.c,v 1.55 2022/09/02 23:48:11 thorpej Exp $ */ /* $KAME: esp_input.c,v 1.60 2001/09/04 08:43:19 itojun Exp $ */ /* $KAME: ah_input.c,v 1.64 2001/09/04 08:43:19 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ipsec_netbsd.c,v 1.55 2022/09/02 23:48:11 thorpej Exp $"); #if defined(_KERNEL_OPT) #include "opt_inet.h" #include "opt_ipsec.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/errno.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/sysctl.h> #include <net/if.h> #include <net/route.h> #include <sys/cpu.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/ip_ecn.h> #include <netinet/ip_icmp.h> #include <netipsec/ipsec.h> #include <netipsec/ipsec_var.h> #include <netipsec/ipsec_private.h> #include <netipsec/key.h> #include <netipsec/keydb.h> #include <netipsec/key_debug.h> #include <netipsec/ah.h> #include <netipsec/ah_var.h> #include <netipsec/esp.h> #include <netipsec/esp_var.h> #include <netipsec/ipip_var.h> #include <netipsec/ipcomp_var.h> #ifdef INET6 #include <netipsec/ipsec6.h> #include <netinet6/ip6protosw.h> #include <netinet/icmp6.h> #endif #include <netipsec/key.h> /* assumes that ip header and ah header are contiguous on mbuf */ void * ah4_ctlinput(int cmd, const struct sockaddr *sa, void *v) { struct ip *ip = v; struct ah *ah; struct icmp *icp; struct secasvar *sav; if (sa->sa_family != AF_INET || sa->sa_len != sizeof(struct sockaddr_in)) return NULL; if ((unsigned)cmd >= PRC_NCMDS) return NULL; if (cmd == PRC_MSGSIZE && ip_mtudisc && ip && ip->ip_v == 4) { /* * Check to see if we have a valid SA corresponding to * the address in the ICMP message payload. */ ah = (struct ah *)((char *)ip + (ip->ip_hl << 2)); sav = KEY_LOOKUP_SA((const union sockaddr_union *)sa, IPPROTO_AH, ah->ah_spi, 0, 0); if (sav) { if (SADB_SASTATE_USABLE_P(sav)) { /* * Now that we've validated that we are actually * communicating with the host indicated in the * ICMP message, locate the ICMP header, * recalculate the new MTU, and create the * corresponding routing entry. 
*/ icp = (struct icmp *)((char *)ip - offsetof(struct icmp, icmp_ip)); icmp_mtudisc(icp, ip->ip_dst); } KEY_SA_UNREF(&sav); } } return NULL; } /* assumes that ip header and esp header are contiguous on mbuf */ void * esp4_ctlinput(int cmd, const struct sockaddr *sa, void *v) { struct ip *ip = v; struct esp *esp; struct icmp *icp; struct secasvar *sav; if (sa->sa_family != AF_INET || sa->sa_len != sizeof(struct sockaddr_in)) return NULL; if ((unsigned)cmd >= PRC_NCMDS) return NULL; if (cmd == PRC_MSGSIZE && ip_mtudisc && ip && ip->ip_v == 4) { /* * Check to see if we have a valid SA corresponding to * the address in the ICMP message payload. */ esp = (struct esp *)((char *)ip + (ip->ip_hl << 2)); sav = KEY_LOOKUP_SA((const union sockaddr_union *)sa, IPPROTO_ESP, esp->esp_spi, 0, 0); if (sav) { if (SADB_SASTATE_USABLE_P(sav)) { /* * Now that we've validated that we are actually * communicating with the host indicated in the * ICMP message, locate the ICMP header, * recalculate the new MTU, and create the * corresponding routing entry. */ icp = (struct icmp *)((char *)ip - offsetof(struct icmp, icmp_ip)); icmp_mtudisc(icp, ip->ip_dst); } KEY_SA_UNREF(&sav); } } return NULL; } #ifdef INET6 void * ah6_ctlinput(int cmd, const struct sockaddr *sa, void *d) { const struct newah *ahp; struct newah ah; struct secasvar *sav; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; int off; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return NULL; if ((unsigned)cmd >= PRC_NCMDS) return NULL; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; } else { m = NULL; ip6 = NULL; off = 0; } if (ip6) { /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(ah)) return NULL; if (m->m_len < off + sizeof(ah)) { /* * this should be rare case, * so we compromise on this copy... */ m_copydata(m, off, sizeof(ah), &ah); ahp = &ah; } else ahp = (struct newah *)(mtod(m, char *) + off); if (cmd == PRC_MSGSIZE) { int valid = 0; /* * Check to see if we have a valid SA corresponding * to the address in the ICMP message payload. */ sav = KEY_LOOKUP_SA((const union sockaddr_union *)sa, IPPROTO_AH, ahp->ah_spi, 0, 0); if (sav) { if (SADB_SASTATE_USABLE_P(sav)) valid++; KEY_SA_UNREF(&sav); } /* XXX Further validation? */ /* * Depending on the value of "valid" and routing * table size (mtudisc_{hi,lo}wat), we will: * - recalculate the new MTU and create the * corresponding routing entry, or * - ignore the MTU change notification. */ icmp6_mtudisc_update((struct ip6ctlparam *)d, valid); } /* we normally notify single pcb here */ } else { /* we normally notify any pcb here */ } return NULL; } void * esp6_ctlinput(int cmd, const struct sockaddr *sa, void *d) { const struct newesp *espp; struct newesp esp; struct ip6ctlparam *ip6cp = NULL, ip6cp1; struct secasvar *sav; struct ip6_hdr *ip6; struct mbuf *m; int off; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return NULL; if ((unsigned)cmd >= PRC_NCMDS) return NULL; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; } else { m = NULL; ip6 = NULL; off = 0; } if (ip6) { /* * Notify the error to all possible sockets via pfctlinput2. 
* Since the upper layer information (such as protocol type, * source and destination ports) is embedded in the encrypted * data and might have been cut, we can't directly call * an upper layer ctlinput function. However, the pcbnotify * function will consider source and destination addresses * as well as the flow info value, and may be able to find * some PCB that should be notified. * Although pfctlinput2 will call esp6_ctlinput(), there is * no possibility of an infinite loop of function calls, * because we don't pass the inner IPv6 header. */ memset(&ip6cp1, 0, sizeof(ip6cp1)); ip6cp1.ip6c_src = ip6cp->ip6c_src; pfctlinput2(cmd, sa, &ip6cp1); /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(esp)) return NULL; if (m->m_len < off + sizeof(esp)) { /* * this should be rare case, * so we compromise on this copy... */ m_copydata(m, off, sizeof(esp), &esp); espp = &esp; } else espp = (struct newesp *)(mtod(m, char *) + off); if (cmd == PRC_MSGSIZE) { int valid = 0; /* * Check to see if we have a valid SA corresponding to * the address in the ICMP message payload. */ sav = KEY_LOOKUP_SA((const union sockaddr_union *)sa, IPPROTO_ESP, espp->esp_spi, 0, 0); if (sav) { if (SADB_SASTATE_USABLE_P(sav)) valid++; KEY_SA_UNREF(&sav); } /* XXX Further validation? */ /* * Depending on the value of "valid" and routing table * size (mtudisc_{hi,lo}wat), we will: * - recalculate the new MTU and create the * corresponding routing entry, or * - ignore the MTU change notification. */ icmp6_mtudisc_update((struct ip6ctlparam *)d, valid); } } else { /* we normally notify any pcb here */ } return NULL; } #endif /* INET6 */ static int sysctl_ipsec(SYSCTLFN_ARGS) { int error, t; struct sysctlnode node; node = *rnode; t = *(int *)rnode->sysctl_data; node.sysctl_data = &t; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; switch (rnode->sysctl_num) { case IPSECCTL_DEF_ESP_TRANSLEV: case IPSECCTL_DEF_ESP_NETLEV: case IPSECCTL_DEF_AH_TRANSLEV: case IPSECCTL_DEF_AH_NETLEV: if (t != IPSEC_LEVEL_USE && t != IPSEC_LEVEL_REQUIRE) return EINVAL; ipsec_invalpcbcacheall(); break; case IPSECCTL_DEF_POLICY: if (t != IPSEC_POLICY_DISCARD && t != IPSEC_POLICY_NONE) return EINVAL; ipsec_invalpcbcacheall(); break; default: return EINVAL; } *(int *)rnode->sysctl_data = t; return 0; } #ifdef IPSEC_DEBUG static int sysctl_ipsec_test(SYSCTLFN_ARGS) { int t, error; struct sysctlnode node; node = *rnode; t = *(int *)rnode->sysctl_data; node.sysctl_data = &t; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; if (t < 0 || t > 1) return EINVAL; if (rnode->sysctl_data == &ipsec_replay) printf("ipsec: Anti-Replay service %s\n", (t == 1) ? "deactivated" : "activated"); else if (rnode->sysctl_data == &ipsec_integrity) printf("ipsec: HMAC corruption %s\n", (t == 0) ?
"deactivated" : "activated"); *(int *)rnode->sysctl_data = t; return 0; } #endif static int sysctl_net_inet_ipsec_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(ipsecstat_percpu, IPSEC_NSTATS)); } static int sysctl_net_inet_ah_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(ahstat_percpu, AH_NSTATS)); } static int sysctl_net_inet_esp_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(espstat_percpu, ESP_NSTATS)); } static int sysctl_net_inet_ipcomp_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(ipcompstat_percpu, IPCOMP_NSTATS)); } static int sysctl_net_inet_ipip_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(ipipstat_percpu, IPIP_NSTATS)); } static int sysctl_net_ipsec_enabled(SYSCTLFN_ARGS) { int newenabled, error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newenabled; newenabled = ipsec_enabled; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; switch (newenabled) { case 0: if (key_get_used()) return EBUSY; /*FALLTHROUGH*/ case 1: case 2: ipsec_enabled = newenabled; key_update_used(); return 0; default: return EINVAL; } } /* XXX will need a different oid at parent */ void sysctl_net_inet_ipsec_setup(struct sysctllog **clog) { const struct sysctlnode *_ipsec; int ipproto_ipsec; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL); /* * in numerical order: * * net.inet.ipip: CTL_NET.PF_INET.IPPROTO_IPIP * net.inet.esp: CTL_NET.PF_INET.IPPROTO_ESP * net.inet.ah: CTL_NET.PF_INET.IPPROTO_AH * net.inet.ipcomp: CTL_NET.PF_INET.IPPROTO_IPCOMP * net.inet.ipsec: CTL_NET.PF_INET.CTL_CREATE * * this creates separate trees by name, but maintains that the * ipsec name leads to all the old leaves. */ /* create net.inet.ipip */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ipip", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IPIP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_STRUCT, "ipip_stats", NULL, sysctl_net_inet_ipip_stats, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IPIP, CTL_CREATE, CTL_EOL); /* create net.inet.esp subtree under IPPROTO_ESP */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "esp", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_ESP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_STRUCT, "esp_stats", NULL, sysctl_net_inet_esp_stats, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_ESP, CTL_CREATE, CTL_EOL); /* create net.inet.ah subtree under IPPROTO_AH */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ah", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_AH, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_STRUCT, "ah_stats", NULL, sysctl_net_inet_ah_stats, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_AH, CTL_CREATE, CTL_EOL); /* create net.inet.ipcomp */ sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ipcomp", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IPCOMP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_STRUCT, "ipcomp_stats", NULL, sysctl_net_inet_ipcomp_stats, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IPCOMP, CTL_CREATE, CTL_EOL); /* create net.inet.ipsec subtree under dynamic oid */ sysctl_createv(clog, 0, NULL, &_ipsec, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ipsec", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_CREATE, CTL_EOL); ipproto_ipsec = (_ipsec != NULL) ? 
_ipsec->sysctl_num : 0; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "def_policy", NULL, sysctl_ipsec, 0, &ip4_def_policy.policy, 0, CTL_NET, PF_INET, ipproto_ipsec, IPSECCTL_DEF_POLICY, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "esp_trans_deflev", NULL, sysctl_ipsec, 0, &ip4_esp_trans_deflev, 0, CTL_NET, PF_INET, ipproto_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "esp_net_deflev", NULL, sysctl_ipsec, 0, &ip4_esp_net_deflev, 0, CTL_NET, PF_INET, ipproto_ipsec, IPSECCTL_DEF_ESP_NETLEV, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ah_trans_deflev", NULL, sysctl_ipsec, 0, &ip4_ah_trans_deflev, 0, CTL_NET, PF_INET, ipproto_ipsec, IPSECCTL_DEF_AH_TRANSLEV, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ah_net_deflev", NULL, sysctl_ipsec, 0, &ip4_ah_net_deflev, 0, CTL_NET, PF_INET, ipproto_ipsec, IPSECCTL_DEF_AH_NETLEV, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ah_cleartos", NULL, NULL, 0, &ip4_ah_cleartos, 0, CTL_NET, PF_INET, ipproto_ipsec, IPSECCTL_AH_CLEARTOS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ah_offsetmask", NULL, NULL, 0, &ip4_ah_offsetmask, 0, CTL_NET, PF_INET, ipproto_ipsec, IPSECCTL_AH_OFFSETMASK, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "dfbit", NULL, NULL, 0, &ip4_ipsec_dfbit, 0, CTL_NET, PF_INET, ipproto_ipsec, IPSECCTL_DFBIT, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ecn", NULL, NULL, 0, &ip4_ipsec_ecn, 0, CTL_NET, PF_INET, ipproto_ipsec, IPSECCTL_ECN, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "debug", NULL, NULL, 0, &ipsec_debug, 0, CTL_NET, PF_INET, ipproto_ipsec, IPSECCTL_DEBUG, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ipip_spoofcheck", NULL, NULL, 0, &ipip_spoofcheck, 0, CTL_NET, PF_INET, ipproto_ipsec, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_STRUCT, "ipsecstats", NULL, sysctl_net_inet_ipsec_stats, 0, NULL, 0, CTL_NET, PF_INET, ipproto_ipsec, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "enabled", SYSCTL_DESCR("Enable IPSec processing"), sysctl_net_ipsec_enabled, 0, NULL, 0, CTL_NET, PF_INET, ipproto_ipsec, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, "used", SYSCTL_DESCR("Is IPSec active?"), NULL, 0, &ipsec_used, 0, CTL_NET, PF_INET, ipproto_ipsec, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ah_enable", NULL, NULL, 0, &ah_enable, 0, CTL_NET, PF_INET, ipproto_ipsec, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "esp_enable", NULL, NULL, 0, &esp_enable, 0, CTL_NET, PF_INET, ipproto_ipsec, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ipcomp_enable", NULL, NULL, 0, &ipcomp_enable, 0, CTL_NET, PF_INET, ipproto_ipsec, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, 
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "crypto_support", NULL, NULL, 0, &crypto_support, 0, CTL_NET, PF_INET, ipproto_ipsec, CTL_CREATE, CTL_EOL); #ifdef IPSEC_DEBUG sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "test_replay", SYSCTL_DESCR("Emulate replay attack"), sysctl_ipsec_test, 0, &ipsec_replay, 0, CTL_NET, PF_INET, ipproto_ipsec, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "test_integrity", SYSCTL_DESCR("Emulate man-in-the-middle attack"), sysctl_ipsec_test, 0, &ipsec_integrity, 0, CTL_NET, PF_INET, ipproto_ipsec, CTL_CREATE, CTL_EOL); #endif } #ifdef INET6 void sysctl_net_inet6_ipsec6_setup(struct sysctllog **clog) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet6", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET6, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ipsec6", SYSCTL_DESCR("IPv6 related IPSec settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_AH, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("IPSec statistics and counters"), sysctl_net_inet_ipsec_stats, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_AH, IPSECCTL_STATS, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "def_policy", SYSCTL_DESCR("Default action for non-IPSec packets"), sysctl_ipsec, 0, &ip6_def_policy.policy, 0, CTL_NET, PF_INET6, IPPROTO_AH, IPSECCTL_DEF_POLICY, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "esp_trans_deflev", SYSCTL_DESCR("Default required security level for " "transport mode traffic"), sysctl_ipsec, 0, &ip6_esp_trans_deflev, 0, CTL_NET, PF_INET6, IPPROTO_AH, IPSECCTL_DEF_ESP_TRANSLEV, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "esp_net_deflev", SYSCTL_DESCR("Default required security level for " "tunneled traffic"), sysctl_ipsec, 0, &ip6_esp_net_deflev, 0, CTL_NET, PF_INET6, IPPROTO_AH, IPSECCTL_DEF_ESP_NETLEV, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ah_trans_deflev", SYSCTL_DESCR("Default required security level for " "transport mode headers"), sysctl_ipsec, 0, &ip6_ah_trans_deflev, 0, CTL_NET, PF_INET6, IPPROTO_AH, IPSECCTL_DEF_AH_TRANSLEV, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ah_net_deflev", SYSCTL_DESCR("Default required security level for " "tunneled headers"), sysctl_ipsec, 0, &ip6_ah_net_deflev, 0, CTL_NET, PF_INET6, IPPROTO_AH, IPSECCTL_DEF_AH_NETLEV, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "ecn", SYSCTL_DESCR("Behavior of ECN for tunneled traffic"), NULL, 0, &ip6_ipsec_ecn, 0, CTL_NET, PF_INET6, IPPROTO_AH, IPSECCTL_ECN, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "debug", SYSCTL_DESCR("Enable IPSec debugging output"), NULL, 0, &ipsec_debug, 0, CTL_NET, PF_INET6, IPPROTO_AH, IPSECCTL_DEBUG, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "enabled", SYSCTL_DESCR("Enable IPSec processing"), sysctl_net_ipsec_enabled, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_AH, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, "used", SYSCTL_DESCR("Is IPSec active?"), NULL, 0, &ipsec_used, 0, 
CTL_NET, PF_INET6, IPPROTO_AH, CTL_CREATE, CTL_EOL); } #endif /* INET6 */
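/*
 * Illustrative sketch, not part of the original ipsec source above: the
 * "enabled" and "used" nodes registered by sysctl_net_inet_ipsec_setup()
 * can be examined from userland with sysctlbyname(3).  The MIB string
 * "net.inet.ipsec.enabled" is an assumption (the node is created under a
 * dynamically assigned OID); the kernel handler sysctl_net_ipsec_enabled()
 * returns EBUSY when SAs are still in use and EINVAL for values other
 * than 0, 1 or 2.
 */
#if 0	/* userland example only */
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	int enabled, used, newval = 0;
	size_t len = sizeof(enabled);

	if (sysctlbyname("net.inet.ipsec.enabled", &enabled, &len, NULL, 0) == -1)
		err(1, "read net.inet.ipsec.enabled");
	len = sizeof(used);
	if (sysctlbyname("net.inet.ipsec.used", &used, &len, NULL, 0) == -1)
		err(1, "read net.inet.ipsec.used");
	printf("ipsec enabled=%d used=%d\n", enabled, used);

	/* Trying to disable IPsec fails with EBUSY while SAs are in use. */
	if (sysctlbyname("net.inet.ipsec.enabled", NULL, NULL, &newval,
	    sizeof(newval)) == -1)
		warn("write net.inet.ipsec.enabled");
	return 0;
}
#endif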
/* $NetBSD: subr_kmem.c,v 1.89 2023/09/10 14:29:13 ad Exp $ */ /* * Copyright (c) 2009-2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran and Maxime Villard. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c)2006 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Allocator of kernel wired memory. This allocator has some debug features * enabled with "option DIAGNOSTIC" and "option DEBUG". */ /* * KMEM_SIZE: detect alloc/free size mismatch bugs. * Append to each allocation a fixed-sized footer and record the exact * user-requested allocation size in it. When freeing, compare it with * kmem_free's "size" argument. * * This option is enabled on DIAGNOSTIC. 
* * |CHUNK|CHUNK|CHUNK|CHUNK|CHUNK|CHUNK|CHUNK|CHUNK|CHUNK| | * +-----+-----+-----+-----+-----+-----+-----+-----+-----+-+ * | | | | | | | | |/////|U| * | | | | | | | | |/HSZ/|U| * | | | | | | | | |/////|U| * +-----+-----+-----+-----+-----+-----+-----+-----+-----+-+ * | Buffer usable by the caller (requested size) |Size |Unused */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_kmem.c,v 1.89 2023/09/10 14:29:13 ad Exp $"); #ifdef _KERNEL_OPT #include "opt_kmem.h" #endif #include <sys/param.h> #include <sys/callback.h> #include <sys/kmem.h> #include <sys/pool.h> #include <sys/debug.h> #include <sys/lockdebug.h> #include <sys/cpu.h> #include <sys/asan.h> #include <sys/msan.h> #include <sys/sdt.h> #include <uvm/uvm_extern.h> #include <uvm/uvm_map.h> #include <lib/libkern/libkern.h> struct kmem_cache_info { size_t kc_size; const char * kc_name; #ifdef KDTRACE_HOOKS const id_t *kc_alloc_probe_id; const id_t *kc_free_probe_id; #endif }; #define KMEM_CACHE_SIZES(F) \ F(8, kmem-00008, kmem__00008) \ F(16, kmem-00016, kmem__00016) \ F(24, kmem-00024, kmem__00024) \ F(32, kmem-00032, kmem__00032) \ F(40, kmem-00040, kmem__00040) \ F(48, kmem-00048, kmem__00048) \ F(56, kmem-00056, kmem__00056) \ F(64, kmem-00064, kmem__00064) \ F(80, kmem-00080, kmem__00080) \ F(96, kmem-00096, kmem__00096) \ F(112, kmem-00112, kmem__00112) \ F(128, kmem-00128, kmem__00128) \ F(160, kmem-00160, kmem__00160) \ F(192, kmem-00192, kmem__00192) \ F(224, kmem-00224, kmem__00224) \ F(256, kmem-00256, kmem__00256) \ F(320, kmem-00320, kmem__00320) \ F(384, kmem-00384, kmem__00384) \ F(448, kmem-00448, kmem__00448) \ F(512, kmem-00512, kmem__00512) \ F(768, kmem-00768, kmem__00768) \ F(1024, kmem-01024, kmem__01024) \ /* end of KMEM_CACHE_SIZES */ #define KMEM_CACHE_BIG_SIZES(F) \ F(2048, kmem-02048, kmem__02048) \ F(4096, kmem-04096, kmem__04096) \ F(8192, kmem-08192, kmem__08192) \ F(16384, kmem-16384, kmem__16384) \ /* end of KMEM_CACHE_BIG_SIZES */ /* sdt:kmem:alloc:kmem-* probes */ #define F(SZ, NAME, PROBENAME) \ SDT_PROBE_DEFINE4(sdt, kmem, alloc, PROBENAME, \ "void *"/*ptr*/, \ "size_t"/*requested_size*/, \ "size_t"/*allocated_size*/, \ "km_flag_t"/*kmflags*/); KMEM_CACHE_SIZES(F); KMEM_CACHE_BIG_SIZES(F); #undef F /* sdt:kmem:free:kmem-* probes */ #define F(SZ, NAME, PROBENAME) \ SDT_PROBE_DEFINE3(sdt, kmem, free, PROBENAME, \ "void *"/*ptr*/, \ "size_t"/*requested_size*/, \ "size_t"/*allocated_size*/); KMEM_CACHE_SIZES(F); KMEM_CACHE_BIG_SIZES(F); #undef F /* sdt:kmem:alloc:large, sdt:kmem:free:large probes */ SDT_PROBE_DEFINE4(sdt, kmem, alloc, large, "void *"/*ptr*/, "size_t"/*requested_size*/, "size_t"/*allocated_size*/, "km_flag_t"/*kmflags*/); SDT_PROBE_DEFINE3(sdt, kmem, free, large, "void *"/*ptr*/, "size_t"/*requested_size*/, "size_t"/*allocated_size*/); #ifdef KDTRACE_HOOKS #define F(SZ, NAME, PROBENAME) \ { SZ, #NAME, \ &sdt_sdt_kmem_alloc_##PROBENAME->id, \ &sdt_sdt_kmem_free_##PROBENAME->id }, #else #define F(SZ, NAME, PROBENAME) { SZ, #NAME }, #endif static const struct kmem_cache_info kmem_cache_sizes[] = { KMEM_CACHE_SIZES(F) { 0 } }; static const struct kmem_cache_info kmem_cache_big_sizes[] = { KMEM_CACHE_BIG_SIZES(F) { 0 } }; #undef F /* * KMEM_ALIGN is the smallest guaranteed alignment and also the * smallest allocateable quantum. * Every cache size >= CACHE_LINE_SIZE gets CACHE_LINE_SIZE alignment. 
*/ #define KMEM_ALIGN 8 #define KMEM_SHIFT 3 #define KMEM_MAXSIZE 1024 #define KMEM_CACHE_COUNT (KMEM_MAXSIZE >> KMEM_SHIFT) static pool_cache_t kmem_cache[KMEM_CACHE_COUNT] __cacheline_aligned; static size_t kmem_cache_maxidx __read_mostly; #define KMEM_BIG_ALIGN 2048 #define KMEM_BIG_SHIFT 11 #define KMEM_BIG_MAXSIZE 16384 #define KMEM_CACHE_BIG_COUNT (KMEM_BIG_MAXSIZE >> KMEM_BIG_SHIFT) static pool_cache_t kmem_cache_big[KMEM_CACHE_BIG_COUNT] __cacheline_aligned; static size_t kmem_cache_big_maxidx __read_mostly; #if defined(DIAGNOSTIC) && defined(_HARDKERNEL) #define KMEM_SIZE #endif #if defined(DEBUG) && defined(_HARDKERNEL) static void *kmem_freecheck; #endif #if defined(KMEM_SIZE) #define SIZE_SIZE sizeof(size_t) static void kmem_size_set(void *, size_t); static void kmem_size_check(void *, size_t); #else #define SIZE_SIZE 0 #define kmem_size_set(p, sz) /* nothing */ #define kmem_size_check(p, sz) /* nothing */ #endif #ifndef KDTRACE_HOOKS static const id_t **const kmem_cache_alloc_probe_id = NULL; static const id_t **const kmem_cache_big_alloc_probe_id = NULL; static const id_t **const kmem_cache_free_probe_id = NULL; static const id_t **const kmem_cache_big_free_probe_id = NULL; #define KMEM_CACHE_PROBE(ARRAY, INDEX, PTR, REQSIZE, ALLOCSIZE, FLAGS) \ __nothing #else static const id_t *kmem_cache_alloc_probe_id[KMEM_CACHE_COUNT]; static const id_t *kmem_cache_big_alloc_probe_id[KMEM_CACHE_COUNT]; static const id_t *kmem_cache_free_probe_id[KMEM_CACHE_COUNT]; static const id_t *kmem_cache_big_free_probe_id[KMEM_CACHE_COUNT]; #define KMEM_CACHE_PROBE(ARRAY, INDEX, PTR, REQSIZE, ALLOCSIZE, FLAGS) do \ { \ id_t id; \ \ KDASSERT((INDEX) < __arraycount(ARRAY)); \ if (__predict_false((id = *(ARRAY)[INDEX]) != 0)) { \ (*sdt_probe_func)(id, \ (uintptr_t)(PTR), \ (uintptr_t)(REQSIZE), \ (uintptr_t)(ALLOCSIZE), \ (uintptr_t)(FLAGS), \ (uintptr_t)0); \ } \ } while (0) #endif /* KDTRACE_HOOKS */ #define KMEM_CACHE_ALLOC_PROBE(I, P, RS, AS, F) \ KMEM_CACHE_PROBE(kmem_cache_alloc_probe_id, I, P, RS, AS, F) #define KMEM_CACHE_BIG_ALLOC_PROBE(I, P, RS, AS, F) \ KMEM_CACHE_PROBE(kmem_cache_big_alloc_probe_id, I, P, RS, AS, F) #define KMEM_CACHE_FREE_PROBE(I, P, RS, AS) \ KMEM_CACHE_PROBE(kmem_cache_free_probe_id, I, P, RS, AS, 0) #define KMEM_CACHE_BIG_FREE_PROBE(I, P, RS, AS) \ KMEM_CACHE_PROBE(kmem_cache_big_free_probe_id, I, P, RS, AS, 0) CTASSERT(KM_SLEEP == PR_WAITOK); CTASSERT(KM_NOSLEEP == PR_NOWAIT); /* * kmem_intr_alloc: allocate wired memory. */ void * kmem_intr_alloc(size_t requested_size, km_flag_t kmflags) { #ifdef KASAN const size_t origsize = requested_size; #endif size_t allocsz, index; size_t size; pool_cache_t pc; uint8_t *p; KASSERT(requested_size > 0); KASSERT((kmflags & KM_SLEEP) || (kmflags & KM_NOSLEEP)); KASSERT(!(kmflags & KM_SLEEP) || !(kmflags & KM_NOSLEEP)); kasan_add_redzone(&requested_size); size = kmem_roundup_size(requested_size); allocsz = size + SIZE_SIZE; if ((index = ((allocsz - 1) >> KMEM_SHIFT)) < kmem_cache_maxidx) { pc = kmem_cache[index]; p = pool_cache_get(pc, kmflags); KMEM_CACHE_ALLOC_PROBE(index, p, requested_size, allocsz, kmflags); } else if ((index = ((allocsz - 1) >> KMEM_BIG_SHIFT)) < kmem_cache_big_maxidx) { pc = kmem_cache_big[index]; p = pool_cache_get(pc, kmflags); KMEM_CACHE_BIG_ALLOC_PROBE(index, p, requested_size, allocsz, kmflags); } else { int ret = uvm_km_kmem_alloc(kmem_va_arena, (vsize_t)round_page(size), ((kmflags & KM_SLEEP) ? VM_SLEEP : VM_NOSLEEP) | VM_INSTANTFIT, (vmem_addr_t *)&p); SDT_PROBE4(sdt, kmem, alloc, large, ret ? 
NULL : p, requested_size, round_page(size), kmflags); if (ret) { return NULL; } FREECHECK_OUT(&kmem_freecheck, p); KASSERT(size < coherency_unit || ALIGNED_POINTER(p, coherency_unit)); return p; } if (__predict_true(p != NULL)) { FREECHECK_OUT(&kmem_freecheck, p); kmem_size_set(p, requested_size); kasan_mark(p, origsize, size, KASAN_KMEM_REDZONE); return p; } KASSERT(size < coherency_unit || ALIGNED_POINTER(p, coherency_unit)); return p; } /* * kmem_intr_zalloc: allocate zeroed wired memory. */ void * kmem_intr_zalloc(size_t size, km_flag_t kmflags) { void *p; p = kmem_intr_alloc(size, kmflags); if (__predict_true(p != NULL)) { memset(p, 0, size); } return p; } /* * kmem_intr_free: free wired memory allocated by kmem_alloc. */ void kmem_intr_free(void *p, size_t requested_size) { size_t allocsz, index; size_t size; pool_cache_t pc; KASSERT(p != NULL); KASSERTMSG(requested_size > 0, "kmem_intr_free(%p, 0)", p); kasan_add_redzone(&requested_size); size = kmem_roundup_size(requested_size); allocsz = size + SIZE_SIZE; if ((index = ((allocsz - 1) >> KMEM_SHIFT)) < kmem_cache_maxidx) { KMEM_CACHE_FREE_PROBE(index, p, requested_size, allocsz); pc = kmem_cache[index]; } else if ((index = ((allocsz - 1) >> KMEM_BIG_SHIFT)) < kmem_cache_big_maxidx) { KMEM_CACHE_BIG_FREE_PROBE(index, p, requested_size, allocsz); pc = kmem_cache_big[index]; } else { FREECHECK_IN(&kmem_freecheck, p); SDT_PROBE3(sdt, kmem, free, large, p, requested_size, round_page(size)); uvm_km_kmem_free(kmem_va_arena, (vaddr_t)p, round_page(size)); return; } kasan_mark(p, size, size, 0); kmem_size_check(p, requested_size); FREECHECK_IN(&kmem_freecheck, p); LOCKDEBUG_MEM_CHECK(p, size); pool_cache_put(pc, p); } /* -------------------------------- Kmem API -------------------------------- */ /* * kmem_alloc: allocate wired memory. * => must not be called from interrupt context. */ void * kmem_alloc(size_t size, km_flag_t kmflags) { void *v; KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); v = kmem_intr_alloc(size, kmflags); if (__predict_true(v != NULL)) { kmsan_mark(v, size, KMSAN_STATE_UNINIT); kmsan_orig(v, size, KMSAN_TYPE_KMEM, __RET_ADDR); } KASSERT(v || (kmflags & KM_NOSLEEP) != 0); return v; } /* * kmem_zalloc: allocate zeroed wired memory. * => must not be called from interrupt context. */ void * kmem_zalloc(size_t size, km_flag_t kmflags) { void *v; KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); v = kmem_intr_zalloc(size, kmflags); KASSERT(v || (kmflags & KM_NOSLEEP) != 0); return v; } /* * kmem_free: free wired memory allocated by kmem_alloc. * => must not be called from interrupt context. */ void kmem_free(void *p, size_t size) { KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); kmem_intr_free(p, size); kmsan_mark(p, size, KMSAN_STATE_INITED); } static size_t kmem_create_caches(const struct kmem_cache_info *array, const id_t *alloc_probe_table[], const id_t *free_probe_table[], pool_cache_t alloc_table[], size_t maxsize, int shift, int ipl) { size_t maxidx = 0; size_t table_unit = (1 << shift); size_t size = table_unit; int i; for (i = 0; array[i].kc_size != 0 ; i++) { const char *name = array[i].kc_name; size_t cache_size = array[i].kc_size; struct pool_allocator *pa; int flags = 0; pool_cache_t pc; size_t align; /* check if we reached the requested size */ if (cache_size > maxsize || cache_size > PAGE_SIZE) { break; } /* * Exclude caches with size not a factor or multiple of the * coherency unit. 
*/ if (cache_size < COHERENCY_UNIT) { if (COHERENCY_UNIT % cache_size > 0) { continue; } flags |= PR_NOTOUCH; align = KMEM_ALIGN; } else if ((cache_size & (PAGE_SIZE - 1)) == 0) { align = PAGE_SIZE; } else { if ((cache_size % COHERENCY_UNIT) > 0) { continue; } align = COHERENCY_UNIT; } if ((cache_size >> shift) > maxidx) { maxidx = cache_size >> shift; } pa = &pool_allocator_kmem; pc = pool_cache_init(cache_size, align, 0, flags, name, pa, ipl, NULL, NULL, NULL); while (size <= cache_size) { alloc_table[(size - 1) >> shift] = pc; #ifdef KDTRACE_HOOKS if (alloc_probe_table) { alloc_probe_table[(size - 1) >> shift] = array[i].kc_alloc_probe_id; } if (free_probe_table) { free_probe_table[(size - 1) >> shift] = array[i].kc_free_probe_id; } #endif size += table_unit; } } return maxidx; } void kmem_init(void) { kmem_cache_maxidx = kmem_create_caches(kmem_cache_sizes, kmem_cache_alloc_probe_id, kmem_cache_free_probe_id, kmem_cache, KMEM_MAXSIZE, KMEM_SHIFT, IPL_VM); kmem_cache_big_maxidx = kmem_create_caches(kmem_cache_big_sizes, kmem_cache_big_alloc_probe_id, kmem_cache_big_free_probe_id, kmem_cache_big, PAGE_SIZE, KMEM_BIG_SHIFT, IPL_VM); } size_t kmem_roundup_size(size_t size) { return (size + (KMEM_ALIGN - 1)) & ~(KMEM_ALIGN - 1); } /* * Used to dynamically allocate string with kmem accordingly to format. */ char * kmem_asprintf(const char *fmt, ...) { int size __diagused, len; va_list va; char *str; va_start(va, fmt); len = vsnprintf(NULL, 0, fmt, va); va_end(va); str = kmem_alloc(len + 1, KM_SLEEP); va_start(va, fmt); size = vsnprintf(str, len + 1, fmt, va); va_end(va); KASSERT(size == len); return str; } char * kmem_strdupsize(const char *str, size_t *lenp, km_flag_t flags) { size_t len = strlen(str) + 1; char *ptr = kmem_alloc(len, flags); if (ptr == NULL) return NULL; if (lenp) *lenp = len; memcpy(ptr, str, len); return ptr; } char * kmem_strndup(const char *str, size_t maxlen, km_flag_t flags) { KASSERT(str != NULL); KASSERT(maxlen != 0); size_t len = strnlen(str, maxlen); char *ptr = kmem_alloc(len + 1, flags); if (ptr == NULL) return NULL; memcpy(ptr, str, len); ptr[len] = '\0'; return ptr; } void kmem_strfree(char *str) { if (str == NULL) return; kmem_free(str, strlen(str) + 1); } /* * Utility routine to maybe-allocate a temporary buffer if the size * is larger than we're willing to put on the stack. */ void * kmem_tmpbuf_alloc(size_t size, void *stackbuf, size_t stackbufsize, km_flag_t flags) { if (size <= stackbufsize) { return stackbuf; } return kmem_alloc(size, flags); } void kmem_tmpbuf_free(void *buf, size_t size, void *stackbuf) { if (buf != stackbuf) { kmem_free(buf, size); } } /* --------------------------- DEBUG / DIAGNOSTIC --------------------------- */ #if defined(KMEM_SIZE) static void kmem_size_set(void *p, size_t sz) { memcpy((char *)p + sz, &sz, sizeof(size_t)); } static void kmem_size_check(void *p, size_t sz) { size_t hsz; memcpy(&hsz, (char *)p + sz, sizeof(size_t)); if (hsz != sz) { panic("kmem_free(%p, %zu) != allocated size %zu; overwrote?", p, sz, hsz); } memset((char *)p + sz, 0xff, sizeof(size_t)); } #endif /* defined(KMEM_SIZE) */
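/*
 * Illustrative sketch, not part of subr_kmem.c above: because the KMEM_SIZE
 * option stores the requested size in a footer behind each allocation,
 * kmem_free() must be called with exactly the size that was passed to
 * kmem_alloc()/kmem_zalloc().  The names example_node, example_make and
 * example_destroy are hypothetical and only demonstrate the pairing of the
 * kmem interfaces defined in this file.
 */
#if 0	/* example only */
struct example_node {
	int	en_key;
	char	*en_label;
};

static struct example_node *
example_make(int key, const char *label)
{
	struct example_node *en;

	/* zeroed allocation, may sleep until memory is available */
	en = kmem_zalloc(sizeof(*en), KM_SLEEP);
	en->en_key = key;
	en->en_label = kmem_asprintf("node-%d: %s", key, label);
	return en;
}

static void
example_destroy(struct example_node *en)
{
	/* kmem_strfree() frees strlen(str) + 1 bytes, matching kmem_asprintf() */
	kmem_strfree(en->en_label);
	/* size must match the original request or kmem_size_check() panics */
	kmem_free(en, sizeof(*en));
}
#endif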
/* $NetBSD: msgbuf.h,v 1.18 2022/10/26 23:28:43 riastradh Exp $ */ /* * Copyright (c) 1981, 1984, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)msgbuf.h 8.1 (Berkeley) 6/2/93 */ #ifndef _SYS_MSGBUF_H_ #define _SYS_MSGBUF_H_ struct kern_msgbuf { #define MSG_MAGIC 0x063061 long msg_magic; long msg_bufx; /* write pointer */ long msg_bufr; /* read pointer */ long msg_bufs; /* real msg_bufc size (bytes) */ char msg_bufc[1]; /* buffer */ }; #ifdef _KERNEL extern int msgbufmapped; /* is the message buffer mapped */ extern int msgbufenabled; /* is logging to the buffer enabled */ extern struct kern_msgbuf *msgbufp; /* the mapped buffer, itself. */ extern int log_open; /* is /dev/klog open? */ void initmsgbuf(void *, size_t); void loginit(void); void logputchar(int); static __inline int logenabled(const struct kern_msgbuf *mbp) { return msgbufenabled && mbp->msg_magic == MSG_MAGIC; } #endif #endif /* !_SYS_MSGBUF_H_ */
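/*
 * Illustrative sketch, not part of msgbuf.h above: msg_bufr and msg_bufx
 * are read and write offsets into the msg_bufc[] ring of msg_bufs bytes,
 * so a consumer drains the buffer by advancing msg_bufr modulo msg_bufs
 * until it catches up with msg_bufx.  example_drain_msgbuf() is a
 * hypothetical helper, guarded by logenabled() as kernel consumers are.
 */
#if 0	/* example only */
static void
example_drain_msgbuf(struct kern_msgbuf *mbp, void (*emit)(int))
{
	if (!logenabled(mbp))
		return;
	while (mbp->msg_bufr != mbp->msg_bufx) {
		(*emit)((unsigned char)mbp->msg_bufc[mbp->msg_bufr]);
		if (++mbp->msg_bufr >= mbp->msg_bufs)
			mbp->msg_bufr = 0;	/* wrap around the ring */
	}
}
#endif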
/* $NetBSD: usbroothub.c,v 1.16 2024/02/04 05:43:06 mrg Exp $ */ /*- * Copyright (c) 1998, 2004, 2011, 2012 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology, Jared D. McNeill (jmcneill@invisible.ca), * Matthew R. Green (mrg@eterna23.net) and Nick Hudson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2008 * Matthias Drochner. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: usbroothub.c,v 1.16 2024/02/04 05:43:06 mrg Exp $"); #include <sys/param.h> #include <sys/systm.h> /* for ostype */ #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> #include <dev/usb/usbdivar.h> #include <dev/usb/usbroothub.h> #include <dev/usb/usbhist.h> /* helper functions for USB root hub emulation */ static usbd_status roothub_ctrl_transfer(struct usbd_xfer *); static usbd_status roothub_ctrl_start(struct usbd_xfer *); static void roothub_ctrl_abort(struct usbd_xfer *); static void roothub_ctrl_close(struct usbd_pipe *); static void roothub_ctrl_done(struct usbd_xfer *); static void roothub_noop(struct usbd_pipe *pipe); const struct usbd_pipe_methods roothub_ctrl_methods = { .upm_transfer = roothub_ctrl_transfer, .upm_start = roothub_ctrl_start, .upm_abort = roothub_ctrl_abort, .upm_close = roothub_ctrl_close, .upm_cleartoggle = roothub_noop, .upm_done = roothub_ctrl_done, }; int usb_makestrdesc(usb_string_descriptor_t *p, int l, const char *s) { int i; if (l == 0) return 0; p->bLength = 2 * strlen(s) + 2; if (l == 1) return 1; p->bDescriptorType = UDESC_STRING; l -= 2; /* poor man's utf-16le conversion */ for (i = 0; s[i] && l > 1; i++, l -= 2) USETW2(p->bString[i], 0, s[i]); return 2 * i + 2; } int usb_makelangtbl(usb_string_descriptor_t *p, int l) { if (l == 0) return 0; p->bLength = 4; if (l == 1) return 1; p->bDescriptorType = UDESC_STRING; if (l < 4) return 2; USETW(p->bString[0], 0x0409); /* english/US */ return 4; } /* * Data structures and routines to emulate the root hub. */ static const usb_device_descriptor_t usbroothub_devd1 = { .bLength = sizeof(usb_device_descriptor_t), .bDescriptorType = UDESC_DEVICE, .bcdUSB = {0x00, 0x01}, .bDeviceClass = UDCLASS_HUB, .bDeviceSubClass = UDSUBCLASS_HUB, .bDeviceProtocol = UDPROTO_FSHUB, .bMaxPacketSize = 64, .idVendor = {0}, .idProduct = {0}, .bcdDevice = {0x00, 0x01}, .iManufacturer = 1, .iProduct = 2, .iSerialNumber = 0, .bNumConfigurations = 1 }; static const struct usb_roothub_descriptors usbroothub_confd1 = { .urh_confd = { .bLength = USB_CONFIG_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_CONFIG, .wTotalLength = USETWD(sizeof(usbroothub_confd1)), .bNumInterface = 1, .bConfigurationValue = 1, .iConfiguration = 0, .bmAttributes = UC_ATTR_MBO | UC_SELF_POWERED, .bMaxPower = 0, }, .urh_ifcd = { .bLength = USB_INTERFACE_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_INTERFACE, .bInterfaceNumber = 0, .bAlternateSetting = 0, .bNumEndpoints = 1, .bInterfaceClass = UICLASS_HUB, .bInterfaceSubClass = UISUBCLASS_HUB, .bInterfaceProtocol = UIPROTO_FSHUB, .iInterface = 0 }, .urh_endpd = { .bLength = USB_ENDPOINT_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_ENDPOINT, .bEndpointAddress = UE_DIR_IN | USBROOTHUB_INTR_ENDPT, .bmAttributes = UE_INTERRUPT, .wMaxPacketSize = USETWD(8), /* max packet */ .bInterval = 255, }, }; /* USB 3.0 10.15.1 */ static const usb_device_descriptor_t usbroothub_devd3 = { .bLength = sizeof(usb_device_descriptor_t), .bDescriptorType = UDESC_DEVICE, .bcdUSB = {0x00, 0x03}, .bDeviceClass = UDCLASS_HUB, .bDeviceSubClass = UDSUBCLASS_HUB, .bDeviceProtocol = UDPROTO_SSHUB, .bMaxPacketSize = 9, .idVendor = {0}, .idProduct = {0}, .bcdDevice = {0x00, 0x01}, .iManufacturer = 1, .iProduct = 2, .iSerialNumber = 0, .bNumConfigurations = 1 }; static const usb_device_descriptor_t usbroothub_devd2 = { .bLength = sizeof(usb_device_descriptor_t), .bDescriptorType = UDESC_DEVICE, .bcdUSB = {0x00, 0x02}, .bDeviceClass = UDCLASS_HUB, .bDeviceSubClass = UDSUBCLASS_HUB, .bDeviceProtocol = 
UDPROTO_HSHUBSTT, .bMaxPacketSize = 64, .idVendor = {0}, .idProduct = {0}, .bcdDevice = {0x00, 0x01}, .iManufacturer = 1, .iProduct = 2, .iSerialNumber = 0, .bNumConfigurations = 1 }; static const usb_device_qualifier_t usbroothub_odevd2 = { .bLength = USB_DEVICE_QUALIFIER_SIZE, .bDescriptorType = UDESC_DEVICE_QUALIFIER, .bcdUSB = {0x00, 0x02}, .bDeviceClass = UDCLASS_HUB, .bDeviceSubClass = UDSUBCLASS_HUB, .bDeviceProtocol = UDPROTO_FSHUB, .bMaxPacketSize0 = 64, .bNumConfigurations = 1, }; static const struct usb_roothub_descriptors usbroothub_confd2 = { .urh_confd = { .bLength = USB_CONFIG_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_CONFIG, .wTotalLength = USETWD(sizeof(usbroothub_confd2)), .bNumInterface = 1, .bConfigurationValue = 1, .iConfiguration = 0, .bmAttributes = UC_ATTR_MBO | UC_SELF_POWERED, .bMaxPower = 0, }, .urh_ifcd = { .bLength = USB_INTERFACE_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_INTERFACE, .bInterfaceNumber = 0, .bAlternateSetting = 0, .bNumEndpoints = 1, .bInterfaceClass = UICLASS_HUB, .bInterfaceSubClass = UISUBCLASS_HUB, .bInterfaceProtocol = UIPROTO_HSHUBSTT, .iInterface = 0 }, .urh_endpd = { .bLength = USB_ENDPOINT_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_ENDPOINT, .bEndpointAddress = UE_DIR_IN | USBROOTHUB_INTR_ENDPT, .bmAttributes = UE_INTERRUPT, .wMaxPacketSize = USETWD(8), /* max packet */ .bInterval = 12, }, }; static const struct usb3_roothub_descriptors usbroothub_confd3 = { .urh_confd = { .bLength = USB_CONFIG_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_CONFIG, .wTotalLength = USETWD(sizeof(usbroothub_confd3)), .bNumInterface = 1, .bConfigurationValue = 1, .iConfiguration = 0, .bmAttributes = UC_SELF_POWERED, /* 10.13.1 */ .bMaxPower = 0, }, .urh_ifcd = { .bLength = USB_INTERFACE_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_INTERFACE, .bInterfaceNumber = 0, .bAlternateSetting = 0, .bNumEndpoints = 1, .bInterfaceClass = UICLASS_HUB, .bInterfaceSubClass = UISUBCLASS_HUB, .bInterfaceProtocol = 0, /* UIPROTO_SSHUB ??? */ .iInterface = 0 }, .urh_endpd = { .bLength = USB_ENDPOINT_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_ENDPOINT, .bEndpointAddress = UE_DIR_IN | USBROOTHUB_INTR_ENDPT, .bmAttributes = UE_INTERRUPT, .wMaxPacketSize = USETWD(2), /* max packet */ .bInterval = 8, }, .urh_endpssd = { .bLength = USB_ENDPOINT_SS_COMP_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_ENDPOINT_SS_COMP, .bMaxBurst = 0, .bmAttributes = 0, .wBytesPerInterval = USETWD(2) }, }; static const struct usb3_roothub_bos_descriptors usbroothub_bosd3 = { .urh_bosd = { .bLength = USB_BOS_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_BOS, .wTotalLength = USETWD(sizeof(usbroothub_bosd3)), .bNumDeviceCaps = 3, }, /* 9.6.2.1 USB 2.0 Extension */ .urh_usb2extd = { .bLength = USB_DEVCAP_USB2EXT_DESCRIPTOR_SIZE, .bDescriptorType = 1, .bDevCapabilityType = 2, .bmAttributes[0] = 2, }, /* 9.6.2.2 Superspeed device capability */ .urh_ssd = { .bLength = USB_DEVCAP_SS_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_DEVICE_CAPABILITY, .bDevCapabilityType = USB_DEVCAP_SUPER_SPEED, .bmAttributes = 0, /* USB_DEVCAP_SS_LTM */ .wSpeedsSupported = USETWD( USB_DEVCAP_SS_SPEED_LS | USB_DEVCAP_SS_SPEED_FS | USB_DEVCAP_SS_SPEED_HS | USB_DEVCAP_SS_SPEED_SS), .bFunctionalitySupport = 8, /* SS is 3, i.e. 1 << 3? */ .bU1DevExitLat = 255, /* Dummy... 0? */ .wU2DevExitLat = USETWD(8), /* Also dummy... 0? 
*/ }, /* 9.6.2.3 Container ID - see RFC 4122 */ .urh_containerd = { .bLength = USB_DEVCAP_CONTAINER_ID_DESCRIPTOR_SIZE, .bDescriptorType = 1, .bDevCapabilityType = 4, .bReserved = 0, // ContainerID will be zero }, }; static const usb_hub_descriptor_t usbroothub_hubd = { .bDescLength = USB_HUB_DESCRIPTOR_SIZE, .bDescriptorType = UDESC_HUB, .bNbrPorts = 1, .wHubCharacteristics = USETWD(UHD_PWR_NO_SWITCH | UHD_OC_INDIVIDUAL), .bPwrOn2PwrGood = 50, .bHubContrCurrent = 0, .DeviceRemovable = {0}, /* port is removable */ }; /* * Simulate a hardware hub by handling all the necessary requests. */ usbd_status roothub_ctrl_transfer(struct usbd_xfer *xfer) { /* Pipe isn't running, start first */ return roothub_ctrl_start(SIMPLEQ_FIRST(&xfer->ux_pipe->up_queue)); } static usbd_status roothub_ctrl_start(struct usbd_xfer *xfer) { struct usbd_pipe *pipe = xfer->ux_pipe; struct usbd_bus *bus = pipe->up_dev->ud_bus; usb_device_request_t *req; usbd_status err = USBD_IOERROR; /* XXX STALL? */ uint16_t len, value; int buflen, actlen = -1; void *buf; USBHIST_FUNC(); /* * XXX Should really assert pipe lock, in case ever have * per-pipe locking instead of using the bus lock for all * pipes. */ KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock)); /* Roothub xfers are serialized through the pipe. */ KASSERTMSG(bus->ub_rhxfer == NULL, "rhxfer=%p", bus->ub_rhxfer); KASSERT(xfer->ux_rqflags & URQ_REQUEST); req = &xfer->ux_request; len = UGETW(req->wLength); value = UGETW(req->wValue); USBHIST_CALLARGS(usbdebug, "type=%#jx request=%#jx len=%#jx value=%#jx", req->bmRequestType, req->bRequest, len, value); buf = len ? usbd_get_buffer(xfer) : NULL; buflen = 0; #define C(x,y) ((x) | ((y) << 8)) switch (C(req->bRequest, req->bmRequestType)) { case C(UR_CLEAR_FEATURE, UT_WRITE_DEVICE): case C(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE): case C(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT): /* * DEVICE_REMOTE_WAKEUP and ENDPOINT_HALT are no-ops * for the integrated root hub. */ break; case C(UR_GET_CONFIG, UT_READ_DEVICE): if (len > 0) { uint8_t *out = buf; *out = bus->ub_rhconf; buflen = sizeof(*out); } break; case C(UR_GET_DESCRIPTOR, UT_READ_DEVICE): USBHIST_LOG(usbdebug, "wValue=%#4jx", value, 0, 0, 0); if (len == 0) break; switch (value) { case C(0, UDESC_DEVICE): if (bus->ub_revision >= USBREV_3_0) { buflen = uimin(len, sizeof(usbroothub_devd3)); memcpy(buf, &usbroothub_devd3, buflen); } else if (bus->ub_revision == USBREV_2_0) { buflen = uimin(len, sizeof(usbroothub_devd2)); memcpy(buf, &usbroothub_devd2, buflen); } else { buflen = uimin(len, sizeof(usbroothub_devd1)); memcpy(buf, &usbroothub_devd1, buflen); } break; case C(0, UDESC_CONFIG): if (bus->ub_revision >= USBREV_3_0) { buflen = uimin(len, sizeof(usbroothub_confd3)); memcpy(buf, &usbroothub_confd3, buflen); } else if (bus->ub_revision == USBREV_2_0) { buflen = uimin(len, sizeof(usbroothub_confd2)); memcpy(buf, &usbroothub_confd2, buflen); } else { buflen = uimin(len, sizeof(usbroothub_confd1)); memcpy(buf, &usbroothub_confd1, buflen); } break; case C(0, UDESC_DEVICE_QUALIFIER): if (bus->ub_revision == USBREV_2_0) { /* * We can't really operate at another speed, * but the spec says we need this descriptor. */ buflen = uimin(len, sizeof(usbroothub_odevd2)); memcpy(buf, &usbroothub_odevd2, buflen); } else goto fail; break; case C(0, UDESC_OTHER_SPEED_CONFIGURATION): if (bus->ub_revision == USBREV_2_0) { struct usb_roothub_descriptors confd; /* * We can't really operate at another speed, * but the spec says we need this descriptor. 
*/ buflen = uimin(len, sizeof(usbroothub_confd2)); memcpy(&confd, &usbroothub_confd2, buflen); confd.urh_confd.bDescriptorType = UDESC_OTHER_SPEED_CONFIGURATION; memcpy(buf, &confd, buflen); } else goto fail; break; case C(0, UDESC_BOS): if (bus->ub_revision >= USBREV_3_0) { buflen = uimin(len, sizeof(usbroothub_bosd3)); memcpy(buf, &usbroothub_bosd3, buflen); } else goto fail; break; #define sd ((usb_string_descriptor_t *)buf) case C(0, UDESC_STRING): /* Language table */ buflen = usb_makelangtbl(sd, len); break; case C(1, UDESC_STRING): /* Vendor */ buflen = usb_makestrdesc(sd, len, ostype); break; case C(2, UDESC_STRING): /* Product */ buflen = usb_makestrdesc(sd, len, "Root hub"); break; #undef sd default: /* Default to error */ buflen = -1; } break; case C(UR_GET_DESCRIPTOR, UT_READ_CLASS_DEVICE): buflen = uimin(len, sizeof(usbroothub_hubd)); memcpy(buf, &usbroothub_hubd, buflen); break; case C(UR_GET_INTERFACE, UT_READ_INTERFACE): /* Get Interface, 9.4.4 */ if (len > 0) { uint8_t *out = buf; *out = 0; buflen = sizeof(*out); } break; case C(UR_GET_STATUS, UT_READ_DEVICE): /* Get Status from device, 9.4.5 */ if (len > 1) { usb_status_t *out = buf; USETW(out->wStatus, UDS_SELF_POWERED); buflen = sizeof(*out); } break; case C(UR_GET_STATUS, UT_READ_INTERFACE): case C(UR_GET_STATUS, UT_READ_ENDPOINT): /* Get Status from interface, endpoint, 9.4.5 */ if (len > 1) { usb_status_t *out = buf; USETW(out->wStatus, 0); buflen = sizeof(*out); } break; case C(UR_SET_ADDRESS, UT_WRITE_DEVICE): /* Set Address, 9.4.6 */ USBHIST_LOG(usbdebug, "UR_SET_ADDRESS, UT_WRITE_DEVICE: " "addr %jd", value, 0, 0, 0); if (value >= USB_MAX_DEVICES) { goto fail; } bus->ub_rhaddr = value; break; case C(UR_SET_CONFIG, UT_WRITE_DEVICE): /* Set Configuration, 9.4.7 */ if (value != 0 && value != 1) { goto fail; } bus->ub_rhconf = value; break; case C(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE): /* Set Descriptor, 9.4.8, not supported */ break; case C(UR_SET_FEATURE, UT_WRITE_DEVICE): case C(UR_SET_FEATURE, UT_WRITE_INTERFACE): case C(UR_SET_FEATURE, UT_WRITE_ENDPOINT): /* Set Feature, 9.4.9, not supported */ goto fail; case C(UR_SET_INTERFACE, UT_WRITE_INTERFACE): /* Set Interface, 9.4.10, not supported */ break; case C(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT): /* Synch Frame, 9.4.11, not supported */ break; default: /* Default to error */ buflen = -1; break; } KASSERTMSG(bus->ub_rhxfer == NULL, "rhxfer=%p", bus->ub_rhxfer); bus->ub_rhxfer = xfer; if (!bus->ub_usepolling) mutex_exit(bus->ub_lock); actlen = bus->ub_methods->ubm_rhctrl(bus, req, buf, buflen); if (!bus->ub_usepolling) mutex_enter(bus->ub_lock); KASSERTMSG(bus->ub_rhxfer == xfer, "rhxfer=%p", bus->ub_rhxfer); bus->ub_rhxfer = NULL; cv_signal(&bus->ub_rhxfercv); if (actlen < 0) goto fail; xfer->ux_actlen = actlen; err = USBD_NORMAL_COMPLETION; fail: USBHIST_LOG(usbdebug, "xfer %#jx buflen %jd actlen %jd err %jd", (uintptr_t)xfer, buflen, actlen, err); xfer->ux_status = err; usb_transfer_complete(xfer); return USBD_NORMAL_COMPLETION; } /* Abort a root control request. */ Static void roothub_ctrl_abort(struct usbd_xfer *xfer) { struct usbd_bus *bus = xfer->ux_bus; KASSERT(mutex_owned(bus->ub_lock)); KASSERTMSG(bus->ub_rhxfer == xfer, "rhxfer=%p", bus->ub_rhxfer); /* * No mechanism to abort the xfer (would have to coordinate * with the bus's ubm_rhctrl to be useful, and usually at most * there's some short bounded delays of a few tens of * milliseconds), so just wait for it to complete. 
*/ while (bus->ub_rhxfer == xfer) cv_wait(&bus->ub_rhxfercv, bus->ub_lock); } /* Close the root pipe. */ Static void roothub_ctrl_close(struct usbd_pipe *pipe) { /* Nothing to do. */ } Static void roothub_ctrl_done(struct usbd_xfer *xfer) { /* Nothing to do. */ } static void roothub_noop(struct usbd_pipe *pipe) { }
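/*
 * Illustrative sketch, not part of usbroothub.c above: the request dispatch
 * in roothub_ctrl_start() packs bRequest into the low byte and bmRequestType
 * into the high byte with C(x,y) = (x) | ((y) << 8).  EX_C below is a
 * hypothetical copy of that macro; the constants are the standard USB values
 * UR_GET_DESCRIPTOR (0x06) and UT_READ_DEVICE (0x80).
 */
#if 0	/* example only */
#include <assert.h>

#define EX_C(x, y)	((x) | ((y) << 8))

int
main(void)
{
	/* A standard GET_DESCRIPTOR on the device selects switch key 0x8006. */
	assert(EX_C(0x06, 0x80) == 0x8006);
	return 0;
}
#endif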
/* $NetBSD: bufq_priocscan.c,v 1.21 2017/05/04 11:03:27 kamil Exp $ */ /*- * Copyright (c)2004,2005,2006,2008,2009,2011,2012 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: bufq_priocscan.c,v 1.21 2017/05/04 11:03:27 kamil Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/bufq_impl.h> #include <sys/kmem.h> #include <sys/rbtree.h> #include <sys/module.h> #undef PRIOCSCAN_USE_GLOBAL_POSITION /* * Cyclical scan (CSCAN) */ struct cscan_key { daddr_t k_rawblkno; int k_cylinder; }; struct cscan_queue { rb_tree_t cq_buffers; /* ordered list of buffers */ #if !defined(PRIOCSCAN_USE_GLOBAL_POSITION) struct cscan_key cq_lastkey; /* key of last request */ #endif /* !defined(PRIOCSCAN_USE_GLOBAL_POSITION) */ int cq_sortby; /* BUFQ_SORT_MASK */ rb_tree_ops_t cq_ops; }; static signed int buf_cmp(const struct buf *b1, const struct buf *b2, int sortby) { if (buf_inorder(b2, b1, sortby)) { return 1; /* b1 > b2 */ } if (buf_inorder(b1, b2, sortby)) { return -1; /* b1 < b2 */ } return 0; } /* return positive if n1 > n2 */ static signed int cscan_tree_compare_nodes(void *context, const void *n1, const void *n2) { const struct cscan_queue * const q = context; const struct buf * const b1 = n1; const struct buf * const b2 = n2; const int sortby = q->cq_sortby; const int diff = buf_cmp(b1, b2, sortby); /* * XXX rawblkno/cylinder might not be unique. eg. unbuffered i/o */ if (diff != 0) { return diff; } /* * XXX rawblkno/cylinder might not be unique. eg. unbuffered i/o */ if (b1 > b2) { return 1; } if (b1 < b2) { return -1; } return 0; } /* return positive if n1 > k2 */ static signed int cscan_tree_compare_key(void *context, const void *n1, const void *k2) { const struct cscan_queue * const q = context; const struct buf * const b1 = n1; const struct cscan_key * const key = k2; const struct buf tmp = { .b_rawblkno = key->k_rawblkno, .b_cylinder = key->k_cylinder, }; const struct buf *b2 = &tmp; const int sortby = q->cq_sortby; return buf_cmp(b1, b2, sortby); } static void __unused cscan_dump(struct cscan_queue *cq) { const int sortby = cq->cq_sortby; struct buf *bp; RB_TREE_FOREACH(bp, &cq->cq_buffers) { if (sortby == BUFQ_SORT_RAWBLOCK) { printf(" %jd", (intmax_t)bp->b_rawblkno); } else { printf(" %jd/%jd", (intmax_t)bp->b_cylinder, (intmax_t)bp->b_rawblkno); } } } static inline bool cscan_empty(struct cscan_queue *q) { /* XXX this might do more work than necessary */ return rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT) == NULL; } static void cscan_put(struct cscan_queue *q, struct buf *bp) { struct buf *obp __diagused; obp = rb_tree_insert_node(&q->cq_buffers, bp); KASSERT(obp == bp); /* see cscan_tree_compare_nodes */ } static struct buf * cscan_get(struct cscan_queue *q, int remove, struct cscan_key *key) { struct buf *bp; bp = rb_tree_find_node_geq(&q->cq_buffers, key); KDASSERT(bp == NULL || cscan_tree_compare_key(q, bp, key) >= 0); if (bp == NULL) { bp = rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT); KDASSERT(cscan_tree_compare_key(q, bp, key) < 0); } if (bp != NULL && remove) { #if defined(DEBUG) struct buf *nbp; #endif /* defined(DEBUG) */ rb_tree_remove_node(&q->cq_buffers, bp); /* * remember the head position. 
*/ key->k_cylinder = bp->b_cylinder; key->k_rawblkno = bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT); #if defined(DEBUG) nbp = rb_tree_find_node_geq(&q->cq_buffers, key); if (nbp != NULL && cscan_tree_compare_nodes(q, nbp, bp) < 0) { panic("%s: wrong order %p < %p\n", __func__, nbp, bp); } #endif /* defined(DEBUG) */ } return bp; } static void cscan_init(struct cscan_queue *q, int sortby) { static const rb_tree_ops_t cscan_tree_ops = { .rbto_compare_nodes = cscan_tree_compare_nodes, .rbto_compare_key = cscan_tree_compare_key, .rbto_node_offset = offsetof(struct buf, b_u.u_rbnode), .rbto_context = NULL, }; q->cq_sortby = sortby; /* XXX copy ops to workaround rbtree.h API limitation */ q->cq_ops = cscan_tree_ops; q->cq_ops.rbto_context = q; rb_tree_init(&q->cq_buffers, &q->cq_ops); } /* * Per-priority CSCAN. * * XXX probably we should have a way to raise * priority of the on-queue requests. */ #define PRIOCSCAN_NQUEUE 3 struct priocscan_queue { struct cscan_queue q_queue; unsigned int q_burst; }; struct bufq_priocscan { struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE]; #if defined(PRIOCSCAN_USE_GLOBAL_POSITION) /* * XXX using "global" head position can reduce positioning time * when switching between queues. * although it might adversely affect fairness. */ struct cscan_key bq_lastkey; #endif }; /* * how many requests to serve from one queue while requests are pending * on other queues. * * XXX tune * be careful: while making these values larger likely * increases the total throughput, it can also increase latencies * for some workloads. */ const int priocscan_burst[] = { 64, 16, 4 }; static void bufq_priocscan_init(struct bufq_state *); static void bufq_priocscan_put(struct bufq_state *, struct buf *); static struct buf *bufq_priocscan_get(struct bufq_state *, int); BUFQ_DEFINE(priocscan, 40, bufq_priocscan_init); static inline struct cscan_queue *bufq_priocscan_selectqueue( struct bufq_priocscan *, const struct buf *); static inline struct cscan_queue * bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp) { static const int priocscan_priomap[] = { [BPRIO_TIMENONCRITICAL] = 2, [BPRIO_TIMELIMITED] = 1, [BPRIO_TIMECRITICAL] = 0 }; return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue; } static void bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp) { struct bufq_priocscan *q = bufq_private(bufq); struct cscan_queue *cq; cq = bufq_priocscan_selectqueue(q, bp); cscan_put(cq, bp); } static struct buf * bufq_priocscan_get(struct bufq_state *bufq, int remove) { struct bufq_priocscan *q = bufq_private(bufq); struct priocscan_queue *pq, *npq; struct priocscan_queue *first; /* highest priority non-empty queue */ const struct priocscan_queue *epq; struct buf *bp; bool single; /* true if there's only one non-empty queue */ /* * find the highest priority non-empty queue. */ pq = &q->bq_queue[0]; epq = pq + PRIOCSCAN_NQUEUE; for (; pq < epq; pq++) { if (!cscan_empty(&pq->q_queue)) { break; } } if (pq == epq) { /* * all our queues are empty. there's nothing to serve. */ return NULL; } first = pq; /* * scan the rest of queues. * * if we have two or more non-empty queues, we serve the highest * priority one with non-zero burst count. */ single = true; for (npq = pq + 1; npq < epq; npq++) { if (!cscan_empty(&npq->q_queue)) { /* * we found another non-empty queue. * it means that a queue needs to consume its burst * count to be served. */ single = false; /* * check if our current candidate queue has already * exhausted its burst count.
*/ if (pq->q_burst > 0) { break; } pq = npq; } } if (single) { /* * there's only a non-empty queue. * just serve it without consuming its burst count. */ KASSERT(pq == first); } else { /* * there are two or more non-empty queues. */ if (pq->q_burst == 0) { /* * no queues can be served because they have already * exhausted their burst count. */ unsigned int i; #ifdef DEBUG for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { pq = &q->bq_queue[i]; if (!cscan_empty(&pq->q_queue) && pq->q_burst) { panic("%s: inconsist", __func__); } } #endif /* DEBUG */ /* * reset burst counts. */ if (remove) { for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { pq = &q->bq_queue[i]; pq->q_burst = priocscan_burst[i]; } } /* * serve the highest priority non-empty queue. */ pq = first; } /* * consume the burst count. * * XXX account only by number of requests. is it good enough? */ if (remove) { KASSERT(pq->q_burst > 0); pq->q_burst--; } } /* * finally, get a request from the selected queue. */ KDASSERT(!cscan_empty(&pq->q_queue)); bp = cscan_get(&pq->q_queue, remove, #if defined(PRIOCSCAN_USE_GLOBAL_POSITION) &q->bq_lastkey #else /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */ &pq->q_queue.cq_lastkey #endif /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */ ); KDASSERT(bp != NULL); KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp)); return bp; } static struct buf * bufq_priocscan_cancel(struct bufq_state *bufq, struct buf *bp) { struct bufq_priocscan * const q = bufq_private(bufq); unsigned int i; for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { struct cscan_queue * const cq = &q->bq_queue[i].q_queue; struct buf *it; /* * XXX probably could be faster but the cancel functionality * is not widely used anyway. */ RB_TREE_FOREACH(it, &cq->cq_buffers) { if (it == bp) { rb_tree_remove_node(&cq->cq_buffers, bp); return bp; } } } return NULL; } static void bufq_priocscan_fini(struct bufq_state *bufq) { KASSERT(bufq->bq_private != NULL); kmem_free(bufq->bq_private, sizeof(struct bufq_priocscan)); } static void bufq_priocscan_init(struct bufq_state *bufq) { struct bufq_priocscan *q; const int sortby = bufq->bq_flags & BUFQ_SORT_MASK; unsigned int i; bufq->bq_get = bufq_priocscan_get; bufq->bq_put = bufq_priocscan_put; bufq->bq_cancel = bufq_priocscan_cancel; bufq->bq_fini = bufq_priocscan_fini; bufq->bq_private = kmem_zalloc(sizeof(struct bufq_priocscan), KM_SLEEP); q = bufq->bq_private; for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { struct cscan_queue *cq = &q->bq_queue[i].q_queue; cscan_init(cq, sortby); } } MODULE(MODULE_CLASS_BUFQ, bufq_priocscan, NULL); static int bufq_priocscan_modcmd(modcmd_t cmd, void *opaque) { switch (cmd) { case MODULE_CMD_INIT: return bufq_register(&bufq_strat_priocscan); case MODULE_CMD_FINI: return bufq_unregister(&bufq_strat_priocscan); default: return ENOTTY; } }
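/*
 * Editor's note: the standalone sketch below is not part of
 * bufq_priocscan.c.  It is a minimal userland simulation of the
 * per-priority burst accounting that bufq_priocscan_get() performs:
 * serve the highest-priority non-empty queue that still has burst
 * budget, and refill every budget from priocscan_burst[] once all
 * non-empty queues are exhausted.  The CSCAN ordering, the rb-tree,
 * and the single-queue special case (burst not consumed when only one
 * queue is non-empty) are deliberately omitted.  All toy_* names are
 * hypothetical.
 */
#include <stdio.h>

#define TOY_NQUEUE 3

static const int toy_burst[TOY_NQUEUE] = { 64, 16, 4 };

struct toy_queue {
	int pending;		/* number of queued requests */
	int burst;		/* remaining burst budget */
};

/* Pick the queue to serve next; return its index, or -1 if all are empty. */
static int
toy_get(struct toy_queue *q)
{
	int i, first = -1, pick = -1;

	for (i = 0; i < TOY_NQUEUE; i++) {
		if (q[i].pending == 0)
			continue;
		if (first == -1)
			first = i;	/* highest-priority non-empty queue */
		if (pick == -1 && q[i].burst > 0)
			pick = i;	/* ... that still has budget left */
	}
	if (first == -1)
		return -1;		/* nothing to serve */
	if (pick == -1) {
		/* every non-empty queue exhausted its budget: refill all */
		for (i = 0; i < TOY_NQUEUE; i++)
			q[i].burst = toy_burst[i];
		pick = first;
	}
	q[pick].burst--;
	q[pick].pending--;
	return pick;
}

int
main(void)
{
	struct toy_queue q[TOY_NQUEUE] = {
		{ .pending = 200, .burst = 64 },
		{ .pending = 200, .burst = 16 },
		{ .pending = 200, .burst = 4 },
	};
	int served[TOY_NQUEUE] = { 0, 0, 0 };
	int i, idx;

	/* Serve 168 requests: expect a 64:16:4 split per refill cycle. */
	for (i = 0; i < 168; i++) {
		idx = toy_get(q);
		if (idx < 0)
			break;
		served[idx]++;
	}
	for (i = 0; i < TOY_NQUEUE; i++)
		printf("priority %d served %d requests\n", i, served[i]);
	return 0;
}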
/* $NetBSD: in6_pcb.c,v 1.177 2022/11/04 09:04:27 ozaki-r Exp $ */ /* $KAME: in6_pcb.c,v 1.84 2001/02/08 18:02:08 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in6_pcb.c,v 1.177 2022/11/04 09:04:27 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_ipsec.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/ioctl.h> #include <sys/errno.h> #include <sys/time.h> #include <sys/proc.h> #include <sys/kauth.h> #include <sys/domain.h> #include <sys/once.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/ip6.h> #include <netinet/portalgo.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_pcb.h> #include <netinet6/scope6_var.h> #include "faith.h" #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/ipsec6.h> #include <netipsec/key.h> #endif /* IPSEC */ #include <netinet/tcp_vtw.h> const struct in6_addr zeroin6_addr; #define IN6PCBHASH_PORT(table, lport) \ &(table)->inpt_porthashtbl[ntohs(lport) & (table)->inpt_porthash] #define IN6PCBHASH_BIND(table, laddr, lport) \ &(table)->inpt_bindhashtbl[ \ (((laddr)->s6_addr32[0] ^ (laddr)->s6_addr32[1] ^ \ (laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) + ntohs(lport)) & \ (table)->inpt_bindhash] #define IN6PCBHASH_CONNECT(table, faddr, fport, laddr, lport) \ &(table)->inpt_bindhashtbl[ \ ((((faddr)->s6_addr32[0] ^ (faddr)->s6_addr32[1] ^ \ (faddr)->s6_addr32[2] ^ (faddr)->s6_addr32[3]) + ntohs(fport)) + \ (((laddr)->s6_addr32[0] ^ (laddr)->s6_addr32[1] ^ \ (laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) + \ ntohs(lport))) & (table)->inpt_bindhash] int ip6_anonportmin = IPV6PORT_ANONMIN; int ip6_anonportmax = IPV6PORT_ANONMAX; int ip6_lowportmin = IPV6PORT_RESERVEDMIN; int ip6_lowportmax = IPV6PORT_RESERVEDMAX; void in6pcb_init(struct inpcbtable *table, int bindhashsize, int connecthashsize) { inpcb_init(table, bindhashsize, connecthashsize); table->inpt_lastport = (in_port_t)ip6_anonportmax; } /* * Bind address from sin6 to inp. 
*/ static int in6pcb_bind_addr(struct inpcb *inp, struct sockaddr_in6 *sin6, struct lwp *l) { int error; int s; /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin6->sin6_family != AF_INET6) return EAFNOSUPPORT; #ifndef INET if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) return EADDRNOTAVAIL; #endif if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0) return error; s = pserialize_read_enter(); if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if (sin6->sin6_addr.s6_addr32[3]) { struct sockaddr_in sin; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; bcopy(&sin6->sin6_addr.s6_addr32[3], &sin.sin_addr, sizeof(sin.sin_addr)); if (!IN_MULTICAST(sin.sin_addr.s_addr)) { struct ifaddr *ifa; ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL && (inp->inp_flags & IN6P_BINDANY) == 0) { error = EADDRNOTAVAIL; goto out; } } } } else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { // succeed } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct ifaddr *ifa = NULL; if ((inp->inp_flags & IN6P_FAITH) == 0) { ifa = ifa_ifwithaddr(sin6tosa(sin6)); if (ifa == NULL && (inp->inp_flags & IN6P_BINDANY) == 0) { error = EADDRNOTAVAIL; goto out; } } /* * bind to an anycast address might accidentally * cause sending a packet with an anycast source * address, so we forbid it. * * We should allow to bind to a deprecated address, * since the application dare to use it. * But, can we assume that they are careful enough * to check if the address is deprecated or not? * Maybe, as a safeguard, we should have a setsockopt * flag to control the bind(2) behavior against * deprecated addresses (default: forbid bind(2)). */ if (ifa && ifatoia6(ifa)->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED)) { error = EADDRNOTAVAIL; goto out; } } in6p_laddr(inp) = sin6->sin6_addr; error = 0; out: pserialize_read_exit(s); return error; } /* * Bind port from sin6 to inp. */ static int in6pcb_bind_port(struct inpcb *inp, struct sockaddr_in6 *sin6, struct lwp *l) { struct inpcbtable *table = inp->inp_table; struct socket *so = inp->inp_socket; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 && ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || (so->so_options & SO_ACCEPTCONN) == 0)) wild = 1; if (sin6->sin6_port != 0) { enum kauth_network_req req; #ifndef IPNOPRIVPORTS if (ntohs(sin6->sin6_port) < IPV6PORT_RESERVED) req = KAUTH_REQ_NETWORK_BIND_PRIVPORT; else #endif /* IPNOPRIVPORTS */ req = KAUTH_REQ_NETWORK_BIND_PORT; error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_BIND, req, so, sin6, NULL); if (error) return EACCES; } if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow compepte duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. 
*/ if (so->so_options & (SO_REUSEADDR | SO_REUSEPORT)) reuseport = SO_REUSEADDR|SO_REUSEPORT; } if (sin6->sin6_port != 0) { if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { #ifdef INET struct inpcb *t; struct vestigial_inpcb vestige; t = inpcb_lookup_local(table, *(struct in_addr *)&sin6->sin6_addr.s6_addr32[3], sin6->sin6_port, wild, &vestige); if (t && (reuseport & t->inp_socket->so_options) == 0) return EADDRINUSE; if (!t && vestige.valid && !(reuseport && vestige.reuse_port)) return EADDRINUSE; #else return EADDRNOTAVAIL; #endif } { struct inpcb *t; struct vestigial_inpcb vestige; t = in6pcb_lookup_local(table, &sin6->sin6_addr, sin6->sin6_port, wild, &vestige); if (t && (reuseport & t->inp_socket->so_options) == 0) return EADDRINUSE; if (!t && vestige.valid && !(reuseport && vestige.reuse_port)) return EADDRINUSE; } } if (sin6->sin6_port == 0) { int e; e = in6pcb_set_port(sin6, inp, l); if (e != 0) return e; } else { inp->inp_lport = sin6->sin6_port; inpcb_set_state(inp, INP_BOUND); } LIST_REMOVE(inp, inp_lhash); LIST_INSERT_HEAD(IN6PCBHASH_PORT(table, inp->inp_lport), inp, inp_lhash); return 0; } int in6pcb_bind(void *v, struct sockaddr_in6 *sin6, struct lwp *l) { struct inpcb *inp = v; struct sockaddr_in6 lsin6; int error; if (inp->inp_af != AF_INET6) return EINVAL; /* * If we already have a local port or a local address it means we're * bounded. */ if (inp->inp_lport || !(IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) || (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) && in6p_laddr(inp).s6_addr32[3] == 0))) return EINVAL; if (NULL != sin6) { /* We were provided a sockaddr_in6 to use. */ if (sin6->sin6_len != sizeof(*sin6)) return EINVAL; } else { /* We always bind to *something*, even if it's "anything". */ lsin6 = *((const struct sockaddr_in6 *) inp->inp_socket->so_proto->pr_domain->dom_sa_any); sin6 = &lsin6; } /* Bind address. */ error = in6pcb_bind_addr(inp, sin6, l); if (error) return error; /* Bind port. */ error = in6pcb_bind_port(inp, sin6, l); if (error) { /* * Reset the address here to "any" so we don't "leak" the * inpcb. */ in6p_laddr(inp) = in6addr_any; return error; } #if 0 in6p_flowinfo(inp) = 0; /* XXX */ #endif return 0; } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin6. * If don't have a local address for this socket yet, * then pick one. 
*/ int in6pcb_connect(void *v, struct sockaddr_in6 *sin6, struct lwp *l) { struct inpcb *inp = v; struct in6_addr *in6a = NULL; struct in6_addr ia6; struct ifnet *ifp = NULL; /* outgoing interface */ int error = 0; int scope_ambiguous = 0; #ifdef INET struct in6_addr mapped; #endif struct sockaddr_in6 tmp; struct vestigial_inpcb vestige; struct psref psref; int bound; (void)&in6a; /* XXX fool gcc */ if (inp->inp_af != AF_INET6) return EINVAL; if (sin6->sin6_len != sizeof(*sin6)) return EINVAL; if (sin6->sin6_family != AF_INET6) return EAFNOSUPPORT; if (sin6->sin6_port == 0) return EADDRNOTAVAIL; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) && inp->inp_socket->so_type == SOCK_STREAM) return EADDRNOTAVAIL; if (sin6->sin6_scope_id == 0 && !ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0) return error; /* sanity check for mapped address case */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) return EINVAL; if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) in6p_laddr(inp).s6_addr16[5] = htons(0xffff); if (!IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) return EINVAL; } else { if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) return EINVAL; } /* protect *sin6 from overwrites */ tmp = *sin6; sin6 = &tmp; bound = curlwp_bind(); /* Source address selection. */ if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) && in6p_laddr(inp).s6_addr32[3] == 0) { #ifdef INET struct sockaddr_in sin; struct in_ifaddr *ia4; struct psref _psref; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; memcpy(&sin.sin_addr, &sin6->sin6_addr.s6_addr32[3], sizeof(sin.sin_addr)); ia4 = in_selectsrc(&sin, &inp->inp_route, inp->inp_socket->so_options, NULL, &error, &_psref); if (ia4 == NULL) { if (error == 0) error = EADDRNOTAVAIL; curlwp_bindx(bound); return error; } memset(&mapped, 0, sizeof(mapped)); mapped.s6_addr16[5] = htons(0xffff); memcpy(&mapped.s6_addr32[3], &IA_SIN(ia4)->sin_addr, sizeof(IA_SIN(ia4)->sin_addr)); ia4_release(ia4, &_psref); in6a = &mapped; #else curlwp_bindx(bound); return EADDRNOTAVAIL; #endif } else { /* * XXX: in6_selectsrc might replace the bound local address * with the address specified by setsockopt(IPV6_PKTINFO). * Is it the intended behavior? */ error = in6_selectsrc(sin6, in6p_outputopts(inp), in6p_moptions(inp), &inp->inp_route, &in6p_laddr(inp), &ifp, &psref, &ia6); if (error == 0) in6a = &ia6; if (ifp && scope_ambiguous && (error = in6_setscope(&sin6->sin6_addr, ifp, NULL)) != 0) { if_put(ifp, &psref); curlwp_bindx(bound); return error; } if (in6a == NULL) { if_put(ifp, &psref); curlwp_bindx(bound); if (error == 0) error = EADDRNOTAVAIL; return error; } } if (ifp != NULL) { in6p_ip6(inp).ip6_hlim = (u_int8_t)in6pcb_selecthlim(inp, ifp); if_put(ifp, &psref); } else in6p_ip6(inp).ip6_hlim = (u_int8_t)in6pcb_selecthlim_rt(inp); curlwp_bindx(bound); if (in6pcb_lookup(inp->inp_table, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) ? 
in6a : &in6p_laddr(inp), inp->inp_lport, 0, &vestige) || vestige.valid) return EADDRINUSE; if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) || (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) && in6p_laddr(inp).s6_addr32[3] == 0)) { if (inp->inp_lport == 0) { error = in6pcb_bind(inp, NULL, l); if (error != 0) return error; } in6p_laddr(inp) = *in6a; } in6p_faddr(inp) = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; /* Late bind, if needed */ if (inp->inp_bindportonsend) { struct sockaddr_in6 lsin = *((const struct sockaddr_in6 *) inp->inp_socket->so_proto->pr_domain->dom_sa_any); lsin.sin6_addr = in6p_laddr(inp); lsin.sin6_port = 0; if ((error = in6pcb_bind_port(inp, &lsin, l)) != 0) return error; } inpcb_set_state(inp, INP_CONNECTED); in6p_flowinfo(inp) &= ~IPV6_FLOWLABEL_MASK; if (ip6_auto_flowlabel) in6p_flowinfo(inp) |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); #if defined(IPSEC) if (ipsec_enabled && inp->inp_socket->so_type == SOCK_STREAM) ipsec_pcbconn(inp->inp_sp); #endif return 0; } void in6pcb_disconnect(struct inpcb *inp) { memset((void *)&in6p_faddr(inp), 0, sizeof(in6p_faddr(inp))); inp->inp_fport = 0; inpcb_set_state(inp, INP_BOUND); in6p_flowinfo(inp) &= ~IPV6_FLOWLABEL_MASK; #if defined(IPSEC) if (ipsec_enabled) ipsec_pcbdisconn(inp->inp_sp); #endif if (inp->inp_socket->so_state & SS_NOFDREF) inpcb_destroy(inp); } void in6pcb_fetch_sockaddr(struct inpcb *inp, struct sockaddr_in6 *sin6) { if (inp->inp_af != AF_INET6) return; sockaddr_in6_init(sin6, &in6p_laddr(inp), inp->inp_lport, 0, 0); (void)sa6_recoverscope(sin6); /* XXX: should catch errors */ } void in6pcb_fetch_peeraddr(struct inpcb *inp, struct sockaddr_in6 *sin6) { if (inp->inp_af != AF_INET6) return; sockaddr_in6_init(sin6, &in6p_faddr(inp), inp->inp_fport, 0, 0); (void)sa6_recoverscope(sin6); /* XXX: should catch errors */ } /* * Pass some notification to all connections of a protocol * associated with address dst. The local address and/or port numbers * may be specified to limit the search. The "usual action" will be * taken, depending on the ctlinput cmd. The caller must filter any * cmds that are uninteresting (e.g., no error in the map). * Call the protocol specific routine (if any) to report * any errors for each matching socket. * * Must be called at splsoftnet. * * Note: src (4th arg) carries the flowlabel value on the original IPv6 * header, in sin6_flowinfo member. */ int in6pcb_notify(struct inpcbtable *table, const struct sockaddr *dst, u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd, void *cmdarg, void (*notify)(struct inpcb *, int)) { struct inpcb *inp; struct sockaddr_in6 sa6_src; const struct sockaddr_in6 *sa6_dst; in_port_t fport = fport_arg, lport = lport_arg; int errno; int nmatch = 0; u_int32_t flowinfo; if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6) return 0; sa6_dst = (const struct sockaddr_in6 *)dst; if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr)) return 0; /* * note that src can be NULL when we get notify by local fragmentation. */ sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src; flowinfo = sa6_src.sin6_flowinfo; /* * Redirects go to all references to the destination, * and use in6pcb_rtchange to invalidate the route cache. * Dead host indications: also use in6pcb_rtchange to invalidate * the cache, and deliver the error to all the sockets. * Otherwise, if we have knowledge of the local port and address, * deliver only to that socket. 
*/ if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { fport = 0; lport = 0; memset((void *)&sa6_src.sin6_addr, 0, sizeof(sa6_src.sin6_addr)); if (cmd != PRC_HOSTDEAD) notify = in6pcb_rtchange; } errno = inet6ctlerrmap[cmd]; TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { struct rtentry *rt = NULL; if (inp->inp_af != AF_INET6) continue; /* * Under the following condition, notify of redirects * to the pcb, without making address matches against inpcb. * - redirect notification is arrived. * - the inpcb is unconnected. * - the inpcb is caching !RTF_HOST routing entry. * - the ICMPv6 notification is from the gateway cached in the * inpcb. i.e. ICMPv6 notification is from nexthop gateway * the inpcb used very recently. * * This is to improve interaction between netbsd/openbsd * redirect handling code, and inpcb route cache code. * without the clause, !RTF_HOST routing entry (which carries * gateway used by inpcb right before the ICMPv6 redirect) * will be cached forever in unconnected inpcb. * * There still is a question regarding to what is TRT: * - On bsdi/freebsd, RTF_HOST (cloned) routing entry will be * generated on packet output. inpcb will always cache * RTF_HOST routing entry so there's no need for the clause * (ICMPv6 redirect will update RTF_HOST routing entry, * and inpcb is caching it already). * However, bsdi/freebsd are vulnerable to local DoS attacks * due to the cloned routing entries. * - Specwise, "destination cache" is mentioned in RFC2461. * Jinmei says that it implies bsdi/freebsd behavior, itojun * is not really convinced. * - Having hiwat/lowat on # of cloned host route (redirect/ * pmtud) may be a good idea. netbsd/openbsd has it. see * icmp6_mtudisc_update(). */ if ((PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) && IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) && (rt = rtcache_validate(&inp->inp_route)) != NULL && !(rt->rt_flags & RTF_HOST)) { const struct sockaddr_in6 *dst6; dst6 = (const struct sockaddr_in6 *) rtcache_getdst(&inp->inp_route); if (dst6 == NULL) ; else if (IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr, &sa6_dst->sin6_addr)) { rtcache_unref(rt, &inp->inp_route); goto do_notify; } } rtcache_unref(rt, &inp->inp_route); /* * If the error designates a new path MTU for a destination * and the application (associated with this socket) wanted to * know the value, notify. Note that we notify for all * disconnected sockets if the corresponding application * wanted. This is because some UDP applications keep sending * sockets disconnected. * XXX: should we avoid to notify the value to TCP sockets? */ if (cmd == PRC_MSGSIZE && (inp->inp_flags & IN6P_MTU) != 0 && (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)) || IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &sa6_dst->sin6_addr))) { ip6_notify_pmtu(inp, (const struct sockaddr_in6 *)dst, (u_int32_t *)cmdarg); } /* * Detect if we should notify the error. If no source and * destination ports are specified, but non-zero flowinfo and * local address match, notify the error. This is the case * when the error is delivered with an encrypted buffer * by ESP. Otherwise, just compare addresses and ports * as usual. 
*/ if (lport == 0 && fport == 0 && flowinfo && inp->inp_socket != NULL && flowinfo == (in6p_flowinfo(inp) & IPV6_FLOWLABEL_MASK) && IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &sa6_src.sin6_addr)) goto do_notify; else if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &sa6_dst->sin6_addr) || inp->inp_socket == NULL || (lport && inp->inp_lport != lport) || (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) && !IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &sa6_src.sin6_addr)) || (fport && inp->inp_fport != fport)) continue; do_notify: if (notify) (*notify)(inp, errno); nmatch++; } return nmatch; } void in6pcb_purgeif0(struct inpcbtable *table, struct ifnet *ifp) { struct inpcb *inp; struct ip6_moptions *im6o; struct in6_multi_mship *imm, *nimm; KASSERT(ifp != NULL); TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { bool need_unlock = false; if (inp->inp_af != AF_INET6) continue; /* The caller holds either one of inps' lock */ if (!inp_locked(inp)) { inp_lock(inp); need_unlock = true; } im6o = in6p_moptions(inp); if (im6o) { /* * Unselect the outgoing interface if it is being * detached. */ if (im6o->im6o_multicast_if_index == ifp->if_index) im6o->im6o_multicast_if_index = 0; /* * Drop multicast group membership if we joined * through the interface being detached. * XXX controversial - is it really legal for kernel * to force this? */ LIST_FOREACH_SAFE(imm, &im6o->im6o_memberships, i6mm_chain, nimm) { if (imm->i6mm_maddr->in6m_ifp == ifp) { LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } } } in_purgeifmcast(inp->inp_moptions, ifp); if (need_unlock) inp_unlock(inp); } } void in6pcb_purgeif(struct inpcbtable *table, struct ifnet *ifp) { struct rtentry *rt; struct inpcb *inp; TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { if (inp->inp_af != AF_INET6) continue; if ((rt = rtcache_validate(&inp->inp_route)) != NULL && rt->rt_ifp == ifp) { rtcache_unref(rt, &inp->inp_route); in6pcb_rtchange(inp, 0); } else rtcache_unref(rt, &inp->inp_route); } } /* * After a routing change, flush old routing. A new route can be * allocated the next time output is attempted. */ void in6pcb_rtchange(struct inpcb *inp, int errno) { if (inp->inp_af != AF_INET6) return; rtcache_free(&inp->inp_route); /* * A new route can be allocated the next time * output is attempted. 
*/ } struct inpcb * in6pcb_lookup_local(struct inpcbtable *table, struct in6_addr *laddr6, u_int lport_arg, int lookup_wildcard, struct vestigial_inpcb *vp) { struct inpcbhead *head; struct inpcb *inp, *match = NULL; int matchwild = 3, wildcard; in_port_t lport = lport_arg; if (vp) vp->valid = 0; head = IN6PCBHASH_PORT(table, lport); LIST_FOREACH(inp, head, inp_lhash) { if (inp->inp_af != AF_INET6) continue; if (inp->inp_lport != lport) continue; wildcard = 0; if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; } if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) wildcard++; if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; if (!IN6_IS_ADDR_V4MAPPED(laddr6)) continue; /* duplicate of IPv4 logic */ wildcard = 0; if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)) && in6p_faddr(inp).s6_addr32[3]) wildcard++; if (!in6p_laddr(inp).s6_addr32[3]) { if (laddr6->s6_addr32[3]) wildcard++; } else { if (!laddr6->s6_addr32[3]) wildcard++; else { if (in6p_laddr(inp).s6_addr32[3] != laddr6->s6_addr32[3]) continue; } } } else if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) { if (IN6_IS_ADDR_V4MAPPED(laddr6)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; } if (!IN6_IS_ADDR_UNSPECIFIED(laddr6)) wildcard++; } else { if (IN6_IS_ADDR_V4MAPPED(laddr6)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; } if (IN6_IS_ADDR_UNSPECIFIED(laddr6)) wildcard++; else { if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6)) continue; } } if (wildcard && !lookup_wildcard) continue; if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } if (match && matchwild == 0) return match; if (vp && table->vestige && table->vestige->init_ports6) { struct vestigial_inpcb better; bool has_better = false; void *state; state = (*table->vestige->init_ports6)(laddr6, lport_arg, lookup_wildcard); while (table->vestige && (*table->vestige->next_port6)(state, vp)) { if (vp->lport != lport) continue; wildcard = 0; if (!IN6_IS_ADDR_UNSPECIFIED(&vp->faddr.v6)) wildcard++; if (IN6_IS_ADDR_UNSPECIFIED(&vp->laddr.v6)) { if (!IN6_IS_ADDR_UNSPECIFIED(laddr6)) wildcard++; } else { if (IN6_IS_ADDR_V4MAPPED(laddr6)) { if (vp->v6only) continue; } if (IN6_IS_ADDR_UNSPECIFIED(laddr6)) wildcard++; else { if (!IN6_ARE_ADDR_EQUAL(&vp->laddr.v6, laddr6)) continue; } } if (wildcard && !lookup_wildcard) continue; if (wildcard < matchwild) { better = *vp; has_better = true; matchwild = wildcard; if (matchwild == 0) break; } } if (has_better) { *vp = better; return 0; } } return match; } /* * WARNING: return value (rtentry) could be IPv4 one if inpcb is connected to * IPv4 mapped address. 
*/ struct rtentry * in6pcb_rtentry(struct inpcb *inp) { struct rtentry *rt; struct route *ro; union { const struct sockaddr *sa; const struct sockaddr_in6 *sa6; #ifdef INET const struct sockaddr_in *sa4; #endif } cdst; ro = &inp->inp_route; if (inp->inp_af != AF_INET6) return NULL; cdst.sa = rtcache_getdst(ro); if (cdst.sa == NULL) ; #ifdef INET else if (cdst.sa->sa_family == AF_INET) { KASSERT(IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))); if (cdst.sa4->sin_addr.s_addr != in6p_faddr(inp).s6_addr32[3]) rtcache_free(ro); } #endif else { if (!IN6_ARE_ADDR_EQUAL(&cdst.sa6->sin6_addr, &in6p_faddr(inp))) rtcache_free(ro); } if ((rt = rtcache_validate(ro)) == NULL) rt = rtcache_update(ro, 1); #ifdef INET if (rt == NULL && IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) { union { struct sockaddr dst; struct sockaddr_in dst4; } u; struct in_addr addr; addr.s_addr = in6p_faddr(inp).s6_addr32[3]; sockaddr_in_init(&u.dst4, &addr, 0); if (rtcache_setdst(ro, &u.dst) != 0) return NULL; rt = rtcache_init(ro); } else #endif if (rt == NULL && !IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) { union { struct sockaddr dst; struct sockaddr_in6 dst6; } u; sockaddr_in6_init(&u.dst6, &in6p_faddr(inp), 0, 0, 0); if (rtcache_setdst(ro, &u.dst) != 0) return NULL; rt = rtcache_init(ro); } return rt; } void in6pcb_rtentry_unref(struct rtentry *rt, struct inpcb *inp) { rtcache_unref(rt, &inp->inp_route); } struct inpcb * in6pcb_lookup(struct inpcbtable *table, const struct in6_addr *faddr6, u_int fport_arg, const struct in6_addr *laddr6, u_int lport_arg, int faith, struct vestigial_inpcb *vp) { struct inpcbhead *head; struct inpcb *inp; in_port_t fport = fport_arg, lport = lport_arg; if (vp) vp->valid = 0; head = IN6PCBHASH_CONNECT(table, faddr6, fport, laddr6, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET6) continue; /* find exact match on both source and dest */ if (inp->inp_fport != fport) continue; if (inp->inp_lport != lport) continue; if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) continue; if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), faddr6)) continue; if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) continue; if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6)) continue; if ((IN6_IS_ADDR_V4MAPPED(laddr6) || IN6_IS_ADDR_V4MAPPED(faddr6)) && (inp->inp_flags & IN6P_IPV6_V6ONLY)) continue; return inp; } if (vp && table->vestige) { if ((*table->vestige->lookup6)(faddr6, fport_arg, laddr6, lport_arg, vp)) return NULL; } return NULL; } struct inpcb * in6pcb_lookup_bound(struct inpcbtable *table, const struct in6_addr *laddr6, u_int lport_arg, int faith) { struct inpcbhead *head; struct inpcb *inp; in_port_t lport = lport_arg; #ifdef INET struct in6_addr zero_mapped; #endif head = IN6PCBHASH_BIND(table, laddr6, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET6) continue; if (faith && (inp->inp_flags & IN6P_FAITH) == 0) continue; if (inp->inp_fport != 0) continue; if (inp->inp_lport != lport) continue; if (IN6_IS_ADDR_V4MAPPED(laddr6) && (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6)) goto out; } #ifdef INET if (IN6_IS_ADDR_V4MAPPED(laddr6)) { memset(&zero_mapped, 0, sizeof(zero_mapped)); zero_mapped.s6_addr16[5] = 0xffff; head = IN6PCBHASH_BIND(table, &zero_mapped, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET6) continue; if (faith && (inp->inp_flags & IN6P_FAITH) == 0) continue; if (inp->inp_fport != 0) continue; if (inp->inp_lport != lport) continue; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; if 
(IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &zero_mapped)) goto out; } } #endif head = IN6PCBHASH_BIND(table, &zeroin6_addr, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET6) continue; if (faith && (inp->inp_flags & IN6P_FAITH) == 0) continue; if (inp->inp_fport != 0) continue; if (inp->inp_lport != lport) continue; if (IN6_IS_ADDR_V4MAPPED(laddr6) && (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &zeroin6_addr)) goto out; } return NULL; out: if (inp != LIST_FIRST(head)) { LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); } return inp; } void in6pcb_set_state(struct inpcb *inp, int state) { if (inp->inp_af != AF_INET6) return; if (inp->inp_state > INP_ATTACHED) LIST_REMOVE(inp, inp_hash); switch (state) { case INP_BOUND: LIST_INSERT_HEAD(IN6PCBHASH_BIND(inp->inp_table, &in6p_laddr(inp), inp->inp_lport), inp, inp_hash); break; case INP_CONNECTED: LIST_INSERT_HEAD(IN6PCBHASH_CONNECT(inp->inp_table, &in6p_faddr(inp), inp->inp_fport, &in6p_laddr(inp), inp->inp_lport), inp, inp_hash); break; } inp->inp_state = state; }
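/*
 * Editor's note: the fragment below is a standalone illustration, not part
 * of in6_pcb.c.  It mirrors the arithmetic of the IN6PCBHASH_BIND() macro
 * above: the four 32-bit words of the bound IPv6 address are XOR-folded,
 * the local port (converted to host order) is added, and the sum is masked
 * down to a bucket index.  It assumes the mask is nbuckets - 1 for a
 * power-of-two table, as produced by hashinit()-style sizing;
 * toy_bind_hash() and the sample address are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Fold an IPv6 address and a local port (network order) into a bucket index. */
static uint32_t
toy_bind_hash(const struct in6_addr *laddr, uint16_t lport, uint32_t mask)
{
	uint32_t w[4];

	memcpy(w, laddr, sizeof(w));	/* the address as four 32-bit words */
	return ((w[0] ^ w[1] ^ w[2] ^ w[3]) + ntohs(lport)) & mask;
}

int
main(void)
{
	struct in6_addr laddr;

	if (inet_pton(AF_INET6, "2001:db8::1234", &laddr) != 1)
		return 1;
	/* 128-bucket table, so the mask is 127 */
	printf("bucket %u\n", toy_bind_hash(&laddr, htons(8080), 127));
	return 0;
}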
/* $NetBSD: vfs_getcwd.c,v 1.61 2021/06/29 22:39:21 dholland Exp $ */ /*- * Copyright (c) 1999, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Bill Sommerfeld. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_getcwd.c,v 1.61 2021/06/29 22:39:21 dholland Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/uio.h> #include <sys/kmem.h> #include <sys/dirent.h> #include <sys/kauth.h> #include <ufs/ufs/dir.h> /* XXX only for DIRBLKSIZ */ #include <sys/syscallargs.h> /* * Vnode variable naming conventions in this file: * * rvp: the current root we're aiming towards. * lvp, *lvpp: the "lower" vnode * uvp, *uvpp: the "upper" vnode. * * Since all the vnodes we're dealing with are directories, and the * lookups are going *up* in the filesystem rather than *down*, the * usual "pvp" (parent) or "dvp" (directory) naming conventions are * too confusing. */ /* * XXX Will infinite loop in certain cases if a directory read reliably * returns EINVAL on last block. * XXX is EINVAL the right thing to return if a directory is malformed? */ /* * XXX Untested vs. mount -o union; probably does the wrong thing. */ /* * Find parent vnode of *lvpp, return in *uvpp * * If we care about the name, scan it looking for name of directory * entry pointing at lvp. * * Place the name in the buffer which starts at bufp, immediately * before *bpp, and move bpp backwards to point at the start of it. * * On entry, *lvpp is a locked vnode reference; on exit, it is vput and NULL'ed * On exit, *uvpp is either NULL or is a locked vnode reference. */ static int getcwd_scandir(struct vnode *lvp, struct vnode **uvpp, char **bpp, char *bufp, struct lwp *l) { int error = 0; int eofflag; off_t off; int tries; struct uio uio; struct iovec iov; char *dirbuf = NULL; int dirbuflen; ino_t fileno; struct vattr va; struct vnode *uvp = NULL; kauth_cred_t cred = l->l_cred; struct componentname cn; int len, reclen; tries = 0; /* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */ KASSERT(VOP_ISLOCKED(lvp) == LK_EXCLUSIVE); /* * If we want the filename, get some info we need while the * current directory is still locked. */ if (bufp != NULL) { error = VOP_GETATTR(lvp, &va, cred); if (error) { VOP_UNLOCK(lvp); *uvpp = NULL; return error; } } /* * Ok, we have to do it the hard way.. * Next, get parent vnode using lookup of .. */ cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN | ISDOTDOT | RDONLY; cn.cn_cred = cred; cn.cn_nameptr = ".."; cn.cn_namelen = 2; /* At this point, lvp is locked */ error = VOP_LOOKUP(lvp, uvpp, &cn); VOP_UNLOCK(lvp); if (error) { *uvpp = NULL; return error; } uvp = *uvpp; /* If we don't care about the pathname, we're done */ if (bufp == NULL) { return 0; } fileno = va.va_fileid; /* I guess UFS_DIRBLKSIZ is a good guess at a good size to use? 
*/ dirbuflen = UFS_DIRBLKSIZ; if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = kmem_alloc(dirbuflen, KM_SLEEP); /* Now lvp is unlocked, try to lock uvp */ error = vn_lock(uvp, LK_SHARED); if (error) { vrele(uvp); *uvpp = NULL; return error; } #if 0 unionread: #endif off = 0; do { /* call VOP_READDIR of parent */ iov.iov_base = dirbuf; iov.iov_len = dirbuflen; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = off; uio.uio_resid = dirbuflen; uio.uio_rw = UIO_READ; UIO_SETUP_SYSSPACE(&uio); eofflag = 0; error = VOP_READDIR(uvp, &uio, cred, &eofflag, 0, 0); off = uio.uio_offset; /* * Try again if NFS tosses its cookies. * XXX this can still loop forever if the directory is busted * such that the second or subsequent page of it always * returns EINVAL */ if ((error == EINVAL) && (tries < 3)) { off = 0; tries++; continue; /* once more, with feeling */ } if (!error) { char *cpos; struct dirent *dp; cpos = dirbuf; tries = 0; /* scan directory page looking for matching vnode */ for (len = (dirbuflen - uio.uio_resid); len > 0; len -= reclen) { dp = (struct dirent *) cpos; reclen = dp->d_reclen; /* check for malformed directory.. */ if (reclen < _DIRENT_MINSIZE(dp) || reclen > len) { error = EINVAL; goto out; } /* * XXX should perhaps do VOP_LOOKUP to * check that we got back to the right place, * but getting the locking games for that * right would be heinous. */ if ((dp->d_type != DT_WHT) && (dp->d_fileno == fileno)) { char *bp = *bpp; bp -= dp->d_namlen; if (bp <= bufp) { error = ERANGE; goto out; } memcpy(bp, dp->d_name, dp->d_namlen); error = 0; *bpp = bp; goto out; } cpos += reclen; } } else goto out; } while (!eofflag); #if 0 /* * Deal with mount -o union, which unions only the * root directory of the mount. */ if ((uvp->v_vflag & VV_ROOT) && (uvp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = uvp; uvp = uvp->v_mount->mnt_vnodecovered; vput(tvp); vref(uvp); *uvpp = uvp; vn_lock(uvp, LK_SHARED | LK_RETRY); goto unionread; } #endif error = ENOENT; out: VOP_UNLOCK(uvp); kmem_free(dirbuf, dirbuflen); return error; } /* * common routine shared by sys___getcwd() and vn_isunder() */ int getcwd_common(struct vnode *lvp, struct vnode *rvp, char **bpp, char *bufp, int limit, int flags, struct lwp *l) { struct cwdinfo *cwdi = l->l_proc->p_cwdi; kauth_cred_t cred = l->l_cred; struct vnode *uvp = NULL; char *bp = NULL; int error; accmode_t accmode = VEXEC; error = 0; if (rvp == NULL) { rvp = cwdi->cwdi_rdir; if (rvp == NULL) rvp = rootvnode; } vref(rvp); vref(lvp); /* * Error handling invariant: * Before a `goto out': * lvp is either NULL, or held. * uvp is either NULL, or held. */ if (bufp) bp = *bpp; /* * this loop will terminate when one of the following happens: * - we hit the root * - getdirentries or lookup fails * - we run out of space in the buffer. */ if (lvp == rvp) { if (bp) *(--bp) = '/'; goto out; } do { /* * access check here is optional, depending on * whether or not caller cares. */ int chkaccess = (flags & GETCWD_CHECK_ACCESS); bool locked = false; /* * step up if we're a covered vnode.. * check access on the first vnode only. 
*/ if (lvp->v_vflag & VV_ROOT) { vn_lock(lvp, LK_SHARED | LK_RETRY); if (chkaccess) { error = VOP_ACCESS(lvp, accmode, cred); if (error) { VOP_UNLOCK(lvp); goto out; } chkaccess = 0; } while (lvp->v_vflag & VV_ROOT) { struct vnode *tvp; if (lvp == rvp) { VOP_UNLOCK(lvp); goto out; } tvp = lvp->v_mount->mnt_vnodecovered; /* * hodie natus est radici frater */ if (tvp == NULL) { VOP_UNLOCK(lvp); error = ENOENT; goto out; } vref(tvp); vput(lvp); lvp = tvp; if (lvp->v_vflag & VV_ROOT) vn_lock(lvp, LK_SHARED | LK_RETRY); } } /* Do we need to check access to the directory? */ if (chkaccess && !cache_have_id(lvp)) { /* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */ vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(lvp, accmode, cred); if (error) { VOP_UNLOCK(lvp); goto out; } chkaccess = 0; locked = true; } /* * Look in the name cache; if that fails, look in the * directory.. */ error = cache_revlookup(lvp, &uvp, &bp, bufp, chkaccess, accmode); if (error == -1) { if (!locked) { locked = true; vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); } if (lvp->v_type != VDIR) { VOP_UNLOCK(lvp); error = ENOTDIR; goto out; } error = getcwd_scandir(lvp, &uvp, &bp, bufp, l); /* lvp now unlocked */ } else if (locked) { VOP_UNLOCK(lvp); } if (error) goto out; #if DIAGNOSTIC if (bufp && (bp <= bufp)) { panic("getcwd: oops, went back too far"); } #endif accmode = VEXEC | VREAD; if (bp) *(--bp) = '/'; vrele(lvp); lvp = uvp; uvp = NULL; limit--; } while ((lvp != rvp) && (limit > 0)); out: if (bpp) *bpp = bp; if (uvp) vrele(uvp); if (lvp) vrele(lvp); vrele(rvp); return error; } /* * Check if one directory can be found inside another in the directory * hierarchy. * * Intended to be used in chroot, chdir, fchdir, etc., to ensure that * chroot() actually means something. */ int vn_isunder(struct vnode *lvp, struct vnode *rvp, struct lwp *l) { int error; error = getcwd_common(lvp, rvp, NULL, NULL, MAXPATHLEN / 2, 0, l); if (!error) return 1; else return 0; } /* * Returns true if proc p1's root directory equal to or under p2's * root directory. * * Intended to be used from ptrace/procfs sorts of things. */ int proc_isunder(struct proc *p1, struct lwp *l2) { struct vnode *r1 = p1->p_cwdi->cwdi_rdir; struct vnode *r2 = l2->l_proc->p_cwdi->cwdi_rdir; if (r1 == NULL) return (r2 == NULL); else if (r2 == NULL) return 1; else return vn_isunder(r1, r2, l2); } /* * Find pathname of process's current directory. * * Use vfs vnode-to-name reverse cache; if that fails, fall back * to reading directory contents. */ int sys___getcwd(struct lwp *l, const struct sys___getcwd_args *uap, register_t *retval) { /* { syscallarg(char *) bufp; syscallarg(size_t) length; } */ int error; char *path; char *bp, *bend; int len = SCARG(uap, length); int lenused; struct cwdinfo *cwdi; if (len > MAXPATHLEN * 4) len = MAXPATHLEN * 4; else if (len < 2) return ERANGE; path = kmem_alloc(len, KM_SLEEP); bp = &path[len]; bend = bp; *(--bp) = '\0'; /* * 5th argument here is "max number of vnodes to traverse". * Since each entry takes up at least 2 bytes in the output buffer, * limit it to N/2 vnodes for an N byte buffer. */ cwdi = l->l_proc->p_cwdi; rw_enter(&cwdi->cwdi_lock, RW_READER); error = getcwd_common(cwdi->cwdi_cdir, NULL, &bp, path, len/2, GETCWD_CHECK_ACCESS, l); rw_exit(&cwdi->cwdi_lock); if (error) goto out; lenused = bend - bp; *retval = lenused; /* put the result into user buffer */ error = copyout(bp, SCARG(uap, bufp), lenused); out: kmem_free(path, len); return error; } /* * Try to find a pathname for a vnode. 
Since there is no mapping vnode -> * parent directory, this needs the namecache to succeed. Caller holds a * reference to the vnode. */ int vnode_to_path(char *path, size_t len, struct vnode *vp, struct lwp *curl, struct proc *p) { struct proc *curp = curl->l_proc; int error, lenused, elen; char *bp, *bend; struct vnode *dvp; KASSERT(vrefcnt(vp) > 0); bp = bend = &path[len]; *(--bp) = '\0'; error = cache_revlookup(vp, &dvp, &bp, path, false, 0); if (error != 0) return (error == -1 ? ENOENT : error); *(--bp) = '/'; error = getcwd_common(dvp, NULL, &bp, path, len / 2, GETCWD_CHECK_ACCESS, curl); vrele(dvp); if (error != 0) return error; /* * Strip off emulation path for emulated processes looking at * the maps file of a process of the same emulation. (Won't * work if /emul/xxx is a symlink..) */ if (curp->p_emul == p->p_emul && curp->p_emul->e_path != NULL) { elen = strlen(curp->p_emul->e_path); if (!strncmp(bp, curp->p_emul->e_path, elen)) bp = &bp[elen]; } lenused = bend - bp; memcpy(path, bp, lenused); path[lenused] = '\0'; return 0; }
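/*
 * Editor's note: the standalone sketch below is not part of vfs_getcwd.c.
 * It shows only the buffer convention the functions above rely on: path
 * components are copied in front of *bpp, working backwards from the end
 * of the buffer, and the finished string starts at bp with length
 * bend - bp (including the terminating NUL).  prepend() and the sample
 * component list are hypothetical.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>

/* Copy "name", preceded by '/', immediately before *bpp. */
static int
prepend(char **bpp, char *bufp, const char *name)
{
	size_t namelen = strlen(name);
	char *bp = *bpp;

	bp -= namelen;
	if (bp <= bufp)
		return ERANGE;		/* out of room, as in getcwd_scandir() */
	memcpy(bp, name, namelen);
	*(--bp) = '/';
	*bpp = bp;
	return 0;
}

int
main(void)
{
	char buf[64];
	char *bend = &buf[sizeof(buf)];
	char *bp = bend;
	/* components as the reverse lookup would discover them: leaf first */
	const char *leaf_to_root[] = { "obj", "src", "usr" };
	size_t i;

	*(--bp) = '\0';
	for (i = 0; i < 3; i++) {
		if (prepend(&bp, buf, leaf_to_root[i]) != 0)
			return 1;
	}
	/* prints "/usr/src/obj", 13 bytes counting the NUL */
	printf("%s (%zu bytes)\n", bp, (size_t)(bend - bp));
	return 0;
}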
/* $NetBSD: cpu.c,v 1.210 2024/04/22 23:07:47 andvar Exp $ */ /* * Copyright (c) 2000-2020 NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Bill Sommerfeld of RedBack Networks Inc, and by Andrew Doran.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1999 Stefan Grefen * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR AND CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.210 2024/04/22 23:07:47 andvar Exp $"); #include "opt_ddb.h" #include "opt_mpbios.h" /* for MPDEBUG */ #include "opt_mtrr.h" #include "opt_multiprocessor.h" #include "opt_svs.h" #include "lapic.h" #include "ioapic.h" #include "acpica.h" #include "hpet.h" #include <sys/param.h> #include <sys/proc.h> #include <sys/systm.h> #include <sys/device.h> #include <sys/cpu.h> #include <sys/cpufreq.h> #include <sys/idle.h> #include <sys/atomic.h> #include <sys/reboot.h> #include <sys/csan.h> #include <uvm/uvm.h> #include "acpica.h" /* for NACPICA, for mp_verbose */ #include <x86/machdep.h> #include <machine/cpufunc.h> #include <machine/cpuvar.h> #include <machine/pmap.h> #include <machine/vmparam.h> #if defined(MULTIPROCESSOR) #include <machine/mpbiosvar.h> #endif #include <machine/mpconfig.h> /* for mp_verbose */ #include <machine/pcb.h> #include <machine/specialreg.h> #include <machine/segments.h> #include <machine/gdt.h> #include <machine/mtrr.h> #include <machine/pio.h> #include <machine/cpu_counter.h> #include <machine/pmap_private.h> #include <x86/fpu.h> #if NACPICA > 0 #include <dev/acpi/acpi_srat.h> #endif #if NLAPIC > 0 #include <machine/apicvar.h> #include <machine/i82489reg.h> #include <machine/i82489var.h> #endif #include <dev/ic/mc146818reg.h> #include <dev/ic/hpetvar.h> #include <i386/isa/nvram.h> #include <dev/isa/isareg.h> #include "tsc.h" #ifndef XENPV #include "hyperv.h" #if NHYPERV > 0 #include <x86/x86/hypervvar.h> #endif #endif #ifdef XEN #include <xen/hypervisor.h> #endif static int cpu_match(device_t, cfdata_t, void *); static void cpu_attach(device_t, device_t, void *); static void cpu_defer(device_t); static int cpu_rescan(device_t, const char *, const int *); static void cpu_childdetached(device_t, device_t); static bool cpu_stop(device_t); static bool cpu_suspend(device_t, const pmf_qual_t *); static bool cpu_resume(device_t, const pmf_qual_t *); static bool cpu_shutdown(device_t, int); struct cpu_softc { device_t sc_dev; /* device tree glue */ struct cpu_info *sc_info; /* pointer to CPU info */ bool sc_wasonline; }; #ifdef MULTIPROCESSOR int mp_cpu_start(struct cpu_info *, paddr_t); void mp_cpu_start_cleanup(struct cpu_info *); const struct cpu_functions mp_cpu_funcs = { mp_cpu_start, NULL, mp_cpu_start_cleanup }; #endif CFATTACH_DECL2_NEW(cpu, sizeof(struct cpu_softc), cpu_match, cpu_attach, NULL, NULL, cpu_rescan, cpu_childdetached); /* * Statically-allocated CPU info for the primary CPU (or the only * CPU, on uniprocessors). The CPU info list is initialized to * point at it. 
*/ struct cpu_info cpu_info_primary __aligned(CACHE_LINE_SIZE) = { .ci_dev = 0, .ci_self = &cpu_info_primary, .ci_idepth = -1, .ci_curlwp = &lwp0, .ci_curldt = -1, .ci_kfpu_spl = -1, }; struct cpu_info *cpu_info_list = &cpu_info_primary; #ifdef i386 void cpu_set_tss_gates(struct cpu_info *); #endif static void cpu_init_idle_lwp(struct cpu_info *); uint32_t cpu_feature[7] __read_mostly; /* X86 CPUID feature bits */ /* [0] basic features cpuid.1:%edx * [1] basic features cpuid.1:%ecx (CPUID2_xxx bits) * [2] extended features cpuid:80000001:%edx * [3] extended features cpuid:80000001:%ecx * [4] VIA padlock features * [5] structured extended features cpuid.7:%ebx * [6] structured extended features cpuid.7:%ecx */ #ifdef MULTIPROCESSOR bool x86_mp_online; paddr_t mp_trampoline_paddr = MP_TRAMPOLINE; #endif #if NLAPIC > 0 static vaddr_t cmos_data_mapping; #endif struct cpu_info *cpu_starting; #ifdef MULTIPROCESSOR void cpu_hatch(void *); static void cpu_boot_secondary(struct cpu_info *ci); static void cpu_start_secondary(struct cpu_info *ci); #if NLAPIC > 0 static void cpu_copy_trampoline(paddr_t); #endif #endif /* MULTIPROCESSOR */ /* * Runs once per boot once multiprocessor goo has been detected and * the local APIC on the boot processor has been mapped. * * Called from lapic_boot_init() (from mpbios_scan()). */ #if NLAPIC > 0 void cpu_init_first(void) { cpu_info_primary.ci_cpuid = lapic_cpu_number(); cmos_data_mapping = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY); if (cmos_data_mapping == 0) panic("No KVA for page 0"); pmap_kenter_pa(cmos_data_mapping, 0, VM_PROT_READ|VM_PROT_WRITE, 0); pmap_update(pmap_kernel()); } #endif static int cpu_match(device_t parent, cfdata_t match, void *aux) { return 1; } #ifdef __HAVE_PCPU_AREA void cpu_pcpuarea_init(struct cpu_info *ci) { struct vm_page *pg; size_t i, npages; vaddr_t base, va; paddr_t pa; CTASSERT(sizeof(struct pcpu_entry) % PAGE_SIZE == 0); npages = sizeof(struct pcpu_entry) / PAGE_SIZE; base = (vaddr_t)&pcpuarea->ent[cpu_index(ci)]; for (i = 0; i < npages; i++) { pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); if (pg == NULL) { panic("failed to allocate pcpu PA"); } va = base + i * PAGE_SIZE; pa = VM_PAGE_TO_PHYS(pg); pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0); } pmap_update(pmap_kernel()); } #endif static void cpu_vm_init(struct cpu_info *ci) { unsigned int ncolors = 2; /* * XXX: for AP's the cache info has not been initialized yet * but that does not matter because uvm only pays attention at * the maximum only. We should fix it once cpus have different * cache sizes. */ for (unsigned int i = CAI_ICACHE; i <= CAI_L2CACHE; i++) { struct x86_cache_info *cai; unsigned int tcolors; cai = &ci->ci_cinfo[i]; tcolors = atop(cai->cai_totalsize); switch (cai->cai_associativity) { case 0xff: tcolors = 1; /* fully associative */ break; case 0: case 1: break; default: tcolors /= cai->cai_associativity; } if (tcolors <= ncolors) continue; ncolors = tcolors; } /* * If the desired number of colors is not a power of * two, it won't be good. Find the greatest power of * two which is an even divisor of the number of colors, * to preserve even coloring of pages. */ if (ncolors & (ncolors - 1) ) { unsigned int try, picked = 1; for (try = 1; try < ncolors; try *= 2) { if (ncolors % try == 0) picked = try; } if (picked == 1) { panic("desired number of cache colors %u is " " > 1, but not even!", ncolors); } ncolors = picked; } /* * Knowing the size of the largest cache on this CPU, potentially * re-color our pages. 
*/ aprint_debug_dev(ci->ci_dev, "%d page colors\n", ncolors); uvm_page_recolor(ncolors); pmap_tlb_cpu_init(ci); #ifndef __HAVE_DIRECT_MAP pmap_vpage_cpu_init(ci); #endif } static void cpu_attach(device_t parent, device_t self, void *aux) { struct cpu_softc *sc = device_private(self); struct cpu_attach_args *caa = aux; struct cpu_info *ci; uintptr_t ptr; #if NLAPIC > 0 int cpunum = caa->cpu_number; #endif static bool again; sc->sc_dev = self; if (ncpu > maxcpus) { #ifndef _LP64 aprint_error(": too many CPUs, please use NetBSD/amd64\n"); #else aprint_error(": too many CPUs\n"); #endif return; } /* * If we're an Application Processor, allocate a cpu_info * structure, otherwise use the primary's. */ if (caa->cpu_role == CPU_ROLE_AP) { if ((boothowto & RB_MD1) != 0) { aprint_error(": multiprocessor boot disabled\n"); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); return; } aprint_naive(": Application Processor\n"); ptr = (uintptr_t)uvm_km_alloc(kernel_map, sizeof(*ci) + CACHE_LINE_SIZE - 1, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); ci = (struct cpu_info *)roundup2(ptr, CACHE_LINE_SIZE); ci->ci_curldt = -1; } else { aprint_naive(": %s Processor\n", caa->cpu_role == CPU_ROLE_SP ? "Single" : "Boot"); ci = &cpu_info_primary; #if NLAPIC > 0 if (cpunum != lapic_cpu_number()) { /* XXX should be done earlier. */ uint32_t reg; aprint_verbose("\n"); aprint_verbose_dev(self, "running CPU at apic %d" " instead of at expected %d", lapic_cpu_number(), cpunum); reg = lapic_readreg(LAPIC_ID); lapic_writereg(LAPIC_ID, (reg & ~LAPIC_ID_MASK) | (cpunum << LAPIC_ID_SHIFT)); } if (cpunum != lapic_cpu_number()) { aprint_error_dev(self, "unable to reset apic id\n"); } #endif } ci->ci_self = ci; sc->sc_info = ci; ci->ci_dev = self; ci->ci_acpiid = caa->cpu_id; ci->ci_cpuid = caa->cpu_number; ci->ci_func = caa->cpu_func; ci->ci_kfpu_spl = -1; aprint_normal("\n"); /* Must be before mi_cpu_attach(). */ cpu_vm_init(ci); if (caa->cpu_role == CPU_ROLE_AP) { int error; error = mi_cpu_attach(ci); if (error != 0) { aprint_error_dev(self, "mi_cpu_attach failed with %d\n", error); return; } #ifdef __HAVE_PCPU_AREA cpu_pcpuarea_init(ci); #endif cpu_init_tss(ci); } else { KASSERT(ci->ci_data.cpu_idlelwp != NULL); #if NACPICA > 0 /* Parse out NUMA info for cpu_identify(). */ acpisrat_init(); #endif } #ifdef SVS cpu_svs_init(ci); #endif pmap_reference(pmap_kernel()); ci->ci_pmap = pmap_kernel(); ci->ci_tlbstate = TLBSTATE_STALE; /* * Boot processor may not be attached first, but the below * must be done to allow booting other processors. */ if (!again) { /* Make sure DELAY() (likely i8254_delay()) is initialized. */ DELAY(1); /* * Basic init. Compute an approximate frequency for the TSC * using the i8254. If there's a HPET we'll redo it later. */ atomic_or_32(&ci->ci_flags, CPUF_PRESENT | CPUF_PRIMARY); cpu_intr_init(ci); tsc_setfunc(ci); cpu_get_tsc_freq(ci); cpu_init(ci); #ifdef i386 cpu_set_tss_gates(ci); #endif pmap_cpu_init_late(ci); #if NLAPIC > 0 if (caa->cpu_role != CPU_ROLE_SP) { /* Enable lapic. */ lapic_enable(); lapic_set_lvt(); if (!vm_guest_is_xenpvh_or_pvhvm()) lapic_calibrate_timer(false); } #endif kcsan_cpu_init(ci); again = true; } /* further PCB init done later. 
*/ switch (caa->cpu_role) { case CPU_ROLE_SP: atomic_or_32(&ci->ci_flags, CPUF_SP); cpu_identify(ci); x86_errata(); x86_cpu_idle_init(); #ifdef XENPVHVM xen_hvm_init_cpu(ci); #endif break; case CPU_ROLE_BP: atomic_or_32(&ci->ci_flags, CPUF_BSP); cpu_identify(ci); x86_errata(); x86_cpu_idle_init(); #ifdef XENPVHVM xen_hvm_init_cpu(ci); #endif break; #ifdef MULTIPROCESSOR case CPU_ROLE_AP: /* * report on an AP */ cpu_intr_init(ci); idt_vec_init_cpu_md(&ci->ci_idtvec, cpu_index(ci)); gdt_alloc_cpu(ci); #ifdef i386 cpu_set_tss_gates(ci); #endif pmap_cpu_init_late(ci); cpu_start_secondary(ci); if (ci->ci_flags & CPUF_PRESENT) { struct cpu_info *tmp; cpu_identify(ci); tmp = cpu_info_list; while (tmp->ci_next) tmp = tmp->ci_next; tmp->ci_next = ci; } break; #endif default: panic("unknown processor type??\n"); } pat_init(ci); if (!pmf_device_register1(self, cpu_suspend, cpu_resume, cpu_shutdown)) aprint_error_dev(self, "couldn't establish power handler\n"); #ifdef MULTIPROCESSOR if (mp_verbose) { struct lwp *l = ci->ci_data.cpu_idlelwp; struct pcb *pcb = lwp_getpcb(l); aprint_verbose_dev(self, "idle lwp at %p, idle sp at %p\n", l, #ifdef i386 (void *)pcb->pcb_esp #else (void *)pcb->pcb_rsp #endif ); } #endif /* * Postpone the "cpufeaturebus" scan. * It is safe to scan the pseudo-bus * only after all CPUs have attached. */ (void)config_defer(self, cpu_defer); } static void cpu_defer(device_t self) { cpu_rescan(self, NULL, NULL); } static int cpu_rescan(device_t self, const char *ifattr, const int *locators) { struct cpu_softc *sc = device_private(self); struct cpufeature_attach_args cfaa; struct cpu_info *ci = sc->sc_info; /* * If we booted with RB_MD1 to disable multiprocessor, the * auto-configuration data still contains the additional * CPUs. But their initialization was mostly bypassed * during attach, so we have to make sure we don't look at * their featurebus info, since it wasn't retrieved. */ if (ci == NULL) return 0; memset(&cfaa, 0, sizeof(cfaa)); cfaa.ci = ci; if (ifattr_match(ifattr, "cpufeaturebus")) { if (ci->ci_frequency == NULL) { cfaa.name = "frequency"; ci->ci_frequency = config_found(self, &cfaa, NULL, CFARGS(.iattr = "cpufeaturebus")); } if (ci->ci_padlock == NULL) { cfaa.name = "padlock"; ci->ci_padlock = config_found(self, &cfaa, NULL, CFARGS(.iattr = "cpufeaturebus")); } if (ci->ci_temperature == NULL) { cfaa.name = "temperature"; ci->ci_temperature = config_found(self, &cfaa, NULL, CFARGS(.iattr = "cpufeaturebus")); } if (ci->ci_vm == NULL) { cfaa.name = "vm"; ci->ci_vm = config_found(self, &cfaa, NULL, CFARGS(.iattr = "cpufeaturebus")); } } return 0; } static void cpu_childdetached(device_t self, device_t child) { struct cpu_softc *sc = device_private(self); struct cpu_info *ci = sc->sc_info; if (ci->ci_frequency == child) ci->ci_frequency = NULL; if (ci->ci_padlock == child) ci->ci_padlock = NULL; if (ci->ci_temperature == child) ci->ci_temperature = NULL; if (ci->ci_vm == child) ci->ci_vm = NULL; } /* * Initialize the processor appropriately. */ void cpu_init(struct cpu_info *ci) { extern int x86_fpu_save; uint32_t cr4 = 0; lcr0(rcr0() | CR0_WP); /* If global TLB caching is supported, enable it */ if (cpu_feature[0] & CPUID_PGE) cr4 |= CR4_PGE; /* * If we have FXSAVE/FXRESTOR, use them. */ if (cpu_feature[0] & CPUID_FXSR) { cr4 |= CR4_OSFXSR; /* * If we have SSE/SSE2, enable XMM exceptions. 
*/ if (cpu_feature[0] & (CPUID_SSE|CPUID_SSE2)) cr4 |= CR4_OSXMMEXCPT; } /* If xsave is supported, enable it */ if (cpu_feature[1] & CPUID2_XSAVE) cr4 |= CR4_OSXSAVE; /* If SMEP is supported, enable it */ if (cpu_feature[5] & CPUID_SEF_SMEP) cr4 |= CR4_SMEP; /* If SMAP is supported, enable it */ if (cpu_feature[5] & CPUID_SEF_SMAP) cr4 |= CR4_SMAP; #ifdef SVS /* If PCID is supported, enable it */ if (svs_pcid) cr4 |= CR4_PCIDE; #endif if (cr4) { cr4 |= rcr4(); lcr4(cr4); } /* * Changing CR4 register may change cpuid values. For example, setting * CR4_OSXSAVE sets CPUID2_OSXSAVE. The CPUID2_OSXSAVE is in * ci_feat_val[1], so update it. * XXX Other than ci_feat_val[1] might be changed. */ if (cpuid_level >= 1) { u_int descs[4]; x86_cpuid(1, descs); ci->ci_feat_val[1] = descs[2]; } if (CPU_IS_PRIMARY(ci) && x86_fpu_save >= FPU_SAVE_FXSAVE) { fpuinit_mxcsr_mask(); } /* If xsave is enabled, enable all fpu features */ if (cr4 & CR4_OSXSAVE) wrxcr(0, x86_xsave_features & XCR0_FPU); #ifdef MTRR /* * On a P6 or above, initialize MTRR's if the hardware supports them. */ if (cpu_feature[0] & CPUID_MTRR) { if ((ci->ci_flags & CPUF_AP) == 0) i686_mtrr_init_first(); mtrr_init_cpu(ci); } #ifdef i386 if (strcmp((char *)(ci->ci_vendor), "AuthenticAMD") == 0) { /* * Must be a K6-2 Step >= 7 or a K6-III. */ if (CPUID_TO_FAMILY(ci->ci_signature) == 5) { if (CPUID_TO_MODEL(ci->ci_signature) > 8 || (CPUID_TO_MODEL(ci->ci_signature) == 8 && CPUID_TO_STEPPING(ci->ci_signature) >= 7)) { mtrr_funcs = &k6_mtrr_funcs; k6_mtrr_init_first(); mtrr_init_cpu(ci); } } } #endif /* i386 */ #endif /* MTRR */ if (ci != &cpu_info_primary) { /* Synchronize TSC */ atomic_or_32(&ci->ci_flags, CPUF_RUNNING); tsc_sync_ap(ci); } else { atomic_or_32(&ci->ci_flags, CPUF_RUNNING); } } #ifdef MULTIPROCESSOR void cpu_boot_secondary_processors(void) { struct cpu_info *ci; kcpuset_t *cpus; u_long i; /* Now that we know the number of CPUs, patch the text segment. */ x86_patch(false); #if NACPICA > 0 /* Finished with NUMA info for now. */ acpisrat_exit(); #endif kcpuset_create(&cpus, true); kcpuset_set(cpus, cpu_index(curcpu())); for (i = 0; i < maxcpus; i++) { ci = cpu_lookup(i); if (ci == NULL) continue; if (ci->ci_data.cpu_idlelwp == NULL) continue; if ((ci->ci_flags & CPUF_PRESENT) == 0) continue; if (ci->ci_flags & (CPUF_BSP|CPUF_SP|CPUF_PRIMARY)) continue; cpu_boot_secondary(ci); kcpuset_set(cpus, cpu_index(ci)); } while (!kcpuset_match(cpus, kcpuset_running)) ; kcpuset_destroy(cpus); x86_mp_online = true; /* Now that we know about the TSC, attach the timecounter. */ tsc_tc_init(); } #endif static void cpu_init_idle_lwp(struct cpu_info *ci) { struct lwp *l = ci->ci_data.cpu_idlelwp; struct pcb *pcb = lwp_getpcb(l); pcb->pcb_cr0 = rcr0(); } void cpu_init_idle_lwps(void) { struct cpu_info *ci; u_long i; for (i = 0; i < maxcpus; i++) { ci = cpu_lookup(i); if (ci == NULL) continue; if (ci->ci_data.cpu_idlelwp == NULL) continue; if ((ci->ci_flags & CPUF_PRESENT) == 0) continue; cpu_init_idle_lwp(ci); } } #ifdef MULTIPROCESSOR void cpu_start_secondary(struct cpu_info *ci) { u_long psl; int i; #if NLAPIC > 0 paddr_t mp_pdirpa; mp_pdirpa = pmap_init_tmp_pgtbl(mp_trampoline_paddr); cpu_copy_trampoline(mp_pdirpa); #endif atomic_or_32(&ci->ci_flags, CPUF_AP); ci->ci_curlwp = ci->ci_data.cpu_idlelwp; if (CPU_STARTUP(ci, mp_trampoline_paddr) != 0) { return; } /* * Wait for it to become ready. Setting cpu_starting opens the * initial gate and allows the AP to start soft initialization. 
*/ KASSERT(cpu_starting == NULL); cpu_starting = ci; for (i = 100000; (!(ci->ci_flags & CPUF_PRESENT)) && i > 0; i--) { delay_func(10); } if ((ci->ci_flags & CPUF_PRESENT) == 0) { aprint_error_dev(ci->ci_dev, "failed to become ready\n"); #if defined(MPDEBUG) && defined(DDB) printf("dropping into debugger; continue from here to resume boot\n"); Debugger(); #endif } else { /* * Synchronize time stamp counters. Invalidate cache and do * twice (in tsc_sync_bp) to minimize possible cache effects. * Disable interrupts to try and rule out any external * interference. */ psl = x86_read_psl(); x86_disable_intr(); tsc_sync_bp(ci); x86_write_psl(psl); } CPU_START_CLEANUP(ci); cpu_starting = NULL; } void cpu_boot_secondary(struct cpu_info *ci) { int64_t drift; u_long psl; int i; atomic_or_32(&ci->ci_flags, CPUF_GO); for (i = 100000; (!(ci->ci_flags & CPUF_RUNNING)) && i > 0; i--) { delay_func(10); } if ((ci->ci_flags & CPUF_RUNNING) == 0) { aprint_error_dev(ci->ci_dev, "failed to start\n"); #if defined(MPDEBUG) && defined(DDB) printf("dropping into debugger; continue from here to resume boot\n"); Debugger(); #endif } else { /* Synchronize TSC again, check for drift. */ drift = ci->ci_data.cpu_cc_skew; psl = x86_read_psl(); x86_disable_intr(); tsc_sync_bp(ci); x86_write_psl(psl); drift -= ci->ci_data.cpu_cc_skew; aprint_debug_dev(ci->ci_dev, "TSC skew=%lld drift=%lld\n", (long long)ci->ci_data.cpu_cc_skew, (long long)drift); tsc_sync_drift(drift); } } /* * The CPU ends up here when it's ready to run. * This is called from code in mptramp.s; at this point, we are running * in the idle pcb/idle stack of the new CPU. When this function returns, * this processor will enter the idle loop and start looking for work. */ void cpu_hatch(void *v) { struct cpu_info *ci = (struct cpu_info *)v; struct pcb *pcb; int s, i; /* ------------------------------------------------------------- */ /* * This section of code must be compiled with SSP disabled, to * prevent a race against cpu0. See sys/conf/ssp.mk. */ /* * Initialize MSRs on this CPU: * * - On amd64: Enables SYSCALL/SYSRET. * * - On amd64: Sets up %fs and %gs so that %gs points to the * current struct cpu_info as needed for CPUVAR(...), * curcpu(), and curlwp. * * (On i386, CPUVAR(...), curcpu(), and curlwp are made to * work first by the conifguration of segment descriptors in * the Global Descriptor Table (GDT) in initgdt.) * * - Enables the no-execute bit if supported. * * Thus, after this point, CPUVAR(...), curcpu(), and curlwp * will work on this CPU. * * Note: The call to cpu_init_msrs for cpu0 happens in * init386/init_x86_64. */ cpu_init_msrs(ci, true); cpu_probe(ci); cpu_speculation_init(ci); #if NHYPERV > 0 hyperv_init_cpu(ci); #endif ci->ci_data.cpu_cc_freq = cpu_info_primary.ci_data.cpu_cc_freq; /* cpu_get_tsc_freq(ci); */ KDASSERT((ci->ci_flags & CPUF_PRESENT) == 0); /* * Synchronize the TSC for the first time. Note that interrupts are * off at this point. */ atomic_or_32(&ci->ci_flags, CPUF_PRESENT); tsc_sync_ap(ci); /* ------------------------------------------------------------- */ /* * Wait to be brought online. * * Use MONITOR/MWAIT if available. These instructions put the CPU in * a low consumption mode (C-state), and if the TSC is not invariant, * this causes the TSC to drift. We want this to happen, so that we * can later detect (in tsc_tc_init) any abnormal drift with invariant * TSCs. That's just for safety; by definition such drifts should * never occur with invariant TSCs. * * If not available, try PAUSE. 
We'd like to use HLT, but we have * interrupts off. */ while ((ci->ci_flags & CPUF_GO) == 0) { if ((cpu_feature[1] & CPUID2_MONITOR) != 0) { x86_monitor(&ci->ci_flags, 0, 0); if ((ci->ci_flags & CPUF_GO) != 0) { continue; } x86_mwait(0, 0); } else { /* * XXX The loop repetition count could be a lot higher, but * XXX currently qemu emulator takes a _very_long_time_ to * XXX execute the pause instruction. So for now, use a low * XXX value to allow the cpu to hatch before timing out. */ for (i = 50; i != 0; i--) { x86_pause(); } } } /* Because the text may have been patched in x86_patch(). */ wbinvd(); x86_flush(); tlbflushg(); KASSERT((ci->ci_flags & CPUF_RUNNING) == 0); #ifdef PAE pd_entry_t * l3_pd = ci->ci_pae_l3_pdir; for (i = 0 ; i < PDP_SIZE; i++) { l3_pd[i] = pmap_kernel()->pm_pdirpa[i] | PTE_P; } lcr3(ci->ci_pae_l3_pdirpa); #else lcr3(pmap_pdirpa(pmap_kernel(), 0)); #endif pcb = lwp_getpcb(curlwp); pcb->pcb_cr3 = rcr3(); pcb = lwp_getpcb(ci->ci_data.cpu_idlelwp); lcr0(pcb->pcb_cr0); cpu_init_idt(ci); gdt_init_cpu(ci); #if NLAPIC > 0 lapic_enable(); lapic_set_lvt(); #endif fpuinit(ci); lldt(GSYSSEL(GLDT_SEL, SEL_KPL)); ltr(ci->ci_tss_sel); /* * cpu_init will re-synchronize the TSC, and will detect any abnormal * drift that would have been caused by the use of MONITOR/MWAIT * above. */ cpu_init(ci); #ifdef XENPVHVM xen_hvm_init_cpu(ci); #endif (*x86_initclock_func)(); cpu_get_tsc_freq(ci); s = splhigh(); #if NLAPIC > 0 lapic_write_tpri(0); #endif x86_enable_intr(); splx(s); x86_errata(); aprint_debug_dev(ci->ci_dev, "running\n"); kcsan_cpu_init(ci); idle_loop(NULL); KASSERT(false); } #endif #if defined(DDB) #include <ddb/db_output.h> #include <machine/db_machdep.h> /* * Dump CPU information from ddb. */ void cpu_debug_dump(void) { struct cpu_info *ci; CPU_INFO_ITERATOR cii; const char sixtyfour64space[] = #ifdef _LP64 " " #endif ""; db_printf("addr %sdev id flags ipis spl curlwp " "\n", sixtyfour64space); for (CPU_INFO_FOREACH(cii, ci)) { db_printf("%p %s %ld %x %x %d %10p\n", ci, ci->ci_dev == NULL ? "BOOT" : device_xname(ci->ci_dev), (long)ci->ci_cpuid, ci->ci_flags, ci->ci_ipis, ci->ci_ilevel, ci->ci_curlwp); } } #endif #ifdef MULTIPROCESSOR #if NLAPIC > 0 static void cpu_copy_trampoline(paddr_t pdir_pa) { extern uint32_t nox_flag; extern u_char cpu_spinup_trampoline[]; extern u_char cpu_spinup_trampoline_end[]; vaddr_t mp_trampoline_vaddr; struct { uint32_t large; uint32_t nox; uint32_t pdir; } smp_data; CTASSERT(sizeof(smp_data) == 3 * 4); smp_data.large = (pmap_largepages != 0); smp_data.nox = nox_flag; smp_data.pdir = (uint32_t)(pdir_pa & 0xFFFFFFFF); /* Enter the physical address */ mp_trampoline_vaddr = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY); pmap_kenter_pa(mp_trampoline_vaddr, mp_trampoline_paddr, VM_PROT_READ | VM_PROT_WRITE, 0); pmap_update(pmap_kernel()); /* Copy boot code */ memcpy((void *)mp_trampoline_vaddr, cpu_spinup_trampoline, cpu_spinup_trampoline_end - cpu_spinup_trampoline); /* Copy smp_data at the end */ memcpy((void *)(mp_trampoline_vaddr + PAGE_SIZE - sizeof(smp_data)), &smp_data, sizeof(smp_data)); pmap_kremove(mp_trampoline_vaddr, PAGE_SIZE); pmap_update(pmap_kernel()); uvm_km_free(kernel_map, mp_trampoline_vaddr, PAGE_SIZE, UVM_KMF_VAONLY); } #endif int mp_cpu_start(struct cpu_info *ci, paddr_t target) { #if NLAPIC > 0 int error; /* * Bootstrap code must be addressable in real mode * and it must be page aligned. */ KASSERT(target < 0x10000 && target % PAGE_SIZE == 0); /* * "The BSP must initialize CMOS shutdown code to 0Ah ..." 
*/ outb(IO_RTC, NVRAM_RESET); outb(IO_RTC+1, NVRAM_RESET_JUMP); /* * "and the warm reset vector (DWORD based at 40:67) to point * to the AP startup code ..." */ unsigned short dwordptr[2]; dwordptr[0] = 0; dwordptr[1] = target >> 4; memcpy((uint8_t *)cmos_data_mapping + 0x467, dwordptr, 4); if ((cpu_feature[0] & CPUID_APIC) == 0) { aprint_error("mp_cpu_start: CPU does not have APIC\n"); return ENODEV; } /* * ... prior to executing the following sequence:". We'll also add in * local cache flush, in case the BIOS has left the AP with its cache * disabled. It may not be able to cope with MP coherency. */ wbinvd(); if (ci->ci_flags & CPUF_AP) { error = x86_ipi_init(ci->ci_cpuid); if (error != 0) { aprint_error_dev(ci->ci_dev, "%s: IPI not taken (1)\n", __func__); return error; } delay_func(10000); error = x86_ipi_startup(ci->ci_cpuid, target / PAGE_SIZE); if (error != 0) { aprint_error_dev(ci->ci_dev, "%s: IPI not taken (2)\n", __func__); return error; } delay_func(200); error = x86_ipi_startup(ci->ci_cpuid, target / PAGE_SIZE); if (error != 0) { aprint_error_dev(ci->ci_dev, "%s: IPI not taken (3)\n", __func__); return error; } delay_func(200); } return 0; #else return ENODEV; #endif /* NLAPIC > 0 */ } void mp_cpu_start_cleanup(struct cpu_info *ci) { /* * Ensure the NVRAM reset byte contains something vaguely sane. */ outb(IO_RTC, NVRAM_RESET); outb(IO_RTC+1, NVRAM_RESET_RST); } #endif #ifdef __x86_64__ typedef void (vector)(void); extern vector Xsyscall, Xsyscall32, Xsyscall_svs; #endif /* * cpu_init_msrs(ci, full) * * Initialize some Model-Specific Registers (MSRs) on the current * CPU, whose struct cpu_info pointer is ci, for: * * - SYSCALL/SYSRET. * - %fs/%gs on amd64 if `full' is true; needed to make * CPUVAR(...), curcpu(), and curlwp work. (We do this at boot, * but skip it on ACPI wakeup.) * - No-execute bit, if supported. * * References: * * - Intel 64 and IA-32 Architectures Software Developer's Manual, * Volume 3: System Programming Guide, Order Number 325384, * April 2022, Sec. 5.8.8 `Fast System Calls in 64-Bit Mode', * pp. 5-22 through 5-23. * * - Intel 64 and IA-32 Architectures Software Developer's Manual, * Volume 4: Model-Specific Registers, Order Number 335592, * April 2022, Sec. 2.1 `Architectural MSRs', Table 2-2, * pp. 2-60 through 2-61. */ void cpu_init_msrs(struct cpu_info *ci, bool full) { #ifdef __x86_64__ /* * On amd64, set up the syscall target address registers * for SYSCALL/SYSRET: * * - IA32_STAR, c000_0081h (MSR_STAR): System Call Target * Address. Code and stack segment selectors for SYSRET * (bits 48:63) and SYSCALL (bits 32:47). * * - IA32_LSTAR, c000_0082h (MSR_LSTAR): IA-32e Mode System * Call Target Address. Target rip for SYSCALL when executed * in 64-bit mode. * * - IA32_CSTAR, c000_0083h (MSR_CSTAR): IA-32e Mode System * Call Target Address. Target rip for SYSCALL when executed * in compatibility mode. (XXX Manual says this is `[n]ot * used, as the SYSCALL instruction is not recognized in * compatibility mode', so why do we set it?) * * - IA32_FMASK, c000_0084h (MSR_SFMASK): System Call Flag * Mask. Mask for the RFLAGS register on SYSCALL. 
*/ wrmsr(MSR_STAR, ((uint64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((uint64_t)LSEL(LSYSRETBASE_SEL, SEL_UPL) << 48)); wrmsr(MSR_LSTAR, (uint64_t)Xsyscall); wrmsr(MSR_CSTAR, (uint64_t)Xsyscall32); wrmsr(MSR_SFMASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_AC); #ifdef SVS if (svs_enabled) wrmsr(MSR_LSTAR, (uint64_t)Xsyscall_svs); #endif /* * On amd64 if `full' is true -- used at boot, but not on ACPI * wakeup -- then additionally set up %fs and %gs: * * - IA32_FS_BASE, c000_0100h (MSR_FSBASE): Base address of * %fs. Not used in NetBSD kernel, so zero it. * * - IA32_GS_BASE, c000_0101h (MSR_GSBASE): Base address of * %gs. Used in NetBSD kernel by CPUVAR(...), curcpu(), and * curlwp for access to the CPU-local area, so set it to ci. * * - IA32_KERNEL_GS_BASE, c000_0102h (MSR_KERNELGSBASE): Base * address of what swapgs will leave in %gs when switching to * userland. Zero for now; will be set to pcb->pcb_gs in * cpu_switchto for user threads. */ if (full) { wrmsr(MSR_FSBASE, 0); wrmsr(MSR_GSBASE, (uint64_t)ci); wrmsr(MSR_KERNELGSBASE, 0); } #endif /* __x86_64__ */ /* * If the no-execute bit is supported, enable it in: * * - IA32_EFER, c000_0080h (MSR_EFER): Extended Feature * Enables. */ if (cpu_feature[2] & CPUID_NOX) wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE); } void cpu_offline_md(void) { return; } /* XXX joerg restructure and restart CPUs individually */ static bool cpu_stop(device_t dv) { struct cpu_softc *sc = device_private(dv); struct cpu_info *ci = sc->sc_info; int err; KASSERT((ci->ci_flags & CPUF_PRESENT) != 0); if (CPU_IS_PRIMARY(ci)) return true; if (ci->ci_data.cpu_idlelwp == NULL) return true; sc->sc_wasonline = !(ci->ci_schedstate.spc_flags & SPCF_OFFLINE); if (sc->sc_wasonline) { mutex_enter(&cpu_lock); err = cpu_setstate(ci, false); mutex_exit(&cpu_lock); if (err != 0) return false; } return true; } static bool cpu_suspend(device_t dv, const pmf_qual_t *qual) { struct cpu_softc *sc = device_private(dv); struct cpu_info *ci = sc->sc_info; if ((ci->ci_flags & CPUF_PRESENT) == 0) return true; else { cpufreq_suspend(ci); } return cpu_stop(dv); } static bool cpu_resume(device_t dv, const pmf_qual_t *qual) { struct cpu_softc *sc = device_private(dv); struct cpu_info *ci = sc->sc_info; int err = 0; if ((ci->ci_flags & CPUF_PRESENT) == 0) return true; if (CPU_IS_PRIMARY(ci)) goto out; if (ci->ci_data.cpu_idlelwp == NULL) goto out; if (sc->sc_wasonline) { mutex_enter(&cpu_lock); err = cpu_setstate(ci, true); mutex_exit(&cpu_lock); } out: if (err != 0) return false; cpufreq_resume(ci); return true; } static bool cpu_shutdown(device_t dv, int how) { struct cpu_softc *sc = device_private(dv); struct cpu_info *ci = sc->sc_info; if ((ci->ci_flags & CPUF_BSP) != 0) return false; if ((ci->ci_flags & CPUF_PRESENT) == 0) return true; return cpu_stop(dv); } /* Get the TSC frequency and set it to ci->ci_data.cpu_cc_freq. */ void cpu_get_tsc_freq(struct cpu_info *ci) { uint64_t freq = 0, freq_from_cpuid, t0, t1; int64_t overhead; if (CPU_IS_PRIMARY(ci) && cpu_hascounter()) { /* * If it's the first call of this function, try to get TSC * freq from CPUID by calling cpu_tsc_freq_cpuid(). * The function also set lapic_per_second variable if it's * known. This is required for Intel's Comet Lake and newer * processors to set LAPIC timer correctly. 
*/ if (ci->ci_data.cpu_cc_freq == 0) freq = freq_from_cpuid = cpu_tsc_freq_cpuid(ci); if (freq != 0) aprint_debug_dev(ci->ci_dev, "TSC freq " "from CPUID %" PRIu64 " Hz\n", freq); #if NHPET > 0 if (freq == 0) { freq = hpet_tsc_freq(); if (freq != 0) aprint_debug_dev(ci->ci_dev, "TSC freq " "from HPET %" PRIu64 " Hz\n", freq); } #endif if (freq == 0) { /* * Work out the approximate overhead involved below. * Discard the result of the first go around the * loop. */ overhead = 0; for (int i = 0; i <= 8; i++) { const int s = splhigh(); t0 = cpu_counter(); delay_func(0); t1 = cpu_counter(); splx(s); if (i > 0) { overhead += (t1 - t0); } } overhead >>= 3; /* * Now do the calibration. */ freq = 0; for (int i = 0; i < 1000; i++) { const int s = splhigh(); t0 = cpu_counter(); delay_func(100); t1 = cpu_counter(); splx(s); freq += t1 - t0 - overhead; } freq = freq * 10; aprint_debug_dev(ci->ci_dev, "TSC freq " "from delay %" PRIu64 " Hz\n", freq); } if (ci->ci_data.cpu_cc_freq != 0) { freq_from_cpuid = cpu_tsc_freq_cpuid(ci); if ((freq_from_cpuid != 0) && (freq != freq_from_cpuid)) aprint_verbose_dev(ci->ci_dev, "TSC freq " "calibrated %" PRIu64 " Hz\n", freq); } } else { freq = cpu_info_primary.ci_data.cpu_cc_freq; } ci->ci_data.cpu_cc_freq = freq; } void x86_cpu_idle_mwait(void) { struct cpu_info *ci = curcpu(); KASSERT(ci->ci_ilevel == IPL_NONE); x86_monitor(&ci->ci_want_resched, 0, 0); if (__predict_false(ci->ci_want_resched)) { return; } x86_mwait(0, 0); } void x86_cpu_idle_halt(void) { struct cpu_info *ci = curcpu(); KASSERT(ci->ci_ilevel == IPL_NONE); x86_disable_intr(); if (!__predict_false(ci->ci_want_resched)) { x86_stihlt(); } else { x86_enable_intr(); } } /* * Loads pmap for the current CPU. */ void cpu_load_pmap(struct pmap *pmap, struct pmap *oldpmap) { KASSERT(kpreempt_disabled()); #ifdef SVS if (svs_enabled && pmap_is_user(pmap)) { svs_pdir_switch(pmap); } #endif #ifdef PAE struct cpu_info *ci = curcpu(); bool interrupts_enabled; pd_entry_t *l3_pd = ci->ci_pae_l3_pdir; int i; /* * disable interrupts to block TLB shootdowns, which can reload cr3. * while this doesn't block NMIs, it's probably ok as NMIs unlikely * reload cr3. */ interrupts_enabled = (x86_read_flags() & PSL_I) != 0; if (interrupts_enabled) x86_disable_intr(); for (i = 0 ; i < PDP_SIZE; i++) { l3_pd[i] = pmap->pm_pdirpa[i] | PTE_P; } if (interrupts_enabled) x86_enable_intr(); tlbflush(); #else lcr3(pmap_pdirpa(pmap, 0)); #endif } /* * Notify all other cpus to halt. */ void cpu_broadcast_halt(void) { x86_broadcast_ipi(X86_IPI_HALT); } /* * Send a dummy ipi to a cpu to force it to run splraise()/spllower(), * and trigger an AST on the running LWP. */ void cpu_kick(struct cpu_info *ci) { x86_send_ipi(ci, X86_IPI_AST); }
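/*
 * Illustrative sketch (not part of cpu.c): a standalone userland program
 * showing the page-color rounding rule used in cpu_vm_init() above.  When
 * the number of cache colors derived from the cache size and associativity
 * is not a power of two, the kernel falls back to the largest power of two
 * that evenly divides it, preserving even coloring of pages.  The helper
 * round_colors() below is hypothetical and exists only for demonstration;
 * the kernel instead panics when no even power-of-two divisor exists.
 */
#include <stdio.h>

static unsigned int
round_colors(unsigned int ncolors)
{
	unsigned int try, picked = 1;

	/* Powers of two are kept unchanged. */
	if ((ncolors & (ncolors - 1)) == 0)
		return ncolors;
	/* Otherwise pick the largest power of two dividing ncolors. */
	for (try = 1; try < ncolors; try *= 2) {
		if (ncolors % try == 0)
			picked = try;
	}
	return picked;
}

int
main(void)
{
	/* A hypothetical 24-color result rounds down to 8 colors. */
	printf("24 colors -> %u\n", round_colors(24));
	/* 16 is already a power of two and is kept as-is. */
	printf("16 colors -> %u\n", round_colors(16));
	return 0;
}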
/* $NetBSD: rtc.c,v 1.2 2022/12/30 21:40:20 jakllsch Exp $ */ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz and Don Ahn. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)clock.c 7.2 (Berkeley) 5/12/91 */ /*- * Copyright (c) 1993, 1994 Charles M. Hannum. * * This code is derived from software contributed to Berkeley by * William Jolitz and Don Ahn. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)clock.c 7.2 (Berkeley) 5/12/91 */ /* * Mach Operating System * Copyright (c) 1991,1990,1989 Carnegie Mellon University * All Rights Reserved. * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ /* Copyright 1988, 1989 by Intel Corporation, Santa Clara, California. All Rights Reserved Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appears in all copies and that both the copyright notice and this permission notice appear in supporting documentation, and that the name of Intel not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. INTEL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL INTEL BE LIABLE FOR ANY SPECIAL, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ACTION OF CONTRACT, NEGLIGENCE, OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ /* * Primitive RTC chip routines. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: rtc.c,v 1.2 2022/12/30 21:40:20 jakllsch Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/timetc.h> #include <sys/kernel.h> #include <dev/isa/isareg.h> #include <dev/isa/isavar.h> #include <i386/isa/nvram.h> #include <machine/pio.h> #include <dev/ic/mc146818reg.h> #include <x86/rtc.h> #ifndef __x86_64__ #include "mca.h" #endif #if NMCA > 0 #include <machine/mca_machdep.h> /* for MCA_system */ #endif #include "acpica.h" #if NACPICA > 0 #include <dev/acpi/acpivar.h> #endif static void rtcinit(void); static int rtcget(mc_todregs *); static void rtcput(mc_todregs *); static int cmoscheck(void); static int clock_expandyear(int); /* XXX use sc? */ u_int mc146818_read(void *sc, u_int reg) { outb(IO_RTC, reg); return (inb(IO_RTC+1)); } void mc146818_write(void *sc, u_int reg, u_int datum) { outb(IO_RTC, reg); outb(IO_RTC+1, datum); } static void rtcinit(void) { static int first_rtcopen_ever = 1; if (!first_rtcopen_ever) return; first_rtcopen_ever = 0; mc146818_write(NULL, MC_REGA, /* XXX softc */ MC_BASE_32_KHz | MC_RATE_1024_Hz); mc146818_write(NULL, MC_REGB, MC_REGB_24HR); /* XXX softc */ } static int rtcget(mc_todregs *regs) { rtcinit(); if ((mc146818_read(NULL, MC_REGD) & MC_REGD_VRT) == 0) /* XXX softc */ return (-1); MC146818_GETTOD(NULL, regs); /* XXX softc */ return (0); } static void rtcput(mc_todregs *regs) { rtcinit(); MC146818_PUTTOD(NULL, regs); /* XXX softc */ } /* * check whether the CMOS layout is "standard"-like (ie, not PS/2-like), * to be called at splclock() */ static int cmoscheck(void) { int i; unsigned short cksum = 0; for (i = 0x10; i <= 0x2d; i++) cksum += mc146818_read(NULL, i); /* XXX softc */ return (cksum == (mc146818_read(NULL, 0x2e) << 8) + mc146818_read(NULL, 0x2f)); } #if NMCA > 0 /* * Check whether the CMOS layout is PS/2 like, to be called at splclock(). */ static int cmoscheckps2(void); static int cmoscheckps2(void) { #if 0 /* Disabled until I find out the CRC checksum algorithm IBM uses */ int i; unsigned short cksum = 0; for (i = 0x10; i <= 0x31; i++) cksum += mc146818_read(NULL, i); /* XXX softc */ return (cksum == (mc146818_read(NULL, 0x32) << 8) + mc146818_read(NULL, 0x33)); #else /* Check 'incorrect checksum' bit of IBM PS/2 Diagnostic Status Byte */ return ((mc146818_read(NULL, NVRAM_DIAG) & (1<<6)) == 0); #endif } #endif /* NMCA > 0 */ /* * patchable to control century byte handling: * 1: always update * -1: never touch * 0: try to figure out itself */ int rtc_update_century = 0; /* * Expand a two-digit year as read from the clock chip * into full width. * Being here, deal with the CMOS century byte. */ static int centb = NVRAM_CENTURY; static int clock_expandyear(int clockyear) { int s, clockcentury, cmoscentury; clockcentury = (clockyear < 70) ? 20 : 19; clockyear += 100 * clockcentury; if (rtc_update_century < 0) return (clockyear); s = splclock(); #if NACPICA > 0 if (acpi_active) cmoscentury = mc146818_read(NULL, (centb = AcpiGbl_FADT.Century)); else #endif if (cmoscheck()) cmoscentury = mc146818_read(NULL, NVRAM_CENTURY); #if NMCA > 0 else if (MCA_system && cmoscheckps2()) cmoscentury = mc146818_read(NULL, (centb = 0x37)); #endif else cmoscentury = 0; splx(s); if (!cmoscentury) { #ifdef DIAGNOSTIC printf("clock: unknown CMOS layout\n"); #endif return (clockyear); } cmoscentury = bcdtobin(cmoscentury); if (cmoscentury != clockcentury) { /* XXX note: saying "century is 20" might confuse the naive. 
*/ printf("WARNING: NVRAM century is %d but RTC year is %d\n", cmoscentury, clockyear); /* Kludge to roll over century. */ if ((rtc_update_century > 0) || ((cmoscentury == 19) && (clockcentury == 20) && (clockyear == 2000))) { printf("WARNING: Setting NVRAM century to %d\n", clockcentury); s = splclock(); mc146818_write(NULL, centb, bintobcd(clockcentury)); splx(s); } } else if (cmoscentury == 19 && rtc_update_century == 0) rtc_update_century = 1; /* will update later in resettodr() */ return (clockyear); } int rtc_get_ymdhms(todr_chip_handle_t tch, struct clock_ymdhms *dt) { int s; mc_todregs rtclk; s = splclock(); if (rtcget(&rtclk)) { splx(s); return -1; } splx(s); dt->dt_sec = bcdtobin(rtclk[MC_SEC]); dt->dt_min = bcdtobin(rtclk[MC_MIN]); dt->dt_hour = bcdtobin(rtclk[MC_HOUR]); dt->dt_day = bcdtobin(rtclk[MC_DOM]); dt->dt_mon = bcdtobin(rtclk[MC_MONTH]); dt->dt_year = clock_expandyear(bcdtobin(rtclk[MC_YEAR])); return 0; } int rtc_set_ymdhms(todr_chip_handle_t tch, struct clock_ymdhms *dt) { mc_todregs rtclk; int century; int s; s = splclock(); if (rtcget(&rtclk)) memset(&rtclk, 0, sizeof(rtclk)); splx(s); rtclk[MC_SEC] = bintobcd(dt->dt_sec); rtclk[MC_MIN] = bintobcd(dt->dt_min); rtclk[MC_HOUR] = bintobcd(dt->dt_hour); rtclk[MC_DOW] = dt->dt_wday + 1; rtclk[MC_YEAR] = bintobcd(dt->dt_year % 100); rtclk[MC_MONTH] = bintobcd(dt->dt_mon); rtclk[MC_DOM] = bintobcd(dt->dt_day); #ifdef DEBUG_CLOCK printf("setclock: %x/%x/%x %x:%x:%x\n", rtclk[MC_YEAR], rtclk[MC_MONTH], rtclk[MC_DOM], rtclk[MC_HOUR], rtclk[MC_MIN], rtclk[MC_SEC]); #endif s = splclock(); rtcput(&rtclk); if (rtc_update_century > 0) { century = bintobcd(dt->dt_year / 100); mc146818_write(NULL, centb, century); /* XXX softc */ } splx(s); return 0; } void rtc_register(void) { static struct todr_chip_handle tch; tch.todr_gettime_ymdhms = rtc_get_ymdhms; tch.todr_settime_ymdhms = rtc_set_ymdhms; tch.todr_setwen = NULL; todr_attach(&tch); }
/* $NetBSD: tmpfs_mem.c,v 1.14 2023/04/29 06:29:55 riastradh Exp $ */ /* * Copyright (c) 2010, 2011, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * tmpfs memory allocation routines. * Implements memory usage accounting and limiting. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tmpfs_mem.c,v 1.14 2023/04/29 06:29:55 riastradh Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/kmem.h> #include <sys/namei.h> #include <sys/pool.h> #include <fs/tmpfs/tmpfs.h> extern struct pool tmpfs_dirent_pool; extern struct pool tmpfs_node_pool; void tmpfs_mntmem_init(struct tmpfs_mount *mp, uint64_t memlimit) { mutex_init(&mp->tm_acc_lock, MUTEX_DEFAULT, IPL_NONE); mp->tm_mem_limit = memlimit; mp->tm_bytes_used = 0; } void tmpfs_mntmem_destroy(struct tmpfs_mount *mp) { KASSERT(mp->tm_bytes_used == 0); mutex_destroy(&mp->tm_acc_lock); } int tmpfs_mntmem_set(struct tmpfs_mount *mp, uint64_t memlimit) { int error; mutex_enter(&mp->tm_acc_lock); if (round_page(mp->tm_bytes_used) >= memlimit) error = EBUSY; else { error = 0; mp->tm_mem_limit = memlimit; } mutex_exit(&mp->tm_acc_lock); return error; } /* * tmpfs_mem_info: return the number of available memory pages. 
* * => If 'total' is true, then return _total_ amount of pages. * => If false, then return the amount of _free_ memory pages. * * Remember to remove uvmexp.freetarg from the returned value to avoid * excessive memory usage. */ size_t tmpfs_mem_info(bool total) { size_t size = 0; size += uvmexp.swpgavail; if (!total) { size -= uvmexp.swpgonly; } size += uvm_availmem(true); size += uvmexp.filepages; if (size > uvmexp.wired) { size -= uvmexp.wired; } else { size = 0; } return size; } uint64_t tmpfs_bytes_max(struct tmpfs_mount *mp) { psize_t freepages = tmpfs_mem_info(false); int freetarg = uvmexp.freetarg; // XXX unlocked uint64_t avail_mem; if (freepages < freetarg) { freepages = 0; } else { freepages -= freetarg; } avail_mem = round_page(mp->tm_bytes_used) + (freepages << PAGE_SHIFT); return MIN(mp->tm_mem_limit, avail_mem); } size_t tmpfs_pages_avail(struct tmpfs_mount *mp) { return (tmpfs_bytes_max(mp) - mp->tm_bytes_used) >> PAGE_SHIFT; } bool tmpfs_mem_incr(struct tmpfs_mount *mp, size_t sz) { uint64_t lim; mutex_enter(&mp->tm_acc_lock); lim = tmpfs_bytes_max(mp); if (mp->tm_bytes_used + sz >= lim) { mutex_exit(&mp->tm_acc_lock); return false; } mp->tm_bytes_used += sz; mutex_exit(&mp->tm_acc_lock); return true; } void tmpfs_mem_decr(struct tmpfs_mount *mp, size_t sz) { mutex_enter(&mp->tm_acc_lock); KASSERT(mp->tm_bytes_used >= sz); mp->tm_bytes_used -= sz; mutex_exit(&mp->tm_acc_lock); } struct tmpfs_dirent * tmpfs_dirent_get(struct tmpfs_mount *mp) { if (!tmpfs_mem_incr(mp, sizeof(struct tmpfs_dirent))) { return NULL; } return pool_get(&tmpfs_dirent_pool, PR_WAITOK); } void tmpfs_dirent_put(struct tmpfs_mount *mp, struct tmpfs_dirent *de) { tmpfs_mem_decr(mp, sizeof(struct tmpfs_dirent)); pool_put(&tmpfs_dirent_pool, de); } struct tmpfs_node * tmpfs_node_get(struct tmpfs_mount *mp) { if (atomic_inc_uint_nv(&mp->tm_nodes_cnt) >= mp->tm_nodes_max) { atomic_dec_uint(&mp->tm_nodes_cnt); return NULL; } if (!tmpfs_mem_incr(mp, sizeof(struct tmpfs_node))) { atomic_dec_uint(&mp->tm_nodes_cnt); return NULL; } return pool_get(&tmpfs_node_pool, PR_WAITOK); } void tmpfs_node_put(struct tmpfs_mount *mp, struct tmpfs_node *tn) { atomic_dec_uint(&mp->tm_nodes_cnt); tmpfs_mem_decr(mp, sizeof(struct tmpfs_node)); pool_put(&tmpfs_node_pool, tn); } /* * Quantum size to round-up the tmpfs names in order to reduce re-allocations. */ #define TMPFS_NAME_QUANTUM (32) char * tmpfs_strname_alloc(struct tmpfs_mount *mp, size_t len) { const size_t sz = roundup2(len, TMPFS_NAME_QUANTUM); KASSERT(sz > 0 && sz <= 1024); if (!tmpfs_mem_incr(mp, sz)) { return NULL; } return kmem_alloc(sz, KM_SLEEP); } void tmpfs_strname_free(struct tmpfs_mount *mp, char *str, size_t len) { const size_t sz = roundup2(len, TMPFS_NAME_QUANTUM); KASSERT(sz > 0 && sz <= 1024); tmpfs_mem_decr(mp, sz); kmem_free(str, sz); } bool tmpfs_strname_neqlen(struct componentname *fcnp, struct componentname *tcnp) { const size_t fln = fcnp->cn_namelen; const size_t tln = tcnp->cn_namelen; return (fln != tln) || memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fln); }
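/*
 * Illustrative sketch (not part of tmpfs_mem.c): a standalone userland
 * program showing the allocation quantum used by tmpfs_strname_alloc() and
 * tmpfs_strname_free() above.  Name lengths are rounded up to
 * TMPFS_NAME_QUANTUM (32) bytes, so names of similar length map to the
 * same allocation size and renames cause fewer re-allocations.  The
 * roundup2() macro below is a local stand-in for the kernel macro.
 */
#include <stdio.h>
#include <stddef.h>

#define TMPFS_NAME_QUANTUM	32
#define roundup2(x, m)		(((x) + ((m) - 1)) & ~((size_t)(m) - 1))

int
main(void)
{
	const size_t lens[] = { 1, 31, 32, 33, 200 };
	size_t i;

	/* 1..32 all share a 32-byte allocation; 33 jumps to 64, 200 to 224. */
	for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
		printf("name length %3zu -> %3zu byte allocation\n",
		    lens[i], roundup2(lens[i], TMPFS_NAME_QUANTUM));
	}
	return 0;
}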
/* $NetBSD: time.h,v 1.80 2022/06/26 22:31:38 riastradh Exp $ */ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)time.h 8.5 (Berkeley) 5/4/95 */ #ifndef _SYS_TIME_H_ #define _SYS_TIME_H_ #include <sys/featuretest.h> #include <sys/types.h> /* * Structure returned by gettimeofday(2) system call, * and used in other calls. 
*/ struct timeval { time_t tv_sec; /* seconds */ suseconds_t tv_usec; /* and microseconds */ }; #include <sys/timespec.h> #if defined(_NETBSD_SOURCE) #define TIMEVAL_TO_TIMESPEC(tv, ts) do { \ (ts)->tv_sec = (tv)->tv_sec; \ (ts)->tv_nsec = (tv)->tv_usec * 1000; \ } while (/*CONSTCOND*/0) #define TIMESPEC_TO_TIMEVAL(tv, ts) do { \ (tv)->tv_sec = (ts)->tv_sec; \ (tv)->tv_usec = (suseconds_t)(ts)->tv_nsec / 1000; \ } while (/*CONSTCOND*/0) /* * Note: timezone is obsolete. All timezone handling is now in * userland. Its just here for back compatibility. */ struct timezone { int tz_minuteswest; /* minutes west of Greenwich */ int tz_dsttime; /* type of dst correction */ }; /* Operations on timevals. */ #define timerclear(tvp) (tvp)->tv_sec = (tvp)->tv_usec = 0L #define timerisset(tvp) ((tvp)->tv_sec || (tvp)->tv_usec) #define timercmp(tvp, uvp, cmp) \ (((tvp)->tv_sec == (uvp)->tv_sec) ? \ ((tvp)->tv_usec cmp (uvp)->tv_usec) : \ ((tvp)->tv_sec cmp (uvp)->tv_sec)) #define timeradd(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec + (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec + (uvp)->tv_usec; \ if ((vvp)->tv_usec >= 1000000) { \ (vvp)->tv_sec++; \ (vvp)->tv_usec -= 1000000; \ } \ } while (/* CONSTCOND */ 0) #define timersub(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ if ((vvp)->tv_usec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_usec += 1000000; \ } \ } while (/* CONSTCOND */ 0) /* * hide bintime for _STANDALONE because this header is used for hpcboot.exe, * which is built with compilers which don't recognize LL suffix. * http://mail-index.NetBSD.org/tech-userlevel/2008/02/27/msg000181.html */ #if !defined(_STANDALONE) struct bintime { time_t sec; uint64_t frac; }; static __inline void bintime_addx(struct bintime *bt, uint64_t x) { uint64_t u; u = bt->frac; bt->frac += x; if (u > bt->frac) bt->sec++; } static __inline void bintime_add(struct bintime *bt, const struct bintime *bt2) { uint64_t u; u = bt->frac; bt->frac += bt2->frac; if (u > bt->frac) bt->sec++; bt->sec += bt2->sec; } static __inline void bintime_sub(struct bintime *bt, const struct bintime *bt2) { uint64_t u; u = bt->frac; bt->frac -= bt2->frac; if (u < bt->frac) bt->sec--; bt->sec -= bt2->sec; } #define bintimecmp(bta, btb, cmp) \ (((bta)->sec == (btb)->sec) ? \ ((bta)->frac cmp (btb)->frac) : \ ((bta)->sec cmp (btb)->sec)) /*- * Background information: * * When converting between timestamps on parallel timescales of differing * resolutions it is historical and scientific practice to round down rather * than doing 4/5 rounding. * * The date changes at midnight, not at noon. * * Even at 15:59:59.999999999 it's not four'o'clock. 
* * time_second ticks after N.999999999 not after N.4999999999 */ /* * The magic numbers for converting ms/us/ns to fractions */ /* 1ms = (2^64) / 1000 */ #define BINTIME_SCALE_MS ((uint64_t)18446744073709551ULL) /* 1us = (2^64) / 1000000 */ #define BINTIME_SCALE_US ((uint64_t)18446744073709ULL) /* 1ns = (2^64) / 1000000000 */ #define BINTIME_SCALE_NS ((uint64_t)18446744073ULL) static __inline void bintime2timespec(const struct bintime *bt, struct timespec *ts) { ts->tv_sec = bt->sec; ts->tv_nsec = (long)((1000000000ULL * (uint32_t)(bt->frac >> 32)) >> 32); } static __inline void timespec2bintime(const struct timespec *ts, struct bintime *bt) { bt->sec = ts->tv_sec; bt->frac = (uint64_t)ts->tv_nsec * BINTIME_SCALE_NS; } static __inline void bintime2timeval(const struct bintime *bt, struct timeval *tv) { tv->tv_sec = bt->sec; tv->tv_usec = (suseconds_t)((1000000ULL * (uint32_t)(bt->frac >> 32)) >> 32); } static __inline void timeval2bintime(const struct timeval *tv, struct bintime *bt) { bt->sec = tv->tv_sec; bt->frac = (uint64_t)tv->tv_usec * BINTIME_SCALE_US; } static __inline struct bintime ms2bintime(uint64_t ms) { struct bintime bt; bt.sec = (time_t)(ms / 1000U); bt.frac = (uint64_t)(ms % 1000U) * BINTIME_SCALE_MS; return bt; } static __inline struct bintime us2bintime(uint64_t us) { struct bintime bt; bt.sec = (time_t)(us / 1000000U); bt.frac = (uint64_t)(us % 1000000U) * BINTIME_SCALE_US; return bt; } static __inline struct bintime ns2bintime(uint64_t ns) { struct bintime bt; bt.sec = (time_t)(ns / 1000000000U); bt.frac = (uint64_t)(ns % 1000000000U) * BINTIME_SCALE_NS; return bt; } #endif /* !defined(_STANDALONE) */ /* Operations on timespecs. */ #define timespecclear(tsp) (tsp)->tv_sec = (time_t)((tsp)->tv_nsec = 0L) #define timespecisset(tsp) ((tsp)->tv_sec || (tsp)->tv_nsec) #define timespeccmp(tsp, usp, cmp) \ (((tsp)->tv_sec == (usp)->tv_sec) ? \ ((tsp)->tv_nsec cmp (usp)->tv_nsec) : \ ((tsp)->tv_sec cmp (usp)->tv_sec)) #define timespecadd(tsp, usp, vsp) \ do { \ (vsp)->tv_sec = (tsp)->tv_sec + (usp)->tv_sec; \ (vsp)->tv_nsec = (tsp)->tv_nsec + (usp)->tv_nsec; \ if ((vsp)->tv_nsec >= 1000000000L) { \ (vsp)->tv_sec++; \ (vsp)->tv_nsec -= 1000000000L; \ } \ } while (/* CONSTCOND */ 0) #define timespecsub(tsp, usp, vsp) \ do { \ (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \ (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \ if ((vsp)->tv_nsec < 0) { \ (vsp)->tv_sec--; \ (vsp)->tv_nsec += 1000000000L; \ } \ } while (/* CONSTCOND */ 0) #define timespec2ns(x) (((uint64_t)(x)->tv_sec) * 1000000000L + (x)->tv_nsec) #ifdef _KERNEL bool timespecaddok(const struct timespec *, const struct timespec *) __pure; bool timespecsubok(const struct timespec *, const struct timespec *) __pure; #endif #endif /* _NETBSD_SOURCE */ /* * Names of the interval timers, and structure * defining a timer setting. * NB: Must match the CLOCK_ constants below. */ #define ITIMER_REAL 0 #define ITIMER_VIRTUAL 1 #define ITIMER_PROF 2 #define ITIMER_MONOTONIC 3 struct itimerval { struct timeval it_interval; /* timer interval */ struct timeval it_value; /* current value */ }; /* * Structure defined by POSIX.1b to be like a itimerval, but with * timespecs. Used in the timer_*() system calls. 
*/ struct itimerspec { struct timespec it_interval; struct timespec it_value; }; #define CLOCK_REALTIME 0 #define CLOCK_VIRTUAL 1 #define CLOCK_PROF 2 #define CLOCK_MONOTONIC 3 #define CLOCK_THREAD_CPUTIME_ID 0x20000000 #define CLOCK_PROCESS_CPUTIME_ID 0x40000000 #if defined(_NETBSD_SOURCE) #define TIMER_RELTIME 0x0 /* relative timer */ #endif #define TIMER_ABSTIME 0x1 /* absolute timer */ #ifdef _KERNEL #include <sys/timevar.h> #else /* !_KERNEL */ #ifndef _STANDALONE #if (_POSIX_C_SOURCE - 0) >= 200112L || \ (defined(_XOPEN_SOURCE) && defined(_XOPEN_SOURCE_EXTENDED)) || \ (_XOPEN_SOURCE - 0) >= 500 || defined(_NETBSD_SOURCE) #include <sys/select.h> #endif #include <sys/cdefs.h> #include <time.h> __BEGIN_DECLS #ifndef __LIBC12_SOURCE__ #if (_POSIX_C_SOURCE - 0) >= 200112L || \ defined(_XOPEN_SOURCE) || defined(_NETBSD_SOURCE) int getitimer(int, struct itimerval *) __RENAME(__getitimer50); int gettimeofday(struct timeval * __restrict, void *__restrict) __RENAME(__gettimeofday50); int setitimer(int, const struct itimerval * __restrict, struct itimerval * __restrict) __RENAME(__setitimer50); int utimes(const char *, const struct timeval [2]) __RENAME(__utimes50); #endif /* _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE || _NETBSD_SOURCE */ #if defined(_NETBSD_SOURCE) || defined(HAVE_NBTOOL_CONFIG_H) int adjtime(const struct timeval *, struct timeval *) __RENAME(__adjtime50); int futimes(int, const struct timeval [2]) __RENAME(__futimes50); int lutimes(const char *, const struct timeval [2]) __RENAME(__lutimes50); int settimeofday(const struct timeval * __restrict, const void *__restrict) __RENAME(__settimeofday50); #endif /* _NETBSD_SOURCE */ #endif /* __LIBC12_SOURCE__ */ __END_DECLS #endif /* !_STANDALONE */ #endif /* !_KERNEL */ #endif /* !_SYS_TIME_H_ */
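The bintime format above stores the sub-second part as a 64-bit binary fraction, i.e. seconds scaled by 2^64, which is why the conversion constants are floor(2^64 / 10^3), floor(2^64 / 10^6) and floor(2^64 / 10^9), and why conversions round down as described in the background note. A minimal user-space round trip, with the struct and the nanosecond scale copied locally under hypothetical demo_* names so it builds outside the kernel tree, shows the truncation:

/*
 * Illustrative sketch (not part of sys/time.h): nanoseconds -> binary
 * fraction -> nanoseconds, mirroring ns2bintime() and bintime2timespec().
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_SCALE_NS	18446744073ULL	/* floor(2^64 / 1000000000) */

struct demo_bintime {
	int64_t  sec;
	uint64_t frac;	/* seconds scaled by 2^64 */
};

static struct demo_bintime
demo_ns2bintime(uint64_t ns)
{
	struct demo_bintime bt;

	bt.sec = (int64_t)(ns / 1000000000U);
	bt.frac = (ns % 1000000000U) * DEMO_SCALE_NS;
	return bt;
}

static uint64_t
demo_bintime2ns(const struct demo_bintime *bt)
{
	/* Mirror bintime2timespec(): use only the top 32 fraction bits. */
	return (uint64_t)bt->sec * 1000000000U +
	    ((1000000000ULL * (uint32_t)(bt->frac >> 32)) >> 32);
}

int
main(void)
{
	struct demo_bintime bt = demo_ns2bintime(1500000000ULL); /* 1.5 s */

	printf("sec=%lld frac=%llu\n", (long long)bt.sec,
	    (unsigned long long)bt.frac);
	/* Prints 1499999999: the conversion rounds down, never up. */
	printf("back to ns: %llu\n",
	    (unsigned long long)demo_bintime2ns(&bt));
	return 0;
}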
/* $NetBSD: uvm_pdpolicy.h,v 1.9 2022/08/20 23:26:02 riastradh Exp $ */ /*- * Copyright (c)2005, 2006 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _UVM_PDPOLICY_H_ #define _UVM_PDPOLICY_H_ #include <sys/mutex.h> #include <sys/stdint.h> #include <uvm/uvm_page.h> struct krwlock; struct uvm_cpu; struct vm_anon; struct vm_page; /* * these API is for uvm internal use only. * don't use them directly from outside of /sys/uvm. */ void uvmpdpol_idle(struct uvm_cpu *); void uvmpdpol_init(void); void uvmpdpol_init_cpu(struct uvm_cpu *); void uvmpdpol_reinit(void); void uvmpdpol_estimatepageable(int *, int *); bool uvmpdpol_needsscan_p(void); void uvmpdpol_pageactivate(struct vm_page *); void uvmpdpol_pagedeactivate(struct vm_page *); void uvmpdpol_pagedequeue(struct vm_page *); void uvmpdpol_pageenqueue(struct vm_page *); bool uvmpdpol_pageactivate_p(struct vm_page *); bool uvmpdpol_pageisqueued_p(struct vm_page *); void uvmpdpol_pagerealize(struct vm_page *); void uvmpdpol_anfree(struct vm_anon *); void uvmpdpol_tune(void); void uvmpdpol_scaninit(void); void uvmpdpol_scanfini(void); struct vm_page *uvmpdpol_selectvictim(struct krwlock **); void uvmpdpol_balancequeue(int); void uvmpdpol_sysctlsetup(void); /* * uvmpdpol_set_intent: set an intended state for the page, taking care not * to overwrite any of the other flags. */ static inline void uvmpdpol_set_intent(struct vm_page *pg, uint32_t i) { KASSERT(mutex_owned(&pg->interlock)); pg->pqflags = PQ_INTENT_SET | (pg->pqflags & ~PQ_INTENT_MASK) | i; } #endif /* !_UVM_PDPOLICY_H_ */
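uvmpdpol_set_intent() relies on PQ_INTENT_MASK covering only the intent field of pg->pqflags, so clearing that mask and OR-ing in the new intent plus PQ_INTENT_SET leaves every other queue flag untouched. A stand-alone sketch of the same masking idiom, with made-up DEMO_* values standing in for the real PQ_* constants, is below:

/*
 * Illustrative sketch (not from the NetBSD tree): replace one bit-field
 * inside a flags word without disturbing unrelated bits.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_INTENT_MASK	0x00000003u	/* low two bits: intent code */
#define DEMO_INTENT_SET		0x00000004u	/* "an intent is pending" */
#define DEMO_OTHER_FLAG		0x00000100u	/* unrelated state bit */

static uint32_t
demo_set_intent(uint32_t pqflags, uint32_t intent)
{
	/* Clear the old intent field, keep the rest, OR in the new intent. */
	return DEMO_INTENT_SET | (pqflags & ~DEMO_INTENT_MASK) | intent;
}

int
main(void)
{
	uint32_t flags = DEMO_OTHER_FLAG | 0x1;	/* old intent = 1 */

	flags = demo_set_intent(flags, 0x2);	/* new intent = 2 */
	printf("pqflags=%#x\n", (unsigned)flags); /* 0x106: other bit survives */
	return 0;
}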
/* $NetBSD: usbdi_util.c,v 1.88 2024/02/04 05:43:06 mrg Exp $ */ /* * Copyright (c) 1998, 2012 The NetBSD Foundation, Inc. * All rights reserved.
* * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology and Matthew R. Green (mrg@eterna23.net). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: usbdi_util.c,v 1.88 2024/02/04 05:43:06 mrg Exp $"); #ifdef _KERNEL_OPT #include "opt_usb.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/kmem.h> #include <sys/proc.h> #include <sys/device.h> #include <sys/bus.h> #include <dev/usb/usb.h> #include <dev/usb/usbhid.h> #include <dev/usb/usbdi.h> #include <dev/usb/usbdivar.h> #include <dev/usb/usbdi_util.h> #include <dev/usb/usb_quirks.h> #include <dev/usb/usbhist.h> #define DPRINTF(FMT,A,B,C,D) USBHIST_LOGN(usbdebug,1,FMT,A,B,C,D) #define DPRINTFN(N,FMT,A,B,C,D) USBHIST_LOGN(usbdebug,N,FMT,A,B,C,D) usbd_status usbd_get_desc(struct usbd_device *dev, int type, int index, int len, void *desc) { usb_device_request_t req; usbd_status err; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "type=%jd, index=%jd, len=%jd", type, index, len, 0); /* * Provide hard-coded configuration descriptors * for devices that may corrupt it. This cannot * be done for device descriptors which are used * to identify the device. 
*/ if (type != UDESC_DEVICE && dev->ud_quirks->uq_flags & UQ_DESC_CORRUPT) { err = usbd_get_desc_fake(dev, type, index, len, desc); goto out; } req.bmRequestType = UT_READ_DEVICE; req.bRequest = UR_GET_DESCRIPTOR; USETW2(req.wValue, type, index); USETW(req.wIndex, 0); USETW(req.wLength, len); err = usbd_do_request(dev, &req, desc); out: return err; } usbd_status usbd_get_config_desc(struct usbd_device *dev, int confidx, usb_config_descriptor_t *d) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "confidx=%jd", confidx, 0, 0, 0); usbd_status err; err = usbd_get_desc(dev, UDESC_CONFIG, confidx, USB_CONFIG_DESCRIPTOR_SIZE, d); if (err) return err; if (d->bDescriptorType != UDESC_CONFIG) { DPRINTFN(1, "confidx=%jd, bad desc len=%jd type=%jd", confidx, d->bLength, d->bDescriptorType, 0); return USBD_INVAL; } return USBD_NORMAL_COMPLETION; } usbd_status usbd_get_config_desc_full(struct usbd_device *dev, int conf, void *d, int size) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "conf=%jd", conf, 0, 0, 0); return usbd_get_desc(dev, UDESC_CONFIG, conf, size, d); } usbd_status usbd_get_bos_desc(struct usbd_device *dev, int confidx, usb_bos_descriptor_t *d) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "confidx=%jd", confidx, 0, 0, 0); usbd_status err; err = usbd_get_desc(dev, UDESC_BOS, confidx, USB_BOS_DESCRIPTOR_SIZE, d); if (err) return err; if (d->bDescriptorType != UDESC_BOS) { DPRINTFN(1, "confidx=%jd, bad desc len=%jd type=%jd", confidx, d->bLength, d->bDescriptorType, 0); return USBD_INVAL; } return USBD_NORMAL_COMPLETION; } usbd_status usbd_get_device_desc(struct usbd_device *dev, usb_device_descriptor_t *d) { USBHIST_FUNC(); USBHIST_CALLED(usbdebug); return usbd_get_desc(dev, UDESC_DEVICE, 0, USB_DEVICE_DESCRIPTOR_SIZE, d); } /* * Get the first 8 bytes of the device descriptor. * Do as Windows does: try to read 64 bytes -- there are devices which * recognize the initial descriptor fetch (before the control endpoint's * MaxPacketSize is known by the host) by exactly this length. */ usbd_status usbd_get_initial_ddesc(struct usbd_device *dev, usb_device_descriptor_t *desc) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx", (uintptr_t)dev, 0, 0, 0); usb_device_request_t req; char buf[64]; int res, actlen; req.bmRequestType = UT_READ_DEVICE; req.bRequest = UR_GET_DESCRIPTOR; USETW2(req.wValue, UDESC_DEVICE, 0); USETW(req.wIndex, 0); USETW(req.wLength, 8); res = usbd_do_request_flags(dev, &req, buf, USBD_SHORT_XFER_OK, &actlen, USBD_DEFAULT_TIMEOUT); if (res) return res; if (actlen < 8) return USBD_SHORT_XFER; memcpy(desc, buf, 8); return USBD_NORMAL_COMPLETION; } usbd_status usbd_get_string_desc(struct usbd_device *dev, int sindex, int langid, usb_string_descriptor_t *sdesc, int *sizep) { USBHIST_FUNC(); USBHIST_CALLED(usbdebug); usb_device_request_t req; usbd_status err; int actlen; /* * Pass a full-sized buffer to usbd_do_request_len(). At least * one device has been seen returning additional data beyond the * provided buffers (2-bytes written shortly after the request * claims to have completed and returned the 2 byte header, * corrupting other memory.) 
*/ req.bmRequestType = UT_READ_DEVICE; req.bRequest = UR_GET_DESCRIPTOR; USETW2(req.wValue, UDESC_STRING, sindex); USETW(req.wIndex, langid); USETW(req.wLength, 2); /* only size byte first */ err = usbd_do_request_len(dev, &req, sizeof(*sdesc), sdesc, USBD_SHORT_XFER_OK, &actlen, USBD_DEFAULT_TIMEOUT); if (err) return err; if (actlen < 2) return USBD_SHORT_XFER; if (sdesc->bLength > sizeof(*sdesc)) return USBD_INVAL; USETW(req.wLength, sdesc->bLength); /* the whole string */ err = usbd_do_request_len(dev, &req, sizeof(*sdesc), sdesc, USBD_SHORT_XFER_OK, &actlen, USBD_DEFAULT_TIMEOUT); if (err) return err; if (actlen != sdesc->bLength) { DPRINTF("expected %jd, got %jd", sdesc->bLength, actlen, 0, 0); } *sizep = actlen; return USBD_NORMAL_COMPLETION; } /* -------------------------------------------------------------------------- */ usbd_status usbd_get_device_status(struct usbd_device *dev, usb_status_t *st) { USBHIST_FUNC(); USBHIST_CALLED(usbdebug); usb_device_request_t req; req.bmRequestType = UT_READ_DEVICE; req.bRequest = UR_GET_STATUS; USETW(req.wValue, 0); USETW(req.wIndex, 0); USETW(req.wLength, sizeof(usb_status_t)); return usbd_do_request(dev, &req, st); } usbd_status usbd_get_hub_status(struct usbd_device *dev, usb_hub_status_t *st) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx", (uintptr_t)dev, 0, 0, 0); usb_device_request_t req; req.bmRequestType = UT_READ_CLASS_DEVICE; req.bRequest = UR_GET_STATUS; USETW(req.wValue, 0); USETW(req.wIndex, 0); USETW(req.wLength, sizeof(usb_hub_status_t)); return usbd_do_request(dev, &req, st); } usbd_status usbd_get_port_status(struct usbd_device *dev, int port, usb_port_status_t *ps) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx port %jd", (uintptr_t)dev, port, 0, 0); usb_device_request_t req; req.bmRequestType = UT_READ_CLASS_OTHER; req.bRequest = UR_GET_STATUS; USETW(req.wValue, 0); USETW(req.wIndex, port); USETW(req.wLength, sizeof(*ps)); return usbd_do_request(dev, &req, ps); } /* USB 3.1 10.16.2.6, 10.16.2.6.3 */ usbd_status usbd_get_port_status_ext(struct usbd_device *dev, int port, usb_port_status_ext_t *pse) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx port %jd", (uintptr_t)dev, port, 0, 0); usb_device_request_t req; req.bmRequestType = UT_READ_CLASS_OTHER; req.bRequest = UR_GET_STATUS; USETW2(req.wValue, 0, UR_PST_EXT_PORT_STATUS); USETW(req.wIndex, port); USETW(req.wLength, sizeof(*pse)); return usbd_do_request(dev, &req, pse); } /* -------------------------------------------------------------------------- */ usbd_status usbd_clear_hub_feature(struct usbd_device *dev, int sel) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx sel %jd", (uintptr_t)dev, sel, 0, 0); usb_device_request_t req; req.bmRequestType = UT_WRITE_CLASS_DEVICE; req.bRequest = UR_CLEAR_FEATURE; USETW(req.wValue, sel); USETW(req.wIndex, 0); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } usbd_status usbd_set_hub_feature(struct usbd_device *dev, int sel) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx sel %jd", (uintptr_t)dev, sel, 0, 0); usb_device_request_t req; req.bmRequestType = UT_WRITE_CLASS_DEVICE; req.bRequest = UR_SET_FEATURE; USETW(req.wValue, sel); USETW(req.wIndex, 0); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } usbd_status usbd_clear_port_feature(struct usbd_device *dev, int port, int sel) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx port %jd sel %jd", (uintptr_t)dev, port, sel, 0); usb_device_request_t req; req.bmRequestType = UT_WRITE_CLASS_OTHER; req.bRequest = 
UR_CLEAR_FEATURE; USETW(req.wValue, sel); USETW(req.wIndex, port); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } usbd_status usbd_set_port_feature(struct usbd_device *dev, int port, int sel) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx port %jd sel %.d", (uintptr_t)dev, sel, 0, 0); usb_device_request_t req; req.bmRequestType = UT_WRITE_CLASS_OTHER; req.bRequest = UR_SET_FEATURE; USETW(req.wValue, sel); USETW(req.wIndex, port); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } usbd_status usbd_set_port_u1_timeout(struct usbd_device *dev, int port, int timeout) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx port %jd timeout %.d", (uintptr_t)dev, port, timeout, 0); usb_device_request_t req; req.bmRequestType = UT_WRITE_CLASS_OTHER; req.bRequest = UR_SET_FEATURE; USETW(req.wValue, UHF_PORT_U1_TIMEOUT); USETW2(req.wIndex, timeout, port); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } usbd_status usbd_set_port_u2_timeout(struct usbd_device *dev, int port, int timeout) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx port %jd timeout %jd", (uintptr_t)dev, port, timeout, 0); usb_device_request_t req; req.bmRequestType = UT_WRITE_CLASS_OTHER; req.bRequest = UR_SET_FEATURE; USETW(req.wValue, UHF_PORT_U2_TIMEOUT); USETW2(req.wIndex, timeout, port); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } usbd_status usbd_clear_endpoint_feature(struct usbd_device *dev, int epaddr, int sel) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx epaddr %jd sel %jd", (uintptr_t)dev, epaddr, sel, 0); usb_device_request_t req; req.bmRequestType = UT_WRITE_ENDPOINT; req.bRequest = UR_CLEAR_FEATURE; USETW(req.wValue, sel); USETW(req.wIndex, epaddr); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } /* -------------------------------------------------------------------------- */ usbd_status usbd_get_config(struct usbd_device *dev, uint8_t *conf) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx", (uintptr_t)dev, 0, 0, 0); usb_device_request_t req; req.bmRequestType = UT_READ_DEVICE; req.bRequest = UR_GET_CONFIG; USETW(req.wValue, 0); USETW(req.wIndex, 0); USETW(req.wLength, 1); return usbd_do_request(dev, &req, conf); } usbd_status usbd_set_config(struct usbd_device *dev, int conf) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx conf %jd", (uintptr_t)dev, conf, 0, 0); usb_device_request_t req; req.bmRequestType = UT_WRITE_DEVICE; req.bRequest = UR_SET_CONFIG; USETW(req.wValue, conf); USETW(req.wIndex, 0); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } usbd_status usbd_set_address(struct usbd_device *dev, int addr) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx addr %jd", (uintptr_t)dev, addr, 0, 0); usb_device_request_t req; req.bmRequestType = UT_WRITE_DEVICE; req.bRequest = UR_SET_ADDRESS; USETW(req.wValue, addr); USETW(req.wIndex, 0); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } usbd_status usbd_set_idle(struct usbd_interface *iface, int duration, int id) { usb_interface_descriptor_t *ifd = usbd_get_interface_descriptor(iface); struct usbd_device *dev; usb_device_request_t req; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "duration %jd id %jd", duration, id, 0, 0); if (ifd == NULL) return USBD_IOERROR; usbd_interface2device_handle(iface, &dev); req.bmRequestType = UT_WRITE_CLASS_INTERFACE; req.bRequest = UR_SET_IDLE; USETW2(req.wValue, duration, id); USETW(req.wIndex, ifd->bInterfaceNumber); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } /* 
-------------------------------------------------------------------------- */ usbd_status usbd_get_protocol(struct usbd_interface *iface, uint8_t *report) { usb_interface_descriptor_t *id = usbd_get_interface_descriptor(iface); struct usbd_device *dev; usb_device_request_t req; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "iface=%#jx, endpt=%jd", (uintptr_t)iface, id->bInterfaceNumber, 0, 0); if (id == NULL) return USBD_IOERROR; usbd_interface2device_handle(iface, &dev); req.bmRequestType = UT_READ_CLASS_INTERFACE; req.bRequest = UR_GET_PROTOCOL; USETW(req.wValue, 0); USETW(req.wIndex, id->bInterfaceNumber); USETW(req.wLength, 1); return usbd_do_request(dev, &req, report); } usbd_status usbd_set_protocol(struct usbd_interface *iface, int report) { usb_interface_descriptor_t *id = usbd_get_interface_descriptor(iface); struct usbd_device *dev; usb_device_request_t req; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "iface=%#jx, report=%jd, endpt=%jd", (uintptr_t)iface, report, id->bInterfaceNumber, 0); if (id == NULL) return USBD_IOERROR; usbd_interface2device_handle(iface, &dev); req.bmRequestType = UT_WRITE_CLASS_INTERFACE; req.bRequest = UR_SET_PROTOCOL; USETW(req.wValue, report); USETW(req.wIndex, id->bInterfaceNumber); USETW(req.wLength, 0); return usbd_do_request(dev, &req, 0); } /* -------------------------------------------------------------------------- */ usbd_status usbd_set_report(struct usbd_interface *iface, int type, int id, void *data, int len) { usb_interface_descriptor_t *ifd = usbd_get_interface_descriptor(iface); struct usbd_device *dev; usb_device_request_t req; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "len=%jd", len, 0, 0, 0); if (ifd == NULL) return USBD_IOERROR; usbd_interface2device_handle(iface, &dev); req.bmRequestType = UT_WRITE_CLASS_INTERFACE; req.bRequest = UR_SET_REPORT; USETW2(req.wValue, type, id); USETW(req.wIndex, ifd->bInterfaceNumber); USETW(req.wLength, len); return usbd_do_request(dev, &req, data); } usbd_status usbd_get_report(struct usbd_interface *iface, int type, int id, void *data, int len) { usb_interface_descriptor_t *ifd = usbd_get_interface_descriptor(iface); struct usbd_device *dev; usb_device_request_t req; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "len=%jd", len, 0, 0, 0); if (ifd == NULL) return USBD_IOERROR; usbd_interface2device_handle(iface, &dev); req.bmRequestType = UT_READ_CLASS_INTERFACE; req.bRequest = UR_GET_REPORT; USETW2(req.wValue, type, id); USETW(req.wIndex, ifd->bInterfaceNumber); USETW(req.wLength, len); return usbd_do_request(dev, &req, data); } usbd_status usbd_get_report_descriptor(struct usbd_device *dev, int ifcno, int size, void *d) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "dev %#jx ifcno %jd size %jd", (uintptr_t)dev, ifcno, size, 0); usb_device_request_t req; req.bmRequestType = UT_READ_INTERFACE; req.bRequest = UR_GET_DESCRIPTOR; USETW2(req.wValue, UDESC_REPORT, 0); /* report id should be 0 */ USETW(req.wIndex, ifcno); USETW(req.wLength, size); return usbd_do_request(dev, &req, d); } /* -------------------------------------------------------------------------- */ usb_hid_descriptor_t * usbd_get_hid_descriptor(struct usbd_interface *ifc) { usb_interface_descriptor_t *idesc = usbd_get_interface_descriptor(ifc); struct usbd_device *dev; usb_config_descriptor_t *cdesc; usb_hid_descriptor_t *hd; char *p, *end; if (idesc == NULL) return NULL; usbd_interface2device_handle(ifc, &dev); cdesc = usbd_get_config_descriptor(dev); p = (char *)idesc + idesc->bLength; end = (char *)cdesc + UGETW(cdesc->wTotalLength); for (; end - 
p >= sizeof(*hd); p += hd->bLength) { hd = (usb_hid_descriptor_t *)p; if (hd->bLength < sizeof(*hd) || hd->bLength > end - p) break; if (hd->bLength >= USB_HID_DESCRIPTOR_SIZE(0) && hd->bDescriptorType == UDESC_HID) return hd; if (hd->bDescriptorType == UDESC_INTERFACE) break; } return NULL; } usbd_status usbd_read_report_desc(struct usbd_interface *ifc, void **descp, int *sizep) { usb_interface_descriptor_t *id; usb_hid_descriptor_t *hid; struct usbd_device *dev; usbd_status err; usbd_interface2device_handle(ifc, &dev); id = usbd_get_interface_descriptor(ifc); if (id == NULL) return USBD_INVAL; hid = usbd_get_hid_descriptor(ifc); if (hid == NULL) return USBD_IOERROR; *sizep = UGETW(hid->descrs[0].wDescriptorLength); if (*sizep == 0) return USBD_INVAL; *descp = kmem_alloc(*sizep, KM_SLEEP); err = usbd_get_report_descriptor(dev, id->bInterfaceNumber, *sizep, *descp); if (err) { kmem_free(*descp, *sizep); *descp = NULL; return err; } return USBD_NORMAL_COMPLETION; } usbd_status usbd_bulk_transfer(struct usbd_xfer *xfer, struct usbd_pipe *pipe, uint16_t flags, uint32_t timeout, void *buf, uint32_t *size) { usbd_status err; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "start transfer %jd bytes", *size, 0, 0, 0); usbd_setup_xfer(xfer, 0, buf, *size, flags, timeout, NULL); err = usbd_sync_transfer_sig(xfer); usbd_get_xfer_status(xfer, NULL, NULL, size, NULL); DPRINTFN(1, "transferred %jd", *size, 0, 0, 0); if (err) { usbd_clear_endpoint_stall(pipe); } USBHIST_LOG(usbdebug, "<- done xfer %#jx err %jd", (uintptr_t)xfer, err, 0, 0); return err; } usbd_status usbd_intr_transfer(struct usbd_xfer *xfer, struct usbd_pipe *pipe, uint16_t flags, uint32_t timeout, void *buf, uint32_t *size) { usbd_status err; USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "start transfer %jd bytes", *size, 0, 0, 0); usbd_setup_xfer(xfer, 0, buf, *size, flags, timeout, NULL); err = usbd_sync_transfer_sig(xfer); usbd_get_xfer_status(xfer, NULL, NULL, size, NULL); DPRINTFN(1, "transferred %jd", *size, 0, 0, 0); if (err) { usbd_clear_endpoint_stall(pipe); } USBHIST_LOG(usbdebug, "<- done xfer %#jx err %jd", (uintptr_t)xfer, err, 0, 0); return err; } void usb_detach_waitold(device_t dv) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "waiting for dv %#jx", (uintptr_t)dv, 0, 0, 0); if (tsleep(dv, PZERO, "usbdet", hz * 60)) /* XXXSMP ok */ aprint_error_dev(dv, "usb_detach_waitold: didn't detach\n"); DPRINTFN(1, "done", 0, 0, 0, 0); } void usb_detach_wakeupold(device_t dv) { USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug, "for dv %#jx", (uintptr_t)dv, 0, 0, 0); wakeup(dv); /* XXXSMP ok */ } /* -------------------------------------------------------------------------- */ void usb_desc_iter_init(struct usbd_device *dev, usbd_desc_iter_t *iter) { const usb_config_descriptor_t *cd = usbd_get_config_descriptor(dev); iter->cur = (const uByte *)cd; iter->end = (const uByte *)cd + UGETW(cd->wTotalLength); } const usb_descriptor_t * usb_desc_iter_peek(usbd_desc_iter_t *iter) { const usb_descriptor_t *desc; if (iter->end - iter->cur < sizeof(usb_descriptor_t)) { if (iter->cur != iter->end) printf("%s: bad descriptor\n", __func__); return NULL; } desc = (const usb_descriptor_t *)iter->cur; if (desc->bLength < USB_DESCRIPTOR_SIZE) { printf("%s: descriptor length too small\n", __func__); return NULL; } if (desc->bLength > iter->end - iter->cur) { printf("%s: descriptor length too large\n", __func__); return NULL; } return desc; } const usb_descriptor_t * usb_desc_iter_next(usbd_desc_iter_t *iter) { const usb_descriptor_t *desc = usb_desc_iter_peek(iter); 
if (desc == NULL) return NULL; KASSERT(desc->bLength <= iter->end - iter->cur); iter->cur += desc->bLength; return desc; } /* * Return the next interface descriptor, skipping over any other * descriptors. Returns NULL at the end or on error. */ const usb_interface_descriptor_t * usb_desc_iter_next_interface(usbd_desc_iter_t *iter) { const usb_descriptor_t *desc; while ((desc = usb_desc_iter_peek(iter)) != NULL && desc->bDescriptorType != UDESC_INTERFACE) { usb_desc_iter_next(iter); } if ((desc = usb_desc_iter_next(iter)) == NULL || desc->bLength < sizeof(usb_interface_descriptor_t)) return NULL; KASSERT(desc->bDescriptorType == UDESC_INTERFACE); return (const usb_interface_descriptor_t *)desc; } /* * Returns the next non-interface descriptor, returning NULL when the * next descriptor would be an interface descriptor. */ const usb_descriptor_t * usb_desc_iter_next_non_interface(usbd_desc_iter_t *iter) { const usb_descriptor_t *desc; if ((desc = usb_desc_iter_peek(iter)) != NULL && desc->bDescriptorType != UDESC_INTERFACE) { return usb_desc_iter_next(iter); } else { return NULL; } } const usb_cdc_descriptor_t * usb_find_desc(struct usbd_device *dev, int type, int subtype) { usbd_desc_iter_t iter; const usb_cdc_descriptor_t *desc; usb_desc_iter_init(dev, &iter); for (;;) { desc = (const usb_cdc_descriptor_t *)usb_desc_iter_next(&iter); if (desc == NULL) break; if (desc->bDescriptorType != type) continue; if (subtype == USBD_CDCSUBTYPE_ANY || subtype == desc->bDescriptorSubtype) break; } return desc; } /* * Same as usb_find_desc(), but searches only in the specified * interface. */ const usb_cdc_descriptor_t * usb_find_desc_if(struct usbd_device *dev, int type, int subtype, usb_interface_descriptor_t *id) { usbd_desc_iter_t iter; const usb_cdc_descriptor_t *desc; if (id == NULL) return usb_find_desc(dev, type, subtype); usb_desc_iter_init(dev, &iter); iter.cur = (void *)id; /* start from the interface desc */ usb_desc_iter_next(&iter); /* and skip it */ while ((desc = (const usb_cdc_descriptor_t *)usb_desc_iter_next(&iter)) != NULL) { if (desc->bDescriptorType == UDESC_INTERFACE) { /* we ran into the next interface --- not found */ return NULL; } if (desc->bDescriptorType == type && (subtype == USBD_CDCSUBTYPE_ANY || subtype == desc->bDescriptorSubtype)) break; } return desc; }
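The iterator above walks a configuration descriptor purely by each descriptor's bLength field, refusing to advance when fewer than a full header remains or when bLength would run past the end of the buffer. A user-space sketch of the same walk over a canned blob follows; the demo_* names and the blob are invented for the example, and a real driver would instead run usb_desc_iter_init()/usb_desc_iter_next() over the device's configuration descriptor.

/*
 * Illustrative sketch (not part of usbdi_util.c): bLength-driven descriptor
 * iteration with the same bounds checks as usb_desc_iter_peek()/next().
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_iter {
	const uint8_t *cur;
	const uint8_t *end;
};

/* Return the next descriptor header, or NULL on end or malformed input. */
static const uint8_t *
demo_next(struct demo_iter *it)
{
	const uint8_t *d = it->cur;

	if (it->end - it->cur < 2)		/* need bLength + bDescriptorType */
		return NULL;
	if (d[0] < 2 || d[0] > it->end - it->cur)
		return NULL;			/* bLength too small or too large */
	it->cur += d[0];
	return d;
}

int
main(void)
{
	/* config(9) + interface(9) + endpoint(7) headers, payloads zeroed */
	static const uint8_t blob[] = {
		9, 0x02, 25, 0, 0, 0, 0, 0, 0,
		9, 0x04, 0, 0, 0, 0, 0, 0, 0,
		7, 0x05, 0x81, 0x03, 0, 0, 0,
	};
	struct demo_iter it = { blob, blob + sizeof(blob) };
	const uint8_t *d;

	while ((d = demo_next(&it)) != NULL)
		printf("descriptor type %#x, length %u\n",
		    (unsigned)d[1], (unsigned)d[0]);
	return 0;
}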
/* $NetBSD: udp6_usrreq.c,v 1.154 2022/11/04 09:01:53 ozaki-r Exp $ */ /* $KAME: udp6_usrreq.c,v 1.86 2001/05/27 17:33:00 itojun Exp $ */ /* $KAME: udp6_output.c,v 1.43 2001/10/15 09:19:52 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)udp_var.h 8.1 (Berkeley) 6/10/93 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: udp6_usrreq.c,v 1.154 2022/11/04 09:01:53 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_inet_csum.h" #include "opt_ipsec.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/syslog.h> #include <sys/domain.h> #include <sys/sysctl.h> #include <net/if.h> #include <net/if_types.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/in_systm.h> #include <netinet/in_offload.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/in_pcb.h> #include <netinet/udp.h> #include <netinet/udp_var.h> #include <netinet/udp_private.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> #include <netinet6/ip6_var.h> #include <netinet6/ip6_private.h> #include <netinet6/in6_pcb.h> #include <netinet6/udp6_var.h> #include <netinet6/udp6_private.h> #include <netinet6/ip6protosw.h> #include <netinet6/scope6_var.h> #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/esp.h> #ifdef INET6 #include <netipsec/ipsec6.h> #endif #endif #include "faith.h" #if defined(NFAITH) && NFAITH > 0 #include <net/if_faith.h> #endif /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ extern struct inpcbtable udbtable; percpu_t *udp6stat_percpu; /* UDP on IP6 parameters */ static int udp6_sendspace = 9216; /* really max datagram size */ static int udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6)); /* 40 1K datagrams */ static void udp6_notify(struct inpcb *, int); static void sysctl_net_inet6_udp6_setup(struct sysctllog **); #ifdef IPSEC static int udp6_espinudp(struct mbuf **, int); #endif #ifdef UDP_CSUM_COUNTERS #include <sys/device.h> struct evcnt udp6_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "udp6", "hwcsum bad"); struct evcnt udp6_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "udp6", "hwcsum ok"); struct evcnt udp6_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "udp6", "hwcsum data"); struct evcnt udp6_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "udp6", "swcsum"); EVCNT_ATTACH_STATIC(udp6_hwcsum_bad); EVCNT_ATTACH_STATIC(udp6_hwcsum_ok); EVCNT_ATTACH_STATIC(udp6_hwcsum_data); EVCNT_ATTACH_STATIC(udp6_swcsum); #define UDP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++ #else #define UDP_CSUM_COUNTER_INCR(ev) /* nothing */ #endif void udp6_init(void) { sysctl_net_inet6_udp6_setup(NULL); udp6stat_percpu = percpu_alloc(sizeof(uint64_t) * UDP6_NSTATS); udp_init_common(); } /* * Notify a udp user of an asynchronous error; * just wake up so that he can collect error status. 
*/ static void udp6_notify(struct inpcb *inp, int errno) { inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); } void * udp6_ctlinput(int cmd, const struct sockaddr *sa, void *d) { struct udphdr uh; struct ip6_hdr *ip6; const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa; struct mbuf *m; int off; void *cmdarg; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void (*notify)(struct inpcb *, int) = udp6_notify; struct udp_portonly { u_int16_t uh_sport; u_int16_t uh_dport; } *uhp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return NULL; if ((unsigned)cmd >= PRC_NCMDS) return NULL; if (PRC_IS_REDIRECT(cmd)) notify = in6pcb_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (cmd == PRC_MSGSIZE) { /* special code is present, see below */ notify = in6pcb_rtchange; } else if (inet6ctlerrmap[cmd] == 0) return NULL; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; cmdarg = NULL; sa6_src = &sa6_any; off = 0; } if (ip6) { /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*uhp)) { if (cmd == PRC_MSGSIZE) icmp6_mtudisc_update((struct ip6ctlparam *)d, 0); return NULL; } memset(&uh, 0, sizeof(uh)); m_copydata(m, off, sizeof(*uhp), (void *)&uh); if (cmd == PRC_MSGSIZE) { int valid = 0; /* * Check to see if we have a valid UDP socket * corresponding to the address in the ICMPv6 message * payload. */ if (in6pcb_lookup(&udbtable, &sa6->sin6_addr, uh.uh_dport, (const struct in6_addr *)&sa6_src->sin6_addr, uh.uh_sport, 0, 0)) valid++; #if 0 /* * As the use of sendto(2) is fairly popular, * we may want to allow non-connected pcb too. * But it could be too weak against attacks... * We should at least check if the local address (= s) * is really ours. */ else if (in6pcb_lookup_bound(&udbtable, &sa6->sin6_addr, uh.uh_dport, 0)) valid++; #endif /* * Depending on the value of "valid" and routing table * size (mtudisc_{hi,lo}wat), we will: * - recalculate the new MTU and create the * corresponding routing entry, or * - ignore the MTU change notification. */ icmp6_mtudisc_update((struct ip6ctlparam *)d, valid); /* * regardless of if we called * icmp6_mtudisc_update(), we need to call * in6pcb_notify(), to notify path MTU change * to the userland (RFC3542), because some * unconnected sockets may share the same * destination and want to know the path MTU. 
*/ } (void)in6pcb_notify(&udbtable, sa, uh.uh_dport, sin6tocsa(sa6_src), uh.uh_sport, cmd, cmdarg, notify); } else { (void)in6pcb_notify(&udbtable, sa, 0, sin6tocsa(sa6_src), 0, cmd, cmdarg, notify); } return NULL; } int udp6_ctloutput(int op, struct socket *so, struct sockopt *sopt) { int s; int error = 0; struct inpcb *inp; int family; int optval; family = so->so_proto->pr_domain->dom_family; s = splsoftnet(); switch (family) { #ifdef INET case PF_INET: if (sopt->sopt_level != IPPROTO_UDP) { error = ip_ctloutput(op, so, sopt); goto end; } break; #endif #ifdef INET6 case PF_INET6: if (sopt->sopt_level != IPPROTO_UDP) { error = ip6_ctloutput(op, so, sopt); goto end; } break; #endif default: error = EAFNOSUPPORT; goto end; } switch (op) { case PRCO_SETOPT: inp = sotoinpcb(so); switch (sopt->sopt_name) { case UDP_ENCAP: error = sockopt_getint(sopt, &optval); if (error) break; switch(optval) { case 0: inp->inp_flags &= ~IN6P_ESPINUDP; break; case UDP_ENCAP_ESPINUDP: inp->inp_flags |= IN6P_ESPINUDP; break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } break; default: error = EINVAL; break; } end: splx(s); return error; } static void udp6_sendup(struct mbuf *m, int off /* offset of data portion */, struct sockaddr *src, struct socket *so) { struct mbuf *opts = NULL; struct mbuf *n; struct inpcb *inp; KASSERT(so != NULL); KASSERT(so->so_proto->pr_domain->dom_family == AF_INET6); inp = sotoinpcb(so); KASSERT(inp != NULL); #if defined(IPSEC) if (ipsec_used && ipsec_in_reject(m, inp)) { if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) icmp6_error(n, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADMIN, 0); return; } #endif if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) { if (inp->inp_flags & IN6P_CONTROLOPTS || SOOPT_TIMESTAMP(inp->inp_socket->so_options)) { struct ip6_hdr *ip6 = mtod(n, struct ip6_hdr *); ip6_savecontrol(inp, &opts, ip6, n); } m_adj(n, off); if (sbappendaddr(&so->so_rcv, src, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); UDP6_STATINC(UDP6_STAT_FULLSOCK); soroverflow(so); } else sorwakeup(so); } } int udp6_realinput(int af, struct sockaddr_in6 *src, struct sockaddr_in6 *dst, struct mbuf **mp, int off) { u_int16_t sport, dport; int rcvcnt; struct in6_addr src6, *dst6; const struct in_addr *dst4; struct inpcb *inp; struct mbuf *m = *mp; rcvcnt = 0; off += sizeof(struct udphdr); /* now, offset of payload */ if (af != AF_INET && af != AF_INET6) goto bad; if (src->sin6_family != AF_INET6 || dst->sin6_family != AF_INET6) goto bad; src6 = src->sin6_addr; if (sa6_recoverscope(src) != 0) { /* XXX: should be impossible. */ goto bad; } sport = src->sin6_port; dport = dst->sin6_port; dst4 = (struct in_addr *)&dst->sin6_addr.s6_addr[12]; dst6 = &dst->sin6_addr; if (IN6_IS_ADDR_MULTICAST(dst6) || (af == AF_INET && IN_MULTICAST(dst4->s_addr))) { /* * Deliver a multicast or broadcast datagram to *all* sockets * for which the local and remote addresses and ports match * those of the incoming datagram. This allows more than * one process to receive multi/broadcasts on the same port. * (This really ought to be done for unicast datagrams as * well, but that would cause problems with existing * applications that open both address-specific sockets and * a wildcard socket listening to the same port -- they would * end up receiving duplicates of every unicast datagram. * Those applications open the multiple sockets to overcome an * inadequacy of the UDP socket interface, but for backwards * compatibility we avoid the problem here rather than * fixing the interface. 
Maybe 4.5BSD will remedy this?) */ /* * KAME note: traditionally we dropped udpiphdr from mbuf here. * we need udpiphdr for IPsec processing so we do that later. */ /* * Locate pcb(s) for datagram. */ TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) { if (inp->inp_af != AF_INET6) continue; if (inp->inp_lport != dport) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) { if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), dst6)) continue; } else { if (IN6_IS_ADDR_V4MAPPED(dst6) && (inp->inp_flags & IN6P_IPV6_V6ONLY)) continue; } if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) { if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &src6) || inp->inp_fport != sport) continue; } else { if (IN6_IS_ADDR_V4MAPPED(&src6) && (inp->inp_flags & IN6P_IPV6_V6ONLY)) continue; } udp6_sendup(m, off, sin6tosa(src), inp->inp_socket); rcvcnt++; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids searching * through all pcbs in the common case of a non-shared * port. It assumes that an application will never * clear these options after setting them. */ if ((inp->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } } else { /* * Locate pcb for datagram. */ inp = in6pcb_lookup(&udbtable, &src6, sport, dst6, dport, 0, 0); if (inp == NULL) { UDP_STATINC(UDP_STAT_PCBHASHMISS); inp = in6pcb_lookup_bound(&udbtable, dst6, dport, 0); if (inp == NULL) return rcvcnt; } #ifdef IPSEC /* Handle ESP over UDP */ if (inp->inp_flags & IN6P_ESPINUDP) { switch (udp6_espinudp(mp, off)) { case -1: /* Error, m was freed */ rcvcnt = -1; goto bad; case 1: /* ESP over UDP */ rcvcnt++; goto bad; case 0: /* plain UDP */ default: /* Unexpected */ /* * Normal UDP processing will take place, * m may have changed. */ m = *mp; break; } } #endif if (inp->inp_overudp_cb != NULL) { int ret; ret = inp->inp_overudp_cb(mp, off, inp->inp_socket, sin6tosa(src), inp->inp_overudp_arg); switch (ret) { case -1: /* Error, m was freed */ rcvcnt = -1; goto bad; case 1: /* Foo over UDP */ KASSERT(*mp == NULL); rcvcnt++; goto bad; case 0: /* plain UDP */ default: /* Unexpected */ /* * Normal UDP processing will take place, * m may have changed. */ break; } } udp6_sendup(m, off, sin6tosa(src), inp->inp_socket); rcvcnt++; } bad: return rcvcnt; } int udp6_input_checksum(struct mbuf *m, const struct udphdr *uh, int off, int len) { /* * XXX it's better to record and check if this mbuf is * already checked. */ if (__predict_false((m->m_flags & M_LOOP) && !udp_do_loopback_cksum)) { goto good; } if (uh->uh_sum == 0) { UDP6_STATINC(UDP6_STAT_NOSUM); goto bad; } switch (m->m_pkthdr.csum_flags & ((m_get_rcvif_NOMPSAFE(m)->if_csum_flags_rx & M_CSUM_UDPv6) | M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) { case M_CSUM_UDPv6|M_CSUM_TCP_UDP_BAD: UDP_CSUM_COUNTER_INCR(&udp6_hwcsum_bad); UDP6_STATINC(UDP6_STAT_BADSUM); goto bad; #if 0 /* notyet */ case M_CSUM_UDPv6|M_CSUM_DATA: #endif case M_CSUM_UDPv6: /* Checksum was okay. */ UDP_CSUM_COUNTER_INCR(&udp6_hwcsum_ok); break; default: /* * Need to compute it ourselves. Maybe skip checksum * on loopback interfaces. 
*/ UDP_CSUM_COUNTER_INCR(&udp6_swcsum); if (in6_cksum(m, IPPROTO_UDP, off, len) != 0) { UDP6_STATINC(UDP6_STAT_BADSUM); goto bad; } } good: return 0; bad: return -1; } int udp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; int off = *offp; struct sockaddr_in6 src, dst; struct ip6_hdr *ip6; struct udphdr *uh; u_int32_t plen, ulen; ip6 = mtod(m, struct ip6_hdr *); #if defined(NFAITH) && 0 < NFAITH if (faithprefix(&ip6->ip6_dst)) { /* send icmp6 host unreach? */ m_freem(m); return IPPROTO_DONE; } #endif UDP6_STATINC(UDP6_STAT_IPACKETS); /* Check for jumbogram is done in ip6_input. We can trust pkthdr.len. */ plen = m->m_pkthdr.len - off; IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(struct udphdr)); if (uh == NULL) { IP6_STATINC(IP6_STAT_TOOSHORT); return IPPROTO_DONE; } /* * Enforce alignment requirements that are violated in * some cases, see kern/50766 for details. */ if (ACCESSIBLE_POINTER(uh, struct udphdr) == 0) { m = m_copyup(m, off + sizeof(struct udphdr), 0); if (m == NULL) { IP6_STATINC(IP6_STAT_TOOSHORT); return IPPROTO_DONE; } ip6 = mtod(m, struct ip6_hdr *); uh = (struct udphdr *)(mtod(m, char *) + off); } KASSERT(ACCESSIBLE_POINTER(uh, struct udphdr)); ulen = ntohs((u_short)uh->uh_ulen); /* * RFC2675 section 4: jumbograms will have 0 in the UDP header field, * iff payload length > 0xffff. */ if (ulen == 0 && plen > 0xffff) ulen = plen; if (plen != ulen) { UDP6_STATINC(UDP6_STAT_BADLEN); goto bad; } /* destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto bad; /* * Checksum extended UDP header and data. Maybe skip checksum * on loopback interfaces. */ if (udp6_input_checksum(m, uh, off, ulen)) goto bad; /* * Construct source and dst sockaddrs. */ memset(&src, 0, sizeof(src)); src.sin6_family = AF_INET6; src.sin6_len = sizeof(struct sockaddr_in6); src.sin6_addr = ip6->ip6_src; src.sin6_port = uh->uh_sport; memset(&dst, 0, sizeof(dst)); dst.sin6_family = AF_INET6; dst.sin6_len = sizeof(struct sockaddr_in6); dst.sin6_addr = ip6->ip6_dst; dst.sin6_port = uh->uh_dport; if (udp6_realinput(AF_INET6, &src, &dst, &m, off) == 0) { if (m->m_flags & M_MCAST) { UDP6_STATINC(UDP6_STAT_NOPORTMCAST); goto bad; } UDP6_STATINC(UDP6_STAT_NOPORT); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); m = NULL; } bad: if (m) m_freem(m); return IPPROTO_DONE; } int udp6_output(struct inpcb * const inp, struct mbuf *m, struct sockaddr_in6 * const addr6, struct mbuf * const control, struct lwp * const l) { u_int32_t ulen = m->m_pkthdr.len; u_int32_t plen = sizeof(struct udphdr) + ulen; struct ip6_hdr *ip6; struct udphdr *udp6; struct in6_addr _laddr, *laddr, *faddr; struct in6_addr laddr_mapped; /* XXX ugly */ struct sockaddr_in6 *sin6 = NULL; struct ifnet *oifp = NULL; int scope_ambiguous = 0; u_int16_t fport; int error = 0; struct ip6_pktopts *optp = NULL; struct ip6_pktopts opt; int af = AF_INET6, hlen = sizeof(struct ip6_hdr); #ifdef INET struct ip *ip; struct udpiphdr *ui; int flags = 0; #endif struct sockaddr_in6 tmp; if (addr6) { sin6 = addr6; if (sin6->sin6_len != sizeof(*sin6)) { error = EINVAL; goto release; } if (sin6->sin6_family != AF_INET6) { error = EAFNOSUPPORT; goto release; } /* protect *sin6 from overwrites */ tmp = *sin6; sin6 = &tmp; /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. 
Even if an appropriate ID is not determined, * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. */ if (sin6->sin6_scope_id == 0 && !ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0) goto release; } if (control) { if (__predict_false(l == NULL)) { panic("%s: control but no lwp", __func__); } if ((error = ip6_setpktopts(control, &opt, in6p_outputopts(inp), l->l_cred, IPPROTO_UDP)) != 0) goto release; optp = &opt; } else optp = in6p_outputopts(inp); if (sin6) { /* * Slightly different than v4 version in that we call * in6_selectsrc and in6pcb_set_port to fill in the local * address and port rather than inpcb_connect. inpcb_connect * sets inp_faddr which causes EISCONN below to be hit on * subsequent sendto. */ if (sin6->sin6_port == 0) { error = EADDRNOTAVAIL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) { /* how about ::ffff:0.0.0.0 case? */ error = EISCONN; goto release; } faddr = &sin6->sin6_addr; fport = sin6->sin6_port; /* allow 0 port */ if (IN6_IS_ADDR_V4MAPPED(faddr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { /* * I believe we should explicitly discard the * packet when mapped addresses are disabled, * rather than send the packet as an IPv6 one. * If we chose the latter approach, the packet * might be sent out on the wire based on the * default route, the situation which we'd * probably want to avoid. * (20010421 jinmei@kame.net) */ error = EINVAL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) && !IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) { /* * when remote addr is an IPv4-mapped address, * local addr should not be an IPv6 address, * since you cannot determine how to map IPv6 * source address to IPv4. */ error = EINVAL; goto release; } af = AF_INET; } if (!IN6_IS_ADDR_V4MAPPED(faddr)) { struct psref psref; int bound = curlwp_bind(); error = in6_selectsrc(sin6, optp, in6p_moptions(inp), &inp->inp_route, &in6p_laddr(inp), &oifp, &psref, &_laddr); if (error) laddr = NULL; else laddr = &_laddr; if (oifp && scope_ambiguous && (error = in6_setscope(&sin6->sin6_addr, oifp, NULL))) { if_put(oifp, &psref); curlwp_bindx(bound); goto release; } if_put(oifp, &psref); curlwp_bindx(bound); } else { /* * XXX: freebsd[34] does not have in_selectsrc, but * we can omit the whole part because freebsd4 calls * udp_output() directly in this case, and thus we'll * never see this path. */ if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) { struct sockaddr_in sin_dst; struct in_addr ina; struct in_ifaddr *ia4; struct psref _psref; int bound; memcpy(&ina, &faddr->s6_addr[12], sizeof(ina)); sockaddr_in_init(&sin_dst, &ina, 0); bound = curlwp_bind(); ia4 = in_selectsrc(&sin_dst, &inp->inp_route, inp->inp_socket->so_options, NULL, &error, &_psref); if (ia4 == NULL) { curlwp_bindx(bound); if (error == 0) error = EADDRNOTAVAIL; goto release; } memset(&laddr_mapped, 0, sizeof(laddr_mapped)); laddr_mapped.s6_addr16[5] = 0xffff; /* ugly */ memcpy(&laddr_mapped.s6_addr[12], &IA_SIN(ia4)->sin_addr, sizeof(IA_SIN(ia4)->sin_addr)); ia4_release(ia4, &_psref); curlwp_bindx(bound); laddr = &laddr_mapped; } else { laddr = &in6p_laddr(inp); /* XXX */ } } if (laddr == NULL) { if (error == 0) error = EADDRNOTAVAIL; goto release; } if (inp->inp_lport == 0) { /* * Craft a sockaddr_in6 for the local endpoint. Use the * "any" as a base, set the address, and recover the * scope. 
*/ struct sockaddr_in6 lsin6 = *((const struct sockaddr_in6 *)inp->inp_socket->so_proto->pr_domain->dom_sa_any); lsin6.sin6_addr = *laddr; error = sa6_recoverscope(&lsin6); if (error) goto release; error = in6pcb_set_port(&lsin6, inp, l); if (error) { in6p_laddr(inp) = in6addr_any; goto release; } } } else { if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) { error = ENOTCONN; goto release; } if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { /* * XXX: this case would happen when the * application sets the V6ONLY flag after * connecting the foreign address. * Such applications should be fixed, * so we bark here. */ log(LOG_INFO, "udp6_output: IPV6_V6ONLY " "option was set for a connected socket\n"); error = EINVAL; goto release; } else af = AF_INET; } laddr = &in6p_laddr(inp); faddr = &in6p_faddr(inp); fport = inp->inp_fport; } if (af == AF_INET) hlen = sizeof(struct ip); /* * Calculate data length and get a mbuf * for UDP and IP6 headers. */ M_PREPEND(m, hlen + sizeof(struct udphdr), M_DONTWAIT); if (m == NULL) { error = ENOBUFS; goto release; } /* * Stuff checksum and output datagram. */ udp6 = (struct udphdr *)(mtod(m, char *) + hlen); udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; if (plen <= 0xffff) udp6->uh_ulen = htons((u_int16_t)plen); else udp6->uh_ulen = 0; udp6->uh_sum = 0; switch (af) { case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; #if 0 /* ip6_plen will be filled in ip6_output. */ ip6->ip6_plen = htons((u_int16_t)plen); #endif ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_hlim = in6pcb_selecthlim_rt(inp); ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; udp6->uh_sum = in6_cksum_phdr(laddr, faddr, htonl(plen), htonl(IPPROTO_UDP)); m->m_pkthdr.csum_flags = M_CSUM_UDPv6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); UDP6_STATINC(UDP6_STAT_OPACKETS); error = ip6_output(m, optp, &inp->inp_route, 0, in6p_moptions(inp), inp, NULL); break; case AF_INET: #ifdef INET /* can't transmit jumbogram over IPv4 */ if (plen > 0xffff) { error = EMSGSIZE; goto release; } ip = mtod(m, struct ip *); ui = (struct udpiphdr *)ip; memset(ui->ui_x1, 0, sizeof(ui->ui_x1)); ui->ui_pr = IPPROTO_UDP; ui->ui_len = htons(plen); memcpy(&ui->ui_src, &laddr->s6_addr[12], sizeof(ui->ui_src)); ui->ui_ulen = ui->ui_len; flags = (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)); memcpy(&ui->ui_dst, &faddr->s6_addr[12], sizeof(ui->ui_dst)); udp6->uh_sum = in_cksum(m, hlen + plen); if (udp6->uh_sum == 0) udp6->uh_sum = 0xffff; ip->ip_len = htons(hlen + plen); ip->ip_ttl = in6pcb_selecthlim(inp, NULL); /* XXX */ ip->ip_tos = 0; /* XXX */ UDP_STATINC(UDP_STAT_OPACKETS); error = ip_output(m, NULL, &inp->inp_route, flags /* XXX */, inp->inp_moptions, NULL); break; #else error = EAFNOSUPPORT; goto release; #endif } goto releaseopt; release: m_freem(m); releaseopt: if (control) { if (optp == &opt) ip6_clearpktopts(&opt, -1); m_freem(control); } return (error); } static int udp6_attach(struct socket *so, int proto) { struct inpcb *inp; int s, error; KASSERT(sotoinpcb(so) == NULL); sosetlock(so); error = soreserve(so, udp6_sendspace, udp6_recvspace); if (error) { return error; } /* * MAPPED_ADDR implementation spec: * Always attach for IPv6, and only when necessary for IPv4. 
*/ s = splsoftnet(); error = inpcb_create(so, &udbtable); splx(s); if (error) { return error; } inp = sotoinpcb(so); in6p_cksum(inp) = -1; /* just to be sure */ KASSERT(solocked(so)); return 0; } static void udp6_detach(struct socket *so) { struct inpcb *inp = sotoinpcb(so); int s; KASSERT(solocked(so)); KASSERT(inp != NULL); s = splsoftnet(); inpcb_destroy(inp); splx(s); } static int udp6_accept(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int udp6_bind(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; int error = 0; int s; KASSERT(solocked(so)); KASSERT(inp != NULL); s = splsoftnet(); error = in6pcb_bind(inp, sin6, l); splx(s); return error; } static int udp6_listen(struct socket *so, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int udp6_connect(struct socket *so, struct sockaddr *nam, struct lwp *l) { struct inpcb *inp = sotoinpcb(so); int error = 0; int s; KASSERT(solocked(so)); KASSERT(inp != NULL); if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) return EISCONN; s = splsoftnet(); error = in6pcb_connect(inp, (struct sockaddr_in6 *)nam, l); splx(s); if (error == 0) soisconnected(so); return error; } static int udp6_connect2(struct socket *so, struct socket *so2) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int udp6_disconnect(struct socket *so) { struct inpcb *inp = sotoinpcb(so); int s; KASSERT(solocked(so)); KASSERT(inp != NULL); if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) return ENOTCONN; s = splsoftnet(); in6pcb_disconnect(inp); memset((void *)&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp))); splx(s); so->so_state &= ~SS_ISCONNECTED; /* XXX */ in6pcb_set_state(inp, INP_BOUND); /* XXX */ return 0; } static int udp6_shutdown(struct socket *so) { int s; s = splsoftnet(); socantsendmore(so); splx(s); return 0; } static int udp6_abort(struct socket *so) { int s; KASSERT(solocked(so)); KASSERT(sotoinpcb(so) != NULL); s = splsoftnet(); soisdisconnected(so); inpcb_destroy(sotoinpcb(so)); splx(s); return 0; } static int udp6_ioctl(struct socket *so, u_long cmd, void *addr6, struct ifnet *ifp) { /* * MAPPED_ADDR implementation info: * Mapped addr support for PRU_CONTROL is not necessary. * Because typical user of PRU_CONTROL is such as ifconfig, * and they don't associate any addr to their socket. Then * socket family is only hint about the PRU_CONTROL'ed address * family, especially when getting addrs from kernel. * So AF_INET socket need to be used to control AF_INET addrs, * and AF_INET6 socket for AF_INET6 addrs. 
*/ return in6_control(so, cmd, addr6, ifp); } static int udp6_stat(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); /* stat: don't bother with a blocksize */ return 0; } static int udp6_peeraddr(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); KASSERT(sotoinpcb(so) != NULL); KASSERT(nam != NULL); in6pcb_fetch_peeraddr(sotoinpcb(so), (struct sockaddr_in6 *)nam); return 0; } static int udp6_sockaddr(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); KASSERT(sotoinpcb(so) != NULL); KASSERT(nam != NULL); in6pcb_fetch_sockaddr(sotoinpcb(so), (struct sockaddr_in6 *)nam); return 0; } static int udp6_rcvd(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int udp6_recvoob(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int udp6_send(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { struct inpcb *inp = sotoinpcb(so); int error = 0; int s; KASSERT(solocked(so)); KASSERT(inp != NULL); KASSERT(m != NULL); s = splsoftnet(); error = udp6_output(inp, m, (struct sockaddr_in6 *)nam, control, l); splx(s); return error; } static int udp6_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int udp6_purgeif(struct socket *so, struct ifnet *ifp) { mutex_enter(softnet_lock); in6pcb_purgeif0(&udbtable, ifp); #ifdef NET_MPSAFE mutex_exit(softnet_lock); #endif in6_purgeif(ifp); #ifdef NET_MPSAFE mutex_enter(softnet_lock); #endif in6pcb_purgeif(&udbtable, ifp); mutex_exit(softnet_lock); return 0; } static int sysctl_net_inet6_udp6_stats(SYSCTLFN_ARGS) { return (NETSTAT_SYSCTL(udp6stat_percpu, UDP6_NSTATS)); } static void sysctl_net_inet6_udp6_setup(struct sysctllog **clog) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet6", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET6, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "udp6", SYSCTL_DESCR("UDPv6 related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_UDP, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "sendspace", SYSCTL_DESCR("Default UDP send buffer size"), NULL, 0, &udp6_sendspace, 0, CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_SENDSPACE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "recvspace", SYSCTL_DESCR("Default UDP receive buffer size"), NULL, 0, &udp6_recvspace, 0, CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_RECVSPACE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "do_loopback_cksum", SYSCTL_DESCR("Perform UDP checksum on loopback"), NULL, 0, &udp_do_loopback_cksum, 0, CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_LOOPBACKCKSUM, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "pcblist", SYSCTL_DESCR("UDP protocol control block list"), sysctl_inpcblist, 0, &udbtable, 0, CTL_NET, PF_INET6, IPPROTO_UDP, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("UDPv6 statistics"), sysctl_net_inet6_udp6_stats, 0, NULL, 0, CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_STATS, CTL_EOL); } void udp6_statinc(u_int stat) { KASSERT(stat < UDP6_NSTATS); UDP6_STATINC(stat); } #ifdef IPSEC /* * Returns: * 1 if the packet was processed * 0 if normal UDP processing should take place * -1 if an error occurred and m 
was freed */ static int udp6_espinudp(struct mbuf **mp, int off) { const size_t skip = sizeof(struct udphdr); size_t len; void *data; size_t minlen; int ip6hdrlen; struct ip6_hdr *ip6; struct m_tag *tag; struct udphdr *udphdr; u_int16_t sport, dport; struct mbuf *m = *mp; uint32_t *marker; /* * Collapse the mbuf chain if the first mbuf is too short * The longest case is: UDP + non ESP marker + ESP */ minlen = off + sizeof(u_int64_t) + sizeof(struct esp); if (minlen > m->m_pkthdr.len) minlen = m->m_pkthdr.len; if (m->m_len < minlen) { if ((*mp = m_pullup(m, minlen)) == NULL) { return -1; } m = *mp; } len = m->m_len - off; data = mtod(m, char *) + off; /* Ignore keepalive packets */ if ((len == 1) && (*(unsigned char *)data == 0xff)) { m_freem(m); *mp = NULL; /* avoid any further processing by caller ... */ return 1; } /* Handle Non-ESP marker (32bit). If zero, then IKE. */ marker = (uint32_t *)data; if (len <= sizeof(uint32_t)) return 0; if (marker[0] == 0) return 0; /* * Get the UDP ports. They are handled in network * order everywhere in IPSEC_NAT_T code. */ udphdr = (struct udphdr *)((char *)data - skip); sport = udphdr->uh_sport; dport = udphdr->uh_dport; /* * Remove the UDP header (and possibly the non ESP marker) * IPv6 header length is ip6hdrlen * Before: * <---- off ---> * +-----+------+-----+ * | IP6 | UDP | ESP | * +-----+------+-----+ * <-skip-> * After: * +-----+-----+ * | IP6 | ESP | * +-----+-----+ * <-skip-> */ ip6hdrlen = off - sizeof(struct udphdr); memmove(mtod(m, char *) + skip, mtod(m, void *), ip6hdrlen); m_adj(m, skip); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - skip); ip6->ip6_nxt = IPPROTO_ESP; /* * We have modified the packet - it is now ESP, so we should not * return to UDP processing ... * * Add a PACKET_TAG_IPSEC_NAT_T_PORT tag to remember * the source UDP port. This is required if we want * to select the right SPD for multiple hosts behind * same NAT */ if ((tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS, sizeof(sport) + sizeof(dport), M_DONTWAIT)) == NULL) { m_freem(m); return -1; } ((u_int16_t *)(tag + 1))[0] = sport; ((u_int16_t *)(tag + 1))[1] = dport; m_tag_prepend(m, tag); if (ipsec_used) ipsec6_common_input(&m, &ip6hdrlen, IPPROTO_ESP); else m_freem(m); /* We handled it, it shouldn't be handled by UDP */ *mp = NULL; /* avoid free by caller ... 
*/ return 1; } #endif /* IPSEC */ PR_WRAP_USRREQS(udp6) #define udp6_attach udp6_attach_wrapper #define udp6_detach udp6_detach_wrapper #define udp6_accept udp6_accept_wrapper #define udp6_bind udp6_bind_wrapper #define udp6_listen udp6_listen_wrapper #define udp6_connect udp6_connect_wrapper #define udp6_connect2 udp6_connect2_wrapper #define udp6_disconnect udp6_disconnect_wrapper #define udp6_shutdown udp6_shutdown_wrapper #define udp6_abort udp6_abort_wrapper #define udp6_ioctl udp6_ioctl_wrapper #define udp6_stat udp6_stat_wrapper #define udp6_peeraddr udp6_peeraddr_wrapper #define udp6_sockaddr udp6_sockaddr_wrapper #define udp6_rcvd udp6_rcvd_wrapper #define udp6_recvoob udp6_recvoob_wrapper #define udp6_send udp6_send_wrapper #define udp6_sendoob udp6_sendoob_wrapper #define udp6_purgeif udp6_purgeif_wrapper const struct pr_usrreqs udp6_usrreqs = { .pr_attach = udp6_attach, .pr_detach = udp6_detach, .pr_accept = udp6_accept, .pr_bind = udp6_bind, .pr_listen = udp6_listen, .pr_connect = udp6_connect, .pr_connect2 = udp6_connect2, .pr_disconnect = udp6_disconnect, .pr_shutdown = udp6_shutdown, .pr_abort = udp6_abort, .pr_ioctl = udp6_ioctl, .pr_stat = udp6_stat, .pr_peeraddr = udp6_peeraddr, .pr_sockaddr = udp6_sockaddr, .pr_rcvd = udp6_rcvd, .pr_recvoob = udp6_recvoob, .pr_send = udp6_send, .pr_sendoob = udp6_sendoob, .pr_purgeif = udp6_purgeif, };
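The UDP_ENCAP socket option handled in udp6_ctloutput() above, together with udp6_espinudp(), is what lets an IKE daemon receive both IKE and ESP-in-UDP (RFC 3948 NAT traversal) traffic on a single IPv6 UDP socket. The following userland sketch is illustrative only and is not part of udp6_usrreq.c; it assumes UDP_ENCAP and UDP_ENCAP_ESPINUDP are exposed through <netinet/udp.h> and that the kernel was built with the IPSEC option, and it binds to port 4500 only because that is the conventional NAT-T port, not because anything in this file requires it.

/*
 * Illustrative userland sketch: enable ESP-in-UDP decapsulation on an
 * IPv6 UDP socket, exercising the UDP_ENCAP case of udp6_ctloutput()
 * and the udp6_espinudp() path above.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>
#include <arpa/inet.h>
#include <err.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_in6 sin6;
	int s, encap = UDP_ENCAP_ESPINUDP;

	if ((s = socket(AF_INET6, SOCK_DGRAM, 0)) == -1)
		err(EXIT_FAILURE, "socket");

	/* Bind to the conventional IKE NAT-traversal port (UDP 4500). */
	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(sin6);
	sin6.sin6_port = htons(4500);
	sin6.sin6_addr = in6addr_any;
	if (bind(s, (struct sockaddr *)&sin6, sizeof(sin6)) == -1)
		err(EXIT_FAILURE, "bind");

	/*
	 * Ask the kernel to treat ESP-in-UDP datagrams on this socket as
	 * ESP (sets IN6P_ESPINUDP in the pcb).  IKE traffic, which starts
	 * with a zero non-ESP marker, is still delivered to the socket as
	 * ordinary UDP.
	 */
	if (setsockopt(s, IPPROTO_UDP, UDP_ENCAP, &encap,
	    sizeof(encap)) == -1)
		err(EXIT_FAILURE, "setsockopt UDP_ENCAP");

	/* ... an IKE daemon would carry on processing here ... */
	close(s);
	return EXIT_SUCCESS;
}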
/* $NetBSD: if_arp.c,v 1.312 2024/02/24 21:39:05 mlelstv Exp $ */ /* * Copyright (c) 1998, 2000, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Public Access Networks Corporation ("Panix"). It was developed under * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ether.c 8.2 (Berkeley) 9/26/94 */ /* * Ethernet address resolution protocol. * TODO: * add "inuse/lock" bit (or ref. count) along with valid bit */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: if_arp.c,v 1.312 2024/02/24 21:39:05 mlelstv Exp $"); #ifdef _KERNEL_OPT #include "opt_ddb.h" #include "opt_inet.h" #include "opt_net_mpsafe.h" #endif #ifdef INET #include "arp.h" #include "bridge.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/kmem.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/time.h> #include <sys/timetc.h> #include <sys/kernel.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/syslog.h> #include <sys/proc.h> #include <sys/protosw.h> #include <sys/domain.h> #include <sys/sysctl.h> #include <sys/socketvar.h> #include <sys/percpu.h> #include <sys/cprng.h> #include <sys/kmem.h> #include <net/ethertypes.h> #include <net/if.h> #include <net/if_dl.h> #include <net/if_types.h> #include <net/if_ether.h> #include <net/if_llatbl.h> #include <net/nd.h> #include <net/route.h> #include <net/net_stats.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/if_inarp.h> #include "arcnet.h" #if NARCNET > 0 #include <net/if_arc.h> #endif #include "carp.h" #if NCARP > 0 #include <netinet/ip_carp.h> #endif /* * ARP trailer negotiation. 
Trailer protocol is not IP specific, * but ARP request/response use IP addresses. */ #define ETHERTYPE_IPTRAILERS ETHERTYPE_TRAIL /* timers */ static int arp_reachable = REACHABLE_TIME; static int arp_retrans = RETRANS_TIMER; static int arp_perform_nud = 1; static bool arp_nud_enabled(struct ifnet *); static unsigned int arp_llinfo_reachable(struct ifnet *); static unsigned int arp_llinfo_retrans(struct ifnet *); static union l3addr *arp_llinfo_holdsrc(struct llentry *, union l3addr *); static void arp_llinfo_output(struct ifnet *, const union l3addr *, const union l3addr *, const uint8_t *, const union l3addr *); static void arp_llinfo_missed(struct ifnet *, const union l3addr *, int16_t, struct mbuf *); static void arp_free(struct llentry *, int); static struct nd_domain arp_nd_domain = { .nd_family = AF_INET, .nd_delay = 5, /* delay first probe time 5 second */ .nd_mmaxtries = 3, /* maximum broadcast query */ .nd_umaxtries = 3, /* maximum unicast query */ .nd_retransmultiple = BACKOFF_MULTIPLE, .nd_maxretrans = MAX_RETRANS_TIMER, .nd_maxnudhint = 0, /* max # of subsequent upper layer hints */ .nd_maxqueuelen = 1, /* max # of packets in unresolved ND entries */ .nd_nud_enabled = arp_nud_enabled, .nd_reachable = arp_llinfo_reachable, .nd_retrans = arp_llinfo_retrans, .nd_holdsrc = arp_llinfo_holdsrc, .nd_output = arp_llinfo_output, .nd_missed = arp_llinfo_missed, .nd_free = arp_free, }; int ip_dad_count = PROBE_NUM; #ifdef ARP_DEBUG int arp_debug = 1; #else int arp_debug = 0; #endif static void arp_init(void); static void arp_dad_init(void); static void arprequest(struct ifnet *, const struct in_addr *, const struct in_addr *, const uint8_t *, const uint8_t *); static void arpannounce1(struct ifaddr *); static struct sockaddr *arp_setgate(struct rtentry *, struct sockaddr *, const struct sockaddr *); static struct llentry *arpcreate(struct ifnet *, const struct in_addr *, const struct sockaddr *, int); static void in_arpinput(struct mbuf *); static void in_revarpinput(struct mbuf *); static void revarprequest(struct ifnet *); static void arp_drainstub(void); struct dadq; static void arp_dad_timer(struct dadq *); static void arp_dad_start(struct ifaddr *); static void arp_dad_stop(struct ifaddr *); static void arp_dad_duplicated(struct ifaddr *, const struct sockaddr_dl *); #define ARP_MAXQLEN 50 pktqueue_t * arp_pktq __read_mostly; static int useloopback = 1; /* use loopback interface for local traffic */ static percpu_t *arpstat_percpu; #define ARP_STAT_GETREF() _NET_STAT_GETREF(arpstat_percpu) #define ARP_STAT_PUTREF() _NET_STAT_PUTREF(arpstat_percpu) #define ARP_STATINC(x) _NET_STATINC(arpstat_percpu, x) #define ARP_STATADD(x, v) _NET_STATADD(arpstat_percpu, x, v) /* revarp state */ static struct in_addr myip, srv_ip; static int myip_initialized = 0; static int revarp_in_progress = 0; static struct ifnet *myip_ifp = NULL; static int arp_drainwanted; static int log_movements = 0; static int log_permanent_modify = 1; static int log_wrong_iface = 1; DOMAIN_DEFINE(arpdomain); /* forward declare and add to link set */ static void arp_fasttimo(void) { if (arp_drainwanted) { arp_drain(); arp_drainwanted = 0; } } static const struct protosw arpsw[] = { { .pr_type = 0, .pr_domain = &arpdomain, .pr_protocol = 0, .pr_flags = 0, .pr_input = 0, .pr_ctlinput = 0, .pr_ctloutput = 0, .pr_usrreqs = 0, .pr_init = arp_init, .pr_fasttimo = arp_fasttimo, .pr_slowtimo = 0, .pr_drain = arp_drainstub, } }; struct domain arpdomain = { .dom_family = PF_ARP, .dom_name = "arp", .dom_protosw = arpsw, 
.dom_protoswNPROTOSW = &arpsw[__arraycount(arpsw)], #ifdef MBUFTRACE .dom_mowner = MOWNER_INIT("internet", "arp"), #endif }; static void sysctl_net_inet_arp_setup(struct sysctllog **); void arp_init(void) { arp_pktq = pktq_create(ARP_MAXQLEN, arpintr, NULL); KASSERT(arp_pktq != NULL); sysctl_net_inet_arp_setup(NULL); arpstat_percpu = percpu_alloc(sizeof(uint64_t) * ARP_NSTATS); #ifdef MBUFTRACE MOWNER_ATTACH(&arpdomain.dom_mowner); #endif nd_attach_domain(&arp_nd_domain); arp_dad_init(); } static void arp_drainstub(void) { arp_drainwanted = 1; } /* * ARP protocol drain routine. Called when memory is in short supply. * Called at splvm(); don't acquire softnet_lock as can be called from * hardware interrupt handlers. */ void arp_drain(void) { lltable_drain(AF_INET); } /* * We set the gateway for RTF_CLONING routes to a "prototype" * link-layer sockaddr whose interface type (if_type) and interface * index (if_index) fields are prepared. */ static struct sockaddr * arp_setgate(struct rtentry *rt, struct sockaddr *gate, const struct sockaddr *netmask) { const struct ifnet *ifp = rt->rt_ifp; uint8_t namelen = strlen(ifp->if_xname); uint8_t addrlen = ifp->if_addrlen; /* * XXX: If this is a manually added route to interface * such as older version of routed or gated might provide, * restore cloning bit. */ if ((rt->rt_flags & RTF_HOST) == 0 && netmask != NULL && satocsin(netmask)->sin_addr.s_addr != 0xffffffff) rt->rt_flags |= RTF_CONNECTED; if ((rt->rt_flags & (RTF_CONNECTED | RTF_LOCAL))) { union { struct sockaddr sa; struct sockaddr_storage ss; struct sockaddr_dl sdl; } u; /* * Case 1: This route should come from a route to iface. */ sockaddr_dl_init(&u.sdl, sizeof(u.ss), ifp->if_index, ifp->if_type, NULL, namelen, NULL, addrlen); rt_setgate(rt, &u.sa); gate = rt->rt_gateway; } return gate; } /* * Parallel to llc_rtrequest. */ void arp_rtrequest(int req, struct rtentry *rt, const struct rt_addrinfo *info) { struct sockaddr *gate = rt->rt_gateway; struct in_ifaddr *ia; struct ifaddr *ifa; struct ifnet *ifp = rt->rt_ifp; int bound; int s; if (req == RTM_LLINFO_UPD) { if ((ifa = info->rti_ifa) != NULL) arpannounce1(ifa); return; } if ((rt->rt_flags & RTF_GATEWAY) != 0) { if (req != RTM_ADD) return; /* * linklayers with particular link MTU limitation. */ switch(ifp->if_type) { #if NARCNET > 0 case IFT_ARCNET: { int arcipifmtu; if (ifp->if_flags & IFF_LINK0) arcipifmtu = arc_ipmtu; else arcipifmtu = ARCMTU; if (ifp->if_mtu > arcipifmtu) rt->rt_rmx.rmx_mtu = arcipifmtu; break; } #endif } return; } switch (req) { case RTM_SETGATE: gate = arp_setgate(rt, gate, info->rti_info[RTAX_NETMASK]); break; case RTM_ADD: gate = arp_setgate(rt, gate, info->rti_info[RTAX_NETMASK]); if (gate == NULL) { log(LOG_ERR, "%s: arp_setgate failed\n", __func__); break; } if ((rt->rt_flags & RTF_CONNECTED) || (rt->rt_flags & RTF_LOCAL)) { /* * linklayers with particular link MTU limitation. */ switch (ifp->if_type) { #if NARCNET > 0 case IFT_ARCNET: { int arcipifmtu; if (ifp->if_flags & IFF_LINK0) arcipifmtu = arc_ipmtu; else arcipifmtu = ARCMTU; if ((rt->rt_rmx.rmx_locks & RTV_MTU) == 0 && (rt->rt_rmx.rmx_mtu > arcipifmtu || (rt->rt_rmx.rmx_mtu == 0 && ifp->if_mtu > arcipifmtu))) rt->rt_rmx.rmx_mtu = arcipifmtu; break; } #endif } if (rt->rt_flags & RTF_CONNECTED) break; } bound = curlwp_bind(); /* Announce a new entry if requested. 
*/ if (rt->rt_flags & RTF_ANNOUNCE) { struct psref psref; ia = in_get_ia_on_iface_psref( satocsin(rt_getkey(rt))->sin_addr, ifp, &psref); if (ia != NULL) { arpannounce(ifp, &ia->ia_ifa, CLLADDR(satocsdl(gate))); ia4_release(ia, &psref); } } if (gate->sa_family != AF_LINK || gate->sa_len < sockaddr_dl_measure(0, ifp->if_addrlen)) { log(LOG_DEBUG, "%s: bad gateway value\n", __func__); goto out; } satosdl(gate)->sdl_type = ifp->if_type; satosdl(gate)->sdl_index = ifp->if_index; /* * If the route is for a broadcast address mark it as such. * This way we can avoid an expensive call to in_broadcast() * in ip_output() most of the time (because the route passed * to ip_output() is almost always a host route). */ if (rt->rt_flags & RTF_HOST && !(rt->rt_flags & RTF_BROADCAST) && in_broadcast(satocsin(rt_getkey(rt))->sin_addr, rt->rt_ifp)) rt->rt_flags |= RTF_BROADCAST; /* There is little point in resolving the broadcast address */ if (rt->rt_flags & RTF_BROADCAST) goto out; /* * When called from rt_ifa_addlocal, we cannot depend on that * the address (rt_getkey(rt)) exits in the address list of the * interface. So check RTF_LOCAL instead. */ if (rt->rt_flags & RTF_LOCAL) { if (useloopback) { rt->rt_ifp = lo0ifp; rt->rt_rmx.rmx_mtu = 0; } goto out; } s = pserialize_read_enter(); ia = in_get_ia_on_iface(satocsin(rt_getkey(rt))->sin_addr, ifp); if (ia == NULL) { pserialize_read_exit(s); goto out; } if (useloopback) { rt->rt_ifp = lo0ifp; rt->rt_rmx.rmx_mtu = 0; } rt->rt_flags |= RTF_LOCAL; if (ISSET(info->rti_flags, RTF_DONTCHANGEIFA)) { pserialize_read_exit(s); goto out; } /* * make sure to set rt->rt_ifa to the interface * address we are using, otherwise we will have trouble * with source address selection. */ ifa = &ia->ia_ifa; if (ifa != rt->rt_ifa) /* Assume it doesn't sleep */ rt_replace_ifa(rt, ifa); pserialize_read_exit(s); out: curlwp_bindx(bound); break; } } /* * Broadcast an ARP request. 
Caller specifies: * - arp header source ip address * - arp header target ip address * - arp header source ethernet address */ static void arprequest(struct ifnet *ifp, const struct in_addr *sip, const struct in_addr *tip, const uint8_t *saddr, const uint8_t *taddr) { struct mbuf *m; struct arphdr *ah; struct sockaddr sa; uint64_t *arps; KASSERT(sip != NULL); KASSERT(tip != NULL); KASSERT(saddr != NULL); if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) return; MCLAIM(m, &arpdomain.dom_mowner); switch (ifp->if_type) { case IFT_IEEE1394: m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) + ifp->if_addrlen; break; default: m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) + 2 * ifp->if_addrlen; break; } m->m_pkthdr.len = m->m_len; m_align(m, m->m_len); ah = mtod(m, struct arphdr *); memset(ah, 0, m->m_len); switch (ifp->if_type) { case IFT_IEEE1394: /* RFC2734 */ /* fill it now for ar_tpa computation */ ah->ar_hrd = htons(ARPHRD_IEEE1394); break; default: /* ifp->if_output will fill ar_hrd */ break; } ah->ar_pro = htons(ETHERTYPE_IP); ah->ar_hln = ifp->if_addrlen; /* hardware address length */ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ ah->ar_op = htons(ARPOP_REQUEST); memcpy(ar_sha(ah), saddr, ah->ar_hln); if (taddr == NULL) m->m_flags |= M_BCAST; else memcpy(ar_tha(ah), taddr, ah->ar_hln); memcpy(ar_spa(ah), sip, ah->ar_pln); memcpy(ar_tpa(ah), tip, ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; arps = ARP_STAT_GETREF(); arps[ARP_STAT_SNDTOTAL]++; arps[ARP_STAT_SENDREQUEST]++; ARP_STAT_PUTREF(); if_output_lock(ifp, ifp, m, &sa, NULL); } void arpannounce(struct ifnet *ifp, struct ifaddr *ifa, const uint8_t *enaddr) { struct in_ifaddr *ia = ifatoia(ifa); struct in_addr *ip = &IA_SIN(ifa)->sin_addr; if (ia->ia4_flags & (IN_IFF_NOTREADY | IN_IFF_DETACHED)) { ARPLOG(LOG_DEBUG, "%s not ready\n", ARPLOGADDR(ip)); return; } arprequest(ifp, ip, ip, enaddr, NULL); } static void arpannounce1(struct ifaddr *ifa) { arpannounce(ifa->ifa_ifp, ifa, CLLADDR(ifa->ifa_ifp->if_sadl)); } /* * Resolve an IP address into an ethernet address. If success, desten is * filled in. If there is no entry in arptab, set one up and broadcast a * request for the IP address. Hold onto this mbuf and resend it once the * address is finally resolved. * * A return value of 0 indicates that desten has been filled in and the packet * should be sent normally; a return value of EWOULDBLOCK indicates that the * packet has been held pending resolution. Any other value indicates an * error. 
*/ int arpresolve(struct ifnet *ifp, const struct rtentry *rt, struct mbuf *m, const struct sockaddr *dst, void *desten, size_t destlen) { struct llentry *la; const char *create_lookup; int error; #if NCARP > 0 if (rt != NULL && rt->rt_ifp->if_type == IFT_CARP) ifp = rt->rt_ifp; #endif KASSERT(m != NULL); la = arplookup(ifp, NULL, dst, 0); if (la == NULL) goto notfound; if (la->la_flags & LLE_VALID && la->ln_state == ND_LLINFO_REACHABLE) { KASSERT(destlen >= ifp->if_addrlen); memcpy(desten, &la->ll_addr, ifp->if_addrlen); LLE_RUNLOCK(la); return 0; } notfound: if (ifp->if_flags & IFF_NOARP) { if (la != NULL) LLE_RUNLOCK(la); error = ENOTSUP; goto bad; } if (la == NULL) { struct rtentry *_rt; create_lookup = "create"; _rt = rtalloc1(dst, 0); IF_AFDATA_WLOCK(ifp); la = lla_create(LLTABLE(ifp), LLE_EXCLUSIVE, dst, _rt); IF_AFDATA_WUNLOCK(ifp); if (_rt != NULL) rt_unref(_rt); if (la == NULL) ARP_STATINC(ARP_STAT_ALLOCFAIL); else la->ln_state = ND_LLINFO_NOSTATE; } else if (LLE_TRY_UPGRADE(la) == 0) { create_lookup = "lookup"; LLE_RUNLOCK(la); IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); IF_AFDATA_RUNLOCK(ifp); } error = EINVAL; if (la == NULL) { log(LOG_DEBUG, "%s: failed to %s llentry for %s on %s\n", __func__, create_lookup, inet_ntoa(satocsin(dst)->sin_addr), ifp->if_xname); goto bad; } error = nd_resolve(la, rt, m, desten, destlen); return error; bad: m_freem(m); return error; } /* * Common length and type checks are done here, * then the protocol-specific routine is called. */ void arpintr(void *arg __unused) { struct mbuf *m; struct arphdr *ar; int s; int arplen; struct ifnet *rcvif; bool badhrd; SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE(); while ((m = pktq_dequeue(arp_pktq)) != NULL) { if ((m->m_flags & M_PKTHDR) == 0) panic("arpintr"); MCLAIM(m, &arpdomain.dom_mowner); ARP_STATINC(ARP_STAT_RCVTOTAL); if (__predict_false(m->m_len < sizeof(*ar))) { if ((m = m_pullup(m, sizeof(*ar))) == NULL) goto badlen; } ar = mtod(m, struct arphdr *); KASSERT(ACCESSIBLE_POINTER(ar, struct arphdr)); rcvif = m_get_rcvif(m, &s); if (__predict_false(rcvif == NULL)) { ARP_STATINC(ARP_STAT_RCVNOINT); goto free; } /* * We don't want non-IEEE1394 ARP packets on IEEE1394 * interfaces, and vice versa. Our life depends on that. */ if (ntohs(ar->ar_hrd) == ARPHRD_IEEE1394) badhrd = rcvif->if_type != IFT_IEEE1394; else badhrd = rcvif->if_type == IFT_IEEE1394; m_put_rcvif(rcvif, &s); if (badhrd) { ARP_STATINC(ARP_STAT_RCVBADPROTO); goto free; } arplen = sizeof(*ar) + 2 * ar->ar_hln + 2 * ar->ar_pln; if (__predict_false(m->m_len < arplen)) { if ((m = m_pullup(m, arplen)) == NULL) goto badlen; ar = mtod(m, struct arphdr *); KASSERT(ACCESSIBLE_POINTER(ar, struct arphdr)); } switch (ntohs(ar->ar_pro)) { case ETHERTYPE_IP: case ETHERTYPE_IPTRAILERS: in_arpinput(m); continue; default: ARP_STATINC(ARP_STAT_RCVBADPROTO); goto free; } badlen: ARP_STATINC(ARP_STAT_RCVBADLEN); free: m_freem(m); } SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); return; /* XXX gcc */ } /* * ARP for Internet protocols on 10 Mb/s Ethernet. Algorithm is that given in * RFC 826. In addition, a sanity check is performed on the sender protocol * address, to catch impersonators. * * We no longer handle negotiations for use of trailer protocol: formerly, ARP * replied for protocol type ETHERTYPE_TRAIL sent along with IP replies if we * wanted trailers sent to us, and also sent them in response to IP replies. * This allowed either end to announce the desire to receive trailer packets. 
* * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, but * formerly didn't normally send requests. */ static void in_arpinput(struct mbuf *m) { struct arphdr *ah; struct ifnet *ifp, *rcvif = NULL; struct llentry *la = NULL; struct in_ifaddr *ia = NULL; #if NBRIDGE > 0 struct in_ifaddr *bridge_ia = NULL; #endif #if NCARP > 0 uint32_t count = 0, index = 0; #endif struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; int op, rt_cmd, new_state = 0; void *tha; uint64_t *arps; struct psref psref, psref_ia; int s; char ipbuf[INET_ADDRSTRLEN]; bool find_source, do_dad; if (__predict_false(m_makewritable(&m, 0, m->m_pkthdr.len, M_DONTWAIT))) goto out; ah = mtod(m, struct arphdr *); op = ntohs(ah->ar_op); if (ah->ar_pln != sizeof(struct in_addr)) goto out; ifp = if_get_bylla(ar_sha(ah), ah->ar_hln, &psref); if (ifp) { /* it's from me, ignore it. */ if_put(ifp, &psref); ARP_STATINC(ARP_STAT_RCVLOCALSHA); goto out; } rcvif = ifp = m_get_rcvif_psref(m, &psref); if (__predict_false(rcvif == NULL)) goto out; if (rcvif->if_flags & IFF_NOARP) goto out; memcpy(&isaddr, ar_spa(ah), sizeof(isaddr)); memcpy(&itaddr, ar_tpa(ah), sizeof(itaddr)); if (m->m_flags & (M_BCAST|M_MCAST)) ARP_STATINC(ARP_STAT_RCVMCAST); /* * Search for a matching interface address * or any address on the interface to use * as a dummy address in the rest of this function. * * First try and find the source address for early * duplicate address detection. */ if (in_nullhost(isaddr)) { if (in_nullhost(itaddr)) /* very bogus ARP */ goto out; find_source = false; myaddr = itaddr; } else { find_source = true; myaddr = isaddr; } s = pserialize_read_enter(); again: IN_ADDRHASH_READER_FOREACH(ia, myaddr.s_addr) { if (!in_hosteq(ia->ia_addr.sin_addr, myaddr)) continue; #if NCARP > 0 if (ia->ia_ifp->if_type == IFT_CARP && ((ia->ia_ifp->if_flags & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING))) { index++; /* XXX: ar_hln? */ if (ia->ia_ifp == rcvif && (ah->ar_hln >= 6) && carp_iamatch(ia, ar_sha(ah), &count, index)) { break; } } else #endif if (ia->ia_ifp == rcvif) break; #if NBRIDGE > 0 /* * If the interface we received the packet on * is part of a bridge, check to see if we need * to "bridge" the packet to ourselves at this * layer. Note we still prefer a perfect match, * but allow this weaker match if necessary. */ if (rcvif->if_bridge != NULL && rcvif->if_bridge == ia->ia_ifp->if_bridge) bridge_ia = ia; #endif } #if NBRIDGE > 0 if (ia == NULL && bridge_ia != NULL) { ia = bridge_ia; m_put_rcvif_psref(rcvif, &psref); rcvif = NULL; /* FIXME */ ifp = bridge_ia->ia_ifp; } #endif /* If we failed to find the source address then find * the target address. */ if (ia == NULL && find_source && !in_nullhost(itaddr)) { find_source = false; myaddr = itaddr; goto again; } if (ia != NULL) ia4_acquire(ia, &psref_ia); pserialize_read_exit(s); if (ah->ar_hln != ifp->if_addrlen) { ARP_STATINC(ARP_STAT_RCVBADLEN); log(LOG_WARNING, "arp from %s: addr len: new %d, i/f %d (ignored)\n", IN_PRINT(ipbuf, &isaddr), ah->ar_hln, ifp->if_addrlen); goto out; } /* Only do DaD if we have a matching address. */ do_dad = (ia != NULL); if (ia == NULL) { ia = in_get_ia_on_iface_psref(isaddr, rcvif, &psref_ia); if (ia == NULL) { ia = in_get_ia_from_ifp_psref(ifp, &psref_ia); if (ia == NULL) { ARP_STATINC(ARP_STAT_RCVNOINT); goto out; } } } myaddr = ia->ia_addr.sin_addr; /* XXX checks for bridge case? 
*/ if (!memcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { ARP_STATINC(ARP_STAT_RCVBCASTSHA); log(LOG_ERR, "%s: arp: link address is broadcast for IP address %s!\n", ifp->if_xname, IN_PRINT(ipbuf, &isaddr)); goto out; } /* * If the source IP address is zero, this is an RFC 5227 ARP probe */ if (in_nullhost(isaddr)) ARP_STATINC(ARP_STAT_RCVZEROSPA); else if (in_hosteq(isaddr, myaddr)) ARP_STATINC(ARP_STAT_RCVLOCALSPA); if (in_nullhost(itaddr)) ARP_STATINC(ARP_STAT_RCVZEROTPA); /* * DAD check, RFC 5227. * ARP sender hardware address must match the interface * address of the interface sending the packet. * Collision on sender address is always a duplicate. * Collision on target address is only a duplicate * IF the sender address is the null host (ie a DAD probe) * AND the message was broadcast * AND our address is either tentative or duplicated * If it was unicast then it's a valid Unicast Poll from RFC 1122. */ if (do_dad && (in_hosteq(isaddr, myaddr) || (in_nullhost(isaddr) && in_hosteq(itaddr, myaddr) && m->m_flags & M_BCAST && ia->ia4_flags & (IN_IFF_TENTATIVE | IN_IFF_DUPLICATED)))) { struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_ETHERNET_SRC); if (mtag == NULL || (ah->ar_hln == ETHER_ADDR_LEN && memcmp(mtag + 1, ar_sha(ah), ah->ar_hln) == 0)) { struct sockaddr_dl sdl, *sdlp; sdlp = sockaddr_dl_init(&sdl, sizeof(sdl), ifp->if_index, ifp->if_type, NULL, 0, ar_sha(ah), ah->ar_hln); arp_dad_duplicated((struct ifaddr *)ia, sdlp); goto out; } } /* * If the target IP address is zero, ignore the packet. * This prevents the code below from trying to answer * when we are using IP address zero (booting). */ if (in_nullhost(itaddr)) goto out; if (in_nullhost(isaddr)) goto reply; if (in_hosteq(itaddr, myaddr)) la = arpcreate(ifp, &isaddr, NULL, 1); else la = arplookup(ifp, &isaddr, NULL, 1); if (la == NULL) goto reply; if ((la->la_flags & LLE_VALID) && memcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) { char llabuf[LLA_ADDRSTRLEN], *llastr; llastr = lla_snprintf(llabuf, sizeof(llabuf), ar_sha(ah), ah->ar_hln); if (la->la_flags & LLE_STATIC) { ARP_STATINC(ARP_STAT_RCVOVERPERM); if (!log_permanent_modify) goto out; log(LOG_INFO, "%s tried to overwrite permanent arp info" " for %s\n", llastr, IN_PRINT(ipbuf, &isaddr)); goto out; } else if (la->lle_tbl->llt_ifp != ifp) { /* XXX should not happen? */ ARP_STATINC(ARP_STAT_RCVOVERINT); if (!log_wrong_iface) goto out; log(LOG_INFO, "%s on %s tried to overwrite " "arp info for %s on %s\n", llastr, ifp->if_xname, IN_PRINT(ipbuf, &isaddr), la->lle_tbl->llt_ifp->if_xname); goto out; } else { ARP_STATINC(ARP_STAT_RCVOVER); if (log_movements) log(LOG_INFO, "arp info overwritten " "for %s by %s\n", IN_PRINT(ipbuf, &isaddr), llastr); } rt_cmd = RTM_CHANGE; new_state = ND_LLINFO_STALE; } else { if (op == ARPOP_REPLY && in_hosteq(itaddr, myaddr)) { /* This was a solicited ARP reply. */ la->ln_byhint = 0; new_state = ND_LLINFO_REACHABLE; } rt_cmd = la->la_flags & LLE_VALID ? 
0 : RTM_ADD; } KASSERT(ifp->if_sadl->sdl_alen == ifp->if_addrlen); KASSERT(sizeof(la->ll_addr) >= ifp->if_addrlen); memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen); la->la_flags |= LLE_VALID; la->ln_asked = 0; if (new_state != 0) { la->ln_state = new_state; if (new_state != ND_LLINFO_REACHABLE || !(la->la_flags & LLE_STATIC)) { int timer = ND_TIMER_GC; if (new_state == ND_LLINFO_REACHABLE) timer = ND_TIMER_REACHABLE; nd_set_timer(la, timer); } } if (rt_cmd != 0) { struct sockaddr_in sin; sockaddr_in_init(&sin, &la->r_l3addr.addr4, 0); rt_clonedmsg(rt_cmd, NULL, sintosa(&sin), ar_sha(ah), ifp); } if (la->la_hold != NULL) { int n = la->la_numheld; struct mbuf *m_hold, *m_hold_next; struct sockaddr_in sin; sockaddr_in_init(&sin, &la->r_l3addr.addr4, 0); m_hold = la->la_hold; la->la_hold = NULL; la->la_numheld = 0; /* * We have to unlock here because if_output would call * arpresolve */ LLE_WUNLOCK(la); ARP_STATADD(ARP_STAT_DFRSENT, n); ARP_STATADD(ARP_STAT_DFRTOTAL, n); for (; m_hold != NULL; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; if_output_lock(ifp, ifp, m_hold, sintosa(&sin), NULL); } } else LLE_WUNLOCK(la); la = NULL; reply: if (la != NULL) { LLE_WUNLOCK(la); la = NULL; } if (op != ARPOP_REQUEST) { if (op == ARPOP_REPLY) ARP_STATINC(ARP_STAT_RCVREPLY); goto out; } ARP_STATINC(ARP_STAT_RCVREQUEST); if (in_hosteq(itaddr, myaddr)) { /* If our address is unusable, don't reply */ if (ia->ia4_flags & (IN_IFF_NOTREADY | IN_IFF_DETACHED)) goto out; /* I am the target */ tha = ar_tha(ah); if (tha) memcpy(tha, ar_sha(ah), ah->ar_hln); memcpy(ar_sha(ah), CLLADDR(ifp->if_sadl), ah->ar_hln); } else { /* Proxy ARP */ struct llentry *lle = NULL; struct sockaddr_in sin; #if NCARP > 0 if (ifp->if_type == IFT_CARP) { struct ifnet *_rcvif = m_get_rcvif(m, &s); int iftype = 0; if (__predict_true(_rcvif != NULL)) iftype = _rcvif->if_type; m_put_rcvif(_rcvif, &s); if (iftype != IFT_CARP) goto out; } #endif tha = ar_tha(ah); sockaddr_in_init(&sin, &itaddr, 0); IF_AFDATA_RLOCK(ifp); lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin); IF_AFDATA_RUNLOCK(ifp); if ((lle != NULL) && (lle->la_flags & LLE_PUB)) { if (tha) memcpy(tha, ar_sha(ah), ah->ar_hln); memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln); LLE_RUNLOCK(lle); } else { if (lle != NULL) LLE_RUNLOCK(lle); goto out; } } ia4_release(ia, &psref_ia); /* * XXX XXX: Here we're recycling the mbuf. But the mbuf could have * other mbufs in its chain, and just overwriting m->m_pkthdr.len * would be wrong in this case (the length becomes smaller than the * real chain size). * * This can theoretically cause bugs in the lower layers (drivers, * and L2encap), in some corner cases. */ memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); memcpy(ar_spa(ah), &itaddr, ah->ar_pln); ah->ar_op = htons(ARPOP_REPLY); ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! 
*/ switch (ifp->if_type) { case IFT_IEEE1394: /* ieee1394 arp reply is broadcast */ m->m_flags &= ~M_MCAST; m->m_flags |= M_BCAST; m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + ah->ar_hln; break; default: m->m_flags &= ~(M_BCAST|M_MCAST); /* never reply by broadcast */ m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); break; } m->m_pkthdr.len = m->m_len; sa.sa_family = AF_ARP; sa.sa_len = 2; arps = ARP_STAT_GETREF(); arps[ARP_STAT_SNDTOTAL]++; arps[ARP_STAT_SNDREPLY]++; ARP_STAT_PUTREF(); if_output_lock(ifp, ifp, m, &sa, NULL); if (rcvif != NULL) m_put_rcvif_psref(rcvif, &psref); return; out: if (la != NULL) LLE_WUNLOCK(la); if (ia != NULL) ia4_release(ia, &psref_ia); if (rcvif != NULL) m_put_rcvif_psref(rcvif, &psref); m_freem(m); } /* * Lookup or a new address in arptab. */ struct llentry * arplookup(struct ifnet *ifp, const struct in_addr *addr, const struct sockaddr *sa, int wlock) { struct sockaddr_in sin; struct llentry *la; int flags = wlock ? LLE_EXCLUSIVE : 0; if (sa == NULL) { KASSERT(addr != NULL); sockaddr_in_init(&sin, addr, 0); sa = sintocsa(&sin); } IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), flags, sa); IF_AFDATA_RUNLOCK(ifp); return la; } static struct llentry * arpcreate(struct ifnet *ifp, const struct in_addr *addr, const struct sockaddr *sa, int wlock) { struct sockaddr_in sin; struct llentry *la; int flags = wlock ? LLE_EXCLUSIVE : 0; if (sa == NULL) { KASSERT(addr != NULL); sockaddr_in_init(&sin, addr, 0); sa = sintocsa(&sin); } la = arplookup(ifp, addr, sa, wlock); if (la == NULL) { struct rtentry *rt; rt = rtalloc1(sa, 0); IF_AFDATA_WLOCK(ifp); la = lla_create(LLTABLE(ifp), flags, sa, rt); IF_AFDATA_WUNLOCK(ifp); if (rt != NULL) rt_unref(rt); if (la != NULL) la->ln_state = ND_LLINFO_NOSTATE; } return la; } int arpioctl(u_long cmd, void *data) { return EOPNOTSUPP; } void arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { struct in_ifaddr *ia = (struct in_ifaddr *)ifa; ifa->ifa_rtrequest = arp_rtrequest; ifa->ifa_flags |= RTF_CONNECTED; /* ARP will handle DAD for this address. */ if (in_nullhost(IA_SIN(ifa)->sin_addr)) { if (ia->ia_dad_stop != NULL) /* safety */ ia->ia_dad_stop(ifa); ia->ia_dad_start = NULL; ia->ia_dad_stop = NULL; ia->ia4_flags &= ~IN_IFF_TENTATIVE; } else { ia->ia_dad_start = arp_dad_start; ia->ia_dad_stop = arp_dad_stop; if (ia->ia4_flags & IN_IFF_TRYTENTATIVE && ip_dad_enabled()) ia->ia4_flags |= IN_IFF_TENTATIVE; else arpannounce1(ifa); } } static bool arp_nud_enabled(__unused struct ifnet *ifp) { return arp_perform_nud != 0; } static unsigned int arp_llinfo_reachable(__unused struct ifnet *ifp) { return arp_reachable; } static unsigned int arp_llinfo_retrans(__unused struct ifnet *ifp) { return arp_retrans; } /* * Gets source address of the first packet in hold queue * and stores it in @src. * Returns pointer to @src (if hold queue is not empty) or NULL. */ static union l3addr * arp_llinfo_holdsrc(struct llentry *ln, union l3addr *src) { struct ip *ip; if (ln == NULL || ln->ln_hold == NULL) return NULL; /* * assuming every packet in ln_hold has the same IP header */ ip = mtod(ln->ln_hold, struct ip *); /* XXX pullup? 
*/ if (sizeof(*ip) < ln->ln_hold->m_len) src->addr4 = ip->ip_src; else src = NULL; return src; } static void arp_llinfo_output(struct ifnet *ifp, __unused const union l3addr *daddr, const union l3addr *taddr, const uint8_t *tlladdr, const union l3addr *hsrc) { struct in_addr tip = taddr->addr4, sip = zeroin_addr; const uint8_t *slladdr = CLLADDR(ifp->if_sadl); if (hsrc != NULL) { struct in_ifaddr *ia; struct psref psref; ia = in_get_ia_on_iface_psref(hsrc->addr4, ifp, &psref); if (ia != NULL) { sip = hsrc->addr4; ia4_release(ia, &psref); } } if (sip.s_addr == INADDR_ANY) { struct sockaddr_in dst; struct rtentry *rt; sockaddr_in_init(&dst, &tip, 0); rt = rtalloc1(sintosa(&dst), 0); if (rt != NULL) { if (rt->rt_ifp == ifp && rt->rt_ifa != NULL && rt->rt_ifa->ifa_addr->sa_family == AF_INET) sip = satosin(rt->rt_ifa->ifa_addr)->sin_addr; rt_unref(rt); } if (sip.s_addr == INADDR_ANY) { char ipbuf[INET_ADDRSTRLEN]; log(LOG_DEBUG, "%s: source can't be " "determined: dst=%s\n", __func__, IN_PRINT(ipbuf, &tip)); return; } } arprequest(ifp, &sip, &tip, slladdr, tlladdr); } static void arp_llinfo_missed(struct ifnet *ifp, const union l3addr *taddr, __unused int16_t type, struct mbuf *m) { struct in_addr mdaddr = zeroin_addr; struct sockaddr_in dsin, tsin; struct sockaddr *sa; if (m != NULL) { struct ip *ip = mtod(m, struct ip *); if (sizeof(*ip) < m->m_len) mdaddr = ip->ip_src; /* ip_input() will send ICMP_UNREACH_HOST, not us. */ m_freem(m); } if (mdaddr.s_addr != INADDR_ANY) { sockaddr_in_init(&dsin, &mdaddr, 0); sa = sintosa(&dsin); } else sa = NULL; sockaddr_in_init(&tsin, &taddr->addr4, 0); rt_clonedmsg(RTM_MISS, sa, sintosa(&tsin), NULL, ifp); } static void arp_free(struct llentry *ln, int gc) { struct ifnet *ifp; KASSERT(ln != NULL); LLE_WLOCK_ASSERT(ln); ifp = ln->lle_tbl->llt_ifp; if (ln->la_flags & LLE_VALID || gc) { struct sockaddr_in sin; const char *lladdr; sockaddr_in_init(&sin, &ln->r_l3addr.addr4, 0); lladdr = ln->la_flags & LLE_VALID ? (const char *)&ln->ll_addr : NULL; rt_clonedmsg(RTM_DELETE, NULL, sintosa(&sin), lladdr, ifp); } /* * Save to unlock. We still hold an extra reference and will not * free(9) in llentry_free() if someone else holds one as well. */ LLE_WUNLOCK(ln); IF_AFDATA_LOCK(ifp); LLE_WLOCK(ln); lltable_free_entry(LLTABLE(ifp), ln); IF_AFDATA_UNLOCK(ifp); } /* * Upper-layer reachability hint for Neighbor Unreachability Detection. * * XXX cost-effective methods? 
*/ void arp_nud_hint(struct rtentry *rt) { struct llentry *ln; struct ifnet *ifp; if (rt == NULL) return; ifp = rt->rt_ifp; ln = arplookup(ifp, NULL, rt_getkey(rt), 1); nd_nud_hint(ln); } TAILQ_HEAD(dadq_head, dadq); struct dadq { TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; int dad_count; /* max ARP to send */ int dad_arp_tcount; /* # of trials to send ARP */ int dad_arp_ocount; /* ARP sent so far */ int dad_arp_announce; /* max ARP announcements */ int dad_arp_acount; /* # of announcements */ struct callout dad_timer_ch; }; static struct dadq_head dadq; static int dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ static kmutex_t arp_dad_lock; static void arp_dad_init(void) { TAILQ_INIT(&dadq); mutex_init(&arp_dad_lock, MUTEX_DEFAULT, IPL_NONE); } static struct dadq * arp_dad_find(struct ifaddr *ifa) { struct dadq *dp; KASSERT(mutex_owned(&arp_dad_lock)); TAILQ_FOREACH(dp, &dadq, dad_list) { if (dp->dad_ifa == ifa) return dp; } return NULL; } static void arp_dad_starttimer(struct dadq *dp, int ticks) { callout_reset(&dp->dad_timer_ch, ticks, (void (*)(void *))arp_dad_timer, dp); } static void arp_dad_stoptimer(struct dadq *dp) { KASSERT(mutex_owned(&arp_dad_lock)); TAILQ_REMOVE(&dadq, dp, dad_list); /* Tell the timer that dp is being destroyed. */ dp->dad_ifa = NULL; callout_halt(&dp->dad_timer_ch, &arp_dad_lock); } static void arp_dad_destroytimer(struct dadq *dp) { callout_destroy(&dp->dad_timer_ch); KASSERT(dp->dad_ifa == NULL); kmem_intr_free(dp, sizeof(*dp)); } static void arp_dad_output(struct dadq *dp, struct ifaddr *ifa) { struct in_ifaddr *ia = (struct in_ifaddr *)ifa; struct ifnet *ifp = ifa->ifa_ifp; struct in_addr sip; dp->dad_arp_tcount++; if ((ifp->if_flags & IFF_UP) == 0) return; if ((ifp->if_flags & IFF_RUNNING) == 0) return; dp->dad_arp_tcount = 0; dp->dad_arp_ocount++; memset(&sip, 0, sizeof(sip)); arprequest(ifa->ifa_ifp, &sip, &ia->ia_addr.sin_addr, CLLADDR(ifa->ifa_ifp->if_sadl), NULL); } /* * Start Duplicate Address Detection (DAD) for specified interface address. */ static void arp_dad_start(struct ifaddr *ifa) { struct in_ifaddr *ia = (struct in_ifaddr *)ifa; struct dadq *dp; char ipbuf[INET_ADDRSTRLEN]; /* * If we don't need DAD, don't do it. * - DAD is disabled */ if (!(ia->ia4_flags & IN_IFF_TENTATIVE)) { log(LOG_DEBUG, "%s: called with non-tentative address %s(%s)\n", __func__, IN_PRINT(ipbuf, &ia->ia_addr.sin_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } if (!ip_dad_enabled()) { ia->ia4_flags &= ~IN_IFF_TENTATIVE; rt_addrmsg(RTM_NEWADDR, ifa); arpannounce1(ifa); return; } KASSERT(ifa->ifa_ifp != NULL); if (!(ifa->ifa_ifp->if_flags & IFF_UP)) return; dp = kmem_intr_alloc(sizeof(*dp), KM_NOSLEEP); mutex_enter(&arp_dad_lock); if (arp_dad_find(ifa) != NULL) { mutex_exit(&arp_dad_lock); /* DAD already in progress */ if (dp != NULL) kmem_intr_free(dp, sizeof(*dp)); return; } if (dp == NULL) { mutex_exit(&arp_dad_lock); log(LOG_ERR, "%s: memory allocation failed for %s(%s)\n", __func__, IN_PRINT(ipbuf, &ia->ia_addr.sin_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } /* * Send ARP packet for DAD, ip_dad_count times. * Note that we must delay the first transmission. 
*/ callout_init(&dp->dad_timer_ch, CALLOUT_MPSAFE); dp->dad_ifa = ifa; ifaref(ifa); /* just for safety */ dp->dad_count = ip_dad_count; dp->dad_arp_announce = 0; /* Will be set when starting to announce */ dp->dad_arp_acount = dp->dad_arp_ocount = dp->dad_arp_tcount = 0; TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); ARPLOG(LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), ARPLOGADDR(&ia->ia_addr.sin_addr)); arp_dad_starttimer(dp, cprng_fast32() % (PROBE_WAIT * hz)); mutex_exit(&arp_dad_lock); } /* * terminate DAD unconditionally. used for address removals. */ static void arp_dad_stop(struct ifaddr *ifa) { struct dadq *dp; mutex_enter(&arp_dad_lock); dp = arp_dad_find(ifa); if (dp == NULL) { mutex_exit(&arp_dad_lock); /* DAD wasn't started yet */ return; } arp_dad_stoptimer(dp); mutex_exit(&arp_dad_lock); arp_dad_destroytimer(dp); ifafree(ifa); } static void arp_dad_timer(struct dadq *dp) { struct ifaddr *ifa; struct in_ifaddr *ia; char ipbuf[INET_ADDRSTRLEN]; bool need_free = false; KERNEL_LOCK_UNLESS_NET_MPSAFE(); mutex_enter(&arp_dad_lock); ifa = dp->dad_ifa; if (ifa == NULL) { /* dp is being destroyed by someone. Do nothing. */ goto done; } ia = (struct in_ifaddr *)ifa; if (ia->ia4_flags & IN_IFF_DUPLICATED) { log(LOG_ERR, "%s: called with duplicate address %s(%s)\n", __func__, IN_PRINT(ipbuf, &ia->ia_addr.sin_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto done; } if ((ia->ia4_flags & IN_IFF_TENTATIVE) == 0 && dp->dad_arp_acount == 0) { log(LOG_ERR, "%s: called with non-tentative address %s(%s)\n", __func__, IN_PRINT(ipbuf, &ia->ia_addr.sin_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto done; } /* timeouted with IFF_{RUNNING,UP} check */ if (dp->dad_arp_tcount > dad_maxtry) { ARPLOG(LOG_INFO, "%s: could not run DAD, driver problem?\n", if_name(ifa->ifa_ifp)); arp_dad_stoptimer(dp); need_free = true; goto done; } /* Need more checks? */ if (dp->dad_arp_ocount < dp->dad_count) { int adelay; /* * We have more ARP to go. Send ARP packet for DAD. */ arp_dad_output(dp, ifa); if (dp->dad_arp_ocount < dp->dad_count) adelay = (PROBE_MIN * hz) + (cprng_fast32() % ((PROBE_MAX * hz) - (PROBE_MIN * hz))); else adelay = ANNOUNCE_WAIT * hz; arp_dad_starttimer(dp, adelay); goto done; } else if (dp->dad_arp_acount == 0) { /* * We are done with DAD. * No duplicate address found. */ ia->ia4_flags &= ~IN_IFF_TENTATIVE; rt_addrmsg(RTM_NEWADDR, ifa); ARPLOG(LOG_DEBUG, "%s: DAD complete for %s - no duplicates found\n", if_name(ifa->ifa_ifp), ARPLOGADDR(&ia->ia_addr.sin_addr)); dp->dad_arp_announce = ANNOUNCE_NUM; goto announce; } else if (dp->dad_arp_acount < dp->dad_arp_announce) { announce: /* * Announce the address. 
*/ arpannounce1(ifa); dp->dad_arp_acount++; if (dp->dad_arp_acount < dp->dad_arp_announce) { arp_dad_starttimer(dp, ANNOUNCE_INTERVAL * hz); goto done; } ARPLOG(LOG_DEBUG, "%s: ARP announcement complete for %s\n", if_name(ifa->ifa_ifp), ARPLOGADDR(&ia->ia_addr.sin_addr)); } arp_dad_stoptimer(dp); need_free = true; done: mutex_exit(&arp_dad_lock); if (need_free) { arp_dad_destroytimer(dp); KASSERT(ifa != NULL); ifafree(ifa); } KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } static void arp_dad_duplicated(struct ifaddr *ifa, const struct sockaddr_dl *from) { struct in_ifaddr *ia = ifatoia(ifa); struct ifnet *ifp = ifa->ifa_ifp; char ipbuf[INET_ADDRSTRLEN], llabuf[LLA_ADDRSTRLEN]; const char *iastr, *llastr; iastr = IN_PRINT(ipbuf, &ia->ia_addr.sin_addr); if (__predict_false(from == NULL)) llastr = NULL; else llastr = lla_snprintf(llabuf, sizeof(llabuf), CLLADDR(from), from->sdl_alen); if (ia->ia4_flags & (IN_IFF_TENTATIVE|IN_IFF_DUPLICATED)) { log(LOG_ERR, "%s: DAD duplicate address %s from %s\n", if_name(ifp), iastr, llastr); } else if (ia->ia_dad_defended == 0 || ia->ia_dad_defended < time_uptime - DEFEND_INTERVAL) { ia->ia_dad_defended = time_uptime; arpannounce1(ifa); log(LOG_ERR, "%s: DAD defended address %s from %s\n", if_name(ifp), iastr, llastr); return; } else { /* If DAD is disabled, just report the duplicate. */ if (!ip_dad_enabled()) { log(LOG_ERR, "%s: DAD ignoring duplicate address %s from %s\n", if_name(ifp), iastr, llastr); return; } log(LOG_ERR, "%s: DAD defence failed for %s from %s\n", if_name(ifp), iastr, llastr); } arp_dad_stop(ifa); ia->ia4_flags &= ~IN_IFF_TENTATIVE; if ((ia->ia4_flags & IN_IFF_DUPLICATED) == 0) { ia->ia4_flags |= IN_IFF_DUPLICATED; /* Inform the routing socket of the duplicate address */ rt_addrmsg_src(RTM_NEWADDR, ifa, (const struct sockaddr *)from); } } /* * Called from 10 Mb/s Ethernet interrupt handlers * when ether packet type ETHERTYPE_REVARP * is received. Common length and type checks are done here, * then the protocol-specific routine is called. */ void revarpinput(struct mbuf *m) { struct arphdr *ar; int arplen; arplen = sizeof(struct arphdr); if (m->m_len < arplen && (m = m_pullup(m, arplen)) == NULL) return; ar = mtod(m, struct arphdr *); if (ntohs(ar->ar_hrd) == ARPHRD_IEEE1394) { goto out; } arplen = sizeof(struct arphdr) + 2 * (ar->ar_hln + ar->ar_pln); if (m->m_len < arplen && (m = m_pullup(m, arplen)) == NULL) return; ar = mtod(m, struct arphdr *); switch (ntohs(ar->ar_pro)) { case ETHERTYPE_IP: case ETHERTYPE_IPTRAILERS: in_revarpinput(m); return; default: break; } out: m_freem(m); } /* * RARP for Internet protocols on 10 Mb/s Ethernet. * Algorithm is that given in RFC 903. * We are only using for bootstrap purposes to get an ip address for one of * our interfaces. Thus we support no user-interface. * * Since the contents of the RARP reply are specific to the interface that * sent the request, this code must ensure that they are properly associated. * * Note: also supports ARP via RARP packets, per the RFC. 
*/ void in_revarpinput(struct mbuf *m) { struct arphdr *ah; void *tha; int op; struct ifnet *rcvif; int s; ah = mtod(m, struct arphdr *); op = ntohs(ah->ar_op); rcvif = m_get_rcvif(m, &s); if (__predict_false(rcvif == NULL)) goto out; if (rcvif->if_flags & IFF_NOARP) goto out; switch (rcvif->if_type) { case IFT_IEEE1394: /* ARP without target hardware address is not supported */ goto out; default: break; } switch (op) { case ARPOP_REQUEST: case ARPOP_REPLY: /* per RFC */ m_put_rcvif(rcvif, &s); in_arpinput(m); return; case ARPOP_REVREPLY: break; case ARPOP_REVREQUEST: /* handled by rarpd(8) */ default: goto out; } if (!revarp_in_progress) goto out; if (rcvif != myip_ifp) /* !same interface */ goto out; if (myip_initialized) goto wake; tha = ar_tha(ah); if (tha == NULL) goto out; if (ah->ar_pln != sizeof(struct in_addr)) goto out; if (ah->ar_hln != rcvif->if_sadl->sdl_alen) goto out; if (memcmp(tha, CLLADDR(rcvif->if_sadl), rcvif->if_sadl->sdl_alen)) goto out; memcpy(&srv_ip, ar_spa(ah), sizeof(srv_ip)); memcpy(&myip, ar_tpa(ah), sizeof(myip)); myip_initialized = 1; wake: /* Do wakeup every time in case it was missed. */ wakeup((void *)&myip); out: m_put_rcvif(rcvif, &s); m_freem(m); } /* * Send a RARP request for the ip address of the specified interface. * The request should be RFC 903-compliant. */ static void revarprequest(struct ifnet *ifp) { struct sockaddr sa; struct mbuf *m; struct arphdr *ah; void *tha; if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) return; MCLAIM(m, &arpdomain.dom_mowner); m->m_len = sizeof(*ah) + 2*sizeof(struct in_addr) + 2*ifp->if_addrlen; m->m_pkthdr.len = m->m_len; m_align(m, m->m_len); ah = mtod(m, struct arphdr *); memset(ah, 0, m->m_len); ah->ar_pro = htons(ETHERTYPE_IP); ah->ar_hln = ifp->if_addrlen; /* hardware address length */ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ ah->ar_op = htons(ARPOP_REVREQUEST); memcpy(ar_sha(ah), CLLADDR(ifp->if_sadl), ah->ar_hln); tha = ar_tha(ah); if (tha == NULL) { m_free(m); return; } memcpy(tha, CLLADDR(ifp->if_sadl), ah->ar_hln); sa.sa_family = AF_ARP; sa.sa_len = 2; m->m_flags |= M_BCAST; if_output_lock(ifp, ifp, m, &sa, NULL); } /* * RARP for the ip address of the specified interface, but also * save the ip address of the server that sent the answer. * Timeout if no response is received. 
*/ int revarpwhoarewe(struct ifnet *ifp, struct in_addr *serv_in, struct in_addr *clnt_in) { int result, count = 20; myip_initialized = 0; myip_ifp = ifp; revarp_in_progress = 1; while (count--) { revarprequest(ifp); result = tsleep((void *)&myip, PSOCK, "revarp", hz/2); if (result != EWOULDBLOCK) break; } revarp_in_progress = 0; if (!myip_initialized) return ENETUNREACH; memcpy(serv_in, &srv_ip, sizeof(*serv_in)); memcpy(clnt_in, &myip, sizeof(*clnt_in)); return 0; } void arp_stat_add(int type, uint64_t count) { ARP_STATADD(type, count); } static int sysctl_net_inet_arp_stats(SYSCTLFN_ARGS) { return NETSTAT_SYSCTL(arpstat_percpu, ARP_NSTATS); } static void sysctl_net_inet_arp_setup(struct sysctllog **clog) { const struct sysctlnode *node; sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet", NULL, NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL); sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "arp", SYSCTL_DESCR("Address Resolution Protocol"), NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd_delay", SYSCTL_DESCR("First probe delay time"), NULL, 0, &arp_nd_domain.nd_delay, 0, CTL_NET, PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd_bmaxtries", SYSCTL_DESCR("Number of broadcast discovery attempts"), NULL, 0, &arp_nd_domain.nd_mmaxtries, 0, CTL_NET, PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd_umaxtries", SYSCTL_DESCR("Number of unicast discovery attempts"), NULL, 0, &arp_nd_domain.nd_umaxtries, 0, CTL_NET, PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd_reachable", SYSCTL_DESCR("Reachable time"), NULL, 0, &arp_reachable, 0, CTL_NET, PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd_retrans", SYSCTL_DESCR("Retransmission time"), NULL, 0, &arp_retrans, 0, CTL_NET, PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd_nud", SYSCTL_DESCR("Perform neighbour unreachability detection"), NULL, 0, &arp_perform_nud, 0, CTL_NET, PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "nd_maxnudhint", SYSCTL_DESCR("Maximum neighbor unreachable hint count"), NULL, 0, &arp_nd_domain.nd_maxnudhint, 0, CTL_NET, PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxqueuelen", SYSCTL_DESCR("max packet queue len for a unresolved ARP"), NULL, 1, &arp_nd_domain.nd_maxqueuelen, 0, CTL_NET, PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("ARP statistics"), sysctl_net_inet_arp_stats, 0, NULL, 0, CTL_NET,PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "log_movements", SYSCTL_DESCR("log ARP replies from MACs different than" " the one in the cache"), NULL, 0, &log_movements, 0, CTL_NET,PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, 
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "log_permanent_modify", SYSCTL_DESCR("log ARP replies from MACs different than" " the one in the permanent arp entry"), NULL, 0, &log_permanent_modify, 0, CTL_NET,PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "log_wrong_iface", SYSCTL_DESCR("log ARP packets arriving on the wrong" " interface"), NULL, 0, &log_wrong_iface, 0, CTL_NET,PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "debug", SYSCTL_DESCR("Enable ARP DAD debug output"), NULL, 0, &arp_debug, 0, CTL_NET, PF_INET, node->sysctl_num, CTL_CREATE, CTL_EOL); } #endif /* INET */
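/*
 * Editor's illustrative sketch -- not part of if_arp.c.  It shows, as a
 * standalone userland program, the on-the-wire layout of the RFC 5227 ARP
 * probe that arp_dad_output() above asks arprequest() to build: an ARP
 * request whose sender protocol address is 0.0.0.0 and whose target
 * protocol address is the tentative address under test.  The struct and
 * constants are restated locally (EX_* names) so the example compiles
 * outside the kernel; the MAC and IPv4 address values are hypothetical.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>			/* htons(), ntohs() */

#define EX_ETHER_ADDR_LEN	6
#define EX_ARPHRD_ETHER		1	/* hardware type: Ethernet */
#define EX_ARPOP_REQUEST	1	/* ARP request opcode */
#define EX_ETHERTYPE_IP		0x0800	/* protocol type: IPv4 */

struct ex_ether_arp {			/* fixed Ethernet/IPv4 ARP layout */
	uint16_t ar_hrd;		/* hardware address format */
	uint16_t ar_pro;		/* protocol address format */
	uint8_t  ar_hln;		/* hardware address length */
	uint8_t  ar_pln;		/* protocol address length */
	uint16_t ar_op;			/* operation */
	uint8_t  ar_sha[EX_ETHER_ADDR_LEN];	/* sender hardware address */
	uint8_t  ar_spa[4];		/* sender protocol address */
	uint8_t  ar_tha[EX_ETHER_ADDR_LEN];	/* target hardware address */
	uint8_t  ar_tpa[4];		/* target protocol address */
};

int
main(void)
{
	/* Hypothetical interface MAC and tentative address 192.0.2.10. */
	const uint8_t mymac[EX_ETHER_ADDR_LEN] =
	    { 0x02, 0x00, 0x5e, 0x00, 0x00, 0x01 };
	const uint8_t tentative[4] = { 192, 0, 2, 10 };
	struct ex_ether_arp probe;

	memset(&probe, 0, sizeof(probe));
	probe.ar_hrd = htons(EX_ARPHRD_ETHER);
	probe.ar_pro = htons(EX_ETHERTYPE_IP);
	probe.ar_hln = EX_ETHER_ADDR_LEN;
	probe.ar_pln = sizeof(tentative);
	probe.ar_op  = htons(EX_ARPOP_REQUEST);
	memcpy(probe.ar_sha, mymac, sizeof(mymac));
	/* ar_spa stays 0.0.0.0: that is what marks the packet as a probe. */
	/* ar_tha stays all-zero: we do not yet know who owns the address. */
	memcpy(probe.ar_tpa, tentative, sizeof(tentative));

	printf("probe: op=%u spa=%u.%u.%u.%u tpa=%u.%u.%u.%u\n",
	    (unsigned)ntohs(probe.ar_op),
	    probe.ar_spa[0], probe.ar_spa[1], probe.ar_spa[2], probe.ar_spa[3],
	    probe.ar_tpa[0], probe.ar_tpa[1], probe.ar_tpa[2], probe.ar_tpa[3]);
	return 0;
}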
/* $NetBSD: ufs_quota.c,v 1.118 2023/02/22 21:49:45 riastradh Exp $ */ /* * Copyright (c) 1982, 1986, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ufs_quota.c,v 1.118 2023/02/22 21:49:45 riastradh Exp $"); #if defined(_KERNEL_OPT) #include "opt_quota.h" #endif #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/file.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/kauth.h> #include <sys/quotactl.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_quota.h> kmutex_t dqlock; kcondvar_t dqcv; const char *quotatypes[MAXQUOTAS] = INITQFNAMES; /* * Code pertaining to management of the in-core dquot data structures.
*/ #define DQHASH(dqvp, id) \ (((((long)(dqvp)) >> 8) + id) & dqhash) static LIST_HEAD(dqhashhead, dquot) *dqhashtbl; static u_long dqhash; static pool_cache_t dquot_cache; static int quota_handle_cmd_stat(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_idtypestat(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_objtypestat(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_get(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_put(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_cursorget(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_del(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_quotaon(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_quotaoff(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_cursoropen(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_cursorclose(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_cursorskipidtype(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_cursoratend(struct mount *, struct lwp *, struct quotactl_args *args); static int quota_handle_cmd_cursorrewind(struct mount *, struct lwp *, struct quotactl_args *args); /* * Initialize the quota fields of an inode. */ void ufsquota_init(struct inode *ip) { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } /* * Release the quota fields from an inode. */ void ufsquota_free(struct inode *ip) { int i; for (i = 0; i < MAXQUOTAS; i++) { dqrele(ITOV(ip), ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } /* * Update disk usage, and take corrective action. */ int chkdq(struct inode *ip, int64_t change, kauth_cred_t cred, int flags) { /* do not track snapshot usage, or we will deadlock */ if ((ip->i_flags & SF_SNAPSHOT) != 0) return 0; #ifdef QUOTA if (ip->i_ump->um_flags & UFS_QUOTA) return chkdq1(ip, change, cred, flags); #endif #ifdef QUOTA2 if (ip->i_ump->um_flags & UFS_QUOTA2) return chkdq2(ip, change, cred, flags); #endif return 0; } /* * Check the inode limit, applying corrective action. 
*/ int chkiq(struct inode *ip, int32_t change, kauth_cred_t cred, int flags) { /* do not track snapshot usage, or we will deadlock */ if ((ip->i_flags & SF_SNAPSHOT) != 0) return 0; #ifdef QUOTA if (ip->i_ump->um_flags & UFS_QUOTA) return chkiq1(ip, change, cred, flags); #endif #ifdef QUOTA2 if (ip->i_ump->um_flags & UFS_QUOTA2) return chkiq2(ip, change, cred, flags); #endif return 0; } int quota_handle_cmd(struct mount *mp, struct lwp *l, struct quotactl_args *args) { int error = 0; switch (args->qc_op) { case QUOTACTL_STAT: error = quota_handle_cmd_stat(mp, l, args); break; case QUOTACTL_IDTYPESTAT: error = quota_handle_cmd_idtypestat(mp, l, args); break; case QUOTACTL_OBJTYPESTAT: error = quota_handle_cmd_objtypestat(mp, l, args); break; case QUOTACTL_QUOTAON: error = quota_handle_cmd_quotaon(mp, l, args); break; case QUOTACTL_QUOTAOFF: error = quota_handle_cmd_quotaoff(mp, l, args); break; case QUOTACTL_GET: error = quota_handle_cmd_get(mp, l, args); break; case QUOTACTL_PUT: error = quota_handle_cmd_put(mp, l, args); break; case QUOTACTL_CURSORGET: error = quota_handle_cmd_cursorget(mp, l, args); break; case QUOTACTL_DEL: error = quota_handle_cmd_del(mp, l, args); break; case QUOTACTL_CURSOROPEN: error = quota_handle_cmd_cursoropen(mp, l, args); break; case QUOTACTL_CURSORCLOSE: error = quota_handle_cmd_cursorclose(mp, l, args); break; case QUOTACTL_CURSORSKIPIDTYPE: error = quota_handle_cmd_cursorskipidtype(mp, l, args); break; case QUOTACTL_CURSORATEND: error = quota_handle_cmd_cursoratend(mp, l, args); break; case QUOTACTL_CURSORREWIND: error = quota_handle_cmd_cursorrewind(mp, l, args); break; default: panic("Invalid quotactl operation %d\n", args->qc_op); } return error; } static int quota_handle_cmd_stat(struct mount *mp, struct lwp *l, struct quotactl_args *args) { struct ufsmount *ump = VFSTOUFS(mp); struct quotastat *info; KASSERT(args->qc_op == QUOTACTL_STAT); info = args->u.stat.qc_info; if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) return EOPNOTSUPP; #ifdef QUOTA if (ump->um_flags & UFS_QUOTA) { strcpy(info->qs_implname, "ufs/ffs quota v1"); info->qs_numidtypes = MAXQUOTAS; /* XXX no define for this */ info->qs_numobjtypes = 2; info->qs_restrictions = 0; info->qs_restrictions |= QUOTA_RESTRICT_NEEDSQUOTACHECK; info->qs_restrictions |= QUOTA_RESTRICT_UNIFORMGRACE; info->qs_restrictions |= QUOTA_RESTRICT_32BIT; } else #endif #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) { strcpy(info->qs_implname, "ufs/ffs quota v2"); info->qs_numidtypes = MAXQUOTAS; info->qs_numobjtypes = N_QL; info->qs_restrictions = 0; } else #endif return EOPNOTSUPP; return 0; } static int quota_handle_cmd_idtypestat(struct mount *mp, struct lwp *l, struct quotactl_args *args) { struct ufsmount *ump = VFSTOUFS(mp); int idtype; struct quotaidtypestat *info; const char *name; KASSERT(args->qc_op == QUOTACTL_IDTYPESTAT); idtype = args->u.idtypestat.qc_idtype; info = args->u.idtypestat.qc_info; if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) return EOPNOTSUPP; /* * These are the same for both QUOTA and QUOTA2. 
*/ switch (idtype) { case QUOTA_IDTYPE_USER: name = "user"; break; case QUOTA_IDTYPE_GROUP: name = "group"; break; default: return EINVAL; } strlcpy(info->qis_name, name, sizeof(info->qis_name)); return 0; } static int quota_handle_cmd_objtypestat(struct mount *mp, struct lwp *l, struct quotactl_args *args) { struct ufsmount *ump = VFSTOUFS(mp); int objtype; struct quotaobjtypestat *info; const char *name; int isbytes; KASSERT(args->qc_op == QUOTACTL_OBJTYPESTAT); objtype = args->u.objtypestat.qc_objtype; info = args->u.objtypestat.qc_info; if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) return EOPNOTSUPP; /* * These are the same for both QUOTA and QUOTA2. */ switch (objtype) { case QUOTA_OBJTYPE_BLOCKS: name = "block"; isbytes = 1; break; case QUOTA_OBJTYPE_FILES: name = "file"; isbytes = 0; break; default: return EINVAL; } strlcpy(info->qos_name, name, sizeof(info->qos_name)); info->qos_isbytes = isbytes; return 0; } /* XXX shouldn't all this be in kauth ? */ static int quota_get_auth(struct mount *mp, struct lwp *l, uid_t id) { /* The user can always query about his own quota. */ if (id == kauth_cred_geteuid(l->l_cred)) return 0; return kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, KAUTH_ARG(id), NULL); } static int quota_handle_cmd_get(struct mount *mp, struct lwp *l, struct quotactl_args *args) { struct ufsmount *ump = VFSTOUFS(mp); int error; const struct quotakey *qk; struct quotaval *qv; KASSERT(args->qc_op == QUOTACTL_GET); qk = args->u.get.qc_key; qv = args->u.get.qc_val; if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) return EOPNOTSUPP; error = quota_get_auth(mp, l, qk->qk_id); if (error != 0) return error; #ifdef QUOTA if (ump->um_flags & UFS_QUOTA) { error = quota1_handle_cmd_get(ump, qk, qv); } else #endif #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) { error = quota2_handle_cmd_get(ump, qk, qv); } else #endif panic("quota_handle_cmd_get: no support ?"); if (error != 0) return error; return error; } static int quota_handle_cmd_put(struct mount *mp, struct lwp *l, struct quotactl_args *args) { struct ufsmount *ump = VFSTOUFS(mp); const struct quotakey *qk; const struct quotaval *qv; id_t kauth_id; int error; KASSERT(args->qc_op == QUOTACTL_PUT); qk = args->u.put.qc_key; qv = args->u.put.qc_val; if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) return EOPNOTSUPP; kauth_id = qk->qk_id; if (kauth_id == QUOTA_DEFAULTID) { kauth_id = 0; } error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(kauth_id), NULL); if (error != 0) { return error; } #ifdef QUOTA if (ump->um_flags & UFS_QUOTA) error = quota1_handle_cmd_put(ump, qk, qv); else #endif #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) { error = quota2_handle_cmd_put(ump, qk, qv); } else #endif panic("quota_handle_cmd_get: no support ?"); if (error == ENOENT) { error = 0; } return error; } static int quota_handle_cmd_del(struct mount *mp, struct lwp *l, struct quotactl_args *args) { struct ufsmount *ump = VFSTOUFS(mp); const struct quotakey *qk; id_t kauth_id; int error; KASSERT(args->qc_op == QUOTACTL_DEL); qk = args->u.del.qc_key; kauth_id = qk->qk_id; if (kauth_id == QUOTA_DEFAULTID) { kauth_id = 0; } if ((ump->um_flags & UFS_QUOTA2) == 0) return EOPNOTSUPP; /* avoid whitespace changes */ { error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(kauth_id), NULL); if (error != 0) goto err; #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) { error = 
quota2_handle_cmd_del(ump, qk); } else #endif panic("quota_handle_cmd_get: no support ?"); if (error && error != ENOENT) goto err; } return 0; err: return error; } static int quota_handle_cmd_cursorget(struct mount *mp, struct lwp *l, struct quotactl_args *args) { struct ufsmount *ump = VFSTOUFS(mp); int error; KASSERT(args->qc_op == QUOTACTL_CURSORGET); if ((ump->um_flags & UFS_QUOTA2) == 0) return EOPNOTSUPP; error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, NULL, NULL); if (error) return error; #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) { struct quotakcursor *cursor = args->u.cursorget.qc_cursor; struct quotakey *keys = args->u.cursorget.qc_keys; struct quotaval *vals = args->u.cursorget.qc_vals; unsigned maxnum = args->u.cursorget.qc_maxnum; unsigned *ret = args->u.cursorget.qc_ret; error = quota2_handle_cmd_cursorget(ump, cursor, keys, vals, maxnum, ret); } else #endif panic("quota_handle_cmd_cursorget: no support ?"); return error; } static int quota_handle_cmd_cursoropen(struct mount *mp, struct lwp *l, struct quotactl_args *args) { #ifdef QUOTA2 struct ufsmount *ump = VFSTOUFS(mp); struct quotakcursor *cursor = args->u.cursoropen.qc_cursor; #endif int error; KASSERT(args->qc_op == QUOTACTL_CURSOROPEN); error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, NULL, NULL); if (error) return error; #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) { error = quota2_handle_cmd_cursoropen(ump, cursor); } else #endif error = EOPNOTSUPP; return error; } static int quota_handle_cmd_cursorclose(struct mount *mp, struct lwp *l, struct quotactl_args *args) { #ifdef QUOTA2 struct ufsmount *ump = VFSTOUFS(mp); struct quotakcursor *cursor = args->u.cursorclose.qc_cursor; #endif int error; KASSERT(args->qc_op == QUOTACTL_CURSORCLOSE); error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, NULL, NULL); if (error) return error; #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) { error = quota2_handle_cmd_cursorclose(ump, cursor); } else #endif error = EOPNOTSUPP; return error; } static int quota_handle_cmd_cursorskipidtype(struct mount *mp, struct lwp *l, struct quotactl_args *args) { #ifdef QUOTA2 struct ufsmount *ump = VFSTOUFS(mp); struct quotakcursor *cursor = args->u.cursorskipidtype.qc_cursor; int idtype = args->u.cursorskipidtype.qc_idtype; #endif int error; KASSERT(args->qc_op == QUOTACTL_CURSORSKIPIDTYPE); #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) { error = quota2_handle_cmd_cursorskipidtype(ump, cursor, idtype); } else #endif error = EOPNOTSUPP; return error; } static int quota_handle_cmd_cursoratend(struct mount *mp, struct lwp *l, struct quotactl_args *args) { #ifdef QUOTA2 struct ufsmount *ump = VFSTOUFS(mp); struct quotakcursor *cursor = args->u.cursoratend.qc_cursor; unsigned *ret = args->u.cursoratend.qc_ret; #endif int error; KASSERT(args->qc_op == QUOTACTL_CURSORATEND); #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) { error = quota2_handle_cmd_cursoratend(ump, cursor, ret); } else #endif error = EOPNOTSUPP; return error; } static int quota_handle_cmd_cursorrewind(struct mount *mp, struct lwp *l, struct quotactl_args *args) { #ifdef QUOTA2 struct ufsmount *ump = VFSTOUFS(mp); struct quotakcursor *cursor = args->u.cursorrewind.qc_cursor; #endif int error; KASSERT(args->qc_op == QUOTACTL_CURSORREWIND); #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) { error = quota2_handle_cmd_cursorrewind(ump, cursor); } else #endif error = EOPNOTSUPP; return error; } 
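/*
 * Editor's illustrative sketch -- not part of ufs_quota.c.  quota_handle_cmd()
 * above is a single entry point that switches on args->qc_op and trusts the
 * matching member of the args->u union to be the one the caller filled in.
 * The toy program below shows that tagged-union dispatch pattern in
 * isolation, with made-up operation codes and argument types; none of these
 * names exist in the kernel.
 */
#include <stdio.h>

enum toy_op { TOY_GET, TOY_PUT };

struct toy_args {
	enum toy_op qc_op;		/* selects the valid union member */
	union {
		struct { int id; } get;
		struct { int id; long limit; } put;
	} u;
};

static int
toy_handle_cmd(struct toy_args *args)
{
	switch (args->qc_op) {
	case TOY_GET:
		printf("get quota for id %d\n", args->u.get.id);
		return 0;
	case TOY_PUT:
		printf("set limit %ld for id %d\n",
		    args->u.put.limit, args->u.put.id);
		return 0;
	default:
		return -1;		/* the kernel code panics here instead */
	}
}

int
main(void)
{
	struct toy_args a = { .qc_op = TOY_PUT };

	a.u.put.id = 1000;
	a.u.put.limit = 4096;
	return toy_handle_cmd(&a);
}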
static int quota_handle_cmd_quotaon(struct mount *mp, struct lwp *l, struct quotactl_args *args) { struct ufsmount *ump = VFSTOUFS(mp); int error; KASSERT(args->qc_op == QUOTACTL_QUOTAON); if ((ump->um_flags & UFS_QUOTA2) != 0) return EBUSY; error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL); if (error != 0) { return error; } #ifdef QUOTA int idtype = args->u.quotaon.qc_idtype; const char *qfile = args->u.quotaon.qc_quotafile; error = quota1_handle_cmd_quotaon(l, ump, idtype, qfile); #else error = EOPNOTSUPP; #endif return error; } static int quota_handle_cmd_quotaoff(struct mount *mp, struct lwp *l, struct quotactl_args *args) { struct ufsmount *ump = VFSTOUFS(mp); int error; KASSERT(args->qc_op == QUOTACTL_QUOTAOFF); if ((ump->um_flags & UFS_QUOTA2) != 0) return EOPNOTSUPP; error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL); if (error != 0) { return error; } #ifdef QUOTA int idtype = args->u.quotaoff.qc_idtype; error = quota1_handle_cmd_quotaoff(l, ump, idtype); #else error = EOPNOTSUPP; #endif return error; } /* * Initialize the quota system. */ void dqinit(void) { mutex_init(&dqlock, MUTEX_DEFAULT, IPL_NONE); cv_init(&dqcv, "quota"); dqhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &dqhash); dquot_cache = pool_cache_init(sizeof(struct dquot), 0, 0, 0, "ufsdq", NULL, IPL_NONE, NULL, NULL, NULL); } void dqreinit(void) { struct dquot *dq; struct dqhashhead *oldhash, *hash; struct vnode *dqvp; u_long oldmask, mask, hashval; int i; hash = hashinit(desiredvnodes, HASH_LIST, true, &mask); mutex_enter(&dqlock); oldhash = dqhashtbl; oldmask = dqhash; dqhashtbl = hash; dqhash = mask; for (i = 0; i <= oldmask; i++) { while ((dq = LIST_FIRST(&oldhash[i])) != NULL) { dqvp = dq->dq_ump->um_quotas[dq->dq_type]; LIST_REMOVE(dq, dq_hash); hashval = DQHASH(dqvp, dq->dq_id); LIST_INSERT_HEAD(&dqhashtbl[hashval], dq, dq_hash); } } mutex_exit(&dqlock); hashdone(oldhash, HASH_LIST, oldmask); } /* * Free resources held by quota system. */ void dqdone(void) { pool_cache_destroy(dquot_cache); hashdone(dqhashtbl, HASH_LIST, dqhash); cv_destroy(&dqcv); mutex_destroy(&dqlock); } /* * Set up the quotas for an inode. * * This routine completely defines the semantics of quotas. * If other criterion want to be used to establish quotas, the * MAXQUOTAS value in quotas.h should be increased, and the * additional dquots set up here. */ int getinoquota(struct inode *ip) { struct ufsmount *ump = ip->i_ump; struct vnode *vp = ITOV(ip); int i, error; u_int32_t ino_ids[MAXQUOTAS]; /* * To avoid deadlocks never update quotas for quota files * on the same file system */ for (i = 0; i < MAXQUOTAS; i++) if (vp == ump->um_quotas[i]) return 0; ino_ids[USRQUOTA] = ip->i_uid; ino_ids[GRPQUOTA] = ip->i_gid; for (i = 0; i < MAXQUOTAS; i++) { /* * If the file id changed the quota needs update. */ if (ip->i_dquot[i] != NODQUOT && ip->i_dquot[i]->dq_id != ino_ids[i]) { dqrele(ITOV(ip), ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } /* * Set up the quota based on file id. * ENODEV means that quotas are not enabled. */ if (ip->i_dquot[i] == NODQUOT && (error = dqget(vp, ino_ids[i], ump, i, &ip->i_dquot[i])) && error != ENODEV) return (error); } return 0; } /* * Obtain a dquot structure for the specified identifier and quota file * reading the information from the file if necessary. 
*/ int dqget(struct vnode *vp, u_long id, struct ufsmount *ump, int type, struct dquot **dqp) { struct dquot *dq, *ndq; struct dqhashhead *dqh; struct vnode *dqvp; int error = 0; /* XXX gcc */ /* Lock to see an up to date value for QTF_CLOSING. */ mutex_enter(&dqlock); if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) { mutex_exit(&dqlock); *dqp = NODQUOT; return (ENODEV); } dqvp = ump->um_quotas[type]; #ifdef QUOTA if (ump->um_flags & UFS_QUOTA) { if (dqvp == NULLVP || (ump->umq1_qflags[type] & QTF_CLOSING)) { mutex_exit(&dqlock); *dqp = NODQUOT; return (ENODEV); } } #endif #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) { if (dqvp == NULLVP) { mutex_exit(&dqlock); *dqp = NODQUOT; return (ENODEV); } } #endif KASSERT(dqvp != vp); /* * Check the cache first. */ dqh = &dqhashtbl[DQHASH(dqvp, id)]; LIST_FOREACH(dq, dqh, dq_hash) { if (dq->dq_id != id || dq->dq_ump->um_quotas[dq->dq_type] != dqvp) continue; KASSERT(dq->dq_cnt > 0); dqref(dq); mutex_exit(&dqlock); *dqp = dq; return (0); } /* * Not in cache, allocate a new one. */ mutex_exit(&dqlock); ndq = pool_cache_get(dquot_cache, PR_WAITOK); /* * Initialize the contents of the dquot structure. */ memset((char *)ndq, 0, sizeof *ndq); ndq->dq_flags = 0; ndq->dq_id = id; ndq->dq_ump = ump; ndq->dq_type = type; mutex_init(&ndq->dq_interlock, MUTEX_DEFAULT, IPL_NONE); mutex_enter(&dqlock); dqh = &dqhashtbl[DQHASH(dqvp, id)]; LIST_FOREACH(dq, dqh, dq_hash) { if (dq->dq_id != id || dq->dq_ump->um_quotas[dq->dq_type] != dqvp) continue; /* * Another thread beat us allocating this dquot. */ KASSERT(dq->dq_cnt > 0); dqref(dq); mutex_exit(&dqlock); mutex_destroy(&ndq->dq_interlock); pool_cache_put(dquot_cache, ndq); *dqp = dq; return 0; } dq = ndq; LIST_INSERT_HEAD(dqh, dq, dq_hash); dqref(dq); mutex_enter(&dq->dq_interlock); mutex_exit(&dqlock); #ifdef QUOTA if (ump->um_flags & UFS_QUOTA) error = dq1get(dqvp, id, ump, type, dq); #endif #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) error = dq2get(dqvp, id, ump, type, dq); #endif /* * I/O error in reading quota file, release * quota structure and reflect problem to caller. */ if (error) { mutex_enter(&dqlock); LIST_REMOVE(dq, dq_hash); mutex_exit(&dqlock); mutex_exit(&dq->dq_interlock); dqrele(vp, dq); *dqp = NODQUOT; return (error); } mutex_exit(&dq->dq_interlock); *dqp = dq; return (0); } /* * Obtain a reference to a dquot. */ void dqref(struct dquot *dq) { KASSERT(mutex_owned(&dqlock)); dq->dq_cnt++; KASSERT(dq->dq_cnt > 0); } /* * Release a reference to a dquot. */ void dqrele(struct vnode *vp, struct dquot *dq) { if (dq == NODQUOT) return; mutex_enter(&dq->dq_interlock); for (;;) { mutex_enter(&dqlock); if (dq->dq_cnt > 1) { dq->dq_cnt--; mutex_exit(&dqlock); mutex_exit(&dq->dq_interlock); return; } if ((dq->dq_flags & DQ_MOD) == 0) break; mutex_exit(&dqlock); #ifdef QUOTA if (dq->dq_ump->um_flags & UFS_QUOTA) (void) dq1sync(vp, dq); #endif #ifdef QUOTA2 if (dq->dq_ump->um_flags & UFS_QUOTA2) (void) dq2sync(vp, dq); #endif } KASSERT(dq->dq_cnt == 1 && (dq->dq_flags & DQ_MOD) == 0); LIST_REMOVE(dq, dq_hash); mutex_exit(&dqlock); mutex_exit(&dq->dq_interlock); mutex_destroy(&dq->dq_interlock); pool_cache_put(dquot_cache, dq); } int qsync(struct mount *mp) { struct ufsmount *ump = VFSTOUFS(mp); #ifdef QUOTA if (ump->um_flags & UFS_QUOTA) return q1sync(mp); #endif #ifdef QUOTA2 if (ump->um_flags & UFS_QUOTA2) return q2sync(mp); #endif return 0; } #ifdef DIAGNOSTIC /* * Check the hash chains for stray dquot's. 
*/ void dqflush(struct vnode *vp) { struct dquot *dq; int i; mutex_enter(&dqlock); for (i = 0; i <= dqhash; i++) LIST_FOREACH(dq, &dqhashtbl[i], dq_hash) KASSERT(dq->dq_ump->um_quotas[dq->dq_type] != vp); mutex_exit(&dqlock); } #endif
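/*
 * Editor's illustrative sketch -- not part of ufs_quota.c.  dqget() above
 * indexes the in-core dquot hash table with DQHASH(dqvp, id), which mixes
 * the address of the per-type quota file vnode with the user/group id and
 * masks the result with the table size minus one.  The standalone program
 * below evaluates the same expression for a hypothetical vnode address and
 * a few ids, assuming a 256-bucket table (mask 0xff); in the kernel the
 * mask comes from hashinit(desiredvnodes, ...).
 */
#include <stdio.h>

#define EX_DQHASH(dqvp, id, mask) \
	(((((long)(dqvp)) >> 8) + (id)) & (mask))

int
main(void)
{
	char fake_vnode;			/* stand-in for a struct vnode */
	const void *dqvp = &fake_vnode;		/* hypothetical quota file vnode */
	const unsigned long mask = 0xff;	/* 256 buckets, power of two - 1 */
	unsigned long ids[] = { 0, 1000, 1001, 65534 };
	size_t i;

	for (i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
		printf("id %-5lu -> bucket %lu\n", ids[i],
		    (unsigned long)EX_DQHASH(dqvp, ids[i], mask));
	return 0;
}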
/* $NetBSD: vfs_trans.c,v 1.70 2022/11/04 11:20:39 hannken Exp $ */ /*- * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Juergen Hannken-Illjes. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.70 2022/11/04 11:20:39 hannken Exp $"); /* * File system transaction operations.
*/ #ifdef _KERNEL_OPT #include "opt_ddb.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/atomic.h> #include <sys/buf.h> #include <sys/hash.h> #include <sys/kmem.h> #include <sys/mount.h> #include <sys/pserialize.h> #include <sys/vnode.h> #include <sys/fstrans.h> #include <sys/proc.h> #include <sys/pool.h> #include <miscfs/deadfs/deadfs.h> #include <miscfs/specfs/specdev.h> #define FSTRANS_MOUNT_HASHSIZE 32 enum fstrans_lock_type { FSTRANS_LAZY, /* Granted while not suspended */ FSTRANS_SHARED /* Granted while not suspending */ }; struct fscow_handler { LIST_ENTRY(fscow_handler) ch_list; int (*ch_func)(void *, struct buf *, bool); void *ch_arg; }; struct fstrans_lwp_info { struct fstrans_lwp_info *fli_succ; struct lwp *fli_self; struct mount *fli_mount; struct fstrans_lwp_info *fli_alias; struct fstrans_mount_info *fli_mountinfo; int fli_trans_cnt; int fli_alias_cnt; int fli_cow_cnt; enum fstrans_lock_type fli_lock_type; LIST_ENTRY(fstrans_lwp_info) fli_list; }; struct fstrans_mount_info { enum fstrans_state fmi_state; unsigned int fmi_ref_cnt; bool fmi_gone; bool fmi_cow_change; SLIST_ENTRY(fstrans_mount_info) fmi_hash; LIST_HEAD(, fscow_handler) fmi_cow_handler; struct mount *fmi_mount; struct fstrans_mount_info *fmi_lower_info; struct lwp *fmi_owner; }; SLIST_HEAD(fstrans_mount_hashhead, fstrans_mount_info); static kmutex_t vfs_suspend_lock /* Serialize suspensions. */ __cacheline_aligned; static kmutex_t fstrans_lock /* Fstrans big lock. */ __cacheline_aligned; static kcondvar_t fstrans_state_cv; /* Fstrans or cow state changed. */ static kcondvar_t fstrans_count_cv; /* Fstrans or cow count changed. */ static pserialize_t fstrans_psz; /* Pserialize state. */ static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head; /* List of all fstrans_lwp_info. */ static pool_cache_t fstrans_lwp_cache; /* Cache of fstrans_lwp_info. */ static u_long fstrans_mount_hashmask; static struct fstrans_mount_hashhead *fstrans_mount_hashtab; static int fstrans_gone_count; /* Number of fstrans_mount_info gone. */ static inline uint32_t fstrans_mount_hash(struct mount *); static inline struct fstrans_mount_info *fstrans_mount_get(struct mount *); static void fstrans_mount_dtor(struct fstrans_mount_info *); static void fstrans_clear_lwp_info(void); static inline struct fstrans_lwp_info * fstrans_get_lwp_info(struct mount *, bool); static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *); static int fstrans_lwp_pcc(void *, void *, int); static void fstrans_lwp_pcd(void *, void *); static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int); static bool grant_lock(const struct fstrans_mount_info *, const enum fstrans_lock_type); static bool state_change_done(const struct fstrans_mount_info *); static bool cow_state_change_done(const struct fstrans_mount_info *); static void cow_change_enter(struct fstrans_mount_info *); static void cow_change_done(struct fstrans_mount_info *); /* * Initialize. 
*/ void fstrans_init(void) { mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE); mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&fstrans_state_cv, "fstchg"); cv_init(&fstrans_count_cv, "fstcnt"); fstrans_psz = pserialize_create(); LIST_INIT(&fstrans_fli_head); fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info), coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE, fstrans_lwp_pcc, fstrans_lwp_pcd, NULL); KASSERT(fstrans_lwp_cache != NULL); fstrans_mount_hashtab = hashinit(FSTRANS_MOUNT_HASHSIZE, HASH_SLIST, true, &fstrans_mount_hashmask); } /* * pool_cache constructor for fstrans_lwp_info. Updating the global list * produces cache misses on MP. Minimise by keeping free entries on list. */ int fstrans_lwp_pcc(void *arg, void *obj, int flags) { struct fstrans_lwp_info *fli = obj; memset(fli, 0, sizeof(*fli)); mutex_enter(&fstrans_lock); LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list); mutex_exit(&fstrans_lock); return 0; } /* * pool_cache destructor */ void fstrans_lwp_pcd(void *arg, void *obj) { struct fstrans_lwp_info *fli = obj; mutex_enter(&fstrans_lock); LIST_REMOVE(fli, fli_list); mutex_exit(&fstrans_lock); } /* * Deallocate lwp state. */ void fstrans_lwp_dtor(lwp_t *l) { struct fstrans_lwp_info *fli, *fli_next; if (l->l_fstrans == NULL) return; mutex_enter(&fstrans_lock); for (fli = l->l_fstrans; fli; fli = fli_next) { KASSERT(fli->fli_trans_cnt == 0); KASSERT(fli->fli_cow_cnt == 0); KASSERT(fli->fli_self == l); if (fli->fli_mount != NULL) fstrans_mount_dtor(fli->fli_mountinfo); fli_next = fli->fli_succ; fli->fli_alias_cnt = 0; fli->fli_mount = NULL; fli->fli_alias = NULL; fli->fli_mountinfo = NULL; fli->fli_self = NULL; } mutex_exit(&fstrans_lock); for (fli = l->l_fstrans; fli; fli = fli_next) { fli_next = fli->fli_succ; pool_cache_put(fstrans_lwp_cache, fli); } l->l_fstrans = NULL; } /* * mount pointer to hash */ static inline uint32_t fstrans_mount_hash(struct mount *mp) { return hash32_buf(&mp, sizeof(mp), HASH32_BUF_INIT) & fstrans_mount_hashmask; } /* * retrieve fstrans_mount_info by mount or NULL */ static inline struct fstrans_mount_info * fstrans_mount_get(struct mount *mp) { uint32_t indx; struct fstrans_mount_info *fmi, *fmi_lower; KASSERT(mutex_owned(&fstrans_lock)); indx = fstrans_mount_hash(mp); SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash) { if (fmi->fmi_mount == mp) { if (__predict_false(mp->mnt_lower != NULL && fmi->fmi_lower_info == NULL)) { /* * Intern the lower/lowest mount into * this mount info on first lookup. */ KASSERT(fmi->fmi_ref_cnt == 1); fmi_lower = fstrans_mount_get(mp->mnt_lower); if (fmi_lower && fmi_lower->fmi_lower_info) fmi_lower = fmi_lower->fmi_lower_info; if (fmi_lower == NULL) return NULL; fmi->fmi_lower_info = fmi_lower; fmi->fmi_lower_info->fmi_ref_cnt += 1; } return fmi; } } return NULL; } /* * Dereference mount state. */ static void fstrans_mount_dtor(struct fstrans_mount_info *fmi) { KASSERT(mutex_owned(&fstrans_lock)); KASSERT(fmi != NULL); fmi->fmi_ref_cnt -= 1; if (__predict_true(fmi->fmi_ref_cnt > 0)) { return; } KASSERT(fmi->fmi_state == FSTRANS_NORMAL); KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL); KASSERT(fmi->fmi_owner == NULL); if (fmi->fmi_lower_info) fstrans_mount_dtor(fmi->fmi_lower_info); KASSERT(fstrans_gone_count > 0); fstrans_gone_count -= 1; KASSERT(fmi->fmi_mount->mnt_lower == NULL); kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount)); kmem_free(fmi, sizeof(*fmi)); } /* * Allocate mount state. 
*/ int fstrans_mount(struct mount *mp) { uint32_t indx; struct fstrans_mount_info *newfmi; indx = fstrans_mount_hash(mp); newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP); newfmi->fmi_state = FSTRANS_NORMAL; newfmi->fmi_ref_cnt = 1; newfmi->fmi_gone = false; LIST_INIT(&newfmi->fmi_cow_handler); newfmi->fmi_cow_change = false; newfmi->fmi_mount = mp; newfmi->fmi_lower_info = NULL; newfmi->fmi_owner = NULL; mutex_enter(&fstrans_lock); SLIST_INSERT_HEAD(&fstrans_mount_hashtab[indx], newfmi, fmi_hash); mutex_exit(&fstrans_lock); return 0; } /* * Deallocate mount state. */ void fstrans_unmount(struct mount *mp) { uint32_t indx; struct fstrans_mount_info *fmi; indx = fstrans_mount_hash(mp); mutex_enter(&fstrans_lock); fmi = fstrans_mount_get(mp); KASSERT(fmi != NULL); fmi->fmi_gone = true; SLIST_REMOVE(&fstrans_mount_hashtab[indx], fmi, fstrans_mount_info, fmi_hash); fstrans_gone_count += 1; fstrans_mount_dtor(fmi); mutex_exit(&fstrans_lock); } /* * Clear mount entries whose mount is gone. */ static void fstrans_clear_lwp_info(void) { struct fstrans_lwp_info **p, *fli, *tofree = NULL; /* * Scan our list clearing entries whose mount is gone. */ mutex_enter(&fstrans_lock); for (p = &curlwp->l_fstrans; *p; ) { fli = *p; if (fli->fli_mount != NULL && fli->fli_mountinfo->fmi_gone && fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0 && fli->fli_alias_cnt == 0) { *p = (*p)->fli_succ; fstrans_mount_dtor(fli->fli_mountinfo); if (fli->fli_alias) { KASSERT(fli->fli_alias->fli_alias_cnt > 0); fli->fli_alias->fli_alias_cnt--; } fli->fli_mount = NULL; fli->fli_alias = NULL; fli->fli_mountinfo = NULL; fli->fli_self = NULL; p = &curlwp->l_fstrans; fli->fli_succ = tofree; tofree = fli; } else { p = &(*p)->fli_succ; } } #ifdef DIAGNOSTIC for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) if (fli->fli_alias != NULL) KASSERT(fli->fli_alias->fli_self == curlwp); #endif /* DIAGNOSTIC */ mutex_exit(&fstrans_lock); while (tofree != NULL) { fli = tofree; tofree = fli->fli_succ; pool_cache_put(fstrans_lwp_cache, fli); } } /* * Allocate and return per lwp info for this mount. */ static struct fstrans_lwp_info * fstrans_alloc_lwp_info(struct mount *mp) { struct fstrans_lwp_info *fli, *fli_lower; struct fstrans_mount_info *fmi; for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) { if (fli->fli_mount == mp) return fli; } /* * Lookup mount info and get lower mount per lwp info. */ mutex_enter(&fstrans_lock); fmi = fstrans_mount_get(mp); if (fmi == NULL) { mutex_exit(&fstrans_lock); return NULL; } fmi->fmi_ref_cnt += 1; mutex_exit(&fstrans_lock); if (fmi->fmi_lower_info) { fli_lower = fstrans_alloc_lwp_info(fmi->fmi_lower_info->fmi_mount); if (fli_lower == NULL) { mutex_enter(&fstrans_lock); fstrans_mount_dtor(fmi); mutex_exit(&fstrans_lock); return NULL; } } else { fli_lower = NULL; } /* * Allocate a new entry. */ fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK); KASSERT(fli->fli_trans_cnt == 0); KASSERT(fli->fli_cow_cnt == 0); KASSERT(fli->fli_alias_cnt == 0); KASSERT(fli->fli_mount == NULL); KASSERT(fli->fli_alias == NULL); KASSERT(fli->fli_mountinfo == NULL); KASSERT(fli->fli_self == NULL); /* * Attach the mount info and alias. */ fli->fli_self = curlwp; fli->fli_mount = mp; fli->fli_mountinfo = fmi; fli->fli_succ = curlwp->l_fstrans; curlwp->l_fstrans = fli; if (fli_lower) { fli->fli_alias = fli_lower; fli->fli_alias->fli_alias_cnt++; fli = fli->fli_alias; } return fli; } /* * Retrieve the per lwp info for this mount allocating if necessary. 
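 * For layered mounts the per lwp info of the lower/lowest mount is
 * returned through the fli_alias link set up by fstrans_alloc_lwp_info().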
*/ static inline struct fstrans_lwp_info * fstrans_get_lwp_info(struct mount *mp, bool do_alloc) { struct fstrans_lwp_info *fli; /* * Scan our list for a match. */ for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) { if (fli->fli_mount == mp) { KASSERT(mp->mnt_lower == NULL || fli->fli_alias != NULL); if (fli->fli_alias != NULL) fli = fli->fli_alias; break; } } if (do_alloc) { if (__predict_false(fli == NULL)) fli = fstrans_alloc_lwp_info(mp); } return fli; } /* * Check if this lock type is granted at this state. */ static bool grant_lock(const struct fstrans_mount_info *fmi, const enum fstrans_lock_type type) { if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) return true; if (fmi->fmi_owner == curlwp) return true; if (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY) return true; return false; } /* * Start a transaction. If this thread already has a transaction on this * file system increment the reference counter. */ static inline int _fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait) { int s; struct fstrans_lwp_info *fli; struct fstrans_mount_info *fmi; ASSERT_SLEEPABLE(); fli = fstrans_get_lwp_info(mp, true); if (fli == NULL) return 0; fmi = fli->fli_mountinfo; if (fli->fli_trans_cnt > 0) { fli->fli_trans_cnt += 1; return 0; } s = pserialize_read_enter(); if (__predict_true(grant_lock(fmi, lock_type))) { fli->fli_trans_cnt = 1; fli->fli_lock_type = lock_type; pserialize_read_exit(s); return 0; } pserialize_read_exit(s); if (! wait) return EBUSY; mutex_enter(&fstrans_lock); while (! grant_lock(fmi, lock_type)) cv_wait(&fstrans_state_cv, &fstrans_lock); fli->fli_trans_cnt = 1; fli->fli_lock_type = lock_type; mutex_exit(&fstrans_lock); return 0; } void fstrans_start(struct mount *mp) { int error __diagused; error = _fstrans_start(mp, FSTRANS_SHARED, 1); KASSERT(error == 0); } int fstrans_start_nowait(struct mount *mp) { return _fstrans_start(mp, FSTRANS_SHARED, 0); } void fstrans_start_lazy(struct mount *mp) { int error __diagused; error = _fstrans_start(mp, FSTRANS_LAZY, 1); KASSERT(error == 0); } /* * Finish a transaction. */ void fstrans_done(struct mount *mp) { int s; struct fstrans_lwp_info *fli; struct fstrans_mount_info *fmi; fli = fstrans_get_lwp_info(mp, false); if (fli == NULL) return; fmi = fli->fli_mountinfo; KASSERT(fli->fli_trans_cnt > 0); if (fli->fli_trans_cnt > 1) { fli->fli_trans_cnt -= 1; return; } if (__predict_false(fstrans_gone_count > 0)) fstrans_clear_lwp_info(); s = pserialize_read_enter(); if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) { fli->fli_trans_cnt = 0; pserialize_read_exit(s); return; } pserialize_read_exit(s); mutex_enter(&fstrans_lock); fli->fli_trans_cnt = 0; cv_signal(&fstrans_count_cv); mutex_exit(&fstrans_lock); } /* * Check if we hold an lock. */ int fstrans_held(struct mount *mp) { struct fstrans_lwp_info *fli; struct fstrans_mount_info *fmi; KASSERT(mp != dead_rootmount); fli = fstrans_get_lwp_info(mp, false); if (fli == NULL) return 0; fmi = fli->fli_mountinfo; return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp); } /* * Check if this thread has an exclusive lock. */ int fstrans_is_owner(struct mount *mp) { struct fstrans_lwp_info *fli; struct fstrans_mount_info *fmi; KASSERT(mp != dead_rootmount); fli = fstrans_get_lwp_info(mp, false); if (fli == NULL) return 0; fmi = fli->fli_mountinfo; return (fmi->fmi_owner == curlwp); } /* * True, if no thread is in a transaction not granted at the current state. 
*/ static bool state_change_done(const struct fstrans_mount_info *fmi) { struct fstrans_lwp_info *fli; KASSERT(mutex_owned(&fstrans_lock)); LIST_FOREACH(fli, &fstrans_fli_head, fli_list) { if (fli->fli_mountinfo != fmi) continue; if (fli->fli_trans_cnt == 0) continue; if (fli->fli_self == curlwp) continue; if (grant_lock(fmi, fli->fli_lock_type)) continue; return false; } return true; } /* * Set new file system state. */ int fstrans_setstate(struct mount *mp, enum fstrans_state new_state) { int error; enum fstrans_state old_state; struct fstrans_lwp_info *fli; struct fstrans_mount_info *fmi; KASSERT(mp != dead_rootmount); fli = fstrans_get_lwp_info(mp, true); if (fli == NULL) return ENOENT; fmi = fli->fli_mountinfo; old_state = fmi->fmi_state; if (old_state == new_state) return 0; mutex_enter(&fstrans_lock); fmi->fmi_state = new_state; pserialize_perform(fstrans_psz); /* * All threads see the new state now. * Wait for transactions invalid at this state to leave. */ error = 0; while (! state_change_done(fmi)) { error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock); if (error) { new_state = fmi->fmi_state = FSTRANS_NORMAL; break; } } if (old_state != new_state) { if (old_state == FSTRANS_NORMAL) { KASSERT(fmi->fmi_owner == NULL); fmi->fmi_owner = curlwp; } if (new_state == FSTRANS_NORMAL) { KASSERT(fmi->fmi_owner == curlwp); fmi->fmi_owner = NULL; } } cv_broadcast(&fstrans_state_cv); mutex_exit(&fstrans_lock); return error; } /* * Get current file system state. */ enum fstrans_state fstrans_getstate(struct mount *mp) { struct fstrans_lwp_info *fli; struct fstrans_mount_info *fmi; KASSERT(mp != dead_rootmount); fli = fstrans_get_lwp_info(mp, true); KASSERT(fli != NULL); fmi = fli->fli_mountinfo; return fmi->fmi_state; } /* * Request a filesystem to suspend all operations. */ int vfs_suspend(struct mount *mp, int nowait) { struct fstrans_lwp_info *fli; int error; if (mp == dead_rootmount) return EOPNOTSUPP; fli = fstrans_get_lwp_info(mp, true); if (fli == NULL) return ENOENT; if (nowait) { if (!mutex_tryenter(&vfs_suspend_lock)) return EWOULDBLOCK; } else mutex_enter(&vfs_suspend_lock); if ((error = VFS_SUSPENDCTL(fli->fli_mount, SUSPEND_SUSPEND)) != 0) { mutex_exit(&vfs_suspend_lock); return error; } if ((mp->mnt_iflag & IMNT_GONE) != 0) { vfs_resume(mp); return ENOENT; } return 0; } /* * Request a filesystem to resume all operations. */ void vfs_resume(struct mount *mp) { struct fstrans_lwp_info *fli; KASSERT(mp != dead_rootmount); fli = fstrans_get_lwp_info(mp, false); mp = fli->fli_mount; VFS_SUSPENDCTL(mp, SUSPEND_RESUME); mutex_exit(&vfs_suspend_lock); } /* * True, if no thread is running a cow handler. */ static bool cow_state_change_done(const struct fstrans_mount_info *fmi) { struct fstrans_lwp_info *fli; KASSERT(mutex_owned(&fstrans_lock)); KASSERT(fmi->fmi_cow_change); LIST_FOREACH(fli, &fstrans_fli_head, fli_list) { if (fli->fli_mount != fmi->fmi_mount) continue; if (fli->fli_cow_cnt == 0) continue; return false; } return true; } /* * Prepare for changing this mounts cow list. * Returns with fstrans_lock locked. */ static void cow_change_enter(struct fstrans_mount_info *fmi) { mutex_enter(&fstrans_lock); /* * Wait for other threads changing the list. */ while (fmi->fmi_cow_change) cv_wait(&fstrans_state_cv, &fstrans_lock); /* * Wait until all threads are aware of a state change. */ fmi->fmi_cow_change = true; pserialize_perform(fstrans_psz); while (! cow_state_change_done(fmi)) cv_wait(&fstrans_count_cv, &fstrans_lock); } /* * Done changing this mounts cow list. 
*/ static void cow_change_done(struct fstrans_mount_info *fmi) { KASSERT(mutex_owned(&fstrans_lock)); fmi->fmi_cow_change = false; pserialize_perform(fstrans_psz); cv_broadcast(&fstrans_state_cv); mutex_exit(&fstrans_lock); } /* * Add a handler to this mount. */ int fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool), void *arg) { struct fstrans_mount_info *fmi; struct fscow_handler *newch; KASSERT(mp != dead_rootmount); mutex_enter(&fstrans_lock); fmi = fstrans_mount_get(mp); KASSERT(fmi != NULL); fmi->fmi_ref_cnt += 1; mutex_exit(&fstrans_lock); newch = kmem_alloc(sizeof(*newch), KM_SLEEP); newch->ch_func = func; newch->ch_arg = arg; cow_change_enter(fmi); LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list); cow_change_done(fmi); return 0; } /* * Remove a handler from this mount. */ int fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool), void *arg) { struct fstrans_mount_info *fmi; struct fscow_handler *hp = NULL; KASSERT(mp != dead_rootmount); mutex_enter(&fstrans_lock); fmi = fstrans_mount_get(mp); KASSERT(fmi != NULL); mutex_exit(&fstrans_lock); cow_change_enter(fmi); LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list) if (hp->ch_func == func && hp->ch_arg == arg) break; if (hp != NULL) { LIST_REMOVE(hp, ch_list); kmem_free(hp, sizeof(*hp)); } fstrans_mount_dtor(fmi); cow_change_done(fmi); return hp ? 0 : EINVAL; } /* * Check for need to copy block that is about to be written. */ int fscow_run(struct buf *bp, bool data_valid) { int error, s; struct mount *mp; struct fstrans_lwp_info *fli; struct fstrans_mount_info *fmi; struct fscow_handler *hp; /* * First check if we need run the copy-on-write handler. */ if ((bp->b_flags & B_COWDONE)) return 0; if (bp->b_vp == NULL) { bp->b_flags |= B_COWDONE; return 0; } if (bp->b_vp->v_type == VBLK) mp = spec_node_getmountedfs(bp->b_vp); else mp = bp->b_vp->v_mount; if (mp == NULL || mp == dead_rootmount) { bp->b_flags |= B_COWDONE; return 0; } fli = fstrans_get_lwp_info(mp, true); KASSERT(fli != NULL); fmi = fli->fli_mountinfo; /* * On non-recursed run check if other threads * want to change the list. */ if (fli->fli_cow_cnt == 0) { s = pserialize_read_enter(); if (__predict_false(fmi->fmi_cow_change)) { pserialize_read_exit(s); mutex_enter(&fstrans_lock); while (fmi->fmi_cow_change) cv_wait(&fstrans_state_cv, &fstrans_lock); fli->fli_cow_cnt = 1; mutex_exit(&fstrans_lock); } else { fli->fli_cow_cnt = 1; pserialize_read_exit(s); } } else fli->fli_cow_cnt += 1; /* * Run all copy-on-write handlers, stop on error. */ error = 0; LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list) if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0) break; if (error == 0) bp->b_flags |= B_COWDONE; /* * Check if other threads want to change the list. */ if (fli->fli_cow_cnt > 1) { fli->fli_cow_cnt -= 1; } else { s = pserialize_read_enter(); if (__predict_false(fmi->fmi_cow_change)) { pserialize_read_exit(s); mutex_enter(&fstrans_lock); fli->fli_cow_cnt = 0; cv_signal(&fstrans_count_cv); mutex_exit(&fstrans_lock); } else { fli->fli_cow_cnt = 0; pserialize_read_exit(s); } } return error; } #if defined(DDB) void fstrans_dump(int); static void fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose) { char prefix[9]; struct fstrans_lwp_info *fli; snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid); LIST_FOREACH(fli, &fstrans_fli_head, fli_list) { if (fli->fli_self != l) continue; if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) { if (! 
verbose) continue; } printf("%-8s", prefix); if (verbose) printf(" @%p", fli); if (fli->fli_mount == dead_rootmount) printf(" <dead>"); else if (fli->fli_mount != NULL) printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname); else printf(" NULL"); if (fli->fli_alias != NULL) { struct mount *amp = fli->fli_alias->fli_mount; printf(" alias"); if (verbose) printf(" @%p", fli->fli_alias); if (amp == NULL) printf(" NULL"); else printf(" (%s)", amp->mnt_stat.f_mntonname); } if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone) printf(" gone"); if (fli->fli_trans_cnt == 0) { printf(" -"); } else { switch (fli->fli_lock_type) { case FSTRANS_LAZY: printf(" lazy"); break; case FSTRANS_SHARED: printf(" shared"); break; default: printf(" %#x", fli->fli_lock_type); break; } } printf(" %d cow %d alias %d\n", fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt); prefix[0] = '\0'; } } static void fstrans_print_mount(struct mount *mp, int verbose) { uint32_t indx; struct fstrans_mount_info *fmi; indx = fstrans_mount_hash(mp); SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash) if (fmi->fmi_mount == mp) break; if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL)) return; printf("%-16s ", mp->mnt_stat.f_mntonname); if (fmi == NULL) { printf("(null)\n"); return; } printf("owner %p ", fmi->fmi_owner); switch (fmi->fmi_state) { case FSTRANS_NORMAL: printf("state normal\n"); break; case FSTRANS_SUSPENDING: printf("state suspending\n"); break; case FSTRANS_SUSPENDED: printf("state suspended\n"); break; default: printf("state %#x\n", fmi->fmi_state); break; } } void fstrans_dump(int full) { const struct proclist_desc *pd; struct proc *p; struct lwp *l; struct mount *mp; printf("Fstrans locks by lwp:\n"); for (pd = proclists; pd->pd_list != NULL; pd++) PROCLIST_FOREACH(p, pd->pd_list) LIST_FOREACH(l, &p->p_lwps, l_sibling) fstrans_print_lwp(p, l, full == 1); printf("Fstrans state by mount:\n"); for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) fstrans_print_mount(mp, full == 1); } #endif /* defined(DDB) */
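
/*
 * Illustrative usage sketch (hypothetical caller, not part of the API
 * above): a file system operation brackets itself with a shared
 * transaction so that a concurrent vfs_suspend() can drain it first:
 *
 *	fstrans_start(mp);
 *	... perform the operation on a file system of mp ...
 *	fstrans_done(mp);
 *
 * Callers that must not sleep use fstrans_start_nowait(mp) and back
 * off on EBUSY; fstrans_start_lazy() takes a transaction that is
 * still granted while the file system is only suspending.
 */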
/* $NetBSD: kern_core.c,v 1.39 2023/10/04 22:17:09 ad Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)kern_sig.c 8.14 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_core.c,v 1.39 2023/10/04 22:17:09 ad Exp $"); #ifdef _KERNEL_OPT #include "opt_execfmt.h" #include "opt_compat_netbsd32.h" #endif #include <sys/param.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/acct.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/proc.h> #include <sys/exec.h> #include <sys/filedesc.h> #include <sys/kauth.h> #include <sys/module.h> #include <sys/compat_stub.h> #include <sys/exec_elf.h> #include <sys/resourcevar.h> MODULE(MODULE_CLASS_MISC, coredump, NULL); struct coredump_iostate { struct lwp *io_lwp; struct vnode *io_vp; kauth_cred_t io_cred; off_t io_offset; }; static int coredump(struct lwp *, const char *); static int coredump_buildname(struct proc *, char *, const char *, size_t); static int coredump_write(struct coredump_iostate *, enum uio_seg segflg, const void *, size_t); static off_t coredump_offset(struct coredump_iostate *); static int coredump_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: MODULE_HOOK_SET(coredump_hook, coredump); MODULE_HOOK_SET(coredump_write_hook, coredump_write); MODULE_HOOK_SET(coredump_offset_hook, coredump_offset); MODULE_HOOK_SET(coredump_netbsd_hook, real_coredump_netbsd); #if defined(EXEC_ELF64) MODULE_HOOK_SET(coredump_elf64_hook, real_coredump_elf64); #elif defined(EXEC_ELF32) MODULE_HOOK_SET(coredump_elf32_hook, real_coredump_elf32); #endif MODULE_HOOK_SET(uvm_coredump_walkmap_hook, uvm_coredump_walkmap); MODULE_HOOK_SET(uvm_coredump_count_segs_hook, uvm_coredump_count_segs); return 0; case MODULE_CMD_FINI: MODULE_HOOK_UNSET(uvm_coredump_count_segs_hook); MODULE_HOOK_UNSET(uvm_coredump_walkmap_hook); #if defined(EXEC_ELF64) MODULE_HOOK_UNSET(coredump_elf64_hook); #elif defined(EXEC_ELF32) MODULE_HOOK_UNSET(coredump_elf32_hook); #endif MODULE_HOOK_UNSET(coredump_netbsd_hook); MODULE_HOOK_UNSET(coredump_offset_hook); MODULE_HOOK_UNSET(coredump_write_hook); MODULE_HOOK_UNSET(coredump_hook); return 0; default: return ENOTTY; } } /* * Dump core, into a file named "progname.core" or "core" (depending on the * value of shortcorename), unless the process was setuid/setgid. */ static int coredump(struct lwp *l, const char *pattern) { struct vnode *vp; struct proc *p; struct vmspace *vm; kauth_cred_t cred = NULL; struct pathbuf *pb; struct vattr vattr; struct coredump_iostate io; struct plimit *lim; int error, error1; char *name, *lastslash = NULL /* XXXgcc */; name = PNBUF_GET(); p = l->l_proc; vm = p->p_vmspace; mutex_enter(&proc_lock); /* p_session */ mutex_enter(p->p_lock); /* * Refuse to core if the data + stack + user size is larger than * the core dump limit. XXX THIS IS WRONG, because of mapped * data. */ if (USPACE + ctob(vm->vm_dsize + vm->vm_ssize) >= p->p_rlimit[RLIMIT_CORE].rlim_cur) { error = EFBIG; /* better error code? */ goto release; } /* * It may well not be curproc, so grab a reference to its current * credentials. */ cred = kauth_cred_hold(p->p_cred); /* * Make sure the process has not set-id, to prevent data leaks, * unless it was specifically requested to allow set-id coredumps. */ if (p->p_flag & PK_SUGID) { if (!security_setidcore_dump) { error = EPERM; goto release; } pattern = security_setidcore_path; } /* Lock, as p_limit and pl_corename might change. 
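 * pl_corename supplies the name pattern when the caller passed none.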
*/ lim = p->p_limit; mutex_enter(&lim->pl_lock); if (pattern == NULL) { pattern = lim->pl_corename; } error = coredump_buildname(p, name, pattern, MAXPATHLEN); mutex_exit(&lim->pl_lock); if (error) goto release; /* * On a simple filename, see if the filesystem allow us to write * core dumps there. */ lastslash = strrchr(name, '/'); if (!lastslash) { vp = p->p_cwdi->cwdi_cdir; if (vp->v_mount == NULL || (vp->v_mount->mnt_flag & MNT_NOCOREDUMP) != 0) error = EPERM; } release: mutex_exit(p->p_lock); mutex_exit(&proc_lock); if (error) goto done; /* * On a complex filename, see if the filesystem allow us to write * core dumps there. * * XXX: We should have an API that avoids double lookups */ if (lastslash) { char c[2]; if (lastslash - name >= MAXPATHLEN - 2) { error = EPERM; goto done; } c[0] = lastslash[1]; c[1] = lastslash[2]; lastslash[1] = '.'; lastslash[2] = '\0'; error = namei_simple_kernel(name, NSM_FOLLOW_NOEMULROOT, &vp); if (error) goto done; if (vp->v_mount == NULL || (vp->v_mount->mnt_flag & MNT_NOCOREDUMP) != 0) error = EPERM; vrele(vp); if (error) goto done; lastslash[1] = c[0]; lastslash[2] = c[1]; } pb = pathbuf_create(name); if (pb == NULL) { error = ENOMEM; goto done; } error = vn_open(NULL, pb, 0, O_CREAT | O_NOFOLLOW | FWRITE, S_IRUSR | S_IWUSR, &vp, NULL, NULL); if (error != 0) { pathbuf_destroy(pb); goto done; } pathbuf_destroy(pb); /* * Don't dump to: * - non-regular files * - files with links * - files we don't own */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) || vattr.va_nlink != 1 || vattr.va_uid != kauth_cred_geteuid(cred)) { error = EACCES; goto out; } vattr_null(&vattr); vattr.va_size = 0; if ((p->p_flag & PK_SUGID) && security_setidcore_dump) { vattr.va_uid = security_setidcore_owner; vattr.va_gid = security_setidcore_group; vattr.va_mode = security_setidcore_mode; } VOP_SETATTR(vp, &vattr, cred); p->p_acflag |= ACORE; io.io_lwp = l; io.io_vp = vp; io.io_cred = cred; io.io_offset = 0; /* Now dump the actual core file. */ error = (*p->p_execsw->es_coredump)(l, &io); out: VOP_UNLOCK(vp); error1 = vn_close(vp, FWRITE, cred); if (error == 0) error = error1; done: if (cred != NULL) kauth_cred_free(cred); if (name != NULL) PNBUF_PUT(name); return error; } static int coredump_buildname(struct proc *p, char *dst, const char *src, size_t len) { const char *s; char *d, *end; int i; KASSERT(mutex_owned(&proc_lock)); for (s = src, d = dst, end = d + len; *s != '\0'; s++) { if (*s == '%') { switch (*(s + 1)) { case 'n': i = snprintf(d, end - d, "%s", p->p_comm); break; case 'p': i = snprintf(d, end - d, "%d", p->p_pid); break; case 'u': i = snprintf(d, end - d, "%.*s", (int)sizeof p->p_pgrp->pg_session->s_login, p->p_pgrp->pg_session->s_login); break; case 't': i = snprintf(d, end - d, "%lld", (long long)p->p_stats->p_start.tv_sec); break; default: goto copy; } d += i; s++; } else { copy: *d = *s; d++; } if (d >= end) return (ENAMETOOLONG); } *d = '\0'; return 0; } static int coredump_write(struct coredump_iostate *io, enum uio_seg segflg, const void *data, size_t len) { int error; error = vn_rdwr(UIO_WRITE, io->io_vp, __UNCONST(data), len, io->io_offset, segflg, IO_NODELOCKED|IO_UNIT, io->io_cred, NULL, segflg == UIO_USERSPACE ? io->io_lwp : NULL); if (error) { printf("pid %d (%s): %s write of %zu@%p at %lld failed: %d\n", io->io_lwp->l_proc->p_pid, io->io_lwp->l_proc->p_comm, segflg == UIO_USERSPACE ? 
"user" : "system", len, data, (long long) io->io_offset, error); return (error); } io->io_offset += len; return (0); } static off_t coredump_offset(struct coredump_iostate *io) { return io->io_offset; }
/* $NetBSD: vfs_bio.c,v 1.303 2022/03/30 14:54:29 riastradh Exp $ */ /*- * Copyright
(c) 2007, 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran, and by Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 */ /*- * Copyright (c) 1994 Christopher G. 
Demetriou * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 */ /* * The buffer cache subsystem. * * Some references: * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986) * Leffler, et al.: The Design and Implementation of the 4.3BSD * UNIX Operating System (Addison Welley, 1989) * * Locking * * There are three locks: * - bufcache_lock: protects global buffer cache state. * - BC_BUSY: a long term per-buffer lock. * - buf_t::b_objlock: lock on completion (biowait vs biodone). * * For buffers associated with vnodes (a most common case) b_objlock points * to the vnode_t::v_interlock. Otherwise, it points to generic buffer_lock. 
* * Lock order: * bufcache_lock -> * buf_t::b_objlock */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.303 2022/03/30 14:54:29 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_bufcache.h" #include "opt_dtrace.h" #include "opt_biohist.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/buf.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/resourcevar.h> #include <sys/sysctl.h> #include <sys/conf.h> #include <sys/kauth.h> #include <sys/fstrans.h> #include <sys/intr.h> #include <sys/cpu.h> #include <sys/wapbl.h> #include <sys/bitops.h> #include <sys/cprng.h> #include <sys/sdt.h> #include <uvm/uvm.h> /* extern struct uvm uvm */ #include <miscfs/specfs/specdev.h> SDT_PROVIDER_DEFINE(io); SDT_PROBE_DEFINE4(io, kernel, , bbusy__start, "struct buf *"/*bp*/, "bool"/*intr*/, "int"/*timo*/, "kmutex_t *"/*interlock*/); SDT_PROBE_DEFINE5(io, kernel, , bbusy__done, "struct buf *"/*bp*/, "bool"/*intr*/, "int"/*timo*/, "kmutex_t *"/*interlock*/, "int"/*error*/); SDT_PROBE_DEFINE0(io, kernel, , getnewbuf__start); SDT_PROBE_DEFINE1(io, kernel, , getnewbuf__done, "struct buf *"/*bp*/); SDT_PROBE_DEFINE3(io, kernel, , getblk__start, "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/); SDT_PROBE_DEFINE4(io, kernel, , getblk__done, "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/, "struct buf *"/*bp*/); SDT_PROBE_DEFINE2(io, kernel, , brelse, "struct buf *"/*bp*/, "int"/*set*/); SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf *"/*bp*/); SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf *"/*bp*/); #ifndef BUFPAGES # define BUFPAGES 0 #endif #ifdef BUFCACHE # if (BUFCACHE < 5) || (BUFCACHE > 95) # error BUFCACHE is not between 5 and 95 # endif #else # define BUFCACHE 15 #endif u_int nbuf; /* desired number of buffer headers */ u_int bufpages = BUFPAGES; /* optional hardwired count */ u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */ /* * Definitions for the buffer free lists. */ #define BQUEUES 3 /* number of free buffer queues */ #define BQ_LOCKED 0 /* super-blocks &c */ #define BQ_LRU 1 /* lru, useful buffers */ #define BQ_AGE 2 /* rubbish */ struct bqueue { TAILQ_HEAD(, buf) bq_queue; uint64_t bq_bytes; buf_t *bq_marker; }; static struct bqueue bufqueues[BQUEUES] __cacheline_aligned; /* Function prototypes */ static void buf_setwm(void); static int buf_trim(void); static void *bufpool_page_alloc(struct pool *, int); static void bufpool_page_free(struct pool *, void *); static buf_t *bio_doread(struct vnode *, daddr_t, int, int); static buf_t *getnewbuf(int, int, int); static int buf_lotsfree(void); static int buf_canrelease(void); static u_long buf_mempoolidx(u_long); static u_long buf_roundsize(u_long); static void *buf_alloc(size_t); static void buf_mrelease(void *, size_t); static void binsheadfree(buf_t *, struct bqueue *); static void binstailfree(buf_t *, struct bqueue *); #ifdef DEBUG static int checkfreelist(buf_t *, struct bqueue *, int); #endif static void biointr(void *); static void biodone2(buf_t *); static void sysctl_kern_buf_setup(void); static void sysctl_vm_buf_setup(void); /* Initialization for biohist */ #include <sys/biohist.h> BIOHIST_DEFINE(biohist); void biohist_init(void) { BIOHIST_INIT(biohist, BIOHIST_SIZE); } /* * Definitions for the buffer hash lists. 
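 * A buffer is hashed on its (vnode, logical block number) pair:
 * BUFHASH() below adds the shifted vnode pointer to the block number
 * and masks the sum with bufhash.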
*/ #define BUFHASH(dvp, lbn) \ (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash]) LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; u_long bufhash; static int bufhash_stats(struct hashstat_sysctl *, bool); static kcondvar_t needbuffer_cv; /* * Buffer queue lock. */ kmutex_t bufcache_lock __cacheline_aligned; kmutex_t buffer_lock __cacheline_aligned; /* Software ISR for completed transfers. */ static void *biodone_sih; /* Buffer pool for I/O buffers. */ static pool_cache_t buf_cache; static pool_cache_t bufio_cache; #define MEMPOOL_INDEX_OFFSET (ilog2(DEV_BSIZE)) /* smallest pool is 512 bytes */ #define NMEMPOOLS (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1) __CTASSERT((1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) == MAXBSIZE); /* Buffer memory pools */ static struct pool bmempools[NMEMPOOLS]; static struct vm_map *buf_map; /* * Buffer memory pool allocator. */ static void * bufpool_page_alloc(struct pool *pp, int flags) { return (void *)uvm_km_alloc(buf_map, MAXBSIZE, MAXBSIZE, ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT|UVM_KMF_TRYLOCK) | UVM_KMF_WIRED); } static void bufpool_page_free(struct pool *pp, void *v) { uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED); } static struct pool_allocator bufmempool_allocator = { .pa_alloc = bufpool_page_alloc, .pa_free = bufpool_page_free, .pa_pagesz = MAXBSIZE, }; /* Buffer memory management variables */ u_long bufmem_valimit; u_long bufmem_hiwater; u_long bufmem_lowater; u_long bufmem; /* * MD code can call this to set a hard limit on the amount * of virtual memory used by the buffer cache. */ int buf_setvalimit(vsize_t sz) { /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */ if (sz < NMEMPOOLS * MAXBSIZE) return EINVAL; bufmem_valimit = sz; return 0; } static void buf_setwm(void) { bufmem_hiwater = buf_memcalc(); /* lowater is approx. 2% of memory (with bufcache = 15) */ #define BUFMEM_WMSHIFT 3 #define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT) if (bufmem_hiwater < BUFMEM_HIWMMIN) /* Ensure a reasonable minimum value */ bufmem_hiwater = BUFMEM_HIWMMIN; bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT; } #ifdef DEBUG int debug_verify_freelist = 0; static int checkfreelist(buf_t *bp, struct bqueue *dp, int ison) { buf_t *b; if (!debug_verify_freelist) return 1; TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) { if (b == bp) return ison ? 1 : 0; } return ison ? 0 : 1; } #endif /* * Insq/Remq for the buffer hash lists. * Call with buffer queue locked. */ static void binsheadfree(buf_t *bp, struct bqueue *dp) { KASSERT(mutex_owned(&bufcache_lock)); KASSERT(bp->b_freelistindex == -1); TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist); dp->bq_bytes += bp->b_bufsize; bp->b_freelistindex = dp - bufqueues; } static void binstailfree(buf_t *bp, struct bqueue *dp) { KASSERT(mutex_owned(&bufcache_lock)); KASSERTMSG(bp->b_freelistindex == -1, "double free of buffer? " "bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex); TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist); dp->bq_bytes += bp->b_bufsize; bp->b_freelistindex = dp - bufqueues; } void bremfree(buf_t *bp) { struct bqueue *dp; int bqidx = bp->b_freelistindex; KASSERT(mutex_owned(&bufcache_lock)); KASSERT(bqidx != -1); dp = &bufqueues[bqidx]; KDASSERT(checkfreelist(bp, dp, 1)); KASSERT(dp->bq_bytes >= bp->b_bufsize); TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist); dp->bq_bytes -= bp->b_bufsize; /* For the sysctl helper. 
*/ if (bp == dp->bq_marker) dp->bq_marker = NULL; #if defined(DIAGNOSTIC) bp->b_freelistindex = -1; #endif /* defined(DIAGNOSTIC) */ } /* * note that for some ports this is used by pmap bootstrap code to * determine kva size. */ u_long buf_memcalc(void) { u_long n; vsize_t mapsz = 0; /* * Determine the upper bound of memory to use for buffers. * * - If bufpages is specified, use that as the number * pages. * * - Otherwise, use bufcache as the percentage of * physical memory. */ if (bufpages != 0) { n = bufpages; } else { if (bufcache < 5) { printf("forcing bufcache %d -> 5", bufcache); bufcache = 5; } if (bufcache > 95) { printf("forcing bufcache %d -> 95", bufcache); bufcache = 95; } if (buf_map != NULL) mapsz = vm_map_max(buf_map) - vm_map_min(buf_map); n = calc_cache_size(mapsz, bufcache, (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT) / PAGE_SIZE; } n <<= PAGE_SHIFT; if (bufmem_valimit != 0 && n > bufmem_valimit) n = bufmem_valimit; return (n); } /* * Initialize buffers and hash links for buffers. */ void bufinit(void) { struct bqueue *dp; int use_std; u_int i; biodone_vfs = biodone; mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE); mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&needbuffer_cv, "needbuf"); if (bufmem_valimit != 0) { vaddr_t minaddr = 0, maxaddr; buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, bufmem_valimit, 0, false, 0); if (buf_map == NULL) panic("bufinit: cannot allocate submap"); } else buf_map = kernel_map; /* * Initialize buffer cache memory parameters. */ bufmem = 0; buf_setwm(); /* On "small" machines use small pool page sizes where possible */ use_std = (physmem < atop(16*1024*1024)); /* * Also use them on systems that can map the pool pages using * a direct-mapped segment. */ #ifdef PMAP_MAP_POOLPAGE use_std = 1; #endif buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0, "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL); bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0, "biopl", NULL, IPL_BIO, NULL, NULL, NULL); for (i = 0; i < NMEMPOOLS; i++) { struct pool_allocator *pa; struct pool *pp = &bmempools[i]; u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET); char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */ if (__predict_false(size >= 1048576)) (void)snprintf(name, 8, "buf%um", size / 1048576); else if (__predict_true(size >= 1024)) (void)snprintf(name, 8, "buf%uk", size / 1024); else (void)snprintf(name, 8, "buf%ub", size); pa = (size <= PAGE_SIZE && use_std) ? &pool_allocator_nointr : &bufmempool_allocator; pool_init(pp, size, DEV_BSIZE, 0, 0, name, pa, IPL_NONE); pool_setlowat(pp, 1); pool_sethiwat(pp, 1); } /* Initialize the buffer queues */ for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) { TAILQ_INIT(&dp->bq_queue); dp->bq_bytes = 0; } /* * Estimate hash table size based on the amount of memory we * intend to use for the buffer cache. The average buffer * size is dependent on our clients (i.e. filesystems). * * For now, use an empirical 3K per buffer. */ nbuf = (bufmem_hiwater / 1024) / 3; bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash); sysctl_kern_buf_setup(); sysctl_vm_buf_setup(); hashstat_register("bufhash", bufhash_stats); } void bufinit2(void) { biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr, NULL); if (biodone_sih == NULL) panic("bufinit2: can't establish soft interrupt"); } static int buf_lotsfree(void) { u_long guess; /* Always allocate if less than the low water mark. */ if (bufmem < bufmem_lowater) return 1; /* Never allocate if greater than the high water mark. 
*/ if (bufmem > bufmem_hiwater) return 0; /* If there's anything on the AGE list, it should be eaten. */ if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL) return 0; /* * The probabily of getting a new allocation is inversely * proportional to the current size of the cache above * the low water mark. Divide the total first to avoid overflows * in the product. */ guess = cprng_fast32() % 16; if ((bufmem_hiwater - bufmem_lowater) / 16 * guess >= (bufmem - bufmem_lowater)) return 1; /* Otherwise don't allocate. */ return 0; } /* * Return estimate of bytes we think need to be * released to help resolve low memory conditions. * * => called with bufcache_lock held. */ static int buf_canrelease(void) { int pagedemand, ninvalid = 0; KASSERT(mutex_owned(&bufcache_lock)); if (bufmem < bufmem_lowater) return 0; if (bufmem > bufmem_hiwater) return bufmem - bufmem_hiwater; ninvalid += bufqueues[BQ_AGE].bq_bytes; pagedemand = uvmexp.freetarg - uvm_availmem(false); if (pagedemand < 0) return ninvalid; return MAX(ninvalid, MIN(2 * MAXBSIZE, MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE))); } /* * Buffer memory allocation helper functions */ static u_long buf_mempoolidx(u_long size) { u_int n = 0; size -= 1; size >>= MEMPOOL_INDEX_OFFSET; while (size) { size >>= 1; n += 1; } if (n >= NMEMPOOLS) panic("buf mem pool index %d", n); return n; } static u_long buf_roundsize(u_long size) { /* Round up to nearest power of 2 */ return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET)); } static void * buf_alloc(size_t size) { u_int n = buf_mempoolidx(size); void *addr; while (1) { addr = pool_get(&bmempools[n], PR_NOWAIT); if (addr != NULL) break; /* No memory, see if we can free some. If so, try again */ mutex_enter(&bufcache_lock); if (buf_drain(1) > 0) { mutex_exit(&bufcache_lock); continue; } if (curlwp == uvm.pagedaemon_lwp) { mutex_exit(&bufcache_lock); return NULL; } /* Wait for buffers to arrive on the LRU queue */ cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4); mutex_exit(&bufcache_lock); } return addr; } static void buf_mrelease(void *addr, size_t size) { pool_put(&bmempools[buf_mempoolidx(size)], addr); } /* * bread()/breadn() helper. */ static buf_t * bio_doread(struct vnode *vp, daddr_t blkno, int size, int async) { buf_t *bp; struct mount *mp; bp = getblk(vp, blkno, size, 0, 0); /* * getblk() may return NULL if we are the pagedaemon. */ if (bp == NULL) { KASSERT(curlwp == uvm.pagedaemon_lwp); return NULL; } /* * If buffer does not have data valid, start a read. * Note that if buffer is BC_INVAL, getblk() won't return it. * Therefore, it's valid if its I/O has completed or been delayed. */ if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) { /* Start I/O for the buffer. */ SET(bp->b_flags, B_READ | async); if (async) BIO_SETPRIO(bp, BPRIO_TIMELIMITED); else BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); VOP_STRATEGY(vp, bp); /* Pay for the read. */ curlwp->l_ru.ru_inblock++; } else if (async) brelse(bp, 0); if (vp->v_type == VBLK) mp = spec_node_getmountedfs(vp); else mp = vp->v_mount; /* * Collect statistics on synchronous and asynchronous reads. * Reads from block devices are charged to their associated * filesystem (if any). */ if (mp != NULL) { if (async == 0) mp->mnt_stat.f_syncreads++; else mp->mnt_stat.f_asyncreads++; } return (bp); } /* * Read a disk block. * This algorithm described in Bach (p.54). */ int bread(struct vnode *vp, daddr_t blkno, int size, int flags, buf_t **bpp) { buf_t *bp; int error; BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); /* Get buffer for block. 
*/ bp = *bpp = bio_doread(vp, blkno, size, 0); if (bp == NULL) return ENOMEM; /* Wait for the read to complete, and return result. */ error = biowait(bp); if (error == 0 && (flags & B_MODIFY) != 0) error = fscow_run(bp, true); if (error) { brelse(bp, 0); *bpp = NULL; } return error; } /* * Read-ahead multiple disk blocks. The first is sync, the rest async. * Trivial modification to the breada algorithm presented in Bach (p.55). */ int breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, int *rasizes, int nrablks, int flags, buf_t **bpp) { buf_t *bp; int error, i; BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); bp = *bpp = bio_doread(vp, blkno, size, 0); if (bp == NULL) return ENOMEM; /* * For each of the read-ahead blocks, start a read, if necessary. */ mutex_enter(&bufcache_lock); for (i = 0; i < nrablks; i++) { /* If it's in the cache, just go on to next one. */ if (incore(vp, rablks[i])) continue; /* Get a buffer for the read-ahead block */ mutex_exit(&bufcache_lock); (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC); mutex_enter(&bufcache_lock); } mutex_exit(&bufcache_lock); /* Otherwise, we had to start a read for it; wait until it's valid. */ error = biowait(bp); if (error == 0 && (flags & B_MODIFY) != 0) error = fscow_run(bp, true); if (error) { brelse(bp, 0); *bpp = NULL; } return error; } /* * Block write. Described in Bach (p.56) */ int bwrite(buf_t *bp) { int rv, sync, wasdelayed; struct vnode *vp; struct mount *mp; BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); KASSERT(ISSET(bp->b_cflags, BC_BUSY)); KASSERT(!cv_has_waiters(&bp->b_done)); vp = bp->b_vp; /* * dholland 20160728 AFAICT vp==NULL must be impossible as it * will crash upon reaching VOP_STRATEGY below... see further * analysis on tech-kern. */ KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode"); if (vp != NULL) { KASSERT(bp->b_objlock == vp->v_interlock); if (vp->v_type == VBLK) mp = spec_node_getmountedfs(vp); else mp = vp->v_mount; } else { mp = NULL; } if (mp && mp->mnt_wapbl) { if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { bdwrite(bp); return 0; } } /* * Remember buffer type, to switch on it later. If the write was * synchronous, but the file system was mounted with MNT_ASYNC, * convert it to a delayed write. * XXX note that this relies on delayed tape writes being converted * to async, not sync writes (which is safe, but ugly). */ sync = !ISSET(bp->b_flags, B_ASYNC); if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) { bdwrite(bp); return (0); } /* * Collect statistics on synchronous and asynchronous writes. * Writes to block devices are charged to their associated * filesystem (if any). */ if (mp != NULL) { if (sync) mp->mnt_stat.f_syncwrites++; else mp->mnt_stat.f_asyncwrites++; } /* * Pay for the I/O operation and make sure the buf is on the correct * vnode queue. */ bp->b_error = 0; wasdelayed = ISSET(bp->b_oflags, BO_DELWRI); CLR(bp->b_flags, B_READ); if (wasdelayed) { mutex_enter(&bufcache_lock); mutex_enter(bp->b_objlock); CLR(bp->b_oflags, BO_DONE | BO_DELWRI); reassignbuf(bp, bp->b_vp); /* Wake anyone trying to busy the buffer via vnode's lists. */ cv_broadcast(&bp->b_busy); mutex_exit(&bufcache_lock); } else { curlwp->l_ru.ru_oublock++; mutex_enter(bp->b_objlock); CLR(bp->b_oflags, BO_DONE | BO_DELWRI); } if (vp != NULL) vp->v_numoutput++; mutex_exit(bp->b_objlock); /* Initiate disk write. 
*/ if (sync) BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); else BIO_SETPRIO(bp, BPRIO_TIMELIMITED); VOP_STRATEGY(vp, bp); if (sync) { /* If I/O was synchronous, wait for it to complete. */ rv = biowait(bp); /* Release the buffer. */ brelse(bp, 0); return (rv); } else { return (0); } } int vn_bwrite(void *v) { struct vop_bwrite_args *ap = v; return (bwrite(ap->a_bp)); } /* * Delayed write. * * The buffer is marked dirty, but is not queued for I/O. * This routine should be used when the buffer is expected * to be modified again soon, typically a small write that * partially fills a buffer. * * NB: magnetic tapes cannot be delayed; they must be * written in the order that the writes are requested. * * Described in Leffler, et al. (pp. 208-213). */ void bdwrite(buf_t *bp) { BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS || bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE)); KASSERT(ISSET(bp->b_cflags, BC_BUSY)); KASSERT(!cv_has_waiters(&bp->b_done)); /* If this is a tape block, write the block now. */ if (bdev_type(bp->b_dev) == D_TAPE) { bawrite(bp); return; } if (wapbl_vphaswapbl(bp->b_vp)) { struct mount *mp = wapbl_vptomp(bp->b_vp); if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { WAPBL_ADD_BUF(mp, bp); } } /* * If the block hasn't been seen before: * (1) Mark it as having been seen, * (2) Charge for the write, * (3) Make sure it's on its vnode's correct block list. */ KASSERT(bp->b_vp == NULL || bp->b_objlock == bp->b_vp->v_interlock); if (!ISSET(bp->b_oflags, BO_DELWRI)) { mutex_enter(&bufcache_lock); mutex_enter(bp->b_objlock); SET(bp->b_oflags, BO_DELWRI); curlwp->l_ru.ru_oublock++; reassignbuf(bp, bp->b_vp); /* Wake anyone trying to busy the buffer via vnode's lists. */ cv_broadcast(&bp->b_busy); mutex_exit(&bufcache_lock); } else { mutex_enter(bp->b_objlock); } /* Otherwise, the "write" is done, so mark and release the buffer. */ CLR(bp->b_oflags, BO_DONE); mutex_exit(bp->b_objlock); brelse(bp, 0); } /* * Asynchronous block write; just an asynchronous bwrite(). */ void bawrite(buf_t *bp) { KASSERT(ISSET(bp->b_cflags, BC_BUSY)); KASSERT(bp->b_vp != NULL); SET(bp->b_flags, B_ASYNC); VOP_BWRITE(bp->b_vp, bp); } /* * Release a buffer on to the free lists. * Described in Bach (p. 46). */ void brelsel(buf_t *bp, int set) { struct bqueue *bufq; struct vnode *vp; SDT_PROBE2(io, kernel, , brelse, bp, set); KASSERT(bp != NULL); KASSERT(mutex_owned(&bufcache_lock)); KASSERT(!cv_has_waiters(&bp->b_done)); SET(bp->b_cflags, set); KASSERT(ISSET(bp->b_cflags, BC_BUSY)); KASSERT(bp->b_iodone == NULL); /* Wake up any processes waiting for any buffer to become free. */ cv_signal(&needbuffer_cv); /* Wake up any processes waiting for _this_ buffer to become free */ if (ISSET(bp->b_cflags, BC_WANTED)) CLR(bp->b_cflags, BC_WANTED|BC_AGE); /* If it's clean, clear the copy-on-write flag. */ if (ISSET(bp->b_flags, B_COWDONE)) { mutex_enter(bp->b_objlock); if (!ISSET(bp->b_oflags, BO_DELWRI)) CLR(bp->b_flags, B_COWDONE); mutex_exit(bp->b_objlock); } /* * Determine which queue the buffer should be on, then put it there. */ /* If it's locked, don't report an error; try again later. */ if (ISSET(bp->b_flags, B_LOCKED)) bp->b_error = 0; /* If it's not cacheable, or an error, mark it invalid. */ if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0) SET(bp->b_cflags, BC_INVAL); if (ISSET(bp->b_cflags, BC_VFLUSH)) { /* * This is a delayed write buffer that was just flushed to * disk. It is still on the LRU queue.
If it's become * invalid, then we need to move it to a different queue; * otherwise leave it in its current position. */ CLR(bp->b_cflags, BC_VFLUSH); if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) && !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) { KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1)); goto already_queued; } else { bremfree(bp); } } KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0)); KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0)); KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0)); if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) { /* * If it's invalid or empty, dissociate it from its vnode * and put on the head of the appropriate queue. */ if (ISSET(bp->b_flags, B_LOCKED)) { if (wapbl_vphaswapbl(vp = bp->b_vp)) { struct mount *mp = wapbl_vptomp(vp); KASSERT(bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone); WAPBL_REMOVE_BUF(mp, bp); } } mutex_enter(bp->b_objlock); CLR(bp->b_oflags, BO_DONE|BO_DELWRI); if ((vp = bp->b_vp) != NULL) { KASSERT(bp->b_objlock == vp->v_interlock); reassignbuf(bp, bp->b_vp); brelvp(bp); mutex_exit(vp->v_interlock); } else { KASSERT(bp->b_objlock == &buffer_lock); mutex_exit(bp->b_objlock); } /* We want to dispose of the buffer, so wake everybody. */ cv_broadcast(&bp->b_busy); if (bp->b_bufsize <= 0) /* no data */ goto already_queued; else /* invalid data */ bufq = &bufqueues[BQ_AGE]; binsheadfree(bp, bufq); } else { /* * It has valid data. Put it on the end of the appropriate * queue, so that it'll stick around for as long as possible. * If buf is AGE, but has dependencies, must put it on last * bufqueue to be scanned, ie LRU. This protects against the * livelock where BQ_AGE only has buffers with dependencies, * and we thus never get to the dependent buffers in BQ_LRU. */ if (ISSET(bp->b_flags, B_LOCKED)) { /* locked in core */ bufq = &bufqueues[BQ_LOCKED]; } else if (!ISSET(bp->b_cflags, BC_AGE)) { /* valid data */ bufq = &bufqueues[BQ_LRU]; } else { /* stale but valid data */ bufq = &bufqueues[BQ_AGE]; } binstailfree(bp, bufq); } already_queued: /* Unlock the buffer. */ CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE); CLR(bp->b_flags, B_ASYNC); /* * Wake only the highest priority waiter on the lock, in order to * prevent a thundering herd: many LWPs simultaneously awakening and * competing for the buffer's lock. Testing in 2019 revealed this * to reduce contention on bufcache_lock tenfold during a kernel * compile. Here and elsewhere, when the buffer is changing * identity, being disposed of, or moving from one list to another, * we wake all lock requestors. */ if (bp->b_bufsize <= 0) { cv_broadcast(&bp->b_busy); buf_destroy(bp); #ifdef DEBUG memset((char *)bp, 0, sizeof(*bp)); #endif pool_cache_put(buf_cache, bp); } else cv_signal(&bp->b_busy); } void brelse(buf_t *bp, int set) { mutex_enter(&bufcache_lock); brelsel(bp, set); mutex_exit(&bufcache_lock); } /* * Determine if a block is in the cache. * Just look on what would be its hash chain. If it's there, return * a pointer to it, unless it's marked invalid. If it's marked invalid, * we normally don't return the buffer, unless the caller explicitly * wants us to. */ buf_t * incore(struct vnode *vp, daddr_t blkno) { buf_t *bp; KASSERT(mutex_owned(&bufcache_lock)); /* Search hash chain */ LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) { if (bp->b_lblkno == blkno && bp->b_vp == vp && !ISSET(bp->b_cflags, BC_INVAL)) { KASSERT(bp->b_objlock == vp->v_interlock); return (bp); } } return (NULL); } /* * Get a block of requested size that is associated with * a given vnode and block offset. 
If it is found in the * block cache, mark it as having been found, make it busy * and return it. Otherwise, return an empty block of the * correct size. It is up to the caller to insure that the * cached blocks be of the correct size. */ buf_t * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo) { int err, preserve; buf_t *bp; mutex_enter(&bufcache_lock); SDT_PROBE3(io, kernel, , getblk__start, vp, blkno, size); loop: bp = incore(vp, blkno); if (bp != NULL) { err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL); if (err != 0) { if (err == EPASSTHROUGH) goto loop; mutex_exit(&bufcache_lock); SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, NULL); return (NULL); } KASSERT(!cv_has_waiters(&bp->b_done)); #ifdef DIAGNOSTIC if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) && bp->b_bcount < size && vp->v_type != VBLK) panic("getblk: block size invariant failed"); #endif bremfree(bp); preserve = 1; } else { if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL) goto loop; if (incore(vp, blkno) != NULL) { /* The block has come into memory in the meantime. */ brelsel(bp, 0); goto loop; } LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash); bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno; mutex_enter(vp->v_interlock); bgetvp(vp, bp); mutex_exit(vp->v_interlock); preserve = 0; } mutex_exit(&bufcache_lock); /* * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes) * if we re-size buffers here. */ if (ISSET(bp->b_flags, B_LOCKED)) { KASSERT(bp->b_bufsize >= size); } else { if (allocbuf(bp, size, preserve)) { mutex_enter(&bufcache_lock); LIST_REMOVE(bp, b_hash); brelsel(bp, BC_INVAL); mutex_exit(&bufcache_lock); SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, NULL); return NULL; } } BIO_SETPRIO(bp, BPRIO_DEFAULT); SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, bp); return (bp); } /* * Get an empty, disassociated buffer of given size. */ buf_t * geteblk(int size) { buf_t *bp; int error __diagused; mutex_enter(&bufcache_lock); while ((bp = getnewbuf(0, 0, 0)) == NULL) ; SET(bp->b_cflags, BC_INVAL); LIST_INSERT_HEAD(&invalhash, bp, b_hash); mutex_exit(&bufcache_lock); BIO_SETPRIO(bp, BPRIO_DEFAULT); error = allocbuf(bp, size, 0); KASSERT(error == 0); return (bp); } /* * Expand or contract the actual memory allocated to a buffer. * * If the buffer shrinks, data is lost, so it's up to the * caller to have written it out *first*; this routine will not * start a write. If the buffer grows, it's the callers * responsibility to fill out the buffer's additional contents. */ int allocbuf(buf_t *bp, int size, int preserve) { void *addr; vsize_t oldsize, desired_size; int oldcount; int delta; desired_size = buf_roundsize(size); if (desired_size > MAXBSIZE) printf("allocbuf: buffer larger than MAXBSIZE requested"); oldcount = bp->b_bcount; bp->b_bcount = size; oldsize = bp->b_bufsize; if (oldsize == desired_size) { /* * Do not short cut the WAPBL resize, as the buffer length * could still have changed and this would corrupt the * tracking of the transaction length. */ goto out; } /* * If we want a buffer of a different size, re-allocate the * buffer's memory; copy old content only if needed. 
*/ addr = buf_alloc(desired_size); if (addr == NULL) return ENOMEM; if (preserve) memcpy(addr, bp->b_data, MIN(oldsize,desired_size)); if (bp->b_data != NULL) buf_mrelease(bp->b_data, oldsize); bp->b_data = addr; bp->b_bufsize = desired_size; /* * Update overall buffer memory counter (protected by bufcache_lock) */ delta = (long)desired_size - (long)oldsize; mutex_enter(&bufcache_lock); if ((bufmem += delta) > bufmem_hiwater) { /* * Need to trim overall memory usage. */ while (buf_canrelease()) { if (preempt_needed()) { mutex_exit(&bufcache_lock); preempt(); mutex_enter(&bufcache_lock); } if (buf_trim() == 0) break; } } mutex_exit(&bufcache_lock); out: if (wapbl_vphaswapbl(bp->b_vp)) WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount); return 0; } /* * Find a buffer which is available for use. * Select something from a free list. * Preference is to AGE list, then LRU list. * * Called with the buffer queues locked. * Return buffer locked. */ static buf_t * getnewbuf(int slpflag, int slptimeo, int from_bufq) { buf_t *bp; struct vnode *vp; struct mount *transmp = NULL; SDT_PROBE0(io, kernel, , getnewbuf__start); start: KASSERT(mutex_owned(&bufcache_lock)); /* * Get a new buffer from the pool. */ if (!from_bufq && buf_lotsfree()) { mutex_exit(&bufcache_lock); bp = pool_cache_get(buf_cache, PR_NOWAIT); if (bp != NULL) { memset((char *)bp, 0, sizeof(*bp)); buf_init(bp); SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */ mutex_enter(&bufcache_lock); #if defined(DIAGNOSTIC) bp->b_freelistindex = -1; #endif /* defined(DIAGNOSTIC) */ SDT_PROBE1(io, kernel, , getnewbuf__done, bp); return (bp); } mutex_enter(&bufcache_lock); } KASSERT(mutex_owned(&bufcache_lock)); if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL) { KASSERT(!ISSET(bp->b_oflags, BO_DELWRI)); } else { TAILQ_FOREACH(bp, &bufqueues[BQ_LRU].bq_queue, b_freelist) { if (ISSET(bp->b_cflags, BC_VFLUSH) || !ISSET(bp->b_oflags, BO_DELWRI)) break; if (fstrans_start_nowait(bp->b_vp->v_mount) == 0) { KASSERT(transmp == NULL); transmp = bp->b_vp->v_mount; break; } } } if (bp != NULL) { KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH)); bremfree(bp); /* Buffer is no longer on free lists. */ SET(bp->b_cflags, BC_BUSY); /* Wake anyone trying to lock the old identity. */ cv_broadcast(&bp->b_busy); } else { /* * XXX: !from_bufq should be removed. */ if (!from_bufq || curlwp != uvm.pagedaemon_lwp) { /* wait for a free buffer of any kind */ if ((slpflag & PCATCH) != 0) (void)cv_timedwait_sig(&needbuffer_cv, &bufcache_lock, slptimeo); else (void)cv_timedwait(&needbuffer_cv, &bufcache_lock, slptimeo); } SDT_PROBE1(io, kernel, , getnewbuf__done, NULL); return (NULL); } #ifdef DIAGNOSTIC if (bp->b_bufsize <= 0) panic("buffer %p: on queue but empty", bp); #endif if (ISSET(bp->b_cflags, BC_VFLUSH)) { /* * This is a delayed write buffer being flushed to disk. Make * sure it gets aged out of the queue when it's finished, and * leave it off the LRU queue. */ CLR(bp->b_cflags, BC_VFLUSH); SET(bp->b_cflags, BC_AGE); goto start; } KASSERT(ISSET(bp->b_cflags, BC_BUSY)); KASSERT(!cv_has_waiters(&bp->b_done)); /* * If buffer was a delayed write, start it and return NULL * (since we might sleep while starting the write). */ if (ISSET(bp->b_oflags, BO_DELWRI)) { /* * This buffer has gone through the LRU, so make sure it gets * reused ASAP. 
*/ SET(bp->b_cflags, BC_AGE); mutex_exit(&bufcache_lock); bawrite(bp); KASSERT(transmp != NULL); fstrans_done(transmp); mutex_enter(&bufcache_lock); SDT_PROBE1(io, kernel, , getnewbuf__done, NULL); return (NULL); } KASSERT(transmp == NULL); vp = bp->b_vp; /* clear out various other fields */ bp->b_cflags = BC_BUSY; bp->b_oflags = 0; bp->b_flags = 0; bp->b_dev = NODEV; bp->b_blkno = 0; bp->b_lblkno = 0; bp->b_rawblkno = 0; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; LIST_REMOVE(bp, b_hash); /* Disassociate us from our vnode, if we had one... */ if (vp != NULL) { mutex_enter(vp->v_interlock); brelvp(bp); mutex_exit(vp->v_interlock); } SDT_PROBE1(io, kernel, , getnewbuf__done, bp); return (bp); } /* * Invalidate the specified buffer if it exists. */ void binvalbuf(struct vnode *vp, daddr_t blkno) { buf_t *bp; int err; mutex_enter(&bufcache_lock); loop: bp = incore(vp, blkno); if (bp != NULL) { err = bbusy(bp, 0, 0, NULL); if (err == EPASSTHROUGH) goto loop; bremfree(bp); if (ISSET(bp->b_oflags, BO_DELWRI)) { SET(bp->b_cflags, BC_NOCACHE); mutex_exit(&bufcache_lock); bwrite(bp); } else { brelsel(bp, BC_INVAL); mutex_exit(&bufcache_lock); } } else mutex_exit(&bufcache_lock); } /* * Attempt to free an aged buffer off the queues. * Called with queue lock held. * Returns the amount of buffer memory freed. */ static int buf_trim(void) { buf_t *bp; long size; KASSERT(mutex_owned(&bufcache_lock)); /* Instruct getnewbuf() to get buffers off the queues */ if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL) return 0; KASSERT((bp->b_cflags & BC_WANTED) == 0); size = bp->b_bufsize; bufmem -= size; if (size > 0) { buf_mrelease(bp->b_data, size); bp->b_bcount = bp->b_bufsize = 0; } /* brelse() will return the buffer to the global buffer pool */ brelsel(bp, 0); return size; } int buf_drain(int n) { int size = 0, sz; KASSERT(mutex_owned(&bufcache_lock)); while (size < n && bufmem > bufmem_lowater) { sz = buf_trim(); if (sz <= 0) break; size += sz; } return size; } /* * Wait for operations on the buffer to complete. * When they do, extract and return the I/O's error value. */ int biowait(buf_t *bp) { BIOHIST_FUNC(__func__); KASSERT(ISSET(bp->b_cflags, BC_BUSY)); SDT_PROBE1(io, kernel, , wait__start, bp); mutex_enter(bp->b_objlock); BIOHIST_CALLARGS(biohist, "bp=%#jx, oflags=0x%jx, ret_addr=%#jx", (uintptr_t)bp, bp->b_oflags, (uintptr_t)__builtin_return_address(0), 0); while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) { BIOHIST_LOG(biohist, "waiting bp=%#jx", (uintptr_t)bp, 0, 0, 0); cv_wait(&bp->b_done, bp->b_objlock); } mutex_exit(bp->b_objlock); SDT_PROBE1(io, kernel, , wait__done, bp); BIOHIST_LOG(biohist, "return %jd", bp->b_error, 0, 0, 0); return bp->b_error; } /* * Mark I/O complete on a buffer. * * If a callback has been requested, e.g. the pageout * daemon, do so. Otherwise, awaken waiting processes. * * [ Leffler, et al., says on p.247: * "This routine wakes up the blocked process, frees the buffer * for an asynchronous write, or, for a request by the pagedaemon * process, invokes a procedure specified in the buffer structure" ] * * In real life, the pagedaemon (or other system processes) wants * to do async stuff too, and doesn't want the buffer brelse()'d. * (for swap pager, that puts swap buffers on the free lists (!!!), * for the vn device, that puts allocated buffers on the free lists!) */ void biodone(buf_t *bp) { int s; BIOHIST_FUNC(__func__); KASSERT(!ISSET(bp->b_oflags, BO_DONE)); if (cpu_intr_p()) { /* From interrupt mode: defer to a soft interrupt. 
*/ s = splvm(); TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq); BIOHIST_CALLARGS(biohist, "bp=%#jx, softint scheduled", (uintptr_t)bp, 0, 0, 0); softint_schedule(biodone_sih); splx(s); } else { /* Process now - the buffer may be freed soon. */ biodone2(bp); } } SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf *"/*bp*/); static void biodone2(buf_t *bp) { void (*callout)(buf_t *); SDT_PROBE1(io, kernel, ,done, bp); BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); mutex_enter(bp->b_objlock); /* Note that the transfer is done. */ if (ISSET(bp->b_oflags, BO_DONE)) panic("biodone2 already"); CLR(bp->b_flags, B_COWDONE); SET(bp->b_oflags, BO_DONE); BIO_SETPRIO(bp, BPRIO_DEFAULT); /* Wake up waiting writers. */ if (!ISSET(bp->b_flags, B_READ)) vwakeup(bp); if ((callout = bp->b_iodone) != NULL) { BIOHIST_LOG(biohist, "callout %#jx", (uintptr_t)callout, 0, 0, 0); /* Note callout done, then call out. */ KASSERT(!cv_has_waiters(&bp->b_done)); bp->b_iodone = NULL; mutex_exit(bp->b_objlock); (*callout)(bp); } else if (ISSET(bp->b_flags, B_ASYNC)) { /* If async, release. */ BIOHIST_LOG(biohist, "async", 0, 0, 0, 0); KASSERT(!cv_has_waiters(&bp->b_done)); mutex_exit(bp->b_objlock); brelse(bp, 0); } else { /* Otherwise just wake up waiters in biowait(). */ BIOHIST_LOG(biohist, "wake-up", 0, 0, 0, 0); cv_broadcast(&bp->b_done); mutex_exit(bp->b_objlock); } } static void biointr(void *cookie) { struct cpu_info *ci; buf_t *bp; int s; BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); ci = curcpu(); s = splvm(); while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) { KASSERT(curcpu() == ci); bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone); TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq); splx(s); BIOHIST_LOG(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); biodone2(bp); s = splvm(); } splx(s); } static void sysctl_fillbuf(const buf_t *i, struct buf_sysctl *o) { const bool allowaddr = get_expose_address(curproc); memset(o, 0, sizeof(*o)); o->b_flags = i->b_flags | i->b_cflags | i->b_oflags; o->b_error = i->b_error; o->b_prio = i->b_prio; o->b_dev = i->b_dev; o->b_bufsize = i->b_bufsize; o->b_bcount = i->b_bcount; o->b_resid = i->b_resid; COND_SET_VALUE(o->b_addr, PTRTOUINT64(i->b_data), allowaddr); o->b_blkno = i->b_blkno; o->b_rawblkno = i->b_rawblkno; COND_SET_VALUE(o->b_iodone, PTRTOUINT64(i->b_iodone), allowaddr); COND_SET_VALUE(o->b_proc, PTRTOUINT64(i->b_proc), allowaddr); COND_SET_VALUE(o->b_vp, PTRTOUINT64(i->b_vp), allowaddr); COND_SET_VALUE(o->b_saveaddr, PTRTOUINT64(i->b_saveaddr), allowaddr); o->b_lblkno = i->b_lblkno; } static int sysctl_dobuf(SYSCTLFN_ARGS) { buf_t *bp; struct buf_sysctl bs; struct bqueue *bq; char *dp; u_int i, op, arg; size_t len, needed, elem_size, out_size; int error, elem_count, retries; if (namelen == 1 && name[0] == CTL_QUERY) return (sysctl_query(SYSCTLFN_CALL(rnode))); if (namelen != 4) return (EINVAL); retries = 100; retry: dp = oldp; len = (oldp != NULL) ? *oldlenp : 0; op = name[0]; arg = name[1]; elem_size = name[2]; elem_count = name[3]; out_size = MIN(sizeof(bs), elem_size); /* * at the moment, these are just "placeholders" to make the * API for retrieving kern.buf data more extensible in the * future. * * XXX kern.buf currently has "netbsd32" issues. hopefully * these will be resolved at a later point. 
*/ if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL || elem_size < 1 || elem_count < 0) return (EINVAL); if (oldp == NULL) { /* count only, don't run through the buffer queues */ needed = pool_cache_nget(buf_cache) - pool_cache_nput(buf_cache); *oldlenp = (needed + KERN_BUFSLOP) * elem_size; return 0; } error = 0; needed = 0; sysctl_unlock(); mutex_enter(&bufcache_lock); for (i = 0; i < BQUEUES; i++) { bq = &bufqueues[i]; TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) { bq->bq_marker = bp; if (len >= elem_size && elem_count > 0) { sysctl_fillbuf(bp, &bs); mutex_exit(&bufcache_lock); error = copyout(&bs, dp, out_size); mutex_enter(&bufcache_lock); if (error) break; if (bq->bq_marker != bp) { /* * This sysctl node is only for * statistics. Retry; if the * queue keeps changing, then * bail out. */ if (retries-- == 0) { error = EAGAIN; break; } mutex_exit(&bufcache_lock); sysctl_relock(); goto retry; } dp += elem_size; len -= elem_size; } needed += elem_size; if (elem_count > 0 && elem_count != INT_MAX) elem_count--; } if (error != 0) break; } mutex_exit(&bufcache_lock); sysctl_relock(); *oldlenp = needed; return (error); } static int sysctl_bufvm_update(SYSCTLFN_ARGS) { int error, rv; struct sysctlnode node; unsigned int temp_bufcache; unsigned long temp_water; /* Take a copy of the supplied node and its data */ node = *rnode; if (node.sysctl_data == &bufcache) { node.sysctl_data = &temp_bufcache; temp_bufcache = *(unsigned int *)rnode->sysctl_data; } else { node.sysctl_data = &temp_water; temp_water = *(unsigned long *)rnode->sysctl_data; } /* Update the copy */ error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); if (rnode->sysctl_data == &bufcache) { if (temp_bufcache > 100) return (EINVAL); bufcache = temp_bufcache; buf_setwm(); } else if (rnode->sysctl_data == &bufmem_lowater) { if (bufmem_hiwater - temp_water < 16) return (EINVAL); bufmem_lowater = temp_water; } else if (rnode->sysctl_data == &bufmem_hiwater) { if (temp_water - bufmem_lowater < 16) return (EINVAL); bufmem_hiwater = temp_water; } else return (EINVAL); /* Drain until below new high water mark */ sysctl_unlock(); mutex_enter(&bufcache_lock); while (bufmem > bufmem_hiwater) { rv = buf_drain((bufmem - bufmem_hiwater) / (2 * 1024)); if (rv <= 0) break; } mutex_exit(&bufcache_lock); sysctl_relock(); return 0; } static struct sysctllog *vfsbio_sysctllog; static void sysctl_kern_buf_setup(void) { sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "buf", SYSCTL_DESCR("Kernel buffer cache information"), sysctl_dobuf, 0, NULL, 0, CTL_KERN, KERN_BUF, CTL_EOL); } static void sysctl_vm_buf_setup(void) { sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "bufcache", SYSCTL_DESCR("Percentage of physical memory to use for " "buffer cache"), sysctl_bufvm_update, 0, &bufcache, 0, CTL_VM, CTL_CREATE, CTL_EOL); sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_LONG, "bufmem", SYSCTL_DESCR("Amount of kernel memory used by buffer " "cache"), NULL, 0, &bufmem, 0, CTL_VM, CTL_CREATE, CTL_EOL); sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_LONG, "bufmem_lowater", SYSCTL_DESCR("Minimum amount of kernel memory to " "reserve for buffer cache"), sysctl_bufvm_update, 0, &bufmem_lowater, 0, CTL_VM, CTL_CREATE, CTL_EOL); sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_LONG, "bufmem_hiwater", 
SYSCTL_DESCR("Maximum amount of kernel memory to use " "for buffer cache"), sysctl_bufvm_update, 0, &bufmem_hiwater, 0, CTL_VM, CTL_CREATE, CTL_EOL); } static int bufhash_stats(struct hashstat_sysctl *hs, bool fill) { buf_t *bp; uint64_t chain; strlcpy(hs->hash_name, "bufhash", sizeof(hs->hash_name)); strlcpy(hs->hash_desc, "buffer hash", sizeof(hs->hash_desc)); if (!fill) return 0; hs->hash_size = bufhash + 1; for (size_t i = 0; i < hs->hash_size; i++) { chain = 0; mutex_enter(&bufcache_lock); LIST_FOREACH(bp, &bufhashtbl[i], b_hash) { chain++; } mutex_exit(&bufcache_lock); if (chain > 0) { hs->hash_used++; hs->hash_items += chain; if (chain > hs->hash_maxchain) hs->hash_maxchain = chain; } preempt_point(); } return 0; } #ifdef DEBUG /* * Print out statistics on the current allocation of the buffer pool. * Can be enabled to print out on every ``sync'' by setting "syncprt" * in vfs_syscalls.c using sysctl. */ void vfs_bufstats(void) { int i, j, count; buf_t *bp; struct bqueue *dp; int counts[MAXBSIZE / MIN_PAGE_SIZE + 1]; static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" }; for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { count = 0; memset(counts, 0, sizeof(counts)); TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) { counts[bp->b_bufsize / PAGE_SIZE]++; count++; } printf("%s: total-%d", bname[i], count); for (j = 0; j <= MAXBSIZE / PAGE_SIZE; j++) if (counts[j] != 0) printf(", %d-%d", j * PAGE_SIZE, counts[j]); printf("\n"); } } #endif /* DEBUG */ /* ------------------------------ */ buf_t * getiobuf(struct vnode *vp, bool waitok) { buf_t *bp; bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT)); if (bp == NULL) return bp; buf_init(bp); if ((bp->b_vp = vp) != NULL) { bp->b_objlock = vp->v_interlock; } else { KASSERT(bp->b_objlock == &buffer_lock); } return bp; } void putiobuf(buf_t *bp) { buf_destroy(bp); pool_cache_put(bufio_cache, bp); } /* * nestiobuf_iodone: b_iodone callback for nested buffers. */ void nestiobuf_iodone(buf_t *bp) { buf_t *mbp = bp->b_private; int error; int donebytes; KASSERT(bp->b_bcount <= bp->b_bufsize); KASSERT(mbp != bp); error = bp->b_error; if (bp->b_error == 0 && (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) { /* * Not all got transferred, raise an error. We have no way to * propagate these conditions to mbp. */ error = EIO; } donebytes = bp->b_bufsize; putiobuf(bp); nestiobuf_done(mbp, donebytes, error); } /* * nestiobuf_setup: setup a "nested" buffer. * * => 'mbp' is a "master" buffer which is being divided into sub pieces. * => 'bp' should be a buffer allocated by getiobuf. * => 'offset' is a byte offset in the master buffer. * => 'size' is a size in bytes of this nested buffer. */ void nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size) { const int b_pass = mbp->b_flags & (B_READ|B_PHYS|B_RAW|B_MEDIA_FLAGS); struct vnode *vp = mbp->b_vp; KASSERT(mbp->b_bcount >= offset + size); bp->b_vp = vp; bp->b_dev = mbp->b_dev; bp->b_objlock = mbp->b_objlock; bp->b_cflags = BC_BUSY; bp->b_flags = B_ASYNC | b_pass; bp->b_iodone = nestiobuf_iodone; bp->b_data = (char *)mbp->b_data + offset; bp->b_resid = bp->b_bcount = size; bp->b_bufsize = bp->b_bcount; bp->b_private = mbp; BIO_COPYPRIO(bp, mbp); if (BUF_ISWRITE(bp) && vp != NULL) { mutex_enter(vp->v_interlock); vp->v_numoutput++; mutex_exit(vp->v_interlock); } } /* * nestiobuf_done: propagate completion to the master buffer. * * => 'donebytes' specifies how many bytes in the 'mbp' is completed. * => 'error' is an errno(2) that 'donebytes' has been completed with. 
*/ void nestiobuf_done(buf_t *mbp, int donebytes, int error) { if (donebytes == 0) { return; } mutex_enter(mbp->b_objlock); KASSERT(mbp->b_resid >= donebytes); mbp->b_resid -= donebytes; if (error) mbp->b_error = error; if (mbp->b_resid == 0) { if (mbp->b_error) mbp->b_resid = mbp->b_bcount; mutex_exit(mbp->b_objlock); biodone(mbp); } else mutex_exit(mbp->b_objlock); } void buf_init(buf_t *bp) { cv_init(&bp->b_busy, "biolock"); cv_init(&bp->b_done, "biowait"); bp->b_dev = NODEV; bp->b_error = 0; bp->b_flags = 0; bp->b_cflags = 0; bp->b_oflags = 0; bp->b_objlock = &buffer_lock; bp->b_iodone = NULL; bp->b_dev = NODEV; bp->b_vnbufs.le_next = NOLIST; BIO_SETPRIO(bp, BPRIO_DEFAULT); } void buf_destroy(buf_t *bp) { cv_destroy(&bp->b_done); cv_destroy(&bp->b_busy); } int bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock) { int error; KASSERT(mutex_owned(&bufcache_lock)); SDT_PROBE4(io, kernel, , bbusy__start, bp, intr, timo, interlock); if ((bp->b_cflags & BC_BUSY) != 0) { if (curlwp == uvm.pagedaemon_lwp) { error = EDEADLK; goto out; } bp->b_cflags |= BC_WANTED; if (interlock != NULL) mutex_exit(interlock); if (intr) { error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock, timo); } else { error = cv_timedwait(&bp->b_busy, &bufcache_lock, timo); } /* * At this point the buffer may be gone: don't touch it * again. The caller needs to find it again and retry. */ if (interlock != NULL) mutex_enter(interlock); if (error == 0) error = EPASSTHROUGH; } else { bp->b_cflags |= BC_BUSY; error = 0; } out: SDT_PROBE5(io, kernel, , bbusy__done, bp, intr, timo, interlock, error); return error; } /* * Nothing outside this file should really need to know about nbuf, * but a few things still want to read it, so give them a way to do that. */ u_int buf_nbuf(void) { return nbuf; }
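/*
 * Illustrative sketch, not part of the original vfs_bio.c: a minimal
 * example of how a hypothetical caller might use the bread(), bwrite()
 * and bdwrite() interfaces above to perform a read-modify-write of a
 * single metadata block.  The function name, its parameters and the
 * "sync" policy are invented for illustration; only the buffer-cache
 * calls themselves are taken from the code above.
 */
#if 0	/* example only -- not compiled */
static int
example_update_block(struct vnode *vp, daddr_t blkno, int size,
    const void *src, size_t len, bool sync)
{
	buf_t *bp;
	int error;

	/* bread() returns a busy buffer with valid data, or an error. */
	error = bread(vp, blkno, size, 0, &bp);
	if (error)
		return error;	/* on error, bread() already released bp */

	/* Modify the cached copy in place. */
	memcpy(bp->b_data, src, MIN(len, (size_t)size));

	if (sync) {
		/* Synchronous write: waits for I/O and releases the buffer. */
		error = bwrite(bp);
	} else {
		/* Delayed write: mark dirty and release; flushed later. */
		bdwrite(bp);
		error = 0;
	}
	return error;
}
#endif	/* example only */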
/* $NetBSD: tmpfs_vfsops.c,v 1.78 2022/11/10 10:54:14 hannken Exp $ */ /* * Copyright (c) 2005, 2006, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ /* * Efficient memory file system. * * tmpfs is a file system that uses NetBSD's virtual memory sub-system * (the well-known UVM) to store file data and metadata in an efficient * way. This means that it does not follow the structure of an on-disk * file system because it simply does not need to. Instead, it uses * memory-specific data structures and algorithms to automatically * allocate and release resources. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tmpfs_vfsops.c,v 1.78 2022/11/10 10:54:14 hannken Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/types.h> #include <sys/kmem.h> #include <sys/mount.h> #include <sys/stat.h> #include <sys/systm.h> #include <sys/vnode.h> #include <sys/kauth.h> #include <sys/module.h> #include <miscfs/genfs/genfs.h> #include <fs/tmpfs/tmpfs.h> #include <fs/tmpfs/tmpfs_args.h> MODULE(MODULE_CLASS_VFS, tmpfs, NULL); struct pool tmpfs_dirent_pool; struct pool tmpfs_node_pool; void tmpfs_init(void) { pool_init(&tmpfs_dirent_pool, sizeof(tmpfs_dirent_t), 0, 0, 0, "tmpfs_dirent", &pool_allocator_nointr, IPL_NONE); pool_init(&tmpfs_node_pool, sizeof(tmpfs_node_t), 0, 0, 0, "tmpfs_node", &pool_allocator_nointr, IPL_NONE); } void tmpfs_done(void) { pool_destroy(&tmpfs_dirent_pool); pool_destroy(&tmpfs_node_pool); } int tmpfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct tmpfs_args *args = data; tmpfs_mount_t *tmp; tmpfs_node_t *root; struct vattr va; struct vnode *vp; uint64_t memlimit; ino_t nodes; int error, flags; bool set_memlimit; bool set_nodes; if (args == NULL) return EINVAL; /* Validate the version. */ if (*data_len < sizeof(*args) || args->ta_version != TMPFS_ARGS_VERSION) return EINVAL; /* Handle retrieval of mount point arguments. */ if (mp->mnt_flag & MNT_GETARGS) { if (mp->mnt_data == NULL) return EIO; tmp = VFS_TO_TMPFS(mp); args->ta_version = TMPFS_ARGS_VERSION; args->ta_nodes_max = tmp->tm_nodes_max; args->ta_size_max = tmp->tm_mem_limit; root = tmp->tm_root; args->ta_root_uid = root->tn_uid; args->ta_root_gid = root->tn_gid; args->ta_root_mode = root->tn_mode; *data_len = sizeof(*args); return 0; } /* Prohibit mounts if there is not enough memory. */ if (tmpfs_mem_info(true) < uvmexp.freetarg) return EINVAL; /* Check for invalid uid and gid arguments */ if (args->ta_root_uid == VNOVAL || args->ta_root_gid == VNOVAL) return EINVAL; /* Get the memory usage limit for this file-system. */ if (args->ta_size_max < PAGE_SIZE) { memlimit = UINT64_MAX; set_memlimit = false; } else { memlimit = args->ta_size_max; set_memlimit = true; } KASSERT(memlimit > 0); if (args->ta_nodes_max <= 3) { nodes = 3 + (memlimit / 1024); set_nodes = false; } else { nodes = args->ta_nodes_max; set_nodes = true; } nodes = MIN(nodes, INT_MAX); KASSERT(nodes >= 3); if (mp->mnt_flag & MNT_UPDATE) { tmp = VFS_TO_TMPFS(mp); if (set_nodes && nodes < tmp->tm_nodes_cnt) return EBUSY; if ((mp->mnt_iflag & IMNT_WANTRDONLY)) { /* Changing from read/write to read-only. 
*/ flags = WRITECLOSE; if ((mp->mnt_flag & MNT_FORCE)) flags |= FORCECLOSE; error = vflush(mp, NULL, flags); if (error) return error; } if (set_memlimit) { if ((error = tmpfs_mntmem_set(tmp, memlimit)) != 0) return error; } if (set_nodes) tmp->tm_nodes_max = nodes; root = tmp->tm_root; root->tn_uid = args->ta_root_uid; root->tn_gid = args->ta_root_gid; root->tn_mode = args->ta_root_mode; return 0; } mp->mnt_flag |= MNT_LOCAL; mp->mnt_stat.f_namemax = TMPFS_MAXNAMLEN; mp->mnt_fs_bshift = PAGE_SHIFT; mp->mnt_dev_bshift = DEV_BSHIFT; mp->mnt_iflag |= IMNT_MPSAFE | IMNT_CAN_RWTORO | IMNT_SHRLOOKUP | IMNT_NCLOOKUP; vfs_getnewfsid(mp); /* Allocate the tmpfs mount structure and fill it. */ tmp = kmem_zalloc(sizeof(tmpfs_mount_t), KM_SLEEP); tmp->tm_nodes_max = nodes; tmp->tm_nodes_cnt = 0; LIST_INIT(&tmp->tm_nodes); mutex_init(&tmp->tm_lock, MUTEX_DEFAULT, IPL_NONE); tmpfs_mntmem_init(tmp, memlimit); mp->mnt_data = tmp; error = set_statvfs_info(path, UIO_USERSPACE, "tmpfs", UIO_SYSSPACE, mp->mnt_op->vfs_name, mp, curlwp); if (error) goto errout; /* Allocate the root node. */ vattr_null(&va); va.va_type = VDIR; va.va_mode = args->ta_root_mode & ALLPERMS; va.va_uid = args->ta_root_uid; va.va_gid = args->ta_root_gid; error = vcache_new(mp, NULL, &va, NOCRED, NULL, &vp); if (error) goto errout; KASSERT(vp != NULL); root = VP_TO_TMPFS_NODE(vp); KASSERT(root != NULL); /* * Parent of the root inode is itself. Also, root inode has no * directory entry (i.e. is never attached), thus hold an extra * reference (link) for it. */ root->tn_links++; root->tn_spec.tn_dir.tn_parent = root; tmp->tm_root = root; vrele(vp); return 0; errout: mp->mnt_data = NULL; tmpfs_mntmem_destroy(tmp); mutex_destroy(&tmp->tm_lock); kmem_free(tmp, sizeof(*tmp)); return error; } int tmpfs_start(struct mount *mp, int flags) { return 0; } int tmpfs_unmount(struct mount *mp, int mntflags) { tmpfs_mount_t *tmp = VFS_TO_TMPFS(mp); tmpfs_node_t *node, *cnode; int error, flags = 0; /* Handle forced unmounts. */ if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* Finalize all pending I/O. */ error = vflush(mp, NULL, flags); if (error != 0) return error; /* * First round, detach and destroy all directory entries. * Also, clear the pointers to the vnodes - they are gone. */ LIST_FOREACH(node, &tmp->tm_nodes, tn_entries) { tmpfs_dirent_t *de; node->tn_vnode = NULL; if (node->tn_type != VDIR) { continue; } while ((de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir)) != NULL) { cnode = de->td_node; if (cnode && cnode != TMPFS_NODE_WHITEOUT) { cnode->tn_vnode = NULL; } tmpfs_dir_detach(node, de); tmpfs_free_dirent(tmp, de); } /* Extra virtual entry (itself for the root). */ node->tn_links--; } /* Release the reference on root (diagnostic). */ node = tmp->tm_root; node->tn_links--; /* Second round, destroy all inodes. */ while ((node = LIST_FIRST(&tmp->tm_nodes)) != NULL) { tmpfs_free_node(tmp, node); } /* Throw away the tmpfs_mount structure. 
*/ tmpfs_mntmem_destroy(tmp); mutex_destroy(&tmp->tm_lock); kmem_free(tmp, sizeof(*tmp)); mp->mnt_data = NULL; return 0; } int tmpfs_root(struct mount *mp, int lktype, vnode_t **vpp) { tmpfs_node_t *node = VFS_TO_TMPFS(mp)->tm_root; int error; error = vcache_get(mp, &node, sizeof(node), vpp); if (error) return error; error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULL; return error; } return 0; } int tmpfs_vget(struct mount *mp, ino_t ino, int lktype, vnode_t **vpp) { return EOPNOTSUPP; } int tmpfs_fhtovp(struct mount *mp, struct fid *fhp, int lktype, vnode_t **vpp) { tmpfs_mount_t *tmp = VFS_TO_TMPFS(mp); tmpfs_node_t *node; tmpfs_fid_t tfh; int error; if (fhp->fid_len != sizeof(tmpfs_fid_t)) { return EINVAL; } memcpy(&tfh, fhp, sizeof(tmpfs_fid_t)); mutex_enter(&tmp->tm_lock); /* XXX big oof .. use a better data structure */ LIST_FOREACH(node, &tmp->tm_nodes, tn_entries) { if (node->tn_id == tfh.tf_id) { /* Prevent this node from disappearing. */ atomic_inc_32(&node->tn_holdcount); break; } } mutex_exit(&tmp->tm_lock); if (node == NULL) return ESTALE; error = vcache_get(mp, &node, sizeof(node), vpp); /* If this node has been reclaimed free it now. */ if (atomic_dec_32_nv(&node->tn_holdcount) == TMPFS_NODE_RECLAIMED) { KASSERT(error != 0); tmpfs_free_node(tmp, node); } if (error) return (error == ENOENT ? ESTALE : error); error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULL; return error; } if (TMPFS_NODE_GEN(node) != tfh.tf_gen) { vput(*vpp); *vpp = NULL; return ESTALE; } return 0; } int tmpfs_vptofh(vnode_t *vp, struct fid *fhp, size_t *fh_size) { tmpfs_fid_t tfh; tmpfs_node_t *node; if (*fh_size < sizeof(tmpfs_fid_t)) { *fh_size = sizeof(tmpfs_fid_t); return E2BIG; } *fh_size = sizeof(tmpfs_fid_t); node = VP_TO_TMPFS_NODE(vp); memset(&tfh, 0, sizeof(tfh)); tfh.tf_len = sizeof(tmpfs_fid_t); tfh.tf_gen = TMPFS_NODE_GEN(node); tfh.tf_id = node->tn_id; memcpy(fhp, &tfh, sizeof(tfh)); return 0; } int tmpfs_statvfs(struct mount *mp, struct statvfs *sbp) { tmpfs_mount_t *tmp; fsfilcnt_t freenodes; size_t avail; tmp = VFS_TO_TMPFS(mp); sbp->f_iosize = sbp->f_frsize = sbp->f_bsize = PAGE_SIZE; mutex_enter(&tmp->tm_acc_lock); avail = tmpfs_pages_avail(tmp); sbp->f_blocks = (tmpfs_bytes_max(tmp) >> PAGE_SHIFT); sbp->f_bavail = sbp->f_bfree = avail; sbp->f_bresvd = 0; freenodes = MIN(tmp->tm_nodes_max - tmp->tm_nodes_cnt, avail * PAGE_SIZE / sizeof(tmpfs_node_t)); sbp->f_files = tmp->tm_nodes_cnt + freenodes; sbp->f_favail = sbp->f_ffree = freenodes; sbp->f_fresvd = 0; mutex_exit(&tmp->tm_acc_lock); copy_statvfs_info(sbp, mp); return 0; } int tmpfs_sync(struct mount *mp, int waitfor, kauth_cred_t uc) { return 0; } int tmpfs_snapshot(struct mount *mp, vnode_t *vp, struct timespec *ctime) { return EOPNOTSUPP; } /* * tmpfs vfs operations. 
*/ extern const struct vnodeopv_desc tmpfs_fifoop_opv_desc; extern const struct vnodeopv_desc tmpfs_specop_opv_desc; extern const struct vnodeopv_desc tmpfs_vnodeop_opv_desc; const struct vnodeopv_desc * const tmpfs_vnodeopv_descs[] = { &tmpfs_fifoop_opv_desc, &tmpfs_specop_opv_desc, &tmpfs_vnodeop_opv_desc, NULL, }; struct vfsops tmpfs_vfsops = { .vfs_name = MOUNT_TMPFS, .vfs_min_mount_data = sizeof (struct tmpfs_args), .vfs_mount = tmpfs_mount, .vfs_start = tmpfs_start, .vfs_unmount = tmpfs_unmount, .vfs_root = tmpfs_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = tmpfs_statvfs, .vfs_sync = tmpfs_sync, .vfs_vget = tmpfs_vget, .vfs_loadvnode = tmpfs_loadvnode, .vfs_newvnode = tmpfs_newvnode, .vfs_fhtovp = tmpfs_fhtovp, .vfs_vptofh = tmpfs_vptofh, .vfs_init = tmpfs_init, .vfs_done = tmpfs_done, .vfs_snapshot = tmpfs_snapshot, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = tmpfs_vnodeopv_descs }; static int tmpfs_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return vfs_attach(&tmpfs_vfsops); case MODULE_CMD_FINI: return vfs_detach(&tmpfs_vfsops); default: return ENOTTY; } }
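/*
 * Illustrative sketch, not part of tmpfs_vfsops.c: roughly how a
 * userland program might fill in struct tmpfs_args before calling
 * mount(2).  The limits chosen here are arbitrary examples; as
 * tmpfs_mount() above shows, a ta_size_max below PAGE_SIZE means
 * "no memory limit" and a ta_nodes_max of 3 or less selects the
 * default of 3 + memlimit / 1024 nodes.
 */
#if 0	/* example only -- not compiled */
#include <sys/mount.h>
#include <fs/tmpfs/tmpfs_args.h>
#include <string.h>

static int
example_mount_tmpfs(const char *where)
{
	struct tmpfs_args args;

	memset(&args, 0, sizeof(args));
	args.ta_version = TMPFS_ARGS_VERSION;
	args.ta_nodes_max = 0;			/* take the default */
	args.ta_size_max = 64 * 1024 * 1024;	/* 64 MiB memory limit */
	args.ta_root_uid = 0;
	args.ta_root_gid = 0;
	args.ta_root_mode = 01777;		/* sticky /tmp-style root */

	return mount(MOUNT_TMPFS, where, 0, &args, sizeof(args));
}
#endif	/* example only */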
/* $NetBSD: if_ethersubr.c,v 1.326 2023/11/02 09:40:47 yamaguchi Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ethersubr.c 8.2 (Berkeley) 4/4/96 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: if_ethersubr.c,v 1.326 2023/11/02 09:40:47 yamaguchi Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_atalk.h" #include "opt_mbuftrace.h" #include "opt_mpls.h" #include "opt_gateway.h" #include "opt_pppoe.h" #include "opt_net_mpsafe.h" #endif #include "vlan.h" #include "pppoe.h" #include "bridge.h" #include "arp.h" #include "agr.h" #include <sys/sysctl.h> #include <sys/mbuf.h> #include <sys/mutex.h> #include <sys/ioctl.h> #include <sys/errno.h> #include <sys/device.h> #include <sys/entropy.h> #include <sys/rndsource.h> #include <sys/cpu.h> #include <sys/kmem.h> #include <sys/hook.h> #include <net/if.h> #include <net/route.h> #include <net/if_llc.h> #include <net/if_dl.h> #include <net/if_types.h> #include <net/pktqueue.h> #include <net/if_media.h> #include <dev/mii/mii.h> #include <dev/mii/miivar.h> #if NARP == 0 /* * XXX there should really be a way to issue this warning from within config(8) */ #error You have included NETATALK or a pseudo-device in your configuration that depends on the presence of ethernet interfaces, but have no such interfaces configured. Check if you really need pseudo-device bridge, pppoe, vlan or options NETATALK. 
#endif #include <net/bpf.h> #include <net/if_ether.h> #include <net/if_vlanvar.h> #if NPPPOE > 0 #include <net/if_pppoe.h> #endif #if NAGR > 0 #include <net/ether_slowprotocols.h> #include <net/agr/ieee8023ad.h> #include <net/agr/if_agrvar.h> #endif #if NBRIDGE > 0 #include <net/if_bridgevar.h> #endif #include <netinet/in.h> #ifdef INET #include <netinet/in_var.h> #endif #include <netinet/if_inarp.h> #ifdef INET6 #ifndef INET #include <netinet/in.h> #endif #include <netinet6/in6_var.h> #include <netinet6/nd6.h> #endif #include "carp.h" #if NCARP > 0 #include <netinet/ip_carp.h> #endif #ifdef NETATALK #include <netatalk/at.h> #include <netatalk/at_var.h> #include <netatalk/at_extern.h> #define llc_snap_org_code llc_un.type_snap.org_code #define llc_snap_ether_type llc_un.type_snap.ether_type extern u_char at_org_code[3]; extern u_char aarp_org_code[3]; #endif /* NETATALK */ #ifdef MPLS #include <netmpls/mpls.h> #include <netmpls/mpls_var.h> #endif CTASSERT(sizeof(struct ether_addr) == 6); CTASSERT(sizeof(struct ether_header) == 14); #ifdef DIAGNOSTIC static struct timeval bigpktppslim_last; static int bigpktppslim = 2; /* XXX */ static int bigpktpps_count; static kmutex_t bigpktpps_lock __cacheline_aligned; #endif const uint8_t etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; const uint8_t ethermulticastaddr_slowprotocols[ETHER_ADDR_LEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x02 }; #define senderr(e) { error = (e); goto bad;} static pktq_rps_hash_func_t ether_pktq_rps_hash_p; static int ether_output(struct ifnet *, struct mbuf *, const struct sockaddr *, const struct rtentry *); /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Assumes that ifp is actually pointer to ethercom structure. */ static int ether_output(struct ifnet * const ifp0, struct mbuf * const m0, const struct sockaddr * const dst, const struct rtentry *rt) { uint8_t esrc[ETHER_ADDR_LEN], edst[ETHER_ADDR_LEN]; uint16_t etype = 0; int error = 0, hdrcmplt = 0; struct mbuf *m = m0; struct mbuf *mcopy = NULL; struct ether_header *eh; struct ifnet *ifp = ifp0; #ifdef INET struct arphdr *ah; #endif #ifdef NETATALK struct at_ifaddr *aa; #endif #ifdef MBUFTRACE m_claimm(m, ifp->if_mowner); #endif #if NCARP > 0 if (ifp->if_type == IFT_CARP) { struct ifaddr *ifa; int s = pserialize_read_enter(); /* loop back if this is going to the carp interface */ if (dst != NULL && ifp0->if_link_state == LINK_STATE_UP && (ifa = ifa_ifwithaddr(dst)) != NULL) { if (ifa->ifa_ifp == ifp0) { pserialize_read_exit(s); return looutput(ifp0, m, dst, rt); } } pserialize_read_exit(s); ifp = ifp->if_carpdev; /* ac = (struct arpcom *)ifp; */ if ((ifp0->if_flags & (IFF_UP | IFF_RUNNING)) != (IFF_UP | IFF_RUNNING)) senderr(ENETDOWN); } #endif if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) != (IFF_UP | IFF_RUNNING)) senderr(ENETDOWN); switch (dst->sa_family) { #ifdef INET case AF_INET: if (m->m_flags & M_BCAST) { memcpy(edst, etherbroadcastaddr, sizeof(edst)); } else if (m->m_flags & M_MCAST) { ETHER_MAP_IP_MULTICAST(&satocsin(dst)->sin_addr, edst); } else { error = arpresolve(ifp0, rt, m, dst, edst, sizeof(edst)); if (error) return (error == EWOULDBLOCK) ? 
0 : error; } /* If broadcasting on a simplex interface, loopback a copy */ if ((m->m_flags & M_BCAST) && (ifp->if_flags & IFF_SIMPLEX)) mcopy = m_copypacket(m, M_DONTWAIT); etype = htons(ETHERTYPE_IP); break; case AF_ARP: ah = mtod(m, struct arphdr *); if (m->m_flags & M_BCAST) { memcpy(edst, etherbroadcastaddr, sizeof(edst)); } else { void *tha = ar_tha(ah); if (tha == NULL) { /* fake with ARPHRD_IEEE1394 */ m_freem(m); return 0; } memcpy(edst, tha, sizeof(edst)); } ah->ar_hrd = htons(ARPHRD_ETHER); switch (ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: etype = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: etype = htons(ETHERTYPE_ARP); } break; #endif #ifdef INET6 case AF_INET6: if (m->m_flags & M_BCAST) { memcpy(edst, etherbroadcastaddr, sizeof(edst)); } else if (m->m_flags & M_MCAST) { ETHER_MAP_IPV6_MULTICAST(&satocsin6(dst)->sin6_addr, edst); } else { error = nd6_resolve(ifp0, rt, m, dst, edst, sizeof(edst)); if (error) return (error == EWOULDBLOCK) ? 0 : error; } etype = htons(ETHERTYPE_IPV6); break; #endif #ifdef NETATALK case AF_APPLETALK: { struct ifaddr *ifa; int s; KERNEL_LOCK(1, NULL); if (!aarpresolve(ifp, m, (const struct sockaddr_at *)dst, edst)) { KERNEL_UNLOCK_ONE(NULL); return 0; } /* * ifaddr is the first thing in at_ifaddr */ s = pserialize_read_enter(); ifa = at_ifawithnet((const struct sockaddr_at *)dst, ifp); if (ifa == NULL) { pserialize_read_exit(s); KERNEL_UNLOCK_ONE(NULL); senderr(EADDRNOTAVAIL); } aa = (struct at_ifaddr *)ifa; /* * In the phase 2 case, we need to prepend an mbuf for the * llc header. */ if (aa->aa_flags & AFA_PHASE2) { struct llc llc; M_PREPEND(m, sizeof(struct llc), M_DONTWAIT); if (m == NULL) { pserialize_read_exit(s); KERNEL_UNLOCK_ONE(NULL); senderr(ENOBUFS); } llc.llc_dsap = llc.llc_ssap = LLC_SNAP_LSAP; llc.llc_control = LLC_UI; memcpy(llc.llc_snap_org_code, at_org_code, sizeof(llc.llc_snap_org_code)); llc.llc_snap_ether_type = htons(ETHERTYPE_ATALK); memcpy(mtod(m, void *), &llc, sizeof(struct llc)); } else { etype = htons(ETHERTYPE_ATALK); } pserialize_read_exit(s); KERNEL_UNLOCK_ONE(NULL); break; } #endif /* NETATALK */ case pseudo_AF_HDRCMPLT: hdrcmplt = 1; memcpy(esrc, ((const struct ether_header *)dst->sa_data)->ether_shost, sizeof(esrc)); /* FALLTHROUGH */ case AF_UNSPEC: memcpy(edst, ((const struct ether_header *)dst->sa_data)->ether_dhost, sizeof(edst)); /* AF_UNSPEC doesn't swap the byte order of the ether_type. */ etype = ((const struct ether_header *)dst->sa_data)->ether_type; break; default: printf("%s: can't handle af%d\n", ifp->if_xname, dst->sa_family); senderr(EAFNOSUPPORT); } #ifdef MPLS { struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_MPLS); if (mtag != NULL) { /* Having the tag itself indicates it's MPLS */ etype = htons(ETHERTYPE_MPLS); m_tag_delete(m, mtag); } } #endif if (mcopy) (void)looutput(ifp, mcopy, dst, rt); KASSERT((m->m_flags & M_PKTHDR) != 0); /* * If no ether type is set, this must be a 802.2 formatted packet. */ if (etype == 0) etype = htons(m->m_pkthdr.len); /* * Add local net header. If no space in first mbuf, allocate another. */ M_PREPEND(m, sizeof(struct ether_header), M_DONTWAIT); if (m == NULL) senderr(ENOBUFS); eh = mtod(m, struct ether_header *); /* Note: etype is already in network byte order. 
*/ memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type)); memcpy(eh->ether_dhost, edst, sizeof(edst)); if (hdrcmplt) { memcpy(eh->ether_shost, esrc, sizeof(eh->ether_shost)); } else { memcpy(eh->ether_shost, CLLADDR(ifp->if_sadl), sizeof(eh->ether_shost)); } #if NCARP > 0 if (ifp0 != ifp && ifp0->if_type == IFT_CARP) { memcpy(eh->ether_shost, CLLADDR(ifp0->if_sadl), sizeof(eh->ether_shost)); } #endif if ((error = pfil_run_hooks(ifp->if_pfil, &m, ifp, PFIL_OUT)) != 0) return error; if (m == NULL) return 0; #if NBRIDGE > 0 /* * Bridges require special output handling. */ if (ifp->if_bridge) return bridge_output(ifp, m, NULL, NULL); #endif #if NCARP > 0 if (ifp != ifp0) if_statadd(ifp0, if_obytes, m->m_pkthdr.len + ETHER_HDR_LEN); #endif #ifdef ALTQ KERNEL_LOCK(1, NULL); /* * If ALTQ is enabled on the parent interface, do * classification; the queueing discipline might not * require classification, but might require the * address family/header pointer in the pktattr. */ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_etherclassify(&ifp->if_snd, m); KERNEL_UNLOCK_ONE(NULL); #endif return ifq_enqueue(ifp, m); bad: if_statinc(ifp, if_oerrors); if (m) m_freem(m); return error; } #ifdef ALTQ /* * This routine is a slight hack to allow a packet to be classified * if the Ethernet headers are present. It will go away when ALTQ's * classification engine understands link headers. * * XXX: We may need to do m_pullups here. First to ensure struct ether_header * is indeed contiguous, then to read the LLC and so on. */ void altq_etherclassify(struct ifaltq *ifq, struct mbuf *m) { struct ether_header *eh; struct mbuf *mtop = m; uint16_t ether_type; int hlen, af, hdrsize; void *hdr; KASSERT((mtop->m_flags & M_PKTHDR) != 0); hlen = ETHER_HDR_LEN; eh = mtod(m, struct ether_header *); ether_type = htons(eh->ether_type); if (ether_type < ETHERMTU) { /* LLC/SNAP */ struct llc *llc = (struct llc *)(eh + 1); hlen += 8; if (m->m_len < hlen || llc->llc_dsap != LLC_SNAP_LSAP || llc->llc_ssap != LLC_SNAP_LSAP || llc->llc_control != LLC_UI) { /* Not SNAP. */ goto bad; } ether_type = htons(llc->llc_un.type_snap.ether_type); } switch (ether_type) { case ETHERTYPE_IP: af = AF_INET; hdrsize = 20; /* sizeof(struct ip) */ break; case ETHERTYPE_IPV6: af = AF_INET6; hdrsize = 40; /* sizeof(struct ip6_hdr) */ break; default: af = AF_UNSPEC; hdrsize = 0; break; } while (m->m_len <= hlen) { hlen -= m->m_len; m = m->m_next; if (m == NULL) goto bad; } if (m->m_len < (hlen + hdrsize)) { /* * protocol header not in a single mbuf. * We can't cope with this situation right * now (but it shouldn't ever happen, really, anyhow). 
*/ #ifdef DEBUG printf("altq_etherclassify: headers span multiple mbufs: " "%d < %d\n", m->m_len, (hlen + hdrsize)); #endif goto bad; } m->m_data += hlen; m->m_len -= hlen; hdr = mtod(m, void *); if (ALTQ_NEEDS_CLASSIFY(ifq)) { mtop->m_pkthdr.pattr_class = (*ifq->altq_classify)(ifq->altq_clfier, m, af); } mtop->m_pkthdr.pattr_af = af; mtop->m_pkthdr.pattr_hdr = hdr; m->m_data -= hlen; m->m_len += hlen; return; bad: mtop->m_pkthdr.pattr_class = NULL; mtop->m_pkthdr.pattr_hdr = NULL; mtop->m_pkthdr.pattr_af = AF_UNSPEC; } #endif /* ALTQ */ #if defined (LLC) || defined (NETATALK) static void ether_input_llc(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh) { pktqueue_t *pktq = NULL; struct llc *l; if (m->m_len < sizeof(*eh) + sizeof(struct llc)) goto error; l = (struct llc *)(eh+1); switch (l->llc_dsap) { #ifdef NETATALK case LLC_SNAP_LSAP: switch (l->llc_control) { case LLC_UI: if (l->llc_ssap != LLC_SNAP_LSAP) goto error; if (memcmp(&(l->llc_snap_org_code)[0], at_org_code, sizeof(at_org_code)) == 0 && ntohs(l->llc_snap_ether_type) == ETHERTYPE_ATALK) { pktq = at_pktq2; m_adj(m, sizeof(struct ether_header) + sizeof(struct llc)); break; } if (memcmp(&(l->llc_snap_org_code)[0], aarp_org_code, sizeof(aarp_org_code)) == 0 && ntohs(l->llc_snap_ether_type) == ETHERTYPE_AARP) { m_adj(m, sizeof(struct ether_header) + sizeof(struct llc)); aarpinput(ifp, m); /* XXX queue? */ return; } default: goto error; } break; #endif default: goto noproto; } KASSERT(pktq != NULL); if (__predict_false(!pktq_enqueue(pktq, m, 0))) { m_freem(m); } return; noproto: m_freem(m); if_statinc(ifp, if_noproto); return; error: m_freem(m); if_statinc(ifp, if_ierrors); return; } #endif /* defined (LLC) || defined (NETATALK) */ /* * Process a received Ethernet packet; * the packet is in the mbuf chain m with * the ether header. */ void ether_input(struct ifnet *ifp, struct mbuf *m) { #if NVLAN > 0 || defined(MBUFTRACE) struct ethercom *ec = (struct ethercom *) ifp; #endif pktqueue_t *pktq = NULL; uint16_t etype; struct ether_header *eh; size_t ehlen; static int earlypkts; /* No RPS for not-IP. */ pktq_rps_hash_func_t rps_hash = NULL; KASSERT(!cpu_intr_p()); KASSERT((m->m_flags & M_PKTHDR) != 0); if ((ifp->if_flags & IFF_UP) == 0) goto drop; #ifdef MBUFTRACE m_claimm(m, &ec->ec_rx_mowner); #endif if (__predict_false(m->m_len < sizeof(*eh))) { if ((m = m_pullup(m, sizeof(*eh))) == NULL) { if_statinc(ifp, if_ierrors); return; } } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); ehlen = sizeof(*eh); if (__predict_false(earlypkts < 100 || entropy_epoch() == (unsigned)-1)) { rnd_add_data(NULL, eh, ehlen, 0); earlypkts++; } /* * Determine if the packet is within its size limits. For MPLS the * header length is variable, so we skip the check. */ if (etype != ETHERTYPE_MPLS && m->m_pkthdr.len > ETHER_MAX_FRAME(ifp, etype, m->m_flags & M_HASFCS)) { #ifdef DIAGNOSTIC mutex_enter(&bigpktpps_lock); if (ppsratecheck(&bigpktppslim_last, &bigpktpps_count, bigpktppslim)) { printf("%s: discarding oversize frame (len=%d)\n", ifp->if_xname, m->m_pkthdr.len); } mutex_exit(&bigpktpps_lock); #endif goto error; } if (ETHER_IS_MULTICAST(eh->ether_dhost)) { /* * If this is not a simplex interface, drop the packet * if it came from us. 
*/ if ((ifp->if_flags & IFF_SIMPLEX) == 0 && memcmp(CLLADDR(ifp->if_sadl), eh->ether_shost, ETHER_ADDR_LEN) == 0) { goto drop; } if (memcmp(etherbroadcastaddr, eh->ether_dhost, ETHER_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_statinc(ifp, if_imcasts); } /* If the CRC is still on the packet, trim it off. */ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } if_statadd(ifp, if_ibytes, m->m_pkthdr.len); if (!vlan_has_tag(m) && etype == ETHERTYPE_VLAN) { m = ether_strip_vlantag(m); if (m == NULL) { if_statinc(ifp, if_ierrors); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); ehlen = sizeof(*eh); } if ((m->m_flags & (M_BCAST | M_MCAST | M_PROMISC)) == 0 && (ifp->if_flags & IFF_PROMISC) != 0 && memcmp(CLLADDR(ifp->if_sadl), eh->ether_dhost, ETHER_ADDR_LEN) != 0) { m->m_flags |= M_PROMISC; } if ((m->m_flags & M_PROMISC) == 0) { if (pfil_run_hooks(ifp->if_pfil, &m, ifp, PFIL_IN) != 0) return; if (m == NULL) return; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); } /* * Processing a logical interfaces that are able * to configure vlan(4). */ #if NAGR > 0 if (ifp->if_lagg != NULL && __predict_true(etype != ETHERTYPE_SLOWPROTOCOLS)) { m->m_flags &= ~M_PROMISC; agr_input(ifp, m); return; } #endif /* * VLAN processing. * * VLAN provides service delimiting so the frames are * processed before other handlings. If a VLAN interface * does not exist to take those frames, they're returned * to ether_input(). */ if (vlan_has_tag(m)) { if (EVL_VLANOFTAG(vlan_get_tag(m)) == 0) { if (etype == ETHERTYPE_VLAN || etype == ETHERTYPE_QINQ) goto drop; /* XXX we should actually use the prio value? */ m->m_flags &= ~M_VLANTAG; } else { #if NVLAN > 0 if (ec->ec_nvlans > 0) { m = vlan_input(ifp, m); /* vlan_input() called ether_input() recursively */ if (m == NULL) return; } #endif /* drop VLAN frames not for this port. */ goto noproto; } } #if NCARP > 0 if (__predict_false(ifp->if_carp && ifp->if_type != IFT_CARP)) { /* * Clear M_PROMISC, in case the packet comes from a * vlan. */ m->m_flags &= ~M_PROMISC; if (carp_input(m, (uint8_t *)&eh->ether_shost, (uint8_t *)&eh->ether_dhost, eh->ether_type) == 0) return; } #endif /* * Handle protocols that expect to have the Ethernet header * (and possibly FCS) intact. */ switch (etype) { #if NPPPOE > 0 case ETHERTYPE_PPPOEDISC: pppoedisc_input(ifp, m); return; case ETHERTYPE_PPPOE: pppoe_input(ifp, m); return; #endif case ETHERTYPE_SLOWPROTOCOLS: { uint8_t subtype; if (m->m_pkthdr.len < sizeof(*eh) + sizeof(subtype)) goto error; m_copydata(m, sizeof(*eh), sizeof(subtype), &subtype); switch (subtype) { #if NAGR > 0 case SLOWPROTOCOLS_SUBTYPE_LACP: if (ifp->if_lagg != NULL) { ieee8023ad_lacp_input(ifp, m); return; } break; case SLOWPROTOCOLS_SUBTYPE_MARKER: if (ifp->if_lagg != NULL) { ieee8023ad_marker_input(ifp, m); return; } break; #endif default: if (subtype == 0 || subtype > 10) { /* illegal value */ goto error; } /* unknown subtype */ break; } } /* FALLTHROUGH */ default: if (m->m_flags & M_PROMISC) goto drop; } /* If the CRC is still on the packet, trim it off. 
*/ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } /* etype represents the size of the payload in this case */ if (etype <= ETHERMTU + sizeof(struct ether_header)) { KASSERT(ehlen == sizeof(*eh)); #if defined (LLC) || defined (NETATALK) ether_input_llc(ifp, m, eh); return; #else /* ethertype of 0-1500 is regarded as noproto */ goto noproto; #endif } /* For ARP packets, store the source address so that * ARP DAD probes can be validated. */ if (etype == ETHERTYPE_ARP) { struct m_tag *mtag; mtag = m_tag_get(PACKET_TAG_ETHERNET_SRC, ETHER_ADDR_LEN, M_NOWAIT); if (mtag != NULL) { memcpy(mtag + 1, &eh->ether_shost, ETHER_ADDR_LEN); m_tag_prepend(m, mtag); } } /* Strip off the Ethernet header. */ m_adj(m, ehlen); switch (etype) { #ifdef INET case ETHERTYPE_IP: #ifdef GATEWAY if (ipflow_fastforward(m)) return; #endif pktq = ip_pktq; rps_hash = atomic_load_relaxed(&ether_pktq_rps_hash_p); break; case ETHERTYPE_ARP: pktq = arp_pktq; break; case ETHERTYPE_REVARP: revarpinput(m); /* XXX queue? */ return; #endif #ifdef INET6 case ETHERTYPE_IPV6: if (__predict_false(!in6_present)) goto noproto; #ifdef GATEWAY if (ip6flow_fastforward(&m)) return; #endif pktq = ip6_pktq; rps_hash = atomic_load_relaxed(&ether_pktq_rps_hash_p); break; #endif #ifdef NETATALK case ETHERTYPE_ATALK: pktq = at_pktq1; break; case ETHERTYPE_AARP: aarpinput(ifp, m); /* XXX queue? */ return; #endif #ifdef MPLS case ETHERTYPE_MPLS: pktq = mpls_pktq; break; #endif default: goto noproto; } KASSERT(pktq != NULL); const uint32_t h = rps_hash ? pktq_rps_hash(&rps_hash, m) : 0; if (__predict_false(!pktq_enqueue(pktq, m, h))) { m_freem(m); } return; drop: m_freem(m); if_statinc(ifp, if_iqdrops); return; noproto: m_freem(m); if_statinc(ifp, if_noproto); return; error: m_freem(m); if_statinc(ifp, if_ierrors); return; } static void ether_bpf_mtap(struct bpf_if *bp, struct mbuf *m, u_int direction) { struct ether_vlan_header evl; struct m_hdr mh, md; KASSERT(bp != NULL); if (!vlan_has_tag(m)) { bpf_mtap3(bp, m, direction); return; } memcpy(&evl, mtod(m, char *), ETHER_HDR_LEN); evl.evl_proto = evl.evl_encap_proto; evl.evl_encap_proto = htons(ETHERTYPE_VLAN); evl.evl_tag = htons(vlan_get_tag(m)); md.mh_flags = 0; md.mh_data = m->m_data + ETHER_HDR_LEN; md.mh_len = m->m_len - ETHER_HDR_LEN; md.mh_next = m->m_next; mh.mh_flags = 0; mh.mh_data = (char *)&evl; mh.mh_len = sizeof(evl); mh.mh_next = (struct mbuf *)&md; bpf_mtap3(bp, (struct mbuf *)&mh, direction); } /* * Convert Ethernet address to printable (loggable) representation. */ char * ether_sprintf(const u_char *ap) { static char etherbuf[3 * ETHER_ADDR_LEN]; return ether_snprintf(etherbuf, sizeof(etherbuf), ap); } char * ether_snprintf(char *buf, size_t len, const u_char *ap) { char *cp = buf; size_t i; for (i = 0; i < len / 3; i++) { *cp++ = hexdigits[*ap >> 4]; *cp++ = hexdigits[*ap++ & 0xf]; *cp++ = ':'; } *--cp = '\0'; return buf; } /* * Perform common duties while attaching to interface list */ void ether_ifattach(struct ifnet *ifp, const uint8_t *lla) { struct ethercom *ec = (struct ethercom *)ifp; char xnamebuf[HOOKNAMSIZ]; if (lla != NULL && ETHER_IS_MULTICAST(lla)) aprint_error("The multicast bit is set in the MAC address. 
" "It's wrong.\n"); ifp->if_type = IFT_ETHER; ifp->if_hdrlen = ETHER_HDR_LEN; ifp->if_dlt = DLT_EN10MB; ifp->if_mtu = ETHERMTU; ifp->if_output = ether_output; ifp->_if_input = ether_input; if (ec->ec_capabilities & ETHERCAP_VLAN_HWTAGGING) ifp->if_bpf_mtap = ether_bpf_mtap; if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ if (lla != NULL) if_set_sadl(ifp, lla, ETHER_ADDR_LEN, !ETHER_IS_LOCAL(lla)); LIST_INIT(&ec->ec_multiaddrs); SIMPLEQ_INIT(&ec->ec_vids); ec->ec_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET); ec->ec_flags = 0; ifp->if_broadcastaddr = etherbroadcastaddr; bpf_attach(ifp, DLT_EN10MB, sizeof(struct ether_header)); snprintf(xnamebuf, sizeof(xnamebuf), "%s-ether_ifdetachhooks", ifp->if_xname); ec->ec_ifdetach_hooks = simplehook_create(IPL_NET, xnamebuf); #ifdef MBUFTRACE mowner_init_owner(&ec->ec_tx_mowner, ifp->if_xname, "tx"); mowner_init_owner(&ec->ec_rx_mowner, ifp->if_xname, "rx"); MOWNER_ATTACH(&ec->ec_tx_mowner); MOWNER_ATTACH(&ec->ec_rx_mowner); ifp->if_mowner = &ec->ec_tx_mowner; #endif } void ether_ifdetach(struct ifnet *ifp) { struct ethercom *ec = (void *) ifp; struct ether_multi *enm; IFNET_ASSERT_UNLOCKED(ifp); /* * Prevent further calls to ioctl (for example turning off * promiscuous mode from the bridge code), which eventually can * call if_init() which can cause panics because the interface * is in the process of being detached. Return device not configured * instead. */ ifp->if_ioctl = __FPTRCAST(int (*)(struct ifnet *, u_long, void *), enxio); simplehook_dohooks(ec->ec_ifdetach_hooks); KASSERT(!simplehook_has_hooks(ec->ec_ifdetach_hooks)); simplehook_destroy(ec->ec_ifdetach_hooks); bpf_detach(ifp); ETHER_LOCK(ec); KASSERT(ec->ec_nvlans == 0); while ((enm = LIST_FIRST(&ec->ec_multiaddrs)) != NULL) { LIST_REMOVE(enm, enm_list); kmem_free(enm, sizeof(*enm)); ec->ec_multicnt--; } ETHER_UNLOCK(ec); mutex_obj_free(ec->ec_lock); ec->ec_lock = NULL; ifp->if_mowner = NULL; MOWNER_DETACH(&ec->ec_rx_mowner); MOWNER_DETACH(&ec->ec_tx_mowner); } void * ether_ifdetachhook_establish(struct ifnet *ifp, void (*fn)(void *), void *arg) { struct ethercom *ec; khook_t *hk; if (ifp->if_type != IFT_ETHER) return NULL; ec = (struct ethercom *)ifp; hk = simplehook_establish(ec->ec_ifdetach_hooks, fn, arg); return (void *)hk; } void ether_ifdetachhook_disestablish(struct ifnet *ifp, void *vhook, kmutex_t *lock) { struct ethercom *ec; if (vhook == NULL) return; ec = (struct ethercom *)ifp; simplehook_disestablish(ec->ec_ifdetach_hooks, vhook, lock); } #if 0 /* * This is for reference. We have a table-driven version * of the little-endian crc32 generator, which is faster * than the double-loop. */ uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { uint32_t c, crc, carry; size_t i, j; crc = 0xffffffffU; /* initial value */ for (i = 0; i < len; i++) { c = buf[i]; for (j = 0; j < 8; j++) { carry = ((crc & 0x01) ? 
1 : 0) ^ (c & 0x01); crc >>= 1; c >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_LE); } } return (crc); } #else uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; uint32_t crc; size_t i; crc = 0xffffffffU; /* initial value */ for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc); } #endif uint32_t ether_crc32_be(const uint8_t *buf, size_t len) { uint32_t c, crc, carry; size_t i, j; crc = 0xffffffffU; /* initial value */ for (i = 0; i < len; i++) { c = buf[i]; for (j = 0; j < 8; j++) { carry = ((crc & 0x80000000U) ? 1 : 0) ^ (c & 0x01); crc <<= 1; c >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_BE) | carry; } } return (crc); } #ifdef INET const uint8_t ether_ipmulticast_min[ETHER_ADDR_LEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; const uint8_t ether_ipmulticast_max[ETHER_ADDR_LEN] = { 0x01, 0x00, 0x5e, 0x7f, 0xff, 0xff }; #endif #ifdef INET6 const uint8_t ether_ip6multicast_min[ETHER_ADDR_LEN] = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; const uint8_t ether_ip6multicast_max[ETHER_ADDR_LEN] = { 0x33, 0x33, 0xff, 0xff, 0xff, 0xff }; #endif /* * ether_aton implementation, not using a static buffer. */ int ether_aton_r(u_char *dest, size_t len, const char *str) { const u_char *cp = (const void *)str; u_char *ep; #define atox(c) (((c) <= '9') ? ((c) - '0') : ((toupper(c) - 'A') + 10)) if (len < ETHER_ADDR_LEN) return ENOSPC; ep = dest + ETHER_ADDR_LEN; while (*cp) { if (!isxdigit(*cp)) return EINVAL; *dest = atox(*cp); cp++; if (isxdigit(*cp)) { *dest = (*dest << 4) | atox(*cp); cp++; } dest++; if (dest == ep) return (*cp == '\0') ? 0 : ENAMETOOLONG; switch (*cp) { case ':': case '-': case '.': cp++; break; } } return ENOBUFS; } /* * Convert a sockaddr into an Ethernet address or range of Ethernet * addresses. */ int ether_multiaddr(const struct sockaddr *sa, uint8_t addrlo[ETHER_ADDR_LEN], uint8_t addrhi[ETHER_ADDR_LEN]) { #ifdef INET const struct sockaddr_in *sin; #endif #ifdef INET6 const struct sockaddr_in6 *sin6; #endif switch (sa->sa_family) { case AF_UNSPEC: memcpy(addrlo, sa->sa_data, ETHER_ADDR_LEN); memcpy(addrhi, addrlo, ETHER_ADDR_LEN); break; #ifdef INET case AF_INET: sin = satocsin(sa); if (sin->sin_addr.s_addr == INADDR_ANY) { /* * An IP address of INADDR_ANY means listen to * or stop listening to all of the Ethernet * multicast addresses used for IP. * (This is for the sake of IP multicast routers.) */ memcpy(addrlo, ether_ipmulticast_min, ETHER_ADDR_LEN); memcpy(addrhi, ether_ipmulticast_max, ETHER_ADDR_LEN); } else { ETHER_MAP_IP_MULTICAST(&sin->sin_addr, addrlo); memcpy(addrhi, addrlo, ETHER_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: sin6 = satocsin6(sa); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to or stop * listening to all of the Ethernet multicast * address used for IP6. * (This is used for multicast routers.) */ memcpy(addrlo, ether_ip6multicast_min, ETHER_ADDR_LEN); memcpy(addrhi, ether_ip6multicast_max, ETHER_ADDR_LEN); } else { ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, addrlo); memcpy(addrhi, addrlo, ETHER_ADDR_LEN); } break; #endif default: return EAFNOSUPPORT; } return 0; } /* * Add an Ethernet multicast address or range of addresses to the list for a * given interface. 
*/ int ether_addmulti(const struct sockaddr *sa, struct ethercom *ec) { struct ether_multi *enm, *_enm; u_char addrlo[ETHER_ADDR_LEN]; u_char addrhi[ETHER_ADDR_LEN]; int error = 0; /* Allocate out of lock */ enm = kmem_alloc(sizeof(*enm), KM_SLEEP); ETHER_LOCK(ec); error = ether_multiaddr(sa, addrlo, addrhi); if (error != 0) goto out; /* * Verify that we have valid Ethernet multicast addresses. */ if (!ETHER_IS_MULTICAST(addrlo) || !ETHER_IS_MULTICAST(addrhi)) { error = EINVAL; goto out; } /* * See if the address range is already in the list. */ _enm = ether_lookup_multi(addrlo, addrhi, ec); if (_enm != NULL) { /* * Found it; just increment the reference count. */ ++_enm->enm_refcount; error = 0; goto out; } /* * Link a new multicast record into the interface's multicast list. */ memcpy(enm->enm_addrlo, addrlo, ETHER_ADDR_LEN); memcpy(enm->enm_addrhi, addrhi, ETHER_ADDR_LEN); enm->enm_refcount = 1; LIST_INSERT_HEAD(&ec->ec_multiaddrs, enm, enm_list); ec->ec_multicnt++; /* * Return ENETRESET to inform the driver that the list has changed * and its reception filter should be adjusted accordingly. */ error = ENETRESET; enm = NULL; out: ETHER_UNLOCK(ec); if (enm != NULL) kmem_free(enm, sizeof(*enm)); return error; } /* * Delete a multicast address record. */ int ether_delmulti(const struct sockaddr *sa, struct ethercom *ec) { struct ether_multi *enm; u_char addrlo[ETHER_ADDR_LEN]; u_char addrhi[ETHER_ADDR_LEN]; int error; ETHER_LOCK(ec); error = ether_multiaddr(sa, addrlo, addrhi); if (error != 0) goto error; /* * Look up the address in our list. */ enm = ether_lookup_multi(addrlo, addrhi, ec); if (enm == NULL) { error = ENXIO; goto error; } if (--enm->enm_refcount != 0) { /* * Still some claims to this record. */ error = 0; goto error; } /* * No remaining claims to this record; unlink and free it. */ LIST_REMOVE(enm, enm_list); ec->ec_multicnt--; ETHER_UNLOCK(ec); kmem_free(enm, sizeof(*enm)); /* * Return ENETRESET to inform the driver that the list has changed * and its reception filter should be adjusted accordingly. */ return ENETRESET; error: ETHER_UNLOCK(ec); return error; } void ether_set_ifflags_cb(struct ethercom *ec, ether_cb_t cb) { ec->ec_ifflags_cb = cb; } void ether_set_vlan_cb(struct ethercom *ec, ether_vlancb_t cb) { ec->ec_vlan_cb = cb; } static int ether_ioctl_reinit(struct ethercom *ec) { struct ifnet *ifp = &ec->ec_if; int error; KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname); switch (ifp->if_flags & (IFF_UP | IFF_RUNNING)) { case IFF_RUNNING: /* * If interface is marked down and it is running, * then stop and disable it. */ if_stop(ifp, 1); break; case IFF_UP: /* * If interface is marked up and it is stopped, then * start it. */ return if_init(ifp); case IFF_UP | IFF_RUNNING: error = 0; if (ec->ec_ifflags_cb != NULL) { error = (*ec->ec_ifflags_cb)(ec); if (error == ENETRESET) { /* * Reset the interface to pick up * changes in any other flags that * affect the hardware state. */ return if_init(ifp); } } else error = if_init(ifp); return error; case 0: break; } return 0; } /* * Common ioctls for Ethernet interfaces. Note, we must be * called at splnet(). 
*/ int ether_ioctl(struct ifnet *ifp, u_long cmd, void *data) { struct ethercom *ec = (void *)ifp; struct eccapreq *eccr; struct ifreq *ifr = (struct ifreq *)data; struct if_laddrreq *iflr = data; const struct sockaddr_dl *sdl; static const uint8_t zero[ETHER_ADDR_LEN]; int error; switch (cmd) { case SIOCINITIFADDR: { struct ifaddr *ifa = (struct ifaddr *)data; if (ifa->ifa_addr->sa_family != AF_LINK && (ifp->if_flags & (IFF_UP | IFF_RUNNING)) != (IFF_UP | IFF_RUNNING)) { ifp->if_flags |= IFF_UP; if ((error = if_init(ifp)) != 0) return error; } #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); #endif return 0; } case SIOCSIFMTU: { int maxmtu; if (ec->ec_capabilities & ETHERCAP_JUMBO_MTU) maxmtu = ETHERMTU_JUMBO; else maxmtu = ETHERMTU; if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > maxmtu) return EINVAL; else if ((error = ifioctl_common(ifp, cmd, data)) != ENETRESET) return error; else if (ifp->if_flags & IFF_UP) { /* Make sure the device notices the MTU change. */ return if_init(ifp); } else return 0; } case SIOCSIFFLAGS: if ((error = ifioctl_common(ifp, cmd, data)) != 0) return error; return ether_ioctl_reinit(ec); case SIOCGIFFLAGS: error = ifioctl_common(ifp, cmd, data); if (error == 0) { /* Set IFF_ALLMULTI for backcompat */ ifr->ifr_flags |= (ec->ec_flags & ETHER_F_ALLMULTI) ? IFF_ALLMULTI : 0; } return error; case SIOCGETHERCAP: eccr = (struct eccapreq *)data; eccr->eccr_capabilities = ec->ec_capabilities; eccr->eccr_capenable = ec->ec_capenable; return 0; case SIOCSETHERCAP: eccr = (struct eccapreq *)data; if ((eccr->eccr_capenable & ~ec->ec_capabilities) != 0) return EINVAL; if (eccr->eccr_capenable == ec->ec_capenable) return 0; #if 0 /* notyet */ ec->ec_capenable = (ec->ec_capenable & ETHERCAP_CANTCHANGE) | (eccr->eccr_capenable & ~ETHERCAP_CANTCHANGE); #else ec->ec_capenable = eccr->eccr_capenable; #endif return ether_ioctl_reinit(ec); case SIOCADDMULTI: return ether_addmulti(ifreq_getaddr(cmd, ifr), ec); case SIOCDELMULTI: return ether_delmulti(ifreq_getaddr(cmd, ifr), ec); case SIOCSIFMEDIA: case SIOCGIFMEDIA: if (ec->ec_mii != NULL) return ifmedia_ioctl(ifp, ifr, &ec->ec_mii->mii_media, cmd); else if (ec->ec_ifmedia != NULL) return ifmedia_ioctl(ifp, ifr, ec->ec_ifmedia, cmd); else return ENOTTY; break; case SIOCALIFADDR: sdl = satocsdl(sstocsa(&iflr->addr)); if (sdl->sdl_family != AF_LINK) ; else if (ETHER_IS_MULTICAST(CLLADDR(sdl))) return EINVAL; else if (memcmp(zero, CLLADDR(sdl), sizeof(zero)) == 0) return EINVAL; /*FALLTHROUGH*/ default: return ifioctl_common(ifp, cmd, data); } return 0; } /* * Enable/disable passing VLAN packets if the parent interface supports it. * Return: * 0: Ok * -1: Parent interface does not support vlans * >0: Error */ int ether_enable_vlan_mtu(struct ifnet *ifp) { int error; struct ethercom *ec = (void *)ifp; /* Parent does not support VLAN's */ if ((ec->ec_capabilities & ETHERCAP_VLAN_MTU) == 0) return -1; /* * Parent supports the VLAN_MTU capability, * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames; * enable it. */ ec->ec_capenable |= ETHERCAP_VLAN_MTU; /* Interface is down, defer for later */ if ((ifp->if_flags & IFF_UP) == 0) return 0; if ((error = if_flags_set(ifp, ifp->if_flags)) == 0) return 0; ec->ec_capenable &= ~ETHERCAP_VLAN_MTU; return error; } int ether_disable_vlan_mtu(struct ifnet *ifp) { int error; struct ethercom *ec = (void *)ifp; /* We still have VLAN's, defer for later */ if (ec->ec_nvlans != 0) return 0; /* Parent does not support VLAB's, nothing to do. 
*/ if ((ec->ec_capenable & ETHERCAP_VLAN_MTU) == 0) return -1; /* * Disable Tx/Rx of VLAN-sized frames. */ ec->ec_capenable &= ~ETHERCAP_VLAN_MTU; /* Interface is down, defer for later */ if ((ifp->if_flags & IFF_UP) == 0) return 0; if ((error = if_flags_set(ifp, ifp->if_flags)) == 0) return 0; ec->ec_capenable |= ETHERCAP_VLAN_MTU; return error; } /* * Add and delete VLAN TAG */ int ether_add_vlantag(struct ifnet *ifp, uint16_t vtag, bool *vlanmtu_status) { struct ethercom *ec = (void *)ifp; struct vlanid_list *vidp; bool vlanmtu_enabled; uint16_t vid = EVL_VLANOFTAG(vtag); int error; vlanmtu_enabled = false; /* Add a vid to the list */ vidp = kmem_alloc(sizeof(*vidp), KM_SLEEP); vidp->vid = vid; ETHER_LOCK(ec); ec->ec_nvlans++; SIMPLEQ_INSERT_TAIL(&ec->ec_vids, vidp, vid_list); ETHER_UNLOCK(ec); if (ec->ec_nvlans == 1) { IFNET_LOCK(ifp); error = ether_enable_vlan_mtu(ifp); IFNET_UNLOCK(ifp); if (error == 0) { vlanmtu_enabled = true; } else if (error != -1) { goto fail; } } if (ec->ec_vlan_cb != NULL) { error = (*ec->ec_vlan_cb)(ec, vid, true); if (error != 0) goto fail; } if (vlanmtu_status != NULL) *vlanmtu_status = vlanmtu_enabled; return 0; fail: ETHER_LOCK(ec); ec->ec_nvlans--; SIMPLEQ_REMOVE(&ec->ec_vids, vidp, vlanid_list, vid_list); ETHER_UNLOCK(ec); if (vlanmtu_enabled) { IFNET_LOCK(ifp); (void)ether_disable_vlan_mtu(ifp); IFNET_UNLOCK(ifp); } kmem_free(vidp, sizeof(*vidp)); return error; } int ether_del_vlantag(struct ifnet *ifp, uint16_t vtag) { struct ethercom *ec = (void *)ifp; struct vlanid_list *vidp; uint16_t vid = EVL_VLANOFTAG(vtag); ETHER_LOCK(ec); SIMPLEQ_FOREACH(vidp, &ec->ec_vids, vid_list) { if (vidp->vid == vid) { SIMPLEQ_REMOVE(&ec->ec_vids, vidp, vlanid_list, vid_list); ec->ec_nvlans--; break; } } ETHER_UNLOCK(ec); if (vidp == NULL) return ENOENT; if (ec->ec_vlan_cb != NULL) { (void)(*ec->ec_vlan_cb)(ec, vidp->vid, false); } if (ec->ec_nvlans == 0) { IFNET_LOCK(ifp); (void)ether_disable_vlan_mtu(ifp); IFNET_UNLOCK(ifp); } kmem_free(vidp, sizeof(*vidp)); return 0; } int ether_inject_vlantag(struct mbuf **mp, uint16_t etype, uint16_t tag) { static const size_t min_data_len = ETHER_MIN_LEN - ETHER_CRC_LEN + ETHER_VLAN_ENCAP_LEN; /* Used to pad ethernet frames with < ETHER_MIN_LEN bytes */ static const char vlan_zero_pad_buff[ETHER_MIN_LEN] = { 0 }; struct ether_vlan_header *evl; struct mbuf *m = *mp; int error; error = 0; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT); if (m == NULL) { error = ENOBUFS; goto out; } if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) { error = ENOBUFS; goto out; } } /* * Transform the Ethernet header into an * Ethernet header with 802.1Q encapsulation. */ memmove(mtod(m, void *), mtod(m, char *) + ETHER_VLAN_ENCAP_LEN, sizeof(struct ether_header)); evl = mtod(m, struct ether_vlan_header *); evl->evl_proto = evl->evl_encap_proto; evl->evl_encap_proto = htons(etype); evl->evl_tag = htons(tag); /* * To cater for VLAN-aware layer 2 ethernet * switches which may need to strip the tag * before forwarding the packet, make sure * the packet+tag is at least 68 bytes long. * This is necessary because our parent will * only pad to 64 bytes (ETHER_MIN_LEN) and * some switches will not pad by themselves * after deleting a tag. 
*/ if (m->m_pkthdr.len < min_data_len) { m_copyback(m, m->m_pkthdr.len, min_data_len - m->m_pkthdr.len, vlan_zero_pad_buff); } m->m_flags &= ~M_VLANTAG; out: *mp = m; return error; } struct mbuf * ether_strip_vlantag(struct mbuf *m) { struct ether_vlan_header *evl; if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { return NULL; } if (m_makewritable(&m, 0, sizeof(*evl), M_DONTWAIT)) { m_freem(m); return NULL; } evl = mtod(m, struct ether_vlan_header *); KASSERT(ntohs(evl->evl_encap_proto) == ETHERTYPE_VLAN); vlan_set_tag(m, ntohs(evl->evl_tag)); /* * Restore the original ethertype. We'll remove * the encapsulation after we've found the vlan * interface corresponding to the tag. */ evl->evl_encap_proto = evl->evl_proto; /* * Remove the encapsulation header and append tag. * The original header has already been fixed up above. */ vlan_set_tag(m, ntohs(evl->evl_tag)); memmove((char *)evl + ETHER_VLAN_ENCAP_LEN, evl, offsetof(struct ether_vlan_header, evl_encap_proto)); m_adj(m, ETHER_VLAN_ENCAP_LEN); return m; } static int ether_multicast_sysctl(SYSCTLFN_ARGS) { struct ether_multi *enm; struct ifnet *ifp; struct ethercom *ec; int error = 0; size_t written; struct psref psref; int bound; unsigned int multicnt; struct ether_multi_sysctl *addrs; int i; if (namelen != 1) return EINVAL; bound = curlwp_bind(); ifp = if_get_byindex(name[0], &psref); if (ifp == NULL) { error = ENODEV; goto out; } if (ifp->if_type != IFT_ETHER) { if_put(ifp, &psref); *oldlenp = 0; goto out; } ec = (struct ethercom *)ifp; if (oldp == NULL) { if_put(ifp, &psref); *oldlenp = ec->ec_multicnt * sizeof(*addrs); goto out; } /* * ec->ec_lock is a spin mutex so we cannot call sysctl_copyout, which * is sleepable, while holding it. Copy data to a local buffer first * with the lock taken and then call sysctl_copyout without holding it. 
*/ retry: multicnt = ec->ec_multicnt; if (multicnt == 0) { if_put(ifp, &psref); *oldlenp = 0; goto out; } addrs = kmem_zalloc(sizeof(*addrs) * multicnt, KM_SLEEP); ETHER_LOCK(ec); if (multicnt != ec->ec_multicnt) { /* The number of multicast addresses has changed */ ETHER_UNLOCK(ec); kmem_free(addrs, sizeof(*addrs) * multicnt); goto retry; } i = 0; LIST_FOREACH(enm, &ec->ec_multiaddrs, enm_list) { struct ether_multi_sysctl *addr = &addrs[i]; addr->enm_refcount = enm->enm_refcount; memcpy(addr->enm_addrlo, enm->enm_addrlo, ETHER_ADDR_LEN); memcpy(addr->enm_addrhi, enm->enm_addrhi, ETHER_ADDR_LEN); i++; } ETHER_UNLOCK(ec); error = 0; written = 0; for (i = 0; i < multicnt; i++) { struct ether_multi_sysctl *addr = &addrs[i]; if (written + sizeof(*addr) > *oldlenp) break; error = sysctl_copyout(l, addr, oldp, sizeof(*addr)); if (error) break; written += sizeof(*addr); oldp = (char *)oldp + sizeof(*addr); } kmem_free(addrs, sizeof(*addrs) * multicnt); if_put(ifp, &psref); *oldlenp = written; out: curlwp_bindx(bound); return error; } static void ether_sysctl_setup(struct sysctllog **clog) { const struct sysctlnode *rnode = NULL; sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ether", SYSCTL_DESCR("Ethernet-specific information"), NULL, 0, NULL, 0, CTL_NET, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "multicast", SYSCTL_DESCR("multicast addresses"), ether_multicast_sysctl, 0, NULL, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &rnode, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_STRING, "rps_hash", SYSCTL_DESCR("Interface rps hash function control"), sysctl_pktq_rps_hash_handler, 0, (void *)&ether_pktq_rps_hash_p, PKTQ_RPS_HASH_NAME_LEN, CTL_CREATE, CTL_EOL); } void etherinit(void) { #ifdef DIAGNOSTIC mutex_init(&bigpktpps_lock, MUTEX_DEFAULT, IPL_NET); #endif ether_pktq_rps_hash_p = pktq_rps_hash_default; ether_sysctl_setup(NULL); }
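/*
 * A minimal userland sketch of the table-driven little-endian CRC-32
 * implemented above by ether_crc32_le().  Drivers commonly feed an
 * Ethernet multicast address through this CRC and use a few of the
 * resulting bits to index a hardware hash filter; which bits are used
 * is device-specific.  The names crc32_le_sketch() and main(), and the
 * "low 6 bits" hash choice, are illustrative assumptions and are not
 * part of if_ethersubr.c.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
crc32_le_sketch(const uint8_t *buf, size_t len)
{
	/* Same 4-bit lookup table as the kernel's ether_crc32_le(). */
	static const uint32_t crctab[] = {
		0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
		0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
		0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
		0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
	};
	uint32_t crc = 0xffffffffU;	/* initial value */
	size_t i;

	for (i = 0; i < len; i++) {
		crc ^= buf[i];
		crc = (crc >> 4) ^ crctab[crc & 0xf];	/* low nibble */
		crc = (crc >> 4) ^ crctab[crc & 0xf];	/* high nibble */
	}
	return crc;
}

int
main(void)
{
	/* The slow-protocols multicast group defined above. */
	const uint8_t addr[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x02 };
	uint32_t crc = crc32_le_sketch(addr, sizeof(addr));

	/* A hypothetical device indexing its filter with the low 6 bits. */
	printf("crc=0x%08lx hash=%lu\n",
	    (unsigned long)crc, (unsigned long)(crc & 0x3f));
	return 0;
}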
/* $NetBSD: socketvar.h,v 1.168 2024/02/03 19:05:14 jdolecek Exp $ */ /*- * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 */ #ifndef _SYS_SOCKETVAR_H_ #define _SYS_SOCKETVAR_H_ #include <sys/select.h> #include <sys/selinfo.h> /* for struct selinfo */ #include <sys/queue.h> #include <sys/mutex.h> #include <sys/condvar.h> #if !defined(_KERNEL) struct uio; struct lwp; struct uidinfo; #else #include <sys/atomic.h> #include <sys/uidinfo.h> #endif TAILQ_HEAD(soqhead, socket); /* * Variables for socket buffering. 
*/ struct sockbuf { struct selinfo sb_sel; /* process selecting read/write */ struct mowner *sb_mowner; /* who owns data for this sockbuf */ struct socket *sb_so; /* back pointer to socket */ kcondvar_t sb_cv; /* notifier */ /* When re-zeroing this struct, we zero from sb_startzero to the end */ #define sb_startzero sb_cc u_long sb_cc; /* actual chars in buffer */ u_long sb_hiwat; /* max actual char count */ u_long sb_mbcnt; /* chars of mbufs used */ u_long sb_mbmax; /* max chars of mbufs to use */ u_long sb_lowat; /* low water mark */ struct mbuf *sb_mb; /* the mbuf chain */ struct mbuf *sb_mbtail; /* the last mbuf in the chain */ struct mbuf *sb_lastrecord; /* first mbuf of last record in socket buffer */ int sb_flags; /* flags, see below */ int sb_timeo; /* timeout for read/write */ u_long sb_overflowed; /* # of drops due to full buffer */ }; #ifndef SB_MAX #define SB_MAX (256*1024) /* default for max chars in sockbuf */ #endif #define SB_LOCK 0x01 /* lock on data queue */ #define SB_NOTIFY 0x04 /* someone is waiting for data/space */ #define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ #define SB_UPCALL 0x20 /* someone wants an upcall */ #define SB_NOINTR 0x40 /* operations not interruptible */ #define SB_KNOTE 0x100 /* kernel note attached */ #define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ /* * Kernel structure per socket. * Contains send and receive buffer queues, * handle on protocol and pointer to protocol * private data and error information. */ struct so_accf { struct accept_filter *so_accept_filter; void *so_accept_filter_arg; /* saved filter args */ char *so_accept_filter_str; /* saved user args */ }; struct sockaddr; struct socket { kmutex_t * volatile so_lock; /* pointer to lock on structure */ kcondvar_t so_cv; /* notifier */ short so_type; /* generic type, see socket.h */ short so_options; /* from socket call, see socket.h */ u_short so_linger; /* time to linger while closing */ short so_state; /* internal state flags SS_*, below */ int so_unused; /* used to be so_nbio */ void *so_pcb; /* protocol control block */ const struct protosw *so_proto; /* protocol handle */ /* * Variables for connection queueing. * Socket where accepts occur is so_head in all subsidiary sockets. * If so_head is 0, socket is not related to an accept. * For head socket so_q0 queues partially completed connections, * while so_q is a queue of connections ready to be accepted. * If a connection is aborted and it has so_head set, then * it has to be pulled out of either so_q0 or so_q. * We allow connections to queue up based on current queue lengths * and limit on number of queued connections for this socket. 
*/ struct socket *so_head; /* back pointer to accept socket */ struct soqhead *so_onq; /* queue (q or q0) that we're on */ struct soqhead so_q0; /* queue of partial connections */ struct soqhead so_q; /* queue of incoming connections */ TAILQ_ENTRY(socket) so_qe; /* our queue entry (q or q0) */ short so_q0len; /* partials on so_q0 */ short so_qlen; /* number of connections on so_q */ short so_qlimit; /* max number queued connections */ short so_timeo; /* connection timeout */ u_short so_error; /* error affecting connection */ u_short so_rerror; /* error affecting receiving */ u_short so_aborting; /* references from soabort() */ pid_t so_pgid; /* pgid for signals */ u_long so_oobmark; /* chars to oob mark */ struct sockbuf so_snd; /* send buffer */ struct sockbuf so_rcv; /* receive buffer */ void *so_internal; /* Space for svr4 stream data */ void (*so_upcall) (struct socket *, void *, int, int); void * so_upcallarg; /* Arg for above */ int (*so_send) (struct socket *, struct sockaddr *, struct uio *, struct mbuf *, struct mbuf *, int, struct lwp *); int (*so_receive) (struct socket *, struct mbuf **, struct uio *, struct mbuf **, struct mbuf **, int *); struct mowner *so_mowner; /* who owns mbufs for this socket */ struct uidinfo *so_uidinfo; /* who opened the socket */ gid_t so_egid; /* creator effective gid */ pid_t so_cpid; /* creator pid */ struct so_accf *so_accf; kauth_cred_t so_cred; /* socket credentials */ }; /* * Socket state bits. */ #define SS_NOFDREF 0x001 /* no file table ref any more */ #define SS_ISCONNECTED 0x002 /* socket connected to a peer */ #define SS_ISCONNECTING 0x004 /* in process of connecting to peer */ #define SS_ISDISCONNECTING 0x008 /* in process of disconnecting */ #define SS_CANTSENDMORE 0x010 /* can't send more data to peer */ #define SS_CANTRCVMORE 0x020 /* can't receive more data from peer */ #define SS_RCVATMARK 0x040 /* at mark on input */ #define SS_ISABORTING 0x080 /* aborting fd references - close() */ #define SS_RESTARTSYS 0x100 /* restart blocked system calls */ #define SS_POLLRDBAND 0x200 /* poll should return POLLRDBAND */ #define SS_MORETOCOME 0x400 /* * hint from sosend to lower layer; * more data coming */ #define SS_ISDISCONNECTED 0x800 /* socket disconnected from peer */ #define SS_ISAPIPE 0x1000 /* socket is implementing a pipe */ #define SS_NBIO 0x2000 /* socket is in non blocking I/O */ #ifdef _KERNEL struct accept_filter { char accf_name[16]; void (*accf_callback) (struct socket *, void *, int, int); void * (*accf_create) (struct socket *, char *); void (*accf_destroy) (struct socket *); LIST_ENTRY(accept_filter) accf_next; u_int accf_refcnt; }; struct sockopt { int sopt_level; /* option level */ int sopt_name; /* option name */ size_t sopt_size; /* data length */ size_t sopt_retsize; /* returned data length */ void * sopt_data; /* data pointer */ uint8_t sopt_buf[sizeof(int)]; /* internal storage */ }; #define SB_EMPTY_FIXUP(sb) \ do { \ KASSERT(solocked((sb)->sb_so)); \ if ((sb)->sb_mb == NULL) { \ (sb)->sb_mbtail = NULL; \ (sb)->sb_lastrecord = NULL; \ } \ } while (/*CONSTCOND*/0) extern u_long sb_max; extern int somaxkva; extern int sock_loan_thresh; extern kmutex_t *softnet_lock; struct mbuf; struct lwp; struct msghdr; struct stat; struct knote; struct sockaddr_big; enum uio_seg; /* 0x400 is SO_OTIMESTAMP */ #define SOOPT_TIMESTAMP(o) ((o) & (SO_TIMESTAMP | 0x400)) /* * File operations on sockets. 
*/ int soo_read(file_t *, off_t *, struct uio *, kauth_cred_t, int); int soo_write(file_t *, off_t *, struct uio *, kauth_cred_t, int); int soo_fcntl(file_t *, u_int cmd, void *); int soo_ioctl(file_t *, u_long cmd, void *); int soo_poll(file_t *, int); int soo_kqfilter(file_t *, struct knote *); int soo_close(file_t *); int soo_stat(file_t *, struct stat *); void soo_restart(file_t *); void sbappend(struct sockbuf *, struct mbuf *); void sbappendstream(struct sockbuf *, struct mbuf *); int sbappendaddr(struct sockbuf *, const struct sockaddr *, struct mbuf *, struct mbuf *); int sbappendaddrchain(struct sockbuf *, const struct sockaddr *, struct mbuf *, int); int sbappendcontrol(struct sockbuf *, struct mbuf *, struct mbuf *); void sbappendrecord(struct sockbuf *, struct mbuf *); void sbcheck(struct sockbuf *); void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *); struct mbuf * sbcreatecontrol(void *, int, int, int); struct mbuf * sbcreatecontrol1(void **, int, int, int, int); struct mbuf ** sbsavetimestamp(int, struct mbuf **); void sbdrop(struct sockbuf *, int); void sbdroprecord(struct sockbuf *); void sbflush(struct sockbuf *); void sbinsertoob(struct sockbuf *, struct mbuf *); void sbrelease(struct sockbuf *, struct socket *); int sbreserve(struct sockbuf *, u_long, struct socket *); int sbwait(struct sockbuf *); int sb_max_set(u_long); void soinit(void); void soinit1(void); void soinit2(void); int soabort(struct socket *); int soaccept(struct socket *, struct sockaddr *); int sofamily(const struct socket *); int sobind(struct socket *, struct sockaddr *, struct lwp *); void socantrcvmore(struct socket *); void socantsendmore(struct socket *); void soroverflow(struct socket *); int soclose(struct socket *); int soconnect(struct socket *, struct sockaddr *, struct lwp *); int soconnect2(struct socket *, struct socket *); int socreate(int, struct socket **, int, int, struct lwp *, struct socket *); int fsocreate(int, struct socket **, int, int, int *, file_t **, struct socket *); int sodisconnect(struct socket *); void sofree(struct socket *); int sogetopt(struct socket *, struct sockopt *); void sohasoutofband(struct socket *); void soisconnected(struct socket *); void soisconnecting(struct socket *); void soisdisconnected(struct socket *); void soisdisconnecting(struct socket *); int solisten(struct socket *, int, struct lwp *); struct socket * sonewconn(struct socket *, bool); void soqinsque(struct socket *, struct socket *, int); bool soqremque(struct socket *, int); int soreceive(struct socket *, struct mbuf **, struct uio *, struct mbuf **, struct mbuf **, int *); int soreserve(struct socket *, u_long, u_long); void sorflush(struct socket *); int sosend(struct socket *, struct sockaddr *, struct uio *, struct mbuf *, struct mbuf *, int, struct lwp *); int sosetopt(struct socket *, struct sockopt *); int so_setsockopt(struct lwp *, struct socket *, int, int, const void *, size_t); int soshutdown(struct socket *, int); void sorestart(struct socket *); void sowakeup(struct socket *, struct sockbuf *, int); int sockargs(struct mbuf **, const void *, size_t, enum uio_seg, int); int sopoll(struct socket *, int); struct socket *soget(bool); void soput(struct socket *); bool solocked(const struct socket *); bool solocked2(const struct socket *, const struct socket *); int sblock(struct sockbuf *, int); void sbunlock(struct sockbuf *); int sowait(struct socket *, bool, int); void solockretry(struct socket *, kmutex_t *); void sosetlock(struct socket *); void solockreset(struct 
socket *, kmutex_t *); void sockopt_init(struct sockopt *, int, int, size_t); void sockopt_destroy(struct sockopt *); int sockopt_set(struct sockopt *, const void *, size_t); int sockopt_setint(struct sockopt *, int); int sockopt_get(const struct sockopt *, void *, size_t); int sockopt_getint(const struct sockopt *, int *); int sockopt_setmbuf(struct sockopt *, struct mbuf *); struct mbuf *sockopt_getmbuf(const struct sockopt *); int copyout_sockname(struct sockaddr *, unsigned int *, int, struct mbuf *); int copyout_sockname_sb(struct sockaddr *, unsigned int *, int , struct sockaddr_big *); int copyout_msg_control(struct lwp *, struct msghdr *, struct mbuf *); void free_control_mbuf(struct lwp *, struct mbuf *, struct mbuf *); int do_sys_getpeername(int, struct sockaddr *); int do_sys_getsockname(int, struct sockaddr *); int do_sys_sendmsg(struct lwp *, int, struct msghdr *, int, register_t *); int do_sys_sendmsg_so(struct lwp *, int, struct socket *, file_t *, struct msghdr *, int, register_t *); int do_sys_recvmsg(struct lwp *, int, struct msghdr *, struct mbuf **, struct mbuf **, register_t *); int do_sys_recvmsg_so(struct lwp *, int, struct socket *, struct msghdr *mp, struct mbuf **, struct mbuf **, register_t *); int do_sys_bind(struct lwp *, int, struct sockaddr *); int do_sys_connect(struct lwp *, int, struct sockaddr *); int do_sys_accept(struct lwp *, int, struct sockaddr *, register_t *, const sigset_t *, int, int); int do_sys_peeloff(struct socket *, void *); /* * Inline functions for sockets and socket buffering. */ #include <sys/protosw.h> #include <sys/mbuf.h> /* * Do we need to notify the other side when I/O is possible? */ static __inline int sb_notify(struct sockbuf *sb) { KASSERT(solocked(sb->sb_so)); return sb->sb_flags & (SB_NOTIFY | SB_ASYNC | SB_UPCALL | SB_KNOTE); } /* * How much space is there in a socket buffer (so->so_snd or so->so_rcv)? * Since the fields are unsigned, detect overflow and return 0. */ static __inline u_long sbspace(const struct sockbuf *sb) { KASSERT(solocked(sb->sb_so)); if (sb->sb_hiwat <= sb->sb_cc || sb->sb_mbmax <= sb->sb_mbcnt) return 0; return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt); } static __inline u_long sbspace_oob(const struct sockbuf *sb) { u_long hiwat = sb->sb_hiwat; if (hiwat < ULONG_MAX - 1024) hiwat += 1024; KASSERT(solocked(sb->sb_so)); if (hiwat <= sb->sb_cc || sb->sb_mbmax <= sb->sb_mbcnt) return 0; return lmin(hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt); } /* * How much socket buffer space has been used? */ static __inline u_long sbused(const struct sockbuf *sb) { KASSERT(solocked(sb->sb_so)); return sb->sb_cc; } /* do we have to send all at once on a socket? */ static __inline int sosendallatonce(const struct socket *so) { return so->so_proto->pr_flags & PR_ATOMIC; } /* can we read something from so? */ static __inline int soreadable(const struct socket *so) { KASSERT(solocked(so)); return so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || (so->so_state & SS_CANTRCVMORE) != 0 || so->so_qlen != 0 || so->so_error != 0 || so->so_rerror != 0; } /* can we write something to so? 
*/ static __inline int sowritable(const struct socket *so) { KASSERT(solocked(so)); return (sbspace(&so->so_snd) >= so->so_snd.sb_lowat && ((so->so_state & SS_ISCONNECTED) != 0 || (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) || (so->so_state & SS_CANTSENDMORE) != 0 || so->so_error != 0; } /* adjust counters in sb reflecting allocation of m */ static __inline void sballoc(struct sockbuf *sb, struct mbuf *m) { KASSERT(solocked(sb->sb_so)); sb->sb_cc += m->m_len; sb->sb_mbcnt += MSIZE; if (m->m_flags & M_EXT) sb->sb_mbcnt += m->m_ext.ext_size; } /* adjust counters in sb reflecting freeing of m */ static __inline void sbfree(struct sockbuf *sb, struct mbuf *m) { KASSERT(solocked(sb->sb_so)); sb->sb_cc -= m->m_len; sb->sb_mbcnt -= MSIZE; if (m->m_flags & M_EXT) sb->sb_mbcnt -= m->m_ext.ext_size; } static __inline void sorwakeup(struct socket *so) { KASSERT(solocked(so)); if (sb_notify(&so->so_rcv)) sowakeup(so, &so->so_rcv, POLL_IN); } static __inline void sowwakeup(struct socket *so) { KASSERT(solocked(so)); if (sb_notify(&so->so_snd)) sowakeup(so, &so->so_snd, POLL_OUT); } static __inline void solock(struct socket *so) { kmutex_t *lock; lock = atomic_load_consume(&so->so_lock); mutex_enter(lock); if (__predict_false(lock != atomic_load_relaxed(&so->so_lock))) solockretry(so, lock); } static __inline void sounlock(struct socket *so) { mutex_exit(so->so_lock); } #ifdef SOCKBUF_DEBUG /* * SBLASTRECORDCHK: check sb->sb_lastrecord is maintained correctly. * SBLASTMBUFCHK: check sb->sb_mbtail is maintained correctly. * * => panic if the socket buffer is inconsistent. * => 'where' is used for a panic message. */ void sblastrecordchk(struct sockbuf *, const char *); #define SBLASTRECORDCHK(sb, where) sblastrecordchk((sb), (where)) void sblastmbufchk(struct sockbuf *, const char *); #define SBLASTMBUFCHK(sb, where) sblastmbufchk((sb), (where)) #define SBCHECK(sb) sbcheck(sb) #else #define SBLASTRECORDCHK(sb, where) /* nothing */ #define SBLASTMBUFCHK(sb, where) /* nothing */ #define SBCHECK(sb) /* nothing */ #endif /* SOCKBUF_DEBUG */ /* sosend loan */ vaddr_t sokvaalloc(vaddr_t, vsize_t, struct socket *); void sokvafree(vaddr_t, vsize_t); void soloanfree(struct mbuf *, void *, size_t, void *); /* * Values for socket-buffer-append priority argument to sbappendaddrchain(). * The following flags are reserved for future implementation: * * SB_PRIO_NONE: honour normal socket-buffer limits. * * SB_PRIO_ONESHOT_OVERFLOW: if the socket has any space, * deliver the entire chain. Intended for large requests * that should be delivered in their entirety, or not at all. * * SB_PRIO_OVERDRAFT: allow a small (2*MLEN) overflow, over and * above normal socket limits. Intended for messages indicating * buffer overflow in earlier normal/lower-priority messages. * * SB_PRIO_BESTEFFORT: Ignore limits entirely. Intended only for * kernel-generated messages to specially-marked sockets which * require "reliable" delivery, and where the source socket/protocol * message generator enforces some hard limit (but possibly well * above kern.sbmax). It is entirely up to the in-kernel source to * avoid complete mbuf exhaustion or DoS scenarios. */ #define SB_PRIO_NONE 0 #define SB_PRIO_ONESHOT_OVERFLOW 1 #define SB_PRIO_OVERDRAFT 2 #define SB_PRIO_BESTEFFORT 3 /* * Accept filter functions (duh).
*/ int accept_filt_getopt(struct socket *, struct sockopt *); int accept_filt_setopt(struct socket *, const struct sockopt *); int accept_filt_clear(struct socket *); int accept_filt_add(struct accept_filter *); int accept_filt_del(struct accept_filter *); struct accept_filter *accept_filt_get(char *); #ifdef ACCEPT_FILTER_MOD #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_accf); #endif void accept_filter_init(void); #endif #ifdef DDB int sofindproc(struct socket *so, int all, void (*pr)(const char *, ...)); void socket_print(const char *modif, void (*pr)(const char *, ...)); #endif #endif /* _KERNEL */ #endif /* !_SYS_SOCKETVAR_H_ */
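/*
 * Illustrative sketch, not part of the original header: how the sockbuf
 * accounting helpers above are meant to pair up.  Under the socket lock,
 * a producer checks sbspace() before queueing data, accounts for the mbuf
 * with sballoc(), and wakes readers with sorwakeup().  The helper name
 * sb_try_append() and the omitted chain-linking step are assumptions made
 * for the example; real code goes through sbappend() and friends.
 */
#if 0	/* example only, relies on the declarations above */
static int
sb_try_append(struct socket *so, struct mbuf *m)
{
	struct sockbuf *sb = &so->so_rcv;

	KASSERT(solocked(so));
	if (sbspace(sb) < (u_long)m->m_len)
		return ENOBUFS;		/* would exceed sb_hiwat or sb_mbmax */
	sballoc(sb, m);			/* account m against sb_cc/sb_mbcnt */
	/* ... link m onto the mbuf chain of sb here ... */
	sorwakeup(so);			/* wake readers if SB_NOTIFY et al. */
	return 0;
}
#endif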
/* $NetBSD: ip6_var.h,v 1.94 2024/02/09 22:08:37 andvar Exp $ */ /* $KAME: ip6_var.h,v 1.33 2000/06/11 14:59:20 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_var.h 8.1 (Berkeley) 6/10/93 */ #ifndef _NETINET6_IP6_VAR_H_ #define _NETINET6_IP6_VAR_H_ #include <sys/types.h> #include <sys/queue.h> #include <sys/socketvar.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/ip6.h> struct ip6_moptions { if_index_t im6o_multicast_if_index; /* I/F for outgoing multicasts */ u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */ u_char im6o_multicast_loop; /* 1 >= hear sends if a member */ LIST_HEAD(, in6_multi_mship) im6o_memberships; }; /* * Control options for outgoing packets */ /* Routing header related info */ struct ip6po_rhinfo { struct ip6_rthdr *ip6po_rhi_rthdr; /* Routing header */ struct route ip6po_rhi_route; /* Route to the 1st hop */ }; #define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr #define ip6po_route ip6po_rhinfo.ip6po_rhi_route /* Nexthop related info */ struct ip6po_nhinfo { struct sockaddr *ip6po_nhi_nexthop; struct route ip6po_nhi_route; /* Route to the nexthop */ }; #define ip6po_nexthop ip6po_nhinfo.ip6po_nhi_nexthop #define ip6po_nextroute ip6po_nhinfo.ip6po_nhi_route struct ip6_pktopts { int ip6po_hlim; /* Hoplimit for outgoing packets */ struct in6_pktinfo *ip6po_pktinfo; /* Outgoing IF/address information */ struct ip6po_nhinfo ip6po_nhinfo; /* Next-hop address information */ struct ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */ struct ip6_dest *ip6po_dest1; /* Destination options header(1st part) */ struct ip6po_rhinfo ip6po_rhinfo; /* Routing header related info. */ struct ip6_dest *ip6po_dest2; /* Destination options header(2nd part) */ int ip6po_tclass; /* traffic class */ int ip6po_minmtu; /* fragment vs PMTU discovery policy */ #define IP6PO_MINMTU_MCASTONLY -1 /* default; send at min MTU for multicast*/ #define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */ #define IP6PO_MINMTU_ALL 1 /* always send at min MTU */ int ip6po_prefer_tempaddr; /* whether temporary addresses are * preferred as source address */ #define IP6PO_TEMPADDR_SYSTEM -1 /* follow the system default */ #define IP6PO_TEMPADDR_NOTPREFER 0 /* not prefer temporary address */ #define IP6PO_TEMPADDR_PREFER 1 /* prefer temporary address */ int ip6po_flags; #if 0 /* parameters in this block is obsolete. do not reuse the values. */ #define IP6PO_REACHCONF 0x01 /* upper-layer reachability confirmation. 
*/ #define IP6PO_MINMTU 0x02 /* use minimum MTU (IPV6_USE_MIN_MTU) */ #endif #define IP6PO_DONTFRAG 0x04 /* disable fragmentation (IPV6_DONTFRAG) */ }; /* * IPv6 statistics. * Each counter is an unsigned 64-bit value. */ #define IP6_STAT_TOTAL 0 /* total packets received */ #define IP6_STAT_TOOSHORT 1 /* packet too short */ #define IP6_STAT_TOOSMALL 2 /* not enough data */ #define IP6_STAT_FRAGMENTS 3 /* fragments received */ #define IP6_STAT_FRAGDROPPED 4 /* frags dropped (dups, out of space) */ #define IP6_STAT_FRAGTIMEOUT 5 /* fragments timed out */ #define IP6_STAT_FRAGOVERFLOW 6 /* fragments that exceed limit */ #define IP6_STAT_FORWARD 7 /* packets forwarded */ #define IP6_STAT_CANTFORWARD 8 /* packets rcvd for uncreachable dst */ #define IP6_STAT_REDIRECTSENT 9 /* packets forwarded on same net */ #define IP6_STAT_DELIVERED 10 /* datagrams delivered to upper level */ #define IP6_STAT_LOCALOUT 11 /* total IP packets generated here */ #define IP6_STAT_ODROPPED 12 /* lost packets due to nobufs, etc. */ #define IP6_STAT_REASSEMBLED 13 /* total packets reassembled ok */ #define IP6_STAT_FRAGMENTED 14 /* datagrams successfully fragmented */ #define IP6_STAT_OFRAGMENTS 15 /* output fragments created */ #define IP6_STAT_CANTFRAG 16 /* don't fragment flag was set, etc. */ #define IP6_STAT_BADOPTIONS 17 /* error in option processing */ #define IP6_STAT_NOROUTE 18 /* packets discarded due to no route */ #define IP6_STAT_BADVERS 19 /* ip6 version != 6 */ #define IP6_STAT_RAWOUT 20 /* total raw ip packets generated */ #define IP6_STAT_BADSCOPE 21 /* scope error */ #define IP6_STAT_NOTMEMBER 22 /* don't join this multicast group */ #define IP6_STAT_NXTHIST 23 /* next header histogram */ /* space for 256 counters */ #define IP6_STAT_M1 279 /* one mbuf */ #define IP6_STAT_M2M 280 /* two or more mbuf */ /* space for 32 counters */ #define IP6_STAT_MEXT1 312 /* one ext mbuf */ #define IP6_STAT_MEXT2M 313 /* two or more ext mbuf */ #define IP6_STAT_EXTHDRTOOLONG 314 /* ext hdr are not contiguous */ #define IP6_STAT_NOGIF 315 /* no match gif found */ #define IP6_STAT_TOOMANYHDR 316 /* discarded due to too many headers */ /* * statistics for improvement of the source address selection * algorithm: * XXX: hardcoded 16 = # of ip6 multicast scope types + 1 */ #define IP6_STAT_SOURCES_NONE 317 /* number of times that address selection fails */ #define IP6_STAT_SOURCES_SAMEIF 318 /* number of times that an address on the outgoing I/F is chosen */ /* space for 16 counters */ #define IP6_STAT_SOURCES_OTHERIF 334 /* number of times that an address on a non-outgoing I/F is chosen */ /* space for 16 counters */ #define IP6_STAT_SOURCES_SAMESCOPE 350 /* number of times that an address that has the same scope from the dest. is chosen */ /* space for 16 counters */ #define IP6_STAT_SOURCES_OTHERSCOPE 366 /* number of times that an address that has a different scope from the dest. 
is chosen */ /* space for 16 counters */ #define IP6_STAT_SOURCES_DEPRECATED 382 /* number of times that a deprecated address is chosen */ /* space for 16 counters */ #define IP6_STAT_FORWARD_CACHEHIT 398 #define IP6_STAT_FORWARD_CACHEMISS 399 #define IP6_STAT_FASTFORWARD 400 /* packets fast forwarded */ #define IP6_STAT_FASTFORWARDFLOWS 401 /* number of fast forward flows */ #define IP6_STAT_NOIPSEC 402 /* no match ipsec(4) found */ #define IP6_STAT_PFILDROP_IN 403 /* dropped by pfil (PFIL_IN) */ #define IP6_STAT_PFILDROP_OUT 404 /* dropped by pfil (PFIL_OUT) */ #define IP6_STAT_IPSECDROP_IN 405 /* dropped by IPsec SP check */ #define IP6_STAT_IPSECDROP_OUT 406 /* dropped by IPsec SP check */ #define IP6_STAT_IFDROP 407 /* dropped due to interface state */ #define IP6_STAT_IDROPPED 408 /* lost packets due to nobufs, etc. */ #define IP6_STAT_TIMXCEED 409 /* hop limit exceeded */ #define IP6_STAT_TOOBIG 410 /* packet bigger than MTU */ #define IP6_STAT_RTREJECT 411 /* rejected by route */ #define IP6_NSTATS 412 #define IP6FLOW_HASHBITS 6 /* should not be a multiple of 8 */ /* * Structure for an IPv6 flow (ip6_fastforward). */ struct ip6flow { TAILQ_ENTRY(ip6flow) ip6f_list; /* next in active list */ TAILQ_ENTRY(ip6flow) ip6f_hash; /* next ip6flow in bucket */ size_t ip6f_hashidx; /* own hash index of ipflowtable[] */ struct in6_addr ip6f_dst; /* destination address */ struct in6_addr ip6f_src; /* source address */ struct route ip6f_ro; /* associated route entry */ u_int32_t ip6f_flow; /* flow (tos) */ u_quad_t ip6f_uses; /* number of uses in this period */ u_quad_t ip6f_last_uses; /* number of uses in last period */ u_quad_t ip6f_dropped; /* ENOBUFS returned by if_output */ u_quad_t ip6f_forwarded; /* packets forwarded */ u_int ip6f_timer; /* lifetime timer */ }; #ifdef _KERNEL #include <sys/protosw.h> #include <sys/cprng.h> /* * Auxiliary attributes of incoming IPv6 packets, which is initialized when we * come into ip6_input(). * XXX do not make it a kitchen sink! */ struct ip6aux { /* ip6.ip6_dst */ struct in6_addr ip6a_src; uint32_t ip6a_scope_id; int ip6a_flags; }; /* flags passed to ip6_output as last parameter */ #define IPV6_UNSPECSRC 0x01 /* allow :: as the source address */ #define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */ #define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */ extern u_int32_t ip6_id; /* fragment identifier */ extern int ip6_defhlim; /* default hop limit */ extern int ip6_defmcasthlim; /* default multicast hop limit */ extern int ip6_forwarding; /* act as router? */ extern int ip6_sendredirect; /* send ICMPv6 redirect? */ extern int ip6_use_deprecated; /* allow deprecated addr as source */ extern int ip6_mcast_pmtu; /* enable pMTU discovery for multicast? */ extern int ip6_v6only; extern int ip6_neighborgcthresh; /* Threshold # of NDP entries for GC */ extern int ip6_maxdynroutes; /* Max # of routes created via redirect */ extern int ip6_param_rt_msg; /* How to send parameter changing rtm */ extern struct socket *ip6_mrouter; /* multicast routing daemon */ extern int ip6_sendredirects; /* send IP redirects when forwarding? 
*/ extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */ extern int ip6_keepfaith; /* Firewall Aided Internet Translator */ extern int ip6_log_interval; extern time_t ip6_log_time; extern int ip6_hdrnestlimit; /* upper limit of # of extension headers */ extern int ip6_dad_count; /* DupAddrDetectionTransmits */ extern int ip6_auto_flowlabel; extern int ip6_auto_linklocal; extern int ip6_anonportmin; /* minimum ephemeral port */ extern int ip6_anonportmax; /* maximum ephemeral port */ extern int ip6_lowportmin; /* minimum reserved port */ extern int ip6_lowportmax; /* maximum reserved port */ extern int ip6_prefer_tempaddr; /* whether to prefer temporary addresses in the source address selection */ extern int ip6_use_defzone; /* whether to use the default scope zone when unspecified */ #ifdef GATEWAY extern int ip6_maxflows; /* maximum amount of flows for ip6ff */ extern int ip6_hashsize; /* size of hash table */ #endif struct inpcb; extern const struct pr_usrreqs rip6_usrreqs; int icmp6_ctloutput(int, struct socket *, struct sockopt *); struct mbuf; void ip6_init(void); const struct ip6aux *ip6_getdstifaddr(struct mbuf *); void ip6_freepcbopts(struct ip6_pktopts *); void ip6_freemoptions(struct ip6_moptions *); int ip6_unknown_opt(u_int8_t *, struct mbuf *, int); int ip6_get_prevhdr(struct mbuf *, int); int ip6_nexthdr(struct mbuf *, int, int, int *); int ip6_lasthdr(struct mbuf *, int, int, int *); struct ip6_hdr; int ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *); int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); void ip6_savecontrol(struct inpcb *, struct mbuf **, struct ip6_hdr *, struct mbuf *); void ip6_notify_pmtu(struct inpcb *, const struct sockaddr_in6 *, u_int32_t *); int ip6_sysctl(int *, u_int, void *, size_t *, void *, size_t); void ip6_forward(struct mbuf *, int, struct ifnet *); void ip6_mloopback(struct ifnet *, struct mbuf *, const struct sockaddr_in6 *); int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route *, int, struct ip6_moptions *, struct inpcb *, struct ifnet **); int ip6_if_output(struct ifnet * const, struct ifnet * const, struct mbuf * const, const struct sockaddr_in6 * const, const struct rtentry *); int ip6_ctloutput(int, struct socket *, struct sockopt *); int ip6_raw_ctloutput(int, struct socket *, struct sockopt *); void ip6_initpktopts(struct ip6_pktopts *); int ip6_setpktopts(struct mbuf *, struct ip6_pktopts *, struct ip6_pktopts *, kauth_cred_t, int); void ip6_clearpktopts(struct ip6_pktopts *, int); struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int); int ip6_optlen(struct inpcb *); void ip6_statinc(u_int); int route6_input(struct mbuf **, int *, int); void frag6_init(void); int frag6_input(struct mbuf **, int *, int); int ip6_reass_packet(struct mbuf **, int); void frag6_slowtimo(void); void frag6_fasttimo(void); void frag6_drain(void); void frag6_drainstub(void); int ip6flow_init(int); void ip6flow_poolinit(void); struct ip6flow *ip6flow_reap(int); void ip6flow_create(struct route *, struct mbuf *); void ip6flow_slowtimo(void); int ip6flow_invalidate_all(int); void rip6_init(void); int rip6_input(struct mbuf **, int *, int); void *rip6_ctlinput(int, const struct sockaddr *, void *); int rip6_ctloutput(int, struct socket *, struct sockopt *); int rip6_output(struct mbuf *, struct socket *, struct sockaddr_in6 *, struct mbuf *); int rip6_attach(struct socket *, int); int rip6_usrreq(struct socket *, int, 
struct mbuf *, struct mbuf *, struct mbuf *, struct lwp *); int dest6_input(struct mbuf **, int *, int); int none_input(struct mbuf **, int *, int); struct route; int in6_selectsrc(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route *, struct in6_addr *, struct ifnet **, struct psref *, struct in6_addr *); int in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *, struct route **, struct rtentry **, bool); int ip6_get_membership(const struct sockopt *, struct ifnet **, struct psref *, void *, size_t); static __inline uint32_t ip6_randomid(void) { return cprng_fast32(); } static __inline uint32_t ip6_randomflowlabel(void) { return cprng_fast32() & 0xfffff; } static __inline bool ip6_dad_enabled(void) { return ip6_dad_count > 0; } #endif /* _KERNEL */ #endif /* !_NETINET6_IP6_VAR_H_ */
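/*
 * Illustrative sketch, not part of the original header: the 0xfffff mask
 * in ip6_randomflowlabel() above exists because the IPv6 flow label field
 * is only 20 bits wide.  The helper below is hypothetical and only shows
 * how such a label would be combined with the version and traffic class
 * bits that share the ip6_flow word of struct ip6_hdr.
 */
#if 0	/* example only */
static uint32_t
ip6_example_flowinfo(uint8_t tclass)
{
	uint32_t fl = ip6_randomflowlabel();	/* 20 random bits */

	/* version 6 in the top nibble, 8 bits of traffic class, then label */
	return htonl((6U << 28) | ((uint32_t)tclass << 20) | fl);
}
#endif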
/* $NetBSD: exec_aout.c,v 1.41 2019/11/20 19:37:53 pgoyette Exp $ */ /* * Copyright (c) 1993, 1994 Christopher G. Demetriou * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christopher G. Demetriou. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: exec_aout.c,v 1.41 2019/11/20 19:37:53 pgoyette Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/exec.h> #include <sys/exec_aout.h> #include <sys/resourcevar.h> #include <sys/module.h> #include <uvm/uvm_extern.h> MODULE(MODULE_CLASS_EXEC, exec_aout, NULL); static struct execsw exec_aout_execsw = { .es_hdrsz = sizeof(struct exec), .es_makecmds = exec_aout_makecmds, .u = { .elf_probe_func = NULL, }, .es_emul = &emul_netbsd, .es_prio = EXECSW_PRIO_ANY, .es_arglen = 0, .es_copyargs = copyargs, .es_setregs = NULL, .es_coredump = coredump_netbsd, .es_setup_stack = exec_setup_stack, }; static int exec_aout_modcmd(modcmd_t cmd, void *arg) { switch (cmd) { case MODULE_CMD_INIT: return exec_add(&exec_aout_execsw, 1); case MODULE_CMD_FINI: return exec_remove(&exec_aout_execsw, 1); default: return ENOTTY; } } /* * exec_aout_makecmds(): Check if it's an a.out-format executable. * * Given a lwp pointer and an exec package pointer, see if the referent * of the epp is in a.out format. First check 'standard' magic numbers for * this architecture. If that fails, try a CPU-dependent hook. * * This function, in the former case, or the hook, in the latter, is * responsible for creating a set of vmcmds which can be used to build * the process's vm space and inserting them into the exec package. */ int exec_aout_makecmds(struct lwp *l, struct exec_package *epp) { u_long midmag, magic; u_short mid; int error; struct exec *execp = epp->ep_hdr; if (epp->ep_hdrvalid < sizeof(struct exec)) return ENOEXEC; midmag = ntohl(execp->a_midmag); mid = (midmag >> 16) & 0x3ff; magic = midmag & 0xffff; midmag = mid << 16 | magic; switch (midmag) { case (MID_MACHINE << 16) | ZMAGIC: error = exec_aout_prep_zmagic(l, epp); break; case (MID_MACHINE << 16) | NMAGIC: error = exec_aout_prep_nmagic(l, epp); break; case (MID_MACHINE << 16) | OMAGIC: error = exec_aout_prep_omagic(l, epp); break; default: error = cpu_exec_aout_makecmds(l, epp); } if (error) kill_vmcmds(&epp->ep_vmcmds); else epp->ep_flags &= ~EXEC_TOPDOWN_VM; return error; } /* * exec_aout_prep_zmagic(): Prepare a 'native' ZMAGIC binary's exec package * * First, set of the various offsets/lengths in the exec package. * * Then, mark the text image busy (so it can be demand paged) or error * out if this is not possible. Finally, set up vmcmds for the * text, data, bss, and stack segments. 
*/ int exec_aout_prep_zmagic(struct lwp *l, struct exec_package *epp) { struct exec *execp = epp->ep_hdr; int error; epp->ep_taddr = AOUT_LDPGSZ; epp->ep_tsize = execp->a_text; epp->ep_daddr = epp->ep_taddr + execp->a_text; epp->ep_dsize = execp->a_data + execp->a_bss; epp->ep_entry = execp->a_entry; error = vn_marktext(epp->ep_vp); if (error) return (error); /* set up command for text segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, round_page(execp->a_text), epp->ep_taddr, epp->ep_vp, 0, VM_PROT_READ|VM_PROT_EXECUTE); /* set up command for data segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, round_page(execp->a_data), epp->ep_daddr, epp->ep_vp, execp->a_text, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* set up command for bss segment */ if (execp->a_bss > 0) NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss, epp->ep_daddr + execp->a_data, NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); return (*epp->ep_esch->es_setup_stack)(l, epp); } /* * exec_aout_prep_nmagic(): Prepare a 'native' NMAGIC binary's exec package */ int exec_aout_prep_nmagic(struct lwp *l, struct exec_package *epp) { struct exec *execp = epp->ep_hdr; long bsize, baddr; epp->ep_taddr = AOUT_LDPGSZ; epp->ep_tsize = execp->a_text; epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ); epp->ep_dsize = execp->a_data + execp->a_bss; epp->ep_entry = execp->a_entry; /* set up command for text segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text, epp->ep_taddr, epp->ep_vp, sizeof(struct exec), VM_PROT_READ|VM_PROT_EXECUTE); /* set up command for data segment */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data, epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* set up command for bss segment */ baddr = round_page(epp->ep_daddr + execp->a_data); bsize = epp->ep_daddr + epp->ep_dsize - baddr; if (bsize > 0) NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr, NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); return (*epp->ep_esch->es_setup_stack)(l, epp); } /* * exec_aout_prep_omagic(): Prepare a 'native' OMAGIC binary's exec package */ int exec_aout_prep_omagic(struct lwp *l, struct exec_package *epp) { struct exec *execp = epp->ep_hdr; long dsize, bsize, baddr; epp->ep_taddr = AOUT_LDPGSZ; epp->ep_tsize = execp->a_text; epp->ep_daddr = epp->ep_taddr + execp->a_text; epp->ep_dsize = execp->a_data + execp->a_bss; epp->ep_entry = execp->a_entry; /* set up command for text and data segments */ NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp, sizeof(struct exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* set up command for bss segment */ baddr = round_page(epp->ep_daddr + execp->a_data); bsize = epp->ep_daddr + epp->ep_dsize - baddr; if (bsize > 0) NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr, NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); /* * Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize); * obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are * computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize' * respectively to page boundaries. * Compensate `ep_dsize' for the amount of data covered by the last * text page. */ dsize = epp->ep_dsize + execp->a_text - round_page(execp->a_text); epp->ep_dsize = (dsize > 0) ? dsize : 0; return (*epp->ep_esch->es_setup_stack)(l, epp); }
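/*
 * Illustrative sketch, not part of the original file: the a_midmag
 * decoding done at the top of exec_aout_makecmds(), pulled out as a
 * standalone helper.  The struct and function names are hypothetical
 * and shown only to make the bit layout explicit.
 */
#if 0	/* example only */
struct aout_id {
	u_short	mid;	/* machine id, 10 bits */
	u_long	magic;	/* ZMAGIC, NMAGIC or OMAGIC, 16 bits */
};

static struct aout_id
aout_decode_midmag(uint32_t a_midmag)
{
	u_long midmag = ntohl(a_midmag);
	struct aout_id id;

	id.mid = (midmag >> 16) & 0x3ff;	/* middle 10 bits */
	id.magic = midmag & 0xffff;		/* low 16 bits */
	return id;
}
#endif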
/* $NetBSD: sco_upper.c,v 1.16 2014/08/05 07:55:32 rtr Exp $ */ /*- * Copyright (c) 2006 Itronix Inc. * All rights reserved. * * Written by Iain Hibbert for Itronix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of Itronix Inc. may not be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: sco_upper.c,v 1.16 2014/08/05 07:55:32 rtr Exp $"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/proc.h> #include <sys/socketvar.h> #include <sys/systm.h> #include <netbt/bluetooth.h> #include <netbt/hci.h> #include <netbt/sco.h> /**************************************************************************** * * SCO - Upper Protocol API */ struct sco_pcb_list sco_pcb = LIST_HEAD_INITIALIZER(sco_pcb); /* * sco_attach_pcb(handle, proto, upper) * * Attach a new instance of SCO pcb to handle */ int sco_attach_pcb(struct sco_pcb **handle, const struct btproto *proto, void *upper) { struct sco_pcb *pcb; KASSERT(handle != NULL); KASSERT(proto != NULL); KASSERT(upper != NULL); pcb = malloc(sizeof(struct sco_pcb), M_BLUETOOTH, M_NOWAIT | M_ZERO); if (pcb == NULL) return ENOMEM; pcb->sp_proto = proto; pcb->sp_upper = upper; LIST_INSERT_HEAD(&sco_pcb, pcb, sp_next); *handle = pcb; return 0; } /* * sco_bind_pcb(pcb, sockaddr) * * Bind SCO pcb to local address */ int sco_bind_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr) { if (pcb->sp_link != NULL || pcb->sp_flags & SP_LISTENING) return EINVAL; bdaddr_copy(&pcb->sp_laddr, &addr->bt_bdaddr); return 0; } /* * sco_sockaddr_pcb(pcb, sockaddr) * * Copy local address of PCB to sockaddr */ int sco_sockaddr_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr) { memset(addr, 0, sizeof(struct sockaddr_bt)); addr->bt_len = sizeof(struct sockaddr_bt); addr->bt_family = AF_BLUETOOTH; bdaddr_copy(&addr->bt_bdaddr, &pcb->sp_laddr); return 0; } /* * sco_connect_pcb(pcb, sockaddr) * * Initiate a SCO connection to the destination address. */ int sco_connect_pcb(struct sco_pcb *pcb, struct sockaddr_bt *dest) { hci_add_sco_con_cp cp; struct hci_unit *unit; struct hci_link *acl, *sco; int err; if (pcb->sp_flags & SP_LISTENING) return EINVAL; bdaddr_copy(&pcb->sp_raddr, &dest->bt_bdaddr); if (bdaddr_any(&pcb->sp_raddr)) return EDESTADDRREQ; if (bdaddr_any(&pcb->sp_laddr)) { err = hci_route_lookup(&pcb->sp_laddr, &pcb->sp_raddr); if (err) return err; } unit = hci_unit_lookup(&pcb->sp_laddr); if (unit == NULL) return ENETDOWN; /* * We must have an already open ACL connection before we open the SCO * connection, and since SCO connections dont happen on their own we * will not open one, the application wanting this should have opened * it previously. 
*/ acl = hci_link_lookup_bdaddr(unit, &pcb->sp_raddr, HCI_LINK_ACL); if (acl == NULL || acl->hl_state != HCI_LINK_OPEN) return EHOSTUNREACH; sco = hci_link_alloc(unit, &pcb->sp_raddr, HCI_LINK_SCO); if (sco == NULL) return ENOMEM; sco->hl_link = hci_acl_open(unit, &pcb->sp_raddr); KASSERT(sco->hl_link == acl); cp.con_handle = htole16(acl->hl_handle); cp.pkt_type = htole16(0x00e0); /* HV1, HV2, HV3 */ err = hci_send_cmd(unit, HCI_CMD_ADD_SCO_CON, &cp, sizeof(cp)); if (err) { hci_link_free(sco, err); return err; } sco->hl_sco = pcb; pcb->sp_link = sco; pcb->sp_mtu = unit->hci_max_sco_size; return 0; } /* * sco_peeraddr_pcb(pcb, sockaddr) * * Copy remote address of SCO pcb to sockaddr */ int sco_peeraddr_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr) { memset(addr, 0, sizeof(struct sockaddr_bt)); addr->bt_len = sizeof(struct sockaddr_bt); addr->bt_family = AF_BLUETOOTH; bdaddr_copy(&addr->bt_bdaddr, &pcb->sp_raddr); return 0; } /* * sco_disconnect_pcb(pcb, linger) * * Initiate disconnection of connected SCO pcb */ int sco_disconnect_pcb(struct sco_pcb *pcb, int linger) { hci_discon_cp cp; struct hci_link *sco; int err; sco = pcb->sp_link; if (sco == NULL) return EINVAL; cp.con_handle = htole16(sco->hl_handle); cp.reason = 0x13; /* "Remote User Terminated Connection" */ err = hci_send_cmd(sco->hl_unit, HCI_CMD_DISCONNECT, &cp, sizeof(cp)); if (err || linger == 0) { sco->hl_sco = NULL; pcb->sp_link = NULL; hci_link_free(sco, err); } return err; } /* * sco_detach_pcb(handle) * * Detach SCO pcb from handle and clear up */ void sco_detach_pcb(struct sco_pcb **handle) { struct sco_pcb *pcb; KASSERT(handle != NULL); pcb = *handle; *handle = NULL; if (pcb->sp_link != NULL) { sco_disconnect_pcb(pcb, 0); pcb->sp_link = NULL; } LIST_REMOVE(pcb, sp_next); free(pcb, M_BLUETOOTH); } /* * sco_listen_pcb(pcb) * * Mark pcb as a listener. */ int sco_listen_pcb(struct sco_pcb *pcb) { if (pcb->sp_link != NULL) return EINVAL; pcb->sp_flags |= SP_LISTENING; return 0; } /* * sco_send_pcb(pcb, mbuf) * * Send data on SCO pcb. * * Gross hackage, we just output the packet directly onto the unit queue. * This will work fine for one channel per unit, but for more channels it * really needs fixing. We set the context so that when the packet is sent, * we can drop a record from the socket buffer. */ int sco_send_pcb(struct sco_pcb *pcb, struct mbuf *m) { hci_scodata_hdr_t *hdr; int plen; if (pcb->sp_link == NULL) { m_freem(m); return EINVAL; } plen = m->m_pkthdr.len; DPRINTFN(10, "%d bytes\n", plen); /* * This is a temporary limitation, as USB devices cannot * handle SCO packet sizes that are not an integer number * of Isochronous frames. 
See ubt(4) */ if (plen != pcb->sp_mtu) { m_freem(m); return EMSGSIZE; } M_PREPEND(m, sizeof(hci_scodata_hdr_t), M_DONTWAIT); if (m == NULL) return ENOMEM; hdr = mtod(m, hci_scodata_hdr_t *); hdr->type = HCI_SCO_DATA_PKT; hdr->con_handle = htole16(pcb->sp_link->hl_handle); hdr->length = plen; pcb->sp_pending++; M_SETCTX(m, pcb->sp_link); hci_output_sco(pcb->sp_link->hl_unit, m); return 0; } /* * sco_setopt(pcb, sopt) * * Set SCO pcb options */ int sco_setopt(struct sco_pcb *pcb, const struct sockopt *sopt) { int err = 0; switch (sopt->sopt_name) { default: err = ENOPROTOOPT; break; } return err; } /* * sco_getopt(pcb, sopt) * * Get SCO pcb options */ int sco_getopt(struct sco_pcb *pcb, struct sockopt *sopt) { switch (sopt->sopt_name) { case SO_SCO_MTU: return sockopt_set(sopt, &pcb->sp_mtu, sizeof(uint16_t)); case SO_SCO_HANDLE: if (pcb->sp_link) return sockopt_set(sopt, &pcb->sp_link->hl_handle, sizeof(uint16_t)); return ENOTCONN; default: break; } return ENOPROTOOPT; }
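/*
 * Illustrative sketch, not part of the original file: sco_send_pcb()
 * above rejects any packet whose length differs from sp_mtu, so a caller
 * has to carve its payload into exactly sp_mtu sized mbufs first.  The
 * helper name and its handling of a trailing partial chunk are
 * assumptions made for the example.
 */
#if 0	/* example only */
static int
sco_send_example(struct sco_pcb *pcb, const uint8_t *buf, size_t len)
{
	struct mbuf *m;
	int err;

	while (len >= pcb->sp_mtu) {
		m = m_gethdr(M_WAIT, MT_DATA);
		m_copyback(m, 0, pcb->sp_mtu, buf);	/* copy one chunk */
		m->m_pkthdr.len = pcb->sp_mtu;
		err = sco_send_pcb(pcb, m);		/* consumes m */
		if (err)
			return err;
		buf += pcb->sp_mtu;
		len -= pcb->sp_mtu;
	}
	return len == 0 ? 0 : EMSGSIZE;	/* partial trailing chunk cannot be sent */
}
#endif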
/* $NetBSD: kern_sleepq.c,v 1.87 2023/11/02 10:31:55 martin Exp $ */ /*- * Copyright (c) 2006, 2007, 2008, 2009, 2019, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Sleep queue implementation, used by turnstiles and general sleep/wakeup * interfaces. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_sleepq.c,v 1.87 2023/11/02 10:31:55 martin Exp $"); #include <sys/param.h> #include <sys/cpu.h> #include <sys/intr.h> #include <sys/kernel.h> #include <sys/ktrace.h> #include <sys/pool.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/sched.h> #include <sys/sleepq.h> #include <sys/syncobj.h> #include <sys/systm.h> /* * for sleepq_abort: * During autoconfiguration or after a panic, a sleep will simply lower the * priority briefly to allow interrupts, then return. The priority to be * used (IPL_SAFEPRI) is machine-dependent, thus this value is initialized and * maintained in the machine-dependent layers. This priority will typically * be 0, or the lowest priority that is safe for use on the interrupt stack; * it can be made higher to block network software interrupts after panics. */ #ifndef IPL_SAFEPRI #define IPL_SAFEPRI 0 #endif static int sleepq_sigtoerror(lwp_t *, int); /* General purpose sleep table, used by mtsleep() and condition variables. */ sleeptab_t sleeptab __cacheline_aligned; sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE] __cacheline_aligned; /* * sleeptab_init: * * Initialize a sleep table. */ void sleeptab_init(sleeptab_t *st) { static bool again; int i; for (i = 0; i < SLEEPTAB_HASH_SIZE; i++) { if (!again) { mutex_init(&sleepq_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED); } sleepq_init(&st->st_queue[i]); } again = true; } /* * sleepq_init: * * Prepare a sleep queue for use. */ void sleepq_init(sleepq_t *sq) { LIST_INIT(sq); } /* * sleepq_remove: * * Remove an LWP from a sleep queue and wake it up. Distinguish * between deliberate wakeups (which carry valuable information) and * "unsleep" (an out-of-band action must be taken). * * For wakeup, convert any interruptible wait into a non-interruptible * one before waking the LWP. Otherwise, if only one LWP is awoken it * could fail to do something useful with the wakeup due to an error * return and the caller of e.g. cv_signal() may not expect this. */ void sleepq_remove(sleepq_t *sq, lwp_t *l, bool wakeup) { struct schedstate_percpu *spc; struct cpu_info *ci; KASSERT(lwp_locked(l, NULL)); if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_NULL) == 0) { KASSERT(sq != NULL); LIST_REMOVE(l, l_sleepchain); } else { KASSERT(sq == NULL); } l->l_syncobj = &sched_syncobj; l->l_wchan = NULL; l->l_sleepq = NULL; l->l_flag &= wakeup ? ~(LW_SINTR|LW_CATCHINTR|LW_STIMO) : ~LW_SINTR; ci = l->l_cpu; spc = &ci->ci_schedstate; /* * If not sleeping, the LWP must have been suspended. Let whoever * holds it stopped set it running again.
*/ if (l->l_stat != LSSLEEP) { KASSERT(l->l_stat == LSSTOP || l->l_stat == LSSUSPENDED); lwp_setlock(l, spc->spc_lwplock); return; } /* * If the LWP is still on the CPU, mark it as LSONPROC. It may be * about to call mi_switch(), in which case it will yield. */ if ((l->l_pflag & LP_RUNNING) != 0) { l->l_stat = LSONPROC; l->l_slptime = 0; lwp_setlock(l, spc->spc_lwplock); return; } /* Update sleep time delta, call the wake-up handler of scheduler */ l->l_slpticksum += (getticks() - l->l_slpticks); sched_wakeup(l); /* Look for a CPU to wake up */ l->l_cpu = sched_takecpu(l); ci = l->l_cpu; spc = &ci->ci_schedstate; /* * Set it running. */ spc_lock(ci); lwp_setlock(l, spc->spc_mutex); sched_setrunnable(l); l->l_stat = LSRUN; l->l_slptime = 0; sched_enqueue(l); sched_resched_lwp(l, true); /* LWP & SPC now unlocked, but we still hold sleep queue lock. */ } /* * sleepq_insert: * * Insert an LWP into the sleep queue, optionally sorting by priority. */ static void sleepq_insert(sleepq_t *sq, lwp_t *l, syncobj_t *sobj) { if ((sobj->sobj_flag & SOBJ_SLEEPQ_NULL) != 0) { KASSERT(sq == NULL); return; } KASSERT(sq != NULL); if ((sobj->sobj_flag & SOBJ_SLEEPQ_SORTED) != 0) { lwp_t *l2, *l_last = NULL; const pri_t pri = lwp_eprio(l); LIST_FOREACH(l2, sq, l_sleepchain) { l_last = l2; if (lwp_eprio(l2) < pri) { LIST_INSERT_BEFORE(l2, l, l_sleepchain); return; } } /* * Ensure FIFO ordering if no waiters are of lower priority. */ if (l_last != NULL) { LIST_INSERT_AFTER(l_last, l, l_sleepchain); return; } } LIST_INSERT_HEAD(sq, l, l_sleepchain); } /* * sleepq_enter: * * Prepare to block on a sleep queue, after which any interlock can be * safely released. */ int sleepq_enter(sleepq_t *sq, lwp_t *l, kmutex_t *mp) { int nlocks; KASSERT((sq != NULL) == (mp != NULL)); /* * Acquire the per-LWP mutex and lend it our sleep queue lock. * Once interlocked, we can release the kernel lock. */ lwp_lock(l); if (mp != NULL) { lwp_unlock_to(l, mp); } if (__predict_false((nlocks = l->l_blcnt) != 0)) { KERNEL_UNLOCK_ALL(NULL, NULL); } return nlocks; } /* * sleepq_enqueue: * * Enter an LWP into the sleep queue and prepare for sleep. The sleep * queue must already be locked, and any interlock (such as the kernel * lock) must have be released (see sleeptab_lookup(), sleepq_enter()). */ void sleepq_enqueue(sleepq_t *sq, wchan_t wchan, const char *wmesg, syncobj_t *sobj, bool catch_p) { lwp_t *l = curlwp; KASSERT(lwp_locked(l, NULL)); KASSERT(l->l_stat == LSONPROC); KASSERT(l->l_wchan == NULL); KASSERT(l->l_sleepq == NULL); KASSERT((l->l_flag & LW_SINTR) == 0); l->l_syncobj = sobj; l->l_wchan = wchan; l->l_sleepq = sq; l->l_wmesg = wmesg; l->l_slptime = 0; l->l_stat = LSSLEEP; if (catch_p) l->l_flag |= LW_SINTR; sleepq_insert(sq, l, sobj); /* Save the time when thread has slept */ l->l_slpticks = getticks(); sched_slept(l); } /* * sleepq_transfer: * * Move an LWP from one sleep queue to another. Both sleep queues * must already be locked. * * The LWP will be updated with the new sleepq, wchan, wmesg, * sobj, and mutex. The interruptible flag will also be updated. 
*/ void sleepq_transfer(lwp_t *l, sleepq_t *from_sq, sleepq_t *sq, wchan_t wchan, const char *wmesg, syncobj_t *sobj, kmutex_t *mp, bool catch_p) { KASSERT(l->l_sleepq == from_sq); LIST_REMOVE(l, l_sleepchain); l->l_syncobj = sobj; l->l_wchan = wchan; l->l_sleepq = sq; l->l_wmesg = wmesg; if (catch_p) l->l_flag = LW_SINTR | LW_CATCHINTR; else l->l_flag = ~(LW_SINTR | LW_CATCHINTR); /* * This allows the transfer from one sleepq to another where * it is known that they're both protected by the same lock. */ if (mp != NULL) lwp_setlock(l, mp); sleepq_insert(sq, l, sobj); } /* * sleepq_uncatch: * * Mark the LWP as no longer sleeping interruptibly. */ void sleepq_uncatch(lwp_t *l) { l->l_flag &= ~(LW_SINTR | LW_CATCHINTR | LW_STIMO); } /* * sleepq_block: * * After any intermediate step such as releasing an interlock, switch. * sleepq_block() may return early under exceptional conditions, for * example if the LWP's containing process is exiting. * * timo is a timeout in ticks. timo = 0 specifies an infinite timeout. */ int sleepq_block(int timo, bool catch_p, syncobj_t *syncobj, int nlocks) { const int mask = LW_CANCELLED|LW_WEXIT|LW_WCORE|LW_PENDSIG; int error = 0, sig, flag; struct proc *p; lwp_t *l = curlwp; bool early = false; ktrcsw(1, 0, syncobj); /* * If sleeping interruptably, check for pending signals, exits or * core dump events. * * Note the usage of LW_CATCHINTR. This expresses our intent * to catch or not catch sleep interruptions, which might change * while we are sleeping. It is independent from LW_SINTR because * we don't want to leave LW_SINTR set when the LWP is not asleep. */ if (catch_p) { if ((l->l_flag & (LW_CANCELLED|LW_WEXIT|LW_WCORE)) != 0) { l->l_flag &= ~LW_CANCELLED; error = EINTR; early = true; } else if ((l->l_flag & LW_PENDSIG) != 0 && sigispending(l, 0)) early = true; l->l_flag |= LW_CATCHINTR; } else l->l_flag &= ~LW_CATCHINTR; if (early) { /* lwp_unsleep() will release the lock */ lwp_unsleep(l, true); } else { /* * The LWP may have already been awoken if the caller * dropped the sleep queue lock between sleepq_enqueue() and * sleepq_block(). If that happens l_stat will be LSONPROC * and mi_switch() will treat this as a preemption. No need * to do anything special here. */ if (timo) { l->l_flag &= ~LW_STIMO; callout_schedule(&l->l_timeout_ch, timo); } l->l_boostpri = l->l_syncobj->sobj_boostpri; spc_lock(l->l_cpu); mi_switch(l); /* The LWP and sleep queue are now unlocked. */ if (timo) { /* * Even if the callout appears to have fired, we * need to stop it in order to synchronise with * other CPUs. It's important that we do this in * this LWP's context, and not during wakeup, in * order to keep the callout & its cache lines * co-located on the CPU with the LWP. */ (void)callout_halt(&l->l_timeout_ch, NULL); error = (l->l_flag & LW_STIMO) ? EWOULDBLOCK : 0; } } /* * LW_CATCHINTR is only modified in this function OR when we * are asleep (with the sleepq locked). We can therefore safely * test it unlocked here as it is guaranteed to be stable by * virtue of us running. * * We do not bother clearing it if set; that would require us * to take the LWP lock, and it doesn't seem worth the hassle * considering it is only meaningful here inside this function, * and is set to reflect intent upon entry. 
*/ flag = atomic_load_relaxed(&l->l_flag); if (__predict_false((flag & mask) != 0)) { if ((flag & LW_CATCHINTR) == 0 || error != 0) /* nothing */; else if ((flag & (LW_CANCELLED | LW_WEXIT | LW_WCORE)) != 0) error = EINTR; else if ((flag & LW_PENDSIG) != 0) { /* * Acquiring p_lock may cause us to recurse * through the sleep path and back into this * routine, but is safe because LWPs sleeping * on locks are non-interruptable and we will * not recurse again. */ p = l->l_proc; mutex_enter(p->p_lock); if (((sig = sigispending(l, 0)) != 0 && (sigprop[sig] & SA_STOP) == 0) || (sig = issignal(l)) != 0) error = sleepq_sigtoerror(l, sig); mutex_exit(p->p_lock); } } ktrcsw(0, 0, syncobj); if (__predict_false(nlocks != 0)) { KERNEL_LOCK(nlocks, NULL); } return error; } /* * sleepq_wake: * * Wake zero or more LWPs blocked on a single wait channel. */ void sleepq_wake(sleepq_t *sq, wchan_t wchan, u_int expected, kmutex_t *mp) { lwp_t *l, *next; KASSERT(mutex_owned(mp)); for (l = LIST_FIRST(sq); l != NULL; l = next) { KASSERT(l->l_sleepq == sq); KASSERT(l->l_mutex == mp); next = LIST_NEXT(l, l_sleepchain); if (l->l_wchan != wchan) continue; sleepq_remove(sq, l, true); if (--expected == 0) break; } mutex_spin_exit(mp); } /* * sleepq_unsleep: * * Remove an LWP from its sleep queue and set it runnable again. * sleepq_unsleep() is called with the LWP's mutex held, and will * release it if "unlock" is true. */ void sleepq_unsleep(lwp_t *l, bool unlock) { sleepq_t *sq = l->l_sleepq; kmutex_t *mp = l->l_mutex; KASSERT(lwp_locked(l, mp)); KASSERT(l->l_wchan != NULL); sleepq_remove(sq, l, false); if (unlock) { mutex_spin_exit(mp); } } /* * sleepq_timeout: * * Entered via the callout(9) subsystem to time out an LWP that is on a * sleep queue. */ void sleepq_timeout(void *arg) { lwp_t *l = arg; /* * Lock the LWP. Assuming it's still on the sleep queue, its * current mutex will also be the sleep queue mutex. */ lwp_lock(l); if (l->l_wchan == NULL || l->l_syncobj == &callout_syncobj) { /* * Somebody beat us to it, or the LWP is blocked in * callout_halt() waiting for us to finish here. In * neither case should the LWP produce EWOULDBLOCK. */ lwp_unlock(l); return; } l->l_flag |= LW_STIMO; lwp_unsleep(l, true); } /* * sleepq_sigtoerror: * * Given a signal number, interpret and return an error code. */ static int sleepq_sigtoerror(lwp_t *l, int sig) { struct proc *p = l->l_proc; int error; KASSERT(mutex_owned(p->p_lock)); /* * If this sleep was canceled, don't let the syscall restart. */ if ((SIGACTION(p, sig).sa_flags & SA_RESTART) == 0) error = EINTR; else error = ERESTART; return error; } /* * sleepq_abort: * * After a panic or during autoconfiguration, lower the interrupt * priority level to give pending interrupts a chance to run, and * then return. Called if sleepq_dontsleep() returns non-zero, and * always returns zero. */ int sleepq_abort(kmutex_t *mtx, int unlock) { int s; s = splhigh(); splx(IPL_SAFEPRI); splx(s); if (mtx != NULL && unlock != 0) mutex_exit(mtx); return 0; } /* * sleepq_reinsert: * * Move the position of the lwp in the sleep queue after a possible * change of the lwp's effective priority. */ static void sleepq_reinsert(sleepq_t *sq, lwp_t *l) { KASSERT(l->l_sleepq == sq); if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_SORTED) == 0) { return; } /* * Don't let the sleep queue become empty, even briefly. * cv_signal() and cv_broadcast() inspect it without the * sleep queue lock held and need to see a non-empty queue * head if there are waiters. 
*/ if (LIST_FIRST(sq) == l && LIST_NEXT(l, l_sleepchain) == NULL) { return; } LIST_REMOVE(l, l_sleepchain); sleepq_insert(sq, l, l->l_syncobj); } /* * sleepq_changepri: * * Adjust the priority of an LWP residing on a sleepq. */ void sleepq_changepri(lwp_t *l, pri_t pri) { sleepq_t *sq = l->l_sleepq; KASSERT(lwp_locked(l, NULL)); l->l_priority = pri; sleepq_reinsert(sq, l); } /* * sleepq_lendpri: * * Adjust the lent priority of an LWP residing on a sleepq. */ void sleepq_lendpri(lwp_t *l, pri_t pri) { sleepq_t *sq = l->l_sleepq; KASSERT(lwp_locked(l, NULL)); l->l_inheritedprio = pri; l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio); sleepq_reinsert(sq, l); }
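/*
 * Illustrative sketch, not part of the original file: the usual calling
 * sequence for the primitives above, roughly the shape used by mtsleep()
 * and the condition variable code.  The function name, the "example"
 * wmesg and the use of sleep_syncobj are assumptions for the example;
 * real consumers supply their own syncobj and wait channel.
 */
#if 0	/* example only */
static int
example_wait(wchan_t wchan, kmutex_t *interlock, int timo)
{
	sleepq_t *sq;
	kmutex_t *mp;
	int nlocks;

	sq = sleeptab_lookup(&sleeptab, wchan, &mp);	/* hashes wchan, locks mp */
	nlocks = sleepq_enter(sq, curlwp, mp);		/* lend mp to curlwp */
	sleepq_enqueue(sq, wchan, "example", &sleep_syncobj, true);
	mutex_exit(interlock);				/* interlock safe to drop now */
	return sleepq_block(timo, true, &sleep_syncobj, nlocks);
}
#endif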
/* $NetBSD: kern_drvctl.c,v 1.51 2022/03/28 12:33:22 riastradh Exp $ */ /* * Copyright (c) 2004 * Matthias Drochner. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_drvctl.c,v 1.51 2022/03/28 12:33:22 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/conf.h> #include <sys/device.h> #include <sys/event.h> #include <sys/kmem.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/select.h> #include <sys/poll.h> #include <sys/drvctlio.h> #include <sys/devmon.h> #include <sys/stat.h> #include <sys/kauth.h> #include <sys/lwp.h> #include <sys/module.h> #include "ioconf.h" struct drvctl_event { TAILQ_ENTRY(drvctl_event) dce_link; prop_dictionary_t dce_event; }; TAILQ_HEAD(drvctl_queue, drvctl_event); static struct drvctl_queue drvctl_eventq; /* FIFO */ static kcondvar_t drvctl_cond; static kmutex_t drvctl_lock; static int drvctl_nopen = 0, drvctl_eventcnt = 0; static struct selinfo drvctl_rdsel; #define DRVCTL_EVENTQ_DEPTH 64 /* arbitrary queue limit */ dev_type_open(drvctlopen); const struct cdevsw drvctl_cdevsw = { .d_open = drvctlopen, .d_close = nullclose, .d_read = nullread, .d_write = nullwrite, .d_ioctl = noioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER }; static int drvctl_read(struct file *, off_t *, struct uio *, kauth_cred_t, int); static int drvctl_write(struct file *, off_t *, struct uio *, kauth_cred_t, int); static int drvctl_ioctl(struct file *, u_long, void *); static int drvctl_poll(struct file *, int); static int drvctl_stat(struct file *, struct stat *); static int drvctl_close(struct file *); static const struct fileops drvctl_fileops = { .fo_name = "drvctl", .fo_read = drvctl_read, .fo_write = drvctl_write, .fo_ioctl = drvctl_ioctl, .fo_fcntl = fnullop_fcntl, .fo_poll = drvctl_poll, .fo_stat = drvctl_stat, .fo_close = drvctl_close, .fo_kqfilter = fnullop_kqfilter, .fo_restart = fnullop_restart, }; #define MAXLOCATORS 100 static int (*saved_insert_vec)(const char *, prop_dictionary_t) = NULL; static int drvctl_command(struct lwp *, struct plistref *, u_long, int); static int drvctl_getevent(struct lwp *, struct plistref *, u_long, int); void drvctl_init(void) { TAILQ_INIT(&drvctl_eventq); mutex_init(&drvctl_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&drvctl_cond, "devmon"); selinit(&drvctl_rdsel); } void drvctl_fini(void) { seldestroy(&drvctl_rdsel); cv_destroy(&drvctl_cond); mutex_destroy(&drvctl_lock); } int devmon_insert(const char *event, prop_dictionary_t ev) { struct drvctl_event *dce, *odce; mutex_enter(&drvctl_lock); if (drvctl_nopen == 0) { prop_object_release(ev); mutex_exit(&drvctl_lock); return 0; } /* Fill in mandatory member */ if (!prop_dictionary_set_string_nocopy(ev, "event", event)) { prop_object_release(ev); 
mutex_exit(&drvctl_lock); return 0; } dce = kmem_alloc(sizeof(*dce), KM_SLEEP); dce->dce_event = ev; if (drvctl_eventcnt == DRVCTL_EVENTQ_DEPTH) { odce = TAILQ_FIRST(&drvctl_eventq); TAILQ_REMOVE(&drvctl_eventq, odce, dce_link); prop_object_release(odce->dce_event); kmem_free(odce, sizeof(*odce)); --drvctl_eventcnt; } TAILQ_INSERT_TAIL(&drvctl_eventq, dce, dce_link); ++drvctl_eventcnt; cv_broadcast(&drvctl_cond); selnotify(&drvctl_rdsel, 0, 0); mutex_exit(&drvctl_lock); return 0; } int drvctlopen(dev_t dev, int flags, int mode, struct lwp *l) { struct file *fp; int fd; int ret; ret = fd_allocfile(&fp, &fd); if (ret) return ret; /* XXX setup context */ mutex_enter(&drvctl_lock); ret = fd_clone(fp, fd, flags, &drvctl_fileops, /* context */NULL); ++drvctl_nopen; mutex_exit(&drvctl_lock); return ret; } static int pmdevbyname(u_long cmd, struct devpmargs *a) { device_t d; KASSERT(KERNEL_LOCKED_P()); if ((d = device_find_by_xname(a->devname)) == NULL) return ENXIO; switch (cmd) { case DRVSUSPENDDEV: return pmf_device_recursive_suspend(d, PMF_Q_DRVCTL) ? 0 : EBUSY; case DRVRESUMEDEV: if (a->flags & DEVPM_F_SUBTREE) { return pmf_device_subtree_resume(d, PMF_Q_DRVCTL) ? 0 : EBUSY; } else { return pmf_device_recursive_resume(d, PMF_Q_DRVCTL) ? 0 : EBUSY; } default: return EPASSTHROUGH; } } static int listdevbyname(struct devlistargs *l) { device_t d, child; deviter_t di; int cnt = 0, idx, error = 0; KASSERT(KERNEL_LOCKED_P()); if (*l->l_devname == '\0') d = NULL; else if (memchr(l->l_devname, 0, sizeof(l->l_devname)) == NULL) return EINVAL; else if ((d = device_find_by_xname(l->l_devname)) == NULL) return ENXIO; for (child = deviter_first(&di, 0); child != NULL; child = deviter_next(&di)) { if (device_parent(child) != d) continue; idx = cnt++; if (l->l_childname == NULL || idx >= l->l_children) continue; error = copyoutstr(device_xname(child), l->l_childname[idx], sizeof(l->l_childname[idx]), NULL); if (error != 0) break; } deviter_release(&di); l->l_children = cnt; return error; } static int detachdevbyname(const char *devname) { device_t d; deviter_t di; int error; KASSERT(KERNEL_LOCKED_P()); for (d = deviter_first(&di, DEVITER_F_RW); d != NULL; d = deviter_next(&di)) { if (strcmp(device_xname(d), devname) == 0) break; } if (d == NULL) { error = ENXIO; goto out; } #ifndef XXXFULLRISK /* * If the parent cannot be notified, it might keep * pointers to the detached device. * There might be a private notification mechanism, * but better play it safe here. 
*/ if (device_parent(d) && !device_cfattach(device_parent(d))->ca_childdetached) { error = ENOTSUP; goto out; } #endif error = config_detach(d, 0); out: deviter_release(&di); return error; } static int rescanbus(const char *busname, const char *ifattr, int numlocators, const int *locators) { int i, rc; device_t d; const struct cfiattrdata * const *ap; KASSERT(KERNEL_LOCKED_P()); /* XXX there should be a way to get limits and defaults (per device) from config generated data */ int locs[MAXLOCATORS]; for (i = 0; i < MAXLOCATORS; i++) locs[i] = -1; for (i = 0; i < numlocators;i++) locs[i] = locators[i]; if ((d = device_find_by_xname(busname)) == NULL) return ENXIO; /* * must support rescan, and must have something * to attach to */ if (!device_cfattach(d)->ca_rescan || !device_cfdriver(d)->cd_attrs) return ENODEV; /* rescan all ifattrs if none is specified */ if (!ifattr) { rc = 0; for (ap = device_cfdriver(d)->cd_attrs; *ap; ap++) { rc = (*device_cfattach(d)->ca_rescan)(d, (*ap)->ci_name, locs); if (rc) break; } } else { /* check for valid attribute passed */ for (ap = device_cfdriver(d)->cd_attrs; *ap; ap++) if (!strcmp((*ap)->ci_name, ifattr)) break; if (!*ap) return EINVAL; rc = (*device_cfattach(d)->ca_rescan)(d, ifattr, locs); } config_deferred(NULL); return rc; } static int drvctl_read(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, int flags) { return ENODEV; } static int drvctl_write(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, int flags) { return ENODEV; } static int drvctl_ioctl(struct file *fp, u_long cmd, void *data) { int res; char *ifattr; int *locs; size_t locs_sz = 0; /* XXXgcc */ KERNEL_LOCK(1, NULL); switch (cmd) { case DRVSUSPENDDEV: case DRVRESUMEDEV: #define d ((struct devpmargs *)data) res = pmdevbyname(cmd, d); #undef d break; case DRVLISTDEV: res = listdevbyname((struct devlistargs *)data); break; case DRVDETACHDEV: #define d ((struct devdetachargs *)data) res = detachdevbyname(d->devname); #undef d break; case DRVRESCANBUS: #define d ((struct devrescanargs *)data) d->busname[sizeof(d->busname) - 1] = '\0'; /* XXX better copyin? 
*/ if (d->ifattr[0]) { d->ifattr[sizeof(d->ifattr) - 1] = '\0'; ifattr = d->ifattr; } else ifattr = 0; if (d->numlocators) { if (d->numlocators > MAXLOCATORS) { res = EINVAL; goto out; } locs_sz = d->numlocators * sizeof(int); locs = kmem_alloc(locs_sz, KM_SLEEP); res = copyin(d->locators, locs, locs_sz); if (res) { kmem_free(locs, locs_sz); goto out; } } else locs = NULL; res = rescanbus(d->busname, ifattr, d->numlocators, locs); if (locs) kmem_free(locs, locs_sz); #undef d break; case DRVCTLCOMMAND: res = drvctl_command(curlwp, (struct plistref *)data, cmd, fp->f_flag); break; case DRVGETEVENT: res = drvctl_getevent(curlwp, (struct plistref *)data, cmd, fp->f_flag); break; default: res = EPASSTHROUGH; break; } out: KERNEL_UNLOCK_ONE(NULL); return res; } static int drvctl_stat(struct file *fp, struct stat *st) { (void)memset(st, 0, sizeof(*st)); st->st_uid = kauth_cred_geteuid(fp->f_cred); st->st_gid = kauth_cred_getegid(fp->f_cred); return 0; } static int drvctl_poll(struct file *fp, int events) { int revents = 0; if (!TAILQ_EMPTY(&drvctl_eventq)) revents |= events & (POLLIN | POLLRDNORM); else selrecord(curlwp, &drvctl_rdsel); return revents; } static int drvctl_close(struct file *fp) { struct drvctl_event *dce; /* XXX free context */ mutex_enter(&drvctl_lock); KASSERT(drvctl_nopen > 0); --drvctl_nopen; if (drvctl_nopen == 0) { /* flush queue */ while ((dce = TAILQ_FIRST(&drvctl_eventq)) != NULL) { TAILQ_REMOVE(&drvctl_eventq, dce, dce_link); KASSERT(drvctl_eventcnt > 0); --drvctl_eventcnt; prop_object_release(dce->dce_event); kmem_free(dce, sizeof(*dce)); } } mutex_exit(&drvctl_lock); return 0; } void drvctlattach(int arg __unused) { } /***************************************************************************** * Driver control command processing engine *****************************************************************************/ static int drvctl_command_get_properties(struct lwp *l, prop_dictionary_t command_dict, prop_dictionary_t results_dict) { prop_dictionary_t args_dict; prop_string_t devname_string; device_t dev; deviter_t di; args_dict = prop_dictionary_get(command_dict, "drvctl-arguments"); if (args_dict == NULL) return EINVAL; devname_string = prop_dictionary_get(args_dict, "device-name"); if (devname_string == NULL) return EINVAL; for (dev = deviter_first(&di, 0); dev != NULL; dev = deviter_next(&di)) { if (prop_string_equals_string(devname_string, device_xname(dev))) { prop_dictionary_set(results_dict, "drvctl-result-data", device_properties(dev)); break; } } deviter_release(&di); if (dev == NULL) return ESRCH; return 0; } struct drvctl_command_desc { const char *dcd_name; /* command name */ int (*dcd_func)(struct lwp *, /* handler function */ prop_dictionary_t, prop_dictionary_t); int dcd_rw; /* read or write required */ }; static const struct drvctl_command_desc drvctl_command_table[] = { { .dcd_name = "get-properties", .dcd_func = drvctl_command_get_properties, .dcd_rw = FREAD, }, { .dcd_name = NULL } }; static int drvctl_command(struct lwp *l, struct plistref *pref, u_long ioctl_cmd, int fflag) { prop_dictionary_t command_dict, results_dict; prop_string_t command_string; const struct drvctl_command_desc *dcd; int error; error = prop_dictionary_copyin_ioctl(pref, ioctl_cmd, &command_dict); if (error) return error; results_dict = prop_dictionary_create(); if (results_dict == NULL) { prop_object_release(command_dict); return ENOMEM; } command_string = prop_dictionary_get(command_dict, "drvctl-command"); if (command_string == NULL) { error = EINVAL; goto out; } for (dcd 
= drvctl_command_table; dcd->dcd_name != NULL; dcd++) { if (prop_string_equals_string(command_string, dcd->dcd_name)) break; } if (dcd->dcd_name == NULL) { error = EINVAL; goto out; } if ((fflag & dcd->dcd_rw) == 0) { error = EPERM; goto out; } error = (*dcd->dcd_func)(l, command_dict, results_dict); prop_dictionary_set_int32(results_dict, "drvctl-error", error); error = prop_dictionary_copyout_ioctl(pref, ioctl_cmd, results_dict); out: prop_object_release(command_dict); prop_object_release(results_dict); return error; } static int drvctl_getevent(struct lwp *l, struct plistref *pref, u_long ioctl_cmd, int fflag) { struct drvctl_event *dce; int ret; if ((fflag & (FREAD|FWRITE)) != (FREAD|FWRITE)) return EPERM; mutex_enter(&drvctl_lock); while ((dce = TAILQ_FIRST(&drvctl_eventq)) == NULL) { if (fflag & O_NONBLOCK) { mutex_exit(&drvctl_lock); return EWOULDBLOCK; } ret = cv_wait_sig(&drvctl_cond, &drvctl_lock); if (ret) { mutex_exit(&drvctl_lock); return ret; } } TAILQ_REMOVE(&drvctl_eventq, dce, dce_link); KASSERT(drvctl_eventcnt > 0); --drvctl_eventcnt; mutex_exit(&drvctl_lock); ret = prop_dictionary_copyout_ioctl(pref, ioctl_cmd, dce->dce_event); prop_object_release(dce->dce_event); kmem_free(dce, sizeof(*dce)); return ret; } /* * Module glue */ MODULE(MODULE_CLASS_DRIVER, drvctl, NULL); int drvctl_modcmd(modcmd_t cmd, void *arg) { int error; #ifdef _MODULE int bmajor, cmajor; #endif error = 0; switch (cmd) { case MODULE_CMD_INIT: drvctl_init(); mutex_enter(&drvctl_lock); #ifdef _MODULE bmajor = cmajor = -1; error = devsw_attach("drvctl", NULL, &bmajor, &drvctl_cdevsw, &cmajor); #endif if (error == 0) { KASSERT(saved_insert_vec == NULL); saved_insert_vec = devmon_insert_vec; devmon_insert_vec = devmon_insert; } mutex_exit(&drvctl_lock); break; case MODULE_CMD_FINI: mutex_enter(&drvctl_lock); if (drvctl_nopen != 0 || drvctl_eventcnt != 0 ) { mutex_exit(&drvctl_lock); return EBUSY; } KASSERT(saved_insert_vec != NULL); devmon_insert_vec = saved_insert_vec; saved_insert_vec = NULL; #ifdef _MODULE devsw_detach(NULL, &drvctl_cdevsw); #endif mutex_exit(&drvctl_lock); drvctl_fini(); break; default: error = ENOTTY; break; } return error; }
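/*
 * Editorial sketch (not part of kern_drvctl.c): the ioctl interface above is
 * easiest to see from userland.  The program below is a minimal, illustrative
 * consumer of DRVLISTDEV, assuming the struct devlistargs layout from
 * <sys/drvctlio.h> and a /dev/drvctl device node; per listdevbyname() above,
 * an empty l_devname lists the devices at the root of the tree, and a NULL
 * l_childname makes the ioctl only count children.
 */
#include <sys/ioctl.h>
#include <sys/drvctlio.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	struct devlistargs laa;
	int fd;

	if ((fd = open("/dev/drvctl", O_RDONLY)) == -1)
		err(1, "open /dev/drvctl");

	/* First pass: NULL l_childname just asks the kernel for the count. */
	memset(&laa, 0, sizeof(laa));
	laa.l_childname = NULL;
	if (ioctl(fd, DRVLISTDEV, &laa) == -1)
		err(1, "DRVLISTDEV (count)");

	/* Second pass: provide a buffer sized from the count and fetch names. */
	laa.l_childname = calloc(laa.l_children, sizeof(*laa.l_childname));
	if (laa.l_childname == NULL)
		err(1, "calloc");
	if (ioctl(fd, DRVLISTDEV, &laa) == -1)
		err(1, "DRVLISTDEV (names)");

	for (size_t i = 0; i < laa.l_children; i++)
		printf("%s\n", laa.l_childname[i]);
	return 0;
}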
/* $NetBSD: kern_runq.c,v 1.70 2023/09/19 22:15:32 ad Exp $ */ /*- * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org> * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.70 2023/09/19 22:15:32 ad Exp $"); #include "opt_dtrace.h" #include <sys/param.h> #include <sys/kernel.h> #include <sys/bitops.h> #include <sys/cpu.h> #include <sys/idle.h> #include <sys/intr.h> #include <sys/kmem.h> #include <sys/lwp.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/pset.h> #include <sys/sched.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/types.h> #include <sys/evcnt.h> #include <sys/atomic.h> /* * Bits per map. */ #define BITMAP_BITS (32) #define BITMAP_SHIFT (5) #define BITMAP_MSB (0x80000000U) #define BITMAP_MASK (BITMAP_BITS - 1) const int schedppq = 1; static void *sched_getrq(struct schedstate_percpu *, const pri_t); #ifdef MULTIPROCESSOR static lwp_t * sched_catchlwp(struct cpu_info *); #endif /* * Preemption control. */ #ifdef __HAVE_PREEMPTION # ifdef DEBUG int sched_kpreempt_pri = 0; # else int sched_kpreempt_pri = PRI_USER_RT; # endif #else int sched_kpreempt_pri = 1000; #endif /* * Migration and balancing. */ static u_int cacheht_time; /* Cache hotness time */ static u_int min_catch; /* Minimal LWP count for catching */ static u_int skim_interval; /* Rate limit for stealing LWPs */ #ifdef KDTRACE_HOOKS struct lwp *curthread; #endif void runq_init(void) { /* Pulling from remote packages, LWP must not have run for 10ms. */ cacheht_time = 10; /* Minimal count of LWPs for catching */ min_catch = 1; /* Steal from other CPUs at most every 10ms. */ skim_interval = 10; } void sched_cpuattach(struct cpu_info *ci) { struct schedstate_percpu *spc; size_t size; void *p; u_int i; spc = &ci->ci_schedstate; spc->spc_nextpkg = ci; if (spc->spc_lwplock == NULL) { spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); } if (ci == lwp0.l_cpu) { /* Initialize the scheduler structure of the primary LWP */ lwp0.l_mutex = spc->spc_lwplock; } if (spc->spc_mutex != NULL) { /* Already initialized. 
*/ return; } /* Allocate the run queue */ size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) + coherency_unit; p = kmem_alloc(size, KM_SLEEP); spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit); /* Initialize run queues */ spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); for (i = 0; i < PRI_COUNT; i++) TAILQ_INIT(&spc->spc_queue[i]); } /* * Control of the runqueue. */ static inline void * sched_getrq(struct schedstate_percpu *spc, const pri_t prio) { KASSERT(prio < PRI_COUNT); return &spc->spc_queue[prio]; } /* * Put an LWP onto a run queue. The LWP must be locked by spc_mutex for * l_cpu. */ void sched_enqueue(struct lwp *l) { struct schedstate_percpu *spc; TAILQ_HEAD(, lwp) *q_head; const pri_t eprio = lwp_eprio(l); struct cpu_info *ci; ci = l->l_cpu; spc = &ci->ci_schedstate; KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex)); /* Enqueue the thread */ q_head = sched_getrq(spc, eprio); if (TAILQ_EMPTY(q_head)) { u_int i; uint32_t q; /* Mark bit */ i = eprio >> BITMAP_SHIFT; q = BITMAP_MSB >> (eprio & BITMAP_MASK); KASSERT((spc->spc_bitmap[i] & q) == 0); spc->spc_bitmap[i] |= q; } /* * Determine run queue position according to POSIX. XXX Explicitly * lowering a thread's priority with pthread_setschedparam() is not * handled. */ if ((l->l_pflag & LP_PREEMPTING) != 0) { switch (l->l_class) { case SCHED_OTHER: TAILQ_INSERT_TAIL(q_head, l, l_runq); break; case SCHED_FIFO: TAILQ_INSERT_HEAD(q_head, l, l_runq); break; case SCHED_RR: if (getticks() - l->l_rticks >= sched_rrticks) { TAILQ_INSERT_TAIL(q_head, l, l_runq); } else { TAILQ_INSERT_HEAD(q_head, l, l_runq); } break; default: panic("sched_enqueue: LWP %p has class %d\n", l, l->l_class); } } else { TAILQ_INSERT_TAIL(q_head, l, l_runq); } spc->spc_flags &= ~SPCF_IDLE; spc->spc_count++; if ((l->l_pflag & LP_BOUND) == 0) { atomic_store_relaxed(&spc->spc_mcount, atomic_load_relaxed(&spc->spc_mcount) + 1); } /* * Update the value of highest priority in the runqueue, * if priority of this thread is higher. */ if (eprio > spc->spc_maxpriority) spc->spc_maxpriority = eprio; sched_newts(l); } /* * Remove and LWP from the run queue it's on. The LWP must be in state * LSRUN. */ void sched_dequeue(struct lwp *l) { TAILQ_HEAD(, lwp) *q_head; struct schedstate_percpu *spc; const pri_t eprio = lwp_eprio(l); spc = &l->l_cpu->ci_schedstate; KASSERT(lwp_locked(l, spc->spc_mutex)); KASSERT(eprio <= spc->spc_maxpriority); KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0); KASSERT(spc->spc_count > 0); if (spc->spc_migrating == l) spc->spc_migrating = NULL; spc->spc_count--; if ((l->l_pflag & LP_BOUND) == 0) { atomic_store_relaxed(&spc->spc_mcount, atomic_load_relaxed(&spc->spc_mcount) - 1); } q_head = sched_getrq(spc, eprio); TAILQ_REMOVE(q_head, l, l_runq); if (TAILQ_EMPTY(q_head)) { u_int i; uint32_t q; /* Unmark bit */ i = eprio >> BITMAP_SHIFT; q = BITMAP_MSB >> (eprio & BITMAP_MASK); KASSERT((spc->spc_bitmap[i] & q) != 0); spc->spc_bitmap[i] &= ~q; /* * Update the value of highest priority in the runqueue, in a * case it was a last thread in the queue of highest priority. */ if (eprio != spc->spc_maxpriority) return; do { if (spc->spc_bitmap[i] != 0) { q = ffs(spc->spc_bitmap[i]); spc->spc_maxpriority = (i << BITMAP_SHIFT) + (BITMAP_BITS - q); return; } } while (i--); /* If not found - set the lowest value */ spc->spc_maxpriority = 0; } } /* * Cause a preemption on the given CPU, if the priority "pri" is higher * priority than the running LWP. 
If "unlock" is specified, and ideally it * will be for concurrency reasons, spc_mutex will be dropped before return. */ void sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock) { struct schedstate_percpu *spc; u_int o, n, f; lwp_t *l; spc = &ci->ci_schedstate; KASSERT(mutex_owned(spc->spc_mutex)); /* * If the priority level we're evaluating wouldn't cause a new LWP * to be run on the CPU, then we have nothing to do. */ if (pri <= spc->spc_curpriority || !mp_online) { if (__predict_true(unlock)) { spc_unlock(ci); } return; } /* * Figure out what kind of preemption we should do. */ l = ci->ci_onproc; if ((l->l_flag & LW_IDLE) != 0) { f = RESCHED_IDLE | RESCHED_UPREEMPT; } else if (pri >= sched_kpreempt_pri && (l->l_pflag & LP_INTR) == 0) { /* We can't currently preempt softints - should be able to. */ #ifdef __HAVE_PREEMPTION f = RESCHED_KPREEMPT; #else /* Leave door open for test: set kpreempt_pri with sysctl. */ f = RESCHED_UPREEMPT; #endif /* * l_dopreempt must be set with the CPU locked to sync with * mi_switch(). It must also be set with an atomic to sync * with kpreempt(). */ atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE); } else { f = RESCHED_UPREEMPT; } if (ci != curcpu()) { f |= RESCHED_REMOTE; } /* * Things can start as soon as ci_want_resched is touched: x86 has * an instruction that monitors the memory cell it's in. Drop the * schedstate lock in advance, otherwise the remote CPU can awaken * and immediately block on the lock. */ if (__predict_true(unlock)) { spc_unlock(ci); } /* * The caller almost always has a second scheduler lock held: either * the running LWP lock (spc_lwplock), or a sleep queue lock. That * keeps preemption disabled, which among other things ensures all * LWPs involved won't be freed while we're here (see lwp_dtor()). */ KASSERT(kpreempt_disabled()); for (o = 0;; o = n) { n = atomic_cas_uint(&ci->ci_want_resched, o, o | f); if (__predict_true(o == n)) { /* * We're the first to set a resched on the CPU. Try * to avoid causing a needless trip through trap() * to handle an AST fault, if it's known the LWP * will either block or go through userret() soon. */ if (l != curlwp || cpu_intr_p()) { cpu_need_resched(ci, l, f); } break; } if (__predict_true( (n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >= (f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) { /* Already in progress, nothing to do. */ break; } } } /* * Cause a preemption on the given CPU, if the priority of LWP "l" in state * LSRUN, is higher priority than the running LWP. If "unlock" is * specified, and ideally it will be for concurrency reasons, spc_mutex will * be dropped before return. */ void sched_resched_lwp(struct lwp *l, bool unlock) { struct cpu_info *ci = l->l_cpu; KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex)); KASSERT(l->l_stat == LSRUN); sched_resched_cpu(ci, lwp_eprio(l), unlock); } /* * Migration and balancing. */ #ifdef MULTIPROCESSOR /* * Estimate if LWP is cache-hot. */ static inline bool lwp_cache_hot(const struct lwp *l) { /* Leave new LWPs in peace, determination has already been made. */ if (l->l_stat == LSIDL) return true; if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0)) return false; return (getticks() - l->l_rticks < mstohz(cacheht_time)); } /* * Check if LWP can migrate to the chosen CPU. */ static inline bool sched_migratable(const struct lwp *l, struct cpu_info *ci) { const struct schedstate_percpu *spc = &ci->ci_schedstate; KASSERT(lwp_locked(__UNCONST(l), NULL)); /* Is CPU offline? 
*/ if (__predict_false(spc->spc_flags & SPCF_OFFLINE)) return false; /* Is affinity set? */ if (__predict_false(l->l_affinity)) return kcpuset_isset(l->l_affinity, cpu_index(ci)); /* Is there a processor-set? */ return (spc->spc_psid == l->l_psid); } /* * A small helper to do round robin through CPU packages. */ static struct cpu_info * sched_nextpkg(void) { struct schedstate_percpu *spc = &curcpu()->ci_schedstate; spc->spc_nextpkg = spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST]; return spc->spc_nextpkg; } /* * Find a CPU to run LWP "l". Look for the CPU with the lowest priority * thread. In case of equal priority, prefer first class CPUs, and amongst * the remainder choose the CPU with the fewest runqueue entries. * * Begin the search in the CPU package which "pivot" is a member of. */ static struct cpu_info * __noinline sched_bestcpu(struct lwp *l, struct cpu_info *pivot) { struct cpu_info *bestci, *curci, *outer; struct schedstate_percpu *bestspc, *curspc; pri_t bestpri, curpri; /* * If this fails (it shouldn't), run on the given CPU. This also * gives us a weak preference for "pivot" to begin with. */ bestci = pivot; bestspc = &bestci->ci_schedstate; if (sched_migratable(l, bestci)) { bestpri = MAX(bestspc->spc_curpriority, bestspc->spc_maxpriority); } else { /* Invalidate the priority. */ bestpri = PRI_COUNT; } /* In the outer loop scroll through all CPU packages. */ pivot = pivot->ci_package1st; outer = pivot; do { /* In the inner loop scroll through all CPUs in package. */ curci = outer; do { if (!sched_migratable(l, curci)) { continue; } curspc = &curci->ci_schedstate; /* If this CPU is idle and 1st class, we're done. */ if ((curspc->spc_flags & (SPCF_IDLE | SPCF_1STCLASS)) == (SPCF_IDLE | SPCF_1STCLASS)) { return curci; } curpri = MAX(curspc->spc_curpriority, curspc->spc_maxpriority); if (curpri > bestpri) { continue; } if (curpri == bestpri) { /* Prefer first class CPUs over others. */ if ((curspc->spc_flags & SPCF_1STCLASS) == 0 && (bestspc->spc_flags & SPCF_1STCLASS) != 0) { continue; } /* * Pick the least busy CPU. Make sure this is not * <=, otherwise it defeats the above preference. */ if (bestspc->spc_count < curspc->spc_count) { continue; } } bestpri = curpri; bestci = curci; bestspc = curspc; } while (curci = curci->ci_sibling[CPUREL_PACKAGE], curci != outer); } while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST], outer != pivot); return bestci; } /* * Estimate the migration of LWP to the other CPU. * Take and return the CPU, if migration is needed. */ struct cpu_info * sched_takecpu(struct lwp *l) { struct schedstate_percpu *spc, *tspc; struct cpu_info *ci, *curci, *tci; pri_t eprio; int flags; KASSERT(lwp_locked(l, NULL)); /* If thread is strictly bound, do not estimate other CPUs */ ci = l->l_cpu; if (l->l_pflag & LP_BOUND) return ci; spc = &ci->ci_schedstate; eprio = lwp_eprio(l); /* * Handle new LWPs. For vfork() with a timeshared child, make it * run on the same CPU as the parent if no other LWPs in queue. * Otherwise scatter far and wide - try for an even distribution * across all CPU packages and CPUs. */ if (l->l_stat == LSIDL) { if (curlwp->l_vforkwaiting && l->l_class == SCHED_OTHER) { if (sched_migratable(l, curlwp->l_cpu) && eprio > curlwp->l_cpu->ci_schedstate.spc_maxpriority) { return curlwp->l_cpu; } } else { return sched_bestcpu(l, sched_nextpkg()); } flags = SPCF_IDLE; } else { flags = SPCF_IDLE | SPCF_1STCLASS; } /* * Try to send the LWP back to the first CPU in the same core if * idle. This keeps LWPs clustered in the run queues of 1st class * CPUs. 
This implies stickiness. If we didn't find a home for * a vfork() child above, try to use any SMT sibling to help out. */ tci = ci; do { tspc = &tci->ci_schedstate; if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) { return tci; } tci = tci->ci_sibling[CPUREL_CORE]; } while (tci != ci); /* * Otherwise the LWP is "sticky", i.e. generally preferring to stay * on the same CPU. */ if (sched_migratable(l, ci) && (eprio > spc->spc_curpriority || (lwp_cache_hot(l) && l->l_class == SCHED_OTHER))) { return ci; } /* * If the current CPU core is idle, run there and avoid the * expensive scan of CPUs below. */ curci = curcpu(); tci = curci; do { tspc = &tci->ci_schedstate; if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) { return tci; } tci = tci->ci_sibling[CPUREL_CORE]; } while (tci != curci); /* * Didn't find a new home above - happens infrequently. Start the * search in last CPU package that the LWP ran in, but expand to * include the whole system if needed. */ return sched_bestcpu(l, l->l_cpu); } /* * Tries to catch an LWP from the runqueue of other CPU. */ static struct lwp * sched_catchlwp(struct cpu_info *ci) { struct cpu_info *curci = curcpu(); struct schedstate_percpu *spc, *curspc; TAILQ_HEAD(, lwp) *q_head; struct lwp *l; bool gentle; curspc = &curci->ci_schedstate; spc = &ci->ci_schedstate; /* * Be more aggressive if this CPU is first class, and the other * is not. */ gentle = ((curspc->spc_flags & SPCF_1STCLASS) == 0 || (spc->spc_flags & SPCF_1STCLASS) != 0); if (atomic_load_relaxed(&spc->spc_mcount) < (gentle ? min_catch : 1) || curspc->spc_psid != spc->spc_psid) { spc_unlock(ci); return NULL; } /* Take the highest priority thread */ q_head = sched_getrq(spc, spc->spc_maxpriority); l = TAILQ_FIRST(q_head); for (;;) { /* Check the first and next result from the queue */ if (l == NULL) { break; } KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d", ci->ci_data.cpu_name, l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat); /* Look for threads, whose are allowed to migrate */ if ((l->l_pflag & LP_BOUND) || (gentle && lwp_cache_hot(l)) || !sched_migratable(l, curci)) { l = TAILQ_NEXT(l, l_runq); /* XXX Gap: could walk down priority list. */ continue; } /* Grab the thread, and move to the local run queue */ sched_dequeue(l); l->l_cpu = curci; lwp_unlock_to(l, curspc->spc_mutex); sched_enqueue(l); return l; } spc_unlock(ci); return l; } /* * Called from sched_idle() to handle migration. Return the CPU that we * pushed the LWP to (may be NULL). */ static struct cpu_info * sched_idle_migrate(void) { struct cpu_info *ci = curcpu(), *tci = NULL; struct schedstate_percpu *spc, *tspc; bool dlock = false; spc = &ci->ci_schedstate; spc_lock(ci); for (;;) { struct lwp *l; l = spc->spc_migrating; if (l == NULL) break; /* * If second attempt, and target CPU has changed, * drop the old lock. */ if (dlock == true && tci != l->l_target_cpu) { KASSERT(tci != NULL); spc_unlock(tci); dlock = false; } /* * Nothing to do if destination has changed to the * local CPU, or migration was done by other CPU. */ tci = l->l_target_cpu; if (tci == NULL || tci == ci) { spc->spc_migrating = NULL; l->l_target_cpu = NULL; break; } tspc = &tci->ci_schedstate; /* * Double-lock the runqueues. * We do that only once. */ if (dlock == false) { dlock = true; if (ci < tci) { spc_lock(tci); } else if (!mutex_tryenter(tspc->spc_mutex)) { spc_unlock(ci); spc_lock(tci); spc_lock(ci); /* Check the situation again.. 
*/ continue; } } /* Migrate the thread */ KASSERT(l->l_stat == LSRUN); spc->spc_migrating = NULL; l->l_target_cpu = NULL; sched_dequeue(l); l->l_cpu = tci; lwp_setlock(l, tspc->spc_mutex); sched_enqueue(l); sched_resched_lwp(l, true); /* tci now unlocked */ spc_unlock(ci); return tci; } if (dlock == true) { KASSERT(tci != NULL); spc_unlock(tci); } spc_unlock(ci); return NULL; } /* * Try to steal an LWP from "tci". */ static bool sched_steal(struct cpu_info *ci, struct cpu_info *tci) { struct schedstate_percpu *spc, *tspc; lwp_t *l; spc = &ci->ci_schedstate; tspc = &tci->ci_schedstate; if (atomic_load_relaxed(&tspc->spc_mcount) != 0 && spc->spc_psid == tspc->spc_psid) { spc_dlock(ci, tci); l = sched_catchlwp(tci); spc_unlock(ci); if (l != NULL) { return true; } } return false; } /* * Called from each CPU's idle loop. */ void sched_idle(void) { struct cpu_info *ci, *inner, *outer, *first, *tci, *mci; struct schedstate_percpu *spc, *tspc; struct lwp *l; ci = curcpu(); spc = &ci->ci_schedstate; tci = NULL; mci = NULL; /* * Handle LWP migrations off this CPU to another. If there a is * migration to do then remember the CPU the LWP was sent to, and * don't steal the LWP back from that CPU below. */ if (spc->spc_migrating != NULL) { mci = sched_idle_migrate(); } /* If this CPU is offline, or we have an LWP to run, we're done. */ if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) { return; } /* Deal with SMT. */ if (ci->ci_nsibling[CPUREL_CORE] > 1) { /* Try to help our siblings out. */ tci = ci->ci_sibling[CPUREL_CORE]; while (tci != ci) { if (tci != mci && sched_steal(ci, tci)) { return; } tci = tci->ci_sibling[CPUREL_CORE]; } /* * If not the first SMT in the core, and in the default * processor set, the search ends here. */ if ((spc->spc_flags & SPCF_1STCLASS) == 0 && spc->spc_psid == PS_NONE) { return; } } /* * Find something to run, unless this CPU exceeded the rate limit. * Start looking on the current package to maximise L2/L3 cache * locality. Then expand to looking at the rest of the system. * * XXX Should probably look at 2nd class CPUs first, but they will * shed jobs via preempt() anyway. */ if (spc->spc_nextskim > getticks()) { return; } spc->spc_nextskim = getticks() + mstohz(skim_interval); /* In the outer loop scroll through all CPU packages, starting here. */ first = ci->ci_package1st; outer = first; do { /* In the inner loop scroll through all CPUs in package. */ inner = outer; do { /* Don't hit the locks unless needed. */ tspc = &inner->ci_schedstate; if (ci == inner || ci == mci || spc->spc_psid != tspc->spc_psid || atomic_load_relaxed(&tspc->spc_mcount) < min_catch) { continue; } spc_dlock(ci, inner); l = sched_catchlwp(inner); spc_unlock(ci); if (l != NULL) { /* Got it! */ return; } } while (inner = inner->ci_sibling[CPUREL_PACKAGE], inner != outer); } while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST], outer != first); } /* * Called from mi_switch() when an LWP has been preempted / has yielded. * The LWP is presently in the CPU's run queue. Here we look for a better * CPU to teleport the LWP to; there may not be one. 
*/ void sched_preempted(struct lwp *l) { const int flags = SPCF_IDLE | SPCF_1STCLASS; struct schedstate_percpu *tspc; struct cpu_info *ci, *tci; ci = l->l_cpu; tspc = &ci->ci_schedstate; KASSERT(tspc->spc_count >= 1); /* * Try to select another CPU if: * * - there is no migration pending already * - and this LWP is running on a 2nd class CPU * - or this LWP is a child of vfork() that has just done execve() */ if (l->l_target_cpu != NULL || ((tspc->spc_flags & SPCF_1STCLASS) != 0 && (l->l_pflag & LP_TELEPORT) == 0)) { return; } /* * Fast path: if the first SMT in the core is idle, send it back * there, because the cache is shared (cheap) and we want all LWPs * to be clustered on 1st class CPUs (either running there or on * their runqueues). */ tci = ci->ci_sibling[CPUREL_CORE]; while (tci != ci) { tspc = &tci->ci_schedstate; if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) { l->l_target_cpu = tci; l->l_pflag &= ~LP_TELEPORT; return; } tci = tci->ci_sibling[CPUREL_CORE]; } if ((l->l_pflag & LP_TELEPORT) != 0) { /* * A child of vfork(): now that the parent is released, * scatter far and wide, to match the LSIDL distribution * done in sched_takecpu(). */ l->l_pflag &= ~LP_TELEPORT; tci = sched_bestcpu(l, sched_nextpkg()); if (tci != ci) { l->l_target_cpu = tci; } } else { /* * Try to find a better CPU to take it, but don't move to * another 2nd class CPU, and don't move to a non-idle CPU, * because that would prevent SMT being used to maximise * throughput. * * Search in the current CPU package in order to try and * keep L2/L3 cache locality, but expand to include the * whole system if needed. */ tci = sched_bestcpu(l, l->l_cpu); if (tci != ci && (tci->ci_schedstate.spc_flags & flags) == flags) { l->l_target_cpu = tci; } } } /* * Called during execve() by a child of vfork(). Does two things: * * - If the parent has been awoken and put back on curcpu then give the * CPU back to the parent. * * - If curlwp is not on a 1st class CPU then find somewhere else to run, * since it dodged the distribution in sched_takecpu() when first set * runnable. */ void sched_vforkexec(struct lwp *l, bool samecpu) { KASSERT(l == curlwp); if ((samecpu && ncpu > 1) || (l->l_cpu->ci_schedstate.spc_flags & SPCF_1STCLASS) == 0) { l->l_pflag |= LP_TELEPORT; preempt(); } } #else /* * stubs for !MULTIPROCESSOR */ struct cpu_info * sched_takecpu(struct lwp *l) { return l->l_cpu; } void sched_idle(void) { } void sched_preempted(struct lwp *l) { } void sched_vforkexec(struct lwp *l, bool samecpu) { KASSERT(l == curlwp); } #endif /* MULTIPROCESSOR */ /* * Scheduling statistics and balancing. */ void sched_lwp_stats(struct lwp *l) { int batch; KASSERT(lwp_locked(l, NULL)); /* Update sleep time */ if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP || l->l_stat == LSSUSPENDED) l->l_slptime++; /* * Set that thread is more CPU-bound, if sum of run time exceeds the * sum of sleep time. Check if thread is CPU-bound a first time. */ batch = (l->l_rticksum > l->l_slpticksum); if (batch != 0) { if ((l->l_flag & LW_BATCH) == 0) batch = 0; l->l_flag |= LW_BATCH; } else l->l_flag &= ~LW_BATCH; /* Reset the time sums */ l->l_slpticksum = 0; l->l_rticksum = 0; /* Scheduler-specific hook */ sched_pstats_hook(l, batch); #ifdef KDTRACE_HOOKS curthread = l; #endif } /* * Scheduler mill. 
*/ struct lwp * sched_nextlwp(void) { struct cpu_info *ci = curcpu(); struct schedstate_percpu *spc; TAILQ_HEAD(, lwp) *q_head; struct lwp *l; /* Update the last run time on switch */ l = curlwp; l->l_rticksum += (getticks() - l->l_rticks); /* Return to idle LWP if there is a migrating thread */ spc = &ci->ci_schedstate; if (__predict_false(spc->spc_migrating != NULL)) return NULL; /* Return to idle LWP if there is no runnable job */ if (__predict_false(spc->spc_count == 0)) return NULL; /* Take the highest priority thread */ KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]); q_head = sched_getrq(spc, spc->spc_maxpriority); l = TAILQ_FIRST(q_head); KASSERT(l != NULL); sched_oncpu(l); l->l_rticks = getticks(); return l; } /* * sched_curcpu_runnable_p: return if curcpu() should exit the idle loop. */ bool sched_curcpu_runnable_p(void) { const struct cpu_info *ci; const struct schedstate_percpu *spc; bool rv; kpreempt_disable(); ci = curcpu(); spc = &ci->ci_schedstate; rv = (spc->spc_count != 0); #ifndef __HAVE_FAST_SOFTINTS rv |= (ci->ci_data.cpu_softints != 0); #endif kpreempt_enable(); return rv; } /* * Sysctl nodes and initialization. */ SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup") { const struct sysctlnode *node = NULL; sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "sched", SYSCTL_DESCR("Scheduler options"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); if (node == NULL) return; sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "cacheht_time", SYSCTL_DESCR("Cache hotness time (in ms)"), NULL, 0, &cacheht_time, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "skim_interval", SYSCTL_DESCR("Rate limit for stealing from other CPUs (in ms)"), NULL, 0, &skim_interval, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "min_catch", SYSCTL_DESCR("Minimal count of threads for catching"), NULL, 0, &min_catch, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "timesoftints", SYSCTL_DESCR("Track CPU time for soft interrupts"), NULL, 0, &softint_timing, 0, CTL_CREATE, CTL_EOL); sysctl_createv(clog, 0, &node, NULL, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "kpreempt_pri", SYSCTL_DESCR("Minimum priority to trigger kernel preemption"), NULL, 0, &sched_kpreempt_pri, 0, CTL_CREATE, CTL_EOL); } /* * Debugging. 
*/ #ifdef DDB void sched_print_runqueue(void (*pr)(const char *, ...)) { struct cpu_info *ci, *tci; struct schedstate_percpu *spc; struct lwp *l; struct proc *p; CPU_INFO_ITERATOR cii; for (CPU_INFO_FOREACH(cii, ci)) { int i; spc = &ci->ci_schedstate; (*pr)("Run-queue (CPU = %u):\n", ci->ci_index); (*pr)(" pid.lid = %d.%d, r_count = %u, " "maxpri = %d, mlwp = %p\n", #ifdef MULTIPROCESSOR ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid, #else curlwp->l_proc->p_pid, curlwp->l_lid, #endif spc->spc_count, spc->spc_maxpriority, spc->spc_migrating); i = (PRI_COUNT >> BITMAP_SHIFT) - 1; do { uint32_t q; q = spc->spc_bitmap[i]; (*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q); } while (i--); } (*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n", "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS"); PROCLIST_FOREACH(p, &allproc) { (*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm); LIST_FOREACH(l, &p->p_lwps, l_sibling) { ci = l->l_cpu; tci = l->l_target_cpu; (*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n", (int)l->l_lid, l->l_priority, lwp_eprio(l), l->l_flag, l->l_stat == LSRUN ? "RQ" : (l->l_stat == LSSLEEP ? "SQ" : "-"), l, ci->ci_index, (tci ? tci->ci_index : -1), (u_int)(getticks() - l->l_rticks)); } } } #endif
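/*
 * Editorial sketch (not part of kern_runq.c): a standalone illustration of
 * the priority bitmap used by sched_enqueue()/sched_dequeue() above.  Each
 * priority level owns one bit; within a 32-bit word the MSB holds the lowest
 * priority of that word, so ffs() on a word directly yields the highest
 * populated priority.  The DEMO_* names are mine, and DEMO_PRI_COUNT is an
 * arbitrary demo value, not the kernel's PRI_COUNT.
 */
#include <stdint.h>
#include <stdio.h>
#include <strings.h>		/* ffs() */

#define DEMO_BITMAP_BITS	32
#define DEMO_BITMAP_SHIFT	5
#define DEMO_BITMAP_MSB		0x80000000U
#define DEMO_BITMAP_MASK	(DEMO_BITMAP_BITS - 1)
#define DEMO_PRI_COUNT		64	/* demo value only */

static uint32_t demo_bitmap[DEMO_PRI_COUNT >> DEMO_BITMAP_SHIFT];

static void
demo_mark(int prio)
{
	/* Same bit layout as sched_enqueue(): MSB >> (prio % 32). */
	demo_bitmap[prio >> DEMO_BITMAP_SHIFT] |=
	    DEMO_BITMAP_MSB >> (prio & DEMO_BITMAP_MASK);
}

static int
demo_maxpriority(void)
{
	/* Same scan as sched_dequeue(): top word down, ffs() within a word. */
	int i = (DEMO_PRI_COUNT >> DEMO_BITMAP_SHIFT) - 1;

	do {
		if (demo_bitmap[i] != 0) {
			int q = ffs(demo_bitmap[i]);
			return (i << DEMO_BITMAP_SHIFT) +
			    (DEMO_BITMAP_BITS - q);
		}
	} while (i--);
	return 0;
}

int
main(void)
{
	demo_mark(3);
	demo_mark(40);
	demo_mark(17);
	/* Prints 40: the highest of the marked priorities. */
	printf("highest runnable priority: %d\n", demo_maxpriority());
	return 0;
}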
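/*
 * Editorial sketch (not part of the kernel sources): the SYSCTL_SETUP block
 * at the end of kern_runq.c hangs the balancing knobs off a "sched" node
 * under CTL_KERN, so from userland they should be reachable as kern.sched.*.
 * A minimal sketch, assuming sysctlbyname(3) and the node names created in
 * that setup routine:
 */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	int val;
	size_t len = sizeof(val);

	/* Read the current LWP-stealing rate limit (milliseconds). */
	if (sysctlbyname("kern.sched.skim_interval", &val, &len, NULL, 0) == -1)
		err(1, "kern.sched.skim_interval");
	printf("skim_interval = %d ms\n", val);

	/*
	 * Writing works the same way, given sufficient privilege, e.g.
	 * sysctlbyname("kern.sched.min_catch", NULL, NULL, &newval,
	 * sizeof(newval)).
	 */
	return 0;
}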
/* $NetBSD: bus_space.c,v 1.47 2022/07/17 08:33:48 riastradh Exp $ */ /*- * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace * Simulation Facility, NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: bus_space.c,v 1.47 2022/07/17 08:33:48 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/extent.h> #include <sys/kmem.h> #include <uvm/uvm_extern.h> #include <dev/isa/isareg.h> #include <sys/bus.h> #include <machine/pio.h> #include <machine/isa_machdep.h> #ifdef XEN #include <xen/hypervisor.h> #endif /* * Macros for sanity-checking the aligned-ness of pointers passed to * bus space ops. These are not strictly necessary on the x86, but * could lead to performance improvements, and help catch problems * with drivers that would creep up on other architectures. */ #ifdef BUS_SPACE_DEBUG #define BUS_SPACE_ALIGNED_ADDRESS(p, t) \ ((((u_long)(p)) & (sizeof(t)-1)) == 0) #define BUS_SPACE_ADDRESS_SANITY(p, t, d) \ ({ \ if (BUS_SPACE_ALIGNED_ADDRESS((p), t) == 0) { \ printf("%s 0x%lx not aligned to %zu bytes %s:%d\n", \ d, (u_long)(p), sizeof(t), __FILE__, __LINE__); \ } \ (void) 0; \ }) #else #define BUS_SPACE_ADDRESS_SANITY(p,t,d) (void) 0 #endif /* BUS_SPACE_DEBUG */ /* * Extent maps to manage I/O and memory space. Allocate * storage for 8 regions in each, initially. Later, ioport_malloc_safe * will indicate that it's safe to use malloc() to dynamically allocate * region descriptors. * * N.B. At least two regions are _always_ allocated from the iomem * extent map; (0 -> ISA hole) and (end of ISA hole -> end of RAM). * * The extent maps are not static!
Machine-dependent ISA and EISA * routines need access to them for bus address space allocation. */ static long ioport_ex_storage[EXTENT_FIXED_STORAGE_SIZE(16) / sizeof(long)]; static long iomem_ex_storage[EXTENT_FIXED_STORAGE_SIZE(64) / sizeof(long)]; struct extent *ioport_ex; struct extent *iomem_ex; static int ioport_malloc_safe; static struct bus_space_tag x86_io = { .bst_type = X86_BUS_SPACE_IO }; static struct bus_space_tag x86_mem = { .bst_type = X86_BUS_SPACE_MEM }; bus_space_tag_t x86_bus_space_io = &x86_io; bus_space_tag_t x86_bus_space_mem = &x86_mem; int x86_mem_add_mapping(bus_addr_t, bus_size_t, int, bus_space_handle_t *); static inline bool x86_bus_space_is_io(bus_space_tag_t t) { return t->bst_type == X86_BUS_SPACE_IO; } static inline bool x86_bus_space_is_mem(bus_space_tag_t t) { return t->bst_type == X86_BUS_SPACE_MEM; } void x86_bus_space_init(void) { /* * Initialize the I/O port and I/O mem extent maps. * Note: we don't have to check the return value since * creation of a fixed extent map will never fail (since * descriptor storage has already been allocated). * * N.B. The iomem extent manages _all_ physical addresses * on the machine. When the amount of RAM is found, the two * extents of RAM are allocated from the map (0 -> ISA hole * and end of ISA hole -> end of RAM). */ ioport_ex = extent_create("ioport", 0x0, 0xffff, (void *)ioport_ex_storage, sizeof(ioport_ex_storage), EX_NOCOALESCE|EX_NOWAIT); iomem_ex = extent_create("iomem", 0x0, MAXIOMEM, (void *)iomem_ex_storage, sizeof(iomem_ex_storage), EX_NOCOALESCE|EX_NOWAIT); #ifdef XENPV /* We are privileged guest os - should have IO privileges. */ if (xendomain_is_privileged()) { struct physdev_set_iopl set_iopl; memset(&set_iopl, 0, sizeof(set_iopl)); set_iopl.iopl = 1; if (HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl) != 0) panic("Unable to obtain IOPL, " "despite being SIF_PRIVILEGED"); } #endif /* XENPV */ } void x86_bus_space_mallocok(void) { ioport_malloc_safe = 1; } int bus_space_map(bus_space_tag_t t, bus_addr_t bpa, bus_size_t size, int flags, bus_space_handle_t *bshp) { bus_space_reservation_t bsr; bus_space_tag_t it; int error; if ((t->bst_exists & BUS_SPACE_OVERRIDE_MAP) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bst_super) { if ((it->bst_present & BUS_SPACE_OVERRIDE_MAP) == 0) continue; return (*it->bst_ov->ov_space_map)(it->bst_ctx, t, bpa, size, flags, bshp); } error = bus_space_reserve(t, bpa, size, flags, &bsr); if (error != 0) return error; error = bus_space_reservation_map(t, &bsr, flags, bshp); if (error != 0) bus_space_release(t, &bsr); return error; } int bus_space_reservation_map(bus_space_tag_t t, bus_space_reservation_t *bsr, int flags, bus_space_handle_t *bshp) { bus_addr_t bpa; bus_size_t size; bus_space_tag_t it; if ((t->bst_exists & BUS_SPACE_OVERRIDE_RESERVATION_MAP) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bst_super) { if ((it->bst_present & BUS_SPACE_OVERRIDE_RESERVATION_MAP) == 0) continue; return (*it->bst_ov->ov_space_reservation_map)(it->bst_ctx, t, bsr, flags, bshp); } bpa = bus_space_reservation_addr(bsr); size = bus_space_reservation_size(bsr); /* * For I/O space, that's all she wrote. */ if (x86_bus_space_is_io(t)) { *bshp = bpa; return 0; } #ifndef XENPV if (bpa >= IOM_BEGIN && (bpa + size) != 0 && (bpa + size) <= IOM_END) { *bshp = (bus_space_handle_t)ISA_HOLE_VADDR(bpa); return 0; } #endif /* !XENPV */ /* * For memory space, map the bus physical address to * a kernel virtual address. 
*/ return x86_mem_add_mapping(bpa, size, flags, bshp); } int _x86_memio_map(bus_space_tag_t t, bus_addr_t bpa, bus_size_t size, int flags, bus_space_handle_t *bshp) { /* * For I/O space, just fill in the handle. */ if (x86_bus_space_is_io(t)) { if (flags & BUS_SPACE_MAP_LINEAR) return (EOPNOTSUPP); *bshp = bpa; return (0); } /* * For memory space, map the bus physical address to * a kernel virtual address. */ return x86_mem_add_mapping(bpa, size, flags, bshp); } int bus_space_reserve(bus_space_tag_t t, bus_addr_t bpa, bus_size_t size, int flags, bus_space_reservation_t *bsrp) { struct extent *ex; int error; bus_space_tag_t it; if ((t->bst_exists & BUS_SPACE_OVERRIDE_RESERVE) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bst_super) { if ((it->bst_present & BUS_SPACE_OVERRIDE_RESERVE) == 0) continue; return (*it->bst_ov->ov_space_reserve)(it->bst_ctx, t, bpa, size, flags, bsrp); } /* * Pick the appropriate extent map. */ if (x86_bus_space_is_io(t)) { if (flags & BUS_SPACE_MAP_LINEAR) return (EOPNOTSUPP); ex = ioport_ex; } else if (x86_bus_space_is_mem(t)) ex = iomem_ex; else panic("x86_memio_alloc: bad bus space tag"); /* * Before we go any further, let's make sure that this * region is available. */ error = extent_alloc_region(ex, bpa, size, EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0)); if (error != 0) return error; bus_space_reservation_init(bsrp, bpa, size); return 0; } int bus_space_reserve_subregion(bus_space_tag_t t, bus_addr_t rstart, bus_addr_t rend, const bus_size_t size, const bus_size_t alignment, const bus_size_t boundary, const int flags, bus_space_reservation_t *bsrp) { bus_space_reservation_t bsr; struct extent *ex; u_long bpa; int error; bus_space_tag_t it; if ((t->bst_exists & BUS_SPACE_OVERRIDE_RESERVE_SUBREGION) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bst_super) { if ((it->bst_present & BUS_SPACE_OVERRIDE_RESERVE_SUBREGION) == 0) continue; return (*it->bst_ov->ov_space_reserve_subregion)(it->bst_ctx, t, rstart, rend, size, alignment, boundary, flags, bsrp); } /* * Pick the appropriate extent map. */ if (x86_bus_space_is_io(t)) { if (flags & BUS_SPACE_MAP_LINEAR) return (EOPNOTSUPP); ex = ioport_ex; } else if (x86_bus_space_is_mem(t)) ex = iomem_ex; else panic("x86_memio_alloc: bad bus space tag"); /* * Sanity check the allocation against the extent's boundaries. */ rstart = MAX(rstart, ex->ex_start); rend = MIN(rend, ex->ex_end); if (rstart >= rend) panic("x86_memio_alloc: bad region start/end"); /* * Do the requested allocation. */ error = extent_alloc_subregion(ex, rstart, rend, size, alignment, boundary, EX_FAST | EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0), &bpa); if (error) return (error); bus_space_reservation_init(&bsr, bpa, size); *bsrp = bsr; return 0; } void bus_space_release(bus_space_tag_t t, bus_space_reservation_t *bsr) { struct extent *ex; bus_space_tag_t it; if ((t->bst_exists & BUS_SPACE_OVERRIDE_RELEASE) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bst_super) { if ((it->bst_present & BUS_SPACE_OVERRIDE_RELEASE) == 0) continue; (*it->bst_ov->ov_space_release)(it->bst_ctx, t, bsr); return; } /* * Pick the appropriate extent map. */ if (x86_bus_space_is_io(t)) { ex = ioport_ex; } else if (x86_bus_space_is_mem(t)) ex = iomem_ex; else panic("x86_memio_alloc: bad bus space tag"); if (extent_free(ex, bus_space_reservation_addr(bsr), bus_space_reservation_size(bsr), EX_NOWAIT | (ioport_malloc_safe ? 
EX_MALLOCOK : 0))) { printf("%s: pa 0x%jx, size 0x%jx\n", __func__, (uintmax_t)bus_space_reservation_addr(bsr), (uintmax_t)bus_space_reservation_size(bsr)); printf("%s: can't free region\n", __func__); } } int bus_space_alloc(bus_space_tag_t t, bus_addr_t rstart, bus_addr_t rend, bus_size_t size, bus_size_t alignment, bus_size_t boundary, int flags, bus_addr_t *bpap, bus_space_handle_t *bshp) { bus_space_reservation_t bsr; bus_space_tag_t it; int error; if ((t->bst_exists & BUS_SPACE_OVERRIDE_ALLOC) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bst_super) { if ((it->bst_present & BUS_SPACE_OVERRIDE_ALLOC) == 0) continue; return (*it->bst_ov->ov_space_alloc)(it->bst_ctx, t, rstart, rend, size, alignment, boundary, flags, bpap, bshp); } /* * Do the requested allocation. */ error = bus_space_reserve_subregion(t, rstart, rend, size, alignment, boundary, flags, &bsr); if (error != 0) return error; error = bus_space_reservation_map(t, &bsr, flags, bshp); if (error != 0) bus_space_release(t, &bsr); *bpap = bus_space_reservation_addr(&bsr); return error; } int x86_mem_add_mapping(bus_addr_t bpa, bus_size_t size, int flags, bus_space_handle_t *bshp) { paddr_t pa, endpa; vaddr_t va, sva; u_int pmapflags; pa = x86_trunc_page(bpa); endpa = x86_round_page(bpa + size); pmapflags = PMAP_NOCACHE; if ((flags & BUS_SPACE_MAP_CACHEABLE) != 0) pmapflags = 0; else if (flags & BUS_SPACE_MAP_PREFETCHABLE) pmapflags = PMAP_WRITE_COMBINE; #ifdef DIAGNOSTIC if (endpa != 0 && endpa <= pa) panic("x86_mem_add_mapping: overflow"); #endif #ifdef XENPV if (bpa >= IOM_BEGIN && (bpa + size) != 0 && (bpa + size) <= IOM_END) { sva = (vaddr_t)ISA_HOLE_VADDR(pa); } else #endif /* XENPV */ { sva = uvm_km_alloc(kernel_map, endpa - pa, 0, UVM_KMF_VAONLY | UVM_KMF_NOWAIT); if (sva == 0) return (ENOMEM); } *bshp = (bus_space_handle_t)(sva + (bpa & PGOFSET)); for (va = sva; pa != endpa; pa += PAGE_SIZE, va += PAGE_SIZE) { pmap_kenter_ma(va, pa, VM_PROT_READ | VM_PROT_WRITE, pmapflags); } pmap_update(pmap_kernel()); return 0; } bool bus_space_is_equal(bus_space_tag_t t1, bus_space_tag_t t2) { if (t1 == NULL || t2 == NULL) return false; return t1->bst_type == t2->bst_type; } /* * void _x86_memio_unmap(bus_space_tag bst, bus_space_handle bsh, * bus_size_t size, bus_addr_t *adrp) * * This function unmaps memory- or io-space mapped by the function * _x86_memio_map(). This function works nearly as same as * x86_memio_unmap(), but this function does not ask kernel * built-in extents and returns physical address of the bus space, * for the convenience of the extra extent manager. */ void _x86_memio_unmap(bus_space_tag_t t, bus_space_handle_t bsh, bus_size_t size, bus_addr_t *adrp) { u_long va, endva; bus_addr_t bpa; /* * Find the correct extent and bus physical address. */ if (x86_bus_space_is_io(t)) { bpa = bsh; } else if (x86_bus_space_is_mem(t)) { if (bsh >= atdevbase && (bsh + size) != 0 && (bsh + size) <= (atdevbase + IOM_SIZE)) { bpa = (bus_addr_t)ISA_PHYSADDR(bsh); } else { va = x86_trunc_page(bsh); endva = x86_round_page(bsh + size); #ifdef DIAGNOSTIC if (endva <= va) { panic("_x86_memio_unmap: overflow"); } #endif if (pmap_extract_ma(pmap_kernel(), va, &bpa) == FALSE) { panic("_x86_memio_unmap:" " wrong virtual address"); } bpa += (bsh & PGOFSET); pmap_kremove(va, endva - va); pmap_update(pmap_kernel()); /* * Free the kernel virtual mapping. 
*/ uvm_km_free(kernel_map, va, endva - va, UVM_KMF_VAONLY); } } else { panic("_x86_memio_unmap: bad bus space tag"); } if (adrp != NULL) { *adrp = bpa; } } static void bus_space_reservation_unmap1(bus_space_tag_t t, const bus_space_handle_t bsh, const bus_size_t size, bus_addr_t *bpap) { u_long va, endva; bus_addr_t bpa; /* * Find the correct extent and bus physical address. */ if (x86_bus_space_is_io(t)) { bpa = bsh; } else if (x86_bus_space_is_mem(t)) { if (bsh >= atdevbase && (bsh + size) != 0 && (bsh + size) <= (atdevbase + IOM_SIZE)) { bpa = (bus_addr_t)ISA_PHYSADDR(bsh); goto ok; } va = x86_trunc_page(bsh); endva = x86_round_page(bsh + size); #ifdef DIAGNOSTIC if (endva <= va) panic("x86_memio_unmap: overflow"); #endif (void) pmap_extract_ma(pmap_kernel(), va, &bpa); bpa += (bsh & PGOFSET); pmap_kremove(va, endva - va); pmap_update(pmap_kernel()); /* * Free the kernel virtual mapping. */ uvm_km_free(kernel_map, va, endva - va, UVM_KMF_VAONLY); } else panic("x86_memio_unmap: bad bus space tag"); ok: if (bpap != NULL) *bpap = bpa; } void bus_space_reservation_unmap(bus_space_tag_t t, const bus_space_handle_t bsh, const bus_size_t size) { bus_space_tag_t it; if ((t->bst_exists & BUS_SPACE_OVERRIDE_RESERVATION_UNMAP) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bst_super) { if ((it->bst_present & BUS_SPACE_OVERRIDE_RESERVATION_UNMAP) == 0) continue; (*it->bst_ov->ov_space_reservation_unmap)(it->bst_ctx, t, bsh, size); return; } bus_space_reservation_unmap1(t, bsh, size, NULL); } void bus_space_unmap(bus_space_tag_t t, const bus_space_handle_t bsh, const bus_size_t size) { bus_addr_t addr; bus_space_reservation_t bsr; bus_space_tag_t it; if ((t->bst_exists & BUS_SPACE_OVERRIDE_UNMAP) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bst_super) { if ((it->bst_present & BUS_SPACE_OVERRIDE_UNMAP) == 0) continue; (*it->bst_ov->ov_space_unmap)(it->bst_ctx, t, bsh, size); return; } bus_space_reservation_unmap1(t, bsh, size, &addr); bus_space_reservation_init(&bsr, addr, size); bus_space_release(t, &bsr); } void bus_space_free(bus_space_tag_t t, bus_space_handle_t bsh, bus_size_t size) { bus_space_tag_t it; if ((t->bst_exists & BUS_SPACE_OVERRIDE_FREE) == 0) ; /* skip override */ else for (it = t; it != NULL; it = it->bst_super) { if ((it->bst_present & BUS_SPACE_OVERRIDE_FREE) == 0) continue; (*it->bst_ov->ov_space_free)(it->bst_ctx, t, bsh, size); return; } /* bus_space_unmap() does all that we need to do. */ bus_space_unmap(t, bsh, size); } int bus_space_subregion(bus_space_tag_t t, bus_space_handle_t bsh, bus_size_t offset, bus_size_t size, bus_space_handle_t *nbshp) { *nbshp = bsh + offset; return (0); } paddr_t bus_space_mmap(bus_space_tag_t t, bus_addr_t addr, off_t off, int prot, int flags) { paddr_t pflags = 0; /* Can't mmap I/O space. */ if (x86_bus_space_is_io(t)) return (-1); /* * "addr" is the base address of the device we're mapping. * "off" is the offset into that device. * * Note we are called for each "page" in the device that * the upper layers want to map. 
*/ if (flags & BUS_SPACE_MAP_PREFETCHABLE) pflags |= X86_MMAP_FLAG_PREFETCH; return x86_btop(addr + off) | (pflags << X86_MMAP_FLAG_SHIFT); } void bus_space_set_multi_1(bus_space_tag_t t, bus_space_handle_t h, bus_size_t o, uint8_t v, size_t c) { vaddr_t addr = h + o; if (x86_bus_space_is_io(t)) while (c--) outb(addr, v); else while (c--) *(volatile uint8_t *)(addr) = v; } void bus_space_set_multi_2(bus_space_tag_t t, bus_space_handle_t h, bus_size_t o, uint16_t v, size_t c) { vaddr_t addr = h + o; BUS_SPACE_ADDRESS_SANITY(addr, uint16_t, "bus addr"); if (x86_bus_space_is_io(t)) while (c--) outw(addr, v); else while (c--) *(volatile uint16_t *)(addr) = v; } void bus_space_set_multi_4(bus_space_tag_t t, bus_space_handle_t h, bus_size_t o, uint32_t v, size_t c) { vaddr_t addr = h + o; BUS_SPACE_ADDRESS_SANITY(addr, uint32_t, "bus addr"); if (x86_bus_space_is_io(t)) while (c--) outl(addr, v); else while (c--) *(volatile uint32_t *)(addr) = v; } void bus_space_set_region_1(bus_space_tag_t t, bus_space_handle_t h, bus_size_t o, uint8_t v, size_t c) { vaddr_t addr = h + o; if (x86_bus_space_is_io(t)) for (; c != 0; c--, addr++) outb(addr, v); else for (; c != 0; c--, addr++) *(volatile uint8_t *)(addr) = v; } void bus_space_set_region_2(bus_space_tag_t t, bus_space_handle_t h, bus_size_t o, uint16_t v, size_t c) { vaddr_t addr = h + o; BUS_SPACE_ADDRESS_SANITY(addr, uint16_t, "bus addr"); if (x86_bus_space_is_io(t)) for (; c != 0; c--, addr += 2) outw(addr, v); else for (; c != 0; c--, addr += 2) *(volatile uint16_t *)(addr) = v; } void bus_space_set_region_4(bus_space_tag_t t, bus_space_handle_t h, bus_size_t o, uint32_t v, size_t c) { vaddr_t addr = h + o; BUS_SPACE_ADDRESS_SANITY(addr, uint32_t, "bus addr"); if (x86_bus_space_is_io(t)) for (; c != 0; c--, addr += 4) outl(addr, v); else for (; c != 0; c--, addr += 4) *(volatile uint32_t *)(addr) = v; } void bus_space_copy_region_1(bus_space_tag_t t, bus_space_handle_t h1, bus_size_t o1, bus_space_handle_t h2, bus_size_t o2, size_t c) { vaddr_t addr1 = h1 + o1; vaddr_t addr2 = h2 + o2; if (x86_bus_space_is_io(t)) { if (addr1 >= addr2) { /* src after dest: copy forward */ for (; c != 0; c--, addr1++, addr2++) outb(addr2, inb(addr1)); } else { /* dest after src: copy backwards */ for (addr1 += (c - 1), addr2 += (c - 1); c != 0; c--, addr1--, addr2--) outb(addr2, inb(addr1)); } } else { if (addr1 >= addr2) { /* src after dest: copy forward */ for (; c != 0; c--, addr1++, addr2++) *(volatile uint8_t *)(addr2) = *(volatile uint8_t *)(addr1); } else { /* dest after src: copy backwards */ for (addr1 += (c - 1), addr2 += (c - 1); c != 0; c--, addr1--, addr2--) *(volatile uint8_t *)(addr2) = *(volatile uint8_t *)(addr1); } } } void bus_space_copy_region_2(bus_space_tag_t t, bus_space_handle_t h1, bus_size_t o1, bus_space_handle_t h2, bus_size_t o2, size_t c) { vaddr_t addr1 = h1 + o1; vaddr_t addr2 = h2 + o2; BUS_SPACE_ADDRESS_SANITY(addr1, uint16_t, "bus addr 1"); BUS_SPACE_ADDRESS_SANITY(addr2, uint16_t, "bus addr 2"); if (x86_bus_space_is_io(t)) { if (addr1 >= addr2) { /* src after dest: copy forward */ for (; c != 0; c--, addr1 += 2, addr2 += 2) outw(addr2, inw(addr1)); } else { /* dest after src: copy backwards */ for (addr1 += 2 * (c - 1), addr2 += 2 * (c - 1); c != 0; c--, addr1 -= 2, addr2 -= 2) outw(addr2, inw(addr1)); } } else { if (addr1 >= addr2) { /* src after dest: copy forward */ for (; c != 0; c--, addr1 += 2, addr2 += 2) *(volatile uint16_t *)(addr2) = *(volatile uint16_t *)(addr1); } else { /* dest after src: copy backwards */ for 
(addr1 += 2 * (c - 1), addr2 += 2 * (c - 1); c != 0; c--, addr1 -= 2, addr2 -= 2) *(volatile uint16_t *)(addr2) = *(volatile uint16_t *)(addr1); } } } void bus_space_copy_region_4(bus_space_tag_t t, bus_space_handle_t h1, bus_size_t o1, bus_space_handle_t h2, bus_size_t o2, size_t c) { vaddr_t addr1 = h1 + o1; vaddr_t addr2 = h2 + o2; BUS_SPACE_ADDRESS_SANITY(addr1, uint32_t, "bus addr 1"); BUS_SPACE_ADDRESS_SANITY(addr2, uint32_t, "bus addr 2"); if (x86_bus_space_is_io(t)) { if (addr1 >= addr2) { /* src after dest: copy forward */ for (; c != 0; c--, addr1 += 4, addr2 += 4) outl(addr2, inl(addr1)); } else { /* dest after src: copy backwards */ for (addr1 += 4 * (c - 1), addr2 += 4 * (c - 1); c != 0; c--, addr1 -= 4, addr2 -= 4) outl(addr2, inl(addr1)); } } else { if (addr1 >= addr2) { /* src after dest: copy forward */ for (; c != 0; c--, addr1 += 4, addr2 += 4) *(volatile uint32_t *)(addr2) = *(volatile uint32_t *)(addr1); } else { /* dest after src: copy backwards */ for (addr1 += 4 * (c - 1), addr2 += 4 * (c - 1); c != 0; c--, addr1 -= 4, addr2 -= 4) *(volatile uint32_t *)(addr2) = *(volatile uint32_t *)(addr1); } } } void bus_space_barrier(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, bus_size_t len, int flags) { /* I/O instructions always happen in program order. */ if (x86_bus_space_is_io(tag)) return; /* * For default mappings, which are mapped with UC-type memory * regions, all loads and stores are issued in program order. * * For BUS_SPACE_MAP_PREFETCHABLE mappings, which are mapped * with WC-type memory regions, loads and stores may be issued * out of order, potentially requiring any of the three x86 * fences -- LFENCE, SFENCE, MFENCE. * * For BUS_SPACE_MAP_CACHEABLE mappings, which are mapped with * WB-type memory regions (like normal memory), store/load may * be reordered to load/store, potentially requiring MFENCE. * * We can't easily tell here how the region was mapped (without * consulting the page tables), so just issue the fence * unconditionally. Chances are either it's necessary or the * cost is small in comparison to device register I/O. * * Reference: * * AMD64 Architecture Programmer's Manual, Volume 2: * System Programming, 24593--Rev. 3.38--November 2021, * Sec. 7.4.2 Memory Barrier Interaction with Memory * Types, Table 7-3, p. 196. * https://web.archive.org/web/20220625040004/https://www.amd.com/system/files/TechDocs/24593.pdf#page=256 */ switch (flags) { case 0: break; case BUS_SPACE_BARRIER_READ: x86_lfence(); break; case BUS_SPACE_BARRIER_WRITE: x86_sfence(); break; case BUS_SPACE_BARRIER_READ|BUS_SPACE_BARRIER_WRITE: x86_mfence(); break; default: panic("unknown bus space barrier: 0x%x", (unsigned)flags); } } void * bus_space_vaddr(bus_space_tag_t tag, bus_space_handle_t bsh) { return x86_bus_space_is_mem(tag) ? 
(void *)bsh : NULL; } static const void * bit_to_function_pointer(const struct bus_space_overrides *ov, uint64_t bit) { switch (bit) { case BUS_SPACE_OVERRIDE_MAP: return ov->ov_space_map; case BUS_SPACE_OVERRIDE_UNMAP: return ov->ov_space_unmap; case BUS_SPACE_OVERRIDE_ALLOC: return ov->ov_space_alloc; case BUS_SPACE_OVERRIDE_FREE: return ov->ov_space_free; case BUS_SPACE_OVERRIDE_RESERVE: return ov->ov_space_reserve; case BUS_SPACE_OVERRIDE_RELEASE: return ov->ov_space_release; case BUS_SPACE_OVERRIDE_RESERVATION_MAP: return ov->ov_space_reservation_map; case BUS_SPACE_OVERRIDE_RESERVATION_UNMAP: return ov->ov_space_reservation_unmap; case BUS_SPACE_OVERRIDE_RESERVE_SUBREGION: return ov->ov_space_reserve_subregion; default: return NULL; } } void bus_space_tag_destroy(bus_space_tag_t bst) { kmem_free(bst, sizeof(struct bus_space_tag)); } int bus_space_tag_create(bus_space_tag_t obst, const uint64_t present, const uint64_t extpresent, const struct bus_space_overrides *ov, void *ctx, bus_space_tag_t *bstp) { uint64_t bit, bits, nbits; bus_space_tag_t bst; const void *fp; if (ov == NULL || present == 0 || extpresent != 0) return EINVAL; bst = kmem_alloc(sizeof(struct bus_space_tag), KM_SLEEP); bst->bst_super = obst; bst->bst_type = obst->bst_type; for (bits = present; bits != 0; bits = nbits) { nbits = bits & (bits - 1); bit = nbits ^ bits; if ((fp = bit_to_function_pointer(ov, bit)) == NULL) { printf("%s: missing bit %" PRIx64 "\n", __func__, bit); goto einval; } } bst->bst_ov = ov; bst->bst_exists = obst->bst_exists | present; bst->bst_present = present; bst->bst_ctx = ctx; *bstp = bst; return 0; einval: kmem_free(bst, sizeof(struct bus_space_tag)); return EINVAL; }
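/*
 * Illustrative sketch (editorial addition, not part of the original
 * file): how a driver typically consumes the bus_space(9) interface
 * implemented above.  The base address, size, register offsets and the
 * example_attach_regs() name are hypothetical; the read/write accessors
 * (bus_space_read_4() and friends) come from the bus_space(9) headers
 * rather than from this file.
 */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/bus.h>

#define EX_REG_STATUS	0x00	/* hypothetical status register */
#define EX_REG_CMD	0x04	/* hypothetical command register */
#define EX_CMD_RESET	0x01	/* hypothetical reset command */

static int
example_attach_regs(bus_space_tag_t bst, bus_addr_t base, bus_size_t size,
    bus_space_handle_t *bshp)
{
	uint32_t status;
	int error;

	/*
	 * bus_space_map() above reserves the region in the ioport/iomem
	 * extent and, for memory space, enters a kernel mapping.
	 */
	error = bus_space_map(bst, base, size, 0, bshp);
	if (error != 0)
		return error;

	/* Default (UC) mappings keep register accesses in program order. */
	status = bus_space_read_4(bst, *bshp, EX_REG_STATUS);
	bus_space_write_4(bst, *bshp, EX_REG_CMD, EX_CMD_RESET);

	/*
	 * A write barrier is needed only for prefetchable (WC) or
	 * cacheable (WB) mappings, cf. bus_space_barrier() above; for a
	 * default UC mapping it is redundant but harmless.
	 */
	bus_space_barrier(bst, *bshp, EX_REG_CMD, 4, BUS_SPACE_BARRIER_WRITE);

	if (status == 0xffffffff) {
		/* All-ones usually means no device is responding. */
		bus_space_unmap(bst, *bshp, size);
		return ENXIO;
	}
	return 0;
}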
/* $NetBSD: sha2.c,v 1.26 2024/01/20 14:55:02 christos Exp $ */ /* $KAME: sha2.c,v 1.9 2003/07/20 00:28:38 itojun Exp $ */ /* * sha2.c * * Version 1.0.0beta1 * * Written by Aaron D. Gifford <me@aarongifford.com> * * Copyright 2000 Aaron D. Gifford. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the copyright holder nor the names of contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) AND CONTRIBUTOR(S) ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR(S) OR CONTRIBUTOR(S) BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* */ #if HAVE_NBTOOL_CONFIG_H #include "nbtool_config.h" #endif #include <sys/cdefs.h> #if defined(_KERNEL) || defined(_STANDALONE) __KERNEL_RCSID(0, "$NetBSD: sha2.c,v 1.26 2024/01/20 14:55:02 christos Exp $"); #include <sys/param.h> /* XXX: to pull <machine/macros.h> for vax memset(9) */ #include <lib/libkern/libkern.h> #else #if defined(LIBC_SCCS) && !defined(lint) __RCSID("$NetBSD: sha2.c,v 1.26 2024/01/20 14:55:02 christos Exp $"); #endif /* LIBC_SCCS and not lint */ #include "namespace.h" #include <string.h> #endif #ifndef _LIBC_INTERNAL #define _LIBC_INTERNAL #endif #include <sys/types.h> #include <sys/sha2.h> #if HAVE_SYS_ENDIAN_H # include <sys/endian.h> #endif /*** SHA-256/384/512 Various Length Definitions ***********************/ /* NOTE: Most of these are in sha2.h */ #define SHA256_SHORT_BLOCK_LENGTH (SHA256_BLOCK_LENGTH - 8) #define SHA384_SHORT_BLOCK_LENGTH (SHA384_BLOCK_LENGTH - 16) #define SHA512_SHORT_BLOCK_LENGTH (SHA512_BLOCK_LENGTH - 16) /* * Macro for incrementally adding the unsigned 64-bit integer n to the * unsigned 128-bit integer (represented using a two-element array of * 64-bit words): */ #define ADDINC128(w,n) { \ (w)[0] += (uint64_t)(n); \ if ((w)[0] < (n)) { \ (w)[1]++; \ } \ } /*** THE SIX LOGICAL FUNCTIONS ****************************************/ /* * Bit shifting and rotation (used by the six SHA-XYZ logical functions: * * NOTE: The naming of R and S appears backwards here (R is a SHIFT and * S is a ROTATION) because the SHA-256/384/512 description document * (see http://csrc.nist.gov/cryptval/shs/sha256-384-512.pdf) uses this * same "backwards" definition. */ /* Shift-right (used in SHA-256, SHA-384, and SHA-512): */ #define R(b,x) ((x) >> (b)) /* 32-bit Rotate-right (used in SHA-256): */ #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b)))) /* 64-bit Rotate-right (used in SHA-384 and SHA-512): */ #define S64(b,x) (((x) >> (b)) | ((x) << (64 - (b)))) /* Two of six logical functions used in SHA-256, SHA-384, and SHA-512: */ #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) /* Four of six logical functions used in SHA-256: */ #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x))) #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x))) /* Four of six logical functions used in SHA-384 and SHA-512: */ #define Sigma0_512(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x))) #define Sigma1_512(x) (S64(14, (x)) ^ S64(18, (x)) ^ S64(41, (x))) #define sigma0_512(x) (S64( 1, (x)) ^ S64( 8, (x)) ^ R( 7, (x))) #define sigma1_512(x) (S64(19, (x)) ^ S64(61, (x)) ^ R( 6, (x))) /*** INTERNAL FUNCTION PROTOTYPES *************************************/ /* NOTE: These should not be accessed directly from outside this * library -- they are intended for private internal visibility/use * only. 
*/ static void SHA512_Last(SHA512_CTX *); /*** SHA-XYZ INITIAL HASH VALUES AND CONSTANTS ************************/ /* Hash constant words K for SHA-256: */ static const uint32_t K256[64] = { 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL }; /* Initial hash value H for SHA-224: */ static const uint32_t sha224_initial_hash_value[8] = { 0xc1059ed8UL, 0x367cd507UL, 0x3070dd17UL, 0xf70e5939UL, 0xffc00b31UL, 0x68581511UL, 0x64f98fa7UL, 0xbefa4fa4UL }; /* Initial hash value H for SHA-256: */ static const uint32_t sha256_initial_hash_value[8] = { 0x6a09e667UL, 0xbb67ae85UL, 0x3c6ef372UL, 0xa54ff53aUL, 0x510e527fUL, 0x9b05688cUL, 0x1f83d9abUL, 0x5be0cd19UL }; /* Hash constant words K for SHA-384 and SHA-512: */ static const uint64_t K512[80] = { 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL }; /* Initial hash value H for SHA-384 */ static const uint64_t sha384_initial_hash_value[8] = { 0xcbbb9d5dc1059ed8ULL, 
0x629a292a367cd507ULL, 0x9159015a3070dd17ULL, 0x152fecd8f70e5939ULL, 0x67332667ffc00b31ULL, 0x8eb44a8768581511ULL, 0xdb0c2e0d64f98fa7ULL, 0x47b5481dbefa4fa4ULL }; /* Initial hash value H for SHA-512 */ static const uint64_t sha512_initial_hash_value[8] = { 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL }; #if !defined(_KERNEL) && !defined(_STANDALONE) #if defined(__weak_alias) __weak_alias(SHA224_Init,_SHA224_Init) __weak_alias(SHA224_Update,_SHA224_Update) __weak_alias(SHA224_Final,_SHA224_Final) __weak_alias(SHA224_Transform,_SHA224_Transform) __weak_alias(SHA256_Init,_SHA256_Init) __weak_alias(SHA256_Update,_SHA256_Update) __weak_alias(SHA256_Final,_SHA256_Final) __weak_alias(SHA256_Transform,_SHA256_Transform) __weak_alias(SHA384_Init,_SHA384_Init) __weak_alias(SHA384_Update,_SHA384_Update) __weak_alias(SHA384_Final,_SHA384_Final) __weak_alias(SHA384_Transform,_SHA384_Transform) __weak_alias(SHA512_Init,_SHA512_Init) __weak_alias(SHA512_Update,_SHA512_Update) __weak_alias(SHA512_Final,_SHA512_Final) __weak_alias(SHA512_Transform,_SHA512_Transform) #endif #endif /*** SHA-256: *********************************************************/ int SHA256_Init(SHA256_CTX *context) { if (context == NULL) return 1; memcpy(context->state, sha256_initial_hash_value, (size_t)(SHA256_DIGEST_LENGTH)); memset(context->buffer, 0, (size_t)(SHA256_BLOCK_LENGTH)); context->bitcount = 0; return 1; } #ifdef SHA2_UNROLL_TRANSFORM /* Unrolled SHA-256 round macros: */ #define ROUND256_0_TO_15(a,b,c,d,e,f,g,h) \ W256[j] = be32dec(data); \ ++data; \ T1 = (h) + Sigma1_256(e) + Ch((e), (f), (g)) + \ K256[j] + W256[j]; \ (d) += T1; \ (h) = T1 + Sigma0_256(a) + Maj((a), (b), (c)); \ j++ #define ROUND256(a,b,c,d,e,f,g,h) \ s0 = W256[(j+1)&0x0f]; \ s0 = sigma0_256(s0); \ s1 = W256[(j+14)&0x0f]; \ s1 = sigma1_256(s1); \ T1 = (h) + Sigma1_256(e) + Ch((e), (f), (g)) + K256[j] + \ (W256[j&0x0f] += s1 + W256[(j+9)&0x0f] + s0); \ (d) += T1; \ (h) = T1 + Sigma0_256(a) + Maj((a), (b), (c)); \ j++ void SHA256_Transform(SHA256_CTX *context, const uint32_t *data) { uint32_t a, b, c, d, e, f, g, h, s0, s1; uint32_t T1, *W256; int j; W256 = (uint32_t *)context->buffer; /* Initialize registers with the prev. 
intermediate value */ a = context->state[0]; b = context->state[1]; c = context->state[2]; d = context->state[3]; e = context->state[4]; f = context->state[5]; g = context->state[6]; h = context->state[7]; j = 0; do { /* Rounds 0 to 15 (unrolled): */ ROUND256_0_TO_15(a,b,c,d,e,f,g,h); ROUND256_0_TO_15(h,a,b,c,d,e,f,g); ROUND256_0_TO_15(g,h,a,b,c,d,e,f); ROUND256_0_TO_15(f,g,h,a,b,c,d,e); ROUND256_0_TO_15(e,f,g,h,a,b,c,d); ROUND256_0_TO_15(d,e,f,g,h,a,b,c); ROUND256_0_TO_15(c,d,e,f,g,h,a,b); ROUND256_0_TO_15(b,c,d,e,f,g,h,a); } while (j < 16); /* Now for the remaining rounds to 64: */ do { ROUND256(a,b,c,d,e,f,g,h); ROUND256(h,a,b,c,d,e,f,g); ROUND256(g,h,a,b,c,d,e,f); ROUND256(f,g,h,a,b,c,d,e); ROUND256(e,f,g,h,a,b,c,d); ROUND256(d,e,f,g,h,a,b,c); ROUND256(c,d,e,f,g,h,a,b); ROUND256(b,c,d,e,f,g,h,a); } while (j < 64); /* Compute the current intermediate hash value */ context->state[0] += a; context->state[1] += b; context->state[2] += c; context->state[3] += d; context->state[4] += e; context->state[5] += f; context->state[6] += g; context->state[7] += h; /* Clean up */ a = b = c = d = e = f = g = h = T1 = 0; } #else /* SHA2_UNROLL_TRANSFORM */ void SHA256_Transform(SHA256_CTX *context, const uint32_t *data) { uint32_t a, b, c, d, e, f, g, h, s0, s1; uint32_t T1, T2, *W256; int j; W256 = (uint32_t *)(void *)context->buffer; /* Initialize registers with the prev. intermediate value */ a = context->state[0]; b = context->state[1]; c = context->state[2]; d = context->state[3]; e = context->state[4]; f = context->state[5]; g = context->state[6]; h = context->state[7]; j = 0; do { W256[j] = be32dec(data); ++data; /* Apply the SHA-256 compression function to update a..h */ T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[j] + W256[j]; T2 = Sigma0_256(a) + Maj(a, b, c); h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2; j++; } while (j < 16); do { /* Part of the message block expansion: */ s0 = W256[(j+1)&0x0f]; s0 = sigma0_256(s0); s1 = W256[(j+14)&0x0f]; s1 = sigma1_256(s1); /* Apply the SHA-256 compression function to update a..h */ T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[j] + (W256[j&0x0f] += s1 + W256[(j+9)&0x0f] + s0); T2 = Sigma0_256(a) + Maj(a, b, c); h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2; j++; } while (j < 64); /* Compute the current intermediate hash value */ context->state[0] += a; context->state[1] += b; context->state[2] += c; context->state[3] += d; context->state[4] += e; context->state[5] += f; context->state[6] += g; context->state[7] += h; /* Clean up */ a = b = c = d = e = f = g = h = T1 = T2 = 0; } #endif /* SHA2_UNROLL_TRANSFORM */ int SHA256_Update(SHA256_CTX *context, const uint8_t *data, size_t len) { unsigned int freespace, usedspace; if (len == 0) { /* Calling with no data is valid - we do nothing */ return 1; } usedspace = (unsigned int)((context->bitcount >> 3) % SHA256_BLOCK_LENGTH); if (usedspace > 0) { /* Calculate how much free space is available in the buffer */ freespace = SHA256_BLOCK_LENGTH - usedspace; if (len >= freespace) { /* Fill the buffer completely and process it */ memcpy(&context->buffer[usedspace], data, (size_t)(freespace)); context->bitcount += freespace << 3; len -= freespace; data += freespace; SHA256_Transform(context, (uint32_t *)(void *)context->buffer); } else { /* The buffer is not yet full */ memcpy(&context->buffer[usedspace], data, len); context->bitcount += len << 3; /* Clean up: */ usedspace = freespace = 0; return 1; } } /* * Process as many complete blocks as possible. 
* * Check alignment of the data pointer. If it is 32bit aligned, * SHA256_Transform can be called directly on the data stream, * otherwise enforce the alignment by copy into the buffer. */ if ((uintptr_t)data % 4 == 0) { while (len >= SHA256_BLOCK_LENGTH) { SHA256_Transform(context, (const uint32_t *)(const void *)data); context->bitcount += SHA256_BLOCK_LENGTH << 3; len -= SHA256_BLOCK_LENGTH; data += SHA256_BLOCK_LENGTH; } } else { while (len >= SHA256_BLOCK_LENGTH) { memcpy(context->buffer, data, SHA256_BLOCK_LENGTH); SHA256_Transform(context, (const uint32_t *)(const void *)context->buffer); context->bitcount += SHA256_BLOCK_LENGTH << 3; len -= SHA256_BLOCK_LENGTH; data += SHA256_BLOCK_LENGTH; } } if (len > 0) { /* There's left-overs, so save 'em */ memcpy(context->buffer, data, len); context->bitcount += len << 3; } /* Clean up: */ usedspace = freespace = 0; return 1; } static int SHA224_256_Final(uint8_t digest[], SHA256_CTX *context, size_t len) { unsigned int usedspace; size_t i; /* If no digest buffer is passed, we don't bother doing this: */ if (digest != NULL) { usedspace = (unsigned int)((context->bitcount >> 3) % SHA256_BLOCK_LENGTH); context->bitcount = htobe64(context->bitcount); if (usedspace > 0) { /* Begin padding with a 1 bit: */ context->buffer[usedspace++] = 0x80; if (usedspace <= SHA256_SHORT_BLOCK_LENGTH) { /* Set-up for the last transform: */ memset(&context->buffer[usedspace], 0, (size_t)(SHA256_SHORT_BLOCK_LENGTH - usedspace)); } else { if (usedspace < SHA256_BLOCK_LENGTH) { memset(&context->buffer[usedspace], 0, (size_t)(SHA256_BLOCK_LENGTH - usedspace)); } /* Do second-to-last transform: */ SHA256_Transform(context, (uint32_t *)(void *)context->buffer); /* And set-up for the last transform: */ memset(context->buffer, 0, (size_t)(SHA256_SHORT_BLOCK_LENGTH)); } } else { /* Set-up for the last transform: */ memset(context->buffer, 0, (size_t)(SHA256_SHORT_BLOCK_LENGTH)); /* Begin padding with a 1 bit: */ *context->buffer = 0x80; } /* Set the bit count: */ memcpy(&context->buffer[SHA256_SHORT_BLOCK_LENGTH], &context->bitcount, sizeof(context->bitcount)); /* Final transform: */ SHA256_Transform(context, (uint32_t *)(void *)context->buffer); for (i = 0; i < len / 4; i++) be32enc(digest + 4 * i, context->state[i]); } /* Clean up state data: */ memset(context, 0, sizeof(*context)); usedspace = 0; return 1; } int SHA256_Final(uint8_t digest[SHA256_DIGEST_LENGTH], SHA256_CTX *context) { return SHA224_256_Final(digest, context, SHA256_DIGEST_LENGTH); } /*** SHA-224: *********************************************************/ int SHA224_Init(SHA224_CTX *context) { if (context == NULL) return 1; /* The state and buffer size are driven by SHA256, not by SHA224. 
*/ memcpy(context->state, sha224_initial_hash_value, (size_t)(SHA256_DIGEST_LENGTH)); memset(context->buffer, 0, (size_t)(SHA256_BLOCK_LENGTH)); context->bitcount = 0; return 1; } int SHA224_Update(SHA224_CTX *context, const uint8_t *data, size_t len) { return SHA256_Update((SHA256_CTX *)context, data, len); } void SHA224_Transform(SHA224_CTX *context, const uint32_t *data) { SHA256_Transform((SHA256_CTX *)context, data); } int SHA224_Final(uint8_t digest[SHA224_DIGEST_LENGTH], SHA224_CTX *context) { return SHA224_256_Final(digest, (SHA256_CTX *)context, SHA224_DIGEST_LENGTH); } /*** SHA-512: *********************************************************/ int SHA512_Init(SHA512_CTX *context) { if (context == NULL) return 1; memcpy(context->state, sha512_initial_hash_value, (size_t)(SHA512_DIGEST_LENGTH)); memset(context->buffer, 0, (size_t)(SHA512_BLOCK_LENGTH)); context->bitcount[0] = context->bitcount[1] = 0; return 1; } #ifdef SHA2_UNROLL_TRANSFORM /* Unrolled SHA-512 round macros: */ #define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \ W512[j] = be64dec(data); \ ++data; \ T1 = (h) + Sigma1_512(e) + Ch((e), (f), (g)) + \ K512[j] + W512[j]; \ (d) += T1, \ (h) = T1 + Sigma0_512(a) + Maj((a), (b), (c)), \ j++ #define ROUND512(a,b,c,d,e,f,g,h) \ s0 = W512[(j+1)&0x0f]; \ s0 = sigma0_512(s0); \ s1 = W512[(j+14)&0x0f]; \ s1 = sigma1_512(s1); \ T1 = (h) + Sigma1_512(e) + Ch((e), (f), (g)) + K512[j] + \ (W512[j&0x0f] += s1 + W512[(j+9)&0x0f] + s0); \ (d) += T1; \ (h) = T1 + Sigma0_512(a) + Maj((a), (b), (c)); \ j++ void SHA512_Transform(SHA512_CTX *context, const uint64_t *data) { uint64_t a, b, c, d, e, f, g, h, s0, s1; uint64_t T1, *W512 = (uint64_t *)context->buffer; int j; /* Initialize registers with the prev. intermediate value */ a = context->state[0]; b = context->state[1]; c = context->state[2]; d = context->state[3]; e = context->state[4]; f = context->state[5]; g = context->state[6]; h = context->state[7]; j = 0; do { ROUND512_0_TO_15(a,b,c,d,e,f,g,h); ROUND512_0_TO_15(h,a,b,c,d,e,f,g); ROUND512_0_TO_15(g,h,a,b,c,d,e,f); ROUND512_0_TO_15(f,g,h,a,b,c,d,e); ROUND512_0_TO_15(e,f,g,h,a,b,c,d); ROUND512_0_TO_15(d,e,f,g,h,a,b,c); ROUND512_0_TO_15(c,d,e,f,g,h,a,b); ROUND512_0_TO_15(b,c,d,e,f,g,h,a); } while (j < 16); /* Now for the remaining rounds up to 79: */ do { ROUND512(a,b,c,d,e,f,g,h); ROUND512(h,a,b,c,d,e,f,g); ROUND512(g,h,a,b,c,d,e,f); ROUND512(f,g,h,a,b,c,d,e); ROUND512(e,f,g,h,a,b,c,d); ROUND512(d,e,f,g,h,a,b,c); ROUND512(c,d,e,f,g,h,a,b); ROUND512(b,c,d,e,f,g,h,a); } while (j < 80); /* Compute the current intermediate hash value */ context->state[0] += a; context->state[1] += b; context->state[2] += c; context->state[3] += d; context->state[4] += e; context->state[5] += f; context->state[6] += g; context->state[7] += h; /* Clean up */ a = b = c = d = e = f = g = h = T1 = 0; } #else /* SHA2_UNROLL_TRANSFORM */ void SHA512_Transform(SHA512_CTX *context, const uint64_t *data) { uint64_t a, b, c, d, e, f, g, h, s0, s1; uint64_t T1, T2, *W512 = (void *)context->buffer; int j; /* Initialize registers with the prev. 
intermediate value */ a = context->state[0]; b = context->state[1]; c = context->state[2]; d = context->state[3]; e = context->state[4]; f = context->state[5]; g = context->state[6]; h = context->state[7]; j = 0; do { W512[j] = be64dec(data); ++data; /* Apply the SHA-512 compression function to update a..h */ T1 = h + Sigma1_512(e) + Ch(e, f, g) + K512[j] + W512[j]; T2 = Sigma0_512(a) + Maj(a, b, c); h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2; j++; } while (j < 16); do { /* Part of the message block expansion: */ s0 = W512[(j+1)&0x0f]; s0 = sigma0_512(s0); s1 = W512[(j+14)&0x0f]; s1 = sigma1_512(s1); /* Apply the SHA-512 compression function to update a..h */ T1 = h + Sigma1_512(e) + Ch(e, f, g) + K512[j] + (W512[j&0x0f] += s1 + W512[(j+9)&0x0f] + s0); T2 = Sigma0_512(a) + Maj(a, b, c); h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2; j++; } while (j < 80); /* Compute the current intermediate hash value */ context->state[0] += a; context->state[1] += b; context->state[2] += c; context->state[3] += d; context->state[4] += e; context->state[5] += f; context->state[6] += g; context->state[7] += h; /* Clean up */ a = b = c = d = e = f = g = h = T1 = T2 = 0; } #endif /* SHA2_UNROLL_TRANSFORM */ int SHA512_Update(SHA512_CTX *context, const uint8_t *data, size_t len) { unsigned int freespace, usedspace; if (len == 0) { /* Calling with no data is valid - we do nothing */ return 1; } usedspace = (unsigned int)((context->bitcount[0] >> 3) % SHA512_BLOCK_LENGTH); if (usedspace > 0) { /* Calculate how much free space is available in the buffer */ freespace = SHA512_BLOCK_LENGTH - usedspace; if (len >= freespace) { /* Fill the buffer completely and process it */ memcpy(&context->buffer[usedspace], data, (size_t)(freespace)); ADDINC128(context->bitcount, freespace << 3); len -= freespace; data += freespace; SHA512_Transform(context, (uint64_t *)(void *)context->buffer); } else { /* The buffer is not yet full */ memcpy(&context->buffer[usedspace], data, len); ADDINC128(context->bitcount, len << 3); /* Clean up: */ usedspace = freespace = 0; return 1; } } /* * Process as many complete blocks as possible. * * Check alignment of the data pointer. If it is 64bit aligned, * SHA512_Transform can be called directly on the data stream, * otherwise enforce the alignment by copy into the buffer. 
*/ if ((uintptr_t)data % 8 == 0) { while (len >= SHA512_BLOCK_LENGTH) { SHA512_Transform(context, (const uint64_t*)(const void *)data); ADDINC128(context->bitcount, SHA512_BLOCK_LENGTH << 3); len -= SHA512_BLOCK_LENGTH; data += SHA512_BLOCK_LENGTH; } } else { while (len >= SHA512_BLOCK_LENGTH) { memcpy(context->buffer, data, SHA512_BLOCK_LENGTH); SHA512_Transform(context, (const void *)context->buffer); ADDINC128(context->bitcount, SHA512_BLOCK_LENGTH << 3); len -= SHA512_BLOCK_LENGTH; data += SHA512_BLOCK_LENGTH; } } if (len > 0) { /* There's left-overs, so save 'em */ memcpy(context->buffer, data, len); ADDINC128(context->bitcount, len << 3); } /* Clean up: */ usedspace = freespace = 0; return 1; } static void SHA512_Last(SHA512_CTX *context) { unsigned int usedspace; usedspace = (unsigned int)((context->bitcount[0] >> 3) % SHA512_BLOCK_LENGTH); context->bitcount[0] = htobe64(context->bitcount[0]); context->bitcount[1] = htobe64(context->bitcount[1]); if (usedspace > 0) { /* Begin padding with a 1 bit: */ context->buffer[usedspace++] = 0x80; if (usedspace <= SHA512_SHORT_BLOCK_LENGTH) { /* Set-up for the last transform: */ memset(&context->buffer[usedspace], 0, (size_t)(SHA512_SHORT_BLOCK_LENGTH - usedspace)); } else { if (usedspace < SHA512_BLOCK_LENGTH) { memset(&context->buffer[usedspace], 0, (size_t)(SHA512_BLOCK_LENGTH - usedspace)); } /* Do second-to-last transform: */ SHA512_Transform(context, (uint64_t *)(void *)context->buffer); /* And set-up for the last transform: */ memset(context->buffer, 0, (size_t)(SHA512_BLOCK_LENGTH - 2)); } } else { /* Prepare for final transform: */ memset(context->buffer, 0, (size_t)(SHA512_SHORT_BLOCK_LENGTH)); /* Begin padding with a 1 bit: */ *context->buffer = 0x80; } /* Store the length of input data (in bits): */ memcpy(&context->buffer[SHA512_SHORT_BLOCK_LENGTH], &context->bitcount[1], sizeof(context->bitcount[1])); memcpy(&context->buffer[SHA512_SHORT_BLOCK_LENGTH + 8], &context->bitcount[0], sizeof(context->bitcount[0])); /* Final transform: */ SHA512_Transform(context, (uint64_t *)(void *)context->buffer); } int SHA512_Final(uint8_t digest[SHA512_DIGEST_LENGTH], SHA512_CTX *context) { size_t i; /* If no digest buffer is passed, we don't bother doing this: */ if (digest != NULL) { SHA512_Last(context); /* Save the hash data for output: */ for (i = 0; i < 8; ++i) be64enc(digest + 8 * i, context->state[i]); } /* Zero out state data */ memset(context, 0, sizeof(*context)); return 1; } /*** SHA-384: *********************************************************/ int SHA384_Init(SHA384_CTX *context) { if (context == NULL) return 1; memcpy(context->state, sha384_initial_hash_value, (size_t)(SHA512_DIGEST_LENGTH)); memset(context->buffer, 0, (size_t)(SHA384_BLOCK_LENGTH)); context->bitcount[0] = context->bitcount[1] = 0; return 1; } int SHA384_Update(SHA384_CTX *context, const uint8_t *data, size_t len) { return SHA512_Update((SHA512_CTX *)context, data, len); } void SHA384_Transform(SHA512_CTX *context, const uint64_t *data) { SHA512_Transform((SHA512_CTX *)context, data); } int SHA384_Final(uint8_t digest[SHA384_DIGEST_LENGTH], SHA384_CTX *context) { size_t i; /* If no digest buffer is passed, we don't bother doing this: */ if (digest != NULL) { SHA512_Last((SHA512_CTX *)context); /* Save the hash data for output: */ for (i = 0; i < 6; ++i) be64enc(digest + 8 * i, context->state[i]); } /* Zero out state data */ memset(context, 0, sizeof(*context)); return 1; }
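/*
 * Illustrative sketch (editorial addition, not part of the original
 * file): hashing a short message with the SHA-256 API implemented
 * above.  It assumes a userland build where the declarations come from
 * <sha2.h>; kernel and standalone code reach the same functions through
 * <sys/sha2.h>.
 */
#include <sha2.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	static const char msg[] = "abc";
	uint8_t digest[SHA256_DIGEST_LENGTH];
	char hex[SHA256_DIGEST_LENGTH * 2 + 1];
	SHA256_CTX ctx;
	size_t i;

	SHA256_Init(&ctx);
	/* SHA256_Update() may be called any number of times before Final. */
	SHA256_Update(&ctx, (const uint8_t *)msg, strlen(msg));
	SHA256_Final(digest, &ctx);		/* also wipes the context */

	for (i = 0; i < sizeof(digest); i++)
		snprintf(&hex[i * 2], 3, "%02x", digest[i]);
	/* Expect the FIPS 180 "abc" vector: ba7816bf8f01cfea414140de... */
	printf("SHA256(\"%s\") = %s\n", msg, hex);
	return 0;
}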
/* $NetBSD: kern_softint.c,v 1.76 2024/03/01 04:32:38 mrg Exp $ */ /*- * Copyright (c) 2007, 2008, 2019, 2020 The
NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Generic software interrupt framework. * * Overview * * The soft interrupt framework provides a mechanism to schedule a * low priority callback that runs with thread context. It allows * for dynamic registration of software interrupts, and for fair * queueing and prioritization of those interrupts. The callbacks * can be scheduled to run from nearly any point in the kernel: by * code running with thread context, by code running from a * hardware interrupt handler, and at any interrupt priority * level. * * Priority levels * * Since soft interrupt dispatch can be tied to the underlying * architecture's interrupt dispatch code, it can be limited * both by the capabilities of the hardware and the capabilities * of the interrupt dispatch code itself. The number of priority * levels is restricted to four. In order of priority (lowest to * highest) the levels are: clock, bio, net, serial. * * The names are symbolic and in isolation do not have any direct * connection with a particular kind of device activity: they are * only meant as a guide. * * The four priority levels map directly to scheduler priority * levels, and where the architecture implements 'fast' software * interrupts, they also map onto interrupt priorities. The * interrupt priorities are intended to be hidden from machine * independent code, which should use thread-safe mechanisms to * synchronize with software interrupts (for example: mutexes). * * Capabilities * * Software interrupts run with limited machine context. In * particular, they do not possess any address space context. They * should not try to operate on user space addresses, or to use * virtual memory facilities other than those noted as interrupt * safe. * * Unlike hardware interrupts, software interrupts do have thread * context. They may block on synchronization objects, sleep, and * resume execution at a later time. * * Since software interrupts are a limited resource and run with * higher priority than most other LWPs in the system, all * block-and-resume activity by a software interrupt must be kept * short to allow further processing at that level to continue.
By * extension, code running with process context must take care to * ensure that any lock that may be taken from a software interrupt * can not be held for more than a short period of time. * * The kernel does not allow software interrupts to use facilities * or perform actions that may block for a significant amount of * time. This means that it's not valid for a software interrupt * to sleep on condition variables or wait for resources to become * available (for example, memory). * * Per-CPU operation * * If a soft interrupt is triggered on a CPU, it can only be * dispatched on the same CPU. Each LWP dedicated to handling a * soft interrupt is bound to its home CPU, so if the LWP blocks * and needs to run again, it can only run there. Nearly all data * structures used to manage software interrupts are per-CPU. * * The per-CPU requirement is intended to reduce "ping-pong" of * cache lines between CPUs: lines occupied by data structures * used to manage the soft interrupts, and lines occupied by data * items being passed down to the soft interrupt. As a positive * side effect, this also means that the soft interrupt dispatch * code does not need to use spinlocks to synchronize. * * Generic implementation * * A generic, low performance implementation is provided that * works across all architectures, with no machine-dependent * modifications needed. This implementation uses the scheduler, * and so has a number of restrictions: * * 1) The software interrupts are not currently preemptive, so * must wait for the currently executing LWP to yield the CPU. * This can introduce latency. * * 2) An expensive context switch is required for a software * interrupt to be handled. * * 'Fast' software interrupts * * If an architecture defines __HAVE_FAST_SOFTINTS, it implements * the fast mechanism. Threads running either in the kernel or in * userspace will be interrupted, but will not be preempted. When * the soft interrupt completes execution, the interrupted LWP * is resumed. Interrupt dispatch code must provide the minimum * level of context necessary for the soft interrupt to block and * be resumed at a later time. The machine-dependent dispatch * path looks something like the following: * * softintr() * { * go to IPL_HIGH if necessary for switch; * save any necessary registers in a format that can be * restored by cpu_switchto if the softint blocks; * arrange for cpu_switchto() to restore into the * trampoline function; * identify LWP to handle this interrupt; * switch to the LWP's stack; * switch register stacks, if necessary; * assign new value of curlwp; * call MI softint_dispatch, passing old curlwp and IPL * to execute interrupt at; * switch back to old stack; * switch back to old register stack, if necessary; * restore curlwp; * return to interrupted LWP; * } * * If the soft interrupt blocks, a trampoline function is returned * to in the context of the interrupted LWP, as arranged for by * softint(): * * softint_ret() * { * unlock soft interrupt LWP; * resume interrupt processing, likely returning to * interrupted LWP or dispatching another, different * interrupt; * } * * Once the soft interrupt has fired (and even if it has blocked), * no further soft interrupts at that level will be triggered by * MI code until the soft interrupt handler has ceased execution. * If a soft interrupt handler blocks and is resumed, it resumes * execution as a normal LWP (kthread) and gains VM context. Only * when it has completed and is ready to fire again will it * interrupt other threads.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_softint.c,v 1.76 2024/03/01 04:32:38 mrg Exp $"); #include <sys/param.h> #include <sys/proc.h> #include <sys/intr.h> #include <sys/ipi.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/kernel.h> #include <sys/kthread.h> #include <sys/evcnt.h> #include <sys/cpu.h> #include <sys/xcall.h> #include <sys/psref.h> #include <sys/sdt.h> #include <uvm/uvm_extern.h> /* This could overlap with signal info in struct lwp. */ typedef struct softint { SIMPLEQ_HEAD(, softhand) si_q; struct lwp *si_lwp; struct cpu_info *si_cpu; uintptr_t si_machdep; struct evcnt si_evcnt; struct evcnt si_evcnt_block; volatile int si_active; int si_ipl; char si_name[8]; char si_name_block[8+6]; } softint_t; typedef struct softhand { SIMPLEQ_ENTRY(softhand) sh_q; void (*sh_func)(void *); void *sh_arg; softint_t *sh_isr; u_int sh_flags; u_int sh_ipi_id; } softhand_t; typedef struct softcpu { struct cpu_info *sc_cpu; softint_t sc_int[SOFTINT_COUNT]; softhand_t sc_hand[1]; } softcpu_t; static void softint_thread(void *); u_int softint_bytes = 32768; u_int softint_timing; static u_int softint_max; static kmutex_t softint_lock; SDT_PROBE_DEFINE4(sdt, kernel, softint, establish, "void *"/*sih*/, "void (*)(void *)"/*func*/, "void *"/*arg*/, "unsigned"/*flags*/); SDT_PROBE_DEFINE1(sdt, kernel, softint, disestablish, "void *"/*sih*/); SDT_PROBE_DEFINE2(sdt, kernel, softint, schedule, "void *"/*sih*/, "struct cpu_info *"/*ci*/); SDT_PROBE_DEFINE4(sdt, kernel, softint, entry, "void *"/*sih*/, "void (*)(void *)"/*func*/, "void *"/*arg*/, "unsigned"/*flags*/); SDT_PROBE_DEFINE4(sdt, kernel, softint, return, "void *"/*sih*/, "void (*)(void *)"/*func*/, "void *"/*arg*/, "unsigned"/*flags*/); /* * softint_init_isr: * * Initialize a single interrupt level for a single CPU. */ static void softint_init_isr(softcpu_t *sc, const char *desc, pri_t pri, u_int level, int ipl) { struct cpu_info *ci; softint_t *si; int error; si = &sc->sc_int[level]; ci = sc->sc_cpu; si->si_cpu = ci; SIMPLEQ_INIT(&si->si_q); error = kthread_create(pri, KTHREAD_MPSAFE | KTHREAD_INTR | KTHREAD_IDLE, ci, softint_thread, si, &si->si_lwp, "soft%s/%u", desc, ci->ci_index); if (error != 0) panic("softint_init_isr: error %d", error); snprintf(si->si_name, sizeof(si->si_name), "%s/%u", desc, ci->ci_index); evcnt_attach_dynamic(&si->si_evcnt, EVCNT_TYPE_MISC, NULL, "softint", si->si_name); snprintf(si->si_name_block, sizeof(si->si_name_block), "%s block/%u", desc, ci->ci_index); evcnt_attach_dynamic(&si->si_evcnt_block, EVCNT_TYPE_MISC, NULL, "softint", si->si_name_block); si->si_ipl = ipl; si->si_lwp->l_private = si; softint_init_md(si->si_lwp, level, &si->si_machdep); } /* * softint_init: * * Initialize per-CPU data structures. Called from mi_cpu_attach(). */ void softint_init(struct cpu_info *ci) { static struct cpu_info *first; softcpu_t *sc, *scfirst; softhand_t *sh, *shmax; if (first == NULL) { /* Boot CPU. */ first = ci; mutex_init(&softint_lock, MUTEX_DEFAULT, IPL_NONE); softint_bytes = round_page(softint_bytes); softint_max = (softint_bytes - sizeof(softcpu_t)) / sizeof(softhand_t); } /* Use uvm_km(9) for persistent, page-aligned allocation. 
*/ sc = (softcpu_t *)uvm_km_alloc(kernel_map, softint_bytes, 0, UVM_KMF_WIRED | UVM_KMF_ZERO); if (sc == NULL) panic("softint_init_cpu: cannot allocate memory"); ci->ci_data.cpu_softcpu = sc; ci->ci_data.cpu_softints = 0; sc->sc_cpu = ci; softint_init_isr(sc, "net", PRI_SOFTNET, SOFTINT_NET, IPL_SOFTNET); softint_init_isr(sc, "bio", PRI_SOFTBIO, SOFTINT_BIO, IPL_SOFTBIO); softint_init_isr(sc, "clk", PRI_SOFTCLOCK, SOFTINT_CLOCK, IPL_SOFTCLOCK); softint_init_isr(sc, "ser", PRI_SOFTSERIAL, SOFTINT_SERIAL, IPL_SOFTSERIAL); if (first != ci) { mutex_enter(&softint_lock); scfirst = first->ci_data.cpu_softcpu; sh = sc->sc_hand; memcpy(sh, scfirst->sc_hand, sizeof(*sh) * softint_max); /* Update pointers for this CPU. */ for (shmax = sh + softint_max; sh < shmax; sh++) { if (sh->sh_func == NULL) continue; sh->sh_isr = &sc->sc_int[sh->sh_flags & SOFTINT_LVLMASK]; } mutex_exit(&softint_lock); } } /* * softint_establish: * * Register a software interrupt handler. */ void * softint_establish(u_int flags, void (*func)(void *), void *arg) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; softcpu_t *sc; softhand_t *sh; u_int level, index; u_int ipi_id = 0; void *sih; level = (flags & SOFTINT_LVLMASK); KASSERT(level < SOFTINT_COUNT); KASSERT((flags & SOFTINT_IMPMASK) == 0); mutex_enter(&softint_lock); /* Find a free slot. */ sc = curcpu()->ci_data.cpu_softcpu; for (index = 1; index < softint_max; index++) { if (sc->sc_hand[index].sh_func == NULL) break; } if (index == softint_max) { mutex_exit(&softint_lock); printf("WARNING: softint_establish: table full, " "increase softint_bytes\n"); return NULL; } sih = (void *)((uint8_t *)&sc->sc_hand[index] - (uint8_t *)sc); if (flags & SOFTINT_RCPU) { if ((ipi_id = ipi_register(softint_schedule, sih)) == 0) { mutex_exit(&softint_lock); return NULL; } } /* Set up the handler on each CPU. */ if (ncpu < 2) { /* XXX hack for machines with no CPU_INFO_FOREACH() early on */ sc = curcpu()->ci_data.cpu_softcpu; sh = &sc->sc_hand[index]; sh->sh_isr = &sc->sc_int[level]; sh->sh_func = func; sh->sh_arg = arg; sh->sh_flags = flags; sh->sh_ipi_id = ipi_id; } else for (CPU_INFO_FOREACH(cii, ci)) { sc = ci->ci_data.cpu_softcpu; sh = &sc->sc_hand[index]; sh->sh_isr = &sc->sc_int[level]; sh->sh_func = func; sh->sh_arg = arg; sh->sh_flags = flags; sh->sh_ipi_id = ipi_id; } mutex_exit(&softint_lock); SDT_PROBE4(sdt, kernel, softint, establish, sih, func, arg, flags); return sih; } /* * softint_disestablish: * * Unregister a software interrupt handler. The soft interrupt could * still be active at this point, but the caller commits not to try * and trigger it again once this call is made. The caller must not * hold any locks that could be taken from soft interrupt context, * because we will wait for the softint to complete if it's still * running. */ void softint_disestablish(void *arg) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; softcpu_t *sc; softhand_t *sh; uintptr_t offset; offset = (uintptr_t)arg; KASSERT(offset != 0); KASSERTMSG(offset < softint_bytes, "%"PRIuPTR" %u", offset, softint_bytes); /* * Unregister IPI handler if there is any. Note: there is no need * to disable preemption here - ID is stable. */ sc = curcpu()->ci_data.cpu_softcpu; sh = (softhand_t *)((uint8_t *)sc + offset); if (sh->sh_ipi_id) { ipi_unregister(sh->sh_ipi_id); } /* * Run a dummy softint at the same level on all CPUs and wait for * completion, to make sure this softint is no longer running * anywhere. 
*/ xc_barrier(XC_HIGHPRI_IPL(sh->sh_isr->si_ipl)); /* * Notify dtrace probe when the old softint can't be running * any more, but before it can be recycled for a new softint. */ SDT_PROBE1(sdt, kernel, softint, disestablish, arg); /* Clear the handler on each CPU. */ mutex_enter(&softint_lock); for (CPU_INFO_FOREACH(cii, ci)) { sc = ci->ci_data.cpu_softcpu; sh = (softhand_t *)((uint8_t *)sc + offset); KASSERT(sh->sh_func != NULL); sh->sh_func = NULL; } mutex_exit(&softint_lock); } /* * softint_schedule: * * Trigger a software interrupt. Must be called from a hardware * interrupt handler, or with preemption disabled (since we are * using the value of curcpu()). */ void softint_schedule(void *arg) { softhand_t *sh; softint_t *si; uintptr_t offset; int s; SDT_PROBE2(sdt, kernel, softint, schedule, arg, /*ci*/NULL); /* * If this assert fires, rather than disabling preemption explicitly * to make it stop, consider that you are probably using a softint * when you don't need to. */ KASSERT(kpreempt_disabled()); /* Find the handler record for this CPU. */ offset = (uintptr_t)arg; KASSERT(offset != 0); KASSERTMSG(offset < softint_bytes, "%"PRIuPTR" %u", offset, softint_bytes); sh = (softhand_t *)((uint8_t *)curcpu()->ci_data.cpu_softcpu + offset); /* If it's already pending there's nothing to do. */ if ((sh->sh_flags & SOFTINT_PENDING) != 0) { return; } /* * Enqueue the handler into the LWP's pending list. * If the LWP is completely idle, then make it run. */ s = splhigh(); if ((sh->sh_flags & SOFTINT_PENDING) == 0) { si = sh->sh_isr; sh->sh_flags |= SOFTINT_PENDING; SIMPLEQ_INSERT_TAIL(&si->si_q, sh, sh_q); if (si->si_active == 0) { si->si_active = 1; softint_trigger(si->si_machdep); } } splx(s); } /* * softint_schedule_cpu: * * Trigger a software interrupt on a target CPU. This invokes * softint_schedule() for the local CPU or send an IPI to invoke * this routine on the remote CPU. Preemption must be disabled. */ void softint_schedule_cpu(void *arg, struct cpu_info *ci) { KASSERT(kpreempt_disabled()); if (curcpu() != ci) { const softcpu_t *sc = ci->ci_data.cpu_softcpu; const uintptr_t offset = (uintptr_t)arg; const softhand_t *sh; SDT_PROBE2(sdt, kernel, softint, schedule, arg, ci); sh = (const softhand_t *)((const uint8_t *)sc + offset); KASSERT((sh->sh_flags & SOFTINT_RCPU) != 0); ipi_trigger(sh->sh_ipi_id, ci); return; } /* Just a local CPU. */ softint_schedule(arg); } /* * softint_execute: * * Invoke handlers for the specified soft interrupt. * Must be entered at splhigh. Will drop the priority * to the level specified, but returns back at splhigh. */ static inline void softint_execute(lwp_t *l, int s) { softint_t *si = l->l_private; softhand_t *sh; KASSERT(si->si_lwp == curlwp); KASSERT(si->si_cpu == curcpu()); KASSERT(si->si_lwp->l_wchan == NULL); KASSERT(si->si_active); KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d", l, l->l_nopreempt); /* * Note: due to priority inheritance we may have interrupted a * higher priority LWP. Since the soft interrupt must be quick * and is non-preemptable, we don't bother yielding. */ while (!SIMPLEQ_EMPTY(&si->si_q)) { /* * Pick the longest waiting handler to run. We block * interrupts but do not lock in order to do this, as * we are protecting against the local CPU only. */ sh = SIMPLEQ_FIRST(&si->si_q); SIMPLEQ_REMOVE_HEAD(&si->si_q, sh_q); KASSERT((sh->sh_flags & SOFTINT_PENDING) != 0); sh->sh_flags ^= SOFTINT_PENDING; splx(s); /* Run the handler. 
*/ SDT_PROBE4(sdt, kernel, softint, entry, ((const char *)sh - (const char *)curcpu()->ci_data.cpu_softcpu), sh->sh_func, sh->sh_arg, sh->sh_flags); if (__predict_true((sh->sh_flags & SOFTINT_MPSAFE) != 0)) { (*sh->sh_func)(sh->sh_arg); } else { KERNEL_LOCK(1, l); (*sh->sh_func)(sh->sh_arg); KERNEL_UNLOCK_ONE(l); } SDT_PROBE4(sdt, kernel, softint, return, ((const char *)sh - (const char *)curcpu()->ci_data.cpu_softcpu), sh->sh_func, sh->sh_arg, sh->sh_flags); /* Diagnostic: check that spin-locks have not leaked. */ KASSERTMSG(curcpu()->ci_mtx_count == 0, "%s: ci_mtx_count (%d) != 0, sh_func %p\n", __func__, curcpu()->ci_mtx_count, sh->sh_func); /* Diagnostic: check that psrefs have not leaked. */ KASSERTMSG(l->l_psrefs == 0, "%s: l_psrefs=%d, sh_func=%p\n", __func__, l->l_psrefs, sh->sh_func); /* Diagnostic: check that biglocks have not leaked. */ KASSERTMSG(l->l_blcnt == 0, "%s: sh_func=%p leaked %d biglocks", __func__, sh->sh_func, curlwp->l_blcnt); /* Diagnostic: check that LWP nopreempt remains zero. */ KASSERTMSG(l->l_nopreempt == 0, "%s: lwp %p nopreempt %d func %p", __func__, l, l->l_nopreempt, sh->sh_func); (void)splhigh(); } PSREF_DEBUG_BARRIER(); CPU_COUNT(CPU_COUNT_NSOFT, 1); KASSERT(si->si_cpu == curcpu()); KASSERT(si->si_lwp->l_wchan == NULL); KASSERT(si->si_active); si->si_evcnt.ev_count++; si->si_active = 0; } /* * softint_block: * * Update statistics when the soft interrupt blocks. */ void softint_block(lwp_t *l) { softint_t *si = l->l_private; KASSERT((l->l_pflag & LP_INTR) != 0); si->si_evcnt_block.ev_count++; } #ifndef __HAVE_FAST_SOFTINTS #ifdef __HAVE_PREEMPTION #error __HAVE_PREEMPTION requires __HAVE_FAST_SOFTINTS #endif /* * softint_init_md: * * Slow path: perform machine-dependent initialization. */ void softint_init_md(lwp_t *l, u_int level, uintptr_t *machdep) { struct proc *p; softint_t *si; *machdep = (1 << level); si = l->l_private; p = l->l_proc; mutex_enter(p->p_lock); lwp_lock(l); /* Cheat and make the KASSERT in softint_thread() happy. */ si->si_active = 1; setrunnable(l); /* LWP now unlocked */ mutex_exit(p->p_lock); } /* * softint_trigger: * * Slow path: cause a soft interrupt handler to begin executing. * Called at IPL_HIGH. */ void softint_trigger(uintptr_t machdep) { struct cpu_info *ci; lwp_t *l; ci = curcpu(); ci->ci_data.cpu_softints |= machdep; l = ci->ci_onproc; /* * Arrange for mi_switch() to be called. If called from interrupt * mode, we don't know if curlwp is executing in kernel or user, so * post an AST and have it take a trip through userret(). If not in * interrupt mode, curlwp is running in kernel and will notice the * resched soon enough; avoid the AST. */ if (l == ci->ci_data.cpu_idlelwp) { atomic_or_uint(&ci->ci_want_resched, RESCHED_IDLE | RESCHED_UPREEMPT); } else { atomic_or_uint(&ci->ci_want_resched, RESCHED_UPREEMPT); if (cpu_intr_p()) { cpu_signotify(l); } } } /* * softint_thread: * * Slow path: MI software interrupt dispatch. */ void softint_thread(void *cookie) { softint_t *si; lwp_t *l; int s; l = curlwp; si = l->l_private; for (;;) { /* Clear pending status and run it. */ s = splhigh(); l->l_cpu->ci_data.cpu_softints &= ~si->si_machdep; softint_execute(l, s); splx(s); /* Interrupts allowed to run again before switching. */ lwp_lock(l); l->l_stat = LSIDL; spc_lock(l->l_cpu); mi_switch(l); } } /* * softint_picklwp: * * Slow path: called from mi_switch() to pick the highest priority * soft interrupt LWP that needs to run. 
*/ lwp_t * softint_picklwp(void) { struct cpu_info *ci; u_int mask; softint_t *si; lwp_t *l; ci = curcpu(); si = ((softcpu_t *)ci->ci_data.cpu_softcpu)->sc_int; mask = ci->ci_data.cpu_softints; if ((mask & (1 << SOFTINT_SERIAL)) != 0) { l = si[SOFTINT_SERIAL].si_lwp; } else if ((mask & (1 << SOFTINT_NET)) != 0) { l = si[SOFTINT_NET].si_lwp; } else if ((mask & (1 << SOFTINT_BIO)) != 0) { l = si[SOFTINT_BIO].si_lwp; } else if ((mask & (1 << SOFTINT_CLOCK)) != 0) { l = si[SOFTINT_CLOCK].si_lwp; } else { panic("softint_picklwp"); } return l; } #else /* !__HAVE_FAST_SOFTINTS */ /* * softint_thread: * * Fast path: the LWP is switched to without restoring any state, * so we should not arrive here - there is a direct handoff between * the interrupt stub and softint_dispatch(). */ void softint_thread(void *cookie) { panic("softint_thread"); } /* * softint_dispatch: * * Fast path: entry point from machine-dependent code. */ void softint_dispatch(lwp_t *pinned, int s) { struct bintime now; u_int timing; lwp_t *l; #ifdef DIAGNOSTIC if ((pinned->l_pflag & LP_RUNNING) == 0 || curlwp->l_stat != LSIDL) { struct lwp *onproc = curcpu()->ci_onproc; int s2 = splhigh(); printf("curcpu=%d, spl=%d curspl=%d\n" "onproc=%p => l_stat=%d l_flag=%08x l_cpu=%d\n" "curlwp=%p => l_stat=%d l_flag=%08x l_cpu=%d\n" "pinned=%p => l_stat=%d l_flag=%08x l_cpu=%d\n", cpu_index(curcpu()), s, s2, onproc, onproc->l_stat, onproc->l_flag, cpu_index(onproc->l_cpu), curlwp, curlwp->l_stat, curlwp->l_flag, cpu_index(curlwp->l_cpu), pinned, pinned->l_stat, pinned->l_flag, cpu_index(pinned->l_cpu)); splx(s2); panic("softint screwup"); } #endif /* * Note the interrupted LWP, and mark the current LWP as running * before proceeding. Although this must as a rule be done with * the LWP locked, at this point no external agents will want to * modify the interrupt LWP's state. */ timing = softint_timing; l = curlwp; l->l_switchto = pinned; l->l_stat = LSONPROC; /* * Dispatch the interrupt. If softints are being timed, charge * for it. */ if (timing) { binuptime(&l->l_stime); membar_producer(); /* for calcru */ l->l_pflag |= LP_TIMEINTR; } l->l_pflag |= LP_RUNNING; softint_execute(l, s); if (timing) { binuptime(&now); updatertime(l, &now); l->l_pflag &= ~LP_TIMEINTR; } /* * If we blocked while handling the interrupt, the pinned LWP is * gone and we are now running as a kthread, so find another LWP to * run. softint_dispatch() won't be reentered until the priority is * finally dropped to IPL_NONE on entry to the next LWP on this CPU. */ l->l_stat = LSIDL; if (l->l_switchto == NULL) { lwp_lock(l); spc_lock(l->l_cpu); mi_switch(l); /* NOTREACHED */ } l->l_switchto = NULL; l->l_pflag &= ~LP_RUNNING; } #endif /* !__HAVE_FAST_SOFTINTS */
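/*
 * Editor's note: the following fragment is not part of kern_softint.c.
 * It is a minimal, illustrative sketch of how a driver typically consumes
 * the softint(9) interface documented and implemented above.  The
 * "example_*" names are hypothetical; softint_establish(), softint_schedule()
 * and softint_disestablish() are the entry points defined in this file.
 */
#include <sys/param.h>
#include <sys/intr.h>

struct example_softc {
	void *sc_sih;			/* cookie from softint_establish() */
};

static void
example_softintr(void *arg)
{
	struct example_softc *sc = arg;

	/* Runs in its own LWP at IPL_SOFTNET; may block briefly. */
	(void)sc;
}

static int
example_softint_attach(struct example_softc *sc)
{
	/* Register an MP-safe, network-priority soft interrupt. */
	sc->sc_sih = softint_establish(SOFTINT_NET | SOFTINT_MPSAFE,
	    example_softintr, sc);
	return sc->sc_sih == NULL ? ENOMEM : 0;
}

static void
example_hardintr(struct example_softc *sc)
{
	/* From hard interrupt context: defer the work to the softint. */
	softint_schedule(sc->sc_sih);
}

static void
example_softint_detach(struct example_softc *sc)
{
	/* Blocks until any in-flight invocation of the handler is done. */
	softint_disestablish(sc->sc_sih);
	sc->sc_sih = NULL;
}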
/* $NetBSD: if_llatbl.c,v 1.35 2022/11/19 08:00:51 yamt Exp $ */ /* * Copyright (c) 2004 Luigi Rizzo, Alessandro Cerri. All rights reserved. * Copyright (c) 2004-2008 Qing Li. All rights reserved. * Copyright (c) 2008 Kip Macy. All rights reserved. * Copyright (c) 2015 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> #ifdef _KERNEL_OPT #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_net_mpsafe.h" #endif #include "arp.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/syslog.h> #include <sys/sysctl.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/rwlock.h> #ifdef DDB #include <ddb/ddb.h> #endif #include <netinet/in.h> #include <net/if_llatbl.h> #include <net/if.h> #include <net/if_dl.h> #include <net/nd.h> #include <net/route.h> #include <netinet/if_inarp.h> #include <netinet/in_var.h> #include <netinet6/in6_var.h> static SLIST_HEAD(, lltable) lltables; krwlock_t lltable_rwlock; static struct pool llentry_pool; static void lltable_unlink(struct lltable *llt); static void llentries_unlink(struct lltable *llt, struct llentries *head); static void htable_unlink_entry(struct llentry *lle); static void htable_link_entry(struct lltable *llt, struct llentry *lle); static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg); int lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct rt_walkarg *w, struct sockaddr *sa) { #define RTF_LLINFO 0x400 #define RTF_CLONED 0x2000 struct ifnet *ifp = llt->llt_ifp; int error; void *a; struct sockaddr_dl sdl; int size; struct rt_addrinfo info; memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = sa; a = (lle->la_flags & LLE_VALID) == LLE_VALID ?
&lle->ll_addr : NULL; if (sockaddr_dl_init(&sdl, sizeof(sdl), ifp->if_index, ifp->if_type, NULL, 0, a, ifp->if_addrlen) == NULL) return EINVAL; info.rti_info[RTAX_GATEWAY] = sstocsa(&sdl); if (sa->sa_family == AF_INET && lle->la_flags & LLE_PUB) { struct sockaddr_inarp *sin; sin = (struct sockaddr_inarp *)sa; sin->sin_other = SIN_PROXY; } if ((error = rt_msg3(RTM_GET, &info, 0, w, &size))) return error; if (w->w_where && w->w_tmem && w->w_needed <= 0) { struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; /* Need to copy by myself */ rtm->rtm_index = ifp->if_index; rtm->rtm_rmx.rmx_mtu = 0; rtm->rtm_rmx.rmx_expire = (lle->la_flags & LLE_STATIC) ? 0 : time_mono_to_wall(lle->la_expire); rtm->rtm_flags = RTF_UP; rtm->rtm_flags |= RTF_HOST; /* For ndp */ /* For backward compatibility */ rtm->rtm_flags |= RTF_LLINFO | RTF_CLONED; rtm->rtm_flags |= (lle->la_flags & LLE_STATIC) ? RTF_STATIC : 0; if (lle->la_flags & LLE_PUB) rtm->rtm_flags |= RTF_ANNOUNCE; rtm->rtm_addrs = info.rti_addrs; if ((error = copyout(rtm, w->w_where, size)) != 0) w->w_where = NULL; else w->w_where = (char *)w->w_where + size; } return error; #undef RTF_LLINFO #undef RTF_CLONED } /* * Dump lle state for a specific address family. */ static int lltable_dump_af(struct lltable *llt, struct rt_walkarg *w) { int error; LLTABLE_LOCK_ASSERT(); if (llt->llt_ifp->if_flags & IFF_LOOPBACK) return (0); error = 0; IF_AFDATA_RLOCK(llt->llt_ifp); error = lltable_foreach_lle(llt, (llt_foreach_cb_t *)llt->llt_dump_entry, w); IF_AFDATA_RUNLOCK(llt->llt_ifp); return (error); } /* * Dump arp state for a specific address family. */ int lltable_sysctl_dump(int af, struct rt_walkarg *w) { struct lltable *llt; int error = 0; LLTABLE_RLOCK(); SLIST_FOREACH(llt, &lltables, llt_link) { if (llt->llt_af == af) { error = lltable_dump_af(llt, w); if (error != 0) goto done; } } done: LLTABLE_RUNLOCK(); return (error); } /* * Common function helpers for chained hash table. */ /* * Runs specified callback for each entry in @llt. * Caller does the locking. 
* */ static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg) { struct llentry *lle, *next; int i, error; error = 0; for (i = 0; i < llt->llt_hsize; i++) { LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { error = f(llt, lle, farg); if (error != 0) break; } } return (error); } static void htable_link_entry(struct lltable *llt, struct llentry *lle) { struct llentries *lleh; uint32_t hashidx; if ((lle->la_flags & LLE_LINKED) != 0) return; IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); hashidx = llt->llt_hash(lle, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; lle->lle_tbl = llt; lle->lle_head = lleh; lle->la_flags |= LLE_LINKED; LIST_INSERT_HEAD(lleh, lle, lle_next); llt->llt_lle_count++; } static void htable_unlink_entry(struct llentry *lle) { if ((lle->la_flags & LLE_LINKED) != 0) { IF_AFDATA_WLOCK_ASSERT(lle->lle_tbl->llt_ifp); LIST_REMOVE(lle, lle_next); lle->la_flags &= ~(LLE_VALID | LLE_LINKED); #if 0 lle->lle_tbl = NULL; lle->lle_head = NULL; #endif KASSERTMSG(lle->lle_tbl->llt_lle_count != 0, "llt_lle_count=%u", lle->lle_tbl->llt_lle_count); lle->lle_tbl->llt_lle_count--; } } struct prefix_match_data { const struct sockaddr *prefix; const struct sockaddr *mask; struct llentries dchain; u_int flags; }; static int htable_prefix_free_cb(struct lltable *llt, struct llentry *lle, void *farg) { struct prefix_match_data *pmd; pmd = (struct prefix_match_data *)farg; if (llt->llt_match_prefix(pmd->prefix, pmd->mask, pmd->flags, lle)) { LLE_WLOCK(lle); LIST_INSERT_HEAD(&pmd->dchain, lle, lle_chain); } return (0); } static void htable_prefix_free(struct lltable *llt, const struct sockaddr *prefix, const struct sockaddr *mask, u_int flags) { struct llentry *lle, *next; struct prefix_match_data pmd; memset(&pmd, 0, sizeof(pmd)); pmd.prefix = prefix; pmd.mask = mask; pmd.flags = flags; LIST_INIT(&pmd.dchain); IF_AFDATA_WLOCK(llt->llt_ifp); /* Push matching lles to chain */ lltable_foreach_lle(llt, htable_prefix_free_cb, &pmd); llentries_unlink(llt, &pmd.dchain); IF_AFDATA_WUNLOCK(llt->llt_ifp); LIST_FOREACH_SAFE(lle, &pmd.dchain, lle_chain, next) llt->llt_free_entry(llt, lle); } static void htable_free_tbl(struct lltable *llt) { free(llt->lle_head, M_LLTABLE); free(llt, M_LLTABLE); } static void llentries_unlink(struct lltable *llt, struct llentries *head) { struct llentry *lle, *next; LIST_FOREACH_SAFE(lle, head, lle_chain, next) llt->llt_unlink_entry(lle); } /* * Helper function used to drop all mbufs in hold queue. * * Returns the number of held packets, if any, that were dropped. */ size_t lltable_drop_entry_queue(struct llentry *lle) { size_t pkts_dropped; struct mbuf *next; LLE_WLOCK_ASSERT(lle); pkts_dropped = 0; while ((lle->la_numheld > 0) && (lle->la_hold != NULL)) { next = lle->la_hold->m_nextpkt; m_freem(lle->la_hold); lle->la_hold = next; lle->la_numheld--; pkts_dropped++; } KASSERTMSG(lle->la_numheld == 0, "la_numheld %d > 0, pkts_dropped %zd", lle->la_numheld, pkts_dropped); return (pkts_dropped); } struct llentry * llentry_pool_get(int flags) { struct llentry *lle; lle = pool_get(&llentry_pool, flags); if (lle != NULL) memset(lle, 0, sizeof(*lle)); return lle; } void llentry_pool_put(struct llentry *lle) { pool_put(&llentry_pool, lle); } /* * Deletes an address from the address table. * This function is called by the timer functions * such as arptimer() and nd6_llinfo_timer(), and * the caller does the locking. * * Returns the number of held packets, if any, that were dropped. 
*/ size_t llentry_free(struct llentry *lle) { struct lltable *llt; size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); lle->la_flags |= LLE_DELETED; if ((lle->la_flags & LLE_LINKED) != 0) { llt = lle->lle_tbl; IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); llt->llt_unlink_entry(lle); } /* * Stop a pending callout if one exists. If we cancel one, we have to * remove a reference to avoid a leak. callout_pending is required * to exclude the case that the callout has never been scheduled. */ /* XXX once softnet_lock goes away, we should use callout_halt */ if (callout_pending(&lle->la_timer)) { bool expired = callout_stop(&lle->la_timer); if (!expired) LLE_REMREF(lle); } pkts_dropped = lltable_drop_entry_queue(lle); LLE_FREE_LOCKED(lle); return (pkts_dropped); } /* * (al)locate an llentry for address dst (equivalent to rtalloc for new-arp). * * If found, the llentry * is returned referenced and unlocked. */ struct llentry * llentry_alloc(struct ifnet *ifp, struct lltable *lt, struct sockaddr_storage *dst) { struct llentry *la; IF_AFDATA_RLOCK(ifp); la = lla_lookup(lt, LLE_EXCLUSIVE, (struct sockaddr *)dst); IF_AFDATA_RUNLOCK(ifp); if ((la == NULL) && (ifp->if_flags & IFF_NOARP) == 0) { IF_AFDATA_WLOCK(ifp); la = lla_create(lt, 0, (struct sockaddr *)dst, NULL /* XXX */); IF_AFDATA_WUNLOCK(ifp); } if (la != NULL) { LLE_ADDREF(la); LLE_WUNLOCK(la); } return (la); } /* * Free all entries from given table and free itself. */ static int lltable_free_cb(struct lltable *llt, struct llentry *lle, void *farg) { struct llentries *dchain; dchain = (struct llentries *)farg; LLE_WLOCK(lle); LIST_INSERT_HEAD(dchain, lle, lle_chain); return (0); } /* * Free all entries from given table. */ void lltable_purge_entries(struct lltable *llt) { struct llentry *lle, *next; struct llentries dchain; KASSERTMSG(llt != NULL, "llt is NULL"); LIST_INIT(&dchain); IF_AFDATA_WLOCK(llt->llt_ifp); /* Push all lles to @dchain */ lltable_foreach_lle(llt, lltable_free_cb, &dchain); llentries_unlink(llt, &dchain); IF_AFDATA_WUNLOCK(llt->llt_ifp); LIST_FOREACH_SAFE(lle, &dchain, lle_chain, next) (void)llentry_free(lle); } /* * Free all entries from given table and free itself.
*/ void lltable_free(struct lltable *llt) { KASSERTMSG(llt != NULL, "llt is NULL"); lltable_unlink(llt); lltable_purge_entries(llt); llt->llt_free_tbl(llt); } void lltable_drain(int af) { struct lltable *llt; struct llentry *lle; register int i; LLTABLE_RLOCK(); SLIST_FOREACH(llt, &lltables, llt_link) { if (llt->llt_af != af) continue; for (i=0; i < llt->llt_hsize; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { LLE_WLOCK(lle); lltable_drop_entry_queue(lle); LLE_WUNLOCK(lle); } } } LLTABLE_RUNLOCK(); } void lltable_prefix_free(const int af, const struct sockaddr *prefix, const struct sockaddr *mask, const u_int flags) { struct lltable *llt; LLTABLE_RLOCK(); SLIST_FOREACH(llt, &lltables, llt_link) { if (llt->llt_af != af) continue; llt->llt_prefix_free(llt, prefix, mask, flags); } LLTABLE_RUNLOCK(); } struct lltable * lltable_allocate_htbl(uint32_t hsize) { struct lltable *llt; int i; llt = malloc(sizeof(struct lltable), M_LLTABLE, M_WAITOK | M_ZERO); llt->llt_hsize = hsize; llt->lle_head = malloc(sizeof(struct llentries) * hsize, M_LLTABLE, M_WAITOK | M_ZERO); for (i = 0; i < llt->llt_hsize; i++) LIST_INIT(&llt->lle_head[i]); /* Set some default callbacks */ llt->llt_link_entry = htable_link_entry; llt->llt_unlink_entry = htable_unlink_entry; llt->llt_prefix_free = htable_prefix_free; llt->llt_foreach_entry = htable_foreach_lle; llt->llt_free_tbl = htable_free_tbl; #ifdef MBUFTRACE llt->llt_mowner = NULL; #endif return (llt); } /* * Links lltable to global llt list. */ void lltable_link(struct lltable *llt) { LLTABLE_WLOCK(); SLIST_INSERT_HEAD(&lltables, llt, llt_link); LLTABLE_WUNLOCK(); } static void lltable_unlink(struct lltable *llt) { LLTABLE_WLOCK(); SLIST_REMOVE(&lltables, llt, lltable, llt_link); LLTABLE_WUNLOCK(); } /* * External methods used by lltable consumers */ int lltable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg) { return (llt->llt_foreach_entry(llt, f, farg)); } void lltable_link_entry(struct lltable *llt, struct llentry *lle) { llt->llt_link_entry(llt, lle); } void lltable_unlink_entry(struct lltable *llt, struct llentry *lle) { llt->llt_unlink_entry(lle); } void lltable_free_entry(struct lltable *llt, struct llentry *lle) { llt->llt_free_entry(llt, lle); } void lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct lltable *llt; llt = lle->lle_tbl; llt->llt_fill_sa_entry(lle, sa); } struct ifnet * lltable_get_ifp(const struct lltable *llt) { return (llt->llt_ifp); } int lltable_get_af(const struct lltable *llt) { return (llt->llt_af); } /* * Called in route_output when rtm_flags contains RTF_LLDATA. */ int lla_rt_output(const u_char rtm_type, const int rtm_flags, const time_t rtm_expire, struct rt_addrinfo *info, int sdl_index) { const struct sockaddr_dl *dl = satocsdl(info->rti_info[RTAX_GATEWAY]); const struct sockaddr *dst = info->rti_info[RTAX_DST]; struct ifnet *ifp; struct lltable *llt; struct llentry *lle; u_int laflags; int error; struct psref psref; int bound; KASSERTMSG(dl != NULL && dl->sdl_family == AF_LINK, "invalid dl"); bound = curlwp_bind(); if (sdl_index != 0) ifp = if_get_byindex(sdl_index, &psref); else ifp = if_get_byindex(dl->sdl_index, &psref); if (ifp == NULL) { curlwp_bindx(bound); log(LOG_INFO, "%s: invalid ifp (sdl_index %d)\n", __func__, sdl_index != 0 ? 
sdl_index : dl->sdl_index); return EINVAL; } /* XXX linked list may be too expensive */ LLTABLE_RLOCK(); SLIST_FOREACH(llt, &lltables, llt_link) { if (llt->llt_af == dst->sa_family && llt->llt_ifp == ifp) break; } LLTABLE_RUNLOCK(); KASSERTMSG(llt != NULL, "Yep, ugly hacks are bad"); error = 0; switch (rtm_type) { case RTM_ADD: { struct rtentry *rt; /* Never call rtalloc1 with IF_AFDATA_WLOCK */ rt = rtalloc1(dst, 0); /* Add static LLE */ IF_AFDATA_WLOCK(ifp); lle = lla_lookup(llt, LLE_EXCLUSIVE, dst); /* Cannot overwrite an existing static entry */ if (lle != NULL && (lle->la_flags & LLE_STATIC || lle->la_expire == 0)) { LLE_RUNLOCK(lle); IF_AFDATA_WUNLOCK(ifp); if (rt != NULL) rt_unref(rt); error = EEXIST; goto out; } /* * We can't overwrite an existing entry to avoid race * conditions so remove it first. */ if (lle != NULL) { #if defined(INET) && NARP > 0 size_t pkts_dropped = llentry_free(lle); if (dst->sa_family == AF_INET) { arp_stat_add(ARP_STAT_DFRDROPPED, (uint64_t)pkts_dropped); } #else (void) llentry_free(lle); #endif } lle = lla_create(llt, 0, dst, rt); if (lle == NULL) { IF_AFDATA_WUNLOCK(ifp); if (rt != NULL) rt_unref(rt); error = ENOMEM; goto out; } KASSERT(ifp->if_addrlen <= sizeof(lle->ll_addr)); memcpy(&lle->ll_addr, CLLADDR(dl), ifp->if_addrlen); if ((rtm_flags & RTF_ANNOUNCE)) lle->la_flags |= LLE_PUB; lle->la_flags |= LLE_VALID; switch (dst->sa_family) { #ifdef INET case AF_INET: lle->ln_state = ND_LLINFO_REACHABLE; break; #endif #ifdef INET6 case AF_INET6: lle->ln_state = ND_LLINFO_REACHABLE; break; #endif } /* * NB: arp and ndp always set (RTF_STATIC | RTF_HOST) */ if (rtm_expire == 0) { lle->la_flags |= LLE_STATIC; lle->la_expire = 0; } else lle->la_expire = rtm_expire; laflags = lle->la_flags; LLE_WUNLOCK(lle); IF_AFDATA_WUNLOCK(ifp); if (rt != NULL) rt_unref(rt); #if defined(INET) && NARP > 0 /* gratuitous ARP */ if ((laflags & LLE_PUB) && dst->sa_family == AF_INET) { const struct sockaddr_in *sin; struct in_ifaddr *ia; struct psref _psref; sin = satocsin(dst); ia = in_get_ia_on_iface_psref(sin->sin_addr, ifp, &_psref); if (ia != NULL) { arpannounce(ifp, &ia->ia_ifa, CLLADDR(dl)); ia4_release(ia, &_psref); } } #else (void)laflags; #endif break; } case RTM_DELETE: IF_AFDATA_WLOCK(ifp); error = lla_delete(llt, 0, dst); IF_AFDATA_WUNLOCK(ifp); error = (error == 0 ? 
0 : ENOENT); break; default: error = EINVAL; } out: if_put(ifp, &psref); curlwp_bindx(bound); return (error); } void lltableinit(void) { SLIST_INIT(&lltables); rw_init(&lltable_rwlock); pool_init(&llentry_pool, sizeof(struct llentry), 0, 0, 0, "llentrypl", NULL, IPL_SOFTNET); } #ifdef __FreeBSD__ #ifdef DDB struct llentry_sa { struct llentry base; struct sockaddr l3_addr; }; static void llatbl_lle_show(struct llentry_sa *la) { struct llentry *lle; uint8_t octet[6]; lle = &la->base; db_printf("lle=%p\n", lle); db_printf(" lle_next=%p\n", lle->lle_next.le_next); db_printf(" lle_lock=%p\n", &lle->lle_lock); db_printf(" lle_tbl=%p\n", lle->lle_tbl); db_printf(" lle_head=%p\n", lle->lle_head); db_printf(" la_hold=%p\n", lle->la_hold); db_printf(" la_numheld=%d\n", lle->la_numheld); db_printf(" la_expire=%ju\n", (uintmax_t)lle->la_expire); db_printf(" la_flags=0x%04x\n", lle->la_flags); db_printf(" la_asked=%u\n", lle->la_asked); db_printf(" la_preempt=%u\n", lle->la_preempt); db_printf(" ln_byhint=%u\n", lle->ln_byhint); db_printf(" ln_state=%d\n", lle->ln_state); db_printf(" ln_router=%u\n", lle->ln_router); db_printf(" ln_ntick=%ju\n", (uintmax_t)lle->ln_ntick); db_printf(" lle_refcnt=%d\n", lle->lle_refcnt); memcpy(octet, &lle->ll_addr.mac16, sizeof(octet)); db_printf(" ll_addr=%02x:%02x:%02x:%02x:%02x:%02x\n", octet[0], octet[1], octet[2], octet[3], octet[4], octet[5]); db_printf(" lle_timer=%p\n", &lle->lle_timer); switch (la->l3_addr.sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; char l3s[INET_ADDRSTRLEN]; sin = (struct sockaddr_in *)&la->l3_addr; inet_ntoa_r(sin->sin_addr, l3s); db_printf(" l3_addr=%s\n", l3s); break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; char l3s[INET6_ADDRSTRLEN]; sin6 = (struct sockaddr_in6 *)&la->l3_addr; IN6_PRINT(l3s, &sin6->sin6_addr); db_printf(" l3_addr=%s\n", l3s); break; } #endif default: db_printf(" l3_addr=N/A (af=%d)\n", la->l3_addr.sa_family); break; } } DB_SHOW_COMMAND(llentry, db_show_llentry) { if (!have_addr) { db_printf("usage: show llentry <struct llentry *>\n"); return; } llatbl_lle_show((struct llentry_sa *)addr); } static void llatbl_llt_show(struct lltable *llt) { int i; struct llentry *lle; db_printf("llt=%p llt_af=%d llt_ifp=%p\n", llt, llt->llt_af, llt->llt_ifp); for (i = 0; i < llt->llt_hsize; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { llatbl_lle_show((struct llentry_sa *)lle); if (db_pager_quit) return; } } } DB_SHOW_COMMAND(lltable, db_show_lltable) { if (!have_addr) { db_printf("usage: show lltable <struct lltable *>\n"); return; } llatbl_llt_show((struct lltable *)addr); } DB_SHOW_ALL_COMMAND(lltables, db_show_all_lltables) { VNET_ITERATOR_DECL(vnet_iter); struct lltable *llt; VNET_FOREACH(vnet_iter) { CURVNET_SET_QUIET(vnet_iter); #ifdef VIMAGE db_printf("vnet=%p\n", curvnet); #endif SLIST_FOREACH(llt, &lltables, llt_link) { db_printf("llt=%p llt_af=%d llt_ifp=%p(%s)\n", llt, llt->llt_af, llt->llt_ifp, (llt->llt_ifp != NULL) ? llt->llt_ifp->if_xname : "?"); if (have_addr && addr != 0) /* verbose */ llatbl_llt_show(llt); if (db_pager_quit) { CURVNET_RESTORE(); return; } } CURVNET_RESTORE(); } } #endif /* DDB */ #endif /* __FreeBSD__ */
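/*
 * Editor's note: the following fragment is not part of if_llatbl.c.  It is
 * a minimal, illustrative sketch of the lookup pattern used by consumers of
 * this table (compare llentry_alloc() above): take the af-data lock, look
 * the entry up write-locked, copy out the link-layer address and drop the
 * entry lock.  "example_resolve" and its error handling are hypothetical.
 */
static int
example_resolve(struct ifnet *ifp, struct lltable *llt,
    const struct sockaddr *dst, uint8_t *lladdr)
{
	struct llentry *lle;

	IF_AFDATA_RLOCK(ifp);
	lle = lla_lookup(llt, LLE_EXCLUSIVE, dst);
	IF_AFDATA_RUNLOCK(ifp);
	if (lle == NULL)
		return EHOSTUNREACH;
	if ((lle->la_flags & LLE_VALID) == 0) {
		/* Entry exists but has no link-layer address yet. */
		LLE_WUNLOCK(lle);
		return EHOSTUNREACH;
	}
	memcpy(lladdr, &lle->ll_addr, ifp->if_addrlen);
	LLE_WUNLOCK(lle);
	return 0;
}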
/* $NetBSD: kern_cpu.c,v 1.97 2023/09/02 17:44:59 riastradh Exp $ */ /*- * Copyright (c) 2007, 2008, 2009, 2010, 2012, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c)2007 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * CPU related routines not shared with rump. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.97 2023/09/02 17:44:59 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_cpu_ucode.h" #include "opt_heartbeat.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/idle.h> #include <sys/sched.h> #include <sys/intr.h> #include <sys/conf.h> #include <sys/cpu.h> #include <sys/cpuio.h> #include <sys/proc.h> #include <sys/percpu.h> #include <sys/kernel.h> #include <sys/kauth.h> #include <sys/xcall.h> #include <sys/pool.h> #include <sys/kmem.h> #include <sys/select.h> #include <sys/namei.h> #include <sys/callout.h> #include <sys/pcu.h> #include <sys/heartbeat.h> #include <uvm/uvm_extern.h> #include "ioconf.h" /* * If the port has stated that cpu_data is the first thing in cpu_info, * verify that the claim is true. This will prevent them from getting out * of sync. */ #ifdef __HAVE_CPU_DATA_FIRST CTASSERT(offsetof(struct cpu_info, ci_data) == 0); #else CTASSERT(offsetof(struct cpu_info, ci_data) != 0); #endif int (*compat_cpuctl_ioctl)(struct lwp *, u_long, void *) = (void *)enosys; static void cpu_xc_online(struct cpu_info *, void *); static void cpu_xc_offline(struct cpu_info *, void *); dev_type_ioctl(cpuctl_ioctl); const struct cdevsw cpuctl_cdevsw = { .d_open = nullopen, .d_close = nullclose, .d_read = nullread, .d_write = nullwrite, .d_ioctl = cpuctl_ioctl, .d_stop = nullstop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE }; int mi_cpu_attach(struct cpu_info *ci) { int error; KASSERT(maxcpus > 0); if ((ci->ci_index = ncpu) >= maxcpus) panic("Too many CPUs. 
Increase MAXCPUS?"); kcpuset_set(kcpuset_attached, cpu_index(ci)); /* * Create a convenience cpuset of just ourselves. */ kcpuset_create(&ci->ci_kcpuset, true); kcpuset_set(ci->ci_kcpuset, cpu_index(ci)); TAILQ_INIT(&ci->ci_data.cpu_ld_locks); __cpu_simple_lock_init(&ci->ci_data.cpu_ld_lock); /* This is useful for eg, per-cpu evcnt */ snprintf(ci->ci_data.cpu_name, sizeof(ci->ci_data.cpu_name), "cpu%d", cpu_index(ci)); if (__predict_false(cpu_infos == NULL)) { size_t ci_bufsize = (maxcpus + 1) * sizeof(struct cpu_info *); cpu_infos = kmem_zalloc(ci_bufsize, KM_SLEEP); } cpu_infos[cpu_index(ci)] = ci; sched_cpuattach(ci); error = create_idle_lwp(ci); if (error != 0) { /* XXX revert sched_cpuattach */ return error; } if (ci == curcpu()) ci->ci_onproc = curlwp; else ci->ci_onproc = ci->ci_data.cpu_idlelwp; percpu_init_cpu(ci); softint_init(ci); callout_init_cpu(ci); xc_init_cpu(ci); pool_cache_cpu_init(ci); selsysinit(ci); cache_cpu_init(ci); TAILQ_INIT(&ci->ci_data.cpu_biodone); ncpu++; ncpuonline++; return 0; } void cpuctlattach(int dummy __unused) { KASSERT(cpu_infos != NULL); } int cpuctl_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l) { CPU_INFO_ITERATOR cii; cpustate_t *cs; struct cpu_info *ci; int error, i; u_int id; error = 0; mutex_enter(&cpu_lock); switch (cmd) { case IOC_CPU_SETSTATE: cs = data; error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CPU, KAUTH_REQ_SYSTEM_CPU_SETSTATE, cs, NULL, NULL); if (error != 0) break; if (cs->cs_id >= maxcpus || (ci = cpu_lookup(cs->cs_id)) == NULL) { error = ESRCH; break; } cpu_setintr(ci, cs->cs_intr); /* XXX neglect errors */ error = cpu_setstate(ci, cs->cs_online); break; case IOC_CPU_GETSTATE: cs = data; id = cs->cs_id; memset(cs, 0, sizeof(*cs)); cs->cs_id = id; if (cs->cs_id >= maxcpus || (ci = cpu_lookup(id)) == NULL) { error = ESRCH; break; } if ((ci->ci_schedstate.spc_flags & SPCF_OFFLINE) != 0) cs->cs_online = false; else cs->cs_online = true; if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) cs->cs_intr = false; else cs->cs_intr = true; cs->cs_lastmod = (int32_t)ci->ci_schedstate.spc_lastmod; cs->cs_lastmodhi = (int32_t) (ci->ci_schedstate.spc_lastmod >> 32); cs->cs_intrcnt = cpu_intr_count(ci) + 1; cs->cs_hwid = ci->ci_cpuid; break; case IOC_CPU_MAPID: i = 0; for (CPU_INFO_FOREACH(cii, ci)) { if (i++ == *(int *)data) break; } if (ci == NULL) error = ESRCH; else *(int *)data = cpu_index(ci); break; case IOC_CPU_GETCOUNT: *(int *)data = ncpu; break; #ifdef CPU_UCODE case IOC_CPU_UCODE_GET_VERSION: error = cpu_ucode_get_version((struct cpu_ucode_version *)data); break; case IOC_CPU_UCODE_APPLY: error = kauth_authorize_machdep(l->l_cred, KAUTH_MACHDEP_CPU_UCODE_APPLY, NULL, NULL, NULL, NULL); if (error != 0) break; error = cpu_ucode_apply((const struct cpu_ucode *)data); break; #endif default: error = (*compat_cpuctl_ioctl)(l, cmd, data); break; } mutex_exit(&cpu_lock); return error; } struct cpu_info * cpu_lookup(u_int idx) { struct cpu_info *ci; /* * cpu_infos is a NULL terminated array of MAXCPUS + 1 entries, * so an index of MAXCPUS here is ok. See mi_cpu_attach. 
*/ KASSERT(idx <= maxcpus); if (__predict_false(cpu_infos == NULL)) { KASSERT(idx == 0); return curcpu(); } ci = cpu_infos[idx]; KASSERT(ci == NULL || cpu_index(ci) == idx); KASSERTMSG(idx < maxcpus || ci == NULL, "idx %d ci %p", idx, ci); return ci; } static void cpu_xc_offline(struct cpu_info *ci, void *unused) { struct schedstate_percpu *spc, *mspc = NULL; struct cpu_info *target_ci; struct lwp *l; CPU_INFO_ITERATOR cii; int s; /* * Thread that made the cross call (separate context) holds * cpu_lock on our behalf. */ spc = &ci->ci_schedstate; s = splsched(); spc->spc_flags |= SPCF_OFFLINE; splx(s); /* Take the first available CPU for the migration. */ for (CPU_INFO_FOREACH(cii, target_ci)) { mspc = &target_ci->ci_schedstate; if ((mspc->spc_flags & SPCF_OFFLINE) == 0) break; } KASSERT(target_ci != NULL); /* * Migrate all non-bound threads to the other CPU. Note that this * runs from the xcall thread, thus handling of LSONPROC is not needed. */ mutex_enter(&proc_lock); LIST_FOREACH(l, &alllwp, l_list) { struct cpu_info *mci; lwp_lock(l); if (l->l_cpu != ci || (l->l_pflag & (LP_BOUND | LP_INTR))) { lwp_unlock(l); continue; } /* Regular case - no affinity. */ if (l->l_affinity == NULL) { lwp_migrate(l, target_ci); continue; } /* Affinity is set, find an online CPU in the set. */ for (CPU_INFO_FOREACH(cii, mci)) { mspc = &mci->ci_schedstate; if ((mspc->spc_flags & SPCF_OFFLINE) == 0 && kcpuset_isset(l->l_affinity, cpu_index(mci))) break; } if (mci == NULL) { lwp_unlock(l); mutex_exit(&proc_lock); goto fail; } lwp_migrate(l, mci); } mutex_exit(&proc_lock); #if PCU_UNIT_COUNT > 0 pcu_save_all_on_cpu(); #endif heartbeat_suspend(); #ifdef __HAVE_MD_CPU_OFFLINE cpu_offline_md(); #endif return; fail: /* Just unset the SPCF_OFFLINE flag, caller will check */ s = splsched(); spc->spc_flags &= ~SPCF_OFFLINE; splx(s); } static void cpu_xc_online(struct cpu_info *ci, void *unused) { struct schedstate_percpu *spc; int s; heartbeat_resume(); spc = &ci->ci_schedstate; s = splsched(); spc->spc_flags &= ~SPCF_OFFLINE; splx(s); } int cpu_setstate(struct cpu_info *ci, bool online) { struct schedstate_percpu *spc; CPU_INFO_ITERATOR cii; struct cpu_info *ci2; uint64_t where; xcfunc_t func; int nonline; spc = &ci->ci_schedstate; KASSERT(mutex_owned(&cpu_lock)); if (online) { if ((spc->spc_flags & SPCF_OFFLINE) == 0) return 0; func = (xcfunc_t)cpu_xc_online; } else { if ((spc->spc_flags & SPCF_OFFLINE) != 0) return 0; nonline = 0; /* * Ensure that at least one CPU within the processor set * stays online. Revisit this later. 
*/ for (CPU_INFO_FOREACH(cii, ci2)) { if ((ci2->ci_schedstate.spc_flags & SPCF_OFFLINE) != 0) continue; if (ci2->ci_schedstate.spc_psid != spc->spc_psid) continue; nonline++; } if (nonline == 1) return EBUSY; func = (xcfunc_t)cpu_xc_offline; } where = xc_unicast(0, func, ci, NULL, ci); xc_wait(where); if (online) { KASSERT((spc->spc_flags & SPCF_OFFLINE) == 0); ncpuonline++; } else { if ((spc->spc_flags & SPCF_OFFLINE) == 0) { /* If was not set offline, then it is busy */ return EBUSY; } ncpuonline--; } spc->spc_lastmod = time_second; return 0; } #if defined(__HAVE_INTR_CONTROL) static void cpu_xc_intr(struct cpu_info *ci, void *unused) { struct schedstate_percpu *spc; int s; spc = &ci->ci_schedstate; s = splsched(); spc->spc_flags &= ~SPCF_NOINTR; splx(s); } static void cpu_xc_nointr(struct cpu_info *ci, void *unused) { struct schedstate_percpu *spc; int s; spc = &ci->ci_schedstate; s = splsched(); spc->spc_flags |= SPCF_NOINTR; splx(s); } int cpu_setintr(struct cpu_info *ci, bool intr) { struct schedstate_percpu *spc; CPU_INFO_ITERATOR cii; struct cpu_info *ci2; uint64_t where; xcfunc_t func; int nintr; spc = &ci->ci_schedstate; KASSERT(mutex_owned(&cpu_lock)); if (intr) { if ((spc->spc_flags & SPCF_NOINTR) == 0) return 0; func = (xcfunc_t)cpu_xc_intr; } else { if (CPU_IS_PRIMARY(ci)) /* XXX kern/45117 */ return EINVAL; if ((spc->spc_flags & SPCF_NOINTR) != 0) return 0; /* * Ensure that at least one CPU within the system * is handing device interrupts. */ nintr = 0; for (CPU_INFO_FOREACH(cii, ci2)) { if ((ci2->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) continue; if (ci2 == ci) continue; nintr++; } if (nintr == 0) return EBUSY; func = (xcfunc_t)cpu_xc_nointr; } where = xc_unicast(0, func, ci, NULL, ci); xc_wait(where); if (intr) { KASSERT((spc->spc_flags & SPCF_NOINTR) == 0); } else if ((spc->spc_flags & SPCF_NOINTR) == 0) { /* If was not set offline, then it is busy */ return EBUSY; } /* Direct interrupts away from the CPU and record the change. */ cpu_intr_redistribute(); spc->spc_lastmod = time_second; return 0; } #else /* __HAVE_INTR_CONTROL */ int cpu_setintr(struct cpu_info *ci, bool intr) { return EOPNOTSUPP; } u_int cpu_intr_count(struct cpu_info *ci) { return 0; /* 0 == "don't know" */ } #endif /* __HAVE_INTR_CONTROL */ #ifdef CPU_UCODE int cpu_ucode_load(struct cpu_ucode_softc *sc, const char *fwname) { firmware_handle_t fwh; int error; if (sc->sc_blob != NULL) { firmware_free(sc->sc_blob, sc->sc_blobsize); sc->sc_blob = NULL; sc->sc_blobsize = 0; } error = cpu_ucode_md_open(&fwh, sc->loader_version, fwname); if (error != 0) { #ifdef DEBUG printf("ucode: firmware_open(%s) failed: %i\n", fwname, error); #endif goto err0; } sc->sc_blobsize = firmware_get_size(fwh); if (sc->sc_blobsize == 0) { error = EFTYPE; firmware_close(fwh); goto err0; } sc->sc_blob = firmware_malloc(sc->sc_blobsize); if (sc->sc_blob == NULL) { error = ENOMEM; firmware_close(fwh); goto err0; } error = firmware_read(fwh, 0, sc->sc_blob, sc->sc_blobsize); firmware_close(fwh); if (error != 0) goto err1; return 0; err1: firmware_free(sc->sc_blob, sc->sc_blobsize); sc->sc_blob = NULL; sc->sc_blobsize = 0; err0: return error; } #endif
/* $NetBSD: kern_resource.c,v 1.195 2023/10/04 20:28:06 ad Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_resource.c 8.8 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_resource.c,v 1.195 2023/10/04 20:28:06 ad Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/resourcevar.h> #include <sys/kmem.h> #include <sys/namei.h> #include <sys/pool.h> #include <sys/proc.h> #include <sys/sysctl.h> #include <sys/timevar.h> #include <sys/kauth.h> #include <sys/atomic.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <sys/atomic.h> #include <uvm/uvm_extern.h> /* * Maximum process data and stack limits. * They are variables so they are patchable. */ rlim_t maxdmap = MAXDSIZ; rlim_t maxsmap = MAXSSIZ; static kauth_listener_t resource_listener; static struct sysctllog *proc_sysctllog; static int donice(struct lwp *, struct proc *, int); static void sysctl_proc_setup(void); static int resource_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { struct proc *p; int result; result = KAUTH_RESULT_DEFER; p = arg0; switch (action) { case KAUTH_PROCESS_NICE: if (kauth_cred_geteuid(cred) != kauth_cred_geteuid(p->p_cred) && kauth_cred_getuid(cred) != kauth_cred_geteuid(p->p_cred)) { break; } if ((u_long)arg1 >= p->p_nice) result = KAUTH_RESULT_ALLOW; break; case KAUTH_PROCESS_RLIMIT: { enum kauth_process_req req; req = (enum kauth_process_req)(uintptr_t)arg1; switch (req) { case KAUTH_REQ_PROCESS_RLIMIT_GET: result = KAUTH_RESULT_ALLOW; break; case KAUTH_REQ_PROCESS_RLIMIT_SET: { struct rlimit *new_rlimit; u_long which; if ((p != curlwp->l_proc) && (proc_uidmatch(cred, p->p_cred) != 0)) break; new_rlimit = arg2; which = (u_long)arg3; if (new_rlimit->rlim_max <= p->p_rlimit[which].rlim_max) result = KAUTH_RESULT_ALLOW; break; } default: break; } break; } default: break; } return result; } void resource_init(void) { resource_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS, resource_listener_cb, NULL); sysctl_proc_setup(); } /* * Resource controls and accounting. */ int sys_getpriority(struct lwp *l, const struct sys_getpriority_args *uap, register_t *retval) { /* { syscallarg(int) which; syscallarg(id_t) who; } */ struct proc *curp = l->l_proc, *p; id_t who = SCARG(uap, who); int low = NZERO + PRIO_MAX + 1; mutex_enter(&proc_lock); switch (SCARG(uap, which)) { case PRIO_PROCESS: p = who ? 
proc_find(who) : curp; if (p != NULL) low = p->p_nice; break; case PRIO_PGRP: { struct pgrp *pg; if (who == 0) pg = curp->p_pgrp; else if ((pg = pgrp_find(who)) == NULL) break; LIST_FOREACH(p, &pg->pg_members, p_pglist) { if (p->p_nice < low) low = p->p_nice; } break; } case PRIO_USER: if (who == 0) who = (int)kauth_cred_geteuid(l->l_cred); PROCLIST_FOREACH(p, &allproc) { mutex_enter(p->p_lock); if (kauth_cred_geteuid(p->p_cred) == (uid_t)who && p->p_nice < low) low = p->p_nice; mutex_exit(p->p_lock); } break; default: mutex_exit(&proc_lock); return EINVAL; } mutex_exit(&proc_lock); if (low == NZERO + PRIO_MAX + 1) { return ESRCH; } *retval = low - NZERO; return 0; } int sys_setpriority(struct lwp *l, const struct sys_setpriority_args *uap, register_t *retval) { /* { syscallarg(int) which; syscallarg(id_t) who; syscallarg(int) prio; } */ struct proc *curp = l->l_proc, *p; id_t who = SCARG(uap, who); int found = 0, error = 0; mutex_enter(&proc_lock); switch (SCARG(uap, which)) { case PRIO_PROCESS: p = who ? proc_find(who) : curp; if (p != NULL) { mutex_enter(p->p_lock); found++; error = donice(l, p, SCARG(uap, prio)); mutex_exit(p->p_lock); } break; case PRIO_PGRP: { struct pgrp *pg; if (who == 0) pg = curp->p_pgrp; else if ((pg = pgrp_find(who)) == NULL) break; LIST_FOREACH(p, &pg->pg_members, p_pglist) { mutex_enter(p->p_lock); found++; error = donice(l, p, SCARG(uap, prio)); mutex_exit(p->p_lock); if (error) break; } break; } case PRIO_USER: if (who == 0) who = (int)kauth_cred_geteuid(l->l_cred); PROCLIST_FOREACH(p, &allproc) { mutex_enter(p->p_lock); if (kauth_cred_geteuid(p->p_cred) == (uid_t)SCARG(uap, who)) { found++; error = donice(l, p, SCARG(uap, prio)); } mutex_exit(p->p_lock); if (error) break; } break; default: mutex_exit(&proc_lock); return EINVAL; } mutex_exit(&proc_lock); return (found == 0) ? ESRCH : error; } /* * Renice a process. * * Call with the target process' credentials locked. */ static int donice(struct lwp *l, struct proc *chgp, int n) { kauth_cred_t cred = l->l_cred; KASSERT(mutex_owned(chgp->p_lock)); if (kauth_cred_geteuid(cred) && kauth_cred_getuid(cred) && kauth_cred_geteuid(cred) != kauth_cred_geteuid(chgp->p_cred) && kauth_cred_getuid(cred) != kauth_cred_geteuid(chgp->p_cred)) return EPERM; if (n > PRIO_MAX) { n = PRIO_MAX; } if (n < PRIO_MIN) { n = PRIO_MIN; } n += NZERO; if (kauth_authorize_process(cred, KAUTH_PROCESS_NICE, chgp, KAUTH_ARG(n), NULL, NULL)) { return EACCES; } sched_nice(chgp, n); return 0; } int sys_setrlimit(struct lwp *l, const struct sys_setrlimit_args *uap, register_t *retval) { /* { syscallarg(int) which; syscallarg(const struct rlimit *) rlp; } */ int error, which = SCARG(uap, which); struct rlimit alim; error = copyin(SCARG(uap, rlp), &alim, sizeof(struct rlimit)); if (error) { return error; } return dosetrlimit(l, l->l_proc, which, &alim); } int dosetrlimit(struct lwp *l, struct proc *p, int which, struct rlimit *limp) { struct rlimit *alimp; int error; if ((u_int)which >= RLIM_NLIMITS) return EINVAL; if (limp->rlim_cur > limp->rlim_max) { /* * This is programming error. According to SUSv2, we should * return error in this case. 
*/ return EINVAL; } alimp = &p->p_rlimit[which]; /* if we don't change the value, no need to limcopy() */ if (limp->rlim_cur == alimp->rlim_cur && limp->rlim_max == alimp->rlim_max) return 0; error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT, p, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_SET), limp, KAUTH_ARG(which)); if (error) return error; lim_privatise(p); /* p->p_limit is now unchangeable */ alimp = &p->p_rlimit[which]; switch (which) { case RLIMIT_DATA: if (limp->rlim_cur > maxdmap) limp->rlim_cur = maxdmap; if (limp->rlim_max > maxdmap) limp->rlim_max = maxdmap; break; case RLIMIT_STACK: if (limp->rlim_cur > maxsmap) limp->rlim_cur = maxsmap; if (limp->rlim_max > maxsmap) limp->rlim_max = maxsmap; /* * Return EINVAL if the new stack size limit is lower than * current usage. Otherwise, the process would get SIGSEGV the * moment it would try to access anything on its current stack. * This conforms to SUSv2. */ if (btoc(limp->rlim_cur) < p->p_vmspace->vm_ssize || btoc(limp->rlim_max) < p->p_vmspace->vm_ssize) { return EINVAL; } /* * Stack is allocated to the max at exec time with * only "rlim_cur" bytes accessible (In other words, * allocates stack dividing two contiguous regions at * "rlim_cur" bytes boundary). * * Since allocation is done in terms of page, roundup * "rlim_cur" (otherwise, contiguous regions * overlap). If stack limit is going up make more * accessible, if going down make inaccessible. */ limp->rlim_max = round_page(limp->rlim_max); limp->rlim_cur = round_page(limp->rlim_cur); if (limp->rlim_cur != alimp->rlim_cur) { vaddr_t addr; vsize_t size; vm_prot_t prot; char *base, *tmp; base = p->p_vmspace->vm_minsaddr; if (limp->rlim_cur > alimp->rlim_cur) { prot = VM_PROT_READ | VM_PROT_WRITE; size = limp->rlim_cur - alimp->rlim_cur; tmp = STACK_GROW(base, alimp->rlim_cur); } else { prot = VM_PROT_NONE; size = alimp->rlim_cur - limp->rlim_cur; tmp = STACK_GROW(base, limp->rlim_cur); } addr = (vaddr_t)STACK_ALLOC(tmp, size); (void) uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, false); } break; case RLIMIT_NOFILE: if (limp->rlim_cur > maxfiles) limp->rlim_cur = maxfiles; if (limp->rlim_max > maxfiles) limp->rlim_max = maxfiles; break; case RLIMIT_NPROC: if (limp->rlim_cur > maxproc) limp->rlim_cur = maxproc; if (limp->rlim_max > maxproc) limp->rlim_max = maxproc; break; case RLIMIT_NTHR: if (limp->rlim_cur > maxlwp) limp->rlim_cur = maxlwp; if (limp->rlim_max > maxlwp) limp->rlim_max = maxlwp; break; } mutex_enter(&p->p_limit->pl_lock); *alimp = *limp; mutex_exit(&p->p_limit->pl_lock); return 0; } int sys_getrlimit(struct lwp *l, const struct sys_getrlimit_args *uap, register_t *retval) { /* { syscallarg(int) which; syscallarg(struct rlimit *) rlp; } */ struct proc *p = l->l_proc; int which = SCARG(uap, which); struct rlimit rl; if ((u_int)which >= RLIM_NLIMITS) return EINVAL; mutex_enter(p->p_lock); memcpy(&rl, &p->p_rlimit[which], sizeof(rl)); mutex_exit(p->p_lock); return copyout(&rl, SCARG(uap, rlp), sizeof(rl)); } void addrulwp(struct lwp *l, struct bintime *tm) { lwp_lock(l); bintime_add(tm, &l->l_rtime); if ((l->l_pflag & LP_RUNNING) != 0 && (l->l_pflag & (LP_INTR | LP_TIMEINTR)) != LP_INTR) { struct bintime diff; /* * Adjust for the current time slice. This is * actually fairly important since the error * here is on the order of a time quantum, * which is much greater than the sampling * error. 
*/ binuptime(&diff); membar_consumer(); /* for softint_dispatch() */ bintime_sub(&diff, &l->l_stime); bintime_add(tm, &diff); } lwp_unlock(l); } /* * Transform the running time and tick information in proc p into user, * system, and interrupt time usage. * * Should be called with p->p_lock held unless called from exit1(). */ void calcru(struct proc *p, struct timeval *up, struct timeval *sp, struct timeval *ip, struct timeval *rp) { uint64_t u, st, ut, it, tot, dt; struct lwp *l; struct bintime tm; struct timeval tv; KASSERT(p->p_stat == SDEAD || mutex_owned(p->p_lock)); mutex_spin_enter(&p->p_stmutex); st = p->p_sticks; ut = p->p_uticks; it = p->p_iticks; mutex_spin_exit(&p->p_stmutex); tm = p->p_rtime; LIST_FOREACH(l, &p->p_lwps, l_sibling) { addrulwp(l, &tm); } tot = st + ut + it; bintime2timeval(&tm, &tv); u = (uint64_t)tv.tv_sec * 1000000ul + tv.tv_usec; if (tot == 0) { /* No ticks, so can't use to share time out, split 50-50 */ st = ut = u / 2; } else { st = (u * st) / tot; ut = (u * ut) / tot; } /* * Try to avoid lying to the users (too much) * * Of course, user/sys time are based on sampling (ie: statistics) * so that would be impossible, but convincing the mark * that we have used less ?time this call than we had * last time, is beyond reasonable... (the con fails!) * * Note that since actual used time cannot decrease, either * utime or stime (or both) must be greater now than last time * (or both the same) - if one seems to have decreased, hold * it constant and steal the necessary bump from the other * which must have increased. */ if (p->p_xutime > ut) { dt = p->p_xutime - ut; st -= uimin(dt, st); ut = p->p_xutime; } else if (p->p_xstime > st) { dt = p->p_xstime - st; ut -= uimin(dt, ut); st = p->p_xstime; } if (sp != NULL) { p->p_xstime = st; sp->tv_sec = st / 1000000; sp->tv_usec = st % 1000000; } if (up != NULL) { p->p_xutime = ut; up->tv_sec = ut / 1000000; up->tv_usec = ut % 1000000; } if (ip != NULL) { if (it != 0) /* it != 0 --> tot != 0 */ it = (u * it) / tot; ip->tv_sec = it / 1000000; ip->tv_usec = it % 1000000; } if (rp != NULL) { *rp = tv; } } int sys___getrusage50(struct lwp *l, const struct sys___getrusage50_args *uap, register_t *retval) { /* { syscallarg(int) who; syscallarg(struct rusage *) rusage; } */ int error; struct rusage ru; struct proc *p = l->l_proc; error = getrusage1(p, SCARG(uap, who), &ru); if (error != 0) return error; return copyout(&ru, SCARG(uap, rusage), sizeof(ru)); } int getrusage1(struct proc *p, int who, struct rusage *ru) { switch (who) { case RUSAGE_SELF: mutex_enter(p->p_lock); ruspace(p); memcpy(ru, &p->p_stats->p_ru, sizeof(*ru)); calcru(p, &ru->ru_utime, &ru->ru_stime, NULL, NULL); rulwps(p, ru); mutex_exit(p->p_lock); break; case RUSAGE_CHILDREN: mutex_enter(p->p_lock); memcpy(ru, &p->p_stats->p_cru, sizeof(*ru)); mutex_exit(p->p_lock); break; default: return EINVAL; } return 0; } void ruspace(struct proc *p) { struct vmspace *vm = p->p_vmspace; struct rusage *ru = &p->p_stats->p_ru; ru->ru_ixrss = vm->vm_tsize << (PAGE_SHIFT - 10); ru->ru_idrss = vm->vm_dsize << (PAGE_SHIFT - 10); ru->ru_isrss = vm->vm_ssize << (PAGE_SHIFT - 10); #ifdef __HAVE_NO_PMAP_STATS /* We don't keep track of the max so we get the current */ ru->ru_maxrss = vm_resident_count(vm) << (PAGE_SHIFT - 10); #else ru->ru_maxrss = vm->vm_rssmax << (PAGE_SHIFT - 10); #endif } void ruadd(struct rusage *ru, struct rusage *ru2) { long *ip, *ip2; int i; timeradd(&ru->ru_utime, &ru2->ru_utime, &ru->ru_utime); timeradd(&ru->ru_stime, &ru2->ru_stime, &ru->ru_stime); if 
(ru->ru_maxrss < ru2->ru_maxrss) ru->ru_maxrss = ru2->ru_maxrss; ip = &ru->ru_first; ip2 = &ru2->ru_first; for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) *ip++ += *ip2++; } void rulwps(proc_t *p, struct rusage *ru) { lwp_t *l; KASSERT(mutex_owned(p->p_lock)); LIST_FOREACH(l, &p->p_lwps, l_sibling) { ruadd(ru, &l->l_ru); } } /* * lim_copy: make a copy of the plimit structure. * * We use copy-on-write after fork, and copy when a limit is changed. */ struct plimit * lim_copy(struct plimit *lim) { struct plimit *newlim; char *corename; size_t alen, len; newlim = kmem_alloc(sizeof(*newlim), KM_SLEEP); mutex_init(&newlim->pl_lock, MUTEX_DEFAULT, IPL_NONE); newlim->pl_writeable = false; newlim->pl_refcnt = 1; newlim->pl_sv_limit = NULL; mutex_enter(&lim->pl_lock); memcpy(newlim->pl_rlimit, lim->pl_rlimit, sizeof(struct rlimit) * RLIM_NLIMITS); /* * Note: the common case is a use of default core name. */ alen = 0; corename = NULL; for (;;) { if (lim->pl_corename == defcorename) { newlim->pl_corename = defcorename; newlim->pl_cnlen = 0; break; } len = lim->pl_cnlen; if (len == alen) { newlim->pl_corename = corename; newlim->pl_cnlen = len; memcpy(corename, lim->pl_corename, len); corename = NULL; break; } mutex_exit(&lim->pl_lock); if (corename) { kmem_free(corename, alen); } alen = len; corename = kmem_alloc(alen, KM_SLEEP); mutex_enter(&lim->pl_lock); } mutex_exit(&lim->pl_lock); if (corename) { kmem_free(corename, alen); } return newlim; } void lim_addref(struct plimit *lim) { atomic_inc_uint(&lim->pl_refcnt); } /* * lim_privatise: give a process its own private plimit structure. */ void lim_privatise(proc_t *p) { struct plimit *lim = p->p_limit, *newlim; if (lim->pl_writeable) { return; } newlim = lim_copy(lim); mutex_enter(p->p_lock); if (p->p_limit->pl_writeable) { /* Other thread won the race. */ mutex_exit(p->p_lock); lim_free(newlim); return; } /* * Since p->p_limit can be accessed without locked held, * old limit structure must not be deleted yet. */ newlim->pl_sv_limit = p->p_limit; newlim->pl_writeable = true; p->p_limit = newlim; mutex_exit(p->p_lock); } void lim_setcorename(proc_t *p, char *name, size_t len) { struct plimit *lim; char *oname; size_t olen; lim_privatise(p); lim = p->p_limit; mutex_enter(&lim->pl_lock); oname = lim->pl_corename; olen = lim->pl_cnlen; lim->pl_corename = name; lim->pl_cnlen = len; mutex_exit(&lim->pl_lock); if (oname != defcorename) { kmem_free(oname, olen); } } void lim_free(struct plimit *lim) { struct plimit *sv_lim; do { membar_release(); if (atomic_dec_uint_nv(&lim->pl_refcnt) > 0) { return; } membar_acquire(); if (lim->pl_corename != defcorename) { kmem_free(lim->pl_corename, lim->pl_cnlen); } sv_lim = lim->pl_sv_limit; mutex_destroy(&lim->pl_lock); kmem_free(lim, sizeof(*lim)); } while ((lim = sv_lim) != NULL); } struct pstats * pstatscopy(struct pstats *ps) { struct pstats *nps; size_t len; nps = kmem_alloc(sizeof(*nps), KM_SLEEP); len = (char *)&nps->pstat_endzero - (char *)&nps->pstat_startzero; memset(&nps->pstat_startzero, 0, len); len = (char *)&nps->pstat_endcopy - (char *)&nps->pstat_startcopy; memcpy(&nps->pstat_startcopy, &ps->pstat_startcopy, len); return nps; } void pstatsfree(struct pstats *ps) { kmem_free(ps, sizeof(*ps)); } /* * sysctl_proc_findproc: a routine for sysctl proc subtree helpers that * need to pick a valid process by PID. * * => Hold a reference on the process, on success. 
*/ static int sysctl_proc_findproc(lwp_t *l, pid_t pid, proc_t **p2) { proc_t *p; int error; if (pid == PROC_CURPROC) { p = l->l_proc; } else { mutex_enter(&proc_lock); p = proc_find(pid); if (p == NULL) { mutex_exit(&proc_lock); return ESRCH; } } error = rw_tryenter(&p->p_reflock, RW_READER) ? 0 : EBUSY; if (pid != PROC_CURPROC) { mutex_exit(&proc_lock); } *p2 = p; return error; } /* * sysctl_proc_paxflags: helper routine to get process's paxctl flags */ static int sysctl_proc_paxflags(SYSCTLFN_ARGS) { struct proc *p; struct sysctlnode node; int paxflags; int error; /* First, validate the request. */ if (namelen != 0 || name[-1] != PROC_PID_PAXFLAGS) return EINVAL; /* Find the process. Hold a reference (p_reflock), if found. */ error = sysctl_proc_findproc(l, (pid_t)name[-2], &p); if (error) return error; /* XXX-elad */ error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL); if (error) { rw_exit(&p->p_reflock); return error; } /* Retrieve the limits. */ node = *rnode; paxflags = p->p_pax; node.sysctl_data = &paxflags; error = sysctl_lookup(SYSCTLFN_CALL(&node)); /* If attempting to write new value, it's an error */ if (error == 0 && newp != NULL) error = EACCES; rw_exit(&p->p_reflock); return error; } /* * sysctl_proc_corename: helper routine to get or set the core file name * for a process specified by PID. */ static int sysctl_proc_corename(SYSCTLFN_ARGS) { struct proc *p; struct plimit *lim; char *cnbuf, *cname; struct sysctlnode node; size_t len; int error; /* First, validate the request. */ if (namelen != 0 || name[-1] != PROC_PID_CORENAME) return EINVAL; /* Find the process. Hold a reference (p_reflock), if found. */ error = sysctl_proc_findproc(l, (pid_t)name[-2], &p); if (error) return error; /* XXX-elad */ error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL); if (error) { rw_exit(&p->p_reflock); return error; } cnbuf = PNBUF_GET(); if (oldp) { /* Get case: copy the core name into the buffer. */ error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CORENAME, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CORENAME_GET), NULL, NULL); if (error) { goto done; } lim = p->p_limit; mutex_enter(&lim->pl_lock); strlcpy(cnbuf, lim->pl_corename, MAXPATHLEN); mutex_exit(&lim->pl_lock); } node = *rnode; node.sysctl_data = cnbuf; error = sysctl_lookup(SYSCTLFN_CALL(&node)); /* Return if error, or if caller is only getting the core name. */ if (error || newp == NULL) { goto done; } /* * Set case. Check permission and then validate new core name. * It must be either "core", "/core", or end in ".core". */ error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CORENAME, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CORENAME_SET), cnbuf, NULL); if (error) { goto done; } len = strlen(cnbuf); if ((len < 4 || strcmp(cnbuf + len - 4, "core") != 0) || (len > 4 && cnbuf[len - 5] != '/' && cnbuf[len - 5] != '.')) { error = EINVAL; goto done; } /* Allocate, copy and set the new core name for plimit structure. */ cname = kmem_alloc(++len, KM_NOSLEEP); if (cname == NULL) { error = ENOMEM; goto done; } memcpy(cname, cnbuf, len); lim_setcorename(p, cname, len); done: rw_exit(&p->p_reflock); PNBUF_PUT(cnbuf); return error; } /* * sysctl_proc_stop: helper routine for checking/setting the stop flags. */ static int sysctl_proc_stop(SYSCTLFN_ARGS) { struct proc *p; int isset, flag, error = 0; struct sysctlnode node; if (namelen != 0) return EINVAL; /* Find the process. 
Hold a reference (p_reflock), if found. */ error = sysctl_proc_findproc(l, (pid_t)name[-2], &p); if (error) return error; /* XXX-elad */ error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL); if (error) { goto out; } /* Determine the flag. */ switch (rnode->sysctl_num) { case PROC_PID_STOPFORK: flag = PS_STOPFORK; break; case PROC_PID_STOPEXEC: flag = PS_STOPEXEC; break; case PROC_PID_STOPEXIT: flag = PS_STOPEXIT; break; default: error = EINVAL; goto out; } isset = (p->p_flag & flag) ? 1 : 0; node = *rnode; node.sysctl_data = &isset; error = sysctl_lookup(SYSCTLFN_CALL(&node)); /* Return if error, or if callers is only getting the flag. */ if (error || newp == NULL) { goto out; } /* Check if caller can set the flags. */ error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_STOPFLAG, p, KAUTH_ARG(flag), NULL, NULL); if (error) { goto out; } mutex_enter(p->p_lock); if (isset) { p->p_sflag |= flag; } else { p->p_sflag &= ~flag; } mutex_exit(p->p_lock); out: rw_exit(&p->p_reflock); return error; } /* * sysctl_proc_plimit: helper routine to get/set rlimits of a process. */ static int sysctl_proc_plimit(SYSCTLFN_ARGS) { struct proc *p; u_int limitno; int which, error = 0; struct rlimit alim; struct sysctlnode node; if (namelen != 0) return EINVAL; which = name[-1]; if (which != PROC_PID_LIMIT_TYPE_SOFT && which != PROC_PID_LIMIT_TYPE_HARD) return EINVAL; limitno = name[-2] - 1; if (limitno >= RLIM_NLIMITS) return EINVAL; if (name[-3] != PROC_PID_LIMIT) return EINVAL; /* Find the process. Hold a reference (p_reflock), if found. */ error = sysctl_proc_findproc(l, (pid_t)name[-4], &p); if (error) return error; /* XXX-elad */ error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL); if (error) goto out; /* Check if caller can retrieve the limits. */ if (newp == NULL) { error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT, p, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_GET), &alim, KAUTH_ARG(which)); if (error) goto out; } /* Retrieve the limits. */ node = *rnode; memcpy(&alim, &p->p_rlimit[limitno], sizeof(alim)); if (which == PROC_PID_LIMIT_TYPE_HARD) { node.sysctl_data = &alim.rlim_max; } else { node.sysctl_data = &alim.rlim_cur; } error = sysctl_lookup(SYSCTLFN_CALL(&node)); /* Return if error, or if we are only retrieving the limits. */ if (error || newp == NULL) { goto out; } error = dosetrlimit(l, p, limitno, &alim); out: rw_exit(&p->p_reflock); return error; } /* * Setup sysctl nodes. 
*/ static void sysctl_proc_setup(void) { sysctl_createv(&proc_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_ANYNUMBER, CTLTYPE_NODE, "curproc", SYSCTL_DESCR("Per-process settings"), NULL, 0, NULL, 0, CTL_PROC, PROC_CURPROC, CTL_EOL); sysctl_createv(&proc_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READONLY, CTLTYPE_INT, "paxflags", SYSCTL_DESCR("Process PAX control flags"), sysctl_proc_paxflags, 0, NULL, 0, CTL_PROC, PROC_CURPROC, PROC_PID_PAXFLAGS, CTL_EOL); sysctl_createv(&proc_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE, CTLTYPE_STRING, "corename", SYSCTL_DESCR("Core file name"), sysctl_proc_corename, 0, NULL, MAXPATHLEN, CTL_PROC, PROC_CURPROC, PROC_PID_CORENAME, CTL_EOL); sysctl_createv(&proc_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "rlimit", SYSCTL_DESCR("Process limits"), NULL, 0, NULL, 0, CTL_PROC, PROC_CURPROC, PROC_PID_LIMIT, CTL_EOL); #define create_proc_plimit(s, n) do { \ sysctl_createv(&proc_sysctllog, 0, NULL, NULL, \ CTLFLAG_PERMANENT, \ CTLTYPE_NODE, s, \ SYSCTL_DESCR("Process " s " limits"), \ NULL, 0, NULL, 0, \ CTL_PROC, PROC_CURPROC, PROC_PID_LIMIT, n, \ CTL_EOL); \ sysctl_createv(&proc_sysctllog, 0, NULL, NULL, \ CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE, \ CTLTYPE_QUAD, "soft", \ SYSCTL_DESCR("Process soft " s " limit"), \ sysctl_proc_plimit, 0, NULL, 0, \ CTL_PROC, PROC_CURPROC, PROC_PID_LIMIT, n, \ PROC_PID_LIMIT_TYPE_SOFT, CTL_EOL); \ sysctl_createv(&proc_sysctllog, 0, NULL, NULL, \ CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE, \ CTLTYPE_QUAD, "hard", \ SYSCTL_DESCR("Process hard " s " limit"), \ sysctl_proc_plimit, 0, NULL, 0, \ CTL_PROC, PROC_CURPROC, PROC_PID_LIMIT, n, \ PROC_PID_LIMIT_TYPE_HARD, CTL_EOL); \ } while (0/*CONSTCOND*/) create_proc_plimit("cputime", PROC_PID_LIMIT_CPU); create_proc_plimit("filesize", PROC_PID_LIMIT_FSIZE); create_proc_plimit("datasize", PROC_PID_LIMIT_DATA); create_proc_plimit("stacksize", PROC_PID_LIMIT_STACK); create_proc_plimit("coredumpsize", PROC_PID_LIMIT_CORE); create_proc_plimit("memoryuse", PROC_PID_LIMIT_RSS); create_proc_plimit("memorylocked", PROC_PID_LIMIT_MEMLOCK); create_proc_plimit("maxproc", PROC_PID_LIMIT_NPROC); create_proc_plimit("descriptors", PROC_PID_LIMIT_NOFILE); create_proc_plimit("sbsize", PROC_PID_LIMIT_SBSIZE); create_proc_plimit("vmemoryuse", PROC_PID_LIMIT_AS); create_proc_plimit("maxlwp", PROC_PID_LIMIT_NTHR); #undef create_proc_plimit sysctl_createv(&proc_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE, CTLTYPE_INT, "stopfork", SYSCTL_DESCR("Stop process at fork(2)"), sysctl_proc_stop, 0, NULL, 0, CTL_PROC, PROC_CURPROC, PROC_PID_STOPFORK, CTL_EOL); sysctl_createv(&proc_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE, CTLTYPE_INT, "stopexec", SYSCTL_DESCR("Stop process at execve(2)"), sysctl_proc_stop, 0, NULL, 0, CTL_PROC, PROC_CURPROC, PROC_PID_STOPEXEC, CTL_EOL); sysctl_createv(&proc_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE, CTLTYPE_INT, "stopexit", SYSCTL_DESCR("Stop process before completing exit"), sysctl_proc_stop, 0, NULL, 0, CTL_PROC, PROC_CURPROC, PROC_PID_STOPEXIT, CTL_EOL); }
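/*
 * Illustrative userland sketch, not part of kern_resource.c: a minimal
 * program that exercises the syscalls implemented above -- getrlimit(2),
 * setrlimit(2), getpriority(2) and setpriority(2).  The program itself and
 * the choice of RLIMIT_NOFILE are invented for the example; error handling
 * is deliberately minimal.
 */
#include <sys/resource.h>

#include <err.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct rlimit rl;
	int prio;

	/* Soft/hard descriptor limits; dosetrlimit() clamps them to maxfiles. */
	if (getrlimit(RLIMIT_NOFILE, &rl) == -1)
		err(EXIT_FAILURE, "getrlimit");
	printf("nofile: cur=%llu max=%llu\n",
	    (unsigned long long)rl.rlim_cur,
	    (unsigned long long)rl.rlim_max);

	/* Raise the soft limit to the hard limit; rlim_cur > rlim_max is EINVAL. */
	rl.rlim_cur = rl.rlim_max;
	if (setrlimit(RLIMIT_NOFILE, &rl) == -1)
		err(EXIT_FAILURE, "setrlimit");

	/* getpriority(2) may legitimately return -1, so check errno explicitly. */
	errno = 0;
	prio = getpriority(PRIO_PROCESS, 0);
	if (prio == -1 && errno != 0)
		err(EXIT_FAILURE, "getpriority");

	/* Raising the nice value needs no privilege; donice() clamps to PRIO_MAX. */
	if (setpriority(PRIO_PROCESS, 0, prio + 1) == -1)
		err(EXIT_FAILURE, "setpriority");

	return EXIT_SUCCESS;
}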
/* $NetBSD: subr_device.c,v 1.13 2022/03/28 12:38:59 riastradh Exp $ */ /* * Copyright (c) 2006, 2021 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_device.c,v 1.13 2022/03/28 12:38:59 riastradh Exp $"); #include <sys/param.h> #include <sys/device.h> #include <sys/device_impl.h> #include <sys/systm.h> #include <sys/device_calls.h> /* Root device. */ device_t root_device; /* * devhandle_t accessors / mutators.
*/ static bool devhandle_is_valid_internal(const devhandle_t * const handlep) { if (handlep->impl == NULL) { return false; } return handlep->impl->type != DEVHANDLE_TYPE_INVALID; } bool devhandle_is_valid(devhandle_t handle) { return devhandle_is_valid_internal(&handle); } devhandle_t devhandle_invalid(void) { static const devhandle_t invalid_devhandle = { .impl = NULL, .uintptr = 0, }; return invalid_devhandle; } devhandle_type_t devhandle_type(devhandle_t handle) { if (!devhandle_is_valid_internal(&handle)) { return DEVHANDLE_TYPE_INVALID; } return handle.impl->type; } int devhandle_compare(devhandle_t handle1, devhandle_t handle2) { devhandle_type_t type1 = devhandle_type(handle1); devhandle_type_t type2 = devhandle_type(handle2); if (type1 == DEVHANDLE_TYPE_INVALID) { return -1; } if (type2 == DEVHANDLE_TYPE_INVALID) { return 1; } if (type1 < type2) { return -1; } if (type1 > type2) { return 1; } /* For private handles, we also compare the impl pointers. */ if (type1 == DEVHANDLE_TYPE_PRIVATE) { intptr_t impl1 = (intptr_t)handle1.impl; intptr_t impl2 = (intptr_t)handle2.impl; if (impl1 < impl2) { return -1; } if (impl1 > impl2) { return 1; } } if (handle1.integer < handle2.integer) { return -1; } if (handle1.integer > handle2.integer) { return 1; } return 0; } device_call_t devhandle_lookup_device_call(devhandle_t handle, const char *name, devhandle_t *call_handlep) { const struct devhandle_impl *impl; device_call_t call; /* * The back-end can override the handle to use for the call, * if needed. */ *call_handlep = handle; for (impl = handle.impl; impl != NULL; impl = impl->super) { if (impl->lookup_device_call != NULL) { call = impl->lookup_device_call(handle, name, call_handlep); if (call != NULL) { return call; } } } return NULL; } void devhandle_impl_inherit(struct devhandle_impl *impl, const struct devhandle_impl *super) { memcpy(impl, super, sizeof(*impl)); impl->super = super; } /* * Accessor functions for the device_t type. 
*/ devclass_t device_class(device_t dev) { return dev->dv_class; } cfdata_t device_cfdata(device_t dev) { return dev->dv_cfdata; } cfdriver_t device_cfdriver(device_t dev) { return dev->dv_cfdriver; } cfattach_t device_cfattach(device_t dev) { return dev->dv_cfattach; } int device_unit(device_t dev) { return dev->dv_unit; } const char * device_xname(device_t dev) { return dev->dv_xname; } device_t device_parent(device_t dev) { return dev->dv_parent; } bool device_activation(device_t dev, devact_level_t level) { int active_flags; active_flags = DVF_ACTIVE; switch (level) { case DEVACT_LEVEL_FULL: active_flags |= DVF_CLASS_SUSPENDED; /*FALLTHROUGH*/ case DEVACT_LEVEL_DRIVER: active_flags |= DVF_DRIVER_SUSPENDED; /*FALLTHROUGH*/ case DEVACT_LEVEL_BUS: active_flags |= DVF_BUS_SUSPENDED; break; } return (dev->dv_flags & active_flags) == DVF_ACTIVE; } bool device_is_active(device_t dev) { int active_flags; active_flags = DVF_ACTIVE; active_flags |= DVF_CLASS_SUSPENDED; active_flags |= DVF_DRIVER_SUSPENDED; active_flags |= DVF_BUS_SUSPENDED; return (dev->dv_flags & active_flags) == DVF_ACTIVE; } bool device_is_enabled(device_t dev) { return (dev->dv_flags & DVF_ACTIVE) == DVF_ACTIVE; } bool device_has_power(device_t dev) { int active_flags; active_flags = DVF_ACTIVE | DVF_BUS_SUSPENDED; return (dev->dv_flags & active_flags) == DVF_ACTIVE; } int device_locator(device_t dev, u_int locnum) { KASSERT(dev->dv_locators != NULL); return dev->dv_locators[locnum]; } void * device_private(device_t dev) { /* * The reason why device_private(NULL) is allowed is to simplify the * work of a lot of userspace request handlers (i.e., c/bdev * handlers) which grab cfdriver_t->cd_units[n]. * It avoids having them test for it to be NULL and only then calling * device_private. */ return dev == NULL ? NULL : dev->dv_private; } void device_set_private(device_t dev, void *private) { KASSERTMSG(dev->dv_private == NULL, "device_set_private(%p, %p):" " device %s already has private set to %p", dev, private, device_xname(dev), device_private(dev)); KASSERT(private != NULL); dev->dv_private = private; } prop_dictionary_t device_properties(device_t dev) { return dev->dv_properties; } /* * device_is_a: * * Returns true if the device is an instance of the specified * driver. */ bool device_is_a(device_t dev, const char *dname) { if (dev == NULL || dev->dv_cfdriver == NULL) { return false; } return strcmp(dev->dv_cfdriver->cd_name, dname) == 0; } /* * device_attached_to_iattr: * * Returns true if the device attached to the specified interface * attribute. 
*/ bool device_attached_to_iattr(device_t dev, const char *iattr) { cfdata_t cfdata = device_cfdata(dev); const struct cfparent *pspec; if (cfdata == NULL || (pspec = cfdata->cf_pspec) == NULL) { return false; } return strcmp(pspec->cfp_iattr, iattr) == 0; } void device_set_handle(device_t dev, devhandle_t handle) { dev->dv_handle = handle; } devhandle_t device_handle(device_t dev) { return dev->dv_handle; } int device_call_generic(device_t dev, const struct device_call_generic *gen) { devhandle_t handle = device_handle(dev); device_call_t call; devhandle_t call_handle; call = devhandle_lookup_device_call(handle, gen->name, &call_handle); if (call == NULL) { return ENOTSUP; } return call(dev, call_handle, gen->args); } int device_enumerate_children(device_t dev, bool (*callback)(device_t, devhandle_t, void *), void *callback_arg) { struct device_enumerate_children_args args = { .callback = callback, .callback_arg = callback_arg, }; return device_call(dev, DEVICE_ENUMERATE_CHILDREN(&args)); }
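/*
 * Illustrative sketch, not part of subr_device.c: how a hypothetical
 * driver's attach routine would typically consume the accessors defined
 * above.  The softc type, the function name and the "pci" strings are
 * invented for the example; device_private() returns the softc that
 * autoconf(9) allocated for this instance, so device_set_private() is not
 * needed in this common case.  The CFATTACH_DECL_NEW() glue and the match
 * routine are omitted for brevity.
 */
#include <sys/param.h>
#include <sys/device.h>

struct mydev_softc {
	device_t	sc_dev;		/* back-pointer to our device_t */
};

static void
mydev_attach(device_t parent, device_t self, void *aux)
{
	struct mydev_softc *sc = device_private(self);

	/* Remember our own device_t for later use (e.g. in interrupt handlers). */
	sc->sc_dev = self;

	aprint_normal(": example device, unit %d, parent %s\n",
	    device_unit(self), device_xname(parent));

	/* Behaviour may key off the parent driver name or interface attribute. */
	if (device_is_a(parent, "pci"))
		aprint_verbose_dev(self, "attached below a pci bus\n");
	if (device_attached_to_iattr(self, "pci"))
		aprint_verbose_dev(self, "attached via the \"pci\" interface attribute\n");
}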
/* $NetBSD: tty.c,v 1.312 2023/12/07 09:00:32 pgoyette Exp $ */ /*- * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tty.c 8.13 (Berkeley) 1/9/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tty.c,v 1.312 2023/12/07 09:00:32 pgoyette Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #endif #define TTY_ALLOW_PRIVATE #include <sys/param.h> #include <sys/systm.h> #include <sys/ioctl.h> #include <sys/proc.h> #define TTYDEFCHARS #include <sys/tty.h> #undef TTYDEFCHARS #include <sys/file.h> #include <sys/conf.h> #include <sys/cpu.h> #include <sys/dkstat.h> #include <sys/uio.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/syslog.h> #include <sys/kmem.h> #include <sys/signalvar.h> #include <sys/resourcevar.h> #include <sys/poll.h> #include <sys/kprintf.h> #include <sys/namei.h> #include <sys/sysctl.h> #include <sys/kauth.h> #include <sys/intr.h> #include <sys/ioctl_compat.h> #include <sys/module.h> #include <sys/bitops.h> #include <sys/compat_stub.h> #include <sys/atomic.h> #include <sys/condvar.h> #include <sys/pserialize.h> static int ttnread(struct tty *); static void ttyblock(struct tty *); static void ttyecho(int, struct tty *); static void ttyrubo(struct tty *, int); static void ttyprintf_nolock(struct tty *, const char *fmt, ...) __printflike(2, 3); static int proc_compare_wrapper(struct proc *, struct proc *); static void ttysigintr(void *); /* Symbolic sleep message strings. */ const char ttclos[] = "ttycls"; const char ttopen[] = "ttyopn"; const char ttybg[] = "ttybg"; const char ttyin[] = "ttyin"; const char ttyout[] = "ttyout"; /* * Used to determine whether we still have a connection. This is true in * one of 3 cases: * 1) We have carrier. * 2) It's a locally attached terminal, and we are therefore ignoring carrier. * 3) We're using a flow control mechanism that overloads the carrier signal. */ #define CONNECTED(tp) (ISSET(tp->t_state, TS_CARR_ON) || \ ISSET(tp->t_cflag, CLOCAL | MDMBUF)) /* * Table with character classes and parity. The 8th bit indicates parity, * the 7th bit indicates the character is an alphameric or underscore (for * ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits * are 0 then the character needs no special processing on output; classes * other than 0 might be translated or (not currently) require delays. */ #define E 0x00 /* Even parity. */ #define O 0x80 /* Odd parity. */ #define PARITY(c) (char_type[c] & O) #define ALPHA 0x40 /* Alpha or underscore. 
*/ #define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA) #define CCLASSMASK 0x3f #define CCLASS(c) (char_type[c] & CCLASSMASK) #define BS BACKSPACE #define CC CONTROL #define CR RETURN #define NA ORDINARY | ALPHA #define NL NEWLINE #define NO ORDINARY #define TB TAB #define VT VTAB unsigned char const char_type[] = { E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */ O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */ O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */ E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */ O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */ E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */ O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */ O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */ O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */ E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */ O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */ O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */ E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */ /* * Meta chars; should be settable per character set; * for now, treat them all as normal characters. */ NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, }; #undef BS #undef CC #undef CR #undef NA #undef NL #undef NO #undef TB #undef VT static struct ttylist_head tty_sigqueue = TAILQ_HEAD_INITIALIZER(tty_sigqueue); static void *tty_sigsih; struct ttylist_head ttylist = TAILQ_HEAD_INITIALIZER(ttylist); int tty_count; kmutex_t tty_lock; kmutex_t constty_lock; static struct pserialize *constty_psz; static kcondvar_t ttyref_cv; struct ptm_pty *ptm = NULL; uint64_t tk_cancc; uint64_t tk_nin; uint64_t tk_nout; uint64_t tk_rawcc; static kauth_listener_t tty_listener; #define TTY_MINQSIZE 0x00400 #define TTY_MAXQSIZE 0x10000 int tty_qsize = TTY_MINQSIZE; static int tty_get_qsize(int *qsize, int newsize) { if (newsize <= 0) return EINVAL; newsize = 1 << ilog2(newsize); /* Make it a power of two */ if (newsize < TTY_MINQSIZE || newsize > TTY_MAXQSIZE) return EINVAL; *qsize = newsize; return 0; } static int tty_set_qsize(struct tty *tp, int newsize) { struct clist rawq, canq, outq; struct clist orawq, ocanq, ooutq; clalloc(&rawq, newsize, 1); clalloc(&canq, newsize, 1); clalloc(&outq, newsize, 0); mutex_spin_enter(&tty_lock); if (tp->t_outq.c_cc != 0) { mutex_spin_exit(&tty_lock); clfree(&rawq); clfree(&canq); clfree(&outq); return EBUSY; } orawq = tp->t_rawq; ocanq = tp->t_canq; ooutq = tp->t_outq; tp->t_qsize = newsize; tp->t_rawq = rawq; tp->t_canq = canq; tp->t_outq = outq; ttsetwater(tp); mutex_spin_exit(&tty_lock); clfree(&orawq); clfree(&ocanq); clfree(&ooutq); return 0; } static int sysctl_kern_tty_qsize(SYSCTLFN_ARGS) { int newsize; int error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = tty_qsize; error = 
sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return error; return tty_get_qsize(&tty_qsize, newsize); } static void sysctl_kern_tty_setup(void) { const struct sysctlnode *rnode, *cnode; sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "tkstat", SYSCTL_DESCR("Number of characters sent and received " "on ttys"), NULL, 0, NULL, 0, CTL_KERN, KERN_TKSTAT, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "nin", SYSCTL_DESCR("Total number of tty input characters"), NULL, 0, &tk_nin, 0, CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_NIN, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "nout", SYSCTL_DESCR("Total number of tty output characters"), NULL, 0, &tk_nout, 0, CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_NOUT, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "cancc", SYSCTL_DESCR("Number of canonical tty input characters"), NULL, 0, &tk_cancc, 0, CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_CANCC, CTL_EOL); sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_QUAD, "rawcc", SYSCTL_DESCR("Number of raw tty input characters"), NULL, 0, &tk_rawcc, 0, CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_RAWCC, CTL_EOL); sysctl_createv(NULL, 0, NULL, &rnode, CTLFLAG_PERMANENT, CTLTYPE_NODE, "tty", NULL, NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); sysctl_createv(NULL, 0, &rnode, &cnode, CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "qsize", SYSCTL_DESCR("TTY input and output queue size"), sysctl_kern_tty_qsize, 0, &tty_qsize, 0, CTL_CREATE, CTL_EOL); } /* * ttylock(tp), ttyunlock(tp), ttylocked(tp) * * Exclusive lock on tty. Currently a single global lock. * * ttylocked is for positive DIAGNOSTIC assertions only. */ void ttylock(struct tty *tp) { mutex_spin_enter(&tty_lock); } void ttyunlock(struct tty *tp) { mutex_spin_exit(&tty_lock); } bool ttylocked(struct tty *tp) { return mutex_owned(&tty_lock); } int ttyopen(struct tty *tp, int dialout, int nonblock) { int error; error = 0; mutex_spin_enter(&tty_lock); if (dialout) { /* * If the device is already open for non-dialout, fail. * Otherwise, set TS_DIALOUT to block any pending non-dialout * opens. */ if (ISSET(tp->t_state, TS_ISOPEN) && !ISSET(tp->t_state, TS_DIALOUT)) { error = EBUSY; goto out; } SET(tp->t_state, TS_DIALOUT); } else { if (!nonblock) { /* * Wait for carrier. Also wait for any dialout * processes to close the tty first. */ while (ISSET(tp->t_state, TS_DIALOUT) || !CONNECTED(tp)) { tp->t_wopen++; error = ttysleep(tp, &tp->t_rawcv, true, 0); tp->t_wopen--; if (error) goto out; } } else { /* * Don't allow a non-blocking non-dialout open if the * device is already open for dialout. */ if (ISSET(tp->t_state, TS_DIALOUT)) { error = EBUSY; goto out; } } } out: mutex_spin_exit(&tty_lock); return (error); } /* * Initial open of tty, or (re)entry to standard tty line discipline. */ int ttylopen(dev_t device, struct tty *tp) { mutex_spin_enter(&tty_lock); tp->t_dev = device; if (!ISSET(tp->t_state, TS_ISOPEN)) { SET(tp->t_state, TS_ISOPEN); memset(&tp->t_winsize, 0, sizeof(tp->t_winsize)); tp->t_flags = 0; } mutex_spin_exit(&tty_lock); if (tp->t_qsize != tty_qsize) tty_set_qsize(tp, tty_qsize); return (0); } /* * Interrupt any pending I/O and make it fail. Used before close to * interrupt pending open/read/write/&c. and make it fail promptly. 
*/ void ttycancel(struct tty *tp) { mutex_spin_enter(&tty_lock); tp->t_state |= TS_CANCEL; cv_broadcast(&tp->t_outcv); cv_broadcast(&tp->t_rawcv); mutex_spin_exit(&tty_lock); } /* * Handle close() on a tty line: flush and set to initial state, * bumping generation number so that pending read/write calls * can detect recycling of the tty. */ int ttyclose(struct tty *tp) { struct session *sess; /* * Make sure this is not the constty. Without constty_lock it * is always allowed to transition from nonnull to null. */ (void)atomic_cas_ptr(&constty, tp, NULL); /* * We don't know if this has _ever_ been the constty: another * thread may have kicked it out as constty before we started * to close. * * So we wait for all users that might be acquiring references * to finish doing so -- after that, no more references can be * made, at which point we can safely flush the tty, wait for * the existing references to drain, and finally free or reuse * the tty. */ pserialize_perform(constty_psz); mutex_spin_enter(&tty_lock); ttyflush(tp, FREAD | FWRITE); tp->t_gen++; tp->t_pgrp = NULL; tp->t_state = 0; sess = tp->t_session; tp->t_session = NULL; while (tp->t_refcnt) cv_wait(&ttyref_cv, &tty_lock); mutex_spin_exit(&tty_lock); if (sess != NULL) { mutex_enter(&proc_lock); /* Releases proc_lock. */ proc_sessrele(sess); } return (0); } #define FLUSHQ(q) { \ if ((q)->c_cc) \ ndflush(q, (q)->c_cc); \ } /* * tty_acquire(tp), tty_release(tp) * * Acquire a reference to tp that prevents it from being closed * until released. Caller must guarantee tp has not yet been * closed, e.g. by obtaining tp from constty during a pserialize * read section. Caller must not hold tty_lock. */ void tty_acquire(struct tty *tp) { unsigned refcnt __diagused; refcnt = atomic_inc_uint_nv(&tp->t_refcnt); KASSERT(refcnt < UINT_MAX); } void tty_release(struct tty *tp) { unsigned old, new; KDASSERT(mutex_ownable(&tty_lock)); do { old = atomic_load_relaxed(&tp->t_refcnt); if (old == 1) { mutex_spin_enter(&tty_lock); if (atomic_dec_uint_nv(&tp->t_refcnt) == 0) cv_broadcast(&ttyref_cv); mutex_spin_exit(&tty_lock); return; } KASSERT(old != 0); new = old - 1; } while (atomic_cas_uint(&tp->t_refcnt, old, new) != old); } /* * This macro is used in canonical mode input processing, where a read * request shall not return unless a 'line delimiter' ('\n') or 'break' * (EOF, EOL, EOL2) character (or a signal) has been received. As EOL2 * is an extension to the POSIX.1 defined set of special characters, * recognize it only if IEXTEN is set in the set of local flags. */ #define TTBREAKC(c, lflg) \ ((c) == '\n' || (((c) == cc[VEOF] || (c) == cc[VEOL] || \ ((c) == cc[VEOL2] && ISSET(lflg, IEXTEN))) && (c) != _POSIX_VDISABLE)) /* * ttyinput() helper. * Call with the tty lock held. */ /* XXX static */ int ttyinput_wlock(int c, struct tty *tp) { int iflag, lflag, i, error; u_char *cc; KASSERT(mutex_owned(&tty_lock)); /* * If input is pending take it first. */ lflag = tp->t_lflag; if (ISSET(lflag, PENDIN)) ttypend(tp); /* * Gather stats. */ if (ISSET(lflag, ICANON)) { ++tk_cancc; ++tp->t_cancc; } else { ++tk_rawcc; ++tp->t_rawcc; } ++tk_nin; cc = tp->t_cc; /* * Handle exceptional conditions (break, parity, framing). */ iflag = tp->t_iflag; if ((error = (ISSET(c, TTY_ERRORMASK))) != 0) { CLR(c, TTY_ERRORMASK); if (ISSET(error, TTY_FE) && c == 0) { /* Break. 
*/ if (ISSET(iflag, IGNBRK)) return (0); else if (ISSET(iflag, BRKINT)) { ttyflush(tp, FREAD | FWRITE); ttysig(tp, TTYSIG_PG1, SIGINT); return (0); } else if (ISSET(iflag, PARMRK)) goto parmrk; } else if ((ISSET(error, TTY_PE) && ISSET(iflag, INPCK)) || ISSET(error, TTY_FE)) { if (ISSET(iflag, IGNPAR)) return (0); else if (ISSET(iflag, PARMRK)) { parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); (void)putc(0 | TTY_QUOTE, &tp->t_rawq); (void)putc(c | TTY_QUOTE, &tp->t_rawq); return (0); } else c = 0; } } else if (c == 0377 && ISSET(iflag, ISTRIP|IGNPAR|INPCK|PARMRK) == (INPCK|PARMRK)) { /* "Escape" a valid character of '\377'. */ (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); goto endcase; } /* * In tandem mode, check high water mark. */ if (ISSET(iflag, IXOFF) || ISSET(tp->t_cflag, CHWFLOW)) ttyblock(tp); if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP)) CLR(c, 0x80); if (!ISSET(lflag, EXTPROC)) { /* * Check for literal nexting very first */ if (ISSET(tp->t_state, TS_LNCH)) { SET(c, TTY_QUOTE); CLR(tp->t_state, TS_LNCH); } /* * Scan for special characters. This code * is really just a big case statement with * non-constant cases. The bottom of the * case statement is labeled ``endcase'', so goto * it after a case match, or similar. */ /* * Control chars which aren't controlled * by ICANON, ISIG, or IXON. */ if (ISSET(lflag, IEXTEN)) { if (CCEQ(cc[VLNEXT], c)) { if (ISSET(lflag, ECHO)) { if (ISSET(lflag, ECHOE)) { (void)ttyoutput('^', tp); (void)ttyoutput('\b', tp); } else ttyecho(c, tp); } SET(tp->t_state, TS_LNCH); goto endcase; } if (CCEQ(cc[VDISCARD], c)) { if (ISSET(lflag, FLUSHO)) CLR(tp->t_lflag, FLUSHO); else { ttyflush(tp, FWRITE); ttyecho(c, tp); if (tp->t_rawq.c_cc + tp->t_canq.c_cc) ttyretype(tp); SET(tp->t_lflag, FLUSHO); } goto startoutput; } } /* * Signals. */ if (ISSET(lflag, ISIG)) { if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) { if (!ISSET(lflag, NOFLSH)) ttyflush(tp, FREAD | FWRITE); ttyecho(c, tp); ttysig(tp, TTYSIG_PG1, CCEQ(cc[VINTR], c) ? SIGINT : SIGQUIT); goto endcase; } if (CCEQ(cc[VSUSP], c)) { if (!ISSET(lflag, NOFLSH)) ttyflush(tp, FREAD); ttyecho(c, tp); ttysig(tp, TTYSIG_PG1, SIGTSTP); goto endcase; } } /* * Handle start/stop characters. */ if (ISSET(iflag, IXON)) { if (CCEQ(cc[VSTOP], c)) { if (!ISSET(tp->t_state, TS_TTSTOP)) { SET(tp->t_state, TS_TTSTOP); cdev_stop(tp, 0); return (0); } if (!CCEQ(cc[VSTART], c)) return (0); /* * if VSTART == VSTOP then toggle */ goto endcase; } if (CCEQ(cc[VSTART], c)) goto restartoutput; } /* * IGNCR, ICRNL, & INLCR */ if (c == '\r') { if (ISSET(iflag, IGNCR)) goto endcase; else if (ISSET(iflag, ICRNL)) c = '\n'; } else if (c == '\n' && ISSET(iflag, INLCR)) c = '\r'; } if (!ISSET(lflag, EXTPROC) && ISSET(lflag, ICANON)) { /* * From here on down canonical mode character * processing takes place. */ /* * erase (^H / ^?) */ if (CCEQ(cc[VERASE], c)) { if (tp->t_rawq.c_cc) ttyrub(unputc(&tp->t_rawq), tp); goto endcase; } /* * kill (^U) */ if (CCEQ(cc[VKILL], c)) { if (ISSET(lflag, ECHOKE) && tp->t_rawq.c_cc == tp->t_rocount && !ISSET(lflag, ECHOPRT)) while (tp->t_rawq.c_cc) ttyrub(unputc(&tp->t_rawq), tp); else { ttyecho(c, tp); if (ISSET(lflag, ECHOK) || ISSET(lflag, ECHOKE)) ttyecho('\n', tp); FLUSHQ(&tp->t_rawq); tp->t_rocount = 0; } CLR(tp->t_state, TS_LOCAL); goto endcase; } /* * Extensions to the POSIX.1 GTI set of functions. 
*/ if (ISSET(lflag, IEXTEN)) { /* * word erase (^W) */ if (CCEQ(cc[VWERASE], c)) { int alt = ISSET(lflag, ALTWERASE); int ctype; /* * erase whitespace */ while ((c = unputc(&tp->t_rawq)) == ' ' || c == '\t') ttyrub(c, tp); if (c == -1) goto endcase; /* * erase last char of word and remember the * next chars type (for ALTWERASE) */ ttyrub(c, tp); c = unputc(&tp->t_rawq); if (c == -1) goto endcase; if (c == ' ' || c == '\t') { (void)putc(c, &tp->t_rawq); goto endcase; } ctype = ISALPHA(c); /* * erase rest of word */ do { ttyrub(c, tp); c = unputc(&tp->t_rawq); if (c == -1) goto endcase; } while (c != ' ' && c != '\t' && (alt == 0 || ISALPHA(c) == ctype)); (void)putc(c, &tp->t_rawq); goto endcase; } /* * reprint line (^R) */ if (CCEQ(cc[VREPRINT], c)) { ttyretype(tp); goto endcase; } /* * ^T - kernel info and generate SIGINFO */ if (CCEQ(cc[VSTATUS], c)) { ttysig(tp, TTYSIG_PG1, SIGINFO); goto endcase; } } } /* * Check for input buffer overflow */ if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= TTYHOG) { if (ISSET(iflag, IMAXBEL)) { if (tp->t_outq.c_cc < tp->t_hiwat) (void)ttyoutput(CTRL('g'), tp); } else ttyflush(tp, FREAD | FWRITE); goto endcase; } /* * Put data char in q for user and * wakeup on seeing a line delimiter. */ if (putc(c, &tp->t_rawq) >= 0) { if (!ISSET(lflag, ICANON)) { ttwakeup(tp); ttyecho(c, tp); goto endcase; } if (TTBREAKC(c, lflag)) { tp->t_rocount = 0; catq(&tp->t_rawq, &tp->t_canq); ttwakeup(tp); } else if (tp->t_rocount++ == 0) tp->t_rocol = tp->t_column; if (ISSET(tp->t_state, TS_ERASE)) { /* * end of prterase \.../ */ CLR(tp->t_state, TS_ERASE); (void)ttyoutput('/', tp); } i = tp->t_column; ttyecho(c, tp); if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) { /* * Place the cursor over the '^' of the ^D. */ i = uimin(2, tp->t_column - i); while (i > 0) { (void)ttyoutput('\b', tp); i--; } } } endcase: /* * IXANY means allow any character to restart output. */ if (ISSET(tp->t_state, TS_TTSTOP) && !ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP]) { return (0); } restartoutput: CLR(tp->t_lflag, FLUSHO); CLR(tp->t_state, TS_TTSTOP); startoutput: return (ttstart(tp)); } /* * Process input of a single character received on a tty. * * XXX - this is a hack, all drivers must changed to acquire the * lock before calling linesw->l_rint() */ int ttyinput(int c, struct tty *tp) { int error; /* * Unless the receiver is enabled, drop incoming data. */ if (!ISSET(tp->t_cflag, CREAD)) return (0); mutex_spin_enter(&tty_lock); error = ttyinput_wlock(c, tp); mutex_spin_exit(&tty_lock); return (error); } /* * Output a single character on a tty, doing output processing * as needed (expanding tabs, newline processing, etc.). * Returns < 0 if succeeds, otherwise returns char to resend. * Must be recursive. * * Call with tty lock held. */ int ttyoutput(int c, struct tty *tp) { long oflag; int col, notout; KASSERT(mutex_owned(&tty_lock)); oflag = tp->t_oflag; if (!ISSET(oflag, OPOST)) { tk_nout++; tp->t_outcc++; if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq)) return (c); return (-1); } /* * Do tab expansion if OXTABS is set. Special case if we do external * processing, we don't do the tab expansion because we'll probably * get it wrong. If tab expansion needs to be done, let it happen * externally. */ CLR(c, ~TTY_CHARMASK); if (c == '\t' && ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) { c = 8 - (tp->t_column & 7); if (ISSET(tp->t_lflag, FLUSHO)) { notout = 0; } else { notout = b_to_q(" ", c, &tp->t_outq); c -= notout; tk_nout += c; tp->t_outcc += c; } tp->t_column += c; return (notout ? 
'\t' : -1); } if (c == CEOT && ISSET(oflag, ONOEOT)) return (-1); /* * Newline translation: if ONLCR is set, * translate newline into "\r\n". */ if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) { tk_nout++; tp->t_outcc++; if (!ISSET(tp->t_lflag, FLUSHO) && putc('\r', &tp->t_outq)) return (c); } /* If OCRNL is set, translate "\r" into "\n". */ else if (c == '\r' && ISSET(tp->t_oflag, OCRNL)) c = '\n'; /* If ONOCR is set, don't transmit CRs when on column 0. */ else if (c == '\r' && ISSET(tp->t_oflag, ONOCR) && tp->t_column == 0) return (-1); tk_nout++; tp->t_outcc++; if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq)) return (c); col = tp->t_column; switch (CCLASS(c)) { case BACKSPACE: if (col > 0) --col; break; case CONTROL: break; case NEWLINE: if (ISSET(tp->t_oflag, ONLCR | ONLRET)) col = 0; break; case RETURN: col = 0; break; case ORDINARY: ++col; break; case TAB: col = (col + 8) & ~7; break; } tp->t_column = col; return (-1); } /* * Ioctls for all tty devices. Called after line-discipline specific ioctl * has been called to do discipline-specific functions and/or reject any * of these ioctl commands. */ /* ARGSUSED */ int ttioctl(struct tty *tp, u_long cmd, void *data, int flag, struct lwp *l) { struct proc *p; struct linesw *lp; int s, error; struct pathbuf *pb; struct nameidata nd; char infobuf[200]; KASSERT(l != NULL); p = l->l_proc; /* If the ioctl involves modification, hang if in the background. */ switch (cmd) { case TIOCFLUSH: case TIOCDRAIN: case TIOCSBRK: case TIOCCBRK: case TIOCSTART: case TIOCSETA: case TIOCSETD: case TIOCSLINED: case TIOCSETAF: case TIOCSETAW: #ifdef notdef case TIOCSPGRP: case FIOSETOWN: #endif case TIOCSTAT: case TIOCSTI: case TIOCSWINSZ: case TIOCSQSIZE: case TIOCLBIC: case TIOCLBIS: case TIOCLSET: case TIOCSETC: case OTIOCSETD: case TIOCSETN: case TIOCSETP: case TIOCSLTC: mutex_spin_enter(&tty_lock); while (isbackground(curproc, tp) && p->p_pgrp->pg_jobc && (p->p_lflag & PL_PPWAIT) == 0 && !sigismasked(l, SIGTTOU)) { mutex_spin_exit(&tty_lock); mutex_enter(&proc_lock); pgsignal(p->p_pgrp, SIGTTOU, 1); mutex_exit(&proc_lock); mutex_spin_enter(&tty_lock); error = ttypause(tp, hz); if (error) { mutex_spin_exit(&tty_lock); return (error); } } mutex_spin_exit(&tty_lock); break; } switch (cmd) { /* Process the ioctl. */ case FIOASYNC: /* set/clear async i/o */ mutex_spin_enter(&tty_lock); if (*(int *)data) SET(tp->t_state, TS_ASYNC); else CLR(tp->t_state, TS_ASYNC); mutex_spin_exit(&tty_lock); break; case FIONBIO: /* set/clear non-blocking i/o */ break; /* XXX: delete. 
*/ case FIONREAD: /* get # bytes to read */ mutex_spin_enter(&tty_lock); *(int *)data = ttnread(tp); mutex_spin_exit(&tty_lock); break; case FIONWRITE: /* get # bytes to written & unsent */ mutex_spin_enter(&tty_lock); *(int *)data = tp->t_outq.c_cc; mutex_spin_exit(&tty_lock); break; case FIONSPACE: /* get # bytes to written & unsent */ mutex_spin_enter(&tty_lock); *(int *)data = tp->t_outq.c_cn - tp->t_outq.c_cc; mutex_spin_exit(&tty_lock); break; case TIOCEXCL: /* set exclusive use of tty */ mutex_spin_enter(&tty_lock); SET(tp->t_state, TS_XCLUDE); mutex_spin_exit(&tty_lock); break; case TIOCFLUSH: { /* flush buffers */ int flags = *(int *)data; if (flags == 0) flags = FREAD | FWRITE; else flags &= FREAD | FWRITE; mutex_spin_enter(&tty_lock); ttyflush(tp, flags); mutex_spin_exit(&tty_lock); break; } case TIOCCONS: { /* become virtual console */ struct tty *ctp; mutex_enter(&constty_lock); error = 0; ctp = atomic_load_relaxed(&constty); if (*(int *)data) { if (ctp != NULL && ctp != tp && ISSET(ctp->t_state, TS_CARR_ON | TS_ISOPEN) == (TS_CARR_ON | TS_ISOPEN)) { error = EBUSY; goto unlock_constty; } pb = pathbuf_create("/dev/console"); if (pb == NULL) { error = ENOMEM; goto unlock_constty; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pb); if ((error = namei(&nd)) != 0) { pathbuf_destroy(pb); goto unlock_constty; } error = VOP_ACCESS(nd.ni_vp, VREAD, l->l_cred); vput(nd.ni_vp); pathbuf_destroy(pb); if (error) goto unlock_constty; KASSERT(atomic_load_relaxed(&constty) == ctp || atomic_load_relaxed(&constty) == NULL); atomic_store_release(&constty, tp); } else if (tp == ctp) { atomic_store_relaxed(&constty, NULL); } unlock_constty: mutex_exit(&constty_lock); if (error) return error; break; } case TIOCDRAIN: /* wait till output drained */ if ((error = ttywait(tp)) != 0) return (error); break; case TIOCGETA: { /* get termios struct */ struct termios *t = (struct termios *)data; memcpy(t, &tp->t_termios, sizeof(struct termios)); break; } case TIOCGETD: /* get line discipline (old) */ *(int *)data = tp->t_linesw->l_no; break; case TIOCGLINED: /* get line discipline (new) */ (void)strncpy((char *)data, tp->t_linesw->l_name, TTLINEDNAMELEN - 1); break; case TIOCGWINSZ: /* get window size */ *(struct winsize *)data = tp->t_winsize; break; case TIOCGQSIZE: *(int *)data = tp->t_qsize; break; case FIOGETOWN: mutex_enter(&proc_lock); if (tp->t_session != NULL && !isctty(p, tp)) { mutex_exit(&proc_lock); return (ENOTTY); } *(int *)data = tp->t_pgrp ? -tp->t_pgrp->pg_id : 0; mutex_exit(&proc_lock); break; case TIOCGPGRP: /* get pgrp of tty */ mutex_enter(&proc_lock); if (!isctty(p, tp)) { mutex_exit(&proc_lock); return (ENOTTY); } *(int *)data = tp->t_pgrp ? 
tp->t_pgrp->pg_id : NO_PGID; mutex_exit(&proc_lock); break; case TIOCGSID: /* get sid of tty */ mutex_enter(&proc_lock); if (!isctty(p, tp)) { mutex_exit(&proc_lock); return (ENOTTY); } *(int *)data = tp->t_session->s_sid; mutex_exit(&proc_lock); break; #ifdef TIOCHPCL case TIOCHPCL: /* hang up on last close */ mutex_spin_enter(&tty_lock); SET(tp->t_cflag, HUPCL); mutex_spin_exit(&tty_lock); break; #endif case TIOCNXCL: /* reset exclusive use of tty */ mutex_spin_enter(&tty_lock); CLR(tp->t_state, TS_XCLUDE); mutex_spin_exit(&tty_lock); break; case TIOCOUTQ: /* output queue size */ *(int *)data = tp->t_outq.c_cc; break; case TIOCSETA: /* set termios struct */ case TIOCSETAW: /* drain output, set */ case TIOCSETAF: { /* drn out, fls in, set */ struct termios *t = (struct termios *)data; if (cmd == TIOCSETAW || cmd == TIOCSETAF) { if ((error = ttywait(tp)) != 0) return (error); if (cmd == TIOCSETAF) { mutex_spin_enter(&tty_lock); ttyflush(tp, FREAD); mutex_spin_exit(&tty_lock); } } s = spltty(); /* * XXXSMP - some drivers call back on us from t_param(), so * don't take the tty spin lock here. * require t_param() to unlock upon callback? */ /* wanted here: mutex_spin_enter(&tty_lock); */ if (!ISSET(t->c_cflag, CIGNORE)) { /* * Set device hardware. */ if (tp->t_param && (error = (*tp->t_param)(tp, t))) { /* wanted here: mutex_spin_exit(&tty_lock); */ splx(s); return (error); } else { tp->t_cflag = t->c_cflag; tp->t_ispeed = t->c_ispeed; tp->t_ospeed = t->c_ospeed; if (t->c_ospeed == 0) ttysig(tp, TTYSIG_LEADER, SIGHUP); } ttsetwater(tp); } /* delayed lock acquiring */ mutex_spin_enter(&tty_lock); if (cmd != TIOCSETAF) { if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON)) { if (ISSET(t->c_lflag, ICANON)) { SET(tp->t_lflag, PENDIN); ttwakeup(tp); } else { struct clist tq; catq(&tp->t_rawq, &tp->t_canq); tq = tp->t_rawq; tp->t_rawq = tp->t_canq; tp->t_canq = tq; CLR(tp->t_lflag, PENDIN); } } } tp->t_iflag = t->c_iflag; tp->t_oflag = t->c_oflag; /* * Make the EXTPROC bit read only. */ if (ISSET(tp->t_lflag, EXTPROC)) SET(t->c_lflag, EXTPROC); else CLR(t->c_lflag, EXTPROC); tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN); memcpy(tp->t_cc, t->c_cc, sizeof(t->c_cc)); mutex_spin_exit(&tty_lock); splx(s); break; } case TIOCSETD: /* set line discipline (old) */ lp = ttyldisc_lookup_bynum(*(int *)data); goto setldisc; case TIOCSLINED: { /* set line discipline (new) */ char *name = (char *)data; dev_t device; /* Null terminate to prevent buffer overflow */ name[TTLINEDNAMELEN - 1] = '\0'; lp = ttyldisc_lookup(name); setldisc: if (lp == NULL) return (ENXIO); if (lp != tp->t_linesw) { device = tp->t_dev; s = spltty(); (*tp->t_linesw->l_close)(tp, flag); error = (*lp->l_open)(device, tp); if (error) { (void)(*tp->t_linesw->l_open)(device, tp); splx(s); ttyldisc_release(lp); return (error); } ttyldisc_release(tp->t_linesw); tp->t_linesw = lp; splx(s); } else { /* Drop extra reference. 
*/ ttyldisc_release(lp); } break; } case TIOCSTART: /* start output, like ^Q */ mutex_spin_enter(&tty_lock); if (ISSET(tp->t_state, TS_TTSTOP) || ISSET(tp->t_lflag, FLUSHO)) { CLR(tp->t_lflag, FLUSHO); CLR(tp->t_state, TS_TTSTOP); ttstart(tp); } mutex_spin_exit(&tty_lock); break; case TIOCSTI: /* simulate terminal input */ if ((error = kauth_authorize_device_tty(l->l_cred, KAUTH_DEVICE_TTY_STI, tp)) != 0) { if (!ISSET(flag, FREAD)) return EPERM; if (!isctty(p, tp)) return EACCES; if (tp->t_session->s_leader->p_cred != p->p_cred) return error; } (*tp->t_linesw->l_rint)(*(u_char *)data, tp); break; case TIOCSTOP: /* stop output, like ^S */ { mutex_spin_enter(&tty_lock); if (!ISSET(tp->t_state, TS_TTSTOP)) { SET(tp->t_state, TS_TTSTOP); cdev_stop(tp, 0); } mutex_spin_exit(&tty_lock); break; } case TIOCSCTTY: /* become controlling tty */ mutex_enter(&proc_lock); mutex_spin_enter(&tty_lock); /* Session ctty vnode pointer set in vnode layer. */ if (!SESS_LEADER(p) || ((p->p_session->s_ttyvp || tp->t_session) && (tp->t_session != p->p_session))) { mutex_spin_exit(&tty_lock); mutex_exit(&proc_lock); return (EPERM); } /* * `p_session' acquires a reference. * But note that if `t_session' is set at this point, * it must equal `p_session', in which case the session * already has the correct reference count. */ if (tp->t_session == NULL) { proc_sesshold(p->p_session); } tp->t_session = p->p_session; tp->t_pgrp = p->p_pgrp; p->p_session->s_ttyp = tp; p->p_lflag |= PL_CONTROLT; mutex_spin_exit(&tty_lock); mutex_exit(&proc_lock); break; case FIOSETOWN: { /* set pgrp of tty */ pid_t pgid = *(pid_t *)data; struct pgrp *pgrp; mutex_enter(&proc_lock); if (tp->t_session != NULL && !isctty(p, tp)) { mutex_exit(&proc_lock); return (ENOTTY); } if (pgid < 0) { if (pgid == INT_MIN) { mutex_exit(&proc_lock); return (EINVAL); } pgrp = pgrp_find(-pgid); if (pgrp == NULL) { mutex_exit(&proc_lock); return (EINVAL); } } else { struct proc *p1; p1 = proc_find(pgid); if (!p1) { mutex_exit(&proc_lock); return (ESRCH); } pgrp = p1->p_pgrp; } if (pgrp->pg_session != p->p_session) { mutex_exit(&proc_lock); return (EPERM); } mutex_spin_enter(&tty_lock); tp->t_pgrp = pgrp; mutex_spin_exit(&tty_lock); mutex_exit(&proc_lock); break; } case TIOCSPGRP: { /* set pgrp of tty */ struct pgrp *pgrp; pid_t pgid = *(pid_t *)data; if (pgid == NO_PGID) return EINVAL; mutex_enter(&proc_lock); if (!isctty(p, tp)) { mutex_exit(&proc_lock); return (ENOTTY); } pgrp = pgrp_find(pgid); if (pgrp == NULL || pgrp->pg_session != p->p_session) { mutex_exit(&proc_lock); return (EPERM); } mutex_spin_enter(&tty_lock); tp->t_pgrp = pgrp; mutex_spin_exit(&tty_lock); mutex_exit(&proc_lock); break; } case TIOCSTAT: /* get load avg stats */ mutex_enter(&proc_lock); ttygetinfo(tp, 0, infobuf, sizeof(infobuf)); mutex_exit(&proc_lock); mutex_spin_enter(&tty_lock); ttyputinfo(tp, infobuf); mutex_spin_exit(&tty_lock); break; case TIOCSWINSZ: /* set window size */ mutex_spin_enter(&tty_lock); if (memcmp((void *)&tp->t_winsize, data, sizeof(struct winsize))) { tp->t_winsize = *(struct winsize *)data; ttysig(tp, TTYSIG_PG1, SIGWINCH); } mutex_spin_exit(&tty_lock); break; case TIOCSQSIZE: if ((error = tty_get_qsize(&s, *(int *)data)) == 0 && s != tp->t_qsize) error = tty_set_qsize(tp, s); return error; case TIOCSBRK: case TIOCCBRK: case TIOCSDTR: case TIOCCDTR: case TIOCSFLAGS: case TIOCGFLAGS: case TIOCMSET: case TIOCMGET: case TIOCMBIS: case TIOCMBIC: /* Handled by the driver layer */ return EPASSTHROUGH; case TIOCEXT: case TIOCPTSNAME: case TIOCGRANTPT: case TIOCPKT: 
case TIOCUCNTL: case TIOCREMOTE: case TIOCSIG: /* for ptys */ return EPASSTHROUGH; default: /* Pass through various console ioctls */ switch (IOCGROUP(cmd)) { case 'c': /* syscons console */ case 'v': /* usl console, video - where one letter */ case 'K': /* usl console, keyboard - aint enough */ case 'V': /* pcvt compat */ case 'W': /* wscons console */ return EPASSTHROUGH; default: break; } /* We may have to load the compat_60 module for this. */ (void)module_autoload("compat_60", MODULE_CLASS_EXEC); MODULE_HOOK_CALL(tty_ttioctl_60_hook, (tp, cmd, data, flag, l), enosys(), error); if (error != EPASSTHROUGH) return error; /* We may have to load the compat_43 module for this. */ (void)module_autoload("compat_43", MODULE_CLASS_EXEC); MODULE_HOOK_CALL(tty_ttioctl_43_hook, (tp, cmd, data, flag, l), enosys(), error); return error; } return (0); } int ttpoll(struct tty *tp, int events, struct lwp *l) { int revents; revents = 0; mutex_spin_enter(&tty_lock); if (events & (POLLIN | POLLRDNORM)) if (ttnread(tp) > 0) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (tp->t_outq.c_cc <= tp->t_lowat) revents |= events & (POLLOUT | POLLWRNORM); if (events & POLLHUP) if (!CONNECTED(tp)) revents |= POLLHUP; if (revents == 0) { if (events & (POLLIN | POLLHUP | POLLRDNORM)) selrecord(l, &tp->t_rsel); if (events & (POLLOUT | POLLWRNORM)) selrecord(l, &tp->t_wsel); } mutex_spin_exit(&tty_lock); return (revents); } static void filt_ttyrdetach(struct knote *kn) { struct tty *tp; tp = kn->kn_hook; mutex_spin_enter(&tty_lock); selremove_knote(&tp->t_rsel, kn); mutex_spin_exit(&tty_lock); } static int filt_ttyread(struct knote *kn, long hint) { struct tty *tp; int rv; tp = kn->kn_hook; if ((hint & NOTE_SUBMIT) == 0) mutex_spin_enter(&tty_lock); kn->kn_data = ttnread(tp); rv = kn->kn_data > 0; if ((hint & NOTE_SUBMIT) == 0) mutex_spin_exit(&tty_lock); return rv; } static void filt_ttywdetach(struct knote *kn) { struct tty *tp; tp = kn->kn_hook; mutex_spin_enter(&tty_lock); selremove_knote(&tp->t_wsel, kn); mutex_spin_exit(&tty_lock); } static int filt_ttywrite(struct knote *kn, long hint) { struct tty *tp; int canwrite; tp = kn->kn_hook; if ((hint & NOTE_SUBMIT) == 0) mutex_spin_enter(&tty_lock); kn->kn_data = tp->t_outq.c_cn - tp->t_outq.c_cc; canwrite = (tp->t_outq.c_cc <= tp->t_lowat) && CONNECTED(tp); if ((hint & NOTE_SUBMIT) == 0) mutex_spin_exit(&tty_lock); return (canwrite); } static const struct filterops ttyread_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_ttyrdetach, .f_event = filt_ttyread, }; static const struct filterops ttywrite_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_ttywdetach, .f_event = filt_ttywrite, }; int ttykqfilter(dev_t dev, struct knote *kn) { struct tty *tp; struct selinfo *sip; if ((tp = cdev_tty(dev)) == NULL) return (ENXIO); switch (kn->kn_filter) { case EVFILT_READ: sip = &tp->t_rsel; kn->kn_fop = &ttyread_filtops; break; case EVFILT_WRITE: sip = &tp->t_wsel; kn->kn_fop = &ttywrite_filtops; break; default: return EINVAL; } kn->kn_hook = tp; mutex_spin_enter(&tty_lock); selrecord_knote(sip, kn); mutex_spin_exit(&tty_lock); return (0); } /* * Find the number of chars ready to be read from this tty. * Call with the tty lock held. 
*/ static int ttnread(struct tty *tp) { int nread; KASSERT(mutex_owned(&tty_lock)); if (ISSET(tp->t_lflag, PENDIN)) ttypend(tp); nread = tp->t_canq.c_cc; if (!ISSET(tp->t_lflag, ICANON)) { nread += tp->t_rawq.c_cc; if (nread < tp->t_cc[VMIN] && !tp->t_cc[VTIME]) nread = 0; } return (nread); } /* * Wait for output to drain, or if this times out, flush it. */ static int ttywait_timo(struct tty *tp, int timo) { int error; error = 0; mutex_spin_enter(&tty_lock); while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && CONNECTED(tp) && tp->t_oproc) { (*tp->t_oproc)(tp); error = ttysleep(tp, &tp->t_outcv, true, timo); if (error == EWOULDBLOCK) ttyflush(tp, FWRITE); if (error) break; } mutex_spin_exit(&tty_lock); return (error); } /* * Wait for output to drain. */ int ttywait(struct tty *tp) { return ttywait_timo(tp, 0); } /* * Flush if successfully wait. */ int ttywflush(struct tty *tp) { int error; error = ttywait_timo(tp, 5 * hz); if (error == 0 || error == EWOULDBLOCK) { mutex_spin_enter(&tty_lock); ttyflush(tp, FREAD); mutex_spin_exit(&tty_lock); } return (error); } /* * Flush tty read and/or write queues, notifying anyone waiting. * Call with the tty lock held. */ void ttyflush(struct tty *tp, int rw) { KASSERT(mutex_owned(&tty_lock)); if (rw & FREAD) { FLUSHQ(&tp->t_canq); FLUSHQ(&tp->t_rawq); tp->t_rocount = 0; tp->t_rocol = 0; CLR(tp->t_state, TS_LOCAL); ttwakeup(tp); } if (rw & FWRITE) { CLR(tp->t_state, TS_TTSTOP); cdev_stop(tp, rw); FLUSHQ(&tp->t_outq); cv_broadcast(&tp->t_outcv); selnotify(&tp->t_wsel, 0, NOTE_SUBMIT); } } /* * Copy in the default termios characters. */ void ttychars(struct tty *tp) { memcpy(tp->t_cc, ttydefchars, sizeof(ttydefchars)); } /* * Send stop character on input overflow. * Call with the tty lock held. */ static void ttyblock(struct tty *tp) { int total; KASSERT(mutex_owned(&tty_lock)); total = tp->t_rawq.c_cc + tp->t_canq.c_cc; if (tp->t_rawq.c_cc > TTYHOG) { ttyflush(tp, FREAD | FWRITE); CLR(tp->t_state, TS_TBLOCK); } /* * Block further input iff: current input > threshold * AND input is available to user program. */ if (total >= TTYHOG / 2 && !ISSET(tp->t_state, TS_TBLOCK) && (!ISSET(tp->t_lflag, ICANON) || tp->t_canq.c_cc > 0)) { if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE && putc(tp->t_cc[VSTOP], &tp->t_outq) == 0) { SET(tp->t_state, TS_TBLOCK); ttstart(tp); } /* Try to block remote output via hardware flow control. */ if (ISSET(tp->t_cflag, CHWFLOW) && tp->t_hwiflow && (*tp->t_hwiflow)(tp, 1) != 0) SET(tp->t_state, TS_TBLOCK); } } /* * Delayed line discipline output */ void ttrstrt(void *tp_arg) { struct tty *tp; #ifdef DIAGNOSTIC if (tp_arg == NULL) panic("ttrstrt"); #endif tp = tp_arg; mutex_spin_enter(&tty_lock); CLR(tp->t_state, TS_TIMEOUT); ttstart(tp); /* XXX - Shouldn't this be tp->l_start(tp)? */ mutex_spin_exit(&tty_lock); } /* * start a line discipline * Always call with tty lock held? */ int ttstart(struct tty *tp) { if (tp->t_oproc != NULL) /* XXX: Kludge for pty. */ (*tp->t_oproc)(tp); return (0); } /* * "close" a line discipline */ int ttylclose(struct tty *tp, int flag) { if (flag & FNONBLOCK) { mutex_spin_enter(&tty_lock); ttyflush(tp, FREAD | FWRITE); mutex_spin_exit(&tty_lock); } else ttywflush(tp); return (0); } /* * Handle modem control transition on a tty. * Flag indicates new state of carrier. * Returns 0 if the line should be turned off, otherwise 1. */ int ttymodem(struct tty *tp, int flag) { mutex_spin_enter(&tty_lock); if (flag == 0) { if (ISSET(tp->t_state, TS_CARR_ON)) { /* * Lost carrier. 
*/ CLR(tp->t_state, TS_CARR_ON); if (ISSET(tp->t_state, TS_ISOPEN) && !CONNECTED(tp)) { ttysig(tp, TTYSIG_LEADER, SIGHUP); ttyflush(tp, FREAD | FWRITE); mutex_spin_exit(&tty_lock); return (0); } } } else { if (!ISSET(tp->t_state, TS_CARR_ON)) { /* * Carrier now on. */ SET(tp->t_state, TS_CARR_ON); ttwakeup(tp); } } mutex_spin_exit(&tty_lock); return (1); } /* * Default modem control routine (for other line disciplines). * Return argument flag, to turn off device on carrier drop. */ int nullmodem(struct tty *tp, int flag) { mutex_spin_enter(&tty_lock); if (flag) SET(tp->t_state, TS_CARR_ON); else { CLR(tp->t_state, TS_CARR_ON); if (!CONNECTED(tp)) { ttysig(tp, TTYSIG_LEADER, SIGHUP); mutex_spin_exit(&tty_lock); return (0); } } mutex_spin_exit(&tty_lock); return (1); } /* * Reinput pending characters after state switch. */ void ttypend(struct tty *tp) { struct clist tq; int c; KASSERT(mutex_owned(&tty_lock)); CLR(tp->t_lflag, PENDIN); SET(tp->t_state, TS_TYPEN); tq = tp->t_rawq; tp->t_rawq.c_cc = 0; tp->t_rawq.c_cf = tp->t_rawq.c_cl = 0; while ((c = getc(&tq)) >= 0) ttyinput_wlock(c, tp); CLR(tp->t_state, TS_TYPEN); } /* * Process a read call on a tty device. */ int ttread(struct tty *tp, struct uio *uio, int flag) { struct clist *qp; u_char *cc; struct proc *p; int c, first, error, has_stime, last_cc; long lflag, slp; struct timeval now, stime; if (uio->uio_resid == 0) return 0; stime.tv_usec = 0; /* XXX gcc */ stime.tv_sec = 0; /* XXX gcc */ cc = tp->t_cc; p = curproc; error = 0; has_stime = 0; last_cc = 0; slp = 0; loop: mutex_spin_enter(&tty_lock); lflag = tp->t_lflag; /* * take pending input first */ if (ISSET(lflag, PENDIN)) ttypend(tp); /* * Hang process if it's in the background. */ if (isbackground(p, tp)) { if (sigismasked(curlwp, SIGTTIN) || p->p_lflag & PL_PPWAIT || p->p_pgrp->pg_jobc == 0) { mutex_spin_exit(&tty_lock); return (EIO); } mutex_spin_exit(&tty_lock); mutex_enter(&proc_lock); pgsignal(p->p_pgrp, SIGTTIN, 1); mutex_exit(&proc_lock); mutex_spin_enter(&tty_lock); error = ttypause(tp, hz); mutex_spin_exit(&tty_lock); if (error) return (error); goto loop; } if (!ISSET(lflag, ICANON)) { int m = cc[VMIN]; long t = cc[VTIME]; qp = &tp->t_rawq; /* * Check each of the four combinations. * (m > 0 && t == 0) is the normal read case. * It should be fairly efficient, so we check that and its * companion case (m == 0 && t == 0) first. * For the other two cases, we compute the target sleep time * into slp. */ if (t == 0) { if (qp->c_cc < m) goto sleep; goto read; } t *= hz; /* time in deca-ticks */ /* * Time difference in deca-ticks, split division to avoid numeric overflow. * Ok for hz < ~200kHz */ #define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 10 * hz + \ ((t1).tv_usec - (t2).tv_usec) / 100 * hz / 1000) if (m > 0) { if (qp->c_cc <= 0) goto sleep; if (qp->c_cc >= m) goto read; if (!has_stime) { /* first character, start timer */ has_stime = 1; getmicrotime(&stime); slp = t; } else if (qp->c_cc > last_cc) { /* got a character, restart timer */ getmicrotime(&stime); slp = t; } else { /* nothing, check expiration */ getmicrotime(&now); slp = t - diff(now, stime); } } else { /* m == 0 */ if (qp->c_cc > 0) goto read; if (!has_stime) { has_stime = 1; getmicrotime(&stime); slp = t; } else { getmicrotime(&now); slp = t - diff(now, stime); } } last_cc = qp->c_cc; #undef diff if (slp > 0) { /* * Convert deca-ticks back to ticks. * Rounding down may make us wake up just short * of the target, so we round up. 
* Maybe we should do 'slp/10 + 1' because the * first tick maybe almost immediate. * However it is more useful for a program that sets * VTIME=10 to wakeup every second not every 1.01 * seconds (if hz=100). */ slp = (slp + 9)/ 10; goto sleep; } } else if ((qp = &tp->t_canq)->c_cc <= 0) { int carrier; sleep: /* * If there is no input, sleep on rawq * awaiting hardware receipt and notification. * If we have data, we don't need to check for carrier. */ carrier = CONNECTED(tp); if (!carrier && ISSET(tp->t_state, TS_ISOPEN)) { mutex_spin_exit(&tty_lock); return (0); /* EOF */ } if (!has_stime || slp <= 0) { if (flag & IO_NDELAY) { mutex_spin_exit(&tty_lock); return (EWOULDBLOCK); } } error = ttysleep(tp, &tp->t_rawcv, true, slp); mutex_spin_exit(&tty_lock); /* VMIN == 0: any quantity read satisfies */ if (cc[VMIN] == 0 && error == EWOULDBLOCK) return (0); if (error && error != EWOULDBLOCK) return (error); goto loop; } read: /* * Input present, check for input mapping and processing. */ first = 1; while ((c = getc(qp)) >= 0) { /* * delayed suspend (^Y) */ if (CCEQ(cc[VDSUSP], c) && ISSET(lflag, IEXTEN|ISIG) == (IEXTEN|ISIG)) { ttysig(tp, TTYSIG_PG1, SIGTSTP); if (first) { error = ttypause(tp, hz); if (error) break; mutex_spin_exit(&tty_lock); goto loop; } break; } /* * Interpret EOF only in canonical mode. */ if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON)) break; /* * Give user character. */ mutex_spin_exit(&tty_lock); error = ureadc(c, uio); mutex_spin_enter(&tty_lock); if (error) break; if (uio->uio_resid == 0) break; /* * In canonical mode check for a "break character" * marking the end of a "line of input". */ if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag)) break; first = 0; } /* * Look to unblock output now that (presumably) * the input queue has gone down. */ if (ISSET(tp->t_state, TS_TBLOCK) && tp->t_rawq.c_cc < TTYHOG / 5) { if (ISSET(tp->t_iflag, IXOFF) && cc[VSTART] != _POSIX_VDISABLE && putc(cc[VSTART], &tp->t_outq) == 0) { CLR(tp->t_state, TS_TBLOCK); ttstart(tp); } /* Try to unblock remote output via hardware flow control. */ if (ISSET(tp->t_cflag, CHWFLOW) && tp->t_hwiflow && (*tp->t_hwiflow)(tp, 0) != 0) CLR(tp->t_state, TS_TBLOCK); } mutex_spin_exit(&tty_lock); return (error); } /* * Check the output queue on tp for space for a kernel message (from uprintf * or tprintf). Allow some space over the normal hiwater mark so we don't * lose messages due to normal flow control, but don't let the tty run amok. * Sleeps here are not interruptible, but we return prematurely if new signals * arrive. * Call with tty lock held. */ static int ttycheckoutq_wlock(struct tty *tp) { int hiwat; KASSERT(mutex_owned(&tty_lock)); hiwat = tp->t_hiwat; if (tp->t_outq.c_cc > hiwat + 200) if (tp->t_outq.c_cc > hiwat) { ttstart(tp); return (0); } return (1); } int ttycheckoutq(struct tty *tp) { int r; mutex_spin_enter(&tty_lock); r = ttycheckoutq_wlock(tp); mutex_spin_exit(&tty_lock); return (r); } /* * Process a write call on a tty device. */ int ttwrite(struct tty *tp, struct uio *uio, int flag) { u_char *cp; struct proc *p; int cc, cc0, ce, i, hiwat, error; u_char obuf[OBUFSIZ]; cp = NULL; hiwat = tp->t_hiwat; error = 0; cc0 = cc = 0; loop: mutex_spin_enter(&tty_lock); if (!CONNECTED(tp)) { if (ISSET(tp->t_state, TS_ISOPEN)) { mutex_spin_exit(&tty_lock); return (EIO); } else if (flag & IO_NDELAY) { mutex_spin_exit(&tty_lock); error = EWOULDBLOCK; goto out; } else { /* Sleep awaiting carrier. 
*/ error = ttysleep(tp, &tp->t_rawcv, true, 0); mutex_spin_exit(&tty_lock); if (error) goto out; goto loop; } } /* * Hang the process if it's in the background. */ p = curproc; if (isbackground(p, tp) && ISSET(tp->t_lflag, TOSTOP) && (p->p_lflag & PL_PPWAIT) == 0 && !sigismasked(curlwp, SIGTTOU)) { if (p->p_pgrp->pg_jobc == 0) { error = EIO; mutex_spin_exit(&tty_lock); goto out; } mutex_spin_exit(&tty_lock); mutex_enter(&proc_lock); pgsignal(p->p_pgrp, SIGTTOU, 1); mutex_exit(&proc_lock); mutex_spin_enter(&tty_lock); error = ttypause(tp, hz); mutex_spin_exit(&tty_lock); if (error) goto out; goto loop; } mutex_spin_exit(&tty_lock); /* * Process the user's data in at most OBUFSIZ chunks. Perform any * output translation. Keep track of high water mark, sleep on * overflow awaiting device aid in acquiring new space. */ while (uio->uio_resid > 0 || cc > 0) { if (ISSET(tp->t_lflag, FLUSHO)) { uio->uio_resid = 0; return (0); } if (tp->t_outq.c_cc > hiwat) goto ovhiwat; /* * Grab a hunk of data from the user, unless we have some * leftover from last time. */ if (cc == 0) { uioskip(cc0, uio); cc0 = cc = uimin(uio->uio_resid, OBUFSIZ); cp = obuf; error = uiopeek(cp, cc, uio); if (error) { cc = 0; goto out; } } /* * If nothing fancy need be done, grab those characters we * can handle without any of ttyoutput's processing and * just transfer them to the output q. For those chars * which require special processing (as indicated by the * bits in char_type), call ttyoutput. After processing * a hunk of data, look for FLUSHO so ^O's will take effect * immediately. */ mutex_spin_enter(&tty_lock); while (cc > 0) { if (!ISSET(tp->t_oflag, OPOST)) ce = cc; else { ce = cc - scanc((u_int)cc, cp, char_type, CCLASSMASK); /* * If ce is zero, then we're processing * a special character through ttyoutput. */ if (ce == 0) { tp->t_rocount = 0; if (ttyoutput(*cp, tp) >= 0) { /* out of space */ mutex_spin_exit(&tty_lock); goto overfull; } cp++; cc--; if (ISSET(tp->t_lflag, FLUSHO) || tp->t_outq.c_cc > hiwat) { mutex_spin_exit(&tty_lock); goto ovhiwat; } continue; } } /* * A bunch of normal characters have been found. * Transfer them en masse to the output queue and * continue processing at the top of the loop. * If there are any further characters in this * <= OBUFSIZ chunk, the first should be a character * requiring special handling by ttyoutput. */ tp->t_rocount = 0; i = b_to_q(cp, ce, &tp->t_outq); ce -= i; tp->t_column += ce; cp += ce, cc -= ce, tk_nout += ce; tp->t_outcc += ce; if (i > 0) { /* out of space */ mutex_spin_exit(&tty_lock); goto overfull; } if (ISSET(tp->t_lflag, FLUSHO) || tp->t_outq.c_cc > hiwat) break; } ttstart(tp); mutex_spin_exit(&tty_lock); } out: KASSERTMSG(error || cc == 0, "error=%d cc=%d", error, cc); KASSERTMSG(cc0 >= cc, "cc0=%d cc=%d", cc0, cc); uioskip(cc0 - cc, uio); return (error); overfull: /* * Since we are using ring buffers, if we can't insert any more into * the output queue, we can assume the ring is full and that someone * forgot to set the high water mark correctly. We set it and then * proceed as normal. */ hiwat = tp->t_outq.c_cc - 1; ovhiwat: mutex_spin_enter(&tty_lock); ttstart(tp); /* * This can only occur if FLUSHO is set in t_lflag, * or if ttstart/oproc is synchronous (or very fast). 
*/ if (tp->t_outq.c_cc <= hiwat) { mutex_spin_exit(&tty_lock); goto loop; } if (flag & IO_NDELAY) { mutex_spin_exit(&tty_lock); error = EWOULDBLOCK; goto out; } error = ttysleep(tp, &tp->t_outcv, true, 0); mutex_spin_exit(&tty_lock); if (error) goto out; goto loop; } /* * Try to pull more output from the producer. Return non-zero if * there is output ready to be sent. */ bool ttypull(struct tty *tp) { /* XXXSMP not yet KASSERT(mutex_owned(&tty_lock)); */ if (tp->t_outq.c_cc <= tp->t_lowat) { cv_broadcast(&tp->t_outcv); selnotify(&tp->t_wsel, 0, NOTE_SUBMIT); } return tp->t_outq.c_cc != 0; } /* * Rubout one character from the rawq of tp * as cleanly as possible. * Called with tty lock held. */ void ttyrub(int c, struct tty *tp) { u_char *cp; int savecol, tabc; KASSERT(mutex_owned(&tty_lock)); if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC)) return; CLR(tp->t_lflag, FLUSHO); if (ISSET(tp->t_lflag, ECHOE)) { if (tp->t_rocount == 0) { /* * Screwed by ttwrite; retype */ ttyretype(tp); return; } if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE)) ttyrubo(tp, 2); else { CLR(c, ~TTY_CHARMASK); switch (CCLASS(c)) { case ORDINARY: ttyrubo(tp, 1); break; case BACKSPACE: case CONTROL: case NEWLINE: case RETURN: case VTAB: if (ISSET(tp->t_lflag, ECHOCTL)) ttyrubo(tp, 2); break; case TAB: if (tp->t_rocount < tp->t_rawq.c_cc) { ttyretype(tp); return; } savecol = tp->t_column; SET(tp->t_state, TS_CNTTB); SET(tp->t_lflag, FLUSHO); tp->t_column = tp->t_rocol; for (cp = firstc(&tp->t_rawq, &tabc); cp; cp = nextc(&tp->t_rawq, cp, &tabc)) ttyecho(tabc, tp); CLR(tp->t_lflag, FLUSHO); CLR(tp->t_state, TS_CNTTB); /* savecol will now be length of the tab. */ savecol -= tp->t_column; tp->t_column += savecol; if (savecol > 8) savecol = 8; /* overflow screw */ while (--savecol >= 0) (void)ttyoutput('\b', tp); break; default: /* XXX */ (void)printf("ttyrub: would panic c = %d, " "val = %d\n", c, CCLASS(c)); } } } else if (ISSET(tp->t_lflag, ECHOPRT)) { if (!ISSET(tp->t_state, TS_ERASE)) { SET(tp->t_state, TS_ERASE); (void)ttyoutput('\\', tp); } ttyecho(c, tp); } else ttyecho(tp->t_cc[VERASE], tp); --tp->t_rocount; } /* * Back over cnt characters, erasing them. * Called with tty lock held. */ static void ttyrubo(struct tty *tp, int cnt) { KASSERT(mutex_owned(&tty_lock)); while (cnt-- > 0) { (void)ttyoutput('\b', tp); (void)ttyoutput(' ', tp); (void)ttyoutput('\b', tp); } } /* * ttyretype -- * Reprint the rawq line. Note, it is assumed that c_cc has already * been checked. * * Called with tty lock held. */ void ttyretype(struct tty *tp) { u_char *cp; int c; KASSERT(mutex_owned(&tty_lock)); /* Echo the reprint character. */ if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE) ttyecho(tp->t_cc[VREPRINT], tp); (void)ttyoutput('\n', tp); for (cp = firstc(&tp->t_canq, &c); cp; cp = nextc(&tp->t_canq, cp, &c)) ttyecho(c, tp); for (cp = firstc(&tp->t_rawq, &c); cp; cp = nextc(&tp->t_rawq, cp, &c)) ttyecho(c, tp); CLR(tp->t_state, TS_ERASE); tp->t_rocount = tp->t_rawq.c_cc; tp->t_rocol = 0; } /* * Echo a typed character to the terminal. * Called with tty lock held. 
*/ static void ttyecho(int c, struct tty *tp) { KASSERT(mutex_owned(&tty_lock)); if (!ISSET(tp->t_state, TS_CNTTB)) CLR(tp->t_lflag, FLUSHO); if ((!ISSET(tp->t_lflag, ECHO) && (!ISSET(tp->t_lflag, ECHONL) || c != '\n')) || ISSET(tp->t_lflag, EXTPROC)) return; if (((ISSET(tp->t_lflag, ECHOCTL) && (ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n')) || ISSET(c, TTY_CHARMASK) == 0177)) { (void)ttyoutput('^', tp); CLR(c, ~TTY_CHARMASK); if (c == 0177) c = '?'; else c += 'A' - 1; } (void)ttyoutput(c, tp); } /* * Wake up any readers on a tty. * Called with tty lock held. */ void ttwakeup(struct tty *tp) { KASSERT(mutex_owned(&tty_lock)); selnotify(&tp->t_rsel, 0, NOTE_SUBMIT); if (ISSET(tp->t_state, TS_ASYNC)) ttysig(tp, TTYSIG_PG2, SIGIO); cv_broadcast(&tp->t_rawcv); } /* * Look up a code for a specified speed in a conversion table; * used by drivers to map software speed values to hardware parameters. */ int ttspeedtab(int speed, const struct speedtab *table) { for (; table->sp_speed != -1; table++) if (table->sp_speed == speed) return (table->sp_code); return (-1); } /* * Set tty hi and low water marks. * * Try to arrange the dynamics so there's about one second * from hi to low water. */ void ttsetwater(struct tty *tp) { int cps, x; /* XXX not yet KASSERT(mutex_owned(&tty_lock)); */ #define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x)) cps = tp->t_ospeed / 10; tp->t_lowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT); x += cps; x = CLAMP(x, TTMAXHIWAT, TTMINHIWAT); tp->t_hiwat = roundup(x, TTROUND); #undef CLAMP } /* * Prepare report on state of foreground process group. * Call with &proc_lock held. */ void ttygetinfo(struct tty *tp, int fromsig, char *buf, size_t bufsz) { struct lwp *l; struct proc *p, *pick = NULL; struct timeval utime, stime; int tmp; fixpt_t pctcpu = 0; const char *msg = NULL; char lmsg[100]; long rss; bool again = false; KASSERT(mutex_owned(&proc_lock)); *buf = '\0'; retry: if (tp->t_session == NULL) msg = "not a controlling terminal\n"; else if (tp->t_pgrp == NULL) msg = "no foreground process group\n"; else if ((p = LIST_FIRST(&tp->t_pgrp->pg_members)) == NULL) msg = "empty foreground process group\n"; else { /* Pick interesting process. */ for (; p != NULL; p = LIST_NEXT(p, p_pglist)) { struct proc *oldpick; if (pick == NULL) { pick = p; continue; } if (pick->p_lock < p->p_lock) { mutex_enter(pick->p_lock); mutex_enter(p->p_lock); } else if (pick->p_lock > p->p_lock) { mutex_enter(p->p_lock); mutex_enter(pick->p_lock); } else mutex_enter(p->p_lock); oldpick = pick; if (proc_compare_wrapper(pick, p)) pick = p; mutex_exit(p->p_lock); if (p->p_lock != oldpick->p_lock) mutex_exit(oldpick->p_lock); } if (pick != NULL) { mutex_enter(pick->p_lock); if (P_ZOMBIE(pick)) { mutex_exit(pick->p_lock); pick = NULL; if (!again) { again = true; goto retry; } msg = "found only zombie processes\n"; } if (pick && fromsig && (SIGACTION_PS(pick->p_sigacts, SIGINFO).sa_flags & SA_NOKERNINFO)) { mutex_exit(pick->p_lock); return; } } } /* Print load average. 
*/ tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; snprintf(lmsg, sizeof(lmsg), "load: %d.%02d ", tmp / 100, tmp % 100); strlcat(buf, lmsg, bufsz); if (pick == NULL) { strlcat(buf, msg, bufsz); return; } snprintf(lmsg, sizeof(lmsg), " cmd: %s %d [", pick->p_comm, pick->p_pid); strlcat(buf, lmsg, bufsz); KASSERT(mutex_owned(pick->p_lock)); LIST_FOREACH(l, &pick->p_lwps, l_sibling) { const char *lp; lwp_lock(l); #ifdef LWP_PC #define FMT_RUN "%#"PRIxVADDR #define VAL_RUNNING (vaddr_t)LWP_PC(l) #define VAL_RUNNABLE (vaddr_t)LWP_PC(l) #else #define FMT_RUN "%s" #define VAL_RUNNING "running" #define VAL_RUNNABLE "runnable" #endif switch (l->l_stat) { case LSONPROC: snprintf(lmsg, sizeof(lmsg), FMT_RUN"/%d", VAL_RUNNING, cpu_index(l->l_cpu)); lp = lmsg; break; case LSRUN: snprintf(lmsg, sizeof(lmsg), FMT_RUN, VAL_RUNNABLE); lp = lmsg; break; default: lp = l->l_wchan ? l->l_wmesg : "iowait"; break; } strlcat(buf, lp, bufsz); strlcat(buf, LIST_NEXT(l, l_sibling) != NULL ? " " : "] ", bufsz); pctcpu += l->l_pctcpu; lwp_unlock(l); } pctcpu += pick->p_pctcpu; calcru(pick, &utime, &stime, NULL, NULL); mutex_exit(pick->p_lock); /* Round up and print user+system time, %CPU and RSS. */ utime.tv_usec += 5000; if (utime.tv_usec >= 1000000) { utime.tv_sec += 1; utime.tv_usec -= 1000000; } stime.tv_usec += 5000; if (stime.tv_usec >= 1000000) { stime.tv_sec += 1; stime.tv_usec -= 1000000; } #define pgtok(a) (((u_long) ((a) * PAGE_SIZE) / 1024)) tmp = (pctcpu * 10000 + FSCALE / 2) >> FSHIFT; if (pick->p_stat == SIDL || P_ZOMBIE(pick)) rss = 0; else rss = pgtok(vm_resident_count(pick->p_vmspace)); snprintf(lmsg, sizeof(lmsg), "%ld.%02ldu %ld.%02lds %d%% %ldk", (long)utime.tv_sec, (long)utime.tv_usec / 10000, (long)stime.tv_sec, (long)stime.tv_usec / 10000, tmp / 100, rss); strlcat(buf, lmsg, bufsz); } /* * Print report on state of foreground process group. * Call with tty_lock held. */ void ttyputinfo(struct tty *tp, char *buf) { KASSERT(mutex_owned(&tty_lock)); if (ttycheckoutq_wlock(tp) == 0) return; ttyprintf_nolock(tp, "%s\n", buf); tp->t_rocount = 0; /* so pending input will be retyped if BS */ } /* * Returns 1 if p2 has a better chance being the active foreground process * in a terminal instead of p1. */ static int proc_compare_wrapper(struct proc *p1, struct proc *p2) { lwp_t *l1, *l2; KASSERT(mutex_owned(p1->p_lock)); KASSERT(mutex_owned(p2->p_lock)); l1 = LIST_FIRST(&p1->p_lwps); l2 = LIST_FIRST(&p2->p_lwps); return proc_compare(p1, l1, p2, l2); } /* * Output char to tty; console putchar style. * Can be called with tty lock held through kprintf() machinery.. */ int tputchar(int c, int flags, struct tty *tp) { int r = 0; if ((flags & NOLOCK) == 0) mutex_spin_enter(&tty_lock); if (!CONNECTED(tp)) { r = -1; goto out; } if (c == '\n') (void)ttyoutput('\r', tp); (void)ttyoutput(c, tp); ttstart(tp); out: if ((flags & NOLOCK) == 0) mutex_spin_exit(&tty_lock); return (r); } /* * Sleep on chan, returning ERESTART if tty changed while we napped and * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by * cv_timedwait(_sig). * If the tty is revoked, restarting a pending call will redo validation done * at the start of the call. * * Must be called with the tty lock held. 
*/ int ttysleep(struct tty *tp, kcondvar_t *cv, bool catch_p, int timo) { int error; short gen; KASSERT(mutex_owned(&tty_lock)); gen = tp->t_gen; if (ISSET(tp->t_state, TS_CANCEL)) error = ERESTART; else if (cv == NULL) error = kpause("ttypause", catch_p, timo, &tty_lock); else if (catch_p) error = cv_timedwait_sig(cv, &tty_lock, timo); else error = cv_timedwait(cv, &tty_lock, timo); if (error != 0) return (error); return (tp->t_gen == gen ? 0 : ERESTART); } int ttypause(struct tty *tp, int timo) { int error; error = ttysleep(tp, NULL, true, timo); if (error == EWOULDBLOCK) error = 0; return error; } /* * Attach a tty to the tty list. * * This should be called ONLY once per real tty (including pty's). * eg, on the sparc, the keyboard and mouse have struct tty's that are * distinctly NOT usable as tty's, and thus should not be attached to * the ttylist. This is why this call is not done from tty_alloc(). * * Device drivers should attach tty's at a similar time that they are * allocated, or, for the case of statically allocated struct tty's * either in the attach or (first) open routine. */ void tty_attach(struct tty *tp) { mutex_spin_enter(&tty_lock); TAILQ_INSERT_TAIL(&ttylist, tp, tty_link); ++tty_count; mutex_spin_exit(&tty_lock); } /* * Remove a tty from the tty list. */ void tty_detach(struct tty *tp) { mutex_spin_enter(&tty_lock); --tty_count; #ifdef DIAGNOSTIC if (tty_count < 0) panic("tty_detach: tty_count < 0"); #endif TAILQ_REMOVE(&ttylist, tp, tty_link); mutex_spin_exit(&tty_lock); } /* * Allocate a tty structure and its associated buffers. */ struct tty * tty_alloc(void) { struct tty *tp; int i; tp = kmem_zalloc(sizeof(*tp), KM_SLEEP); callout_init(&tp->t_rstrt_ch, 0); callout_setfunc(&tp->t_rstrt_ch, ttrstrt, tp); tp->t_qsize = tty_qsize; clalloc(&tp->t_rawq, tp->t_qsize, 1); cv_init(&tp->t_rawcv, "ttyraw"); cv_init(&tp->t_rawcvf, "ttyrawf"); clalloc(&tp->t_canq, tp->t_qsize, 1); cv_init(&tp->t_cancv, "ttycan"); cv_init(&tp->t_cancvf, "ttycanf"); /* output queue doesn't need quoting */ clalloc(&tp->t_outq, tp->t_qsize, 0); cv_init(&tp->t_outcv, "ttyout"); cv_init(&tp->t_outcvf, "ttyoutf"); /* Set default line discipline. */ tp->t_linesw = ttyldisc_default(); tp->t_dev = NODEV; selinit(&tp->t_rsel); selinit(&tp->t_wsel); for (i = 0; i < TTYSIG_COUNT; i++) { sigemptyset(&tp->t_sigs[i]); } return tp; } /* * Free a tty structure and its buffers. * * Be sure to call tty_detach() for any tty that has been * tty_attach()ed. */ void tty_free(struct tty *tp) { int i; mutex_enter(&proc_lock); mutex_enter(&tty_lock); for (i = 0; i < TTYSIG_COUNT; i++) sigemptyset(&tp->t_sigs[i]); if (tp->t_sigcount != 0) TAILQ_REMOVE(&tty_sigqueue, tp, t_sigqueue); mutex_exit(&tty_lock); mutex_exit(&proc_lock); callout_halt(&tp->t_rstrt_ch, NULL); callout_destroy(&tp->t_rstrt_ch); ttyldisc_release(tp->t_linesw); clfree(&tp->t_rawq); clfree(&tp->t_canq); clfree(&tp->t_outq); cv_destroy(&tp->t_rawcv); cv_destroy(&tp->t_rawcvf); cv_destroy(&tp->t_cancv); cv_destroy(&tp->t_cancvf); cv_destroy(&tp->t_outcv); cv_destroy(&tp->t_outcvf); seldestroy(&tp->t_rsel); seldestroy(&tp->t_wsel); kmem_free(tp, sizeof(*tp)); } /* * tty_unit: map dev_t to tty unit number, as with TTUNIT * * => defined as function for use with struct cdevsw::d_devtounit * => not for drivers with different unit numbering, e.g. TTUNIT(d) >> 4 */ int tty_unit(dev_t dev) { return TTUNIT(dev); } /* * ttyprintf_nolock: send a message to a specific tty, without locking. 
* * => should be used only by tty driver or anything that knows the * underlying tty will not be revoked(2)'d away. [otherwise, * use tprintf] */ static void ttyprintf_nolock(struct tty *tp, const char *fmt, ...) { va_list ap; /* No mutex needed; going to process TTY. */ va_start(ap, fmt); kprintf(fmt, TOTTY|NOLOCK, tp, NULL, ap); va_end(ap); } static int tty_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { struct tty *tty; int result; result = KAUTH_RESULT_DEFER; if (action != KAUTH_DEVICE_TTY_OPEN) return result; tty = arg0; /* If it's not opened, we allow. */ if ((tty->t_state & TS_ISOPEN) == 0) result = KAUTH_RESULT_ALLOW; else { /* * If it's opened, we can only allow if it's not exclusively * opened; otherwise, that's a privileged operation and we * let the secmodel handle it. */ if ((tty->t_state & TS_XCLUDE) == 0) result = KAUTH_RESULT_ALLOW; } return result; } /* * Initialize the tty subsystem. */ void tty_init(void) { mutex_init(&tty_lock, MUTEX_DEFAULT, IPL_VM); mutex_init(&constty_lock, MUTEX_DEFAULT, IPL_NONE); constty_psz = pserialize_create(); cv_init(&ttyref_cv, "ttyref"); tty_sigsih = softint_establish(SOFTINT_CLOCK, ttysigintr, NULL); KASSERT(tty_sigsih != NULL); tty_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE, tty_listener_cb, NULL); sysctl_kern_tty_setup(); } /* * Send a signal from a tty to its process group or session leader. * Handoff to the target is deferred to a soft interrupt. */ void ttysig(struct tty *tp, enum ttysigtype st, int sig) { sigset_t *sp; /* XXXSMP not yet KASSERT(mutex_owned(&tty_lock)); */ sp = &tp->t_sigs[st]; if (sigismember(sp, sig)) return; sigaddset(sp, sig); if (tp->t_sigcount++ == 0) TAILQ_INSERT_TAIL(&tty_sigqueue, tp, t_sigqueue); softint_schedule(tty_sigsih); } /* * Deliver deferred signals from ttys. Note that the process groups * and sessions associated with the ttys may have changed from when * the signal was originally sent, but in practice it should not matter. * For signals produced as a result of a syscall, the soft interrupt * will fire before the syscall returns to the user. */ static void ttysigintr(void *cookie) { struct tty *tp; enum ttysigtype st; struct pgrp *pgrp; struct session *sess; int sig, lflag; char infobuf[200]; mutex_enter(&proc_lock); mutex_spin_enter(&tty_lock); while ((tp = TAILQ_FIRST(&tty_sigqueue)) != NULL) { KASSERT(tp->t_sigcount > 0); for (st = TTYSIG_PG1; st < TTYSIG_COUNT; st++) { if ((sig = firstsig(&tp->t_sigs[st])) != 0) break; } KASSERT(st < TTYSIG_COUNT); sigdelset(&tp->t_sigs[st], sig); if (--tp->t_sigcount == 0) TAILQ_REMOVE(&tty_sigqueue, tp, t_sigqueue); pgrp = tp->t_pgrp; sess = tp->t_session; lflag = tp->t_lflag; if (sig == SIGINFO) { if (ISSET(tp->t_state, TS_SIGINFO)) { /* Via ioctl: ignore tty option. 
*/ tp->t_state &= ~TS_SIGINFO; lflag |= ISIG; } if (!ISSET(lflag, NOKERNINFO)) { mutex_spin_exit(&tty_lock); ttygetinfo(tp, 1, infobuf, sizeof(infobuf)); mutex_spin_enter(&tty_lock); ttyputinfo(tp, infobuf); } if (!ISSET(lflag, ISIG)) continue; } mutex_spin_exit(&tty_lock); KASSERT(sig != 0); switch (st) { case TTYSIG_PG1: if (pgrp != NULL) pgsignal(pgrp, sig, 1); break; case TTYSIG_PG2: if (pgrp != NULL) pgsignal(pgrp, sig, sess != NULL); break; case TTYSIG_LEADER: if (sess != NULL && sess->s_leader != NULL) psignal(sess->s_leader, sig); break; default: /* NOTREACHED */ break; } mutex_spin_enter(&tty_lock); } mutex_spin_exit(&tty_lock); mutex_exit(&proc_lock); } unsigned char tty_getctrlchar(struct tty *tp, unsigned which) { KASSERT(which < NCCS); return tp->t_cc[which]; } void tty_setctrlchar(struct tty *tp, unsigned which, unsigned char val) { KASSERT(which < NCCS); tp->t_cc[which] = val; } int tty_try_xonxoff(struct tty *tp, unsigned char c) { const struct cdevsw *cdev; if (tp->t_iflag & IXON) { if (c == tp->t_cc[VSTOP] && tp->t_cc[VSTOP] != _POSIX_VDISABLE) { if ((tp->t_state & TS_TTSTOP) == 0) { tp->t_state |= TS_TTSTOP; cdev = cdevsw_lookup(tp->t_dev); if (cdev != NULL) (*cdev->d_stop)(tp, 0); } return 0; } if (c == tp->t_cc[VSTART] && tp->t_cc[VSTART] != _POSIX_VDISABLE) { tp->t_state &= ~TS_TTSTOP; if (tp->t_oproc != NULL) { mutex_spin_enter(&tty_lock); /* XXX */ (*tp->t_oproc)(tp); mutex_spin_exit(&tty_lock); /* XXX */ } return 0; } } return EAGAIN; }
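/*
 * Illustrative usage sketch, not part of the code above: the tty_attach()
 * comment describes when drivers should allocate and attach a tty.  Under
 * that model, a minimal driver attach/detach path could look roughly like
 * the following.  "foo_softc", "foostart" and "fooparam" are hypothetical
 * placeholder names, not an existing NetBSD driver.
 *
 *	void
 *	foo_attach(struct foo_softc *sc)
 *	{
 *		sc->sc_tty = tty_alloc();		// queues, condvars, default ldisc
 *		sc->sc_tty->t_oproc = foostart;		// driver output start routine
 *		sc->sc_tty->t_param = fooparam;		// driver parameter routine
 *		tty_attach(sc->sc_tty);			// ONLY once per real tty
 *	}
 *
 *	void
 *	foo_detach(struct foo_softc *sc)
 *	{
 *		tty_detach(sc->sc_tty);			// take it off ttylist first
 *		tty_free(sc->sc_tty);			// then release queues and the struct
 *	}
 */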
/* $NetBSD: pmap_private.h,v 1.5 2023/10/04 20:28:06 ad Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Frank van der Linden for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _X86_PMAP_PRIVATE_H_ #define _X86_PMAP_PRIVATE_H_ #ifndef _MACHINE_PMAP_PRIVATE_H_X86 #error Include machine/pmap_private.h, not x86/pmap_private.h. #endif #ifdef _KERNEL_OPT #include "opt_svs.h" #endif #include <sys/param.h> #include <sys/types.h> #include <sys/kcpuset.h> #include <sys/mutex.h> #include <sys/pool.h> #include <sys/queue.h> #include <sys/rwlock.h> #include <machine/cpufunc.h> #include <machine/pte.h> #include <machine/vmparam.h> #include <uvm/uvm_object.h> #include <uvm/uvm_pmap.h> struct pmap; #define SLAREA_USER 0 #define SLAREA_PTE 1 #define SLAREA_MAIN 2 #define SLAREA_PCPU 3 #define SLAREA_DMAP 4 #define SLAREA_HYPV 5 #define SLAREA_ASAN 6 #define SLAREA_MSAN 7 #define SLAREA_KERN 8 #define SLSPACE_NAREAS 9 struct slotspace { struct { size_t sslot; /* start slot */ size_t nslot; /* # of slots */ bool active; /* area is active */ } area[SLSPACE_NAREAS]; }; extern struct slotspace slotspace; #include <x86/gdt.h> struct pcpu_entry { uint8_t gdt[MAXGDTSIZ]; uint8_t ldt[MAX_USERLDT_SIZE]; uint8_t idt[PAGE_SIZE]; uint8_t tss[PAGE_SIZE]; uint8_t ist0[PAGE_SIZE]; uint8_t ist1[PAGE_SIZE]; uint8_t ist2[PAGE_SIZE]; uint8_t ist3[PAGE_SIZE]; uint8_t rsp0[2 * PAGE_SIZE]; } __packed; struct pcpu_area { #ifdef SVS uint8_t utls[PAGE_SIZE]; #endif uint8_t ldt[PAGE_SIZE]; struct pcpu_entry ent[MAXCPUS]; } __packed; extern struct pcpu_area *pcpuarea; #define PMAP_PCID_KERN 0 #define PMAP_PCID_USER 1 /* * pmap data structures: see pmap.c for details of locking. */ /* * we maintain a list of all non-kernel pmaps */ LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */ /* * linked list of all non-kernel pmaps */ extern struct pmap_head pmaps; extern kmutex_t pmaps_lock; /* protects pmaps */ /* * pool_cache(9) that pmaps are allocated from */ extern struct pool_cache pmap_cache; /* * the pmap structure * * note that the pm_obj contains the lock pointer, the reference count, * page list, and number of PTPs within the pmap. * * pm_lock is the same as the lock for vm object 0. 
Changes to * the other objects may only be made if that lock has been taken * (the other object locks are only used when uvm_pagealloc is called) */ struct pv_page; struct pmap { struct uvm_object pm_obj[PTP_LEVELS-1];/* objects for lvl >= 1) */ LIST_ENTRY(pmap) pm_list; /* list of all pmaps */ pd_entry_t *pm_pdir; /* VA of PD */ paddr_t pm_pdirpa[PDP_SIZE]; /* PA of PDs (read-only after create) */ struct vm_page *pm_ptphint[PTP_LEVELS-1]; /* pointer to a PTP in our pmap */ struct pmap_statistics pm_stats; /* pmap stats */ struct pv_entry *pm_pve; /* spare pv_entry */ LIST_HEAD(, pv_page) pm_pvp_part; LIST_HEAD(, pv_page) pm_pvp_empty; LIST_HEAD(, pv_page) pm_pvp_full; #if !defined(__x86_64__) vaddr_t pm_hiexec; /* highest executable mapping */ #endif /* !defined(__x86_64__) */ union descriptor *pm_ldt; /* user-set LDT */ size_t pm_ldt_len; /* XXX unused, remove */ int pm_ldt_sel; /* LDT selector */ kcpuset_t *pm_cpus; /* mask of CPUs using pmap */ kcpuset_t *pm_kernel_cpus; /* mask of CPUs using kernel part of pmap */ kcpuset_t *pm_xen_ptp_cpus; /* mask of CPUs which have this pmap's ptp mapped */ long pm_pctr; /* for assertions */ LIST_HEAD(,vm_page) pm_gc_ptp; /* PTPs queued for free */ /* Used by NVMM and Xen */ int (*pm_enter)(struct pmap *, vaddr_t, paddr_t, vm_prot_t, u_int); bool (*pm_extract)(struct pmap *, vaddr_t, paddr_t *); void (*pm_remove)(struct pmap *, vaddr_t, vaddr_t); int (*pm_sync_pv)(struct vm_page *, vaddr_t, paddr_t, int, uint8_t *, pt_entry_t *); void (*pm_pp_remove_ent)(struct pmap *, struct vm_page *, pt_entry_t, vaddr_t); void (*pm_write_protect)(struct pmap *, vaddr_t, vaddr_t, vm_prot_t); void (*pm_unwire)(struct pmap *, vaddr_t); void (*pm_tlb_flush)(struct pmap *); void *pm_data; kmutex_t pm_lock /* locks for pm_objs */ __aligned(64); /* give lock own cache line */ krwlock_t pm_dummy_lock; /* ugly hack for abusing uvm_object */ }; /* macro to access pm_pdirpa slots */ #ifdef PAE #define pmap_pdirpa(pmap, index) \ ((pmap)->pm_pdirpa[l2tol3(index)] + l2tol2(index) * sizeof(pd_entry_t)) #else #define pmap_pdirpa(pmap, index) \ ((pmap)->pm_pdirpa[0] + (index) * sizeof(pd_entry_t)) #endif /* * global kernel variables */ /* * PDPpaddr is the physical address of the kernel's PDP. * - i386 non-PAE and amd64: PDPpaddr corresponds directly to the %cr3 * value associated to the kernel process, proc0. * - i386 PAE: it still represents the PA of the kernel's PDP (L2). Due to * the L3 PD, it cannot be considered as the equivalent of a %cr3 any more. * - Xen: it corresponds to the PFN of the kernel's PDP. */ extern u_long PDPpaddr; extern pd_entry_t pmap_pg_g; /* do we support PTE_G? */ extern pd_entry_t pmap_pg_nx; /* do we support PTE_NX? */ extern int pmap_largepages; extern long nkptp[PTP_LEVELS]; #define pmap_valid_entry(E) ((E) & PTE_P) /* is PDE or PTE valid? 
*/ void pmap_map_ptes(struct pmap *, struct pmap **, pd_entry_t **, pd_entry_t * const **); void pmap_unmap_ptes(struct pmap *, struct pmap *); bool pmap_pdes_valid(vaddr_t, pd_entry_t * const *, pd_entry_t *, int *lastlvl); bool pmap_is_curpmap(struct pmap *); void pmap_ept_transform(struct pmap *); #ifndef __HAVE_DIRECT_MAP void pmap_vpage_cpu_init(struct cpu_info *); #endif vaddr_t slotspace_rand(int, size_t, size_t, size_t, vaddr_t); vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */ typedef enum tlbwhy { TLBSHOOT_REMOVE_ALL, TLBSHOOT_KENTER, TLBSHOOT_KREMOVE, TLBSHOOT_FREE_PTP, TLBSHOOT_REMOVE_PTE, TLBSHOOT_SYNC_PV, TLBSHOOT_WRITE_PROTECT, TLBSHOOT_ENTER, TLBSHOOT_NVMM, TLBSHOOT_BUS_DMA, TLBSHOOT_BUS_SPACE, TLBSHOOT__MAX, } tlbwhy_t; void pmap_tlb_init(void); void pmap_tlb_cpu_init(struct cpu_info *); void pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, tlbwhy_t); void pmap_tlb_shootnow(void); void pmap_tlb_intr(void); /* * inline functions */ /* * pmap_update_pg: flush one page from the TLB (or flush the whole thing * if hardware doesn't support one-page flushing) */ __inline static void __unused pmap_update_pg(vaddr_t va) { invlpg(va); } /* * various address inlines * * vtopte: return a pointer to the PTE mapping a VA, works only for * user and PT addresses * * kvtopte: return a pointer to the PTE mapping a kernel VA */ #include <lib/libkern/libkern.h> static __inline pt_entry_t * __unused vtopte(vaddr_t va) { KASSERT(va < VM_MIN_KERNEL_ADDRESS); return (PTE_BASE + pl1_i(va)); } static __inline pt_entry_t * __unused kvtopte(vaddr_t va) { pd_entry_t *pde; KASSERT(va >= VM_MIN_KERNEL_ADDRESS); pde = L2_BASE + pl2_i(va); if (*pde & PTE_PS) return ((pt_entry_t *)pde); return (PTE_BASE + pl1_i(va)); } #ifdef XENPV #include <sys/bitops.h> #define XPTE_MASK L1_FRAME /* Selects the index of a PTE in (A)PTE_BASE */ #define XPTE_SHIFT (L1_SHIFT - ilog2(sizeof(pt_entry_t))) /* PTE access inline functions */ /* * Get the machine address of the pointed pte * We use hardware MMU to get value so works only for levels 1-3 */ static __inline paddr_t xpmap_ptetomach(pt_entry_t *pte) { pt_entry_t *up_pte; vaddr_t va = (vaddr_t) pte; va = ((va & XPTE_MASK) >> XPTE_SHIFT) | (vaddr_t) PTE_BASE; up_pte = (pt_entry_t *) va; return (paddr_t) (((*up_pte) & PTE_FRAME) + (((vaddr_t) pte) & (~PTE_FRAME & ~VA_SIGN_MASK))); } /* Xen helpers to change bits of a pte */ #define XPMAP_UPDATE_DIRECT 1 /* Update direct map entry flags too */ paddr_t vtomach(vaddr_t); #define vtomfn(va) (vtomach(va) >> PAGE_SHIFT) #endif /* XENPV */ #ifdef __HAVE_PCPU_AREA extern struct pcpu_area *pcpuarea; #define PDIR_SLOT_PCPU 510 #define PMAP_PCPU_BASE (VA_SIGN_NEG((PDIR_SLOT_PCPU * NBPD_L4))) #endif void svs_quad_copy(void *, void *, long); #ifdef _KERNEL_OPT #include "opt_efi.h" #endif #ifdef EFI_RUNTIME void * pmap_activate_sync(struct pmap *); void pmap_deactivate_sync(struct pmap *, void *); bool pmap_is_user(struct pmap *); #else static inline bool pmap_is_user(struct pmap *pmap) { KASSERT(pmap != pmap_kernel()); return true; } #endif #endif /* _X86_PMAP_PRIVATE_H_ */
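/*
 * Illustrative sketch, not part of the header above: vtopte()/kvtopte() and
 * pmap_valid_entry() are typically combined along these lines to probe
 * whether a kernel VA currently has a valid mapping.  This is a simplified
 * example for exposition only ("example_kva_is_mapped" is a hypothetical
 * name); a real caller must first make sure the intermediate directories
 * are present, e.g. with pmap_pdes_valid(), before dereferencing the PTE.
 *
 *	static bool
 *	example_kva_is_mapped(vaddr_t va)
 *	{
 *		pt_entry_t *pte;
 *
 *		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
 *		pte = kvtopte(va);		// L1 PTE, or the L2 PDE for a superpage
 *		return pmap_valid_entry(*pte);	// PTE_P set?
 *	}
 */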
/* $NetBSD: uipc_domain.c,v 1.109 2023/03/30 15:58:21 riastradh Exp $ */ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_domain.c 8.3 (Berkeley) 2/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uipc_domain.c,v 1.109 2023/03/30 15:58:21 riastradh Exp $"); #include <sys/param.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/protosw.h> #include <sys/domain.h> #include <sys/mbuf.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/queue.h> #include <sys/proc.h> #include <sys/sysctl.h> #include <sys/un.h> #include <sys/unpcb.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/kauth.h> #include <netatalk/at.h> #include <net/if_dl.h> #include <netinet/in.h> MALLOC_DECLARE(M_SOCKADDR); MALLOC_DEFINE(M_SOCKADDR, "sockaddr", "socket endpoints"); void pffasttimo(void *); void pfslowtimo(void *); struct domainhead domains = STAILQ_HEAD_INITIALIZER(domains); static struct domain *domain_array[AF_MAX]; callout_t pffasttimo_ch, pfslowtimo_ch; /* * Current time values for fast and slow timeouts. We can use u_int * relatively safely. The fast timer will roll over in 27 years and * the slow timer in 68 years. */ u_int pfslowtimo_now; u_int pffasttimo_now; static struct sysctllog *domain_sysctllog; static void sysctl_net_setup(void); /* ensure successful linkage even without any domains in link sets */ static struct domain domain_dummy; __link_set_add_rodata(domains,domain_dummy); static void domain_init_timers(void) { callout_init(&pffasttimo_ch, CALLOUT_MPSAFE); callout_init(&pfslowtimo_ch, CALLOUT_MPSAFE); callout_reset(&pffasttimo_ch, 1, pffasttimo, NULL); callout_reset(&pfslowtimo_ch, 1, pfslowtimo, NULL); } void domaininit(bool attach) { __link_set_decl(domains, struct domain); struct domain * const * dpp; struct domain *rt_domain = NULL; sysctl_net_setup(); /* * Add all of the domains. Make sure the PF_ROUTE * domain is added last. */ if (attach) { __link_set_foreach(dpp, domains) { if (*dpp == &domain_dummy) continue; if ((*dpp)->dom_family == PF_ROUTE) rt_domain = *dpp; else domain_attach(*dpp); } if (rt_domain) domain_attach(rt_domain); domain_init_timers(); } } /* * Must be called only if domaininit has been called with false and * after all domains have been attached. 
*/ void domaininit_post(void) { domain_init_timers(); } void domain_attach(struct domain *dp) { const struct protosw *pr; STAILQ_INSERT_TAIL(&domains, dp, dom_link); if (dp->dom_family < __arraycount(domain_array)) domain_array[dp->dom_family] = dp; if (dp->dom_init) (*dp->dom_init)(); #ifdef MBUFTRACE if (dp->dom_mowner.mo_name[0] == '\0') { strncpy(dp->dom_mowner.mo_name, dp->dom_name, sizeof(dp->dom_mowner.mo_name)); MOWNER_ATTACH(&dp->dom_mowner); } #endif for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if (pr->pr_init) (*pr->pr_init)(); } if (max_linkhdr < 16) /* XXX */ max_linkhdr = 16; max_hdr = max_linkhdr + max_protohdr; max_datalen = MHLEN - max_hdr; } struct domain * pffinddomain(int family) { struct domain *dp; if (family < __arraycount(domain_array) && domain_array[family] != NULL) return domain_array[family]; DOMAIN_FOREACH(dp) if (dp->dom_family == family) return dp; return NULL; } const struct protosw * pffindtype(int family, int type) { struct domain *dp; const struct protosw *pr; dp = pffinddomain(family); if (dp == NULL) return NULL; for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_type && pr->pr_type == type) return pr; return NULL; } const struct protosw * pffindproto(int family, int protocol, int type) { struct domain *dp; const struct protosw *pr; const struct protosw *maybe = NULL; if (family == 0) return NULL; dp = pffinddomain(family); if (dp == NULL) return NULL; for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if ((pr->pr_protocol == protocol) && (pr->pr_type == type)) return pr; if (type == SOCK_RAW && pr->pr_type == SOCK_RAW && pr->pr_protocol == 0 && maybe == NULL) maybe = pr; } return maybe; } void * sockaddr_addr(struct sockaddr *sa, socklen_t *slenp) { const struct domain *dom; if ((dom = pffinddomain(sa->sa_family)) == NULL || dom->dom_sockaddr_addr == NULL) return NULL; return (*dom->dom_sockaddr_addr)(sa, slenp); } const void * sockaddr_const_addr(const struct sockaddr *sa, socklen_t *slenp) { const struct domain *dom; if ((dom = pffinddomain(sa->sa_family)) == NULL || dom->dom_sockaddr_const_addr == NULL) return NULL; return (*dom->dom_sockaddr_const_addr)(sa, slenp); } const struct sockaddr * sockaddr_any_by_family(sa_family_t family) { const struct domain *dom; if ((dom = pffinddomain(family)) == NULL) return NULL; return dom->dom_sa_any; } const struct sockaddr * sockaddr_any(const struct sockaddr *sa) { return sockaddr_any_by_family(sa->sa_family); } const void * sockaddr_anyaddr(const struct sockaddr *sa, socklen_t *slenp) { const struct sockaddr *any; if ((any = sockaddr_any(sa)) == NULL) return NULL; return sockaddr_const_addr(any, slenp); } socklen_t sockaddr_getsize_by_family(sa_family_t af) { switch (af) { case AF_INET: return sizeof(struct sockaddr_in); case AF_INET6: return sizeof(struct sockaddr_in6); case AF_UNIX: return sizeof(struct sockaddr_un); case AF_LINK: return sizeof(struct sockaddr_dl); case AF_APPLETALK: return sizeof(struct sockaddr_at); default: #ifdef DIAGNOSTIC printf("%s: (%s:%u:%u) Unhandled address family=%hhu\n", __func__, curlwp->l_proc->p_comm, curlwp->l_proc->p_pid, curlwp->l_lid, af); #endif return 0; } } #ifdef DIAGNOSTIC static void sockaddr_checklen(const struct sockaddr *sa) { // Can't tell how much was allocated, if it was allocated. 
if (sa->sa_family == AF_LINK) return; socklen_t len = sockaddr_getsize_by_family(sa->sa_family); if (len == 0 || len == sa->sa_len) return; char buf[512]; sockaddr_format(sa, buf, sizeof(buf)); printf("%s: %p bad len af=%hhu socklen=%hhu len=%u [%s]\n", __func__, sa, sa->sa_family, sa->sa_len, (unsigned)len, buf); } #else #define sockaddr_checklen(sa) ((void)0) #endif struct sockaddr * sockaddr_alloc(sa_family_t af, socklen_t socklen, int flags) { struct sockaddr *sa; socklen_t reallen = MAX(socklen, offsetof(struct sockaddr, sa_data[0])); #ifdef DIAGNOSTIC /* * sockaddr_checklen passes sa to sockaddr_format which * requires it to be fully initialized. * * XXX This should be factored better. */ flags |= M_ZERO; #endif if ((sa = malloc(reallen, M_SOCKADDR, flags)) == NULL) return NULL; sa->sa_family = af; sa->sa_len = reallen; sockaddr_checklen(sa); return sa; } struct sockaddr * sockaddr_copy(struct sockaddr *dst, socklen_t socklen, const struct sockaddr *src) { if (__predict_false(socklen < src->sa_len)) { panic("%s: source too long, %d < %d bytes", __func__, socklen, src->sa_len); } sockaddr_checklen(src); return memcpy(dst, src, src->sa_len); } struct sockaddr * sockaddr_externalize(struct sockaddr *dst, socklen_t socklen, const struct sockaddr *src) { struct domain *dom; dom = pffinddomain(src->sa_family); if (dom != NULL && dom->dom_sockaddr_externalize != NULL) return (*dom->dom_sockaddr_externalize)(dst, socklen, src); return sockaddr_copy(dst, socklen, src); } int sockaddr_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2) { int len, rc; struct domain *dom; if (sa1->sa_family != sa2->sa_family) return sa1->sa_family - sa2->sa_family; dom = pffinddomain(sa1->sa_family); if (dom != NULL && dom->dom_sockaddr_cmp != NULL) return (*dom->dom_sockaddr_cmp)(sa1, sa2); len = MIN(sa1->sa_len, sa2->sa_len); if (dom == NULL || dom->dom_sa_cmplen == 0) { if ((rc = memcmp(sa1, sa2, len)) != 0) return rc; return sa1->sa_len - sa2->sa_len; } if ((rc = memcmp((const char *)sa1 + dom->dom_sa_cmpofs, (const char *)sa2 + dom->dom_sa_cmpofs, MIN(dom->dom_sa_cmplen, len - MIN(len, dom->dom_sa_cmpofs)))) != 0) return rc; return MIN(dom->dom_sa_cmplen + dom->dom_sa_cmpofs, sa1->sa_len) - MIN(dom->dom_sa_cmplen + dom->dom_sa_cmpofs, sa2->sa_len); } struct sockaddr * sockaddr_dup(const struct sockaddr *src, int flags) { struct sockaddr *dst; if ((dst = sockaddr_alloc(src->sa_family, src->sa_len, flags)) == NULL) return NULL; return sockaddr_copy(dst, dst->sa_len, src); } void sockaddr_free(struct sockaddr *sa) { free(sa, M_SOCKADDR); } static int sun_print(char *buf, size_t len, const void *v) { const struct sockaddr_un *sun = v; size_t plen; KASSERT(sun->sun_len >= offsetof(struct sockaddr_un, sun_path[0])); plen = sun->sun_len - offsetof(struct sockaddr_un, sun_path[0]); len = MIN(len, plen); return snprintf(buf, len, "%s", sun->sun_path); } int sockaddr_format(const struct sockaddr *sa, char *buf, size_t len) { size_t plen = 0; if (sa == NULL) return strlcpy(buf, "(null)", len); switch (sa->sa_family) { case AF_LOCAL: plen = strlcpy(buf, "unix: ", len); break; case AF_INET: plen = strlcpy(buf, "inet: ", len); break; case AF_INET6: plen = strlcpy(buf, "inet6: ", len); break; case AF_LINK: plen = strlcpy(buf, "link: ", len); break; case AF_APPLETALK: plen = strlcpy(buf, "atalk: ", len); break; default: return snprintf(buf, len, "(unknown socket family %d)", (int)sa->sa_family); } buf += plen; if (plen > len) len = 0; else len -= plen; switch (sa->sa_family) { case AF_LOCAL: return sun_print(buf, 
len, sa); case AF_INET: return sin_print(buf, len, sa); case AF_INET6: return sin6_print(buf, len, sa); case AF_LINK: return sdl_print(buf, len, sa); case AF_APPLETALK: return sat_print(buf, len, sa); default: panic("bad family %hhu", sa->sa_family); } } /* * sysctl helper to stuff PF_LOCAL pcbs into sysctl structures */ static void sysctl_dounpcb(struct kinfo_pcb *pcb, const struct socket *so) { const bool allowaddr = get_expose_address(curproc); struct unpcb *unp = sotounpcb(so); struct sockaddr_un *un = unp->unp_addr; memset(pcb, 0, sizeof(*pcb)); pcb->ki_family = so->so_proto->pr_domain->dom_family; pcb->ki_type = so->so_proto->pr_type; pcb->ki_protocol = so->so_proto->pr_protocol; pcb->ki_pflags = unp->unp_flags; COND_SET_VALUE(pcb->ki_pcbaddr, PTRTOUINT64(unp), allowaddr); /* pcb->ki_ppcbaddr = unp has no ppcb... */ COND_SET_VALUE(pcb->ki_sockaddr, PTRTOUINT64(so), allowaddr); pcb->ki_sostate = so->so_state; /* pcb->ki_prstate = unp has no state... */ pcb->ki_rcvq = so->so_rcv.sb_cc; pcb->ki_sndq = so->so_snd.sb_cc; un = (struct sockaddr_un *)pcb->ki_spad; /* * local domain sockets may bind without having a local * endpoint. bleah! */ if (unp->unp_addr != NULL) { /* * We've added one to sun_len when allocating to * hold terminating NUL which we want here. See * makeun(). */ memcpy(un, unp->unp_addr, uimin(sizeof(pcb->ki_spad), unp->unp_addr->sun_len + 1)); } else { un->sun_len = offsetof(struct sockaddr_un, sun_path); un->sun_family = pcb->ki_family; } if (unp->unp_conn != NULL) { un = (struct sockaddr_un *)pcb->ki_dpad; if (unp->unp_conn->unp_addr != NULL) { memcpy(un, unp->unp_conn->unp_addr, uimin(sizeof(pcb->ki_dpad), unp->unp_conn->unp_addr->sun_len + 1)); } else { un->sun_len = offsetof(struct sockaddr_un, sun_path); un->sun_family = pcb->ki_family; } } pcb->ki_inode = unp->unp_ino; COND_SET_VALUE(pcb->ki_vnode, PTRTOUINT64(unp->unp_vnode), allowaddr); COND_SET_VALUE(pcb->ki_conn, PTRTOUINT64(unp->unp_conn), allowaddr); COND_SET_VALUE(pcb->ki_refs, PTRTOUINT64(unp->unp_refs), allowaddr); COND_SET_VALUE(pcb->ki_nextref, PTRTOUINT64(unp->unp_nextref), allowaddr); } static int sysctl_unpcblist(SYSCTLFN_ARGS) { struct file *fp, *np, *dfp; struct socket *so; struct kinfo_pcb pcb; char *dp; size_t len, needed, elem_size, out_size; int error, elem_count, pf, type; if (namelen == 1 && name[0] == CTL_QUERY) return sysctl_query(SYSCTLFN_CALL(rnode)); if (namelen != 4) return EINVAL; if (oldp != NULL) { len = *oldlenp; elem_size = name[2]; elem_count = name[3]; if (elem_size != sizeof(pcb)) return EINVAL; } else { len = 0; elem_size = sizeof(pcb); elem_count = INT_MAX; } error = 0; dp = oldp; out_size = elem_size; needed = 0; if (name - oname != 4) return EINVAL; pf = oname[1]; type = oname[2]; /* * allocate dummy file descriptor to make position in list. */ sysctl_unlock(); if ((dfp = fgetdummy()) == NULL) { sysctl_relock(); return ENOMEM; } /* * there's no "list" of local domain sockets, so we have * to walk the file list looking for them. :-/ */ mutex_enter(&filelist_lock); LIST_FOREACH_SAFE(fp, &filehead, f_list, np) { if (fp->f_count == 0 || fp->f_type != DTYPE_SOCKET || fp->f_socket == NULL) continue; so = fp->f_socket; if (so->so_type != type) continue; if (so->so_proto->pr_domain->dom_family != pf) continue; if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_CANSEE, so, NULL, NULL) != 0) continue; if (len >= elem_size && elem_count > 0) { mutex_enter(&fp->f_lock); /* * Do not add references, if the count reached 0. 
* Since the check above has been performed without * locking, it must be rechecked here as a concurrent * closef could have reduced it. */ if (fp->f_count == 0) { mutex_exit(&fp->f_lock); continue; } fp->f_count++; mutex_exit(&fp->f_lock); LIST_INSERT_AFTER(fp, dfp, f_list); mutex_exit(&filelist_lock); sysctl_dounpcb(&pcb, so); error = copyout(&pcb, dp, out_size); closef(fp); mutex_enter(&filelist_lock); np = LIST_NEXT(dfp, f_list); LIST_REMOVE(dfp, f_list); if (error) break; dp += elem_size; len -= elem_size; } needed += elem_size; if (elem_count > 0 && elem_count != INT_MAX) elem_count--; } mutex_exit(&filelist_lock); fputdummy(dfp); *oldlenp = needed; if (oldp == NULL) *oldlenp += PCB_SLOP * sizeof(struct kinfo_pcb); sysctl_relock(); return error; } static void sysctl_net_setup(void) { KASSERT(domain_sysctllog == NULL); sysctl_createv(&domain_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "local", SYSCTL_DESCR("PF_LOCAL related settings"), NULL, 0, NULL, 0, CTL_NET, PF_LOCAL, CTL_EOL); sysctl_createv(&domain_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "stream", SYSCTL_DESCR("SOCK_STREAM settings"), NULL, 0, NULL, 0, CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_EOL); sysctl_createv(&domain_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "seqpacket", SYSCTL_DESCR("SOCK_SEQPACKET settings"), NULL, 0, NULL, 0, CTL_NET, PF_LOCAL, SOCK_SEQPACKET, CTL_EOL); sysctl_createv(&domain_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "dgram", SYSCTL_DESCR("SOCK_DGRAM settings"), NULL, 0, NULL, 0, CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_EOL); sysctl_createv(&domain_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "pcblist", SYSCTL_DESCR("SOCK_STREAM protocol control block list"), sysctl_unpcblist, 0, NULL, 0, CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL); sysctl_createv(&domain_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "pcblist", SYSCTL_DESCR("SOCK_SEQPACKET protocol control " "block list"), sysctl_unpcblist, 0, NULL, 0, CTL_NET, PF_LOCAL, SOCK_SEQPACKET, CTL_CREATE, CTL_EOL); sysctl_createv(&domain_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "pcblist", SYSCTL_DESCR("SOCK_DGRAM protocol control block list"), sysctl_unpcblist, 0, NULL, 0, CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL); } void pfctlinput(int cmd, const struct sockaddr *sa) { struct domain *dp; const struct protosw *pr; DOMAIN_FOREACH(dp) { for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if (pr->pr_ctlinput != NULL) (*pr->pr_ctlinput)(cmd, sa, NULL); } } } void pfctlinput2(int cmd, const struct sockaddr *sa, void *ctlparam) { struct domain *dp; const struct protosw *pr; if (sa == NULL) return; DOMAIN_FOREACH(dp) { /* * the check must be made by xx_ctlinput() anyways, to * make sure we use data item pointed to by ctlparam in * correct way. the following check is made just for safety. 
*/ if (dp->dom_family != sa->sa_family) continue; for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if (pr->pr_ctlinput != NULL) (*pr->pr_ctlinput)(cmd, sa, ctlparam); } } } void pfslowtimo(void *arg) { struct domain *dp; const struct protosw *pr; pfslowtimo_now++; DOMAIN_FOREACH(dp) { for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_slowtimo) (*pr->pr_slowtimo)(); } callout_schedule(&pfslowtimo_ch, hz / PR_SLOWHZ); } void pffasttimo(void *arg) { struct domain *dp; const struct protosw *pr; pffasttimo_now++; DOMAIN_FOREACH(dp) { for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_fasttimo) (*pr->pr_fasttimo)(); } callout_schedule(&pffasttimo_ch, hz / PR_FASTHZ); }
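/*
 * Illustrative sketch, not part of the file above: the sockaddr helpers are
 * meant to be combined, e.g. duplicating a caller-supplied address and then
 * rendering it for a diagnostic message.  "example_log_sockaddr" is a
 * hypothetical function, not existing kernel code.
 *
 *	static void
 *	example_log_sockaddr(const struct sockaddr *sa)
 *	{
 *		struct sockaddr *copy;
 *		char buf[128];
 *
 *		if ((copy = sockaddr_dup(sa, M_NOWAIT)) == NULL)
 *			return;
 *		sockaddr_format(copy, buf, sizeof(buf));	// "inet: ...", "unix: ...", ...
 *		printf("peer address: %s\n", buf);
 *		sockaddr_free(copy);
 *	}
 */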
/* $NetBSD: union_vnops.c,v 1.83 2022/03/19 13:48:04 hannken Exp $ */ /* * Copyright (c) 1992, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_vnops.c 8.33 (Berkeley) 7/31/95 */ /* * Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)union_vnops.c 8.33 (Berkeley) 7/31/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: union_vnops.c,v 1.83 2022/03/19 13:48:04 hannken Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/time.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/malloc.h> #include <sys/buf.h> #include <sys/queue.h> #include <sys/lock.h> #include <sys/kauth.h> #include <fs/union/union.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> int union_parsepath(void *); int union_lookup(void *); int union_create(void *); int union_whiteout(void *); int union_mknod(void *); int union_open(void *); int union_close(void *); int union_access(void *); int union_getattr(void *); int union_setattr(void *); int union_read(void *); int union_write(void *); int union_ioctl(void *); int union_poll(void *); int union_revoke(void *); int union_mmap(void *); int union_fsync(void *); int union_seek(void *); int union_remove(void *); int union_link(void *); int union_rename(void *); int union_mkdir(void *); int union_rmdir(void *); int union_symlink(void *); int union_readdir(void *); int union_readlink(void *); int union_abortop(void *); int union_inactive(void *); int union_reclaim(void *); int union_lock(void *); int union_unlock(void *); int union_bmap(void *); int union_print(void *); int union_islocked(void *); int union_pathconf(void *); int union_advlock(void *); int union_strategy(void *); int union_bwrite(void *); int union_getpages(void *); int union_putpages(void *); int union_kqfilter(void *); static int union_lookup1(struct vnode *, struct vnode **, struct vnode **, struct componentname *); /* * Global vfs data structures */ int (**union_vnodeop_p)(void *); const struct vnodeopv_entry_desc union_vnodeop_entries[] = { { &vop_default_desc, vn_default_error }, { &vop_parsepath_desc, union_parsepath }, /* parsepath */ { &vop_lookup_desc, union_lookup }, /* lookup */ { &vop_create_desc, union_create }, /* create */ { &vop_whiteout_desc, union_whiteout }, /* whiteout */ { &vop_mknod_desc, union_mknod }, /* mknod */ { &vop_open_desc, union_open }, /* open */ { &vop_close_desc, union_close }, /* close */ { &vop_access_desc, union_access }, /* access */ { &vop_accessx_desc, genfs_accessx }, /* accessx */ { &vop_getattr_desc, union_getattr }, /* getattr */ { &vop_setattr_desc, union_setattr }, /* setattr */ { &vop_read_desc, union_read }, /* read */ { &vop_write_desc, union_write }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */ { &vop_ioctl_desc, union_ioctl }, /* ioctl */ { &vop_poll_desc, union_poll }, /* select */ { &vop_revoke_desc, union_revoke }, /* revoke */ { &vop_mmap_desc, union_mmap }, /* mmap */ { &vop_fsync_desc, union_fsync }, /* fsync */ { &vop_seek_desc, union_seek }, /* seek */ { &vop_remove_desc, union_remove }, /* remove */ { &vop_link_desc, union_link }, /* link */ { &vop_rename_desc, union_rename }, /* rename */ { &vop_mkdir_desc, union_mkdir }, /* mkdir */ { &vop_rmdir_desc, union_rmdir }, /* rmdir */ { &vop_symlink_desc, union_symlink }, /* symlink */ { &vop_readdir_desc, union_readdir }, /* readdir */ { &vop_readlink_desc, union_readlink }, /* readlink */ { &vop_abortop_desc, union_abortop }, /* abortop */ { &vop_inactive_desc, union_inactive }, /* inactive */ { &vop_reclaim_desc, union_reclaim }, /* reclaim */ { &vop_lock_desc, union_lock }, /* lock */ { &vop_unlock_desc, 
union_unlock }, /* unlock */ { &vop_bmap_desc, union_bmap }, /* bmap */ { &vop_strategy_desc, union_strategy }, /* strategy */ { &vop_bwrite_desc, union_bwrite }, /* bwrite */ { &vop_print_desc, union_print }, /* print */ { &vop_islocked_desc, union_islocked }, /* islocked */ { &vop_pathconf_desc, union_pathconf }, /* pathconf */ { &vop_advlock_desc, union_advlock }, /* advlock */ { &vop_getpages_desc, union_getpages }, /* getpages */ { &vop_putpages_desc, union_putpages }, /* putpages */ { &vop_kqfilter_desc, union_kqfilter }, /* kqfilter */ { NULL, NULL } }; const struct vnodeopv_desc union_vnodeop_opv_desc = { &union_vnodeop_p, union_vnodeop_entries }; #define NODE_IS_SPECIAL(vp) \ ((vp)->v_type == VBLK || (vp)->v_type == VCHR || \ (vp)->v_type == VSOCK || (vp)->v_type == VFIFO) int union_parsepath(void *v) { struct vop_parsepath_args /* { struct vnode *a_dvp; const char *a_name; size_t *a_retval; } */ *ap = v; struct vnode *upperdvp, *lowerdvp; size_t upper, lower; int error; upperdvp = UPPERVP(ap->a_dvp); lowerdvp = LOWERVP(ap->a_dvp); if (upperdvp != NULLVP) { error = VOP_PARSEPATH(upperdvp, ap->a_name, &upper); if (error) { return error; } } else { upper = 0; } if (lowerdvp != NULLVP) { error = VOP_PARSEPATH(lowerdvp, ap->a_name, &lower); if (error) { return error; } } else { lower = 0; } if (upper == 0 && lower == 0) { panic("%s: missing both layers", __func__); } /* * If they're different, use the larger one. This is not a * comprehensive solution, but it's sufficient for the * non-default cases of parsepath that currently exist. */ *ap->a_retval = MAX(upper, lower); return 0; } static int union_lookup1(struct vnode *udvp, struct vnode **dvpp, struct vnode **vpp, struct componentname *cnp) { int error; struct vnode *tdvp; struct vnode *dvp; struct mount *mp; dvp = *dvpp; /* * If stepping up the directory tree, check for going * back across the mount point, in which case do what * lookup would do by stepping back down the mount * hierarchy. */ if (cnp->cn_flags & ISDOTDOT) { while ((dvp != udvp) && (dvp->v_vflag & VV_ROOT)) { /* * Don't do the NOCROSSMOUNT check * at this level. By definition, * union fs deals with namespaces, not * filesystems. */ tdvp = dvp; *dvpp = dvp = dvp->v_mount->mnt_vnodecovered; VOP_UNLOCK(tdvp); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } } error = VOP_LOOKUP(dvp, &tdvp, cnp); if (error) return (error); if (dvp != tdvp) { if (cnp->cn_flags & ISDOTDOT) VOP_UNLOCK(dvp); error = vn_lock(tdvp, LK_EXCLUSIVE); if (cnp->cn_flags & ISDOTDOT) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); if (error) { vrele(tdvp); return error; } dvp = tdvp; } /* * Lastly check if the current node is a mount point in * which case walk up the mount hierarchy making sure not to * bump into the root of the mount tree (ie. dvp != udvp). 
*/ while (dvp != udvp && (dvp->v_type == VDIR) && (mp = dvp->v_mountedhere)) { if (vfs_busy(mp)) continue; vput(dvp); error = VFS_ROOT(mp, LK_EXCLUSIVE, &tdvp); vfs_unbusy(mp); if (error) { return (error); } dvp = tdvp; } *vpp = dvp; return (0); } int union_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap = v; int error; int uerror, lerror; struct vnode *uppervp, *lowervp; struct vnode *upperdvp, *lowerdvp; struct vnode *dvp = ap->a_dvp; struct union_node *dun = VTOUNION(dvp); struct componentname *cnp = ap->a_cnp; struct union_mount *um = MOUNTTOUNIONMOUNT(dvp->v_mount); kauth_cred_t saved_cred = NULL; int iswhiteout; struct vattr va; #ifdef notyet if (cnp->cn_namelen == 3 && cnp->cn_nameptr[2] == '.' && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') { dvp = *ap->a_vpp = LOWERVP(ap->a_dvp); if (dvp == NULLVP) return (ENOENT); vref(dvp); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (0); } #endif if ((cnp->cn_flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); start: upperdvp = dun->un_uppervp; lowerdvp = dun->un_lowervp; uppervp = NULLVP; lowervp = NULLVP; iswhiteout = 0; /* * do the lookup in the upper level. * if that level comsumes additional pathnames, * then assume that something special is going * on and just return that vnode. */ if (upperdvp != NULLVP) { uerror = union_lookup1(um->um_uppervp, &upperdvp, &uppervp, cnp); if (uerror == ENOENT || uerror == EJUSTRETURN) { if (cnp->cn_flags & ISWHITEOUT) { iswhiteout = 1; } else if (lowerdvp != NULLVP) { lerror = VOP_GETATTR(upperdvp, &va, cnp->cn_cred); if (lerror == 0 && (va.va_flags & OPAQUE)) iswhiteout = 1; } } } else { uerror = ENOENT; } /* * in a similar way to the upper layer, do the lookup * in the lower layer. this time, if there is some * component magic going on, then vput whatever we got * back from the upper layer and return the lower vnode * instead. */ if (lowerdvp != NULLVP && !iswhiteout) { int nameiop; vn_lock(lowerdvp, LK_EXCLUSIVE | LK_RETRY); /* * Only do a LOOKUP on the bottom node, since * we won't be making changes to it anyway. */ nameiop = cnp->cn_nameiop; cnp->cn_nameiop = LOOKUP; if (um->um_op == UNMNT_BELOW) { saved_cred = cnp->cn_cred; cnp->cn_cred = um->um_cred; } /* * we shouldn't have to worry about locking interactions * between the lower layer and our union layer (w.r.t. * `..' processing) because we don't futz with lowervp * locks in the union-node instantiation code path. */ lerror = union_lookup1(um->um_lowervp, &lowerdvp, &lowervp, cnp); if (um->um_op == UNMNT_BELOW) cnp->cn_cred = saved_cred; cnp->cn_nameiop = nameiop; if (lowervp != lowerdvp) VOP_UNLOCK(lowerdvp); } else { lerror = ENOENT; if ((cnp->cn_flags & ISDOTDOT) && dun->un_pvp != NULLVP) { lowervp = LOWERVP(dun->un_pvp); if (lowervp != NULLVP) { vref(lowervp); vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY); lerror = 0; } } } /* * EJUSTRETURN is used by underlying filesystems to indicate that * a directory modification op was started successfully. * This will only happen in the upper layer, since * the lower layer only does LOOKUPs. * If this union is mounted read-only, bounce it now. 
*/ if ((uerror == EJUSTRETURN) && (cnp->cn_flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && ((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME))) uerror = EROFS; /* * at this point, we have uerror and lerror indicating * possible errors with the lookups in the upper and lower * layers. additionally, uppervp and lowervp are (locked) * references to existing vnodes in the upper and lower layers. * * there are now three cases to consider. * 1. if both layers returned an error, then return whatever * error the upper layer generated. * * 2. if the top layer failed and the bottom layer succeeded * then two subcases occur. * a. the bottom vnode is not a directory, in which * case just return a new union vnode referencing * an empty top layer and the existing bottom layer. * b. the bottom vnode is a directory, in which case * create a new directory in the top-level and * continue as in case 3. * * 3. if the top layer succeeded then return a new union * vnode referencing whatever the new top layer and * whatever the bottom layer returned. */ *ap->a_vpp = NULLVP; /* case 1. */ if ((uerror != 0) && (lerror != 0)) { return (uerror); } /* case 2. */ if (uerror != 0 /* && (lerror == 0) */ ) { if (lowervp->v_type == VDIR) { /* case 2b. */ /* * We may be racing another process to make the * upper-level shadow directory. Be careful with * locks/etc! * If we have to create a shadow directory and want * to commit the node we have to restart the lookup * to get the componentname right. */ if (upperdvp) { VOP_UNLOCK(upperdvp); uerror = union_mkshadow(um, upperdvp, cnp, &uppervp); vn_lock(upperdvp, LK_EXCLUSIVE | LK_RETRY); if (uerror == 0 && cnp->cn_nameiop != LOOKUP) { vrele(uppervp); if (lowervp != NULLVP) vput(lowervp); goto start; } } if (uerror) { if (lowervp != NULLVP) { vput(lowervp); lowervp = NULLVP; } return (uerror); } } } else { /* uerror == 0 */ if (uppervp != upperdvp) VOP_UNLOCK(uppervp); } if (lowervp != NULLVP) VOP_UNLOCK(lowervp); error = union_allocvp(ap->a_vpp, dvp->v_mount, dvp, upperdvp, cnp, uppervp, lowervp, 1); if (error) { if (uppervp != NULLVP) vrele(uppervp); if (lowervp != NULLVP) vrele(lowervp); return error; } return 0; } int union_create(void *v) { struct vop_create_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; struct union_node *un = VTOUNION(ap->a_dvp); struct vnode *dvp = un->un_uppervp; struct componentname *cnp = ap->a_cnp; if (dvp != NULLVP) { int error; struct vnode *vp; struct mount *mp; mp = ap->a_dvp->v_mount; vp = NULL; error = VOP_CREATE(dvp, &vp, cnp, ap->a_vap); if (error) return (error); error = union_allocvp(ap->a_vpp, mp, NULLVP, NULLVP, cnp, vp, NULLVP, 1); if (error) vrele(vp); return (error); } return (EROFS); } int union_whiteout(void *v) { struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap = v; struct union_node *un = VTOUNION(ap->a_dvp); struct componentname *cnp = ap->a_cnp; if (un->un_uppervp == NULLVP) return (EOPNOTSUPP); return (VOP_WHITEOUT(un->un_uppervp, cnp, ap->a_flags)); } int union_mknod(void *v) { struct vop_mknod_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; struct union_node *un = VTOUNION(ap->a_dvp); struct vnode *dvp = un->un_uppervp; struct componentname *cnp = ap->a_cnp; if (dvp != NULLVP) { int error; struct vnode *vp; struct mount *mp; mp = ap->a_dvp->v_mount; error = VOP_MKNOD(dvp, &vp, cnp, ap->a_vap); if (error) 
return (error); error = union_allocvp(ap->a_vpp, mp, NULLVP, NULLVP, cnp, vp, NULLVP, 1); if (error) vrele(vp); return (error); } return (EROFS); } int union_open(void *v) { struct vop_open_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; struct union_node *un = VTOUNION(ap->a_vp); struct vnode *tvp; int mode = ap->a_mode; kauth_cred_t cred = ap->a_cred; struct lwp *l = curlwp; int error; /* * If there is an existing upper vp then simply open that. */ tvp = un->un_uppervp; if (tvp == NULLVP) { /* * If the lower vnode is being opened for writing, then * copy the file contents to the upper vnode and open that, * otherwise can simply open the lower vnode. */ tvp = un->un_lowervp; if ((ap->a_mode & FWRITE) && (tvp->v_type == VREG)) { error = union_copyup(un, (mode&O_TRUNC) == 0, cred, l); if (error == 0) error = VOP_OPEN(un->un_uppervp, mode, cred); if (error == 0) { mutex_enter(un->un_uppervp->v_interlock); un->un_uppervp->v_writecount++; mutex_exit(un->un_uppervp->v_interlock); } return (error); } /* * Just open the lower vnode, but check for nodev mount flag */ if ((tvp->v_type == VBLK || tvp->v_type == VCHR) && (ap->a_vp->v_mount->mnt_flag & MNT_NODEV)) return ENXIO; un->un_openl++; vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_OPEN(tvp, mode, cred); VOP_UNLOCK(tvp); return (error); } /* * Just open the upper vnode, checking for nodev mount flag first */ if ((tvp->v_type == VBLK || tvp->v_type == VCHR) && (ap->a_vp->v_mount->mnt_flag & MNT_NODEV)) return ENXIO; error = VOP_OPEN(tvp, mode, cred); if (error == 0 && (ap->a_mode & FWRITE)) { mutex_enter(tvp->v_interlock); tvp->v_writecount++; mutex_exit(tvp->v_interlock); } return (error); } int union_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct union_node *un = VTOUNION(ap->a_vp); struct vnode *vp; int error; bool do_lock; vp = un->un_uppervp; if (vp != NULLVP) { do_lock = false; } else { KASSERT(un->un_openl > 0); --un->un_openl; vp = un->un_lowervp; do_lock = true; } KASSERT(vp != NULLVP); ap->a_vp = vp; if ((ap->a_fflag & FWRITE)) { KASSERT(vp == un->un_uppervp); mutex_enter(vp->v_interlock); vp->v_writecount--; mutex_exit(vp->v_interlock); } if (do_lock) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VCALL(vp, VOFFSET(vop_close), ap); if (do_lock) VOP_UNLOCK(vp); return error; } /* * Check access permission on the union vnode. * The access check being enforced is to check * against both the underlying vnode, and any * copied vnode. This ensures that no additional * file permissions are given away simply because * the user caused an implicit file copy. */ int union_access(void *v) { struct vop_access_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; accmode_t a_accmode; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct union_node *un = VTOUNION(vp); int error = EACCES; struct union_mount *um = MOUNTTOUNIONMOUNT(vp->v_mount); /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (ap->a_accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; case VBAD: case VBLK: case VCHR: case VSOCK: case VFIFO: case VNON: default: break; } } /* * Copy up to prevent checking (and failing) against * underlying file system mounted read only. 
* Check for read access first to prevent implicit * copy of inaccessible underlying vnode. */ if (un->un_uppervp == NULLVP && (un->un_lowervp->v_type == VREG) && (ap->a_accmode & VWRITE)) { vn_lock(un->un_lowervp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(un->un_lowervp, VREAD, ap->a_cred); VOP_UNLOCK(un->un_lowervp); if (error == 0) error = union_copyup(un, 1, ap->a_cred, curlwp); if (error) return error; } if ((vp = un->un_uppervp) != NULLVP) { ap->a_vp = vp; return (VCALL(vp, VOFFSET(vop_access), ap)); } if ((vp = un->un_lowervp) != NULLVP) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); ap->a_vp = vp; error = VCALL(vp, VOFFSET(vop_access), ap); if (error == 0) { if (um->um_op == UNMNT_BELOW) { ap->a_cred = um->um_cred; error = VCALL(vp, VOFFSET(vop_access), ap); } } VOP_UNLOCK(vp); if (error) return (error); } return (error); } /* * We handle getattr only to change the fsid and * track object sizes */ int union_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; int error; struct union_node *un = VTOUNION(ap->a_vp); struct vnode *vp = un->un_uppervp; struct vattr *vap; struct vattr va; /* * Some programs walk the filesystem hierarchy by counting * links to directories to avoid stat'ing all the time. * This means the link count on directories needs to be "correct". * The only way to do that is to call getattr on both layers * and fix up the link count. The link count will not necessarily * be accurate but will be large enough to defeat the tree walkers. * * To make life more interesting, some filesystems don't keep * track of link counts in the expected way, and return a * link count of `1' for those directories; if either of the * component directories returns a link count of `1', we return a 1. */ vap = ap->a_vap; vp = un->un_uppervp; if (vp != NULLVP) { error = VOP_GETATTR(vp, vap, ap->a_cred); if (error) return (error); mutex_enter(&un->un_lock); union_newsize(ap->a_vp, vap->va_size, VNOVAL); } if (vp == NULLVP) { vp = un->un_lowervp; } else if (vp->v_type == VDIR) { vp = un->un_lowervp; if (vp != NULLVP) vap = &va; } else { vp = NULLVP; } if (vp != NULLVP) { if (vp == un->un_lowervp) vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, vap, ap->a_cred); if (vp == un->un_lowervp) VOP_UNLOCK(vp); if (error) return (error); mutex_enter(&un->un_lock); union_newsize(ap->a_vp, VNOVAL, vap->va_size); } if ((vap != ap->a_vap) && (vap->va_type == VDIR)) { /* * Link count manipulation: * - If both return "2", return 2 (no subdirs) * - If one or the other return "1", return "1" (ENOCLUE) */ if ((ap->a_vap->va_nlink == 2) && (vap->va_nlink == 2)) ; else if (ap->a_vap->va_nlink != 1) { if (vap->va_nlink == 1) ap->a_vap->va_nlink = 1; else ap->a_vap->va_nlink += vap->va_nlink; } } ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0]; return (0); } int union_setattr(void *v) { struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct union_node *un = VTOUNION(vp); bool size_only; /* All but va_size are VNOVAL. 
*/ int error; size_only = (vap->va_flags == VNOVAL && vap->va_uid == (uid_t)VNOVAL && vap->va_gid == (gid_t)VNOVAL && vap->va_atime.tv_sec == VNOVAL && vap->va_mtime.tv_sec == VNOVAL && vap->va_mode == (mode_t)VNOVAL); if (!size_only && (vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VCHR: case VBLK: case VSOCK: case VFIFO: break; case VREG: case VLNK: default: /* * Disallow write attempts if the filesystem is * mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); } } /* * Handle case of truncating lower object to zero size, * by creating a zero length upper object. This is to * handle the case of open with O_TRUNC and O_CREAT. */ if ((un->un_uppervp == NULLVP) && /* assert(un->un_lowervp != NULLVP) */ (un->un_lowervp->v_type == VREG)) { error = union_copyup(un, (vap->va_size != 0), ap->a_cred, curlwp); if (error) return (error); } /* * Try to set attributes in upper layer, ignore size change to zero * for devices to handle O_TRUNC and return read-only filesystem error * otherwise. */ if (un->un_uppervp != NULLVP) { error = VOP_SETATTR(un->un_uppervp, vap, ap->a_cred); if ((error == 0) && (vap->va_size != VNOVAL)) { mutex_enter(&un->un_lock); union_newsize(ap->a_vp, vap->va_size, VNOVAL); } } else { KASSERT(un->un_lowervp != NULLVP); if (NODE_IS_SPECIAL(un->un_lowervp)) { if (size_only && (vap->va_size == 0 || vap->va_size == VNOVAL)) error = 0; else error = EROFS; } else { error = EROFS; } } return (error); } int union_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; int error; struct vnode *vp = OTHERVP(ap->a_vp); int dolock = (vp == LOWERVP(ap->a_vp)); if (dolock) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(vp, ap->a_uio, ap->a_ioflag, ap->a_cred); if (dolock) VOP_UNLOCK(vp); /* * XXX * perhaps the size of the underlying object has changed under * our feet. take advantage of the offset information present * in the uio structure. */ if (error == 0) { struct union_node *un = VTOUNION(ap->a_vp); off_t cur = ap->a_uio->uio_offset; off_t usz = VNOVAL, lsz = VNOVAL; mutex_enter(&un->un_lock); if (vp == un->un_uppervp) { if (cur > un->un_uppersz) usz = cur; } else { if (cur > un->un_lowersz) lsz = cur; } if (usz != VNOVAL || lsz != VNOVAL) union_newsize(ap->a_vp, usz, lsz); else mutex_exit(&un->un_lock); } return (error); } int union_write(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; int error; struct vnode *vp; struct union_node *un = VTOUNION(ap->a_vp); vp = UPPERVP(ap->a_vp); if (vp == NULLVP) { vp = LOWERVP(ap->a_vp); if (NODE_IS_SPECIAL(vp)) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_WRITE(vp, ap->a_uio, ap->a_ioflag, ap->a_cred); VOP_UNLOCK(vp); return error; } panic("union: missing upper layer in write"); } error = VOP_WRITE(vp, ap->a_uio, ap->a_ioflag, ap->a_cred); /* * the size of the underlying object may be changed by the * write. 
*/ if (error == 0) { off_t cur = ap->a_uio->uio_offset; mutex_enter(&un->un_lock); if (cur > un->un_uppersz) union_newsize(ap->a_vp, cur, VNOVAL); else mutex_exit(&un->un_lock); } return (error); } int union_ioctl(void *v) { struct vop_ioctl_args /* { struct vnode *a_vp; int a_command; void *a_data; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *ovp = OTHERVP(ap->a_vp); ap->a_vp = ovp; return (VCALL(ovp, VOFFSET(vop_ioctl), ap)); } int union_poll(void *v) { struct vop_poll_args /* { struct vnode *a_vp; int a_events; } */ *ap = v; struct vnode *ovp = OTHERVP(ap->a_vp); ap->a_vp = ovp; return (VCALL(ovp, VOFFSET(vop_poll), ap)); } int union_revoke(void *v) { struct vop_revoke_args /* { struct vnode *a_vp; int a_flags; struct proc *a_p; } */ *ap = v; struct vnode *vp = ap->a_vp; if (UPPERVP(vp)) VOP_REVOKE(UPPERVP(vp), ap->a_flags); if (LOWERVP(vp)) VOP_REVOKE(LOWERVP(vp), ap->a_flags); vgone(vp); /* XXXAD?? */ return (0); } int union_mmap(void *v) { struct vop_mmap_args /* { struct vnode *a_vp; vm_prot_t a_prot; kauth_cred_t a_cred; } */ *ap = v; struct vnode *ovp = OTHERVP(ap->a_vp); ap->a_vp = ovp; return (VCALL(ovp, VOFFSET(vop_mmap), ap)); } int union_fsync(void *v) { struct vop_fsync_args /* { struct vnode *a_vp; kauth_cred_t a_cred; int a_flags; off_t offhi; off_t offlo; } */ *ap = v; int error = 0; struct vnode *targetvp; /* * If vinvalbuf is calling us, it's a "shallow fsync" -- don't * bother syncing the underlying vnodes, since (a) they'll be * fsync'ed when reclaimed and (b) we could deadlock if * they're locked; otherwise, pass it through to the * underlying layer. */ if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR) { error = spec_fsync(v); if (error) return error; } if (ap->a_flags & FSYNC_RECLAIM) return 0; targetvp = OTHERVP(ap->a_vp); if (targetvp != NULLVP) { int dolock = (targetvp == LOWERVP(ap->a_vp)); if (dolock) vn_lock(targetvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(targetvp, ap->a_cred, ap->a_flags, ap->a_offlo, ap->a_offhi); if (dolock) VOP_UNLOCK(targetvp); } return (error); } int union_seek(void *v) { struct vop_seek_args /* { struct vnode *a_vp; off_t a_oldoff; off_t a_newoff; kauth_cred_t a_cred; } */ *ap = v; struct vnode *ovp = OTHERVP(ap->a_vp); ap->a_vp = ovp; return (VCALL(ovp, VOFFSET(vop_seek), ap)); } int union_remove(void *v) { struct vop_remove_v3_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; nlink_t ctx_vp_new_nlink; } */ *ap = v; int error; struct union_node *dun = VTOUNION(ap->a_dvp); struct union_node *un = VTOUNION(ap->a_vp); struct componentname *cnp = ap->a_cnp; if (dun->un_uppervp == NULLVP) panic("union remove: null upper vnode"); if (un->un_uppervp != NULLVP) { struct vnode *dvp = dun->un_uppervp; struct vnode *vp = un->un_uppervp; /* Account for VOP_REMOVE to vrele vp. 
*/ vref(vp); if (union_dowhiteout(un, cnp->cn_cred)) cnp->cn_flags |= DOWHITEOUT; error = VOP_REMOVE(dvp, vp, cnp); if (!error) union_removed_upper(un); vrele(ap->a_vp); } else { error = union_mkwhiteout( MOUNTTOUNIONMOUNT(UNIONTOV(dun)->v_mount), dun->un_uppervp, ap->a_cnp, un); vput(ap->a_vp); } return (error); } int union_link(void *v) { struct vop_link_v2_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; int error = 0; struct componentname *cnp = ap->a_cnp; struct union_node *dun; struct vnode *vp; struct vnode *dvp; dun = VTOUNION(ap->a_dvp); KASSERT((ap->a_cnp->cn_flags & LOCKPARENT) != 0); if (ap->a_dvp->v_op != ap->a_vp->v_op) { vp = ap->a_vp; } else { struct union_node *un = VTOUNION(ap->a_vp); if (un->un_uppervp == NULLVP) { const bool droplock = (dun->un_uppervp == un->un_dirvp); /* * Needs to be copied before we can link it. */ vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); if (droplock) VOP_UNLOCK(dun->un_uppervp); error = union_copyup(un, 1, cnp->cn_cred, curlwp); if (droplock) { vn_lock(dun->un_uppervp, LK_EXCLUSIVE | LK_RETRY); /* * During copyup, we dropped the lock on the * dir and invalidated any saved namei lookup * state for the directory we'll be entering * the link in. We need to re-run the lookup * in that directory to reset any state needed * for VOP_LINK. * Call relookup on the union-layer to reset * the state. */ vp = NULLVP; if (dun->un_uppervp == NULLVP) panic("union: null upperdvp?"); error = relookup(ap->a_dvp, &vp, ap->a_cnp, 0); if (error) { VOP_UNLOCK(ap->a_vp); return EROFS; /* ? */ } if (vp != NULLVP) { /* * The name we want to create has * mysteriously appeared (a race?) */ error = EEXIST; VOP_UNLOCK(ap->a_vp); vput(vp); return (error); } } VOP_UNLOCK(ap->a_vp); } vp = un->un_uppervp; } dvp = dun->un_uppervp; if (dvp == NULLVP) error = EROFS; if (error) return (error); return VOP_LINK(dvp, vp, cnp); } int union_rename(void *v) { struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap = v; int error; struct vnode *fdvp = ap->a_fdvp; struct vnode *fvp = ap->a_fvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *tvp = ap->a_tvp; /* * Account for VOP_RENAME to vrele all nodes. * Note: VOP_RENAME will unlock tdvp. */ if (fdvp->v_op == union_vnodeop_p) { /* always true */ struct union_node *un = VTOUNION(fdvp); if (un->un_uppervp == NULLVP) { /* * this should never happen in normal * operation but might if there was * a problem creating the top-level shadow * directory. */ error = EXDEV; goto bad; } fdvp = un->un_uppervp; vref(fdvp); } if (fvp->v_op == union_vnodeop_p) { /* always true */ struct union_node *un = VTOUNION(fvp); if (un->un_uppervp == NULLVP) { /* XXX: should do a copyup */ error = EXDEV; goto bad; } if (un->un_lowervp != NULLVP) ap->a_fcnp->cn_flags |= DOWHITEOUT; fvp = un->un_uppervp; vref(fvp); } if (tdvp->v_op == union_vnodeop_p) { struct union_node *un = VTOUNION(tdvp); if (un->un_uppervp == NULLVP) { /* * this should never happen in normal * operation but might if there was * a problem creating the top-level shadow * directory. 
*/ error = EXDEV; goto bad; } tdvp = un->un_uppervp; vref(tdvp); } if (tvp != NULLVP && tvp->v_op == union_vnodeop_p) { struct union_node *un = VTOUNION(tvp); tvp = un->un_uppervp; if (tvp != NULLVP) { vref(tvp); } } error = VOP_RENAME(fdvp, fvp, ap->a_fcnp, tdvp, tvp, ap->a_tcnp); goto out; bad: vput(tdvp); if (tvp != NULLVP) vput(tvp); vrele(fdvp); vrele(fvp); out: if (fdvp != ap->a_fdvp) { vrele(ap->a_fdvp); } if (fvp != ap->a_fvp) { vrele(ap->a_fvp); } if (tdvp != ap->a_tdvp) { vrele(ap->a_tdvp); } if (tvp != ap->a_tvp) { vrele(ap->a_tvp); } return (error); } int union_mkdir(void *v) { struct vop_mkdir_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; struct union_node *un = VTOUNION(ap->a_dvp); struct vnode *dvp = un->un_uppervp; struct componentname *cnp = ap->a_cnp; if (dvp != NULLVP) { int error; struct vnode *vp; vp = NULL; error = VOP_MKDIR(dvp, &vp, cnp, ap->a_vap); if (error) { vrele(ap->a_dvp); return (error); } error = union_allocvp(ap->a_vpp, ap->a_dvp->v_mount, ap->a_dvp, NULLVP, cnp, vp, NULLVP, 1); if (error) vrele(vp); return (error); } return (EROFS); } int union_rmdir(void *v) { struct vop_rmdir_v2_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; int error; struct union_node *dun = VTOUNION(ap->a_dvp); struct union_node *un = VTOUNION(ap->a_vp); struct componentname *cnp = ap->a_cnp; if (dun->un_uppervp == NULLVP) panic("union rmdir: null upper vnode"); error = union_check_rmdir(un, cnp->cn_cred); if (error) { vput(ap->a_vp); return error; } if (un->un_uppervp != NULLVP) { struct vnode *dvp = dun->un_uppervp; struct vnode *vp = un->un_uppervp; /* Account for VOP_RMDIR to vrele vp. */ vref(vp); if (union_dowhiteout(un, cnp->cn_cred)) cnp->cn_flags |= DOWHITEOUT; error = VOP_RMDIR(dvp, vp, ap->a_cnp); if (!error) union_removed_upper(un); vrele(ap->a_vp); } else { error = union_mkwhiteout( MOUNTTOUNIONMOUNT(UNIONTOV(dun)->v_mount), dun->un_uppervp, ap->a_cnp, un); vput(ap->a_vp); } return (error); } int union_symlink(void *v) { struct vop_symlink_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap = v; struct union_node *un = VTOUNION(ap->a_dvp); struct vnode *dvp = un->un_uppervp; struct componentname *cnp = ap->a_cnp; if (dvp != NULLVP) { int error; error = VOP_SYMLINK(dvp, ap->a_vpp, cnp, ap->a_vap, ap->a_target); return (error); } return (EROFS); } /* * union_readdir works in concert with getdirentries and * readdir(3) to provide a list of entries in the unioned * directories. getdirentries is responsible for walking * down the union stack. readdir(3) is responsible for * eliminating duplicate names from the returned data stream. 
*/ int union_readdir(void *v) { struct vop_readdir_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; int *a_eofflag; u_long *a_cookies; int a_ncookies; } */ *ap = v; struct union_node *un = VTOUNION(ap->a_vp); struct vnode *vp; int dolock, error; if (un->un_hooknode) { KASSERT(un->un_uppervp == NULLVP); KASSERT(un->un_lowervp != NULLVP); vp = un->un_lowervp; dolock = 1; } else { vp = un->un_uppervp; dolock = 0; } if (vp == NULLVP) return 0; if (dolock) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); ap->a_vp = vp; error = VCALL(vp, VOFFSET(vop_readdir), ap); if (dolock) VOP_UNLOCK(vp); return error; } int union_readlink(void *v) { struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; } */ *ap = v; int error; struct vnode *vp = OTHERVP(ap->a_vp); int dolock = (vp == LOWERVP(ap->a_vp)); if (dolock) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); ap->a_vp = vp; error = VCALL(vp, VOFFSET(vop_readlink), ap); if (dolock) VOP_UNLOCK(vp); return (error); } int union_abortop(void *v) { struct vop_abortop_args /* { struct vnode *a_dvp; struct componentname *a_cnp; } */ *ap = v; KASSERT(UPPERVP(ap->a_dvp) != NULL); ap->a_dvp = UPPERVP(ap->a_dvp); return VCALL(ap->a_dvp, VOFFSET(vop_abortop), ap); } int union_inactive(void *v) { struct vop_inactive_v2_args /* { const struct vnodeop_desc *a_desc; struct vnode *a_vp; bool *a_recycle; } */ *ap = v; struct vnode *vp = ap->a_vp; struct union_node *un = VTOUNION(vp); struct vnode **vpp; /* * Do nothing (and _don't_ bypass). * Wait to vrele lowervp until reclaim, * so that until then our union_node is in the * cache and reusable. * * NEEDSWORK: Someday, consider inactive'ing * the lowervp and then trying to reactivate it * with capabilities (v_id) * like they do in the name lookup cache code. * That's too much work for now. 
*/ if (un->un_dircache != 0) { for (vpp = un->un_dircache; *vpp != NULLVP; vpp++) vrele(*vpp); free(un->un_dircache, M_TEMP); un->un_dircache = 0; } *ap->a_recycle = ((un->un_cflags & UN_CACHED) == 0); return (0); } int union_reclaim(void *v) { struct vop_reclaim_v2_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; struct vnode *uvp = UPPERVP(vp); VOP_UNLOCK(vp); if (uvp != NULL) { mutex_enter(uvp->v_interlock); KASSERT(vp->v_interlock == uvp->v_interlock); uvp->v_writecount -= vp->v_writecount; mutex_exit(uvp->v_interlock); } union_freevp(vp); return (0); } static int union_lock1(struct vnode *vp, struct vnode *lockvp, int flags) { struct vop_lock_args ap; ap.a_desc = VDESC(vop_lock); ap.a_vp = lockvp; ap.a_flags = flags; if (lockvp == vp) return genfs_lock(&ap); else return VCALL(ap.a_vp, VOFFSET(vop_lock), &ap); } static int union_unlock1(struct vnode *vp, struct vnode *lockvp) { struct vop_unlock_args ap; ap.a_desc = VDESC(vop_unlock); ap.a_vp = lockvp; if (lockvp == vp) return genfs_unlock(&ap); else return VCALL(ap.a_vp, VOFFSET(vop_unlock), &ap); } int union_lock(void *v) { struct vop_lock_args /* { struct vnode *a_vp; int a_flags; } */ *ap = v; struct vnode *vp = ap->a_vp, *lockvp; struct union_node *un = VTOUNION(vp); int flags = ap->a_flags; int error; if ((flags & LK_NOWAIT) != 0) { if (!mutex_tryenter(&un->un_lock)) return EBUSY; lockvp = LOCKVP(vp); error = union_lock1(vp, lockvp, flags); mutex_exit(&un->un_lock); return error; } mutex_enter(&un->un_lock); for (;;) { lockvp = LOCKVP(vp); mutex_exit(&un->un_lock); error = union_lock1(vp, lockvp, flags); if (error != 0 || (flags & (LK_DOWNGRADE | LK_UPGRADE)) != 0) return error; mutex_enter(&un->un_lock); if (lockvp == LOCKVP(vp)) break; union_unlock1(vp, lockvp); } mutex_exit(&un->un_lock); return error; } int union_unlock(void *v) { struct vop_unlock_args /* { struct vnode *a_vp; int a_flags; } */ *ap = v; struct vnode *vp = ap->a_vp, *lockvp; lockvp = LOCKVP(vp); union_unlock1(vp, lockvp); return 0; } int union_bmap(void *v) { struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; } */ *ap = v; int error; struct vnode *vp = OTHERVP(ap->a_vp); int dolock = (vp == LOWERVP(ap->a_vp)); if (dolock) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); ap->a_vp = vp; error = VCALL(vp, VOFFSET(vop_bmap), ap); if (dolock) VOP_UNLOCK(vp); return (error); } int union_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; printf("\ttag VT_UNION, vp=%p, uppervp=%p, lowervp=%p\n", vp, UPPERVP(vp), LOWERVP(vp)); if (UPPERVP(vp) != NULLVP) vprint("union: upper", UPPERVP(vp)); if (LOWERVP(vp) != NULLVP) vprint("union: lower", LOWERVP(vp)); if (VTOUNION(vp)->un_dircache) { struct vnode **vpp; for (vpp = VTOUNION(vp)->un_dircache; *vpp != NULLVP; vpp++) vprint("dircache:", *vpp); } return (0); } int union_islocked(void *v) { struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp; struct union_node *un; un = VTOUNION(ap->a_vp); mutex_enter(&un->un_lock); vp = LOCKVP(ap->a_vp); mutex_exit(&un->un_lock); if (vp == ap->a_vp) return genfs_islocked(ap); else return VOP_ISLOCKED(vp); } int union_pathconf(void *v) { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap = v; int error; struct vnode *vp = OTHERVP(ap->a_vp); int dolock = (vp == LOWERVP(ap->a_vp)); if (dolock) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); ap->a_vp = vp; error = VCALL(vp, VOFFSET(vop_pathconf), ap); if (dolock) 
VOP_UNLOCK(vp); return (error); } int union_advlock(void *v) { struct vop_advlock_args /* { struct vnode *a_vp; void *a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap = v; struct vnode *ovp = OTHERVP(ap->a_vp); ap->a_vp = ovp; return (VCALL(ovp, VOFFSET(vop_advlock), ap)); } int union_strategy(void *v) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap = v; struct vnode *ovp = OTHERVP(ap->a_vp); struct buf *bp = ap->a_bp; KASSERT(ovp != NULLVP); if (!NODE_IS_SPECIAL(ovp)) KASSERT((bp->b_flags & B_READ) || ovp != LOWERVP(bp->b_vp)); return (VOP_STRATEGY(ovp, bp)); } int union_bwrite(void *v) { struct vop_bwrite_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap = v; struct vnode *ovp = OTHERVP(ap->a_vp); struct buf *bp = ap->a_bp; KASSERT(ovp != NULLVP); if (!NODE_IS_SPECIAL(ovp)) KASSERT((bp->b_flags & B_READ) || ovp != LOWERVP(bp->b_vp)); return (VOP_BWRITE(ovp, bp)); } int union_getpages(void *v) { struct vop_getpages_args /* { struct vnode *a_vp; voff_t a_offset; struct vm_page **a_m; int *a_count; int a_centeridx; vm_prot_t a_access_type; int a_advice; int a_flags; } */ *ap = v; struct vnode *vp = ap->a_vp; KASSERT(rw_lock_held(vp->v_uobj.vmobjlock)); if (ap->a_flags & PGO_LOCKED) { return EBUSY; } ap->a_vp = OTHERVP(vp); KASSERT(vp->v_uobj.vmobjlock == ap->a_vp->v_uobj.vmobjlock); /* Just pass the request on to the underlying layer. */ return VCALL(ap->a_vp, VOFFSET(vop_getpages), ap); } int union_putpages(void *v) { struct vop_putpages_args /* { struct vnode *a_vp; voff_t a_offlo; voff_t a_offhi; int a_flags; } */ *ap = v; struct vnode *vp = ap->a_vp; KASSERT(rw_lock_held(vp->v_uobj.vmobjlock)); ap->a_vp = OTHERVP(vp); KASSERT(vp->v_uobj.vmobjlock == ap->a_vp->v_uobj.vmobjlock); if (ap->a_flags & PGO_RECLAIM) { rw_exit(vp->v_uobj.vmobjlock); return 0; } /* Just pass the request on to the underlying layer. */ return VCALL(ap->a_vp, VOFFSET(vop_putpages), ap); } int union_kqfilter(void *v) { struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap = v; int error; /* * We watch either the upper layer file (if it already exists), * or the lower layer one. If there is lower layer file only * at this moment, we will keep watching that lower layer file * even if upper layer file would be created later on. */ if (UPPERVP(ap->a_vp)) error = VOP_KQFILTER(UPPERVP(ap->a_vp), ap->a_kn); else if (LOWERVP(ap->a_vp)) error = VOP_KQFILTER(LOWERVP(ap->a_vp), ap->a_kn); else { /* panic? */ error = EOPNOTSUPP; } return (error); }
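/*
 * Illustrative sketch only (not part of the original vnops above): most of
 * the simple pass-through operations in this file share one shape -- pick
 * the upper vnode if it exists, otherwise the lower one, hold the lower
 * vnode's lock for the duration of the call, and forward the request.  The
 * helper name and argument list below are hypothetical and exist purely to
 * restate that pattern in one place; only OTHERVP()/LOWERVP(), vn_lock(),
 * VOP_UNLOCK() and the forwarded VOP itself are real interfaces.
 */
#if 0
static int
union_forward_example(struct vnode *unvp, struct uio *uio, kauth_cred_t cred)
{
	struct vnode *vp = OTHERVP(unvp);	/* upper layer if present, else lower */
	int dolock = (vp == LOWERVP(unvp));	/* the lower vnode is not kept locked */
	int error;

	if (dolock)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_READLINK(vp, uio, cred);	/* any simple forwarded operation */
	if (dolock)
		VOP_UNLOCK(vp);
	return error;
}
#endif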
/* $NetBSD: bufq_fcfs.c,v 1.13 2017/05/04 11:03:27 kamil Exp $ */ /* NetBSD: subr_disk.c,v 1.61 2004/09/25 03:30:44 thorpej Exp */ /*- * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: bufq_fcfs.c,v 1.13 2017/05/04 11:03:27 kamil Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/bufq_impl.h> #include <sys/kmem.h> #include <sys/module.h> /* * First-come first-served sort for disks. * * Requests are appended to the queue without any reordering. */ struct bufq_fcfs { TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */ }; static void bufq_fcfs_init(struct bufq_state *); static void bufq_fcfs_put(struct bufq_state *, struct buf *); static struct buf *bufq_fcfs_get(struct bufq_state *, int); BUFQ_DEFINE(fcfs, 10, bufq_fcfs_init); static void bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp) { struct bufq_fcfs *fcfs = bufq_private(bufq); TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq); } static struct buf * bufq_fcfs_get(struct bufq_state *bufq, int remove) { struct bufq_fcfs *fcfs = bufq_private(bufq); struct buf *bp; bp = TAILQ_FIRST(&fcfs->bq_head); if (bp != NULL && remove) TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq); return (bp); } static struct buf * bufq_fcfs_cancel(struct bufq_state *bufq, struct buf *buf) { struct bufq_fcfs *fcfs = bufq_private(bufq); struct buf *bp; TAILQ_FOREACH(bp, &fcfs->bq_head, b_actq) { if (bp == buf) { TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq); return buf; } } return NULL; } static void bufq_fcfs_fini(struct bufq_state *bufq) { KASSERT(bufq->bq_private != NULL); kmem_free(bufq->bq_private, sizeof(struct bufq_fcfs)); } static void bufq_fcfs_init(struct bufq_state *bufq) { struct bufq_fcfs *fcfs; bufq->bq_get = bufq_fcfs_get; bufq->bq_put = bufq_fcfs_put; bufq->bq_cancel = bufq_fcfs_cancel; bufq->bq_fini = bufq_fcfs_fini; bufq->bq_private = kmem_zalloc(sizeof(struct bufq_fcfs), KM_SLEEP); fcfs = (struct bufq_fcfs *)bufq->bq_private; TAILQ_INIT(&fcfs->bq_head); } MODULE(MODULE_CLASS_BUFQ, bufq_fcfs, NULL); static int bufq_fcfs_modcmd(modcmd_t cmd, void *opaque) { switch (cmd) { case MODULE_CMD_INIT: return bufq_register(&bufq_strat_fcfs); case MODULE_CMD_FINI: return bufq_unregister(&bufq_strat_fcfs); default: return ENOTTY; } }
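/*
 * Illustrative sketch only (not part of the original file): how a disk
 * driver would typically put the fcfs strategy to use through the generic
 * interface in subr_bufq.c.  example_softc and the example_* functions are
 * hypothetical names; bufq_alloc(), bufq_put(), bufq_get(), bufq_drain()
 * and bufq_free(), together with the BUFQ_* flags, are the real interfaces.
 */
#if 0
struct example_softc {
	struct bufq_state *sc_bufq;	/* per-device request queue */
};

static int
example_attach_queue(struct example_softc *sc)
{
	/* Ask for fcfs by name; without BUFQ_EXACT another strategy may be substituted. */
	return bufq_alloc(&sc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
}

static void
example_start(struct example_softc *sc, struct buf *bp)
{
	struct buf *nbp;

	bufq_put(sc->sc_bufq, bp);		/* fcfs appends, never reorders */
	while ((nbp = bufq_get(sc->sc_bufq)) != NULL) {
		/* hand nbp to the hardware here */
	}
}

static void
example_detach_queue(struct example_softc *sc)
{
	bufq_drain(sc->sc_bufq);		/* fail any pending requests with EIO */
	bufq_free(sc->sc_bufq);			/* queue must be empty at this point */
}
#endif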
/* $NetBSD: subr_bufq.c,v 1.27 2019/02/17 23:17:41 bad Exp $ */ /* NetBSD: subr_disk.c,v 1.70 2005/08/20 12:00:01 yamt Exp $ */ /*- * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc.
and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_bufq.c,v 1.27 2019/02/17 23:17:41 bad Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/bufq.h> #include <sys/bufq_impl.h> #include <sys/kmem.h> #include <sys/sysctl.h> #include <sys/module.h> #define STRAT_MATCH(id, bs) (strcmp((id), (bs)->bs_name) == 0) static void sysctl_kern_bufq_strategies_setup(struct sysctllog **); static SLIST_HEAD(, bufq_strat) bufq_strat_list = SLIST_HEAD_INITIALIZER(bufq_strat_list); static kmutex_t bufq_mutex; static struct sysctllog *sysctllog; void bufq_init(void) { mutex_init(&bufq_mutex, MUTEX_DEFAULT, IPL_NONE); sysctl_kern_bufq_strategies_setup(&sysctllog); } int bufq_register(struct bufq_strat *bs) { mutex_enter(&bufq_mutex); SLIST_INSERT_HEAD(&bufq_strat_list, bs, bs_next); bs->bs_refcnt = 0; mutex_exit(&bufq_mutex); return 0; } int bufq_unregister(struct bufq_strat *bs) { mutex_enter(&bufq_mutex); if (bs->bs_refcnt != 0) { mutex_exit(&bufq_mutex); return EBUSY; } SLIST_REMOVE(&bufq_strat_list, bs, bufq_strat, bs_next); mutex_exit(&bufq_mutex); return 0; } /* * Create a device buffer queue. */ int bufq_alloc(struct bufq_state **bufqp, const char *strategy, int flags) { struct bufq_strat *bsp, *it; struct bufq_state *bufq; int error = 0; u_int gen; bool found_exact; char strategy_module_name[MAXPATHLEN]; KASSERT((flags & BUFQ_EXACT) == 0 || strategy != BUFQ_STRAT_ANY); switch (flags & BUFQ_SORT_MASK) { case BUFQ_SORT_RAWBLOCK: case BUFQ_SORT_CYLINDER: break; case 0: /* * for strategies which don't care about block numbers. * eg. fcfs */ flags |= BUFQ_SORT_RAWBLOCK; break; default: panic("bufq_alloc: sort out of range"); } /* * select strategy. * if a strategy specified by flags is found, use it. * otherwise, select one with the largest bs_prio. 
*/ mutex_enter(&bufq_mutex); do { gen = module_gen; bsp = NULL; found_exact = false; SLIST_FOREACH(it, &bufq_strat_list, bs_next) { if (strategy != BUFQ_STRAT_ANY && STRAT_MATCH(strategy, (it))) { bsp = it; found_exact = true; break; } if (bsp == NULL || (it)->bs_prio > bsp->bs_prio) bsp = it; } if (strategy == BUFQ_STRAT_ANY || found_exact) break; /* Try to autoload the bufq strategy module */ strlcpy(strategy_module_name, "bufq_", sizeof(strategy_module_name)); strlcat(strategy_module_name, strategy, sizeof(strategy_module_name)); mutex_exit(&bufq_mutex); (void) module_autoload(strategy_module_name, MODULE_CLASS_BUFQ); mutex_enter(&bufq_mutex); } while (gen != module_gen); if (bsp == NULL) { panic("bufq_alloc: no strategy"); } if (strategy != BUFQ_STRAT_ANY && !found_exact) { if ((flags & BUFQ_EXACT)) { error = ENOENT; mutex_exit(&bufq_mutex); goto out; } #if defined(DEBUG) printf("bufq_alloc: '%s' is not available. using '%s'.\n", strategy, bsp->bs_name); #endif } #if defined(BUFQ_DEBUG) /* XXX aprint? */ printf("bufq_alloc: using '%s'\n", bsp->bs_name); #endif bsp->bs_refcnt++; mutex_exit(&bufq_mutex); *bufqp = bufq = kmem_zalloc(sizeof(*bufq), KM_SLEEP); bufq->bq_flags = flags; bufq->bq_strat = bsp; (*bsp->bs_initfn)(bufq); out: return error; } void bufq_put(struct bufq_state *bufq, struct buf *bp) { (*bufq->bq_put)(bufq, bp); } struct buf * bufq_get(struct bufq_state *bufq) { return (*bufq->bq_get)(bufq, 1); } struct buf * bufq_peek(struct bufq_state *bufq) { return (*bufq->bq_get)(bufq, 0); } struct buf * bufq_cancel(struct bufq_state *bufq, struct buf *bp) { return (*bufq->bq_cancel)(bufq, bp); } /* * Drain a device buffer queue. */ void bufq_drain(struct bufq_state *bufq) { struct buf *bp; while ((bp = bufq_get(bufq)) != NULL) { bp->b_error = EIO; bp->b_resid = bp->b_bcount; biodone(bp); } } /* * Destroy a device buffer queue. */ void bufq_free(struct bufq_state *bufq) { KASSERT(bufq_peek(bufq) == NULL); bufq->bq_fini(bufq); mutex_enter(&bufq_mutex); bufq->bq_strat->bs_refcnt--; mutex_exit(&bufq_mutex); kmem_free(bufq, sizeof(*bufq)); } /* * get a strategy identifier of a buffer queue. */ const char * bufq_getstrategyname(struct bufq_state *bufq) { return bufq->bq_strat->bs_name; } /* * move all requests on a buffer queue to another. */ void bufq_move(struct bufq_state *dst, struct bufq_state *src) { struct buf *bp; while ((bp = bufq_get(src)) != NULL) { bufq_put(dst, bp); } } static int docopy(char *buf, size_t *bufoffp, size_t buflen, const char *datap, size_t datalen) { int error = 0; if (buf != NULL && datalen > 0) { if (*bufoffp + datalen > buflen) { goto out; } error = copyout(datap, buf + *bufoffp, datalen); if (error) { goto out; } } out: if (error == 0) { *bufoffp += datalen; } return error; } static int docopystr(char *buf, size_t *bufoffp, size_t buflen, const char *datap) { return docopy(buf, bufoffp, buflen, datap, strlen(datap)); } static int docopynul(char *buf, size_t *bufoffp, size_t buflen) { return docopy(buf, bufoffp, buflen, "", 1); } /* * sysctl function that will print all bufq strategies * currently available to the kernel. 
*/ static int sysctl_kern_bufq_strategies(SYSCTLFN_ARGS) { const struct bufq_strat *bq_strat; const char *delim = ""; size_t off = 0; size_t buflen = *oldlenp; int error; SLIST_FOREACH(bq_strat, &bufq_strat_list, bs_next) { error = docopystr(oldp, &off, buflen, delim); if (error) { goto out; } error = docopystr(oldp, &off, buflen, (bq_strat)->bs_name); if (error) { goto out; } delim = " "; } /* In case there are no registered strategies ... */ if (off == 0) { error = docopystr(oldp, &off, buflen, "NULL"); if (error) { goto out; } } /* NUL terminate */ error = docopynul(oldp, &off, buflen); out: *oldlenp = off; return error; } static void sysctl_kern_bufq_strategies_setup(struct sysctllog **clog) { const struct sysctlnode *node; node = NULL; sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "bufq", SYSCTL_DESCR("buffer queue subtree"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); if (node != NULL) { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "strategies", SYSCTL_DESCR("List of bufq strategies present"), sysctl_kern_bufq_strategies, 0, NULL, 0, CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); } }
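/*
 * Illustrative sketch only (not part of the original file): the minimal
 * shape of a strategy module that plugs into the registration machinery
 * above, modelled directly on bufq_fcfs.c.  "lifo" is a made-up strategy
 * name used purely for illustration, and bufq_strat_lifo is the symbol
 * BUFQ_DEFINE() is assumed to generate for it, following the fcfs module.
 * A production strategy would also provide a bq_cancel hook, as
 * bufq_fcfs.c does.
 */
#if 0
struct bufq_lifo {
	TAILQ_HEAD(, buf) bq_head;	/* actual list of buffers */
};

static void bufq_lifo_init(struct bufq_state *);

BUFQ_DEFINE(lifo, 5, bufq_lifo_init);

static void
bufq_lifo_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_lifo *lifo = bufq_private(bufq);

	/* Newest request first -- the only difference from fcfs. */
	TAILQ_INSERT_HEAD(&lifo->bq_head, bp, b_actq);
}

static struct buf *
bufq_lifo_get(struct bufq_state *bufq, int remove)
{
	struct bufq_lifo *lifo = bufq_private(bufq);
	struct buf *bp;

	bp = TAILQ_FIRST(&lifo->bq_head);
	if (bp != NULL && remove)
		TAILQ_REMOVE(&lifo->bq_head, bp, b_actq);
	return bp;
}

static struct buf *
bufq_lifo_cancel(struct bufq_state *bufq, struct buf *buf)
{
	struct bufq_lifo *lifo = bufq_private(bufq);
	struct buf *bp;

	TAILQ_FOREACH(bp, &lifo->bq_head, b_actq) {
		if (bp == buf) {
			TAILQ_REMOVE(&lifo->bq_head, bp, b_actq);
			return buf;
		}
	}
	return NULL;
}

static void
bufq_lifo_fini(struct bufq_state *bufq)
{
	kmem_free(bufq->bq_private, sizeof(struct bufq_lifo));
}

static void
bufq_lifo_init(struct bufq_state *bufq)
{
	struct bufq_lifo *lifo;

	bufq->bq_get = bufq_lifo_get;
	bufq->bq_put = bufq_lifo_put;
	bufq->bq_cancel = bufq_lifo_cancel;
	bufq->bq_fini = bufq_lifo_fini;
	bufq->bq_private = kmem_zalloc(sizeof(struct bufq_lifo), KM_SLEEP);
	lifo = bufq->bq_private;
	TAILQ_INIT(&lifo->bq_head);
}

MODULE(MODULE_CLASS_BUFQ, bufq_lifo, NULL);

static int
bufq_lifo_modcmd(modcmd_t cmd, void *opaque)
{
	switch (cmd) {
	case MODULE_CMD_INIT:
		return bufq_register(&bufq_strat_lifo);
	case MODULE_CMD_FINI:
		return bufq_unregister(&bufq_strat_lifo);
	default:
		return ENOTTY;
	}
}
#endif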
/* $NetBSD: lapic.c,v 1.90 2024/02/25 18:27:54 andvar Exp $ */ /*- * Copyright (c) 2000, 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by RedBack Networks Inc. * * Author: Bill Sommerfeld * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: lapic.c,v 1.90 2024/02/25 18:27:54 andvar Exp $"); #include "acpica.h" #include "ioapic.h" #include "opt_acpi.h" #include "opt_ddb.h" #include "opt_mpbios.h" /* for MPDEBUG */ #include "opt_multiprocessor.h" #include "opt_ntp.h" #include "opt_xen.h" #include <sys/param.h> #include <sys/proc.h> #include <sys/systm.h> #include <sys/device.h> #include <sys/timetc.h> #include <uvm/uvm_extern.h> #include <dev/ic/i8253reg.h> #include <x86/machdep.h> #include <machine/cpu.h> #include <machine/cpu_counter.h> #include <machine/cpufunc.h> #include <machine/cpuvar.h> #include <machine/pmap.h> #include <machine/vmparam.h> #include <machine/mpacpi.h> #include <machine/mpbiosvar.h> #include <machine/pcb.h> #include <machine/pmap_private.h> #include <machine/specialreg.h> #include <machine/segments.h> #include <x86/x86/tsc.h> #include <x86/i82093var.h> #include <machine/apicvar.h> #include <machine/i82489reg.h> #include <machine/i82489var.h> #ifndef XENPV #if NACPICA > 0 #include <dev/acpi/acpica.h> #include <dev/acpi/acpivar.h> #endif #ifdef DDB #include <machine/db_machdep.h> #ifdef MULTIPROCESSOR #ifdef __x86_64__ typedef void (vector)(void); extern vector Xintr_x2apic_ddbipi; extern int ddb_vec; #endif #endif #endif #include <dev/vmt/vmtreg.h> /* for vmt_hvcall() */ #include <dev/vmt/vmtvar.h> /* for vmt_hvcall() */ /* Referenced from vector.S */ void lapic_clockintr(void *, struct intrframe *); static void lapic_delay(unsigned int); static uint32_t lapic_gettick(void); static void lapic_setup_bsp(paddr_t); static void lapic_map(paddr_t); static void lapic_hwmask(struct pic *, int); static void lapic_hwunmask(struct pic *, int); static void lapic_setup(struct pic *, struct cpu_info *, int, int, int); /* Make it public to call via ddb */ void lapic_dump(void); struct pic local_pic = { .pic_name = "lapic", .pic_type = PIC_LAPIC, .pic_lock = __SIMPLELOCK_UNLOCKED, .pic_hwmask = lapic_hwmask, .pic_hwunmask = lapic_hwunmask, .pic_addroute = lapic_setup, .pic_delroute = lapic_setup, .pic_intr_get_devname = x86_intr_get_devname, .pic_intr_get_assigned = x86_intr_get_assigned, .pic_intr_get_count = x86_intr_get_count, }; static int i82489_ipi(int vec, int target, int dl); static int x2apic_ipi(int vec, int target, int dl); int (*x86_ipi)(int, int, int) = i82489_ipi; bool x2apic_mode __read_mostly; #ifdef LAPIC_ENABLE_X2APIC bool x2apic_enable = true; #else bool x2apic_enable = false; #endif static bool lapic_broken_periodic __read_mostly; static uint32_t i82489_readreg(u_int reg) { return *((volatile uint32_t *)(local_apic_va + reg)); } static void i82489_writereg(u_int reg, uint32_t val) { *((volatile uint32_t *)(local_apic_va + reg)) = val; } static uint32_t i82489_cpu_number(void) { return i82489_readreg(LAPIC_ID) >> LAPIC_ID_SHIFT; } static uint32_t x2apic_readreg(u_int reg) { return rdmsr(MSR_X2APIC_BASE + (reg >> 4)); } static void x2apic_writereg(u_int reg, uint32_t val) { x86_mfence(); wrmsr(MSR_X2APIC_BASE + (reg >> 4), val); } static void x2apic_writereg64(u_int reg, uint64_t val) { KDASSERT(reg == LAPIC_ICRLO); x86_mfence(); wrmsr(MSR_X2APIC_BASE + (reg >> 4), val); } static void x2apic_write_icr(uint32_t hi, uint32_t lo) { x2apic_writereg64(LAPIC_ICRLO, ((uint64_t)hi << 32) | lo); } static uint32_t x2apic_cpu_number(void) { return x2apic_readreg(LAPIC_ID); } uint32_t lapic_readreg(u_int reg) { if (x2apic_mode) return x2apic_readreg(reg); return i82489_readreg(reg); } void lapic_writereg(u_int reg, uint32_t val) { if (x2apic_mode) 
x2apic_writereg(reg, val); else i82489_writereg(reg, val); } void lapic_write_tpri(uint32_t val) { val &= LAPIC_TPRI_MASK; #ifdef i386 lapic_writereg(LAPIC_TPRI, val); #else lcr8(val >> 4); #endif } uint32_t lapic_cpu_number(void) { if (x2apic_mode) return x2apic_cpu_number(); return i82489_cpu_number(); } static void lapic_enable_x2apic(void) { uint64_t apicbase; apicbase = rdmsr(MSR_APICBASE); if (!ISSET(apicbase, APICBASE_EN)) { apicbase |= APICBASE_EN; wrmsr(MSR_APICBASE, apicbase); } apicbase |= APICBASE_EXTD; wrmsr(MSR_APICBASE, apicbase); } bool lapic_is_x2apic(void) { uint64_t msr; if (!ISSET(cpu_feature[0], CPUID_APIC) || rdmsr_safe(MSR_APICBASE, &msr) == EFAULT) return false; return (msr & (APICBASE_EN | APICBASE_EXTD)) == (APICBASE_EN | APICBASE_EXTD); } /* * Initialize the local APIC on the BSP. */ static void lapic_setup_bsp(paddr_t lapic_base) { u_int regs[6]; const char *reason = NULL; const char *hw_vendor; bool bios_x2apic; if (ISSET(cpu_feature[1], CPUID2_X2APIC)) { #if NACPICA > 0 if (acpi_present) { ACPI_TABLE_DMAR *dmar; ACPI_STATUS status; /* * Automatically detect several configurations where * x2APIC mode is known to cause troubles. User can * override the setting with hw.x2apic_enable tunable. */ status = AcpiGetTable(ACPI_SIG_DMAR, 1, (ACPI_TABLE_HEADER **)&dmar); if (ACPI_SUCCESS(status)) { if (ISSET(dmar->Flags, ACPI_DMAR_X2APIC_OPT_OUT)) { reason = "by DMAR table"; } AcpiPutTable(&dmar->Header); } } #endif /* NACPICA > 0 */ if (vm_guest == VM_GUEST_VMWARE) { vmt_hvcall(VM_CMD_GET_VCPU_INFO, regs); if (ISSET(regs[0], VCPUINFO_VCPU_RESERVED) || !ISSET(regs[0], VCPUINFO_LEGACY_X2APIC)) reason = "inside VMWare without intr " "redirection"; } else if (vm_guest == VM_GUEST_XENHVM) { reason = "due to running under XEN"; } else if (vm_guest == VM_GUEST_NO && CPUID_TO_FAMILY(curcpu()->ci_signature) == 6 && CPUID_TO_MODEL(curcpu()->ci_signature) == 0x2a) { hw_vendor = pmf_get_platform("board-vendor"); if (hw_vendor != NULL) { /* * It seems that some Lenovo and ASUS * SandyBridge-based notebook BIOSes have a bug * which prevents booting AP in x2APIC mode. * Since the only way to detect mobile CPU is * to check northbridge pci id, which cannot be * done that early, disable x2APIC for all * Lenovo and ASUS SandyBridge machines. */ if (strcmp(hw_vendor, "LENOVO") == 0 || strcmp(hw_vendor, "ASUSTeK Computer Inc.") == 0) { reason = "for a suspected SandyBridge " "BIOS bug"; } } } bios_x2apic = lapic_is_x2apic(); if (reason != NULL && bios_x2apic) { aprint_verbose("x2APIC should be disabled %s but " "already enabled by BIOS; enabling.\n", reason); reason = NULL; } if (reason == NULL) x2apic_mode = true; else aprint_verbose("x2APIC available but disabled %s\n", reason); if (x2apic_enable != x2apic_mode) { if (bios_x2apic && !x2apic_enable) aprint_verbose("x2APIC disabled by user and " "enabled by BIOS; ignoring user setting.\n"); else x2apic_mode = x2apic_enable; } } if (x2apic_mode) { x86_ipi = x2apic_ipi; #if NIOAPIC > 0 struct ioapic_softc *ioapic; for (ioapic = ioapics; ioapic != NULL; ioapic = ioapic->sc_next) { ioapic->sc_pic.pic_edge_stubs = x2apic_edge_stubs; ioapic->sc_pic.pic_level_stubs = x2apic_level_stubs; } #endif #if defined(DDB) && defined(MULTIPROCESSOR) #ifdef __x86_64__ struct idt_vec *iv = &(cpu_info_primary.ci_idtvec); idt_descriptor_t *idt = iv->iv_idt; set_idtgate(&idt[ddb_vec], &Xintr_x2apic_ddbipi, 1, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #else /* * Set DDB IPI handler in cpu_set_tss_gates() when cpu0 is * attached. 
*/ #endif #endif x86_disable_intr(); lapic_enable_x2apic(); #ifdef MULTIPROCESSOR cpu_init_first(); /* Catch up to changed cpu_number() */ #endif lapic_write_tpri(0); x86_enable_intr(); } else lapic_map(lapic_base); } static void lapic_map(paddr_t lapic_base) { pt_entry_t *pte; vaddr_t va = local_apic_va; /* * If the CPU has an APIC MSR, use it and ignore the supplied value: * some ACPI implementations have been observed to pass bad values. * Additionally, ensure that the lapic is enabled as we are committed * to using it at this point. Be conservative and assume that the MSR * is not present on the Pentium (is it?). */ if (CPUID_TO_FAMILY(curcpu()->ci_signature) >= 6) { lapic_base = (paddr_t)rdmsr(MSR_APICBASE); if ((lapic_base & APICBASE_PHYSADDR) == 0) { lapic_base |= LAPIC_BASE; } wrmsr(MSR_APICBASE, lapic_base | APICBASE_EN); lapic_base &= APICBASE_PHYSADDR; } x86_disable_intr(); /* * Map local apic. If we have a local apic, it's safe to assume * we're on a 486 or better and can use invlpg and non-cacheable PTE's * * Whap the PTE "by hand" rather than calling pmap_kenter_pa because * the latter will attempt to invoke TLB shootdown code just as we * might have changed the value of cpu_number().. */ pte = kvtopte(va); *pte = lapic_base | PTE_W | PTE_P | PTE_PCD | pmap_pg_g | pmap_pg_nx; invlpg(va); #ifdef MULTIPROCESSOR cpu_init_first(); /* Catch up to changed cpu_number() */ #endif lapic_write_tpri(0); x86_enable_intr(); } /* * enable local apic */ void lapic_enable(void) { lapic_writereg(LAPIC_SVR, LAPIC_SVR_ENABLE | LAPIC_SPURIOUS_VECTOR); } void lapic_set_lvt(void) { struct cpu_info *ci = curcpu(); int i; struct mp_intr_map *mpi; uint32_t lint0, lint1; #ifdef MULTIPROCESSOR if (mp_verbose) { apic_format_redir(device_xname(ci->ci_dev), "prelint", 0, APIC_VECTYPE_LAPIC_LVT, 0, lapic_readreg(LAPIC_LVT_LINT0)); apic_format_redir(device_xname(ci->ci_dev), "prelint", 1, APIC_VECTYPE_LAPIC_LVT, 0, lapic_readreg(LAPIC_LVT_LINT1)); } #endif /* * If an I/O APIC has been attached, assume that it is used instead of * the 8259A for interrupt delivery. Otherwise request the LAPIC to * get external interrupts via LINT0 for the primary CPU. */ lint0 = LAPIC_DLMODE_EXTINT; if (nioapics > 0 || !CPU_IS_PRIMARY(curcpu())) lint0 |= LAPIC_LVT_MASKED; lapic_writereg(LAPIC_LVT_LINT0, lint0); /* * Non Maskable Interrupts are to be delivered to the primary CPU. */ lint1 = LAPIC_DLMODE_NMI; if (!CPU_IS_PRIMARY(curcpu())) lint1 |= LAPIC_LVT_MASKED; lapic_writereg(LAPIC_LVT_LINT1, lint1); for (i = 0; i < mp_nintr; i++) { mpi = &mp_intrs[i]; if (mpi->ioapic == NULL && (mpi->cpu_id == MPS_ALL_APICS || mpi->cpu_id == ci->ci_cpuid)) { if (mpi->ioapic_pin > 1) aprint_error_dev(ci->ci_dev, "%s: WARNING: bad pin value %d\n", __func__, mpi->ioapic_pin); if (mpi->ioapic_pin == 0) lapic_writereg(LAPIC_LVT_LINT0, mpi->redir); else lapic_writereg(LAPIC_LVT_LINT1, mpi->redir); } } #ifdef MULTIPROCESSOR if (mp_verbose) lapic_dump(); #endif } /* * Initialize fixed idt vectors for use by local apic. */ void lapic_boot_init(paddr_t lapic_base) { struct idt_vec *iv = &(cpu_info_primary.ci_idtvec); lapic_setup_bsp(lapic_base); #ifdef MULTIPROCESSOR idt_vec_reserve(iv, LAPIC_IPI_VECTOR); idt_vec_set(iv, LAPIC_IPI_VECTOR, x2apic_mode ? Xintr_x2apic_ipi : Xintr_lapic_ipi); idt_vec_reserve(iv, LAPIC_TLB_VECTOR); idt_vec_set(iv, LAPIC_TLB_VECTOR, x2apic_mode ? 
Xintr_x2apic_tlb : Xintr_lapic_tlb); #endif idt_vec_reserve(iv, LAPIC_SPURIOUS_VECTOR); idt_vec_set(iv, LAPIC_SPURIOUS_VECTOR, Xintrspurious); idt_vec_reserve(iv, LAPIC_TIMER_VECTOR); idt_vec_set(iv, LAPIC_TIMER_VECTOR, x2apic_mode ? Xintr_x2apic_ltimer : Xintr_lapic_ltimer); } static uint32_t lapic_gettick(void) { return lapic_readreg(LAPIC_CCR_TIMER); } #include <sys/kernel.h> /* for hz */ uint32_t lapic_tval; /* * this gets us up to a 4GHz busclock.... */ uint32_t lapic_per_second; uint32_t lapic_frac_usec_per_cycle; uint64_t lapic_frac_cycle_per_usec; uint32_t lapic_delaytab[26]; static u_int lapic_get_timecount(struct timecounter *tc) { struct cpu_info *ci; uint32_t cur_timer; int s; s = splhigh(); ci = curcpu(); /* * Check for a race against the clockinterrupt. * The update of ci_lapic_counter is blocked by splhigh() and * the check for a pending clockinterrupt compensates for that. * * If the current tick is almost the Initial Counter, explicitly * check for the pending interrupt bit as the interrupt delivery * could be asynchronous and compensate as well. * * This can't be done without splhigh() as the calling code might * have masked the clockinterrupt already. * * This code assumes that clockinterrupts are not missed. */ cur_timer = lapic_gettick(); if (cur_timer >= lapic_tval - 1) { uint16_t reg = LAPIC_IRR + LAPIC_TIMER_VECTOR / 32 * 16; if (lapic_readreg(reg) & (1 << (LAPIC_TIMER_VECTOR % 32))) { cur_timer -= lapic_tval; } } else if (ci->ci_ipending & (1ULL << LIR_TIMER)) cur_timer = lapic_gettick() - lapic_tval; cur_timer = ci->ci_lapic_counter - cur_timer; splx(s); return cur_timer; } static struct timecounter lapic_timecounter = { .tc_get_timecount = lapic_get_timecount, .tc_counter_mask = ~0u, .tc_name = "lapic", .tc_quality = #ifndef MULTIPROCESSOR 2100, #else -100, /* per CPU state */ #endif }; extern u_int i8254_get_timecount(struct timecounter *); void lapic_clockintr(void *arg, struct intrframe *frame) { struct cpu_info *ci = curcpu(); ci->ci_lapic_counter += lapic_tval; ci->ci_isources[LIR_TIMER]->is_evcnt.ev_count++; hardclock((struct clockframe *)frame); } void lapic_reset(void) { /* * Mask the clock interrupt and set mode, * then set divisor, * then unmask and set the vector. */ lapic_writereg(LAPIC_LVT_TIMER, LAPIC_LVT_TMM_PERIODIC | LAPIC_LVT_MASKED); lapic_writereg(LAPIC_DCR_TIMER, LAPIC_DCRT_DIV1); lapic_writereg(LAPIC_ICR_TIMER, lapic_tval); lapic_writereg(LAPIC_LVT_TIMER, LAPIC_LVT_TMM_PERIODIC | LAPIC_TIMER_VECTOR); lapic_writereg(LAPIC_EOI, 0); } static void lapic_initclock(void) { if (curcpu() == &cpu_info_primary) { /* * Recalibrate the timer using the cycle counter, now that * the cycle counter itself has been recalibrated. */ lapic_calibrate_timer(true); /* * Hook up time counter. This assume that all LAPICs have * the same frequency. */ lapic_timecounter.tc_frequency = lapic_per_second; tc_init(&lapic_timecounter); } /* Start local apic countdown timer running, in repeated mode. */ lapic_reset(); } /* * Calibrate the local apic count-down timer (which is running at * bus-clock speed) vs. the i8254 counter/timer (which is running at * a fixed rate). * * The Intel MP spec says: "An MP operating system may use the IRQ8 * real-time clock as a reference to determine the actual APIC timer clock * speed." * * We're actually using the IRQ0 timer. Hmm. 
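 * For example (illustrative numbers, not measured values): if the LAPIC
 * count-down register drops by 10,000,000 while the i8254 advances by
 * TIMER_FREQ/100 ticks (roughly 10ms), the first calibration pass below
 * computes lapic_per_second ~= 10,000,000 * TIMER_FREQ / (TIMER_FREQ/100)
 * = 1,000,000,000, i.e. a 1GHz bus clock.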
*/ void lapic_calibrate_timer(bool secondpass) { struct cpu_info *ci = curcpu(); uint64_t tmp; int i; char tbuf[9]; KASSERT(ci == &cpu_info_primary); aprint_debug_dev(ci->ci_dev, "[re]calibrating local timer\n"); /* * Configure timer to one-shot, interrupt masked, * large positive number. */ x86_disable_intr(); lapic_writereg(LAPIC_LVT_TIMER, LAPIC_LVT_MASKED); lapic_writereg(LAPIC_DCR_TIMER, LAPIC_DCRT_DIV1); lapic_writereg(LAPIC_ICR_TIMER, 0x80000000); (void)lapic_gettick(); if (secondpass && cpu_hascounter()) { /* * Second pass calibration, using the TSC which has ideally * been calibrated using the HPET or information gleaned * from MSRs by this point. */ uint64_t l0, l1, t0, t1; (void)cpu_counter(); t0 = cpu_counter(); l0 = lapic_gettick(); t0 += cpu_counter(); DELAY(50000); t1 = cpu_counter(); l1 = lapic_gettick(); t1 += cpu_counter(); tmp = (l0 - l1) * cpu_frequency(ci) / ((t1 - t0 + 1) / 2); lapic_per_second = rounddown(tmp + 500, 1000); } else if (lapic_per_second == 0) { /* * Inaccurate first pass calibration using the i8254. */ unsigned int seen, delta, initial_i8254, initial_lapic; unsigned int cur_i8254, cur_lapic; (void)gettick(); initial_lapic = lapic_gettick(); initial_i8254 = gettick(); for (seen = 0; seen < TIMER_FREQ / 100; seen += delta) { cur_i8254 = gettick(); if (cur_i8254 > initial_i8254) delta = x86_rtclock_tval - (cur_i8254 - initial_i8254); else delta = initial_i8254 - cur_i8254; initial_i8254 = cur_i8254; } cur_lapic = lapic_gettick(); tmp = initial_lapic - cur_lapic; lapic_per_second = (tmp * TIMER_FREQ + seen / 2) / seen; } x86_enable_intr(); humanize_number(tbuf, sizeof(tbuf), lapic_per_second, "Hz", 1000); aprint_debug_dev(ci->ci_dev, "apic clock running at %s\n", tbuf); if (lapic_per_second != 0) { /* * reprogram the apic timer to run in periodic mode. * XXX need to program timer on other CPUs, too. */ lapic_tval = (lapic_per_second * 2) / hz; lapic_tval = (lapic_tval / 2) + (lapic_tval & 0x1); lapic_writereg(LAPIC_LVT_TIMER, LAPIC_LVT_TMM_PERIODIC | LAPIC_LVT_MASKED | LAPIC_TIMER_VECTOR); lapic_writereg(LAPIC_DCR_TIMER, LAPIC_DCRT_DIV1); lapic_writereg(LAPIC_ICR_TIMER, lapic_tval); /* * Compute fixed-point ratios between cycles and * microseconds to avoid having to do any division * in lapic_delay. */ tmp = (1000000 * (uint64_t)1 << 32) / lapic_per_second; lapic_frac_usec_per_cycle = tmp; tmp = (lapic_per_second * (uint64_t)1 << 32) / 1000000; lapic_frac_cycle_per_usec = tmp; /* * Compute delay in cycles for likely short delays in usec. */ for (i = 0; i < 26; i++) lapic_delaytab[i] = (lapic_frac_cycle_per_usec * i) >> 32; /* * Apply workaround for broken periodic timer under KVM */ if (vm_guest == VM_GUEST_KVM) { lapic_broken_periodic = true; lapic_timecounter.tc_quality = -100; aprint_debug_dev(ci->ci_dev, "applying KVM timer workaround\n"); } /* * Now that the timer's calibrated, use the apic timer routines * for all our timing needs.. */ if (!secondpass) { delay_func = lapic_delay; x86_initclock_func = lapic_initclock; initrtclock(0); } } } /* * delay for N usec. */ static void lapic_delay(unsigned int usec) { int32_t xtick, otick; int64_t deltat; /* XXX Bad to disable preemption, but it's tied to the cpu. 
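 * The conversion below uses the 32.32 fixed-point ratio set up in
 * lapic_calibrate_timer(), so no division is needed per call: e.g. with
 * the (illustrative) 1GHz bus clock above, lapic_frac_cycle_per_usec =
 * (10^9 << 32) / 10^6 = 1000 << 32, and (lapic_frac_cycle_per_usec *
 * usec) >> 32 = 1000 * usec cycles to wait; delays of 25us or less come
 * from the precomputed lapic_delaytab[].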
*/ kpreempt_disable(); otick = lapic_gettick(); if (usec <= 0) { kpreempt_enable(); return; } if (usec <= 25) deltat = lapic_delaytab[usec]; else deltat = (lapic_frac_cycle_per_usec * usec) >> 32; while (deltat > 0) { xtick = lapic_gettick(); if (lapic_broken_periodic && xtick == 0 && otick == 0) { lapic_reset(); xtick = lapic_gettick(); if (xtick == 0) panic("lapic timer stopped ticking"); } if (xtick > otick) deltat -= lapic_tval - (xtick - otick); else deltat -= otick - xtick; otick = xtick; x86_pause(); } kpreempt_enable(); } /* * XXX the following belong mostly or partly elsewhere.. */ static void i82489_icr_wait(void) { #ifdef DIAGNOSTIC unsigned j = 100000; #endif /* DIAGNOSTIC */ while ((i82489_readreg(LAPIC_ICRLO) & LAPIC_DLSTAT_BUSY) != 0) { x86_pause(); #ifdef DIAGNOSTIC j--; if (j == 0) panic("i82489_icr_wait: busy"); #endif /* DIAGNOSTIC */ } } static int i82489_ipi_init(int target) { uint32_t esr; i82489_writereg(LAPIC_ESR, 0); (void)i82489_readreg(LAPIC_ESR); i82489_writereg(LAPIC_ICRHI, target << LAPIC_ID_SHIFT); i82489_writereg(LAPIC_ICRLO, LAPIC_DLMODE_INIT | LAPIC_LEVEL_ASSERT); i82489_icr_wait(); delay_func(10000); i82489_writereg(LAPIC_ICRLO, LAPIC_DLMODE_INIT | LAPIC_TRIGMODE_LEVEL | LAPIC_LEVEL_DEASSERT); i82489_icr_wait(); if ((i82489_readreg(LAPIC_ICRLO) & LAPIC_DLSTAT_BUSY) != 0) return EBUSY; esr = i82489_readreg(LAPIC_ESR); if (esr != 0) aprint_debug("%s: ESR %08x\n", __func__, esr); return 0; } static int i82489_ipi_startup(int target, int vec) { uint32_t esr; i82489_writereg(LAPIC_ESR, 0); (void)i82489_readreg(LAPIC_ESR); i82489_icr_wait(); i82489_writereg(LAPIC_ICRHI, target << LAPIC_ID_SHIFT); i82489_writereg(LAPIC_ICRLO, vec | LAPIC_DLMODE_STARTUP | LAPIC_LEVEL_ASSERT); i82489_icr_wait(); if ((i82489_readreg(LAPIC_ICRLO) & LAPIC_DLSTAT_BUSY) != 0) return EBUSY; esr = i82489_readreg(LAPIC_ESR); if (esr != 0) aprint_debug("%s: ESR %08x\n", __func__, esr); return 0; } static int i82489_ipi(int vec, int target, int dl) { int result, s; s = splhigh(); i82489_icr_wait(); if ((target & LAPIC_DEST_MASK) == 0) i82489_writereg(LAPIC_ICRHI, target << LAPIC_ID_SHIFT); i82489_writereg(LAPIC_ICRLO, (target & LAPIC_DEST_MASK) | vec | dl | LAPIC_LEVEL_ASSERT); #ifdef DIAGNOSTIC i82489_icr_wait(); result = (i82489_readreg(LAPIC_ICRLO) & LAPIC_DLSTAT_BUSY) ? EBUSY : 0; #else /* Don't wait - if it doesn't go, we're in big trouble anyway. 
*/ result = 0; #endif splx(s); return result; } static int x2apic_ipi_init(int target) { x2apic_write_icr(target, LAPIC_DLMODE_INIT | LAPIC_LEVEL_ASSERT); delay_func(10000); x2apic_write_icr(0, LAPIC_DLMODE_INIT | LAPIC_TRIGMODE_LEVEL | LAPIC_LEVEL_DEASSERT); return 0; } static int x2apic_ipi_startup(int target, int vec) { x2apic_write_icr(target, vec | LAPIC_DLMODE_STARTUP | LAPIC_LEVEL_ASSERT); return 0; } static int x2apic_ipi(int vec, int target, int dl) { uint32_t dest_id = 0; if ((target & LAPIC_DEST_MASK) == 0) dest_id = target; x2apic_write_icr(dest_id, (target & LAPIC_DEST_MASK) | vec | dl | LAPIC_LEVEL_ASSERT); return 0; } int x86_ipi_init(int target) { if (x2apic_mode) return x2apic_ipi_init(target); return i82489_ipi_init(target); } int x86_ipi_startup(int target, int vec) { if (x2apic_mode) return x2apic_ipi_startup(target, vec); return i82489_ipi_startup(target, vec); } /* * Using 'pin numbers' as: * 0 - timer * 1 - thermal * 2 - PCINT * 3 - LVINT0 * 4 - LVINT1 * 5 - LVERR */ static void lapic_hwmask(struct pic *pic, int pin) { int reg; uint32_t val; reg = LAPIC_LVT_TIMER + (pin << 4); val = lapic_readreg(reg); val |= LAPIC_LVT_MASKED; lapic_writereg(reg, val); } static void lapic_hwunmask(struct pic *pic, int pin) { int reg; uint32_t val; reg = LAPIC_LVT_TIMER + (pin << 4); val = lapic_readreg(reg); val &= ~LAPIC_LVT_MASKED; lapic_writereg(reg, val); } static void lapic_setup(struct pic *pic, struct cpu_info *ci, int pin, int idtvec, int type) { } void lapic_dump(void) { struct cpu_info *ci = curcpu(); #define APIC_LVT_PRINT(ci, where, idx, lvtreg) \ apic_format_redir(device_xname(ci->ci_dev), where, (idx), \ APIC_VECTYPE_LAPIC_LVT, 0, lapic_readreg(lvtreg)) APIC_LVT_PRINT(ci, "cmci", 0, LAPIC_LVT_CMCI); APIC_LVT_PRINT(ci, "timer", 0, LAPIC_LVT_TIMER); APIC_LVT_PRINT(ci, "thermal", 0, LAPIC_LVT_THERM); APIC_LVT_PRINT(ci, "pcint", 0, LAPIC_LVT_PCINT); APIC_LVT_PRINT(ci, "lint", 0, LAPIC_LVT_LINT0); APIC_LVT_PRINT(ci, "lint", 1, LAPIC_LVT_LINT1); APIC_LVT_PRINT(ci, "err", 0, LAPIC_LVT_ERR); #undef APIC_LVT_PRINT } #else /* XENPV */ void lapic_boot_init(paddr_t lapic_base) { } #endif /* XENPV */
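/*
 * The LVT "pin number" convention used by lapic_hwmask()/lapic_hwunmask()
 * above maps pin N to the LVT register at LAPIC_LVT_TIMER + (N << 4),
 * because the xAPIC local-vector-table registers sit 0x10 apart in the
 * MMIO page, and the x2APIC view of the same registers is reached via
 * MSR 0x800 + (offset >> 4) (the base the code above calls
 * MSR_X2APIC_BASE).  The following is a minimal host-side sketch, not
 * kernel code; the 0x320 timer-LVT base and the MSR mapping are restated
 * from the Intel SDM rather than taken from i82489reg.h.
 */
#include <stdio.h>

#define LVT_TIMER_MMIO_OFFSET	0x320	/* xAPIC offset of the timer LVT */

int
main(void)
{
	static const char *pin_names[] = {
		"timer", "thermal", "pcint", "lint0", "lint1", "err"
	};

	for (int pin = 0; pin < 6; pin++) {
		unsigned mmio = LVT_TIMER_MMIO_OFFSET + (pin << 4);
		unsigned msr = 0x800 + (mmio >> 4);	/* x2APIC register */
		printf("pin %d (%-7s): xAPIC MMIO 0x%03x, x2APIC MSR 0x%03x\n",
		    pin, pin_names[pin], mmio, msr);
	}
	return 0;
}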
/* $NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp
$ */ /*- * Copyright (c) 2008, 2009, 2021 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> * Copyright (c) 2009 Apple, Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp */ #ifdef _KERNEL_OPT #include "opt_ddb.h" #endif /* _KERNEL_OPT */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/wait.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/select.h> #include <sys/queue.h> #include <sys/event.h> #include <sys/eventvar.h> #include <sys/poll.h> #include <sys/kmem.h> #include <sys/stat.h> #include <sys/filedesc.h> #include <sys/syscallargs.h> #include <sys/kauth.h> #include <sys/conf.h> #include <sys/atomic.h> static int kqueue_scan(file_t *, size_t, struct kevent *, const struct timespec *, register_t *, const struct kevent_ops *, struct kevent *, size_t); static int kqueue_ioctl(file_t *, u_long, void *); static int kqueue_fcntl(file_t *, u_int, void *); static int kqueue_poll(file_t *, int); static int kqueue_kqfilter(file_t *, struct knote *); static int kqueue_stat(file_t *, struct stat *); static int kqueue_close(file_t *); static void kqueue_restart(file_t *); static int kqueue_fpathconf(file_t *, int, register_t *); static int kqueue_register(struct kqueue *, struct kevent *); static void kqueue_doclose(struct kqueue *, struct klist *, int); static void knote_detach(struct knote *, filedesc_t *fdp, bool); static void knote_enqueue(struct knote *); static void knote_activate(struct knote *); static void knote_activate_locked(struct knote *); static void knote_deactivate_locked(struct knote *); static void filt_kqdetach(struct knote *); static int filt_kqueue(struct knote *, long hint); static int filt_procattach(struct knote *); static void filt_procdetach(struct knote *); static int filt_proc(struct knote *, long hint); static int filt_fileattach(struct knote *); static void filt_timerexpire(void *x); static int filt_timerattach(struct knote *); static void filt_timerdetach(struct knote *); static int filt_timer(struct knote *, long hint); static int filt_timertouch(struct knote *, struct kevent *, long type); static int filt_userattach(struct knote *); static void filt_userdetach(struct knote *); static int filt_user(struct knote *, long hint); static int filt_usertouch(struct knote *, struct kevent *, long type); /* * Private knote state that should never be exposed outside * of kern_event.c * * Field locking: * * q kn_kq->kq_lock */ struct knote_impl { struct knote ki_knote; unsigned int ki_influx; /* q: in-flux counter */ kmutex_t ki_foplock; /* for kn_filterops */ }; #define KIMPL_TO_KNOTE(kip) (&(kip)->ki_knote) #define KNOTE_TO_KIMPL(knp) container_of((knp), struct knote_impl, ki_knote) static inline struct knote * knote_alloc(bool sleepok) { struct knote_impl *ki; ki = kmem_zalloc(sizeof(*ki), sleepok ? 
KM_SLEEP : KM_NOSLEEP); mutex_init(&ki->ki_foplock, MUTEX_DEFAULT, IPL_NONE); return KIMPL_TO_KNOTE(ki); } static inline void knote_free(struct knote *kn) { struct knote_impl *ki = KNOTE_TO_KIMPL(kn); mutex_destroy(&ki->ki_foplock); kmem_free(ki, sizeof(*ki)); } static inline void knote_foplock_enter(struct knote *kn) { mutex_enter(&KNOTE_TO_KIMPL(kn)->ki_foplock); } static inline void knote_foplock_exit(struct knote *kn) { mutex_exit(&KNOTE_TO_KIMPL(kn)->ki_foplock); } static inline bool __diagused knote_foplock_owned(struct knote *kn) { return mutex_owned(&KNOTE_TO_KIMPL(kn)->ki_foplock); } static const struct fileops kqueueops = { .fo_name = "kqueue", .fo_read = (void *)enxio, .fo_write = (void *)enxio, .fo_ioctl = kqueue_ioctl, .fo_fcntl = kqueue_fcntl, .fo_poll = kqueue_poll, .fo_stat = kqueue_stat, .fo_close = kqueue_close, .fo_kqfilter = kqueue_kqfilter, .fo_restart = kqueue_restart, .fo_fpathconf = kqueue_fpathconf, }; static void filt_nopdetach(struct knote *kn __unused) { } static int filt_nopevent(struct knote *kn __unused, long hint __unused) { return 0; } static const struct filterops nop_fd_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_nopdetach, .f_event = filt_nopevent, }; static const struct filterops nop_filtops = { .f_flags = FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_nopdetach, .f_event = filt_nopevent, }; static const struct filterops kqread_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_kqdetach, .f_event = filt_kqueue, }; static const struct filterops proc_filtops = { .f_flags = FILTEROP_MPSAFE, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; /* * file_filtops is not marked MPSAFE because it's going to call * fileops::fo_kqfilter(), which might not be. That function, * however, will override the knote's filterops, and thus will * inherit the MPSAFE-ness of the back-end at that time. */ static const struct filterops file_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = filt_fileattach, .f_detach = NULL, .f_event = NULL, }; static const struct filterops timer_filtops = { .f_flags = FILTEROP_MPSAFE, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = filt_timer, .f_touch = filt_timertouch, }; static const struct filterops user_filtops = { .f_flags = FILTEROP_MPSAFE, .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, }; static u_int kq_ncallouts = 0; static int kq_calloutmax = (4 * 1024); #define KN_HASHSIZE 64 /* XXX should be tunable */ #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) extern const struct filterops fs_filtops; /* vfs_syscalls.c */ extern const struct filterops sig_filtops; /* kern_sig.c */ /* * Table for all system-defined filters. * These should be listed in the numeric order of the EVFILT_* defines. * If filtops is NULL, the filter isn't implemented in NetBSD. * End of list is when name is NULL. * * Note that 'refcnt' is meaningless for built-in filters. 
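 * kfilter_byfilter() relies on this layout: a system filter's EVFILT_*
 * value indexes sys_kfilters[] directly, and a user-registered filter
 * with id f is found at user_kfilters[f - EVFILT_SYSCOUNT].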
*/ struct kfilter { const char *name; /* name of filter */ uint32_t filter; /* id of filter */ unsigned refcnt; /* reference count */ const struct filterops *filtops;/* operations for filter */ size_t namelen; /* length of name string */ }; /* System defined filters */ static struct kfilter sys_kfilters[] = { { "EVFILT_READ", EVFILT_READ, 0, &file_filtops, 0 }, { "EVFILT_WRITE", EVFILT_WRITE, 0, &file_filtops, 0, }, { "EVFILT_AIO", EVFILT_AIO, 0, NULL, 0 }, { "EVFILT_VNODE", EVFILT_VNODE, 0, &file_filtops, 0 }, { "EVFILT_PROC", EVFILT_PROC, 0, &proc_filtops, 0 }, { "EVFILT_SIGNAL", EVFILT_SIGNAL, 0, &sig_filtops, 0 }, { "EVFILT_TIMER", EVFILT_TIMER, 0, &timer_filtops, 0 }, { "EVFILT_FS", EVFILT_FS, 0, &fs_filtops, 0 }, { "EVFILT_USER", EVFILT_USER, 0, &user_filtops, 0 }, { "EVFILT_EMPTY", EVFILT_EMPTY, 0, &file_filtops, 0 }, { NULL, 0, 0, NULL, 0 }, }; /* User defined kfilters */ static struct kfilter *user_kfilters; /* array */ static int user_kfilterc; /* current offset */ static int user_kfiltermaxc; /* max size so far */ static size_t user_kfiltersz; /* size of allocated memory */ /* * Global Locks. * * Lock order: * * kqueue_filter_lock * -> kn_kq->kq_fdp->fd_lock * -> knote foplock (if taken) * -> object lock (e.g., device driver lock, &c.) * -> kn_kq->kq_lock * * Locking rules. ==> indicates the lock is acquired by the backing * object, locks prior are acquired before calling filter ops: * * f_attach: fdp->fd_lock -> knote foplock -> * (maybe) KERNEL_LOCK ==> backing object lock * * f_detach: fdp->fd_lock -> knote foplock -> * (maybe) KERNEL_LOCK ==> backing object lock * * f_event via kevent: fdp->fd_lock -> knote foplock -> * (maybe) KERNEL_LOCK ==> backing object lock * N.B. NOTE_SUBMIT will never be set in the "hint" argument * in this case. * * f_event via knote (via backing object: Whatever caller guarantees. * Typically: * f_event(NOTE_SUBMIT): caller has already acquired backing * object lock. * f_event(!NOTE_SUBMIT): caller has not acquired backing object, * lock or has possibly acquired KERNEL_LOCK. Backing object * lock may or may not be acquired as-needed. * N.B. the knote foplock will **not** be acquired in this case. The * caller guarantees that klist_fini() will not be called concurrently * with knote(). * * f_touch: fdp->fd_lock -> kn_kq->kq_lock (spin lock) * N.B. knote foplock is **not** acquired in this case and * the caller must guarantee that klist_fini() will never * be called. kevent_register() restricts filters that * provide f_touch to known-safe cases. * * klist_fini(): Caller must guarantee that no more knotes can * be attached to the klist, and must **not** hold the backing * object's lock; klist_fini() itself will acquire the foplock * of each knote on the klist. * * Locking rules when detaching knotes: * * There are some situations where knote submission may require dropping * locks (see knote_proc_fork()). In order to support this, it's possible * to mark a knote as being 'in-flux'. Such a knote is guaranteed not to * be detached while it remains in-flux. Because it will not be detached, * locks can be dropped so e.g. memory can be allocated, locks on other * data structures can be acquired, etc. During this time, any attempt to * detach an in-flux knote must wait until the knote is no longer in-flux. * When this happens, the knote is marked for death (KN_WILLDETACH) and the * LWP who gets to finish the detach operation is recorded in the knote's * 'udata' field (which is no longer required for its original purpose once * a knote is so marked). 
Code paths that lead to knote_detach() must ensure * that their LWP is the one tasked with its final demise after waiting for * the in-flux status of the knote to clear. Note that once a knote is * marked KN_WILLDETACH, no code paths may put it into an in-flux state. * * Once the special circumstances have been handled, the locks are re- * acquired in the proper order (object lock -> kq_lock), the knote taken * out of flux, and any waiters are notified. Because waiters must have * also dropped *their* locks in order to safely block, they must re- * validate all of their assumptions; see knote_detach_quiesce(). See also * the kqueue_register() (EV_ADD, EV_DELETE) and kqueue_scan() (EV_ONESHOT) * cases. * * When kqueue_scan() encounters an in-flux knote, the situation is * treated like another LWP's list marker. * * LISTEN WELL: It is important to not hold knotes in flux for an * extended period of time! In-flux knotes effectively block any * progress of the kqueue_scan() operation. Any code paths that place * knotes in-flux should be careful to not block for indefinite periods * of time, such as for memory allocation (i.e. KM_NOSLEEP is OK, but * KM_SLEEP is not). */ static krwlock_t kqueue_filter_lock; /* lock on filter lists */ #define KQ_FLUX_WAIT(kq) (void)cv_wait(&kq->kq_cv, &kq->kq_lock) #define KQ_FLUX_WAKEUP(kq) cv_broadcast(&kq->kq_cv) static inline bool kn_in_flux(struct knote *kn) { KASSERT(mutex_owned(&kn->kn_kq->kq_lock)); return KNOTE_TO_KIMPL(kn)->ki_influx != 0; } static inline bool kn_enter_flux(struct knote *kn) { KASSERT(mutex_owned(&kn->kn_kq->kq_lock)); if (kn->kn_status & KN_WILLDETACH) { return false; } struct knote_impl *ki = KNOTE_TO_KIMPL(kn); KASSERT(ki->ki_influx < UINT_MAX); ki->ki_influx++; return true; } static inline bool kn_leave_flux(struct knote *kn) { KASSERT(mutex_owned(&kn->kn_kq->kq_lock)); struct knote_impl *ki = KNOTE_TO_KIMPL(kn); KASSERT(ki->ki_influx > 0); ki->ki_influx--; return ki->ki_influx == 0; } static void kn_wait_flux(struct knote *kn, bool can_loop) { struct knote_impl *ki = KNOTE_TO_KIMPL(kn); bool loop; KASSERT(mutex_owned(&kn->kn_kq->kq_lock)); /* * It may not be safe for us to touch the knote again after * dropping the kq_lock. The caller has let us know in * 'can_loop'. */ for (loop = true; loop && ki->ki_influx != 0; loop = can_loop) { KQ_FLUX_WAIT(kn->kn_kq); } } #define KNOTE_WILLDETACH(kn) \ do { \ (kn)->kn_status |= KN_WILLDETACH; \ (kn)->kn_kevent.udata = curlwp; \ } while (/*CONSTCOND*/0) /* * Wait until the specified knote is in a quiescent state and * safe to detach. Returns true if we potentially blocked (and * thus dropped our locks). */ static bool knote_detach_quiesce(struct knote *kn) { struct kqueue *kq = kn->kn_kq; filedesc_t *fdp = kq->kq_fdp; KASSERT(mutex_owned(&fdp->fd_lock)); mutex_spin_enter(&kq->kq_lock); /* * There are two cases where we might see KN_WILLDETACH here: * * 1. Someone else has already started detaching the knote but * had to wait for it to settle first. * * 2. We had to wait for it to settle, and had to come back * around after re-acquiring the locks. * * When KN_WILLDETACH is set, we also set the LWP that claimed * the prize of finishing the detach in the 'udata' field of the * knote (which will never be used again for its usual purpose * once the note is in this state). If it doesn't point to us, * we must drop the locks and let them in to finish the job. * * Otherwise, once we have claimed the knote for ourselves, we * can finish waiting for it to settle. 
The is the only scenario * where touching a detaching knote is safe after dropping the * locks. */ if ((kn->kn_status & KN_WILLDETACH) != 0 && kn->kn_kevent.udata != curlwp) { /* * N.B. it is NOT safe for us to touch the knote again * after dropping the locks here. The caller must go * back around and re-validate everything. However, if * the knote is in-flux, we want to block to minimize * busy-looping. */ mutex_exit(&fdp->fd_lock); if (kn_in_flux(kn)) { kn_wait_flux(kn, false); mutex_spin_exit(&kq->kq_lock); return true; } mutex_spin_exit(&kq->kq_lock); preempt_point(); return true; } /* * If we get here, we know that we will be claiming the * detach responsibilies, or that we already have and * this is the second attempt after re-validation. */ KASSERT((kn->kn_status & KN_WILLDETACH) == 0 || kn->kn_kevent.udata == curlwp); /* * Similarly, if we get here, either we are just claiming it * and may have to wait for it to settle, or if this is the * second attempt after re-validation that no other code paths * have put it in-flux. */ KASSERT((kn->kn_status & KN_WILLDETACH) == 0 || kn_in_flux(kn) == false); KNOTE_WILLDETACH(kn); if (kn_in_flux(kn)) { mutex_exit(&fdp->fd_lock); kn_wait_flux(kn, true); /* * It is safe for us to touch the knote again after * dropping the locks, but the caller must still * re-validate everything because other aspects of * the environment may have changed while we blocked. */ KASSERT(kn_in_flux(kn) == false); mutex_spin_exit(&kq->kq_lock); return true; } mutex_spin_exit(&kq->kq_lock); return false; } /* * Calls into the filterops need to be resilient against things which * destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid * chasing garbage pointers (to data, or even potentially code in a * module about to be unloaded). To that end, we acquire the * knote foplock before calling into the filter ops. When a driver * (or anything else) is tearing down its klist, klist_fini() enumerates * each knote, acquires its foplock, and replaces the filterops with a * nop stub, allowing knote detach (when descriptors are closed) to safely * proceed. */ static int filter_attach(struct knote *kn) { int rv; KASSERT(knote_foplock_owned(kn)); KASSERT(kn->kn_fop != NULL); KASSERT(kn->kn_fop->f_attach != NULL); /* * N.B. that kn->kn_fop may change as the result of calling * f_attach(). After f_attach() returns, kn->kn_fop may not * be modified by code outside of klist_fini(). */ if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { rv = kn->kn_fop->f_attach(kn); } else { KERNEL_LOCK(1, NULL); rv = kn->kn_fop->f_attach(kn); KERNEL_UNLOCK_ONE(NULL); } return rv; } static void filter_detach(struct knote *kn) { KASSERT(knote_foplock_owned(kn)); KASSERT(kn->kn_fop != NULL); KASSERT(kn->kn_fop->f_detach != NULL); if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { kn->kn_fop->f_detach(kn); } else { KERNEL_LOCK(1, NULL); kn->kn_fop->f_detach(kn); KERNEL_UNLOCK_ONE(NULL); } } static int filter_event(struct knote *kn, long hint, bool submitting) { int rv; /* See knote(). 
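 * When this is a submission coming from knote() (submitting == true),
 * the foplock is deliberately not held; per the locking notes above,
 * the backing object's caller guarantees that klist_fini() cannot run
 * concurrently with knote(), which is what makes that safe.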
*/ KASSERT(submitting || knote_foplock_owned(kn)); KASSERT(kn->kn_fop != NULL); KASSERT(kn->kn_fop->f_event != NULL); if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { rv = kn->kn_fop->f_event(kn, hint); } else { KERNEL_LOCK(1, NULL); rv = kn->kn_fop->f_event(kn, hint); KERNEL_UNLOCK_ONE(NULL); } return rv; } static int filter_touch(struct knote *kn, struct kevent *kev, long type) { /* * XXX We cannot assert that the knote foplock is held here * XXX beause we cannot safely acquire it in all cases * XXX where "touch" will be used in kqueue_scan(). We just * XXX have to assume that f_touch will always be safe to call, * XXX and kqueue_register() allows only the two known-safe * XXX users of that op. */ KASSERT(kn->kn_fop != NULL); KASSERT(kn->kn_fop->f_touch != NULL); return kn->kn_fop->f_touch(kn, kev, type); } static kauth_listener_t kqueue_listener; static int kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, void *arg0, void *arg1, void *arg2, void *arg3) { struct proc *p; int result; result = KAUTH_RESULT_DEFER; p = arg0; if (action != KAUTH_PROCESS_KEVENT_FILTER) return result; if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) || ISSET(p->p_flag, PK_SUGID))) return result; result = KAUTH_RESULT_ALLOW; return result; } /* * Initialize the kqueue subsystem. */ void kqueue_init(void) { rw_init(&kqueue_filter_lock); kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS, kqueue_listener_cb, NULL); } /* * Find kfilter entry by name, or NULL if not found. */ static struct kfilter * kfilter_byname_sys(const char *name) { int i; KASSERT(rw_lock_held(&kqueue_filter_lock)); for (i = 0; sys_kfilters[i].name != NULL; i++) { if (strcmp(name, sys_kfilters[i].name) == 0) return &sys_kfilters[i]; } return NULL; } static struct kfilter * kfilter_byname_user(const char *name) { int i; KASSERT(rw_lock_held(&kqueue_filter_lock)); /* user filter slots have a NULL name if previously deregistered */ for (i = 0; i < user_kfilterc ; i++) { if (user_kfilters[i].name != NULL && strcmp(name, user_kfilters[i].name) == 0) return &user_kfilters[i]; } return NULL; } static struct kfilter * kfilter_byname(const char *name) { struct kfilter *kfilter; KASSERT(rw_lock_held(&kqueue_filter_lock)); if ((kfilter = kfilter_byname_sys(name)) != NULL) return kfilter; return kfilter_byname_user(name); } /* * Find kfilter entry by filter id, or NULL if not found. * Assumes entries are indexed in filter id order, for speed. */ static struct kfilter * kfilter_byfilter(uint32_t filter) { struct kfilter *kfilter; KASSERT(rw_lock_held(&kqueue_filter_lock)); if (filter < EVFILT_SYSCOUNT) /* it's a system filter */ kfilter = &sys_kfilters[filter]; else if (user_kfilters != NULL && filter < EVFILT_SYSCOUNT + user_kfilterc) /* it's a user filter */ kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT]; else return (NULL); /* out of range */ KASSERT(kfilter->filter == filter); /* sanity check! */ return (kfilter); } /* * Register a new kfilter. Stores the entry in user_kfilters. * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise. * If retfilter != NULL, the new filterid is returned in it. 
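 * A hypothetical usage sketch (names here are illustrative only, not
 * existing NetBSD code):
 *
 *	static const struct filterops example_filtops = {
 *		.f_flags = FILTEROP_ISFD,
 *		.f_attach = example_attach,
 *		.f_detach = example_detach,
 *		.f_event = example_event,
 *	};
 *	int filterid, error;
 *
 *	error = kfilter_register("EVFILT_EXAMPLE", &example_filtops,
 *	    &filterid);
 *
 * On success the new id (>= EVFILT_SYSCOUNT) is returned in filterid and
 * the filter can later be removed with kfilter_unregister("EVFILT_EXAMPLE").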
*/ int kfilter_register(const char *name, const struct filterops *filtops, int *retfilter) { struct kfilter *kfilter; size_t len; int i; if (name == NULL || name[0] == '\0' || filtops == NULL) return (EINVAL); /* invalid args */ rw_enter(&kqueue_filter_lock, RW_WRITER); if (kfilter_byname(name) != NULL) { rw_exit(&kqueue_filter_lock); return (EEXIST); /* already exists */ } if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) { rw_exit(&kqueue_filter_lock); return (EINVAL); /* too many */ } for (i = 0; i < user_kfilterc; i++) { kfilter = &user_kfilters[i]; if (kfilter->name == NULL) { /* Previously deregistered slot. Reuse. */ goto reuse; } } /* check if need to grow user_kfilters */ if (user_kfilterc + 1 > user_kfiltermaxc) { /* Grow in KFILTER_EXTENT chunks. */ user_kfiltermaxc += KFILTER_EXTENT; len = user_kfiltermaxc * sizeof(*kfilter); kfilter = kmem_alloc(len, KM_SLEEP); memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz); if (user_kfilters != NULL) { memcpy(kfilter, user_kfilters, user_kfiltersz); kmem_free(user_kfilters, user_kfiltersz); } user_kfiltersz = len; user_kfilters = kfilter; } /* Adding new slot */ kfilter = &user_kfilters[user_kfilterc++]; reuse: kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP); kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT; kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP); memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops)); if (retfilter != NULL) *retfilter = kfilter->filter; rw_exit(&kqueue_filter_lock); return (0); } /* * Unregister a kfilter previously registered with kfilter_register. * This retains the filter id, but clears the name and frees filtops (filter * operations), so that the number isn't reused during a boot. * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise. */ int kfilter_unregister(const char *name) { struct kfilter *kfilter; if (name == NULL || name[0] == '\0') return (EINVAL); /* invalid name */ rw_enter(&kqueue_filter_lock, RW_WRITER); if (kfilter_byname_sys(name) != NULL) { rw_exit(&kqueue_filter_lock); return (EINVAL); /* can't detach system filters */ } kfilter = kfilter_byname_user(name); if (kfilter == NULL) { rw_exit(&kqueue_filter_lock); return (ENOENT); } if (kfilter->refcnt != 0) { rw_exit(&kqueue_filter_lock); return (EBUSY); } /* Cast away const (but we know it's safe. */ kmem_free(__UNCONST(kfilter->name), kfilter->namelen); kfilter->name = NULL; /* mark as `not implemented' */ if (kfilter->filtops != NULL) { /* Cast away const (but we know it's safe. */ kmem_free(__UNCONST(kfilter->filtops), sizeof(*kfilter->filtops)); kfilter->filtops = NULL; /* mark as `not implemented' */ } rw_exit(&kqueue_filter_lock); return (0); } /* * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file * descriptors. Calls fileops kqfilter method for given file descriptor. */ static int filt_fileattach(struct knote *kn) { file_t *fp; fp = kn->kn_obj; return (*fp->f_ops->fo_kqfilter)(fp, kn); } /* * Filter detach method for EVFILT_READ on kqueue descriptor. */ static void filt_kqdetach(struct knote *kn) { struct kqueue *kq; kq = ((file_t *)kn->kn_obj)->f_kqueue; mutex_spin_enter(&kq->kq_lock); selremove_knote(&kq->kq_sel, kn); mutex_spin_exit(&kq->kq_lock); } /* * Filter event method for EVFILT_READ on kqueue descriptor. 
*/ /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq; int rv; kq = ((file_t *)kn->kn_obj)->f_kqueue; if (hint != NOTE_SUBMIT) mutex_spin_enter(&kq->kq_lock); kn->kn_data = KQ_COUNT(kq); rv = (kn->kn_data > 0); if (hint != NOTE_SUBMIT) mutex_spin_exit(&kq->kq_lock); return rv; } /* * Filter attach method for EVFILT_PROC. */ static int filt_procattach(struct knote *kn) { struct proc *p; mutex_enter(&proc_lock); p = proc_find(kn->kn_id); if (p == NULL) { mutex_exit(&proc_lock); return ESRCH; } /* * Fail if it's not owned by you, or the last exec gave us * setuid/setgid privs (unless you're root). */ mutex_enter(p->p_lock); mutex_exit(&proc_lock); if (kauth_authorize_process(curlwp->l_cred, KAUTH_PROCESS_KEVENT_FILTER, p, NULL, NULL, NULL) != 0) { mutex_exit(p->p_lock); return EACCES; } kn->kn_obj = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * NOTE_CHILD is only ever generated internally; don't let it * leak in from user-space. See knote_proc_fork_track(). */ kn->kn_sfflags &= ~NOTE_CHILD; klist_insert(&p->p_klist, kn); mutex_exit(p->p_lock); return 0; } /* * Filter detach method for EVFILT_PROC. * * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process might not exist any more. */ static void filt_procdetach(struct knote *kn) { struct kqueue *kq = kn->kn_kq; struct proc *p; /* * We have to synchronize with knote_proc_exit(), but we * are forced to acquire the locks in the wrong order here * because we can't be sure kn->kn_obj is valid unless * KN_DETACHED is not set. */ again: mutex_spin_enter(&kq->kq_lock); if ((kn->kn_status & KN_DETACHED) == 0) { p = kn->kn_obj; if (!mutex_tryenter(p->p_lock)) { mutex_spin_exit(&kq->kq_lock); preempt_point(); goto again; } kn->kn_status |= KN_DETACHED; klist_remove(&p->p_klist, kn); mutex_exit(p->p_lock); } mutex_spin_exit(&kq->kq_lock); } /* * Filter event method for EVFILT_PROC. * * Due to some of the complexities of process locking, we have special * entry points for delivering knote submissions. filt_proc() is used * only to check for activation from kqueue_register() and kqueue_scan(). */ static int filt_proc(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_kq; uint32_t fflags; /* * Because we share the same klist with signal knotes, just * ensure that we're not being invoked for the proc-related * submissions. */ KASSERT((hint & (NOTE_EXEC | NOTE_EXIT | NOTE_FORK)) == 0); mutex_spin_enter(&kq->kq_lock); fflags = kn->kn_fflags; mutex_spin_exit(&kq->kq_lock); return fflags != 0; } void knote_proc_exec(struct proc *p) { struct knote *kn, *tmpkn; struct kqueue *kq; uint32_t fflags; mutex_enter(p->p_lock); SLIST_FOREACH_SAFE(kn, &p->p_klist, kn_selnext, tmpkn) { /* N.B. EVFILT_SIGNAL knotes are on this same list. 
*/ if (kn->kn_fop == &sig_filtops) { continue; } KASSERT(kn->kn_fop == &proc_filtops); kq = kn->kn_kq; mutex_spin_enter(&kq->kq_lock); fflags = (kn->kn_fflags |= (kn->kn_sfflags & NOTE_EXEC)); if (fflags) { knote_activate_locked(kn); } mutex_spin_exit(&kq->kq_lock); } mutex_exit(p->p_lock); } static int __noinline knote_proc_fork_track(struct proc *p1, struct proc *p2, struct knote *okn) { struct kqueue *kq = okn->kn_kq; KASSERT(mutex_owned(&kq->kq_lock)); KASSERT(mutex_owned(p1->p_lock)); /* * We're going to put this knote into flux while we drop * the locks and create and attach a new knote to track the * child. If we are not able to enter flux, then this knote * is about to go away, so skip the notification. */ if (!kn_enter_flux(okn)) { return 0; } mutex_spin_exit(&kq->kq_lock); mutex_exit(p1->p_lock); /* * We actually have to register *two* new knotes: * * ==> One for the NOTE_CHILD notification. This is a forced * ONESHOT note. * * ==> One to actually track the child process as it subsequently * forks, execs, and, ultimately, exits. * * If we only register a single knote, then it's possible for * for the NOTE_CHILD and NOTE_EXIT to be collapsed into a single * notification if the child exits before the tracking process * has received the NOTE_CHILD notification, which applications * aren't expecting (the event's 'data' field would be clobbered, * for example). * * To do this, what we have here is an **extremely** stripped-down * version of kqueue_register() that has the following properties: * * ==> Does not block to allocate memory. If we are unable * to allocate memory, we return ENOMEM. * * ==> Does not search for existing knotes; we know there * are not any because this is a new process that isn't * even visible to other processes yet. * * ==> Assumes that the knhash for our kq's descriptor table * already exists (after all, we're already tracking * processes with knotes if we got here). * * ==> Directly attaches the new tracking knote to the child * process. * * The whole point is to do the minimum amount of work while the * knote is held in-flux, and to avoid doing extra work in general * (we already have the new child process; why bother looking it * up again?). */ filedesc_t *fdp = kq->kq_fdp; struct knote *knchild, *kntrack; int error = 0; knchild = knote_alloc(false); kntrack = knote_alloc(false); if (__predict_false(knchild == NULL || kntrack == NULL)) { error = ENOMEM; goto out; } kntrack->kn_obj = p2; kntrack->kn_id = p2->p_pid; kntrack->kn_kq = kq; kntrack->kn_fop = okn->kn_fop; kntrack->kn_kfilter = okn->kn_kfilter; kntrack->kn_sfflags = okn->kn_sfflags; kntrack->kn_sdata = p1->p_pid; kntrack->kn_kevent.ident = p2->p_pid; kntrack->kn_kevent.filter = okn->kn_filter; kntrack->kn_kevent.flags = okn->kn_flags | EV_ADD | EV_ENABLE | EV_CLEAR; kntrack->kn_kevent.fflags = 0; kntrack->kn_kevent.data = 0; kntrack->kn_kevent.udata = okn->kn_kevent.udata; /* preserve udata */ /* * The child note does not need to be attached to the * new proc's klist at all. */ *knchild = *kntrack; knchild->kn_status = KN_DETACHED; knchild->kn_sfflags = 0; knchild->kn_kevent.flags |= EV_ONESHOT; knchild->kn_kevent.fflags = NOTE_CHILD; knchild->kn_kevent.data = p1->p_pid; /* parent */ mutex_enter(&fdp->fd_lock); /* * We need to check to see if the kq is closing, and skip * attaching the knote if so. Normally, this isn't necessary * when coming in the front door because the file descriptor * layer will synchronize this. 
* * It's safe to test KQ_CLOSING without taking the kq_lock * here because that flag is only ever set when the fd_lock * is also held. */ if (__predict_false(kq->kq_count & KQ_CLOSING)) { mutex_exit(&fdp->fd_lock); goto out; } /* * We do the "insert into FD table" and "attach to klist" steps * in the opposite order of kqueue_register() here to avoid * having to take p2->p_lock twice. But this is OK because we * hold fd_lock across the entire operation. */ mutex_enter(p2->p_lock); error = kauth_authorize_process(curlwp->l_cred, KAUTH_PROCESS_KEVENT_FILTER, p2, NULL, NULL, NULL); if (__predict_false(error != 0)) { mutex_exit(p2->p_lock); mutex_exit(&fdp->fd_lock); error = EACCES; goto out; } klist_insert(&p2->p_klist, kntrack); mutex_exit(p2->p_lock); KASSERT(fdp->fd_knhashmask != 0); KASSERT(fdp->fd_knhash != NULL); struct klist *list = &fdp->fd_knhash[KN_HASH(kntrack->kn_id, fdp->fd_knhashmask)]; SLIST_INSERT_HEAD(list, kntrack, kn_link); SLIST_INSERT_HEAD(list, knchild, kn_link); /* This adds references for knchild *and* kntrack. */ atomic_add_int(&kntrack->kn_kfilter->refcnt, 2); knote_activate(knchild); kntrack = NULL; knchild = NULL; mutex_exit(&fdp->fd_lock); out: if (__predict_false(knchild != NULL)) { knote_free(knchild); } if (__predict_false(kntrack != NULL)) { knote_free(kntrack); } mutex_enter(p1->p_lock); mutex_spin_enter(&kq->kq_lock); if (kn_leave_flux(okn)) { KQ_FLUX_WAKEUP(kq); } return error; } void knote_proc_fork(struct proc *p1, struct proc *p2) { struct knote *kn; struct kqueue *kq; uint32_t fflags; mutex_enter(p1->p_lock); /* * N.B. We DO NOT use SLIST_FOREACH_SAFE() here because we * don't want to pre-fetch the next knote; in the event we * have to drop p_lock, we will have put the knote in-flux, * meaning that no one will be able to detach it until we * have taken the knote out of flux. However, that does * NOT stop someone else from detaching the next note in the * list while we have it unlocked. Thus, we want to fetch * the next note in the list only after we have re-acquired * the lock, and using SLIST_FOREACH() will satisfy that. */ SLIST_FOREACH(kn, &p1->p_klist, kn_selnext) { /* N.B. EVFILT_SIGNAL knotes are on this same list. */ if (kn->kn_fop == &sig_filtops) { continue; } KASSERT(kn->kn_fop == &proc_filtops); kq = kn->kn_kq; mutex_spin_enter(&kq->kq_lock); kn->kn_fflags |= (kn->kn_sfflags & NOTE_FORK); if (__predict_false(kn->kn_sfflags & NOTE_TRACK)) { /* * This will drop kq_lock and p_lock and * re-acquire them before it returns. */ if (knote_proc_fork_track(p1, p2, kn)) { kn->kn_fflags |= NOTE_TRACKERR; } KASSERT(mutex_owned(p1->p_lock)); KASSERT(mutex_owned(&kq->kq_lock)); } fflags = kn->kn_fflags; if (fflags) { knote_activate_locked(kn); } mutex_spin_exit(&kq->kq_lock); } mutex_exit(p1->p_lock); } void knote_proc_exit(struct proc *p) { struct knote *kn; struct kqueue *kq; KASSERT(mutex_owned(p->p_lock)); while (!SLIST_EMPTY(&p->p_klist)) { kn = SLIST_FIRST(&p->p_klist); kq = kn->kn_kq; KASSERT(kn->kn_obj == p); mutex_spin_enter(&kq->kq_lock); kn->kn_data = P_WAITSTATUS(p); /* * Mark as ONESHOT, so that the knote is g/c'ed * when read. */ kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_fflags |= kn->kn_sfflags & NOTE_EXIT; /* * Detach the knote from the process and mark it as such. * N.B. EVFILT_SIGNAL are also on p_klist, but by the * time we get here, all open file descriptors for this * process have been released, meaning that signal knotes * will have already been detached. * * We need to synchronize this with filt_procdetach(). 
*/ KASSERT(kn->kn_fop == &proc_filtops); if ((kn->kn_status & KN_DETACHED) == 0) { kn->kn_status |= KN_DETACHED; SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext); } /* * Always activate the knote for NOTE_EXIT regardless * of whether or not the listener cares about it. * This matches historical behavior. */ knote_activate_locked(kn); mutex_spin_exit(&kq->kq_lock); } } #define FILT_TIMER_NOSCHED ((uintptr_t)-1) static int filt_timercompute(struct kevent *kev, uintptr_t *tticksp) { struct timespec ts; uintptr_t tticks; if (kev->fflags & ~(NOTE_TIMER_UNITMASK | NOTE_ABSTIME)) { return EINVAL; } /* * Convert the event 'data' to a timespec, then convert the * timespec to callout ticks. */ switch (kev->fflags & NOTE_TIMER_UNITMASK) { case NOTE_SECONDS: ts.tv_sec = kev->data; ts.tv_nsec = 0; break; case NOTE_MSECONDS: /* == historical value 0 */ ts.tv_sec = kev->data / 1000; ts.tv_nsec = (kev->data % 1000) * 1000000; break; case NOTE_USECONDS: ts.tv_sec = kev->data / 1000000; ts.tv_nsec = (kev->data % 1000000) * 1000; break; case NOTE_NSECONDS: ts.tv_sec = kev->data / 1000000000; ts.tv_nsec = kev->data % 1000000000; break; default: return EINVAL; } if (kev->fflags & NOTE_ABSTIME) { struct timespec deadline = ts; /* * Get current time. * * XXX This is CLOCK_REALTIME. There is no way to * XXX specify CLOCK_MONOTONIC. */ nanotime(&ts); /* Absolute timers do not repeat. */ kev->data = FILT_TIMER_NOSCHED; /* If we're past the deadline, then the event will fire. */ if (timespeccmp(&deadline, &ts, <=)) { tticks = FILT_TIMER_NOSCHED; goto out; } /* Calculate how much time is left. */ timespecsub(&deadline, &ts, &ts); } else { /* EV_CLEAR automatically set for relative timers. */ kev->flags |= EV_CLEAR; } tticks = tstohz(&ts); /* if the supplied value is under our resolution, use 1 tick */ if (tticks == 0) { if (kev->data == 0) return EINVAL; tticks = 1; } else if (tticks > INT_MAX) { return EINVAL; } if ((kev->flags & EV_ONESHOT) != 0) { /* Timer does not repeat. 
*/ kev->data = FILT_TIMER_NOSCHED; } else { KASSERT((uintptr_t)tticks != FILT_TIMER_NOSCHED); kev->data = tticks; } out: *tticksp = tticks; return 0; } static void filt_timerexpire(void *knx) { struct knote *kn = knx; struct kqueue *kq = kn->kn_kq; mutex_spin_enter(&kq->kq_lock); kn->kn_data++; knote_activate_locked(kn); if (kn->kn_sdata != FILT_TIMER_NOSCHED) { KASSERT(kn->kn_sdata > 0); KASSERT(kn->kn_sdata <= INT_MAX); callout_schedule((callout_t *)kn->kn_hook, (int)kn->kn_sdata); } mutex_spin_exit(&kq->kq_lock); } static inline void filt_timerstart(struct knote *kn, uintptr_t tticks) { callout_t *calloutp = kn->kn_hook; KASSERT(mutex_owned(&kn->kn_kq->kq_lock)); KASSERT(!callout_pending(calloutp)); if (__predict_false(tticks == FILT_TIMER_NOSCHED)) { kn->kn_data = 1; } else { KASSERT(tticks <= INT_MAX); callout_reset(calloutp, (int)tticks, filt_timerexpire, kn); } } static int filt_timerattach(struct knote *kn) { callout_t *calloutp; struct kqueue *kq; uintptr_t tticks; int error; struct kevent kev = { .flags = kn->kn_flags, .fflags = kn->kn_sfflags, .data = kn->kn_sdata, }; error = filt_timercompute(&kev, &tticks); if (error) { return error; } if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax || (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) { atomic_dec_uint(&kq_ncallouts); return ENOMEM; } callout_init(calloutp, CALLOUT_MPSAFE); kq = kn->kn_kq; mutex_spin_enter(&kq->kq_lock); kn->kn_sdata = kev.data; kn->kn_flags = kev.flags; KASSERT(kn->kn_sfflags == kev.fflags); kn->kn_hook = calloutp; filt_timerstart(kn, tticks); mutex_spin_exit(&kq->kq_lock); return (0); } static void filt_timerdetach(struct knote *kn) { callout_t *calloutp; struct kqueue *kq = kn->kn_kq; /* prevent rescheduling when we expire */ mutex_spin_enter(&kq->kq_lock); kn->kn_sdata = FILT_TIMER_NOSCHED; mutex_spin_exit(&kq->kq_lock); calloutp = (callout_t *)kn->kn_hook; /* * Attempt to stop the callout. This will block if it's * already running. */ callout_halt(calloutp, NULL); callout_destroy(calloutp); kmem_free(calloutp, sizeof(*calloutp)); atomic_dec_uint(&kq_ncallouts); } static int filt_timertouch(struct knote *kn, struct kevent *kev, long type) { struct kqueue *kq = kn->kn_kq; callout_t *calloutp; uintptr_t tticks; int error; KASSERT(mutex_owned(&kq->kq_lock)); switch (type) { case EVENT_REGISTER: /* Only relevant for EV_ADD. */ if ((kev->flags & EV_ADD) == 0) { return 0; } /* * Stop the timer, under the assumption that if * an application is re-configuring the timer, * they no longer care about the old one. We * can safely drop the kq_lock while we wait * because fdp->fd_lock will be held throughout, * ensuring that no one can sneak in with an * EV_DELETE or close the kq. 
*/ KASSERT(mutex_owned(&kq->kq_fdp->fd_lock)); calloutp = kn->kn_hook; callout_halt(calloutp, &kq->kq_lock); KASSERT(mutex_owned(&kq->kq_lock)); knote_deactivate_locked(kn); kn->kn_data = 0; error = filt_timercompute(kev, &tticks); if (error) { return error; } kn->kn_sdata = kev->data; kn->kn_flags = kev->flags; kn->kn_sfflags = kev->fflags; filt_timerstart(kn, tticks); break; case EVENT_PROCESS: *kev = kn->kn_kevent; break; default: panic("%s: invalid type (%ld)", __func__, type); } return 0; } static int filt_timer(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_kq; int rv; mutex_spin_enter(&kq->kq_lock); rv = (kn->kn_data != 0); mutex_spin_exit(&kq->kq_lock); return rv; } static int filt_userattach(struct knote *kn) { struct kqueue *kq = kn->kn_kq; /* * EVFILT_USER knotes are not attached to anything in the kernel. */ mutex_spin_enter(&kq->kq_lock); kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) kn->kn_hookid = 1; else kn->kn_hookid = 0; mutex_spin_exit(&kq->kq_lock); return (0); } static void filt_userdetach(struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ } static int filt_user(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_kq; int hookid; mutex_spin_enter(&kq->kq_lock); hookid = kn->kn_hookid; mutex_spin_exit(&kq->kq_lock); return hookid; } static int filt_usertouch(struct knote *kn, struct kevent *kev, long type) { int ffctrl; KASSERT(mutex_owned(&kn->kn_kq->kq_lock)); switch (type) { case EVENT_REGISTER: if (kev->fflags & NOTE_TRIGGER) kn->kn_hookid = 1; ffctrl = kev->fflags & NOTE_FFCTRLMASK; kev->fflags &= NOTE_FFLAGSMASK; switch (ffctrl) { case NOTE_FFNOP: break; case NOTE_FFAND: kn->kn_sfflags &= kev->fflags; break; case NOTE_FFOR: kn->kn_sfflags |= kev->fflags; break; case NOTE_FFCOPY: kn->kn_sfflags = kev->fflags; break; default: /* XXX Return error? */ break; } kn->kn_sdata = kev->data; if (kev->flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; case EVENT_PROCESS: *kev = kn->kn_kevent; kev->fflags = kn->kn_sfflags; kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_usertouch() - invalid type (%ld)", type); break; } return 0; } /* * filt_seltrue: * * This filter "event" routine simulates seltrue(). */ int filt_seltrue(struct knote *kn, long hint) { /* * We don't know how much data can be read/written, * but we know that it *can* be. This is about as * good as select/poll does as well. */ kn->kn_data = 0; return (1); } /* * This provides full kqfilter entry for device switch tables, which * has same effect as filter using filt_seltrue() as filter method. */ static void filt_seltruedetach(struct knote *kn) { /* Nothing to do */ } const struct filterops seltrue_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_seltruedetach, .f_event = filt_seltrue, }; int seltrue_kqfilter(dev_t dev, struct knote *kn) { switch (kn->kn_filter) { case EVFILT_READ: case EVFILT_WRITE: kn->kn_fop = &seltrue_filtops; break; default: return (EINVAL); } /* Nothing more to do */ return (0); } /* * kqueue(2) system call. 
*/ static int kqueue1(struct lwp *l, int flags, register_t *retval) { struct kqueue *kq; file_t *fp; int fd, error; if ((error = fd_allocfile(&fp, &fd)) != 0) return error; fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE)); fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; kq = kmem_zalloc(sizeof(*kq), KM_SLEEP); mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED); cv_init(&kq->kq_cv, "kqueue"); selinit(&kq->kq_sel); TAILQ_INIT(&kq->kq_head); fp->f_kqueue = kq; *retval = fd; kq->kq_fdp = curlwp->l_fd; fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0); fd_affix(curproc, fp, fd); return error; } /* * kqueue(2) system call. */ int sys_kqueue(struct lwp *l, const void *v, register_t *retval) { return kqueue1(l, 0, retval); } int sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap, register_t *retval) { /* { syscallarg(int) flags; } */ return kqueue1(l, SCARG(uap, flags), retval); } /* * kevent(2) system call. */ int kevent_fetch_changes(void *ctx, const struct kevent *changelist, struct kevent *changes, size_t index, int n) { return copyin(changelist + index, changes, n * sizeof(*changes)); } int kevent_put_events(void *ctx, struct kevent *events, struct kevent *eventlist, size_t index, int n) { return copyout(events, eventlist + index, n * sizeof(*events)); } static const struct kevent_ops kevent_native_ops = { .keo_private = NULL, .keo_fetch_timeout = copyin, .keo_fetch_changes = kevent_fetch_changes, .keo_put_events = kevent_put_events, }; int sys___kevent100(struct lwp *l, const struct sys___kevent100_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const struct kevent *) changelist; syscallarg(size_t) nchanges; syscallarg(struct kevent *) eventlist; syscallarg(size_t) nevents; syscallarg(const struct timespec *) timeout; } */ return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist), SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents), SCARG(uap, timeout), &kevent_native_ops); } int kevent1(register_t *retval, int fd, const struct kevent *changelist, size_t nchanges, struct kevent *eventlist, size_t nevents, const struct timespec *timeout, const struct kevent_ops *keops) { struct kevent *kevp; struct kqueue *kq; struct timespec ts; size_t i, n, ichange; int nerrors, error; struct kevent kevbuf[KQ_NEVENTS]; /* approx 300 bytes on 64-bit */ file_t *fp; /* check that we're dealing with a kq */ fp = fd_getfile(fd); if (fp == NULL) return (EBADF); if (fp->f_type != DTYPE_KQUEUE) { fd_putfile(fd); return (EBADF); } if (timeout != NULL) { error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts)); if (error) goto done; timeout = &ts; } kq = fp->f_kqueue; nerrors = 0; ichange = 0; /* traverse list of events to register */ while (nchanges > 0) { n = MIN(nchanges, __arraycount(kevbuf)); error = (*keops->keo_fetch_changes)(keops->keo_private, changelist, kevbuf, ichange, n); if (error) goto done; for (i = 0; i < n; i++) { kevp = &kevbuf[i]; kevp->flags &= ~EV_SYSFLAGS; /* register each knote */ error = kqueue_register(kq, kevp); if (!error && !(kevp->flags & EV_RECEIPT)) continue; if (nevents == 0) goto done; kevp->flags = EV_ERROR; kevp->data = error; error = (*keops->keo_put_events) (keops->keo_private, kevp, eventlist, nerrors, 1); if (error) goto done; nevents--; nerrors++; } nchanges -= n; /* update the results */ ichange += n; } if (nerrors) { *retval = nerrors; error = 0; goto done; } /* actually scan through the events */ error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops, kevbuf, __arraycount(kevbuf)); done: 
fd_putfile(fd); return (error); } /* * Register a given kevent kev onto the kqueue */ static int kqueue_register(struct kqueue *kq, struct kevent *kev) { struct kfilter *kfilter; filedesc_t *fdp; file_t *fp; fdfile_t *ff; struct knote *kn, *newkn; struct klist *list; int error, fd, rv; fdp = kq->kq_fdp; fp = NULL; kn = NULL; error = 0; fd = 0; newkn = knote_alloc(true); rw_enter(&kqueue_filter_lock, RW_READER); kfilter = kfilter_byfilter(kev->filter); if (kfilter == NULL || kfilter->filtops == NULL) { /* filter not found nor implemented */ rw_exit(&kqueue_filter_lock); knote_free(newkn); return (EINVAL); } /* search if knote already exists */ if (kfilter->filtops->f_flags & FILTEROP_ISFD) { /* monitoring a file descriptor */ /* validate descriptor */ if (kev->ident > INT_MAX || (fp = fd_getfile(fd = kev->ident)) == NULL) { rw_exit(&kqueue_filter_lock); knote_free(newkn); return EBADF; } mutex_enter(&fdp->fd_lock); ff = fdp->fd_dt->dt_ff[fd]; if (ff->ff_refcnt & FR_CLOSING) { error = EBADF; goto doneunlock; } if (fd <= fdp->fd_lastkqfile) { SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) { if (kq == kn->kn_kq && kev->filter == kn->kn_filter) break; } } } else { /* * not monitoring a file descriptor, so * lookup knotes in internal hash table */ mutex_enter(&fdp->fd_lock); if (fdp->fd_knhashmask != 0) { list = &fdp->fd_knhash[ KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) { if (kev->ident == kn->kn_id && kq == kn->kn_kq && kev->filter == kn->kn_filter) break; } } } /* It's safe to test KQ_CLOSING while holding only the fd_lock. */ KASSERT(mutex_owned(&fdp->fd_lock)); KASSERT((kq->kq_count & KQ_CLOSING) == 0); /* * kn now contains the matching knote, or NULL if no match */ if (kn == NULL) { if (kev->flags & EV_ADD) { /* create new knote */ kn = newkn; newkn = NULL; kn->kn_obj = fp; kn->kn_id = kev->ident; kn->kn_kq = kq; kn->kn_fop = kfilter->filtops; kn->kn_kfilter = kfilter; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; KASSERT(kn->kn_fop != NULL); /* * XXX Allow only known-safe users of f_touch. * XXX See filter_touch() for details. */ if (kn->kn_fop->f_touch != NULL && kn->kn_fop != &timer_filtops && kn->kn_fop != &user_filtops) { error = ENOTSUP; goto fail_ev_add; } /* * apply reference count to knote structure, and * do not release it at the end of this routine. */ fp = NULL; if (!(kn->kn_fop->f_flags & FILTEROP_ISFD)) { /* * If knote is not on an fd, store on * internal hash table. */ if (fdp->fd_knhashmask == 0) { /* XXXAD can block with fd_lock held */ fdp->fd_knhash = hashinit(KN_HASHSIZE, HASH_LIST, true, &fdp->fd_knhashmask); } list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; } else { /* Otherwise, knote is on an fd. */ list = (struct klist *) &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist; if ((int)kn->kn_id > fdp->fd_lastkqfile) fdp->fd_lastkqfile = kn->kn_id; } SLIST_INSERT_HEAD(list, kn, kn_link); /* * N.B. kn->kn_fop may change as the result * of filter_attach()! */ knote_foplock_enter(kn); error = filter_attach(kn); if (error != 0) { #ifdef DEBUG struct proc *p = curlwp->l_proc; const file_t *ft = kn->kn_obj; printf("%s: %s[%d]: event type %d not " "supported for file type %d/%s " "(error %d)\n", __func__, p->p_comm, p->p_pid, kn->kn_filter, ft ? ft->f_type : -1, ft ? ft->f_ops->fo_name : "?", error); #endif fail_ev_add: /* * N.B. no need to check for this note to * be in-flux, since it was never visible * to the monitored object. 
* * knote_detach() drops fdp->fd_lock */ knote_foplock_exit(kn); mutex_enter(&kq->kq_lock); KNOTE_WILLDETACH(kn); KASSERT(kn_in_flux(kn) == false); mutex_exit(&kq->kq_lock); knote_detach(kn, fdp, false); goto done; } atomic_inc_uint(&kfilter->refcnt); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ error = ENOENT; goto doneunlock; } } if (kev->flags & EV_DELETE) { /* * Let the world know that this knote is about to go * away, and wait for it to settle if it's currently * in-flux. */ mutex_spin_enter(&kq->kq_lock); if (kn->kn_status & KN_WILLDETACH) { /* * This knote is already on its way out, * so just be done. */ mutex_spin_exit(&kq->kq_lock); goto doneunlock; } KNOTE_WILLDETACH(kn); if (kn_in_flux(kn)) { mutex_exit(&fdp->fd_lock); /* * It's safe for us to conclusively wait for * this knote to settle because we know we'll * be completing the detach. */ kn_wait_flux(kn, true); KASSERT(kn_in_flux(kn) == false); mutex_spin_exit(&kq->kq_lock); mutex_enter(&fdp->fd_lock); } else { mutex_spin_exit(&kq->kq_lock); } /* knote_detach() drops fdp->fd_lock */ knote_detach(kn, fdp, true); goto done; } /* * The user may change some filter values after the * initial EV_ADD, but doing so will not reset any * filter which have already been triggered. */ knote_foplock_enter(kn); kn->kn_kevent.udata = kev->udata; KASSERT(kn->kn_fop != NULL); if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fop->f_touch != NULL) { mutex_spin_enter(&kq->kq_lock); error = filter_touch(kn, kev, EVENT_REGISTER); mutex_spin_exit(&kq->kq_lock); if (__predict_false(error != 0)) { /* Never a new knote (which would consume newkn). */ KASSERT(newkn != NULL); knote_foplock_exit(kn); goto doneunlock; } } else { kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; } /* * We can get here if we are trying to attach * an event to a file descriptor that does not * support events, and the attach routine is * broken and does not return an error. */ done_ev_add: rv = filter_event(kn, 0, false); if (rv) knote_activate(kn); knote_foplock_exit(kn); /* disable knote */ if ((kev->flags & EV_DISABLE)) { mutex_spin_enter(&kq->kq_lock); if ((kn->kn_status & KN_DISABLED) == 0) kn->kn_status |= KN_DISABLED; mutex_spin_exit(&kq->kq_lock); } /* enable knote */ if ((kev->flags & EV_ENABLE)) { knote_enqueue(kn); } doneunlock: mutex_exit(&fdp->fd_lock); done: rw_exit(&kqueue_filter_lock); if (newkn != NULL) knote_free(newkn); if (fp != NULL) fd_putfile(fd); return (error); } #define KN_FMT(buf, kn) \ (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf) #if defined(DDB) void kqueue_printit(struct kqueue *kq, bool full, void (*pr)(const char *, ...)) { const struct knote *kn; u_int count; int nmarker; char buf[128]; count = 0; nmarker = 0; (*pr)("kqueue %p (restart=%d count=%u):\n", kq, !!(kq->kq_count & KQ_RESTART), KQ_COUNT(kq)); (*pr)(" Queued knotes:\n"); TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) { if (kn->kn_status & KN_MARKER) { nmarker++; } else { count++; } (*pr)(" knote %p: kq=%p status=%s\n", kn, kn->kn_kq, KN_FMT(buf, kn)); (*pr)(" id=0x%lx (%lu) filter=%d\n", (u_long)kn->kn_id, (u_long)kn->kn_id, kn->kn_filter); if (kn->kn_kq != kq) { (*pr)(" !!! kn->kn_kq != kq\n"); } } if (count != KQ_COUNT(kq)) { (*pr)(" !!! 
count(%u) != KQ_COUNT(%u)\n", count, KQ_COUNT(kq)); } } #endif /* DDB */ #if defined(DEBUG) static void kqueue_check(const char *func, size_t line, const struct kqueue *kq) { const struct knote *kn; u_int count; int nmarker; char buf[128]; KASSERT(mutex_owned(&kq->kq_lock)); count = 0; nmarker = 0; TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) { if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) { panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s", func, line, kq, kn, KN_FMT(buf, kn)); } if ((kn->kn_status & KN_MARKER) == 0) { if (kn->kn_kq != kq) { panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s", func, line, kq, kn, kn->kn_kq, KN_FMT(buf, kn)); } if ((kn->kn_status & KN_ACTIVE) == 0) { panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s", func, line, kq, kn, KN_FMT(buf, kn)); } count++; if (count > KQ_COUNT(kq)) { panic("%s,%zu: kq=%p kq->kq_count(%u) != " "count(%d), nmarker=%d", func, line, kq, KQ_COUNT(kq), count, nmarker); } } else { nmarker++; } } } #define kq_check(a) kqueue_check(__func__, __LINE__, (a)) #else /* defined(DEBUG) */ #define kq_check(a) /* nothing */ #endif /* defined(DEBUG) */ static void kqueue_restart(file_t *fp) { struct kqueue *kq = fp->f_kqueue; KASSERT(kq != NULL); mutex_spin_enter(&kq->kq_lock); kq->kq_count |= KQ_RESTART; cv_broadcast(&kq->kq_cv); mutex_spin_exit(&kq->kq_lock); } static int kqueue_fpathconf(struct file *fp, int name, register_t *retval) { return EINVAL; } /* * Scan through the list of events on fp (for a maximum of maxevents), * returning the results in to ulistp. Timeout is determined by tsp; if * NULL, wait indefinitely, if 0 valued, perform a poll, otherwise wait * as appropriate. */ static int kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp, const struct timespec *tsp, register_t *retval, const struct kevent_ops *keops, struct kevent *kevbuf, size_t kevcnt) { struct kqueue *kq; struct kevent *kevp; struct timespec ats, sleepts; struct knote *kn, *marker; struct knote_impl morker; size_t count, nkev, nevents; int timeout, error, touch, rv, influx; filedesc_t *fdp; fdp = curlwp->l_fd; kq = fp->f_kqueue; count = maxevents; nkev = nevents = error = 0; if (count == 0) { *retval = 0; return 0; } if (tsp) { /* timeout supplied */ ats = *tsp; if (inittimeleft(&ats, &sleepts) == -1) { *retval = maxevents; return EINVAL; } timeout = tstohz(&ats); if (timeout <= 0) timeout = -1; /* do poll */ } else { /* no timeout, wait forever */ timeout = 0; } memset(&morker, 0, sizeof(morker)); marker = &morker.ki_knote; marker->kn_kq = kq; marker->kn_status = KN_MARKER; mutex_spin_enter(&kq->kq_lock); retry: kevp = kevbuf; if (KQ_COUNT(kq) == 0) { if (timeout >= 0) { error = cv_timedwait_sig(&kq->kq_cv, &kq->kq_lock, timeout); if (error == 0) { if (KQ_COUNT(kq) == 0 && (kq->kq_count & KQ_RESTART)) { /* return to clear file reference */ error = ERESTART; } else if (tsp == NULL || (timeout = gettimeleft(&ats, &sleepts)) > 0) { goto retry; } } else { /* don't restart after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; } } mutex_spin_exit(&kq->kq_lock); goto done; } /* mark end of knote list */ TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); influx = 0; /* * Acquire the fdp->fd_lock interlock to avoid races with * file creation/destruction from other threads. */ mutex_spin_exit(&kq->kq_lock); relock: mutex_enter(&fdp->fd_lock); mutex_spin_enter(&kq->kq_lock); while (count != 0) { /* * Get next knote. We are guaranteed this will never * be NULL because of the marker we inserted above. 
*/ kn = TAILQ_FIRST(&kq->kq_head); bool kn_is_other_marker = (kn->kn_status & KN_MARKER) != 0 && kn != marker; bool kn_is_detaching = (kn->kn_status & KN_WILLDETACH) != 0; bool kn_is_in_flux = kn_in_flux(kn); /* * If we found a marker that's not ours, or this knote * is in a state of flux, then wait for everything to * settle down and go around again. */ if (kn_is_other_marker || kn_is_detaching || kn_is_in_flux) { if (influx) { influx = 0; KQ_FLUX_WAKEUP(kq); } mutex_exit(&fdp->fd_lock); if (kn_is_other_marker || kn_is_in_flux) { KQ_FLUX_WAIT(kq); mutex_spin_exit(&kq->kq_lock); } else { /* * Detaching but not in-flux? Someone is * actively trying to finish the job; just * go around and try again. */ KASSERT(kn_is_detaching); mutex_spin_exit(&kq->kq_lock); preempt_point(); } goto relock; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if (kn == marker) { /* it's our marker, stop */ KQ_FLUX_WAKEUP(kq); if (count == maxevents) { mutex_exit(&fdp->fd_lock); goto retry; } break; } KASSERT((kn->kn_status & KN_BUSY) == 0); kq_check(kq); kn->kn_status &= ~KN_QUEUED; kn->kn_status |= KN_BUSY; kq_check(kq); if (kn->kn_status & KN_DISABLED) { kn->kn_status &= ~KN_BUSY; kq->kq_count--; /* don't want disabled events */ continue; } if ((kn->kn_flags & EV_ONESHOT) == 0) { mutex_spin_exit(&kq->kq_lock); KASSERT(mutex_owned(&fdp->fd_lock)); knote_foplock_enter(kn); rv = filter_event(kn, 0, false); knote_foplock_exit(kn); mutex_spin_enter(&kq->kq_lock); /* Re-poll if note was re-enqueued. */ if ((kn->kn_status & KN_QUEUED) != 0) { kn->kn_status &= ~KN_BUSY; /* Re-enqueue raised kq_count, lower it again */ kq->kq_count--; influx = 1; continue; } if (rv == 0) { /* * non-ONESHOT event that hasn't triggered * again, so it will remain de-queued. */ kn->kn_status &= ~(KN_ACTIVE|KN_BUSY); kq->kq_count--; influx = 1; continue; } } else { /* * Must NOT drop kq_lock until we can do * the KNOTE_WILLDETACH() below. */ } KASSERT(kn->kn_fop != NULL); touch = (!(kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fop->f_touch != NULL); /* XXXAD should be got from f_event if !oneshot. */ KASSERT((kn->kn_status & KN_WILLDETACH) == 0); if (touch) { (void)filter_touch(kn, kevp, EVENT_PROCESS); } else { *kevp = kn->kn_kevent; } kevp++; nkev++; influx = 1; if (kn->kn_flags & EV_ONESHOT) { /* delete ONESHOT events after retrieval */ KNOTE_WILLDETACH(kn); kn->kn_status &= ~KN_BUSY; kq->kq_count--; KASSERT(kn_in_flux(kn) == false); KASSERT((kn->kn_status & KN_WILLDETACH) != 0); KASSERT(kn->kn_kevent.udata == curlwp); mutex_spin_exit(&kq->kq_lock); knote_detach(kn, fdp, true); mutex_enter(&fdp->fd_lock); mutex_spin_enter(&kq->kq_lock); } else if (kn->kn_flags & EV_CLEAR) { /* clear state after retrieval */ kn->kn_data = 0; kn->kn_fflags = 0; /* * Manually clear knotes who weren't * 'touch'ed. 
*/ if (touch == 0) { kn->kn_data = 0; kn->kn_fflags = 0; } kn->kn_status &= ~(KN_ACTIVE|KN_BUSY); kq->kq_count--; } else if (kn->kn_flags & EV_DISPATCH) { kn->kn_status |= KN_DISABLED; kn->kn_status &= ~(KN_ACTIVE|KN_BUSY); kq->kq_count--; } else { /* add event back on list */ kq_check(kq); kn->kn_status |= KN_QUEUED; kn->kn_status &= ~KN_BUSY; TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kq_check(kq); } if (nkev == kevcnt) { /* do copyouts in kevcnt chunks */ influx = 0; KQ_FLUX_WAKEUP(kq); mutex_spin_exit(&kq->kq_lock); mutex_exit(&fdp->fd_lock); error = (*keops->keo_put_events) (keops->keo_private, kevbuf, ulistp, nevents, nkev); mutex_enter(&fdp->fd_lock); mutex_spin_enter(&kq->kq_lock); nevents += nkev; nkev = 0; kevp = kevbuf; } count--; if (error != 0 || count == 0) { /* remove marker */ TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); break; } } KQ_FLUX_WAKEUP(kq); mutex_spin_exit(&kq->kq_lock); mutex_exit(&fdp->fd_lock); done: if (nkev != 0) { /* copyout remaining events */ error = (*keops->keo_put_events)(keops->keo_private, kevbuf, ulistp, nevents, nkev); } *retval = maxevents - count; return error; } /* * fileops ioctl method for a kqueue descriptor. * * Two ioctls are currently supported. They both use struct kfilter_mapping: * KFILTER_BYNAME find name for filter, and return result in * name, which is of size len. * KFILTER_BYFILTER find filter for name. len is ignored. */ /*ARGSUSED*/ static int kqueue_ioctl(file_t *fp, u_long com, void *data) { struct kfilter_mapping *km; const struct kfilter *kfilter; char *name; int error; km = data; error = 0; name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP); switch (com) { case KFILTER_BYFILTER: /* convert filter -> name */ rw_enter(&kqueue_filter_lock, RW_READER); kfilter = kfilter_byfilter(km->filter); if (kfilter != NULL) { strlcpy(name, kfilter->name, KFILTER_MAXNAME); rw_exit(&kqueue_filter_lock); error = copyoutstr(name, km->name, km->len, NULL); } else { rw_exit(&kqueue_filter_lock); error = ENOENT; } break; case KFILTER_BYNAME: /* convert name -> filter */ error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL); if (error) { break; } rw_enter(&kqueue_filter_lock, RW_READER); kfilter = kfilter_byname(name); if (kfilter != NULL) km->filter = kfilter->filter; else error = ENOENT; rw_exit(&kqueue_filter_lock); break; default: error = ENOTTY; break; } kmem_free(name, KFILTER_MAXNAME); return (error); } /* * fileops fcntl method for a kqueue descriptor. */ static int kqueue_fcntl(file_t *fp, u_int com, void *data) { return (ENOTTY); } /* * fileops poll method for a kqueue descriptor. * Determine if kqueue has events pending. */ static int kqueue_poll(file_t *fp, int events) { struct kqueue *kq; int revents; kq = fp->f_kqueue; revents = 0; if (events & (POLLIN | POLLRDNORM)) { mutex_spin_enter(&kq->kq_lock); if (KQ_COUNT(kq) != 0) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(curlwp, &kq->kq_sel); } kq_check(kq); mutex_spin_exit(&kq->kq_lock); } return revents; } /* * fileops stat method for a kqueue descriptor. * Returns dummy info, with st_size being number of events pending. 
*/ static int kqueue_stat(file_t *fp, struct stat *st) { struct kqueue *kq; kq = fp->f_kqueue; memset(st, 0, sizeof(*st)); st->st_size = KQ_COUNT(kq); st->st_blksize = sizeof(struct kevent); st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR; st->st_blocks = 1; st->st_uid = kauth_cred_geteuid(fp->f_cred); st->st_gid = kauth_cred_getegid(fp->f_cred); return 0; } static void kqueue_doclose(struct kqueue *kq, struct klist *list, int fd) { struct knote *kn; filedesc_t *fdp; fdp = kq->kq_fdp; KASSERT(mutex_owned(&fdp->fd_lock)); again: for (kn = SLIST_FIRST(list); kn != NULL;) { if (kq != kn->kn_kq) { kn = SLIST_NEXT(kn, kn_link); continue; } if (knote_detach_quiesce(kn)) { mutex_enter(&fdp->fd_lock); goto again; } knote_detach(kn, fdp, true); mutex_enter(&fdp->fd_lock); kn = SLIST_FIRST(list); } } /* * fileops close method for a kqueue descriptor. */ static int kqueue_close(file_t *fp) { struct kqueue *kq; filedesc_t *fdp; fdfile_t *ff; int i; kq = fp->f_kqueue; fp->f_kqueue = NULL; fp->f_type = 0; fdp = curlwp->l_fd; KASSERT(kq->kq_fdp == fdp); mutex_enter(&fdp->fd_lock); /* * We're doing to drop the fd_lock multiple times while * we detach knotes. During this time, attempts to register * knotes via the back door (e.g. knote_proc_fork_track()) * need to fail, lest they sneak in to attach a knote after * we've already drained the list it's destined for. * * We must acquire kq_lock here to set KQ_CLOSING (to serialize * with other code paths that modify kq_count without holding * the fd_lock), but once this bit is set, it's only safe to * test it while holding the fd_lock, and holding kq_lock while * doing so is not necessary. */ mutex_enter(&kq->kq_lock); kq->kq_count |= KQ_CLOSING; mutex_exit(&kq->kq_lock); for (i = 0; i <= fdp->fd_lastkqfile; i++) { if ((ff = fdp->fd_dt->dt_ff[i]) == NULL) continue; kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i); } if (fdp->fd_knhashmask != 0) { for (i = 0; i < fdp->fd_knhashmask + 1; i++) { kqueue_doclose(kq, &fdp->fd_knhash[i], -1); } } mutex_exit(&fdp->fd_lock); #if defined(DEBUG) mutex_enter(&kq->kq_lock); kq_check(kq); mutex_exit(&kq->kq_lock); #endif /* DEBUG */ KASSERT(TAILQ_EMPTY(&kq->kq_head)); KASSERT(KQ_COUNT(kq) == 0); mutex_destroy(&kq->kq_lock); cv_destroy(&kq->kq_cv); seldestroy(&kq->kq_sel); kmem_free(kq, sizeof(*kq)); return (0); } /* * struct fileops kqfilter method for a kqueue descriptor. * Event triggered when monitored kqueue changes. */ static int kqueue_kqfilter(file_t *fp, struct knote *kn) { struct kqueue *kq; kq = ((file_t *)kn->kn_obj)->f_kqueue; KASSERT(fp == kn->kn_obj); if (kn->kn_filter != EVFILT_READ) return EINVAL; kn->kn_fop = &kqread_filtops; mutex_enter(&kq->kq_lock); selrecord_knote(&kq->kq_sel, kn); mutex_exit(&kq->kq_lock); return 0; } /* * Walk down a list of knotes, activating them if their event has * triggered. The caller's object lock (e.g. device driver lock) * must be held. */ void knote(struct klist *list, long hint) { struct knote *kn, *tmpkn; SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) { /* * We assume here that the backing object's lock is * already held if we're traversing the klist, and * so acquiring the knote foplock would create a * deadlock scenario. But we also know that the klist * won't disappear on us while we're here, so not * acquiring it is safe. 
*/ if (filter_event(kn, hint, true)) { knote_activate(kn); } } } /* * Remove all knotes referencing a specified fd */ void knote_fdclose(int fd) { struct klist *list; struct knote *kn; filedesc_t *fdp; again: fdp = curlwp->l_fd; mutex_enter(&fdp->fd_lock); list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist; while ((kn = SLIST_FIRST(list)) != NULL) { if (knote_detach_quiesce(kn)) { goto again; } knote_detach(kn, fdp, true); mutex_enter(&fdp->fd_lock); } mutex_exit(&fdp->fd_lock); } /* * Drop knote. Called with fdp->fd_lock held, and will drop before * returning. */ static void knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop) { struct klist *list; struct kqueue *kq; kq = kn->kn_kq; KASSERT((kn->kn_status & KN_MARKER) == 0); KASSERT((kn->kn_status & KN_WILLDETACH) != 0); KASSERT(kn->kn_fop != NULL); KASSERT(mutex_owned(&fdp->fd_lock)); /* Remove from monitored object. */ if (dofop) { knote_foplock_enter(kn); filter_detach(kn); knote_foplock_exit(kn); } /* Remove from descriptor table. */ if (kn->kn_fop->f_flags & FILTEROP_ISFD) list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist; else list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; SLIST_REMOVE(list, kn, knote, kn_link); /* Remove from kqueue. */ again: mutex_spin_enter(&kq->kq_lock); KASSERT(kn_in_flux(kn) == false); if ((kn->kn_status & KN_QUEUED) != 0) { kq_check(kq); KASSERT(KQ_COUNT(kq) != 0); kq->kq_count--; TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq_check(kq); } else if (kn->kn_status & KN_BUSY) { mutex_spin_exit(&kq->kq_lock); goto again; } mutex_spin_exit(&kq->kq_lock); mutex_exit(&fdp->fd_lock); if (kn->kn_fop->f_flags & FILTEROP_ISFD) fd_putfile(kn->kn_id); atomic_dec_uint(&kn->kn_kfilter->refcnt); knote_free(kn); } /* * Queue new event for knote. */ static void knote_enqueue(struct knote *kn) { struct kqueue *kq; KASSERT((kn->kn_status & KN_MARKER) == 0); kq = kn->kn_kq; mutex_spin_enter(&kq->kq_lock); if (__predict_false(kn->kn_status & KN_WILLDETACH)) { /* Don't bother enqueueing a dying knote. */ goto out; } if ((kn->kn_status & KN_DISABLED) != 0) { kn->kn_status &= ~KN_DISABLED; } if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) { kq_check(kq); kn->kn_status |= KN_QUEUED; TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT); kq->kq_count++; kq_check(kq); cv_broadcast(&kq->kq_cv); selnotify(&kq->kq_sel, 0, NOTE_SUBMIT); } out: mutex_spin_exit(&kq->kq_lock); } /* * Queue new event for knote. */ static void knote_activate_locked(struct knote *kn) { struct kqueue *kq; KASSERT((kn->kn_status & KN_MARKER) == 0); kq = kn->kn_kq; if (__predict_false(kn->kn_status & KN_WILLDETACH)) { /* Don't bother enqueueing a dying knote. 
*/ return; } kn->kn_status |= KN_ACTIVE; if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) { kq_check(kq); kn->kn_status |= KN_QUEUED; TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT); kq->kq_count++; kq_check(kq); cv_broadcast(&kq->kq_cv); selnotify(&kq->kq_sel, 0, NOTE_SUBMIT); } } static void knote_activate(struct knote *kn) { struct kqueue *kq = kn->kn_kq; mutex_spin_enter(&kq->kq_lock); knote_activate_locked(kn); mutex_spin_exit(&kq->kq_lock); } static void knote_deactivate_locked(struct knote *kn) { struct kqueue *kq = kn->kn_kq; if (kn->kn_status & KN_QUEUED) { kq_check(kq); kn->kn_status &= ~KN_QUEUED; TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); KASSERT(KQ_COUNT(kq) > 0); kq->kq_count--; kq_check(kq); } kn->kn_status &= ~KN_ACTIVE; } /* * Set EV_EOF on the specified knote. Also allows additional * EV_* flags to be set (e.g. EV_ONESHOT). */ void knote_set_eof(struct knote *kn, uint32_t flags) { struct kqueue *kq = kn->kn_kq; mutex_spin_enter(&kq->kq_lock); kn->kn_flags |= EV_EOF | flags; mutex_spin_exit(&kq->kq_lock); } /* * Clear EV_EOF on the specified knote. */ void knote_clear_eof(struct knote *kn) { struct kqueue *kq = kn->kn_kq; mutex_spin_enter(&kq->kq_lock); kn->kn_flags &= ~EV_EOF; mutex_spin_exit(&kq->kq_lock); } /* * Initialize a klist. */ void klist_init(struct klist *list) { SLIST_INIT(list); } /* * Finalize a klist. */ void klist_fini(struct klist *list) { struct knote *kn; /* * Neuter all existing knotes on the klist because the list is * being destroyed. The caller has guaranteed that no additional * knotes will be added to the list, that the backing object's * locks are not held (otherwise there is a locking order issue * with acquiring the knote foplock ), and that we can traverse * the list safely in this state. */ SLIST_FOREACH(kn, list, kn_selnext) { knote_foplock_enter(kn); KASSERT(kn->kn_fop != NULL); if (kn->kn_fop->f_flags & FILTEROP_ISFD) { kn->kn_fop = &nop_fd_filtops; } else { kn->kn_fop = &nop_filtops; } knote_foplock_exit(kn); } } /* * Insert a knote into a klist. */ void klist_insert(struct klist *list, struct knote *kn) { SLIST_INSERT_HEAD(list, kn, kn_selnext); } /* * Remove a knote from a klist. Returns true if the last * knote was removed and the list is now empty. */ bool klist_remove(struct klist *list, struct knote *kn) { SLIST_REMOVE(list, kn, knote, kn_selnext); return SLIST_EMPTY(list); }
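/*
 * Editor's illustrative sketch (not part of kern_event.c above): a minimal
 * user-space consumer of the kqueue(2)/kevent(2) interface that the kernel
 * code above implements.  It registers an EVFILT_READ event on stdin and
 * waits once for it to fire.  The function name and the 5-second timeout
 * are arbitrary example values, assuming a standard <sys/event.h>
 * environment.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <err.h>

int
example_wait_for_stdin(void)
{
	struct kevent change, event;
	struct timespec timeout = { .tv_sec = 5, .tv_nsec = 0 };
	int kq, n;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* EV_SET fills the changelist entry: ident, filter, flags, ... */
	EV_SET(&change, STDIN_FILENO, EVFILT_READ, EV_ADD | EV_ENABLE,
	    0, 0, NULL);

	/* A single kevent() call both registers the change and waits. */
	n = kevent(kq, &change, 1, &event, 1, &timeout);
	if (n == -1)
		err(1, "kevent");
	else if (n == 0)
		printf("timed out\n");
	else
		printf("%jd bytes ready on stdin\n", (intmax_t)event.data);

	close(kq);
	return n;
}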
/* $NetBSD: bluetooth.h,v 1.12 2014/05/18 14:46:16 rmind Exp $ */ /*- * Copyright (c) 2005 Iain Hibbert. * Copyright (c) 2006 Itronix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of Itronix Inc. may not be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #ifndef _NETBT_BLUETOOTH_H_ #define _NETBT_BLUETOOTH_H_ #include <sys/socket.h> #include <sys/types.h> /* * Bluetooth Address Family Protocol Numbers */ #define BTPROTO_HCI 1 #define BTPROTO_L2CAP 2 #define BTPROTO_RFCOMM 3 #define BTPROTO_SCO 4 /* All sizes are in bytes */ #define BLUETOOTH_BDADDR_SIZE 6 /* * Bluetooth device address */ typedef struct { uint8_t b[BLUETOOTH_BDADDR_SIZE]; } __packed bdaddr_t; /* * bdaddr utility functions */ static __inline int bdaddr_same(const bdaddr_t *a, const bdaddr_t *b) { return (a->b[0] == b->b[0] && a->b[1] == b->b[1] && a->b[2] == b->b[2] && a->b[3] == b->b[3] && a->b[4] == b->b[4] && a->b[5] == b->b[5]); } static __inline int bdaddr_any(const bdaddr_t *a) { return (a->b[0] == 0 && a->b[1] == 0 && a->b[2] == 0 && a->b[3] == 0 && a->b[4] == 0 && a->b[5] == 0); } static __inline void bdaddr_copy(bdaddr_t *d, const bdaddr_t *s) { d->b[0] = s->b[0]; d->b[1] = s->b[1]; d->b[2] = s->b[2]; d->b[3] = s->b[3]; d->b[4] = s->b[4]; d->b[5] = s->b[5]; } /* * Socket address used by Bluetooth protocols */ struct sockaddr_bt { uint8_t bt_len; sa_family_t bt_family; bdaddr_t bt_bdaddr; uint16_t bt_psm; uint8_t bt_channel; uint8_t bt_zero[5]; }; /* Note: this is actually 6 bytes including terminator */ #define BDADDR_ANY ((const bdaddr_t *) "\000\000\000\000\000") #ifdef _KERNEL #include <sys/protosw.h> #include <sys/mallocvar.h> MALLOC_DECLARE(M_BLUETOOTH); /* * Bluetooth Protocol API callback methods */ struct mbuf; struct btproto { void (*connecting)(void *); void (*connected)(void *); void (*disconnected)(void *, int); void *(*newconn)(void *, struct sockaddr_bt *, struct sockaddr_bt *); void (*complete)(void *, int); void (*linkmode)(void *, int); void (*input)(void *, struct mbuf *); }; extern const struct pr_usrreqs hci_usrreqs; extern const struct pr_usrreqs sco_usrreqs; extern const struct pr_usrreqs l2cap_usrreqs; extern const struct pr_usrreqs rfcomm_usrreqs; extern kmutex_t *bt_lock; /* * Debugging stuff */ #ifdef BLUETOOTH_DEBUG extern int bluetooth_debug; # define DPRINTF(...) do { \ if (bluetooth_debug) { \ printf("%s: ", __func__); \ printf(__VA_ARGS__); \ } \ } while (/* CONSTCOND */0) # define DPRINTFN(n, ...) do { \ if (bluetooth_debug > (n)) { \ printf("%s: ", __func__); \ printf(__VA_ARGS__); \ } \ } while (/* CONSTCOND */0) # define UNKNOWN(value) \ printf("%s: %s = %d unknown!\n", __func__, #value, (value)); #else # define DPRINTF(...) ((void)0) # define DPRINTFN(...) ((void)0) # define UNKNOWN(x) ((void)0) #endif /* BLUETOOTH_DEBUG */ #endif /* _KERNEL */ #endif /* _NETBT_BLUETOOTH_H_ */
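/*
 * Editor's illustrative sketch (not part of netbt/bluetooth.h above): how
 * the bdaddr_t and sockaddr_bt definitions above are typically filled in
 * from user space to open an RFCOMM connection.  AF_BLUETOOTH is assumed
 * to be the address family paired with the BTPROTO_* numbers above, the
 * <netbt/bluetooth.h> include path is assumed to be installed, and the
 * function name, remote address and channel are placeholder examples.
 */
#include <sys/socket.h>
#include <netbt/bluetooth.h>
#include <string.h>
#include <unistd.h>
#include <err.h>

int
example_rfcomm_connect(const bdaddr_t *remote, uint8_t channel)
{
	struct sockaddr_bt sa;
	int s;

	if ((s = socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM)) == -1)
		err(1, "socket");

	memset(&sa, 0, sizeof(sa));
	sa.bt_len = sizeof(sa);
	sa.bt_family = AF_BLUETOOTH;
	bdaddr_copy(&sa.bt_bdaddr, remote);	/* inline helper from above */
	sa.bt_channel = channel;		/* RFCOMM server channel */

	if (connect(s, (struct sockaddr *)&sa, sizeof(sa)) == -1) {
		close(s);
		err(1, "connect");
	}
	return s;
}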
/* $NetBSD: tcp_output.c,v 1.219 2023/09/13 15:54:28 bouyer Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). */ /*- * Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation * Facility, NASA Ames Research Center. * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * This code is derived from software contributed to The NetBSD Foundation * by Rui Paulo. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.219 2023/09/13 15:54:28 bouyer Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_tcp_debug.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/errno.h> #include <sys/domain.h> #include <sys/kernel.h> #ifdef TCP_SIGNATURE #include <sys/md5.h> #endif #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/ip_var.h> #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/in6_var.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_pcb.h> #include <netinet6/nd6.h> #endif #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/key.h> #ifdef INET6 #include <netipsec/ipsec6.h> #endif #endif #include <netinet/tcp.h> #define TCPOUTFLAGS #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_private.h> #include <netinet/tcp_congctl.h> #include <netinet/tcp_debug.h> #include <netinet/in_offload.h> #include <netinet6/in6_offload.h> /* * Knob to enable Congestion Window Monitoring, and control * the burst size it allows. Default burst is 4 packets, per * the Internet draft. 
*/ int tcp_cwm = 0; int tcp_cwm_burstsize = 4; int tcp_do_autosndbuf = 1; int tcp_autosndbuf_inc = 8 * 1024; int tcp_autosndbuf_max = 256 * 1024; #ifdef TCP_OUTPUT_COUNTERS #include <sys/device.h> extern struct evcnt tcp_output_bigheader; extern struct evcnt tcp_output_predict_hit; extern struct evcnt tcp_output_predict_miss; extern struct evcnt tcp_output_copysmall; extern struct evcnt tcp_output_copybig; extern struct evcnt tcp_output_refbig; #define TCP_OUTPUT_COUNTER_INCR(ev) (ev)->ev_count++ #else #define TCP_OUTPUT_COUNTER_INCR(ev) /* nothing */ #endif /* TCP_OUTPUT_COUNTERS */ static int tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep, bool *alwaysfragp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = NULL; struct rtentry *rt; struct ifnet *ifp; int size; int hdrlen; int optlen; *alwaysfragp = false; size = tcp_mssdflt; switch (tp->t_family) { case AF_INET: hdrlen = sizeof(struct ip) + sizeof(struct tcphdr); break; #ifdef INET6 case AF_INET6: hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); break; #endif default: hdrlen = 1; /* prevent zero sized segments */ goto out; } rt = inpcb_rtentry(inp); so = inp->inp_socket; if (rt == NULL) { goto out; } ifp = rt->rt_ifp; if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) { #ifdef INET6 if (inp->inp_af == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: if path MTU is * smaller than 1280, use 1280 as packet size and * attach fragment header. */ size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag); *alwaysfragp = true; } else size = rt->rt_rmx.rmx_mtu - hdrlen; #else size = rt->rt_rmx.rmx_mtu - hdrlen; #endif } else if (ifp->if_flags & IFF_LOOPBACK) size = ifp->if_mtu - hdrlen; else if (inp->inp_af == AF_INET && tp->t_mtudisc) size = ifp->if_mtu - hdrlen; else if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp))) size = ifp->if_mtu - hdrlen; #ifdef INET6 else if (inp->inp_af == AF_INET6) { if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) { /* mapped addr case */ struct in_addr d; memcpy(&d, &in6p_faddr(inp).s6_addr32[3], sizeof(d)); if (tp->t_mtudisc || in_localaddr(d)) size = ifp->if_mtu - hdrlen; } else { /* * for IPv6, path MTU discovery is always turned on, * or the node must use packet size <= 1280. */ size = tp->t_mtudisc ? ifp->if_mtu : IPV6_MMTU; size -= hdrlen; } } #endif inpcb_rtentry_unref(rt, inp); out: /* * Now we must make room for whatever extra TCP/IP options are in * the packet. */ optlen = tcp_optlen(tp); /* * XXX tp->t_ourmss should have the right size, but without this code * fragmentation will occur... need more investigation */ if (inp->inp_af == AF_INET) { #if defined(IPSEC) if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND)) optlen += ipsec4_hdrsiz_tcp(tp); #endif optlen += ip_optlen(inp); } #ifdef INET6 if (inp->inp_af == AF_INET6 && tp->t_family == AF_INET) { #if defined(IPSEC) if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND)) optlen += ipsec4_hdrsiz_tcp(tp); #endif /* XXX size -= ip_optlen(in6p); */ } else if (inp->inp_af == AF_INET6) { #if defined(IPSEC) if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND)) optlen += ipsec6_hdrsiz_tcp(tp); #endif optlen += ip6_optlen(inp); } #endif size -= optlen; /* * There may not be any room for data if mtu is too small. This * includes zero-sized. */ if (size <= 0) { return EMSGSIZE; } /* * *rxsegsizep holds *estimated* inbound segment size (estimation * assumes that path MTU is the same for both ways). 
this is only * for silly window avoidance, do not use the value for other purposes. * * ipseclen is subtracted from both sides, this may not be right. * I'm not quite sure about this (could someone comment). */ *txsegsizep = uimin(tp->t_peermss - optlen, size); *rxsegsizep = uimin(tp->t_ourmss - optlen, size); /* * Never send more than half a buffer full. This insures that we can * always keep 2 packets on the wire, no matter what SO_SNDBUF is, and * therefore acks will never be delayed unless we run out of data to * transmit. */ if (so) { *txsegsizep = uimin(so->so_snd.sb_hiwat >> 1, *txsegsizep); } /* * A segment must at least store header + options */ if (*txsegsizep < hdrlen + optlen) { return EMSGSIZE; } if (*txsegsizep != tp->t_segsz) { /* * If the new segment size is larger, we don't want to * mess up the congestion window, but if it is smaller * we'll have to reduce the congestion window to ensure * that we don't get into trouble with initial windows * and the rest. In any case, if the segment size * has changed, chances are the path has, too, and * our congestion window will be different. */ if (*txsegsizep < tp->t_segsz) { tp->snd_cwnd = uimax((tp->snd_cwnd / tp->t_segsz) * *txsegsizep, *txsegsizep); tp->snd_ssthresh = uimax((tp->snd_ssthresh / tp->t_segsz) * *txsegsizep, *txsegsizep); } tp->t_segsz = *txsegsizep; } return 0; } static int tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off, long len, int hdrlen, struct mbuf **mp) { struct mbuf *m, *m0; uint64_t *tcps; tcps = TCP_STAT_GETREF(); if (tp->t_force && len == 1) tcps[TCP_STAT_SNDPROBE]++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { tp->t_sndrexmitpack++; tcps[TCP_STAT_SNDREXMITPACK]++; tcps[TCP_STAT_SNDREXMITBYTE] += len; } else { tcps[TCP_STAT_SNDPACK]++; tcps[TCP_STAT_SNDBYTE] += len; } TCP_STAT_PUTREF(); MGETHDR(m, M_DONTWAIT, MT_HEADER); if (__predict_false(m == NULL)) return ENOBUFS; MCLAIM(m, &tcp_tx_mowner); /* * XXX Because other code assumes headers will fit in * XXX one header mbuf. * * (This code should almost *never* be run.) */ if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) { TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader); MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); return ENOBUFS; } } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * To avoid traversing the whole sb_mb chain for correct * data to send, remember last sent mbuf, its offset and * the sent size. When called the next time, see if the * data to send is directly following the previous transfer. * This is important for large TCP windows. */ if (off == 0 || tp->t_lastm == NULL || (tp->t_lastoff + tp->t_lastlen) != off) { TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss); /* * Either a new packet or a retransmit. * Start from the beginning. 
*/ tp->t_lastm = so->so_snd.sb_mb; tp->t_inoff = off; } else { TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit); tp->t_inoff += tp->t_lastlen; } /* Traverse forward to next packet */ while (tp->t_inoff > 0) { if (tp->t_lastm == NULL) panic("tp->t_lastm == NULL"); if (tp->t_inoff < tp->t_lastm->m_len) break; tp->t_inoff -= tp->t_lastm->m_len; tp->t_lastm = tp->t_lastm->m_next; } tp->t_lastoff = off; tp->t_lastlen = len; m0 = tp->t_lastm; off = tp->t_inoff; if (len <= M_TRAILINGSPACE(m)) { m_copydata(m0, off, (int)len, mtod(m, char *) + hdrlen); m->m_len += len; TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall); } else { m->m_next = m_copym(m0, off, (int)len, M_DONTWAIT); if (m->m_next == NULL) { m_freem(m); return ENOBUFS; } #ifdef TCP_OUTPUT_COUNTERS if (m->m_next->m_flags & M_EXT) TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig); else TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig); #endif } *mp = m; return 0; } /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct rtentry *rt = NULL; struct socket *so; struct route *ro; long len, win; int off, flags, error; struct mbuf *m; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; u_char opt[MAX_TCPOPTLEN], *optp; #define OPT_FITS(more) ((optlen + (more)) <= sizeof(opt)) unsigned optlen, hdrlen, packetlen; unsigned int sack_numblks; int idle, sendalot, txsegsize, rxsegsize; int txsegsize_nosack; int maxburst = TCP_MAXBURST; int af; /* address family on the wire */ int iphdrlen; int has_tso4, has_tso6; int has_tso, use_tso; bool alwaysfrag; int sack_rxmit; int sack_bytes_rxmt; int ecn_tos; struct sackhole *p; #ifdef TCP_SIGNATURE int sigoff = 0; #endif uint64_t *tcps; so = tp->t_inpcb->inp_socket; ro = &tp->t_inpcb->inp_route; switch (af = tp->t_family) { case AF_INET: case AF_INET6: if (tp->t_inpcb) break; return EINVAL; default: return EAFNOSUPPORT; } if (tcp_segsize(tp, &txsegsize, &rxsegsize, &alwaysfrag)) return EMSGSIZE; idle = (tp->snd_max == tp->snd_una); /* * Determine if we can use TCP segmentation offload: * - If we're using IPv4 * - If there is not an IPsec policy that prevents it * - If the interface can do it */ has_tso4 = has_tso6 = false; has_tso4 = tp->t_inpcb->inp_af == AF_INET && #if defined(IPSEC) (!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp, IPSEC_DIR_OUTBOUND)) && #endif (rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL && (rt->rt_ifp->if_capenable & IFCAP_TSOv4) != 0; if (rt != NULL) { rtcache_unref(rt, &tp->t_inpcb->inp_route); rt = NULL; } #if defined(INET6) has_tso6 = tp->t_inpcb->inp_af == AF_INET6 && #if defined(IPSEC) (!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp, IPSEC_DIR_OUTBOUND)) && #endif (rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL && (rt->rt_ifp->if_capenable & IFCAP_TSOv6) != 0; if (rt != NULL) rtcache_unref(rt, &tp->t_inpcb->inp_route); #endif /* defined(INET6) */ has_tso = (has_tso4 || has_tso6) && !alwaysfrag; /* * Restart Window computation. From draft-floyd-incr-init-win-03: * * Optionally, a TCP MAY set the restart window to the * minimum of the value used for the initial window and * the current value of cwnd (in other words, using a * larger value for the restart window should never increase * the size of cwnd). */ if (tcp_cwm) { /* * Hughes/Touch/Heidemann Congestion Window Monitoring. * Count the number of packets currently pending * acknowledgement, and limit our congestion window * to a pre-determined allowed burst size plus that count. 
* This prevents bursting once all pending packets have * been acknowledged (i.e. transmission is idle). * * XXX Link this to Initial Window? */ tp->snd_cwnd = uimin(tp->snd_cwnd, (tcp_cwm_burstsize * txsegsize) + (tp->snd_nxt - tp->snd_una)); } else { if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) { /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- * slow start to get ack "clock" running again. */ int ss = tcp_init_win; if (tp->t_inpcb->inp_af == AF_INET && in_localaddr(in4p_faddr(tp->t_inpcb))) ss = tcp_init_win_local; #ifdef INET6 else if (tp->t_inpcb->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(tp->t_inpcb))) ss = tcp_init_win_local; #endif tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_INITIAL_WINDOW(ss, txsegsize)); } } txsegsize_nosack = txsegsize; again: ecn_tos = 0; use_tso = has_tso; if ((tp->t_flags & (TF_ECN_SND_CWR|TF_ECN_SND_ECE)) != 0) { /* don't duplicate CWR/ECE. */ use_tso = 0; } TCP_REASS_LOCK(tp); sack_numblks = tcp_sack_numblks(tp); if (sack_numblks) { int sackoptlen; sackoptlen = TCP_SACK_OPTLEN(sack_numblks); if (sackoptlen > txsegsize_nosack) { sack_numblks = 0; /* give up SACK */ txsegsize = txsegsize_nosack; } else { if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) { /* don't duplicate D-SACK. */ use_tso = 0; } txsegsize = txsegsize_nosack - sackoptlen; } } else { txsegsize = txsegsize_nosack; } /* * Determine length of data that should be transmitted, and * flags that should be used. If there is some data or critical * controls (SYN, RST) to send, then transmit; otherwise, * investigate further. * * Readjust SACK information to avoid resending duplicate data. */ if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; off = tp->snd_nxt - tp->snd_una; win = uimin(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery, reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; do { long cwin; if (!TCP_SACK_ENABLED(tp)) break; if (tp->t_partialacks < 0) break; p = tcp_sack_output(tp, &sack_bytes_rxmt); if (p == NULL) break; cwin = uimin(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; break; } /* Can rexmit part of the current hole */ len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((long)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; if (off + len > so->so_snd.sb_cc) { /* 1 for TH_FIN */ KASSERT(off + len == so->so_snd.sb_cc + 1); KASSERT(p->rxmit + len == tp->snd_max); len = so->so_snd.sb_cc - off; } if (len > 0) { sack_rxmit = 1; sendalot = 1; } } while (/*CONSTCOND*/0); /* * If in persist timeout with window of 0, send 1 byte. 
* Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_force) { if (win == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unset data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; win = 1; } else { TCP_TIMER_DISARM(tp, TCPT_PERSIST); tp->t_rxtshift = 0; } } if (sack_rxmit == 0) { if (TCP_SACK_ENABLED(tp) && tp->t_partialacks >= 0) { long cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ if (tp->snd_wnd < so->so_snd.sb_cc) { len = tp->snd_wnd - off; flags &= ~TH_FIN; } else { len = so->so_snd.sb_cc - off; } /* * From FreeBSD: * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; if (cwin < len) { len = cwin; flags &= ~TH_FIN; } } } else if (win < so->so_snd.sb_cc) { len = win - off; flags &= ~TH_FIN; } else { len = so->so_snd.sb_cc - off; } } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be -1. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. * * If we have a pending FIN, either it has already been * transmitted or it is outside the window, so drop it. * If the FIN has been transmitted, but this is not a * retransmission, then len must be -1. Therefore we also * prevent here the sending of `gratuitous FINs'. This * eliminates the need to check for that case below (e.g. * to back up snd_nxt before the FIN so that the sequence * number is correct). */ len = 0; flags &= ~TH_FIN; if (win == 0) { TCP_TIMER_DISARM(tp, TCPT_REXMT); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) tcp_setpersist(tp); } } /* * Automatic sizing enables the performance of large buffers * and most of the efficiency of small ones by only allocating * space when it is needed. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. 
This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwidth product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwidth (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. */ if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && so->so_snd.sb_cc < tcp_autosndbuf_max && win >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve(&so->so_snd, uimin(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, tcp_autosndbuf_max), so)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } if (len > txsegsize) { if (use_tso) { /* * Truncate TSO transfers to IP_MAXPACKET, and make * sure that we send equal size transfers down the * stack (rather than big-small-big-small-...). */ #ifdef INET6 CTASSERT(IPV6_MAXPACKET == IP_MAXPACKET); #endif len = (uimin(len, IP_MAXPACKET) / txsegsize) * txsegsize; if (len <= txsegsize) { use_tso = 0; } } else len = txsegsize; flags &= ~TH_FIN; sendalot = 1; } else use_tso = 0; if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } win = sbspace(&so->so_rcv); /* * Sender silly window avoidance. If connection is idle * and can send all data, a maximum segment, * at least a maximum default-size segment do it, * or are forced, do it; otherwise don't bother. * If peer's buffer is tiny, then send * when window is at least half open. * If retransmitting (possibly after persist timer forced us * to send into a small window), then must resend. */ if (len) { if (len >= txsegsize) goto send; if ((so->so_state & SS_MORETOCOME) == 0 && ((idle || tp->t_flags & TF_NODELAY) && len + off >= so->so_snd.sb_cc)) goto send; if (tp->t_force) goto send; if (len >= tp->max_sndwnd / 2) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) goto send; if (sack_rxmit) goto send; } /* * Compare available window to amount of window known to peer * (as advertised window less next expected input). If the * difference is at least twice the size of the largest segment * we expect to receive (i.e. two segments) or at least 50% of * the maximum possible window, then want to send a window update * to peer. */ if (win > 0) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long recwin = uimin(win, (long)TCP_MAXWIN << tp->rcv_scale); long oldwin, adv; /* * rcv_nxt may overtake rcv_adv when we accept a * zero-window probe. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) oldwin = tp->rcv_adv - tp->rcv_nxt; else oldwin = 0; /* * If the new window size ends up being the same as or * less than the old size when it is scaled, then * don't force a window update. */ if (recwin >> tp->rcv_scale <= oldwin >> tp->rcv_scale) goto dontupdate; adv = recwin - oldwin; if (adv >= (long) (2 * rxsegsize)) goto send; if (2 * adv >= (long) so->so_rcv.sb_hiwat) goto send; } dontupdate: /* * Send if we owe peer an ACK. */ if (tp->t_flags & TF_ACKNOW) goto send; if (flags & (TH_SYN|TH_FIN|TH_RST)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. 
*/ if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) && !TCP_TIMER_ISARMED(tp, TCPT_REXMT) && !TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tp->t_timer[TCPT_PERSIST] * is set when we are in persist state. * tp->t_force * is set when we are called to send a persist packet. * tp->t_timer[TCPT_REXMT] * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: TCP_REASS_UNLOCK(tp); return 0; send: /* * Before ESTABLISHED, force sending of initial options unless TCP set * not to do any options. * * Note: we assume that the IP/TCP header plus TCP options always fit * in a single mbuf, leaving room for a maximum link header, i.e.: * max_linkhdr + IP_header + TCP_header + optlen <= MCLBYTES */ optlen = 0; optp = opt; switch (af) { case AF_INET: iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr); break; #ifdef INET6 case AF_INET6: iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); break; #endif default: /*pacify gcc*/ iphdrlen = 0; break; } hdrlen = iphdrlen; if (flags & TH_SYN) { struct rtentry *synrt; synrt = inpcb_rtentry(tp->t_inpcb); tp->snd_nxt = tp->iss; tp->t_ourmss = tcp_mss_to_advertise(synrt != NULL ? synrt->rt_ifp : NULL, af); inpcb_rtentry_unref(synrt, tp->t_inpcb); if ((tp->t_flags & TF_NOOPT) == 0 && OPT_FITS(TCPOLEN_MAXSEG)) { *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; *optp++ = (tp->t_ourmss >> 8) & 0xff; *optp++ = tp->t_ourmss & 0xff; optlen += TCPOLEN_MAXSEG; if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE)) && OPT_FITS(TCPOLEN_WINDOW + TCPOLEN_NOP)) { *((uint32_t *)optp) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optp += TCPOLEN_WINDOW + TCPOLEN_NOP; optlen += TCPOLEN_WINDOW + TCPOLEN_NOP; } if (tcp_do_sack && OPT_FITS(TCPOLEN_SACK_PERMITTED)) { *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; optlen += TCPOLEN_SACK_PERMITTED; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. 
*/ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (flags & TH_RST) == 0 && ((flags & (TH_SYN|TH_ACK)) == TH_SYN || (tp->t_flags & TF_RCVD_TSTMP))) { int alen = 0; while (optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; alen++; } if (OPT_FITS(TCPOLEN_TIMESTAMP)) { *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; uint32_t *lp = (uint32_t *)optp; /* Form timestamp option (appendix A of RFC 1323) */ *lp++ = htonl(TCP_TIMESTAMP(tp)); *lp = htonl(tp->ts_recent); optp += TCPOLEN_TIMESTAMP - 2; optlen += TCPOLEN_TIMESTAMP; /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = TCP_TIMESTAMP(tp); } else { optp -= alen; optlen -= alen; } } #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) { /* * Initialize TCP-MD5 option (RFC2385) */ if (!OPT_FITS(TCPOLEN_SIGNATURE)) goto reset; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; sigoff = optlen + 2; memset(optp, 0, TCP_SIGLEN); optlen += TCPOLEN_SIGNATURE; optp += TCP_SIGLEN; } #endif /* * Tack on the SACK block if it is necessary. */ if (sack_numblks) { int alen = 0; int sack_len = sack_numblks * 8; while (optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; alen++; } if (OPT_FITS(sack_len + 2)) { struct ipqent *tiqe; *optp++ = TCPOPT_SACK; *optp++ = sack_len + 2; uint32_t *lp = (uint32_t *)optp; if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) { sack_numblks--; *lp++ = htonl(tp->rcv_dsack_block.left); *lp++ = htonl(tp->rcv_dsack_block.right); tp->rcv_sack_flags &= ~TCPSACK_HAVED; } for (tiqe = TAILQ_FIRST(&tp->timeq); sack_numblks > 0; tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) { KASSERT(tiqe != NULL); sack_numblks--; *lp++ = htonl(tiqe->ipqe_seq); *lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len + ((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0)); } optlen += sack_len + 2; optp += sack_len; } else { optp -= alen; optlen -= alen; } } /* Terminate and pad TCP options to a 4 byte boundary. */ if (optlen % 4) { if (!OPT_FITS(TCPOLEN_EOL)) { reset: TCP_REASS_UNLOCK(tp); error = ECONNABORTED; goto out; } optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } /* * According to RFC 793 (STD0007): * "The content of the header beyond the End-of-Option option * must be header padding (i.e., zero)." * and later: "The padding is composed of zeros." */ while (optlen % 4) { if (!OPT_FITS(TCPOLEN_PAD)) goto reset; optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } TCP_REASS_UNLOCK(tp); hdrlen += optlen; #ifdef DIAGNOSTIC if (!use_tso && len > txsegsize) panic("tcp data to be sent is larger than segment"); else if (use_tso && len > IP_MAXPACKET) panic("tcp data to be sent is larger than max TSO size"); if (max_linkhdr + hdrlen > MCLBYTES) panic("tcphdr too big"); #endif /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. */ if (len) { error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m); if (error) goto out; /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) 
*/ if (off + len == so->so_snd.sb_cc) flags |= TH_PUSH; } else { tcps = TCP_STAT_GETREF(); if (tp->t_flags & TF_ACKNOW) tcps[TCP_STAT_SNDACKS]++; else if (flags & (TH_SYN|TH_FIN|TH_RST)) tcps[TCP_STAT_SNDCTRL]++; else if (SEQ_GT(tp->snd_up, tp->snd_una)) tcps[TCP_STAT_SNDURG]++; else tcps[TCP_STAT_SNDWINUP]++; TCP_STAT_PUTREF(); MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m != NULL && max_linkhdr + hdrlen > MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m = NULL; } } if (m == NULL) { error = ENOBUFS; goto out; } MCLAIM(m, &tcp_tx_mowner); m->m_data += max_linkhdr; m->m_len = hdrlen; } m_reset_rcvif(m); switch (af) { case AF_INET: ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; #endif th = (struct tcphdr *)(ip + 1); break; #ifdef INET6 case AF_INET6: ip = NULL; ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); break; #endif default: /*pacify gcc*/ ip = NULL; #ifdef INET6 ip6 = NULL; #endif th = NULL; break; } if (tp->t_template == NULL) panic("%s: no template", __func__); if (tp->t_template->m_len < iphdrlen) panic("%s: %d < %d", __func__, tp->t_template->m_len, iphdrlen); bcopy(mtod(tp->t_template, void *), mtod(m, void *), iphdrlen); /* * If we are starting a connection, send ECN setup * SYN packet. If we are on a retransmit, we may * resend those bits a number of times as per * RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) { if (tp->t_flags & TF_SYN_REXMT) { if (tp->t_ecn_retries--) flags |= TH_ECE|TH_CWR; } else { flags |= TH_ECE|TH_CWR; tp->t_ecn_retries = tcp_ecn_maxretries; } } if (TCP_ECN_ALLOWED(tp)) { /* * If the peer has ECN, mark data packets * ECN capable. Ignore pure ack packets, retransmissions * and window probes. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !(tp->t_force && len == 1)) { ecn_tos = IPTOS_ECN_ECT0; TCP_STATINC(TCP_STAT_ECN_ECT); } /* * Reply with proper ECN notifications. */ if (tp->t_flags & TF_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags &= ~TF_ECN_SND_CWR; } if (tp->t_flags & TF_ECN_SND_ECE) { flags |= TH_ECE; } } /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (TCP_SACK_ENABLED(tp) && sack_rxmit) { th->th_seq = htonl(p->rxmit); p->rxmit += len; } else { if (len || (flags & (TH_SYN|TH_FIN)) || TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { memcpy(th + 1, opt, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. 
*/ if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize) win = 0; if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt)) win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt); th->th_win = htons((u_int16_t) (win>>tp->rcv_scale)); if (th->th_win == 0) { tp->t_sndzerowin++; } if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { u_int32_t urp = tp->snd_up - tp->snd_nxt; if (urp > IP_MAXPACKET) urp = IP_MAXPACKET; th->th_urp = htons((u_int16_t)urp); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #ifdef TCP_SIGNATURE if (sigoff && (tp->t_flags & TF_SIGNATURE)) { struct secasvar *sav; u_int8_t *sigp; sav = tcp_signature_getsav(m); if (sav == NULL) { if (m) m_freem(m); return EPERM; } m->m_pkthdr.len = hdrlen + len; sigp = (char *)th + sizeof(*th) + sigoff; tcp_signature(m, th, (char *)th - mtod(m, char *), sav, sigp); key_sa_recordxfer(sav, m); KEY_SA_UNREF(&sav); } #endif /* * Set ourselves up to be checksummed just before the packet * hits the wire. */ switch (af) { case AF_INET: m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); if (use_tso) { m->m_pkthdr.segsz = txsegsize; m->m_pkthdr.csum_flags = M_CSUM_TSOv4; } else { m->m_pkthdr.csum_flags = M_CSUM_TCPv4; if (len + optlen) { /* Fixup the pseudo-header checksum. */ /* XXXJRT Not IP Jumbogram safe. */ th->th_sum = in_cksum_addword(th->th_sum, htons((u_int16_t) (len + optlen))); } } break; #ifdef INET6 case AF_INET6: m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); if (use_tso) { m->m_pkthdr.segsz = txsegsize; m->m_pkthdr.csum_flags = M_CSUM_TSOv6; } else { m->m_pkthdr.csum_flags = M_CSUM_TCPv6; if (len + optlen) { /* Fixup the pseudo-header checksum. */ /* XXXJRT: Not IPv6 Jumbogram safe. */ th->th_sum = in_cksum_addword(th->th_sum, htons((u_int16_t) (len + optlen))); } } break; #endif } /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. * There are no states in which we send both a SYN and a FIN, * so we collapse the tests for these flags. */ if (flags & (TH_SYN|TH_FIN)) tp->snd_nxt++; if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = tcp_now; tp->t_rtseq = startseq; TCP_STATINC(TCP_STAT_SEGSTIMED); } } /* * Set retransmit timer if not currently set, * and not doing an ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. 
*/ timer: if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) { if ((sack_rxmit && tp->snd_nxt != tp->snd_max) || tp->snd_nxt != tp->snd_una) { if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { TCP_TIMER_DISARM(tp, TCPT_PERSIST); tp->t_rxtshift = 0; } TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); } else if (len == 0 && so->so_snd.sb_cc > 0 && TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { /* * If we are sending a window probe and there's * unacked data in the socket, make sure at * least the persist timer is running. */ tp->t_rxtshift = 0; tcp_setpersist(tp); } } } else if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; #ifdef TCP_DEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0); #endif /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ m->m_pkthdr.len = hdrlen + len; switch (af) { case AF_INET: ip->ip_len = htons(m->m_pkthdr.len); packetlen = m->m_pkthdr.len; if (tp->t_inpcb->inp_af == AF_INET) { ip->ip_ttl = in4p_ip(tp->t_inpcb).ip_ttl; ip->ip_tos = in4p_ip(tp->t_inpcb).ip_tos | ecn_tos; } #ifdef INET6 else if (tp->t_inpcb->inp_af == AF_INET6) { ip->ip_ttl = in6pcb_selecthlim(tp->t_inpcb, NULL); /*XXX*/ ip->ip_tos = ecn_tos; /*XXX*/ } #endif break; #ifdef INET6 case AF_INET6: packetlen = m->m_pkthdr.len; ip6->ip6_nxt = IPPROTO_TCP; if (tp->t_family == AF_INET6) { /* * we separately set hoplimit for every segment, since * the user might want to change the value via * setsockopt. Also, desired default hop limit might * be changed via Neighbor Discovery. */ ip6->ip6_hlim = in6pcb_selecthlim_rt(tp->t_inpcb); } ip6->ip6_flow |= htonl(ecn_tos << 20); /* ip6->ip6_flow = ??? (from template) */ /* ip6_plen will be filled in ip6_output(). */ break; #endif default: /*pacify gcc*/ packetlen = 0; break; } switch (af) { case AF_INET: { struct mbuf *opts; if (tp->t_inpcb->inp_af == AF_INET) opts = tp->t_inpcb->inp_options; else opts = NULL; error = ip_output(m, opts, ro, (tp->t_mtudisc ? IP_MTUDISC : 0) | (so->so_options & SO_DONTROUTE), NULL, tp->t_inpcb); break; } #ifdef INET6 case AF_INET6: { struct ip6_pktopts *opts; if (tp->t_inpcb->inp_af == AF_INET6) opts = in6p_outputopts(tp->t_inpcb); else opts = NULL; error = ip6_output(m, opts, ro, so->so_options & SO_DONTROUTE, NULL, tp->t_inpcb, NULL); break; } #endif default: error = EAFNOSUPPORT; break; } if (error) { out: if (error == ENOBUFS) { TCP_STATINC(TCP_STAT_SELFQUENCH); tcp_quench(tp->t_inpcb); error = 0; } else if ((error == EHOSTUNREACH || error == ENETDOWN || error == EHOSTDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; error = 0; } /* Back out the sequence number advance. */ if (sack_rxmit) p->rxmit -= len; /* Restart the delayed ACK timer, if necessary. */ if (tp->t_flags & TF_DELACK) TCP_RESTART_DELACK(tp); return error; } if (packetlen > tp->t_pmtud_mtu_sent) tp->t_pmtud_mtu_sent = packetlen; tcps = TCP_STAT_GETREF(); tcps[TCP_STAT_SNDTOTAL]++; if (tp->t_flags & TF_DELACK) tcps[TCP_STAT_DELACK]++; TCP_STAT_PUTREF(); /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. 
*/ if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + win; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~TF_ACKNOW; TCP_CLEAR_DELACK(tp); #ifdef DIAGNOSTIC if (maxburst < 0) printf("tcp_output: maxburst exceeded by %d\n", -maxburst); #endif if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst)) goto again; return 0; } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2); int nticks; if (TCP_TIMER_ISARMED(tp, TCPT_REXMT)) panic("tcp_output REXMT"); /* * Start/restart persistence timer. */ if (t < tp->t_rttmin) t = tp->t_rttmin; TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN, TCPTV_PERSMAX); TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; }
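/*
 * Editor's note: the following is an illustrative, self-contained
 * user-space sketch of the persist-timer arithmetic performed by
 * tcp_setpersist() above, added for clarity.  It is not part of
 * tcp_output.c.  The backoff table and the persmin/persmax bounds
 * below are assumed placeholder values, not the kernel's tunables.
 */
#include <stdio.h>

static int
persist_ticks(int srtt, int rttvar, int rxtshift)
{
	/* Assumed doubling backoff table (stands in for tcp_backoff[]). */
	static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64 };
	const int maxshift = sizeof(backoff) / sizeof(backoff[0]) - 1;
	const int persmin = 5;		/* assumed lower bound, in ticks */
	const int persmax = 120;	/* assumed upper bound, in ticks */

	/* Same smoothing as tcp_setpersist(): (srtt/4 + rttvar) / 8. */
	int t = ((srtt >> 2) + rttvar) >> (1 + 2);
	int nticks;

	if (rxtshift > maxshift)
		rxtshift = maxshift;
	nticks = t * backoff[rxtshift];

	/* TCPT_RANGESET(): clamp the result into [persmin, persmax]. */
	if (nticks < persmin)
		nticks = persmin;
	else if (nticks > persmax)
		nticks = persmax;
	return nticks;
}

int
main(void)
{
	/* Each retransmit shift roughly doubles the persist interval. */
	for (int shift = 0; shift <= 6; shift++)
		printf("shift %d -> %d ticks\n", shift,
		    persist_ticks(24, 8, shift));
	return 0;
}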
/* $NetBSD: secmodel.c,v 1.2 2014/11/04 16:01:58 maxv Exp $ */ /*- * Copyright (c) 2011 Elad Efrat <elad@NetBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/types.h> #include <sys/param.h> #include <sys/errno.h> #include <sys/atomic.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/queue.h> #include <sys/rwlock.h> #include <secmodel/secmodel.h> #include <prop/proplib.h> /* List of secmodels, parameters, and lock. */ static LIST_HEAD(, secmodel_descr) secmodels = LIST_HEAD_INITIALIZER(secmodels); static unsigned int secmodel_copy_cred_on_fork = false; static krwlock_t secmodels_lock; static int nsecmodels = 0; /* number of registered secmodels */ static int secmodel_plug(secmodel_t); static int secmodel_unplug(secmodel_t); int secmodel_nsecmodels(void) { return nsecmodels; } void secmodel_init(void) { rw_init(&secmodels_lock); secmodel_copy_cred_on_fork = false; } /* * Register a new secmodel.
*/ int secmodel_register(secmodel_t *secmodel, const char *id, const char *name, prop_dictionary_t behavior, secmodel_eval_t eval, secmodel_setinfo_t setinfo) { int err; secmodel_t sm; sm = kmem_alloc(sizeof(*sm), KM_SLEEP); sm->sm_id = id; sm->sm_name = name; sm->sm_behavior = behavior; sm->sm_eval = eval; sm->sm_setinfo = setinfo; err = secmodel_plug(sm); if (err == 0) { atomic_inc_uint(&nsecmodels); } else { kmem_free(sm, sizeof(*sm)); sm = NULL; } *secmodel = sm; return err; } /* * Deregister a secmodel. */ int secmodel_deregister(secmodel_t sm) { int error; error = secmodel_unplug(sm); if (error == 0) { atomic_dec_uint(&nsecmodels); kmem_free(sm, sizeof(*sm)); } return error; } /* * Lookup a secmodel by its id. * * Requires "secmodels_lock" handling by the caller. */ static secmodel_t secmodel_lookup(const char *id) { secmodel_t tsm; KASSERT(rw_lock_held(&secmodels_lock)); LIST_FOREACH(tsm, &secmodels, sm_list) { if (strcasecmp(tsm->sm_id, id) == 0) { return tsm; } } return NULL; } /* * Adjust system-global secmodel behavior following the addition * or removal of a secmodel. * * Requires "secmodels_lock" to be held by the caller. */ static void secmodel_adjust_behavior(secmodel_t sm, bool added) { bool r, b; KASSERT(rw_write_held(&secmodels_lock)); #define ADJUST_COUNTER(which, added) \ do { \ if (added) { \ (which)++; \ } else { \ if ((which) > 0) \ (which)--; \ } \ } while (/*CONSTCOND*/0) /* Copy credentials on fork? */ r = prop_dictionary_get_bool(sm->sm_behavior, "copy-cred-on-fork", &b); if (r) { ADJUST_COUNTER(secmodel_copy_cred_on_fork, added); } #undef ADJUST_COUNTER } static int secmodel_plug(secmodel_t sm) { secmodel_t tsm; int error = 0; if (sm == NULL) return EFAULT; /* Check if the secmodel is already present. */ rw_enter(&secmodels_lock, RW_WRITER); tsm = secmodel_lookup(sm->sm_id); if (tsm != NULL) { error = EEXIST; goto out; } /* Add the secmodel. */ LIST_INSERT_HEAD(&secmodels, sm, sm_list); /* Adjust behavior. */ secmodel_adjust_behavior(sm, true); out: /* Unlock the secmodels list. */ rw_exit(&secmodels_lock); return error; } static int secmodel_unplug(secmodel_t sm) { secmodel_t tsm; int error = 0; if (sm == NULL) return EFAULT; /* Make sure the secmodel is present. */ rw_enter(&secmodels_lock, RW_WRITER); tsm = secmodel_lookup(sm->sm_id); if (tsm == NULL) { error = ENOENT; goto out; } /* Remove the secmodel. */ LIST_REMOVE(tsm, sm_list); /* Adjust behavior. */ secmodel_adjust_behavior(tsm, false); out: /* Unlock the secmodels list. */ rw_exit(&secmodels_lock); return error; } /* XXX TODO */ int secmodel_setinfo(const char *id, void *v, int *err) { return EOPNOTSUPP; } int secmodel_eval(const char *id, const char *what, void *arg, void *ret) { secmodel_t sm; int error = 0; rw_enter(&secmodels_lock, RW_READER); sm = secmodel_lookup(id); if (sm == NULL) { error = EINVAL; goto out; } if (sm->sm_eval == NULL) { error = ENOENT; goto out; } if (ret == NULL) { error = EFAULT; goto out; } error = sm->sm_eval(what, arg, ret); /* pass error from a secmodel(9) callback as a negative value */ error = -error; out: rw_exit(&secmodels_lock); return error; }
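/*
 * Editor's note: an illustrative sketch (not part of secmodel.c) of how
 * a hypothetical security model would use the secmodel_register() /
 * secmodel_deregister() interface above.  The "org.example.secmodel"
 * id, the callback body and the NULL setinfo argument are assumptions
 * made for illustration only.
 */
#include <sys/param.h>
#include <sys/errno.h>
#include <secmodel/secmodel.h>
#include <prop/proplib.h>

static secmodel_t example_sm;

/* Evaluation callback; secmodel_eval() above dispatches to it by id. */
static int
example_eval(const char *what, void *arg, void *ret)
{
	if (strcasecmp(what, "is-example") == 0) {
		*(bool *)ret = true;
		return 0;
	}
	return ENOENT;	/* unknown request */
}

static int
example_secmodel_attach(void)
{
	prop_dictionary_t behavior;

	/* Ask the framework to copy credentials on fork (see above). */
	behavior = prop_dictionary_create();
	prop_dictionary_set_bool(behavior, "copy-cred-on-fork", true);

	return secmodel_register(&example_sm,
	    "org.example.secmodel", "Example security model",
	    behavior, example_eval, NULL);
}

static int
example_secmodel_detach(void)
{
	return secmodel_deregister(example_sm);
}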
/* $NetBSD: kernfs_vfsops.c,v 1.100 2020/04/07 08:35:49 jdolecek Exp $ */ /* * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)kernfs_vfsops.c 8.10 (Berkeley) 5/14/95 */ /* * Kernel params Filesystem */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kernfs_vfsops.c,v 1.100 2020/04/07 08:35:49 jdolecek Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/conf.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/dirent.h> #include <sys/syslog.h> #include <sys/kauth.h> #include <sys/module.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> #include <miscfs/kernfs/kernfs.h> MODULE(MODULE_CLASS_VFS, kernfs, NULL); dev_t rrootdev = NODEV; kmutex_t kfs_lock; VFS_PROTOS(kernfs); void kernfs_get_rrootdev(void); void kernfs_init(void) { mutex_init(&kfs_lock, MUTEX_DEFAULT, IPL_NONE); } void kernfs_reinit(void) { } void kernfs_done(void) { mutex_destroy(&kfs_lock); } void kernfs_get_rrootdev(void) { static int tried = 0; if (tried) { /* Already did it once. */ return; } tried = 1; if (rootdev == NODEV) return; rrootdev = devsw_blk2chr(rootdev); if (rrootdev != NODEV) return; rrootdev = NODEV; printf("kernfs_get_rrootdev: no raw root device\n"); } /* * Mount the Kernel params filesystem */ int kernfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) { struct lwp *l = curlwp; int error = 0; struct kernfs_mount *fmp; if (UIO_MX & (UIO_MX - 1)) { log(LOG_ERR, "kernfs: invalid directory entry size"); return (EINVAL); } if (mp->mnt_flag & MNT_GETARGS) { *data_len = 0; return 0; } /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); fmp = kmem_zalloc(sizeof(struct kernfs_mount), KM_SLEEP); TAILQ_INIT(&fmp->nodelist); mp->mnt_stat.f_namemax = KERNFS_MAXNAMLEN; mp->mnt_flag |= MNT_LOCAL; mp->mnt_data = fmp; vfs_getnewfsid(mp); if ((error = set_statvfs_info(path, UIO_USERSPACE, "kernfs", UIO_SYSSPACE, mp->mnt_op->vfs_name, mp, l)) != 0) { kmem_free(fmp, sizeof(struct kernfs_mount)); return error; } kernfs_get_rrootdev(); return 0; } int kernfs_start(struct mount *mp, int flags) { return (0); } int kernfs_unmount(struct mount *mp, int mntflags) { int error; int flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if ((error = vflush(mp, 0, flags)) != 0) return (error); /* * Finally, throw away the kernfs_mount structure */ kmem_free(mp->mnt_data, sizeof(struct kernfs_mount)); mp->mnt_data = NULL; return (0); } int kernfs_root(struct mount *mp, int lktype, struct vnode **vpp) { const struct kern_target *root_target = &kern_targets[0]; int error; /* setup "." */ error = vcache_get(mp, &root_target, sizeof(root_target), vpp); if (error) return error; error = vn_lock(*vpp, lktype); if (error) { vrele(*vpp); *vpp = NULL; return error; } return 0; } /*ARGSUSED*/ int kernfs_sync(struct mount *mp, int waitfor, kauth_cred_t uc) { return (0); } /* * Kernfs flat namespace lookup. * Currently unsupported. 
*/ int kernfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp) { return (EOPNOTSUPP); } int kernfs_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { const struct kern_target *kt; struct kernfs_node *kfs, *kfsp; long *cookie; KASSERT(key_len == sizeof(kt)); memcpy(&kt, key, key_len); kfs = kmem_zalloc(sizeof(struct kernfs_node), KM_SLEEP); cookie = &(VFSTOKERNFS(mp)->fileno_cookie); mutex_enter(&kfs_lock); again: TAILQ_FOREACH(kfsp, &VFSTOKERNFS(mp)->nodelist, kfs_list) { if (kfsp->kfs_cookie == *cookie) { (*cookie) ++; goto again; } if (TAILQ_NEXT(kfsp, kfs_list)) { if (kfsp->kfs_cookie < *cookie && *cookie < TAILQ_NEXT(kfsp, kfs_list)->kfs_cookie) break; if (kfsp->kfs_cookie + 1 < TAILQ_NEXT(kfsp, kfs_list)->kfs_cookie) { *cookie = kfsp->kfs_cookie + 1; break; } } } kfs->kfs_cookie = *cookie; if (kfsp) TAILQ_INSERT_AFTER(&VFSTOKERNFS(mp)->nodelist, kfsp, kfs, kfs_list); else TAILQ_INSERT_TAIL(&VFSTOKERNFS(mp)->nodelist, kfs, kfs_list); kfs->kfs_type = kt->kt_tag; kfs->kfs_vnode = vp; kfs->kfs_fileno = KERNFS_FILENO(kt, kt->kt_tag, kfs->kfs_cookie); kfs->kfs_kt = kt; kfs->kfs_mode = kt->kt_mode; vp->v_tag = VT_KERNFS; vp->v_op = kernfs_vnodeop_p; vp->v_data = kfs; vp->v_type = kt->kt_vtype; mutex_exit(&kfs_lock); if (kt->kt_tag == KFSkern) vp->v_vflag = VV_ROOT; if (kt->kt_tag == KFSdevice) { vp->v_op = kernfs_specop_p; spec_node_init(vp, *(dev_t *)kt->kt_data); } uvm_vnp_setsize(vp, 0); *new_key = &kfs->kfs_kt; return 0; } extern const struct vnodeopv_desc kernfs_vnodeop_opv_desc; extern const struct vnodeopv_desc kernfs_specop_opv_desc; const struct vnodeopv_desc * const kernfs_vnodeopv_descs[] = { &kernfs_vnodeop_opv_desc, &kernfs_specop_opv_desc, NULL, }; struct vfsops kernfs_vfsops = { .vfs_name = MOUNT_KERNFS, .vfs_min_mount_data = 0, .vfs_mount = kernfs_mount, .vfs_start = kernfs_start, .vfs_unmount = kernfs_unmount, .vfs_root = kernfs_root, .vfs_quotactl = (void *)eopnotsupp, .vfs_statvfs = genfs_statvfs, .vfs_sync = kernfs_sync, .vfs_vget = kernfs_vget, .vfs_loadvnode = kernfs_loadvnode, .vfs_fhtovp = (void *)eopnotsupp, .vfs_vptofh = (void *)eopnotsupp, .vfs_init = kernfs_init, .vfs_reinit = kernfs_reinit, .vfs_done = kernfs_done, .vfs_snapshot = (void *)eopnotsupp, .vfs_extattrctl = vfs_stdextattrctl, .vfs_suspendctl = genfs_suspendctl, .vfs_renamelock_enter = genfs_renamelock_enter, .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_fsync = (void *)eopnotsupp, .vfs_opv_descs = kernfs_vnodeopv_descs }; SYSCTL_SETUP(kernfs_sysctl_setup, "kernfs sysctl") { sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "kernfs", SYSCTL_DESCR("/kern file system"), NULL, 0, NULL, 0, CTL_VFS, 11, CTL_EOL); /* * XXX the "11" above could be dynamic, thereby eliminating one * more instance of the "number to vfs" mapping problem, but * "11" is the order as taken from sys/mount.h */ } static int kernfs_modcmd(modcmd_t cmd, void *arg) { int error; switch (cmd) { case MODULE_CMD_INIT: error = vfs_attach(&kernfs_vfsops); if (error != 0) break; break; case MODULE_CMD_FINI: error = vfs_detach(&kernfs_vfsops); if (error != 0) break; break; default: error = ENOTTY; break; } return (error); }
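/*
 * Editor's note: an illustrative user-space sketch (not part of
 * kernfs_vfsops.c) of the cookie-allocation idea used by
 * kernfs_loadvnode() above: nodes are kept sorted by cookie and the
 * smallest unused value at or after the running hint is handed out.
 * The array representation and the helper name are made up here;
 * kernfs itself walks a TAILQ of kernfs_node structures.
 */
#include <stdio.h>

/* Smallest cookie >= *hint that does not appear in used[] (sorted). */
static long
alloc_cookie(const long *used, int n, long *hint)
{
	long c = *hint;

	for (int i = 0; i < n; i++) {
		if (used[i] < c)
			continue;	/* below the candidate: ignore */
		if (used[i] == c)
			c++;		/* taken: advance the candidate */
		else
			break;		/* gap found before used[i] */
	}
	*hint = c;
	return c;
}

int
main(void)
{
	long used[] = { 0, 1, 2, 5 };	/* cookies already assigned */
	long hint = 0;

	/* Prints 3: the first free slot in the sorted sequence. */
	printf("next cookie: %ld\n",
	    alloc_cookie(used, sizeof(used) / sizeof(used[0]), &hint));
	return 0;
}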
/* $NetBSD: subr_autoconf.c,v 1.314 2023/07/18 11:57:37 riastradh Exp $ */ /* * Copyright (c) 1996, 2000 Christopher G. Demetriou * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the * NetBSD Project. See http://www.NetBSD.org/ for * information about NetBSD. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * --(license Id: LICENSE.proto,v 1.1 2000/06/13 21:40:26 cgd Exp )-- */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratories.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Header: subr_autoconf.c,v 1.12 93/02/01 19:31:48 torek Exp (LBL) * * @(#)subr_autoconf.c 8.3 (Berkeley) 5/17/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_autoconf.c,v 1.314 2023/07/18 11:57:37 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_ddb.h" #include "drvctl.h" #endif #include <sys/param.h> #include <sys/device.h> #include <sys/device_impl.h> #include <sys/disklabel.h> #include <sys/conf.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/errno.h> #include <sys/proc.h> #include <sys/reboot.h> #include <sys/kthread.h> #include <sys/buf.h> #include <sys/dirent.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/unistd.h> #include <sys/fcntl.h> #include <sys/lockf.h> #include <sys/callout.h> #include <sys/devmon.h> #include <sys/cpu.h> #include <sys/sysctl.h> #include <sys/stdarg.h> #include <sys/localcount.h> #include <sys/disk.h> #include <sys/rndsource.h> #include <machine/limits.h> /* * Autoconfiguration subroutines. */ /* * Device autoconfiguration timings are mixed into the entropy pool. */ static krndsource_t rnd_autoconf_source; /* * ioconf.c exports exactly two names: cfdata and cfroots. All system * devices and drivers are found via these tables. */ extern struct cfdata cfdata[]; extern const short cfroots[]; /* * List of all cfdriver structures. We use this to detect duplicates * when other cfdrivers are loaded. */ struct cfdriverlist allcfdrivers = LIST_HEAD_INITIALIZER(&allcfdrivers); extern struct cfdriver * const cfdriver_list_initial[]; /* * Initial list of cfattach's. */ extern const struct cfattachinit cfattachinit[]; /* * List of cfdata tables. We always have one such list -- the one * built statically when the kernel was configured. 
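 * Loadable drivers can append further tables at run time with
 * config_cfdata_attach() below; config_cfdata_detach() removes a
 * table again once every device found through it has been detached.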
*/ struct cftablelist allcftables = TAILQ_HEAD_INITIALIZER(allcftables); static struct cftable initcftable; #define ROOT ((device_t)NULL) struct matchinfo { cfsubmatch_t fn; device_t parent; const int *locs; void *aux; struct cfdata *match; int pri; }; struct alldevs_foray { int af_s; struct devicelist af_garbage; }; /* * Internal version of the cfargs structure; all versions are * canonicalized to this. */ struct cfargs_internal { union { cfsubmatch_t submatch;/* submatch function (direct config) */ cfsearch_t search; /* search function (indirect config) */ }; const char * iattr; /* interface attribute */ const int * locators; /* locators array */ devhandle_t devhandle; /* devhandle_t (by value) */ }; static char *number(char *, int); static void mapply(struct matchinfo *, cfdata_t); static void config_devdelete(device_t); static void config_devunlink(device_t, struct devicelist *); static void config_makeroom(int, struct cfdriver *); static void config_devlink(device_t); static void config_alldevs_enter(struct alldevs_foray *); static void config_alldevs_exit(struct alldevs_foray *); static void config_add_attrib_dict(device_t); static device_t config_attach_internal(device_t, cfdata_t, void *, cfprint_t, const struct cfargs_internal *); static void config_collect_garbage(struct devicelist *); static void config_dump_garbage(struct devicelist *); static void pmflock_debug(device_t, const char *, int); static device_t deviter_next1(deviter_t *); static void deviter_reinit(deviter_t *); struct deferred_config { TAILQ_ENTRY(deferred_config) dc_queue; device_t dc_dev; void (*dc_func)(device_t); }; TAILQ_HEAD(deferred_config_head, deferred_config); static struct deferred_config_head deferred_config_queue = TAILQ_HEAD_INITIALIZER(deferred_config_queue); static struct deferred_config_head interrupt_config_queue = TAILQ_HEAD_INITIALIZER(interrupt_config_queue); static int interrupt_config_threads = 8; static struct deferred_config_head mountroot_config_queue = TAILQ_HEAD_INITIALIZER(mountroot_config_queue); static int mountroot_config_threads = 2; static lwp_t **mountroot_config_lwpids; static size_t mountroot_config_lwpids_size; bool root_is_mounted = false; static void config_process_deferred(struct deferred_config_head *, device_t); /* Hooks to finalize configuration once all real devices have been found. */ struct finalize_hook { TAILQ_ENTRY(finalize_hook) f_list; int (*f_func)(device_t); device_t f_dev; }; static TAILQ_HEAD(, finalize_hook) config_finalize_list = TAILQ_HEAD_INITIALIZER(config_finalize_list); static int config_finalize_done; /* list of all devices */ static struct devicelist alldevs = TAILQ_HEAD_INITIALIZER(alldevs); static kmutex_t alldevs_lock __cacheline_aligned; static devgen_t alldevs_gen = 1; static int alldevs_nread = 0; static int alldevs_nwrite = 0; static bool alldevs_garbage = false; static struct devicelist config_pending = TAILQ_HEAD_INITIALIZER(config_pending); static kmutex_t config_misc_lock; static kcondvar_t config_misc_cv; static bool detachall = false; #define STREQ(s1, s2) \ (*(s1) == *(s2) && strcmp((s1), (s2)) == 0) static bool config_initialized = false; /* config_init() has been called. 
*/ static int config_do_twiddle; static callout_t config_twiddle_ch; static void sysctl_detach_setup(struct sysctllog **); int no_devmon_insert(const char *, prop_dictionary_t); int (*devmon_insert_vec)(const char *, prop_dictionary_t) = no_devmon_insert; typedef int (*cfdriver_fn)(struct cfdriver *); static int frob_cfdrivervec(struct cfdriver * const *cfdriverv, cfdriver_fn drv_do, cfdriver_fn drv_undo, const char *style, bool dopanic) { void (*pr)(const char *, ...) __printflike(1, 2) = dopanic ? panic : printf; int i, error = 0, e2 __diagused; for (i = 0; cfdriverv[i] != NULL; i++) { if ((error = drv_do(cfdriverv[i])) != 0) { pr("configure: `%s' driver %s failed: %d", cfdriverv[i]->cd_name, style, error); goto bad; } } KASSERT(error == 0); return 0; bad: printf("\n"); for (i--; i >= 0; i--) { e2 = drv_undo(cfdriverv[i]); KASSERT(e2 == 0); } return error; } typedef int (*cfattach_fn)(const char *, struct cfattach *); static int frob_cfattachvec(const struct cfattachinit *cfattachv, cfattach_fn att_do, cfattach_fn att_undo, const char *style, bool dopanic) { const struct cfattachinit *cfai = NULL; void (*pr)(const char *, ...) __printflike(1, 2) = dopanic ? panic : printf; int j = 0, error = 0, e2 __diagused; for (cfai = &cfattachv[0]; cfai->cfai_name != NULL; cfai++) { for (j = 0; cfai->cfai_list[j] != NULL; j++) { if ((error = att_do(cfai->cfai_name, cfai->cfai_list[j])) != 0) { pr("configure: attachment `%s' " "of `%s' driver %s failed: %d", cfai->cfai_list[j]->ca_name, cfai->cfai_name, style, error); goto bad; } } } KASSERT(error == 0); return 0; bad: /* * Rollback in reverse order. dunno if super-important, but * do that anyway. Although the code looks a little like * someone did a little integration (in the math sense). */ printf("\n"); if (cfai) { bool last; for (last = false; last == false; ) { if (cfai == &cfattachv[0]) last = true; for (j--; j >= 0; j--) { e2 = att_undo(cfai->cfai_name, cfai->cfai_list[j]); KASSERT(e2 == 0); } if (!last) { cfai--; for (j = 0; cfai->cfai_list[j] != NULL; j++) ; } } } return error; } /* * Initialize the autoconfiguration data structures. Normally this * is done by configure(), but some platforms need to do this very * early (to e.g. initialize the console). */ void config_init(void) { KASSERT(config_initialized == false); mutex_init(&alldevs_lock, MUTEX_DEFAULT, IPL_VM); mutex_init(&config_misc_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&config_misc_cv, "cfgmisc"); callout_init(&config_twiddle_ch, CALLOUT_MPSAFE); frob_cfdrivervec(cfdriver_list_initial, config_cfdriver_attach, NULL, "bootstrap", true); frob_cfattachvec(cfattachinit, config_cfattach_attach, NULL, "bootstrap", true); initcftable.ct_cfdata = cfdata; TAILQ_INSERT_TAIL(&allcftables, &initcftable, ct_list); rnd_attach_source(&rnd_autoconf_source, "autoconf", RND_TYPE_UNKNOWN, RND_FLAG_COLLECT_TIME); config_initialized = true; } /* * Init or fini drivers and attachments. Either all or none * are processed (via rollback). It would be nice if this were * atomic to outside consumers, but with the current state of * locking ... 
*/ int config_init_component(struct cfdriver * const *cfdriverv, const struct cfattachinit *cfattachv, struct cfdata *cfdatav) { int error; KERNEL_LOCK(1, NULL); if ((error = frob_cfdrivervec(cfdriverv, config_cfdriver_attach, config_cfdriver_detach, "init", false))!= 0) goto out; if ((error = frob_cfattachvec(cfattachv, config_cfattach_attach, config_cfattach_detach, "init", false)) != 0) { frob_cfdrivervec(cfdriverv, config_cfdriver_detach, NULL, "init rollback", true); goto out; } if ((error = config_cfdata_attach(cfdatav, 1)) != 0) { frob_cfattachvec(cfattachv, config_cfattach_detach, NULL, "init rollback", true); frob_cfdrivervec(cfdriverv, config_cfdriver_detach, NULL, "init rollback", true); goto out; } /* Success! */ error = 0; out: KERNEL_UNLOCK_ONE(NULL); return error; } int config_fini_component(struct cfdriver * const *cfdriverv, const struct cfattachinit *cfattachv, struct cfdata *cfdatav) { int error; KERNEL_LOCK(1, NULL); if ((error = config_cfdata_detach(cfdatav)) != 0) goto out; if ((error = frob_cfattachvec(cfattachv, config_cfattach_detach, config_cfattach_attach, "fini", false)) != 0) { if (config_cfdata_attach(cfdatav, 0) != 0) panic("config_cfdata fini rollback failed"); goto out; } if ((error = frob_cfdrivervec(cfdriverv, config_cfdriver_detach, config_cfdriver_attach, "fini", false)) != 0) { frob_cfattachvec(cfattachv, config_cfattach_attach, NULL, "fini rollback", true); if (config_cfdata_attach(cfdatav, 0) != 0) panic("config_cfdata fini rollback failed"); goto out; } /* Success! */ error = 0; out: KERNEL_UNLOCK_ONE(NULL); return error; } void config_init_mi(void) { if (!config_initialized) config_init(); sysctl_detach_setup(NULL); } void config_deferred(device_t dev) { KASSERT(KERNEL_LOCKED_P()); config_process_deferred(&deferred_config_queue, dev); config_process_deferred(&interrupt_config_queue, dev); config_process_deferred(&mountroot_config_queue, dev); } static void config_interrupts_thread(void *cookie) { struct deferred_config *dc; device_t dev; mutex_enter(&config_misc_lock); while ((dc = TAILQ_FIRST(&interrupt_config_queue)) != NULL) { TAILQ_REMOVE(&interrupt_config_queue, dc, dc_queue); mutex_exit(&config_misc_lock); dev = dc->dc_dev; (*dc->dc_func)(dev); if (!device_pmf_is_registered(dev)) aprint_debug_dev(dev, "WARNING: power management not supported\n"); config_pending_decr(dev); kmem_free(dc, sizeof(*dc)); mutex_enter(&config_misc_lock); } mutex_exit(&config_misc_lock); kthread_exit(0); } void config_create_interruptthreads(void) { int i; for (i = 0; i < interrupt_config_threads; i++) { (void)kthread_create(PRI_NONE, 0/*XXXSMP */, NULL, config_interrupts_thread, NULL, NULL, "configintr"); } } static void config_mountroot_thread(void *cookie) { struct deferred_config *dc; mutex_enter(&config_misc_lock); while ((dc = TAILQ_FIRST(&mountroot_config_queue)) != NULL) { TAILQ_REMOVE(&mountroot_config_queue, dc, dc_queue); mutex_exit(&config_misc_lock); (*dc->dc_func)(dc->dc_dev); kmem_free(dc, sizeof(*dc)); mutex_enter(&config_misc_lock); } mutex_exit(&config_misc_lock); kthread_exit(0); } void config_create_mountrootthreads(void) { int i; if (!root_is_mounted) root_is_mounted = true; mountroot_config_lwpids_size = sizeof(mountroot_config_lwpids) * mountroot_config_threads; mountroot_config_lwpids = kmem_alloc(mountroot_config_lwpids_size, KM_NOSLEEP); KASSERT(mountroot_config_lwpids); for (i = 0; i < mountroot_config_threads; i++) { mountroot_config_lwpids[i] = 0; (void)kthread_create(PRI_NONE, KTHREAD_MUSTJOIN/* XXXSMP */, NULL, config_mountroot_thread, 
NULL, &mountroot_config_lwpids[i], "configroot"); } } void config_finalize_mountroot(void) { int i, error; for (i = 0; i < mountroot_config_threads; i++) { if (mountroot_config_lwpids[i] == 0) continue; error = kthread_join(mountroot_config_lwpids[i]); if (error) printf("%s: thread %x joined with error %d\n", __func__, i, error); } kmem_free(mountroot_config_lwpids, mountroot_config_lwpids_size); } /* * Announce device attach/detach to userland listeners. */ int no_devmon_insert(const char *name, prop_dictionary_t p) { return ENODEV; } static void devmon_report_device(device_t dev, bool isattach) { prop_dictionary_t ev, dict = device_properties(dev); const char *parent; const char *what; const char *where; device_t pdev = device_parent(dev); /* If currently no drvctl device, just return */ if (devmon_insert_vec == no_devmon_insert) return; ev = prop_dictionary_create(); if (ev == NULL) return; what = (isattach ? "device-attach" : "device-detach"); parent = (pdev == NULL ? "root" : device_xname(pdev)); if (prop_dictionary_get_string(dict, "location", &where)) { prop_dictionary_set_string(ev, "location", where); aprint_debug("ev: %s %s at %s in [%s]\n", what, device_xname(dev), parent, where); } if (!prop_dictionary_set_string(ev, "device", device_xname(dev)) || !prop_dictionary_set_string(ev, "parent", parent)) { prop_object_release(ev); return; } if ((*devmon_insert_vec)(what, ev) != 0) prop_object_release(ev); } /* * Add a cfdriver to the system. */ int config_cfdriver_attach(struct cfdriver *cd) { struct cfdriver *lcd; /* Make sure this driver isn't already in the system. */ LIST_FOREACH(lcd, &allcfdrivers, cd_list) { if (STREQ(lcd->cd_name, cd->cd_name)) return EEXIST; } LIST_INIT(&cd->cd_attach); LIST_INSERT_HEAD(&allcfdrivers, cd, cd_list); return 0; } /* * Remove a cfdriver from the system. */ int config_cfdriver_detach(struct cfdriver *cd) { struct alldevs_foray af; int i, rc = 0; config_alldevs_enter(&af); /* Make sure there are no active instances. */ for (i = 0; i < cd->cd_ndevs; i++) { if (cd->cd_devs[i] != NULL) { rc = EBUSY; break; } } config_alldevs_exit(&af); if (rc != 0) return rc; /* ...and no attachments loaded. */ if (LIST_EMPTY(&cd->cd_attach) == 0) return EBUSY; LIST_REMOVE(cd, cd_list); KASSERT(cd->cd_devs == NULL); return 0; } /* * Look up a cfdriver by name. */ struct cfdriver * config_cfdriver_lookup(const char *name) { struct cfdriver *cd; LIST_FOREACH(cd, &allcfdrivers, cd_list) { if (STREQ(cd->cd_name, name)) return cd; } return NULL; } /* * Add a cfattach to the specified driver. */ int config_cfattach_attach(const char *driver, struct cfattach *ca) { struct cfattach *lca; struct cfdriver *cd; cd = config_cfdriver_lookup(driver); if (cd == NULL) return ESRCH; /* Make sure this attachment isn't already on this driver. */ LIST_FOREACH(lca, &cd->cd_attach, ca_list) { if (STREQ(lca->ca_name, ca->ca_name)) return EEXIST; } LIST_INSERT_HEAD(&cd->cd_attach, ca, ca_list); return 0; } /* * Remove a cfattach from the specified driver. */ int config_cfattach_detach(const char *driver, struct cfattach *ca) { struct alldevs_foray af; struct cfdriver *cd; device_t dev; int i, rc = 0; cd = config_cfdriver_lookup(driver); if (cd == NULL) return ESRCH; config_alldevs_enter(&af); /* Make sure there are no active instances. */ for (i = 0; i < cd->cd_ndevs; i++) { if ((dev = cd->cd_devs[i]) == NULL) continue; if (dev->dv_cfattach == ca) { rc = EBUSY; break; } } config_alldevs_exit(&af); if (rc != 0) return rc; LIST_REMOVE(ca, ca_list); return 0; } /* * Look up a cfattach by name. 
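 * (Illustrative note: a driver such as "com" may carry several
 * attachments, e.g. "com_isa" and "com_puc"; the atname argument
 * selects among them.)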
*/ static struct cfattach * config_cfattach_lookup_cd(struct cfdriver *cd, const char *atname) { struct cfattach *ca; LIST_FOREACH(ca, &cd->cd_attach, ca_list) { if (STREQ(ca->ca_name, atname)) return ca; } return NULL; } /* * Look up a cfattach by driver/attachment name. */ struct cfattach * config_cfattach_lookup(const char *name, const char *atname) { struct cfdriver *cd; cd = config_cfdriver_lookup(name); if (cd == NULL) return NULL; return config_cfattach_lookup_cd(cd, atname); } /* * Apply the matching function and choose the best. This is used * a few times and we want to keep the code small. */ static void mapply(struct matchinfo *m, cfdata_t cf) { int pri; if (m->fn != NULL) { pri = (*m->fn)(m->parent, cf, m->locs, m->aux); } else { pri = config_match(m->parent, cf, m->aux); } if (pri > m->pri) { m->match = cf; m->pri = pri; } } int config_stdsubmatch(device_t parent, cfdata_t cf, const int *locs, void *aux) { const struct cfiattrdata *ci; const struct cflocdesc *cl; int nlocs, i; ci = cfiattr_lookup(cfdata_ifattr(cf), parent->dv_cfdriver); KASSERT(ci); nlocs = ci->ci_loclen; KASSERT(!nlocs || locs); for (i = 0; i < nlocs; i++) { cl = &ci->ci_locdesc[i]; if (cl->cld_defaultstr != NULL && cf->cf_loc[i] == cl->cld_default) continue; if (cf->cf_loc[i] == locs[i]) continue; return 0; } return config_match(parent, cf, aux); } /* * Helper function: check whether the driver supports the interface attribute * and return its descriptor structure. */ static const struct cfiattrdata * cfdriver_get_iattr(const struct cfdriver *cd, const char *ia) { const struct cfiattrdata * const *cpp; if (cd->cd_attrs == NULL) return 0; for (cpp = cd->cd_attrs; *cpp; cpp++) { if (STREQ((*cpp)->ci_name, ia)) { /* Match. */ return *cpp; } } return 0; } static int __diagused cfdriver_iattr_count(const struct cfdriver *cd) { const struct cfiattrdata * const *cpp; int i; if (cd->cd_attrs == NULL) return 0; for (i = 0, cpp = cd->cd_attrs; *cpp; cpp++) { i++; } return i; } /* * Lookup an interface attribute description by name. * If the driver is given, consider only its supported attributes. */ const struct cfiattrdata * cfiattr_lookup(const char *name, const struct cfdriver *cd) { const struct cfdriver *d; const struct cfiattrdata *ia; if (cd) return cfdriver_get_iattr(cd, name); LIST_FOREACH(d, &allcfdrivers, cd_list) { ia = cfdriver_get_iattr(d, name); if (ia) return ia; } return 0; } /* * Determine if `parent' is a potential parent for a device spec based * on `cfp'. */ static int cfparent_match(const device_t parent, const struct cfparent *cfp) { struct cfdriver *pcd; /* We don't match root nodes here. */ if (cfp == NULL) return 0; pcd = parent->dv_cfdriver; KASSERT(pcd != NULL); /* * First, ensure this parent has the correct interface * attribute. */ if (!cfdriver_get_iattr(pcd, cfp->cfp_iattr)) return 0; /* * If no specific parent device instance was specified (i.e. * we're attaching to the attribute only), we're done! */ if (cfp->cfp_parent == NULL) return 1; /* * Check the parent device's name. */ if (STREQ(pcd->cd_name, cfp->cfp_parent) == 0) return 0; /* not the same parent */ /* * Make sure the unit number matches. */ if (cfp->cfp_unit == DVUNIT_ANY || /* wildcard */ cfp->cfp_unit == parent->dv_unit) return 1; /* Unit numbers don't match. */ return 0; } /* * Helper for config_cfdata_attach(): check all devices whether it could be * parent any attachment in the config data table passed, and rescan. 
*/ static void rescan_with_cfdata(const struct cfdata *cf) { device_t d; const struct cfdata *cf1; deviter_t di; KASSERT(KERNEL_LOCKED_P()); /* * "alldevs" is likely longer than a modules's cfdata, so make it * the outer loop. */ for (d = deviter_first(&di, 0); d != NULL; d = deviter_next(&di)) { if (!(d->dv_cfattach->ca_rescan)) continue; for (cf1 = cf; cf1->cf_name; cf1++) { if (!cfparent_match(d, cf1->cf_pspec)) continue; (*d->dv_cfattach->ca_rescan)(d, cfdata_ifattr(cf1), cf1->cf_loc); config_deferred(d); } } deviter_release(&di); } /* * Attach a supplemental config data table and rescan potential * parent devices if required. */ int config_cfdata_attach(cfdata_t cf, int scannow) { struct cftable *ct; KERNEL_LOCK(1, NULL); ct = kmem_alloc(sizeof(*ct), KM_SLEEP); ct->ct_cfdata = cf; TAILQ_INSERT_TAIL(&allcftables, ct, ct_list); if (scannow) rescan_with_cfdata(cf); KERNEL_UNLOCK_ONE(NULL); return 0; } /* * Helper for config_cfdata_detach: check whether a device is * found through any attachment in the config data table. */ static int dev_in_cfdata(device_t d, cfdata_t cf) { const struct cfdata *cf1; for (cf1 = cf; cf1->cf_name; cf1++) if (d->dv_cfdata == cf1) return 1; return 0; } /* * Detach a supplemental config data table. Detach all devices found * through that table (and thus keeping references to it) before. */ int config_cfdata_detach(cfdata_t cf) { device_t d; int error = 0; struct cftable *ct; deviter_t di; KERNEL_LOCK(1, NULL); for (d = deviter_first(&di, DEVITER_F_RW); d != NULL; d = deviter_next(&di)) { if (!dev_in_cfdata(d, cf)) continue; if ((error = config_detach(d, 0)) != 0) break; } deviter_release(&di); if (error) { aprint_error_dev(d, "unable to detach instance\n"); goto out; } TAILQ_FOREACH(ct, &allcftables, ct_list) { if (ct->ct_cfdata == cf) { TAILQ_REMOVE(&allcftables, ct, ct_list); kmem_free(ct, sizeof(*ct)); error = 0; goto out; } } /* not found -- shouldn't happen */ error = EINVAL; out: KERNEL_UNLOCK_ONE(NULL); return error; } /* * Invoke the "match" routine for a cfdata entry on behalf of * an external caller, usually a direct config "submatch" routine. */ int config_match(device_t parent, cfdata_t cf, void *aux) { struct cfattach *ca; KASSERT(KERNEL_LOCKED_P()); ca = config_cfattach_lookup(cf->cf_name, cf->cf_atname); if (ca == NULL) { /* No attachment for this entry, oh well. */ return 0; } return (*ca->ca_match)(parent, cf, aux); } /* * Invoke the "probe" routine for a cfdata entry on behalf of * an external caller, usually an indirect config "search" routine. */ int config_probe(device_t parent, cfdata_t cf, void *aux) { /* * This is currently a synonym for config_match(), but this * is an implementation detail; "match" and "probe" routines * have different behaviors. * * XXX config_probe() should return a bool, because there is * XXX no match score for probe -- it's either there or it's * XXX not, but some ports abuse the return value as a way * XXX to attach "critical" devices before "non-critical" * XXX devices. */ return config_match(parent, cf, aux); } static struct cfargs_internal * cfargs_canonicalize(const struct cfargs * const cfargs, struct cfargs_internal * const store) { struct cfargs_internal *args = store; memset(args, 0, sizeof(*args)); /* If none specified, are all-NULL pointers are good. */ if (cfargs == NULL) { return args; } /* * Only one arguments version is recognized at this time. 
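 * (Illustrative note, not from the original source: callers are
 * expected to build the argument block with the CFARGS()/CFARGS_NONE
 * macros, which fill in cfargs_version for them.)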
*/ if (cfargs->cfargs_version != CFARGS_VERSION) { panic("cfargs_canonicalize: unknown version %lu\n", (unsigned long)cfargs->cfargs_version); } /* * submatch and search are mutually-exclusive. */ if (cfargs->submatch != NULL && cfargs->search != NULL) { panic("cfargs_canonicalize: submatch and search are " "mutually-exclusive"); } if (cfargs->submatch != NULL) { args->submatch = cfargs->submatch; } else if (cfargs->search != NULL) { args->search = cfargs->search; } args->iattr = cfargs->iattr; args->locators = cfargs->locators; args->devhandle = cfargs->devhandle; return args; } /* * Iterate over all potential children of some device, calling the given * function (default being the child's match function) for each one. * Nonzero returns are matches; the highest value returned is considered * the best match. Return the `found child' if we got a match, or NULL * otherwise. The `aux' pointer is simply passed on through. * * Note that this function is designed so that it can be used to apply * an arbitrary function to all potential children (its return value * can be ignored). */ static cfdata_t config_search_internal(device_t parent, void *aux, const struct cfargs_internal * const args) { struct cftable *ct; cfdata_t cf; struct matchinfo m; KASSERT(config_initialized); KASSERTMSG((!args->iattr || cfdriver_get_iattr(parent->dv_cfdriver, args->iattr)), "%s searched for child at interface attribute %s," " but device %s(4) has no such interface attribute in config(5)", device_xname(parent), args->iattr, parent->dv_cfdriver->cd_name); KASSERTMSG((args->iattr || cfdriver_iattr_count(parent->dv_cfdriver) < 2), "%s searched for child without interface attribute," " needed to disambiguate among the %d declared for in %s(4)" " in config(5)", device_xname(parent), cfdriver_iattr_count(parent->dv_cfdriver), parent->dv_cfdriver->cd_name); m.fn = args->submatch; /* N.B. union */ m.parent = parent; m.locs = args->locators; m.aux = aux; m.match = NULL; m.pri = 0; TAILQ_FOREACH(ct, &allcftables, ct_list) { for (cf = ct->ct_cfdata; cf->cf_name; cf++) { /* We don't match root nodes here. */ if (!cf->cf_pspec) continue; /* * Skip cf if no longer eligible, otherwise scan * through parents for one matching `parent', and * try match function. */ if (cf->cf_fstate == FSTATE_FOUND) continue; if (cf->cf_fstate == FSTATE_DNOTFOUND || cf->cf_fstate == FSTATE_DSTAR) continue; /* * If an interface attribute was specified, * consider only children which attach to * that attribute. */ if (args->iattr != NULL && !STREQ(args->iattr, cfdata_ifattr(cf))) continue; if (cfparent_match(parent, cf->cf_pspec)) mapply(&m, cf); } } rnd_add_uint32(&rnd_autoconf_source, 0); return m.match; } cfdata_t config_search(device_t parent, void *aux, const struct cfargs *cfargs) { cfdata_t cf; struct cfargs_internal store; cf = config_search_internal(parent, aux, cfargs_canonicalize(cfargs, &store)); return cf; } /* * Find the given root device. * This is much like config_search, but there is no parent. * Don't bother with multiple cfdata tables; the root node * must always be in the initial table. */ cfdata_t config_rootsearch(cfsubmatch_t fn, const char *rootname, void *aux) { cfdata_t cf; const short *p; struct matchinfo m; m.fn = fn; m.parent = ROOT; m.aux = aux; m.match = NULL; m.pri = 0; m.locs = 0; /* * Look at root entries for matching name. We do not bother * with found-state here since only one root should ever be * searched (and it must be done first). 
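 * (Illustrative note: on most ports the lone root entry is the
 * machine-dependent "mainbus", so this typically resolves the
 * config_rootfound("mainbus", ...) call issued during cpu_configure().)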
*/ for (p = cfroots; *p >= 0; p++) { cf = &cfdata[*p]; if (strcmp(cf->cf_name, rootname) == 0) mapply(&m, cf); } return m.match; } static const char * const msgs[] = { [QUIET] = "", [UNCONF] = " not configured\n", [UNSUPP] = " unsupported\n", }; /* * The given `aux' argument describes a device that has been found * on the given parent, but not necessarily configured. Locate the * configuration data for that device (using the submatch function * provided, or using candidates' cd_match configuration driver * functions) and attach it, and return its device_t. If the device was * not configured, call the given `print' function and return NULL. */ device_t config_found_acquire(device_t parent, void *aux, cfprint_t print, const struct cfargs * const cfargs) { cfdata_t cf; struct cfargs_internal store; const struct cfargs_internal * const args = cfargs_canonicalize(cfargs, &store); device_t dev; KERNEL_LOCK(1, NULL); cf = config_search_internal(parent, aux, args); if (cf != NULL) { dev = config_attach_internal(parent, cf, aux, print, args); goto out; } if (print) { if (config_do_twiddle && cold) twiddle(); const int pret = (*print)(aux, device_xname(parent)); KASSERT(pret >= 0); KASSERT(pret < __arraycount(msgs)); KASSERT(msgs[pret] != NULL); aprint_normal("%s", msgs[pret]); } dev = NULL; out: KERNEL_UNLOCK_ONE(NULL); return dev; } /* * config_found(parent, aux, print, cfargs) * * Legacy entry point for callers whose use of the returned * device_t is not delimited by device_release. * * The caller is required to hold the kernel lock as a fragile * defence against races. * * Callers should ignore the return value or be converted to * config_found_acquire with a matching device_release once they * have finished with the returned device_t. */ device_t config_found(device_t parent, void *aux, cfprint_t print, const struct cfargs * const cfargs) { device_t dev; KASSERT(KERNEL_LOCKED_P()); dev = config_found_acquire(parent, aux, print, cfargs); if (dev == NULL) return NULL; device_release(dev); return dev; } /* * As above, but for root devices. */ device_t config_rootfound(const char *rootname, void *aux) { cfdata_t cf; device_t dev = NULL; KERNEL_LOCK(1, NULL); if ((cf = config_rootsearch(NULL, rootname, aux)) != NULL) dev = config_attach(ROOT, cf, aux, NULL, CFARGS_NONE); else aprint_error("root device %s not configured\n", rootname); KERNEL_UNLOCK_ONE(NULL); return dev; } /* just like sprintf(buf, "%d") except that it works from the end */ static char * number(char *ep, int n) { *--ep = 0; while (n >= 10) { *--ep = (n % 10) + '0'; n /= 10; } *--ep = n + '0'; return ep; } /* * Expand the size of the cd_devs array if necessary. * * The caller must hold alldevs_lock. config_makeroom() may release and * re-acquire alldevs_lock, so callers should re-check conditions such * as alldevs_nwrite == 0 and alldevs_nread == 0 when config_makeroom() * returns. */ static void config_makeroom(int n, struct cfdriver *cd) { int ondevs, nndevs; device_t *osp, *nsp; KASSERT(mutex_owned(&alldevs_lock)); alldevs_nwrite++; /* XXX arithmetic overflow */ for (nndevs = MAX(4, cd->cd_ndevs); nndevs <= n; nndevs += nndevs) ; while (n >= cd->cd_ndevs) { /* * Need to expand the array. */ ondevs = cd->cd_ndevs; osp = cd->cd_devs; /* * Release alldevs_lock around allocation, which may * sleep. */ mutex_exit(&alldevs_lock); nsp = kmem_alloc(sizeof(device_t) * nndevs, KM_SLEEP); mutex_enter(&alldevs_lock); /* * If another thread moved the array while we did * not hold alldevs_lock, try again. 
*/ if (cd->cd_devs != osp || cd->cd_ndevs != ondevs) { mutex_exit(&alldevs_lock); kmem_free(nsp, sizeof(device_t) * nndevs); mutex_enter(&alldevs_lock); continue; } memset(nsp + ondevs, 0, sizeof(device_t) * (nndevs - ondevs)); if (ondevs != 0) memcpy(nsp, cd->cd_devs, sizeof(device_t) * ondevs); cd->cd_ndevs = nndevs; cd->cd_devs = nsp; if (ondevs != 0) { mutex_exit(&alldevs_lock); kmem_free(osp, sizeof(device_t) * ondevs); mutex_enter(&alldevs_lock); } } KASSERT(mutex_owned(&alldevs_lock)); alldevs_nwrite--; } /* * Put dev into the devices list. */ static void config_devlink(device_t dev) { mutex_enter(&alldevs_lock); KASSERT(device_cfdriver(dev)->cd_devs[dev->dv_unit] == dev); dev->dv_add_gen = alldevs_gen; /* It is safe to add a device to the tail of the list while * readers and writers are in the list. */ TAILQ_INSERT_TAIL(&alldevs, dev, dv_list); mutex_exit(&alldevs_lock); } static void config_devfree(device_t dev) { KASSERT(dev->dv_flags & DVF_PRIV_ALLOC); KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending); if (dev->dv_cfattach->ca_devsize > 0) kmem_free(dev->dv_private, dev->dv_cfattach->ca_devsize); kmem_free(dev, sizeof(*dev)); } /* * Caller must hold alldevs_lock. */ static void config_devunlink(device_t dev, struct devicelist *garbage) { struct device_garbage *dg = &dev->dv_garbage; cfdriver_t cd = device_cfdriver(dev); int i; KASSERT(mutex_owned(&alldevs_lock)); KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending); /* Unlink from device list. Link to garbage list. */ TAILQ_REMOVE(&alldevs, dev, dv_list); TAILQ_INSERT_TAIL(garbage, dev, dv_list); /* Remove from cfdriver's array. */ cd->cd_devs[dev->dv_unit] = NULL; /* * If the device now has no units in use, unlink its softc array. */ for (i = 0; i < cd->cd_ndevs; i++) { if (cd->cd_devs[i] != NULL) break; } /* Nothing found. Unlink, now. Deallocate, later. */ if (i == cd->cd_ndevs) { dg->dg_ndevs = cd->cd_ndevs; dg->dg_devs = cd->cd_devs; cd->cd_devs = NULL; cd->cd_ndevs = 0; } } static void config_devdelete(device_t dev) { struct device_garbage *dg = &dev->dv_garbage; device_lock_t dvl = device_getlock(dev); KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending); if (dg->dg_devs != NULL) kmem_free(dg->dg_devs, sizeof(device_t) * dg->dg_ndevs); localcount_fini(dev->dv_localcount); kmem_free(dev->dv_localcount, sizeof(*dev->dv_localcount)); cv_destroy(&dvl->dvl_cv); mutex_destroy(&dvl->dvl_mtx); KASSERT(dev->dv_properties != NULL); prop_object_release(dev->dv_properties); if (dev->dv_activity_handlers) panic("%s with registered handlers", __func__); if (dev->dv_locators) { size_t amount = *--dev->dv_locators; kmem_free(dev->dv_locators, amount); } config_devfree(dev); } static int config_unit_nextfree(cfdriver_t cd, cfdata_t cf) { int unit = cf->cf_unit; KASSERT(mutex_owned(&alldevs_lock)); if (unit < 0) return -1; if (cf->cf_fstate == FSTATE_STAR) { for (; unit < cd->cd_ndevs; unit++) if (cd->cd_devs[unit] == NULL) break; /* * unit is now the unit of the first NULL device pointer, * or max(cd->cd_ndevs,cf->cf_unit). 
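 * In the latter case the caller, config_unit_alloc(), grows the
 * cd_devs array with config_makeroom() and retries.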
*/ } else { if (unit < cd->cd_ndevs && cd->cd_devs[unit] != NULL) unit = -1; } return unit; } static int config_unit_alloc(device_t dev, cfdriver_t cd, cfdata_t cf) { struct alldevs_foray af; int unit; config_alldevs_enter(&af); for (;;) { unit = config_unit_nextfree(cd, cf); if (unit == -1) break; if (unit < cd->cd_ndevs) { cd->cd_devs[unit] = dev; dev->dv_unit = unit; break; } config_makeroom(unit, cd); } config_alldevs_exit(&af); return unit; } static device_t config_devalloc(const device_t parent, const cfdata_t cf, const struct cfargs_internal * const args) { cfdriver_t cd; cfattach_t ca; size_t lname, lunit; const char *xunit; int myunit; char num[10]; device_t dev; void *dev_private; const struct cfiattrdata *ia; device_lock_t dvl; cd = config_cfdriver_lookup(cf->cf_name); if (cd == NULL) return NULL; ca = config_cfattach_lookup_cd(cd, cf->cf_atname); if (ca == NULL) return NULL; /* get memory for all device vars */ KASSERT(ca->ca_flags & DVF_PRIV_ALLOC); if (ca->ca_devsize > 0) { dev_private = kmem_zalloc(ca->ca_devsize, KM_SLEEP); } else { dev_private = NULL; } dev = kmem_zalloc(sizeof(*dev), KM_SLEEP); dev->dv_handle = args->devhandle; dev->dv_class = cd->cd_class; dev->dv_cfdata = cf; dev->dv_cfdriver = cd; dev->dv_cfattach = ca; dev->dv_activity_count = 0; dev->dv_activity_handlers = NULL; dev->dv_private = dev_private; dev->dv_flags = ca->ca_flags; /* inherit flags from class */ dev->dv_attaching = curlwp; myunit = config_unit_alloc(dev, cd, cf); if (myunit == -1) { config_devfree(dev); return NULL; } /* compute length of name and decimal expansion of unit number */ lname = strlen(cd->cd_name); xunit = number(&num[sizeof(num)], myunit); lunit = &num[sizeof(num)] - xunit; if (lname + lunit > sizeof(dev->dv_xname)) panic("config_devalloc: device name too long"); dvl = device_getlock(dev); mutex_init(&dvl->dvl_mtx, MUTEX_DEFAULT, IPL_NONE); cv_init(&dvl->dvl_cv, "pmfsusp"); memcpy(dev->dv_xname, cd->cd_name, lname); memcpy(dev->dv_xname + lname, xunit, lunit); dev->dv_parent = parent; if (parent != NULL) dev->dv_depth = parent->dv_depth + 1; else dev->dv_depth = 0; dev->dv_flags |= DVF_ACTIVE; /* always initially active */ if (args->locators) { KASSERT(parent); /* no locators at root */ ia = cfiattr_lookup(cfdata_ifattr(cf), parent->dv_cfdriver); dev->dv_locators = kmem_alloc(sizeof(int) * (ia->ci_loclen + 1), KM_SLEEP); *dev->dv_locators++ = sizeof(int) * (ia->ci_loclen + 1); memcpy(dev->dv_locators, args->locators, sizeof(int) * ia->ci_loclen); } dev->dv_properties = prop_dictionary_create(); KASSERT(dev->dv_properties != NULL); prop_dictionary_set_string_nocopy(dev->dv_properties, "device-driver", dev->dv_cfdriver->cd_name); prop_dictionary_set_uint16(dev->dv_properties, "device-unit", dev->dv_unit); if (parent != NULL) { prop_dictionary_set_string(dev->dv_properties, "device-parent", device_xname(parent)); } dev->dv_localcount = kmem_zalloc(sizeof(*dev->dv_localcount), KM_SLEEP); localcount_init(dev->dv_localcount); if (dev->dv_cfdriver->cd_attrs != NULL) config_add_attrib_dict(dev); return dev; } /* * Create an array of device attach attributes and add it * to the device's dv_properties dictionary. * * <key>interface-attributes</key> * <array> * <dict> * <key>attribute-name</key> * <string>foo</string> * <key>locators</key> * <array> * <dict> * <key>loc-name</key> * <string>foo-loc1</string> * </dict> * <dict> * <key>loc-name</key> * <string>foo-loc2</string> * <key>default</key> * <string>foo-loc2-default</string> * </dict> * ... * </array> * </dict> * ... 
* </array> */ static void config_add_attrib_dict(device_t dev) { int i, j; const struct cfiattrdata *ci; prop_dictionary_t attr_dict, loc_dict; prop_array_t attr_array, loc_array; if ((attr_array = prop_array_create()) == NULL) return; for (i = 0; ; i++) { if ((ci = dev->dv_cfdriver->cd_attrs[i]) == NULL) break; if ((attr_dict = prop_dictionary_create()) == NULL) break; prop_dictionary_set_string_nocopy(attr_dict, "attribute-name", ci->ci_name); /* Create an array of the locator names and defaults */ if (ci->ci_loclen != 0 && (loc_array = prop_array_create()) != NULL) { for (j = 0; j < ci->ci_loclen; j++) { loc_dict = prop_dictionary_create(); if (loc_dict == NULL) continue; prop_dictionary_set_string_nocopy(loc_dict, "loc-name", ci->ci_locdesc[j].cld_name); if (ci->ci_locdesc[j].cld_defaultstr != NULL) prop_dictionary_set_string_nocopy( loc_dict, "default", ci->ci_locdesc[j].cld_defaultstr); prop_array_set(loc_array, j, loc_dict); prop_object_release(loc_dict); } prop_dictionary_set_and_rel(attr_dict, "locators", loc_array); } prop_array_add(attr_array, attr_dict); prop_object_release(attr_dict); } if (i == 0) prop_object_release(attr_array); else prop_dictionary_set_and_rel(dev->dv_properties, "interface-attributes", attr_array); return; } /* * Attach a found device. * * Returns the device referenced, to be released with device_release. */ static device_t config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print, const struct cfargs_internal * const args) { device_t dev; struct cftable *ct; const char *drvname; bool deferred; KASSERT(KERNEL_LOCKED_P()); dev = config_devalloc(parent, cf, args); if (!dev) panic("config_attach: allocation of device softc failed"); /* XXX redundant - see below? */ if (cf->cf_fstate != FSTATE_STAR) { KASSERT(cf->cf_fstate == FSTATE_NOTFOUND); cf->cf_fstate = FSTATE_FOUND; } config_devlink(dev); if (config_do_twiddle && cold) twiddle(); else aprint_naive("Found "); /* * We want the next two printfs for normal, verbose, and quiet, * but not silent (in which case, we're twiddling, instead). */ if (parent == ROOT) { aprint_naive("%s (root)", device_xname(dev)); aprint_normal("%s (root)", device_xname(dev)); } else { aprint_naive("%s at %s", device_xname(dev), device_xname(parent)); aprint_normal("%s at %s", device_xname(dev), device_xname(parent)); if (print) (void) (*print)(aux, NULL); } /* * Before attaching, clobber any unfound devices that are * otherwise identical. * XXX code above is redundant? */ drvname = dev->dv_cfdriver->cd_name; TAILQ_FOREACH(ct, &allcftables, ct_list) { for (cf = ct->ct_cfdata; cf->cf_name; cf++) { if (STREQ(cf->cf_name, drvname) && cf->cf_unit == dev->dv_unit) { if (cf->cf_fstate == FSTATE_NOTFOUND) cf->cf_fstate = FSTATE_FOUND; } } } device_register(dev, aux); /* Let userland know */ devmon_report_device(dev, true); /* * Prevent detach until the driver's attach function, and all * deferred actions, have finished. */ config_pending_incr(dev); /* * Prevent concurrent detach from destroying the device_t until * the caller has released the device. */ device_acquire(dev); /* Call the driver's attach function. */ (*dev->dv_cfattach->ca_attach)(parent, dev, aux); /* * Allow other threads to acquire references to the device now * that the driver's attach function is done. */ mutex_enter(&config_misc_lock); KASSERT(dev->dv_attaching == curlwp); dev->dv_attaching = NULL; cv_broadcast(&config_misc_cv); mutex_exit(&config_misc_lock); /* * Synchronous parts of attach are done. 
Allow detach, unless * the driver's attach function scheduled deferred actions. */ config_pending_decr(dev); mutex_enter(&config_misc_lock); deferred = (dev->dv_pending != 0); mutex_exit(&config_misc_lock); if (!deferred && !device_pmf_is_registered(dev)) aprint_debug_dev(dev, "WARNING: power management not supported\n"); config_process_deferred(&deferred_config_queue, dev); device_register_post_config(dev, aux); rnd_add_uint32(&rnd_autoconf_source, 0); return dev; } device_t config_attach_acquire(device_t parent, cfdata_t cf, void *aux, cfprint_t print, const struct cfargs *cfargs) { struct cfargs_internal store; device_t dev; KERNEL_LOCK(1, NULL); dev = config_attach_internal(parent, cf, aux, print, cfargs_canonicalize(cfargs, &store)); KERNEL_UNLOCK_ONE(NULL); return dev; } /* * config_attach(parent, cf, aux, print, cfargs) * * Legacy entry point for callers whose use of the returned * device_t is not delimited by device_release. * * The caller is required to hold the kernel lock as a fragile * defence against races. * * Callers should ignore the return value or be converted to * config_attach_acquire with a matching device_release once they * have finished with the returned device_t. */ device_t config_attach(device_t parent, cfdata_t cf, void *aux, cfprint_t print, const struct cfargs *cfargs) { device_t dev; KASSERT(KERNEL_LOCKED_P()); dev = config_attach_acquire(parent, cf, aux, print, cfargs); if (dev == NULL) return NULL; device_release(dev); return dev; } /* * As above, but for pseudo-devices. Pseudo-devices attached in this * way are silently inserted into the device tree, and their children * attached. * * Note that because pseudo-devices are attached silently, any information * the attach routine wishes to print should be prefixed with the device * name by the attach routine. */ device_t config_attach_pseudo_acquire(cfdata_t cf, void *aux) { device_t dev; KERNEL_LOCK(1, NULL); struct cfargs_internal args = { }; dev = config_devalloc(ROOT, cf, &args); if (!dev) goto out; /* XXX mark busy in cfdata */ if (cf->cf_fstate != FSTATE_STAR) { KASSERT(cf->cf_fstate == FSTATE_NOTFOUND); cf->cf_fstate = FSTATE_FOUND; } config_devlink(dev); #if 0 /* XXXJRT not yet */ device_register(dev, NULL); /* like a root node */ #endif /* Let userland know */ devmon_report_device(dev, true); /* * Prevent detach until the driver's attach function, and all * deferred actions, have finished. */ config_pending_incr(dev); /* * Prevent concurrent detach from destroying the device_t until * the caller has released the device. */ device_acquire(dev); /* Call the driver's attach function. */ (*dev->dv_cfattach->ca_attach)(ROOT, dev, aux); /* * Allow other threads to acquire references to the device now * that the driver's attach function is done. */ mutex_enter(&config_misc_lock); KASSERT(dev->dv_attaching == curlwp); dev->dv_attaching = NULL; cv_broadcast(&config_misc_cv); mutex_exit(&config_misc_lock); /* * Synchronous parts of attach are done. Allow detach, unless * the driver's attach function scheduled deferred actions. */ config_pending_decr(dev); config_process_deferred(&deferred_config_queue, dev); out: KERNEL_UNLOCK_ONE(NULL); return dev; } /* * config_attach_pseudo(cf) * * Legacy entry point for callers whose use of the returned * device_t is not delimited by device_release. * * The caller is required to hold the kernel lock as a fragile * defence against races. 
* * Callers should ignore the return value or be converted to * config_attach_pseudo_acquire with a matching device_release * once they have finished with the returned device_t. As a * bonus, config_attach_pseudo_acquire can pass a non-null aux * argument into the driver's attach routine. */ device_t config_attach_pseudo(cfdata_t cf) { device_t dev; dev = config_attach_pseudo_acquire(cf, NULL); if (dev == NULL) return dev; device_release(dev); return dev; } /* * Caller must hold alldevs_lock. */ static void config_collect_garbage(struct devicelist *garbage) { device_t dv; KASSERT(!cpu_intr_p()); KASSERT(!cpu_softintr_p()); KASSERT(mutex_owned(&alldevs_lock)); while (alldevs_nwrite == 0 && alldevs_nread == 0 && alldevs_garbage) { TAILQ_FOREACH(dv, &alldevs, dv_list) { if (dv->dv_del_gen != 0) break; } if (dv == NULL) { alldevs_garbage = false; break; } config_devunlink(dv, garbage); } KASSERT(mutex_owned(&alldevs_lock)); } static void config_dump_garbage(struct devicelist *garbage) { device_t dv; while ((dv = TAILQ_FIRST(garbage)) != NULL) { TAILQ_REMOVE(garbage, dv, dv_list); config_devdelete(dv); } } static int config_detach_enter(device_t dev) { struct lwp *l __diagused; int error = 0; mutex_enter(&config_misc_lock); /* * Wait until attach has fully completed, and until any * concurrent detach (e.g., drvctl racing with USB event * thread) has completed. * * Caller must hold alldevs_nread or alldevs_nwrite (e.g., via * deviter) to ensure the winner of the race doesn't free the * device leading the loser of the race into use-after-free. * * XXX Not all callers do this! */ while (dev->dv_pending || dev->dv_detaching) { KASSERTMSG(dev->dv_detaching != curlwp, "recursively detaching %s", device_xname(dev)); error = cv_wait_sig(&config_misc_cv, &config_misc_lock); if (error) goto out; } /* * Attach has completed, and no other concurrent detach is * running. Claim the device for detaching. This will cause * all new attempts to acquire references to block. */ KASSERTMSG((l = dev->dv_attaching) == NULL, "lwp %ld [%s] @ %p attaching %s", (long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l, device_xname(dev)); KASSERTMSG((l = dev->dv_detaching) == NULL, "lwp %ld [%s] @ %p detaching %s", (long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l, device_xname(dev)); dev->dv_detaching = curlwp; out: mutex_exit(&config_misc_lock); return error; } static void config_detach_exit(device_t dev) { struct lwp *l __diagused; mutex_enter(&config_misc_lock); KASSERTMSG(dev->dv_detaching != NULL, "not detaching %s", device_xname(dev)); KASSERTMSG((l = dev->dv_detaching) == curlwp, "lwp %ld [%s] @ %p detaching %s", (long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l, device_xname(dev)); dev->dv_detaching = NULL; cv_broadcast(&config_misc_cv); mutex_exit(&config_misc_lock); } /* * Detach a device. Optionally forced (e.g. because of hardware * removal) and quiet. Returns zero if successful, non-zero * (an error code) otherwise. * * Note that this code wants to be run from a process context, so * that the detach can sleep to allow processes which have a device * open to run and unwind their stacks. * * Caller must hold a reference with device_acquire or * device_lookup_acquire. 
*/ int config_detach_release(device_t dev, int flags) { struct alldevs_foray af; struct cftable *ct; cfdata_t cf; const struct cfattach *ca; struct cfdriver *cd; device_t d __diagused; int rv = 0; KERNEL_LOCK(1, NULL); cf = dev->dv_cfdata; KASSERTMSG((cf == NULL || cf->cf_fstate == FSTATE_FOUND || cf->cf_fstate == FSTATE_STAR), "config_detach: %s: bad device fstate: %d", device_xname(dev), cf ? cf->cf_fstate : -1); cd = dev->dv_cfdriver; KASSERT(cd != NULL); ca = dev->dv_cfattach; KASSERT(ca != NULL); /* * Only one detach at a time, please -- and not until fully * attached. */ rv = config_detach_enter(dev); device_release(dev); if (rv) { KERNEL_UNLOCK_ONE(NULL); return rv; } mutex_enter(&alldevs_lock); if (dev->dv_del_gen != 0) { mutex_exit(&alldevs_lock); #ifdef DIAGNOSTIC printf("%s: %s is already detached\n", __func__, device_xname(dev)); #endif /* DIAGNOSTIC */ config_detach_exit(dev); KERNEL_UNLOCK_ONE(NULL); return ENOENT; } alldevs_nwrite++; mutex_exit(&alldevs_lock); /* * Call the driver's .ca_detach function, unless it has none or * we are skipping it because it's unforced shutdown time and * the driver didn't ask to detach on shutdown. */ if (!detachall && (flags & (DETACH_SHUTDOWN|DETACH_FORCE)) == DETACH_SHUTDOWN && (dev->dv_flags & DVF_DETACH_SHUTDOWN) == 0) { rv = EOPNOTSUPP; } else if (ca->ca_detach != NULL) { rv = (*ca->ca_detach)(dev, flags); } else rv = EOPNOTSUPP; KASSERTMSG(!dev->dv_detach_done, "%s detached twice, error=%d", device_xname(dev), rv); /* * If it was not possible to detach the device, then we either * panic() (for the forced but failed case), or return an error. */ if (rv) { /* * Detach failed -- likely EOPNOTSUPP or EBUSY. Driver * must not have called config_detach_commit. */ KASSERTMSG(!dev->dv_detach_committed, "%s committed to detaching and then backed out, error=%d", device_xname(dev), rv); if (flags & DETACH_FORCE) { panic("config_detach: forced detach of %s failed (%d)", device_xname(dev), rv); } goto out; } /* * The device has now been successfully detached. */ dev->dv_detach_done = true; /* * If .ca_detach didn't commit to detach, then do that for it. * This wakes any pending device_lookup_acquire calls so they * will fail. */ config_detach_commit(dev); /* * If it was possible to detach the device, ensure that the * device is deactivated. */ dev->dv_flags &= ~DVF_ACTIVE; /* XXXSMP */ /* * Wait for all device_lookup_acquire references -- mostly, for * all attempts to open the device -- to drain. It is the * responsibility of .ca_detach to ensure anything with open * references will be interrupted and release them promptly, * not block indefinitely. All new attempts to acquire * references will fail, as config_detach_commit has arranged * by now. */ mutex_enter(&config_misc_lock); localcount_drain(dev->dv_localcount, &config_misc_cv, &config_misc_lock); mutex_exit(&config_misc_lock); /* Let userland know */ devmon_report_device(dev, false); #ifdef DIAGNOSTIC /* * Sanity: If you're successfully detached, you should have no * children. (Note that because children must be attached * after parents, we only need to search the latter part of * the list.) 
*/ mutex_enter(&alldevs_lock); for (d = TAILQ_NEXT(dev, dv_list); d != NULL; d = TAILQ_NEXT(d, dv_list)) { if (d->dv_parent == dev && d->dv_del_gen == 0) { printf("config_detach: detached device %s" " has children %s\n", device_xname(dev), device_xname(d)); panic("config_detach"); } } mutex_exit(&alldevs_lock); #endif /* notify the parent that the child is gone */ if (dev->dv_parent) { device_t p = dev->dv_parent; if (p->dv_cfattach->ca_childdetached) (*p->dv_cfattach->ca_childdetached)(p, dev); } /* * Mark cfdata to show that the unit can be reused, if possible. */ TAILQ_FOREACH(ct, &allcftables, ct_list) { for (cf = ct->ct_cfdata; cf->cf_name; cf++) { if (STREQ(cf->cf_name, cd->cd_name)) { if (cf->cf_fstate == FSTATE_FOUND && cf->cf_unit == dev->dv_unit) cf->cf_fstate = FSTATE_NOTFOUND; } } } if (dev->dv_cfdata != NULL && (flags & DETACH_QUIET) == 0) aprint_normal_dev(dev, "detached\n"); out: config_detach_exit(dev); config_alldevs_enter(&af); KASSERT(alldevs_nwrite != 0); --alldevs_nwrite; if (rv == 0 && dev->dv_del_gen == 0) { if (alldevs_nwrite == 0 && alldevs_nread == 0) config_devunlink(dev, &af.af_garbage); else { dev->dv_del_gen = alldevs_gen; alldevs_garbage = true; } } config_alldevs_exit(&af); KERNEL_UNLOCK_ONE(NULL); return rv; } /* * config_detach(dev, flags) * * Legacy entry point for callers that have not acquired a * reference to dev. * * The caller is required to hold the kernel lock as a fragile * defence against races. * * Callers should be converted to use device_acquire under a lock * taken also by .ca_childdetached to synchronize access to the * device_t, and then config_detach_release ouside the lock. * Alternatively, most drivers detach children only in their own * detach routines, which can be done with config_detach_children * instead. */ int config_detach(device_t dev, int flags) { device_acquire(dev); return config_detach_release(dev, flags); } /* * config_detach_commit(dev) * * Issued by a driver's .ca_detach routine to notify anyone * waiting in device_lookup_acquire that the driver is committed * to detaching the device, which allows device_lookup_acquire to * wake up and fail immediately. * * Safe to call multiple times -- idempotent. Must be called * during config_detach_enter/exit. Safe to use with * device_lookup because the device is not actually removed from * the table until after config_detach_exit. */ void config_detach_commit(device_t dev) { struct lwp *l __diagused; mutex_enter(&config_misc_lock); KASSERTMSG(dev->dv_detaching != NULL, "not detaching %s", device_xname(dev)); KASSERTMSG((l = dev->dv_detaching) == curlwp, "lwp %ld [%s] @ %p detaching %s", (long)l->l_lid, (l->l_name ? 
l->l_name : l->l_proc->p_comm), l, device_xname(dev)); dev->dv_detach_committed = true; cv_broadcast(&config_misc_cv); mutex_exit(&config_misc_lock); } int config_detach_children(device_t parent, int flags) { device_t dv; deviter_t di; int error = 0; KASSERT(KERNEL_LOCKED_P()); for (dv = deviter_first(&di, DEVITER_F_RW); dv != NULL; dv = deviter_next(&di)) { if (device_parent(dv) != parent) continue; if ((error = config_detach(dv, flags)) != 0) break; } deviter_release(&di); return error; } device_t shutdown_first(struct shutdown_state *s) { if (!s->initialized) { deviter_init(&s->di, DEVITER_F_SHUTDOWN|DEVITER_F_LEAVES_FIRST); s->initialized = true; } return shutdown_next(s); } device_t shutdown_next(struct shutdown_state *s) { device_t dv; while ((dv = deviter_next(&s->di)) != NULL && !device_is_active(dv)) ; if (dv == NULL) s->initialized = false; return dv; } bool config_detach_all(int how) { static struct shutdown_state s; device_t curdev; bool progress = false; int flags; KERNEL_LOCK(1, NULL); if ((how & (RB_NOSYNC|RB_DUMP)) != 0) goto out; if ((how & RB_POWERDOWN) == RB_POWERDOWN) flags = DETACH_SHUTDOWN | DETACH_POWEROFF; else flags = DETACH_SHUTDOWN; for (curdev = shutdown_first(&s); curdev != NULL; curdev = shutdown_next(&s)) { aprint_debug(" detaching %s, ", device_xname(curdev)); if (config_detach(curdev, flags) == 0) { progress = true; aprint_debug("success."); } else aprint_debug("failed."); } out: KERNEL_UNLOCK_ONE(NULL); return progress; } static bool device_is_ancestor_of(device_t ancestor, device_t descendant) { device_t dv; for (dv = descendant; dv != NULL; dv = device_parent(dv)) { if (device_parent(dv) == ancestor) return true; } return false; } int config_deactivate(device_t dev) { deviter_t di; const struct cfattach *ca; device_t descendant; int s, rv = 0, oflags; for (descendant = deviter_first(&di, DEVITER_F_ROOT_FIRST); descendant != NULL; descendant = deviter_next(&di)) { if (dev != descendant && !device_is_ancestor_of(dev, descendant)) continue; if ((descendant->dv_flags & DVF_ACTIVE) == 0) continue; ca = descendant->dv_cfattach; oflags = descendant->dv_flags; descendant->dv_flags &= ~DVF_ACTIVE; if (ca->ca_activate == NULL) continue; s = splhigh(); rv = (*ca->ca_activate)(descendant, DVACT_DEACTIVATE); splx(s); if (rv != 0) descendant->dv_flags = oflags; } deviter_release(&di); return rv; } /* * Defer the configuration of the specified device until all * of its parent's devices have been attached. */ void config_defer(device_t dev, void (*func)(device_t)) { struct deferred_config *dc; if (dev->dv_parent == NULL) panic("config_defer: can't defer config of a root device"); dc = kmem_alloc(sizeof(*dc), KM_SLEEP); config_pending_incr(dev); mutex_enter(&config_misc_lock); #ifdef DIAGNOSTIC struct deferred_config *odc; TAILQ_FOREACH(odc, &deferred_config_queue, dc_queue) { if (odc->dc_dev == dev) panic("config_defer: deferred twice"); } #endif dc->dc_dev = dev; dc->dc_func = func; TAILQ_INSERT_TAIL(&deferred_config_queue, dc, dc_queue); mutex_exit(&config_misc_lock); } /* * Defer some autoconfiguration for a device until after interrupts * are enabled. */ void config_interrupts(device_t dev, void (*func)(device_t)) { struct deferred_config *dc; /* * If interrupts are enabled, callback now. 
*/ if (cold == 0) { (*func)(dev); return; } dc = kmem_alloc(sizeof(*dc), KM_SLEEP); config_pending_incr(dev); mutex_enter(&config_misc_lock); #ifdef DIAGNOSTIC struct deferred_config *odc; TAILQ_FOREACH(odc, &interrupt_config_queue, dc_queue) { if (odc->dc_dev == dev) panic("config_interrupts: deferred twice"); } #endif dc->dc_dev = dev; dc->dc_func = func; TAILQ_INSERT_TAIL(&interrupt_config_queue, dc, dc_queue); mutex_exit(&config_misc_lock); } /* * Defer some autoconfiguration for a device until after root file system * is mounted (to load firmware etc). */ void config_mountroot(device_t dev, void (*func)(device_t)) { struct deferred_config *dc; /* * If root file system is mounted, callback now. */ if (root_is_mounted) { (*func)(dev); return; } dc = kmem_alloc(sizeof(*dc), KM_SLEEP); mutex_enter(&config_misc_lock); #ifdef DIAGNOSTIC struct deferred_config *odc; TAILQ_FOREACH(odc, &mountroot_config_queue, dc_queue) { if (odc->dc_dev == dev) panic("%s: deferred twice", __func__); } #endif dc->dc_dev = dev; dc->dc_func = func; TAILQ_INSERT_TAIL(&mountroot_config_queue, dc, dc_queue); mutex_exit(&config_misc_lock); } /* * Process a deferred configuration queue. */ static void config_process_deferred(struct deferred_config_head *queue, device_t parent) { struct deferred_config *dc; KASSERT(KERNEL_LOCKED_P()); mutex_enter(&config_misc_lock); dc = TAILQ_FIRST(queue); while (dc) { if (parent == NULL || dc->dc_dev->dv_parent == parent) { TAILQ_REMOVE(queue, dc, dc_queue); mutex_exit(&config_misc_lock); (*dc->dc_func)(dc->dc_dev); config_pending_decr(dc->dc_dev); kmem_free(dc, sizeof(*dc)); mutex_enter(&config_misc_lock); /* Restart, queue might have changed */ dc = TAILQ_FIRST(queue); } else { dc = TAILQ_NEXT(dc, dc_queue); } } mutex_exit(&config_misc_lock); } /* * Manipulate the config_pending semaphore. */ void config_pending_incr(device_t dev) { mutex_enter(&config_misc_lock); KASSERTMSG(dev->dv_pending < INT_MAX, "%s: excess config_pending_incr", device_xname(dev)); if (dev->dv_pending++ == 0) TAILQ_INSERT_TAIL(&config_pending, dev, dv_pending_list); #ifdef DEBUG_AUTOCONF printf("%s: %s %d\n", __func__, device_xname(dev), dev->dv_pending); #endif mutex_exit(&config_misc_lock); } void config_pending_decr(device_t dev) { mutex_enter(&config_misc_lock); KASSERTMSG(dev->dv_pending > 0, "%s: excess config_pending_decr", device_xname(dev)); if (--dev->dv_pending == 0) { TAILQ_REMOVE(&config_pending, dev, dv_pending_list); cv_broadcast(&config_misc_cv); } #ifdef DEBUG_AUTOCONF printf("%s: %s %d\n", __func__, device_xname(dev), dev->dv_pending); #endif mutex_exit(&config_misc_lock); } /* * Register a "finalization" routine. Finalization routines are * called iteratively once all real devices have been found during * autoconfiguration, for as long as any one finalizer has done * any work. */ int config_finalize_register(device_t dev, int (*fn)(device_t)) { struct finalize_hook *f; int error = 0; KERNEL_LOCK(1, NULL); /* * If finalization has already been done, invoke the * callback function now. */ if (config_finalize_done) { while ((*fn)(dev) != 0) /* loop */ ; goto out; } /* Ensure this isn't already on the list. */ TAILQ_FOREACH(f, &config_finalize_list, f_list) { if (f->f_func == fn && f->f_dev == dev) { error = EEXIST; goto out; } } f = kmem_alloc(sizeof(*f), KM_SLEEP); f->f_func = fn; f->f_dev = dev; TAILQ_INSERT_TAIL(&config_finalize_list, f, f_list); /* Success! 
*/ error = 0; out: KERNEL_UNLOCK_ONE(NULL); return error; } void config_finalize(void) { struct finalize_hook *f; struct pdevinit *pdev; extern struct pdevinit pdevinit[]; unsigned t0 = getticks(); int errcnt, rv; /* * Now that device driver threads have been created, wait for * them to finish any deferred autoconfiguration. */ mutex_enter(&config_misc_lock); while (!TAILQ_EMPTY(&config_pending)) { const unsigned t1 = getticks(); if (t1 - t0 >= hz) { void (*pr)(const char *, ...) __printflike(1,2); device_t dev; if (t1 - t0 >= 60*hz) { pr = aprint_normal; t0 = t1; } else { pr = aprint_debug; } (*pr)("waiting for devices:"); TAILQ_FOREACH(dev, &config_pending, dv_pending_list) (*pr)(" %s", device_xname(dev)); (*pr)("\n"); } (void)cv_timedwait(&config_misc_cv, &config_misc_lock, mstohz(1000)); } mutex_exit(&config_misc_lock); KERNEL_LOCK(1, NULL); /* Attach pseudo-devices. */ for (pdev = pdevinit; pdev->pdev_attach != NULL; pdev++) (*pdev->pdev_attach)(pdev->pdev_count); /* Run the hooks until none of them does any work. */ do { rv = 0; TAILQ_FOREACH(f, &config_finalize_list, f_list) rv |= (*f->f_func)(f->f_dev); } while (rv != 0); config_finalize_done = 1; /* Now free all the hooks. */ while ((f = TAILQ_FIRST(&config_finalize_list)) != NULL) { TAILQ_REMOVE(&config_finalize_list, f, f_list); kmem_free(f, sizeof(*f)); } KERNEL_UNLOCK_ONE(NULL); errcnt = aprint_get_error_count(); if ((boothowto & (AB_QUIET|AB_SILENT)) != 0 && (boothowto & AB_VERBOSE) == 0) { mutex_enter(&config_misc_lock); if (config_do_twiddle) { config_do_twiddle = 0; printf_nolog(" done.\n"); } mutex_exit(&config_misc_lock); } if (errcnt != 0) { printf("WARNING: %d error%s while detecting hardware; " "check system log.\n", errcnt, errcnt == 1 ? "" : "s"); } } void config_twiddle_init(void) { if ((boothowto & (AB_SILENT|AB_VERBOSE)) == AB_SILENT) { config_do_twiddle = 1; } callout_setfunc(&config_twiddle_ch, config_twiddle_fn, NULL); } void config_twiddle_fn(void *cookie) { mutex_enter(&config_misc_lock); if (config_do_twiddle) { twiddle(); callout_schedule(&config_twiddle_ch, mstohz(100)); } mutex_exit(&config_misc_lock); } static void config_alldevs_enter(struct alldevs_foray *af) { TAILQ_INIT(&af->af_garbage); mutex_enter(&alldevs_lock); config_collect_garbage(&af->af_garbage); } static void config_alldevs_exit(struct alldevs_foray *af) { mutex_exit(&alldevs_lock); config_dump_garbage(&af->af_garbage); } /* * device_lookup: * * Look up a device instance for a given driver. * * Caller is responsible for ensuring the device's state is * stable, either by holding a reference already obtained with * device_lookup_acquire or by otherwise ensuring the device is * attached and can't be detached (e.g., holding an open device * node and ensuring *_detach calls vdevgone). * * XXX Find a way to assert this. * * Safe for use up to and including interrupt context at IPL_VM. * Never sleeps. */ device_t device_lookup(cfdriver_t cd, int unit) { device_t dv; mutex_enter(&alldevs_lock); if (unit < 0 || unit >= cd->cd_ndevs) dv = NULL; else if ((dv = cd->cd_devs[unit]) != NULL && dv->dv_del_gen != 0) dv = NULL; mutex_exit(&alldevs_lock); return dv; } /* * device_lookup_private: * * Look up a softc instance for a given driver. */ void * device_lookup_private(cfdriver_t cd, int unit) { return device_private(device_lookup(cd, unit)); } /* * device_lookup_acquire: * * Look up a device instance for a given driver, and return a * reference to it that must be released by device_release. 
* * => If the device is still attaching, blocks until *_attach has * returned. * * => If the device is detaching, blocks until *_detach has * returned. May succeed or fail in that case, depending on * whether *_detach has backed out (EBUSY) or committed to * detaching. * * May sleep. */ device_t device_lookup_acquire(cfdriver_t cd, int unit) { device_t dv; ASSERT_SLEEPABLE(); /* XXX This should have a pserialized fast path -- TBD. */ mutex_enter(&config_misc_lock); mutex_enter(&alldevs_lock); retry: if (unit < 0 || unit >= cd->cd_ndevs || (dv = cd->cd_devs[unit]) == NULL || dv->dv_del_gen != 0 || dv->dv_detach_committed) { dv = NULL; } else { /* * Wait for the device to stabilize, if attaching or * detaching. Either way we must wait for *_attach or * *_detach to complete, and either way we must retry: * even if detaching, *_detach might fail (EBUSY) so * the device may still be there. */ if ((dv->dv_attaching != NULL && dv->dv_attaching != curlwp) || dv->dv_detaching != NULL) { mutex_exit(&alldevs_lock); cv_wait(&config_misc_cv, &config_misc_lock); mutex_enter(&alldevs_lock); goto retry; } device_acquire(dv); } mutex_exit(&alldevs_lock); mutex_exit(&config_misc_lock); return dv; } /* * device_acquire: * * Acquire a reference to a device. It is the caller's * responsibility to ensure that the device's .ca_detach routine * cannot return before calling this. Caller must release the * reference with device_release or config_detach_release. */ void device_acquire(device_t dv) { /* * No lock because the caller has promised that this can't * change concurrently with device_acquire. */ KASSERTMSG(!dv->dv_detach_done, "%s", dv == NULL ? "(null)" : device_xname(dv)); localcount_acquire(dv->dv_localcount); } /* * device_release: * * Release a reference to a device acquired with device_acquire or * device_lookup_acquire. */ void device_release(device_t dv) { localcount_release(dv->dv_localcount, &config_misc_cv, &config_misc_lock); } /* * device_find_by_xname: * * Returns the device of the given name or NULL if it doesn't exist. */ device_t device_find_by_xname(const char *name) { device_t dv; deviter_t di; for (dv = deviter_first(&di, 0); dv != NULL; dv = deviter_next(&di)) { if (strcmp(device_xname(dv), name) == 0) break; } deviter_release(&di); return dv; } /* * device_find_by_driver_unit: * * Returns the device of the given driver name and unit or * NULL if it doesn't exist. 
*/ device_t device_find_by_driver_unit(const char *name, int unit) { struct cfdriver *cd; if ((cd = config_cfdriver_lookup(name)) == NULL) return NULL; return device_lookup(cd, unit); } static bool match_strcmp(const char * const s1, const char * const s2) { return strcmp(s1, s2) == 0; } static bool match_pmatch(const char * const s1, const char * const s2) { return pmatch(s1, s2, NULL) == 2; } static bool strarray_match_internal(const char ** const strings, unsigned int const nstrings, const char * const str, unsigned int * const indexp, bool (*match_fn)(const char *, const char *)) { unsigned int i; if (strings == NULL || nstrings == 0) { return false; } for (i = 0; i < nstrings; i++) { if ((*match_fn)(strings[i], str)) { *indexp = i; return true; } } return false; } static int strarray_match(const char ** const strings, unsigned int const nstrings, const char * const str) { unsigned int idx; if (strarray_match_internal(strings, nstrings, str, &idx, match_strcmp)) { return (int)(nstrings - idx); } return 0; } static int strarray_pmatch(const char ** const strings, unsigned int const nstrings, const char * const pattern) { unsigned int idx; if (strarray_match_internal(strings, nstrings, pattern, &idx, match_pmatch)) { return (int)(nstrings - idx); } return 0; } static int device_compatible_match_strarray_internal( const char **device_compats, int ndevice_compats, const struct device_compatible_entry *driver_compats, const struct device_compatible_entry **matching_entryp, int (*match_fn)(const char **, unsigned int, const char *)) { const struct device_compatible_entry *dce = NULL; int rv; if (ndevice_compats == 0 || device_compats == NULL || driver_compats == NULL) return 0; for (dce = driver_compats; dce->compat != NULL; dce++) { rv = (*match_fn)(device_compats, ndevice_compats, dce->compat); if (rv != 0) { if (matching_entryp != NULL) { *matching_entryp = dce; } return rv; } } return 0; } /* * device_compatible_match: * * Match a driver's "compatible" data against a device's * "compatible" strings. Returns resulted weighted by * which device "compatible" string was matched. */ int device_compatible_match(const char **device_compats, int ndevice_compats, const struct device_compatible_entry *driver_compats) { return device_compatible_match_strarray_internal(device_compats, ndevice_compats, driver_compats, NULL, strarray_match); } /* * device_compatible_pmatch: * * Like device_compatible_match(), but uses pmatch(9) to compare * the device "compatible" strings against patterns in the * driver's "compatible" data. 
*/ int device_compatible_pmatch(const char **device_compats, int ndevice_compats, const struct device_compatible_entry *driver_compats) { return device_compatible_match_strarray_internal(device_compats, ndevice_compats, driver_compats, NULL, strarray_pmatch); } static int device_compatible_match_strlist_internal( const char * const device_compats, size_t const device_compatsize, const struct device_compatible_entry *driver_compats, const struct device_compatible_entry **matching_entryp, int (*match_fn)(const char *, size_t, const char *)) { const struct device_compatible_entry *dce = NULL; int rv; if (device_compats == NULL || device_compatsize == 0 || driver_compats == NULL) return 0; for (dce = driver_compats; dce->compat != NULL; dce++) { rv = (*match_fn)(device_compats, device_compatsize, dce->compat); if (rv != 0) { if (matching_entryp != NULL) { *matching_entryp = dce; } return rv; } } return 0; } /* * device_compatible_match_strlist: * * Like device_compatible_match(), but take the device * "compatible" strings as an OpenFirmware-style string * list. */ int device_compatible_match_strlist( const char * const device_compats, size_t const device_compatsize, const struct device_compatible_entry *driver_compats) { return device_compatible_match_strlist_internal(device_compats, device_compatsize, driver_compats, NULL, strlist_match); } /* * device_compatible_pmatch_strlist: * * Like device_compatible_pmatch(), but take the device * "compatible" strings as an OpenFirmware-style string * list. */ int device_compatible_pmatch_strlist( const char * const device_compats, size_t const device_compatsize, const struct device_compatible_entry *driver_compats) { return device_compatible_match_strlist_internal(device_compats, device_compatsize, driver_compats, NULL, strlist_pmatch); } static int device_compatible_match_id_internal( uintptr_t const id, uintptr_t const mask, uintptr_t const sentinel_id, const struct device_compatible_entry *driver_compats, const struct device_compatible_entry **matching_entryp) { const struct device_compatible_entry *dce = NULL; if (mask == 0) return 0; for (dce = driver_compats; dce->id != sentinel_id; dce++) { if ((id & mask) == dce->id) { if (matching_entryp != NULL) { *matching_entryp = dce; } return 1; } } return 0; } /* * device_compatible_match_id: * * Like device_compatible_match(), but takes a single * unsigned integer device ID. */ int device_compatible_match_id( uintptr_t const id, uintptr_t const sentinel_id, const struct device_compatible_entry *driver_compats) { return device_compatible_match_id_internal(id, (uintptr_t)-1, sentinel_id, driver_compats, NULL); } /* * device_compatible_lookup: * * Look up and return the device_compatible_entry, using the * same matching criteria used by device_compatible_match(). */ const struct device_compatible_entry * device_compatible_lookup(const char **device_compats, int ndevice_compats, const struct device_compatible_entry *driver_compats) { const struct device_compatible_entry *dce; if (device_compatible_match_strarray_internal(device_compats, ndevice_compats, driver_compats, &dce, strarray_match)) { return dce; } return NULL; } /* * device_compatible_plookup: * * Look up and return the device_compatible_entry, using the * same matching criteria used by device_compatible_pmatch(). 
*/ const struct device_compatible_entry * device_compatible_plookup(const char **device_compats, int ndevice_compats, const struct device_compatible_entry *driver_compats) { const struct device_compatible_entry *dce; if (device_compatible_match_strarray_internal(device_compats, ndevice_compats, driver_compats, &dce, strarray_pmatch)) { return dce; } return NULL; } /* * device_compatible_lookup_strlist: * * Like device_compatible_lookup(), but take the device * "compatible" strings as an OpenFirmware-style string * list. */ const struct device_compatible_entry * device_compatible_lookup_strlist( const char * const device_compats, size_t const device_compatsize, const struct device_compatible_entry *driver_compats) { const struct device_compatible_entry *dce; if (device_compatible_match_strlist_internal(device_compats, device_compatsize, driver_compats, &dce, strlist_match)) { return dce; } return NULL; } /* * device_compatible_plookup_strlist: * * Like device_compatible_plookup(), but take the device * "compatible" strings as an OpenFirmware-style string * list. */ const struct device_compatible_entry * device_compatible_plookup_strlist( const char * const device_compats, size_t const device_compatsize, const struct device_compatible_entry *driver_compats) { const struct device_compatible_entry *dce; if (device_compatible_match_strlist_internal(device_compats, device_compatsize, driver_compats, &dce, strlist_pmatch)) { return dce; } return NULL; } /* * device_compatible_lookup_id: * * Like device_compatible_lookup(), but takes a single * unsigned integer device ID. */ const struct device_compatible_entry * device_compatible_lookup_id( uintptr_t const id, uintptr_t const sentinel_id, const struct device_compatible_entry *driver_compats) { const struct device_compatible_entry *dce; if (device_compatible_match_id_internal(id, (uintptr_t)-1, sentinel_id, driver_compats, &dce)) { return dce; } return NULL; } /* * Power management related functions. 
*/ bool device_pmf_is_registered(device_t dev) { return (dev->dv_flags & DVF_POWER_HANDLERS) != 0; } bool device_pmf_driver_suspend(device_t dev, const pmf_qual_t *qual) { if ((dev->dv_flags & DVF_DRIVER_SUSPENDED) != 0) return true; if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0) return false; if (pmf_qual_depth(qual) <= DEVACT_LEVEL_DRIVER && dev->dv_driver_suspend != NULL && !(*dev->dv_driver_suspend)(dev, qual)) return false; dev->dv_flags |= DVF_DRIVER_SUSPENDED; return true; } bool device_pmf_driver_resume(device_t dev, const pmf_qual_t *qual) { if ((dev->dv_flags & DVF_DRIVER_SUSPENDED) == 0) return true; if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0) return false; if (pmf_qual_depth(qual) <= DEVACT_LEVEL_DRIVER && dev->dv_driver_resume != NULL && !(*dev->dv_driver_resume)(dev, qual)) return false; dev->dv_flags &= ~DVF_DRIVER_SUSPENDED; return true; } bool device_pmf_driver_shutdown(device_t dev, int how) { if (*dev->dv_driver_shutdown != NULL && !(*dev->dv_driver_shutdown)(dev, how)) return false; return true; } void device_pmf_driver_register(device_t dev, bool (*suspend)(device_t, const pmf_qual_t *), bool (*resume)(device_t, const pmf_qual_t *), bool (*shutdown)(device_t, int)) { dev->dv_driver_suspend = suspend; dev->dv_driver_resume = resume; dev->dv_driver_shutdown = shutdown; dev->dv_flags |= DVF_POWER_HANDLERS; } void device_pmf_driver_deregister(device_t dev) { device_lock_t dvl = device_getlock(dev); dev->dv_driver_suspend = NULL; dev->dv_driver_resume = NULL; mutex_enter(&dvl->dvl_mtx); dev->dv_flags &= ~DVF_POWER_HANDLERS; while (dvl->dvl_nlock > 0 || dvl->dvl_nwait > 0) { /* Wake a thread that waits for the lock. That * thread will fail to acquire the lock, and then * it will wake the next thread that waits for the * lock, or else it will wake us. */ cv_signal(&dvl->dvl_cv); pmflock_debug(dev, __func__, __LINE__); cv_wait(&dvl->dvl_cv, &dvl->dvl_mtx); pmflock_debug(dev, __func__, __LINE__); } mutex_exit(&dvl->dvl_mtx); } void device_pmf_driver_child_register(device_t dev) { device_t parent = device_parent(dev); if (parent == NULL || parent->dv_driver_child_register == NULL) return; (*parent->dv_driver_child_register)(dev); } void device_pmf_driver_set_child_register(device_t dev, void (*child_register)(device_t)) { dev->dv_driver_child_register = child_register; } static void pmflock_debug(device_t dev, const char *func, int line) { #ifdef PMFLOCK_DEBUG device_lock_t dvl = device_getlock(dev); const char *curlwp_name; if (curlwp->l_name != NULL) curlwp_name = curlwp->l_name; else curlwp_name = curlwp->l_proc->p_comm; aprint_debug_dev(dev, "%s.%d, %s dvl_nlock %d dvl_nwait %d dv_flags %x\n", func, line, curlwp_name, dvl->dvl_nlock, dvl->dvl_nwait, dev->dv_flags); #endif /* PMFLOCK_DEBUG */ } static bool device_pmf_lock1(device_t dev) { device_lock_t dvl = device_getlock(dev); while (device_pmf_is_registered(dev) && dvl->dvl_nlock > 0 && dvl->dvl_holder != curlwp) { dvl->dvl_nwait++; pmflock_debug(dev, __func__, __LINE__); cv_wait(&dvl->dvl_cv, &dvl->dvl_mtx); pmflock_debug(dev, __func__, __LINE__); dvl->dvl_nwait--; } if (!device_pmf_is_registered(dev)) { pmflock_debug(dev, __func__, __LINE__); /* We could not acquire the lock, but some other thread may * wait for it, also. Wake that thread. 
*/ cv_signal(&dvl->dvl_cv); return false; } dvl->dvl_nlock++; dvl->dvl_holder = curlwp; pmflock_debug(dev, __func__, __LINE__); return true; } bool device_pmf_lock(device_t dev) { bool rc; device_lock_t dvl = device_getlock(dev); mutex_enter(&dvl->dvl_mtx); rc = device_pmf_lock1(dev); mutex_exit(&dvl->dvl_mtx); return rc; } void device_pmf_unlock(device_t dev) { device_lock_t dvl = device_getlock(dev); KASSERT(dvl->dvl_nlock > 0); mutex_enter(&dvl->dvl_mtx); if (--dvl->dvl_nlock == 0) dvl->dvl_holder = NULL; cv_signal(&dvl->dvl_cv); pmflock_debug(dev, __func__, __LINE__); mutex_exit(&dvl->dvl_mtx); } device_lock_t device_getlock(device_t dev) { return &dev->dv_lock; } void * device_pmf_bus_private(device_t dev) { return dev->dv_bus_private; } bool device_pmf_bus_suspend(device_t dev, const pmf_qual_t *qual) { if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0) return true; if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0 || (dev->dv_flags & DVF_DRIVER_SUSPENDED) == 0) return false; if (pmf_qual_depth(qual) <= DEVACT_LEVEL_BUS && dev->dv_bus_suspend != NULL && !(*dev->dv_bus_suspend)(dev, qual)) return false; dev->dv_flags |= DVF_BUS_SUSPENDED; return true; } bool device_pmf_bus_resume(device_t dev, const pmf_qual_t *qual) { if ((dev->dv_flags & DVF_BUS_SUSPENDED) == 0) return true; if (pmf_qual_depth(qual) <= DEVACT_LEVEL_BUS && dev->dv_bus_resume != NULL && !(*dev->dv_bus_resume)(dev, qual)) return false; dev->dv_flags &= ~DVF_BUS_SUSPENDED; return true; } bool device_pmf_bus_shutdown(device_t dev, int how) { if (*dev->dv_bus_shutdown != NULL && !(*dev->dv_bus_shutdown)(dev, how)) return false; return true; } void device_pmf_bus_register(device_t dev, void *priv, bool (*suspend)(device_t, const pmf_qual_t *), bool (*resume)(device_t, const pmf_qual_t *), bool (*shutdown)(device_t, int), void (*deregister)(device_t)) { dev->dv_bus_private = priv; dev->dv_bus_resume = resume; dev->dv_bus_suspend = suspend; dev->dv_bus_shutdown = shutdown; dev->dv_bus_deregister = deregister; } void device_pmf_bus_deregister(device_t dev) { if (dev->dv_bus_deregister == NULL) return; (*dev->dv_bus_deregister)(dev); dev->dv_bus_private = NULL; dev->dv_bus_suspend = NULL; dev->dv_bus_resume = NULL; dev->dv_bus_deregister = NULL; } void * device_pmf_class_private(device_t dev) { return dev->dv_class_private; } bool device_pmf_class_suspend(device_t dev, const pmf_qual_t *qual) { if ((dev->dv_flags & DVF_CLASS_SUSPENDED) != 0) return true; if (pmf_qual_depth(qual) <= DEVACT_LEVEL_CLASS && dev->dv_class_suspend != NULL && !(*dev->dv_class_suspend)(dev, qual)) return false; dev->dv_flags |= DVF_CLASS_SUSPENDED; return true; } bool device_pmf_class_resume(device_t dev, const pmf_qual_t *qual) { if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0) return true; if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0 || (dev->dv_flags & DVF_DRIVER_SUSPENDED) != 0) return false; if (pmf_qual_depth(qual) <= DEVACT_LEVEL_CLASS && dev->dv_class_resume != NULL && !(*dev->dv_class_resume)(dev, qual)) return false; dev->dv_flags &= ~DVF_CLASS_SUSPENDED; return true; } void device_pmf_class_register(device_t dev, void *priv, bool (*suspend)(device_t, const pmf_qual_t *), bool (*resume)(device_t, const pmf_qual_t *), void (*deregister)(device_t)) { dev->dv_class_private = priv; dev->dv_class_suspend = suspend; dev->dv_class_resume = resume; dev->dv_class_deregister = deregister; } void device_pmf_class_deregister(device_t dev) { if (dev->dv_class_deregister == NULL) return; (*dev->dv_class_deregister)(dev); dev->dv_class_private = NULL; 
dev->dv_class_suspend = NULL; dev->dv_class_resume = NULL; dev->dv_class_deregister = NULL; } bool device_active(device_t dev, devactive_t type) { size_t i; if (dev->dv_activity_count == 0) return false; for (i = 0; i < dev->dv_activity_count; ++i) { if (dev->dv_activity_handlers[i] == NULL) break; (*dev->dv_activity_handlers[i])(dev, type); } return true; } bool device_active_register(device_t dev, void (*handler)(device_t, devactive_t)) { void (**new_handlers)(device_t, devactive_t); void (**old_handlers)(device_t, devactive_t); size_t i, old_size, new_size; int s; old_handlers = dev->dv_activity_handlers; old_size = dev->dv_activity_count; KASSERT(old_size == 0 || old_handlers != NULL); for (i = 0; i < old_size; ++i) { KASSERT(old_handlers[i] != handler); if (old_handlers[i] == NULL) { old_handlers[i] = handler; return true; } } new_size = old_size + 4; new_handlers = kmem_alloc(sizeof(void *) * new_size, KM_SLEEP); for (i = 0; i < old_size; ++i) new_handlers[i] = old_handlers[i]; new_handlers[old_size] = handler; for (i = old_size+1; i < new_size; ++i) new_handlers[i] = NULL; s = splhigh(); dev->dv_activity_count = new_size; dev->dv_activity_handlers = new_handlers; splx(s); if (old_size > 0) kmem_free(old_handlers, sizeof(void *) * old_size); return true; } void device_active_deregister(device_t dev, void (*handler)(device_t, devactive_t)) { void (**old_handlers)(device_t, devactive_t); size_t i, old_size; int s; old_handlers = dev->dv_activity_handlers; old_size = dev->dv_activity_count; for (i = 0; i < old_size; ++i) { if (old_handlers[i] == handler) break; if (old_handlers[i] == NULL) return; /* XXX panic? */ } if (i == old_size) return; /* XXX panic? */ for (; i < old_size - 1; ++i) { if ((old_handlers[i] = old_handlers[i + 1]) != NULL) continue; if (i == 0) { s = splhigh(); dev->dv_activity_count = 0; dev->dv_activity_handlers = NULL; splx(s); kmem_free(old_handlers, sizeof(void *) * old_size); } return; } old_handlers[i] = NULL; } /* Return true iff the device_t `dev' exists at generation `gen'. */ static bool device_exists_at(device_t dv, devgen_t gen) { return (dv->dv_del_gen == 0 || dv->dv_del_gen > gen) && dv->dv_add_gen <= gen; } static bool deviter_visits(const deviter_t *di, device_t dv) { return device_exists_at(dv, di->di_gen); } /* * Device Iteration * * deviter_t: a device iterator. Holds state for a "walk" visiting * each device_t's in the device tree. * * deviter_init(di, flags): initialize the device iterator `di' * to "walk" the device tree. deviter_next(di) will return * the first device_t in the device tree, or NULL if there are * no devices. * * `flags' is one or more of DEVITER_F_RW, indicating that the * caller intends to modify the device tree by calling * config_detach(9) on devices in the order that the iterator * returns them; DEVITER_F_ROOT_FIRST, asking for the devices * nearest the "root" of the device tree to be returned, first; * DEVITER_F_LEAVES_FIRST, asking for the devices furthest from * the root of the device tree, first; and DEVITER_F_SHUTDOWN, * indicating both that deviter_init() should not respect any * locks on the device tree, and that deviter_next(di) may run * in more than one LWP before the walk has finished. * * Only one DEVITER_F_RW iterator may be in the device tree at * once. * * DEVITER_F_SHUTDOWN implies DEVITER_F_RW. * * Results are undefined if the flags DEVITER_F_ROOT_FIRST and * DEVITER_F_LEAVES_FIRST are used in combination. 
* * deviter_first(di, flags): initialize the device iterator `di' * and return the first device_t in the device tree, or NULL * if there are no devices. The statement * * dv = deviter_first(di); * * is shorthand for * * deviter_init(di); * dv = deviter_next(di); * * deviter_next(di): return the next device_t in the device tree, * or NULL if there are no more devices. deviter_next(di) * is undefined if `di' was not initialized with deviter_init() or * deviter_first(). * * deviter_release(di): stops iteration (subsequent calls to * deviter_next() will return NULL), releases any locks and * resources held by the device iterator. * * Device iteration does not return device_t's in any particular * order. An iterator will never return the same device_t twice. * Device iteration is guaranteed to complete---i.e., if deviter_next(di) * is called repeatedly on the same `di', it will eventually return * NULL. It is ok to attach/detach devices during device iteration. */ void deviter_init(deviter_t *di, deviter_flags_t flags) { device_t dv; memset(di, 0, sizeof(*di)); if ((flags & DEVITER_F_SHUTDOWN) != 0) flags |= DEVITER_F_RW; mutex_enter(&alldevs_lock); if ((flags & DEVITER_F_RW) != 0) alldevs_nwrite++; else alldevs_nread++; di->di_gen = alldevs_gen++; di->di_flags = flags; switch (di->di_flags & (DEVITER_F_LEAVES_FIRST|DEVITER_F_ROOT_FIRST)) { case DEVITER_F_LEAVES_FIRST: TAILQ_FOREACH(dv, &alldevs, dv_list) { if (!deviter_visits(di, dv)) continue; di->di_curdepth = MAX(di->di_curdepth, dv->dv_depth); } break; case DEVITER_F_ROOT_FIRST: TAILQ_FOREACH(dv, &alldevs, dv_list) { if (!deviter_visits(di, dv)) continue; di->di_maxdepth = MAX(di->di_maxdepth, dv->dv_depth); } break; default: break; } deviter_reinit(di); mutex_exit(&alldevs_lock); } static void deviter_reinit(deviter_t *di) { KASSERT(mutex_owned(&alldevs_lock)); if ((di->di_flags & DEVITER_F_RW) != 0) di->di_prev = TAILQ_LAST(&alldevs, devicelist); else di->di_prev = TAILQ_FIRST(&alldevs); } device_t deviter_first(deviter_t *di, deviter_flags_t flags) { deviter_init(di, flags); return deviter_next(di); } static device_t deviter_next2(deviter_t *di) { device_t dv; KASSERT(mutex_owned(&alldevs_lock)); dv = di->di_prev; if (dv == NULL) return NULL; if ((di->di_flags & DEVITER_F_RW) != 0) di->di_prev = TAILQ_PREV(dv, devicelist, dv_list); else di->di_prev = TAILQ_NEXT(dv, dv_list); return dv; } static device_t deviter_next1(deviter_t *di) { device_t dv; KASSERT(mutex_owned(&alldevs_lock)); do { dv = deviter_next2(di); } while (dv != NULL && !deviter_visits(di, dv)); return dv; } device_t deviter_next(deviter_t *di) { device_t dv = NULL; mutex_enter(&alldevs_lock); switch (di->di_flags & (DEVITER_F_LEAVES_FIRST|DEVITER_F_ROOT_FIRST)) { case 0: dv = deviter_next1(di); break; case DEVITER_F_LEAVES_FIRST: while (di->di_curdepth >= 0) { if ((dv = deviter_next1(di)) == NULL) { di->di_curdepth--; deviter_reinit(di); } else if (dv->dv_depth == di->di_curdepth) break; } break; case DEVITER_F_ROOT_FIRST: while (di->di_curdepth <= di->di_maxdepth) { if ((dv = deviter_next1(di)) == NULL) { di->di_curdepth++; deviter_reinit(di); } else if (dv->dv_depth == di->di_curdepth) break; } break; default: break; } mutex_exit(&alldevs_lock); return dv; } void deviter_release(deviter_t *di) { bool rw = (di->di_flags & DEVITER_F_RW) != 0; mutex_enter(&alldevs_lock); if (rw) --alldevs_nwrite; else --alldevs_nread; /* XXX wake a garbage-collection thread */ mutex_exit(&alldevs_lock); } const char * cfdata_ifattr(const struct cfdata *cf) { return cf->cf_pspec->cfp_iattr; 
}

bool
ifattr_match(const char *snull, const char *t)
{
	return (snull == NULL) || strcmp(snull, t) == 0;
}

void
null_childdetached(device_t self, device_t child)
{
	/* do nothing */
}

static void
sysctl_detach_setup(struct sysctllog **clog)
{

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_BOOL, "detachall",
	    SYSCTL_DESCR("Detach all devices at shutdown"),
	    NULL, 0, &detachall, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);
}

r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
setrlimit(0x8, &(0x7f0000000980))
dup(r0)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0x8004667e, &(0x7f0000000100))


compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
fchmod(0xffffffffffffffff, 0x0)


pipe2(&(0x7f0000000000), 0x0)
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
connect$unix(0xffffffffffffffff, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0)


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000200)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$setstatus(r0, 0x4, 0x0)


openat$tprof(0xffffffffffffff9c, &(0x7f0000000000), 0x1, 0x0)


chroot(&(0x7f0000000000)='.\x00')
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
__getvfsstat90(&(0x7f0000000400), 0xce0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')


compat_43_osetrlimit(0x9, &(0x7f0000000080))
socket(0x11, 0x3, 0x0)


connect$unix(0xffffffffffffffff, 0xffffffffffffffff, 0x10)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
ptrace(0xd, r0, 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
close(r0)
r1 = socket(0x18, 0x3, 0x0)
setsockopt(r1, 0x1000000029, 0x31, &(0x7f00000000c0)="b211d7170d816685c8e360f2fa41c1a0946988b272d2dd3dc90142a84231a746e337b372e93320cff6669cbe7868de45ed3fc33719ca6df71ecec8a918458b2c10a1f8c66653b276e180e9cb9b21f9982230f575295d48889c9a920796b2dd92fc8575680b37ba955d2c15e6d7c9198ed900ab006ddfb67869b51a2216114d1ece85f593e74035f5bc054eb1dbddf42a", 0x90)
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0))
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
writev(r1, &(0x7f0000000500)=[{&(0x7f00000005c0)="294bbe01e775d42b920587bee9952844c93c122f517f000000e5395a3adbd32b44bbf70100fbd963abbeba59c23c5310add84f1332990c7b3c89185df9f097003f67ca6918f9363386bef4cd0aa3d52bbf1ab75d8c4102f4e3138c229004357738d4ecaca017eb674086edbe016ad9c91ab1fbde25895467c8dce7fff6040000c8abca0036fb92c3cdeffde58eafef99abad4d0c0b9d3cd358d9552dd02afeb2dcdbad04", 0xa4}, {&(0x7f00000001c0)="4fb753dcdadfc366ed6c604d2880be6ecba35fdb2c1643bc9bbfe37bf9b31d625e398beca1d2d858cd37255afed6c1fb00ccf3a4da033bb92a5cc65597870c034aac4125adc0d3960e1c71b921d94624033f62bad195480fe568ecb8a37527d4e5a4306c591cc35c181e9b88e80074bf3157b8dea45e3391e8246c6ba2a894", 0x7f}, {&(0x7f0000000240)="a29228fcd8b93635bd8ce9b958fc56277452f4bd7372ef7f9829fc5fc6f55a034732c8f770149389111a04ee9a4d4ea18cc39157341d5f083e3275bdba233d65aa00e5416ced2bdb35a2d0d7544e2886598fa027e0c681635e3c5902497ab520a51f694a457dec3eb0bdc737408f856cc9f41d12255d5f77658faf335ab0f25a7330b20d57d9936f2909c4a030a1b3122001ddd6607e740f00000000000017a7699073d9497074bd10a6112e2acaefbdd2e9ff71c4292c082da70a15844eb00c9734ef5d2b2a7fa4f3403567b0e6f0d862015f8ad2d31268a9b957a4850accf9615634f6d247a2c9e338c08ab3da458c4312986966ab546f0ad48961f323906ea0fe454b2b9932a94ad1d8d7e2bf3ffc5a48b0127c8b417b678d35193bfd50d740eb93219b6d77f57ac3051d459ba41d2c07ed25f7b867d66d4692d9654f85821a44333a73c58f163431c692da984a52561aa97c729feb9fa6144263b28733c857188b16e5", 0x165}], 0x3)


setrlimit(0x0, &(0x7f00000000c0))
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
compat_43_ommap(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x0, 0x402, 0xffffffffffffffff, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
__getfh30(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)


_lwp_create(0x0, 0x0, 0x0)
setpriority(0x0, 0x0, 0x8c1)
setpriority(0x1, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104302, &(0x7f00000001c0))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f00000000c0)='./file0\x00', 0x0, 0x0, 0x0)
open(0x0, 0x0, 0x0)
pathconf(&(0x7f0000000280)='./file0\x00', 0x2)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
rasctl(&(0x7f0000000140)="ef4b46576915a8a1d8a7483629690845415a47a4e17aac216f66f971fb993b769ad8a320eaedac75724c8c01e1961608e255daaf400723f09306fa50b306c92fae5937e6aae5b1ca1b1c53e718602c3419959c9501d12529f96e4e3afa2888a20d123f94b38b39ac2e15279d634cbe243c56a16b41d05c32078511fddd0b7198ec20328ac38342de4527eda6ce53a5", 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80104303, &(0x7f00000001c0))


__mount50(&(0x7f0000000080)='overlay\x00', &(0x7f0000000040)='.\x00', 0x1000000, &(0x7f0000000540), 0x0)


ioctl$FIOSEEKHOLE(0xffffffffffffffff, 0x80206931, 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)})
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
r1 = dup(r0)
preadv(r1, &(0x7f0000000240), 0x100000000000030f, 0x0)


mknod$loop(&(0x7f0000000000)='./file0\x00', 0x2000, 0x1)
modctl$MODCTL_UNLOAD(0x2, 0x0)
link(0x0, 0x0)
rename(0x0, 0x0)
rename(0x0, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000280)='./bus\x00', 0x0, 0x0)
ioctl$KDMKTONE(r0, 0x20004b08)


r0 = open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r1 = open$dir(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
renameat(r1, &(0x7f0000000f40)='./file0\x00', r0, &(0x7f0000001040)='./file0\x00')


__mount50(&(0x7f00000002c0)='nfs\x00', &(0x7f0000000000)='.\x00', 0x0, &(0x7f0000000180)='l', 0x1)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
setrlimit(0x8, &(0x7f0000000100))
ktrace(&(0x7f0000000040)='./file0\x00', 0x0, 0x0, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x26, r0, 0x0, 0x0)


r0 = socket$inet6(0x18, 0x3, 0x0)
mknodat(0xffffffffffffff9c, &(0x7f0000000100)='./file0\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000040)='./file0/file0\x00', 0xffffffffffffffff, 0x0)
sendto$inet6(r0, &(0x7f0000000000)=':', 0x358, 0x2, &(0x7f0000000040)={0x18, 0x3, 0x0, 0x20080fe}, 0x1c)


msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0x0, 0x0, 0x0, 0x101}})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x1, 0x1, 0x0)
close(r0)
r1 = socket(0x18, 0x2, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
write(r1, &(0x7f0000000c00)="ee4b847cba24c072bcc4d235436c44ff765bbdee055f55fb5648914dcb5e3d889c7e49d11cc1e338cf6dc800b3ffc3be419bd83095489ef712a05f2547abf74a946852d3984fa7aece83a41f6567761dd1064322c6155f566373d0321845bd0c529c171e9921661522362aa95c04e876acc9dd4de9765d6cbbbcf03c225c955e1e2d2b7e7a50b6fcc0dfb13824b4176b4a470980b32f879a2f227450a229af0308eb50222c75a9980353dd62f4d0e4b2e501834ea93ae4cde92875f6c735ee3ff5d3d8074f43f7c2f04b2831957e53b1cc7186905088ebf996f9158672e026ae3e4a9742e9e0bc5c7a494710c79a23f75679e7c1975d3a50e430a6beba5fb8a41cc8a1198a4c32856870d0fd193d6847033bdf837a11fada6a1035846bb31c10f5cbf566a5730cd677c7b53644fee38269ba1b2df39f9be1959ea2626ae4010a4edbd9e8eca796220cd80ce4e1f0ee1720037a73bf55c1590e7866938029f36fad", 0x161)


open$dir(0x0, 0x8, 0x0)
pipe2(&(0x7f0000000780)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff}, 0x0)
readv(r0, &(0x7f00000003c0)=[{&(0x7f0000000940)=""/142, 0x8e}], 0x1)
getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x1022, 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x0, 0x0)
mknodat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
mknodat(0xffffffffffffffff, 0x0, 0x0, 0x0)
execve(0x0, 0x0, 0x0)
writev(r1, &(0x7f0000000400)=[{0x0}, {&(0x7f00000006c0)="1a17d80e55dd55818fc50629d4e1832e0bffffffff957759de62e763e90d2dd4649b24f8bfb4f3d1b6571d6f26c72273119f030c9314", 0x36}], 0x2)


mknod(&(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
mknod(&(0x7f0000000080)='./bus\x00', 0x2000, 0x0)
link(&(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000340)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000480)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f00000006c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000580)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
unlink(&(0x7f0000000a80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f00000007c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000a40)='./file0\x00')
mknod$loop(&(0x7f0000001040)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000340)={0x3, 0x0, 0x0, 0x100000001})
r1 = open(&(0x7f0000000040)='./file0\x00', 0x205, 0x0)
fcntl$lock(r1, 0x8, &(0x7f0000000000)={0x4, 0x0, 0xfffffffffffffffe, 0x1000300010008, 0xffffffffffffffff})


r0 = socket(0x2, 0x3, 0x0)
connect$inet(r0, &(0x7f00000000c0)={0x2, 0x2}, 0xc)


_lwp_setname(0xffffffffffffffff, &(0x7f0000000000)='\x00\x00\x00\x00\x00\x00&\xc1\x87I\xdd\xfda\x11E\x81\xcb\x9bE^\r*1TGA\xee\xee\xd0\x11\xd3\xc2\x12')
minherit(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x0)
__clone(0x0, &(0x7f0000000040))
mlock(&(0x7f0000009000/0x2000)=nil, 0x2000)


socketpair$unix(0x1, 0x0, 0x0, 0x0)
open(&(0x7f0000000500)='./file0\x00', 0x70e, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x144, r0)
r1 = msgget$private(0x0, 0x100)
msgrcv(r1, 0x0, 0x0, 0x0, 0x0)
msgsnd(r1, &(0x7f0000001540)=ANY=[@ANYBLOB], 0x32, 0x0)


munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
shmat(r0, &(0x7f0000001000/0x3000)=nil, 0x0)
mincore(&(0x7f0000000000/0x9000)=nil, 0x9000, &(0x7f0000000640)=""/115)


open$dir(0x0, 0x0, 0x0)
__utimes50(0x0, &(0x7f00000000c0)={0x0, 0xfffffffffffffbff})


modctl$MODCTL_LOAD(0x4, 0xffffffffffffffff)


minherit(&(0x7f000009f000/0x2000)=nil, 0x2000, 0x6)
__clock_gettime50(0x2, 0x0)
mknod(&(0x7f0000000480)='./file1\x00', 0x2, 0x1733)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc020697e, &(0x7f0000000040)=0x8000000000000031)
open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x4ebfac6bbaf7959)
writev(0xffffffffffffffff, 0x0, 0x0)
syz_emit_ethernet(0x0, 0x0)
syz_extract_tcp_res(0x0, 0x0, 0x0)
r1 = semget$private(0x0, 0x7, 0x3c0)
semop(r1, &(0x7f0000000180)=[{0x0, 0x43, 0x1800}, {0x4, 0xe6, 0x1800}, {0x0, 0xfd, 0x1000}, {0x1, 0x20}, {0x2, 0x5, 0x1800}, {0x4, 0x9e, 0x1000}, {0x2, 0xfffb, 0x1000}, {0x3}], 0x8)


mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
chmod(&(0x7f0000000280)='./file0\x00', 0x3a)
chdir(&(0x7f0000000240)='./file0\x00')
setreuid(0x0, 0xee01)
mkdirat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
r1 = open(&(0x7f00000002c0)='./bus\x00', 0x0, 0x0)
read(r1, &(0x7f0000000180)=""/198, 0xc6)
ioctl$FIOASYNC(r0, 0x80017472, &(0x7f00000001c0))


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000400), 0x0)
compat_50_utimes(&(0x7f0000000480)='./file0\x00', 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
setsockopt(0xffffffffffffffff, 0x0, 0x0, &(0x7f0000000000)="5ab777", 0x3)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0)=0x20000002)


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x38)
rename(&(0x7f0000000040)='./file0\x00', &(0x7f0000000080)='./file0\x00')
r0 = semget(0x0, 0x1, 0x1)
compat_50_____semctl13$GETALL(r0, 0x0, 0x6, &(0x7f00000000c0))
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000100)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
mprotect(&(0x7f0000ffa000/0x3000)=nil, 0x3000, 0x0)
mincore(&(0x7f0000ffa000/0x3000)=nil, 0x3000, &(0x7f0000000140)=""/224)
shmget$private(0x0, 0x1000, 0x85, &(0x7f0000ffc000/0x1000)=nil)
getsockopt$SO_PEERCRED(r1, 0xffff, 0x11, &(0x7f0000000240), 0xc)
__fhstat50(&(0x7f0000000280)="5c4c1f9f0a277ea9f61113c53231c62c24694bbdbead9fdcf22f2916be108c91a721799c5ee849e31e23a258b1c6fdadad19f336eb5ff0fa8d61d9e377cb4a402882204b4d1b4d10f37bbfc5e8cbdfa7af26186af65b940c81927f60f53771e2c12ad11000be907417e2af45f7123364764da5f1148db2e040330b54234fd408531469f3a1c4b240241f4456ae83b19699364de34f1430fe5d2d13711884921521d48d3309df5d3bbda91dd9176f01bb6cca461438c4f7968d24bac58c18215a9f9b08645324062005f841268ae37e0064e119", 0xd3, &(0x7f0000000380))
__fstat50(r2, &(0x7f0000000440))
r3 = getpgid(0x0)
getpriority(0x1, r3)


clock_nanosleep(0x0, 0x1, &(0x7f0000000100), 0x0)


_ksem_init(0x0, &(0x7f0000000100)=<r0=>0x50535244)
_ksem_close(0x0)
_ksem_destroy(r0)
_ksem_close(r0)


open(&(0x7f0000000100)='./file0\x00', 0x200, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x3, 0x10, r0, 0x0, 0x0)
mprotect(&(0x7f00001ef000/0x2000)=nil, 0x2000, 0x0)


compat_40_mount(&(0x7f0000000200)='cd9660\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
__mount50(&(0x7f0000000180)='efs\x00', &(0x7f0000000200)='./file0\x00', 0x0, &(0x7f0000000300)="13", 0x1)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xb}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x200000000000c, &(0x7f0000000080)="eaef125c00000000", 0x8)
execve(&(0x7f0000000200)='./file1\x00', 0x0, 0x0)


mlockall(0x3)


chroot(0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(&(0x7f0000000380)='.\x02\x00', &(0x7f00000002c0)='.\x02\x00')
rename(&(0x7f00000000c0)='.\x02\x00', &(0x7f0000000100)='./file0/file0/../file0\x00')


mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='msdos\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000001c0))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xb}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(0x0, 0x0)
lchown(0x0, 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
truncate(&(0x7f0000000200)='./file0\x00', 0x0, 0x7fff)
truncate(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


symlink(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/../file0\x00', 0x0)
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x80104267, &(0x7f0000000080))


symlink(&(0x7f000001fb80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/file0\x00', &(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
link(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/file0\x00')


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x5900)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)


__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, 0x0, 0x0)


socketpair(0x1f, 0x3, 0x0, 0x0)


open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
r0 = open$dir(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
preadv(r0, &(0x7f00000012c0)=[{0x0, 0x19}], 0x1, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
faccessat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x2, 0x0)


utimensat(0xffffffffffffffff, 0x0, &(0x7f0000000300)={{0x0, 0x3fffffff}}, 0x0)


r0 = posix_spawn(0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
ptrace(0x6f, r0, 0x0, 0x0)


_ksem_unlink(0x0)


r0 = socket(0x18, 0x1, 0x0)
r1 = socket(0x18, 0x1, 0x0)
setsockopt(r1, 0x1000000029, 0x4, &(0x7f0000000040)="03000000", 0x4)
dup2(r1, r0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f0000000180)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80017472, &(0x7f00000001c0))


mknod(&(0x7f0000000280)='./file0\x00', 0x6000, 0x500)
mkdirat(0xffffffffffffffff, &(0x7f0000000000)='./file0\x00', 0x0)
swapctl$SWAP_ON(0x7, &(0x7f0000000000), 0x0)


mknod(&(0x7f0000000040)='./bus\x00', 0x100000000205f, 0x2802)
socket(0x11, 0x3, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x200000000000c, &(0x7f0000000080), 0x0)
setsockopt$inet_opts(r0, 0x0, 0xd, &(0x7f0000000240), 0x0)
ktrace(&(0x7f0000000100)='./bus\x00', 0x1, 0x4000010e, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000000)}})
compat_40_mount(&(0x7f0000000040)='ext2fs\x00', &(0x7f0000000140)='./file0\x00', 0x200, &(0x7f00000001c0))


unlink(0x0)
mlockall(0x3)
munmap(&(0x7f0000ffc000/0x4000)=nil, 0x4000)
r0 = shmget$private(0x0, 0x4000, 0x0, &(0x7f0000ffb000/0x4000)=nil)
shmat(r0, &(0x7f0000ffc000/0x3000)=nil, 0x0)
mprotect(&(0x7f0000ffe000/0x1000)=nil, 0x1000, 0x0)


compat_40_mount(0x0, 0x0, 0x0, &(0x7f0000000080)="189595edeb718a18fb")
r0 = socket(0x2, 0x2, 0x0)
ioctl$FIONREAD(r0, 0xc0106914, &(0x7f0000000080))


_ksem_init(0x0, &(0x7f0000000400)=<r0=>0x0)
_ksem_timedwait(r0, &(0x7f0000000040)={0x80000000})


socket$unix(0x1, 0x0, 0x0)
r0 = socket$inet(0x2, 0x3, 0x0)
ioctl$FIOGETBMAP(r0, 0xc008667a, &(0x7f0000001480))


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
link(&(0x7f0000000940)='./file0\x00', 0x0)


r0 = socket(0x11, 0x3, 0x0)
sendmsg(r0, &(0x7f0000002880)={0x0, 0x0, 0x0, 0x0, &(0x7f0000001440)=[{0xd0, 0x0, 0x0, "3f38455ebe81793dad9924b5c520f99448e539aaf0dc839129b6e3ac10f8f511433e2d1e25c2b4dccd4dff8e3b840912467eda15f085f1edbbb8778f96430952988190e0da533c5dc821327dfbbc0dcd00160be1962f6f55fbbb5d350cc329667b4c8490cc377807d52fdf22b107f7a60f1886c493172eb77715893126136681b343b2339d30e45c32acd484285ad06522090c69ba1a89f2253b0d6be8190e1413bee597d9e949cf738aaa2c82985dc5498129a45c1ba26053"}, {0x18, 0x0, 0x0, "87"}, {0x50, 0x0, 0x0, "6a9d6affd45d34f176fc394b3b2532048ec2218f9cd290132fcc2b17d8cd32104e08678b6332e5203a9c21ac14b5cf04a0d3a2f302acd9511d"}, {0x70, 0x0, 0x0, "35d00b1b57a6d123ba8403c7dd3c608df608ebd082b054182874411f489fa55345e668ede94b6e7dc6cf58882bb61aa5ffd3d03c7b2f5394edbc45c680b6b7a621fe97901b610c720ab54abdca92f322770a062c3520e3b064"}, {0xf0, 0x0, 0x0, "a3fe80503d9b419f32ee2382d5b180804270bea9110ad9268b1ebafdaef74d7a997ec643e1ab017cfded341b70989969408e57ef83fcc22bbb0a23f11841cda579801486b6f0d53270461e5dbfd49f2d146b446e52d8329ffca683b6aed64fb9f0753d51ee363763fc8dce3d15c41423428d464e396c3f83fec10391e6fa205f0ff7f0f1fc64c5f1223c91ebc1bc28ea68543a300c9f1d4fab8282659ec042495696d001140ab23cccfb77eb7edf92d8d92fc2cf0f358efdb0f6ce063a3293d06bc2c62989c4412c5ce48c8a44266e62891c17ea851ea9254a"}, {0xd70, 0x0, 0x0, "6955bf76c7fbe54f1cbd52d03ed9c009448641d4a53aa0511ff7b1270d9f69dc569703b780a9bbf5a232e27e376b20b2ce38574e6c33042e636be1f0d53d2c1bdf1dcd028d066b3e8e7b2ec5826fada56693b2f8220d05bbdf04afc720b7a4e120aa0f28ad6f4dd624b671b101ac2de90ade4ba597924f2ac679ef29203aa751da3bf241b30f64684e87bf99896400e77e71cf9428450ff7e171dfb784951835931d1ba7caba3c227759afe975a3d2f67859c0c3d5821028c9ecb6f03364562f5cf20a4e7162c4d003305ebc3691006e6b14de0f71bd727425c91bb70adebd84f09ed1618dc4259a31520b60027c11cf90142874127c1f28acf13b23328dbcea8d792055fe9c80df29501cf70b6d486621fd5546d3fdccf16b746424f3f85ec4d121c2a2619b428c21a2a8063f3613aafb9e36a80bc58f23e28c7d51fcfd3b793bf9ee72f36c07d46ade9b3485c3fa71e260d385a9ae0503ceaadd3a378c24f8633125a61a9a170ce815259378857ea5a1ced820621231a566669014b082fd6a854c7d7d93bb1525dc47fa3299dfe3b85bb3617d5f6d0d135f9c0259174fa8e2937feecc815d60808f7bda1f46dc849c315649d6b1eeac546344fe2869892a88e64e8b3009f3e0cffd6919ca5b77d3a8078f9107f4f41972a653ea4e3273e3efa5a377da9d3ffb873839ed5d45502cf7f13022d02390550c9552b11a57f26d0d3455064ab0a468e0e4d4dba4786dbec98aabc8cbd780011e9f6c99738ddc6e5d70d33921414be94ad1fd4c3d53b5c76df9601e73992fa17dbeb57214c9055f9c6f3dcb938d9750d44a50ee076d13e8c197e51e39d57ea6859682dd5cb2d09564c7a2d79b808ffd21c0217a1b13f796687ea505cae1bbeb3704993c3332e161d43ca58659ca1e07d3d748e5c780d135cba3dc9aeab098ea7890bd37d070c5fac3d629b5ee84caab9c8e8e6185c8c5f50e75467711d9871840843f43fad7eabda546417ff645fec4d00bd5f6ffd49673346731f423c3591ea649ec923a0402c9ade3f9d04516595d67fa66ec74b3325b616174389f1fac36ddf818f172f35e86c0140cd7ec8b5fb2cd65443b26be2ae8e666f418dbcdc0cc32cd9ac40f08ce9c1bbc95fa392d9c51054ffd28d0058e4ada714ee3aadfb3bf0e14ecb4571b113ab80bcac1ab37becad3cfc9157590dad5efb197b2dd10acbffcc5867421952afe763ab73b1dd5555dcfd5b37ae8c9fb37b6611de8aa994c81689625456e1471bb67894047666524470ddaaca07c69468b5d73734e911e5a42ea5f559ae1c5bb4a70c5924f8e57b86c54572cf2bc936b37dd23aaae5f9d89cd7a466bc89912edbd583f9aed92abf1940056a9d2bb8055905f741f61e20876a42a379be9a682d101f881e5af88ded87efbed441f2c19e8bb256145772fb2246cbd03fdbf04c45b14efe04f40bafd802dc659d238374e77fab8165ee388da37b41611636da4b0e8b3cf35f22522cfad90c99be6652087db77a4aa4e9cee2d460ef858394c5a44e87df07de82980e9db3af748024af9a848082be82d9be53b8dcb5f2580e46d68fee70945b8e26282788da3d0a02aec8e53f488210283129dd674fd3b6f2607c43858a78a9a3e29f582975266e3639824dcdfb5c8a8b4e09607a105c24dc16982c8e3fa9a3e06db286f2e8d3b4e5d08f5ae7de156e4cc250dfd3d03d73488a48d29cffe7ffa452fee9bab8e9bc187dc91a8553bfd4465ce4d9728989a93dd059de129cd04fa782c00104a7b731d636a77b79da5d51cd06020c8510fb9241490f114ac914f1734fcfa5ec0fb4b658288c9b96a89355861194116c1b2059dcd0f220b1c41f9965428879f08bfeb0863423b241d1a002915e97bee54d9b62d216a2350895b9e7a93b2488aae095e44adff8c9f49df64b588492d20b9331039743d572e091d38c0192644ee5a30a460a7a39851e54efd5e94b0234eab5a3e95af33bb2a6eb01d1b3bb0afefdcc3fa63402c960f12c9f66d451eddb490ab3f1a3a7faa5e95ea808953eb9f60da70cfd8c2514357b894924c1c1687e86bf313b06ff232f324b7999101a17b6bcd13b7511d51a0c4e0311b68a5df7c6f0442b3e02f9b4e4d10df7c45abcbf295a403453b0f464c293f9c40528b978642faaad10dff2925c128623724aa27188d1da93babe1a8fb29f1cf3bfd76d185c7d8066613ff94758712a92423ea0c0bd517175bd8594d64b26a91605b785c9f2bc93f5bd5aee5143c1c9f00b0b1f3c1fae9411ed65a3a53de76eaa115a946d7b447b2061e49715aa18e2a2ea46f86c95be613054871c2b552e6207b1b48f45df8056c941105588c4b201a402380635db3184741a4b5370f33cf18ffb90a69f7bee834fc47be841413ff90b21fee2ffa18acd8ae8914e2c8b38a7d171b6f8f796ac0fff1b4462a502b60fe4d74af9e8f158c3c933e9dfe45277c46ba1ca39f78ef680883de59e289f86c6acb5a929a6113c26a25177953782d02d887302542d2939b675006a4910f2f24d9db472eb406b4cf36f92f15f4d8a3a37e46034ab7245691f5ddc3506fd1cb2771ca812d656d315674c7eef0903ccbc210ee8af138a3336eb1aa38ddf62f8e7f6e8bd987786deaa73d5cbf3f259db1e1f1bbd010d2bc1309a26632664b4eff70c501d511a4a497af931888a6132bc42bc4828fbeb19c8a0debb0d35793cdd293dfb6d043d09365b7f174f853407caa662143550cfc52b28dce74620ed887988e6d93926c52f4a5c74779aa464a235ef264463b681c4fb44bbc1222c76b10d1d0c23e3141b37dd39b64609857de7c35795d5ce124d3ec3b6fcad143715829b4d79961f6cc538e54b2636cc30aeff29cc262d711796f4ff2bf01855f9623b20d188262db1cc5a2ac046a3f1358e8e8c8c4b626f75eb69d71a7e3e6692148e8fc5a3c0f48a5c523f1694847fb15f034b2fc43da6f60a9d471730954f78a3edd4dab3f569a55e4cd4837b3ecba7f2a1f6e2bc0c117700dbb6794794b5ce865e7c514191665f7295fd9bc454e48da617540cfabc378a7976b7c9e16a6171d1b9a3e6e50e86c9cca87b3a269c30921bc453863f43a2e2de3fb5da512c622de677d984966d1f549cdf63bf3aba4cf9c61b7fe832f823cd843063c58ce65c21ca1ecb2882109b6c1317736c7510483e47cbef7f14cdd86259eab8240307566f1e3299c4fba81dc4582860582b06e877b818bf8cdd7536c60c3117689afaa2a471f03f12d7581c6407387fbcb6d8b99425ed73c8b2f9dd16a952f62acf51e863ea81348c6112bd695eecb29ae3b218367676f64431558b3e71db48bfaed26a0de9fe9a01cc79e6c0ac2bd97db9821ce4a026ce026326f7cdc287868af87b9914b8e5943480607f0cc76c18a9e8da86150c1088a445b1d601d1c7ff0ce9a099fb7ac2c9db2992df0dd14c9c1f1650497a75ae75d4f5ddadaa440fdf5654da2a1ff514bc5a7eecdc1a79acda29a02d106487b195f8a0436e2c705e100de5ade0b1aeb6027c3850594e17ddbb6cf70570bd5486617770bb1c7675ebc842277245c17b24e52d7a81c2157b8046373b428c9a66d8e8fbd930f8dd9973414d3f7127d5f031a1a65606ac0be5469d8bd025af1756b7ef65573b1eae29027d2b8dd8f5205532fa7fec961de85e869cbc70d8e7664badd8da623f34fa982293af47f7f93dc901b47fea40b2406767267ee67b13090d03bab03f4d917db59b47c9861f8b7f2fd0f8462880e8b417fd57cd3ef9a127934561dfa6ffd65a669e8686e84f7c9aa39bd63c653a3c225400ee83815951e8fec5e28e08c098e9ab1863ccb465c8ed585a7929ca5f730be728834a8520721cb1fc0768569053ecf6202621be65eba6c2f412d79a06a2e0b27de9d452ca2b86c902b2fd386fdad2c4cfa79cd768774277086d5675f774e91ee115eede68046185deab24711d749045215a9763ad706312406d7f68f88595bcfd328c078c6afde17019d4ae89d4204c590c1a623c57692611e06185e29721c59c6ef23bc6174cec34a21f4dc50867d306aa9fb3c737f5a8e562c48270038d23087abeb6306fa156789e91f51116ce817edfb179891f85093c2692184ba1a8b63557fe923fa3a16c9399ab4005a782171e28272f275d8ba8d7d1d2f25459354729296a490075e301a9d01791e078be0dacc09d65eb673be798d7af07ef81450574962d5ec14a7914266cdc40b21d45837db37bc62e353b02562087c8d59d1b490eda0f0c348ae7b75ba85899600884a8a17ad374fcb5a65992ddec90077dac2f50eebdfd20d2fce0ecdd63241391c8e8fe476ab8f878097632292a015d37ee4d2f5b0ab484120824b8490db2c72505e3ccfb2839e83e19a3f3ec422507c7ec0df7c6032fe2f9abff9789611ca2691d0d8dee866481978261d1621ca10fda73f8194de5e23061aede81e14bf58b4d9d007455ba3fee066fd2f82ee498911cfc052f0abfd728a0664ce8a18dcd26f9caab3360d5d7976ff8e4671386415dfdf608a3ca98f0068da92971b23402375ef167dac86162323e3ef4c629ba684430d015341ba70878f4743fa25906fc996d46e05d3e1c3cea407a5a1c6c856d39763542eb3ae2756745d8707435d787af3ff3b9cf95c24e78df2e839d880d92c5925c8597350c3cd6d5b47c3c20d6ad38f118c4831da313c61e3b576a0697c2a72f88f16c48cc8b3b1539f58d9a8fba653a3e1e63511970902ad50c378fd30465a7d773dd915de2657f071c66f114b1ffda8cbba5849d6ba72560b46cedff58918b87fe237cccee78f31d611cf5d33a15a02f9552a0a19362a"}], 0x1008}, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x200000000000c, &(0x7f0000000240)="ea00000100000000", 0x8)
r1 = socket$inet(0x2, 0x2, 0x0)
close(r1)
r2 = socket$inet(0x2, 0x3, 0x0)
dup2(r0, r2)
setsockopt$inet_opts(r2, 0x0, 0x200000000000c, &(0x7f0000000240)="ea02000000000000", 0x8)
setsockopt$inet_opts(r1, 0x0, 0xd, &(0x7f00000003c0)="ea00000100000000", 0x8)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0\x00')
unlink(&(0x7f0000000180)='./file0\x00')
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000300)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000a80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xb}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
mknod(&(0x7f0000000180)='./file0\x00', 0x2000, 0x202)
__getfh30(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80085762, &(0x7f0000000080))


r0 = socket$unix(0x1, 0x2, 0x0)
bind$unix(r0, &(0x7f0000000100)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
r1 = socket$unix(0x1, 0x2, 0x0)
swapctl$SWAP_ON(0x2, 0x0, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r2 = socket(0x1, 0x2, 0x0)
ioctl$FIONREAD(r2, 0xc0106914, &(0x7f0000000080))
connect$unix(r1, &(0x7f0000000180)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x83fe})
r0 = socket(0x18, 0x400000002, 0x0)
r1 = socket(0x18, 0x2, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
dup2(r1, r0)
getsockopt(r0, 0x29, 0x2c, 0x0, 0x0)


_ksem_init(0x0, &(0x7f00000006c0)=<r0=>0x50535244)
_ksem_getvalue(r0, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
close(r0)
r1 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r1, 0x0, 0x200000000000a, &(0x7f0000000040)='\x00', 0x1)
setsockopt$inet_opts(r0, 0x0, 0x9, &(0x7f0000000100)="ea000001", 0x4)


mprotect(&(0x7f0000ffc000/0x4000)=nil, 0x4000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f00000003c0)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0/file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
chflags(&(0x7f00000003c0)='./file0\x00', 0x5)
mkdir(&(0x7f00000000c0)='./control\x00', 0x0)
rmdir(&(0x7f0000000040)='./control\x00')


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
compat_50_select(0x0, 0x0, 0x0, 0x0, &(0x7f0000000100))


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0))
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0xd, r0, &(0x7f0000000240), 0x8)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__posix_rename(&(0x7f0000000100)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
__getdents30(r0, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xb}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open(&(0x7f0000000000)='./file0\x00', 0x78e, 0x0)


open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x3, 0x10, r0, 0x0, 0x0)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x80000000002)
__clock_settime50(0x0, &(0x7f0000002c00))


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
writev(r1, &(0x7f0000000180)=[{&(0x7f0000000000)="9c", 0xffffff7d}], 0x1)
shutdown(r0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
compat_43_orecvmsg(r0, &(0x7f0000000240)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x2001)


mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x81c0, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x40001404, r0)
compat_43_ogethostid()


setreuid(0x0, 0xee01)
swapctl$SWAP_ON(0x1, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0x2e, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbdf13b9fd812eaa4e713048e69931929648", 0x14)


r0 = socket(0x1d, 0x40000003, 0x0)
shutdown(r0, 0x1)


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
getpeername$unix(r0, 0x0, 0x0)


r0 = socket$unix(0x1, 0x1, 0x0)
poll(&(0x7f0000000000)=[{r0, 0x4}], 0x1, 0x0)
shutdown(r0, 0x2)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
open(&(0x7f0000000180)='./file0\x00', 0x80000000000206, 0x0)


r0 = semget$private(0x0, 0x4, 0x0)
compat_50_____semctl13$IPC_STAT(r0, 0x0, 0x2, &(0x7f0000000040))


r0 = socket$inet(0x2, 0x2, 0x0)
recvmmsg(r0, &(0x7f0000001d80)={0x0}, 0x10, 0x0, &(0x7f0000001dc0))


munmap(&(0x7f0000002000/0x1000)=nil, 0x1000)
r0 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000003000/0x1000)=nil)
shmat(r0, &(0x7f0000002000/0x1000)=nil, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)


syz_emit_ethernet(0x2a, &(0x7f0000000000))
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
syz_emit_ethernet(0x138, 0x0)
writev(0xffffffffffffffff, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r0 = socket(0x2, 0x1, 0x0)
accept$inet(r0, &(0x7f0000000040), &(0x7f0000000080)=0xc)
connect$unix(r0, &(0x7f0000000000), 0x10)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0906935, &(0x7f0000000180))


r0 = semget$private(0x0, 0x4, 0x7a4)
semop(r0, &(0x7f0000000400), 0x0)
semop(r0, &(0x7f0000000440), 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000000)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f00000003c0)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', 0x0, 0x0, 0x0)
rename(0x0, 0x0)
__mount50(0x0, 0x0, 0x0, &(0x7f0000000540), 0x0)
open(0x0, 0x0, 0x0)
semctl$IPC_SET(r0, 0x0, 0x1, &(0x7f00000000c0)={{0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7}, 0xffffffffffffffff})
r1 = getgid()
____semctl50$GETALL(r0, 0x0, 0x6, &(0x7f00000000c0)=@buf=&(0x7f0000000080)={{0x0, r1, 0x0, 0x6, 0x2, 0x101, 0x5}, 0x800, 0x7, 0x0, 0x0})
mknod(0x0, 0x0, 0x0)
ioctl$FIONREAD(0xffffffffffffffff, 0x8020699d, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
__setitimer50(0x0, 0x0, 0x0)
socket(0x0, 0x0, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0x0, 0x0)
minherit(&(0x7f0000ffe000/0x2000)=nil, 0x2000, 0x0)


mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0xc0e99db6de761f86, 0x0)
open$dir(&(0x7f0000000240)='./file0\x00', 0x200000, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0xb, r0, 0x0, 0x8)


mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
pread(r0, 0x0, 0x0, 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_90_fstatvfs1(r0, &(0x7f0000000b80), 0x2)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
r1 = getsid(0x0)
ptrace(0xe, r1, 0x0, 0x0)


r0 = socket(0x2, 0x2, 0x0)
ioctl$FIONREAD(r0, 0x80206979, &(0x7f00000001c0))
modctl$MODCTL_UNLOAD(0x2, &(0x7f0000000180))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
pathconf(&(0x7f0000000080)='./file0\x00', 0x11)


r0 = socket(0x1f, 0x5, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80206975, &(0x7f0000000180)=0x8000000000000032)


compat_50_setitimer(0x0, &(0x7f0000000240)={{0xfffffffffffffca0}}, 0x0)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
__mount50(&(0x7f0000000440)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
lchown(0x0, 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)
truncate(&(0x7f0000000440)='./file0\x00', 0x0, 0x3e2b649f)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
open(0x0, 0x0, 0x0)
poll(0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x0, 0x0)
readv(r0, &(0x7f00000003c0)=[{&(0x7f0000000300)=""/53, 0x35}], 0x1)


modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000080)="f7f18c4b0d602d76648e1e31046b3d046bdf3bf31d62c7487d077681d6fcd0998d")
modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0xc1})
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0xffffffffffffffff, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='msdos\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000001c0))


mknod$loop(&(0x7f0000000000)='./file0\x00', 0x2000, 0x1)
rename(0x0, 0x0)
__clock_getres50(0x0, 0x0)
execve(&(0x7f0000000240)='./file0\x00', 0x0, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x4, r0, 0x0, 0x0)


openat$tprof(0xffffffffffffff9c, &(0x7f00000000c0), 0xbf00, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, r0)
compat_90_fhstatvfs1(0x0, 0x0, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x10800039, &(0x7f0000000140)="01")
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
fcntl$setown(r0, 0xf, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
ktrace(0x0, 0x0, 0x0, 0x0)
compat_30___stat13(0x0, 0x0)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x19, &(0x7f0000000080)="301dc649", 0x4)


open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)
truncate(&(0x7f0000000780)='./file0\x00', 0x0, 0x10001)
acct(&(0x7f0000000080)='./file0\x00')


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
ptrace(0x15, r0, 0x0, 0x0)


setsockopt$sock_timeval(0xffffffffffffffff, 0xffff, 0x0, &(0x7f00000000c0)={0x0, 0x1ff}, 0x10)
r0 = socket(0x18, 0x400000002, 0x0)
r1 = socket(0x18, 0x2, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f00000000c0), 0x1c, 0x0}, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x84000000000000})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x8)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r2 = dup2(r1, r0)
sendmsg$unix(r2, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
acct(&(0x7f0000000000)='./file0\x00')


r0 = open$dir(&(0x7f00000000c0)='.\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f0000000100)='./file0\x00', 0x0)
mknodat(r0, &(0x7f0000000580)='./file0/file1\x00', 0x8000, 0x0)
rmdir(&(0x7f0000000180)='./file0\x00')


swapctl$SWAP_CTL(0x5, 0x0, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x205, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x7, &(0x7f0000000140)={0x0, 0x0, 0xfff, 0x100000002})


modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000080)="f7f18c4b0d602d76648e1e31046b3d046bdf3bf31d62c7487d077681d6fcd0998d")
modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0xc1})
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0xffffffffffffffff, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000140)='./file0\x00', 0x0, 0x0)
fdatasync(r0)


munmap(&(0x7f0000000000/0x4000)=nil, 0x4000)
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x12, r0, 0x0, 0x8)


munmap(&(0x7f0000fec000/0x14000)=nil, 0x14000)
minherit(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0)


r0 = socket$unix(0x1, 0x1, 0x0)
getsockopt$sock_timeval(r0, 0xffff, 0x1007, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_50_setitimer(0x1, &(0x7f0000000000)={{}, {0x0, 0x613}}, 0x0)


r0 = _lwp_self()
r1 = _lwp_self()
_lwp_unpark_all(&(0x7f0000000000)=[r0, r1], 0x2, 0x0)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0xc010447d, &(0x7f0000000040))


rasctl(0x0, 0x9, 0x0)
rasctl(0x0, 0x9, 0x1)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x7, r0, 0x0, 0x88)


modctl$MODCTL_LOAD(0x5, 0x0)
munlock(&(0x7f0000ffb000/0x3000)=nil, 0x3000)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8020426c, &(0x7f0000000140))


sendmsg(0xffffffffffffffff, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000080)=ANY=[@ANYRES8], 0x3e}, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x200000000000b, &(0x7f0000000080)='\x00', 0x1)


writev(0xffffffffffffffff, 0x0, 0x0)
compat_50_setitimer(0x1, &(0x7f0000000000)={{}, {0x0, 0x613}}, 0x0)
__setitimer50(0x1, &(0x7f0000000140)={{}, {0x3f0000000}}, 0x0)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={0x0, 0xb}})
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104302, &(0x7f00000001c0)=0x20000002)


r0 = socket$inet6(0x18, 0x2, 0x0)
getsockopt(r0, 0x29, 0x1b, 0x0, 0x0)


r0 = socket(0x18, 0x1, 0x0)
pwritev(r0, &(0x7f0000000140)=[{0x0}], 0x1, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x24, &(0x7f0000000000)="5ab7776a", 0x4)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
lchmod(&(0x7f0000000180)='./file0\x00', 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
chroot(&(0x7f0000000000)='.\x00')
__mount50(0x0, &(0x7f00000003c0)='./file0/file0/..\x00', 0x0, 0x0, 0x0)


mknod(&(0x7f0000000040)='./file0\x00', 0x201a, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x20003101, 0x0)
unlink(&(0x7f0000000000)='./file0\x00')


open(&(0x7f0000000100)='./file0\x00', 0x200, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
ktrace(&(0x7f00000000c0)='./file0\x00', 0x0, 0x30a, 0xffffffffffffffff)


swapctl$SWAP_STATS(0xa, &(0x7f0000000c40)={0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, 0xffff)


ioctl$WSMUXIO_REMOVE_DEVICE(0xffffffffffffffff, 0x80085762, &(0x7f0000000040)={0x1})
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getsockopt$sock_int(r0, 0xffff, 0x0, 0xfffffffffffffffe, &(0x7f0000000040)=0x18)
syz_emit_ethernet(0xe, &(0x7f0000000000))
writev(0xffffffffffffffff, &(0x7f0000000580)=[{&(0x7f0000000000)="b886b4e47f", 0x5}], 0x1)
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r1 = socket(0x2, 0x1, 0x0)
bind(r1, &(0x7f0000000000), 0x10)
setsockopt(r1, 0x6, 0x8, &(0x7f00000001c0)="fcda85f8", 0x4)
listen(r1, 0x0)
r2 = socket(0x2, 0x1, 0x0)
connect$unix(r2, &(0x7f0000000000), 0x10)
sendto(r2, &(0x7f0000000200)="ba147c4d085d4eb6f08a7f199468fd503c0de8476f816ef6d241aa9d749c498fe61545221ca92c3647264cd3ccd784302cd0188ca9bc6fa88f0f916d7f7fd927d2a7f51ed1376c60fa98cd3552f348c540eb4793deec40423f49bdd41990a4fdbd900d1c5a17528909da2b49388f80bd1d3f381dcb786d4374a4ef992144de80e422dd53cc8b05b756c3583fe15326447b94032c95ee03aa7e14d0527b370a3b5f772be2414be8e3698505ec3e2e1b6dd5428820ab0e4dca16542f06ea14c15a925f427eee39bcf43c405eac57a342940816e326190649f846f2b447", 0xdc, 0xc, &(0x7f00000000c0), 0xa)
sendto$inet(r2, &(0x7f0000000100)="18", 0xffffff36, 0x195a05e282d6161, 0x0, 0x0)
getpeername$unix(0xffffffffffffffff, 0x0, 0x0)
execve(0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r3 = socket(0x18, 0x2, 0x0)
setsockopt(r3, 0x1000000000029, 0xb, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f00000001c0)="01")
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f00000000c0)='./file0\x00', 0x5)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x1)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
compat_30_fhstat(0x0, 0x0)


__mount50(&(0x7f0000000c00)='kernfs\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', 0x0, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
mknod(&(0x7f0000000000)='./file1\x00', 0x0, 0xa718)


r0 = open(&(0x7f0000000000)='.\x00', 0x0, 0x0)
setuid(0xee01)
mknodat(r0, &(0x7f00000003c0)='./file0\x00', 0x0, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
__lstat50(&(0x7f0000000000)='./file0\x00', &(0x7f0000000080))
r0 = socket(0x800000018, 0x3, 0x0)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


socketpair(0x10, 0x2, 0x0, &(0x7f0000004bc0))


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, &(0x7f0000000080), 0x0, &(0x7f00000000c0))


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
writev(r0, &(0x7f0000000340)=[{&(0x7f0000000000), 0x2cfea}], 0x1000000000000013)


syz_emit_ethernet(0xe, &(0x7f0000000000))
writev(0xffffffffffffffff, &(0x7f0000000580)=[{&(0x7f0000000000)="b886b4e47f", 0x5}], 0x1)
syz_emit_ethernet(0x138, &(0x7f0000000000))
r0 = socket$inet(0x2, 0x5, 0x1)
sendto$inet(r0, &(0x7f0000000080), 0x0, 0x1, &(0x7f00000000c0)={0x2, 0x0}, 0xc)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r1 = socket(0x18, 0x3, 0x0)
setsockopt(r1, 0x1000000029, 0x25, &(0x7f0000000280)="5ab7776a", 0x4)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
syz_extract_tcp_res(&(0x7f0000000040), 0x4, 0x800)
r2 = socket(0x2, 0x1, 0x0)
bind(r2, &(0x7f0000000000), 0x10)
dup(r2)
r3 = socket(0x2, 0x1, 0x0)
connect$inet(r3, &(0x7f0000000000), 0x10)
shutdown(r3, 0x2)
close(r3)


setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, &(0x7f0000000000)=0x9455, 0x4)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__select50(0x190, &(0x7f0000000000), 0x0, 0x0, 0x0)


r0 = openat(0xffffffffffffffff, &(0x7f0000000100)='./file0/file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x800, 0x0)
__utimes50(0x0, &(0x7f0000000280))
r1 = msgget$private(0x0, 0x0)
msgrcv(r1, 0x0, 0x0, 0x2, 0x1000)
getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000000)={0x0, <r2=>0x0, <r3=>0x0}, &(0x7f0000000040)=0xc)
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f0000000080)={<r4=>0x0, 0x0, <r5=>0x0}, &(0x7f0000000640)=0xe)
getpid()
msgctl$IPC_SET(r1, 0x11, &(0x7f0000000180)={{0x7, 0x0, r3, r2, r5, 0x1b, 0x2}, 0x7, 0x0, r4, r4, 0x4, 0x20000000000fc9, 0x3})
pipe(&(0x7f0000000340)={<r6=>0xffffffffffffffff, <r7=>0xffffffffffffffff})
chmod(0x0, 0x200)
recvfrom$unix(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
mkdirat(0xffffffffffffffff, 0x0, 0x0)
r8 = open$dir(&(0x7f0000000000)='./file1\x00', 0x0, 0x0)
mkdirat(r8, &(0x7f00000000c0)='.\x00', 0x0)
fktrace(r6, 0x0, 0x0, 0x0)
accept$unix(r7, &(0x7f00000003c0)=@file={0x0, ""/515}, &(0x7f0000000600)=0x205)


open(&(0x7f0000000140)='./file0\x00', 0x78e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
semget$private(0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
__stat50(0x0, 0x0)


poll(0x0, 0x0, 0x0)
r0 = socket(0x18, 0x1, 0x0)
compat_43_ogetsockname(r0, &(0x7f00000001c0)=""/125, 0xffffffffffffffff)


r0 = socket$inet6(0x18, 0x3, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x200, &(0x7f0000000000), 0x4)


munmap(&(0x7f0000fec000/0x14000)=nil, 0x14000)
r0 = shmget$private(0x0, 0x4000, 0x0, &(0x7f000055b000/0x4000)=nil)
r1 = shmat(r0, &(0x7f0000ff5000/0x4000)=nil, 0x0)
__clone(0x0, 0x0)
shmdt(r1)


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80047466, &(0x7f0000000100)=0x5)
poll(&(0x7f0000000040)=[{r0, 0x80}], 0x1, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x8010427a, &(0x7f0000000080))


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x26, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)


mknod(&(0x7f0000000280)='./file0\x00', 0x1ffa, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x804, 0x0)
fcntl$setown(r0, 0x4, 0x0)


mkdirat(0xffffffffffffff9c, &(0x7f0000002040)='./file0\x00', 0x0)
lchown(&(0x7f0000000140)='./file0\x00', 0x0, 0xee01)


truncate(0x0, 0x0, 0x0)
compat_50_getrusage(0x0, 0x0)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180))
mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x1020, 0x0)
undelete(0x0)
compat_50_quotactl(&(0x7f0000000000)='./file0\x00', 0x10001, 0x0, &(0x7f0000000080))


mknod(&(0x7f0000000180)='./file0\x00', 0x2000, 0x202)
mknod(&(0x7f0000000200)='./bus\x00', 0x6000, 0x202)
ktrace(&(0x7f0000000340)='./bus\x00', 0x0, 0x0, 0x0)
ktrace(&(0x7f0000000280)='./file0\x00', 0x0, 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
msgctl$IPC_STAT(0x0, 0x2, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x83fe})
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
fcntl$dupfd(0xffffffffffffffff, 0x0, 0xffffffffffffff9c)
close(0xffffffffffffffff)
setsockopt(r0, 0x1000000029, 0x3e, &(0x7f0000000000)="674cd6e5", 0x4)
writev(r0, &(0x7f0000000080)=[{0x0}], 0x1)


__clock_getres50(0x0, &(0x7f0000000000))
modctl$MODCTL_UNLOAD(0x4, &(0x7f0000000000))


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000480)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unmount(&(0x7f0000000000)='./file0\x00', 0x0)


r0 = socket(0x2, 0x1, 0x0)
listen(r0, 0x0)
accept$inet(r0, 0x0, 0x0)
close(r0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)


r0 = socket(0x18, 0x400000002, 0x0)
getsockopt(r0, 0x29, 0x18, 0x0, 0x0)


r0 = socket(0x1f, 0x40000003, 0x0)
compat_43_orecvfrom(r0, 0x0, 0x0, 0x2009, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
compat_50_mknod(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__select50(0x40, &(0x7f0000000000), 0x0, 0x0, 0x0)


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
pread(r0, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000740d, 0x0)


setregid(0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmsg$unix(r0, &(0x7f00000000c0)={0x0, 0xfffffffffffffdb8, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="31000000ffff000001"], 0x28}, 0x0)
r1 = msgget$private(0x0, 0x0)
msgrcv(r1, &(0x7f00000003c0)={0x0, ""/250}, 0x102, 0x2, 0x1000)
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000240)={0x7fffffff, 0x0, {0x0, 0x4}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r2 = socket(0x18, 0x1, 0x0)
setsockopt(r2, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r2, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r2, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f00000001c0)={0x0, <r3=>0x0}, &(0x7f0000000080)=0xc)
r4 = open(&(0x7f0000000040)='./file0\x00', 0x200, 0x0)
r5 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
writev(r5, &(0x7f0000000340)=[{&(0x7f0000000000), 0x2cfea}], 0x1000000000000013)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x0, 0x10, r4, 0x0, 0x0)
msgctl$IPC_SET(r1, 0x1, &(0x7f00000000c0)={{0x2, 0x0, 0x0, r3}, 0xbd, 0x0, 0x0, 0x0, 0x40})


compat_43_osetrlimit(0x9, &(0x7f0000000080))
socket(0x2, 0x2, 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
poll(&(0x7f0000000080)=[{r0, 0x517ec557131b40ac}], 0x1, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80104305, &(0x7f00000001c0))


swapctl$SWAP_STATS(0xa, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, 0x401800c)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r0 = open(&(0x7f0000000080)='./file0\x00', 0x70e, 0x0)
r1 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
r2 = socket$inet(0x2, 0x2, 0x0)
__fstat50(r2, &(0x7f0000000000))
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x10, r1, 0x0, 0x0)
writev(r0, &(0x7f0000000400)=[{0x0}, {0x0}, {0x0}, {0x0}, {0x0}, {0x0}, {0x0}, {0x0}], 0x8)
socket$inet6(0x18, 0x4, 0x1)
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
lchown(0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', 0x0, 0x0, 0x0)
paccept(0xffffffffffffffff, 0x0, 0x0, 0x0)
madvise(&(0x7f0000001000/0x4000)=nil, 0x4000, 0x3)


r0 = socket$unix(0x1, 0x5, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x20, &(0x7f0000000040), &(0x7f0000000100)=0x4)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180)=0x8000000000000032)
__fhstat50(&(0x7f0000000180)="eb01b685c8f859535d508f86fd7d537dd3df", 0x12, 0x0)
r1 = socket(0x18, 0x3, 0x0)
compat_50___msgctl13$IPC_SET(0x0, 0x1, &(0x7f00000001c0)={{0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x0})
ioctl$FIOSEEKHOLE(r1, 0xc118691d, &(0x7f0000000180)=0x8000000000000032)


mkdir(&(0x7f0000000040)='./file2\x00', 0x0)
mkdir(&(0x7f0000000300)='./file2/file0\x00', 0x0)
rename(&(0x7f00000002c0)='./file2/file0\x00', &(0x7f0000000340)='./file2\x00')


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000100), 0x0)
mkdir(&(0x7f00000000c0)='./control\x00', 0x0)
rmdir(&(0x7f0000000040)='./control\x00')


r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x4ebfac6bbaf7949)
writev(r0, &(0x7f00000002c0)=[{&(0x7f0000000300)='#!', 0x2}], 0x1)
write(r0, &(0x7f0000000080)='!', 0x1)
writev(r0, &(0x7f00000000c0)=[{&(0x7f0000000040)=' ', 0x1}], 0x1)
writev(r0, &(0x7f0000000440)=[{&(0x7f0000000180)="0000e80a", 0x4}], 0x1)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x80000000000206, 0x0)
read(r0, &(0x7f00000002c0)=""/82, 0x52)


mknod$loop(&(0x7f0000000000)='./file0\x00', 0x6000, 0x0)
swapctl$SWAP_ON(0x7, &(0x7f0000000000), 0x0)


r0 = socket(0x18, 0x3, 0x6)
compat_43_orecvmsg(0xffffffffffffffff, 0x0, 0x0)
getsockopt(r0, 0x0, 0x0, 0x0, 0x0)


r0 = open(&(0x7f0000000080)='./file0\x00', 0x200, 0x0)
connect$unix(r0, &(0x7f0000000140)=@file={0x0, './bus\x00'}, 0x8)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
open(0x0, 0x0, 0x0)
open$dir(0x0, 0x0, 0x0)
ftruncate(0xffffffffffffffff, 0x0, 0x0)
r0 = open$dir(&(0x7f0000000180)='./file0\x00', 0x2, 0x0)
pwrite(r0, &(0x7f0000000300)='_', 0x1, 0x8001)
preadv(r0, &(0x7f00000012c0)=[{&(0x7f00000001c0)=""/224, 0xfffffdd5}], 0x1, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)="82", 0x1)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)


compat_50_utimes(0x0, 0xffffffffffffffff)


compat_90_getvfsstat(&(0x7f0000001b00), 0x8d0, 0x6)


ktrace(0x0, 0x4, 0xd27d43220c7df9b, 0x0)
_ksem_init(0x0, &(0x7f0000000000)=<r0=>0x50535244)
_ksem_post(r0)
_ksem_trywait(r0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f00000000c0))
unmount(&(0x7f0000000000)='./file0\x00', 0x0)


madvise(&(0x7f0000ffb000/0x3000)=nil, 0xffffffffdf004fff, 0x0)


r0 = socket(0x11, 0x3, 0x0)
r1 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
open(&(0x7f0000000300)='./file0\x00', 0x11, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x10, r1, 0x0, 0x0)
execve(0x0, 0x0, 0x0)
sendto$unix(r0, &(0x7f00000000c0)="b10005166000009f0000000000070000001c130500000000fef96ecfc72fd3357ae380b37b673039d2d236acf60b7804be78164991f7c8cf5f882b297be1aa5b23edeb51e2f0ac3ebbc215000000eeffffff028ea8af630037282102000000720fd38bfbb770c1f572c881ea772ec592040000000000ff0c2300008abfba0900000008e371a3f8343712051eeab71d89e0442c5e52000080042000"/177, 0xb1, 0x0, 0x0, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040))
r2 = msgget$private(0x0, 0x2)
msgsnd(r2, &(0x7f0000000180)=ANY=[], 0x401, 0x0)
ktrace(&(0x7f0000000000)='./bus\x00', 0x1, 0x100, 0x0)
msgrcv(r2, &(0x7f0000000140), 0xfffffffffffffecf, 0x0, 0x0)
msgsnd(r2, &(0x7f0000000000)=ANY=[@ANYBLOB="0200000000000000210cd4dec2cb1d8bf2a68ffcc8ba3c196d6e5768fd697e85cd8d3452aa26c6fcce0d0db6597b2217de42ab712270bc1a3c70036f2fbc2b92a5fd9a0754570a9e1ddcf2be337a9a6269b3b86c2e3e014e63570f4610ea80dedf00625fbb6ff00eedb8127c3e75aa1444c88f94adca1487c665772150"], 0x83, 0x0)


socket$unix(0x1, 0x0, 0x0)
getsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x20, 0x0, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000400)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
write(r0, &(0x7f0000001180)="135ef812d903627da2706a90b8f972bb28b89c03956f2cd30c6416ff2c2f3c7dd141ffe5d731bcd4e88ea996cebe6d6e551603d0371e9143a1c2fea094d224d4f1fe742fef31b37d6f705623926ff627ee248efeab5b920fc21f5ef142d50d245ea26c22dbf0f9da5837330cabe6de1dbe4802529f20642f8c14db6a2bb4683b33112db2", 0x84)
getsockopt$sock_int(r0, 0xffff, 0x100, &(0x7f0000000cc0), &(0x7f0000000d00)=0x4)
r1 = msgget$private(0x0, 0x114)
msgsnd(r1, &(0x7f0000000140)={0x0, "191dc9ee04d0b1c028f4a310f6ac87395acbd304febabc380ebab272e097e6b0cc01fbe1c32b92a91fb06301cf6692924a6c21de91af6430826854759378d91b4062ecd7da472a3f6ec43123dd1069c701a77c96b32acf9c055507d790119a6f0886e78ff9818809d9422ffa72bec10970e2de065fa352bb4cf26639610ba6c9c8bedcf70b8dafaf528d20c61c469e573b39013287b39b5e7d98527d47bf4ed21d2e46e3995388f25c2ec95a8c2e62f2b836d63d89e8122ba193c384c73ecf7ca8fc0fdfdc6367edc3f868c4d3c73363434a9fee48253818cd7a72018ed9b95099cd54542355d41ab10bda1dc2d2b8530a8e39ed67a9fbcf048b30444a18673769dea8540d230532a51190cdbfae67571d6f98a7cb7e99c25b9063e0950a0d0fc33211ec9b8a7749bab367394b8efdc45f182a0ad15e6b6b95a810603bbb1e848e0a43a08bf6b221073f8028a5dfeac917db30d9a7611cee395ec4effdb8fe2ab87ba13e9677c87c3f50ebdd44460cfaa4b0f15470f1ceb7dec5b3274a070ba330577d6f6ef046aa4735cfdef53679a82551a39c894fd0d39d5b289dd9bfc4c610fcf94f4f5cac897e2f5b68f92b51e6b165de9edd1b7fa3c83218e4b3beeb2c880c2949b62e5a9fcf2fc405e5ace98f6db61604808d556c7b841b99896cf33598b4ef8d69e33b2f502079205bccce15c6bd8b3ae6f2e1fe20550bbafb5cdc4f6503d85773921a9c27ac6296ffacf7a01c5156ea25a7899c519a840595619d76ee15db2407096cc5b3bba86544f5b3d2ff7394e39b999de369214ce6a49bea356a634be58ca6083957954f6aa876abd21409717e200b109be34fa6279558ac03e7e0c935aa592145e3dc6fbb9c84f159f68c80a0f853b0d737f2c2cc752af7d9fc9f91b47f029bed6741c6828ddc99550f52350f1fd321c9139cab6257ce51b90a12297de3b38e3e98ce7f06ae4386c56d8d93ac7a1ef8b2df8507ec7a939a596d98f4d720662458e5f72d8aa274c85b0e671288ec09eaa691366825118b75b14e49ad21912f3ca8fa9b4a37e027c42f6e99e677410542b8c7a39dc1e0aa9fec430da0258e6fe43e2ce9b098e85d4e94863b41614085bdbb2feb312ad4ce289f9d2b31334e8c944d94b95e57cdc9c5e0c18e04244a3f223a4025bc373ffe454ca053c8b8386fb34fee433e0fb95deff4c4a351b38e87750dc588da99820fad14b0bd5d7ea2263eb6ac731c66d6fcafc9737f8895b4431b7dd546bd05c5aa94b58c08781b2b6f9d6a8b9e2c0e5a72ef56f226aa6312eb6d309678a2013eac60286053f2c005d1f4290893c55041429769943bebd0dfffa2c89d5dbe7cbead6d3f410fcb2a24439a3802cb634c33868e3f0a5eec65c9ee1d021d991f7792ad17f8b28cd7817933b431dc60155e5c8b349b02f1e637706103d904223923f7a4eed52c68e936d3ed8fc0e8230b97ec3191505122ab7d410d12ac7044cd6356d48ad7a1af84f8909a258deaf4f116d6be5e9655b32fb1c0a3f80fd1b053e7c3c1fe5d94b01b6e08856e735bba4618030d130c0a2c8eafaa1a6dd4a4f9fe3a6b95ad54f79cfbc445a9239db9271fd94579b0c96607a782252bc290503179b2c773729edf4fa48f9f2009cf387eac02fa787c942920eb14b5c853068bd785d0aa44a859ba3d0c36091b25c5c6e5e24a78e97e7531d1d213cc166cd778afbb5aa8ee467f495e70962a014486a50c185b01b00764ccc1afc47dd41ad12ac597d8772a608adf9b126c6864dc8544e41bad532c03274ae89bf6549afcc7ce87c36c2cae443fad2b6fc9eaff9f9ca88328e8667d6a5b62bf146454cb9f6f45ef0f852a878bffe945facce36566f8a49ece70d71e2f993fb43871017a41b6c4531a03a14c6face003d5c1e0fd4cf3ab1ce8330f88f06df77b285c734a3e3617f97b26c2a2fa5719ab521c1e8e5899bd5491e797ba2a339039fe3b4a03645389ea51f4a76e194df5973af60fb9785749ae88467b8d53332ce373efcfc15b49f4f5b5ec003fc51de8f1d924836917414acae63e2e99386cfd0bf09bbd7e28ea4ea07e655926fd2f328f5074390bcc194b9515925658b8b9fed1f324d9457e1acd6211dd059665936d44ddda341b3ea57057d4f14b668052b189ff2558d59572fde6e6595ab454637b3da9a1c749c3045b93d805aec317d97a99fc4ec98dc83df96700e795d4bdab6b2de939a7ef04a23aa83b420cb9c7965dd6aac3ba01678491c316611eef17e2251636b1a4b6ab1d17d9b1768fd6e8729ec63b4522e4d75016e540dfd7fd5d1dd5e1b1fb4a030a26592390685854f34b79c528932a303ac9dd8bb8e8bb2191cce215318ed66f914f477b0128b206046f012548fb9a08c94a079c954c077aeae8e447517c743e35b6043a0db9fa144bcf4976a0833ef03dbc3b480452d4dbed3009f9cf3644e11ce2126d7ac4df09ef6337da1275a0fb9b1c4ac547dd06c9f79
708a2619245e315b69979650c56822b8fb4585893a38721ca7df5277e35f3c03a8fbdbf08407e9b5b58e0305f43d82986354d5076b734203cb416ed81f3f4426038ea2f264f0d136d4a12c202979c39896d1af1670dc2c12f68d181b36c6c43a632704212279786ab69ad716a8d211762cbc10c8a4ca07e7aa5db46dd24d315959e0d8f2545c2807fd43ecda65c4159bfddea4506c7ea72501668a9baefe453dc9073910cf89029de67acfc8f07ebbad0811a873308172656a3617ea27422b2e473d6e629f8e6fb7566efd0a5d5f8e7e2630da4c5c6b3744caf7f61d05cf46b657a9b92be11968afb11cd2f9b0cfe0e7e690735193aa935a5cd64612133f957e5fb71d0222f1853e5cad30cc36a5afd237a7c923154a12d09ab4f65654b1b9f13808d7ddcfcbbc86b94633455186e60be4680587d876969eafcd7458a8541085c1685c18981834e6cd1c9a944daacae41322b0f096cce28e0d05bf0c799fd30a6ce4bf525824c6831863cc3cc19edbff43031969be8bc0a83924f07cb1ee284929f7462d4d410a5d90d7de0f5a13bad8f2f9b5c58b30dff7ae7d426637b1aedc4edc723ecf5cb8cacf852abb573b67ab1b23b364c0e6015baa27d0e1a7f1fcee0e7b7cc2ce9b4343e2f55d80d795c9551abae1b080dcf37115aa6cea7a23215bdc31508efc8d1dace6f40e7b9244f3fe52aa32fbdef4c5f6887c9b5eabd9ecbf1939a960b88cbf97d20ce8c14283869580a436c6a2af8625a826020143e5542e26bccf8066f89573144b9085f7a97ebe9892010bbcd397703b498b60831059f69b815b79c5ea34c7b58769b259475a53f3628fa61851c9f983219a6af0ed143b7e494be54ed8184c15a3567464ac48ce4fe27de98aca417e527aebbf9c0d9284fb2ef39d71a84186264c2ae4aa44869bae0efea1908f44c59f0faffd4eae49ce57a5fc2ac66c67a498ca9d6bd00b4a35adfdf2e30aeb7fbf57e7a051722b75dac21f54e5ee3940c7e6714e79c3b968199a1f34218cc672ed9197633fce18ef2335e3bd461cb8e4f0b50274d2be9fcf2f3a7081bd0d93a22f13618fad79e782cd059bcb0274e182bc7158359bdd5786f4c0c1bcd9f23af219cdf9f710136da1ddd2ef06c6550266dc9495205f50c2537a5ef1091bfbae4d607b0a6896554e6fd65851cdf2252573489d1e334c3073f840871ed2ff146944919dfb998cf1eeb5aa0b1074b2a7ad8016a6c3edd11090bf4be3376c512f51a8b3011c09bb362e9d70c727c03da53750fab495c04d36820986ecf0a6dc040836f143aaa4c348baca47f3fa2796b9d56611e23993fd9809826cf118a692f7a8fa74ba57c7170ba305a72a5d404b17d16e8813242461f05e2ac8dddb791371aed189ecd6edf067fc029200d04c8162972413c57fa6bd4388f1e22cfbb94886eca0836df778ac945d7c1dfbacaf5119c66e71b43e2ce6e8e1829de0b9008be8863d3ce44239dd8dca3585603d847b75457ed51f1b7adea5ecb2bc88c4af97a348738c86f18faa82822e3fabbf8ff599194b246489860aeb593cb2709b4d9cf6fb2c0bb5287cef261d23118a307f9b8bd3f116ae76ccdc007884464e31e717522dd3283cb8d521b664bee608a2c7d349c4d4cd0e91920154b4dd2602d8337d013cb79c344ff6435d9078a508487a7cb648cfec78bd6cfe0a930ebdba83a04fe883def9338ca484dabc34f0a63cfde56b1ff5001800dc3bc3447155999836db15594fc102fed3b06f92d315e7d178eabbc32507c617cb9834ee87a0910f81ed10b75cc323657ce55b9d57599829167fa4fe91495e0b6348b7de931a0bee8fba3c8975b659549f9f126ebc44bd45aeeec5294e13ae22cbfd96f5d3dfa4a45f1131f833107c724b48f7950ff8a0361acc408849389efbd43e3a51a158a03d6c8d894b385e95f426ffe26e16325e4b9db27e79a79d53fab44a46ad11e3d3370a71c67dfdbf00a596e049b246572555083c29c90a77cd6d1825f419ef1dd701ceb37187355fd76737f74fb48b3577d0ca58a2848544aa778c26c755d0ffb3eb378ed60aea6de8123a5eecc03d19ce2f3647ed78aa75cd215dc5158a082838129e2d4713c2f61c21bc126a50b5487d59c5ec026653f456c33c1a327919b0168ead8cf2e2b478a26adb0452ff368baab8964bb286c94f0cc76314eede0a48623ebc0e189f634b9125226efbbd85bd775c39a78bab1198da7db68ffd3d278a203659208da6b32afa9c1323f99987e4bff135c1b8660bf890132f9ed1e9773ce4597e91a8f11bb72007a8689ba66fad054f8e8643e7bcec9cb2eb1fd1650c06fb5ec78638bba68bef9dc842148c32822167a28fb745530cda7b744cccfc7edf1bd4fcbb84ad5581694d079ab822498acd66f757dcccbba9d95d6213407c770bc0c56db143ce804a7d1756b243ea676b303376b352c71188e4b4ee4ac761067b4ba5369875c090d72e92530429d46000daffe5e7f22be0c187be2310c73786c810b
0d6d728ca0288445d1699baab1e11f880fb63c5c2df3d260f58e59761bce11f4af88ecf81700ccba14c3dc123dea0856024271e0daf8b5c7d50e5b36d132f45be275ef0cb6c1487be3e2612c0530f6b992114f3b2be7932c6345812714a28e431de467915393f6e1cf87b3c29330f55fad923804a3a69c2935050472ec06b9b8509cb0fc9ae06412c520d7e9db9c94caa3d9cbf86cf54aebb9b6f37f85be7368088e477ea65cf8f25c47bd1dcf52f0526913d43cdd5e48d4d7b41fc698ae9c829e480a3152b419e005d448156bec5476e87ec3afb6be4352161ad5f55de4a923ab54d315086122e7b9537fd1db83bd053d1606180386e4436c8471e08e2471299b3677f278bb838bfb562d5527d0ea8eb19896bae0ead0fd083ddef9baad1fd300fd16eb020b74f606e35b5d47b7b004592e5f76a074d3e196661c7d95c16b3238d7b4ae103cbad188fbb4668a580891b9a39b544cfc2e7c3c860af2c80699e09162407b82b60d41dda8757490ce4e18a8d90cd425435f6c121e7593671a54e0208e1c2e395db8ef5024c1dccdc47c7c53c856621527883f7ac4f3a9617ea0fa5cabb3362b1c9f0c1df84f8cfdd0bd"}, 0xf88, 0x0)


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40047463, 0x0)


symlink(&(0x7f0000000080)='.\x00', 0x0)
__clone(0x0, 0x0)
minherit(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x0)
__clone(0x0, 0x0)


setrlimit(0x0, &(0x7f0000000980))
setrlimit(0x2, &(0x7f0000000240))


pipe(&(0x7f00000002c0)={<r0=>0xffffffffffffffff})
fchdir(0xffffffffffffffff)
socketpair$unix(0x1, 0x0, 0x0, 0x0)
__msync13(&(0x7f0000ffc000/0x2000)=nil, 0x0, 0x0)
shmget(0x3, 0x3000, 0x0, &(0x7f0000ffb000/0x3000)=nil)
ioctl$WSKBDIO_GETMAP(0xffffffffffffffff, 0xc010570d, 0x0)
fchroot(r0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
ptrace(0x19, r0, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
mincore(&(0x7f00002ad000/0x1000)=nil, 0x1000, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
syz_usb_connect$hid(0x0, 0x36, &(0x7f00000001c0)={{0x12, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x40, 0x1, 0x2, 0x3, 0x1, [{{0x9, 0x2, 0x24, 0x1, 0x1, 0x0, 0x0, 0x0, [{{0x9, 0x4, 0x0, 0x0, 0x0, 0x3, 0x1, 0x0, 0x0, {0x46}}}]}}]}}, 0x0)
poll(&(0x7f0000000000)=[{}], 0x20000000000000fe, 0x0)


compat_40_mount(0x0, 0x0, 0x0, 0x0)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x40)
writev(r0, &(0x7f00000000c0)=[{&(0x7f0000000280)='#!', 0x10}, {&(0x7f0000000000)="8d6bb85551ec8430877ae32fe9bbe42cc8f2147a3eba8e1969f0435119cf4c071c8aee7ef2921be5d7d4796c5566c95989acb3d185587234186e96b8fde9ffac51de05a87b8b893e2abd154dd886eafbe03881d25b7b13b4c32227fc9e5a86a06f59f701322b3a109a13436e486b0a", 0x6f}], 0x2)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x30, &(0x7f0000000040)='\x00', 0x1)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x1})
semctl$SETALL(0x0, 0x0, 0x9, &(0x7f0000000040)=[0x7ff])
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x1}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000100)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0x0, 0x0, 0x0, 0x6c}})
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, 0x0, 0x0)
mknod(0x0, 0x0, 0x0)
open(0x0, 0x0, 0x0)
poll(&(0x7f0000000000)=[{}], 0x20000000000000fe, 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x5, &(0x7f0000000a00)="8b589d9d", 0x4)


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
compat_40_mount(0x0, 0x0, 0x0, 0x0)
compat_43_osethostname(0x0, 0x0)
socket(0x0, 0x0, 0x0)
unlink(0x0)
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
open(&(0x7f0000000180)='./file0\x00', 0x80000000000206, 0x0)
open(&(0x7f00000002c0)='./file0\x00', 0x690, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_30_fhstatvfs1(&(0x7f00000000c0)={{}, {0x0, 0x0, "4680fc451d3d9d414598d512d114090f"}}, 0x0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x4090426b, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
ptrace(0x11, r0, 0x0, 0x0)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
__stat50(&(0x7f0000000440)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)
compat_20_statfs(&(0x7f0000000700)='./file0\x00', 0x0)


getpriority(0x6301e4eb1f933c6c, 0x0)


r0 = socket(0x800000018, 0x1, 0x0)
msgctl$IPC_SET(0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0xffffffffffffffff}})
r1 = socket(0x18, 0x2, 0x0)
close(r1)
r2 = socket(0x800000018, 0x1, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x40000000000})
bind$unix(r2, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


r0 = socket(0x18, 0x3, 0x0)
compat_43_orecvmsg(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={0x0}, 0x10, 0x0}, 0x0)
close(r0)


r0 = socket$inet(0x2, 0x3, 0x0)
getsockopt$inet_opts(r0, 0x0, 0x3, 0x0, 0x0)


socket(0x0, 0x0, 0x0)
r0 = socket(0x2, 0x1, 0x0)
sendmsg(r0, &(0x7f00000011c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000e80)=ANY=[], 0x10}, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
fcntl$lock(r0, 0xa, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40043105, 0x0)
_ksem_init(0x0, &(0x7f0000000100)=<r1=>0x0)
_ksem_close(r1)


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', 0x0, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
ioctl$WSKBDIO_SETMAP(r0, 0x80145003, &(0x7f0000000000)={0x0, 0x0})


symlinkat(0x0, 0xffffffffffffffff, &(0x7f0000000140)='./file0/file0\x00')
mknod(&(0x7f0000001200)='./file0\x00', 0x2000, 0x400)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x2, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x10, 0x0)


msgctl$IPC_STAT(0x0, 0x2, 0x0)
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
chmod(&(0x7f0000000080)='./file0\x00', 0x2ea)
r0 = open(&(0x7f0000000080)='.\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f0000000100)='./file1\x00', 0x9f)
renameat(r0, &(0x7f0000000040)='./file1\x00', r0, &(0x7f0000000200)='./file0/file0\x00')
r1 = open(&(0x7f0000000080)='.\x00', 0x0, 0x0)
mkdirat(r1, &(0x7f0000000100)='./file1\x00', 0x9f)
renameat(r1, &(0x7f0000000040)='./file1\x00', r1, &(0x7f0000000200)='./file0/file0\x00')


ktrace(0x0, 0x0, 0x0, 0x0)
compat_43_ocreat(0x0, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xf, 0x0)
_ksem_timedwait(0x0, &(0x7f0000000000)={0x0, 0x80000001})


compat_40_mount(&(0x7f0000000200)='cd9660\x00', &(0x7f00000000c0)='.\x00', 0x0, 0x0)


r0 = _lwp_self()
_lwp_wait(0x0, 0x0)
_lwp_wakeup(r0)


mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
__clone(0x0, 0x0)
mmap(&(0x7f0000ffa000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket(0x18, 0x1, 0x0)
dup2(r0, r1)
setsockopt(r1, 0x0, 0x0, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x48087448, 0x0)


open$dir(0x0, 0x0, 0x0)
getpid()
compat_50_nanosleep(&(0x7f0000000040), 0x0)


mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
chdir(&(0x7f0000000240)='./file0\x00')
setreuid(0x0, 0xee01)
mkdir(&(0x7f0000000100)='./file0\x00', 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_50_setitimer(0x0, &(0x7f0000001800)={{}, {0x0, 0xf423f}}, 0x0)
compat_50_getitimer(0x2, &(0x7f0000000200))


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x40, &(0x7f0000000040)="00fb6c4f", 0x4)


r0 = socket(0x800000018, 0x1, 0x0)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7, 0x3}, 0x8)


open$dir(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/../file0\x00', 0x0, 0x0)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f0000000080), 0x1c, 0x0}, 0x0)
bind$unix(0xffffffffffffffff, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
close(r0)
socket(0x18, 0x2, 0x0)
sendmsg$unix(r0, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
fcntl$setstatus(r0, 0x4, 0x20044)


socket(0x2, 0x0, 0x0)
socket(0x0, 0x0, 0x0)
connect$unix(0xffffffffffffffff, 0x0, 0x0)
socket(0x0, 0x0, 0x0)
ktrace(0x0, 0x1, 0xd27d43220c7df9b, 0xffffffffffffffff)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
link(&(0x7f0000000940)='./file0\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000240)='./file0\x00', &(0x7f0000000640)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x5, 0x0)
bind$unix(r1, &(0x7f00000001c0)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xd)
getsockopt$sock_cred(0xffffffffffffffff, 0x1, 0x11, 0x0, 0x0)
connect$unix(r0, &(0x7f0000000040)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xd)


r0 = socket(0x18, 0x3, 0x0)
compat_43_orecvmsg(0xffffffffffffffff, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={0x0}, 0x10, 0x0}, 0x0)
close(r0)


chroot(&(0x7f0000000000)='.\x00')
chroot(&(0x7f0000000000)='.\x00')


semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000000)={{0x0, 0xffffffffffffffff}, 0x0, 0x0, 0x1})
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x1ff})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
syz_emit_ethernet(0x3e, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
close(r0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000000029, 0x9, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
setsockopt(r1, 0x1000000029, 0x2e, 0x0, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


setreuid(0x0, 0xee01)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x29, 0x16, &(0x7f0000000000)="02000000", 0x4)


compat_50_setitimer(0x300, &(0x7f0000001800), 0x0)


mlock(&(0x7f0000ffa000/0x4000)=nil, 0x4000)
munlock(&(0x7f0000ffb000/0x2000)=nil, 0x2000)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8004745c, &(0x7f00000001c0))


r0 = socket(0x2, 0x3, 0x0)
sendto$inet(r0, 0x0, 0x0, 0x0, &(0x7f0000002400)={0x2, 0x0}, 0xc)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0))


symlink(0x0, 0x0)
chmod(&(0x7f0000000080)='./file0\x00', 0x0)
unlink(0x0)
compat_50___shmctl13$IPC_SET(0x0, 0x1, 0x0)
mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
r0 = open(&(0x7f0000000240)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x80045729, &(0x7f0000000080))


r0 = compat_30_socket(0x22, 0x3, 0x0)
compat_43_osend(r0, &(0x7f0000000040)="08200203", 0x358, 0x0)
recvfrom$inet6(r0, &(0x7f0000000080)=""/138, 0x190, 0x4080, &(0x7f0000000000)={0x18, 0x3}, 0xc)


compat_40_mount(&(0x7f0000000200)='ptyfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000a80))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pathconf(&(0x7f0000000040)='./file0\x00', 0x7)


r0 = getpid()
setpriority(0x1, r0, 0x0)


r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r1, &(0x7f0000003000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
listen(r1, 0x0)
connect$unix(r0, &(0x7f0000000080)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
r2 = dup3(r1, r0, 0x0)
r3 = accept(r2, 0x0, 0x0)
fchmod(r3, 0x0)


r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0906911, &(0x7f0000000180))
setpriority(0x0, 0x4, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xb}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_30___lstat13(0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open$dir(&(0x7f0000000080)='.\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f0000000180)='./file1\x00', 0x0)
chdir(&(0x7f0000000040)='./file1\x00')
r1 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r1, 0x0, 0x200000000000c, &(0x7f0000000080)="eaef125c00000000", 0x8)
open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
r2 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
writev(r2, &(0x7f0000000340)=[{&(0x7f0000000000), 0x2cfea}], 0x1000000000000013)


_ksem_init(0x0, &(0x7f00000006c0)=<r0=>0x50535244)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000007c0)={<r1=>0xffffffffffffffff})
getsockopt$sock_cred(r1, 0xffff, 0x11, 0x0, 0x0)
_ksem_destroy(r0)
_ksem_destroy(r0)


rename(0x0, 0x0)
r0 = compat_30_socket(0x22, 0x3, 0x0)
compat_43_osend(r0, &(0x7f0000000200)="08200203", 0x4, 0x0)


shmget(0x0, 0x2000, 0x603, &(0x7f0000ffc000/0x2000)=nil)
shmget(0x0, 0x3000, 0x0, &(0x7f0000ffd000/0x3000)=nil)


r0 = socket(0x1f, 0x1, 0x0)
ioctl$FIOSEEKDATA(r0, 0xc0086661, &(0x7f0000001040))


bind$unix(0xffffffffffffffff, 0x0, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, 0x0, 0x0)
r0 = socket(0x800000018, 0x3, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x1000, &(0x7f0000000000)=0x7, 0x4)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
syz_emit_ethernet(0x4e, &(0x7f0000000140))


r0 = open$dir(&(0x7f0000000b80)='./file0\x00', 0x200, 0x0)
fcntl$lock(r0, 0x0, &(0x7f0000000200))


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
acct(&(0x7f0000000080)='./file0\x00')


acct(0x0)
minherit(&(0x7f0000ffc000/0x3000)=nil, 0x3000, 0x0)
munlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)


r0 = socket(0x1f, 0x5, 0x0)
listen(r0, 0x0)


mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0xc0e99db6de761f86, 0x0)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
syz_emit_ethernet(0x46, &(0x7f0000000140))
ioctl$FIONREAD(r0, 0xc0106978, &(0x7f0000000140))


r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f0000000100)="1f319c1fde2abc05119e9bdbed82410f2932fbd8845cf92b5b8ff03fab37e84f062a6661e620d93bf8e9bfdbd8850fa7aa8788b2bd66a10ccd45801e6147b7272a71be5f82227bb3d03acda3661252bbde1da4947d5032f057eb2fcfe99888b1e034125885a1770f221ca28972f352b2d04db1e8d522a260085a40c38e403025b4212719d15f6e1de8b9043ef294c1", 0xff82}], 0x1, 0x4af)


open(&(0x7f0000000040)='./file0\x00', 0xf8e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
_ksem_init(0x0, &(0x7f0000000100)=<r0=>0x0)
_ksem_wait(r0)
_ksem_post(r0)


compat_40_mount(&(0x7f0000000080)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
r0 = open$dir(&(0x7f0000000200)='.\x00', 0x0, 0x0)
r1 = openat(r0, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)
readv(r1, &(0x7f0000000080)=[{0x0}], 0x1)


mkdir(&(0x7f0000000340)='./file0\x00', 0x0)
compat_50_quotactl(&(0x7f0000000000)='./file0\x00', 0x60000, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000080)='./file0\x00', 0x8)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvfrom$unix(r1, &(0x7f0000000680)=""/4096, 0x1000, 0x0, 0x0, 0x0)
fcntl$setstatus(r0, 0x4, 0x40)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000640)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000080)=ANY=[@ANYBLOB="18000000ffff000001"], 0x18}, 0x0)
sendmmsg(r0, &(0x7f0000000480)={0x0}, 0x10, 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x2, 0x10, r0, 0x0, 0x0)
compat_50_clock_gettime(0x0, &(0x7f0000000000))


compat_50_setitimer(0x0, 0x0, &(0x7f0000000240))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
mlock(&(0x7f0000007000/0xc000)=nil, 0xc000)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_90_fhstatvfs1(&(0x7f0000000080)="a62f49629b06f3acd7f9224dba3d024c07fe13937d584cf8589d73ea", 0x1c, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000080)="f7f18c4b0d602d76648e1e31046b3d046bdf3bf31d62c7487d077681d6fcd0998d")
modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0xc1})
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0xffffffffffffffff, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_50_lutimes(&(0x7f0000000bc0)='./file0\x00', 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x20000000, &(0x7f00000001c0))
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)


compat_50_setitimer(0x0, 0x0, 0x0)
_ksem_init(0x0, 0x0)
clock_nanosleep(0x0, 0x1, &(0x7f0000000000), &(0x7f0000000040))


open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)
truncate(&(0x7f0000000440)='./file0\x00', 0x0, 0x3e2b649f)
open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
swapctl$SWAP_ON(0x1, &(0x7f0000000000), 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
pathconf(&(0x7f0000000080)='./file0\x00', 0x6)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80017472, &(0x7f0000000040)=0x1ff)


compat_50_setitimer(0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
ioctl$FIONREAD(r0, 0xc0106926, &(0x7f0000000080))


open(0x0, 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_clock_settime(0x40000000, &(0x7f0000000000))


socket(0x1f, 0x1, 0x0)
symlink(&(0x7f0000000080)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
socket$inet(0x2, 0x1, 0x0)
r0 = socket(0x18, 0x1, 0x0)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)
shutdown(r0, 0x0)


pipe2(&(0x7f0000000000)={<r0=>0xffffffffffffffff}, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
socket(0x0, 0x0, 0x0)
readv(r0, &(0x7f00000002c0)=[{0x0}], 0x1)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
open$dir(&(0x7f0000000080)='./file0\x00', 0x10, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x1, &(0x7f0000000280)="ea000001", 0x4)


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
compat_20_fstatfs(r0, &(0x7f0000000200))
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0xffffffffffffffff, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f00000003c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x80185760, &(0x7f0000000080))


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
pipe(&(0x7f0000000640)={<r0=>0xffffffffffffffff})
write(r0, &(0x7f0000000340), 0xd4e688a67930cd)


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
shmat(r0, &(0x7f0000ffd000/0x1000)=nil, 0x0)
mprotect(&(0x7f0000ffb000/0x3000)=nil, 0x3000, 0x0)


setreuid(0x0, 0xee01)
modctl$MODCTL_LOAD(0x0, &(0x7f0000000040)={&(0x7f00000000c0), 0x0, 0x0})


open$dir(0x0, 0x0, 0x0)
r0 = __clone(0x0, 0x0)
compat_50_wait4(r0, &(0x7f00000002c0), 0x4, &(0x7f0000000440))


r0 = socket$inet(0x2, 0x2, 0x0)
close(r0)
r1 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r1, 0x0, 0x200000000000a, &(0x7f0000000040)='\x00', 0x1)
setsockopt$inet_opts(r0, 0x0, 0x9, 0x0, 0x0)


symlink(0x0, 0x0)
link(0x0, &(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/file0\x00')
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
pread(0xffffffffffffffff, 0x0, 0x0, 0x0)
recvmsg(r0, &(0x7f0000000240)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x0)


madvise(&(0x7f0000ffa000/0x4000)=nil, 0x4000, 0x0)
mlock(&(0x7f0000ffd000/0x2000)=nil, 0x2000)
mlock(&(0x7f0000ff5000/0x3000)=nil, 0x3000)


r0 = socket(0x2, 0x2, 0x0)
shutdown(r0, 0x1)


rasctl(0x0, 0x9, 0x0)
rasctl(&(0x7f00000012c0), 0xd39, 0x0)
rasctl(0x0, 0x0, 0x1)


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x1a, &(0x7f0000000000)="5afee7d8", 0x4)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104306, &(0x7f0000000180)=0x8)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x2, r0, &(0x7f0000000240), 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000200)='ext2fs\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_40_mount(0x0, 0x0, 0x0, 0x0)
mknod(&(0x7f0000000100)='./file0\x00', 0x3a0914c44f7b202d, 0x500)
open(&(0x7f0000000080)='./file0\x00', 0x70e, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)


symlink(&(0x7f0000000000)='./file0\x00', &(0x7f0000000240)='./file0\x00')
r0 = open$dir(&(0x7f0000000200)='.\x00', 0x0, 0x0)
openat(r0, &(0x7f0000000000)='./file0\x00', 0x144, 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvmsg(0xffffffffffffffff, &(0x7f0000000700)={0x0, 0x0, &(0x7f0000000640)=[{0x0}, {0x0}, {&(0x7f0000000540)=""/239, 0xef}], 0x3, 0x0}, 0x0)
setsockopt$sock_int(r1, 0xffff, 0x1004, &(0x7f00000000c0)=0x8000, 0x4)
sendmmsg(r0, &(0x7f0000000500)={0x0}, 0x10, 0x0, 0x0)
recvmsg(r1, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000040)=[{&(0x7f0000000100)=""/218, 0x11d}], 0x1, 0x0}, 0x0)
dup2(r1, r0)
execve(0x0, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
__posix_rename(0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000200)='./file0\x00', 0x2)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f0000000140)="01")
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040))
semctl$SETALL(0x0, 0x0, 0x9, &(0x7f0000000040)=[0x7ff])
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_43_ocreat(&(0x7f0000000040)='./file0\x00', 0x0)


compat_50_setitimer(0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
modctl$MODCTL_LOAD(0x5, 0x0)
mknod(0x0, 0x0, 0x4f4b)
socketpair$unix(0x2, 0x3, 0x88, 0x0)


compat_40_mount(&(0x7f0000000080)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
symlinkat(0x0, 0xffffffffffffffff, 0x0)
chflags(&(0x7f0000000000)='./file0\x00', 0x20000)
compat_40_mount(0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='.\x00', 0x0, 0x0)
utimensat(r0, &(0x7f00000000c0)='./file0\x00', 0x0, 0x0)


_lwp_detach(0x0)
r0 = _lwp_self()
_lwp_wait(0x0, 0x0)
_lwp_detach(r0)


socket(0x10, 0x4000, 0x8)
mprotect(&(0x7f00006ea000/0x3000)=nil, 0x3000, 0x5)
shmctl$IPC_SET(0x0, 0x1, &(0x7f0000000a00)={{0x40000000, 0x0, 0x0, 0x0, 0x0, 0x20}, 0x0, 0x7, 0x0, 0x0, 0x10000000000008, 0x1, 0x100000001})
r0 = semget$private(0x0, 0x7, 0x3c0)
semctl$SETALL(r0, 0x0, 0x9, &(0x7f00000002c0))
semctl$SETALL(r0, 0x0, 0x9, &(0x7f0000000140)=[0x6, 0x2080])
semop(r0, &(0x7f0000000080)=[{0x0, 0x43, 0x800}, {0x4, 0xe6, 0x1800}, {0x0, 0x101}, {0x1, 0x20, 0x800}, {0x2, 0x5, 0x1800}, {0x1, 0x9e, 0x1000}, {0x2, 0xfffb, 0x1000}, {0x0, 0x40, 0x800}], 0x8)


r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x29, 0xe, 0x0, 0x0)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
swapctl$SWAP_CTL(0x5, &(0x7f0000000000)="06e494", 0x0)
ioctl$FIOASYNC(r0, 0xc0104308, &(0x7f00000001c0)=0x20000002)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40087468, &(0x7f00000001c0))


bind$unix(0xffffffffffffffff, &(0x7f0000000040)=@file={0xd19450564dee018c, './file0\x00'}, 0xa)
writev(0xffffffffffffffff, &(0x7f0000000100)=[{&(0x7f0000000000)="76e5eac907f9ccf7a251ceddcec7d6aa45cffe2c63a56077123a276d3ba4e9d17eb3eb5db12a3783a8e0620d357de1fe04fa9465b5bd1286e9624dec06a00c222f", 0x41}], 0x1)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
lchown(&(0x7f0000000040)='./file1\x00', 0xffffffffffffffff, 0xffffffffffffffff)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
rename(0x0, &(0x7f0000000100)='./file0/file0/../file0\x00')
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0xc020447f, &(0x7f0000000100))


setrlimit(0x8, &(0x7f0000000100))
socket(0x1f, 0x1, 0x0)


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
compat_43_ogetdirentries(r0, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0x1ffa, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
compat_50_lutimes(&(0x7f0000000080)='./file0\x00', &(0x7f0000000100)={0xffffffffffffffff})


r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r1, &(0x7f0000003000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xc)
listen(r1, 0x0)
connect$unix(0xffffffffffffffff, &(0x7f0000000280)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
dup2(r1, r0)
r2 = socket$unix(0x1, 0x1, 0x0)
accept$unix(r0, &(0x7f0000000040), 0x0)
connect$unix(r2, &(0x7f0000000280)=@file={0x0, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
pathconf(&(0x7f0000000000)='./file0\x00', 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
compat_43_ogetdirentries(r0, 0x0, 0xc, 0x0)


mknod(&(0x7f0000000040)='./file0\x00', 0x0, 0xe06)
compat_50_getrusage(0x0, &(0x7f0000000100))


_lwp_setname(0x0, &(0x7f0000001100)='*/-\x00')
_lwp_getname(0x0, &(0x7f0000001140)=""/4, 0x4)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open(&(0x7f0000000040)='./file0\x00', 0xf8e, 0x0)
rename(&(0x7f0000000440)='./file0\x00', &(0x7f0000000480)='.\x00')


r0 = socket$unix(0x1, 0x2, 0x0)
sendto$unix(r0, 0x0, 0x0, 0x40d, &(0x7f0000000580)=@file={0x0, './file0/file0\x00'}, 0x10)


r0 = socket(0x2, 0x3, 0x0)
r1 = socket(0x18, 0x3, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
dup2(r1, r0)
setsockopt$sock_int(r0, 0xffff, 0x1001, &(0x7f0000000100)=0x20000, 0x4)
write(r0, &(0x7f0000001680)="04bdfa5d1d2873c63e3534825ba166e2fea9aec43050006123339a346f731573d8d508753f95b7688ad48b8cf6bbca325cebc37fc4e1dd543dbe2da6dd", 0x1001c)


socketpair$unix(0x1, 0x1, 0x0, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
close(0xffffffffffffffff)
setrlimit(0x0, &(0x7f00000001c0)={0x42, 0x63})
semop(0x0, &(0x7f0000000140), 0x0)
getgroups(0x0, 0x0)
munlock(&(0x7f0000002000/0x1000)=nil, 0xffffffffdfffdfff)
truncate(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
r0 = socket$inet6(0x18, 0x3, 0x0)
sendto$inet6(r0, 0x0, 0x0, 0x0, &(0x7f0000000040)={0x18, 0x3}, 0xc)


open(&(0x7f0000000040)='./file0\x00', 0x200, 0x0)
open(0x0, 0x0, 0x0)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x5, 0x10, r0, 0x0, 0x0)
r1 = compat_30_socket(0x1f, 0x3, 0x0)
ioctl$WSMOUSEIO_SRES(r1, 0x80045721, &(0x7f0000003440))
compat_40_mount(0x0, &(0x7f0000000100)='./file0\x00', 0x0, 0x0)


getsockname$unix(0xffffffffffffffff, 0x0, 0x0)
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open(&(0x7f0000000040)='./file0\x00', 0xf8e, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
acct(&(0x7f00000000c0)='./file0\x00')


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000280)='./file0\x00', 0x0, 0x0)
bind$unix(r0, &(0x7f0000000440)=@abs={0x0, 0x0, 0x2}, 0x8)


r0 = socket(0x2, 0x2, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x0, 0x0, 0x0)
r1 = socket(0x2, 0x1, 0x0)
r2 = dup2(r1, r1)
shutdown(r2, 0x1)
listen(r2, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000300)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000440)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
mknod(&(0x7f0000000100)='./file0\x00', 0x0, 0x5300)
msgctl$IPC_SET(0x0, 0x1, &(0x7f0000000200)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0xfffffffffffffe00})
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x33, &(0x7f0000000000)="0000199a", 0x4)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x80000000002)
__clock_settime50(0x0, &(0x7f0000002c00))


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
link(&(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000340)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000480)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f00000006c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000e40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
chroot(&(0x7f0000000000)='.\x00')
compat_50_mknod(&(0x7f00000002c0)='./file0/file0/..\x00', 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000400), 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
write(r0, &(0x7f0000000040)="ed", 0x1)
recvmmsg(r1, &(0x7f0000000880)={&(0x7f0000000840)={0x0, 0x0, &(0x7f0000000ac0)=[{&(0x7f0000000240)=""/217, 0xd9}], 0x1, 0x0}}, 0x10, 0x317a, 0x0)
close(r1)


mknod(&(0x7f0000000400)='./file0\x00', 0x2000, 0x287e)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
writev(r0, &(0x7f0000000880)=[{0x0}], 0x1)


r0 = socket$inet(0x2, 0x2, 0x0)
sendmmsg(r0, &(0x7f00000008c0)={&(0x7f0000000340)={0x0, 0x0, 0x0, 0x0, 0x0}}, 0x10, 0x0, 0x0)


_ksem_init(0x0, &(0x7f0000000000)=<r0=>0x50535244)
_ksem_timedwait(r0, &(0x7f0000000980))
_ksem_destroy(r0)


munmap(&(0x7f0000ffa000/0x3000)=nil, 0x3000)
madvise(&(0x7f0000ffc000/0x1000)=nil, 0x1000, 0x0)


mknod(0x0, 0x0, 0x6da)
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40067408, 0x0)
mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
__lutimes50(&(0x7f0000000180)='./file0\x00', 0x0)


__clock_settime50(0x0, 0xffffffffffffffff)


open(&(0x7f0000000140)='./file0\x00', 0x80000000000206, 0x400)
setreuid(0x0, 0xee01)
__posix_chown(&(0x7f0000000880)='./file0\x00', 0x0, 0x0)


socket$inet6(0x18, 0x3, 0x0)
r0 = socket(0x18, 0x3, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
close(0x4)


ktrace(0x0, 0x0, 0x0, 0x0)
_ksem_init(0x0, &(0x7f0000000100)=<r0=>0x0)
_ksem_wait(r0)
_ksem_destroy(r0)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
symlink(&(0x7f0000000300)='.\x00', &(0x7f00000001c0)='./bus\x00')


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
setuid(0xee01)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(&(0x7f0000000380)='.\x02\x00', &(0x7f00000002c0)='.\x02\x00')
lchown(&(0x7f0000000140)='.\x02\x00', 0xffffffffffffffff, 0xffffffffffffffff)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x10000000000002}})
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x5fe})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x3, 0x0)
close(r0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180)=0x8000000000000032)
r1 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r1, 0x8118691c, &(0x7f0000000180)=0x8000000000000032)


r0 = socket(0x1f, 0x5, 0x0)
ioctl$FIOGETBMAP(r0, 0xc008667a, &(0x7f0000000100))


r0 = socket(0x2, 0x3, 0x0)
setreuid(0xee00, 0x0)
r1 = getuid()
seteuid(r1)
ioctl$FIOSEEKHOLE(r0, 0xc0986981, &(0x7f0000000180)=0x8000000000000032)


r0 = socket$unix(0x1, 0x5, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x8, &(0x7f0000000040), &(0x7f0000000100)=0x4)


setsockopt$inet6_MRT6_DEL_MFC(0xffffffffffffffff, 0x29, 0x69, &(0x7f0000000180)={{0x18, 0x3, 0x80000001, 0x3d2}, {0x18, 0x2}}, 0x3c)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmmsg(r0, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000380))
open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
fcntl$lock(r0, 0xa, 0x0)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
pathconf(&(0x7f0000000240)='./file0\x00', 0x5)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
__getfh30(&(0x7f0000000000)='./bus\x00', 0x0, &(0x7f0000000080))


mknod(&(0x7f0000000400)='./file0\x00', 0x2000, 0x287e)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
write(r0, &(0x7f0000000180)="1e", 0x1)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x5, 0x10, r0, 0x0, 0x0)
connect$unix(0xffffffffffffffff, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x20003101, 0x0)
socketpair(0x11, 0x3, 0x0, 0x0)


r0 = compat_30_socket(0x22, 0x3, 0x0)
compat_43_osend(r0, &(0x7f0000000040)="08200207", 0x2000, 0x0)


_ksem_timedwait(0x0, &(0x7f0000000000)={0xfffffffffffffff8})


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
r1 = dup(r0)
lseek(r1, 0x0, 0x0, 0x2)
__getdents30(r1, 0x0, 0xa9d9)
fcntl$setstatus(r1, 0x4, 0x10000)
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000005c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80047462, &(0x7f00000001c0))


compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
r0 = socket(0x18, 0x3, 0x0)
connect$unix(0xffffffffffffffff, 0x0, 0x0)
writev(r0, &(0x7f0000000680)=[{&(0x7f0000000000)="2f87bb4098d7de56", 0x7ffffffffffffff7}], 0x1)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
setrlimit(0x6, &(0x7f00000000c0))
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x2, 0x10, r0, 0x0, 0x0)
mlockall(0x1)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc048696d, &(0x7f0000000180)=0x8000000000000032)


mknod(&(0x7f0000000040)='./bus\x00', 0x100000000205f, 0x2802)
pathconf(&(0x7f0000000000)='./bus\x00', 0x2)


symlink(&(0x7f0000000900)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000940)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__posix_rename(&(0x7f0000000100)='./file0\x00', 0x0)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
rename(&(0x7f0000000640)='./file0\x00', &(0x7f0000000440)='./file0\x00')


mknod(&(0x7f0000000180)='./file0\x00', 0x2000, 0x202)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
fcntl$lock(r0, 0x4, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x402c7413, &(0x7f0000000040))


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x1803)
compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
r0 = socket(0x840000000002, 0x3, 0x0)
getsockopt$sock_cred(r0, 0x1, 0x11, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
mkdirat(0xffffffffffffffff, &(0x7f0000000000)='./file0\x00', 0x0)
swapctl$SWAP_ON(0x7, &(0x7f0000000000), 0x0)


swapctl$SWAP_ON(0x9, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
pathconf(0x0, 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)


mlock(&(0x7f0000ffc000/0x4000)=nil, 0x4000)
sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
mincore(&(0x7f0000ffd000/0x3000)=nil, 0x3000, &(0x7f0000000200)=""/181)
minherit(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x0)


socket(0x0, 0x0, 0x0)
socket(0x6c29f450c3de7f86, 0x0, 0xfd)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000340)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmmsg(r0, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)
compat_40_mount(&(0x7f0000000080)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
unlinkat(0xffffffffffffff9c, 0x0, 0x0)
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
setreuid(0xee00, 0x0)
r1 = open$dir(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
r2 = getuid()
fchown(r1, r2, 0xffffffffffffffff)
chmod(&(0x7f00000001c0)='./file0\x00', 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
rename(&(0x7f0000000000)='./file0/file0/../file0\x00', &(0x7f0000000140)='./file0/file0/../file0\x00')


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x20007462, 0x0)


r0 = socket(0x18, 0x1, 0x0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000000029, 0xb, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
dup2(r1, r0)
setsockopt(r0, 0x1000000029, 0xd, 0x0, 0x0)


mknod(&(0x7f0000000280)='./file0\x00', 0x1ffa, 0x0)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
ioctl$FIONREAD(r0, 0x8020699d, &(0x7f00000001c0))
socket(0x1f, 0x1, 0x0)


mknod(0x0, 0x0, 0x0)
mknod(0x0, 0x0, 0x1803)
open(&(0x7f0000000180)='./file0\x00', 0x200, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x10, r0, 0x0, 0x0)
sendto$inet6(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
msgctl$IPC_SET(0x0, 0x1, 0x0)
modctl$MODCTL_LOAD(0x0, 0x0)


_lwp_create(0x0, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
ioctl$FIOSEEKHOLE(r0, 0xc0206912, &(0x7f0000000180))
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r1 = open$dir(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
close(r1)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
ktrace(0x0, 0x0, 0x0, 0x0)
compat_43_ocreat(&(0x7f0000000000)='./bus\x00', 0x0)
__stat50(&(0x7f0000000180)='./bus\x00', &(0x7f0000000280))


compat_30_fhopen(0x0, 0x80)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
open(&(0x7f0000000500)='./file0\x00', 0x0, 0x0)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x0)


compat_50___msgctl13$IPC_STAT(0xffffffffffffffff, 0x2, &(0x7f0000000200)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000180)}})
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f00000000c0)='cd9660\x00', &(0x7f0000000000)='./file0/file0\x00', 0x8000005, &(0x7f00000001c0))


mknod(&(0x7f0000000000)='./file0\x00', 0x6000, 0x100)
acct(&(0x7f0000000080)='./file0\x00')
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000000)='./file0\x00', 0x4, 0x1100, 0xffffffffffffffff)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000100)={<r0=>0xffffffffffffffff})
sendmsg$unix(r0, &(0x7f0000001840)={&(0x7f0000001140)=@file={0x0, './file0\x00'}, 0xa, 0x0}, 0x0)
dup2(r0, r0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000427e, 0x0)
ioctl$FIONREAD(r0, 0x20004268, 0x0)


compat_43_osend(0xffffffffffffffff, &(0x7f0000000040)="08200203", 0x358, 0x0)
compat_40_mount(&(0x7f0000000100)='procfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80047410, &(0x7f0000000180))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
open$dir(&(0x7f0000000000)='./file0\x00', 0x40000400001803c1, 0x0)
close(0x4)
link(&(0x7f0000000940)='./file0\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000240)='./file0\x00', &(0x7f0000000640)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


socket(0x2, 0x2, 0x0)
r0 = socket(0x2, 0x1, 0x0)
r1 = socket$inet(0x2, 0x1, 0x0)
getsockopt$sock_int(r1, 0xffff, 0x10, &(0x7f0000000000), &(0x7f0000000080)=0xca555f06cd31e785)
r2 = fcntl$dupfd(r0, 0x0, r1)
close(r2)
mknod(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r3 = socket(0x2, 0x1, 0x0)
connect$unix(r3, &(0x7f0000000000), 0x10)
setsockopt$sock_int(r2, 0xffff, 0x1023, &(0x7f0000000080), 0x4)
r4 = dup(r0)
listen(r4, 0x0)


symlink(0x0, &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
symlink(&(0x7f0000000300)='.\x00', &(0x7f0000000240)='./file0\x00')


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000000)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
lchown(0x0, 0x0, 0xffffffffffffffff)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xb0afbd006181d6de, &(0x7f0000000540), 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
getsockopt(r0, 0x29, 0x0, 0x0, 0x0)


compat_50__lwp_park(&(0x7f0000000200)={0xfffffff8}, 0x0, 0x0, 0x0)


__stat50(0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmsg$unix(r0, &(0x7f00000000c0)={0x0, 0xfffffffffffffdb8, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="31000000ffff000001"], 0x28}, 0x0)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
link(&(0x7f0000001240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
symlink(&(0x7f0000000ac0)='./file0\x00', &(0x7f0000000e40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000600)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000f40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f0000000100)={0x0, 0x0, <r1=>0x0}, &(0x7f0000000140)=0xc)
setregid(0x0, r1)
r2 = getpid()
socket(0x11, 0x3, 0x0)
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x40000424, r2)
setregid(0x0, 0x0)
r3 = getppid()
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0x0, 0xffffffffffffffff}, 0x0, 0x0, r3})
pipe(&(0x7f0000000400)={<r4=>0xffffffffffffffff})
setpgid(0x0, 0x0)
setpgid(0x0, r3)
ioctl$WSKBDIO_GETMAP(r4, 0x80047476, &(0x7f0000000100)={0x0, 0x0})
r5 = fcntl$getown(r4, 0x5)
ktrace(&(0x7f0000000180)='./file0\x00', 0x4, 0xd30, r5)


minherit(&(0x7f0000ffc000/0x4000)=nil, 0x4000, 0x0)
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x80047476, &(0x7f0000000080))


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mknod(&(0x7f0000000000)='./file0\x00', 0x8000, 0xa718)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000140), 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
acct(&(0x7f0000000000)='./file0\x00')


open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
r0 = open$dir(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
preadv(r0, &(0x7f00000012c0)=[{0x0}], 0x1, 0x0)


mlockall(0x2)
mprotect(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x0)
mprotect(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x0)
r0 = socket(0x11, 0x3, 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x200, 0x0)
r1 = getpid()
ktrace(&(0x7f00000000c0)='./file0\x00', 0x0, 0xa5879f5d35e81df3, r1)
__fstat50(r0, &(0x7f0000001600))
open(&(0x7f0000000080)='./file0\x00', 0x70d, 0x400)
compat_40_mount(&(0x7f0000000200)='overlay\x00', &(0x7f0000000240)='./file0\x00', 0x20000, &(0x7f0000000540)="1454d6966c5a618b355cc9a77e25c63da8558c75df0048b0b94c8f3e4501d4fc2110df4e4500e737a9b7044ed2ac72652807e0bd7297f4157b5470e2c122bb052131f32e44e022ede7705b59558e38f3a2f9d2e14f9e923601651973a556f52cc6c82fbed86ed58942eec37f12673bacd09ac6cf197fe68162ad2bcc318c84e5b4a98eb43d3051e85bb25626b85677df8a2316683c1740bebb3063f44e2085598f312339c62479fa426987313240a455f6c218ee7469dbee6969cdea4d5f6b22197331d338bb7ccad7eff4d54ddfa75dd2ec1a7f095b4a80c7968ec978a8d250dd15a4b7e1832d26103d6dd1ba081b5c27034da29da37985cc7228503b67711e9d8902db2eb249f0d38e3e4286d7b1a4196e4cb1b288dd1bb9030c37cdfbf246dda9cb0b6105d72f4c999d31bae47f20751e6385f2173005494132fbd87a6e5c0577e703316e3ca69f274f7edd2dd1662c43c56654b21c53a0aa25f0fb6e158b94bfc6629b5bfb2d2938020c2f55de6ec26466eb4cf2257accfbd061fedd2d661b080a5aadbf82ba574052358c94e9a1508cb0614819b2f1772e5236d684e86c4695e9df56192722d85a694595e91a20614baae2e89122f8fc2efb9ecbab09c06f40d189e97146d14c119f6d26bcfadb5c3e2bb730ac89e4912d7ea77bf8ada4538cc34d252fbedecb06a06c2b1020f9857e8db5f695eed96c98382b6ac1fc037fb1a79b4317dc5e3d492339bbfcaf0a4a7ae86476db7aa82a4b876eb5ee001bca74e8f11c238af8c35c19764d134d783dd4d6bd5f5ffa6500c64fe85f0ade3f4737cf13cd40259f9418456c714aa1a273dd27ceb268a10613260f10b2f3bde6ba03d931c1b63d52b06a52a416c2acbee12035a562e40bb57ab877470e0452007fa2b2abeb342acf32d149a20e7142566dfbc57df1a2d4fe0ec723145847990ad344baf5416b069a62d288a7c40f778b661876387ee84f2888f1ce0e9b8b15e0e9910a8350bd1a65e7c0ad0e060657da5b350feb09d0f2a97e78ddd1f9aba1ae85ebf369e88195e497e85437fef2d738c869f5ff6565ed7a0f2efaee8cb069b02aa9b754da70903043f3ca6b6b0e16f3c6979f1c041f0646c70a337099271ec87d7b4d201ff3a652981f20d54d5d4cd307e5f4f14d66a4518f5126921e20632a8ee18bb8408d850ba26e980af07b66064294da6db747c6f212bd6980d3e3693e2c75b95681fc587594526996e86398d75407b48cba9ed1f655bca1209f08098a67ad2c97ab3a13574e18e9312b8e24c71fe227a4c31c3a731164cc211c7dcd48cd54fff7ec2d4974255e4600422aadc64d98ab0fa75ebea74487c38416da9b0b4b2c7a1293791679618afefd647af9d745507c57767637e760b721aceb346c73e3668fbdaa8b8a4aead1ae6a65e34cf13b364d3b829b730c13e91e2abbe75f1dcdb1494b198d88c5d4797490d7a6d1cbfcd58b9fe5fd688c5e3f91951675ce86339721898d4bf0535766785a2c94027b06a9327dcda8d7387f17c69bcdfb029f5fcb3219210b456f5830bd51a33f19e8a81c97824d2f67f1a3db9902cbed7ee6b7ea301d8593a5033b35f6257d83eeb3b914da2f32fba583c11cc01f84e97e624b3df9d0af459795536f33ee33ec81465e3e6a8cf6a69f141e9e00493c0ace6cffeff6f4e5e8bccf8c56d03e6cbd1f4125f523b25f8e345d9b356ef8f4ff1068ca27ca6cc5e4b06e11962cd946c3987c2c80d2ddf69e2e9131d8d10d01b67a797af7fb688ca04569b8aa59301d2e8ada6fa9e3815f7f8a99110de507b2ffed1f36f62c8ea68935d5df9b91e33a3272c53644e9c73469b510b7a287101bec87bb5c0659dc5c00624f4c0c4d989c6e502abb43832b5980f46ecb35377205cd428867f9131aff628d6f20d44ec48ea434ca0855cb987e0e85549e521e49020def4492607072f2dd60abe46e460411e2e0447e5606343c8bda6d0d6fa5904c9c4b6c2e0735fa23d94e3ea2e1f10e7cd0bc8647402f4637539727e67901510e96170f28eaf6ab197bb14282b6a8f1c4a3111101d95cad3babdace8cf6d6a421d5fc4cb7df4ca4afb803c9af05cb565b8dd665707eb3850a97db052bb3b6cbe24f4dff2958187d170af67da91b0afd9c267e9a5ba19bbe84c61456ce85cc75adeb76582f6ea0cfaea9f4b0f7580e273160c63bb8e105d3ed65307a5da31fa7bdd53ebdf2b0567d414be2169ab5a0ec8a15c4cc0bd85ed05294e7c2066eb839cc09e133904b002db6e4cadf3f7aa09d5552bf7dbeb43596b5726fa0593efe09abedc2017b52a1e42506d35ace12e3f6f0d4d8eb1a1dff566f5b6abfeb76035d7ae06814dca0e094802cde32d486c9d1244f175216618502a2fb11ce55697dbe0a7f701b517a5950517dc3ef57a4cba590287ebf2cbf6d43db816d541e1bb873de4289c9fc31f7e240e1ae1603821b54271088b660910e7ea0543cc4e570244f3fdaef22a2694f63caa22c2e755ea1e9b86395ac3d94e50e24b71e141baa6446076a937e89519681a74095a1ac9fc33833cd2c9c4d5221005de3068fe56550032742c99062824dcab89dd442118a7c666b56516b3ec58d4d1c70dc14ef6bc3a2a3058e3ec30740ea7198fda06d609ed66756e450dc73cc60ad417ddced2582539c4372f673eb43dc1feb484e3cea21383c1eb3f846651ad64eda178bfd5622fee3788f34f273fa5c56d1abb6cb380ea0cbca0a23575fcb3914759baa670bf8cf9f67d77f4b297c8465306a32a2a42d0c633e4979711d12d85ed6d2614276eecdb27f6d10209c3160213b2d52c26520ca55bf8708200266336e1b4fbc86a81368f0737cb7bd958b3258ee93dea6d67ea120fb613762d1af86b761c913a777b0b080cf73bd97343fff8ad5faa0cbc39b28e9187c9457e6ffd22c820e21978343e50d3019e47b9af7d0b961acb594b0bbac90fea7390e1608586194a300f5c8555eb5b8567bc45643a55712ee862ff9685fc9229feae05fc6c5b185efcc2639d54cc1deae13bf7d437b09a1f17434fb4db3b3627e2ad779be30414f65a6f15c92df0e7bdca6a99ce25b2130f1b1419d42ad14a7449a0d50444df03e94b2258372dbaaadfd9a6f578d821dde54db2508a94d215fe5963480bf32f6d5d1c471a0aed9fa2540c5cb120138df3231a074256cdea28b6b872e1ff3a1bb8cfa18c9c6c29aaabdd363a80097e6604c4d2343507a37b708c5921943982e2d6abce847249489209117b3b2bc2100037b5120b7731c41d9fe8f9f7d04771b80f72a18bc83377a6b73e178c54ec840b4db9812a4b3d86bf596e67a758a199725f5269126b900fa87802049a25e7ad2fa8ec2185e97b20ec24b65930fa965675c76270930933f70ca5ca0293c77e16d0646279c6be9710919ee7bfc65e17ea15e1031d6d985ff74f57cf3e1ad254377ce169276306ec2fd355fe61b6b37eafeccfe45e11d0440e6b3a2e608a14e3bb5e5eedf0112c35ada328efeda0d3bb83bc39abb4101a75b459194555881785cde1d3f230daf93d9eaf7ad1e862ae5e84f13f6bd2ef697244f5631c05dfb287c4a736642c551d7678d64b4e12ac37439ac5bfcf7b52f3b1f1a9e8e8711bb95c7fd5e979c5e8c7530f8c76a0a88d0c49580d7d5dc9396a541cfc1460faf3caf649956ad4aed7ddc4e8c39f37b017b2c82365b03981ebbb71cc8df724ea40f01bc7185b8d3cd67a45a21f6e9b7831b51e43134c3a16a8f0f4732dd3509fcc84fa8ef3c5716f2cdccbc298946f0f80f3b4a0df01568bf20afcf7e3933d2f7965cb237030ac6c58795318f729db501f7b57da50b7116873e86cd78b68bdbaa72aedc4211ba8474da85640908538dc7686814ec3ad5ba053be80f1bb9e1a2be5751b7c2dcb50eef3d4e5b0ea80fbe2cd240a99e2e82e515694172e765d454c7230d410a8004a7ffcc384438f79bcccdad3f766d8807196749c3b3123b624e518251f6568bafd71cbdd88941798325bc917bc7bed22f5c0dff3a4e2a1d072fd59bd2fc74b2087651b80284e35a2cce3ecf2f50202550c613b43f76f84a0d5724011212e383a826db6e35efd3d0a40467e7a72fe9db42a11bf1fc2d7bb20d9c693f3c52e8ac85e458ddac5021306dd48601406a29e7666fd3760446289c6abeb647d87bfee989270d73f81ac51520ad4881b0f4d72c299cec7f878fb91093e6585f72887734ce89741ef0948fc31c1feb53d71539b839c4b5081af62f66a97cc93f49d61a33627125efacf1290934dd079fd6f216d20d8be8743cc2518b9bf06c5116b877b4c0f916842c3fa7474befbffe0d0b0e08c5e6c5040c1ff8dbdea46e1ce9f55c5b68092b2ba39ee5a0393c5b8c3b8cb9f146f1d8c4191e1ad5cedca634ee59e5fdf63d0ba0d92546d83c8e5abb8322106b70376ce8dd52db141a89ca674b79c386681617eb1bcddbac2c381a983bbe6ddf7a5d55168e4f8d6257b5c0fdafa58e8178a12e9022f4f5e6bc966d2ed4bd2abe89187afe1e7b1167401c103666d9c1d73153edafc491801b186c39e3c90fd60281a7e2c48fc0fa3763c4bd493e217a9000f60391daa885a2c71601aa44a4bc3bd5381984662272c4cd19e73afbc75290cf2c9281c5447f7657a18c058957c62f9b2191d4bf2b2b5ddc2b7464510df52198d3127da6cb35ee419de821cfbcc3e9ca47379c51a5f63322f5616edb4f1dc98d3ac14a6916274b69a60cd630d63031156e44133bbf57c642f794ad44fd686f00352e262027c62994fbaee6dd8d8867e30ca74d13c08b1a2c03b267fd8ea215136f03a52d154a77e0e83709ddb802cba0d060da9e2d093dfa7dba6c8e92491db7971b2ea0cfc9118713473bc56d78bedd86a069df4c12fd159231402b496f3334dad17d8301d53d367eec3f8786ff129d44072d26bcdc8e733d6e4cf5ad927f295d1618b462909e3ccaac657927fdbc311be800bf555b933f31563efc2d51411f2e15c5d48853df98541a27a524aeb8ffa2a297c8eee1e9350ee8658088414f57a8948aef84404b0b24ad2458293a1fbd85539a0d5fdbc8d599fd5aa947f091723f662831e16913530e4938831408568e4243dc41987cd1c833e556ac4b0992a33b5df10565c98ca44a976a9f5acf76fae8aa6ae971b3621a004dc49e55414861f990d48dbb08d709ec02c0730b9668fbd87310c425f1281b4e0421fe39146bd5cbfc047983ce427a560af6f848838f0e905c344cbaa597ed2d483695aa6b6ffce827798f25254e8c704db5623bb4e06aa1c3aed05f16f0c700e638b94b0d6829b42f23fb016809c68ed5aa0b638c0ddd7f74a56841c4e854fcc75b0a08a9755d60ac0af4b43b1829ad602bfc26ecda5a9a2d3441e6f094fba98a800167fcdf0bed5167294809f9566ef1b274578bc3f73a6d234aeabca410c86eeb46d3b93a16c1162d439302a3ded64342fe6582c5ca8cc3cd4ba3a5362c3a4845ffd5a4d78220c30820c00ed37254386fae727ba5b335101d1d30c2a5b818afe878957b684a0d6650361e60f40002221c81623e0c909996e27983ee2bcc6b5ade1090d524a3bc23dc26ec715acf282534f72e6a2ad91adf37c4d149113a0f034ff361b784c53db77c5c12e43a939c1426c31d124e7ba3353a3a1a4051523ed74b430550c8506e46f0c41ad6f6d8ff0ac3f1bc16ad38898fcd51043f638781fa643c558043a5f9ad147531534bfc9c378cc983c687fcfeca8b5707b251aa5aed8c746c5920b52b17b03517242613e7e8110601799924131a782406d67d36c5bb")
mkdirat(0xffffffffffffff9c, &(0x7f0000000340)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)


compat_20_getfsstat(&(0x7f0000000000), 0xffffffffffffff39, 0x0)
setsockopt(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80104305, &(0x7f00000001c0)=0x20000002)


mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0xc0e99db6de761f86, 0x0)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x1, 0x0)
open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x40047307, &(0x7f0000000140))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
unlink(&(0x7f0000000000)='./file0\x00')
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
close(r0)


r0 = msgget$private(0x0, 0x0)
msgrcv(r0, 0x0, 0x0, 0x2, 0x0)
msgsnd(r0, &(0x7f0000000c40)=ANY=[@ANYRESDEC], 0x101, 0x0)
msgsnd(r0, &(0x7f0000001180)={0x2, "086085b026b4d30a7154e475bf9d14996e39be2b9c55981a43c1cda3653181f486a1e07886a81055f25be98e8fe3aac5c2cd12d674af1ae3c0439335da7baf8dec9b610c"}, 0x4c, 0x0)


mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
r0 = open(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
pread(r0, &(0x7f0000000d00)="8e", 0x1, 0x0)


mkdirat(0xffffffffffffff9c, &(0x7f0000000680)='./file0\x00', 0x0)
mkdir(0x0, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mkdir(&(0x7f0000000040)='./file2\x00', 0x0)
mkdir(&(0x7f0000000300)='./file2/file0\x00', 0x0)
rename(&(0x7f00000002c0)='./file2/file0\x00', &(0x7f0000000340)='./file0\x00')
openat(r0, &(0x7f0000000000)='./file2\x00', 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x20007461, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000480)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000040)='./file0\x00', 0x6)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_lutimes(0x0, &(0x7f0000000180))


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x40047307, &(0x7f0000000180))


open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_43_orecv(0xffffffffffffffff, 0x0, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)={0x0, 0x0, 0x0, 0x2}, 0xdffffffffffff801}})
__mount50(&(0x7f0000000080)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
compat_40_mount(&(0x7f0000000340)='zfs\x00', &(0x7f0000000140)='./file0\x00', 0x0, 0x0)
link(0x0, 0x0)
ioctl$WSKBDIO_BELL(0xffffffffffffffff, 0x20005701)
socket(0x0, 0x0, 0x0)
r0 = semget$private(0x0, 0x5, 0x0)
semop(r0, &(0x7f0000000100)=[{0x4, 0x401e, 0x1000}], 0x1)
semop(0x0, 0xffffffffffffffff, 0x0)
semctl$GETZCNT(0x0, 0x0, 0x7, 0x0)
open(0x0, 0x0, 0x20)
compat_50_____semctl13$SETVAL(r0, 0x0, 0x8, &(0x7f0000000100))


r0 = socket(0x18, 0x2, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setsockopt(r0, 0x1000000029, 0x30, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e699", 0x10)


compat_43_osendmsg(0xffffffffffffffff, &(0x7f0000000500)="76219e8bf93cd7a79db3", 0x0)


r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0xf02, 0x0)
mlockall(0x2)
mmap(&(0x7f0000ffc000/0x4000)=nil, 0x4000, 0x4, 0x10, r0, 0x0, 0x0)


ktrace(&(0x7f0000000040)='./file0\x00', 0x4, 0x114, 0x0)
compat_43_ogethostid()


modctl$MODCTL_UNLOAD(0x2, 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


pipe(&(0x7f0000000080)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0x62e2dd08f149ff1b, r1)
setreuid(0x0, 0xee01)
__getcwd(0x0, 0x0)


r0 = socket$unix(0x1, 0x5, 0x0)
compat_50___fstat30(r0, &(0x7f00000003c0))


modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0xffffffffffffffaa})
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
fcntl$lock(r0, 0xa, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f00000000c0)='./file0\x00', 0x5)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
r1 = open$dir(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
linkat(r0, &(0x7f0000000040)='./file0\x00', r1, &(0x7f00000000c0)='./file0\x00', 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pathconf(&(0x7f0000000080)='./file0\x00', 0x11)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open$dir(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
openat(r0, &(0x7f0000000240)='./file0\x00', 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', 0x0)
__mount50(&(0x7f0000000280)='kernfs\x00', &(0x7f00000002c0)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pathconf(&(0x7f0000000080)='./file0\x00', 0x6)


setrlimit(0x0, 0x0)
setrlimit(0x0, 0x0)
r0 = msgget$private(0x0, 0x0)
msgsnd(r0, &(0x7f0000000d00)=ANY=[@ANYRESHEX], 0x401, 0x0)
msgsnd(r0, 0x0, 0x401, 0x0)
symlinkat(0x0, 0xffffffffffffff9c, &(0x7f0000000280)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000600)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000f40)='./file0\x00')
msgrcv(r0, 0x0, 0x0, 0x3, 0x0)
msgsnd(r0, &(0x7f0000000000)=ANY=[@ANYBLOB], 0x83, 0x0)
getuid()
getgroups(0x0, 0x0)
getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000340), &(0x7f0000000380)=0xc)
getsockopt$SO_PEERCRED(0xffffffffffffff9c, 0xffff, 0x1022, 0x0, 0x0)
msgctl$IPC_RMID(r0, 0x0)
r1 = getppid()
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0x0, 0xffffffffffffffff}, 0x0, 0x5, r1})
setpgid(0x0, r1)


__mount50(&(0x7f0000000080)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f00000000c0)="a9", 0x1)


compat_50_wait4(0x0, 0x0, 0x160596, 0x0)
r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffd000/0x3000)=nil)
shmat(r0, &(0x7f0000ffc000/0x3000)=nil, 0x0)
getgroups(0xa, &(0x7f0000000000)=[0x0, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0xffffffffffffffff, 0x0])
r1 = getuid()
setreuid(0x0, r1)
__fstat50(0xffffffffffffff9c, &(0x7f0000000040))
r2 = semget$private(0x0, 0x5, 0x2c4)
semop(r2, &(0x7f0000000100)=[{}], 0x1)
semop(0x0, 0xffffffffffffffff, 0x0)
semctl$GETZCNT(r2, 0x0, 0x7, 0x0)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
rename(0x0, &(0x7f0000000100)='./file0/file0/../file0\x00')
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0xc020447f, &(0x7f0000000100)=0xffffffff)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000005c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
chdir(&(0x7f0000000000)='./file0\x00')
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x4000000000}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x0, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


__getlogin(0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
shmctl$IPC_SET(0x0, 0x1, 0x0)
shmat(r0, &(0x7f0000001000/0x3000)=nil, 0x3000)
r1 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffd000/0x1000)=nil)
compat_50___shmctl13$IPC_SET(r1, 0x1, &(0x7f00000019c0)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
_lwp_create(&(0x7f0000000140)={0x0, 0x0, {}, {}, {0x0, 0x0, ':\xfe2\x9cT}\f\x00'}}, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
r0 = open$dir(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
preadv(r0, &(0x7f0000000100)=[{&(0x7f0000000500)=""/4096, 0x1000}], 0x1, 0x0)


r0 = socket(0x10, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x801669f0, &(0x7f0000000180)=0x8000000000000032)


r0 = socket$inet6(0x18, 0x3, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x1)
sendto$inet6(r0, &(0x7f0000000000)="88", 0x358, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


minherit(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0)
mlock(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
munlock(&(0x7f0000ffb000/0x3000)=nil, 0x3000)


madvise(&(0x7f0000ffc000/0x2000)=nil, 0x2000, 0x0)
mlock(&(0x7f0000ffb000/0x1000)=nil, 0x1000)
mmap(&(0x7f0000ff9000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


r0 = socket(0x1f, 0x5, 0x0)
recvmmsg(r0, &(0x7f0000001140)={0x0}, 0x10, 0x3, 0x0)


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40046486, 0x0)
sendmsg$unix(0xffffffffffffffff, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000140)=ANY=[@ANYBLOB='(\x00\x00\x00', @ANYRESHEX], 0x28}, 0x0)
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000140), 0x0)


pipe(&(0x7f0000000080)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
readlinkat(r0, &(0x7f0000000100)='./bus\x00', 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f0000000300)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x4004667f, &(0x7f0000000080))


modctl$MODCTL_UNLOAD(0x2, 0x0)
setpriority(0x0, 0x1000, 0x0)
mknod(&(0x7f0000000280)='./file0\x00', 0x6000, 0x500)
unlink(&(0x7f0000000000)='./file0\x00')


r0 = compat_30_socket(0x22, 0x60000003, 0x0)
sendmsg$unix(r0, &(0x7f0000001a80)={0x0, 0x0, 0x0}, 0x1)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendto(r1, 0x0, 0x0, 0x8, 0x0, 0x0)
recvmsg(r0, &(0x7f0000000600)={0x0, 0x0, &(0x7f0000000040)=[{&(0x7f00000002c0)=""/146, 0x92}], 0x1, 0x0}, 0x1002)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
compat_50_utimes(0x0, &(0x7f0000000180))


modctl$MODCTL_UNLOAD(0x2, 0x0)
swapctl$SWAP_ON(0x4, 0x0, 0x0)


__clone(0x0, &(0x7f00000000c0))
__clone(0x0, 0x0)
__wait450(0x0, 0x0, 0x1e, &(0x7f0000001200))
__wait450(0x0, &(0x7f0000001400), 0x4, 0x0)


r0 = socket(0x18, 0x400000002, 0x0)
getsockopt(r0, 0x29, 0x3e, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)


compat_50_setitimer(0x3, &(0x7f0000001800)={{}, {0x0, 0xf423f}}, 0x0)


shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
shmat(0x0, &(0x7f0000001000/0x3000)=nil, 0x0)
mincore(&(0x7f0000000000/0x9000)=nil, 0x9000, &(0x7f0000000280)=""/122)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
open$dir(&(0x7f0000000180)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x0)
compat_50___lstat30(0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={0x0, 0x0, 0x0}, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))


madvise(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x0)
mlock(&(0x7f0000ffb000/0x1000)=nil, 0x1000)
mmap(&(0x7f0000ff9000/0x4000)=nil, 0x4000, 0x3, 0x1010, 0xffffffffffffffff, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
chmod(&(0x7f0000000180)='./file0\x00', 0x0)


compat_90_statvfs1(0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
bind$inet(r0, &(0x7f0000000000)={0x2, 0x3}, 0xc)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
truncate(&(0x7f0000000200)='./file0\x00', 0x0, 0x7fff)
truncate(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


compat_30_socket(0x22, 0x3, 0x2)


compat_43_ommap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0x1, 0x2, 0xffffffffffffffff, 0x0)


mknod(&(0x7f0000000180)='./bus\x00', 0x2000, 0x5700)
open(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
execve(0x0, 0x0, &(0x7f00000003c0))
syz_emit_ethernet(0x12, 0x0)
syz_emit_ethernet(0x7e, 0x0)
unlink(&(0x7f0000000080)='./bus\x00')


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000980)={0x0, 0x0, &(0x7f00000007c0)=[{&(0x7f0000000440)="670199445fdfebcfae989a38bd05ade6", 0x10}], 0x1}, 0x0)
recvmmsg(r0, &(0x7f0000000400)={&(0x7f0000000540)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000280)=""/205, 0xcd}}, 0x10, 0x0, 0x0)
sendto$unix(r1, &(0x7f0000000040)="04", 0x1, 0x0, 0x0, 0x0)
recvfrom$unix(r0, &(0x7f0000000480)=""/98, 0x62, 0x1040, 0x0, 0x0)


modctl$MODCTL_LOAD(0x5, 0x0)
_lwp_setname(0xffffffffffffffff, 0x0)
r0 = compat_30_socket(0x22, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0086662, &(0x7f0000000100))


rmdir(&(0x7f0000001900)='.\x00')


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x7, r0, 0x0, 0xffffffffffffffff)


__fhstat50(&(0x7f0000000180)="eb", 0x1, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80047476, &(0x7f0000000180))


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
clock_nanosleep(0x0, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
faccessat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x2, 0x0)


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x4ebfac6bbaf7949)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
fktrace(r0, 0x2, 0x2, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0))
compat_50_clock_gettime(0x60000002, 0x0)


mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x1000, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
mmap(&(0x7f0000000000/0x400000)=nil, 0x400000, 0x3, 0x5012, 0xffffffffffffffff, 0x0, 0x0)
poll(0x0, 0x0, 0xffffffff)
mknod(&(0x7f00000000c0)='./file0\x00', 0x8000, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})
r1 = open(&(0x7f0000000040)='./file0\x00', 0x205, 0x0)
fcntl$lock(r1, 0x8, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x1000300010008, 0xffffffffffffffff})


fcntl$lock(0xffffffffffffffff, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x13, &(0x7f0000000040)="00fb6c4f", 0x4)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4301)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0xc0107705, &(0x7f0000000080)={0x5d, 0x0})


r0 = socket(0x12, 0x2, 0x0)
compat_43_ogetsockname(r0, 0x0, 0x0)


sendmsg(0xffffffffffffffff, &(0x7f0000000280)={0x0, 0x0, &(0x7f00000006c0)=[{&(0x7f0000000180)="f69623aaf2ffdf8200d384a60f32357316ee", 0x12}], 0x1, 0x0}, 0x0)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8020690c, &(0x7f0000000180)=0x8000000000000032)


mknod$loop(0x0, 0x0, 0x1)
r0 = socket(0x18, 0x2, 0x0)
close(r0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000029, 0x2e, 0x0, 0x0)
getsockopt(r0, 0x29, 0x2e, 0x0, 0x0)


semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x200}})
r0 = socket(0x18, 0x3, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r0, &(0x7f00000000c0), &(0x7f0000000000)=0xffffffffffffff35)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7, 0x1}, 0x8)
r1 = socket(0x1, 0x1, 0x0)
close(r1)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r2 = socket(0x18, 0x1, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
shutdown(r2, 0x1)


compat_60__lwp_park(0x0, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f0000000300)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x40044271, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(0x0, 0x0, 0x0, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x144, r0)
_ksem_unlink(0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = compat_30_socket(0x22, 0x3, 0x0)
compat_43_osend(r0, &(0x7f0000000040)="08200203", 0x358, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
connect$unix(0xffffffffffffffff, &(0x7f0000000000)=@abs={0x0, 0x0, 0x2}, 0x8)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104302, &(0x7f00000001c0)=0x20000002)


_ksem_timedwait(0x50535244, &(0x7f0000000000)={0x1, 0x7f})
_ksem_wait(0x0)
clock_nanosleep(0x1, 0x0, &(0x7f0000000040)={0x3ff, 0x72df}, &(0x7f0000000080))
__clock_settime50(0x0, &(0x7f00000000c0)={0x7, 0xe09})
_ksem_getvalue(0x50535244, &(0x7f0000000100))
__clock_getres50(0x40000000, &(0x7f0000000140))
__clock_getres50(0x0, &(0x7f0000000180))
execve(&(0x7f0000000240)='./file0\x00', &(0x7f00000003c0)=[&(0x7f0000000280)='\'\\\xdd\x9d\v\x00', &(0x7f00000002c0)=')[*\x00', &(0x7f0000000300)='\\\x00', &(0x7f0000000340)='\x00', &(0x7f0000000380)='^@-,\x00'], &(0x7f0000000440)=[&(0x7f0000000400)='-$#%\x00'])
_ksem_destroy(0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x3a0914c44f7b202d, 0x500)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x804, 0x0)
fcntl$setown(r0, 0xa, 0x0)


compat_43_osetrlimit(0x0, 0x0)


writev(0xffffffffffffffff, 0x0, 0x0)
r0 = socket(0x2, 0x2, 0x0)
ioctl$FIONREAD(r0, 0x80206979, &(0x7f00000001c0))
dup2(0xffffffffffffffff, 0xffffffffffffffff)
getpriority(0x2, 0x0)


setrlimit(0x3, &(0x7f00000000c0)={0x100000, 0x100000001})


setsockopt$inet_opts(0xffffffffffffffff, 0x0, 0x0, &(0x7f0000000180)="33b62b53cc518098fb586f8654ccabac394a", 0x12)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80206913, &(0x7f0000000180)=0x8000000000000032)


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80204611, &(0x7f0000000380))


r0 = compat_30_socket(0x1f, 0x3, 0x0)
ioctl$WSMOUSEIO_SRES(r0, 0x80045721, &(0x7f0000003440))


compat_40_mount(&(0x7f0000000200)='ptyfs\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f00000002c0))
r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
fcntl$getown(r0, 0x10)


setregid(0x0, 0xffffffffffffffff)


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x13, &(0x7f0000000040)="8b589d9d", 0x4)


getpid()
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvmmsg(0xffffffffffffffff, &(0x7f0000000700)={&(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000680)=""/100, 0x64}, 0x3f8d}, 0x10, 0x0, 0x0)
sendmmsg(r1, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)
setsockopt$sock_timeval(r1, 0xffff, 0x1005, &(0x7f0000000000)={0x0, 0x8}, 0x10)
close(r1)
recvfrom$unix(r0, 0x0, 0x0, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x4004667b, 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x18, &(0x7f0000000080)="301dc649", 0x4)


seteuid(0xffffffffffffffff)
r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000001000/0x3000)=nil)
seteuid(0x0)
shmctl$IPC_SET(r0, 0x1, &(0x7f0000000200)={{0xffffffffffffffff, 0x0, 0x0, 0xffffffffffffffff}})
shmat(r0, &(0x7f0000ffb000/0x4000)=nil, 0x2000)


open(&(0x7f0000000500)='./file0\x00', 0x70e, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x0, 0x10, r0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x3)


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_40_mount(&(0x7f0000000200)='ptyfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000a80))
r0 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r0, &(0x7f00000001c0)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x56)


bind$unix(0xffffffffffffffff, 0x0, 0x0)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, 0x0, 0x0)
__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
linkat(0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x10, 0x0)


mlock(&(0x7f0000ffa000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ffc000/0x4000)=nil, 0x4000)
munlock(&(0x7f0000ffa000/0x3000)=nil, 0x3000)


modctl$MODCTL_LOAD(0x5, 0x0)
symlink(&(0x7f0000000300)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
r0 = open(&(0x7f0000000400)='.\x00', 0x0, 0x0)
flock(r0, 0x2)


compat_50___msgctl13$IPC_SET(0x0, 0x1, 0xfffffffffffffffe)


mkdir(&(0x7f0000000380)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='mfs\x00', &(0x7f0000000080)='./file0/../file0\x00', 0x0, &(0x7f0000000100)="86", 0x1)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
chmod(&(0x7f0000000080)='./file0\x00', 0x7bdb48f284ddc81d)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
__mount50(0x0, &(0x7f0000000200)='./file0\x00', 0x400000, &(0x7f00000003c0)="a6", 0x1)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
modctl$MODCTL_UNLOAD(0x2, &(0x7f0000000080))


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000300)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
compat_43_ogetdirentries(r0, 0x0, 0x0, &(0x7f00000000c0))


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x40000802)
pathconf(&(0x7f0000000080)='./file0\x00', 0x9)


symlink(&(0x7f0000000900)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000940)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__posix_rename(&(0x7f0000000100)='./file0\x00', 0x0)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
symlink(&(0x7f0000001640)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/../file0\x00', &(0x7f0000000e40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000640)='./file0\x00', &(0x7f0000000440)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


__getcwd(0x0, 0x0)
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x2a, r0, &(0x7f0000000000), 0x0)


r0 = open$dir(&(0x7f0000000040)='.\x00', 0x0, 0x0)
r1 = open$dir(&(0x7f0000000040)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000680)='./file0\x00', r1, &(0x7f00000006c0)='./file0\x00')
linkat(r0, &(0x7f00000000c0)='./file0\x00', r0, &(0x7f0000000100)='./file1\x00', 0x0)


r0 = socket(0x18, 0x1, 0x0)
r1 = fcntl$dupfd(r0, 0x2, 0xffffffffffffffff)
close(r1)
flock(r1, 0x0)


compat_43_osendmsg(0xffffffffffffffff, 0xffffffffffffffff, 0x0)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x20000000083fe})
r0 = socket(0x18, 0x1, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r0, &(0x7f00000000c0), &(0x7f0000000000)=0xffffffffffffff35)
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffff9c, 0x80185760, &(0x7f0000000000)={0xfffffffe})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket(0x18, 0x2, 0x0)
close(r1)
r2 = socket(0x18, 0x2, 0x0)
setsockopt(r2, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x1})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0xb, &(0x7f0000000000)='\x00\x00\x00\x00', 0x4)
setsockopt(r0, 0x1000000000029, 0xa, &(0x7f0000000040)="000001ad", 0x4)


compat_50_clock_gettime(0x3, &(0x7f00000000c0))


madvise(&(0x7f0000ffa000/0x1000)=nil, 0x1000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x4, 0x0, 0x0)
mlock(&(0x7f0000ffa000/0x1000)=nil, 0x1000)
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


openat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, &(0x7f0000000000), 0x1)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r0 = socket(0x18, 0x1, 0x0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000000029, 0xa, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
dup2(r1, r0)
setsockopt(r0, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80047460, &(0x7f0000000100))


modctl$MODCTL_UNLOAD(0x2, 0x0)
setpriority(0x0, 0x0, 0x0)


compat_30_fhopen(0xffffffffffffffff, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
fcntl$setstatus(r0, 0xd, 0x0)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
r1 = socket(0x18, 0x1, 0x0)
setsockopt(r1, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


setrlimit(0x6, &(0x7f00000000c0))
mprotect(&(0x7f0000ffb000/0x2000)=nil, 0x2000, 0x4)
mmap(&(0x7f0000ffc000/0x4000)=nil, 0x4000, 0x4, 0x1810, 0xffffffffffffffff, 0x0, 0x0)
mlockall(0x1)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
connect$unix(0xffffffffffffffff, &(0x7f0000000000)=@file={0x0, './file0\x00'}, 0xa)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104302, &(0x7f00000001c0)=0x20000002)


setreuid(0xffffffffffffffff, 0xee01)
r0 = socket(0x2, 0x2, 0x0)
r1 = dup(r0)
setsockopt$inet_opts(r1, 0x0, 0x14, &(0x7f0000000000)='\x00\x00\x00\x00', 0x4)


pipe(&(0x7f0000000a40)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
compat_50___stat30(0x0, 0x0)
renameat(r0, &(0x7f0000000000)='./file0\x00', 0xffffffffffffffff, &(0x7f0000000040)='./file0\x00')


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_43_osetrlimit(0x0, &(0x7f0000000080))


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80104301, &(0x7f00000001c0)=0x20000002)


open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
r0 = socket$inet6(0x18, 0x3, 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', 0x0, 0x0, 0x0, 0x0)
sendto$inet6(r0, &(0x7f0000000000)=',', 0x328, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


symlink(&(0x7f0000000900)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000940)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__posix_rename(&(0x7f0000000100)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
rename(&(0x7f0000000640)='./file0\x00', &(0x7f0000000440)='./file0\x00')


posix_spawn(0xffffffffffffffff, 0x0, &(0x7f0000000300)={0x0, 0x7, &(0x7f00000002c0)=@open={0x0, 0xffffffffffffffff, {0x0}}}, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000100)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
__fstat50(r0, &(0x7f0000000300))


mknod(&(0x7f0000000180)='./file0\x00', 0x2000, 0x202)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x3, 0x10, r0, 0x0, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
fcntl$lock(r0, 0xa, 0x0)


r0 = __clone(0x0, &(0x7f00000005c0))
mkdir(0x0, 0x0)
getpriority(0x0, r0)


modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={&(0x7f0000000240), 0x0, &(0x7f0000000100)="3c5ffe26b51ceab496c8f0677a2c27bd63e23ddcc2870779da41ad20", 0xfffffffffffffd6f})


r0 = socket(0x18, 0x2, 0x0)
lseek(r0, 0x0, 0x0, 0x0)


open$dir(0x0, 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0))
setpriority(0x2, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
open$dir(&(0x7f0000000040)='./file1\x00', 0x3e0, 0x0)
rename(&(0x7f0000000d80)='./file0\x00', &(0x7f0000000180)='./file1\x00')


getuid()
r0 = posix_spawn(0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
getsid(r0)


setregid(0xffffffffffffffff, 0xffffffffffffffff)


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0xc444d4bb6ef2ffc5)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
writev(r0, &(0x7f0000000340)=[{&(0x7f0000000000), 0x2cfea}], 0x1000000000000013)


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x0, 0x0)
r1 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
r2 = dup2(r1, r0)
ioctl$FIOASYNC(r2, 0x8004667d, &(0x7f0000000100))


mmap(&(0x7f0000000000/0x400000)=nil, 0x400000, 0x3, 0x5012, 0xffffffffffffffff, 0x0, 0x0)
openat(0xffffffffffffffff, &(0x7f0000000280)='./file0\x00', 0x0, 0x40)
ktrace(&(0x7f00000001c0)='./file0\x00', 0x4, 0x0, 0x0)
setreuid(0xee00, 0x0)
r0 = getuid()
setreuid(0xee00, r0)
semget(0x0, 0x0, 0x0)
semctl$GETZCNT(0x0, 0x0, 0x7, 0x0)
getgid()
socket(0x0, 0x2, 0x0)
socket(0x18, 0x0, 0x0)
r1 = msgget$private(0x0, 0x0)
msgget$private(0x0, 0x0)
msgsnd(r1, &(0x7f00000000c0)=ANY=[@ANYBLOB="03000000e70000005900c707c0e6ee64a6871da5d3acb0cbf8a63004000093ef1607e4e499ba5b88cb4b5c9f7cc13bf8fd81fa1b59ef8417d0"], 0x39, 0x0)
r2 = socket$unix(0x1, 0x5, 0x0)
getsockopt$sock_cred(r2, 0xffff, 0x1024, &(0x7f0000001840)={<r3=>0x0}, &(0x7f0000001880)=0xc)
setpgid(0x0, r3)
msgrcv(r1, &(0x7f0000000140)={0x0, ""/221}, 0xe5, 0x3, 0x800)


compat_50_quotactl(0x0, 0x0, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
__setitimer50(0x0, &(0x7f0000000180), 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000005c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000000)='./file0/file0\x00', 0x0)
r0 = open$dir(&(0x7f00000001c0)='./file0\x00', 0x0, 0x0)
r1 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
mkdirat(r1, &(0x7f0000000180)='./file0/file0\x00', 0x0)
renameat(0xffffffffffffff9c, &(0x7f00000000c0)='./file0/file0\x00', r0, &(0x7f0000000280)='./file0/file0/file0\x00')


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f00000000c0)={0x0, 0x2, 0x0, 0x100000001})
flock(r0, 0x5)


__posix_lchown(0xffffffffffffffff, 0x0, 0x0)


mkdir(&(0x7f0000000100)='./file0\x00', 0x0)
open$dir(&(0x7f0000000000)='./file0/file0\x00', 0xa63edcc34f204a84, 0x0)


r0 = socket(0x1f, 0x5, 0x0)
compat_43_ogetsockname(r0, 0x0, 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140))
mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)


mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0xa718)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80144481, &(0x7f00000001c0))


open(&(0x7f0000000040)='./file0\x00', 0xf8e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
_ksem_init(0x0, 0x0)
_ksem_wait(0x0)
_lwp_wait(0xffffffffffffffff, 0x0)


lchflags(&(0x7f0000001a80)='./file0\x00', 0x0)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0986986, &(0x7f0000000180)=0x8000000000000032)


symlink(0x0, 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
getsockopt(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
chflags(0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80047465, &(0x7f00000001c0))


socket(0x0, 0x0, 0x0)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x1001, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
openat(r0, &(0x7f00000000c0)='./file0\x00', 0x8aa43, 0x0)


r0 = socket(0x18, 0x2, 0x0)
r1 = socket(0x18, 0x2, 0x0)
r2 = dup2(r0, r1)
compat_43_ogetsockname(r2, 0x0, 0x0)


open$dir(&(0x7f00000003c0)='./file0\x00', 0x400004000011830a, 0x0)
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvmsg(0xffffffffffffffff, &(0x7f0000000340)={0x0, 0x0, 0x0, 0x0, &(0x7f00000024c0)=""/236, 0xec}, 0x0)
write(r0, &(0x7f0000000040)="ed", 0x1)
recvmmsg(r1, &(0x7f0000000880)={&(0x7f0000000840)={0x0, 0x0, &(0x7f0000000ac0)=[{&(0x7f0000000240)=""/217, 0xd9}], 0x1, 0x0}}, 0x10, 0x1262, 0x0)
close(r0)


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
mmap(&(0x7f0000000000/0xff5000)=nil, 0xff5000, 0x0, 0x1010, 0xffffffffffffffff, 0x0, 0x0)
sendmmsg(r0, &(0x7f0000000540)={0x0}, 0x10, 0x0, 0x0)


swapctl$SWAP_ON(0x6, 0x0, 0x5)


r0 = _lwp_self()
r1 = msgget$private(0x0, 0x0)
msgrcv(r1, 0x0, 0x0, 0x0, 0x0)
_lwp_wakeup(r0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
setreuid(0x0, 0xee00)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, 0x0)
linkat(0xffffffffffffff9c, &(0x7f00000003c0)='./file0/file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0xffffffffffffffff, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
mknod(&(0x7f0000000040)='./bus\x00', 0x0, 0x2802)
sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
compat_30___stat13(&(0x7f0000000180)='./file0\x00', 0x0)


madvise(&(0x7f0000ffc000/0x2000)=nil, 0x2000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
mlock(&(0x7f0000ffb000/0x1000)=nil, 0x1000)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
__mount50(&(0x7f0000000000)='ntfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, &(0x7f0000000200)='<', 0x1)


open$dir(&(0x7f00000002c0)='./file0\x00', 0x200, 0x0)
setreuid(0xee00, 0xffffffffffffffff)
open(0x0, 0x0, 0x0)
open(&(0x7f0000001440)='./file1\x00', 0x0, 0x0)
fpathconf(0xffffffffffffffff, 0x0)
setreuid(0xffffffffffffffff, 0xffffffffffffffff)
ktrace(&(0x7f00000001c0)='./file0\x00', 0x4, 0x40001890, 0x0)
ktrace(&(0x7f0000000040)='./file0\x00', 0x2, 0x0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
fcntl$lock(0xffffffffffffffff, 0x0, 0x0)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000080)='./file0\x00', r0, &(0x7f0000000100)='./file0\x00')


posix_spawn(0x0, 0x0, &(0x7f0000000400)={0x0, 0x1, &(0x7f0000000340)=@open={0x0, 0xffffffffffffffff, {&(0x7f0000000300)='\x00'}}}, &(0x7f00000004c0), 0x0, 0x0)


ptrace(0x7dc, 0xffffffffffffffff, 0x0, 0x0)


__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
chdir(0x0)
rename(0x0, 0x0)
mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x2e00)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
writev(r0, &(0x7f0000000340)=[{&(0x7f0000000000), 0x2cfea}], 0x1000000000000013)


compat_43_olseek(0xffffffffffffffff, 0x0, 0x5)


r0 = socket(0x18, 0x2, 0x0)
shutdown(r0, 0x1)


setrlimit(0xb, 0x0)
compat_20_getfsstat(&(0x7f0000000000), 0xffffffffffffff39, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x19, r0, 0x0, 0x0)


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
ioctl$FIOSEEKHOLE(r1, 0x80206913, &(0x7f0000000180))
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0x0, 0x10, r0, 0x0, 0x0)
mincore(&(0x7f0000ffb000/0x2000)=nil, 0x2000, &(0x7f0000000100)=""/100)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
pathconf(&(0x7f0000000300)='./file0\x00', 0xa)


open(&(0x7f0000000040)='./file0\x00', 0x200, 0x0)
open(&(0x7f0000001700)='./file0\x00', 0x0, 0x0)
setrlimit(0x8, &(0x7f0000000980)={0x7, 0x54})
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(r1, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="28000000ffff000001"], 0x28}, 0x0)
recvmmsg(r0, &(0x7f0000000000)={&(0x7f0000000000)={0x0, 0x0, 0x0}}, 0x10, 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x1a, 0x0, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x1803)
pathconf(&(0x7f00000001c0)='./file0\x00', 0x6)


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
openat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
open(0x0, 0x0, 0x0)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x1, 0x0)


symlink(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/../file0\x00', 0x0)
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x8010426d, &(0x7f0000000080))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000480)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unmount(&(0x7f0000000000)='./file0\x00', 0x0)


open(&(0x7f0000000280)='./file0\x00', 0x615, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
ftruncate(r0, 0x80002, 0x0)


r0 = socket(0x2, 0x1, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc020690f, &(0x7f0000000180)=0x8000000000000032)


pipe(0x0)
r0 = getpid()
fktrace(0xffffffffffffffff, 0x0, 0x62e2dd08f149ff1b, r0)
shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffd000/0x1000)=nil)
fchown(0xffffffffffffffff, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setgroups(0x1, &(0x7f0000000380)=[0x0])


write(0xffffffffffffffff, 0x0, 0x0)
mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x40)
writev(r0, &(0x7f00000000c0)=[{&(0x7f0000000280)='#', 0x1}], 0x1)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
setrlimit(0x3, &(0x7f0000000980))


r0 = socket(0x2, 0x2, 0x0)
setsockopt$inet_opts(0xffffffffffffffff, 0x0, 0x0, &(0x7f0000000180)="33b62b53cc518098fb586f8654ccabac394a", 0x12)
ioctl$FIOSEEKHOLE(r0, 0x80206916, &(0x7f0000000180)=0x8000000000000032)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x10000000000001}})
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0xffffffffffffffff})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7, 0x0}, 0x8)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
close(r0)
socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
fcntl$setstatus(r0, 0x4, 0xe4)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


r0 = socket(0x18, 0x3, 0x0)
setreuid(0xee00, 0x0)
r1 = getuid()
seteuid(r1)
ioctl$FIOSEEKHOLE(r0, 0x8048756d, &(0x7f0000000180))


open(&(0x7f0000000200)='./file0\x00', 0x245, 0x0)
mknodat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
open$dir(&(0x7f0000000000)='./file0\x00', 0x400000, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000080)={<r0=>0xffffffffffffffff})
setsockopt$sock_timeval(r0, 0xffff, 0x100c, 0x0, 0x0)


mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
fchownat(0xffffffffffffff9c, &(0x7f0000000140)='./file0\x00', 0xffffffffffffffff, 0x0, 0x0)


pipe(&(0x7f0000000140)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0x62e2dd08f149ff1b, r1)
seteuid(0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
pipe(0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f00000000c0))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
compat_50_setitimer(0x0, &(0x7f0000000000)={{}, {0x0, 0x80000000}}, 0x0)


r0 = socket(0x800000018, 0x2, 0x0)
r1 = socket(0x18, 0x1, 0x0)
modctl$MODCTL_LOAD(0x5, 0x0)
dup2(r1, r0)
listen(r0, 0x0)


semget(0x2, 0x0, 0x243)


rename(0x0, 0x0)
modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={&(0x7f00000006c0), 0x0, &(0x7f0000000140)="3cd2", 0x2})


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
read(r0, &(0x7f00000002c0)=""/82, 0x52)


compat_40_mount(0x0, &(0x7f00000000c0)='.\x00', 0x0, 0x0)
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
__wait450(0xffffffffffffffff, &(0x7f0000000000), 0x0, &(0x7f00000000c0))


mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc1e85266, &(0x7f0000000040))


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$inet(0x2, 0x1, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x1008, 0x0, 0x0)


pipe(&(0x7f0000001400)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0x2, r1)
syz_usb_disconnect(0xffffffffffffffff)


setsockopt$sock_linger(0xffffffffffffffff, 0xffff, 0x80, &(0x7f0000000080)={0x4}, 0x8)
symlinkat(&(0x7f0000000040)='./file0\x00', 0xffffffffffffffff, 0x0)
symlinkat(0x0, 0xffffffffffffffff, &(0x7f0000000200)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
mkdirat(0xffffffffffffffff, &(0x7f00000000c0)='./file0\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
mkdir(&(0x7f0000000080)='./file0/file0\x00', 0x0)
rename(&(0x7f0000000080)='./file0\x00', &(0x7f00000000c0)='./file0\x00')
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
socket(0x0, 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8004747e, &(0x7f00000001c0))
lchflags(0x0, 0x0)
compat_50_setitimer(0x3, &(0x7f0000001800)={{}, {0x1}}, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
pathconf(&(0x7f00000001c0)='./file0\x00', 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000005c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
__getfh30(&(0x7f0000000300)='./file0\x00', 0x0, 0x0)


mmap(&(0x7f0000ffb000/0x4000)=nil, 0xfffff000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)
madvise(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x3)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
__wait450(r0, 0x0, 0x4, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000180)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xd, 0x0, 0x0)


compat_50_setitimer(0x2, &(0x7f0000001800)={{}, {0x1}}, 0x0)
__setitimer50(0x2, 0x0, &(0x7f0000000080))


getpriority(0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
symlinkat(0x0, 0xffffffffffffffff, 0x0)
r0 = msgget$private(0x0, 0x0)
semctl$IPC_RMID(r0, 0x0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000740e, 0x0)


mlockall(0x2)
setrlimit(0x6, &(0x7f00000000c0))
compat_43_ommap(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x2, 0x402, 0xffffffffffffffff, 0x0)


poll(&(0x7f00000001c0)=[{0xffffffffffffffff, 0x17}], 0x1, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f00000001c0)={0x0, 0x0, <r1=>0x0}, &(0x7f0000000200)=0xc)
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0x0, 0x0, r1}})
r2 = socket(0x18, 0x2, 0x0)
close(r2)
r3 = socket(0x800000018, 0x1, 0x0)
setsockopt$sock_int(r3, 0xffff, 0x1000, &(0x7f0000000000)=0x7, 0x4)
bind$unix(r3, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0x0, 0x0, r1}})
connect$unix(r2, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)


compat_50_setitimer(0x0, &(0x7f0000001800)={{}, {0x1}}, 0x0)
__setitimer50(0x1, 0x0, &(0x7f0000001940))


compat_43_ogethostname(&(0x7f0000000000)=""/3, 0x3)


r0 = socket(0x10, 0x2, 0x0)
open$dir(0x0, 0x0, 0x0)
__fstat50(r0, &(0x7f0000001040))


accept$inet(0xffffffffffffff9c, &(0x7f0000000040), &(0x7f00000000c0)=0xc)
__mount50(&(0x7f0000000080)='lfs\x00', &(0x7f0000000000)='.\x00', 0x0, &(0x7f00000000c0), 0x0)


mprotect(&(0x7f0000ffa000/0x1000)=nil, 0x1000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0x0, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, 0x0, 0x0)


open$dir(&(0x7f0000000000)='./file0\x00', 0x200, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000340)='./file0\x00', 0x0, 0xf, r0)
getsid(0x0)
compat_20_fstatfs(0xffffffffffffffff, 0x0)


compat_40_mount(&(0x7f0000000040)='coda\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f0000000480))


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x1803)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000140)='.\x00', 0x0, 0x0)
utimensat(r0, &(0x7f00000000c0)='./file0\x00', &(0x7f0000000100)={{}, {0xffffffffffffffff}}, 0x0)


open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000180)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0xa, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
setsockopt(r0, 0x1000000000029, 0xb, 0x0, 0x0)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000640)={0x0, 0x0, 0x0, 0x0, &(0x7f00000000c0)=ANY=[@ANYBLOB="18000000ffff000001"], 0x18}, 0x0)
sendmmsg(r0, &(0x7f0000000480)={0x0}, 0x10, 0x0, 0x0)
compat_43_orecvmsg(r1, &(0x7f0000000380)={0x0, 0x0, &(0x7f00000002c0)={0x0}, 0x10, 0x0}, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
connect$inet(r0, &(0x7f0000000540), 0xc)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(r1, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="28000000ffff000001"], 0x28}, 0x0)
recvmsg(r0, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x0, &(0x7f0000001380)=""/193, 0xc1}, 0x0)


r0 = socket(0x2, 0x20000001, 0x0)
open$dir(&(0x7f0000000180)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180)=0x8000000000000032)


r0 = socket(0x2, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8020690c, &(0x7f0000000180)=0x8000000000000032)
openat$wscons(0xffffffffffffff9c, 0x0, 0x0, 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)
truncate(&(0x7f0000000440)='./file0\x00', 0x0, 0x3e2b649f)
truncate(&(0x7f0000000780)='./file0\x00', 0x0, 0x10001)


setreuid(0xee00, 0xffffffffffffffff)
r0 = getuid()
setreuid(0xee00, r0)
posix_spawn(0x0, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x0}, 0x0, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000747a, 0x0)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x11, 0x0, 0x0, 0x0)


r0 = socket(0x1f, 0x5, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x801869ea, &(0x7f0000000180)=0x8000000000000032)


__clone(0x0, &(0x7f0000000140))
vfork()
compat_50_wait4(0x0, &(0x7f0000000180), 0x8, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
open(0x0, 0x0, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
close(0xffffffffffffffff)
flock(r0, 0x2)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
chflags(&(0x7f0000000180)='./file0/file0\x00', 0x40001)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
__futimes50(r0, 0x0)


r0 = socket$unix(0x1, 0x2, 0x0)
recvmsg(r0, &(0x7f0000002880)={0x0, 0x0, &(0x7f0000002700)=[{&(0x7f0000001200)=""/74, 0x4a}, {0x0}, {0x0}, {0x0}, {0x0}, {0x0}, {0x0}, {0x0}, {0x0}], 0x9, &(0x7f00000027c0)=""/185, 0xb9}, 0x780c)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
__lutimes50(&(0x7f0000000180)='./file0\x00', &(0x7f00000001c0))


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x1000300000001, 0xffffffffffffffff})
semctl$SETALL(0x0, 0x0, 0x9, &(0x7f0000000040)=[0x7ff])
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0x9, 0x0, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
r1 = socket$inet(0x2, 0x2, 0x0)
r2 = dup2(r1, r0)
compat_50_futimes(r2, 0x0)


setuid(0xee01)
setpriority(0x2, 0x0, 0x0)


mmap(&(0x7f0000400000/0xc00000)=nil, 0xc00000, 0x0, 0x1011, 0xffffffffffffffff, 0x0, 0x0)
r0 = socket(0x2, 0x2, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040))
r1 = semget$private(0x0, 0x1, 0x120)
semctl$IPC_STAT(r1, 0x0, 0x2, &(0x7f00000000c0))
sendmsg$unix(r0, &(0x7f0000001a00)={&(0x7f0000000080)=@file={0x0, '\x00'}, 0x3, 0x0, 0x0, &(0x7f0000000040)=ANY=[@ANYBLOB="14"], 0x18}, 0x0)
r2 = msgget$private(0x0, 0x100)
msgrcv(r2, 0x0, 0x0, 0x0, 0x1400)
msgsnd(r2, &(0x7f0000001540)=ANY=[@ANYBLOB="02000000000040006d1d5fcb28d9fc2efd00080000006b5723bc77a73f2c7432a505995139959fe8249fcf0bc5a47faa0000"], 0x32, 0x0)
msgsnd(r2, &(0x7f00000000c0)=ANY=[@ANYRESHEX=0x0], 0x8, 0x0)
msgctl$IPC_RMID(r2, 0x0)


ioctl$FIOSEEKHOLE(0xffffffffffffffff, 0xc038694e, &(0x7f0000000180)=0x8000000000000032)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0206972, &(0x7f0000000180))


open(&(0x7f0000000480)='./file0\x00', 0x200, 0x38)
r0 = open(&(0x7f0000000340)='./file0\x00', 0x2, 0x0)
writev(r0, &(0x7f0000000440)=[{&(0x7f0000000000)='#!', 0x2}, {&(0x7f00000001c0)="f4f5925e4b49bf720cf10c06503eb7b2206c9957a1aee56b331b84a198f4160a", 0x20}], 0x2)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


r0 = socket$inet6(0x18, 0x3, 0x0)
getsockopt(r0, 0x29, 0x25, 0x0, 0x0)


compat_60__lwp_park(&(0x7f0000000b00)={0x80000001, 0x3b9ac9ff}, 0x0, 0x0, 0x0)


open(&(0x7f0000000240)='./file0\x00', 0x205, 0x0)
r0 = open(&(0x7f0000000400)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000140)={0x0, 0x0, 0xffffffffffffffff, 0x269000000, 0xffffffffffffffff})


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000440)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
write(r0, 0x0, 0x0)
read(r1, 0x0, 0x0)


compat_50_clock_getres(0x1, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80067411, &(0x7f0000000040))


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x17, &(0x7f0000000000)="5ab7776a", 0x4)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
socket(0x0, 0x0, 0x0)
ioctl$FIOSEEKHOLE(0xffffffffffffffff, 0xc0206972, &(0x7f0000000880)=0x8000000000000037)
r0 = semget$private(0x0, 0x5, 0x0)
semop(r0, &(0x7f0000000100)=[{0x4, 0x401e, 0x1000}], 0x1)


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
compat_40_mount(0x0, &(0x7f00000000c0)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_30_getdents(r0, 0x0, 0x0)


madvise(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x1)
mmap(&(0x7f0000ff9000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


mknod(&(0x7f0000000080)='./bus\x00', 0x6000, 0x20e02)
open(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff})
sendmsg(r0, &(0x7f0000001c40)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x9)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
chdir(&(0x7f0000000240)='./file0\x00')
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mknod(&(0x7f0000000000)='./file0\x00', 0x8000, 0xa718)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000140), 0x0)
chflags(&(0x7f0000000080)='./file0\x00', 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
lchown(&(0x7f0000000100)='./file0\x00', 0xffffffffffffffff, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
chmod(&(0x7f0000000200)='./file0\x00', 0x0)


r0 = open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000dc0)='./file0\x00', r0, &(0x7f0000000ec0)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
fchownat(r0, &(0x7f0000000340)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x0, 0x200)


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x1, 0x10, r0, 0x0, 0x0)
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000000)={{0x0, 0xffffffffffffffff}})


mprotect(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


_ksem_init(0x0, &(0x7f0000000000)=<r0=>0x0)
_ksem_close(r0)


r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80206916, &(0x7f0000000180)=0x8000000000000032)


socketpair(0x2, 0x1, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_50_setitimer(0x0, &(0x7f0000001800)={{}, {0x0, 0xf423f}}, 0x0)
compat_50_getitimer(0x0, &(0x7f0000000180))


symlink(0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8004667e, &(0x7f00000001c0))
__mount50(0x0, 0x0, 0x4815, 0x0, 0x0)
r1 = socket(0x1f, 0x5, 0x2)
ioctl$FIOGETBMAP(r1, 0xc008667a, &(0x7f0000000100))


modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000000)="670a7f")


compat_43_ocreat(0x0, 0x0)
ktrace(&(0x7f00000000c0)='./file0\x00', 0x4, 0x4, 0xffffffffffffffff)
compat_50_setitimer(0x0, &(0x7f0000000000)={{}, {0x4000000007}}, 0x0)
__setitimer50(0x3, &(0x7f0000000100)={{0x4}}, 0x0)


mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(0x0, &(0x7f0000000140)='./file0\x00', 0x400000, &(0x7f0000000000))


r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x4ebfac6bbaf7869)
r1 = open(&(0x7f0000000200)='./file0\x00', 0x80, 0x29)
writev(r0, &(0x7f0000000000), 0x0)
getgroups(0x7, &(0x7f0000000040)=[<r2=>0xffffffffffffffff, <r3=>0x0, <r4=>0x0, <r5=>0x0, <r6=>0x0, 0x0, <r7=>0x0])
setgroups(0x0, 0x0)
setreuid(0xee00, 0x0)
getuid()
setegid(r4)
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f0000000340)={0x0, <r8=>0x0}, &(0x7f0000000100))
r9 = semget$private(0x0, 0x2, 0x39a)
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f0000000140), &(0x7f0000000580)=0xc)
r10 = getuid()
setregid(0x0, r3)
setreuid(0xee00, r10)
syz_emit_ethernet(0x2a, &(0x7f0000000080))
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000700)={{0x8001, 0x0, r5, r10, r3, 0x100, 0x53}, 0xffffffffffffff00, 0x7})
sendmsg$unix(r0, &(0x7f0000000540)={&(0x7f00000000c0)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000500)=[{&(0x7f0000000940)="36e562852f9846aca15a7db8b0266293f4b02955ba5a6887d14f2d685031e865bb532b09c0e9726165f26a90a67711a5f268539afd0c19d90c3833f0c79c74c1c2e2d79b065be8e7a5597ced0eeefc21a944042d9fb39800cb2beed2db9301ee75a25edf9cca3e49f75378b582080032815ec32012cb63b736705c3011d3f2bf64d6660c6f2c3094a4a8023a334a2b2b688eb6a7ee0330679c738083b5", 0x9d}], 0x1}, 0x0)
semop(r9, &(0x7f0000000240)=[{0x0, 0x0, 0xc00}], 0x1)
semctl$GETPID(r9, 0x0, 0x4, &(0x7f0000000ac0)=""/201)
ioctl$FIONBIO(r1, 0x8004667e, &(0x7f00000004c0))
r11 = getuid()
semctl$GETVAL(r9, 0x4, 0x5, &(0x7f0000000800)=""/266)
seteuid(0x0)
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000680)={{0x2, r8, r6, 0x0, r2, 0x64, 0x13f}, 0x200000000000205, 0xfffffffbffff0002, 0x8})
semctl$GETNCNT(r9, 0x3, 0x3, &(0x7f0000000440)=""/64)
semctl$SETVAL(0x0, 0x2, 0x8, &(0x7f0000000080)=0x2000000)
getgid()
semctl$IPC_SET(r9, 0x0, 0x1, &(0x7f0000000180)={{0x8001, 0x0, 0x0, 0x0, r7, 0x12}, 0x0, 0x441, 0x8000000000000000})
setreuid(0x0, r11)
execve(&(0x7f0000000600)='./file0\x00', 0x0, 0x0)


sendto$unix(0xffffffffffffffff, &(0x7f00000000c0)="b10005016000009f050000000010000000000000ce", 0x15, 0x0, 0x0, 0x0)
r0 = socket(0x18, 0x1, 0x0)
r1 = fcntl$dupfd(r0, 0x2, 0xffffffffffffffff)
close(r1)
r2 = socket(0x11, 0x3, 0x0)
poll(&(0x7f0000000000)=[{}], 0x20000000000000fe, 0x0)
connect$inet6(r2, &(0x7f0000000000)={0x18, 0x1}, 0xc)


open(&(0x7f0000000040)='./file0\x00', 0xf8e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
getgroups(0x8, &(0x7f0000000000)=[0x0, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0, 0x0])


r0 = socket(0x18, 0x2, 0x0)
mmap(&(0x7f0000001000/0x2000)=nil, 0x2000, 0x0, 0x10, r0, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
chroot(&(0x7f00000003c0)='./file0\x00')


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_90_statvfs1(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
swapctl$SWAP_ON(0x1, &(0x7f0000000000), 0x0)
__lutimes50(0x0, 0x0)


setreuid(0x0, 0xee01)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x1b, &(0x7f0000000a00)="8b589d9d", 0x4)


r0 = socket(0x18, 0x2, 0x0)
writev(0xffffffffffffffff, 0x0, 0x0)
setsockopt$sock_timeval(r0, 0xffff, 0x0, 0x0, 0x0)
_lwp_create(0x0, 0x0, 0x0)
ioctl$FIOSEEKHOLE(0xffffffffffffffff, 0xc0206912, 0x0)
open$dir(0x0, 0x0, 0x0)
close(0xffffffffffffffff)
mknod(0x0, 0x0, 0xe03)
pipe(&(0x7f0000000040)={<r1=>0xffffffffffffffff})
write(r1, &(0x7f0000000340), 0xd4e688a67930cd)


r0 = socket(0x2, 0x2, 0x0)
setreuid(0x0, 0xee01)
ioctl$FIOSEEKHOLE(r0, 0x80906979, &(0x7f0000000180)=0x8000000000000031)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x4, r0, &(0x7f0000000000), 0x0)


open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000340)='./file0\x00', 0x0, 0xf, r0)
mknod(0x0, 0x0, 0x1803)
compat_20_statfs(&(0x7f0000000300)='./file0\x00', &(0x7f0000000380))


r0 = _lwp_self()
_lwp_exit()
_lwp_unpark(r0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open$dir(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000100)=[{0x0, 0x2}], 0x1)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
sendmsg$unix(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000000)=[{&(0x7f0000000080)="1c", 0xfffffdef}], 0x1, &(0x7f0000001080)=ANY=[@ANYBLOB="14000000000000000100000001"], 0x18}, 0x41)


socket(0x18, 0x0, 0x0)
r0 = socket(0x18, 0x2, 0x0)
pipe(&(0x7f0000000140)={<r1=>0xffffffffffffffff})
r2 = getpid()
fktrace(r1, 0x0, 0x62e2dd08f149ff1b, r2)
sendto(r0, 0x0, 0x0, 0x0, &(0x7f0000000000)=@len=0x57, 0xe)


__mount50(&(0x7f0000000080)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
unlink(0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x7)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


r0 = fcntl$dupfd(0xffffffffffffffff, 0xb, 0xffffffffffffffff)
__fstat50(r0, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
compat_43_ogethostname(&(0x7f0000000000)=""/11, 0xb)


getsid(0x0)
mknod(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_60__lwp_park(&(0x7f0000000140), 0x0, 0x0, 0x0)


r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
socket(0x0, 0x0, 0x0)
mmap(&(0x7f0000003000/0x2000)=nil, 0x2000, 0x0, 0x10, r0, 0x0, 0x0)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x6)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000029, 0x30, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e699", 0x10)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000240)='./file0\x00', 0x7)


dup(0xffffffffffffffff)
syz_emit_ethernet(0x138, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = dup(0xffffffffffffffff)
fcntl$dupfd(r0, 0x2, 0xffffffffffffffff)
r1 = socket(0x18, 0x1, 0x0)
setsockopt(r1, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r1, 0x1000000000029, 0xa, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x0, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
__clock_gettime50(0x40000000, 0x0)


r0 = socket(0x2, 0x3, 0x0)
setrlimit(0x8, &(0x7f0000000980))
paccept(r0, 0x0, 0x0, 0x0)


r0 = compat_30_socket(0x22, 0x3, 0x0)
compat_43_osend(r0, &(0x7f0000000040)='X', 0x1, 0x0)


socketpair$unix(0x2, 0x3, 0x88, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
compat_43_ogetrlimit(0x0, &(0x7f0000000000))


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x40000802)
ktrace(&(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r1, &(0x7f00000001c0)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x56)
listen(r1, 0x0)
connect$unix(r0, &(0x7f0000000140)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
close(r1)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
__wait450(0x0, 0x0, 0x7, 0x0)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x8000000000000001})
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000180)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


symlink(&(0x7f0000001040)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000240)='./file0\x00')
readlink(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x10000000000002}})
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x5ff, 0x0, 0xffffffffffffffff})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x3, 0x0)
close(r0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x1)
r0 = socket(0x18, 0x1, 0x0)
ioctl$OFIOGETBMAP(r0, 0x8048756d, &(0x7f0000000000))


chown(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
r0 = socket(0x2, 0x2, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x1001, &(0x7f0000000100)=0x20000, 0x4)
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
connect$unix(r0, &(0x7f0000000000), 0x10)
sendmsg(r0, &(0x7f0000000400)={0x0, 0x0, 0x0, 0x0, 0x0, 0x78}, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x1, &(0x7f0000000440)="9876d692a3ef9c7ab923a2f0", 0xc)
write(r0, 0x0, 0x0)


compat_60__lwp_park(&(0x7f0000000000)={0x0, 0x3b9ac9ff}, 0x0, 0x0, 0x0)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000400), 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)


compat_50_select(0x190, 0x0, &(0x7f0000000040), 0xffffffffffffffff, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
fcntl$setstatus(r0, 0x4, 0x1000044)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000000)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f00000003c0)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
mkdir(&(0x7f00000000c0)='./control\x00', 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
rmdir(&(0x7f0000000040)='./control\x00')


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x8000, 0x0)
r0 = open$dir(&(0x7f00000026c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x0)
__fstat50(r0, &(0x7f0000000340)={<r1=>0x0})
mknod(&(0x7f0000000200)='./file0\x00', 0x2000, r1)
r2 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
socket$inet6(0x18, 0x0, 0x0)
fcntl$setstatus(r2, 0x4, 0x40)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x29b3)
open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
setregid(0xee00, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)


socketpair(0x11, 0x3, 0x2, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
accept(0xffffffffffffffff, 0x0, 0x0)
unlink(0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
truncate(&(0x7f0000000200)='./file0\x00', 0x0, 0x7fff)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x5, 0x10, r0, 0x0, 0x0)
setrlimit(0x0, &(0x7f0000000140))


r0 = socket(0x18, 0x3, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x1004, &(0x7f0000000000)=0x7, 0x4)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
r1 = getppid()
setpgid(0x0, 0x0)
setpgid(0x0, r1)


writev(0xffffffffffffffff, &(0x7f0000000240)=[{&(0x7f0000000140)}], 0x1)
syz_emit_ethernet(0xe, &(0x7f0000000000))
writev(0xffffffffffffffff, &(0x7f0000000580)=[{&(0x7f0000000000)="b886b4e47f", 0x5}], 0x1)
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
bind(0xffffffffffffffff, &(0x7f0000000000), 0x10)
listen(0xffffffffffffffff, 0x0)
r0 = socket(0x2, 0x1, 0x0)
connect$unix(r0, &(0x7f0000000000), 0x10)
r1 = dup2(r0, r0)
shutdown(r0, 0x1)
setsockopt$sock_int(r1, 0xffff, 0x1, &(0x7f0000000240), 0x4)
shutdown(r1, 0x1)


rasctl(0x0, 0x9, 0x0)
rasctl(0x0, 0xd39, 0x0)


ptrace(0x23, 0x0, 0x0, 0x0)


r0 = open(&(0x7f0000000100)='./file0\x00', 0x10310, 0x0)
fstatat(0xffffffffffffffff, 0x0, 0x0, 0x0)
close(r0)
fpathconf(r0, 0x0)


setrlimit(0x0, &(0x7f0000000000))
__clone(0x4100, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__posix_rename(&(0x7f0000000100)='./file0\x00', 0x0)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f00000003c0)='./file0\x00')
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
open(&(0x7f0000000000)='./file0\x00', 0x20410, 0x0)


compat_43_ocreat(0x0, 0x0)
__mount50(&(0x7f0000000000)='cd9660\x00', &(0x7f0000000040)='.\x00', 0x1, &(0x7f0000000380), 0x0)


__getrusage50(0x0, &(0x7f0000000000))
__mount50(&(0x7f0000000280)='kernfs\x00', &(0x7f00000002c0)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x4ebfac6bbaf7949)
setreuid(0x0, 0xee01)
__posix_chown(&(0x7f0000000880)='./file0\x00', 0x0, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
preadv(r0, &(0x7f00000017c0)=[{&(0x7f00000004c0)=""/4096, 0x1000}], 0x1, 0x0)


r0 = socket(0x18, 0x2, 0x0)
close(r0)
r1 = socket(0x18, 0x3, 0x0)
setsockopt(r1, 0x1000000029, 0x31, &(0x7f00000000c0)="b211d7170d816685c8e360f2fa41c1a0946988b272d2dd3dc90142a84231a746e337b372e93320cff6669cbe7868de45ed3fc33719ca6df71ecec8a918458b2c10a1f8c66653b276e180e9cb9b21f9982230f575295d48889c9a920796b2dd92fc8575680b37ba955d2c15e6d7c9198ed900ab006ddfb67869b51a2216114d1ece85f593e74035f5bc054eb1dbddf42a", 0x90)
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000080))
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r0, &(0x7f0000000280)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000300)=[{0x10}], 0x10}, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
close(r0)
r1 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r1, 0x0, 0x200000000000a, &(0x7f0000000000)="ea00005c00000000", 0x1)
setsockopt$inet_opts(r0, 0x0, 0xb, 0x0, 0x0)


r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x40)
open(&(0x7f0000000280)='./file1\x00', 0x20200, 0x0)
writev(r0, &(0x7f00000000c0)=[{&(0x7f0000000280)='#!', 0x10}, {&(0x7f0000000000)="8d6bb85551ec8430877ae32fe9bbe42cc8f2147a3eba8e1969f0435119cf4c071c8aee7ef2921be5d7d4796c5566c95989acb3d185587234186e96b8fde9ffac51de05a87b8b893e2abd154dd886eafbe03881d25b7b13b4c32227fc9e5a86a06f59f701322b3a109a13436e486b0a", 0x6f}], 0x2)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


r0 = socket(0x800000018, 0x2, 0x0)
r1 = socket(0x18, 0x1, 0x0)
msgget$private(0x0, 0x0)
msgrcv(0x0, 0x0, 0x0, 0x0, 0x0)
dup2(r1, r0)
listen(r0, 0x0)
paccept(r0, 0x0, 0x0, 0x20000000)


__mount50(0x0, &(0x7f0000000180)='./file0/file0\x00', 0x0, 0x0, 0x0)
_ksem_open(&(0x7f0000000180)='/', 0x200, 0x0, 0x0, &(0x7f0000000100))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000080)='./file0\x00', 0xb)


mknod(&(0x7f0000000200)='./file0\x00', 0xc035cd953ea0fd64, 0x1733)
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000745f, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})
r1 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r1, 0x9, &(0x7f0000000140)={0x0, 0x0, 0xfff, 0x100000002})


modctl$MODCTL_UNLOAD(0x2, 0x0)
ptrace(0x0, 0x0, 0x0, 0x0)


r0 = socket$inet6(0x18, 0x3, 0x0)
sendto$inet6(r0, 0x0, 0x0, 0x0, &(0x7f0000000040)={0x18, 0x3}, 0xc)


sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={0x0, 0x0, &(0x7f00000017c0)=[{&(0x7f00000001c0)="1c80de018e2605", 0x7}], 0x1}, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104302, &(0x7f00000001c0))


r0 = socket(0x18, 0x3, 0x0)
compat_43_orecvfrom(r0, 0x0, 0x0, 0x1, 0x0, 0x0)


mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
chmod(&(0x7f0000000280)='./file0\x00', 0x3a)
setreuid(0x0, 0xee01)
mkdir(0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
open(0x0, 0x0, 0x0)
__utimes50(&(0x7f0000000340)='./file0\x00', 0x0)


__wait450(0x0, 0x0, 0x4, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(0xffffffffffffffff, &(0x7f0000000080)=[{&(0x7f00000006c0), 0x2cfea}], 0x1, 0x0)
open$dir(&(0x7f0000000000)='./file0\x00', 0x200, 0x0)
ktrace(0x0, 0x0, 0x0, 0x0)
modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000100)="7218fd96c2d63ca3e2231e1559d65ad9ebb19bdea9a7e3ab03d9bcd29cac9b9bf1b098e1ec9781191b506151414cd1a2b8782a66")


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xb, 0x0, 0x0)


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pathconf(&(0x7f0000000040)='./file0/file0\x00', 0x7)


open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0xb, &(0x7f0000000080)="00020000", 0x4)
setsockopt(r0, 0x1000000000029, 0x9, &(0x7f0000000000)="00003dc7", 0x4)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000200)='coda\x00', &(0x7f0000000240)='./file0\x00', 0x0, &(0x7f00000003c0)="a3", 0x1)


r0 = socket$inet(0x2, 0x2, 0x0)
connect$inet(r0, &(0x7f00000002c0), 0x10)
sendto$inet(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
recvfrom$inet(r0, 0x0, 0x0, 0x0, 0x0, 0x0)


semctl$IPC_SET(0xffffffffffffffff, 0x0, 0x1, &(0x7f0000000000)={{}, 0x0, 0x0, 0x1})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x9, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
setsockopt(r0, 0x1000000029, 0x9, 0x0, 0x0)


mprotect(&(0x7f0000ffa000/0x1000)=nil, 0x1000, 0x0)
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
fcntl$lock(r0, 0xa, 0x0)
__clone(0x0, 0x0)


__mount50(&(0x7f0000000c00)='kernfs\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xb0afbd006181d6de, 0x0, 0x0)


syz_usb_connect(0x0, 0x3f, &(0x7f0000000080)=ANY=[@ANYBLOB="11010000733336088dee1adb23610000000109022d0001100000000904000003fe03010009cd8d1f000200000009050502000000001009058b1e20"], 0x0)


r0 = socket$inet(0x2, 0x3, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setsockopt$inet_opts(r0, 0x0, 0x19, &(0x7f0000000080)="eaef125c40020000", 0x8)


r0 = open(&(0x7f0000000100)='.\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x0, 0x1012, 0xffffffffffffffff, 0x0, 0x0)
preadv(r0, &(0x7f0000001400)=[{0x0}], 0x1, 0x0)


mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x1, 0x1012, 0xffffffffffffffff, 0x0, 0x0)
posix_spawn(0x0, &(0x7f0000000000)='\'\xb8\x00', 0x0, 0x0, 0x0, 0x0)


socket$inet(0x2, 0x4000000000000001, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
open$dir(&(0x7f0000000000)='./file0\x00', 0x40000400001803c1, 0x0)
close(0x4)


posix_spawn(0x0, 0x0, &(0x7f0000000880)={0x0, 0xfff, 0x0}, 0x0, 0x0, 0x0)


r0 = __clone(0x0, &(0x7f0000000000))
__wait450(0x0, 0x0, 0x4, 0x0)
setpriority(0x0, r0, 0x0)


setrlimit(0x8, &(0x7f0000000980)={0x7, 0x54})
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(r1, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="28000000ffff000001"], 0x28}, 0x0)
recvmmsg(r0, &(0x7f0000000000)={&(0x7f0000000000)={0x0, 0x0, 0x0}}, 0x10, 0x0, 0x0)


munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
shmat(r0, &(0x7f0000001000/0x3000)=nil, 0x1000)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001e00)={0x0, 0x0, 0x0}, 0x0)
semctl$IPC_SET(0x0, 0x0, 0x1, 0x0)
madvise(&(0x7f0000001000/0x2000)=nil, 0x2000, 0x0)
recvmmsg(0xffffffffffffffff, 0x0, 0x0, 0x0, &(0x7f0000002c40))


posix_spawn(0x0, 0x0, &(0x7f0000000100)={0x0, 0x1, &(0x7f00000000c0)=@close}, &(0x7f0000000140)={0x0, 0x0, {}, 0x0, {}, {[0x1f]}}, 0x0, 0x0)


compat_40_mount(&(0x7f0000000200)='procfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
compat_30_getdents(r0, 0x0, 0xc2)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x28, &(0x7f0000000000)="73b6adec", 0x4)


poll(&(0x7f0000000080), 0x2000000000000043, 0xfffff85f)


munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
compat_43_ommap(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x0, 0x402, 0xffffffffffffffff, 0x0)
madvise(&(0x7f0000ffb000/0x2000)=nil, 0x2000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4301)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0x40047704, 0x0)


r0 = msgget$private(0x0, 0x0)
setreuid(0x0, 0xee00)
msgctl$IPC_RMID(r0, 0x0)


mmap(&(0x7f0000000000/0xff5000)=nil, 0xff5000, 0x0, 0x200000004d831, 0xffffffffffffffff, 0x0, 0x0)
posix_spawn(0x0, 0x0, 0x0, &(0x7f00000000c0), 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
renameat(0xffffffffffffff9c, &(0x7f0000000600)='./file0\x00', 0xffffffffffffffff, &(0x7f0000000240)='./file0\x00')


r0 = socket(0x10, 0x2, 0x0)
listen(r0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80067409, &(0x7f0000000000)=0xae6)


r0 = socket(0x2, 0x3, 0x0)
sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
sendto$unix(0xffffffffffffffff, &(0x7f00000000c0)="b1000501600000000200000007000000110002000000000000f96ecfc72fd3357ae320b37b673039d2d236acf20b7804be38164991f7c8cf5f882b297be1aa5b236de351e21e00", 0x47, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setsockopt$sock_int(r0, 0xffff, 0x1001, &(0x7f0000000100)=0x20000, 0x4)


r0 = socket$inet(0x2, 0x1, 0x0)
getsockopt(r0, 0x6, 0x0, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0/file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)


r0 = socket(0x2, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x4004667b, &(0x7f0000000180))


r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
fcntl$lock(r0, 0x7, &(0x7f0000000000))


r0 = open(&(0x7f0000000280)='./file0\x00', 0x615, 0x0)
compat_43_ommap(&(0x7f0000fff000/0x1000)=nil, 0x1000, 0x0, 0x0, r0, 0x0)
getpid()
open(0x0, 0x0, 0x0)
ktrace(0x0, 0x0, 0x0, 0x0)
socket(0x0, 0x0, 0x0)
r1 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x0, 0x0)
open$dir(0x0, 0x0, 0x0)
symlink(0x0, 0x0)
writev(r1, &(0x7f0000000340)=[{&(0x7f0000000000), 0x2cfea}], 0x1000000000000013)


sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f00000000c0), 0x1c, 0x0}, 0x0)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
close(0xffffffffffffffff)
r0 = socket$inet(0x18, 0x3, 0x102)
sendmsg$unix(r0, &(0x7f0000001700)={0x0, 0x0, 0x0, 0x0, 0x0, 0xfffffffffffffe6d}, 0x2)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
lchown(0x0, 0x0, 0xffffffffffffffff)
__clock_gettime50(0x0, &(0x7f0000000000))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


_lwp_create(&(0x7f00000004c0)={0x0, &(0x7f0000000440)={0x0, &(0x7f00000003c0)={0x0, 0x0, {}, {0x0, 0x9015}, {0x0, 0x0, '\x00'}}, {}, {}, {0x0, 0x0, '\x00'}}, {}, {}, {0x0, 0x0, 'union\x00'}}, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = msgget$private(0x0, 0x0)
msgctl$IPC_SET(r0, 0x1, &(0x7f00000003c0))


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
chflags(&(0x7f0000000040)='./file0\x00', 0x50002)
mmap(&(0x7f0000ffb000/0x3000)=nil, 0x3000, 0x16e, 0x10, r0, 0x0, 0x0)


open(&(0x7f0000000280)='./file0\x00', 0x0, 0x0)
open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
r0 = open(&(0x7f0000000240)='./file0\x00', 0x205, 0x0)
fcntl$lock(r0, 0x8, &(0x7f0000000280)={0x2, 0x0, 0x0, 0x100030101000b})
r1 = open(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
fcntl$lock(r1, 0x9, &(0x7f0000000140)={0x0, 0x0, 0x1000, 0x269000000, 0xffffffffffffffff})


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open(&(0x7f0000000040)='./file0\x00', 0xf8e, 0x0)
mkdir(0x0, 0x0)
compat_43_stat43(0x0, 0x0)
chflags(&(0x7f00000003c0)='./file0\x00', 0x5)
unlink(&(0x7f0000000000)='./file0\x00')


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x40044278, &(0x7f0000000080))


__clone(0x0, &(0x7f0000000000))
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__wait450(0x0, &(0x7f0000000200), 0x14, &(0x7f0000000240))


_ksem_open(&(0x7f0000000000)="a5a4bad810c14e014e79d7aab450ba912ca2e4c5c514aa9fdbb983e8dab0b656d87c97e87382b340bb2d4fb84c4acaae5ecd7d56d41d4e9e111d625066e66bbb4dce21", 0x0, 0x38, 0xa71, &(0x7f0000000080)=<r0=>0x0)
_ksem_destroy(r0)
_ksem_close(r0)
mknod(&(0x7f00000000c0)='./file0\x00', 0x0, 0x4301)
mknod(0x0, 0x0, 0x0)
mknod$loop(0x0, 0x0, 0x1)
link(0x0, 0x0)
link(0x0, 0x0)
rename(0x0, 0x0)
rename(0x0, 0x0)
unlink(0x0)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r1 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
r2 = shmat(r1, &(0x7f0000001000/0x3000)=nil, 0x0)
munmap(&(0x7f0000ffc000/0x4000)=nil, 0x4000)
r3 = shmget$private(0x0, 0x4000, 0x0, &(0x7f0000ffb000/0x4000)=nil)
shmat(r3, &(0x7f0000ffe000/0x2000)=nil, 0x0)
shmdt(r2)


r0 = socket$unix(0x1, 0x1, 0x0)
__fstat50(r0, &(0x7f0000001940))


ioctl$WSMUXIO_REMOVE_DEVICE(0xffffffffffffffff, 0x80085762, &(0x7f0000000040)={0x1})
r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x29, 0x6c, &(0x7f0000000040), 0x4)
setsockopt$inet6_MRT6_DEL_MFC(r0, 0x29, 0x69, &(0x7f0000000200)={{0x18, 0x1}, {0x18, 0x1}}, 0x5c)


compat_50___lstat30(&(0x7f0000000040)='./file0\x00', 0x0)


compat_40_mount(&(0x7f0000000200)='procfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xb0afbd006181d6de, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_select(0x3, 0x0, &(0x7f0000000040), 0x0, &(0x7f0000003200))


posix_spawn(0xffffffffffffffff, 0x0, &(0x7f0000000300)={0x0, 0x7, &(0x7f00000002c0)=@open={0x0, 0xffffffffffffffff, {&(0x7f0000000280)='\x00'}}}, 0x0, 0x0, 0x0)


__setitimer50(0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x29, 0x29, &(0x7f0000000000)="02000000", 0x4)


mprotect(&(0x7f0000ffc000/0x4000)=nil, 0x4000, 0x0)
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x0, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
compat_90_getvfsstat(&(0x7f0000001b00), 0x8d0, 0x0)


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
ptrace(0x0, 0x0, 0x0, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x0, 0x10, r0, 0x0, 0x0)


compat_30___fhstat30(0x0, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x1000000000f, r0, 0x0, 0x0)


r0 = semget$private(0x0, 0x0, 0x0)
semop(r0, &(0x7f0000000000)=[{}], 0x1)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
pipe2(&(0x7f0000000080)={<r1=>0xffffffffffffffff}, 0x0)
poll(&(0x7f0000000300)=[{}, {r1, 0x4}], 0x2, 0x0)
compat_50_clock_gettime(0x0, &(0x7f0000000180))
mknod(0x0, 0x0, 0x4f4b)
open$dir(0x0, 0x0, 0x0)
fcntl$getown(0xffffffffffffffff, 0x5)
mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r2 = open$dir(&(0x7f00000026c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x0)
__fstat50(r2, &(0x7f00000006c0)={<r3=>0x0})
mknod(0x0, 0x0, 0x0)
mknod(&(0x7f00000000c0)='./file0\x00', 0x4, r3)
mknod(&(0x7f0000000100)='./file0\x00', 0x6000, 0x1003)
r4 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r4, 0x80d0647b, &(0x7f0000000380))
r5 = open(&(0x7f0000000200)='./file1\x00', 0x615, 0x0)
writev(r5, &(0x7f0000000140), 0x2)


r0 = socket$inet(0x2, 0x1, 0x0)
getsockopt(r0, 0x6, 0x3, 0x0, 0x0)


pipe(&(0x7f0000001400)={<r0=>0xffffffffffffffff})
compat_50_getrusage(0x0, &(0x7f0000000000))
setpriority(0x0, 0x1000, 0x0)
r1 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
shmat(r1, &(0x7f0000ffc000/0x2000)=nil, 0x0)
mknod(&(0x7f0000000000)='./file1\x00', 0xe015, 0x0)
__utimes50(&(0x7f00000001c0)='./file1\x00', 0x0)
open(&(0x7f00000000c0)='./file1\x00', 0x200, 0x0)
shmget$private(0x0, 0x2000, 0x4a2, &(0x7f0000ffc000/0x2000)=nil)
shmat(r1, &(0x7f000036e000/0x3000)=nil, 0x2000)
r2 = getpid()
fktrace(r0, 0x0, 0xf0709cfa615b9be3, r2)
socket$unix(0x1, 0x2, 0x0)
r3 = socket(0x1f, 0x1, 0x0)
__fhstat50(0x0, 0x0, 0x0)
sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
socket(0x18, 0x0, 0x0)
ioctl$FIOSEEKHOLE(r3, 0xc118691d, &(0x7f0000000200))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
mknod(0x0, 0x0, 0x6da)
link(0x0, 0x0)
socketpair$unix(0x1, 0x0, 0x0, 0x0)
posix_spawn(0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
__clone(0x0, 0x0)


__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000100)='procfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_30_getfh(&(0x7f0000000040)='./file0\x00', 0x0)


r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
setreuid(0xee00, 0x0)
r1 = getuid()
seteuid(r1)
compat_50___shmctl13$IPC_SET(r0, 0x1, &(0x7f0000000480)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})


r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
__posix_fadvise50(r0, 0x0, 0x0, 0x0, 0x1)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80104305, &(0x7f00000001c0)=0x80000101)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80067409, &(0x7f0000000000)=0x6)


chroot(&(0x7f0000000000)='.\x00')
ptrace(0x26, 0x0, 0x0, 0x0)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000400), 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x0)


r0 = posix_spawn(0xffffffffffffffff, 0x0, &(0x7f00000001c0)={0x0, 0x0, 0x0}, 0x0, 0x0, 0x0)
getpgid(r0)


open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
r0 = socket$inet(0x2, 0x1, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x2, 0x0, 0x0, 0x0, 0xffffffffffffffff})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x0}, 0x10)


open(&(0x7f0000000340)='./file0\x00', 0x300, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x1000301010005})


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
ptrace(0x19, r0, &(0x7f0000000000), 0x12)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
getsockopt$sock_timeval(r0, 0xffff, 0x100c, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
chmod(&(0x7f0000000180)='./file0\x00', 0x23f)
setuid(0xee01)
renameat(0xffffffffffffff9c, &(0x7f0000000600)='./file0\x00', 0xffffffffffffffff, &(0x7f0000000240)='./file0\x00')


r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80206910, &(0x7f0000000180)=0x8000000000000032)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x1)
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x8020426c, &(0x7f0000000080))


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x4010427b, 0x0)


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
compat_50_select(0x40, &(0x7f0000000000)={0xffffffff}, 0x0, 0x0, 0x0)


compat_43_lstat43(0x0, 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x142, r0)
r1 = socket(0x18, 0x1, 0x0)
r2 = fcntl$dupfd(r1, 0x2, 0xffffffffffffffff)
close(r2)
socket(0x11, 0x3, 0x0)
getpeername$inet(r2, 0x0, 0x0)


compat_43_osetrlimit(0x9, &(0x7f0000000080))
socket(0x1f, 0x5, 0x2)


__clone(0x47ff, 0x0)


modctl$MODCTL_LOAD(0x0, &(0x7f0000000180)={&(0x7f0000000240), 0x0, &(0x7f0000000340)="3cd21623b2e620", 0x7})


mknodat(0xffffffffffffff9c, &(0x7f0000000280)='./file0\x00', 0x1000, 0x0)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x80000000000206, 0x0)
read(r0, 0x0, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x10, 0x0)


__mount50(&(0x7f00000001c0)='msdos\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000140)="0e", 0x1)


_lwp_create(&(0x7f0000000100)={0x4, 0x0, {}, {}, {0x0, 0x0, '!-\x00'}}, 0x0, 0x0)


r0 = socket$inet6(0x18, 0x3, 0x0)
sendto$inet(0xffffffffffffffff, &(0x7f0000000240)="8e", 0x1, 0x0, 0x0, 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
sendto$inet6(r0, &(0x7f0000000000)=':', 0x358, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', 0x0, 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)="82", 0x1)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
compat_43_fstat43(r0, 0x0)


setregid(0xee00, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000400), 0x2)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000747b, 0x0)


munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
shmat(r0, &(0x7f0000001000/0x3000)=nil, 0x0)
shmat(r0, &(0x7f0000001000/0x4000)=nil, 0x3000)


r0 = open$dir(&(0x7f0000000080)='.\x00', 0x0, 0x0)
setrlimit(0x8, &(0x7f0000000980))
openat(r0, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x2e00)
poll(0x0, 0x0, 0x0)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOGETBMAP(r0, 0xc008667a, &(0x7f0000000200))


r0 = socket$inet(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x0}, 0x10)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x0}, 0x10)
setsockopt(r0, 0x0, 0x1, &(0x7f0000000100)="3417de05a6ff7e48e3ea062331b7f8acbdb06cbbae740451", 0x18)
write(r0, &(0x7f00000000c0)="0f", 0x1)


compat_50_wait4(0x0, 0x0, 0x0, &(0x7f0000000300))


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x9, &(0x7f0000000040)="03000000", 0x4)
r1 = dup(r0)
setsockopt(r1, 0x1000000000029, 0xa, &(0x7f0000000000), 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x5, 0x10, r0, 0x0, 0x0)
fcntl$lock(0xffffffffffffffff, 0x8, &(0x7f0000000180))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)


r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x4ebfac6bbaf7959)
writev(r0, &(0x7f0000000100)=[{&(0x7f0000000000)='#!', 0x2}], 0x1)
syz_emit_ethernet(0x17a, &(0x7f0000000380))
syz_extract_tcp_res(&(0x7f0000000000), 0xc0000000, 0x6)
r1 = socket(0x2, 0x2, 0x0)
connect$unix(r1, &(0x7f0000000300)=@file={0x1}, 0x2)
r2 = semget$private(0x0, 0x7, 0x3c0)
semop(r2, &(0x7f0000000180)=[{0x0, 0x43, 0x1800}, {0x4, 0xe6, 0x1800}, {0x0, 0xfd, 0x1000}, {0x1, 0x20, 0x1800}, {0x2, 0x5, 0x1800}, {0x4, 0x9e, 0x1000}, {0x2, 0xfffb, 0x1000}, {0x0, 0x40, 0x1000}, {0x3, 0x8, 0x1000}], 0x9)
getsockopt$SO_PEERCRED(r1, 0xffff, 0x1022, &(0x7f0000000200)={0x0, <r3=>0x0, <r4=>0x0}, 0xc)
semctl$IPC_SET(r2, 0x0, 0x1, &(0x7f0000000240)={{0x8001, 0xffffffffffffffff, r4, 0xffffffffffffffff, 0x0, 0x40, 0x2}, 0x0, 0x9, 0xe7})
setsockopt$sock_int(r1, 0xffff, 0x20, &(0x7f0000000080)=0xff, 0x4)
r5 = socket(0x11, 0x3, 0x0)
sendto$unix(r5, &(0x7f0000000000)="b10005040000000000000000070000001a5113fecea10500fef96ecfc72fd3357a89583535673039d2d236acf20b7804be38164991f7c8cf5f882b2900e1aa5b23edebc8ef99a8ad491726fa8251e2f0ac3ebbc2feb3fda1139b672f4d3353eb06acdb35a069d7080000000000000000008904000000000022830cf41bed66f4f365ccdcf3e4999d9d20002002c5dbfad800ff0f00"/177, 0xb1, 0x0, 0x0, 0x0)
r6 = accept$unix(r1, &(0x7f0000000a40), &(0x7f0000000140)=0x65)
accept(r6, &(0x7f0000000180), &(0x7f00000001c0)=0xc)
r7 = geteuid()
semctl$IPC_SET(0xffffffffffffffff, 0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0x0, r7}})
r8 = getegid()
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000140)={{0x3, r7, 0x0, 0x0, r8, 0x134, 0x100}, 0x0, 0x1648})
getsockopt$SO_PEERCRED(r1, 0xffff, 0x1022, &(0x7f0000000a00)={0x0, 0x0, <r9=>0x0}, 0xc)
semctl$IPC_SET(r2, 0x0, 0x1, &(0x7f0000000240)={{0x1ff, r7, 0x0, 0x0, r9, 0x90, 0x9}, 0x8, 0x8, 0x5})
chown(&(0x7f00000002c0)='./file0\x00', r3, r9)
socketpair(0x6, 0x1, 0x0, &(0x7f0000000040))
write(r1, &(0x7f0000000400)="06db92cb2bc71bc021ddc28fd02e4310164d6d6e56f0374d591d51840492ae80b6734760f8949793f5ff925e0add46df00000000000000ed2d2d53ff016fd1967aa38c840ab15e80ffed6ac4c0f97663716a21e7d96807fc320828a79b579a5759d1585d07fc4a7f7fd721a6040f186fce37121a7ff6b505fa92a82523596d8c527e2c5f34239a59a7a05dbf06b3c173ca5011f274f29841382a51065d17a62711964cdb5839dfdf821b7f75525bca0ea85058c83f41c530d6f42ea7dc8b3910a88ca185a8d8180b85f9b2c93d622e066efa03282d37a783649f764030787e5a7012df6796b02f1efbed0e01552f24f85e17be12ae9007e26ae91aa5a5e7469787f132ca0905531c300222e9d171f44d6e60cdd25f853b3a119bbb2ff949b3a2bbd4b4cc18a5f2f94fcc6bf35641a0099a464e27197b081c626bf0ea399f1fd489787c85152bb15a38cbf15238311e0848e5b9812af4c82c62d87ee884abda4f725d0f96800345dd4c069762b1436f621f482589fb0ab042ca836dc7e6e8d1de427e31161819ec257ca878ffbd4b3c86738c35b9ad49063ec8e0e0394e6b5771abc19d37064e47d972693903c7dddfcabf9ac88983fcb9af32fde20d956184fde6af1cce696f0da59bf3c9b5f37d71b1be8ea56e06af8b56a4610f0a526b914980fa95d6d2b9bcac721989e85415da2c171f14863ba291c5d9d609656f9941b2c55ed249f4eb8e5fe072e0d81b266ca69e1e9799c56a5776208ea4d5df1920da6a514f77e070a84fd1541c0a37fcc8601bf9707af463edf2f1fdd70b1d365e79101ccbb00026e57566b66291fb5db9e33309d879f48027a7bb2099ca62dced24e3dbb0e44845b5ea0a24684f16c7bcf19de0cc6687b7509ffb9e8a9cf560c426d666a91e1e929ea6425de177a52cc75a256f92383a313f1953cfa7796e8fe1b729ff11406a0bca052b32a4a133c4cfc8dbcca976a00d4b7ae19b11d55a6235399b17f8d085ce7b398754a3e71caed15234f8c99cc51f7408384418ec0f6009c42feaeed992a00833f7022116bd14de5b9992ed09c96afaf5d6c31fe600fa3cecc364fb6dc01a8213e1594c5cc039a7ea031d26118ab91c64530f19bc64543615333fa34072dd75dc70d357e2370e208fd56e7a7daf87f30dee48944d83b5563872fb61072a2381e9e000fe2c3a422ebd90ce2e7a4107094f5f805c077841f540d465d3e17e479d57a9de0a6b3c56295c85b0ad1069837fc785c6546298c96a77c651ade40c7ab1a06c6791e1de95695e6c248dc39927af011232471b17c40151d2a2f5a763eb8b78e44422e0ab8d982a9ce2813b3533382cdeaa1b284d0d144126eb622979e6ea988079fec319cac45481c1a1f3db361146bcf41b8acfee8d13d1660ee1b5b144f0b0b043d6374ce77d5165a49b2399ded113d83c6f8114868a1d37c9f35811446d5f10c7e7c4dfc4fc878ced7f6979cab411c366e2be44378d991fec8f80985fb7581cc50d3ad506f4a3d8aea9effe341a3e80ddaa4dc857a0896a444c366c3466267830e8635de26e5887819ca9bce95bbfe9656dc57cf5f41b62182265cf67eac14058782de4e7945e6cff994bd5bebee64eb3e9855c3094aed5288d8012d3501ba2d08b5f612a867137cba147f96ca29d4372af173c0e7a7a38d58dea7e9d15b774c34279d0a7345aab1be9288a9247997e0f84bc99b27b054dc7c980ea1e425147f931f009ffb122f252eae1c985e6360a16f323ddf09bbb2bdfc08679dc53a39e6cc0b79648b282639e4913f5cb98c0582ab2661903107d62a60ea6ed9f0903b8796f3451dd6156c6ad6a00a38b260870e5214af91860147f5c734de5b25108b5aa83908f976c8662bbb2aef5a097fc7109d46d439a35df91b2414af8322c1c5f969bbe2c24f13376740eb731610a29465f30e3a049a5d253f442b72cd0cab40c4626a76d66f8124174a2dc8234bafadbfcbdcc63f91ad402ac7276c1b22e913e30ebf8a5ec06022952d7157420c298b0ef5441b5999d81d23ef6c5c8e24ca39b70adf246fa9f49a9b21695f3cefd95147b3c846e107c99", 0x5c1)
read(r0, &(0x7f0000000180)=""/162, 0xa2)
write(r0, &(0x7f0000000040)='\t', 0x1)
write(r0, &(0x7f00000004c0)="09eb00000db57c60c001267f5d5e8ee581e12aaa95ab10eba007a90172c84c838b4278ad535c2a591c97413f308cbbaee481ca7f31dbd1562ef7a6540a", 0x3d)
close(r0)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
r1 = dup(r0)
ioctl$NETBSD_DM_IOCTL(r1, 0xc010fd00, &(0x7f0000000140)={0x0})


socket$inet6(0xa, 0x0, 0x0)
r0 = socket(0x10, 0x2, 0x0)
write(r0, &(0x7f0000000280)="1c0000001a009b8a140000003b9b301f00"/28, 0x32)
recvmmsg(r0, &(0x7f0000002ec0), 0x400000000000ec0, 0x2, &(0x7f00000001c0)={0x77359400})


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000100)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
bind$unix(r0, &(0x7f0000000040)=@file={0xd19450564dee018c, './file0\x00'}, 0xa)
truncate(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
setreuid(0x0, 0xee01)
chroot(0x0)


compat_43_osetrlimit(0x9, &(0x7f0000000080))
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)


__nanosleep50(&(0x7f0000000000)={0x0, 0x10000000000005}, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000746e, 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
fcntl$lock(r0, 0x332b9512e66ce801, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x6, 0x6, &(0x7f0000000040)="285c8f02", 0x4)


setrlimit(0x8, &(0x7f0000000000)={0x0, 0xffffffffffffff00})


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
chflags(&(0x7f00000003c0)='./file0\x00', 0x5)
mkdir(&(0x7f00000000c0)='./control\x00', 0x0)
rmdir(&(0x7f0000000040)='./control\x00')


mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x1, 0x1012, 0xffffffffffffffff, 0x0, 0x0)
posix_spawn(0x0, 0x0, &(0x7f0000001240)={0x0, 0x0, 0x0}, 0x0, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000040)='./bus\x00')
__stat50(&(0x7f0000000100)='./bus\x00', &(0x7f0000000200)={<r0=>0x0})
mknod(&(0x7f0000000140)='./file0\x00', 0x2000, r0)
open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
r1 = socket$inet(0x2, 0x2, 0x0)
ioctl$FIONREAD(r1, 0xc004730a, &(0x7f0000000080))


swapctl$SWAP_GETDUMPDEV(0x8, 0x0, 0x0)


sendmsg$unix(0xffffffffffffffff, &(0x7f0000001280)={0x0, 0x0, &(0x7f0000001200)=[{&(0x7f0000000180)="bf3b684c5ae963b45670", 0xa}], 0x1}, 0x0)
modctl$MODCTL_STAT(0x4, &(0x7f0000000180)={&(0x7f00000000c0)=""/126, 0x7e})


r0 = socket(0x18, 0x2, 0x0)
close(r0)
socket(0x18, 0x3, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
writev(r0, &(0x7f00000002c0)=[{&(0x7f0000000000)="89a135162985d7", 0x7}, {&(0x7f0000004280)='\r', 0x1}], 0x2)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
setreuid(0x0, 0xee01)
compat_50_quotactl(&(0x7f0000000000)='./file0\x00', 0x8, 0x0, 0x0)


_ksem_post(0x50535244)


sendmsg$unix(0xffffffffffffffff, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000140)=ANY=[@ANYBLOB='(\x00\x00\x00', @ANYRESHEX], 0x28}, 0x0)
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000140), 0x0)


sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
compat_40_mount(&(0x7f0000000080)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
ktrace(&(0x7f00000001c0)='./file0\x00', 0x0, 0x0, 0x0)


symlink(0x0, &(0x7f0000000240)='./file0\x00')
pipe(0x0)
compat_50_select(0x40, &(0x7f0000000140), &(0x7f0000000300)={0x8}, 0x0, 0x0)
unlink(&(0x7f0000000000)='./file0\x00')
open$dir(&(0x7f0000000000)='./file0\x00', 0x40000400001803c1, 0x0)
rename(&(0x7f0000000040)='./file0\x00', &(0x7f0000000740)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/file0\x00')
close(0x4)


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000700)='cd9660\x00', &(0x7f0000000140)='.\x00', 0x3, &(0x7f00000001c0))


open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
setreuid(0x0, 0x0)
__getfh30(0x0, 0x0, 0x0)


pipe(&(0x7f0000000140)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0x62e2dd08f149ff1b, r1)
compat_30_fhstatvfs1(0x0, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x20007461, 0x0)
swapctl$SWAP_STATS(0xa, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, 0x0)


r0 = socket$unix(0x1, 0x5, 0x0)
getsockopt$sock_linger(r0, 0xffff, 0x80, &(0x7f0000000040), &(0x7f0000000080)=0x8)


r0 = socket$unix(0x1, 0x5, 0x0)
r1 = getsid(0x0)
ptrace(0x9, r1, 0x0, 0x0)
fktrace(r0, 0x0, 0x2, r1)


compat_30_socket(0x1f, 0x3, 0x1)


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
truncate(&(0x7f0000000040)='./file0\x00', 0x0, 0x72829869)
ioctl$FIOGETBMAP(r0, 0xc008667a, &(0x7f0000000080)=0x83d)


fcntl$dupfd(0xffffffffffffffff, 0x7, 0xffffffffffffff9c)


r0 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
r1 = shmat(r0, &(0x7f0000ffc000/0x1000)=nil, 0x0)
munmap(&(0x7f0000fec000/0x14000)=nil, 0x14000)
shmdt(r1)


munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
madvise(&(0x7f0000ffc000/0x2000)=nil, 0x2000, 0x0)
munmap(&(0x7f0000ff2000/0x4000)=nil, 0x4000)
madvise(&(0x7f0000ffb000/0x3000)=nil, 0x3000, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
chflags(&(0x7f0000000000)='./file0\x00', 0x20000)


munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x0, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0x0)
mkdirat(0xffffffffffffffff, &(0x7f0000000000)='./file0\x00', 0x0)
swapctl$SWAP_ON(0x7, &(0x7f0000000000), 0x0)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff})
sendmsg(r0, &(0x7f00000003c0)={&(0x7f0000000100), 0x80, &(0x7f00000013c0)=[{&(0x7f0000000040)="1fa7caebe8d61d78a08aeb624c55a0780d3f0fa296b248fb3876c646062bb7180d6f9cefd3b4e39a0ef2a05d2a000a679543f55b7aca0b1b9c72aaafba9dfd657fe46888e0a1598baeea082399000b68", 0x1}, {&(0x7f0000000280)="10", 0x3d9a}, {0x0, 0x11}, {&(0x7f00000001c0)="b4c148171a42edf342442d71db61d2d4c7590bb098a6acc362d226d6ab1fa58af23a9db7e2d3fd20a5439ee95c44d88c853ba9fca580fedb0298e9357121f28fc307a7a8be12b240d04aa06d6aa3eef3fc37c47e8ee4cad3e7f941076f631c0844aa41925e6a24e0998fc3159480ee00596fec71c4b49f4a8f", 0x79}], 0x4, &(0x7f0000001480)}, 0x0)


socket(0x0, 0x0, 0x0)
socket$inet6(0x18, 0x0, 0x0)
compat_40_mount(0x0, 0x0, 0x0, 0x0)
open(0x0, 0x0, 0x0)
close(0xffffffffffffffff)
r0 = semget$private(0x0, 0x1, 0x0)
compat_50_____semctl13$SETVAL(r0, 0xff7f0000, 0x8, &(0x7f0000000140)=@buf=&(0x7f0000000880)={{}, 0x0, 0x0, 0x0, 0x0})
pwrite(0xffffffffffffffff, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x1)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
r1 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r1, 0x80145003, &(0x7f0000000000)={0x0, 0x0})
write(r0, &(0x7f0000000040)="16", 0x1)


r0 = socket(0x10, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8020690c, &(0x7f0000000180)=0x8000000000000032)


r0 = socket(0x18, 0x2, 0x0)
getsockopt(r0, 0x29, 0xe, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
pathconf(&(0x7f0000000000)='./file0\x00', 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x1022, 0x0, 0x0)
r0 = getpid()
open(0x0, 0x0, 0x0)
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x40000424, r0)
setregid(0x0, 0x0)
write(0xffffffffffffffff, 0x0, 0x0)
r1 = socket(0x18, 0x2, 0x0)
close(r1)
r2 = socket(0x18, 0x3, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
compat_43_ogetpeername(r2, 0x0, 0x0)


r0 = socket$inet6(0x18, 0x3, 0x0)
getsockopt(r0, 0x29, 0x14, 0x0, 0x0)


mlock(&(0x7f0000007000/0xc000)=nil, 0xc000)
mprotect(&(0x7f000000e000/0x4000)=nil, 0x4000, 0x0)
mlock(&(0x7f000000f000/0x4000)=nil, 0x4000)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
mmap(&(0x7f0000001000/0xc00000)=nil, 0xc00000, 0x0, 0x3032, 0xffffffffffffffff, 0x0, 0x0)


compat_43_ogethostname(0x0, 0x0)
__nanosleep50(&(0x7f0000000d80)={0xffffffffffffffc1}, &(0x7f0000000dc0))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xb}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
chflags(&(0x7f0000000000)='./file0\x00', 0x0)


r0 = socket(0x18, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc038694e, &(0x7f0000000180)=0x8000000000000032)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50_quotactl(&(0x7f0000000000)='./file0\x00', 0x20000, 0x0, 0x0)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x800)
pathconf(&(0x7f00000001c0)='./file0\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8006740a, &(0x7f00000001c0))


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x615, 0x0)
fcntl$lock(r0, 0xa, 0x0)


mknod(&(0x7f0000000300)='./file0\x00', 0x6000, 0x6da)
open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)


compat_40_mount(0x0, 0x0, 0x0, 0x0)
mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(0xffffffffffffffff, 0xc1c0526b, 0x0)
compat_43_ocreat(0x0, 0x0)
truncate(0x0, 0x0, 0x0)
preadv(0xffffffffffffffff, 0x0, 0x0, 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
open(0x0, 0x0, 0x0)
socket$inet(0x2, 0x0, 0x0)
open(0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__select50(0x40, &(0x7f0000000000), 0x0, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)


r0 = getsid(0x0)
ptrace(0x0, r0, 0x0, 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
truncate(0x0, 0x0, 0x7fff)
r1 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
fdatasync(r1)
truncate(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
truncate(0x0, 0x0, 0x0)
compat_50_wait4(0xffffffffffffffff, &(0x7f0000000000), 0x0, 0x0)
compat_50___fstat30(0xffffffffffffffff, &(0x7f0000000100))


r0 = open(&(0x7f0000000380)='./file0\x00', 0x80000000000206, 0x4ebfac6bbaf7929)
writev(r0, &(0x7f0000000180)=[{&(0x7f0000000000)='#!', 0x2}, {&(0x7f00000000c0)="98", 0x1}], 0x2)
write(r0, &(0x7f00000001c0)="092009018000000000000045b65369db0000ff4278ad535c2a413f308cbbaee4c6987f31dbf15624b77b6a0a", 0x2c)
execve(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, r0)
__setitimer50(0x300, 0x0, &(0x7f0000000040))


munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r0 = shmget$private(0x0, 0x4000, 0x0, &(0x7f0000ffc000/0x4000)=nil)
shmat(r0, &(0x7f0000001000/0x3000)=nil, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
pathconf(&(0x7f0000000300)='./file0\x00', 0xa)


mkdir(&(0x7f0000000040)='./file0\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0xe, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x3, 0x0)
recvmsg(r0, &(0x7f0000000380)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000340)=""/27, 0x1b}, 0x0)


r0 = compat_30_socket(0x22, 0x30000003, 0x0)
connect$inet(r0, &(0x7f0000000500)={0x2, 0x3}, 0xc)


r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x29, 0x11, &(0x7f0000000000)="02000000", 0x4)


r0 = socket(0x18, 0x2, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x800, 0x0, 0x0)


r0 = socket(0x1d, 0x40000003, 0x0)
connect$inet(r0, &(0x7f0000000040)={0x2, 0x1}, 0xc)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x14, &(0x7f0000000040)="5ab7736a", 0x4)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
recvmmsg(r0, &(0x7f0000000700)={0x0}, 0x10, 0x0, 0x0)


open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
acct(&(0x7f0000000140)='./file0\x00')
acct(0x0)


posix_spawn(0x0, 0x0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)=@close={0x3, 0xffffffffffffff9c}}, 0x0, 0x0, 0x0)
r0 = socket(0x18, 0x2, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setsockopt(r0, 0x1000000000029, 0xa, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
setsockopt(r0, 0x1000000000029, 0xb, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
modctl$MODCTL_UNLOAD(0x1, 0x0)


mknod(&(0x7f0000000080)='./bus\x00', 0x0, 0x6d4)
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f0000000300)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x8010426d, &(0x7f0000000080))


madvise(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x0)
getsid(0xffffffffffffffff)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
lchown(&(0x7f0000000100)='./file0/file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
lchown(0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


r0 = socket(0x800000018, 0x1, 0x0)
msgctl$IPC_SET(0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0xffffffffffffffff}})
r1 = socket(0x18, 0x2, 0x0)
close(r1)
r2 = socket(0x800000018, 0x1, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x20000005a})
bind$unix(r2, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
sendmmsg(0xffffffffffffffff, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


socketpair(0x1d, 0x2, 0x2, &(0x7f0000000140))


compat_20_getfsstat(&(0x7f0000000000), 0xffffffffffffff39, 0x0)
modctl$MODCTL_STAT(0x4, &(0x7f0000000180)={&(0x7f0000000140)=""/16, 0x10})


semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0x0, 0x0, 0x0, 0x10}})
r0 = socket(0x800000018, 0x3, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x1000, &(0x7f0000000000)=0x7, 0x4)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
setsockopt(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, 0x0, 0x0)
syz_emit_ethernet(0x4a, &(0x7f0000000440))


open$dir(&(0x7f0000000b80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x200, 0x0)
r0 = open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000dc0)='./file0\x00', r0, &(0x7f0000000ec0)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000980)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000a80)='./file0\x00')
rename(&(0x7f0000000d80)='./file0\x00', &(0x7f0000000180)='./file1\x00')


write(0xffffffffffffffff, &(0x7f0000000140)="4e8f8cdc90bf00ba12aaa92982b8b4b7630aa9bd5db9dab4749648139ce6d31e8c0219fddb943501650e4f653434f29e824bfaf628632cc6628d8cf30ae00a546ca3b4d1584dcb4de2c59dfa86a50eadde287b4643dc1052ab5d03c4cab84ff29a", 0x61)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc048696d, &(0x7f0000000180)=0x8000000000000032)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
mknod(&(0x7f0000000540)='./file0\x00', 0x0, 0x1003)


r0 = socket$inet(0x2, 0x2, 0x0)
fcntl$lock(r0, 0x5, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
connect$unix(0xffffffffffffffff, &(0x7f0000000000)=@file={0x0, './file0\x00'}, 0xa)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104302, &(0x7f00000001c0)=0x20000002)


r0 = socket(0x2, 0x1, 0x0)
r1 = dup2(r0, r0)
shutdown(r1, 0x1)
listen(r1, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x1, 0x0)
ioctl$FIONREAD(r0, 0x80206979, &(0x7f0000000200))


__mount50(&(0x7f0000000100)='kernfs\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000400), 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x144, 0x0)
getgid()


compat_50___shmctl13$IPC_STAT(0x0, 0x2, 0x0)
__clone(0x0, &(0x7f0000000300))
compat_50_wait4(0x0, 0x0, 0x4, &(0x7f0000000440))


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
ioctl$FIOASYNC(r0, 0x80047466, &(0x7f0000000100))


r0 = socket$inet6(0x18, 0x3, 0x0)
getsockopt(r0, 0x29, 0xa, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
preadv(0xffffffffffffffff, 0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_30_getdents(r0, 0x0, 0xb2)


open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000340)='./file0\x00', 0x0, 0xf, r0)
clock_nanosleep(0x0, 0x0, &(0x7f0000000000), 0x0)


open$dir(&(0x7f0000000000)='./file2\x00', 0x200, 0x0)
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
open(&(0x7f0000000300)='./file0\x00', 0x0, 0x0)
__select50(0x40, &(0x7f0000000000)={0x3ff}, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000040)='./bus\x00', 0x100000000205f, 0x4000)
ktrace(&(0x7f0000000100)='./bus\x00', 0x0, 0x0, 0x0)


mknod(&(0x7f0000000040)='./bus\x00', 0x100000000205f, 0x2801)
r0 = open(&(0x7f0000000280)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40047441, &(0x7f0000000000))


open(&(0x7f0000000040)='./file0\x00', 0xf8e, 0x0)
mknod(0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
write(r0, &(0x7f0000000080)="8e", 0x1)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000427e, 0x0)
ioctl$FIONREAD(r0, 0x2000427e, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
setuid(0xee01)
execve(0x0, 0x0, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x1)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setrlimit(0x3, &(0x7f0000000040))
setrlimit(0x3, &(0x7f00000000c0))


r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f00000000c0)=[{&(0x7f0000000280)='#', 0x1}], 0x1)
r1 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0x0, 0x10, r1, 0x0, 0x0)
mincore(&(0x7f0000ffb000/0x2000)=nil, 0x2000, &(0x7f0000000100)=""/100)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8118691c, &(0x7f0000000180)=0x8000000000000032)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
unmount(&(0x7f0000000000)='./file0\x00', 0x0)


_ksem_open(&(0x7f0000000180)='/\r', 0x200, 0x0, 0x0, &(0x7f0000000100))


r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
faccessat(r0, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f0000000140)="01")
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x7)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={&(0x7f0000000240), 0x0, 0x0})
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
ioctl$FIOASYNC(r0, 0x8004667d, &(0x7f0000000080))


r0 = _lwp_self()
clock_nanosleep(0x40000000, 0x0, &(0x7f0000000140)={0x80000001}, 0x0)
_lwp_wakeup(r0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0x80187701, &(0x7f0000000000)={0x0, 0x0})


r0 = socket(0x18, 0x3, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r0, &(0x7f00000000c0), &(0x7f0000000000)=0xffffffffffffff35)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket(0x18, 0x1, 0x0)
close(r1)
r2 = socket(0x18, 0x3, 0x3a)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
write(r2, &(0x7f0000000100)="b437ab93c964c85e7ed0e6c8a2a6ecda5f9256f4", 0x14)


mknod(&(0x7f0000000400)='./file0\x00', 0x2000, 0x287e)
open$dir(&(0x7f0000000b80)='./file0\x00', 0x0, 0x0)
renameat(0xffffffffffffffff, 0x0, 0xffffffffffffffff, &(0x7f0000000040)='./file0\x00')
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x3})
poll(&(0x7f0000000000)=[{}], 0x20000000000000fe, 0x0)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x3d, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)


compat_60__lwp_park(&(0x7f0000000000)={0x8000000000000008}, 0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000080)='ptyfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000000))


pipe2(&(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff}, 0x0)
fcntl$setstatus(r1, 0x4, 0xbb4289d0ac784055)
dup2(r1, r0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
mkdirat(0xffffffffffffff9c, &(0x7f0000000680)='./file0\x00', 0x0)
rename(&(0x7f0000000d80)='./file0\x00', &(0x7f0000000180)='./file1\x00')


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x20007479, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x1, 0x0, 0x0)


r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x142, r0)
__nanosleep50(&(0x7f0000000100), 0x0)


r0 = socket$inet(0x2, 0x3, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x1, &(0x7f0000000240)="01000000", 0x4)
r1 = socket$inet(0x2, 0x3, 0x0)
dup2(r1, r0)


r0 = open$dir(0x0, 0x200, 0x0)
r1 = getpid()
ktrace(0x0, 0x0, 0xf, r1)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040))
fcntl$lock(r0, 0x7, 0x0)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff})
r0 = socket(0x18, 0x2, 0x0)
modctl$MODCTL_LOAD(0x0, &(0x7f00000000c0)={&(0x7f0000000180), 0x0, &(0x7f0000000000)='\t', 0x1})
socket(0x0, 0x0, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)


mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
pathconf(&(0x7f0000000080)='./file0\x00', 0x9)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0xa, r0, 0x0, 0xfffffbffffff0002)


getrlimit(0xfffffffffffffffc, 0x0)


sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f00000000c0), 0x1c, 0x0}, 0x0)
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x3}})
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x100000000000000, 0xffffffffffffffff})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x3, 0x0)
close(r0)
close(0xffffffffffffffff)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
sendmsg$unix(r0, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)


pipe(&(0x7f0000000140)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0x62e2dd08f149ff1b, r1)
r2 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffd000/0x1000)=nil)
compat_50___shmctl13$IPC_SET(r2, 0x2, &(0x7f0000000000)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})


setreuid(0x0, 0xee01)
__fhstat50(0x0, 0x0, 0x0)


sendmsg$unix(0xffffffffffffffff, &(0x7f00000009c0)={0x0, 0x0, &(0x7f0000000580)=[{&(0x7f0000000200)="f1b661ad25a9398849dfe79f3648a5d5eec261e0e38cd1d0aa99569e176ea047dca4bd59831dd551c3d2c2312055a2b9d54dffdc0cc47df48c6f6b5fd80586774837760a725346e5e335186e4df000ad3903ae58f0fe5ebc0000f4", 0x5b}], 0x1}, 0x0)
compat_40_mount(&(0x7f0000000140)='msdos\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000001c0))


r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r0, 0x0, 0xd, &(0x7f0000000240)="ea08000000000000", 0x8)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
pathconf(&(0x7f00000000c0)='./file0\x00', 0x7)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x44002902)
truncate(0x0, 0x0, 0xffffffff00000000)
chdir(0x0)
mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x40000802)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x1, &(0x7f0000000380), 0x0)
lchflags(&(0x7f0000000080)='./file0\x00', 0x0)


posix_spawn(0xffffffffffffffff, 0x0, &(0x7f0000000480)={0x0, 0x95, &(0x7f0000000340)=@dup}, 0x0, 0x0, 0x0)


pipe(&(0x7f0000000200)={<r0=>0xffffffffffffffff})
ioctl$KDSETLED(r0, 0x40046678)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
pathconf(&(0x7f0000000040)='./file0\x00', 0xc)


r0 = getpid()
ptrace(0x9, r0, 0x0, 0x0)


r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
compat_30___fstat13(r0, &(0x7f0000000240))


r0 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000fff000/0x1000)=nil)
setreuid(0x0, 0xee01)
open(0x0, 0x0, 0x0)
shmctl$IPC_RMID(r0, 0x0)


r0 = socket(0x18, 0x1, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
r1 = socket(0x18, 0x1, 0x0)
dup2(r0, r1)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


r0 = compat_30_socket(0x10, 0x2, 0x0)
shutdown(r0, 0x2)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
writev(0xffffffffffffff9c, 0x0, 0x0)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000000029, 0x9, &(0x7f0000000000)="01000000", 0x4)


mknod(&(0x7f0000000180)='./file0\x00', 0x2000, 0x202)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$WSDISPLAYIO_ADDSCREEN(r0, 0x8018574e, &(0x7f00000001c0)={0x1dd, 0x0, 0x0})


r0 = socket$inet6(0x18, 0x3, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x2, &(0x7f0000000640), &(0x7f0000000680)=0x4)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
ptrace(0xa, r0, 0x0, 0x0)


socket$inet(0x2, 0x0, 0x0)
semget(0x1, 0x0, 0x0)
r0 = semget(0x3, 0x2, 0x284)
r1 = semget(0x1, 0x3, 0x204)
semctl$SETALL(0x0, 0x0, 0x11, &(0x7f0000001080))
semctl$IPC_RMID(r1, 0x0, 0x0)
msgget(0x2, 0x0)
msgrcv(0x0, 0x0, 0x0, 0x0, 0x1800)
semctl$GETALL(r0, 0x0, 0xd, 0x0)
semget(0x2, 0x0, 0x0)
semctl$IPC_RMID(0x0, 0x0, 0x0)
shmget$private(0x0, 0x1000, 0x0, &(0x7f0000fff000/0x1000)=nil)
semop(0xffffffffffffffff, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x10800039, &(0x7f0000000140)="01")
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000000)='./file0\x00', 0xa)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0))
__getcwd(&(0x7f0000000000)=""/29, 0xff39)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x100000000000000, 0xffffffffffffffff})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
r1 = socket(0x18, 0x2, 0x0)
dup2(r0, r1)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x0, 0x7, 0x0}, 0x8)


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x13, &(0x7f0000000080)="02000000", 0x4)


compat_40_mount(&(0x7f0000000040)='lfs\x00', 0x0, 0x0, 0x0)
compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
ktrace(&(0x7f00000000c0)='./file0\x00', 0x4, 0x4, 0xffffffffffffffff)
__utimes50(0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000000), &(0x7f0000000040)=0xc)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='efs\x00', &(0x7f0000000200)='./file0\x00', 0x0, &(0x7f00000001c0))


mknod(&(0x7f0000000000)='./file1\x00', 0xe015, 0x0)
mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file1\x00', 0xc0e99db6de761f86, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x29, r0, &(0x7f0000000000), 0x0)


symlink(&(0x7f0000000000)='./file0/file0\x00', &(0x7f0000000240)='./file0\x00')
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
rename(&(0x7f00000003c0)='./file0\x00', &(0x7f0000000380)='./file1/file0\x00')


pipe(&(0x7f0000000400)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0xf0709cfa615b9be3, r1)
clock_nanosleep(0x0, 0x0, 0xffffffffffffffff, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
ioctl$FIONREAD(0xffffffffffffffff, 0xc0106924, 0x0)
__clone(0x0, 0x0)
unmount(0x0, 0x0)
fcntl$lock(r0, 0xa, 0x0)


symlink(&(0x7f0000000080)='.\x00', 0x0)
compat_40_mount(0x0, 0x0, 0x0, &(0x7f00000002c0)="01")
compat_40_mount(&(0x7f0000000100)='procfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0))
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pathconf(&(0x7f0000000080)='./file0\x00', 0x3)


r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(r0, &(0x7f0000000280)=[{0x0}, {&(0x7f00000001c0)='#', 0x1}], 0x2, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
__mount50(&(0x7f0000000440)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000001440)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
r1 = fcntl$getown(r0, 0x3)
setpgid(0x0, r1)


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x10, r0, 0x0, 0x0)
__clone(0x0, 0x0)


mknod(&(0x7f0000000100)='./bus\x00', 0x2000, 0x5300)
r0 = open(&(0x7f0000000480)='./bus\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0x40085004, &(0x7f0000000000)={0x0, 0x0})


r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80206913, &(0x7f0000000180))
getpriority(0x1, 0x0)


_lwp_wait(0x0, 0x0)
r0 = _lwp_self()
_lwp_detach(r0)
_lwp_exit()


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x881)
open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)


open$dir(0x0, 0x0, 0x0)
socketpair$unix(0x1, 0x0, 0x0, 0x0)
r0 = socket(0x1f, 0x5, 0x2)
getsockname$unix(r0, 0x0, 0x0)


_lwp_create(&(0x7f0000000200)={0x0, &(0x7f0000000100)={0x0, 0x0, {}, {}, {0x0, 0x0, '!\x00'}}, {}, {}, {0x0, 0x0, '*\x00'}}, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000140), 0x0)


modctl$MODCTL_UNLOAD(0x4, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbdf13b9fd812eaa4e713048e69931929648", 0x14)


fstatat(0xffffffffffffff9c, &(0x7f0000000080)='./bus\x00', &(0x7f0000000100), 0x0)
mknod(&(0x7f00000001c0)='./file0/file0/..\x00', 0x20, 0x0)
chdir(&(0x7f0000000080)='./file0/file0/..\x00')
open(&(0x7f0000000200)='./file0/file0/..\x00', 0x40, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x40000802)
compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x1, &(0x7f0000000380), 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x8004667c, &(0x7f0000000140))
compat_90_fstatvfs1(0xffffffffffffffff, 0x0, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)


r0 = open$dir(&(0x7f0000000200)='.\x00', 0x0, 0x0)
mknodat(r0, &(0x7f0000000100)='./file0\x00', 0xc000, 0x0)
r1 = openat(r0, &(0x7f0000000000)='./file0\x00', 0x144, 0x0)
madvise(&(0x7f0000ffe000/0x1000)=nil, 0x1000, 0x1)
readv(r1, &(0x7f0000000080)=[{0x0}], 0x1)
r2 = socket(0x18, 0x3, 0x0)
connect$unix(r2, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r2, &(0x7f00000000c0), &(0x7f0000000000)=0xffffffffffffff35)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f00000000c0), 0x1c, 0x0}, 0x0)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x100000000000000}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r3 = socket(0x18, 0x1, 0x0)
close(r3)
r4 = socket(0x18, 0x400000002, 0x0)
setsockopt(r4, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
r5 = socket(0x18, 0x1, 0x0)
r6 = dup2(r4, r5)
sendmsg$unix(r6, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)


swapctl$SWAP_CTL(0x5, &(0x7f0000000000)="06e4948ac05ab8dcd313c508362e06bc9d498d7e4d910f3677", 0x0)
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0xb, r0, &(0x7f0000000000), 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
readlinkat(r0, &(0x7f0000000140)='./file0\x00', 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0xffffffffffffffff, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
fchroot(r0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80283103, &(0x7f0000000040))


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
recvfrom(r0, 0x0, 0x0, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x20003102, 0x0)
compat_50_setitimer(0x2, &(0x7f0000000000)={{}, {0x0, 0x8}}, 0x0)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x19, 0x0, 0x0)


r0 = open(&(0x7f0000000340)='./file0\x00', 0x300, 0x0)
ioctl$WSKBDIO_GETMODE(r0, 0x40045714, 0x0)
r1 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
r2 = open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
fcntl$lock(r2, 0x9, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x1000301010005})
r3 = open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
fcntl$lock(r3, 0x9, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x1000301010005})
fcntl$lock(r1, 0x8, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})


r0 = compat_30_socket(0x22, 0x3, 0x0)
setsockopt$inet6_MRT6_ADD_MFC(r0, 0x22, 0x2, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_30_getdents(r0, 0x0, 0xb2)


setregid(0x0, 0xee01)
setreuid(0xffffffffffffffff, 0xee00)
compat_50_quotactl(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, 0x0)


sendmsg$unix(0xffffffffffffffff, 0x0, 0x40a)
r0 = socket(0x18, 0x2, 0x0)
close(r0)
r1 = socket(0x800000018, 0x3, 0x0)
setsockopt$sock_int(r1, 0xffff, 0x1000, &(0x7f0000000000)=0x7, 0x4)
bind$unix(r1, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r0, &(0x7f0000000280)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000300)=ANY=[@ANYBLOB="10"], 0x10}, 0x0)


chroot(&(0x7f0000000000)='.\x00')
symlink(&(0x7f0000000180)='./file1\x00', 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', 0x0, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_40_mount(&(0x7f0000000000)='ntfs\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f0000000180))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
lchown(&(0x7f0000000100)='./file0\x00', 0xffffffffffffffff, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f00000001c0)='./file0\x00', 0x6)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)


writev(0xffffffffffffffff, 0x0, 0x0)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180)=0x8000000000000032)
__fhstat50(&(0x7f0000000180)="eb01b685c8f859535d508f86fd7d537dd3df", 0x12, 0x0)
r1 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r1, 0xc118691d, &(0x7f0000000180)=0x8000000000000032)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8004747d, &(0x7f0000000040))


_ksem_destroy(0x0)
socket$inet(0x2, 0x4000000000000001, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_30_socket(0x1f, 0x3, 0x0)
close(0x4)


acct(0x0)
minherit(&(0x7f0000ffc000/0x3000)=nil, 0x3000, 0x0)
mlock(&(0x7f0000fff000/0x1000)=nil, 0x1000)
mlock(&(0x7f0000ff4000/0x9000)=nil, 0x9000)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x3}, 0x8, &(0x7f00000038c0)}, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
link(&(0x7f00000003c0)='./bus\x00', &(0x7f0000000400)='./file1\x00')


r0 = open$dir(&(0x7f0000000000)='./file2\x00', 0x200, 0x0)
linkat(r0, 0x0, 0xffffffffffffffff, 0x0, 0x0)


munmap(&(0x7f0000000000/0xf000)=nil, 0xf000)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000480)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
compat_43_ogetdirentries(r0, 0x0, 0x0, 0x0)


sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f0000000080), 0x1c, 0x0}, 0x0)
r0 = socket(0x18, 0x400000002, 0x0)
r1 = socket(0x18, 0x2, 0x0)
getsockname$inet(r1, &(0x7f0000000080), &(0x7f0000000000)=0xffffffffffffffc6)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r2 = dup2(r1, r0)
sendmsg$unix(r2, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)


r0 = compat_30_socket(0x22, 0x3, 0x0)
shutdown(r0, 0x3)


r0 = msgget$private(0x0, 0x0)
msgsnd(r0, &(0x7f0000000180)=ANY=[@ANYBLOB="0300000000000000a486714b3b6964c6220190d7f39c044dac99fec5afca3ec3e155903698d635e2ab348195cce43ab9e134935e4edf5efe4e5ec4bec02d51201f93b9860f69d58fca21e1f36041df344b049af8bf321177b2fdc7cc2725691dc000"/110], 0x6e, 0x0)
msgrcv(r0, &(0x7f0000000200)={0x0, ""/8}, 0x10, 0x3, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
chroot(&(0x7f0000000000)='./file0\x00')
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
setreuid(0x0, 0xee01)
compat_90_fhstatvfs1(0x0, 0x0, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000080)='./file0\x00', 0x11)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4301)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
read(r0, 0x0, 0x20080)


r0 = socket(0x2, 0x2, 0x0)
setreuid(0x0, 0xee01)
ioctl$FIOSEEKHOLE(r0, 0x8090698e, &(0x7f0000000180)=0x8000000000000031)


r0 = socket(0x1f, 0x5, 0x0)
__fstat50(r0, &(0x7f0000001280))


mknod(&(0x7f0000000280)='./file0\x00', 0x1ffa, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
r1 = getpid()
fcntl$setown(r0, 0x6, r1)
r2 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
ioctl$FIOSETOWN(r2, 0x8004667c, &(0x7f00000000c0))


r0 = open(&(0x7f0000000280)='./file0\x00', 0x615, 0x0)
compat_43_ommap(&(0x7f0000fff000/0x1000)=nil, 0x1000, 0x1, 0x0, r0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000001240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0\x00')
rename(0x0, 0x0)
fchownat(0xffffffffffffff9c, &(0x7f0000000100)='./file1\x00', 0x0, 0x0, 0x0)


_lwp_ctl(0x2462, 0x0)


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x20007447, 0x0)


recvmsg(0xffffffffffffffff, 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
r0 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r1 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r1, 0xc0104304, &(0x7f00000001c0))
fchdir(r0)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
socket$unix(0x1, 0x1, 0x0)
socket$unix(0x1, 0x1, 0x0)
setrlimit(0x8, &(0x7f0000000980)={0x7, 0x54})
socketpair$unix(0x1, 0x1, 0x0, 0x0)


mknod(&(0x7f0000000400)='./file0\x00', 0x2000, 0x287e)
open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x3e, &(0x7f0000000000)="5ab7776a", 0x4)
r1 = socket$inet6(0x18, 0x3, 0x0)
dup2(r0, r1)
getsockopt(r1, 0x29, 0x23, 0x0, 0x0)


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
ioctl$FIOGETBMAP(r0, 0x4010647f, 0x0)


open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)
truncate(&(0x7f0000000440)='./file0\x00', 0x0, 0x3e2b649f)
connect$unix(0xffffffffffffffff, 0x0, 0x0)
mknod(0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x0, 0x0)
readv(r0, &(0x7f00000003c0)=[{&(0x7f0000000300)=""/53, 0x35}], 0x1)


r0 = socket(0x1f, 0x3, 0x0)
getsockname$inet6(r0, 0x0, 0x0)


r0 = socket(0x1f, 0x1, 0x0)
getsockopt$inet_opts(r0, 0x3, 0x0, 0x0, 0x0)


sync()


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000180)='coda\x00', &(0x7f0000000380)='./file0\x00', 0x0, 0x0, 0x0)


r0 = socket$inet6(0x18, 0x3, 0x0)
sendto$inet6(r0, &(0x7f0000000000)="86", 0x329, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


r0 = socket(0x2, 0x3, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x1001, &(0x7f0000000200)=0xce, 0x4)


symlink(&(0x7f0000000180)='\x00', &(0x7f00000001c0)='./file0\x00')
execve(&(0x7f00000004c0)='./file0/file0\x00', 0x0, 0x0)


connect$unix(0xffffffffffffffff, &(0x7f0000000080)=@abs={0x1, 0x0, 0x3}, 0x8)
r0 = socket(0x2, 0x3, 0x0)
getsockopt(r0, 0x0, 0x65, 0x0, 0x0)
bind(r0, &(0x7f0000000040), 0xc)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket(0x2, 0x1, 0x0)
connect$inet(r1, &(0x7f0000000000), 0x10)
r2 = socket(0x18, 0x1, 0x0)
r3 = socket(0x18, 0x4, 0x4)
setsockopt(r3, 0x1000000029, 0x36, &(0x7f0000000000)='\x00\x00\x00\x00', 0x4)
setrlimit(0x8, &(0x7f0000000980)={0x7, 0x54})
socketpair$unix(0x1, 0x1, 0x0, &(0x7f00000002c0)={0xffffffffffffffff, <r4=>0xffffffffffffffff})
sendmsg$unix(r4, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, 0x0, 0x28}, 0x0)
socket(0x18, 0x2, 0x0)
dup(0xffffffffffffffff)
mlock(&(0x7f0000ffb000/0x3000)=nil, 0x3000)
munlockall()
socketpair$unix(0x1, 0x1, 0x0, 0x0)
setsockopt$sock_int(r3, 0xffff, 0x1, &(0x7f0000001240)=0x6, 0x4)
getsockname$inet(0xffffffffffffffff, 0x0, 0x0)
dup2(0xffffffffffffffff, r2)
mknod(&(0x7f0000000300)='./file0\x00', 0x8000, 0x6da)
open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
acct(0x0)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r0, &(0x7f0000000080), &(0x7f0000000000)=0xfffffffffffffe22)
r1 = socket(0x800000018, 0x1, 0x0)
bind$unix(r1, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
r2 = socket(0x800000018, 0x1, 0x0)
bind$unix(r2, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


modctl$MODCTL_UNLOAD(0x2, 0x0)
openat$tprof(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)


mmap(&(0x7f0000000000/0xff5000)=nil, 0xff5000, 0x0, 0x200000005c832, 0xffffffffffffffff, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x7)


symlink(&(0x7f000001fb80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/file0\x00', &(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
link(&(0x7f0000000000)='./file0\x00', &(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/file0\x00')


setrlimit(0x9, &(0x7f00000010c0))
socket(0x18, 0x400000002, 0x0)


setsockopt$inet_opts(0xffffffffffffffff, 0x0, 0x0, &(0x7f0000000180)="33b62b53cc518098fb586f8654ccabac394a21176475c8d7ab", 0x19)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80206975, &(0x7f0000000180)=0x8000000000000032)


__mount50(&(0x7f0000000100)='ptyfs\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000000)='K', 0x1)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x20000, 0x0)


mmap(&(0x7f0000ffc000/0x1000)=nil, 0x1000, 0x0, 0xdce40ff13ef698d8, 0xffffffffffffffff, 0x0, 0x0)


__mount50(&(0x7f00000002c0)='union\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000100)='&', 0x1)


setsockopt(0xffffffffffffffff, 0x0, 0x0, &(0x7f0000000080)='+', 0x1)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
fcntl$setflags(0xffffffffffffffff, 0x2, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = shmget$private(0x0, 0x1000, 0x0, &(0x7f00003d5000/0x1000)=nil)
shmctl$IPC_SET(r0, 0x1, &(0x7f00000016c0)={{0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}})


mmap(&(0x7f0000000000/0x13000)=nil, 0xfffffffffffff000, 0x0, 0x10, 0xffffffffffffffff, 0x0, 0x0)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x1}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x0, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


socket$inet(0x2, 0x0, 0x0)
mknod(0x0, 0x0, 0x0)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80206913, &(0x7f0000000180))
mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r1 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r1, 0x40047463, 0x0)


__mount50(&(0x7f0000000000)='ntfs\x00', &(0x7f0000000080)='.\x00', 0x0, 0x0, 0x0)


r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(r0, &(0x7f0000000280)=[{&(0x7f00000001c0)='#', 0x1}], 0x1, 0x0)


__mount50(&(0x7f00000002c0)='overlay\x00', 0x0, 0x0, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvmsg(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000640)=[{&(0x7f00000001c0)=""/23, 0x17}], 0x1, 0x0}, 0x42)
recvmmsg(r0, &(0x7f0000000040)={0x0}, 0x10, 0x0, 0x0)
r2 = socket$inet(0x2, 0x2, 0x0)
dup2(r2, r1)


shmat(0xffffffffffffffff, &(0x7f0000ffa000/0x2000)=nil, 0x0)


compat_50_clock_getres(0x0, &(0x7f0000000a40))


r0 = socket(0x1f, 0x5, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8020690c, &(0x7f0000000180)=0x8000000000000032)


socket(0x23, 0x0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000427e, 0x0)
ioctl$WSKBDIO_BELL(r0, 0x4080426f)


r0 = socket(0x10, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x40046679, &(0x7f0000000180))


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000340)='./file0\x00', 0x0, 0xf, r0)
r1 = open(&(0x7f0000000000)='.\x00', 0x0, 0x0)
unlinkat(r1, &(0x7f0000000280)='./file0\x00', 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000080)='./file0\x00', 0x8)


compat_43_stat43(&(0x7f0000000080)='./file0\x00', 0x0)


mknod(&(0x7f0000000200)='./file0\x00', 0x6000, 0x1003)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x1, 0x0)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
lchown(0x0, 0x0, 0xffffffffffffffff)
pipe(&(0x7f0000000800)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0xf, r1)
__fhstatvfs190(0x0, 0x0, 0x0, 0x0)


compat_43_osethostid(0x0)


mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x1000, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
open(0x0, 0x0, 0x0)
__lstat50(&(0x7f0000000000)='./file0\x00', &(0x7f0000000080))


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x4008426f, &(0x7f0000000080))


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
____semctl50$GETNCNT(0x0, 0x0, 0x3)
fcntl$lock(0xffffffffffffffff, 0x0, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
acct(&(0x7f0000000000)='./file0\x00')


r0 = socket(0x2, 0x1, 0x0)
fchdir(r0)


socket$inet(0x2, 0x0, 0x0)
socket$unix(0x1, 0x0, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
madvise(&(0x7f0000ffe000/0x2000)=nil, 0x2000, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
chroot(&(0x7f0000000000)='./file0\x00')
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21ac03, &(0x7f0000000140), 0x30)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
unlink(0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000300)='./file0\x00')
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0xc004667a, &(0x7f0000000180)={0x0, 0x0})


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
read(r0, &(0x7f0000000180)=""/54, 0x36)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0)=0x20000002)


compat_40_mount(&(0x7f0000000080)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
readv(r0, &(0x7f0000000140)=[{0x0}], 0x1)


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', 0x0, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
mknod(&(0x7f0000000040)='./bus\x00', 0x0, 0x2802)


munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
__msync13(&(0x7f0000001000/0x4000)=nil, 0x0, 0x1)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
write(r0, &(0x7f0000000040)="16", 0x1)
r1 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
read(r1, &(0x7f00000002c0)=""/82, 0x52)


socket$inet(0x2, 0x4000000000000001, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
socket(0x10, 0x2, 0x0)
close(0x4)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0/file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f00000000c0)='cd9660\x00', &(0x7f0000000000)='./file0/file0\x00', 0x8000005, &(0x7f00000001c0))


mknod(&(0x7f0000000400)='./file0\x00', 0x2000, 0x287e)
r0 = open$dir(&(0x7f0000000b80)='./file0\x00', 0x0, 0x0)
shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
compat_50___shmctl13$IPC_RMID(0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8004667d, &(0x7f0000000200)=0x20)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f0000000000)='.\x00', 0x0, &(0x7f0000000140)="01")
r0 = socket$inet6(0x18, 0x3, 0x0)
poll(&(0x7f0000000000)=[{r0}], 0x1, 0x0)
swapctl$SWAP_ON(0x1, &(0x7f0000000000), 0x0)


mknod(&(0x7f0000000280)='./file0\x00', 0x1ffa, 0x0)
open(&(0x7f00000001c0)='./file0\x00', 0x6aaaf9f55adc2226, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$inet(0x2, 0x3, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x1004, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
r0 = msgget$private(0x0, 0x0)
msgctl$IPC_SET(r0, 0x1, &(0x7f0000002f80)={{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7}, 0x6, 0x2, 0x0, 0x0, 0xb9, 0x80, 0x3, 0x5})
compat_50___msgctl13$IPC_STAT(r0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)={0x0, 0x10001}}})
lchown(0x0, 0x0, 0xffffffffffffffff)
compat_40_mount(0x0, 0x0, 0x0, 0x0)
unlink(0x0)
open(0x0, 0x0, 0x0)
socket$inet6(0x18, 0x0, 0x0)
sendto$inet6(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
chmod(0x0, 0x0)
r1 = socket(0x18, 0x2, 0x0)
close(r1)
r2 = socket(0x18, 0x3, 0x0)
setsockopt(r2, 0x1000000029, 0x32, &(0x7f0000000440)="b211d7170d816685c8e360f2fa41c1a0946988b272a746e337b372e93320cff6669cbe7868de45ed3fc33719ca6df71ecec8a900108b2c10a1f8c66653b276e180e9cb9b21f9982230f575295d48889c98000796b2dd921a4975680b37ba955d2c15e6d7c9198ed900ab006ddfb6f869b51a2216114d1ece85f593e74035f5bc054eb1dbddf42a004000000000000000", 0x90)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
writev(r2, &(0x7f0000000680)=[{&(0x7f0000000000)="2f87bb4098d7de56", 0x8}], 0x1)


compat_43_ogetpeername(0xffffffffffffffff, 0x0, &(0x7f0000000080))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x6, &(0x7f0000000040)="285c8f02", 0x4)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
__posix_rename(0x0, 0x0)
socket$unix(0x1, 0x5, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
openat(r0, &(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
____semctl50$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000b40)=@array=&(0x7f0000000b00))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
unmount(&(0x7f0000000200)='./file0/../file0\x00', 0x0)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000400)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
symlink(0x0, &(0x7f0000000240)='./file0\x00')
sendto$unix(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x1001, &(0x7f0000000000), &(0x7f0000000040)=0x4)


rasctl(&(0x7f0000000100), 0x5, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
syz_usb_connect(0x0, 0x24, &(0x7f0000000080)={{0x12, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x2, 0x3, 0x1, [{{0x9, 0x2, 0x12, 0x1, 0x0, 0x0, 0x0, 0x0, [{}]}}]}}, 0x0)


setrlimit(0x3, &(0x7f0000000240)={0x0, 0xffffffff})


r0 = socket(0x2, 0x3, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
fpathconf(r0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x29, 0x16, &(0x7f0000000000)="02000000", 0x4)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)


pwritev(0xffffffffffffffff, &(0x7f0000002040)=[{&(0x7f0000000000)="4c64736ea91f41cd8ceed2545e1edd804faa2b995b0de3434bfba0c9da920b6b9690a5a53d79d19d29a107be39d0b0169890030211d8a55e5f8d2d0e49a80ee8690743c2a951c93d3248f036bf1599d406ecc9feea2d3a8fa90974932e4564ad2283aa2a31c12c19cf28ac5980cb01602133f1b4cf25eb3a1f487652c7fa6dfa2b6fe96420d022f7f99800b2fb707790fcbdc0dbc38b476ece9bb1486151d5433baef33a760d96f1c2cb3a519d6830a98047774a8c1abfe97d", 0xb9}], 0x1, 0x0)
r0 = compat_30_socket(0x22, 0x3, 0x0)
compat_43_osend(r0, &(0x7f0000000040)="58030707", 0x358, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
mkdirat(0xffffffffffffff9c, &(0x7f0000000680)='./file0\x00', 0x0)
mkdir(0x0, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mkdir(&(0x7f0000000040)='./file2\x00', 0x0)
writev(0xffffffffffffffff, 0x0, 0x0)
mkdir(&(0x7f0000000300)='./file2/file0\x00', 0x0)
rename(&(0x7f00000002c0)='./file2/file0\x00', &(0x7f0000000340)='./file0\x00')
openat(r0, &(0x7f0000000000)='./file2\x00', 0x0, 0x0)


sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={0x0, 0x0, &(0x7f00000017c0)=[{&(0x7f00000001c0)}], 0x1}, 0x0)
open(&(0x7f0000000000)='./file0\x00', 0x78e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendto(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
recvmsg(r1, &(0x7f0000002880)={&(0x7f00000014c0), 0xc, 0x0, 0x0, 0x0}, 0x0)


r0 = socket(0x2, 0x3, 0x0)
__fstat50(r0, &(0x7f0000000540))


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
poll(0xffffffffffffffff, 0xffffff90, 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x0}, 0x10)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x0}, 0x10)
shutdown(r0, 0x2)
shutdown(r0, 0x1)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000300)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_50_select(0x40, &(0x7f0000000080)={0xffffffff}, 0x0, 0x0, 0x0)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000640)={0x0, 0x0, 0x0, 0x0, &(0x7f00000000c0)=ANY=[@ANYBLOB="18000000ffff000001"], 0x18}, 0x0)
sendmmsg(r0, &(0x7f0000000480)={0x0}, 0x10, 0x0, 0x0)
compat_43_orecvmsg(r1, &(0x7f0000000380)={0x0, 0x0, &(0x7f00000002c0)={0x0}, 0x10, &(0x7f0000000300)=""/68, 0x44}, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0/file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000340)={0x0, 0x0, 0x0, 0x100000001})


r0 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
shmat(r0, &(0x7f0000ffd000/0x1000)=nil, 0x0)
r1 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
shmat(r1, &(0x7f0000ffd000/0x1000)=nil, 0x0)
fork()
shmat(r0, &(0x7f0000ffc000/0x4000)=nil, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', 0x0, 0x0, 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0x80145003, &(0x7f0000000000)={0x0, 0x0})


r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
r2 = dup3(r1, r0, 0x0)
getsockopt$inet_opts(r2, 0x0, 0x1, &(0x7f00000000c0)=""/156, &(0x7f0000000180)=0x9c)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
r1 = posix_spawn(0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f0000000180))
compat_50_wait4(r1, 0x0, 0x0, 0x0)


r0 = open(&(0x7f0000000180)='./bus\x00', 0x14927e, 0x0)
mmap(&(0x7f0000000000/0x600000)=nil, 0x600000, 0x27fffff, 0x4002011, r0, 0x0, 0x0)


____semctl50$SETALL(0x0, 0x0, 0x9, 0xfffffffffffffffe)


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
fcntl$setstatus(r0, 0xc, 0x48)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x1b, r0, 0x0, 0x0)


r0 = open$dir(&(0x7f0000000080)='.\x00', 0x0, 0x0)
fchflags(r0, 0x0)


r0 = socket(0x800000018, 0x2, 0x0)
r1 = socket(0x18, 0x2, 0x0)
close(r1)
r2 = socket(0x800000018, 0x2, 0x0)
bind$unix(r2, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0))
compat_50___stat30(&(0x7f0000000000)='./file0\x00', &(0x7f0000000300))


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8090697f, &(0x7f0000000180)=0x8000000000000032)


__select50(0x0, 0x0, &(0x7f0000000100), 0x0, &(0x7f0000000180)={0x0, 0x6})


modctl$MODCTL_LOAD(0x5, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104308, &(0x7f00000001c0)=0x20000002)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff})
__clone(0x0, &(0x7f0000000040))
mlock(&(0x7f000000a000/0x4000)=nil, 0x4000)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000080)='./file0\x00', 0x11)


r0 = socket(0x18, 0x2, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r0, &(0x7f00000000c0), &(0x7f0000000000)=0xc)
r1 = socket(0x18, 0x2, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getpeername$unix(r1, 0x0, &(0x7f0000000100))


setuid(0xee01)
ptrace(0x9, 0x0, 0x0, 0x0)


rasctl(&(0x7f0000001400), 0xd39, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
rasctl(0x0, 0x9, 0x0)
rasctl(&(0x7f0000000100), 0x3, 0x0)


pipe2(&(0x7f0000000280), 0x0)
getsid(0x0)
ptrace(0xc36, 0x0, 0x0, 0x0)
mknod(&(0x7f0000000180)='./file0\x00', 0x2000, 0x0)
open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
compat_20_getfsstat(&(0x7f0000000000), 0x138, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0)=0x20000002)


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x40000802)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40047400, 0x0)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0x8004667e, &(0x7f0000000100)=0xffffffff)


setreuid(0x0, 0xee01)
compat_30_fhstatvfs1(0x0, 0x0, 0x0)


mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0xa718)
compat_40_mount(0x0, 0x0, 0x0, 0x0)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
fchmodat(r0, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000280)='./file0\x00', 0x1ffa, 0x0)
compat_30_getfh(&(0x7f0000000280)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x80000, 0x20)
geteuid()
r1 = semget$private(0x0, 0x7, 0x3c0)
semctl$SETALL(r1, 0x0, 0x9, &(0x7f00000002c0))
semctl$SETALL(r1, 0x0, 0x9, 0x0)
semop(r1, &(0x7f0000000080)=[{0x1, 0x43, 0x800}, {0x4, 0xe6, 0x1800}, {0x0, 0x101}, {0x1, 0x20, 0x800}, {0x2, 0x5, 0x1800}, {0x1, 0x9e, 0x1000}, {}], 0x7)
r2 = getuid()
setuid(r2)
r3 = getegid()
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0xfff, 0x0, r3, 0x0, 0x0, 0x180}, 0x0, 0x7, 0x0, 0x0, 0x5b1428f4, 0x6, 0x2000})
semop(r1, &(0x7f00000001c0)=[{0x2, 0x2100, 0x1800}, {0x4, 0x5, 0x400}, {0x1, 0x2006, 0x800}, {0x1, 0x266, 0x800}, {0x1, 0x8, 0x1000}], 0x2aaaace1)
semctl$IPC_SET(r1, 0x0, 0x1, &(0x7f0000000500)={{0x0, 0xffffffffffffffff, 0x0, r2, r3, 0x1a4, 0x4}})
getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x11, 0x0, 0x0)
__posix_chown(0x0, 0x0, r3)
dup(r0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000000)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f00000003c0)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
ioctl$WSMUXIO_INJECTEVENT(r0, 0xc0104603, &(0x7f0000000080))


mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
lseek(r0, 0x0, 0x0, 0x0)


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
writev(r0, &(0x7f0000001480)=[{&(0x7f0000001240)="cc60d3d79a1a7122a5fb9de9ea6ca0b30fb2942ef7652736f10f86d45ae419b2ecf7af93d33b3de1f2c56c0d93da7484095d0d543baf7f0161a025c9c02a561f9a76d06f1eaec7d8df99d6c28b10fa891e47d6c3bba690a611889032ac7b3581d7c92d1c546e59764f069048316871794f0615dd4d8bb1057a61fb177be649004fb56a7d46c856a20d73f050bf29d58b803a676cc3761e8a47857f51d90a4465f9c605ab2829d78ea2571900", 0xac}, {&(0x7f0000001200)="b389e5ae", 0x4}, {&(0x7f0000000200)="5ff08de84b4c523a583f91557c748a02cda4cedf13579996073843ec5cbabaadcec85d62a9a897a78307f3d2c30eea6356105ec8aaba86930f1aed0b3d111d", 0x3f}, {&(0x7f0000000240)="2a5f4cd9b73c31d629e8d68e605e5940744ef138960b671d42c56f79fcb70549247e5504e02f85a6099eff9291731e94399a7e966e783d4073ecd5dc8e540f32190bb47050aee3907495ecf7708d1c295a0836f79ba03ea13db12836c2441989726ffbf7fd1da55bfe2caaf081a1ef55599935c0e60cb0866ee48d420b77a6e269bc30b2650349e16792252c6be8bcc6e9ce2b941299ab505e4779fa1960bb7cf924dcbc53c82e76bcd80731a694d12e8a139d8ecef6f5560939054b7842d25cd7396377f39bcfe2a2383feea17b87393cd892b90742631718b1877d8d83d5f66587e3e5b1e129433ead750143a4c51aa46e78e3b1a54a4634eea2846511719b1637b19ea8c96a95332abd69014f53e91a7319a36efb4052387dc6330aa00b5680142650e8dab5eb6b83ff7d0bbf1293338a8d3e8a6dcd7ed4916279eac230dda46e6fcdebe8a4c12d598883d9392da05791e153d905ce1930aebfc5dfc0fc22a88777c9fbc24c0b021189c3503466bf850f59e6649dd0b40d935ba155a9d7e065df6d90a500be9d7e1e77607d495898a8d6c9c48dda88533e75e760dceae91d670f97ebbe68b845c72eb698a9e0389b15cfe768cd4e524dbe939bc2d8363ae3b71a454a0a0378cd1f36565ab9d1b92263688195df65151ee9b8158c7a5280019a38f00ea05142b3cc832ece46e326e36a55d7a30f866ca97da61a3877612f0d9f23c052edf60a430d3a9acbae3d9f14f22f38a1eeac7df37ae17748a960b9e0d265bf6cdec73bd2a7b5f25035d845d936936cbfe6f013f8d7c73eba2bd9876ff6db3224e50e75c051979f4cdc41d28ea54e28f560fb0291cd6b6ba527e0f358ffa46b988f0bdbf801063d2024380071709ac00ed463e08fd1718a857b567d8b3eda143915e943d9530d25ed15743e117be375dc08e241328a53fa05b58c7c72a35d471a72706afce82b6c57eef81444269e06338bfd0af6f6ef6479dcc26aab0b10c7e58572aa507a69831d63c78c947cecda2c57bb8ae6b2ad7862d160b423de941be69928467db10ee4817536196ad0c5cc02a74ba1a1fb467e472f025f200846112b6b3efd298284d8c0cc498fb9203db18e48e34c137d1285120d809a774d56a62b3dbe8f202cb9cf18c33099f5b9651fc7ae41de53842f327fd40bcfaa601a791da1103351a64cc9815ff1f32cde6fbfe4c92f55a9aff41c055d4edce9c656ecd1becb9035bb8bd46cf47bca0dcb1848da94d05d1475c83a48c02f8f808597eaa9e88a4e4b12d0ef95458eeab115cb189fa7e368186bb68610af4651e27feab061c4ada46e8c12d6d33fe32cd40485b02120418a9aaba8f04c17ea1abdcb9e6801f5d8090c0780b3b5ae4382efbb64f62c825527b7e3d60ad9c73ba7f174ea7c7f2d155808052a0ef6ab2af85cf9a758a8c1227c85f2986fc06c5f54807968b18b6b5f9168d0b2de2728689d75f0efaedf79bd8bbbbc1d46b0f6072d46d46687bc92c0b16c4ad36c3fe4c7a2fde0cfec859052dd78d841464a6b4432342ea973bf18f1f5abd7b2fe7962faef39ff38bd88a40dbab0f8c08541e38efe924f9ec1addbd47362e349bfca42fd62cf18fef8db7d3cbf0b1733d37dcdae5b9c738b8426a1cfa205c9671870210508c614f256d199fb822613518b1a6fccfee08c48f0feae1ca4baaf0a02635282d4f630a8df115dc73e72d9f7b8feb4333b2c796f1d3f2cfb398ba1e9c81584185b8b578bac3c64d667da7d91062f0c2ccec0c9d09e0c15c62e7bc509c3204d054c22d4eac685626e48e25cff2fc382279d9cb941c2ca182b1f65b05ff05db880a37b1e7f9b3d8975558858352bbbea9213a885b7d43451230e0624403baa77839baa33ae92de771ca76319c2353e9f98ea8191fbb686aed6d08e87bcc2bb8084ae02fafbac1a0bba93f0f155fa2d7bc21bf77f116429c6a1373c34d080518b73481fcf012c0a9127e8f174695ccd4add75b026bf2935990d5cebc4e74b97ae67a414029252336880232635fce7ed7bf0b069e08e10f7e1cc4d2973dcd2ea0b63f9dc4f4c2db92e03c553afcb1060b1
2f6b9b4d1d799edceefdfafdae63ad797e400645c149b20f8a10af54db3289441499e122413495c7b32179ab6c274bbdf6dde48ce26f46f07ede9a314c1a73669ee8cb0d854c8a3206dcd4f532d36fa7affca1226962e20ca2c6ca0da1ab0848d23d6f198681911592815c080cead9fe6c81cf0d7ce3578978dbf1dc2f1902e51f740f95b02ebff4cf25ac8d28b8e16998f5e329e9a377a82b5bc227f77c2f1ad505286aac8618799c2363a2d07d229519370be62ef0dde6525df45a7b807759b9ba1c8557bdbecb429432c4e1a84094edda567f1245ba887686342d0a31b7dbe09bd2be75516b255d2bfad9337649e86a29b0a19e16b4059c7c45e9f81db6780af6487a11f9088ea265a99d6e95535e14fe839945f8f7da9bf9142dd12c305bd41bb392cae8eb81b28fda176d4b538f4ddaaed47913cd9149d934ca36879718bbd76a2d7ce11e6294398ea433af6cf53b951974ec96a9527178b6cbad3088a2e2dc75113fa88ede462e68b4d6b63020a45e94fbc6c3e6677d6cffee0e3bf606cd2d4b9fac89f91db8f3a1fb9f506708314249fa5c36912e3cb281acbdec3b080735ae24ebb4bca3c7900c21175435afbe5024a706e90f8ecf257c99cd8b67b5a9c2bdc2bb624201f55882349526f8faf463eecb183635db72a6aef5fa2048d97e6f6c513598f49158050df1efc6b13b8ae396cd6a905c26b94d1d50d361f10c0362a8c206c7dd553dd8b116694aa82ac029c64aa71b8a307793da12f93ce030d3c42443c37b91043687befb7968da97ffe91d45e4fd506ba4d9c5d7fe5ac7433fb6bb3ce471f59ade734d109d2093fa119827b65a360f2128e7e85ab494609b60d6e236ed8a72984f6c75652f5baa07e2e921365af2b819eef58343e65ffb45ba8580d62f5b8a9988f7176c1559f8ede3b1b92e0ff27978fc5225ed1512b605f1fd4fce46e768970b11a7b4802c93eb05a8571887cdc15f69929eadcd1f9d08ed2d6aab2997ffa2102c2d8983a78ad43779d03aa612ecbbdde634b14e980b176f04f99efad82a04001f34d6addb89e0ce629856f5ad401fed1a2c046e1fe18741b24d4253830af94a1defeecdf6f70b430e48d7b2e4ae2fb331322d96e1a361e4dcae1d0865e7ccc4aa58bcbb3ad1ce0b294437887e03088a6bb4c92c3044bb33d2a893975023ad979fb2f84d4842326cf058595398779116245c452e54989df0591238af63c5b187d18457f008f2759ae470282ee4640e92639a46915fb67f22967beb1f46736aa9c90df08f39ee481f6859f9eafdecd473bdc24ef3ad71b4c54dde360f2d550db18af7589c8107f29b0a713b65f039da10c2dc55e408ce75c77aba97c116ab078ed1467464fe4041668228316f0fff7c0e0f1a3cc47aae570182197cc3a030a4af7fe2d401b5a13d59b8fe55d48b0667b782cbf1c2e9e491167d87830bb25008aff93c4459a3ce9fd8655a542c6311d91ec6b6bd7935aba1cd01d10439f5e5a3883566c8538edb97c5d2a7c7533d415973729f52dc03134f3d777b7ad07c0f5b0d985ca48fde6cce9ec99f1223f476dcfbb5ef865a3f54459415b25e2031eeb469c83ac68fd77e59c6b6a4b9543a943c4461e47e94d038c63abf2734a3c0e99b95505e13af6cf7b66840dc60e96fb218436ea00b4a577c2effdba43e6adf97cb47c0ce606e87a7834dec04396c1e4f7df59ae5acedfd1d68c87e405fc5d788884686270cb08f0fb15fe8b11589792f9afc11152cd45f43fac6e758588cfaa88874267c3096bd689108bbb0296f3d182df7d46cf7e6b8bb923ba887443c3d2211785e364c1e55d4e9396d950e50cc11ba9a4da70a64f6acbbcd421183eb2c0521054c4f39650409e21a9d3b781725b0c38abce7b4615fa64a9cbf71a19f7e414279a7494d2e66e4e2b2d5ce7027109cfc763a9dcac37065ae80ec0cf3962deec0f9eba3bd7c514a9854ddec6542cf60a1e69cf5d1a73239633675d4ac1f3b0758e69d1d9fb8659f43f07ef6b0b2a7c9740649198b44f3272a8810d374b00c1d49b7f667ed538d89e91e5734f8d6e270aef0a406fcb4072497dd5f25b49d31cfd64f7af2d5b4df72b74c39a66f442ae61aebe1cdb96b660e3248dad3d05bfce3d8630902a565c3a7636220e0435679ce1c45367570dd08722ddfe7f1bf56c44bcab8ef2d0979eeaa569dade4ec34751833d455b31b55edcb9b64f3045725fcd3a08ce0c20b09a437133df5738092eba91430208814b4135b0a64331aff1847ca60426c91e96a029cff039b14462da6e5f9c545980b3712f7fa5f3958d92278194cf6b37ef76e91103d14db403956a7070d87afa81df227f2189926e1265a2872e9d64c64496083caff36dd3c27e961957acb86f0f9a3cbb5ddbb6bf9c62dc102d70d62eea969b1c2c5f42823f76827e918ffa77cc12ca8d31effd0b55529c496da762b6a95d43360ea4e811979ee316e143733f886a6eca5ac76746d4a302ec57b6b4af57c25631d77732
43abd400d15274703154dfde2f1a63e075e4b22ed4516dd1496e3f7848bbf19e3215edda64140649d9b1a06975773d5dd0ca9c4bdfd5eb9fb6164aa659e2a8415098dd05496d0b2a20d966a4c9efefa964e007a3b731d3bad501fd21ca564c6155d893c3075c234af6378a12595d8a88ae0518350b75d57fb91b7610bfc3168f548a9a63432b49a9dbee2ee1a4ddd80255e2c655cc80d59d7e90e21fb090db6d59ef84c0becfda6bd01c5a806ce67bea77fdba62ba40e2114d31e6087ad67b28adc20d7cabe7911aa2db119cf5f43b2011bb92556e8e303a264f813764027fbec36f8248a667c766cfc12c80d96965f222d39b5f58a54b255d3b1eb4530952b6d251ff1f9a2cbc585d539704c8ea6884699ebc95d57cd1daef169da86cc10fe7300352e939c35bbe3104aea1b1c681f3e98be8fad7c91bb56724ffa794ea4ac7c4d51d4aad8e94087b4cdcab897e70558e6648ab42dfadab447735912af011fdbd9a47a4958b953b01d2e274449854cb7d4b44d88fb2db162b8dd664c7c32858c7714c69b838cb30b17e6a42ca760256c4c6da57ad5be955ca8c787262d8bd3f4a5f0170e70f92ee2ec9d227d066e41d0d4b640166b2f6937a66bc84506d102d3edac818b246954ba200a3057820da10b64ff99f016b42d90a29b43290d731ef23a72612d646f92acc05620068fd94edd12902197733cc9dbb55d95930ccc749c4a169b4b7056dba24f767da022eb01243353e1f4d0af2eb32e3feb331283ceab3e0112b37556749a701d11ada5cbe1b4c829fac46e12612679e47d4a066be939b886f0fdc2b4c5b4dd183ca025c5191ac67d7f992643333504da30e8300a0b40152671", 0xf12}], 0x4)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x2, 0x11, r0, 0x0, 0x0)
setgroups(0x1, &(0x7f0000000080)=[0x0])
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x4)
munmap(&(0x7f0000000000/0x3000)=nil, 0x3000)


r0 = socket$inet6(0x18, 0x3, 0x0)
pathconf(0x0, 0x0)
sendto$inet6(r0, &(0x7f0000000000)="82", 0x329, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


mknod(&(0x7f0000001200)='./file0\x00', 0x2000, 0x400)
r0 = open$dir(&(0x7f0000000400)='./file0\x00', 0x0, 0x0)
preadv(r0, &(0x7f00000012c0)=[{&(0x7f00000001c0)=""/224, 0xfffffdd5}], 0x1, 0x0)


r0 = getppid()
setpriority(0x1, r0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0xffffffffffffffff)


setrlimit(0x9, &(0x7f00000010c0))
socketpair$unix(0x2, 0x3, 0x88, 0x0)


open(&(0x7f0000000000)='./file0\x00', 0x78e, 0x0)
preadv(0xffffffffffffffff, &(0x7f0000000300)=[{&(0x7f00000000c0)=""/104}, {&(0x7f0000000140)=""/192}, {&(0x7f0000000340)=""/175}, {&(0x7f0000000000)=""/18}], 0x10000000000002b8, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendto(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
recvmsg(r1, &(0x7f0000002880)={&(0x7f00000014c0), 0xc, 0x0, 0x0, 0x0}, 0x0)


r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
writev(0xffffffffffffffff, 0x0, 0x0)
r1 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x0, 0x10, r1, 0x0, 0x0)
ioctl$FIONREAD(r0, 0x4004667f, 0x0)


r0 = socket(0x18, 0x1, 0x0)
open$dir(0x0, 0x0, 0x0)
ioctl$FIOASYNC(0xffffffffffffffff, 0x80104305, 0x0)
sendmmsg(r0, &(0x7f00000028c0)={&(0x7f0000002880)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x7aa}, 0x10, 0x0, 0x0)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffff9c, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x3}})
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x8000000000000001})
semctl$SETALL(0x0, 0x0, 0x9, &(0x7f0000000040)=[0x7ff])
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


recvmmsg(0xffffffffffffffff, &(0x7f0000000740), 0x1, 0x0, 0x0)
madvise(&(0x7f0000000000/0x600000)=nil, 0x60005f, 0x19)
madvise(&(0x7f0000000000/0x600000)=nil, 0x600003, 0x15)
sendmsg(0xffffffffffffffff, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x0, &(0x7f00000000c0)=ANY=[@ANYBLOB="e8000000000000000b210000ff3f7c081e0f315b91fcaec7bf495d5c618332756cbb1bb9ce6d12b9d976d1f33aca41e50a3342bcd67c311f7885a05c3fcf2ae21f1498ec481e7ca2c3ca4c7b3bf94448f62e111e5a79929b9182cc977ba6ae766ce37bdaac6da997fbc15f0c79f42155b99a280667b51fdc7902d7be5ef41f953fedb32aceeada13250626957eff13d5b12cc916541ccbeb0d4060a4dd89664eaba2f6b4ede0c9e3dc1c9446d9284ebe0e46eee7bc145ff0a2779c025553298812978ea53a8c60f254f23344a80a0aac7b141787bad6b0ba090000005f2f3158f0d200000000000070000000000000000701000040000000afbb30c2946e41ef3167d1f6ed47aa1f52bad114a89dbed741f74a23cd8d915e2dcc74a4932646b90f90a9d3956d5cadb642ac79fcb0aae3654482188263abd27e9d57cc28032453dc75f333e1f367ab38b7e7719805a454e79802d07ec60c00b0000000000000000100000001"], 0x208}, 0x0)
socketpair(0x1, 0x5, 0x0, &(0x7f0000000240)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg(r0, &(0x7f0000000300)={0x0, 0x0, 0x0, 0x0, &(0x7f00000000c0)=ANY=[], 0x208}, 0x0)
recvmmsg(0xffffffffffffffff, &(0x7f0000007600), 0x1, 0x0, 0x0)
recvmmsg(r1, &(0x7f0000001dc0), 0x1800, 0x2002, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f0000000600)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)


open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
writev(r0, &(0x7f0000000340)=[{&(0x7f0000000000), 0x2cfea}], 0x1000000000000013)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(r0, &(0x7f0000000180)=[{0x0}], 0x1, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x40000802)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80017472, &(0x7f0000000080))


r0 = socket$unix(0x1, 0x2, 0x0)
bind$unix(r0, &(0x7f0000000200)=@file={0xd570d0466b6018f, './file0\x00'}, 0xa)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r1 = socket$unix(0x1, 0x5, 0x0)
connect$unix(r1, &(0x7f0000000000)=@file={0xd1653077bafa0114, './file0\x00'}, 0xa)
close(r0)


r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc020690d, &(0x7f0000000180)=0x8000000000000032)
r1 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r1, 0xc118691d, &(0x7f0000000180)=0x8000000000000032)


poll(&(0x7f0000000000)=[{}, {}], 0x2, 0x0)
modctl$MODCTL_UNLOAD(0x2, &(0x7f0000000000))


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x0, 0x10, r0, 0x0, 0x0)
__select50(0x0, 0x0, &(0x7f0000000100), 0x0, &(0x7f0000000180))


compat_40_mount(&(0x7f0000000200)='ptyfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000a80))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pathconf(&(0x7f0000000040)='./file0\x00', 0x2)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x1000, 0x0)
open(&(0x7f0000000240)='./file0\x00', 0x1, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, &(0x7f0000000000)=0x9455, 0x4)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__select50(0x40, &(0x7f0000000000), 0x0, 0x0, 0x0)


pipe(0x0)
mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
socket$inet(0x2, 0x0, 0x0)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
write(r0, &(0x7f0000000080)="8e", 0x4c0)


setuid(0xee01)
execve(0xffffffffffffffff, 0x0, 0x0)


mknod(&(0x7f0000000080)='./bus\x00', 0x2000, 0x205b1a)
r0 = open(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
r1 = socket$inet(0x2, 0x3, 0x0)
setsockopt(r1, 0x0, 0x23, &(0x7f0000000040)="3342b19c", 0x4)
syz_emit_ethernet(0x2a, &(0x7f00000005c0))
poll(&(0x7f00000000c0)=[{r0, 0x4}], 0x1, 0x0)
fchmodat(r0, &(0x7f0000000100)='./bus\x00', 0x4, 0x4)
close(r0)


mknod(&(0x7f0000000140)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x80000000000206, 0x0)
read(r0, &(0x7f0000000240)=""/83, 0x53)


socket(0x0, 0x0, 0x0)
close(0xffffffffffffffff)
__fstat50(0xffffffffffffffff, 0x0)
semctl$IPC_SET(0x0, 0x0, 0x1, 0x0)
open$dir(0x0, 0x0, 0x0)
pipe(0x0)
getpgid(0x0)
ktrace(0x0, 0x0, 0x0, 0x0)
socket(0x0, 0x0, 0x0)
connect$unix(0xffffffffffffffff, 0x0, 0x0)
r0 = socket(0x2, 0x3, 0x0)
connect$inet(r0, &(0x7f00000000c0)={0x2, 0x0}, 0x10)
sendto$inet(r0, 0x0, 0x0, 0x8004, 0x0, 0x0)


clock_nanosleep(0x40000000, 0x0, &(0x7f0000000140)={0x0, 0x6}, 0x0)


r0 = socket(0x18, 0x2, 0x0)
r1 = socket(0x2, 0x3, 0x0)
dup2(r0, r1)
__fstat50(r1, &(0x7f0000000280))


poll(0x0, 0x0, 0x1000)


compat_43_osetrlimit(0x9, &(0x7f0000000080))
socket$inet(0x2, 0x1, 0x0)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f00000001c0)={0x0, 0x0, <r1=>0x0}, &(0x7f0000000200)=0xc)
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0x0, 0x0, r1}})
r2 = socket(0x2, 0x3, 0x0)
close(r2)
r3 = socket(0x800000018, 0x1, 0x0)
setsockopt$sock_int(r3, 0xffff, 0x1000, &(0x7f0000000000)=0x7, 0x4)
bind$unix(r3, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0x0, 0x0, r1}})
connect$unix(r2, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
r4 = socket(0x800000018, 0x1, 0x0)
setsockopt$sock_int(r4, 0xffff, 0x1000, &(0x7f0000000000)=0x800008, 0x4)
bind$unix(r4, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r0, 0x11, 0x0, 0x0, 0x0)


r0 = compat_30_socket(0x22, 0x3, 0x0)
r1 = compat_30_socket(0x22, 0x3, 0x0)
compat_43_osend(r1, &(0x7f0000000040)="08200204", 0x600, 0x0)
compat_43_orecvfrom(r0, 0x0, 0x0, 0x0, 0x0, 0x0)


open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
compat_50_____semctl13$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000000))


lchown(&(0x7f00000001c0)='./file0\x00', 0x0, 0x0)
r0 = msgget$private(0x0, 0xfffffffffffffffd)
msgrcv(r0, &(0x7f0000000940), 0xcc, 0x2, 0x0)
r1 = socket(0x18, 0x2, 0x0)
getsockopt$SO_PEERCRED(r1, 0xffff, 0x1022, 0x0, 0x0)
msgsnd(0x0, 0x0, 0x15, 0x0)
msgsnd(r0, &(0x7f0000000440)={0x3, "a486714b3b6964c6224c6d3e16e0d3d8edebe56ff5cc0190d7f39c044dac99fec5afca3ec3e155903698d635e2ab348195cce43ab9e134935e4edf5efe4e5ec4bec02d51201f93b9860f69d58fca21e1f36041df344b049af83f321177b2fdcfcc2725691dc0"}, 0x6e, 0x800)
msgsnd(r0, &(0x7f0000001180)={0x1, "658a5d81ce75adb5b10eac918a2307d2ff5c352fd389f4223864f706183b00f1245146955b84e3c261d985a3057c107b6beff52d4164b475b367f4b66d2d30d7e42b0740623b05124c1211fd00000000fb5243d2296fd07388c6b1133f721f1eff78309b886d3f"}, 0x6f, 0x800)
msgrcv(r0, &(0x7f0000003480), 0x1008, 0x0, 0x1800)
msgsnd(r0, &(0x7f00000006c0)=ANY=[@ANYBLOB="0200000000a64456dc76eb4f76fe2e0155d89563ff3e533d3557413d2bd4388f5b897f5724208509aca9390356e14053f4aad7b6c725e83054b8f8cdda743d5f5e2901006494ccfa2b5aa73e516a432cc26a5d60e1ff352c6d870115c0910a2d7453ded7a23ef0c9a83b190000000000000051191500000000000000000000000000000000000000000000000000166542c779bc14147634d2dea47567e64fcaa63d61226743c01143a93b67974dc5bba812e8529a2d458e641986c68304e3959c418324ba3740cf0dc778d5"], 0x91, 0x800)
msgctl$IPC_SET(r0, 0x1, &(0x7f00000002c0)={{0x8, 0x0, 0x0, 0x0, 0x0, 0x40, 0x100}, 0x0, 0x5, 0x0, 0x0, 0x0, 0xf234, 0x8, 0x9})
msgctl$IPC_RMID(r0, 0x0)
r2 = getuid()
r3 = getegid()
r4 = socket$unix(0x1, 0x5, 0x0)
getsockopt$sock_cred(r4, 0xffff, 0x1004, &(0x7f0000001840)={0x0, <r5=>0x0}, &(0x7f0000001880)=0xc)
r6 = msgget$private(0x0, 0x22a)
msgctl$IPC_SET(r6, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0x0, r5, 0x0, 0x4, 0x4}, 0x9, 0x6})
r7 = getegid()
getgroups(0x1, &(0x7f0000000000)=[r7])
getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000080)={<r8=>0x0}, 0xc)
r9 = getppid()
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x7ff}, 0x0, 0x0, r9})
msgctl$IPC_SET(r0, 0x1, &(0x7f0000000140)={{0x1, r2, r3, r5, r7, 0xaa, 0x4}, 0x5, 0x800, r8, r9, 0x4000000000, 0x4, 0xffffffff, 0x6})
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r10=>0xffffffffffffffff})
close(r10)
r11 = socket(0x18, 0x2, 0x0)
readv(0xffffffffffffffff, &(0x7f0000000100)=[{&(0x7f0000000000)=""/46, 0x2e}], 0x1)
connect$unix(r11, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r10, &(0x7f00000003c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="1400000029"], 0x3e}, 0x0)


setreuid(0x0, 0xee00)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000029, 0x30, &(0x7f0000000000), 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
pathconf(&(0x7f00000001c0)='./file0\x00', 0x6)


r0 = msgget(0x2, 0x200)
shmat(r0, &(0x7f0000ff9000/0x3000)=nil, 0x0)
msgrcv(0x0, &(0x7f0000000080)={0x0, ""/134}, 0x8e, 0x0, 0x0)
shmctl$IPC_RMID(r0, 0x0)


socket(0x2, 0x2, 0x0)
syz_emit_ethernet(0xe, &(0x7f0000000000))
writev(0xffffffffffffffff, &(0x7f0000000580)=[{&(0x7f0000000000)="b886b4e47f", 0x5}], 0x1)
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r0 = socket(0x2, 0x1, 0x0)
bind(r0, &(0x7f0000000000), 0x3)
r1 = dup(r0)
r2 = fcntl$dupfd(r1, 0x2, 0xffffffffffffffff)
close(r2)
r3 = socket(0x2, 0x1, 0x0)
connect$unix(r3, &(0x7f0000000000), 0x10)
setsockopt$sock_int(r2, 0xffff, 0x1023, &(0x7f0000000080), 0x4)
sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
r4 = socket(0x2, 0x1, 0x0)
r5 = dup(r4)
r6 = fcntl$dupfd(r5, 0x2, 0xffffffffffffffff)
dup2(r5, r6)


socket(0x0, 0x0, 0x0)
getsockname$inet(0xffffffffffffffff, 0x0, 0x0)
r0 = socket(0x2, 0x3, 0x0)
r1 = socket(0x18, 0x2, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
dup2(r1, r0)
write(r0, &(0x7f0000001680)="04bdfa5d1d2873c63e3534825ba166e2fea9aec43050006123339a346f731573d8d508753f95b7688ad48b8cf6bbca325cebc37fc4e1dd543dbe2da6dd", 0x1001c)


socket(0x18, 0x2, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x83fe})
open(0x0, 0x0, 0x0)
ktrace(&(0x7f0000000000)='./file1\x00', 0x4, 0x40000038, 0x0)
socket(0x11, 0x1, 0x9)
r0 = socket(0x2, 0x2, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x1023, &(0x7f0000000040)=0x3, 0x4)
getsockopt$sock_timeval(r0, 0xffff, 0x1005, &(0x7f0000000700), &(0x7f0000000800)=0x10)
r1 = getuid()
setreuid(0x0, 0xee01)
r2 = socket(0x2, 0x2, 0x0)
r3 = socket(0x18, 0x2, 0x0)
setsockopt(r3, 0x1000000029, 0x3d, 0x0, 0x0)
ioctl$FIONREAD(r2, 0x8028698c, &(0x7f00000001c0))
setreuid(0xee00, r1)
setreuid(r1, r1)
mknod(&(0x7f0000000040)='./file0\x00', 0x6000, 0xe02)
setsockopt(0xffffffffffffffff, 0x6, 0x1, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
r4 = open(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
preadv(r4, &(0x7f0000000a80)=[{&(0x7f0000000080)=""/185, 0xb9}, {&(0x7f0000000300)=""/243, 0xf3}, {&(0x7f0000000400)=""/145, 0x91}, {&(0x7f0000000840)=""/210, 0xd2}, {&(0x7f00000004c0)=""/169, 0xa9}, {&(0x7f0000000940)=""/104, 0x68}, {&(0x7f00000009c0)=""/150, 0x96}], 0x7, 0x0)


open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
r1 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r1, 0x0, 0x200000000000c, &(0x7f0000000240)="ea00000100000000", 0x8)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x5, 0x10, r0, 0x0, 0x0)


symlink(&(0x7f0000000200)='./file0\x00', &(0x7f0000000080)='./file0\x00')
access(&(0x7f0000000000)='./file0\x00', 0x0)


compat_40_mount(0x0, 0x0, 0x0, &(0x7f00000002c0)="01")
compat_40_mount(&(0x7f0000000100)='procfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0))
__mount50(&(0x7f0000000000)='overlay\x00', 0x0, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)


compat_20_getfsstat(&(0x7f0000000000), 0xffffffffffffff39, 0x0)
open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104308, &(0x7f00000001c0)=0x20000002)


open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
setpgid(0x0, 0xffffffffffffffff)


mknod(&(0x7f0000000080)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x4004747c, &(0x7f0000000040))


openat$tprof(0xffffffffffffff9c, &(0x7f00000000c0), 0x200, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_40_mount(0x0, 0x0, 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000100), 0x0)
mkdir(&(0x7f00000000c0)='./control\x00', 0x0)
rmdir(&(0x7f0000000040)='./control\x00')


open(&(0x7f0000000340)='./file0\x00', 0x300, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
rename(&(0x7f00000006c0)='./bus\x00', &(0x7f0000000580)='./file0\x00')


ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, 0x0)
mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0xc0145002, &(0x7f0000000000)={0x0, 0x0})


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x8000000000000001})
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x7fffffff, 0x0, {0x0, 0x1}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f00000000c0)="ebffcbff13b9fd812eaa4e713a48e69931929648", 0x14)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
chflags(&(0x7f00000003c0)='./file0\x00', 0x5)
compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt$inet_opts(r0, 0x6, 0x3, &(0x7f0000000040), 0x4)


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x19, &(0x7f0000000080)="301dc649", 0x4)


r0 = socket$inet6(0x18, 0x3, 0x0)
getsockopt(r0, 0x29, 0x1d, 0x0, 0x0)


r0 = _lwp_self()
_lwp_detach(r0)
_lwp_wait(r0, 0x0)


open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)
truncate(&(0x7f0000000200)='./file0\x00', 0x0, 0x7fff)
truncate(&(0x7f0000000440)='./file0\x00', 0x0, 0x3e2b649f)
truncate(&(0x7f0000000780)='./file0\x00', 0x0, 0x10001)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000427e, 0x0)
ioctl$WSKBDIO_BELL(r0, 0x20005701)


open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
socket$unix(0x1, 0x0, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
compat_30_getfh(&(0x7f0000000040)='./file0\x00', &(0x7f0000000100))


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x4000000000000003}})
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x26, r0, &(0x7f0000000000), 0x0)


compat_43_orecvfrom(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)


__clone(0xf00, 0x0)
compat_50_wait4(0xffffffffffffffff, 0x0, 0x8, 0x0)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x10000000000001}})
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x8000000000000001})
semctl$SETALL(0x0, 0x0, 0x9, &(0x7f0000000040)=[0x7ff])
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000080)="f7f18c4b0d602d76648e1e31046b3d046bdf3bf31d62c7487d077681d6fcd0998d")
modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0xc1})
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0xffffffffffffffff, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_50_lutimes(0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f00000003c0)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff803}})
setsockopt$inet6_MRT6_ADD_MIF(0xffffffffffffffff, 0x29, 0x66, 0x0, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


compat_50_utimes(0x0, &(0x7f0000000900)={0x0, 0x7fffffff})


r0 = socket(0x1f, 0x5, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x0, 0x0, 0x0)
__getvfsstat90(&(0x7f0000000fc0), 0xfffffffffffffc4e, 0x0)


madvise(&(0x7f0000ffa000/0x1000)=nil, 0x1000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


mknod(&(0x7f0000000200)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mknodat(r0, &(0x7f00000009c0)='./file0\x00', 0x1000, 0x0)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
recvfrom$unix(r0, &(0x7f00000000c0)=""/11, 0x0, 0x0, &(0x7f0000000100)=@file={0x0, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xfffffffffffffd42)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
compat_40_mount(&(0x7f0000000080)='puffs\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f0000000300))


getgroups(0x17, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x80104267, &(0x7f0000000080))


mmap(&(0x7f0000ffc000/0x1000)=nil, 0x1000, 0x0, 0x10, 0xffffffffffffffff, 0x0, 0x3ff)


symlink(0x0, &(0x7f0000000240)='./file0\x00')
minherit(&(0x7f0000003000/0x1000)=nil, 0x1000, 0x4)
__clone(0x0, 0x0)


sendmsg(0xffffffffffffffff, &(0x7f0000000280)={0x0, 0x0, &(0x7f00000006c0)=[{&(0x7f0000000180)="f69623aaf2ffdf8200d384a60f32357316", 0x11}], 0x1, 0x0}, 0x0)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8020690c, &(0x7f0000000180)=0x8000000000000032)


pipe(&(0x7f0000000640)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0x0, r1)
setgroups(0xfd63, 0x0)


r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0986981, &(0x7f0000000180)=0x8000000000000039)


r0 = socket$inet(0x2, 0x2, 0x0)
ioctl$FIONREAD(r0, 0xc0106924, &(0x7f0000000080))
_lwp_wait(0xffffffffffffffff, 0x0)
compat_43_ocreat(&(0x7f0000000080)='./file0\x00', 0x0)
r1 = open(&(0x7f0000001700)='./file0\x00', 0x0, 0x0)
syz_emit_ethernet(0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
poll(&(0x7f0000000000)=[{}], 0x4e8, 0x0)
__posix_fadvise50(r1, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r2 = semget$private(0x0, 0x7, 0x3c0)
semctl$SETALL(r2, 0x0, 0x9, &(0x7f00000002c0))
semop(r2, &(0x7f0000000080)=[{0x1, 0x43, 0x800}, {0x4, 0xe6, 0x1800}, {0x0, 0x101}, {}, {0x2, 0x5, 0x1800}, {0x1, 0x9e, 0x1000}, {}], 0x7)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___fstat30(0xffffffffffffffff, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)


r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
__getdents30(r0, 0x0, 0x0)


setrlimit(0x0, &(0x7f0000000000))
__clone(0x300, 0x0)


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x18, &(0x7f0000000000)="5ab7776a", 0x4)


r0 = socket$unix(0x1, 0x5, 0x0)
bind$unix(r0, &(0x7f0000000200)=@file={0xd570d0466b6018f, './file0\x00'}, 0xa)
listen(r0, 0x0)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000000)=ANY=[@ANYBLOB="89000000ffff000001"], 0x9}, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={<r1=>0xffffffffffffffff})
sendmsg(r1, &(0x7f0000000380)={0x0, 0x32c, 0x0, 0x0, &(0x7f0000000000), 0x90}, 0x0)
r2 = socket$unix(0x1, 0x5, 0x0)
connect$unix(r2, &(0x7f0000000000)=@file={0xd1653077bafa0114, './file0\x00'}, 0xa)
close(r0)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open(&(0x7f0000000200)='./file1\x00', 0x615, 0x0)
write(r0, 0x0, 0x0)


r0 = socket(0x1f, 0x5, 0x0)
ioctl$FIONREAD(r0, 0x4004667f, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
clock_nanosleep(0x0, 0x0, &(0x7f0000000040), &(0x7f00000000c0))


__mount50(0x0, &(0x7f0000000440)='./file0\x00', 0x2, 0x0, 0x0)
execve(&(0x7f00000004c0)='./file0/file0\x00', 0x0, 0x0)


r0 = socket$inet6(0x18, 0x3, 0x0)
getsockopt(r0, 0x29, 0x9, 0x0, 0x0)


compat_50_setitimer(0x0, &(0x7f00000001c0)={{0x0, 0x9}, {0x7}}, 0x0)


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80046477, &(0x7f0000000380))
r1 = socket$inet(0x2, 0x1, 0x0)
getsockname$unix(r1, 0x0, 0x0)


getpriority(0x2, 0x0)


r0 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
r1 = shmat(r0, &(0x7f0000ffd000/0x1000)=nil, 0x0)
shmat(r0, &(0x7f0000ffc000/0x4000)=nil, 0x0)
shmat(r0, &(0x7f0000ffb000/0x3000)=nil, 0x0)
mlock(&(0x7f0000ffb000/0x3000)=nil, 0x3000)
shmdt(r1)


compat_50___msgctl13$IPC_SET(0x0, 0x1, 0x0)


socket$inet6(0x18, 0x0, 0x0)
pipe2(&(0x7f0000000000)={<r0=>0xffffffffffffffff}, 0x400004)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
readv(r0, &(0x7f00000002c0)=[{0x0}], 0x1)


socketpair(0x18, 0x3, 0x0, 0x0)


r0 = socket(0x2, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8090698e, &(0x7f0000000180)=0x8000000000000031)


open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
recvmmsg(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
compat_43_ogethostname(&(0x7f0000000000)=""/3, 0x3)


r0 = socket(0x2, 0x3, 0x0)
mknod(0x0, 0x0, 0x4f4b)
link(&(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80206910, &(0x7f0000000180)=0x8000000000000032)


mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x10, r0, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
fcntl$setstatus(r0, 0x4, 0x16d5aca6ddaa34f9)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r0 = open$dir(&(0x7f0000000080)='.\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f0000000180)='./file1\x00', 0x0)
chdir(&(0x7f0000000040)='./file1\x00')
__getcwd(&(0x7f00000003c0)=""/25, 0x19)


modctl$MODCTL_UNLOAD(0x2, 0x0)
swapctl$SWAP_ON(0x6, 0x0, 0x0)


socket(0x12, 0x2, 0x0)
compat_50_select(0x40, &(0x7f0000000280), &(0x7f0000000300)={0x6ab}, 0x0, 0x0)


write(0xffffffffffffffff, &(0x7f00000001c0)="39e4aff151", 0x5)
ktrace(&(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = socket$inet6(0x18, 0x3, 0x0)
open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
pathconf(&(0x7f00000000c0)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', 0x0, 0x0, 0x0, 0x0)
sendto$inet6(r0, &(0x7f0000000000)='3', 0x329, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x1a, r0, 0x0, 0x8)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
fcntl$setown(r0, 0x11, 0x0)


modctl$MODCTL_LOAD(0x5, 0x0)
profil(0x0, 0x0, 0x0, 0x5)


r0 = socket$inet6(0x18, 0x3, 0x0)
getsockopt(r0, 0x29, 0x69, 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x31, &(0x7f00000000c0)="b2", 0x1)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
setrlimit(0xe, &(0x7f00000010c0))


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
recvmmsg(0xffffffffffffffff, &(0x7f0000000700)={&(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000680)=""/100, 0x64}, 0x3f8d}, 0x10, 0x0, 0x0)
sendmmsg(r0, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)
posix_spawn(0x0, 0x0, &(0x7f00000000c0)={0x4, 0x1ff, 0x0}, 0x0, 0x0, 0x0)
poll(&(0x7f0000000000)=[{}], 0x4e8, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x800)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r0 = socket(0x1f, 0x5, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x0, 0x0, 0x0)
_lwp_wait(0x0, 0x0)
_lwp_exit()
swapctl$SWAP_ON(0x1, &(0x7f0000000000), 0x0)


sendmsg$unix(0xffffffffffffffff, &(0x7f0000001600)={0x0, 0x0, 0x0, 0x0, &(0x7f0000002c40)=ANY=[@ANYBLOB="10000000ffff000001"], 0x10}, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
shutdown(r1, 0x0)
sendmmsg(r0, &(0x7f0000001440)={0x0}, 0x10, 0x0, 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
close(r0)
r1 = socket(0x18, 0x2, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r0, &(0x7f00000003c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="1400000029"], 0x3e}, 0x0)


mlockall(0x2)
mmap(&(0x7f0000000000/0x400000)=nil, 0x400000, 0x3, 0x5012, 0xffffffffffffffff, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
socketpair(0x17, 0x4, 0x7, &(0x7f00000000c0))
unlink(&(0x7f0000000000)='./file0\x00')
mknod(&(0x7f0000001200)='./file0\x00', 0x2000, 0x400)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x2, 0x0)
write(r0, &(0x7f00000014c0)="ad", 0x1)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000427e, 0x0)
ioctl$FIONREAD(r0, 0x40047477, 0x0)


r0 = fcntl$dupfd(0xffffffffffffffff, 0xb, 0xffffffffffffffff)
readv(r0, &(0x7f0000000600)=[{0x0}], 0x1)


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x1, &(0x7f0000000380), 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x0, 0x4f4b)


mkdirat(0xffffffffffffff9c, &(0x7f0000000140)='./file0\x00', 0x0)
linkat(0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0, 0x0)
utimensat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', &(0x7f0000000100)={{0x77359400}, {0x0, 0x3fffffff}}, 0x0)


setrlimit(0x8, &(0x7f0000000980))
pipe2(0x0, 0x0)


open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
compat_43_lstat43(&(0x7f0000000000)='./file0\x00', &(0x7f0000000040))


getpriority(0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
__wait450(0x0, &(0x7f0000000000), 0x0, 0x0)


__mount50(&(0x7f0000000080)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
pathconf(&(0x7f00000001c0)='./file0\x00', 0x9)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__posix_rename(&(0x7f0000000100)='./file0\x00', 0x0)
chflags(&(0x7f00000003c0)='./file0\x00', 0x5)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000800)=@file={0x0, './file0\x00'}, 0xa, &(0x7f0000000200)}, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
mknodat(0xffffffffffffff9c, &(0x7f00000000c0)='./file1/file2\x00', 0x81c0, 0x0)
linkat(0xffffffffffffff9c, &(0x7f0000000380)='./file1/file2\x00', 0xffffffffffffff9c, &(0x7f00000003c0)='./file0/file2\x00', 0x0)


link(&(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)
r0 = socket(0x18, 0x2, 0x0)
readv(r0, &(0x7f0000000100), 0x8)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
r0 = msgget$private(0x0, 0x0)
compat_50___msgctl13$IPC_STAT(r0, 0x2, 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff})
write(r0, &(0x7f0000000040)="ed", 0x1)
sendmmsg(r0, &(0x7f0000000180)={0x0}, 0x10, 0xa, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
compat_43_ogetdirentries(r0, 0x0, 0x0, 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(r1, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="28000000ffff000001"], 0x28}, 0x0)
recvmmsg(r0, &(0x7f0000000040)={0x0}, 0x10, 0x42, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
mkdirat(0xffffffffffffff9c, &(0x7f0000000680)='./file2\x00', 0x0)
mkdir(&(0x7f0000000300)='./file2/file0\x00', 0x0)
rename(&(0x7f00000002c0)='./file2/file0\x00', &(0x7f0000000340)='./file0\x00')


recvfrom$unix(0xffffffffffffffff, 0x0, 0x0, 0x0, &(0x7f0000000080)=@file={0x0, './file0\x00'}, 0xa)
open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
r0 = open(&(0x7f0000001480)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})
flock(r0, 0x1)
close(r0)


r0 = socket$inet(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x0}, 0x10)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x0}, 0x10)
r1 = socket(0x18, 0x2, 0x0)
r2 = socket(0x18, 0x2, 0x0)
r3 = dup2(r1, r2)
flock(0xffffffffffffffff, 0x0)
bind$inet(0xffffffffffffffff, 0x0, 0x0)
dup2(r3, r0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(0x0, 0x0, 0xffffffffffffffff)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
chmod(&(0x7f0000000200)='./bus\x00', 0x0)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))


unlink(&(0x7f0000000000)='./file0\x00')
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x1003, &(0x7f0000000000), 0x4)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x1)
sendmmsg(0xffffffffffffffff, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)
r0 = socket(0x800000018, 0x3, 0x0)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892}, 0x1c)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
chmod(&(0x7f0000000080)='./file0\x00', 0x2ea)
chdir(&(0x7f0000000100)='./file0\x00')
mkdir(&(0x7f0000000040)='./file0\x00', 0x0)
rename(&(0x7f0000000140)='./file0\x00', &(0x7f0000000180)='./file2\x00')


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setreuid(0x0, 0xee01)
modctl$MODCTL_UNLOAD(0x1, &(0x7f00000002c0))


_ksem_init(0x0, &(0x7f00000006c0)=<r0=>0x50535244)
_ksem_getvalue(r0, &(0x7f0000000000))


modctl$MODCTL_UNLOAD(0x2, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x0, 0x10, 0xffffffffffffffff, 0x0, 0x0)
msgget$private(0x0, 0x0)
mknod(&(0x7f0000000100)='./file1\x00', 0x202c, 0xa718)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
fcntl$setstatus(r0, 0x6, 0x0)
execve(0x0, 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r1, &(0x7f0000003000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
listen(r1, 0x0)
connect$unix(r0, &(0x7f0000000640)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)


swapctl$SWAP_ON(0x6, 0x0, 0xffffffff)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000280)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000840)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000f40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000700)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000380)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00')


setreuid(0x0, 0xee01)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x40, &(0x7f0000000040)="00fb6c4f", 0x4)


mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
compat_50_futimes(r0, &(0x7f0000000240))


poll(&(0x7f0000000040)=[{}], 0x1, 0x0)
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x1}})
r0 = getegid()
getgroups(0x1, &(0x7f0000000000)=[r0])
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket(0x18, 0x1, 0x0)
setsockopt(r1, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r1, 0x1000000000029, 0xa, &(0x7f0000000040)="03000000", 0x4)


r0 = socket(0x10, 0x2, 0x0)
shutdown(r0, 0x1)


r0 = socket(0x2, 0x3, 0x0)
connect$inet(r0, &(0x7f00000000c0)={0x2, 0x0}, 0x10)
sendto$inet(r0, &(0x7f0000000280)="ad274dc8d38aa6b43ded1379b966d0e5a1150372d10ddc2f371ac13ca8c4892db1c4395b6e2771fcb180f92bc46b8f13a3bfe379445eaaa3e9e67a4810719932850c45e3a7d5e820595d539e289df7812bf453718671e7201e4e97798c7f2759f1bf00000000000027756e7fe2b914cbba0c2ceff6c99e5d6956bfc817c93106534ac825f64f2bfd7cc17d8f95345da6810241270fe933c194d005efdba81c3c337987bdf0ef707eff2f8be99ae425fa2582375e500606f8d4beeef4ecbd3aa4fa168d75ef96836b85fd432e9761cf4120dc2cae238e901bad15a5dc74be7cfcd6d0190e0554c9fbc6d3644d7e507a63d77e10ec9f1cf6322baf806e5e7125a37e65b4b661479f1609c3adc52ed02a7b727dc31d2469a9f4a17d970de84f6668f259f95bd6fedcc4e3e8095c3a9251a22656cf19cad4eeba9e9a23d11ebe0c908c153ddbe84cf127180a6177c1a70a9a7158cf88b7d89eb6d91988a7e43466e78de8b3dd39f7ee24a562184185acb64bd3253e742304314767", 0x179, 0x0, 0x0, 0x0)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
acct(&(0x7f0000000100)='./bus\x00')


__fhstat50(&(0x7f0000000180)="eb01b685", 0x4, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80047476, &(0x7f0000000180))


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open(&(0x7f0000000280)='./file0\x00', 0x70e, 0x0)
pathconf(&(0x7f00000000c0)='./file0\x00', 0xb)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
writev(r0, &(0x7f00000000c0)=[{0x0}], 0x1)


munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x2e00)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
writev(r0, &(0x7f0000000340)=[{&(0x7f0000000000), 0x2cfea}], 0x1000000000000013)


modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={&(0x7f0000000180), 0x0, &(0x7f00000000c0)=' ', 0x1})


_lwp_unpark_all(&(0x7f0000000340), 0x0, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0x1ffa, 0x0)
r0 = open(&(0x7f00000001c0)='./file0\x00', 0x0, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x201, 0x0)
poll(&(0x7f00000003c0)=[{r0}], 0x1, 0x0)
r1 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
writev(r1, &(0x7f00000002c0)=[{&(0x7f0000000300)='#', 0x1}], 0x1)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000300)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xd0cf}})
mknod(&(0x7f00000000c0)='./bus\x00', 0x2010, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
ioctl$FIOASYNC(r0, 0x80104301, &(0x7f00000001c0))


setreuid(0x0, 0xee01)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x17, &(0x7f0000000040)="00fb6c4f", 0x4)


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
recvmsg(0xffffffffffffffff, &(0x7f0000000340)={0x0, 0x0, 0x0, 0x0, &(0x7f00000024c0)=""/236, 0xec}, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x40000530, r0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x0, 0x0)
readv(r1, &(0x7f0000000080), 0x100000000000025c)


r0 = socket(0x18, 0x2, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
r1 = socket(0x2, 0x2, 0x0)
setsockopt$sock_int(r1, 0xffff, 0x1023, &(0x7f0000000040)=0x3, 0x4)
shutdown(r0, 0x2)


open(&(0x7f0000000100)='./file0\x00', 0x200, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_30_getdents(r0, 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
chflags(&(0x7f00000003c0)='./file0\x00', 0x5)
unlink(&(0x7f0000000000)='./file0\x00')


compat_43_osendmsg(0xffffffffffffffff, &(0x7f0000000500)="76219e8bf93cd7a79d", 0x0)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x29, 0x16, &(0x7f0000000000)="02000000", 0x4)


mknod(&(0x7f0000000200)='./file0\x00', 0x2000, 0x0)
open$dir(&(0x7f0000000180)='./file0\x00', 0x190, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x10, 0x0)
r0 = open(&(0x7f0000000380)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000080)={0x0, 0x0, 0x2, 0x100000001})


truncate(0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
compat_40_mount(0x0, &(0x7f00000003c0)='.\x00', 0x0, 0x0)
mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x29b3)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0xa, 0x0)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
compat_43_fstat43(r0, &(0x7f0000000100))


_ksem_open(0x0, 0x0, 0x0, 0x0, 0x0)


compat_30_getfh(&(0x7f00000000c0)='./bus\x00', 0x0)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
rename(0x0, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0x8004667d, &(0x7f0000000100)=0xffffffff)


r0 = socket(0x18, 0x3, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x200, 0x0, 0x0)


r0 = socket(0x2, 0x1, 0x0)
close(r0)
read(r0, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)


__mount50(&(0x7f00000001c0)='ffs\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x19, r0, 0x0, 0x8)


r0 = compat_30_socket(0x22, 0x3, 0x0)
compat_43_osend(r0, &(0x7f0000000040)="00200203", 0x2000, 0x0)


r0 = socket$inet6(0xa, 0x1, 0x0)
mmap(&(0x7f00009ff000/0x600000)=nil, 0x600000, 0x1000004, 0x13, r0, 0x0, 0x0)
mmap(&(0x7f0000ffc000/0x1000)=nil, 0x1000, 0x0, 0x812, r0, 0x49b87000, 0x0)
mmap(&(0x7f0000de4000/0x3000)=nil, 0x3000, 0x0, 0x13, r0, 0x0, 0x0)
mmap(&(0x7f0000439000/0x4000)=nil, 0x4000, 0x0, 0x852, r0, 0x0, 0x0)
r1 = socket$inet6(0xa, 0x1, 0x0)
mmap(&(0x7f00009ff000/0x600000)=nil, 0x600000, 0x0, 0x13, r1, 0x0, 0x0)


setrlimit(0x0, &(0x7f0000000000))
setrlimit(0x7, &(0x7f0000000100))


mkdir(0x0, 0x0)
semctl$SETVAL(0x0, 0x0, 0x8, 0xffffffffffffffff)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
ptrace(0x2b, r0, 0x0, 0x0)


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
__posix_fadvise50(r0, 0x0, 0x0, 0x0, 0x3)


modctl$MODCTL_UNLOAD(0x2, 0x0)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r0 = shmget$private(0x0, 0x2000, 0x0, &(0x7f0000001000/0x2000)=nil)
shmat(r0, &(0x7f0000001000/0x3000)=nil, 0x0)
madvise(&(0x7f0000001000/0x2000)=nil, 0x2000, 0x6)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40047477, 0x0)


r0 = open$dir(&(0x7f0000000080)='.\x00', 0x0, 0x0)
fchflags(r0, 0xfdffffff)


setrlimit(0x7, &(0x7f0000000100))
setuid(0xee01)
fork()


setrlimit(0x0, &(0x7f00000000c0))
setrlimit(0x0, &(0x7f0000000080)={0x0, 0x10000})


r0 = socket$inet(0x2, 0x2, 0x0)
mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0xa, 0x0)


open$dir(&(0x7f0000000080)='./file0\x00', 0x200, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000340)='./file0\x00', 0x0, 0xf, r0)
__posix_fchown(0x0, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0xc0e99db6de761f86, 0x0)
open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)


r0 = socket(0x11, 0x3, 0x0)
r1 = socket(0x11, 0x3, 0x0)
setsockopt$sock_int(r1, 0xffff, 0x1002, &(0x7f00000000c0), 0x4)
sendto$unix(0xffffffffffffffff, &(0x7f0000000000)="b1000504000004000000000001000000331c13fecea10500fef96ec0c72fd3357ae30200004e3003000000acf20b7804be38164991f7c8cf5f882b297be1aa0500000051e2f0ad3ebbc257699a1f139b672f4d335c223e7d0c032bfa896443a42102000000720fd18bfbb670c1f5a872c881ea6e2ec5890400000000008000361b4cc702fac500002021fbfa0c0f00008abfba221554f4e0f668246c0900000008e371a3f8343712051eea040000000000", 0xb1, 0x0, 0x0, 0x0)
sendto$unix(r0, &(0x7f0000000000)="b1000504000004000000000001000000331c13fecea10500fef96ec0c72fd3357ae30200004e3003000000acf20b7804be38164991f7c8cf5f882b297be1aa0500000051e2f0ad3ebbc257699a1f139b672f4d335c223e7d0c032bfa896443a42102000000720fd18bfbb670c1f5a872c881ea6e2ec5890400000000008000361b4cc702fac500002021fbfa0c0f00008abfba221554f4e0f668246c0900000008e371a3f8343712051eea040000000000", 0xb1, 0x0, 0x0, 0x0)


compat_43_osetrlimit(0x9, &(0x7f0000000080))
socket(0x1f, 0x3, 0x0)


r0 = socket(0x18, 0x3, 0x0)
setsockopt$sock_linger(r0, 0xffff, 0x80, &(0x7f0000000040)={0x1, 0x5b}, 0x8)
setsockopt(r0, 0x1000000029, 0x28, &(0x7f0000000000)="73b6adec", 0x4)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmmsg(r1, &(0x7f0000000000)={0x0}, 0xfffffe32, 0x0, 0x0)


compat_43_osetrlimit(0x0, &(0x7f0000000000))
compat_43_ogetrlimit(0x0, &(0x7f0000000040))


r0 = socket(0x18, 0x2, 0x0)
getsockopt(r0, 0x29, 0x2e, 0x0, 0x0)


setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, &(0x7f00000000c0)=0xfffe, 0x4)
r0 = socket(0x2, 0x1, 0x0)
syz_emit_ethernet(0xe, &(0x7f0000000000))
writev(0xffffffffffffffff, &(0x7f0000000580)=[{&(0x7f0000000000)="b886b4e47f", 0x5}], 0x1)
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r1 = socket(0x2, 0x1, 0x0)
bind(r1, &(0x7f0000000000), 0x10)
r2 = dup(r1)
setsockopt$sock_linger(0xffffffffffffffff, 0xffff, 0x80, 0x0, 0x0)
listen(r2, 0x0)
setsockopt$sock_int(r1, 0xffff, 0x1002, &(0x7f00000000c0), 0x4)
connect$unix(r0, &(0x7f0000000000), 0x10)
write(r0, &(0x7f0000000080)="04bdfa5d1d2873c63e3534825ba166e2fea9aec43050006123339a346f731573d8d508753f95b7688ad48b8cf6bbca325cebc37fc4e1dd543dbe2da6dd", 0xffea)


compat_50_select(0x190, 0x0, 0x0, &(0x7f00000000c0), &(0x7f0000000100))


modctl$MODCTL_UNLOAD(0x2, 0x0)
getegid()
msgrcv(0x0, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x1000, 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open$dir(0x0, 0x0, 0x0)
mkdirat(0xffffffffffffffff, 0x0, 0x0)
chdir(0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')


r0 = socket(0x2, 0x1, 0x0)
close(0xffffffffffffffff)
fsync(r0)


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000280)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x100000004})
flock(r0, 0x2)
close(r0)
r1 = socket(0x1f, 0x5, 0x0)
dup2(r1, r0)


__mount50(&(0x7f0000000000)='overlay\x00', 0x0, 0x0, 0x0, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x84000000000000})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x8)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x1, 0x1, 0x0)
close(r0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


r0 = socket(0x18, 0x2, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x0, 0x0, 0x0)
compat_30_socket(0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x0)


mlock(&(0x7f0000009000/0x2000)=nil, 0x2000)
madvise(&(0x7f0000008000/0x2000)=nil, 0x2000, 0x6)


mkdir(&(0x7f0000000440)='./file0\x00', 0x0)
pathconf(&(0x7f0000000080)='./file0\x00', 0x4)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x10000000010, r0, 0x0, 0x0)


__setitimer50(0x0, &(0x7f0000000040)={{}, {0x7fffffffffffffff}}, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000200)='./file0\x00', 0xe)


r0 = socket$unix(0x1, 0x5, 0x0)
ioctl$FIONBIO(r0, 0x8004667e, &(0x7f0000000080)=0x9)
bind$unix(r0, &(0x7f0000000200)=@file={0xd570d0466b6018f, './file0\x00'}, 0xa)
listen(r0, 0x0)
r1 = socket$unix(0x1, 0x5, 0x0)
connect$unix(r1, &(0x7f0000000000)=@file={0xd1653077bafa0114, './file0\x00'}, 0xa)
accept$unix(r0, 0x0, 0x0)


r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
r2 = dup3(r1, r0, 0x0)
getsockopt$inet_opts(r2, 0x0, 0x1, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x1, 0x0)
ioctl$WSMUXIO_INJECTEVENT(r0, 0xc0284600, &(0x7f00000001c0))


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
__posix_rename(&(0x7f0000000100)='./file0\x00', 0x0)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x2, &(0x7f0000000440)=[{&(0x7f0000000340)="4cfc7dc8f70705bcfcd8d6543cac6ec0a12b9e92b3ee1350023ff52988be2002547f3bec9e1522c2e964a2f55849bbc8b67cd509edbfe5fc749be3bdeda8109172dd9095df7343d47ec5627434e9d672bd042be3d817c32f090e828831c112a19346cccd2c85844396867416b298bc77ad422cf9df9bda5d001b84581018b4cea4c7edeaa668c0acaa765b779d5dc09d186d8fda7f1f578dcfafa9832c1113351c58b945e45fcf93998dff86fab023ad", 0xb9}, {&(0x7f0000000800)="54b5fc0c33903ab64237f96f38c42c80dbea07c75b95d6e886e7f80a35383266345beb29006dd12a14d3456226371adb5aa5c93b28dbc866a38259958468077dda3d131098aa4fbdb9e104bac01b6bf56ad921e22ec460ec062748db82155dff728d120a936da5e40a847fbee69c3981740b54d97a685ae058cb17a8926fd302d1666d63a8bac43a3843a07d51aff73722ada6f25bf02ab7c438e5e7794f0f000de628b590901f87bdb48a1788542d65c37399c226afefd8292b53a34582eef26b5a44bb9da50a7067bccd7dfce2ab9070db69b02e59dfc8a6d450b140499e511a792d8b1f03f5a7d7801d3aee4c373088550d6015c8efcc06aee29dde7d950e7e30614b01c6c2ee91aa38a2ff42454fca4b2b9d2966ec914be3e022e4123a8f40f9690c97e6ec1293f13c2a8dec90a03f9914ef73e6547ecaeb7f1e3b02c110f6dfaf92ab4e511cbb8d01c52ea09e3a8130bda6e5a1816d34cdcb53417bcf1ba71f2191aa2e72ffcc2e625c84ff4d5c7044286cce3648d0145ceac3dde5663c1f3d4a379e37bf699d1a6881b27afabaa7b988c3a5755a8f61b1807a3e31e4ea4ae467f2eed8b922bfc9f5e1c050a44cac7ca6c9e02f22d276d1058d567d7db524e376ed42619b8eed4bee3d08e877463b2723fa378baff82f4cb94118e09f9939806ddc8dbeeebe3be9a9ce76a3bed23bf1ecea904ae7cebe9aaf264971d468f19bf7a0a6e1951c518bf27924529447a63ba70f8d2dc0a18a6b0738bd849b55687e19d064175093f90b908b984089d7ed0097817927c06b31ea7f0cda3770bdb2e5b1ce77e14f37c8dcffb53b94bc07f555ab422c394dd8166bb0aef6c503f3ef7d3df1371f58c3a42e3c04176efcab05aee67d268017f8e6c4399794b66681b75b6d28daa0615da35abe11fbf8c1c0e1e8797312679d63ecf6afeceab9f7d5f7bbc9206ef5321371666c4195aa264ec36cc695b74fc5bf4908a477f71d4f16d9bf873a3686132d56f590eb2990fef2c078bcf1559d42f45905f480addbea49ada98bcbb5fbc6e73ccfc90bf01017c147df73c9cdb876b2f721d0231e86a3ccb863a67c7bafd2a58348b2377a76d698dc8a5e1d4f96babf47a42e38fb124500a2eaf644b4979e275e57e9041a69140c31d22f472943e0f66849bdd0d53691a4501a8cb6856fce98ec4e256d581167f813bf45c79666ad56e6ab79dd33d0a23b3757a1ad09ee27de834f0eb0f13c15cd3e1b1a675723b0a4a68551211651fe3ca46ebb7e631393abea22eec6832456a61bcb4ae00f2e1e2b492378fa14bc08b7a0917b24c1648359f0edbcde8c89f47806b562aec2ef6267e03cb9adbd65c5ce6ee5fd70c4294e91eeab6428774579bf5a012c55707f56288941b96dd190cd1d2f0bb05b291507e00ec186a7fba7969202366c3141108b2899bd4a083cd5fa62a6aed261a286da357bec039073c2d250485f2d86ab9e82487688d608b5f82bfa7b5278a07feb24a70b35e2362073598027c5e52dd06490cd74179802e4b9034e8c9fafbc0617021bdf111833739309cd953d051d016c3d8d339b756edf2d7959940a60eca82c82ffe1d9249e30c7184976de662edbc990b14a8b87cc358c927d4c57eb7e7ea8b6aab03710e961a029eb3d3cf035a5247ec495692ec8fef3e1098bcd9ac1c25f66e70748d8b48fd0c3fd0d4bd3a75d0e3d9ac5a19cb3a38f7ce3f643a799cd1bb306961d41d11cb910ca7c556224f1dc2235562742f8dfddc7311bc82906f5d1dcfa824853b4659351af06a97ec3c3c778f3d7bead7061f37c8d4dcb56db267423fc8e05fedfb26224bdcf6c4489b16746243d2064fbadc3d9ab10cf99640dbfccd93e0674ff99b4ebec733844ce3ec2df7dacffb68f120bbc0629b3919b29964ccc9d3d228912fc7b4a2e00687c757dc9f93db03537ba175fc1303bb981ccd311c0d6e5d3efe5b6c555e4f228b08262ebc9a410111fb075c7ba336c231ac679f0ee083390a2364998c3507c9f14c25f9e1ef46d07e520f3447906caf276a474bc6b13bbd787da65ecf5c9d194312770b54d9281250c84310cfb54aef8dd9685edc0e9d4528defb85dbaff10aafebd7ebf598be73f57231af5b53938d3c63784870c5745eec61c368
3e2fd65a9393514fd251efb4367706d4bf14081cd4028472169e0939c39f9556c8243af7a19ca89db043948b2bb45783192a3cddd08b3fff934d0a149db40278233ac3c250ae363e8551bf774459067d11bf724946de345a3ee3f33a7b137a89dbf3c96e4d2abb4bfe589e7465e5f0c5f21f801620a034fb245e64040a9fa8c68a8215dc2651f4f1c8849acee89e810b141a85c93a89ba70af7fe8646329209f6ef7f199af37f56466505a135755c39982563ff38a62cc47b7e300da17b7929c0d9b70be7f4e3e101ef07882ab6b553bf80ff2abd3d5ea96f9726b71f56d39c7a82dabbddd3a62637374c71a087198136a912eb8f29a673296f909cd1a0f84c0532f1296d79eab6491d75860eecbafe3f0fb097276b989bfb046df7afd238346b583d14292b4f65e1e60b7d4740c9753962c6ea4948347d8f72227a19ce745542aee706f1cfe4d5344f5b139a28b807589efcd6431adcc05e6b9e6eacd9d8e116d8c476e758345c387aadeca308a5b336ea8b93710a46cfb14dea87c7f9c183ca2f8e3c8a13724974f0664a3bee6cd3387877bbc1cf25eb295b8fbdac873dd4fc247aec2bd16b7b24b6f7b515593d4f1020a311b4c489cb22e4a2282b4872d883c625cb7cae23f6beb82f66445b93d9ebf115708cb4c8f5a2d7d4da780f6c7aa9a37328da40a0497313a738d6cfcb9b57aeeffdd5a34d97143f6bdc14d6d1b2458625b1ff528845921ddc9882f1943cc60cfce49f11eda7215cd84c0ee852a6f309e044c84fdc4ee98e8593033404442fc00735f1bfb03c26cbbb5e0a36f82630387371e906c6f3d71b5961d23ce9690a7ce6bbcc28546be18c9549d9b65f3cea5504a50f782ed9e57d66e2d51d4724d2565a92b60f9f55c6190e49269d5d1136799a05ad3c9322d3ea969d420c71f2156dfc6c453e126b7d4492af267c15ef07e4b59cc5a5330d41445303033cee0feff4490e5a0f170e6a64e99e664c02da0e4db787e35f7849b0abcbe9878a54d1fa7bc787d67ffdd63ba5f9700f3ee8fc810d4fd8d6ac03e47deb83c264611f1dbc520006d176e7aa407299ee6a69d9c28bd7c88af055f993fc19e1a152701069d7a8865e582c1603c4c32a7e14fb1b55167d17e3d882f3ee9628e50202c33c1127f3339d8126f94be61045a4e9d7d17fc70a7be40998e5e774733c5be624b82fd04223039a9079ef6368b82a02ca88ee9cfca55334b7de9d885713a33593b59f4b9aec2b11a8142add537416b40e51b27c0f6b83be0575803cd196558af0139674fd64e34de1ba2394f665af6bb662b4a0fd74ed3b291347bbd8572cd8df836d8d03bbf8b7a1659278e0b3ffdddc1d05e7b46a31678f6b4f8917518da60c0038c599e028922a2f29969a3393e2480d1f0089b8529a48c971c6a1ef355e7b122075f8c811a7fb972d400da92f4aa1351c42363f2bd83cf20c603727af47a7b8e9e22ee87d4e7a201b789f410ce9c4489330fd74030d7b22d5bcf3296fd9a8d507462beda498a07796c66b035d116f3ee6b49a1c0dde22706c5f8d481c74b573282eb70751885630fa27e04f5032b7d6fdb6f6f3235523d60c0d40013a1c97074ffb99e04565225ffffa61a3218b883aa9a82cdeb57bedf7feb963e7774ab853817384455c436e419c4cf730a18c821643cfe86e4de914d5913fee5738ec55206d3087b840158f95f5d4502231e2371b6dc391a8cb71e67b4b0fdfc2891c2400f05c4147204106e532689761782b041d37b25f9f8e65264d02142d37759f543b238dfdf5fd90adb446b6585116b1746a19052a958db6db08d9729310ac6c39b025118b459a3aef4a5c704ad68efb4a82c73699b6ccddc6c3d0c74489c8973e96430ff8844a820a525d7ab66339aec1a456149503c4519b85f582492e67be318cfa38b5a1ed84c5ccb1dff97cb3864d78fefff641f8a681ea2d130981a2c14221aea63c554c4f6c279ad038be153637ec1dcc137db8c85911f528df63a6e5c899619ddbc960dc7439dd0efe08d04f1c381a8bee694ff52956b60f885316c544b2e785f3eb08dd630338faf7185a037544bd69d6dbedf8f42d1d4ce8f458c598d66707afbdb9748610599e83f576976ed77d00c7c50526b6872204ffedeb683e9b92341642ed6f1f49e1f87adda32c0209dc2101ea3127a4f5701f0d8429f534e2f71cc0234c2f1b6d81b92592ee88893e97da7bbcac29a63ed374c1d87c9afb82476117c248f70cf05a6bfbd76c11a55d97ccc84a1f934b7c18ff6aebc2b4c0e2194fdca6abbfe8c9c5022b797808f34ecf3078e07a2976d12de907a8c45b3e375a2de8738f72c69d62cc55166b660ad652ab1a3f62c87141e38dfaa798bd0b3e3348bab7155940f0d7612c3d751523ae4efcb9e9a8824de1986b561b7c86c24466d19340f1a525237e965fd986ce9c4168de7916412cb3701178fb3134f10ac43f56491b401057f3277ef00a7fe0ff427e93e66538a76b84c9bc85fb402feaef4a0145
2409746852d31a34a0796e7f816a7aa2642c0144b189cf50b3c9f84d6f3963da4200c0bc9acac1423f676010a2a588c48c9532ab667d607bafae12887a9ad9219cfdb30fd8a76bd6390380b83235a600a5286e440f0587a37077c04dc9eb94c1c5dbcd4892327051809c361fd9e744e56a5d86c8c23b5813d8e5b1b9195f5a76121b6973d34178c095a30edd066ef4e4b45156369765a6561eada9fc4de54a3065c26d1acfbb99f133565a626253c84c7e09b1de3572208f93fba6a21de7039ea66d6d4fffc191e70aa1cafec1445d128be6739052368817019ac8ef4be8a533c566ad534e40493611686cdd289052d402e49a214d1c07e296ef487961df5a5d4742ade04992c8c4564a5c67057b396b941932cb1e615c6d4b65bd2066075d65ab428f047dab80e8aab63e3c0ad8103ec83af4eedae7b2b59a04003778aa55467e001b4304050056f1461de9057a619f27c3b43a31dda658c2a1411b4a674ce9ed1547b826077f64479d79c929eaec7ef2ec80c3b9b240f57a75d11816f6a867319a79f4497b61eb01616eef50f3db17223bc1a8474802503c309f90246786b72cac5d469002607ec23053079419ad9550f90ee932cf2585b3e8173c7cb2306ff73ef27aa4389e9a089ceff4714e5fe318cfe43e7ce9c839ab65e00259ef2cd1a2fead10f31f8c9e8367a59e41f703e5958dd7ee37dc3bf1cfcf74ab10714a09e95c248af9af484457ac12f6c85e984305c98f8f79b4e9e441dd0ca1ced70f795c3a3f5549b5c035f31902ff89448074a7d7f60015f6a7b21d9cc1037c61222c7fd892e81659d7bc357a0442b60b90ba7e256ca00dc08bfc9721310c3407bf86a89b2b070e4b1147d13021c386c509b015ee5af941f658332e40ebe8038dc52e7fb3a81b8d7af47f5ddb8574be9fba34fd6446e071aaf6109a14e9e88a2a157715d54a9a50b21e3c53bde9d2c6cb682cdf9bf06c4c18a6d1f5827d1cf228da73711d42a11ca320a522ea07f024c0979bcc7e06b4b6fd4b777fb9550853ccc471ff8143b780f4a95dddb942e7704ed3e122f83444769a75aa4c2c64f2848c2868a803a8a7aad1cd6e472931d6fed9bdc9139b3a59fd1a3edb40fe4e30decbfc3e", 0x1000}], 0x2, 0x0, 0x63}, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
mkdir(&(0x7f00000007c0)='./file0\x00', 0x0)


mkdir(0x0, 0x0)
r0 = socket$inet6(0x18, 0x3, 0x0)
poll(&(0x7f0000000000)=[{r0, 0xd}], 0x1, 0x0)
shutdown(r0, 0x1)


r0 = open(&(0x7f0000000080)='./bus\x00', 0x400141042, 0x0)
mmap(&(0x7f0000001000/0xa000)=nil, 0xa000, 0x0, 0x12, r0, 0x0, 0x0)
madvise(&(0x7f0000000000/0x600000)=nil, 0x600003, 0x15)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___lstat30(&(0x7f0000000000)='./file0\x00', 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
open(&(0x7f0000001700)='./file0\x00', 0x70e, 0x0)
acct(&(0x7f0000000080)='./file0\x00')


chroot(&(0x7f0000000000)='.\x00')
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
__getvfsstat90(0x0, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)


__clock_getres50(0x0, &(0x7f0000000000))
setrlimit(0x0, 0x0)
chflags(&(0x7f0000000000)='./file0\x00', 0x0)
modctl$MODCTL_UNLOAD(0x4, &(0x7f0000000000))


compat_43_ocreat(&(0x7f0000000c40)='./file0\x00', 0x0)
r0 = getpid()
ktrace(&(0x7f0000000340)='./file0\x00', 0x0, 0xf, r0)
__lstat50(0x0, 0x0)
getgroups(0x1, &(0x7f0000000040)=[0xffffffffffffffff])


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_43_osend(0xffffffffffffffff, 0x0, 0x0, 0x0)
_ksem_init(0x0, &(0x7f0000000100)=<r0=>0x0)
_ksem_wait(r0)
_ksem_post(r0)


open(&(0x7f0000000000)='./file0\x00', 0x611, 0x0)
mknod(&(0x7f0000000280)='./file0\x00', 0x1ffa, 0x0)


_ksem_init(0x0, &(0x7f0000000300)=<r0=>0x0)
_ksem_destroy(r0)
_ksem_destroy(r0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = shmget$private(0x0, 0x4000, 0x0, &(0x7f000055b000/0x4000)=nil)
shmat(r0, &(0x7f0000ff5000/0x4000)=nil, 0x0)
shmdt(0x0)


r0 = socket$unix(0x1, 0x1, 0x0)
recvmsg(r0, &(0x7f00000008c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000880)=""/17, 0x11}, 0x843)


symlink(&(0x7f0000001040)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000240)='./file0\x00')
unlink(&(0x7f0000000000)='./file0\x00')


posix_spawn(0x0, &(0x7f00000001c0)='/dev/wscons\x00', 0x0, 0x0, 0x0, 0x0)


recvmmsg(0xffffffffffffffff, &(0x7f0000000080)={&(0x7f0000000100)={&(0x7f00000006c0), 0x213, 0x0, 0x0, 0x0}}, 0x10, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000340)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
sendmmsg(r0, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)


_lwp_setname(0x0, &(0x7f0000000000)='\\\x9f^*-]%!O+&*.#\x00')
_lwp_exit()


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$unix(0x1, 0x2, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x2, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8010572b, &(0x7f0000000080))


r0 = fcntl$dupfd(0xffffffffffffffff, 0xb, 0xffffffffffffffff)
compat_30___fstat13(r0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
__futimes50(r0, &(0x7f0000000000))


_lwp_kill(0x0, 0x10001)
ptrace(0x0, 0x0, 0x0, 0x0)


compat_50_select(0x0, 0x0, 0x0, 0xffffffffffffffff, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x0, 0x0, &(0x7f0000000000)='Z', 0x1)


compat_50_setitimer(0x0, &(0x7f0000000080)={{0x0, 0x1}}, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
ioctl$FIONREAD(r0, 0xc0106926, &(0x7f0000000080))


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
open$dir(0x0, 0x0, 0x0)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x1, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x615, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = socket(0x18, 0x2, 0x0)
r1 = socket(0x18, 0x2, 0x0)
r2 = dup2(r0, r1)
ioctl$KDMKTONE(r2, 0x20004b08)


r0 = getpgrp()
getpriority(0x0, r0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
open(0x0, 0x0, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000280)=[{&(0x7f0000000000)='#', 0x1}], 0x1)
mprotect(&(0x7f0000ffc000/0x1000)=nil, 0x1000, 0x0)
r1 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(r1, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x1, 0x0)


sendmsg(0xffffffffffffffff, &(0x7f0000000800)={0x0, 0x0, &(0x7f0000000040)=[{&(0x7f0000000140)="b5590404fd4a3129714bd5bfbd7fc0912cea952861147a2d9ba21dead5751ed75b4e8926e08faf65c48579946410000df92c614e489294fff8a08bd44770c485ae10bf4234f95cf146ab5a3ee056e94baca4409d1bca64f6778130897e66305b77e374c3c35161ca07794a0fafa0e96f69990088b486ac49a3b7f1cf5fec1f42456cad0db605b1553f531010d2b12f85", 0x90}], 0x1, 0x0, 0x3f0}, 0x0)
r0 = socket(0x2, 0x2, 0x0)
ioctl$FIONREAD(r0, 0x80206979, &(0x7f00000001c0))


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
read(r0, 0x0, 0x0)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x1)
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x80044279, &(0x7f0000000080))


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000340)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
write(r0, 0x0, 0x0)
recvmmsg(r1, &(0x7f0000000880), 0x10, 0x0, 0x0)


symlink(0x0, &(0x7f0000000240)='./file0\x00')
madvise(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x1)
mlock(&(0x7f0000ffb000/0x1000)=nil, 0x1000)
mmap(&(0x7f0000ff9000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
mkdirat(0xffffffffffffffff, &(0x7f0000000000)='./file0\x00', 0x0)
swapctl$SWAP_ON(0x1, &(0x7f0000000000), 0x0)


ptrace(0x12, 0x0, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, &(0x7f0000000080))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000000)='./file0\x00', 0x0, &(0x7f0000000540), 0x0)
pathconf(&(0x7f00000000c0)='./file0\x00', 0x5)


modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000000)='g\n')


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
mknod$loop(&(0x7f00000023c0)='./file0\x00', 0x8000, 0x1)


open(0x0, 0x0, 0x0)
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
pathconf(&(0x7f00000000c0)='./file0\x00', 0x7)


r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$sock_timeval(r0, 0xffff, 0x100c, &(0x7f0000000040)={0x0, 0x3}, 0x10)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x20, r0, 0x0, 0xae5)


r0 = socket$inet(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x0}, 0x10)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x0}, 0x10)
writev(r0, &(0x7f00000018c0)=[{&(0x7f00000002c0)="3cf7dd8b2cef328b10ee834b9bf279997acfaa5605c1e72da4d734bbcd193eb0e3f5886966d7ad6d4ec6002dce897ecf0c28387d55de272dcefe308d941a77d7d79b3fd430ca526c3de7139335e47faaf2410562e1151eba0d9e803e02029dea943189a28e10c6df509c84b0a64ac893cda7beb2b104b312d6c2db9701def1ea55436a8413bd9ddc5c61a3a8f6992bd19b262e9f933f0786f77e53a5c9a978d0fca504c121c8e756849d66c88229aaf09b663b66f779b33b978d39b4d2855dc3573792a3f2ab246edcf33b25b4c8ac280fca979eec19", 0xd6}, {&(0x7f00000003c0)="617148cc4a35a2d5cd9872ea23d881017b3e11d1f1daa17e012c531b9662c8efb29f00f26fe1daa121b490a54c96f79c8713807dd81de979a279455352420d567bbb44dcf356a467d107cdca51062ecabe9b675db6f686a79cc4780b8ebd9e9b70e0b74e0dc9b8dde7d244ef95649a60cf78d265aba5b027be2d6e170fe3cd2cef25d69cdce35b389b0c2c542a7fe8126c50268010fe00233d655ec514996df644287c86670891806b9b4f30fc3988a70702b5ba41a37b25476932", 0xbb}], 0x2)
compat_43_osend(r0, &(0x7f0000000140)='1', 0x1, 0xb)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000080)='./file0\x00', 0x1)


compat_50_select(0x2, &(0x7f0000000280)={0xdc}, 0x0, 0x0, 0x0)


madvise(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
munlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
mmap(&(0x7f0000ffa000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


chroot(&(0x7f0000000000)='.\x00')
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
execve(&(0x7f0000000200)='./file0/file0\x00', 0x0, 0x0)


open(0x0, 0x0, 0x0)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000029, 0x3f, &(0x7f0000000040)="674cd6e5", 0x4)


r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
shmctl$IPC_SET(r0, 0x1, &(0x7f0000000000)={{0x1000, 0x0, 0x0, 0x0, 0x140}, 0x0, 0x2})
compat_50___shmctl13$IPC_RMID(r0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x1)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040))
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(r1, &(0x7f00000000c0)={0x0, 0x2f, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="28000000ffff000001"], 0x28}, 0x0)
recvmsg(0xffffffffffffffff, 0x0, 0x0)
recvmmsg(r0, &(0x7f0000000040)={0x0}, 0x10, 0x0, 0x0)
socket$inet(0x2, 0x0, 0x0)
dup2(0xffffffffffffffff, 0xffffffffffffffff)
socketpair$unix(0x1, 0x0, 0x0, 0x0)
compat_40_mount(0x0, &(0x7f00000000c0)='.\x00', 0x0, 0x0)


mlock(&(0x7f0000ffd000/0x2000)=nil, 0x2000)
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x800000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, 0x0)
r0 = socket(0x2, 0x2, 0x0)
ioctl$FIONREAD(r0, 0xc020699e, &(0x7f00000001c0))
r1 = msgget$private(0x0, 0x0)
msgsnd(r1, 0x0, 0x401, 0x0)


open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
compat_50_clock_settime(0x0, &(0x7f0000000000))


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff})
shutdown(r0, 0x1)


getsid(0x0)
pipe(&(0x7f0000000840)={<r0=>0xffffffffffffffff})
__posix_fadvise50(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
ptrace(0x0, 0x0, 0x0, 0x0)
msgctl$IPC_STAT(0x0, 0x2, 0x0)
mkdir(0x0, 0x0)
ioctl$TPROF_IOC_START(r0, 0x80185402, &(0x7f0000000040)={0x0, 0x0, 0x3})


munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
madvise(&(0x7f0000001000/0x4000)=nil, 0x4000, 0x3)


compat_50_select(0x0, 0x0, 0x0, &(0x7f00000031c0), &(0x7f0000003200)={0x0, 0x4})


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_50_setitimer(0x0, &(0x7f0000001800)={{}, {0x0, 0xf423f}}, 0x0)
__setitimer50(0x2, 0x0, &(0x7f0000000080))


msgrcv(0x0, 0x0, 0x0, 0x0, 0x0)
compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xf, 0x0)
geteuid()
r0 = msgget$private(0x0, 0x0)
msgctl$IPC_STAT(r0, 0x2, &(0x7f0000000040)=""/56)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
utimensat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)


r0 = semget$private(0x0, 0x6, 0x600)
semctl$IPC_RMID(r0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000040)='./file0\x00', 0x80000000, 0x0)
setrlimit(0x0, &(0x7f00000000c0)={0x100000001})
mkdir(0x0, 0x4)
getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x11, &(0x7f0000000180)={0x0, <r1=>0x0, <r2=>0x0}, 0xc)
__posix_fchown(&(0x7f0000000140)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_30___lstat13(0x0, &(0x7f0000000200))
r3 = msgget(0x0, 0x645)
msgrcv(r3, &(0x7f0000000280)={0x0, ""/124}, 0x84, 0x2, 0x800)
unmount(&(0x7f0000000340)='./file0\x00', 0x0)
rename(0x0, 0x0)
getpeername$unix(0xffffffffffffffff, 0x0, 0x0)
__clock_settime50(0x3, &(0x7f00000005c0)={0x1, 0xf4ec})
chown(&(0x7f0000000740)='./file1\x00', r1, r2)
getrlimit(0x8, &(0x7f0000000780))
open$dir(&(0x7f00000007c0)='./file1/file0\x00', 0x10000, 0x80)
write(0xffffffffffffffff, &(0x7f0000000800)="f1ef9b61ab2572e720661a3307df6592164c17ece60899ee1bde02d8772ec17c55b7f12e06632d803838d66980c8043e1a1bd4307adfa26f64bbe4e9d2d40afa31a5830573681a70e43de1652712327fc1f0903ae0953f48c1235696dce9391726cbb9f974d95aa3c9ed8431eacf489c59b8fb40ba5bd1489f9f787a613a329239fc4070d43fea9de09b9f0d85cc7fc4df9acbf42172e1b6567d8d4f6a64", 0x9e)
semop(r0, &(0x7f00000008c0)=[{0x3, 0x9}], 0x1)


syz_emit_ethernet(0x36, &(0x7f0000000000))
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x1ff})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
close(r0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000029, 0x2e, 0x0, 0x0)
setsockopt(r1, 0x1000000000029, 0xa, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


_lwp_setname(0x0, &(0x7f0000000080)='\x9a\x03N\xb9\xc9%D\x8f\xd8\x16R\xd1z8L\x8a!8\xcf\x9e9O\xbe\x17G\xf0\x1cs\xcf\xbd\x10\xb3[\x8d}\xbeH k\x7f\xc8\xcd\x14L\x8a\x89{\xcc\xeeN\x8a%H\xd8C\xd3\xednGO\xb0^$}\xb7\xa5\xff&=\xaa\xb6\x9f\xd2a\xc7\x04\xa1l\x8a\xe8\x9b\xeexu\x01\x00\x00\x00\x94\x1c\x16\a\xb9\x95x\"#\a _e\xb4\xb3-3\x1eN\xb8\xed\xdb\xf4\xa7\x86A\f\xb5M\xac\x1eR\x18\xc3\xa4\xea\x11\x84\xe1eG`\xb0\xa5\x06l\x93\xf2\xc9\xff\x1bm\x13!\xc1\xc3\xec\xf9[\xffa\x86\xc2\xbd_\xbe_\xb2\xa0\xcfM\x83\xe8W\xec\xd2\x14*\x87E,\xd4\xf5D\x8d\xcd~\xb3+\x84\x9b\xcc\xcaH\xa6\b\fpm@[\x85\r\x8d\xea+=\x87Z\xf1\x88\xdf\xc6#\xfb\x9a$\x18;\xe6\x18\x98n\x00\x00\x00\x00\x00\x00\x00')
_lwp_getname(0x0, 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x100a, 0x0, 0x0)


_lwp_ctl(0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x2f, &(0x7f00000000c0), 0x0)


symlink(&(0x7f0000000080)='.\x00', 0x0)
__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x4004667f, 0x0)


socketpair(0x1, 0x20000001, 0x0, &(0x7f0000000200))


open$dir(&(0x7f0000000000)='./file0\x00', 0xf02, 0x0)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000402})
open$dir(&(0x7f0000000180)='./file0\x00', 0x190, 0x0)
r1 = open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
fcntl$lock(r1, 0x8, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x1000300010008, 0xffffffffffffffff})


writev(0xffffffffffffffff, &(0x7f0000000240)=[{&(0x7f0000000140)='\x00', 0x1}], 0x1)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x9, &(0x7f0000000240)="ea00000100000000", 0xc)
setsockopt$inet_opts(r0, 0x0, 0xd, &(0x7f0000000240)="ea08000000000000", 0x8)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f00000001c0), 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
fcntl$setstatus(r0, 0x4, 0x0)


open(&(0x7f0000000140)='./file0\x00', 0xf8e, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x40000530, r0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000000)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
r3 = dup(r2)
recvmmsg(r3, &(0x7f0000000440)={&(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000580)=[{0x0}, {0x0}, {&(0x7f0000000240)=""/90, 0x5a}], 0x3, 0x0}}, 0x10, 0x64, 0x0)
dup2(r1, r2)
writev(r2, &(0x7f0000000640)=[{&(0x7f0000000140)="90", 0x1}], 0x1)
execve(0x0, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_50___lstat30(0x0, 0x0)
shmctl$IPC_RMID(0xffffffffffffffff, 0x0)
r0 = socket(0x2, 0x2, 0x0)
getsockname$unix(r0, &(0x7f0000000000)=@abs, &(0x7f0000000040)=0x8)
r1 = socket(0x2, 0x1, 0x0)
bind(r1, &(0x7f0000000000), 0x10)
listen(r1, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
compat_50_nanosleep(0x0, 0x0)


ioctl$WSMUXIO_REMOVE_DEVICE(0xffffffffffffffff, 0x80085762, &(0x7f0000000040)={0x1})
r0 = socket(0x18, 0x3, 0x3a)
r1 = socket(0x18, 0x3, 0x3a)
setsockopt(r1, 0x29, 0x6c, &(0x7f0000000040), 0x4)
setsockopt(r0, 0x29, 0x6c, &(0x7f0000000040), 0x4)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
symlink(0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8004667e, &(0x7f00000001c0))
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
compat_50_____semctl13$GETVAL(0x0, 0x0, 0x5)
pathconf(&(0x7f0000000000)='./file0\x00', 0xa)


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
r1 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x0, 0x10, r1, 0x0, 0x0)
writev(r0, &(0x7f0000000280), 0x1000000000000329)


setreuid(0x0, 0xee01)
setpriority(0x2, 0x0, 0x0)


setsockopt$inet_opts(0xffffffffffffffff, 0x11, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
_ksem_init(0x0, &(0x7f0000000140))


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x4004427c, &(0x7f0000000080))


r0 = socket(0x10, 0x2, 0x0)
sendto(r0, 0x0, 0x0, 0x0, &(0x7f00000000c0), 0xe)


mknod(&(0x7f0000000000)='./file0\x00', 0x1ffa, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
open(&(0x7f0000001700)='./file0\x00', 0x70e, 0x0)
pread(r0, 0x0, 0x0, 0x0)


open(0x0, 0x0, 0x0)
getpid()
r0 = socket$inet(0x2, 0x2, 0x0)
mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
open(0x0, 0x0, 0x0)
fcntl$lock(r0, 0xa, 0x0)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0xb, &(0x7f0000000080)='\x00\x00\x00\x00', 0x4)
r1 = socket(0x2, 0x3, 0x0)
dup2(r0, r1)
setsockopt(r1, 0x1000000029, 0xb, &(0x7f0000000000), 0x0)


pipe(&(0x7f0000000140))
getpid()
__msync13(&(0x7f0000ff9000/0x4000)=nil, 0xfffffffe, 0x4)
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000400), 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = socket(0x18, 0x2, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x1003, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__posix_rename(&(0x7f0000000100)='./file0\x00', &(0x7f0000000140)='./file0\x00')


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='lfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000000)="ce", 0x1)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xb}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
mknod(&(0x7f0000000180)='./file0\x00', 0x2000, 0x202)
ktrace(&(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
r1 = socket(0x2, 0x1, 0x0)
ioctl$FIOSEEKHOLE(r1, 0xc020690f, &(0x7f0000000180))
connect$inet(r0, &(0x7f00000002c0)={0x2, 0x0}, 0x10)
sendto$inet(r0, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffe1)


mknod(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r0 = socket(0x2, 0x1, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x1002, &(0x7f00000000c0), 0x4)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x20007402, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0xe, r0, &(0x7f0000000240), 0x8)


r0 = compat_30_socket(0x10, 0x2, 0x0)
setsockopt$sock_timeval(r0, 0xffff, 0x4, 0x0, 0x0)


socket(0x2, 0x0, 0x0)
ioctl$FIOSEEKHOLE(0xffffffffffffffff, 0x8020690c, 0x0)
ioctl$FIOSEEKHOLE(0xffffffffffffffff, 0x8090697f, 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)
truncate(&(0x7f0000000440)='./file0\x00', 0x0, 0x3e2b649f)
truncate(&(0x7f0000000780)='./file0\x00', 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
_lwp_unpark_all(&(0x7f0000000040)=[0x0], 0x1, 0x0)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x2001000300000002})
semctl$SETALL(0x0, 0x0, 0x9, &(0x7f0000000040)=[0x7ff])
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_30_getfh(&(0x7f0000000280)='./file0\x00', 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000300), 0xc)
r0 = semget$private(0x0, 0x4000000009, 0x82)
semop(r0, &(0x7f00000000c0)=[{0x4, 0xffff, 0xe5ce97ab354d96be}, {}], 0x2)
semop(r0, &(0x7f0000000480)=[{}, {0x2, 0xe244, 0x1800}], 0x2)
semop(r0, 0x0, 0x0)
getegid()
semctl$IPC_SET(r0, 0x0, 0x1, 0x0)
compat_50_____semctl13$SETALL(r0, 0x0, 0x9, &(0x7f00000000c0)=@array=&(0x7f0000000000)=0xfff7)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, 0x0)
compat_40_mount(0x0, 0x0, 0x0, &(0x7f00000001c0))
open(0x0, 0x0, 0x0)
close(0xffffffffffffffff)


madvise(&(0x7f0000ffa000/0x4000)=nil, 0x4000, 0x1)
mlock(&(0x7f0000ff8000/0x3000)=nil, 0x3000)
mmap(&(0x7f0000ff9000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


compat_90_fhstatvfs1(&(0x7f0000000a80)="c1fc8a554889ce8714b526092e404a4befedc54b78087379a0d72b9f5a128f5631feda309439a9bd3eaaf2bcc1b3bdf3d8f82222303c39e4b5c2fd21700761af5a5c933ddafe7565a51259b26ad0990737060d553333e1a608857599ef35351a17f58e3bc0e202e8296c998086a3d96a6ad08454bc50ad5e4db1ea056a60f9ea75df8c2fb0e77ebc93434f05b87ea5412822188b7a338c321a43ffe5ca030334948428948fafcd87dc923b291771326742441356e226235e000000", 0xbb, &(0x7f00000000c0), 0x0)
mknod(0x0, 0x0, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
_ksem_timedwait(0x0, 0x0)
rasctl(0x0, 0x9, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000cc0))
modctl$MODCTL_UNLOAD(0x2, 0x0)
rasctl(0x0, 0x0, 0x2)
r0 = open$dir(&(0x7f0000000280)='.\x00', 0x0, 0x0)
writev(0xffffffffffffffff, 0x0, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={<r1=>0xffffffffffffffff})
ioctl$FIONREAD(r1, 0x80206979, &(0x7f00000001c0))
symlinkat(0x0, 0xffffffffffffffff, 0x0)
setsockopt$inet_opts(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
mkdir(0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
compat_40_mount(0x0, 0x0, 0x0, 0x0)
symlinkat(0x0, 0xffffffffffffffff, 0x0)
ioctl$FIONREAD(r0, 0x4004667f, &(0x7f00000002c0))


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x8000000000000001})
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x10000000000001}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000180)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
poll(&(0x7f0000000040)=[{r0}], 0x1, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
_lwp_unpark_all(&(0x7f0000000000), 0x36, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
__getdents30(r0, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
rasctl(0x0, 0x9, 0x0)
rasctl(0x0, 0xd39, 0x0)


getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000300)={0x0, <r0=>0x0}, 0xc)
r1 = geteuid()
r2 = semget$private(0x0, 0x4000000009, 0x82)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x83fe})
r3 = socket(0x18, 0x1, 0x0)
connect$unix(r3, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r3, &(0x7f00000000c0), &(0x7f0000000000)=0xffffffffffffff35)
r4 = socket(0x18, 0x2, 0x0)
close(r4)
r5 = socket(0x18, 0x0, 0x0)
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x1fe})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setsockopt(r5, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
connect$unix(r4, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
semop(r2, &(0x7f00000000c0)=[{0x4, 0xffff, 0xe5ce97ab354d96be}, {0x2, 0x2, 0x1800}, {0x2, 0x4, 0x1000}, {0x4, 0x2, 0x800}], 0x4)
semop(r2, &(0x7f0000000480)=[{0x2, 0x1, 0x1000}, {0x4, 0xd, 0x1000}, {0x2, 0xe244, 0x1800}], 0x3)
semop(r2, &(0x7f0000000840)=[{0x1, 0x20, 0x1000}, {0x0, 0xff01, 0x1000}, {0x4, 0x7}, {0x4, 0x6, 0x800}, {0x1, 0xfff7, 0x1800}, {0x1, 0x4}, {0x1, 0x2, 0x1800}, {0x4, 0x3, 0x1000}, {0x3, 0x3, 0x800}], 0x9)
r6 = getegid()
semctl$IPC_SET(r2, 0x0, 0x1, &(0x7f0000000740)={{0x7, 0x0, 0x0, r0, r6, 0x1e9, 0xffff}, 0xc0000, 0xbe3, 0x7})
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0x3, 0x0, 0x0, r1, 0x0, 0xb2}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4})
r7 = socket(0x18, 0x1, 0x0)
connect$unix(r7, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getpgrp()
poll(&(0x7f00000000c0)=[{}, {}], 0x2, 0x0)
getsockname$inet(r7, &(0x7f00000000c0), &(0x7f0000000000)=0x9)
r8 = socket(0x18, 0xc002, 0x0)
poll(&(0x7f0000000180)=[{r8, 0x46}], 0x1, 0x0)
connect$unix(r8, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
socket(0x20, 0x4, 0x81)


mknod(&(0x7f0000000280)='./file0\x00', 0x1ffa, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
r1 = getpid()
fcntl$setown(r0, 0x6, r1)
r2 = open$dir(&(0x7f0000000000)='./file0\x00', 0x1, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
ioctl$FIOSETOWN(r2, 0x8004667c, &(0x7f0000000140))


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mknod(&(0x7f0000000000)='./file0\x00', 0xe015, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = socket$unix(0x1, 0x5, 0x0)
connect$unix(r0, &(0x7f0000000000)=@file={0xd1653077bafa0114, './file0\x00'}, 0xa)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)='@', 0x1)
pathconf(&(0x7f0000000080)='./file0\x00', 0x9)


modctl$MODCTL_UNLOAD(0x2, 0x0)
open$dir(0x0, 0x0, 0x0)
compat_43_osendmsg(0xffffffffffffffff, 0x0, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
link(&(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
symlink(&(0x7f0000000ac0)='./file0\x00', &(0x7f0000000e40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000600)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000f40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000300)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000200)='./file0\x00')


compat_30___fhstat30(&(0x7f00000006c0)={{}, {0x0, 0x0, "fbf6ea69840991b744967e896b5d2079"}}, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
poll(&(0x7f0000000380)=[{}], 0x1, 0xffff)
r0 = socket(0x2, 0x1, 0x0)
r1 = dup(r0)
r2 = fcntl$dupfd(r1, 0x2, 0xffffffffffffffff)
close(r2)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x80044275, &(0x7f0000000080))


symlink(0x0, &(0x7f0000000240)='./file0\x00')
mlockall(0x2)
mprotect(&(0x7f0000029000/0x1000)=nil, 0x1000, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
mlock(&(0x7f0000007000/0xc000)=nil, 0xc000)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(0x0, &(0x7f0000000080)='./file0\x00', 0x954e934714f9a7b2, &(0x7f0000000180), 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x1, 0x10, 0xffffffffffffffff, 0x0, 0x0)
ptrace(0xd, r0, &(0x7f0000000240), 0x8)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
__utimes50(0x0, &(0x7f0000000140))


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104308, &(0x7f00000001c0))


mknod(&(0x7f0000000040)='./bus\x00', 0x100000000205f, 0x2801)
r0 = open(&(0x7f0000000280)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x4004667f, &(0x7f0000000000))


poll(&(0x7f0000000180)=[{}], 0x1, 0xffffffff)


r0 = socket(0x11, 0x3, 0x0)
sendmsg(r0, &(0x7f0000002880)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$inet(0x2, 0x2, 0x0)
close(r0)
r1 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r1, 0x0, 0x200000000000a, &(0x7f0000000040)='\x00', 0x1)
setsockopt$inet_opts(r0, 0x0, 0x9, &(0x7f0000000100)="ea000001", 0x4)


syz_usb_connect$hid(0x0, 0x0, 0x0, &(0x7f0000001980)={0x0, 0x0, 0x0, 0x0, 0x1, [{0x0, 0x0}]})
r0 = socket(0x18, 0x2, 0x0)
recvmmsg(r0, &(0x7f0000000640)={0x0}, 0xfffffffffffffe51, 0x0, 0x0)
readv(r0, &(0x7f0000001480)=[{0x0}], 0x1)


r0 = socket$unix(0x1, 0x5, 0x0)
bind$unix(r0, &(0x7f0000000200)=@file={0xd570d0466b6018f, './file0\x00'}, 0xa)
listen(r0, 0x0)
r1 = socket$unix(0x1, 0x5, 0x0)
connect$unix(r1, &(0x7f0000000000)=@file={0xd1653077bafa0114, './file0\x00'}, 0xa)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
accept(r0, &(0x7f0000000040), &(0x7f00000000c0)=0x8)


r0 = socket(0x18, 0x2, 0x0)
__fstat50(r0, &(0x7f0000000080))
r1 = socket$inet(0x2, 0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
ioctl$FIONREAD(r1, 0xc0106926, &(0x7f0000000080))


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
mknodat(0xffffffffffffff9c, &(0x7f0000000100)='./file0\x00', 0x8000, 0x0)
mknod(&(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x0)
open$dir(&(0x7f0000000140)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x0)
rename(&(0x7f0000000380)='./file0\x00', &(0x7f0000000080)='./bus\x00')


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x144, r0)
paccept(0xffffffffffffffff, 0x0, 0x0, 0x7a6464e84b1eab2c)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
chflags(&(0x7f0000000140)='./file0\x00', 0x20000)
unlink(&(0x7f0000000000)='./file0\x00')


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180))
r1 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r1, 0x40046678, &(0x7f0000000180))


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
ptrace(0x1000000000000027, r0, 0x0, 0xd40)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
ioctl$WSMUXIO_INJECTEVENT(r0, 0x80185728, &(0x7f00000001c0))


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000280)={<r0=>0xffffffffffffffff})
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f0000000100)={0x0, 0x0, <r1=>0x0}, &(0x7f00000000c0)=0xffffffffffffff76)
ktrace(0x0, 0x4, 0x20001410, 0x0)
setregid(0x0, r1)
r2 = getpid()
ktrace(&(0x7f0000000180)='./file0\x00', 0x4, 0x532, r2)


compat_43_ommap(&(0x7f0000ffd000/0x2000)=nil, 0x7f7ff7eee000, 0x0, 0x2, 0xffffffffffffffff, 0x0)


msgrcv(0x0, 0x0, 0x0, 0x0, 0x0)
r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x25, &(0x7f0000000000)="5ab7776a", 0x4)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000140), 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000040)='./file0/../file0\x00')


shmat(0x0, &(0x7f0000bfd000/0x400000)=nil, 0x0)
getgid()
r0 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000fc6000/0x1000)=nil)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000200)={<r1=>0xffffffffffffffff})
getsockopt$sock_cred(r1, 0xffff, 0x1022, &(0x7f00000000c0)={0x0, 0x0, <r2=>0x0}, &(0x7f0000000100)=0xc)
setregid(0x0, r2)
setgroups(0x1, &(0x7f0000000100)=[r2])
setreuid(0xee00, 0x0)
r3 = getuid()
setreuid(0xee00, r3)
shmat(r0, &(0x7f0000bfe000/0x3000)=nil, 0x0)


mknod(&(0x7f0000000040)='./bus\x00', 0x2000, 0x205310)
r0 = open(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$KDSETMODE(r0, 0x20004b0a)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x144, r0)
compat_30_getfh(0x0, 0x0)
rmdir(0x0)


rasctl(0x0, 0x0, 0x1)


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000280), 0x1000000000000329)
r1 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x1, 0x10, r1, 0x0, 0x0)
modctl$MODCTL_STAT(0x4, &(0x7f0000000540)={0x0})


mknod(&(0x7f0000000040)='./bus\x00', 0x100000000205f, 0x2802)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80047458, &(0x7f00000001c0))


open(&(0x7f0000000040)='./file0\x00', 0x200, 0x0)
open(0x0, 0x0, 0x0)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x10a0008, 0x0)
modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000080)="f7f18c4b0d602d76648e1e31046b3d")
modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0xc1})
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r1 = open(0x0, 0x0, 0x0)
symlink(0x0, 0x0)
ioctl$FIONREAD(r1, 0x8010427f, &(0x7f0000000080))
mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r2 = open$dir(&(0x7f0000000b80)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(0xffffffffffffffff, 0x4004667f, 0x0)
pread(r2, &(0x7f0000000000)="a6", 0x1, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x5, 0x10, r0, 0x0, 0x0)
compat_30_socket(0x0, 0x0, 0x0)
ioctl$WSMOUSEIO_SRES(0xffffffffffffffff, 0x80045721, &(0x7f0000003440))
madvise(&(0x7f000000c000/0x2000)=nil, 0x2000, 0x0)
compat_40_mount(0x0, &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
setuid(0xee01)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
setrlimit(0x0, 0x0)
poll(0x0, 0x0, 0x0)
r3 = socket(0x18, 0x3, 0x0)
setsockopt(r3, 0x1000000029, 0x25, &(0x7f0000000000)="5ab7776a", 0x4)
setsockopt$sock_int(r3, 0xffff, 0x800, &(0x7f0000000180)=0x2, 0x4)
recvmsg(r3, &(0x7f0000000380)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000340)=""/27, 0x1b}, 0x0)
syz_emit_ethernet(0xfffffffffffffec3, 0x0)
mlock(&(0x7f0000ffb000/0x4000)=nil, 0x4000)


compat_40_mount(&(0x7f0000000200)='ptyfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000a80))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
open(&(0x7f0000000040)='./file0\x00', 0x20, 0x0)


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__posix_chown(&(0x7f0000000000)='./file0\x00', 0xffffffffffffffff, 0x0)


mkdir(&(0x7f00000007c0)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
pathconf(&(0x7f0000000140)='./file0\x00', 0x7)


mkdir(0x0, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
r0 = open$dir(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
preadv(r0, &(0x7f00000012c0)=[{0x0, 0x1b}], 0x1, 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt$inet_opts(r0, 0x6, 0x3, 0x0, 0x0)


mknod(&(0x7f0000000280)='./file0\x00', 0x1ffa, 0x0)
__clone(0x0, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x2, 0x0)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x10, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
__fstat50(r0, &(0x7f0000000280))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
chroot(&(0x7f0000000000)='./file0\x00')
ptrace(0x0, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x5, 0x10, r0, 0x0, 0x0)
fcntl$lock(0xffffffffffffffff, 0x8, &(0x7f0000000180))
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)


chroot(&(0x7f0000000000)='.\x00')
modctl$MODCTL_UNLOAD(0x2, 0x0)
ptrace(0x24, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', 0x0)
ioctl$FIONREAD(r0, 0x8010426d, &(0x7f0000000080))


socket(0x11, 0x3, 0x0)
r0 = socket(0x1, 0x2, 0x0)
r1 = socket(0x11, 0x3, 0x0)
r2 = dup2(r1, r0)
shutdown(r2, 0x2)
msgctl$IPC_SET(0xffffffffffffffff, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0x0, 0x0, 0x0, 0x1a}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xfffffffffffffffd})
r3 = socket(0x18, 0x1, 0x0)
connect$unix(r3, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
socket(0x18, 0x1, 0x0)
close(0xffffffffffffffff)
r4 = getuid()
r5 = getgid()
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f00000000c0)={{0x0, r4, 0x0, 0x0, r5, 0x0, 0xe}})
getsockname$inet(r3, &(0x7f00000000c0), &(0x7f0000000140)=0xc)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f00000000c0), 0x1c, 0x0}, 0x0)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r6 = socket(0x18, 0x400000002, 0x0)
r7 = socket(0x18, 0x1, 0x0)
r8 = dup2(r6, r7)
sendmsg$unix(r8, &(0x7f0000001700)={0x0, 0x0, 0x0, 0x0, 0x0, 0xfdc3}, 0x0)


modctl$MODCTL_LOAD(0x5, 0x0)
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
link(&(0x7f0000000940)='./file0\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000240)='./file0\x00', &(0x7f0000000640)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


link(&(0x7f0000000100)='./file0\x00', 0x0)
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x12, r0, &(0x7f0000000240), 0x8)


ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
_lwp_unpark_all(0x0, 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000029, 0xb, 0x0, 0x0)


chroot(&(0x7f0000000000)='.\x00')
mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x48087448, 0x0)


pipe2(&(0x7f0000000000), 0x0)
r0 = open(&(0x7f0000000340)='./file0\x00', 0x300, 0x0)
flock(r0, 0x1)
r1 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r1, 0x8, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})
r2 = open(&(0x7f00000000c0)='./file0\x00', 0x201, 0x0)
r3 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
r4 = getpid()
fcntl$lock(r3, 0xe, &(0x7f0000000040)={0x3, 0x0, 0xffffffffffffffff, 0x100000006, r4})
fcntl$lock(r2, 0x9, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x1000300010005})
flock(r0, 0x8)


munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
mprotect(&(0x7f0000ffc000/0x1000)=nil, 0x1000, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
lchown(&(0x7f0000000100)='./file0\x00', 0xffffffffffffffff, 0x0)


__mount50(&(0x7f0000000280)='ext2fs\x00', &(0x7f00000002c0)='.\x00', 0x0, 0x0, 0x0)


_lwp_create(&(0x7f0000000300)={0x0, &(0x7f00000001c0)={0x0, &(0x7f0000000140)={0x0, 0x0, {}, {}, {0x0, 0x0, '\x03\x00'}}, {}, {}, {0x0, 0x0, ':^\x00'}}, {}, {}, {0x0, 0x0, '\x00'}}, 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r1 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r1, 0x80047401, &(0x7f0000000000))
ioctl$FIOASYNC(r0, 0x80104303, &(0x7f00000001c0))


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = socket(0x1f, 0x3, 0x0)
setsockopt(r0, 0x0, 0x0, 0x0, 0x0)


setsockopt(0xffffffffffffffff, 0x0, 0x0, &(0x7f00000000c0)="b211d7170d816685c8e360f2fa41c1a0946988b272d2dd3dc90142a84231a746e337b372e93320cff6669cbe7868de45ed3fc33719ca6df71ecec8a918458b2c10a1f8c66653b276e180e9cb9b21f9982230f575295d48889c9a920796b2dd92fc8575680b37ba955d2c15e6d7c9198ed900ab006ddfb67869b51a2216114d1ece85f593e74035f5", 0x88)
open(&(0x7f0000000100)='./file0\x00', 0x615, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000140)={0x0, 0x0, 0x1000000, 0x100000002})


__clock_settime50(0x0, &(0x7f0000000000)={0x20bce330})


__mount50(&(0x7f0000000c00)='kernfs\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
unlink(&(0x7f0000000080)='./file0\x00')


mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0xa718)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
r0 = socket(0x800000018, 0x2, 0x0)
r1 = socket(0x18, 0x1, 0x0)
r2 = dup2(r1, r0)
getsockopt$sock_cred(r2, 0xffff, 0x1022, 0x0, 0x0)
r3 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r3, 0x8030447c, &(0x7f00000001c0))


setuid(0xee01)
r0 = getpid()
ktrace(0x0, 0x0, 0x0, r0)
compat_40_mount(0x0, 0x0, 0x0, 0x0)
r1 = shmget$private(0x0, 0x12000, 0x0, &(0x7f00002b9000/0x12000)=nil)
compat_50___shmctl13$IPC_STAT(r1, 0x2, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000300)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000440)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
mknod(&(0x7f0000000100)='./file0\x00', 0x0, 0x5300)
msgctl$IPC_SET(0x0, 0x1, &(0x7f0000000200)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xfffffffffffffffc})
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x45d48)
open(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)


compat_50_setitimer(0x300, 0x0, &(0x7f0000000040))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
open(0x0, 0x0, 0x0)
pathconf(&(0x7f00000000c0)='./file0\x00', 0x1)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)="82", 0x1)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
__utimes50(&(0x7f0000000340)='./file0\x00', 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)


semctl$IPC_SET(0x0, 0x0, 0x1, 0x0)
pipe(&(0x7f0000000080)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0x62e2dd08f149ff1b, r1)
fchdir(0xffffffffffffffff)
read(0xffffffffffffffff, 0x0, 0x0)
modctl$MODCTL_LOAD(0x2, 0x0)
r2 = msgget$private(0x0, 0x0)
msgsnd(r2, &(0x7f0000000040)=ANY=[@ANYRESDEC], 0x0, 0x0)
msgrcv(r2, &(0x7f0000000200)={0x0, ""/41}, 0x31, 0x0, 0x0)


chroot(&(0x7f0000000000)='.\x00')
compat_20_getfsstat(&(0x7f0000000000), 0xffffffffffffff8e, 0x0)


ktrace(0x0, 0x5, 0x0, 0xffffffffffffffff)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000040)='./bus\x00')
mknod(&(0x7f0000000140)='./file0\x00', 0x2000, 0x0)
compat_30_getdents(r0, &(0x7f0000000000), 0x8001)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
ptrace(0x0, 0x0, 0x0, 0x0)
ptrace(0x7, r0, 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
preadv(r0, &(0x7f0000000100)=[{0x0}], 0x1, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
__posix_fadvise50(r0, 0x0, 0x0, 0x6d7, 0x0)


setreuid(0x0, 0xee01)
compat_43_osethostname(&(0x7f00000005c0)="e5", 0x1)


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
chmod(&(0x7f0000000080)='./file0\x00', 0x439)
writev(r0, &(0x7f0000000180)=[{&(0x7f00000001c0)='#!', 0x2}], 0x1)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
unlink(&(0x7f0000000000)='./file0\x00')


open(&(0x7f0000000340)='./file0\x00', 0x300, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000340))


ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
_ksem_timedwait(0x0, &(0x7f0000000000)={0x0, 0x80000001})


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f00000001c0)="01")
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
symlink(&(0x7f0000000ac0)='./file0\x00', &(0x7f0000000e40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000600)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000f40)='./file0\x00')
link(&(0x7f0000000440)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/file0\x00', 0x0)


r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(0x0, 0x0, 0x0, 0x0)
compat_43_orecvmsg(r0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x0)


mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0xc0e99db6de761f86, 0x0)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
ioctl$FIONREAD(r0, 0x40046679, &(0x7f0000000140))


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0x8004667d, &(0x7f0000000100))


r0 = socket(0x1f, 0x1, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180)=0x8000000000000032)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
ptrace(0x17, r0, 0x0, 0x0)


syz_emit_ethernet(0xe, &(0x7f0000000000))
writev(0xffffffffffffffff, &(0x7f0000000580)=[{&(0x7f0000000000)="b886b4e47f", 0x5}], 0x1)
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r0 = socket(0x2, 0x1, 0x0)
bind(r0, &(0x7f0000000000), 0x10)
r1 = dup(r0)
listen(r1, 0x0)
r2 = socket(0x2, 0x1, 0x0)
fcntl$setstatus(r2, 0x4, 0x40)
connect$inet(r2, &(0x7f0000000000), 0x10)
shutdown(r2, 0x2)


r0 = socket(0x10, 0x2, 0x0)
fcntl$setflags(r0, 0x10, 0x1000000000000)


setregid(0x0, 0xee01)
r0 = getegid()
setregid(0x0, r0)


__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
pathconf(&(0x7f0000000040)='./file0\x00', 0xd)


open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x0, 0x10, r0, 0x0, 0x0)
compat_30___fhstat30(&(0x7f00000006c0)={{}, {0x0, 0x0, "fbf6ea69840991b744967e896b5d2079"}}, 0x0)
minherit(&(0x7f0000003000/0x2000)=nil, 0x2000, 0x0)


mknod(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x1f, 0x40000003, 0x0)
setsockopt$sock_timeval(r0, 0xffff, 0x100b, &(0x7f0000000200), 0x10)


socket$inet(0x2, 0x0, 0x0)
setsockopt$sock_linger(0xffffffffffffffff, 0xffff, 0x80, &(0x7f0000000080)={0x4}, 0x8)
syz_emit_ethernet(0x46, &(0x7f00000001c0))
setsockopt(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
r0 = open$dir(0x0, 0x0, 0x0)
getpid()
r1 = open$dir(&(0x7f0000000100)='.\x00', 0x0, 0x0)
mkdirat(r1, &(0x7f0000000a80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)
open$dir(&(0x7f0000000b80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x200, 0x0)
r2 = open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000040)='./file0\x00', r2, &(0x7f00000004c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
symlinkat(&(0x7f0000000000)='./file0\x00', r2, &(0x7f0000000200)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
symlinkat(&(0x7f0000000800)='./file0\x00', r2, &(0x7f0000000400)='./file1\x00')
fcntl$setflags(r0, 0x2, 0x1)
rename(&(0x7f0000000100)='./file1\x00', &(0x7f0000000300)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
mkdirat(0xffffffffffffffff, &(0x7f00000000c0)='./file0\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
mkdir(&(0x7f0000000080)='./file0/file0\x00', 0x183)
rename(&(0x7f0000000080)='./file0\x00', &(0x7f00000000c0)='./file0\x00')
syz_emit_ethernet(0x3e, &(0x7f0000000040))


__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0986981, &(0x7f0000000180))
pathconf(&(0x7f0000000040)='./file0\x00', 0x2)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
recvfrom$unix(r0, &(0x7f00000000c0), 0x832f1f7d, 0x0, &(0x7f0000000000)=@abs, 0x2000c600)
close(r0)
shutdown(r1, 0x2)


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = open$dir(&(0x7f0000000b80)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(0xffffffffffffffff, 0x4004667f, 0x0)
pread(r0, &(0x7f0000000000)="a6", 0x1, 0x0)


compat_40_mount(&(0x7f0000000200)='ptyfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000a80))
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, 0x0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x40044274, &(0x7f0000000080))


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x10000000000001}})
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0xffffffffffffffff})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x1, 0x1, 0x0)
close(r0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000000029, 0x2a, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
setsockopt(r1, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
write(r0, 0x0, 0x0)


undelete(&(0x7f0000000040)='.\x00')


r0 = msgget$private(0x0, 0x100)
msgrcv(r0, 0x0, 0x0, 0x0, 0x0)
msgctl$IPC_STAT(r0, 0x2, &(0x7f0000000040)=""/56)
r1 = socket$inet(0x2, 0x1, 0x0)
msgrcv(r0, &(0x7f0000000840), 0xfa, 0x0, 0x0)
getsockopt(r1, 0x0, 0x5, 0x0, 0x0)
getsockopt$SO_PEERCRED(r1, 0xffff, 0x1022, &(0x7f0000000080), 0xc)
socket(0x18, 0x3, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x1, 0x1012, 0xffffffffffffffff, 0x0, 0x0)
r2 = socket(0x18, 0x3, 0x0)
setsockopt(r2, 0x1000000029, 0x27, &(0x7f0000000040)="5ab7736a", 0x4)


compat_43_osendmsg(0xffffffffffffffff, &(0x7f0000000000), 0x0)


compat_40_mount(&(0x7f0000000200)='procfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
fcntl$getown(r0, 0x10)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)='@', 0x1)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000080)='./file0\x00', 0x2)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)


fcntl$lock(0xffffffffffffffff, 0x9, 0x0)


msgctl$IPC_SET(0x0, 0x1, 0x0)
socket(0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r0 = semget$private(0x0, 0x1, 0x1)
compat_50_____semctl13$SETALL(r0, 0x0, 0x9, &(0x7f0000001640)=@val=0xffff0000)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
utimensat(0xffffffffffffffff, 0x0, &(0x7f0000002540), 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
unlink(&(0x7f0000000040)='./bus/\x00')


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)='@', 0x1)
pathconf(&(0x7f0000000080)='./file0\x00', 0xa)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000140)='./file0\x00', 0x0, 0x0)
fdatasync(r0)


mknod(&(0x7f0000000180)='./file0\x00', 0x2000, 0x0)
compat_50_select(0x3, &(0x7f0000000140), &(0x7f0000000300)={0x8}, 0x0, 0x0)


fstatat(0xffffffffffffffff, &(0x7f00000000c0)='./file1\x00', 0x0, 0x0)


r0 = _lwp_self()
_lwp_suspend(r0)
_lwp_wait(r0, 0x0)
_lwp_wakeup(r0)


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
write(r0, 0x0, 0x0)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0xc020447f, &(0x7f0000000100)=0xffffffff)


chroot(&(0x7f0000000000)='.\x00')
__clone(0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, &(0x7f0000000000))


semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000000)={{0x0, 0xffffffffffffffff}, 0x0, 0x0, 0x1})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0xe, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0xc0145002, &(0x7f0000000180)={0x0, 0x0})


r0 = socket$inet(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x0}, 0x10)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x0}, 0x10)
compat_43_osend(r0, 0x0, 0x0, 0xb)


r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ioctl$FIONREAD(r0, 0x4004667f, &(0x7f0000000100))


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x8000000000000, 0x300100000})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x10000000000002}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
close(r0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={0x0, 0xdffffffffffff7ff}})
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104302, &(0x7f00000001c0))


compat_50_setitimer(0x0, &(0x7f0000000080), &(0x7f00000000c0))


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
compat_50___lstat30(0x0, 0x0)


mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
munmap(&(0x7f0000ffd000/0x1000)=nil, 0x1000)
munlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)


setregid(0xee00, 0xee01)
setgid(0x0)


r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
pwritev(r0, &(0x7f0000000380)=[{&(0x7f0000000440)="e9", 0x1}], 0x1, 0x3fffd)
r1 = open$dir(&(0x7f00000003c0)='./file0\x00', 0x0, 0x0)
preadv(r1, &(0x7f00000004c0)=[{&(0x7f00000000c0)=""/99, 0xfffffd23}], 0x1, 0x0)
truncate(&(0x7f0000000140)='./file0\x00', 0x30001, 0x0)
sendmsg(0xffffffffffffffff, 0x0, 0x0)
r2 = open(&(0x7f0000000080)='./file1\x00', 0x200, 0x0)
mmap(&(0x7f0000011000/0x2000)=nil, 0x2000, 0x0, 0x10, r2, 0x0, 0x0)
mprotect(&(0x7f000000e000/0x4000)=nil, 0x4000, 0x3)
mlock(&(0x7f000000f000/0x4000)=nil, 0x4000)


writev(0xffffffffffffffff, &(0x7f0000000240)=[{&(0x7f0000000140)='\x00', 0x1}], 0x1)
r0 = socket$inet(0x2, 0x2, 0x0)
r1 = socket$inet(0x2, 0x2, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
setsockopt$inet_opts(r1, 0x0, 0x9, &(0x7f0000000240)="ea00000100000000", 0xc)
dup2(r1, r0)
setsockopt$inet_opts(r0, 0x0, 0x200000000000c, &(0x7f0000000240)="ea0071db0000e0b8", 0x8)


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
mkdirat(0xffffffffffffffff, &(0x7f0000000000)='./file0\x00', 0x0)
swapctl$SWAP_ON(0x2000, &(0x7f0000000000), 0x0)


r0 = socket(0x2, 0x2, 0x0)
connect$unix(r0, &(0x7f0000000a80), 0x10)
getsockname$unix(r0, &(0x7f0000000000)=@abs, &(0x7f0000001200)=0x8)
r1 = socket(0x2, 0x1, 0x0)
bind(r1, &(0x7f0000000000), 0x10)
setsockopt$sock_int(r1, 0xffff, 0x1, &(0x7f0000000040)=0xaa3, 0x4)
r2 = dup(r1)
listen(r2, 0x0)
r3 = socket(0x2, 0x1, 0x0)
connect$unix(r3, &(0x7f0000000000), 0x10)
pipe(&(0x7f0000000100)={0xffffffffffffffff, <r4=>0xffffffffffffffff})
poll(&(0x7f0000000200)=[{}, {r4}], 0x2, 0x0)
poll(&(0x7f0000000040)=[{}, {r4}], 0x2, 0x0)
ioctl$WSDISPLAYIO_LDFONT(r4, 0x8058574d, &(0x7f00000000c0)={&(0x7f0000000000), 0x1, 0x7, 0x0, 0x1, 0xe2, 0x1f, 0x6, 0x0, 0xfffffffffffffffe})
sendto$inet(r3, &(0x7f0000000080)="f3", 0x1, 0x195a05e282d6161, 0x0, 0x0)
recvmmsg(r1, &(0x7f0000000480)={&(0x7f00000001c0)={&(0x7f0000000140), 0xc, &(0x7f0000000500)=[{&(0x7f0000000180)=""/15, 0xf}, {&(0x7f0000001240)=""/4096, 0x1000}, {&(0x7f0000000280)=""/225, 0xe1}, {&(0x7f0000002240)=""/4096, 0x1000}, {&(0x7f0000000380)=""/196, 0xc4}], 0x5, &(0x7f0000000580)=""/204, 0xcc}, 0x9}, 0x10, 0x841, &(0x7f0000000680)={0x1000, 0x8})
r5 = accept$unix(r1, 0x0, 0x0)
r6 = socket(0x18, 0x2, 0x0)
connect$unix(r6, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r6, &(0x7f00000011c0)={0x0, 0x0, &(0x7f0000000e00)=[{0x0}], 0x1, &(0x7f0000000e80)=[{0x10}], 0x10}, 0x0)
r7 = socket(0x18, 0x2, 0x0)
connect$unix(r7, 0x0, 0x0)
sendmsg(0xffffffffffffffff, 0x0, 0x0)
accept$unix(0xffffffffffffffff, 0x0, 0x0)
pwritev(0xffffffffffffffff, &(0x7f00000008c0)=[{&(0x7f0000000740)="5df776e48b1e16d6bd41569b8c61dd02966d31e5fb87a486a63ff9aa68b719bf4ecfd2354adbc7e8855c53f090956e56a6f4ef78ece2df6c4130c2d90ef0caa7e3bdf7dc3418c6806de3f5b0a6a0447d21f59f92c88b53382a2aa621ffd57a0fd308679697bbf57b24d8207ec3f5a35473470b02d62941cd772bdceb90912f04ef29c8096610c35f5b47caef2fe0376f63924cb284a8c62fe13ea45153bf02c0eaa66a8c5a19", 0xa6}, {&(0x7f0000000800)="b8797e5443050bd851629cba127d94d03547c0376a65c4a0e7259cfdded14edeb431e4577126e4061ab790d5a77a03e6589b914b66185c48859a7010924e9130f7393ade111cee14d7fe70c0d5ea0b56032f976113e9cd1b865c13302eda4b80882557f8867a2b5d13cff8343f7cf53dad7c703c66925ca496a382b69066a3ac30740dfb8badbea5b6176a2a9964b819ddecd7", 0x93}], 0x2, 0xffffffffffff91a3)
recvmmsg(r5, &(0x7f00000004c0)={0x0}, 0x10, 0x41, 0x0)
recvmmsg(r5, &(0x7f0000000240)={0x0}, 0x10, 0x1, 0x0)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
execve(&(0x7f0000000280)='./file0\x00', 0x0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open$dir(&(0x7f0000000b80)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40044266, &(0x7f0000000000))


mknod(&(0x7f0000000280)='./file0\x00', 0x6000, 0x500)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40047481, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x3}, 0x8, &(0x7f00000038c0)}, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
chmod(&(0x7f0000000080)='./file0\x00', 0x439)


__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
compat_43_ommap(&(0x7f0000ffc000/0x2000)=nil, 0x2000, 0x0, 0x1082, 0xffffffffffffffff, 0x0)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0xc020447f, &(0x7f0000000100))


msgctl$IPC_SET(0x0, 0x1, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
rmdir(&(0x7f0000000000)='./file0\x00')


semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x200}})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f00000000c0), 0x1c, 0x0}, 0x0)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x400000002, 0x0)
setsockopt(r0, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
r1 = socket(0x18, 0x1, 0x0)
r2 = dup2(r0, r1)
sendmsg$unix(r2, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)


mknod(&(0x7f0000001200)='./file0\x00', 0x2000, 0x400)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x2, 0x0)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x10, 0x0)


rasctl(&(0x7f0000001400), 0xd39, 0x0)
rasctl(0x0, 0x9, 0x0)
rasctl(0x0, 0x0, 0x2)


compat_60__lwp_park(&(0x7f0000000040)={0x0, 0x80000000}, 0x0, 0x0, 0x0)


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
symlink(&(0x7f0000000080)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setrlimit(0x0, &(0x7f0000000080))


swapctl$SWAP_CTL(0x5, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x10800039, &(0x7f0000000140)="01")
rename(&(0x7f0000000d80)='./file0\x00', &(0x7f0000000180)='./file1\x00')


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
swapctl$SWAP_ON(0x1, &(0x7f0000000000), 0x0)


mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000680)=[{&(0x7f00000001c0)="ee", 0x1}], 0x1)
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, 0xffffffffffffffff)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)='@', 0x1)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
rmdir(&(0x7f0000000000)='./file0\x00')


modctl$MODCTL_LOAD(0x5, 0x0)
ktrace(0x0, 0x0, 0x0, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x10, 0xffffffffffffffff, 0x0, 0x0)
msgget(0x3, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x40000802)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80017472, &(0x7f0000000040))
r1 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r1, 0x80017472, &(0x7f0000000040))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
pathconf(&(0x7f0000000080)='./file0\x00', 0xb)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000200)='ext2fs\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
semctl$SETALL(0x0, 0x0, 0x9, 0x0)
getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x1022, 0x0, 0x0)
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000240)={{0x8001, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x21, 0x2}, 0x0, 0x9, 0xe7})
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000080)={{0x25, 0x0, 0x0, 0x0, 0x0, 0x80, 0x6}, 0x4, 0x5, 0xffffffff})
getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f00000000c0), &(0x7f0000000240)=0xc)
getuid()
getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000240), &(0x7f0000000280)=0xc)
r0 = msgget$private(0x0, 0xffffffffffffffe5)
msgsnd(r0, &(0x7f0000000400)=ANY=[@ANYRESHEX, @ANYRES32], 0x15, 0x0)
msgsnd(r0, &(0x7f0000000440)={0x3, "a486714b3b6964c6224c6d3e16e0d3d8edebe56ff5cc0190d7f39c044dac99fec5afca3ec3e155903698d635e2ab348195cce43ab9e134935e4edf5efe4e5ec4bec02d51201f93b9860f69d58fca21e1f36041df344b049af83f321177b2fdcfcc2725691dc0"}, 0x6e, 0x800)
msgrcv(r0, &(0x7f00000012c0)={0x0, ""/4096}, 0x1008, 0x1, 0x800)


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unmount(&(0x7f0000000200)='./file0/../file0\x00', 0x0)


pipe(0x0)
syz_emit_ethernet(0xe, &(0x7f0000000000))
socket(0x0, 0x0, 0x0)
writev(0xffffffffffffffff, &(0x7f0000000580)=[{&(0x7f0000000000)="b886b4e47f", 0x5}], 0x1)
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r0 = socket(0x2, 0x1, 0x0)
bind(r0, &(0x7f0000000000), 0x10)
r1 = dup(r0)
listen(r1, 0x0)
socket(0x0, 0x1, 0x0)
listen(0xffffffffffffffff, 0x0)
r2 = socket(0x2, 0x1, 0x0)
setsockopt$sock_int(r2, 0xffff, 0x1, 0x0, 0x0)
connect$unix(r2, &(0x7f0000000000), 0x10)
dup2(r2, r2)
accept$inet(r0, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
compat_50_select(0x190, 0x0, &(0x7f0000000040), 0xffffffffffffffff, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r0 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
readv(r0, &(0x7f0000000400)=[{0x0}], 0x1)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
open$dir(&(0x7f0000000000)='./file0\x00', 0x40000400001803c1, 0x0)
truncate(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r1 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x40000530, r1)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000340)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
sendmmsg(r2, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)
setsockopt$sock_timeval(r2, 0xffff, 0x0, 0x0, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
ftruncate(r0, 0x0, 0x10000)


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})
r1 = open(&(0x7f0000000040)='./file0\x00', 0x205, 0x0)
fcntl$lock(r1, 0x8, &(0x7f00000001c0)={0x0, 0x0, 0x8, 0x1000300010008})
r2 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r2, 0x9, &(0x7f0000000140)={0x0, 0x0, 0xfff, 0x100000002})


mknod(0x0, 0x8001420, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000100)='./file0\x00', 0x100)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
mknod(0x0, 0x0, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff})
ioctl$FIOASYNC(r0, 0x80047401, &(0x7f0000000000))


r0 = msgget$private(0x0, 0x40)
msgctl$IPC_SET(r0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0xffffffffffffffff}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7fffffffffffffff})
r1 = socket(0x800000018, 0x1, 0x0)
bind$unix(r1, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
r2 = socket(0x800000018, 0x1, 0x0)
sendmmsg(0xffffffffffffffff, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)
bind$unix(r2, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
r3 = socket(0x18, 0x2, 0x0)
connect$unix(r3, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r3, &(0x7f0000000080), &(0x7f0000000000)=0xfffffffffffffe22)
r4 = socket(0x2, 0x1, 0x0)
getsockopt$sock_int(r4, 0xffff, 0x1003, &(0x7f0000000080), &(0x7f00000000c0)=0x4)
socket(0x800000018, 0x1, 0x0)
r5 = socket(0x18, 0x2, 0x0)
getsockname$inet(r5, &(0x7f0000000080), &(0x7f0000000040)=0xc)
r6 = socket(0x18, 0x1, 0x0)
bind$unix(r6, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000280)=[{0x0}], 0x1)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
open$dir(&(0x7f0000000040)='./file0\x00', 0x20, 0x0)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x8000000000000001})
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x7fffffff, 0x0, {0x0, 0x3}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f00000000c0)="ebffcbff13b9fd812eaa4e713a48e69931929648", 0x14)
r1 = socket$inet(0x2, 0x1, 0x0)
dup2(r1, r0)
getrlimit(0x0, 0xffffffffffffffff)


mkdir(0x0, 0x0)
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104306, &(0x7f00000001c0))


munmap(&(0x7f0000ffc000/0x4000)=nil, 0x4000)
r0 = shmget$private(0x0, 0x4000, 0x0, &(0x7f0000ffb000/0x4000)=nil)
shmat(r0, &(0x7f0000ffc000/0x3000)=nil, 0x0)
mlock(&(0x7f0000ffc000/0x1000)=nil, 0x1000)
madvise(&(0x7f0000ffc000/0x2000)=nil, 0x2000, 0x4)


setsockopt$inet6_MRT6_DEL_MFC(0xffffffffffffffff, 0x29, 0x69, &(0x7f0000000180)={{0x18, 0x3, 0x0, 0x3d2}, {0x18, 0x2}}, 0x3c)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmmsg(r0, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)


writev(0xffffffffffffffff, &(0x7f0000000240)=[{&(0x7f0000000140)='\x00', 0x1}], 0x1)
r0 = socket$inet(0x2, 0x2, 0x0)
r1 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r1, 0x0, 0x9, &(0x7f0000000240)="ea00000100000000", 0xc)
dup2(r1, r0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setsockopt$inet_opts(r0, 0x0, 0x200000000000c, &(0x7f0000000240)="ea00000100000000", 0x8)
setsockopt$inet_opts(r1, 0x0, 0xd, &(0x7f0000000240)="ea08000000000000", 0x8)


compat_20_getfsstat(&(0x7f0000000000), 0xffffffffffffff39, 0x0)
setsockopt(0xffffffffffffffff, 0x0, 0x0, &(0x7f0000000000)="ebff13fd", 0x4)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104308, &(0x7f00000001c0)=0x20000002)


r0 = socket$inet6(0x18, 0x3, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x1008, &(0x7f0000000640), &(0x7f0000000680)=0x4)


getgid()
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0x0, 0x0, 0x0, 0xb2}})
r0 = socket(0x18, 0x1, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r0, &(0x7f00000000c0), &(0x7f0000000000)=0xffffffffffffff35)
r1 = socket(0x18, 0x2, 0x0)
r2 = socket(0x18, 0x2, 0x0)
setsockopt(r2, 0x1000000000029, 0xa, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
dup2(r2, r1)
setsockopt$sock_timeval(0xffffffffffffffff, 0xffff, 0x1005, &(0x7f00000000c0)={0xfffe0000000000, 0x7}, 0x10)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r1, &(0x7f0000000ec0)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x0)
r3 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r3, 0x0, 0x4, &(0x7f0000000000)='\x00\x00\x00\x00', 0x4)
getgid()
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000340)={<r4=>0xffffffffffffffff})
getsockopt$sock_cred(r4, 0xffff, 0x1022, &(0x7f0000000100)={0x0, <r5=>0x0, <r6=>0x0}, &(0x7f0000000180)=0xc)
chown(&(0x7f0000000080)='./file0\x00', 0x0, r6)
fchown(r4, r5, r6)
getegid()
setgroups(0x0, 0x0)
r7 = semget$private(0x0, 0x4, 0x100)
semctl$IPC_STAT(r7, 0x0, 0x2, &(0x7f00000000c0)=""/55)


r0 = socket(0x2, 0x0, 0x0)
ktrace(&(0x7f0000000140)='./file1\x00', 0x0, 0x0, 0x0)
symlink(&(0x7f00000000c0)='.\x00', &(0x7f0000000000)='./file0\x00')
rename(&(0x7f00000002c0)='./file0\x00', &(0x7f0000000340)='./file0/../file0\x00')
r1 = socket(0x11, 0x3, 0x3)
r2 = getpgrp()
fcntl$setown(r0, 0x6, r2)
sendto$unix(r1, &(0x7f0000000280)="b1000501600000000000000007000000331c13fecea10500fef96ecfc72fd3357ae320b37b673039d2d236acf20b7804be38164991f7c8cf5f882b297be1aa5b236deb51e2f0ac3ebbc2576b9a5f139b672f4d335d223e7d026ba8af630037282102000000720fd38bfbb770c1f5a872c881ea6e69e0bb76d907c400000200361b1257aea8c500002002fb00000000008abfba09000000ec1d89e000040781e4b2fff040ff00"/177, 0xb1, 0x0, 0x0, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x710, 0x1b2)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x1, 0x0)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x1, 0x0)
truncate(&(0x7f0000000040)='./file0\x00', 0x30001, 0x0)
r1 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000340)=[{&(0x7f0000000240)=""/229, 0x6}, {&(0x7f0000000140)=""/139, 0xffffffbc}], 0x2, 0x0)


recvmmsg(0xffffffffffffffff, 0x0, 0x0, 0x0, &(0x7f0000000380)={0x9})
r0 = socket$inet6(0x18, 0x2, 0x0)
recvmmsg(r0, &(0x7f00000002c0)={0x0}, 0x10, 0x0, 0x0)
recvmmsg(r0, &(0x7f00000001c0)={0x0}, 0x10, 0x1002, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x1803)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
read(r0, &(0x7f00000002c0)=""/82, 0x52)


compat_20_statfs(0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
close(r0)
r1 = socket(0x18, 0x2, 0x0)
readv(0xffffffffffffffff, 0x0, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r0, &(0x7f0000000180)={0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)=ANY=[], 0x3e}, 0x0)


compat_50_nanosleep(&(0x7f0000000c40)={0x0, 0x40420f00}, 0x0)
fdatasync(0xffffffffffffffff)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = semget$private(0x0, 0x1, 0x488)
compat_40_mount(&(0x7f0000000000)='overlay\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000000)='./file0\x00', 0x8000, 0x0)
__getfh30(&(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000000c0)=0x6c3)
pipe(&(0x7f0000001400))
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000340)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmmsg(r1, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)
compat_40_mount(&(0x7f0000000080)='tmpfs\x00', 0x0, 0x0, 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
unlinkat(0xffffffffffffff9c, 0x0, 0x0)
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
r2 = open$dir(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
fchown(r2, 0x0, 0xffffffffffffffff)
chmod(&(0x7f00000001c0)='./file0\x00', 0x0)
semget$private(0x0, 0x4, 0x4a0)
compat_50_____semctl13$IPC_SET(r0, 0x0, 0x1, &(0x7f0000000080)=@buf=&(0x7f0000000040)={{0x0, 0x0, 0x1, 0x1, 0x20, 0x0, 0x42}, 0x5, 0x10000000000000, 0x8, &(0x7f0000000000)={0x5, 0x6, 0x9}})


r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x200000000000c, &(0x7f0000000200)="ea00000000000000", 0x8)
__mount50(&(0x7f0000000080)='mfs\x00', &(0x7f0000000000)='.\x00', 0x0, 0x0, 0x0)


mknod(&(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000001380)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
link(&(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
mkdir(&(0x7f0000000000)='./file1\x00', 0x0)
mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x0)
rename(&(0x7f0000000340)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000480)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f00000006c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000580)='./file0\x00')
rename(&(0x7f0000000a40)='./file0\x00', &(0x7f00000007c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


r0 = _lwp_self()
_lwp_suspend(r0)
_lwp_wakeup(r0)
_lwp_continue(r0)


lchmod(&(0x7f0000000040)='./file0\x00', 0x0)
sendmsg$unix(0xffffffffffffffff, 0x0, 0x409)
r0 = socket(0x18, 0x3, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
recvmsg(0xffffffffffffffff, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000580), 0x0, 0x0}, 0x0)
socket$unix(0x1, 0x0, 0x0)
ioctl$FIONREAD(0xffffffffffffffff, 0x80206979, 0x0)
pipe(&(0x7f00000001c0)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
compat_43_osend(r1, &(0x7f0000000200)="bdb94a1c3b83eaf58760df8a82f5819259817caecb3e14248674c4fa5937ed2703ea6d7eae50da19e37e0ffdfff99b64d9ba1454d61431a6a9e1f0b9d0bef14ac54e2f3a1e405f8db1f21410cb78eefdcfc33116a4124c273fcc9edf030df06fe846fba6850c513c78db6ec7e04a37100c1fa30cab40568761ad7fbd1ab9bab6a564bca4646c3476f5c0f675a5f02832359a95", 0x93, 0xa)
fchdir(r2)
execve(0x0, 0x0, 0x0)
getsockname$inet(r0, &(0x7f00000000c0), &(0x7f0000000000)=0xffffffffffffff35)
r3 = socket(0x18, 0x2, 0x0)
_ksem_open(&(0x7f0000000000), 0x1a00, 0x0, 0x0, &(0x7f0000001000))
r4 = socket(0x18, 0x2, 0x0)
connect$unix(r3, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
socket(0x0, 0x0, 0x0)
setsockopt(0xffffffffffffffff, 0x0, 0x0, &(0x7f0000000000), 0x0)
r5 = dup2(r3, r4)
setsockopt(r5, 0x1000000029, 0x23, &(0x7f00000000c0)="b211d7170d816684c8e360f2fa41c1a0946988b272d2dd3dc90142a84231a746e337b372e93320cff6669cbe7868de45ed3fc33719ca6df71ecec8a918458b2c10a1f8c66653b276e7aae9cb9b21f9982230f575295d48889c9a920796b2dd92fc8575680b37ba955d2c15e6d7c9198ed900ab006ddfb67869b51a2216114d1ece85f593e74035f5bc054eb1dbddf42a", 0x90)
write(r5, 0x0, 0x0)


r0 = open(&(0x7f00000001c0)='./file0\x00', 0x6aaaf9f55adc2226, 0x0)
pwritev(0xffffffffffffffff, 0x0, 0x0, 0x0)
fcntl$lock(r0, 0x7, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x300000001})


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
open$dir(0x0, 0x0, 0x0)
fchownat(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)


recvmmsg(0xffffffffffffffff, 0x0, 0x0, 0x0, &(0x7f0000000780)={0x0, 0x3907f893})


modctl$MODCTL_LOAD(0x0, &(0x7f00000000c0)={0x0})


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
__nanosleep50(0x0, 0x0)
compat_43_stat43(0x0, 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
compat_50_select(0x40, &(0x7f0000000280)={0xdc}, 0x0, 0x0, 0x0)
ptrace(0x0, 0x0, 0x0, 0x0)
shutdown(r0, 0x1)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f00000003c0)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_40_mount(&(0x7f0000000380)='ntfs\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
__posix_fadvise50(r0, 0x0, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
fcntl$dupfd(r0, 0x0, r0)
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
truncate(&(0x7f0000000200)='./file0\x00', 0x0, 0x7fff)
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__utimes50(&(0x7f0000000080)='./file1/file0/../file0\x00', 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)
setpgid(0x0, 0x0)
getpriority(0x0, 0x0)
truncate(&(0x7f0000000440)='./file0\x00', 0x0, 0x3e2b649f)


modctl$MODCTL_UNLOAD(0x2, 0x0)
swapctl$SWAP_GETDUMPDEV(0x8, 0xffffffffffffffff, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
ptrace(0x16, r0, 0x0, 0x0)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x10000000000001}})
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x9f9d})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
close(r0)
r1 = socket(0x18, 0x1, 0x0)
setsockopt(r1, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


lchflags(0x0, 0x0)
compat_50_setitimer(0x3, &(0x7f0000001800)={{}, {0x1}}, 0x0)
__setitimer50(0x3, 0x0, &(0x7f0000001940))


socket(0x0, 0x0, 0x0)
r0 = msgget$private(0x0, 0x0)
msgrcv(r0, 0x0, 0x0, 0x2, 0x1000)
getpid()
msgctl$IPC_SET(0x0, 0x11, &(0x7f0000000180)={{}, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3})
pipe(&(0x7f0000000340)={<r1=>0xffffffffffffffff})
chmod(0x0, 0x200)
mkdirat(0xffffffffffffffff, 0x0, 0x0)
open$dir(0x0, 0x0, 0x0)
fktrace(r1, 0x0, 0x0, 0x0)
socket$inet(0x2, 0x2, 0x0)
fktrace(0xffffffffffffffff, 0x0, 0x2, 0x0)
__nanosleep50(&(0x7f0000000000)={0x8, 0x5}, 0x0)
setpriority(0x0, 0x1, 0x0)
setpriority(0x2, 0x0, 0x0)


ptrace(0x28, 0x0, 0x0, 0x0)


r0 = open(&(0x7f0000000200)='./file0\x00', 0x245, 0x0)
fcntl$lock(r0, 0x8, &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x1000300010008})
open(&(0x7f0000000040)='./file0\x00', 0x10, 0x0)
r1 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r1, 0x9, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})
acct(0x0)
r2 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r2, 0x9, &(0x7f0000000140)={0x0, 0x6, 0x0, 0x10000000a})


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
munmap(&(0x7f0000ffc000/0x4000)=nil, 0x4000)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x4090426b, 0x0)
fcntl$getown(r0, 0x5)


mincore(&(0x7f0000ffb000/0x2000)=nil, 0x2000, 0xffffffffffffffff)


recvmmsg(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
semget(0x1, 0x1, 0x242)
setuid(0xee00)
semget(0x1, 0x0, 0xf0)


r0 = getpgrp()
getpgid(r0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
setreuid(0x0, 0xee00)
ioctl$FIOASYNC(r0, 0x80017472, &(0x7f00000001c0))


compat_43_osetrlimit(0x0, &(0x7f0000000080))
__clone(0x0, &(0x7f0000000000))
__wait450(0x0, 0x0, 0x1e, 0x0)


r0 = socket(0x2, 0x3, 0x0)
connect$inet(r0, &(0x7f00000000c0)={0x2, 0x0}, 0x10)
compat_43_ogetpeername(r0, 0x0, 0x0)


mknod(&(0x7f0000000000)='./file1\x00', 0xe015, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
lchown(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
__lstat50(&(0x7f0000000400)='./file1\x00', 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_43_orecvmsg(0xffffffffffffffff, &(0x7f0000001680)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x0)


r0 = socket$inet(0x2, 0x4000000000000001, 0x0)
setsockopt$sock_linger(r0, 0xffff, 0x80, &(0x7f0000000300)={0x0, 0x1600000}, 0x8)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000dc0)='./file0\x00', r0, &(0x7f0000000ec0)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000980)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000a80)='./file0\x00')


fcntl$lock(0xffffffffffffffff, 0x0, 0x0)
pipe(&(0x7f0000000140)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
write(r1, &(0x7f0000000200)="c7", 0x1)
write(r0, &(0x7f0000000340), 0xd4e688a67930cd)
close(r0)
write(r1, &(0x7f0000000040), 0xfeea)
poll(&(0x7f00000000c0)=[{r1, 0x4}], 0x1, 0x0)
execve(0x0, 0x0, 0x0)
socket(0x18, 0x1, 0x0)
chmod(&(0x7f0000000080)='./file0\x00', 0x1f3)
setreuid(0xee00, 0x0)
getuid()
getuid()


__mount50(&(0x7f0000000000)='overlay\x00', 0x0, 0x0, 0x0, 0x0)
ioctl$FIOASYNC(0xffffffffffffffff, 0x20003101, 0x0)
r0 = socket$inet6(0x18, 0x3, 0x0)
sendto$inet6(r0, &(0x7f0000000000)="87", 0x358, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
getsid(r0)


r0 = socket$inet(0x2, 0x2, 0x0)
r1 = socket$inet(0x2, 0x2, 0x0)
dup2(r1, r0)
setsockopt$inet_opts(r1, 0x0, 0x200000000000a, &(0x7f0000000000)="ea00005c00000000", 0x1)
setsockopt$inet_opts(r0, 0x0, 0x200000000000c, &(0x7f00000000c0)="e900000100200000", 0x8)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
r2 = dup3(r1, r0, 0x0)
read(r2, &(0x7f0000001500)=""/4096, 0x1000)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7fb}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
lseek(r0, 0x0, 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000029, 0x9, 0x0, 0x0)


rename(&(0x7f0000000240)='.\x00', &(0x7f00000006c0)='./file0\x00')


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x29b3)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
pathconf(&(0x7f0000000240)='./file0\x00', 0xa)


bind$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x0, 0x0, 0x1}, 0x8)
setrlimit(0x0, &(0x7f0000000180)={0xb66c})
r0 = socket(0x2, 0x2, 0x0)
ioctl$FIONREAD(r0, 0xc020699e, &(0x7f00000001c0))
open(0x0, 0x0, 0x0)
r1 = msgget$private(0x0, 0x0)
msgsnd(r1, &(0x7f00000002c0)=ANY=[@ANYBLOB="40c20fc9a3b0452d625bd9d09205d0c8d98e04952f0cc998dc0305b9fd73518c65822d7bfd726151c80a4f96205c5f02c5ec7e571dd569b866aa399558a0d7c214e49db981eb8e2d06e27941afaa6ef547965b25fbb91fed9f11d9995a650e03494353b42e030f3ef3d61a7e272b312b9a462af7f829fab27a0d69bd10", @ANYRESOCT=r0, @ANYRES64=r0], 0x401, 0x0)
msgsnd(r1, &(0x7f0000001500)=ANY=[@ANYRESDEC=r0, @ANYBLOB], 0x401, 0x800)


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x24, &(0x7f0000000000)="5ab7776a", 0x4)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x800, &(0x7f0000000180)=0x2, 0x4)
mknod(&(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
link(&(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000340)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000480)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f00000006c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000580)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
unlink(&(0x7f0000000a80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
open$dir(&(0x7f00000026c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x0)
rename(&(0x7f00000007c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000a40)='./file0\x00')


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)='@', 0x1)
r0 = open(&(0x7f0000000400)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x8, &(0x7f0000000140)={0x0, 0x0, 0x0, 0x269000000, 0xffffffffffffffff})


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
chmod(&(0x7f0000000180)='./file0\x00', 0x23f)
setuid(0xee01)
renameat(0xffffffffffffff9c, &(0x7f0000000600)='./file0\x00', 0xffffffffffffffff, &(0x7f0000000240)='./file0\x00')


open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000340)='./file0\x00', 0x0, 0xf, r0)
unlink(&(0x7f0000000000)='./file0\x00')
compat_20_statfs(0x0, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
_ksem_getvalue(0x0, 0x0)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mknod(&(0x7f0000000000)='./file1\x00', 0xe015, 0x0)
__utimes50(&(0x7f00000001c0)='./file1\x00', 0x0)
open(&(0x7f00000000c0)='./file1\x00', 0x200, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
r0 = open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000dc0)='./file0\x00', r0, &(0x7f0000000ec0)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000980)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000a80)='./file0\x00')


r0 = socket(0x10, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8090690c, &(0x7f0000000180)=0x8000000000000032)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x4010427b, &(0x7f0000000000))


compat_43_osetrlimit(0x9, &(0x7f0000000080))
socket(0x1f, 0x1, 0x0)


mknod(&(0x7f0000000180)='./file0\x00', 0x2000, 0x202)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000100)=[{&(0x7f0000000000)='#', 0x1}], 0x1)


setreuid(0x0, 0xee01)
r0 = socket(0x800000018, 0x1, 0x0)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
mknodat(0xffffffffffffff9c, &(0x7f00000000c0)='./file1/file2\x00', 0x81c0, 0x0)
linkat(0xffffffffffffff9c, &(0x7f0000000380)='./file1/file2\x00', 0xffffffffffffff9c, &(0x7f00000003c0)='./file0/file2\x00', 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000040)='./file0\x00', 0x0, 0x4, r0)
compat_90_fstatvfs1(0xffffffffffffffff, 0x0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
_lwp_setname(0x0, &(0x7f0000000000)='$&,\x00')
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
ioctl$FIOASYNC(r0, 0x80047410, &(0x7f0000000000))


compat_40_mount(&(0x7f0000000200)='ptyfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000a80))
__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pathconf(&(0x7f0000000040)='./file0\x00', 0x3)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
chroot(&(0x7f0000000000)='.\x00')
rmdir(&(0x7f0000000180)='./file0/../file0\x00')


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000400)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
rename(&(0x7f00000003c0)='./file0\x00', &(0x7f0000000380)='./file1/file0\x00')


r0 = socket(0x2, 0x3, 0x0)
connect$inet(r0, &(0x7f00000000c0)={0x2, 0x0}, 0x10)
writev(r0, &(0x7f0000000300)=[{&(0x7f0000000040)="c496c863ef12e77a9ccf614d1763257a55d04812680ee039b990a455278672a838a612281e111f5f5ef488813b3ff2fe2520ffbf202a9552896725358d27749d397e155fef0b1cd1331b6ea0c89d43688c6d8993821800838b3c1d54a9702246caf61fc44e3ce8a88ca004c13d0e33166ed5bac0d31f579b0dfa", 0x7a}, {&(0x7f0000000140)="d59188f42cd003966eb295246b5e29df153f289cb1fc655990890e403e016792e2b421982b92b0b7329370bac09e2939a19a5026368289acd5666737f19fa068eb99dcd0836d13beabce2e5e40f6e646278b233bde88b5f0e28c580f6cfdcea4b39f46006fe0a50dd929a13d0d0f072fa89b9f44a48de7f85d7c0944b7a95e33", 0x80}, {&(0x7f00000001c0)="4fce57e7e07ed72795163767d9aa14d506586440d3511060aba5ab965f49", 0x1e}, {&(0x7f0000000200)="30c13d7918ad802c428a19564ecbdc86e234b5bb22b001d653a51f13856ef0f6871ff2875f6db0a7c50eadced0b7f9483d925341aa6f5b2c9f1098e9f94993ff69a7c410e913050f4a14a6cd717cf588b3", 0x51}], 0x4)


open(&(0x7f0000001180)='./file1\x00', 0x615, 0x0)
ktrace(&(0x7f0000000240)='./file1\x00', 0x4, 0x121a, 0x0)
getuid()


__lutimes50(0x0, &(0x7f0000000040)={0x0, 0x7fffffffffffffff})


mmap(&(0x7f0000000000/0x400000)=nil, 0x400000, 0x3, 0x5012, 0xffffffffffffffff, 0x0, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x200, 0x0)
open$dir(0x0, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f0000000100)={0x0, 0x0, <r1=>0x0}, &(0x7f0000000140)=0xc)
setregid(0x0, r1)
getpid()
setregid(0x0, 0x0)
ktrace(&(0x7f0000000000)='./file0\x00', 0x4, 0x1100, 0xffffffffffffffff)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40046678, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$sock_timeval(r0, 0xffff, 0x1001, 0x0, 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt$inet_opts(r0, 0x6, 0x1, 0x0, 0x0)
setreuid(0xee00, 0x0)
getuid()
setreuid(0x0, 0x0)
semget$private(0x0, 0x0, 0x0)
getgid()
socket$inet(0x2, 0x2, 0x0)
semctl$IPC_SET(0x0, 0x0, 0x1, 0x0)
setreuid(0xee00, 0x0)
r1 = msgget$private(0x0, 0x0)
msgsnd(r1, &(0x7f0000000040)=ANY=[@ANYRESHEX, @ANYRESDEC], 0x0, 0x0)
msgrcv(r1, &(0x7f0000000680), 0xd4, 0x0, 0x1000)


r0 = socket$unix(0x1, 0x1, 0x0)
close(r0)
dup(r0)


munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40047477, &(0x7f0000000000))


compat_50___stat30(&(0x7f0000002f40)='./file0\x00', 0x0)


open$dir(0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
__getvfsstat90(0x0, 0x0, 0x0)
r0 = socket(0x18, 0x3, 0x0)
getsockname$inet(r0, &(0x7f00000000c0), 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x18289, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x40000530, r0)
recvmmsg(0xffffffffffffffff, &(0x7f0000000080)={&(0x7f0000000100)={&(0x7f00000006c0), 0x213, 0x0, 0x0, 0x0}}, 0x10, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
recvmmsg(0xffffffffffffffff, &(0x7f0000000700)={&(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x3f8d}, 0x10, 0x0, 0x0)
sendmmsg(r2, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)
setsockopt$sock_timeval(r2, 0xffff, 0x1005, &(0x7f0000000000)={0x0, 0x8}, 0x10)
close(r2)
recvfrom$unix(r1, 0x0, 0x0, 0x0, 0x0, 0x0)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000000)='.\x00', 0x0, 0x0)
chdir(&(0x7f0000000140)='./file0\x00')
unlinkat(r0, &(0x7f0000000280)='./file0\x00', 0x200)
mkdirat(0xffffffffffffff9c, &(0x7f0000000080)='./file0/file0\x00', 0x0)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
fcntl$setstatus(r0, 0x4, 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt$inet_opts(r0, 0x6, 0x7, 0x0, 0x0)


open(&(0x7f0000000000)='./file0\x00', 0x80000000000206, 0x0)
swapctl$SWAP_ON(0x1, &(0x7f0000000000), 0x0)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
open(0x0, 0x0, 0x0)
faccessat(0xffffffffffffffff, 0x0, 0x0, 0x0)
setreuid(0xffffffffffffffff, 0xee00)
unmount(&(0x7f0000000140)='./file0\x00', 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
_ksem_init(0x0, &(0x7f0000000000))
symlink(&(0x7f0000000080)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)
writev(r0, &(0x7f0000003240)=[{0x0}], 0x1)


r0 = socket(0x1f, 0x5, 0x2)
listen(r0, 0x0)
listen(r0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8090698e, &(0x7f0000000180)=0x8000000000000032)


r0 = socket(0x1f, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8020690c, &(0x7f0000000180)=0x8000000000000032)


__select50(0x0, 0x0, 0xfffffffffffffffe, 0x0, 0x0)


pwritev(0xffffffffffffffff, &(0x7f0000001140)=[{&(0x7f0000000080)="cbf391d90a0dd110ca", 0x9}], 0x1, 0x0)
r0 = socket(0x18, 0x2, 0x0)
close(r0)
r1 = socket(0x800000018, 0x3, 0x0)
setsockopt$sock_int(r1, 0xffff, 0x1000, &(0x7f0000000000)=0x7, 0x4)
bind$unix(r1, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r0, &(0x7f0000000280)={0x0, 0x0, 0x0, 0x0, 0x0, 0x10}, 0x0)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0106978, &(0x7f0000000180))
swapctl$SWAP_ON(0x7, 0x0, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x1803)
r0 = open$dir(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
fchmodat(r0, 0x0, 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x29, 0x16, 0x0, 0x0)


madvise(&(0x7f0000ffa000/0x4000)=nil, 0x4000, 0x0)
mlock(&(0x7f0000ffd000/0x2000)=nil, 0x2000)
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0x0, 0x1810, 0xffffffffffffffff, 0x0, 0x0)
mlock(&(0x7f0000ff5000/0x3000)=nil, 0x3000)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
chroot(&(0x7f0000000000)='./file0\x00')
r0 = getsid(0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
ptrace(0x9, r0, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0xd, r0, 0x0, 0x8)


r0 = compat_30_socket(0x22, 0x3, 0x0)
ioctl$FIOASYNC(0xffffffffffffffff, 0x80047462, 0x0)
__fstat50(r0, &(0x7f0000000100))


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
pathconf(&(0x7f0000000080)='./file0\x00', 0xb)


compat_40_mount(&(0x7f0000000200)='procfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
compat_30_getdents(r0, 0x0, 0x0)


poll(&(0x7f0000000040)=[{}], 0x1, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000000029, 0xa, &(0x7f0000000040)="03000000", 0x4)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
socket(0x0, 0x0, 0x0)
mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
rename(0x0, 0x0)
rename(&(0x7f0000000a40)='./file0\x00', &(0x7f00000007c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


sendto$unix(0xffffffffffffffff, &(0x7f0000000040)="99b346a67e66edb3ccc214fd42e34f0fbc91fa36653ed4e57dd005be0d82e9468dd59768441c2ca481eb3dbac1821505867e7fd39118c35c2f1688bfb56b0d706d0ac5485828f6ffcfa12ddb8e0cf00ee97d83ef8264ff6937b518e9842d4207f2489426413ed9aafd1f809d404a1ad7ea95611ebe1d5b81ec78010a9d34002c2651cc03e14f52f2efdb13ea9a4e5acdaf1b7509a35ef992b0145cb20acae755dfc73ef263b1c30519cf37048d3ae8602ce4c0a5ea5fb54eced662ee55b8325a0702404db7a81a2d56b758db11b32fbdf76f0f696c827a7f7dcc13d6ebb4a3c207a8d3a428dcc6727968046d290c152a5ff99467ce74ccc26838bfc6de24f3daa27573bdaac51f2bfc2353d5e03fe631a7c11599612e48ea6a31ab62469a7a2b12e0bfd73eab8e4f10c704553088e4ff77ae25a9901f14ee63793b83a7350b1dd419ccd43d17ae6cb06d4ff8c38fe9110bed0fdba41f2d0f8a", 0x159, 0x0, 0x0, 0x0)
posix_spawn(0x0, 0x0, &(0x7f00000001c0)={0x0, 0x4, &(0x7f0000000180)=@open={0x0, 0xffffffffffffffff, {&(0x7f0000000140)='\x00', 0x0, 0x7}}}, 0x0, 0x0, 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(r1, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="28000000ffff000001"], 0x28}, 0x0)
recvmsg(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000640)=[{&(0x7f00000001c0)=""/23, 0x17}], 0x1}, 0x42)
recvmmsg(r0, &(0x7f0000000040), 0x10, 0x0, 0x0)
r2 = socket$inet(0x2, 0x2, 0x0)
dup2(r2, r1)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
r1 = dup(r0)
lseek(r1, 0x0, 0x0, 0x2)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
writev(r0, &(0x7f0000000000)=[{&(0x7f0000001300)="d2", 0x1}], 0x1)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
fchownat(0xffffffffffffff9c, 0x0, 0x0, 0x0, 0x0)


write(0xffffffffffffffff, &(0x7f0000000140)="4e8f8cdc90bf00ba12aaa92982b8b4b7630aa9bd5db9dab4749648139ce6d31e8c0219fddb943501650e4f653434f29e824bfaf628632cc6628d8cf30ae00a546ca3b4d1584dcb4de2c59dfa86a50eadde287b4643dc1052ab5d03c4cab84ff29acc3ec822", 0x65)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc048696d, &(0x7f0000000180)=0x8000000000000032)


socketpair$unix(0x1, 0x0, 0x0, &(0x7f0000000140))
r0 = socket(0x2, 0x4001, 0x0)
r1 = dup(r0)
fcntl$dupfd(r1, 0x2, 0xffffffffffffffff)
shmctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0x0, 0x0, 0x0, 0xc4}, 0x0, 0x0, 0x0, 0xffffffffffffffff})
sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
r2 = socket(0x18, 0x2, 0x0)
connect$unix(r2, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)


getpriority(0x0, 0x0)
r0 = open$dir(&(0x7f0000000280)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
r1 = socket$unix(0x1, 0x5, 0x0)
compat_43_ogetsockname(r1, &(0x7f0000000080)=""/55, 0x0)
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', 0x0, 0x0, &(0x7f00000002c0)="01")
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
r2 = getpgrp()
ptrace(0x4, r2, &(0x7f0000000180), 0x0)
rename(&(0x7f0000000100)='./file0\x00', &(0x7f0000000200)='./file1\x00')
symlinkat(&(0x7f00000000c0)='./file0\x00', r0, 0x0)
compat_50___lstat30(&(0x7f0000000140)='./file0\x00', 0x0)
open$dir(0x0, 0x0, 0x0)
r3 = msgget$private(0x0, 0x0)
shmctl$IPC_RMID(0xffffffffffffffff, 0x0)
semctl$IPC_RMID(r3, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4301)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0x20007703, 0x0)


munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
minherit(&(0x7f0000fff000/0x1000)=nil, 0x1000, 0x0)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
open(0x0, 0x0, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x144, r0)
mknod(0x0, 0x0, 0x5300)
open(0x0, 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(0xffffffffffffffff, 0xc0145002, 0x0)
swapctl$SWAP_CTL(0x5, 0x0, 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__utimes50(&(0x7f0000000100)='./file0\x00', 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
setsockopt$sock_timeval(0xffffffffffffffff, 0xffff, 0x0, 0x0, 0x0)
link(&(0x7f0000001240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
rename(&(0x7f0000000300)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000200)='./file0\x00')


r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0906911, &(0x7f0000000180)=0x8000000000000032)


__getitimer50(0x6, 0x0)


open(&(0x7f0000000180)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000240)='./file0\x00', 0x4, 0x80a, 0x0)
__clock_settime50(0x0, 0x0)
compat_50_clock_settime(0x0, 0x0)


shmget(0x2, 0x3000, 0x0, &(0x7f0000ffc000/0x3000)=nil)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
r1 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
r2 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r2, 0x9, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})
flock(r2, 0x2)
r3 = open(&(0x7f0000000040)='./file0\x00', 0x205, 0x0)
fcntl$lock(r3, 0x8, &(0x7f0000000000)={0x4, 0x0, 0x0, 0x1000300010008, 0xffffffffffffffff})
dup3(r1, r0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000040)='./bus\x00')
__stat50(&(0x7f0000000100)='./bus\x00', &(0x7f0000000200)={<r0=>0x0})
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
mknod(&(0x7f0000000140)='./file0\x00', 0x2000, r0)
r1 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(r1, &(0x7f0000000080)=[{0x0}], 0x1, 0x0)


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x1b, &(0x7f0000000000)="5ab7776a", 0x4)


socketpair(0x0, 0x0, 0x2, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000140), 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
rename(&(0x7f00000006c0)='./bus\x00', &(0x7f0000000580)='./file0\x00')


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x4ebfac6bbaf7949)
__mount50(&(0x7f00000002c0)='efs\x00', &(0x7f0000000080)='.\x00', 0x0, &(0x7f0000000000), 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000280)=[{&(0x7f0000000000)='#!', 0x2}], 0x1)
writev(r0, &(0x7f00000002c0)=[{&(0x7f0000000080)="dccd5872e57ab75ac39f155c97fc8c20200a", 0x12}], 0x100000000000030a)
_ksem_init(0x0, &(0x7f00000006c0)=0x50535244)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000007c0))
fktrace(r0, 0x2, 0x2, 0x0)


mknod(&(0x7f0000000080)='./bus\x00', 0x2000, 0x6d4)
r0 = open(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80047470, &(0x7f00000000c0))


open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x5, 0x10, r0, 0x0, 0x0)
syz_usb_connect$printer(0x0, 0x2d, &(0x7f0000000240)={{0x12, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x525, 0xa4a8, 0x40, 0x1, 0x2, 0x3, 0x1, [{{0x9, 0x2, 0x1b, 0x1}}]}}, 0x0)


r0 = socket$inet6(0x18, 0x3, 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
sendto$inet6(r0, 0x0, 0x0, 0x0, &(0x7f0000000040)={0x18, 0x3}, 0xc)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40045265, &(0x7f0000000040))


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000005c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
setreuid(0x0, 0xee01)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x2, 0x0)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x1, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket(0x18, 0x1, 0x0)
setsockopt(r1, 0x0, 0x2, &(0x7f0000000140)="03", 0x1)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000300)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x0)


close(0xffffffffffffffff)
open$dir(&(0x7f0000000b80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x200, 0x0)
r0 = open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000300)='./file0\x00', r0, &(0x7f0000000c80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
open$dir(&(0x7f00000001c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x200, 0x0)
symlinkat(&(0x7f0000000dc0)='./file0\x00', r0, &(0x7f0000000ec0)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
mkdirat(r0, &(0x7f00000011c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0xb096c9c41c57ab56)
rename(&(0x7f0000000980)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000a80)='./file0\x00')
unlink(0x0)
unlink(&(0x7f0000000080)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000fc0)='./file0\x00', &(0x7f00000010c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


madvise(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
mmap(&(0x7f0000ffa000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)
munmap(&(0x7f0000ffc000/0x4000)=nil, 0x4000)


setrlimit(0x8, &(0x7f0000000980))
_ksem_init(0x0, &(0x7f00000006c0)=0x50535244)


modctl$MODCTL_LOAD(0x0, &(0x7f00000000c0)={&(0x7f0000000040), 0x0, &(0x7f0000000180)="3ceec470e2ff087e1d99d46843fd6596a8be197eb609", 0x16})


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
munmap(&(0x7f0000000000/0x1000)=nil, 0x7f7fffffc000)


getpriority(0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = msgget$private(0x0, 0x0)
semctl$IPC_RMID(r0, 0x3, 0x0)


pipe(&(0x7f0000001400)={<r0=>0xffffffffffffffff})
fchownat(r0, 0x0, 0x0, 0x0, 0x0)


r0 = socket(0x2, 0x4001, 0x0)
r1 = dup(r0)
r2 = fcntl$dupfd(r1, 0x2, 0xffffffffffffffff)
close(r2)
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x2b}})
r3 = socket(0x18, 0x2, 0x0)
connect$unix(r3, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r3, &(0x7f00000000c0), &(0x7f0000000380)=0xffffffffffffff24)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
r4 = socket(0x18, 0x1, 0x0)
dup2(r3, r4)
connect$unix(r4, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
sendmsg$unix(r4, &(0x7f0000000080)={0x0, 0x0, 0x0}, 0x0)
setsockopt$sock_int(r2, 0xffff, 0x1023, 0x0, 0x0)
recvfrom(r3, &(0x7f0000000140)=""/107, 0x6b, 0x40, 0x0, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x12, r0, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
shmget$private(0x0, 0x4000, 0x0, &(0x7f000055b000/0x4000)=nil)
r0 = socket$inet(0x2, 0x2, 0x0)
listen(r0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000140)='./file0\x00', 0x0)


r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
mknod(&(0x7f0000000080)='./file0\x00', 0x2000, 0x0)
unlinkat(r0, &(0x7f00000000c0)='./file0\x00', 0x800)


setrlimit(0x3, &(0x7f0000000180)={0xb66c, 0x100000})
setrlimit(0x3, &(0x7f0000000040)={0x100000, 0x100000})


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)


modctl$MODCTL_UNLOAD(0x2, 0x0)
munmap(&(0x7f0000fec000/0x14000)=nil, 0x14000)
r0 = shmget$private(0x0, 0x4000, 0x0, &(0x7f000055b000/0x4000)=nil)
r1 = shmat(r0, &(0x7f0000ff5000/0x4000)=nil, 0x0)
shmctl$IPC_RMID(r0, 0x0)
mlock(&(0x7f0000ff6000/0x2000)=nil, 0x2000)
shmdt(r1)


poll(&(0x7f0000000040)=[{}], 0x1, 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000000029, 0xa, &(0x7f0000000040)="03000000", 0x4)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__clock_settime50(0x3, &(0x7f0000002c00))


r0 = socket$inet6(0x18, 0x3, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x1, &(0x7f0000000000), &(0x7f0000000040)=0x4)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
__stat50(0x0, 0x0)
compat_20_statfs(&(0x7f0000000700)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)
rename(&(0x7f0000000300)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000200)='./file0\x00')


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x142, r0)
socketpair(0x11, 0x0, 0x0, 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
r1 = socket$inet(0x2, 0x2, 0x0)
recvfrom$inet(r1, 0x0, 0x0, 0x0, 0x0, 0x0)


mkdir(&(0x7f0000000000)='./control\x00', 0x0)
r0 = open(&(0x7f0000022ff6)='./control\x00', 0x0, 0x0)
pwrite(r0, 0x0, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = socket(0x1f, 0x3, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
__fstat50(r0, &(0x7f0000000300))


setreuid(0x0, 0xee01)
compat_43_osethostname(0x0, 0x0)
r0 = compat_30_socket(0x22, 0x30000003, 0x0)
compat_43_osend(r0, &(0x7f0000000040)="00060409", 0x600, 0x0)


mmap(&(0x7f0000000000/0xff5000)=nil, 0xff5000, 0x0, 0x200000004d831, 0xffffffffffffffff, 0x0, 0x0)
semop(0x0, &(0x7f0000000140)=[{}], 0x1)


mkdir(&(0x7f0000000300)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
r0 = open$dir(&(0x7f0000000080)='.\x00', 0x0, 0x0)
faccessat(r0, &(0x7f00000000c0)='./file0\x00', 0x0, 0x100)


ptrace(0x25, 0x0, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80104307, &(0x7f00000001c0))


_lwp_wakeup(0x0)
shutdown(0xffffffffffffffff, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
writev(r0, &(0x7f00000008c0)=[{&(0x7f0000000000)='C', 0x1}], 0x1)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x3, 0x10, r0, 0x0, 0x0)
r1 = socket$inet(0x2, 0x2, 0x0)
accept(0xffffffffffffffff, 0x0, &(0x7f00000000c0))
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x6)
ioctl$FIONREAD(r1, 0xc0106924, &(0x7f0000000080))


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
compat_43_ommap(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x0, 0x10, r0, 0x0, 0x0)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)


setreuid(0xee00, 0x0)
sendmsg(0xffffffffffffffff, 0x0, 0x0)
getgroups(0x1, &(0x7f0000000080)=[0x0])
setsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x1022, 0x0, 0x0)
setegid(0xffffffffffffffff)
r0 = getegid()
setgid(r0)
msgrcv(0x0, 0x0, 0x0, 0x0, 0x1000)
msgsnd(0x0, &(0x7f0000000000)=ANY=[@ANYRESHEX], 0x401, 0x0)
r1 = getuid()
setreuid(0xee00, r1)
ktrace(0x0, 0x5, 0x40000424, 0xffffffffffffffff)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
close(0xffffffffffffffff)
socket(0x18, 0x1, 0x0)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
symlink(&(0x7f0000001640)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/../file0\x00', &(0x7f0000000e40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000980)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/../file0\x00', 0x0)


semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000000)={{0x0, 0xffffffffffffffff}, 0x0, 0x0, 0x1})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x0, 0xb, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)


open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x0, 0x10, r0, 0x0, 0x0)
open(0x0, 0x0, 0x0)
minherit(&(0x7f0000003000/0x2000)=nil, 0x2000, 0x0)


chmod(&(0x7f0000000080)='./file0\x00', 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})
r1 = open(&(0x7f0000000040)='./file0\x00', 0x205, 0x0)
fcntl$lock(r1, 0x8, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x1000300010008, 0xffffffffffffffff})
mkdir(0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)


r0 = open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
ioctl$OFIOGETBMAP(r0, 0xc004667a, &(0x7f00000001c0)=0x80000d84)


r0 = socket$inet6(0x18, 0x3, 0x0)
sendto$inet6(r0, &(0x7f0000000000)="87", 0x358, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


mknod(&(0x7f0000000400)='./file0\x00', 0x2000, 0x287e)
acct(&(0x7f0000000080)='./file0\x00')


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_select(0x190, 0x0, 0x0, &(0x7f00000000c0), &(0x7f0000000100))


symlink(&(0x7f0000000080)='.\x00', 0x0)
compat_40_mount(&(0x7f0000000200)='ptyfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000a80))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
r0 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
fchdir(r0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
semctl$IPC_SET(0x0, 0x0, 0x1, 0x0)


__getrusage50(0x748c23bba96db813, 0x0)


symlink(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/../file0\x00', 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
ioctl$FIONREAD(r0, 0xc0106926, &(0x7f0000000040))


r0 = socket$inet(0x2, 0x2, 0x0)
modctl$MODCTL_LOAD(0x5, 0x0)
symlink(&(0x7f0000000300)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
open(&(0x7f0000000400)='.\x00', 0x0, 0x0)
fcntl$lock(r0, 0xa, 0x0)


r0 = socket(0x18, 0x2, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
connect$unix(r0, &(0x7f0000000100)=@abs={0x0, 0x0, 0x3}, 0x8)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
close(r0)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
chroot(&(0x7f0000000000)='.\x00')


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
link(&(0x7f0000000940)='./file0\x00', &(0x7f0000000d40)='./bus\x00')


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
access(&(0x7f0000000000)='./file0\x00', 0x1)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
open$dir(&(0x7f0000000000)='./file0\x00', 0x40000400001803c1, 0x0)
link(&(0x7f0000000940)='./file0\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0x2e, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


r0 = socket(0x2, 0x3, 0x0)
connect$inet(r0, &(0x7f00000000c0)={0x2, 0x0}, 0x10)
getpeername(r0, &(0x7f0000000000), &(0x7f0000000100)=0xe)


r0 = socket(0x1, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8020690c, &(0x7f0000000180)=0x8000000000000032)


__mount50(&(0x7f00000002c0)='overlay\x00', 0x0, 0x0, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvmsg(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000640)=[{&(0x7f00000001c0)=""/23, 0x17}], 0x1, 0x0}, 0x42)
recvmmsg(r0, &(0x7f0000000040)={0x0}, 0x10, 0x0, 0x0)
r2 = socket$inet(0x2, 0x2, 0x0)
dup2(r2, r1)
lchown(0x0, 0xffffffffffffffff, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
modctl$MODCTL_UNLOAD(0x4, &(0x7f0000000040))


mknod(&(0x7f0000000140)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x4090426b, 0x0)
r1 = msgget$private(0x0, 0x0)
msgrcv(r1, &(0x7f0000000940), 0xcc, 0x2, 0x0)
msgsnd(r1, &(0x7f0000000440)={0x3, "a486714b3b6964c6224c6d3e16e0d3d8edebe56ff5cc0190d7f39c044dac99fec5afca3ec3e155903698d635e2ab348195cce43ab9e134935e"}, 0x41, 0x800)


mmap(&(0x7f0000000000/0xff5000)=nil, 0xff5000, 0x4, 0x200000004d812, 0xffffffffffffffff, 0x0, 0x0)


mknod(&(0x7f0000000140)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x4090426b, 0x0)
fcntl$getown(r0, 0x5)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104302, &(0x7f00000001c0)=0x2)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
paccept(0xffffffffffffffff, 0x0, 0x0, 0x0)
rename(0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = _lwp_self()
_lwp_suspend(r0)
_lwp_wakeup(r0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
__setitimer50(0x0, 0x0, &(0x7f0000000080))


r0 = socket(0x18, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8020690c, &(0x7f0000000180)=0x8000000000000032)


mkdir(&(0x7f00000007c0)='./file0\x00', 0x0)
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
compat_43_ocreat(&(0x7f0000000100)='./file0/file2\x00', 0x0)
mknodat(0xffffffffffffff9c, &(0x7f00000000c0)='./file1/file2\x00', 0x81c0, 0x0)
linkat(0xffffffffffffff9c, &(0x7f0000000380)='./file1/file2\x00', 0xffffffffffffff9c, &(0x7f00000003c0)='./file0/file2\x00', 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(0x0, 0x0)


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
__fhopen40(&(0x7f00000000c0), 0x0, 0x0)
dup3(0xffffffffffffffff, 0xffffffffffffffff, 0x0)
chroot(0x0)


r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r1, &(0x7f0000003000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
listen(r1, 0x0)
connect$unix(r0, &(0x7f0000000640)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
connect$unix(r0, &(0x7f0000000280)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
r2 = fcntl$dupfd(r1, 0x0, r0)
listen(r2, 0xa0f)


__getrusage50(0x0, &(0x7f0000000000))
__getrusage50(0xffffffffffffffff, &(0x7f00000000c0))
__getrusage50(0x0, &(0x7f0000000180))
__getrusage50(0xffffffffffffffff, &(0x7f0000000240))
__getrusage50(0x0, &(0x7f0000000300))
getsockopt$sock_cred(0xffffffffffffff9c, 0xffff, 0x11, &(0x7f00000003c0), &(0x7f0000000400)=0xc)
__wait450(0x0, 0x0, 0x0, 0x0)
__getrusage50(0x0, 0x0)
r0 = getsid(0x0)
__wait450(r0, 0x0, 0x0, &(0x7f0000000600))
__mount50(&(0x7f0000000280)='kernfs\x00', &(0x7f00000002c0)='.\x00', 0x0, 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pathconf(&(0x7f0000000080)='./file0\x00', 0x1)
__wait450(0x0, 0x0, 0x0, 0x0)
msgget(0x2, 0x393)


mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
writev(r0, &(0x7f0000000340)=[{&(0x7f0000000000), 0x2cfea}], 0x1000000000000013)
r1 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
poll(&(0x7f00000002c0)=[{r1, 0x16e}], 0x1, 0x0)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff})
socket(0x0, 0x0, 0x0)
dup(0xffffffffffffffff)
r0 = socket(0x18, 0x2, 0x0)
open(0x0, 0x0, 0x0)
open(0x0, 0x0, 0x0)
close(r0)
r1 = socket(0x800000018, 0x1, 0x0)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, &(0x7f0000000000), 0x4)
bind$unix(r1, 0x0, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)


r0 = socket$inet(0x2, 0x1, 0x0)
getsockopt(r0, 0x6, 0x6, 0x0, 0x0)


openat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
write(0xffffffffffffffff, &(0x7f0000000140)="4e8f8cdc90bf00ba12aaa92982b8b4b7630aa9bd5db9dab4749648139ce6d31e8c0219fddb943501650e4f653434f29e824bfaf628632cc6628d8cf30ae00a546ca3b4d1584dcb4de2c59dfa86a50eadde287b4643dc1052ab", 0x59)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc048696d, &(0x7f0000000180)=0x8000000000000032)


r0 = open$dir(&(0x7f0000000080)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000ffc000/0x4000)=nil, 0x4000, 0x0)
mkdirat(r0, &(0x7f0000000180)='./file1\x00', 0x0)
fstatat(r0, &(0x7f0000000100)='./file1\x00', &(0x7f0000000240), 0x0)


mknod(&(0x7f0000000080)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
writev(r0, &(0x7f00000000c0)=[{0x0}], 0x1)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000005c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_40_mount(&(0x7f00000000c0)='ffs\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))


modctl$MODCTL_LOAD(0x5, 0x0)
profil(0x0, 0x0, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
pread(r0, &(0x7f0000000d00)="8e", 0x1, 0x0)


r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000000029, 0x35, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setsockopt(0xffffffffffffffff, 0x29, 0xe, &(0x7f0000000000)="02000000", 0x4)
r1 = socket(0x18, 0x2, 0x0)
socket(0x0, 0x0, 0x0)
getsockopt(r1, 0x29, 0x32, 0x0, 0x0)
getpgrp()
r2 = msgget$private(0x0, 0x0)
msgctl$IPC_SET(r2, 0x1, &(0x7f0000002f80)={{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7}, 0x6, 0x2, 0x0, 0x0, 0xb9, 0x80, 0x3, 0x5})


__mount50(&(0x7f00000002c0)='efs\x00', &(0x7f0000000080)='.\x00', 0x0, 0x0, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x22, r0, 0x0, 0x0)


listen(0xffffffffffffffff, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f00000000c0)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000280)='./file0\x00', 0x7)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
mknod(0x0, 0x0, 0x1733)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
__mount50(0x0, &(0x7f0000000200)='./file0\x00', 0x400000, &(0x7f0000000300), 0x0)


_ksem_open(&(0x7f0000000100)="b7c6", 0x0, 0x0, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000280)={<r0=>0xffffffffffffffff})
sendmmsg(r0, &(0x7f0000000040)={0x0}, 0xffffffffffffff12, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000001480)='./file0\x00', 0x0, 0x0)
close(r0)


ioctl$FIOASYNC(0xffffffffffffffff, 0x4008426f, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
poll(&(0x7f0000000000)=[{}], 0x4e8, 0x0)


open(&(0x7f0000000140)='./file0\x00', 0xf8e, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, r0)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f0000000080), 0x1c, 0x0}, 0x0)
bind$unix(0xffffffffffffffff, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket(0x18, 0x1, 0x0)
socket(0x18, 0x2, 0x0)
sendmsg$unix(r1, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)


open(0x0, 0x0, 0x0)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x80000000002)
mknod(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
socketpair(0x1, 0x2, 0x0, &(0x7f0000001640)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
recvmmsg(r0, &(0x7f0000001dc0)={0x0}, 0x10, 0x0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
link(&(0x7f0000000940)='./file0\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
chflags(&(0x7f00000004c0)='./file0\x00', 0x3)
rename(&(0x7f0000000240)='./file0\x00', &(0x7f0000000640)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000140)='ffs\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(&(0x7f00000000c0)='./file0\x00', &(0x7f0000000180)='\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{0x0, 0x0, 0x0, <r0=>0x0, <r1=>0x0}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x10800039, &(0x7f0000000140)="01")
__posix_chown(&(0x7f0000000200)='./file1\x00', r0, r1)


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x24, &(0x7f0000000000)="5ab7776a", 0x4)
setsockopt$sock_int(r0, 0xffff, 0x800, &(0x7f0000000180)=0x2, 0x4)
syz_emit_ethernet(0x4e, 0x0)
recvmsg(r0, &(0x7f00000003c0)={&(0x7f0000000100), 0xc, 0x0, 0x0, &(0x7f0000000300)=""/190, 0xbe}, 0x2800)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8010427a, &(0x7f00000001c0)=0x3)


mknod(&(0x7f0000000100)='./file0\x00', 0x3a0914c44f7b202d, 0x500)
open(&(0x7f0000000080)='./file0\x00', 0x70e, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
__select50(0x40, &(0x7f0000000040), &(0x7f00000000c0)={0x9}, 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x0, 0xb, 0x0, 0x0)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000100)=[{0x0}], 0x1)


setpgid(0x0, 0xffffffffffffffff)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
__mount50(&(0x7f0000000180)='coda\x00', &(0x7f00000001c0)='./file0/file0\x00', 0x0, &(0x7f0000000200)='v`', 0x2)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
pread(r0, 0x0, 0x0, 0x0)


r0 = socket(0x18, 0x400000002, 0x0)
setsockopt(r0, 0x1000000029, 0x2e, 0x0, 0x0)
getsockopt(r0, 0x29, 0x3d, 0x0, 0x0)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x10000000000002}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0x2e, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbdf13b9fd812eaa4e713048e69931929648", 0x14)


r0 = socket(0x18, 0x2, 0x0)
close(r0)
r1 = socket(0x18, 0x3, 0x0)
setsockopt(r1, 0x1000000029, 0x31, 0x0, 0x0)
semctl$IPC_SET(0x0, 0x0, 0x1, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r1, &(0x7f0000000280)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000180)=ANY=[], 0x10}, 0x0)


compat_50_select(0x300, 0xffffffffffffffff, 0x0, 0x0, 0x0)


ioctl$WSMUXIO_REMOVE_DEVICE(0xffffffffffffffff, 0x80085762, &(0x7f0000000040)={0x1})
r0 = socket(0x18, 0x3, 0x3a)
setsockopt(r0, 0x29, 0x6c, &(0x7f0000000040), 0x4)
setsockopt$inet6_MRT6_ADD_MIF(r0, 0x29, 0x67, &(0x7f0000000240)={0x7fff}, 0xc)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000029, 0x4, &(0x7f0000000040)="03000000", 0x4)


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
setreuid(0xee00, 0x0)
r1 = getuid()
seteuid(r1)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x0, 0x10, r0, 0x0, 0x0)


r0 = socket(0x18, 0x1, 0x0)
r1 = fcntl$dupfd(r0, 0x2, 0xffffffffffffffff)
close(r1)
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
poll(&(0x7f0000000000)=[{}], 0x20000000000000fe, 0x0)


mkdir(0x0, 0x0)
compat_43_osetrlimit(0x9, &(0x7f0000000080))
socket(0x10, 0x2, 0x0)


mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
compat_50_quotactl(&(0x7f0000000000)='./file0\x00', 0x30000, 0x0, 0x0)


msgrcv(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)


r0 = socket(0x2, 0x1, 0x0)
r1 = socket(0x2, 0x2, 0x0)
connect$unix(r1, &(0x7f0000000a80), 0x10)
getsockname$unix(r1, &(0x7f0000000000)=@abs, &(0x7f0000001200)=0x8)
r2 = socket(0x2, 0x1, 0x0)
bind(r2, &(0x7f0000000000), 0x10)
listen(r2, 0x0)
r3 = fcntl$dupfd(r2, 0x0, r2)
accept(r2, 0x0, 0x0)
accept$unix(r3, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
connect$inet(r0, &(0x7f0000000000), 0x10)
shutdown(r3, 0x1)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_40_mount(0x0, 0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='msdos\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000001c0))


compat_30_fhopen(&(0x7f0000000140)={{[0x1b04]}, {0x0, 0x0, "9d7b856edd241f59dc72630a7a149435"}}, 0x80)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
r0 = open$dir(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
close(r0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x2a, r0, &(0x7f0000000000), 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
mkdir(&(0x7f0000000040)='./file2\x00', 0x0)
rename(&(0x7f00000002c0)='./file2\x00', &(0x7f0000000200)='./file0\x00')
compat_43_ogetdirentries(r0, 0x0, 0x0, &(0x7f00000000c0))


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0x0, 0x0)
__setitimer50(0x0, 0x0, 0x0)


ptrace(0xa, 0x0, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__posix_rename(&(0x7f0000000100)='./file0\x00', 0x0)
chflags(&(0x7f00000003c0)='./file0\x00', 0x5)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f00000003c0)='./file0\x00')


mkdir(0x0, 0x0)
symlink(0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mknod(&(0x7f0000000000)='./file0\x00', 0x8000, 0xa718)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000140), 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open$dir(&(0x7f0000000b80)='./file0\x00', 0x0, 0x0)
pread(r0, 0x0, 0x0, 0x0)


r0 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
shmat(r0, &(0x7f0000ffc000/0x1000)=nil, 0x0)
mprotect(&(0x7f0000ffc000/0x1000)=nil, 0x1000, 0x2)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x144, r0)
ktrace(0x0, 0x0, 0x0, 0x0)
r1 = socket(0x1d, 0x40000003, 0x0)
getsockname$inet(r1, 0x0, 0x0)


r0 = socket(0x2, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8090690c, &(0x7f0000000180)=0x8000000000000032)


compat_40_mount(&(0x7f0000000080)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
pathconf(&(0x7f0000000100)='./file0\x00', 0x7)


mknod(&(0x7f0000000200)='./file0\x00', 0xc035cd953ea0fd64, 0x1733)
__stat50(&(0x7f0000000140)='./file0\x00', 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0xc, r0, &(0x7f0000000000), 0x12)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_30_getfh(&(0x7f0000000280)='./file0\x00', 0x0)


syz_emit_ethernet(0x2e, &(0x7f0000000080))
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f0000000080), 0x1c, 0x0}, 0x0)
bind$unix(0xffffffffffffffff, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x4000000000000000, 0xffffffffffffffff})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
close(r0)
socket(0x18, 0x2, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
sendmsg$unix(r0, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)


pipe(&(0x7f0000000100)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0x62e2dd08f149ff1b, r1)
mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
chflags(&(0x7f0000000000)='./file0\x00', 0x0)


__mount50(&(0x7f00000002c0)='overlay\x00', 0x0, 0x0, &(0x7f0000000540), 0x0)
pipe(&(0x7f0000000140))
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0))
mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r1 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x5, 0x10, r1, 0x0, 0x0)
fcntl$lock(0xffffffffffffffff, 0x8, &(0x7f0000000180))
getpid()
r2 = compat_30_socket(0x22, 0x3, 0x0)
_lwp_wakeup(0x0)
getpeername(r2, &(0x7f0000000180)=@data, &(0x7f00000001c0)=0xe)


mknodat(0xffffffffffffff9c, &(0x7f0000000280)='./file0\x00', 0x1000, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
readv(r0, &(0x7f0000000740)=[{&(0x7f0000000140)=""/144, 0x90}], 0x1)


r0 = socket(0x10, 0x3, 0x0)
getsockopt$sock_cred(r0, 0x1, 0x11, &(0x7f0000caaffb)={0x0, 0x0, <r1=>0x0}, &(0x7f0000cab000)=0xc)
setregid(0x0, r1)
r2 = socket(0x10, 0x3, 0x0)
getsockopt$sock_cred(r2, 0x1, 0x11, &(0x7f0000caaffb)={0x0, 0x0, <r3=>0x0}, &(0x7f0000cab000)=0xc)
setregid(0xffffffffffffffff, r3)


r0 = socket$inet(0x2, 0x2, 0x0)
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)
fcntl$lock(r0, 0xa, 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x144, r0)
lchflags(&(0x7f0000000040)='./file0\x00', 0x0)


r0 = _lwp_self()
compat_50__lwp_park(0x0, 0x0, 0x0, 0x0)
_lwp_suspend(r0)
_lwp_suspend(r0)
_lwp_continue(r0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0/file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_30_getfh(&(0x7f0000000200)='./file0/file0\x00', &(0x7f0000000300))


getpid()
r0 = socket(0x800000018, 0x1, 0x0)
msgctl$IPC_SET(0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0xffffffffffffffff}})
r1 = socket(0x18, 0x2, 0x0)
close(r1)
r2 = socket(0x800000018, 0x1, 0x0)
rename(0x0, &(0x7f0000000080)='./bus\x00')
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x40000000000})
bind$unix(r2, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


r0 = socket(0x18, 0x3, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
semop(0x0, 0x0, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x8)
socket(0x18, 0x3, 0x0)
syz_emit_ethernet(0x4a, &(0x7f0000000280))


socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
compat_43_orecvfrom(r0, &(0x7f0000001f00)=""/90, 0x5a, 0x440, 0x0, 0x0)
writev(r1, &(0x7f0000003240)=[{0x0}], 0x1)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
pathconf(&(0x7f0000000000)='./file0\x00', 0x10)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={0x0, 0xdffffffffffff7ff}})
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80104307, &(0x7f00000001c0))


__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x1, &(0x7f0000000380), 0x0)
unlink(&(0x7f0000000000)='./file0\x00')


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40046678, &(0x7f0000000000))


fcntl$dupfd(0xffffffffffffffff, 0x9a0ebeef00000008, 0xffffffffffffffff)


write(0xffffffffffffffff, &(0x7f00000001c0)="39e4aff151", 0x5)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = socket$inet6(0x18, 0x3, 0x0)
open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
pathconf(&(0x7f00000000c0)='./file0\x00', 0x0)
sendto$inet6(r0, &(0x7f0000000000)='3', 0x329, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


mknod(&(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
link(&(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000e40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000340)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f00000006c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000580)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f00000007c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000a40)='./file0\x00')


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__nanosleep50(&(0x7f0000000d80), &(0x7f0000000dc0))


sendmsg$unix(0xffffffffffffffff, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000000)=ANY=[@ANYBLOB="89000000ffff000001"], 0x9}, 0x0)
sendmsg(0xffffffffffffffff, &(0x7f0000000380)={0x0, 0x32c, 0x0, 0x0, &(0x7f0000000000), 0x90}, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000008c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvmsg(r1, &(0x7f0000000340)={0x0, 0x0, 0x0, 0x0, &(0x7f00000024c0)=""/236, 0xec}, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
close(r1)
sendmsg(r0, &(0x7f0000000380)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x0)


r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x0, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
socket$inet(0x2, 0x0, 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, &(0x7f0000000000)=0x9455, 0x4)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__select50(0x40, &(0x7f0000000000), 0x0, 0x0, 0x0)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x8000000000000001})
r0 = socket$unix(0x1, 0x5, 0x0)
bind(r0, &(0x7f0000000140), 0xc)
mkdir(&(0x7f00000000c0)='./file0\x00', 0x1d7)
mkdir(&(0x7f0000000080)='./file0/file0\x00', 0x0)
rmdir(&(0x7f0000000240)='./file0\x00')
semctl$SETALL(0x0, 0x0, 0x9, &(0x7f0000000040)=[0x7ff])
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x40001ff})
mprotect(&(0x7f000019e000/0x1000)=nil, 0x1000, 0x4)
r1 = semget$private(0x0, 0x2, 0x84)
semop(r1, &(0x7f0000000000)=[{0x0, 0xfff9, 0x800}], 0x1)
semctl$SETVAL(r1, 0x4, 0x8, &(0x7f0000000100)=0x9)
r2 = socket(0x18, 0x1, 0x0)
setsockopt(r2, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r2, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
r3 = socket$unix(0x1, 0x5, 0x0)
getsockopt$sock_int(r3, 0xffff, 0x1000, &(0x7f0000000040), &(0x7f0000000100)=0x34)
setsockopt(r2, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
getpeername(r2, &(0x7f00000000c0), &(0x7f0000000080)=0xc)


mknod(&(0x7f0000000400)='./file0\x00', 0x2000, 0x287e)
open$dir(&(0x7f00000000c0)='./file0\x00', 0x90dbb0cea2db8439, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x20007461, 0x0)
mkdir(&(0x7f00000007c0)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)="82", 0x1)
__mount50(0x0, &(0x7f0000000200)='./file0\x00', 0x400000, &(0x7f0000000300), 0x0)


r0 = fcntl$dupfd(0xffffffffffffffff, 0xb, 0xffffffffffffffff)
ioctl$FIOASYNC(r0, 0x8004667d, &(0x7f0000000040))


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0x80145003, &(0x7f0000000000)={0x0, 0x0})


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x10, 0x0)


pipe2(&(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff}, 0x0)
__fstat50(r0, &(0x7f00000003c0))


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f00000001c0)={0x0, 0x0, <r1=>0x0}, &(0x7f0000000200)=0xc)
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0x0, 0x0, r1}})
r2 = socket(0x800000018, 0x1, 0x0)
setsockopt$sock_int(r2, 0xffff, 0x1000, &(0x7f0000000000), 0x4)
bind$unix(r2, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
r3 = socket(0x18, 0x2, 0x0)
connect$unix(r3, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r3, &(0x7f0000000080), &(0x7f0000000000)=0xfffffffffffffe22)
r4 = socket(0x18, 0x2, 0x0)
getsockname$inet(r4, &(0x7f0000000080), &(0x7f0000000040)=0xc)
r5 = socket(0x800000018, 0x1, 0x0)
bind$unix(r5, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
msgctl$IPC_SET(0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0xffffffffffffffff}})
r6 = socket(0x800000018, 0x1, 0x0)
bind$unix(r6, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


compat_40_mount(&(0x7f0000000040)='puffs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0))


setuid(0xee01)
r0 = shmget$private(0x0, 0x2000, 0x0, &(0x7f0000ffe000/0x2000)=nil)
shmctl$IPC_STAT(r0, 0x4, 0x0)


r0 = socket(0x1f, 0x1, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x0, 0x0, 0x0)
r1 = socket(0x18, 0x1, 0x0)
getsockname$inet(r1, 0x0, 0x0)


socket$inet(0x2, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
compat_50_mknod(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
recvmsg(r0, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f00000001c0)="01")
mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
mkdirat(0xffffffffffffff9c, &(0x7f0000000680)='./file0\x00', 0x0)
open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)


syz_usb_connect$hid(0x0, 0x0, 0x0, 0x0)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
shmat(r0, &(0x7f0000001000/0x3000)=nil, 0x0)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x3)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80067409, &(0x7f00000001c0))
swapctl$SWAP_ON(0x1, 0x0, 0x5)


writev(0xffffffffffffffff, &(0x7f0000000180)=[{&(0x7f00000000c0)="98", 0x1}], 0x1)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmmsg(r0, &(0x7f0000000000)={0x0}, 0xfffffe32, 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mknod(&(0x7f0000000000)='./file1\x00', 0xe015, 0x0)
__utimes50(&(0x7f0000000080)='./file1/file0/../file0\x00', 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000180)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open(0x0, 0x0, 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)
setpgid(0x0, 0x0)
getpriority(0x1, 0x0)
truncate(&(0x7f0000000440)='./file0\x00', 0x0, 0x3e2b649f)
truncate(&(0x7f0000000780)='./file0\x00', 0x0, 0x10001)


mknod(&(0x7f0000000000)='./file0\x00', 0x8000, 0x6381)
open(&(0x7f0000000040)='./file0\x00', 0x2, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
r1 = msgget$private(0x0, 0x282)
msgrcv(r1, &(0x7f0000000600), 0x6e, 0xffffffffffffffff, 0x1000)
msgsnd(r1, &(0x7f00000000c0)=ANY=[@ANYBLOB="01000000000000007bfeabe6eefc095a29576d9e2b8227a421879f791a1010415f068b8835673e7ecc"], 0x1f, 0x0)
msgsnd(r1, &(0x7f0000000d00)=ANY=[@ANYRESHEX], 0x401, 0x0)
r2 = socket(0x2, 0x3, 0x0)
getsockopt(r2, 0x0, 0x64, 0x0, 0x0)
msgsnd(r1, &(0x7f00000005c0)=ANY=[@ANYRESDEC=r2], 0x401, 0x0)
socket(0x1, 0x3, 0x1)
msgrcv(r1, &(0x7f0000000400)={0x0, ""/137}, 0xaf, 0x0, 0x1800)
msgsnd(r1, &(0x7f0000000500)=ANY=[@ANYBLOB="0200000000000d3aeae0f6a3aaf1a737"], 0x14, 0x0)
ioctl$FIONREAD(r0, 0x4004667f, &(0x7f0000000540))
msgsnd(r1, &(0x7f0000000780)=ANY=[@ANYRESHEX=r0], 0xe1, 0x0)
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f0000000300), &(0x7f0000000340)=0xc)
msgrcv(r1, &(0x7f00000002c0)={0x0, ""/43}, 0x33, 0x0, 0x800)
msgrcv(r1, &(0x7f00000007c0)={0x0, ""/250}, 0x102, 0x0, 0x0)


setrlimit(0x0, 0x0)
r0 = socket$unix(0x1, 0x5, 0x0)
bind$unix(r0, &(0x7f0000000040)=@file={0xd570d0466b6018f, './file0\x00'}, 0xa)
listen(r0, 0x0)
shutdown(r0, 0x0)
r1 = socket$unix(0x1, 0x5, 0x0)
connect$unix(r1, &(0x7f0000000000)=@file={0xd1653077bafa0114, './file0\x00'}, 0xa)


r0 = open(&(0x7f0000000300)='.\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f0000000340)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
chflags(&(0x7f0000000000)='./file0\x00', 0x20000)
mkdirat(r0, &(0x7f0000000440)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38//file0\x00', 0x0)
rename(&(0x7f0000000100)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38//file0\x00', &(0x7f0000000180)='./file0\x00')


umask(0x0)


r0 = open$dir(&(0x7f0000000100)='.\x00', 0x0, 0x0)
fchownat(r0, 0x0, 0x0, 0x0, 0x0)


pipe2(&(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff}, 0x0)
__clock_getres50(0x0, 0x0)
close(r0)
sendmmsg(r1, 0x0, 0x0, 0x0, 0x0)
compat_43_oftruncate(r0, 0x0)


pipe(&(0x7f0000000380)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
read(r1, &(0x7f00000000c0)=""/189, 0xbd)
writev(r0, &(0x7f0000000340)=[{0x0}], 0x1)


__mount50(&(0x7f0000000040)='null\x00', &(0x7f0000000080)='.\x00', 0x0, &(0x7f0000000300), 0x0)


ptrace(0x0, 0x0, 0x0, 0x0)
r0 = getpid()
ptrace(0x7, r0, 0x0, 0x0)


mmap(&(0x7f0000ffe000/0x2000)=nil, 0x2000, 0x0, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
pathconf(&(0x7f0000000080)='./file0\x00', 0x8)


mknod(&(0x7f0000000480)='./file0\x00', 0x6000, 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f00000000c0)='./file0\x00', 0x7)


__mount50(&(0x7f0000000000)='msdos\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
pathconf(&(0x7f0000000200)='./file0\x00', 0x7)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
accept(0xffffffffffffffff, 0x0, 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
unlink(0x0)
r0 = socket(0x1, 0x2, 0x0)
ioctl$FIONREAD(r0, 0xc0106914, &(0x7f0000000080))
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
setrlimit(0x0, &(0x7f0000000140))


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = socket(0x2, 0x2, 0x0)
ioctl$FIONREAD(r0, 0x80206979, &(0x7f00000001c0))
r1 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
compat_43_ommap(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0, 0x0, r1, 0x0)
r2 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
compat_43_ommap(&(0x7f00002f5000/0x1000)=nil, 0x1000, 0x0, 0x0, r2, 0x0)


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
writev(r0, &(0x7f0000001480)=[{&(0x7f0000000100)="cc60d3d79a1a7122a5fb9de9ea6ca0b30fb2942ef7652736f10f86d45ae419b2ecf7af93d33b3de1f2c56c0d93da7484095d0d543baf7f0161a025c9c02a561f9a76d06f1eaec7d8df99d6c28b10fa891e47d6c3bba690a611889032ac7b3581d7c92d1c546e59764f069048316871794f0615dd4d8bb1057a61fb177be649004fb56a7d46c856a20d73f050bf29d58b803a676cc3761e8a47857f51d90a4465f9c605ab2829d78ea2571900", 0xac}, {&(0x7f0000000080)="b389e5ae", 0x4}, {&(0x7f0000000200)="5f6a8de84b4c523a583f91557c748a02cda4cedf13579996073843ec5cbabaadcec85d62a8a897a78307f3d2c30eea6356105ec8aaba86930f1aed0b3d111d", 0x3f}, {&(0x7f0000000240)="2a5f4cd9b73c31d629e8d68e605e5940744ef138960b671d42c56f79fcb70549247e5504e02f85a6099eff9291731e94399a7e966e783d4073ecd5dc8e540f32190bb47050aee3907495ecf7708d1c295a0836f79ba03ea13db12836c2441989726ffbf7fd1da55bfe2caaf081a1ef55599935c0e60cb0866ee48d420b77a6e269bc30b2650349e16792252c6be8bcc6e9ce2b941299ab505e4779fa1960bb7cf924dcbc53c82e76bcd80731a694d12e8a139d8ecef6f5560939054b7842d25cd7396377f39bcfe2a2383feea17b87393cd892b90742631718b1877d8d83d5f66587e3e5b1e129433ead750143a4c51aa46e78e3b1a54a4634eea2846511719b1637b19ea8c96a95332abd69014f53e91a7319a36efb4052387dc6330aa00b5680142650e8dab5eb6b83ff7d0bbf1293338a8d3e8a6dcd7ed4916279eac230dda46e6fcdebe8a4c12d598883d9392da05791e153d905ce1930aebfc5dfc0fc22a88777c9fbc24c0b021189c3503466bf850f59e6649dd0b40d935ba155a9d7e065df6d90a500be9d7e1e77607d495898a8d6c9c48dda88533e75e760dceae91d670f97ebbe68b845c72eb698a9e0389b15cfe768cd4e524dbe939bc2d8363ae3b71a454a0a0378cd1f36565ab9d1b92263688195df65151ee9b8158c7a5280019a38f00ea05142b3cc832ece46e326e36a55d7a30f866ca97da61a3877612f0d9f23c052edf60a430d3a9acbae3d9f14f22f38a1eeac7df37ae17748a960b9e0d265bf6cdec73bd2a7b5f25035d845d936936cbfe6f013f8d7c73eba2bd9876ff6db3224e50e75c051979f4cdc41d28ea54e28f560fb0291cd6b6ba527e0f358ffa46b988f0bdbf801063d2024380071709ac00ed463e08fd1718a857b567d8b3eda143915e943d9530d25ed15743e117be375dc08e241328a53fa05b58c7c72a35d471a72706afce82b6c57eef81444269e06338bfd0af6f6ef6479dcc26aab0b10c7e58572aa507a69831d63c78c947cecda2c57bb8ae6b2ad7862d160b423de941be69928467db10ee4817536196ad0c5cc02a74ba1a1fb467e472f025f200846112b6b3efd298284d8c0cc498fb9203db18e48e34c137d1285120d809a774d56a62b3dbe8f202cb9cf18c33099f5b9651fc7ae41de53842f327fd40bcfaa601a791da1103351a64cc9815ff1f32cde6fbfe4c92f55a9aff41c055d4edce9c656ecd1becb9035bb8bd46cf47bca0dcb1848da94d05d1475c83a48c02f8f808597eaa9e88a4e4b12d0ef95458eeab115cb189fa7e368186bb68610af4651e27feab061c4ada46e8c12d6d33fe32cd40485b02120418a9aaba8f04c17ea1abdcb9e6801f5d8090c0780b3b5ae4382efbb64f62c825527b7e3d60ad9c73ba7f174ea7c7f2d155808052a0ef6ab2af85cf9a758a8c1227c85f2986fc06c5f54807968b18b6b5f9168d0b2de2728689d75f0efaedf79bd8bbbbc1d46b0f6072d46d46687bc92c0b16c4ad36c3fe4c7a2fde0cfec859052dd78d841464a6b4432342ea973bf18f1f5abd7b2fe7962faef39ff38bd88a40dbab0f8c08541e38efe924f9ec1addbd47362e349bfca42fd62cf18fef8db7d3cbf0b1733d37dcdae5b9c738b8426a1cfa205c9671870210508c614f256d199fb822613518b1a6fccfee08c48f0feae1ca4baaf0a02635282d4f630a8df115dc73e72d9f7b8feb4333b2c796f1d3f2cfb398ba1e9c81584185b8b578bac3c64d667da7d91062f0c2ccec0c9d09e0c15c62e7bc509c3204d054c22d4eac685626e48e25cff2fc382279d9cb941c2ca182b1f65b05ff05db880a37b1e7f9b3d8975558858352bbbea9213a885b7d43451230e0624403baa77839baa33ae92de771ca76319c2353e9f98ea8191fbb686aed6d08e87bcc2bb8084ae02fafbac1a0bba93f0f155fa2d7bc21bf77f116429c6a1373c34d080518b73481fcf012c0a9127e8f174695ccd4add75b026bf2935990d5cebc4e74b97ae67a414029252336880232635fce7ed7bf0b069e08e10f7e1cc4d2973dcd2ea0b63f9dc4f4c2db92e03c553afcb1060b1
2f6b9b4d1d799edceefdfafdae63ad797e400645c149b20f8a10af54db3289441499e122413495c7b32179ab6c274bbdf6dde48ce26f46f07ede9a314c1a73669ee8cb0d854c8a3206dcd4f532d36fa7affca1226962e20ca2c6ca0da1ab0848d23d6f198681911592815c080cead9fe6c81cf0d7ce3578978dbf1dc2f1902e51f740f95b02ebff4cf25ac8d28b8e16998f5e329e9a377a82b5bc227f77c2f1ad505286aac8618799c2363a2d07d229519370be62ef0dde6525df45a7b807759b9ba1c8557bdbecb429432c4e1a84094edda567f1245ba887686342d0a31b7dbe09bd2be75516b255d2bfad9337649e86a29b0a19e16b4059c7c45e9f81db6780af6487a11f9088ea265a99d6e95535e14fe839945f8f7da9bf9142dd12c305bd41bb392cae8eb81b28fda176d4b538f4ddaaed47913cd9149d934ca36879718bbd76a2d7ce11e6294398ea433af6cf53b951974ec96a9527178b6cbad3088a2e2dc75113fa88ede462e68b4d6b63020a45e94fbc6c3e6677d6cffee0e3bf606cd2d4b9fac89f91db8f3a1fb9f506708314249fa5c36912e3cb281acbdec3b080735ae24ebb4bca3c7900c21175435afbe5024a706e90f8ecf257c99cd8b67b5a9c2bdc2bb624201f55882349526f8faf463eecb183635db72a6aef5fa2048d97e6f6c513598f49158050df1efc6b13b8ae396cd6a905c26b94d1d50d361f10c0362a8c206c7dd553dd8b116694aa82ac029c64aa71b8a307793da12f93ce030d3c42443c37b91043687befb7968da97ffe91d45e4fd506ba4d9c5d7fe5ac7433fb6bb3ce471f59ade734d109d2093fa119827b65a360f2128e7e85ab494609b60d6e236ed8a72984f6c75652f5baa07e2e921365af2b819eef58343e65ffb45ba8580d62f5b8a9988f7176c1559f8ede3b1b92e0ff27978fc5225ed1512b605f1fd4fce46e768970b11a7b4802c93eb05a8571887cdc15f69929eadcd1f9d08ed2d6aab2997ffa2102c2d8983a78ad43779d03aa612ecbbdde634b14e980b176f04f99efad82a04001f34d6addb89e0ce629856f5ad401fed1a2c046e1fe18741b24d4253830af94a1defeecdf6f70b430e48d7b2e4ae2fb331322d96e1a361e4dcae1d0865e7ccc4aa58bcbb3ad1ce0b294437887e03088a6bb4c92c3044bb33d2a893975023ad979fb2f84d4842326cf058595398779116245c452e54989df0591238af63c5b187d18457f008f2759ae470282ee4640e92639a46915fb67f22967beb1f46736aa9c90df08f39ee481f6859f9eafdecd473bdc24ef3ad71b4c54dde360f2d550db18af7589c8107f29b0a713b65f039da10c2dc55e408ce75c77aba97c116ab078ed1467464fe4041668228316f0fff7c0e0f1a3cc47aae570182197cc3a030a4af7fe2d401b5a13d59b8fe55d48b0667b782cbf1c2e9e491167d87830bb25008aff93c4459a3ce9fd8655a542c6311d91ec6b6bd7935aba1cd01d10439f5e5a3883566c8538edb97c5d2a7c7533d415973729f52dc03134f3d777b7ad07c0f5b0d985ca48fde6cce9ec99f1223f476dcfbb5ef865a3f54459415b25e2031eeb469c83ac68fd77e59c6b6a4b9543a943c4461e47e94d038c63abf2734a3c0e99b95505e13af6cf7b66840dc60e96fb218436ea00b4a577c2effdba43e6adf97cb47c0ce606e87a7834dec04396c1e4f7df59ae5acedfd1d68c87e405fc5d788884686270cb08f0fb15fe8b11589792f9afc11152cd45f43fac6e758588cfaa88874267c3096bd689108bbb0296f3d182df7d46cf7e6b8bb923ba887443c3d2211785e364c1e55d4e9396d950e50cc11ba9a4da70a64f6acbbcd421183eb2c0521054c4f39650409e21a9d3b781725b0c38abce7b4615fa64a9cbf71a19f7e414279a7494d2e66e4e2b2d5ce7027109cfc763a9dcac37065ae80ec0cf3962deec0f9eba3bd7c514a9854ddec6542cf60a1e69cf5d1a73239633675d4ac1f3b0758e69d1d9fb8659f43f07ef6b0b2a7c9740649198b44f3272a8810d374b00c1d49b7f667ed538d89e91e5734f8d6e270aef0a406fcb4072497dd5f25b49d31cfd64f7af2d5b4df72b74c39a66f442ae61aebe1cdb96b660e3248dad3d05bfce3d8630902a565c3a7636220e0435679ce1c45367570dd08722ddfe7f1bf56c44bcab8ef2d0979eeaa569dade4ec34751833d455b31b55edcb9b64f3045725fcd3a08ce0c20b09a437133df5738092eba91430208814b4135b0a64331aff1847ca60426c91e96a029cff039b14462da6e5f9c545980b3712f7fa5f3958d92278194cf6b37ef76e91103d14db403956a7070d87afa81df227f2189926e1265a2872e9d64c64496083caff36dd3c27e961957acb86f0f9a3cbb5ddbb6bf9c62dc102d70d62eea969b1c2c5f42823f76827e918ffa77cc12ca8d31effd0b55529c496da762b6a95d43360ea4e811979ee316e143733f886a6eca5ac76746d4a302ec57b6b4af57c25631d77732
43abd400d15274703154dfde2f1a63e075e4b22ed4516dd1496e3f7848bbf19e3215edda64140649d9b1a06975773d5dd0ca9c4bdfd5eb9fb6164aa659e2a8415098dd05496d0b2a20d966a4c9efefa964e007a3b731d3bad501fd21ca564c6155d893c3075c234af6378a12595d8a88ae0518350b75d57fb91b7610bfc3168f548a9a63432b49a9dbee2ee1a4ddd80255e2c655cc80d59d7e90e21fb090db6d59ef84c0becfda6bd01c5a806ce67bea77fdba62ba40e2114d31e6087ad67b28adc20d7cabe7911aa2db119cf5f43b2011bb92556e8e303a264f813764027fbec36f8248a667c766cfc12c80d96965f222d39b5f58a54b255d3b1eb4530952b6d251ff1f9a2cbc585d539704c8ea6884699ebc95d57cd1daef169da86cc10fe7300352e939c35bbe3104aea1b1c681f3e98be8fad7c91bb56724ffa794ea4ac7c4d51d4aad8e94087b4cdcab897e70558e6648ab42dfadab447735912af011fdbd9a47a4958b953b01d2e274449854cb7d4b44d88fb2db162b8dd664c7c32858c7714c69b838cb30b17e6a42ca760256c4c6da57ad5be955ca8c787262d8bd3f4a5f0170e70f92ee2ec9d227d066e41d0d4b640166b2f6937a66bc84506d102d3edac818b246954ba200a3057820da10b64ff99f016b42d90a29b43290d731ef23a72612d646f92acc05620068fd94edd12902197733cc9dbb55d95930ccc749c4a169b4b7056dba24f767da022eb01243353e1f4d0af2eb32e3feb331283ceab3e0112b37556749a701d11ada5cbe1b4c829fac46e12612679e47d4a066be939b886f0fdc2b4c5b4dd183ca025c5191ac67d7f992643333504da30e8300a0b40152671", 0xf12}], 0x4)
setrlimit(0x6, &(0x7f00000000c0))
r1 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x5, 0x10, r1, 0x0, 0x0)
setrlimit(0x3, 0x0)
mlockall(0x1)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x4)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETBELL(r0, 0x80105703, &(0x7f0000000300))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
symlink(&(0x7f0000000380)='.\x02\x00', &(0x7f00000002c0)='.\x02\x00')
chroot(&(0x7f0000000000)='.\x00')
rename(&(0x7f00000000c0)='.\x02\x00', &(0x7f0000000100)='./file0/file0/../file0\x00')


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
r0 = open$dir(&(0x7f0000000280)='.\x00', 0x0, 0x0)
__posix_fadvise50(r0, 0x0, 0x0, 0x0, 0x5)
__utimes50(&(0x7f0000000040)='./file0\x00', &(0x7f00000000c0)={0x0, 0x6})
lchown(0x0, 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='mfs\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f0000000240))
open(0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, 0x0)


r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80206913, &(0x7f0000000180)=0x8000000000000032)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
rename(&(0x7f00000003c0)='./file0\x00', &(0x7f0000000380)='./file1/file0\x00')


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x290d261a44dbd839, 0x0, 0xffffffffffffffb2)


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104302, &(0x7f00000001c0))


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mknod(&(0x7f0000000000)='./file0\x00', 0x8000, 0xa718)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000140), 0x0)
truncate(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_nanosleep(&(0x7f0000000300), &(0x7f0000000340))


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_30_getdents(r0, 0x0, 0x0)


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x40000424, r0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
compat_60__lwp_park(&(0x7f0000000100), 0x0, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80067475, &(0x7f0000000040))


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000747b, 0x0)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0986980, &(0x7f0000000180)=0x8000000000000032)


open(&(0x7f0000000180)='./file0\x00', 0x200, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
__clone(0x800, 0x0)


mkdir(&(0x7f00000007c0)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
__mount50(0x0, &(0x7f0000000200)='./file0\x00', 0x1f8a66ef1d93fae2, 0x0, 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
readlink(0x0, 0x0, 0x0)


shmget(0x2, 0x3000, 0x0, &(0x7f0000ffc000/0x3000)=nil)
mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0x0)
compat_50___stat30(&(0x7f0000002f40)='./file0\x00', 0x0)


mkdir(&(0x7f0000000300)='./file1\x00', 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
rename(&(0x7f0000000140)='./file1\x00', &(0x7f00000002c0)='./file0\x00')


compat_50_select(0x0, 0x0, 0x0, 0x0, &(0x7f0000000180)={0x0, 0x7fffffffffffffff})


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x1604)
open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)


msgget(0x3, 0x42)
semctl$IPC_SET(0x0, 0x0, 0x1, 0x0)
setreuid(0xee00, 0x0)
socket$inet(0x2, 0x5, 0x0)
socket$inet(0x2, 0x2, 0x0)
socket(0x0, 0x3, 0x0)
r0 = semget$private(0x0, 0x4000000009, 0x82)
semop(r0, &(0x7f0000000440), 0x2aaaad68)
poll(&(0x7f0000000000)=[{0xffffffffffffffff, 0x1}], 0x1, 0x0)
semctl$IPC_SET(r0, 0x0, 0x1, &(0x7f00000001c0)={{0xffffcd0d, 0x0, 0x0, 0x0, 0x0, 0x5c, 0x101}, 0x49, 0x0, 0xfe})
getpid()
r1 = socket(0x11, 0x3, 0x0)
sendto$unix(r1, &(0x7f00000000c0)="b10005040000000000000000340000001a5113fecea10500fef96ecfc72fd3357a89583535673039d2d236acf20b7804be38164991f7c8cf5f882b297be1aa5b23edeb51e2f0ac3ebbc2feb3fda1139b672f4d3360223e7d026ba8af630037a840c4f2bd53eb067e7335a069d7ac434e0c0000000000008904000000000022830cf41bed66f40066ccdcf3e4999d9d20002002c5dbfad800000008e371a3f8340412051e0000000000000200000000", 0xaf, 0x0, 0x0, 0x0)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x100000000000000, 0xffffffffffffffff})
socket(0x18, 0x3, 0x0)
socketpair(0x1, 0x1, 0x0, &(0x7f0000000100)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(r1, &(0x7f00000016c0)={0x0, 0x0, &(0x7f0000001600)=[{&(0x7f0000000280)="bd", 0x1}], 0x1}, 0x0)
recvmsg(r1, &(0x7f0000000140)={0x0, 0x0, &(0x7f00000006c0), 0x0, 0x0}, 0x840)
r2 = dup2(r0, r1)
r3 = semget$private(0x0, 0x5, 0x2c4)
semop(r3, &(0x7f0000000100)=[{0x0, 0x401e}, {0x0, 0x0, 0x1000}], 0x2)
semop(r3, 0xffffffffffffffff, 0x53)
semctl$GETZCNT(r3, 0x4, 0x7, &(0x7f0000000400)=""/30)
semctl$GETZCNT(0xffffffffffffffff, 0x1, 0x7, &(0x7f0000001540)=""/175)
semctl$GETZCNT(r3, 0x0, 0x7, &(0x7f0000000440)=""/61)
setsockopt$sock_linger(r1, 0xffff, 0x80, &(0x7f0000001280)={0xffffffc0, 0x9}, 0x8)
semctl$GETVAL(r3, 0x0, 0x5, &(0x7f0000000040)=""/4096)
r4 = geteuid()
getgroups(0x3, &(0x7f0000001080)=[0x0, <r5=>0x0, 0x0])
semctl$IPC_SET(r3, 0x0, 0x1, &(0x7f0000000280)={{0x1, 0x0, 0x0, r4, r5, 0x80, 0x2}, 0x4, 0x7fffffffffffffff, 0xfffffffffffffffe})
recvmsg(r2, &(0x7f00000011c0)={0x0, 0x0, &(0x7f0000001040)=[{&(0x7f0000001300)=""/252, 0xfc}, {&(0x7f0000001700)=""/221, 0xdd}], 0x2, &(0x7f00000012c0)=""/41, 0x29}, 0x2)
setsockopt(0xffffffffffffffff, 0x1000000000029, 0xe, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
r6 = msgget$private(0x0, 0x100)
msgrcv(r6, 0x0, 0x0, 0x0, 0x0)
r7 = getegid()
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000001200)={{0x8001, 0x0, 0x0, 0xffffffffffffffff, r7, 0x19, 0x2}, 0x808000, 0x2, 0x5})
getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000000), &(0x7f0000000040)=0xc)
msgrcv(r6, &(0x7f0000001440), 0xc4, 0x0, 0x823f7551b1ab5156)
r8 = geteuid()
semctl$IPC_SET(r3, 0x0, 0x1, &(0x7f0000001100)={{0x9, 0x0, r5, r8, 0x0, 0x38, 0x6}, 0x4, 0x7fffffffffffffff, 0x3})


compat_50_clock_gettime(0x2, 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
write(r0, &(0x7f0000000040)="ed", 0x1)
recvmmsg(r1, &(0x7f0000000880)={&(0x7f0000000840)={0x0, 0x0, &(0x7f0000000ac0)=[{&(0x7f0000000240)=""/217, 0xd9}], 0x1, 0x0}}, 0x10, 0x1060, 0x0)
close(r0)


compat_40_mount(&(0x7f0000000080)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000100)=[{&(0x7f00000001c0)="ee", 0x1}], 0x1)
read(r0, 0x0, 0x0)


r0 = compat_30_socket(0x22, 0x3, 0x0)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x1, 0x1012, 0xffffffffffffffff, 0x0, 0x0)
compat_43_osend(r0, &(0x7f0000000040)="08200203", 0x358, 0x0)
recvfrom$inet6(r0, &(0x7f0000000080)=""/138, 0x8a, 0x0, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
setuid(0xee01)
compat_50_utimes(&(0x7f0000000040)='./file0\x00', 0x0)


faccessat(0xffffffffffffff9c, 0x0, 0x8, 0x0)


fcntl$lock(0xffffffffffffffff, 0x0, 0x0)
getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000340), 0x267e50366ae18406)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000940)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000600)=ANY=[@ANYBLOB, @ANYBLOB, @ANYRES8, @ANYRES32=0x0, @ANYRES32=0x0, @ANYRES32=0x0, @ANYRES8, @ANYRES32=0x0, @ANYRESDEC], 0x128}, 0x0)
getgroups(0x0, 0x0)
setsockopt(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x47)
setrlimit(0x3, &(0x7f0000000180)={0xb66c, 0x100000})


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80067409, &(0x7f00000001c0))


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104302, &(0x7f00000001c0)=0x20000002)


ptrace(0x1b, 0x0, 0x0, 0x0)


r0 = socket(0x10, 0x2, 0x0)
r1 = socket(0x18, 0x40000003, 0x0)
setsockopt$sock_timeval(r1, 0xffff, 0x0, 0x0, 0x0)
acct(0x0)
recvmmsg(r0, &(0x7f0000002440)={0x0}, 0x10, 0x40011121, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000000)='./file0\x00', 0xe015, 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(0x0, 0x0, 0x0, 0x0)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x1)
open(0x0, 0x0, 0x0)
r0 = compat_30_socket(0x22, 0x30000003, 0x0)
compat_43_osend(r0, &(0x7f0000000040)="00060409", 0x600, 0x0)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
lchown(0x0, 0x0, 0xffffffffffffffff)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000080)='./bus\x00', 0x2000, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__mount50(&(0x7f00000002c0)='coda\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r0 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
fdatasync(r0)


r0 = socket(0x1f, 0x5, 0x2)
recvmmsg(r0, &(0x7f0000001140)={0x0}, 0x10, 0x3, 0x0)


ptrace(0xf, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
mkdirat(0xffffffffffffff9c, &(0x7f0000001240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)
rename(&(0x7f0000000600)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000f40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000001140)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000240)='./file2\x00')
symlink(&(0x7f0000001340)='./file0\x00', &(0x7f0000001440)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f00000007c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000001040)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
symlink(&(0x7f00000001c0)='./file2\x00', &(0x7f0000001bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000140)='./file2\x00', &(0x7f0000001540)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


mkdir(&(0x7f0000000300)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
r0 = open$dir(&(0x7f0000000080)='.\x00', 0x0, 0x0)
faccessat(r0, &(0x7f00000000c0)='./file0\x00', 0x4, 0x0)


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
ktrace(&(0x7f00000000c0)='./file0\x00', 0x4, 0x4, 0xffffffffffffffff)
truncate(0x0, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
rename(0x0, 0x0)
r0 = socket(0x18, 0x3, 0x4)
setsockopt(r0, 0x1000000029, 0x1a, &(0x7f0000000000)="5afee7d8", 0x4)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc1c0526b, &(0x7f00000001c0))


__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
_lwp_detach(0x0)


mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0xc0e99db6de761f86, 0x0)
open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)


compat_50_setitimer(0x1, &(0x7f0000000000)={{}, {0x0, 0x613}}, 0x0)
__setitimer50(0x1, &(0x7f0000000140)={{}, {0x0, 0x3}}, 0x0)


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8004646d, &(0x7f0000000380))


socket(0x0, 0x0, 0x0)
socket$inet(0x2, 0x0, 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt$inet_opts(r0, 0x6, 0x3, &(0x7f0000000040), 0x4)


compat_43_osetrlimit(0x0, &(0x7f0000000080))
setrlimit(0x8, &(0x7f0000000100))


munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
mincore(&(0x7f0000ffc000/0x2000)=nil, 0x2000, &(0x7f00000001c0)=""/208)


mknod(&(0x7f0000000200)='./file0\x00', 0x2000, 0x0)
open$dir(&(0x7f0000000180)='./file0\x00', 0x190, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x10, 0x0)
r0 = open(&(0x7f0000000380)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000080)={0x0, 0x1, 0x0, 0x100000001})


mknodat(0xffffffffffffff9c, &(0x7f0000000100)='./file0\x00', 0x1000, 0x0)
open(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
ktrace(&(0x7f0000000140)='./file0\x00', 0x0, 0x0, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
munlockall()


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000300)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_40_mount(0x0, &(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='lfs\x00', &(0x7f00000000c0)='./file0\x00', 0x0, &(0x7f00000001c0))


sendmsg$unix(0xffffffffffffffff, &(0x7f0000001a00)={0x0, 0x0, 0x0, 0x0, 0x0, 0x18}, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
open(&(0x7f0000000080)='./file0\x00', 0x200, 0x0)
r2 = getpid()
ktrace(&(0x7f0000001d40)='./file0\x00', 0x0, 0x1720, r2)
sendmmsg(r1, &(0x7f0000000000)={0x0}, 0xfffffe32, 0x0, 0x0)
recvmsg(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f0000000080)=[{&(0x7f0000000180)=""/236}], 0x100000000000039d, 0x0, 0x28}, 0x0)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x1803)
openat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)


symlink(0x0, 0x0)
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
mknodat(0xffffffffffffff9c, &(0x7f00000000c0)='./file1/file2\x00', 0x0, 0x0)


open(&(0x7f0000000340)='./file0\x00', 0x300, 0x0)
__posix_rename(&(0x7f0000000000)='./file0\x00', &(0x7f0000000040)='./file0\x00')


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000300)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0xffffffffffffffff, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0x4af}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
__mount50(0x0, &(0x7f0000000200)='.\x00', 0xb0afbd006181d6de, &(0x7f0000000640), 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvmmsg(0xffffffffffffffff, &(0x7f0000000700)={&(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000680)=""/100, 0x64}, 0x3f8d}, 0x10, 0x0, 0x0)
setreuid(0x0, 0xee01)
r2 = fcntl$dupfd(r0, 0x0, r1)
recvmsg(r2, &(0x7f0000000540)={0x0, 0x0, &(0x7f0000000280)=[{&(0x7f0000000100)=""/152, 0x98}], 0x1, 0x0}, 0x0)
setuid(0x0)
sendmmsg(r1, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)


compat_50_clock_getres(0x3, 0x0)


setpriority(0x2, 0x1000, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', 0x0, 0x0, 0x0)
modctl$MODCTL_LOAD(0x0, &(0x7f0000000140)={&(0x7f0000000040), 0x0, 0x0})


_ksem_init(0x0, &(0x7f0000000000)=<r0=>0x50535244)
r1 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r1, 0x0, 0x200000000000c, &(0x7f0000000080)="eaef125c00000000", 0x8)
_ksem_trywait(r0)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
getsockopt$sock_cred(r0, 0xffff, 0x1022, 0x0, &(0x7f0000000140))
setregid(0x0, 0x0)
getgid()
getegid()
mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r1 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
writev(r1, &(0x7f0000000280)=[{0x0}], 0x1)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
_lwp_setname(0x0, &(0x7f0000001100)='*/-\x00')
_lwp_getname(0x0, &(0x7f0000001140)=""/4, 0x4)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_60__lwp_park(&(0x7f0000000140), 0x0, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
compat_50_select(0x40, &(0x7f0000000000)={0xfffffffb, 0x0, 0xfffffffffffffffd, 0x0, 0x0, 0xffffffffffffffff}, 0x0, 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0)=0x20000002)


setrlimit(0x0, &(0x7f0000000980))
setrlimit(0x8, &(0x7f00000000c0)={0x3668, 0x100000001})


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_clock_settime(0x0, &(0x7f0000000000))


r0 = socket(0x2, 0x3, 0x6)
sendto$inet(r0, &(0x7f00000023c0)="8ce2ad4d4f95e087a7846d3f81", 0xffffffffffffff4d, 0x0, &(0x7f0000002400)={0x2, 0x0}, 0x10)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setrlimit(0x8, &(0x7f0000000100))
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, 0x0)


ptrace(0x0, 0x0, 0x0, 0x0)
ptrace(0x0, 0x0, 0x0, 0x0)


compat_50_clock_settime(0x0, &(0x7f0000000340)={0xfffffffc})


pipe(&(0x7f0000000640)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = getpid()
fktrace(r1, 0x0, 0xf0709cfa615b9be3, r2)
compat_90_getvfsstat(0x0, 0x0, 0x0)
compat_43_osethostname(&(0x7f0000000480), 0x0)
fktrace(r0, 0x2, 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
mkdirat(0xffffffffffffff9c, &(0x7f0000000680)='./file0\x00', 0x0)
mkdir(0x0, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mkdir(&(0x7f0000000040)='./file2\x00', 0x0)
mkdir(&(0x7f0000000300)='./file2/file0\x00', 0x0)
rename(&(0x7f00000002c0)='./file2/file0\x00', &(0x7f0000000340)='./file0\x00')
compat_30_getdents(r0, 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
bind$inet6(r0, &(0x7f0000000000)={0x18, 0x0}, 0xc)


openat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
write(0xffffffffffffffff, &(0x7f0000000140)="4e8f8cdc90bf00ba12aaa92982b8b4b7630aa9bd5db9dab4749648139ce6d31e8c0219fddb943501650e4f653434f29e824bfaf628632cc6628d8cf30ae00a546ca3b4d1584dcb4de2c59dfa86a50eadde287b4643dc1052ab5d03c4cab84ff29acc3ec822", 0x65)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc048696d, &(0x7f0000000180)=0x8000000000000032)


msgsnd(0xffffffffffffffff, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mknodat(r0, &(0x7f0000000040)='./file0\x00', 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0xa, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
setsockopt(r0, 0x1000000000029, 0xb, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)


r0 = open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
compat_20_statfs(0x0, 0x0)
fstatat(r0, &(0x7f0000000300)='./file0\x00', 0x0, 0x0)


setreuid(0x0, 0xee01)
__fhstatvfs190(0x0, 0x0, 0x0, 0x0)


r0 = open(&(0x7f0000000480)='./file0\x00', 0x200, 0x0)
truncate(&(0x7f0000000440)='./file0\x00', 0x0, 0x3e2b649f)
fsync(r0)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000000)={0x4, 0x0, 0x0, 0x0, 0xffffffffffffffff})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
ioctl$FIOASYNC(r0, 0x80047401, &(0x7f0000000000))


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f0000000400)='./file0\x00', 0x2000, 0x287e)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
write(r0, &(0x7f0000000180)="1e", 0x1)


r0 = socket$inet6(0x18, 0x3, 0x0)
sendto$inet6(r0, &(0x7f0000000000)="85", 0x329, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x800, 0x44cfbbe6a88f6964)
r1 = openat(r0, &(0x7f0000000300)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x80, 0x38)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
pathconf(&(0x7f00000000c0)='./file0/file0\x00', 0x1)
readlinkat(r1, &(0x7f0000000080)='./file0\x00', &(0x7f0000000100)=""/206, 0xce)
mlock(&(0x7f00005fb000/0x4000)=nil, 0x4000)
mlock(&(0x7f0000000000/0x800000)=nil, 0x800000)
mknod(&(0x7f0000001200)='./file0\x00', 0x2000, 0x400)
r2 = open$dir(&(0x7f0000000000)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(r2, &(0x7f0000000080)=[{&(0x7f0000000100)="1f319c1fde2abc05119e9bdbed82410f2932fbd8845cf92b5b8ff03fab37e84f062a6661e620d93bf8e9bfdbd8850fa7aa8788b2bd66a10ccd45801e6147b7272a71be5f82227bb3d03acda3661252bbde1da4947d5032f057eb2fcfe99888b1e034125885a1770f221ca28972f352b2d04db1e8d522a260085a40c38e403025b4212719d15f6e1de8b9043ef294c1", 0xff82}], 0x1, 0x4af)
pipe(&(0x7f0000001400)={<r3=>0xffffffffffffffff})
fktrace(r3, 0x0, 0x2, 0x0)
__getdents30(r1, 0x0, 0x7)


r0 = socket(0x12, 0x2, 0x0)
setsockopt(r0, 0x0, 0x0, 0x0, 0x0)


symlink(&(0x7f0000000300)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000080)='mfs\x00', &(0x7f00000000c0)='./file0\x00', 0x0, &(0x7f0000000340)="e627e39a44142f9cb0cb23869a4e2f91412cb2170e808afb354c76800e537be4d7172b78187d30c4f7bb177920fcdc174078cef96576823a1c24588a72d822986a1dc9b5f9948c47d4c8925383985a4812d5ba4f464f1bc43bbb79cd421a8e4001ccdbc2674d91d9850fe1bc76dd4a44beeef1c01f7bbf316b97ecb81eba55b5d4d4cc60447fea0af9cdee52fd39f1")


setrlimit(0x8, &(0x7f0000000100))
__fhopen40(0x0, 0x0, 0x0)


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x1, 0x10, r0, 0x0, 0x0)
mknod(&(0x7f0000000240)='./file0\x00', 0x2000, 0x0)
rename(&(0x7f0000000340)='./file0\x00', &(0x7f0000000480)='./file0\x00')


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='ffs\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x57)
mknod(&(0x7f00000000c0)='./bus\x00', 0x0, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0xe015, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)


profil(0x0, 0x0, 0x0, 0x8001)
profil(0x0, 0x0, 0x0, 0x0)


write(0xffffffffffffffff, &(0x7f00000001c0)="39e4aff151", 0x5)
r0 = socket$inet6(0x18, 0x3, 0x0)
pathconf(&(0x7f00000000c0)='./file0\x00', 0x0)
sendto$inet6(r0, &(0x7f0000000000)='3', 0x329, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


ioctl$WSKBDIO_BELL(0xffffffffffffffff, 0x4080426f)
unlink(0x0)
mknod(&(0x7f0000000180)='./file0\x00', 0x2000, 0x202)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
__getfh30(0x0, 0x0, 0x0)
fcntl$setstatus(r0, 0x4, 0x20044)


r0 = socket(0x11, 0x3, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x1003, 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg$unix(r0, &(0x7f0000001700)={&(0x7f0000000200)=@file={0x0, './file0\x00'}, 0xa, 0x0}, 0x0)


r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x29, 0x14, &(0x7f0000000000)="02000000", 0x4)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
r1 = dup(r0)
faccessat(r1, &(0x7f0000000040)='./file0\x00', 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000200)='./file0\x00', 0xe)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x40000000000000, 0x300100000})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f00000000c0), 0x1c, 0x0}, 0x0)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg$unix(0xffffffffffffff9c, 0x0, 0x0)
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x10000000000002}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
close(r0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


r0 = _lwp_self()
_lwp_exit()
_lwp_wait(r0, 0x0)
_lwp_getname(r0, 0x0, 0x0)


__mount50(&(0x7f0000000000)='ntfs\x00', &(0x7f0000000080)='.\x00', 0x0, &(0x7f0000000540), 0x0)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0xc010447d, &(0x7f0000000040))


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc018647c, &(0x7f0000000380))


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
setreuid(0x0, 0xee01)
r0 = socket(0x2, 0x2, 0x0)
connect$unix(r0, &(0x7f0000000300)=@file={0x1}, 0x2)
r1 = semget$private(0x0, 0x7, 0x3c0)
semop(r1, &(0x7f0000000180)=[{0x0, 0x43, 0x1800}, {0x4, 0xe6, 0x1800}, {0x0, 0xfd, 0x1000}, {0x0, 0x20, 0x1800}, {0x2, 0x5, 0x1800}, {0x4, 0x9e, 0x1000}, {0x2, 0xfffb, 0x1000}, {0x0, 0x40, 0x1000}, {0x3, 0x8, 0x1000}], 0x9)
getsockopt$SO_PEERCRED(r0, 0xffff, 0x1022, &(0x7f0000000200), 0xc)
accept$unix(r0, &(0x7f0000000a40), &(0x7f0000000140)=0x65)
geteuid()
semctl$IPC_SET(0xffffffffffffffff, 0x0, 0x1, &(0x7f00000000c0))
r2 = semget$private(0x0, 0x7, 0x3c0)
semctl$SETALL(r2, 0x0, 0x9, &(0x7f00000002c0))
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0xfff, 0x0, 0x0, 0x0, 0x0, 0x180}, 0x0, 0x7})
semop(0x0, &(0x7f00000001c0)=[{0x2, 0x2100, 0x1800}, {0x4, 0x5, 0x400}, {0x1, 0x2006, 0x800}, {0x1, 0x266, 0x800}, {}], 0x2aaaace1)
____semctl50$IPC_STAT(r1, 0x0, 0x2, &(0x7f0000000080)=@buf=&(0x7f0000000040)={{0x0, 0x0, 0x8b, 0x2, 0xc0c, 0x200, 0xb86}, 0xf800, 0x0, 0x0, 0x0})


r0 = socket(0x2, 0x3, 0x100000001)
bind$inet(r0, &(0x7f0000000080)={0x2, 0x0}, 0x10)
connect$inet(r0, &(0x7f00000000c0)={0x2, 0x0}, 0x10)
sendto$inet(r0, &(0x7f0000000000)="832e", 0x2, 0x8004, 0x0, 0x0)
sendto$inet(r0, &(0x7f00000001c0)="965033", 0x3, 0x0, 0x0, 0x0)


compat_50_____semctl13$GETALL(0x0, 0x0, 0x6, 0xfffffffffffffffe)


r0 = getppid()
r1 = getsid(0x0)
ptrace(0x9, r1, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x40000002c, r0, 0x0, 0x7ff)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(0x0, 0x0, 0xe03)
setreuid(0xffffffffffffffff, 0xee00)
compat_50_quotactl(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, 0x0)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21abfb, &(0x7f0000000140), 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)


__mount50(&(0x7f0000000080)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod$loop(&(0x7f0000000000)='./file0\x00', 0x2000, 0x1)
pathconf(&(0x7f0000000100)='./file0\x00', 0x6)


writev(0xffffffffffffffff, &(0x7f0000000240)=[{&(0x7f0000000140)="10", 0x3}], 0x1)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))


modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={&(0x7f0000000240), 0x0, &(0x7f00000000c0)="3c5ffe26b51ceab496c8f0677a2c27bd63e23ddcc2870779da41ad20", 0x1c})


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000005c0)='.\x00', 0x4, &(0x7f00000002c0)="01")
unlink(0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x7)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


__mount50(&(0x7f0000000000)='nfs\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f00000002c0), 0x0)


r0 = compat_30_socket(0x22, 0x3, 0x0)
recvmmsg(r0, &(0x7f0000000340)={&(0x7f00000002c0)={0x0, 0x0, 0x0, 0x0, 0x0}, 0xfffffffd}, 0x10, 0x0, 0x0)
r1 = socket(0x18, 0x2, 0x0)
setuid(0xee01)
setsockopt(r1, 0x0, 0x0, &(0x7f0000000000)="02000000", 0x4)
open(0x0, 0x0, 0x0)
mkdir(0x0, 0x0)
r2 = semget$private(0x0, 0x1, 0x0)
semctl$SETVAL(r2, 0x2, 0x8, &(0x7f00000001c0)=0x8000)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0xc004667a, &(0x7f0000000180)={0x0, 0x0})


mknod(&(0x7f0000000040)='./bus\x00', 0x100000000205f, 0x2802)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff})
ioctl$FIOSEEKHOLE(r0, 0x8090697f, &(0x7f0000000000))
open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)


paccept(0xffffffffffffffff, 0x0, 0x0, 0x10000000)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000000)='./file0\x00')
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)='@', 0x1)
rmdir(&(0x7f0000000040)='./control\x00')


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvfrom$unix(r0, &(0x7f00000000c0), 0x832f1f7d, 0x0, &(0x7f0000000000)=@abs, 0x2000c600)
shutdown(r1, 0x2)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x40001404, r0)
setreuid(0x0, 0xee01)
setrlimit(0xb, &(0x7f0000000240)={0x0, 0xffffffff})


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
fdatasync(r0)


open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
writev(0xffffffffffffffff, 0x0, 0x0)
swapctl$SWAP_ON(0x7, &(0x7f0000000000), 0x0)


setrlimit(0x8, &(0x7f00000000c0)={0x3668, 0x100000001})


undelete(0x0)
setsockopt$sock_timeval(0xffffffffffffffff, 0xffff, 0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x6, &(0x7f0000000040)="8b589d9d", 0x4)


__nanosleep50(&(0x7f0000000100)={0x0, 0x9}, &(0x7f0000000140))


ptrace(0x8, 0x0, 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
__futimes50(r0, 0x0)


munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000374000/0x3000)=nil)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket(0x18, 0x1, 0x0)
setsockopt(r1, 0x0, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
shmat(r0, &(0x7f0000001000/0x3000)=nil, 0x0)
mprotect(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x1)


r0 = socket(0x1d, 0x40000003, 0x0)
getsockname$inet(r0, 0x0, 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
ioctl$FIOSETOWN(r0, 0x40046678, &(0x7f0000000000))


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
getsockopt$sock_int(r0, 0xffff, 0x100, 0x0, 0x0)


getppid()
r0 = msgget$private(0x0, 0x10e)
r1 = msgget$private(0x0, 0x100)
msgrcv(r1, &(0x7f0000000c00), 0xc0, 0x0, 0x1800)
msgrcv(r1, 0x0, 0x0, 0x1, 0x1000)
msgctl$IPC_STAT(r1, 0x2, &(0x7f0000000040)=""/56)
msgrcv(r1, &(0x7f0000000e00), 0xd4, 0x0, 0x800)
getsockopt$SO_PEERCRED(0xffffffffffffff9c, 0xffff, 0x1022, &(0x7f0000000080)={<r2=>0x0, <r3=>0x0}, 0xc)
r4 = getgid()
r5 = getgid()
setregid(r4, r5)
msgctl$IPC_RMID(r1, 0x0)
r6 = getpgid(0xffffffffffffffff)
msgctl$IPC_SET(r1, 0x1, &(0x7f0000000140)={{0x20009c1, r3, r4, 0x0, r5, 0x40, 0x8000}, 0x3ff, 0x3, r2, r6, 0x1f8, 0x6, 0x7e, 0x3cb})
getpgrp()
getpgid(0x0)
msgctl$IPC_SET(r0, 0x1, 0x0)
r7 = socket(0x0, 0x0, 0x0)
getsockopt$sock_cred(r7, 0xffff, 0x1022, &(0x7f0000001300)={0x0, <r8=>0x0}, &(0x7f0000001340)=0xc)
socket$inet(0x2, 0x0, 0x0)
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0x20007, r8, r5, r3, 0xffffffffffffffff, 0x101}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x4})
r9 = socket(0x18, 0x1, 0x0)
r10 = socket(0x18, 0x1, 0x0)
setsockopt(r10, 0x1000000029, 0x36, &(0x7f0000000040)="03000000", 0x4)
dup2(r10, r9)
ioctl$FIONBIO(r10, 0x8004667e, &(0x7f0000000080)=0x8001)
setsockopt$sock_int(r9, 0xffff, 0x1, &(0x7f0000000000)=0x3, 0x4)
connect$unix(r9, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)


r0 = socket(0x18, 0x2, 0x0)
readv(0xffffffffffffffff, &(0x7f0000000100)=[{&(0x7f0000000040)=""/62, 0x3e}], 0x1)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r0, &(0x7f00000003c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB=' \x00\x00\x00)'], 0x3e}, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x5, &(0x7f0000000a00)="8b589d9d", 0x4)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000000)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f00000003c0)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
rename(0x0, 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xb0afbd006181d6de, &(0x7f0000000540), 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(0xffffffffffffffff, 0x0, 0x0, 0x0)
ioctl$FIONWRITE(r0, 0x40046679, &(0x7f0000000040))


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(&(0x7f0000000080)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod$loop(&(0x7f0000000000)='./file0\x00', 0x2000, 0x1)
link(0x0, 0x0)
unlink(&(0x7f00000003c0)='./file0\x00')


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
rename(&(0x7f00000003c0)='./file0\x00', &(0x7f0000000380)='./file1/file0\x00')
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
rename(&(0x7f00000003c0)='./file0\x00', &(0x7f0000000380)='./file1/file0\x00')


open(&(0x7f0000000340)='./file0\x00', 0x300, 0x0)
msgget(0x3, 0x0)
write(0xffffffffffffffff, 0x0, 0x0)
symlink(0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000280), 0x0)
r0 = open$dir(&(0x7f0000000280)='./file0\x00', 0x0, 0x0)
compat_50_futimes(r0, 0x0)


r0 = shmget$private(0x0, 0x12000, 0x0, &(0x7f00002b9000/0x12000)=nil)
shmctl$SHM_LOCK(r0, 0x3)
compat_50___shmctl13$IPC_RMID(r0, 0x3)


mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
chmod(&(0x7f0000000280)='./file0\x00', 0x3a)
chdir(&(0x7f0000000240)='./file0\x00')
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000540)={<r0=>0xffffffffffffffff})
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f0000000140)={0x0, 0x0, <r1=>0x0}, &(0x7f0000001a00)=0xc)
chown(&(0x7f0000000040)='./file1\x00', 0x0, r1)
chmod(&(0x7f00000000c0)='./file1\x00', 0x13)
setreuid(0x0, 0xee01)
mkdir(&(0x7f0000000100)='./file0\x00', 0x184)
rename(&(0x7f00000018c0)='./file0\x00', &(0x7f0000001900)='./file1\x00')


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
chroot(&(0x7f0000000180)='./file0\x00')


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open(&(0x7f0000000040)='./file0\x00', 0xf8e, 0x0)
mknod(0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
pathconf(&(0x7f0000000180)='./file0\x00', 0x5)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0xa, r0, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__fhopen40(0x0, 0x0, 0x0)
r0 = socket$unix(0x1, 0x5, 0x0)
getsockopt$sock_linger(r0, 0xffff, 0x4, 0x0, 0x0)


mlock(&(0x7f0000ffc000/0x4000)=nil, 0x4000)
setsockopt(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
getuid()
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000540)={<r0=>0xffffffffffffffff})
getsockopt$sock_cred(r0, 0xffff, 0x1022, 0x0, 0x0)
sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
mincore(&(0x7f0000ffd000/0x3000)=nil, 0x3000, &(0x7f0000000200)=""/181)


readv(0xffffffffffffffff, &(0x7f0000000100)=[{&(0x7f00000002c0)=""/51, 0x33}], 0x1)
sendmsg(0xffffffffffffffff, &(0x7f00000003c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="1800000029"], 0x3e}, 0x0)
r0 = socket(0x18, 0x2, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="10"], 0x10}, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000300)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_40_mount(&(0x7f0000000140)='lfs\x00', &(0x7f00000000c0)='./file0\x00', 0x0, &(0x7f00000001c0))


ioctl$WSMUXIO_REMOVE_DEVICE(0xffffffffffffffff, 0x80085762, &(0x7f0000000040)={0x1})
r0 = socket(0x18, 0x3, 0x3a)
setsockopt(r0, 0x29, 0x6c, &(0x7f0000000040), 0x4)
setsockopt$inet6_MRT6_ADD_MIF(r0, 0x29, 0x65, 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
open(&(0x7f0000001700)='./file0\x00', 0x70e, 0x0)
pathconf(&(0x7f0000000300)='./file0\x00', 0xa)


socketpair(0x18, 0x3, 0x3c, 0x0)


compat_50_setitimer(0x0, &(0x7f0000001800)={{}, {0x1}}, 0x0)
compat_50_getitimer(0x3, &(0x7f0000000140))


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80047410, &(0x7f0000000180))


open(&(0x7f0000000100)='./file0\x00', 0x10310, 0x0)
compat_50_quotactl(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, 0x0)


r0 = socket$inet6(0x18, 0x3, 0x0)
poll(&(0x7f0000000400)=[{r0, 0x80}, {r0, 0x80}], 0x2, 0x7)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000140)='./bus\x00', 0x0, 0x0)
compat_50_select(0x40, &(0x7f0000000000)={0xffffffff}, 0x0, 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0)=0x20000002)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(0x0, 0x0, 0x0)
r0 = socket(0x1, 0x1, 0x0)
getsockopt$inet_opts(r0, 0x0, 0x0, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='kernfs\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
fcntl$setown(r0, 0xf, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x40000802)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x2, 0x0)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f0000000440)="0210990a", 0x4}], 0x1, 0x0)


syz_emit_ethernet(0xe, &(0x7f0000000000))
r0 = open(&(0x7f0000000080)='./file0\x00', 0x70e, 0x0)
mmap(&(0x7f000000f000/0x2000)=nil, 0x2000, 0x4, 0x10, r0, 0x0, 0x0)
semget$private(0x0, 0x0, 0x0)
mmap(&(0x7f0000fff000/0x1000)=nil, 0xfffffffffffffff7, 0x0, 0x10, 0xffffffffffffffff, 0x0, 0x0)
semctl$GETZCNT(0x0, 0x0, 0x7, 0x0)
fsync(0xffffffffffffffff)
mlock(&(0x7f000000f000/0x4000)=nil, 0x4000)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
getppid()
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')
__mount50(&(0x7f0000000440)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
compat_40_mount(0x0, &(0x7f0000000140)='./file0\x00', 0x0, 0x0)


_lwp_create(&(0x7f0000000d00)={0x4, 0x0, {}, {}, {0x0, 0x0, '-{*\x00'}}, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4301)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0x80187701, &(0x7f0000000000)={0x0, 0x0})


mkdirat(0xffffffffffffffff, &(0x7f0000000100)='./bus\x00', 0x0)
writev(0xffffffffffffffff, &(0x7f00000002c0)=[{&(0x7f0000000080)="76e5eac907f9ccf7a251ceddcec7d6aa45cffe2c63a56077123a276d3ba4e9d17eb3eb5db12a3783a8e0620d357de1fe04fa9465b5bd1286e9624dec06a00c222f", 0x41}], 0x1)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
symlink(&(0x7f00000000c0)='./file0\x00', &(0x7f0000000100)='.\x00')
truncate(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0)={<r0=>0xffffffffffffffff})
recvfrom$unix(r0, &(0x7f00000000c0), 0xffffffffffffff9d, 0x0, &(0x7f0000000000)=@abs, 0x8)


poll(0xffffffffffffffff, 0xffffffff000004e8, 0x0)


r0 = semget$private(0x0, 0x2, 0x0)
semctl$GETALL(r0, 0x0, 0x6, &(0x7f0000000000)=""/4096)


setuid(0xee01)
compat_30_fhstat(0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000180)='kernfs\x00', &(0x7f0000000380)='./file0\x00', 0x0, 0x0, 0x0)
unmount(&(0x7f0000000400)='./file0\x00', 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
lchmod(&(0x7f0000000000)='./file0\x00', 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
listen(r0, 0x0)


mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_40_mount(0x0, &(0x7f0000000140)='./file0\x00', 0x400000, &(0x7f0000000000))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
pathconf(&(0x7f0000000200)='./file0\x00', 0xe)


open(&(0x7f0000000180)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80104305, &(0x7f00000001c0))


_ksem_close(0x50535244)


sendmsg$unix(0xffffffffffffffff, &(0x7f0000000640)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000080)=ANY=[@ANYBLOB="18000000ff"], 0x18}, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt$sock_timeval(r0, 0xffff, 0x1006, &(0x7f0000000080), 0x10)


compat_43_osetrlimit(0x9, &(0x7f0000000080))
socket(0x18, 0x3, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={0x0, 0xdffffffffffff7ff}})
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104302, &(0x7f00000001c0))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
chroot(&(0x7f0000000000)='.\x00')
getsid(0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
rmdir(&(0x7f0000000180)='./file0/../file0\x00')


mknod(&(0x7f0000000200)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mknodat(r0, &(0x7f00000009c0)='./file0\x00', 0x1000, 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
chflags(&(0x7f0000000180)='./file0/file0\x00', 0x40001)
r1 = open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
__futimes50(r1, 0x0)
r2 = socket(0x1f, 0x40000003, 0x0)
modctl$MODCTL_STAT(0x4, 0x0)
setsockopt$sock_timeval(r2, 0xffff, 0x100b, &(0x7f0000000200), 0x10)


mknod(&(0x7f0000000080)='./file0\x00', 0x2000, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f00000000c0)=[{&(0x7f0000000280)='#', 0x1}, {&(0x7f0000000000)="8d", 0x1}], 0x64)


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x4ebfac6bbaf7949)
__posix_chown(&(0x7f0000000880)='./file0\x00', 0x0, 0x0)


r0 = socket(0x1, 0x1, 0x0)
close(r0)
fcntl$lock(r0, 0x0, 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x144, r0)
_ksem_unlink(&(0x7f0000000300))


compat_40_mount(&(0x7f0000000080)='ffs\x00', 0x0, 0x0, 0x0)
modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000080))


pipe(0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000600)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
ioctl$WSKBDIO_GTYPE(0xffffffffffffffff, 0x40045700, 0x0)
fcntl$dupfd(r0, 0xc, 0xffffffffffffffff)


compat_40_mount(0x0, 0x0, 0x0, 0x0)
compat_90_statvfs1(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)


__clock_gettime50(0x20000001, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
pathconf(&(0x7f0000000200)='./file0\x00', 0x2)


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000280)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x100000004})
flock(r0, 0x2)
close(r0)
setsockopt(r0, 0x0, 0x0, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0xc004667a, &(0x7f0000000180)={0xfcffffff, 0x0})


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x2000427e, 0x0)
r1 = fcntl$dupfd(r0, 0x0, r0)
ioctl$FIOASYNC(r1, 0x40044271, &(0x7f0000000000))


setreuid(0x0, 0xee01)
compat_30_fhopen(0x0, 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x1, &(0x7f0000000040)="6228951c7f850b317c2795c973d086f0b5976af4357c2b84254b38b4984076233bc439188634b4e6", 0x28)
r1 = dup2(r0, r0)
setsockopt$inet_opts(r1, 0x0, 0x1, &(0x7f0000000900)='P', 0x1)


setrlimit(0x4, &(0x7f00000010c0))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
undelete(&(0x7f00000000c0)='./bus\x00')


r0 = socket(0x1f, 0x5, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x0, 0x0, 0x0)
ioctl$FIOASYNC(0xffffffffffffffff, 0xc0104304, 0x0)
compat_50_clock_gettime(0x20000000, &(0x7f00000003c0))


pipe(&(0x7f0000000140)={<r0=>0xffffffffffffffff})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = getpid()
fktrace(r0, 0x0, 0x62e2dd08f149ff1b, r1)
compat_50_setitimer(0x0, 0x0, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
mknod(0x0, 0x0, 0x4f4b)
__getrusage50(0x0, &(0x7f0000000000))


_ksem_open(&(0x7f0000000140)="cbfd50b9d558d8434b5a8cd90f3678e3f991305dd036e805107d0ea139225e64b82b89af8b886bae9a0a694f3bd2c843e4f8ba39445d0961e4eae949a5d9e01e95482973a10176bf8bac056bf8732696e5b4abbdcc2ceb27f9aba00fbf7b91f29b856f9f95cc39a13bb36870804e97a419f64ece661fe59cac434b6faa26437e259e9b465f2318", 0x0, 0x0, 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0))


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_40_mount(0x0, &(0x7f00000002c0)='./file0\x00', 0x0, 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x2, 0x2, 0x0, 0x0, 0xffffffffffffffff})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x0}, 0x10)


__mount50(&(0x7f0000000080)='lfs\x00', &(0x7f0000000000)='.\x00', 0x0, &(0x7f00000000c0), 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_43_stat43(&(0x7f0000000300)='./file0/../file0\x00', &(0x7f0000000340))


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_30___lstat13(&(0x7f0000000040)='./file0\x00', 0x0)


r0 = socket(0x10, 0x2, 0x0)
setreuid(0xee00, 0x0)
r1 = getuid()
setuid(r1)
ioctl$FIOSEEKHOLE(r0, 0x8090690c, &(0x7f0000000180)=0x8000000000000032)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
pathconf(&(0x7f0000000480)='./file0\x00', 0x9)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000029, 0x23, &(0x7f0000000080)="b6", 0x1)


mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000880)=[{0x0, 0x1000000}], 0x1)


compat_90_fstatvfs1(0xffffffffffffffff, 0x0, 0x0)
fchmodat(0xffffffffffffffff, 0x0, 0x0, 0x0)
open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
__getrusage50(0x748c23bba96db813, 0x0)


writev(0xffffffffffffffff, &(0x7f0000000240)=[{&(0x7f0000000140)}], 0x1)
socket$inet(0x2, 0x0, 0x1)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x100000000000000, 0xffffffffffffffff})
r0 = socket(0x18, 0x1, 0x0)
close(r0)
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x200}})
r1 = socket(0x18, 0x3, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r1, &(0x7f00000000c0), &(0x7f0000000000)=0xffffffffffffff35)
r2 = socket(0x18, 0x2, 0x0)
r3 = socket(0x18, 0x3, 0x0)
connect$unix(r2, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
r4 = dup2(r2, r3)
setsockopt(r4, 0x1000000029, 0x23, 0x0, 0x0)
sendmsg(r3, &(0x7f0000000e00)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x0)
socket(0x18, 0x2, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


modctl$MODCTL_UNLOAD(0x2, 0x0)
modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000180))


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open(&(0x7f0000000040)='./file0\x00', 0xf8e, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r0 = open$dir(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0xc004667a, &(0x7f0000000140))


r0 = socket$inet(0x2, 0x2, 0x0)
recvmmsg(r0, 0x0, 0x0, 0x0, 0x0)


mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
ioctl$WSDISPLAYIO_ADDSCREEN(r0, 0x8018574e, &(0x7f00000001c0)={0x0, 0x0, 0x0})


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x2000, &(0x7f0000000340), 0x57)


modctl$MODCTL_LOAD(0x3, 0x0)


mmap(&(0x7f0000000000/0xfbe000)=nil, 0xfbe000, 0x2, 0x31, 0xffffffffffffffff, 0x0, 0x0)
openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
madvise(&(0x7f0000000000/0x600000)=nil, 0x600003, 0x3)


_ksem_init(0x0, 0x0)
_ksem_post(0x0)
_ksem_open(0x0, 0x0, 0x0, 0x0, 0x0)
pipe2(&(0x7f0000001840)={<r0=>0xffffffffffffffff}, 0x0)
compat_30___fstat13(r0, &(0x7f0000001900))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000080)='./file0\x00', 0x3)


compat_40_mount(0x0, &(0x7f00000000c0)='.\x00', 0x0, 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', 0x0, 0x0, 0x0, 0x49)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_40_mount(&(0x7f0000000000)='ntfs\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f0000000180))
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
r1 = dup(r0)
__getdents30(r1, 0x0, 0xa9d9)


poll(0xffffffffffffffff, 0x0, 0x0)


mlock(&(0x7f0000ffc000/0x2000)=nil, 0x2000)
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
r0 = getpid()
r1 = socket(0x2, 0x3, 0x0)
getsockopt(r1, 0x0, 0x67, 0x0, 0x0)
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x144, r0)
r2 = getppid()
ktrace(&(0x7f0000000040)='./file0\x00', 0x4, 0x20001410, r2)
socket(0x0, 0x0, 0x0)
connect$unix(0xffffffffffffffff, 0x0, 0x0)
getsockname$unix(0xffffffffffffffff, 0x0, 0x0)
r3 = socket(0x0, 0x0, 0x0)
bind(r3, &(0x7f00000000c0), 0x10)
dup(r3)
listen(0xffffffffffffffff, 0x0)
r4 = socket(0x0, 0x1, 0x0)
connect$unix(r4, &(0x7f0000000000), 0x10)
r5 = fcntl$dupfd(0xffffffffffffffff, 0x2, 0xffffffffffffffff)
close(r5)
setsockopt$sock_int(r5, 0xffff, 0x100, &(0x7f0000000140)=0x401, 0x4)
sendmsg$unix(r4, &(0x7f0000000c00)={0x0, 0x0, &(0x7f0000000040)=[{&(0x7f0000000740)="fe7f9a0e114c76e869455e5246d0b56f404c023f7137734703d8", 0x1a}, {&(0x7f0000000800)="3adcdd4a52eba1c8f8cade36bf2b05ea3f4fa5d762686994fa8307e0ce6b6c5d7562fc0e21c89ef827471fc93c0d2c6b03fe826c1a54e4193534", 0x3a}, {&(0x7f0000000180)="1186d6a1a165befebbb57ab4394138ad17f7f939b9ce31126ab7ee0e36cdcb4d2be37b3b9ee77f7421faeb37bbf1d43b9f7800715a67a64bdac225bf76", 0x3d}], 0x3}, 0x0)
recvmsg(r5, &(0x7f00000005c0)={0x0, 0x0, &(0x7f0000000500)=[{0x0}], 0x1, 0x0}, 0x0)
pwritev(0xffffffffffffffff, 0x0, 0x0, 0x40)
r6 = socket(0x11, 0x3, 0x0)
sendto$unix(r6, &(0x7f0000000000)="b10005136000009f05003e0800000000331c13fecea10500fef96ecfc72fd3357a068d02bc31a3a5673039d2d236acf20b7804be38164991f7cccf5f882b297be1aa5b23edeb51e2f0ac3ebbc257699a1f139b672f4d335c223e7d026ba8faff0037282112000000720fd38bfbb770c1f5a872c881e2772ec5a10400000000000000361b1257aea8c5d0002012000000000000880d6633c556ae9b287948a62310db415f779642cdcd71a3f8343712051e", 0xb1, 0x0, 0x0, 0x0)
setrlimit(0x0, &(0x7f0000000000)={0x0, 0x7})
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', 0x0)
mknod(&(0x7f0000000280)='./file0\x00', 0x1ffa, 0x0)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
ioctl$FIONREAD(r0, 0x8020699d, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000200)='ptyfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000a80))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
pathconf(&(0x7f0000000080)='./file0\x00', 0x2)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x1})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x2a, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)


shmget(0x2, 0x3000, 0x200, &(0x7f0000ffc000/0x3000)=nil)
shmget(0x2, 0x2000, 0x57403f0476536eb5, &(0x7f0000ffe000/0x2000)=nil)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
mkdir(&(0x7f00000002c0)='./bus\x00', 0x0)
compat_50_wait4(0x0, &(0x7f00000002c0), 0x0, &(0x7f0000000300))


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)='@', 0x1)
open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
unmount(&(0x7f0000000000)='./file0\x00', 0x0)


r0 = socket$inet6(0x18, 0x3, 0x0)
setsockopt$sock_timeval(r0, 0xffff, 0x100c, &(0x7f0000000000)={0x0, 0xfffffffffffffff9}, 0x10)


pipe(&(0x7f0000001400))
getpid()
rasctl(0x0, 0x0, 0x8)
socket$inet(0x2, 0x0, 0x0)
listen(0xffffffffffffffff, 0x0)
getsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x1007, 0x0, 0x0)
getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000240)={<r0=>0x0}, 0xc)
r1 = geteuid()
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000004c0)={{0x29a4, r1}, 0x3, 0x0, r0, 0x0, 0xb, 0x9, 0x2})
r2 = msgget$private(0x0, 0xfffffffffffffffd)
msgrcv(r2, &(0x7f0000000940), 0x8, 0x2, 0x1000)
getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000140), 0xc)
msgsnd(r2, &(0x7f0000000180)=ANY=[@ANYBLOB="0300000000000000a486714b3b6964c6220190d7f39c044dac99fec5afca3ec3e155903698d635e2ab348195cce43ab9e134935e4edf5efe4e5ec4bec02d51201f93b9860f69d58fca21e1f36041df344b049af8bf321177b2fdc7cc2725691dc000"/110], 0x6e, 0x0)
msgrcv(r2, &(0x7f0000000380), 0x8, 0x3, 0x1800)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
link(&(0x7f0000001f40)='./file0\x00', &(0x7f0000002040)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


r0 = __clone(0x0, 0x0)
__wait450(r0, 0x0, 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
socket(0x0, 0x0, 0x0)
paccept(r0, 0x0, 0x0, 0x0)


open$dir(0x0, 0x0, 0x0)
r0 = socket(0x18, 0x1, 0x0)
r1 = socket(0x18, 0x1, 0x0)
setsockopt(r1, 0x6, 0x2, &(0x7f0000000140)="03000000", 0x4)
dup2(r1, r0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)


symlink(&(0x7f0000000100)='./bus/\x00', &(0x7f0000000140)='./bus\x00')
open(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$sock_timeval(r0, 0xffff, 0x4000, 0x0, 0x0)


pipe(&(0x7f0000000a40)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0x0, r1)
r2 = shmget$private(0x0, 0x2000, 0x0, &(0x7f0000ffe000/0x2000)=nil)
shmctl$IPC_STAT(r2, 0x2, &(0x7f0000000580)=""/4)


open(&(0x7f0000000500)='./file0\x00', 0x70e, 0x0)
writev(0xffffffffffffffff, 0x0, 0x0)
compat_30_getfh(0x0, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x0, 0x10, r0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x0)


readlink(&(0x7f0000000300)='./file0\x00', 0x0, 0x0)
__setitimer50(0x0, &(0x7f0000000000)={{}, {0x2}}, 0x0)
compat_60__lwp_park(&(0x7f0000000000)={0x8000000000000000}, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd5e2eaa3d713048e69931929648", 0x14)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
open$dir(&(0x7f0000000040)='./file0\x00', 0x20, 0x0)


r0 = msgget(0x2, 0x0)
shmctl$SHM_LOCK(r0, 0xb)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
lchown(0x0, 0x0, 0xffffffffffffffff)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
__mount50(&(0x7f0000000440)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
r0 = socket$inet(0x2, 0x2, 0x0)
mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
open(&(0x7f0000000100)='./file0\x00', 0x615, 0x0)
fcntl$lock(r0, 0xa, 0x0)


r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
truncate(&(0x7f0000000200)='./file0\x00', 0x0, 0x7fff)
_lwp_self()
_lwp_wakeup(0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x3, 0x10, r0, 0x0, 0x0)
accept(0xffffffffffffffff, 0x0, &(0x7f00000000c0))
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x6)


ioctl$FIOSEEKHOLE(0xffffffffffffffff, 0x8018698d, &(0x7f0000000180)=0x8000000000000031)
poll(0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
r0 = socket(0x18, 0x1, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180))


setrlimit(0x0, &(0x7f00000000c0))
setrlimit(0xb, &(0x7f0000000080))


r0 = socket(0x1, 0x2, 0x0)
ioctl$FIONREAD(r0, 0xc0106914, &(0x7f0000000080))
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
chdir(&(0x7f0000000100)='./file0\x00')


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x19, 0x0, 0x0)


r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
compat_30_getdents(r0, 0x0, 0x0)


madvise(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
mlock(&(0x7f0000ffa000/0x4000)=nil, 0x4000)
munlock(&(0x7f0000ffb000/0x3000)=nil, 0x3000)


__setitimer50(0x0, &(0x7f0000000080), &(0x7f0000000140))


open(&(0x7f0000000180)='./file0\x00', 0x75f493fec6515f78, 0x0)
__lutimes50(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})


symlink(&(0x7f0000000000)='./file0\x00', &(0x7f0000000100)='./file0\x00')
symlink(0x0, 0x0)
execve(&(0x7f00000004c0)='./file0/file0\x00', 0x0, 0x0)


open(&(0x7f0000000140)='./file0\x00', 0x78e, 0x0)
compat_90_statvfs1(&(0x7f0000000040)='./file0\x00', &(0x7f0000000400), 0x3)


socket(0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
lchown(&(0x7f0000000100)='./file0/file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
close(0x4)
modctl$MODCTL_UNLOAD(0x2, 0x0)
unlink(&(0x7f0000000000)='./file0\x00')
r0 = open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x3, 0x10, r0, 0x0, 0x0)
open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)


mknod(&(0x7f0000000080)='./file0\x00', 0x2000, 0x5200)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
poll(&(0x7f0000000040)=[{r0, 0x40}], 0x1, 0x0)


r0 = socket(0x1, 0x2, 0x0)
ioctl$FIONREAD(r0, 0xc0106914, &(0x7f0000000080))
open(0x0, 0x0, 0x0)
__select50(0x0, 0x0, &(0x7f00000000c0)={0x9}, 0x0, 0x0)
socketpair(0x1f, 0x5, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
r1 = openat(r0, &(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOGETBMAP(r1, 0xc008667a, &(0x7f00000001c0)=0x200000000000)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
ioctl$FIOASYNC(r0, 0x8004667d, &(0x7f0000002d00)=0x9)
r1 = getpid()
fcntl$setown(r0, 0x6, r1)
shutdown(r0, 0x1)


r0 = socket(0x11, 0x3, 0x0)
recvmmsg(r0, &(0x7f0000000440)={&(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000580)=[{0x0}, {0x0}, {&(0x7f0000000240)=""/90, 0x5a}], 0x3, 0x0}}, 0x10, 0x0, 0x0)
execve(0x0, 0x0, 0x0)
shutdown(0xffffffffffffffff, 0x0)
sendto$unix(r0, 0x0, 0x0, 0x0, &(0x7f0000000100)=@file={0x0, './file0\x00'}, 0x37)


madvise(&(0x7f0000ffa000/0x4000)=nil, 0x4000, 0x0)
mlock(&(0x7f0000ffb000/0x1000)=nil, 0x1000)
mmap(&(0x7f0000ff9000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x4ebfac6bbaf7949)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000280)=[{&(0x7f0000000000)='#!', 0x2}], 0x1)
writev(r0, &(0x7f00000002c0)=[{&(0x7f0000000340)="200028b70f11e20902a317db14bf16df41fbf10e4c48f890bb925329f2b8481b55749075b2e012ee115ca9c10cbee72c801a384d188a10c56365228efe0fdf245adef39775b4da7c6a324c224b938743ddbb9340990cb7f3619208b103f202faf951a0bc67e21738e741589f262141d357790a", 0x73}], 0x1)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)


socket$inet(0x2, 0x0, 0x0)
setreuid(0xee00, 0x0)
getuid()
setreuid(0x0, 0x0)
socket(0x0, 0x2, 0x0)
socket(0x18, 0x1, 0x0)
setsockopt(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
dup2(0xffffffffffffffff, 0xffffffffffffffff)
r0 = semget$private(0x0, 0x7, 0x3c0)
semop(r0, &(0x7f0000000180)=[{}, {0x0, 0x9}, {0x2, 0x5, 0x1800}, {0x4, 0x9e, 0x1000}], 0x4)
semctl$SETALL(r0, 0x0, 0x9, &(0x7f00000004c0)=[0x0])


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x14, r0, 0x0, 0x88)


fork()
r0 = getpid()
__wait450(r0, 0x0, 0x0, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
_ksem_destroy(0x0)
getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, 0x0)
rasctl(&(0x7f0000000240)="a06d37e9bce81527b2270943159274251cf0a0fc92bc9464f9738b09f5dc4689a13b50d7039f3b3830650a5db64716cede7618dc5d2841f37c00ba0bd2667db5bd3893e480cb62c686e5bca46964a668fa4e86fc03afbc268284486bfec2a029361885291c7a1e06b7673376c6612eaf83149664772b6f19c81b9b850a097cd7aad0b25c1b0c03bd90101184f289227d", 0xffff, 0x2)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, 0x0)


compat_40_mount(&(0x7f0000000080)='ext2fs\x00', &(0x7f00000005c0)='.\x00', 0x0, &(0x7f00000002c0)="01")


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
__getfh30(&(0x7f0000000300)='./file0\x00', 0x0, 0x0)


mmap(&(0x7f0000001000/0x2000)=nil, 0x2000, 0x0, 0x46b610ad5bca5b31, 0xffffffffffffffff, 0x0, 0x0)


mknod(&(0x7f00000000c0)='.\x00', 0x1000, 0x0)


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000340)='./file0\x00', 0x0, 0xf, r0)
open(0x0, 0x0, 0x0)
_lwp_getname(0x0, 0x0, 0x0)


r0 = socket(0x18, 0x2, 0x0)
r1 = socket(0x18, 0x2, 0x0)
r2 = dup2(r0, r1)
compat_43_ogetsockname(r2, &(0x7f0000000000)=""/2, &(0x7f0000000040)=0x2)


ioctl$FIOASYNC(0xffffffffffffffff, 0x8004667d, &(0x7f0000000100)=0x80)
r0 = socket(0x18, 0x3, 0x0)
getsockopt$sock_cred(r0, 0xffff, 0x1022, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getsockopt$sock_int(r0, 0xffff, 0x1004, 0xfffffffffffffffe, &(0x7f0000000040)=0x18)
syz_emit_ethernet(0xe, &(0x7f0000000000))
writev(0xffffffffffffffff, &(0x7f0000000580)=[{&(0x7f0000000000)="b886b4e47f", 0x5}], 0x1)
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r1 = socket(0x2, 0x1, 0x0)
bind(r1, &(0x7f0000000000), 0x10)
setsockopt(r1, 0x6, 0x8, &(0x7f00000001c0)="fcda85f8", 0x4)
listen(r1, 0x0)
r2 = socket(0x2, 0x1, 0x0)
connect$unix(r2, &(0x7f0000000000), 0x10)
socket$unix(0x1, 0x0, 0x0)
setsockopt(0xffffffffffffffff, 0x0, 0x0, &(0x7f0000000040), 0x0)
sendto$inet(r2, &(0x7f0000000100)="18", 0xffffff36, 0x195a05e282d6161, 0x0, 0x0)
getpeername$unix(0xffffffffffffffff, 0x0, 0x0)
execve(0x0, 0x0, 0x0)


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pathconf(&(0x7f0000000040)='./file0\x00', 0x6)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x83fe})
recvmsg(0xffffffffffffffff, &(0x7f0000000540)={0x0, 0x0, &(0x7f0000000700)=[{&(0x7f0000000100)=""/211, 0xd3}, {&(0x7f0000000280)=""/219, 0xdb}], 0x2, 0x0}, 0x0)
r0 = socket(0x2, 0x4001, 0x0)
r1 = dup(r0)
r2 = fcntl$dupfd(r1, 0x2, 0xffffffffffffffff)
close(r2)
r3 = socket(0x18, 0x2, 0x0)
connect$unix(r3, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r3, &(0x7f00000000c0), &(0x7f0000000380)=0xffffffffffffff24)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
r4 = socket(0x18, 0x1, 0x0)
dup2(r3, r4)
connect$unix(r4, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
sendmsg$unix(r4, &(0x7f0000000540)={0x0, 0x0, 0x0}, 0x0)
setsockopt$sock_int(r2, 0xffff, 0x1023, &(0x7f0000000040), 0x4)


r0 = socket$inet(0x2, 0x2, 0x0)
getsockopt(r0, 0x0, 0x9, 0x0, 0x0)


__fstat50(0xffffffffffffffff, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
preadv(r0, &(0x7f0000000a80)=[{0x0}], 0x1, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000300)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_40_mount(&(0x7f0000000140)='efs\x00', &(0x7f00000000c0)='./file0\x00', 0x0, &(0x7f00000001c0))


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
__getvfsstat90(&(0x7f0000000100), 0xce0, 0x2)


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
ioctl$WSKBDIO_SETMAP(r0, 0xc0145002, &(0x7f0000000000)={0x0, 0x0})


open(0x0, 0x0, 0x0)
socket$inet(0x2, 0x0, 0x0)
r0 = socket(0x18, 0x2, 0x0)
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0))
r1 = socket(0x0, 0x0, 0x0)
pread(0xffffffffffffffff, 0x0, 0x0, 0x0)
ioctl$FIONREAD(0xffffffffffffffff, 0xc0106924, 0x0)
mkdir(0x0, 0x0)
socket(0x0, 0x0, 0x0)
msgctl$IPC_SET(0x0, 0x1, 0x0)
getsockopt$SO_PEERCRED(r1, 0xffff, 0x1022, 0x0, 0x0)
socket(0x0, 0x0, 0x0)
sendto$unix(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
r2 = semget$private(0x0, 0x4, 0x7a4)
semop(r2, &(0x7f0000000400), 0x0)
semop(r2, &(0x7f0000000440), 0x0)
semctl$IPC_SET(r2, 0x0, 0x1, &(0x7f00000000c0)={{0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7}, 0xffffffffffffffff})
socket(0x0, 0x0, 0x0)
socket$inet6(0x18, 0x4000, 0xa)
open(&(0x7f0000000040)='./file0\x00', 0x202, 0x0)
chown(&(0x7f0000000140)='./file0\x00', 0xffffffffffffffff, 0xffffffffffffffff)
connect$unix(0xffffffffffffffff, 0x0, 0x0)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
setsockopt$sock_timeval(0xffffffffffffffff, 0xffff, 0x0, &(0x7f00000000c0)={0x0, 0x1ff}, 0x10)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)


bind$inet(0xffffffffffffffff, 0x0, 0x53)
syz_emit_ethernet(0x66, &(0x7f0000000200))


mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x1, 0x0)
ioctl$WSMUXIO_INJECTEVENT(r0, 0xc0284600, &(0x7f00000001c0)={0x0, 0x0, {0x9}})


munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
compat_43_ommap(&(0x7f0000ffd000/0x2000)=nil, 0x2000, 0x0, 0x402, 0xffffffffffffffff, 0x0)
socket(0x0, 0x0, 0x0)
madvise(&(0x7f0000ffa000/0x2000)=nil, 0x2000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
mlock(&(0x7f0000ffa000/0x4000)=nil, 0x4000)


_lwp_exit()
_lwp_wait(0x0, &(0x7f0000001300))


mmap(&(0x7f0000ffc000/0x1000)=nil, 0x1000, 0x6, 0x1010, 0xffffffffffffffff, 0x0, 0x0)


symlink(&(0x7f0000000180)='\x00', &(0x7f00000001c0)='./file0\x00')
readlink(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
lchown(&(0x7f0000000100)='./file0\x00', 0xffffffffffffffff, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
open$dir(&(0x7f0000000040)='./file0\x00', 0x20, 0x0)


r0 = socket(0x1f, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x801269ee, &(0x7f0000000180)=0x8000000000000032)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
r0 = socket(0x11, 0x3, 0x0)
recvmmsg(r0, &(0x7f0000000500)={0x0}, 0x10, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0xd02)
r0 = open(&(0x7f0000000240)='./bus\x00', 0x0, 0x0)
pread(r0, &(0x7f0000000040)="3cd15db7c30016", 0x50cc00, 0x0)
madvise(&(0x7f0000000000/0xb000)=nil, 0xb000, 0x4)


sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
pipe2(&(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff}, 0x0)
readv(r0, &(0x7f00000000c0)=[{&(0x7f0000000040)=""/50, 0xfd9f}], 0x2f)
writev(r1, &(0x7f0000000200)=[{&(0x7f0000000100)="19", 0x3a4e1e905c56cdb7}], 0x1)


mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
pread(r0, &(0x7f0000000040)="ee", 0x1, 0x0)


r0 = _lwp_self()
_lwp_exit()
_lwp_detach(r0)
_lwp_detach(r0)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x1ff, 0x0, {0x0, 0x1}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000002a80)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


r0 = socket$unix(0x1, 0x5, 0x0)
ioctl$KDENABIO(r0, 0x5450)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
pathconf(&(0x7f0000000140)='./file0\x00', 0xe)


r0 = socket(0x18, 0x1, 0x0)
r1 = socket(0x18, 0x1, 0x0)
r2 = dup2(r1, r0)
compat_50_setitimer(0x0, 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r3 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r3, 0x40043105, 0x0)
compat_43_orecv(r2, 0x0, 0x0, 0x6005)


open(&(0x7f0000000000)='./file0\x00', 0x9cab835cfdc52675, 0x0)
r0 = socket(0x2, 0x1, 0x0)
r1 = dup(r0)
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r2 = fcntl$dupfd(r1, 0x2, 0xffffffffffffffff)
r3 = socket(0x2, 0x4001, 0x0)
r4 = dup(r3)
r5 = fcntl$dupfd(r4, 0x2, 0xffffffffffffffff)
close(r5)
r6 = socket(0x2, 0x1, 0x0)
connect$unix(r6, &(0x7f0000000000), 0x10)
shutdown(r2, 0x1)


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open(&(0x7f0000000200)='./file1\x00', 0x615, 0x0)
mknod(0x0, 0x0, 0x1733)
writev(r0, &(0x7f0000000440)=[{0x0}], 0x1)


pipe(&(0x7f0000000040)={<r0=>0xffffffffffffffff})
__clone(0x0, &(0x7f00000001c0))
__wait450(0x0, 0x0, 0x0, 0x0)
write(r0, &(0x7f0000000340), 0xd4e688a67930cd)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(0x0, 0x0, 0x0, 0x0)
open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
compat_40_mount(&(0x7f0000000140)='msdos\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000001c0))


mlockall(0x2)
shmget$private(0x0, 0xc00000, 0x0, &(0x7f0000000000/0xc00000)=nil)
shmat(0x0, &(0x7f000001a000/0x4000)=nil, 0x4000)
mlockall(0x6)
shmat(0x0, &(0x7f000024d000/0x4000)=nil, 0x0)
mlockall(0x0)
shmctl$SHM_LOCK(0x0, 0xb)
shmctl$SHM_UNLOCK(0x0, 0xc)


r0 = socket(0x800000018, 0x1, 0x0)
msgctl$IPC_SET(0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0xd6}})
r1 = socket(0x18, 0x2, 0x0)
close(r1)
r2 = socket(0x800000018, 0x1, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x20000005a})
bind$unix(r2, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
poll(&(0x7f0000000000), 0x200000000000002c, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
r1 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
writev(r1, &(0x7f0000000340)=[{&(0x7f0000000000), 0x2cfea}], 0x1000000000000013)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x0, 0x10, r0, 0x0, 0x0)
pwritev(0xffffffffffffffff, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x1803)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
ioctl$FIOASYNC(0xffffffffffffffff, 0x80067409, 0x0)
open$dir(&(0x7f0000000180)='./file0\x00', 0x2, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x5, r0, 0x0, 0x0)


_lwp_unpark(0x0, &(0x7f0000001280))
compat_50_clock_getres(0x0, &(0x7f0000001380))


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x1b, &(0x7f0000000a00)="8b589d9d", 0x4)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)


minherit(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0)
mmap(&(0x7f0000ffe000/0x2000)=nil, 0x2000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)
fcntl$setown(0xffffffffffffffff, 0xa, 0x0)
fork()


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x4000000000}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
pathconf(&(0x7f0000000200)='./file0\x00', 0x2)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f0000000300)='./file0\x00', 0x0, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
ioctl$FIONREAD(r0, 0x8010427f, &(0x7f0000000080))


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__utimes50(0x0, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000180)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)
setpgid(0x0, 0x0)
truncate(&(0x7f0000000440)='./file0\x00', 0x0, 0x3e2b649f)
truncate(&(0x7f0000000780)='./file0\x00', 0x0, 0x10001)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x0, 0x0)
readv(r0, &(0x7f00000003c0)=[{&(0x7f0000000300)=""/53, 0x35}], 0x1)


__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
open(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
pipe(&(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
setsockopt$inet_opts(r0, 0x0, 0x0, 0x0, 0x0)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x144, r0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x0)


open$dir(&(0x7f0000000180)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x0)
unlink(&(0x7f0000000000)='./file0\x00')
r0 = socket$inet6(0x18, 0x3, 0x0)
sendto$inet6(r0, &(0x7f0000000000)="87", 0x358, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8004747e, &(0x7f00000001c0))
munmap(&(0x7f0000fec000/0x14000)=nil, 0x14000)
r1 = shmget$private(0x0, 0x4000, 0x0, &(0x7f000055b000/0x4000)=nil)
shmat(r1, &(0x7f0000ff5000/0x4000)=nil, 0x0)
lchown(0x0, 0x0, 0xffffffffffffffff)
__clone(0x0, 0x0)


mkdir(0x0, 0x0)
r0 = accept$unix(0xffffffffffffffff, &(0x7f0000000880)=@file={0x0, ""/4087}, &(0x7f0000000440)=0xff9)
sendto(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
chroot(0x0)
socketpair$unix(0x1, 0x0, 0x0, &(0x7f0000000280)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
getsockopt$sock_cred(r2, 0xffff, 0x1022, &(0x7f00000001c0), 0x0)
socketpair$unix(0x1, 0x0, 0x0, 0x0)
r3 = socket(0x11, 0x3, 0x0)
sendto(r3, &(0x7f0000000840)="ae088843", 0x4, 0x0, 0x0, 0x0)
connect$unix(0xffffffffffffffff, 0x0, 0x0)
bind$unix(0xffffffffffffffff, 0x0, 0x0)
socket(0x0, 0x5, 0x6)
fcntl$getown(r1, 0x5)
bind(r3, &(0x7f0000000600), 0xc)


socket(0x800000018, 0x1, 0x0)
r0 = socket(0x18, 0x1, 0x0)
listen(r0, 0x0)
accept$unix(r0, 0x0, 0x0)
shutdown(r0, 0x2)
mmap(&(0x7f0000000000/0x400000)=nil, 0x400000, 0x3, 0x5012, 0xffffffffffffffff, 0x0, 0x0)
msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0x0, 0x0, 0x0, 0xb0}})
r1 = socket(0x18, 0x1, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r1, 0x0, &(0x7f0000000000))
bind$unix(0xffffffffffffffff, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


symlink(&(0x7f0000000080)='.\x00', 0x0)
__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
compat_30_getdents(r0, 0x0, 0x0)


setsockopt$inet_opts(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffff9c, 0x80185760, &(0x7f0000000000)={0x0, 0x40000})
r0 = socket(0x18, 0x1, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
compat_50_clock_settime(0x0, 0x0)
setsockopt(r0, 0x0, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


open(&(0x7f0000000080)='./file0\x00', 0x70e, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
__getdents30(r0, 0x0, 0x0)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0xffffffff})
r0 = socket$inet6(0x18, 0x3, 0x0)
sendto$inet(0xffffffffffffffff, &(0x7f0000000240)="8e", 0x1, 0x0, 0x0, 0x0)
sendto$inet6(r0, &(0x7f0000000000)='+', 0x2000, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unmount(&(0x7f0000000200)='./file0/../file0\x00', 0x0)


r0 = socket(0x1f, 0x5, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80906918, &(0x7f0000000180)=0x8000000000000032)


r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x0, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x801c5268, &(0x7f00000001c0))


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80047462, &(0x7f00000001c0))


r0 = socket$inet(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x0}, 0x10)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x0}, 0x10)
compat_43_osend(r0, &(0x7f0000000140)='1', 0x8402, 0xb)


mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
rename(&(0x7f0000000600)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000f40)='./file0\x00')
rename(&(0x7f0000000440)='./file0\x00', &(0x7f0000000800)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180))
r1 = msgget$private(0x0, 0x0)
msgctl$IPC_SET(r1, 0x11, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
poll(&(0x7f0000000140)=[{r0}], 0x1, 0x0)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x100000001})
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x7fffffff, 0x0, {0x4000000000, 0x1}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000140)={<r0=>0xffffffffffffffff})
sendto$unix(r0, 0x0, 0x0, 0x44004, 0x0, 0x0)


mknod(&(0x7f0000000200)='./file0\x00', 0x2000, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
pathconf(&(0x7f0000000080)='./file0\x00', 0x2)


r0 = socket$inet(0x2, 0x2, 0x0)
getsockopt$inet_opts(r0, 0x0, 0x1a, 0x0, 0x0)


r0 = socket(0x800000018, 0x2, 0x0)
listen(r0, 0x0)


chroot(&(0x7f0000000000)='.\x00')
__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
__getvfsstat90(0x0, 0x0, 0x0)
fktrace(0xffffffffffffffff, 0x0, 0x0, 0x0)
chmod(&(0x7f0000000280)='./file0\x00', 0x3a)
socketpair$unix(0x1, 0x0, 0x0, 0x0)
pwritev(0xffffffffffffffff, &(0x7f00000016c0)=[{&(0x7f0000000300)="10", 0x1}, {0x0}, {&(0x7f0000000500)="85", 0x1}, {0x0}], 0x4, 0x7)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
r0 = socket(0x18, 0x0, 0x0)
setsockopt(r0, 0x1000000000029, 0x2a, &(0x7f0000000040), 0x0)
chdir(&(0x7f0000000240)='./file0\x00')
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
setreuid(0x0, 0xee01)
compat_50_nanosleep(&(0x7f0000001180), &(0x7f00000011c0))
socket$unix(0x1, 0x5, 0x0)
minherit(&(0x7f0000ffc000/0x1000)=nil, 0xffffffffdf003fff, 0x0)
setsockopt(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
_ksem_post(0x0)


r0 = socket(0x18, 0x3, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x4, &(0x7f0000000880), 0x4)


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
ioctl$WSDISPLAYIO_LDFONT(r0, 0x8030574d, &(0x7f0000000200)={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
r1 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
r2 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
writev(r2, &(0x7f0000000100)=[{&(0x7f0000000040)="cb", 0x1}, {0x0}], 0x2)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x3, 0x10, r1, 0x0, 0x0)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x1, 0x0)


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0086662, &(0x7f0000000040))
mknod(0x0, 0x0, 0x0)
socketpair(0x22, 0x3, 0x0, 0x0)


symlink(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/../file0\x00', 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0)=0x20000002)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x20, r0, &(0x7f0000000000), 0x0)


__mount50(&(0x7f0000000240)='ffs\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000300)="7a8a6e0a214a5682", 0x8)


r0 = socket(0x800000018, 0x2, 0x0)
r1 = socket(0x18, 0x1, 0x0)
dup2(r1, r0)
listen(r0, 0x0)
connect(r0, &(0x7f00000001c0)=@family=0xd, 0xe)


r0 = socket(0x2, 0x2, 0x0)
r1 = dup(r0)
ioctl$NETBSD_DM_IOCTL(r1, 0xc010fd00, &(0x7f0000000140)={0x0})


r0 = shmget$private(0x0, 0x2000, 0x0, &(0x7f0000ffe000/0x2000)=nil)
shmctl$IPC_STAT(r0, 0x4, 0x0)


r0 = socket(0x2, 0x1, 0x0)
r1 = dup(r0)
r2 = fcntl$dupfd(r1, 0x2, 0xffffffffffffffff)
close(r2)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r3=>0xffffffffffffffff})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000000)=ANY=[@ANYBLOB="89000000ffff000001"], 0x9}, 0x0)
recvmsg(r3, &(0x7f0000000100)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000180)=""/225, 0xe1}, 0x0)
r4 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r4, 0x0, 0x200000000000c, &(0x7f0000000240)="ea00000100000000", 0xc)
r5 = socket$inet(0x2, 0x2, 0x0)
close(r5)
r6 = socket$inet(0x2, 0x2, 0x0)
dup2(r4, r6)
setsockopt$inet_opts(r6, 0x0, 0x200000000000c, &(0x7f0000000200)="ea00000000000000", 0x8)
setsockopt$inet_opts(r6, 0x0, 0x200000000000c, &(0x7f0000000240)="ea02000000000000", 0x8)
setsockopt$inet_opts(r5, 0x0, 0xd, &(0x7f0000000240)="ea00000100000000", 0x8)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000140)='./file0\x00', 0x2)


r0 = getppid()
getpriority(0x2, r0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000140)='.\x00', 0x0, 0x0)
utimensat(r0, &(0x7f00000000c0)='./file0\x00', 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
unmount(&(0x7f0000000000)='./bus/\x00', 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x40185727, &(0x7f0000000080))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
fstatat(r0, &(0x7f00000001c0)='./file0/file0\x00', 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', 0x0, 0x0, &(0x7f0000000540), 0x0)
r0 = getpid()
ktrace(0x0, 0x0, 0x4, r0)
_ksem_timedwait(0x0, 0xfffffffffffffffe)


mkdir(0x0, 0x0)
sendmsg$unix(0xffffffffffffffff, 0x0, 0x0)
listen(0xffffffffffffffff, 0x0)
r0 = msgget$private(0x0, 0x0)
msgrcv(r0, &(0x7f0000000940), 0x8, 0x2, 0x1000)
getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000140), 0xc)
msgsnd(r0, &(0x7f0000000180)=ANY=[@ANYBLOB="0300000000000000a486714b3b6964c6220190d7f39c044dac99fec5afca3ec3e155903698d635e2ab348195cce43ab9e134935e4edf5efe4e5ec4bec02d51201f93b9860f69d58fca21e1f36041df344b049af8bf321177b2fdc7cc2725691dc000"/110], 0x6e, 0x0)
msgrcv(r0, &(0x7f0000000380), 0x8, 0x3, 0x1800)


socketpair$unix(0x1, 0x0, 0x0, 0x0)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x4)
r0 = socket(0x18, 0x1, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x8)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000480)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
compat_43_ogetdirentries(r0, 0x0, 0x0, 0x0)


modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={&(0x7f0000000240), 0x0, &(0x7f0000000000)="3c5ffe26b51cea0a", 0x8})


r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0986981, &(0x7f0000000000)=0x8000000000000034)


r0 = socket$unix(0x1, 0x2, 0x0)
getsockopt$sock_timeval(r0, 0xffff, 0x100c, &(0x7f0000000040), &(0x7f00000000c0)=0x10)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', 0x0, 0x0, 0x0)
modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000040))


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0xffffffffffffffff})
r0 = compat_30_socket(0x22, 0x30000003, 0x0)
compat_43_osend(r0, &(0x7f0000000040)="00060404", 0x600, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000005c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000280)='./file0\x00', 0x1100, 0x0)
chflags(&(0x7f0000000140)='./file0\x00', 0x20000)
open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)


r0 = socket(0x1f, 0x5, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8080696e, &(0x7f0000000180)=0x8000000000000032)


symlink(0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(r1, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="28000000ffff000001"], 0x28}, 0x0)
recvmsg(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000640)}, 0x42)
ktrace(0x0, 0x0, 0x0, 0x0)
socket$inet(0x2, 0x0, 0x0)
dup2(0xffffffffffffffff, 0xffffffffffffffff)


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
setsockopt$sock_timeval(0xffffffffffffffff, 0xffff, 0x0, 0x0, 0x0)
close(0xffffffffffffffff)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x8, &(0x7f0000000180)={0x0, 0x0, 0x0, 0x100000401})
r1 = open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
fcntl$lock(r1, 0x8, &(0x7f0000000000)={0x0, 0x0, 0xfffffffffffffffe, 0x1000300010008, 0xffffffffffffffff})
r2 = open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
fcntl$lock(r2, 0x8, &(0x7f0000000000)={0x0, 0x0, 0xfffffffffdfffffd, 0x1000300010008})


setpgid(0x0, 0x0)
fork()
r0 = getppid()
setpgid(0x0, r0)


mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
munlock(&(0x7f0000ffc000/0x1000)=nil, 0x1000)


modctl$MODCTL_UNLOAD(0x2, 0x0)
getpgid(0x0)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f0000000000)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xe680bf986d21ac03, &(0x7f0000000140), 0x30)


r0 = socket(0x2, 0x3, 0x0)
sendto$inet(r0, 0x0, 0x0, 0x0, &(0x7f0000000000)={0x2, 0x0}, 0x10)


compat_40_mount(0x0, &(0x7f0000000040)='./file0\x00', 0x0, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


mknod(&(0x7f0000000000)='./file0\x00', 0x2000, 0x40000802)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80017472, &(0x7f0000000040)=0xffffffff)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000040)='./file0\x00', 0xc)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000080))
open(&(0x7f0000000480)='./file0\x00', 0x200, 0x4ebfac6bbaf796d)
setuid(0xee01)
execve(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
poll(&(0x7f00000000c0)=[{0xffffffffffffffff, 0x6e}], 0x1, 0x0)
r0 = msgget$private(0x0, 0x0)
msgctl$IPC_STAT(r0, 0x2, 0x0)


r0 = socket$inet(0x2, 0x1, 0x0)
__fstat50(r0, &(0x7f00000000c0))


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
setreuid(0x0, 0xee01)
fchroot(0xffffffffffffffff)


socketpair(0x2, 0x2, 0x0, 0x0)


r0 = socket$unix(0x1, 0x5, 0x0)
read(r0, 0x0, 0xfffffffffffffedd)


syz_emit_ethernet(0x36, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000000029, 0xa, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
dup2(r1, r0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(0x0, &(0x7f0000000200)='./file0\x00', 0x400000, 0x0, 0x0)


mkdir(0x0, 0x0)
open$dir(0x0, 0x0, 0x0)
pipe(&(0x7f0000001400)={<r0=>0xffffffffffffffff})
fchownat(r0, 0x0, 0x0, 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
compat_43_ogetdirentries(r0, 0x0, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104306, &(0x7f00000001c0))


r0 = semget(0x2, 0x0, 0x5c0)
r1 = msgget$private(0x0, 0x100)
msgrcv(r1, 0x0, 0x0, 0x0, 0x0)
msgctl$IPC_STAT(r1, 0x2, &(0x7f0000000040)=""/56)
r2 = socket$inet(0x2, 0x1, 0x0)
msgrcv(r1, &(0x7f0000000840), 0xfa, 0x0, 0x0)
getsockopt(r2, 0x0, 0x5, 0x0, 0x0)
getsockopt$SO_PEERCRED(r2, 0xffff, 0x1022, &(0x7f0000000080)={0x0, <r3=>0x0}, 0xc)
r4 = getgid()
setregid(r4, 0x0)
r5 = getpgid(0xffffffffffffffff)
r6 = getgid()
msgctl$IPC_SET(r1, 0x1, &(0x7f00000005c0)={{0x9c4, r3, r4, 0x0, r6, 0x1, 0x8000}, 0x3ff, 0x5, 0x0, r5, 0x1f8, 0x2, 0x2})
r7 = getgid()
r8 = getuid()
seteuid(r8)
getgroups(0x3, &(0x7f0000000000)=[0x0, 0xffffffffffffffff, <r9=>0x0])
semctl$IPC_SET(r0, 0x0, 0x1, &(0x7f0000000080)={{0x1, r3, r7, r8, r9, 0x80, 0xd78c}, 0x3fa, 0x47b, 0x1})
syz_emit_ethernet(0x18, &(0x7f0000000040))


__getcwd(&(0x7f0000000000)=""/29, 0xff39)
r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x2a, r0, &(0x7f0000000000), 0x0)


munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
compat_43_ommap(&(0x7f0000ffd000/0x2000)=nil, 0xfffffffffffff000, 0x0, 0x402, 0xffffffffffffffff, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
recvmsg(r0, &(0x7f0000000800)={&(0x7f0000000200), 0xe, 0x0, 0x0, 0x0}, 0x0)


modctl$MODCTL_LOAD(0x0, &(0x7f0000000000)={&(0x7f0000000040), 0x0, &(0x7f00000000c0)='\r', 0x1})


writev(0xffffffffffffffff, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x0, 0x2}, 0x8)


compat_50__lwp_park(&(0x7f0000000000), 0xffffffffffffffff, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0))
__stat50(0x0, 0x0)
r1 = getsid(0x0)
ptrace(0x9, r1, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x100000001, r1, &(0x7f0000000580), 0x0)


symlink(0x0, 0x0)
pipe(&(0x7f0000000140)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0x62e2dd08f149ff1b, r1)
compat_30_fhstatvfs1(0x0, 0x0, 0x0)


r0 = posix_spawn(0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
getpriority(0x0, r0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setuid(0xee01)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x0, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


open$dir(&(0x7f0000000180)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x0)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8090698e, &(0x7f0000000180)=0x8000000000000032)


__clock_gettime50(0x20000000, &(0x7f0000000300))


acct(0x0)
pipe(&(0x7f0000000440)={<r0=>0xffffffffffffffff})
__clock_gettime50(0x0, 0x0)
fcntl$setown(r0, 0x6, 0x0)


modctl$MODCTL_UNLOAD(0x1, &(0x7f0000000080)="f7f18c4b0d602d76648e1e31046b3d046bdf3bf31d62c7487d077681d6fcd0998d")
modctl$MODCTL_LOAD(0x0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0xc1})
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0xffffffffffffffff, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
fchmodat(r0, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)


mkdir(0x0, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))


setsockopt(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
r0 = posix_spawn(0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
setpgid(r0, 0x0)


accept$inet6(0xffffffffffffffff, 0x0, 0x0)
getpeername$inet6(0xffffffffffffffff, &(0x7f00000000c0), &(0x7f0000000100)=0xc)
open(&(0x7f0000000140)='./file0\x00', 0x78e, 0x0)
profil(0x0, 0x52, 0x60000000000, 0x80000000)


r0 = _lwp_self()
_lwp_exit()
_lwp_detach(r0)


mkdirat(0xffffffffffffffff, &(0x7f0000000000)='./file0\x00', 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$inet6(0x18, 0x3, 0x0)
compat_43_ogetsockname(r0, &(0x7f0000000080)=""/152, &(0x7f0000000000)=0x98)


r0 = compat_30_socket(0x22, 0x3, 0x0)
shutdown(r0, 0x1)


compat_43_osethostname(&(0x7f0000000000)="00c89d4daabb01b7ce59ca7067e1760e6b1795667b662fbefc30bf033511709751fe8d599efe24381ea20f094e9aec5696541821ab67dc9b83b7e805c8936c9369d204f24c0da54ddc5004d682ac", 0x4e)
r0 = compat_30_socket(0x22, 0x30000003, 0x0)
compat_43_osend(r0, &(0x7f0000000040)="00060409", 0x600, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000040)='./file0\x00', 0xd)


r0 = getpgrp()
getpriority(0x0, 0x0)
getsid(r0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f00000001c0)='./file0\x00', 0x6)


__fhstat50(&(0x7f0000000180)="eb01b685c8f859535d", 0x9, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0106926, &(0x7f0000000180))


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvmsg(r0, &(0x7f0000000780)={0x0, 0x0, &(0x7f0000000640)=[{&(0x7f0000000080)=""/23, 0x17}], 0x1, 0x0}, 0x0)
dup2(r1, r0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)='@', 0x1)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
compat_43_ogetdirentries(r0, &(0x7f0000000000)=""/39, 0x21, &(0x7f00000000c0))


__msync13(&(0x7f0000001000/0x4000)=nil, 0x0, 0x1)


setrlimit(0x8, &(0x7f0000000980))
pipe(0x0)


pipe(0x0)
r0 = getpid()
fktrace(0xffffffffffffffff, 0x0, 0xf0709cfa615b9be3, r0)
compat_50_____semctl13$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000000)=@val=0x4)
____semctl50$GETNCNT(0x0, 0x0, 0x3)
fcntl$lock(0xffffffffffffffff, 0xfffffffd, 0x0)


setrlimit(0xb, &(0x7f00000000c0)={0x100000, 0x100000001})


r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
lchflags(&(0x7f0000000200)='./file0\x00', 0x60006)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
writev(r0, &(0x7f0000000b40)=[{0x0}], 0x1)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f00000001c0), 0x0)
symlink(&(0x7f0000000ac0)='./file0\x00', &(0x7f0000000e40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


compat_40_mount(&(0x7f0000000080)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f00000001c0)='./file0\x00', 0x7)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x10, r0, 0x0, 0x0)


mlock(&(0x7f0000ffa000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ffc000/0x4000)=nil, 0x4000)
munlock(&(0x7f0000ffb000/0x2000)=nil, 0x2000)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
compat_50_futimes(r0, &(0x7f0000000180))


r0 = socket(0x1f, 0x5, 0x2)
listen(r0, 0x0)
close(r0)


mknod(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0xffffffffffffffff, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f0000000240)={&(0x7f0000000200)={0x0, 0x8001}}})
r0 = socket$inet6(0x18, 0x3, 0x0)
__mount50(&(0x7f0000000000)='v7fs\x00', 0x0, 0x0, 0x0, 0x0)
sendto$inet6(r0, &(0x7f0000000000)='<', 0x328, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


compat_20_getfsstat(&(0x7f0000000000), 0xffffffffffffff39, 0x0)
mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x80045729, &(0x7f0000000080))


madvise(&(0x7f0000001000/0x1000)=nil, 0x20001000, 0x6)


mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_90_getvfsstat(0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(0x0, &(0x7f0000000140)='./file0\x00', 0x400000, &(0x7f0000000000))


r0 = socket(0x2, 0x1, 0x0)
r1 = socket(0x2, 0x2, 0x0)
close(r0)
getsockname$unix(r1, &(0x7f0000000000)=@abs, &(0x7f0000001200)=0x8)
r2 = socket(0x2, 0x3, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
connect$inet(r0, &(0x7f0000000000), 0x10)
sendto$unix(r2, 0x0, 0x0, 0x0, &(0x7f00000000c0)=@file={0x0, './file1\x00'}, 0xa)


setreuid(0xee00, 0x0)
r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000340)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)
renameat(0xffffffffffffff9c, &(0x7f0000000440)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0xffffffffffffff9c, &(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
faccessat(r0, &(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x6, 0x0)


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r1 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
writev(r1, &(0x7f0000001480)=[{&(0x7f0000001240)="cc", 0x1}], 0x1)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x10, r1, 0x0, 0x0)
mmap(&(0x7f0000001000/0x2000)=nil, 0x2000, 0x0, 0x10, r0, 0x0, 0x0)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x4)


shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
pipe(&(0x7f0000000080)={<r0=>0xffffffffffffffff})
r1 = getpid()
fktrace(r0, 0x0, 0x62e2dd08f149ff1b, r1)
compat_60__lwp_park(0x0, 0xffffffffffffffff, 0x0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open$dir(&(0x7f0000000b80)='./file0\x00', 0x0, 0x0)
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0xffffffff, 0x0, {0x0, 0x1}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket(0x18, 0x1, 0x0)
setsockopt(r1, 0x0, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
ioctl$FIOASYNC(r0, 0xc01c5005, &(0x7f0000000000))


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x2, 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
r1 = open$dir(0x0, 0x0, 0x0)
r2 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x2, 0x0)
pwritev(r2, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x1, 0x0)
ftruncate(0xffffffffffffffff, 0x80002, 0x0)
pwritev(r1, &(0x7f0000000080)=[{0x0}], 0x1, 0x0)
pwritev(r0, &(0x7f0000000080)=[{&(0x7f00000006c0), 0xf0f75}], 0x1, 0x0)


__mount50(&(0x7f0000000000)='kernfs\x00', 0x0, 0x0, 0x0, 0x0)
modctl$MODCTL_LOAD(0x0, &(0x7f00000000c0)={&(0x7f0000000180), 0x0, &(0x7f0000000000)='\t', 0x1})


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
__posix_fadvise50(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)


r0 = getpgrp()
setpgid(r0, 0x0)


sendto$unix(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
mlock(&(0x7f0000fff000/0x1000)=nil, 0x1000)
mlock(&(0x7f0000e98000/0x2000)=nil, 0x2000)
mlock(&(0x7f0000ff4000/0x9000)=nil, 0x9000)
munmap(&(0x7f0000d35000/0x2000)=nil, 0x2000)
munlock(&(0x7f0000ffb000/0x3000)=nil, 0x3000)
munmap(&(0x7f0000e02000/0x3000)=nil, 0x3000)
munmap(&(0x7f0000ed0000/0x4000)=nil, 0x4000)
mlock(&(0x7f0000e2b000/0x3000)=nil, 0x3000)
compat_40_mount(0x0, 0x0, 0x0, 0x0)
mknod$loop(0x0, 0x0, 0x1)
_ksem_open(&(0x7f00000008c0), 0x0, 0x0, 0x1, &(0x7f0000000440))


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x4004741a, &(0x7f00000001c0))


r0 = socket(0x18, 0x3, 0x0)
socket(0x0, 0x0, 0x0)
syz_emit_ethernet(0x0, 0x0)
fcntl$setown(r0, 0x6, 0xffffffffffffffff)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xb}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)


connect$unix(0xffffffffffffffff, &(0x7f0000000040)=@abs={0x1, 0x0, 0x1}, 0x8)
r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x29, 0x1a, &(0x7f0000000040), 0x4)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000280)='./file0\x00', 0x1ffa, 0x0)
pathconf(&(0x7f0000000180)='./file0\x00', 0x5)


r0 = socket(0x18, 0x2, 0x0)
connect$unix(r0, 0x0, 0x0)
compat_90_fstatvfs1(r0, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
pathconf(&(0x7f0000000200)='./file0\x00', 0x6)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
readlink(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)


mprotect(&(0x7f0000fee000/0x2000)=nil, 0xffffffffdf011fff, 0x0)


writev(0xffffffffffffffff, 0x0, 0x0)
compat_50_select(0x0, 0x0, &(0x7f0000000080), &(0x7f0000000100), 0x0)


__mount50(&(0x7f00000002c0)='overlay\x00', 0x0, 0x0, 0x0, 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
pathconf(&(0x7f0000000040)='./file0\x00', 0x2)


r0 = _lwp_self()
_lwp_exit()
_lwp_kill(r0, 0x0)


modctl$MODCTL_LOAD(0x0, 0x0)
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f00000003c0)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff803}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


mknod(&(0x7f0000000040)='./file0\x00', 0x1000, 0x0)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x2, 0x0)
setreuid(0x0, 0xee01)
ioctl$FIONREAD(r0, 0x8020697a, &(0x7f0000000100))


__utimes50(0x0, &(0x7f0000000040)={0x80000001})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x3d, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r0 = open$dir(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0xc004667a, &(0x7f0000000140))


__mount50(&(0x7f0000000180)='null\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000100)="cf", 0x1)


semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0x0, 0xffffffffffffffff}})
r0 = socket(0x800000018, 0x1, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x1000, &(0x7f0000000000)=0x7, 0x4)
bind$unix(r0, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
r1 = socket(0x800000018, 0x1, 0x0)
setsockopt$sock_int(r1, 0xffff, 0x1000, &(0x7f0000000000)=0x800008, 0x4)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100080001})
bind$unix(r1, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)


posix_spawn(0x0, 0x0, 0xfffffffffffffffe, 0x0, 0x0, 0x0)


poll(0x0, 0x0, 0xfffffffd)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x1000300000001, 0xffffffffffffffff})
semctl$SETALL(0x0, 0x0, 0x9, &(0x7f0000000040)=[0x7ff])
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x1}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='.\x00', 0x0, 0x0)
fchmod(r0, 0x504)
open(&(0x7f0000000000)='./file0\x00', 0x140, 0x41c)


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIOSEEKDATA(r0, 0x2000745f, 0x0)


swapctl$SWAP_NSWAP(0x3)


r0 = socket$inet6(0x18, 0x3, 0x0)
getsockopt(r0, 0x3a, 0x0, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x6, 0x2, &(0x7f0000000380)="a16a3247", 0x4)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
pathconf(&(0x7f0000000200)='./file0\x00', 0xe)


__clone(0x4100, &(0x7f00000001c0)="fa8e0bba78fce4a60baecd1bb8b975f5b67823fc8e8b0b9fc77af8c8d241b977b8aac9a5dde20da9b0a8bc881c4b32bf86ea5243832245e02fcdf6f9cfa026e5a3926eaba7b6c725cd64832fa6852299935e4437dd24b78153ef6e41f523fe8a5b15c432a48718c1afb1efcd1c09849acbd694cf7d8642ce6a8ee7d909da3f9eb180e7e0c8d31802436d4494fc75ea673a290b4d1e701ffcdd294d8ae05049277c271a9ca64890dfa3190cb196c9d788eb30b16995963aa9457bf82b108f77044d0b9e0ee719a45d23261005ba363a491a9c69ed474f7ca9dffdc5b87080ff12041f06fc159fc10a4b20b91117edd5c9214cf0a31d881f55028190bc9486380aeb25592c6228e8bb7bd5bac684d8b6842e6933932cc0eab841c7d417c2516ac4eb9c8b7f3bd0740fdfc257c1d82617b334a81b994f03ca44587a23602c6b8f1b7b1bec79ad64ec457aca618a115549007e8f8d15099bf6aca0f7699e68798c9b99c885236b2b589cc52bcb45000d3ba331a5f312f5d35d15a6118fcf9f8bbad4d51756deca6daa061eff23db6be71ca0a57bb4b2f7e189587157b0094f143805c2b90b95c509229ed27db8621b75c47b963ec532dee2fca1f6fa557029df514b36e1586cccfb0710c7d41bed81b57e9ba5d799bb5fe143d8b4c43b7671ce82c34e784565b0aa2badb78bc70eb41c6503632f29e1681eaa6c7e5d049c4bad28c74d61dd5b51738eb17f0879eede992eaed0b528364070764f21bf783b185593f602a9a6ade16d7faaa1f57d01ab50ef550183a79e70eba58cf242c959b58ac3777136ee3b6b64d02eb1c4dc387bc78e5a9504cbe175d4b1ecf3f4c88261b45c438e8a4a0612ee42025a6d122257dd6e4313bcc413ae76ca1f16bef39d86ba84f34dfb4ad1f4037c0ebe6299ac1ad24a40be285bd9010f4ed9d314f8f37b91cdbf3214275c68be0756bcf486568f72aecc6307f1e9b97d8b768e41bfae8619ff1f905b7169b9f6a3ab52e5c77279ad7a1d22cab1d6f20ec9729fafae24c8c153301b8db7628f77e48746f85937eb5b36bded5ac663d5cfa708d60515a75efd44e22a7dcbd029f9fd4d868b0f9a66223513f8fecb72f28b1cd42c7733897e569e7cba6ec41050e5cd997632c53272539fad936c97eefc5d175fa866aeee81aa62f3740780bc3e31f6f0acab54e1478b54e4c362da0bc48ab9d3af418171beb4d27a7fee302e095e902dc2adba4d5a18398bacc21d76c6fca683ca2f85d573c2a259df9f3bffe42386a9e26e7fccc39d6eb89b689599d4bf47d588e2c3c517aaedc65baba1083ded55c8a4359f0a48c4924c4fc02a30f85f6aa74d2692efcf1b0d4794e92a5c14111950a15987c0bcc1bd31c5836d673c170c4abde2b467fe3381503478a97b439d15e871c28a1a0ce561f9acd03fae4319d055def602a94c2461f877f8ef8b6df86d79868e67311c2378da21efafaab2269af448dbfb387ab76e6cf5506b986ef99ce4a94865b53ffcf14353542a24f194bd825eb1a83600f516ee26d8d14798270acfea6f75630be7adc6239ee9e9dbe89e744db148a08e6f53a146a1bd2b7e523c39db28305f0aed62a330a825719e7faeb2ed3ae9d3a3ee0e3b355370ffe346a74779473030256317a3dd873eaceb89af78ecabc735578615dfef89a400fea5edf899651a3f8da60483ca278c17ddefda48bc6db700bfa50820d699c63c1e3a62c5156048eacd3e5f210459fcd9d3df31300e95f60d7b40b40b2c0f54626ab926cbad568ecf39497440788f51aa8b9176e9fd6ab505acd9a5940ed32da86c4a6f8c4a644d911bfbf0c50069e1a908f86efbbb463ded4a1c430935925869a056d1ecd90f4e75e5f4d59037a6265da7386b3b93586c2e76922d0cd10ab6ba9cd61b047bb7e9949237b30348ec3ba6163f30e1d40017d3604fc9d843ef6e13e2d83524ff5d4baec2c4824ba01213afb79b2769b00fb2aeb5a8691d11f4b876566c1924e5b26d5c26429ba1a7cc426c5dfba54fae2736b1da88578cab5d447cdee1795c24418d0ec0ab6bd841c995688bb7789c3a1ad6d9f360208371fb5596756d1d9cf677ba1218f831b5e3f40f8d38c5be2cf49a35bcac6f973ec14059e1fd5989139e8e62baabe9f95074c1a92821b18eaead701ca579270fcfeb19988f98e05a1b1e9c74feca3e25f45538032508d7f1bdb3dc531dad294385ae2bc5f6bb224910554207f9ae074d088da3fa9581ada08ecf8e774da967450908e12269fabc20ddfe1d727813d88cd9dd10b9fad36c2cf9fa7d6135b8f42f2bfb3878a99f84b9b7703b3134f7d929eac8f39920397ccd7e00fb95c6663131789b0baa82a9537fd0b116c4f1aba7ecbc713c19aa0a14ef77d68ee86e513c31c2797814da9468ca90bf238a4198884e7a2a95461f657c56c70d6ce11c7e134db27afff514bbc7f054c56a7ae1c65603502b976cf416231f0ad6df4a5d002633660953f4f3d3e00d377424ca5b1fc41275c6ffec456aacd276bf66c3b941c1c8e9da18c2a7ce0faa2714c7b0a94b7daad9f67831757f73291688d59c34a4d35efc8d1c74225f6a9c061efbc6205b0c4b47deab0498d5ec5258a54b50823a63137917e205644ea7794de55d5eb0961da7bd995a8caefb71a51d47b4231e52b86c9aac0ab9264442f9765e480f7d30a9b92f94dde181a20e8074fead297b503ae35f7d215d3a770b69a78d4f717bbb3bc1681ec16951ba158f13281d443b84d8b3bec728f6b58026186a2fbf227ba5044079afb51f2b1ac5499e90d6f7eb916261926999a828f89ba05525581aa4e1005feec48eaeacafe330a163781930892f741545e30dc3602f4805bc443b71a09cc8da334bc051ee792fbb5faad6a5e47650bed20a7cca26e408f6b74d758465ac990ab26f814d5f46abe442eaa1a30a6c011cd54c0ea6e39c0f1f8e42e7b52cd58842f09da9ade71d9791a153b7728ff34b0c87388c02f25e00f1e70705fb6c34b3873bc00b2716dc5626be0103d7d8a5ed92d731ea248f558cc4b9bcb3ed5c78acc2d2b02fc1fd3600ac52b265bda7732e685a59d39da3eb1ed5b606a6d7839e42700277b6dee7179a3835059c1be20044ca49412b640edaf28d319b96581311b291cf8022c921ac2013e891f5db6fd4303bb2275ebeb0d37db9dac10aeec5ba4c298602bae1a5c519a151bb1f81d51279fafa6f1058002eb3e1b8286ac4d776ece6515e8d18d750a71f93a5b28dadd15e4153472655bab80aeedfe9ec6f2e33f17b53192ae1467cac16639969f398b35f1309a7833d14509a714fbdcb67fa0dbc2cb4414024ca03a7e6ffd9a0e8dc603b9295e4917d8df4550bb66bafd736a268120b737df7aec4d5bf57aac047bc341d8d9ffd68d936c0046aee6eab4e55827fe40a08a66a97b86b149956f057c1bb743ade47d38e2c9a3d1ea719d3c0caf2bb0d145ff2a88af644c3200689e13b96185a5307dc048660657f9593b5323989bddd25d6ef26d5b4f83a4254b9e91080ab5f9b88a6670c1b237e5bf70e25842904c41893a800cc30487c67033d5b29a819ca4be33c31ed0e6ca6f0c236dabc977b97bc64c1ca6d8f41dcfffd64ee9d8b34b30e47130e3ef0bad7deb5205792ed4b0ef77273401b03579ac5d7d57dd28fc55a15fc42227ce7d2d16f7e3d4bf4f7a124a1b854322045f1fee969ed55976e220210c104a1541308632a3b4a4084a43b6edd55367ff6bdb0b4b68d10934d87613b1c883ea4dbb7bb8c0dcd2e26f4563fd0fd214bc71fe2a546558f12b5bb569f543019af8906a7d8fa57810c1d2c4afd64d887f4baacc32cde725f8c63b416fe4246a161e4078ae72bca2602f515ee1d531b5e2ef2e1dc872381986b8ed7261c950f10fa7f7b6278b1f7f5421661bd427e0fca4aa49d0cc77631c0b5b599b3915f1deb314cecea05fe820edbc1fa6b191c4101c124498747a2ac8cdd19da0162c5d9e8c8cfa5601d7277965577004f1c0333311035ba5e04a268d4918f2cc229fd365531231498bd315cd0c69a381eb94c5dcfdde7e89588238c8eff135d6df19b93b2105461d426aa27532ec5cc845573609ed9dcf1fc329b08ddc8f812e3e68eb604abf03801ac909da3fc3fc63e03b5544769842088cd1e2576c1c7d9536a4877805e5f17ba53f50321f70dc377835495713e7a8cad8ed7d3f769e87d67dc9349690582af01cfe2492236c3051c280ab6c39810a9dc4a3fd68b6ebf2a327cbd51833765b3e54776922de7c110560b279db57a9237e7771108e84c03d2d5f820c5e359773eda2b6f8f04817fa854c0a6bf1094bb4231ae26050bef165812c56c231e064785ae06dd3662be16a397df879bc90f9e839d9a5fc7317f66f7abafb78cda981c51dedd609a77fa15b930864291de56fe535c1424359a2cef4352e9042f9900caf6553c79e6c368bcabd905ae00547c7ee4a3b026a78b42cb51013863ba74ac17cdacd3115129345be55df1dd012a45c6f4909b72687220cece8c9918b1976ae56cf3593c9c63c2b5be7a770c01ccb37ea509b83d5ef1778f7be67993159a13928d9511b9121e38ee8db8d6bae980e9fc4060b5866f5ab5834a67584592f46774c76469570d05b26215690279df9c4b5747592c9d06a4e3a318b0ab3b404d3f813a10e2ce931405784dd03ad0a8ba9d053b6bb89ece1b673595a697064721f49c452a5c794d070005057666e0ca5211e8240dd28876a1f0963dbf2dee9796488a01fc4f91550948e123392abf82d77e352918540e21502d93c84a90a42dc22b42e9595b9732105c265b5d6e2b05f1bd49523df7e805c4dc42bd0d9ede96c82321de021528bcfd02421b6d056b95cbaa6776d14f3ce5e2b9611d0f5a2733334d595eb1c2f8422e0d59ed87d8f6e78f7e527bf417e6ca9c772023b0160abe12211453cbb3df908cb74314d159da31ef76058ec0301a96db164ad174468dbe947021fdecf053296c997cc24e1cad8cfb7988fc95828fcececc8bb580b27afa3dfc62119b4e30befeb1a61a9b0440ad7ec7cc2ec8e7b6c6fcfd82e02a638339ad3d4d070a4d02c005de6d52beb04dcae0319d7890693304bbc62a388134b76402b7c4ce6c8364b55d6301ed48e6caa411298072780e577aa10d1ab61b5af0a2")
__wait450(0x0, &(0x7f0000001400), 0x4, 0x0)


open(&(0x7f0000000340)='./file0\x00', 0x300, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x9, &(0x7f0000000340)={0x0, 0x0, 0x0, 0x100000001})
syz_usb_connect$printer(0x0, 0x36, &(0x7f0000000040)=ANY=[], 0x0)


pipe2(&(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff}, 0x0)
fcntl$setstatus(r0, 0x4, 0xbb4289d0ac784055)


mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0xc0e99db6de761f86, 0x0)
open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
r0 = open$dir(&(0x7f00000001c0)='./file0\x00', 0x1, 0x0)
writev(0xffffffffffffffff, &(0x7f0000000e00)=[{&(0x7f0000000080)="c3778fe486fa8f9f05ce4bfcaad7868e4bd7872ff0597773fa1e061cebd702547202798b96c0a24deae72d0db883bac94c4761e84558e6ca6270794d9b5eb06bb75ce478a7f5a6d622", 0x49}], 0x1)
ioctl$FIONREAD(r0, 0xc0106978, &(0x7f00000000c0))


r0 = _lwp_self()
_lwp_kill(r0, 0x0)


compat_50_setitimer(0x0, &(0x7f0000001800)={{}, {0x1}}, 0x0)
__setitimer50(0x0, &(0x7f0000000400), 0x0)
__getitimer50(0x0, &(0x7f00000001c0))


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mkdirat(0xffffffffffffff9c, &(0x7f00000005c0)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
compat_43_ogetdirentries(r0, 0x0, 0x0, 0x0)


open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
lchflags(&(0x7f0000000200)='./file0\x00', 0x60006)
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)


socket$inet(0x2, 0x0, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
__msync13(&(0x7f0000ffd000/0x1000)=nil, 0x0, 0x2)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
chroot(0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_90_fstatvfs1(r0, &(0x7f0000001480), 0x0)


r0 = compat_30_socket(0x22, 0x30000003, 0x0)
compat_43_osend(r0, &(0x7f0000000080)="7a5f6590", 0x4, 0x0)


__getrusage50(0x0, 0x0)
mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
open(0x0, 0x0, 0x0)
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, 0x0)
r0 = open$dir(&(0x7f0000000b80)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0185005, &(0x7f0000000000))


munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
__msync13(&(0x7f0000001000/0x4000)=nil, 0x1000000, 0x2)
r0 = semget$private(0x0, 0x1, 0x420)
mkdir(&(0x7f00000002c0)='./bus\x00', 0x0)
chflags(&(0x7f0000000380)='./bus\x00', 0x4)
rmdir(&(0x7f00000000c0)='./bus\x00')
____semctl50$SETALL(r0, 0x0, 0x9, &(0x7f0000000080)=@array=&(0x7f0000000040))
compat_50_____semctl13$GETALL(0x0, 0x0, 0x6, 0x0)
socket$unix(0x1, 0x0, 0x0)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__lstat50(&(0x7f0000000000)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unmount(&(0x7f0000000000)='./file0\x00', 0x255a0100)


__mount50(&(0x7f0000000180)='puffs\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f00000001c0)='X', 0x1)


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
truncate(&(0x7f0000000040)='./file0\x00', 0x0, 0x10000010001)
r0 = open$dir(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
preadv(r0, &(0x7f00000012c0)=[{0x0, 0x1b}], 0x1, 0x0)


r0 = socket(0x1f, 0x1, 0x0)
fstatat(r0, &(0x7f00000001c0)='./file0\x00', 0x0, 0x0)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0/file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40043105, 0x0)


r0 = socket$inet6(0x18, 0x3, 0x0)
sendto$inet6(r0, &(0x7f0000000000)="88", 0x358, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


msgget(0x3, 0x0)
open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
paccept(0xffffffffffffffff, 0x0, 0x0, 0x0)


r0 = socket(0x11, 0x3, 0x0)
recvmmsg(r0, &(0x7f0000000340)={0x0}, 0x10, 0x403, 0x0)


madvise(&(0x7f0000ffa000/0x4000)=nil, 0x4000, 0x1)
mlock(&(0x7f0000ffb000/0x1000)=nil, 0x1000)
mmap(&(0x7f0000ff9000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)


msgctl$IPC_SET(0x0, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0xffffffffffffffff}})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt$sock_timeval(r0, 0xffff, 0x1006, &(0x7f0000000080), 0x10)


symlink(0x0, &(0x7f0000000240)='./file0\x00')
minherit(&(0x7f000000b000/0x1000)=nil, 0x1000, 0x1)
__clone(0x0, &(0x7f0000000040))
mlock(&(0x7f0000ffd000/0x2000)=nil, 0x2000)
mlock(&(0x7f000000a000/0x4000)=nil, 0x4000)


connect$unix(0xffffffffffffffff, 0x0, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
getsockopt(r0, 0x11, 0x0, 0x0, 0x0)


open(&(0x7f0000002600)='./file0\x00', 0x0, 0x0)
compat_50_setitimer(0x0, 0x0, &(0x7f0000002000))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x3}, 0x8, &(0x7f00000038c0)}, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
chmod(0x0, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)


r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x6, 0x5, &(0x7f0000000a00)="8b589d9d", 0x4)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, &(0x7f0000000000)=0x7, 0x4)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
mknod(&(0x7f0000000000)='./file1\x00', 0x0, 0x0)


mkdir(&(0x7f0000000080)='.\x00', 0x0)


socket(0x18, 0x3, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80017472, &(0x7f0000000040)=0x1ff)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
dup2(r1, r0)
poll(&(0x7f0000000040)=[{r0, 0x1}], 0x1, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
open(&(0x7f0000000280)='./file0\x00', 0x70e, 0x0)
pathconf(&(0x7f00000000c0)='./file0\x00', 0x6)


mkdir(&(0x7f0000000500)='./file0\x00', 0x0)
setreuid(0xee00, 0x0)
r0 = getuid()
chown(&(0x7f0000000180)='./file0\x00', r0, 0xffffffffffffffff)
r1 = getuid()
setreuid(0x0, r1)
chmod(&(0x7f0000000080)='./file0\x00', 0x2ea)
chdir(&(0x7f0000000100)='./file0\x00')
mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0xc0e99db6de761f86, 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
truncate(&(0x7f0000000200)='./file0\x00', 0x0, 0x7fff)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x5, 0x10, r0, 0x0, 0x0)
setrlimit(0x0, &(0x7f0000000140))
modctl$MODCTL_LOAD(0x0, &(0x7f0000001180)={0x0, 0x0, 0x0})


r0 = socket$inet(0x2, 0x1, 0x0)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x0}, 0x10)
bind$inet(r0, &(0x7f0000000080)={0x2, 0x0}, 0xc)


setreuid(0xee00, 0x0)
r0 = getuid()
seteuid(r0)
compat_20_getfsstat(&(0x7f0000000000), 0xffffffffffffff8e, 0x0)
modctl$MODCTL_STAT(0x4, &(0x7f0000000340)={&(0x7f0000000240)=""/212, 0xd4})


recvmsg(0xffffffffffffffff, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000640), 0x0, 0x0}, 0x0)
r0 = socket(0x2, 0x1, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8020690e, &(0x7f0000000180)=0x8000000000000032)


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x80044270, &(0x7f0000000080))


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0))
r1 = socket(0x1d, 0x40000003, 0x0)
paccept(r1, 0x0, 0x0, 0x20000000)


__mount50(&(0x7f0000000240)='ffs\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000300)='z', 0x1)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000580)="264b101aa2098808730d45b82d50e7508b9362e1fd3d3213f15473f2ce5f14d13165125f079ee8fc57d8b3ea4c0af729f51f66607a0e7633657987fc6d3df4799fbfa04429d84017ef8d5116d597a647812ef6ac02b7ae61072daa31c0275c5f8d01694b1334f1b8432da7bae94348f74723ed87ba08000000675dc4713b22da04fa2bf8203fa2e06419d7fec3de8d74fe01dd52c280d50c1f2f399daf4a71792cce47bf9119a642b0a64ad139797b8c5ccd410cc9497eb591e6d129223dbec41188e3681e13006e25e4282089480020bfb4944965cdddbe1d43943512977169d76aa8507ba259a6c9536e152f625bd88bf3e5495a2bc029f7fde37733b657205eabd23cafd268fe995b686ca52b92719b9b2bc21b32e50c4b656d9cba95e9e73ef6d144b35917b5d6a2e840ab8232ec28b8d6d9e25ecfd09d1761778d8b4fbac129b1161764e7cb4fd3b9e322f2d75128eee24ed797014c0e606dd63f0c7a609a4f89f81d0b25d86ae89ef274509d36ef9957932e6a5aee0387cfd106131792e8e25800000000", 0xfffffffffffffe61)


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x8, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x1000300010008, 0xffffffffffffffff})


mkdir(&(0x7f0000000040)='./file0\x00', 0x0)
compat_40_mount(&(0x7f00000001c0)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f00000000c0))
chflags(&(0x7f0000000000)='./file0\x00', 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x13, r0, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xb0afbd006181d6de, &(0x7f0000000540), 0x0)


ptrace(0x15, 0x0, 0x0, 0x0)


r0 = _lwp_self()
compat_50__lwp_park(0x0, r0, 0x0, 0x0)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000100)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
bind$unix(r0, &(0x7f0000000140)=@file={0x1, './file0\x00'}, 0xa)
open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)


r0 = socket(0x2, 0x1, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8020690e, &(0x7f0000000180)=0x8000000000000032)


compat_43_ogetrlimit(0x0, &(0x7f00000004c0))


mknod(&(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
link(&(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
symlink(&(0x7f0000000ac0)='./file0\x00', &(0x7f0000000e40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0x3f}})
socket(0x0, 0x1, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)


open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8010572a, &(0x7f0000000080))


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0xb, r0, 0x0, 0x0)


socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
r1 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r1, 0x80206913, &(0x7f0000000180))
setsockopt$sock_timeval(r0, 0xffff, 0x1005, 0x0, 0x0)


compat_40_mount(&(0x7f0000000200)='procfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
pathconf(&(0x7f00000001c0)='./file0\x00', 0x0)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc0986981, &(0x7f0000000180)=0x8000000000000032)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)
r1 = socket(0x2, 0x1, 0x0)
r2 = dup(r1)
r3 = fcntl$dupfd(r2, 0x2, 0xffffffffffffffff)
close(r3)
socket(0x2, 0x1, 0x0)
setsockopt$sock_int(r3, 0xffff, 0x0, 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0))


compat_43_osethostname(&(0x7f0000000200)="e590a201fb206660e42b30e304d412ece1c9a71a15729d2bddc60a868471bfec9e38556548ec4d3cd1b1ee4a72a807fdf74193d297b8291ae09c6bddfceca461e667e464e3f4f56379c9613e6ea66b35aecd778c70007554c081b1f06deb01db77fa34d260c253deda137ac5632c0700c12db42141efbe1217b1526c44ae54a87b5db76ec629965ddc5286d4a8993a0fd31ee3d1c061d15d24e31299c044b1e6fdf41aa6b1f86bcdbfcc9a9c38c84f35fa86ef5cddae41eb3601972cc7988213887d17d87b2d3f1bbf3bd8c0308f08b4212734bb661d9be9aea0aadaccc097a65463290323550c2bd7b88c61ad5d22a0829f00"/256, 0xffffffffffffff30)


modctl$MODCTL_UNLOAD(0x2, 0x0)
setpriority(0x3, 0x1000, 0x0)


r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
writev(r0, &(0x7f0000001480)=[{&(0x7f0000001240)="cc", 0x1}], 0x1)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x2, 0x10, r0, 0x0, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
sendmsg$unix(r2, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="28000000ffff000001"], 0x28}, 0x0)
recvmsg(r1, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x0, &(0x7f0000001380)=""/193, 0xc1}, 0x0)


compat_43_ommap(&(0x7f000064f000/0x2000)=nil, 0xfffffffffffff000, 0x0, 0x0, 0xffffffffffffffff, 0x0)
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f0000000140)="01")
__mount50(&(0x7f0000000080)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0\x00')


read(0xffffffffffffffff, 0x0, 0x0)


setreuid(0x0, 0xee01)
acct(0x0)
setreuid(0x0, 0x0)
open(0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
minherit(&(0x7f0000003000/0x2000)=nil, 0x2000, 0x0)


compat_40_mount(0x0, 0x0, 0x0, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000100), 0x0)
mkdir(&(0x7f0000000000)='./control\x00', 0x0)
open(&(0x7f0000022ff6)='./control\x00', 0x0, 0x0)
pwrite(0xffffffffffffffff, 0x0, 0x0, 0x0)
rmdir(&(0x7f0000000040)='./control\x00')


r0 = socket$inet(0x2, 0x3, 0x0)
r1 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_opts(r1, 0x0, 0x200000000000c, &(0x7f0000000480)="eaef125c00000000", 0x8)
setsockopt$inet_opts(r1, 0x0, 0x200000000000c, &(0x7f0000000000)="ea00000100000000", 0x8)
dup2(r0, r1)


pathconf(0x0, 0x0)
setpgid(0x0, 0x0)
r0 = getpgrp()
getpriority(0x1, r0)


setreuid(0xffffffffffffffff, 0xee00)
__getfh30(0x0, 0x0, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
ptrace(0x18, r0, 0x0, 0x0)


mkdir(&(0x7f0000000040)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_43_oftruncate(r0, 0xffffffffffffffff)


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
ioctl$VT_WAITACTIVE(r0, 0x20007606)


fcntl$setstatus(0xffffffffffffffff, 0x9, 0xd0ebc5fcfebcfbe6)


open(&(0x7f0000000500)='./file0\x00', 0x70e, 0x0)
fchroot(0xffffffffffffffff)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180))
r1 = msgget$private(0x0, 0x0)
msgrcv(r1, 0x0, 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_RMID(r1, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
__clone(0x0, &(0x7f0000000300))
compat_50_wait4(0x0, 0x0, 0x4, &(0x7f0000000440))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000040)='./bus\x00')
__stat50(&(0x7f0000000100)='./bus\x00', &(0x7f0000000200)={<r0=>0x0})
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
mknod(&(0x7f0000000140)='./file0\x00', 0x2000, r0)
r1 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x40000400001803c1, 0x0)
pwritev(r1, &(0x7f0000000080)=[{&(0x7f00000006c0), 0x2cfea}], 0x1, 0x0)


r0 = socket(0x2b, 0x1, 0x0)
bind$unix(r0, &(0x7f0000000580)=@abs={0x0, 0x0, 0x2}, 0x6e)


r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x29, 0x1c, 0x0, 0x0)


__mount50(0x0, 0x0, 0x0, &(0x7f0000000300)="06", 0x1)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
fork()
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
symlink(&(0x7f0000000180)='./file1\x00', 0x0)


__mount50(&(0x7f0000000000)='ntfs\x00', &(0x7f0000000080)='.\x00', 0x100, 0x0, 0x0)


r0 = socket(0x10, 0x2, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8028697b, &(0x7f0000000180)=0x8000000000000032)


shmget(0x2, 0xfffffffffeffffff, 0xa00, &(0x7f0000ffc000/0x4000)=nil)
getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000040), 0xc)
getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000080), 0xc)
getsockopt$SO_PEERCRED(0xffffffffffffff9c, 0xffff, 0x1022, &(0x7f00000000c0), 0xc)
getsockopt$SO_PEERCRED(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000100), 0xc)
semctl$SETALL(0x0, 0x0, 0x9, &(0x7f0000000040)=[0x7ff])
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x0, 0x0, {0x0, 0x1}})
r0 = getegid()
getgroups(0x1, &(0x7f0000000000)=[r0])
mprotect(&(0x7f00002f3000/0x4000)=nil, 0x4000, 0x4)
r1 = socket(0x18, 0x1, 0x0)
getsockopt$sock_cred(r1, 0xffff, 0x1022, &(0x7f0000000140), &(0x7f0000000180)=0xc)
setsockopt(r1, 0x7, 0xe, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931", 0x11)
setsockopt(r1, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r1, 0x2b, 0xf, &(0x7f0000000200)="eb7fd4ffcb02af6dd359ff13b9fd6c1daa4ebce7c11b0fcc66", 0x19)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000002a40)={&(0x7f0000000000), 0x10, 0x0}, 0x0)
syz_emit_ethernet(0x138, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r2 = socket(0x2, 0x2, 0x0)
sendmsg$unix(r2, &(0x7f0000002a40)={0x0, 0x0, 0x0}, 0x0)
r3 = msgget$private(0x0, 0x3c)
msgctl$IPC_SET(r3, 0x1, &(0x7f0000001340)={{0xffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6}, 0x800, 0xff, 0xffffffffffffffff, 0x0, 0x2, 0xffffffffffffff86, 0x7, 0x45})


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8004747f, &(0x7f00000001c0))


r0 = socket(0x1d, 0x40000003, 0x0)
writev(r0, &(0x7f0000001340)=[{0x0}], 0x1)


open(&(0x7f0000001180)='./file1\x00', 0x615, 0x0)
socket(0x0, 0x2, 0x0)
sendmsg(0xffffffffffffffff, 0x0, 0x0)
ktrace(&(0x7f0000000240)='./file1\x00', 0x4, 0x121a, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
ioctl$FIOASYNC(0xffffffffffffffff, 0xc0104302, &(0x7f00000001c0))
_lwp_wait(0x0, 0x0)
_lwp_exit()
_lwp_exit()


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)='@', 0x1)
pathconf(&(0x7f0000000080)='./file0\x00', 0x6)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_30_getdents(r0, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
pathconf(&(0x7f0000000300)='./file0\x00', 0x3)


ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000)={0x40001ff})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x1000300000001, 0xffffffffffffffff})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x0, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0x2e, &(0x7f0000000040), 0x0)
r1 = socket(0x18, 0x1, 0x0)
setsockopt(r1, 0x1000000029, 0xc, &(0x7f0000002a80)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r1, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
mknodat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
open(0x0, 0x0, 0x0)
setsockopt(r1, 0x1000000029, 0xc, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


compat_43_orecvmsg(0xffffffffffffffff, &(0x7f0000001680)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000680)=""/4096, 0x1000}, 0x0)


chflags(0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = socket$inet6(0x18, 0x3, 0x0)
poll(&(0x7f0000000000)=[{r0}], 0x1, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
linkat(0xffffffffffffff9c, &(0x7f00000003c0)='./file0/file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0xffffffffffffffff, 0x0, 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x8, &(0x7f0000000180)={0x0, 0x0, 0x0, 0x100000401})
r1 = open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
fcntl$lock(r1, 0x8, &(0x7f0000000000)={0x0, 0x0, 0xfffffffffffffffe, 0x1000300010008, 0xffffffffffffffff})
open(&(0x7f0000000000)='./file0\x00', 0x611, 0x0)
r2 = open(&(0x7f00000000c0)='./file0\x00', 0x205, 0x0)
fcntl$lock(r2, 0x8, &(0x7f0000000000)={0x0, 0x0, 0xfffffffffdfffffd, 0x1000300010008})


setrlimit(0x2, &(0x7f0000000000)={0x0, 0xffffffffffffff00})


mknod(&(0x7f0000000540)='./file0\x00', 0x6000, 0x1003)
r0 = open(&(0x7f0000000000)='./file0\x00', 0x1e, 0x0)
ioctl$WSDISPLAYIO_GET_FBINFO(r0, 0xc0204610, &(0x7f0000000100)={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @fbi_cmapinfo})


r0 = socket(0x18, 0x1, 0x0)
sendmmsg(r0, 0x0, 0x0, 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
setsockopt(r0, 0x1000000029, 0x11, 0x0, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = socket$unix(0x1, 0x5, 0x0)
ioctl$FIONBIO(r0, 0x8004667e, &(0x7f0000000080))


__select50(0xfffffffffffffe5a, 0x0, 0xffffffffffffffff, 0x0, 0x0)


_lwp_unpark_all(&(0x7f0000000000)=[0x0, 0xffffffffffffffff], 0x2, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x18289, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x0, r0)
recvmmsg(0xffffffffffffffff, &(0x7f0000000080)={&(0x7f0000000100)={&(0x7f00000006c0), 0x213, 0x0, 0x0, 0x0}}, 0x10, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
recvmmsg(0xffffffffffffffff, &(0x7f0000000700)={&(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x3f8d}, 0x10, 0x0, 0x0)
sendmmsg(r2, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)
setsockopt$sock_timeval(r2, 0xffff, 0x1005, &(0x7f0000000000)={0x0, 0x8}, 0x10)
recvfrom$unix(r1, 0x0, 0x0, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1)


r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180)=0x8000000000000032)


open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mkdirat(0xffffffffffffffff, &(0x7f0000000100)='./bus\x00', 0x0)
writev(0xffffffffffffffff, &(0x7f00000002c0)=[{&(0x7f0000000080)="76e5eac907f9ccf7a251ceddcec7d6aa45cffe2c63a56077123a276d3ba4e9d17eb3eb5db12a3783a8e0620d357de1fe04fa9465b5bd1286e9624dec06a00c222f", 0x41}], 0x1)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
symlink(&(0x7f00000000c0)='./file0\x00', &(0x7f0000000100)='.\x00')
truncate(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
mkdir(&(0x7f0000000000)='./file0/file0\x00', 0x0)
rename(&(0x7f00000000c0)='./file0/file0/..\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
mknod(&(0x7f00000000c0)='./bus\x00', 0x0, 0x4f4b)


r0 = open(&(0x7f0000000100)='.\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x1, 0x1012, 0xffffffffffffffff, 0x0, 0x0)
preadv(r0, &(0x7f0000001400)=[{0x0}], 0x1, 0x0)


ptrace(0x1a, 0x0, 0x0, 0x0)


compat_90_getvfsstat(&(0x7f0000001680), 0xffffff17, 0x0)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmsg$unix(r0, &(0x7f00000013c0)={0x0, 0x0, &(0x7f0000001340)=[{&(0x7f0000000080)="388687d9e47346edb11fd84fd36b09244ea09ec3d67efbe3056f24ac", 0x1c}, {&(0x7f00000000c0)="a8ed7f11a4fe24136c1cfc90", 0xc}, {&(0x7f0000000100)="a6307a67454080c4b305d53ee4518047140d5e622fb84b6e044f8005e146c6eff737fc729c48c5cd4daf54d0976436b9c84b3395e367b02aeafbad4f4e72d3762b9b4fcc083f47f41ccef9e1a66b8c9bc9fedd5b0f56639c6d5fcd3a98276dbad159600e2a9189c6648531eb9b106667548f9475e084689375", 0x79}, {&(0x7f0000000180)="ed4e25acf1201b6002d2f8930373a243b4d0eee6cf08286d3fa99ee1655b5a", 0x1f}, {&(0x7f00000001c0)="8403fe0f229e9b380bbb1320c5aba6273ddac6932d0fa046b14fefbb9c112d5bb151e61bdae6205a8e9caf1581391255c87e3ce196e2d0dd0fa2b0bc2eca04be22885a41ed4d249ec796c274205663820124787cfaff647567243b8b51d61f97ef5baffb19aec38fb7c6fcb12eb38ef0", 0x70}, {&(0x7f0000000240)="9936cba9007a3c0fc7d7cbde97b692dd3de1221b1f3ac3c7fc6cbe95a463946a18d6480aac7a17ac3653931dcc8d7217c07441e39716bbe4", 0x38}, {&(0x7f0000000280)="9be30fe187f2e4c7d7f4506500e02bd0ee6176c5ba82175eb76ef52302f77b7b30566d25cafe3a7f5501a5bdd5d90aa70f0635a771c1346c9c4c5bb10b151d4e57f83a395cf1b56e23215ac312e18614cc36e33d8a393979937127bc99dc7d07590123f085ab738379b0a2d5d2cbf2b29d943ec7c6fcbf1f97ed9fc5f6fb3e4b85273a77050fd6d336a85565f5269a08624dc69b0a012a0ab5efb86de4ec522873c584c8ad001dc20d0922f95d7521da7693db7f1edb8598f8080106c346daed0fbb3c1be1e6569d00c6d6c94c46dad44426c977cf217fb10b87ae03610ce5767d6d46b3af15c369fbff9381b0791b440d69106cb9b9b8740be02a76fab1a577403a45991b044086e5e1bc2a2801643e80c15db511025a290858b8046bd51e10226ea98ad218e5461603ddfa7382fd5dc1f746655cc671d6c71bcff26d2aa9a10b45c0fdf3a45717d34c7645adb9646cc12a6f65d5acbdbdae6b1677910cc22d7f59d1ad3c689448cff60e951c6717152ec7bcf54ae6b99e140277075b04b78dc303fc93565b71a0a3ff8115d5e01a2c99aad42c98216fd9a62f3c58d8986a4fd8503396a4326e6ab453d11dc83ebea498f7c883d7d09fb8fb18c5361a2220c802b46da1124ceb25deaa57f0c25f570748c10c2236bbcd76f88f6285dfcd80ec290ca405586604508e1b2a5fc08fd7e934a0f6a0f59d63e707c29656f5d1d07ec70f8d89e8b3b81ce9730466014c88c40f2f95f49534b303159da8b8db56b4f32f4650954f48cd04298294c8cc42a0fca7d80186c7b33a006651c7a4f7df419201258cb17d320e7b8342327b28c0bec76e98b8d7bd67e63aafea075bad5aecff4fd00bf758ba7394b1fed11657bcdcb08ef51f8b3054c64cb7c51a108d97e2486c9cec86f484660fa0b9c04de065abc694f8ab6e548bc70decfeb18c2bf32e7224d3e4311d4e8f93fb01f24f395fac3685bb8ebecfddac01dbdc0f6bf9454518a6bce84e2e4b5318699eee98020cd5be5d407b349ed16b83127e6f4eec00aadc2f833812320d8f031f4df458ab79d6e23af0a8ae068271b93f46880e4b12b84aafb969a4a0136071f12abcf3cc17ac4612a28eac646d4bb125eb7ac7acdb3997804c4a76dbecd1b4c9ec66f4feb62bf819f8bd20b7cff9d98cd886f5d4b2b81e6aed8cb19c8a1c3600fad2771e12244efb1291456a36a61a5caeb82d2dce38fe01b332d3dcfdeecc1c6b15305d11996dc1ac861cf9b3e37bed6a033deb75cdfe1f3780f1a4159add5094c08d938af5a43fb201494d655b2d2a6fac2d788a07b70c54f59e13aba39b5a63445e468f5ff09c9439baddda0b3dff1178835bfc8c2b905669cb2764267bf8e9712cd2115b74df62b23234e7e9681be1a5499cb99a98c159d3d2900f710f829394442e062c448703afd133a2e636325ca6cdf115c9af336e874ce175ed250c1164514d0ab75142773c417540cc5b8ab661d616eddfaefb8044a1f950932a5b7ddc3c2704e382588c6e8132a862ca432c7145c27ea25505bdfd1587bb1f88f4b6384684ab9bedbe608d814eff7224f99651c748ee91eb5e899b79f92353908d6114b979b693472d97a8c8088f010469c423f1861104c8b31142d565c6b70dfe8133cf7f8b7274d720bd48a5de15012e1739b76cd41df6c9cc647e7b5be1de705debd5ee36517550c42bb306fae5528662bff1b9338fafc0a88b9174506fbd3cd127cf4038ad1b21afe2991feeb1dd4112fafec4b459c810cce9a7b79af68922db2f86ae487f2e906dc8275e581d5e0e3320bf385d7951176c34cc1fc
336e23724abdf2d9d4c4e6bc22c572a8d77c8011c7c5d9692225d0e2bc56a8cc95835fa784e63e2ebab3cef76b5e27264d3a6a8cbcbc83cd1e01205919fa18ec2ab821838851dbf9961dd46ad6c0498f8755fce7dc584593df64ba628052e94ae71644fb58a1ddef434a48eed3500e744d4787c0728c195b34e1393780d83b5da44cbf0d0b921a7b2091dc07132f9a55ea043e68477078138a9dc6163d159d3c21516fc008e57ca912318085a12ceb422763e8c288b19a326345329839807b1f4c602bd9c921c0252a1505c47c5854458cc5c283f0e13450d36cc53a1a186aebacb0172019f8d89b959997a1115360d12c821f9cf09a710b6ff8c83860cdbd301f377c15292674e974e15e6502612a72b0135f3afa806dcbac9dda5d48e54bc50096fb273494e6ff20601c0c05efa52753173179fc2eb1c7d846d96c24945f1e47456393c0ffacf69156815b563d34375f8dedebe0676338848954bcdfa4d79851d817133ce128e108e38c2e2697a6f600afb1bb08ca00abea33025d28e2f507b6f2ff559444fc57835a7a002f9402b17a487f862157ca04a070fdbbe87b9b5c4e695fa1ecbfcb31d26683defe5205945e70ead4dc8d7465b7c81a2a3d40bf107b3049fa1e205cf376d62ead24ca097b2f830c7e08d192d071a55629e1c2a4bfc65cade5b0f8e9eb7820eed10e94665c4bcfa19abb90c62a6b02365df909092eef0397ee4a4f7dd622b5051ae3912d5b9537ada80434b9ade470ebe72e29b6217e93d071b7fe93e89d5ea8c886dad426d2e4a81ad720e03c8ba17cda967159548ff544f462670339cab15da5eeaa79e0f4282895afb485848bb2dd44a82839679572f038a9e331380d7b457772424dbf14e1f9eed9164a5b16d9c63884df543c796047369136f8cb5fa92a7dab40193232117fac1218121a68b3b393fa38702d79abcc07d7fd41501274dbc47c21fb12a24b512c0a22439a968cc90421e7acfcbaa36c4386aa7cc7bff0a6ed8a4bc50c680300cc76277b81d4a01daf855017adb7ee900800235cb3724dbc69498c288ec3f4b4e556390c74894e472bf6abc097bdbb8fba803839d85bfb97d77ad45b3092d7d6109fe481d12b555d3e9b6eb079cbb11166c9c469d3b2ea35583f727f32f74127d152ec5f67556288b2d0f1b680e6fa04c213bfbc75d613529e2e075275e6af82358ed372633282e9be63abea9928603d51cc14ae0a65ea17efb5c29af97af4bdfab907aeedd5d7d88e5cce174e4600192e73c0057abb32d5cc1839f6b753baea4355530ec11229155f020a6e5087a5b540abcf1ea370fa1f09c12e7fdfbb5797bc7f3b35c5e97721bb6953ca5ceac95e8e0d23135c7e22baff9bd8ddad9c6c5e751d975656fab25ddfd07f438bff67589501728f0eec5ea03d33881191f63764b16fca280d24600994d4d4d876f813a6a1a5116aeedb0d1d357dfc402f0898805c37f49de1a06615d52839f4d670dcc1e440fce7f8b749f281f8a4e9dffcd82f2a485ca4db790904272ad432359f9526e8ca4cd7d79834339757474e02508884c052e9da27880c614b97373631897e8ddc84fcb9952837addd5f91077fc0c618418675c2827b51c136096bf4199d5b6675f94cf04c3c7ea05f6930e4ea22fe2ae04f540c811c280d7c4d718e86ac98bf4caa688cc9826fdf03f98beb5288b04b55053153aa664e3eb7320a861542bb90319fb07333412efea40db712df4ed4b83794578e2a26345b3fcef005571de3a2efac0652a600152ff91fe9b1859d6f037abd139cf7c29fe29a75b3b58b3dc24f3cbea12a4a72003485c6dfe78ab2593fa0c3f8ac016b94114f2d1ae8fac274ec1706b4cc4de25753a010ed365cd97199480bcc7ecca8c4f977e21052a1ce9cd03f4d9510272935725da35362f63a448a7d11c6b15115df602a4317eefbbdd03315088cc226231fd3ec1ff277c9195f9f2db116cdd4fa8576394c000ac320e23cd215f049857cb0b9f0b888aeeb5ab52cc95248b455aec53181412fc1cbe437b04ea00bd9d2946eb63b839fe775263ede3e202cdbcf5c7c1e42afe41503c2c6020779a33bc9947e2df489dd04010a00a649494e9e491b19bd5b6214f558ef58032bf7a9c2395d0c007369b5a76911f8270842bad774df256422037e3a392b52bfbf39dbc70de1d1a9b3c8f58360e2c57ebe77e47703c1a3bbadd9912cd25410e73cd3e7451a242683bd17fac8856710e9f9ae5d1bb7003027241f546925b43206aaca9adb5b1b49023d0ff3a248d9234e40097592348722ca47228bca6b09d0f63f9e8cb58c6e2304de169336f7944e3c7b09966e729c48f50c2614c8e8a5db43d01aa90f752eed077eee35e0079b500c753ebacbc29e80d7e4d4eace3e80719de7063a104c99d26c88c89f22124478f803287a2361df05aa5d72b9d170f3b40b633ad9b203f70a597dd3b753e5d9db07987d3c35a72b65b437607de6915c70769f96398adac7bb69268c570f28470
a48428ff42ade616a885de4f84c4401c144d160df8469d4ae296dbdeec0c2e966b7b4e08d762d5c2088dbce5f0ad62c72f70e6efb27b832b057b954fc706bb70c488b9452017b5e8f5652ad2d41e6690a00960000f32bdc2039470dc0ea0694c7acb3ce57929dfafde387e0b58ed00b186ffce123c01444bba881171a067cb43981ca283a8e11108f6221f90f9e097c8055333fd5e450c9efe905f95c61a26a4f235f9c9965a84a2864e98f30d7ae947b9f5a1a4ed1e4f800de8b3837d108831afebfbdb78b361bf9fcaf90b78aeff2878df08feaa05681b2f1ca4ffa82ead092e1cc7a08bebc996d33957dee7a84c806ad1d88b6c1a257d5666cb00f6931567939d8e84a9f299ff93caa465e12e3500758e85e69d6cf4d27a61beddc7d6272efff650e4c9ee0d5f2e0d941c2f28388fb6e21581c571aab7a1512b8c25014d6c2cc304cfd3432c1f00bc9ce5a455fc3ba383025569b6e775f02c008548653de90024340e775c24140523cb5480a05594486e69f8555746f94a1ac7d742671a589c6e7e1683781c7f069822243fc5a8dab7655a16d54b247a44c8a2cb184f51a5679f3904b8cf490f103f86a627225d460e9edea67e470f14c5564ed2189a0d1e2bb98cfbc836917c247f106f292dad9794a8d3c6333f95920c81342d7af2f2e2f029796fe6d3c9f76bf17df18d328c61462fc65897ded99a0ec21db6df615d4d48bebd8548950233fca3b478e4de9b2f46040030897290c0c7205397870675c480a715281cba5d4e78528c5cf5d61168e433a4eb04085eb2f60c490a5b8dc6e8083db6d9dc372706fb185809f63bc854d625279d7d7f", 0xe4d}], 0x7, 0x0, 0x0, 0x2}, 0x0)


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
truncate(&(0x7f0000000200)='./file0\x00', 0x0, 0x7fff)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
sendto$unix(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
__posix_fadvise50(r0, 0x0, 0x0, 0x0, 0x3)


r0 = _lwp_self()
_lwp_suspend(r0)
_lwp_detach(r0)


mkdir(&(0x7f0000000340)='./file0\x00', 0x0)
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
read(r0, 0x0, 0x0)


connect$unix(0xffffffffffffffff, 0x0, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
getsockopt(r0, 0x0, 0x6, 0x0, 0x0)


compat_43_osend(0xffffffffffffffff, &(0x7f0000000080)="6b0451b0f8fad7242e718e3071633a6179f26b6e748203eac7a8b896b69a5c45e89c51c5667a7b992b7347c4bf4c0ece77fba5f7c779e5c3ecee3a66c445bc25127eff9395afc5d0386cceba1a3411ed1137d8b211acbe28739f4d0d221b32c71611583ed84b83b04a9548dcb8b7a1accfcd5c96f3278a5da2d99a693059d82a33c493bd546ea12d64ee6f968930df3d673280ee33f44c407a380abea0ee4a9b60115538242e39244e5d1f6fd305c1bdee780a51f71b668802603d186dea25d115dab7841eb7efbea971dcf3e254da6686f3b2058b6a17bf9a92265777f7e2010f8c122a6a5d005def7eb1faf5f59e6a51f164e1f2d914eff1f5f191cec8203f25e551fda914eb8e38124d6b54402a6f158ce5b4081bb7b3bea9da488d17799891a70f936993b962f89258ade53db8eb807b3c646ad23894b53bf33250b10d9a6bda6b9c46", 0x145, 0x0)
mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
r0 = open$dir(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x82085269, &(0x7f00000001c0)=0x8)


mknod(&(0x7f0000000280)='./file0\x00', 0x2000, 0x200)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x1, 0x10, r0, 0x0, 0x0)
getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000140), &(0x7f0000001a00)=0xc)


mknod(&(0x7f0000000140)='./bus\x00', 0x2000, 0x4f4b)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
__getfh30(&(0x7f0000000140)='./file0\x00', 0x0, &(0x7f0000000280))


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f0000000140)="01")
open(&(0x7f0000000480)='./file0\x00', 0x7fffe, 0x0)
compat_50_quotactl(&(0x7f0000000000)='./file0\x00', 0x10001, 0x0, &(0x7f0000000040))


link(&(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)
r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80206916, &(0x7f0000000180)=0x8000000000000032)


writev(0xffffffffffffffff, &(0x7f0000000200)=[{&(0x7f0000000040)="d9bdb35356fcfaba44b2a3a9a27609f8da050000000000000088c1634fe44c3d26f2d741f5de1de66ed5b66139b08ccaeb0c21a4935045a1234067ef05c0aeb579070b1ef6b87bd75690", 0x4a}], 0x1)
mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
r0 = open(&(0x7f00000003c0)='./file0\x00', 0x1, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(r0, 0x8010572b, &(0x7f0000000080))


symlink(&(0x7f0000000640)='./file0\x00', &(0x7f0000000680)='.\x00')


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
preadv(r0, &(0x7f0000000a80)=[{&(0x7f0000000080)=""/185, 0xb9}, {&(0x7f0000000300)=""/243, 0xf3}], 0xac01, 0x0)


syz_usb_connect$hid(0x0, 0x0, 0x0, 0x0)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
shmat(r0, &(0x7f0000001000/0x3000)=nil, 0x0)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x80000000002)
modctl$MODCTL_UNLOAD(0x2, 0x0)
socketpair(0x1, 0x2, 0x0, &(0x7f0000001640)={<r1=>0xffffffffffffffff})
recvmmsg(r1, &(0x7f0000001dc0)={0x0}, 0xb, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0xd, r0, &(0x7f0000000240), 0x8)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
open(&(0x7f0000000080)='./file0\x00', 0x70e, 0x0)


r0 = _lwp_self()
_lwp_wait(r0, 0x0)
_lwp_continue(0x0)
_lwp_wakeup(0x0)
_lwp_wakeup(r0)


mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
chmod(&(0x7f0000000280)='./file0\x00', 0x3a)
setreuid(0x0, 0xee01)
__utimes50(&(0x7f0000000340)='./file0\x00', 0x0)


r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r1, &(0x7f0000000080)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
listen(r1, 0x0)
connect$unix(r0, &(0x7f0000000280)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
r2 = dup2(r1, r0)
r3 = accept$inet6(r2, 0x0, 0x0)
getpeername$unix(r3, 0x0, &(0x7f0000000100))


mknod(&(0x7f0000000480)='./file0\x00', 0x2000, 0x1733)
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_43_oaccept(0xffffffffffffffff, &(0x7f0000000040)=""/10, 0x0)
pipe2(0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)
ioctl$FIONREAD(0xffffffffffffffff, 0x4090426b, 0x0)


compat_50_setitimer(0x1, &(0x7f0000000140)={{0x0, 0x4}, {0x0, 0xffff}}, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x40043105, 0x0)
__setitimer50(0x1, &(0x7f0000000000), 0x0)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000200)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000640)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000080)=ANY=[@ANYBLOB="18000000ffff000001"], 0x18}, 0x0)
sendmmsg(r0, &(0x7f0000000000)={0x0}, 0xffffffffffffff2d, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
__mount50(&(0x7f0000000040)='kernfs\x00', &(0x7f00000000c0)='./file0\x00', 0x0, 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x3, &(0x7f00000001c0))
chroot(&(0x7f0000000000)='./file0\x00')
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)


r0 = open(&(0x7f0000000280)='./file0\x00', 0x615, 0x0)
compat_43_ommap(&(0x7f0000fff000/0x1000)=nil, 0x1000, 0x0, 0x0, r0, 0x0)
r1 = open$dir(&(0x7f0000000000)='./file0\x00', 0x2, 0x0)
writev(r1, &(0x7f0000000340)=[{&(0x7f0000000000), 0x2cfea}], 0x1000000000000013)


symlink(&(0x7f0000000300)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000001080), 0x0)
r0 = open$dir(&(0x7f0000001240)='.\x00', 0x0, 0x0)
compat_30_getdents(r0, 0x0, 0x0)


open(&(0x7f0000000140)='./file0\x00', 0x78e, 0x0)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
fcntl$lock(r0, 0x8, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100080004})
r1 = open(&(0x7f0000000240)='./file0\x00', 0x205, 0x0)
fcntl$lock(r1, 0x8, &(0x7f0000000280)={0x2, 0x0, 0x0, 0x1000301010009})
r2 = open(&(0x7f0000000400)='./file0\x00', 0x0, 0x0)
fcntl$lock(r2, 0x9, &(0x7f0000000140)={0x0, 0x0, 0xffffffffffffffff, 0x269000000, 0xffffffffffffffff})


symlinkat(&(0x7f0000001040)='./file0/file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/file0\x00', 0xffffffffffffff9c, 0x0)
recvmmsg(0xffffffffffffffff, &(0x7f0000000080)={&(0x7f0000000100)={&(0x7f00000006c0), 0x213, 0x0, 0x0, 0x0}}, 0x10, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000340)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmmsg(r0, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
setsockopt$inet_opts(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x0, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
msgctl$IPC_SET(0x0, 0x1, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80283103, &(0x7f0000000040))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, 0x0)


pipe2(&(0x7f0000000000)={<r0=>0xffffffffffffffff}, 0x400004)
setsockopt(0xffffffffffffffff, 0x1000000000029, 0x0, 0x0, 0x0)
getsockname$inet(r0, &(0x7f00000000c0), 0x0)


setrlimit(0xb, &(0x7f0000000000)={0x0, 0xffff})


r0 = open(&(0x7f0000000100)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f00000000c0)=[{&(0x7f0000000280)='#', 0x1}, {&(0x7f0000000000)="8d", 0x7ffffffffffffffe}], 0x2)


__mount50(&(0x7f0000000000)='overlay\x00', 0x0, 0x0, 0x0, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x8000000000000001})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000180)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xc, &(0x7f0000000040)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
setsockopt(r0, 0x1000000029, 0xd, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)


compat_50_futimes(0xffffffffffffffff, 0x0)


socket(0x0, 0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f0000000400)='@', 0x1)
setreuid(0x0, 0xee01)
setsockopt(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
open(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)


compat_40_mount(0x0, 0x0, 0x0, &(0x7f00000002c0)="01")
compat_40_mount(&(0x7f0000000280)='procfs\x00', &(0x7f0000000240)='.\x00', 0x0, &(0x7f00000002c0))
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pathconf(&(0x7f0000000040)='./file0/file0\x00', 0x7)


r0 = getpid()
fktrace(0xffffffffffffffff, 0x0, 0xf0709cfa615b9be3, r0)
fchflags(0xffffffffffffffff, 0x0)


mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x0, 0x1012, 0xffffffffffffffff, 0x0, 0x0)
compat_50_select(0x0, 0x0, 0x0, 0x0, &(0x7f0000000100))


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
__clone(0x0, &(0x7f00000001c0))
__wait450(0x0, &(0x7f0000001400), 0x4, 0x0)


compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000080)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)={0x0, 0x0, 0x0, 0x3}}})
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmsg$unix(r0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="28000000ffff000001"], 0x28}, 0x0)


r0 = _lwp_self()
_lwp_exit()
_lwp_suspend(r0)


r0 = fcntl$dupfd(0xffffffffffffffff, 0xb, 0xffffffffffffffff)
write(r0, 0x0, 0x0)


_ksem_init(0x0, &(0x7f0000000100)=<r0=>0x50535244)
_ksem_close(r0)


chroot(&(0x7f0000000000)='.\x00')
ktrace(0x0, 0x0, 0x0, 0x0)
compat_20_getfsstat(0x0, 0x0, 0x2)


__mount50(&(0x7f0000000180)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
pathconf(&(0x7f0000000080)='./file0\x00', 0x1)


r0 = socket(0x1, 0x2, 0x0)
ioctl$FIONREAD(r0, 0xc0106914, &(0x7f0000000080))
r1 = open$dir(&(0x7f0000000080)='.\x00', 0x0, 0x0)
mkdirat(r1, &(0x7f0000000180)='./file1\x00', 0x0)
chdir(&(0x7f0000000040)='./file1\x00')


connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x8)
syz_emit_ethernet(0x2a, &(0x7f0000000200))
r0 = socket(0x2, 0x2, 0x0)
getsockname$unix(r0, &(0x7f0000000000)=@abs, &(0x7f0000001200)=0x8)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket(0x2, 0x3, 0x0)
connect$unix(r1, &(0x7f0000000000), 0x10)
setsockopt(r1, 0x0, 0x2, &(0x7f00000000c0)="63e1303d", 0x4)
write(r1, &(0x7f0000000200)="f92a9bd300"/20, 0x14)


mknod(&(0x7f0000000040)='./file0\x00', 0x6000, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
link(&(0x7f0000000940)='./file0\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
link(&(0x7f0000001240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
symlink(&(0x7f0000001640)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/../file0\x00', &(0x7f0000000e40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000600)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000f40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000000300)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000200)='./file0\x00')
r0 = open(&(0x7f0000000300)='.\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f0000000340)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0)
rename(&(0x7f0000000440)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', &(0x7f0000000a40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


symlink(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/../file0\x00', 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
setsockopt(0xffffffffffffffff, 0x0, 0x0, &(0x7f0000000000)="5ab777", 0x3)
ioctl$FIOASYNC(r0, 0xc0104304, &(0x7f00000001c0)=0x20000002)


modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = socket$inet6(0x18, 0x3, 0x1f)
setsockopt$sock_int(r0, 0xffff, 0x10, &(0x7f0000000000), 0x4)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0xd02)
chroot(&(0x7f0000000000)='./bus\x00')


open(0x0, 0x0, 0x0)
r0 = socket(0x18, 0x3, 0x0)
setsockopt$inet6_MRT6_ADD_MFC(r0, 0x29, 0x68, 0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_30___stat13(&(0x7f00000000c0)='./file0\x00', &(0x7f0000000200))
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))


fcntl$lock(0xffffffffffffffff, 0x0, 0x0)
compat_50_select(0x0, 0x0, 0x0, &(0x7f0000000080), &(0x7f00000000c0)={0xffffffffffffffff})


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open$dir(&(0x7f0000000b80)='./file0\x00', 0x0, 0x0)
mknod(&(0x7f0000000000)='./bus\x00', 0x0, 0x2e00)
ioctl$FIOASYNC(r0, 0xc01c5005, &(0x7f0000000000))


recvmmsg(0xffffffffffffffff, 0x0, 0x0, 0x0, &(0x7f0000002300)={0x0, 0x20000000e47})


setpriority(0x0, 0x1000, 0x0)
setpriority(0x1, 0x401, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
compat_43_ocreat(0x0, 0x0)
fchdir(r0)


open(&(0x7f0000001640)='./file0\x00', 0x615, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
compat_43_ogetdirentries(0xffffffffffffffff, 0x0, 0x0, 0x0)


mknod(&(0x7f0000000400)='./file0\x00', 0x2000, 0x287e)
open$dir(&(0x7f0000000b80)='./file0\x00', 0x0, 0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x3})
poll(&(0x7f0000000000)=[{}], 0x20000000000000fe, 0x0)


open$dir(&(0x7f00000002c0)='./file0\x00', 0x200, 0x0)
open(0x0, 0x0, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
fpathconf(0xffffffffffffffff, 0x0)


compat_40_mount(&(0x7f0000000380)='tmpfs\x00', &(0x7f00000003c0)='.\x00', 0x0, &(0x7f0000000140)="01")
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_40_mount(0x0, &(0x7f0000000580)='./file0/../file0\x00', 0x0, 0x0)


_ksem_init(0x0, 0xfffffffffffffffe)


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
r0 = getpid()
__fstat50(0xffffffffffffffff, 0x0)
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x40000424, r0)
open(0x0, 0x0, 0x0)
mmap(&(0x7f0000000000/0x13000)=nil, 0x13000, 0x0, 0x10, 0xffffffffffffffff, 0x0, 0x0)
compat_30_getdents(0xffffffffffffffff, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
connect$unix(0xffffffffffffffff, &(0x7f0000000080)=@file={0x0, './file0\x00'}, 0xa)


munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
shmat(r0, &(0x7f0000001000/0x3000)=nil, 0x0)
shmdt(0x0)


semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x200}})
r0 = socket(0x18, 0x3, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r0, &(0x7f00000000c0), &(0x7f0000000000)=0xffffffffffffff35)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
ioctl$WSMUXIO_INJECTEVENT(0xffffffffffffffff, 0x80185760, &(0x7f0000000000))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket(0x18, 0x1, 0x0)
setsockopt(r1, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e6993192964a", 0x14)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)


mknod(&(0x7f0000000200)='./file0\x00', 0xc035cd953ea0fd64, 0xffffffffffffffff)


fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f00000000c0)={0x0, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff})
fcntl$setstatus(0xffffffffffffffff, 0x4, 0xc0)
r0 = socket(0x18, 0x3, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockopt$sock_cred(r0, 0xffff, 0x1022, &(0x7f0000000000), &(0x7f0000000040)=0xc)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket$unix(0x1, 0x2, 0x0)
getsockopt$sock_int(r1, 0xffff, 0x2000, &(0x7f0000000380), &(0x7f0000000040)=0x4)
getsockopt(r0, 0x29, 0x2c, 0x0, 0x0)


sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f00000000c0), 0x1c, 0x0}, 0x0)
r0 = socket(0x18, 0x3, 0x0)
close(r0)
r1 = socket(0x800000018, 0x2, 0x0)
syz_emit_ethernet(0x2a, &(0x7f0000000080))
setsockopt$sock_int(r1, 0xffff, 0x1000, &(0x7f0000000000)=0x7, 0x4)
bind$unix(r1, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r2 = socket(0x18, 0x2, 0x0)
r3 = dup2(r0, r2)
sendmsg$unix(r3, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)


setrlimit(0x0, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000100)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
bind$unix(r1, &(0x7f0000000040)=@file={0xd19450564dee018c, './file0\x00'}, 0xa)
connect$unix(r1, &(0x7f0000000000)=@file={0xd1653077bafa0114, './file0\x00'}, 0xa)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000100)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
connect$unix(r2, &(0x7f0000000000)=@file={0xd1653077bafa0114, './file0\x00'}, 0xa)
connect$unix(r0, &(0x7f0000000180)=@file={0x0, './file0\x00'}, 0xa)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
compat_40_mount(&(0x7f0000000180)='ptyfs\x00', &(0x7f00000002c0)='./file0\x00', 0x0, &(0x7f0000000500))
lchown(&(0x7f0000000100)='./file0\x00', 0xffffffffffffffff, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_50_select(0x40, &(0x7f0000000000)={0x9}, 0x0, 0x0, 0x0)


openat$wscons(0xffffffffffffff9c, 0x0, 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0/file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


_lwp_ctl(0x0, &(0x7f0000000040)=0x0)
__fhstat50(&(0x7f0000001180)='%', 0x1, &(0x7f0000001200))
setuid(0x0)
modctl$MODCTL_LOAD(0x0, 0x0)
setgroups(0x0, 0x0)
mknod(0x0, 0x0, 0x800)
compat_50___lstat30(0x0, 0x0)
r0 = semget$private(0x0, 0x4000000009, 0x0)
semop(0x0, 0x0, 0x0)
getgid()
compat_50_____semctl13$IPC_STAT(r0, 0x0, 0x2, &(0x7f0000001300)=@array=&(0x7f00000012c0))
semctl$GETNCNT(0x0, 0x0, 0x3, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f0000000140)='.\x00', 0x0, &(0x7f0000000080)="01")
mkdir(&(0x7f00000000c0)='./file0\x00', 0x0)
r0 = open$dir(&(0x7f00000000c0)='.\x00', 0x0, 0x0)
mknodat(r0, &(0x7f0000000580)='./file0/file1\x00', 0x8000, 0x0)
rmdir(&(0x7f0000000180)='./file0\x00')


getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x11, 0x0, &(0x7f0000000080))
mlockall(0x2)
mprotect(&(0x7f000029c000/0x3000)=nil, 0x3000, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
ktrace(0x0, 0x4, 0xd27d43220c7df9b, 0x0)
__setitimer50(0x0, 0x0, &(0x7f0000001300))


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80047308, &(0x7f0000000180))


ptrace(0x27, 0x0, 0x0, 0x0)


shmget(0x1, 0x1000, 0x0, &(0x7f0000ffd000/0x1000)=nil)
r0 = socket$unix(0x1, 0x2, 0x0)
sendmsg$unix(r0, &(0x7f00000013c0)={&(0x7f0000000000)=@abs={0x0, 0x0, 0x1}, 0x8, 0x0}, 0x4)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x1604)
open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)


mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
ioctl$WSDISPLAYIO_GCURPOS(r0, 0x2000745e, 0x0)
ioctl$WSKBDIO_GETMODE(r0, 0x40045714, 0x0)


socket(0x18, 0x1, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
recvmmsg(0xffffffffffffffff, &(0x7f0000000700)={&(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000680)=""/100, 0x64}, 0x3f8d}, 0x10, 0x0, 0x0)
sendmmsg(r0, &(0x7f0000000080)={0x0}, 0x10, 0x0, 0x0)
poll(&(0x7f0000000000)=[{}], 0x4e8, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
r1 = open(&(0x7f00000002c0)='./bus\x00', 0x0, 0x0)
read(r1, &(0x7f0000000180)=""/198, 0xc6)
ioctl$FIOASYNC(r0, 0x80017472, &(0x7f00000001c0))


r0 = socket(0x18, 0x2, 0x0)
setsockopt$sock_int(r0, 0xffff, 0x0, 0x0, 0x0)
r1 = compat_30_socket(0x18, 0x3, 0x0)
ioctl$WSMOUSEIO_SRES(r1, 0xc028756b, &(0x7f0000003440))


msgctl$IPC_SET(0x0, 0x1, 0x0)
r0 = msgget$private(0x0, 0x0)
msgsnd(r0, &(0x7f00000003c0)=ANY=[@ANYRESDEC=r0], 0x15, 0x0)
msgsnd(r0, &(0x7f0000000180)=ANY=[@ANYBLOB="0300000000000000a486714b3b6964c6220190d7f39c044dac99fec5afca3ec3e155903698d635e2ab348195cce43ab9e134935e4edf5efe4e5ec4bec02d51201f93b9860f69d58fca21e1f36041df344b049af8bf321177b2fdc7cc2725691dc000"/110], 0x6e, 0x0)
msgrcv(r0, &(0x7f0000000380), 0x8, 0x3, 0x1800)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
ptrace(0x10000000021, r0, &(0x7f0000000000), 0x2)


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80206913, &(0x7f0000000180))
clock_nanosleep(0x20000000, 0x0, &(0x7f0000000000), 0x0)


mknod(&(0x7f0000000040)='./file0\x00', 0x201a, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
write(0xffffffffffffffff, 0x0, 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0xb0afbd006181d6de, &(0x7f0000000540), 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)


r0 = fcntl$dupfd(0xffffffffffffffff, 0xb, 0xffffffffffffffff)
fcntl$setown(r0, 0x6, 0x0)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80104301, &(0x7f00000001c0))


modctl$MODCTL_UNLOAD(0x2, 0x0)
syz_usb_connect$printer(0x0, 0x2d, &(0x7f0000001840)={{0x12, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x525, 0xa4a8, 0x40, 0x1, 0x2, 0x3, 0x1, [{{0x9, 0x2, 0x1b, 0x1}}]}}, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$sock_timeval(r0, 0xffff, 0x8, 0x0, 0x0)


semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x200}})
r0 = socket(0x18, 0x3, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r0, &(0x7f00000000c0), &(0x7f0000000000)=0xffffffffffffff35)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f00000000c0), 0x1c, 0x0}, 0x0)
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
setsockopt(0xffffffffffffffff, 0x0, 0x0, &(0x7f0000000000)="ff", 0x1)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = socket(0x18, 0x400000002, 0x0)
setsockopt(r1, 0x1000000029, 0x2e, &(0x7f0000000000)="ebffcbff13b9fd812eaa4e713048e69931929648", 0x14)
r2 = socket(0x18, 0x1, 0x0)
r3 = dup2(r1, r2)
sendmsg$unix(r3, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f00000003c0)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
mkdir(&(0x7f0000000000)='./control\x00', 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000140)='./file0\x00', 0x0, 0x0)
fdatasync(r0)


mkdir(0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = getppid()
getpriority(0x2, r0)


_lwp_getprivate()


clock_nanosleep(0x0, 0x0, &(0x7f0000000140)={0x0, 0x6}, 0x0)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendmsg$unix(r1, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, 0x0, 0x28}, 0x0)
recvmsg(r0, &(0x7f0000000780)={0x0, 0x0, &(0x7f0000000640)=[{&(0x7f0000000080)=""/23, 0x17}], 0x1, 0x0}, 0x42)


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
_lwp_unpark(0x0, 0x0)


getpid()
compat_50_nanosleep(0x0, 0x0)
minherit(&(0x7f0000ffe000/0x2000)=nil, 0x2000, 0x2)
r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x8020690c, &(0x7f0000000180)=0x8000000000000032)


r0 = open(&(0x7f0000000180)='./file0\x00', 0x80000000000206, 0x0)
ktrace(&(0x7f0000000000)='./file0\x00', 0x4, 0x310, 0x0)
fsync(r0)


r0 = msgget$private(0x0, 0x0)
setuid(0xee01)
msgctl$IPC_SET(r0, 0x1, &(0x7f0000001840))


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
symlink(&(0x7f0000000080)='.\x00', 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
compat_50_select(0x40, &(0x7f0000000080), 0x0, 0x0, 0x0)
shutdown(r0, 0x1)


r0 = getsid(0x0)
ptrace(0x9, r0, 0x0, 0x0)
compat_50_wait4(0x0, 0x0, 0x0, 0x0)
openat$tprof(0xffffffffffffff05, &(0x7f0000000000), 0x0, 0x0)
ptrace(0x29, r0, &(0x7f0000000000), 0x0)


poll(&(0x7f0000000000)=[{}], 0x4e8, 0x0)
setegid(0x0)
madvise(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x6)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


open$dir(0x0, 0x0, 0x0)
setsockopt$sock_timeval(0xffffffffffffffff, 0xffff, 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
r0 = socket(0x2, 0x2, 0x0)
compat_30___fstat13(r0, 0x0)


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f0000000280)='./file0\x00', 0x1ffa, 0x0)
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)


r0 = socket$inet(0x2, 0x2, 0x0)
r1 = socket(0x11, 0x3, 0x0)
ioctl$FIOASYNC(r1, 0x8004667d, &(0x7f0000000180)=0x7fffffff)
fcntl$lock(r0, 0xa, 0x0)


compat_30___stat13(&(0x7f0000000180)='./file0\x00', 0x0)


chroot(&(0x7f0000000000)='.\x00')
open(&(0x7f0000000040)='./file0\x00', 0x10386, 0x0)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x0, 0x0)
compat_20_fstatfs(r0, &(0x7f0000000200))


compat_43_ocreat(&(0x7f0000000000)='./file0\x00', 0x0)
truncate(&(0x7f0000000340)='./file0\x00', 0x0, 0x1001)
acct(&(0x7f0000000080)='./file0\x00')


r0 = socket$inet(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x0}, 0x10)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x0}, 0x10)
compat_43_osend(r0, &(0x7f0000000200)="b72a", 0x2, 0x5)
recvfrom$inet(r0, &(0x7f0000000080)=""/67, 0x43, 0x0, 0x0, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
sendmsg$unix(0xffffffffffffffff, &(0x7f0000000240)={&(0x7f0000000180)=@abs={0x0, 0x0, 0x2}, 0x8, &(0x7f0000000200)}, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x78e, 0x0)
writev(r0, &(0x7f00000000c0)=[{0x0}], 0x1)


setrlimit(0x7, &(0x7f00000000c0))
__clone(0x0, 0x0)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendto(r1, 0x0, 0x0, 0x0, 0x0, 0x0)
recvmsg(r0, &(0x7f0000000600)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x1002)


r0 = socket(0x2, 0x1, 0x0)
close(r0)
fsync(r0)


swapctl$SWAP_ON(0x1, 0x0, 0x80231260)


setreuid(0x0, 0xee01)
setgroups(0x0, 0x0)


open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0xd27d43220c7df9b, 0x0)
mknod(&(0x7f0000000180)='./file0\x00', 0x2000, 0x202)
compat_43_osethostid(0x0)
r0 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x3, 0x10, r0, 0x0, 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
__clone(0x0, 0x0)
mlock(&(0x7f0000009000/0x2000)=nil, 0x2000)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0xc010447d, &(0x7f0000000040))


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
semop(0x0, &(0x7f0000000180)=[{}, {}, {}, {}, {}, {}, {0x0, 0xfffb}, {}, {0x3}], 0x9)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


sendmsg$unix(0xffffffffffffffff, &(0x7f0000000100)={&(0x7f0000000080)=@file={0x0, './file1\x00'}, 0xa, 0x0}, 0x0)
r0 = open(&(0x7f0000000200)='./file0\x00', 0x200, 0x0)
fcntl$lock(r0, 0x8, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x100000001})
r1 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
fcntl$lock(r1, 0x9, &(0x7f0000000000)={0x0, 0x0, 0x20000, 0x269000000})


socketpair$unix(0x1, 0x1, 0x0, &(0x7f00000001c0)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
fcntl$lock(r0, 0x6, &(0x7f0000002000)={0x1})
fcntl$lock(r0, 0x26, &(0x7f00000031c0))
fcntl$lock(r0, 0x26, &(0x7f0000000180))


mknod(&(0x7f00000000c0)='./file0\x00', 0x2000, 0x4100)
open(&(0x7f00000000c0)='./file0\x00', 0x1, 0x0)
open(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
compat_50_select(0x40, &(0x7f00000001c0)={0x7b8}, 0x0, 0x0, 0x0)


open(&(0x7f00000000c0)='./file0\x00', 0x615, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x10, 0x0)
r0 = open(&(0x7f0000000340)='./file0\x00', 0x0, 0x0)
flock(r0, 0x1)


modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_50___msgctl13$IPC_STAT(0xffffffffffffffff, 0x2, 0x0)


modctl$MODCTL_LOAD(0x2, 0xffffffffffffffff)
rasctl(0x0, 0x9, 0x0)
rasctl(&(0x7f0000000180), 0xd39, 0x0)
rasctl(0x0, 0x9, 0x1)


chmod(0x0, 0x0)
chown(0x0, 0x0, 0x0)
setreuid(0x0, 0x0)
setpgid(0x0, 0x0)
madvise(&(0x7f0000000000/0x4000)=nil, 0x0, 0x0)
symlinkat(0x0, 0xffffffffffffffff, 0x0)
connect$unix(0xffffffffffffffff, 0x0, 0x0)
syz_emit_ethernet(0x4e, &(0x7f0000000000))


r0 = socket(0x18, 0x2, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
sendmsg(r0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x0, 0x0, 0x10}, 0x3)


r0 = socket(0x1f, 0x1, 0x0)
mknod(0x0, 0x0, 0x2e00)
recvfrom(r0, 0x0, 0x0, 0x285, 0x0, 0x0)


r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x29, 0x2a, &(0x7f0000000000)="02000000", 0x4)


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
r0 = open$dir(&(0x7f0000000b80)='./file0\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0xc01c5005, &(0x7f0000000000))


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
mknod(&(0x7f0000000000)='./bus\x00', 0x2000, 0x2e00)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000140)='umap\x00', &(0x7f0000000040)='./file0\x00', 0x0, &(0x7f00000001c0))
link(&(0x7f00000003c0)='./bus\x00', &(0x7f0000000400)='./file1\x00')


r0 = socket(0x18, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0xc118691d, &(0x7f0000000180)=0x8000000000000033)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='ntfs\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))


setrlimit(0x7, &(0x7f00000000c0)={0x0, 0x1fcc})


setpriority(0x3, 0x1000, 0x0)


fktrace(0xffffffffffffffff, 0x0, 0x0, 0x0)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, 0x0, 0x0)
modctl$MODCTL_UNLOAD(0x2, 0x0)
r0 = msgget$private(0x0, 0x0)
msgsnd(r0, &(0x7f0000000d00)=ANY=[], 0x401, 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100)}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000000)='null\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
unlink(&(0x7f0000000000)='./file0\x00')


open(&(0x7f0000000080)='./file0\x00', 0x3, 0x0)


setreuid(0xee00, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x1604)
open(&(0x7f00000000c0)='./bus\x00', 0x0, 0x0)


symlink(&(0x7f0000000000)='.\x00', 0x0)
__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
unlink(&(0x7f0000000000)='./file0\x00')


mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0xc0e99db6de761f86, 0x0)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x1, 0x0)
openat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)
setreuid(0x0, 0xee01)
ioctl$FIONREAD(r0, 0x80206979, &(0x7f0000000200))


mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
socket$inet(0x2, 0x0, 0x0)
__msync13(&(0x7f0000ffd000/0x1000)=nil, 0xff, 0x2)


r0 = open(&(0x7f0000000080)='./file0\x00', 0x70e, 0x2de)
writev(r0, &(0x7f0000000280)=[{&(0x7f0000000400)="56cf5daafc6597872431ff6c668c28adc33a60651a078a116b3548a8be66444658a069b71e92feadcd7b3434dadfc4fc3fd4ef7525b58b0086932b0a93463511a8889f58073c33282b2afa1967c7908d4d0aea830b68a43711d6c4457ec4a41a772d6ae311730b7823d96ef1c25a7bae68954da1ffde05cf138758ef54d053a94bce1d28f7ea235a66647b6db6eda0d5e78db16b66fae3f13f", 0x99}], 0x1)
r1 = open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x200000)=nil, 0x200000, 0x3, 0x10, r1, 0x0, 0x0)
close(0xffffffffffffffff)
mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x8000, 0x1192)
open$dir(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={<r2=>0xffffffffffffffff, <r3=>0xffffffffffffffff})
dup2(r3, r2)
connect$unix(0xffffffffffffff9c, &(0x7f00000000c0)=@file={0x0, './file0\x00'}, 0xa)
r4 = getuid()
seteuid(r4)
r5 = getpid()
ktrace(0x0, 0x1, 0x40000930, r5)
shutdown(r2, 0x1)
r6 = socket$inet(0x2, 0x3, 0x0)
sendmmsg(r6, &(0x7f0000000c40)={&(0x7f0000000c00)={&(0x7f00000002c0), 0xc, 0x0, 0x0, 0x0}, 0x10}, 0x10, 0x0, 0x0)


unlink(&(0x7f0000000080)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
r0 = socket(0x2, 0x3, 0x0)
ioctl$FIOSEEKHOLE(r0, 0x80906931, &(0x7f0000000080)=0x8000000000000031)


compat_43_ommap(&(0x7f0000ffc000/0x2000)=nil, 0x20000000, 0x0, 0x1082, 0xffffffffffffffff, 0x0)


r0 = socket(0x18, 0x2, 0x0)
setsockopt(r0, 0x1000000000029, 0x13, &(0x7f0000000040)="00fb6c4f", 0x4)


__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r0 = open(&(0x7f0000000300)='.\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f0000000440)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38//file0\x00', 0x0)


compat_43_osetrlimit(0x9, &(0x7f0000000080))
socket(0x1f, 0x5, 0x0)


__mount50(&(0x7f00000001c0)='fdesc\x00', &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
mknod(&(0x7f0000000000)='./file1\x00', 0x0, 0x0)


open(&(0x7f0000000200)='./file0\x00', 0x80000000000206, 0x0)
pathconf(&(0x7f0000000080)='./file0\x00', 0x8)


write(0xffffffffffffffff, &(0x7f00000001c0)="39e4aff151", 0x5)
ktrace(&(0x7f0000000200)='./file0\x00', 0x0, 0x0, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = socket$inet6(0x18, 0x3, 0x0)
open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
pathconf(&(0x7f00000000c0)='./file0\x00', 0x0)
__mount50(&(0x7f0000000000)='fdesc\x00', 0x0, 0x0, 0x0, 0x0)
sendto$inet6(r0, &(0x7f0000000000)='<', 0x329, 0x2, &(0x7f0000000040)={0x18, 0x3}, 0x1c)


open$dir(&(0x7f0000000b80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x200, 0x0)
r0 = open$dir(&(0x7f0000000000)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000300)='./file0\x00', r0, &(0x7f0000000c80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
open$dir(&(0x7f00000001c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x200, 0x0)
symlinkat(&(0x7f0000000dc0)='./file0\x00', r0, &(0x7f0000000ec0)='./file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
mkdirat(r0, &(0x7f0000000600)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)
mknod(&(0x7f0000000300)='./file0\x00', 0x2000, 0x6da)
rename(&(0x7f0000000d80)='./file0\x00', &(0x7f0000000180)='./file1\x00')
rename(&(0x7f0000000040)='./file1\x00', &(0x7f0000000480)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


r0 = socket(0x18, 0x1, 0x0)
setsockopt(r0, 0x6, 0x2, 0x0, 0x0)


compat_40_mount(0x0, 0x0, 0x0, 0x0)
_lwp_create(&(0x7f0000000300)={0x0, &(0x7f00000001c0)={0x0, &(0x7f0000000140)={0x0, 0x0, {}, {}, {0x0, 0x0, '\x03\x00'}}, {}, {}, {0x0, 0x0, ':^\x00'}}, {}, {}, {0x0, 0x0, '\x00'}}, 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
r0 = open$dir(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80104305, &(0x7f00000001c0))


modctl$MODCTL_UNLOAD(0x2, 0x0)
setpriority(0x2, 0x1000, 0x0)


socketpair(0x22, 0x3, 0x21, 0x0)


open(0x0, 0x0, 0x0)
open(0x0, 0x0, 0x0)
setreuid(0xee00, 0x0)
open$dir(0x0, 0x0, 0x0)
socketpair$unix(0x1, 0x0, 0x0, 0x0)
getsockopt$sock_cred(0xffffffffffffffff, 0xffff, 0x1022, &(0x7f0000000300), &(0x7f0000000040)=0xc)
setegid(0x0)
r0 = msgget$private(0x0, 0x604)
msgctl$IPC_RMID(r0, 0x0)
msgsnd(r0, &(0x7f00000002c0)=ANY=[@ANYBLOB="0000000000000000fbb2d30026d68ea90fe95c690000002800e40e16de108a0e5932d8"], 0xb, 0x800)
msgrcv(0x0, &(0x7f0000000140)={0x0, ""/188}, 0xc4, 0x3, 0x0)
msgctl$IPC_SET(0x0, 0x1, 0x0)
socket(0x0, 0x0, 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x2000, 0x1)
recvmsg(0xffffffffffffffff, 0xffffffffffffffff, 0x0)
link(&(0x7f0000000940)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000d40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
link(&(0x7f0000001240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
symlink(&(0x7f0000000ac0)='./file0\x00', &(0x7f0000000e40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
setreuid(0xee00, 0x0)
open$dir(0x0, 0x0, 0x0)
getuid()
fchown(0xffffffffffffffff, 0x0, 0x0)
utimensat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
rename(&(0x7f0000000600)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000001680)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f0000001140)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000240)='./file2\x00')


compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f00000000c0)='./file0\x00', 0x8000, 0x412dff)
compat_50_quotactl(&(0x7f0000000080)='./file0\x00', 0x4, 0x0, 0x0)
unmount(&(0x7f0000000000)='./file0\x00', 0x0)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f00000004c0)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
setsockopt$sock_linger(r0, 0xffff, 0x80, 0x0, 0x0)


mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
r0 = open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)
writev(r0, &(0x7f0000000880)=[{0x0, 0x3}], 0x1)


modctl$MODCTL_UNLOAD(0x2, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f0000000400)='@', 0x1)
r0 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
read(r0, 0x0, 0x0)


r0 = socket(0x18, 0x3, 0x0)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000140)='./file0/file0\x00', 0x0, &(0x7f0000000540), 0x0)
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0/file0\x00', 0x0, 0x0)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
ioctl$FIOSEEKHOLE(r0, 0x8090697a, &(0x7f0000000180)=0x8000000000000032)
__mount50(&(0x7f00000002c0)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket$inet(0x2, 0x1, 0x0)
setsockopt(r0, 0x0, 0x1a, &(0x7f0000000080)="301dc649", 0x4)


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
__mount50(0x0, &(0x7f0000000040)='.\x00', 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
ioctl$FIOASYNC(r0, 0x80047480, &(0x7f0000000040))


socketpair(0x18, 0x2, 0x0, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x80002005, 0x5300)
open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
ioctl$FIONREAD(0xffffffffffffffff, 0x80145003, &(0x7f00000001c0))
open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0xd6)
symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x4f4b)
compat_40_mount(&(0x7f0000000040)='tmpfs\x00', &(0x7f00000000c0)='.\x00', 0x0, &(0x7f00000002c0)="01")
mknod(&(0x7f00000000c0)='./file0\x00', 0x6000, 0xe03)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
r0 = open(&(0x7f0000000080)='./file0\x00', 0x70e, 0x0)
writev(r0, &(0x7f0000001580)=[{&(0x7f0000000280)="98", 0x1}], 0x1)
open$dir(&(0x7f0000000240)='./bus\x00', 0x0, 0x0)
compat_50_select(0x0, 0x0, 0x0, 0x0, 0x0)
execve(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)


__futimes50(0xffffffffffffffff, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
compat_50_setitimer(0x0, &(0x7f0000000080)={{0x0, 0x8000a15}}, 0x0)
mknod(&(0x7f0000000080)='./file0\x00', 0x0, 0x5200)
ioctl$FIONREAD(r0, 0xc0106924, &(0x7f0000000080))


mknod(&(0x7f0000000000)='./file1\x00', 0x2000, 0xa718)
r0 = open$dir(&(0x7f00000000c0)='./file1\x00', 0x0, 0x0)
ioctl$OFIOGETBMAP(r0, 0x40104480, 0x0)


r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$sock_timeval(r0, 0xffff, 0x100a, 0x0, 0x0)


compat_40_mount(0x0, 0x0, 0x0, &(0x7f00000002c0)="01")
modctl$MODCTL_UNLOAD(0x2, 0x0)
compat_40_mount(&(0x7f0000000200)='procfs\x00', &(0x7f00000000c0)='.\x00', 0x1000000, &(0x7f00000002c0))


mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x80207443, &(0x7f00000001c0))


r0 = open(&(0x7f0000000000)='./file1\x00', 0x200, 0x0)
open(&(0x7f0000000000)='./file0\x00', 0x200, 0x0)
rename(&(0x7f00000001c0)='./file0\x00', &(0x7f0000000200)='./file1\x00')
mmap(&(0x7f0000ffd000/0x1000)=nil, 0x1000, 0x0, 0x10, r0, 0x0, 0x0)
close(r0)
r1 = open(&(0x7f0000000000)='./file0\x00', 0x200, 0x0)
mmap(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0, 0x10, r1, 0x0, 0x0)


open(&(0x7f0000000040)='./file0\x00', 0x70e, 0x0)
r0 = getpid()
ktrace(&(0x7f0000000000)='./file0\x00', 0x0, 0x40001404, r0)
setreuid(0x0, 0xee01)
mkdir(0x0, 0x0)
socket$inet(0x2, 0x0, 0x0)
setsockopt$inet_opts(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
compat_30_getfh(0x0, 0x0)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(0x0, 0x0, 0x0, 0x0, 0x0)
setuid(0xee01)
__utimes50(&(0x7f0000000040)='./file0\x00', &(0x7f00000000c0))


r0 = fork()
r1 = getsid(0x0)
ptrace(0x9, r1, 0x0, 0x0)
compat_50_wait4(r0, 0x0, 0x0, 0x0)


r0 = socket(0x2, 0x3, 0x0)
compat_43_orecvmsg(r0, &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, 0x0}, 0x41)


mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
__mount50(&(0x7f0000000040)='procfs\x00', &(0x7f0000000080)='./file0\x00', 0x0, &(0x7f00000000c0)="82", 0x1)
open(&(0x7f0000000480)='./file0\x00', 0x80000000000206, 0x0)


mknod(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
mknod(&(0x7f0000000000)='./file0\x00', 0x0, 0x1803)
r0 = socket$inet6(0x18, 0x3, 0x0)
rasctl(&(0x7f0000000140)="ef4b46576915a8a1d8a7483629690845415a47a4e17aac216f66f971fb993b769ad8a320eaedac75724c8c01e1961608e255daaf400723f09306fa50b306c92fae5937e6aae5b1ca1b1c53e718602c3419959c9501d12529f96e4e3afa2888a20d123f94b38b39ac2e15279d634cbe243c56a16b41d05c32078511fddd0b7198ec20328ac38342de4527eda6ce53a5444214f668c10879d988ad0f099993ee327ff2e1e5f9bac03a967b5b31cc7ec0cf6338a45fc71b25293445b7f3b88180fbc7b1af", 0x0, 0x0)
sendto$inet6(r0, &(0x7f0000000000)=':', 0x358, 0x2, &(0x7f0000000040)={0x18, 0x3, 0x0, 0x20080fe}, 0x1c)


r0 = socket(0x2, 0x1, 0x0)
r1 = dup2(r0, r0)
shutdown(r0, 0x1)
setsockopt$sock_int(r1, 0xffff, 0x1007, &(0x7f0000000080), 0x4)
shutdown(r1, 0x1)


r0 = _lwp_self()
compat_60__lwp_park(&(0x7f0000000080), r0, &(0x7f00000000c0), 0x0)


symlink(&(0x7f0000000080)='.\x00', &(0x7f0000000240)='./file0\x00')
compat_50___msgctl13$IPC_STAT(0x0, 0x2, &(0x7f0000000280)={{}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000100), 0xdffffffffffff7ff}})
lchown(&(0x7f0000000100)='./file0\x00', 0x0, 0xffffffffffffffff)
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
compat_40_mount(&(0x7f0000000380)='union\x00', &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f00000001c0))
mknod(&(0x7f00000000c0)='./bus\x00', 0x2000, 0x0)
r0 = open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
ioctl$FIOASYNC(r0, 0x8004746b, &(0x7f00000001c0))


r0 = socket(0x18, 0x1, 0x0)
sendmsg(r0, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[], 0x10}, 0x401)


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000400)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getsockopt$sock_int(r0, 0xffff, 0x1001, 0x0, 0x0)


__mount50(&(0x7f0000000280)='kernfs\x00', &(0x7f00000002c0)='.\x00', 0x0, 0x0, 0x0)
__mount50(&(0x7f0000000000)='overlay\x00', &(0x7f0000000040)='.\x00', 0x0, &(0x7f0000000540), 0x0)
open$dir(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)


mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
mknod(&(0x7f0000001300)='./file0\x00', 0x6000, 0x6da)
link(&(0x7f0000000140)='./file0\x00', &(0x7f0000000180)='./file0\x00')
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040))
posix_spawn(0x0, 0x0, &(0x7f0000000100)={0x4, 0x1ff, 0x0}, 0x0, 0x0, 0x0)
setsockopt$sock_int(0xffffffffffffffff, 0xffff, 0x0, &(0x7f0000000000)=0x3ff, 0x4)
r0 = socket(0x18, 0x2, 0x0)
ioctl$FIONWRITE(r0, 0x40046679, &(0x7f0000000000))
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
madvise(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0)
mlock(&(0x7f0000ffc000/0x3000)=nil, 0x3000)
__clone(0x0, &(0x7f0000000180))
mmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0xd49f275d97cc01bb, 0x1810, 0xffffffffffffffff, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r1 = semget$private(0x0, 0x2, 0x0)
semop(r1, &(0x7f00000001c0)=[{}, {}], 0x2)


open$dir(&(0x7f0000000040)='./file0\x00', 0x200, 0x0)
ktrace(&(0x7f0000000200)='./file0\x00', 0x4, 0x814, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvfrom$unix(r0, &(0x7f00000000c0), 0x832f1f7d, 0x0, &(0x7f0000000000)=@abs, 0x2000c600)
sendto$unix(r1, 0x0, 0x0, 0x0, 0x0, 0x0)


msgctl$IPC_SET(0x0, 0x1, &(0x7f00000000c0)={{0x0, 0x0, 0x0, 0x0, 0x0, 0x101}})
connect$unix(0xffffffffffffffff, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
pipe(0x0)
fcntl$lock(0xffffffffffffffff, 0x0, &(0x7f0000000040)={0x1, 0x0, 0xfffffffffffffffc, 0x1000c})
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r0 = socket(0x1, 0x1, 0x0)
close(r0)
r1 = socket(0x18, 0x2, 0x0)
setsockopt(r1, 0x1000000000029, 0x2a, &(0x7f0000000040)='\x00\x00\x00\x00', 0x4)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x0, 0x7}, 0x1c)
write(r1, &(0x7f0000000c00)="ee4b847cba24c072bcc4d235436c44ff765bbdee055f55fb5648914dcb5e3d889c7e49d11cc1e338cf6dc800b3ffc3be419bd83095489ef712a05f2547abf74a946852d3984fa7aece83a41f6567761dd1064322c6155f566373d0321845bd0c529c171e9921661522362aa95c04e876acc9dd4de9765d6cbbbcf03c225c955e1e2d2b7e7a50b6fcc0dfb13824b4176b4a470980b32f879a2f227450a229af0308eb50222c75a9980353dd62f4d0e4b2e501834ea93ae4cde92875f6c735ee3ff5d3d8074f43f7c2f04b2831957e53b1cc7186905088ebf996f9158672e026ae3e4a9742e9e0bc5c7a494710c79a23f75679e7c1975d3a50e430a6beba5fb8a41cc8a1198a4c32856870d0fd193d6847033bdf837a11fada6a1035846bb31c10f5cbf566a5730cd677c7b53644fee38269ba1b2df39f9be1959ea2626ae4010a4edbd9e8eca796220cd80ce4e1f0ee1720037a73bf55c1590e7866938029f36fad47430c96a4ee9a1c2a43b97083b1d30fe18ca96d013119d0d18146ce0189257c7f33fd5aa761878127651aab9254ada2c9ffdc5e704ff1aef42436a7737d953064226641dac82fd7b44743629eea92a64221ac4983f9c454d0426fe9de8596436e070cad897b0b9ec100e427718a6a18e0ab26e73515237b7bc83bbaea150063dceaa530080481331fe23d686bb3b87f5920acfaf3fe57b266c17c5823e3981e84f031bf499733d2c44299d8cd047d74aa55b0db0f2197df60d321312c3712ee84e3957ef005ec7d3be18900d9fdcfee5bb4cb5b4ec0ac13941c772f4b53eb12d3682f0628f6b376db842eaac9326d02ac6e7d34906859610d72c5791426dd63a57f35987fc77d1de1ada900c52869ca6a35486ceba045e3ae770f17ae0b4d5159b3a96b503a008699d1344692c0cbc154b0c945f89e7428e14dacc6a2dce816f59d972c051e1d7452138a9e1d914f9b7487e87ca1312ca2a7e6608920ab3c2fbe61baceb0991cfb7f799f74d255315f21bcf0820af537c78ffe2340bdd0a998445e25779e32d5022cb3edf0d1555f486d5d9ed5b7a333a4a76f8467176fbca4cf7c4d22a48b02aa90c9b54d3cd5c999a1d29a6ca417805b47bb0acfc60df9514cb2a5f4d3d212e20891699f84dd04847414331d69e99be592c2536926883b19859d9dab17ab3bf65aa5e3115343a964fa95008d4079dbe41d993f8e636e4f36a3c5f8c96716d1c9310b8c03e15c2a184329fa479259328e5366185d1cd61d34c0d1dbbc79c1180d98e4bc0dfa3075086d86a6480732cf9f29fb3c19489c0fa79b3d9c70dc1bab4c35c52e9db4bafdec90685cedee5e2d95dbe694b3918d3e97de801e8d6b3b4d56e66057e33af4ba1dbb68cebcffc96aed7cc648a1fef8d1b1921e1698e492765e2daaa9b6946b72739721bc95c78fe0c926635b616e6685c46ad49f799ee77ae7ecebd22251aa16fec57bc5d01b68e0d2ed00b293d3d21937a1ae8023e8afb5425b2be37b593125d9fc7399da09f64d8dab444b53b48540e06f1a443c22eb1cbbc7d02abf1ddf079539329891aa6f5ea1b8d7fd06f5eb612e552b544a24734b9f26b6710c92e0e2fa8c54a108ebfef6319c523cb83af44aaefcd8a7ea4d7ed66d6fdc66f25aebc479e88b2c297d26d4ad2910130861e54249719d8ccf3fbaa8ab2457a6054cb59fbb1518712dd9357cb9befa3f43c4921da26cc600dcb56c7c0e4384e65adbfb58a9", 0x4d1)


mknod(&(0x7f0000000100)='./file0\x00', 0x2000, 0x1)
sendto$unix(0xffffffffffffffff, &(0x7f0000000000)="b10005016000009f0000000000070000001c130500000000fef96ecfc72fd3357ae380b37b673039d2d236acf60b7804be78164991f7c8cf5f882b297be1aa5b23edeb51e2f0ac3ebbc215000000eeffffff028ea8af630037282102000000720fd38bfbb770c1f572c881ea772ec5920400000000000000361b12578ea8c500002002fbff0c2300008a", 0x8a, 0x0, 0x0, 0x0)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001700)={&(0x7f0000000080), 0x1c, 0x0}, 0x0)
setsockopt$sock_timeval(0xffffffffffffffff, 0xffff, 0x0, &(0x7f00000000c0)={0x0, 0x1ff}, 0x10)
socket(0x18, 0x1, 0x4)
r0 = socket(0x18, 0x3, 0x0)
connect$unix(r0, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7}, 0x1c)
getsockname$inet(r0, &(0x7f00000000c0), &(0x7f0000000000)=0xffffffffffffff35)
r1 = socket(0x18, 0x2, 0x0)
connect$unix(r1, &(0x7f00000000c0)=@abs={0x682eb13985c518e6, 0x7, 0x3}, 0x8)
bind$unix(0xffffffffffffffff, &(0x7f0000000080)=@abs={0x1f95d27d48731892, 0x7}, 0x1c)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x5)
r2 = dup2(r0, r0)
sendmsg$unix(r2, &(0x7f0000001700)={0x0, 0x0, 0x0}, 0x0)

fpu_area_restore---of 11
fpu_area_save---of 6
fpu_clear---of 17
fpu_handle_deferred28%of 11
fpu_kern_enter34%of 24
fpu_kern_leave31%of 13
fpu_lwp_abandon67%of 3
fpu_lwp_fork58%of 7
fpu_save45%of 9
fpu_set_default_cw---of 9
fpu_sigreset58%of 7
fpu_switch47%of 13
fpuinit---of 1
fpuinit_mxcsr_mask---of 1
fputrap---of 6
fxrstor---of 1
fxrstor64---of 1
fxsave---of 1
fxsave64---of 1
process_read_fpregs_s87---of 7
process_read_fpregs_xmm58%of 7
process_read_xstate30%of 31
process_verify_xstate25%of 8
process_write_fpregs_s87---of 9
process_write_fpregs_xmm---of 9
process_write_xstate33%of 37
x86_curlwp100%of 1
xrstor---of 1
xrstor64100%of 1
xsave---of 1
xsave64---of 1
xsaveopt---of 1
xsaveopt64100%of 1
-----------
SUMMARY38%of 173
ra_startio67%of 12
uvm_ra_allocctx67%of 3
uvm_ra_freectx---of 3
uvm_ra_init---of 1
uvm_ra_request50%of 20
uvm_readahead100%of 1
-----------
SUMMARY59%of 36
do_udpinit---of 1
sysctl_net_inet_udp_stats---of 1
udp4_input_checksum---of 15
udp4_sendup---of 23
udp_abort---of 3
udp_accept---of 3
udp_attach_wrapper55%of 11
udp_bind_wrapper---of 7
udp_connect2_wrapper67%of 3
udp_connect_wrapper67%of 9
udp_ctlinput38%of 8
udp_ctloutput42%of 12
udp_detach_wrapper60%of 5
udp_disconnect_wrapper---of 5
udp_init---of 3
udp_init_common---of 3
udp_input---of 65
udp_input_checksum---of 4
udp_ioctl_wrapper100%of 1
udp_listen_wrapper67%of 3
udp_notify---of 13
udp_output34%of 21
udp_peeraddr_wrapper---of 7
udp_purgeif_wrapper---of 1
udp_rcvd_wrapper---of 3
udp_recvoob_wrapper---of 3
udp_send57%of 16
udp_send_wrapper100%of 1
udp_sendoob_wrapper---of 3
udp_shutdown_wrapper67%of 3
udp_sockaddr_wrapper58%of 7
udp_stat_wrapper67%of 3
udp_statinc---of 3
-----------
SUMMARY52%of 103
npf_getkernctx100%of 1
npf_setkernctx---of 1
npf_stats_clear_cb---of 1
npf_stats_collect---of 1
npf_stats_dec---of 1
npf_stats_inc---of 1
npfk_create---of 3
npfk_destroy---of 1
npfk_gc---of 1
npfk_getarg---of 1
npfk_load---of 1
npfk_stats---of 1
npfk_stats_clear---of 1
npfk_sysfini---of 1
npfk_sysinit---of 1
npfk_thread_register---of 1
npfk_thread_unregister---of 1
-----------
SUMMARY100%of 1
vnd_alloc---of 1
vnd_attach67%of 3
vnd_detach50%of 4
vnd_free---of 1
vnd_match---of 1
vnd_modcmd---of 1
vndattach---of 3
vndclose42%of 17
vnddoclear---of 30
vnddump---of 1
vndioctl15%of 127
vndioctl_get---of 11
vndiodone---of 7
vndopen35%of 29
vndread---of 6
vndsize19%of 11
vndstrategy9%of 23
vndthread---of 58
vndwrite---of 6
x86_curlwp---of 1
-----------
SUMMARY21%of 214
compat_43_sys_getdtablesize---of 1
compat_43_sys_gethostid100%of 1
compat_43_sys_gethostname100%of 1
compat_43_sys_getkerninfo---of 25
compat_43_sys_sethostid100%of 1
compat_43_sys_sethostname100%of 1
kern_info_43_fini---of 1
kern_info_43_init---of 1
-----------
SUMMARY100%of 4
ffs_alloc47%of 30
ffs_alloccg45%of 61
ffs_alloccgblk23%of 58
ffs_blkalloc---of 6
ffs_blkalloc_ump---of 43
ffs_blkfree13%of 16
ffs_blkfree_cg36%of 14
ffs_blkfree_common16%of 97
ffs_blkfree_snap---of 16
ffs_blkfree_td---of 9
ffs_blkpref_ufs1---of 24
ffs_blkpref_ufs242%of 24
ffs_checkfreefile---of 19
ffs_discard_finish---of 5
ffs_discard_init---of 3
ffs_discardcb---of 3
ffs_freefile28%of 11
ffs_freefile_common45%of 27
ffs_freefile_snap---of 13
ffs_fserr---of 4
ffs_mapsearch45%of 20
ffs_nodealloccg45%of 87
ffs_realloccg44%of 111
ffs_valloc38%of 66
ffs_vfree100%of 1
x86_curlwp---of 1
-----------
SUMMARY36%of 623
cpu_ipi31%of 13
x86_broadcast_ipi---of 8
x86_ipi_ast---of 1
x86_ipi_generic---of 1
x86_ipi_halt---of 1
x86_ipi_handler---of 6
x86_ipi_kpreempt---of 1
x86_ipi_reload_mtrr---of 3
x86_ipi_xcall---of 1
x86_send_ipi34%of 6
xc_send_ipi62%of 13
-----------
SUMMARY44%of 32
at_broadcast---of 10
at_control18%of 67
at_ifinit12%of 34
at_purgeaddr---of 7
at_purgeif---of 1
at_scrub19%of 11
-----------
SUMMARY17%of 112
doubletrap---of 4
nmitrap---of 5
startlwp---of 3
trap27%of 115
trap_print---of 3
userret64%of 19
x86_curlwp100%of 1
-----------
SUMMARY33%of 135
compat_43_ttioctl66%of 44
kern_tty_43_fini---of 3
kern_tty_43_init---of 1
ttcompatgetflags54%of 15
-----------
SUMMARY63%of 59
cpu_count80%of 5
cpu_count_sync67%of 9
cpu_getmodel---of 1
cpu_setmodel---of 1
cpu_softintr_p100%of 1
cpu_topology_init---of 75
cpu_topology_set---of 1
cpu_topology_setspeed---of 1
curcpu_stable---of 4
mi_cpu_init---of 5
x86_curlwp100%of 1
-----------
SUMMARY75%of 16
msg_freehdr67%of 12
msgctl174%of 23
msgfini---of 7
msginit---of 20
msgrcv170%of 40
msgrealloc---of 66
msgsnd166%of 43
sys___msgctl50100%of 6
sys_msgget70%of 20
sys_msgrcv100%of 1
sys_msgsnd100%of 1
sysctl_ipc_msg_setup---of 3
sysctl_ipc_msgmni---of 3
sysctl_ipc_msgseg---of 3
-----------
SUMMARY71%of 146
filt_soempty---of 14
filt_solisten---of 8
filt_sordetach---of 9
filt_soread---of 15
filt_sowdetach---of 9
filt_sowrite---of 22
fsocreate100%of 7
sbsavetimestamp---of 6
so_setsockopt---of 15
soabort58%of 7
soaccept63%of 8
sobind67%of 9
socket_listener_cb79%of 14
sockopt_destroy100%of 3
sockopt_get100%of 3
sockopt_getint100%of 3
sockopt_getmbuf---of 6
sockopt_init67%of 6
sockopt_set86%of 7
sockopt_setint100%of 3
sockopt_setmbuf---of 8
soclose38%of 45
soconnect77%of 13
soconnect267%of 3
socreate80%of 15
sodisconnect---of 6
sofamily---of 4
sofree60%of 22
sogetopt61%of 51
sohasoutofband---of 1
soinit---of 5
soinit1---of 3
sokvaalloc---of 7
sokvafree---of 1
solisten75%of 12
soloanfree---of 3
soo_kqfilter---of 11
sopendfree_thread---of 10
sopoll75%of 35
soreceive55%of 226
sorestart58%of 7
sorflush55%of 11
sosend63%of 95
sosetopt68%of 105
soshutdown86%of 7
sysctl_kern_sbmax---of 3
sysctl_kern_somaxkva---of 4
sysctl_kern_sooptions---of 4
x86_curlwp100%of 1
-----------
SUMMARY62%of 718
ptyfs_allocvp100%of 1
ptyfs_clr_active---of 4
ptyfs_get_node54%of 13
ptyfs_hashdone---of 1
ptyfs_hashinit---of 1
ptyfs_next_active43%of 7
ptyfs_set_active---of 9
-----------
SUMMARY53%of 21
rb_tree_find_node100%of 5
rb_tree_find_node_geq100%of 6
rb_tree_find_node_leq---of 6
rb_tree_init100%of 1
rb_tree_insert_node52%of 80
rb_tree_iterate54%of 13
rb_tree_removal_rebalance63%of 78
rb_tree_remove_node55%of 74
rb_tree_reparent_nodes67%of 9
-----------
SUMMARY59%of 266
npf_active_p---of 1
npf_autounload_p---of 3
npf_dev_close---of 1
npf_dev_ioctl---of 18
npf_dev_open---of 1
npf_dev_poll---of 1
npf_dev_read---of 1
npf_ebr_create---of 1
npf_ebr_destroy---of 1
npf_ebr_enter---of 3
npf_ebr_exit---of 3
npf_ebr_full_sync---of 1
npf_ebr_incrit_p---of 3
npf_ebr_register---of 3
npf_ebr_unregister---of 3
npf_ifaddrhook30%of 10
npf_ifhook50%of 4
npf_ifop_flush---of 5
npf_ifop_getmeta---of 1
npf_ifop_getname100%of 1
npf_ifop_lookup---of 1
npf_ifop_setmeta100%of 1
npf_modcmd---of 7
npf_pfil_register---of 16
npf_pfil_unregister---of 7
npfattach---of 1
npfos_packet_handler---of 1
-----------
SUMMARY44%of 16
blist_alloc---of 10
blist_create75%of 4
blist_destroy---of 1
blist_fill---of 6
blist_free40%of 5
blist_resize---of 6
blst_copy---of 27
blst_meta_alloc---of 26
blst_meta_fill---of 18
blst_meta_free65%of 17
blst_radix_init92%of 12
-----------
SUMMARY72%of 38
strnlen80%of 5
-----------
SUMMARY80%of 5
cpu_puc_cnprobe---of 4
device_pci_register---of 37
pci_attach_hook---of 26
pci_bridge_foreach---of 1
pci_bridge_hook---of 5
pci_bus_maxdevs---of 1
pci_chipset_tag_create---of 15
pci_chipset_tag_destroy---of 1
pci_conf_lock29%of 14
pci_conf_read29%of 21
pci_conf_unlock43%of 7
pci_conf_write---of 21
pci_decompose_tag---of 18
pci_device_foreach---of 1
pci_device_foreach_min---of 16
pci_make_tag30%of 10
pci_mode_detect---of 21
pci_mode_set---of 3
x86_genfb_resume---of 4
x86_genfb_set_mapreg---of 1
x86_genfb_setmode---of 4
x86_genfb_suspend---of 1
-----------
SUMMARY31%of 52
strlcpy70%of 13
-----------
SUMMARY70%of 13
ufs_done---of 3
ufs_fhtovp---of 10
ufs_init---of 3
ufs_modcmd---of 6
ufs_quotactl67%of 3
ufs_reinit---of 1
ufs_root---of 3
ufs_start---of 1
ufs_vget---of 4
x86_curlwp100%of 1
-----------
SUMMARY75%of 4
rip6_abort_wrapper---of 3
rip6_accept_wrapper---of 3
rip6_attach50%of 8
rip6_attach_wrapper100%of 1
rip6_bind_wrapper57%of 23
rip6_connect2_wrapper67%of 3
rip6_connect_wrapper48%of 17
rip6_ctlinput25%of 12
rip6_ctloutput69%of 19
rip6_detach56%of 9
rip6_detach_wrapper100%of 1
rip6_disconnect_wrapper58%of 7
rip6_init---of 1
rip6_input---of 43
rip6_ioctl_wrapper100%of 1
rip6_listen_wrapper---of 3
rip6_output63%of 54
rip6_peeraddr_wrapper58%of 7
rip6_purgeif_wrapper---of 1
rip6_rcvd_wrapper---of 3
rip6_recvoob_wrapper67%of 3
rip6_sbappendaddr---of 12
rip6_send_wrapper67%of 15
rip6_sendoob_wrapper---of 3
rip6_shutdown_wrapper67%of 3
rip6_sockaddr_wrapper58%of 7
rip6_stat_wrapper---of 3
sysctl_net_inet6_raw6_stats---of 1
x86_curlwp100%of 1
-----------
SUMMARY59%of 191
compat_90_sys_fhstatvfs167%of 3
compat_90_sys_fstatvfs1100%of 3
compat_90_sys_getvfsstat100%of 1
compat_90_sys_statvfs1100%of 3
statvfs_to_statvfs90_copy100%of 1
vfs_syscalls_90_fini---of 1
vfs_syscalls_90_init---of 1
-----------
SUMMARY91%of 11
uvm_pagecheckdirty75%of 8
uvm_pagegetdirty63%of 8
uvm_pagemarkdirty74%of 30
-----------
SUMMARY72%of 46
percpu_alloc100%of 1
percpu_backend_alloc---of 13
percpu_cpu_swap---of 5
percpu_create22%of 19
percpu_foreach67%of 6
percpu_foreach_xcall75%of 4
percpu_free23%of 22
percpu_getptr_remote67%of 3
percpu_getref67%of 3
percpu_init---of 1
percpu_init_cpu---of 14
percpu_putref100%of 1
percpu_traverse_enter---of 1
percpu_traverse_exit---of 1
percpu_xcfunc---of 3
x86_curlwp---of 1
-----------
SUMMARY38%of 59
union_modcmd---of 4
union_mount52%of 37
union_renamelock_enter100%of 1
union_renamelock_exit100%of 1
union_root50%of 6
union_start100%of 1
union_statvfs43%of 7
union_sync100%of 1
union_unmount78%of 9
union_unmount_selector67%of 3
union_vget---of 1
unionfs_sysctl_setup---of 1
x86_curlwp100%of 1
-----------
SUMMARY59%of 67
fifo_bmap---of 7
fifo_close64%of 19
fifo_inactive---of 1
fifo_ioctl88%of 8
fifo_kqfilter---of 10
fifo_lookup---of 1
fifo_open57%of 55
fifo_pathconf84%of 6
fifo_poll44%of 41
fifo_print---of 3
fifo_read58%of 7
fifo_socantrcvmore58%of 7
fifo_write67%of 3
filt_fifordetach---of 9
filt_fiforead---of 12
filt_fifowdetach---of 9
filt_fifowrite---of 17
x86_curlwp100%of 1
-----------
SUMMARY58%of 147
lwp0_init---of 6
lwp_addref60%of 5
lwp_alive60%of 5
lwp_changepri50%of 6
lwp_continue50%of 8
lwp_create56%of 54
lwp_ctl_alloc8%of 26
lwp_ctl_exit---of 10
lwp_ctl_free28%of 18
lwp_ctor67%of 3
lwp_delref100%of 1
lwp_delref256%of 9
lwp_drainrefs---of 6
lwp_dtor60%of 5
lwp_eprio84%of 6
lwp_exit36%of 50
lwp_find67%of 6
lwp_find2---of 23
lwp_find_first54%of 13
lwp_free65%of 40
lwp_lendpri60%of 5
lwp_lock63%of 8
lwp_locked100%of 1
lwp_migrate---of 16
lwp_need_userret78%of 9
lwp_pctr100%of 1
lwp_setlock60%of 5
lwp_setprivate---of 1
lwp_start---of 10
lwp_startup---of 20
lwp_suspend43%of 19
lwp_thread_cleanup---of 5
lwp_trylock63%of 8
lwp_unlock100%of 1
lwp_unlock_to60%of 5
lwp_unsleep67%of 3
lwp_unstop---of 14
lwp_userret58%of 19
lwp_wait71%of 41
lwp_whatis---of 6
lwpinit---of 1
spc_lock100%of 1
sysctl_kern_maxlwp---of 5
x86_curlwp100%of 1
-----------
SUMMARY53%of 382
rfcomm_attach_pcb56%of 9
rfcomm_bind_pcb---of 3
rfcomm_connect_pcb---of 24
rfcomm_detach_pcb58%of 7
rfcomm_disconnect_pcb---of 12
rfcomm_getopt29%of 7
rfcomm_listen_pcb---of 31
rfcomm_peeraddr_pcb---of 1
rfcomm_rcvd_pcb50%of 6
rfcomm_send_pcb---of 5
rfcomm_setopt---of 9
rfcomm_sockaddr_pcb---of 1
-----------
SUMMARY49%of 29
copypktopts26%of 31
ip6_clearpktopts46%of 37
ip6_copypktopts---of 4
ip6_ctloutput50%of 308
ip6_freemoptions60%of 10
ip6_freepcbopts---of 3
ip6_get_membership77%of 21
ip6_if_output56%of 9
ip6_initpktopts---of 1
ip6_mloopback---of 8
ip6_optlen60%of 10
ip6_output46%of 271
ip6_raw_ctloutput47%of 13
ip6_setpktopt40%of 111
ip6_setpktopts58%of 14
ip6_splithdr75%of 4
x86_curlwp100%of 1
-----------
SUMMARY48%of 840
syn_cache_add---of 38
syn_cache_cleanup23%of 9
syn_cache_get---of 51
syn_cache_init---of 3
syn_cache_insert---of 26
syn_cache_lookup---of 11
syn_cache_reset---of 8
syn_cache_respond---of 43
syn_cache_rm---of 12
syn_cache_timer---of 7
syn_cache_unreach---of 9
-----------
SUMMARY23%of 9
process_domem40%of 5
ptrace_hooks---of 1
-----------
SUMMARY40%of 5
uvm_analloc50%of 10
uvm_anfree37%of 22
uvm_anon_ctor100%of 1
uvm_anon_dropswap67%of 3
uvm_anon_init---of 1
uvm_anon_lockloanpg---of 12
uvm_anon_pagein---of 15
uvm_anon_release---of 21
-----------
SUMMARY45%of 36
gang_lookup_init61%of 38
radix_tree_await_memory---of 1
radix_tree_clear_tag52%of 35
radix_tree_empty_tagged_tree_p67%of 3
radix_tree_empty_tree_p100%of 1
radix_tree_fini_tree60%of 5
radix_tree_free_node67%of 3
radix_tree_gang_lookup_node64%of 47
radix_tree_gang_lookup_node_reverse---of 49
radix_tree_gang_lookup_tagged_node65%of 62
radix_tree_gang_lookup_tagged_node_reverse48%of 63
radix_tree_get_tag54%of 13
radix_tree_grow42%of 12
radix_tree_init---of 1
radix_tree_init_tree100%of 1
radix_tree_insert_node50%of 8
radix_tree_lookup_node80%of 10
radix_tree_lookup_ptr46%of 33
radix_tree_remove_node56%of 40
radix_tree_replace_node---of 16
radix_tree_set_tag54%of 28
radix_tree_undo_insert_node---of 23
-----------
SUMMARY57%of 402
_if_byindex---of 3
_if_down53%of 23
doifioctl51%of 136
if_acquire67%of 3
if_activate_sadl---of 15
if_addr_init37%of 11
if_alloc---of 1
if_alloc_sadl67%of 3
if_attach100%of 1
if_attachdomain---of 14
if_byindex100%of 4
if_clone_attach---of 4
if_clone_detach---of 6
if_clone_list38%of 8
if_clone_lookup30%of 27
if_deactivate---of 3
if_deferred_start_common---of 3
if_deferred_start_init---of 1
if_deferred_start_softint---of 1
if_delroute_matcher---of 1
if_detach---of 68
if_dl_create100%of 1
if_do_dad50%of 4
if_domain_link_state_change---of 6
if_down---of 3
if_down_locked---of 3
if_export_if_data100%of 1
if_flags_set---of 11
if_free---of 1
if_free_sadl---of 15
if_get31%of 33
if_get_byindex60%of 5
if_get_bylla---of 11
if_held---of 1
if_init---of 3
if_initialize27%of 23
if_initname100%of 1
if_input---of 8
if_ioctl---of 4
if_is_deactivated100%of 1
if_link_state_change17%of 18
if_link_state_change_work---of 18
if_linkstate_change_disestablish---of 1
if_linkstate_change_establish---of 1
if_listener_cb100%of 1
if_mcast_op67%of 3
if_nulldrain---of 1
if_nullinit---of 1
if_nullinput---of 1
if_nullioctl---of 1
if_nulloutput---of 1
if_nullslowtimo---of 1
if_nullstart---of 1
if_nullstop---of 1
if_nulltransmit---of 1
if_percpuq_create34%of 6
if_percpuq_destroy---of 3
if_percpuq_drops---of 1
if_percpuq_enqueue---of 5
if_percpuq_init_ifq100%of 1
if_percpuq_purge_ifq---of 6
if_percpuq_softint---of 9
if_purgeaddrs---of 12
if_put100%of 3
if_register34%of 39
if_schedule_deferred_start---of 8
if_sdl_sysctl---of 11
if_set_sadl---of 6
if_slowtimo_intr---of 8
if_slowtimo_work---of 5
if_stop---of 3
if_transmit58%of 7
if_transmit_lock---of 1
if_tunnel_alloc_ro_percpu---of 1
if_tunnel_check_nesting---of 5
if_tunnel_free_ro_percpu---of 1
if_tunnel_ro_fini_pc---of 1
if_tunnel_ro_init_pc---of 1
if_tunnel_ro_percpu_rtcache_free---of 1
if_tunnel_rtcache_free_pc---of 1
if_up---of 1
if_up_locked67%of 12
ifa_acquire100%of 1
ifa_held---of 1
ifa_ifwithaddr53%of 23
ifa_ifwithaddr_psref67%of 3
ifa_ifwithaf---of 17
ifa_ifwithdstaddr43%of 21
ifa_ifwithdstaddr_psref---of 3
ifa_ifwithladdr---of 4
ifa_ifwithladdr_psref60%of 5
ifa_ifwithnet31%of 39
ifa_ifwithnet_psref---of 3
ifa_insert47%of 13
ifa_is_destroying100%of 1
ifa_psref_init100%of 1
ifa_release100%of 3
ifa_remove---of 25
ifaddrpref_ioctl---of 22
ifafree58%of 7
ifaof_ifpforaddr39%of 21
ifaof_ifpforaddr_psref67%of 3
ifaref100%of 1
ifinit---of 1
ifinit1---of 7
ifinit_post---of 1
ifioctl_common37%of 63
ifpromisc---of 9
ifpromisc_locked---of 9
ifq_enqueue100%of 1
ifq_enqueue2---of 5
ifreq_setaddr43%of 7
ifunit35%of 32
link_rtrequest---of 6
p2p_rtrequest---of 22
pslist_writer_insert_after54%of 15
pslist_writer_insert_head50%of 12
sysctl_if_watchdog---of 4
sysctl_percpuq_drops_handler---of 1
x86_curlwp100%of 1
-----------
SUMMARY44%of 648
rip_abort---of 3
rip_accept---of 3
rip_attach_wrapper55%of 11
rip_bind_wrapper34%of 15
rip_connect2_wrapper67%of 3
rip_connect_wrapper55%of 11
rip_ctlinput17%of 18
rip_ctloutput40%of 15
rip_detach_wrapper60%of 5
rip_disconnect_wrapper---of 5
rip_init---of 1
rip_input---of 24
rip_ioctl_wrapper100%of 1
rip_listen_wrapper---of 3
rip_output27%of 30
rip_pcbnotify---of 9
rip_peeraddr_wrapper58%of 7
rip_purgeif_wrapper---of 1
rip_rcvd_wrapper---of 3
rip_recvoob_wrapper67%of 3
rip_sbappendaddr---of 14
rip_send_wrapper62%of 18
rip_sendoob_wrapper---of 3
rip_shutdown_wrapper---of 3
rip_sockaddr_wrapper---of 7
rip_stat_wrapper67%of 3
-----------
SUMMARY43%of 140
route_abort---of 3
route_accept---of 3
route_attach_wrapper60%of 10
route_bind_wrapper---of 3
route_connect2_wrapper67%of 3
route_connect_wrapper---of 3
route_ctloutput25%of 24
route_detach_wrapper55%of 11
route_disconnect_wrapper60%of 5
route_enqueue75%of 4
route_filter50%of 28
route_init---of 1
route_intr---of 6
route_ioctl_wrapper100%of 1
route_listen_wrapper---of 3
route_output21%of 132
route_output_report---of 10
route_peeraddr_wrapper56%of 9
route_rcvd_wrapper---of 3
route_recvoob_wrapper---of 3
route_send_wrapper60%of 5
route_sendoob_wrapper67%of 3
route_shutdown_wrapper67%of 3
route_sockaddr_wrapper---of 9
route_stat_wrapper67%of 3
rt_addrmsg100%of 1
rt_addrmsg046%of 42
rt_addrmsg_rt100%of 1
rt_addrmsg_src---of 1
rt_clonedmsg---of 5
rt_ieee80211msg---of 13
rt_ifannouncemsg50%of 4
rt_ifmsg43%of 7
rt_missmsg40%of 5
rt_msg150%of 28
rt_msg2---of 33
rt_msg3---of 1
rt_pr_init---of 1
rt_setmetrics---of 18
sysctl_dumpentry---of 17
sysctl_net_route_setup---of 1
sysctl_rtable---of 86
x86_curlwp100%of 1
-----------
SUMMARY38%of 330
cprng_fast---of 4
cprng_fast32100%of 1
cprng_fast64---of 1
cprng_fast_buf_short58%of 14
cprng_fast_init---of 1
cprng_fast_init_cpu---of 1
-----------
SUMMARY60%of 15
in6_undefer_cksum---of 12
in6_undefer_cksum_tcpudp50%of 12
ip6_tso_output---of 6
tcp6_segment---of 56
-----------
SUMMARY50%of 12
ffs_clrblock34%of 6
ffs_clusteracct58%of 33
ffs_fragacct73%of 11
ffs_getblk64%of 11
ffs_isblock34%of 6
ffs_isfreeblock34%of 6
ffs_load_inode40%of 5
ffs_setblock34%of 6
-----------
SUMMARY53%of 84
lfs_done---of 1
lfs_extattrctl---of 1
lfs_fhtovp---of 6
lfs_flushfiles---of 17
lfs_gop_write---of 67
lfs_init---of 1
lfs_init_vnode---of 7
lfs_issequential_hole---of 11
lfs_loadvnode---of 46
lfs_modcmd---of 7
lfs_mount14%of 37
lfs_mountfs---of 106
lfs_mountroot---of 5
lfs_newvnode---of 25
lfs_reinit---of 1
lfs_resize_fs---of 57
lfs_statvfs---of 18
lfs_sync---of 9
lfs_sysctl_setup---of 4
lfs_unmount---of 6
lfs_vget---of 4
lfs_vinit---of 32
lfs_vptofh---of 3
lfs_writerd---of 59
sysctl_lfs_dostats---of 3
x86_curlwp100%of 1
-----------
SUMMARY16%of 38
fdesc_done---of 1
fdesc_getattr18%of 17
fdesc_inactive50%of 4
fdesc_init---of 1
fdesc_ioctl67%of 3
fdesc_kqfilter---of 5
fdesc_lookup29%of 39
fdesc_open50%of 4
fdesc_pathconf78%of 9
fdesc_poll67%of 3
fdesc_print---of 1
fdesc_read67%of 3
fdesc_readdir38%of 43
fdesc_readlink---of 4
fdesc_reclaim100%of 1
fdesc_setattr40%of 5
fdesc_write---of 3
x86_curlwp100%of 1
-----------
SUMMARY39%of 132
bufq_disksort_cancel---of 10
bufq_disksort_fini67%of 3
bufq_disksort_get25%of 8
bufq_disksort_init100%of 1
bufq_disksort_modcmd---of 4
bufq_disksort_put---of 49
-----------
SUMMARY42%of 12
pckbc_attach---of 23
pckbc_attach_slot---of 7
pckbc_cnattach---of 9
pckbc_intr_establish---of 1
pckbc_is_console---of 4
pckbc_poll_data1---of 13
pckbc_resume---of 8
pckbc_send_cmd---of 4
pckbc_send_devcmd38%of 8
pckbc_set_poll---of 4
pckbc_slot_enable50%of 4
pckbc_xt_translation---of 14
pckbcintr---of 9
pckbcintr_hard---of 8
pckbcintr_soft---of 4
-----------
SUMMARY42%of 12
vfs_hooks_attach---of 4
vfs_hooks_detach---of 9
vfs_hooks_init---of 1
vfs_hooks_reexport58%of 7
vfs_hooks_unmount50%of 6
-----------
SUMMARY54%of 13
ppsratecheck72%of 7
ratecheck---of 7
-----------
SUMMARY72%of 7
_prop_object_copyout---of 8
prop_array_copyin---of 6
prop_array_copyin_ioctl---of 1
prop_array_copyin_ioctl_size---of 7
prop_array_copyin_size---of 6
prop_array_copyout---of 1
prop_array_copyout_ioctl---of 3
prop_dictionary_copyin---of 6
prop_dictionary_copyin_ioctl100%of 1
prop_dictionary_copyin_ioctl_size43%of 7
prop_dictionary_copyin_size---of 6
prop_dictionary_copyout---of 1
prop_dictionary_copyout_ioctl---of 3
prop_kern_init---of 4
x86_curlwp---of 1
-----------
SUMMARY50%of 8
strncmp67%of 6
-----------
SUMMARY67%of 6
do_ptrace55%of 234
ptrace_common_modcmd---of 4
ptrace_doio35%of 23
ptrace_listener_cb72%of 45
-----------
SUMMARY56%of 302
pktq_barrier---of 15
pktq_collect_counts---of 1
pktq_create---of 7
pktq_dequeue---of 12
pktq_destroy---of 8
pktq_enqueue58%of 7
pktq_fini_cpu---of 3
pktq_flush---of 11
pktq_ifdetach---of 6
pktq_init_cpu---of 1
pktq_rps_hash---of 5
pktq_rps_hash_curcpu---of 1
pktq_rps_hash_toeplitz---of 7
pktq_rps_hash_toeplitz_othercpus---of 8
pktq_rps_hash_zero---of 1
pktq_set_maxlen---of 17
pktq_set_maxlen_cpu---of 1
pktq_sysctl_setup---of 7
pktqueue_list_init---of 1
sysctl_pktq_drops---of 1
sysctl_pktq_maxlen---of 3
sysctl_pktq_nitems---of 1
sysctl_pktq_rps_hash_handler---of 20
-----------
SUMMARY58%of 7
fops_pad_close---of 3
fops_pad_ioctl100%of 1
fops_pad_kqfilter---of 1
fops_pad_mmap---of 1
fops_pad_poll---of 1
fops_pad_read---of 13
fops_pad_stat---of 1
fops_pad_write---of 1
pad_attach---of 5
pad_childdet---of 5
pad_detach---of 6
pad_done_output---of 1
pad_get_locks---of 1
pad_get_port---of 6
pad_get_props---of 1
pad_getdev---of 1
pad_halt_output---of 3
pad_match---of 1
pad_modcmd---of 1
pad_open---of 9
pad_query_devinfo---of 8
pad_query_format---of 1
pad_set_format---of 3
pad_set_port---of 6
pad_start_output---of 10
pad_swvol_codec---of 7
padattach---of 3
x86_curlwp---of 1
-----------
SUMMARY100%of 1
sys_nomodule---of 1
syscall_disestablish---of 17
syscall_establish---of 9
trace_enter50%of 8
trace_exit50%of 8
trace_is_enabled100%of 3
x86_curlwp100%of 1
-----------
SUMMARY60%of 20
vfs_quotactl_cursoratend---of 1
vfs_quotactl_cursorclose---of 1
vfs_quotactl_cursorget---of 1
vfs_quotactl_cursoropen---of 1
vfs_quotactl_cursorrewind---of 1
vfs_quotactl_cursorskipidtype---of 1
vfs_quotactl_del---of 1
vfs_quotactl_get100%of 1
vfs_quotactl_idtypestat---of 1
vfs_quotactl_objtypestat---of 1
vfs_quotactl_put---of 1
vfs_quotactl_quotaoff100%of 1
vfs_quotactl_quotaon100%of 1
vfs_quotactl_stat100%of 1
-----------
SUMMARY100%of 4
_bpf_change_type---of 6
_bpf_deregister_track_event---of 13
_bpf_mtap---of 10
_bpf_mtap2---of 7
_bpf_mtap_af---of 7
_bpf_mtap_sl_in---of 21
_bpf_mtap_sl_out---of 9
_bpf_mtap_softint---of 10
_bpf_mtap_softint_init---of 7
_bpf_register_track_event---of 7
_bpfattach50%of 10
_bpfdetach---of 37
bpf_attachd---of 19
bpf_close37%of 41
bpf_deliver---of 34
bpf_detachd---of 23
bpf_ioctl36%of 184
bpf_jit_freecode---of 3
bpf_jit_generate---of 3
bpf_kqfilter---of 3
bpf_mcpy---of 5
bpf_modcmd---of 4
bpf_mtap_si---of 6
bpf_poll37%of 11
bpf_read14%of 15
bpf_stat100%of 1
bpf_stats---of 1
bpf_sysctl_gstats_handler---of 1
bpf_timed_out---of 5
bpf_write6%of 55
bpfilterattach---of 1
bpfopen70%of 13
filt_bpfrdetach---of 1
filt_bpfread---of 3
sysctl_net_bpf_maxbufsize---of 4
sysctl_net_bpf_peers---of 17
sysctl_net_bpf_setup---of 3
x86_curlwp100%of 1
-----------
SUMMARY32%of 331
mutex_obj_alloc100%of 1
mutex_obj_free72%of 7
mutex_obj_hold60%of 5
mutex_obj_refcnt---of 1
mutex_obj_tryalloc---of 3
-----------
SUMMARY70%of 13
semctl135%of 95
semexit---of 16
semfini---of 7
seminit---of 9
seminit_exithook100%of 1
semrealloc---of 52
semu_alloc19%of 16
semundo_adjust27%of 19
semundo_clear---of 15
sys_____semctl5093%of 14
sys_semconfig---of 3
sys_semget68%of 25
sys_semop29%of 73
sysctl_ipc_sem_setup---of 3
sysctl_ipc_semmni---of 3
sysctl_ipc_semmns---of 3
sysctl_ipc_semmnu---of 3
-----------
SUMMARY39%of 243
_mountlist_iterator_next50%of 24
_mountlist_next---of 12
dounmount59%of 29
makefstype75%of 4
mount_domount47%of 56
mount_finispecific---of 1
mount_getspecific100%of 1
mount_initspecific---of 3
mount_setspecific---of 1
mount_specific_key_create---of 1
mount_specific_key_delete---of 1
mountlist_append---of 3
mountlist_iterator_destroy50%of 10
mountlist_iterator_init50%of 4
mountlist_iterator_next100%of 1
mountlist_iterator_trynext---of 1
mountlist_remove47%of 13
rawdev_mounted63%of 16
vflush57%of 30
vfs_busy43%of 7
vfs_getnewfsid64%of 11
vfs_getvfs60%of 10
vfs_insmntque59%of 17
vfs_mount_sysinit---of 1
vfs_mountalloc60%of 5
vfs_mountedon---of 5
vfs_mountroot---of 36
vfs_ref50%of 4
vfs_rele67%of 6
vfs_rootmountalloc---of 7
vfs_set_lowermount39%of 13
vfs_shutdown---of 3
vfs_sync_all---of 1
vfs_trybusy---of 8
vfs_unbusy67%of 3
vfs_unmount_forceone---of 4
vfs_unmount_next---of 14
vfs_unmountall---of 1
vfs_unmountall1---of 15
vfs_vnode_iterator_destroy30%of 10
vfs_vnode_iterator_init50%of 4
vfs_vnode_iterator_next100%of 1
vfs_vnode_iterator_next150%of 24
x86_curlwp100%of 1
-----------
SUMMARY53%of 304
ktd_callout---of 1
ktdrel64%of 11
ktealloc100%of 4
ktesethdrlen---of 1
ktr_csw48%of 17
ktr_emul---of 4
ktr_execarg---of 4
ktr_execenv---of 4
ktr_execfd---of 4
ktr_genio100%of 3
ktr_geniov100%of 3
ktr_io60%of 15
ktr_kuser75%of 4
ktr_mib---of 4
ktr_mibio100%of 3
ktr_namei75%of 4
ktr_namei2---of 4
ktr_point100%of 1
ktr_psig---of 6
ktr_syscall60%of 10
ktr_sysret80%of 5
ktrace_common68%of 76
ktrace_listener_cb89%of 9
ktrace_thread---of 40
ktraddentry25%of 28
ktradref60%of 5
ktrcanset60%of 5
ktrderef---of 5
ktrderefall47%of 15
ktrinit---of 1
ktrops68%of 25
ktruser---of 9
sys_fktrace75%of 4
sys_utrace---of 1
x86_curlwp100%of 1
-----------
SUMMARY63%of 248
sysctl_kern_threadpool_idle_ms---of 4
sysctl_threadpool_setup---of 5
threadnamesuffix---of 5
threadpool_cancel_job---of 7
threadpool_cancel_job_async---of 16
threadpool_create---of 23
threadpool_destroy---of 27
threadpool_dispatcher_thread---of 71
threadpool_get---of 25
threadpool_job_destroy---of 12
threadpool_job_done---of 13
threadpool_job_init---of 1
threadpool_percpu_fini---of 3
threadpool_percpu_get---of 25
threadpool_percpu_init---of 4
threadpool_percpu_ok---of 3
threadpool_percpu_put---of 27
threadpool_percpu_ref---of 1
threadpool_percpu_ref_remote---of 1
threadpool_put---of 27
threadpool_schedule_job35%of 23
threadpool_thread---of 44
threadpools_init---of 1
x86_curlwp---of 1
-----------
SUMMARY35%of 23
sd_diskstart48%of 21
sd_dumpblocks---of 1
sd_firstopen---of 13
sd_flush---of 4
sd_get_parms---of 47
sd_get_parms_page4---of 13
sd_get_parms_page5---of 14
sd_interpret_sense---of 20
sd_iosize---of 5
sd_label---of 3
sd_lastclose---of 8
sd_read_capacity---of 6
sd_shutdown---of 4
sd_suspend---of 6
sdattach---of 18
sdclose---of 1
sddetach---of 4
sddone---of 6
sddump---of 5
sdioctl---of 41
sdmatch---of 1
sdminphys40%of 5
sdopen34%of 6
sdread---of 1
sdrestart---of 1
sdsize---of 4
sdstart---of 1
sdstrategy50%of 4
sdwrite---of 1
-----------
SUMMARY45%of 36
ext2fs_cgupdate---of 15
ext2fs_done---of 1
ext2fs_fhtovp---of 9
ext2fs_flushfiles---of 1
ext2fs_init---of 1
ext2fs_init_vnode---of 5
ext2fs_loadvnode---of 9
ext2fs_loadvnode_content---of 13
ext2fs_modcmd---of 4
ext2fs_mount12%of 42
ext2fs_mountfs---of 23
ext2fs_mountroot---of 7
ext2fs_newvnode---of 34
ext2fs_reinit---of 1
ext2fs_reload---of 25
ext2fs_sbfill---of 23
ext2fs_sbupdate---of 3
ext2fs_set_inode_guid---of 3
ext2fs_statvfs---of 15
ext2fs_sync---of 15
ext2fs_sync_selector---of 9
ext2fs_sysctl_setup---of 1
ext2fs_unmount---of 8
ext2fs_vptofh---of 3
x86_curlwp100%of 1
-----------
SUMMARY14%of 43
ld_rbto_compare_key100%of 1
ld_rbto_compare_nodes100%of 1
lockdebug_abort---of 3
lockdebug_abort1---of 8
lockdebug_alloc38%of 27
lockdebug_barrier29%of 21
lockdebug_dismiss---of 1
lockdebug_dump---of 5
lockdebug_free47%of 13
lockdebug_lock_print---of 10
lockdebug_locked55%of 11
lockdebug_mem_check34%of 6
lockdebug_more39%of 13
lockdebug_show_all_locks---of 27
lockdebug_show_lockstats---of 9
lockdebug_unlocked57%of 30
lockdebug_wantlock71%of 17
x86_curlwp100%of 1
-----------
SUMMARY48%of 141
compat_43_sys_creat100%of 1
compat_43_sys_fstat100%of 3
compat_43_sys_ftruncate100%of 1
compat_43_sys_getdirentries50%of 24
compat_43_sys_lseek100%of 1
compat_43_sys_lstat100%of 3
compat_43_sys_quota---of 1
compat_43_sys_stat100%of 3
compat_43_sys_truncate---of 1
cvtstat40%of 10
vfs_syscalls_43_fini---of 1
vfs_syscalls_43_init---of 1
x86_curlwp---of 1
-----------
SUMMARY61%of 46
-----------
SUMMARY---of 0
extensions_modcmd---of 7
secmodel_extensions_network_cb34%of 6
secmodel_extensions_process_cb42%of 12
sysctl_extensions_curtain_handler---of 6
sysctl_extensions_user_handler---of 6
sysctl_security_extensions_setup---of 1
-----------
SUMMARY39%of 18
popcount32 100%of 1
-----------
SUMMARY100%of 1
cv_broadcast80%of 5
cv_destroy60%of 5
cv_enter56%of 9
cv_has_waiters100%of 1
cv_init67%of 3
cv_is_valid100%of 1
cv_signal80%of 5
cv_timedwait67%of 3
cv_timedwait_sig67%of 3
cv_timedwaitbt---of 20
cv_timedwaitbt_sig---of 20
cv_unsleep58%of 7
cv_wait67%of 3
cv_wait_sig67%of 3
cv_wakeup_all60%of 10
cv_wakeup_one50%of 8
x86_curlwp100%of 1
-----------
SUMMARY65%of 67
scsi_async_event_xfer_mode---of 21
scsi_change_def---of 1
scsi_fc_sas_async_event_xfer_mode---of 10
scsi_kill_pending---of 6
scsi_print_addr---of 3
scsi_scsipi_cmd67%of 3
-----------
SUMMARY67%of 3
hci_ioctl_pcb5%of 41
-----------
SUMMARY5%of 41
change_owner86%of 7
change_root100%of 5
chdir_lookup84%of 6
do_fhstat63%of 8
do_fhstatvfs63%of 8
do_open100%of 9
do_posix_mknodat---of 3
do_sys_accessat100%of 15
do_sys_chdir---of 3
do_sys_chmodat100%of 6
do_sys_chownat100%of 6
do_sys_fchdir42%of 12
do_sys_fstatvfs100%of 3
do_sys_getvfsstat73%of 11
do_sys_linkat84%of 18
do_sys_mkdir---of 1
do_sys_mkdirat84%of 12
do_sys_mkfifoat80%of 10
do_sys_mknod100%of 1
do_sys_mknodat80%of 25
do_sys_mount70%of 70
do_sys_openat80%of 15
do_sys_pstatvfs100%of 3
do_sys_quotactl---of 46
do_sys_readlinkat65%of 14
do_sys_rename---of 1
do_sys_renameat65%of 80
do_sys_stat100%of 1
do_sys_statat89%of 9
do_sys_symlink---of 1
do_sys_symlinkat70%of 20
do_sys_sync12%of 9
do_sys_unlink---of 1
do_sys_unlinkat70%of 20
do_sys_utimens---of 1
do_sys_utimensat77%of 30
do_sys_utimes55%of 11
dofhopen34%of 21
dorevoke---of 4
dostatvfs67%of 18
fd_open---of 4
filt_fs---of 4
filt_fsattach---of 1
filt_fsdetach---of 1
kern_pathconf100%of 4
open_setfp100%of 6
sync_vnode_filter---of 3
sys___fhopen40 100%of 1
sys___fhstat50 67%of 3
sys___fhstatvfs190 67%of 3
sys___fstatvfs190---of 4
sys___futimes50 50%of 10
sys___getdents30 100%of 5
sys___getfh30 89%of 9
sys___getvfsstat90 64%of 11
sys___lstat50 100%of 3
sys___lutimes50 56%of 9
sys___mknod50 100%of 3
sys___mount50 100%of 1
sys___posix_chown100%of 3
sys___posix_fchown67%of 3
sys___posix_lchown67%of 3
sys___posix_rename100%of 1
sys___quotactl---of 3
sys___stat50 100%of 3
sys___statvfs190---of 4
sys___utimes50 78%of 9
sys_access100%of 1
sys_chdir100%of 3
sys_chflags100%of 3
sys_chmod100%of 1
sys_chown100%of 3
sys_chroot100%of 4
sys_faccessat100%of 1
sys_fchdir100%of 1
sys_fchflags100%of 3
sys_fchmod100%of 3
sys_fchmodat100%of 1
sys_fchown100%of 3
sys_fchownat100%of 1
sys_fchroot67%of 6
sys_fdatasync100%of 3
sys_fdiscard---of 6
sys_fstatat100%of 3
sys_fsync100%of 3
sys_fsync_range---of 8
sys_ftruncate75%of 4
sys_futimens---of 3
sys_lchflags100%of 3
sys_lchmod100%of 3
sys_lchown100%of 3
sys_link100%of 1
sys_linkat100%of 1
sys_lpathconf---of 1
sys_lseek80%of 5
sys_mkdir100%of 1
sys_mkdirat100%of 1
sys_mkfifo---of 1
sys_mkfifoat---of 1
sys_mknodat100%of 3
sys_open100%of 3
sys_openat100%of 3
sys_pathconf100%of 1
sys_posix_fallocate---of 6
sys_pread84%of 6
sys_preadv100%of 1
sys_pwrite67%of 6
sys_pwritev100%of 1
sys_readlink100%of 1
sys_readlinkat100%of 1
sys_rename100%of 1
sys_renameat100%of 1
sys_revoke---of 3
sys_rmdir100%of 1
sys_symlink100%of 1
sys_symlinkat100%of 1
sys_sync100%of 1
sys_truncate86%of 7
sys_umask100%of 1
sys_undelete55%of 11
sys_unlink100%of 1
sys_unlinkat100%of 1
sys_unmount89%of 9
sys_utimensat100%of 1
vfs_composefh67%of 6
vfs_composefh_alloc50%of 8
vfs_composefh_free---of 1
vfs_copyinfh_alloc72%of 7
vfs_copyinfh_free---of 1
vfs_evfilt_fs_init---of 1
vfs_fhtovp---of 4
vfs_syncwait---of 14
x86_curlwp100%of 1
-----------
SUMMARY75%of 723
-----------
SUMMARY---of 0
_prop_number_alloc50%of 8
_prop_number_equals---of 9
_prop_number_externalize---of 4
_prop_number_free100%of 1
_prop_number_init---of 1
_prop_number_internalize---of 9
_prop_number_lock67%of 3
_prop_number_rb_compare_key43%of 7
_prop_number_rb_compare_nodes43%of 7
_prop_number_unlock100%of 1
prop_number_copy---of 4
prop_number_create_integer---of 1
prop_number_create_signed---of 1
prop_number_create_unsigned100%of 1
prop_number_create_unsigned_integer---of 1
prop_number_equals---of 5
prop_number_equals_integer---of 6
prop_number_equals_signed---of 6
prop_number_equals_unsigned---of 6
prop_number_equals_unsigned_integer---of 6
prop_number_int16_value---of 7
prop_number_int32_value---of 7
prop_number_int64_value---of 5
prop_number_int8_value---of 7
prop_number_int_value---of 7
prop_number_integer_value---of 4
prop_number_intptr_value---of 5
prop_number_long_value---of 5
prop_number_longlong_value---of 5
prop_number_schar_value---of 7
prop_number_short_value---of 7
prop_number_signed_value---of 4
prop_number_size---of 9
prop_number_uchar_value---of 7
prop_number_uint16_value---of 7
prop_number_uint32_value---of 7
prop_number_uint64_value---of 5
prop_number_uint8_value---of 7
prop_number_uint_value---of 7
prop_number_uintptr_value---of 5
prop_number_ulong_value---of 5
prop_number_ulonglong_value---of 5
prop_number_unsigned---of 1
prop_number_unsigned_integer_value---of 4
prop_number_unsigned_value---of 4
prop_number_ushort_value---of 7
-----------
SUMMARY54%of 28
sack_dump---of 4
sack_removehole---of 7
tcp_del_sackholes---of 7
tcp_free_sackholes50%of 6
tcp_new_dsack---of 3
tcp_sack_adjust25%of 8
tcp_sack_init---of 1
tcp_sack_numblks75%of 4
tcp_sack_option---of 48
tcp_sack_output---of 8
-----------
SUMMARY45%of 18
_uvm_map_sanity---of 9
_uvm_tree_sanity---of 28
sysctl_user_va0_disable---of 5
sysctl_uvmmap_setup---of 1
sysctl_vmproc---of 44
uvm_map62%of 13
uvm_map_advice48%of 21
uvm_map_checkprot---of 8
uvm_map_clean48%of 67
uvm_map_clip_end30%of 47
uvm_map_clip_start30%of 47
uvm_map_compare_key---of 3
uvm_map_compare_nodes78%of 9
uvm_map_enter53%of 141
uvm_map_extract27%of 122
uvm_map_findspace50%of 173
uvm_map_inherit72%of 21
uvm_map_init---of 1
uvm_map_init_caches---of 1
uvm_map_lock_entry100%of 5
uvm_map_lookup_entry48%of 51
uvm_map_pageable51%of 113
uvm_map_pageable_all54%of 80
uvm_map_prepare46%of 53
uvm_map_printit---of 5
uvm_map_protect79%of 46
uvm_map_protect_user100%of 3
uvm_map_reference---of 1
uvm_map_replace14%of 65
uvm_map_reserve---of 1
uvm_map_setup---of 1
uvm_map_space_avail62%of 21
uvm_map_submap---of 19
uvm_map_unlock_entry100%of 5
uvm_map_willneed75%of 12
uvm_mapent_clone56%of 18
uvm_mapent_splitadj58%of 14
uvm_mapent_trymerge41%of 117
uvm_rb_fixup74%of 26
uvm_rb_insert63%of 8
uvm_rb_remove86%of 21
uvm_unmap1 67%of 12
uvm_unmap_detach70%of 13
uvm_unmap_remove50%of 82
uvm_voaddr_acquire---of 45
uvm_voaddr_compare---of 11
uvm_voaddr_release---of 12
uvm_whatis---of 9
uvmspace_addref60%of 5
uvmspace_alloc---of 1
uvmspace_exec---of 12
uvmspace_fork70%of 42
uvmspace_free17%of 12
uvmspace_init---of 3
uvmspace_share60%of 5
uvmspace_spawn---of 1
vm_map_busy---of 5
vm_map_lock86%of 7
vm_map_lock_read100%of 1
vm_map_lock_try---of 4
vm_map_locked_p---of 1
vm_map_unbusy---of 3
vm_map_unlock50%of 6
vm_map_unlock_read100%of 1
x86_curlwp100%of 1
-----------
SUMMARY49%of 1506
pmap_pv_init---of 1
pmap_pv_track---of 8
pmap_pv_tracked25%of 12
pmap_pv_untrack---of 13
-----------
SUMMARY25%of 12
file_free---of 11
fileassoc_add---of 53
fileassoc_clear---of 4
fileassoc_decuse---of 7
fileassoc_deregister---of 6
fileassoc_file_delete23%of 9
fileassoc_file_lookup---of 23
fileassoc_init---of 3
fileassoc_lookup---of 3
fileassoc_register---of 8
fileassoc_table_clear---of 14
fileassoc_table_delete---of 12
fileassoc_table_run---of 14
table_dtor---of 7
-----------
SUMMARY23%of 9
compat_60_sys__lwp_park100%of 7
kern_time_60_fini---of 1
kern_time_60_init---of 1
-----------
SUMMARY100%of 7
compat_43_sys_getpagesize---of 1
compat_43_sys_mmap100%of 1
vm_43_fini---of 1
vm_43_init---of 1
-----------
SUMMARY100%of 1
uarea_poolpage_alloc60%of 5
uarea_poolpage_free---of 3
uarea_system_poolpage_alloc67%of 3
uarea_system_poolpage_free---of 3
uvm_idle---of 3
uvm_init_limits---of 3
uvm_kernacc---of 1
uvm_lwp_exit100%of 1
uvm_lwp_fork100%of 1
uvm_lwp_getuarea100%of 1
uvm_lwp_setuarea100%of 1
uvm_proc_exit---of 7
uvm_proc_fork100%of 3
uvm_scheduler---of 1
uvm_uarea_alloc100%of 1
uvm_uarea_free---of 1
uvm_uarea_init---of 1
uvm_uarea_system_alloc100%of 1
uvm_uarea_system_free---of 1
uvm_vslock100%of 1
uvm_vsunlock100%of 1
x86_curlwp---of 1
-----------
SUMMARY85%of 19
filt_wseventrdetach---of 1
filt_wseventread---of 4
sysctl_wsevent_setup---of 3
wsevent_fini---of 3
wsevent_init67%of 3
wsevent_inject46%of 11
wsevent_intr---of 3
wsevent_kqfilter---of 3
wsevent_poll50%of 4
wsevent_read12%of 17
wsevent_setversion50%of 4
wsevent_wakeup---of 5
-----------
SUMMARY34%of 39
specificdata_domain_create---of 1
specificdata_fini13%of 16
specificdata_getspecific50%of 4
specificdata_getspecific_unlocked50%of 4
specificdata_init100%of 1
specificdata_key_create---of 8
specificdata_key_delete---of 12
specificdata_noop_dtor---of 1
specificdata_setspecific---of 17
-----------
SUMMARY29%of 25
l2cap_abort_wrapper---of 8
l2cap_accept_wrapper---of 7
l2cap_attach_wrapper58%of 14
l2cap_bind_wrapper---of 9
l2cap_complete---of 10
l2cap_connect2_wrapper---of 3
l2cap_connect_wrapper---of 9
l2cap_connected---of 1
l2cap_connecting---of 1
l2cap_ctloutput---of 6
l2cap_detach_wrapper60%of 5
l2cap_disconnect_wrapper---of 5
l2cap_disconnected---of 1
l2cap_input---of 13
l2cap_ioctl_wrapper100%of 1
l2cap_linkmode---of 5
l2cap_listen_wrapper60%of 5
l2cap_newconn---of 3
l2cap_peeraddr_wrapper---of 7
l2cap_purgeif_wrapper---of 1
l2cap_rcvd_wrapper---of 3
l2cap_recvoob_wrapper67%of 3
l2cap_send_wrapper---of 14
l2cap_sendoob_wrapper---of 3
l2cap_shutdown_wrapper---of 3
l2cap_sockaddr_wrapper58%of 7
l2cap_stat_wrapper---of 3
-----------
SUMMARY60%of 35
in6_cksum50%of 16
-----------
SUMMARY50%of 16
exec_read---of 4
exec_setup_stack---of 12
kill_vmcmds29%of 7
new_vmcmd---of 11
vmcmd_map_pagedvn---of 13
vmcmd_map_readvn---of 4
vmcmd_map_zero---of 5
vmcmd_readvn---of 10
vmcmdset_extend---of 4
x86_curlwp---of 1
-----------
SUMMARY29%of 7
genfs_renamelock_enter100%of 1
genfs_renamelock_exit100%of 1
genfs_statvfs100%of 1
genfs_suspendctl50%of 6
-----------
SUMMARY67%of 9
do_filereadv71%of 34
do_filewritev69%of 38
dofileread60%of 10
dofilewrite50%of 14
sys_ioctl70%of 36
sys_read100%of 4
sys_readv100%of 1
sys_write100%of 4
sys_writev100%of 1
x86_curlwp100%of 1
-----------
SUMMARY70%of 143
-----------
SUMMARY---of 0
localcount_acquire60%of 5
localcount_debug_refcnt---of 1
localcount_drain40%of 10
localcount_fini67%of 3
localcount_init100%of 1
localcount_release43%of 7
localcount_xc---of 1
-----------
SUMMARY50%of 26
uvm_io39%of 13
-----------
SUMMARY39%of 13
compat_50_sys___msgctl13 100%of 6
-----------
SUMMARY100%of 6
sys___msync13 72%of 7
sys_madvise90%of 10
sys_mincore68%of 40
sys_minherit50%of 4
sys_mlock50%of 6
sys_mlockall100%of 3
sys_mmap66%of 32
sys_mprotect75%of 4
sys_munlock75%of 4
sys_munlockall100%of 1
sys_munmap67%of 6
uvm_default_mapaddr67%of 3
uvm_mmap68%of 34
uvm_mmap_anon---of 3
uvm_mmap_dev---of 7
x86_curlwp100%of 1
-----------
SUMMARY70%of 155
process_dodbregs40%of 10
process_dofpregs---of 10
process_doregs40%of 10
process_read_lwpstatus---of 1
process_validdbregs100%of 1
process_validfpregs---of 1
process_validregs100%of 1
ptrace_read_lwpstatus50%of 4
ptrace_update_lwp43%of 7
-----------
SUMMARY46%of 33
-----------
SUMMARY---of 0
uvm_vnp_setsize55%of 11
uvm_vnp_setwritesize50%of 12
uvn_detach100%of 1
uvn_findpage57%of 37
uvn_findpages89%of 17
uvn_get62%of 13
uvn_markdirty60%of 5
uvn_put67%of 3
uvn_reference100%of 1
uvn_text_p67%of 3
-----------
SUMMARY64%of 103
pci_devioctl---of 5
pciioctl67%of 27
pcimmap32%of 22
pciopen100%of 1
-----------
SUMMARY52%of 50
dead_newvnode56%of 9
-----------
SUMMARY56%of 9
uvm_page_physload---of 7
uvm_page_physunload---of 25
uvm_page_physunload_force---of 16
uvm_phys_to_vm_page75%of 8
uvm_physseg_find50%of 8
uvm_physseg_get_avail_end---of 5
uvm_physseg_get_avail_start---of 5
uvm_physseg_get_end---of 5
uvm_physseg_get_first---of 1
uvm_physseg_get_free_list40%of 5
uvm_physseg_get_highest_frame---of 8
uvm_physseg_get_last---of 1
uvm_physseg_get_next---of 4
uvm_physseg_get_pg---of 5
uvm_physseg_get_prev---of 4
uvm_physseg_get_start---of 5
uvm_physseg_get_start_hint---of 5
uvm_physseg_init---of 1
uvm_physseg_init_seg---of 14
uvm_physseg_plug---of 30
uvm_physseg_seg_alloc_from_slab---of 15
uvm_physseg_seg_chomp_slab---of 5
uvm_physseg_set_start_hint---of 5
uvm_physseg_unplug---of 30
uvm_physseg_valid_p---of 4
-----------
SUMMARY58%of 21
-----------
SUMMARY---of 0
rt_addaddr100%of 1
rt_assert_inactive---of 3
rt_deladdr50%of 4
rt_gettable67%of 3
rt_inithead---of 3
rt_lookup67%of 3
rt_matchaddr100%of 3
rt_refines---of 1
rt_walktree_visitor---of 1
rtbl_init---of 6
rtbl_search_matched_entry---of 3
rtbl_walktree---of 3
-----------
SUMMARY72%of 14
child_psignal58%of 7
coredump_elf32---of 3
coredump_elf64---of 3
coredump_netbsd---of 3
coredump_netbsd32---of 3
eventswitch---of 36
eventswitchchild---of 3
execsigs---of 17
filt_sigattach---of 1
filt_sigdetach---of 1
filt_signal---of 4
getucontext---of 5
issignal---of 47
killpg1---of 24
killproc---of 3
kpgsignal---of 13
kpsendsig---of 3
kpsignal48%of 17
kpsignal2 33%of 99
ksiginfo_alloc50%of 6
ksiginfo_exechook---of 8
ksiginfo_free---of 3
ksiginfo_queue_drain0 34%of 12
pgsignal---of 5
postsig---of 25
proc_stop_callout---of 14
proc_stop_done---of 12
proc_stop_lwps---of 11
proc_stoptrace---of 29
proc_unstop40%of 15
psignal60%of 5
sendsig29%of 7
sendsig_reset43%of 7
setucontext---of 9
sigacts_ctor100%of 1
sigacts_poolpage_alloc---of 1
sigacts_poolpage_free---of 1
sigactsfree---of 3
sigactsinit100%of 3
sigactsunshare---of 4
sigchecktrace---of 8
sigclear53%of 21
sigclearall---of 6
sigexit---of 25
sigget---of 11
siggetinfo---of 24
siginit---of 13
sigismasked---of 3
sigispending40%of 5
signal_init---of 1
signal_listener_cb34%of 6
signotify67%of 3
sigpost27%of 34
sigput42%of 17
sigswitch---of 30
sigswitch_unlock_and_switch_away---of 17
trapsignal22%of 28
x86_curlwp100%of 1
-----------
SUMMARY38%of 294
addupc_intr---of 6
addupc_task---of 8
sys_profil100%of 4
-----------
SUMMARY100%of 4
kill1---of 18
sigaction1---of 50
sigaltstack1---of 9
sigpending1---of 1
sigprocmask1---of 13
sigsuspend1---of 5
sigsuspendsetup67%of 3
sigsuspendteardown50%of 4
sigtimedwait1---of 41
sys_____sigtimedwait50---of 41
sys___sigaction_sigtramp---of 8
sys___sigaltstack14---of 15
sys___sigpending14---of 1
sys___sigprocmask14---of 8
sys___sigsuspend14---of 7
sys_getcontext---of 1
sys_kill---of 1
sys_setcontext---of 4
sys_sigqueueinfo---of 3
-----------
SUMMARY58%of 7
check_exec39%of 36
check_posix_spawn50%of 6
copyargs---of 15
do_posix_spawn8%of 42
exec_add---of 31
exec_free_emul_arg---of 5
exec_init---of 33
exec_makepathbuf80%of 10
exec_pool_alloc---of 1
exec_pool_free---of 1
exec_remove---of 29
exec_sigcode_alloc---of 10
exec_sigcode_free---of 8
exec_vm_minaddr---of 1
execve1---of 3
execve_fetch_element---of 1
execve_free_data---of 9
execve_loadvm29%of 59
execve_runproc---of 129
posix_spawn_fa_free---of 9
spawn_exec_data_release25%of 12
spawn_return---of 59
sys_execve67%of 3
sys_fexecve---of 3
sys_posix_spawn70%of 43
x86_curlwp---of 1
-----------
SUMMARY38%of 211
-----------
SUMMARY---of 0
module_hook_exit100%of 1
module_hook_init---of 1
module_hook_set---of 5
module_hook_tryenter100%of 3
module_hook_unset---of 5
-----------
SUMMARY100%of 4
l2cap_attach_pcb56%of 9
l2cap_bind_pcb---of 3
l2cap_connect_pcb---of 24
l2cap_detach_pcb34%of 12
l2cap_disconnect_pcb---of 9
l2cap_getopt---of 8
l2cap_listen_pcb6%of 35
l2cap_peeraddr_pcb---of 1
l2cap_send_pcb---of 14
l2cap_setopt---of 9
l2cap_sockaddr_pcb100%of 1
-----------
SUMMARY22%of 57
in6_control28%of 229
in6_domifattach100%of 1
in6_domifdetach---of 1
in6_if_down100%of 1
in6_if_link_down53%of 17
in6_if_link_state_change---of 3
in6_if_link_up40%of 23
in6_if_up100%of 1
in6_ifaddlocal---of 5
in6_ifawithifp53%of 34
in6_ifremlocal---of 13
in6_ifremprefix---of 45
in6_in_2_v4mapin6 100%of 1
in6_init---of 1
in6_is_addr_deprecated---of 10
in6_lltable_create---of 21
in6_lltable_delete---of 11
in6_lltable_destroy_lle---of 3
in6_lltable_dump_entry---of 5
in6_lltable_fill_sa_entry---of 1
in6_lltable_free_entry---of 3
in6_lltable_hash---of 1
in6_lltable_lookup37%of 11
in6_lltable_match_prefix---of 8
in6_localaddr73%of 18
in6_mask2len21%of 24
in6_matchlen58%of 21
in6_prefixlen2mask---of 11
in6_purge_mcast_references59%of 12
in6_purgeaddr---of 28
in6_purgeif---of 1
in6_rt_ifa_matcher---of 1
in6_sin6_2_sin---of 1
in6_sin6_2_sin_in_sock---of 1
in6_sin_2_v4mapsin6---of 1
in6_sin_2_v4mapsin6_in_sock---of 1
in6_tunnel_validate---of 3
in6_update_ifa---of 1
in6_update_ifa1---of 247
in6ifa_ifpforlinklocal54%of 15
in6ifa_ifpforlinklocal_psref100%of 3
in6ifa_ifpwithaddr62%of 13
in6ifa_ifpwithaddr_psref---of 3
in6ifa_ifwithaddr---of 11
x86_curlwp100%of 1
-----------
SUMMARY39%of 425
cwdexec---of 5
cwdfree34%of 6
cwdinit72%of 7
cwdshare100%of 1
cwdunshare---of 3
x86_curlwp100%of 1
-----------
SUMMARY60%of 15
addrsel_policy_init---of 1
in6_selectroute47%of 30
in6_selectsrc42%of 157
in6_src_ioctl9%of 24
in6pcb_selecthlim67%of 6
in6pcb_selecthlim_rt67%of 6
in6pcb_set_port50%of 4
sysctl_net_inet6_addrctlpolicy---of 13
x86_curlwp100%of 1
-----------
SUMMARY41%of 228
bintime---of 6
binuptime63%of 8
dtrace_getnanotime---of 4
dummy_get_timecount---of 1
getbinboottime---of 6
getbintime---of 8
getbinuptime---of 4
getmicroboottime50%of 6
getmicrotime50%of 4
getmicrouptime50%of 4
getnanoboottime---of 6
getnanotime50%of 4
getnanouptime50%of 4
inittimecounter---of 3
microtime40%of 10
microuptime50%of 6
nanotime40%of 10
nanouptime50%of 6
pps_capture---of 7
pps_event---of 1
pps_init67%of 3
pps_ioctl58%of 14
pps_ref_event---of 23
sysctl_kern_timecounter_choice---of 11
sysctl_kern_timecounter_hardware---of 14
sysctl_timecounter_setup---of 3
tc_detach---of 17
tc_getfrequency100%of 1
tc_gonebad---of 1
tc_init---of 14
tc_setclock43%of 7
tc_ticktock---of 12
tc_windup64%of 19
x86_curlwp100%of 1
-----------
SUMMARY54%of 107
in_addmulti34%of 15
in_addrhash_insert---of 1
in_addrhash_insert_locked50%of 6
in_addrhash_remove---of 5
in_addrhash_remove_locked45%of 18
in_broadcast35%of 23
in_canforward---of 5
in_control67%of 3
in_control0 19%of 224
in_delmulti29%of 7
in_direct---of 25
in_domifattach100%of 1
in_domifdetach---of 1
in_first_multi---of 9
in_get_ia_from_ifp_psref42%of 12
in_if_down100%of 1
in_if_link_down57%of 16
in_if_link_state_change---of 3
in_if_link_up44%of 23
in_if_up100%of 1
in_ifinit48%of 63
in_init---of 1
in_len2mask---of 10
in_lltable_create---of 31
in_lltable_delete---of 11
in_lltable_destroy_lle---of 3
in_lltable_dump_entry---of 5
in_lltable_fill_sa_entry---of 1
in_lltable_free_entry---of 5
in_lltable_hash---of 1
in_lltable_lookup37%of 11
in_lltable_match_prefix40%of 5
in_localaddr14%of 15
in_lookup_multi---of 9
in_multi_group27%of 19
in_multi_lock---of 1
in_multi_lock_held100%of 1
in_multi_unlock---of 1
in_multicast_sysctl---of 26
in_next_multi---of 7
in_purgeaddr---of 25
in_purgeif---of 1
in_rt_ifa_matcher---of 1
in_scrubaddr---of 21
in_scrubprefix46%of 22
in_selectsrc34%of 36
in_setmaxmtu55%of 11
in_socktrim---of 4
in_tunnel_validate---of 15
pslist_writer_insert_head50%of 12
x86_curlwp100%of 1
-----------
SUMMARY33%of 546
compat_50_sys___sigtimedwait---of 3
compat_50_sys__lwp_park86%of 7
compat_50_sys_wait4 100%of 7
kern_50_fini---of 1
kern_50_init---of 1
tscopyin---of 5
tscopyout---of 3
-----------
SUMMARY93%of 14
debug_init---of 5
freecheck_in29%of 7
freecheck_out23%of 9
-----------
SUMMARY25%of 16
md_attach_hook67%of 3
md_open_hook---of 1
md_root_setconf---of 1
-----------
SUMMARY67%of 3
ufsdirhash_add56%of 18
ufsdirhash_adjfree77%of 13
ufsdirhash_build35%of 80
ufsdirhash_checkblock10%of 21
ufsdirhash_dirtrunc35%of 20
ufsdirhash_done---of 3
ufsdirhash_enduseful50%of 8
ufsdirhash_findfree47%of 26
ufsdirhash_findslot65%of 17
ufsdirhash_free29%of 14
ufsdirhash_init---of 3
ufsdirhash_lookup44%of 64
ufsdirhash_move40%of 5
ufsdirhash_newblk38%of 8
ufsdirhash_remove56%of 18
ufsdirhash_sysctl_init---of 1
-----------
SUMMARY42%of 312
buildcontext---of 1
cpu_dump---of 15
cpu_dump_mempagecnt---of 8
cpu_dump_prep_sparse---of 1
cpu_dumpconf28%of 11
cpu_dumpsize---of 1
cpu_fsgs_reload---of 5
cpu_getmcontext67%of 3
cpu_init_idt---of 1
cpu_init_tss---of 1
cpu_mcontext_validate12%of 25
cpu_reboot---of 23
cpu_reset---of 1
cpu_segregs32_zero---of 7
cpu_segregs64_zero---of 7
cpu_setmcontext---of 12
cpu_startup---of 15
dodumpsys---of 30
dump_header_addbytes---of 4
dump_header_addseg---of 1
dump_header_finish---of 1
dump_header_flush---of 1
dump_header_start---of 1
dump_misc_init---of 15
dump_seg_count_range---of 1
dump_seg_iter---of 14
dump_seg_prep---of 13
dumpsys_seg---of 12
idt_vec_init_cpu_md---of 5
init_bootspace---of 1
init_slotspace---of 1
init_x86_64---of 13
mm_md_direct_mapped_io---of 1
mm_md_direct_mapped_phys100%of 1
mm_md_kernacc---of 17
reserve_dumppages---of 1
sendsig_siginfo46%of 11
set_mem_segment---of 1
set_sys_segment---of 1
setgate---of 1
setregion---of 1
setregs---of 1
sparse_dump_mark---of 18
sparse_dump_reset---of 1
unsetgate---of 1
x86_curlwp100%of 1
-----------
SUMMARY29%of 52
cn_set_tab---of 1
cnbell---of 4
cnclose---of 4
cnflush50%of 4
cngetc---of 5
cngetsn---of 22
cnhalt---of 4
cnioctl58%of 14
cnkqfilter---of 10
cnopen25%of 8
cnpoll40%of 10
cnpollc---of 6
cnputc75%of 4
cnread38%of 8
cnwrite40%of 10
cons_modcmd---of 4
nullcnpollc---of 1
-----------
SUMMARY45%of 58
module_autoload29%of 7
module_builtin_add---of 32
module_builtin_remove---of 16
module_builtin_require_force---of 4
module_compatible---of 1
module_do_builtin---of 49
module_do_load8%of 136
module_do_unload24%of 43
module_enqueue---of 11
module_error100%of 1
module_fetch_info---of 5
module_find_section---of 5
module_getspecific---of 1
module_hold---of 1
module_init---of 12
module_init_class---of 36
module_kernel---of 1
module_listener_cb100%of 1
module_load78%of 9
module_name---of 1
module_prime---of 17
module_print67%of 3
module_print_list---of 9
module_register_callbacks---of 5
module_rele---of 3
module_setspecific---of 1
module_source---of 1
module_specific_key_create---of 1
module_specific_key_delete---of 1
module_start_unload_thread---of 3
module_thread---of 15
module_thread_kick---of 1
module_unload100%of 3
module_unregister_callbacks---of 10
module_whatis---of 8
sysctl_module_autotime---of 4
x86_curlwp100%of 1
-----------
SUMMARY19%of 204
-----------
SUMMARY---of 0
mountnfs---of 17
nfs_decode_args---of 50
nfs_fhtovp---of 10
nfs_fsinfo---of 45
nfs_modcmd---of 4
nfs_mount18%of 17
nfs_mountroot---of 9
nfs_root---of 3
nfs_start---of 1
nfs_statvfs---of 42
nfs_sync---of 6
nfs_sync_selector---of 5
nfs_sysctl_init---of 1
nfs_unmount---of 13
nfs_vfs_done---of 1
nfs_vfs_init---of 1
nfs_vget---of 1
nfs_vptofh---of 3
sysctl_vfs_nfs_iothreads---of 3
x86_curlwp100%of 1
-----------
SUMMARY23%of 18
do_sched_getparam---of 8
do_sched_setparam---of 33
sched_init---of 3
sched_listener_cb29%of 7
sys__sched_getaffinity---of 8
sys__sched_getparam---of 5
sys__sched_protect---of 16
sys__sched_setaffinity---of 31
sys__sched_setparam---of 3
sys_sched_yield---of 1
x86_curlwp---of 1
-----------
SUMMARY29%of 7
_mutex_init50%of 10
mutex_abort---of 1
mutex_destroy56%of 9
mutex_dump---of 1
mutex_enter65%of 90
mutex_init100%of 1
mutex_ownable67%of 3
mutex_owned75%of 4
mutex_owner67%of 3
mutex_spin_retry---of 18
mutex_tryenter56%of 18
mutex_vector_exit57%of 23
x86_curlwp100%of 1
-----------
SUMMARY62%of 162
pollcommon100%of 9
sel_do_scan83%of 56
selclear64%of 11
selcommon96%of 23
seldestroy20%of 10
selinit100%of 1
selnotify29%of 28
selrecord67%of 9
selrecord_knote---of 1
selremove_knote---of 1
selsysinit---of 3
seltrue100%of 1
sys___pollts50---of 7
sys___pselect50---of 7
sys___select50 80%of 5
sys_poll100%of 3
sysctl_select_setup---of 1
x86_curlwp100%of 1
-----------
SUMMARY71%of 157
nd6_dad_find---of 22
nd6_dad_input---of 25
nd6_dad_start34%of 18
nd6_dad_stop40%of 10
nd6_dad_stoptimer---of 9
nd6_dad_timer---of 12
nd6_ifptomac---of 9
nd6_na_input---of 45
nd6_na_output---of 32
nd6_nbr_init---of 1
nd6_ns_input---of 83
nd6_ns_output---of 32
-----------
SUMMARY36%of 28
ttyerrpoll---of 1
ttyldisc_attach---of 42
ttyldisc_default---of 1
ttyldisc_detach---of 16
ttyldisc_init---of 4
ttyldisc_lookup43%of 7
ttyldisc_lookup_bynum43%of 7
ttyldisc_release40%of 5
ttynullioctl100%of 1
-----------
SUMMARY45%of 20
-----------
SUMMARY---of 0
ip_ctloutput42%of 129
ip_fragment---of 32
ip_freemoptions80%of 5
ip_get_membership39%of 13
ip_getmoptions10%of 21
ip_if_output67%of 6
ip_multicast_if38%of 16
ip_optcopy---of 17
ip_optlen75%of 4
ip_output38%of 193
ip_pktinfo_prepare---of 19
ip_setmoptions57%of 65
ip_setpktopts13%of 16
x86_curlwp100%of 1
-----------
SUMMARY41%of 469
sysmon_wdog_critpoll40%of 5
sysmon_wdog_find---of 5
sysmon_wdog_fini---of 4
sysmon_wdog_init---of 7
sysmon_wdog_ktickle---of 4
sysmon_wdog_modcmd---of 4
sysmon_wdog_ref---of 1
sysmon_wdog_register---of 10
sysmon_wdog_release---of 3
sysmon_wdog_setmode---of 12
sysmon_wdog_shutdown---of 4
sysmon_wdog_unregister---of 10
sysmonclose_wdog---of 5
sysmonioctl_wdog19%of 27
sysmonopen_wdog100%of 1
wdog_preinit---of 1
-----------
SUMMARY25%of 33
uvm_page_array_advance60%of 5
uvm_page_array_clear67%of 3
uvm_page_array_fill63%of 37
uvm_page_array_fill_and_peek72%of 7
uvm_page_array_fini100%of 1
uvm_page_array_init100%of 1
uvm_page_array_peek60%of 5
-----------
SUMMARY65%of 59
in_purgeifmcast---of 12
inpcb_bind50%of 10
inpcb_bind_port24%of 39
inpcb_bindableaddr28%of 18
inpcb_connect28%of 51
inpcb_create45%of 20
inpcb_destroy50%of 22
inpcb_disconnect---of 6
inpcb_fetch_peeraddr67%of 3
inpcb_fetch_sockaddr67%of 3
inpcb_init---of 3
inpcb_lookup20%of 26
inpcb_lookup_bound---of 26
inpcb_lookup_local22%of 33
inpcb_losing---of 9
inpcb_notify---of 11
inpcb_notifyall---of 8
inpcb_poolinit---of 1
inpcb_purgeif---of 9
inpcb_purgeif0---of 33
inpcb_rtchange---of 3
inpcb_rtentry75%of 4
inpcb_rtentry_unref100%of 1
inpcb_set_state48%of 19
x86_curlwp100%of 1
-----------
SUMMARY34%of 250
tmpfs_spec_close100%of 1
tmpfs_spec_read100%of 1
tmpfs_spec_write100%of 1
-----------
SUMMARY100%of 3
chkdq1 22%of 37
chkiq1 25%of 37
dq1get---of 16
dq1sync---of 8
q1sync---of 16
quota1_handle_cmd_get---of 11
quota1_handle_cmd_put---of 32
quota1_handle_cmd_quotaoff19%of 16
quota1_handle_cmd_quotaon8%of 28
quota1_umount---of 7
x86_curlwp---of 1
-----------
SUMMARY19%of 118
clock_secs_to_ymdhms53%of 19
clock_ymdhms_to_secs---of 28
-----------
SUMMARY53%of 19
-----------
SUMMARY---of 0
sysctl_kern_veriexec_algorithms---of 7
sysctl_kern_veriexec_setup---of 1
sysctl_kern_veriexec_strict---of 4
veriexec_convert---of 3
veriexec_dump---of 4
veriexec_file_add---of 35
veriexec_file_convert---of 3
veriexec_file_delete---of 6
veriexec_file_dump---of 3
veriexec_file_free---of 6
veriexec_file_purge_cb---of 3
veriexec_file_report---of 9
veriexec_file_verify---of 40
veriexec_flush---of 6
veriexec_fp_status---of 18
veriexec_fpops_add---of 26
veriexec_init---of 6
veriexec_listener_cb50%of 4
veriexec_lookup---of 1
veriexec_mountspecific_dtor---of 3
veriexec_openchk13%of 16
veriexec_purge---of 3
veriexec_raw_cb59%of 12
veriexec_removechk25%of 8
veriexec_renamechk16%of 13
veriexec_table_delete---of 4
veriexec_unmountchk19%of 11
veriexec_verify34%of 6
-----------
SUMMARY28%of 70
export3%of 80
free_netcred---of 4
mountd_set_exports_list46%of 37
netexport_check---of 14
netexport_clear36%of 14
netexport_fini---of 6
netexport_hasexports---of 1
netexport_init---of 1
netexport_rdlock---of 1
netexport_rdunlock---of 1
netexport_unmount59%of 12
nfs_export_update_30 75%of 4
x86_curlwp100%of 1
-----------
SUMMARY24%of 148
pcq_create---of 6
pcq_destroy---of 1
pcq_get---of 12
pcq_maxitems---of 1
pcq_peek---of 6
pcq_put45%of 9
-----------
SUMMARY45%of 9
-----------
SUMMARY---of 0
radio_attach_mi---of 1
radioattach---of 1
radioclose---of 3
radiodetach---of 1
radioioctl---of 10
radioopen40%of 5
radioprint---of 3
radioprobe---of 1
-----------
SUMMARY40%of 5
fault_close---of 1
fault_inject14%of 15
fault_ioctl---of 23
fault_lwp_free---of 3
fault_modcmd---of 4
fault_open---of 1
-----------
SUMMARY14%of 15
key_abort---of 3
key_accept---of 3
key_attach_wrapper48%of 17
key_bind_wrapper---of 3
key_connect2_wrapper---of 3
key_connect_wrapper---of 3
key_detach_wrapper---of 9
key_disconnect_wrapper60%of 5
key_init_so---of 1
key_ioctl_wrapper---of 1
key_listen_wrapper---of 3
key_output24%of 13
key_peeraddr_wrapper---of 9
key_pr_init---of 1
key_rcvd_wrapper---of 3
key_recvoob_wrapper---of 3
key_send_wrapper60%of 5
key_sendoob_wrapper---of 3
key_sendup0---of 24
key_sendup_mbuf---of 40
key_shutdown_wrapper67%of 3
key_sockaddr_wrapper56%of 9
key_stat_wrapper---of 3
-----------
SUMMARY47%of 52
genfs_compat_getpages---of 19
genfs_compat_gop_write---of 1
genfs_dio_iodone---of 6
genfs_directio30%of 34
genfs_do_io54%of 26
genfs_do_putpages60%of 132
genfs_getpages42%of 221
genfs_gop_putrange100%of 1
genfs_gop_write100%of 1
genfs_gop_write_rwmap---of 1
genfs_putpages100%of 1
x86_curlwp100%of 1
-----------
SUMMARY48%of 417
ffs_balloc14%of 243
-----------
SUMMARY14%of 243
VFS_EXTATTRCTL100%of 1
VFS_FHTOVP---of 5
VFS_MOUNT100%of 5
VFS_QUOTACTL60%of 5
VFS_ROOT100%of 5
VFS_SNAPSHOT---of 5
VFS_START100%of 5
VFS_STATVFS100%of 5
VFS_SUSPENDCTL100%of 5
VFS_SYNC100%of 5
VFS_UNMOUNT100%of 5
VFS_VPTOFH100%of 5
bdevvp100%of 1
bgetvp62%of 18
brelvp57%of 23
cdevvp---of 1
copy_statvfs_info100%of 3
printlockedvnodes---of 9
reassignbuf60%of 30
sched_sync---of 40
set_statvfs_info55%of 11
setrootfstime---of 1
sysctl_kern_vnode---of 13
sysctl_vfs_generic_fstypes---of 12
vattr_null100%of 1
vdevgone---of 10
vfinddev100%of 1
vflushbuf50%of 16
vfs_buf_print---of 1
vfs_getopsbyname60%of 5
vfs_mount_print---of 12
vfs_mount_print_all---of 4
vfs_syncer_add_to_worklist67%of 12
vfs_syncer_remove_from_worklist58%of 7
vfs_timestamp40%of 5
vfs_unixify_accmode100%of 4
vfs_vnode_lock_print---of 10
vfs_vnode_print---of 17
vinvalbuf30%of 24
vn_syncer_add1 31%of 13
vn_syncer_add_to_worklist60%of 5
vn_syncer_remove_from_worklist42%of 12
vntblinit---of 4
vprint---of 14
vstate_name---of 3
vtruncbuf60%of 20
vtype2dt100%of 1
x86_curlwp---of 1
-----------
SUMMARY62%of 258
do_ksem_init41%of 22
do_ksem_open52%of 35
do_ksem_wait82%of 11
ksem_close_fop75%of 4
ksem_create72%of 7
ksem_free50%of 14
ksem_get57%of 16
ksem_listener_cb75%of 4
ksem_modcmd---of 10
ksem_read_fop---of 3
ksem_release64%of 11
ksem_stat_fop---of 3
sys__ksem_close80%of 5
sys__ksem_destroy39%of 18
sys__ksem_getvalue75%of 4
sys__ksem_init100%of 1
sys__ksem_open100%of 1
sys__ksem_post72%of 7
sys__ksem_timedwait100%of 5
sys__ksem_trywait100%of 1
sys__ksem_unlink31%of 23
sys__ksem_wait100%of 1
x86_curlwp100%of 1
-----------
SUMMARY56%of 191
ufs_deleteextattr---of 7
ufs_extattr_autostart---of 17
ufs_extattr_disable---of 13
ufs_extattr_done---of 1
ufs_extattr_enable---of 23
ufs_extattr_enable_with_open---of 4
ufs_extattr_get_header---of 13
ufs_extattr_init---of 1
ufs_extattr_lookup---of 14
ufs_extattr_rm---of 15
ufs_extattr_start---of 11
ufs_extattr_stop---of 10
ufs_extattr_subdir---of 29
ufs_extattr_uepm_destroy---of 4
ufs_extattr_uepm_init---of 1
ufs_extattr_vnode_inactive20%of 10
ufs_extattrctl---of 19
ufs_getextattr---of 28
ufs_listextattr---of 28
ufs_setextattr---of 69
x86_curlwp---of 1
-----------
SUMMARY20%of 10
md_attach60%of 5
md_detach---of 5
md_set_disklabel---of 1
mdattach---of 3
mdclose---of 8
mdioctl12%of 27
mdopen31%of 13
mdread50%of 4
mdsize---of 4
mdstrategy---of 11
mdwrite---of 4
-----------
SUMMARY25%of 49
usb_match_device---of 7
usb_transfer_complete48%of 55
usbd_abort_default_pipe---of 1
usbd_abort_pipe67%of 3
usbd_ar_pipe34%of 30
usbd_clear_endpoint_stall---of 5
usbd_clear_endpoint_stall_async---of 1
usbd_clear_endpoint_stall_task---of 5
usbd_clear_endpoint_toggle---of 3
usbd_close_pipe40%of 15
usbd_create_xfer43%of 21
usbd_destroy_xfer---of 5
usbd_device2interface_handle---of 4
usbd_do_request100%of 1
usbd_do_request_flags100%of 1
usbd_do_request_len53%of 19
usbd_dopoll---of 1
usbd_endpoint_count---of 5
usbd_free_xfer67%of 6
usbd_get_buffer100%of 1
usbd_get_config_descriptor---of 3
usbd_get_device_descriptor---of 3
usbd_get_endpoint_descriptor---of 5
usbd_get_interface---of 1
usbd_get_interface_altindex---of 1
usbd_get_interface_descriptor---of 3
usbd_get_no_alts---of 10
usbd_get_pipe0---of 1
usbd_get_quirks---of 3
usbd_get_string---of 1
usbd_get_string0---of 18
usbd_get_xfer_status---of 9
usbd_interface2device_handle---of 1
usbd_interface2endpoint_descriptor---of 3
usbd_interface_count---of 3
usbd_open_pipe---of 1
usbd_open_pipe_intr---of 11
usbd_open_pipe_ival---of 14
usbd_pipe2device_handle---of 3
usbd_ratecheck---of 1
usbd_resume_pipe---of 3
usbd_set_interface---of 4
usbd_set_polling---of 9
usbd_setup_default_xfer---of 3
usbd_setup_isoc_xfer---of 8
usbd_setup_xfer---of 3
usbd_start_next50%of 20
usbd_suspend_pipe---of 1
usbd_sync_transfer---of 1
usbd_sync_transfer_sig---of 1
usbd_transfer43%of 61
usbd_xfer_abort---of 7
usbd_xfer_cancel_timeout_async---of 16
usbd_xfer_probe_timeout---of 28
usbd_xfer_schedule_timeout---of 14
usbd_xfer_timeout---of 3
usbd_xfer_timeout_task---of 8
usbd_xfer_trycomplete---of 6
x86_curlwp100%of 1
-----------
SUMMARY46%of 234
kernfs_access67%of 3
kernfs_addentry---of 8
kernfs_alloctype---of 15
kernfs_close34%of 6
kernfs_default_fileop_getattr---of 1
kernfs_default_xread---of 5
kernfs_default_xwrite---of 10
kernfs_getattr14%of 23
kernfs_getpages---of 3
kernfs_inactive100%of 1
kernfs_ioctl34%of 6
kernfs_lookup38%of 24
kernfs_open34%of 6
kernfs_pathconf67%of 9
kernfs_print---of 1
kernfs_read---of 10
kernfs_readdir22%of 70
kernfs_reclaim29%of 7
kernfs_setattr100%of 1
kernfs_try_fileop---of 6
kernfs_try_xread---of 6
kernfs_try_xwrite---of 6
kernfs_write---of 7
kernfs_xread---of 20
kfsfileoptree_SPLAY---of 20
kfsfileoptree_SPLAY_INSERT---of 7
kfsfileoptree_SPLAY_MINMAX---of 8
kfsfileoptree_SPLAY_REMOVE---of 6
-----------
SUMMARY29%of 156
ptrace_copyin_piod100%of 3
ptrace_copyin_siginfo67%of 3
ptrace_copyout_lwpstatus100%of 1
ptrace_copyout_piod---of 3
ptrace_copyout_siginfo67%of 3
ptrace_modcmd---of 4
sys_ptrace100%of 1
-----------
SUMMARY82%of 11
compat_uvm_swap_stats50 100%of 1
swapent50_cvt100%of 1
uvm_50_fini---of 1
uvm_50_init---of 1
-----------
SUMMARY100%of 2
in6_addmulti50%of 18
in6_delmulti---of 1
in6_delmulti_locked64%of 11
in6_joingroup50%of 4
in6_leavegroup67%of 3
in6_lookup_and_delete_multi---of 7
in6_lookup_multi---of 7
in6_mkludge_sysctl---of 3
in6_multi_group43%of 7
in6_multi_lock---of 1
in6_multi_locked50%of 4
in6_multi_unlock---of 1
in6_multicast_sysctl---of 34
in6_purge_multi---of 13
in6_sysctl_multicast_setup---of 1
in6m_destroy44%of 23
mld_init---of 1
mld_input---of 64
mld_sendpkt38%of 32
mld_start_listening63%of 8
mld_starttimer72%of 7
mld_timeo---of 6
x86_curlwp100%of 1
-----------
SUMMARY50%of 118
strlcat54%of 15
-----------
SUMMARY54%of 15
-----------
SUMMARY---of 0
do_setresgid59%of 48
do_setresuid73%of 51
sys___getlogin100%of 1
sys___setlogin---of 7
sys_getegid100%of 1
sys_geteuid100%of 1
sys_getgid---of 1
sys_getgid_with_egid100%of 1
sys_getgroups100%of 4
sys_getpgid100%of 4
sys_getpgrp100%of 1
sys_getpid---of 1
sys_getpid_with_ppid100%of 1
sys_getppid100%of 1
sys_getsid100%of 4
sys_getuid---of 1
sys_getuid_with_euid100%of 1
sys_issetugid---of 1
sys_setegid100%of 1
sys_seteuid100%of 1
sys_setgid100%of 1
sys_setgroups100%of 3
sys_setpgid100%of 4
sys_setregid100%of 5
sys_setreuid100%of 5
sys_setsid---of 1
sys_setuid100%of 1
-----------
SUMMARY76%of 140
clockctl_50_fini---of 3
clockctl_50_init---of 1
compat50_clockctlioctl80%of 15
-----------
SUMMARY80%of 15
ffs_indirtrunc42%of 60
ffs_itimes62%of 18
ffs_truncate42%of 190
ffs_update50%of 30
x86_curlwp---of 1
-----------
SUMMARY44%of 298
-----------
SUMMARY---of 0
ov_mount59%of 12
ov_unmount75%of 4
overlay_modcmd---of 4
overlay_sysctl_setup---of 1
x86_curlwp100%of 1
-----------
SUMMARY65%of 17
do_sys_wait100%of 3
do_sys_waitid42%of 110
exit1---of 100
exit_lwps---of 16
match_process40%of 25
proc_changeparent38%of 8
proc_reparent43%of 19
spc_lock---of 1
sys___wait450 100%of 8
sys_exit---of 3
sys_wait6---of 7
x86_curlwp100%of 1
-----------
SUMMARY46%of 174
ufs_bmap80%of 5
ufs_bmaparray39%of 88
ufs_getlbns74%of 15
ufs_issequential67%of 3
x86_curlwp---of 1
-----------
SUMMARY46%of 111
kthread_create44%of 23
kthread_exit---of 11
kthread_fpu_enter---of 9
kthread_fpu_exit---of 13
kthread_join---of 10
kthread_sysinit---of 1
x86_curlwp---of 1
-----------
SUMMARY44%of 23
dead_bmap---of 1
dead_default_error100%of 1
dead_getpages---of 3
dead_inactive---of 1
dead_ioctl---of 1
dead_link---of 1
dead_lookup100%of 1
dead_open---of 1
dead_poll100%of 1
dead_print---of 1
dead_putpages---of 1
dead_read---of 1
dead_remove---of 1
dead_rename---of 6
dead_rmdir---of 1
dead_strategy---of 1
dead_write100%of 1
-----------
SUMMARY100%of 4
umap_bypass45%of 81
umap_getattr40%of 10
umap_lookup44%of 30
umap_print---of 1
umap_rename30%of 10
-----------
SUMMARY43%of 131
sys_ktrace80%of 10
x86_curlwp100%of 1
-----------
SUMMARY82%of 11
min_check---of 5
uvmpdpol_anfree100%of 1
uvmpdpol_balancequeue---of 35
uvmpdpol_estimatepageable---of 5
uvmpdpol_idle---of 12
uvmpdpol_init---of 1
uvmpdpol_init_cpu---of 1
uvmpdpol_needsscan_p---of 1
uvmpdpol_pageactivate58%of 7
uvmpdpol_pageactivate_locked56%of 9
uvmpdpol_pageactivate_p67%of 3
uvmpdpol_pagedeactivate58%of 7
uvmpdpol_pagedeactivate_locked39%of 21
uvmpdpol_pagedequeue58%of 7
uvmpdpol_pagedequeue_locked42%of 24
uvmpdpol_pageenqueue58%of 7
uvmpdpol_pageisqueued_p67%of 3
uvmpdpol_pagerealize78%of 9
uvmpdpol_pagerealize_locked70%of 10
uvmpdpol_reinit---of 1
uvmpdpol_scanfini---of 7
uvmpdpol_scaninit---of 8
uvmpdpol_selectvictim---of 31
uvmpdpol_sysctlsetup---of 1
uvmpdpol_tune---of 3
-----------
SUMMARY54%of 108
strncpy64%of 11
-----------
SUMMARY64%of 11
in_cksum67%of 3
-----------
SUMMARY67%of 3
dogetrandom28%of 22
sys_getrandom---of 3
x86_curlwp---of 1
-----------
SUMMARY28%of 22
do_lwp_create---of 4
lwp_park59%of 12
lwp_unpark80%of 10
mi_startlwp---of 4
sys____lwp_park60 40%of 10
sys__lwp_continue100%of 3
sys__lwp_create50%of 8
sys__lwp_ctl75%of 4
sys__lwp_detach56%of 9
sys__lwp_exit100%of 1
sys__lwp_getname100%of 6
sys__lwp_getprivate100%of 1
sys__lwp_kill80%of 5
sys__lwp_self100%of 1
sys__lwp_setname89%of 9
sys__lwp_setprivate---of 1
sys__lwp_suspend59%of 12
sys__lwp_unpark100%of 1
sys__lwp_unpark_all78%of 9
sys__lwp_wait100%of 4
sys__lwp_wakeup80%of 5
x86_curlwp100%of 1
-----------
SUMMARY72%of 111
-----------
SUMMARY---of 0
raw_attach67%of 9
raw_detach50%of 10
raw_disconnect67%of 3
-----------
SUMMARY60%of 22
bdev_cancel16%of 13
bdev_close59%of 12
bdev_detached---of 8
bdev_discard---of 12
bdev_dump---of 5
bdev_flags---of 5
bdev_ioctl59%of 12
bdev_open52%of 29
bdev_size58%of 14
bdev_strategy58%of 14
bdev_type60%of 5
bdevsw_getname50%of 6
bdevsw_lookup75%of 4
bdevsw_lookup_major---of 7
cdev_cancel16%of 13
cdev_close59%of 12
cdev_detached---of 8
cdev_discard---of 12
cdev_flags---of 5
cdev_ioctl59%of 12
cdev_kqfilter---of 12
cdev_mmap59%of 12
cdev_open59%of 29
cdev_poll59%of 12
cdev_read59%of 12
cdev_stop42%of 12
cdev_tty---of 6
cdev_type80%of 5
cdev_write59%of 12
cdevsw_getname67%of 6
cdevsw_lookup75%of 4
cdevsw_lookup_major58%of 7
dev_minor_unit100%of 1
devsw_attach---of 87
devsw_blk2chr34%of 9
devsw_blk2name---of 9
devsw_chr2blk45%of 9
devsw_detach---of 1
devsw_detach_locked---of 37
devsw_init---of 5
devsw_name2blk---of 14
devsw_name2chr---of 14
nommap---of 1
-----------
SUMMARY53%of 276
hashdone67%of 3
hashinit20%of 31
hashstat_register---of 3
hashstat_sysctl---of 19
sysctl_hash_setup---of 1
-----------
SUMMARY24%of 34
iostat_alloc67%of 3
iostat_busy69%of 19
iostat_find---of 7
iostat_free---of 8
iostat_init---of 1
iostat_isbusy---of 1
iostat_rename---of 1
iostat_seek---of 1
iostat_unbusy---of 14
iostat_wait80%of 10
iostati_getnames---of 15
sysctl_hw_disknames---of 1
sysctl_hw_iostatnames---of 1
sysctl_hw_iostats---of 9
-----------
SUMMARY72%of 32
filt_genfsdetach---of 1
filt_genfsread---of 6
filt_genfsvnode---of 10
filt_genfswrite---of 6
genfs_abortop100%of 1
genfs_access67%of 3
genfs_accessx50%of 4
genfs_can_access73%of 11
genfs_can_access_acl_nfs4---of 41
genfs_can_access_acl_posix1e---of 44
genfs_can_chflags100%of 3
genfs_can_chmod89%of 9
genfs_can_chown63%of 8
genfs_can_chtimes100%of 5
genfs_can_extattr---of 5
genfs_can_sticky75%of 4
genfs_deadlock19%of 11
genfs_deadunlock100%of 1
genfs_ebadf---of 1
genfs_einval100%of 1
genfs_enoioctl100%of 1
genfs_eopnotsupp39%of 18
genfs_erofs_link---of 1
genfs_fcntl100%of 1
genfs_islocked100%of 3
genfs_kqfilter---of 5
genfs_lock70%of 10
genfs_mmap100%of 1
genfs_node_destroy100%of 1
genfs_node_init100%of 1
genfs_node_rdlock100%of 1
genfs_node_rdtrylock100%of 1
genfs_node_unlock100%of 1
genfs_node_wrlock100%of 1
genfs_node_wrlocked100%of 1
genfs_null_putpages67%of 3
genfs_nullop100%of 1
genfs_parsepath100%of 4
genfs_pathconf100%of 5
genfs_poll100%of 1
genfs_revoke67%of 3
genfs_seek100%of 1
genfs_size---of 1
genfs_unlock100%of 1
-----------
SUMMARY71%of 121
-----------
SUMMARY---of 0
uhub_attach---of 55
uhub_childdet---of 14
uhub_detach---of 16
uhub_explore3%of 91
uhub_intr40%of 10
uhub_match---of 1
uhub_rescan---of 3
-----------
SUMMARY6%of 101
compat_30_sys___fhstat30 67%of 3
compat_30_sys___fstat13 100%of 3
compat_30_sys___lstat13 67%of 3
compat_30_sys___stat13 100%of 3
compat_30_sys_fhopen100%of 1
compat_30_sys_fhstat67%of 3
compat_30_sys_fhstatvfs1 67%of 3
compat_30_sys_getdents62%of 18
compat_30_sys_getfh100%of 6
vfs_syscalls_30_fini---of 1
vfs_syscalls_30_init---of 1
-----------
SUMMARY75%of 43
scdebug_init---of 1
sys___syscall50%of 8
sys_syscall---of 8
-----------
SUMMARY50%of 8
ffs_bufrd43%of 47
ffs_bufwr42%of 39
ffs_fsync6%of 34
ffs_full_fsync20%of 30
ffs_gop_size100%of 4
ffs_read52%of 25
ffs_reclaim39%of 13
ffs_spec_fsync23%of 9
ffs_write67%of 57
ufs_post_write_update62%of 18
x86_curlwp---of 1
-----------
SUMMARY43%of 276
acct_chkfree43%of 7
acct_init---of 1
acct_process---of 28
acct_stop67%of 9
acctwatch---of 10
sys_acct74%of 15
-----------
SUMMARY65%of 31
-----------
SUMMARY---of 0
change_keepalive50%of 6
sysctl_inpcblist---of 27
sysctl_net_inet_ip_ports---of 24
sysctl_net_inet_tcp_ident---of 66
sysctl_net_inet_tcp_mssdflt---of 4
sysctl_net_inet_tcp_setup2---of 1
sysctl_net_inet_tcp_stats---of 1
sysctl_tcp_congctl---of 4
sysctl_tcp_init_win---of 4
sysctl_tcp_keep---of 5
sysctl_update_tcpcb_template---of 4
tcp_abort_wrapper---of 3
tcp_accept_wrapper---of 5
tcp_attach_wrapper45%of 18
tcp_bind_wrapper67%of 9
tcp_connect2_wrapper67%of 3
tcp_connect_wrapper60%of 25
tcp_ctloutput40%of 45
tcp_detach_wrapper67%of 3
tcp_disconnect1 29%of 7
tcp_disconnect_wrapper---of 3
tcp_ioctl_wrapper75%of 4
tcp_listen_wrapper70%of 10
tcp_peeraddr_wrapper---of 5
tcp_purgeif_wrapper---of 4
tcp_rcvd_wrapper50%of 4
tcp_recvoob_wrapper20%of 10
tcp_send_wrapper60%of 5
tcp_sendoob_wrapper45%of 9
tcp_shutdown_wrapper100%of 4
tcp_sockaddr_wrapper100%of 6
tcp_stat_wrapper67%of 3
tcp_usrclosed38%of 24
tcp_usrreq_init---of 1
x86_curlwp---of 1
-----------
SUMMARY51%of 195
_fini_once---of 8
_init_once46%of 11
once_init---of 1
-----------
SUMMARY46%of 11
-----------
SUMMARY---of 0
prop_dictionary_get_bool---of 3
prop_dictionary_get_cstring---of 5
prop_dictionary_get_cstring_nocopy---of 4
prop_dictionary_get_data---of 5
prop_dictionary_get_dict---of 3
prop_dictionary_get_int---of 1
prop_dictionary_get_int16---of 1
prop_dictionary_get_int32---of 1
prop_dictionary_get_int64---of 1
prop_dictionary_get_int8---of 1
prop_dictionary_get_intptr---of 1
prop_dictionary_get_long---of 1
prop_dictionary_get_longlong---of 1
prop_dictionary_get_schar---of 1
prop_dictionary_get_short---of 1
prop_dictionary_get_string50%of 4
prop_dictionary_get_uchar---of 1
prop_dictionary_get_uint---of 1
prop_dictionary_get_uint16---of 1
prop_dictionary_get_uint32---of 1
prop_dictionary_get_uint64---of 1
prop_dictionary_get_uint8---of 1
prop_dictionary_get_uintptr---of 1
prop_dictionary_get_ulong---of 1
prop_dictionary_get_ulonglong---of 1
prop_dictionary_get_ushort---of 1
prop_dictionary_set_and_rel---of 3
prop_dictionary_set_bool---of 3
prop_dictionary_set_cstring---of 3
prop_dictionary_set_cstring_nocopy---of 3
prop_dictionary_set_data---of 3
prop_dictionary_set_data_nocopy---of 3
prop_dictionary_set_int---of 3
prop_dictionary_set_int16---of 3
prop_dictionary_set_int32---of 3
prop_dictionary_set_int64---of 3
prop_dictionary_set_int8---of 3
prop_dictionary_set_intptr---of 3
prop_dictionary_set_long---of 3
prop_dictionary_set_longlong---of 3
prop_dictionary_set_schar---of 3
prop_dictionary_set_short---of 3
prop_dictionary_set_string67%of 3
prop_dictionary_set_string_nocopy67%of 3
prop_dictionary_set_uchar---of 3
prop_dictionary_set_uint---of 3
prop_dictionary_set_uint16 67%of 3
prop_dictionary_set_uint32---of 3
prop_dictionary_set_uint64---of 3
prop_dictionary_set_uint8---of 3
prop_dictionary_set_uintptr---of 3
prop_dictionary_set_ulong---of 3
prop_dictionary_set_ulonglong---of 3
prop_dictionary_set_ushort---of 3
-----------
SUMMARY62%of 13
_prop_string_equals---of 4
_prop_string_externalize---of 5
_prop_string_free56%of 9
_prop_string_init---of 1
_prop_string_instantiate45%of 9
_prop_string_internalize---of 10
_prop_string_rb_compare_key---of 3
_prop_string_rb_compare_nodes60%of 5
prop_string_append---of 8
prop_string_append_cstring---of 9
prop_string_compare---of 5
prop_string_compare_string---of 4
prop_string_copy---of 9
prop_string_copy_mutable---of 8
prop_string_copy_value---of 5
prop_string_create---of 3
prop_string_create_copy100%of 1
prop_string_create_cstring---of 6
prop_string_create_cstring_nocopy---of 3
prop_string_create_format50%of 6
prop_string_create_nocopy67%of 3
prop_string_cstring---of 5
prop_string_cstring_nocopy---of 4
prop_string_equals---of 5
prop_string_equals_cstring---of 4
prop_string_equals_string---of 4
prop_string_mutable---of 4
prop_string_size---of 4
prop_string_value---of 5
-----------
SUMMARY55%of 33
ccd_60_fini---of 3
ccd_60_init---of 1
compat_60_ccdioctl100%of 1
-----------
SUMMARY100%of 1
if_sl_modcmd---of 7
sl_clone_create---of 4
sl_clone_destroy---of 7
slattach---of 1
slclose---of 14
slinput---of 27
slintr---of 66
slioctl---of 14
slopen17%of 12
sloutput---of 22
slstart---of 5
sltioctl---of 3
x86_curlwp100%of 1
-----------
SUMMARY24%of 13
copy_procargs---of 25
copy_procargs_sysctl_cb---of 1
copyin_psstrings---of 4
fill_eproc---of 38
fill_kproc2---of 54
fixjobc49%of 33
get_expose_address---of 1
pg_delete31%of 13
pg_remove43%of 14
pgid_in_session40%of 10
pgrp_find84%of 6
pidtbl_dump---of 15
proc0_init---of 16
proc_alloc50%of 6
proc_alloc_lwpid54%of 13
proc_alloc_pid58%of 7
proc_alloc_pid_slot16%of 26
proc_crmod_enter60%of 5
proc_crmod_leave73%of 18
proc_ctor100%of 1
proc_enterpgrp43%of 71
proc_find77%of 13
proc_find_locked---of 7
proc_find_lwp64%of 11
proc_find_lwp_acquire_proc---of 8
proc_find_lwp_unlocked60%of 10
proc_find_lwpid---of 12
proc_find_raw34%of 6
proc_finispecific---of 1
proc_free_lwpid30%of 10
proc_free_mem67%of 3
proc_free_pid67%of 3
proc_free_pid_internal56%of 9
proc_getauxv---of 9
proc_getspecific---of 1
proc_initspecific---of 3
proc_leavepgrp40%of 10
proc_listener_cb38%of 16
proc_sesshold---of 3
proc_sessrele50%of 8
proc_setspecific---of 1
proc_specific_key_create---of 1
proc_specific_key_delete---of 1
proc_uidmatch---of 5
proc_vmspace_getref50%of 6
procinit---of 5
procinit_sysctl---of 1
proclist_foreach_call48%of 17
sysctl_doeproc---of 90
sysctl_kern_proc_args---of 40
sysctl_security_expose_address---of 5
x86_curlwp100%of 1
-----------
SUMMARY48%of 346
npf_table_check---of 15
npf_table_create---of 10
npf_table_destroy---of 21
npf_table_flush---of 6
npf_table_gc---of 13
npf_table_getid---of 1
npf_table_getsome---of 7
npf_table_insert---of 29
npf_table_list---of 26
npf_table_lookup---of 14
npf_table_remove---of 20
npf_tableset_create---of 1
npf_tableset_destroy---of 7
npf_tableset_export---of 8
npf_tableset_getbyid---of 4
npf_tableset_getbyname29%of 7
npf_tableset_insert---of 5
npf_tableset_reload---of 14
npf_tableset_swap---of 5
npf_tableset_sysfini---of 1
npf_tableset_sysinit---of 1
table_ipset_flush---of 9
table_tree_flush---of 9
-----------
SUMMARY29%of 7
sysctl_basenode_init---of 1
sysctl_hw_machine_arch---of 1
sysctl_hwbase_setup---of 1
sysctl_kernbase_setup---of 1
sysctl_setlen60%of 5
-----------
SUMMARY60%of 5
compat_50_iflist---of 3
compat_50_route_abort---of 3
compat_50_route_accept---of 3
compat_50_route_attach_wrapper60%of 10
compat_50_route_bind_wrapper---of 3
compat_50_route_connect2_wrapper67%of 3
compat_50_route_connect_wrapper---of 3
compat_50_route_detach_wrapper55%of 11
compat_50_route_disconnect_wrapper60%of 5
compat_50_route_enqueue---of 4
compat_50_route_filter18%of 28
compat_50_route_init---of 1
compat_50_route_intr---of 6
compat_50_route_ioctl_wrapper---of 1
compat_50_route_listen_wrapper---of 3
compat_50_route_output13%of 132
compat_50_route_peeraddr_wrapper56%of 9
compat_50_route_rcvd_wrapper---of 3
compat_50_route_recvoob_wrapper67%of 3
compat_50_route_send_wrapper60%of 5
compat_50_route_sendoob_wrapper---of 3
compat_50_route_shutdown_wrapper67%of 3
compat_50_route_sockaddr_wrapper---of 9
compat_50_route_stat_wrapper67%of 3
compat_50_rt_addrmsg---of 1
compat_50_rt_addrmsg0---of 41
compat_50_rt_addrmsg_rt---of 1
compat_50_rt_addrmsg_src---of 1
compat_50_rt_ieee80211msg---of 15
compat_50_rt_ifannouncemsg---of 6
compat_50_rt_ifmsg---of 9
compat_50_rt_missmsg---of 7
compat_50_rt_msg1---of 28
compat_50_rt_oifmsg50%of 4
route_ctloutput---of 24
route_output_report---of 10
rt_msg2---of 26
rt_pr_init---of 1
rtsock_50_fini---of 19
rtsock_50_init---of 1
x86_curlwp100%of 1
-----------
SUMMARY26%of 217
ufs_accessx53%of 21
ufs_advlock100%of 1
ufs_bufio65%of 20
ufs_close100%of 3
ufs_create50%of 4
ufs_do_nfs4_acl_inheritance---of 3
ufs_getattr60%of 10
ufs_gop_alloc46%of 11
ufs_gop_markupdate60%of 5
ufs_link32%of 25
ufs_makeinode25%of 44
ufs_mkdir20%of 62
ufs_mknod50%of 6
ufs_open75%of 4
ufs_pathconf78%of 18
ufs_print---of 3
ufs_readdir54%of 28
ufs_readlink45%of 9
ufs_remove47%of 15
ufs_rmdir55%of 22
ufs_setattr49%of 120
ufs_strategy37%of 22
ufs_symlink53%of 17
ufs_vinit64%of 11
ufs_whiteout34%of 15
ufsfifo_close100%of 3
ufsfifo_read100%of 1
ufsfifo_write100%of 1
ufsspec_close100%of 3
ufsspec_read67%of 3
ufsspec_write67%of 3
x86_curlwp100%of 1
-----------
SUMMARY46%of 508
-----------
SUMMARY---of 0
hci_abort_wrapper---of 3
hci_accept_wrapper---of 3
hci_attach_wrapper63%of 16
hci_bind_wrapper---of 16
hci_connect2_wrapper67%of 3
hci_connect_wrapper---of 11
hci_ctloutput15%of 14
hci_detach34%of 18
hci_detach_wrapper100%of 1
hci_device_cb10%of 22
hci_disconnect_wrapper---of 5
hci_drop---of 7
hci_init---of 3
hci_ioctl_wrapper100%of 1
hci_listen_wrapper---of 3
hci_mtap---of 41
hci_peeraddr_wrapper---of 7
hci_purgeif_wrapper---of 1
hci_rcvd_wrapper---of 3
hci_recvoob_wrapper67%of 3
hci_send_wrapper---of 25
hci_sendoob_wrapper---of 3
hci_shutdown_wrapper---of 3
hci_sockaddr_wrapper58%of 7
hci_stat_wrapper67%of 3
x86_curlwp100%of 1
-----------
SUMMARY38%of 89
rn_addmask52%of 27
rn_addroute21%of 88
rn_delayedinit---of 3
rn_delete100%of 1
rn_delete1 29%of 50
rn_init---of 21
rn_inithead---of 4
rn_inithead0---of 1
rn_insert84%of 12
rn_lookup38%of 8
rn_match66%of 49
rn_newpair---of 1
rn_refines---of 11
rn_search---of 4
rn_search_m---of 7
rn_search_matched---of 18
rn_walktree---of 18
-----------
SUMMARY40%of 235
-----------
SUMMARY---of 0
rfcomm_abort_wrapper---of 8
rfcomm_accept_wrapper---of 7
rfcomm_attach_wrapper48%of 21
rfcomm_bind_wrapper---of 9
rfcomm_complete---of 7
rfcomm_connect2_wrapper---of 3
rfcomm_connect_wrapper---of 9
rfcomm_connected---of 3
rfcomm_connecting---of 3
rfcomm_ctloutput50%of 6
rfcomm_detach_wrapper60%of 5
rfcomm_disconnect_wrapper---of 5
rfcomm_disconnected---of 3
rfcomm_input---of 15
rfcomm_ioctl_wrapper100%of 1
rfcomm_linkmode---of 5
rfcomm_listen_wrapper---of 5
rfcomm_newconn---of 3
rfcomm_peeraddr_wrapper---of 7
rfcomm_purgeif_wrapper---of 1
rfcomm_rcvd_wrapper---of 9
rfcomm_recvoob_wrapper67%of 3
rfcomm_send_wrapper---of 10
rfcomm_sendoob_wrapper---of 3
rfcomm_shutdown_wrapper---of 3
rfcomm_sockaddr_wrapper---of 7
rfcomm_stat_wrapper---of 3
-----------
SUMMARY53%of 36
kcpuset_atomic_clear67%of 3
kcpuset_atomic_set67%of 3
kcpuset_atomicly_intersect---of 5
kcpuset_atomicly_merge---of 5
kcpuset_atomicly_remove---of 5
kcpuset_clear---of 6
kcpuset_clone---of 1
kcpuset_copy43%of 7
kcpuset_copyin---of 9
kcpuset_copyout---of 9
kcpuset_countset100%of 3
kcpuset_create34%of 6
kcpuset_destroy58%of 7
kcpuset_export_u32---of 3
kcpuset_ffs---of 4
kcpuset_ffs_intersecting---of 4
kcpuset_fill---of 7
kcpuset_intersect---of 7
kcpuset_intersecting_p---of 5
kcpuset_isotherset---of 6
kcpuset_isset46%of 11
kcpuset_iszero---of 5
kcpuset_match100%of 1
kcpuset_merge58%of 7
kcpuset_remove---of 6
kcpuset_set---of 6
kcpuset_sysinit---of 9
kcpuset_unuse---of 13
kcpuset_use---of 3
kcpuset_zero43%of 7
-----------
SUMMARY53%of 55
null_modcmd---of 4
nullfs_mount70%of 13
nullfs_sysctl_setup---of 1
nullfs_unmount50%of 4
x86_curlwp100%of 1
-----------
SUMMARY67%of 18
wsmux_attach_sc---of 18
wsmux_create---of 1
wsmux_detach_sc---of 12
wsmux_do_displayioctl---of 9
wsmux_do_ioctl28%of 51
wsmux_evsrc_set_display---of 7
wsmux_getmux60%of 5
wsmux_mux_close---of 6
wsmux_mux_open---of 9
wsmux_set_display---of 12
wsmuxattach---of 1
wsmuxclose29%of 7
wsmuxioctl100%of 1
wsmuxkqfilter---of 4
wsmuxopen39%of 13
wsmuxpoll75%of 4
wsmuxread50%of 4
-----------
SUMMARY36%of 85
inittodr---of 1
resettodr50%of 4
todr_attach---of 5
todr_init---of 1
todr_lock---of 1
todr_lock_owned---of 1
todr_save_systime30%of 20
todr_set_systime---of 40
todr_unlock---of 1
-----------
SUMMARY34%of 24
uvm_fault_internal50%of 313
uvm_fault_lower_enter49%of 47
uvm_fault_lower_upgrade32%of 16
uvm_fault_unwire100%of 1
uvm_fault_unwire_locked49%of 27
uvm_fault_upper_enter39%of 31
uvm_fault_upper_upgrade32%of 16
uvm_fault_wire86%of 7
uvmfault_anonget7%of 74
uvmfault_promote43%of 42
uvmfault_update_stats67%of 6
x86_curlwp100%of 1
-----------
SUMMARY43%of 581
igmp_fasttimo---of 9
igmp_init---of 1
igmp_input---of 91
igmp_joingroup47%of 15
igmp_leavegroup38%of 8
igmp_purgeif---of 11
igmp_sendpkt50%of 10
igmp_slowtimo---of 7
sysctl_net_inet_igmp_stats---of 1
-----------
SUMMARY46%of 33
_bus_dma_alloc_bouncebuf---of 6
_bus_dmamap_load_busaddr54%of 15
_bus_dmamem_alloc---of 19
_bus_dmamem_free---of 8
_bus_dmamem_map---of 9
_bus_dmamem_unmap---of 10
bus_dma_tag_create---of 22
bus_dma_tag_destroy---of 9
bus_dmamap_create---of 21
bus_dmamap_destroy---of 8
bus_dmamap_load27%of 23
bus_dmamap_load_mbuf18%of 39
bus_dmamap_load_raw---of 14
bus_dmamap_load_uio---of 23
bus_dmamap_sync21%of 49
bus_dmamap_unload---of 7
bus_dmamem_alloc---of 5
bus_dmamem_free---of 5
bus_dmamem_map---of 5
bus_dmamem_mmap---of 15
bus_dmamem_unmap---of 5
bus_dmatag_destroy---of 8
bus_dmatag_subregion---of 17
-----------
SUMMARY25%of 126
phtree_SPLAY100%of 12
phtree_SPLAY_INSERT---of 5
phtree_SPLAY_MINMAX---of 8
phtree_SPLAY_REMOVE---of 5
pool_cache_bootstrap---of 45
pool_cache_bootstrap_destroy---of 21
pool_cache_cpu_init---of 9
pool_cache_destroy---of 1
pool_cache_destruct_object---of 3
pool_cache_get_paddr36%of 28
pool_cache_get_slow26%of 27
pool_cache_init---of 3
pool_cache_invalidate---of 14
pool_cache_invalidate_groups---of 12
pool_cache_nget---of 1
pool_cache_nput---of 1
pool_cache_prime---of 6
pool_cache_put_paddr60%of 10
pool_cache_reclaim---of 1
pool_cache_set_drain_hook---of 3
pool_cache_sethardlimit---of 1
pool_cache_sethiwat---of 1
pool_cache_setlowat---of 10
pool_cache_transfer---of 9
pool_chk---of 13
pool_chk_page---of 15
pool_destroy---of 31
pool_drain---of 10
pool_get47%of 94
pool_grow55%of 62
pool_init---of 55
pool_nget---of 1
pool_nput---of 1
pool_page_alloc100%of 1
pool_page_alloc_meta100%of 1
pool_page_free---of 1
pool_page_free_meta---of 1
pool_pcg_put---of 8
pool_pcg_trunc---of 8
pool_prime---of 6
pool_printall---of 4
pool_printit---of 60
pool_put50%of 63
pool_reclaim---of 31
pool_set_drain_hook---of 3
pool_sethardlimit---of 1
pool_sethiwat---of 1
pool_setlowat---of 10
pool_subsystem_init---of 3
pool_sysctl---of 21
pool_totalpages---of 6
pool_totalpages_locked---of 6
pool_whatis---of 78
pr_pagelist_free14%of 15
pr_rmpage---of 26
sysctl_pool_setup---of 1
-----------
SUMMARY48%of 313
pserialize_create---of 1
pserialize_destroy---of 1
pserialize_in_read_section67%of 3
pserialize_init---of 1
pserialize_not_in_read_section100%of 1
pserialize_perform50%of 8
pserialize_read_enter100%of 1
pserialize_read_exit50%of 6
-----------
SUMMARY58%of 19
compat_100___kevent50_fetch_changes---of 6
compat_100___kevent50_put_events---of 5
compat_50_kevent_fetch_timeout---of 5
compat_50_sys_kevent---of 1
compat_50_sys_pollts---of 7
compat_50_sys_pselect---of 7
compat_50_sys_select100%of 5
kern_select_50_fini---of 1
kern_select_50_init---of 1
-----------
SUMMARY100%of 5
wsmouse_activate---of 3
wsmouse_add_mux---of 5
wsmouse_attach---of 6
wsmouse_detach---of 10
wsmouse_handle_params10%of 30
wsmouse_input---of 44
wsmouse_match---of 1
wsmouse_mux_close---of 1
wsmouse_mux_open40%of 5
wsmouse_precision_scroll---of 6
wsmouse_repeat---of 11
wsmouseclose---of 3
wsmousedevprint---of 3
wsmousedoioctl34%of 24
wsmouseioctl---of 1
wsmousekqfilter---of 3
wsmouseopen---of 10
wsmousepoll---of 3
wsmouseread---of 5
-----------
SUMMARY23%of 59
wd_discard---of 17
wd_diskstart---of 8
wd_dumpblocks---of 14
wd_firstopen---of 6
wd_flushcache---of 11
wd_get_params---of 7
wd_iosize---of 3
wd_lastclose---of 5
wd_shutdown---of 8
wd_suspend---of 7
wdattach---of 51
wdbiorequeue---of 6
wdbioretry---of 6
wdclose---of 1
wddebug---of 9
wddetach---of 11
wddiscard---of 1
wddone---of 31
wddump---of 4
wdioctl---of 44
wdioctlstrategy---of 47
wdminphys---of 3
wdopen---of 9
wdperror---of 18
wdprobe---of 6
wdread---of 3
wdrestart---of 5
wdsize50%of 6
wdstart1---of 40
wdstrategy---of 6
wdwrite---of 3
-----------
SUMMARY50%of 6
mm_init---of 5
mm_ioctl50%of 10
mm_mmap67%of 3
mm_open100%of 1
mm_readwrite27%of 23
-----------
SUMMARY38%of 37
random_close100%of 1
random_ioctl67%of 3
random_kqfilter---of 6
random_open100%of 1
random_poll50%of 4
random_read50%of 4
random_write42%of 12
rndattach---of 1
x86_curlwp100%of 1
-----------
SUMMARY54%of 26
msdos_modcmd---of 4
msdosfs_fhtovp---of 6
msdosfs_mount34%of 30
msdosfs_mountfs9%of 59
msdosfs_mountroot---of 6
msdosfs_root---of 4
msdosfs_start---of 1
msdosfs_statvfs---of 1
msdosfs_sync---of 9
msdosfs_sync_selector---of 9
msdosfs_sysctl_setup---of 1
msdosfs_unmount---of 4
msdosfs_vget---of 1
msdosfs_vptofh---of 4
update_mp---of 7
x86_curlwp100%of 1
-----------
SUMMARY18%of 90
addlog---of 8
addtstamp40%of 15
aprint_debug---of 7
aprint_debug_dev---of 11
aprint_debug_ifnet---of 9
aprint_error---of 1
aprint_error_dev---of 3
aprint_error_ifnet---of 3
aprint_error_internal---of 10
aprint_get_error_count---of 5
aprint_naive---of 7
aprint_naive_dev---of 11
aprint_naive_ifnet---of 9
aprint_normal---of 8
aprint_normal_dev---of 12
aprint_normal_ifnet---of 10
aprint_verbose---of 8
aprint_verbose_dev---of 12
aprint_verbose_ifnet---of 10
db_printf---of 3
db_vprintf---of 3
device_printf---of 6
klogpri---of 3
kprintf29%of 227
kprintf_init---of 5
kprintf_internal---of 1
kprintf_lock---of 3
kprintf_unlock---of 4
log50%of 10
logpri---of 8
printf50%of 6
printf_flags---of 6
printf_nolog---of 6
printf_nostamp---of 6
printf_tolog---of 6
putlogpri48%of 25
putone75%of 16
snprintf67%of 3
tablefull---of 3
tprintf---of 10
tprintf_close---of 3
tprintf_open---of 4
ttyprintf---of 1
twiddle---of 6
uprintf---of 4
uprintf_locked---of 4
vasprintf---of 3
vlog---of 10
vpanic---of 21
vprintf50%of 8
vprintf_flags---of 6
vsnprintf100%of 3
x86_curlwp---of 1
-----------
SUMMARY36%of 313
in6_print40%of 35
sin6_print---of 3
-----------
SUMMARY40%of 35
ptyfs__allocvp38%of 8
ptyfs__getmp67%of 6
ptyfs__getvattr100%of 1
ptyfs__makename---of 12
ptyfs_done---of 1
ptyfs_init---of 1
ptyfs_loadvnode56%of 9
ptyfs_modcmd---of 4
ptyfs_mount48%of 17
ptyfs_reinit---of 1
ptyfs_root50%of 4
ptyfs_start100%of 1
ptyfs_sync100%of 1
ptyfs_sysctl_setup---of 1
ptyfs_unmount40%of 15
ptyfs_vget---of 1
x86_curlwp100%of 1
-----------
SUMMARY51%of 63
default_bus_space_handle_is_equal---of 1
default_bus_space_is_equal---of 1
deveopnotsupp100%of 1
devnullop100%of 1
enodev100%of 1
enoioctl---of 1
enosys100%of 1
enxio---of 1
nullret---of 1
sys_sa_yield---of 1
ttyvenodev---of 1
-----------
SUMMARY100%of 4
Fhash---of 4
algo_bsd55%of 11
algo_doublehash---of 17
algo_hash---of 15
algo_randinc---of 13
algo_random_pick---of 11
algo_random_start---of 11
check_suitable_port27%of 19
portalgo_algo_index_select20%of 10
portalgo_randport29%of 21
sysctl_portalgo_available---of 1
sysctl_portalgo_reserve4---of 1
sysctl_portalgo_reserve6---of 1
sysctl_portalgo_selected---of 10
sysctl_portalgo_selected4---of 1
sysctl_portalgo_selected6---of 1
-----------
SUMMARY32%of 61
bounds_check_with_label---of 11
bounds_check_with_mediasize34%of 6
convertdisklabel---of 21
disk_attach100%of 1
disk_begindetach---of 5
disk_busy100%of 1
disk_destroy100%of 1
disk_detach---of 3
disk_find---of 4
disk_init100%of 1
disk_ioctl17%of 24
disk_isbusy---of 1
disk_read_sectors---of 4
disk_rename---of 1
disk_set_info---of 25
disk_unbusy---of 1
disk_wait100%of 1
diskerr---of 10
disklabel_dev_unit100%of 1
-----------
SUMMARY34%of 36
compat_20_sys_fhstatfs---of 7
compat_20_sys_fstatfs50%of 4
compat_20_sys_getfsstat100%of 1
compat_20_sys_statfs75%of 4
statvfs_to_statfs12_copy59%of 12
vfs_syscalls_20_fini---of 1
vfs_syscalls_20_init---of 1
-----------
SUMMARY62%of 21
ufs_balloc_range55%of 11
ufs_inactive47%of 28
ufs_reclaim50%of 10
ufs_truncate_all50%of 16
ufs_truncate_retry47%of 13
x86_curlwp100%of 1
-----------
SUMMARY50%of 79
chacha_core_sse2---of 4
chacha_stream_sse236%of 14
chacha_stream_xor_sse2---of 19
hchacha_sse2---of 4
xchacha_stream_sse2---of 4
xchacha_stream_xor_sse2---of 4
-----------
SUMMARY36%of 14
extattr_check_cred---of 3
extattr_delete_vp---of 5
extattr_get_vp---of 9
extattr_list_vp---of 7
extattr_set_vp---of 14
sys_extattr_delete_fd---of 4
sys_extattr_delete_file---of 4
sys_extattr_delete_link---of 4
sys_extattr_get_fd---of 4
sys_extattr_get_file---of 4
sys_extattr_get_link---of 4
sys_extattr_list_fd---of 3
sys_extattr_list_file---of 3
sys_extattr_list_link---of 3
sys_extattr_set_fd---of 4
sys_extattr_set_file---of 4
sys_extattr_set_link---of 4
sys_extattrctl---of 11
sys_fgetxattr---of 7
sys_flistxattr---of 6
sys_fremovexattr---of 7
sys_fsetxattr---of 7
sys_getxattr---of 7
sys_lgetxattr---of 7
sys_listxattr---of 6
sys_llistxattr---of 6
sys_lremovexattr---of 7
sys_lsetxattr---of 7
sys_removexattr---of 7
sys_setxattr---of 7
vfs_stdextattrctl67%of 3
x86_curlwp---of 1
-----------
SUMMARY67%of 3
sco_abort_wrapper---of 8
sco_accept_wrapper---of 7
sco_attach_wrapper65%of 14
sco_bind_wrapper---of 9
sco_complete---of 10
sco_connect2_wrapper67%of 3
sco_connect_wrapper---of 9
sco_connected---of 1
sco_connecting---of 1
sco_ctloutput34%of 6
sco_detach_wrapper60%of 5
sco_disconnect_wrapper---of 5
sco_disconnected---of 1
sco_input---of 13
sco_ioctl_wrapper100%of 1
sco_linkmode---of 1
sco_listen_wrapper60%of 5
sco_newconn---of 3
sco_peeraddr_wrapper---of 7
sco_purgeif_wrapper---of 1
sco_rcvd_wrapper---of 3
sco_recvoob_wrapper67%of 3
sco_send_wrapper---of 12
sco_sendoob_wrapper---of 3
sco_shutdown_wrapper---of 3
sco_sockaddr_wrapper58%of 7
sco_stat_wrapper67%of 3
-----------
SUMMARY60%of 47
sched_lwp_collect100%of 1
sched_lwp_fork100%of 1
sched_newts100%of 1
sched_nice73%of 11
sched_oncpu100%of 1
sched_proc_exit58%of 7
sched_proc_fork67%of 3
sched_pstats_hook---of 16
sched_rqinit---of 1
sched_schedclock---of 9
sched_setrunnable80%of 15
sched_slept100%of 1
sched_tick---of 11
sched_wakeup100%of 1
sysctl_sched_4bsd_setup---of 3
sysctl_sched_rtts---of 1
x86_curlwp100%of 1
-----------
SUMMARY77%of 43
tmpfs_fifo_close100%of 1
tmpfs_fifo_read100%of 1
tmpfs_fifo_write100%of 1
-----------
SUMMARY100%of 3
secmodel_suser_device_cb48%of 19
secmodel_suser_generic_cb---of 3
secmodel_suser_init---of 1
secmodel_suser_machdep_cb14%of 15
secmodel_suser_network_cb38%of 35
secmodel_suser_process_cb42%of 24
secmodel_suser_start---of 1
secmodel_suser_stop---of 1
secmodel_suser_system_cb53%of 36
secmodel_suser_vnode_cb100%of 3
suser_eval---of 3
suser_modcmd---of 7
sysctl_security_suser_setup---of 1
-----------
SUMMARY43%of 132
tmpfs_access85%of 20
tmpfs_advlock67%of 3
tmpfs_close67%of 3
tmpfs_create50%of 6
tmpfs_fsync67%of 3
tmpfs_getattr80%of 5
tmpfs_getpages55%of 24
tmpfs_inactive75%of 8
tmpfs_link45%of 29
tmpfs_lookup74%of 57
tmpfs_mkdir67%of 3
tmpfs_mknod80%of 5
tmpfs_open60%of 5
tmpfs_pathconf90%of 10
tmpfs_print---of 5
tmpfs_putpages63%of 8
tmpfs_read54%of 15
tmpfs_readdir58%of 33
tmpfs_readlink55%of 11
tmpfs_reclaim72%of 7
tmpfs_remove52%of 29
tmpfs_rmdir44%of 53
tmpfs_setattr61%of 28
tmpfs_symlink67%of 3
tmpfs_whiteout48%of 17
tmpfs_write46%of 22
x86_curlwp100%of 1
-----------
SUMMARY60%of 408
cd_bounce_buffer_done---of 11
cd_diskstart---of 12
cd_firstopen---of 24
cd_interpret_sense---of 17
cd_label---of 8
cd_lastclose---of 3
cd_mode_select---of 3
cd_mode_sense---of 3
cd_set_pa_immed---of 8
cd_setchan---of 6
cdattach---of 9
cdclose---of 1
cddetach---of 4
cddone---of 4
cddump---of 1
cdioctl---of 255
cdmatch---of 1
cdminphys---of 4
cdopen40%of 5
cdread---of 1
cdrestart---of 1
cdsize---of 1
cdstart---of 1
cdstrategy---of 17
cdwrite---of 1
do_cdioreadentries---of 13
mmc_gettrackinfo---of 24
read_cd_capacity---of 12
-----------
SUMMARY40%of 5
shm_delete_mapping75%of 12
shm_find_segment_perm_by_index---of 3
shmctl167%of 33
shmexit---of 13
shmfini---of 5
shmfork60%of 5
shminit---of 8
shmmap_getprivate80%of 10
sys___shmctl50100%of 6
sys_shmat69%of 22
sys_shmdt67%of 15
sys_shmget48%of 40
sysctl_ipc_shm_setup---of 1
sysctl_ipc_shmmax---of 4
sysctl_ipc_shmmaxpgs---of 4
sysctl_ipc_shmmni---of 24
-----------
SUMMARY65%of 143
m_add---of 4
m_adj50%of 18
m_align39%of 13
m_apply---of 17
m_cat---of 10
m_clget50%of 6
m_copy_internal43%of 33
m_copy_pkthdr50%of 8
m_copyback40%of 5
m_copyback_cow42%of 12
m_copyback_internal25%of 80
m_copydata59%of 12
m_copym100%of 1
m_copypacket55%of 22
m_copyup---of 16
m_defrag---of 45
m_devget---of 17
m_dup---of 1
m_ensure_contig44%of 16
m_ext_free62%of 18
m_free84%of 6
m_freem100%of 4
m_get50%of 6
m_get_n---of 6
m_getcl---of 7
m_gethdr67%of 3
m_gethdr_n---of 6
m_getptr---of 6
m_makewritable---of 12
m_move_pkthdr63%of 8
m_prepend40%of 10
m_print---of 34
m_pulldown39%of 54
m_pullup43%of 7
m_remove_pkthdr---of 3
m_split---of 1
m_split_internal---of 24
m_tag_copy---of 3
m_tag_copy_chain34%of 9
m_tag_delete---of 6
m_tag_delete_chain18%of 17
m_tag_find43%of 7
m_tag_free---of 1
m_tag_get---of 4
m_tag_prepend---of 3
m_tag_unlink---of 6
m_verify_packet---of 11
mb_ctor100%of 1
mb_drain---of 20
mbinit---of 7
mbstat_convert_to_user_cb---of 3
mbstat_type_add---of 1
sysctl_kern_mbuf---of 14
sysctl_kern_mbuf_stats---of 1
x86_curlwp---of 1
-----------
SUMMARY42%of 376
agp_acquire---of 3
agp_alloc_dmamem---of 6
agp_alloc_gatt---of 3
agp_alloc_memory---of 1
agp_bind_memory---of 1
agp_enable---of 1
agp_find_device---of 1
agp_free_dmamem---of 1
agp_free_gatt---of 1
agp_free_memory---of 1
agp_generic_alloc_memory---of 8
agp_generic_bind_memory---of 1
agp_generic_bind_memory_bounded---of 22
agp_generic_detach---of 1
agp_generic_enable---of 5
agp_generic_free_memory---of 8
agp_generic_unbind_memory---of 5
agp_get_info---of 1
agp_map_aperture---of 3
agp_memory_info---of 1
agp_release---of 4
agp_resume---of 1
agp_state---of 1
agp_unbind_memory---of 1
agpattach---of 25
agpclose---of 9
agpdev_match---of 4
agpioctl---of 28
agpmatch---of 13
agpmmap---of 4
agpopen40%of 5
-----------
SUMMARY40%of 5
usb_delay_ms---of 3
usb_delay_ms_locked---of 3
usb_disconnect_port---of 10
usb_findproduct---of 5
usb_findvendor---of 5
usb_free_device---of 23
usbd_attach_roothub---of 3
usbd_attachinterfaces---of 28
usbd_attachwholedevice---of 5
usbd_delay_ms---of 3
usbd_delay_ms_locked---of 3
usbd_devinfo---of 3
usbd_devinfo_alloc---of 1
usbd_devinfo_free---of 1
usbd_devinfo_vp---of 26
usbd_endpoint_acquire---of 4
usbd_endpoint_release67%of 3
usbd_errstr---of 3
usbd_fill_deviceinfo---of 31
usbd_fill_iface_data---of 37
usbd_find_edesc---of 14
usbd_find_idesc---of 7
usbd_free_iface_data---of 9
usbd_get_device_strings---of 16
usbd_iface_fini---of 13
usbd_iface_lock---of 7
usbd_iface_locked---of 1
usbd_iface_piperef---of 5
usbd_iface_pipeunref---of 7
usbd_iface_unlock---of 3
usbd_ifprint---of 3
usbd_kill_pipe---of 1
usbd_new_device25%of 44
usbd_print---of 4
usbd_printBCD---of 1
usbd_probe_and_attach---of 13
usbd_properties---of 14
usbd_reattach_device---of 19
usbd_reload_device_desc---of 5
usbd_remove_device56%of 9
usbd_reset_port30%of 10
usbd_set_config_index---of 64
usbd_set_config_no---of 7
usbd_setup_pipe---of 1
usbd_setup_pipe_flags22%of 14
-----------
SUMMARY30%of 80
virtio_pci_alloc_interrupts---of 48
virtio_pci_attach---of 60
virtio_pci_detach---of 10
virtio_pci_find_cap---of 10
virtio_pci_free_interrupts---of 9
virtio_pci_intr---of 8
virtio_pci_kick_09100%of 1
virtio_pci_kick_10---of 1
virtio_pci_match---of 7
virtio_pci_modcmd---of 1
virtio_pci_msix_config_intr---of 3
virtio_pci_msix_queue_intr---of 4
virtio_pci_negotiate_features_09---of 1
virtio_pci_negotiate_features_10---of 4
virtio_pci_read_queue_size_09---of 1
virtio_pci_read_queue_size_10---of 1
virtio_pci_rescan---of 3
virtio_pci_set_status_09---of 3
virtio_pci_set_status_10---of 3
virtio_pci_setup_interrupts_09---of 8
virtio_pci_setup_interrupts_10---of 8
virtio_pci_setup_queue_09---of 3
virtio_pci_setup_queue_10---of 7
-----------
SUMMARY100%of 1
exec_netbsd32_makecmds38%of 8
netbsd32_exec_aout_prep_nmagic---of 3
netbsd32_exec_aout_prep_omagic---of 3
netbsd32_exec_aout_prep_zmagic---of 4
-----------
SUMMARY38%of 8
udv_attach43%of 28
udv_detach19%of 11
udv_fault23%of 22
udv_init---of 1
udv_reference100%of 1
-----------
SUMMARY33%of 62
if43_20_fini---of 3
if43_20_init---of 1
if43_cvtcmd_20100%of 1
-----------
SUMMARY100%of 1
enforce_rlimit_fsize72%of 7
vn_advlock100%of 3
vn_bdev_open---of 4
vn_bdev_openpath---of 4
vn_close75%of 4
vn_closefile100%of 1
vn_extattr_get---of 5
vn_extattr_rm---of 7
vn_extattr_set---of 3
vn_fcntl100%of 1
vn_fifo_bypass100%of 1
vn_fpathconf---of 1
vn_ioctl78%of 22
vn_knote_attach---of 10
vn_knote_detach---of 21
vn_kqfilter---of 1
vn_lock54%of 28
vn_markexec50%of 4
vn_marktext---of 6
vn_mmap72%of 42
vn_open75%of 55
vn_openchk---of 9
vn_poll100%of 1
vn_posix_fadvise62%of 18
vn_rdwr50%of 12
vn_read72%of 7
vn_readdir50%of 16
vn_seek58%of 19
vn_stat75%of 4
vn_statfile100%of 1
vn_truncate50%of 6
vn_write70%of 10
vn_writechk100%of 1
x86_curlwp100%of 1
-----------
SUMMARY68%of 264
compat_ifdatareq75%of 8
uipc_syscalls_50_fini---of 3
uipc_syscalls_50_init---of 1
-----------
SUMMARY75%of 8
in_undefer_cksum---of 18
in_undefer_cksum_tcpudp50%of 12
ip_tso_output---of 6
tcp4_segment---of 48
-----------
SUMMARY50%of 12
_kernel_lock54%of 28
_kernel_lock_dump---of 1
_kernel_locked_p100%of 1
_kernel_unlock64%of 22
assert_sleepable40%of 10
kernel_lock_init---of 1
kernel_lock_trace_ipi---of 3
x86_curlwp100%of 1
-----------
SUMMARY57%of 62
check_sigcontext32---of 22
cpu_coredump32---of 6
cpu_exec_aout_makecmds100%of 1
cpu_getmcontext32---of 3
cpu_mcontext32_validate---of 22
cpu_mcontext32from64_validate---of 1
cpu_setmcontext32---of 12
netbsd32_buildcontext---of 5
netbsd32_machdep_md_fini---of 7
netbsd32_machdep_md_init---of 1
netbsd32_machine32---of 1
netbsd32_process_doxmmregs23%of 9
netbsd32_process_read_dbregs---of 1
netbsd32_process_read_fpregs---of 1
netbsd32_process_read_regs---of 1
netbsd32_process_write_dbregs---of 7
netbsd32_process_write_fpregs---of 1
netbsd32_process_write_regs---of 22
netbsd32_ptrace_translate_request---of 3
netbsd32_sendsig_siginfo---of 10
netbsd32_setregs---of 1
netbsd32_sysarch---of 39
netbsd32_vm_default_addr---of 1
startlwp32---of 17
x86_curlwp---of 1
-----------
SUMMARY30%of 10
kern_pset_destroy---of 17
psets_init---of 1
psets_listener_cb67%of 3
sys__pset_bind---of 27
sys_pset_assign---of 36
sys_pset_create---of 12
sys_pset_destroy---of 3
sysctl_pset_setup---of 3
sysctl_psets_list---of 8
sysctl_psets_max---of 8
x86_curlwp---of 1
-----------
SUMMARY67%of 3
hardupdate58%of 19
ntp_adjtime194%of 32
ntp_gettime---of 1
ntp_init---of 1
ntp_timestatus75%of 4
ntp_update_second22%of 23
sys___ntp_gettime50---of 7
sys_ntp_adjtime---of 9
sysctl_kern_ntptime---of 1
sysctl_kern_ntptime_setup---of 1
-----------
SUMMARY63%of 78
_prop_dict_init---of 1
_prop_dict_keysym_equals---of 1
_prop_dict_keysym_externalize---of 6
_prop_dict_keysym_free40%of 5
_prop_dict_keysym_rb_compare_key100%of 1
_prop_dict_keysym_rb_compare_nodes100%of 1
_prop_dictionary_alloc---of 6
_prop_dictionary_emergency_free---of 5
_prop_dictionary_equals---of 15
_prop_dictionary_equals_finish---of 1
_prop_dictionary_externalize---of 28
_prop_dictionary_free40%of 15
_prop_dictionary_get_keysym---of 15
_prop_dictionary_internalize---of 6
_prop_dictionary_internalize_body---of 15
_prop_dictionary_internalize_continue---of 6
_prop_dictionary_iterator_next_object---of 12
_prop_dictionary_iterator_reset---of 4
_prop_dictionary_lock67%of 3
_prop_dictionary_unlock100%of 1
prop_dictionary_all_keys---of 7
prop_dictionary_copy---of 7
prop_dictionary_copy_mutable---of 5
prop_dictionary_count---of 4
prop_dictionary_create67%of 3
prop_dictionary_create_with_capacity---of 1
prop_dictionary_ensure_capacity---of 7
prop_dictionary_equals---of 5
prop_dictionary_externalize---of 6
prop_dictionary_get28%of 11
prop_dictionary_get_keysym---of 1
prop_dictionary_internalize100%of 1
prop_dictionary_iterator---of 6
prop_dictionary_keysym_cstring_nocopy---of 3
prop_dictionary_keysym_equals---of 5
prop_dictionary_keysym_value---of 3
prop_dictionary_make_immutable---of 3
prop_dictionary_remove---of 15
prop_dictionary_remove_keysym---of 5
prop_dictionary_set43%of 38
prop_dictionary_set_keysym---of 5
-----------
SUMMARY45%of 79
accept_filt_add---of 9
accept_filt_clear20%of 15
accept_filt_del---of 7
accept_filt_get---of 7
accept_filt_getopt50%of 6
accept_filt_setopt23%of 40
accept_filter_init---of 3
accept_filter_init0---of 1
-----------
SUMMARY25%of 61
ras_fork50%of 6
ras_lookup75%of 12
ras_purgeall72%of 7
sys_rasctl78%of 22
x86_curlwp100%of 1
-----------
SUMMARY73%of 48
compat_30_sys_socket100%of 1
uipc_syscalls_30_fini---of 1
uipc_syscalls_30_init---of 1
-----------
SUMMARY100%of 1
compat_50_vndioctl50%of 4
vnd_50_fini---of 3
vnd_50_init---of 1
-----------
SUMMARY50%of 4
clock_gettime184%of 12
clock_timeleft---of 3
gettimeleft54%of 13
inittimeleft84%of 6
itimerfix100%of 5
itimespecfix---of 6
timespecaddok54%of 13
timespecsubok58%of 14
ts2timo85%of 20
tshzto100%of 1
tshztoup100%of 1
tstohz100%of 1
tvhzto---of 3
tvtohz46%of 11
x86_curlwp100%of 1
-----------
SUMMARY70%of 98
procfs_done---of 1
procfs_exechook_cb---of 15
procfs_hashrem34%of 6
procfs_init---of 1
procfs_listener_cb34%of 9
procfs_loadvnode9%of 58
procfs_modcmd---of 6
procfs_mount64%of 11
procfs_reinit---of 1
procfs_root50%of 4
procfs_start100%of 1
procfs_statvfs100%of 1
procfs_sync100%of 1
procfs_sysctl_setup---of 1
procfs_unmount100%of 3
procfs_vget---of 1
x86_curlwp100%of 1
-----------
SUMMARY28%of 95
handle_modctl_load82%of 11
sys_modctl60%of 44
x86_curlwp100%of 1
-----------
SUMMARY65%of 56
scsipi_adapter_addref---of 13
scsipi_adapter_delref---of 11
scsipi_adapter_enable---of 5
scsipi_adapter_ioctl---of 6
scsipi_adapter_minphys60%of 5
scsipi_adapter_request---of 9
scsipi_async_event---of 41
scsipi_channel_freeze---of 3
scsipi_channel_init---of 5
scsipi_channel_shutdown---of 4
scsipi_channel_thaw---of 4
scsipi_channel_timed_thaw---of 4
scsipi_complete---of 80
scsipi_completion_thread---of 27
scsipi_done---of 23
scsipi_enqueue25%of 29
scsipi_execute_xs30%of 34
scsipi_free_opcodeinfo---of 3
scsipi_get_opcodeinfo---of 14
scsipi_get_xs25%of 33
scsipi_init---of 3
scsipi_inquire---of 5
scsipi_insert_periph---of 4
scsipi_interpret_sense---of 65
scsipi_kill_pending---of 4
scsipi_lookup_periph---of 9
scsipi_lookup_periph_locked---of 9
scsipi_mode_select---of 1
scsipi_mode_select_big---of 1
scsipi_mode_sense---of 1
scsipi_mode_sense_big---of 1
scsipi_periph_freeze---of 1
scsipi_periph_freeze_locked---of 1
scsipi_periph_thaw---of 5
scsipi_periph_thaw_locked---of 5
scsipi_periph_timed_thaw---of 7
scsipi_prevent---of 3
scsipi_print_cdb---of 3
scsipi_put_xs---of 23
scsipi_remove_periph---of 6
scsipi_run_queue25%of 69
scsipi_set_xfer_mode---of 15
scsipi_start---of 1
scsipi_sync_factor_to_freq---of 3
scsipi_sync_factor_to_period---of 3
scsipi_sync_period_to_factor---of 7
scsipi_target_detach---of 19
scsipi_test_unit_ready---of 3
scsipi_thread_call_callback---of 5
scsipi_wait_drain---of 4
-----------
SUMMARY27%of 170
encap4_input---of 33
encap6_ctlinput14%of 22
encap6_input---of 5
encap6_lookup---of 31
encap_attach_addr---of 21
encap_attach_func---of 17
encap_detach---of 33
encap_init---of 3
encap_lock_enter---of 4
encap_lock_exit---of 3
encap_lock_held---of 1
encapinit---of 3
x86_curlwp---of 1
-----------
SUMMARY14%of 22
workqueue_create---of 13
workqueue_destroy---of 20
workqueue_enqueue42%of 12
workqueue_exit---of 5
workqueue_initqueue---of 8
workqueue_q_wait---of 13
workqueue_wait---of 12
workqueue_worker---of 22
x86_curlwp---of 1
-----------
SUMMARY42%of 12
nd6_cache_lladdr---of 42
nd6_create---of 5
nd6_free---of 11
nd6_ifattach34%of 6
nd6_ifdetach---of 6
nd6_init---of 3
nd6_ioctl38%of 27
nd6_is_addr_neighbor---of 25
nd6_llinfo_holdsrc---of 5
nd6_llinfo_missed---of 10
nd6_llinfo_output---of 1
nd6_llinfo_reachable---of 1
nd6_llinfo_release_pkts---of 14
nd6_llinfo_retrans---of 1
nd6_lookup---of 1
nd6_need_cache---of 10
nd6_nud_enabled---of 1
nd6_nud_hint---of 3
nd6_option_init---of 3
nd6_options---of 33
nd6_purge---of 6
nd6_purge_entry---of 7
nd6_resolve---of 14
nd6_rtrequest---of 46
nd6_setifflags23%of 45
nd6_slowtimo---of 10
nd6_sysctl---of 9
nd6_timer---of 1
nd6_timer_work---of 20
x86_curlwp---of 1
-----------
SUMMARY29%of 78
ipsec4_output---of 16
ipsec6_check_policy---of 8
ipsec6_udp_cksum---of 7
ipsec_address---of 4
ipsec_attach---of 1
ipsec_checkpolicy---of 18
ipsec_chkreplay---of 11
ipsec_delete_pcbpolicy50%of 10
ipsec_get_policy---of 14
ipsec_get_reqlevel---of 40
ipsec_getpolicybyaddr---of 14
ipsec_getpolicybysock---of 92
ipsec_hdrsiz---of 9
ipsec_in_reject---of 24
ipsec_init_pcbpolicy50%of 8
ipsec_invalpcbcacheall---of 1
ipsec_ip_input_checkpolicy---of 5
ipsec_logsastr---of 9
ipsec_mtu---of 8
ipsec_pcbconn60%of 5
ipsec_pcbdisconn60%of 5
ipsec_set_policy20%of 30
ipsec_setspidx---of 54
ipsec_sp_hdrsiz---of 22
ipsec_updatereplay---of 25
key_get_default_sp---of 12
xform_init---of 8
xform_register---of 1
-----------
SUMMARY37%of 58
st_interpret_sense---of 77
st_load---of 13
st_mode_select---of 8
st_rdpos---of 14
st_rewind---of 9
st_space---of 34
st_unmount---of 11
st_write_filemarks---of 12
stattach---of 24
stclose---of 18
stdetach---of 1
stdone---of 7
stdump---of 1
stioctl---of 71
stopen3%of 67
stread---of 1
strestart---of 1
ststart---of 24
ststrategy---of 8
stwrite---of 1
-----------
SUMMARY3%of 67
module_load_vfs17%of 30
module_load_vfs_init---of 1
x86_curlwp---of 1
-----------
SUMMARY17%of 30
filt_tunrdetach---of 1
filt_tunread---of 10
if_tun_modcmd---of 5
tun_clone_create50%of 6
tun_clone_destroy---of 20
tun_find_zunit19%of 11
tun_i_softintr---of 4
tun_ioctl---of 34
tun_o_softintr---of 4
tun_output---of 39
tunattach---of 1
tunclose16%of 25
tunioctl23%of 27
tunkqfilter---of 7
tunopen54%of 13
tunpoll37%of 11
tunread---of 21
tunwrite26%of 39
-----------
SUMMARY28%of 132
mount_listener_cb100%of 5
usermount_common_policy---of 3
vfs_attach---of 8
vfs_delref100%of 1
vfs_detach16%of 13
vfs_opv_free---of 4
vfs_opv_init---of 17
vfs_reinit---of 6
vfsinit---of 6
vn_default_error100%of 1
-----------
SUMMARY45%of 20
com_attach_subr---of 44
com_break---of 4
com_cleanup---of 3
com_common_getc---of 14
com_config---of 7
com_detach---of 10
com_hwiflow---of 4
com_iflush43%of 7
com_init_regs---of 1
com_init_regs_stride---of 1
com_init_regs_stride_width---of 4
com_intr_poll---of 1
com_is_console---of 5
com_loadchannelregs43%of 14
com_modem---of 7
com_probe_subr---of 3
com_read_1100%of 1
com_read_4---of 1
com_resume---of 1
com_shutdown32%of 22
com_suspend---of 1
com_to_tiocm---of 1
com_write_1100%of 1
com_write_4---of 1
com_write_multi_1100%of 1
com_write_multi_4---of 4
comclose29%of 7
comcnattach---of 3
comcnattach1---of 3
comcngetc---of 1
comcnpollc---of 1
comcnputc37%of 11
comdiag---of 1
comhwiflow---of 14
cominit---of 17
comintr---of 66
comioctl31%of 42
comopen35%of 40
comparam36%of 68
compoll50%of 4
comprobe1---of 3
comread50%of 4
comsoft---of 39
comspeed---of 5
comstart50%of 10
comstop50%of 4
comtty---of 1
comwrite50%of 4
tiocm_to_com---of 8
-----------
SUMMARY38%of 240
procfs_allocvp100%of 1
procfs_doemul---of 1
procfs_proc_find60%of 5
procfs_proc_lock63%of 8
procfs_proc_unlock100%of 1
procfs_rw10%of 32
procfs_use_linux_compat100%of 1
vfs_findname---of 5
vfs_getuserstr---of 7
x86_curlwp100%of 1
-----------
SUMMARY31%of 49
bt_init---of 1
hci_ctloutput_wrapper100%of 1
l2cap_ctloutput_wrapper---of 1
netbt_modcmd---of 4
rfcomm_ctloutput_wrapper100%of 1
sco_ctloutput_wrapper100%of 1
-----------
SUMMARY100%of 3
_icmp6_input---of 345
icmp6_ctloutput34%of 9
icmp6_errcount23%of 9
icmp6_error38%of 53
icmp6_error2---of 9
icmp6_init---of 1
icmp6_input---of 1
icmp6_mtudisc_callback_register---of 6
icmp6_mtudisc_timeout---of 8
icmp6_mtudisc_update---of 26
icmp6_redirect_input---of 43
icmp6_redirect_output---of 38
icmp6_redirect_timeout---of 7
icmp6_reflect25%of 41
icmp6_statinc---of 3
ni6_nametodns---of 36
sysctl_net_inet6_icmp6_redirtimeout---of 7
sysctl_net_inet6_icmp6_stats---of 1
-----------
SUMMARY32%of 112
compare_ugen---of 1
compare_ugen_key---of 1
filt_ugenrdetach---of 1
filt_ugenread_bulk---of 5
filt_ugenread_intr---of 3
filt_ugenread_isoc---of 5
filt_ugenwrite_bulk---of 5
ugen_activate---of 3
ugen_attach---of 1
ugen_bulkra_intr---of 11
ugen_bulkwb_intr---of 11
ugen_detach---of 34
ugen_do_close---of 43
ugen_get_cdesc---of 8
ugen_isoc_rintr---of 13
ugen_match---of 5
ugen_modcmd---of 3
ugen_set_config---of 68
ugenclose---of 10
ugenif_acquire40%of 5
ugenif_attach---of 20
ugenif_match---of 1
ugenintr---of 4
ugenioctl---of 115
ugenkqfilter---of 12
ugenopen9%of 34
ugenpoll---of 25
ugenread---of 57
ugenwrite---of 45
-----------
SUMMARY13%of 39
npf_config_create---of 1
npf_config_destroy---of 9
npf_config_enter100%of 1
npf_config_exit100%of 1
npf_config_fini---of 9
npf_config_init---of 3
npf_config_load---of 20
npf_config_locked_p---of 1
npf_config_natset---of 6
npf_config_read_enter---of 1
npf_config_read_exit---of 1
npf_config_ruleset---of 6
npf_config_sync---of 3
npf_config_tableset---of 6
npf_default_pass---of 6
-----------
SUMMARY100%of 2
hash_df_block84%of 6
nist_sha256_hash_drbg_destroy---of 1
nist_sha256_hash_drbg_generate53%of 19
nist_sha256_hash_drbg_initialize---of 16
nist_sha256_hash_drbg_instantiate100%of 1
nist_sha256_hash_drbg_reseed---of 1
-----------
SUMMARY62%of 26
compat_50_sys___ntp_gettime30---of 5
compat_50_sys_adjtime---of 7
compat_50_sys_aio_suspend---of 7
compat_50_sys_clock_getres100%of 4
compat_50_sys_clock_gettime100%of 3
compat_50_sys_clock_settime100%of 3
compat_50_sys_getitimer100%of 3
compat_50_sys_getrusage67%of 3
compat_50_sys_gettimeofday---of 6
compat_50_sys_mq_timedreceive---of 6
compat_50_sys_mq_timedsend---of 4
compat_50_sys_nanosleep100%of 4
compat_50_sys_setitimer88%of 8
compat_50_sys_settimeofday---of 3
compat_50_sys_timer_gettime---of 3
compat_50_sys_timer_settime---of 4
compat_sysctl_time---of 1
kern_time_50_fini---of 1
kern_time_50_init---of 1
-----------
SUMMARY93%of 28
ah4_ctlinput_wrapper100%of 1
esp4_ctlinput_wrapper100%of 1
rip_ctlinput_wrapper100%of 1
rip_ctloutput_wrapper100%of 1
sockaddr_in_addr---of 3
sockaddr_in_cmp---of 6
sockaddr_in_const_addr---of 3
tcp_ctlinput_wrapper100%of 1
tcp_ctloutput_wrapper100%of 1
udp_ctlinput_wrapper100%of 1
udp_ctloutput_wrapper100%of 1
-----------
SUMMARY100%of 8
in6_get_hw_ifid---of 41
in6_ifattach13%of 49
in6_ifdetach---of 1
in6_nigroup---of 12
x86_curlwp100%of 1
-----------
SUMMARY15%of 50
uvm_deallocate67%of 3
-----------
SUMMARY67%of 3
npf_ifmap_copylogname---of 5
npf_ifmap_copyname---of 5
npf_ifmap_fini---of 1
npf_ifmap_flush---of 4
npf_ifmap_getid---of 1
npf_ifmap_init---of 3
npf_ifmap_register---of 13
npfk_ifmap_attach43%of 7
npfk_ifmap_detach---of 1
-----------
SUMMARY43%of 7
hash_value---of 1
hash_value_ensure_initialized---of 3
old_sysctl100%of 7
random_address_init---of 1
sys___sysctl---of 14
sysctl_copyin---of 4
sysctl_copyinstr---of 4
sysctl_copyout---of 4
sysctl_create25%of 169
sysctl_createv45%of 72
sysctl_describe---of 71
sysctl_destroy---of 66
sysctl_destroyv---of 16
sysctl_dispatch18%of 23
sysctl_finalize---of 1
sysctl_free---of 21
sysctl_init---of 4
sysctl_locate56%of 27
sysctl_lock---of 3
sysctl_log_print---of 8
sysctl_lookup41%of 49
sysctl_map_flags---of 6
sysctl_mmap---of 12
sysctl_needfunc---of 6
sysctl_notavail---of 4
sysctl_null---of 1
sysctl_query---of 41
sysctl_relock---of 1
sysctl_teardown---of 14
sysctl_unlock---of 1
x86_curlwp100%of 1
-----------
SUMMARY35%of 348
do_fcntl_lock82%of 16
do_posix_fadvise---of 4
do_sys_fstat100%of 3
dodup80%of 5
sys___fstat50100%of 4
sys___posix_fadvise50100%of 4
sys_close75%of 4
sys_dup100%of 3
sys_dup2100%of 1
sys_dup3100%of 1
sys_fcntl69%of 58
sys_flock59%of 12
sys_fpathconf75%of 4
sys_pipe100%of 3
sys_pipe2100%of 4
x86_curlwp100%of 1
-----------
SUMMARY77%of 123
compat43_set_accrights---of 5
compat_43_sys_accept34%of 6
compat_43_sys_getpeername75%of 4
compat_43_sys_getsockname100%of 4
compat_43_sys_recv100%of 1
compat_43_sys_recvfrom50%of 6
compat_43_sys_recvmsg50%of 18
compat_43_sys_send100%of 1
compat_43_sys_sendmsg34%of 9
uipc_syscalls_43_fini---of 1
uipc_syscalls_43_init---of 1
-----------
SUMMARY54%of 49
lookup_crossmount46%of 24
lookup_for_nfsd---of 21
lookup_for_nfsd_index---of 33
lookup_once73%of 43
lookup_parsepath86%of 14
namei58%of 19
namei_hash---of 7
namei_simple_kernel100%of 1
namei_simple_user100%of 1
namei_tryemulroot42%of 297
nameiat_simple_kernel30%of 17
nameiat_simple_user65%of 17
pathbuf_assimilate100%of 1
pathbuf_copyin58%of 7
pathbuf_copystring100%of 1
pathbuf_create50%of 4
pathbuf_destroy60%of 5
pathbuf_maybe_copyin67%of 3
pathbuf_stringcopy_get100%of 3
pathbuf_stringcopy_put72%of 7
relookup48%of 23
x86_curlwp100%of 1
-----------
SUMMARY49%of 488
ufs_blkatoff43%of 14
ufs_dirbad---of 1
ufs_dirbadentry43%of 14
ufs_dirempty57%of 16
ufs_direnter56%of 60
ufs_dirremove52%of 27
ufs_dirrewrite47%of 15
ufs_lookup73%of 112
ufs_makedirentry67%of 3
-----------
SUMMARY61%of 261
do_enable35%of 20
pms_disable---of 3
pms_enable67%of 3
pms_ioctl40%of 5
pms_reset_thread---of 10
pms_resume---of 9
pms_sliced_command---of 1
pms_suspend---of 4
pmsattach---of 11
pmsinput---of 17
pmsprobe---of 6
-----------
SUMMARY40%of 28
process_machdep_doxstate58%of 7
process_machdep_validfpu---of 1
process_read_dbregs100%of 1
process_read_fpregs---of 1
process_read_regs67%of 3
process_set_pc67%of 3
process_sstep100%of 1
process_write_dbregs67%of 3
process_write_fpregs---of 1
process_write_regs---of 7
ptrace_machdep_dorequest45%of 18
-----------
SUMMARY56%of 36
rw_obj_alloc100%of 1
rw_obj_free72%of 7
rw_obj_hold60%of 5
rw_obj_refcnt100%of 1
rw_obj_tryalloc67%of 3
-----------
SUMMARY71%of 17
lf_advlock51%of 113
lf_alloc50%of 4
lf_clearlock67%of 12
lf_findoverlap81%of 26
lf_init---of 1
lf_split50%of 4
lf_wakelock42%of 12
x86_curlwp100%of 1
-----------
SUMMARY56%of 172
secmodel_extensions_system_cb23%of 9
secmodel_extensions_vfs_start---of 1
secmodel_extensions_vfs_stop---of 1
secmodel_extensions_vfs_sysctl---of 1
secmodel_extensions_vnode_cb45%of 9
-----------
SUMMARY34%of 18
compat_70_iflist_addr---of 7
compat_70_rt_newaddrmsg145%of 9
rtsock_70_fini---of 5
rtsock_70_init---of 1
-----------
SUMMARY45%of 9
compat_50_sys_____semctl1392%of 12
-----------
SUMMARY92%of 12
compat_cvtcmd18%of 68
compat_ifioctl71%of 27
do_compat_cvtcmd100%of 1
if_43_fini---of 5
if_43_init---of 1
x86_curlwp100%of 1
-----------
SUMMARY35%of 97
ntfs_calccfree---of 5
ntfs_done---of 1
ntfs_fhtovp---of 4
ntfs_init---of 1
ntfs_loadvnode---of 24
ntfs_modcmd---of 6
ntfs_mount36%of 14
ntfs_mountfs---of 52
ntfs_mountroot---of 5
ntfs_reinit---of 1
ntfs_root---of 3
ntfs_start---of 1
ntfs_statvfs---of 1
ntfs_sync---of 1
ntfs_unmount---of 64
ntfs_vget---of 1
ntfs_vgetex---of 9
ntfs_vptofh---of 3
x86_curlwp100%of 1
-----------
SUMMARY40%of 15
chacha_core_sse2_impl---of 1
chacha_probe_sse2---of 1
chacha_stream_sse2_impl100%of 1
chacha_stream_xor_sse2_impl---of 1
hchacha_sse2_impl---of 1
xchacha_stream_sse2_impl---of 1
xchacha_stream_xor_sse2_impl---of 1
-----------
SUMMARY100%of 1
layer_access63%of 8
layer_bmap100%of 1
layer_bypass43%of 49
layer_close75%of 4
layer_fsync80%of 5
layer_getattr67%of 3
layer_getpages50%of 8
layer_inactive100%of 1
layer_lookup86%of 14
layer_open100%of 7
layer_print---of 1
layer_putpages72%of 7
layer_reclaim80%of 5
layer_remove100%of 3
layer_rename72%of 7
layer_revoke---of 1
layer_rmdir100%of 3
layer_setattr88%of 16
-----------
SUMMARY67%of 141
fdesc_loadvnode28%of 11
fdesc_modcmd---of 4
fdesc_mount60%of 5
fdesc_root100%of 1
fdesc_start100%of 1
fdesc_sync---of 1
fdesc_sysctl_setup---of 1
fdesc_unmount50%of 4
fdesc_vget---of 1
x86_curlwp100%of 1
-----------
SUMMARY48%of 23
mly_alloc_ccbs---of 8
mly_attach---of 47
mly_ccb_alloc---of 3
mly_ccb_map---of 17
mly_ccb_submit---of 5
mly_ccb_unmap---of 4
mly_check_event---of 11
mly_complete_event---of 29
mly_complete_rescan---of 19
mly_dmamem_alloc---of 6
mly_dmamem_free---of 1
mly_find_ident---of 17
mly_get_xfer_mode---of 9
mly_intr---of 18
mly_ioctl---of 26
mly_match---of 1
mly_release_ccbs---of 4
mly_scan_btl---of 11
mly_scsipi_complete---of 21
mly_scsipi_ioctl---of 6
mly_scsipi_minphys---of 3
mly_scsipi_request---of 22
mly_shutdown---of 7
mly_thread---of 21
mlyclose---of 1
mlyioctl---of 36
mlyopen40%of 5
-----------
SUMMARY40%of 5
coda_abortop---of 1
coda_access---of 8
coda_bmap---of 3
coda_close---of 11
coda_create---of 14
coda_fsync---of 10
coda_getattr---of 11
coda_getpages---of 18
coda_grab_vnode---of 7
coda_inactive---of 9
coda_ioctl---of 19
coda_islocked---of 4
coda_link---of 23
coda_lock---of 4
coda_lookup---of 27
coda_mkdir---of 15
coda_open---of 15
coda_pathconf---of 1
coda_putpages---of 7
coda_rdwr---of 25
coda_read---of 4
coda_readdir---of 33
coda_readlink---of 12
coda_reclaim---of 11
coda_remove---of 17
coda_rename---of 23
coda_rmdir---of 11
coda_setattr---of 22
coda_strategy---of 3
coda_symlink---of 12
coda_unlock---of 4
coda_vnodeopstats_init100%of 3
coda_vop_error---of 4
coda_vop_nop---of 4
coda_write---of 4
make_coda_node---of 10
x86_curlwp---of 1
-----------
SUMMARY100%of 3
_rw_init100%of 1
rw_abort---of 3
rw_destroy60%of 5
rw_downgrade---of 36
rw_dump---of 1
rw_enter60%of 72
rw_exit55%of 33
rw_init100%of 1
rw_lock_held67%of 3
rw_lock_op50%of 4
rw_owner100%of 1
rw_read_held67%of 3
rw_tryenter63%of 16
rw_tryupgrade53%of 21
rw_write_held67%of 3
x86_curlwp100%of 1
-----------
SUMMARY60%of 164
if_stats_fini---of 3
if_stats_init100%of 1
if_stats_to_if_data100%of 1
if_stats_to_if_data_cb---of 5
-----------
SUMMARY100%of 2
uao_create48%of 19
uao_detach59%of 24
uao_dropswap60%of 5
uao_dropswap_range30%of 34
uao_find_swhash_elt30%of 10
uao_find_swslot34%of 9
uao_get45%of 56
uao_init---of 3
uao_pagein_page---of 10
uao_put57%of 25
uao_reference100%of 3
uao_set_pgfl---of 4
uao_set_swslot25%of 20
uao_swap_off---of 41
-----------
SUMMARY44%of 205
if_loop_modcmd---of 5
loioctl70%of 10
loop_clone_create---of 3
loop_clone_destroy---of 3
loop_rtrequest67%of 3
loopattach---of 1
loopinit---of 3
looutput32%of 29
-----------
SUMMARY43%of 42
ccd_components_sysctl---of 18
ccd_info_sysctl---of 8
ccd_modcmd---of 1
ccd_units_sysctl---of 8
ccdattach---of 1
ccdclose34%of 12
ccddetach---of 1
ccdget40%of 10
ccdgetdisklabel---of 19
ccdioctl12%of 172
ccdiodone---of 14
ccdopen34%of 15
ccdread---of 8
ccdsize---of 13
ccdstart---of 42
ccdstrategy28%of 11
ccdthread---of 13
ccdwrite---of 8
sysctl_kern_ccd_setup---of 3
x86_curlwp---of 1
-----------
SUMMARY17%of 220
sb_max_set---of 3
sbappend75%of 8
sbappendaddr66%of 29
sbappendaddrchain---of 26
sbappendcontrol48%of 25
sbappendrecord64%of 11
sbappendstream58%of 7
sbcompress75%of 31
sbcreatecontrol---of 3
sbcreatecontrol1---of 10
sbdrop36%of 28
sbdroprecord58%of 14
sbflush57%of 16
sbinsertoob---of 19
sblock42%of 17
sbrelease67%of 3
sbreserve75%of 12
sbunlock60%of 5
sbwait39%of 13
socantrcvmore67%of 9
socantsendmore67%of 9
socket_print---of 41
sofindproc---of 33
soget67%of 3
soinit2---of 1
soisconnected49%of 27
soisconnecting67%of 3
soisdisconnected67%of 15
soisdisconnecting---of 15
solocked100%of 1
solocked267%of 3
solockreset67%of 3
solockretry---of 6
sonewconn33%of 28
soput58%of 7
soqinsque58%of 14
soqremque41%of 22
soreserve62%of 13
soroverflow---of 10
sosetlock80%of 5
sowait40%of 15
sowakeup62%of 13
x86_curlwp100%of 1
-----------
SUMMARY55%of 410
sw_reg_biodone---of 1
sw_reg_iodone---of 11
sw_reg_start---of 10
swap_off---of 19
swapdrum_sdp_is---of 13
swaplist_find48%of 21
swaplist_insert46%of 22
swaplist_trim58%of 14
swapsys_lock---of 1
swapsys_unlock---of 1
swopen---of 3
swread100%of 1
swstrategy29%of 25
swwrite100%of 1
sys_swapctl65%of 56
sysctl_uvmswap_setup---of 1
uvm_swap_alloc---of 26
uvm_swap_free---of 20
uvm_swap_get---of 7
uvm_swap_init---of 6
uvm_swap_io---of 66
uvm_swap_markbad---of 15
uvm_swap_put---of 1
uvm_swap_shutdown---of 17
uvm_swap_stats56%of 20
uvm_swapisfull---of 4
x86_curlwp---of 1
-----------
SUMMARY53%of 160
sm_init_once---of 1
sysmon_attach_minor---of 5
sysmon_fini---of 1
sysmon_init---of 3
sysmon_modcmd---of 5
sysmonclose---of 6
sysmonioctl50%of 4
sysmonkqfilter---of 4
sysmonopen38%of 8
sysmonpoll---of 3
sysmonread67%of 3
-----------
SUMMARY47%of 15
umap_findid---of 5
umap_mapids53%of 19
umap_reverse_findid40%of 5
-----------
SUMMARY50%of 24
kernconfig_is_held100%of 1
kernconfig_lock50%of 6
kernconfig_lock_init---of 1
kernconfig_unlock58%of 7
x86_curlwp100%of 1
-----------
SUMMARY60%of 15
amap_add50%of 18
amap_adjref_anons63%of 8
amap_alloc58%of 7
amap_alloc134%of 15
amap_copy63%of 40
amap_cow_now35%of 26
amap_ctor58%of 7
amap_dtor38%of 8
amap_extend54%of 84
amap_free60%of 10
amap_lookup59%of 12
amap_lookups60%of 15
amap_pp_adjref70%of 43
amap_pp_establish60%of 10
amap_ref100%of 3
amap_share_protect---of 19
amap_splitref64%of 11
amap_swap_off---of 48
amap_unadd62%of 13
amap_unref86%of 7
amap_wipeout58%of 19
amap_wiperange84%of 18
uvm_amap_init---of 1
-----------
SUMMARY59%of 374
pfil_add_hook---of 19
pfil_add_ihook---of 11
pfil_head_create60%of 10
pfil_head_destroy---of 6
pfil_head_get---of 7
pfil_init---of 1
pfil_list_add---of 14
pfil_list_remove---of 10
pfil_remove_hook---of 7
pfil_remove_ihook---of 9
pfil_run_addrhooks100%of 1
pfil_run_arg50%of 10
pfil_run_hooks32%of 25
pfil_run_ifhooks100%of 1
x86_curlwp100%of 1
-----------
SUMMARY46%of 48
kern_uipc_socket_50_fini---of 7
kern_uipc_socket_50_init---of 1
uipc_socket_50_getopt160%of 5
uipc_socket_50_sbts---of 4
uipc_socket_50_setopt144%of 25
-----------
SUMMARY47%of 30
elf32_check_header---of 6
elf32_copyargs---of 3
elf32_free_emul_arg---of 3
elf32_load_psection---of 17
elf32_populate_auxv---of 16
exec_elf32_makecmds3%of 106
exec_elf32_modcmd---of 1
netbsd_elf32_note---of 29
netbsd_elf32_probe---of 3
netbsd_elf32_signature---of 14
-----------
SUMMARY3%of 106
ubc_alloc44%of 64
ubc_fault41%of 37
ubc_init---of 11
ubc_purge50%of 22
ubc_release42%of 41
ubc_uiomove63%of 16
ubc_zerorange75%of 4
ubchash_stats---of 10
ubchist_init---of 1
-----------
SUMMARY46%of 184
ipcperm100%of 1
sysctl_ipc_setup---of 1
sysctl_kern_sysvipc---of 23
sysv_ipc_modcmd---of 16
sysvipc_listener_cb78%of 9
sysvipcfini---of 3
sysvipcinit---of 3
-----------
SUMMARY80%of 10
compat_50_rnd_ioctl34%of 9
rndpseudo_50_fini---of 3
rndpseudo_50_init---of 1
-----------
SUMMARY34%of 9
compat_60_ptmget_ioctl---of 9
compat_60_ptmioctl---of 3
compat_60_ttioctl50%of 4
kern_tty_60_fini---of 5
kern_tty_60_init---of 1
-----------
SUMMARY50%of 4
elf64_check_header---of 6
elf64_copyargs---of 3
elf64_free_emul_arg---of 3
elf64_load_psection---of 17
elf64_populate_auxv---of 16
exec_elf64_makecmds3%of 105
exec_elf64_modcmd---of 4
netbsd_elf64_note---of 29
netbsd_elf64_probe---of 3
netbsd_elf64_signature---of 14
-----------
SUMMARY3%of 105
b_to_q39%of 21
catq---of 10
clalloc---of 3
clfree---of 5
firstc---of 5
getc25%of 8
ndflush---of 10
ndqb16%of 13
nextc---of 10
putc67%of 9
q_to_b---of 11
unputc---of 8
-----------
SUMMARY36%of 51
compat_80_modstat49%of 29
kern_mod_80_fini---of 3
kern_mod_80_init---of 1
-----------
SUMMARY49%of 29
kpause59%of 12
kpreempt73%of 22
kpreempt_disable100%of 1
kpreempt_disabled67%of 6
kpreempt_enable80%of 5
mi_switch53%of 97
mtsleep42%of 12
preempt72%of 7
preempt_needed80%of 5
preempt_point86%of 7
sched_changepri56%of 9
sched_lendpri45%of 9
sched_pstats---of 33
setrunnable55%of 24
suspendsched---of 24
synch_init---of 1
syncobj_noowner---of 1
tsleep40%of 10
updatertime50%of 10
wakeup---of 3
x86_curlwp100%of 1
yield72%of 7
-----------
SUMMARY58%of 244
ip6_get_prevhdr---of 9
ip6_getdstifaddr100%of 1
ip6_hopopts_input43%of 28
ip6_init---of 14
ip6_lasthdr50%of 8
ip6_nexthdr58%of 21
ip6_notify_pmtu---of 13
ip6_pullexthdr---of 13
ip6_savecontrol---of 32
ip6_statinc---of 3
ip6_unknown_opt72%of 7
ip6intr---of 137
sysctl_net_inet6_ip6_stats---of 1
-----------
SUMMARY53%of 65
umap_modcmd---of 4
umapfs_mount46%of 22
umapfs_sysctl_setup---of 1
umapfs_unmount---of 4
x86_curlwp100%of 1
-----------
SUMMARY48%of 23
cd9660_fhtovp---of 5
cd9660_loadvnode---of 22
cd9660_modcmd---of 4
cd9660_mount24%of 30
cd9660_mountroot---of 5
cd9660_root---of 4
cd9660_start---of 1
cd9660_statvfs---of 1
cd9660_sync---of 1
cd9660_sysctl_setup---of 1
cd9660_unmount---of 4
cd9660_vget---of 4
cd9660_vptofh---of 3
iso_mountfs14%of 52
x86_curlwp100%of 1
-----------
SUMMARY19%of 83
dk_open_parent---of 6
dk_set_geometry---of 3
dkcancel---of 9
dkclose---of 23
dkdiscard---of 19
dkdump---of 10
dkioctl31%of 23
dkiodone---of 7
dkminphys50%of 8
dkopen24%of 39
dkread---of 9
dkrestart---of 1
dksize40%of 5
dkstart50%of 10
dkstrategy50%of 14
dkwedge_add---of 62
dkwedge_attach---of 11
dkwedge_del---of 10
dkwedge_delall---of 1
dkwedge_delall1---of 13
dkwedge_delidle---of 1
dkwedge_detach---of 13
dkwedge_discover---of 16
dkwedge_find_by_parent---of 9
dkwedge_find_by_wname---of 10
dkwedge_find_partition---of 12
dkwedge_get_parent_name---of 5
dkwedge_init---of 22
dkwedge_list---of 9
dkwedge_match---of 1
dkwedge_print_wnames---of 7
dkwedge_read---of 7
dkwrite56%of 9
x86_curlwp---of 1
-----------
SUMMARY37%of 108
lookup_ifnet_table50%of 4
npf_ifaddr_flush---of 3
npf_ifaddr_sync29%of 7
npf_ifaddr_syncall---of 5
replace_ifnet_table---of 5
-----------
SUMMARY37%of 11
pax_aslr_exec_offset---of 5
pax_aslr_init_vm---of 6
pax_aslr_mmap54%of 13
pax_aslr_offset---of 3
pax_aslr_rtld_offset---of 5
pax_aslr_stack---of 6
pax_aslr_stack_gap---of 6
pax_init---of 1
pax_mprotect_maxprotect100%of 1
pax_mprotect_prot100%of 1
pax_mprotect_validate75%of 4
pax_segvguard17%of 24
pax_segvguard_cleanup20%of 10
pax_set_flags---of 4
pax_setup_elf_flags---of 10
sysctl_security_pax_setup---of 1
-----------
SUMMARY34%of 53
filt_logrdetach---of 1
filt_logread50%of 8
initmsgbuf---of 7
logclose---of 1
loginit---of 1
logioctl---of 10
logkqfilter---of 3
logopen50%of 4
logpoll---of 4
logputchar36%of 14
logread---of 14
logsoftintr---of 3
logwakeup50%of 4
sysctl_msgbuf---of 14
-----------
SUMMARY44%of 30
cpu_speculation_init---of 59
mitigation_mds_change_cpu---of 12
mitigation_taa_change_cpu---of 6
mitigation_v2_change_cpu---of 17
mitigation_v4_change_cpu---of 9
speculation_barrier50%of 6
sysctl_machdep_mds_mitigated---of 18
sysctl_machdep_spectreV2_mitigated---of 11
sysctl_machdep_spectreV4_mitigated---of 23
sysctl_machdep_taa_mitigated---of 25
sysctl_speculation_init---of 1
v2_set_name---of 4
-----------
SUMMARY50%of 6
getdiskinfo---of 4
getdisksize34%of 9
opendisk---of 10
-----------
SUMMARY34%of 9
bt_alloc43%of 14
bt_freetrim15%of 27
bt_insbusy75%of 8
bt_insfree47%of 13
bt_refill_locked32%of 29
bt_rembusy50%of 8
bt_remfree50%of 8
bt_remseg43%of 7
pool_page_alloc_vmem_meta100%of 1
pool_page_free_vmem_meta---of 1
qc_poolpage_alloc67%of 3
qc_poolpage_free---of 1
vmem_add---of 1
vmem_add125%of 20
vmem_add_bts---of 7
vmem_alloc56%of 18
vmem_create---of 3
vmem_destroy---of 6
vmem_destroy1---of 26
vmem_dump---of 14
vmem_fit46%of 22
vmem_free72%of 7
vmem_init---of 44
vmem_print---of 1
vmem_printall---of 4
vmem_rehash_all---of 29
vmem_rehash_all_kick---of 1
vmem_rehash_start---of 3
vmem_roundup_size---of 1
vmem_size---of 5
vmem_subsystem_init---of 1
vmem_whatis---of 11
vmem_xalloc40%of 122
vmem_xalloc_addr---of 8
vmem_xcreate---of 3
vmem_xfree47%of 13
vmem_xfree_bt48%of 42
vmem_xfreeall---of 8
-----------
SUMMARY42%of 362
pmap_tlb_cpu_init---of 1
pmap_tlb_init---of 1
pmap_tlb_intr---of 15
pmap_tlb_shootdown82%of 11
pmap_tlb_shootnow49%of 39
-----------
SUMMARY57%of 50
kern_free100%of 3
kern_malloc75%of 4
kern_realloc---of 10
-----------
SUMMARY86%of 7
tcp_congctl_bystruct---of 7
tcp_congctl_init---of 12
tcp_congctl_register---of 12
tcp_congctl_release58%of 7
tcp_congctl_select47%of 15
tcp_congctl_unregister---of 16
tcp_cubic_congestion_exp---of 5
tcp_cubic_fast_retransmit---of 9
tcp_cubic_newack---of 18
tcp_cubic_slow_retransmit---of 7
tcp_newreno_fast_retransmit---of 7
tcp_newreno_fast_retransmit_newack---of 10
tcp_newreno_newack---of 8
tcp_reno_congestion_exp---of 3
tcp_reno_do_fast_retransmit---of 4
tcp_reno_fast_retransmit---of 3
tcp_reno_fast_retransmit_newack---of 7
tcp_reno_newack---of 7
tcp_reno_slow_retransmit---of 3
-----------
SUMMARY50%of 22
procfs_access67%of 3
procfs_close---of 5
procfs_dir---of 18
procfs_getattr10%of 97
procfs_getpages---of 3
procfs_inactive100%of 1
procfs_lookup24%of 55
procfs_open14%of 15
procfs_pathconf67%of 9
procfs_print---of 1
procfs_readdir14%of 114
procfs_readlink---of 26
procfs_reclaim100%of 1
procfs_root_readdir_callback43%of 7
procfs_setattr100%of 1
procfs_validfile---of 4
procfs_validfile_linux40%of 5
x86_curlwp100%of 1
-----------
SUMMARY19%of 309
layerfs_fhtovp---of 6
layerfs_loadvnode80%of 5
layerfs_modcmd---of 1
layerfs_quotactl---of 3
layerfs_renamelock_enter100%of 1
layerfs_renamelock_exit100%of 1
layerfs_root67%of 3
layerfs_snapshot---of 1
layerfs_start100%of 1
layerfs_statvfs50%of 4
layerfs_suspendctl---of 1
layerfs_sync100%of 1
layerfs_vget---of 6
layerfs_vptofh100%of 1
sysctl_vfs_layerfs_setup---of 1
-----------
SUMMARY77%of 17
dk_attach---of 5
dk_close---of 11
dk_detach---of 3
dk_discard---of 15
dk_done---of 5
dk_drain---of 3
dk_dump---of 35
dk_getdefaultlabel---of 3
dk_getdisklabel---of 14
dk_init---of 1
dk_ioctl---of 48
dk_open16%of 19
dk_size---of 10
dk_start53%of 21
dk_strategy43%of 7
dk_strategy_defer---of 7
dk_strategy_pending---of 4
dk_subr_modcmd---of 1
dk_translate34%of 12
x86_curlwp---of 1
-----------
SUMMARY36%of 59
uvm_pgflcache_alloc62%of 13
uvm_pgflcache_fill73%of 18
uvm_pgflcache_fini_cpu---of 19
uvm_pgflcache_free60%of 10
uvm_pgflcache_init---of 1
uvm_pgflcache_pause---of 3
uvm_pgflcache_resume---of 7
uvm_pgflcache_spill62%of 13
uvm_pgflcache_start---of 11
-----------
SUMMARY65%of 54
_rtcache_init56%of 20
db_show_routes---of 1
db_show_rtentry---of 34
ifa_ifwithroute_psref13%of 31
route_listener_cb67%of 3
rt_check_reject_route50%of 6
rt_delete_matched_entries---of 22
rt_free58%of 7
rt_free_work---of 15
rt_get_ifa---of 4
rt_getifa38%of 16
rt_gettag---of 1
rt_ifa_addlocal50%of 14
rt_ifa_remlocal---of 17
rt_init---of 3
rt_newmsg---of 3
rt_newmsg_dynamic---of 9
rt_replace_ifa---of 36
rt_replace_ifa_matched_entries---of 12
rt_setgate28%of 22
rt_settag---of 4
rt_timer_add---of 26
rt_timer_count---of 1
rt_timer_queue_change---of 1
rt_timer_queue_create---of 8
rt_timer_queue_destroy---of 24
rt_timer_timer---of 1
rt_timer_work---of 23
rt_unref58%of 7
rt_update---of 60
rt_update_finish---of 1
rt_update_prepare---of 8
rt_walktree---of 1
rtalloc1---of 1
rtalloc1_locked50%of 8
rtcache_copy---of 21
rtcache_free75%of 4
rtcache_init100%of 1
rtcache_init_noclone---of 1
rtcache_lookup240%of 25
rtcache_percpu_alloc---of 1
rtcache_percpu_init_cpu---of 1
rtcache_setdst42%of 17
rtcache_unref100%of 1
rtcache_update100%of 1
rtcache_validate86%of 7
rtinit37%of 36
rtredirect---of 41
rtrequest100%of 1
rtrequest138%of 124
x86_curlwp100%of 1
-----------
SUMMARY41%of 352
in4_cksum34%of 6
-----------
SUMMARY34%of 6
do_tcpinit---of 1
ipsec4_hdrsiz_tcp---of 6
ipsec6_hdrsiz_tcp---of 6
tcp6_ctlinput17%of 18
tcp6_mtudisc---of 8
tcp6_mtudisc_callback---of 1
tcp_close27%of 19
tcp_ctlinput15%of 27
tcp_drain---of 10
tcp_drainstub---of 1
tcp_drop---of 8
tcp_established---of 22
tcp_fasttimo---of 3
tcp_freeq16%of 19
tcp_hdrsz---of 1
tcp_init---of 7
tcp_init_common---of 7
tcp_iss_secret_init---of 1
tcp_mss_from_peer---of 17
tcp_mss_to_advertise67%of 9
tcp_mtudisc---of 8
tcp_mtudisc_callback---of 1
tcp_new_iss75%of 4
tcp_new_iss150%of 4
tcp_newtcpcb60%of 5
tcp_notify---of 21
tcp_optlen100%of 1
tcp_quench---of 3
tcp_respond---of 71
tcp_rmx_rtt---of 10
tcp_statadd---of 3
tcp_statinc---of 3
tcp_tcpcb_template---of 1
tcp_template39%of 34
-----------
SUMMARY31%of 140
ipi_broadcast---of 15
ipi_cpu_handler---of 9
ipi_mark_pending56%of 9
ipi_msg_cpu_handler---of 10
ipi_multicast---of 16
ipi_percpu_init---of 6
ipi_register---of 9
ipi_sysinit---of 1
ipi_trigger60%of 5
ipi_trigger_broadcast---of 1
ipi_trigger_multi---of 1
ipi_trigger_multi_internal---of 16
ipi_unicast---of 11
ipi_unregister---of 6
ipi_wait---of 10
put_msg---of 12
-----------
SUMMARY58%of 14
compat_50_sys___fhstat40---of 3
compat_50_sys___fstat30100%of 3
compat_50_sys___lstat30100%of 3
compat_50_sys___stat30100%of 3
compat_50_sys_futimes100%of 5
compat_50_sys_lfs_segwait---of 1
compat_50_sys_lutimes75%of 4
compat_50_sys_mknod100%of 1
compat_50_sys_utimes100%of 4
vfs_syscalls_50_fini---of 1
vfs_syscalls_50_init---of 1
-----------
SUMMARY96%of 23
secmodel_securelevel_device_cb58%of 14
secmodel_securelevel_init---of 1
secmodel_securelevel_machdep_cb29%of 7
secmodel_securelevel_network_cb50%of 4
secmodel_securelevel_process_cb67%of 6
secmodel_securelevel_start---of 1
secmodel_securelevel_stop---of 1
secmodel_securelevel_sysctl---of 5
secmodel_securelevel_system_cb54%of 15
secmodel_securelevel_vnode_cb100%of 3
securelevel_eval---of 3
securelevel_modcmd---of 7
sysctl_security_securelevel_setup---of 1
-----------
SUMMARY56%of 49
efs_done---of 1
efs_fhtovp---of 6
efs_init---of 1
efs_loadvnode---of 14
efs_modcmd---of 4
efs_mount22%of 28
efs_reinit---of 1
efs_root---of 3
efs_start---of 1
efs_statvfs---of 1
efs_unmount---of 3
efs_vget---of 4
efs_vptofh---of 3
x86_curlwp100%of 1
-----------
SUMMARY25%of 29
compat_ifmediareq_post---of 8
compat_ifmediareq_pre50%of 6
ifmedia_80_fini---of 5
ifmedia_80_init---of 1
-----------
SUMMARY50%of 6
explicit_memset100%of 1
-----------
SUMMARY100%of 1
raw_ctlinput100%of 1
raw_input55%of 33
raw_send46%of 11
raw_setpeeraddr100%of 1
raw_setsockaddr100%of 1
raw_usrreq---of 23
-----------
SUMMARY56%of 47
fd_motor_off---of 11
fd_motor_on---of 4
fd_mountroot_hook---of 4
fd_nvtotype---of 3
fd_set_motor---of 11
fdattach---of 11
fdc_childdet---of 3
fdcattach---of 5
fdcdetach---of 3
fdcfinishattach---of 31
fdcintr---of 1
fdcintr1---of 197
fdcintrcb---of 1
fdclose---of 1
fdcresult---of 7
fdcresume---of 1
fdcretry---of 10
fdcstart---of 6
fdcstatus---of 13
fdcsuspend---of 22
fdctimeout---of 3
fddetach---of 3
fdfinish---of 8
fdformat---of 3
fdioctl---of 41
fdopen19%of 11
fdprint---of 3
fdprobe---of 31
fdread---of 1
fdstart---of 10
fdstrategy---of 17
fdwrite---of 1
out_fdc---of 4
-----------
SUMMARY19%of 11
uvm_availmem100%of 1
uvm_cpu_attach---of 3
uvm_page_init---of 32
uvm_page_lookup_freelist67%of 3
uvm_page_numa_load---of 3
uvm_page_owner_locked_p100%of 6
uvm_page_physget---of 12
uvm_page_print_freelists---of 9
uvm_page_printall---of 7
uvm_page_printit---of 20
uvm_page_rebucket---of 13
uvm_page_recolor---of 1
uvm_page_redim---of 49
uvm_page_unbusy40%of 28
uvm_pageactivate60%of 10
uvm_pagealloc_pgb62%of 26
uvm_pagealloc_pgfl63%of 8
uvm_pagealloc_strat40%of 78
uvm_pageboot_alloc---of 11
uvm_pagecopy100%of 1
uvm_pagedeactivate55%of 11
uvm_pagedequeue70%of 10
uvm_pageenqueue37%of 11
uvm_pagefree38%of 66
uvm_pageinsert_object67%of 15
uvm_pageismanaged---of 1
uvm_pagelock100%of 1
uvm_pagelock2---of 1
uvm_pagelookup56%of 9
uvm_pagereadonly_p78%of 9
uvm_pagerealloc24%of 17
uvm_pageremove_object67%of 15
uvm_pagereplace---of 20
uvm_pageunlock100%of 3
uvm_pageunlock2---of 9
uvm_pageunwire54%of 15
uvm_pagewait40%of 10
uvm_pagewakeup60%of 5
uvm_pagewanted_p---of 8
uvm_pagewire59%of 12
uvm_pagezero---of 1
uvm_pgfl_lock---of 1
uvm_pgfl_unlock---of 1
uvm_setpagesize---of 6
uvm_vm_page_to_phys100%of 1
x86_curlwp100%of 1
-----------
SUMMARY50%of 372
copyout_msg_control55%of 11
copyout_sockname60%of 15
copyout_sockname_sb---of 14
do_sys_accept67%of 33
do_sys_bind---of 3
do_sys_connect41%of 22
do_sys_getpeername70%of 10
do_sys_getsockname63%of 8
do_sys_peeloff100%of 1
do_sys_recvmsg100%of 3
do_sys_recvmsg_so84%of 37
do_sys_sendmsg67%of 6
do_sys_sendmsg_so79%of 52
free_control_mbuf67%of 18
getsockopt84%of 12
sockargs52%of 25
sys___socket30100%of 3
sys_accept62%of 13
sys_bind100%of 9
sys_connect100%of 7
sys_getpeername60%of 10
sys_getsockname90%of 10
sys_getsockopt100%of 1
sys_getsockopt2---of 1
sys_listen100%of 3
sys_paccept32%of 16
sys_recvfrom100%of 5
sys_recvmmsg75%of 39
sys_recvmsg79%of 19
sys_sendmmsg100%of 10
sys_sendmsg100%of 3
sys_sendto100%of 1
sys_setsockopt100%of 10
sys_shutdown63%of 8
sys_socketpair79%of 14
x86_curlwp100%of 1
-----------
SUMMARY73%of 435
pat_init---of 3
pmap_activate50%of 8
pmap_activate_sync---of 12
pmap_bootstrap---of 46
pmap_changeprot_local---of 7
pmap_check_pv70%of 20
pmap_clear_attrs58%of 14
pmap_compare_key---of 1
pmap_compare_nodes67%of 3
pmap_copy_page50%of 4
pmap_cpu_init_late---of 1
pmap_create100%of 1
pmap_ctor45%of 9
pmap_deactivate57%of 16
pmap_deactivate_sync---of 15
pmap_destroy57%of 32
pmap_drain_pv47%of 15
pmap_dtor34%of 6
pmap_dump---of 16
pmap_enter67%of 3
pmap_enter_ma52%of 182
pmap_extract_ma65%of 17
pmap_find_ptp70%of 13
pmap_fork34%of 6
pmap_free_ptp50%of 12
pmap_free_pv41%of 27
pmap_freepage60%of 15
pmap_get_physpage---of 8
pmap_growkernel---of 32
pmap_init---of 1
pmap_init_tmp_pgtbl---of 7
pmap_is_curpmap---of 3
pmap_is_user---of 3
pmap_kenter_ma50%of 12
pmap_kremove100%of 1
pmap_kremove165%of 14
pmap_kremove_local---of 1
pmap_ldt_cleanup---of 5
pmap_ldt_sync---of 3
pmap_ldt_xcall---of 3
pmap_load62%of 13
pmap_load156%of 9
pmap_lookup_pv58%of 21
pmap_map_ptes78%of 9
pmap_page_remove100%of 1
pmap_pdes_valid---of 9
pmap_pp_clear_attrs50%of 16
pmap_pp_remove59%of 63
pmap_pv_clear_attrs---of 3
pmap_pv_remove---of 3
pmap_pvp_ctor75%of 8
pmap_pvp_dtor60%of 5
pmap_reactivate67%of 9
pmap_reference---of 1
pmap_remove63%of 45
pmap_remove_all---of 73
pmap_remove_pte50%of 28
pmap_remove_pv53%of 46
pmap_resident_count100%of 1
pmap_sync_pv55%of 55
pmap_test_attrs---of 18
pmap_unget_ptp---of 20
pmap_unmap_ptes65%of 14
pmap_unwire30%of 17
pmap_update56%of 18
pmap_virtual_space---of 1
pmap_vpage_cpu_init---of 5
pmap_wired_count100%of 1
pmap_write_protect54%of 26
pmap_zero_page67%of 3
slotspace_rand---of 21
vtophys100%of 1
x86_curlwp100%of 1
x86_mmap_flags100%of 1
-----------
SUMMARY56%of 841
kmeminit_nkmempages---of 3
uvm_km_alloc55%of 22
uvm_km_bootstrap---of 12
uvm_km_check_empty57%of 16
uvm_km_free42%of 12
uvm_km_init---of 1
uvm_km_kmem_alloc50%of 12
uvm_km_kmem_free100%of 1
uvm_km_pgremove47%of 15
uvm_km_pgremove_intrsafe60%of 25
uvm_km_protect---of 1
uvm_km_suballoc---of 8
uvm_km_va_starved_p---of 3
-----------
SUMMARY54%of 103
copyin_pid---of 7
copyin_proc---of 6
copyin_vmspace---of 5
copyout_proc---of 6
copyout_vmspace---of 5
ioctl_copyin---of 3
ioctl_copyout---of 3
ucas_int---of 1
ucas_ptr---of 1
ufetch_16---of 1
ufetch_32---of 1
ufetch_64---of 1
ufetch_8---of 1
uio_setup_sysspace100%of 1
uiomove73%of 22
uiomove_frombuf---of 4
uiopeek48%of 19
uioskip60%of 15
ureadc---of 8
ustore_16---of 1
ustore_32---of 1
ustore_char100%of 1
ustore_long---of 1
x86_curlwp100%of 1
-----------
SUMMARY63%of 59
_lwp_getspecific_by_lwp---of 1
lwp_finispecific100%of 1
lwp_getspecific100%of 1
lwp_initspecific67%of 3
lwp_setspecific---of 1
lwp_setspecific_by_lwp---of 1
lwp_specific_key_create---of 1
lwp_specific_key_delete---of 1
lwpinit_specificdata---of 3
x86_curlwp100%of 1
-----------
SUMMARY84%of 6
cpu_lwp_fork67%of 9
cpu_lwp_free40%of 5
cpu_lwp_free260%of 5
cpu_proc_fork100%of 1
cpu_uarea_alloc50%of 4
cpu_uarea_free---of 5
kvtop---of 3
vmapbuf67%of 6
vunmapbuf---of 3
x86_curlwp100%of 1
-----------
SUMMARY62%of 31
mfs_bmap---of 7
mfs_close---of 11
mfs_doio---of 5
mfs_inactive67%of 3
mfs_open---of 3
mfs_print---of 3
mfs_reclaim67%of 3
mfs_strategy25%of 12
x86_curlwp100%of 1
-----------
SUMMARY43%of 19
cache_activate---of 9
cache_compare_nodes80%of 5
cache_cpu_init---of 3
cache_cross_mount100%of 1
cache_deactivate19%of 16
cache_enter50%of 60
cache_enter_id100%of 3
cache_enter_mount55%of 11
cache_have_id58%of 7
cache_lookup54%of 60
cache_lookup_entry63%of 16
cache_lookup_linked63%of 62
cache_lookup_mount67%of 3
cache_lookup_raw---of 1
cache_purge131%of 26
cache_purge_children84%of 6
cache_purgevfs75%of 4
cache_remove57%of 37
cache_revlookup47%of 49
cache_stat_sysctl---of 4
cache_update_stats---of 28
cache_vdir_filter100%of 1
cache_vnode_fini60%of 5
cache_vnode_init100%of 1
namecache_count_2passes60%of 5
namecache_count_pass260%of 5
namecache_print---of 22
nchinit---of 5
x86_curlwp100%of 1
-----------
SUMMARY54%of 384
cpu_spawn_return---of 1
md_child_return---of 1
syscall69%of 19
syscall_intern100%of 1
userret64%of 19
x86_curlwp100%of 1
-----------
SUMMARY68%of 40
-----------
SUMMARY---of 0
joyattach---of 3
joyclose---of 1
joydetach---of 1
joyioctl---of 9
joyopen50%of 4
joyread---of 9
-----------
SUMMARY50%of 4
exec_script_makecmds67%of 33
exec_script_modcmd---of 5
-----------
SUMMARY67%of 33
if_vioif_modcmd---of 1
vioif_attach---of 134
vioif_cfg_softint---of 1
vioif_config_change---of 1
vioif_ctrl_intr---of 4
vioif_ctrl_release---of 5
vioif_ctrl_send_command---of 19
vioif_deferred_transmit---of 1
vioif_finalize_teardown---of 3
vioif_ifflags13%of 16
vioif_ifflags_cb---of 1
vioif_init---of 21
vioif_ioctl60%of 5
vioif_match---of 1
vioif_net_dequeue_commit---of 3
vioif_net_enqueue60%of 5
vioif_net_sched_handle---of 11
vioif_populate_rx_mbufs_locked---of 17
vioif_rx_deq_locked---of 16
vioif_rx_filter10%of 20
vioif_rx_handle---of 5
vioif_rx_handle_locked---of 10
vioif_rx_intr---of 4
vioif_send_common_locked30%of 30
vioif_set_rx_filter---of 8
vioif_start100%of 1
vioif_stop---of 30
vioif_transmit---of 6
vioif_tx_deq_locked---of 14
vioif_tx_handle---of 5
vioif_tx_handle_locked---of 13
vioif_tx_intr---of 4
vioif_update_link_status---of 7
vioif_watchdog---of 9
vioif_workq_work---of 3
x86_curlwp---of 1
-----------
SUMMARY26%of 77
ptmattach---of 4
ptmclose---of 1
ptmioctl---of 19
ptmopen---of 12
pty_alloc_master---of 13
pty_fill_ptmget---of 4
pty_getmp67%of 3
pty_grant_slave34%of 6
pty_makedev100%of 1
pty_sethandler---of 1
pty_vn_open---of 6
x86_curlwp---of 1
-----------
SUMMARY50%of 10
add_suspensor---of 35
complete_suspension---of 51
device_pmf_remove_suspensor---of 39
input_activity_handler---of 3
input_idle---of 7
pmf_check_system_drivers---of 8
pmf_class_display_deregister---of 9
pmf_class_display_register---of 6
pmf_class_input_deregister---of 1
pmf_class_input_register---of 3
pmf_class_network_register---of 1
pmf_class_network_resume---of 6
pmf_class_network_suspend---of 1
pmf_device_deregister100%of 1
pmf_device_descendants_release---of 6
pmf_device_descendants_resume---of 10
pmf_device_recursive_resume---of 5
pmf_device_recursive_suspend---of 7
pmf_device_register1100%of 1
pmf_device_resume---of 30
pmf_device_subtree_release---of 1
pmf_device_subtree_resume---of 3
pmf_device_suspend---of 22
pmf_event_deregister---of 14
pmf_event_inject---of 7
pmf_event_register---of 3
pmf_event_worker---of 12
pmf_get_platform---of 3
pmf_init---of 8
pmf_qual_recursive_copy---of 1
pmf_self_suspensor_init---of 1
pmf_set_platform---of 4
pmf_suspend_worker---of 15
pmf_system_bus_resume---of 9
pmf_system_resume---of 14
pmf_system_shutdown---of 8
pmf_system_suspend---of 11
sysctl_pmf_setup---of 1
-----------
SUMMARY100%of 2
x86_curlwp---of 1
xc__highpri_intr---of 9
xc_barrier50%of 10
xc_broadcast40%of 10
xc_encode_ipl67%of 3
xc_highpri54%of 13
xc_init_cpu---of 12
xc_ipi_handler---of 5
xc_lowpri50%of 14
xc_nop---of 1
xc_thread---of 14
xc_unicast36%of 14
xc_wait60%of 10
-----------
SUMMARY49%of 74
-----------
SUMMARY---of 0
minphys100%of 3
physio61%of 46
physio_biodone50%of 10
physio_done---of 17
physio_init---of 3
x86_curlwp100%of 1
-----------
SUMMARY62%of 60
_netbsd_keccakf1600100%of 3
-----------
SUMMARY100%of 3
cgaccount---of 76
expunge---of 41
ffs_copyonwrite---of 56
ffs_snapblkfree5%of 60
ffs_snapgone---of 28
ffs_snapremove---of 55
ffs_snapshot---of 262
ffs_snapshot_fini---of 5
ffs_snapshot_init100%of 1
ffs_snapshot_mount---of 27
ffs_snapshot_read---of 17
ffs_snapshot_unmount---of 15
fullacct---of 3
indiracct---of 18
mapacct---of 29
snapacct---of 44
snapblkaddr---of 19
snapshot_expunge_selector---of 8
syncsnap---of 10
x86_curlwp100%of 1
-----------
SUMMARY9%of 62
mq_close_fop---of 6
mq_handle_open---of 33
mq_listener_cb67%of 3
mq_poll_fop---of 7
mq_recv1---of 31
mq_send1---of 44
mq_stat_fop---of 1
mqueue_destroy---of 15
mqueue_get---of 6
mqueue_modcmd---of 7
mqueue_print_list---of 4
mqueue_sysctl_init---of 3
sys___mq_timedreceive50---of 6
sys___mq_timedsend50---of 4
sys_mq_close---of 1
sys_mq_getattr---of 4
sys_mq_notify---of 11
sys_mq_open---of 6
sys_mq_receive---of 3
sys_mq_send---of 1
sys_mq_setattr---of 8
sys_mq_unlink---of 17
x86_curlwp---of 1
-----------
SUMMARY67%of 3
kobj_load_vfs100%of 1
-----------
SUMMARY100%of 1
_prop_generic_internalize25%of 12
_prop_object_externalize_append_char---of 10
_prop_object_externalize_append_cstring---of 5
_prop_object_externalize_append_encoded_cstring---of 22
_prop_object_externalize_context_alloc---of 4
_prop_object_externalize_context_free---of 1
_prop_object_externalize_empty_tag---of 13
_prop_object_externalize_end_tag---of 9
_prop_object_externalize_footer---of 11
_prop_object_externalize_header---of 29
_prop_object_externalize_start_tag---of 10
_prop_object_fini---of 1
_prop_object_init100%of 1
_prop_object_internalize_by_tag---of 30
_prop_object_internalize_context_alloc32%of 19
_prop_object_internalize_context_free---of 1
_prop_object_internalize_decode_string---of 30
_prop_object_internalize_find_tag17%of 59
_prop_object_internalize_match---of 3
prop_object_equals---of 1
prop_object_equals_with_error---of 16
prop_object_iterator_next---of 1
prop_object_iterator_release---of 1
prop_object_iterator_reset---of 1
prop_object_release41%of 32
prop_object_retain67%of 3
prop_object_type67%of 3
-----------
SUMMARY29%of 129
mfs_done---of 3
mfs_init---of 3
mfs_modcmd---of 4
mfs_mount29%of 21
mfs_mountroot---of 4
mfs_reinit---of 1
mfs_start---of 16
mfs_statvfs---of 3
mfs_sysctl_setup---of 1
x86_curlwp100%of 1
-----------
SUMMARY32%of 22
ffs_acls---of 14
ffs_cgupdate---of 13
ffs_done---of 3
ffs_extattrctl---of 3
ffs_fhtovp---of 14
ffs_flushfiles---of 13
ffs_init---of 3
ffs_init_vnode60%of 5
ffs_loadvnode---of 10
ffs_modcmd---of 7
ffs_mount7%of 86
ffs_mountfs10%of 86
ffs_mountroot---of 5
ffs_newvnode34%of 48
ffs_oldfscompat_read---of 16
ffs_reinit---of 1
ffs_reload---of 48
ffs_sbupdate---of 12
ffs_snapshot_cb67%of 3
ffs_statvfs100%of 1
ffs_superblock_validate---of 26
ffs_sync15%of 27
ffs_sync_selector50%of 10
ffs_sysctl_setup---of 1
ffs_unmount---of 24
ffs_vfs_fsync---of 18
ffs_vptofh100%of 3
x86_curlwp100%of 1
-----------
SUMMARY19%of 270
child_return---of 5
fork154%of 94
sys___clone100%of 3
sys___vfork14---of 1
sys_fork100%of 1
sys_vfork100%of 1
x86_curlwp100%of 1
-----------
SUMMARY57%of 100
cpu_frequency---of 1
cpu_hascounter100%of 1
rdtsc_cpuid---of 1
rdtsc_lfence---of 1
rdtsc_mfence---of 1
tsc_apply_cpu---of 3
tsc_delay---of 4
tsc_get_timecount34%of 6
tsc_is_invariant---of 14
tsc_post_ap---of 6
tsc_read_bp---of 10
tsc_setfunc---of 4
tsc_sync_ap---of 4
tsc_sync_bp---of 4
tsc_sync_drift---of 3
tsc_tc_init---of 9
tsc_tc_reset---of 4
tsc_user_disable---of 1
tsc_user_enable---of 1
x86_curlwp100%of 1
-----------
SUMMARY50%of 8
_psref_held---of 15
psref_acquire50%of 12
psref_class_create---of 1
psref_class_destroy---of 3
psref_copy---of 13
psref_cpu_drained_p---of 3
psref_held---of 1
psref_init---of 1
psref_release55%of 22
psref_target_destroy---of 9
psref_target_init100%of 1
psreffed_p_xc---of 3
x86_curlwp100%of 1
-----------
SUMMARY56%of 36
-----------
SUMMARY---of 0
fill_lwp---of 13
sysctl_consdev---of 6
sysctl_debug_setup---of 6
sysctl_hw_cnmagic---of 5
sysctl_hw_misc_setup---of 1
sysctl_hw_usermem---of 4
sysctl_kern_boottime---of 1
sysctl_kern_cpid---of 13
sysctl_kern_cptime---of 14
sysctl_kern_defcorename---of 4
sysctl_kern_drivers---of 11
sysctl_kern_forkfsleep---of 4
sysctl_kern_hostid67%of 3
sysctl_kern_lwp---of 38
sysctl_kern_maxproc---of 4
sysctl_kern_maxptys---of 3
sysctl_kern_maxvnodes---of 7
sysctl_kern_messages---of 13
sysctl_kern_root_partition---of 1
sysctl_kern_rtc_offset---of 6
sysctl_kern_setup---of 1
sysctl_root_device---of 1
sysctl_security_setidcore---of 4
sysctl_security_setidcorename---of 5
x86_curlwp---of 1
-----------
SUMMARY67%of 3
filt_pipedetach---of 6
filt_piperead50%of 8
filt_pipewrite---of 8
pipe130%of 27
pipe_close100%of 1
pipe_ctor75%of 4
pipe_dtor100%of 3
pipe_fpathconf---of 3
pipe_init---of 5
pipe_ioctl54%of 13
pipe_kqfilter---of 5
pipe_poll60%of 15
pipe_posix_fadvise---of 1
pipe_read37%of 46
pipe_restart100%of 1
pipe_stat75%of 4
pipe_write46%of 64
pipeclose54%of 26
sysctl_kern_pipe_setup---of 1
x86_curlwp100%of 1
-----------
SUMMARY47%of 213
virtio_alloc_vq---of 16
virtio_attach_failed---of 5
virtio_child---of 1
virtio_child_attach_failed---of 4
virtio_child_attach_finish---of 27
virtio_child_attach_start---of 7
virtio_child_detach---of 4
virtio_dequeue---of 15
virtio_dequeue_commit---of 1
virtio_dmat100%of 1
virtio_enqueue62%of 21
virtio_enqueue_abort100%of 1
virtio_enqueue_commit58%of 21
virtio_enqueue_p60%of 22
virtio_enqueue_prep60%of 5
virtio_enqueue_reserve29%of 25
virtio_features---of 1
virtio_free_vq---of 7
virtio_init_vq---of 1
virtio_init_vq_vqdone---of 1
virtio_intrhand---of 1
virtio_modcmd---of 1
virtio_negotiate_features---of 3
virtio_postpone_intr---of 11
virtio_postpone_intr_far---of 5
virtio_postpone_intr_smart---of 5
virtio_print_device_type---of 3
virtio_read_device_config_1---of 1
virtio_read_device_config_2---of 3
virtio_read_device_config_4---of 3
virtio_read_device_config_8---of 5
virtio_read_device_config_le_2---of 3
virtio_read_device_config_le_4---of 3
virtio_reinit_end---of 1
virtio_reinit_start---of 9
virtio_reset---of 1
virtio_reset_vq---of 20
virtio_rw1660%of 5
virtio_rw32---of 5
virtio_rw6460%of 5
virtio_set_status---of 1
virtio_soft_intr---of 3
virtio_start_vq_intr---of 10
virtio_stop_vq_intr---of 6
virtio_vq_done---of 1
virtio_vq_intr---of 6
virtio_vq_is_enqueued---of 7
virtio_write_device_config_1---of 1
virtio_write_device_config_2---of 3
virtio_write_device_config_4---of 3
virtio_write_device_config_8---of 7
virtio_write_device_config_le_2---of 3
virtio_write_device_config_le_4---of 3
vq_alloc_slot60%of 15
vq_free_slot50%of 4
-----------
SUMMARY54%of 125
vioscsi_attach---of 25
vioscsi_detach---of 10
vioscsi_free_reqs---of 5
vioscsi_match---of 1
vioscsi_modcmd---of 1
vioscsi_scsipi_request35%of 26
vioscsi_vq_done---of 10
-----------
SUMMARY35%of 26
soo_close67%of 3
soo_fpathconf67%of 3
soo_ioctl56%of 45
soo_poll100%of 1
soo_posix_fadvise---of 1
soo_read100%of 1
soo_restart100%of 1
soo_stat58%of 7
soo_write100%of 1
x86_curlwp100%of 1
-----------
SUMMARY61%of 63
clockrnd_get---of 3
get_intr_timecount---of 1
getticks100%of 1
hardclock---of 16
initclocks---of 10
schedclock---of 3
startprofclock50%of 6
statclock---of 35
stopprofclock67%of 6
sysctl_kern_clockrate---of 1
x86_curlwp---of 1
-----------
SUMMARY62%of 13
in6_addrscope84%of 6
in6_clearscope67%of 6
in6_getscopename---of 9
in6_setscope86%of 27
in6_setzoneid---of 6
sa6_embedscope50%of 20
sa6_recoverscope64%of 11
scope6_addr2default---of 11
scope6_ifattach100%of 1
scope6_ifdetach---of 1
scope6_init---of 1
sockaddr_in6_externalize---of 3
-----------
SUMMARY71%of 71
-----------
SUMMARY---of 0
link_abort---of 3
link_accept---of 3
link_attach67%of 3
link_bind---of 3
link_connect---of 3
link_connect2---of 3
link_detach---of 3
link_disconnect---of 3
link_init---of 1
link_ioctl---of 51
link_listen---of 3
link_peeraddr---of 3
link_purgeif---of 1
link_rcvd---of 3
link_recvoob---of 3
link_send---of 3
link_sendoob---of 3
link_shutdown---of 3
link_sockaddr67%of 3
link_stat---of 3
sockaddr_dl_alloc---of 4
sockaddr_dl_cmp---of 24
sockaddr_dl_init50%of 6
sockaddr_dl_measure100%of 1
sockaddr_dl_setaddr---of 3
-----------
SUMMARY62%of 13
-----------
SUMMARY---of 0
stub_compat_70_unp_addsockcred---of 1
uipc_ctloutput30%of 30
uipc_init---of 5
unp_abort44%of 16
unp_accept43%of 21
unp_addsockcred---of 5
unp_attach63%of 27
unp_bind43%of 26
unp_connect56%of 38
unp_connect160%of 15
unp_connect272%of 7
unp_detach47%of 39
unp_discard_now50%of 6
unp_disconnect60%of 5
unp_disconnect167%of 12
unp_dispose69%of 19
unp_externalize62%of 21
unp_ioctl100%of 1
unp_listen58%of 7
unp_mark---of 6
unp_peeraddr---of 11
unp_rcvd62%of 18
unp_recvoob67%of 3
unp_resetlock67%of 6
unp_send60%of 92
unp_sendoob67%of 3
unp_setpeerlocks58%of 14
unp_shutdown73%of 11
unp_sockaddr56%of 9
unp_stat67%of 12
unp_thread---of 130
x86_curlwp100%of 1
-----------
SUMMARY56%of 459
layer_node_create50%of 4
layerfs_done---of 1
layerfs_init---of 1
-----------
SUMMARY50%of 4
copystr50%of 8
-----------
SUMMARY50%of 8
-----------
SUMMARY---of 0
compat_50_quota_modcmd---of 4
compat_50_sys_quotactl58%of 14
-----------
SUMMARY58%of 14
kauth_accmode_to_action100%of 1
kauth_authorize_action---of 4
kauth_authorize_action_internal54%of 15
kauth_authorize_device50%of 4
kauth_authorize_device_passthru---of 15
kauth_authorize_device_spec54%of 15
kauth_authorize_device_tty75%of 4
kauth_authorize_generic---of 4
kauth_authorize_machdep75%of 4
kauth_authorize_network75%of 4
kauth_authorize_process75%of 4
kauth_authorize_system75%of 4
kauth_authorize_vnode75%of 4
kauth_cred_alloc27%of 15
kauth_cred_clone100%of 1
kauth_cred_clone134%of 27
kauth_cred_copy---of 9
kauth_cred_dup43%of 7
kauth_cred_free32%of 22
kauth_cred_get100%of 1
kauth_cred_getdata---of 7
kauth_cred_getegid40%of 5
kauth_cred_geteuid40%of 5
kauth_cred_getgid40%of 5
kauth_cred_getgroups50%of 6
kauth_cred_getrefcnt---of 5
kauth_cred_getsvgid40%of 5
kauth_cred_getsvuid40%of 5
kauth_cred_getuid40%of 5
kauth_cred_group43%of 7
kauth_cred_groupmember50%of 14
kauth_cred_hold43%of 7
kauth_cred_ismember_gid64%of 11
kauth_cred_ngroups40%of 5
kauth_cred_setdata---of 7
kauth_cred_setegid43%of 7
kauth_cred_seteuid43%of 7
kauth_cred_setgid43%of 7
kauth_cred_setgroups64%of 11
kauth_cred_setsvgid43%of 7
kauth_cred_setsvuid43%of 7
kauth_cred_setuid43%of 7
kauth_cred_to_uucred---of 9
kauth_cred_topcred---of 7
kauth_cred_toucred---of 7
kauth_cred_uidmatch34%of 12
kauth_cred_uucmp---of 20
kauth_deregister_key---of 3
kauth_deregister_scope---of 7
kauth_extattr_action---of 1
kauth_init---of 1
kauth_listen_scope---of 7
kauth_proc_chroot30%of 17
kauth_proc_fork30%of 17
kauth_proc_setgroups53%of 17
kauth_register_key---of 5
kauth_register_scope---of 14
kauth_unlisten_scope---of 7
kauth_uucred_to_cred---of 7
x86_curlwp100%of 1
-----------
SUMMARY45%of 329
union_allocvp28%of 81
union_check_rmdir16%of 25
union_copyfile---of 6
union_copyup---of 11
union_dircache---of 8
union_dircache_r---of 8
union_diruncache---of 7
union_do_lookup---of 5
union_done---of 1
union_dowhiteout40%of 5
union_freevp100%of 1
union_init---of 1
union_loadvnode50%of 20
union_mkshadow---of 5
union_mkwhiteout---of 3
union_newsize67%of 12
union_newupper---of 19
union_readdirhook25%of 8
union_reinit---of 14
union_rele56%of 18
union_removed_upper31%of 13
union_vn_close---of 3
union_vn_create---of 5
-----------
SUMMARY35%of 183
pageflush_selector---of 6
puffs_modcmd---of 4
puffs_vfsop_done---of 1
puffs_vfsop_extattrctl---of 14
puffs_vfsop_fhtovp---of 11
puffs_vfsop_init---of 1
puffs_vfsop_loadvnode---of 5
puffs_vfsop_mount7%of 43
puffs_vfsop_root---of 5
puffs_vfsop_snapshot---of 1
puffs_vfsop_start---of 3
puffs_vfsop_statvfs---of 7
puffs_vfsop_sync---of 11
puffs_vfsop_unmount---of 27
puffs_vfsop_vptofh---of 17
x86_curlwp100%of 1
-----------
SUMMARY10%of 44
tmpfs_gro_directory_empty_p53%of 19
tmpfs_gro_genealogy49%of 45
tmpfs_gro_lock_directory56%of 9
tmpfs_gro_lookup53%of 19
tmpfs_gro_remove50%of 40
tmpfs_gro_remove_check_permitted50%of 26
tmpfs_gro_remove_check_possible52%of 25
tmpfs_gro_rename53%of 123
tmpfs_gro_rename_check_permitted52%of 52
tmpfs_gro_rename_check_possible54%of 50
tmpfs_rename100%of 1
tmpfs_rmdired_p56%of 9
tmpfs_sane_rename100%of 1
-----------
SUMMARY53%of 419
compat_30_vndioctl50%of 4
vnd_30_fini---of 3
vnd_30_init---of 1
-----------
SUMMARY50%of 4
chacha_core---of 1
chacha_md_init---of 5
chacha_modcmd---of 6
chacha_stream100%of 1
chacha_stream_xor---of 1
hchacha---of 1
sysctl_kern_crypto_chacha_selected---of 1
sysctl_kern_crypto_chacha_setup---of 1
xchacha_stream---of 1
xchacha_stream_xor---of 1
-----------
SUMMARY100%of 1
del_m6if---of 10
expire_upcalls---of 12
ip6_mdq---of 65
ip6_mforward---of 54
ip6_mrouter_detach---of 18
ip6_mrouter_done32%of 16
ip6_mrouter_get50%of 4
ip6_mrouter_set12%of 67
mrt6_ioctl19%of 11
pim6_init---of 1
pim6_input---of 19
socket_send---of 9
sysctl_net_inet6_pim6_stats---of 1
-----------
SUMMARY18%of 98
closef65%of 14
fbadop_close---of 1
fbadop_ioctl---of 1
fbadop_read100%of 1
fbadop_stat100%of 1
fbadop_write100%of 1
fd_abort56%of 18
fd_affix53%of 21
fd_alloc58%of 33
fd_allocfile50%of 14
fd_checkmaps59%of 24
fd_clone60%of 5
fd_close62%of 26
fd_closeexec---of 17
fd_copy52%of 89
fd_dup60%of 5
fd_dup250%of 16
fd_dupopen75%of 8
fd_free12%of 54
fd_getfile70%of 13
fd_getfile2---of 7
fd_getsock100%of 4
fd_getsock1100%of 4
fd_getvnode80%of 5
fd_hold100%of 1
fd_init---of 62
fd_putfile50%of 18
fd_set_exclose80%of 5
fd_share100%of 1
fd_sys_init---of 5
fd_tryexpand---of 34
fd_unused57%of 30
fd_used50%of 16
fgetdummy---of 1
fgetown100%of 1
file_ctor40%of 5
file_dtor38%of 8
filedesc_ctor100%of 1
filedesc_dtor---of 1
filedescopen100%of 1
fnullop_fcntl100%of 1
fnullop_kqfilter---of 1
fnullop_poll100%of 1
fnullop_restart100%of 1
fownsignal50%of 8
fputdummy---of 1
fsetown80%of 10
sysctl_file_marker_reset---of 14
sysctl_kern_file---of 34
sysctl_kern_file2---of 50
x86_curlwp100%of 1
-----------
SUMMARY53%of 465
filt_ptcrdetach---of 1
filt_ptcread---of 21
filt_ptcwdetach---of 1
filt_ptcwrite---of 12
ptcclose100%of 1
ptckqfilter---of 4
ptcopen50%of 4
ptcpoll27%of 26
ptcread14%of 23
ptcwakeup50%of 8
ptcwrite---of 32
ptsclose---of 1
ptsopen46%of 11
ptspoll67%of 3
ptsread---of 26
ptsstart---of 8
ptsstop---of 9
ptswrite---of 3
pty_check25%of 12
pty_isfree50%of 8
pty_maxptys---of 4
ptyattach---of 1
ptyioctl23%of 70
ptytty---of 1
x86_curlwp---of 1
-----------
SUMMARY29%of 166
cpu_initclocks---of 1
cpu_intr_p60%of 5
cpu_kpreempt_disabled100%of 1
cpu_kpreempt_enter80%of 5
cpu_kpreempt_exit80%of 5
cpu_need_proftick---of 5
cpu_need_resched37%of 11
cpu_signotify60%of 5
get_booted_kernel---of 5
init_x86_clusters---of 24
init_x86_msgbuf---of 11
init_x86_vm---of 41
intr_findpic---of 1
lookup_bootinfo---of 5
machdep_init---of 1
mm_md_physacc86%of 7
sysctl_machdep_booted_kernel---of 5
sysctl_machdep_bootmethod---of 1
sysctl_machdep_cpu_idle---of 1
sysctl_machdep_diskinfo---of 3
sysctl_machdep_hypervisor---of 3
sysctl_machdep_setup---of 1
sysctl_machdep_tsc_enable---of 5
x86_add_cluster---of 18
x86_cpu_idle_get---of 1
x86_cpu_idle_init---of 1
x86_cpu_idle_set---of 1
x86_cpu_is_lcall---of 5
x86_curlwp100%of 1
x86_listener_cb40%of 5
x86_parse_clusters---of 18
x86_reset---of 7
x86_rndseed---of 10
x86_select_freelist---of 9
x86_startup---of 1
-----------
SUMMARY63%of 45
_vstate_assert54%of 13
holdrele---of 1
holdrelel50%of 10
lru_iter_first50%of 6
lru_iter_next24%of 13
lru_iter_release37%of 11
lru_requeue70%of 23
vcache_alloc100%of 1
vcache_dealloc---of 3
vcache_free60%of 15
vcache_get41%of 47
vcache_make_anon38%of 35
vcache_new42%of 31
vcache_reclaim44%of 66
vcache_rekey_enter---of 39
vcache_rekey_exit---of 46
vcache_stats---of 10
vcache_tryvget50%of 6
vcache_vget32%of 16
vdead_check50%of 14
vdrain_one---of 24
vdrain_task---of 4
vfs_drainvnodes---of 17
vfs_vnode_sysinit---of 7
vgone45%of 9
vhold---of 1
vholdl56%of 9
vnalloc_marker100%of 1
vnfree_marker67%of 3
vnis_marker100%of 1
vput58%of 7
vrecycle47%of 13
vref60%of 5
vrefcnt67%of 3
vrele100%of 3
vrele_async---of 3
vrele_deferred---of 13
vrele_flush38%of 8
vrele_task---of 9
vrelel51%of 93
vrevoke40%of 15
vrevoke_suspend_next37%of 11
vshareilock100%of 1
vshareklist67%of 3
vstate_assert_change69%of 19
vstate_assert_wait_stable43%of 7
vtryrele63%of 8
vwakeup56%of 9
x86_curlwp100%of 1
-----------
SUMMARY49%of 536
vhci_activate---of 3
vhci_allocx100%of 1
vhci_attach---of 4
vhci_device_ctrl_abort---of 15
vhci_device_ctrl_cleartoggle---of 1
vhci_device_ctrl_close100%of 1
vhci_device_ctrl_done---of 1
vhci_device_ctrl_start60%of 32
vhci_device_ctrl_transfer100%of 1
vhci_fd_close---of 3
vhci_fd_ioctl29%of 14
vhci_fd_open50%of 4
vhci_fd_read13%of 24
vhci_fd_write---of 26
vhci_freex60%of 5
vhci_get_lock---of 1
vhci_match---of 1
vhci_open29%of 7
vhci_pkt_destroy---of 20
vhci_root_intr_abort---of 10
vhci_root_intr_cleartoggle---of 1
vhci_root_intr_close---of 5
vhci_root_intr_done58%of 7
vhci_root_intr_start43%of 7
vhci_root_intr_transfer---of 7
vhci_roothub_ctrl19%of 22
vhci_softintr---of 1
vhci_usb_detach---of 37
vhciattach---of 3
-----------
SUMMARY38%of 125
compat_ifconf56%of 43
uipc_syscalls_40_fini---of 3
uipc_syscalls_40_init---of 1
x86_curlwp100%of 1
-----------
SUMMARY57%of 44
-----------
SUMMARY---of 0
callout_ack---of 6
callout_active50%of 6
callout_destroy50%of 12
callout_expired---of 6
callout_halt55%of 22
callout_hardclock---of 14
callout_init60%of 5
callout_init_cpu---of 5
callout_invoking50%of 6
callout_pending50%of 6
callout_reset50%of 10
callout_schedule50%of 6
callout_schedule_locked54%of 13
callout_setfunc50%of 10
callout_softclock---of 34
callout_startup---of 5
callout_stop60%of 10
callout_wait---of 17
db_show_callout---of 9
db_show_callout_bucket---of 5
x86_curlwp---of 1
-----------
SUMMARY53%of 106
db_usb_xfer---of 3
db_usb_xferlist---of 5
filt_usbrdetach---of 1
filt_usbread---of 3
usb_activate---of 3
usb_add_event---of 14
usb_add_task---of 9
usb_async_intr---of 3
usb_attach---of 7
usb_childdet---of 7
usb_detach---of 11
usb_discover---of 17
usb_doattach---of 10
usb_event_thread---of 8
usb_get_next_event---of 9
usb_in_event_thread43%of 7
usb_match---of 1
usb_needs_explore67%of 3
usb_needs_reattach---of 3
usb_once_init---of 5
usb_rem_task---of 16
usb_rem_task_wait40%of 23
usb_schedsoftintr---of 4
usb_soft_intr---of 1
usb_task_pending---of 1
usb_task_thread---of 20
usbclose---of 3
usbctlprint---of 3
usbd_add_dev_event---of 1
usbd_add_drv_event---of 1
usbioctl---of 28
usbkqfilter---of 4
usbopen---of 6
usbpoll---of 4
usbread---of 15
x86_curlwp100%of 1
-----------
SUMMARY45%of 34
compat_50_sys___shmctl13100%of 6
-----------
SUMMARY100%of 6
coda_done---of 4
coda_fhtovp---of 15
coda_init---of 4
coda_loadvnode---of 3
coda_modcmd---of 4
coda_mount30%of 20
coda_nb_statvfs---of 8
coda_root---of 14
coda_start---of 4
coda_sync---of 4
coda_unmount---of 9
coda_vfsopstats_init---of 1
coda_vget---of 4
coda_vptofh---of 4
devtomp---of 3
getNewVnode---of 6
sysctl_vfs_coda_setup---of 1
x86_curlwp100%of 1
-----------
SUMMARY34%of 21
entpool_enter69%of 16
entpool_enter_nostir50%of 14
entpool_extract50%of 10
entpool_selftest---of 172
entpool_stir---of 6
-----------
SUMMARY58%of 40
-----------
SUMMARY---of 0
ufs_gro_directory_empty_p53%of 17
ufs_gro_genealogy47%of 60
ufs_gro_lock_directory54%of 13
ufs_gro_lookup53%of 17
ufs_gro_remove50%of 32
ufs_gro_remove_check_permitted53%of 21
ufs_gro_remove_check_possible53%of 21
ufs_gro_rename47%of 169
ufs_gro_rename_check_permitted56%of 43
ufs_gro_rename_check_possible56%of 43
ufs_rename100%of 1
ufs_rename_ulr_overlap_p62%of 13
ufs_sane_rename100%of 1
-----------
SUMMARY51%of 451
uvm_aio_aiodone60%of 15
uvm_aio_aiodone_pages25%of 52
uvm_pager_init---of 9
uvm_pager_realloc_emerg---of 8
uvm_pageratop---of 5
uvm_pagermapin42%of 17
uvm_pagermapout43%of 7
x86_curlwp100%of 1
-----------
SUMMARY36%of 92
VOP_ABORTOP75%of 8
VOP_ACCESS100%of 5
VOP_ACCESSX100%of 5
VOP_ACLCHECK---of 8
VOP_ADVLOCK100%of 5
VOP_BMAP75%of 8
VOP_BWRITE50%of 8
VOP_CLOSE75%of 8
VOP_CLOSEEXTATTR---of 5
VOP_CREATE64%of 11
VOP_DELETEEXTATTR---of 5
VOP_FALLOCATE---of 5
VOP_FCNTL100%of 5
VOP_FDISCARD---of 5
VOP_FSYNC100%of 5
VOP_GETACL---of 8
VOP_GETATTR100%of 5
VOP_GETEXTATTR---of 5
VOP_GETPAGES100%of 5
VOP_INACTIVE100%of 5
VOP_IOCTL100%of 5
VOP_ISLOCKED100%of 5
VOP_KQFILTER---of 8
VOP_LINK70%of 10
VOP_LISTEXTATTR---of 5
VOP_LOCK80%of 15
VOP_LOOKUP78%of 9
VOP_MKDIR64%of 11
VOP_MKNOD64%of 11
VOP_MMAP75%of 8
VOP_OPEN88%of 8
VOP_OPENEXTATTR---of 5
VOP_PARSEPATH100%of 5
VOP_PATHCONF100%of 5
VOP_POLL75%of 8
VOP_PRINT---of 8
VOP_PUTPAGES100%of 5
VOP_READ88%of 8
VOP_READDIR100%of 5
VOP_READLINK100%of 5
VOP_RECLAIM100%of 5
VOP_REMOVE70%of 13
VOP_RENAME75%of 8
VOP_REVOKE60%of 5
VOP_RMDIR70%of 13
VOP_SEEK75%of 8
VOP_SETACL---of 8
VOP_SETATTR47%of 15
VOP_SETEXTATTR---of 5
VOP_STRATEGY60%of 5
VOP_SYMLINK64%of 11
VOP_UNLOCK100%of 5
VOP_WHITEOUT100%of 5
VOP_WRITE64%of 11
-----------
SUMMARY79%of 310
nd_attach_domain---of 3
nd_nud_hint---of 11
nd_resolve23%of 27
nd_set_timer---of 30
nd_timer---of 54
-----------
SUMMARY23%of 27
at_pcbconnect6%of 37
at_pcbsetaddr---of 36
ddp_abort_wrapper---of 3
ddp_accept_wrapper---of 3
ddp_attach_wrapper84%of 6
ddp_bind_wrapper---of 5
ddp_connect2_wrapper67%of 3
ddp_connect_wrapper---of 10
ddp_detach56%of 9
ddp_detach_wrapper100%of 1
ddp_disconnect_wrapper---of 7
ddp_init---of 5
ddp_ioctl_wrapper100%of 1
ddp_listen_wrapper67%of 3
ddp_peeraddr_wrapper---of 3
ddp_purgeif_wrapper---of 1
ddp_rcvd_wrapper---of 3
ddp_recvoob_wrapper67%of 3
ddp_search---of 16
ddp_send_wrapper40%of 10
ddp_sendoob_wrapper---of 3
ddp_shutdown_wrapper67%of 3
ddp_sockaddr_wrapper---of 7
ddp_stat_wrapper67%of 3
sysctl_net_atalk_ddp_setup---of 1
sysctl_net_atalk_ddp_stats---of 1
-----------
SUMMARY36%of 79
genfs_insane_rename60%of 30
genfs_rename_cache_purge65%of 28
genfs_rename_exit60%of 30
genfs_rename_knote56%of 38
genfs_rename_lock47%of 98
genfs_sane_rename52%of 184
genfs_ufslike_remove_check_permitted50%of 20
genfs_ufslike_remove_check_possible100%of 1
genfs_ufslike_rename_check_permitted57%of 48
genfs_ufslike_rename_check_possible50%of 4
-----------
SUMMARY54%of 481
uvm_obj_clean_p67%of 3
uvm_obj_destroy80%of 5
uvm_obj_init100%of 3
uvm_obj_nowriteback_p67%of 3
uvm_obj_page_clear_dirty60%of 5
uvm_obj_page_clear_writeback60%of 5
uvm_obj_page_dirty_p60%of 5
uvm_obj_page_set_dirty60%of 5
uvm_obj_page_set_writeback60%of 5
uvm_obj_page_writeback_p60%of 5
uvm_obj_setlock80%of 5
uvm_obj_unwirepages---of 8
uvm_obj_wirepages46%of 24
uvm_object_printit---of 10
-----------
SUMMARY61%of 73
-----------
SUMMARY---of 0
_rnd_add_uint32---of 1
_rnd_add_uint64---of 1
attach_seed_rndsource---of 9
entropy_account_cpu12%of 45
entropy_bootrequest---of 9
entropy_consolidate50%of 6
entropy_consolidate_xc---of 25
entropy_cpu_put58%of 7
entropy_enter53%of 17
entropy_enter_early---of 5
entropy_epoch100%of 1
entropy_extract24%of 34
entropy_fini_cpu---of 1
entropy_init_cpu---of 1
entropy_ioctl30%of 70
entropy_kqfilter---of 6
entropy_notify---of 15
entropy_pending_cpu---of 5
entropy_poll37%of 11
entropy_ready---of 1
entropy_request---of 17
entropy_reset_xc---of 6
entropy_softintr---of 5
entropy_thread---of 14
filt_entropy_read_detach---of 3
filt_entropy_read_event---of 11
rnd_add_data56%of 9
rnd_add_data_135%of 32
rnd_add_data_internal50%of 14
rnd_add_data_intr---of 1
rnd_add_data_sync---of 1
rnd_add_uint32100%of 1
rnd_attach_source---of 29
rnd_detach_source---of 16
rnd_init---of 15
rnd_init_softint---of 6
rnd_lock_sources---of 15
rnd_seed---of 14
rnd_system_ioctl100%of 1
rnd_unlock_sources---of 10
rndsource_entropybits_cpu---of 3
rndsource_setcb---of 1
rndsource_to_user---of 11
rndsource_to_user_est---of 5
rndsource_to_user_est_cpu---of 5
sysctl_entropy_consolidate---of 3
sysctl_entropy_gather---of 3
x86_curlwp100%of 1
-----------
SUMMARY33%of 249
clockctl_listener_cb100%of 1
clockctl_modcmd---of 5
clockctlattach---of 1
clockctlclose100%of 1
clockctlioctl67%of 15
clockctlopen100%of 1
-----------
SUMMARY73%of 18
pckbport_attach---of 3
pckbport_attach_slot---of 5
pckbport_cleanqueue---of 11
pckbport_cleanup---of 7
pckbport_cmdresponse---of 23
pckbport_cnattach---of 3
pckbport_enqueue_cmd29%of 28
pckbport_flush---of 1
pckbport_poll_cmd---of 4
pckbport_poll_cmd1---of 28
pckbport_poll_data---of 5
pckbport_set_inputhandler---of 3
pckbport_set_poll---of 1
pckbport_slot_enable100%of 1
pckbport_start17%of 18
pckbport_xt_translation---of 1
pckbportintr---of 7
pckbportprint---of 3
-----------
SUMMARY26%of 47
critpollhook_disestablish---of 1
critpollhook_establish---of 7
docritpollhooks50%of 4
doexechooks---of 8
doexithooks---of 6
doforkhooks50%of 6
domountroothook---of 5
dopowerhooks---of 17
doshutdownhooks---of 9
exechook_disestablish---of 1
exechook_establish---of 1
exithook_disestablish---of 1
exithook_establish100%of 1
forkhook_disestablish---of 1
forkhook_establish---of 1
hook_disestablish---of 12
hook_establish46%of 11
hook_init---of 1
mountroothook_destroy---of 9
mountroothook_disestablish---of 1
mountroothook_establish---of 7
powerhook_disestablish---of 10
powerhook_establish---of 5
shutdownhook_disestablish---of 1
shutdownhook_establish---of 7
simplehook_create100%of 1
simplehook_destroy---of 11
simplehook_disestablish---of 16
simplehook_dohooks---of 17
simplehook_establish---of 4
simplehook_has_hooks---of 1
x86_curlwp---of 1
-----------
SUMMARY53%of 23
tmpfs_alloc_dirent50%of 4
tmpfs_chflags67%of 9
tmpfs_chmod56%of 9
tmpfs_chown58%of 14
tmpfs_chsize57%of 16
tmpfs_chtimes72%of 14
tmpfs_construct_node40%of 50
tmpfs_dir_attach52%of 27
tmpfs_dir_cached78%of 9
tmpfs_dir_detach53%of 38
tmpfs_dir_getdents49%of 29
tmpfs_dir_getdotents75%of 8
tmpfs_dir_getseq32%of 19
tmpfs_dir_lookup64%of 22
tmpfs_dir_lookupbyseq37%of 22
tmpfs_free_dirent60%of 5
tmpfs_free_node56%of 27
tmpfs_init_vnode69%of 16
tmpfs_loadvnode---of 5
tmpfs_newvnode53%of 44
tmpfs_reg_resize58%of 19
tmpfs_update72%of 7
tmpfs_update_lazily72%of 7
tmpfs_update_locked84%of 12
-----------
SUMMARY55%of 427
-----------
SUMMARY---of 0
-----------
SUMMARY---of 0
compat_43_sys_getrlimit67%of 3
compat_43_sys_setrlimit100%of 3
kern_resource_43_fini---of 1
kern_resource_43_init---of 1
-----------
SUMMARY84%of 6
iskmemdev---of 4
iskmemvp100%of 4
rawio_listener_cb100%of 1
spec_advlock100%of 1
spec_bmap---of 7
spec_close60%of 42
spec_fdiscard---of 7
spec_fsync60%of 5
spec_inactive67%of 3
spec_init---of 1
spec_io_drain38%of 8
spec_io_enter34%of 9
spec_io_exit64%of 11
spec_ioctl50%of 6
spec_kqfilter---of 4
spec_lookup---of 1
spec_mmap60%of 5
spec_node_destroy50%of 14
spec_node_getmountedfs67%of 3
spec_node_init74%of 15
spec_node_lookup_by_dev56%of 18
spec_node_lookup_by_mount---of 11
spec_node_revoke53%of 23
spec_node_setmountedfs45%of 9
spec_open54%of 64
spec_pathconf89%of 9
spec_poll75%of 4
spec_print---of 1
spec_read39%of 26
spec_reclaim60%of 5
spec_strategy58%of 14
spec_write46%of 24
x86_curlwp100%of 1
-----------
SUMMARY56%of 324
x86_curlwp---of 1
x86_dbregs_abandon100%of 1
x86_dbregs_clear---of 6
x86_dbregs_init---of 1
x86_dbregs_read67%of 3
x86_dbregs_restore---of 4
x86_dbregs_save---of 4
x86_dbregs_store_dr6---of 5
x86_dbregs_switch37%of 11
x86_dbregs_user_trap---of 15
x86_dbregs_validate25%of 8
x86_dbregs_write---of 3
-----------
SUMMARY40%of 23
ptyfs_access67%of 3
ptyfs_advlock67%of 3
ptyfs_close31%of 13
ptyfs_getattr36%of 14
ptyfs_inactive67%of 3
ptyfs_ioctl67%of 3
ptyfs_itimes56%of 9
ptyfs_kqfilter---of 3
ptyfs_lookup36%of 14
ptyfs_open40%of 5
ptyfs_pathconf89%of 9
ptyfs_poll67%of 3
ptyfs_print---of 1
ptyfs_read17%of 12
ptyfs_readdir31%of 26
ptyfs_reclaim100%of 1
ptyfs_setattr49%of 47
ptyfs_write---of 11
x86_curlwp---of 1
-----------
SUMMARY45%of 165
_prop_stack_init100%of 1
_prop_stack_pop30%of 20
_prop_stack_push43%of 7
-----------
SUMMARY36%of 28
-----------
SUMMARY---of 0
compat_40_sys_mount100%of 1
vfs_syscalls_40_fini---of 1
vfs_syscalls_40_init---of 1
-----------
SUMMARY100%of 1
chglwpcnt67%of 3
chgproccnt67%of 3
chgsbsize100%of 3
chgsemcnt67%of 3
sysctl_kern_uidinfo_cnt---of 7
uid_find20%of 10
uid_init---of 10
uid_stats---of 10
-----------
SUMMARY50%of 22
turnstile_block61%of 56
turnstile_changepri---of 1
turnstile_ctor100%of 1
turnstile_exit100%of 1
turnstile_init---of 3
turnstile_lookup80%of 5
turnstile_print---of 10
turnstile_remove50%of 12
turnstile_wakeup56%of 36
x86_curlwp100%of 1
-----------
SUMMARY60%of 112
adjtime178%of 9
clock_getres184%of 6
clock_settime1100%of 3
dogetitimer100%of 5
dosetitimer82%of 32
dotimer_gettime---of 4
dotimer_settime---of 28
itimer_arm_real67%of 9
itimer_callout---of 19
itimer_decr---of 17
itimer_fini---of 5
itimer_gettime50%of 24
itimer_init50%of 14
itimer_lock---of 1
itimer_lock_held---of 1
itimer_poison---of 11
itimer_settime29%of 46
itimer_unlock---of 1
nanosleep136%of 28
ptimer_fire---of 7
ptimer_free---of 14
ptimer_intr---of 16
ptimer_tick---of 10
ptimers_alloc67%of 3
ptimers_free---of 35
settime---of 1
settime140%of 15
settimeofday189%of 9
sys___adjtime50---of 7
sys___clock_getres5075%of 8
sys___clock_gettime50100%of 3
sys___clock_settime50100%of 4
sys___getitimer50100%of 5
sys___gettimeofday50---of 6
sys___nanosleep50100%of 4
sys___setitimer50100%of 8
sys___settimeofday50---of 7
sys___timer_gettime50---of 4
sys___timer_settime50---of 4
sys_clock_getcpuclockid2---of 6
sys_clock_nanosleep100%of 5
sys_timer_create---of 1
sys_timer_delete---of 13
sys_timer_getoverrun---of 4
time_init---of 1
time_wraps---of 4
timer_create1---of 49
-----------
SUMMARY61%of 240
ah6_ctlinput_wrapper100%of 1
encap6_ctlinput_wrapper100%of 1
esp6_ctlinput_wrapper100%of 1
icmp6_ctloutput_wrapper100%of 1
in6_dom_init---of 1
rip6_ctlinput_wrapper100%of 1
rip6_ctloutput_wrapper100%of 1
tcp6_ctlinput_wrapper100%of 1
tcp6_init---of 1
tcp_ctloutput_wrapper100%of 1
udp6_ctlinput_wrapper100%of 1
udp6_ctloutput_wrapper100%of 1
-----------
SUMMARY100%of 10
ah4_ctlinput38%of 8
ah6_ctlinput30%of 10
esp4_ctlinput38%of 8
esp6_ctlinput30%of 10
sysctl_ipsec---of 11
sysctl_net_inet6_ipsec6_setup---of 1
sysctl_net_inet_ah_stats---of 1
sysctl_net_inet_esp_stats---of 1
sysctl_net_inet_ipcomp_stats---of 1
sysctl_net_inet_ipip_stats---of 1
sysctl_net_inet_ipsec_setup---of 3
sysctl_net_inet_ipsec_stats---of 1
sysctl_net_ipsec_enabled---of 7
-----------
SUMMARY34%of 36
kmem_alloc50%of 8
kmem_asprintf---of 3
kmem_create_caches---of 17
kmem_free60%of 5
kmem_init---of 1
kmem_intr_alloc50%of 20
kmem_intr_free53%of 17
kmem_intr_zalloc67%of 3
kmem_roundup_size100%of 1
kmem_strdupsize50%of 4
kmem_strfree67%of 3
kmem_strndup---of 7
kmem_tmpbuf_alloc---of 3
kmem_tmpbuf_free---of 3
kmem_zalloc50%of 8
-----------
SUMMARY54%of 69
-----------
SUMMARY---of 0
roothub_ctrl_abort---of 8
roothub_ctrl_close---of 1
roothub_ctrl_done100%of 1
roothub_ctrl_start18%of 76
roothub_ctrl_transfer100%of 1
roothub_noop---of 1
usb_makelangtbl---of 5
usb_makestrdesc---of 6
-----------
SUMMARY20%of 78
bufq_priocscan_cancel---of 13
bufq_priocscan_fini---of 3
bufq_priocscan_get32%of 57
bufq_priocscan_init---of 1
bufq_priocscan_modcmd---of 4
bufq_priocscan_put67%of 3
cscan_tree_compare_key38%of 8
cscan_tree_compare_nodes20%of 21
-----------
SUMMARY31%of 89
in6pcb_bind43%of 35
in6pcb_bind_port50%of 36
in6pcb_connect48%of 72
in6pcb_disconnect60%of 5
in6pcb_fetch_peeraddr67%of 3
in6pcb_fetch_sockaddr67%of 3
in6pcb_init---of 1
in6pcb_lookup24%of 34
in6pcb_lookup_bound---of 57
in6pcb_lookup_local30%of 92
in6pcb_notify---of 62
in6pcb_purgeif---of 9
in6pcb_purgeif0---of 28
in6pcb_rtchange---of 3
in6pcb_rtentry36%of 25
in6pcb_rtentry_unref---of 1
in6pcb_set_state53%of 19
x86_curlwp100%of 1
-----------
SUMMARY40%of 325
getcwd_common71%of 58
proc_isunder67%of 3
sys___getcwd100%of 6
vn_isunder100%of 1
vnode_to_path45%of 9
-----------
SUMMARY71%of 77
cpu_attach---of 51
cpu_boot_secondary_processors---of 13
cpu_broadcast_halt---of 1
cpu_childdetached---of 9
cpu_debug_dump---of 6
cpu_defer---of 1
cpu_get_tsc_freq---of 12
cpu_hatch---of 13
cpu_init---of 14
cpu_init_first---of 3
cpu_init_idle_lwps---of 8
cpu_init_msrs---of 5
cpu_kick100%of 1
cpu_load_pmap67%of 3
cpu_match---of 1
cpu_offline_md---of 1
cpu_rescan---of 11
cpu_resume---of 7
cpu_shutdown---of 4
cpu_stop---of 8
cpu_suspend---of 3
mp_cpu_start---of 9
mp_cpu_start_cleanup---of 1
x86_cpu_idle_halt---of 5
x86_cpu_idle_mwait---of 5
x86_curlwp---of 1
-----------
SUMMARY75%of 4
mc146818_read100%of 1
mc146818_write---of 1
rtc_get_ymdhms---of 13
rtc_register---of 1
rtc_set_ymdhms58%of 7
rtcget50%of 4
rtcinit67%of 3
-----------
SUMMARY60%of 15
tmpfs_bytes_max100%of 1
tmpfs_dirent_get67%of 3
tmpfs_dirent_put67%of 3
tmpfs_mem_decr67%of 3
tmpfs_mem_incr67%of 3
tmpfs_mem_info100%of 1
tmpfs_mntmem_destroy67%of 3
tmpfs_mntmem_init100%of 1
tmpfs_mntmem_set67%of 3
tmpfs_node_get50%of 4
tmpfs_node_put67%of 3
tmpfs_pages_avail100%of 1
tmpfs_strname_alloc60%of 5
tmpfs_strname_free60%of 5
tmpfs_strname_neqlen100%of 3
-----------
SUMMARY70%of 42
-----------
SUMMARY---of 0
-----------
SUMMARY---of 0
usb_desc_iter_init---of 1
usb_desc_iter_next---of 6
usb_desc_iter_next_interface---of 20
usb_desc_iter_next_non_interface---of 7
usb_desc_iter_peek---of 6
usb_detach_waitold---of 3
usb_detach_wakeupold---of 1
usb_find_desc---of 12
usb_find_desc_if---of 29
usbd_bulk_transfer---of 3
usbd_clear_endpoint_feature---of 1
usbd_clear_hub_feature---of 1
usbd_clear_port_feature100%of 1
usbd_get_bos_desc---of 5
usbd_get_config---of 1
usbd_get_config_desc---of 5
usbd_get_config_desc_full---of 3
usbd_get_desc---of 4
usbd_get_device_desc100%of 1
usbd_get_device_status---of 1
usbd_get_hid_descriptor---of 8
usbd_get_hub_status---of 1
usbd_get_initial_ddesc50%of 4
usbd_get_port_status100%of 1
usbd_get_port_status_ext---of 1
usbd_get_protocol---of 3
usbd_get_report---of 3
usbd_get_report_descriptor---of 1
usbd_get_string_desc---of 6
usbd_intr_transfer---of 3
usbd_read_report_desc---of 11
usbd_set_address100%of 1
usbd_set_config---of 1
usbd_set_hub_feature---of 1
usbd_set_idle---of 3
usbd_set_port_feature---of 1
usbd_set_port_u1_timeout---of 1
usbd_set_port_u2_timeout---of 1
usbd_set_protocol---of 3
usbd_set_report---of 3
-----------
SUMMARY75%of 8
sysctl_net_inet6_udp6_stats---of 1
udp6_abort_wrapper---of 5
udp6_accept_wrapper---of 3
udp6_attach_wrapper58%of 7
udp6_bind_wrapper60%of 5
udp6_connect2_wrapper67%of 3
udp6_connect_wrapper46%of 11
udp6_ctlinput22%of 14
udp6_ctloutput34%of 12
udp6_detach_wrapper60%of 5
udp6_disconnect_wrapper60%of 10
udp6_init---of 1
udp6_input---of 14
udp6_input_checksum---of 9
udp6_ioctl_wrapper100%of 1
udp6_listen_wrapper67%of 3
udp6_notify---of 13
udp6_output47%of 86
udp6_peeraddr_wrapper58%of 7
udp6_purgeif_wrapper---of 1
udp6_rcvd_wrapper---of 3
udp6_realinput---of 58
udp6_recvoob_wrapper67%of 3
udp6_send_wrapper58%of 7
udp6_sendoob_wrapper67%of 3
udp6_sendup---of 23
udp6_shutdown_wrapper100%of 1
udp6_sockaddr_wrapper58%of 7
udp6_stat_wrapper67%of 3
udp6_statinc---of 3
x86_curlwp100%of 1
-----------
SUMMARY50%of 189
arp_dad_start---of 19
arp_dad_stop---of 8
arp_dad_stoptimer---of 9
arp_dad_timer---of 24
arp_drain---of 1
arp_drainstub---of 1
arp_fasttimo---of 3
arp_free---of 7
arp_ifinit---of 7
arp_init---of 3
arp_llinfo_holdsrc---of 5
arp_llinfo_missed---of 6
arp_llinfo_output---of 20
arp_llinfo_reachable---of 1
arp_llinfo_retrans---of 1
arp_nud_enabled---of 1
arp_nud_hint---of 4
arp_rtrequest---of 55
arp_stat_add---of 1
arpannounce---of 4
arpintr---of 21
arpioctl---of 1
arplookup---of 4
arprequest---of 15
arpresolve32%of 19
in_arpinput---of 153
revarpinput---of 26
revarpwhoarewe---of 10
sysctl_net_inet_arp_stats---of 1
x86_curlwp---of 1
-----------
SUMMARY32%of 19
chkdq60%of 5
chkiq60%of 5
dqdone---of 1
dqflush---of 8
dqget16%of 46
dqinit---of 1
dqref---of 5
dqreinit---of 14
dqrele17%of 18
getinoquota40%of 15
qsync---of 4
quota_handle_cmd10%of 53
ufsquota_free100%of 1
ufsquota_init100%of 1
-----------
SUMMARY21%of 144
_fstrans_start69%of 19
cow_change_enter---of 14
fscow_disestablish---of 17
fscow_establish---of 10
fscow_run39%of 26
fstrans_alloc_lwp_info64%of 25
fstrans_done60%of 32
fstrans_dump---of 51
fstrans_getstate---of 11
fstrans_held55%of 11
fstrans_init---of 3
fstrans_is_owner56%of 9
fstrans_lwp_dtor50%of 16
fstrans_lwp_pcc50%of 4
fstrans_lwp_pcd34%of 6
fstrans_mount100%of 1
fstrans_mount_dtor62%of 18
fstrans_mount_get59%of 12
fstrans_setstate44%of 32
fstrans_start67%of 3
fstrans_start_lazy---of 3
fstrans_start_nowait100%of 1
fstrans_unmount67%of 6
vfs_resume72%of 7
vfs_suspend50%of 16
x86_curlwp100%of 1
-----------
SUMMARY56%of 245
coredump13%of 40
coredump_modcmd---of 17
coredump_offset---of 1
coredump_write---of 5
-----------
SUMMARY13%of 40
allocbuf49%of 49
bawrite60%of 5
bbusy54%of 15
bdwrite38%of 29
binvalbuf---of 5
bio_doread64%of 11
biodone50%of 6
biodone264%of 11
biohist_init---of 1
biointr---of 12
biowait60%of 10
bread67%of 6
breadn64%of 11
brelse100%of 1
brelsel36%of 78
bremfree40%of 20
buf_destroy---of 1
buf_drain---of 7
buf_init---of 1
buf_memcalc---of 7
buf_nbuf---of 1
buf_setvalimit---of 3
buf_trim---of 11
bufhash_stats---of 10
bufinit---of 12
bufinit2---of 3
bufpool_page_alloc100%of 1
bufpool_page_free---of 1
bwrite66%of 23
getblk32%of 29
geteblk---of 7
getiobuf75%of 4
getnewbuf9%of 58
incore73%of 11
nestiobuf_done56%of 9
nestiobuf_iodone---of 9
nestiobuf_setup60%of 5
putiobuf100%of 1
sysctl_bufvm_update---of 25
sysctl_dobuf---of 20
vfs_bufstats---of 40
vn_bwrite100%of 1
x86_curlwp100%of 1
-----------
SUMMARY43%of 395
tmpfs_done---of 1
tmpfs_fhtovp---of 14
tmpfs_init---of 1
tmpfs_modcmd50%of 4
tmpfs_mount54%of 28
tmpfs_root50%of 6
tmpfs_snapshot---of 1
tmpfs_start100%of 1
tmpfs_statvfs67%of 3
tmpfs_sync100%of 1
tmpfs_unmount71%of 17
tmpfs_vget---of 1
tmpfs_vptofh75%of 4
x86_curlwp100%of 1
-----------
SUMMARY62%of 65
ether_add_vlantag---of 23
ether_addmulti60%of 15
ether_aton_r---of 13
ether_bpf_mtap---of 11
ether_crc32_be---of 4
ether_crc32_le---of 7
ether_del_vlantag---of 17
ether_delmulti47%of 13
ether_disable_vlan_mtu---of 6
ether_enable_vlan_mtu---of 5
ether_ifattach---of 10
ether_ifdetach---of 15
ether_ifdetachhook_disestablish---of 3
ether_ifdetachhook_establish---of 3
ether_inject_vlantag---of 16
ether_input---of 90
ether_ioctl13%of 31
ether_ioctl_reinit---of 9
ether_multiaddr60%of 10
ether_multicast_sysctl---of 17
ether_output22%of 69
ether_set_ifflags_cb---of 1
ether_set_vlan_cb---of 1
ether_snprintf---of 7
ether_sprintf---of 1
ether_strip_vlantag---of 11
etherinit---of 1
x86_curlwp---of 1
-----------
SUMMARY29%of 138
-----------
SUMMARY---of 0
-----------
SUMMARY---of 0
exec_aout_makecmds50%of 8
exec_aout_modcmd---of 4
exec_aout_prep_nmagic---of 3
exec_aout_prep_omagic---of 3
exec_aout_prep_zmagic---of 4
-----------
SUMMARY50%of 8
sco_attach_pcb55%of 11
sco_bind_pcb---of 4
sco_connect_pcb---of 19
sco_detach_pcb50%of 10
sco_disconnect_pcb---of 4
sco_getopt---of 5
sco_listen_pcb67%of 3
sco_peeraddr_pcb---of 1
sco_send_pcb---of 13
sco_setopt---of 1
sco_sockaddr_pcb100%of 1
-----------
SUMMARY57%of 25
sleepq_abort---of 3
sleepq_block56%of 27
sleepq_changepri67%of 3
sleepq_enqueue62%of 13
sleepq_enter86%of 7
sleepq_init100%of 1
sleepq_insert43%of 21
sleepq_lendpri67%of 3
sleepq_reinsert42%of 12
sleepq_remove48%of 17
sleepq_timeout---of 4
sleepq_transfer---of 10
sleepq_uncatch---of 1
sleepq_unsleep58%of 7
sleepq_wake54%of 13
sleeptab_init---of 5
x86_curlwp100%of 1
-----------
SUMMARY55%of 125
devmon_insert34%of 12
drvctl_close22%of 14
drvctl_command_get_properties---of 7
drvctl_fini---of 1
drvctl_init---of 1
drvctl_ioctl29%of 81
drvctl_modcmd---of 7
drvctl_poll67%of 3
drvctl_read---of 1
drvctl_stat100%of 1
drvctl_write100%of 1
drvctlattach---of 1
drvctlopen100%of 3
x86_curlwp100%of 1
-----------
SUMMARY33%of 116
runq_init---of 1
sched_bestcpu60%of 30
sched_catchlwp---of 29
sched_cpuattach---of 8
sched_curcpu_runnable_p---of 1
sched_dequeue60%of 32
sched_enqueue43%of 28
sched_idle---of 44
sched_lwp_stats---of 7
sched_nextlwp56%of 9
sched_preempted35%of 23
sched_print_runqueue---of 14
sched_resched_cpu67%of 18
sched_resched_lwp60%of 5
sched_takecpu44%of 50
sched_vforkexec---of 6
sysctl_sched_setup---of 3
x86_curlwp100%of 1
-----------
SUMMARY52%of 196
_x86_memio_map---of 4
_x86_memio_unmap---of 9
bus_space_alloc---of 15
bus_space_barrier29%of 7
bus_space_copy_region_1---of 21
bus_space_copy_region_2---of 21
bus_space_copy_region_4---of 21
bus_space_free---of 9
bus_space_is_equal---of 3
bus_space_map---of 15
bus_space_mmap67%of 3
bus_space_release---of 10
bus_space_reservation_map---of 8
bus_space_reservation_unmap---of 5
bus_space_reservation_unmap1---of 8
bus_space_reserve---of 11
bus_space_reserve_subregion---of 12
bus_space_set_multi_1---of 11
bus_space_set_multi_2---of 11
bus_space_set_multi_4---of 11
bus_space_set_region_1---of 11
bus_space_set_region_2---of 11
bus_space_set_region_4---of 11
bus_space_subregion---of 1
bus_space_tag_create---of 15
bus_space_tag_destroy---of 1
bus_space_unmap---of 5
bus_space_vaddr---of 1
x86_bus_space_init---of 1
x86_bus_space_mallocok---of 1
x86_mem_add_mapping---of 6
-----------
SUMMARY40%of 10
SHA224_256_Final56%of 9
SHA224_Final---of 1
SHA224_Init---of 3
SHA224_Transform---of 1
SHA224_Update---of 1
SHA256_Final100%of 1
SHA256_Init67%of 3
SHA256_Transform100%of 4
SHA256_Update47%of 13
SHA384_Final---of 3
SHA384_Init---of 3
SHA384_Transform---of 1
SHA384_Update---of 1
SHA512_Final---of 3
SHA512_Init---of 3
SHA512_Last---of 5
SHA512_Transform---of 4
SHA512_Update---of 20
-----------
SUMMARY60%of 30
softint_block---of 3
softint_disestablish---of 14
softint_dispatch---of 48
softint_establish48%of 19
softint_init---of 10
softint_init_isr---of 3
softint_schedule54%of 13
softint_schedule_cpu63%of 8
x86_curlwp---of 1
-----------
SUMMARY53%of 40
htable_foreach_lle72%of 7
htable_free_tbl---of 1
htable_link_entry---of 7
htable_prefix_free43%of 7
htable_prefix_free_cb40%of 5
htable_unlink_entry---of 11
lla_rt_output---of 53
llentry_alloc---of 9
llentry_free---of 18
llentry_pool_get---of 3
llentry_pool_put---of 1
lltable_allocate_htbl60%of 5
lltable_drain---of 11
lltable_drop_entry_queue---of 10
lltable_dump_entry---of 15
lltable_fill_sa_entry---of 1
lltable_foreach_lle---of 1
lltable_free---of 6
lltable_free_cb---of 4
lltable_free_entry---of 1
lltable_get_af---of 1
lltable_get_ifp---of 1
lltable_link100%of 1
lltable_link_entry---of 1
lltable_prefix_free84%of 6
lltable_purge_entries45%of 9
lltable_sysctl_dump---of 10
lltable_unlink_entry---of 1
lltableinit---of 1
x86_curlwp---of 1
-----------
SUMMARY58%of 40
cpu_lookup40%of 10
cpu_setintr---of 15
cpu_setstate---of 16
cpu_ucode_load---of 8
cpu_xc_intr---of 1
cpu_xc_nointr---of 1
cpu_xc_offline---of 17
cpu_xc_online---of 1
cpuctl_ioctl---of 17
cpuctlattach---of 3
mi_cpu_attach---of 9
x86_curlwp---of 1
-----------
SUMMARY40%of 10
addrulwp80%of 5
calcru67%of 21
donice60%of 10
dosetrlimit91%of 32
getrusage175%of 8
lim_addref100%of 1
lim_copy34%of 9
lim_free84%of 6
lim_privatise38%of 8
lim_setcorename---of 3
pstatscopy100%of 1
pstatsfree100%of 1
resource_init---of 1
resource_listener_cb60%of 10
ruadd72%of 7
rulwps---of 6
ruspace---of 1
sys___getrusage50100%of 3
sys_getpriority79%of 23
sys_getrlimit100%of 3
sys_setpriority79%of 23
sys_setrlimit100%of 3
sysctl_proc_corename---of 22
sysctl_proc_paxflags---of 10
sysctl_proc_plimit---of 19
sysctl_proc_stop---of 13
x86_curlwp100%of 1
-----------
SUMMARY75%of 175
devhandle_compare---of 12
devhandle_impl_inherit---of 1
devhandle_invalid---of 1
devhandle_is_valid---of 3
devhandle_lookup_device_call---of 7
devhandle_type---of 3
device_activation---of 5
device_attached_to_iattr---of 4
device_call_generic---of 7
device_cfattach---of 1
device_cfdata100%of 1
device_cfdriver100%of 1
device_class---of 1
device_enumerate_children---of 7
device_handle---of 1
device_has_power---of 1
device_is_a50%of 4
device_is_active100%of 1
device_is_enabled---of 1
device_locator---of 3
device_parent100%of 1
device_private100%of 3
device_properties100%of 1
device_set_handle---of 1
device_set_private---of 5
device_unit100%of 1
device_xname100%of 1
-----------
SUMMARY86%of 14
filt_ttyrdetach---of 1
filt_ttyread---of 12
filt_ttywdetach---of 1
filt_ttywrite---of 8
nullmodem---of 7
sysctl_kern_tty_qsize---of 3
tputchar75%of 12
ttioctl42%of 185
ttpoll45%of 20
ttread13%of 85
ttrstrt---of 4
ttsetwater67%of 3
ttspeedtab60%of 5
ttstart---of 3
ttwakeup50%of 8
ttwrite32%of 60
tty_acquire67%of 3
tty_alloc---of 1
tty_attach---of 3
tty_detach---of 8
tty_free---of 8
tty_get_qsize50%of 4
tty_getctrlchar67%of 3
tty_init---of 3
tty_listener_cb100%of 4
tty_release37%of 11
tty_set_qsize---of 4
tty_setctrlchar67%of 3
tty_try_xonxoff---of 8
tty_unit---of 1
ttycancel---of 1
ttychars100%of 1
ttycheckoutq---of 6
ttyclose50%of 6
ttyecho62%of 13
ttyflush55%of 11
ttygetinfo19%of 61
ttyinput67%of 3
ttyinput_wlock20%of 140
ttykqfilter---of 5
ttylclose40%of 5
ttylock---of 1
ttylocked---of 1
ttylopen80%of 5
ttymodem40%of 10
ttyopen15%of 14
ttyoutput33%of 28
ttypause---of 6
ttypend50%of 6
ttyprintf_nolock100%of 1
ttypull100%of 3
ttyputinfo50%of 8
ttyretype---of 11
ttyrub---of 28
ttysig---of 5
ttysigintr---of 36
ttysleep34%of 9
ttyunlock---of 1
ttywait---of 1
ttywait_timo16%of 13
ttywflush---of 4
x86_curlwp100%of 1
-----------
SUMMARY34%of 744
-----------
SUMMARY---of 0
domain_attach---of 12
domaininit---of 31
domaininit_post---of 1
pfctlinput78%of 9
pfctlinput2---of 11
pffasttimo---of 9
pffinddomain75%of 8
pffindproto84%of 18
pffindtype75%of 12
pfslowtimo---of 9
sockaddr_addr---of 10
sockaddr_alloc60%of 10
sockaddr_any---of 8
sockaddr_any_by_family---of 8
sockaddr_anyaddr---of 18
sockaddr_cmp30%of 17
sockaddr_const_addr---of 10
sockaddr_copy60%of 10
sockaddr_dup67%of 3
sockaddr_externalize---of 10
sockaddr_format---of 15
sockaddr_free100%of 1
sockaddr_getsize_by_family29%of 7
sysctl_unpcblist---of 41
x86_curlwp100%of 1
-----------
SUMMARY63%of 96
union_abortop67%of 3
union_access32%of 19
union_advlock67%of 3
union_bmap50%of 6
union_bwrite---of 11
union_close47%of 13
union_create40%of 5
union_fsync37%of 11
union_getattr32%of 16
union_getpages63%of 8
union_inactive40%of 5
union_ioctl67%of 3
union_islocked67%of 3
union_kqfilter---of 4
union_link29%of 14
union_lock40%of 10
union_lookup43%of 42
union_lookup148%of 21
union_mkdir40%of 5
union_mknod40%of 5
union_mmap67%of 3
union_open47%of 13
union_parsepath45%of 9
union_pathconf50%of 6
union_poll67%of 3
union_print---of 9
union_putpages67%of 9
union_read46%of 11
union_readdir30%of 10
union_readlink50%of 6
union_reclaim50%of 4
union_remove43%of 7
union_rename36%of 25
union_revoke---of 5
union_rmdir38%of 8
union_seek67%of 3
union_setattr42%of 34
union_strategy---of 11
union_symlink67%of 3
union_unlock67%of 3
union_whiteout67%of 3
union_write34%of 9
x86_curlwp100%of 1
-----------
SUMMARY44%of 362
bufq_fcfs_cancel---of 10
bufq_fcfs_fini67%of 3
bufq_fcfs_get50%of 8
bufq_fcfs_init100%of 1
bufq_fcfs_modcmd---of 4
bufq_fcfs_put67%of 3
-----------
SUMMARY60%of 15
bufq_alloc41%of 22
bufq_cancel---of 1
bufq_drain---of 4
bufq_free67%of 3
bufq_get100%of 1
bufq_getstrategyname---of 1
bufq_init---of 3
bufq_move---of 4
bufq_peek100%of 1
bufq_put100%of 1
bufq_register---of 1
bufq_unregister---of 5
sysctl_kern_bufq_strategies---of 17
-----------
SUMMARY50%of 28
i82489_ipi50%of 8
lapic_boot_init---of 36
lapic_calibrate_timer---of 30
lapic_clockintr---of 1
lapic_cpu_number---of 3
lapic_delay45%of 18
lapic_dump---of 15
lapic_enable---of 3
lapic_get_timecount---of 8
lapic_hwmask---of 4
lapic_hwunmask---of 4
lapic_initclock---of 3
lapic_is_x2apic---of 4
lapic_readreg---of 3
lapic_reset---of 7
lapic_set_lvt---of 28
lapic_setup---of 1
lapic_write_tpri---of 1
lapic_writereg---of 3
x2apic_ipi---of 1
x86_ipi_init---of 9
x86_ipi_startup---of 9
-----------
SUMMARY47%of 26
filt_fileattach---of 1
filt_kqdetach---of 1
filt_kqueue---of 3
filt_nopdetach---of 1
filt_nopevent---of 1
filt_proc---of 3
filt_procattach---of 4
filt_procdetach---of 7
filt_seltrue---of 1
filt_seltruedetach---of 1
filt_timer---of 1
filt_timerattach---of 13
filt_timercompute---of 17
filt_timerdetach---of 1
filt_timerexpire---of 6
filt_timertouch---of 27
filt_user---of 1
filt_userattach---of 1
filt_userdetach---of 1
filt_usertouch---of 14
filter_event50%of 10
kevent1---of 188
kevent_fetch_changes---of 1
kevent_put_events---of 1
kfilter_register---of 30
kfilter_unregister---of 20
klist_fini34%of 6
klist_init100%of 1
klist_insert---of 1
klist_remove---of 4
knote50%of 6
knote_activate_locked50%of 8
knote_clear_eof---of 1
knote_detach---of 39
knote_detach_quiesce---of 27
knote_fdclose---of 6
knote_proc_exec---of 9
knote_proc_exit---of 12
knote_proc_fork---of 16
knote_proc_fork_track---of 26
knote_set_eof---of 1
kqueue1---of 3
kqueue_check42%of 12
kqueue_close---of 16
kqueue_doclose---of 10
kqueue_fcntl---of 1
kqueue_fpathconf---of 1
kqueue_init---of 1
kqueue_ioctl---of 29
kqueue_kqfilter---of 5
kqueue_listener_cb50%of 4
kqueue_poll---of 4
kqueue_printit---of 8
kqueue_restart---of 3
kqueue_stat---of 1
seltrue_kqfilter---of 3
sys___kevent100---of 1
sys_kqueue---of 1
sys_kqueue1---of 1
x86_curlwp---of 1
-----------
SUMMARY47%of 47
-----------
SUMMARY---of 0
tcp_output42%of 336
tcp_setpersist---of 4
-----------
SUMMARY42%of 336
secmodel_deregister---of 16
secmodel_eval---of 9
secmodel_init---of 1
secmodel_nsecmodels100%of 1
secmodel_register---of 13
secmodel_setinfo---of 1
-----------
SUMMARY100%of 1
kernfs_done---of 1
kernfs_get_rrootdev---of 5
kernfs_init---of 1
kernfs_loadvnode50%of 24
kernfs_modcmd---of 4
kernfs_mount38%of 8
kernfs_reinit---of 1
kernfs_root50%of 4
kernfs_start100%of 1
kernfs_sync100%of 1
kernfs_sysctl_setup---of 1
kernfs_unmount67%of 3
kernfs_vget---of 1
x86_curlwp100%of 1
-----------
SUMMARY53%of 42
cfdata_ifattr---of 1
cfiattr_lookup---of 19
config_alldevs_enter44%of 16
config_alldevs_exit44%of 25
config_attach---of 5
config_attach_acquire---of 7
config_attach_internal---of 30
config_attach_pseudo67%of 3
config_attach_pseudo_acquire45%of 9
config_cfattach_attach---of 14
config_cfattach_detach---of 17
config_cfattach_lookup---of 12
config_cfdata_attach---of 30
config_cfdata_detach---of 21
config_cfdriver_attach---of 9
config_cfdriver_detach---of 13
config_cfdriver_lookup---of 7
config_create_interruptthreads---of 1
config_create_mountrootthreads---of 5
config_deactivate---of 12
config_defer---of 8
config_deferred---of 3
config_detach67%of 3
config_detach_all---of 11
config_detach_children---of 11
config_detach_commit50%of 6
config_detach_exit50%of 6
config_detach_release33%of 59
config_devalloc31%of 72
config_devlink60%of 5
config_devunlink34%of 18
config_finalize---of 28
config_finalize_mountroot---of 7
config_finalize_register---of 11
config_fini_component---of 43
config_found---of 5
config_found_acquire---of 14
config_init---of 36
config_init_component---of 35
config_init_mi---of 3
config_interrupts---of 8
config_interrupts_thread---of 12
config_match---of 14
config_mountroot---of 8
config_mountroot_thread---of 10
config_pending_decr30%of 10
config_pending_incr50%of 6
config_probe---of 1
config_process_deferred20%of 15
config_rootfound---of 10
config_rootsearch---of 7
config_search---of 7
config_search_internal---of 54
config_stdsubmatch---of 13
config_twiddle_fn---of 3
config_twiddle_init---of 3
device_acquire---of 3
device_active---of 5
device_active_deregister---of 10
device_active_register---of 18
device_compatible_lookup---of 9
device_compatible_lookup_id---of 5
device_compatible_lookup_strlist---of 6
device_compatible_match---of 9
device_compatible_match_id---of 5
device_compatible_match_strlist---of 6
device_compatible_plookup---of 9
device_compatible_plookup_strlist---of 6
device_compatible_pmatch---of 9
device_compatible_pmatch_strlist---of 6
device_find_by_driver_unit---of 10
device_find_by_xname60%of 5
device_getlock---of 1
device_lookup60%of 5
device_lookup_acquire29%of 14
device_lookup_private80%of 5
device_pmf_bus_deregister67%of 3
device_pmf_bus_private---of 1
device_pmf_bus_register---of 1
device_pmf_bus_resume---of 6
device_pmf_bus_shutdown---of 4
device_pmf_bus_suspend---of 7
device_pmf_class_deregister67%of 3
device_pmf_class_private---of 1
device_pmf_class_register---of 1
device_pmf_class_resume---of 7
device_pmf_class_suspend---of 6
device_pmf_driver_child_register50%of 4
device_pmf_driver_deregister50%of 4
device_pmf_driver_register100%of 1
device_pmf_driver_resume---of 7
device_pmf_driver_set_child_register---of 1
device_pmf_driver_shutdown---of 4
device_pmf_driver_suspend---of 7
device_pmf_is_registered---of 1
device_pmf_lock---of 6
device_pmf_unlock---of 5
device_release100%of 1
deviter_first100%of 1
deviter_init23%of 18
deviter_next10%of 21
deviter_next155%of 11
deviter_release100%of 1
devmon_report_device37%of 11
ifattr_match---of 3
no_devmon_insert---of 1
null_childdetached---of 1
shutdown_first---of 6
shutdown_next---of 4
x86_curlwp100%of 1
-----------
SUMMARY38%of 362